1930 files changed, 750549 insertions, 0 deletions
diff --git a/src/rocksdb/.circleci/config.yml b/src/rocksdb/.circleci/config.yml
new file mode 100644
index 000000000..d9386aab1
--- /dev/null
+++ b/src/rocksdb/.circleci/config.yml
@@ -0,0 +1,898 @@
+version: 2.1
+
+orbs:
+  win: circleci/windows@5.0.0
+
+commands:
+  install-cmake-on-macos:
+    steps:
+      - run:
+          name: Install cmake on macos
+          command: |
+            HOMEBREW_NO_AUTO_UPDATE=1 brew install cmake
+
+  install-jdk8-on-macos:
+    steps:
+      - run:
+          name: Install JDK 8 on macos
+          command: |
+            brew install --cask adoptopenjdk/openjdk/adoptopenjdk8
+
+  increase-max-open-files-on-macos:
+    steps:
+      - run:
+          name: Increase max open files
+          command: |
+            sudo sysctl -w kern.maxfiles=1048576
+            sudo sysctl -w kern.maxfilesperproc=1048576
+            sudo launchctl limit maxfiles 1048576
+
+  pre-steps:
+    steps:
+      - checkout
+      - run:
+          name: Setup Environment Variables
+          command: |
+            echo "export GTEST_THROW_ON_FAILURE=0" >> $BASH_ENV
+            echo "export GTEST_OUTPUT=\"xml:/tmp/test-results/\"" >> $BASH_ENV
+            echo "export SKIP_FORMAT_BUCK_CHECKS=1" >> $BASH_ENV
+            echo "export GTEST_COLOR=1" >> $BASH_ENV
+            echo "export CTEST_OUTPUT_ON_FAILURE=1" >> $BASH_ENV
+            echo "export CTEST_TEST_TIMEOUT=300" >> $BASH_ENV
+            echo "export ZLIB_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zlib" >> $BASH_ENV
+            echo "export BZIP2_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/bzip2" >> $BASH_ENV
+            echo "export SNAPPY_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/snappy" >> $BASH_ENV
+            echo "export LZ4_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/lz4" >> $BASH_ENV
+            echo "export ZSTD_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zstd" >> $BASH_ENV
+
+  windows-build-steps:
+    steps:
+      - checkout
+      - run:
+          name: "Install thirdparty dependencies"
+          command: |
+            echo "Installing CMake..."
+            choco install cmake --installargs 'ADD_CMAKE_TO_PATH=System' -y
+            mkdir $Env:THIRDPARTY_HOME
+            cd $Env:THIRDPARTY_HOME
+            echo "Building Snappy dependency..."
+            curl https://github.com/google/snappy/archive/refs/tags/1.1.8.zip -O snappy-1.1.8.zip
+            unzip -q snappy-1.1.8.zip
+            cd snappy-1.1.8
+            mkdir build
+            cd build
+            & $Env:CMAKE_BIN -G "$Env:CMAKE_GENERATOR" ..
+            msbuild.exe Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64
+      - run:
+          name: "Build RocksDB"
+          command: |
+            mkdir build
+            cd build
+            & $Env:CMAKE_BIN -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DJNI=1 ..
+            cd ..
+            echo "Building with VS version: $Env:CMAKE_GENERATOR"
+            msbuild.exe build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64
+      - run:
+          name: "Test RocksDB"
+          shell: powershell.exe
+          command: |
+            build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16
+  pre-steps-macos:
+      steps:
+        - pre-steps
+
+  post-steps:
+    steps:
+      - store_test_results: # store test result if there's any
+          path: /tmp/test-results
+      - store_artifacts: # store LOG for debugging if there's any
+          path: LOG
+      - run: # on fail, compress Test Logs for diagnosing the issue
+          name: Compress Test Logs
+          command: tar -cvzf t.tar.gz t
+          when: on_fail
+      - store_artifacts: # on fail, store Test Logs for diagnosing the issue
+          path: t.tar.gz
+          destination: test_logs
+          when: on_fail
+      - run: # store core dumps if there's any
+          command: |
+            mkdir -p /tmp/core_dumps
+            cp core.* /tmp/core_dumps
+          when: on_fail
+      - store_artifacts:
+          path: /tmp/core_dumps
+          when: on_fail
+
+  upgrade-cmake:
+    steps:
+      - run:
+          name: Upgrade cmake
+          command: |
+            sudo apt remove --purge cmake
+            sudo snap install cmake --classic
+
+  install-gflags:
+    steps:
+      - run:
+          name: Install gflags
+          command: |
+            sudo apt-get update -y && sudo apt-get install -y libgflags-dev
+
+  install-gflags-on-macos:
+    steps:
+      - run:
+          name: Install gflags on macos
+          command: |
+            HOMEBREW_NO_AUTO_UPDATE=1 brew install gflags
+
+  setup-folly:
+    steps:
+      - run:
+          name: Checkout folly sources
+          command: |
+            make checkout_folly
+
+  build-folly:
+    steps:
+      - run:
+          name: Build folly and dependencies
+          command: |
+            make build_folly
+
+  build-for-benchmarks:
+    steps:
+      - pre-steps
+      - run:
+          name: "Linux build for benchmarks"
+          command: #sized for the resource-class rocksdb-benchmark-sys1
+            make V=1 J=8 -j8 release
+
+  perform-benchmarks:
+    steps:
+      - run:
+          name: "Test low-variance benchmarks"
+          command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 10000000
+          environment:
+            LD_LIBRARY_PATH: /usr/local/lib
+            # How long to run parts of the test(s)
+            DURATION_RO: 400
+            DURATION_RW: 700
+            # Keep threads within physical capacity of server (much lower than default)
+            NUM_THREADS: 1
+            MAX_BACKGROUND_JOBS: 3
+            # Don't run a couple of "optional" initial tests
+            CI_TESTS_ONLY: "true"
+            # Reduce configured size of levels to ensure more levels in the leveled compaction LSM tree
+            WRITE_BUFFER_SIZE_MB: 16
+            TARGET_FILE_SIZE_BASE_MB: 16
+            MAX_BYTES_FOR_LEVEL_BASE_MB: 64
+            # The benchmark host has 32GB memory
+            # The following values are tailored to work with that
+            # Note, tests may not exercise the targeted issues if the memory is increased on new test hosts.
+
+
+  post-benchmarks:
+    steps:
+      - store_artifacts: # store the benchmark output
+          path: /tmp/benchmark-results
+          destination: test_logs
+      - run:
+          name: Send benchmark report to visualisation
+          command: |
+            set +e
+            set +o pipefail
+            ./build_tools/benchmark_log_tool.py --tsvfile /tmp/benchmark-results/report.tsv --esdocument https://search-rocksdb-bench-k2izhptfeap2hjfxteolsgsynm.us-west-2.es.amazonaws.com/bench_test3_rix/_doc
+            true
+
+executors:
+  linux-docker:
+    docker:
+      # The image configuration is build_tools/ubuntu20_image/Dockerfile
+      # To update and build the image:
+      #  $ cd build_tools/ubuntu20_image
+      #  $ docker build -t zjay437/rocksdb:0.5 .
+      #  $ docker push zjay437/rocksdb:0.5
+      # `zjay437` is the account name for zjay@meta.com which readwrite token is shared internally. To login:
+      #  $ docker login --username zjay437
+      # Or please feel free to change it to your docker hub account for hosting the image, meta employee should already have the account and able to login with SSO.
+      # To avoid impacting the existing CI runs, please bump the version every time creating a new image
+      # to run the CI image environment locally:
+      #  $ docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it zjay437/rocksdb:0.5 bash
+      # option `--cap-add=SYS_PTRACE --security-opt seccomp=unconfined` is used to enable gdb to attach an existing process
+      - image: zjay437/rocksdb:0.6
+
+jobs:
+  build-macos:
+    macos:
+      xcode: 12.5.1
+    resource_class: large
+    environment:
+      ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc cause env_test hang, disable it for now
+    steps:
+      - increase-max-open-files-on-macos
+      - install-gflags-on-macos
+      - pre-steps-macos
+      - run: ulimit -S -n `ulimit -H -n` && OPT=-DCIRCLECI make V=1 J=32 -j32 all
+      - post-steps
+
+  build-macos-cmake:
+    macos:
+      xcode: 12.5.1
+    resource_class: large
+    parameters:
+      run_even_tests:
+        description: run even or odd tests, used to split tests to 2 groups
+        type: boolean
+        default: true
+    steps:
+      - increase-max-open-files-on-macos
+      - install-cmake-on-macos
+      - install-gflags-on-macos
+      - pre-steps-macos
+      - run:
+          name: "cmake generate project file"
+          command: ulimit -S -n `ulimit -H -n` && mkdir build && cd build && cmake -DWITH_GFLAGS=1 ..
+      - run:
+          name: "Build tests"
+          command: cd build && make V=1 -j32
+      - when:
+          condition: << parameters.run_even_tests >>
+          steps:
+            - run:
+                name: "Run even tests"
+                command: ulimit -S -n `ulimit -H -n` && cd build && ctest -j32 -I 0,,2
+      - when:
+          condition:
+            not: << parameters.run_even_tests >>
+          steps:
+            - run:
+                name: "Run odd tests"
+                command: ulimit -S -n `ulimit -H -n` && cd build && ctest -j32 -I 1,,2
+      - post-steps
+
+  build-linux:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: make V=1 J=32 -j32 check
+      - post-steps
+
+  build-linux-encrypted_env-no_compression:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: ENCRYPTED_ENV=1 ROCKSDB_DISABLE_SNAPPY=1 ROCKSDB_DISABLE_ZLIB=1 ROCKSDB_DISABLE_BZIP=1 ROCKSDB_DISABLE_LZ4=1 ROCKSDB_DISABLE_ZSTD=1 make V=1 J=32 -j32 check
+      - run: |
+          ./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression
+      - post-steps
+
+  build-linux-shared_lib-alt_namespace-status_checked:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j32 check
+      - post-steps
+
+  build-linux-release:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - checkout # check out the code in the project directory
+      - run: make V=1 -j32 release
+      - run: ./db_stress --version # ensure with gflags
+      - run: make clean
+      - run: apt-get remove -y libgflags-dev
+      - run: make V=1 -j32 release
+      - run: if ./db_stress --version; then false; else true; fi # ensure without gflags
+      - post-steps
+
+  build-linux-release-rtti:
+    executor: linux-docker
+    resource_class: xlarge
+    steps:
+      - checkout # check out the code in the project directory
+      - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
+      - run: ./db_stress --version # ensure with gflags
+      - run: make clean
+      - run: apt-get remove -y libgflags-dev
+      - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
+      - run: if ./db_stress --version; then false; else true; fi # ensure without gflags
+
+  build-linux-lite:
+    executor: linux-docker
+    resource_class: large
+    steps:
+      - pre-steps
+      - run: LITE=1 make V=1 J=8 -j8 check
+      - post-steps
+
+  build-linux-lite-release:
+    executor: linux-docker
+    resource_class: large
+    steps:
+      - checkout # check out the code in the project directory
+      - run: LITE=1 make V=1 -j8 release
+      - run: ./db_stress --version # ensure with gflags
+      - run: make clean
+      - run: apt-get remove -y libgflags-dev
+      - run: LITE=1 make V=1 -j8 release
+      - run: if ./db_stress --version; then false; else true; fi # ensure without gflags
+      - post-steps
+
+  build-linux-clang-no_test_run:
+    executor: linux-docker
+    resource_class: xlarge
+    steps:
+      - checkout # check out the code in the project directory
+      - run: CC=clang CXX=clang++ USE_CLANG=1 PORTABLE=1 make V=1 -j16 all
+      - post-steps
+
+  build-linux-clang10-asan:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: COMPILE_WITH_ASAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check # aligned new doesn't work for reason we haven't figured out
+      - post-steps
+
+  build-linux-clang10-mini-tsan:
+    executor: linux-docker
+    resource_class: 2xlarge+
+    steps:
+      - pre-steps
+      - run: COMPILE_WITH_TSAN=1 CC=clang-13 CXX=clang++-13 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check
+      - post-steps
+
+  build-linux-clang10-ubsan:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: COMPILE_WITH_UBSAN=1 OPT="-fsanitize-blacklist=.circleci/ubsan_suppression_list.txt" CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 ubsan_check # aligned new doesn't work for reason we haven't figured out
+      - post-steps
+
+  build-linux-valgrind:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: PORTABLE=1 make V=1 -j32 valgrind_test
+      - post-steps
+
+  build-linux-clang10-clang-analyze:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-10" CLANG_SCAN_BUILD=scan-build-10 USE_CLANG=1 make V=1 -j32 analyze # aligned new doesn't work for reason we haven't figured out. For unknown, reason passing "clang++-10" as CLANG_ANALYZER doesn't work, and we need a full path.
+      - post-steps
+      - run:
+          name: "compress test report"
+          command: tar -cvzf scan_build_report.tar.gz scan_build_report
+          when: on_fail
+      - store_artifacts:
+          path: scan_build_report.tar.gz
+          destination: scan_build_report
+          when: on_fail
+
+  build-linux-runner:
+    machine: true
+    resource_class: facebook/rocksdb-benchmark-sys1
+    steps:
+      - pre-steps
+      - run:
+          name: "Checked Linux build (Runner)"
+          command: make V=1 J=8 -j8 check
+          environment:
+            LD_LIBRARY_PATH: /usr/local/lib
+      - post-steps
+
+  build-linux-cmake-with-folly:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - setup-folly
+      - build-folly
+      - run: (mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)
+      - post-steps
+
+  build-linux-cmake-with-folly-lite-no-test:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - setup-folly
+      - run: (mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 .. && make V=1 -j20)
+      - post-steps
+
+  build-linux-cmake-with-benchmark:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 .. && make V=1 -j20 && ctest -j20
+      - post-steps
+
+  build-linux-unity-and-headers:
+    docker: # executor type
+      - image: gcc:latest
+    environment:
+      EXTRA_CXXFLAGS: -mno-avx512f # Warnings-as-error in avx512fintrin.h, would be used on newer hardware
+    resource_class: large
+    steps:
+      - checkout # check out the code in the project directory
+      - run: apt-get update -y && apt-get install -y libgflags-dev
+      - run: make V=1 -j8 unity_test
+      - run: make V=1 -j8 -k check-headers # could be moved to a different build
+      - post-steps
+
+  build-linux-gcc-7-with-folly:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - setup-folly
+      - build-folly
+      - run: USE_FOLLY=1 CC=gcc-7 CXX=g++-7 V=1 make -j32 check
+      - post-steps
+
+  build-linux-gcc-7-with-folly-lite-no-test:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - setup-folly
+      - run: USE_FOLLY_LITE=1 CC=gcc-7 CXX=g++-7 V=1 make -j32 all
+      - post-steps
+
+  build-linux-gcc-8-no_test_run:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: CC=gcc-8 CXX=g++-8 V=1 make -j32 all
+      - post-steps
+
+  build-linux-cmake-with-folly-coroutines:
+    executor: linux-docker
+    resource_class: 2xlarge
+    environment:
+      CC: gcc-10
+      CXX: g++-10
+    steps:
+      - pre-steps
+      - setup-folly
+      - build-folly
+      - run: (mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)
+      - post-steps
+
+  build-linux-gcc-10-cxx20-no_test_run:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: CC=gcc-10 CXX=g++-10 V=1 ROCKSDB_CXX_STANDARD=c++20 make -j32 all
+      - post-steps
+
+  build-linux-gcc-11-no_test_run:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench
+      - post-steps
+
+  build-linux-clang-13-no_test_run:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j32 all microbench
+      - post-steps
+
+  # Ensure ASAN+UBSAN with folly, and full testsuite with clang 13
+  build-linux-clang-13-asan-ubsan-with-folly:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - setup-folly
+      - build-folly
+      - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
+      - post-steps
+
+  # This job is only to make sure the microbench tests are able to run, the benchmark result is not meaningful as the CI host is changing.
+  build-linux-run-microbench:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run: DEBUG_LEVEL=0 make -j32 run_microbench
+      - post-steps
+
+  build-linux-mini-crashtest:
+    executor: linux-docker
+    resource_class: large
+    steps:
+      - pre-steps
+      - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush
+      - post-steps
+
+  build-linux-crashtest-tiered-storage-bb:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run:
+          name: "run crashtest"
+          command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 blackbox_crash_test_with_tiered_storage
+          no_output_timeout: 100m
+      - post-steps
+
+  build-linux-crashtest-tiered-storage-wb:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run:
+          name: "run crashtest"
+          command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 whitebox_crash_test_with_tiered_storage
+          no_output_timeout: 100m
+      - post-steps
+
+  build-windows-vs2022:
+    executor:
+      name: win/server-2022
+      size: 2xlarge
+    environment:
+      THIRDPARTY_HOME: C:/Users/circleci/thirdparty
+      CMAKE_HOME: C:/Program Files/CMake
+      CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe
+      SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.8
+      SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.8;C:/Users/circleci/thirdparty/snappy-1.1.8/build
+      SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.8/build/Debug/snappy.lib
+      CMAKE_GENERATOR: Visual Studio 17 2022
+    steps:
+      - windows-build-steps
+
+  build-windows-vs2019:
+    executor:
+      name: win/server-2019
+      size: 2xlarge
+    environment:
+      THIRDPARTY_HOME: C:/Users/circleci/thirdparty
+      CMAKE_HOME: C:/Program Files/CMake
+      CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe
+      SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.8
+      SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.8;C:/Users/circleci/thirdparty/snappy-1.1.8/build
+      SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.8/build/Debug/snappy.lib
+      CMAKE_GENERATOR: Visual Studio 16 2019
+    steps:
+      - windows-build-steps
+
+  build-linux-java:
+    executor: linux-docker
+    resource_class: large
+    steps:
+      - pre-steps
+      - run:
+          name: "Set Java Environment"
+          command: |
+            echo "JAVA_HOME=${JAVA_HOME}"
+            echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
+            which java && java -version
+            which javac && javac -version
+      - run:
+          name: "Test RocksDBJava"
+          command: make V=1 J=8 -j8 jtest
+      - post-steps
+
+  build-linux-java-static:
+    executor: linux-docker
+    resource_class: large
+    steps:
+      - pre-steps
+      - run:
+          name: "Set Java Environment"
+          command: |
+            echo "JAVA_HOME=${JAVA_HOME}"
+            echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
+            which java && java -version
+            which javac && javac -version
+      - run:
+          name: "Build RocksDBJava Static Library"
+          command: make V=1 J=8 -j8 rocksdbjavastatic
+      - post-steps
+
+  build-macos-java:
+    macos:
+      xcode: 12.5.1
+    resource_class: large
+    environment:
+      JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home
+      ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc causes java 8 crash
+    steps:
+      - increase-max-open-files-on-macos
+      - install-gflags-on-macos
+      - install-jdk8-on-macos
+      - pre-steps-macos
+      - run:
+          name: "Set Java Environment"
+          command: |
+            echo "JAVA_HOME=${JAVA_HOME}"
+            echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
+            which java && java -version
+            which javac && javac -version
+      - run:
+          name: "Test RocksDBJava"
+          command: make V=1 J=16 -j16 jtest
+          no_output_timeout: 20m
+      - post-steps
+
+  build-macos-java-static:
+    macos:
+      xcode: 12.5.1
+    resource_class: large
+    environment:
+      JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home
+    steps:
+      - increase-max-open-files-on-macos
+      - install-gflags-on-macos
+      - install-cmake-on-macos
+      - install-jdk8-on-macos
+      - pre-steps-macos
+      - run:
+          name: "Set Java Environment"
+          command: |
+            echo "JAVA_HOME=${JAVA_HOME}"
+            echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
+            which java && java -version
+            which javac && javac -version
+      - run:
+          name: "Build RocksDBJava x86 and ARM Static Libraries"
+          command: make V=1 J=16 -j16 rocksdbjavastaticosx
+          no_output_timeout: 20m
+      - post-steps
+
+  build-macos-java-static-universal:
+    macos:
+      xcode: 12.5.1
+    resource_class: large
+    environment:
+      JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home
+    steps:
+      - increase-max-open-files-on-macos
+      - install-gflags-on-macos
+      - install-cmake-on-macos
+      - install-jdk8-on-macos
+      - pre-steps-macos
+      - run:
+          name: "Set Java Environment"
+          command: |
+            echo "JAVA_HOME=${JAVA_HOME}"
+            echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
+            which java && java -version
+            which javac && javac -version
+      - run:
+          name: "Build RocksDBJava Universal Binary Static Library"
+          command: make V=1 J=16 -j16 rocksdbjavastaticosx_ub
+          no_output_timeout: 20m
+      - post-steps
+
+  build-examples:
+    executor: linux-docker
+    resource_class: large
+    steps:
+      - pre-steps
+      - run:
+          name: "Build examples"
+          command: |
+            make V=1 -j4 static_lib && cd examples && make V=1 -j4
+      - post-steps
+
+  build-cmake-mingw:
+    executor: linux-docker
+    resource_class: large
+    steps:
+      - pre-steps
+      - run: update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix
+      - run:
+          name: "Build cmake-mingw"
+          command: |
+            export PATH=$JAVA_HOME/bin:$PATH
+            echo "JAVA_HOME=${JAVA_HOME}"
+            which java && java -version
+            which javac && javac -version
+            mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni
+      - post-steps
+
+  build-linux-non-shm:
+    executor: linux-docker
+    resource_class: 2xlarge
+    environment:
+      TEST_TMPDIR: /tmp/rocksdb_test_tmp
+    steps:
+      - pre-steps
+      - run: make V=1 -j32 check
+      - post-steps
+
+  build-linux-arm-test-full:
+    machine:
+      image: ubuntu-2004:202111-02
+    resource_class: arm.large
+    steps:
+      - pre-steps
+      - install-gflags
+      - run: make V=1 J=4 -j4 check
+      - post-steps
+
+  build-linux-arm:
+    machine:
+      image: ubuntu-2004:202111-02
+    resource_class: arm.large
+    steps:
+      - pre-steps
+      - install-gflags
+      - run: ROCKSDBTESTS_PLATFORM_DEPENDENT=only make V=1 J=4 -j4 all_but_some_tests check_some
+      - post-steps
+
+  build-linux-arm-cmake-no_test_run:
+    machine:
+      image: ubuntu-2004:202111-02
+    resource_class: arm.large
+    environment:
+      JAVA_HOME: /usr/lib/jvm/java-8-openjdk-arm64
+    steps:
+      - pre-steps
+      - install-gflags
+      - run:
+          name: "Set Java Environment"
+          command: |
+            echo "JAVA_HOME=${JAVA_HOME}"
+            echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
+            which java && java -version
+            which javac && javac -version
+      - run:
+          name: "Build with cmake"
+          command: |
+            mkdir build
+            cd build
+            cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=1 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 ..
+            make -j4
+      - run:
+          name: "Build Java with cmake"
+          command: |
+            rm -rf build
+            mkdir build
+            cd build
+            cmake -DJNI=1 -DCMAKE_BUILD_TYPE=Release -DWITH_GFLAGS=1 ..
+            make -j4 rocksdb rocksdbjni
+      - post-steps
+
+  build-format-compatible:
+    executor: linux-docker
+    resource_class: 2xlarge
+    steps:
+      - pre-steps
+      - run:
+          name: "test"
+          command: |
+            export TEST_TMPDIR=/dev/shm/rocksdb
+            rm -rf /dev/shm/rocksdb
+            mkdir /dev/shm/rocksdb
+            tools/check_format_compatible.sh
+      - post-steps
+
+  build-fuzzers:
+    executor: linux-docker
+    resource_class: large
+    steps:
+      - pre-steps
+      - run:
+          name: "Build rocksdb lib"
+          command: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j4 static_lib
+      - run:
+          name: "Build fuzzers"
+          command: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer
+      - post-steps
+
+  benchmark-linux: #use a private Circle CI runner (resource_class) to run the job
+    machine: true
+    resource_class: facebook/rocksdb-benchmark-sys1
+    steps:
+      - build-for-benchmarks
+      - perform-benchmarks
+      - post-benchmarks
+
+workflows:
+  version: 2
+  jobs-linux-run-tests:
+    jobs:
+      - build-linux
+      - build-linux-cmake-with-folly
+      - build-linux-cmake-with-folly-lite-no-test
+      - build-linux-gcc-7-with-folly
+      - build-linux-gcc-7-with-folly-lite-no-test
+      - build-linux-cmake-with-folly-coroutines
+      - build-linux-cmake-with-benchmark
+      - build-linux-encrypted_env-no_compression
+      - build-linux-lite
+  jobs-linux-run-tests-san:
+    jobs:
+      - build-linux-clang10-asan
+      - build-linux-clang10-ubsan
+      - build-linux-clang10-mini-tsan
+      - build-linux-shared_lib-alt_namespace-status_checked
+  jobs-linux-no-test-run:
+    jobs:
+      - build-linux-release
+      - build-linux-release-rtti
+      - build-linux-lite-release
+      - build-examples
+      - build-fuzzers
+      - build-linux-clang-no_test_run
+      - build-linux-clang-13-no_test_run
+      - build-linux-gcc-8-no_test_run
+      - build-linux-gcc-10-cxx20-no_test_run
+      - build-linux-gcc-11-no_test_run
+      - build-linux-arm-cmake-no_test_run
+  jobs-linux-other-checks:
+    jobs:
+      - build-linux-clang10-clang-analyze
+      - build-linux-unity-and-headers
+      - build-linux-mini-crashtest
+  jobs-windows:
+    jobs:
+      - build-windows-vs2022
+      - build-windows-vs2019
+      - build-cmake-mingw
+  jobs-java:
+    jobs:
+      - build-linux-java
+      - build-linux-java-static
+      - build-macos-java
+      - build-macos-java-static
+      - build-macos-java-static-universal
+  jobs-macos:
+    jobs:
+      - build-macos
+      - build-macos-cmake:
+          run_even_tests: true
+      - build-macos-cmake:
+          run_even_tests: false
+  jobs-linux-arm:
+    jobs:
+      - build-linux-arm
+  build-fuzzers:
+    jobs:
+      - build-fuzzers
+  benchmark-linux:
+    triggers:
+      - schedule:
+          cron: "0 * * * *"
+          filters:
+            branches:
+              only:
+                - main
+    jobs:
+      - benchmark-linux
+  nightly:
+    triggers:
+      - schedule:
+          cron: "0 9 * * *"
+          filters:
+            branches:
+              only:
+                - main
+    jobs:
+      - build-format-compatible
+      - build-linux-arm-test-full
+      - build-linux-run-microbench
+      - build-linux-non-shm
+      - build-linux-clang-13-asan-ubsan-with-folly
+      - build-linux-valgrind
diff --git a/src/rocksdb/.circleci/ubsan_suppression_list.txt b/src/rocksdb/.circleci/ubsan_suppression_list.txt
new file mode 100644
index 000000000..d7db81806
--- /dev/null
+++ b/src/rocksdb/.circleci/ubsan_suppression_list.txt
@@ -0,0 +1,6 @@
+# Supress UBSAN warnings related to stl_tree.h, e.g.
+# UndefinedBehaviorSanitizer: undefined-behavior /usr/bin/../lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/bits/stl_tree.h:1505:43 in 
+# /usr/bin/../lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/bits/stl_tree.h:1505:43:
+# runtime error: upcast of address 0x000001fa8820 with insufficient space for an object of type
+# 'std::_Rb_tree_node<std::pair<const std::__cxx11::basic_string<char>, rocksdb::(anonymous namespace)::LockHoldingInfo> >'
+src:*bits/stl_tree.h
diff --git a/src/rocksdb/.clang-format b/src/rocksdb/.clang-format
new file mode 100644
index 000000000..7c279811a
--- /dev/null
+++ b/src/rocksdb/.clang-format
@@ -0,0 +1,5 @@
+# Complete list of style options can be found at: 
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+---
+BasedOnStyle: Google
+...
diff --git a/src/rocksdb/.github/workflows/sanity_check.yml b/src/rocksdb/.github/workflows/sanity_check.yml
new file mode 100644
index 000000000..6ee53ce1b
--- /dev/null
+++ b/src/rocksdb/.github/workflows/sanity_check.yml
@@ -0,0 +1,47 @@
+name: Check buck targets and code format
+on: [push, pull_request]
+permissions:
+  contents: read
+
+jobs:
+  check:
+    name: Check TARGETS file and code format
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout feature branch
+      uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+
+    - name: Fetch from upstream
+      run: |
+        git remote add upstream https://github.com/facebook/rocksdb.git && git fetch upstream
+
+    - name: Where am I
+      run: |
+        echo git status && git status
+        echo "git remote -v" && git remote -v
+        echo git branch && git branch
+
+    - name: Setup Python
+      uses: actions/setup-python@v1
+
+    - name: Install Dependencies
+      run: python -m pip install --upgrade pip
+
+    - name: Install argparse
+      run: pip install argparse
+
+    - name: Download clang-format-diff.py
+      uses: wei/wget@v1
+      with:
+        args: https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
+
+    - name: Check format
+      run: VERBOSE_CHECK=1 make check-format
+
+    - name: Compare buckify output
+      run: make check-buck-targets
+
+    - name: Simple source code checks
+      run: make check-sources
diff --git a/src/rocksdb/.gitignore b/src/rocksdb/.gitignore
new file mode 100644
index 000000000..1ff5b7437
--- /dev/null
+++ b/src/rocksdb/.gitignore
@@ -0,0 +1,97 @@
+make_config.mk
+rocksdb.pc
+
+*.a
+*.arc
+*.d
+*.dylib*
+*.gcda
+*.gcno
+*.o
+*.o.tmp
+*.so
+*.so.*
+*_test
+*_bench
+*_stress
+*.out
+*.class
+*.jar
+*.*jnilib*
+*.d-e
+*.o-*
+*.swp
+*~
+*.vcxproj
+*.vcxproj.filters
+*.sln
+*.cmake
+.watchmanconfig
+CMakeCache.txt
+CMakeFiles/
+build/
+
+ldb
+manifest_dump
+sst_dump
+blob_dump
+block_cache_trace_analyzer
+tools/block_cache_analyzer/*.pyc
+column_aware_encoding_exp
+util/build_version.cc
+build_tools/VALGRIND_LOGS/
+coverage/COVERAGE_REPORT
+.gdbhistory
+.gdb_history
+package/
+unity.a
+tags
+etags
+rocksdb_dump
+rocksdb_undump
+db_test2
+trace_analyzer
+block_cache_trace_analyzer
+io_tracer_parser
+.DS_Store
+.vs
+.vscode
+.clangd
+
+java/out
+java/target
+java/test-libs
+java/*.log
+java/include/org_rocksdb_*.h
+
+.idea/
+*.iml
+
+rocksdb.cc
+rocksdb.h
+unity.cc
+java/crossbuild/.vagrant
+.vagrant/
+java/**/*.asc
+java/javadoc
+
+scan_build_report/
+t
+LOG
+
+db_logs/
+tp2/
+fbcode/
+fbcode
+buckifier/*.pyc
+buckifier/__pycache__
+
+compile_commands.json
+clang-format-diff.py
+.py3/
+
+fuzz/proto/gen/
+fuzz/crash-*
+
+cmake-build-*
+third-party/folly/
diff --git a/src/rocksdb/.lgtm.yml b/src/rocksdb/.lgtm.yml
new file mode 100644
index 000000000..12d6f1d4e
--- /dev/null
+++ b/src/rocksdb/.lgtm.yml
@@ -0,0 +1,4 @@
+extraction:
+  cpp:
+    index:
+      build_command: make static_lib
diff --git a/src/rocksdb/.watchmanconfig b/src/rocksdb/.watchmanconfig
new file mode 100644
index 000000000..e5b450d7b
--- /dev/null
+++ b/src/rocksdb/.watchmanconfig
@@ -0,0 +1,6 @@
+{
+  "content_hash_warming": true,
+  "content_hash_max_items": 333333,
+  "hint_num_files_per_dir": 8,
+  "fsevents_latency": 0.05
+}
diff --git a/src/rocksdb/AUTHORS b/src/rocksdb/AUTHORS
new file mode 100644
index 000000000..a451875f1
--- /dev/null
+++ b/src/rocksdb/AUTHORS
@@ -0,0 +1,12 @@
+Facebook Inc.
+Facebook Engineering Team
+
+Google Inc.
+# Initial version authors:
+Jeffrey Dean <jeff@google.com>
+Sanjay Ghemawat <sanjay@google.com>
+
+# Partial list of contributors:
+Kevin Regan <kevin.d.regan@gmail.com>
+Johan Bilien <jobi@litl.com>
+Matthew Von-Maszewski <https://github.com/matthewvon> (Basho Technologies)
diff --git a/src/rocksdb/CMakeLists.txt b/src/rocksdb/CMakeLists.txt
new file mode 100644
index 000000000..dbef05902
--- /dev/null
+++ b/src/rocksdb/CMakeLists.txt
@@ -0,0 +1,1607 @@
+# Prerequisites for Windows:
+#     This cmake build is for Windows 64-bit only.
+#
+# Prerequisites:
+#     You must have at least Visual Studio 2019. Start the Developer Command Prompt window that is a part of Visual Studio installation.
+#     Run the build commands from within the Developer Command Prompt window to have paths to the compiler and runtime libraries set.
+#     You must have git.exe in your %PATH% environment variable.
+#
+# To build Rocksdb for Windows is as easy as 1-2-3-4-5:
+#
+# 1. Update paths to third-party libraries in thirdparty.inc file
+# 2. Create a new directory for build artifacts
+#        mkdir build
+#        cd build
+# 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries.
+#    See thirdparty.inc for more information.
+#        sample command: cmake -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Release -DWITH_GFLAGS=1 -DWITH_SNAPPY=1 -DWITH_JEMALLOC=1 -DWITH_JNI=1 ..
+# 4. Then build the project in debug mode (you may want to add /m[:<N>] flag to run msbuild in <N> parallel threads
+#                                          or simply /m to use all avail cores)
+#        msbuild rocksdb.sln
+#
+#        rocksdb.sln build features exclusions of test only code in Release. If you build ALL_BUILD then everything
+#        will be attempted but test only code does not build in Release mode.
+#
+# 5. And release mode (/m[:<N>] is also supported)
+#        msbuild rocksdb.sln /p:Configuration=Release
+#
+# Linux:
+#
+# 1. Install a recent toolchain if you're on a older distro. C++17 required (GCC >= 7, Clang >= 5)
+# 2. mkdir build; cd build
+# 3. cmake ..
+# 4. make -j
+
+cmake_minimum_required(VERSION 3.10)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/")
+include(ReadVersion)
+include(GoogleTest)
+get_rocksdb_version(rocksdb_VERSION)
+project(rocksdb
+  VERSION ${rocksdb_VERSION}
+  DESCRIPTION "An embeddable persistent key-value store for fast storage"
+  HOMEPAGE_URL https://rocksdb.org/
+  LANGUAGES CXX C ASM)
+
+if(POLICY CMP0042)
+  cmake_policy(SET CMP0042 NEW)
+endif()
+
+if(NOT CMAKE_BUILD_TYPE)
+  if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
+    set(default_build_type "Debug")
+  else()
+    set(default_build_type "RelWithDebInfo")
+  endif()
+  set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING
+    "Default BUILD_TYPE is ${default_build_type}" FORCE)
+endif()
+
+find_program(CCACHE_FOUND ccache)
+if(CCACHE_FOUND)
+  set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+  set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
+endif(CCACHE_FOUND)
+
+option(WITH_JEMALLOC "build with JeMalloc" OFF)
+option(WITH_LIBURING "build with liburing" ON)
+option(WITH_SNAPPY "build with SNAPPY" OFF)
+option(WITH_LZ4 "build with lz4" OFF)
+option(WITH_ZLIB "build with zlib" OFF)
+option(WITH_ZSTD "build with zstd" OFF)
+option(WITH_WINDOWS_UTF8_FILENAMES "use UTF8 as characterset for opening files, regardles of the system code page" OFF)
+if (WITH_WINDOWS_UTF8_FILENAMES)
+  add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES)
+endif()
+option(ROCKSDB_BUILD_SHARED "Build shared versions of the RocksDB libraries" ON)
+
+if ($ENV{CIRCLECI})
+  message(STATUS "Build for CircieCI env, a few tests may be disabled")
+  add_definitions(-DCIRCLECI)
+endif()
+
+if( NOT DEFINED CMAKE_CXX_STANDARD )
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+include(CMakeDependentOption)
+
+if(MSVC)
+  option(WITH_GFLAGS "build with GFlags" OFF)
+  option(WITH_XPRESS "build with windows built in compression" OFF)
+  option(ROCKSDB_SKIP_THIRDPARTY "skip thirdparty.inc" OFF)
+
+  if(NOT ROCKSDB_SKIP_THIRDPARTY)
+    include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc)
+  endif()
+else()
+  if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" AND NOT CMAKE_SYSTEM_NAME MATCHES "kFreeBSD")
+    # FreeBSD has jemalloc as default malloc
+    # but it does not have all the jemalloc files in include/...
+    set(WITH_JEMALLOC ON)
+  else()
+    if(WITH_JEMALLOC)
+      find_package(JeMalloc REQUIRED)
+      add_definitions(-DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE)
+      list(APPEND THIRDPARTY_LIBS JeMalloc::JeMalloc)
+    endif()
+  endif()
+
+  if(MINGW)
+    option(WITH_GFLAGS "build with GFlags" OFF)
+  else()
+    option(WITH_GFLAGS "build with GFlags" ON)
+  endif()
+  set(GFLAGS_LIB)
+  if(WITH_GFLAGS)
+    # Config with namespace available since gflags 2.2.2
+    option(GFLAGS_USE_TARGET_NAMESPACE "Use gflags import target with namespace." ON)
+    find_package(gflags CONFIG)
+    if(gflags_FOUND)
+      if(TARGET ${GFLAGS_TARGET})
+        # Config with GFLAGS_TARGET available since gflags 2.2.0
+        set(GFLAGS_LIB ${GFLAGS_TARGET})
+      else()
+        # Config with GFLAGS_LIBRARIES available since gflags 2.1.0
+        set(GFLAGS_LIB ${gflags_LIBRARIES})
+      endif()
+    else()
+      find_package(gflags REQUIRED)
+      set(GFLAGS_LIB gflags::gflags)
+    endif()
+    include_directories(${GFLAGS_INCLUDE_DIR})
+    list(APPEND THIRDPARTY_LIBS ${GFLAGS_LIB})
+    add_definitions(-DGFLAGS=1)
+  endif()
+
+  if(WITH_SNAPPY)
+    find_package(Snappy CONFIG)
+    if(NOT Snappy_FOUND)
+      find_package(Snappy REQUIRED)
+    endif()
+    add_definitions(-DSNAPPY)
+    list(APPEND THIRDPARTY_LIBS Snappy::snappy)
+  endif()
+
+  if(WITH_ZLIB)
+    find_package(ZLIB REQUIRED)
+    add_definitions(-DZLIB)
+    list(APPEND THIRDPARTY_LIBS ZLIB::ZLIB)
+  endif()
+
+  option(WITH_BZ2 "build with bzip2" OFF)
+  if(WITH_BZ2)
+    find_package(BZip2 REQUIRED)
+    add_definitions(-DBZIP2)
+    if(BZIP2_INCLUDE_DIRS)
+      include_directories(${BZIP2_INCLUDE_DIRS})
+    else()
+      include_directories(${BZIP2_INCLUDE_DIR})
+    endif()
+    list(APPEND THIRDPARTY_LIBS ${BZIP2_LIBRARIES})
+  endif()
+
+  if(WITH_LZ4)
+    find_package(lz4 REQUIRED)
+    add_definitions(-DLZ4)
+    list(APPEND THIRDPARTY_LIBS lz4::lz4)
+  endif()
+
+  if(WITH_ZSTD)
+    find_package(zstd REQUIRED)
+    add_definitions(-DZSTD)
+    include_directories(${ZSTD_INCLUDE_DIR})
+    list(APPEND THIRDPARTY_LIBS zstd::zstd)
+  endif()
+endif()
+
+option(WITH_MD_LIBRARY "build with MD" ON)
+if(WIN32 AND MSVC)
+  if(WITH_MD_LIBRARY)
+    set(RUNTIME_LIBRARY "MD")
+  else()
+    set(RUNTIME_LIBRARY "MT")
+  endif()
+endif()
+
+if(MSVC)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4800 /wd4996 /wd4351 /wd4100 /wd4204 /wd4324")
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall -pthread")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing -Wno-invalid-offsetof")
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wstrict-prototypes")
+  endif()
+  if(MINGW)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format")
+    add_definitions(-D_POSIX_C_SOURCE=1)
+  endif()
+  if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
+    include(CheckCXXCompilerFlag)
+    CHECK_CXX_COMPILER_FLAG("-momit-leaf-frame-pointer" HAVE_OMIT_LEAF_FRAME_POINTER)
+    if(HAVE_OMIT_LEAF_FRAME_POINTER)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -momit-leaf-frame-pointer")
+    endif()
+  endif()
+endif()
+
+include(CheckCCompilerFlag)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
+  CHECK_C_COMPILER_FLAG("-mcpu=power9" HAS_POWER9)
+  if(HAS_POWER9)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power9 -mtune=power9")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power9 -mtune=power9")
+  else()
+    CHECK_C_COMPILER_FLAG("-mcpu=power8" HAS_POWER8)
+    if(HAS_POWER8)
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power8 -mtune=power8")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power8 -mtune=power8")
+    endif(HAS_POWER8)
+  endif(HAS_POWER9)
+  CHECK_C_COMPILER_FLAG("-maltivec" HAS_ALTIVEC)
+  if(HAS_ALTIVEC)
+    message(STATUS " HAS_ALTIVEC yes")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maltivec")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec")
+  endif(HAS_ALTIVEC)
+endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|AARCH64")
+        CHECK_C_COMPILER_FLAG("-march=armv8-a+crc+crypto" HAS_ARMV8_CRC)
+  if(HAS_ARMV8_CRC)
+    message(STATUS " HAS_ARMV8_CRC yes")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function")
+  endif(HAS_ARMV8_CRC)
+endif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|AARCH64")
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
+  CHECK_C_COMPILER_FLAG("-march=native" HAS_S390X_MARCH_NATIVE)
+  if(HAS_S390X_MARCH_NATIVE)
+    message(STATUS " HAS_S390X_MARCH_NATIVE yes")
+  endif(HAS_S390X_MARCH_NATIVE)
+endif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
+
+option(PORTABLE "build a portable binary" OFF)
+option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF)
+option(FORCE_AVX "force building with AVX, even when PORTABLE=ON" OFF)
+option(FORCE_AVX2 "force building with AVX2, even when PORTABLE=ON" OFF)
+if(PORTABLE)
+  add_definitions(-DROCKSDB_PORTABLE)
+
+  # MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h
+  # is available, it is available by default.
+  if(FORCE_SSE42 AND NOT MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul")
+  endif()
+  if(MSVC)
+    if(FORCE_AVX)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
+    endif()
+    # MSVC automatically enables BMI / lzcnt with AVX2.
+    if(FORCE_AVX2)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+    endif()
+  else()
+    if(FORCE_AVX)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
+    endif()
+    if(FORCE_AVX2)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mlzcnt")
+    endif()
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196")
+    endif()
+  endif()
+else()
+  if(MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+  else()
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x" AND NOT HAS_S390X_MARCH_NATIVE)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196")
+    elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND NOT HAS_ARMV8_CRC)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+    endif()
+  endif()
+endif()
+
+include(CheckCXXSourceCompiles)
+set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+if(NOT MSVC)
+  set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
+endif()
+
+CHECK_CXX_SOURCE_COMPILES("
+#include <cstdint>
+#include <nmmintrin.h>
+#include <wmmintrin.h>
+int main() {
+  volatile uint32_t x = _mm_crc32_u32(0, 0);
+  const auto a = _mm_set_epi64x(0, 0);
+  const auto b = _mm_set_epi64x(0, 0);
+  const auto c = _mm_clmulepi64_si128(a, b, 0x00);
+  auto d = _mm_cvtsi128_si64(c);
+}
+" HAVE_SSE42)
+if(HAVE_SSE42)
+  add_definitions(-DHAVE_SSE42)
+  add_definitions(-DHAVE_PCLMUL)
+elseif(FORCE_SSE42)
+  message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
+endif()
+
+# Check if -latomic is required or not
+if (NOT MSVC)
+  set(CMAKE_REQUIRED_FLAGS "--std=c++17")
+  CHECK_CXX_SOURCE_COMPILES("
+#include <atomic>
+std::atomic<uint64_t> x(0);
+int main() {
+  uint64_t i = x.load(std::memory_order_relaxed);
+  bool b = x.is_lock_free();
+  return 0;
+}
+" BUILTIN_ATOMIC)
+  if (NOT BUILTIN_ATOMIC)
+    #TODO: Check if -latomic exists
+    list(APPEND THIRDPARTY_LIBS atomic)
+  endif()
+endif()
+
+if (WITH_LIBURING)
+  find_package(uring)
+  if (uring_FOUND)
+    add_definitions(-DROCKSDB_IOURING_PRESENT)
+    list(APPEND THIRDPARTY_LIBS uring::uring)
+  endif()
+endif()
+
+# Reset the required flags
+set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+
+option(WITH_IOSTATS_CONTEXT "Enable IO stats context" ON)
+if (NOT WITH_IOSTATS_CONTEXT)
+  add_definitions(-DNIOSTATS_CONTEXT)
+endif()
+
+option(WITH_PERF_CONTEXT "Enable perf context" ON)
+if (NOT WITH_PERF_CONTEXT)
+  add_definitions(-DNPERF_CONTEXT)
+endif()
+
+option(FAIL_ON_WARNINGS "Treat compile warnings as errors" ON)
+if(FAIL_ON_WARNINGS)
+  if(MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX")
+  else() # assume GCC
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
+  endif()
+endif()
+
+option(WITH_ASAN "build with ASAN" OFF)
+if(WITH_ASAN)
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address")
+  if(WITH_JEMALLOC)
+    message(FATAL "ASAN does not work well with JeMalloc")
+  endif()
+endif()
+
+option(WITH_TSAN "build with TSAN" OFF)
+if(WITH_TSAN)
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread -Wl,-pie")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread -fPIC")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread -fPIC")
+  if(WITH_JEMALLOC)
+    message(FATAL "TSAN does not work well with JeMalloc")
+  endif()
+endif()
+
+option(WITH_UBSAN "build with UBSAN" OFF)
+if(WITH_UBSAN)
+  add_definitions(-DROCKSDB_UBSAN_RUN)
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
+  if(WITH_JEMALLOC)
+    message(FATAL "UBSAN does not work well with JeMalloc")
+  endif()
+endif()
+
+option(WITH_NUMA "build with NUMA policy support" OFF)
+if(WITH_NUMA)
+  find_package(NUMA REQUIRED)
+  add_definitions(-DNUMA)
+  include_directories(${NUMA_INCLUDE_DIR})
+  list(APPEND THIRDPARTY_LIBS NUMA::NUMA)
+endif()
+
+option(WITH_TBB "build with Threading Building Blocks (TBB)" OFF)
+if(WITH_TBB)
+  find_package(TBB REQUIRED)
+  add_definitions(-DTBB)
+  list(APPEND THIRDPARTY_LIBS TBB::TBB)
+endif()
+
+# Stall notifications eat some performance from inserts
+option(DISABLE_STALL_NOTIF "Build with stall notifications" OFF)
+if(DISABLE_STALL_NOTIF)
+  add_definitions(-DROCKSDB_DISABLE_STALL_NOTIFICATION)
+endif()
+
+option(WITH_DYNAMIC_EXTENSION "build with dynamic extension support" OFF)
+if(NOT WITH_DYNAMIC_EXTENSION)
+  add_definitions(-DROCKSDB_NO_DYNAMIC_EXTENSION)
+endif()
+
+option(ASSERT_STATUS_CHECKED "build with assert status checked" OFF)
+if (ASSERT_STATUS_CHECKED)
+  message(STATUS "Build with assert status checked")
+  add_definitions(-DROCKSDB_ASSERT_STATUS_CHECKED)
+endif()
+
+
+# RTTI is by default AUTO which enables it in debug and disables it in release.
+set(USE_RTTI AUTO CACHE STRING "Enable RTTI in builds")
+set_property(CACHE USE_RTTI PROPERTY STRINGS AUTO ON OFF)
+if(USE_RTTI STREQUAL "AUTO")
+  message(STATUS "Enabling RTTI in Debug builds only (default)")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DROCKSDB_USE_RTTI")
+  if(MSVC)
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GR-")
+  else()
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fno-rtti")
+  endif()
+elseif(USE_RTTI)
+  message(STATUS "Enabling RTTI in all builds")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DROCKSDB_USE_RTTI")
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DROCKSDB_USE_RTTI")
+else()
+  if(MSVC)
+    message(STATUS "Disabling RTTI in Release builds. Always on in Debug.")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DROCKSDB_USE_RTTI")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GR-")
+  else()
+    message(STATUS "Disabling RTTI in all builds")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-rtti")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fno-rtti")
+  endif()
+endif()
+
+# Used to run CI build and tests so we can run faster
+option(OPTDBG "Build optimized debug build with MSVC" OFF)
+option(WITH_RUNTIME_DEBUG "build with debug version of runtime library" ON)
+if(MSVC)
+  if(OPTDBG)
+    message(STATUS "Debug optimization is enabled")
+    set(CMAKE_CXX_FLAGS_DEBUG "/Oxt")
+  else()
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1")
+
+    # Minimal Build is deprecated after MSVC 2015
+    if( MSVC_VERSION GREATER 1900 )
+      set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm-")
+    else()
+      set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm")
+    endif()
+
+  endif()
+  if(WITH_RUNTIME_DEBUG)
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}d")
+  else()
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}")
+  endif()
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /${RUNTIME_LIBRARY}")
+
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG")
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-memcmp")
+endif()
+
+option(ROCKSDB_LITE "Build RocksDBLite version" OFF)
+if(ROCKSDB_LITE)
+  add_definitions(-DROCKSDB_LITE)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -Os")
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Cygwin")
+  add_definitions(-fno-builtin-memcmp -DCYGWIN)
+elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin")
+  add_definitions(-DOS_MACOSX)
+elseif(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  add_definitions(-DOS_LINUX)
+elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS")
+  add_definitions(-DOS_SOLARIS)
+elseif(CMAKE_SYSTEM_NAME MATCHES "kFreeBSD")
+  add_definitions(-DOS_GNU_KFREEBSD)
+elseif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+  add_definitions(-DOS_FREEBSD)
+elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+  add_definitions(-DOS_NETBSD)
+elseif(CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+  add_definitions(-DOS_OPENBSD)
+elseif(CMAKE_SYSTEM_NAME MATCHES "DragonFly")
+  add_definitions(-DOS_DRAGONFLYBSD)
+elseif(CMAKE_SYSTEM_NAME MATCHES "Android")
+  add_definitions(-DOS_ANDROID)
+elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
+  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DNOMINMAX)
+  if(MINGW)
+    add_definitions(-D_WIN32_WINNT=_WIN32_WINNT_VISTA)
+  endif()
+endif()
+
+if(NOT WIN32)
+  add_definitions(-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX)
+endif()
+
+option(WITH_FALLOCATE "build with fallocate" ON)
+if(WITH_FALLOCATE)
+  CHECK_CXX_SOURCE_COMPILES("
+#include <fcntl.h>
+#include <linux/falloc.h>
+int main() {
+ int fd = open(\"/dev/null\", 0);
+ fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1024);
+}
+" HAVE_FALLOCATE)
+  if(HAVE_FALLOCATE)
+    add_definitions(-DROCKSDB_FALLOCATE_PRESENT)
+  endif()
+endif()
+
+CHECK_CXX_SOURCE_COMPILES("
+#include <fcntl.h>
+int main() {
+  int fd = open(\"/dev/null\", 0);
+  sync_file_range(fd, 0, 1024, SYNC_FILE_RANGE_WRITE);
+}
+" HAVE_SYNC_FILE_RANGE_WRITE)
+if(HAVE_SYNC_FILE_RANGE_WRITE)
+  add_definitions(-DROCKSDB_RANGESYNC_PRESENT)
+endif()
+
+CHECK_CXX_SOURCE_COMPILES("
+#include <pthread.h>
+int main() {
+  (void) PTHREAD_MUTEX_ADAPTIVE_NP;
+}
+" HAVE_PTHREAD_MUTEX_ADAPTIVE_NP)
+if(HAVE_PTHREAD_MUTEX_ADAPTIVE_NP)
+  add_definitions(-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX)
+endif()
+
+include(CheckCXXSymbolExists)
+if(CMAKE_SYSTEM_NAME MATCHES "^FreeBSD")
+  check_cxx_symbol_exists(malloc_usable_size malloc_np.h HAVE_MALLOC_USABLE_SIZE)
+else()
+  check_cxx_symbol_exists(malloc_usable_size malloc.h HAVE_MALLOC_USABLE_SIZE)
+endif()
+if(HAVE_MALLOC_USABLE_SIZE)
+  add_definitions(-DROCKSDB_MALLOC_USABLE_SIZE)
+endif()
+
+check_cxx_symbol_exists(sched_getcpu sched.h HAVE_SCHED_GETCPU)
+if(HAVE_SCHED_GETCPU)
+  add_definitions(-DROCKSDB_SCHED_GETCPU_PRESENT)
+endif()
+
+check_cxx_symbol_exists(getauxval auvx.h HAVE_AUXV_GETAUXVAL)
+if(HAVE_AUXV_GETAUXVAL)
+  add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT)
+endif()
+
+check_cxx_symbol_exists(F_FULLFSYNC "fcntl.h" HAVE_FULLFSYNC)
+if(HAVE_FULLFSYNC)
+  add_definitions(-DHAVE_FULLFSYNC)
+endif()
+
+include_directories(${PROJECT_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR}/include)
+
+if(USE_COROUTINES)
+  if(USE_FOLLY OR USE_FOLLY_LITE)
+    message(FATAL_ERROR "Please specify exactly one of USE_COROUTINES,"
+    " USE_FOLLY, and USE_FOLLY_LITE")
+  endif()
+  set(CMAKE_CXX_STANDARD 20)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcoroutines -Wno-maybe-uninitialized")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-redundant-move")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-memory-model")
+  add_compile_definitions(USE_COROUTINES)
+  set(USE_FOLLY 1)
+endif()
+
+if(USE_FOLLY)
+  if(USE_FOLLY_LITE)
+    message(FATAL_ERROR "Please specify one of USE_FOLLY or USE_FOLLY_LITE")
+  endif()
+  if(ROCKSDB_BUILD_SHARED)
+    message(FATAL_ERROR "Cannot build RocksDB shared library with folly")
+  endif()
+  set(ROCKSDB_BUILD_SHARED OFF)
+  set(GFLAGS_SHARED FALSE)
+  find_package(folly)
+  # If cmake could not find the folly-config.cmake file, fall back
+  # to looking in third-party/folly for folly and its dependencies
+  if(NOT FOLLY_LIBRARIES)
+    exec_program(python3 ${PROJECT_SOURCE_DIR}/third-party/folly ARGS
+    build/fbcode_builder/getdeps.py show-inst-dir OUTPUT_VARIABLE
+    FOLLY_INST_PATH)
+    exec_program(ls ARGS -d ${FOLLY_INST_PATH}/../boost* OUTPUT_VARIABLE
+    BOOST_INST_PATH)
+    exec_program(ls ARGS -d ${FOLLY_INST_PATH}/../fmt* OUTPUT_VARIABLE
+    FMT_INST_PATH)
+    exec_program(ls ARGS -d ${FOLLY_INST_PATH}/../gflags* OUTPUT_VARIABLE
+    GFLAGS_INST_PATH)
+    set(Boost_DIR ${BOOST_INST_PATH}/lib/cmake/Boost-1.78.0)
+    if(EXISTS ${FMT_INST_PATH}/lib64)
+      set(fmt_DIR ${FMT_INST_PATH}/lib64/cmake/fmt)
+    else()
+      set(fmt_DIR ${FMT_INST_PATH}/lib/cmake/fmt)
+    endif()
+    set(gflags_DIR ${GFLAGS_INST_PATH}/lib/cmake/gflags)
+
+    exec_program(sed ARGS -i 's/gflags_shared//g'
+    ${FOLLY_INST_PATH}/lib/cmake/folly/folly-targets.cmake)
+
+    include(${FOLLY_INST_PATH}/lib/cmake/folly/folly-config.cmake)
+  endif()
+
+  add_compile_definitions(USE_FOLLY FOLLY_NO_CONFIG HAVE_CXX11_ATOMIC)
+  list(APPEND THIRDPARTY_LIBS Folly::folly)
+  set(FOLLY_LIBS Folly::folly)
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--copy-dt-needed-entries")
+endif()
+find_package(Threads REQUIRED)
+
+# Main library source code
+
+set(SOURCES
+        cache/cache.cc
+        cache/cache_entry_roles.cc
+        cache/cache_key.cc
+        cache/cache_reservation_manager.cc
+        cache/charged_cache.cc
+        cache/clock_cache.cc
+        cache/compressed_secondary_cache.cc
+        cache/lru_cache.cc
+        cache/secondary_cache.cc
+        cache/sharded_cache.cc
+        db/arena_wrapped_db_iter.cc
+        db/blob/blob_contents.cc
+        db/blob/blob_fetcher.cc
+        db/blob/blob_file_addition.cc
+        db/blob/blob_file_builder.cc
+        db/blob/blob_file_cache.cc
+        db/blob/blob_file_garbage.cc
+        db/blob/blob_file_meta.cc
+        db/blob/blob_file_reader.cc
+        db/blob/blob_garbage_meter.cc
+        db/blob/blob_log_format.cc
+        db/blob/blob_log_sequential_reader.cc
+        db/blob/blob_log_writer.cc
+        db/blob/blob_source.cc
+        db/blob/prefetch_buffer_collection.cc
+        db/builder.cc
+        db/c.cc
+        db/column_family.cc
+        db/compaction/compaction.cc
+        db/compaction/compaction_iterator.cc
+        db/compaction/compaction_picker.cc
+        db/compaction/compaction_job.cc
+        db/compaction/compaction_picker_fifo.cc
+        db/compaction/compaction_picker_level.cc
+        db/compaction/compaction_picker_universal.cc
+        db/compaction/compaction_service_job.cc
+        db/compaction/compaction_state.cc
+        db/compaction/compaction_outputs.cc
+        db/compaction/sst_partitioner.cc
+        db/compaction/subcompaction_state.cc
+        db/convenience.cc
+        db/db_filesnapshot.cc
+        db/db_impl/compacted_db_impl.cc
+        db/db_impl/db_impl.cc
+        db/db_impl/db_impl_write.cc
+        db/db_impl/db_impl_compaction_flush.cc
+        db/db_impl/db_impl_files.cc
+        db/db_impl/db_impl_open.cc
+        db/db_impl/db_impl_debug.cc
+        db/db_impl/db_impl_experimental.cc
+        db/db_impl/db_impl_readonly.cc
+        db/db_impl/db_impl_secondary.cc
+        db/db_info_dumper.cc
+        db/db_iter.cc
+        db/dbformat.cc
+        db/error_handler.cc
+        db/event_helpers.cc
+        db/experimental.cc
+        db/external_sst_file_ingestion_job.cc
+        db/file_indexer.cc
+        db/flush_job.cc
+        db/flush_scheduler.cc
+        db/forward_iterator.cc
+        db/import_column_family_job.cc
+        db/internal_stats.cc
+        db/logs_with_prep_tracker.cc
+        db/log_reader.cc
+        db/log_writer.cc
+        db/malloc_stats.cc
+        db/memtable.cc
+        db/memtable_list.cc
+        db/merge_helper.cc
+        db/merge_operator.cc
+        db/output_validator.cc
+        db/periodic_task_scheduler.cc
+        db/range_del_aggregator.cc
+        db/range_tombstone_fragmenter.cc
+        db/repair.cc
+        db/seqno_to_time_mapping.cc
+        db/snapshot_impl.cc
+        db/table_cache.cc
+        db/table_properties_collector.cc
+        db/transaction_log_impl.cc
+        db/trim_history_scheduler.cc
+        db/version_builder.cc
+        db/version_edit.cc
+        db/version_edit_handler.cc
+        db/version_set.cc
+        db/wal_edit.cc
+        db/wal_manager.cc
+        db/wide/wide_column_serialization.cc
+        db/wide/wide_columns.cc
+        db/write_batch.cc
+        db/write_batch_base.cc
+        db/write_controller.cc
+        db/write_thread.cc
+        env/composite_env.cc
+        env/env.cc
+        env/env_chroot.cc
+        env/env_encryption.cc
+        env/file_system.cc
+        env/file_system_tracer.cc
+        env/fs_remap.cc
+        env/mock_env.cc
+        env/unique_id_gen.cc
+        file/delete_scheduler.cc
+        file/file_prefetch_buffer.cc
+        file/file_util.cc
+        file/filename.cc
+        file/line_file_reader.cc
+        file/random_access_file_reader.cc
+        file/read_write_util.cc
+        file/readahead_raf.cc
+        file/sequence_file_reader.cc
+        file/sst_file_manager_impl.cc
+        file/writable_file_writer.cc
+        logging/auto_roll_logger.cc
+        logging/event_logger.cc
+        logging/log_buffer.cc
+        memory/arena.cc
+        memory/concurrent_arena.cc
+        memory/jemalloc_nodump_allocator.cc
+        memory/memkind_kmem_allocator.cc
+        memory/memory_allocator.cc
+        memtable/alloc_tracker.cc
+        memtable/hash_linklist_rep.cc
+        memtable/hash_skiplist_rep.cc
+        memtable/skiplistrep.cc
+        memtable/vectorrep.cc
+        memtable/write_buffer_manager.cc
+        monitoring/histogram.cc
+        monitoring/histogram_windowing.cc
+        monitoring/in_memory_stats_history.cc
+        monitoring/instrumented_mutex.cc
+        monitoring/iostats_context.cc
+        monitoring/perf_context.cc
+        monitoring/perf_level.cc
+        monitoring/persistent_stats_history.cc
+        monitoring/statistics.cc
+        monitoring/thread_status_impl.cc
+        monitoring/thread_status_updater.cc
+        monitoring/thread_status_util.cc
+        monitoring/thread_status_util_debug.cc
+        options/cf_options.cc
+        options/configurable.cc
+        options/customizable.cc
+        options/db_options.cc
+        options/options.cc
+        options/options_helper.cc
+        options/options_parser.cc
+        port/stack_trace.cc
+        table/adaptive/adaptive_table_factory.cc
+        table/block_based/binary_search_index_reader.cc
+        table/block_based/block.cc
+        table/block_based/block_based_table_builder.cc
+        table/block_based/block_based_table_factory.cc
+        table/block_based/block_based_table_iterator.cc
+        table/block_based/block_based_table_reader.cc
+        table/block_based/block_builder.cc
+        table/block_based/block_prefetcher.cc
+        table/block_based/block_prefix_index.cc
+        table/block_based/data_block_hash_index.cc
+        table/block_based/data_block_footer.cc
+        table/block_based/filter_block_reader_common.cc
+        table/block_based/filter_policy.cc
+        table/block_based/flush_block_policy.cc
+        table/block_based/full_filter_block.cc
+        table/block_based/hash_index_reader.cc
+        table/block_based/index_builder.cc
+        table/block_based/index_reader_common.cc
+        table/block_based/parsed_full_filter_block.cc
+        table/block_based/partitioned_filter_block.cc
+        table/block_based/partitioned_index_iterator.cc
+        table/block_based/partitioned_index_reader.cc
+        table/block_based/reader_common.cc
+        table/block_based/uncompression_dict_reader.cc
+        table/block_fetcher.cc
+        table/cuckoo/cuckoo_table_builder.cc
+        table/cuckoo/cuckoo_table_factory.cc
+        table/cuckoo/cuckoo_table_reader.cc
+        table/format.cc
+        table/get_context.cc
+        table/iterator.cc
+        table/merging_iterator.cc
+        table/meta_blocks.cc
+        table/persistent_cache_helper.cc
+        table/plain/plain_table_bloom.cc
+        table/plain/plain_table_builder.cc
+        table/plain/plain_table_factory.cc
+        table/plain/plain_table_index.cc
+        table/plain/plain_table_key_coding.cc
+        table/plain/plain_table_reader.cc
+        table/sst_file_dumper.cc
+        table/sst_file_reader.cc
+        table/sst_file_writer.cc
+        table/table_factory.cc
+        table/table_properties.cc
+        table/two_level_iterator.cc
+        table/unique_id.cc
+        test_util/sync_point.cc
+        test_util/sync_point_impl.cc
+        test_util/testutil.cc
+        test_util/transaction_test_util.cc
+        tools/block_cache_analyzer/block_cache_trace_analyzer.cc
+        tools/dump/db_dump_tool.cc
+        tools/io_tracer_parser_tool.cc
+        tools/ldb_cmd.cc
+        tools/ldb_tool.cc
+        tools/sst_dump_tool.cc
+        tools/trace_analyzer_tool.cc
+        trace_replay/block_cache_tracer.cc
+        trace_replay/io_tracer.cc
+        trace_replay/trace_record_handler.cc
+        trace_replay/trace_record_result.cc
+        trace_replay/trace_record.cc
+        trace_replay/trace_replay.cc
+        util/async_file_reader.cc
+        util/cleanable.cc
+        util/coding.cc
+        util/compaction_job_stats_impl.cc
+        util/comparator.cc
+        util/compression.cc
+        util/compression_context_cache.cc
+        util/concurrent_task_limiter_impl.cc
+        util/crc32c.cc
+        util/dynamic_bloom.cc
+        util/hash.cc
+        util/murmurhash.cc
+        util/random.cc
+        util/rate_limiter.cc
+        util/ribbon_config.cc
+        util/slice.cc
+        util/file_checksum_helper.cc
+        util/status.cc
+        util/stderr_logger.cc
+        util/string_util.cc
+        util/thread_local.cc
+        util/threadpool_imp.cc
+        util/xxhash.cc
+        utilities/agg_merge/agg_merge.cc
+        utilities/backup/backup_engine.cc
+        utilities/blob_db/blob_compaction_filter.cc
+        utilities/blob_db/blob_db.cc
+        utilities/blob_db/blob_db_impl.cc
+        utilities/blob_db/blob_db_impl_filesnapshot.cc
+        utilities/blob_db/blob_dump_tool.cc
+        utilities/blob_db/blob_file.cc
+        utilities/cache_dump_load.cc
+        utilities/cache_dump_load_impl.cc
+        utilities/cassandra/cassandra_compaction_filter.cc
+        utilities/cassandra/format.cc
+        utilities/cassandra/merge_operator.cc
+        utilities/checkpoint/checkpoint_impl.cc
+        utilities/compaction_filters.cc
+        utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
+        utilities/counted_fs.cc
+        utilities/debug.cc
+        utilities/env_mirror.cc
+        utilities/env_timed.cc
+        utilities/fault_injection_env.cc
+        utilities/fault_injection_fs.cc
+        utilities/fault_injection_secondary_cache.cc
+        utilities/leveldb_options/leveldb_options.cc
+        utilities/memory/memory_util.cc
+        utilities/merge_operators.cc
+        utilities/merge_operators/bytesxor.cc
+        utilities/merge_operators/max.cc
+        utilities/merge_operators/put.cc
+        utilities/merge_operators/sortlist.cc
+        utilities/merge_operators/string_append/stringappend.cc
+        utilities/merge_operators/string_append/stringappend2.cc
+        utilities/merge_operators/uint64add.cc
+        utilities/object_registry.cc
+        utilities/option_change_migration/option_change_migration.cc
+        utilities/options/options_util.cc
+        utilities/persistent_cache/block_cache_tier.cc
+        utilities/persistent_cache/block_cache_tier_file.cc
+        utilities/persistent_cache/block_cache_tier_metadata.cc
+        utilities/persistent_cache/persistent_cache_tier.cc
+        utilities/persistent_cache/volatile_tier_impl.cc
+        utilities/simulator_cache/cache_simulator.cc
+        utilities/simulator_cache/sim_cache.cc
+        utilities/table_properties_collectors/compact_on_deletion_collector.cc
+        utilities/trace/file_trace_reader_writer.cc
+        utilities/trace/replayer_impl.cc
+        utilities/transactions/lock/lock_manager.cc
+        utilities/transactions/lock/point/point_lock_tracker.cc
+        utilities/transactions/lock/point/point_lock_manager.cc
+        utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
+        utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
+        utilities/transactions/optimistic_transaction_db_impl.cc
+        utilities/transactions/optimistic_transaction.cc
+        utilities/transactions/pessimistic_transaction.cc
+        utilities/transactions/pessimistic_transaction_db.cc
+        utilities/transactions/snapshot_checker.cc
+        utilities/transactions/transaction_base.cc
+        utilities/transactions/transaction_db_mutex_impl.cc
+        utilities/transactions/transaction_util.cc
+        utilities/transactions/write_prepared_txn.cc
+        utilities/transactions/write_prepared_txn_db.cc
+        utilities/transactions/write_unprepared_txn.cc
+        utilities/transactions/write_unprepared_txn_db.cc
+        utilities/ttl/db_ttl_impl.cc
+        utilities/wal_filter.cc
+        utilities/write_batch_with_index/write_batch_with_index.cc
+        utilities/write_batch_with_index/write_batch_with_index_internal.cc)
+
+list(APPEND SOURCES
+  utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc
+  utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc
+  utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc
+  utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc
+  utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc
+  utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc
+  utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc
+  utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc
+  utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc
+  utilities/transactions/lock/range/range_tree/lib/standalone_port.cc
+  utilities/transactions/lock/range/range_tree/lib/util/dbt.cc
+  utilities/transactions/lock/range/range_tree/lib/util/memarena.cc)
+
+message(STATUS "ROCKSDB_PLUGINS: ${ROCKSDB_PLUGINS}")
+if ( ROCKSDB_PLUGINS )
+  string(REPLACE " " ";" PLUGINS ${ROCKSDB_PLUGINS})
+  foreach (plugin ${PLUGINS})
+    add_subdirectory("plugin/${plugin}")
+    foreach (src ${${plugin}_SOURCES})
+      list(APPEND SOURCES plugin/${plugin}/${src})
+      set_source_files_properties(
+        plugin/${plugin}/${src}
+        PROPERTIES COMPILE_FLAGS "${${plugin}_COMPILE_FLAGS}")
+    endforeach()
+    foreach (path ${${plugin}_INCLUDE_PATHS})
+      include_directories(${path})
+    endforeach()
+    foreach (lib ${${plugin}_LIBS})
+      list(APPEND THIRDPARTY_LIBS ${lib})
+    endforeach()
+    foreach (link_path ${${plugin}_LINK_PATHS})
+      link_directories(AFTER ${link_path})
+    endforeach()
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${${plugin}_CMAKE_SHARED_LINKER_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${${plugin}_CMAKE_EXE_LINKER_FLAGS}")
+  endforeach()
+endif()
+
+if(HAVE_SSE42 AND NOT MSVC)
+  set_source_files_properties(
+    util/crc32c.cc
+    PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
+endif()
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
+  list(APPEND SOURCES
+    util/crc32c_ppc.c
+    util/crc32c_ppc_asm.S)
+endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
+
+if(HAS_ARMV8_CRC)
+  list(APPEND SOURCES
+    util/crc32c_arm64.cc)
+endif(HAS_ARMV8_CRC)
+
+if(WIN32)
+  list(APPEND SOURCES
+    port/win/io_win.cc
+    port/win/env_win.cc
+    port/win/env_default.cc
+    port/win/port_win.cc
+    port/win/win_logger.cc
+    port/win/win_thread.cc)
+if(WITH_XPRESS)
+  list(APPEND SOURCES
+    port/win/xpress_win.cc)
+endif()
+
+if(WITH_JEMALLOC)
+  list(APPEND SOURCES
+    port/win/win_jemalloc.cc)
+endif()
+
+else()
+  list(APPEND SOURCES
+    port/port_posix.cc
+    env/env_posix.cc
+    env/fs_posix.cc
+    env/io_posix.cc)
+endif()
+
+if(USE_FOLLY_LITE)
+  list(APPEND SOURCES
+    third-party/folly/folly/container/detail/F14Table.cpp
+    third-party/folly/folly/detail/Futex.cpp
+    third-party/folly/folly/lang/SafeAssert.cpp
+    third-party/folly/folly/lang/ToAscii.cpp
+    third-party/folly/folly/ScopeGuard.cpp
+    third-party/folly/folly/synchronization/AtomicNotification.cpp
+    third-party/folly/folly/synchronization/DistributedMutex.cpp
+    third-party/folly/folly/synchronization/ParkingLot.cpp)
+  include_directories(${PROJECT_SOURCE_DIR}/third-party/folly)
+  add_definitions(-DUSE_FOLLY -DFOLLY_NO_CONFIG)
+  list(APPEND THIRDPARTY_LIBS glog)
+endif()
+
+set(ROCKSDB_STATIC_LIB rocksdb${ARTIFACT_SUFFIX})
+set(ROCKSDB_SHARED_LIB rocksdb-shared${ARTIFACT_SUFFIX})
+
+
+if(WIN32)
+  set(SYSTEM_LIBS ${SYSTEM_LIBS} shlwapi.lib rpcrt4.lib)
+else()
+  set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT})
+endif()
+
+set(ROCKSDB_PLUGIN_EXTERNS "")
+set(ROCKSDB_PLUGIN_BUILTINS "")
+message(STATUS "ROCKSDB PLUGINS TO BUILD ${ROCKSDB_PLUGINS}")
+foreach(PLUGIN IN LISTS PLUGINS)
+  set(PLUGIN_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/plugin/${PLUGIN}/")
+  message(STATUS "PLUGIN ${PLUGIN} including rocksb plugin ${PLUGIN_ROOT}")
+  set(PLUGINMKFILE "${PLUGIN_ROOT}${PLUGIN}.mk")
+  if (NOT EXISTS ${PLUGINMKFILE})
+    message(FATAL_ERROR "PLUGIN ${PLUGIN} Missing plugin makefile: ${PLUGINMKFILE}")
+  endif()
+  file(READ ${PLUGINMKFILE} PLUGINMK)
+
+  string(REGEX MATCH "SOURCES = ([^\n]*)" FOO ${PLUGINMK})
+  set(MK_SOURCES ${CMAKE_MATCH_1})
+  separate_arguments(MK_SOURCES)
+  foreach(MK_FILE IN LISTS MK_SOURCES)
+    list(APPEND SOURCES "${PLUGIN_ROOT}${MK_FILE}")
+    message(STATUS "PLUGIN ${PLUGIN} Appending ${PLUGIN_ROOT}${MK_FILE} to SOURCES")
+  endforeach()
+
+  string(REGEX MATCH "_FUNC = ([^\n]*)" FOO ${PLUGINMK})
+  if (NOT ${CMAKE_MATCH_1} STREQUAL "")
+    string(APPEND ROCKSDB_PLUGIN_BUILTINS "{\"${PLUGIN}\", " ${CMAKE_MATCH_1} "},")
+    string(APPEND ROCKSDB_PLUGIN_EXTERNS "int " ${CMAKE_MATCH_1} "(ROCKSDB_NAMESPACE::ObjectLibrary&, const std::string&); ")
+  endif()
+
+  string(REGEX MATCH "_LIBS = ([^\n]*)" FOO ${PLUGINMK})
+  separate_arguments(CMAKE_MATCH_1)
+  foreach(MK_LIB IN LISTS CMAKE_MATCH_1)
+    list(APPEND THIRDPARTY_LIBS "${MK_LIB}")
+  endforeach()
+  message(STATUS "PLUGIN ${PLUGIN} THIRDPARTY_LIBS=${THIRDPARTY_LIBS}")
+
+  #TODO: We need to set any compile/link-time flags and add any link libraries
+endforeach()
+
+string(TIMESTAMP TS "%Y-%m-%d %H:%M:%S" UTC)
+set(BUILD_DATE "${TS}" CACHE STRING "the time we first built rocksdb")
+
+find_package(Git)
+
+if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
+  execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_SHA COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD )
+  execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" RESULT_VARIABLE GIT_MOD COMMAND "${GIT_EXECUTABLE}" diff-index HEAD --quiet)
+  execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_DATE COMMAND "${GIT_EXECUTABLE}" log -1 --date=format:"%Y-%m-%d %T" --format="%ad")
+  execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG RESULT_VARIABLE rv COMMAND "${GIT_EXECUTABLE}" symbolic-ref -q --short HEAD OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if (rv AND NOT rv EQUAL 0)
+    execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG COMMAND "${GIT_EXECUTABLE}" describe --tags --exact-match OUTPUT_STRIP_TRAILING_WHITESPACE)
+  endif()
+else()
+  set(GIT_SHA 0)
+  set(GIT_MOD 1)
+endif()
+string(REGEX REPLACE "[^0-9a-fA-F]+" "" GIT_SHA "${GIT_SHA}")
+string(REGEX REPLACE "[^0-9: /-]+" "" GIT_DATE "${GIT_DATE}")
+
+set(BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/build_version.cc)
+configure_file(util/build_version.cc.in ${BUILD_VERSION_CC} @ONLY)
+
+add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES} ${BUILD_VERSION_CC})
+target_link_libraries(${ROCKSDB_STATIC_LIB} PRIVATE
+  ${THIRDPARTY_LIBS} ${SYSTEM_LIBS})
+
+if(ROCKSDB_BUILD_SHARED)
+  add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES} ${BUILD_VERSION_CC})
+  target_link_libraries(${ROCKSDB_SHARED_LIB} PRIVATE
+    ${THIRDPARTY_LIBS} ${SYSTEM_LIBS})
+
+  if(WIN32)
+    set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES
+      COMPILE_DEFINITIONS "ROCKSDB_DLL;ROCKSDB_LIBRARY_EXPORTS")
+    if(MSVC)
+      set_target_properties(${ROCKSDB_STATIC_LIB} PROPERTIES
+        COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/${ROCKSDB_STATIC_LIB}.pdb")
+      set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES
+        COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/${ROCKSDB_SHARED_LIB}.pdb")
+    endif()
+  else()
+    set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES
+                          LINKER_LANGUAGE CXX
+                          VERSION ${rocksdb_VERSION}
+                          SOVERSION ${rocksdb_VERSION_MAJOR}
+                          OUTPUT_NAME "rocksdb${ARTIFACT_SUFFIX}")
+  endif()
+endif()
+
+if(ROCKSDB_BUILD_SHARED AND NOT WIN32)
+  set(ROCKSDB_LIB ${ROCKSDB_SHARED_LIB})
+else()
+  set(ROCKSDB_LIB ${ROCKSDB_STATIC_LIB})
+endif()
+
+option(WITH_JNI "build with JNI" OFF)
+# Tests are excluded from Release builds
+CMAKE_DEPENDENT_OPTION(WITH_TESTS "build with tests" ON
+  "CMAKE_BUILD_TYPE STREQUAL Debug" OFF)
+option(WITH_BENCHMARK_TOOLS "build with benchmarks" ON)
+option(WITH_CORE_TOOLS "build with ldb and sst_dump" ON)
+option(WITH_TOOLS "build with tools" ON)
+
+if(WITH_TESTS OR WITH_BENCHMARK_TOOLS OR WITH_TOOLS OR WITH_JNI OR JNI)
+  include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.8.1/fused-src)
+endif()
+if(WITH_JNI OR JNI)
+  message(STATUS "JNI library is enabled")
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/java)
+else()
+  message(STATUS "JNI library is disabled")
+endif()
+
+# Installation and packaging
+if(WIN32)
+  option(ROCKSDB_INSTALL_ON_WINDOWS "Enable install target on Windows" OFF)
+endif()
+if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS)
+  if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+    if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+      # Change default installation prefix on Linux to /usr
+      set(CMAKE_INSTALL_PREFIX /usr CACHE PATH "Install path prefix, prepended onto install directories." FORCE)
+    endif()
+  endif()
+
+  include(GNUInstallDirs)
+  include(CMakePackageConfigHelpers)
+
+  set(package_config_destination ${CMAKE_INSTALL_LIBDIR}/cmake/rocksdb)
+
+  configure_package_config_file(
+    ${CMAKE_CURRENT_LIST_DIR}/cmake/RocksDBConfig.cmake.in RocksDBConfig.cmake
+    INSTALL_DESTINATION ${package_config_destination}
+  )
+
+  write_basic_package_version_file(
+    RocksDBConfigVersion.cmake
+    VERSION ${rocksdb_VERSION}
+    COMPATIBILITY SameMajorVersion
+  )
+
+  configure_file(
+    ${PROJECT_NAME}.pc.in
+    ${PROJECT_NAME}.pc
+    @ONLY
+  )
+
+  install(DIRECTORY include/rocksdb COMPONENT devel DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+
+  foreach (plugin ${PLUGINS})
+    foreach (header ${${plugin}_HEADERS})
+      install(FILES plugin/${plugin}/${header} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocksdb/plugin/${plugin})
+    endforeach()
+  endforeach()
+
+  install(DIRECTORY "${PROJECT_SOURCE_DIR}/cmake/modules" COMPONENT devel DESTINATION ${package_config_destination})
+
+  install(
+    TARGETS ${ROCKSDB_STATIC_LIB}
+    EXPORT RocksDBTargets
+    COMPONENT devel
+    ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+    INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+  )
+
+  if(ROCKSDB_BUILD_SHARED)
+    install(
+      TARGETS ${ROCKSDB_SHARED_LIB}
+      EXPORT RocksDBTargets
+      COMPONENT runtime
+      ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+      RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+      LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+      INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    )
+  endif()
+
+  install(
+    EXPORT RocksDBTargets
+    COMPONENT devel
+    DESTINATION ${package_config_destination}
+    NAMESPACE RocksDB::
+  )
+
+  install(
+    FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/RocksDBConfig.cmake
+    ${CMAKE_CURRENT_BINARY_DIR}/RocksDBConfigVersion.cmake
+    COMPONENT devel
+    DESTINATION ${package_config_destination}
+  )
+
+  install(
+    FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc
+    COMPONENT devel
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig
+  )
+endif()
+
+option(WITH_ALL_TESTS "Build all test, rather than a small subset" ON)
+
+if(WITH_TESTS OR WITH_BENCHMARK_TOOLS)
+  add_subdirectory(third-party/gtest-1.8.1/fused-src/gtest)
+  add_library(testharness STATIC
+  test_util/mock_time_env.cc
+  test_util/testharness.cc)
+  target_link_libraries(testharness gtest)
+endif()
+
+if(WITH_TESTS)
+  set(TESTS
+        db/db_basic_test.cc
+        env/env_basic_test.cc
+  )
+  if(WITH_ALL_TESTS)
+    list(APPEND TESTS
+        cache/cache_reservation_manager_test.cc
+        cache/cache_test.cc
+        cache/compressed_secondary_cache_test.cc
+        cache/lru_cache_test.cc
+        db/blob/blob_counting_iterator_test.cc
+        db/blob/blob_file_addition_test.cc
+        db/blob/blob_file_builder_test.cc
+        db/blob/blob_file_cache_test.cc
+        db/blob/blob_file_garbage_test.cc
+        db/blob/blob_file_reader_test.cc
+        db/blob/blob_garbage_meter_test.cc
+        db/blob/blob_source_test.cc
+        db/blob/db_blob_basic_test.cc
+        db/blob/db_blob_compaction_test.cc
+        db/blob/db_blob_corruption_test.cc
+        db/blob/db_blob_index_test.cc
+        db/column_family_test.cc
+        db/compact_files_test.cc
+        db/compaction/clipping_iterator_test.cc
+        db/compaction/compaction_job_stats_test.cc
+        db/compaction/compaction_job_test.cc
+        db/compaction/compaction_iterator_test.cc
+        db/compaction/compaction_picker_test.cc
+        db/compaction/compaction_service_test.cc
+        db/compaction/tiered_compaction_test.cc
+        db/comparator_db_test.cc
+        db/corruption_test.cc
+        db/cuckoo_table_db_test.cc
+        db/db_readonly_with_timestamp_test.cc
+        db/db_with_timestamp_basic_test.cc
+        db/db_block_cache_test.cc
+        db/db_bloom_filter_test.cc
+        db/db_compaction_filter_test.cc
+        db/db_compaction_test.cc
+        db/db_dynamic_level_test.cc
+        db/db_encryption_test.cc
+        db/db_flush_test.cc
+        db/db_inplace_update_test.cc
+        db/db_io_failure_test.cc
+        db/db_iter_test.cc
+        db/db_iter_stress_test.cc
+        db/db_iterator_test.cc
+        db/db_kv_checksum_test.cc
+        db/db_log_iter_test.cc
+        db/db_memtable_test.cc
+        db/db_merge_operator_test.cc
+        db/db_merge_operand_test.cc
+        db/db_options_test.cc
+        db/db_properties_test.cc
+        db/db_range_del_test.cc
+        db/db_rate_limiter_test.cc
+        db/db_secondary_test.cc
+        db/db_sst_test.cc
+        db/db_statistics_test.cc
+        db/db_table_properties_test.cc
+        db/db_tailing_iter_test.cc
+        db/db_test.cc
+        db/db_test2.cc
+        db/db_logical_block_size_cache_test.cc
+        db/db_universal_compaction_test.cc
+        db/db_wal_test.cc
+        db/db_with_timestamp_compaction_test.cc
+        db/db_write_buffer_manager_test.cc
+        db/db_write_test.cc
+        db/dbformat_test.cc
+        db/deletefile_test.cc
+        db/error_handler_fs_test.cc
+        db/obsolete_files_test.cc
+        db/external_sst_file_basic_test.cc
+        db/external_sst_file_test.cc
+        db/fault_injection_test.cc
+        db/file_indexer_test.cc
+        db/filename_test.cc
+        db/flush_job_test.cc
+        db/import_column_family_test.cc
+        db/listener_test.cc
+        db/log_test.cc
+        db/manual_compaction_test.cc
+        db/memtable_list_test.cc
+        db/merge_helper_test.cc
+        db/merge_test.cc
+        db/options_file_test.cc
+        db/perf_context_test.cc
+        db/periodic_task_scheduler_test.cc
+        db/plain_table_db_test.cc
+        db/seqno_time_test.cc
+        db/prefix_test.cc
+        db/range_del_aggregator_test.cc
+        db/range_tombstone_fragmenter_test.cc
+        db/repair_test.cc
+        db/table_properties_collector_test.cc
+        db/version_builder_test.cc
+        db/version_edit_test.cc
+        db/version_set_test.cc
+        db/wal_manager_test.cc
+        db/wal_edit_test.cc
+        db/wide/db_wide_basic_test.cc
+        db/wide/wide_column_serialization_test.cc
+        db/write_batch_test.cc
+        db/write_callback_test.cc
+        db/write_controller_test.cc
+        env/env_test.cc
+        env/io_posix_test.cc
+        env/mock_env_test.cc
+        file/delete_scheduler_test.cc
+        file/prefetch_test.cc
+        file/random_access_file_reader_test.cc
+        logging/auto_roll_logger_test.cc
+        logging/env_logger_test.cc
+        logging/event_logger_test.cc
+        memory/arena_test.cc
+        memory/memory_allocator_test.cc
+        memtable/inlineskiplist_test.cc
+        memtable/skiplist_test.cc
+        memtable/write_buffer_manager_test.cc
+        monitoring/histogram_test.cc
+        monitoring/iostats_context_test.cc
+        monitoring/statistics_test.cc
+        monitoring/stats_history_test.cc
+        options/configurable_test.cc
+        options/customizable_test.cc
+        options/options_settable_test.cc
+        options/options_test.cc
+        table/block_based/block_based_table_reader_test.cc
+        table/block_based/block_test.cc
+        table/block_based/data_block_hash_index_test.cc
+        table/block_based/full_filter_block_test.cc
+        table/block_based/partitioned_filter_block_test.cc
+        table/cleanable_test.cc
+        table/cuckoo/cuckoo_table_builder_test.cc
+        table/cuckoo/cuckoo_table_reader_test.cc
+        table/merger_test.cc
+        table/sst_file_reader_test.cc
+        table/table_test.cc
+        table/block_fetcher_test.cc
+        test_util/testutil_test.cc
+        trace_replay/block_cache_tracer_test.cc
+        trace_replay/io_tracer_test.cc
+        tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
+        tools/io_tracer_parser_test.cc
+        tools/ldb_cmd_test.cc
+        tools/reduce_levels_test.cc
+        tools/sst_dump_test.cc
+        tools/trace_analyzer_test.cc
+        util/autovector_test.cc
+        util/bloom_test.cc
+        util/coding_test.cc
+        util/crc32c_test.cc
+        util/defer_test.cc
+        util/dynamic_bloom_test.cc
+        util/file_reader_writer_test.cc
+        util/filelock_test.cc
+        util/hash_test.cc
+        util/heap_test.cc
+        util/random_test.cc
+        util/rate_limiter_test.cc
+        util/repeatable_thread_test.cc
+        util/ribbon_test.cc
+        util/slice_test.cc
+        util/slice_transform_test.cc
+        util/timer_queue_test.cc
+        util/timer_test.cc
+        util/thread_list_test.cc
+        util/thread_local_test.cc
+        util/work_queue_test.cc
+        utilities/agg_merge/agg_merge_test.cc
+        utilities/backup/backup_engine_test.cc
+        utilities/blob_db/blob_db_test.cc
+        utilities/cassandra/cassandra_functional_test.cc
+        utilities/cassandra/cassandra_format_test.cc
+        utilities/cassandra/cassandra_row_merge_test.cc
+        utilities/cassandra/cassandra_serialize_test.cc
+        utilities/checkpoint/checkpoint_test.cc
+        utilities/env_timed_test.cc
+        utilities/memory/memory_test.cc
+        utilities/merge_operators/string_append/stringappend_test.cc
+        utilities/object_registry_test.cc
+        utilities/option_change_migration/option_change_migration_test.cc
+        utilities/options/options_util_test.cc
+        utilities/persistent_cache/hash_table_test.cc
+        utilities/persistent_cache/persistent_cache_test.cc
+        utilities/simulator_cache/cache_simulator_test.cc
+        utilities/simulator_cache/sim_cache_test.cc
+        utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
+        utilities/transactions/optimistic_transaction_test.cc
+        utilities/transactions/transaction_test.cc
+        utilities/transactions/lock/point/point_lock_manager_test.cc
+        utilities/transactions/write_committed_transaction_ts_test.cc
+        utilities/transactions/write_prepared_transaction_test.cc
+        utilities/transactions/write_unprepared_transaction_test.cc
+        utilities/transactions/lock/range/range_locking_test.cc
+        utilities/transactions/timestamped_snapshot_test.cc
+        utilities/ttl/ttl_test.cc
+        utilities/util_merge_operators_test.cc
+        utilities/write_batch_with_index/write_batch_with_index_test.cc
+    )
+  endif()
+
+  set(TESTUTIL_SOURCE
+      db/db_test_util.cc
+      db/db_with_timestamp_test_util.cc
+      monitoring/thread_status_updater_debug.cc
+      table/mock_table.cc
+      utilities/agg_merge/test_agg_merge.cc
+      utilities/cassandra/test_utils.cc
+  )
+  enable_testing()
+  add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND})
+  set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX})
+  add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE})
+  target_link_libraries(${TESTUTILLIB} ${ROCKSDB_LIB} ${FOLLY_LIBS})
+  if(MSVC)
+    set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/testutillib${ARTIFACT_SUFFIX}.pdb")
+  endif()
+  set_target_properties(${TESTUTILLIB}
+        PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1
+        EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1
+        EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1
+  )
+
+  foreach(sourcefile ${TESTS})
+      get_filename_component(exename ${sourcefile} NAME_WE)
+      add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile})
+      set_target_properties(${exename}${ARTIFACT_SUFFIX}
+        PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1
+        EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1
+        EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1
+        OUTPUT_NAME ${exename}${ARTIFACT_SUFFIX}
+      )
+      target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB})
+      if(NOT "${exename}" MATCHES "db_sanity_test")
+        gtest_discover_tests(${exename} DISCOVERY_TIMEOUT 120)
+        add_dependencies(check ${exename}${ARTIFACT_SUFFIX})
+      endif()
+  endforeach(sourcefile ${TESTS})
+
+  if(WIN32)
+    # C executables must link to a shared object
+    if(ROCKSDB_BUILD_SHARED)
+      set(ROCKSDB_LIB_FOR_C ${ROCKSDB_SHARED_LIB})
+    else()
+      set(ROCKSDB_LIB_FOR_C OFF)
+    endif()
+  else()
+    set(ROCKSDB_LIB_FOR_C ${ROCKSDB_LIB})
+  endif()
+
+  if(ROCKSDB_LIB_FOR_C)
+    set(C_TESTS db/c_test.c)
+    add_executable(c_test db/c_test.c)
+    target_link_libraries(c_test ${ROCKSDB_LIB_FOR_C} testharness)
+    add_test(NAME c_test COMMAND c_test${ARTIFACT_SUFFIX})
+    add_dependencies(check c_test)
+  endif()
+endif()
+
+if(WITH_BENCHMARK_TOOLS)
+  add_executable(db_bench${ARTIFACT_SUFFIX}
+    tools/simulated_hybrid_file_system.cc
+    tools/db_bench.cc
+    tools/db_bench_tool.cc)
+  target_link_libraries(db_bench${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${THIRDPARTY_LIBS})
+
+  add_executable(cache_bench${ARTIFACT_SUFFIX}
+    cache/cache_bench.cc
+    cache/cache_bench_tool.cc)
+  target_link_libraries(cache_bench${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
+
+  add_executable(memtablerep_bench${ARTIFACT_SUFFIX}
+    memtable/memtablerep_bench.cc)
+  target_link_libraries(memtablerep_bench${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
+
+  add_executable(range_del_aggregator_bench${ARTIFACT_SUFFIX}
+    db/range_del_aggregator_bench.cc)
+  target_link_libraries(range_del_aggregator_bench${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
+
+  add_executable(table_reader_bench${ARTIFACT_SUFFIX}
+    table/table_reader_bench.cc)
+  target_link_libraries(table_reader_bench${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} testharness ${GFLAGS_LIB} ${FOLLY_LIBS})
+
+  add_executable(filter_bench${ARTIFACT_SUFFIX}
+    util/filter_bench.cc)
+  target_link_libraries(filter_bench${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
+
+  add_executable(hash_table_bench${ARTIFACT_SUFFIX}
+    utilities/persistent_cache/hash_table_bench.cc)
+  target_link_libraries(hash_table_bench${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
+endif()
+
+option(WITH_TRACE_TOOLS "build with trace tools" ON)
+if(WITH_TRACE_TOOLS)
+  add_executable(block_cache_trace_analyzer${ARTIFACT_SUFFIX}
+    tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc)
+  target_link_libraries(block_cache_trace_analyzer${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
+
+  add_executable(trace_analyzer${ARTIFACT_SUFFIX}
+    tools/trace_analyzer.cc)
+  target_link_libraries(trace_analyzer${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
+
+endif()
+
+if(WITH_CORE_TOOLS OR WITH_TOOLS)
+  add_subdirectory(tools)
+  add_custom_target(core_tools
+    DEPENDS ${core_tool_deps})
+endif()
+
+if(WITH_TOOLS)
+  add_subdirectory(db_stress_tool)
+  add_custom_target(tools
+    DEPENDS ${tool_deps})
+endif()
+
+option(WITH_EXAMPLES "build with examples" OFF)
+if(WITH_EXAMPLES)
+  add_subdirectory(examples)
+endif()
+
+option(WITH_BENCHMARK "build benchmark tests" OFF)
+if(WITH_BENCHMARK)
+  add_subdirectory(${PROJECT_SOURCE_DIR}/microbench/)
+endif()
diff --git a/src/rocksdb/CODE_OF_CONDUCT.md b/src/rocksdb/CODE_OF_CONDUCT.md
new file mode 100644
index 000000000..d1abc700d
--- /dev/null
+++ b/src/rocksdb/CODE_OF_CONDUCT.md
@@ -0,0 +1,77 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at <opensource-conduct@fb.com>. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
+
diff --git a/src/rocksdb/CONTRIBUTING.md b/src/rocksdb/CONTRIBUTING.md
new file mode 100644
index 000000000..190100b42
--- /dev/null
+++ b/src/rocksdb/CONTRIBUTING.md
@@ -0,0 +1,17 @@
+# Contributing to RocksDB
+
+## Code of Conduct
+The code of conduct is described in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md)
+
+## Contributor License Agreement ("CLA")
+
+In order to accept your pull request, we need you to submit a CLA. You
+only need to do this once, so if you've done this for another Facebook
+open source project, you're good to go. If you are submitting a pull
+request for the first time, just let us know that you have completed
+the CLA and we can cross-check with your GitHub username.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+If you prefer to sign a paper copy, we can send you a PDF.  Send us an 
+e-mail or create a new github issue to request the CLA in PDF format.
diff --git a/src/rocksdb/COPYING b/src/rocksdb/COPYING
new file mode 100644
index 000000000..d159169d1
--- /dev/null
+++ b/src/rocksdb/COPYING
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/src/rocksdb/DEFAULT_OPTIONS_HISTORY.md b/src/rocksdb/DEFAULT_OPTIONS_HISTORY.md
new file mode 100644
index 000000000..82c64d523
--- /dev/null
+++ b/src/rocksdb/DEFAULT_OPTIONS_HISTORY.md
@@ -0,0 +1,24 @@
+# RocksDB default options change log (NO LONGER MAINTAINED)
+## Unreleased
+* delayed_write_rate takes the rate given by rate_limiter if not specified.
+
+## 5.2
+* Change the default of delayed slowdown value to 16MB/s and further increase the L0 stop condition to 36 files.
+
+## 5.0 (11/17/2016)
+* Options::allow_concurrent_memtable_write and Options::enable_write_thread_adaptive_yield are now true by default
+* Options.level0_stop_writes_trigger default value changes from 24 to 32.
+
+## 4.8.0 (5/2/2016)
+* options.max_open_files changes from 5000 to -1. It improves performance, but users need to set file descriptor limit to be large enough and watch memory usage for index and bloom filters.
+* options.base_background_compactions changes from max_background_compactions to 1. When users set higher max_background_compactions but the write throughput is not high, the writes are less spiky to disks.
+* options.wal_recovery_mode changes from kTolerateCorruptedTailRecords to kPointInTimeRecovery. Avoid some false positive when file system or hardware reorder the writes for file data and metadata.
+
+## 4.7.0 (4/8/2016)
+* options.write_buffer_size changes from 4MB to 64MB.
+* options.target_file_size_base changes from 2MB to 64MB.
+* options.max_bytes_for_level_base changes from 10MB to 256MB.
+* options.soft_pending_compaction_bytes_limit changes from 0 (disabled) to 64GB.
+* options.hard_pending_compaction_bytes_limit changes from 0 (disabled) to 256GB.
+* table_cache_numshardbits changes from 4 to 6.
+* max_file_opening_threads changes from 1 to 16.
diff --git a/src/rocksdb/DUMP_FORMAT.md b/src/rocksdb/DUMP_FORMAT.md
new file mode 100644
index 000000000..009dabad5
--- /dev/null
+++ b/src/rocksdb/DUMP_FORMAT.md
@@ -0,0 +1,16 @@
+## RocksDB dump format
+
+The version 1 RocksDB dump format is fairly simple:
+
+1) The dump starts with the magic 8 byte identifier "ROCKDUMP"
+
+2) The magic is followed by an 8 byte big-endian version which is 0x00000001.
+
+3) Next are arbitrarily sized chunks of bytes prepended by 4 byte little endian number indicating how large each chunk is.
+
+4) The first chunk is special and is a json string indicating some things about the creation of this dump.  It contains the following keys:
+* database-path: The path of the database this dump was created from.
+* hostname: The hostname of the machine where the dump was created.
+* creation-time: Unix seconds since epoc when this dump was created.
+
+5) Following the info dump the slices paired into are key/value pairs.
diff --git a/src/rocksdb/HISTORY.md b/src/rocksdb/HISTORY.md
new file mode 100644
index 000000000..fd7fa0710
--- /dev/null
+++ b/src/rocksdb/HISTORY.md
@@ -0,0 +1,2266 @@
+# Rocksdb Change Log
+## 7.9.2 (12/21/2022)
+### Bug Fixes
+* Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache.
+
+## 7.9.1 (12/8/2022)
+### Bug Fixes
+* Fixed a regression in iterator where range tombstones after `iterate_upper_bound` is processed.
+* Fixed a memory leak in MultiGet with async_io read option, caused by IO errors during table file open
+
+### Behavior changes
+* Make best-efforts recovery verify SST unique ID before Version construction (#10962)
+
+## 7.9.0 (11/21/2022)
+### Performance Improvements
+* Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).
+
+### Bug Fixes
+* Fix memory corruption error in scans if async_io is enabled. Memory corruption happened if there is IOError while reading the data leading to empty buffer and other buffer already in progress of async read goes again for reading.
+* Fix failed memtable flush retry bug that could cause wrongly ordered updates, which would surface to writers as `Status::Corruption` in case of `force_consistency_checks=true` (default). It affects use cases that enable both parallel flush (`max_background_flushes > 1` or `max_background_jobs >= 8`) and non-default memtable count (`max_write_buffer_number > 2`).
+* Fixed an issue where the `READ_NUM_MERGE_OPERANDS` ticker was not updated when the base key-value or tombstone was read from an SST file.
+* Fixed a memory safety bug when using a SecondaryCache with `block_cache_compressed`. `block_cache_compressed` no longer attempts to use SecondaryCache features.
+* Fixed a regression in scan for async_io. During seek, valid buffers were getting cleared causing a regression.
+* Tiered Storage: fixed excessive keys written to penultimate level in non-debug builds.
+
+### New Features
+* Add basic support for user-defined timestamp to Merge (#10819).
+* Add stats for ReadAsync time spent and async read errors.
+* Basic support for the wide-column data model is now available. Wide-column entities can be stored using the `PutEntity` API, and retrieved using `GetEntity` and the new `columns` API of iterator. For compatibility, the classic APIs `Get` and `MultiGet`, as well as iterator's `value` API return the value of the anonymous default column of wide-column entities; also, `GetEntity` and iterator's `columns` return any plain key-values in the form of an entity which only has the anonymous default column. `Merge` (and `GetMergeOperands`) currently also apply to the default column; any other columns of entities are unaffected by `Merge` operations. Note that some features like compaction filters, transactions, user-defined timestamps, and the SST file writer do not yet support wide-column entities; also, there is currently no `MultiGet`-like API to retrieve multiple entities at once. We plan to gradually close the above gaps and also implement new features like column-level operations (e.g. updating or querying only certain columns of an entity).
+* Marked HyperClockCache as a production-ready alternative to LRUCache for the block cache. HyperClockCache greatly improves hot-path CPU efficiency under high parallel load or high contention, with some documented caveats and limitations. As much as 4.5x higher ops/sec vs. LRUCache has been seen in db_bench under high parallel load.
+* Add periodic diagnostics to info_log (LOG file) for HyperClockCache block cache if performance is degraded by bad `estimated_entry_charge` option.
+
+### Public API Changes
+* Marked `block_cache_compressed` as a deprecated feature. Use SecondaryCache instead.
+* Added a `SecondaryCache::InsertSaved()` API, with default implementation depending on `Insert()`. Some implementations might need to add a custom implementation of `InsertSaved()`. (Details in API comments.)
+
+## 7.8.0 (10/22/2022)
+### New Features
+* `DeleteRange()` now supports user-defined timestamp.
+* Provide support for async_io with tailing iterators when ReadOptions.tailing is enabled during scans.
+* Tiered Storage: allow data moving up from the last level to the penultimate level if the input level is penultimate level or above.
+* Added `DB::Properties::kFastBlockCacheEntryStats`, which is similar to `DB::Properties::kBlockCacheEntryStats`, except returns cached (stale) values in more cases to reduce overhead.
+* FIFO compaction now supports migrating from a multi-level DB via DB::Open(). During the migration phase, FIFO compaction picker will:
+* picks the sst file with the smallest starting key in the bottom-most non-empty level.
+* Note that during the migration phase, the file purge order will only be an approximation of "FIFO" as files in lower-level might sometime contain newer keys than files in upper-level.
+* Added an option `ignore_max_compaction_bytes_for_input` to ignore max_compaction_bytes limit when adding files to be compacted from input level. This should help reduce write amplification. The option is enabled by default.
+* Tiered Storage: allow data moving up from the last level even if it's a last level only compaction, as long as the penultimate level is empty.
+* Add a new option IOOptions.do_not_recurse that can be used by underlying file systems to skip recursing through sub directories and list only files in GetChildren API.
+* Add option `preserve_internal_time_seconds` to preserve the time information for the latest data. Which can be used to determine the age of data when `preclude_last_level_data_seconds` is enabled. The time information is attached with SST in table property `rocksdb.seqno.time.map` which can be parsed by tool ldb or sst_dump.
+
+### Bug Fixes
+* Fix a bug in io_uring_prep_cancel in AbortIO API for posix which expects sqe->addr to match with read request submitted and wrong paramter was being passed.
+* Fixed a regression in iterator performance when the entire DB is a single memtable introduced in #10449. The fix is in #10705 and #10716.
+* Fixed an optimistic transaction validation bug caused by DBImpl::GetLatestSequenceForKey() returning non-latest seq for merge (#10724).
+* Fixed a bug in iterator refresh which could segfault for DeleteRange users (#10739).
+* Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001).
+* Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinniung (#10770).
+* Fixed a bug where RocksDB could be doing compaction endlessly when allow_ingest_behind is true and the bottommost level is not filled (#10767).
+* Fixed a memory safety bug in experimental HyperClockCache (#10768)
+* Fixed some cases where `ldb update_manifest` and `ldb unsafe_remove_sst_file` are not usable because they were requiring the DB files to match the existing manifest state (before updating the manifest to match a desired state).
+
+### Performance Improvements
+* Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files.
+* Improve RoundRobin TTL compaction, which is going to be the same as normal RoundRobin compaction to move the compaction cursor.
+* Fix a small CPU regression caused by a change that UserComparatorWrapper was made Customizable, because Customizable itself has small CPU overhead for initialization.
+
+### Behavior Changes
+* Sanitize min_write_buffer_number_to_merge to 1 if atomic flush is enabled to prevent unexpected data loss when WAL is disabled in a multi-column-family setting (#10773).
+* With periodic stat dumper waits up every options.stats_dump_period_sec seconds, it won't dump stats for a CF if it has no change in the period, unless 7 periods have been skipped.
+* Only periodic stats dumper triggered by options.stats_dump_period_sec will update stats interval. Ones triggered by DB::GetProperty() will not update stats interval and will report based on an interval since the last time stats dump period.
+
+### Public API changes
+* Make kXXH3 checksum the new default, because it is faster on common hardware, especially with kCRC32c affected by a performance bug in some versions of clang (https://github.com/facebook/rocksdb/issues/9891). DBs written with this new setting can be read by RocksDB 6.27 and newer.
+* Refactor the classes, APIs and data structures for block cache tracing to allow a user provided trace writer to be used. Introduced an abstract BlockCacheTraceWriter class that takes a structured BlockCacheTraceRecord. The BlockCacheTraceWriter implementation can then format and log the record in whatever way it sees fit. The default BlockCacheTraceWriterImpl does file tracing using a user provided TraceWriter. More details in rocksdb/includb/block_cache_trace_writer.h.
+
+## 7.7.0 (09/18/2022)
+### Bug Fixes
+* Fixed a hang when an operation such as `GetLiveFiles` or `CreateNewBackup` is asked to trigger and wait for memtable flush on a read-only DB. Such indirect requests for memtable flush are now ignored on a read-only DB.
+* Fixed bug where `FlushWAL(true /* sync */)` (used by `GetLiveFilesStorageInfo()`, which is used by checkpoint and backup) could cause parallel writes at the tail of a WAL file to never be synced.
+* Fix periodic_task unable to re-register the same task type, which may cause `SetOptions()` fail to update periodical_task time like: `stats_dump_period_sec`, `stats_persist_period_sec`.
+* Fixed a bug in the rocksdb.prefetched.bytes.discarded stat. It was counting the prefetch buffer size, rather than the actual number of bytes discarded from the buffer.
+* Fix bug where the directory containing CURRENT can left unsynced after CURRENT is updated to point to the latest MANIFEST, which leads to risk of unsync data loss of CURRENT.
+* Update rocksdb.multiget.io.batch.size stat in non-async MultiGet as well.
+* Fix a bug in key range overlap checking with concurrent compactions when user-defined timestamp is enabled. User-defined timestamps should be EXCLUDED when checking if two ranges overlap.
+* Fixed a bug where the blob cache prepopulating logic did not consider the secondary cache (see #10603).
+* Fixed the rocksdb.num.sst.read.per.level, rocksdb.num.index.and.filter.blocks.read.per.level and rocksdb.num.level.read.per.multiget stats in the MultiGet coroutines
+
+### Public API changes
+* Add `rocksdb_column_family_handle_get_id`, `rocksdb_column_family_handle_get_name` to get name, id of column family in C API
+* Add a new stat rocksdb.async.prefetch.abort.micros to measure time spent waiting for async prefetch reads to abort
+
+### Java API Changes
+* Add CompactionPriority.RoundRobin.
+* Revert to using the default metadata charge policy when creating an LRU cache via the Java API.
+
+### Behavior Change
+* DBOptions::verify_sst_unique_id_in_manifest is now an on-by-default feature that verifies SST file identity whenever they are opened by a DB, rather than only at DB::Open time.
+* Right now, when the option migration tool (OptionChangeMigration()) migrates to FIFO compaction, it compacts all the data into one single SST file and move to L0. This might create a problem for some users: the giant file may be soon deleted to satisfy max_table_files_size, and might cayse the DB to be almost empty. We change the behavior so that the files are cut to be smaller, but these files might not follow the data insertion order. With the change, after the migration, migrated data might not be dropped by insertion order by FIFO compaction.
+* When a block is firstly found from `CompressedSecondaryCache`, we just insert a dummy block into the primary cache and don’t erase the block from `CompressedSecondaryCache`. A standalone handle is returned to the caller. Only if the block is found again from `CompressedSecondaryCache` before the dummy block is evicted, we erase the block from `CompressedSecondaryCache` and insert it into the primary cache.
+* When a block is firstly evicted from the primary cache to `CompressedSecondaryCache`, we just insert a dummy block in `CompressedSecondaryCache`. Only if it is evicted again before the dummy block is evicted from the cache, it is treated as a hot block and is inserted into `CompressedSecondaryCache`.
+* Improved the estimation of memory used by cached blobs by taking into account the size of the object owning the blob value and also the allocator overhead if `malloc_usable_size` is available (see #10583).
+* Blob values now have their own category in the cache occupancy statistics, as opposed to being lumped into the "Misc" bucket (see #10601).
+* Change the optimize_multiget_for_io experimental ReadOptions flag to default on.
+
+### New Features
+*  RocksDB does internal auto prefetching if it notices 2 sequential reads if readahead_size is not specified. New option `num_file_reads_for_auto_readahead` is added in BlockBasedTableOptions which indicates after how many sequential reads internal auto prefetching should be start (default is 2).
+* Added new perf context counters `block_cache_standalone_handle_count`, `block_cache_real_handle_count`,`compressed_sec_cache_insert_real_count`, `compressed_sec_cache_insert_dummy_count`, `compressed_sec_cache_uncompressed_bytes`, and `compressed_sec_cache_compressed_bytes`.
+* Memory for blobs which are to be inserted into the blob cache is now allocated using the cache's allocator (see #10628 and #10647).
+* HyperClockCache is an experimental, lock-free Cache alternative for block cache that offers much improved CPU efficiency under high parallel load or high contention, with some caveats. As much as 4.5x higher ops/sec vs. LRUCache has been seen in db_bench under high parallel load.
+* `CompressedSecondaryCacheOptions::enable_custom_split_merge` is added for enabling the custom split and merge feature, which split the compressed value into chunks so that they may better fit jemalloc bins.
+
+### Performance Improvements
+* Iterator performance is improved for `DeleteRange()` users. Internally, iterator will skip to the end of a range tombstone when possible, instead of looping through each key and check individually if a key is range deleted.
+* Eliminated some allocations and copies in the blob read path. Also, `PinnableSlice` now only points to the blob value and pins the backing resource (cache entry or buffer) in all cases, instead of containing a copy of the blob value. See #10625 and #10647.
+* In case of scans with async_io enabled, few optimizations have been added to issue more asynchronous requests in parallel in order to avoid synchronous prefetching.
+* `DeleteRange()` users should see improvement in get/iterator performance from mutable memtable (see #10547).
+
+## 7.6.0 (08/19/2022)
+### New Features
+* Added `prepopulate_blob_cache` to ColumnFamilyOptions. If enabled, prepopulate warm/hot blobs which are already in memory into blob cache at the time of flush. On a flush, the blob that is in memory (in memtables) get flushed to the device. If using Direct IO, additional IO is incurred to read this blob back into memory again, which is avoided by enabling this option. This further helps if the workload exhibits high temporal locality, where most of the reads go to recently written data. This also helps in case of the remote file system since it involves network traffic and higher latencies.
+* Support using secondary cache with the blob cache. When creating a blob cache, the user can set a secondary blob cache by configuring `secondary_cache` in LRUCacheOptions.
+* Charge memory usage of blob cache when the backing cache of the blob cache and the block cache are different. If an operation reserving memory for blob cache exceeds the avaible space left in the block cache at some point (i.e, causing a cache full under `LRUCacheOptions::strict_capacity_limit` = true), creation will fail with `Status::MemoryLimit()`. To opt in this feature, enable charging `CacheEntryRole::kBlobCache` in `BlockBasedTableOptions::cache_usage_options`.
+* Improve subcompaction range partition so that it is likely to be more even. More evenly distribution of subcompaction will improve compaction throughput for some workloads. All input files' index blocks to sample some anchor key points from which we pick positions to partition the input range. This would introduce some CPU overhead in compaction preparation phase, if subcompaction is enabled, but it should be a small fraction of the CPU usage of the whole compaction process. This also brings a behavier change: subcompaction number is much more likely to maxed out than before.
+* Add CompactionPri::kRoundRobin, a compaction picking mode that cycles through all the files with a compact cursor in a round-robin manner. This feature is available since 7.5.
+* Provide support for subcompactions for user_defined_timestamp.
+* Added an option `memtable_protection_bytes_per_key` that turns on memtable per key-value checksum protection. Each memtable entry will be suffixed by a checksum that is computed during writes, and verified in reads/compaction. Detected corruption will be logged and with corruption status returned to user.
+* Added a blob-specific cache priority level - bottom level. Blobs are typically lower-value targets for caching than data blocks, since 1) with BlobDB, data blocks containing blob references conceptually form an index structure which has to be consulted before we can read the blob value, and 2) cached blobs represent only a single key-value, while cached data blocks generally contain multiple KVs. The user can specify the new option `low_pri_pool_ratio` in `LRUCacheOptions` to configure the ratio of capacity reserved for low priority cache entries (and therefore the remaining ratio is the space reserved for the bottom level), or configuring the new argument `low_pri_pool_ratio` in `NewLRUCache()` to achieve the same effect.
+
+### Public API changes
+* Removed Customizable support for RateLimiter and removed its CreateFromString() and Type() functions.
+* `CompactRangeOptions::exclusive_manual_compaction` is now false by default. This ensures RocksDB does not introduce artificial parallelism limitations by default.
+* Tiered Storage: change `bottommost_temperture` to `last_level_temperture`. The old option name is kept only for migration, please use the new option. The behavior is changed to apply temperature for the `last_level` SST files only.
+* Added a new experimental ReadOption flag called optimize_multiget_for_io, which when set attempts to reduce MultiGet latency by spawning coroutines for keys in multiple levels.
+
+### Bug Fixes
+* Fix a bug starting in 7.4.0 in which some fsync operations might be skipped in a DB after any DropColumnFamily on that DB, until it is re-opened. This can lead to data loss on power loss. (For custom FileSystem implementations, this could lead to `FSDirectory::Fsync` or `FSDirectory::Close` after the first `FSDirectory::Close`; Also, valgrind could report call to `close()` with `fd=-1`.)
+* Fix a bug where `GenericRateLimiter` could revert the bandwidth set dynamically using `SetBytesPerSecond()` when a user configures a structure enclosing it, e.g., using `GetOptionsFromString()` to configure an `Options` that references an existing `RateLimiter` object.
+* Fix race conditions in `GenericRateLimiter`.
+* Fix a bug in `FIFOCompactionPicker::PickTTLCompaction` where total_size calculating might cause underflow
+* Fix data race bug in hash linked list memtable. With this bug, read request might temporarily miss an old record in the memtable in a race condition to the hash bucket.
+* Fix a bug that `best_efforts_recovery` may fail to open the db with mmap read.
+* Fixed a bug where blobs read during compaction would pollute the cache.
+* Fixed a data race in LRUCache when used with a secondary_cache.
+* Fixed a bug where blobs read by iterators would be inserted into the cache even with the `fill_cache` read option set to false.
+* Fixed the segfault caused by `AllocateData()` in `CompressedSecondaryCache::SplitValueIntoChunks()` and `MergeChunksIntoValueTest`.
+* Fixed a bug in BlobDB where a mix of inlined and blob values could result in an incorrect value being passed to the compaction filter (see #10391).
+* Fixed a memory leak bug in stress tests caused by `FaultInjectionSecondaryCache`.
+
+### Behavior Change
+* Added checksum handshake during the copying of decompressed WAL fragment. This together with #9875, #10037, #10212, #10114 and #10319 provides end-to-end integrity protection for write batch during recovery.
+* To minimize the internal fragmentation caused by the variable size of the compressed blocks in `CompressedSecondaryCache`, the original block is split according to the jemalloc bin size in `Insert()` and then merged back in `Lookup()`.
+* PosixLogger is removed and by default EnvLogger will be used for info logging. The behavior of the two loggers should be very similar when using the default Posix Env.
+* Remove [min|max]_timestamp from VersionEdit for now since they are not tracked in MANIFEST anyway but consume two empty std::string (up to 64 bytes) for each file. Should they be added back in the future, we should store them more compactly.
+* Improve universal tiered storage compaction picker to avoid extra major compaction triggered by size amplification. If `preclude_last_level_data_seconds` is enabled, the size amplification is calculated within non last_level data only which skip the last level and use the penultimate level as the size base.
+* If an error is hit when writing to a file (append, sync, etc), RocksDB is more strict with not issuing more operations to it, except closing the file, with exceptions of some WAL file operations in error recovery path.
+* A `WriteBufferManager` constructed with `allow_stall == false` will no longer trigger write stall implicitly by thrashing until memtable count limit is reached. Instead, a column family can continue accumulating writes while that CF is flushing, which means memory may increase. Users who prefer stalling writes must now explicitly set `allow_stall == true`.
+* Add `CompressedSecondaryCache` into the stress tests.
+* Block cache keys have changed, which will cause any persistent caches to miss between versions.
+
+### Performance Improvements
+* Instead of constructing `FragmentedRangeTombstoneList` during every read operation, it is now constructed once and stored in immutable memtables. This improves speed of querying range tombstones from immutable memtables.
+* When using iterators with the integrated BlobDB implementation, blob cache handles are now released immediately when the iterator's position changes.
+* MultiGet can now do more IO in parallel by reading data blocks from SST files in multiple levels, if the optimize_multiget_for_io ReadOption flag is set.
+
+## 7.5.0 (07/15/2022)
+### New Features
+* Mempurge option flag `experimental_mempurge_threshold` is now a ColumnFamilyOptions and can now be dynamically configured using `SetOptions()`.
+* Support backward iteration when `ReadOptions::iter_start_ts` is set.
+* Provide support for ReadOptions.async_io with direct_io to improve Seek latency by using async IO to parallelize child iterator seek and doing asynchronous prefetching on sequential scans.
+* Added support for blob caching in order to cache frequently used blobs for BlobDB.
+  * User can configure the new ColumnFamilyOptions `blob_cache` to enable/disable blob caching.
+  * Either sharing the backend cache with the block cache or using a completely separate cache is supported.
+  * A new abstraction interface called `BlobSource` for blob read logic gives all users access to blobs, whether they are in the blob cache, secondary cache, or (remote) storage. Blobs can be potentially read both while handling user reads (`Get`, `MultiGet`, or iterator) and during compaction (while dealing with compaction filters, Merges, or garbage collection) but eventually all blob reads go through `Version::GetBlob` or, for MultiGet, `Version::MultiGetBlob` (and then get dispatched to the interface -- `BlobSource`).
+* Add experimental tiered compaction feature `AdvancedColumnFamilyOptions::preclude_last_level_data_seconds`, which makes sure the new data inserted within preclude_last_level_data_seconds won't be placed on cold tier (the feature is not complete).
+
+### Public API changes
+* Add metadata related structs and functions in C API, including
+  * `rocksdb_get_column_family_metadata()` and `rocksdb_get_column_family_metadata_cf()` to obtain `rocksdb_column_family_metadata_t`.
+  * `rocksdb_column_family_metadata_t` and its get functions & destroy function.
+  * `rocksdb_level_metadata_t` and its and its get functions & destroy function.
+  * `rocksdb_file_metadata_t` and its and get functions & destroy functions.
+* Add suggest_compact_range() and suggest_compact_range_cf() to C API.
+* When using block cache strict capacity limit (`LRUCache` with `strict_capacity_limit=true`), DB operations now fail with Status code `kAborted` subcode `kMemoryLimit` (`IsMemoryLimit()`) instead of `kIncomplete` (`IsIncomplete()`) when the capacity limit is reached, because Incomplete can mean other specific things for some operations. In more detail, `Cache::Insert()` now returns the updated Status code and this usually propagates through RocksDB to the user on failure.
+* NewClockCache calls temporarily return an LRUCache (with similar characteristics as the desired ClockCache). This is because ClockCache is being replaced by a new version (the old one had unknown bugs) but this is still under development.
+* Add two functions `int ReserveThreads(int threads_to_be_reserved)` and `int ReleaseThreads(threads_to_be_released)` into `Env` class. In the default implementation, both return 0. Newly added `xxxEnv` class that inherits `Env` should implement these two functions for thread reservation/releasing features.
+* Add `rocksdb_options_get_prepopulate_blob_cache` and `rocksdb_options_set_prepopulate_blob_cache` to C API.
+* Add `prepopulateBlobCache` and `setPrepopulateBlobCache` to Java API.
+
+### Bug Fixes
+* Fix a bug in which backup/checkpoint can include a WAL deleted by RocksDB.
+* Fix a bug where concurrent compactions might cause unnecessary further write stalling. In some cases, this might cause write rate to drop to minimum.
+* Fix a bug in Logger where if dbname and db_log_dir are on different filesystems, dbname creation would fail wrt to db_log_dir path returning an error and fails to open the DB.
+* Fix a CPU and memory efficiency issue introduce by https://github.com/facebook/rocksdb/pull/8336 which made InternalKeyComparator configurable as an unintended side effect.
+
+## Behavior Change
+* In leveled compaction with dynamic levelling, level multiplier is not anymore adjusted due to oversized L0. Instead, compaction score is adjusted by increasing size level target by adding incoming bytes from upper levels. This would deprioritize compactions from upper levels if more data from L0 is coming. This is to fix some unnecessary full stalling due to drastic change of level targets, while not wasting write bandwidth for compaction while writes are overloaded.
+* For track_and_verify_wals_in_manifest, revert to the original behavior before #10087: syncing of live WAL file is not tracked, and we track only the synced sizes of **closed** WALs. (PR #10330).
+* WAL compression now computes/verifies checksum during compression/decompression.
+
+### Performance Improvements
+* Rather than doing total sort against all files in a level, SortFileByOverlappingRatio() to only find the top 50 files based on score. This can improve write throughput for the use cases where data is loaded in increasing key order and there are a lot of files in one LSM-tree, where applying compaction results is the bottleneck.
+* In leveled compaction, L0->L1 trivial move will allow more than one file to be moved in one compaction. This would allow L0 files to be moved down faster when data is loaded in sequential order, making slowdown or stop condition harder to hit. Also seek L0->L1 trivial move when only some files qualify.
+* In leveled compaction, try to trivial move more than one files if possible, up to 4 files or max_compaction_bytes. This is to allow higher write throughput for some use cases where data is loaded in sequential order, where appying compaction results is the bottleneck.
+
+## 7.4.0 (06/19/2022)
+### Bug Fixes
+* Fixed a bug in calculating key-value integrity protection for users of in-place memtable updates. In particular, the affected users would be those who configure `protection_bytes_per_key > 0` on `WriteBatch` or `WriteOptions`, and configure `inplace_callback != nullptr`.
+* Fixed a bug where a snapshot taken during SST file ingestion would be unstable.
+* Fixed a bug for non-TransactionDB with avoid_flush_during_recovery = true and TransactionDB where in case of crash, min_log_number_to_keep may not change on recovery and persisting a new MANIFEST with advanced log_numbers for some column families, results in "column family inconsistency" error on second recovery. As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL. If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point. If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
+* Fixed a bug where RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. Now writes to MANIFEST are persisted only after recovery is successful.
+* Fix a race condition in WAL size tracking which is caused by an unsafe iterator access after container is changed.
+* Fix unprotected concurrent accesses to `WritableFileWriter::filesize_` by `DB::SyncWAL()` and `DB::Put()` in two write queue mode.
+* Fix a bug in WAL tracking. Before this PR (#10087), calling `SyncWAL()` on the only WAL file of the db will not log the event in MANIFEST, thus allowing a subsequent `DB::Open` even if the WAL file is missing or corrupted.
+* Fix a bug that could return wrong results with `index_type=kHashSearch` and using `SetOptions` to change the `prefix_extractor`.
+* Fixed a bug in WAL tracking with wal_compression. WAL compression writes a kSetCompressionType record which is not associated with any sequence number. As result, WalManager::GetSortedWalsOfType() will skip these WALs and not return them to caller, e.g. Checkpoint, Backup, causing the operations to fail.
+* Avoid a crash if the IDENTITY file is accidentally truncated to empty. A new DB ID will be written and generated on Open.
+* Fixed a possible corruption for users of `manual_wal_flush` and/or `FlushWAL(true /* sync */)`, together with `track_and_verify_wals_in_manifest == true`. For those users, losing unsynced data (e.g., due to power loss) could make future DB opens fail with a `Status::Corruption` complaining about missing WAL data.
+* Fixed a bug in `WriteBatchInternal::Append()` where WAL termination point in write batch was not considered and the function appends an incorrect number of checksums.
+* Fixed a crash bug introduced in 7.3.0 affecting users of MultiGet with `kDataBlockBinaryAndHash`.
+
+### Public API changes
+* Add new API GetUnixTime in Snapshot class which returns the unix time at which Snapshot is taken.
+* Add transaction `get_pinned` and `multi_get` to C API.
+* Add two-phase commit support to C API.
+* Add `rocksdb_transaction_get_writebatch_wi` and `rocksdb_transaction_rebuild_from_writebatch` to C API.
+* Add `rocksdb_options_get_blob_file_starting_level` and `rocksdb_options_set_blob_file_starting_level` to C API.
+* Add `blobFileStartingLevel` and `setBlobFileStartingLevel` to Java API.
+* Add SingleDelete for DB in C API
+* Add User Defined Timestamp in C API.
+  * `rocksdb_comparator_with_ts_create` to create timestamp aware comparator
+  * Put, Get, Delete, SingleDelete, MultiGet APIs has corresponding timestamp aware APIs with suffix `with_ts`
+  * And Add C API's for Transaction, SstFileWriter, Compaction as mentioned [here](https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-(Experimental))
+* The contract for implementations of Comparator::IsSameLengthImmediateSuccessor has been updated to work around a design bug in `auto_prefix_mode`.
+* The API documentation for `auto_prefix_mode` now notes some corner cases in which it returns different results than `total_order_seek`, due to design bugs that are not easily fixed. Users using built-in comparators and keys at least the size of a fixed prefix length are not affected.
+* Obsoleted the NUM_DATA_BLOCKS_READ_PER_LEVEL stat and introduced the NUM_LEVEL_READ_PER_MULTIGET and MULTIGET_COROUTINE_COUNT stats
+* Introduced `WriteOptions::protection_bytes_per_key`, which can be used to enable key-value integrity protection for live updates.
+
+### New Features
+* Add FileSystem::ReadAsync API in io_tracing
+* Add blob garbage collection parameters `blob_garbage_collection_policy` and `blob_garbage_collection_age_cutoff` to both force-enable and force-disable GC, as well as selectively override age cutoff when using CompactRange.
+* Add an extra sanity check in `GetSortedWalFiles()` (also used by `GetLiveFilesStorageInfo()`, `BackupEngine`, and `Checkpoint`) to reduce risk of successfully created backup or checkpoint failing to open because of missing WAL file.
+* Add a new column family option `blob_file_starting_level` to enable writing blob files during flushes and compactions starting from the specified LSM tree level.
+* Add support for timestamped snapshots (#9879)
+* Provide support for AbortIO in posix to cancel submitted asynchronous requests using io_uring.
+* Add support for rate-limiting batched `MultiGet()` APIs
+* Added several new tickers, perf context statistics, and DB properties to BlobDB
+  * Added new DB properties "rocksdb.blob-cache-capacity", "rocksdb.blob-cache-usage", "rocksdb.blob-cache-pinned-usage" to show blob cache usage.
+  * Added new perf context statistics `blob_cache_hit_count`, `blob_read_count`, `blob_read_byte`, `blob_read_time`, `blob_checksum_time` and `blob_decompress_time`.
+  * Added new tickers `BLOB_DB_CACHE_MISS`, `BLOB_DB_CACHE_HIT`, `BLOB_DB_CACHE_ADD`, `BLOB_DB_CACHE_ADD_FAILURES`, `BLOB_DB_CACHE_BYTES_READ` and `BLOB_DB_CACHE_BYTES_WRITE`.
+
+### Behavior changes
+* DB::Open(), DB::OpenAsSecondary() will fail if a Logger cannot be created (#9984)
+* DB::Write does not hold global `mutex_` if this db instance does not need to switch wal and mem-table (#7516).
+* Removed support for reading Bloom filters using obsolete block-based filter format. (Support for writing such filters was dropped in 7.0.) For good read performance on old DBs using these filters, a full compaction is required.
+* Per KV checksum in write batch is verified before a write batch is written to WAL to detect any corruption to the write batch (#10114).
+
+### Performance Improvements
+* When compiled with folly (Meta-internal integration; experimental in open source build), improve the locking performance (CPU efficiency) of LRUCache by using folly DistributedMutex in place of standard mutex.
+
+## 7.3.0 (05/20/2022)
+### Bug Fixes
+* Fixed a bug where manual flush would block forever even though flush options had wait=false.
+* Fixed a bug where RocksDB could corrupt DBs with `avoid_flush_during_recovery == true` by removing valid WALs, leading to `Status::Corruption` with message like "SST file is ahead of WALs" when attempting to reopen.
+* Fixed a bug in async_io path where incorrect length of data is read by FilePrefetchBuffer if data is consumed from two populated buffers and request for more data is sent.
+* Fixed a CompactionFilter bug. Compaction filter used to use `Delete` to remove keys, even if the keys should be removed with `SingleDelete`. Mixing `Delete` and `SingleDelete` may cause undefined behavior.
+* Fixed a bug in `WritableFileWriter::WriteDirect` and `WritableFileWriter::WriteDirectWithChecksum`. The rate_limiter_priority specified in ReadOptions was not passed to the RateLimiter when requesting a token.
+* Fixed a bug which might cause process crash when I/O error happens when reading an index block in MultiGet().
+
+### New Features
+* DB::GetLiveFilesStorageInfo is ready for production use.
+* Add new stats PREFETCHED_BYTES_DISCARDED which records number of prefetched bytes discarded by RocksDB FilePrefetchBuffer on destruction and POLL_WAIT_MICROS records wait time for FS::Poll API completion.
+* RemoteCompaction supports table_properties_collector_factories override on compaction worker.
+* Start tracking SST unique id in MANIFEST, which will be used to verify with SST properties during DB open to make sure the SST file is not overwritten or misplaced. A db option `verify_sst_unique_id_in_manifest` is introduced to enable/disable the verification, if enabled all SST files will be opened during DB-open to verify the unique id (default is false), so it's recommended to use it with `max_open_files = -1` to pre-open the files.
+* Added the ability to concurrently read data blocks from multiple files in a level in batched MultiGet. This can be enabled by setting the async_io option in ReadOptions. Using this feature requires a FileSystem that supports ReadAsync (PosixFileSystem is not supported yet for this), and for RocksDB to be compiled with folly and c++20.
+* Charge memory usage of file metadata. RocksDB holds one file metadata structure in-memory per on-disk table file. If an operation reserving memory for file metadata exceeds the avaible space left in the block
+cache at some point (i.e, causing a cache full under `LRUCacheOptions::strict_capacity_limit` = true), creation will fail with `Status::MemoryLimit()`. To opt in this feature,  enable charging `CacheEntryRole::kFileMetadata` in `BlockBasedTableOptions::cache_usage_options`.
+
+### Public API changes
+* Add rollback_deletion_type_callback to TransactionDBOptions so that write-prepared transactions know whether to issue a Delete or SingleDelete to cancel a previous key written during prior prepare phase. The PR aims to prevent mixing SingleDeletes and Deletes for the same key that can lead to undefined behaviors for write-prepared transactions.
+* EXPERIMENTAL: Add new API AbortIO in file_system to abort the read requests submitted asynchronously.
+* CompactionFilter::Decision has a new value: kRemoveWithSingleDelete. If CompactionFilter returns this decision, then CompactionIterator will use `SingleDelete` to mark a key as removed.
+* Renamed CompactionFilter::Decision::kRemoveWithSingleDelete to kPurge since the latter sounds more general and hides the implementation details of how compaction iterator handles keys.
+* Added ability to specify functions for Prepare and Validate to OptionsTypeInfo.  Added methods to OptionTypeInfo to set the functions via an API.  These methods are intended for RocksDB plugin developers for configuration management.
+* Added a new immutable db options, enforce_single_del_contracts. If set to false (default is true), compaction will NOT fail due to a single delete followed by a delete for the same key. The purpose of this temporay option is to help existing use cases migrate.
+* Introduce `BlockBasedTableOptions::cache_usage_options` and use that to replace `BlockBasedTableOptions::reserve_table_builder_memory` and  `BlockBasedTableOptions::reserve_table_reader_memory`.
+* Changed `GetUniqueIdFromTableProperties` to return a 128-bit unique identifier, which will be the standard size now. The old functionality (192-bit) is available from `GetExtendedUniqueIdFromTableProperties`. Both functions are no longer "experimental" and are ready for production use.
+* In IOOptions, mark `prio` as deprecated for future removal.
+* In `file_system.h`, mark `IOPriority` as deprecated for future removal.
+* Add an option, `CompressionOptions::use_zstd_dict_trainer`, to indicate whether zstd dictionary trainer should be used for generating zstd compression dictionaries. The default value of this option is true for backward compatibility. When this option is set to false, zstd API `ZDICT_finalizeDictionary` is used to generate compression dictionaries.
+* Seek API which positions itself every LevelIterator on the correct data block in the correct SST file which can be parallelized if ReadOptions.async_io option is enabled.
+* Add new stat number_async_seek in PerfContext that indicates number of async calls made by seek to prefetch data.
+* Add support for user-defined timestamps to read only DB.
+
+### Bug Fixes
+* RocksDB calls FileSystem::Poll API during FilePrefetchBuffer destruction which impacts performance as it waits for read requets completion which is not needed anymore. Calling FileSystem::AbortIO to abort those requests instead fixes that performance issue.
+* Fixed unnecessary block cache contention when queries within a MultiGet batch and across parallel batches access the same data block, which previously could cause severely degraded performance in this unusual case. (In more typical MultiGet cases, this fix is expected to yield a small or negligible performance improvement.)
+
+### Behavior changes
+* Enforce the existing contract of SingleDelete so that SingleDelete cannot be mixed with Delete because it leads to undefined behavior. Fix a number of unit tests that violate the contract but happen to pass.
+* ldb `--try_load_options` default to true if `--db` is specified and not creating a new DB, the user can still explicitly disable that by `--try_load_options=false` (or explicitly enable that by `--try_load_options`).
+* During Flush write or Compaction write/read, the WriteController is used to determine whether DB writes are stalled or slowed down. The priority (Env::IOPriority) can then be determined accordingly and be passed in IOOptions to the file system.
+
+### Performance Improvements
+* Avoid calling malloc_usable_size() in LRU Cache's mutex.
+* Reduce DB mutex holding time when finding obsolete files to delete. When a file is trivial moved to another level, the internal files will be referenced twice internally and sometimes opened twice too. If a deletion candidate file is not the last reference, we need to destroy the reference and close the file but not deleting the file. Right now we determine it by building a set of all live files. With the improvement, we check the file against all live LSM-tree versions instead.
+
+## 7.2.0 (04/15/2022)
+### Bug Fixes
+* Fixed bug which caused rocksdb failure in the situation when rocksdb was accessible using UNC path
+* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail.
+* Fixed a heap use-after-free race with DropColumnFamily.
+* Fixed a bug that `rocksdb.read.block.compaction.micros` cannot track compaction stats (#9722).
+* Fixed `file_type`, `relative_filename` and `directory` fields returned by `GetLiveFilesMetaData()`, which were added in inheriting from `FileStorageInfo`.
+* Fixed a bug affecting `track_and_verify_wals_in_manifest`. Without the fix, application may see "open error: Corruption: Missing WAL with log number" while trying to open the db. The corruption is a false alarm but prevents DB open (#9766).
+* Fix segfault in FilePrefetchBuffer with async_io as it doesn't wait for pending jobs to complete on destruction.
+* Fix ERROR_HANDLER_AUTORESUME_RETRY_COUNT stat whose value was set wrong in portal.h
+* Fixed a bug for non-TransactionDB with avoid_flush_during_recovery = true and TransactionDB where in case of crash, min_log_number_to_keep may not change on recovery and persisting a new MANIFEST with advanced log_numbers for some column families, results in "column family inconsistency" error on second recovery. As a solution the corrupted WALs whose numbers are larger than the corrupted wal and smaller than the new WAL will be moved to archive folder.
+* Fixed a bug in RocksDB DB::Open() which may creates and writes to two new MANIFEST files even before recovery succeeds. Now writes to MANIFEST are persisted only after recovery is successful.
+
+### New Features
+* For db_bench when --seed=0 or --seed is not set then it uses the current time as the seed value. Previously it used the value 1000.
+* For db_bench when --benchmark lists multiple tests and each test uses a seed for a RNG then the seeds across tests will no longer be repeated.
+* Added an option to dynamically charge an updating estimated memory usage of block-based table reader to block cache if block cache available. To enable this feature, set `BlockBasedTableOptions::reserve_table_reader_memory = true`.
+* Add new stat ASYNC_READ_BYTES that calculates number of bytes read during async read call and users can check if async code path is being called by RocksDB internal automatic prefetching for sequential reads.
+* Enable async prefetching if ReadOptions.readahead_size is set along with ReadOptions.async_io in FilePrefetchBuffer.
+* Add event listener support on remote compaction compactor side.
+* Added a dedicated integer DB property `rocksdb.live-blob-file-garbage-size` that exposes the total amount of garbage in the blob files in the current version.
+* RocksDB does internal auto prefetching if it notices sequential reads. It starts with readahead size `initial_auto_readahead_size` which now can be configured through BlockBasedTableOptions.
+* Add a merge operator that allows users to register specific aggregation function so that they can does aggregation using different aggregation types for different keys. See comments in include/rocksdb/utilities/agg_merge.h for actual usage. The feature is experimental and the format is subject to change and we won't provide a migration tool.
+* Meta-internal / Experimental: Improve CPU performance by replacing many uses of std::unordered_map with folly::F14FastMap when RocksDB is compiled together with Folly.
+* Experimental: Add CompressedSecondaryCache, a concrete implementation of rocksdb::SecondaryCache, that integrates with compression libraries (e.g. LZ4) to hold compressed blocks.
+
+### Behavior changes
+* Disallow usage of commit-time-write-batch for write-prepared/write-unprepared transactions if TransactionOptions::use_only_the_last_commit_time_batch_for_recovery is false to prevent two (or more) uncommitted versions of the same key in the database. Otherwise, bottommost compaction may violate the internal key uniqueness invariant of SSTs if the sequence numbers of both internal keys are zeroed out (#9794).
+* Make DB::GetUpdatesSince() return NotSupported early for write-prepared/write-unprepared transactions, as the API contract indicates.
+
+### Public API changes
+* Exposed APIs to examine results of block cache stats collections in a structured way. In particular, users of `GetMapProperty()` with property `kBlockCacheEntryStats` can now use the functions in `BlockCacheEntryStatsMapKeys` to find stats in the map.
+* Add `fail_if_not_bottommost_level` to IngestExternalFileOptions so that ingestion will fail if the file(s) cannot be ingested to the bottommost level.
+* Add output parameter `is_in_sec_cache` to `SecondaryCache::Lookup()`. It is to indicate whether the handle is possibly erased from the secondary cache after the Lookup.
+
+## 7.1.0 (03/23/2022)
+### New Features
+* Allow WriteBatchWithIndex to index a WriteBatch that includes keys with user-defined timestamps. The index itself does not have timestamp.
+* Add support for user-defined timestamps to write-committed transaction without API change. The `TransactionDB` layer APIs do not allow timestamps because we require that all user-defined-timestamps-aware operations go through the `Transaction` APIs.
+* Added BlobDB options to `ldb`
+* `BlockBasedTableOptions::detect_filter_construct_corruption` can now be dynamically configured using `DB::SetOptions`.
+* Automatically recover from retryable read IO errors during backgorund flush/compaction.
+* Experimental support for preserving file Temperatures through backup and restore, and for updating DB metadata for outside changes to file Temperature (`UpdateManifestForFilesState` or `ldb update_manifest --update_temperatures`).
+* Experimental support for async_io in ReadOptions which is used by FilePrefetchBuffer to prefetch some of the data asynchronously,  if reads are sequential and auto readahead is enabled by rocksdb internally.
+
+### Bug Fixes
+* Fixed a major performance bug in which Bloom filters generated by pre-7.0 releases are not read by early 7.0.x releases (and vice-versa) due to changes to FilterPolicy::Name() in #9590. This can severely impact read performance and read I/O on upgrade or downgrade with existing DB, but not data correctness.
+* Fixed a data race on `versions_` between `DBImpl::ResumeImpl()` and threads waiting for recovery to complete (#9496)
+* Fixed a bug caused by race among flush, incoming writes and taking snapshots. Queries to snapshots created with these race condition can return incorrect result, e.g. resurfacing deleted data.
+* Fixed a bug that DB flush uses `options.compression` even `options.compression_per_level` is set.
+* Fixed a bug that DisableManualCompaction may assert when disable an unscheduled manual compaction.
+* Fix a race condition when cancel manual compaction with `DisableManualCompaction`. Also DB close can cancel the manual compaction thread.
+* Fixed a potential timer crash when open close DB concurrently.
+* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled.
+* Fixed a bug that `Iterator::Refresh()` reads stale keys after DeleteRange() performed.
+* Fixed a race condition when disable and re-enable manual compaction.
+* Fixed automatic error recovery failure in atomic flush.
+* Fixed a race condition when mmaping a WritableFile on POSIX.
+
+### Public API changes
+* Added pure virtual FilterPolicy::CompatibilityName(), which is needed for fixing major performance bug involving FilterPolicy naming in SST metadata without affecting Customizable aspect of FilterPolicy. This change only affects those with their own custom or wrapper FilterPolicy classes.
+* `options.compression_per_level` is dynamically changeable with `SetOptions()`.
+* Added `WriteOptions::rate_limiter_priority`. When set to something other than `Env::IO_TOTAL`, the internal rate limiter (`DBOptions::rate_limiter`) will be charged at the specified priority for writes associated with the API to which the `WriteOptions` was provided. Currently the support covers automatic WAL flushes, which happen during live updates (`Put()`, `Write()`, `Delete()`, etc.) when `WriteOptions::disableWAL == false` and `DBOptions::manual_wal_flush == false`.
+* Add DB::OpenAndTrimHistory API. This API will open DB and trim data to the timestamp specified by trim_ts (The data with timestamp larger than specified trim bound will be removed). This API should only be used at a timestamp-enabled column families recovery. If the column family doesn't have timestamp enabled, this API won't trim any data on that column family. This API is not compatible with avoid_flush_during_recovery option.
+* Remove BlockBasedTableOptions.hash_index_allow_collision which already takes no effect.
+
+## 7.0.0 (02/20/2022)
+### Bug Fixes
+* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.)
+* Fixed more cases of EventListener::OnTableFileCreated called with OK status, file_size==0, and no SST file kept. Now the status is Aborted.
+* Fixed a read-after-free bug in `DB::GetMergeOperands()`.
+* Fix a data loss bug for 2PC write-committed transaction caused by concurrent transaction commit and memtable switch (#9571).
+* Fixed NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, NUM_DATA_BLOCKS_READ_PER_LEVEL, and NUM_SST_READ_PER_LEVEL stats to be reported once per MultiGet batch per level.
+
+### Performance Improvements
+* Mitigated the overhead of building the file location hash table used by the online LSM tree consistency checks, which can improve performance for certain workloads (see #9351).
+* Switched to using a sorted `std::vector` instead of `std::map` for storing the metadata objects for blob files, which can improve performance for certain workloads, especially when the number of blob files is high.
+* DisableManualCompaction() doesn't have to wait scheduled manual compaction to be executed in thread-pool to cancel the job.
+
+### Public API changes
+* Require C++17 compatible compiler (GCC >= 7, Clang >= 5, Visual Studio >= 2017) for compiling RocksDB and any code using RocksDB headers. See #9388.
+* Added `ReadOptions::rate_limiter_priority`. When set to something other than `Env::IO_TOTAL`, the internal rate limiter (`DBOptions::rate_limiter`) will be charged at the specified priority for file reads associated with the API to which the `ReadOptions` was provided.
+* Remove HDFS support from main repo.
+* Remove librados support from main repo.
+* Remove obsolete backupable_db.h and type alias `BackupableDBOptions`. Use backup_engine.h and `BackupEngineOptions`. Similar renamings are in the C and Java APIs.
+* Removed obsolete utility_db.h and `UtilityDB::OpenTtlDB`. Use db_ttl.h and `DBWithTTL::Open`.
+* Remove deprecated API DB::AddFile from main repo.
+* Remove deprecated API ObjectLibrary::Register() and the (now obsolete) Regex public API. Use ObjectLibrary::AddFactory() with PatternEntry instead.
+* Remove deprecated option DBOption::table_cache_remove_scan_count_limit.
+* Remove deprecated API AdvancedColumnFamilyOptions::soft_rate_limit.
+* Remove deprecated API AdvancedColumnFamilyOptions::hard_rate_limit.
+* Remove deprecated API DBOption::base_background_compactions.
+* Remove deprecated API DBOptions::purge_redundant_kvs_while_flush.
+* Remove deprecated overloads of API DB::CompactRange.
+* Remove deprecated option DBOptions::skip_log_error_on_recovery.
+* Remove ReadOptions::iter_start_seqnum which has been deprecated.
+* Remove DBOptions::preserved_deletes and DB::SetPreserveDeletesSequenceNumber().
+* Remove deprecated API AdvancedColumnFamilyOptions::rate_limit_delay_max_milliseconds.
+* Removed timestamp from WriteOptions. Accordingly, added to DB APIs Put, Delete, SingleDelete, etc. accepting an additional argument 'timestamp'. Added Put, Delete, SingleDelete, etc to WriteBatch accepting an additional argument 'timestamp'. Removed WriteBatch::AssignTimestamps(vector<Slice>) API. Renamed WriteBatch::AssignTimestamp() to WriteBatch::UpdateTimestamps() with clarified comments.
+* Changed type of cache buffer passed to `Cache::CreateCallback` from `void*` to `const void*`.
+* Significant updates to FilterPolicy-related APIs and configuration:
+  * Remove public API support for deprecated, inefficient block-based filter (use_block_based_builder=true).
+    * Old code and configuration strings that would enable it now quietly enable full filters instead, though any built-in FilterPolicy can still read block-based filters. This includes changing the longstanding default behavior of the Java API.
+    * Remove deprecated FilterPolicy::CreateFilter() and FilterPolicy::KeyMayMatch()
+    * Remove `rocksdb_filterpolicy_create()` from C API, as the only C API support for custom filter policies is now obsolete.
+    * If temporary memory usage in full filter creation is a problem, consider using partitioned filters, smaller SST files, or setting reserve_table_builder_memory=true.
+  * Remove support for "filter_policy=experimental_ribbon" configuration
+  string. Use something like "filter_policy=ribbonfilter:10" instead.
+  * Allow configuration string like "filter_policy=bloomfilter:10" without
+  bool, to minimize acknowledgement of obsolete block-based filter.
+  * Made FilterPolicy Customizable. Configuration of filter_policy is now accurately saved in OPTIONS file and can be loaded with LoadOptionsFromFile. (Loading an OPTIONS file generated by a previous version only enables reading and using existing filters, not generating new filters. Previously, no filter_policy would be configured from a saved OPTIONS file.)
+  * Change meaning of nullptr return from GetBuilderWithContext() from "use
+    block-based filter" to "generate no filter in this case."
+    * Also, when user specifies bits_per_key < 0.5, we now round this down
+    to "no filter" because we expect a filter with >= 80% FP rate is
+    unlikely to be worth the CPU cost of accessing it (esp with
+    cache_index_and_filter_blocks=1 or partition_filters=1).
+    * bits_per_key >= 0.5 and < 1.0 is still rounded up to 1.0 (for 62% FP
+    rate)
+  * Remove class definitions for FilterBitsBuilder and FilterBitsReader from
+    public API, so these can evolve more easily as implementation details.
+    Custom FilterPolicy can still decide what kind of built-in filter to use
+    under what conditions.
+  * Also removed deprecated functions
+    * FilterPolicy::GetFilterBitsBuilder()
+    * NewExperimentalRibbonFilterPolicy()
+  * Remove default implementations of
+    * FilterPolicy::GetBuilderWithContext()
+* Remove default implementation of Name() from FileSystemWrapper.
+* Rename `SizeApproximationOptions.include_memtabtles` to `SizeApproximationOptions.include_memtables`.
+* Remove deprecated option DBOptions::max_mem_compaction_level.
+* Return Status::InvalidArgument from ObjectRegistry::NewObject if a factory exists but the object ould not be created (returns NotFound if the factory is missing).
+* Remove deprecated overloads of API DB::GetApproximateSizes.
+* Remove deprecated option DBOptions::new_table_reader_for_compaction_inputs.
+* Add Transaction::SetReadTimestampForValidation() and Transaction::SetCommitTimestamp(). Default impl returns NotSupported().
+* Add support for decimal patterns to ObjectLibrary::PatternEntry
+* Remove deprecated remote compaction APIs `CompactionService::Start()` and `CompactionService::WaitForComplete()`. Please use `CompactionService::StartV2()`, `CompactionService::WaitForCompleteV2()` instead, which provides the same information plus extra data like priority, db_id, etc.
+* `ColumnFamilyOptions::OldDefaults` and `DBOptions::OldDefaults` are marked deprecated, as they are no longer maintained.
+* Add subcompaction callback APIs: `OnSubcompactionBegin()` and `OnSubcompactionCompleted()`.
+* Add file Temperature information to `FileOperationInfo` in event listener API.
+* Change the type of SizeApproximationFlags from enum to enum class. Also update the signature of DB::GetApproximateSizes API from uint8_t to SizeApproximationFlags.
+* Add Temperature hints information from RocksDB in API `NewSequentialFile()`. backup and checkpoint operations need to open the source files with `NewSequentialFile()`, which will have the temperature hints. Other operations are not covered.
+
+### Behavior Changes
+* Disallow the combination of DBOptions.use_direct_io_for_flush_and_compaction == true and DBOptions.writable_file_max_buffer_size == 0. This combination can cause WritableFileWriter::Append() to loop forever, and it does not make much sense in direct IO.
+* `ReadOptions::total_order_seek` no longer affects `DB::Get()`. The original motivation for this interaction has been obsolete since RocksDB has been able to detect whether the current prefix extractor is compatible with that used to generate table files, probably RocksDB 5.14.0.
+
+## New Features
+* Introduced an option `BlockBasedTableOptions::detect_filter_construct_corruption` for detecting corruption during Bloom Filter (format_version >= 5) and Ribbon Filter construction.
+* Improved the SstDumpTool to read the comparator from table properties and use it to read the SST File.
+* Extended the column family statistics in the info log so the total amount of garbage in the blob files and the blob file space amplification factor are also logged. Also exposed the blob file space amp via the `rocksdb.blob-stats` DB property.
+* Introduced the API rocksdb_create_dir_if_missing in c.h that calls underlying file system's CreateDirIfMissing API to create the directory.
+* Added last level and non-last level read statistics: `LAST_LEVEL_READ_*`, `NON_LAST_LEVEL_READ_*`.
+* Experimental: Add support for new APIs ReadAsync in FSRandomAccessFile that reads the data asynchronously and Poll API in FileSystem that checks if requested read request has completed or not. ReadAsync takes a callback function. Poll API checks for completion of read IO requests and  should call callback functions to indicate completion of read requests.
+
+## 6.29.0 (01/21/2022)
+Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info.
+### Public API change
+* Added values to `TraceFilterType`: `kTraceFilterIteratorSeek`, `kTraceFilterIteratorSeekForPrev`, and `kTraceFilterMultiGet`. They can be set in `TraceOptions` to filter out the operation types after which they are named.
+* Added `TraceOptions::preserve_write_order`. When enabled it  guarantees write records are traced in the same order they are logged to WAL and applied to the DB. By default it is disabled (false) to match the legacy behavior and prevent regression.
+* Made the Env class extend the Customizable class.  Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method.
+* `Options::OldDefaults` is marked deprecated, as it is no longer maintained.
+* Add ObjectLibrary::AddFactory and ObjectLibrary::PatternEntry classes.  This method and associated class are the preferred mechanism for registering factories with the ObjectLibrary going forward.  The ObjectLibrary::Register method, which uses regular expressions and may be problematic, is deprecated and will be in a future release.
+* Changed `BlockBasedTableOptions::block_size` from `size_t` to `uint64_t`.
+* Added API warning against using `Iterator::Refresh()` together with `DB::DeleteRange()`, which are incompatible and have always risked causing the refreshed iterator to return incorrect results.
+* Made `AdvancedColumnFamilyOptions.bottommost_temperature` dynamically changeable with `SetOptions()`.
+
+### Behavior Changes
+* `DB::DestroyColumnFamilyHandle()` will return Status::InvalidArgument() if called with `DB::DefaultColumnFamily()`.
+* On 32-bit platforms, mmap reads are no longer quietly disabled, just discouraged.
+
+### New Features
+* Added `Options::DisableExtraChecks()` that can be used to improve peak write performance by disabling checks that should not be necessary in the absence of software logic errors or CPU+memory hardware errors. (Default options are slowly moving toward some performance overheads for extra correctness checking.)
+
+### Performance Improvements
+* Improved read performance when a prefix extractor is used (Seek, Get, MultiGet), even compared to version 6.25 baseline (see bug fix below), by optimizing the common case of prefix extractor compatible with table file and unchanging.
+
+### Bug Fixes
+* Fix a bug that FlushMemTable may return ok even flush not succeed.
+* Fixed a bug of Sync() and Fsync() not using `fcntl(F_FULLFSYNC)` on OS X and iOS.
+* Fixed a significant performance regression in version 6.26 when a prefix extractor is used on the read path (Seek, Get, MultiGet). (Excessive time was spent in SliceTransform::AsString().)
+* Fixed a race condition in SstFileManagerImpl error recovery code that can cause a crash during process shutdown.
+
+### New Features
+* Added RocksJava support for MacOS universal binary (ARM+x86)
+
+## 6.28.0 (2021-12-17)
+### New Features
+* Introduced 'CommitWithTimestamp' as a new tag. Currently, there is no API for user to trigger a write with this tag to the WAL. This is part of the efforts to support write-commited transactions with user-defined timestamps.
+* Introduce SimulatedHybridFileSystem which can help simulating HDD latency in db_bench. Tiered Storage latency simulation can be enabled using -simulate_hybrid_fs_file (note that it doesn't work if db_bench is interrupted in the middle). -simulate_hdd can also be used to simulate all files on HDD.
+
+### Bug Fixes
+* Fixed a bug in rocksdb automatic implicit prefetching which got broken because of new feature adaptive_readahead and internal prefetching got disabled when iterator moves from one file to next.
+* Fixed a bug in TableOptions.prepopulate_block_cache which causes segmentation fault when used with TableOptions.partition_filters = true and TableOptions.cache_index_and_filter_blocks = true.
+* Fixed a bug affecting custom memtable factories which are not registered with the `ObjectRegistry`. The bug could result in failure to save the OPTIONS file.
+* Fixed a bug causing two duplicate entries to be appended to a file opened in non-direct mode and tracked by `FaultInjectionTestFS`.
+* Fixed a bug in TableOptions.prepopulate_block_cache to support block-based filters also.
+* Block cache keys no longer use `FSRandomAccessFile::GetUniqueId()` (previously used when available), so a filesystem recycling unique ids can no longer lead to incorrect result or crash (#7405). For files generated by RocksDB >= 6.24, the cache keys are stable across DB::Open and DB directory move / copy / import / export / migration, etc. Although collisions are still theoretically possible, they are (a) impossible in many common cases, (b) not dependent on environmental factors, and (c) much less likely than a CPU miscalculation while executing RocksDB.
+* Fixed a bug in C bindings causing iterator to return incorrect result (#9343).
+
+### Behavior Changes
+* MemTableList::TrimHistory now use allocated bytes when max_write_buffer_size_to_maintain > 0(default in TrasactionDB, introduced in PR#5022) Fix #8371.
+
+### Public API change
+* Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes.
+* Introduce a new EventListener callback that will be called upon the end of automatic error recovery.
+* Add IncreaseFullHistoryTsLow API so users can advance each column family's full_history_ts_low seperately.
+* Add GetFullHistoryTsLow API so users can query current full_history_low value of specified column family.
+
+### Performance Improvements
+* Replaced map property `TableProperties::properties_offsets`  with uint64_t property `external_sst_file_global_seqno_offset` to save table properties's memory.
+* Block cache accesses are faster by RocksDB using cache keys of fixed size (16 bytes).
+
+### Java API Changes
+* Removed Java API `TableProperties.getPropertiesOffsets()` as it exposed internal details to external users.
+
+## 6.27.0 (2021-11-19)
+### New Features
+* Added new ChecksumType kXXH3 which is faster than kCRC32c on almost all x86\_64 hardware.
+* Added a new online consistency check for BlobDB which validates that the number/total size of garbage blobs does not exceed the number/total size of all blobs in any given blob file.
+* Provided support for tracking per-sst user-defined timestamp information in MANIFEST.
+* Added new option "adaptive_readahead" in ReadOptions. For iterators, RocksDB does auto-readahead on noticing sequential reads and by enabling this option, readahead_size of current file (if reads are sequential) will be carried forward to next file instead of starting from the scratch at each level (except L0 level files). If reads are not sequential it will fall back to 8KB. This option is applicable only for RocksDB internal prefetch buffer and isn't supported with underlying file system prefetching.
+* Added the read count and read bytes related stats to Statistics for tiered storage hot, warm, and cold file reads.
+* Added an option to dynamically charge an updating estimated memory usage of block-based table building to block cache if block cache available. It currently only includes charging memory usage of constructing (new) Bloom Filter and Ribbon Filter to block cache. To enable this feature, set `BlockBasedTableOptions::reserve_table_builder_memory = true`.
+* Add a new API OnIOError in listener.h that notifies listeners when an IO error occurs during FileSystem operation along with filename, status etc.
+* Added compaction readahead support for blob files to the integrated BlobDB implementation, which can improve compaction performance when the database resides on higher-latency storage like HDDs or remote filesystems. Readahead can be configured using the column family option `blob_compaction_readahead_size`.
+
+### Bug Fixes
+* Prevent a `CompactRange()` with `CompactRangeOptions::change_level == true` from possibly causing corruption to the LSM state (overlapping files within a level) when run in parallel with another manual compaction. Note that setting `force_consistency_checks == true` (the default) would cause the DB to enter read-only mode in this scenario and return `Status::Corruption`, rather than committing any corruption.
+* Fixed a bug in CompactionIterator when write-prepared transaction is used. A released earliest write conflict snapshot may cause assertion failure in dbg mode and unexpected key in opt mode.
+* Fix ticker WRITE_WITH_WAL("rocksdb.write.wal"), this bug is caused by a bad extra `RecordTick(stats_, WRITE_WITH_WAL)` (at 2 place), this fix remove the extra `RecordTick`s and fix the corresponding test case.
+* EventListener::OnTableFileCreated was previously called with OK status and file_size==0 in cases of no SST file contents written (because there was no content to add) and the empty file deleted before calling the listener. Now the status is Aborted.
+* Fixed a bug in CompactionIterator when write-preared transaction is used. Releasing earliest_snapshot during compaction may cause a SingleDelete to be output after a PUT of the same user key whose seq has been zeroed.
+* Added input sanitization on negative bytes passed into `GenericRateLimiter::Request`.
+* Fixed an assertion failure in CompactionIterator when write-prepared transaction is used. We prove that certain operations can lead to a Delete being followed by a SingleDelete (same user key). We can drop the SingleDelete.
+* Fixed a bug of timestamp-based GC which can cause all versions of a key under full_history_ts_low to be dropped. This bug will be triggered when some of the ikeys' timestamps are lower than full_history_ts_low, while others are newer.
+* In some cases outside of the DB read and compaction paths, SST block checksums are now checked where they were not before.
+* Explicitly check for and disallow the `BlockBasedTableOptions` if insertion into one of {`block_cache`, `block_cache_compressed`, `persistent_cache`} can show up in another of these. (RocksDB expects to be able to use the same key for different physical data among tiers.)
+* Users who configured a dedicated thread pool for bottommost compactions by explicitly adding threads to the `Env::Priority::BOTTOM` pool will no longer see RocksDB schedule automatic compactions exceeding the DB's compaction concurrency limit. For details on per-DB compaction concurrency limit, see API docs of `max_background_compactions` and `max_background_jobs`.
+* Fixed a bug of background flush thread picking more memtables to flush and prematurely advancing column family's log_number.
+* Fixed an assertion failure in ManifestTailer.
+* Fixed a bug that could, with WAL enabled, cause backups, checkpoints, and `GetSortedWalFiles()` to fail randomly with an error like `IO error: 001234.log: No such file or directory`
+
+### Behavior Changes
+* `NUM_FILES_IN_SINGLE_COMPACTION` was only counting the first input level files, now it's including all input files.
+* `TransactionUtil::CheckKeyForConflicts` can also perform conflict-checking based on user-defined timestamps in addition to sequence numbers.
+* Removed `GenericRateLimiter`'s minimum refill bytes per period previously enforced.
+
+### Public API change
+* When options.ttl is used with leveled compaction with compactinon priority kMinOverlappingRatio, files exceeding half of TTL value will be prioritized more, so that by the time TTL is reached, fewer extra compactions will be scheduled to clear them up. At the same time, when compacting files with data older than half of TTL, output files may be cut off based on those files' boundaries, in order for the early TTL compaction to work properly.
+* Made FileSystem and RateLimiter extend the Customizable class and added a CreateFromString method.  Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method.
+* Clarified in API comments that RocksDB is not exception safe for callbacks and custom extensions. An exception propagating into RocksDB can lead to undefined behavior, including data loss, unreported corruption, deadlocks, and more.
+* Marked `WriteBufferManager` as `final` because it is not intended for extension.
+* Removed unimportant implementation details from table_properties.h
+* Add API `FSDirectory::FsyncWithDirOptions()`, which provides extra information like directory fsync reason in `DirFsyncOptions`. File system like btrfs is using that to skip directory fsync for creating a new file, or when renaming a file, fsync the target file instead of the directory, which improves the `DB::Open()` speed by ~20%.
+* `DB::Open()` is not going be blocked by obsolete file purge if `DBOptions::avoid_unnecessary_blocking_io` is set to true.
+* In builds where glibc provides `gettid()`, info log ("LOG" file) lines now print a system-wide thread ID from `gettid()` instead of the process-local `pthread_self()`. For all users, the thread ID format is changed from hexadecimal to decimal integer.
+* In builds where glibc provides `pthread_setname_np()`, the background thread names no longer contain an ID suffix. For example, "rocksdb:bottom7" (and all other threads in the `Env::Priority::BOTTOM` pool) are now named "rocksdb:bottom". Previously large thread pools could breach the name size limit (e.g., naming "rocksdb:bottom10" would fail).
+* Deprecating `ReadOptions::iter_start_seqnum` and `DBOptions::preserve_deletes`, please try using user defined timestamp feature instead. The options will be removed in a future release, currently it logs a warning message when using.
+
+### Performance Improvements
+* Released some memory related to filter construction earlier in `BlockBasedTableBuilder` for `FullFilter` and `PartitionedFilter` case (#9070)
+
+### Behavior Changes
+* `NUM_FILES_IN_SINGLE_COMPACTION` was only counting the first input level files, now it's including all input files.
+
+## 6.26.0 (2021-10-20)
+### Bug Fixes
+* Fixes a bug in directed IO mode when calling MultiGet() for blobs in the same blob file. The bug is caused by not sorting the blob read requests by file offsets.
+* Fix the incorrect disabling of SST rate limited deletion when the WAL and DB are in different directories. Only WAL rate limited deletion should be disabled if its in a different directory.
+* Fix `DisableManualCompaction()` to cancel compactions even when they are waiting on automatic compactions to drain due to `CompactRangeOptions::exclusive_manual_compactions == true`.
+* Fix contract of `Env::ReopenWritableFile()` and `FileSystem::ReopenWritableFile()` to specify any existing file must not be deleted or truncated.
+* Fixed bug in calls to `IngestExternalFiles()` with files for multiple column families. The bug could have introduced a delay in ingested file keys becoming visible after `IngestExternalFiles()` returned. Furthermore, mutations to ingested file keys while they were invisible could have been dropped (not necessarily immediately).
+* Fixed a possible race condition impacting users of `WriteBufferManager` who constructed it with `allow_stall == true`. The race condition led to undefined behavior (in our experience, typically a process crash).
+* Fixed a bug where stalled writes would remain stalled forever after the user calls `WriteBufferManager::SetBufferSize()` with `new_size == 0` to dynamically disable memory limiting.
+* Make `DB::close()` thread-safe.
+* Fix a bug in atomic flush where one bg flush thread will wait forever for a preceding bg flush thread to commit its result to MANIFEST but encounters an error which is mapped to a soft error (DB not stopped).
+* Fix a bug in `BackupEngine` where some internal callers of `GenericRateLimiter::Request()` do not honor `bytes <= GetSingleBurstBytes()`.
+
+### New Features
+* Print information about blob files when using "ldb list_live_files_metadata"
+* Provided support for SingleDelete with user defined timestamp.
+* Experimental new function DB::GetLiveFilesStorageInfo offers essentially a unified version of other functions like GetLiveFiles, GetLiveFilesChecksumInfo, and GetSortedWalFiles. Checkpoints and backups could show small behavioral changes and/or improved performance as they now use this new API.
+* Add remote compaction read/write bytes statistics: `REMOTE_COMPACT_READ_BYTES`, `REMOTE_COMPACT_WRITE_BYTES`.
+* Introduce an experimental feature to dump out the blocks from block cache and insert them to the secondary cache to reduce the cache warmup time (e.g., used while migrating DB instance). More information are in `class CacheDumper` and `CacheDumpedLoader` at `rocksdb/utilities/cache_dump_load.h` Note that, this feature is subject to the potential change in the future, it is still experimental.
+* Introduced a new BlobDB configuration option `blob_garbage_collection_force_threshold`, which can be used to trigger compactions targeting the SST files which reference the oldest blob files when the ratio of garbage in those blob files meets or exceeds the specified threshold. This can reduce space amplification with skewed workloads where the affected SST files might not otherwise get picked up for compaction.
+* Added EXPERIMENTAL support for table file (SST) unique identifiers that are stable and universally unique, available with new function `GetUniqueIdFromTableProperties`. Only SST files from RocksDB >= 6.24 support unique IDs.
+* Added `GetMapProperty()` support for "rocksdb.dbstats" (`DB::Properties::kDBStats`). As a map property, it includes DB-level internal stats accumulated over the DB's lifetime, such as user write related stats and uptime.
+
+### Public API change
+* Made SystemClock extend the Customizable class and added a CreateFromString method.  Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method.
+* Made SliceTransform extend the Customizable class and added a CreateFromString method.  Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method.  The Capped and Prefixed transform classes return a short name (no length); use GetId for the fully qualified name.
+* Made FileChecksumGenFactory, SstPartitionerFactory, TablePropertiesCollectorFactory, and WalFilter extend the Customizable class and added a CreateFromString method.
+* Some fields of SstFileMetaData are deprecated for compatibility with new base class FileStorageInfo.
+* Add `file_temperature` to `IngestExternalFileArg` such that when ingesting SST files, we are able to indicate the temperature of the this batch of files.
+* If `DB::Close()` failed with a non aborted status, calling `DB::Close()` again will return the original status instead of Status::OK.
+* Add CacheTier to advanced_options.h to describe the cache tier we used. Add a `lowest_used_cache_tier` option to `DBOptions` (immutable) and pass it to BlockBasedTableReader. By default it is `CacheTier::kNonVolatileBlockTier`, which means, we always use both block cache (kVolatileTier) and secondary cache (kNonVolatileBlockTier). By set it to `CacheTier::kVolatileTier`, the DB will not use the secondary cache.
+* Even when options.max_compaction_bytes is hit, compaction output files are only cut when it aligns with grandparent files' boundaries. options.max_compaction_bytes could be slightly violated with the change, but the violation is no more than one target SST file size, which is usually much smaller.
+
+### Performance Improvements
+* Improved CPU efficiency of building block-based table (SST) files (#9039 and #9040).
+
+### Java API Changes
+* Add Java API bindings for new integrated BlobDB options
+* `keyMayExist()` supports ByteBuffer.
+* Fix multiget throwing Null Pointer Exception for num of keys > 70k (https://github.com/facebook/rocksdb/issues/8039).
+
+## 6.25.0 (2021-09-20)
+### Bug Fixes
+* Allow secondary instance to refresh iterator. Assign read seq after referencing SuperVersion.
+* Fixed a bug of secondary instance's last_sequence going backward, and reads on the secondary fail to see recent updates from the primary.
+* Fixed a bug that could lead to duplicate DB ID or DB session ID in POSIX environments without /proc/sys/kernel/random/uuid.
+* Fix a race in DumpStats() with column family destruction due to not taking a Ref on each entry while iterating the ColumnFamilySet.
+* Fix a race in item ref counting in LRUCache when promoting an item from the SecondaryCache.
+* Fix a race in BackupEngine if RateLimiter is reconfigured during concurrent Restore operations.
+* Fix a bug on POSIX in which failure to create a lock file (e.g. out of space) can prevent future LockFile attempts in the same process on the same file from succeeding.
+* Fix a bug that backup_rate_limiter and restore_rate_limiter in BackupEngine could not limit read rates.
+* Fix the implementation of `prepopulate_block_cache = kFlushOnly` to only apply to flushes rather than to all generated files.
+* Fix WAL log data corruption when using DBOptions.manual_wal_flush(true) and WriteOptions.sync(true) together. The sync WAL should work with locked log_write_mutex_.
+* Add checks for validity of the IO uring completion queue entries, and fail the BlockBasedTableReader MultiGet sub-batch if there's an invalid completion
+* Add an interface RocksDbIOUringEnable() that, if defined by the user, will allow them to enable/disable the use of IO uring by RocksDB
+* Fix the bug that when direct I/O is used and MultiRead() returns a short result, RandomAccessFileReader::MultiRead() still returns full size buffer, with returned short value together with some data in original buffer. This bug is unlikely cause incorrect results, because (1) since FileSystem layer is expected to retry on short result, returning short results is only possible when asking more bytes in the end of the file, which RocksDB doesn't do when using MultiRead(); (2) checksum is unlikely to match.
+
+### New Features
+* RemoteCompaction's interface now includes `db_name`, `db_id`, `session_id`, which could help the user uniquely identify compaction job between db instances and sessions.
+* Added a ticker statistic, "rocksdb.verify_checksum.read.bytes", reporting how many bytes were read from file to serve `VerifyChecksum()` and `VerifyFileChecksums()` queries.
+* Added ticker statistics, "rocksdb.backup.read.bytes" and "rocksdb.backup.write.bytes", reporting how many bytes were read and written during backup.
+* Added properties for BlobDB: `rocksdb.num-blob-files`, `rocksdb.blob-stats`, `rocksdb.total-blob-file-size`, and `rocksdb.live-blob-file-size`. The existing property `rocksdb.estimate_live-data-size` was also extended to include live bytes residing in blob files.
+* Added two new RateLimiter IOPriorities: `Env::IO_USER`,`Env::IO_MID`. `Env::IO_USER` will have superior priority over all other RateLimiter IOPriorities without being subject to fair scheduling constraint.
+* `SstFileWriter` now supports `Put`s and `Delete`s with user-defined timestamps. Note that the ingestion logic itself is not timestamp-aware yet.
+* Allow a single write batch to include keys from multiple column families whose timestamps' formats can differ. For example, some column families may disable timestamp, while others enable timestamp.
+* Add compaction priority information in RemoteCompaction, which can be used to schedule high priority job first.
+* Added new callback APIs `OnBlobFileCreationStarted`,`OnBlobFileCreated`and `OnBlobFileDeleted` in `EventListener` class of listener.h. It notifies listeners during creation/deletion of individual blob files in Integrated BlobDB. It also log blob file creation finished event and deletion event in LOG file.
+* Batch blob read requests for `DB::MultiGet` using `MultiRead`.
+* Add support for fallback to local compaction, the user can return `CompactionServiceJobStatus::kUseLocal` to instruct RocksDB to run the compaction locally instead of waiting for the remote compaction result.
+* Add built-in rate limiter's implementation of `RateLimiter::GetTotalPendingRequest(int64_t* total_pending_requests, const Env::IOPriority pri)` for the total number of requests that are pending for bytes in the rate limiter.
+* Charge memory usage during data buffering, from which training samples are gathered for dictionary compression, to block cache. Unbuffering data can now be triggered if the block cache becomes full and `strict_capacity_limit=true` for the block cache, in addition to existing conditions that can trigger unbuffering.
+
+### Public API change
+* Remove obsolete implementation details FullKey and ParseFullKey from public API
+* Change `SstFileMetaData::size` from `size_t` to `uint64_t`.
+* Made Statistics extend the Customizable class and added a CreateFromString method.  Implementations of Statistics need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method.
+* Extended `FlushJobInfo` and `CompactionJobInfo` in listener.h to provide information about the blob files generated by a flush/compaction and garbage collected during compaction in Integrated BlobDB. Added struct members `blob_file_addition_infos` and `blob_file_garbage_infos` that contain this information.
+* Extended parameter `output_file_names` of `CompactFiles` API to also include paths of the blob files generated by the compaction in Integrated BlobDB.
+* Most `BackupEngine` functions now return `IOStatus` instead of `Status`. Most existing code should be compatible with this change but some calls might need to be updated.
+* Add a new field `level_at_creation` in `TablePropertiesCollectorFactory::Context` to capture the level at creating the SST file (i.e, table), of which the properties are being collected.
+
+### Miscellaneous
+* Add a paranoid check where in case FileSystem layer doesn't fill the buffer but returns succeed, checksum is unlikely to match even if buffer contains a previous block. The byte modified is not useful anyway, so it isn't expected to change any behavior when FileSystem is satisfying its contract.
+
+## 6.24.0 (2021-08-20)
+### Bug Fixes
+* If the primary's CURRENT file is missing or inaccessible, the secondary instance should not hang repeatedly trying to switch to a new MANIFEST. It should instead return the error code encountered while accessing the file.
+* Restoring backups with BackupEngine is now a logically atomic operation, so that if a restore operation is interrupted, DB::Open on it will fail. Using BackupEngineOptions::sync (default) ensures atomicity even in case of power loss or OS crash.
+* Fixed a race related to the destruction of `ColumnFamilyData` objects. The earlier logic unlocked the DB mutex before destroying the thread-local `SuperVersion` pointers, which could result in a process crash if another thread managed to get a reference to the `ColumnFamilyData` object.
+* Removed a call to `RenameFile()` on a non-existent info log file ("LOG") when opening a new DB. Such a call was guaranteed to fail though did not impact applications since we swallowed the error. Now we also stopped swallowing errors in renaming "LOG" file.
+* Fixed an issue where `OnFlushCompleted` was not called for atomic flush.
+* Fixed a bug affecting the batched `MultiGet` API when used with keys spanning multiple column families and `sorted_input == false`.
+* Fixed a potential incorrect result in opt mode and assertion failures caused by releasing snapshot(s) during compaction.
+* Fixed passing of BlobFileCompletionCallback to Compaction job and Atomic flush job which was default paramter (nullptr). BlobFileCompletitionCallback is internal callback that manages addition of blob files to SSTFileManager.
+* Fixed MultiGet not updating the block_read_count and block_read_byte PerfContext counters.
+
+### New Features
+* Made the EventListener extend the Customizable class.
+* EventListeners that have a non-empty Name() and that are registered with the ObjectRegistry can now be serialized to/from the OPTIONS file.
+* Insert warm blocks (data blocks, uncompressed dict blocks, index and filter blocks) in Block cache during flush under option BlockBasedTableOptions.prepopulate_block_cache. Previously it was enabled for only data blocks.
+* BlockBasedTableOptions.prepopulate_block_cache can be dynamically configured using DB::SetOptions.
+* Add CompactionOptionsFIFO.age_for_warm, which allows RocksDB to move old files to warm tier in FIFO compactions. Note that file temperature is still an experimental feature.
+* Add a comment to suggest btrfs user to disable file preallocation by setting `options.allow_fallocate=false`.
+* Fast forward option in Trace replay changed to double type to allow replaying at a lower speed, by settings the value between 0 and 1. This option can be set via `ReplayOptions` in `Replayer::Replay()`, or via `--trace_replay_fast_forward` in db_bench.
+* Add property `LiveSstFilesSizeAtTemperature` to retrieve sst file size at different temperature.
+* Added a stat rocksdb.secondary.cache.hits.
+* Added a PerfContext counter secondary_cache_hit_count.
+* The integrated BlobDB implementation now supports the tickers `BLOB_DB_BLOB_FILE_BYTES_READ`, `BLOB_DB_GC_NUM_KEYS_RELOCATED`, and `BLOB_DB_GC_BYTES_RELOCATED`, as well as the histograms `BLOB_DB_COMPRESSION_MICROS` and `BLOB_DB_DECOMPRESSION_MICROS`.
+* Added hybrid configuration of Ribbon filter and Bloom filter where some LSM levels use Ribbon for memory space efficiency and some use Bloom for speed. See NewRibbonFilterPolicy. This also changes the default behavior of NewRibbonFilterPolicy to use Bloom for flushes under Leveled and Universal compaction and Ribbon otherwise. The C API function `rocksdb_filterpolicy_create_ribbon` is unchanged but adds new `rocksdb_filterpolicy_create_ribbon_hybrid`.
+
+### Public API change
+* Added APIs to decode and replay trace file via Replayer class. Added `DB::NewDefaultReplayer()` to create a default Replayer instance. Added `TraceReader::Reset()` to restart reading a trace file. Created trace_record.h, trace_record_result.h and utilities/replayer.h files to access the decoded Trace records, replay them, and query the actual operation results.
+* Added Configurable::GetOptionsMap to the public API for use in creating new Customizable classes.
+* Generalized bits_per_key parameters in C API from int to double for greater configurability. Although this is a compatible change for existing C source code, anything depending on C API signatures, such as foreign function interfaces, will need to be updated.
+
+### Performance Improvements
+* Try to avoid updating DBOptions if `SetDBOptions()` does not change any option value.
+
+### Behavior Changes
+* `StringAppendOperator` additionally accepts a string as the delimiter.
+* BackupEngineOptions::sync (default true) now applies to restoring backups in addition to creating backups. This could slow down restores, but ensures they are fully persisted before returning OK. (Consider increasing max_background_operations to improve performance.)
+
+## 6.23.0 (2021-07-16)
+### Behavior Changes
+* Obsolete keys in the bottommost level that were preserved for a snapshot will now be cleaned upon snapshot release in all cases. This form of compaction (snapshot release triggered compaction) previously had an artificial limitation that multiple tombstones needed to be present.
+### Bug Fixes
+* Blob file checksums are now printed in hexadecimal format when using the `manifest_dump` `ldb` command.
+* `GetLiveFilesMetaData()` now populates the `temperature`, `oldest_ancester_time`, and `file_creation_time` fields of its `LiveFileMetaData` results when the information is available. Previously these fields always contained zero indicating unknown.
+* Fix mismatches of OnCompaction{Begin,Completed} in case of DisableManualCompaction().
+* Fix continuous logging of an existing background error on every user write
+* Fix a bug that `Get()` return Status::OK() and an empty value for non-existent key when `read_options.read_tier = kBlockCacheTier`.
+* Fix a bug that stat in `get_context` didn't accumulate to statistics when query is failed.
+* Fixed handling of DBOptions::wal_dir with LoadLatestOptions() or ldb --try_load_options on a copied or moved DB. Previously, when the WAL directory is same as DB directory (default), a copied or moved DB would reference the old path of the DB as the WAL directory, potentially corrupting both copies. Under this change, the wal_dir from DB::GetOptions() or LoadLatestOptions() may now be empty, indicating that the current DB directory is used for WALs. This is also a subtle API change.
+
+### New Features
+* ldb has a new feature, `list_live_files_metadata`, that shows the live SST files, as well as their LSM storage level and the column family they belong to.
+* The new BlobDB implementation now tracks the amount of garbage in each blob file in the MANIFEST.
+* Integrated BlobDB now supports Merge with base values (Put/Delete etc.).
+* RemoteCompaction supports sub-compaction, the job_id in the user interface is changed from `int` to `uint64_t` to support sub-compaction id.
+* Expose statistics option in RemoteCompaction worker.
+
+### Public API change
+* Added APIs to the Customizable class to allow developers to create their own Customizable classes.  Created the utilities/customizable_util.h file to contain helper methods for developing new Customizable classes.
+* Change signature of SecondaryCache::Name().  Make SecondaryCache customizable and add SecondaryCache::CreateFromString method.
+
+## 6.22.0 (2021-06-18)
+### Behavior Changes
+* Added two additional tickers, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH. These stats can be used to estimate the ratio of "garbage" (outdated) bytes in the memtable that are discarded at flush time.
+* Added API comments clarifying safe usage of Disable/EnableManualCompaction and EventListener callbacks for compaction.
+### Bug Fixes
+* fs_posix.cc GetFreeSpace() always report disk space available to root even when running as non-root.  Linux defaults often have disk mounts with 5 to 10 percent of total space reserved only for root.  Out of space could result for non-root users.
+* Subcompactions are now disabled when user-defined timestamps are used, since the subcompaction boundary picking logic is currently not timestamp-aware, which could lead to incorrect results when different subcompactions process keys that only differ by timestamp.
+* Fix an issue that `DeleteFilesInRange()` may cause ongoing compaction reports corruption exception, or ASSERT for debug build. There's no actual data loss or corruption that we find.
+* Fixed confusingly duplicated output in LOG for periodic stats ("DUMPING STATS"), including "Compaction Stats" and "File Read Latency Histogram By Level".
+* Fixed performance bugs in background gathering of block cache entry statistics, that could consume a lot of CPU when there are many column families with a shared block cache.
+
+### New Features
+* Marked the Ribbon filter and optimize_filters_for_memory features as production-ready, each enabling memory savings for Bloom-like filters. Use `NewRibbonFilterPolicy` in place of `NewBloomFilterPolicy` to use Ribbon filters instead of Bloom, or `ribbonfilter` in place of `bloomfilter` in configuration string.
+* Allow `DBWithTTL` to use `DeleteRange` api just like other DBs. `DeleteRangeCF()` which executes `WriteBatchInternal::DeleteRange()` has been added to the handler in `DBWithTTLImpl::Write()` to implement it.
+* Add BlockBasedTableOptions.prepopulate_block_cache.  If enabled, it prepopulate warm/hot data blocks which are already in memory into block cache at the time of flush. On a flush, the data block that is in memory (in memtables) get flushed to the device. If using Direct IO, additional IO is incurred to read this data back into memory again, which is avoided by enabling this option and it also helps with Distributed FileSystem. More details in include/rocksdb/table.h.
+* Added a `cancel` field to `CompactRangeOptions`, allowing individual in-process manual range compactions to be cancelled.
+
+### New Features
+* Added BlobMetaData to the ColumnFamilyMetaData to return information about blob files
+
+### Public API change
+* Added GetAllColumnFamilyMetaData API to retrieve the ColumnFamilyMetaData about all column families.
+
+## 6.21.0 (2021-05-21)
+### Bug Fixes
+* Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but client returns error. The bug can cause CURRENT file to point to non-existing MANIFEST file, thus DB cannot be opened.
+* Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results.
+* Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`.
+* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted.
+* Fixed a bug where `GetLiveFiles()` output included a non-existent file called "OPTIONS-000000". Backups and checkpoints, which use `GetLiveFiles()`, failed on DBs impacted by this bug. Read-write DBs were impacted when the latest OPTIONS file failed to write and `fail_if_options_file_error == false`. Read-only DBs were impacted when no OPTIONS files existed.
+* Handle return code by io_uring_submit_and_wait() and io_uring_wait_cqe().
+* In the IngestExternalFile() API, only try to sync the ingested file if the file is linked and the FileSystem/Env supports reopening a writable file.
+* Fixed a bug that `AdvancedColumnFamilyOptions.max_compaction_bytes` is under-calculated for manual compaction (`CompactRange()`). Manual compaction is split to multiple compactions if the compaction size exceed the `max_compaction_bytes`. The bug creates much larger compaction which size exceed the user setting. On the other hand, larger manual compaction size can increase the subcompaction parallelism, you can tune that by setting `max_compaction_bytes`.
+
+### Behavior Changes
+* Due to the fix of false-postive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status.
+
+### New Features
+* Add new option allow_stall passed during instance creation of WriteBufferManager. When allow_stall is set, WriteBufferManager will stall all writers shared across multiple DBs and columns if memory usage goes beyond specified WriteBufferManager::buffer_size (soft limit). Stall will be cleared when memory is freed after flush and memory usage goes down below buffer_size.
+* Allow `CompactionFilter`s to apply in more table file creation scenarios such as flush and recovery. For compatibility, `CompactionFilter`s by default apply during compaction. Users can customize this behavior by overriding `CompactionFilterFactory::ShouldFilterTableFileCreation()`.
+* Added more fields to FilterBuildingContext with LSM details, for custom filter policies that vary behavior based on where they are in the LSM-tree.
+* Added DB::Properties::kBlockCacheEntryStats for querying statistics on what percentage of block cache is used by various kinds of blocks, etc. using DB::GetProperty and DB::GetMapProperty. The same information is now dumped to info LOG periodically according to `stats_dump_period_sec`.
+* Add an experimental Remote Compaction feature, which allows the user to run Compaction on a different host or process. The feature is still under development, currently only works on some basic use cases. The interface will be changed without backward/forward compatibility support.
+* RocksDB would validate total entries read in flush, and compare with counter inserted into it. If flush_verify_memtable_count = true (default), flush will fail. Otherwise, only log to info logs.
+* Add `TableProperties::num_filter_entries`, which can be used with `TableProperties::filter_size` to calculate the effective bits per filter entry (unique user key or prefix) for a table file.
+
+### Performance Improvements
+* BlockPrefetcher is used by iterators to prefetch data if they anticipate more data to be used in future. It is enabled implicitly by rocksdb. Added change to take in account read pattern if reads are sequential. This would disable prefetching for random reads in MultiGet and iterators as readahead_size is increased exponential doing large prefetches.
+
+### Public API change
+* Removed a parameter from TableFactory::NewTableBuilder, which should not be called by user code because TableBuilder is not a public API.
+* Removed unused structure `CompactionFilterContext`.
+* The `skip_filters` parameter to SstFileWriter is now considered deprecated. Use `BlockBasedTableOptions::filter_policy` to control generation of filters.
+* ClockCache is known to have bugs that could lead to crash or corruption, so should not be used until fixed. Use NewLRUCache instead.
+* Added a new pure virtual function `ApplyToAllEntries` to `Cache`, to replace `ApplyToAllCacheEntries`. Custom `Cache` implementations must add an implementation. Because this function is for gathering statistics, an empty implementation could be acceptable for some applications.
+* Added the ObjectRegistry to the ConfigOptions class.  This registry instance will be used to find any customizable loadable objects during initialization.
+* Expanded the ObjectRegistry functionality to allow nested ObjectRegistry instances.  Added methods to register a set of functions with the registry/library as a group.
+* Deprecated backupable_db.h and BackupableDBOptions in favor of new versions with appropriate names: backup_engine.h and BackupEngineOptions. Old API compatibility is preserved.
+
+### Default Option Change
+* When options.arena_block_size <= 0 (default value 0), still use writer_buffer_size / 8 but cap to 1MB. Too large alloation size might not be friendly to allocator and might cause performance issues in extreme cases.
+
+### Build
+* By default, try to build with liburing. For make, if ROCKSDB_USE_IO_URING is not set, treat as enable, which means RocksDB will try to build with liburing. Users can disable it with ROCKSDB_USE_IO_URING=0. For cmake, add WITH_LIBURING to control it, with default on.
+
+## 6.20.0 (2021-04-16)
+### Behavior Changes
+* `ColumnFamilyOptions::sample_for_compression` now takes effect for creation of all block-based tables. Previously it only took effect for block-based tables created by flush.
+* `CompactFiles()` can no longer compact files from lower level to up level, which has the risk to corrupt DB (details: #8063). The validation is also added to all compactions.
+* Fixed some cases in which DB::OpenForReadOnly() could write to the filesystem. If you want a Logger with a read-only DB, you must now set DBOptions::info_log yourself, such as using CreateLoggerFromOptions().
+* get_iostats_context() will never return nullptr. If thread-local support is not available, and user does not opt-out iostats context, then compilation will fail. The same applies to perf context as well.
+* Added support for WriteBatchWithIndex::NewIteratorWithBase when overwrite_key=false.  Previously, this combination was not supported and would assert or return nullptr.
+* Improve the behavior of WriteBatchWithIndex for Merge operations.  Now more operations may be stored in order to return the correct merged result.
+
+### Bug Fixes
+* Use thread-safe `strerror_r()` to get error messages.
+* Fixed a potential hang in shutdown for a DB whose `Env` has high-pri thread pool disabled (`Env::GetBackgroundThreads(Env::Priority::HIGH) == 0`)
+* Made BackupEngine thread-safe and added documentation comments to clarify what is safe for multiple BackupEngine objects accessing the same backup directory.
+* Fixed crash (divide by zero) when compression dictionary is applied to a file containing only range tombstones.
+* Fixed a backward iteration bug with partitioned filter enabled: not including the prefix of the last key of the previous filter partition in current filter partition can cause wrong iteration result.
+* Fixed a bug that allowed `DBOptions::max_open_files` to be set with a non-negative integer with `ColumnFamilyOptions::compaction_style = kCompactionStyleFIFO`.
+
+### Performance Improvements
+* On ARM platform, use `yield` instead of `wfe` to relax cpu to gain better performance.
+
+### Public API change
+* Added `TableProperties::slow_compression_estimated_data_size` and `TableProperties::fast_compression_estimated_data_size`. When `ColumnFamilyOptions::sample_for_compression > 0`, they estimate what `TableProperties::data_size` would have been if the "fast" or "slow" (see `ColumnFamilyOptions::sample_for_compression` API doc for definitions) compression had been used instead.
+* Update DB::StartIOTrace and remove Env object from the arguments as its redundant and DB already has Env object that is passed down to IOTracer::StartIOTrace
+* Added `FlushReason::kWalFull`, which is reported when a memtable is flushed due to the WAL reaching its size limit; those flushes were previously reported as `FlushReason::kWriteBufferManager`. Also, changed the reason for flushes triggered by the write buffer manager to `FlushReason::kWriteBufferManager`; they were previously reported as `FlushReason::kWriteBufferFull`.
+* Extend file_checksum_dump ldb command and DB::GetLiveFilesChecksumInfo API for IntegratedBlobDB and get checksum of blob files along with SST files.
+
+### New Features
+* Added the ability to open BackupEngine backups as read-only DBs, using BackupInfo::name_for_open and env_for_open provided by BackupEngine::GetBackupInfo() with include_file_details=true.
+* Added BackupEngine support for integrated BlobDB, with blob files shared between backups when table files are shared. Because of current limitations, blob files always use the kLegacyCrc32cAndFileSize naming scheme, and incremental backups must read and checksum all blob files in a DB, even for files that are already backed up.
+* Added an optional output parameter to BackupEngine::CreateNewBackup(WithMetadata) to return the BackupID of the new backup.
+* Added BackupEngine::GetBackupInfo / GetLatestBackupInfo for querying individual backups.
+* Made the Ribbon filter a long-term supported feature in terms of the SST schema(compatible with version >= 6.15.0) though the API for enabling it is expected to change.
+
+## 6.19.0 (2021-03-21)
+### Bug Fixes
+* Fixed the truncation error found in APIs/tools when dumping block-based SST files in a human-readable format. After fix, the block-based table can be fully dumped as a readable file.
+* When hitting a write slowdown condition, no write delay (previously 1 millisecond) is imposed until `delayed_write_rate` is actually exceeded, with an initial burst allowance of 1 millisecond worth of bytes. Also, beyond the initial burst allowance, `delayed_write_rate` is now more strictly enforced, especially with multiple column families.
+
+### Public API change
+* Changed default `BackupableDBOptions::share_files_with_checksum` to `true` and deprecated `false` because of potential for data loss. Note that accepting this change in behavior can temporarily increase backup data usage because files are not shared between backups using the two different settings. Also removed obsolete option kFlagMatchInterimNaming.
+* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators on noticing more than two reads for a table file if user doesn't provide readahead_size. The readahead starts at 8KB and doubles on every additional read upto max_auto_readahead_size and now max_auto_readahead_size can be configured dynamically as well. Found that 256 KB readahead size provides the best performance, based on experiments, for auto readahead. Experiment data is in PR #3282. If value is set 0 then no automatic prefetching will be done by rocksdb. Also changing the value will only affect files opened after the change.
+* Add suppport to extend DB::VerifyFileChecksums API to also verify blob files checksum.
+* When using the new BlobDB, the amount of data written by flushes/compactions is now broken down into table files and blob files in the compaction statistics; namely, Write(GB) denotes the amount of data written to table files, while Wblob(GB) means the amount of data written to blob files.
+* New default BlockBasedTableOptions::format_version=5 to enable new Bloom filter implementation by default, compatible with RocksDB versions >= 6.6.0.
+* Add new SetBufferSize API to WriteBufferManager to allow dynamic management of memory allotted to all write buffers.  This allows user code to adjust memory monitoring provided by WriteBufferManager as process memory needs change datasets grow and shrink.
+* Clarified the required semantics of Read() functions in FileSystem and Env APIs. Please ensure any custom implementations are compliant.
+* For the new integrated BlobDB implementation, compaction statistics now include the amount of data read from blob files during compaction (due to garbage collection or compaction filters). Write amplification metrics have also been extended to account for data read from blob files.
+* Add EqualWithoutTimestamp() to Comparator.
+* Extend support to track blob files in SSTFileManager whenever a blob file is created/deleted. Blob files will be scheduled to delete via SSTFileManager and SStFileManager will now take blob files in account while calculating size and space limits along with SST files.
+* Add new Append and PositionedAppend API with checksum handoff to legacy Env.
+
+### New Features
+* Support compaction filters for the new implementation of BlobDB. Add `FilterBlobByKey()` to `CompactionFilter`. Subclasses can override this method so that compaction filters can determine whether the actual blob value has to be read during compaction. Use a new `kUndetermined` in `CompactionFilter::Decision` to indicated that further action is necessary for compaction filter to make a decision.
+* Add support to extend retrieval of checksums for blob files from the MANIFEST when checkpointing. During backup, rocksdb can detect corruption in blob files  during file copies.
+* Add new options for db_bench --benchmarks: flush, waitforcompaction, compact0, compact1.
+* Add an option to BackupEngine::GetBackupInfo to include the name and size of each backed-up file. Especially in the presence of file sharing among backups, this offers detailed insight into backup space usage.
+* Enable backward iteration on keys with user-defined timestamps.
+* Add statistics and info log for error handler: counters for bg error, bg io error, bg retryable io error, auto resume count, auto resume total retry number, and auto resume sucess; Histogram for auto resume retry count in each recovery call. Note that, each auto resume attempt will have one or multiple retries.
+
+### Behavior Changes
+* During flush, only WAL sync retryable IO error is mapped to hard error, which will stall the writes. When WAL is used but only SST file write has retryable IO error, it will be mapped to soft error and write will not be affected.
+
+## 6.18.0 (2021-02-19)
+### Behavior Changes
+* When retryable IO error occurs during compaction, it is mapped to soft error and set the BG error. However, auto resume is not called to clean the soft error since compaction will reschedule by itself. In this change, When retryable IO error occurs during compaction, BG error is not set. User will be informed the error via EventHelper.
+* Introduce a new trace file format for query tracing and replay and trace file version is bump up to 0.2. A payload map is added as the first portion of the payload. We will not have backward compatible issues when adding new entries to trace records. Added the iterator_upper_bound and iterator_lower_bound in Seek and SeekForPrev tracing function. Added them as the new payload member for iterator tracing.
+
+### New Features
+* Add support for key-value integrity protection in live updates from the user buffers provided to `WriteBatch` through the write to RocksDB's in-memory update buffer (memtable). This is intended to detect some cases of in-memory data corruption, due to either software or hardware errors. Users can enable protection by constructing their `WriteBatch` with `protection_bytes_per_key == 8`.
+* Add support for updating `full_history_ts_low` option in manual compaction, which is for old timestamp data GC.
+* Add a mechanism for using Makefile to build external plugin code into the RocksDB libraries/binaries. This intends to simplify compatibility and distribution for plugins (e.g., special-purpose `FileSystem`s) whose source code resides outside the RocksDB repo. See "plugin/README.md" for developer details, and "PLUGINS.md" for a listing of available plugins.
+* Added memory pre-fetching for experimental Ribbon filter, which especially optimizes performance with batched MultiGet.
+* A new, experimental version of BlobDB (key-value separation) is now available. The new implementation is integrated into the RocksDB core, i.e. it is accessible via the usual `rocksdb::DB` API, as opposed to the separate `rocksdb::blob_db::BlobDB` interface used by the earlier version, and can be configured on a per-column family basis using the configuration options `enable_blob_files`, `min_blob_size`, `blob_file_size`, `blob_compression_type`, `enable_blob_garbage_collection`, and `blob_garbage_collection_age_cutoff`. It extends RocksDB's consistency guarantees to blobs, and offers more features and better performance. Note that some features, most notably `Merge`, compaction filters, and backup/restore are not yet supported, and there is no support for migrating a database created by the old implementation.
+
+### Bug Fixes
+* Since 6.15.0, `TransactionDB` returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. There are certain cases where range deletion can still be used on such DBs; see the API doc on `TransactionDB::DeleteRange()` for details.
+* `OptimisticTransactionDB` now returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees.
+* Fix `WRITE_PREPARED`, `WRITE_UNPREPARED` TransactionDB `MultiGet()` may return uncommitted data with snapshot.
+* In DB::OpenForReadOnly, if any error happens while checking Manifest file path, it was overridden by Status::NotFound. It has been fixed and now actual error is returned.
+
+### Public API Change
+* Added a "only_mutable_options" flag to the ConfigOptions.  When this flag is "true", the Configurable functions and convenience methods (such as GetDBOptionsFromString) will only deal with options that are marked as mutable.  When this flag is true, only options marked as mutable can be configured (a Status::InvalidArgument will be returned) and options not marked as mutable will not be returned or compared.  The default is "false", meaning to compare all options.
+* Add new Append and PositionedAppend APIs to FileSystem to bring the data verification information (data checksum information) from upper layer (e.g., WritableFileWriter) to the storage layer. In this way, the customized FileSystem is able to verify the correctness of data being written to the storage on time. Add checksum_handoff_file_types to DBOptions. User can use this option to control which file types (Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile.) should use the new Append and PositionedAppend APIs to handoff the verification information. Currently, RocksDB only use crc32c to calculate the checksum for write handoff.
+* Add an option, `CompressionOptions::max_dict_buffer_bytes`, to limit the in-memory buffering for selecting samples for generating/training a dictionary. The limit is currently loosely adhered to.
+
+
+## 6.17.0 (2021-01-15)
+### Behavior Changes
+* When verifying full file checksum with `DB::VerifyFileChecksums()`, we now fail with `Status::InvalidArgument` if the name of the checksum generator used for verification does not match the name of the checksum generator used for protecting the file when it was created.
+* Since RocksDB does not continue write the same file if a file write fails for any reason, the file scope write IO error is treated the same as retryable IO error. More information about error handling of file scope IO error is included in `ErrorHandler::SetBGError`.
+
+### Bug Fixes
+* Version older than 6.15 cannot decode VersionEdits `WalAddition` and `WalDeletion`, fixed this by changing the encoded format of them to be ignorable by older versions.
+* Fix a race condition between DB startups and shutdowns in managing the periodic background worker threads. One effect of this race condition could be the process being terminated.
+
+### Public API Change
+* Add a public API WriteBufferManager::dummy_entries_in_cache_usage() which reports the size of dummy entries stored in cache (passed to WriteBufferManager). Dummy entries are used to account for DataBlocks.
+* Add a SystemClock class that contains the time-related methods from Env.  The original methods in Env may be deprecated in a future release.  This class will allow easier testing, development, and expansion of time-related features.
+* Add a public API GetRocksBuildProperties and GetRocksBuildInfoAsString to get properties about the current build.  These properties may include settings related to the GIT settings (branch, timestamp).  This change also sets the "build date" based on the GIT properties, rather than the actual build time, thereby enabling more reproducible builds.
+
+## 6.16.0 (2020-12-18)
+### Behavior Changes
+* Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation.
+
+### Bug Fixes
+* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`.
+* Fix a bug where compressed blocks read by MultiGet are not inserted into the compressed block cache when use_direct_reads = true.
+* Fixed the issue of full scanning on obsolete files when there are too many outstanding compactions with ConcurrentTaskLimiter enabled.
+* Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. Without this fix, original code introduced in PR7659, when running on big-endian machine, can mistakenly store read_amp_bytes_per_bit (an uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected.
+* Fixed prefix extractor with timestamp issues.
+* Fixed a bug in atomic flush: in two-phase commit mode, the minimum WAL log number to keep is incorrect.
+* Fixed a bug related to checkpoint in PR7789: if there are multiple column families, and the checkpoint is not opened as read only, then in rare cases, data loss may happen in the checkpoint. Since backup engine relies on checkpoint, it may also be affected.
+* When ldb --try_load_options is used with the --column_family option, the ColumnFamilyOptions for the specified column family was not loaded from the OPTIONS file. Fix it so its loaded from OPTIONS and then overridden with command line overrides.
+
+### New Features
+* User defined timestamp feature supports `CompactRange` and `GetApproximateSizes`.
+* Support getting aggregated table properties (kAggregatedTableProperties and kAggregatedTablePropertiesAtLevel) with DB::GetMapProperty, for easier access to the data in a structured format.
+* Experimental option BlockBasedTableOptions::optimize_filters_for_memory now works with experimental Ribbon filter (as well as Bloom filter).
+
+### Public API Change
+* Deprecated public but rarely-used FilterBitsBuilder::CalculateNumEntry, which is replaced with ApproximateNumEntries taking a size_t parameter and returning size_t.
+* To improve portability the functions `Env::GetChildren` and `Env::GetChildrenFileAttributes` will no longer return entries for the special directories `.` or `..`.
+* Added a new option `track_and_verify_wals_in_manifest`. If `true`, the log numbers and sizes of the synced WALs are tracked in MANIFEST, then during DB recovery, if a synced WAL is missing from disk, or the WAL's size does not match the recorded size in MANIFEST, an error will be reported and the recovery will be aborted. Note that this option does not work with secondary instance.
+* `rocksdb_approximate_sizes` and `rocksdb_approximate_sizes_cf` in the C API now requires an error pointer (`char** errptr`) for receiving any error.
+* All overloads of DB::GetApproximateSizes now return Status, so that any failure to obtain the sizes is indicated to the caller.
+
+## 6.15.0 (2020-11-13)
+### Bug Fixes
+* Fixed a bug in the following combination of features: indexes with user keys (`format_version >= 3`), indexes are partitioned (`index_type == kTwoLevelIndexSearch`), and some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`). The bug could cause keys to be truncated when read from the index leading to wrong read results or other unexpected behavior.
+* Fixed a bug when indexes are partitioned (`index_type == kTwoLevelIndexSearch`), some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`), and partitions reads could be mixed between block cache and directly from the file (e.g., with `enable_index_compression == 1` and `mmap_read == 1`, partitions that were stored uncompressed due to poor compression ratio would be read directly from the file via mmap, while partitions that were stored compressed would be read from block cache). The bug could cause index partitions to be mistakenly considered empty during reads leading to wrong read results.
+* Since 6.12, memtable lookup should report unrecognized value_type as corruption (#7121).
+* Since 6.14, fix false positive flush/compaction `Status::Corruption` failure when `paranoid_file_checks == true` and range tombstones were written to the compaction output files.
+* Since 6.14, fix a bug that could cause a stalled write to crash with mixed of slowdown and no_slowdown writes (`WriteOptions.no_slowdown=true`).
+* Fixed a bug which causes hang in closing DB when refit level is set in opt build. It was because ContinueBackgroundWork() was called in assert statement which is a no op. It was introduced in 6.14.
+* Fixed a bug which causes Get() to return incorrect result when a key's merge operand is applied twice. This can occur if the thread performing Get() runs concurrently with a background flush thread and another thread writing to the MANIFEST file (PR6069).
+* Reverted a behavior change silently introduced in 6.14.2, in which the effects of the `ignore_unknown_options` flag (used in option parsing/loading functions) changed.
+* Reverted a behavior change silently introduced in 6.14, in which options parsing/loading functions began returning `NotFound` instead of `InvalidArgument` for option names not available in the present version.
+* Fixed MultiGet bugs it doesn't return valid data with user defined timestamp.
+* Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before `TableBuilder::Finish()` in compaction job. For example, the `NeedCompact()` method of `CompactOnDeletionCollector` returned by built-in `CompactOnDeletionCollectorFactory` requires `BlockBasedTable::Finish()` to return the correct result. The bug can cause a compaction-generated file not to be marked for future compaction based on deletion ratio.
+* Fixed a seek issue with prefix extractor and timestamp.
+* Fixed a bug of encoding and parsing BlockBasedTableOptions::read_amp_bytes_per_bit as a 64-bit integer.
+* Fixed a bug of a recovery corner case, details in PR7621.
+
+### Public API Change
+* Deprecate `BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache` and `BlockBasedTableOptions::pin_top_level_index_and_filter`. These options still take effect until users migrate to the replacement APIs in `BlockBasedTableOptions::metadata_cache_options`. Migration guidance can be found in the API comments on the deprecated options.
+* Add new API `DB::VerifyFileChecksums` to verify SST file checksum with corresponding entries in the MANIFEST if present. Current implementation requires scanning and recomputing file checksums.
+
+### Behavior Changes
+* The dictionary compression settings specified in `ColumnFamilyOptions::compression_opts` now additionally affect files generated by flush and compaction to non-bottommost level. Previously those settings at most affected files generated by compaction to bottommost level, depending on whether `ColumnFamilyOptions::bottommost_compression_opts` overrode them. Users who relied on dictionary compression settings in `ColumnFamilyOptions::compression_opts` affecting only the bottommost level can keep the behavior by moving their dictionary settings to `ColumnFamilyOptions::bottommost_compression_opts` and setting its `enabled` flag.
+* When the `enabled` flag is set in `ColumnFamilyOptions::bottommost_compression_opts`, those compression options now take effect regardless of the value in `ColumnFamilyOptions::bottommost_compression`. Previously, those compression options only took effect when `ColumnFamilyOptions::bottommost_compression != kDisableCompressionOption`. Now, they additionally take effect when `ColumnFamilyOptions::bottommost_compression == kDisableCompressionOption` (such a setting causes bottommost compression type to fall back to `ColumnFamilyOptions::compression_per_level` if configured, and otherwise fall back to `ColumnFamilyOptions::compression`).
+
+### New Features
+* An EXPERIMENTAL new Bloom alternative that saves about 30% space compared to Bloom filters, with about 3-4x construction time and similar query times is available using NewExperimentalRibbonFilterPolicy.
+
+## 6.14 (2020-10-09)
+### Bug fixes
+* Fixed a bug after a `CompactRange()` with `CompactRangeOptions::change_level` set fails due to a conflict in the level change step, which caused all subsequent calls to `CompactRange()` with `CompactRangeOptions::change_level` set to incorrectly fail with a `Status::NotSupported("another thread is refitting")` error.
+* Fixed a bug that the bottom most level compaction could still be a trivial move even if `BottommostLevelCompaction.kForce` or `kForceOptimized` is set.
+
+### Public API Change
+* The methods to create and manage EncrypedEnv have been changed.  The EncryptionProvider is now passed to NewEncryptedEnv as a shared pointer, rather than a raw pointer.  Comparably, the CTREncryptedProvider now takes a shared pointer, rather than a reference, to a BlockCipher.  CreateFromString methods have been added to BlockCipher and EncryptionProvider to provide a single API by which different ciphers and providers can be created, respectively.
+* The internal classes (CTREncryptionProvider, ROT13BlockCipher, CTRCipherStream) associated with the EncryptedEnv have been moved out of the public API.  To create a CTREncryptionProvider, one can either use EncryptionProvider::NewCTRProvider, or EncryptionProvider::CreateFromString("CTR").  To create a new ROT13BlockCipher, one can either use BlockCipher::NewROT13Cipher or BlockCipher::CreateFromString("ROT13").
+* The EncryptionProvider::AddCipher method has been added to allow keys to be added to an EncryptionProvider.  This API will allow future providers to support multiple cipher keys.
+* Add a new option "allow_data_in_errors". When this new option is set by users, it allows users to opt-in to get error messages containing corrupted keys/values. Corrupt keys, values will be logged in the messages, logs, status etc. that will help users with the useful information regarding affected data. By default value of this option is set false to prevent users data to be exposed in the messages so currently, data will be redacted from logs, messages, status by default.
+* AdvancedColumnFamilyOptions::force_consistency_checks is now true by default, for more proactive DB corruption detection at virtually no cost (estimated two extra CPU cycles per million on a major production workload). Corruptions reported by these checks now mention "force_consistency_checks" in case a false positive corruption report is suspected and the option needs to be disabled (unlikely). Since existing column families have a saved setting for force_consistency_checks, only new column families will pick up the new default.
+
+### General Improvements
+* The settings of the DBOptions and ColumnFamilyOptions are now managed by Configurable objects (see New Features).  The same convenience methods to configure these options still exist but the backend implementation has been unified under a common implementation.
+
+### New Features
+
+* Methods to configure serialize, and compare -- such as TableFactory -- are exposed directly through the Configurable base class (from which these objects inherit).  This change will allow for better and more thorough configuration management and retrieval in the future.  The options for a Configurable object can be set via the ConfigureFromMap, ConfigureFromString, or ConfigureOption method.  The serialized version of the options of an object can be retrieved via the GetOptionString, ToString, or GetOption methods.  The list of options supported by an object can be obtained via the GetOptionNames method.  The "raw" object (such as the BlockBasedTableOption) for an option may be retrieved via the GetOptions method.  Configurable options can be compared via the AreEquivalent method.  The settings within a Configurable object may be validated via the ValidateOptions method.  The object may be intialized (at which point only mutable options may be updated) via the PrepareOptions method.
+* Introduce options.check_flush_compaction_key_order with default value to be true. With this option, during flush and compaction, key order will be checked when writing to each SST file. If the order is violated, the flush or compaction will fail.
+* Added is_full_compaction to CompactionJobStats, so that the information is available through the EventListener interface.
+* Add more stats for MultiGet in Histogram to get number of data blocks, index blocks, filter blocks and sst files read from file system per level.
+* SST files have a new table property called db_host_id, which is set to the hostname by default. A new option in DBOptions, db_host_id, allows the property value to be overridden with a user specified string, or disable it completely by making the option string empty.
+* Methods to create customizable extensions -- such as TableFactory -- are exposed directly through the Customizable base class (from which these objects inherit).  This change will allow these Customizable classes to be loaded and configured in a standard way (via CreateFromString).  More information on how to write and use Customizable classes is in the customizable.h header file.
+
+## 6.13 (2020-09-12)
+### Bug fixes
+* Fix a performance regression introduced in 6.4 that makes a upper bound check for every Next() even if keys are within a data block that is within the upper bound.
+* Fix a possible corruption to the LSM state (overlapping files within a level) when a `CompactRange()` for refitting levels (`CompactRangeOptions::change_level == true`) and another manual compaction are executed in parallel.
+* Sanitize `recycle_log_file_num` to zero when the user attempts to enable it in combination with `WALRecoveryMode::kTolerateCorruptedTailRecords`. Previously the two features were allowed together, which compromised the user's configured crash-recovery guarantees.
+* Fix a bug where a level refitting in CompactRange() might race with an automatic compaction that puts the data to the target level of the refitting. The bug has been there for years.
+* Fixed a bug in version 6.12 in which BackupEngine::CreateNewBackup could fail intermittently with non-OK status when backing up a read-write DB configured with a DBOptions::file_checksum_gen_factory.
+* Fix useless no-op compactions scheduled upon snapshot release when options.disable-auto-compactions = true.
+* Fix a bug when max_write_buffer_size_to_maintain is set, immutable flushed memtable destruction is delayed until the next super version is installed. A memtable is not added to delete list because of its reference hold by super version and super version doesn't switch because of empt delete list. So memory usage keeps on increasing beyond write_buffer_size + max_write_buffer_size_to_maintain.
+* Avoid converting MERGES to PUTS when allow_ingest_behind is true.
+* Fix compression dictionary sampling together with `SstFileWriter`. Previously, the dictionary would be trained/finalized immediately with zero samples. Now, the whole `SstFileWriter` file is buffered in memory and then sampled.
+* Fix a bug with `avoid_unnecessary_blocking_io=1` and creating backups (BackupEngine::CreateNewBackup) or checkpoints (Checkpoint::Create). With this setting and WAL enabled, these operations could randomly fail with non-OK status.
+* Fix a bug in which bottommost compaction continues to advance the underlying InternalIterator to skip tombstones even after shutdown.
+
+### New Features
+* A new field `std::string requested_checksum_func_name` is added to `FileChecksumGenContext`, which enables the checksum factory to create generators for a suite of different functions.
+* Added a new subcommand, `ldb unsafe_remove_sst_file`, which removes a lost or corrupt SST file from a DB's metadata. This command involves data loss and must not be used on a live DB.
+
+### Performance Improvements
+* Reduce thread number for multiple DB instances by re-using one global thread for statistics dumping and persisting.
+* Reduce write-amp in heavy write bursts in `kCompactionStyleLevel` compaction style with `level_compaction_dynamic_level_bytes` set.
+* BackupEngine incremental backups no longer read DB table files that are already saved to a shared part of the backup directory, unless `share_files_with_checksum` is used with `kLegacyCrc32cAndFileSize` naming (discouraged).
+  * For `share_files_with_checksum`, we are confident there is no regression (vs. pre-6.12) in detecting DB or backup corruption at backup creation time, mostly because the old design did not leverage this extra checksum computation for detecting inconsistencies at backup creation time.
+  * For `share_table_files` without "checksum" (not recommended), there is a regression in detecting fundamentally unsafe use of the option, greatly mitigated by file size checking (under "Behavior Changes"). Almost no reason to use `share_files_with_checksum=false` should remain.
+  * `DB::VerifyChecksum` and `BackupEngine::VerifyBackup` with checksum checking are still able to catch corruptions that `CreateNewBackup` does not.
+
+### Public API Change
+* Expose kTypeDeleteWithTimestamp in EntryType and update GetEntryType() accordingly.
+* Added file_checksum and file_checksum_func_name to TableFileCreationInfo, which can pass the table file checksum information through the OnTableFileCreated callback during flush and compaction.
+* A warning is added to `DB::DeleteFile()` API describing its known problems and deprecation plan.
+* Add a new stats level, i.e. StatsLevel::kExceptTickers (PR7329) to exclude tickers even if application passes a non-null Statistics object.
+* Added a new status code IOStatus::IOFenced() for the Env/FileSystem to indicate that writes from this instance are fenced off. Like any other background error, this error is returned to the user in Put/Merge/Delete/Flush calls and can be checked using Status::IsIOFenced().
+
+### Behavior Changes
+* File abstraction `FSRandomAccessFile.Prefetch()` default return status is changed from `OK` to `NotSupported`. If the user inherited file doesn't implement prefetch, RocksDB will create internal prefetch buffer to improve read performance.
+* When retryabel IO error happens during Flush (manifest write error is excluded) and WAL is disabled, originally it is mapped to kHardError. Now,it is mapped to soft error. So DB will not stall the writes unless the memtable is full. At the same time, when auto resume is triggered to recover the retryable IO error during Flush, SwitchMemtable is not called to avoid generating to many small immutable memtables. If WAL is enabled, no behavior changes.
+* When considering whether a table file is already backed up in a shared part of backup directory, BackupEngine would already query the sizes of source (DB) and pre-existing destination (backup) files. BackupEngine now uses these file sizes to detect corruption, as at least one of (a) old backup, (b) backup in progress, or (c) current DB is corrupt if there's a size mismatch.
+
+### Others
+* Error in prefetching partitioned index blocks will not be swallowed. It will fail the query and return the IOError users.
+
+## 6.12 (2020-07-28)
+### Public API Change
+* Encryption file classes now exposed for inheritance in env_encryption.h
+* File I/O listener is extended to cover more I/O operations. Now class `EventListener` in listener.h contains new callback functions: `OnFileFlushFinish()`, `OnFileSyncFinish()`, `OnFileRangeSyncFinish()`, `OnFileTruncateFinish()`, and ``OnFileCloseFinish()``.
+* `FileOperationInfo` now reports `duration` measured by `std::chrono::steady_clock` and `start_ts` measured by `std::chrono::system_clock` instead of start and finish timestamps measured by `system_clock`. Note that `system_clock` is called before `steady_clock` in program order at operation starts.
+* `DB::GetDbSessionId(std::string& session_id)` is added. `session_id` stores a unique identifier that gets reset every time the DB is opened. This DB session ID should be unique among all open DB instances on all hosts, and should be unique among re-openings of the same or other DBs. This identifier is recorded in the LOG file on the line starting with "DB Session ID:".
+* `DB::OpenForReadOnly()` now returns `Status::NotFound` when the specified DB directory does not exist. Previously the error returned depended on the underlying `Env`. This change is available in all 6.11 releases as well.
+* A parameter `verify_with_checksum` is added to `BackupEngine::VerifyBackup`, which is false by default. If it is ture, `BackupEngine::VerifyBackup` verifies checksums and file sizes of backup files. Pass `false` for `verify_with_checksum` to maintain the previous behavior and performance of `BackupEngine::VerifyBackup`, by only verifying sizes of backup files.
+
+### Behavior Changes
+* Best-efforts recovery ignores CURRENT file completely. If CURRENT file is missing during recovery, best-efforts recovery still proceeds with MANIFEST file(s).
+* In best-efforts recovery, an error that is not Corruption or IOError::kNotFound or IOError::kPathNotFound will be overwritten silently. Fix this by checking all non-ok cases and return early.
+* When `file_checksum_gen_factory` is set to `GetFileChecksumGenCrc32cFactory()`, BackupEngine will compare the crc32c checksums of table files computed when creating a backup to the expected checksums stored in the DB manifest, and will fail `CreateNewBackup()` on mismatch (corruption). If the `file_checksum_gen_factory` is not set or set to any other customized factory, there is no checksum verification to detect if SST files in a DB are corrupt when read, copied, and independently checksummed by BackupEngine.
+* When a DB sets `stats_dump_period_sec > 0`, either as the initial value for DB open or as a dynamic option change, the first stats dump is staggered in the following X seconds, where X is an integer in `[0, stats_dump_period_sec)`. Subsequent stats dumps are still spaced `stats_dump_period_sec` seconds apart.
+* When the paranoid_file_checks option is true, a hash is generated of all keys and values are generated when the SST file is written, and then the values are read back in to validate the file.  A corruption is signaled if the two hashes do not match.
+
+### Bug fixes
+* Compressed block cache was automatically disabled with read-only DBs by mistake. Now it is fixed: compressed block cache will be in effective with read-only DB too.
+* Fix a bug of wrong iterator result if another thread finishes an update and a DB flush between two statement.
+* Disable file deletion after MANIFEST write/sync failure until db re-open or Resume() so that subsequent re-open will not see MANIFEST referencing deleted SSTs.
+* Fix a bug when index_type == kTwoLevelIndexSearch in PartitionedIndexBuilder to update FlushPolicy to point to internal key partitioner when it changes from user-key mode to internal-key mode in index partition.
+* Make compaction report InternalKey corruption while iterating over the input.
+* Fix a bug which may cause MultiGet to be slow because it may read more data than requested, but this won't affect correctness. The bug was introduced in 6.10 release.
+* Fail recovery and report once hitting a physical log record checksum mismatch, while reading MANIFEST. RocksDB should not continue processing the MANIFEST any further.
+* Fixed a bug in size-amp-triggered and periodic-triggered universal compaction, where the compression settings for the first input level were used rather than the compression settings for the output (bottom) level.
+
+### New Features
+* DB identity (`db_id`) and DB session identity (`db_session_id`) are added to table properties and stored in SST files. SST files generated from SstFileWriter and Repairer have DB identity “SST Writer” and “DB Repairer”, respectively. Their DB session IDs are generated in the same way as `DB::GetDbSessionId`. The session ID for SstFileWriter (resp., Repairer) resets every time `SstFileWriter::Open` (resp., `Repairer::Run`) is called.
+* Added experimental option BlockBasedTableOptions::optimize_filters_for_memory for reducing allocated memory size of Bloom filters (~10% savings with Jemalloc) while preserving the same general accuracy. To have an effect, the option requires format_version=5 and malloc_usable_size. Enabling this option is forward and backward compatible with existing format_version=5.
+* `BackupableDBOptions::share_files_with_checksum_naming` is added with new default behavior for naming backup files with `share_files_with_checksum`, to address performance and backup integrity issues. See API comments for details.
+* Added auto resume function to automatically recover the DB from background Retryable IO Error. When retryable IOError happens during flush and WAL write, the error is mapped to Hard Error and DB will be in read mode. When retryable IO Error happens during compaction, the error will be mapped to Soft Error. DB is still in write/read mode. Autoresume function will create a thread for a DB to call DB->ResumeImpl() to try the recover for Retryable IO Error during flush and WAL write. Compaction will be rescheduled by itself if retryable IO Error happens. Auto resume may also cause other Retryable IO Error during the recovery, so the recovery will fail. Retry the auto resume may solve the issue, so we use max_bgerror_resume_count to decide how many resume cycles will be tried in total. If it is <=0, auto resume retryable IO Error is disabled. Default is INT_MAX, which will lead to a infinit auto resume. bgerror_resume_retry_interval decides the time interval between two auto resumes.
+* Option `max_subcompactions` can be set dynamically using DB::SetDBOptions().
+* Added experimental ColumnFamilyOptions::sst_partitioner_factory to define determine the partitioning of sst files. This helps compaction to split the files on interesting boundaries (key prefixes) to make propagation of sst files less write amplifying (covering the whole key space).
+
+### Performance Improvements
+* Eliminate key copies for internal comparisons while accessing ingested block-based tables.
+* Reduce key comparisons during random access in all block-based tables.
+* BackupEngine avoids unnecessary repeated checksum computation for backing up a table file to the `shared_checksum` directory when using `share_files_with_checksum_naming = kUseDbSessionId` (new default), except on SST files generated before this version of RocksDB, which fall back on using `kLegacyCrc32cAndFileSize`.
+
+## 6.11 (2020-06-12)
+### Bug Fixes
+* Fix consistency checking error swallowing in some cases when options.force_consistency_checks = true.
+* Fix possible false NotFound status from batched MultiGet using index type kHashSearch.
+* Fix corruption caused by enabling delete triggered compaction (NewCompactOnDeletionCollectorFactory) in universal compaction mode, along with parallel compactions. The bug can result in two parallel compactions picking the same input files, resulting in the DB resurrecting older and deleted versions of some keys.
+* Fix a use-after-free bug in best-efforts recovery. column_family_memtables_ needs to point to valid ColumnFamilySet.
+* Let best-efforts recovery ignore corrupted files during table loading.
+* Fix corrupt key read from ingested file when iterator direction switches from reverse to forward at a key that is a prefix of another key in the same file. It is only possible in files with a non-zero global seqno.
+* Fix abnormally large estimate from GetApproximateSizes when a range starts near the end of one SST file and near the beginning of another. Now GetApproximateSizes consistently and fairly includes the size of SST metadata in addition to data blocks, attributing metadata proportionally among the data blocks based on their size.
+* Fix potential file descriptor leakage in PosixEnv's IsDirectory() and NewRandomAccessFile().
+* Fix false negative from the VerifyChecksum() API when there is a checksum mismatch in an index partition block in a BlockBasedTable format table file (index_type is kTwoLevelIndexSearch).
+* Fix sst_dump to return non-zero exit code if the specified file is not a recognized SST file or fails requested checks.
+* Fix incorrect results from batched MultiGet for duplicate keys, when the duplicate key matches the largest key of an SST file and the value type for the key in the file is a merge value.
+
+### Public API Change
+* Flush(..., column_family) may return Status::ColumnFamilyDropped() instead of Status::InvalidArgument() if column_family is dropped while processing the flush request.
+* BlobDB now explicitly disallows using the default column family's storage directories as blob directory.
+* DeleteRange now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined.
+* ldb now uses options.force_consistency_checks = true by default and "--disable_consistency_checks" is added to disable it.
+* DB::OpenForReadOnly no longer creates files or directories if the named DB does not exist, unless create_if_missing is set to true.
+* The consistency checks that validate LSM state changes (table file additions/deletions during flushes and compactions) are now stricter, more efficient, and no longer optional, i.e. they are performed even if `force_consistency_checks` is `false`.
+* Disable delete triggered compaction (NewCompactOnDeletionCollectorFactory) in universal compaction mode and num_levels = 1 in order to avoid a corruption bug.
+* `pin_l0_filter_and_index_blocks_in_cache` no longer applies to L0 files larger than `1.5 * write_buffer_size` to give more predictable memory usage. Such L0 files may exist due to intra-L0 compaction, external file ingestion, or user dynamically changing `write_buffer_size` (note, however, that files that are already pinned will continue being pinned, even after such a dynamic change).
+* In point-in-time wal recovery mode, fail database recovery in case of IOError while reading the WAL to avoid data loss.
+* A new method `Env::LowerThreadPoolCPUPriority(Priority, CpuPriority)` is added to `Env` to be able to lower to a specific priority such as `CpuPriority::kIdle`.
+
+### New Features
+* sst_dump to add a new --readahead_size argument. Users can specify read size when scanning the data. Sst_dump also tries to prefetch tail part of the SST files so usually some number of I/Os are saved there too.
+* Generate file checksum in SstFileWriter if Options.file_checksum_gen_factory is set. The checksum and checksum function name are stored in ExternalSstFileInfo after the sst file write is finished.
+* Add a value_size_soft_limit in read options which limits the cumulative value size of keys read in batches in MultiGet. Once the cumulative value size of found keys exceeds read_options.value_size_soft_limit, all the remaining keys are returned with status Abort without further finding their values. By default the value_size_soft_limit is std::numeric_limits<uint64_t>::max().
+* Enable SST file ingestion with file checksum information when calling IngestExternalFiles(const std::vector<IngestExternalFileArg>& args). Added files_checksums and files_checksum_func_names to IngestExternalFileArg such that user can ingest the sst files with their file checksum information. Added verify_file_checksum to IngestExternalFileOptions (default is True). To be backward compatible, if DB does not enable file checksum or user does not provide checksum information (vectors of files_checksums and files_checksum_func_names are both empty), verification of file checksum is always sucessful. If DB enables file checksum, DB will always generate the checksum for each ingested SST file during Prepare stage of ingestion and store the checksum in Manifest, unless verify_file_checksum is False and checksum information is provided by the application. In this case, we only verify the checksum function name and directly store the ingested checksum in Manifest. If verify_file_checksum is set to True, DB will verify the ingested checksum and function name with the genrated ones. Any mismatch will fail the ingestion. Note that, if IngestExternalFileOptions::write_global_seqno is True, the seqno will be changed in the ingested file. Therefore, the checksum of the file will be changed. In this case, a new checksum will be generated after the seqno is updated and be stored in the Manifest.
+
+### Performance Improvements
+* Eliminate redundant key comparisons during random access in block-based tables.
+
+## 6.10 (2020-05-02)
+### Bug Fixes
+* Fix wrong result being read from ingested file. May happen when a key in the file happen to be prefix of another key also in the file. The issue can further cause more data corruption. The issue exists with rocksdb >= 5.0.0 since DB::IngestExternalFile() was introduced.
+* Finish implementation of BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It's now ready for use. Significantly reduces read amplification in some setups, especially for iterator seeks.
+* Fix a bug by updating CURRENT file so that it points to the correct MANIFEST file after best-efforts recovery.
+* Fixed a bug where ColumnFamilyHandle objects were not cleaned up in case an error happened during BlobDB's open after the base DB had been opened.
+* Fix a potential undefined behavior caused by trying to dereference nullable pointer (timestamp argument) in DB::MultiGet.
+* Fix a bug caused by not including user timestamp in MultiGet LookupKey construction. This can lead to wrong query result since the trailing bytes of a user key, if not shorter than timestamp, will be mistaken for user timestamp.
+* Fix a bug caused by using wrong compare function when sorting the input keys of MultiGet with timestamps.
+* Upgraded version of bzip library (1.0.6 -> 1.0.8) used with RocksJava to address potential vulnerabilities if an attacker can manipulate compressed data saved and loaded by RocksDB (not normal). See issue #6703.
+
+### Public API Change
+* Add a ConfigOptions argument to the APIs dealing with converting options to and from strings and files.  The ConfigOptions is meant to replace some of the options (such as input_strings_escaped and ignore_unknown_options) and allow for more parameters to be passed in the future without changing the function signature.
+* Add NewFileChecksumGenCrc32cFactory to the file checksum public API, such that the builtin Crc32c based file checksum generator factory can be used by applications.
+* Add IsDirectory to Env and FS to indicate if a path is a directory.
+
+### New Features
+* Added support for pipelined & parallel compression optimization for `BlockBasedTableBuilder`. This optimization makes block building, block compression and block appending a pipeline, and uses multiple threads to accelerate block compression. Users can set `CompressionOptions::parallel_threads` greater than 1 to enable compression parallelism. This feature is experimental for now.
+* Provide an allocator for memkind to be used with block cache. This is to work with memory technologies (Intel DCPMM is one such technology currently available) that require different libraries for allocation and management (such as PMDK and memkind). The high capacities available make it possible to provision large caches (up to several TBs in size) beyond what is achievable with DRAM.
+* Option `max_background_flushes` can be set dynamically using DB::SetDBOptions().
+* Added functionality in sst_dump tool to check the compressed file size for different compression levels and print the time spent on compressing files with each compression type. Added arguments `--compression_level_from` and `--compression_level_to` to report size of all compression levels and one compression_type must be specified with it so that it will report compressed sizes of one compression type with different levels.
+* Added statistics for redundant insertions into block cache: rocksdb.block.cache.*add.redundant. (There is currently no coordination to ensure that only one thread loads a table block when many threads are trying to access that same table block.)
+
+### Bug Fixes
+* Fix a bug when making options.bottommost_compression, options.compression_opts and options.bottommost_compression_opts dynamically changeable: the modified values are not written to option files or returned back to users when being queried.
+* Fix a bug where index key comparisons were unaccounted in `PerfContext::user_key_comparison_count` for lookups in files written with `format_version >= 3`.
+* Fix many bloom.filter statistics not being updated in batch MultiGet.
+
+### Performance Improvements
+* Improve performance of batch MultiGet with partitioned filters, by sharing block cache lookups to applicable filter blocks.
+* Reduced memory copies when fetching and uncompressing compressed blocks from sst files.
+
+## 6.9.0 (2020-03-29)
+### Behavior changes
+* Since RocksDB 6.8, ttl-based FIFO compaction can drop a file whose oldest key becomes older than options.ttl while others have not. This fix reverts this and makes ttl-based FIFO compaction use the file's flush time as the criterion. This fix also requires that max_open_files = -1 and compaction_options_fifo.allow_compaction = false to function properly.
+
+### Public API Change
+* Fix spelling so that API now has correctly spelled transaction state name `COMMITTED`, while the old misspelled `COMMITED` is still available as an alias.
+* Updated default format_version in BlockBasedTableOptions from 2 to 4. SST files generated with the new default can be read by RocksDB versions 5.16 and newer, and use more efficient encoding of keys in index blocks.
+* A new parameter `CreateBackupOptions` is added to both `BackupEngine::CreateNewBackup` and `BackupEngine::CreateNewBackupWithMetadata`, you can decrease CPU priority of `BackupEngine`'s background threads by setting `decrease_background_thread_cpu_priority` and `background_thread_cpu_priority` in `CreateBackupOptions`.
+* Updated the public API of SST file checksum. Introduce the FileChecksumGenFactory to create the FileChecksumGenerator for each SST file, such that the FileChecksumGenerator is not shared and it can be more general for checksum implementations. Changed the FileChecksumGenerator interface from Value, Extend, and GetChecksum to Update, Finalize, and GetChecksum. Finalize should be only called once after all data is processed to generate the final checksum. Temproal data should be maintained by the FileChecksumGenerator object itself and finally it can return the checksum string.
+
+### Bug Fixes
+* Fix a bug where range tombstone blocks in ingested files were cached incorrectly during ingestion. If range tombstones were read from those incorrectly cached blocks, the keys they covered would be exposed.
+* Fix a data race that might cause crash when calling DB::GetCreationTimeOfOldestFile() by a small chance. The bug was introduced in 6.6 Release.
+* Fix a bug where a boolean value optimize_filters_for_hits was for max threads when calling load table handles after a flush or compaction. The value is correct to 1. The bug should not cause user visible problems.
+* Fix a bug which might crash the service when write buffer manager fails to insert the dummy handle to the block cache.
+
+### Performance Improvements
+* In CompactRange, for levels starting from 0, if the level does not have any file with any key falling in the specified range, the level is skipped. So instead of always compacting from level 0, the compaction starts from the first level with keys in the specified range until the last such level.
+* Reduced memory copy when reading sst footer and blobdb in direct IO mode.
+* When restarting a database with large numbers of sst files, large amount of CPU time is spent on getting logical block size of the sst files, which slows down the starting progress, this inefficiency is optimized away with an internal cache for the logical block sizes.
+
+### New Features
+* Basic support for user timestamp in iterator. Seek/SeekToFirst/Next and lower/upper bounds are supported. Reverse iteration is not supported. Merge is not considered.
+* When file lock failure when the lock is held by the current process, return acquiring time and thread ID in the error message.
+* Added a new option, best_efforts_recovery (default: false), to allow database to open in a db dir with missing table files. During best efforts recovery, missing table files are ignored, and database recovers to the most recent state without missing table file. Cross-column-family consistency is not guaranteed even if WAL is enabled.
+* options.bottommost_compression, options.compression_opts and options.bottommost_compression_opts are now dynamically changeable.
+
+## 6.8.0 (2020-02-24)
+### Java API Changes
+* Major breaking changes to Java comparators, toward standardizing on ByteBuffer for performant, locale-neutral operations on keys (#6252).
+* Added overloads of common API methods using direct ByteBuffers for keys and values (#2283).
+
+### Bug Fixes
+* Fix incorrect results while block-based table uses kHashSearch, together with Prev()/SeekForPrev().
+* Fix a bug that prevents opening a DB after two consecutive crash with TransactionDB, where the first crash recovers from a corrupted WAL with kPointInTimeRecovery but the second cannot.
+* Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space.
+* Add DBOptions::skip_checking_sst_file_sizes_on_db_open. It disables potentially expensive checking of all sst file sizes in DB::Open().
+* BlobDB now ignores trivially moved files when updating the mapping between blob files and SSTs. This should mitigate issue #6338 where out of order flush/compaction notifications could trigger an assertion with the earlier code.
+* Batched MultiGet() ignores IO errors while reading data blocks, causing it to potentially continue looking for a key and returning stale results.
+* `WriteBatchWithIndex::DeleteRange` returns `Status::NotSupported`. Previously it returned success even though reads on the batch did not account for range tombstones. The corresponding language bindings now cannot be used. In C, that includes `rocksdb_writebatch_wi_delete_range`, `rocksdb_writebatch_wi_delete_range_cf`, `rocksdb_writebatch_wi_delete_rangev`, and `rocksdb_writebatch_wi_delete_rangev_cf`. In Java, that includes `WriteBatchWithIndex::deleteRange`.
+* Assign new MANIFEST file number when caller tries to create a new MANIFEST by calling LogAndApply(..., new_descriptor_log=true). This bug can cause MANIFEST being overwritten during recovery if options.write_dbid_to_manifest = true and there are WAL file(s).
+
+### Performance Improvements
+* Perfom readahead when reading from option files. Inside DB, options.log_readahead_size will be used as the readahead size. In other cases, a default 512KB is used.
+
+### Public API Change
+* The BlobDB garbage collector now emits the statistics `BLOB_DB_GC_NUM_FILES` (number of blob files obsoleted during GC), `BLOB_DB_GC_NUM_NEW_FILES` (number of new blob files generated during GC), `BLOB_DB_GC_FAILURES` (number of failed GC passes), `BLOB_DB_GC_NUM_KEYS_RELOCATED` (number of blobs relocated during GC), and `BLOB_DB_GC_BYTES_RELOCATED` (total size of blobs relocated during GC). On the other hand, the following statistics, which are not relevant for the new GC implementation, are now deprecated: `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, and `BLOB_DB_GC_MICROS`.
+* Disable recycle_log_file_num when an inconsistent recovery modes are requested: kPointInTimeRecovery and kAbsoluteConsistency
+
+### New Features
+* Added the checksum for each SST file generated by Flush or Compaction. Added sst_file_checksum_func to Options such that user can plugin their own SST file checksum function via override the FileChecksumFunc class. If user does not set the sst_file_checksum_func, SST file checksum calculation will not be enabled. The checksum information inlcuding uint32_t checksum value and a checksum function name (string). The checksum information is stored in FileMetadata in version store and also logged to MANIFEST. A new tool is added to LDB such that user can dump out a list of file checksum information from MANIFEST (stored in an unordered_map).
+* `db_bench` now supports `value_size_distribution_type`, `value_size_min`, `value_size_max` options for generating random variable sized value. Added `blob_db_compression_type` option for BlobDB to enable blob compression.
+* Replace RocksDB namespace "rocksdb" with flag "ROCKSDB_NAMESPACE" which if is not defined, defined as "rocksdb" in header file rocksdb_namespace.h.
+
+## 6.7.0 (2020-01-21)
+### Public API Change
+* Added a rocksdb::FileSystem class in include/rocksdb/file_system.h to encapsulate file creation/read/write operations, and an option DBOptions::file_system to allow a user to pass in an instance of rocksdb::FileSystem. If its a non-null value, this will take precendence over DBOptions::env for file operations. A new API rocksdb::FileSystem::Default() returns a platform default object. The DBOptions::env option and Env::Default() API will continue to be used for threading and other OS related functions, and where DBOptions::file_system is not specified, for file operations. For storage developers who are accustomed to rocksdb::Env, the interface in rocksdb::FileSystem is new and will probably undergo some changes as more storage systems are ported to it from rocksdb::Env. As of now, no env other than Posix has been ported to the new interface.
+* A new rocksdb::NewSstFileManager() API that allows the caller to pass in separate Env and FileSystem objects.
+* Changed Java API for RocksDB.keyMayExist functions to use Holder<byte[]> instead of StringBuilder, so that retrieved values need not decode to Strings.
+* A new `OptimisticTransactionDBOptions` Option that allows users to configure occ validation policy. The default policy changes from kValidateSerial to kValidateParallel to reduce mutex contention.
+
+### Bug Fixes
+* Fix a bug that can cause unnecessary bg thread to be scheduled(#6104).
+* Fix crash caused by concurrent CF iterations and drops(#6147).
+* Fix a race condition for cfd->log_number_ between manifest switch and memtable switch (PR 6249) when number of column families is greater than 1.
+* Fix a bug on fractional cascading index when multiple files at the same level contain the same smallest user key, and those user keys are for merge operands. In this case, Get() the exact key may miss some merge operands.
+* Delcare kHashSearch index type feature-incompatible with index_block_restart_interval larger than 1.
+* Fixed an issue where the thread pools were not resized upon setting `max_background_jobs` dynamically through the `SetDBOptions` interface.
+* Fix a bug that can cause write threads to hang when a slowdown/stall happens and there is a mix of writers with WriteOptions::no_slowdown set/unset.
+* Fixed an issue where an incorrect "number of input records" value was used to compute the "records dropped" statistics for compactions.
+* Fix a regression bug that causes segfault when hash is used, max_open_files != -1 and total order seek is used and switched back.
+
+### New Features
+* It is now possible to enable periodic compactions for the base DB when using BlobDB.
+* BlobDB now garbage collects non-TTL blobs when `enable_garbage_collection` is set to `true` in `BlobDBOptions`. Garbage collection is performed during compaction: any valid blobs located in the oldest N files (where N is the number of non-TTL blob files multiplied by the value of `BlobDBOptions::garbage_collection_cutoff`) encountered during compaction get relocated to new blob files, and old blob files are dropped once they are no longer needed. Note: we recommend enabling periodic compactions for the base DB when using this feature to deal with the case when some old blob files are kept alive by SSTs that otherwise do not get picked for compaction.
+* `db_bench` now supports the `garbage_collection_cutoff` option for BlobDB.
+* Introduce ReadOptions.auto_prefix_mode. When set to true, iterator will return the same result as total order seek, but may choose to use prefix seek internally based on seek key and iterator upper bound.
+* MultiGet() can use IO Uring to parallelize read from the same SST file. This featuer is by default disabled. It can be enabled with environment variable ROCKSDB_USE_IO_URING.
+
+## 6.6.2 (2020-01-13)
+### Bug Fixes
+* Fixed a bug where non-L0 compaction input files were not considered to compute the `creation_time` of new compaction outputs.
+
+## 6.6.1 (2020-01-02)
+### Bug Fixes
+* Fix a bug in WriteBatchWithIndex::MultiGetFromBatchAndDB, which is called by Transaction::MultiGet, that causes due to stale pointer access when the number of keys is > 32
+* Fixed two performance issues related to memtable history trimming. First, a new SuperVersion is now created only if some memtables were actually trimmed. Second, trimming is only scheduled if there is at least one flushed memtable that is kept in memory for the purposes of transaction conflict checking.
+* BlobDB no longer updates the SST to blob file mapping upon failed compactions.
+* Fix a bug in which a snapshot read through an iterator could be affected by a DeleteRange after the snapshot (#6062).
+* Fixed a bug where BlobDB was comparing the `ColumnFamilyHandle` pointers themselves instead of only the column family IDs when checking whether an API call uses the default column family or not.
+* Delete superversions in BackgroundCallPurge.
+* Fix use-after-free and double-deleting files in BackgroundCallPurge().
+
+## 6.6.0 (2019-11-25)
+### Bug Fixes
+* Fix data corruption caused by output of intra-L0 compaction on ingested file not being placed in correct order in L0.
+* Fix a data race between Version::GetColumnFamilyMetaData() and Compaction::MarkFilesBeingCompacted() for access to being_compacted (#6056). The current fix acquires the db mutex during Version::GetColumnFamilyMetaData(), which may cause regression.
+* Fix a bug in DBIter that is_blob_ state isn't updated when iterating backward using seek.
+* Fix a bug when format_version=3, partitioned filters, and prefix search are used in conjunction. The bug could result into Seek::(prefix) returning NotFound for an existing prefix.
+* Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strong results when reseek happens with a different iterator upper bound.
+* Fix a bug causing a crash during ingest external file when background compaction cause severe error (file not found).
+* Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. ::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand.
+* Fix OnFlushCompleted fired before flush result persisted in MANIFEST when there's concurrent flush job. The bug exists since OnFlushCompleted was introduced in rocksdb 3.8.
+* Fixed an sst_dump crash on some plain table SST files.
+* Fixed a memory leak in some error cases of opening plain table SST files.
+* Fix a bug when a crash happens while calling WriteLevel0TableForRecovery for multiple column families, leading to a column family's log number greater than the first corrutped log number when the DB is being opened in PointInTime recovery mode during next recovery attempt (#5856).
+
+### New Features
+* Universal compaction to support options.periodic_compaction_seconds. A full compaction will be triggered if any file is over the threshold.
+* `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` now expose the file number of SST files as well as the oldest blob file referenced by each SST.
+* A batched MultiGet API (DB::MultiGet()) that supports retrieving keys from multiple column families.
+* Full and partitioned filters in the block-based table use an improved Bloom filter implementation, enabled with format_version 5 (or above) because previous releases cannot read this filter. This replacement is faster and more accurate, especially for high bits per key or millions of keys in a single (full) filter. For example, the new Bloom filter has the same false positive rate at 9.55 bits per key as the old one at 10 bits per key, and a lower false positive rate at 16 bits per key than the old one at 100 bits per key.
+* Added AVX2 instructions to USE_SSE builds to accelerate the new Bloom filter and XXH3-based hash function on compatible x86_64 platforms (Haswell and later, ~2014).
+* Support options.ttl or options.periodic_compaction_seconds with options.max_open_files = -1. File's oldest ancester time and file creation time will be written to manifest. If it is availalbe, this information will be used instead of creation_time and file_creation_time in table properties.
+* Setting options.ttl for universal compaction now has the same meaning as setting periodic_compaction_seconds.
+* SstFileMetaData also returns file creation time and oldest ancester time.
+* The `sst_dump` command line tool `recompress` command now displays how many blocks were compressed and how many were not, in particular how many were not compressed because the compression ratio was not met (12.5% threshold for GoodCompressionRatio), as seen in the `number.block.not_compressed` counter stat since version 6.0.0.
+* The block cache usage is now takes into account the overhead of metadata per each entry. This results into more accurate management of memory. A side-effect of this feature is that less items are fit into the block cache of the same size, which would result to higher cache miss rates. This can be remedied by increasing the block cache size or passing kDontChargeCacheMetadata to its constuctor to restore the old behavior.
+* When using BlobDB, a mapping is maintained and persisted in the MANIFEST between each SST file and the oldest non-TTL blob file it references.
+* `db_bench` now supports and by default issues non-TTL Puts to BlobDB. TTL Puts can be enabled by specifying a non-zero value for the `blob_db_max_ttl_range` command line parameter explicitly.
+* `sst_dump` now supports printing BlobDB blob indexes in a human-readable format. This can be enabled by specifying the `decode_blob_index` flag on the command line.
+* A number of new information elements are now exposed through the EventListener interface. For flushes, the file numbers of the new SST file and the oldest blob file referenced by the SST are propagated. For compactions, the level, file number, and the oldest blob file referenced are passed to the client for each compaction input and output file.
+
+### Public API Change
+* RocksDB release 4.1 or older will not be able to open DB generated by the new release. 4.2 was released on Feb 23, 2016.
+* TTL Compactions in Level compaction style now initiate successive cascading compactions on a key range so that it reaches the bottom level quickly on TTL expiry. `creation_time` table property for compaction output files is now set to the minimum of the creation times of all compaction inputs.
+* With FIFO compaction style, options.periodic_compaction_seconds will have the same meaning as options.ttl. Whichever stricter will be used. With the default options.periodic_compaction_seconds value with options.ttl's default of 0, RocksDB will give a default of 30 days.
+* Added an API GetCreationTimeOfOldestFile(uint64_t* creation_time) to get the file_creation_time of the oldest SST file in the DB.
+* FilterPolicy now exposes additional API to make it possible to choose filter configurations based on context, such as table level and compaction style. See `LevelAndStyleCustomFilterPolicy` in db_bloom_filter_test.cc. While most existing custom implementations of FilterPolicy should continue to work as before, those wrapping the return of NewBloomFilterPolicy will require overriding new function `GetBuilderWithContext()`, because calling `GetFilterBitsBuilder()` on the FilterPolicy returned by NewBloomFilterPolicy is no longer supported.
+* An unlikely usage of FilterPolicy is no longer supported. Calling GetFilterBitsBuilder() on the FilterPolicy returned by NewBloomFilterPolicy will now cause an assertion violation in debug builds, because RocksDB has internally migrated to a more elaborate interface that is expected to evolve further. Custom implementations of FilterPolicy should work as before, except those wrapping the return of NewBloomFilterPolicy, which will require a new override of a protected function in FilterPolicy.
+* NewBloomFilterPolicy now takes bits_per_key as a double instead of an int. This permits finer control over the memory vs. accuracy trade-off in the new Bloom filter implementation and should not change source code compatibility.
+* The option BackupableDBOptions::max_valid_backups_to_open is now only used when opening BackupEngineReadOnly. When opening a read/write BackupEngine, anything but the default value logs a warning and is treated as the default. This change ensures that backup deletion has proper accounting of shared files to ensure they are deleted when no longer referenced by a backup.
+* Deprecate `snap_refresh_nanos` option.
+* Added DisableManualCompaction/EnableManualCompaction to stop and resume manual compaction.
+* Add TryCatchUpWithPrimary() to StackableDB in non-LITE mode.
+* Add a new Env::LoadEnv() overloaded function to return a shared_ptr to Env.
+* Flush sets file name to "(nil)" for OnTableFileCreationCompleted() if the flush does not produce any L0. This can happen if the file is empty thus delete by RocksDB.
+
+### Default Option Changes
+* Changed the default value of periodic_compaction_seconds to `UINT64_MAX - 1` which allows RocksDB to auto-tune periodic compaction scheduling. When using the default value, periodic compactions are now auto-enabled if a compaction filter is used. A value of `0` will turn off the feature completely.
+* Changed the default value of ttl to `UINT64_MAX - 1` which allows RocksDB to auto-tune ttl value. When using the default value, TTL will be auto-enabled to 30 days, when the feature is supported. To revert the old behavior, you can explicitly set it to 0.
+
+### Performance Improvements
+* For 64-bit hashing, RocksDB is standardizing on a slightly modified preview version of XXH3. This function is now used for many non-persisted hashes, along with fastrange64() in place of the modulus operator, and some benchmarks show a slight improvement.
+* Level iterator to invlidate the iterator more often in prefix seek and the level is filtered out by prefix bloom.
+
+## 6.5.2 (2019-11-15)
+### Bug Fixes
+* Fix a assertion failure in MultiGet() when BlockBasedTableOptions::no_block_cache is true and there is no compressed block cache
+* Fix a buffer overrun problem in BlockBasedTable::MultiGet() when compression is enabled and no compressed block cache is configured.
+* If a call to BackupEngine::PurgeOldBackups or BackupEngine::DeleteBackup suffered a crash, power failure, or I/O error, files could be left over from old backups that could only be purged with a call to GarbageCollect. Any call to PurgeOldBackups, DeleteBackup, or GarbageCollect should now suffice to purge such files.
+
+## 6.5.1 (2019-10-16)
+### Bug Fixes
+* Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strange results when reseek happens with a different iterator upper bound.
+* Fix a bug in BlockBasedTableIterator that might return incorrect results when reseek happens with a different iterator upper bound.
+* Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. ::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand.
+
+## 6.5.0 (2019-09-13)
+### Bug Fixes
+* Fixed a number of data races in BlobDB.
+* Fix a bug where the compaction snapshot refresh feature is not disabled as advertised when `snap_refresh_nanos` is set to 0..
+* Fix bloom filter lookups by the MultiGet batching API when BlockBasedTableOptions::whole_key_filtering is false, by checking that a key is in the perfix_extractor domain and extracting the prefix before looking up.
+* Fix a bug in file ingestion caused by incorrect file number allocation when the number of column families involved in the ingestion exceeds 2.
+
+### New Features
+* Introduced DBOptions::max_write_batch_group_size_bytes to configure maximum limit on number of bytes that are written in a single batch of WAL or memtable write. It is followed when the leader write size is larger than 1/8 of this limit.
+* VerifyChecksum() by default will issue readahead. Allow ReadOptions to be passed in to those functions to override the readhead size. For checksum verifying before external SST file ingestion, a new option IngestExternalFileOptions.verify_checksums_readahead_size, is added for this readahead setting.
+* When user uses options.force_consistency_check in RocksDb, instead of crashing the process, we now pass the error back to the users without killing the process.
+* Add an option `memtable_insert_hint_per_batch` to WriteOptions. If it is true, each WriteBatch will maintain its own insert hints for each memtable in concurrent write. See include/rocksdb/options.h for more details.
+
+### Public API Change
+* Added max_write_buffer_size_to_maintain option to better control memory usage of immutable memtables.
+* Added a lightweight API GetCurrentWalFile() to get last live WAL filename and size. Meant to be used as a helper for backup/restore tooling in a larger ecosystem such as MySQL with a MyRocks storage engine.
+* The MemTable Bloom filter, when enabled, now always uses cache locality. Options::bloom_locality now only affects the PlainTable SST format.
+
+### Performance Improvements
+* Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance.
+
+## 6.4.0 (2019-07-30)
+### Default Option Change
+* LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. The same change is made for the default value of high_pri_pool_ratio argument in NewLRUCache(). When block cache is not explicitly created, the small block cache created by BlockBasedTable will still has this option to be 0.0.
+* Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true.
+
+### Public API Change
+* Filter and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, filter and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed.
+* Due to the above refactoring, block cache eviction statistics for filter and compression dictionary blocks are temporarily broken. We plan to reintroduce them in a later phase.
+* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count.
+* Errors related to the retrieval of the compression dictionary are now propagated to the user.
+* db_bench adds a "benchmark" stats_history, which prints out the whole stats history.
+* Overload GetAllKeyVersions() to support non-default column family.
+* Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469
+* ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator.
+* Replaces old Registra with ObjectRegistry to allow user to create custom object from string, also add LoadEnv() to Env.
+* Added new overload of GetApproximateSizes which gets SizeApproximationOptions object and returns a Status. The older overloads are redirecting their calls to this new method and no longer assert if the include_flags doesn't have either of INCLUDE_MEMTABLES or INCLUDE_FILES bits set. It's recommended to use the new method only, as it is more type safe and returns a meaningful status in case of errors.
+* LDBCommandRunner::RunCommand() to return the status code as an integer, rather than call exit() using the code.
+
+### New Features
+* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact.
+* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks.
+* Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled.
+* Added new option in SizeApproximationOptions used with DB::GetApproximateSizes. When approximating the files total size that is used to store a keys range, allow approximation with an error margin of up to total_files_size * files_size_error_margin. This allows to take some shortcuts in files size approximation, resulting in better performance, while guaranteeing the resulting error is within a reasonable margin.
+* Support loading custom objects in unit tests. In the affected unit tests, RocksDB will create custom Env objects based on environment variable TEST_ENV_URI. Users need to make sure custom object types are properly registered. For example, a static library should expose a `RegisterCustomObjects` function. By linking the unit test binary with the static library, the unit test can execute this function.
+
+### Performance Improvements
+* Reduce iterator key comparison for upper/lower bound check.
+* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases.
+* The compression dictionary is no longer copied to a new object upon retrieval.
+
+### Bug Fixes
+* Fix ingested file and directory not being fsync.
+* Return TryAgain status in place of Corruption when new tail is not visible to TransactionLogIterator.
+* Fixed a regression where the fill_cache read option also affected index blocks.
+* Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes/filters as well.
+
+## 6.3.2 (2019-08-15)
+### Public API Change
+* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count.
+
+### Bug Fixes
+* Fixed a regression where the fill_cache read option also affected index blocks.
+* Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes as well.
+
+## 6.3.1 (2019-07-24)
+### Bug Fixes
+* Fix auto rolling bug introduced in 6.3.0, which causes segfault if log file creation fails.
+
+## 6.3.0 (2019-06-18)
+### Public API Change
+* Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released.
+* Index blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index blocks no longer get evicted from the cache when a table is closed, can now use the compressed block cache (if any), and can be shared among multiple table readers.
+* Partitions of partitioned indexes no longer affect the read amplification statistics.
+* Due to the above refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase.
+* options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take significantly amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set.
+* Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put.
+* Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit.
+* Add C bindings for secondary instance, i.e. DBImplSecondary.
+* Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path
+
+### New Features
+* Add an option `snap_refresh_nanos` (default to 0) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
+* Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees.
+* Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL.
+* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error.
+* Add command `list_file_range_deletes` in ldb, which prints out tombstones in SST files.
+
+### Performance Improvements
+* Reduce binary search when iterator reseek into the same data block.
+* DBIter::Next() can skip user key checking if previous entry's seqnum is 0.
+* Merging iterator to avoid child iterator reseek for some cases
+* Log Writer will flush after finishing the whole record, rather than a fragment.
+* Lower MultiGet batching API latency by reading data blocks from disk in parallel
+
+### General Improvements
+* Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress.
+* Improve ColumnFamilyOptions validation when creating a new column family.
+
+### Bug Fixes
+* Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number.
+* Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level.
+* Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family.
+* Fix a bug caused by secondary not skipping the beginning of new MANIFEST.
+* On DB open, delete WAL trash files left behind in wal_dir
+
+## 6.2.0 (2019-04-30)
+### New Features
+* Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`.
+* Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator.
+* Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level.
+* Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior.
+* When reading from option file/string/map, customized envs can be filled according to object registry.
+* Improve range scan performance when using explicit user readahead by not creating new table readers for every iterator.
+* Add index type BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It significantly reduces read amplification in some setups, especially for iterator seeks. It's not fully implemented yet: IO errors are not handled right.
+
+### Public API Change
+* Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering.
+* Change the behavior of OptimizeForSmallDb(): use a 16MB block cache, put index and filter blocks into it, and cost the memtable size to it. DBOptions.OptimizeForSmallDb() and ColumnFamilyOptions.OptimizeForSmallDb() start to take an optional cache object.
+* Added BottommostLevelCompaction::kForceOptimized to avoid double compacting newly compacted files in the bottommost level compaction of manual compaction. Note this option may prohibit the manual compaction to produce a single file in the bottommost level.
+
+### Bug Fixes
+* Adjust WriteBufferManager's dummy entry size to block cache from 1MB to 256KB.
+* Fix a race condition between WritePrepared::Get and ::Put with duplicate keys.
+* Fix crash when memtable prefix bloom is enabled and read/write a key out of domain of prefix extractor.
+* Close a WAL file before another thread deletes it.
+* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag.
+
+## 6.1.1 (2019-04-09)
+### New Features
+* When reading from option file/string/map, customized comparators and/or merge operators can be filled according to object registry.
+
+### Public API Change
+
+### Bug Fixes
+* Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction.
+* Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries.
+
+## 6.1.0 (2019-03-27)
+### New Features
+* Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers.
+* Added a feature to perform data-block sampling for compressibility, and report stats to user.
+* Add support for trace filtering.
+* Add DBOptions.avoid_unnecessary_blocking_io. If true, we avoid file deletion when destroying ColumnFamilyHandle and Iterator. Instead, a job is scheduled to delete the files in background.
+
+### Public API Change
+* Remove bundled fbson library.
+* statistics.stats_level_ becomes atomic. It is preferred to use statistics.set_stats_level() and statistics.get_stats_level() to access it.
+* Introduce a new IOError subcode, PathNotFound, to indicate trying to open a nonexistent file or directory for read.
+* Add initial support for multiple db instances sharing the same data in single-writer, multi-reader mode.
+* Removed some "using std::xxx" from public headers.
+
+### Bug Fixes
+* Fix JEMALLOC_CXX_THROW macro missing from older Jemalloc versions, causing build failures on some platforms.
+* Fix SstFileReader not able to open file ingested with write_glbal_seqno=true.
+
+## 6.0.0 (2019-02-19)
+### New Features
+* Enabled checkpoint on readonly db (DBImplReadOnly).
+* Make DB ignore dropped column families while committing results of atomic flush.
+* RocksDB may choose to preopen some files even if options.max_open_files != -1. This may make DB open slightly longer.
+* For users of dictionary compression with ZSTD v0.7.0+, we now reuse the same digested dictionary when compressing each of an SST file's data blocks for faster compression speeds.
+* For all users of dictionary compression who set `cache_index_and_filter_blocks == true`, we now store dictionary data used for decompression in the block cache for better control over memory usage. For users of ZSTD v1.1.4+ who compile with -DZSTD_STATIC_LINKING_ONLY, this includes a digested dictionary, which is used to increase decompression speed.
+* Add support for block checksums verification for external SST files before ingestion.
+* Introduce stats history which periodically saves Statistics snapshots and added `GetStatsHistory` API to retrieve these snapshots.
+* Add a place holder in manifest which indicate a record from future that can be safely ignored.
+* Add support for trace sampling.
+* Enable properties block checksum verification for block-based tables.
+* For all users of dictionary compression, we now generate a separate dictionary for compressing each bottom-level SST file. Previously we reused a single dictionary for a whole compaction to bottom level. The new approach achieves better compression ratios; however, it uses more memory and CPU for buffering/sampling data blocks and training dictionaries.
+* Add whole key bloom filter support in memtable.
+* Files written by `SstFileWriter` will now use dictionary compression if it is configured in the file writer's `CompressionOptions`.
+
+### Public API Change
+* Disallow CompactionFilter::IgnoreSnapshots() = false, because it is not very useful and the behavior is confusing. The filter will filter everything if there is no snapshot declared by the time the compaction starts. However, users can define a snapshot after the compaction starts and before it finishes and this new snapshot won't be repeatable, because after the compaction finishes, some keys may be dropped.
+* CompactionPri = kMinOverlappingRatio also uses compensated file size, which boosts file with lots of tombstones to be compacted first.
+* Transaction::GetForUpdate is extended with a do_validate parameter with default value of true. If false it skips validating the snapshot before doing the read. Similarly ::Merge, ::Put, ::Delete, and ::SingleDelete are extended with assume_tracked with default value of false. If true it indicates that call is assumed to be after a ::GetForUpdate.
+* `TableProperties::num_entries` and `TableProperties::num_deletions` now also account for number of range tombstones.
+* Remove geodb, spatial_db, document_db, json_document, date_tiered_db, and redis_lists.
+* With "ldb ----try_load_options", when wal_dir specified by the option file doesn't exist, ignore it.
+* Change time resolution in FileOperationInfo.
+* Deleting Blob files also go through SStFileManager.
+* Remove CuckooHash memtable.
+* The counter stat `number.block.not_compressed` now also counts blocks not compressed due to poor compression ratio.
+* Remove ttl option from `CompactionOptionsFIFO`. The option has been deprecated and ttl in `ColumnFamilyOptions` is used instead.
+* Support SST file ingestion across multiple column families via DB::IngestExternalFiles. See the function's comment about atomicity.
+* Remove Lua compaction filter.
+
+### Bug Fixes
+* Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls.
+* Fix a memory leak when files with range tombstones are read in mmap mode and block cache is enabled
+* Fix handling of corrupt range tombstone blocks such that corruptions cannot cause deleted keys to reappear
+* Lock free MultiGet
+* Fix incorrect `NotFound` point lookup result when querying the endpoint of a file that has been extended by a range tombstone.
+* Fix with pipelined write, write leaders's callback failure lead to the whole write group fail.
+
+### Change Default Options
+* Change options.compaction_pri's default to kMinOverlappingRatio
+
+## 5.18.0 (2018-11-30)
+### New Features
+* Introduced `JemallocNodumpAllocator` memory allocator. When being use, block cache will be excluded from core dump.
+* Introduced `PerfContextByLevel` as part of `PerfContext` which allows storing perf context at each level. Also replaced `__thread` with `thread_local` keyword for perf_context. Added per-level perf context for bloom filter and `Get` query.
+* With level_compaction_dynamic_level_bytes = true, level multiplier may be adjusted automatically when Level 0 to 1 compaction is lagged behind.
+* Introduced DB option `atomic_flush`. If true, RocksDB supports flushing multiple column families and atomically committing the result to MANIFEST. Useful when WAL is disabled.
+* Added `num_deletions` and `num_merge_operands` members to `TableProperties`.
+* Added "rocksdb.min-obsolete-sst-number-to-keep" DB property that reports the lower bound on SST file numbers that are being kept from deletion, even if the SSTs are obsolete.
+* Add xxhash64 checksum support
+* Introduced `MemoryAllocator`, which lets the user specify custom memory allocator for block based table.
+* Improved `DeleteRange` to prevent read performance degradation. The feature is no longer marked as experimental.
+
+### Public API Change
+* `DBOptions::use_direct_reads` now affects reads issued by `BackupEngine` on the database's SSTs.
+* `NO_ITERATORS` is divided into two counters `NO_ITERATOR_CREATED` and `NO_ITERATOR_DELETE`. Both of them are only increasing now, just as other counters.
+
+### Bug Fixes
+* Fix corner case where a write group leader blocked due to write stall blocks other writers in queue with WriteOptions::no_slowdown set.
+* Fix in-memory range tombstone truncation to avoid erroneously covering newer keys at a lower level, and include range tombstones in compacted files whose largest key is the range tombstone's start key.
+* Properly set the stop key for a truncated manual CompactRange
+* Fix slow flush/compaction when DB contains many snapshots. The problem became noticeable to us in DBs with 100,000+ snapshots, though it will affect others at different thresholds.
+* Fix the bug that WriteBatchWithIndex's SeekForPrev() doesn't see the entries with the same key.
+* Fix the bug where user comparator was sometimes fed with InternalKey instead of the user key. The bug manifests when during GenerateBottommostFiles.
+* Fix a bug in WritePrepared txns where if the number of old snapshots goes beyond the snapshot cache size (128 default) the rest will not be checked when evicting a commit entry from the commit cache.
+* Fixed Get correctness bug in the presence of range tombstones where merge operands covered by a range tombstone always result in NotFound.
+* Start populating `NO_FILE_CLOSES` ticker statistic, which was always zero previously.
+* The default value of NewBloomFilterPolicy()'s argument use_block_based_builder is changed to false. Note that this new default may cause large temp memory usage when building very large SST files.
+
+## 5.17.0 (2018-10-05)
+### Public API Change
+* `OnTableFileCreated` will now be called for empty files generated during compaction. In that case, `TableFileCreationInfo::file_path` will be "(nil)" and `TableFileCreationInfo::file_size` will be zero.
+* Add `FlushOptions::allow_write_stall`, which controls whether Flush calls start working immediately, even if it causes user writes to stall, or will wait until flush can be performed without causing write stall (similar to `CompactRangeOptions::allow_write_stall`). Note that the default value is false, meaning we add delay to Flush calls until stalling can be avoided when possible. This is behavior change compared to previous RocksDB versions, where Flush calls didn't check if they might cause stall or not.
+* Application using PessimisticTransactionDB is expected to rollback/commit recovered transactions before starting new ones. This assumption is used to skip concurrency control during recovery.
+* Expose column family id to `OnCompactionCompleted`.
+
+### New Features
+* TransactionOptions::skip_concurrency_control allows pessimistic transactions to skip the overhead of concurrency control. Could be used for optimizing certain transactions or during recovery.
+
+### Bug Fixes
+* Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction.
+* Sync CURRENT file contents during checkpoint.
+
+## 5.16.3 (2018-10-01)
+### Bug Fixes
+* Fix crash caused when `CompactFiles` run with `CompactionOptions::compression == CompressionType::kDisableCompressionOption`. Now that setting causes the compression type to be chosen according to the column family-wide compression options.
+
+## 5.16.2 (2018-09-21)
+### Bug Fixes
+* Fix bug in partition filters with format_version=4.
+
+## 5.16.1 (2018-09-17)
+### Bug Fixes
+* Remove trace_analyzer_tool from rocksdb_lib target in TARGETS file.
+* Fix RocksDB Java build and tests.
+* Remove sync point in Block destructor.
+
+## 5.16.0 (2018-08-21)
+### Public API Change
+* The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons
+* GetAllKeyVersions() to take an extra argument of `max_num_ikeys`.
+* Using ZSTD dictionary trainer (i.e., setting `CompressionOptions::zstd_max_train_bytes` to a nonzero value) now requires ZSTD version 1.1.3 or later.
+
+### New Features
+* Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used.
+* Add a new tool: trace_analyzer. Trace_analyzer analyzes the trace file generated by using trace_replay API. It can convert the binary format trace file to a human readable txt file, output the statistics of the analyzed query types such as access statistics and size statistics, combining the dumped whole key space file to analyze, support query correlation analyzing, and etc. Current supported query types are: Get, Put, Delete, SingleDelete, DeleteRange, Merge, Iterator (Seek, SeekForPrev only).
+* Add hash index support to data blocks, which helps reducing the cpu utilization of point-lookup operations. This feature is backward compatible with the data block created without the hash index. It is disabled by default unless BlockBasedTableOptions::data_block_index_type is set to data_block_index_type = kDataBlockBinaryAndHash.
+
+### Bug Fixes
+* Fix a bug in misreporting the estimated partition index size in properties block.
+
+## 5.15.0 (2018-07-17)
+### Public API Change
+* Remove managed iterator. ReadOptions.managed is not effective anymore.
+* For bottommost_compression, a compatible CompressionOptions is added via `bottommost_compression_opts`. To keep backward compatible, a new boolean `enabled` is added to CompressionOptions. For compression_opts, it will be always used no matter what value of `enabled` is. For bottommost_compression_opts, it will only be used when user set `enabled=true`, otherwise, compression_opts will be used for bottommost_compression as default.
+* With LRUCache, when high_pri_pool_ratio > 0, midpoint insertion strategy will be enabled to put low-pri items to the tail of low-pri list (the midpoint) when they first inserted into the cache. This is to make cache entries never get hit age out faster, improving cache efficiency when large background scan presents.
+* For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed.
+* The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries.
+
+### New Features
+* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used.
+* Avoid memcpy when reading mmap files with OpenReadOnly and max_open_files==-1.
+* Support dynamically changing `ColumnFamilyOptions::ttl` via `SetOptions()`.
+* Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table.
+* Improve the performance of iterators doing long range scans by using readahead, when using direct IO.
+* pin_top_level_index_and_filter (default true) in BlockBasedTableOptions can be used in combination with cache_index_and_filter_blocks to prefetch and pin the top-level index of partitioned index and filter blocks in cache. It has no impact when cache_index_and_filter_blocks is false.
+* Write properties meta-block at the end of block-based table to save read-ahead IO.
+
+### Bug Fixes
+* Fix deadlock with enable_pipelined_write=true and max_successive_merges > 0
+* Check conflict at output level in CompactFiles.
+* Fix corruption in non-iterator reads when mmap is used for file reads
+* Fix bug with prefix search in partition filters where a shared prefix would be ignored from the later partitions. The bug could report an eixstent key as missing. The bug could be triggered if prefix_extractor is set and partition filters is enabled.
+* Change default value of `bytes_max_delete_chunk` to 0 in NewSstFileManager() as it doesn't work well with checkpoints.
+* Fix a bug caused by not copying the block trailer with compressed SST file, direct IO, prefetcher and no compressed block cache.
+* Fix write can stuck indefinitely if enable_pipelined_write=true. The issue exists since pipelined write was introduced in 5.5.0.
+
+## 5.14.0 (2018-05-16)
+### Public API Change
+* Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages.
+* The background thread naming convention changed (on supporting platforms) to "rocksdb:<thread pool priority><thread number>", e.g., "rocksdb:low0".
+* Add a new ticker stat rocksdb.number.multiget.keys.found to count number of keys successfully read in MultiGet calls
+* Touch-up to write-related counters in PerfContext. New counters added: write_scheduling_flushes_compactions_time, write_thread_wait_nanos. Counters whose behavior was fixed or modified: write_memtable_time, write_pre_and_post_process_time, write_delay_time.
+* Posix Env's NewRandomRWFile() will fail if the file doesn't exist.
+* Now, `DBOptions::use_direct_io_for_flush_and_compaction` only applies to background writes, and `DBOptions::use_direct_reads` applies to both user reads and background reads. This conforms with Linux's `open(2)` manpage, which advises against simultaneously reading a file in buffered and direct modes, due to possibly undefined behavior and degraded performance.
+* Iterator::Valid() always returns false if !status().ok(). So, now when doing a Seek() followed by some Next()s, there's no need to check status() after every operation.
+* Iterator::Seek()/SeekForPrev()/SeekToFirst()/SeekToLast() always resets status().
+* Introduced `CompressionOptions::kDefaultCompressionLevel`, which is a generic way to tell RocksDB to use the compression library's default level. It is now the default value for `CompressionOptions::level`. Previously the level defaulted to -1, which gave poor compression ratios in ZSTD.
+
+### New Features
+* Introduce TTL for level compaction so that all files older than ttl go through the compaction process to get rid of old data.
+* TransactionDBOptions::write_policy can be configured to enable WritePrepared 2PC transactions. Read more about them in the wiki.
+* Add DB properties "rocksdb.block-cache-capacity", "rocksdb.block-cache-usage", "rocksdb.block-cache-pinned-usage" to show block cache usage.
+* Add `Env::LowerThreadPoolCPUPriority(Priority)` method, which lowers the CPU priority of background (esp. compaction) threads to minimize interference with foreground tasks.
+* Fsync parent directory after deleting a file in delete scheduler.
+* In level-based compaction, if bottom-pri thread pool was setup via `Env::SetBackgroundThreads()`, compactions to the bottom level will be delegated to that thread pool.
+* `prefix_extractor` has been moved from ImmutableCFOptions to MutableCFOptions, meaning it can be dynamically changed without a DB restart.
+
+### Bug Fixes
+* Fsync after writing global seq number to the ingestion file in ExternalSstFileIngestionJob.
+* Fix WAL corruption caused by race condition between user write thread and FlushWAL when two_write_queue is not set.
+* Fix `BackupableDBOptions::max_valid_backups_to_open` to not delete backup files when refcount cannot be accurately determined.
+* Fix memory leak when pin_l0_filter_and_index_blocks_in_cache is used with partitioned filters
+* Disable rollback of merge operands in WritePrepared transactions to work around an issue in MyRocks. It can be enabled back by setting TransactionDBOptions::rollback_merge_operands to true.
+* Fix wrong results by ReverseBytewiseComparator::FindShortSuccessor()
+
+### Java API Changes
+* Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances.
+* Added SstFileManager to the Java API to allow managing SST files across DB instances.
+
+## 5.13.0 (2018-03-20)
+### Public API Change
+* RocksDBOptionsParser::Parse()'s `ignore_unknown_options` argument will only be effective if the option file shows it is generated using a higher version of RocksDB than the current version.
+* Remove CompactionEventListener.
+
+### New Features
+* SstFileManager now can cancel compactions if they will result in max space errors. SstFileManager users can also use SetCompactionBufferSize to specify how much space must be leftover during a compaction for auxiliary file functions such as logging and flushing.
+* Avoid unnecessarily flushing in `CompactRange()` when the range specified by the user does not overlap unflushed memtables.
+* If `ColumnFamilyOptions::max_subcompactions` is set greater than one, we now parallelize large manual level-based compactions.
+* Add "rocksdb.live-sst-files-size" DB property to return total bytes of all SST files belong to the latest LSM tree.
+* NewSstFileManager to add an argument bytes_max_delete_chunk with default 64MB. With this argument, a file larger than 64MB will be ftruncated multiple times based on this size.
+
+### Bug Fixes
+* Fix a leak in prepared_section_completed_ where the zeroed entries would not removed from the map.
+* Fix WAL corruption caused by race condition between user write thread and backup/checkpoint thread.
+
+## 5.12.0 (2018-02-14)
+### Public API Change
+* Iterator::SeekForPrev is now a pure virtual method. This is to prevent user who implement the Iterator interface fail to implement SeekForPrev by mistake.
+* Add `include_end` option to make the range end exclusive when `include_end == false` in `DeleteFilesInRange()`.
+* Add `CompactRangeOptions::allow_write_stall`, which makes `CompactRange` start working immediately, even if it causes user writes to stall. The default value is false, meaning we add delay to `CompactRange` calls until stalling can be avoided when possible. Note this delay is not present in previous RocksDB versions.
+* Creating checkpoint with empty directory now returns `Status::InvalidArgument`; previously, it returned `Status::IOError`.
+* Adds a BlockBasedTableOption to turn off index block compression.
+* Close() method now returns a status when closing a db.
+
+### New Features
+* Improve the performance of iterators doing long range scans by using readahead.
+* Add new function `DeleteFilesInRanges()` to delete files in multiple ranges at once for better performance.
+* FreeBSD build support for RocksDB and RocksJava.
+* Improved performance of long range scans with readahead.
+* Updated to and now continuously tested in Visual Studio 2017.
+
+### Bug Fixes
+* Fix `DisableFileDeletions()` followed by `GetSortedWalFiles()` to not return obsolete WAL files that `PurgeObsoleteFiles()` is going to delete.
+* Fix Handle error return from WriteBuffer() during WAL file close and DB close.
+* Fix advance reservation of arena block addresses.
+* Fix handling of empty string as checkpoint directory.
+
+## 5.11.0 (2018-01-08)
+### Public API Change
+* Add `autoTune` and `getBytesPerSecond()` to RocksJava RateLimiter
+
+### New Features
+* Add a new histogram stat called rocksdb.db.flush.micros for memtable flush.
+* Add "--use_txn" option to use transactional API in db_stress.
+* Disable onboard cache for compaction output in Windows platform.
+* Improve the performance of iterators doing long range scans by using readahead.
+
+### Bug Fixes
+* Fix a stack-use-after-scope bug in ForwardIterator.
+* Fix builds on platforms including Linux, Windows, and PowerPC.
+* Fix buffer overrun in backup engine for DBs with huge number of files.
+* Fix a mislabel bug for bottom-pri compaction threads.
+* Fix DB::Flush() keep waiting after flush finish under certain condition.
+
+## 5.10.0 (2017-12-11)
+### Public API Change
+* When running `make` with environment variable `USE_SSE` set and `PORTABLE` unset, will use all machine features available locally. Previously this combination only compiled SSE-related features.
+
+### New Features
+* Provide lifetime hints when writing files on Linux. This reduces hardware write-amp on storage devices supporting multiple streams.
+* Add a DB stat, `NUMBER_ITER_SKIP`, which returns how many internal keys were skipped during iterations (e.g., due to being tombstones or duplicate versions of a key).
+* Add PerfContext counters, `key_lock_wait_count` and `key_lock_wait_time`, which measure the number of times transactions wait on key locks and total amount of time waiting.
+
+### Bug Fixes
+* Fix IOError on WAL write doesn't propagate to write group follower
+* Make iterator invalid on merge error.
+* Fix performance issue in `IngestExternalFile()` affecting databases with large number of SST files.
+* Fix possible corruption to LSM structure when `DeleteFilesInRange()` deletes a subset of files spanned by a `DeleteRange()` marker.
+
+## 5.9.0 (2017-11-01)
+### Public API Change
+* `BackupableDBOptions::max_valid_backups_to_open == 0` now means no backups will be opened during BackupEngine initialization. Previously this condition disabled limiting backups opened.
+* `DBOptions::preserve_deletes` is a new option that allows one to specify that DB should not drop tombstones for regular deletes if they have sequence number larger than what was set by the new API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)`. Disabled by default.
+* API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)` was added, users who wish to preserve deletes are expected to periodically call this function to advance the cutoff seqnum (all deletes made before this seqnum can be dropped by DB). It's user responsibility to figure out how to advance the seqnum in the way so the tombstones are kept for the desired period of time, yet are eventually processed in time and don't eat up too much space.
+* `ReadOptions::iter_start_seqnum` was added;
+if set to something > 0 user will see 2 changes in iterators behavior 1) only keys written with sequence larger than this parameter would be returned and 2) the `Slice` returned by iter->key() now points to the memory that keep User-oriented representation of the internal key, rather than user key. New struct `FullKey` was added to represent internal keys, along with a new helper function `ParseFullKey(const Slice& internal_key, FullKey* result);`.
+* Deprecate trash_dir param in NewSstFileManager, right now we will rename deleted files to <name>.trash instead of moving them to trash directory
+* Allow setting a custom trash/DB size ratio limit in the SstFileManager, after which files that are to be scheduled for deletion are deleted immediately, regardless of any delete ratelimit.
+* Return an error on write if write_options.sync = true and write_options.disableWAL = true to warn user of inconsistent options. Previously we will not write to WAL and not respecting the sync options in this case.
+
+### New Features
+* CRC32C is now using the 3-way pipelined SSE algorithm `crc32c_3way` on supported platforms to improve performance. The system will choose to use this algorithm on supported platforms automatically whenever possible. If PCLMULQDQ is not supported it will fall back to the old Fast_CRC32 algorithm.
+* `DBOptions::writable_file_max_buffer_size` can now be changed dynamically.
+* `DBOptions::bytes_per_sync`, `DBOptions::compaction_readahead_size`, and `DBOptions::wal_bytes_per_sync` can now be changed dynamically, `DBOptions::wal_bytes_per_sync` will flush all memtables and switch to a new WAL file.
+* Support dynamic adjustment of rate limit according to demand for background I/O. It can be enabled by passing `true` to the `auto_tuned` parameter in `NewGenericRateLimiter()`. The value passed as `rate_bytes_per_sec` will still be respected as an upper-bound.
+* Support dynamically changing `ColumnFamilyOptions::compaction_options_fifo`.
+* Introduce `EventListener::OnStallConditionsChanged()` callback. Users can implement it to be notified when user writes are stalled, stopped, or resumed.
+* Add a new db property "rocksdb.estimate-oldest-key-time" to return oldest data timestamp. The property is available only for FIFO compaction with compaction_options_fifo.allow_compaction = false.
+* Upon snapshot release, recompact bottommost files containing deleted/overwritten keys that previously could not be dropped due to the snapshot. This alleviates space-amp caused by long-held snapshots.
+* Support lower bound on iterators specified via `ReadOptions::iterate_lower_bound`.
+* Support for differential snapshots (via iterator emitting the sequence of key-values representing the difference between DB state at two different sequence numbers). Supports preserving and emitting puts and regular deletes, doesn't support SingleDeletes, MergeOperator, Blobs and Range Deletes.
+
+### Bug Fixes
+* Fix a potential data inconsistency issue during point-in-time recovery. `DB:Open()` will abort if column family inconsistency is found during PIT recovery.
+* Fix possible metadata corruption in databases using `DeleteRange()`.
+
+## 5.8.0 (2017-08-30)
+### Public API Change
+* Users of `Statistics::getHistogramString()` will see fewer histogram buckets and different bucket endpoints.
+* `Slice::compare` and BytewiseComparator `Compare` no longer accept `Slice`s containing nullptr.
+* `Transaction::Get` and `Transaction::GetForUpdate` variants with `PinnableSlice` added.
+
+### New Features
+* Add Iterator::Refresh(), which allows users to update the iterator state so that they can avoid some initialization costs of recreating iterators.
+* Replace dynamic_cast<> (except unit test) so people can choose to build with RTTI off. With make, release mode is by default built with -fno-rtti and debug mode is built without it. Users can override it by setting USE_RTTI=0 or 1.
+* Universal compactions including the bottom level can be executed in a dedicated thread pool. This alleviates head-of-line blocking in the compaction queue, which cause write stalling, particularly in multi-instance use cases. Users can enable this feature via `Env::SetBackgroundThreads(N, Env::Priority::BOTTOM)`, where `N > 0`.
+* Allow merge operator to be called even with a single merge operand during compactions, by appropriately overriding `MergeOperator::AllowSingleOperand`.
+* Add `DB::VerifyChecksum()`, which verifies the checksums in all SST files in a running DB.
+* Block-based table support for disabling checksums by setting `BlockBasedTableOptions::checksum = kNoChecksum`.
+
+### Bug Fixes
+* Fix wrong latencies in `rocksdb.db.get.micros`, `rocksdb.db.write.micros`, and `rocksdb.sst.read.micros`.
+* Fix incorrect dropping of deletions during intra-L0 compaction.
+* Fix transient reappearance of keys covered by range deletions when memtable prefix bloom filter is enabled.
+* Fix potentially wrong file smallest key when range deletions separated by snapshot are written together.
+
+## 5.7.0 (2017-07-13)
+### Public API Change
+* DB property "rocksdb.sstables" now prints keys in hex form.
+
+### New Features
+* Measure estimated number of reads per file. The information can be accessed through DB::GetColumnFamilyMetaData or "rocksdb.sstables" DB property.
+* RateLimiter support for throttling background reads, or throttling the sum of background reads and writes. This can give more predictable I/O usage when compaction reads more data than it writes, e.g., due to lots of deletions.
+* [Experimental] FIFO compaction with TTL support. It can be enabled by setting CompactionOptionsFIFO.ttl > 0.
+* Introduce `EventListener::OnBackgroundError()` callback. Users can implement it to be notified of errors causing the DB to enter read-only mode, and optionally override them.
+* Partitioned Index/Filters exiting the experimental mode. To enable partitioned indexes set index_type to kTwoLevelIndexSearch and to further enable partitioned filters set partition_filters to true. To configure the partition size set metadata_block_size.
+
+
+### Bug Fixes
+* Fix discarding empty compaction output files when `DeleteRange()` is used together with subcompactions.
+
+## 5.6.0 (2017-06-06)
+### Public API Change
+* Scheduling flushes and compactions in the same thread pool is no longer supported by setting `max_background_flushes=0`. Instead, users can achieve this by configuring their high-pri thread pool to have zero threads.
+* Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction.
+* options.delayed_write_rate by default take the value of options.rate_limiter rate.
+* Replace global variable `IOStatsContext iostats_context` with `IOStatsContext* get_iostats_context()`; replace global variable `PerfContext perf_context` with `PerfContext* get_perf_context()`.
+
+### New Features
+* Change ticker/histogram statistics implementations to use core-local storage. This improves aggregation speed compared to our previous thread-local approach, particularly for applications with many threads.
+* Users can pass a cache object to write buffer manager, so that they can cap memory usage for memtable and block cache using one single limit.
+* Flush will be triggered when 7/8 of the limit introduced by write_buffer_manager or db_write_buffer_size is triggered, so that the hard threshold is hard to hit.
+* Introduce WriteOptions.low_pri. If it is true, low priority writes will be throttled if the compaction is behind.
+* `DB::IngestExternalFile()` now supports ingesting files into a database containing range deletions.
+
+### Bug Fixes
+* Shouldn't ignore return value of fsync() in flush.
+
+## 5.5.0 (2017-05-17)
+### New Features
+* FIFO compaction to support Intra L0 compaction too with CompactionOptionsFIFO.allow_compaction=true.
+* DB::ResetStats() to reset internal stats.
+* Statistics::Reset() to reset user stats.
+* ldb add option --try_load_options, which will open DB with its own option file.
+* Introduce WriteBatch::PopSavePoint to pop the most recent save point explicitly.
+* Support dynamically change `max_open_files` option via SetDBOptions()
+* Added DB::CreateColumnFamilie() and DB::DropColumnFamilies() to bulk create/drop column families.
+* Add debugging function `GetAllKeyVersions` to see internal versions of a range of keys.
+* Support file ingestion with universal compaction style
+* Support file ingestion behind with option `allow_ingest_behind`
+* New option enable_pipelined_write which may improve write throughput in case writing from multiple threads and WAL enabled.
+
+### Bug Fixes
+* Fix the bug that Direct I/O uses direct reads for non-SST file
+
+## 5.4.0 (2017-04-11)
+### Public API Change
+* random_access_max_buffer_size no longer has any effect
+* Removed Env::EnableReadAhead(), Env::ShouldForwardRawRequest()
+* Support dynamically change `stats_dump_period_sec` option via SetDBOptions().
+* Added ReadOptions::max_skippable_internal_keys to set a threshold to fail a request as incomplete when too many keys are being skipped when using iterators.
+* DB::Get in place of std::string accepts PinnableSlice, which avoids the extra memcpy of value to std::string in most of cases.
+    * PinnableSlice releases the pinned resources that contain the value when it is destructed or when ::Reset() is called on it.
+    * The old API that accepts std::string, although discouraged, is still supported.
+* Replace Options::use_direct_writes with Options::use_direct_io_for_flush_and_compaction. Read Direct IO wiki for details.
+* Added CompactionEventListener and EventListener::OnFlushBegin interfaces.
+
+### New Features
+* Memtable flush can be avoided during checkpoint creation if total log file size is smaller than a threshold specified by the user.
+* Introduce level-based L0->L0 compactions to reduce file count, so write delays are incurred less often.
+* (Experimental) Partitioning filters which creates an index on the partitions. The feature can be enabled by setting partition_filters when using kFullFilter. Currently the feature also requires two-level indexing to be enabled. Number of partitions is the same as the number of partitions for indexes, which is controlled by metadata_block_size.
+
+## 5.3.0 (2017-03-08)
+### Public API Change
+* Remove disableDataSync option.
+* Remove timeout_hint_us option from WriteOptions. The option has been deprecated and has no effect since 3.13.0.
+* Remove option min_partial_merge_operands. Partial merge operands will always be merged in flush or compaction if there are more than one.
+* Remove option verify_checksums_in_compaction. Compaction will always verify checksum.
+
+### Bug Fixes
+* Fix the bug that iterator may skip keys
+
+## 5.2.0 (2017-02-08)
+### Public API Change
+* NewLRUCache() will determine number of shard bits automatically based on capacity, if the user doesn't pass one. This also impacts the default block cache when the user doesn't explicit provide one.
+* Change the default of delayed slowdown value to 16MB/s and further increase the L0 stop condition to 36 files.
+* Options::use_direct_writes and Options::use_direct_reads are now ready to use.
+* (Experimental) Two-level indexing that partition the index and creates a 2nd level index on the partitions. The feature can be enabled by setting kTwoLevelIndexSearch as IndexType and configuring index_per_partition.
+
+### New Features
+* Added new overloaded function GetApproximateSizes that allows to specify if memtable stats should be computed only without computing SST files' stats approximations.
+* Added new function GetApproximateMemTableStats that approximates both number of records and size of memtables.
+* Add Direct I/O mode for SST file I/O
+
+### Bug Fixes
+* RangeSync() should work if ROCKSDB_FALLOCATE_PRESENT is not set
+* Fix wrong results in a data race case in Get()
+* Some fixes related to 2PC.
+* Fix bugs of data corruption in direct I/O
+
+## 5.1.0 (2017-01-13)
+* Support dynamically change `delete_obsolete_files_period_micros` option via SetDBOptions().
+* Added EventListener::OnExternalFileIngested which will be called when IngestExternalFile() add a file successfully.
+* BackupEngine::Open and BackupEngineReadOnly::Open now always return error statuses matching those of the backup Env.
+
+### Bug Fixes
+* Fix the bug that if 2PC is enabled, checkpoints may loss some recent transactions.
+* When file copying is needed when creating checkpoints or bulk loading files, fsync the file after the file copying.
+
+## 5.0.0 (2016-11-17)
+### Public API Change
+* Options::max_bytes_for_level_multiplier is now a double along with all getters and setters.
+* Support dynamically change `delayed_write_rate` and `max_total_wal_size` options via SetDBOptions().
+* Introduce DB::DeleteRange for optimized deletion of large ranges of contiguous keys.
+* Support dynamically change `delayed_write_rate` option via SetDBOptions().
+* Options::allow_concurrent_memtable_write and Options::enable_write_thread_adaptive_yield are now true by default.
+* Remove Tickers::SEQUENCE_NUMBER to avoid confusion if statistics object is shared among RocksDB instance. Alternatively DB::GetLatestSequenceNumber() can be used to get the same value.
+* Options.level0_stop_writes_trigger default value changes from 24 to 32.
+* New compaction filter API: CompactionFilter::FilterV2(). Allows to drop ranges of keys.
+* Removed flashcache support.
+* DB::AddFile() is deprecated and is replaced with DB::IngestExternalFile(). DB::IngestExternalFile() remove all the restrictions that existed for DB::AddFile.
+
+### New Features
+* Add avoid_flush_during_shutdown option, which speeds up DB shutdown by not flushing unpersisted data (i.e. with disableWAL = true). Unpersisted data will be lost. The options is dynamically changeable via SetDBOptions().
+* Add memtable_insert_with_hint_prefix_extractor option. The option is mean to reduce CPU usage for inserting keys into memtable, if keys can be group by prefix and insert for each prefix are sequential or almost sequential. See include/rocksdb/options.h for more details.
+* Add LuaCompactionFilter in utilities.  This allows developers to write compaction filters in Lua.  To use this feature, LUA_PATH needs to be set to the root directory of Lua.
+* No longer populate "LATEST_BACKUP" file in backup directory, which formerly contained the number of the latest backup. The latest backup can be determined by finding the highest numbered file in the "meta/" subdirectory.
+
+## 4.13.0 (2016-10-18)
+### Public API Change
+* DB::GetOptions() reflect dynamic changed options (i.e. through DB::SetOptions()) and return copy of options instead of reference.
+* Added Statistics::getAndResetTickerCount().
+
+### New Features
+* Add DB::SetDBOptions() to dynamic change base_background_compactions and max_background_compactions.
+* Added Iterator::SeekForPrev(). This new API will seek to the last key that less than or equal to the target key.
+
+## 4.12.0 (2016-09-12)
+### Public API Change
+* CancelAllBackgroundWork() flushes all memtables for databases containing writes that have bypassed the WAL (writes issued with WriteOptions::disableWAL=true) before shutting down background threads.
+* Merge options source_compaction_factor, max_grandparent_overlap_bytes and expanded_compaction_factor into max_compaction_bytes.
+* Remove ImmutableCFOptions.
+* Add a compression type ZSTD, which can work with ZSTD 0.8.0 or up. Still keep ZSTDNotFinal for compatibility reasons.
+
+### New Features
+* Introduce NewClockCache, which is based on CLOCK algorithm with better concurrent performance in some cases. It can be used to replace the default LRU-based block cache and table cache. To use it, RocksDB need to be linked with TBB lib.
+* Change ticker/histogram statistics implementations to accumulate data in thread-local storage, which improves CPU performance by reducing cache coherency costs. Callers of CreateDBStatistics do not need to change anything to use this feature.
+* Block cache mid-point insertion, where index and filter block are inserted into LRU block cache with higher priority. The feature can be enabled by setting BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority to true and high_pri_pool_ratio > 0 when creating NewLRUCache.
+
+## 4.11.0 (2016-08-01)
+### Public API Change
+* options.memtable_prefix_bloom_huge_page_tlb_size => memtable_huge_page_size. When it is set, RocksDB will try to allocate memory from huge page for memtable too, rather than just memtable bloom filter.
+
+### New Features
+* A tool to migrate DB after options change. See include/rocksdb/utilities/option_change_migration.h.
+* Add ReadOptions.background_purge_on_iterator_cleanup. If true, we avoid file deletion when destroying iterators.
+
+## 4.10.0 (2016-07-05)
+### Public API Change
+* options.memtable_prefix_bloom_bits changes to options.memtable_prefix_bloom_bits_ratio and deprecate options.memtable_prefix_bloom_probes
+* enum type CompressionType and PerfLevel changes from char to unsigned char. Value of all PerfLevel shift by one.
+* Deprecate options.filter_deletes.
+
+### New Features
+* Add avoid_flush_during_recovery option.
+* Add a read option background_purge_on_iterator_cleanup to avoid deleting files in foreground when destroying iterators. Instead, a job is scheduled in high priority queue and would be executed in a separate background thread.
+* RepairDB support for column families. RepairDB now associates data with non-default column families using information embedded in the SST/WAL files (4.7 or later). For data written by 4.6 or earlier, RepairDB associates it with the default column family.
+* Add options.write_buffer_manager which allows users to control total memtable sizes across multiple DB instances.
+
+## 4.9.0 (2016-06-09)
+### Public API changes
+* Add bottommost_compression option, This option can be used to set a specific compression algorithm for the bottommost level (Last level containing files in the DB).
+* Introduce CompactionJobInfo::compression, This field state the compression algorithm used to generate the output files of the compaction.
+* Deprecate BlockBaseTableOptions.hash_index_allow_collision=false
+* Deprecate options builder (GetOptions()).
+
+### New Features
+* Introduce NewSimCache() in rocksdb/utilities/sim_cache.h. This function creates a block cache that is able to give simulation results (mainly hit rate) of simulating block behavior with a configurable cache size.
+
+## 4.8.0 (2016-05-02)
+### Public API Change
+* Allow preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes.
+* Delete deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now, BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F
+* Expose estimate of per-level compression ratio via DB property: "rocksdb.compression-ratio-at-levelN".
+* Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will be called on failure case. User can check creation status via TableFileCreationInfo::status.
+
+### New Features
+* Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size.
+
+## 4.7.0 (2016-04-08)
+### Public API Change
+* rename options compaction_measure_io_stats to report_bg_io_stats and include flush too.
+* Change some default options. Now default options will optimize for server-workloads. Also enable slowdown and full stop triggers for pending compaction bytes. These changes may cause sub-optimal performance or significant increase of resource usage. To avoid these risks, users can open existing RocksDB with options extracted from RocksDB option files. See https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File for how to use RocksDB option files. Or you can call Options.OldDefaults() to recover old defaults. DEFAULT_OPTIONS_HISTORY.md will track change history of default options.
+
+## 4.6.0 (2016-03-10)
+### Public API Changes
+* Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier.
+* Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signature of Cache::Insert() is updated accordingly.
+* Tickers [NUMBER_DB_NEXT, NUMBER_DB_PREV, NUMBER_DB_NEXT_FOUND, NUMBER_DB_PREV_FOUND, ITER_BYTES_READ] are not updated immediately. The are updated when the Iterator is deleted.
+* Add monotonically increasing counter (DB property "rocksdb.current-super-version-number") that increments upon any change to the LSM tree.
+
+### New Features
+* Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification.
+* Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned"
+
+## 4.5.0 (2016-02-05)
+### Public API Changes
+* Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes.
+* Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll.
+* DBOptions::delete_scheduler and NewDeleteScheduler() are removed, please use DBOptions::sst_file_manager and NewSstFileManager() instead
+
+### New Features
+* ldb tool now supports operations to non-default column families.
+* Add kPersistedTier to ReadTier.  This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true.
+* Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate.
+
+## 4.4.0 (2016-01-14)
+### Public API Changes
+* Change names in CompactionPri and add a new one.
+* Deprecate options.soft_rate_limit and add options.soft_pending_compaction_bytes_limit.
+* If options.max_write_buffer_number > 3, writes will be slowed down when writing to the last write buffer to delay a full stop.
+* Introduce CompactionJobInfo::compaction_reason, this field include the reason to trigger the compaction.
+* After slow down is triggered, if estimated pending compaction bytes keep increasing, slowdown more.
+* Increase default options.delayed_write_rate to 2MB/s.
+* Added a new parameter --path to ldb tool. --path accepts the name of either MANIFEST, SST or a WAL file. Either --db or --path can be used when calling ldb.
+
+## 4.3.0 (2015-12-08)
+### New Features
+* CompactionFilter has new member function called IgnoreSnapshots which allows CompactionFilter to be called even if there are snapshots later than the key.
+* RocksDB will now persist options under the same directory as the RocksDB database on successful DB::Open, CreateColumnFamily, DropColumnFamily, and SetOptions.
+* Introduce LoadLatestOptions() in rocksdb/utilities/options_util.h.  This function can construct the latest DBOptions / ColumnFamilyOptions used by the specified RocksDB intance.
+* Introduce CheckOptionsCompatibility() in rocksdb/utilities/options_util.h.  This function checks whether the input set of options is able to open the specified DB successfully.
+
+### Public API Changes
+* When options.db_write_buffer_size triggers, only the column family with the largest column family size will be flushed, not all the column families.
+
+## 4.2.0 (2015-11-09)
+### New Features
+* Introduce CreateLoggerFromOptions(), this function create a Logger for provided DBOptions.
+* Add GetAggregatedIntProperty(), which returns the sum of the GetIntProperty of all the column families.
+* Add MemoryUtil in rocksdb/utilities/memory.h.  It currently offers a way to get the memory usage by type from a list rocksdb instances.
+
+### Public API Changes
+* CompactionFilter::Context includes information of Column Family ID
+* The need-compaction hint given by TablePropertiesCollector::NeedCompact() will be persistent and recoverable after DB recovery. This introduces a breaking format change. If you use this experimental feature, including NewCompactOnDeletionCollectorFactory() in the new version, you may not be able to directly downgrade the DB back to version 4.0 or lower.
+* TablePropertiesCollectorFactory::CreateTablePropertiesCollector() now takes an option Context, containing the information of column family ID for the file being written.
+* Remove DefaultCompactionFilterFactory.
+
+
+## 4.1.0 (2015-10-08)
+### New Features
+* Added single delete operation as a more efficient way to delete keys that have not been overwritten.
+* Added experimental AddFile() to DB interface that allow users to add files created by SstFileWriter into an empty Database, see include/rocksdb/sst_file_writer.h and DB::AddFile() for more info.
+* Added support for opening SST files with .ldb suffix which enables opening LevelDB databases.
+* CompactionFilter now supports filtering of merge operands and merge results.
+
+### Public API Changes
+* Added SingleDelete() to the DB interface.
+* Added AddFile() to DB interface.
+* Added SstFileWriter class.
+* CompactionFilter has a new method FilterMergeOperand() that RocksDB applies to every merge operand during compaction to decide whether to filter the operand.
+* We removed CompactionFilterV2 interfaces from include/rocksdb/compaction_filter.h. The functionality was deprecated already in version 3.13.
+
+## 4.0.0 (2015-09-09)
+### New Features
+* Added support for transactions.  See include/rocksdb/utilities/transaction.h for more info.
+* DB::GetProperty() now accepts "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used.
+* Add compression option kZSTDNotFinalCompression for people to experiment ZSTD although its format is not finalized.
+* We removed the need for LATEST_BACKUP file in BackupEngine. We still keep writing it when we create new backups (because of backward compatibility), but we don't read it anymore.
+
+### Public API Changes
+* Removed class Env::RandomRWFile and Env::NewRandomRWFile().
+* Renamed DBOptions.num_subcompactions to DBOptions.max_subcompactions to make the name better match the actual functionality of the option.
+* Added Equal() method to the Comparator interface that can optionally be overwritten in cases where equality comparisons can be done more efficiently than three-way comparisons.
+* Previous 'experimental' OptimisticTransaction class has been replaced by Transaction class.
+
+## 3.13.0 (2015-08-06)
+### New Features
+* RollbackToSavePoint() in WriteBatch/WriteBatchWithIndex
+* Add NewCompactOnDeletionCollectorFactory() in utilities/table_properties_collectors, which allows rocksdb to mark a SST file as need-compaction when it observes at least D deletion entries in any N consecutive entries in that SST file.  Note that this feature depends on an experimental NeedCompact() API --- the result of this API will not persist after DB restart.
+* Add DBOptions::delete_scheduler. Use NewDeleteScheduler() in include/rocksdb/delete_scheduler.h to create a DeleteScheduler that can be shared among multiple RocksDB instances to control the file deletion rate of SST files that exist in the first db_path.
+
+### Public API Changes
+* Deprecated WriteOptions::timeout_hint_us. We no longer support write timeout. If you really need this option, talk to us and we might consider returning it.
+* Deprecated purge_redundant_kvs_while_flush option.
+* Removed BackupEngine::NewBackupEngine() and NewReadOnlyBackupEngine() that were deprecated in RocksDB 3.8. Please use BackupEngine::Open() instead.
+* Deprecated Compaction Filter V2. We are not aware of any existing use-cases. If you use this filter, your compile will break with RocksDB 3.13. Please let us know if you use it and we'll put it back in RocksDB 3.14.
+* Env::FileExists now returns a Status instead of a boolean
+* Add statistics::getHistogramString() to print detailed distribution of a histogram metric.
+* Add DBOptions::skip_stats_update_on_db_open.  When it is on, DB::Open() will run faster as it skips the random reads required for loading necessary stats from SST files to optimize compaction.
+
+## 3.12.0 (2015-07-02)
+### New Features
+* Added experimental support for optimistic transactions.  See include/rocksdb/utilities/optimistic_transaction.h for more info.
+* Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds)
+* Added a cache for individual rows. See DBOptions::row_cache for more info.
+* Several new features on EventListener (see include/rocksdb/listener.h):
+ - OnCompactionCompleted() now returns per-compaction job statistics, defined in include/rocksdb/compaction_job_stats.h.
+ - Added OnTableFileCreated() and OnTableFileDeleted().
+* Add compaction_options_universal.enable_trivial_move to true, to allow trivial move while performing universal compaction. Trivial move will happen only when all the input files are non overlapping.
+
+### Public API changes
+* EventListener::OnFlushCompleted() now passes FlushJobInfo instead of a list of parameters.
+* DB::GetDbIdentity() is now a const function.  If this function is overridden in your application, be sure to also make GetDbIdentity() const to avoid compile error.
+* Move listeners from ColumnFamilyOptions to DBOptions.
+* Add max_write_buffer_number_to_maintain option
+* DB::CompactRange()'s parameter reduce_level is changed to change_level, to allow users to move levels to lower levels if allowed. It can be used to migrate a DB from options.level_compaction_dynamic_level_bytes=false to options.level_compaction_dynamic_level_bytes.true.
+* Change default value for options.compaction_filter_factory and options.compaction_filter_factory_v2 to nullptr instead of DefaultCompactionFilterFactory and DefaultCompactionFilterFactoryV2.
+* If CancelAllBackgroundWork is called without doing a flush after doing loads with WAL disabled, the changes which haven't been flushed before the call to CancelAllBackgroundWork will be lost.
+* WBWIIterator::Entry() now returns WriteEntry instead of `const WriteEntry&`
+* options.hard_rate_limit is deprecated.
+* When options.soft_rate_limit or options.level0_slowdown_writes_trigger is triggered, the way to slow down writes is changed to: write rate to DB is limited to to options.delayed_write_rate.
+* DB::GetApproximateSizes() adds a parameter to allow the estimation to include data in mem table, with default to be not to include. It is now only supported in skip list mem table.
+* DB::CompactRange() now accept CompactRangeOptions instead of multiple parameters. CompactRangeOptions is defined in include/rocksdb/options.h.
+* CompactRange() will now skip bottommost level compaction for level based compaction if there is no compaction filter, bottommost_level_compaction is introduced in CompactRangeOptions to control when it's possible to skip bottommost level compaction. This mean that if you want the compaction to produce a single file you need to set bottommost_level_compaction to BottommostLevelCompaction::kForce.
+* Add Cache.GetPinnedUsage() to get the size of memory occupied by entries that are in use by the system.
+* DB:Open() will fail if the compression specified in Options is not linked with the binary. If you see this failure, recompile RocksDB with compression libraries present on your system. Also, previously our default compression was snappy. This behavior is now changed. Now, the default compression is snappy only if it's available on the system. If it isn't we change the default to kNoCompression.
+* We changed how we account for memory used in block cache. Previously, we only counted the sum of block sizes currently present in block cache. Now, we count the actual memory usage of the blocks. For example, a block of size 4.5KB will use 8KB memory with jemalloc. This might decrease your memory usage and possibly decrease performance. Increase block cache size if you see this happening after an upgrade.
+* Add BackupEngineImpl.options_.max_background_operations to specify the maximum number of operations that may be performed in parallel. Add support for parallelized backup and restore.
+* Add DB::SyncWAL() that does a WAL sync without blocking writers.
+
+## 3.11.0 (2015-05-19)
+### New Features
+* Added a new API Cache::SetCapacity(size_t capacity) to dynamically change the maximum configured capacity of the cache. If the new capacity is less than the existing cache usage, the implementation will try to lower the usage by evicting the necessary number of elements following a strict LRU policy.
+* Added an experimental API for handling flashcache devices (blacklists background threads from caching their reads) -- NewFlashcacheAwareEnv
+* If universal compaction is used and options.num_levels > 1, compact files are tried to be stored in none-L0 with smaller files based on options.target_file_size_base. The limitation of DB size when using universal compaction is greatly mitigated by using more levels. You can set num_levels = 1 to make universal compaction behave as before. If you set num_levels > 1 and want to roll back to a previous version, you need to compact all files to a big file in level 0 (by setting target_file_size_base to be large and CompactRange(<cf_handle>, nullptr, nullptr, true, 0) and reopen the DB with the same version to rewrite the manifest, and then you can open it using previous releases.
+* More information about rocksdb background threads are available in Env::GetThreadList(), including the number of bytes read / written by a compaction job, mem-table size and current number of bytes written by a flush job and many more.  Check include/rocksdb/thread_status.h for more detail.
+
+### Public API changes
+* TablePropertiesCollector::AddUserKey() is added to replace TablePropertiesCollector::Add(). AddUserKey() exposes key type, sequence number and file size up to now to users.
+* DBOptions::bytes_per_sync used to apply to both WAL and table files. As of 3.11 it applies only to table files. If you want to use this option to sync WAL in the background, please use wal_bytes_per_sync
+
+## 3.10.0 (2015-03-24)
+### New Features
+* GetThreadStatus() is now able to report detailed thread status, including:
+ - Thread Operation including flush and compaction.
+ - The stage of the current thread operation.
+ - The elapsed time in micros since the current thread operation started.
+ More information can be found in include/rocksdb/thread_status.h.  In addition, when running db_bench with --thread_status_per_interval, db_bench will also report thread status periodically.
+* Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted. This change made parameter removeScanCountLimit obsolete. Because of that NewLRUCache doesn't take three arguments anymore. table_cache_remove_scan_limit option is also removed
+* By default we now optimize the compilation for the compilation platform (using -march=native). If you want to build portable binary, use 'PORTABLE=1' before the make command.
+* We now allow level-compaction to place files in different paths by
+  specifying them in db_paths along with the target_size.
+  Lower numbered levels will be placed earlier in the db_paths and higher
+  numbered levels will be placed later in the db_paths vector.
+* Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000)
+* Added BlockBasedTableOptions.format_version option, which allows user to specify which version of block based table he wants. As a general guideline, newer versions have more features, but might not be readable by older versions of RocksDB.
+* Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions.
+* MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv().
+* Add SliceTransform.SameResultWhenAppended() to help users determine it is safe to apply prefix bloom/hash.
+* Block based table now makes use of prefix bloom filter if it is a full fulter.
+* Block based table remembers whether a whole key or prefix based bloom filter is supported in SST files. Do a sanity check when reading the file with users' configuration.
+* Fixed a bug in ReadOnlyBackupEngine that deleted corrupted backups in some cases, even though the engine was ReadOnly
+* options.level_compaction_dynamic_level_bytes, a feature to allow RocksDB to pick dynamic base of bytes for levels. With this feature turned on, we will automatically adjust max bytes for each level. The goal of this feature is to have lower bound on size amplification. For more details, see comments in options.h.
+* Added an abstract base class WriteBatchBase for write batches
+* Fixed a bug where we start deleting files of a dropped column families even if there are still live references to it
+
+### Public API changes
+* Deprecated skip_log_error_on_recovery and table_cache_remove_scan_count_limit options.
+* Logger method logv with log level parameter is now virtual
+
+### RocksJava
+* Added compression per level API.
+* MemEnv is now available in RocksJava via RocksMemEnv class.
+* lz4 compression is now included in rocksjava static library when running `make rocksdbjavastatic`.
+* Overflowing a size_t when setting rocksdb options now throws an IllegalArgumentException, which removes the necessity for a developer to catch these Exceptions explicitly.
+
+## 3.9.0 (2014-12-08)
+
+### New Features
+* Add rocksdb::GetThreadList(), which in the future will return the current status of all
+  rocksdb-related threads.  We will have more code instruments in the following RocksDB
+  releases.
+* Change convert function in rocksdb/utilities/convenience.h to return Status instead of boolean.
+  Also add support for nested options in convert function
+
+### Public API changes
+* New API to create a checkpoint added. Given a directory name, creates a new
+  database which is an image of the existing database.
+* New API LinkFile added to Env. If you implement your own Env class, an
+  implementation of the API LinkFile will have to be provided.
+* MemTableRep takes MemTableAllocator instead of Arena
+
+### Improvements
+* RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag.
+
+## 3.8.0 (2014-11-14)
+
+### Public API changes
+* BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on.
+* BackupableDB/RestoreBackupableDB have new GarbageCollect() methods, which will clean up files from corrupt and obsolete backups.
+* BackupableDB/RestoreBackupableDB have new GetCorruptedBackups() methods which list corrupt backups.
+
+### Cleanup
+* Bunch of code cleanup, some extra warnings turned on (-Wshadow, -Wshorten-64-to-32, -Wnon-virtual-dtor)
+
+### New features
+* CompactFiles and EventListener, although they are still in experimental state
+* Full ColumnFamily support in RocksJava.
+
+## 3.7.0 (2014-11-06)
+### Public API changes
+* Introduce SetOptions() API to allow adjusting a subset of options dynamically online
+* Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString()
+* Remove WriteBatchWithIndex.Delete() overloads using SliceParts
+* When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it.
+
+## 3.6.0 (2014-10-07)
+### Disk format changes
+* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy
+
+### Behavior changes
+* We have refactored our system of stalling writes.  Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6.
+* When disableDataSync=true, we no longer sync the MANIFEST file.
+* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly.
+
+### Public API changes
+* Change target_file_size_base type to uint64_t from int.
+* Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on.
+
+## 3.5.0 (2014-09-03)
+### New Features
+* Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it.
+* Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include:
+  no_block_cache, block_cache, block_cache_compressed, block_size, block_size_deviation, block_restart_interval, filter_policy, whole_key_filtering. filter_policy is changed to shared_ptr from a raw pointer.
+* Remove deprecated options: disable_seek_compaction and db_stats_log_interval
+* OptimizeForPointLookup() takes one parameter for block cache size. It now builds hash index, bloom filter, and block cache.
+
+### Public API changes
+* The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key.
+
+## 3.4.0 (2014-08-18)
+### New Features
+* Support Multiple DB paths in universal style compactions
+* Add feature of storing plain table index and bloom filter in SST file.
+* CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0.
+* Added iterate_upper_bound to define the extent upto which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyways. This may improve performance in case there are a large number of delete markers or overwritten entries.
+
+### Public API changes
+* DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size
+* NewPlainTableFactory instead of bunch of parameters now accepts PlainTableOptions, which is defined in include/rocksdb/table.h
+* Moved include/utilities/*.h to include/rocksdb/utilities/*.h
+* Statistics APIs now take uint32_t as type instead of Tickers. Also make two access functions getTickerCount and histogramData const
+* Add DB property rocksdb.estimate-num-keys, estimated number of live keys in DB.
+* Add DB::GetIntProperty(), which returns DB properties that are integer as uint64_t.
+* The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key.
+
+## 3.3.0 (2014-07-10)
+### New Features
+* Added JSON API prototype.
+* HashLinklist reduces performance outlier caused by skewed bucket by switching data in the bucket from linked list to skip list. Add parameter threshold_use_skiplist in NewHashLinkListRepFactory().
+* RocksDB is now able to reclaim storage space more effectively during the compaction process.  This is done by compensating the size of each deletion entry by the 2X average value size, which makes compaction to be triggered by deletion entries more easily.
+* Add TimeOut API to write.  Now WriteOptions have a variable called timeout_hint_us.  With timeout_hint_us set to non-zero, any write associated with this timeout_hint_us may be aborted when it runs longer than the specified timeout_hint_us, and it is guaranteed that any write completes earlier than the specified time-out will not be aborted due to the time-out condition.
+* Add a rate_limiter option, which controls total throughput of flush and compaction. The throughput is specified in bytes/sec. Flush always has precedence over compaction when available bandwidth is constrained.
+
+### Public API changes
+* Removed NewTotalOrderPlainTableFactory because it is not used and implemented semantically incorrect.
+
+## 3.2.0 (2014-06-20)
+
+### Public API changes
+* We removed seek compaction as a concept from RocksDB because:
+1) It makes more sense for spinning disk workloads, while RocksDB is primarily designed for flash and memory,
+2) It added some complexity to the important code-paths,
+3) None of our internal customers were really using it.
+Because of that, Options::disable_seek_compaction is now obsolete. It is still a parameter in Options, so it does not break the build, but it does not have any effect. We plan to completely remove it at some point, so we ask users to please remove this option from your code base.
+* Add two parameters to NewHashLinkListRepFactory() for logging on too many entries in a hash bucket when flushing.
+* Added new option BlockBasedTableOptions::hash_index_allow_collision. When enabled, prefix hash index for block-based table will not store prefix and allow hash collision, reducing memory consumption.
+
+### New Features
+* PlainTable now supports a new key encoding: for keys of the same prefix, the prefix is only written once. It can be enabled through encoding_type parameter of NewPlainTableFactory()
+* Add AdaptiveTableFactory, which is used to convert from a DB of PlainTable to BlockBasedTabe, or vise versa. It can be created using NewAdaptiveTableFactory()
+
+### Performance Improvements
+* Tailing Iterator re-implemeted with ForwardIterator + Cascading Search Hint , see ~20% throughput improvement.
+
+## 3.1.0 (2014-05-21)
+
+### Public API changes
+* Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories
+
+### New Features
+* Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open.
+* FIFO compaction style
+
+## 3.0.0 (2014-05-05)
+
+### Public API changes
+* Added _LEVEL to all InfoLogLevel enums
+* Deprecated ReadOptions.prefix and ReadOptions.prefix_seek. Seek() defaults to prefix-based seek when Options.prefix_extractor is supplied. More detail is documented in https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes
+* MemTableRepFactory::CreateMemTableRep() takes info logger as an extra parameter.
+
+### New Features
+* Column family support
+* Added an option to use different checksum functions in BlockBasedTableOptions
+* Added ApplyToAllCacheEntries() function to Cache
+
+## 2.8.0 (2014-04-04)
+
+* Removed arena.h from public header files.
+* By default, checksums are verified on every read from database
+* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false
+* Added is_manual_compaction to CompactionFilter::Context
+* Added "virtual void WaitForJoin()" in class Env. Default operation is no-op.
+* Removed BackupEngine::DeleteBackupsNewerThan() function
+* Added new option -- verify_checksums_in_compaction
+* Changed Options.prefix_extractor from raw pointer to shared_ptr (take ownership)
+  Changed HashSkipListRepFactory and HashLinkListRepFactory constructor to not take SliceTransform object (use Options.prefix_extractor implicitly)
+* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
+* Added a command "checkconsistency" in ldb tool, which checks
+  if file system state matches DB state (file existence and file sizes)
+* Separate options related to block based table to a new struct BlockBasedTableOptions.
+* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy
+* Add more counters to perf context.
+* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table.
+
+### New Features
+* If we find one truncated record at the end of the MANIFEST or WAL files,
+  we will ignore it. We assume that writers of these records were interrupted
+  and that we can safely ignore it.
+* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory().
+* A new mem table implementation hash linked list optimizing for the case that there are only few keys for each prefix, which can be created through NewHashLinkListRepFactory().
+* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands.
+* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1.
+* Geo-spatial support for locations and radial-search.
+
+## 2.7.0 (2014-01-28)
+
+### Public API changes
+
+* Renamed `StackableDB::GetRawDB()` to `StackableDB::GetBaseDB()`.
+* Renamed `WriteBatch::Data()` `const std::string& Data() const`.
+* Renamed class `TableStats` to `TableProperties`.
+* Deleted class `PrefixHashRepFactory`. Please use `NewHashSkipListRepFactory()` instead.
+* Supported multi-threaded `EnableFileDeletions()` and `DisableFileDeletions()`.
+* Added `DB::GetOptions()`.
+* Added `DB::GetDbIdentity()`.
+
+### New Features
+
+* Added [BackupableDB](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
+* Implemented [TailingIterator](https://github.com/facebook/rocksdb/wiki/Tailing-Iterator), a special type of iterator that
+  doesn't create a snapshot (can be used to read newly inserted data)
+  and is optimized for doing sequential reads.
+* Added property block for table, which allows (1) a table to store
+  its metadata and (2) end user to collect and store properties they
+  are interested in.
+* Enabled caching index and filter block in block cache (turned off by default).
+* Supported error report when doing manual compaction.
+* Supported additional Linux platform flavors and Mac OS.
+* Put with `SliceParts` - Variant of `Put()` that gathers output like `writev(2)`
+* Bug fixes and code refactor for compatibility with upcoming Column
+  Family feature.
+
+### Performance Improvements
+
+* Huge benchmark performance improvements by multiple efforts. For example, increase in readonly QPS from about 530k in 2.6 release to 1.1 million in 2.7 [1]
+* Speeding up a way RocksDB deleted obsolete files - no longer listing the whole directory under a lock -- decrease in p99
+* Use raw pointer instead of shared pointer for statistics: [5b825d](https://github.com/facebook/rocksdb/commit/5b825d6964e26ec3b4bb6faa708ebb1787f1d7bd) -- huge increase in performance -- shared pointers are slow
+* Optimized locking for `Get()` -- [1fdb3f](https://github.com/facebook/rocksdb/commit/1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c) -- 1.5x QPS increase for some workloads
+* Cache speedup - [e8d40c3](https://github.com/facebook/rocksdb/commit/e8d40c31b3cca0c3e1ae9abe9b9003b1288026a9)
+* Implemented autovector, which allocates first N elements on stack. Most of vectors in RocksDB are small. Also, we never want to allocate heap objects while holding a mutex. -- [c01676e4](https://github.com/facebook/rocksdb/commit/c01676e46d3be08c3c140361ef1f5884f47d3b3c)
+* Lots of efforts to move malloc, memcpy and IO outside of locks
diff --git a/src/rocksdb/INSTALL.md b/src/rocksdb/INSTALL.md
new file mode 100644
index 000000000..7d3b14779
--- /dev/null
+++ b/src/rocksdb/INSTALL.md
@@ -0,0 +1,212 @@
+## Compilation
+
+**Important**: If you plan to run RocksDB in production, don't compile using default
+`make` or `make all`. That will compile RocksDB in debug mode, which is much slower
+than release mode.
+
+RocksDB's library should be able to compile without any dependency installed,
+although we recommend installing some compression libraries (see below).
+We do depend on newer gcc/clang with C++17 support (GCC >= 7, Clang >= 5).
+
+There are few options when compiling RocksDB:
+
+* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library. Compiles static library in release mode.
+
+* `make shared_lib` will compile librocksdb.so, RocksDB shared library. Compiles shared library in release mode.
+
+* `make check` will compile and run all the unit tests. `make check` will compile RocksDB in debug mode.
+
+* `make all` will compile our static library, and all our tools and unit tests. Our tools
+depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't
+use binaries compiled by `make all` in production.
+
+* By default the binary we produce is optimized for the platform you're compiling on
+(`-march=native` or the equivalent). SSE4.2 will thus be enabled automatically if your
+CPU supports it. To print a warning if your CPU does not support SSE4.2, build with
+`USE_SSE=1 make static_lib` or, if using CMake, `cmake -DFORCE_SSE42=ON`. If you want
+to build a portable binary, add `PORTABLE=1` before your make commands, like this:
+`PORTABLE=1 make static_lib`.
+
+## Dependencies
+
+* You can link RocksDB with following compression libraries:
+  - [zlib](http://www.zlib.net/) - a library for data compression.
+  - [bzip2](http://www.bzip.org/) - a library for data compression.
+  - [lz4](https://github.com/lz4/lz4) - a library for extremely fast data compression.
+  - [snappy](http://google.github.io/snappy/) - a library for fast
+      data compression.
+  - [zstandard](http://www.zstd.net) - Fast real-time compression
+      algorithm.
+
+* All our tools depend on:
+  - [gflags](https://gflags.github.io/gflags/) - a library that handles
+      command line flags processing. You can compile rocksdb library even
+      if you don't have gflags installed.
+
+* `make check` will also check code formatting, which requires [clang-format](https://clang.llvm.org/docs/ClangFormat.html)
+
+* If you wish to build the RocksJava static target, then cmake is required for building Snappy.
+
+* If you wish to run microbench (e.g, `make microbench`, `make ribbon_bench` or `cmake -DWITH_BENCHMARK=1`), Google benchmark >= 1.6.0 is needed.
+
+## Supported platforms
+
+* **Linux - Ubuntu**
+    * Upgrade your gcc to version at least 7 to get C++17 support.
+    * Install gflags. First, try: `sudo apt-get install libgflags-dev`
+      If this doesn't work and you're using Ubuntu, here's a nice tutorial:
+      (http://askubuntu.com/questions/312173/installing-gflags-12-04)
+    * Install snappy. This is usually as easy as:
+      `sudo apt-get install libsnappy-dev`.
+    * Install zlib. Try: `sudo apt-get install zlib1g-dev`.
+    * Install bzip2: `sudo apt-get install libbz2-dev`.
+    * Install lz4: `sudo apt-get install liblz4-dev`.
+    * Install zstandard: `sudo apt-get install libzstd-dev`.
+
+* **Linux - CentOS / RHEL**
+    * Upgrade your gcc to version at least 7 to get C++17 support
+    * Install gflags:
+
+              git clone https://github.com/gflags/gflags.git
+              cd gflags
+              git checkout v2.0
+              ./configure && make && sudo make install
+
+      **Notice**: Once installed, please add the include path for gflags to your `CPATH` environment variable and the
+      lib path to `LIBRARY_PATH`. If installed with default settings, the include path will be `/usr/local/include`
+      and the lib path will be `/usr/local/lib`.
+
+    * Install snappy:
+
+              sudo yum install snappy snappy-devel
+
+    * Install zlib:
+
+              sudo yum install zlib zlib-devel
+
+    * Install bzip2:
+
+              sudo yum install bzip2 bzip2-devel
+
+    * Install lz4:
+
+              sudo yum install lz4-devel
+
+    * Install ASAN (optional for debugging):
+
+              sudo yum install libasan
+
+    * Install zstandard:
+        * With [EPEL](https://fedoraproject.org/wiki/EPEL):
+
+              sudo yum install libzstd-devel
+
+        * With CentOS 8:
+
+              sudo dnf install libzstd-devel
+
+        * From source:
+
+              wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz
+              mv v1.1.3.tar.gz zstd-1.1.3.tar.gz
+              tar zxvf zstd-1.1.3.tar.gz
+              cd zstd-1.1.3
+              make && sudo make install
+
+* **OS X**:
+    * Install latest C++ compiler that supports C++ 17:
+        * Update XCode:  run `xcode-select --install` (or install it from XCode App's settting).
+        * Install via [homebrew](http://brew.sh/).
+            * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
+            * run `brew tap homebrew/versions; brew install gcc7 --use-llvm` to install gcc 7 (or higher).
+    * run `brew install rocksdb`
+
+* **FreeBSD** (11.01):
+
+    * You can either install RocksDB from the Ports system using `cd /usr/ports/databases/rocksdb && make install`, or you can follow the details below to install dependencies and compile from source code:
+
+    * Install the dependencies for RocksDB:
+
+        export BATCH=YES
+        cd /usr/ports/devel/gmake && make install
+        cd /usr/ports/devel/gflags && make install
+
+        cd /usr/ports/archivers/snappy && make install
+        cd /usr/ports/archivers/bzip2 && make install
+        cd /usr/ports/archivers/liblz4 && make install
+        cd /usr/ports/archivesrs/zstd && make install
+
+        cd /usr/ports/devel/git && make install
+
+
+    * Install the dependencies for RocksJava (optional):
+
+        export BATCH=yes
+        cd /usr/ports/java/openjdk7 && make install
+
+    * Build RocksDB from source:
+        cd ~
+        git clone https://github.com/facebook/rocksdb.git
+        cd rocksdb
+        gmake static_lib
+
+    * Build RocksJava from source (optional):
+        cd rocksdb
+        export JAVA_HOME=/usr/local/openjdk7
+        gmake rocksdbjava
+
+* **OpenBSD** (6.3/-current):
+
+    * As RocksDB is not available in the ports yet you have to build it on your own:
+
+    * Install the dependencies for RocksDB:
+
+        pkg_add gmake gflags snappy bzip2 lz4 zstd git jdk bash findutils gnuwatch
+
+    * Build RocksDB from source:
+
+        cd ~
+        git clone https://github.com/facebook/rocksdb.git
+        cd rocksdb
+        gmake static_lib
+
+    * Build RocksJava from source (optional):
+
+        cd rocksdb
+        export JAVA_HOME=/usr/local/jdk-1.8.0
+        export PATH=$PATH:/usr/local/jdk-1.8.0/bin
+        gmake rocksdbjava
+
+* **iOS**:
+  * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`.
+
+* **Windows** (Visual Studio 2017 to up):
+  * Read and follow the instructions at CMakeLists.txt
+  * Or install via [vcpkg](https://github.com/microsoft/vcpkg)
+       * run `vcpkg install rocksdb:x64-windows`
+
+* **AIX 6.1**
+    * Install AIX Toolbox rpms with gcc
+    * Use these environment variables:
+
+             export PORTABLE=1
+             export CC=gcc
+             export AR="ar -X64"
+             export EXTRA_ARFLAGS=-X64
+             export EXTRA_CFLAGS=-maix64
+             export EXTRA_CXXFLAGS=-maix64
+             export PLATFORM_LDFLAGS="-static-libstdc++ -static-libgcc"
+             export LIBPATH=/opt/freeware/lib
+             export JAVA_HOME=/usr/java8_64
+             export PATH=/opt/freeware/bin:$PATH
+
+* **Solaris Sparc**
+    * Install GCC 7 and higher.
+    * Use these environment variables:
+
+             export CC=gcc
+             export EXTRA_CFLAGS=-m64
+             export EXTRA_CXXFLAGS=-m64
+             export EXTRA_LDFLAGS=-m64
+             export PORTABLE=1
+             export PLATFORM_LDFLAGS="-static-libstdc++ -static-libgcc"
diff --git a/src/rocksdb/LANGUAGE-BINDINGS.md b/src/rocksdb/LANGUAGE-BINDINGS.md
new file mode 100644
index 000000000..f45680e61
--- /dev/null
+++ b/src/rocksdb/LANGUAGE-BINDINGS.md
@@ -0,0 +1,26 @@
+This is the list of all known third-party language bindings for RocksDB. If something is missing, please open a pull request to add it.
+
+* Java - https://github.com/facebook/rocksdb/tree/main/java
+* Python
+    * http://python-rocksdb.readthedocs.io/en/latest/
+    * http://pyrocksdb.readthedocs.org/en/latest/ (unmaintained)
+* Perl - https://metacpan.org/pod/RocksDB
+* Node.js - https://npmjs.org/package/rocksdb
+* Go 
+  * https://github.com/linxGnu/grocksdb
+  * https://github.com/tecbot/gorocksdb (unmaintained)
+* Ruby - http://rubygems.org/gems/rocksdb-ruby
+* Haskell - https://hackage.haskell.org/package/rocksdb-haskell
+* PHP - https://github.com/Photonios/rocksdb-php
+* C#
+    * https://github.com/warrenfalk/rocksdb-sharp
+    * https://github.com/curiosity-ai/rocksdb-sharp
+* Rust
+    * https://github.com/pingcap/rust-rocksdb (used in production fork of https://github.com/spacejam/rust-rocksdb)
+    * https://github.com/spacejam/rust-rocksdb
+    * https://github.com/bh1xuw/rust-rocks
+* D programming language - https://github.com/b1naryth1ef/rocksdb
+* Erlang - https://gitlab.com/barrel-db/erlang-rocksdb
+* Elixir - https://github.com/urbint/rox
+* Nim - https://github.com/status-im/nim-rocksdb
+* Swift and Objective-C (iOS/OSX) - https://github.com/iabudiab/ObjectiveRocks 
diff --git a/src/rocksdb/LICENSE.Apache b/src/rocksdb/LICENSE.Apache
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/src/rocksdb/LICENSE.Apache
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/src/rocksdb/LICENSE.leveldb b/src/rocksdb/LICENSE.leveldb
new file mode 100644
index 000000000..7108b0bfb
--- /dev/null
+++ b/src/rocksdb/LICENSE.leveldb
@@ -0,0 +1,29 @@
+This contains code that is from LevelDB, and that code is under the following license:
+
+Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/rocksdb/Makefile b/src/rocksdb/Makefile
new file mode 100644
index 000000000..06f2e32a2
--- /dev/null
+++ b/src/rocksdb/Makefile
@@ -0,0 +1,2596 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+# Inherit some settings from environment variables, if available
+
+#-----------------------------------------------
+
+BASH_EXISTS := $(shell which bash)
+SHELL := $(shell which bash)
+include common.mk
+
+CLEAN_FILES = # deliberately empty, so we can append below.
+CFLAGS += ${EXTRA_CFLAGS}
+CXXFLAGS += ${EXTRA_CXXFLAGS}
+LDFLAGS += $(EXTRA_LDFLAGS)
+MACHINE ?= $(shell uname -m)
+ARFLAGS = ${EXTRA_ARFLAGS} rs
+STRIPFLAGS = -S -x
+
+# Transform parallel LOG output into something more readable.
+perl_command = perl -n \
+  -e '@a=split("\t",$$_,-1); $$t=$$a[8];'				\
+  -e '$$t =~ /.*if\s\[\[\s"(.*?\.[\w\/]+)/ and $$t=$$1;'		\
+  -e '$$t =~ s,^\./,,;'							\
+  -e '$$t =~ s, >.*,,; chomp $$t;'					\
+  -e '$$t =~ /.*--gtest_filter=(.*?\.[\w\/]+)/ and $$t=$$1;'		\
+  -e 'printf "%7.3f %s %s\n", $$a[3], $$a[6] == 0 ? "PASS" : "FAIL", $$t'
+quoted_perl_command = $(subst ','\'',$(perl_command))
+
+# DEBUG_LEVEL can have three values:
+# * DEBUG_LEVEL=2; this is the ultimate debug mode. It will compile rocksdb
+# without any optimizations. To compile with level 2, issue `make dbg`
+# * DEBUG_LEVEL=1; debug level 1 enables all assertions and debug code, but
+# compiles rocksdb with -O2 optimizations. this is the default debug level.
+# `make all` or `make <binary_target>` compile RocksDB with debug level 1.
+# We use this debug level when developing RocksDB.
+# * DEBUG_LEVEL=0; this is the debug level we use for release. If you're
+# running rocksdb in production you most definitely want to compile RocksDB
+# with debug level 0. To compile with level 0, run `make shared_lib`,
+# `make install-shared`, `make static_lib`, `make install-static` or
+# `make install`
+
+# Set the default DEBUG_LEVEL to 1
+DEBUG_LEVEL?=1
+
+# LIB_MODE says whether or not to use/build "shared" or "static" libraries.
+# Mode "static" means to link against static libraries (.a)
+# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc)
+#
+# Set the default LIB_MODE to static
+LIB_MODE?=static
+
+# OBJ_DIR is where the object files reside.  Default to the current directory
+OBJ_DIR?=.
+
+# Check the MAKECMDGOALS to set the DEBUG_LEVEL and LIB_MODE appropriately
+
+ifneq ($(filter clean release install, $(MAKECMDGOALS)),)
+	DEBUG_LEVEL=0
+endif
+ifneq ($(filter dbg, $(MAKECMDGOALS)),)
+	DEBUG_LEVEL=2
+else ifneq ($(filter shared_lib install-shared, $(MAKECMDGOALS)),)
+	DEBUG_LEVEL=0
+	LIB_MODE=shared
+else ifneq ($(filter static_lib install-static, $(MAKECMDGOALS)),)
+	DEBUG_LEVEL=0
+	LIB_MODE=static
+else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),)
+	OBJ_DIR=jl
+	LIB_MODE=shared
+	ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),)
+		OBJ_DIR=jls
+		ifneq ($(DEBUG_LEVEL),2)
+			DEBUG_LEVEL=0
+		endif
+		ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish)
+			DEBUG_LEVEL=0
+		endif
+	endif
+endif
+
+$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL})
+
+# Lite build flag.
+LITE ?= 0
+ifeq ($(LITE), 0)
+ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
+  # Be backward compatible and support older format where OPT=-DROCKSDB_LITE is
+  # specified instead of LITE=1 on the command line.
+  LITE=1
+endif
+else ifeq ($(LITE), 1)
+ifeq ($(filter -DROCKSDB_LITE,$(OPT)),)
+	OPT += -DROCKSDB_LITE
+endif
+endif
+
+# Figure out optimize level.
+ifneq ($(DEBUG_LEVEL), 2)
+ifeq ($(LITE), 0)
+	OPTIMIZE_LEVEL ?= -O2
+else
+	OPTIMIZE_LEVEL ?= -Os
+endif
+endif
+# `OPTIMIZE_LEVEL` is empty when the user does not set it and `DEBUG_LEVEL=2`.
+# In that case, the compiler default (`-O0` for gcc and clang) will be used.
+OPT += $(OPTIMIZE_LEVEL)
+
+# compile with -O2 if debug level is not 2
+ifneq ($(DEBUG_LEVEL), 2)
+OPT += -fno-omit-frame-pointer
+# Skip for archs that don't support -momit-leaf-frame-pointer
+ifeq (,$(shell $(CXX) -fsyntax-only -momit-leaf-frame-pointer -xc /dev/null 2>&1))
+OPT += -momit-leaf-frame-pointer
+endif
+endif
+
+ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1))
+CXXFLAGS += -DHAS_ALTIVEC
+CFLAGS += -DHAS_ALTIVEC
+HAS_ALTIVEC=1
+endif
+
+ifeq (,$(shell $(CXX) -fsyntax-only -mcpu=power8 -xc /dev/null 2>&1))
+CXXFLAGS += -DHAVE_POWER8
+CFLAGS +=  -DHAVE_POWER8
+HAVE_POWER8=1
+endif
+
+# if we're compiling for shared libraries, add the shared flags
+ifeq ($(LIB_MODE),shared)
+CXXFLAGS += $(PLATFORM_SHARED_CFLAGS) -DROCKSDB_DLL
+CFLAGS +=  $(PLATFORM_SHARED_CFLAGS) -DROCKSDB_DLL
+endif
+
+GIT_COMMAND ?= git
+ifeq ($(USE_COROUTINES), 1)
+	USE_FOLLY = 1
+	# glog/logging.h requires HAVE_CXX11_ATOMIC
+	OPT += -DUSE_COROUTINES -DHAVE_CXX11_ATOMIC
+	ROCKSDB_CXX_STANDARD = c++2a
+	USE_RTTI = 1
+ifneq ($(USE_CLANG), 1)
+	ROCKSDB_CXX_STANDARD = c++20
+	PLATFORM_CXXFLAGS += -fcoroutines
+endif
+endif
+
+# if we're compiling for release, compile without debug code (-DNDEBUG)
+ifeq ($(DEBUG_LEVEL),0)
+OPT += -DNDEBUG
+
+ifneq ($(USE_RTTI), 1)
+	CXXFLAGS += -fno-rtti
+else
+	CXXFLAGS += -DROCKSDB_USE_RTTI
+endif
+else
+ifneq ($(USE_RTTI), 0)
+	CXXFLAGS += -DROCKSDB_USE_RTTI
+else
+	CXXFLAGS += -fno-rtti
+endif
+
+ifdef ASSERT_STATUS_CHECKED
+# For ASC, turn off constructor elision, preventing the case where a constructor returned
+# by a method may pass the ASC check if the status is checked in the inner method.  Forcing
+# the copy constructor to be invoked disables the optimization and will cause the calling method
+# to check the status in order to prevent an error from being raised.
+PLATFORM_CXXFLAGS += -fno-elide-constructors
+ifeq ($(filter -DROCKSDB_ASSERT_STATUS_CHECKED,$(OPT)),)
+	OPT += -DROCKSDB_ASSERT_STATUS_CHECKED
+endif
+endif
+
+$(warning Warning: Compiling in debug mode. Don't use the resulting binary in production)
+endif
+
+# `USE_LTO=1` enables link-time optimizations. Among other things, this enables
+# more devirtualization opportunities and inlining across translation units.
+# This can save significant overhead introduced by RocksDB's pluggable
+# interfaces/internal abstractions, like in the iterator hierarchy. It works
+# better when combined with profile-guided optimizations (not currently
+# supported natively in Makefile).
+ifeq ($(USE_LTO), 1)
+	CXXFLAGS += -flto
+	LDFLAGS += -flto -fuse-linker-plugin
+endif
+
+# `COERCE_CONTEXT_SWITCH=1` will inject spurious wakeup and
+# random length of sleep or context switch at critical
+# points (e.g, before acquring db mutex) in RocksDB.
+# In this way, it coerces as many excution orders as possible in the hope of
+# exposing the problematic excution order
+COERCE_CONTEXT_SWITCH ?= 0
+ifeq ($(COERCE_CONTEXT_SWITCH), 1)
+OPT += -DCOERCE_CONTEXT_SWITCH
+endif
+
+#-----------------------------------------------
+include src.mk
+
+AM_DEFAULT_VERBOSITY ?= 0
+
+AM_V_GEN = $(am__v_GEN_$(V))
+am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY))
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 =
+AM_V_at = $(am__v_at_$(V))
+am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY))
+am__v_at_0 = @
+am__v_at_1 =
+
+AM_V_CC = $(am__v_CC_$(V))
+am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY))
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 =
+
+AM_V_CCLD = $(am__v_CCLD_$(V))
+am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY))
+ifneq ($(SKIP_LINK), 1)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 =
+else
+am__v_CCLD_0 = @echo "  !CCLD   " $@; true skip
+am__v_CCLD_1 = true skip
+endif
+AM_V_AR = $(am__v_AR_$(V))
+am__v_AR_ = $(am__v_AR_$(AM_DEFAULT_VERBOSITY))
+am__v_AR_0 = @echo "  AR      " $@;
+am__v_AR_1 =
+
+AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(EXEC_LDFLAGS) $(LDFLAGS) -o $@
+
+# Detect what platform we're building on.
+# Export some common variables that might have been passed as Make variables
+# instead of environment variables.
+dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
+                  export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \
+                  export LDFLAGS="$(EXTRA_LDFLAGS)"; \
+                  export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \
+                  export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \
+                  export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \
+                  export PORTABLE="$(PORTABLE)"; \
+                  export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \
+                  export USE_CLANG="$(USE_CLANG)"; \
+                  export LIB_MODE="$(LIB_MODE)"; \
+		  export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
+		  export USE_FOLLY="$(USE_FOLLY)"; \
+                  "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
+# this file is generated by the previous line to set build flags and sources
+include make_config.mk
+
+ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk)
+include $(ROCKSDB_PLUGIN_MKS)
+ROCKSDB_PLUGIN_PROTO =ROCKSDB_NAMESPACE::ObjectLibrary\&, const std::string\&
+ROCKSDB_PLUGIN_SOURCES = $(foreach p, $(ROCKSDB_PLUGINS), $(foreach source, $($(p)_SOURCES), plugin/$(p)/$(source)))
+ROCKSDB_PLUGIN_HEADERS = $(foreach p, $(ROCKSDB_PLUGINS), $(foreach header, $($(p)_HEADERS), plugin/$(p)/$(header)))
+ROCKSDB_PLUGIN_LIBS = $(foreach p, $(ROCKSDB_PLUGINS), $(foreach lib, $($(p)_LIBS), -l$(lib)))
+ROCKSDB_PLUGIN_W_FUNCS = $(foreach p, $(ROCKSDB_PLUGINS), $(if $($(p)_FUNC), $(p)))
+ROCKSDB_PLUGIN_EXTERNS = $(foreach p, $(ROCKSDB_PLUGIN_W_FUNCS), int $($(p)_FUNC)($(ROCKSDB_PLUGIN_PROTO));)
+ROCKSDB_PLUGIN_BUILTINS = $(foreach p, $(ROCKSDB_PLUGIN_W_FUNCS), {\"$(p)\"\, $($(p)_FUNC)}\,)
+ROCKSDB_PLUGIN_LDFLAGS = $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_LDFLAGS))
+ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES = $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_PKGCONFIG_REQUIRES))
+
+CXXFLAGS += $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_CXXFLAGS))
+PLATFORM_LDFLAGS += $(ROCKSDB_PLUGIN_LDFLAGS)
+
+# Patch up the link flags for JNI from the plugins
+JAVA_LDFLAGS += $(ROCKSDB_PLUGIN_LDFLAGS)
+JAVA_STATIC_LDFLAGS += $(ROCKSDB_PLUGIN_LDFLAGS)
+
+# Patch up the list of java native sources with files from the plugins
+ROCKSDB_PLUGIN_JNI_NATIVE_SOURCES = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach source, $($(plugin)_JNI_NATIVE_SOURCES), plugin/$(plugin)/$(source)))
+ALL_JNI_NATIVE_SOURCES = $(JNI_NATIVE_SOURCES) $(ROCKSDB_PLUGIN_JNI_NATIVE_SOURCES)
+ROCKSDB_PLUGIN_JNI_CXX_INCLUDEFLAGS = $(foreach plugin, $(ROCKSDB_PLUGINS), -I./plugin/$(plugin))
+
+ifneq ($(strip $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES)),)
+LDFLAGS := $(LDFLAGS) $(shell pkg-config --libs $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES))
+ifneq ($(.SHELLSTATUS),0)
+$(error pkg-config failed)
+endif
+CXXFLAGS := $(CXXFLAGS) $(shell pkg-config --cflags $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES))
+ifneq ($(.SHELLSTATUS),0)
+$(error pkg-config failed)
+endif
+endif
+
+CXXFLAGS += $(ARCHFLAG)
+
+ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc+crypto -xc /dev/null 2>&1))
+ifneq ($(PLATFORM),OS_MACOSX)
+CXXFLAGS += -march=armv8-a+crc+crypto
+CFLAGS += -march=armv8-a+crc+crypto
+ARMCRC_SOURCE=1
+endif
+endif
+
+export JAVAC_ARGS
+CLEAN_FILES += make_config.mk rocksdb.pc
+
+ifeq ($(V), 1)
+$(info $(shell uname -a))
+$(info $(shell $(CC) --version))
+$(info $(shell $(CXX) --version))
+endif
+
+missing_make_config_paths := $(shell				\
+	grep "\./\S*\|/\S*" -o $(CURDIR)/make_config.mk | 	\
+	while read path;					\
+		do [ -e $$path ] || echo $$path; 		\
+	done | sort | uniq | grep -v "/DOES/NOT/EXIST")
+
+$(foreach path, $(missing_make_config_paths), \
+	$(warning Warning: $(path) does not exist))
+
+ifeq ($(PLATFORM), OS_AIX)
+# no debug info
+else ifneq ($(PLATFORM), IOS)
+CFLAGS += -g
+CXXFLAGS += -g
+else
+# no debug info for IOS, that will make our library big
+OPT += -DNDEBUG
+endif
+
+ifeq ($(PLATFORM), OS_AIX)
+ARFLAGS = -X64 rs
+STRIPFLAGS = -X64 -x
+endif
+
+ifeq ($(PLATFORM), OS_SOLARIS)
+	PLATFORM_CXXFLAGS += -D _GLIBCXX_USE_C99
+endif
+ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
+	# found
+	CFLAGS += -fno-exceptions
+	CXXFLAGS += -fno-exceptions
+	# LUA is not supported under ROCKSDB_LITE
+	LUA_PATH =
+endif
+
+ifeq ($(LIB_MODE),shared)
+# So that binaries are executable from build location, in addition to install location
+EXEC_LDFLAGS += -Wl,-rpath -Wl,'$$ORIGIN'
+endif
+
+ifeq ($(PLATFORM), OS_MACOSX)
+ifeq ($(ARCHFLAG), -arch arm64)
+ifneq ($(MACHINE), arm64)
+# If we're building on a non-arm64 machine but targeting arm64 Mac, we need to disable
+# linking with jemalloc (as it won't be arm64-compatible) and remove some other options
+# set during platform detection
+DISABLE_JEMALLOC=1
+PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS))
+PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS))
+endif
+endif
+endif
+
+# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
+ifdef COMPILE_WITH_ASAN
+	DISABLE_JEMALLOC=1
+	ASAN_OPTIONS?=detect_stack_use_after_return=1
+	export ASAN_OPTIONS
+	EXEC_LDFLAGS += -fsanitize=address
+	PLATFORM_CCFLAGS += -fsanitize=address
+	PLATFORM_CXXFLAGS += -fsanitize=address
+ifeq ($(LIB_MODE),shared)
+ifdef USE_CLANG
+# Fix false ODR violation; see https://github.com/google/sanitizers/issues/1017
+	EXEC_LDFLAGS += -mllvm -asan-use-private-alias=1
+	PLATFORM_CXXFLAGS += -mllvm -asan-use-private-alias=1
+endif
+endif
+endif
+
+# TSAN doesn't work well with jemalloc. If we're compiling with TSAN, we should use regular malloc.
+ifdef COMPILE_WITH_TSAN
+	DISABLE_JEMALLOC=1
+	EXEC_LDFLAGS += -fsanitize=thread
+	PLATFORM_CCFLAGS += -fsanitize=thread -fPIC -DFOLLY_SANITIZE_THREAD
+	PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC -DFOLLY_SANITIZE_THREAD
+        # Turn off -pg when enabling TSAN testing, because that induces
+        # a link failure.  TODO: find the root cause
+	PROFILING_FLAGS =
+	# LUA is not supported under TSAN
+	LUA_PATH =
+	# Limit keys for crash test under TSAN to avoid error:
+	# "ThreadSanitizer: DenseSlabAllocator overflow. Dying."
+	CRASH_TEST_EXT_ARGS += --max_key=1000000
+endif
+
+# AIX doesn't work with -pg
+ifeq ($(PLATFORM), OS_AIX)
+	PROFILING_FLAGS =
+endif
+
+# USAN doesn't work well with jemalloc. If we're compiling with USAN, we should use regular malloc.
+ifdef COMPILE_WITH_UBSAN
+	DISABLE_JEMALLOC=1
+	# Suppress alignment warning because murmurhash relies on casting unaligned
+	# memory to integer. Fixing it may cause performance regression. 3-way crc32
+	# relies on it too, although it can be rewritten to eliminate with minimal
+	# performance regression.
+	EXEC_LDFLAGS += -fsanitize=undefined -fno-sanitize-recover=all
+	PLATFORM_CCFLAGS += -fsanitize=undefined -fno-sanitize-recover=all -DROCKSDB_UBSAN_RUN
+	PLATFORM_CXXFLAGS += -fsanitize=undefined -fno-sanitize-recover=all -DROCKSDB_UBSAN_RUN
+endif
+
+ifdef ROCKSDB_VALGRIND_RUN
+	PLATFORM_CCFLAGS += -DROCKSDB_VALGRIND_RUN
+	PLATFORM_CXXFLAGS += -DROCKSDB_VALGRIND_RUN
+endif
+ifdef ROCKSDB_FULL_VALGRIND_RUN
+	# Some tests are slow when run under valgrind and are only run when
+	# explicitly requested via the ROCKSDB_FULL_VALGRIND_RUN compiler flag.
+	PLATFORM_CCFLAGS += -DROCKSDB_VALGRIND_RUN -DROCKSDB_FULL_VALGRIND_RUN
+	PLATFORM_CXXFLAGS += -DROCKSDB_VALGRIND_RUN -DROCKSDB_FULL_VALGRIND_RUN
+endif
+
+ifndef DISABLE_JEMALLOC
+	ifdef JEMALLOC
+		PLATFORM_CXXFLAGS += -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE
+		PLATFORM_CCFLAGS  += -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE
+		ifeq ($(USE_FOLLY),1)
+			PLATFORM_CXXFLAGS += -DUSE_JEMALLOC
+			PLATFORM_CCFLAGS  += -DUSE_JEMALLOC
+		endif
+		ifeq ($(USE_FOLLY_LITE),1)
+			PLATFORM_CXXFLAGS += -DUSE_JEMALLOC
+			PLATFORM_CCFLAGS  += -DUSE_JEMALLOC
+		endif
+	endif
+	ifdef WITH_JEMALLOC_FLAG
+		PLATFORM_LDFLAGS += -ljemalloc
+		JAVA_LDFLAGS += -ljemalloc
+	endif
+	EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS)
+	PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE)
+	PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE)
+endif
+
+ifndef USE_FOLLY
+	USE_FOLLY=0
+endif
+
+ifndef GTEST_THROW_ON_FAILURE
+	export GTEST_THROW_ON_FAILURE=1
+endif
+ifndef GTEST_HAS_EXCEPTIONS
+	export GTEST_HAS_EXCEPTIONS=1
+endif
+
+GTEST_DIR = third-party/gtest-1.8.1/fused-src
+# AIX: pre-defined system headers are surrounded by an extern "C" block
+ifeq ($(PLATFORM), OS_AIX)
+	PLATFORM_CCFLAGS += -I$(GTEST_DIR)
+	PLATFORM_CXXFLAGS += -I$(GTEST_DIR)
+else
+	PLATFORM_CCFLAGS += -isystem $(GTEST_DIR)
+	PLATFORM_CXXFLAGS += -isystem $(GTEST_DIR)
+endif
+
+# This provides a Makefile simulation of a Meta-internal folly integration.
+# It is not validated for general use.
+#
+# USE_FOLLY links the build targets with libfolly.a. The latter could be
+# built using 'make build_folly', or built externally and specified in
+# the CXXFLAGS and EXTRA_LDFLAGS env variables. The build_detect_platform
+# script tries to detect if an external folly dependency has been specified.
+# If not, it exports FOLLY_PATH to the path of the installed Folly and
+# dependency libraries.
+#
+# USE_FOLLY_LITE cherry picks source files from Folly to include in the
+# RocksDB library. Its faster and has fewer dependencies on 3rd party
+# libraries, but with limited functionality. For example, coroutine
+# functionality is not available.
+ifeq ($(USE_FOLLY),1)
+ifeq ($(USE_FOLLY_LITE),1)
+$(error Please specify only one of USE_FOLLY and USE_FOLLY_LITE)
+endif
+ifneq ($(strip $(FOLLY_PATH)),)
+	BOOST_PATH = $(shell (ls -d $(FOLLY_PATH)/../boost*))
+	DBL_CONV_PATH = $(shell (ls -d $(FOLLY_PATH)/../double-conversion*))
+	GFLAGS_PATH = $(shell (ls -d $(FOLLY_PATH)/../gflags*))
+	GLOG_PATH = $(shell (ls -d $(FOLLY_PATH)/../glog*))
+	LIBEVENT_PATH = $(shell (ls -d $(FOLLY_PATH)/../libevent*))
+	XZ_PATH = $(shell (ls -d $(FOLLY_PATH)/../xz*))
+	LIBSODIUM_PATH = $(shell (ls -d $(FOLLY_PATH)/../libsodium*))
+	FMT_PATH = $(shell (ls -d $(FOLLY_PATH)/../fmt*))
+
+	# For some reason, glog and fmt libraries are under either lib or lib64
+	GLOG_LIB_PATH = $(shell (ls -d $(GLOG_PATH)/lib*))
+	FMT_LIB_PATH = $(shell (ls -d $(FMT_PATH)/lib*))
+
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
+		PLATFORM_CXXFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
+	else
+		PLATFORM_CCFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
+		PLATFORM_CXXFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
+	endif
+
+	# Add -ldl at the end as gcc resolves a symbol in a library by searching only in libraries specified later
+	# in the command line
+	PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2 $(LIBEVENT_PATH)/lib/libevent-2.1.so -ldl
+	PLATFORM_LDFLAGS += -Wl,-rpath=$(GFLAGS_PATH)/lib -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(LIBEVENT_PATH)/lib -Wl,-rpath=$(LIBSODIUM_PATH)/lib -Wl,-rpath=$(LIBEVENT_PATH)/lib
+endif
+	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+endif
+
+ifeq ($(USE_FOLLY_LITE),1)
+	# Path to the Folly source code and include files
+	FOLLY_DIR = ./third-party/folly
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(FOLLY_DIR)
+		PLATFORM_CXXFLAGS += -I$(FOLLY_DIR)
+	else
+		PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR)
+		PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR)
+	endif
+	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+# TODO: fix linking with fbcode compiler config
+	PLATFORM_LDFLAGS += -lglog
+endif
+
+ifdef TEST_CACHE_LINE_SIZE
+  PLATFORM_CCFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE)
+  PLATFORM_CXXFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE)
+endif
+ifdef TEST_UINT128_COMPAT
+  PLATFORM_CCFLAGS += -DTEST_UINT128_COMPAT=1
+  PLATFORM_CXXFLAGS += -DTEST_UINT128_COMPAT=1
+endif
+ifdef ROCKSDB_MODIFY_NPHASH
+  PLATFORM_CCFLAGS += -DROCKSDB_MODIFY_NPHASH=1
+  PLATFORM_CXXFLAGS += -DROCKSDB_MODIFY_NPHASH=1
+endif
+
+# This (the first rule) must depend on "all".
+default: all
+
+WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare -Wshadow \
+  -Wunused-parameter
+
+ifeq (,$(filter amd64, $(MACHINE)))
+	C_WARNING_FLAGS = -Wstrict-prototypes
+endif
+
+ifdef USE_CLANG
+	# Used by some teams in Facebook
+	WARNING_FLAGS += -Wshift-sign-overflow
+endif
+
+ifeq ($(PLATFORM), OS_OPENBSD)
+	WARNING_FLAGS += -Wno-unused-lambda-capture
+endif
+
+ifndef DISABLE_WARNING_AS_ERROR
+	WARNING_FLAGS += -Werror
+endif
+
+
+ifdef LUA_PATH
+
+ifndef LUA_INCLUDE
+LUA_INCLUDE=$(LUA_PATH)/include
+endif
+
+LUA_INCLUDE_FILE=$(LUA_INCLUDE)/lualib.h
+
+ifeq ("$(wildcard $(LUA_INCLUDE_FILE))", "")
+# LUA_INCLUDE_FILE does not exist
+$(error Cannot find lualib.h under $(LUA_INCLUDE).  Try to specify both LUA_PATH and LUA_INCLUDE manually)
+endif
+LUA_FLAGS = -I$(LUA_INCLUDE) -DLUA -DLUA_COMPAT_ALL
+CFLAGS += $(LUA_FLAGS)
+CXXFLAGS += $(LUA_FLAGS)
+
+ifndef LUA_LIB
+LUA_LIB = $(LUA_PATH)/lib/liblua.a
+endif
+ifeq ("$(wildcard $(LUA_LIB))", "") # LUA_LIB does not exist
+$(error $(LUA_LIB) does not exist.  Try to specify both LUA_PATH and LUA_LIB manually)
+endif
+EXEC_LDFLAGS += $(LUA_LIB)
+
+endif
+
+ifeq ($(NO_THREEWAY_CRC32C), 1)
+	CXXFLAGS += -DNO_THREEWAY_CRC32C
+endif
+
+CFLAGS += $(C_WARNING_FLAGS) $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
+CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers
+
+# Allow offsetof to work on non-standard layout types. Some compiler could
+# completely reject our usage of offsetof, but we will solve that when it
+# happens.
+CXXFLAGS += -Wno-invalid-offsetof
+
+LDFLAGS += $(PLATFORM_LDFLAGS)
+
+LIB_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(LIB_SOURCES))
+LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ROCKSDB_PLUGIN_SOURCES))
+ifeq ($(HAVE_POWER8),1)
+LIB_OBJECTS += $(patsubst %.c, $(OBJ_DIR)/%.o, $(LIB_SOURCES_C))
+LIB_OBJECTS += $(patsubst %.S, $(OBJ_DIR)/%.o, $(LIB_SOURCES_ASM))
+endif
+
+ifeq ($(USE_FOLLY_LITE),1)
+  LIB_OBJECTS += $(patsubst %.cpp, $(OBJ_DIR)/%.o, $(FOLLY_SOURCES))
+endif
+
+# range_tree is not compatible with non GNU libc on ppc64
+# see https://jira.percona.com/browse/PS-7559
+ifneq ($(PPC_LIBC_IS_GNU),0)
+  LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(RANGE_TREE_SOURCES))
+endif
+
+GTEST = $(OBJ_DIR)/$(GTEST_DIR)/gtest/gtest-all.o
+TESTUTIL = $(OBJ_DIR)/test_util/testutil.o
+TESTHARNESS = $(OBJ_DIR)/test_util/testharness.o $(TESTUTIL) $(GTEST)
+VALGRIND_ERROR = 2
+VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
+
+VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
+# Not yet supported: --show-leak-kinds=definite,possible,reachable --errors-for-leak-kinds=definite,possible,reachable
+
+TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST)
+BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES))
+CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES))
+TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES))
+ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES))
+STRESS_OBJECTS =  $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES))
+
+# Exclude build_version.cc -- a generated source file -- from all sources.  Not needed for dependencies
+ALL_SOURCES  = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc
+ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES)
+ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES)
+ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES)
+
+TESTS = $(patsubst %.cc, %, $(notdir $(TEST_MAIN_SOURCES)))
+TESTS += $(patsubst %.c, %, $(notdir $(TEST_MAIN_SOURCES_C)))
+
+# `make check-headers` to very that each header file includes its own
+# dependencies
+ifneq ($(filter check-headers, $(MAKECMDGOALS)),)
+# TODO: add/support JNI headers
+	DEV_HEADER_DIRS := $(sort include/ $(dir $(ALL_SOURCES)))
+# Some headers like in port/ are platform-specific
+	DEV_HEADERS := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | grep -E -v 'port/|plugin/|lua/|range_tree/')
+else
+	DEV_HEADERS :=
+endif
+HEADER_OK_FILES = $(patsubst %.h, %.h.ok, $(DEV_HEADERS))
+
+AM_V_CCH = $(am__v_CCH_$(V))
+am__v_CCH_ = $(am__v_CCH_$(AM_DEFAULT_VERBOSITY))
+am__v_CCH_0 = @echo "  CC.h    " $<;
+am__v_CCH_1 =
+
+%.h.ok: %.h # .h.ok not actually created, so re-checked on each invocation
+# -DROCKSDB_NAMESPACE=42 ensures the namespace header is included
+	$(AM_V_CCH) echo '#include "$<"' | $(CXX) $(CXXFLAGS) -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null
+
+check-headers: $(HEADER_OK_FILES)
+
+# options_settable_test doesn't pass with UBSAN as we use hack in the test
+ifdef ASSERT_STATUS_CHECKED
+# TODO: finish fixing all tests to pass this check
+TESTS_FAILING_ASC = \
+	c_test \
+	env_test \
+	range_locking_test \
+	testutil_test \
+
+# Since we have very few ASC exclusions left, excluding them from
+# the build is the most convenient way to exclude them from testing
+TESTS := $(filter-out $(TESTS_FAILING_ASC),$(TESTS))
+endif
+
+ROCKSDBTESTS_SUBSET ?= $(TESTS)
+
+# c_test - doesn't use gtest
+# env_test - suspicious use of test::TmpDir
+# deletefile_test - serial because it generates giant temporary files in
+#   its various tests. Parallel can fill up your /dev/shm
+# db_bloom_filter_test - serial because excessive space usage by instances
+#   of DBFilterConstructionReserveMemoryTestWithParam can fill up /dev/shm
+NON_PARALLEL_TEST = \
+	c_test \
+	env_test \
+	deletefile_test \
+	db_bloom_filter_test \
+
+PARALLEL_TEST = $(filter-out $(NON_PARALLEL_TEST), $(TESTS))
+
+# Not necessarily well thought out or up-to-date, but matches old list
+TESTS_PLATFORM_DEPENDENT := \
+	db_basic_test \
+	db_blob_basic_test \
+	db_encryption_test \
+	external_sst_file_basic_test \
+	auto_roll_logger_test \
+	bloom_test \
+	dynamic_bloom_test \
+	c_test \
+	checkpoint_test \
+	crc32c_test \
+	coding_test \
+	inlineskiplist_test \
+	env_basic_test \
+	env_test \
+	env_logger_test \
+	io_posix_test \
+	hash_test \
+	random_test \
+	ribbon_test \
+	thread_local_test \
+	work_queue_test \
+	rate_limiter_test \
+	perf_context_test \
+	iostats_context_test \
+
+# Sort ROCKSDBTESTS_SUBSET for filtering, except db_test is special (expensive)
+# so is placed first (out-of-order)
+ROCKSDBTESTS_SUBSET := $(filter db_test, $(ROCKSDBTESTS_SUBSET)) $(sort $(filter-out db_test, $(ROCKSDBTESTS_SUBSET)))
+
+ifdef ROCKSDBTESTS_START
+        ROCKSDBTESTS_SUBSET := $(shell echo $(ROCKSDBTESTS_SUBSET) | sed 's/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/')
+endif
+
+ifdef ROCKSDBTESTS_END
+        ROCKSDBTESTS_SUBSET := $(shell echo $(ROCKSDBTESTS_SUBSET) | sed 's/$(ROCKSDBTESTS_END).*//')
+endif
+
+ifeq ($(ROCKSDBTESTS_PLATFORM_DEPENDENT), only)
+        ROCKSDBTESTS_SUBSET := $(filter $(TESTS_PLATFORM_DEPENDENT), $(ROCKSDBTESTS_SUBSET))
+else ifeq ($(ROCKSDBTESTS_PLATFORM_DEPENDENT), exclude)
+        ROCKSDBTESTS_SUBSET := $(filter-out $(TESTS_PLATFORM_DEPENDENT), $(ROCKSDBTESTS_SUBSET))
+endif
+
+# bench_tool_analyer main is in bench_tool_analyzer_tool, or this would be simpler...
+TOOLS = $(patsubst %.cc, %, $(notdir $(patsubst %_tool.cc, %.cc, $(TOOLS_MAIN_SOURCES))))
+
+TEST_LIBS = \
+	librocksdb_env_basic_test.a
+
+# TODO: add back forward_iterator_bench, after making it build in all environemnts.
+BENCHMARKS = $(patsubst %.cc, %, $(notdir $(BENCH_MAIN_SOURCES)))
+
+MICROBENCHS = $(patsubst %.cc, %, $(notdir $(MICROBENCH_SOURCES)))
+
+# if user didn't config LIBNAME, set the default
+ifeq ($(LIBNAME),)
+  LIBNAME=librocksdb
+# we should only run rocksdb in production with DEBUG_LEVEL 0
+ifneq ($(DEBUG_LEVEL),0)
+  LIBDEBUG=_debug
+endif
+endif
+STATIC_LIBRARY = ${LIBNAME}$(LIBDEBUG).a
+STATIC_TEST_LIBRARY =  ${LIBNAME}_test$(LIBDEBUG).a
+STATIC_TOOLS_LIBRARY = ${LIBNAME}_tools$(LIBDEBUG).a
+STATIC_STRESS_LIBRARY = ${LIBNAME}_stress$(LIBDEBUG).a
+
+ALL_STATIC_LIBS = $(STATIC_LIBRARY) $(STATIC_TEST_LIBRARY) $(STATIC_TOOLS_LIBRARY) $(STATIC_STRESS_LIBRARY)
+
+SHARED_TEST_LIBRARY =  ${LIBNAME}_test$(LIBDEBUG).$(PLATFORM_SHARED_EXT)
+SHARED_TOOLS_LIBRARY = ${LIBNAME}_tools$(LIBDEBUG).$(PLATFORM_SHARED_EXT)
+SHARED_STRESS_LIBRARY = ${LIBNAME}_stress$(LIBDEBUG).$(PLATFORM_SHARED_EXT)
+
+ALL_SHARED_LIBS = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4) $(SHARED_TEST_LIBRARY) $(SHARED_TOOLS_LIBRARY) $(SHARED_STRESS_LIBRARY)
+
+ifeq ($(LIB_MODE),shared)
+LIBRARY=$(SHARED1)
+TEST_LIBRARY=$(SHARED_TEST_LIBRARY)
+TOOLS_LIBRARY=$(SHARED_TOOLS_LIBRARY)
+STRESS_LIBRARY=$(SHARED_STRESS_LIBRARY)
+CLOUD_LIBRARY=$(SHARED_CLOUD_LIBRARY)
+else
+LIBRARY=$(STATIC_LIBRARY)
+TEST_LIBRARY=$(STATIC_TEST_LIBRARY)
+TOOLS_LIBRARY=$(STATIC_TOOLS_LIBRARY)
+endif
+STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY)
+
+ROCKSDB_MAJOR = $(shell grep -E "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_MINOR = $(shell grep -E "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_PATCH = $(shell grep -E "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
+
+# If NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but
+# the file needs to already exist or else the build will fail
+ifndef NO_UPDATE_BUILD_VERSION
+
+# By default, use the current date-time as the date.  If there are no changes,
+# we will use the last commit date instead.
+build_date := $(shell date "+%Y-%m-%d %T")
+
+ifdef FORCE_GIT_SHA
+	git_sha := $(FORCE_GIT_SHA)
+	git_mod := 1
+	git_date := $(build_date)
+else
+	git_sha := $(shell git rev-parse HEAD 2>/dev/null)
+	git_tag  := $(shell git symbolic-ref -q --short HEAD 2> /dev/null || git describe --tags --exact-match 2>/dev/null)
+	git_mod  := $(shell git diff-index HEAD --quiet 2>/dev/null; echo $$?)
+	git_date := $(shell git log -1 --date=format:"%Y-%m-%d %T" --format="%ad" 2>/dev/null)
+endif
+gen_build_version = sed -e s/@GIT_SHA@/$(git_sha)/ -e s:@GIT_TAG@:"$(git_tag)": -e s/@GIT_MOD@/"$(git_mod)"/ -e s/@BUILD_DATE@/"$(build_date)"/ -e s/@GIT_DATE@/"$(git_date)"/ -e s/@ROCKSDB_PLUGIN_BUILTINS@/'$(ROCKSDB_PLUGIN_BUILTINS)'/ -e s/@ROCKSDB_PLUGIN_EXTERNS@/"$(ROCKSDB_PLUGIN_EXTERNS)"/ util/build_version.cc.in
+
+# Record the version of the source that we are compiling.
+# We keep a record of the git revision in this file.  It is then built
+# as a regular source file as part of the compilation process.
+# One can run "strings executable_filename | grep _build_" to find
+# the version of the source that we used to build the executable file.
+util/build_version.cc: $(filter-out $(OBJ_DIR)/util/build_version.o, $(LIB_OBJECTS)) util/build_version.cc.in
+	$(AM_V_GEN)rm -f $@-t
+	$(AM_V_at)$(gen_build_version) > $@
+endif
+CLEAN_FILES += util/build_version.cc
+
+default: all
+
+#-----------------------------------------------
+# Create platform independent shared libraries.
+#-----------------------------------------------
+ifneq ($(PLATFORM_SHARED_EXT),)
+
+ifneq ($(PLATFORM_SHARED_VERSIONED),true)
+SHARED1 = ${LIBNAME}$(LIBDEBUG).$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1)
+SHARED3 = $(SHARED1)
+SHARED4 = $(SHARED1)
+SHARED = $(SHARED1)
+else
+SHARED_MAJOR = $(ROCKSDB_MAJOR)
+SHARED_MINOR = $(ROCKSDB_MINOR)
+SHARED_PATCH = $(ROCKSDB_PATCH)
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+ifeq ($(PLATFORM), OS_MACOSX)
+SHARED_OSX = $(LIBNAME)$(LIBDEBUG).$(SHARED_MAJOR)
+SHARED2 = $(SHARED_OSX).$(PLATFORM_SHARED_EXT)
+SHARED3 = $(SHARED_OSX).$(SHARED_MINOR).$(PLATFORM_SHARED_EXT)
+SHARED4 = $(SHARED_OSX).$(SHARED_MINOR).$(SHARED_PATCH).$(PLATFORM_SHARED_EXT)
+else
+SHARED2 = $(SHARED1).$(SHARED_MAJOR)
+SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
+SHARED4 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR).$(SHARED_PATCH)
+endif # MACOSX
+SHARED = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4)
+$(SHARED1): $(SHARED4) $(SHARED2)
+	ln -fs $(SHARED4) $(SHARED1)
+$(SHARED2): $(SHARED4) $(SHARED3)
+	ln -fs $(SHARED4) $(SHARED2)
+$(SHARED3): $(SHARED4)
+	ln -fs $(SHARED4) $(SHARED3)
+
+endif   # PLATFORM_SHARED_VERSIONED
+$(SHARED4): $(LIB_OBJECTS)
+	$(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(LIB_OBJECTS) $(LDFLAGS) -o $@
+endif  # PLATFORM_SHARED_EXT
+
+.PHONY: check clean coverage ldb_tests package dbg gen-pc build_size \
+	release tags tags0 valgrind_check format static_lib shared_lib all \
+	rocksdbjavastatic rocksdbjava install install-static install-shared \
+	uninstall analyze tools tools_lib check-headers checkout_folly
+
+all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS)
+
+all_but_some_tests: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(ROCKSDBTESTS_SUBSET)
+
+static_lib: $(STATIC_LIBRARY)
+
+shared_lib: $(SHARED)
+
+stress_lib: $(STRESS_LIBRARY)
+
+tools: $(TOOLS)
+
+tools_lib: $(TOOLS_LIBRARY)
+
+test_libs: $(TEST_LIBS)
+
+benchmarks: $(BENCHMARKS)
+
+microbench: $(MICROBENCHS)
+
+run_microbench: $(MICROBENCHS)
+	for t in $(MICROBENCHS); do echo "===== Running benchmark $$t (`date`)"; ./$$t || exit 1; done;
+
+dbg: $(LIBRARY) $(BENCHMARKS) tools $(TESTS)
+
+# creates library and programs
+release: clean
+	LIB_MODE=$(LIB_MODE) DEBUG_LEVEL=0 $(MAKE) $(LIBRARY) tools db_bench
+
+coverage: clean
+	COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) J=1 all check
+	cd coverage && ./coverage_test.sh
+	# Delete intermediate files
+	$(FIND) . -type f \( -name "*.gcda" -o -name "*.gcno" \) -exec rm -f {} \;
+
+# Run all tests in parallel, accumulating per-test logs in t/log-*.
+#
+# Each t/run-* file is a tiny generated bourne shell script that invokes one of
+# sub-tests. Why use a file for this?  Because that makes the invocation of
+# parallel below simpler, which in turn makes the parsing of parallel's
+# LOG simpler (the latter is for live monitoring as parallel
+# tests run).
+#
+# Test names are extracted by running tests with --gtest_list_tests.
+# This filter removes the "#"-introduced comments, and expands to
+# fully-qualified names by changing input like this:
+#
+#   DBTest.
+#     Empty
+#     WriteEmptyBatch
+#   MultiThreaded/MultiThreadedDBTest.
+#     MultiThreaded/0  # GetParam() = 0
+#     MultiThreaded/1  # GetParam() = 1
+#
+# into this:
+#
+#   DBTest.Empty
+#   DBTest.WriteEmptyBatch
+#   MultiThreaded/MultiThreadedDBTest.MultiThreaded/0
+#   MultiThreaded/MultiThreadedDBTest.MultiThreaded/1
+#
+
+parallel_tests = $(patsubst %,parallel_%,$(PARALLEL_TEST))
+.PHONY: gen_parallel_tests $(parallel_tests)
+$(parallel_tests):
+	$(AM_V_at)TEST_BINARY=$(patsubst parallel_%,%,$@); \
+  TEST_NAMES=` \
+    (./$$TEST_BINARY --gtest_list_tests || echo "  $${TEST_BINARY}__list_tests_failure") \
+    | awk '/^[^ ]/ { prefix = $$1 } /^[ ]/ { print prefix $$1 }'`; \
+	echo "  Generating parallel test scripts for $$TEST_BINARY"; \
+	for TEST_NAME in $$TEST_NAMES; do \
+		TEST_SCRIPT=t/run-$$TEST_BINARY-$${TEST_NAME//\//-}; \
+    printf '%s\n' \
+      '#!/bin/sh' \
+      "d=\$(TEST_TMPDIR)$$TEST_SCRIPT" \
+      'mkdir -p $$d' \
+      "TEST_TMPDIR=\$$d $(DRIVER) ./$$TEST_BINARY --gtest_filter=$$TEST_NAME" \
+		> $$TEST_SCRIPT; \
+		chmod a=rx $$TEST_SCRIPT; \
+	done
+
+gen_parallel_tests:
+	$(AM_V_at)mkdir -p t
+	$(AM_V_at)$(FIND) t -type f -name 'run-*' -exec rm -f {} \;
+	$(MAKE) $(parallel_tests)
+
+# Reorder input lines (which are one per test) so that the
+# longest-running tests appear first in the output.
+# Do this by prefixing each selected name with its duration,
+# sort the resulting names, and remove the leading numbers.
+# FIXME: the "100" we prepend is a fake time, for now.
+# FIXME: squirrel away timings from each run and use them
+# (when present) on subsequent runs to order these tests.
+#
+# Without this reordering, these two tests would happen to start only
+# after almost all other tests had completed, thus adding 100 seconds
+# to the duration of parallel "make check".  That's the difference
+# between 4 minutes (old) and 2m20s (new).
+#
+# 152.120 PASS t/DBTest.FileCreationRandomFailure
+# 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest
+#
+slow_test_regexp = \
+	^.*MySQLStyleTransactionTest.*$$|^.*SnapshotConcurrentAccessTest.*$$|^.*SeqAdvanceConcurrentTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$
+prioritize_long_running_tests =						\
+  perl -pe 's,($(slow_test_regexp)),100 $$1,'				\
+    | sort -k1,1gr							\
+    | sed 's/^[.0-9]* //'
+
+# "make check" uses
+# Run with "make J=1 check" to disable parallelism in "make check".
+# Run with "make J=200% check" to run two parallel jobs per core.
+# The default is to run one job per core (J=100%).
+# See "man parallel" for its "-j ..." option.
+J ?= 100%
+
+# Use this regexp to select the subset of tests whose names match.
+tests-regexp = .
+EXCLUDE_TESTS_REGEX ?= "^$$"
+
+ifeq ($(PRINT_PARALLEL_OUTPUTS), 1)
+	parallel_redir =
+else ifeq ($(QUIET_PARALLEL_TESTS), 1)
+	parallel_redir = >& t/$(test_log_prefix)log-{/}
+else
+# Default: print failure output only, as it happens
+# Note: gnu_parallel --eta is now always used, but has been modified to provide
+# only infrequent updates when not connected to a terminal. (CircleCI will
+# kill a job if no output for 10min.)
+	parallel_redir = >& t/$(test_log_prefix)log-{/} || bash -c "cat t/$(test_log_prefix)log-{/}; exit $$?"
+endif
+
+.PHONY: check_0
+check_0:
+	printf '%s\n' ''						\
+	  'To monitor subtest <duration,pass/fail,name>,'		\
+	  '  run "make watch-log" in a separate window' '';		\
+	{ \
+		printf './%s\n' $(filter-out $(PARALLEL_TEST),$(TESTS)); \
+		find t -name 'run-*' -print; \
+	} \
+	  | $(prioritize_long_running_tests)				\
+	  | grep -E '$(tests-regexp)'					\
+	  | grep -E -v '$(EXCLUDE_TESTS_REGEX)'					\
+	  | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG --eta --gnu \
+	    --tmpdir=$(TEST_TMPDIR) '{} $(parallel_redir)' ; \
+	parallel_retcode=$$? ; \
+	awk '{ if ($$7 != 0 || $$8 != 0) { if ($$7 == "Exitval") { h = $$0; } else { if (!f) print h; print; f = 1 } } } END { if(f) exit 1; }' < LOG ; \
+	awk_retcode=$$?; \
+	if [ $$parallel_retcode -ne 0 ] || [ $$awk_retcode -ne 0 ] ; then exit 1 ; fi
+
+valgrind-exclude-regexp = InlineSkipTest.ConcurrentInsert|TransactionStressTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized|MySQLStyleTransactionTest.TransactionStressTest
+
+.PHONY: valgrind_check_0
+valgrind_check_0: test_log_prefix := valgrind_
+valgrind_check_0:
+	printf '%s\n' ''						\
+	  'To monitor subtest <duration,pass/fail,name>,'		\
+	  '  run "make watch-log" in a separate window' '';		\
+	{								\
+	  printf './%s\n' $(filter-out $(PARALLEL_TEST) %skiplist_test options_settable_test, $(TESTS));		\
+	  find t -name 'run-*' -print; \
+	}								\
+	  | $(prioritize_long_running_tests)				\
+	  | grep -E '$(tests-regexp)'					\
+	  | grep -E -v '$(valgrind-exclude-regexp)'					\
+	  | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG --eta --gnu \
+	   --tmpdir=$(TEST_TMPDIR) \
+	   '(if [[ "{}" == "./"* ]] ; then $(DRIVER) {}; else {}; fi) \
+	  $(parallel_redir)' \
+
+CLEAN_FILES += t LOG $(TEST_TMPDIR)
+
+# When running parallel "make check", you can monitor its progress
+# from another window.
+# Run "make watch_LOG" to show the duration,PASS/FAIL,name of parallel
+# tests as they are being run.  We sort them so that longer-running ones
+# appear at the top of the list and any failing tests remain at the top
+# regardless of their duration. As with any use of "watch", hit ^C to
+# interrupt.
+watch-log:
+	$(WATCH) --interval=0 'sort -k7,7nr -k4,4gr LOG|$(quoted_perl_command)'
+
+dump-log:
+	bash -c '$(quoted_perl_command)' < LOG
+
+# If J != 1 and GNU parallel is installed, run the tests in parallel,
+# via the check_0 rule above.  Otherwise, run them sequentially.
+check: all
+	$(MAKE) gen_parallel_tests
+	$(AM_V_GEN)if test "$(J)" != 1                                  \
+	    && (build_tools/gnu_parallel --gnu --help 2>/dev/null) |                    \
+	        grep -q 'GNU Parallel';                                 \
+	then                                                            \
+	    $(MAKE) T="$$t" check_0;                       \
+	else                                                            \
+	    for t in $(TESTS); do                                       \
+	      echo "===== Running $$t (`date`)"; ./$$t || exit 1; done;          \
+	fi
+	rm -rf $(TEST_TMPDIR)
+ifneq ($(PLATFORM), OS_AIX)
+	$(PYTHON) tools/check_all_python.py
+ifeq ($(filter -DROCKSDB_LITE,$(OPT)),)
+ifndef ASSERT_STATUS_CHECKED # not yet working with these tests
+	$(PYTHON) tools/ldb_test.py
+	sh tools/rocksdb_dump_test.sh
+endif
+endif
+endif
+ifndef SKIP_FORMAT_BUCK_CHECKS
+	$(MAKE) check-format
+	$(MAKE) check-buck-targets
+	$(MAKE) check-sources
+endif
+
+# TODO add ldb_tests
+check_some: $(ROCKSDBTESTS_SUBSET)
+	for t in $(ROCKSDBTESTS_SUBSET); do echo "===== Running $$t (`date`)"; ./$$t || exit 1; done
+
+.PHONY: ldb_tests
+ldb_tests: ldb
+	$(PYTHON) tools/ldb_test.py
+
+include crash_test.mk
+
+asan_check: clean
+	COMPILE_WITH_ASAN=1 $(MAKE) check -j32
+	$(MAKE) clean
+
+asan_crash_test: clean
+	COMPILE_WITH_ASAN=1 $(MAKE) crash_test
+	$(MAKE) clean
+
+whitebox_asan_crash_test: clean
+	COMPILE_WITH_ASAN=1 $(MAKE) whitebox_crash_test
+	$(MAKE) clean
+
+blackbox_asan_crash_test: clean
+	COMPILE_WITH_ASAN=1 $(MAKE) blackbox_crash_test
+	$(MAKE) clean
+
+asan_crash_test_with_atomic_flush: clean
+	COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_atomic_flush
+	$(MAKE) clean
+
+asan_crash_test_with_txn: clean
+	COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_txn
+	$(MAKE) clean
+
+asan_crash_test_with_best_efforts_recovery: clean
+	COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_best_efforts_recovery
+	$(MAKE) clean
+
+ubsan_check: clean
+	COMPILE_WITH_UBSAN=1 $(MAKE) check -j32
+	$(MAKE) clean
+
+ubsan_crash_test: clean
+	COMPILE_WITH_UBSAN=1 $(MAKE) crash_test
+	$(MAKE) clean
+
+whitebox_ubsan_crash_test: clean
+	COMPILE_WITH_UBSAN=1 $(MAKE) whitebox_crash_test
+	$(MAKE) clean
+
+blackbox_ubsan_crash_test: clean
+	COMPILE_WITH_UBSAN=1 $(MAKE) blackbox_crash_test
+	$(MAKE) clean
+
+ubsan_crash_test_with_atomic_flush: clean
+	COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_atomic_flush
+	$(MAKE) clean
+
+ubsan_crash_test_with_txn: clean
+	COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_txn
+	$(MAKE) clean
+
+ubsan_crash_test_with_best_efforts_recovery: clean
+	COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_best_efforts_recovery
+	$(MAKE) clean
+
+full_valgrind_test:
+	ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check
+
+full_valgrind_test_some:
+	ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some
+
+valgrind_test:
+	ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check
+
+valgrind_test_some:
+	ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some
+
+valgrind_check: $(TESTS)
+	$(MAKE) DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" gen_parallel_tests
+	$(AM_V_GEN)if test "$(J)" != 1                                  \
+	    && (build_tools/gnu_parallel --gnu --help 2>/dev/null) |    \
+	        grep -q 'GNU Parallel';                                 \
+	then                                                            \
+	  $(MAKE)                                                       \
+	  DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" valgrind_check_0;   \
+	else                                                            \
+		for t in $(filter-out %skiplist_test options_settable_test,$(TESTS)); do \
+			$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
+			ret_code=$$?; \
+			if [ $$ret_code -ne 0 ]; then \
+				exit $$ret_code; \
+			fi; \
+		done; \
+	fi
+
+valgrind_check_some: $(ROCKSDBTESTS_SUBSET)
+	for t in $(ROCKSDBTESTS_SUBSET); do \
+		$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
+		ret_code=$$?; \
+		if [ $$ret_code -ne 0 ]; then \
+			exit $$ret_code; \
+		fi; \
+	done
+
+test_names = \
+  ./db_test --gtest_list_tests						\
+    | perl -n								\
+      -e 's/ *\#.*//;'							\
+      -e '/^(\s*)(\S+)/; !$$1 and do {$$p=$$2; break};'			\
+      -e 'print qq! $$p$$2!'
+
+analyze: clean
+	USE_CLANG=1 $(MAKE) analyze_incremental
+
+analyze_incremental:
+	$(CLANG_SCAN_BUILD) --use-analyzer=$(CLANG_ANALYZER) \
+		--use-c++=$(CXX) --use-cc=$(CC) --status-bugs \
+		-o $(CURDIR)/scan_build_report \
+		$(MAKE) SKIP_LINK=1 dbg
+
+CLEAN_FILES += unity.cc
+unity.cc: Makefile util/build_version.cc.in
+	rm -f $@ $@-t
+	$(AM_V_at)$(gen_build_version) > util/build_version.cc
+	for source_file in $(LIB_SOURCES); do \
+		echo "#include \"$$source_file\"" >> $@-t; \
+	done
+	chmod a=r $@-t
+	mv $@-t $@
+
+unity.a: $(OBJ_DIR)/unity.o
+	$(AM_V_AR)rm -f $@
+	$(AM_V_at)$(AR) $(ARFLAGS) $@ $(OBJ_DIR)/unity.o
+
+
+# try compiling db_test with unity
+unity_test: $(OBJ_DIR)/db/db_basic_test.o $(OBJ_DIR)/db/db_test_util.o $(TEST_OBJECTS) $(TOOL_OBJECTS) unity.a
+	$(AM_LINK)
+	./unity_test
+
+rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc
+	build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc
+
+clean: clean-ext-libraries-all clean-rocks clean-rocksjava
+
+clean-not-downloaded: clean-ext-libraries-bin clean-rocks clean-not-downloaded-rocksjava
+
+clean-rocks:
+	echo shared=$(ALL_SHARED_LIBS)
+	echo static=$(ALL_STATIC_LIBS)
+	rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(ALL_STATIC_LIBS) $(ALL_SHARED_LIBS) $(MICROBENCHS)
+	rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report
+	$(FIND) . -name "*.[oda]" -exec rm -f {} \;
+	$(FIND) . -type f \( -name "*.gcda" -o -name "*.gcno" \) -exec rm -f {} \;
+
+clean-rocksjava: clean-rocks
+	rm -rf jl jls
+	cd java && $(MAKE) clean
+
+clean-not-downloaded-rocksjava:
+	cd java && $(MAKE) clean-not-downloaded
+
+clean-ext-libraries-all:
+	rm -rf bzip2* snappy* zlib* lz4* zstd*
+
+clean-ext-libraries-bin:
+	find . -maxdepth 1 -type d \( -name bzip2\* -or -name snappy\* -or -name zlib\* -or -name lz4\* -or -name zstd\* \) -prune -exec rm -rf {} \;
+
+tags:
+	ctags -R .
+	cscope -b `$(FIND) . -name '*.cc'` `$(FIND) . -name '*.h'` `$(FIND) . -name '*.c'`
+	ctags -e -R -o etags *
+
+tags0:
+	ctags -R .
+	cscope -b `$(FIND) . -name '*.cc' -and ! -name '*_test.cc'` \
+		  `$(FIND) . -name '*.c' -and ! -name '*_test.c'` \
+		  `$(FIND) . -name '*.h' -and ! -name '*_test.h'`
+	ctags -e -R -o etags *
+
+format:
+	build_tools/format-diff.sh
+
+check-format:
+	build_tools/format-diff.sh -c
+
+check-buck-targets:
+	buckifier/check_buck_targets.sh
+
+check-sources:
+	build_tools/check-sources.sh
+
+package:
+	bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR)
+
+# ---------------------------------------------------------------------------
+# 	Unit tests and tools
+# ---------------------------------------------------------------------------
+$(STATIC_LIBRARY): $(LIB_OBJECTS)
+	$(AM_V_AR)rm -f $@ $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4)
+	$(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIB_OBJECTS)
+
+$(STATIC_TEST_LIBRARY): $(TEST_OBJECTS)
+	$(AM_V_AR)rm -f $@ $(SHARED_TEST_LIBRARY)
+	$(AM_V_at)$(AR) $(ARFLAGS) $@ $^
+
+$(STATIC_TOOLS_LIBRARY): $(TOOL_OBJECTS)
+	$(AM_V_AR)rm -f $@ $(SHARED_TOOLS_LIBRARY)
+	$(AM_V_at)$(AR) $(ARFLAGS) $@ $^
+
+$(STATIC_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL)
+	$(AM_V_AR)rm -f $@ $(SHARED_STRESS_LIBRARY)
+	$(AM_V_at)$(AR) $(ARFLAGS) $@ $^
+
+$(SHARED_TEST_LIBRARY): $(TEST_OBJECTS) $(SHARED1)
+	$(AM_V_AR)rm -f $@ $(STATIC_TEST_LIBRARY)
+	$(AM_SHARE)
+
+$(SHARED_TOOLS_LIBRARY): $(TOOL_OBJECTS) $(SHARED1)
+	$(AM_V_AR)rm -f $@ $(STATIC_TOOLS_LIBRARY)
+	$(AM_SHARE)
+
+$(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) $(SHARED_TOOLS_LIBRARY) $(SHARED1)
+	$(AM_V_AR)rm -f $@ $(STATIC_STRESS_LIBRARY)
+	$(AM_SHARE)
+
+librocksdb_env_basic_test.a: $(OBJ_DIR)/env/env_basic_test.o $(LIB_OBJECTS) $(TESTHARNESS)
+	$(AM_V_AR)rm -f $@
+	$(AM_V_at)$(AR) $(ARFLAGS) $@ $^
+
+db_bench: $(OBJ_DIR)/tools/db_bench.o $(BENCH_OBJECTS) $(TESTUTIL) $(LIBRARY)
+	$(AM_LINK)
+
+trace_analyzer: $(OBJ_DIR)/tools/trace_analyzer.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+block_cache_trace_analyzer: $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(CACHE_BENCH_OBJECTS) $(LIBRARY)
+	$(AM_LINK)
+
+persistent_cache_bench: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_bench.o $(LIBRARY)
+	$(AM_LINK)
+
+memtablerep_bench: $(OBJ_DIR)/memtable/memtablerep_bench.o $(LIBRARY)
+	$(AM_LINK)
+
+filter_bench: $(OBJ_DIR)/util/filter_bench.o $(LIBRARY)
+	$(AM_LINK)
+
+db_stress: $(OBJ_DIR)/db_stress_tool/db_stress.o $(STRESS_LIBRARY) $(TOOLS_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+write_stress: $(OBJ_DIR)/tools/write_stress.o $(LIBRARY)
+	$(AM_LINK)
+
+db_sanity_test: $(OBJ_DIR)/tools/db_sanity_test.o $(LIBRARY)
+	$(AM_LINK)
+
+db_repl_stress: $(OBJ_DIR)/tools/db_repl_stress.o $(LIBRARY)
+	$(AM_LINK)
+
+arena_test: $(OBJ_DIR)/memory/arena_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+memory_allocator_test: memory/memory_allocator_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+autovector_test: $(OBJ_DIR)/util/autovector_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+column_family_test: $(OBJ_DIR)/db/column_family_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+table_properties_collector_test: $(OBJ_DIR)/db/table_properties_collector_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+bloom_test: $(OBJ_DIR)/util/bloom_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+dynamic_bloom_test: $(OBJ_DIR)/util/dynamic_bloom_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+c_test: $(OBJ_DIR)/db/c_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cache_test: $(OBJ_DIR)/cache/cache_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+coding_test: $(OBJ_DIR)/util/coding_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+hash_test: $(OBJ_DIR)/util/hash_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+random_test: $(OBJ_DIR)/util/random_test.o  $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+ribbon_test: $(OBJ_DIR)/util/ribbon_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+option_change_migration_test: $(OBJ_DIR)/utilities/option_change_migration/option_change_migration_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+agg_merge_test: $(OBJ_DIR)/utilities/agg_merge/agg_merge_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+stringappend_test: $(OBJ_DIR)/utilities/merge_operators/string_append/stringappend_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cassandra_format_test: $(OBJ_DIR)/utilities/cassandra/cassandra_format_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cassandra_functional_test: $(OBJ_DIR)/utilities/cassandra/cassandra_functional_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cassandra_row_merge_test: $(OBJ_DIR)/utilities/cassandra/cassandra_row_merge_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cassandra_serialize_test: $(OBJ_DIR)/utilities/cassandra/cassandra_serialize_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+hash_table_test: $(OBJ_DIR)/utilities/persistent_cache/hash_table_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+histogram_test: $(OBJ_DIR)/monitoring/histogram_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+thread_local_test: $(OBJ_DIR)/util/thread_local_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+work_queue_test: $(OBJ_DIR)/util/work_queue_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+corruption_test: $(OBJ_DIR)/db/corruption_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+crc32c_test: $(OBJ_DIR)/util/crc32c_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+slice_test: $(OBJ_DIR)/util/slice_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+slice_transform_test: $(OBJ_DIR)/util/slice_transform_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_basic_test: $(OBJ_DIR)/db/db_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_blob_basic_test: $(OBJ_DIR)/db/blob/db_blob_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_blob_compaction_test: $(OBJ_DIR)/db/blob/db_blob_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_readonly_with_timestamp_test: $(OBJ_DIR)/db/db_readonly_with_timestamp_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_wide_basic_test: $(OBJ_DIR)/db/wide/db_wide_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_with_timestamp_basic_test: $(OBJ_DIR)/db/db_with_timestamp_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_with_timestamp_compaction_test: db/db_with_timestamp_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_encryption_test: $(OBJ_DIR)/db/db_encryption_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_test: $(OBJ_DIR)/db/db_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_test2: $(OBJ_DIR)/db/db_test2.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_logical_block_size_cache_test: $(OBJ_DIR)/db/db_logical_block_size_cache_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_blob_index_test: $(OBJ_DIR)/db/blob/db_blob_index_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_block_cache_test: $(OBJ_DIR)/db/db_block_cache_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_bloom_filter_test: $(OBJ_DIR)/db/db_bloom_filter_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_log_iter_test: $(OBJ_DIR)/db/db_log_iter_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_flush_test: $(OBJ_DIR)/db/db_flush_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_inplace_update_test: $(OBJ_DIR)/db/db_inplace_update_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_iterator_test: $(OBJ_DIR)/db/db_iterator_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_kv_checksum_test: $(OBJ_DIR)/db/db_kv_checksum_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_memtable_test: $(OBJ_DIR)/db/db_memtable_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_merge_operator_test: $(OBJ_DIR)/db/db_merge_operator_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_merge_operand_test: $(OBJ_DIR)/db/db_merge_operand_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_options_test: $(OBJ_DIR)/db/db_options_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_range_del_test: $(OBJ_DIR)/db/db_range_del_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_rate_limiter_test: $(OBJ_DIR)/db/db_rate_limiter_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_sst_test: $(OBJ_DIR)/db/db_sst_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_statistics_test: $(OBJ_DIR)/db/db_statistics_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_write_test: $(OBJ_DIR)/db/db_write_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+error_handler_fs_test: $(OBJ_DIR)/db/error_handler_fs_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+external_sst_file_basic_test: $(OBJ_DIR)/db/external_sst_file_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+external_sst_file_test: $(OBJ_DIR)/db/external_sst_file_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+import_column_family_test: $(OBJ_DIR)/db/import_column_family_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_tailing_iter_test: $(OBJ_DIR)/db/db_tailing_iter_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_iter_test: $(OBJ_DIR)/db/db_iter_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_iter_stress_test: $(OBJ_DIR)/db/db_iter_stress_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_universal_compaction_test: $(OBJ_DIR)/db/db_universal_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_wal_test: $(OBJ_DIR)/db/db_wal_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_io_failure_test: $(OBJ_DIR)/db/db_io_failure_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_properties_test: $(OBJ_DIR)/db/db_properties_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_table_properties_test: $(OBJ_DIR)/db/db_table_properties_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+log_write_bench: $(OBJ_DIR)/util/log_write_bench.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK) $(PROFILING_FLAGS)
+
+seqno_time_test: $(OBJ_DIR)/db/seqno_time_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+plain_table_db_test: $(OBJ_DIR)/db/plain_table_db_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+comparator_db_test: $(OBJ_DIR)/db/comparator_db_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+table_reader_bench: $(OBJ_DIR)/table/table_reader_bench.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK) $(PROFILING_FLAGS)
+
+perf_context_test: $(OBJ_DIR)/db/perf_context_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+prefix_test: $(OBJ_DIR)/db/prefix_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+backup_engine_test: $(OBJ_DIR)/utilities/backup/backup_engine_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+checkpoint_test: $(OBJ_DIR)/utilities/checkpoint/checkpoint_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cache_simulator_test: $(OBJ_DIR)/utilities/simulator_cache/cache_simulator_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+sim_cache_test: $(OBJ_DIR)/utilities/simulator_cache/sim_cache_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+env_mirror_test: $(OBJ_DIR)/utilities/env_mirror_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+env_timed_test: $(OBJ_DIR)/utilities/env_timed_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+object_registry_test: $(OBJ_DIR)/utilities/object_registry_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+ttl_test: $(OBJ_DIR)/utilities/ttl/ttl_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+write_batch_with_index_test: $(OBJ_DIR)/utilities/write_batch_with_index/write_batch_with_index_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+flush_job_test: $(OBJ_DIR)/db/flush_job_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+compaction_iterator_test: $(OBJ_DIR)/db/compaction/compaction_iterator_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+compaction_job_test: $(OBJ_DIR)/db/compaction/compaction_job_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+compaction_job_stats_test: $(OBJ_DIR)/db/compaction/compaction_job_stats_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+compaction_service_test: $(OBJ_DIR)/db/compaction/compaction_service_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+compact_on_deletion_collector_test: $(OBJ_DIR)/utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+wal_manager_test: $(OBJ_DIR)/db/wal_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+wal_edit_test: $(OBJ_DIR)/db/wal_edit_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+dbformat_test: $(OBJ_DIR)/db/dbformat_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+env_basic_test: $(OBJ_DIR)/env/env_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+env_test: $(OBJ_DIR)/env/env_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+io_posix_test: $(OBJ_DIR)/env/io_posix_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+fault_injection_test: $(OBJ_DIR)/db/fault_injection_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+rate_limiter_test: $(OBJ_DIR)/util/rate_limiter_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+delete_scheduler_test: $(OBJ_DIR)/file/delete_scheduler_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+filename_test: $(OBJ_DIR)/db/filename_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+random_access_file_reader_test: $(OBJ_DIR)/file/random_access_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+file_reader_writer_test: $(OBJ_DIR)/util/file_reader_writer_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+block_based_table_reader_test: table/block_based/block_based_table_reader_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+full_filter_block_test: $(OBJ_DIR)/table/block_based/full_filter_block_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+partitioned_filter_block_test: $(OBJ_DIR)/table/block_based/partitioned_filter_block_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+log_test: $(OBJ_DIR)/db/log_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cleanable_test: $(OBJ_DIR)/table/cleanable_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+table_test: $(OBJ_DIR)/table/table_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+block_fetcher_test: table/block_fetcher_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+block_test: $(OBJ_DIR)/table/block_based/block_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+data_block_hash_index_test: $(OBJ_DIR)/table/block_based/data_block_hash_index_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+inlineskiplist_test: $(OBJ_DIR)/memtable/inlineskiplist_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+skiplist_test: $(OBJ_DIR)/memtable/skiplist_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+write_buffer_manager_test: $(OBJ_DIR)/memtable/write_buffer_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+version_edit_test: $(OBJ_DIR)/db/version_edit_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+version_set_test: $(OBJ_DIR)/db/version_set_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+compaction_picker_test: $(OBJ_DIR)/db/compaction/compaction_picker_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+version_builder_test: $(OBJ_DIR)/db/version_builder_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+file_indexer_test: $(OBJ_DIR)/db/file_indexer_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+reduce_levels_test: $(OBJ_DIR)/tools/reduce_levels_test.o $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+write_batch_test: $(OBJ_DIR)/db/write_batch_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+write_controller_test: $(OBJ_DIR)/db/write_controller_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+merge_helper_test: $(OBJ_DIR)/db/merge_helper_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+memory_test: $(OBJ_DIR)/utilities/memory/memory_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+merge_test: $(OBJ_DIR)/db/merge_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+merger_test: $(OBJ_DIR)/table/merger_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+util_merge_operators_test: $(OBJ_DIR)/utilities/util_merge_operators_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+options_file_test: $(OBJ_DIR)/db/options_file_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+deletefile_test: $(OBJ_DIR)/db/deletefile_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+obsolete_files_test: $(OBJ_DIR)/db/obsolete_files_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+rocksdb_dump: $(OBJ_DIR)/tools/dump/rocksdb_dump.o $(LIBRARY)
+	$(AM_LINK)
+
+rocksdb_undump: $(OBJ_DIR)/tools/dump/rocksdb_undump.o $(LIBRARY)
+	$(AM_LINK)
+
+cuckoo_table_builder_test: $(OBJ_DIR)/table/cuckoo/cuckoo_table_builder_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cuckoo_table_reader_test: $(OBJ_DIR)/table/cuckoo/cuckoo_table_reader_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+cuckoo_table_db_test: $(OBJ_DIR)/db/cuckoo_table_db_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+listener_test: $(OBJ_DIR)/db/listener_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+thread_list_test: $(OBJ_DIR)/util/thread_list_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+compact_files_test: $(OBJ_DIR)/db/compact_files_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+configurable_test: options/configurable_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+customizable_test: options/customizable_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+options_test: $(OBJ_DIR)/options/options_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+options_settable_test: $(OBJ_DIR)/options/options_settable_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+options_util_test: $(OBJ_DIR)/utilities/options/options_util_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_bench_tool_test: $(OBJ_DIR)/tools/db_bench_tool_test.o $(BENCH_OBJECTS) $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+trace_analyzer_test: $(OBJ_DIR)/tools/trace_analyzer_test.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+event_logger_test: $(OBJ_DIR)/logging/event_logger_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+timer_queue_test: $(OBJ_DIR)/util/timer_queue_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+sst_dump_test: $(OBJ_DIR)/tools/sst_dump_test.o $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+optimistic_transaction_test: $(OBJ_DIR)/utilities/transactions/optimistic_transaction_test.o  $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+mock_env_test : $(OBJ_DIR)/env/mock_env_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+manual_compaction_test: $(OBJ_DIR)/db/manual_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+filelock_test: $(OBJ_DIR)/util/filelock_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+auto_roll_logger_test: $(OBJ_DIR)/logging/auto_roll_logger_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+env_logger_test: $(OBJ_DIR)/logging/env_logger_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+memtable_list_test: $(OBJ_DIR)/db/memtable_list_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+write_callback_test: $(OBJ_DIR)/db/write_callback_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+heap_test: $(OBJ_DIR)/util/heap_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+point_lock_manager_test: utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+transaction_test: $(OBJ_DIR)/utilities/transactions/transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+write_committed_transaction_ts_test: $(OBJ_DIR)/utilities/transactions/write_committed_transaction_ts_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+write_prepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_prepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unprepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+timestamped_snapshot_test: $(OBJ_DIR)/utilities/transactions/timestamped_snapshot_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+tiered_compaction_test: $(OBJ_DIR)/db/compaction/tiered_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+sst_dump: $(OBJ_DIR)/tools/sst_dump.o $(TOOLS_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+blob_dump: $(OBJ_DIR)/tools/blob_dump.o $(TOOLS_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+repair_test: $(OBJ_DIR)/db/repair_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+ldb_cmd_test: $(OBJ_DIR)/tools/ldb_cmd_test.o $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+ldb: $(OBJ_DIR)/tools/ldb.o $(TOOLS_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+iostats_context_test: $(OBJ_DIR)/monitoring/iostats_context_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+persistent_cache_test: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+statistics_test: $(OBJ_DIR)/monitoring/statistics_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+stats_history_test: $(OBJ_DIR)/monitoring/stats_history_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+compressed_secondary_cache_test: $(OBJ_DIR)/cache/compressed_secondary_cache_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+lru_cache_test: $(OBJ_DIR)/cache/lru_cache_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+range_del_aggregator_test: $(OBJ_DIR)/db/range_del_aggregator_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+range_del_aggregator_bench: $(OBJ_DIR)/db/range_del_aggregator_bench.o $(LIBRARY)
+	$(AM_LINK)
+
+blob_db_test: $(OBJ_DIR)/utilities/blob_db/blob_db_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+repeatable_thread_test: $(OBJ_DIR)/util/repeatable_thread_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+range_locking_test: utilities/transactions/lock/range/range_locking_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+range_tombstone_fragmenter_test: $(OBJ_DIR)/db/range_tombstone_fragmenter_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+sst_file_reader_test: $(OBJ_DIR)/table/sst_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_secondary_test: $(OBJ_DIR)/db/db_secondary_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+block_cache_tracer_test: $(OBJ_DIR)/trace_replay/block_cache_tracer_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+block_cache_trace_analyzer_test: $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer_test.o $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+defer_test: $(OBJ_DIR)/util/defer_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+blob_counting_iterator_test: $(OBJ_DIR)/db/blob/blob_counting_iterator_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+blob_file_addition_test: $(OBJ_DIR)/db/blob/blob_file_addition_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+blob_file_builder_test: $(OBJ_DIR)/db/blob/blob_file_builder_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+blob_file_cache_test: $(OBJ_DIR)/db/blob/blob_file_cache_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+blob_file_garbage_test: $(OBJ_DIR)/db/blob/blob_file_garbage_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+blob_file_reader_test: $(OBJ_DIR)/db/blob/blob_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+blob_source_test: $(OBJ_DIR)/db/blob/blob_source_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+blob_garbage_meter_test: $(OBJ_DIR)/db/blob/blob_garbage_meter_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+timer_test: $(OBJ_DIR)/util/timer_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+periodic_task_scheduler_test: $(OBJ_DIR)/db/periodic_task_scheduler_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+testutil_test: $(OBJ_DIR)/test_util/testutil_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+io_tracer_test: $(OBJ_DIR)/trace_replay/io_tracer_test.o $(OBJ_DIR)/trace_replay/io_tracer.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+prefetch_test: $(OBJ_DIR)/file/prefetch_test.o  $(OBJ_DIR)/tools/io_tracer_parser_tool.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+io_tracer_parser_test: $(OBJ_DIR)/tools/io_tracer_parser_test.o $(OBJ_DIR)/tools/io_tracer_parser_tool.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+io_tracer_parser: $(OBJ_DIR)/tools/io_tracer_parser.o $(TOOLS_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_blob_corruption_test: $(OBJ_DIR)/db/blob/db_blob_corruption_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+db_write_buffer_manager_test: $(OBJ_DIR)/db/db_write_buffer_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+clipping_iterator_test: $(OBJ_DIR)/db/compaction/clipping_iterator_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+ribbon_bench: $(OBJ_DIR)/microbench/ribbon_bench.o $(LIBRARY)
+	$(AM_LINK)
+
+db_basic_bench: $(OBJ_DIR)/microbench/db_basic_bench.o $(LIBRARY)
+	$(AM_LINK)
+
+cache_reservation_manager_test: $(OBJ_DIR)/cache/cache_reservation_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+wide_column_serialization_test: $(OBJ_DIR)/db/wide/wide_column_serialization_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+#-------------------------------------------------
+# make install related stuff
+PREFIX ?= /usr/local
+LIBDIR ?= $(PREFIX)/lib
+INSTALL_LIBDIR = $(DESTDIR)$(LIBDIR)
+
+uninstall:
+	rm -rf $(DESTDIR)$(PREFIX)/include/rocksdb \
+	  $(INSTALL_LIBDIR)/$(LIBRARY) \
+	  $(INSTALL_LIBDIR)/$(SHARED4) \
+	  $(INSTALL_LIBDIR)/$(SHARED3) \
+	  $(INSTALL_LIBDIR)/$(SHARED2) \
+	  $(INSTALL_LIBDIR)/$(SHARED1) \
+	  $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc
+
+install-headers: gen-pc
+	install -d $(INSTALL_LIBDIR)
+	install -d $(INSTALL_LIBDIR)/pkgconfig
+	for header_dir in `$(FIND) "include/rocksdb" -type d`; do \
+		install -d $(DESTDIR)/$(PREFIX)/$$header_dir; \
+	done
+	for header in `$(FIND) "include/rocksdb" -type f -name *.h`; do \
+		install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/$$header; \
+	done
+	for header in $(ROCKSDB_PLUGIN_HEADERS); do \
+		install -d $(DESTDIR)/$(PREFIX)/include/rocksdb/`dirname $$header`; \
+		install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/include/rocksdb/$$header; \
+	done
+	install -C -m 644 rocksdb.pc $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc
+
+install-static: install-headers $(LIBRARY)
+	install -d $(INSTALL_LIBDIR)
+	install -C -m 755 $(LIBRARY) $(INSTALL_LIBDIR)
+
+install-shared: install-headers $(SHARED4)
+	install -d $(INSTALL_LIBDIR)
+	install -C -m 755 $(SHARED4) $(INSTALL_LIBDIR)
+	ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED3)
+	ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED2)
+	ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED1)
+
+# install static by default + install shared if it exists
+install: install-static
+	[ -e $(SHARED4) ] && $(MAKE) install-shared || :
+
+# Generate the pkg-config file
+gen-pc:
+	-echo 'prefix=$(PREFIX)' > rocksdb.pc
+	-echo 'exec_prefix=$${prefix}' >> rocksdb.pc
+	-echo 'includedir=$${prefix}/include' >> rocksdb.pc
+	-echo 'libdir=$(LIBDIR)' >> rocksdb.pc
+	-echo '' >> rocksdb.pc
+	-echo 'Name: rocksdb' >> rocksdb.pc
+	-echo 'Description: An embeddable persistent key-value store for fast storage' >> rocksdb.pc
+	-echo Version: $(shell ./build_tools/version.sh full) >> rocksdb.pc
+	-echo 'Libs: -L$${libdir} $(EXEC_LDFLAGS) -lrocksdb' >> rocksdb.pc
+	-echo 'Libs.private: $(PLATFORM_LDFLAGS)' >> rocksdb.pc
+	-echo 'Cflags: -I$${includedir} $(PLATFORM_CXXFLAGS)' >> rocksdb.pc
+	-echo 'Requires: $(subst ",,$(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES))' >> rocksdb.pc
+
+#-------------------------------------------------
+
+
+# ---------------------------------------------------------------------------
+# Jni stuff
+# ---------------------------------------------------------------------------
+JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
+ifeq ($(PLATFORM), OS_SOLARIS)
+	ARCH := $(shell isainfo -b)
+else ifeq ($(PLATFORM), OS_OPENBSD)
+	ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64, $(MACHINE)))
+		ARCH := 64
+	else
+		ARCH := 32
+	endif
+else
+	ARCH := $(shell getconf LONG_BIT)
+endif
+
+ifeq ($(shell ldd /usr/bin/env 2>/dev/null | grep -q musl; echo $$?),0)
+        JNI_LIBC = musl
+# GNU LibC (or glibc) is so pervasive we can assume it is the default
+# else
+#        JNI_LIBC = glibc
+endif
+
+ifneq ($(origin JNI_LIBC), undefined)
+  JNI_LIBC_POSTFIX = -$(JNI_LIBC)
+endif
+
+ifeq (,$(ROCKSDBJNILIB))
+ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64, $(MACHINE)))
+	ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so
+else
+	ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so
+endif
+endif
+ROCKSDB_JAVA_VERSION ?= $(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)
+ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar
+ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar
+ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-javadoc.jar
+ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-sources.jar
+SHA256_CMD = sha256sum
+
+ZLIB_VER ?= 1.2.13
+ZLIB_SHA256 ?= b3a24de97a8fdbc835b9833169501030b8977031bcb54b3b3ac13740f846ab30
+ZLIB_DOWNLOAD_BASE ?= http://zlib.net
+BZIP2_VER ?= 1.0.8
+BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269
+BZIP2_DOWNLOAD_BASE ?= http://sourceware.org/pub/bzip2
+SNAPPY_VER ?= 1.1.8
+SNAPPY_SHA256 ?= 16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f
+SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive
+LZ4_VER ?= 1.9.3
+LZ4_SHA256 ?= 030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1
+LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive
+ZSTD_VER ?= 1.4.9
+ZSTD_SHA256 ?= acf714d98e3db7b876e5b540cbf6dee298f60eb3c0723104f6d3f065cd60d6a8
+ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive
+CURL_SSL_OPTS ?= --tlsv1
+
+ifeq ($(PLATFORM), OS_MACOSX)
+ifeq (,$(findstring librocksdbjni-osx,$(ROCKSDBJNILIB)))
+ifeq ($(MACHINE),arm64)
+	ROCKSDBJNILIB = librocksdbjni-osx-arm64.jnilib
+else ifeq ($(MACHINE),x86_64)
+	ROCKSDBJNILIB = librocksdbjni-osx-x86_64.jnilib
+else
+	ROCKSDBJNILIB = librocksdbjni-osx.jnilib
+endif
+endif
+	ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-osx.jar
+	SHA256_CMD = openssl sha256 -r
+ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","")
+	JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin
+else
+	JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
+endif
+endif
+
+ifeq ($(PLATFORM), OS_FREEBSD)
+	JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/freebsd
+	ROCKSDBJNILIB = librocksdbjni-freebsd$(ARCH).so
+	ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-freebsd$(ARCH).jar
+endif
+ifeq ($(PLATFORM), OS_SOLARIS)
+	ROCKSDBJNILIB = librocksdbjni-solaris$(ARCH).so
+	ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-solaris$(ARCH).jar
+	JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/solaris
+	SHA256_CMD = digest -a sha256
+endif
+ifeq ($(PLATFORM), OS_AIX)
+	JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/aix
+	ROCKSDBJNILIB = librocksdbjni-aix.so
+	EXTRACT_SOURCES = gunzip < TAR_GZ | tar xvf -
+	SNAPPY_MAKE_TARGET = libsnappy.la
+endif
+ifeq ($(PLATFORM), OS_OPENBSD)
+	JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd
+	ROCKSDBJNILIB = librocksdbjni-openbsd$(ARCH).so
+	ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-openbsd$(ARCH).jar
+endif
+export SHA256_CMD
+
+zlib-$(ZLIB_VER).tar.gz:
+	curl --fail --output zlib-$(ZLIB_VER).tar.gz --location ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz
+	ZLIB_SHA256_ACTUAL=`$(SHA256_CMD) zlib-$(ZLIB_VER).tar.gz | cut -d ' ' -f 1`; \
+	if [ "$(ZLIB_SHA256)" != "$$ZLIB_SHA256_ACTUAL" ]; then \
+		echo zlib-$(ZLIB_VER).tar.gz checksum mismatch, expected=\"$(ZLIB_SHA256)\" actual=\"$$ZLIB_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+
+libz.a: zlib-$(ZLIB_VER).tar.gz
+	-rm -rf zlib-$(ZLIB_VER)
+	tar xvzf zlib-$(ZLIB_VER).tar.gz
+	if [ -n"$(ARCHFLAG)" ]; then \
+		cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' ./configure --static --archs="$(ARCHFLAG)" && $(MAKE);  \
+	else \
+		cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' ./configure --static && $(MAKE);  \
+	fi
+	cp zlib-$(ZLIB_VER)/libz.a .
+
+bzip2-$(BZIP2_VER).tar.gz:
+	curl --fail --output bzip2-$(BZIP2_VER).tar.gz --location ${CURL_SSL_OPTS} ${BZIP2_DOWNLOAD_BASE}/bzip2-$(BZIP2_VER).tar.gz
+	BZIP2_SHA256_ACTUAL=`$(SHA256_CMD) bzip2-$(BZIP2_VER).tar.gz | cut -d ' ' -f 1`; \
+	if [ "$(BZIP2_SHA256)" != "$$BZIP2_SHA256_ACTUAL" ]; then \
+		echo bzip2-$(BZIP2_VER).tar.gz checksum mismatch, expected=\"$(BZIP2_SHA256)\" actual=\"$$BZIP2_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+
+libbz2.a: bzip2-$(BZIP2_VER).tar.gz
+	-rm -rf bzip2-$(BZIP2_VER)
+	tar xvzf bzip2-$(BZIP2_VER).tar.gz
+	cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 $(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' AR='ar ${EXTRA_ARFLAGS}' libbz2.a
+	cp bzip2-$(BZIP2_VER)/libbz2.a .
+
+snappy-$(SNAPPY_VER).tar.gz:
+	curl --fail --output snappy-$(SNAPPY_VER).tar.gz --location ${CURL_SSL_OPTS} ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER).tar.gz
+	SNAPPY_SHA256_ACTUAL=`$(SHA256_CMD) snappy-$(SNAPPY_VER).tar.gz | cut -d ' ' -f 1`; \
+	if [ "$(SNAPPY_SHA256)" != "$$SNAPPY_SHA256_ACTUAL" ]; then \
+		echo snappy-$(SNAPPY_VER).tar.gz checksum mismatch, expected=\"$(SNAPPY_SHA256)\" actual=\"$$SNAPPY_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+
+libsnappy.a: snappy-$(SNAPPY_VER).tar.gz
+	-rm -rf snappy-$(SNAPPY_VER)
+	tar xvzf snappy-$(SNAPPY_VER).tar.gz
+	mkdir snappy-$(SNAPPY_VER)/build
+	cd snappy-$(SNAPPY_VER)/build && CFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET}
+	cp snappy-$(SNAPPY_VER)/build/libsnappy.a .
+
+lz4-$(LZ4_VER).tar.gz:
+	curl --fail --output lz4-$(LZ4_VER).tar.gz --location ${CURL_SSL_OPTS} ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz
+	LZ4_SHA256_ACTUAL=`$(SHA256_CMD) lz4-$(LZ4_VER).tar.gz | cut -d ' ' -f 1`; \
+	if [ "$(LZ4_SHA256)" != "$$LZ4_SHA256_ACTUAL" ]; then \
+		echo lz4-$(LZ4_VER).tar.gz checksum mismatch, expected=\"$(LZ4_SHA256)\" actual=\"$$LZ4_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+
+liblz4.a: lz4-$(LZ4_VER).tar.gz
+	-rm -rf lz4-$(LZ4_VER)
+	tar xvzf lz4-$(LZ4_VER).tar.gz
+	cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 $(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' all
+	cp lz4-$(LZ4_VER)/lib/liblz4.a .
+
+zstd-$(ZSTD_VER).tar.gz:
+	curl --fail --output zstd-$(ZSTD_VER).tar.gz --location ${CURL_SSL_OPTS} ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz
+	ZSTD_SHA256_ACTUAL=`$(SHA256_CMD) zstd-$(ZSTD_VER).tar.gz | cut -d ' ' -f 1`; \
+	if [ "$(ZSTD_SHA256)" != "$$ZSTD_SHA256_ACTUAL" ]; then \
+		echo zstd-$(ZSTD_VER).tar.gz checksum mismatch, expected=\"$(ZSTD_SHA256)\" actual=\"$$ZSTD_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+
+libzstd.a: zstd-$(ZSTD_VER).tar.gz
+	-rm -rf zstd-$(ZSTD_VER)
+	tar xvzf zstd-$(ZSTD_VER).tar.gz
+	cd zstd-$(ZSTD_VER)/lib && DESTDIR=. PREFIX= $(MAKE) CFLAGS='-fPIC -O2 $(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' libzstd.a
+	cp zstd-$(ZSTD_VER)/lib/libzstd.a .
+
+# A version of each $(LIB_OBJECTS) compiled with -fPIC and a fixed set of static compression libraries
+ifneq ($(ROCKSDB_JAVA_NO_COMPRESSION), 1)
+JAVA_COMPRESSIONS = libz.a libbz2.a libsnappy.a liblz4.a libzstd.a
+endif
+
+JAVA_STATIC_FLAGS = -DZLIB -DBZIP2 -DSNAPPY -DLZ4 -DZSTD
+JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./snappy-$(SNAPPY_VER)/build -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib -I./zstd-$(ZSTD_VER)/lib/dictBuilder
+
+ifneq ($(findstring rocksdbjavastatic, $(filter-out rocksdbjavastatic_deps, $(MAKECMDGOALS))),)
+CXXFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES)
+CFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES)
+endif
+rocksdbjavastatic:
+ifeq ($(JAVA_HOME),)
+	$(error JAVA_HOME is not set)
+endif
+	$(MAKE) rocksdbjavastatic_deps
+	$(MAKE) rocksdbjavastatic_libobjects
+	$(MAKE) rocksdbjavastatic_javalib
+	$(MAKE) rocksdbjava_jar
+
+rocksdbjavastaticosx: rocksdbjavastaticosx_archs
+	cd java; $(JAR_CMD)  -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib
+	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+	openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1
+
+rocksdbjavastaticosx_ub: rocksdbjavastaticosx_archs
+	cd java/target; lipo -create -output librocksdbjni-osx.jnilib librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib
+	cd java; $(JAR_CMD)  -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx.jnilib
+	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+	openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1
+
+rocksdbjavastaticosx_archs:
+	$(MAKE) rocksdbjavastaticosx_arch_x86_64
+	$(MAKE) rocksdbjavastaticosx_arch_arm64
+
+rocksdbjavastaticosx_arch_%:
+ifeq ($(JAVA_HOME),)
+	$(error JAVA_HOME is not set)
+endif
+	$(MAKE) clean-ext-libraries-bin
+	$(MAKE) clean-rocks
+	ARCHFLAG="-arch $*" $(MAKE) rocksdbjavastatic_deps
+	ARCHFLAG="-arch $*" $(MAKE) rocksdbjavastatic_libobjects
+	ARCHFLAG="-arch $*" ROCKSDBJNILIB="librocksdbjni-osx-$*.jnilib" $(MAKE) rocksdbjavastatic_javalib
+
+ifeq ($(JAR_CMD),)
+ifneq ($(JAVA_HOME),)
+JAR_CMD := $(JAVA_HOME)/bin/jar
+else
+JAR_CMD := jar
+endif
+endif
+rocksdbjavastatic_javalib:
+	cd java; $(MAKE) javalib
+	rm -f java/target/$(ROCKSDBJNILIB)
+	$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC \
+	  -o ./java/target/$(ROCKSDBJNILIB) $(ALL_JNI_NATIVE_SOURCES) \
+	  $(LIB_OBJECTS) $(COVERAGEFLAGS) \
+	  $(JAVA_COMPRESSIONS) $(JAVA_STATIC_LDFLAGS)
+	cd java/target;if [ "$(DEBUG_LEVEL)" == "0" ]; then \
+		strip $(STRIPFLAGS) $(ROCKSDBJNILIB); \
+	fi
+
+rocksdbjava_jar:
+	cd java; $(JAR_CMD)  -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
+	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+	openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1
+
+rocksdbjava_javadocs_jar:
+	cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) *
+	openssl sha1 java/target/$(ROCKSDB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAVADOCS_JAR).sha1
+
+rocksdbjava_sources_jar:
+	cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org
+	openssl sha1 java/target/$(ROCKSDB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_SOURCES_JAR).sha1
+
+rocksdbjavastatic_deps: $(JAVA_COMPRESSIONS)
+
+rocksdbjavastatic_libobjects: $(LIB_OBJECTS)
+
+rocksdbjavastaticrelease: rocksdbjavastaticosx rocksdbjava_javadocs_jar rocksdbjava_sources_jar
+	cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 && vagrant up linux64-musl && vagrant halt linux64-musl
+	cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md
+	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib
+	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class
+	openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1
+
+rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl rocksdbjava_javadocs_jar rocksdbjava_sources_jar
+	cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md
+	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib
+	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class
+	openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1
+
+rocksdbjavastaticdockerx86:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh
+
+rocksdbjavastaticdockerx86_64:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh
+
+rocksdbjavastaticdockerppc64le:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh
+
+rocksdbjavastaticdockerarm64v8:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh
+
+rocksdbjavastaticdockers390x:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_s390x-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:ubuntu18_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh
+
+rocksdbjavastaticdockerx86musl:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_x86-musl-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh
+
+rocksdbjavastaticdockerx86_64musl:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh
+
+rocksdbjavastaticdockerppc64lemusl:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh
+
+rocksdbjavastaticdockerarm64v8musl:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh
+
+rocksdbjavastaticdockers390xmusl:
+	mkdir -p java/target
+	docker run --rm --name rocksdb_linux_s390x-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh
+
+rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral
+
+rocksdbjavastaticpublishdocker: rocksdbjavastaticreleasedocker rocksdbjavastaticpublishcentral
+
+ROCKSDB_JAVA_RELEASE_CLASSIFIERS = javadoc sources linux64 linux32 linux64-musl linux32-musl osx win64
+
+rocksdbjavastaticpublishcentral: rocksdbjavageneratepom
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar
+	$(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -Dclassifier=$(classifier);)
+
+rocksdbjavageneratepom:
+	cd java;cat pom.xml.template | sed 's/\$${ROCKSDB_JAVA_VERSION}/$(ROCKSDB_JAVA_VERSION)/' > pom.xml
+
+rocksdbjavastaticnexusbundlejar: rocksdbjavageneratepom
+	openssl sha1 -r java/pom.xml | awk '{  print $$1 }' > java/target/pom.xml.sha1
+	openssl sha1 -r java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar | awk '{  print $$1 }' > java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.sha1
+	$(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), openssl sha1 -r java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar | awk '{  print $$1 }' > java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.sha1;)
+	gpg --yes --output java/target/pom.xml.asc -ab java/pom.xml
+	gpg --yes -ab java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar
+	$(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), gpg --yes -ab java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar;)
+	$(JAR_CMD) cvf java/target/nexus-bundle-rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java pom.xml -C java/target pom.xml.sha1 -C java/target pom.xml.asc -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.sha1 -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.asc
+	$(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), $(JAR_CMD) uf java/target/nexus-bundle-rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.sha1 -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.asc;)
+
+
+# A version of each $(LIBOBJECTS) compiled with -fPIC
+
+jl/%.o: %.cc
+	$(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS)
+
+rocksdbjava: $(LIB_OBJECTS)
+ifeq ($(JAVA_HOME),)
+	$(error JAVA_HOME is not set)
+endif
+	$(AM_V_GEN)cd java; $(MAKE) javalib;
+	$(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB)
+	$(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. -I./java/rocksjni $(JAVA_INCLUDE) $(ROCKSDB_PLUGIN_JNI_CXX_INCLUDEFLAGS) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(ALL_JNI_NATIVE_SOURCES) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS)
+	$(AM_V_at)cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	$(AM_V_at)cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
+	$(AM_V_at)cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+	$(AM_V_at)openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1
+
+jclean:
+	cd java;$(MAKE) clean;
+
+jtest_compile: rocksdbjava
+	cd java;$(MAKE) java_test
+
+jtest_run:
+	cd java;$(MAKE) run_test
+
+jtest: rocksdbjava
+	cd java;$(MAKE) sample test
+
+jdb_bench:
+	cd java;$(MAKE) db_bench;
+
+commit_prereq:
+	echo "TODO: bring this back using parts of old precommit_checker.py and rocksdb-lego-determinator"
+	false # J=$(J) build_tools/precommit_checker.py unit clang_unit release clang_release tsan asan ubsan lite unit_non_shm
+	# $(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava;
+
+# For public CI runs, checkout folly in a way that can build with RocksDB.
+# This is mostly intended as a test-only simulation of Meta-internal folly
+# integration.
+checkout_folly:
+	if [ -e third-party/folly ]; then \
+		cd third-party/folly && ${GIT_COMMAND} fetch origin; \
+	else \
+		cd third-party && ${GIT_COMMAND} clone https://github.com/facebook/folly.git; \
+	fi
+	@# Pin to a particular version for public CI, so that PR authors don't
+	@# need to worry about folly breaking our integration. Update periodically
+	cd third-party/folly && git reset --hard beacd86d63cd71c904632262e6c36f60874d78ba
+	@# A hack to remove boost dependency.
+	@# NOTE: this hack is only needed if building using USE_FOLLY_LITE
+	perl -pi -e 's/^(#include <boost)/\/\/$$1/' third-party/folly/folly/functional/Invoke.h
+	@# NOTE: this hack is required for clang in some cases
+	perl -pi -e 's/int rv = syscall/int rv = (int)syscall/' third-party/folly/folly/detail/Futex.cpp
+	@# NOTE: this hack is required for gcc in some cases
+	perl -pi -e 's/(__has_include.<experimental.memory_resource>.)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h
+
+build_folly:
+	FOLLY_INST_PATH=`cd third-party/folly; $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
+	if [ "$$FOLLY_INST_PATH" ]; then \
+		rm -rf $${FOLLY_INST_PATH}/../../*; \
+	else \
+		echo "Please run checkout_folly first"; \
+		false; \
+	fi
+	# Restore the original version of Invoke.h with boost dependency
+	cd third-party/folly && ${GIT_COMMAND} checkout folly/functional/Invoke.h
+	cd third-party/folly && MAYBE_AVX2=`echo $(CXXFLAGS) | grep -o -- -DHAVE_AVX2 | sed 's/-DHAVE_AVX2/-mavx2/g' || true` && \
+		CXXFLAGS=" $$MAYBE_AVX2 -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests
+
+# ---------------------------------------------------------------------------
+#   Build size testing
+# ---------------------------------------------------------------------------
+
+REPORT_BUILD_STATISTIC?=echo STATISTIC:
+
+build_size:
+	# === normal build, static ===
+	$(MAKE) clean
+	$(MAKE) static_lib
+	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib $$(stat --printf="%s" librocksdb.a)
+	strip librocksdb.a
+	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_stripped $$(stat --printf="%s" librocksdb.a)
+	# === normal build, shared ===
+	$(MAKE) clean
+	$(MAKE) shared_lib
+	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib $$(stat --printf="%s" `readlink -f librocksdb.so`)
+	strip `readlink -f librocksdb.so`
+	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`)
+	# === lite build, static ===
+	$(MAKE) clean
+	$(MAKE) LITE=1 static_lib
+	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite $$(stat --printf="%s" librocksdb.a)
+	strip librocksdb.a
+	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite_stripped $$(stat --printf="%s" librocksdb.a)
+	# === lite build, shared ===
+	$(MAKE) clean
+	$(MAKE) LITE=1 shared_lib
+	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_lite $$(stat --printf="%s" `readlink -f librocksdb.so`)
+	strip `readlink -f librocksdb.so`
+	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_lite_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`)
+
+# ---------------------------------------------------------------------------
+#  	Platform-specific compilation
+# ---------------------------------------------------------------------------
+
+ifeq ($(PLATFORM), IOS)
+# For iOS, create universal object files to be used on both the simulator and
+# a device.
+XCODEROOT=$(shell xcode-select -print-path)
+PLATFORMSROOT=$(XCODEROOT)/Platforms
+SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
+DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
+IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
+
+.cc.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+.c.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+else
+ifeq ($(HAVE_POWER8),1)
+$(OBJ_DIR)/util/crc32c_ppc.o: util/crc32c_ppc.c
+	$(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@
+
+$(OBJ_DIR)/util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S
+	$(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@
+endif
+$(OBJ_DIR)/%.o: %.cc
+	$(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
+
+$(OBJ_DIR)/%.o: %.cpp
+	$(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
+
+$(OBJ_DIR)/%.o: %.c
+	$(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@
+endif
+
+# ---------------------------------------------------------------------------
+#  	Source files dependencies detection
+# ---------------------------------------------------------------------------
+# If skip dependencies is ON, skip including the dep files
+ifneq ($(SKIP_DEPENDS), 1)
+DEPFILES = $(patsubst %.cc, $(OBJ_DIR)/%.cc.d, $(ALL_SOURCES))
+DEPFILES+ = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C) $(TEST_MAIN_SOURCES_C))
+ifeq ($(USE_FOLLY_LITE),1)
+  DEPFILES +=$(patsubst %.cpp, $(OBJ_DIR)/%.cpp.d, $(FOLLY_SOURCES))
+endif
+endif
+
+# Add proper dependency support so changing a .h file forces a .cc file to
+# rebuild.
+
+# The .d file indicates .cc file's dependencies on .h files. We generate such
+# dependency by g++'s -MM option, whose output is a make dependency rule.
+$(OBJ_DIR)/%.cc.d: %.cc
+	@mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \
+	  -MM -MT'$@' -MT'$(<:.cc=.o)' -MT'$(<:%.cc=$(OBJ_DIR)/%.o)' \
+          "$<" -o '$@'
+
+$(OBJ_DIR)/%.cpp.d: %.cpp
+	@mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \
+	  -MM -MT'$@' -MT'$(<:.cpp=.o)' -MT'$(<:%.cpp=$(OBJ_DIR)/%.o)' \
+          "$<" -o '$@'
+
+ifeq ($(HAVE_POWER8),1)
+DEPFILES_C = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C))
+DEPFILES_ASM = $(patsubst %.S, $(OBJ_DIR)/%.S.d, $(LIB_SOURCES_ASM))
+
+$(OBJ_DIR)/%.c.d: %.c
+	@$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \
+	  -MM -MT'$@' -MT'$(<:.c=.o)' "$<" -o '$@'
+
+$(OBJ_DIR)/%.S.d: %.S
+	@$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \
+	  -MM -MT'$@' -MT'$(<:.S=.o)' "$<" -o '$@'
+
+$(DEPFILES_C): %.c.d
+
+$(DEPFILES_ASM): %.S.d
+depend: $(DEPFILES) $(DEPFILES_C) $(DEPFILES_ASM)
+else
+depend: $(DEPFILES)
+endif
+
+build_subset_tests: $(ROCKSDBTESTS_SUBSET)
+	$(AM_V_GEN)if [ -n "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}" ]; then echo "$(ROCKSDBTESTS_SUBSET)" > "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}"; else echo "$(ROCKSDBTESTS_SUBSET)"; fi
+
+list_all_tests:
+	echo "$(ROCKSDBTESTS_SUBSET)"
+
+# Remove the rules for which dependencies should not be generated and see if any are left.
+#If so, include the dependencies; if not, do not include the dependency files
+ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test checkout_folly, $(MAKECMDGOALS))
+ifneq ("$(ROCKS_DEP_RULES)", "")
+-include $(DEPFILES)
+endif
diff --git a/src/rocksdb/PLUGINS.md b/src/rocksdb/PLUGINS.md
new file mode 100644
index 000000000..60a1e6590
--- /dev/null
+++ b/src/rocksdb/PLUGINS.md
@@ -0,0 +1,7 @@
+This is the list of all known third-party plugins for RocksDB. If something is missing, please open a pull request to add it.
+
+* [Dedupfs](https://github.com/ajkr/dedupfs): an example for plugin developers to reference
+* [HDFS](https://github.com/riversand963/rocksdb-hdfs-env): an Env used for interacting with HDFS. Migrated from main RocksDB repo
+* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices
+* [RADOS](https://github.com/riversand963/rocksdb-rados-env): an Env used for interacting with RADOS. Migrated from RocksDB main repo.
+* [PMEM](https://github.com/pmem/pmem-rocksdb-plugin): a collection of plugins to enable Persistent Memory on RocksDB.
diff --git a/src/rocksdb/README.md b/src/rocksdb/README.md
new file mode 100644
index 000000000..25989d346
--- /dev/null
+++ b/src/rocksdb/README.md
@@ -0,0 +1,31 @@
+## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
+
+[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb)
+[![Appveyor Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/main?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/main)
+[![PPC64le Build Status](http://140-211-168-68-openstack.osuosl.org:8080/buildStatus/icon?job=rocksdb&style=plastic)](http://140-211-168-68-openstack.osuosl.org:8080/job/rocksdb)
+
+RocksDB is developed and maintained by Facebook Database Engineering Team.
+It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com)
+and Jeff Dean (jeff@google.com)
+
+This code is a library that forms the core building block for a fast
+key-value server, especially suited for storing data on flash drives.
+It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
+between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF)
+and Space-Amplification-Factor (SAF). It has multi-threaded compactions,
+making it especially suitable for storing multiple terabytes of data in a
+single database.
+
+Start with example usage here: https://github.com/facebook/rocksdb/tree/main/examples
+
+See the [github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation.
+
+The public interface is in `include/`.  Callers should not include or
+rely on the details of any other header files in this package.  Those
+internal APIs may be changed without warning.
+
+Questions and discussions are welcome on the [RocksDB Developers Public](https://www.facebook.com/groups/rocksdb.dev/) Facebook group and [email list](https://groups.google.com/g/rocksdb) on Google Groups.
+
+## License
+
+RocksDB is dual-licensed under both the GPLv2 (found in the COPYING file in the root directory) and Apache 2.0 License (found in the LICENSE.Apache file in the root directory).  You may select, at your option, one of the above-listed licenses.
diff --git a/src/rocksdb/ROCKSDB_LITE.md b/src/rocksdb/ROCKSDB_LITE.md
new file mode 100644
index 000000000..166426c60
--- /dev/null
+++ b/src/rocksdb/ROCKSDB_LITE.md
@@ -0,0 +1,21 @@
+# RocksDBLite
+
+RocksDBLite is a project focused on mobile use cases, which don't need a lot of fancy things we've built for server workloads and they are very sensitive to binary size. For that reason, we added a compile flag ROCKSDB_LITE that comments out a lot of the nonessential code and keeps the binary lean.
+
+Some examples of the features disabled by ROCKSDB_LITE:
+* compiled-in support for LDB tool
+* No backup engine
+* No support for replication (which we provide in form of TransactionalIterator)
+* No advanced monitoring tools
+* No special-purpose memtables that are highly optimized for specific use cases
+* No Transactions
+
+When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if:
+* Nobody from mobile really needs your feature,
+* Your feature is adding a lot of weight to the binary.
+
+Don't add ROCKSDB_LITE compile guard if:
+* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off.
+* Your feature is not adding a lot of weight.
+
+If unsure, ask. :)
diff --git a/src/rocksdb/TARGETS b/src/rocksdb/TARGETS
new file mode 100644
index 000000000..40f0717b3
--- /dev/null
+++ b/src/rocksdb/TARGETS
@@ -0,0 +1,5921 @@
+# This file @generated by:
+#$ python3 buckifier/buckify_rocksdb.py
+# --> DO NOT EDIT MANUALLY <--
+# This file is a Facebook-specific integration for buck builds, so can
+# only be validated by Facebook employees.
+#
+# @noautodeps @nocodemods
+load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper")
+
+
+cpp_library_wrapper(name="rocksdb_lib", srcs=[
+        "cache/cache.cc",
+        "cache/cache_entry_roles.cc",
+        "cache/cache_key.cc",
+        "cache/cache_reservation_manager.cc",
+        "cache/charged_cache.cc",
+        "cache/clock_cache.cc",
+        "cache/compressed_secondary_cache.cc",
+        "cache/lru_cache.cc",
+        "cache/secondary_cache.cc",
+        "cache/sharded_cache.cc",
+        "db/arena_wrapped_db_iter.cc",
+        "db/blob/blob_contents.cc",
+        "db/blob/blob_fetcher.cc",
+        "db/blob/blob_file_addition.cc",
+        "db/blob/blob_file_builder.cc",
+        "db/blob/blob_file_cache.cc",
+        "db/blob/blob_file_garbage.cc",
+        "db/blob/blob_file_meta.cc",
+        "db/blob/blob_file_reader.cc",
+        "db/blob/blob_garbage_meter.cc",
+        "db/blob/blob_log_format.cc",
+        "db/blob/blob_log_sequential_reader.cc",
+        "db/blob/blob_log_writer.cc",
+        "db/blob/blob_source.cc",
+        "db/blob/prefetch_buffer_collection.cc",
+        "db/builder.cc",
+        "db/c.cc",
+        "db/column_family.cc",
+        "db/compaction/compaction.cc",
+        "db/compaction/compaction_iterator.cc",
+        "db/compaction/compaction_job.cc",
+        "db/compaction/compaction_outputs.cc",
+        "db/compaction/compaction_picker.cc",
+        "db/compaction/compaction_picker_fifo.cc",
+        "db/compaction/compaction_picker_level.cc",
+        "db/compaction/compaction_picker_universal.cc",
+        "db/compaction/compaction_service_job.cc",
+        "db/compaction/compaction_state.cc",
+        "db/compaction/sst_partitioner.cc",
+        "db/compaction/subcompaction_state.cc",
+        "db/convenience.cc",
+        "db/db_filesnapshot.cc",
+        "db/db_impl/compacted_db_impl.cc",
+        "db/db_impl/db_impl.cc",
+        "db/db_impl/db_impl_compaction_flush.cc",
+        "db/db_impl/db_impl_debug.cc",
+        "db/db_impl/db_impl_experimental.cc",
+        "db/db_impl/db_impl_files.cc",
+        "db/db_impl/db_impl_open.cc",
+        "db/db_impl/db_impl_readonly.cc",
+        "db/db_impl/db_impl_secondary.cc",
+        "db/db_impl/db_impl_write.cc",
+        "db/db_info_dumper.cc",
+        "db/db_iter.cc",
+        "db/dbformat.cc",
+        "db/error_handler.cc",
+        "db/event_helpers.cc",
+        "db/experimental.cc",
+        "db/external_sst_file_ingestion_job.cc",
+        "db/file_indexer.cc",
+        "db/flush_job.cc",
+        "db/flush_scheduler.cc",
+        "db/forward_iterator.cc",
+        "db/import_column_family_job.cc",
+        "db/internal_stats.cc",
+        "db/log_reader.cc",
+        "db/log_writer.cc",
+        "db/logs_with_prep_tracker.cc",
+        "db/malloc_stats.cc",
+        "db/memtable.cc",
+        "db/memtable_list.cc",
+        "db/merge_helper.cc",
+        "db/merge_operator.cc",
+        "db/output_validator.cc",
+        "db/periodic_task_scheduler.cc",
+        "db/range_del_aggregator.cc",
+        "db/range_tombstone_fragmenter.cc",
+        "db/repair.cc",
+        "db/seqno_to_time_mapping.cc",
+        "db/snapshot_impl.cc",
+        "db/table_cache.cc",
+        "db/table_properties_collector.cc",
+        "db/transaction_log_impl.cc",
+        "db/trim_history_scheduler.cc",
+        "db/version_builder.cc",
+        "db/version_edit.cc",
+        "db/version_edit_handler.cc",
+        "db/version_set.cc",
+        "db/wal_edit.cc",
+        "db/wal_manager.cc",
+        "db/wide/wide_column_serialization.cc",
+        "db/wide/wide_columns.cc",
+        "db/write_batch.cc",
+        "db/write_batch_base.cc",
+        "db/write_controller.cc",
+        "db/write_thread.cc",
+        "env/composite_env.cc",
+        "env/env.cc",
+        "env/env_chroot.cc",
+        "env/env_encryption.cc",
+        "env/env_posix.cc",
+        "env/file_system.cc",
+        "env/file_system_tracer.cc",
+        "env/fs_posix.cc",
+        "env/fs_remap.cc",
+        "env/io_posix.cc",
+        "env/mock_env.cc",
+        "env/unique_id_gen.cc",
+        "file/delete_scheduler.cc",
+        "file/file_prefetch_buffer.cc",
+        "file/file_util.cc",
+        "file/filename.cc",
+        "file/line_file_reader.cc",
+        "file/random_access_file_reader.cc",
+        "file/read_write_util.cc",
+        "file/readahead_raf.cc",
+        "file/sequence_file_reader.cc",
+        "file/sst_file_manager_impl.cc",
+        "file/writable_file_writer.cc",
+        "logging/auto_roll_logger.cc",
+        "logging/event_logger.cc",
+        "logging/log_buffer.cc",
+        "memory/arena.cc",
+        "memory/concurrent_arena.cc",
+        "memory/jemalloc_nodump_allocator.cc",
+        "memory/memkind_kmem_allocator.cc",
+        "memory/memory_allocator.cc",
+        "memtable/alloc_tracker.cc",
+        "memtable/hash_linklist_rep.cc",
+        "memtable/hash_skiplist_rep.cc",
+        "memtable/skiplistrep.cc",
+        "memtable/vectorrep.cc",
+        "memtable/write_buffer_manager.cc",
+        "monitoring/histogram.cc",
+        "monitoring/histogram_windowing.cc",
+        "monitoring/in_memory_stats_history.cc",
+        "monitoring/instrumented_mutex.cc",
+        "monitoring/iostats_context.cc",
+        "monitoring/perf_context.cc",
+        "monitoring/perf_level.cc",
+        "monitoring/persistent_stats_history.cc",
+        "monitoring/statistics.cc",
+        "monitoring/thread_status_impl.cc",
+        "monitoring/thread_status_updater.cc",
+        "monitoring/thread_status_updater_debug.cc",
+        "monitoring/thread_status_util.cc",
+        "monitoring/thread_status_util_debug.cc",
+        "options/cf_options.cc",
+        "options/configurable.cc",
+        "options/customizable.cc",
+        "options/db_options.cc",
+        "options/options.cc",
+        "options/options_helper.cc",
+        "options/options_parser.cc",
+        "port/port_posix.cc",
+        "port/stack_trace.cc",
+        "port/win/env_default.cc",
+        "port/win/env_win.cc",
+        "port/win/io_win.cc",
+        "port/win/port_win.cc",
+        "port/win/win_logger.cc",
+        "port/win/win_thread.cc",
+        "table/adaptive/adaptive_table_factory.cc",
+        "table/block_based/binary_search_index_reader.cc",
+        "table/block_based/block.cc",
+        "table/block_based/block_based_table_builder.cc",
+        "table/block_based/block_based_table_factory.cc",
+        "table/block_based/block_based_table_iterator.cc",
+        "table/block_based/block_based_table_reader.cc",
+        "table/block_based/block_builder.cc",
+        "table/block_based/block_prefetcher.cc",
+        "table/block_based/block_prefix_index.cc",
+        "table/block_based/data_block_footer.cc",
+        "table/block_based/data_block_hash_index.cc",
+        "table/block_based/filter_block_reader_common.cc",
+        "table/block_based/filter_policy.cc",
+        "table/block_based/flush_block_policy.cc",
+        "table/block_based/full_filter_block.cc",
+        "table/block_based/hash_index_reader.cc",
+        "table/block_based/index_builder.cc",
+        "table/block_based/index_reader_common.cc",
+        "table/block_based/parsed_full_filter_block.cc",
+        "table/block_based/partitioned_filter_block.cc",
+        "table/block_based/partitioned_index_iterator.cc",
+        "table/block_based/partitioned_index_reader.cc",
+        "table/block_based/reader_common.cc",
+        "table/block_based/uncompression_dict_reader.cc",
+        "table/block_fetcher.cc",
+        "table/cuckoo/cuckoo_table_builder.cc",
+        "table/cuckoo/cuckoo_table_factory.cc",
+        "table/cuckoo/cuckoo_table_reader.cc",
+        "table/format.cc",
+        "table/get_context.cc",
+        "table/iterator.cc",
+        "table/merging_iterator.cc",
+        "table/meta_blocks.cc",
+        "table/persistent_cache_helper.cc",
+        "table/plain/plain_table_bloom.cc",
+        "table/plain/plain_table_builder.cc",
+        "table/plain/plain_table_factory.cc",
+        "table/plain/plain_table_index.cc",
+        "table/plain/plain_table_key_coding.cc",
+        "table/plain/plain_table_reader.cc",
+        "table/sst_file_dumper.cc",
+        "table/sst_file_reader.cc",
+        "table/sst_file_writer.cc",
+        "table/table_factory.cc",
+        "table/table_properties.cc",
+        "table/two_level_iterator.cc",
+        "table/unique_id.cc",
+        "test_util/sync_point.cc",
+        "test_util/sync_point_impl.cc",
+        "test_util/transaction_test_util.cc",
+        "tools/dump/db_dump_tool.cc",
+        "tools/io_tracer_parser_tool.cc",
+        "tools/ldb_cmd.cc",
+        "tools/ldb_tool.cc",
+        "tools/sst_dump_tool.cc",
+        "trace_replay/block_cache_tracer.cc",
+        "trace_replay/io_tracer.cc",
+        "trace_replay/trace_record.cc",
+        "trace_replay/trace_record_handler.cc",
+        "trace_replay/trace_record_result.cc",
+        "trace_replay/trace_replay.cc",
+        "util/async_file_reader.cc",
+        "util/build_version.cc",
+        "util/cleanable.cc",
+        "util/coding.cc",
+        "util/compaction_job_stats_impl.cc",
+        "util/comparator.cc",
+        "util/compression.cc",
+        "util/compression_context_cache.cc",
+        "util/concurrent_task_limiter_impl.cc",
+        "util/crc32c.cc",
+        "util/crc32c_arm64.cc",
+        "util/dynamic_bloom.cc",
+        "util/file_checksum_helper.cc",
+        "util/hash.cc",
+        "util/murmurhash.cc",
+        "util/random.cc",
+        "util/rate_limiter.cc",
+        "util/ribbon_config.cc",
+        "util/slice.cc",
+        "util/status.cc",
+        "util/stderr_logger.cc",
+        "util/string_util.cc",
+        "util/thread_local.cc",
+        "util/threadpool_imp.cc",
+        "util/xxhash.cc",
+        "utilities/agg_merge/agg_merge.cc",
+        "utilities/backup/backup_engine.cc",
+        "utilities/blob_db/blob_compaction_filter.cc",
+        "utilities/blob_db/blob_db.cc",
+        "utilities/blob_db/blob_db_impl.cc",
+        "utilities/blob_db/blob_db_impl_filesnapshot.cc",
+        "utilities/blob_db/blob_dump_tool.cc",
+        "utilities/blob_db/blob_file.cc",
+        "utilities/cache_dump_load.cc",
+        "utilities/cache_dump_load_impl.cc",
+        "utilities/cassandra/cassandra_compaction_filter.cc",
+        "utilities/cassandra/format.cc",
+        "utilities/cassandra/merge_operator.cc",
+        "utilities/checkpoint/checkpoint_impl.cc",
+        "utilities/compaction_filters.cc",
+        "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc",
+        "utilities/convenience/info_log_finder.cc",
+        "utilities/counted_fs.cc",
+        "utilities/debug.cc",
+        "utilities/env_mirror.cc",
+        "utilities/env_timed.cc",
+        "utilities/fault_injection_env.cc",
+        "utilities/fault_injection_fs.cc",
+        "utilities/fault_injection_secondary_cache.cc",
+        "utilities/leveldb_options/leveldb_options.cc",
+        "utilities/memory/memory_util.cc",
+        "utilities/merge_operators.cc",
+        "utilities/merge_operators/bytesxor.cc",
+        "utilities/merge_operators/max.cc",
+        "utilities/merge_operators/put.cc",
+        "utilities/merge_operators/sortlist.cc",
+        "utilities/merge_operators/string_append/stringappend.cc",
+        "utilities/merge_operators/string_append/stringappend2.cc",
+        "utilities/merge_operators/uint64add.cc",
+        "utilities/object_registry.cc",
+        "utilities/option_change_migration/option_change_migration.cc",
+        "utilities/options/options_util.cc",
+        "utilities/persistent_cache/block_cache_tier.cc",
+        "utilities/persistent_cache/block_cache_tier_file.cc",
+        "utilities/persistent_cache/block_cache_tier_metadata.cc",
+        "utilities/persistent_cache/persistent_cache_tier.cc",
+        "utilities/persistent_cache/volatile_tier_impl.cc",
+        "utilities/simulator_cache/cache_simulator.cc",
+        "utilities/simulator_cache/sim_cache.cc",
+        "utilities/table_properties_collectors/compact_on_deletion_collector.cc",
+        "utilities/trace/file_trace_reader_writer.cc",
+        "utilities/trace/replayer_impl.cc",
+        "utilities/transactions/lock/lock_manager.cc",
+        "utilities/transactions/lock/point/point_lock_manager.cc",
+        "utilities/transactions/lock/point/point_lock_tracker.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc",
+        "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
+        "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
+        "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
+        "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc",
+        "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc",
+        "utilities/transactions/optimistic_transaction.cc",
+        "utilities/transactions/optimistic_transaction_db_impl.cc",
+        "utilities/transactions/pessimistic_transaction.cc",
+        "utilities/transactions/pessimistic_transaction_db.cc",
+        "utilities/transactions/snapshot_checker.cc",
+        "utilities/transactions/transaction_base.cc",
+        "utilities/transactions/transaction_db_mutex_impl.cc",
+        "utilities/transactions/transaction_util.cc",
+        "utilities/transactions/write_prepared_txn.cc",
+        "utilities/transactions/write_prepared_txn_db.cc",
+        "utilities/transactions/write_unprepared_txn.cc",
+        "utilities/transactions/write_unprepared_txn_db.cc",
+        "utilities/ttl/db_ttl_impl.cc",
+        "utilities/wal_filter.cc",
+        "utilities/write_batch_with_index/write_batch_with_index.cc",
+        "utilities/write_batch_with_index/write_batch_with_index_internal.cc",
+    ], deps=[
+        "//folly/container:f14_hash",
+        "//folly/experimental/coro:blocking_wait",
+        "//folly/experimental/coro:collect",
+        "//folly/experimental/coro:coroutine",
+        "//folly/experimental/coro:task",
+        "//folly/synchronization:distributed_mutex",
+    ], headers=None, link_whole=False, extra_test_libs=False)
+
+cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
+        "cache/cache.cc",
+        "cache/cache_entry_roles.cc",
+        "cache/cache_key.cc",
+        "cache/cache_reservation_manager.cc",
+        "cache/charged_cache.cc",
+        "cache/clock_cache.cc",
+        "cache/compressed_secondary_cache.cc",
+        "cache/lru_cache.cc",
+        "cache/secondary_cache.cc",
+        "cache/sharded_cache.cc",
+        "db/arena_wrapped_db_iter.cc",
+        "db/blob/blob_contents.cc",
+        "db/blob/blob_fetcher.cc",
+        "db/blob/blob_file_addition.cc",
+        "db/blob/blob_file_builder.cc",
+        "db/blob/blob_file_cache.cc",
+        "db/blob/blob_file_garbage.cc",
+        "db/blob/blob_file_meta.cc",
+        "db/blob/blob_file_reader.cc",
+        "db/blob/blob_garbage_meter.cc",
+        "db/blob/blob_log_format.cc",
+        "db/blob/blob_log_sequential_reader.cc",
+        "db/blob/blob_log_writer.cc",
+        "db/blob/blob_source.cc",
+        "db/blob/prefetch_buffer_collection.cc",
+        "db/builder.cc",
+        "db/c.cc",
+        "db/column_family.cc",
+        "db/compaction/compaction.cc",
+        "db/compaction/compaction_iterator.cc",
+        "db/compaction/compaction_job.cc",
+        "db/compaction/compaction_outputs.cc",
+        "db/compaction/compaction_picker.cc",
+        "db/compaction/compaction_picker_fifo.cc",
+        "db/compaction/compaction_picker_level.cc",
+        "db/compaction/compaction_picker_universal.cc",
+        "db/compaction/compaction_service_job.cc",
+        "db/compaction/compaction_state.cc",
+        "db/compaction/sst_partitioner.cc",
+        "db/compaction/subcompaction_state.cc",
+        "db/convenience.cc",
+        "db/db_filesnapshot.cc",
+        "db/db_impl/compacted_db_impl.cc",
+        "db/db_impl/db_impl.cc",
+        "db/db_impl/db_impl_compaction_flush.cc",
+        "db/db_impl/db_impl_debug.cc",
+        "db/db_impl/db_impl_experimental.cc",
+        "db/db_impl/db_impl_files.cc",
+        "db/db_impl/db_impl_open.cc",
+        "db/db_impl/db_impl_readonly.cc",
+        "db/db_impl/db_impl_secondary.cc",
+        "db/db_impl/db_impl_write.cc",
+        "db/db_info_dumper.cc",
+        "db/db_iter.cc",
+        "db/dbformat.cc",
+        "db/error_handler.cc",
+        "db/event_helpers.cc",
+        "db/experimental.cc",
+        "db/external_sst_file_ingestion_job.cc",
+        "db/file_indexer.cc",
+        "db/flush_job.cc",
+        "db/flush_scheduler.cc",
+        "db/forward_iterator.cc",
+        "db/import_column_family_job.cc",
+        "db/internal_stats.cc",
+        "db/log_reader.cc",
+        "db/log_writer.cc",
+        "db/logs_with_prep_tracker.cc",
+        "db/malloc_stats.cc",
+        "db/memtable.cc",
+        "db/memtable_list.cc",
+        "db/merge_helper.cc",
+        "db/merge_operator.cc",
+        "db/output_validator.cc",
+        "db/periodic_task_scheduler.cc",
+        "db/range_del_aggregator.cc",
+        "db/range_tombstone_fragmenter.cc",
+        "db/repair.cc",
+        "db/seqno_to_time_mapping.cc",
+        "db/snapshot_impl.cc",
+        "db/table_cache.cc",
+        "db/table_properties_collector.cc",
+        "db/transaction_log_impl.cc",
+        "db/trim_history_scheduler.cc",
+        "db/version_builder.cc",
+        "db/version_edit.cc",
+        "db/version_edit_handler.cc",
+        "db/version_set.cc",
+        "db/wal_edit.cc",
+        "db/wal_manager.cc",
+        "db/wide/wide_column_serialization.cc",
+        "db/wide/wide_columns.cc",
+        "db/write_batch.cc",
+        "db/write_batch_base.cc",
+        "db/write_controller.cc",
+        "db/write_thread.cc",
+        "env/composite_env.cc",
+        "env/env.cc",
+        "env/env_chroot.cc",
+        "env/env_encryption.cc",
+        "env/env_posix.cc",
+        "env/file_system.cc",
+        "env/file_system_tracer.cc",
+        "env/fs_posix.cc",
+        "env/fs_remap.cc",
+        "env/io_posix.cc",
+        "env/mock_env.cc",
+        "env/unique_id_gen.cc",
+        "file/delete_scheduler.cc",
+        "file/file_prefetch_buffer.cc",
+        "file/file_util.cc",
+        "file/filename.cc",
+        "file/line_file_reader.cc",
+        "file/random_access_file_reader.cc",
+        "file/read_write_util.cc",
+        "file/readahead_raf.cc",
+        "file/sequence_file_reader.cc",
+        "file/sst_file_manager_impl.cc",
+        "file/writable_file_writer.cc",
+        "logging/auto_roll_logger.cc",
+        "logging/event_logger.cc",
+        "logging/log_buffer.cc",
+        "memory/arena.cc",
+        "memory/concurrent_arena.cc",
+        "memory/jemalloc_nodump_allocator.cc",
+        "memory/memkind_kmem_allocator.cc",
+        "memory/memory_allocator.cc",
+        "memtable/alloc_tracker.cc",
+        "memtable/hash_linklist_rep.cc",
+        "memtable/hash_skiplist_rep.cc",
+        "memtable/skiplistrep.cc",
+        "memtable/vectorrep.cc",
+        "memtable/write_buffer_manager.cc",
+        "monitoring/histogram.cc",
+        "monitoring/histogram_windowing.cc",
+        "monitoring/in_memory_stats_history.cc",
+        "monitoring/instrumented_mutex.cc",
+        "monitoring/iostats_context.cc",
+        "monitoring/perf_context.cc",
+        "monitoring/perf_level.cc",
+        "monitoring/persistent_stats_history.cc",
+        "monitoring/statistics.cc",
+        "monitoring/thread_status_impl.cc",
+        "monitoring/thread_status_updater.cc",
+        "monitoring/thread_status_updater_debug.cc",
+        "monitoring/thread_status_util.cc",
+        "monitoring/thread_status_util_debug.cc",
+        "options/cf_options.cc",
+        "options/configurable.cc",
+        "options/customizable.cc",
+        "options/db_options.cc",
+        "options/options.cc",
+        "options/options_helper.cc",
+        "options/options_parser.cc",
+        "port/port_posix.cc",
+        "port/stack_trace.cc",
+        "port/win/env_default.cc",
+        "port/win/env_win.cc",
+        "port/win/io_win.cc",
+        "port/win/port_win.cc",
+        "port/win/win_logger.cc",
+        "port/win/win_thread.cc",
+        "table/adaptive/adaptive_table_factory.cc",
+        "table/block_based/binary_search_index_reader.cc",
+        "table/block_based/block.cc",
+        "table/block_based/block_based_table_builder.cc",
+        "table/block_based/block_based_table_factory.cc",
+        "table/block_based/block_based_table_iterator.cc",
+        "table/block_based/block_based_table_reader.cc",
+        "table/block_based/block_builder.cc",
+        "table/block_based/block_prefetcher.cc",
+        "table/block_based/block_prefix_index.cc",
+        "table/block_based/data_block_footer.cc",
+        "table/block_based/data_block_hash_index.cc",
+        "table/block_based/filter_block_reader_common.cc",
+        "table/block_based/filter_policy.cc",
+        "table/block_based/flush_block_policy.cc",
+        "table/block_based/full_filter_block.cc",
+        "table/block_based/hash_index_reader.cc",
+        "table/block_based/index_builder.cc",
+        "table/block_based/index_reader_common.cc",
+        "table/block_based/parsed_full_filter_block.cc",
+        "table/block_based/partitioned_filter_block.cc",
+        "table/block_based/partitioned_index_iterator.cc",
+        "table/block_based/partitioned_index_reader.cc",
+        "table/block_based/reader_common.cc",
+        "table/block_based/uncompression_dict_reader.cc",
+        "table/block_fetcher.cc",
+        "table/cuckoo/cuckoo_table_builder.cc",
+        "table/cuckoo/cuckoo_table_factory.cc",
+        "table/cuckoo/cuckoo_table_reader.cc",
+        "table/format.cc",
+        "table/get_context.cc",
+        "table/iterator.cc",
+        "table/merging_iterator.cc",
+        "table/meta_blocks.cc",
+        "table/persistent_cache_helper.cc",
+        "table/plain/plain_table_bloom.cc",
+        "table/plain/plain_table_builder.cc",
+        "table/plain/plain_table_factory.cc",
+        "table/plain/plain_table_index.cc",
+        "table/plain/plain_table_key_coding.cc",
+        "table/plain/plain_table_reader.cc",
+        "table/sst_file_dumper.cc",
+        "table/sst_file_reader.cc",
+        "table/sst_file_writer.cc",
+        "table/table_factory.cc",
+        "table/table_properties.cc",
+        "table/two_level_iterator.cc",
+        "table/unique_id.cc",
+        "test_util/sync_point.cc",
+        "test_util/sync_point_impl.cc",
+        "test_util/transaction_test_util.cc",
+        "tools/dump/db_dump_tool.cc",
+        "tools/io_tracer_parser_tool.cc",
+        "tools/ldb_cmd.cc",
+        "tools/ldb_tool.cc",
+        "tools/sst_dump_tool.cc",
+        "trace_replay/block_cache_tracer.cc",
+        "trace_replay/io_tracer.cc",
+        "trace_replay/trace_record.cc",
+        "trace_replay/trace_record_handler.cc",
+        "trace_replay/trace_record_result.cc",
+        "trace_replay/trace_replay.cc",
+        "util/async_file_reader.cc",
+        "util/build_version.cc",
+        "util/cleanable.cc",
+        "util/coding.cc",
+        "util/compaction_job_stats_impl.cc",
+        "util/comparator.cc",
+        "util/compression.cc",
+        "util/compression_context_cache.cc",
+        "util/concurrent_task_limiter_impl.cc",
+        "util/crc32c.cc",
+        "util/crc32c_arm64.cc",
+        "util/dynamic_bloom.cc",
+        "util/file_checksum_helper.cc",
+        "util/hash.cc",
+        "util/murmurhash.cc",
+        "util/random.cc",
+        "util/rate_limiter.cc",
+        "util/ribbon_config.cc",
+        "util/slice.cc",
+        "util/status.cc",
+        "util/stderr_logger.cc",
+        "util/string_util.cc",
+        "util/thread_local.cc",
+        "util/threadpool_imp.cc",
+        "util/xxhash.cc",
+        "utilities/agg_merge/agg_merge.cc",
+        "utilities/backup/backup_engine.cc",
+        "utilities/blob_db/blob_compaction_filter.cc",
+        "utilities/blob_db/blob_db.cc",
+        "utilities/blob_db/blob_db_impl.cc",
+        "utilities/blob_db/blob_db_impl_filesnapshot.cc",
+        "utilities/blob_db/blob_dump_tool.cc",
+        "utilities/blob_db/blob_file.cc",
+        "utilities/cache_dump_load.cc",
+        "utilities/cache_dump_load_impl.cc",
+        "utilities/cassandra/cassandra_compaction_filter.cc",
+        "utilities/cassandra/format.cc",
+        "utilities/cassandra/merge_operator.cc",
+        "utilities/checkpoint/checkpoint_impl.cc",
+        "utilities/compaction_filters.cc",
+        "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc",
+        "utilities/convenience/info_log_finder.cc",
+        "utilities/counted_fs.cc",
+        "utilities/debug.cc",
+        "utilities/env_mirror.cc",
+        "utilities/env_timed.cc",
+        "utilities/fault_injection_env.cc",
+        "utilities/fault_injection_fs.cc",
+        "utilities/fault_injection_secondary_cache.cc",
+        "utilities/leveldb_options/leveldb_options.cc",
+        "utilities/memory/memory_util.cc",
+        "utilities/merge_operators.cc",
+        "utilities/merge_operators/bytesxor.cc",
+        "utilities/merge_operators/max.cc",
+        "utilities/merge_operators/put.cc",
+        "utilities/merge_operators/sortlist.cc",
+        "utilities/merge_operators/string_append/stringappend.cc",
+        "utilities/merge_operators/string_append/stringappend2.cc",
+        "utilities/merge_operators/uint64add.cc",
+        "utilities/object_registry.cc",
+        "utilities/option_change_migration/option_change_migration.cc",
+        "utilities/options/options_util.cc",
+        "utilities/persistent_cache/block_cache_tier.cc",
+        "utilities/persistent_cache/block_cache_tier_file.cc",
+        "utilities/persistent_cache/block_cache_tier_metadata.cc",
+        "utilities/persistent_cache/persistent_cache_tier.cc",
+        "utilities/persistent_cache/volatile_tier_impl.cc",
+        "utilities/simulator_cache/cache_simulator.cc",
+        "utilities/simulator_cache/sim_cache.cc",
+        "utilities/table_properties_collectors/compact_on_deletion_collector.cc",
+        "utilities/trace/file_trace_reader_writer.cc",
+        "utilities/trace/replayer_impl.cc",
+        "utilities/transactions/lock/lock_manager.cc",
+        "utilities/transactions/lock/point/point_lock_manager.cc",
+        "utilities/transactions/lock/point/point_lock_tracker.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc",
+        "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc",
+        "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
+        "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
+        "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
+        "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc",
+        "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc",
+        "utilities/transactions/optimistic_transaction.cc",
+        "utilities/transactions/optimistic_transaction_db_impl.cc",
+        "utilities/transactions/pessimistic_transaction.cc",
+        "utilities/transactions/pessimistic_transaction_db.cc",
+        "utilities/transactions/snapshot_checker.cc",
+        "utilities/transactions/transaction_base.cc",
+        "utilities/transactions/transaction_db_mutex_impl.cc",
+        "utilities/transactions/transaction_util.cc",
+        "utilities/transactions/write_prepared_txn.cc",
+        "utilities/transactions/write_prepared_txn_db.cc",
+        "utilities/transactions/write_unprepared_txn.cc",
+        "utilities/transactions/write_unprepared_txn_db.cc",
+        "utilities/ttl/db_ttl_impl.cc",
+        "utilities/wal_filter.cc",
+        "utilities/write_batch_with_index/write_batch_with_index.cc",
+        "utilities/write_batch_with_index/write_batch_with_index_internal.cc",
+    ], deps=[
+        "//folly/container:f14_hash",
+        "//folly/experimental/coro:blocking_wait",
+        "//folly/experimental/coro:collect",
+        "//folly/experimental/coro:coroutine",
+        "//folly/experimental/coro:task",
+        "//folly/synchronization:distributed_mutex",
+    ], headers=None, link_whole=True, extra_test_libs=False)
+
+cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
+        "db/db_test_util.cc",
+        "db/db_with_timestamp_test_util.cc",
+        "table/mock_table.cc",
+        "test_util/mock_time_env.cc",
+        "test_util/testharness.cc",
+        "test_util/testutil.cc",
+        "tools/block_cache_analyzer/block_cache_trace_analyzer.cc",
+        "tools/trace_analyzer_tool.cc",
+        "utilities/agg_merge/test_agg_merge.cc",
+        "utilities/cassandra/test_utils.cc",
+    ], deps=[":rocksdb_lib"], headers=None, link_whole=False, extra_test_libs=True)
+
+cpp_library_wrapper(name="rocksdb_tools_lib", srcs=[
+        "test_util/testutil.cc",
+        "tools/block_cache_analyzer/block_cache_trace_analyzer.cc",
+        "tools/db_bench_tool.cc",
+        "tools/simulated_hybrid_file_system.cc",
+        "tools/trace_analyzer_tool.cc",
+    ], deps=[":rocksdb_lib"], headers=None, link_whole=False, extra_test_libs=False)
+
+cpp_library_wrapper(name="rocksdb_cache_bench_tools_lib", srcs=["cache/cache_bench_tool.cc"], deps=[":rocksdb_lib"], headers=None, link_whole=False, extra_test_libs=False)
+
+rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
+        "db_stress_tool/batched_ops_stress.cc",
+        "db_stress_tool/cf_consistency_stress.cc",
+        "db_stress_tool/db_stress_common.cc",
+        "db_stress_tool/db_stress_driver.cc",
+        "db_stress_tool/db_stress_gflags.cc",
+        "db_stress_tool/db_stress_listener.cc",
+        "db_stress_tool/db_stress_shared_state.cc",
+        "db_stress_tool/db_stress_stat.cc",
+        "db_stress_tool/db_stress_test_base.cc",
+        "db_stress_tool/db_stress_tool.cc",
+        "db_stress_tool/expected_state.cc",
+        "db_stress_tool/multi_ops_txns_stress.cc",
+        "db_stress_tool/no_batched_ops_stress.cc",
+        "test_util/testutil.cc",
+        "tools/block_cache_analyzer/block_cache_trace_analyzer.cc",
+        "tools/trace_analyzer_tool.cc",
+    ], headers=None)
+
+
+cpp_binary_wrapper(name="db_stress", srcs=["db_stress_tool/db_stress.cc"], deps=[":rocksdb_stress_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
+
+cpp_binary_wrapper(name="ribbon_bench", srcs=["microbench/ribbon_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True)
+
+cpp_binary_wrapper(name="db_basic_bench", srcs=["microbench/db_basic_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True)
+
+add_c_test_wrapper()
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_0", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'put_mean',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads'],
+                    'DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads'],
+                  'FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads']}}, slow=False, expected_runtime=2438, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_1", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'put_mean',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_2", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads']}}, slow=False, expected_runtime=2446, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_3", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads'],
+                    'DataBlockSeek/iterations:1000000': ['real_time',
+                                                         'cpu_time',
+                                                         'seek_ns',
+                                                         'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_4", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads'],
+                    'RandomAccessFileReaderRead/enable_statistics:1/iterations:1000000': ['real_time',
+                                                                                          'cpu_time',
+                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads',
+                                                                                                          'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads',
+                                                                                                          'fp_pct'],
+                  'FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_5", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads',
+                                                                                                          'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_6", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'put_mean',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_7", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'put_mean',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'RandomAccessFileReaderRead/enable_statistics:0/iterations:1000000': ['real_time',
+                                                                                          'cpu_time',
+                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct']}}, slow=False, expected_runtime=2438, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_8", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_9", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads',
+                                                                                                          'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_10", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_11", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'put_mean',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads']}}, slow=False, expected_runtime=2446, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_12", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads',
+                                                                                                          'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads',
+                                                                                                          'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_13", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_14", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads'],
+                    'DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:51200/threads:8': ['real_time',
+                                                                                                                                       'put_mean',
+                                                                                                                                       'cpu_time',
+                                                                                                                                       'db_size',
+                                                                                                                                       'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads']}}, slow=False, expected_runtime=2437, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_0_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88891, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_1_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88804, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_2_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88803, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_3_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88891, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_4_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88809, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_5_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88803, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_6_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88813, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_7_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88813, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_8_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88709, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_9_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88711, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_10_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88819, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_11_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88711, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_12_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88709, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_13_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88709, sl_iterations=3, regression_threshold=10)
+
+
+fancy_bench_wrapper(suite_name="rocksdb_microbench_suite_14_slow", binary_to_bench_to_metric_list_map={'db_basic_bench': {'DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'db_size',
+                                                                                                                                                                'neg_qu_pct',
+                                                                                                                                                                'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'db_size',
+                                                                                                                                                              'neg_qu_pct',
+                                                                                                                                                              'threads'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                               'get_mean',
+                                                                                                                                                               'threads',
+                                                                                                                                                               'real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'neg_qu_pct'],
+                    'DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                               'cpu_time',
+                                                                                                                                                               'db_size',
+                                                                                                                                                               'neg_qu_pct',
+                                                                                                                                                               'threads'],
+                    'DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['db_size',
+                                                                                                                                                              'get_mean',
+                                                                                                                                                              'threads',
+                                                                                                                                                              'real_time',
+                                                                                                                                                              'cpu_time',
+                                                                                                                                                              'neg_qu_pct'],
+                    'DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['db_size',
+                                                                                                                                                                'get_mean',
+                                                                                                                                                                'threads',
+                                                                                                                                                                'real_time',
+                                                                                                                                                                'cpu_time',
+                                                                                                                                                                'neg_qu_pct'],
+                    'DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1': ['real_time',
+                                                                                                                                        'put_mean',
+                                                                                                                                        'cpu_time',
+                                                                                                                                        'db_size',
+                                                                                                                                        'threads'],
+                    'IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'db_size',
+                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                     'cpu_time',
+                                                                                                                                                                     'db_size',
+                                                                                                                                                                     'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                       'cpu_time',
+                                                                                                                                                                       'db_size',
+                                                                                                                                                                       'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1': ['real_time',
+                                                                                                                                                                      'cpu_time',
+                                                                                                                                                                      'db_size',
+                                                                                                                                                                      'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                  'cpu_time',
+                                                                                                                                                  'db_size',
+                                                                                                                                                  'threads'],
+                    'PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads'],
+                    'PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                         'cpu_time',
+                                                                                                                                         'db_size',
+                                                                                                                                         'threads'],
+                    'PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8': ['real_time',
+                                                                                                                                                   'cpu_time',
+                                                                                                                                                   'db_size',
+                                                                                                                                                   'threads'],
+                    'PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240': ['real_time',
+                                                                                                                                          'cpu_time',
+                                                                                                                                          'db_size',
+                                                                                                                                          'threads']},
+ 'ribbon_bench': {'FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                              'cpu_time',
+                                                                                              'threads',
+                                                                                              'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                  'cpu_time',
+                                                                                                  'threads',
+                                                                                                  'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                 'cpu_time',
+                                                                                                 'threads',
+                                                                                                 'size'],
+                  'FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                               'cpu_time',
+                                                                                               'threads',
+                                                                                               'size'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024': ['real_time',
+                                                                                                      'cpu_time',
+                                                                                                      'threads',
+                                                                                                      'fp_pct'],
+                  'FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024': ['real_time',
+                                                                                                       'cpu_time',
+                                                                                                       'threads',
+                                                                                                       'fp_pct'],
+                  'FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads',
+                                                                                                         'fp_pct'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576': ['real_time',
+                                                                                                         'cpu_time',
+                                                                                                         'threads'],
+                  'FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads'],
+                  'FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576': ['real_time',
+                                                                                                          'cpu_time',
+                                                                                                          'threads']}}, slow=True, expected_runtime=88711, sl_iterations=3, regression_threshold=10)
+
+
+        # Generate a test rule for each entry in ROCKS_TESTS
+        # Do not build the tests in opt mode, since SyncPoint and other test code
+        # will not be included.
+
+cpp_unittest_wrapper(name="agg_merge_test",
+            srcs=["utilities/agg_merge/agg_merge_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="arena_test",
+            srcs=["memory/arena_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="auto_roll_logger_test",
+            srcs=["logging/auto_roll_logger_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="autovector_test",
+            srcs=["util/autovector_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="backup_engine_test",
+            srcs=["utilities/backup/backup_engine_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="blob_counting_iterator_test",
+            srcs=["db/blob/blob_counting_iterator_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="blob_db_test",
+            srcs=["utilities/blob_db/blob_db_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="blob_file_addition_test",
+            srcs=["db/blob/blob_file_addition_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="blob_file_builder_test",
+            srcs=["db/blob/blob_file_builder_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="blob_file_cache_test",
+            srcs=["db/blob/blob_file_cache_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="blob_file_garbage_test",
+            srcs=["db/blob/blob_file_garbage_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="blob_file_reader_test",
+            srcs=["db/blob/blob_file_reader_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="blob_garbage_meter_test",
+            srcs=["db/blob/blob_garbage_meter_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="blob_source_test",
+            srcs=["db/blob/blob_source_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="block_based_table_reader_test",
+            srcs=["table/block_based/block_based_table_reader_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="block_cache_trace_analyzer_test",
+            srcs=["tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="block_cache_tracer_test",
+            srcs=["trace_replay/block_cache_tracer_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="block_fetcher_test",
+            srcs=["table/block_fetcher_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="block_test",
+            srcs=["table/block_based/block_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="bloom_test",
+            srcs=["util/bloom_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cache_reservation_manager_test",
+            srcs=["cache/cache_reservation_manager_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cache_simulator_test",
+            srcs=["utilities/simulator_cache/cache_simulator_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cache_test",
+            srcs=["cache/cache_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cassandra_format_test",
+            srcs=["utilities/cassandra/cassandra_format_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cassandra_functional_test",
+            srcs=["utilities/cassandra/cassandra_functional_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cassandra_row_merge_test",
+            srcs=["utilities/cassandra/cassandra_row_merge_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cassandra_serialize_test",
+            srcs=["utilities/cassandra/cassandra_serialize_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="checkpoint_test",
+            srcs=["utilities/checkpoint/checkpoint_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cleanable_test",
+            srcs=["table/cleanable_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="clipping_iterator_test",
+            srcs=["db/compaction/clipping_iterator_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="coding_test",
+            srcs=["util/coding_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="column_family_test",
+            srcs=["db/column_family_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="compact_files_test",
+            srcs=["db/compact_files_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="compact_on_deletion_collector_test",
+            srcs=["utilities/table_properties_collectors/compact_on_deletion_collector_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="compaction_iterator_test",
+            srcs=["db/compaction/compaction_iterator_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="compaction_job_stats_test",
+            srcs=["db/compaction/compaction_job_stats_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="compaction_job_test",
+            srcs=["db/compaction/compaction_job_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="compaction_picker_test",
+            srcs=["db/compaction/compaction_picker_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="compaction_service_test",
+            srcs=["db/compaction/compaction_service_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="comparator_db_test",
+            srcs=["db/comparator_db_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="compressed_secondary_cache_test",
+            srcs=["cache/compressed_secondary_cache_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="configurable_test",
+            srcs=["options/configurable_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="corruption_test",
+            srcs=["db/corruption_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="crc32c_test",
+            srcs=["util/crc32c_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cuckoo_table_builder_test",
+            srcs=["table/cuckoo/cuckoo_table_builder_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cuckoo_table_db_test",
+            srcs=["db/cuckoo_table_db_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="cuckoo_table_reader_test",
+            srcs=["table/cuckoo/cuckoo_table_reader_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="customizable_test",
+            srcs=["options/customizable_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="data_block_hash_index_test",
+            srcs=["table/block_based/data_block_hash_index_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_basic_test",
+            srcs=["db/db_basic_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_blob_basic_test",
+            srcs=["db/blob/db_blob_basic_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_blob_compaction_test",
+            srcs=["db/blob/db_blob_compaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_blob_corruption_test",
+            srcs=["db/blob/db_blob_corruption_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_blob_index_test",
+            srcs=["db/blob/db_blob_index_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_block_cache_test",
+            srcs=["db/db_block_cache_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_bloom_filter_test",
+            srcs=["db/db_bloom_filter_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_compaction_filter_test",
+            srcs=["db/db_compaction_filter_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_compaction_test",
+            srcs=["db/db_compaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_dynamic_level_test",
+            srcs=["db/db_dynamic_level_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_encryption_test",
+            srcs=["db/db_encryption_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_flush_test",
+            srcs=["db/db_flush_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_inplace_update_test",
+            srcs=["db/db_inplace_update_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_io_failure_test",
+            srcs=["db/db_io_failure_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_iter_stress_test",
+            srcs=["db/db_iter_stress_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_iter_test",
+            srcs=["db/db_iter_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_iterator_test",
+            srcs=["db/db_iterator_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_kv_checksum_test",
+            srcs=["db/db_kv_checksum_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_log_iter_test",
+            srcs=["db/db_log_iter_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_logical_block_size_cache_test",
+            srcs=["db/db_logical_block_size_cache_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_memtable_test",
+            srcs=["db/db_memtable_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_merge_operand_test",
+            srcs=["db/db_merge_operand_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_merge_operator_test",
+            srcs=["db/db_merge_operator_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_options_test",
+            srcs=["db/db_options_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_properties_test",
+            srcs=["db/db_properties_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_range_del_test",
+            srcs=["db/db_range_del_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_rate_limiter_test",
+            srcs=["db/db_rate_limiter_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_readonly_with_timestamp_test",
+            srcs=["db/db_readonly_with_timestamp_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_secondary_test",
+            srcs=["db/db_secondary_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_sst_test",
+            srcs=["db/db_sst_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_statistics_test",
+            srcs=["db/db_statistics_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_table_properties_test",
+            srcs=["db/db_table_properties_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_tailing_iter_test",
+            srcs=["db/db_tailing_iter_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_test",
+            srcs=["db/db_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_test2",
+            srcs=["db/db_test2.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_universal_compaction_test",
+            srcs=["db/db_universal_compaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_wal_test",
+            srcs=["db/db_wal_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_wide_basic_test",
+            srcs=["db/wide/db_wide_basic_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_with_timestamp_basic_test",
+            srcs=["db/db_with_timestamp_basic_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_with_timestamp_compaction_test",
+            srcs=["db/db_with_timestamp_compaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_write_buffer_manager_test",
+            srcs=["db/db_write_buffer_manager_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="db_write_test",
+            srcs=["db/db_write_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="dbformat_test",
+            srcs=["db/dbformat_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="defer_test",
+            srcs=["util/defer_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="delete_scheduler_test",
+            srcs=["file/delete_scheduler_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="deletefile_test",
+            srcs=["db/deletefile_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="dynamic_bloom_test",
+            srcs=["util/dynamic_bloom_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_library_wrapper(name="env_basic_test_lib", srcs=["env/env_basic_test.cc"], deps=[":rocksdb_test_lib"], headers=None, link_whole=False, extra_test_libs=True)
+
+cpp_unittest_wrapper(name="env_basic_test",
+            srcs=["env/env_basic_test.cc"],
+            deps=[":env_basic_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="env_logger_test",
+            srcs=["logging/env_logger_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="env_test",
+            srcs=["env/env_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="env_timed_test",
+            srcs=["utilities/env_timed_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="error_handler_fs_test",
+            srcs=["db/error_handler_fs_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="event_logger_test",
+            srcs=["logging/event_logger_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="external_sst_file_basic_test",
+            srcs=["db/external_sst_file_basic_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="external_sst_file_test",
+            srcs=["db/external_sst_file_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="fault_injection_test",
+            srcs=["db/fault_injection_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="file_indexer_test",
+            srcs=["db/file_indexer_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="file_reader_writer_test",
+            srcs=["util/file_reader_writer_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="filelock_test",
+            srcs=["util/filelock_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="filename_test",
+            srcs=["db/filename_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="flush_job_test",
+            srcs=["db/flush_job_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="full_filter_block_test",
+            srcs=["table/block_based/full_filter_block_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="hash_table_test",
+            srcs=["utilities/persistent_cache/hash_table_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="hash_test",
+            srcs=["util/hash_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="heap_test",
+            srcs=["util/heap_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="histogram_test",
+            srcs=["monitoring/histogram_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="import_column_family_test",
+            srcs=["db/import_column_family_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="inlineskiplist_test",
+            srcs=["memtable/inlineskiplist_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="io_posix_test",
+            srcs=["env/io_posix_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="io_tracer_parser_test",
+            srcs=["tools/io_tracer_parser_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="io_tracer_test",
+            srcs=["trace_replay/io_tracer_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="iostats_context_test",
+            srcs=["monitoring/iostats_context_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="ldb_cmd_test",
+            srcs=["tools/ldb_cmd_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="listener_test",
+            srcs=["db/listener_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="log_test",
+            srcs=["db/log_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="lru_cache_test",
+            srcs=["cache/lru_cache_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="manual_compaction_test",
+            srcs=["db/manual_compaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="memory_allocator_test",
+            srcs=["memory/memory_allocator_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="memory_test",
+            srcs=["utilities/memory/memory_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="memtable_list_test",
+            srcs=["db/memtable_list_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="merge_helper_test",
+            srcs=["db/merge_helper_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="merge_test",
+            srcs=["db/merge_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="merger_test",
+            srcs=["table/merger_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="mock_env_test",
+            srcs=["env/mock_env_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="object_registry_test",
+            srcs=["utilities/object_registry_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="obsolete_files_test",
+            srcs=["db/obsolete_files_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="optimistic_transaction_test",
+            srcs=["utilities/transactions/optimistic_transaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="option_change_migration_test",
+            srcs=["utilities/option_change_migration/option_change_migration_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="options_file_test",
+            srcs=["db/options_file_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="options_settable_test",
+            srcs=["options/options_settable_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="options_test",
+            srcs=["options/options_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="options_util_test",
+            srcs=["utilities/options/options_util_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="partitioned_filter_block_test",
+            srcs=["table/block_based/partitioned_filter_block_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="perf_context_test",
+            srcs=["db/perf_context_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="periodic_task_scheduler_test",
+            srcs=["db/periodic_task_scheduler_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="persistent_cache_test",
+            srcs=["utilities/persistent_cache/persistent_cache_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="plain_table_db_test",
+            srcs=["db/plain_table_db_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="point_lock_manager_test",
+            srcs=["utilities/transactions/lock/point/point_lock_manager_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="prefetch_test",
+            srcs=["file/prefetch_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="prefix_test",
+            srcs=["db/prefix_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="random_access_file_reader_test",
+            srcs=["file/random_access_file_reader_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="random_test",
+            srcs=["util/random_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="range_del_aggregator_test",
+            srcs=["db/range_del_aggregator_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="range_locking_test",
+            srcs=["utilities/transactions/lock/range/range_locking_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="range_tombstone_fragmenter_test",
+            srcs=["db/range_tombstone_fragmenter_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="rate_limiter_test",
+            srcs=["util/rate_limiter_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="reduce_levels_test",
+            srcs=["tools/reduce_levels_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="repair_test",
+            srcs=["db/repair_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="repeatable_thread_test",
+            srcs=["util/repeatable_thread_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="ribbon_test",
+            srcs=["util/ribbon_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="seqno_time_test",
+            srcs=["db/seqno_time_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="sim_cache_test",
+            srcs=["utilities/simulator_cache/sim_cache_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="skiplist_test",
+            srcs=["memtable/skiplist_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="slice_test",
+            srcs=["util/slice_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="slice_transform_test",
+            srcs=["util/slice_transform_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="sst_dump_test",
+            srcs=["tools/sst_dump_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="sst_file_reader_test",
+            srcs=["table/sst_file_reader_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="statistics_test",
+            srcs=["monitoring/statistics_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="stats_history_test",
+            srcs=["monitoring/stats_history_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="stringappend_test",
+            srcs=["utilities/merge_operators/string_append/stringappend_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="table_properties_collector_test",
+            srcs=["db/table_properties_collector_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="table_test",
+            srcs=["table/table_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="testutil_test",
+            srcs=["test_util/testutil_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="thread_list_test",
+            srcs=["util/thread_list_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="thread_local_test",
+            srcs=["util/thread_local_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="tiered_compaction_test",
+            srcs=["db/compaction/tiered_compaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="timer_queue_test",
+            srcs=["util/timer_queue_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="timer_test",
+            srcs=["util/timer_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="timestamped_snapshot_test",
+            srcs=["utilities/transactions/timestamped_snapshot_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="trace_analyzer_test",
+            srcs=["tools/trace_analyzer_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="transaction_test",
+            srcs=["utilities/transactions/transaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="ttl_test",
+            srcs=["utilities/ttl/ttl_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="util_merge_operators_test",
+            srcs=["utilities/util_merge_operators_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="version_builder_test",
+            srcs=["db/version_builder_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="version_edit_test",
+            srcs=["db/version_edit_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="version_set_test",
+            srcs=["db/version_set_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="wal_manager_test",
+            srcs=["db/wal_manager_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="wide_column_serialization_test",
+            srcs=["db/wide/wide_column_serialization_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="work_queue_test",
+            srcs=["util/work_queue_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="write_batch_test",
+            srcs=["db/write_batch_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="write_batch_with_index_test",
+            srcs=["utilities/write_batch_with_index/write_batch_with_index_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="write_buffer_manager_test",
+            srcs=["memtable/write_buffer_manager_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="write_callback_test",
+            srcs=["db/write_callback_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="write_committed_transaction_ts_test",
+            srcs=["utilities/transactions/write_committed_transaction_ts_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="write_controller_test",
+            srcs=["db/write_controller_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="write_prepared_transaction_test",
+            srcs=["utilities/transactions/write_prepared_transaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="write_unprepared_transaction_test",
+            srcs=["utilities/transactions/write_unprepared_transaction_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
diff --git a/src/rocksdb/USERS.md b/src/rocksdb/USERS.md
new file mode 100644
index 000000000..be42b9b0c
--- /dev/null
+++ b/src/rocksdb/USERS.md
@@ -0,0 +1,128 @@
+This document lists users of RocksDB and their use cases. If you are using RocksDB, please open a pull request and add yourself to the list.
+
+## Facebook
+At Facebook, we use RocksDB as storage engines in multiple data management services and a backend for many different stateful services, including:
+
+1. MyRocks -- https://github.com/MySQLOnRocksDB/mysql-5.6
+2. MongoRocks -- https://github.com/mongodb-partners/mongo-rocks
+3. ZippyDB --  Facebook's distributed key-value store with Paxos-style replication, built on top of RocksDB.[1] https://www.youtube.com/watch?v=DfiN7pG0D0khtt
+4. Laser -- Laser is a high query throughput, low (millisecond) latency, key-value storage service built on top of RocksDB.[1]
+4. Dragon -- a distributed graph query engine. https://code.facebook.com/posts/1737605303120405/dragon-a-distributed-graph-query-engine/
+5. Stylus -- a low-level stream processing framework writtenin C++.[1]
+6. LogDevice -- a distributed data store for logs [2]
+
+[1] https://research.facebook.com/publications/realtime-data-processing-at-facebook/
+
+[2] https://code.facebook.com/posts/357056558062811/logdevice-a-distributed-data-store-for-logs/
+
+## LinkedIn
+Two different use cases at Linkedin are using RocksDB as a storage engine:
+
+1. LinkedIn's follow feed for storing user's activities. Check out the blog post: https://engineering.linkedin.com/blog/2016/03/followfeed--linkedin-s-feed-made-faster-and-smarter
+2. Apache Samza, open source framework for stream processing
+
+Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasundaram: http://www.youtube.com/watch?v=plqVp_OnSzg
+
+## Yahoo
+Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights
+
+## Baidu
+[Apache Doris](http://doris.apache.org/master/en/) is a MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablet's metadata.
+
+## CockroachDB
+CockroachDB is an open-source geo-replicated transactional database. They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach
+
+## DNANexus
+DNANexus is using RocksDB to speed up processing of genomics data.
+You can learn more from this great blog post by Mike Lin: http://devblog.dnanexus.com/faster-bam-sorting-with-samtools-and-rocksdb/
+
+## Iron.io
+Iron.io is using RocksDB as a storage engine for their distributed queueing system.
+Learn more from Tech Talk by Reed Allman: http://www.youtube.com/watch?v=HTjt6oj-RL4
+
+## Tango Me
+Tango is using RocksDB as a graph storage to store all users' connection data and other social activity data.
+
+## Turn
+Turn is using RocksDB as a storage layer for their key/value store, serving at peak 2.4MM QPS out of different datacenters.
+Check out our RocksDB Protobuf merge operator at: https://github.com/vladb38/rocksdb_protobuf
+
+## Santander UK/Cloudera Profession Services
+Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santanders-near-real-time-data-ingest-architecture/
+
+## Airbnb
+Airbnb is using RocksDB as a storage engine for their personalized search service. You can learn more about it here: https://www.youtube.com/watch?v=ASQ6XMtogMs
+
+## Alluxio
+[Alluxio](https://www.alluxio.io) uses RocksDB to serve and scale file system metadata to beyond 1 Billion files. The detailed design and implementation is described in this engineering blog:
+https://www.alluxio.io/blog/scalable-metadata-service-in-alluxio-storing-billions-of-files/
+
+## Pinterest
+Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtube.com/watch?v=MtFEVEs_2Vo
+
+## Smyte
+[Smyte](https://www.smyte.com/) uses RocksDB as the storage layer for their core key-value storage, high-performance counters and time-windowed HyperLogLog services.
+
+## Rakuten Marketing
+[Rakuten Marketing](https://marketing.rakuten.com/) uses RocksDB as the disk cache layer for the real-time bidding service in their Performance DSP.
+
+## VWO, Wingify
+[VWO's](https://vwo.com/) Smart Code checker and URL helper uses RocksDB to store all the URLs where VWO's Smart Code is installed.
+
+## quasardb
+[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark.
+quasardb uses a heavily tuned RocksDB as its persistence layer.
+
+## Netflix
+[Netflix](http://techblog.netflix.com/2016/05/application-data-caching-using-ssds.html) Netflix uses RocksDB on AWS EC2 instances with local SSD drives to cache application data.
+
+## TiKV
+[TiKV](https://github.com/pingcap/tikv) is a GEO-replicated, high-performance, distributed, transactional key-value database. TiKV is powered by Rust and Raft. TiKV uses RocksDB as its persistence layer.
+
+## Apache Flink
+[Apache Flink](https://flink.apache.org/news/2016/03/08/release-1.0.0.html) uses RocksDB to store state locally on a machine.
+
+## Dgraph
+[Dgraph](https://github.com/dgraph-io/dgraph) is an open-source, scalable, distributed, low latency, high throughput Graph database .They use RocksDB to store state locally on a machine.
+
+## Uber
+[Uber](http://eng.uber.com/cherami/) uses RocksDB as a durable and scalable task queue.
+
+## 360 Pika
+[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been used in many companies.
+
+## LzLabs
+LzLabs is using RocksDB as a storage engine in their multi-database distributed framework to store application configuration and user data.
+
+## ProfaneDB
+[ProfaneDB](https://profanedb.gitlab.io/) is a database for Protocol Buffers, and uses RocksDB for storage. It is accessible via gRPC, and the schema is defined using directly `.proto` files.
+
+## IOTA Foundation
+ [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things.
+
+## Avrio Project
+ [Avrio Project](http://avrio-project.github.io/avrio.network/) is using RocksDB in [Avrio ](https://github.com/avrio-project/avrio) to store blocks, account balances and data and other blockchain-releated data. Avrio is a multiblockchain decentralized cryptocurrency empowering monetary transactions.
+
+## Crux
+[Crux](https://github.com/juxt/crux) is a document database that uses RocksDB for local [EAV](https://en.wikipedia.org/wiki/Entity%E2%80%93attribute%E2%80%93value_model) index storage to enable point-in-time bitemporal Datalog queries. The "unbundled" architecture uses Kafka to provide horizontal scalability.
+
+## Nebula Graph
+[Nebula Graph](https://github.com/vesoft-inc/nebula) is a distributed, scalable, lightning-fast, open source graph database capable of hosting super large scale graphs with dozens of billions of vertices (nodes) and trillions of edges, with milliseconds of latency.
+
+## YugabyteDB
+[YugabyteDB](https://www.yugabyte.com/) is an open source, high performance, distributed SQL database that uses RocksDB as its storage layer. For more information, please see https://github.com/yugabyte/yugabyte-db/.
+
+## ArangoDB
+[ArangoDB](https://www.arangodb.com/) is a native multi-model database with flexible data models for documents, graphs, and key-values, for building high performance applications using a convenient SQL-like query language or JavaScript extensions. It uses RocksDB as its storage engine.
+
+## Milvus
+[Milvus](https://milvus.io/) is an open source vector database for unstructured data. It uses RocksDB not only as one of the supported kv storage engines, but also as a message queue.
+
+## Kafka
+[Kafka](https://kafka.apache.org/) is an open-source distributed event streaming platform, it uses RocksDB to store state in Kafka Streams: https://www.confluent.io/blog/how-to-tune-rocksdb-kafka-streams-state-stores-performance/.
+
+## Solana Labs
+[Solana](https://github.com/solana-labs/solana) is a fast, secure, scalable, and decentralized blockchain.  It uses RocksDB as the underlying storage for its ledger store.
+
+## Others
+More databases using RocksDB can be found at [dbdb.io](https://dbdb.io/browse?embeds=rocksdb).
diff --git a/src/rocksdb/Vagrantfile b/src/rocksdb/Vagrantfile
new file mode 100644
index 000000000..07f2e99fd
--- /dev/null
+++ b/src/rocksdb/Vagrantfile
@@ -0,0 +1,39 @@
+# Vagrant file
+Vagrant.configure("2") do |config|
+
+  config.vm.provider "virtualbox" do |v|
+    v.memory = 4096
+    v.cpus = 2
+  end
+
+  config.vm.define "ubuntu14" do |box|
+    box.vm.box = "ubuntu/trusty64"
+  end
+
+  config.vm.define "centos65" do |box|
+    box.vm.box = "chef/centos-6.5"
+  end
+
+  config.vm.define "centos7" do |box|
+    box.vm.box = "centos/7"
+    box.vm.provision "shell", path: "build_tools/setup_centos7.sh"
+  end
+
+  config.vm.define "FreeBSD10" do |box|
+    box.vm.guest = :freebsd
+    box.vm.box = "robin/freebsd-10"
+    # FreeBSD does not support 'mount_virtualbox_shared_folder', use NFS
+    box.vm.synced_folder ".", "/vagrant", :nfs => true, id: "vagrant-root"
+    box.vm.network "private_network", ip: "10.0.1.10"
+
+    # build everything after creating VM, skip using --no-provision
+    box.vm.provision "shell", inline: <<-SCRIPT
+      pkg install -y gmake clang35
+      export CXX=/usr/local/bin/clang++35
+      cd /vagrant
+      gmake clean
+      gmake all OPT=-g
+    SCRIPT
+  end
+
+end
diff --git a/src/rocksdb/WINDOWS_PORT.md b/src/rocksdb/WINDOWS_PORT.md
new file mode 100644
index 000000000..a6e4f93dd
--- /dev/null
+++ b/src/rocksdb/WINDOWS_PORT.md
@@ -0,0 +1,228 @@
+# Microsoft Contribution Notes
+
+## Contributors
+* Alexander Zinoviev https://github.com/zinoale
+* Dmitri Smirnov https://github.com/yuslepukhin
+* Praveen Rao  https://github.com/PraveenSinghRao
+* Sherlock Huang  https://github.com/SherlockNoMad
+
+## Introduction
+RocksDB is a well proven open source key-value persistent store, optimized for fast storage. It provides scalability with number of CPUs and storage IOPS, to support IO-bound, in-memory and write-once workloads, most importantly, to be flexible to allow for innovation.
+
+As Microsoft Bing team we have been continuously pushing hard to improve the scalability, efficiency of platform and eventually benefit Bing end-user satisfaction.  We would like to explore the opportunity to embrace open source, RocksDB here, to use, enhance and customize for our usage, and also contribute back to the RocksDB community. Herein, we are pleased to offer this RocksDB port for Windows platform.
+
+These notes describe some decisions and changes we had to make with regards to porting RocksDB on Windows. We hope this will help both reviewers and users of the Windows port.
+We are open for comments and improvements.
+
+## OS specifics
+All of the porting, testing and benchmarking was done on Windows Server 2012 R2 Datacenter 64-bit but to the best of our knowledge there is not a specific API we used during porting that is unsupported on other Windows OS after Vista.
+
+## Porting goals
+We strive to achieve the following goals:
+* make use of the existing porting interface of RocksDB
+* make minimum [WY2]modifications within platform independent code.
+* make all unit test pass both in debug and release builds. 
+  * Note: latest introduction of SyncPoint seems to disable running db_test in Release.
+* make performance on par with published benchmarks accounting for HW differences
+* we would like to keep the port code inline with the main branch with no forking
+
+## Build system
+We have chosen CMake as a widely accepted build system to build the Windows port. It is very fast and convenient. 
+
+At the same time it generates Visual Studio projects that are both usable from a command line and IDE.
+
+The top-level CMakeLists.txt file contains description of all targets and build rules. It also provides brief instructions on how to build the software for Windows. One more build related file is thirdparty.inc that also resides on the top level. This file must be edited to point to actual third party libraries location.
+We think that it would be beneficial to merge the existing make-based build system and the new cmake-based build system into a single one to use on all platforms.
+
+All building and testing was done for 64-bit. We have not conducted any testing for 32-bit and early reports indicate that it will not run on 32-bit.
+
+## C++ and STL notes
+We had to make some minimum changes within the portable files that either account for OS differences or the shortcomings of C++11 support in the current version of the MS compiler. Most or all of them are expected to be fixed in the upcoming compiler releases.
+
+We plan to use this port for our business purposes here at Bing and this provided business justification for this port. This also means, we do not have at present to choose the compiler version at will.
+
+* Certain headers that are not present and not necessary on Windows were simply `#ifndef OS_WIN` in a few places (`unistd.h`)
+* All posix specific headers were replaced to port/port.h which worked well
+* Replaced `dirent.h` for `port/port_dirent.h` (very few places) with the implementation of the relevant interfaces within `rocksdb::port` namespace
+* Replaced `sys/time.h` to `port/sys_time.h` (few places) implemented equivalents within `rocksdb::port`
+* `printf %z` specification is not supported on Windows. To imitate existing standards we came up with a string macro `ROCKSDB_PRIszt` which expands to `zu` on posix systems and to `Iu` on windows.
+* in class member initialization were moved to a __ctors in some cases
+* `constexpr` is not supported. We had to replace `std::numeric_limits<>::max/min()` to its C macros for constants. Sometimes we had to make class members `static const` and place a definition within a .cc file.
+* `constexpr` for functions was replaced to a template specialization (1 place)
+* Union members that have non-trivial constructors were replaced to `char[]` in one place along with bug fixes (spatial experimental feature)
+* Zero-sized arrays are deemed a non-standard extension which we converted to 1 size array and that should work well for the purposes of these classes.
+* `std::chrono` lacks nanoseconds support (fixed in the upcoming release of the STL) and we had to use `QueryPerfCounter()` within env_win.cc
+* Function local statics initialization is still not safe. Used `std::once` to mitigate within WinEnv.
+
+## Windows Environments notes
+We endeavored to make it functionally on par with posix_env. This means we replicated the functionality of the thread pool and other things as precise as possible, including:
+* Replicate posix logic using std:thread primitives.
+* Implement all posix_env disk access functionality.
+* Set `use_os_buffer=false` to disable OS disk buffering for WinWritableFile and WinRandomAccessFile.
+* Replace `pread/pwrite` with `WriteFile/ReadFile` with `OVERLAPPED` structure.
+* Use `SetFileInformationByHandle` to compensate absence of `fallocate`.
+
+### In detail
+Even though Windows provides its own efficient thread-pool implementation we chose to replicate posix logic using `std::thread` primitives. This allows anyone to quickly detect any changes within the posix source code and replicate them within windows env. This has proven to work very well. At the same time for anyone who wishes to replace the built-in thread-pool can do so using RocksDB stackable environments.
+
+For disk access we implemented all of the functionality present within the posix_env which includes memory mapped files, random access, rate-limiter support etc.
+The `use_os_buffer` flag on Posix platforms currently denotes disabling read-ahead log via `fadvise` mechanism. Windows does not have `fadvise` system call. What is more, it implements disk cache in a way that differs from Linux greatly. It's not an uncommon practice on Windows to perform un-buffered disk access to gain control of the memory consumption. We think that in our use case this may also be a good configuration option at the expense of disk throughput. To compensate one may increase the configured in-memory cache size instead. Thus we have chosen  `use_os_buffer=false` to disable OS disk buffering for `WinWritableFile` and `WinRandomAccessFile`. The OS imposes restrictions on the alignment of the disk offsets, buffers used and the amount of data that is read/written when accessing files in un-buffered mode. When the option is true, the classes behave in a standard way. This allows to perform writes and reads in cases when un-buffered access does not make sense such as WAL and MANIFEST.
+
+We have replaced `pread/pwrite` with `WriteFile/ReadFile` with `OVERLAPPED` structure so we can atomically seek to the position of the disk operation but still perform the operation synchronously. Thus we able to emulate that functionality of `pread/pwrite` reasonably well. The only difference is that the file pointer is not returned to its original position but that hardly matters given the random nature of access.
+
+We used `SetFileInformationByHandle` both to truncate files after writing a full final page to disk and to pre-allocate disk space for faster I/O thus compensating for the absence of `fallocate` although some differences remain. For example, the pre-allocated space is not filled with zeros like on Linux, however, on a positive note, the end of file position is also not modified after pre-allocation.
+
+RocksDB renames, copies and deletes files at will even though they may be opened with another handle at the same time. We had to relax and allow nearly all the concurrent access permissions possible.
+
+## Thread-Local Storage
+Thread-Local storage plays a significant role for RocksDB performance. Rather than creating a separate implementation we chose to create inline wrappers that forward `pthread_specific` calls to Windows `Tls` interfaces within `rocksdb::port` namespace. This leaves the existing meat of the logic in tact and unchanged and just as maintainable.
+
+To mitigate the lack of thread local storage cleanup on thread-exit we added a limited amount of windows specific code within the same thread_local.cc file that injects a cleanup callback into a `"__tls"` structure within `".CRT$XLB"` data segment. This approach guarantees that the callback is invoked regardless of whether RocksDB used within an executable, standalone DLL or within another DLL.
+
+## Jemalloc usage
+
+When RocksDB is used with Jemalloc the latter needs to be initialized before any of the C++ globals or statics. To accomplish that we injected an initialization routine into `".CRT$XCT"` that is automatically invoked by the runtime before initializing static objects. je-uninit is queued to `atexit()`. 
+
+The jemalloc redirecting `new/delete` global operators are used by the linker providing certain conditions are met. See build section in these notes.
+
+## Stack Trace and Unhandled Exception Handler
+
+We decided not to implement these two features because the hosting program as a rule has these two things in it.
+We experienced no inconveniences debugging issues in the debugger or analyzing process dumps if need be and thus we did not
+see this as a priority.
+
+## Performance results
+### Setup
+All of the benchmarks are run on the same set of machines. Here are the details of the test setup:
+* 2 Intel(R) Xeon(R) E5 2450 0 @ 2.10 GHz (total 16 cores)
+* 2 XK0480GDQPH SSD Device, total 894GB free disk
+* Machine has 128 GB of RAM
+* Operating System: Windows Server 2012 R2 Datacenter
+* 100 Million keys; each key is of size 10 bytes, each value is of size 800 bytes
+* total database size is ~76GB
+* The performance result is based on RocksDB 3.11.
+* The parameters used, unless specified, were exactly the same as published in the GitHub Wiki page. 
+
+### RocksDB on flash storage
+
+#### Test 1. Bulk Load of keys in Random Order
+
+Version 3.11 
+
+* Total Run Time: 17.6 min
+* Fillrandom: 5.480 micros/op 182465 ops/sec;  142.0 MB/s
+* Compact: 486056544.000 micros/op 0 ops/sec
+
+Version 3.10 
+
+* Total Run Time: 16.2 min 
+* Fillrandom: 5.018 micros/op 199269 ops/sec;  155.1 MB/s 
+* Compact: 441313173.000 micros/op 0 ops/sec; 
+
+
+#### Test 2. Bulk Load of keys in Sequential Order
+
+Version 3.11 
+
+* Fillseq: 4.944 micros/op 202k ops/sec;  157.4 MB/s
+
+Version 3.10
+
+* Fillseq: 4.105 micros/op 243.6k ops/sec;  189.6 MB/s 
+
+
+#### Test 3. Random Write
+
+Version 3.11 
+
+* Unbuffered I/O enabled
+* Overwrite: 52.661 micros/op 18.9k ops/sec;   14.8 MB/s
+
+Version 3.10
+
+* Unbuffered I/O enabled 
+* Overwrite: 52.661 micros/op 18.9k ops/sec; 
+
+
+#### Test 4. Random Read
+
+Version 3.11 
+
+* Unbuffered I/O enabled
+* Readrandom: 15.716 micros/op 63.6k ops/sec; 49.5 MB/s 
+
+Version 3.10
+
+* Unbuffered I/O enabled 
+* Readrandom: 15.548 micros/op 64.3k ops/sec; 
+
+
+#### Test 5. Multi-threaded read and single-threaded write
+
+Version 3.11
+
+* Unbuffered I/O enabled
+* Readwhilewriting: 25.128 micros/op 39.7k ops/sec; 
+
+Version 3.10
+
+* Unbuffered I/O enabled 
+* Readwhilewriting: 24.854 micros/op 40.2k ops/sec; 
+
+
+### RocksDB In Memory 
+
+#### Test 1. Point Lookup
+
+Version 3.11
+
+80K writes/sec
+* Write Rate Achieved: 40.5k write/sec;
+* Readwhilewriting: 0.314 micros/op 3187455 ops/sec;  364.8 MB/s (715454999 of 715454999 found)
+
+Version 3.10
+
+* Write Rate Achieved:  50.6k write/sec 
+* Readwhilewriting: 0.316 micros/op 3162028 ops/sec; (719576999 of 719576999 found) 
+
+
+*10K writes/sec*
+
+Version 3.11
+
+* Write Rate Achieved: 5.8k/s write/sec
+* Readwhilewriting: 0.246 micros/op 4062669 ops/sec;  464.9 MB/s (915481999 of 915481999 found)
+
+Version 3.10
+
+* Write Rate Achieved: 5.8k/s write/sec 
+* Readwhilewriting: 0.244 micros/op 4106253 ops/sec; (927986999 of 927986999 found) 
+
+
+#### Test 2. Prefix Range Query
+
+Version 3.11
+
+80K writes/sec
+* Write Rate Achieved:  46.3k/s write/sec
+* Readwhilewriting: 0.362 micros/op 2765052 ops/sec;  316.4 MB/s (611549999 of 611549999 found)
+
+Version 3.10
+
+* Write Rate Achieved: 45.8k/s write/sec 
+* Readwhilewriting: 0.317 micros/op 3154941 ops/sec; (708158999 of 708158999 found) 
+
+Version 3.11
+
+10K writes/sec
+* Write Rate Achieved: 5.78k write/sec
+* Readwhilewriting: 0.269 micros/op 3716692 ops/sec;  425.3 MB/s (837401999 of 837401999 found)
+
+Version 3.10
+
+* Write Rate Achieved: 5.7k write/sec 
+* Readwhilewriting: 0.261 micros/op 3830152 ops/sec; (863482999 of 863482999 found) 
+
+
+We think that there is still big room to improve the performance, which will be an ongoing effort for us.
+
diff --git a/src/rocksdb/buckifier/bench-slow.json b/src/rocksdb/buckifier/bench-slow.json
new file mode 100644
index 000000000..948b3de29
--- /dev/null
+++ b/src/rocksdb/buckifier/bench-slow.json
@@ -0,0 +1,6163 @@
+[
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1712.344628
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 479.941992
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1811.998557
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 549.901612
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4687.250475
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4818.164105
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 623.018994
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1997.259639
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3910.179634
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3217.553693
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4454.911311
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4195.372795
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.304737
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2141.057905
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 379.059546
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2249.892332
+          }
+        ],
+        "IteratorNext/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2421.708898
+          }
+        ],
+        "IteratorPrev/comp_style:2/max_data:134217728/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1210.835611
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1930.985912
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3730.560675
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3805.367942
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1905.101414
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1346.866095
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2911.433188
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3265.867193
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3500.869188
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2764.891509
+          }
+        ],
+        "ManualFlush/key_num:65536/per_key_size:1024/iterations:1": [
+          "real_time",
+          "db_size",
+          "flush_write_bytes",
+          "flush_time",
+          "threads",
+          {
+            "est_runtime": 8213.664958
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1174.289332
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1844.656254
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1871.728091
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1138.131879
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 934.128086
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2050.952519
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2119.677364
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1620.549616
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1540.049484
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88891,
+    "name": "rocksdb_microbench_suite_0",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1774.099605
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1905.399998
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3808.99857
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4820.729905
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 651.103057
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 2073.571864
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 501.900122
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 2279.957943
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 517.245591
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4470.408695
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4697.266228
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.530036
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2153.984468
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2425.675621
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.474125
+          }
+        ],
+        "IteratorNext/comp_style:0/max_data:536870912/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2680.208144
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1925.787501
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2138.845144
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3240.472721
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3743.777606
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1889.280273
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1987.727061
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1324.811274
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1351.651528
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2898.268666
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3265.897167
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4230.885188
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1610.559355
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1479.488999
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1833.697116
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3543.222366
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3977.653591
+          }
+        ],
+        "ManualFlush/key_num:65536/per_key_size:256/iterations:1": [
+          "real_time",
+          "db_size",
+          "flush_write_bytes",
+          "flush_time",
+          "threads",
+          {
+            "est_runtime": 8022.977415
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 900.408425
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1086.553529
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1151.02499
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1789.665713
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88804,
+    "name": "rocksdb_microbench_suite_1",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBClose/iterations:200": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 7875.438371
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1809.467556
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 694.578514
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 2007.832073
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3049.786949
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3155.268413
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 471.459789
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 513.865636
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3323.251926
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4287.119322
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4701.167388
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4478.402506
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.500867
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2156.637367
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.808772
+          }
+        ],
+        "IteratorPrev/comp_style:0/max_data:536870912/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2535.71873
+          }
+        ],
+        "IteratorPrev/comp_style:1/max_data:134217728/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 850.797161
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1870.864429
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1925.978405
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2138.98528
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4824.26486
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3568.048799
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1308.673423
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1845.907547
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1909.476304
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3816.002101
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1480.235109
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2428.851465
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3978.136634
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3745.274233
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2299.541611
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1152.817144
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2087.829967
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1138.066264
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1619.322144
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1715.83787
+          }
+        ],
+        "SimpleGetWithPerfContext/iterations:1000000": [
+          "block_seek_nanos",
+          "get_post_process_time",
+          "db_size",
+          "get_snapshot_time",
+          "block_read_time",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "get_from_output_files_time",
+          "new_table_block_iter_nanos",
+          "get_cpu_nanos",
+          "user_key_comparison_count",
+          "neg_qu_pct",
+          "block_checksum_time",
+          "get_from_table_nanos",
+          {
+            "est_runtime": 1387.59016
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88803,
+    "name": "rocksdb_microbench_suite_2",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1812.356957
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3809.037737
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 514.201628
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3294.815224
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4002.903452
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 649.806897
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 498.522549
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1839.233063
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1936.181117
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4471.890902
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4264.24321
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4701.026835
+          }
+        ],
+        "DBOpen/iterations:200": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 7875.060419
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.330841
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2423.955419
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 379.060989
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2154.010254
+          }
+        ],
+        "IteratorNext/comp_style:0/max_data:536870912/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2682.352114
+          }
+        ],
+        "IteratorNext/comp_style:2/max_data:134217728/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 868.172285
+          }
+        ],
+        "IteratorPrev/comp_style:2/max_data:536870912/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2085.936855
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1972.741386
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1869.527457
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2138.868392
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3745.558567
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3240.715255
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4832.025298
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3572.947368
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1290.200332
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1903.644533
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2910.058307
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1762.345037
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1441.761423
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1609.655022
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2278.376767
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1153.828342
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1408.600116
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1136.291725
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88891,
+    "name": "rocksdb_microbench_suite_3",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 2898.72783
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1997.299167
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 532.687188
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 2110.493928
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 2772.793109
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4294.689809
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4002.927614
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4482.402368
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4940.110072
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 651.121374
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1844.537642
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 497.710941
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1936.622943
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2419.132481
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.529031
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.438709
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2149.877539
+          }
+        ],
+        "IteratorNext/comp_style:1/max_data:134217728/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 869.224505
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2052.861678
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4748.430327
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3240.626869
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3579.521522
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1296.53393
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1809.525026
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1384.672588
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1904.994109
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3817.997968
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1480.208318
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1613.568431
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3356.701738
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2239.977922
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3751.327826
+          }
+        ],
+        "ManualFlush/key_num:8192/per_key_size:1024/iterations:1": [
+          "real_time",
+          "db_size",
+          "flush_write_bytes",
+          "flush_time",
+          "threads",
+          {
+            "est_runtime": 7597.332723
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1122.154905
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1870.157606
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1739.825979
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1151.690848
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88809,
+    "name": "rocksdb_microbench_suite_4",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1802.08561
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 624.132525
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1911.844737
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 2007.84697
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 2089.31663
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4497.182856
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4942.804943
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1883.654744
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 498.06214
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1937.167999
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 532.702176
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4302.580281
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3341.771318
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4003.101384
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.81272
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2157.656253
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.523747
+          }
+        ],
+        "IteratorPrev/comp_style:0/max_data:536870912/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2535.78947
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2139.039426
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3754.165204
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4752.499353
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1298.604072
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2998.907938
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3820.35491
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3169.465918
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1835.11354
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1608.421875
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3624.747853
+          }
+        ],
+        "ManualFlush/key_num:8192/per_key_size:256/iterations:1": [
+          "real_time",
+          "db_size",
+          "flush_write_bytes",
+          "flush_time",
+          "threads",
+          {
+            "est_runtime": 7557.736147
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1173.706929
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 976.018148
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1021.383458
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2300.700002
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2447.080986
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1773.944316
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1408.654342
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1423.89826
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88803,
+    "name": "rocksdb_microbench_suite_5",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 658.890773
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 2073.344847
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 532.66627
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4499.103078
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3823.666372
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 497.077261
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1937.011319
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4003.152699
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4358.242188
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4944.478659
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2201.425905
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2153.250796
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.432186
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.397936
+          }
+        ],
+        "IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1156.84164
+          }
+        ],
+        "IteratorPrev/comp_style:0/max_data:134217728/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 871.341464
+          }
+        ],
+        "IteratorPrev/comp_style:2/max_data:134217728/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1115.478679
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2136.922862
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1975.004341
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3179.966272
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3769.15995
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4788.410331
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1307.377381
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1909.364449
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1369.449436
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2912.547738
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3633.675713
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2820.855412
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1479.955208
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1614.562603
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3350.108415
+          }
+        ],
+        "ManualFlush/key_num:1024/per_key_size:1024/iterations:1": [
+          "real_time",
+          "db_size",
+          "flush_write_bytes",
+          "flush_time",
+          "threads",
+          {
+            "est_runtime": 7422.165355
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1844.168905
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1870.857522
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2402.328004
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1739.59811
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1809.865409
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88813,
+    "name": "rocksdb_microbench_suite_6",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 609.749844
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 551.801692
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4512.440338
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3823.643659
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1885.071291
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 497.546918
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1936.92061
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4003.12535
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3196.497448
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4346.91794
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4946.256596
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2141.297863
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.198303
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.417422
+          }
+        ],
+        "IteratorNext/comp_style:2/max_data:134217728/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 926.477715
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1974.320666
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2136.585708
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4788.447975
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1296.875867
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1906.215547
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2911.477908
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1835.00962
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1607.553483
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3352.038124
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3625.351224
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2814.166152
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3769.100313
+          }
+        ],
+        "ManualFlush/key_num:1024/per_key_size:256/iterations:1": [
+          "real_time",
+          "db_size",
+          "flush_write_bytes",
+          "flush_time",
+          "threads",
+          {
+            "est_runtime": 7418.474332
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1171.889503
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1078.456365
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1801.200304
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2245.828345
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2392.943018
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2049.704921
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1773.705301
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1410.843275
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1424.541953
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88813,
+    "name": "rocksdb_microbench_suite_7",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 694.642444
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 583.226959
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4633.862465
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 451.741737
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 510.357826
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 2108.477344
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4443.989929
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 5215.078632
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1937.731137
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3402.268174
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4948.420446
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2200.5005
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2140.350383
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2473.029062
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 377.799421
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1923.154067
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3832.441913
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3138.670605
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3781.656053
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4788.510789
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3675.134037
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3265.739118
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4152.531212
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1623.097106
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1338.666598
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2888.963384
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1006.703673
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1181.657807
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2304.097915
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1412.397492
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2015.978912
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1859.4561
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1824.916416
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1144.514094
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1776.109199
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1602.162741
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1896.713718
+          }
+        ],
+        "RandomAccessFileReaderRead/enable_statistics:1/iterations:1000000": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 5.370559
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 2.171919
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.38186
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 9.184089
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.54233
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.501483
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.737477
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 82.415772
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.399281
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 7.024791
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 39.015621
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.325264
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88709,
+    "name": "rocksdb_microbench_suite_8",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 454.919549
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1712.215589
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4633.82088
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3152.808792
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3771.29009
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 5211.225659
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 510.387506
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 2139.111714
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4443.929027
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1858.972054
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 696.590699
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 578.538571
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4949.611852
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4145.973837
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 377.905737
+          }
+        ],
+        "IteratorPrev/comp_style:1/max_data:134217728/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 977.874953
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2012.604554
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1942.165369
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3259.631453
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4795.420211
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1901.309617
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1918.997951
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1324.859234
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3677.093111
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3837.191621
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1572.19154
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3412.779413
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2348.374525
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2186.840634
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2888.917261
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1174.452593
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1816.741937
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2448.88995
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1138.391757
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1411.663065
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2090.841484
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1779.515791
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.194392
+          }
+        ],
+        "FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 2.60738
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 6.614245
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.348746
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 10.971779
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 15.272086
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.855877
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 8.032293
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.621375
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 102.227551
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.309645
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.448994
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.453854
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88711,
+    "name": "rocksdb_microbench_suite_9",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1775.789377
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4633.864621
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3155.239662
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3790.561434
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 458.336347
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 510.88842
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 2108.496423
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 557.886298
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3365.458112
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4145.931239
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 5211.138859
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 694.628355
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4451.395377
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4950.489144
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2140.891996
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2473.313812
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2167.374891
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2350.198115
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 270.221479
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.072129
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2013.660732
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1850.931894
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1941.981279
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1917.767466
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3832.592151
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4788.541415
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1903.058732
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1343.860292
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3673.431012
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3265.79571
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1830.816871
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1623.280031
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2889.021135
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1018.424915
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1143.235807
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1582.515889
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1187.009146
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1421.928748
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88819,
+    "name": "rocksdb_microbench_suite_10",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 2007.895851
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 551.935026
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1911.863937
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4666.475079
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4809.492906
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3790.625424
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 387.072927
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 2106.161292
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1961.147307
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4437.328726
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 5173.846426
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1859.150921
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 504.228523
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4950.729688
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4041.39475
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2473.08233
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.402539
+          }
+        ],
+        "IteratorNext/comp_style:2/max_data:536870912/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2192.682661
+          }
+        ],
+        "IteratorPrev/comp_style:0/max_data:134217728/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 849.453776
+          }
+        ],
+        "IteratorPrev/comp_style:1/max_data:536870912/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2325.086056
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1893.637846
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2139.270344
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3098.564345
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3857.029519
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3481.020105
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1340.699259
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3715.527152
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3246.485924
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1414.497171
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1641.591443
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1573.758377
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2888.996421
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1001.540766
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1187.719701
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1145.748787
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1776.225564
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1823.654928
+          }
+        ],
+        "RandomAccessFileReaderRead/enable_statistics:0/iterations:1000000": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 2.984478
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.311687
+          }
+        ],
+        "FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.439296
+          }
+        ],
+        "FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.536606
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 6.421361
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.672049
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 7.789373
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 15.718081
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.26261
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 11.512964
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.49311
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 14.576404
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.35537
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 54.977852
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88711,
+    "name": "rocksdb_microbench_suite_11",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 403.095413
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 505.270116
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3154.721755
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4687.221851
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3771.266874
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4804.836604
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1959.030698
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 557.749335
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 2109.882711
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 2007.868277
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3823.708033
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 5173.766594
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4195.243654
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4396.047348
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4952.523661
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2312.17547
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:1024/enable_statistics:1/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2199.255566
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.355195
+          }
+        ],
+        "DataBlockSeek/iterations:1000000": [
+          "real_time",
+          "cpu_time",
+          "seek_ns",
+          "threads",
+          {
+            "est_runtime": 16.815497
+          }
+        ],
+        "IteratorNext/comp_style:1/max_data:134217728/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 998.011759
+          }
+        ],
+        "IteratorNextWithPerfContext/iterations:100000": [
+          "find_next_user_entry_time",
+          "db_size",
+          "threads",
+          "internal_key_skipped_count",
+          "real_time",
+          "cpu_time",
+          "iter_next_cpu_nanos",
+          "user_key_comparison_count",
+          {
+            "est_runtime": 829.288054
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1896.181275
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1852.19042
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2139.477433
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1912.38273
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1331.360875
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3420.568984
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3657.279035
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3246.623059
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1572.433675
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2460.911144
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2888.95434
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1138.435281
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1411.742372
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1178.105028
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1777.084988
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1678.61669
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1826.405941
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.654238
+          }
+        ],
+        "FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.453843
+          }
+        ],
+        "FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.52447
+          }
+        ],
+        "FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 2.169285
+          }
+        ],
+        "FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.335641
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.288164
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 15.548585
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 7.097205
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.389287
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 14.556778
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 6.387351
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 47.662364
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88709,
+    "name": "rocksdb_microbench_suite_12",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 446.597044
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 507.805668
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 607.525698
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1937.632471
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3154.693409
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4666.088632
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4803.243479
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3772.288434
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 2092.642473
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4425.053714
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 5173.355427
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1889.78671
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 694.715237
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4145.945651
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4967.640229
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:1024/enable_statistics:0/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2352.471126
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.424861
+          }
+        ],
+        "IteratorNext/comp_style:0/max_data:134217728/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1000.224231
+          }
+        ],
+        "IteratorNext/comp_style:1/max_data:536870912/per_key_size:1024/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2192.668329
+          }
+        ],
+        "IteratorNext/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2480.631028
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1866.980315
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2139.269926
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1917.557528
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2031.89305
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1331.768126
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3678.933137
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3833.856318
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3262.461026
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1659.435948
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1572.985396
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3407.503485
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2839.546177
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1183.238636
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1412.015301
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1815.899454
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1150.426956
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1778.155184
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.478298
+          }
+        ],
+        "FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.381091
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 7.809054
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.57804
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 2.799623
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 129.266223
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.761922
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 6.667228
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.320525
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.395642
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88709,
+    "name": "rocksdb_microbench_suite_13",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 434.616062
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 510.156004
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 3050.425322
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4667.45867
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 5040.173716
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3790.602375
+          }
+        ],
+        "DBGet/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 4804.316933
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 605.701828
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 4442.475931
+          }
+        ],
+        "DBGet/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 5087.274609
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 1899.844152
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 694.658916
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 1938.347133
+          }
+        ],
+        "DBGet/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 3910.096422
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 377.714354
+          }
+        ],
+        "IteratorPrev/comp_style:1/max_data:536870912/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2370.382861
+          }
+        ],
+        "IteratorPrev/comp_style:2/max_data:536870912/per_key_size:256/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2484.65835
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1923.233472
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2139.237156
+          }
+        ],
+        "IteratorSeek/comp_style:0/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3241.70317
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3684.276482
+          }
+        ],
+        "IteratorSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2896.854937
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1830.886558
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1337.374059
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1658.766047
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1572.454657
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 3479.088621
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2186.948587
+          }
+        ],
+        "IteratorSeek/comp_style:2/max_data:536870912/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 4024.639105
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1005.603266
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1181.26407
+          }
+        ],
+        "PrefixSeek/comp_style:0/max_data:536870912/per_key_size:256/enable_statistics:1/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1849.199043
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1412.076623
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:1024/enable_statistics:1/enable_filter:0/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2016.125995
+          }
+        ],
+        "PrefixSeek/comp_style:1/max_data:536870912/per_key_size:256/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 2092.152047
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1141.714694
+          }
+        ],
+        "PrefixSeek/comp_style:2/max_data:536870912/per_key_size:1024/enable_statistics:0/enable_filter:1/iterations:10240": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 1776.709162
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.397602
+          }
+        ],
+        "FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.238302
+          }
+        ],
+        "FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 2.653086
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 7.89775
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.364915
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.638573
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.331795
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.821543
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.467467
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 67.68153
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 6.708367
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 10.709789
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 44.929381
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 88711,
+    "name": "rocksdb_microbench_suite_14",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  }
+]
diff --git a/src/rocksdb/buckifier/bench.json b/src/rocksdb/buckifier/bench.json
new file mode 100644
index 000000000..e1ea99f25
--- /dev/null
+++ b/src/rocksdb/buckifier/bench.json
@@ -0,0 +1,1594 @@
+[
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 510.387506
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 497.077261
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 696.590699
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.438709
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 377.905737
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.194392
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 6.614245
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 9.184089
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.737477
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.49311
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 14.576404
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.35537
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 47.662364
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2438,
+    "name": "rocksdb_microbench_suite_0",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 510.88842
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 497.546918
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 694.715237
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.072129
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.474125
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.536606
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 7.097205
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.389287
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.26261
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 11.512964
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 6.387351
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 54.977852
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_1",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 479.941992
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 513.865636
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 694.658916
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 379.060989
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 377.799421
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2446,
+    "name": "rocksdb_microbench_suite_2",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 434.616062
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 694.642444
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 471.459789
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 514.201628
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.397936
+          }
+        ],
+        "DataBlockSeek/iterations:1000000": [
+          "real_time",
+          "cpu_time",
+          "seek_ns",
+          "threads",
+          {
+            "est_runtime": 16.815497
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.311687
+          }
+        ],
+        "FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.453843
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 7.809054
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 6.421361
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.57804
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 14.556778
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_3",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 451.741737
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 458.336347
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 694.628355
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 517.245591
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 270.221479
+          }
+        ],
+        "RandomAccessFileReaderRead/enable_statistics:1/iterations:1000000": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 5.370559
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.397602
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 10.971779
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 7.789373
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.54233
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 15.718081
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.309645
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_4",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 454.919549
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 403.095413
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:1280/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 694.578514
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 532.66627
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.417422
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.335641
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.672049
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 15.548585
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 2.799623
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 10.709789
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 7.024791
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 39.015621
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.453854
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_5",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 658.890773
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 532.687188
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 498.522549
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.355195
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.808772
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.439296
+          }
+        ],
+        "FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 2.60738
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.638573
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.320525
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_6",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 505.270116
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 651.121374
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 532.702176
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.81272
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.424861
+          }
+        ],
+        "RandomAccessFileReaderRead/enable_statistics:0/iterations:1000000": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 2.984478
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.238302
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.38186
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.821543
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.501483
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2438,
+    "name": "rocksdb_microbench_suite_7",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 549.901612
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 651.103057
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 498.06214
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.432186
+          }
+        ],
+        "DBPut/comp_style:2/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.523747
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.52447
+          }
+        ],
+        "FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.381091
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 7.89775
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.855877
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 67.68153
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 6.667228
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_8",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 551.801692
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 649.806897
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 497.710941
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.198303
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 271.500867
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.364915
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.467467
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 82.415772
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.761922
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_9",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 624.132525
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 510.156004
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 551.935026
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 387.072927
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.304737
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.288164
+          }
+        ],
+        "FilterQueryPositive/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.395642
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_10",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 507.805668
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 557.749335
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 623.018994
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 377.714354
+          }
+        ],
+        "DBPut/comp_style:1/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 379.059546
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2446,
+    "name": "rocksdb_microbench_suite_11",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 446.597044
+          }
+        ],
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 609.749844
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 510.357826
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 557.886298
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.478298
+          }
+        ],
+        "FilterBuild/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 2.169285
+          }
+        ],
+        "FilterBuild/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.348746
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 15.272086
+          }
+        ],
+        "FilterQueryNegative/filter_impl:2/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 8.032293
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 102.227551
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 129.266223
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 6.708367
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 44.929381
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_12",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 607.525698
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 504.228523
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 578.538571
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:1/iterations:409600/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.402539
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:0/iterations:409600/threads:1": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.530036
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:2/bits_per_key:20/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 2.653086
+          }
+        ],
+        "FilterQueryNegative/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.621375
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.325264
+          }
+        ],
+        "FilterQueryPositive/filter_impl:3/bits_per_key:20/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.448994
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_13",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  },
+  {
+    "benchmarks": {
+      "db_basic_bench": {
+        "DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:1/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 583.226959
+          }
+        ],
+        "DBGet/comp_style:1/max_data:134217728/per_key_size:256/enable_statistics:0/negative_query:1/enable_filter:0/iterations:10240/threads:1": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "neg_qu_pct",
+          "threads",
+          {
+            "est_runtime": 605.701828
+          }
+        ],
+        "DBGet/comp_style:2/max_data:134217728/per_key_size:1024/enable_statistics:1/negative_query:0/enable_filter:0/iterations:10240/threads:1": [
+          "db_size",
+          "get_mean",
+          "threads",
+          "real_time",
+          "cpu_time",
+          "neg_qu_pct",
+          {
+            "est_runtime": 501.900122
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:0/wal:0/iterations:51200/threads:8": [
+          "real_time",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 378.529031
+          }
+        ],
+        "DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1/iterations:51200/threads:8": [
+          "real_time",
+          "put_mean",
+          "cpu_time",
+          "db_size",
+          "threads",
+          {
+            "est_runtime": 360.330841
+          }
+        ]
+      },
+      "ribbon_bench": {
+        "FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:10/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 1.654238
+          }
+        ],
+        "FilterBuild/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1048576": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "size",
+          {
+            "est_runtime": 2.171919
+          }
+        ],
+        "FilterQueryNegative/filter_impl:0/bits_per_key:10/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          "fp_pct",
+          {
+            "est_runtime": 1.331795
+          }
+        ],
+        "FilterQueryPositive/filter_impl:0/bits_per_key:20/key_len_avg:100/entry_num:1024": [
+          "real_time",
+          "cpu_time",
+          "threads",
+          {
+            "est_runtime": 1.399281
+          }
+        ]
+      }
+    },
+    "do_not_reflow": false,
+    "expected_runtime_one_iter": 2437,
+    "name": "rocksdb_microbench_suite_14",
+    "regression_threshold": 10,
+    "sl_iterations": 3
+  }
+]
diff --git a/src/rocksdb/buckifier/buckify_rocksdb.py b/src/rocksdb/buckifier/buckify_rocksdb.py
new file mode 100755
index 000000000..ac09c0519
--- /dev/null
+++ b/src/rocksdb/buckifier/buckify_rocksdb.py
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+try:
+    from builtins import str
+except ImportError:
+    from __builtin__ import str
+import fnmatch
+import json
+import os
+import sys
+
+from targets_builder import TARGETSBuilder
+
+from util import ColorString
+
+# This script generates TARGETS file for Buck.
+# Buck is a build tool specifying dependencies among different build targets.
+# User can pass extra dependencies as a JSON object via command line, and this
+# script can include these dependencies in the generate TARGETS file.
+# Usage:
+# $python3 buckifier/buckify_rocksdb.py
+# (This generates a TARGET file without user-specified dependency for unit
+# tests.)
+# $python3 buckifier/buckify_rocksdb.py \
+#        '{"fake": {
+#                      "extra_deps": [":test_dep", "//fakes/module:mock1"],
+#                      "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"]
+#                  }
+#         }'
+# (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB
+# unit tests, and will use the extra_compiler_flags to compile the unit test
+# source.)
+
+# tests to export as libraries for inclusion in other projects
+_EXPORTED_TEST_LIBS = ["env_basic_test"]
+
+# Parse src.mk files as a Dictionary of
+# VAR_NAME => list of files
+def parse_src_mk(repo_path):
+    src_mk = repo_path + "/src.mk"
+    src_files = {}
+    for line in open(src_mk):
+        line = line.strip()
+        if len(line) == 0 or line[0] == "#":
+            continue
+        if "=" in line:
+            current_src = line.split("=")[0].strip()
+            src_files[current_src] = []
+        elif ".c" in line:
+            src_path = line.split("\\")[0].strip()
+            src_files[current_src].append(src_path)
+    return src_files
+
+
+# get all .cc / .c files
+def get_cc_files(repo_path):
+    cc_files = []
+    for root, _dirnames, filenames in os.walk(
+        repo_path
+    ):  # noqa: B007 T25377293 Grandfathered in
+        root = root[(len(repo_path) + 1) :]
+        if "java" in root:
+            # Skip java
+            continue
+        for filename in fnmatch.filter(filenames, "*.cc"):
+            cc_files.append(os.path.join(root, filename))
+        for filename in fnmatch.filter(filenames, "*.c"):
+            cc_files.append(os.path.join(root, filename))
+    return cc_files
+
+
+# Get non_parallel tests from Makefile
+def get_non_parallel_tests(repo_path):
+    Makefile = repo_path + "/Makefile"
+
+    s = set({})
+
+    found_non_parallel_tests = False
+    for line in open(Makefile):
+        line = line.strip()
+        if line.startswith("NON_PARALLEL_TEST ="):
+            found_non_parallel_tests = True
+        elif found_non_parallel_tests:
+            if line.endswith("\\"):
+                # remove the trailing \
+                line = line[:-1]
+                line = line.strip()
+                s.add(line)
+            else:
+                # we consumed all the non_parallel tests
+                break
+
+    return s
+
+
+# Parse extra dependencies passed by user from command line
+def get_dependencies():
+    deps_map = {"": {"extra_deps": [], "extra_compiler_flags": []}}
+    if len(sys.argv) < 2:
+        return deps_map
+
+    def encode_dict(data):
+        rv = {}
+        for k, v in data.items():
+            if isinstance(v, dict):
+                v = encode_dict(v)
+            rv[k] = v
+        return rv
+
+    extra_deps = json.loads(sys.argv[1], object_hook=encode_dict)
+    for target_alias, deps in extra_deps.items():
+        deps_map[target_alias] = deps
+    return deps_map
+
+
+# Prepare TARGETS file for buck
+def generate_targets(repo_path, deps_map):
+    print(ColorString.info("Generating TARGETS"))
+    # parsed src.mk file
+    src_mk = parse_src_mk(repo_path)
+    # get all .cc files
+    cc_files = get_cc_files(repo_path)
+    # get non_parallel tests from Makefile
+    non_parallel_tests = get_non_parallel_tests(repo_path)
+
+    if src_mk is None or cc_files is None or non_parallel_tests is None:
+        return False
+
+    extra_argv = ""
+    if len(sys.argv) >= 2:
+        # Heuristically quote and canonicalize whitespace for inclusion
+        # in how the file was generated.
+        extra_argv = " '{0}'".format(" ".join(sys.argv[1].split()))
+
+    TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv)
+
+    # rocksdb_lib
+    TARGETS.add_library(
+        "rocksdb_lib",
+        src_mk["LIB_SOURCES"] +
+        # always add range_tree, it's only excluded on ppc64, which we don't use internally
+        src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"],
+        deps=[
+            "//folly/container:f14_hash",
+            "//folly/experimental/coro:blocking_wait",
+            "//folly/experimental/coro:collect",
+            "//folly/experimental/coro:coroutine",
+            "//folly/experimental/coro:task",
+            "//folly/synchronization:distributed_mutex",
+        ],
+    )
+    # rocksdb_whole_archive_lib
+    TARGETS.add_library(
+        "rocksdb_whole_archive_lib",
+        src_mk["LIB_SOURCES"] +
+        # always add range_tree, it's only excluded on ppc64, which we don't use internally
+        src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"],
+        deps=[
+            "//folly/container:f14_hash",
+            "//folly/experimental/coro:blocking_wait",
+            "//folly/experimental/coro:collect",
+            "//folly/experimental/coro:coroutine",
+            "//folly/experimental/coro:task",
+            "//folly/synchronization:distributed_mutex",
+        ],
+        headers=None,
+        extra_external_deps="",
+        link_whole=True,
+    )
+    # rocksdb_test_lib
+    TARGETS.add_library(
+        "rocksdb_test_lib",
+        src_mk.get("MOCK_LIB_SOURCES", [])
+        + src_mk.get("TEST_LIB_SOURCES", [])
+        + src_mk.get("EXP_LIB_SOURCES", [])
+        + src_mk.get("ANALYZER_LIB_SOURCES", []),
+        [":rocksdb_lib"],
+        extra_test_libs=True,
+    )
+    # rocksdb_tools_lib
+    TARGETS.add_library(
+        "rocksdb_tools_lib",
+        src_mk.get("BENCH_LIB_SOURCES", [])
+        + src_mk.get("ANALYZER_LIB_SOURCES", [])
+        + ["test_util/testutil.cc"],
+        [":rocksdb_lib"],
+    )
+    # rocksdb_cache_bench_tools_lib
+    TARGETS.add_library(
+        "rocksdb_cache_bench_tools_lib",
+        src_mk.get("CACHE_BENCH_LIB_SOURCES", []),
+        [":rocksdb_lib"],
+    )
+    # rocksdb_stress_lib
+    TARGETS.add_rocksdb_library(
+        "rocksdb_stress_lib",
+        src_mk.get("ANALYZER_LIB_SOURCES", [])
+        + src_mk.get("STRESS_LIB_SOURCES", [])
+        + ["test_util/testutil.cc"],
+    )
+    # db_stress binary
+    TARGETS.add_binary(
+        "db_stress", ["db_stress_tool/db_stress.cc"], [":rocksdb_stress_lib"]
+    )
+    # bench binaries
+    for src in src_mk.get("MICROBENCH_SOURCES", []):
+        name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0]
+        TARGETS.add_binary(name, [src], [], extra_bench_libs=True)
+    print("Extra dependencies:\n{0}".format(json.dumps(deps_map)))
+
+    # Dictionary test executable name -> relative source file path
+    test_source_map = {}
+
+    # c_test.c is added through TARGETS.add_c_test(). If there
+    # are more than one .c test file, we need to extend
+    # TARGETS.add_c_test() to include other C tests too.
+    for test_src in src_mk.get("TEST_MAIN_SOURCES_C", []):
+        if test_src != "db/c_test.c":
+            print("Don't know how to deal with " + test_src)
+            return False
+    TARGETS.add_c_test()
+
+    try:
+        with open(f"{repo_path}/buckifier/bench.json") as json_file:
+            fast_fancy_bench_config_list = json.load(json_file)
+            for config_dict in fast_fancy_bench_config_list:
+                clean_benchmarks = {}
+                benchmarks = config_dict["benchmarks"]
+                for binary, benchmark_dict in benchmarks.items():
+                    clean_benchmarks[binary] = {}
+                    for benchmark, overloaded_metric_list in benchmark_dict.items():
+                        clean_benchmarks[binary][benchmark] = []
+                        for metric in overloaded_metric_list:
+                            if not isinstance(metric, dict):
+                                clean_benchmarks[binary][benchmark].append(metric)
+                TARGETS.add_fancy_bench_config(
+                    config_dict["name"],
+                    clean_benchmarks,
+                    False,
+                    config_dict["expected_runtime_one_iter"],
+                    config_dict["sl_iterations"],
+                    config_dict["regression_threshold"],
+                )
+
+        with open(f"{repo_path}/buckifier/bench-slow.json") as json_file:
+            slow_fancy_bench_config_list = json.load(json_file)
+            for config_dict in slow_fancy_bench_config_list:
+                clean_benchmarks = {}
+                benchmarks = config_dict["benchmarks"]
+                for binary, benchmark_dict in benchmarks.items():
+                    clean_benchmarks[binary] = {}
+                    for benchmark, overloaded_metric_list in benchmark_dict.items():
+                        clean_benchmarks[binary][benchmark] = []
+                        for metric in overloaded_metric_list:
+                            if not isinstance(metric, dict):
+                                clean_benchmarks[binary][benchmark].append(metric)
+            for config_dict in slow_fancy_bench_config_list:
+                TARGETS.add_fancy_bench_config(
+                    config_dict["name"] + "_slow",
+                    clean_benchmarks,
+                    True,
+                    config_dict["expected_runtime_one_iter"],
+                    config_dict["sl_iterations"],
+                    config_dict["regression_threshold"],
+                )
+    # it is better servicelab experiments break
+    # than rocksdb github ci
+    except Exception:
+        pass
+
+    TARGETS.add_test_header()
+
+    for test_src in src_mk.get("TEST_MAIN_SOURCES", []):
+        test = test_src.split(".c")[0].strip().split("/")[-1].strip()
+        test_source_map[test] = test_src
+        print("" + test + " " + test_src)
+
+    for target_alias, deps in deps_map.items():
+        for test, test_src in sorted(test_source_map.items()):
+            if len(test) == 0:
+                print(ColorString.warning("Failed to get test name for %s" % test_src))
+                continue
+
+            test_target_name = test if not target_alias else test + "_" + target_alias
+
+            if test in _EXPORTED_TEST_LIBS:
+                test_library = "%s_lib" % test_target_name
+                TARGETS.add_library(
+                    test_library,
+                    [test_src],
+                    deps=[":rocksdb_test_lib"],
+                    extra_test_libs=True,
+                )
+                TARGETS.register_test(
+                    test_target_name,
+                    test_src,
+                    deps=json.dumps(deps["extra_deps"] + [":" + test_library]),
+                    extra_compiler_flags=json.dumps(deps["extra_compiler_flags"]),
+                )
+            else:
+                TARGETS.register_test(
+                    test_target_name,
+                    test_src,
+                    deps=json.dumps(deps["extra_deps"] + [":rocksdb_test_lib"]),
+                    extra_compiler_flags=json.dumps(deps["extra_compiler_flags"]),
+                )
+
+    print(ColorString.info("Generated TARGETS Summary:"))
+    print(ColorString.info("- %d libs" % TARGETS.total_lib))
+    print(ColorString.info("- %d binarys" % TARGETS.total_bin))
+    print(ColorString.info("- %d tests" % TARGETS.total_test))
+    return True
+
+
+def get_rocksdb_path():
+    # rocksdb = {script_dir}/..
+    script_dir = os.path.dirname(sys.argv[0])
+    script_dir = os.path.abspath(script_dir)
+    rocksdb_path = os.path.abspath(os.path.join(script_dir, "../"))
+
+    return rocksdb_path
+
+
+def exit_with_error(msg):
+    print(ColorString.error(msg))
+    sys.exit(1)
+
+
+def main():
+    deps_map = get_dependencies()
+    # Generate TARGETS file for buck
+    ok = generate_targets(get_rocksdb_path(), deps_map)
+    if not ok:
+        exit_with_error("Failed to generate TARGETS files")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/rocksdb/buckifier/check_buck_targets.sh b/src/rocksdb/buckifier/check_buck_targets.sh
new file mode 100755
index 000000000..66c83c52f
--- /dev/null
+++ b/src/rocksdb/buckifier/check_buck_targets.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# If clang_format_diff.py command is not specfied, we assume we are able to
+# access directly without any path.
+
+TGT_DIFF=`git diff TARGETS | head -n 1`
+
+if [ ! -z "$TGT_DIFF" ]
+then
+  echo "TARGETS file has uncommitted changes. Skip this check."
+  exit 0
+fi
+
+echo Backup original TARGETS file.
+
+cp TARGETS TARGETS.bkp
+
+${PYTHON:-python3} buckifier/buckify_rocksdb.py
+
+TGT_DIFF=`git diff TARGETS | head -n 1`
+
+if [ -z "$TGT_DIFF" ]
+then
+  mv TARGETS.bkp TARGETS
+  exit 0
+else
+  echo "Please run '${PYTHON:-python3} buckifier/buckify_rocksdb.py' to update TARGETS file."
+  echo "Do not manually update TARGETS file."
+  ${PYTHON:-python3} --version
+  mv TARGETS.bkp TARGETS
+  exit 1
+fi
diff --git a/src/rocksdb/buckifier/rocks_test_runner.sh b/src/rocksdb/buckifier/rocks_test_runner.sh
new file mode 100755
index 000000000..77f8f23c5
--- /dev/null
+++ b/src/rocksdb/buckifier/rocks_test_runner.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Create a tmp directory for the test to use
+TEST_DIR=$(mktemp -d /dev/shm/fbcode_rocksdb_XXXXXXX)
+# shellcheck disable=SC2068
+TEST_TMPDIR="$TEST_DIR" $@ && rm -rf "$TEST_DIR"
diff --git a/src/rocksdb/buckifier/targets_builder.py b/src/rocksdb/buckifier/targets_builder.py
new file mode 100644
index 000000000..343b2207d
--- /dev/null
+++ b/src/rocksdb/buckifier/targets_builder.py
@@ -0,0 +1,150 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+try:
+    from builtins import object, str
+except ImportError:
+    from __builtin__ import object, str
+import pprint
+
+import targets_cfg
+
+
+def pretty_list(lst, indent=8):
+    if lst is None or len(lst) == 0:
+        return ""
+
+    if len(lst) == 1:
+        return '"%s"' % lst[0]
+
+    separator = '",\n%s"' % (" " * indent)
+    res = separator.join(sorted(lst))
+    res = "\n" + (" " * indent) + '"' + res + '",\n' + (" " * (indent - 4))
+    return res
+
+
+class TARGETSBuilder(object):
+    def __init__(self, path, extra_argv):
+        self.path = path
+        header = targets_cfg.rocksdb_target_header_template.format(
+            extra_argv=extra_argv
+        )
+        with open(path, "wb") as targets_file:
+            targets_file.write(header.encode("utf-8"))
+        self.total_lib = 0
+        self.total_bin = 0
+        self.total_test = 0
+        self.tests_cfg = ""
+
+    def add_library(
+        self,
+        name,
+        srcs,
+        deps=None,
+        headers=None,
+        extra_external_deps="",
+        link_whole=False,
+        external_dependencies=None,
+        extra_test_libs=False,
+    ):
+        if headers is not None:
+            headers = "[" + pretty_list(headers) + "]"
+        with open(self.path, "ab") as targets_file:
+            targets_file.write(
+                targets_cfg.library_template.format(
+                    name=name,
+                    srcs=pretty_list(srcs),
+                    headers=headers,
+                    deps=pretty_list(deps),
+                    extra_external_deps=extra_external_deps,
+                    link_whole=link_whole,
+                    external_dependencies=pretty_list(external_dependencies),
+                    extra_test_libs=extra_test_libs,
+                ).encode("utf-8")
+            )
+        self.total_lib = self.total_lib + 1
+
+    def add_rocksdb_library(self, name, srcs, headers=None, external_dependencies=None):
+        if headers is not None:
+            headers = "[" + pretty_list(headers) + "]"
+        with open(self.path, "ab") as targets_file:
+            targets_file.write(
+                targets_cfg.rocksdb_library_template.format(
+                    name=name,
+                    srcs=pretty_list(srcs),
+                    headers=headers,
+                    external_dependencies=pretty_list(external_dependencies),
+                ).encode("utf-8")
+            )
+        self.total_lib = self.total_lib + 1
+
+    def add_binary(
+        self,
+        name,
+        srcs,
+        deps=None,
+        extra_preprocessor_flags=None,
+        extra_bench_libs=False,
+    ):
+        with open(self.path, "ab") as targets_file:
+            targets_file.write(
+                targets_cfg.binary_template.format(
+                    name=name,
+                    srcs=pretty_list(srcs),
+                    deps=pretty_list(deps),
+                    extra_preprocessor_flags=pretty_list(extra_preprocessor_flags),
+                    extra_bench_libs=extra_bench_libs,
+                ).encode("utf-8")
+            )
+        self.total_bin = self.total_bin + 1
+
+    def add_c_test(self):
+        with open(self.path, "ab") as targets_file:
+            targets_file.write(
+                b"""
+add_c_test_wrapper()
+"""
+            )
+
+    def add_test_header(self):
+        with open(self.path, "ab") as targets_file:
+            targets_file.write(
+                b"""
+        # Generate a test rule for each entry in ROCKS_TESTS
+        # Do not build the tests in opt mode, since SyncPoint and other test code
+        # will not be included.
+"""
+            )
+
+    def add_fancy_bench_config(
+        self,
+        name,
+        bench_config,
+        slow,
+        expected_runtime,
+        sl_iterations,
+        regression_threshold,
+    ):
+        with open(self.path, "ab") as targets_file:
+            targets_file.write(
+                targets_cfg.fancy_bench_template.format(
+                    name=name,
+                    bench_config=pprint.pformat(bench_config),
+                    slow=slow,
+                    expected_runtime=expected_runtime,
+                    sl_iterations=sl_iterations,
+                    regression_threshold=regression_threshold,
+                ).encode("utf-8")
+            )
+
+    def register_test(self, test_name, src, deps, extra_compiler_flags):
+        with open(self.path, "ab") as targets_file:
+            targets_file.write(
+                targets_cfg.unittests_template.format(
+                    test_name=test_name,
+                    test_cc=str(src),
+                    deps=deps,
+                    extra_compiler_flags=extra_compiler_flags,
+                ).encode("utf-8")
+            )
+        self.total_test = self.total_test + 1
diff --git a/src/rocksdb/buckifier/targets_cfg.py b/src/rocksdb/buckifier/targets_cfg.py
new file mode 100644
index 000000000..491c34d6e
--- /dev/null
+++ b/src/rocksdb/buckifier/targets_cfg.py
@@ -0,0 +1,41 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+rocksdb_target_header_template = """# This file \100generated by:
+#$ python3 buckifier/buckify_rocksdb.py{extra_argv}
+# --> DO NOT EDIT MANUALLY <--
+# This file is a Facebook-specific integration for buck builds, so can
+# only be validated by Facebook employees.
+#
+# @noautodeps @nocodemods
+load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper")
+
+"""
+
+
+library_template = """
+cpp_library_wrapper(name="{name}", srcs=[{srcs}], deps=[{deps}], headers={headers}, link_whole={link_whole}, extra_test_libs={extra_test_libs})
+"""
+
+rocksdb_library_template = """
+rocks_cpp_library_wrapper(name="{name}", srcs=[{srcs}], headers={headers})
+
+"""
+
+
+binary_template = """
+cpp_binary_wrapper(name="{name}", srcs=[{srcs}], deps=[{deps}], extra_preprocessor_flags=[{extra_preprocessor_flags}], extra_bench_libs={extra_bench_libs})
+"""
+
+unittests_template = """
+cpp_unittest_wrapper(name="{test_name}",
+            srcs=["{test_cc}"],
+            deps={deps},
+            extra_compiler_flags={extra_compiler_flags})
+
+"""
+
+fancy_bench_template = """
+fancy_bench_wrapper(suite_name="{name}", binary_to_bench_to_metric_list_map={bench_config}, slow={slow}, expected_runtime={expected_runtime}, sl_iterations={sl_iterations}, regression_threshold={regression_threshold})
+
+"""
diff --git a/src/rocksdb/buckifier/util.py b/src/rocksdb/buckifier/util.py
new file mode 100644
index 000000000..8943fed2b
--- /dev/null
+++ b/src/rocksdb/buckifier/util.py
@@ -0,0 +1,118 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+"""
+This module keeps commonly used components.
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+try:
+    from builtins import object
+except ImportError:
+    from __builtin__ import object
+import os
+import subprocess
+import sys
+import time
+
+
+class ColorString(object):
+    """Generate colorful strings on terminal"""
+
+    HEADER = "\033[95m"
+    BLUE = "\033[94m"
+    GREEN = "\033[92m"
+    WARNING = "\033[93m"
+    FAIL = "\033[91m"
+    ENDC = "\033[0m"
+
+    @staticmethod
+    def _make_color_str(text, color):
+        # In Python2, default encoding for unicode string is ASCII
+        if sys.version_info.major <= 2:
+            return "".join([color, text.encode("utf-8"), ColorString.ENDC])
+        # From Python3, default encoding for unicode string is UTF-8
+        return "".join([color, text, ColorString.ENDC])
+
+    @staticmethod
+    def ok(text):
+        if ColorString.is_disabled:
+            return text
+        return ColorString._make_color_str(text, ColorString.GREEN)
+
+    @staticmethod
+    def info(text):
+        if ColorString.is_disabled:
+            return text
+        return ColorString._make_color_str(text, ColorString.BLUE)
+
+    @staticmethod
+    def header(text):
+        if ColorString.is_disabled:
+            return text
+        return ColorString._make_color_str(text, ColorString.HEADER)
+
+    @staticmethod
+    def error(text):
+        if ColorString.is_disabled:
+            return text
+        return ColorString._make_color_str(text, ColorString.FAIL)
+
+    @staticmethod
+    def warning(text):
+        if ColorString.is_disabled:
+            return text
+        return ColorString._make_color_str(text, ColorString.WARNING)
+
+    is_disabled = False
+
+
+def run_shell_command(shell_cmd, cmd_dir=None):
+    """Run a single shell command.
+    @returns a tuple of shell command return code, stdout, stderr"""
+
+    if cmd_dir is not None and not os.path.exists(cmd_dir):
+        run_shell_command("mkdir -p %s" % cmd_dir)
+
+    start = time.time()
+    print("\t>>> Running: " + shell_cmd)
+    p = subprocess.Popen(  # noqa
+        shell_cmd,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        cwd=cmd_dir,
+    )
+    stdout, stderr = p.communicate()
+    end = time.time()
+
+    # Report time if we spent more than 5 minutes executing a command
+    execution_time = end - start
+    if execution_time > (60 * 5):
+        mins = execution_time / 60
+        secs = execution_time % 60
+        print("\t>time spent: %d minutes %d seconds" % (mins, secs))
+
+    return p.returncode, stdout, stderr
+
+
+def run_shell_commands(shell_cmds, cmd_dir=None, verbose=False):
+    """Execute a sequence of shell commands, which is equivalent to
+    running `cmd1 && cmd2 && cmd3`
+    @returns boolean indication if all commands succeeds.
+    """
+
+    if cmd_dir:
+        print("\t=== Set current working directory => %s" % cmd_dir)
+
+    for shell_cmd in shell_cmds:
+        ret_code, stdout, stderr = run_shell_command(shell_cmd, cmd_dir)
+        if stdout:
+            if verbose or ret_code != 0:
+                print(ColorString.info("stdout: \n"), stdout)
+        if stderr:
+            # contents in stderr is not necessarily to be error messages.
+            if verbose or ret_code != 0:
+                print(ColorString.error("stderr: \n"), stderr)
+        if ret_code != 0:
+            return False
+
+    return True
diff --git a/src/rocksdb/build_tools/amalgamate.py b/src/rocksdb/build_tools/amalgamate.py
new file mode 100755
index 000000000..f79e9075e
--- /dev/null
+++ b/src/rocksdb/build_tools/amalgamate.py
@@ -0,0 +1,168 @@
+#!/usr/bin/python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+# amalgamate.py creates an amalgamation from a unity build.
+# It can be run with either Python 2 or 3.
+# An amalgamation consists of a header that includes the contents of all public
+# headers and a source file that includes the contents of all source files and
+# private headers.
+#
+# This script works by starting with the unity build file and recursively expanding
+# #include directives. If the #include is found in a public include directory,
+# that header is expanded into the amalgamation header.
+#
+# A particular header is only expanded once, so this script will
+# break if there are multiple inclusions of the same header that are expected to
+# expand differently. Similarly, this type of code causes issues:
+#
+# #ifdef FOO
+#   #include "bar.h"
+#   // code here
+# #else
+#   #include "bar.h"            // oops, doesn't get expanded
+#   // different code here
+# #endif
+#
+# The solution is to move the include out of the #ifdef.
+
+from __future__ import print_function
+
+import argparse
+import re
+import sys
+from os import path
+
+include_re = re.compile('^[ \t]*#include[ \t]+"(.*)"[ \t]*$')
+included = set()
+excluded = set()
+
+
+def find_header(name, abs_path, include_paths):
+    samedir = path.join(path.dirname(abs_path), name)
+    if path.exists(samedir):
+        return samedir
+    for include_path in include_paths:
+        include_path = path.join(include_path, name)
+        if path.exists(include_path):
+            return include_path
+    return None
+
+
+def expand_include(
+    include_path,
+    f,
+    abs_path,
+    source_out,
+    header_out,
+    include_paths,
+    public_include_paths,
+):
+    if include_path in included:
+        return False
+
+    included.add(include_path)
+    with open(include_path) as f:
+        print('#line 1 "{}"'.format(include_path), file=source_out)
+        process_file(
+            f, include_path, source_out, header_out, include_paths, public_include_paths
+        )
+    return True
+
+
+def process_file(
+    f, abs_path, source_out, header_out, include_paths, public_include_paths
+):
+    for (line, text) in enumerate(f):
+        m = include_re.match(text)
+        if m:
+            filename = m.groups()[0]
+            # first check private headers
+            include_path = find_header(filename, abs_path, include_paths)
+            if include_path:
+                if include_path in excluded:
+                    source_out.write(text)
+                    expanded = False
+                else:
+                    expanded = expand_include(
+                        include_path,
+                        f,
+                        abs_path,
+                        source_out,
+                        header_out,
+                        include_paths,
+                        public_include_paths,
+                    )
+            else:
+                # now try public headers
+                include_path = find_header(filename, abs_path, public_include_paths)
+                if include_path:
+                    # found public header
+                    expanded = False
+                    if include_path in excluded:
+                        source_out.write(text)
+                    else:
+                        expand_include(
+                            include_path,
+                            f,
+                            abs_path,
+                            header_out,
+                            None,
+                            public_include_paths,
+                            [],
+                        )
+                else:
+                    sys.exit(
+                        "unable to find {}, included in {} on line {}".format(
+                            filename, abs_path, line
+                        )
+                    )
+
+            if expanded:
+                print('#line {} "{}"'.format(line + 1, abs_path), file=source_out)
+        elif text != "#pragma once\n":
+            source_out.write(text)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Transform a unity build into an amalgamation"
+    )
+    parser.add_argument("source", help="source file")
+    parser.add_argument(
+        "-I",
+        action="append",
+        dest="include_paths",
+        help="include paths for private headers",
+    )
+    parser.add_argument(
+        "-i",
+        action="append",
+        dest="public_include_paths",
+        help="include paths for public headers",
+    )
+    parser.add_argument(
+        "-x", action="append", dest="excluded", help="excluded header files"
+    )
+    parser.add_argument("-o", dest="source_out", help="output C++ file", required=True)
+    parser.add_argument(
+        "-H", dest="header_out", help="output C++ header file", required=True
+    )
+    args = parser.parse_args()
+
+    include_paths = list(map(path.abspath, args.include_paths or []))
+    public_include_paths = list(map(path.abspath, args.public_include_paths or []))
+    excluded.update(map(path.abspath, args.excluded or []))
+    filename = args.source
+    abs_path = path.abspath(filename)
+    with open(filename) as f, open(args.source_out, "w") as source_out, open(
+        args.header_out, "w"
+    ) as header_out:
+        print('#line 1 "{}"'.format(filename), file=source_out)
+        print('#include "{}"'.format(header_out.name), file=source_out)
+        process_file(
+            f, abs_path, source_out, header_out, include_paths, public_include_paths
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/rocksdb/build_tools/benchmark_log_tool.py b/src/rocksdb/build_tools/benchmark_log_tool.py
new file mode 100755
index 000000000..d1ad45911
--- /dev/null
+++ b/src/rocksdb/build_tools/benchmark_log_tool.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+#  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+"""Access the results of benchmark runs
+Send these results on to OpenSearch graphing service
+"""
+
+import argparse
+import itertools
+import logging
+import os
+import re
+import sys
+
+import requests
+from dateutil import parser
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+class Configuration:
+    opensearch_user = os.environ["ES_USER"]
+    opensearch_pass = os.environ["ES_PASS"]
+
+
+class BenchmarkResultException(Exception):
+    def __init__(self, message, content):
+        super().__init__(self, message)
+        self.content = content
+
+
+class BenchmarkUtils:
+
+    expected_keys = [
+        "ops_sec",
+        "mb_sec",
+        "lsm_sz",
+        "blob_sz",
+        "c_wgb",
+        "w_amp",
+        "c_mbps",
+        "c_wsecs",
+        "c_csecs",
+        "b_rgb",
+        "b_wgb",
+        "usec_op",
+        "p50",
+        "p99",
+        "p99.9",
+        "p99.99",
+        "pmax",
+        "uptime",
+        "stall%",
+        "Nstall",
+        "u_cpu",
+        "s_cpu",
+        "rss",
+        "test",
+        "date",
+        "version",
+        "job_id",
+    ]
+
+    def sanity_check(row):
+        if "test" not in row:
+            logging.debug(f"not 'test' in row: {row}")
+            return False
+        if row["test"] == "":
+            logging.debug(f"row['test'] == '': {row}")
+            return False
+        if "date" not in row:
+            logging.debug(f"not 'date' in row: {row}")
+            return False
+        if "ops_sec" not in row:
+            logging.debug(f"not 'ops_sec' in row: {row}")
+            return False
+        try:
+            _ = int(row["ops_sec"])
+        except (ValueError, TypeError):
+            logging.debug(f"int(row['ops_sec']): {row}")
+            return False
+        try:
+            (_, _) = parser.parse(row["date"], fuzzy_with_tokens=True)
+        except (parser.ParserError):
+            logging.error(
+                f"parser.parse((row['date']): not a valid format for date in row: {row}"
+            )
+            return False
+        return True
+
+    def conform_opensearch(row):
+        (dt, _) = parser.parse(row["date"], fuzzy_with_tokens=True)
+        # create a test_date field, which was previously what was expected
+        # repair the date field, which has what can be a WRONG ISO FORMAT, (no leading 0 on single-digit day-of-month)
+        # e.g. 2022-07-1T00:14:55 should be 2022-07-01T00:14:55
+        row["test_date"] = dt.isoformat()
+        row["date"] = dt.isoformat()
+        return {key.replace(".", "_"): value for key, value in row.items()}
+
+
+class ResultParser:
+    def __init__(self, field="(\w|[+-:.%])+", intrafield="(\s)+", separator="\t"):
+        self.field = re.compile(field)
+        self.intra = re.compile(intrafield)
+        self.sep = re.compile(separator)
+
+    def ignore(self, l_in: str):
+        if len(l_in) == 0:
+            return True
+        if l_in[0:1] == "#":
+            return True
+        return False
+
+    def line(self, line_in: str):
+        """Parse a line into items
+        Being clever about separators
+        """
+        line = line_in
+        row = []
+        while line != "":
+            match_item = self.field.match(line)
+            if match_item:
+                item = match_item.group(0)
+                row.append(item)
+                line = line[len(item) :]
+            else:
+                match_intra = self.intra.match(line)
+                if match_intra:
+                    intra = match_intra.group(0)
+                    # Count the separators
+                    # If there are >1 then generate extra blank fields
+                    # White space with no true separators fakes up a single separator
+                    tabbed = self.sep.split(intra)
+                    sep_count = len(tabbed) - 1
+                    if sep_count == 0:
+                        sep_count = 1
+                    for _ in range(sep_count - 1):
+                        row.append("")
+                    line = line[len(intra) :]
+                else:
+                    raise BenchmarkResultException(
+                        "Invalid TSV line", f"{line_in} at {line}"
+                    )
+        return row
+
+    def parse(self, lines):
+        """Parse something that iterates lines"""
+        rows = [self.line(line) for line in lines if not self.ignore(line)]
+        header = rows[0]
+        width = len(header)
+        records = [
+            {k: v for (k, v) in itertools.zip_longest(header, row[:width])}
+            for row in rows[1:]
+        ]
+        return records
+
+
+def load_report_from_tsv(filename: str):
+    file = open(filename, "r")
+    contents = file.readlines()
+    file.close()
+    parser = ResultParser()
+    report = parser.parse(contents)
+    logging.debug(f"Loaded TSV Report: {report}")
+    return report
+
+
+def push_report_to_opensearch(report, esdocument):
+    sanitized = [
+        BenchmarkUtils.conform_opensearch(row)
+        for row in report
+        if BenchmarkUtils.sanity_check(row)
+    ]
+    logging.debug(
+        f"upload {len(sanitized)} sane of {len(report)} benchmarks to opensearch"
+    )
+    for single_benchmark in sanitized:
+        logging.debug(f"upload benchmark: {single_benchmark}")
+        response = requests.post(
+            esdocument,
+            json=single_benchmark,
+            auth=(os.environ["ES_USER"], os.environ["ES_PASS"]),
+        )
+        logging.debug(
+            f"Sent to OpenSearch, status: {response.status_code}, result: {response.text}"
+        )
+        response.raise_for_status()
+
+
+def push_report_to_null(report):
+
+    for row in report:
+        if BenchmarkUtils.sanity_check(row):
+            logging.debug(f"row {row}")
+            conformed = BenchmarkUtils.conform_opensearch(row)
+            logging.debug(f"conformed row {conformed}")
+
+
+def main():
+    """Tool for fetching, parsing and uploading benchmark results to OpenSearch / ElasticSearch
+    This tool will
+
+    (1) Open a local tsv benchmark report file
+    (2) Upload to OpenSearch document, via https/JSON
+    """
+
+    parser = argparse.ArgumentParser(description="CircleCI benchmark scraper.")
+
+    # --tsvfile is the name of the file to read results from
+    # --esdocument is the ElasticSearch document to push these results into
+    #
+    parser.add_argument(
+        "--tsvfile",
+        default="build_tools/circle_api_scraper_input.txt",
+        help="File from which to read tsv report",
+    )
+    parser.add_argument(
+        "--esdocument",
+        help="ElasticSearch/OpenSearch document URL to upload report into",
+    )
+    parser.add_argument(
+        "--upload", choices=["opensearch", "none"], default="opensearch"
+    )
+
+    args = parser.parse_args()
+    logging.debug(f"Arguments: {args}")
+    reports = load_report_from_tsv(args.tsvfile)
+    if args.upload == "opensearch":
+        push_report_to_opensearch(reports, args.esdocument)
+    else:
+        push_report_to_null(reports)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/rocksdb/build_tools/build_detect_platform b/src/rocksdb/build_tools/build_detect_platform
new file mode 100755
index 000000000..15129411a
--- /dev/null
+++ b/src/rocksdb/build_tools/build_detect_platform
@@ -0,0 +1,906 @@
+#!/usr/bin/env bash
+#
+# Detects OS we're compiling on and outputs a file specified by the first
+# argument, which in turn gets read while processing Makefile.
+#
+# The output will set the following variables:
+#   CC                          C Compiler path
+#   CXX                         C++ Compiler path
+#   PLATFORM_LDFLAGS            Linker flags
+#   JAVA_LDFLAGS                Linker flags for RocksDBJava
+#   JAVA_STATIC_LDFLAGS         Linker flags for RocksDBJava static build
+#   JAVAC_ARGS                  Arguments for javac
+#   PLATFORM_SHARED_EXT         Extension for shared libraries
+#   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
+#   PLATFORM_SHARED_CFLAGS      Flags for compiling objects for shared library
+#   PLATFORM_CCFLAGS            C compiler flags
+#   PLATFORM_CXXFLAGS           C++ compiler flags.  Will contain:
+#   PLATFORM_SHARED_VERSIONED   Set to 'true' if platform supports versioned
+#                               shared libraries, empty otherwise.
+#   FIND			Command for the find utility
+#   WATCH			Command for the watch utility
+#
+# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
+#
+#       -DROCKSDB_PLATFORM_POSIX    if posix-platform based
+#       -DSNAPPY                    if the Snappy library is present
+#       -DLZ4                       if the LZ4 library is present
+#       -DZSTD                      if the ZSTD library is present
+#       -DNUMA                      if the NUMA library is present
+#       -DTBB                       if the TBB library is present
+#       -DMEMKIND                   if the memkind library is present
+#
+# Using gflags in rocksdb:
+# Our project depends on gflags, which requires users to take some extra steps
+# before they can compile the whole repository:
+#   1. Install gflags. You may download it from here:
+#      https://gflags.github.io/gflags/ (Mac users can `brew install gflags`)
+#   2. Once installed, add the include path for gflags to your CPATH env var and
+#      the lib path to LIBRARY_PATH. If installed with default settings, the lib
+#      will be /usr/local/lib and the include path will be /usr/local/include
+
+OUTPUT=$1
+if test -z "$OUTPUT"; then
+  echo "usage: $0 <output-filename>" >&2
+  exit 1
+fi
+
+# we depend on C++17, but should be compatible with newer standards
+if [ "$ROCKSDB_CXX_STANDARD" ]; then
+  PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD"
+else
+  PLATFORM_CXXFLAGS="-std=c++17"
+fi
+
+# we currently depend on POSIX platform
+COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX"
+
+# Default to fbcode gcc on internal fb machines
+if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
+    FBCODE_BUILD="true"
+    # If we're compiling with TSAN or shared lib, we need pic build
+    PIC_BUILD=$COMPILE_WITH_TSAN
+    if [ "$LIB_MODE" == "shared" ]; then
+      PIC_BUILD=1
+    fi
+    if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM010" ]; then
+      source "$PWD/build_tools/fbcode_config_platform010.sh"
+    elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM009" ]; then
+      source "$PWD/build_tools/fbcode_config_platform009.sh"
+    else
+      source "$PWD/build_tools/fbcode_config_platform009.sh"
+    fi
+fi
+
+# Delete existing output, if it exists
+rm -f "$OUTPUT"
+touch "$OUTPUT"
+
+if test -z "$CC"; then
+    if [ -x "$(command -v cc)" ]; then
+        CC=cc
+    elif [ -x "$(command -v clang)" ]; then
+        CC=clang
+    else
+        CC=cc
+    fi
+fi
+
+if test -z "$CXX"; then
+    if [ -x "$(command -v g++)" ]; then
+        CXX=g++
+    elif [ -x "$(command -v clang++)" ]; then
+        CXX=clang++
+    else
+        CXX=g++
+    fi
+fi
+
+if test -z "$AR"; then
+    if [ -x "$(command -v gcc-ar)" ]; then
+        AR=gcc-ar
+    elif [ -x "$(command -v llvm-ar)" ]; then
+        AR=llvm-ar
+    else
+        AR=ar
+    fi
+fi
+
+# Detect OS
+if test -z "$TARGET_OS"; then
+    TARGET_OS=`uname -s`
+fi
+
+if test -z "$TARGET_ARCHITECTURE"; then
+    TARGET_ARCHITECTURE=`uname -m`
+fi
+
+if test -z "$CLANG_SCAN_BUILD"; then
+    CLANG_SCAN_BUILD=scan-build
+fi
+
+if test -z "$CLANG_ANALYZER"; then
+    CLANG_ANALYZER=$(command -v clang++ 2> /dev/null)
+fi
+
+if test -z "$FIND"; then
+    FIND=find
+fi
+
+if test -z "$WATCH"; then
+    WATCH=watch
+fi
+
+COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
+CROSS_COMPILE=
+PLATFORM_CCFLAGS=
+PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
+PLATFORM_SHARED_EXT="so"
+PLATFORM_SHARED_LDFLAGS="-Wl,--no-as-needed -shared -Wl,-soname -Wl,"
+PLATFORM_SHARED_CFLAGS="-fPIC"
+PLATFORM_SHARED_VERSIONED=true
+
+# generic port files (working on all platform by #ifdef) go directly in /port
+GENERIC_PORT_FILES=`cd "$ROCKSDB_ROOT"; find port -name '*.cc' | tr "\n" " "`
+
+# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
+case "$TARGET_OS" in
+    Darwin)
+        PLATFORM=OS_MACOSX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        # PORT_FILES=port/darwin/darwin_specific.cc
+        ;;
+    IOS)
+        PLATFORM=IOS
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        CROSS_COMPILE=true
+        PLATFORM_SHARED_VERSIONED=
+        ;;
+    Linux)
+        PLATFORM=OS_LINUX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
+        if [ -z "$USE_CLANG" ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
+        else
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic"
+        fi
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -ldl"
+        if test -z "$ROCKSDB_USE_IO_URING"; then
+            ROCKSDB_USE_IO_URING=1
+        fi
+        if test "$ROCKSDB_USE_IO_URING" -ne 0; then
+            # check for liburing
+            $CXX $PLATFORM_CXXFLAGS -x c++ - -luring -o test.o 2>/dev/null  <<EOF
+              #include <liburing.h>
+              int main() {
+                struct io_uring ring;
+                io_uring_queue_init(1, &ring, 0);
+                return 0;
+              }
+EOF
+            if [ "$?" = 0 ]; then
+                PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -luring"
+                COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_IOURING_PRESENT"
+            fi
+        fi
+        # PORT_FILES=port/linux/linux_specific.cc
+        ;;
+    SunOS)
+        PLATFORM=OS_SOLARIS
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS -m64"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -static-libstdc++ -static-libgcc -m64"
+        # PORT_FILES=port/sunos/sunos_specific.cc
+        ;;
+    AIX)
+        PLATFORM=OS_AIX
+        CC=gcc
+        COMMON_FLAGS="$COMMON_FLAGS -maix64 -pthread -fno-builtin-memcmp -D_REENTRANT -DOS_AIX -D__STDC_FORMAT_MACROS"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread -lpthread -lrt -maix64 -static-libstdc++ -static-libgcc"
+        # PORT_FILES=port/aix/aix_specific.cc
+        ;;
+    FreeBSD)
+        PLATFORM=OS_FREEBSD
+        CXX=clang++
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/freebsd/freebsd_specific.cc
+        ;;
+    GNU/kFreeBSD)
+        PLATFORM=OS_GNU_KFREEBSD
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_GNU_KFREEBSD"
+        if [ -z "$USE_CLANG" ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
+        else
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic"
+        fi
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/gnu_kfreebsd/gnu_kfreebsd_specific.cc
+        ;;
+    NetBSD)
+        PLATFORM=OS_NETBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
+        # PORT_FILES=port/netbsd/netbsd_specific.cc
+        ;;
+    OpenBSD)
+        PLATFORM=OS_OPENBSD
+	CXX=clang++
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
+        # PORT_FILES=port/openbsd/openbsd_specific.cc
+	FIND=gfind
+	WATCH=gnuwatch
+        ;;
+    DragonFly)
+        PLATFORM=OS_DRAGONFLYBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/dragonfly/dragonfly_specific.cc
+        ;;
+    Cygwin)
+        PLATFORM=CYGWIN
+        PLATFORM_SHARED_CFLAGS=""
+        PLATFORM_CXXFLAGS="-std=gnu++11"
+        COMMON_FLAGS="$COMMON_FLAGS -DCYGWIN"
+        if [ -z "$USE_CLANG" ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
+        else
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic"
+        fi
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/linux/linux_specific.cc
+        ;;
+    OS_ANDROID_CROSSCOMPILE)
+        PLATFORM=OS_ANDROID
+	COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DROCKSDB_PLATFORM_POSIX"
+	PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS "  # All pthread features are in the Android C library
+        # PORT_FILES=port/android/android.cc
+        CROSS_COMPILE=true
+        ;;
+    *)
+        echo "Unknown platform!" >&2
+        exit 1
+esac
+
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
+JAVA_LDFLAGS="$PLATFORM_LDFLAGS"
+JAVA_STATIC_LDFLAGS="$PLATFORM_LDFLAGS"
+JAVAC_ARGS="-source 8"
+
+if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
+    # Cross-compiling; do not try any compilation tests.
+    # Also don't need any compilation tests if compiling on fbcode
+    if [ "$FBCODE_BUILD" = "true" ]; then
+      # Enable backtrace on fbcode since the necessary libraries are present
+      COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE"
+      FOLLY_DIR="third-party/folly"
+    fi
+    true
+else
+    if ! test $ROCKSDB_DISABLE_FALLOCATE; then
+        # Test whether fallocate is available
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <fcntl.h>
+          #include <linux/falloc.h>
+          int main() {
+      int fd = open("/dev/null", 0);
+      fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1024);
+          }
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_SNAPPY; then
+        # Test whether Snappy library is installed
+        # http://code.google.com/p/snappy/
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <snappy.h>
+          int main() {}
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -lsnappy"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_GFLAGS; then
+        # Test whether gflags library is installed
+        # http://gflags.github.io/gflags/
+        # check if the namespace is gflags
+        if $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
+          #include <gflags/gflags.h>
+          using namespace GFLAGS_NAMESPACE;
+          int main() {}
+EOF
+        then
+          COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1"
+          PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+        # check if namespace is gflags
+        elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
+            #include <gflags/gflags.h>
+            using namespace gflags;
+            int main() {}
+EOF
+        then
+          COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags"
+          PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+        # check if namespace is google
+        elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
+            #include <gflags/gflags.h>
+            using namespace google;
+            int main() {}
+EOF
+        then
+          COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=google"
+          PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_ZLIB; then
+        # Test whether zlib library is installed
+        $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <zlib.h>
+          int main() {}
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -lz"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_BZIP; then
+        # Test whether bzip library is installed
+        $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <bzlib.h>
+          int main() {}
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -lbz2"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_LZ4; then
+        # Test whether lz4 library is installed
+        $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <lz4.h>
+          #include <lz4hc.h>
+          int main() {}
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DLZ4"
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -llz4"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_ZSTD; then
+        # Test whether zstd library is installed
+        $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+          #include <zstd.h>
+          int main() {}
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DZSTD"
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lzstd"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -lzstd"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_NUMA; then
+        # Test whether numa is available
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -lnuma 2>/dev/null  <<EOF
+          #include <numa.h>
+          #include <numaif.h>
+          int main() {}
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DNUMA"
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lnuma"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -lnuma"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_TBB; then
+        # Test whether tbb is available
+        $CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o test.o -ltbb 2>/dev/null  <<EOF
+          #include <tbb/tbb.h>
+          int main() {}
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DTBB"
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltbb"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -ltbb"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_JEMALLOC; then
+        # Test whether jemalloc is available
+        if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -ljemalloc \
+          2>/dev/null; then
+            # This will enable some preprocessor identifiers in the Makefile
+            JEMALLOC=1
+            # JEMALLOC can be enabled either using the flag (like here) or by
+            # providing direct link to the jemalloc library
+            WITH_JEMALLOC_FLAG=1
+            # check for JEMALLOC installed with HomeBrew
+            if [ "$PLATFORM" == "OS_MACOSX" ]; then
+                if hash brew 2>/dev/null && brew ls --versions jemalloc > /dev/null; then
+                    JEMALLOC_VER=$(brew ls --versions jemalloc | tail -n 1 | cut -f 2 -d ' ')
+                    JEMALLOC_INCLUDE="-I/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/include"
+                    JEMALLOC_LIB="/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a"
+                    PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $JEMALLOC_LIB"
+                    JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS $JEMALLOC_LIB"
+                fi
+            fi
+        fi
+    fi
+    if ! test $JEMALLOC && ! test $ROCKSDB_DISABLE_TCMALLOC; then
+        # jemalloc is not available. Let's try tcmalloc
+        if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o \
+          -ltcmalloc 2>/dev/null; then
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_MALLOC_USABLE_SIZE; then
+        # Test whether malloc_usable_size is available
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <malloc.h>
+          int main() {
+            size_t res = malloc_usable_size(0);
+            (void)res;
+            return 0;
+          }
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_MALLOC_USABLE_SIZE"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_MEMKIND; then
+        # Test whether memkind library is installed
+        $CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o test.o -lmemkind 2>/dev/null  <<EOF
+          #include <memkind.h>
+          int main() {
+            memkind_malloc(MEMKIND_DAX_KMEM, 1024);
+            return 0;
+          }
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DMEMKIND"
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lmemkind"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -lmemkind"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_PTHREAD_MUTEX_ADAPTIVE_NP; then
+        # Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <pthread.h>
+          int main() {
+            int x = PTHREAD_MUTEX_ADAPTIVE_NP;
+            (void)x;
+            return 0;
+          }
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_BACKTRACE; then
+        # Test whether backtrace is available
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <execinfo.h>
+          int main() {
+            void* frames[1];
+            backtrace_symbols(frames, backtrace(frames, 1));
+            return 0;
+          }
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE"
+        else
+            # Test whether execinfo library is installed
+            $CXX $PLATFORM_CXXFLAGS -lexecinfo -x c++ - -o test.o 2>/dev/null  <<EOF
+              #include <execinfo.h>
+              int main() {
+                void* frames[1];
+                backtrace_symbols(frames, backtrace(frames, 1));
+              }
+EOF
+            if [ "$?" = 0 ]; then
+                COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE"
+                PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lexecinfo"
+                JAVA_LDFLAGS="$JAVA_LDFLAGS -lexecinfo"
+            fi
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_PG; then
+        # Test if -pg is supported
+        $CXX $PLATFORM_CXXFLAGS -pg -x c++ - -o test.o 2>/dev/null  <<EOF
+          int main() {
+            return 0;
+          }
+EOF
+        if [ "$?" = 0 ]; then
+            PROFILING_FLAGS=-pg
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_SYNC_FILE_RANGE; then
+        # Test whether sync_file_range is supported for compatibility with an old glibc
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <fcntl.h>
+          int main() {
+            int fd = open("/dev/null", 0);
+            sync_file_range(fd, 0, 1024, SYNC_FILE_RANGE_WRITE);
+          }
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_RANGESYNC_PRESENT"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_SCHED_GETCPU; then
+        # Test whether sched_getcpu is supported
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <sched.h>
+          int main() {
+            int cpuid = sched_getcpu();
+            (void)cpuid;
+          }
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_SCHED_GETCPU_PRESENT"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_AUXV_GETAUXVAL; then
+        # Test whether getauxval is supported
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null  <<EOF
+          #include <sys/auxv.h>
+          int main() {
+            uint64_t auxv = getauxval(AT_HWCAP);
+            (void)auxv;
+          }
+EOF
+        if [ "$?" = 0 ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_AUXV_GETAUXVAL_PRESENT"
+        fi
+    fi
+
+    if ! test $ROCKSDB_DISABLE_ALIGNED_NEW; then
+        # Test whether c++17 aligned-new is supported
+        $CXX $PLATFORM_CXXFLAGS -faligned-new -x c++ - -o test.o 2>/dev/null <<EOF
+            struct alignas(1024) t {int a;};
+            int main() {}
+EOF
+        if [ "$?" = 0 ]; then
+            PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS -faligned-new -DHAVE_ALIGNED_NEW"
+        fi
+    fi
+    if ! test $ROCKSDB_DISABLE_BENCHMARK; then
+        # Test whether google benchmark is available
+        $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null -lbenchmark -lpthread 2>/dev/null  <<EOF
+          #include <benchmark/benchmark.h>
+          int main() {}
+EOF
+        if [ "$?" = 0 ]; then
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbenchmark"
+        fi
+    fi
+    if test $USE_FOLLY; then
+        # Test whether libfolly library is installed
+        $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+          #include <folly/synchronization/DistributedMutex.h>
+          int main() {}
+EOF
+        if [ "$?" != 0 ]; then
+          FOLLY_DIR="./third-party/folly"
+        fi
+    fi
+
+fi
+
+# TODO(tec): Fix -Wshorten-64-to-32 errors on FreeBSD and enable the warning.
+# -Wshorten-64-to-32 breaks compilation on FreeBSD aarch64 and i386
+if ! { [ "$TARGET_OS" = FreeBSD ] && [ "$TARGET_ARCHITECTURE" = arm64 -o "$TARGET_ARCHITECTURE" = i386 ]; }; then
+  # Test whether -Wshorten-64-to-32 is available
+  $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -Wshorten-64-to-32 2>/dev/null  <<EOF
+    int main() {}
+EOF
+  if [ "$?" = 0 ]; then
+    COMMON_FLAGS="$COMMON_FLAGS -Wshorten-64-to-32"
+  fi
+fi
+
+if test "0$PORTABLE" -eq 0; then
+  if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then
+    # Tune for this POWER processor, treating '+' models as base models
+    POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+`
+    COMMON_FLAGS="$COMMON_FLAGS -mcpu=$POWER -mtune=$POWER "
+  elif test -n "`echo $TARGET_ARCHITECTURE | grep -e^arm -e^aarch64`"; then
+    # TODO: Handle this with approprite options.
+    COMMON_FLAGS="$COMMON_FLAGS"
+  elif test -n "`echo $TARGET_ARCHITECTURE | grep ^aarch64`"; then
+    COMMON_FLAGS="$COMMON_FLAGS"
+  elif test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
+    if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ \
+      -march=native - -o /dev/null 2>/dev/null; then
+      COMMON_FLAGS="$COMMON_FLAGS -march=native "
+    else
+      COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
+    fi
+    COMMON_FLAGS="$COMMON_FLAGS"
+  elif test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
+    RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
+    COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
+  elif [ "$TARGET_OS" == "IOS" ]; then
+    COMMON_FLAGS="$COMMON_FLAGS"
+  elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then
+    # TODO: Not sure why we don't use -march=native on these OSes
+    if test "$USE_SSE"; then
+      TRY_SSE_ETC="1"
+    fi
+  else
+    COMMON_FLAGS="$COMMON_FLAGS -march=native "
+  fi
+else
+  # PORTABLE=1
+  if test "$USE_SSE"; then
+    TRY_SSE_ETC="1"
+  fi
+
+  if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
+    COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
+  fi
+
+  if test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
+    RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
+    COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
+  fi
+
+  if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then
+    # For portability compile for macOS 10.13 (2017) or newer
+    COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.13"
+    PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.13"
+    # -mmacosx-version-min must come first here.
+    PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.13 $PLATFORM_SHARED_LDFLAGS"
+    PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13"
+    JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.13"
+    JAVA_STATIC_DEPS_LDFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
+    JAVA_STATIC_DEPS_CCFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
+    JAVA_STATIC_DEPS_CXXFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
+  fi
+fi
+
+if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then
+  # check for GNU libc on ppc64
+  $CXX -x c++ - -o /dev/null 2>/dev/null <<EOF
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <gnu/libc-version.h>
+
+    int main(int argc, char *argv[]) {
+      printf("GNU libc version: %s\n", gnu_get_libc_version());
+      return 0;
+    }
+EOF
+  if [ "$?" != 0 ]; then
+      PPC_LIBC_IS_GNU=0
+  fi
+fi
+
+if test "$TRY_SSE_ETC"; then
+  # The USE_SSE flag now means "attempt to compile with widely-available
+  # Intel architecture extensions utilized by specific optimizations in the
+  # source code." It's a qualifier on PORTABLE=1 that means "mostly portable."
+  # It doesn't even really check that your current CPU is compatible.
+  #
+  # SSE4.2 available since nehalem, ca. 2008-2010
+  # Includes POPCNT for BitsSetToOne, BitParity
+  TRY_SSE42="-msse4.2"
+  # PCLMUL available since westmere, ca. 2010-2011
+  TRY_PCLMUL="-mpclmul"
+  # AVX2 available since haswell, ca. 2013-2015
+  TRY_AVX2="-mavx2"
+  # BMI available since haswell, ca. 2013-2015
+  # Primarily for TZCNT for CountTrailingZeroBits
+  TRY_BMI="-mbmi"
+  # LZCNT available since haswell, ca. 2013-2015
+  # For FloorLog2
+  TRY_LZCNT="-mlzcnt"
+fi
+
+$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_SSE42 -x c++ - -o test.o 2>/dev/null <<EOF
+  #include <cstdint>
+  #include <nmmintrin.h>
+  int main() {
+    volatile uint32_t x = _mm_crc32_u32(0, 0);
+    (void)x;
+  }
+EOF
+if [ "$?" = 0 ]; then
+  COMMON_FLAGS="$COMMON_FLAGS $TRY_SSE42 -DHAVE_SSE42"
+elif test "$USE_SSE"; then
+  echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2
+fi
+
+$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_PCLMUL -x c++ - -o test.o 2>/dev/null <<EOF
+  #include <cstdint>
+  #include <wmmintrin.h>
+  int main() {
+    const auto a = _mm_set_epi64x(0, 0);
+    const auto b = _mm_set_epi64x(0, 0);
+    const auto c = _mm_clmulepi64_si128(a, b, 0x00);
+    auto d = _mm_cvtsi128_si64(c);
+    (void)d;
+  }
+EOF
+if [ "$?" = 0 ]; then
+  COMMON_FLAGS="$COMMON_FLAGS $TRY_PCLMUL -DHAVE_PCLMUL"
+elif test "$USE_SSE"; then
+  echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" >&2
+fi
+
+$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o test.o 2>/dev/null <<EOF
+  #include <cstdint>
+  #include <immintrin.h>
+  int main() {
+    const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5);
+    const auto b = _mm256_permutevar8x32_epi32(a, a);
+    (void)b;
+  }
+EOF
+if [ "$?" = 0 ]; then
+  COMMON_FLAGS="$COMMON_FLAGS $TRY_AVX2 -DHAVE_AVX2"
+elif test "$USE_SSE"; then
+  echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2
+fi
+
+$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_BMI -x c++ - -o test.o 2>/dev/null <<EOF
+  #include <cstdint>
+  #include <immintrin.h>
+  int main(int argc, char *argv[]) {
+    (void)argv;
+    return (int)_tzcnt_u64((uint64_t)argc);
+  }
+EOF
+if [ "$?" = 0 ]; then
+  COMMON_FLAGS="$COMMON_FLAGS $TRY_BMI -DHAVE_BMI"
+elif test "$USE_SSE"; then
+  echo "warning: USE_SSE specified but compiler could not use BMI intrinsics, disabling" >&2
+fi
+
+$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_LZCNT -x c++ - -o test.o 2>/dev/null <<EOF
+  #include <cstdint>
+  #include <immintrin.h>
+  int main(int argc, char *argv[]) {
+    (void)argv;
+    return (int)_lzcnt_u64((uint64_t)argc);
+  }
+EOF
+if [ "$?" = 0 ]; then
+  COMMON_FLAGS="$COMMON_FLAGS $TRY_LZCNT -DHAVE_LZCNT"
+elif test "$USE_SSE"; then
+  echo "warning: USE_SSE specified but compiler could not use LZCNT intrinsics, disabling" >&2
+fi
+
+$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null <<EOF
+  #include <cstdint>
+  int main() {
+    uint64_t a = 0xffffFFFFffffFFFF;
+    __uint128_t b = __uint128_t(a) * a;
+    a = static_cast<uint64_t>(b >> 64);
+    (void)a;
+  }
+EOF
+if [ "$?" = 0 ]; then
+  COMMON_FLAGS="$COMMON_FLAGS -DHAVE_UINT128_EXTENSION"
+fi
+
+if [ "$FBCODE_BUILD" != "true" -a "$PLATFORM" = OS_LINUX ]; then
+  $CXX $COMMON_FLAGS $PLATFORM_SHARED_CFLAGS -x c++ -c - -o test_dl.o 2>/dev/null <<EOF
+  void dummy_func() {}
+EOF
+  if [ "$?" = 0 ]; then
+    $CXX $COMMON_FLAGS $PLATFORM_SHARED_LDFLAGS test_dl.o -o test.o 2>/dev/null
+    if [ "$?" = 0 ]; then
+      EXEC_LDFLAGS+="-ldl"
+      rm -f test_dl.o
+    fi
+  fi
+fi
+
+# check for F_FULLFSYNC
+$CXX $PLATFORM_CXXFALGS -x c++ - -o test.o 2>/dev/null  <<EOF
+  #include <fcntl.h>
+  int main() {
+    fcntl(0, F_FULLFSYNC);
+    return 0;
+  }
+EOF
+if [ "$?" = 0 ]; then
+  COMMON_FLAGS="$COMMON_FLAGS -DHAVE_FULLFSYNC"
+fi
+
+rm -f test.o test_dl.o
+
+# Get the path for the folly installation dir
+if [ "$USE_FOLLY" ]; then
+  if [ "$FOLLY_DIR" ]; then
+    FOLLY_PATH=`cd $FOLLY_DIR && $PYTHON build/fbcode_builder/getdeps.py show-inst-dir folly`
+  fi
+fi
+
+PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
+
+VALGRIND_VER="$VALGRIND_VER"
+
+ROCKSDB_MAJOR=`build_tools/version.sh major`
+ROCKSDB_MINOR=`build_tools/version.sh minor`
+ROCKSDB_PATCH=`build_tools/version.sh patch`
+
+echo "CC=$CC" >> "$OUTPUT"
+echo "CXX=$CXX" >> "$OUTPUT"
+echo "AR=$AR" >> "$OUTPUT"
+echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
+echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT"
+echo "PLATFORM_CMAKE_FLAGS=$PLATFORM_CMAKE_FLAGS" >> "$OUTPUT"
+echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT"
+echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT"
+echo "JAVA_STATIC_DEPS_CCFLAGS=$JAVA_STATIC_DEPS_CCFLAGS" >> "$OUTPUT"
+echo "JAVA_STATIC_DEPS_CXXFLAGS=$JAVA_STATIC_DEPS_CXXFLAGS" >> "$OUTPUT"
+echo "JAVA_STATIC_DEPS_LDFLAGS=$JAVA_STATIC_DEPS_LDFLAGS" >> "$OUTPUT"
+echo "JAVAC_ARGS=$JAVAC_ARGS" >> "$OUTPUT"
+echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT"
+echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT"
+echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT"
+echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> "$OUTPUT"
+echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> "$OUTPUT"
+echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> "$OUTPUT"
+echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT"
+echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT"
+echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT"
+echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT"
+echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT"
+echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT"
+echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT"
+echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT"
+echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT"
+echo "PROFILING_FLAGS=$PROFILING_FLAGS" >> "$OUTPUT"
+echo "FIND=$FIND" >> "$OUTPUT"
+echo "WATCH=$WATCH" >> "$OUTPUT"
+echo "FOLLY_PATH=$FOLLY_PATH" >> "$OUTPUT"
+
+# This will enable some related identifiers for the preprocessor
+if test -n "$JEMALLOC"; then
+  echo "JEMALLOC=1" >> "$OUTPUT"
+fi
+# Indicates that jemalloc should be enabled using -ljemalloc flag
+# The alternative is to porvide a direct link to the library via JEMALLOC_LIB
+# and JEMALLOC_INCLUDE
+if test -n "$WITH_JEMALLOC_FLAG"; then
+  echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT"
+fi
+echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT"
+if test -n "$USE_FOLLY"; then
+  echo "USE_FOLLY=$USE_FOLLY" >> "$OUTPUT"
+fi
+if test -n "$PPC_LIBC_IS_GNU"; then
+  echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" >> "$OUTPUT"
+fi
diff --git a/src/rocksdb/build_tools/check-sources.sh b/src/rocksdb/build_tools/check-sources.sh
new file mode 100755
index 000000000..5672f7b2b
--- /dev/null
+++ b/src/rocksdb/build_tools/check-sources.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# Check for some simple mistakes that should prevent commit or push
+
+BAD=""
+
+git grep -n 'namespace rocksdb' -- '*.[ch]*'
+if [ "$?" != "1" ]; then
+  echo "^^^^^ Do not hardcode namespace rocksdb. Use ROCKSDB_NAMESPACE"
+  BAD=1
+fi
+
+git grep -n -i 'nocommit' -- ':!build_tools/check-sources.sh'
+if [ "$?" != "1" ]; then
+  echo "^^^^^ Code was not intended to be committed"
+  BAD=1
+fi
+
+git grep -n 'include <rocksdb/' -- ':!build_tools/check-sources.sh'
+if [ "$?" != "1" ]; then
+  echo '^^^^^ Use double-quotes as in #include "rocksdb/something.h"'
+  BAD=1
+fi
+
+git grep -n 'include "include/rocksdb/' -- ':!build_tools/check-sources.sh'
+if [ "$?" != "1" ]; then
+  echo '^^^^^ Use #include "rocksdb/something.h" instead of #include "include/rocksdb/something.h"'
+  BAD=1
+fi
+
+git grep -n 'using namespace' -- ':!build_tools' ':!docs' \
+    ':!third-party/folly/folly/lang/Align.h' \
+    ':!third-party/gtest-1.8.1/fused-src/gtest/gtest.h'
+if [ "$?" != "1" ]; then
+  echo '^^^^ Do not use "using namespace"'
+  BAD=1
+fi
+
+git grep -n -P "[\x80-\xFF]" -- ':!docs' ':!*.md'
+if [ "$?" != "1" ]; then
+  echo '^^^^ Use only ASCII characters in source files'
+  BAD=1
+fi
+
+if [ "$BAD" ]; then
+  exit 1
+fi
diff --git a/src/rocksdb/build_tools/dependencies_platform009.sh b/src/rocksdb/build_tools/dependencies_platform009.sh
new file mode 100644
index 000000000..ce8dd4e06
--- /dev/null
+++ b/src/rocksdb/build_tools/dependencies_platform009.sh
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+GCC_BASE=/mnt/gvfs/third-party2/gcc/1795efe5f06778c15a92c8f9a2aba5dc496d9d4d/9.x/centos7-native/3bed279
+CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/7318eaac22659b6ff2fe43918e4b69fd0772a8a7/9.0.0/platform009/651ee30
+LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/4959b39cfbe5965a37c861c4c327fa7c5c759b87/9.x/platform009/9202ce7
+GLIBC_BASE=/mnt/gvfs/third-party2/glibc/45ce3375cdc77ecb2520bbf8f0ecddd3f98efd7a/2.30/platform009/f259413
+SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/be4de3205e029101b18aa8103daa696c2bef3b19/1.1.3/platform009/7f3b187
+ZLIB_BASE=/mnt/gvfs/third-party2/zlib/3c160ac5c67e257501e24c6c1d00ad5e01d73db6/1.2.8/platform009/7f3b187
+BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/73a237ac5bc0a5f5d67b39b8d253cfebaab88684/1.0.6/platform009/7f3b187
+LZ4_BASE=/mnt/gvfs/third-party2/lz4/6ca38d3c390be2774d61a300f151464bbd632d62/1.9.1/platform009/7f3b187
+ZSTD_BASE=/mnt/gvfs/third-party2/zstd/64c58a207d2495e83abc57a500a956df09b79a7c/1.4.x/platform009/ba86d1f
+GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/824d0a8a5abb5b121afd1b35fc3896407ea50092/2.2.0/platform009/7f3b187
+JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b62912d333ef33f9760efa6219dbe3fe6abb3b0e/master/platform009/c305944
+NUMA_BASE=/mnt/gvfs/third-party2/numa/0af65f71e23a67bf65dc91b11f95caa39325c432/2.0.11/platform009/7f3b187
+LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/02486dac347645d31dce116f44e1de3177315be2/1.4/platform009/5191652
+TBB_BASE=/mnt/gvfs/third-party2/tbb/2e0ec671e550bfca347300bf3f789d9c0fff24ad/2018_U5/platform009/7f3b187
+LIBURING_BASE=/mnt/gvfs/third-party2/liburing/70dbd9cfee63a25611417d09433a86d7711b3990/20200729/platform009/7f3b187
+KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/32b8a2407b634df3f8f948ba373fc4acc6a18296/fb/platform009/da39a3e
+BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/08634589372fa5f237bfd374e8c644a8364e78c1/2.32/platform009/ba86d1f/
+VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/6ae525939ad02e5e676855082fbbc7828dbafeac/3.15.0/platform009/7f3b187
+LUA_BASE=/mnt/gvfs/third-party2/lua/162efd9561a3d21f6869f4814011e9cf1b3ff4dc/5.3.4/platform009/a6271c4
+BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/30bf49ad6414325e17f3425b0edcb64239427ae3/1.6.1/platform009/7f3b187
+GLOG_BASE=/mnt/gvfs/third-party2/glog/32d751bd5673375b438158717ab6a57c1cc57e3d/0.3.2_fb/platform009/10a364d
diff --git a/src/rocksdb/build_tools/dependencies_platform010.sh b/src/rocksdb/build_tools/dependencies_platform010.sh
new file mode 100644
index 000000000..38bd346d4
--- /dev/null
+++ b/src/rocksdb/build_tools/dependencies_platform010.sh
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# The file is generated using update_dependencies.sh.
+GCC_BASE=/mnt/gvfs/third-party2/gcc/e40bde78650fa91b8405a857e3f10bf336633fb0/11.x/centos7-native/886b5eb
+CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/2043340983c032915adbb6f78903dc855b65aee8/12/platform010/9520e0f
+LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/c00dcc6a3e4125c7e8b248e9a79c14b78ac9e0ca/11.x/platform010/5684a5a
+GLIBC_BASE=/mnt/gvfs/third-party2/glibc/0b9c8e4b060eda62f3bc1c6127bbe1256697569b/2.34/platform010/f259413
+SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/bc9647f7912b131315827d65cb6189c21f381d05/1.1.3/platform010/76ebdda
+ZLIB_BASE=/mnt/gvfs/third-party2/zlib/a6f5f3f1d063d2d00cd02fc12f0f05fc3ab3a994/1.2.11/platform010/76ebdda
+BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/09703139cfc376bd8a82642385a0e97726b28287/1.0.6/platform010/76ebdda
+LZ4_BASE=/mnt/gvfs/third-party2/lz4/60220d6a5bf7722b9cc239a1368c596619b12060/1.9.1/platform010/76ebdda
+ZSTD_BASE=/mnt/gvfs/third-party2/zstd/50eace8143eaaea9473deae1f3283e0049e05633/1.4.x/platform010/64091f4
+GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/5d27e5919771603da06000a027b12f799e58a4f7/2.2.0/platform010/76ebdda
+JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b62912d333ef33f9760efa6219dbe3fe6abb3b0e/master/platform010/f57cc4a
+NUMA_BASE=/mnt/gvfs/third-party2/numa/6b412770957aa3c8a87e5e0dcd8cc2f45f393bc0/2.0.11/platform010/76ebdda
+LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/52f69816e936e147664ad717eb71a1a0e9dc973a/1.4/platform010/5074a48
+TBB_BASE=/mnt/gvfs/third-party2/tbb/c9cc192099fa84c0dcd0ffeedd44a373ad6e4925/2018_U5/platform010/76ebdda
+LIBURING_BASE=/mnt/gvfs/third-party2/liburing/a98e2d137007e3ebf7f33bd6f99c2c56bdaf8488/20210212/platform010/76ebdda
+BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/780c7a0f9cf0967961e69ad08e61cddd85d61821/trunk/platform010/76ebdda
+KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/02d9f76aaaba580611cf75e741753c800c7fdc12/fb/platform010/da39a3e
+BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/938dc3f064ef3a48c0446f5b11d788d50b3eb5ee/2.37/centos7-native/da39a3e
+VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/429a6b3203eb415f1599bd15183659153129188e/3.15.0/platform010/76ebdda
+LUA_BASE=/mnt/gvfs/third-party2/lua/363787fa5cac2a8aa20638909210443278fa138e/5.3.4/platform010/9079c97
diff --git a/src/rocksdb/build_tools/dockerbuild.sh b/src/rocksdb/build_tools/dockerbuild.sh
new file mode 100755
index 000000000..c0caede4a
--- /dev/null
+++ b/src/rocksdb/build_tools/dockerbuild.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+docker run -v $PWD:/rocks -w /rocks buildpack-deps make
diff --git a/src/rocksdb/build_tools/error_filter.py b/src/rocksdb/build_tools/error_filter.py
new file mode 100644
index 000000000..c42df1f91
--- /dev/null
+++ b/src/rocksdb/build_tools/error_filter.py
@@ -0,0 +1,181 @@
+#  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+"""Filter for error messages in test output:
+    - Receives merged stdout/stderr from test on stdin
+    - Finds patterns of known error messages for test name (first argument)
+    - Prints those error messages to stdout
+"""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import re
+import sys
+
+
+class ErrorParserBase(object):
+    def parse_error(self, line):
+        """Parses a line of test output. If it contains an error, returns a
+        formatted message describing the error; otherwise, returns None.
+        Subclasses must override this method.
+        """
+        raise NotImplementedError
+
+
+class GTestErrorParser(ErrorParserBase):
+    """A parser that remembers the last test that began running so it can print
+    that test's name upon detecting failure.
+    """
+
+    _GTEST_NAME_PATTERN = re.compile(r"\[ RUN      \] (\S+)$")
+    # format: '<filename or "unknown file">:<line #>: Failure'
+    _GTEST_FAIL_PATTERN = re.compile(r"(unknown file|\S+:\d+): Failure$")
+
+    def __init__(self):
+        self._last_gtest_name = "Unknown test"
+
+    def parse_error(self, line):
+        gtest_name_match = self._GTEST_NAME_PATTERN.match(line)
+        if gtest_name_match:
+            self._last_gtest_name = gtest_name_match.group(1)
+            return None
+        gtest_fail_match = self._GTEST_FAIL_PATTERN.match(line)
+        if gtest_fail_match:
+            return "%s failed: %s" % (self._last_gtest_name, gtest_fail_match.group(1))
+        return None
+
+
+class MatchErrorParser(ErrorParserBase):
+    """A simple parser that returns the whole line if it matches the pattern."""
+
+    def __init__(self, pattern):
+        self._pattern = re.compile(pattern)
+
+    def parse_error(self, line):
+        if self._pattern.match(line):
+            return line
+        return None
+
+
+class CompilerErrorParser(MatchErrorParser):
+    def __init__(self):
+        # format (compile error):
+        #   '<filename>:<line #>:<column #>: error: <error msg>'
+        # format (link error):
+        #   '<filename>:<line #>: error: <error msg>'
+        # The below regex catches both
+        super(CompilerErrorParser, self).__init__(r"\S+:\d+: error:")
+
+
+class ScanBuildErrorParser(MatchErrorParser):
+    def __init__(self):
+        super(ScanBuildErrorParser, self).__init__(r"scan-build: \d+ bugs found.$")
+
+
+class DbCrashErrorParser(MatchErrorParser):
+    def __init__(self):
+        super(DbCrashErrorParser, self).__init__(r"\*\*\*.*\^$|TEST FAILED.")
+
+
+class WriteStressErrorParser(MatchErrorParser):
+    def __init__(self):
+        super(WriteStressErrorParser, self).__init__(
+            r"ERROR: write_stress died with exitcode=\d+"
+        )
+
+
+class AsanErrorParser(MatchErrorParser):
+    def __init__(self):
+        super(AsanErrorParser, self).__init__(r"==\d+==ERROR: AddressSanitizer:")
+
+
+class UbsanErrorParser(MatchErrorParser):
+    def __init__(self):
+        # format: '<filename>:<line #>:<column #>: runtime error: <error msg>'
+        super(UbsanErrorParser, self).__init__(r"\S+:\d+:\d+: runtime error:")
+
+
+class ValgrindErrorParser(MatchErrorParser):
+    def __init__(self):
+        # just grab the summary, valgrind doesn't clearly distinguish errors
+        # from other log messages.
+        super(ValgrindErrorParser, self).__init__(r"==\d+== ERROR SUMMARY:")
+
+
+class CompatErrorParser(MatchErrorParser):
+    def __init__(self):
+        super(CompatErrorParser, self).__init__(r"==== .*[Ee]rror.* ====$")
+
+
+class TsanErrorParser(MatchErrorParser):
+    def __init__(self):
+        super(TsanErrorParser, self).__init__(r"WARNING: ThreadSanitizer:")
+
+
+_TEST_NAME_TO_PARSERS = {
+    "punit": [CompilerErrorParser, GTestErrorParser],
+    "unit": [CompilerErrorParser, GTestErrorParser],
+    "release": [CompilerErrorParser, GTestErrorParser],
+    "unit_481": [CompilerErrorParser, GTestErrorParser],
+    "release_481": [CompilerErrorParser, GTestErrorParser],
+    "clang_unit": [CompilerErrorParser, GTestErrorParser],
+    "clang_release": [CompilerErrorParser, GTestErrorParser],
+    "clang_analyze": [CompilerErrorParser, ScanBuildErrorParser],
+    "code_cov": [CompilerErrorParser, GTestErrorParser],
+    "unity": [CompilerErrorParser, GTestErrorParser],
+    "lite": [CompilerErrorParser],
+    "lite_test": [CompilerErrorParser, GTestErrorParser],
+    "stress_crash": [CompilerErrorParser, DbCrashErrorParser],
+    "stress_crash_with_atomic_flush": [CompilerErrorParser, DbCrashErrorParser],
+    "stress_crash_with_txn": [CompilerErrorParser, DbCrashErrorParser],
+    "write_stress": [CompilerErrorParser, WriteStressErrorParser],
+    "asan": [CompilerErrorParser, GTestErrorParser, AsanErrorParser],
+    "asan_crash": [CompilerErrorParser, AsanErrorParser, DbCrashErrorParser],
+    "asan_crash_with_atomic_flush": [
+        CompilerErrorParser,
+        AsanErrorParser,
+        DbCrashErrorParser,
+    ],
+    "asan_crash_with_txn": [CompilerErrorParser, AsanErrorParser, DbCrashErrorParser],
+    "ubsan": [CompilerErrorParser, GTestErrorParser, UbsanErrorParser],
+    "ubsan_crash": [CompilerErrorParser, UbsanErrorParser, DbCrashErrorParser],
+    "ubsan_crash_with_atomic_flush": [
+        CompilerErrorParser,
+        UbsanErrorParser,
+        DbCrashErrorParser,
+    ],
+    "ubsan_crash_with_txn": [CompilerErrorParser, UbsanErrorParser, DbCrashErrorParser],
+    "valgrind": [CompilerErrorParser, GTestErrorParser, ValgrindErrorParser],
+    "tsan": [CompilerErrorParser, GTestErrorParser, TsanErrorParser],
+    "format_compatible": [CompilerErrorParser, CompatErrorParser],
+    "run_format_compatible": [CompilerErrorParser, CompatErrorParser],
+    "no_compression": [CompilerErrorParser, GTestErrorParser],
+    "run_no_compression": [CompilerErrorParser, GTestErrorParser],
+    "regression": [CompilerErrorParser],
+    "run_regression": [CompilerErrorParser],
+}
+
+
+def main():
+    if len(sys.argv) != 2:
+        return "Usage: %s <test name>" % sys.argv[0]
+    test_name = sys.argv[1]
+    if test_name not in _TEST_NAME_TO_PARSERS:
+        return "Unknown test name: %s" % test_name
+
+    error_parsers = []
+    for parser_cls in _TEST_NAME_TO_PARSERS[test_name]:
+        error_parsers.append(parser_cls())
+
+    for line in sys.stdin:
+        line = line.strip()
+        for error_parser in error_parsers:
+            error_msg = error_parser.parse_error(line)
+            if error_msg is not None:
+                print(error_msg)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/rocksdb/build_tools/fb_compile_mongo.sh b/src/rocksdb/build_tools/fb_compile_mongo.sh
new file mode 100755
index 000000000..ec733cdf1
--- /dev/null
+++ b/src/rocksdb/build_tools/fb_compile_mongo.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# fail early
+set -e
+
+if test -z $ROCKSDB_PATH; then
+  ROCKSDB_PATH=~/rocksdb
+fi
+source $ROCKSDB_PATH/build_tools/fbcode_config4.8.1.sh
+
+EXTRA_LDFLAGS=""
+
+if test -z $ALLOC; then
+  # default
+  ALLOC=tcmalloc
+elif [[ $ALLOC == "jemalloc" ]]; then
+  ALLOC=system
+  EXTRA_LDFLAGS+=" -Wl,--whole-archive $JEMALLOC_LIB -Wl,--no-whole-archive"
+fi
+
+# we need to force mongo to use static library, not shared
+STATIC_LIB_DEP_DIR='build/static_library_dependencies'
+test -d $STATIC_LIB_DEP_DIR || mkdir $STATIC_LIB_DEP_DIR
+test -h $STATIC_LIB_DEP_DIR/`basename $SNAPPY_LIBS` || ln -s $SNAPPY_LIBS $STATIC_LIB_DEP_DIR
+test -h $STATIC_LIB_DEP_DIR/`basename $LZ4_LIBS` || ln -s $LZ4_LIBS $STATIC_LIB_DEP_DIR
+
+EXTRA_LDFLAGS+=" -L $STATIC_LIB_DEP_DIR"
+
+set -x
+
+EXTRA_CMD=""
+if ! test -e version.json; then
+  # this is Mongo 3.0
+  EXTRA_CMD="--rocksdb \
+    --variant-dir=linux2/norm
+    --cxx=${CXX} \
+    --cc=${CC} \
+    --use-system-zlib"  # add this line back to normal code path
+                        # when https://jira.mongodb.org/browse/SERVER-19123 is resolved
+fi
+
+scons \
+  LINKFLAGS="$EXTRA_LDFLAGS $EXEC_LDFLAGS $PLATFORM_LDFLAGS" \
+  CCFLAGS="$CXXFLAGS -L $STATIC_LIB_DEP_DIR" \
+  LIBS="lz4 gcc stdc++" \
+  LIBPATH="$ROCKSDB_PATH" \
+  CPPPATH="$ROCKSDB_PATH/include" \
+  -j32 \
+  --allocator=$ALLOC \
+  --nostrip \
+  --opt=on \
+  --disable-minimum-compiler-version-enforcement \
+  --use-system-snappy \
+  --disable-warnings-as-errors \
+  $EXTRA_CMD $*
diff --git a/src/rocksdb/build_tools/fbcode_config.sh b/src/rocksdb/build_tools/fbcode_config.sh
new file mode 100644
index 000000000..cf3c355b1
--- /dev/null
+++ b/src/rocksdb/build_tools/fbcode_config.sh
@@ -0,0 +1,175 @@
+#!/bin/sh
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ and clang compilers and also
+# uses jemalloc
+# Environment variables that change the behavior of this script:
+# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included
+
+
+BASEDIR=`dirname $BASH_SOURCE`
+source "$BASEDIR/dependencies.sh"
+
+CFLAGS=""
+
+# libgcc
+LIBGCC_INCLUDE="$LIBGCC_BASE/include"
+LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
+
+# glibc
+GLIBC_INCLUDE="$GLIBC_BASE/include"
+GLIBC_LIBS=" -L $GLIBC_BASE/lib"
+
+if ! test $ROCKSDB_DISABLE_SNAPPY; then
+  # snappy
+  SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
+  if test -z $PIC_BUILD; then
+    SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a"
+  else
+    SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy_pic.a"
+  fi
+  CFLAGS+=" -DSNAPPY"
+fi
+
+if test -z $PIC_BUILD; then
+  if ! test $ROCKSDB_DISABLE_ZLIB; then
+    # location of zlib headers and libraries
+    ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
+    ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a"
+    CFLAGS+=" -DZLIB"
+  fi
+
+  if ! test $ROCKSDB_DISABLE_BZIP; then
+    # location of bzip headers and libraries
+    BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
+    BZIP_LIBS=" $BZIP2_BASE/lib/libbz2.a"
+    CFLAGS+=" -DBZIP2"
+  fi
+
+  if ! test $ROCKSDB_DISABLE_LZ4; then
+    LZ4_INCLUDE=" -I $LZ4_BASE/include/"
+    LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a"
+    CFLAGS+=" -DLZ4"
+  fi
+fi
+
+if ! test $ROCKSDB_DISABLE_ZSTD; then
+  ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
+  if test -z $PIC_BUILD; then
+    ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a"
+  else
+    ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd_pic.a"
+  fi
+  CFLAGS+=" -DZSTD -DZSTD_STATIC_LINKING_ONLY"
+fi
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
+if test -z $PIC_BUILD; then
+  GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a"
+else
+  GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags_pic.a"
+fi
+CFLAGS+=" -DGFLAGS=gflags"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
+JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc.a"
+
+if test -z $PIC_BUILD; then
+  # location of numa
+  NUMA_INCLUDE=" -I $NUMA_BASE/include/"
+  NUMA_LIB=" $NUMA_BASE/lib/libnuma.a"
+  CFLAGS+=" -DNUMA"
+
+  # location of libunwind
+  LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a"
+fi
+
+# location of TBB
+TBB_INCLUDE=" -isystem $TBB_BASE/include/"
+if test -z $PIC_BUILD; then
+  TBB_LIBS="$TBB_BASE/lib/libtbb.a"
+else
+  TBB_LIBS="$TBB_BASE/lib/libtbb_pic.a"
+fi
+CFLAGS+=" -DTBB"
+
+test "$USE_SSE" || USE_SSE=1
+export USE_SSE
+test "$PORTABLE" || PORTABLE=1
+export PORTABLE
+
+BINUTILS="$BINUTILS_BASE/bin"
+AR="$BINUTILS/ar"
+
+DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE"
+
+STDLIBS="-L $GCC_BASE/lib64"
+
+CLANG_BIN="$CLANG_BASE/bin"
+CLANG_LIB="$CLANG_BASE/lib"
+CLANG_SRC="$CLANG_BASE/../../src"
+
+CLANG_ANALYZER="$CLANG_BIN/clang++"
+CLANG_SCAN_BUILD="$CLANG_SRC/llvm/tools/clang/tools/scan-build/bin/scan-build"
+
+if [ -z "$USE_CLANG" ]; then
+  # gcc
+  CC="$GCC_BASE/bin/gcc"
+  CXX="$GCC_BASE/bin/g++"
+  AR="$GCC_BASE/bin/gcc-ar"
+
+  CFLAGS+=" -B$BINUTILS/gold"
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  JEMALLOC=1
+else
+  # clang
+  CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
+  CC="$CLANG_BIN/clang"
+  CXX="$CLANG_BIN/clang++"
+  AR="$CLANG_BIN/llvm-ar"
+
+  KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
+
+  CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib"
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/5.x "
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/5.x/x86_64-facebook-linux "
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $CLANG_INCLUDE"
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
+  CFLAGS+=" -Wno-expansion-to-defined "
+  CXXFLAGS="-nostdinc++"
+fi
+
+CFLAGS+=" $DEPS_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42"
+CXXFLAGS+=" $CFLAGS"
+
+EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"
+EXEC_LDFLAGS+=" -B$BINUTILS/gold"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-5-glibc-2.23/lib/ld.so"
+EXEC_LDFLAGS+=" $LIBUNWIND"
+EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/gcc-5-glibc-2.23/lib"
+# required by libtbb
+EXEC_LDFLAGS+=" -ldl"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS"
+
+VALGRIND_VER="$VALGRIND_BASE/bin/"
+
+LUA_PATH="$LUA_BASE"
+
+if test -z $PIC_BUILD; then
+  LUA_LIB=" $LUA_PATH/lib/liblua.a"
+else
+  LUA_LIB=" $LUA_PATH/lib/liblua_pic.a"
+fi
+
+export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB
diff --git a/src/rocksdb/build_tools/fbcode_config_platform009.sh b/src/rocksdb/build_tools/fbcode_config_platform009.sh
new file mode 100644
index 000000000..8c8ba092c
--- /dev/null
+++ b/src/rocksdb/build_tools/fbcode_config_platform009.sh
@@ -0,0 +1,170 @@
+#!/bin/sh
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ and clang compilers and also
+# uses jemalloc
+# Environment variables that change the behavior of this script:
+# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included
+
+
+BASEDIR=`dirname $BASH_SOURCE`
+source "$BASEDIR/dependencies_platform009.sh"
+
+CFLAGS=""
+
+# libgcc
+LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/9.3.0 -I $LIBGCC_BASE/include/c++/9.3.0/backward"
+LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
+
+# glibc
+GLIBC_INCLUDE="$GLIBC_BASE/include"
+GLIBC_LIBS=" -L $GLIBC_BASE/lib"
+
+if test -z $PIC_BUILD; then
+  MAYBE_PIC=
+else
+  MAYBE_PIC=_pic
+fi
+
+if ! test $ROCKSDB_DISABLE_SNAPPY; then
+  # snappy
+  SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
+  SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy${MAYBE_PIC}.a"
+  CFLAGS+=" -DSNAPPY"
+fi
+
+if ! test $ROCKSDB_DISABLE_ZLIB; then
+  # location of zlib headers and libraries
+  ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
+  ZLIB_LIBS=" $ZLIB_BASE/lib/libz${MAYBE_PIC}.a"
+  CFLAGS+=" -DZLIB"
+fi
+
+if ! test $ROCKSDB_DISABLE_BZIP; then
+  # location of bzip headers and libraries
+  BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
+  BZIP_LIBS=" $BZIP2_BASE/lib/libbz2${MAYBE_PIC}.a"
+  CFLAGS+=" -DBZIP2"
+fi
+
+if ! test $ROCKSDB_DISABLE_LZ4; then
+  LZ4_INCLUDE=" -I $LZ4_BASE/include/"
+  LZ4_LIBS=" $LZ4_BASE/lib/liblz4${MAYBE_PIC}.a"
+  CFLAGS+=" -DLZ4"
+fi
+
+if ! test $ROCKSDB_DISABLE_ZSTD; then
+  ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
+  ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd${MAYBE_PIC}.a"
+  CFLAGS+=" -DZSTD"
+fi
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
+GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags${MAYBE_PIC}.a"
+CFLAGS+=" -DGFLAGS=gflags"
+
+BENCHMARK_INCLUDE=" -I $BENCHMARK_BASE/include/"
+BENCHMARK_LIBS=" $BENCHMARK_BASE/lib/libbenchmark${MAYBE_PIC}.a"
+
+GLOG_INCLUDE=" -I $GLOG_BASE/include/"
+GLOG_LIBS=" $GLOG_BASE/lib/libglog${MAYBE_PIC}.a"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
+JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc${MAYBE_PIC}.a"
+
+# location of numa
+NUMA_INCLUDE=" -I $NUMA_BASE/include/"
+NUMA_LIB=" $NUMA_BASE/lib/libnuma${MAYBE_PIC}.a"
+CFLAGS+=" -DNUMA"
+
+# location of libunwind
+LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind${MAYBE_PIC}.a"
+
+# location of TBB
+TBB_INCLUDE=" -isystem $TBB_BASE/include/"
+TBB_LIBS="$TBB_BASE/lib/libtbb${MAYBE_PIC}.a"
+CFLAGS+=" -DTBB"
+
+# location of LIBURING
+LIBURING_INCLUDE=" -isystem $LIBURING_BASE/include/"
+LIBURING_LIBS="$LIBURING_BASE/lib/liburing${MAYBE_PIC}.a"
+CFLAGS+=" -DLIBURING"
+
+test "$USE_SSE" || USE_SSE=1
+export USE_SSE
+test "$PORTABLE" || PORTABLE=1
+export PORTABLE
+
+BINUTILS="$BINUTILS_BASE/bin"
+AR="$BINUTILS/ar"
+AS="$BINUTILS/as"
+
+DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE $LIBURING_INCLUDE $BENCHMARK_INCLUDE $GLOG_INCLUDE"
+
+STDLIBS="-L $GCC_BASE/lib64"
+
+CLANG_BIN="$CLANG_BASE/bin"
+CLANG_LIB="$CLANG_BASE/lib"
+CLANG_SRC="$CLANG_BASE/../../src"
+
+CLANG_ANALYZER="$CLANG_BIN/clang++"
+CLANG_SCAN_BUILD="$CLANG_SRC/llvm/clang/tools/scan-build/bin/scan-build"
+
+if [ -z "$USE_CLANG" ]; then
+  # gcc
+  CC="$GCC_BASE/bin/gcc"
+  CXX="$GCC_BASE/bin/g++"
+  AR="$GCC_BASE/bin/gcc-ar"
+
+  CFLAGS+=" -B$BINUTILS"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  JEMALLOC=1
+else
+  # clang
+  CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
+  CC="$CLANG_BIN/clang"
+  CXX="$CLANG_BIN/clang++"
+  AR="$CLANG_BIN/llvm-ar"
+
+  KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
+
+  CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib"
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x "
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x/x86_64-facebook-linux "
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $CLANG_INCLUDE"
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
+  CFLAGS+=" -Wno-expansion-to-defined "
+  CXXFLAGS="-nostdinc++"
+fi
+
+CFLAGS+=" $DEPS_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
+CXXFLAGS+=" $CFLAGS"
+
+EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform009/lib/ld.so"
+EXEC_LDFLAGS+=" $LIBUNWIND"
+EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform009/lib"
+EXEC_LDFLAGS+=" -Wl,-rpath=$GCC_BASE/lib64"
+# required by libtbb
+EXEC_LDFLAGS+=" -ldl"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
+PLATFORM_LDFLAGS+=" -B$BINUTILS"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
+
+VALGRIND_VER="$VALGRIND_BASE/bin/"
+
+# lua not supported because it's on track for deprecation, I think
+LUA_PATH=
+LUA_LIB=
+
+export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB
diff --git a/src/rocksdb/build_tools/fbcode_config_platform010.sh b/src/rocksdb/build_tools/fbcode_config_platform010.sh
new file mode 100644
index 000000000..babe92c41
--- /dev/null
+++ b/src/rocksdb/build_tools/fbcode_config_platform010.sh
@@ -0,0 +1,175 @@
+#!/bin/sh
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ and clang compilers and also
+# uses jemalloc
+# Environment variables that change the behavior of this script:
+# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included
+
+
+BASEDIR=`dirname $BASH_SOURCE`
+source "$BASEDIR/dependencies_platform010.sh"
+
+# Disallow using libraries from default locations as they might not be compatible with platform010 libraries.
+CFLAGS=" --sysroot=/DOES/NOT/EXIST"
+
+# libgcc
+LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/trunk"
+LIBGCC_LIBS=" -L $LIBGCC_BASE/lib -B$LIBGCC_BASE/lib/gcc/x86_64-facebook-linux/trunk/"
+
+# glibc
+GLIBC_INCLUDE="$GLIBC_BASE/include"
+GLIBC_LIBS=" -L $GLIBC_BASE/lib"
+GLIBC_LIBS+=" -B$GLIBC_BASE/lib"
+
+if test -z $PIC_BUILD; then
+  MAYBE_PIC=
+else
+  MAYBE_PIC=_pic
+fi
+
+if ! test $ROCKSDB_DISABLE_SNAPPY; then
+  # snappy
+  SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
+  SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy${MAYBE_PIC}.a"
+  CFLAGS+=" -DSNAPPY"
+fi
+
+if ! test $ROCKSDB_DISABLE_ZLIB; then
+  # location of zlib headers and libraries
+  ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
+  ZLIB_LIBS=" $ZLIB_BASE/lib/libz${MAYBE_PIC}.a"
+  CFLAGS+=" -DZLIB"
+fi
+
+if ! test $ROCKSDB_DISABLE_BZIP; then
+  # location of bzip headers and libraries
+  BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
+  BZIP_LIBS=" $BZIP2_BASE/lib/libbz2${MAYBE_PIC}.a"
+  CFLAGS+=" -DBZIP2"
+fi
+
+if ! test $ROCKSDB_DISABLE_LZ4; then
+  LZ4_INCLUDE=" -I $LZ4_BASE/include/"
+  LZ4_LIBS=" $LZ4_BASE/lib/liblz4${MAYBE_PIC}.a"
+  CFLAGS+=" -DLZ4"
+fi
+
+if ! test $ROCKSDB_DISABLE_ZSTD; then
+  ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
+  ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd${MAYBE_PIC}.a"
+  CFLAGS+=" -DZSTD"
+fi
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
+GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags${MAYBE_PIC}.a"
+CFLAGS+=" -DGFLAGS=gflags"
+
+BENCHMARK_INCLUDE=" -I $BENCHMARK_BASE/include/"
+BENCHMARK_LIBS=" $BENCHMARK_BASE/lib/libbenchmark${MAYBE_PIC}.a"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
+JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc${MAYBE_PIC}.a"
+
+# location of numa
+NUMA_INCLUDE=" -I $NUMA_BASE/include/"
+NUMA_LIB=" $NUMA_BASE/lib/libnuma${MAYBE_PIC}.a"
+CFLAGS+=" -DNUMA"
+
+# location of libunwind
+LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind${MAYBE_PIC}.a"
+
+# location of TBB
+TBB_INCLUDE=" -isystem $TBB_BASE/include/"
+TBB_LIBS="$TBB_BASE/lib/libtbb${MAYBE_PIC}.a"
+CFLAGS+=" -DTBB"
+
+# location of LIBURING
+LIBURING_INCLUDE=" -isystem $LIBURING_BASE/include/"
+LIBURING_LIBS="$LIBURING_BASE/lib/liburing${MAYBE_PIC}.a"
+CFLAGS+=" -DLIBURING"
+
+test "$USE_SSE" || USE_SSE=1
+export USE_SSE
+test "$PORTABLE" || PORTABLE=1
+export PORTABLE
+
+BINUTILS="$BINUTILS_BASE/bin"
+AR="$BINUTILS/ar"
+AS="$BINUTILS/as"
+
+DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE $LIBURING_INCLUDE $BENCHMARK_INCLUDE"
+
+STDLIBS="-L $GCC_BASE/lib64"
+
+CLANG_BIN="$CLANG_BASE/bin"
+CLANG_LIB="$CLANG_BASE/lib"
+CLANG_SRC="$CLANG_BASE/../../src"
+
+CLANG_ANALYZER="$CLANG_BIN/clang++"
+CLANG_SCAN_BUILD="$CLANG_SRC/llvm/clang/tools/scan-build/bin/scan-build"
+
+if [ -z "$USE_CLANG" ]; then
+  # gcc
+  CC="$GCC_BASE/bin/gcc"
+  CXX="$GCC_BASE/bin/g++"
+  AR="$GCC_BASE/bin/gcc-ar"
+
+  CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib"
+  CFLAGS+=" -I$GCC_BASE/include"
+  CFLAGS+=" -isystem $GCC_BASE/lib/gcc/x86_64-redhat-linux-gnu/11.2.1/include"
+  CFLAGS+=" -isystem $GCC_BASE/lib/gcc/x86_64-redhat-linux-gnu/11.2.1/install-tools/include"
+  CFLAGS+=" -isystem $GCC_BASE/lib/gcc/x86_64-redhat-linux-gnu/11.2.1/include-fixed/"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -I$GLIBC_INCLUDE"
+  CFLAGS+=" -I$LIBGCC_BASE/include"
+  CFLAGS+=" -I$LIBGCC_BASE/include/c++/11.x/"
+  CFLAGS+=" -I$LIBGCC_BASE/include/c++/11.x/x86_64-facebook-linux/"
+  CFLAGS+=" -I$LIBGCC_BASE/include/c++/11.x/backward"
+  CFLAGS+=" -isystem $GLIBC_INCLUDE -I$GLIBC_INCLUDE"
+  JEMALLOC=1
+else
+  # clang
+  CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
+  CC="$CLANG_BIN/clang"
+  CXX="$CLANG_BIN/clang++"
+  AR="$CLANG_BIN/llvm-ar"
+
+  CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib"
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/trunk "
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/trunk/x86_64-facebook-linux "
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $CLANG_INCLUDE"
+  CFLAGS+=" -Wno-expansion-to-defined "
+  CXXFLAGS="-nostdinc++"
+fi
+
+KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
+CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
+CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
+
+CFLAGS+=" $DEPS_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
+CXXFLAGS+=" $CFLAGS"
+
+EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform010/lib/ld.so"
+EXEC_LDFLAGS+=" $LIBUNWIND"
+EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform010/lib"
+EXEC_LDFLAGS+=" -Wl,-rpath=$GCC_BASE/lib64"
+# required by libtbb
+EXEC_LDFLAGS+=" -ldl"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
+PLATFORM_LDFLAGS+=" -B$BINUTILS"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
+
+VALGRIND_VER="$VALGRIND_BASE/bin/"
+
+export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB
diff --git a/src/rocksdb/build_tools/format-diff.sh b/src/rocksdb/build_tools/format-diff.sh
new file mode 100755
index 000000000..62e8834f7
--- /dev/null
+++ b/src/rocksdb/build_tools/format-diff.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# If clang_format_diff.py command is not specfied, we assume we are able to
+# access directly without any path.
+
+print_usage () {
+  echo "Usage:"
+  echo "format-diff.sh [OPTIONS]"
+  echo "-c: check only."
+  echo "-h: print this message."
+}
+
+while getopts ':ch' OPTION; do
+  case "$OPTION" in
+    c)
+      CHECK_ONLY=1
+      ;;
+    h)
+      print_usage
+      exit 1
+      ;;
+    ?)
+      print_usage
+      exit 1
+      ;;
+  esac
+done
+
+REPO_ROOT="$(git rev-parse --show-toplevel)"
+
+if [ "$CLANG_FORMAT_DIFF" ]; then
+  echo "Note: CLANG_FORMAT_DIFF='$CLANG_FORMAT_DIFF'"
+  # Dry run to confirm dependencies like argparse
+  if $CLANG_FORMAT_DIFF --help >/dev/null < /dev/null; then
+    true #Good
+  else
+    exit 128
+  fi
+else
+  # First try directly executing the possibilities
+  if clang-format-diff --help &> /dev/null < /dev/null; then
+    CLANG_FORMAT_DIFF=clang-format-diff
+  elif clang-format-diff.py --help &> /dev/null < /dev/null; then
+    CLANG_FORMAT_DIFF=clang-format-diff.py
+  elif $REPO_ROOT/clang-format-diff.py --help &> /dev/null < /dev/null; then
+    CLANG_FORMAT_DIFF=$REPO_ROOT/clang-format-diff.py
+  else
+    # This probably means we need to directly invoke the interpreter.
+    # But first find clang-format-diff.py
+    if [ -f "$REPO_ROOT/clang-format-diff.py" ]; then
+      CFD_PATH="$REPO_ROOT/clang-format-diff.py"
+    elif which clang-format-diff.py &> /dev/null; then
+      CFD_PATH="$(which clang-format-diff.py)"
+    else
+      echo "You didn't have clang-format-diff.py and/or clang-format available in your computer!"
+      echo "You can download clang-format-diff.py by running: "
+      echo "    curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py"
+      echo "You should make sure the downloaded script is not compromised."
+      echo "You can download clang-format by running:"
+      echo "    brew install clang-format"
+      echo "  Or"
+      echo "    apt install clang-format"
+      echo "  This might work too:"
+      echo "    yum install git-clang-format"
+      echo "Then make sure clang-format is available and executable from \$PATH:"
+      echo "    clang-format --version"
+      exit 128
+    fi
+    # Check argparse pre-req on interpreter, or it will fail
+    if echo import argparse | ${PYTHON:-python3}; then
+      true # Good
+    else
+      echo "To run clang-format-diff.py, we'll need the library "argparse" to be"
+      echo "installed. You can try either of the follow ways to install it:"
+      echo "  1. Manually download argparse: https://pypi.python.org/pypi/argparse"
+      echo "  2. easy_install argparse (if you have easy_install)"
+      echo "  3. pip install argparse (if you have pip)"
+      exit 129
+    fi
+    # Unfortunately, some machines have a Python2 clang-format-diff.py
+    # installed but only a Python3 interpreter installed. Unfortunately,
+    # automatic 2to3 migration is insufficient, so suggest downloading latest.
+    if grep -q "print '" "$CFD_PATH" && \
+       ${PYTHON:-python3} --version | grep -q 'ython 3'; then
+      echo "You have clang-format-diff.py for Python 2 but are using a Python 3"
+      echo "interpreter (${PYTHON:-python3})."
+      echo "You can download clang-format-diff.py for Python 3 by running: "
+      echo "    curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py"
+      echo "You should make sure the downloaded script is not compromised."
+      exit 130
+    fi
+    CLANG_FORMAT_DIFF="${PYTHON:-python3} $CFD_PATH"
+    # This had better work after all those checks
+    if $CLANG_FORMAT_DIFF --help >/dev/null < /dev/null; then
+      true #Good
+    else
+      exit 128
+    fi
+  fi
+fi
+
+# TODO(kailiu) following work is not complete since we still need to figure
+# out how to add the modified files done pre-commit hook to git's commit index.
+#
+# Check if this script has already been added to pre-commit hook.
+# Will suggest user to add this script to pre-commit hook if their pre-commit
+# is empty.
+# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
+# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
+# then
+#   echo "Would you like to add this script to pre-commit hook, which will do "
+#   echo -n "the format check for all the affected lines before you check in (y/n):"
+#   read add_to_hook
+#   if [ "$add_to_hook" == "y" ]
+#   then
+#     ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
+#   fi
+# fi
+set -e
+
+uncommitted_code=`git diff HEAD`
+
+# If there's no uncommitted changes, we assume user are doing post-commit
+# format check, in which case we'll try to check the modified lines vs. the
+# facebook/rocksdb.git main branch. Otherwise, we'll check format of the
+# uncommitted code only.
+if [ -z "$uncommitted_code" ]
+then
+  # Attempt to get name of facebook/rocksdb.git remote.
+  [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(LC_ALL=POSIX LANG=POSIX git remote -v | grep 'facebook/rocksdb.git' | head -n 1 | cut -f 1)"
+  # Fall back on 'origin' if that fails
+  [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE=origin
+  # Use main branch from that remote
+  [ "$FORMAT_UPSTREAM" ] || FORMAT_UPSTREAM="$FORMAT_REMOTE/$(LC_ALL=POSIX LANG=POSIX git remote show $FORMAT_REMOTE | sed -n '/HEAD branch/s/.*: //p')"
+  # Get the common ancestor with that remote branch. Everything after that
+  # common ancestor would be considered the contents of a pull request, so
+  # should be relevant for formatting fixes.
+  FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)"
+  # Get the differences
+  diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1)
+  echo "Checking format of changes not yet in $FORMAT_UPSTREAM..."
+else
+  # Check the format of uncommitted lines,
+  diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1)
+  echo "Checking format of uncommitted changes..."
+fi
+
+if [ -z "$diffs" ]
+then
+  echo "Nothing needs to be reformatted!"
+  exit 0
+elif [ $CHECK_ONLY ]
+then
+  echo "Your change has unformatted code. Please run make format!"
+  if [ $VERBOSE_CHECK ]; then
+    clang-format --version
+    echo "$diffs"
+  fi
+  exit 1
+fi
+
+# Highlight the insertion/deletion from the clang-format-diff.py's output
+COLOR_END="\033[0m"
+COLOR_RED="\033[0;31m"
+COLOR_GREEN="\033[0;32m"
+
+echo -e "Detect lines that doesn't follow the format rules:\r"
+# Add the color to the diff. lines added will be green; lines removed will be red.
+echo "$diffs" |
+  sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
+  sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
+
+echo -e "Would you like to fix the format automatically (y/n): \c"
+
+# Make sure under any mode, we can read user input.
+exec < /dev/tty
+read to_fix
+
+if [ "$to_fix" != "y" ]
+then
+  exit 1
+fi
+
+# Do in-place format adjustment.
+if [ -z "$uncommitted_code" ]
+then
+  git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -i -p 1
+else
+  git diff -U0 HEAD | $CLANG_FORMAT_DIFF -i -p 1
+fi
+echo "Files reformatted!"
+
+# Amend to last commit if user do the post-commit format check
+if [ -z "$uncommitted_code" ]; then
+  echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
+  read to_amend
+
+  if [ "$to_amend" == "y" ]
+  then
+    git commit -a --amend --reuse-message HEAD
+    echo "Amended to last commit"
+  fi
+fi
diff --git a/src/rocksdb/build_tools/gnu_parallel b/src/rocksdb/build_tools/gnu_parallel
new file mode 100755
index 000000000..3365f46ba
--- /dev/null
+++ b/src/rocksdb/build_tools/gnu_parallel
@@ -0,0 +1,7971 @@
+#!/usr/bin/env perl
+
+# Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014 Ole Tange and
+# Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>
+# or write to the Free Software Foundation, Inc., 51 Franklin St,
+# Fifth Floor, Boston, MA 02110-1301 USA
+
+# open3 used in Job::start
+use IPC::Open3;
+# &WNOHANG used in reaper
+use POSIX qw(:sys_wait_h setsid ceil :errno_h);
+# gensym used in Job::start
+use Symbol qw(gensym);
+# tempfile used in Job::start
+use File::Temp qw(tempfile tempdir);
+# mkpath used in openresultsfile
+use File::Path;
+# GetOptions used in get_options_from_array
+use Getopt::Long;
+# Used to ensure code quality
+use strict;
+use File::Basename;
+
+if(not $ENV{HOME}) {
+    # $ENV{HOME} is sometimes not set if called from PHP
+    ::warning("\$HOME not set. Using /tmp\n");
+    $ENV{HOME} = "/tmp";
+}
+
+save_stdin_stdout_stderr();
+save_original_signal_handler();
+parse_options();
+::debug("init", "Open file descriptors: ", join(" ",keys %Global::fd), "\n");
+my $number_of_args;
+if($Global::max_number_of_args) {
+    $number_of_args=$Global::max_number_of_args;
+} elsif ($opt::X or $opt::m or $opt::xargs) {
+    $number_of_args = undef;
+} else {
+    $number_of_args = 1;
+}
+
+my @command;
+@command = @ARGV;
+
+my @fhlist;
+if($opt::pipepart) {
+    @fhlist = map { open_or_exit($_) } "/dev/null";
+} else {
+    @fhlist = map { open_or_exit($_) } @opt::a;
+    if(not @fhlist and not $opt::pipe) {
+	@fhlist = (*STDIN);
+    }
+}
+
+if($opt::skip_first_line) {
+    # Skip the first line for the first file handle
+    my $fh = $fhlist[0];
+    <$fh>;
+}
+if($opt::header and not $opt::pipe) {
+    my $fh = $fhlist[0];
+    # split with colsep or \t
+    # $header force $colsep = \t if undef?
+    my $delimiter = $opt::colsep;
+    $delimiter ||= "\$";
+    my $id = 1;
+    for my $fh (@fhlist) {
+	my $line = <$fh>;
+	chomp($line);
+	::debug("init", "Delimiter: '$delimiter'");
+	for my $s (split /$delimiter/o, $line) {
+	    ::debug("init", "Colname: '$s'");
+	    # Replace {colname} with {2}
+	    # TODO accept configurable short hands
+	    # TODO how to deal with headers in {=...=}
+	    for(@command) {
+	      s:\{$s(|/|//|\.|/\.)\}:\{$id$1\}:g;
+	    }
+	    $Global::input_source_header{$id} = $s;
+	    $id++;
+	}
+    }
+} else {
+    my $id = 1;
+    for my $fh (@fhlist) {
+	$Global::input_source_header{$id} = $id;
+	$id++;
+    }
+}
+
+if($opt::filter_hosts and (@opt::sshlogin or @opt::sshloginfile)) {
+    # Parallel check all hosts are up. Remove hosts that are down
+    filter_hosts();
+}
+
+if($opt::nonall or $opt::onall) {
+    onall(@command);
+    wait_and_exit(min(undef_as_zero($Global::exitstatus),254));
+}
+
+# TODO --transfer foo/./bar --cleanup
+# multiple --transfer and --basefile with different /./
+
+$Global::JobQueue = JobQueue->new(
+    \@command,\@fhlist,$Global::ContextReplace,$number_of_args,\@Global::ret_files);
+
+if($opt::eta or $opt::bar) {
+    # Count the number of jobs before starting any
+    $Global::JobQueue->total_jobs();
+}
+if($opt::pipepart) {
+    @Global::cat_partials = map { pipe_part_files($_) } @opt::a;
+    # Unget the command as many times as there are parts
+    $Global::JobQueue->{'commandlinequeue'}->unget(
+	map { $Global::JobQueue->{'commandlinequeue'}->get() } @Global::cat_partials
+	);
+}
+for my $sshlogin (values %Global::host) {
+    $sshlogin->max_jobs_running();
+}
+
+init_run_jobs();
+my $sem;
+if($Global::semaphore) {
+    $sem = acquire_semaphore();
+}
+$SIG{TERM} = \&start_no_new_jobs;
+
+start_more_jobs();
+if(not $opt::pipepart) {
+    if($opt::pipe) {
+	spreadstdin();
+    }
+}
+::debug("init", "Start draining\n");
+drain_job_queue();
+::debug("init", "Done draining\n");
+reaper();
+::debug("init", "Done reaping\n");
+if($opt::pipe and @opt::a) {
+    for my $job (@Global::tee_jobs) {
+	unlink $job->fh(2,"name");
+	$job->set_fh(2,"name","");
+	$job->print();
+	unlink $job->fh(1,"name");
+    }
+}
+::debug("init", "Cleaning\n");
+cleanup();
+if($Global::semaphore) {
+    $sem->release();
+}
+for(keys %Global::sshmaster) {
+    kill "TERM", $_;
+}
+::debug("init", "Halt\n");
+if($opt::halt_on_error) {
+    wait_and_exit($Global::halt_on_error_exitstatus);
+} else {
+    wait_and_exit(min(undef_as_zero($Global::exitstatus),254));
+}
+
+sub __PIPE_MODE__ {}
+
+sub pipe_part_files {
+    # Input:
+    #   $file = the file to read
+    # Returns:
+    #   @commands that will cat_partial each part
+    my ($file) = @_;
+    my $buf = "";
+    my $header = find_header(\$buf,open_or_exit($file));
+    # find positions
+    my @pos = find_split_positions($file,$opt::blocksize,length $header);
+    # Make @cat_partials
+    my @cat_partials = ();
+    for(my $i=0; $i<$#pos; $i++) {
+	push @cat_partials, cat_partial($file, 0, length($header), $pos[$i], $pos[$i+1]);
+    }
+    # Remote exec should look like:
+    #  ssh -oLogLevel=quiet lo  'eval `echo $SHELL | grep "/t\{0,1\}csh" > /dev/null  && echo setenv PARALLEL_SEQ '$PARALLEL_SEQ'\;  setenv PARALLEL_PID '$PARALLEL_PID'  || echo PARALLEL_SEQ='$PARALLEL_SEQ'\;export PARALLEL_SEQ\;  PARALLEL_PID='$PARALLEL_PID'\;export PARALLEL_PID` ;'  tty\ \>/dev/null\ \&\&\ stty\ isig\ -onlcr\ -echo\;echo\ \$SHELL\ \|\ grep\ \"/t\\\{0,1\\\}csh\"\ \>\ /dev/null\ \&\&\ setenv\ FOO\ /tmp/foo\ \|\|\ export\ FOO=/tmp/foo\; \(wc\ -\ \$FOO\)
+    # ssh -tt not allowed. Remote will die due to broken pipe anyway.
+    # TODO test remote with --fifo / --cat
+    return @cat_partials;
+}
+
+sub find_header {
+    # Input:
+    #   $buf_ref = reference to read-in buffer
+    #   $fh = filehandle to read from
+    # Uses:
+    #   $opt::header
+    #   $opt::blocksize
+    # Returns:
+    #   $header string
+    my ($buf_ref, $fh) = @_;
+    my $header = "";
+    if($opt::header) {
+	if($opt::header eq ":") { $opt::header = "(.*\n)"; }
+	# Number = number of lines
+	$opt::header =~ s/^(\d+)$/"(.*\n)"x$1/e;
+	while(read($fh,substr($$buf_ref,length $$buf_ref,0),$opt::blocksize)) {
+	    if($$buf_ref=~s/^($opt::header)//) {
+		$header = $1;
+		last;
+	    }
+	}
+    }
+    return $header;
+}
+
+sub find_split_positions {
+    # Input:
+    #   $file = the file to read
+    #   $block = (minimal) --block-size of each chunk
+    #   $headerlen = length of header to be skipped
+    # Uses:
+    #   $opt::recstart
+    #   $opt::recend
+    # Returns:
+    #   @positions of block start/end
+    my($file, $block, $headerlen) = @_;
+    my $size = -s $file;
+    $block = int $block;
+    # The optimal dd blocksize for mint, redhat, solaris, openbsd = 2^17..2^20
+    # The optimal dd blocksize for freebsd = 2^15..2^17
+    my $dd_block_size = 131072; # 2^17
+    my @pos;
+    my ($recstart,$recend) = recstartrecend();
+    my $recendrecstart = $recend.$recstart;
+    my $fh = ::open_or_exit($file);
+    push(@pos,$headerlen);
+    for(my $pos = $block+$headerlen; $pos < $size; $pos += $block) {
+	my $buf;
+	seek($fh, $pos, 0) || die;
+	while(read($fh,substr($buf,length $buf,0),$dd_block_size)) {
+	    if($opt::regexp) {
+		# If match /$recend$recstart/ => Record position
+		if($buf =~ /(.*$recend)$recstart/os) {
+		    my $i = length($1);
+		    push(@pos,$pos+$i);
+		    # Start looking for next record _after_ this match
+		    $pos += $i;
+		    last;
+		}
+	    } else {
+		# If match $recend$recstart => Record position
+		my $i = index($buf,$recendrecstart);
+		if($i != -1) {
+		    push(@pos,$pos+$i);
+		    # Start looking for next record _after_ this match
+		    $pos += $i;
+		    last;
+		}
+	    }
+	}
+    }
+    push(@pos,$size);
+    close $fh;
+    return @pos;
+}
+
+sub cat_partial {
+    # Input:
+    #   $file = the file to read
+    #   ($start, $end, [$start2, $end2, ...]) = start byte, end byte
+    # Returns:
+    #   Efficient perl command to copy $start..$end, $start2..$end2, ... to stdout
+    my($file, @start_end) = @_;
+    my($start, $i);
+    # Convert start_end to start_len
+    my @start_len = map { if(++$i % 2) { $start = $_; } else { $_-$start } } @start_end;
+    return "<". shell_quote_scalar($file) .
+	q{ perl -e 'while(@ARGV) { sysseek(STDIN,shift,0) || die; $left = shift; while($read = sysread(STDIN,$buf, ($left > 32768 ? 32768 : $left))){ $left -= $read; syswrite(STDOUT,$buf); } }' } .
+	" @start_len";
+}
+
+sub spreadstdin {
+    # read a record
+    # Spawn a job and print the record to it.
+    # Uses:
+    #   $opt::blocksize
+    #   STDIN
+    #   $opr::r
+    #   $Global::max_lines
+    #   $Global::max_number_of_args
+    #   $opt::regexp
+    #   $Global::start_no_new_jobs
+    #   $opt::roundrobin
+    #   %Global::running
+
+    my $buf = "";
+    my ($recstart,$recend) = recstartrecend();
+    my $recendrecstart = $recend.$recstart;
+    my $chunk_number = 1;
+    my $one_time_through;
+    my $blocksize = $opt::blocksize;
+    my $in = *STDIN;
+    my $header = find_header(\$buf,$in);
+    while(1) {
+      my $anything_written = 0;
+      if(not read($in,substr($buf,length $buf,0),$blocksize)) {
+	  # End-of-file
+	  $chunk_number != 1 and last;
+	  # Force the while-loop once if everything was read by header reading
+	  $one_time_through++ and last;
+      }
+      if($opt::r) {
+	  # Remove empty lines
+	  $buf =~ s/^\s*\n//gm;
+	  if(length $buf == 0) {
+	      next;
+	  }
+      }
+      if($Global::max_lines and not $Global::max_number_of_args) {
+	  # Read n-line records
+	  my $n_lines = $buf =~ tr/\n/\n/;
+	  my $last_newline_pos = rindex($buf,"\n");
+	  while($n_lines % $Global::max_lines) {
+	      $n_lines--;
+	      $last_newline_pos = rindex($buf,"\n",$last_newline_pos-1);
+	  }
+	  # Chop at $last_newline_pos as that is where n-line record ends
+	  $anything_written +=
+	      write_record_to_pipe($chunk_number++,\$header,\$buf,
+				   $recstart,$recend,$last_newline_pos+1);
+	  substr($buf,0,$last_newline_pos+1) = "";
+      } elsif($opt::regexp) {
+	  if($Global::max_number_of_args) {
+	      # -N => (start..*?end){n}
+	      # -L -N => (start..*?end){n*l}
+	      my $read_n_lines = $Global::max_number_of_args * ($Global::max_lines || 1);
+	      while($buf =~ s/((?:$recstart.*?$recend){$read_n_lines})($recstart.*)$/$2/os) {
+		  # Copy to modifiable variable
+		  my $b = $1;
+		  $anything_written +=
+		      write_record_to_pipe($chunk_number++,\$header,\$b,
+					   $recstart,$recend,length $1);
+	      }
+	  } else {
+	      # Find the last recend-recstart in $buf
+	      if($buf =~ s/(.*$recend)($recstart.*?)$/$2/os) {
+		  # Copy to modifiable variable
+		  my $b = $1;
+		  $anything_written +=
+		      write_record_to_pipe($chunk_number++,\$header,\$b,
+					   $recstart,$recend,length $1);
+	      }
+	  }
+      } else {
+	  if($Global::max_number_of_args) {
+	      # -N => (start..*?end){n}
+	      my $i = 0;
+	      my $read_n_lines = $Global::max_number_of_args * ($Global::max_lines || 1);
+	      while(($i = nindex(\$buf,$recendrecstart,$read_n_lines)) != -1) {
+		  $i += length $recend; # find the actual splitting location
+		  $anything_written +=
+		      write_record_to_pipe($chunk_number++,\$header,\$buf,
+					   $recstart,$recend,$i);
+		  substr($buf,0,$i) = "";
+	      }
+	  } else {
+	      # Find the last recend-recstart in $buf
+	      my $i = rindex($buf,$recendrecstart);
+	      if($i != -1) {
+		  $i += length $recend; # find the actual splitting location
+		  $anything_written +=
+		      write_record_to_pipe($chunk_number++,\$header,\$buf,
+					   $recstart,$recend,$i);
+		  substr($buf,0,$i) = "";
+	      }
+	  }
+      }
+      if(not $anything_written and not eof($in)) {
+	  # Nothing was written - maybe the block size < record size?
+	  # Increase blocksize exponentially
+	  my $old_blocksize = $blocksize;
+	  $blocksize = ceil($blocksize * 1.3 + 1);
+	  ::warning("A record was longer than $old_blocksize. " .
+		    "Increasing to --blocksize $blocksize\n");
+      }
+    }
+    ::debug("init", "Done reading input\n");
+
+    # If there is anything left in the buffer write it
+    substr($buf,0,0) = "";
+    write_record_to_pipe($chunk_number++,\$header,\$buf,$recstart,$recend,length $buf);
+
+    $Global::start_no_new_jobs ||= 1;
+    if($opt::roundrobin) {
+	for my $job (values %Global::running) {
+	    close $job->fh(0,"w");
+	}
+	my %incomplete_jobs = %Global::running;
+	my $sleep = 1;
+	while(keys %incomplete_jobs) {
+	    my $something_written = 0;
+	    for my $pid (keys %incomplete_jobs) {
+		my $job = $incomplete_jobs{$pid};
+		if($job->stdin_buffer_length()) {
+		    $something_written += $job->non_block_write();
+		} else {
+		    delete $incomplete_jobs{$pid}
+		}
+	    }
+	    if($something_written) {
+		$sleep = $sleep/2+0.001;
+	    }
+	    $sleep = ::reap_usleep($sleep);
+	}
+    }
+}
+
+sub recstartrecend {
+    # Uses:
+    #   $opt::recstart
+    #   $opt::recend
+    # Returns:
+    #   $recstart,$recend with default values and regexp conversion
+    my($recstart,$recend);
+    if(defined($opt::recstart) and defined($opt::recend)) {
+	# If both --recstart and --recend is given then both must match
+	$recstart = $opt::recstart;
+	$recend = $opt::recend;
+    } elsif(defined($opt::recstart)) {
+	# If --recstart is given it must match start of record
+	$recstart = $opt::recstart;
+	$recend = "";
+    } elsif(defined($opt::recend)) {
+	# If --recend is given then it must match end of record
+	$recstart = "";
+	$recend = $opt::recend;
+    }
+
+    if($opt::regexp) {
+	# If $recstart/$recend contains '|' this should only apply to the regexp
+	$recstart = "(?:".$recstart.")";
+	$recend = "(?:".$recend.")";
+    } else {
+	# $recstart/$recend = printf strings (\n)
+	$recstart =~ s/\\([0rnt\'\"\\])/"qq|\\$1|"/gee;
+	$recend =~ s/\\([0rnt\'\"\\])/"qq|\\$1|"/gee;
+    }
+    return ($recstart,$recend);
+}
+
+sub nindex {
+    # See if string is in buffer N times
+    # Returns:
+    #   the position where the Nth copy is found
+    my ($buf_ref, $str, $n) = @_;
+    my $i = 0;
+    for(1..$n) {
+	$i = index($$buf_ref,$str,$i+1);
+	if($i == -1) { last }
+    }
+    return $i;
+}
+
+{
+    my @robin_queue;
+
+    sub round_robin_write {
+	# Input:
+	#   $header_ref = ref to $header string
+	#   $block_ref = ref to $block to be written
+	#   $recstart = record start string
+	#   $recend = record end string
+	#   $endpos = end position of $block
+	# Uses:
+	#   %Global::running
+	my ($header_ref,$block_ref,$recstart,$recend,$endpos) = @_;
+	my $something_written = 0;
+	my $block_passed = 0;
+	my $sleep = 1;
+	while(not $block_passed) {
+	    # Continue flushing existing buffers
+	    # until one is empty and a new block is passed
+	    # Make a queue to spread the blocks evenly
+	    if(not @robin_queue) {
+		push @robin_queue, values %Global::running;
+	    }
+	    while(my $job = shift @robin_queue) {
+		if($job->stdin_buffer_length() > 0) {
+		    $something_written += $job->non_block_write();
+		} else {
+		    $job->set_stdin_buffer($header_ref,$block_ref,$endpos,$recstart,$recend);
+		    $block_passed = 1;
+		    $job->set_virgin(0);
+		    $something_written += $job->non_block_write();
+		    last;
+		}
+	    }
+	    $sleep = ::reap_usleep($sleep);
+	}
+	return $something_written;
+    }
+}
+
+sub write_record_to_pipe {
+    # Fork then
+    # Write record from pos 0 .. $endpos to pipe
+    # Input:
+    #   $chunk_number = sequence number - to see if already run
+    #   $header_ref = reference to header string to prepend
+    #   $record_ref = reference to record to write
+    #   $recstart = start string of record
+    #   $recend = end string of record
+    #   $endpos = position in $record_ref where record ends
+    # Uses:
+    #   $Global::job_already_run
+    #   $opt::roundrobin
+    #   @Global::virgin_jobs
+    # Returns:
+    #   Number of chunks written (0 or 1)
+    my ($chunk_number,$header_ref,$record_ref,$recstart,$recend,$endpos) = @_;
+    if($endpos == 0) { return 0; }
+    if(vec($Global::job_already_run,$chunk_number,1)) { return 1; }
+    if($opt::roundrobin) {
+	return round_robin_write($header_ref,$record_ref,$recstart,$recend,$endpos);
+    }
+    # If no virgin found, backoff
+    my $sleep = 0.0001; # 0.01 ms - better performance on highend
+    while(not @Global::virgin_jobs) {
+	::debug("pipe", "No virgin jobs");
+	$sleep = ::reap_usleep($sleep);
+	# Jobs may not be started because of loadavg
+	# or too little time between each ssh login.
+	start_more_jobs();
+    }
+    my $job = shift @Global::virgin_jobs;
+    # Job is no longer virgin
+    $job->set_virgin(0);
+    if(fork()) {
+	# Skip
+    } else {
+	# Chop of at $endpos as we do not know how many rec_sep will
+	# be removed.
+	substr($$record_ref,$endpos,length $$record_ref) = "";
+	# Remove rec_sep
+	if($opt::remove_rec_sep) {
+	    Job::remove_rec_sep($record_ref,$recstart,$recend);
+	}
+	$job->write($header_ref);
+	$job->write($record_ref);
+	close $job->fh(0,"w");
+	exit(0);
+    }
+    close $job->fh(0,"w");
+    return 1;
+}
+
+sub __SEM_MODE__ {}
+
+sub acquire_semaphore {
+    # Acquires semaphore. If needed: spawns to the background
+    # Uses:
+    #   @Global::host
+    # Returns:
+    #   The semaphore to be released when jobs is complete
+    $Global::host{':'} = SSHLogin->new(":");
+    my $sem = Semaphore->new($Semaphore::name,$Global::host{':'}->max_jobs_running());
+    $sem->acquire();
+    if($Semaphore::fg) {
+	# skip
+    } else {
+	# If run in the background, the PID will change
+	# therefore release and re-acquire the semaphore
+	$sem->release();
+	if(fork()) {
+	    exit(0);
+	} else {
+	    # child
+	    # Get a semaphore for this pid
+	    ::die_bug("Can't start a new session: $!") if setsid() == -1;
+	    $sem = Semaphore->new($Semaphore::name,$Global::host{':'}->max_jobs_running());
+	    $sem->acquire();
+	}
+    }
+    return $sem;
+}
+
+sub __PARSE_OPTIONS__ {}
+
+sub options_hash {
+    # Returns:
+    #   %hash = the GetOptions config
+    return
+	("debug|D=s" => \$opt::D,
+	 "xargs" => \$opt::xargs,
+	 "m" => \$opt::m,
+	 "X" => \$opt::X,
+	 "v" => \@opt::v,
+	 "joblog=s" => \$opt::joblog,
+	 "results|result|res=s" => \$opt::results,
+	 "resume" => \$opt::resume,
+	 "resume-failed|resumefailed" => \$opt::resume_failed,
+	 "silent" => \$opt::silent,
+	 #"silent-error|silenterror" => \$opt::silent_error,
+	 "keep-order|keeporder|k" => \$opt::keeporder,
+	 "group" => \$opt::group,
+	 "g" => \$opt::retired,
+	 "ungroup|u" => \$opt::ungroup,
+	 "linebuffer|linebuffered|line-buffer|line-buffered" => \$opt::linebuffer,
+	 "tmux" => \$opt::tmux,
+	 "null|0" => \$opt::0,
+	 "quote|q" => \$opt::q,
+	 # Replacement strings
+	 "parens=s" => \$opt::parens,
+	 "rpl=s" => \@opt::rpl,
+	 "plus" => \$opt::plus,
+	 "I=s" => \$opt::I,
+	 "extensionreplace|er=s" => \$opt::U,
+	 "U=s" => \$opt::retired,
+	 "basenamereplace|bnr=s" => \$opt::basenamereplace,
+	 "dirnamereplace|dnr=s" => \$opt::dirnamereplace,
+	 "basenameextensionreplace|bner=s" => \$opt::basenameextensionreplace,
+	 "seqreplace=s" => \$opt::seqreplace,
+	 "slotreplace=s" => \$opt::slotreplace,
+	 "jobs|j=s" => \$opt::jobs,
+	 "delay=f" => \$opt::delay,
+	 "sshdelay=f" => \$opt::sshdelay,
+	 "load=s" => \$opt::load,
+	 "noswap" => \$opt::noswap,
+	 "max-line-length-allowed" => \$opt::max_line_length_allowed,
+	 "number-of-cpus" => \$opt::number_of_cpus,
+	 "number-of-cores" => \$opt::number_of_cores,
+	 "use-cpus-instead-of-cores" => \$opt::use_cpus_instead_of_cores,
+	 "shellquote|shell_quote|shell-quote" => \$opt::shellquote,
+	 "nice=i" => \$opt::nice,
+	 "timeout=s" => \$opt::timeout,
+	 "tag" => \$opt::tag,
+	 "tagstring|tag-string=s" => \$opt::tagstring,
+	 "onall" => \$opt::onall,
+	 "nonall" => \$opt::nonall,
+	 "filter-hosts|filterhosts|filter-host" => \$opt::filter_hosts,
+	 "sshlogin|S=s" => \@opt::sshlogin,
+	 "sshloginfile|slf=s" => \@opt::sshloginfile,
+	 "controlmaster|M" => \$opt::controlmaster,
+	 "return=s" => \@opt::return,
+	 "trc=s" => \@opt::trc,
+	 "transfer" => \$opt::transfer,
+	 "cleanup" => \$opt::cleanup,
+	 "basefile|bf=s" => \@opt::basefile,
+	 "B=s" => \$opt::retired,
+	 "ctrlc|ctrl-c" => \$opt::ctrlc,
+	 "noctrlc|no-ctrlc|no-ctrl-c" => \$opt::noctrlc,
+	 "workdir|work-dir|wd=s" => \$opt::workdir,
+	 "W=s" => \$opt::retired,
+	 "tmpdir=s" => \$opt::tmpdir,
+	 "tempdir=s" => \$opt::tmpdir,
+	 "use-compress-program|compress-program=s" => \$opt::compress_program,
+	 "use-decompress-program|decompress-program=s" => \$opt::decompress_program,
+	 "compress" => \$opt::compress,
+	 "tty" => \$opt::tty,
+	 "T" => \$opt::retired,
+	 "halt-on-error|halt=s" => \$opt::halt_on_error,
+	 "H=i" => \$opt::retired,
+	 "retries=i" => \$opt::retries,
+	 "dry-run|dryrun" => \$opt::dryrun,
+	 "progress" => \$opt::progress,
+	 "eta" => \$opt::eta,
+	 "bar" => \$opt::bar,
+	 "arg-sep|argsep=s" => \$opt::arg_sep,
+	 "arg-file-sep|argfilesep=s" => \$opt::arg_file_sep,
+	 "trim=s" => \$opt::trim,
+	 "env=s" => \@opt::env,
+	 "recordenv|record-env" => \$opt::record_env,
+	 "plain" => \$opt::plain,
+	 "profile|J=s" => \@opt::profile,
+	 "pipe|spreadstdin" => \$opt::pipe,
+	 "robin|round-robin|roundrobin" => \$opt::roundrobin,
+	 "recstart=s" => \$opt::recstart,
+	 "recend=s" => \$opt::recend,
+	 "regexp|regex" => \$opt::regexp,
+	 "remove-rec-sep|removerecsep|rrs" => \$opt::remove_rec_sep,
+	 "files|output-as-files|outputasfiles" => \$opt::files,
+	 "block|block-size|blocksize=s" => \$opt::blocksize,
+	 "tollef" => \$opt::retired,
+	 "gnu" => \$opt::gnu,
+	 "xapply" => \$opt::xapply,
+	 "bibtex" => \$opt::bibtex,
+	 "nn|nonotice|no-notice" => \$opt::no_notice,
+	 # xargs-compatibility - implemented, man, testsuite
+	 "max-procs|P=s" => \$opt::jobs,
+	 "delimiter|d=s" => \$opt::d,
+	 "max-chars|s=i" => \$opt::max_chars,
+	 "arg-file|a=s" => \@opt::a,
+	 "no-run-if-empty|r" => \$opt::r,
+	 "replace|i:s" => \$opt::i,
+	 "E=s" => \$opt::eof,
+	 "eof|e:s" => \$opt::eof,
+	 "max-args|n=i" => \$opt::max_args,
+	 "max-replace-args|N=i" => \$opt::max_replace_args,
+	 "colsep|col-sep|C=s" => \$opt::colsep,
+	 "help|h" => \$opt::help,
+	 "L=f" => \$opt::L,
+	 "max-lines|l:f" => \$opt::max_lines,
+	 "interactive|p" => \$opt::p,
+	 "verbose|t" => \$opt::verbose,
+	 "version|V" => \$opt::version,
+	 "minversion|min-version=i" => \$opt::minversion,
+	 "show-limits|showlimits" => \$opt::show_limits,
+	 "exit|x" => \$opt::x,
+	 # Semaphore
+	 "semaphore" => \$opt::semaphore,
+	 "semaphoretimeout=i" => \$opt::semaphoretimeout,
+	 "semaphorename|id=s" => \$opt::semaphorename,
+	 "fg" => \$opt::fg,
+	 "bg" => \$opt::bg,
+	 "wait" => \$opt::wait,
+	 # Shebang #!/usr/bin/parallel --shebang
+	 "shebang|hashbang" => \$opt::shebang,
+	 "internal-pipe-means-argfiles" => \$opt::internal_pipe_means_argfiles,
+	 "Y" => \$opt::retired,
+         "skip-first-line" => \$opt::skip_first_line,
+	 "header=s" => \$opt::header,
+	 "cat" => \$opt::cat,
+	 "fifo" => \$opt::fifo,
+	 "pipepart|pipe-part" => \$opt::pipepart,
+	 "hgrp|hostgroup|hostgroups" => \$opt::hostgroups,
+	);
+}
+
+sub get_options_from_array {
+    # Run GetOptions on @array
+    # Input:
+    #   $array_ref = ref to @ARGV to parse
+    #   @keep_only = Keep only these options
+    # Uses:
+    #   @ARGV
+    # Returns:
+    #   true if parsing worked
+    #   false if parsing failed
+    #   @$array_ref is changed
+    my ($array_ref, @keep_only) = @_;
+    if(not @$array_ref) {
+	# Empty array: No need to look more at that
+	return 1;
+    }
+    # A bit of shuffling of @ARGV needed as GetOptionsFromArray is not
+    # supported everywhere
+    my @save_argv;
+    my $this_is_ARGV = (\@::ARGV == $array_ref);
+    if(not $this_is_ARGV) {
+	@save_argv = @::ARGV;
+	@::ARGV = @{$array_ref};
+    }
+    # If @keep_only set: Ignore all values except @keep_only
+    my %options = options_hash();
+    if(@keep_only) {
+	my (%keep,@dummy);
+	@keep{@keep_only} = @keep_only;
+	for my $k (grep { not $keep{$_} } keys %options) {
+	    # Store the value of the option in @dummy
+	    $options{$k} = \@dummy;
+	}
+    }
+    my $retval = GetOptions(%options);
+    if(not $this_is_ARGV) {
+	@{$array_ref} = @::ARGV;
+	@::ARGV = @save_argv;
+    }
+    return $retval;
+}
+
+sub parse_options {
+    # Returns: N/A
+    # Defaults:
+    $Global::version = 20141122;
+    $Global::progname = 'parallel';
+    $Global::infinity = 2**31;
+    $Global::debug = 0;
+    $Global::verbose = 0;
+    $Global::quoting = 0;
+    # Read only table with default --rpl values
+    %Global::replace =
+	(
+	 '{}'   => '',
+	 '{#}'  => '1 $_=$job->seq()',
+	 '{%}'  => '1 $_=$job->slot()',
+	 '{/}'  => 's:.*/::',
+	 '{//}' => '$Global::use{"File::Basename"} ||= eval "use File::Basename; 1;"; $_ = dirname($_);',
+	 '{/.}' => 's:.*/::; s:\.[^/.]+$::;',
+	 '{.}'  => 's:\.[^/.]+$::',
+	);
+    %Global::plus =
+	(
+	 # {} = {+/}/{/}
+	 #    = {.}.{+.}     = {+/}/{/.}.{+.}
+	 #    = {..}.{+..}   = {+/}/{/..}.{+..}
+	 #    = {...}.{+...} = {+/}/{/...}.{+...}
+	 '{+/}' => 's:/[^/]*$::',
+	 '{+.}' => 's:.*\.::',
+	 '{+..}' => 's:.*\.([^.]*\.):$1:',
+	 '{+...}' => 's:.*\.([^.]*\.[^.]*\.):$1:',
+	 '{..}' => 's:\.[^/.]+$::; s:\.[^/.]+$::',
+	 '{...}' => 's:\.[^/.]+$::; s:\.[^/.]+$::; s:\.[^/.]+$::',
+	 '{/..}' => 's:.*/::; s:\.[^/.]+$::; s:\.[^/.]+$::',
+	 '{/...}' => 's:.*/::; s:\.[^/.]+$::; s:\.[^/.]+$::; s:\.[^/.]+$::',
+	);
+    # Modifiable copy of %Global::replace
+    %Global::rpl = %Global::replace;
+    $Global::parens = "{==}";
+    $/="\n";
+    $Global::ignore_empty = 0;
+    $Global::interactive = 0;
+    $Global::stderr_verbose = 0;
+    $Global::default_simultaneous_sshlogins = 9;
+    $Global::exitstatus = 0;
+    $Global::halt_on_error_exitstatus = 0;
+    $Global::arg_sep = ":::";
+    $Global::arg_file_sep = "::::";
+    $Global::trim = 'n';
+    $Global::max_jobs_running = 0;
+    $Global::job_already_run = '';
+    $ENV{'TMPDIR'} ||= "/tmp";
+
+    @ARGV=read_options();
+
+    if(@opt::v) { $Global::verbose = $#opt::v+1; } # Convert -v -v to v=2
+    $Global::debug = $opt::D;
+    $Global::shell = $ENV{'PARALLEL_SHELL'} || parent_shell($$) || $ENV{'SHELL'} || "/bin/sh";
+    if(defined $opt::X) { $Global::ContextReplace = 1; }
+    if(defined $opt::silent) { $Global::verbose = 0; }
+    if(defined $opt::0) { $/ = "\0"; }
+    if(defined $opt::d) { my $e="sprintf \"$opt::d\""; $/ = eval $e; }
+    if(defined $opt::p) { $Global::interactive = $opt::p; }
+    if(defined $opt::q) { $Global::quoting = 1; }
+    if(defined $opt::r) { $Global::ignore_empty = 1; }
+    if(defined $opt::verbose) { $Global::stderr_verbose = 1; }
+    # Deal with --rpl
+    sub rpl {
+	# Modify %Global::rpl
+	# Replace $old with $new
+	my ($old,$new) =  @_;
+	if($old ne $new) {
+	    $Global::rpl{$new} = $Global::rpl{$old};
+	    delete $Global::rpl{$old};
+	}
+    }
+    if(defined $opt::parens) { $Global::parens = $opt::parens; }
+    my $parenslen = 0.5*length $Global::parens;
+    $Global::parensleft = substr($Global::parens,0,$parenslen);
+    $Global::parensright = substr($Global::parens,$parenslen);
+    if(defined $opt::plus) { %Global::rpl = (%Global::plus,%Global::rpl); }
+    if(defined $opt::I) { rpl('{}',$opt::I); }
+    if(defined $opt::U) { rpl('{.}',$opt::U); }
+    if(defined $opt::i and $opt::i) { rpl('{}',$opt::i); }
+    if(defined $opt::basenamereplace) { rpl('{/}',$opt::basenamereplace); }
+    if(defined $opt::dirnamereplace) { rpl('{//}',$opt::dirnamereplace); }
+    if(defined $opt::seqreplace) { rpl('{#}',$opt::seqreplace); }
+    if(defined $opt::slotreplace) { rpl('{%}',$opt::slotreplace); }
+    if(defined $opt::basenameextensionreplace) {
+       rpl('{/.}',$opt::basenameextensionreplace);
+    }
+    for(@opt::rpl) {
+	# Create $Global::rpl entries for --rpl options
+	# E.g: "{..} s:\.[^.]+$:;s:\.[^.]+$:;"
+	my ($shorthand,$long) = split/ /,$_,2;
+	$Global::rpl{$shorthand} = $long;
+    }
+    if(defined $opt::eof) { $Global::end_of_file_string = $opt::eof; }
+    if(defined $opt::max_args) { $Global::max_number_of_args = $opt::max_args; }
+    if(defined $opt::timeout) { $Global::timeoutq = TimeoutQueue->new($opt::timeout); }
+    if(defined $opt::tmpdir) { $ENV{'TMPDIR'} = $opt::tmpdir; }
+    if(defined $opt::help) { die_usage(); }
+    if(defined $opt::colsep) { $Global::trim = 'lr'; }
+    if(defined $opt::header) { $opt::colsep = defined $opt::colsep ? $opt::colsep : "\t"; }
+    if(defined $opt::trim) { $Global::trim = $opt::trim; }
+    if(defined $opt::arg_sep) { $Global::arg_sep = $opt::arg_sep; }
+    if(defined $opt::arg_file_sep) { $Global::arg_file_sep = $opt::arg_file_sep; }
+    if(defined $opt::number_of_cpus) { print SSHLogin::no_of_cpus(),"\n"; wait_and_exit(0); }
+    if(defined $opt::number_of_cores) {
+        print SSHLogin::no_of_cores(),"\n"; wait_and_exit(0);
+    }
+    if(defined $opt::max_line_length_allowed) {
+        print Limits::Command::real_max_length(),"\n"; wait_and_exit(0);
+    }
+    if(defined $opt::version) { version(); wait_and_exit(0); }
+    if(defined $opt::bibtex) { bibtex(); wait_and_exit(0); }
+    if(defined $opt::record_env) { record_env(); wait_and_exit(0); }
+    if(defined $opt::show_limits) { show_limits(); }
+    if(@opt::sshlogin) { @Global::sshlogin = @opt::sshlogin; }
+    if(@opt::sshloginfile) { read_sshloginfiles(@opt::sshloginfile); }
+    if(@opt::return) { push @Global::ret_files, @opt::return; }
+    if(not defined $opt::recstart and
+       not defined $opt::recend) { $opt::recend = "\n"; }
+    if(not defined $opt::blocksize) { $opt::blocksize = "1M"; }
+    $opt::blocksize = multiply_binary_prefix($opt::blocksize);
+    if(defined $opt::controlmaster) { $opt::noctrlc = 1; }
+    if(defined $opt::semaphore) { $Global::semaphore = 1; }
+    if(defined $opt::semaphoretimeout) { $Global::semaphore = 1; }
+    if(defined $opt::semaphorename) { $Global::semaphore = 1; }
+    if(defined $opt::fg) { $Global::semaphore = 1; }
+    if(defined $opt::bg) { $Global::semaphore = 1; }
+    if(defined $opt::wait) { $Global::semaphore = 1; }
+    if(defined $opt::halt_on_error and
+       $opt::halt_on_error=~/%/) { $opt::halt_on_error /= 100; }
+    if(defined $opt::timeout and $opt::timeout !~ /^\d+(\.\d+)?%?$/) {
+	::error("--timeout must be seconds or percentage\n");
+	wait_and_exit(255);
+    }
+    if(defined $opt::minversion) {
+	print $Global::version,"\n";
+	if($Global::version < $opt::minversion) {
+	    wait_and_exit(255);
+	} else {
+	    wait_and_exit(0);
+	}
+    }
+    if(not defined $opt::delay) {
+	# Set --delay to --sshdelay if not set
+	$opt::delay = $opt::sshdelay;
+    }
+    if($opt::compress_program) {
+	$opt::compress = 1;
+	$opt::decompress_program ||= $opt::compress_program." -dc";
+    }
+    if($opt::compress) {
+	my ($compress, $decompress) = find_compression_program();
+	$opt::compress_program ||= $compress;
+	$opt::decompress_program ||= $decompress;
+    }
+    if(defined $opt::nonall) {
+	# Append a dummy empty argument
+	push @ARGV, $Global::arg_sep, "";
+    }
+    if(defined $opt::tty) {
+        # Defaults for --tty: -j1 -u
+        # Can be overridden with -jXXX -g
+        if(not defined $opt::jobs) {
+            $opt::jobs = 1;
+        }
+        if(not defined $opt::group) {
+            $opt::ungroup = 0;
+        }
+    }
+    if(@opt::trc) {
+        push @Global::ret_files, @opt::trc;
+        $opt::transfer = 1;
+        $opt::cleanup = 1;
+    }
+    if(defined $opt::max_lines) {
+	if($opt::max_lines eq "-0") {
+	    # -l -0 (swallowed -0)
+	    $opt::max_lines = 1;
+	    $opt::0 = 1;
+	    $/ = "\0";
+	} elsif ($opt::max_lines == 0) {
+	    # If not given (or if 0 is given) => 1
+	    $opt::max_lines = 1;
+	}
+	$Global::max_lines = $opt::max_lines;
+	if(not $opt::pipe) {
+	    # --pipe -L means length of record - not max_number_of_args
+	    $Global::max_number_of_args ||= $Global::max_lines;
+	}
+    }
+
+    # Read more than one arg at a time (-L, -N)
+    if(defined $opt::L) {
+	$Global::max_lines = $opt::L;
+	if(not $opt::pipe) {
+	    # --pipe -L means length of record - not max_number_of_args
+	    $Global::max_number_of_args ||= $Global::max_lines;
+	}
+    }
+    if(defined $opt::max_replace_args) {
+	$Global::max_number_of_args = $opt::max_replace_args;
+	$Global::ContextReplace = 1;
+    }
+    if((defined $opt::L or defined $opt::max_replace_args)
+       and
+       not ($opt::xargs or $opt::m)) {
+	$Global::ContextReplace = 1;
+    }
+    if(defined $opt::tag and not defined $opt::tagstring) {
+	$opt::tagstring = "\257<\257>"; # Default = {}
+    }
+    if(defined $opt::pipepart and
+       (defined $opt::L or defined $opt::max_lines
+	or defined $opt::max_replace_args)) {
+	::error("--pipepart is incompatible with --max-replace-args, ",
+		"--max-lines, and -L.\n");
+	wait_and_exit(255);
+    }
+    if(grep /^$Global::arg_sep$|^$Global::arg_file_sep$/o, @ARGV) {
+        # Deal with ::: and ::::
+        @ARGV=read_args_from_command_line();
+    }
+
+    # Semaphore defaults
+    # Must be done before computing number of processes and max_line_length
+    # because when running as a semaphore GNU Parallel does not read args
+    $Global::semaphore ||= ($0 =~ m:(^|/)sem$:); # called as 'sem'
+    if($Global::semaphore) {
+        # A semaphore does not take input from neither stdin nor file
+        @opt::a = ("/dev/null");
+        push(@Global::unget_argv, [Arg->new("")]);
+        $Semaphore::timeout = $opt::semaphoretimeout || 0;
+        if(defined $opt::semaphorename) {
+            $Semaphore::name = $opt::semaphorename;
+        } else {
+            $Semaphore::name = `tty`;
+            chomp $Semaphore::name;
+        }
+        $Semaphore::fg = $opt::fg;
+        $Semaphore::wait = $opt::wait;
+        $Global::default_simultaneous_sshlogins = 1;
+        if(not defined $opt::jobs) {
+            $opt::jobs = 1;
+        }
+	if($Global::interactive and $opt::bg) {
+	    ::error("Jobs running in the ".
+		    "background cannot be interactive.\n");
+            ::wait_and_exit(255);
+	}
+    }
+    if(defined $opt::eta) {
+        $opt::progress = $opt::eta;
+    }
+    if(defined $opt::bar) {
+        $opt::progress = $opt::bar;
+    }
+    if(defined $opt::retired) {
+	    ::error("-g has been retired. Use --group.\n");
+	    ::error("-B has been retired. Use --bf.\n");
+	    ::error("-T has been retired. Use --tty.\n");
+	    ::error("-U has been retired. Use --er.\n");
+	    ::error("-W has been retired. Use --wd.\n");
+	    ::error("-Y has been retired. Use --shebang.\n");
+	    ::error("-H has been retired. Use --halt.\n");
+	    ::error("--tollef has been retired. Use -u -q --arg-sep -- and --load for -l.\n");
+            ::wait_and_exit(255);
+    }
+    citation_notice();
+
+    parse_sshlogin();
+    parse_env_var();
+
+    if(remote_hosts() and ($opt::X or $opt::m or $opt::xargs)) {
+        # As we do not know the max line length on the remote machine
+        # long commands generated by xargs may fail
+        # If opt_N is set, it is probably safe
+        ::warning("Using -X or -m with --sshlogin may fail.\n");
+    }
+
+    if(not defined $opt::jobs) {
+        $opt::jobs = "100%";
+    }
+    open_joblog();
+}
+
+sub env_quote {
+    # Input:
+    #   $v = value to quote
+    # Returns:
+    #   $v = value quoted as environment variable
+    my $v = $_[0];
+    $v =~ s/([\\])/\\$1/g;
+    $v =~ s/([\[\] \#\'\&\<\>\(\)\;\{\}\t\"\$\`\*\174\!\?\~])/\\$1/g;
+    $v =~ s/\n/"\n"/g;
+    return $v;
+}
+
+sub record_env {
+    # Record current %ENV-keys in ~/.parallel/ignored_vars
+    # Returns: N/A
+    my $ignore_filename = $ENV{'HOME'} . "/.parallel/ignored_vars";
+    if(open(my $vars_fh, ">", $ignore_filename)) {
+	print $vars_fh map { $_,"\n" } keys %ENV;
+    } else {
+	::error("Cannot write to $ignore_filename\n");
+	::wait_and_exit(255);
+    }
+}
+
+sub parse_env_var {
+    # Parse --env and set $Global::envvar, $Global::envwarn and $Global::envvarlen
+    #
+    # Bash functions must be parsed to export them remotely
+    #   Pre-shellshock style bash function:
+    #     myfunc=() {...
+    #   Post-shellshock style bash function:
+    #     BASH_FUNC_myfunc()=() {...
+    #
+    # Uses:
+    #   $Global::envvar = eval string that will set variables in both bash and csh
+    #   $Global::envwarn = If functions are used: Give warning in csh
+    #   $Global::envvarlen = length of $Global::envvar
+    #   @opt::env
+    #   $Global::shell
+    #   %ENV
+    # Returns: N/A
+    $Global::envvar = "";
+    $Global::envwarn = "";
+    my @vars = ('parallel_bash_environment');
+    for my $varstring (@opt::env) {
+        # Split up --env VAR1,VAR2
+	push @vars, split /,/, $varstring;
+    }
+    if(grep { /^_$/ } @vars) {
+	# --env _
+	# Include all vars that are not in a clean environment
+	if(open(my $vars_fh, "<", $ENV{'HOME'} . "/.parallel/ignored_vars")) {
+	    my @ignore = <$vars_fh>;
+	    chomp @ignore;
+	    my %ignore;
+	    @ignore{@ignore} = @ignore;
+	    close $vars_fh;
+	    push @vars, grep { not defined $ignore{$_} } keys %ENV;
+	    @vars = grep { not /^_$/ } @vars;
+	} else {
+	    ::error("Run '$Global::progname --record-env' in a clean environment first.\n");
+	    ::wait_and_exit(255);
+	}
+    }
+    # Duplicate vars as BASH functions to include post-shellshock functions.
+    # So --env myfunc should also look for BASH_FUNC_myfunc()
+    @vars = map { $_, "BASH_FUNC_$_()" } @vars;
+    # Keep only defined variables
+    @vars = grep { defined($ENV{$_}) } @vars;
+    # Pre-shellshock style bash function:
+    #   myfunc=() {  echo myfunc
+    #   }
+    # Post-shellshock style bash function:
+    #   BASH_FUNC_myfunc()=() {  echo myfunc
+    #   }
+    my @bash_functions = grep { substr($ENV{$_},0,4) eq "() {" } @vars;
+    my @non_functions = grep { substr($ENV{$_},0,4) ne "() {" } @vars;
+    if(@bash_functions) {
+	# Functions are not supported for all shells
+	if($Global::shell !~ m:/(bash|rbash|zsh|rzsh|dash|ksh):) {
+	    ::warning("Shell functions may not be supported in $Global::shell\n");
+	}
+    }
+
+    # Pre-shellschock names are without ()
+    my @bash_pre_shellshock = grep { not /\(\)/ } @bash_functions;
+    # Post-shellschock names are with ()
+    my @bash_post_shellshock = grep { /\(\)/ } @bash_functions;
+
+    my @qcsh = (map { my $a=$_; "setenv $a " . env_quote($ENV{$a})  }
+		grep { not /^parallel_bash_environment$/ } @non_functions);
+    my @qbash = (map { my $a=$_; "export $a=" . env_quote($ENV{$a}) }
+		 @non_functions, @bash_pre_shellshock);
+
+    push @qbash, map { my $a=$_; "eval $a\"\$$a\"" } @bash_pre_shellshock;
+    push @qbash, map { /BASH_FUNC_(.*)\(\)/; "$1 $ENV{$_}" } @bash_post_shellshock;
+
+    #ssh -tt -oLogLevel=quiet lo 'eval `echo PARALLEL_SEQ='$PARALLEL_SEQ'\;export PARALLEL_SEQ\;  PARALLEL_PID='$PARALLEL_PID'\;export PARALLEL_PID` ;'  tty\ \>/dev/null\ \&\&\ stty\ isig\ -onlcr\ -echo\;echo\ \$SHELL\ \|\ grep\ \"/t\\\{0,1\\\}csh\"\ \>\ /dev/null\ \&\&\ setenv\ BASH_FUNC_myfunc\ \\\(\\\)\\\ \\\{\\\ \\\ echo\\\ a\"'
+    #'\"\\\}\ \|\|\  myfunc\(\)\ \{\ \ echo\ a'
+    #'\}\ \;myfunc\ 1;
+
+    # Check if any variables contain \n
+    if(my @v = map { s/BASH_FUNC_(.*)\(\)/$1/; $_ } grep { $ENV{$_}=~/\n/ } @vars) {
+	# \n is bad for csh and will cause it to fail.
+	$Global::envwarn = ::shell_quote_scalar(q{echo $SHELL | grep -E "/t?csh" > /dev/null && echo CSH/TCSH DO NOT SUPPORT newlines IN VARIABLES/FUNCTIONS. Unset }."@v".q{ && exec false;}."\n\n") . $Global::envwarn;
+    }
+
+    if(not @qcsh) { push @qcsh, "true"; }
+    if(not @qbash) { push @qbash, "true"; }
+    # Create lines like:
+    # echo $SHELL | grep "/t\\{0,1\\}csh" >/dev/null && setenv V1 val1 && setenv V2 val2 || export V1=val1 && export V2=val2 ; echo "$V1$V2"
+    if(@vars) {
+	$Global::envvar .=
+	    join"",
+	    (q{echo $SHELL | grep "/t\\{0,1\\}csh" > /dev/null && }
+	     . join(" && ", @qcsh)
+	     . q{ || }
+	     . join(" && ", @qbash)
+	     .q{;});
+	if($ENV{'parallel_bash_environment'}) {
+	    $Global::envvar .= 'eval "$parallel_bash_environment";'."\n";
+	}
+    }
+    $Global::envvarlen = length $Global::envvar;
+}
+
+sub open_joblog {
+    # Open joblog as specified by --joblog
+    # Uses:
+    #   $opt::resume
+    #   $opt::resume_failed
+    #   $opt::joblog
+    #   $opt::results
+    #   $Global::job_already_run
+    #   %Global::fd
+    my $append = 0;
+    if(($opt::resume or $opt::resume_failed)
+       and
+       not ($opt::joblog or $opt::results)) {
+        ::error("--resume and --resume-failed require --joblog or --results.\n");
+	::wait_and_exit(255);
+    }
+    if($opt::joblog) {
+	if($opt::resume || $opt::resume_failed) {
+	    if(open(my $joblog_fh, "<", $opt::joblog)) {
+		# Read the joblog
+		$append = <$joblog_fh>; # If there is a header: Open as append later
+		my $joblog_regexp;
+		if($opt::resume_failed) {
+		    # Make a regexp that only matches commands with exit+signal=0
+		    # 4 host 1360490623.067 3.445 1023 1222 0 0 command
+		    $joblog_regexp='^(\d+)(?:\t[^\t]+){5}\t0\t0\t';
+		} else {
+		    # Just match the job number
+		    $joblog_regexp='^(\d+)';
+		}
+		while(<$joblog_fh>) {
+		    if(/$joblog_regexp/o) {
+			# This is 30% faster than set_job_already_run($1);
+			vec($Global::job_already_run,($1||0),1) = 1;
+		    } elsif(not /\d+\s+[^\s]+\s+([0-9.]+\s+){6}/) {
+			::error("Format of '$opt::joblog' is wrong: $_");
+			::wait_and_exit(255);
+		    }
+		}
+		close $joblog_fh;
+	    }
+	}
+	if($append) {
+	    # Append to joblog
+	    if(not open($Global::joblog, ">>", $opt::joblog)) {
+		::error("Cannot append to --joblog $opt::joblog.\n");
+		::wait_and_exit(255);
+	    }
+	} else {
+	    if($opt::joblog eq "-") {
+		# Use STDOUT as joblog
+		$Global::joblog = $Global::fd{1};
+	    } elsif(not open($Global::joblog, ">", $opt::joblog)) {
+		# Overwrite the joblog
+		::error("Cannot write to --joblog $opt::joblog.\n");
+		::wait_and_exit(255);
+	    }
+	    print $Global::joblog
+		join("\t", "Seq", "Host", "Starttime", "JobRuntime",
+		     "Send", "Receive", "Exitval", "Signal", "Command"
+		). "\n";
+	}
+    }
+}
+
+sub find_compression_program {
+    # Find a fast compression program
+    # Returns:
+    #   $compress_program = compress program with options
+    #   $decompress_program = decompress program with options
+
+    # Search for these. Sorted by speed
+    my @prg = qw(lzop pigz pxz gzip plzip pbzip2 lzma xz lzip bzip2);
+    for my $p (@prg) {
+	if(which($p)) {
+	    return ("$p -c -1","$p -dc");
+	}
+    }
+    # Fall back to cat
+    return ("cat","cat");
+}
+
+
+sub read_options {
+    # Read options from command line, profile and $PARALLEL
+    # Uses:
+    #   $opt::shebang_wrap
+    #   $opt::shebang
+    #   @ARGV
+    #   $opt::plain
+    #   @opt::profile
+    #   $ENV{'HOME'}
+    #   $ENV{'PARALLEL'}
+    # Returns:
+    #   @ARGV_no_opt = @ARGV without --options
+
+    # This must be done first as this may exec myself
+    if(defined $ARGV[0] and ($ARGV[0] =~ /^--shebang/ or
+			     $ARGV[0] =~ /^--shebang-?wrap/ or
+			     $ARGV[0] =~ /^--hashbang/)) {
+        # Program is called from #! line in script
+	# remove --shebang-wrap if it is set
+        $opt::shebang_wrap = ($ARGV[0] =~ s/^--shebang-?wrap *//);
+	# remove --shebang if it is set
+	$opt::shebang = ($ARGV[0] =~ s/^--shebang *//);
+	# remove --hashbang if it is set
+        $opt::shebang .= ($ARGV[0] =~ s/^--hashbang *//);
+	if($opt::shebang) {
+	    my $argfile = shell_quote_scalar(pop @ARGV);
+	    # exec myself to split $ARGV[0] into separate fields
+	    exec "$0 --skip-first-line -a $argfile @ARGV";
+	}
+	if($opt::shebang_wrap) {
+            my @options;
+	    my @parser;
+	    if ($^O eq 'freebsd') {
+		# FreeBSD's #! puts different values in @ARGV than Linux' does.
+		my @nooptions = @ARGV;
+		get_options_from_array(\@nooptions);
+		while($#ARGV > $#nooptions) {
+		    push @options, shift @ARGV;
+		}
+		while(@ARGV and $ARGV[0] ne ":::") {
+		    push @parser, shift @ARGV;
+		}
+		if(@ARGV and $ARGV[0] eq ":::") {
+		    shift @ARGV;
+		}
+	    } else {
+		@options = shift @ARGV;
+	    }
+	    my $script = shell_quote_scalar(shift @ARGV);
+	    # exec myself to split $ARGV[0] into separate fields
+	    exec "$0 --internal-pipe-means-argfiles @options @parser $script ::: @ARGV";
+	}
+    }
+
+    Getopt::Long::Configure("bundling","require_order");
+    my @ARGV_copy = @ARGV;
+    # Check if there is a --profile to set @opt::profile
+    get_options_from_array(\@ARGV_copy,"profile|J=s","plain") || die_usage();
+    my @ARGV_profile = ();
+    my @ARGV_env = ();
+    if(not $opt::plain) {
+	# Add options from .parallel/config and other profiles
+	my @config_profiles = (
+	    "/etc/parallel/config",
+	    $ENV{'HOME'}."/.parallel/config",
+	    $ENV{'HOME'}."/.parallelrc");
+	my @profiles = @config_profiles;
+	if(@opt::profile) {
+	    # --profile overrides default profiles
+	    @profiles = ();
+	    for my $profile (@opt::profile) {
+		if(-r $profile) {
+		    push @profiles, $profile;
+		} else {
+		    push @profiles, $ENV{'HOME'}."/.parallel/".$profile;
+		}
+	    }
+	}
+	for my $profile (@profiles) {
+	    if(-r $profile) {
+		open (my $in_fh, "<", $profile) || ::die_bug("read-profile: $profile");
+		while(<$in_fh>) {
+		    /^\s*\#/ and next;
+		    chomp;
+		    push @ARGV_profile, shellwords($_);
+		}
+		close $in_fh;
+	    } else {
+		if(grep /^$profile$/, @config_profiles) {
+		    # config file is not required to exist
+		} else {
+		    ::error("$profile not readable.\n");
+		    wait_and_exit(255);
+		}
+	    }
+	}
+	# Add options from shell variable $PARALLEL
+	if($ENV{'PARALLEL'}) {
+	    @ARGV_env = shellwords($ENV{'PARALLEL'});
+	}
+    }
+    Getopt::Long::Configure("bundling","require_order");
+    get_options_from_array(\@ARGV_profile) || die_usage();
+    get_options_from_array(\@ARGV_env) || die_usage();
+    get_options_from_array(\@ARGV) || die_usage();
+
+    # Prepend non-options to @ARGV (such as commands like 'nice')
+    unshift @ARGV, @ARGV_profile, @ARGV_env;
+    return @ARGV;
+}
+
+sub read_args_from_command_line {
+    # Arguments given on the command line after:
+    #   ::: ($Global::arg_sep)
+    #   :::: ($Global::arg_file_sep)
+    # Removes the arguments from @ARGV and:
+    # - puts filenames into -a
+    # - puts arguments into files and add the files to -a
+    # Input:
+    #   @::ARGV = command option ::: arg arg arg :::: argfiles
+    # Uses:
+    #   $Global::arg_sep
+    #   $Global::arg_file_sep
+    #   $opt::internal_pipe_means_argfiles
+    #   $opt::pipe
+    #   @opt::a
+    # Returns:
+    #   @argv_no_argsep = @::ARGV without ::: and :::: and following args
+    my @new_argv = ();
+    for(my $arg = shift @ARGV; @ARGV; $arg = shift @ARGV) {
+        if($arg eq $Global::arg_sep
+	   or
+	   $arg eq $Global::arg_file_sep) {
+	    my $group = $arg; # This group of arguments is args or argfiles
+	    my @group;
+	    while(defined ($arg = shift @ARGV)) {
+		if($arg eq $Global::arg_sep
+		   or
+		   $arg eq $Global::arg_file_sep) {
+		    # exit while loop if finding new separator
+		    last;
+		} else {
+		    # If not hitting ::: or ::::
+		    # Append it to the group
+		    push @group, $arg;
+		}
+	    }
+
+	    if($group eq $Global::arg_file_sep
+	       or ($opt::internal_pipe_means_argfiles and $opt::pipe)
+		) {
+		# Group of file names on the command line.
+		# Append args into -a
+		push @opt::a, @group;
+	    } elsif($group eq $Global::arg_sep) {
+		# Group of arguments on the command line.
+		# Put them into a file.
+		# Create argfile
+		my ($outfh,$name) = ::tmpfile(SUFFIX => ".arg");
+		unlink($name);
+		# Put args into argfile
+		print $outfh map { $_,$/ } @group;
+		seek $outfh, 0, 0;
+		# Append filehandle to -a
+		push @opt::a, $outfh;
+	    } else {
+		::die_bug("Unknown command line group: $group");
+	    }
+	    if(defined($arg)) {
+		# $arg is ::: or ::::
+		redo;
+	    } else {
+		# $arg is undef -> @ARGV empty
+		last;
+	    }
+	}
+	push @new_argv, $arg;
+    }
+    # Output: @ARGV = command to run with options
+    return @new_argv;
+}
+
+sub cleanup {
+    # Returns: N/A
+    if(@opt::basefile) { cleanup_basefile(); }
+}
+
+sub __QUOTING_ARGUMENTS_FOR_SHELL__ {}
+
+sub shell_quote {
+    # Input:
+    #   @strings = strings to be quoted
+    # Output:
+    #   @shell_quoted_strings = string quoted with \ as needed by the shell
+    my @strings = (@_);
+    for my $a (@strings) {
+        $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g;
+        $a =~ s/[\n]/'\n'/g; # filenames with '\n' is quoted using \'
+    }
+    return wantarray ? @strings : "@strings";
+}
+
+sub shell_quote_empty {
+    # Inputs:
+    #   @strings = strings to be quoted
+    # Returns:
+    #   @quoted_strings = empty strings quoted as ''.
+    my @strings = shell_quote(@_);
+    for my $a (@strings) {
+	if($a eq "") {
+	    $a = "''";
+	}
+    }
+    return wantarray ? @strings : "@strings";
+}
+
+sub shell_quote_scalar {
+    # Quote the string so shell will not expand any special chars
+    # Inputs:
+    #   $string = string to be quoted
+    # Returns:
+    #   $shell_quoted = string quoted with \ as needed by the shell
+    my $a = $_[0];
+    if(defined $a) {
+	# $a =~ s/([\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\*\>\<\~\|\; \"\!\$\&\'\202-\377])/\\$1/g;
+	# This is 1% faster than the above
+	$a =~ s/[\002-\011\013-\032\\\#\?\`\(\)\{\}\[\]\*\>\<\~\|\; \"\!\$\&\'\202-\377]/\\$&/go;
+	$a =~ s/[\n]/'\n'/go; # filenames with '\n' is quoted using \'
+    }
+    return $a;
+}
+
+sub shell_quote_file {
+    # Quote the string so shell will not expand any special chars and prepend ./ if needed
+    # Input:
+    #   $filename = filename to be shell quoted
+    # Returns:
+    #   $quoted_filename = filename quoted with \ as needed by the shell and ./ if needed
+    my $a = shell_quote_scalar(shift);
+    if(defined $a) {
+	if($a =~ m:^/: or $a =~ m:^\./:) {
+	    # /abs/path or ./rel/path => skip
+	} else {
+	    # rel/path => ./rel/path
+	    $a = "./".$a;
+	}
+    }
+    return $a;
+}
+
+sub shellwords {
+    # Input:
+    #   $string = shell line
+    # Returns:
+    #   @shell_words = $string split into words as shell would do
+    $Global::use{"Text::ParseWords"} ||= eval "use Text::ParseWords; 1;";
+    return Text::ParseWords::shellwords(@_);
+}
+
+
+sub __FILEHANDLES__ {}
+
+
+sub save_stdin_stdout_stderr {
+    # Remember the original STDIN, STDOUT and STDERR
+    # and file descriptors opened by the shell (e.g. 3>/tmp/foo)
+    # Uses:
+    #   %Global::fd
+    #   $Global::original_stderr
+    #   $Global::original_stdin
+    # Returns: N/A
+
+    # Find file descriptors that are already opened (by the shell)
+    for my $fdno (1..61) {
+	# /dev/fd/62 and above are used by bash for <(cmd)
+	my $fh;
+	# 2-argument-open is used to be compatible with old perl 5.8.0
+	# bug #43570: Perl 5.8.0 creates 61 files
+	if(open($fh,">&=$fdno")) {
+	    $Global::fd{$fdno}=$fh;
+	}
+    }
+    open $Global::original_stderr, ">&", "STDERR" or
+	::die_bug("Can't dup STDERR: $!");
+    open $Global::original_stdin, "<&", "STDIN" or
+	::die_bug("Can't dup STDIN: $!");
+    $Global::is_terminal = (-t $Global::original_stderr) && !$ENV{'CIRCLECI'} && !$ENV{'TRAVIS'};
+}
+
+sub enough_file_handles {
+    # Check that we have enough filehandles available for starting
+    # another job
+    # Uses:
+    #   $opt::ungroup
+    #   %Global::fd
+    # Returns:
+    #   1 if ungrouped (thus not needing extra filehandles)
+    #   0 if too few filehandles
+    #   1 if enough filehandles
+    if(not $opt::ungroup) {
+        my %fh;
+        my $enough_filehandles = 1;
+  	# perl uses 7 filehandles for something?
+        # open3 uses 2 extra filehandles temporarily
+        # We need a filehandle for each redirected file descriptor
+	# (normally just STDOUT and STDERR)
+	for my $i (1..(7+2+keys %Global::fd)) {
+            $enough_filehandles &&= open($fh{$i}, "<", "/dev/null");
+        }
+        for (values %fh) { close $_; }
+        return $enough_filehandles;
+    } else {
+	# Ungrouped does not need extra file handles
+	return 1;
+    }
+}
+
+sub open_or_exit {
+    # Open a file name or exit if the file cannot be opened
+    # Inputs:
+    #   $file = filehandle or filename to open
+    # Uses:
+    #   $Global::stdin_in_opt_a
+    #   $Global::original_stdin
+    # Returns:
+    #   $fh = file handle to read-opened file
+    my $file = shift;
+    if($file eq "-") {
+	$Global::stdin_in_opt_a = 1;
+	return ($Global::original_stdin || *STDIN);
+    }
+    if(ref $file eq "GLOB") {
+	# This is an open filehandle
+	return $file;
+    }
+    my $fh = gensym;
+    if(not open($fh, "<", $file)) {
+        ::error("Cannot open input file `$file': No such file or directory.\n");
+        wait_and_exit(255);
+    }
+    return $fh;
+}
+
+sub __RUNNING_THE_JOBS_AND_PRINTING_PROGRESS__ {}
+
+# Variable structure:
+#
+#    $Global::running{$pid} = Pointer to Job-object
+#    @Global::virgin_jobs = Pointer to Job-object that have received no input
+#    $Global::host{$sshlogin} = Pointer to SSHLogin-object
+#    $Global::total_running = total number of running jobs
+#    $Global::total_started = total jobs started
+
+sub init_run_jobs {
+    $Global::total_running = 0;
+    $Global::total_started = 0;
+    $Global::tty_taken = 0;
+    $SIG{USR1} = \&list_running_jobs;
+    $SIG{USR2} = \&toggle_progress;
+    if(@opt::basefile) { setup_basefile(); }
+}
+
+{
+    my $last_time;
+    my %last_mtime;
+
+sub start_more_jobs {
+    # Run start_another_job() but only if:
+    #   * not $Global::start_no_new_jobs set
+    #   * not JobQueue is empty
+    #   * not load on server is too high
+    #   * not server swapping
+    #   * not too short time since last remote login
+    # Uses:
+    #   $Global::max_procs_file
+    #   $Global::max_procs_file_last_mod
+    #   %Global::host
+    #   @opt::sshloginfile
+    #   $Global::start_no_new_jobs
+    #   $opt::filter_hosts
+    #   $Global::JobQueue
+    #   $opt::pipe
+    #   $opt::load
+    #   $opt::noswap
+    #   $opt::delay
+    #   $Global::newest_starttime
+    # Returns:
+    #   $jobs_started = number of jobs started
+    my $jobs_started = 0;
+    my $jobs_started_this_round = 0;
+    if($Global::start_no_new_jobs) {
+	return $jobs_started;
+    }
+    if(time - ($last_time||0) > 1) {
+	# At most do this every second
+	$last_time = time;
+	if($Global::max_procs_file) {
+	    # --jobs filename
+	    my $mtime = (stat($Global::max_procs_file))[9];
+	    if($mtime > $Global::max_procs_file_last_mod) {
+		# file changed: Force re-computing max_jobs_running
+		$Global::max_procs_file_last_mod = $mtime;
+		for my $sshlogin (values %Global::host) {
+		    $sshlogin->set_max_jobs_running(undef);
+		}
+	    }
+	}
+	if(@opt::sshloginfile) {
+	    # Is --sshloginfile changed?
+	    for my $slf (@opt::sshloginfile) {
+		my $actual_file = expand_slf_shorthand($slf);
+		my $mtime = (stat($actual_file))[9];
+		$last_mtime{$actual_file} ||= $mtime;
+		if($mtime - $last_mtime{$actual_file} > 1) {
+		    ::debug("run","--sshloginfile $actual_file changed. reload\n");
+		    $last_mtime{$actual_file} = $mtime;
+		    # Reload $slf
+		    # Empty sshlogins
+		    @Global::sshlogin = ();
+		    for (values %Global::host) {
+			# Don't start new jobs on any host
+			# except the ones added back later
+			$_->set_max_jobs_running(0);
+		    }
+		    # This will set max_jobs_running on the SSHlogins
+		    read_sshloginfile($actual_file);
+		    parse_sshlogin();
+		    $opt::filter_hosts and filter_hosts();
+		    setup_basefile();
+		}
+	    }
+	}
+    }
+    do {
+	$jobs_started_this_round = 0;
+	# This will start 1 job on each --sshlogin (if possible)
+	# thus distribute the jobs on the --sshlogins round robin
+
+	for my $sshlogin (values %Global::host) {
+	    if($Global::JobQueue->empty() and not $opt::pipe) {
+		# No more jobs in the queue
+		last;
+	    }
+	    debug("run", "Running jobs before on ", $sshlogin->string(), ": ",
+		  $sshlogin->jobs_running(), "\n");
+	    if ($sshlogin->jobs_running() < $sshlogin->max_jobs_running()) {
+		if($opt::load and $sshlogin->loadavg_too_high()) {
+		    # The load is too high or unknown
+		    next;
+		}
+		if($opt::noswap and $sshlogin->swapping()) {
+		    # The server is swapping
+		    next;
+		}
+		if($sshlogin->too_fast_remote_login()) {
+		    # It has been too short since
+		    next;
+		}
+		if($opt::delay and $opt::delay > ::now() - $Global::newest_starttime) {
+		    # It has been too short since last start
+		    next;
+		}
+		debug("run", $sshlogin->string(), " has ", $sshlogin->jobs_running(),
+		      " out of ", $sshlogin->max_jobs_running(),
+		      " jobs running. Start another.\n");
+		if(start_another_job($sshlogin) == 0) {
+		    # No more jobs to start on this $sshlogin
+		    debug("run","No jobs started on ", $sshlogin->string(), "\n");
+		    next;
+		}
+		$sshlogin->inc_jobs_running();
+		$sshlogin->set_last_login_at(::now());
+		$jobs_started++;
+		$jobs_started_this_round++;
+	    }
+	    debug("run","Running jobs after on ", $sshlogin->string(), ": ",
+		  $sshlogin->jobs_running(), " of ",
+		  $sshlogin->max_jobs_running(), "\n");
+	}
+    } while($jobs_started_this_round);
+
+    return $jobs_started;
+}
+}
+
+{
+    my $no_more_file_handles_warned;
+
+sub start_another_job {
+    # If there are enough filehandles
+    #   and JobQueue not empty
+    #   and not $job is in joblog
+    # Then grab a job from Global::JobQueue,
+    #   start it at sshlogin
+    #   mark it as virgin_job
+    # Inputs:
+    #   $sshlogin = the SSHLogin to start the job on
+    # Uses:
+    #   $Global::JobQueue
+    #   $opt::pipe
+    #   $opt::results
+    #   $opt::resume
+    #   @Global::virgin_jobs
+    # Returns:
+    #   1 if another jobs was started
+    #   0 otherwise
+    my $sshlogin = shift;
+    # Do we have enough file handles to start another job?
+    if(enough_file_handles()) {
+        if($Global::JobQueue->empty() and not $opt::pipe) {
+            # No more commands to run
+	    debug("start", "Not starting: JobQueue empty\n");
+	    return 0;
+        } else {
+            my $job;
+	    # Skip jobs already in job log
+	    # Skip jobs already in results
+            do {
+		$job = get_job_with_sshlogin($sshlogin);
+		if(not defined $job) {
+		    # No command available for that sshlogin
+		    debug("start", "Not starting: no jobs available for ",
+			  $sshlogin->string(), "\n");
+		    return 0;
+		}
+	    } while ($job->is_already_in_joblog()
+		     or
+		     ($opt::results and $opt::resume and $job->is_already_in_results()));
+	    debug("start", "Command to run on '", $job->sshlogin()->string(), "': '",
+		  $job->replaced(),"'\n");
+            if($job->start()) {
+		if($opt::pipe) {
+		    push(@Global::virgin_jobs,$job);
+		}
+                debug("start", "Started as seq ", $job->seq(),
+		      " pid:", $job->pid(), "\n");
+                return 1;
+            } else {
+                # Not enough processes to run the job.
+		# Put it back on the queue.
+		$Global::JobQueue->unget($job);
+		# Count down the number of jobs to run for this SSHLogin.
+		my $max = $sshlogin->max_jobs_running();
+		if($max > 1) { $max--; } else {
+		    ::error("No more processes: cannot run a single job. Something is wrong.\n");
+		    ::wait_and_exit(255);
+		}
+		$sshlogin->set_max_jobs_running($max);
+		# Sleep up to 300 ms to give other processes time to die
+		::usleep(rand()*300);
+		::warning("No more processes: ",
+			  "Decreasing number of running jobs to $max. ",
+			  "Raising ulimit -u or /etc/security/limits.conf may help.\n");
+		return 0;
+            }
+        }
+    } else {
+        # No more file handles
+	$no_more_file_handles_warned++ or
+	    ::warning("No more file handles. ",
+		      "Raising ulimit -n or /etc/security/limits.conf may help.\n");
+        return 0;
+    }
+}
+}
+
+$opt::min_progress_interval = 0;
+
+sub init_progress {
+    # Uses:
+    #   $opt::bar
+    # Returns:
+    #   list of computers for progress output
+    $|=1;
+    if (not $Global::is_terminal) {
+      $opt::min_progress_interval = 30;
+    }
+    if($opt::bar) {
+	return("","");
+    }
+    my %progress = progress();
+    return ("\nComputers / CPU cores / Max jobs to run\n",
+            $progress{'workerlist'});
+}
+
+sub drain_job_queue {
+    # Uses:
+    #   $opt::progress
+    #   $Global::original_stderr
+    #   $Global::total_running
+    #   $Global::max_jobs_running
+    #   %Global::running
+    #   $Global::JobQueue
+    #   %Global::host
+    #   $Global::start_no_new_jobs
+    # Returns: N/A
+    if($opt::progress) {
+        print $Global::original_stderr init_progress();
+    }
+    my $last_header="";
+    my $sleep = 0.2;
+    my $last_left = 1000000000;
+    my $last_progress_time = 0;
+    my $ps_reported = 0;
+    do {
+        while($Global::total_running > 0) {
+            debug($Global::total_running, "==", scalar
+		  keys %Global::running," slots: ", $Global::max_jobs_running);
+	    if($opt::pipe) {
+		# When using --pipe sometimes file handles are not closed properly
+		for my $job (values %Global::running) {
+		    close $job->fh(0,"w");
+		}
+	    }
+            # When not connected to terminal, assume CI (e.g. CircleCI). In
+            # that case we want occasional progress output to prevent abort
+            # due to timeout with no output, but we also need to stop sending
+            # progress output if there has been no actual progress, so that
+            # the job can time out appropriately (CirecleCI: 10m) in case of
+            # a hung test. But without special output, it is extremely
+            # annoying to diagnose which test is hung, so we add that using
+            # `ps` below.
+            if($opt::progress and
+               ($Global::is_terminal or (time() - $last_progress_time) >= 30)) {
+                my %progress = progress();
+                if($last_header ne $progress{'header'}) {
+                    print $Global::original_stderr "\n", $progress{'header'}, "\n";
+                    $last_header = $progress{'header'};
+                }
+                if ($Global::is_terminal) {
+                    print $Global::original_stderr "\r",$progress{'status'};
+                }
+                if ($last_left > $Global::left) {
+                    if (not $Global::is_terminal) {
+                        print $Global::original_stderr $progress{'status'},"\n";
+                    }
+                    $last_progress_time = time();
+                    $ps_reported = 0;
+                } elsif (not $ps_reported and (time() - $last_progress_time) >= 60) {
+                    # No progress in at least 60 seconds: run ps
+                    print $Global::original_stderr "\n";
+                    my $script_dir = ::dirname($0);
+                    system("$script_dir/ps_with_stack || ps -wwf");
+                    $ps_reported = 1;
+                }
+                $last_left = $Global::left;
+                flush $Global::original_stderr;
+            }
+	    if($Global::total_running < $Global::max_jobs_running
+	       and not $Global::JobQueue->empty()) {
+		# These jobs may not be started because of loadavg
+		# or too little time between each ssh login.
+		if(start_more_jobs() > 0) {
+		    # Exponential back-on if jobs were started
+		    $sleep = $sleep/2+0.001;
+		}
+	    }
+            # Sometimes SIGCHLD is not registered, so force reaper
+	    $sleep = ::reap_usleep($sleep);
+        }
+        if(not $Global::JobQueue->empty()) {
+	    # These jobs may not be started:
+	    # * because there the --filter-hosts has removed all
+	    if(not %Global::host) {
+		::error("There are no hosts left to run on.\n");
+		::wait_and_exit(255);
+	    }
+	    # * because of loadavg
+	    # * because of too little time between each ssh login.
+            start_more_jobs();
+	    $sleep = ::reap_usleep($sleep);
+	    if($Global::max_jobs_running == 0) {
+		::warning("There are no job slots available. Increase --jobs.\n");
+	    }
+        }
+    } while ($Global::total_running > 0
+	     or
+	     not $Global::start_no_new_jobs and not $Global::JobQueue->empty());
+    if($opt::progress) {
+	my %progress = progress();
+	print $Global::original_stderr $opt::progress_sep, $progress{'status'}, "\n";
+	flush $Global::original_stderr;
+    }
+}
+
+sub toggle_progress {
+    # Turn on/off progress view
+    # Uses:
+    #   $opt::progress
+    #   $Global::original_stderr
+    # Returns: N/A
+    $opt::progress = not $opt::progress;
+    if($opt::progress) {
+        print $Global::original_stderr init_progress();
+    }
+}
+
+sub progress {
+    # Uses:
+    #   $opt::bar
+    #   $opt::eta
+    #   %Global::host
+    #   $Global::total_started
+    # Returns:
+    #   $workerlist = list of workers
+    #   $header = that will fit on the screen
+    #   $status = message that will fit on the screen
+    if($opt::bar) {
+	return ("workerlist" => "", "header" => "", "status" => bar());
+    }
+    my $eta = "";
+    my ($status,$header)=("","");
+    if($opt::eta) {
+      my($total, $completed, $left, $pctcomplete, $avgtime, $this_eta) =
+        compute_eta();
+      $eta = sprintf("ETA: %ds Left: %d AVG: %.2fs  ",
+        $this_eta, $left, $avgtime);
+      $Global::left = $left;
+    }
+    my $termcols = terminal_columns();
+    my @workers = sort keys %Global::host;
+    my %sshlogin = map { $_ eq ":" ? ($_=>"local") : ($_=>$_) } @workers;
+    my $workerno = 1;
+    my %workerno = map { ($_=>$workerno++) } @workers;
+    my $workerlist = "";
+    for my $w (@workers) {
+        $workerlist .=
+        $workerno{$w}.":".$sshlogin{$w} ." / ".
+            ($Global::host{$w}->ncpus() || "-")." / ".
+            $Global::host{$w}->max_jobs_running()."\n";
+    }
+    $status = "x"x($termcols+1);
+    if(length $status > $termcols) {
+        # sshlogin1:XX/XX/XX%/XX.Xs sshlogin2:XX/XX/XX%/XX.Xs sshlogin3:XX/XX/XX%/XX.Xs
+        $header = "Computer:jobs running/jobs completed/%of started jobs/Average seconds to complete";
+        $status = $eta .
+            join(" ",map
+                 {
+                     if($Global::total_started) {
+                         my $completed = ($Global::host{$_}->jobs_completed()||0);
+                         my $running = $Global::host{$_}->jobs_running();
+                         my $time = $completed ? (time-$^T)/($completed) : "0";
+                         sprintf("%s:%d/%d/%d%%/%.1fs ",
+                                 $sshlogin{$_}, $running, $completed,
+                                 ($running+$completed)*100
+                                 / $Global::total_started, $time);
+                     }
+                 } @workers);
+    }
+    if(length $status > $termcols) {
+        # 1:XX/XX/XX%/XX.Xs 2:XX/XX/XX%/XX.Xs 3:XX/XX/XX%/XX.Xs 4:XX/XX/XX%/XX.Xs
+        $header = "Computer:jobs running/jobs completed/%of started jobs";
+        $status = $eta .
+            join(" ",map
+                 {
+                     my $completed = ($Global::host{$_}->jobs_completed()||0);
+                     my $running = $Global::host{$_}->jobs_running();
+                     my $time = $completed ? (time-$^T)/($completed) : "0";
+                     sprintf("%s:%d/%d/%d%%/%.1fs ",
+                             $workerno{$_}, $running, $completed,
+                             ($running+$completed)*100
+                             / $Global::total_started, $time);
+                 } @workers);
+    }
+    if(length $status > $termcols) {
+        # sshlogin1:XX/XX/XX% sshlogin2:XX/XX/XX% sshlogin3:XX/XX/XX%
+        $header = "Computer:jobs running/jobs completed/%of started jobs";
+        $status = $eta .
+            join(" ",map
+                 { sprintf("%s:%d/%d/%d%%",
+                           $sshlogin{$_},
+                           $Global::host{$_}->jobs_running(),
+                           ($Global::host{$_}->jobs_completed()||0),
+                           ($Global::host{$_}->jobs_running()+
+                            ($Global::host{$_}->jobs_completed()||0))*100
+                           / $Global::total_started) }
+                 @workers);
+    }
+    if(length $status > $termcols) {
+        # 1:XX/XX/XX% 2:XX/XX/XX% 3:XX/XX/XX% 4:XX/XX/XX% 5:XX/XX/XX% 6:XX/XX/XX%
+        $header = "Computer:jobs running/jobs completed/%of started jobs";
+        $status = $eta .
+            join(" ",map
+                 { sprintf("%s:%d/%d/%d%%",
+                           $workerno{$_},
+                           $Global::host{$_}->jobs_running(),
+                           ($Global::host{$_}->jobs_completed()||0),
+                           ($Global::host{$_}->jobs_running()+
+                            ($Global::host{$_}->jobs_completed()||0))*100
+                           / $Global::total_started) }
+                 @workers);
+    }
+    if(length $status > $termcols) {
+        # sshlogin1:XX/XX/XX% sshlogin2:XX/XX/XX% sshlogin3:XX/XX sshlogin4:XX/XX
+        $header = "Computer:jobs running/jobs completed";
+        $status = $eta .
+            join(" ",map
+                       { sprintf("%s:%d/%d",
+                                 $sshlogin{$_}, $Global::host{$_}->jobs_running(),
+                                 ($Global::host{$_}->jobs_completed()||0)) }
+                       @workers);
+    }
+    if(length $status > $termcols) {
+        # sshlogin1:XX/XX sshlogin2:XX/XX sshlogin3:XX/XX sshlogin4:XX/XX
+        $header = "Computer:jobs running/jobs completed";
+        $status = $eta .
+            join(" ",map
+                       { sprintf("%s:%d/%d",
+                                 $sshlogin{$_}, $Global::host{$_}->jobs_running(),
+                                 ($Global::host{$_}->jobs_completed()||0)) }
+                       @workers);
+    }
+    if(length $status > $termcols) {
+        # 1:XX/XX 2:XX/XX 3:XX/XX 4:XX/XX 5:XX/XX 6:XX/XX
+        $header = "Computer:jobs running/jobs completed";
+        $status = $eta .
+            join(" ",map
+                       { sprintf("%s:%d/%d",
+                                 $workerno{$_}, $Global::host{$_}->jobs_running(),
+                                 ($Global::host{$_}->jobs_completed()||0)) }
+                       @workers);
+    }
+    if(length $status > $termcols) {
+        # sshlogin1:XX sshlogin2:XX sshlogin3:XX sshlogin4:XX sshlogin5:XX
+        $header = "Computer:jobs completed";
+        $status = $eta .
+            join(" ",map
+                       { sprintf("%s:%d",
+                                 $sshlogin{$_},
+                                 ($Global::host{$_}->jobs_completed()||0)) }
+                       @workers);
+    }
+    if(length $status > $termcols) {
+        # 1:XX 2:XX 3:XX 4:XX 5:XX 6:XX
+        $header = "Computer:jobs completed";
+        $status = $eta .
+            join(" ",map
+                       { sprintf("%s:%d",
+                                 $workerno{$_},
+                                 ($Global::host{$_}->jobs_completed()||0)) }
+                       @workers);
+    }
+    return ("workerlist" => $workerlist, "header" => $header, "status" => $status);
+}
+
+{
+    my ($total, $first_completed, $smoothed_avg_time);
+
+    sub compute_eta {
+	# Calculate important numbers for ETA
+	# Returns:
+	#   $total = number of jobs in total
+	#   $completed = number of jobs completed
+	#   $left = number of jobs left
+	#   $pctcomplete = percent of jobs completed
+	#   $avgtime = averaged time
+	#   $eta = smoothed eta
+	$total ||= $Global::JobQueue->total_jobs();
+	my $completed = 0;
+        for(values %Global::host) { $completed += $_->jobs_completed() }
+	my $left = $total - $completed;
+	if(not $completed) {
+	    return($total, $completed, $left, 0, 0, 0);
+	}
+	my $pctcomplete = $completed / $total;
+	$first_completed ||= time;
+	my $timepassed = (time - $first_completed);
+	my $avgtime = $timepassed / $completed;
+	$smoothed_avg_time ||= $avgtime;
+	# Smooth the eta so it does not jump wildly
+	$smoothed_avg_time = (1 - $pctcomplete) * $smoothed_avg_time +
+	    $pctcomplete * $avgtime;
+	my $eta = int($left * $smoothed_avg_time);
+	return($total, $completed, $left, $pctcomplete, $avgtime, $eta);
+    }
+}
+
+{
+    my ($rev,$reset);
+
+    sub bar {
+	# Return:
+	#   $status = bar with eta, completed jobs, arg and pct
+	$rev ||= "\033[7m";
+	$reset ||= "\033[0m";
+	my($total, $completed, $left, $pctcomplete, $avgtime, $eta) =
+	    compute_eta();
+	my $arg = $Global::newest_job ?
+	    $Global::newest_job->{'commandline'}->replace_placeholders(["\257<\257>"],0,0) : "";
+	# These chars mess up display in the terminal
+	$arg =~ tr/[\011-\016\033\302-\365]//d;
+	my $bar_text =
+	    sprintf("%d%% %d:%d=%ds %s",
+		    $pctcomplete*100, $completed, $left, $eta, $arg);
+	my $terminal_width = terminal_columns();
+	my $s = sprintf("%-${terminal_width}s",
+			substr($bar_text." "x$terminal_width,
+			       0,$terminal_width));
+	my $width = int($terminal_width * $pctcomplete);
+	substr($s,$width,0) = $reset;
+	my $zenity = sprintf("%-${terminal_width}s",
+			     substr("#   $eta sec $arg",
+				    0,$terminal_width));
+	$s = "\r" . $zenity . "\r" . $pctcomplete*100 . # Prefix with zenity header
+	    "\r" . $rev . $s . $reset;
+	return $s;
+    }
+}
+
+{
+    my ($columns,$last_column_time);
+
+    sub terminal_columns {
+	# Get the number of columns of the display
+	# Returns:
+	#   number of columns of the screen
+	if(not $columns or $last_column_time < time) {
+	    $last_column_time = time;
+	    $columns = $ENV{'COLUMNS'};
+	    if(not $columns) {
+		my $resize = qx{ resize 2>/dev/null };
+		$resize =~ /COLUMNS=(\d+);/ and do { $columns = $1; };
+	    }
+	    $columns ||= 80;
+	}
+	return $columns;
+    }
+}
+
+sub get_job_with_sshlogin {
+    # Returns:
+    #   next job object for $sshlogin if any available
+    my $sshlogin = shift;
+    my $job = undef;
+
+    if ($opt::hostgroups) {
+	my @other_hostgroup_jobs = ();
+
+        while($job = $Global::JobQueue->get()) {
+	    if($sshlogin->in_hostgroups($job->hostgroups())) {
+		# Found a job for this hostgroup
+		last;
+	    } else {
+		# This job was not in the hostgroups of $sshlogin
+                push @other_hostgroup_jobs, $job;
+            }
+        }
+	$Global::JobQueue->unget(@other_hostgroup_jobs);
+	if(not defined $job) {
+	    # No more jobs
+	    return undef;
+	}
+    } else {
+        $job = $Global::JobQueue->get();
+        if(not defined $job) {
+            # No more jobs
+	    ::debug("start", "No more jobs: JobQueue empty\n");
+            return undef;
+        }
+    }
+
+    my $clean_command = $job->replaced();
+    if($clean_command =~ /^\s*$/) {
+        # Do not run empty lines
+        if(not $Global::JobQueue->empty()) {
+            return get_job_with_sshlogin($sshlogin);
+        } else {
+            return undef;
+        }
+    }
+    $job->set_sshlogin($sshlogin);
+    if($opt::retries and $clean_command and
+       $job->failed_here()) {
+        # This command with these args failed for this sshlogin
+        my ($no_of_failed_sshlogins,$min_failures) = $job->min_failed();
+	# Only look at the Global::host that have > 0 jobslots
+        if($no_of_failed_sshlogins == grep { $_->max_jobs_running() > 0 } values %Global::host
+	   and $job->failed_here() == $min_failures) {
+            # It failed the same or more times on another host:
+            # run it on this host
+        } else {
+            # If it failed fewer times on another host:
+            # Find another job to run
+            my $nextjob;
+            if(not $Global::JobQueue->empty()) {
+		# This can potentially recurse for all args
+                no warnings 'recursion';
+                $nextjob = get_job_with_sshlogin($sshlogin);
+            }
+            # Push the command back on the queue
+            $Global::JobQueue->unget($job);
+            return $nextjob;
+        }
+    }
+    return $job;
+}
+
+sub __REMOTE_SSH__ {}
+
+sub read_sshloginfiles {
+    # Returns: N/A
+    for my $s (@_) {
+	read_sshloginfile(expand_slf_shorthand($s));
+    }
+}
+
+sub expand_slf_shorthand {
+    my $file = shift;
+    if($file eq "-") {
+	# skip: It is stdin
+    } elsif($file eq "..") {
+        $file = $ENV{'HOME'}."/.parallel/sshloginfile";
+    } elsif($file eq ".") {
+        $file = "/etc/parallel/sshloginfile";
+    } elsif(not -r $file) {
+	if(not -r $ENV{'HOME'}."/.parallel/".$file) {
+		# Try prepending ~/.parallel
+		::error("Cannot open $file.\n");
+		::wait_and_exit(255);
+	} else {
+	    $file = $ENV{'HOME'}."/.parallel/".$file;
+	}
+    }
+    return $file;
+}
+
+sub read_sshloginfile {
+    # Returns: N/A
+    my $file = shift;
+    my $close = 1;
+    my $in_fh;
+    ::debug("init","--slf ",$file);
+    if($file eq "-") {
+	$in_fh = *STDIN;
+	$close = 0;
+    } else {
+	if(not open($in_fh, "<", $file)) {
+	    # Try the filename
+	    ::error("Cannot open $file.\n");
+	    ::wait_and_exit(255);
+	}
+    }
+    while(<$in_fh>) {
+        chomp;
+        /^\s*#/ and next;
+        /^\s*$/ and next;
+        push @Global::sshlogin, $_;
+    }
+    if($close) {
+	close $in_fh;
+    }
+}
+
+sub parse_sshlogin {
+    # Returns: N/A
+    my @login;
+    if(not @Global::sshlogin) { @Global::sshlogin = (":"); }
+    for my $sshlogin (@Global::sshlogin) {
+        # Split up -S sshlogin,sshlogin
+        for my $s (split /,/, $sshlogin) {
+            if ($s eq ".." or $s eq "-") {
+		# This may add to @Global::sshlogin - possibly bug
+		read_sshloginfile(expand_slf_shorthand($s));
+            } else {
+                push (@login, $s);
+            }
+        }
+    }
+    $Global::minimal_command_line_length = 8_000_000;
+    my @allowed_hostgroups;
+    for my $ncpu_sshlogin_string (::uniq(@login)) {
+	my $sshlogin = SSHLogin->new($ncpu_sshlogin_string);
+	my $sshlogin_string = $sshlogin->string();
+	if($sshlogin_string eq "") {
+	    # This is an ssh group: -S @webservers
+	    push @allowed_hostgroups, $sshlogin->hostgroups();
+	    next;
+	}
+	if($Global::host{$sshlogin_string}) {
+	    # This sshlogin has already been added:
+	    # It is probably a host that has come back
+	    # Set the max_jobs_running back to the original
+	    debug("run","Already seen $sshlogin_string\n");
+	    if($sshlogin->{'ncpus'}) {
+		# If ncpus set by '#/' of the sshlogin, overwrite it:
+		$Global::host{$sshlogin_string}->set_ncpus($sshlogin->ncpus());
+	    }
+	    $Global::host{$sshlogin_string}->set_max_jobs_running(undef);
+	    next;
+	}
+	if($sshlogin_string eq ":") {
+	    $sshlogin->set_maxlength(Limits::Command::max_length());
+	} else {
+	    # If all chars needs to be quoted, every other character will be \
+	    $sshlogin->set_maxlength(int(Limits::Command::max_length()/2));
+	}
+	$Global::minimal_command_line_length =
+	    ::min($Global::minimal_command_line_length, $sshlogin->maxlength());
+        $Global::host{$sshlogin_string} = $sshlogin;
+    }
+    if(@allowed_hostgroups) {
+	# Remove hosts that are not in these groups
+	while (my ($string, $sshlogin) = each %Global::host) {
+	    if(not $sshlogin->in_hostgroups(@allowed_hostgroups)) {
+		delete $Global::host{$string};
+	    }
+	}
+    }
+
+    # debug("start", "sshlogin: ", my_dump(%Global::host),"\n");
+    if($opt::transfer or @opt::return or $opt::cleanup or @opt::basefile) {
+        if(not remote_hosts()) {
+            # There are no remote hosts
+            if(@opt::trc) {
+		::warning("--trc ignored as there are no remote --sshlogin.\n");
+            } elsif (defined $opt::transfer) {
+		::warning("--transfer ignored as there are no remote --sshlogin.\n");
+            } elsif (@opt::return) {
+                ::warning("--return ignored as there are no remote --sshlogin.\n");
+            } elsif (defined $opt::cleanup) {
+		::warning("--cleanup ignored as there are no remote --sshlogin.\n");
+            } elsif (@opt::basefile) {
+                ::warning("--basefile ignored as there are no remote --sshlogin.\n");
+            }
+        }
+    }
+}
+
+sub remote_hosts {
+    # Return sshlogins that are not ':'
+    # Returns:
+    #   list of sshlogins with ':' removed
+    return grep !/^:$/, keys %Global::host;
+}
+
+sub setup_basefile {
+    # Transfer basefiles to each $sshlogin
+    # This needs to be done before first jobs on $sshlogin is run
+    # Returns: N/A
+    my $cmd = "";
+    my $rsync_destdir;
+    my $workdir;
+    for my $sshlogin (values %Global::host) {
+      if($sshlogin->string() eq ":") { next }
+      for my $file (@opt::basefile) {
+	if($file !~ m:^/: and $opt::workdir eq "...") {
+	  ::error("Work dir '...' will not work with relative basefiles\n");
+	  ::wait_and_exit(255);
+	}
+	$workdir ||= Job->new("")->workdir();
+	$cmd .= $sshlogin->rsync_transfer_cmd($file,$workdir) . "&";
+      }
+    }
+    $cmd .= "wait;";
+    debug("init", "basesetup: $cmd\n");
+    print `$cmd`;
+}
+
+sub cleanup_basefile {
+    # Remove the basefiles transferred
+    # Returns: N/A
+    my $cmd="";
+    my $workdir = Job->new("")->workdir();
+    for my $sshlogin (values %Global::host) {
+        if($sshlogin->string() eq ":") { next }
+        for my $file (@opt::basefile) {
+	  $cmd .= $sshlogin->cleanup_cmd($file,$workdir)."&";
+        }
+    }
+    $cmd .= "wait;";
+    debug("init", "basecleanup: $cmd\n");
+    print `$cmd`;
+}
+
+sub filter_hosts {
+    my(@cores, @cpus, @maxline, @echo);
+    my $envvar = ::shell_quote_scalar($Global::envvar);
+    while (my ($host, $sshlogin) = each %Global::host) {
+	if($host eq ":") { next }
+	# The 'true' is used to get the $host out later
+	my $sshcmd = "true $host;" . $sshlogin->sshcommand()." ".$sshlogin->serverlogin();
+	push(@cores, $host."\t".$sshcmd." ".$envvar." parallel --number-of-cores\n\0");
+	push(@cpus, $host."\t".$sshcmd." ".$envvar." parallel --number-of-cpus\n\0");
+	push(@maxline, $host."\t".$sshcmd." ".$envvar." parallel --max-line-length-allowed\n\0");
+	# 'echo' is used to get the best possible value for an ssh login time
+	push(@echo, $host."\t".$sshcmd." echo\n\0");
+    }
+    my ($fh, $tmpfile) = ::tmpfile(SUFFIX => ".ssh");
+    print $fh @cores, @cpus, @maxline, @echo;
+    close $fh;
+    # --timeout 5: Setting up an SSH connection and running a simple
+    #              command should never take > 5 sec.
+    # --delay 0.1: If multiple sshlogins use the same proxy the delay
+    #              will make it less likely to overload the ssh daemon.
+    # --retries 3: If the ssh daemon it overloaded, try 3 times
+    # -s 16000: Half of the max line on UnixWare
+    my $cmd = "cat $tmpfile | $0 -j0 --timeout 5 -s 16000 --joblog - --plain --delay 0.1 --retries 3 --tag --tagstring {1} -0 --colsep '\t' -k eval {2} 2>/dev/null";
+    ::debug("init", $cmd, "\n");
+    open(my $host_fh, "-|", $cmd) || ::die_bug("parallel host check: $cmd");
+    my (%ncores, %ncpus, %time_to_login, %maxlen, %echo, @down_hosts);
+    my $prepend = "";
+    while(<$host_fh>) {
+	if(/\'$/) {
+	    # if last char = ' then append next line
+	    # This may be due to quoting of $Global::envvar
+	    $prepend .= $_;
+	    next;
+	}
+	$_ = $prepend . $_;
+	$prepend = "";
+	chomp;
+	my @col = split /\t/, $_;
+	if(defined $col[6]) {
+	    # This is a line from --joblog
+	    # seq host time spent sent received exit signal command
+	    # 2 : 1372607672.654 0.675 0 0 0 0 eval true\ m\;ssh\ m\ parallel\ --number-of-cores
+	    if($col[0] eq "Seq" and $col[1] eq "Host" and
+		    $col[2] eq "Starttime") {
+		# Header => skip
+		next;
+	    }
+	    # Get server from: eval true server\;
+	    $col[8] =~ /eval true..([^;]+).;/ or ::die_bug("col8 does not contain host: $col[8]");
+	    my $host = $1;
+	    $host =~ tr/\\//d;
+	    $Global::host{$host} or next;
+	    if($col[6] eq "255" or $col[7] eq "15") {
+		# exit == 255 or signal == 15: ssh failed
+		# Remove sshlogin
+		::debug("init", "--filtered $host\n");
+		push(@down_hosts, $host);
+		@down_hosts = uniq(@down_hosts);
+	    } elsif($col[6] eq "127") {
+		# signal == 127: parallel not installed remote
+		# Set ncpus and ncores = 1
+		::warning("Could not figure out ",
+			  "number of cpus on $host. Using 1.\n");
+		$ncores{$host} = 1;
+		$ncpus{$host} = 1;
+		$maxlen{$host} = Limits::Command::max_length();
+	    } elsif($col[0] =~ /^\d+$/ and $Global::host{$host}) {
+		# Remember how log it took to log in
+		# 2 : 1372607672.654 0.675 0 0 0 0 eval true\ m\;ssh\ m\ echo
+		$time_to_login{$host} = ::min($time_to_login{$host},$col[3]);
+	    } else {
+		::die_bug("host check unmatched long jobline: $_");
+	    }
+	} elsif($Global::host{$col[0]}) {
+	    # This output from --number-of-cores, --number-of-cpus,
+	    # --max-line-length-allowed
+	    # ncores: server       8
+	    # ncpus:  server       2
+	    # maxlen: server       131071
+	    if(not $ncores{$col[0]}) {
+		$ncores{$col[0]} = $col[1];
+	    } elsif(not $ncpus{$col[0]}) {
+		$ncpus{$col[0]} = $col[1];
+	    } elsif(not $maxlen{$col[0]}) {
+		$maxlen{$col[0]} = $col[1];
+	    } elsif(not $echo{$col[0]}) {
+		$echo{$col[0]} = $col[1];
+	    } elsif(m/perl: warning:|LANGUAGE =|LC_ALL =|LANG =|are supported and installed/) {
+		# Skip these:
+		# perl: warning: Setting locale failed.
+		# perl: warning: Please check that your locale settings:
+		#         LANGUAGE = (unset),
+		#         LC_ALL = (unset),
+		#         LANG = "en_US.UTF-8"
+		#     are supported and installed on your system.
+		# perl: warning: Falling back to the standard locale ("C").
+	    } else {
+		::die_bug("host check too many col0: $_");
+	    }
+	} else {
+	    ::die_bug("host check unmatched short jobline ($col[0]): $_");
+	}
+    }
+    close $host_fh;
+    $Global::debug or unlink $tmpfile;
+    delete @Global::host{@down_hosts};
+    @down_hosts and ::warning("Removed @down_hosts\n");
+    $Global::minimal_command_line_length = 8_000_000;
+    while (my ($sshlogin, $obj) = each %Global::host) {
+	if($sshlogin eq ":") { next }
+	$ncpus{$sshlogin} or ::die_bug("ncpus missing: ".$obj->serverlogin());
+	$ncores{$sshlogin} or ::die_bug("ncores missing: ".$obj->serverlogin());
+	$time_to_login{$sshlogin} or ::die_bug("time_to_login missing: ".$obj->serverlogin());
+	$maxlen{$sshlogin} or ::die_bug("maxlen missing: ".$obj->serverlogin());
+	if($opt::use_cpus_instead_of_cores) {
+	    $obj->set_ncpus($ncpus{$sshlogin});
+	} else {
+	    $obj->set_ncpus($ncores{$sshlogin});
+	}
+	$obj->set_time_to_login($time_to_login{$sshlogin});
+        $obj->set_maxlength($maxlen{$sshlogin});
+	$Global::minimal_command_line_length =
+	    ::min($Global::minimal_command_line_length,
+		  int($maxlen{$sshlogin}/2));
+	::debug("init", "Timing from -S:$sshlogin ncpus:",$ncpus{$sshlogin},
+		" ncores:", $ncores{$sshlogin},
+		" time_to_login:", $time_to_login{$sshlogin},
+		" maxlen:", $maxlen{$sshlogin},
+		" min_max_len:", $Global::minimal_command_line_length,"\n");
+    }
+}
+
+sub onall {
+    sub tmp_joblog {
+	my $joblog = shift;
+	if(not defined $joblog) {
+	    return undef;
+	}
+	my ($fh, $tmpfile) = ::tmpfile(SUFFIX => ".log");
+	close $fh;
+	return $tmpfile;
+    }
+    my @command = @_;
+    if($Global::quoting) {
+       @command = shell_quote_empty(@command);
+    }
+
+    # Copy all @fhlist into tempfiles
+    my @argfiles = ();
+    for my $fh (@fhlist) {
+	my ($outfh, $name) = ::tmpfile(SUFFIX => ".all", UNLINK => 1);
+	print $outfh (<$fh>);
+	close $outfh;
+	push @argfiles, $name;
+    }
+    if(@opt::basefile) { setup_basefile(); }
+    # for each sshlogin do:
+    # parallel -S $sshlogin $command :::: @argfiles
+    #
+    # Pass some of the options to the sub-parallels, not all of them as
+    # -P should only go to the first, and -S should not be copied at all.
+    my $options =
+	join(" ",
+	     ((defined $opt::jobs) ? "-P $opt::jobs" : ""),
+	     ((defined $opt::linebuffer) ? "--linebuffer" : ""),
+	     ((defined $opt::ungroup) ? "-u" : ""),
+	     ((defined $opt::group) ? "-g" : ""),
+	     ((defined $opt::keeporder) ? "--keeporder" : ""),
+	     ((defined $opt::D) ? "-D $opt::D" : ""),
+	     ((defined $opt::plain) ? "--plain" : ""),
+	     ((defined $opt::max_chars) ? "--max-chars ".$opt::max_chars : ""),
+	);
+    my $suboptions =
+	join(" ",
+	     ((defined $opt::ungroup) ? "-u" : ""),
+	     ((defined $opt::linebuffer) ? "--linebuffer" : ""),
+	     ((defined $opt::group) ? "-g" : ""),
+	     ((defined $opt::files) ? "--files" : ""),
+	     ((defined $opt::keeporder) ? "--keeporder" : ""),
+	     ((defined $opt::colsep) ? "--colsep ".shell_quote($opt::colsep) : ""),
+	     ((@opt::v) ? "-vv" : ""),
+	     ((defined $opt::D) ? "-D $opt::D" : ""),
+	     ((defined $opt::timeout) ? "--timeout ".$opt::timeout : ""),
+	     ((defined $opt::plain) ? "--plain" : ""),
+	     ((defined $opt::retries) ? "--retries ".$opt::retries : ""),
+	     ((defined $opt::max_chars) ? "--max-chars ".$opt::max_chars : ""),
+	     ((defined $opt::arg_sep) ? "--arg-sep ".$opt::arg_sep : ""),
+	     ((defined $opt::arg_file_sep) ? "--arg-file-sep ".$opt::arg_file_sep : ""),
+	     (@opt::env ? map { "--env ".::shell_quote_scalar($_) } @opt::env : ""),
+	);
+    ::debug("init", "| $0 $options\n");
+    open(my $parallel_fh, "|-", "$0 --no-notice -j0 $options") ||
+	::die_bug("This does not run GNU Parallel: $0 $options");
+    my @joblogs;
+    for my $host (sort keys %Global::host) {
+	my $sshlogin = $Global::host{$host};
+	my $joblog = tmp_joblog($opt::joblog);
+	if($joblog) {
+	    push @joblogs, $joblog;
+	    $joblog = "--joblog $joblog";
+	}
+	my $quad = $opt::arg_file_sep || "::::";
+	::debug("init", "$0 $suboptions -j1 $joblog ",
+	    ((defined $opt::tag) ?
+	     "--tagstring ".shell_quote_scalar($sshlogin->string()) : ""),
+	     " -S ", shell_quote_scalar($sshlogin->string())," ",
+	     join(" ",shell_quote(@command))," $quad @argfiles\n");
+	print $parallel_fh "$0 $suboptions -j1 $joblog ",
+	    ((defined $opt::tag) ?
+	     "--tagstring ".shell_quote_scalar($sshlogin->string()) : ""),
+	     " -S ", shell_quote_scalar($sshlogin->string())," ",
+	     join(" ",shell_quote(@command))," $quad @argfiles\n";
+    }
+    close $parallel_fh;
+    $Global::exitstatus = $? >> 8;
+    debug("init", "--onall exitvalue ", $?);
+    if(@opt::basefile) { cleanup_basefile(); }
+    $Global::debug or unlink(@argfiles);
+    my %seen;
+    for my $joblog (@joblogs) {
+	# Append to $joblog
+	open(my $fh, "<", $joblog) || ::die_bug("Cannot open tmp joblog $joblog");
+	# Skip first line (header);
+	<$fh>;
+	print $Global::joblog (<$fh>);
+	close $fh;
+	unlink($joblog);
+    }
+}
+
+sub __SIGNAL_HANDLING__ {}
+
+sub save_original_signal_handler {
+    # Remember the original signal handler
+    # Returns: N/A
+    $SIG{TERM} ||= sub { exit 0; }; # $SIG{TERM} is not set on Mac OS X
+    $SIG{INT} = sub { if($opt::tmux) { qx { tmux kill-session -t p$$ }; }
+		      unlink keys %Global::unlink; exit -1  };
+    $SIG{TERM} = sub { if($opt::tmux) { qx { tmux kill-session -t p$$ }; }
+		      unlink keys %Global::unlink; exit -1  };
+    %Global::original_sig = %SIG;
+    $SIG{TERM} = sub {}; # Dummy until jobs really start
+}
+
+sub list_running_jobs {
+    # Returns: N/A
+    for my $v (values %Global::running) {
+        print $Global::original_stderr "$Global::progname: ",$v->replaced(),"\n";
+    }
+}
+
+sub start_no_new_jobs {
+    # Returns: N/A
+    $SIG{TERM} = $Global::original_sig{TERM};
+    print $Global::original_stderr
+        ("$Global::progname: SIGTERM received. No new jobs will be started.\n",
+         "$Global::progname: Waiting for these ", scalar(keys %Global::running),
+         " jobs to finish. Send SIGTERM again to stop now.\n");
+    list_running_jobs();
+    $Global::start_no_new_jobs ||= 1;
+}
+
+sub reaper {
+    # A job finished.
+    # Print the output.
+    # Start another job
+    # Returns: N/A
+    my $stiff;
+    my $children_reaped = 0;
+    debug("run", "Reaper ");
+    while (($stiff = waitpid(-1, &WNOHANG)) > 0) {
+	$children_reaped++;
+        if($Global::sshmaster{$stiff}) {
+            # This is one of the ssh -M: ignore
+            next;
+        }
+        my $job = $Global::running{$stiff};
+	# '-a <(seq 10)' will give us a pid not in %Global::running
+        $job or next;
+        $job->set_exitstatus($? >> 8);
+        $job->set_exitsignal($? & 127);
+        debug("run", "died (", $job->exitstatus(), "): ", $job->seq());
+        $job->set_endtime(::now());
+        if($stiff == $Global::tty_taken) {
+            # The process that died had the tty => release it
+            $Global::tty_taken = 0;
+        }
+
+        if(not $job->should_be_retried()) {
+	    # The job is done
+	    # Free the jobslot
+	    push @Global::slots, $job->slot();
+	    if($opt::timeout) {
+		# Update average runtime for timeout
+		$Global::timeoutq->update_delta_time($job->runtime());
+	    }
+            # Force printing now if the job failed and we are going to exit
+            my $print_now = ($opt::halt_on_error and $opt::halt_on_error == 2
+			     and $job->exitstatus());
+            if($opt::keeporder and not $print_now) {
+                print_earlier_jobs($job);
+            } else {
+                $job->print();
+            }
+            if($job->exitstatus()) {
+		process_failed_job($job);
+	    }
+
+        }
+        my $sshlogin = $job->sshlogin();
+        $sshlogin->dec_jobs_running();
+        $sshlogin->inc_jobs_completed();
+        $Global::total_running--;
+        delete $Global::running{$stiff};
+	start_more_jobs();
+    }
+    debug("run", "done ");
+    return $children_reaped;
+}
+
+sub process_failed_job {
+    # The jobs had a exit status <> 0, so error
+    # Returns: N/A
+    my $job = shift;
+    $Global::exitstatus++;
+    $Global::total_failed++;
+    if($opt::halt_on_error) {
+	if($opt::halt_on_error == 1
+	   or
+	   ($opt::halt_on_error < 1 and $Global::total_failed > 3
+	    and
+	    $Global::total_failed / $Global::total_started > $opt::halt_on_error)) {
+	    # If halt on error == 1 or --halt 10%
+	    # we should gracefully exit
+	    print $Global::original_stderr
+		("$Global::progname: Starting no more jobs. ",
+		 "Waiting for ", scalar(keys %Global::running),
+		 " jobs to finish. This job failed:\n",
+		 $job->replaced(),"\n");
+	    $Global::start_no_new_jobs ||= 1;
+	    $Global::halt_on_error_exitstatus = $job->exitstatus();
+	} elsif($opt::halt_on_error == 2) {
+	    # If halt on error == 2 we should exit immediately
+	    print $Global::original_stderr
+		("$Global::progname: This job failed:\n",
+		 $job->replaced(),"\n");
+	    exit ($job->exitstatus());
+	}
+    }
+}
+
+{
+    my (%print_later,$job_end_sequence);
+
+    sub print_earlier_jobs {
+	# Print jobs completed earlier
+	# Returns: N/A
+	my $job = shift;
+	$print_later{$job->seq()} = $job;
+	$job_end_sequence ||= 1;
+	debug("run", "Looking for: $job_end_sequence ",
+	      "Current: ", $job->seq(), "\n");
+	for(my $j = $print_later{$job_end_sequence};
+	    $j or vec($Global::job_already_run,$job_end_sequence,1);
+	    $job_end_sequence++,
+	    $j = $print_later{$job_end_sequence}) {
+	    debug("run", "Found job end $job_end_sequence");
+	    if($j) {
+		$j->print();
+		delete $print_later{$job_end_sequence};
+	    }
+	}
+    }
+}
+
+sub __USAGE__ {}
+
+sub wait_and_exit {
+    # If we do not wait, we sometimes get segfault
+    # Returns: N/A
+    my $error = shift;
+    if($error) {
+	# Kill all without printing
+	for my $job (values %Global::running) {
+	    $job->kill("TERM");
+	    $job->kill("TERM");
+	}
+    }
+    for (keys %Global::unkilled_children) {
+        kill 9, $_;
+        waitpid($_,0);
+        delete $Global::unkilled_children{$_};
+    }
+    wait();
+    exit($error);
+}
+
+sub die_usage {
+    # Returns: N/A
+    usage();
+    wait_and_exit(255);
+}
+
+sub usage {
+    # Returns: N/A
+    print join
+	("\n",
+	 "Usage:",
+	 "",
+	 "$Global::progname [options] [command [arguments]] < list_of_arguments",
+	 "$Global::progname [options] [command [arguments]] (::: arguments|:::: argfile(s))...",
+	 "cat ... | $Global::progname --pipe [options] [command [arguments]]",
+	 "",
+	 "-j n            Run n jobs in parallel",
+	 "-k              Keep same order",
+	 "-X              Multiple arguments with context replace",
+	 "--colsep regexp Split input on regexp for positional replacements",
+	 "{} {.} {/} {/.} {#} {%} {= perl code =} Replacement strings",
+	 "{3} {3.} {3/} {3/.} {=3 perl code =}    Positional replacement strings",
+	 "With --plus:    {} = {+/}/{/} = {.}.{+.} = {+/}/{/.}.{+.} = {..}.{+..} =",
+	 "                {+/}/{/..}.{+..} = {...}.{+...} = {+/}/{/...}.{+...}",
+	 "",
+	 "-S sshlogin     Example: foo\@server.example.com",
+	 "--slf ..        Use ~/.parallel/sshloginfile as the list of sshlogins",
+	 "--trc {}.bar    Shorthand for --transfer --return {}.bar --cleanup",
+	 "--onall         Run the given command with argument on all sshlogins",
+	 "--nonall        Run the given command with no arguments on all sshlogins",
+	 "",
+	 "--pipe          Split stdin (standard input) to multiple jobs.",
+	 "--recend str    Record end separator for --pipe.",
+	 "--recstart str  Record start separator for --pipe.",
+	 "",
+	 "See 'man $Global::progname' for details",
+	 "",
+	 "When using programs that use GNU Parallel to process data for publication please cite:",
+	 "",
+	 "O. Tange (2011): GNU Parallel - The Command-Line Power Tool,",
+	 ";login: The USENIX Magazine, February 2011:42-47.",
+	 "",
+	 "Or you can get GNU Parallel without this requirement by paying 10000 EUR.",
+	 "");
+}
+
+
+sub citation_notice {
+    # if --no-notice or --plain: do nothing
+    # if stderr redirected: do nothing
+    # if ~/.parallel/will-cite: do nothing
+    # else: print citation notice to stderr
+    if($opt::no_notice
+       or
+       $opt::plain
+       or
+       not -t $Global::original_stderr
+       or
+       -e $ENV{'HOME'}."/.parallel/will-cite") {
+	# skip
+    } else {
+	print $Global::original_stderr
+	    ("When using programs that use GNU Parallel to process data for publication please cite:\n",
+	     "\n",
+	     "  O. Tange (2011): GNU Parallel - The Command-Line Power Tool,\n",
+	     "  ;login: The USENIX Magazine, February 2011:42-47.\n",
+	     "\n",
+	     "This helps funding further development; and it won't cost you a cent.\n",
+	     "Or you can get GNU Parallel without this requirement by paying 10000 EUR.\n",
+	     "\n",
+	     "To silence this citation notice run 'parallel --bibtex' once or use '--no-notice'.\n\n",
+	    );
+	flush $Global::original_stderr;
+    }
+}
+
+
+sub warning {
+    my @w = @_;
+    my $fh = $Global::original_stderr || *STDERR;
+    my $prog = $Global::progname || "parallel";
+    print $fh $prog, ": Warning: ", @w;
+}
+
+
+sub error {
+    my @w = @_;
+    my $fh = $Global::original_stderr || *STDERR;
+    my $prog = $Global::progname || "parallel";
+    print $fh $prog, ": Error: ", @w;
+}
+
+
+sub die_bug {
+    my $bugid = shift;
+    print STDERR
+	("$Global::progname: This should not happen. You have found a bug.\n",
+	 "Please contact <parallel\@gnu.org> and include:\n",
+	 "* The version number: $Global::version\n",
+	 "* The bugid: $bugid\n",
+	 "* The command line being run\n",
+	 "* The files being read (put the files on a webserver if they are big)\n",
+	 "\n",
+	 "If you get the error on smaller/fewer files, please include those instead.\n");
+    ::wait_and_exit(255);
+}
+
+sub version {
+    # Returns: N/A
+    if($opt::tollef and not $opt::gnu) {
+	print "WARNING: YOU ARE USING --tollef. IF THINGS ARE ACTING WEIRD USE --gnu.\n";
+    }
+    print join("\n",
+               "GNU $Global::progname $Global::version",
+               "Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014 Ole Tange and Free Software Foundation, Inc.",
+               "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>",
+               "This is free software: you are free to change and redistribute it.",
+               "GNU $Global::progname comes with no warranty.",
+               "",
+               "Web site: http://www.gnu.org/software/${Global::progname}\n",
+	       "When using programs that use GNU Parallel to process data for publication please cite:\n",
+	       "O. Tange (2011): GNU Parallel - The Command-Line Power Tool, ",
+	       ";login: The USENIX Magazine, February 2011:42-47.\n",
+	       "Or you can get GNU Parallel without this requirement by paying 10000 EUR.\n",
+        );
+}
+
+sub bibtex {
+    # Returns: N/A
+    if($opt::tollef and not $opt::gnu) {
+	print "WARNING: YOU ARE USING --tollef. IF THINGS ARE ACTING WEIRD USE --gnu.\n";
+    }
+    print join("\n",
+	       "When using programs that use GNU Parallel to process data for publication please cite:",
+	       "",
+               "\@article{Tange2011a,",
+	       " title = {GNU Parallel - The Command-Line Power Tool},",
+	       " author = {O. Tange},",
+	       " address = {Frederiksberg, Denmark},",
+	       " journal = {;login: The USENIX Magazine},",
+	       " month = {Feb},",
+	       " number = {1},",
+	       " volume = {36},",
+	       " url = {http://www.gnu.org/s/parallel},",
+	       " year = {2011},",
+	       " pages = {42-47}",
+	       "}",
+	       "",
+	       "(Feel free to use \\nocite{Tange2011a})",
+	       "",
+	       "This helps funding further development.",
+	       "",
+	       "Or you can get GNU Parallel without this requirement by paying 10000 EUR.",
+	       ""
+        );
+    while(not -e $ENV{'HOME'}."/.parallel/will-cite") {
+	print "\nType: 'will cite' and press enter.\n> ";
+	my $input = <STDIN>;
+	if($input =~ /will cite/i) {
+	    mkdir $ENV{'HOME'}."/.parallel";
+	    open (my $fh, ">", $ENV{'HOME'}."/.parallel/will-cite")
+		|| ::die_bug("Cannot write: ".$ENV{'HOME'}."/.parallel/will-cite");
+	    close $fh;
+	    print "\nThank you for your support. It is much appreciated. The citation\n",
+	    "notice is now silenced.\n";
+	}
+    }
+}
+
+sub show_limits {
+    # Returns: N/A
+    print("Maximal size of command: ",Limits::Command::real_max_length(),"\n",
+          "Maximal used size of command: ",Limits::Command::max_length(),"\n",
+          "\n",
+          "Execution of  will continue now, and it will try to read its input\n",
+          "and run commands; if this is not what you wanted to happen, please\n",
+          "press CTRL-D or CTRL-C\n");
+}
+
+sub __GENERIC_COMMON_FUNCTION__ {}
+
+sub uniq {
+    # Remove duplicates and return unique values
+    return keys %{{ map { $_ => 1 } @_ }};
+}
+
+sub min {
+    # Returns:
+    #   Minimum value of array
+    my $min;
+    for (@_) {
+        # Skip undefs
+        defined $_ or next;
+        defined $min or do { $min = $_; next; }; # Set $_ to the first non-undef
+        $min = ($min < $_) ? $min : $_;
+    }
+    return $min;
+}
+
+sub max {
+    # Returns:
+    #   Maximum value of array
+    my $max;
+    for (@_) {
+        # Skip undefs
+        defined $_ or next;
+        defined $max or do { $max = $_; next; }; # Set $_ to the first non-undef
+        $max = ($max > $_) ? $max : $_;
+    }
+    return $max;
+}
+
+sub sum {
+    # Returns:
+    #   Sum of values of array
+    my @args = @_;
+    my $sum = 0;
+    for (@args) {
+        # Skip undefs
+        $_ and do { $sum += $_; }
+    }
+    return $sum;
+}
+
+sub undef_as_zero {
+    my $a = shift;
+    return $a ? $a : 0;
+}
+
+sub undef_as_empty {
+    my $a = shift;
+    return $a ? $a : "";
+}
+
+{
+    my $hostname;
+    sub hostname {
+	if(not $hostname) {
+	    $hostname = `hostname`;
+	    chomp($hostname);
+	    $hostname ||= "nohostname";
+	}
+	return $hostname;
+    }
+}
+
+sub which {
+    # Input:
+    #   @programs = programs to find the path to
+    # Returns:
+    #   @full_path = full paths to @programs. Nothing if not found
+    my @which;
+    for my $prg (@_) {
+	push @which, map { $_."/".$prg } grep { -x $_."/".$prg } split(":",$ENV{'PATH'});
+    }
+    return @which;
+}
+
+{
+    my ($regexp,%fakename);
+
+    sub parent_shell {
+	# Input:
+	#   $pid = pid to see if (grand)*parent is a shell
+	# Returns:
+	#   $shellpath = path to shell - undef if no shell found
+	my $pid = shift;
+	if(not $regexp) {
+	    # All shells known to mankind
+	    #
+	    # ash bash csh dash fdsh fish fizsh ksh ksh93 mksh pdksh
+	    # posh rbash rush rzsh sash sh static-sh tcsh yash zsh
+	    my @shells = qw(ash bash csh dash fdsh fish fizsh ksh
+                            ksh93 mksh pdksh posh rbash rush rzsh
+                            sash sh static-sh tcsh yash zsh -sh -csh);
+	    # Can be formatted as:
+	    #   [sh]  -sh  sh  busybox sh
+	    #   /bin/sh /sbin/sh /opt/csw/sh
+	    # NOT: foo.sh sshd crash flush pdflush scosh fsflush ssh
+	    my $shell = "(?:".join("|",@shells).")";
+	    $regexp = '^((\[)('. $shell. ')(\])|(|\S+/|busybox )('. $shell. '))($| )';
+	    %fakename = (
+		# csh and tcsh disguise themselves as -sh/-csh
+		"-sh" => ["csh", "tcsh"],
+		"-csh" => ["tcsh", "csh"],
+		);
+	}
+	my ($children_of_ref, $parent_of_ref, $name_of_ref) = pid_table();
+	my $shellpath;
+	my $testpid = $pid;
+	while($testpid) {
+	    ::debug("init", "shell? ". $name_of_ref->{$testpid}."\n");
+	    if($name_of_ref->{$testpid} =~ /$regexp/o) {
+		::debug("init", "which ".($3||$6)." => ");
+		$shellpath = (which($3 || $6,@{$fakename{$3 || $6}}))[0];
+		::debug("init", "shell path $shellpath\n");
+		$shellpath and last;
+	    }
+	    $testpid = $parent_of_ref->{$testpid};
+	}
+	return $shellpath;
+    }
+}
+
+{
+    my %pid_parentpid_cmd;
+
+    sub pid_table {
+	# Returns:
+	#   %children_of = { pid -> children of pid }
+	#   %parent_of = { pid -> pid of parent }
+	#   %name_of = { pid -> commandname }
+
+       	if(not %pid_parentpid_cmd) {
+	    # Filter for SysV-style `ps`
+	    my $sysv = q( ps -ef | perl -ane '1..1 and /^(.*)CO?MM?A?N?D/ and $s=length $1;).
+		q(s/^.{$s}//; print "@F[1,2] $_"' );
+	    # BSD-style `ps`
+	    my $bsd = q(ps -o pid,ppid,command -ax);
+	    %pid_parentpid_cmd =
+	    (
+	     'aix' => $sysv,
+	     'cygwin' => $sysv,
+	     'msys' => $sysv,
+	     'dec_osf' => $sysv,
+	     'darwin' => $bsd,
+	     'dragonfly' => $bsd,
+	     'freebsd' => $bsd,
+	     'gnu' => $sysv,
+	     'hpux' => $sysv,
+	     'linux' => $sysv,
+	     'mirbsd' => $bsd,
+	     'netbsd' => $bsd,
+	     'nto' => $sysv,
+	     'openbsd' => $bsd,
+	     'solaris' => $sysv,
+	     'svr5' => $sysv,
+	    );
+	}
+	$pid_parentpid_cmd{$^O} or ::die_bug("pid_parentpid_cmd for $^O missing");
+
+	my (@pidtable,%parent_of,%children_of,%name_of);
+	# Table with pid -> children of pid
+	@pidtable = `$pid_parentpid_cmd{$^O}`;
+	my $p=$$;
+	for (@pidtable) {
+	    # must match: 24436 21224 busybox ash
+	    /(\S+)\s+(\S+)\s+(\S+.*)/ or ::die_bug("pidtable format: $_");
+	    $parent_of{$1} = $2;
+	    push @{$children_of{$2}}, $1;
+	    $name_of{$1} = $3;
+	}
+	return(\%children_of, \%parent_of, \%name_of);
+    }
+}
+
+sub reap_usleep {
+    # Reap dead children.
+    # If no dead children: Sleep specified amount with exponential backoff
+    # Input:
+    #   $ms = milliseconds to sleep
+    # Returns:
+    #   $ms/2+0.001 if children reaped
+    #   $ms*1.1 if no children reaped
+    my $ms = shift;
+    if(reaper()) {
+	# Sleep exponentially shorter (1/2^n) if a job finished
+	return $ms/2+0.001;
+    } else {
+	if($opt::timeout) {
+	    $Global::timeoutq->process_timeouts();
+	}
+	usleep($ms);
+	Job::exit_if_disk_full();
+	if($opt::linebuffer) {
+	    for my $job (values %Global::running) {
+		$job->print();
+	    }
+	}
+	# Sleep exponentially longer (1.1^n) if a job did not finish
+	# though at most 1000 ms.
+	return (($ms < 1000) ? ($ms * 1.1) : ($ms));
+    }
+}
+
+sub usleep {
+    # Sleep this many milliseconds.
+    # Input:
+    #   $ms = milliseconds to sleep
+    my $ms = shift;
+    ::debug(int($ms),"ms ");
+    select(undef, undef, undef, $ms/1000);
+}
+
+sub now {
+    # Returns time since epoch as in seconds with 3 decimals
+    # Uses:
+    #   @Global::use
+    # Returns:
+    #   $time = time now with millisecond accuracy
+    if(not $Global::use{"Time::HiRes"}) {
+	if(eval "use Time::HiRes qw ( time );") {
+	    eval "sub TimeHiRestime { return Time::HiRes::time };";
+	} else {
+	    eval "sub TimeHiRestime { return time() };";
+	}
+	$Global::use{"Time::HiRes"} = 1;
+    }
+
+    return (int(TimeHiRestime()*1000))/1000;
+}
+
+sub multiply_binary_prefix {
+    # Evalualte numbers with binary prefix
+    # Ki=2^10, Mi=2^20, Gi=2^30, Ti=2^40, Pi=2^50, Ei=2^70, Zi=2^80, Yi=2^80
+    # ki=2^10, mi=2^20, gi=2^30, ti=2^40, pi=2^50, ei=2^70, zi=2^80, yi=2^80
+    # K =2^10, M =2^20, G =2^30, T =2^40, P =2^50, E =2^70, Z =2^80, Y =2^80
+    # k =10^3, m =10^6, g =10^9, t=10^12, p=10^15, e=10^18, z=10^21, y=10^24
+    # 13G = 13*1024*1024*1024 = 13958643712
+    # Input:
+    #   $s = string with prefixes
+    # Returns:
+    #   $value = int with prefixes multiplied
+    my $s = shift;
+    $s =~ s/ki/*1024/gi;
+    $s =~ s/mi/*1024*1024/gi;
+    $s =~ s/gi/*1024*1024*1024/gi;
+    $s =~ s/ti/*1024*1024*1024*1024/gi;
+    $s =~ s/pi/*1024*1024*1024*1024*1024/gi;
+    $s =~ s/ei/*1024*1024*1024*1024*1024*1024/gi;
+    $s =~ s/zi/*1024*1024*1024*1024*1024*1024*1024/gi;
+    $s =~ s/yi/*1024*1024*1024*1024*1024*1024*1024*1024/gi;
+    $s =~ s/xi/*1024*1024*1024*1024*1024*1024*1024*1024*1024/gi;
+
+    $s =~ s/K/*1024/g;
+    $s =~ s/M/*1024*1024/g;
+    $s =~ s/G/*1024*1024*1024/g;
+    $s =~ s/T/*1024*1024*1024*1024/g;
+    $s =~ s/P/*1024*1024*1024*1024*1024/g;
+    $s =~ s/E/*1024*1024*1024*1024*1024*1024/g;
+    $s =~ s/Z/*1024*1024*1024*1024*1024*1024*1024/g;
+    $s =~ s/Y/*1024*1024*1024*1024*1024*1024*1024*1024/g;
+    $s =~ s/X/*1024*1024*1024*1024*1024*1024*1024*1024*1024/g;
+
+    $s =~ s/k/*1000/g;
+    $s =~ s/m/*1000*1000/g;
+    $s =~ s/g/*1000*1000*1000/g;
+    $s =~ s/t/*1000*1000*1000*1000/g;
+    $s =~ s/p/*1000*1000*1000*1000*1000/g;
+    $s =~ s/e/*1000*1000*1000*1000*1000*1000/g;
+    $s =~ s/z/*1000*1000*1000*1000*1000*1000*1000/g;
+    $s =~ s/y/*1000*1000*1000*1000*1000*1000*1000*1000/g;
+    $s =~ s/x/*1000*1000*1000*1000*1000*1000*1000*1000*1000/g;
+
+    $s = eval $s;
+    ::debug($s);
+    return $s;
+}
+
+sub tmpfile {
+    # Create tempfile as $TMPDIR/parXXXXX
+    # Returns:
+    #   $filename = file name created
+    return ::tempfile(DIR=>$ENV{'TMPDIR'}, TEMPLATE => 'parXXXXX', @_);
+}
+
+sub __DEBUGGING__ {}
+
+sub debug {
+    # Uses:
+    #   $Global::debug
+    #   %Global::fd
+    # Returns: N/A
+    $Global::debug or return;
+    @_ = grep { defined $_ ? $_ : "" } @_;
+    if($Global::debug eq "all" or $Global::debug eq $_[0]) {
+	if($Global::fd{1}) {
+	    # Original stdout was saved
+	    my $stdout = $Global::fd{1};
+	    print $stdout @_[1..$#_];
+	} else {
+	    print @_[1..$#_];
+	}
+    }
+}
+
+sub my_memory_usage {
+    # Returns:
+    #   memory usage if found
+    #   0 otherwise
+    use strict;
+    use FileHandle;
+
+    my $pid = $$;
+    if(-e "/proc/$pid/stat") {
+        my $fh = FileHandle->new("</proc/$pid/stat");
+
+        my $data = <$fh>;
+        chomp $data;
+        $fh->close;
+
+        my @procinfo = split(/\s+/,$data);
+
+        return undef_as_zero($procinfo[22]);
+    } else {
+        return 0;
+    }
+}
+
+sub my_size {
+    # Returns:
+    #   $size = size of object if Devel::Size is installed
+    #   -1 otherwise
+    my @size_this = (@_);
+    eval "use Devel::Size qw(size total_size)";
+    if ($@) {
+        return -1;
+    } else {
+        return total_size(@_);
+    }
+}
+
+sub my_dump {
+    # Returns:
+    #   ascii expression of object if Data::Dump(er) is installed
+    #   error code otherwise
+    my @dump_this = (@_);
+    eval "use Data::Dump qw(dump);";
+    if ($@) {
+        # Data::Dump not installed
+        eval "use Data::Dumper;";
+        if ($@) {
+            my $err =  "Neither Data::Dump nor Data::Dumper is installed\n".
+                "Not dumping output\n";
+            print $Global::original_stderr $err;
+            return $err;
+        } else {
+            return Dumper(@dump_this);
+        }
+    } else {
+	# Create a dummy Data::Dump:dump as Hans Schou sometimes has
+	# it undefined
+	eval "sub Data::Dump:dump {}";
+        eval "use Data::Dump qw(dump);";
+        return (Data::Dump::dump(@dump_this));
+    }
+}
+
+sub my_croak {
+    eval "use Carp; 1";
+    $Carp::Verbose = 1;
+    croak(@_);
+}
+
+sub my_carp {
+    eval "use Carp; 1";
+    $Carp::Verbose = 1;
+    carp(@_);
+}
+
+sub __OBJECT_ORIENTED_PARTS__ {}
+
+package SSHLogin;
+
+sub new {
+    my $class = shift;
+    my $sshlogin_string = shift;
+    my $ncpus;
+    my %hostgroups;
+    # SSHLogins can have these formats:
+    #   @grp+grp/ncpu//usr/bin/ssh user@server
+    #   ncpu//usr/bin/ssh user@server
+    #   /usr/bin/ssh user@server
+    #   user@server
+    #   ncpu/user@server
+    #   @grp+grp/user@server
+    if($sshlogin_string =~ s:^\@([^/]+)/?::) {
+        # Look for SSHLogin hostgroups
+        %hostgroups = map { $_ => 1 } split(/\+/, $1);
+    }
+    if ($sshlogin_string =~ s:^(\d+)/::) {
+        # Override default autodetected ncpus unless missing
+        $ncpus = $1;
+    }
+    my $string = $sshlogin_string;
+    # An SSHLogin is always in the hostgroup of its $string-name
+    $hostgroups{$string} = 1;
+    @Global::hostgroups{keys %hostgroups} = values %hostgroups;
+    my @unget = ();
+    my $no_slash_string = $string;
+    $no_slash_string =~ s/[^-a-z0-9:]/_/gi;
+    return bless {
+        'string' => $string,
+        'jobs_running' => 0,
+        'jobs_completed' => 0,
+        'maxlength' => undef,
+        'max_jobs_running' => undef,
+        'orig_max_jobs_running' => undef,
+        'ncpus' => $ncpus,
+        'hostgroups' => \%hostgroups,
+        'sshcommand' => undef,
+        'serverlogin' => undef,
+        'control_path_dir' => undef,
+        'control_path' => undef,
+	'time_to_login' => undef,
+	'last_login_at' => undef,
+        'loadavg_file' => $ENV{'HOME'} . "/.parallel/tmp/loadavg-" .
+            $no_slash_string,
+        'loadavg' => undef,
+	'last_loadavg_update' => 0,
+        'swap_activity_file' => $ENV{'HOME'} . "/.parallel/tmp/swap_activity-" .
+            $no_slash_string,
+        'swap_activity' => undef,
+    }, ref($class) || $class;
+}
+
+sub DESTROY {
+    my $self = shift;
+    # Remove temporary files if they are created.
+    unlink $self->{'loadavg_file'};
+    unlink $self->{'swap_activity_file'};
+}
+
+sub string {
+    my $self = shift;
+    return $self->{'string'};
+}
+
+sub jobs_running {
+    my $self = shift;
+
+    return ($self->{'jobs_running'} || "0");
+}
+
+sub inc_jobs_running {
+    my $self = shift;
+    $self->{'jobs_running'}++;
+}
+
+sub dec_jobs_running {
+    my $self = shift;
+    $self->{'jobs_running'}--;
+}
+
+sub set_maxlength {
+    my $self = shift;
+    $self->{'maxlength'} = shift;
+}
+
+sub maxlength {
+    my $self = shift;
+    return $self->{'maxlength'};
+}
+
+sub jobs_completed {
+    my $self = shift;
+    return $self->{'jobs_completed'};
+}
+
+sub in_hostgroups {
+    # Input:
+    #   @hostgroups = the hostgroups to look for
+    # Returns:
+    #   true if intersection of @hostgroups and the hostgroups of this
+    #        SSHLogin is non-empty
+    my $self = shift;
+    return grep { defined $self->{'hostgroups'}{$_} } @_;
+}
+
+sub hostgroups {
+    my $self = shift;
+    return keys %{$self->{'hostgroups'}};
+}
+
+sub inc_jobs_completed {
+    my $self = shift;
+    $self->{'jobs_completed'}++;
+}
+
+sub set_max_jobs_running {
+    my $self = shift;
+    if(defined $self->{'max_jobs_running'}) {
+        $Global::max_jobs_running -= $self->{'max_jobs_running'};
+    }
+    $self->{'max_jobs_running'} = shift;
+    if(defined $self->{'max_jobs_running'}) {
+        # max_jobs_running could be resat if -j is a changed file
+        $Global::max_jobs_running += $self->{'max_jobs_running'};
+    }
+    # Initialize orig to the first non-zero value that comes around
+    $self->{'orig_max_jobs_running'} ||= $self->{'max_jobs_running'};
+}
+
+sub swapping {
+    my $self = shift;
+    my $swapping = $self->swap_activity();
+    return (not defined $swapping or $swapping)
+}
+
+sub swap_activity {
+    # If the currently known swap activity is too old:
+    #   Recompute a new one in the background
+    # Returns:
+    #   last swap activity computed
+    my $self = shift;
+    # Should we update the swap_activity file?
+    my $update_swap_activity_file = 0;
+    if(-r $self->{'swap_activity_file'}) {
+        open(my $swap_fh, "<", $self->{'swap_activity_file'}) || ::die_bug("swap_activity_file-r");
+        my $swap_out = <$swap_fh>;
+        close $swap_fh;
+        if($swap_out =~ /^(\d+)$/) {
+            $self->{'swap_activity'} = $1;
+            ::debug("swap", "New swap_activity: ", $self->{'swap_activity'});
+        }
+        ::debug("swap", "Last update: ", $self->{'last_swap_activity_update'});
+        if(time - $self->{'last_swap_activity_update'} > 10) {
+            # last swap activity update was started 10 seconds ago
+            ::debug("swap", "Older than 10 sec: ", $self->{'swap_activity_file'});
+            $update_swap_activity_file = 1;
+        }
+    } else {
+        ::debug("swap", "No swap_activity file: ", $self->{'swap_activity_file'});
+        $self->{'swap_activity'} = undef;
+        $update_swap_activity_file = 1;
+    }
+    if($update_swap_activity_file) {
+        ::debug("swap", "Updating swap_activity file ", $self->{'swap_activity_file'});
+        $self->{'last_swap_activity_update'} = time;
+        -e $ENV{'HOME'}."/.parallel" or mkdir $ENV{'HOME'}."/.parallel";
+        -e $ENV{'HOME'}."/.parallel/tmp" or mkdir $ENV{'HOME'}."/.parallel/tmp";
+        my $swap_activity;
+	$swap_activity = swapactivityscript();
+        if($self->{'string'} ne ":") {
+            $swap_activity = $self->sshcommand() . " " . $self->serverlogin() . " " .
+		::shell_quote_scalar($swap_activity);
+        }
+        # Run swap_activity measuring.
+        # As the command can take long to run if run remote
+        # save it to a tmp file before moving it to the correct file
+        my $file = $self->{'swap_activity_file'};
+        my ($dummy_fh, $tmpfile) = ::tmpfile(SUFFIX => ".swp");
+	::debug("swap", "\n", $swap_activity, "\n");
+        qx{ ($swap_activity > $tmpfile && mv $tmpfile $file || rm $tmpfile) & };
+    }
+    return $self->{'swap_activity'};
+}
+
+{
+    my $script;
+
+    sub swapactivityscript {
+	# Returns:
+	#   shellscript for detecting swap activity
+	#
+	# arguments for vmstat are OS dependant
+	# swap_in and swap_out are in different columns depending on OS
+	#
+	if(not $script) {
+	    my %vmstat = (
+		# linux: $7*$8
+		# $ vmstat 1 2
+		# procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
+		#  r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
+		#  5  0  51208 1701096 198012 18857888    0    0    37   153   28   19 56 11 33  1
+		#  3  0  51208 1701288 198012 18857972    0    0     0     0 3638 10412 15  3 82  0
+		'linux' => ['vmstat 1 2 | tail -n1', '$7*$8'],
+
+		# solaris: $6*$7
+		# $ vmstat -S 1 2
+		#  kthr      memory            page            disk          faults      cpu
+		#  r b w   swap  free  si  so pi po fr de sr s3 s4 -- --   in   sy   cs us sy id
+		#  0 0 0 4628952 3208408 0  0  3  1  1  0  0 -0  2  0  0  263  613  246  1  2 97
+		#  0 0 0 4552504 3166360 0  0  0  0  0  0  0  0  0  0  0  246  213  240  1  1 98
+		'solaris' => ['vmstat -S 1 2 | tail -1', '$6*$7'],
+
+		# darwin (macosx): $21*$22
+		# $ vm_stat -c 2 1
+		# Mach Virtual Memory Statistics: (page size of 4096 bytes)
+		#     free   active   specul inactive throttle    wired  prgable   faults     copy    0fill reactive   purged file-backed anonymous cmprssed cmprssor  dcomprs   comprs  pageins  pageout  swapins swapouts
+		#   346306   829050    74871   606027        0   240231    90367  544858K 62343596  270837K    14178   415070      570102    939846      356      370      116      922  4019813        4        0        0
+		#   345740   830383    74875   606031        0   239234    90369     2696      359      553        0        0      570110    941179      356      370        0        0        0        0        0        0
+		'darwin' => ['vm_stat -c 2 1 | tail -n1', '$21*$22'],
+
+		# ultrix: $12*$13
+		# $ vmstat -S 1 2
+		#  procs      faults    cpu      memory              page             disk
+		#  r b w   in  sy  cs us sy id  avm  fre  si so  pi  po  fr  de  sr s0
+		#  1 0 0    4  23   2  3  0 97 7743 217k   0  0   0   0   0   0   0  0
+		#  1 0 0    6  40   8  0  1 99 7743 217k   0  0   3   0   0   0   0  0
+		'ultrix' => ['vmstat -S 1 2 | tail -1', '$12*$13'],
+
+		# aix: $6*$7
+		# $ vmstat 1 2
+		# System configuration: lcpu=1 mem=2048MB
+		#
+		# kthr    memory              page              faults        cpu
+		# ----- ----------- ------------------------ ------------ -----------
+		#  r  b   avm   fre  re  pi  po  fr   sr  cy  in   sy  cs us sy id wa
+		#  0  0 333933 241803   0   0   0   0    0   0  10  143  90  0  0 99  0
+		#  0  0 334125 241569   0   0   0   0    0   0  37 5368 184  0  9 86  5
+		'aix' => ['vmstat 1 2 | tail -n1', '$6*$7'],
+
+		# freebsd: $8*$9
+		# $ vmstat -H 1 2
+		#  procs      memory      page                    disks     faults         cpu
+		#  r b w     avm    fre   flt  re  pi  po    fr  sr ad0 ad1   in   sy   cs us sy id
+		#  1 0 0  596716   19560    32   0   0   0    33   8   0   0   11  220  277  0  0 99
+		#  0 0 0  596716   19560     2   0   0   0     0   0   0   0   11  144  263  0  1 99
+		'freebsd' => ['vmstat -H 1 2 | tail -n1', '$8*$9'],
+
+		# mirbsd: $8*$9
+		# $ vmstat 1 2
+		#  procs   memory        page                    disks     traps         cpu
+		#  r b w    avm    fre   flt  re  pi  po  fr  sr wd0 cd0  int   sys   cs us sy id
+		#  0 0 0  25776 164968    34   0   0   0   0   0   0   0  230   259   38  4  0 96
+		#  0 0 0  25776 164968    24   0   0   0   0   0   0   0  237   275   37  0  0 100
+		'mirbsd' => ['vmstat 1 2 | tail -n1', '$8*$9'],
+
+		# netbsd: $7*$8
+		# $ vmstat 1 2
+		#  procs    memory      page                       disks   faults      cpu
+		#  r b      avm    fre  flt  re  pi   po   fr   sr w0 w1   in   sy  cs us sy id
+		#  0 0   138452   6012   54   0   0    0    1    2  3  0    4  100  23  0  0 100
+		#  0 0   138456   6008    1   0   0    0    0    0  0  0    7   26  19  0 0 100
+		'netbsd' => ['vmstat 1 2 | tail -n1', '$7*$8'],
+
+		# openbsd: $8*$9
+		# $ vmstat 1 2
+		#  procs    memory       page                    disks    traps          cpu
+		#  r b w    avm     fre  flt  re  pi  po  fr  sr wd0 wd1  int   sys   cs us sy id
+		#  0 0 0  76596  109944   73   0   0   0   0   0   0   1    5   259   22  0  1 99
+		#  0 0 0  76604  109936   24   0   0   0   0   0   0   0    7   114   20  0  1 99
+		'openbsd' => ['vmstat 1 2 | tail -n1', '$8*$9'],
+
+		# hpux: $8*$9
+		# $ vmstat 1 2
+		#          procs           memory                   page                              faults       cpu
+		#     r     b     w      avm    free   re   at    pi   po    fr   de    sr     in     sy    cs  us sy id
+		#     1     0     0   247211  216476    4    1     0    0     0    0     0    102  73005    54   6 11 83
+		#     1     0     0   247211  216421   43    9     0    0     0    0     0    144   1675    96  25269512791222387000 25269512791222387000 105
+		'hpux' => ['vmstat 1 2 | tail -n1', '$8*$9'],
+
+		# dec_osf (tru64): $11*$12
+		# $ vmstat  1 2
+		# Virtual Memory Statistics: (pagesize = 8192)
+		#   procs      memory        pages                            intr       cpu
+		#   r   w   u  act free wire fault  cow zero react  pin pout  in  sy  cs us sy id
+		#   3 181  36  51K 1895 8696  348M  59M 122M   259  79M    0   5 218 302  4  1 94
+		#   3 181  36  51K 1893 8696     3   15   21     0   28    0   4  81 321  1  1 98
+		'dec_osf' => ['vmstat 1 2 | tail -n1', '$11*$12'],
+
+		# gnu (hurd): $7*$8
+		# $ vmstat -k 1 2
+		# (pagesize: 4, size: 512288, swap size: 894972)
+		#   free   actv  inact  wired   zeroed  react    pgins   pgouts  pfaults  cowpfs hrat    caobj  cache swfree
+		# 371940  30844  89228  20276   298348      0    48192    19016   756105   99808  98%      876  20628 894972
+		# 371940  30844  89228  20276       +0     +0       +0       +0      +42      +2  98%      876  20628 894972
+		'gnu' => ['vmstat -k 1 2 | tail -n1', '$7*$8'],
+
+		# -nto (qnx has no swap)
+		#-irix
+		#-svr5 (scosysv)
+		);
+	    my $perlscript = "";
+	    for my $os (keys %vmstat) {
+		#q[ { vmstat 1 2 2> /dev/null || vmstat -c 1 2; } | ].
+		#   q[ awk 'NR!=4{next} NF==17||NF==16{print $7*$8} NF==22{print $21*$22} {exit}' ];
+		$vmstat{$os}[1] =~ s/\$/\\\\\\\$/g; # $ => \\\$
+		$perlscript .= 'if($^O eq "'.$os.'") { print `'.$vmstat{$os}[0].' | awk "{print ' .
+		    $vmstat{$os}[1] . '}"` }';
+	    }
+	    $perlscript = "perl -e " . ::shell_quote_scalar($perlscript);
+	    $script = $Global::envvar. " " .$perlscript;
+	}
+	return $script;
+    }
+}
+
+sub too_fast_remote_login {
+    my $self = shift;
+    if($self->{'last_login_at'} and $self->{'time_to_login'}) {
+	# sshd normally allows 10 simultaneous logins
+	# A login takes time_to_login
+	# So time_to_login/5 should be safe
+	# If now <= last_login + time_to_login/5: Then it is too soon.
+	my $too_fast = (::now() <= $self->{'last_login_at'}
+			+ $self->{'time_to_login'}/5);
+	::debug("run", "Too fast? $too_fast ");
+	return $too_fast;
+    } else {
+	# No logins so far (or time_to_login not computed): it is not too fast
+	return 0;
+    }
+}
+
+sub last_login_at {
+    my $self = shift;
+    return $self->{'last_login_at'};
+}
+
+sub set_last_login_at {
+    my $self = shift;
+    $self->{'last_login_at'} = shift;
+}
+
+sub loadavg_too_high {
+    my $self = shift;
+    my $loadavg = $self->loadavg();
+    return (not defined $loadavg or
+            $loadavg > $self->max_loadavg());
+}
+
+sub loadavg {
+    # If the currently know loadavg is too old:
+    #   Recompute a new one in the background
+    # The load average is computed as the number of processes waiting for disk
+    # or CPU right now. So it is the server load this instant and not averaged over
+    # several minutes. This is needed so GNU Parallel will at most start one job
+    # that will push the load over the limit.
+    #
+    # Returns:
+    #   $last_loadavg = last load average computed (undef if none)
+    my $self = shift;
+    # Should we update the loadavg file?
+    my $update_loadavg_file = 0;
+    if(open(my $load_fh, "<", $self->{'loadavg_file'})) {
+	local $/ = undef;
+        my $load_out = <$load_fh>;
+        close $load_fh;
+	my $load =()= ($load_out=~/(^[DR]....[^\[])/gm);
+        if($load > 0) {
+	    # load is overestimated by 1
+            $self->{'loadavg'} = $load - 1;
+            ::debug("load", "New loadavg: ", $self->{'loadavg'});
+        } else {
+	    ::die_bug("loadavg_invalid_content: $load_out");
+	}
+        ::debug("load", "Last update: ", $self->{'last_loadavg_update'});
+        if(time - $self->{'last_loadavg_update'} > 10) {
+            # last loadavg was started 10 seconds ago
+            ::debug("load", time - $self->{'last_loadavg_update'}, " secs old: ",
+		    $self->{'loadavg_file'});
+            $update_loadavg_file = 1;
+        }
+    } else {
+        ::debug("load", "No loadavg file: ", $self->{'loadavg_file'});
+        $self->{'loadavg'} = undef;
+        $update_loadavg_file = 1;
+    }
+    if($update_loadavg_file) {
+        ::debug("load", "Updating loadavg file", $self->{'loadavg_file'}, "\n");
+        $self->{'last_loadavg_update'} = time;
+        -e $ENV{'HOME'}."/.parallel" or mkdir $ENV{'HOME'}."/.parallel";
+        -e $ENV{'HOME'}."/.parallel/tmp" or mkdir $ENV{'HOME'}."/.parallel/tmp";
+        my $cmd = "";
+        if($self->{'string'} ne ":") {
+	    $cmd = $self->sshcommand() . " " . $self->serverlogin() . " ";
+	}
+	# TODO Is is called 'ps ax -o state,command' on other platforms?
+	$cmd .= "ps ax -o state,command";
+        # As the command can take long to run if run remote
+        # save it to a tmp file before moving it to the correct file
+        my $file = $self->{'loadavg_file'};
+        my ($dummy_fh, $tmpfile) = ::tmpfile(SUFFIX => ".loa");
+        qx{ ($cmd > $tmpfile && mv $tmpfile $file || rm $tmpfile) & };
+    }
+    return $self->{'loadavg'};
+}
+
+sub max_loadavg {
+    my $self = shift;
+    # If --load is a file it might be changed
+    if($Global::max_load_file) {
+	my $mtime = (stat($Global::max_load_file))[9];
+	if($mtime > $Global::max_load_file_last_mod) {
+	    $Global::max_load_file_last_mod = $mtime;
+	    for my $sshlogin (values %Global::host) {
+		$sshlogin->set_max_loadavg(undef);
+	    }
+	}
+    }
+    if(not defined $self->{'max_loadavg'}) {
+        $self->{'max_loadavg'} =
+            $self->compute_max_loadavg($opt::load);
+    }
+    ::debug("load", "max_loadavg: ", $self->string(), " ", $self->{'max_loadavg'});
+    return $self->{'max_loadavg'};
+}
+
+sub set_max_loadavg {
+    my $self = shift;
+    $self->{'max_loadavg'} = shift;
+}
+
+sub compute_max_loadavg {
+    # Parse the max loadaverage that the user asked for using --load
+    # Returns:
+    #   max loadaverage
+    my $self = shift;
+    my $loadspec = shift;
+    my $load;
+    if(defined $loadspec) {
+        if($loadspec =~ /^\+(\d+)$/) {
+            # E.g. --load +2
+            my $j = $1;
+            $load =
+                $self->ncpus() + $j;
+        } elsif ($loadspec =~ /^-(\d+)$/) {
+            # E.g. --load -2
+            my $j = $1;
+            $load =
+                $self->ncpus() - $j;
+        } elsif ($loadspec =~ /^(\d+)\%$/) {
+            my $j = $1;
+            $load =
+                $self->ncpus() * $j / 100;
+        } elsif ($loadspec =~ /^(\d+(\.\d+)?)$/) {
+            $load = $1;
+        } elsif (-f $loadspec) {
+            $Global::max_load_file = $loadspec;
+            $Global::max_load_file_last_mod = (stat($Global::max_load_file))[9];
+            if(open(my $in_fh, "<", $Global::max_load_file)) {
+                my $opt_load_file = join("",<$in_fh>);
+                close $in_fh;
+                $load = $self->compute_max_loadavg($opt_load_file);
+            } else {
+                print $Global::original_stderr "Cannot open $loadspec\n";
+                ::wait_and_exit(255);
+            }
+        } else {
+            print $Global::original_stderr "Parsing of --load failed\n";
+            ::die_usage();
+        }
+        if($load < 0.01) {
+            $load = 0.01;
+        }
+    }
+    return $load;
+}
+
+sub time_to_login {
+    my $self = shift;
+    return $self->{'time_to_login'};
+}
+
+sub set_time_to_login {
+    my $self = shift;
+    $self->{'time_to_login'} = shift;
+}
+
+sub max_jobs_running {
+    my $self = shift;
+    if(not defined $self->{'max_jobs_running'}) {
+        my $nproc = $self->compute_number_of_processes($opt::jobs);
+        $self->set_max_jobs_running($nproc);
+    }
+    return $self->{'max_jobs_running'};
+}
+
+sub orig_max_jobs_running {
+    my $self = shift;
+    return $self->{'orig_max_jobs_running'};
+}
+
+sub compute_number_of_processes {
+    # Number of processes wanted and limited by system resources
+    # Returns:
+    #   Number of processes
+    my $self = shift;
+    my $opt_P = shift;
+    my $wanted_processes = $self->user_requested_processes($opt_P);
+    if(not defined $wanted_processes) {
+        $wanted_processes = $Global::default_simultaneous_sshlogins;
+    }
+    ::debug("load", "Wanted procs: $wanted_processes\n");
+    my $system_limit =
+        $self->processes_available_by_system_limit($wanted_processes);
+    ::debug("load", "Limited to procs: $system_limit\n");
+    return $system_limit;
+}
+
+sub processes_available_by_system_limit {
+    # If the wanted number of processes is bigger than the system limits:
+    # Limit them to the system limits
+    # Limits are: File handles, number of input lines, processes,
+    # and taking > 1 second to spawn 10 extra processes
+    # Returns:
+    #   Number of processes
+    my $self = shift;
+    my $wanted_processes = shift;
+
+    my $system_limit = 0;
+    my @jobs = ();
+    my $job;
+    my @args = ();
+    my $arg;
+    my $more_filehandles = 1;
+    my $max_system_proc_reached = 0;
+    my $slow_spawining_warning_printed = 0;
+    my $time = time;
+    my %fh;
+    my @children;
+
+    # Reserve filehandles
+    # perl uses 7 filehandles for something?
+    # parallel uses 1 for memory_usage
+    # parallel uses 4 for ?
+    for my $i (1..12) {
+        open($fh{"init-$i"}, "<", "/dev/null");
+    }
+
+    for(1..2) {
+        # System process limit
+        my $child;
+        if($child = fork()) {
+            push (@children,$child);
+            $Global::unkilled_children{$child} = 1;
+        } elsif(defined $child) {
+            # The child takes one process slot
+            # It will be killed later
+            $SIG{TERM} = $Global::original_sig{TERM};
+            sleep 10000000;
+            exit(0);
+        } else {
+            $max_system_proc_reached = 1;
+        }
+    }
+    my $count_jobs_already_read = $Global::JobQueue->next_seq();
+    my $wait_time_for_getting_args = 0;
+    my $start_time = time;
+    while(1) {
+        $system_limit >= $wanted_processes and last;
+        not $more_filehandles and last;
+        $max_system_proc_reached and last;
+	my $before_getting_arg = time;
+        if($Global::semaphore or $opt::pipe) {
+	    # Skip: No need to get args
+        } elsif(defined $opt::retries and $count_jobs_already_read) {
+            # For retries we may need to run all jobs on this sshlogin
+            # so include the already read jobs for this sshlogin
+            $count_jobs_already_read--;
+        } else {
+            if($opt::X or $opt::m) {
+                # The arguments may have to be re-spread over several jobslots
+                # So pessimistically only read one arg per jobslot
+                # instead of a full commandline
+                if($Global::JobQueue->{'commandlinequeue'}->{'arg_queue'}->empty()) {
+		    if($Global::JobQueue->empty()) {
+			last;
+		    } else {
+			($job) = $Global::JobQueue->get();
+			push(@jobs, $job);
+		    }
+		} else {
+		    ($arg) = $Global::JobQueue->{'commandlinequeue'}->{'arg_queue'}->get();
+		    push(@args, $arg);
+		}
+            } else {
+                # If there are no more command lines, then we have a process
+                # per command line, so no need to go further
+                $Global::JobQueue->empty() and last;
+                ($job) = $Global::JobQueue->get();
+                push(@jobs, $job);
+	    }
+        }
+	$wait_time_for_getting_args += time - $before_getting_arg;
+        $system_limit++;
+
+        # Every simultaneous process uses 2 filehandles when grouping
+        # Every simultaneous process uses 2 filehandles when compressing
+        $more_filehandles = open($fh{$system_limit*10}, "<", "/dev/null")
+            && open($fh{$system_limit*10+2}, "<", "/dev/null")
+            && open($fh{$system_limit*10+3}, "<", "/dev/null")
+            && open($fh{$system_limit*10+4}, "<", "/dev/null");
+
+        # System process limit
+        my $child;
+        if($child = fork()) {
+            push (@children,$child);
+            $Global::unkilled_children{$child} = 1;
+        } elsif(defined $child) {
+            # The child takes one process slot
+            # It will be killed later
+            $SIG{TERM} = $Global::original_sig{TERM};
+            sleep 10000000;
+            exit(0);
+        } else {
+            $max_system_proc_reached = 1;
+        }
+	my $forktime = time - $time - $wait_time_for_getting_args;
+        ::debug("run", "Time to fork $system_limit procs: $wait_time_for_getting_args ",
+		$forktime,
+		" (processes so far: ", $system_limit,")\n");
+        if($system_limit > 10 and
+	   $forktime > 1 and
+	   $forktime > $system_limit * 0.01
+	   and not $slow_spawining_warning_printed) {
+            # It took more than 0.01 second to fork a processes on avg.
+            # Give the user a warning. He can press Ctrl-C if this
+            # sucks.
+            print $Global::original_stderr
+                ("parallel: Warning: Starting $system_limit processes took > $forktime sec.\n",
+                 "Consider adjusting -j. Press CTRL-C to stop.\n");
+            $slow_spawining_warning_printed = 1;
+        }
+    }
+    # Cleanup: Close the files
+    for (values %fh) { close $_ }
+    # Cleanup: Kill the children
+    for my $pid (@children) {
+        kill 9, $pid;
+        waitpid($pid,0);
+        delete $Global::unkilled_children{$pid};
+    }
+    # Cleanup: Unget the command_lines or the @args
+    $Global::JobQueue->{'commandlinequeue'}->{'arg_queue'}->unget(@args);
+    $Global::JobQueue->unget(@jobs);
+    if($system_limit < $wanted_processes) {
+	# The system_limit is less than the wanted_processes
+	if($system_limit < 1 and not $Global::JobQueue->empty()) {
+	    ::warning("Cannot spawn any jobs. Raising ulimit -u or /etc/security/limits.conf\n",
+		      "or /proc/sys/kernel/pid_max may help.\n");
+	    ::wait_and_exit(255);
+	}
+	if(not $more_filehandles) {
+	    ::warning("Only enough file handles to run ", $system_limit, " jobs in parallel.\n",
+		      "Running 'parallel -j0 -N", $system_limit, " --pipe parallel -j0' or ",
+		      "raising ulimit -n or /etc/security/limits.conf may help.\n");
+	}
+	if($max_system_proc_reached) {
+	    ::warning("Only enough available processes to run ", $system_limit,
+		      " jobs in parallel. Raising ulimit -u or /etc/security/limits.conf\n",
+		      "or /proc/sys/kernel/pid_max may help.\n");
+	}
+    }
+    if($] == 5.008008 and $system_limit > 1000) {
+	# https://savannah.gnu.org/bugs/?36942
+	$system_limit = 1000;
+    }
+    if($Global::JobQueue->empty()) {
+	$system_limit ||= 1;
+    }
+    if($self->string() ne ":" and
+       $system_limit > $Global::default_simultaneous_sshlogins) {
+        $system_limit =
+            $self->simultaneous_sshlogin_limit($system_limit);
+    }
+    return $system_limit;
+}
+
+sub simultaneous_sshlogin_limit {
+    # Test by logging in wanted number of times simultaneously
+    # Returns:
+    #   min($wanted_processes,$working_simultaneous_ssh_logins-1)
+    my $self = shift;
+    my $wanted_processes = shift;
+    if($self->{'time_to_login'}) {
+	return $wanted_processes;
+    }
+
+    # Try twice because it guesses wrong sometimes
+    # Choose the minimal
+    my $ssh_limit =
+        ::min($self->simultaneous_sshlogin($wanted_processes),
+	      $self->simultaneous_sshlogin($wanted_processes));
+    if($ssh_limit < $wanted_processes) {
+        my $serverlogin = $self->serverlogin();
+        ::warning("ssh to $serverlogin only allows ",
+		  "for $ssh_limit simultaneous logins.\n",
+		  "You may raise this by changing ",
+		  "/etc/ssh/sshd_config:MaxStartups and MaxSessions on $serverlogin.\n",
+		  "Using only ",$ssh_limit-1," connections ",
+		  "to avoid race conditions.\n");
+    }
+    # Race condition can cause problem if using all sshs.
+    if($ssh_limit > 1) { $ssh_limit -= 1; }
+    return $ssh_limit;
+}
+
+sub simultaneous_sshlogin {
+    # Using $sshlogin try to see if we can do $wanted_processes
+    # simultaneous logins
+    # (ssh host echo simultaneouslogin & ssh host echo simultaneouslogin & ...)|grep simul|wc -l
+    # Returns:
+    #   Number of succesful logins
+    my $self = shift;
+    my $wanted_processes = shift;
+    my $sshcmd = $self->sshcommand();
+    my $serverlogin = $self->serverlogin();
+    my $sshdelay = $opt::sshdelay ? "sleep $opt::sshdelay;" : "";
+    my $cmd = "$sshdelay$sshcmd $serverlogin echo simultaneouslogin </dev/null 2>&1 &"x$wanted_processes;
+    ::debug("init", "Trying $wanted_processes logins at $serverlogin\n");
+    open (my $simul_fh, "-|", "($cmd)|grep simultaneouslogin | wc -l") or
+	::die_bug("simultaneouslogin");
+    my $ssh_limit = <$simul_fh>;
+    close $simul_fh;
+    chomp $ssh_limit;
+    return $ssh_limit;
+}
+
+sub set_ncpus {
+    my $self = shift;
+    $self->{'ncpus'} = shift;
+}
+
+sub user_requested_processes {
+    # Parse the number of processes that the user asked for using -j
+    # Returns:
+    #   the number of processes to run on this sshlogin
+    my $self = shift;
+    my $opt_P = shift;
+    my $processes;
+    if(defined $opt_P) {
+        if($opt_P =~ /^\+(\d+)$/) {
+            # E.g. -P +2
+            my $j = $1;
+            $processes =
+                $self->ncpus() + $j;
+        } elsif ($opt_P =~ /^-(\d+)$/) {
+            # E.g. -P -2
+            my $j = $1;
+            $processes =
+                $self->ncpus() - $j;
+        } elsif ($opt_P =~ /^(\d+(\.\d+)?)\%$/) {
+            # E.g. -P 10.5%
+            my $j = $1;
+            $processes =
+                $self->ncpus() * $j / 100;
+        } elsif ($opt_P =~ /^(\d+)$/) {
+            $processes = $1;
+            if($processes == 0) {
+                # -P 0 = infinity (or at least close)
+                $processes = $Global::infinity;
+            }
+        } elsif (-f $opt_P) {
+            $Global::max_procs_file = $opt_P;
+            $Global::max_procs_file_last_mod = (stat($Global::max_procs_file))[9];
+            if(open(my $in_fh, "<", $Global::max_procs_file)) {
+                my $opt_P_file = join("",<$in_fh>);
+                close $in_fh;
+                $processes = $self->user_requested_processes($opt_P_file);
+            } else {
+                ::error("Cannot open $opt_P.\n");
+                ::wait_and_exit(255);
+            }
+        } else {
+            ::error("Parsing of --jobs/-j/--max-procs/-P failed.\n");
+            ::die_usage();
+        }
+	$processes = ::ceil($processes);
+    }
+    return $processes;
+}
+
+sub ncpus {
+    my $self = shift;
+    if(not defined $self->{'ncpus'}) {
+        my $sshcmd = $self->sshcommand();
+        my $serverlogin = $self->serverlogin();
+        if($serverlogin eq ":") {
+            if($opt::use_cpus_instead_of_cores) {
+                $self->{'ncpus'} = no_of_cpus();
+            } else {
+                $self->{'ncpus'} = no_of_cores();
+            }
+        } else {
+            my $ncpu;
+	    my $sqe = ::shell_quote_scalar($Global::envvar);
+            if($opt::use_cpus_instead_of_cores) {
+                $ncpu = qx(echo|$sshcmd $serverlogin $sqe parallel --number-of-cpus);
+            } else {
+		::debug("init",qq(echo|$sshcmd $serverlogin $sqe parallel --number-of-cores\n));
+                $ncpu = qx(echo|$sshcmd $serverlogin $sqe parallel --number-of-cores);
+            }
+	    chomp $ncpu;
+            if($ncpu =~ /^\s*[0-9]+\s*$/s) {
+                $self->{'ncpus'} = $ncpu;
+            } else {
+                ::warning("Could not figure out ",
+			  "number of cpus on $serverlogin ($ncpu). Using 1.\n");
+                $self->{'ncpus'} = 1;
+            }
+        }
+    }
+    return $self->{'ncpus'};
+}
+
+sub no_of_cpus {
+    # Returns:
+    #   Number of physical CPUs
+    local $/="\n"; # If delimiter is set, then $/ will be wrong
+    my $no_of_cpus;
+    if ($^O eq 'linux') {
+        $no_of_cpus = no_of_cpus_gnu_linux() || no_of_cores_gnu_linux();
+    } elsif ($^O eq 'freebsd') {
+        $no_of_cpus = no_of_cpus_freebsd();
+    } elsif ($^O eq 'netbsd') {
+        $no_of_cpus = no_of_cpus_netbsd();
+    } elsif ($^O eq 'openbsd') {
+        $no_of_cpus = no_of_cpus_openbsd();
+    } elsif ($^O eq 'gnu') {
+        $no_of_cpus = no_of_cpus_hurd();
+    } elsif ($^O eq 'darwin') {
+	$no_of_cpus = no_of_cpus_darwin();
+    } elsif ($^O eq 'solaris') {
+        $no_of_cpus = no_of_cpus_solaris();
+    } elsif ($^O eq 'aix') {
+        $no_of_cpus = no_of_cpus_aix();
+    } elsif ($^O eq 'hpux') {
+        $no_of_cpus = no_of_cpus_hpux();
+    } elsif ($^O eq 'nto') {
+        $no_of_cpus = no_of_cpus_qnx();
+    } elsif ($^O eq 'svr5') {
+        $no_of_cpus = no_of_cpus_openserver();
+    } elsif ($^O eq 'irix') {
+        $no_of_cpus = no_of_cpus_irix();
+    } elsif ($^O eq 'dec_osf') {
+        $no_of_cpus = no_of_cpus_tru64();
+    } else {
+	$no_of_cpus = (no_of_cpus_gnu_linux()
+		       || no_of_cpus_freebsd()
+		       || no_of_cpus_netbsd()
+		       || no_of_cpus_openbsd()
+		       || no_of_cpus_hurd()
+		       || no_of_cpus_darwin()
+		       || no_of_cpus_solaris()
+		       || no_of_cpus_aix()
+		       || no_of_cpus_hpux()
+		       || no_of_cpus_qnx()
+		       || no_of_cpus_openserver()
+		       || no_of_cpus_irix()
+		       || no_of_cpus_tru64()
+			# Number of cores is better than no guess for #CPUs
+		       || nproc()
+	    );
+    }
+    if($no_of_cpus) {
+	chomp $no_of_cpus;
+        return $no_of_cpus;
+    } else {
+        ::warning("Cannot figure out number of cpus. Using 1.\n");
+        return 1;
+    }
+}
+
+sub no_of_cores {
+    # Returns:
+    #   Number of CPU cores
+    local $/="\n"; # If delimiter is set, then $/ will be wrong
+    my $no_of_cores;
+    if ($^O eq 'linux') {
+	$no_of_cores = no_of_cores_gnu_linux();
+    } elsif ($^O eq 'freebsd') {
+        $no_of_cores = no_of_cores_freebsd();
+    } elsif ($^O eq 'netbsd') {
+        $no_of_cores = no_of_cores_netbsd();
+    } elsif ($^O eq 'openbsd') {
+        $no_of_cores = no_of_cores_openbsd();
+    } elsif ($^O eq 'gnu') {
+        $no_of_cores = no_of_cores_hurd();
+    } elsif ($^O eq 'darwin') {
+	$no_of_cores = no_of_cores_darwin();
+    } elsif ($^O eq 'solaris') {
+	$no_of_cores = no_of_cores_solaris();
+    } elsif ($^O eq 'aix') {
+        $no_of_cores = no_of_cores_aix();
+    } elsif ($^O eq 'hpux') {
+        $no_of_cores = no_of_cores_hpux();
+    } elsif ($^O eq 'nto') {
+        $no_of_cores = no_of_cores_qnx();
+    } elsif ($^O eq 'svr5') {
+        $no_of_cores = no_of_cores_openserver();
+    } elsif ($^O eq 'irix') {
+        $no_of_cores = no_of_cores_irix();
+    } elsif ($^O eq 'dec_osf') {
+        $no_of_cores = no_of_cores_tru64();
+    } else {
+	$no_of_cores = (no_of_cores_gnu_linux()
+			|| no_of_cores_freebsd()
+			|| no_of_cores_netbsd()
+			|| no_of_cores_openbsd()
+			|| no_of_cores_hurd()
+			|| no_of_cores_darwin()
+			|| no_of_cores_solaris()
+			|| no_of_cores_aix()
+			|| no_of_cores_hpux()
+			|| no_of_cores_qnx()
+			|| no_of_cores_openserver()
+			|| no_of_cores_irix()
+			|| no_of_cores_tru64()
+			|| nproc()
+	    );
+    }
+    if($no_of_cores) {
+	chomp $no_of_cores;
+        return $no_of_cores;
+    } else {
+        ::warning("Cannot figure out number of CPU cores. Using 1.\n");
+        return 1;
+    }
+}
+
+sub nproc {
+    # Returns:
+    #   Number of cores using `nproc`
+    my $no_of_cores = `nproc 2>/dev/null`;
+    return $no_of_cores;
+}
+
+sub no_of_cpus_gnu_linux {
+    # Returns:
+    #   Number of physical CPUs on GNU/Linux
+    #   undef if not GNU/Linux
+    my $no_of_cpus;
+    my $no_of_cores;
+    if(-e "/proc/cpuinfo") {
+        $no_of_cpus = 0;
+        $no_of_cores = 0;
+        my %seen;
+        open(my $in_fh, "<", "/proc/cpuinfo") || return undef;
+        while(<$in_fh>) {
+            if(/^physical id.*[:](.*)/ and not $seen{$1}++) {
+                $no_of_cpus++;
+            }
+            /^processor.*[:]/i and $no_of_cores++;
+        }
+        close $in_fh;
+    }
+    return ($no_of_cpus||$no_of_cores);
+}
+
+sub no_of_cores_gnu_linux {
+    # Returns:
+    #   Number of CPU cores on GNU/Linux
+    #   undef if not GNU/Linux
+    my $no_of_cores;
+    if(-e "/proc/cpuinfo") {
+        $no_of_cores = 0;
+        open(my $in_fh, "<", "/proc/cpuinfo") || return undef;
+        while(<$in_fh>) {
+            /^processor.*[:]/i and $no_of_cores++;
+        }
+        close $in_fh;
+    }
+    return $no_of_cores;
+}
+
+sub no_of_cpus_freebsd {
+    # Returns:
+    #   Number of physical CPUs on FreeBSD
+    #   undef if not FreeBSD
+    my $no_of_cpus =
+	(`sysctl -a dev.cpu 2>/dev/null | grep \%parent | awk '{ print \$2 }' | uniq | wc -l | awk '{ print \$1 }'`
+	 or
+	 `sysctl hw.ncpu 2>/dev/null | awk '{ print \$2 }'`);
+    chomp $no_of_cpus;
+    return $no_of_cpus;
+}
+
+sub no_of_cores_freebsd {
+    # Returns:
+    #   Number of CPU cores on FreeBSD
+    #   undef if not FreeBSD
+    my $no_of_cores =
+	(`sysctl hw.ncpu 2>/dev/null | awk '{ print \$2 }'`
+	 or
+	 `sysctl -a hw  2>/dev/null | grep [^a-z]logicalcpu[^a-z] | awk '{ print \$2 }'`);
+    chomp $no_of_cores;
+    return $no_of_cores;
+}
+
+sub no_of_cpus_netbsd {
+    # Returns:
+    #   Number of physical CPUs on NetBSD
+    #   undef if not NetBSD
+    my $no_of_cpus = `sysctl -n hw.ncpu 2>/dev/null`;
+    chomp $no_of_cpus;
+    return $no_of_cpus;
+}
+
+sub no_of_cores_netbsd {
+    # Returns:
+    #   Number of CPU cores on NetBSD
+    #   undef if not NetBSD
+    my $no_of_cores = `sysctl -n hw.ncpu 2>/dev/null`;
+    chomp $no_of_cores;
+    return $no_of_cores;
+}
+
+sub no_of_cpus_openbsd {
+    # Returns:
+    #   Number of physical CPUs on OpenBSD
+    #   undef if not OpenBSD
+    my $no_of_cpus = `sysctl -n hw.ncpu 2>/dev/null`;
+    chomp $no_of_cpus;
+    return $no_of_cpus;
+}
+
+sub no_of_cores_openbsd {
+    # Returns:
+    #   Number of CPU cores on OpenBSD
+    #   undef if not OpenBSD
+    my $no_of_cores = `sysctl -n hw.ncpu 2>/dev/null`;
+    chomp $no_of_cores;
+    return $no_of_cores;
+}
+
+sub no_of_cpus_hurd {
+    # Returns:
+    #   Number of physical CPUs on HURD
+    #   undef if not HURD
+    my $no_of_cpus = `nproc`;
+    chomp $no_of_cpus;
+    return $no_of_cpus;
+}
+
+sub no_of_cores_hurd {
+    # Returns:
+    #   Number of physical CPUs on HURD
+    #   undef if not HURD
+    my $no_of_cores = `nproc`;
+    chomp $no_of_cores;
+    return $no_of_cores;
+}
+
+sub no_of_cpus_darwin {
+    # Returns:
+    #   Number of physical CPUs on Mac Darwin
+    #   undef if not Mac Darwin
+    my $no_of_cpus =
+	(`sysctl -n hw.physicalcpu 2>/dev/null`
+	 or
+	 `sysctl -a hw 2>/dev/null | grep [^a-z]physicalcpu[^a-z] | awk '{ print \$2 }'`);
+    return $no_of_cpus;
+}
+
+sub no_of_cores_darwin {
+    # Returns:
+    #   Number of CPU cores on Mac Darwin
+    #   undef if not Mac Darwin
+    my $no_of_cores =
+	(`sysctl -n hw.logicalcpu 2>/dev/null`
+	 or
+	 `sysctl -a hw  2>/dev/null | grep [^a-z]logicalcpu[^a-z] | awk '{ print \$2 }'`);
+    return $no_of_cores;
+}
+
+sub no_of_cpus_solaris {
+    # Returns:
+    #   Number of physical CPUs on Solaris
+    #   undef if not Solaris
+    if(-x "/usr/sbin/psrinfo") {
+        my @psrinfo = `/usr/sbin/psrinfo`;
+        if($#psrinfo >= 0) {
+            return $#psrinfo +1;
+        }
+    }
+    if(-x "/usr/sbin/prtconf") {
+        my @prtconf = `/usr/sbin/prtconf | grep cpu..instance`;
+        if($#prtconf >= 0) {
+            return $#prtconf +1;
+        }
+    }
+    return undef;
+}
+
+sub no_of_cores_solaris {
+    # Returns:
+    #   Number of CPU cores on Solaris
+    #   undef if not Solaris
+    if(-x "/usr/sbin/psrinfo") {
+        my @psrinfo = `/usr/sbin/psrinfo`;
+        if($#psrinfo >= 0) {
+            return $#psrinfo +1;
+        }
+    }
+    if(-x "/usr/sbin/prtconf") {
+        my @prtconf = `/usr/sbin/prtconf | grep cpu..instance`;
+        if($#prtconf >= 0) {
+            return $#prtconf +1;
+        }
+    }
+    return undef;
+}
+
+sub no_of_cpus_aix {
+    # Returns:
+    #   Number of physical CPUs on AIX
+    #   undef if not AIX
+    my $no_of_cpus = 0;
+    if(-x "/usr/sbin/lscfg") {
+	open(my $in_fh, "-|", "/usr/sbin/lscfg -vs |grep proc | wc -l|tr -d ' '")
+	    || return undef;
+	$no_of_cpus = <$in_fh>;
+	chomp ($no_of_cpus);
+	close $in_fh;
+    }
+    return $no_of_cpus;
+}
+
+sub no_of_cores_aix {
+    # Returns:
+    #   Number of CPU cores on AIX
+    #   undef if not AIX
+    my $no_of_cores;
+    if(-x "/usr/bin/vmstat") {
+	open(my $in_fh, "-|", "/usr/bin/vmstat 1 1") || return undef;
+	while(<$in_fh>) {
+	    /lcpu=([0-9]*) / and $no_of_cores = $1;
+	}
+	close $in_fh;
+    }
+    return $no_of_cores;
+}
+
+sub no_of_cpus_hpux {
+    # Returns:
+    #   Number of physical CPUs on HP-UX
+    #   undef if not HP-UX
+    my $no_of_cpus =
+        (`/usr/bin/mpsched -s 2>&1 | grep 'Locality Domain Count' | awk '{ print \$4 }'`);
+    return $no_of_cpus;
+}
+
+sub no_of_cores_hpux {
+    # Returns:
+    #   Number of CPU cores on HP-UX
+    #   undef if not HP-UX
+    my $no_of_cores =
+        (`/usr/bin/mpsched -s 2>&1 | grep 'Processor Count' | awk '{ print \$3 }'`);
+    return $no_of_cores;
+}
+
+sub no_of_cpus_qnx {
+    # Returns:
+    #   Number of physical CPUs on QNX
+    #   undef if not QNX
+    # BUG: It is now known how to calculate this.
+    my $no_of_cpus = 0;
+    return $no_of_cpus;
+}
+
+sub no_of_cores_qnx {
+    # Returns:
+    #   Number of CPU cores on QNX
+    #   undef if not QNX
+    # BUG: It is now known how to calculate this.
+    my $no_of_cores = 0;
+    return $no_of_cores;
+}
+
+sub no_of_cpus_openserver {
+    # Returns:
+    #   Number of physical CPUs on SCO OpenServer
+    #   undef if not SCO OpenServer
+    my $no_of_cpus = 0;
+    if(-x "/usr/sbin/psrinfo") {
+        my @psrinfo = `/usr/sbin/psrinfo`;
+        if($#psrinfo >= 0) {
+            return $#psrinfo +1;
+        }
+    }
+    return $no_of_cpus;
+}
+
+sub no_of_cores_openserver {
+    # Returns:
+    #   Number of CPU cores on SCO OpenServer
+    #   undef if not SCO OpenServer
+    my $no_of_cores = 0;
+    if(-x "/usr/sbin/psrinfo") {
+        my @psrinfo = `/usr/sbin/psrinfo`;
+        if($#psrinfo >= 0) {
+            return $#psrinfo +1;
+        }
+    }
+    return $no_of_cores;
+}
+
+sub no_of_cpus_irix {
+    # Returns:
+    #   Number of physical CPUs on IRIX
+    #   undef if not IRIX
+    my $no_of_cpus = `hinv | grep HZ | grep Processor | awk '{print \$1}'`;
+    return $no_of_cpus;
+}
+
+sub no_of_cores_irix {
+    # Returns:
+    #   Number of CPU cores on IRIX
+    #   undef if not IRIX
+    my $no_of_cores = `hinv | grep HZ | grep Processor | awk '{print \$1}'`;
+    return $no_of_cores;
+}
+
+sub no_of_cpus_tru64 {
+    # Returns:
+    #   Number of physical CPUs on Tru64
+    #   undef if not Tru64
+    my $no_of_cpus = `sizer -pr`;
+    return $no_of_cpus;
+}
+
+sub no_of_cores_tru64 {
+    # Returns:
+    #   Number of CPU cores on Tru64
+    #   undef if not Tru64
+    my $no_of_cores = `sizer -pr`;
+    return $no_of_cores;
+}
+
+sub sshcommand {
+    my $self = shift;
+    if (not defined $self->{'sshcommand'}) {
+        $self->sshcommand_of_sshlogin();
+    }
+    return $self->{'sshcommand'};
+}
+
+sub serverlogin {
+    my $self = shift;
+    if (not defined $self->{'serverlogin'}) {
+        $self->sshcommand_of_sshlogin();
+    }
+    return $self->{'serverlogin'};
+}
+
+sub sshcommand_of_sshlogin {
+    # 'server' -> ('ssh -S /tmp/parallel-ssh-RANDOM/host-','server')
+    # 'user@server' -> ('ssh','user@server')
+    # 'myssh user@server' -> ('myssh','user@server')
+    # 'myssh -l user server' -> ('myssh -l user','server')
+    # '/usr/bin/myssh -l user server' -> ('/usr/bin/myssh -l user','server')
+    # Returns:
+    #   sshcommand - defaults to 'ssh'
+    #   login@host
+    my $self = shift;
+    my ($sshcmd, $serverlogin);
+    if($self->{'string'} =~ /(.+) (\S+)$/) {
+        # Own ssh command
+        $sshcmd = $1; $serverlogin = $2;
+    } else {
+        # Normal ssh
+        if($opt::controlmaster) {
+            # Use control_path to make ssh faster
+            my $control_path = $self->control_path_dir()."/ssh-%r@%h:%p";
+            $sshcmd = "ssh -S ".$control_path;
+            $serverlogin = $self->{'string'};
+            if(not $self->{'control_path'}{$control_path}++) {
+                # Master is not running for this control_path
+                # Start it
+                my $pid = fork();
+                if($pid) {
+                    $Global::sshmaster{$pid} ||= 1;
+                } else {
+		    $SIG{'TERM'} = undef;
+                    # Ignore the 'foo' being printed
+                    open(STDOUT,">","/dev/null");
+                    # OpenSSH_3.6.1p2 gives 'tcgetattr: Invalid argument' with -tt
+                    # STDERR >/dev/null to ignore "process_mux_new_session: tcgetattr: Invalid argument"
+                    open(STDERR,">","/dev/null");
+                    open(STDIN,"<","/dev/null");
+                    # Run a sleep that outputs data, so it will discover if the ssh connection closes.
+                    my $sleep = ::shell_quote_scalar('$|=1;while(1){sleep 1;print "foo\n"}');
+                    my @master = ("ssh", "-tt", "-MTS", $control_path, $serverlogin, "perl", "-e", $sleep);
+                    exec(@master);
+                }
+            }
+        } else {
+            $sshcmd = "ssh"; $serverlogin = $self->{'string'};
+        }
+    }
+    $self->{'sshcommand'} = $sshcmd;
+    $self->{'serverlogin'} = $serverlogin;
+}
+
+sub control_path_dir {
+    # Returns:
+    #   path to directory
+    my $self = shift;
+    if(not defined $self->{'control_path_dir'}) {
+        -e $ENV{'HOME'}."/.parallel" or mkdir $ENV{'HOME'}."/.parallel";
+        -e $ENV{'HOME'}."/.parallel/tmp" or mkdir $ENV{'HOME'}."/.parallel/tmp";
+        $self->{'control_path_dir'} =
+	    File::Temp::tempdir($ENV{'HOME'}
+				. "/.parallel/tmp/control_path_dir-XXXX",
+				CLEANUP => 1);
+    }
+    return $self->{'control_path_dir'};
+}
+
+sub rsync_transfer_cmd {
+    # Command to run to transfer a file
+    # Input:
+    #   $file = filename of file to transfer
+    #   $workdir = destination dir
+    # Returns:
+    #   $cmd = rsync command to run to transfer $file ("" if unreadable)
+    my $self = shift;
+    my $file = shift;
+    my $workdir = shift;
+    if(not -r $file) {
+	::warning($file, " is not readable and will not be transferred.\n");
+	return "true";
+    }
+    my $rsync_destdir;
+    if($file =~ m:^/:) {
+	# rsync /foo/bar /
+	$rsync_destdir = "/";
+    } else {
+	$rsync_destdir = ::shell_quote_file($workdir);
+    }
+    $file = ::shell_quote_file($file);
+    my $sshcmd = $self->sshcommand();
+    my $rsync_opt = "-rlDzR -e" . ::shell_quote_scalar($sshcmd);
+    my $serverlogin = $self->serverlogin();
+    # Make dir if it does not exist
+    return "( $sshcmd $serverlogin mkdir -p $rsync_destdir;" .
+	rsync()." $rsync_opt $file $serverlogin:$rsync_destdir )";
+}
+
+sub cleanup_cmd {
+    # Command to run to remove the remote file
+    # Input:
+    #   $file = filename to remove
+    #   $workdir = destination dir
+    # Returns:
+    #   $cmd = ssh command to run to remove $file and empty parent dirs
+    my $self = shift;
+    my $file = shift;
+    my $workdir = shift;
+    my $f = $file;
+    if($f =~ m:/\./:) {
+	# foo/bar/./baz/quux => workdir/baz/quux
+	# /foo/bar/./baz/quux => workdir/baz/quux
+	$f =~ s:.*/\./:$workdir/:;
+    } elsif($f =~ m:^[^/]:) {
+	# foo/bar => workdir/foo/bar
+	$f = $workdir."/".$f;
+    }
+    my @subdirs = split m:/:, ::dirname($f);
+    my @rmdir;
+    my $dir = "";
+    for(@subdirs) {
+	$dir .= $_."/";
+	unshift @rmdir, ::shell_quote_file($dir);
+    }
+    my $rmdir = @rmdir ? "rmdir @rmdir 2>/dev/null;" : "";
+    if(defined $opt::workdir and $opt::workdir eq "...") {
+	$rmdir .= "rm -rf " . ::shell_quote_file($workdir).';';
+    }
+
+    $f = ::shell_quote_file($f);
+    my $sshcmd = $self->sshcommand();
+    my $serverlogin = $self->serverlogin();
+    return "$sshcmd $serverlogin ".::shell_quote_scalar("(rm -f $f; $rmdir)");
+}
+
+{
+    my $rsync;
+
+    sub rsync {
+	# rsync 3.1.x uses protocol 31 which is unsupported by 2.5.7.
+	# If the version >= 3.1.0: downgrade to protocol 30
+	if(not $rsync) {
+	    my @out = `rsync --version`;
+	    for (@out) {
+		if(/version (\d+.\d+)(.\d+)?/) {
+		    if($1 >= 3.1) {
+			# Version 3.1.0 or later: Downgrade to protocol 30
+			$rsync = "rsync --protocol 30";
+		    } else {
+			$rsync = "rsync";
+		    }
+		}
+	    }
+	    $rsync or ::die_bug("Cannot figure out version of rsync: @out");
+	}
+	return $rsync;
+    }
+}
+
+
+package JobQueue;
+
+sub new {
+    my $class = shift;
+    my $commandref = shift;
+    my $read_from = shift;
+    my $context_replace = shift;
+    my $max_number_of_args = shift;
+    my $return_files = shift;
+    my $commandlinequeue = CommandLineQueue->new
+	($commandref, $read_from, $context_replace, $max_number_of_args,
+	 $return_files);
+    my @unget = ();
+    return bless {
+        'unget' => \@unget,
+        'commandlinequeue' => $commandlinequeue,
+        'total_jobs' => undef,
+    }, ref($class) || $class;
+}
+
+sub get {
+    my $self = shift;
+
+    if(@{$self->{'unget'}}) {
+        my $job = shift @{$self->{'unget'}};
+        return ($job);
+    } else {
+        my $commandline = $self->{'commandlinequeue'}->get();
+        if(defined $commandline) {
+            my $job = Job->new($commandline);
+            return $job;
+        } else {
+            return undef;
+        }
+    }
+}
+
+sub unget {
+    my $self = shift;
+    unshift @{$self->{'unget'}}, @_;
+}
+
+sub empty {
+    my $self = shift;
+    my $empty = (not @{$self->{'unget'}})
+	&& $self->{'commandlinequeue'}->empty();
+    ::debug("run", "JobQueue->empty $empty ");
+    return $empty;
+}
+
+sub total_jobs {
+    my $self = shift;
+    if(not defined $self->{'total_jobs'}) {
+        my $job;
+        my @queue;
+	my $start = time;
+        while($job = $self->get()) {
+	    if(time - $start > 10) {
+		::warning("Reading all arguments takes longer than 10 seconds.\n");
+		$opt::eta && ::warning("Consider removing --eta.\n");
+		$opt::bar && ::warning("Consider removing --bar.\n");
+		last;
+	    }
+            push @queue, $job;
+        }
+        while($job = $self->get()) {
+            push @queue, $job;
+        }
+
+        $self->unget(@queue);
+        $self->{'total_jobs'} = $#queue+1;
+    }
+    return $self->{'total_jobs'};
+}
+
+sub next_seq {
+    my $self = shift;
+
+    return $self->{'commandlinequeue'}->seq();
+}
+
+sub quote_args {
+    my $self = shift;
+    return $self->{'commandlinequeue'}->quote_args();
+}
+
+
+package Job;
+
+sub new {
+    my $class = shift;
+    my $commandlineref = shift;
+    return bless {
+        'commandline' => $commandlineref, # CommandLine object
+        'workdir' => undef, # --workdir
+        'stdin' => undef, # filehandle for stdin (used for --pipe)
+	# filename for writing stdout to (used for --files)
+        'remaining' => "", # remaining data not sent to stdin (used for --pipe)
+	'datawritten' => 0, # amount of data sent via stdin (used for --pipe)
+        'transfersize' => 0, # size of files using --transfer
+        'returnsize' => 0, # size of files using --return
+        'pid' => undef,
+        # hash of { SSHLogins => number of times the command failed there }
+        'failed' => undef,
+        'sshlogin' => undef,
+        # The commandline wrapped with rsync and ssh
+        'sshlogin_wrap' => undef,
+        'exitstatus' => undef,
+        'exitsignal' => undef,
+	# Timestamp for timeout if any
+	'timeout' => undef,
+	'virgin' => 1,
+    }, ref($class) || $class;
+}
+
+sub replaced {
+    my $self = shift;
+    $self->{'commandline'} or ::die_bug("commandline empty");
+    return $self->{'commandline'}->replaced();
+}
+
+sub seq {
+    my $self = shift;
+    return $self->{'commandline'}->seq();
+}
+
+sub slot {
+    my $self = shift;
+    return $self->{'commandline'}->slot();
+}
+
+{
+    my($cattail);
+
+    sub cattail {
+	# Returns:
+	#   $cattail = perl program for: cattail "decompress program" writerpid [file_to_decompress or stdin] [file_to_unlink]
+	if(not $cattail) {
+	    $cattail = q{
+		# cat followed by tail.
+		# If $writerpid dead: finish after this round
+		use Fcntl;
+
+		$|=1;
+
+		my ($cmd, $writerpid, $read_file, $unlink_file) = @ARGV;
+		if($read_file) {
+		    open(IN,"<",$read_file) || die("cattail: Cannot open $read_file");
+		} else {
+		    *IN = *STDIN;
+		}
+
+		my $flags;
+		fcntl(IN, F_GETFL, $flags) || die $!; # Get the current flags on the filehandle
+		$flags |= O_NONBLOCK; # Add non-blocking to the flags
+		fcntl(IN, F_SETFL, $flags) || die $!; # Set the flags on the filehandle
+		open(OUT,"|-",$cmd) || die("cattail: Cannot run $cmd");
+
+		while(1) {
+		    # clear EOF
+		    seek(IN,0,1);
+		    my $writer_running = kill 0, $writerpid;
+		    $read = sysread(IN,$buf,32768);
+		    if($read) {
+			# We can unlink the file now: The writer has written something
+			-e $unlink_file and unlink $unlink_file;
+			# Blocking print
+			while($buf) {
+			    my $bytes_written = syswrite(OUT,$buf);
+			    # syswrite may be interrupted by SIGHUP
+			    substr($buf,0,$bytes_written) = "";
+			}
+			# Something printed: Wait less next time
+			$sleep /= 2;
+		    } else {
+			if(eof(IN) and not $writer_running) {
+			    # Writer dead: There will never be more to read => exit
+			    exit;
+			}
+			# TODO This could probably be done more efficiently using select(2)
+			# Nothing read: Wait longer before next read
+			# Up to 30 milliseconds
+			$sleep = ($sleep < 30) ? ($sleep * 1.001 + 0.01) : ($sleep);
+			usleep($sleep);
+		    }
+		}
+
+		sub usleep {
+		    # Sleep this many milliseconds.
+		    my $secs = shift;
+		    select(undef, undef, undef, $secs/1000);
+		}
+	    };
+	    $cattail =~ s/#.*//mg;
+	    $cattail =~ s/\s+/ /g;
+	}
+	return $cattail;
+    }
+}
+
+sub openoutputfiles {
+    # Open files for STDOUT and STDERR
+    # Set file handles in $self->fh
+    my $self = shift;
+    my ($outfhw, $errfhw, $outname, $errname);
+    if($opt::results) {
+	my $args_as_dirname = $self->{'commandline'}->args_as_dirname();
+	# Output in: prefix/name1/val1/name2/val2/stdout
+	my $dir = $opt::results."/".$args_as_dirname;
+	if(eval{ File::Path::mkpath($dir); }) {
+	    # OK
+	} else {
+	    # mkpath failed: Argument probably too long.
+	    # Set $Global::max_file_length, which will keep the individual
+	    # dir names shorter than the max length
+	    max_file_name_length($opt::results);
+	    $args_as_dirname = $self->{'commandline'}->args_as_dirname();
+	    # prefix/name1/val1/name2/val2/
+	    $dir = $opt::results."/".$args_as_dirname;
+	    File::Path::mkpath($dir);
+	}
+	# prefix/name1/val1/name2/val2/stdout
+	$outname = "$dir/stdout";
+	if(not open($outfhw, "+>", $outname)) {
+	    ::error("Cannot write to `$outname'.\n");
+	    ::wait_and_exit(255);
+	}
+	# prefix/name1/val1/name2/val2/stderr
+	$errname = "$dir/stderr";
+	if(not open($errfhw, "+>", $errname)) {
+	    ::error("Cannot write to `$errname'.\n");
+	    ::wait_and_exit(255);
+	}
+	$self->set_fh(1,"unlink","");
+	$self->set_fh(2,"unlink","");
+    } elsif(not $opt::ungroup) {
+	# To group we create temporary files for STDOUT and STDERR
+	# To avoid the cleanup unlink the files immediately (but keep them open)
+	if(@Global::tee_jobs) {
+	    # files must be removed when the tee is done
+	} elsif($opt::files) {
+	    ($outfhw, $outname) = ::tmpfile(SUFFIX => ".par");
+	    ($errfhw, $errname) = ::tmpfile(SUFFIX => ".par");
+	    # --files => only remove stderr
+	    $self->set_fh(1,"unlink","");
+	    $self->set_fh(2,"unlink",$errname);
+	} else {
+	    ($outfhw, $outname) = ::tmpfile(SUFFIX => ".par");
+	    ($errfhw, $errname) = ::tmpfile(SUFFIX => ".par");
+	    $self->set_fh(1,"unlink",$outname);
+	    $self->set_fh(2,"unlink",$errname);
+	}
+    } else {
+	# --ungroup
+	open($outfhw,">&",$Global::fd{1}) || die;
+	open($errfhw,">&",$Global::fd{2}) || die;
+	# File name must be empty as it will otherwise be printed
+	$outname = "";
+	$errname = "";
+	$self->set_fh(1,"unlink",$outname);
+	$self->set_fh(2,"unlink",$errname);
+    }
+    # Set writing FD
+    $self->set_fh(1,'w',$outfhw);
+    $self->set_fh(2,'w',$errfhw);
+    $self->set_fh(1,'name',$outname);
+    $self->set_fh(2,'name',$errname);
+    if($opt::compress) {
+	# Send stdout to stdin for $opt::compress_program(1)
+	# Send stderr to stdin for $opt::compress_program(2)
+	# cattail get pid:  $pid = $self->fh($fdno,'rpid');
+	my $cattail = cattail();
+	for my $fdno (1,2) {
+	    my $wpid = open(my $fdw,"|-","$opt::compress_program >>".
+			    $self->fh($fdno,'name')) || die $?;
+	    $self->set_fh($fdno,'w',$fdw);
+	    $self->set_fh($fdno,'wpid',$wpid);
+	    my $rpid = open(my $fdr, "-|", "perl", "-e", $cattail,
+			    $opt::decompress_program, $wpid,
+			    $self->fh($fdno,'name'),$self->fh($fdno,'unlink')) || die $?;
+	    $self->set_fh($fdno,'r',$fdr);
+	    $self->set_fh($fdno,'rpid',$rpid);
+	}
+    } elsif(not $opt::ungroup) {
+	# Set reading FD if using --group (--ungroup does not need)
+	for my $fdno (1,2) {
+	    # Re-open the file for reading
+	    # so fdw can be closed separately
+	    # and fdr can be seeked separately (for --line-buffer)
+	    open(my $fdr,"<", $self->fh($fdno,'name')) ||
+		::die_bug("fdr: Cannot open ".$self->fh($fdno,'name'));
+	    $self->set_fh($fdno,'r',$fdr);
+            # Unlink if required
+	    $Global::debug or unlink $self->fh($fdno,"unlink");
+	}
+    }
+    if($opt::linebuffer) {
+	# Set non-blocking when using --linebuffer
+	$Global::use{"Fcntl"} ||= eval "use Fcntl qw(:DEFAULT :flock); 1;";
+	for my $fdno (1,2) {
+	    my $fdr = $self->fh($fdno,'r');
+	    my $flags;
+	    fcntl($fdr, &F_GETFL, $flags) || die $!; # Get the current flags on the filehandle
+	    $flags |= &O_NONBLOCK; # Add non-blocking to the flags
+	    fcntl($fdr, &F_SETFL, $flags) || die $!; # Set the flags on the filehandle
+	}
+    }
+}
+
+sub max_file_name_length {
+    # Figure out the max length of a subdir
+    # TODO and the max total length
+    # Ext4 = 255,130816
+    my $testdir = shift;
+
+    my $upper = 8_000_000;
+    my $len = 8;
+    my $dir="x"x$len;
+    do {
+	rmdir($testdir."/".$dir);
+	$len *= 16;
+	$dir="x"x$len;
+    } while (mkdir $testdir."/".$dir);
+    # Then search for the actual max length between $len/16 and $len
+    my $min = $len/16;
+    my $max = $len;
+    while($max-$min > 5) {
+	# If we are within 5 chars of the exact value:
+	# it is not worth the extra time to find the exact value
+	my $test = int(($min+$max)/2);
+	$dir="x"x$test;
+	if(mkdir $testdir."/".$dir) {
+	    rmdir($testdir."/".$dir);
+	    $min = $test;
+	} else {
+	    $max = $test;
+	}
+    }
+    $Global::max_file_length = $min;
+    return $min;
+}
+
+sub set_fh {
+    # Set file handle
+    my ($self, $fd_no, $key, $fh) = @_;
+    $self->{'fd'}{$fd_no,$key} = $fh;
+}
+
+sub fh {
+    # Get file handle
+    my ($self, $fd_no, $key) = @_;
+    return $self->{'fd'}{$fd_no,$key};
+}
+
+sub write {
+    my $self = shift;
+    my $remaining_ref = shift;
+    my $stdin_fh = $self->fh(0,"w");
+    syswrite($stdin_fh,$$remaining_ref);
+}
+
+sub set_stdin_buffer {
+    # Copy stdin buffer from $block_ref up to $endpos
+    # Prepend with $header_ref
+    # Remove $recstart and $recend if needed
+    # Input:
+    #   $header_ref = ref to $header to prepend
+    #   $block_ref = ref to $block to pass on
+    #   $endpos = length of $block to pass on
+    #   $recstart = --recstart regexp
+    #   $recend = --recend regexp
+    # Returns:
+    #   N/A
+    my $self = shift;
+    my ($header_ref,$block_ref,$endpos,$recstart,$recend) = @_;
+    $self->{'stdin_buffer'} = ($self->virgin() ? $$header_ref : "").substr($$block_ref,0,$endpos);
+    if($opt::remove_rec_sep) {
+	remove_rec_sep(\$self->{'stdin_buffer'},$recstart,$recend);
+    }
+    $self->{'stdin_buffer_length'} = length $self->{'stdin_buffer'};
+    $self->{'stdin_buffer_pos'} = 0;
+}
+
+sub stdin_buffer_length {
+    my $self = shift;
+    return $self->{'stdin_buffer_length'};
+}
+
+sub remove_rec_sep {
+    my ($block_ref,$recstart,$recend) = @_;
+    # Remove record separator
+    $$block_ref =~ s/$recend$recstart//gos;
+    $$block_ref =~ s/^$recstart//os;
+    $$block_ref =~ s/$recend$//os;
+}
+
+sub non_block_write {
+    my $self = shift;
+    my $something_written = 0;
+    use POSIX qw(:errno_h);
+#    use Fcntl;
+#    my $flags = '';
+    for my $buf (substr($self->{'stdin_buffer'},$self->{'stdin_buffer_pos'})) {
+	my $in = $self->fh(0,"w");
+#	fcntl($in, F_GETFL, $flags)
+#	    or die "Couldn't get flags for HANDLE : $!\n";
+#	$flags |= O_NONBLOCK;
+#	fcntl($in, F_SETFL, $flags)
+#	    or die "Couldn't set flags for HANDLE: $!\n";
+	my $rv = syswrite($in, $buf);
+	if (!defined($rv) && $! == EAGAIN) {
+	    # would block
+	    $something_written = 0;
+	} elsif ($self->{'stdin_buffer_pos'}+$rv != $self->{'stdin_buffer_length'}) {
+	    # incomplete write
+	    # Remove the written part
+	    $self->{'stdin_buffer_pos'} += $rv;
+	    $something_written = $rv;
+	} else {
+	    # successfully wrote everything
+	    my $a="";
+	    $self->set_stdin_buffer(\$a,\$a,"","");
+	    $something_written = $rv;
+	}
+    }
+
+    ::debug("pipe", "Non-block: ", $something_written);
+    return $something_written;
+}
+
+
+sub virgin {
+    my $self = shift;
+    return $self->{'virgin'};
+}
+
+sub set_virgin {
+    my $self = shift;
+    $self->{'virgin'} = shift;
+}
+
+sub pid {
+    my $self = shift;
+    return $self->{'pid'};
+}
+
+sub set_pid {
+    my $self = shift;
+    $self->{'pid'} = shift;
+}
+
+sub starttime {
+    # Returns:
+    #   UNIX-timestamp this job started
+    my $self = shift;
+    return sprintf("%.3f",$self->{'starttime'});
+}
+
+sub set_starttime {
+    my $self = shift;
+    my $starttime = shift || ::now();
+    $self->{'starttime'} = $starttime;
+}
+
+sub runtime {
+    # Returns:
+    #   Run time in seconds
+    my $self = shift;
+    return sprintf("%.3f",int(($self->endtime() - $self->starttime())*1000)/1000);
+}
+
+sub endtime {
+    # Returns:
+    #   UNIX-timestamp this job ended
+    #   0 if not ended yet
+    my $self = shift;
+    return ($self->{'endtime'} || 0);
+}
+
+sub set_endtime {
+    my $self = shift;
+    my $endtime = shift;
+    $self->{'endtime'} = $endtime;
+}
+
+sub timedout {
+    # Is the job timedout?
+    # Input:
+    #   $delta_time = time that the job may run
+    # Returns:
+    #   True or false
+    my $self = shift;
+    my $delta_time = shift;
+    return time > $self->{'starttime'} + $delta_time;
+}
+
+sub kill {
+    # Kill the job.
+    # Send the signals to (grand)*children and pid.
+    # If no signals: TERM TERM KILL
+    # Wait 200 ms after each TERM.
+    # Input:
+    #   @signals = signals to send
+    my $self = shift;
+    my @signals = @_;
+    my @family_pids = $self->family_pids();
+    # Record this jobs as failed
+    $self->set_exitstatus(-1);
+    # Send two TERMs to give time to clean up
+    ::debug("run", "Kill seq ", $self->seq(), "\n");
+    my @send_signals = @signals || ("TERM", "TERM", "KILL");
+    for my $signal (@send_signals) {
+	my $alive = 0;
+	for my $pid (@family_pids) {
+	    if(kill 0, $pid) {
+		# The job still running
+		kill $signal, $pid;
+		$alive = 1;
+	    }
+	}
+	# If a signal was given as input, do not do the sleep below
+	@signals and next;
+
+	if($signal eq "TERM" and $alive) {
+	    # Wait up to 200 ms between TERMs - but only if any pids are alive
+	    my $sleep = 1;
+	    for (my $sleepsum = 0; kill 0, $family_pids[0] and $sleepsum < 200;
+		 $sleepsum += $sleep) {
+		$sleep = ::reap_usleep($sleep);
+	    }
+	}
+    }
+}
+
+sub family_pids {
+    # Find the pids with this->pid as (grand)*parent
+    # Returns:
+    #   @pids = pids of (grand)*children
+    my $self = shift;
+    my $pid = $self->pid();
+    my @pids;
+
+    my ($children_of_ref, $parent_of_ref, $name_of_ref) = ::pid_table();
+
+    my @more = ($pid);
+    # While more (grand)*children
+    while(@more) {
+	my @m;
+	push @pids, @more;
+	for my $parent (@more) {
+	    if($children_of_ref->{$parent}) {
+		# add the children of this parent
+		push @m, @{$children_of_ref->{$parent}};
+	    }
+	}
+	@more = @m;
+    }
+    return (@pids);
+}
+
+sub failed {
+    # return number of times failed for this $sshlogin
+    # Input:
+    #   $sshlogin
+    # Returns:
+    #   Number of times failed for $sshlogin
+    my $self = shift;
+    my $sshlogin = shift;
+    return $self->{'failed'}{$sshlogin};
+}
+
+sub failed_here {
+    # return number of times failed for the current $sshlogin
+    # Returns:
+    #   Number of times failed for this sshlogin
+    my $self = shift;
+    return $self->{'failed'}{$self->sshlogin()};
+}
+
+sub add_failed {
+    # increase the number of times failed for this $sshlogin
+    my $self = shift;
+    my $sshlogin = shift;
+    $self->{'failed'}{$sshlogin}++;
+}
+
+sub add_failed_here {
+    # increase the number of times failed for the current $sshlogin
+    my $self = shift;
+    $self->{'failed'}{$self->sshlogin()}++;
+}
+
+sub reset_failed {
+    # increase the number of times failed for this $sshlogin
+    my $self = shift;
+    my $sshlogin = shift;
+    delete $self->{'failed'}{$sshlogin};
+}
+
+sub reset_failed_here {
+    # increase the number of times failed for this $sshlogin
+    my $self = shift;
+    delete $self->{'failed'}{$self->sshlogin()};
+}
+
+sub min_failed {
+    # Returns:
+    #   the number of sshlogins this command has failed on
+    #   the minimal number of times this command has failed
+    my $self = shift;
+    my $min_failures =
+	::min(map { $self->{'failed'}{$_} } keys %{$self->{'failed'}});
+    my $number_of_sshlogins_failed_on = scalar keys %{$self->{'failed'}};
+    return ($number_of_sshlogins_failed_on,$min_failures);
+}
+
+sub total_failed {
+    # Returns:
+    #   $total_failures = the number of times this command has failed
+    my $self = shift;
+    my $total_failures = 0;
+    for (values %{$self->{'failed'}}) {
+	$total_failures += $_;
+    }
+    return $total_failures;
+}
+
+sub wrapped {
+    # Wrap command with:
+    # * --shellquote
+    # * --nice
+    # * --cat
+    # * --fifo
+    # * --sshlogin
+    # * --pipepart (@Global::cat_partials)
+    # * --pipe
+    # * --tmux
+    # The ordering of the wrapping is important:
+    # * --nice/--cat/--fifo should be done on the remote machine
+    # * --pipepart/--pipe should be done on the local machine inside --tmux
+    # Uses:
+    #   $Global::envvar
+    #   $opt::shellquote
+    #   $opt::nice
+    #   $Global::shell
+    #   $opt::cat
+    #   $opt::fifo
+    #   @Global::cat_partials
+    #   $opt::pipe
+    #   $opt::tmux
+    # Returns:
+    #   $self->{'wrapped'} = the command wrapped with the above
+    my $self = shift;
+    if(not defined $self->{'wrapped'}) {
+	my $command = $Global::envvar.$self->replaced();
+	if($opt::shellquote) {
+	    # Prepend echo
+	    # and quote twice
+	    $command = "echo " .
+		::shell_quote_scalar(::shell_quote_scalar($command));
+	}
+	if($opt::nice) {
+	    # Prepend \nice -n19 $SHELL -c
+	    # and quote.
+	    # The '\' before nice is needed to avoid tcsh's built-in
+	    $command = '\nice'. " -n". $opt::nice. " ".
+		$Global::shell. " -c ".
+		::shell_quote_scalar($command);
+	}
+	if($opt::cat) {
+	    # Prepend 'cat > {};'
+	    # Append '_EXIT=$?;(rm {};exit $_EXIT)'
+	    $command =
+		$self->{'commandline'}->replace_placeholders(["cat > \257<\257>; "], 0, 0).
+		$command.
+		$self->{'commandline'}->replace_placeholders(
+		    ["; _EXIT=\$?; rm \257<\257>; exit \$_EXIT"], 0, 0);
+	} elsif($opt::fifo) {
+	    # Prepend 'mkfifo {}; ('
+	    # Append ') & _PID=$!; cat > {}; wait $_PID; _EXIT=$?;(rm {};exit $_EXIT)'
+	    $command =
+		$self->{'commandline'}->replace_placeholders(["mkfifo \257<\257>; ("], 0, 0).
+		$command.
+		$self->{'commandline'}->replace_placeholders([") & _PID=\$!; cat > \257<\257>; ",
+					    "wait \$_PID; _EXIT=\$?; ",
+					    "rm \257<\257>; exit \$_EXIT"],
+					    0,0);
+	}
+	# Wrap with ssh + tranferring of files
+	$command = $self->sshlogin_wrap($command);
+	if(@Global::cat_partials) {
+	    # Prepend:
+	    # < /tmp/foo perl -e 'while(@ARGV) { sysseek(STDIN,shift,0) || die; $left = shift; while($read = sysread(STDIN,$buf, ($left > 32768 ? 32768 : $left))){ $left -= $read; syswrite(STDOUT,$buf); } }'  0 0 0 11 |
+	    $command = (shift @Global::cat_partials). "|". "(". $command. ")";
+	} elsif($opt::pipe) {
+	    # Prepend EOF-detector to avoid starting $command if EOF.
+	    # The $tmpfile might exist if run on a remote system - we accept that risk
+	    my ($dummy_fh, $tmpfile) = ::tmpfile(SUFFIX => ".chr");
+	    # Unlink to avoid leaving files if --dry-run or --sshlogin
+	    unlink $tmpfile;
+	    $command =
+		# Exit value:
+		#   empty input = true
+		#   some input = exit val from command
+		qq{ sh -c 'dd bs=1 count=1 of=$tmpfile 2>/dev/null'; }.
+		qq{ test \! -s "$tmpfile" && rm -f "$tmpfile" && exec true; }.
+		qq{ (cat $tmpfile; rm $tmpfile; cat - ) | }.
+		"($command);";
+	}
+	if($opt::tmux) {
+	    # Wrap command with 'tmux'
+	    $command = $self->tmux_wrap($command);
+	}
+	$self->{'wrapped'} = $command;
+    }
+    return $self->{'wrapped'};
+}
+
+sub set_sshlogin {
+    my $self = shift;
+    my $sshlogin = shift;
+    $self->{'sshlogin'} = $sshlogin;
+    delete $self->{'sshlogin_wrap'}; # If sshlogin is changed the wrap is wrong
+    delete $self->{'wrapped'};
+}
+
+sub sshlogin {
+    my $self = shift;
+    return $self->{'sshlogin'};
+}
+
+sub sshlogin_wrap {
+    # Wrap the command with the commands needed to run remotely
+    # Returns:
+    #   $self->{'sshlogin_wrap'} = command wrapped with ssh+transfer commands
+    my $self = shift;
+    my $command = shift;
+    if(not defined $self->{'sshlogin_wrap'}) {
+	my $sshlogin = $self->sshlogin();
+	my $sshcmd = $sshlogin->sshcommand();
+	my $serverlogin = $sshlogin->serverlogin();
+	my ($pre,$post,$cleanup)=("","","");
+
+	if($serverlogin eq ":") {
+	    # No transfer neeeded
+	    $self->{'sshlogin_wrap'} = $command;
+	} else {
+	    # --transfer
+	    $pre .= $self->sshtransfer();
+	    # --return
+	    $post .= $self->sshreturn();
+	    # --cleanup
+	    $post .= $self->sshcleanup();
+	    if($post) {
+		# We need to save the exit status of the job
+		$post = '_EXIT_status=$?; ' . $post . ' exit $_EXIT_status;';
+	    }
+	    # If the remote login shell is (t)csh then use 'setenv'
+	    # otherwise use 'export'
+	    # We cannot use parse_env_var(), as PARALLEL_SEQ changes
+	    # for each command
+	    my $parallel_env =
+		($Global::envwarn
+		 . q{ 'eval `echo $SHELL | grep "/t\\{0,1\\}csh" > /dev/null }
+		 . q{ && echo setenv PARALLEL_SEQ '$PARALLEL_SEQ'\; }
+		 . q{ setenv PARALLEL_PID '$PARALLEL_PID' }
+		 . q{ || echo PARALLEL_SEQ='$PARALLEL_SEQ'\;export PARALLEL_SEQ\; }
+		 . q{ PARALLEL_PID='$PARALLEL_PID'\;export PARALLEL_PID` ;' });
+	    my $remote_pre = "";
+	    my $ssh_options = "";
+	    if(($opt::pipe or $opt::pipepart) and $opt::ctrlc
+	       or
+	       not ($opt::pipe or $opt::pipepart) and not $opt::noctrlc) {
+		# TODO Determine if this is needed
+		# Propagating CTRL-C to kill remote jobs requires
+		# remote jobs to be run with a terminal.
+		$ssh_options = "-tt -oLogLevel=quiet";
+#		$ssh_options = "";
+		# tty - check if we have a tty.
+		# stty:
+		#   -onlcr - make output 8-bit clean
+		#   isig - pass CTRL-C as signal
+		#   -echo - do not echo input
+		$remote_pre .= ::shell_quote_scalar('tty >/dev/null && stty isig -onlcr -echo;');
+	    }
+	    if($opt::workdir) {
+		my $wd = ::shell_quote_file($self->workdir());
+		$remote_pre .= ::shell_quote_scalar("mkdir -p ") . $wd .
+		    ::shell_quote_scalar("; cd ") . $wd .
+		    # exit 255 (instead of exec false) would be the correct thing,
+		    # but that fails on tcsh
+		    ::shell_quote_scalar(qq{ || exec false;});
+	    }
+	    # This script is to solve the problem of
+	    # * not mixing STDERR and STDOUT
+	    # * terminating with ctrl-c
+	    # It works on Linux but not Solaris
+	    # Finishes on Solaris, but wrong exit code:
+	    # $SIG{CHLD} = sub {exit ($?&127 ? 128+($?&127) : 1+$?>>8)};
+	    # Hangs on Solaris, but correct exit code on Linux:
+	    # $SIG{CHLD} = sub { $done = 1 };
+	    # $p->poll;
+	    my $signal_script = "perl -e '".
+	    q{
+		use IO::Poll;
+                $SIG{CHLD} = sub { $done = 1 };
+		$p = IO::Poll->new;
+		$p->mask(STDOUT, POLLHUP);
+		$pid=fork; unless($pid) {setpgrp; exec $ENV{SHELL}, "-c", @ARGV; die "exec: $!\n"}
+                $p->poll;
+		kill SIGHUP, -${pid} unless $done;
+		wait; exit ($?&127 ? 128+($?&127) : 1+$?>>8)
+            } . "' ";
+	    $signal_script =~ s/\s+/ /g;
+
+	    $self->{'sshlogin_wrap'} =
+		($pre
+		 . "$sshcmd $ssh_options $serverlogin $parallel_env "
+		 . $remote_pre
+#		 . ::shell_quote_scalar($signal_script . ::shell_quote_scalar($command))
+		 . ::shell_quote_scalar($command)
+		 . ";"
+		 . $post);
+	}
+    }
+    return $self->{'sshlogin_wrap'};
+}
+
+sub transfer {
+    # Files to transfer
+    # Returns:
+    #   @transfer - File names of files to transfer
+    my $self = shift;
+    my @transfer = ();
+    $self->{'transfersize'} = 0;
+    if($opt::transfer) {
+	for my $record (@{$self->{'commandline'}{'arg_list'}}) {
+	    # Merge arguments from records into args
+	    for my $arg (@$record) {
+		CORE::push @transfer, $arg->orig();
+		# filesize
+		if(-e $arg->orig()) {
+		    $self->{'transfersize'} += (stat($arg->orig()))[7];
+		}
+	    }
+	}
+    }
+    return @transfer;
+}
+
+sub transfersize {
+    my $self = shift;
+    return $self->{'transfersize'};
+}
+
+sub sshtransfer {
+    # Returns for each transfer file:
+    #   rsync $file remote:$workdir
+    my $self = shift;
+    my @pre;
+    my $sshlogin = $self->sshlogin();
+    my $workdir = $self->workdir();
+    for my $file ($self->transfer()) {
+      push @pre, $sshlogin->rsync_transfer_cmd($file,$workdir).";";
+    }
+    return join("",@pre);
+}
+
+sub return {
+    # Files to return
+    # Non-quoted and with {...} substituted
+    # Returns:
+    #   @non_quoted_filenames
+    my $self = shift;
+    return $self->{'commandline'}->
+	replace_placeholders($self->{'commandline'}{'return_files'},0,0);
+}
+
+sub returnsize {
+    # This is called after the job has finished
+    # Returns:
+    #   $number_of_bytes transferred in return
+    my $self = shift;
+    for my $file ($self->return()) {
+	if(-e $file) {
+	    $self->{'returnsize'} += (stat($file))[7];
+	}
+    }
+    return $self->{'returnsize'};
+}
+
+sub sshreturn {
+    # Returns for each return-file:
+    #   rsync remote:$workdir/$file .
+    my $self = shift;
+    my $sshlogin = $self->sshlogin();
+    my $sshcmd = $sshlogin->sshcommand();
+    my $serverlogin = $sshlogin->serverlogin();
+    my $rsync_opt = "-rlDzR -e".::shell_quote_scalar($sshcmd);
+    my $pre = "";
+    for my $file ($self->return()) {
+	$file =~ s:^\./::g; # Remove ./ if any
+	my $relpath = ($file !~ m:^/:); # Is the path relative?
+	my $cd = "";
+	my $wd = "";
+	if($relpath) {
+	    #   rsync -avR /foo/./bar/baz.c remote:/tmp/
+	    # == (on old systems)
+	    #   rsync -avR --rsync-path="cd /foo; rsync" remote:bar/baz.c /tmp/
+	    $wd = ::shell_quote_file($self->workdir()."/");
+	}
+	# Only load File::Basename if actually needed
+	$Global::use{"File::Basename"} ||= eval "use File::Basename; 1;";
+	# dir/./file means relative to dir, so remove dir on remote
+	$file =~ m:(.*)/\./:;
+	my $basedir = $1 ? ::shell_quote_file($1."/") : "";
+	my $nobasedir = $file;
+	$nobasedir =~ s:.*/\./::;
+	$cd = ::shell_quote_file(::dirname($nobasedir));
+	my $rsync_cd = '--rsync-path='.::shell_quote_scalar("cd $wd$cd; rsync");
+	my $basename = ::shell_quote_scalar(::shell_quote_file(basename($file)));
+	# --return
+	#   mkdir -p /home/tange/dir/subdir/;
+        #   rsync (--protocol 30) -rlDzR --rsync-path="cd /home/tange/dir/subdir/; rsync"
+        #   server:file.gz /home/tange/dir/subdir/
+	$pre .= "mkdir -p $basedir$cd; ".$sshlogin->rsync()." $rsync_cd $rsync_opt $serverlogin:".
+	     $basename . " ".$basedir.$cd.";";
+    }
+    return $pre;
+}
+
+sub sshcleanup {
+    # Return the sshcommand needed to remove the file
+    # Returns:
+    #   ssh command needed to remove files from sshlogin
+    my $self = shift;
+    my $sshlogin = $self->sshlogin();
+    my $sshcmd = $sshlogin->sshcommand();
+    my $serverlogin = $sshlogin->serverlogin();
+    my $workdir = $self->workdir();
+    my $cleancmd = "";
+
+    for my $file ($self->cleanup()) {
+	my @subworkdirs = parentdirs_of($file);
+	$cleancmd .= $sshlogin->cleanup_cmd($file,$workdir).";";
+    }
+    if(defined $opt::workdir and $opt::workdir eq "...") {
+	$cleancmd .= "$sshcmd $serverlogin rm -rf " . ::shell_quote_scalar($workdir).';';
+    }
+    return $cleancmd;
+}
+
+sub cleanup {
+    # Returns:
+    #   Files to remove at cleanup
+    my $self = shift;
+    if($opt::cleanup) {
+	my @transfer = $self->transfer();
+	my @return = $self->return();
+	return (@transfer,@return);
+    } else {
+	return ();
+    }
+}
+
+sub workdir {
+    # Returns:
+    #   the workdir on a remote machine
+    my $self = shift;
+    if(not defined $self->{'workdir'}) {
+	my $workdir;
+	if(defined $opt::workdir) {
+	    if($opt::workdir eq ".") {
+		# . means current dir
+		my $home = $ENV{'HOME'};
+		eval 'use Cwd';
+		my $cwd = cwd();
+		$workdir = $cwd;
+		if($home) {
+		    # If homedir exists: remove the homedir from
+		    # workdir if cwd starts with homedir
+		    # E.g. /home/foo/my/dir => my/dir
+		    # E.g. /tmp/my/dir => /tmp/my/dir
+		    my ($home_dev, $home_ino) = (stat($home))[0,1];
+		    my $parent = "";
+		    my @dir_parts = split(m:/:,$cwd);
+		    my $part;
+		    while(defined ($part = shift @dir_parts)) {
+			$part eq "" and next;
+			$parent .= "/".$part;
+			my ($parent_dev, $parent_ino) = (stat($parent))[0,1];
+			if($parent_dev == $home_dev and $parent_ino == $home_ino) {
+			    # dev and ino is the same: We found the homedir.
+			    $workdir = join("/",@dir_parts);
+			    last;
+			}
+		    }
+		}
+		if($workdir eq "") {
+		    $workdir = ".";
+		}
+	    } elsif($opt::workdir eq "...") {
+		$workdir = ".parallel/tmp/" . ::hostname() . "-" . $$
+		    . "-" . $self->seq();
+	    } else {
+		$workdir = $opt::workdir;
+		# Rsync treats /./ special. We don't want that
+		$workdir =~ s:/\./:/:g; # Remove /./
+		$workdir =~ s:/+$::; # Remove ending / if any
+		$workdir =~ s:^\./::g; # Remove starting ./ if any
+	    }
+	} else {
+	    $workdir = ".";
+	}
+	$self->{'workdir'} = ::shell_quote_scalar($workdir);
+    }
+    return $self->{'workdir'};
+}
+
+sub parentdirs_of {
+    # Return:
+    #   all parentdirs except . of this dir or file - sorted desc by length
+    my $d = shift;
+    my @parents = ();
+    while($d =~ s:/[^/]+$::) {
+	if($d ne ".") {
+	    push @parents, $d;
+	}
+    }
+    return @parents;
+}
+
+sub start {
+    # Setup STDOUT and STDERR for a job and start it.
+    # Returns:
+    #   job-object or undef if job not to run
+    my $job = shift;
+    # Get the shell command to be executed (possibly with ssh infront).
+    my $command = $job->wrapped();
+
+    if($Global::interactive or $Global::stderr_verbose) {
+	if($Global::interactive) {
+	    print $Global::original_stderr "$command ?...";
+	    open(my $tty_fh, "<", "/dev/tty") || ::die_bug("interactive-tty");
+	    my $answer = <$tty_fh>;
+	    close $tty_fh;
+	    my $run_yes = ($answer =~ /^\s*y/i);
+	    if (not $run_yes) {
+		$command = "true"; # Run the command 'true'
+	    }
+	} else {
+	    print $Global::original_stderr "$command\n";
+	}
+    }
+
+    my $pid;
+    $job->openoutputfiles();
+    my($stdout_fh,$stderr_fh) = ($job->fh(1,"w"),$job->fh(2,"w"));
+    local (*IN,*OUT,*ERR);
+    open OUT, '>&', $stdout_fh or ::die_bug("Can't redirect STDOUT: $!");
+    open ERR, '>&', $stderr_fh or ::die_bug("Can't dup STDOUT: $!");
+
+    if(($opt::dryrun or $Global::verbose) and $opt::ungroup) {
+	if($Global::verbose <= 1) {
+	    print $stdout_fh $job->replaced(),"\n";
+	} else {
+	    # Verbose level > 1: Print the rsync and stuff
+	    print $stdout_fh $command,"\n";
+	}
+    }
+    if($opt::dryrun) {
+	$command = "true";
+    }
+    $ENV{'PARALLEL_SEQ'} = $job->seq();
+    $ENV{'PARALLEL_PID'} = $$;
+    ::debug("run", $Global::total_running, " processes . Starting (",
+	    $job->seq(), "): $command\n");
+    if($opt::pipe) {
+	my ($stdin_fh);
+	# The eval is needed to catch exception from open3
+	eval {
+	    $pid = ::open3($stdin_fh, ">&OUT", ">&ERR", $Global::shell, "-c", $command) ||
+		::die_bug("open3-pipe");
+	    1;
+	};
+	$job->set_fh(0,"w",$stdin_fh);
+    } elsif(@opt::a and not $Global::stdin_in_opt_a and $job->seq() == 1
+	    and $job->sshlogin()->string() eq ":") {
+	# Give STDIN to the first job if using -a (but only if running
+	# locally - otherwise CTRL-C does not work for other jobs Bug#36585)
+	*IN = *STDIN;
+	# The eval is needed to catch exception from open3
+	eval {
+	    $pid = ::open3("<&IN", ">&OUT", ">&ERR", $Global::shell, "-c", $command) ||
+		::die_bug("open3-a");
+	    1;
+	};
+	# Re-open to avoid complaining
+	open(STDIN, "<&", $Global::original_stdin)
+	    or ::die_bug("dup-\$Global::original_stdin: $!");
+    } elsif ($opt::tty and not $Global::tty_taken and -c "/dev/tty" and
+	     open(my $devtty_fh, "<", "/dev/tty")) {
+	# Give /dev/tty to the command if no one else is using it
+	*IN = $devtty_fh;
+	# The eval is needed to catch exception from open3
+	eval {
+	    $pid = ::open3("<&IN", ">&OUT", ">&ERR", $Global::shell, "-c", $command) ||
+		::die_bug("open3-/dev/tty");
+	    $Global::tty_taken = $pid;
+	    close $devtty_fh;
+	    1;
+	};
+    } else {
+	# The eval is needed to catch exception from open3
+	eval {
+	    $pid = ::open3(::gensym, ">&OUT", ">&ERR", $Global::shell, "-c", $command) ||
+		::die_bug("open3-gensym");
+	    1;
+	};
+    }
+    if($pid) {
+	# A job was started
+	$Global::total_running++;
+	$Global::total_started++;
+	$job->set_pid($pid);
+	$job->set_starttime();
+	$Global::running{$job->pid()} = $job;
+	if($opt::timeout) {
+	    $Global::timeoutq->insert($job);
+	}
+	$Global::newest_job = $job;
+	$Global::newest_starttime = ::now();
+	return $job;
+    } else {
+	# No more processes
+	::debug("run", "Cannot spawn more jobs.\n");
+	return undef;
+    }
+}
+
+sub tmux_wrap {
+    # Wrap command with tmux for session pPID
+    # Input:
+    #   $actual_command = the actual command being run (incl ssh wrap)
+    my $self = shift;
+    my $actual_command = shift;
+    # Temporary file name. Used for fifo to communicate exit val
+    my ($fh, $tmpfile) = ::tmpfile(SUFFIX => ".tmx");
+    $Global::unlink{$tmpfile}=1;
+    close $fh;
+    unlink $tmpfile;
+    my $visual_command = $self->replaced();
+    my $title = $visual_command;
+    # ; causes problems
+    # ascii 194-245 annoys tmux
+    $title =~ tr/[\011-\016;\302-\365]//d;
+
+    my $tmux;
+    if($Global::total_running == 0) {
+	$tmux = "tmux new-session -s p$$ -d -n ".
+	    ::shell_quote_scalar($title);
+	print $Global::original_stderr "See output with: tmux attach -t p$$\n";
+    } else {
+	$tmux = "tmux new-window -t p$$ -n ".::shell_quote_scalar($title);
+    }
+    return "mkfifo $tmpfile; $tmux ".
+	# Run in tmux
+	::shell_quote_scalar(
+	"(".$actual_command.');(echo $?$status;echo 255) >'.$tmpfile."&".
+	"echo ".::shell_quote_scalar($visual_command).";".
+	"echo \007Job finished at: `date`;sleep 10").
+	# Run outside tmux
+	# Read the first line from the fifo and use that as status code
+	";  exit `perl -ne 'unlink \$ARGV; 1..1 and print' $tmpfile` ";
+}
+
+sub is_already_in_results {
+    # Do we already have results for this job?
+    # Returns:
+    #   $job_already_run = bool whether there is output for this or not
+    my $job = $_[0];
+    my $args_as_dirname = $job->{'commandline'}->args_as_dirname();
+    # prefix/name1/val1/name2/val2/
+    my $dir = $opt::results."/".$args_as_dirname;
+    ::debug("run", "Test $dir/stdout", -e "$dir/stdout", "\n");
+    return -e "$dir/stdout";
+}
+
+sub is_already_in_joblog {
+    my $job = shift;
+    return vec($Global::job_already_run,$job->seq(),1);
+}
+
+sub set_job_in_joblog {
+    my $job = shift;
+    vec($Global::job_already_run,$job->seq(),1) = 1;
+}
+
+sub should_be_retried {
+    # Should this job be retried?
+    # Returns
+    #   0 - do not retry
+    #   1 - job queued for retry
+    my $self = shift;
+    if (not $opt::retries) {
+	return 0;
+    }
+    if(not $self->exitstatus()) {
+	# Completed with success. If there is a recorded failure: forget it
+	$self->reset_failed_here();
+	return 0
+    } else {
+	# The job failed. Should it be retried?
+	$self->add_failed_here();
+	if($self->total_failed() == $opt::retries) {
+	    # This has been retried enough
+	    return 0;
+	} else {
+	    # This command should be retried
+	    $self->set_endtime(undef);
+	    $Global::JobQueue->unget($self);
+	    ::debug("run", "Retry ", $self->seq(), "\n");
+	    return 1;
+	}
+    }
+}
+
+sub print {
+    # Print the output of the jobs
+    # Returns: N/A
+
+    my $self = shift;
+    ::debug("print", ">>joboutput ", $self->replaced(), "\n");
+    if($opt::dryrun) {
+	# Nothing was printed to this job:
+	# cleanup tmp files if --files was set
+	unlink $self->fh(1,"name");
+    }
+    if($opt::pipe and $self->virgin()) {
+	# Skip --joblog, --dryrun, --verbose
+    } else {
+	if($Global::joblog and defined $self->{'exitstatus'}) {
+	    # Add to joblog when finished
+	    $self->print_joblog();
+	}
+
+	# Printing is only relevant for grouped/--line-buffer output.
+	$opt::ungroup and return;
+	# Check for disk full
+	exit_if_disk_full();
+
+	if(($opt::dryrun or $Global::verbose)
+	   and
+	   not $self->{'verbose_printed'}) {
+	    $self->{'verbose_printed'}++;
+	    if($Global::verbose <= 1) {
+		print STDOUT $self->replaced(),"\n";
+	    } else {
+		# Verbose level > 1: Print the rsync and stuff
+		print STDOUT $self->wrapped(),"\n";
+	    }
+	    # If STDOUT and STDERR are merged,
+	    # we want the command to be printed first
+	    # so flush to avoid STDOUT being buffered
+	    flush STDOUT;
+	}
+    }
+    for my $fdno (sort { $a <=> $b } keys %Global::fd) {
+	# Sort by file descriptor numerically: 1,2,3,..,9,10,11
+	$fdno == 0 and next;
+	my $out_fd = $Global::fd{$fdno};
+	my $in_fh = $self->fh($fdno,"r");
+	if(not $in_fh) {
+	    if(not $Job::file_descriptor_warning_printed{$fdno}++) {
+		# ::warning("File descriptor $fdno not defined\n");
+	    }
+	    next;
+	}
+	::debug("print", "File descriptor $fdno (", $self->fh($fdno,"name"), "):");
+	if($opt::files) {
+	    # If --compress: $in_fh must be closed first.
+	    close $self->fh($fdno,"w");
+	    close $in_fh;
+	    if($opt::pipe and $self->virgin()) {
+		# Nothing was printed to this job:
+		# cleanup unused tmp files if --files was set
+		for my $fdno (1,2) {
+		    unlink $self->fh($fdno,"name");
+		    unlink $self->fh($fdno,"unlink");
+		}
+	    } elsif($fdno == 1 and $self->fh($fdno,"name")) {
+		print $out_fd $self->fh($fdno,"name"),"\n";
+	    }
+	} elsif($opt::linebuffer) {
+	    # Line buffered print out
+	    $self->linebuffer_print($fdno,$in_fh,$out_fd);
+	} else {
+	    my $buf;
+	    close $self->fh($fdno,"w");
+	    seek $in_fh, 0, 0;
+	    # $in_fh is now ready for reading at position 0
+	    if($opt::tag or defined $opt::tagstring) {
+		my $tag = $self->tag();
+		if($fdno == 2) {
+		    # OpenSSH_3.6.1p2 gives 'tcgetattr: Invalid argument' with -tt
+		    # This is a crappy way of ignoring it.
+		    while(<$in_fh>) {
+			if(/^(client_process_control: )?tcgetattr: Invalid argument\n/) {
+			    # Skip
+			} else {
+			    print $out_fd $tag,$_;
+			}
+			# At most run the loop once
+			last;
+		    }
+		}
+		while(<$in_fh>) {
+		    print $out_fd $tag,$_;
+		}
+	    } else {
+		my $buf;
+		if($fdno == 2) {
+		    # OpenSSH_3.6.1p2 gives 'tcgetattr: Invalid argument' with -tt
+		    # This is a crappy way of ignoring it.
+		    sysread($in_fh,$buf,1_000);
+		    $buf =~ s/^(client_process_control: )?tcgetattr: Invalid argument\n//;
+		    print $out_fd $buf;
+		}
+		while(sysread($in_fh,$buf,32768)) {
+		    print $out_fd $buf;
+		}
+	    }
+	    close $in_fh;
+	}
+	flush $out_fd;
+    }
+    ::debug("print", "<<joboutput @command\n");
+}
+
+sub linebuffer_print {
+    my $self = shift;
+    my ($fdno,$in_fh,$out_fd) = @_;
+    my $partial = \$self->{'partial_line',$fdno};
+
+    if(defined $self->{'exitstatus'}) {
+	# If the job is dead: close printing fh. Needed for --compress
+	close $self->fh($fdno,"w");
+	if($opt::compress) {
+	    # Blocked reading in final round
+	    $Global::use{"Fcntl"} ||= eval "use Fcntl qw(:DEFAULT :flock); 1;";
+	    for my $fdno (1,2) {
+		my $fdr = $self->fh($fdno,'r');
+		my $flags;
+		fcntl($fdr, &F_GETFL, $flags) || die $!; # Get the current flags on the filehandle
+		$flags &= ~&O_NONBLOCK; # Remove non-blocking to the flags
+		fcntl($fdr, &F_SETFL, $flags) || die $!; # Set the flags on the filehandle
+	    }
+	}
+    }
+    # This seek will clear EOF
+    seek $in_fh, tell($in_fh), 0;
+    # The read is non-blocking: The $in_fh is set to non-blocking.
+    # 32768 --tag = 5.1s
+    # 327680 --tag = 4.4s
+    # 1024000 --tag = 4.4s
+    # 3276800 --tag = 4.3s
+    # 32768000 --tag = 4.7s
+    # 10240000 --tag = 4.3s
+    while(read($in_fh,substr($$partial,length $$partial),3276800)) {
+	# Append to $$partial
+	# Find the last \n
+	my $i = rindex($$partial,"\n");
+	if($i != -1) {
+	    # One or more complete lines were found
+	    if($fdno == 2 and not $self->{'printed_first_line',$fdno}++) {
+		# OpenSSH_3.6.1p2 gives 'tcgetattr: Invalid argument' with -tt
+		# This is a crappy way of ignoring it.
+		$$partial =~ s/^(client_process_control: )?tcgetattr: Invalid argument\n//;
+		# Length of partial line has changed: Find the last \n again
+		$i = rindex($$partial,"\n");
+	    }
+	    if($opt::tag or defined $opt::tagstring) {
+		# Replace ^ with $tag within the full line
+		my $tag = $self->tag();
+		substr($$partial,0,$i+1) =~ s/^/$tag/gm;
+		# Length of partial line has changed: Find the last \n again
+		$i = rindex($$partial,"\n");
+	    }
+	    # Print up to and including the last \n
+	    print $out_fd substr($$partial,0,$i+1);
+	    # Remove the printed part
+	    substr($$partial,0,$i+1)="";
+	}
+    }
+    if(defined $self->{'exitstatus'}) {
+	# If the job is dead: print the remaining partial line
+	# read remaining
+	if($$partial and ($opt::tag or defined $opt::tagstring)) {
+	    my $tag = $self->tag();
+	    $$partial =~ s/^/$tag/gm;
+	}
+	print $out_fd $$partial;
+	# Release the memory
+	$$partial = undef;
+	if($self->fh($fdno,"rpid") and CORE::kill 0, $self->fh($fdno,"rpid")) {
+	    # decompress still running
+	} else {
+	    # decompress done: close fh
+	    close $in_fh;
+	}
+    }
+}
+
+sub print_joblog {
+    my $self = shift;
+    my $cmd;
+    if($Global::verbose <= 1) {
+	$cmd = $self->replaced();
+    } else {
+	# Verbose level > 1: Print the rsync and stuff
+	$cmd = "@command";
+    }
+    print $Global::joblog
+	join("\t", $self->seq(), $self->sshlogin()->string(),
+	     $self->starttime(), sprintf("%10.3f",$self->runtime()),
+	     $self->transfersize(), $self->returnsize(),
+	     $self->exitstatus(), $self->exitsignal(), $cmd
+	). "\n";
+    flush $Global::joblog;
+    $self->set_job_in_joblog();
+}
+
+sub tag {
+    my $self = shift;
+    if(not defined $self->{'tag'}) {
+	$self->{'tag'} = $self->{'commandline'}->
+	    replace_placeholders([$opt::tagstring],0,0)."\t";
+    }
+    return $self->{'tag'};
+}
+
+sub hostgroups {
+    my $self = shift;
+    if(not defined $self->{'hostgroups'}) {
+	$self->{'hostgroups'} = $self->{'commandline'}->{'arg_list'}[0][0]->{'hostgroups'};
+    }
+    return @{$self->{'hostgroups'}};
+}
+
+sub exitstatus {
+    my $self = shift;
+    return $self->{'exitstatus'};
+}
+
+sub set_exitstatus {
+    my $self = shift;
+    my $exitstatus = shift;
+    if($exitstatus) {
+	# Overwrite status if non-zero
+	$self->{'exitstatus'} = $exitstatus;
+    } else {
+	# Set status but do not overwrite
+	# Status may have been set by --timeout
+	$self->{'exitstatus'} ||= $exitstatus;
+    }
+}
+
+sub exitsignal {
+    my $self = shift;
+    return $self->{'exitsignal'};
+}
+
+sub set_exitsignal {
+    my $self = shift;
+    my $exitsignal = shift;
+    $self->{'exitsignal'} = $exitsignal;
+}
+
+{
+    my ($disk_full_fh, $b8193, $name);
+    sub exit_if_disk_full {
+	# Checks if $TMPDIR is full by writing 8kb to a tmpfile
+	# If the disk is full: Exit immediately.
+	# Returns:
+	#   N/A
+	if(not $disk_full_fh) {
+	    ($disk_full_fh, $name) = ::tmpfile(SUFFIX => ".df");
+	    unlink $name;
+	    $b8193 = "x"x8193;
+	}
+	# Linux does not discover if a disk is full if writing <= 8192
+	# Tested on:
+	# bfs btrfs cramfs ext2 ext3 ext4 ext4dev jffs2 jfs minix msdos
+	# ntfs reiserfs tmpfs ubifs vfat xfs
+	# TODO this should be tested on different OS similar to this:
+	#
+	# doit() {
+	#   sudo mount /dev/ram0 /mnt/loop; sudo chmod 1777 /mnt/loop
+	#   seq 100000 | parallel --tmpdir /mnt/loop/ true &
+	#   seq 6900000 > /mnt/loop/i && echo seq OK
+	#   seq 6980868 > /mnt/loop/i
+	#   seq 10000 > /mnt/loop/ii
+	#   sleep 3
+	#   sudo umount /mnt/loop/ || sudo umount -l /mnt/loop/
+	#   echo >&2
+	# }
+	print $disk_full_fh $b8193;
+	if(not $disk_full_fh
+	   or
+	   tell $disk_full_fh == 0) {
+	    ::error("Output is incomplete. Cannot append to buffer file in $ENV{'TMPDIR'}. Is the disk full?\n");
+	    ::error("Change \$TMPDIR with --tmpdir or use --compress.\n");
+	    ::wait_and_exit(255);
+	}
+	truncate $disk_full_fh, 0;
+	seek($disk_full_fh, 0, 0) || die;
+    }
+}
+
+
+package CommandLine;
+
+sub new {
+    my $class = shift;
+    my $seq = shift;
+    my $commandref = shift;
+    $commandref || die;
+    my $arg_queue = shift;
+    my $context_replace = shift;
+    my $max_number_of_args = shift; # for -N and normal (-n1)
+    my $return_files = shift;
+    my $replacecount_ref = shift;
+    my $len_ref = shift;
+    my %replacecount = %$replacecount_ref;
+    my %len = %$len_ref;
+    for (keys %$replacecount_ref) {
+	# Total length of this replacement string {} replaced with all args
+	$len{$_} = 0;
+    }
+    return bless {
+	'command' => $commandref,
+	'seq' => $seq,
+	'len' => \%len,
+	'arg_list' => [],
+	'arg_queue' => $arg_queue,
+	'max_number_of_args' => $max_number_of_args,
+	'replacecount' => \%replacecount,
+	'context_replace' => $context_replace,
+	'return_files' => $return_files,
+	'replaced' => undef,
+    }, ref($class) || $class;
+}
+
+sub seq {
+    my $self = shift;
+    return $self->{'seq'};
+}
+
+{
+    my $max_slot_number;
+
+    sub slot {
+	# Find the number of a free job slot and return it
+	# Uses:
+	#   @Global::slots
+	# Returns:
+	#   $jobslot = number of jobslot
+	my $self = shift;
+	if(not $self->{'slot'}) {
+	    if(not @Global::slots) {
+		# $Global::max_slot_number will typically be $Global::max_jobs_running
+		push @Global::slots, ++$max_slot_number;
+	    }
+	    $self->{'slot'} = shift @Global::slots;
+	}
+	return $self->{'slot'};
+    }
+}
+
+sub populate {
+    # Add arguments from arg_queue until the number of arguments or
+    # max line length is reached
+    # Uses:
+    #   $Global::minimal_command_line_length
+    #   $opt::cat
+    #   $opt::fifo
+    #   $Global::JobQueue
+    #   $opt::m
+    #   $opt::X
+    #   $CommandLine::already_spread
+    #   $Global::max_jobs_running
+    # Returns: N/A
+    my $self = shift;
+    my $next_arg;
+    my $max_len = $Global::minimal_command_line_length || Limits::Command::max_length();
+
+    if($opt::cat or $opt::fifo) {
+	# Generate a tempfile name that will be used as {}
+	my($outfh,$name) = ::tmpfile(SUFFIX => ".pip");
+	close $outfh;
+	# Unlink is needed if: ssh otheruser@localhost
+	unlink $name;
+	$Global::JobQueue->{'commandlinequeue'}->{'arg_queue'}->unget([Arg->new($name)]);
+    }
+
+    while (not $self->{'arg_queue'}->empty()) {
+	$next_arg = $self->{'arg_queue'}->get();
+	if(not defined $next_arg) {
+	    next;
+	}
+	$self->push($next_arg);
+	if($self->len() >= $max_len) {
+	    # Command length is now > max_length
+	    # If there are arguments: remove the last
+	    # If there are no arguments: Error
+	    # TODO stuff about -x opt_x
+	    if($self->number_of_args() > 1) {
+		# There is something to work on
+		$self->{'arg_queue'}->unget($self->pop());
+		last;
+	    } else {
+		my $args = join(" ", map { $_->orig() } @$next_arg);
+		::error("Command line too long (",
+			$self->len(), " >= ",
+			$max_len,
+			") at number ",
+			$self->{'arg_queue'}->arg_number(),
+			": ".
+			(substr($args,0,50))."...\n");
+		$self->{'arg_queue'}->unget($self->pop());
+		::wait_and_exit(255);
+	    }
+	}
+
+	if(defined $self->{'max_number_of_args'}) {
+	    if($self->number_of_args() >= $self->{'max_number_of_args'}) {
+		last;
+	    }
+	}
+    }
+    if(($opt::m or $opt::X) and not $CommandLine::already_spread
+       and $self->{'arg_queue'}->empty() and $Global::max_jobs_running) {
+	# -m or -X and EOF => Spread the arguments over all jobslots
+	# (unless they are already spread)
+	$CommandLine::already_spread ||= 1;
+	if($self->number_of_args() > 1) {
+	    $self->{'max_number_of_args'} =
+		::ceil($self->number_of_args()/$Global::max_jobs_running);
+	    $Global::JobQueue->{'commandlinequeue'}->{'max_number_of_args'} =
+		$self->{'max_number_of_args'};
+	    $self->{'arg_queue'}->unget($self->pop_all());
+	    while($self->number_of_args() < $self->{'max_number_of_args'}) {
+		$self->push($self->{'arg_queue'}->get());
+	    }
+	}
+    }
+}
+
+sub push {
+    # Add one or more records as arguments
+    # Returns: N/A
+    my $self = shift;
+    my $record = shift;
+    push @{$self->{'arg_list'}}, $record;
+
+    my $quote_arg = $Global::noquote ? 0 : not $Global::quoting;
+    my $rep;
+    for my $arg (@$record) {
+	if(defined $arg) {
+	    for my $perlexpr (keys %{$self->{'replacecount'}}) {
+		# 50% faster than below
+		$self->{'len'}{$perlexpr} += length $arg->replace($perlexpr,$quote_arg,$self);
+		# $rep = $arg->replace($perlexpr,$quote_arg,$self);
+		# $self->{'len'}{$perlexpr} += length $rep;
+		# ::debug("length", "Length: ", length $rep,
+		# "(", $perlexpr, "=>", $rep, ")\n");
+	    }
+	}
+    }
+}
+
+sub pop {
+    # Remove last argument
+    # Returns:
+    #   the last record
+    my $self = shift;
+    my $record = pop @{$self->{'arg_list'}};
+    my $quote_arg = $Global::noquote ? 0 : not $Global::quoting;
+    for my $arg (@$record) {
+	if(defined $arg) {
+	    for my $perlexpr (keys %{$self->{'replacecount'}}) {
+		$self->{'len'}{$perlexpr} -=
+		    length $arg->replace($perlexpr,$quote_arg,$self);
+	    }
+	}
+    }
+    return $record;
+}
+
+sub pop_all {
+    # Remove all arguments and zeros the length of replacement strings
+    # Returns:
+    #   all records
+    my $self = shift;
+    my @popped = @{$self->{'arg_list'}};
+    for my $replacement_string (keys %{$self->{'replacecount'}}) {
+	$self->{'len'}{$replacement_string} = 0;
+    }
+    $self->{'arg_list'} = [];
+    return @popped;
+}
+
+sub number_of_args {
+    # The number of records
+    # Returns:
+    #   number of records
+    my $self = shift;
+    # Ftq rudef oaawuq ime dqxqmeqp az 2011-01-24 mzp ime iaz nk MQhmd
+    # Mdzrvadp Nvmdymeaz az 2011-04-10. Ftue oaawuq dqxqmeqp az
+    # 2013-08-18 ue m nuf tmdpqd me kag tmhq fa geq daf14. Bxqmeq
+    # qymux oaawuq@fmzsq.pw itqz kag dqmp ftue.
+    #
+    # U my ftq ymuzfmuzqd ar m buqoq ar rdqq earfimdq omxxqp SZG
+    # Bmdmxxqx. Rdqq earfimdq sgmdmzfqqe kag mooqee fa ftq eagdoq
+    # oapq, ngf U tmhq nqqz iazpqduzs tai ymzk mofgmxxk _dqmp_ ftq
+    # eagdoq oapq.
+    #
+    # Fa fqef ftue U bgf uz m oayyqzf fqxxuzs bqabxq fa qymux yq itqz
+    # ftqk dqmp ftue. Ftq oayyqzf ime bgf uz m eqofuaz ar ftq oapq
+    # ftmf za azq iagxp xaaw fa ruj ad uybdahq ftq earfimdq - ea ftq
+    # eagdoq oapq qcguhmxqzf fa m pgefk oadzqd. Fa ymwq egdq ftq
+    # oayyqzf iagxp zaf etai gb ur eayq azq vgef sdqbbqp ftdagst ftq
+    # eagdoq oapq U daf13'qp ftq eagdoq oapq
+    # tffb://qz.iuwubqpum.ads/iuwu/DAF13
+    #
+    # 2.5 yazfte xmfqd U dqoquhqp mz qymux rday eayqazq ita zaf azxk
+    # ymzmsqp fa ruzp ftq oayyqzf, ngf mxea ymzmsqp fa sgqee ftq oapq
+    # tmp fa nq daf13'qp.
+    #
+    # Ftue nduzse yq fa ftq oazoxgeuaz ftmf ftqdq _mdq_ bqabxq, ita
+    # mdq zaf mrruxumfqp iuft ftq bdavqof, ftmf iuxx dqmp ftq eagdoq
+    # oapq - ftagst uf ymk zaf tmbbqz hqdk arfqz.
+    #
+    # This is really the number of records
+    return $#{$self->{'arg_list'}}+1;
+}
+
+sub number_of_recargs {
+    # The number of args in records
+    # Returns:
+    #   number of args records
+    my $self = shift;
+    my $sum = 0;
+    my $nrec = scalar @{$self->{'arg_list'}};
+    if($nrec) {
+	$sum = $nrec * (scalar @{$self->{'arg_list'}[0]});
+    }
+    return $sum;
+}
+
+sub args_as_string {
+    # Returns:
+    #  all unmodified arguments joined with ' ' (similar to {})
+    my $self = shift;
+    return (join " ", map { $_->orig() }
+	    map { @$_ } @{$self->{'arg_list'}});
+}
+
+sub args_as_dirname {
+    # Returns:
+    #  all unmodified arguments joined with '/' (similar to {})
+    #  \t \0 \\ and / are quoted as: \t \0 \\ \_
+    # If $Global::max_file_length: Keep subdirs < $Global::max_file_length
+    my $self = shift;
+    my @res = ();
+
+    for my $rec_ref (@{$self->{'arg_list'}}) {
+	# If headers are used, sort by them.
+	# Otherwise keep the order from the command line.
+	my @header_indexes_sorted = header_indexes_sorted($#$rec_ref+1);
+	for my $n (@header_indexes_sorted) {
+	    CORE::push(@res,
+		 $Global::input_source_header{$n},
+		 map { my $s = $_;
+		       #  \t \0 \\ and / are quoted as: \t \0 \\ \_
+		       $s =~ s/\\/\\\\/g;
+		       $s =~ s/\t/\\t/g;
+		       $s =~ s/\0/\\0/g;
+		       $s =~ s:/:\\_:g;
+		       if($Global::max_file_length) {
+			   # Keep each subdir shorter than the longest
+			   # allowed file name
+			   $s = substr($s,0,$Global::max_file_length);
+		       }
+		       $s; }
+		 $rec_ref->[$n-1]->orig());
+	}
+    }
+    return join "/", @res;
+}
+
+sub header_indexes_sorted {
+    # Sort headers first by number then by name.
+    # E.g.: 1a 1b 11a 11b
+    # Returns:
+    #  Indexes of %Global::input_source_header sorted
+    my $max_col = shift;
+
+    no warnings 'numeric';
+    for my $col (1 .. $max_col) {
+	# Make sure the header is defined. If it is not: use column number
+	if(not defined $Global::input_source_header{$col}) {
+	    $Global::input_source_header{$col} = $col;
+	}
+    }
+    my @header_indexes_sorted = sort {
+	# Sort headers numerically then asciibetically
+	$Global::input_source_header{$a} <=> $Global::input_source_header{$b}
+	or
+	    $Global::input_source_header{$a} cmp $Global::input_source_header{$b}
+    } 1 .. $max_col;
+    return @header_indexes_sorted;
+}
+
+sub len {
+    # Uses:
+    #   $opt::shellquote
+    # The length of the command line with args substituted
+    my $self = shift;
+    my $len = 0;
+    # Add length of the original command with no args
+    # Length of command w/ all replacement args removed
+    $len += $self->{'len'}{'noncontext'} + @{$self->{'command'}} -1;
+    ::debug("length", "noncontext + command: $len\n");
+    my $recargs = $self->number_of_recargs();
+    if($self->{'context_replace'}) {
+	# Context is duplicated for each arg
+	$len += $recargs * $self->{'len'}{'context'};
+	for my $replstring (keys %{$self->{'replacecount'}}) {
+	    # If the replacements string is more than once: mulitply its length
+	    $len += $self->{'len'}{$replstring} *
+		$self->{'replacecount'}{$replstring};
+	    ::debug("length", $replstring, " ", $self->{'len'}{$replstring}, "*",
+		    $self->{'replacecount'}{$replstring}, "\n");
+	}
+	# echo 11 22 33 44 55 66 77 88 99 1010
+	# echo 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
+	# 5 +  ctxgrp*arg
+	::debug("length", "Ctxgrp: ", $self->{'len'}{'contextgroups'},
+		" Groups: ", $self->{'len'}{'noncontextgroups'}, "\n");
+	# Add space between context groups
+	$len += ($recargs-1) * ($self->{'len'}{'contextgroups'});
+    } else {
+	# Each replacement string may occur several times
+	# Add the length for each time
+	$len += 1*$self->{'len'}{'context'};
+	::debug("length", "context+noncontext + command: $len\n");
+	for my $replstring (keys %{$self->{'replacecount'}}) {
+	    # (space between regargs + length of replacement)
+	    # * number this replacement is used
+	    $len += ($recargs -1 + $self->{'len'}{$replstring}) *
+		$self->{'replacecount'}{$replstring};
+	}
+    }
+    if($opt::nice) {
+	# Pessimistic length if --nice is set
+	# Worse than worst case: every char needs to be quoted with \
+	$len *= 2;
+    }
+    if($Global::quoting) {
+	# Pessimistic length if -q is set
+	# Worse than worst case: every char needs to be quoted with \
+	$len *= 2;
+    }
+    if($opt::shellquote) {
+	# Pessimistic length if --shellquote is set
+	# Worse than worst case: every char needs to be quoted with \ twice
+	$len *= 4;
+    }
+    # If we are using --env, add the prefix for that, too.
+    $len += $Global::envvarlen;
+
+    return $len;
+}
+
+sub replaced {
+    # Uses:
+    #   $Global::noquote
+    #   $Global::quoting
+    # Returns:
+    #   $replaced = command with place holders replaced and prepended
+    my $self = shift;
+    if(not defined $self->{'replaced'}) {
+	# Don't quote arguments if the input is the full command line
+	my $quote_arg = $Global::noquote ? 0 : not $Global::quoting;
+	$self->{'replaced'} = $self->replace_placeholders($self->{'command'},$Global::quoting,$quote_arg);
+	my $len = length $self->{'replaced'};
+	if ($len != $self->len()) {
+	    ::debug("length", $len, " != ", $self->len(), " ", $self->{'replaced'}, "\n");
+	} else {
+	    ::debug("length", $len, " == ", $self->len(), " ", $self->{'replaced'}, "\n");
+	}
+    }
+    return $self->{'replaced'};
+}
+
+sub replace_placeholders {
+    # Replace foo{}bar with fooargbar
+    # Input:
+    #   $targetref = command as shell words
+    #   $quote = should everything be quoted?
+    #   $quote_arg = should replaced arguments be quoted?
+    # Returns:
+    #   @target with placeholders replaced
+    my $self = shift;
+    my $targetref = shift;
+    my $quote = shift;
+    my $quote_arg = shift;
+    my $context_replace = $self->{'context_replace'};
+    my @target = @$targetref;
+    ::debug("replace", "Replace @target\n");
+    # -X = context replace
+    # maybe multiple input sources
+    # maybe --xapply
+    if(not @target) {
+	# @target is empty: Return empty array
+	return @target;
+    }
+    # Fish out the words that have replacement strings in them
+    my %word;
+    for (@target) {
+	my $tt = $_;
+	::debug("replace", "Target: $tt");
+	# a{1}b{}c{}d
+	# a{=1 $_=$_ =}b{= $_=$_ =}c{= $_=$_ =}d
+	# a\257<1 $_=$_ \257>b\257< $_=$_ \257>c\257< $_=$_ \257>d
+	#    A B C => aAbA B CcA B Cd
+	# -X A B C => aAbAcAd aAbBcBd aAbCcCd
+
+	if($context_replace) {
+	    while($tt =~ s/([^\s\257]*  # before {=
+                     (?:
+                      \257<       # {=
+                      [^\257]*?   # The perl expression
+                      \257>       # =}
+                      [^\s\257]*  # after =}
+                     )+)/ /x) {
+		# $1 = pre \257 perlexpr \257 post
+		$word{"$1"} ||= 1;
+	    }
+	} else {
+	    while($tt =~ s/( (?: \257<([^\257]*?)\257>) )//x) {
+		# $f = \257 perlexpr \257
+		$word{$1} ||= 1;
+	    }
+	}
+    }
+    my @word = keys %word;
+
+    my %replace;
+    my @arg;
+    for my $record (@{$self->{'arg_list'}}) {
+	# $self->{'arg_list'} = [ [Arg11, Arg12], [Arg21, Arg22], [Arg31, Arg32] ]
+	# Merge arg-objects from records into @arg for easy access
+	CORE::push @arg, @$record;
+    }
+    # Add one arg if empty to allow {#} and {%} to be computed only once
+    if(not @arg) { @arg = (Arg->new("")); }
+    # Number of arguments - used for positional arguments
+    my $n = $#_+1;
+
+    # This is actually a CommandLine-object,
+    # but it looks nice to be able to say {= $job->slot() =}
+    my $job = $self;
+    for my $word (@word) {
+	# word = AB \257< perlexpr \257> CD \257< perlexpr \257> EF
+	my $w = $word;
+	::debug("replace", "Replacing in $w\n");
+
+	# Replace positional arguments
+	$w =~ s< ([^\s\257]*)  # before {=
+                 \257<         # {=
+                 (-?\d+)       # Position (eg. -2 or 3)
+                 ([^\257]*?)   # The perl expression
+                 \257>         # =}
+                 ([^\s\257]*)  # after =}
+               >
+	   { $1. # Context (pre)
+		 (
+		 $arg[$2 > 0 ? $2-1 : $n+$2] ? # If defined: replace
+		 $arg[$2 > 0 ? $2-1 : $n+$2]->replace($3,$quote_arg,$self)
+		 : "")
+		 .$4 }egx;# Context (post)
+	::debug("replace", "Positional replaced $word with: $w\n");
+
+	if($w !~ /\257/) {
+	    # No more replacement strings in $w: No need to do more
+	    if($quote) {
+		CORE::push(@{$replace{::shell_quote($word)}}, $w);
+	    } else {
+		CORE::push(@{$replace{$word}}, $w);
+	    }
+	    next;
+	}
+	# for each arg:
+	#   compute replacement for each string
+	#   replace replacement strings with replacement in the word value
+	#   push to replace word value
+	::debug("replace", "Positional done: $w\n");
+	for my $arg (@arg) {
+	    my $val = $w;
+	    my $number_of_replacements = 0;
+	    for my $perlexpr (keys %{$self->{'replacecount'}}) {
+		# Replace {= perl expr =} with value for each arg
+		$number_of_replacements +=
+		    $val =~ s{\257<\Q$perlexpr\E\257>}
+		{$arg ? $arg->replace($perlexpr,$quote_arg,$self) : ""}eg;
+	    }
+	    my $ww = $word;
+	    if($quote) {
+		$ww = ::shell_quote_scalar($word);
+		$val = ::shell_quote_scalar($val);
+	    }
+	    if($number_of_replacements) {
+		CORE::push(@{$replace{$ww}}, $val);
+	    }
+	}
+    }
+
+    if($quote) {
+	@target = ::shell_quote(@target);
+    }
+    # ::debug("replace", "%replace=",::my_dump(%replace),"\n");
+    if(%replace) {
+	# Substitute the replace strings with the replacement values
+	# Must be sorted by length if a short word is a substring of a long word
+	my $regexp = join('|', map { my $s = $_; $s =~ s/(\W)/\\$1/g; $s }
+			  sort { length $b <=> length $a } keys %replace);
+	for(@target) {
+	    s/($regexp)/join(" ",@{$replace{$1}})/ge;
+	}
+    }
+    ::debug("replace", "Return @target\n");
+    return wantarray ? @target : "@target";
+}
+
+
+package CommandLineQueue;
+
+sub new {
+    my $class = shift;
+    my $commandref = shift;
+    my $read_from = shift;
+    my $context_replace = shift;
+    my $max_number_of_args = shift;
+    my $return_files = shift;
+    my @unget = ();
+    my ($count,%replacecount,$posrpl,$perlexpr,%len);
+    my @command = @$commandref;
+    # If the first command start with '-' it is probably an option
+    if($command[0] =~ /^\s*(-\S+)/) {
+	# Is this really a command in $PATH starting with '-'?
+	my $cmd = $1;
+	if(not ::which($cmd)) {
+	    ::error("Command ($cmd) starts with '-'. Is this a wrong option?\n");
+	    ::wait_and_exit(255);
+	}
+    }
+    # Replace replacement strings with {= perl expr =}
+    # Protect matching inside {= perl expr =}
+    # by replacing {= and =} with \257< and \257>
+    for(@command) {
+	if(/\257/) {
+	    ::error("Command cannot contain the character \257. Use a function for that.\n");
+	    ::wait_and_exit(255);
+	}
+	s/\Q$Global::parensleft\E(.*?)\Q$Global::parensright\E/\257<$1\257>/gx;
+    }
+    for my $rpl (keys %Global::rpl) {
+	# Replace the short hand string with the {= perl expr =} in $command and $opt::tagstring
+	# Avoid replacing inside existing {= perl expr =}
+	for(@command,@Global::ret_files) {
+	    while(s/((^|\257>)[^\257]*?) # Don't replace after \257 unless \257>
+                  \Q$rpl\E/$1\257<$Global::rpl{$rpl}\257>/xg) {
+	    }
+	}
+	if(defined $opt::tagstring) {
+	    for($opt::tagstring) {
+		while(s/((^|\257>)[^\257]*?) # Don't replace after \257 unless \257>
+                      \Q$rpl\E/$1\257<$Global::rpl{$rpl}\257>/x) {}
+	    }
+	}
+	# Do the same for the positional replacement strings
+	# A bit harder as we have to put in the position number
+	$posrpl = $rpl;
+	if($posrpl =~ s/^\{//) {
+	    # Only do this if the shorthand start with {
+	    for(@command,@Global::ret_files) {
+		s/\{(-?\d+)\Q$posrpl\E/\257<$1 $Global::rpl{$rpl}\257>/g;
+	    }
+	    if(defined $opt::tagstring) {
+		$opt::tagstring =~ s/\{(-?\d+)\Q$posrpl\E/\257<$1 $perlexpr\257>/g;
+	    }
+	}
+    }
+    my $sum = 0;
+    while($sum == 0) {
+	# Count how many times each replacement string is used
+	my @cmd = @command;
+	my $contextlen = 0;
+	my $noncontextlen = 0;
+	my $contextgroups = 0;
+	for my $c (@cmd) {
+	    while($c =~ s/ \257<([^\257]*?)\257> /\000/x) {
+		# %replacecount = { "perlexpr" => number of times seen }
+		# e.g { "$_++" => 2 }
+		$replacecount{$1} ++;
+		$sum++;
+	    }
+	    # Measure the length of the context around the {= perl expr =}
+	    # Use that {=...=} has been replaced with \000 above
+	    # So there is no need to deal with \257<
+	    while($c =~ s/ (\S*\000\S*) //x) {
+		my $w = $1;
+		$w =~ tr/\000//d; # Remove all \000's
+		$contextlen += length($w);
+		$contextgroups++;
+	    }
+	    # All {= perl expr =} have been removed: The rest is non-context
+	    $noncontextlen += length $c;
+	}
+	if($opt::tagstring) {
+	    my $t = $opt::tagstring;
+	    while($t =~ s/ \257<([^\257]*)\257> //x) {
+		# %replacecount = { "perlexpr" => number of times seen }
+		# e.g { "$_++" => 2 }
+		# But for tagstring we just need to mark it as seen
+		$replacecount{$1}||=1;
+	    }
+	}
+
+	$len{'context'} = 0+$contextlen;
+	$len{'noncontext'} = $noncontextlen;
+	$len{'contextgroups'} = $contextgroups;
+	$len{'noncontextgroups'} = @cmd-$contextgroups;
+	::debug("length", "@command Context: ", $len{'context'},
+		" Non: ", $len{'noncontext'}, " Ctxgrp: ", $len{'contextgroups'},
+		" NonCtxGrp: ", $len{'noncontextgroups'}, "\n");
+	if($sum == 0) {
+	    # Default command = {}
+	    # If not replacement string: append {}
+	    if(not @command) {
+		@command = ("\257<\257>");
+		$Global::noquote = 1;
+	    } elsif(($opt::pipe or $opt::pipepart)
+		    and not $opt::fifo and not $opt::cat) {
+		# With --pipe / --pipe-part you can have no replacement
+		last;
+	    } else {
+		# Append {} to the command if there are no {...}'s and no {=...=}
+		push @command, ("\257<\257>");
+	    }
+	}
+    }
+
+    return bless {
+	'unget' => \@unget,
+	'command' => \@command,
+	'replacecount' => \%replacecount,
+	'arg_queue' => RecordQueue->new($read_from,$opt::colsep),
+	'context_replace' => $context_replace,
+	'len' => \%len,
+	'max_number_of_args' => $max_number_of_args,
+	'size' => undef,
+	'return_files' => $return_files,
+	'seq' => 1,
+    }, ref($class) || $class;
+}
+
+sub get {
+    my $self = shift;
+    if(@{$self->{'unget'}}) {
+	my $cmd_line = shift @{$self->{'unget'}};
+	return ($cmd_line);
+    } else {
+	my $cmd_line;
+	$cmd_line = CommandLine->new($self->seq(),
+				     $self->{'command'},
+				     $self->{'arg_queue'},
+				     $self->{'context_replace'},
+				     $self->{'max_number_of_args'},
+				     $self->{'return_files'},
+				     $self->{'replacecount'},
+				     $self->{'len'},
+	    );
+	$cmd_line->populate();
+	::debug("init","cmd_line->number_of_args ",
+		$cmd_line->number_of_args(), "\n");
+	if($opt::pipe or $opt::pipepart) {
+	    if($cmd_line->replaced() eq "") {
+		# Empty command - pipe requires a command
+		::error("--pipe must have a command to pipe into (e.g. 'cat').\n");
+		::wait_and_exit(255);
+	    }
+	} else {
+	    if($cmd_line->number_of_args() == 0) {
+		# We did not get more args - maybe at EOF string?
+		return undef;
+	    } elsif($cmd_line->replaced() eq "") {
+		# Empty command - get the next instead
+		return $self->get();
+	    }
+	}
+	$self->set_seq($self->seq()+1);
+	return $cmd_line;
+    }
+}
+
+sub unget {
+    my $self = shift;
+    unshift @{$self->{'unget'}}, @_;
+}
+
+sub empty {
+    my $self = shift;
+    my $empty = (not @{$self->{'unget'}}) && $self->{'arg_queue'}->empty();
+    ::debug("run", "CommandLineQueue->empty $empty");
+    return $empty;
+}
+
+sub seq {
+    my $self = shift;
+    return $self->{'seq'};
+}
+
+sub set_seq {
+    my $self = shift;
+    $self->{'seq'} = shift;
+}
+
+sub quote_args {
+    my $self = shift;
+    # If there is not command emulate |bash
+    return $self->{'command'};
+}
+
+sub size {
+    my $self = shift;
+    if(not $self->{'size'}) {
+	my @all_lines = ();
+	while(not $self->{'arg_queue'}->empty()) {
+	    push @all_lines, CommandLine->new($self->{'command'},
+					      $self->{'arg_queue'},
+					      $self->{'context_replace'},
+					      $self->{'max_number_of_args'});
+	}
+	$self->{'size'} = @all_lines;
+	$self->unget(@all_lines);
+    }
+    return $self->{'size'};
+}
+
+
+package Limits::Command;
+
+# Maximal command line length (for -m and -X)
+sub max_length {
+    # Find the max_length of a command line and cache it
+    # Returns:
+    #   number of chars on the longest command line allowed
+    if(not $Limits::Command::line_max_len) {
+	# Disk cache of max command line length
+	my $len_cache = $ENV{'HOME'} . "/.parallel/tmp/linelen-" . ::hostname();
+	my $cached_limit;
+	if(-e $len_cache) {
+	    open(my $fh, "<", $len_cache) || ::die_bug("Cannot read $len_cache");
+	    $cached_limit = <$fh>;
+	    close $fh;
+	} else {
+	    $cached_limit = real_max_length();
+	    # If $HOME is write protected: Do not fail
+	    mkdir($ENV{'HOME'} . "/.parallel");
+	    mkdir($ENV{'HOME'} . "/.parallel/tmp");
+	    open(my $fh, ">", $len_cache);
+	    print $fh $cached_limit;
+	    close $fh;
+	}
+	$Limits::Command::line_max_len = $cached_limit;
+	if($opt::max_chars) {
+	    if($opt::max_chars <= $cached_limit) {
+		$Limits::Command::line_max_len = $opt::max_chars;
+	    } else {
+		::warning("Value for -s option ",
+			  "should be < $cached_limit.\n");
+	    }
+	}
+    }
+    return $Limits::Command::line_max_len;
+}
+
+sub real_max_length {
+    # Find the max_length of a command line
+    # Returns:
+    #   The maximal command line length
+    # Use an upper bound of 8 MB if the shell allows for for infinite long lengths
+    my $upper = 8_000_000;
+    my $len = 8;
+    do {
+	if($len > $upper) { return $len };
+	$len *= 16;
+    } while (is_acceptable_command_line_length($len));
+    # Then search for the actual max length between 0 and upper bound
+    return binary_find_max_length(int($len/16),$len);
+}
+
+sub binary_find_max_length {
+    # Given a lower and upper bound find the max_length of a command line
+    # Returns:
+    #   number of chars on the longest command line allowed
+    my ($lower, $upper) = (@_);
+    if($lower == $upper or $lower == $upper-1) { return $lower; }
+    my $middle = int (($upper-$lower)/2 + $lower);
+    ::debug("init", "Maxlen: $lower,$upper,$middle : ");
+    if (is_acceptable_command_line_length($middle)) {
+	return binary_find_max_length($middle,$upper);
+    } else {
+	return binary_find_max_length($lower,$middle);
+    }
+}
+
+sub is_acceptable_command_line_length {
+    # Test if a command line of this length can run
+    # Returns:
+    #   0 if the command line length is too long
+    #   1 otherwise
+    my $len = shift;
+
+    local *STDERR;
+    open (STDERR, ">", "/dev/null");
+    system "true "."x"x$len;
+    close STDERR;
+    ::debug("init", "$len=$? ");
+    return not $?;
+}
+
+
+package RecordQueue;
+
+sub new {
+    my $class = shift;
+    my $fhs = shift;
+    my $colsep = shift;
+    my @unget = ();
+    my $arg_sub_queue;
+    if($colsep) {
+	# Open one file with colsep
+	$arg_sub_queue = RecordColQueue->new($fhs);
+    } else {
+	# Open one or more files if multiple -a
+	$arg_sub_queue = MultifileQueue->new($fhs);
+    }
+    return bless {
+	'unget' => \@unget,
+	'arg_number' => 0,
+	'arg_sub_queue' => $arg_sub_queue,
+    }, ref($class) || $class;
+}
+
+sub get {
+    # Returns:
+    #   reference to array of Arg-objects
+    my $self = shift;
+    if(@{$self->{'unget'}}) {
+	$self->{'arg_number'}++;
+	return shift @{$self->{'unget'}};
+    }
+    my $ret = $self->{'arg_sub_queue'}->get();
+    if(defined $Global::max_number_of_args
+       and $Global::max_number_of_args == 0) {
+	::debug("run", "Read 1 but return 0 args\n");
+	return [Arg->new("")];
+    } else {
+	return $ret;
+    }
+}
+
+sub unget {
+    my $self = shift;
+    ::debug("run", "RecordQueue-unget '@_'\n");
+    $self->{'arg_number'} -= @_;
+    unshift @{$self->{'unget'}}, @_;
+}
+
+sub empty {
+    my $self = shift;
+    my $empty = not @{$self->{'unget'}};
+    $empty &&= $self->{'arg_sub_queue'}->empty();
+    ::debug("run", "RecordQueue->empty $empty");
+    return $empty;
+}
+
+sub arg_number {
+    my $self = shift;
+    return $self->{'arg_number'};
+}
+
+
+package RecordColQueue;
+
+sub new {
+    my $class = shift;
+    my $fhs = shift;
+    my @unget = ();
+    my $arg_sub_queue = MultifileQueue->new($fhs);
+    return bless {
+	'unget' => \@unget,
+	'arg_sub_queue' => $arg_sub_queue,
+    }, ref($class) || $class;
+}
+
+sub get {
+    # Returns:
+    #   reference to array of Arg-objects
+    my $self = shift;
+    if(@{$self->{'unget'}}) {
+	return shift @{$self->{'unget'}};
+    }
+    my $unget_ref=$self->{'unget'};
+    if($self->{'arg_sub_queue'}->empty()) {
+	return undef;
+    }
+    my $in_record = $self->{'arg_sub_queue'}->get();
+    if(defined $in_record) {
+	my @out_record = ();
+	for my $arg (@$in_record) {
+	    ::debug("run", "RecordColQueue::arg $arg\n");
+	    my $line = $arg->orig();
+	    ::debug("run", "line='$line'\n");
+	    if($line ne "") {
+		for my $s (split /$opt::colsep/o, $line, -1) {
+		    push @out_record, Arg->new($s);
+		}
+	    } else {
+		push @out_record, Arg->new("");
+	    }
+	}
+	return \@out_record;
+    } else {
+	return undef;
+    }
+}
+
+sub unget {
+    my $self = shift;
+    ::debug("run", "RecordColQueue-unget '@_'\n");
+    unshift @{$self->{'unget'}}, @_;
+}
+
+sub empty {
+    my $self = shift;
+    my $empty = (not @{$self->{'unget'}} and $self->{'arg_sub_queue'}->empty());
+    ::debug("run", "RecordColQueue->empty $empty");
+    return $empty;
+}
+
+
+package MultifileQueue;
+
+@Global::unget_argv=();
+
+sub new {
+    my $class = shift;
+    my $fhs = shift;
+    for my $fh (@$fhs) {
+	if(-t $fh) {
+	    ::warning("Input is read from the terminal. ".
+		      "Only experts do this on purpose. ".
+		      "Press CTRL-D to exit.\n");
+	}
+    }
+    return bless {
+	'unget' => \@Global::unget_argv,
+	'fhs' => $fhs,
+	'arg_matrix' => undef,
+    }, ref($class) || $class;
+}
+
+sub get {
+    my $self = shift;
+    if($opt::xapply) {
+	return $self->xapply_get();
+    } else {
+	return $self->nest_get();
+    }
+}
+
+sub unget {
+    my $self = shift;
+    ::debug("run", "MultifileQueue-unget '@_'\n");
+    unshift @{$self->{'unget'}}, @_;
+}
+
+sub empty {
+    my $self = shift;
+    my $empty = (not @Global::unget_argv
+		 and not @{$self->{'unget'}});
+    for my $fh (@{$self->{'fhs'}}) {
+	$empty &&= eof($fh);
+    }
+    ::debug("run", "MultifileQueue->empty $empty ");
+    return $empty;
+}
+
+sub xapply_get {
+    my $self = shift;
+    if(@{$self->{'unget'}}) {
+	return shift @{$self->{'unget'}};
+    }
+    my @record = ();
+    my $prepend = undef;
+    my $empty = 1;
+    for my $fh (@{$self->{'fhs'}}) {
+	my $arg = read_arg_from_fh($fh);
+	if(defined $arg) {
+	    # Record $arg for recycling at end of file
+	    push @{$self->{'arg_matrix'}{$fh}}, $arg;
+	    push @record, $arg;
+	    $empty = 0;
+	} else {
+	    ::debug("run", "EOA ");
+	    # End of file: Recycle arguments
+	    push @{$self->{'arg_matrix'}{$fh}}, shift @{$self->{'arg_matrix'}{$fh}};
+	    # return last @{$args->{'args'}{$fh}};
+	    push @record, @{$self->{'arg_matrix'}{$fh}}[-1];
+	}
+    }
+    if($empty) {
+	return undef;
+    } else {
+	return \@record;
+    }
+}
+
+sub nest_get {
+    my $self = shift;
+    if(@{$self->{'unget'}}) {
+	return shift @{$self->{'unget'}};
+    }
+    my @record = ();
+    my $prepend = undef;
+    my $empty = 1;
+    my $no_of_inputsources = $#{$self->{'fhs'}} + 1;
+    if(not $self->{'arg_matrix'}) {
+	# Initialize @arg_matrix with one arg from each file
+	# read one line from each file
+	my @first_arg_set;
+	my $all_empty = 1;
+	for (my $fhno = 0; $fhno < $no_of_inputsources ; $fhno++) {
+	    my $arg = read_arg_from_fh($self->{'fhs'}[$fhno]);
+	    if(defined $arg) {
+		$all_empty = 0;
+	    }
+	    $self->{'arg_matrix'}[$fhno][0] = $arg || Arg->new("");
+	    push @first_arg_set, $self->{'arg_matrix'}[$fhno][0];
+	}
+	if($all_empty) {
+	    # All filehandles were at eof or eof-string
+	    return undef;
+	}
+	return [@first_arg_set];
+    }
+
+    # Treat the case with one input source special.  For multiple
+    # input sources we need to remember all previously read values to
+    # generate all combinations. But for one input source we can
+    # forget the value after first use.
+    if($no_of_inputsources == 1) {
+	my $arg = read_arg_from_fh($self->{'fhs'}[0]);
+	if(defined($arg)) {
+	    return [$arg];
+	}
+	return undef;
+    }
+    for (my $fhno = $no_of_inputsources - 1; $fhno >= 0; $fhno--) {
+	if(eof($self->{'fhs'}[$fhno])) {
+	    next;
+	} else {
+	    # read one
+	    my $arg = read_arg_from_fh($self->{'fhs'}[$fhno]);
+	    defined($arg) || next; # If we just read an EOF string: Treat this as EOF
+	    my $len = $#{$self->{'arg_matrix'}[$fhno]} + 1;
+	    $self->{'arg_matrix'}[$fhno][$len] = $arg;
+	    # make all new combinations
+	    my @combarg = ();
+	    for (my $fhn = 0; $fhn < $no_of_inputsources; $fhn++) {
+		push @combarg, [0, $#{$self->{'arg_matrix'}[$fhn]}];
+	    }
+	    $combarg[$fhno] = [$len,$len]; # Find only combinations with this new entry
+	    # map combinations
+	    # [ 1, 3, 7 ], [ 2, 4, 1 ]
+	    # =>
+	    # [ m[0][1], m[1][3], m[3][7] ], [ m[0][2], m[1][4], m[2][1] ]
+	    my @mapped;
+	    for my $c (expand_combinations(@combarg)) {
+		my @a;
+		for my $n (0 .. $no_of_inputsources - 1 ) {
+		    push @a,  $self->{'arg_matrix'}[$n][$$c[$n]];
+		}
+		push @mapped, \@a;
+	    }
+	    # append the mapped to the ungotten arguments
+	    push @{$self->{'unget'}}, @mapped;
+	    # get the first
+	    return shift @{$self->{'unget'}};
+	}
+    }
+    # all are eof or at EOF string; return from the unget queue
+    return shift @{$self->{'unget'}};
+}
+
+sub read_arg_from_fh {
+    # Read one Arg from filehandle
+    # Returns:
+    #   Arg-object with one read line
+    #   undef if end of file
+    my $fh = shift;
+    my $prepend = undef;
+    my $arg;
+    do {{
+	# This makes 10% faster
+	if(not ($arg = <$fh>)) {
+	    if(defined $prepend) {
+		return Arg->new($prepend);
+	    } else {
+		return undef;
+	    }
+	}
+#	::debug("run", "read $arg\n");
+	# Remove delimiter
+	$arg =~ s:$/$::;
+	if($Global::end_of_file_string and
+	   $arg eq $Global::end_of_file_string) {
+	    # Ignore the rest of input file
+	    close $fh;
+	    ::debug("run", "EOF-string ($arg) met\n");
+	    if(defined $prepend) {
+		return Arg->new($prepend);
+	    } else {
+		return undef;
+	    }
+	}
+	if(defined $prepend) {
+	    $arg = $prepend.$arg; # For line continuation
+	    $prepend = undef; #undef;
+	}
+	if($Global::ignore_empty) {
+	    if($arg =~ /^\s*$/) {
+		redo; # Try the next line
+	    }
+	}
+	if($Global::max_lines) {
+	    if($arg =~ /\s$/) {
+		# Trailing space => continued on next line
+		$prepend = $arg;
+		redo;
+	    }
+	}
+    }} while (1 == 0); # Dummy loop {{}} for redo
+    if(defined $arg) {
+	return Arg->new($arg);
+    } else {
+	::die_bug("multiread arg undefined");
+    }
+}
+
+sub expand_combinations {
+    # Input:
+    #   ([xmin,xmax], [ymin,ymax], ...)
+    # Returns: ([x,y,...],[x,y,...])
+    # where xmin <= x <= xmax and ymin <= y <= ymax
+    my $minmax_ref = shift;
+    my $xmin = $$minmax_ref[0];
+    my $xmax = $$minmax_ref[1];
+    my @p;
+    if(@_) {
+	# If there are more columns: Compute those recursively
+	my @rest = expand_combinations(@_);
+	for(my $x = $xmin; $x <= $xmax; $x++) {
+	    push @p, map { [$x, @$_] } @rest;
+	}
+    } else {
+	for(my $x = $xmin; $x <= $xmax; $x++) {
+	    push @p, [$x];
+	}
+    }
+    return @p;
+}
+
+
+package Arg;
+
+sub new {
+    my $class = shift;
+    my $orig = shift;
+    my @hostgroups;
+    if($opt::hostgroups) {
+	if($orig =~ s:@(.+)::) {
+	    # We found hostgroups on the arg
+	    @hostgroups = split(/\+/, $1);
+	    if(not grep { defined $Global::hostgroups{$_} } @hostgroups) {
+		::warning("No such hostgroup (@hostgroups)\n");
+		@hostgroups = (keys %Global::hostgroups);
+	    }
+        } else {
+	    @hostgroups = (keys %Global::hostgroups);
+	}
+    }
+    return bless {
+	'orig' => $orig,
+	'hostgroups' => \@hostgroups,
+    }, ref($class) || $class;
+}
+
+sub replace {
+    # Calculates the corresponding value for a given perl expression
+    # Returns:
+    #   The calculated string (quoted if asked for)
+    my $self = shift;
+    my $perlexpr = shift; # E.g. $_=$_ or s/.gz//
+    my $quote = (shift) ? 1 : 0; # should the string be quoted?
+    # This is actually a CommandLine-object,
+    # but it looks nice to be able to say {= $job->slot() =}
+    my $job = shift;
+    $perlexpr =~ s/^-?\d+ //; # Positional replace treated as normal replace
+    if(not defined $self->{"rpl",0,$perlexpr}) {
+	local $_;
+	if($Global::trim eq "n") {
+	    $_ = $self->{'orig'};
+	} else {
+	    $_ = trim_of($self->{'orig'});
+	}
+	::debug("replace", "eval ", $perlexpr, " ", $_, "\n");
+	if(not $Global::perleval{$perlexpr}) {
+	    # Make an anonymous function of the $perlexpr
+	    # And more importantly: Compile it only once
+	    if($Global::perleval{$perlexpr} =
+	       eval('sub { no strict; no warnings; my $job = shift; '.
+		    $perlexpr.' }')) {
+		# All is good
+	    } else {
+		# The eval failed. Maybe $perlexpr is invalid perl?
+		::error("Cannot use $perlexpr: $@\n");
+		::wait_and_exit(255);
+	    }
+	}
+	# Execute the function
+	$Global::perleval{$perlexpr}->($job);
+	$self->{"rpl",0,$perlexpr} = $_;
+    }
+    if(not defined $self->{"rpl",$quote,$perlexpr}) {
+	$self->{"rpl",1,$perlexpr} =
+	    ::shell_quote_scalar($self->{"rpl",0,$perlexpr});
+    }
+    return $self->{"rpl",$quote,$perlexpr};
+}
+
+sub orig {
+    my $self = shift;
+    return $self->{'orig'};
+}
+
+sub trim_of {
+    # Removes white space as specifed by --trim:
+    # n = nothing
+    # l = start
+    # r = end
+    # lr|rl = both
+    # Returns:
+    #   string with white space removed as needed
+    my @strings = map { defined $_ ? $_ : "" } (@_);
+    my $arg;
+    if($Global::trim eq "n") {
+	# skip
+    } elsif($Global::trim eq "l") {
+	for my $arg (@strings) { $arg =~ s/^\s+//; }
+    } elsif($Global::trim eq "r") {
+	for my $arg (@strings) { $arg =~ s/\s+$//; }
+    } elsif($Global::trim eq "rl" or $Global::trim eq "lr") {
+	for my $arg (@strings) { $arg =~ s/^\s+//; $arg =~ s/\s+$//; }
+    } else {
+	::error("--trim must be one of: r l rl lr.\n");
+	::wait_and_exit(255);
+    }
+    return wantarray ? @strings : "@strings";
+}
+
+
+package TimeoutQueue;
+
+sub new {
+    my $class = shift;
+    my $delta_time = shift;
+    my ($pct);
+    if($delta_time =~ /(\d+(\.\d+)?)%/) {
+	# Timeout in percent
+	$pct = $1/100;
+	$delta_time = 1_000_000;
+    }
+    return bless {
+	'queue' => [],
+	'delta_time' => $delta_time,
+	'pct' => $pct,
+	'remedian_idx' => 0,
+	'remedian_arr' => [],
+	'remedian' => undef,
+    }, ref($class) || $class;
+}
+
+sub delta_time {
+    my $self = shift;
+    return $self->{'delta_time'};
+}
+
+sub set_delta_time {
+    my $self = shift;
+    $self->{'delta_time'} = shift;
+}
+
+sub remedian {
+    my $self = shift;
+    return $self->{'remedian'};
+}
+
+sub set_remedian {
+    # Set median of the last 999^3 (=997002999) values using Remedian
+    #
+    # Rousseeuw, Peter J., and Gilbert W. Bassett Jr. "The remedian: A
+    # robust averaging method for large data sets." Journal of the
+    # American Statistical Association 85.409 (1990): 97-104.
+    my $self = shift;
+    my $val = shift;
+    my $i = $self->{'remedian_idx'}++;
+    my $rref = $self->{'remedian_arr'};
+    $rref->[0][$i%999] = $val;
+    $rref->[1][$i/999%999] = (sort @{$rref->[0]})[$#{$rref->[0]}/2];
+    $rref->[2][$i/999/999%999] = (sort @{$rref->[1]})[$#{$rref->[1]}/2];
+    $self->{'remedian'} = (sort @{$rref->[2]})[$#{$rref->[2]}/2];
+}
+
+sub update_delta_time {
+    # Update delta_time based on runtime of finished job if timeout is
+    # a percentage
+    my $self = shift;
+    my $runtime = shift;
+    if($self->{'pct'}) {
+	$self->set_remedian($runtime);
+	$self->{'delta_time'} = $self->{'pct'} * $self->remedian();
+	::debug("run", "Timeout: $self->{'delta_time'}s ");
+    }
+}
+
+sub process_timeouts {
+    # Check if there was a timeout
+    my $self = shift;
+    # $self->{'queue'} is sorted by start time
+    while (@{$self->{'queue'}}) {
+	my $job = $self->{'queue'}[0];
+	if($job->endtime()) {
+	    # Job already finished. No need to timeout the job
+	    # This could be because of --keep-order
+	    shift @{$self->{'queue'}};
+	} elsif($job->timedout($self->{'delta_time'})) {
+	    # Need to shift off queue before kill
+	    # because kill calls usleep that calls process_timeouts
+	    shift @{$self->{'queue'}};
+	    $job->kill();
+	} else {
+	    # Because they are sorted by start time the rest are later
+	    last;
+	}
+    }
+}
+
+sub insert {
+    my $self = shift;
+    my $in = shift;
+    push @{$self->{'queue'}}, $in;
+}
+
+
+package Semaphore;
+
+# This package provides a counting semaphore
+#
+# If a process dies without releasing the semaphore the next process
+# that needs that entry will clean up dead semaphores
+#
+# The semaphores are stored in ~/.parallel/semaphores/id-<name> Each
+# file in ~/.parallel/semaphores/id-<name>/ is the process ID of the
+# process holding the entry. If the process dies, the entry can be
+# taken by another process.
+
+sub new {
+    my $class = shift;
+    my $id = shift;
+    my $count = shift;
+    $id=~s/([^-_a-z0-9])/unpack("H*",$1)/ige; # Convert non-word chars to hex
+    $id="id-".$id; # To distinguish it from a process id
+    my $parallel_dir = $ENV{'HOME'}."/.parallel";
+    -d $parallel_dir or mkdir_or_die($parallel_dir);
+    my $parallel_locks = $parallel_dir."/semaphores";
+    -d $parallel_locks or mkdir_or_die($parallel_locks);
+    my $lockdir = "$parallel_locks/$id";
+    my $lockfile = $lockdir.".lock";
+    if($count < 1) { ::die_bug("semaphore-count: $count"); }
+    return bless {
+	'lockfile' => $lockfile,
+	'lockfh' => Symbol::gensym(),
+	'lockdir' => $lockdir,
+	'id' => $id,
+	'idfile' => $lockdir."/".$id,
+	'pid' => $$,
+	'pidfile' => $lockdir."/".$$.'@'.::hostname(),
+	'count' => $count + 1 # nlinks returns a link for the 'id-' as well
+    }, ref($class) || $class;
+}
+
+sub acquire {
+    my $self = shift;
+    my $sleep = 1; # 1 ms
+    my $start_time = time;
+    while(1) {
+	$self->atomic_link_if_count_less_than() and last;
+	::debug("sem", "Remove dead locks");
+	my $lockdir = $self->{'lockdir'};
+	for my $d (glob "$lockdir/*") {
+	    ::debug("sem", "Lock $d $lockdir\n");
+	    $d =~ m:$lockdir/([0-9]+)\@([-\._a-z0-9]+)$:o or next;
+	    my ($pid, $host) = ($1, $2);
+	    if($host eq ::hostname()) {
+		if(not kill 0, $1) {
+		    ::debug("sem", "Dead: $d");
+		    unlink $d;
+		} else {
+		    ::debug("sem", "Alive: $d");
+		}
+	    }
+	}
+	# try again
+	$self->atomic_link_if_count_less_than() and last;
+	# Retry slower and slower up to 1 second
+	$sleep = ($sleep < 1000) ? ($sleep * 1.1) : ($sleep);
+	# Random to avoid every sleeping job waking up at the same time
+	::usleep(rand()*$sleep);
+	if(defined($opt::timeout) and
+	   $start_time + $opt::timeout > time) {
+	    # Acquire the lock anyway
+	    if(not -e $self->{'idfile'}) {
+		open (my $fh, ">", $self->{'idfile'}) or
+		    ::die_bug("timeout_write_idfile: $self->{'idfile'}");
+		close $fh;
+	    }
+	    link $self->{'idfile'}, $self->{'pidfile'};
+	    last;
+	}
+    }
+    ::debug("sem", "acquired $self->{'pid'}\n");
+}
+
+sub release {
+    my $self = shift;
+    unlink $self->{'pidfile'};
+    if($self->nlinks() == 1) {
+	# This is the last link, so atomic cleanup
+	$self->lock();
+	if($self->nlinks() == 1) {
+	    unlink $self->{'idfile'};
+	    rmdir $self->{'lockdir'};
+	}
+	$self->unlock();
+    }
+    ::debug("run", "released $self->{'pid'}\n");
+}
+
+sub _release {
+    my $self = shift;
+
+    unlink $self->{'pidfile'};
+    $self->lock();
+    my $nlinks = $self->nlinks();
+    ::debug("sem", $nlinks, "<", $self->{'count'});
+    if($nlinks-- > 1) {
+       unlink $self->{'idfile'};
+       open (my $fh, ">", $self->{'idfile'}) or
+           ::die_bug("write_idfile: $self->{'idfile'}");
+       print $fh "#"x$nlinks;
+       close $fh;
+    } else {
+       unlink $self->{'idfile'};
+       rmdir $self->{'lockdir'};
+    }
+    $self->unlock();
+    ::debug("sem", "released $self->{'pid'}\n");
+}
+
+sub atomic_link_if_count_less_than {
+    # Link $file1 to $file2 if nlinks to $file1 < $count
+    my $self = shift;
+    my $retval = 0;
+    $self->lock();
+    ::debug($self->nlinks(), "<", $self->{'count'});
+    if($self->nlinks() < $self->{'count'}) {
+	-d $self->{'lockdir'} or mkdir_or_die($self->{'lockdir'});
+	if(not -e $self->{'idfile'}) {
+	    open (my $fh, ">", $self->{'idfile'}) or
+		::die_bug("write_idfile: $self->{'idfile'}");
+	    close $fh;
+	}
+	$retval = link $self->{'idfile'}, $self->{'pidfile'};
+    }
+    $self->unlock();
+    ::debug("run", "atomic $retval");
+    return $retval;
+}
+
+sub _atomic_link_if_count_less_than {
+    # Link $file1 to $file2 if nlinks to $file1 < $count
+    my $self = shift;
+    my $retval = 0;
+    $self->lock();
+    my $nlinks = $self->nlinks();
+    ::debug("sem", $nlinks, "<", $self->{'count'});
+    if($nlinks++ < $self->{'count'}) {
+	-d $self->{'lockdir'} or mkdir_or_die($self->{'lockdir'});
+	if(not -e $self->{'idfile'}) {
+	    open (my $fh, ">", $self->{'idfile'}) or
+		::die_bug("write_idfile: $self->{'idfile'}");
+	    close $fh;
+	}
+	open (my $fh, ">", $self->{'idfile'}) or
+	    ::die_bug("write_idfile: $self->{'idfile'}");
+	print $fh "#"x$nlinks;
+	close $fh;
+	$retval = link $self->{'idfile'}, $self->{'pidfile'};
+    }
+    $self->unlock();
+    ::debug("sem", "atomic $retval");
+    return $retval;
+}
+
+sub nlinks {
+    my $self = shift;
+    if(-e $self->{'idfile'}) {
+	::debug("sem", "nlinks", (stat(_))[3], "size", (stat(_))[7], "\n");
+	return (stat(_))[3];
+    } else {
+	return 0;
+    }
+}
+
+sub lock {
+    my $self = shift;
+    my $sleep = 100; # 100 ms
+    my $total_sleep = 0;
+    $Global::use{"Fcntl"} ||= eval "use Fcntl qw(:DEFAULT :flock); 1;";
+    my $locked = 0;
+    while(not $locked) {
+	if(tell($self->{'lockfh'}) == -1) {
+	    # File not open
+	    open($self->{'lockfh'}, ">", $self->{'lockfile'})
+		or ::debug("run", "Cannot open $self->{'lockfile'}");
+	}
+	if($self->{'lockfh'}) {
+	    # File is open
+	    chmod 0666, $self->{'lockfile'}; # assuming you want it a+rw
+	    if(flock($self->{'lockfh'}, LOCK_EX()|LOCK_NB())) {
+		# The file is locked: No need to retry
+		$locked = 1;
+		last;
+	    } else {
+		if ($! =~ m/Function not implemented/) {
+		    ::warning("flock: $!");
+		    ::warning("Will wait for a random while\n");
+		    ::usleep(rand(5000));
+		    # File cannot be locked: No need to retry
+		    $locked = 2;
+		    last;
+		}
+	    }
+	}
+	# Locking failed in first round
+	# Sleep and try again
+	$sleep = ($sleep < 1000) ? ($sleep * 1.1) : ($sleep);
+	# Random to avoid every sleeping job waking up at the same time
+	::usleep(rand()*$sleep);
+	$total_sleep += $sleep;
+	if($opt::semaphoretimeout) {
+	    if($total_sleep/1000 > $opt::semaphoretimeout) {
+		# Timeout: bail out
+		::warning("Semaphore timed out. Ignoring timeout.");
+		$locked = 3;
+		last;
+	    }
+	} else {
+	    if($total_sleep/1000 > 30) {
+		::warning("Semaphore stuck for 30 seconds. Consider using --semaphoretimeout.");
+	    }
+	}
+    }
+    ::debug("run", "locked $self->{'lockfile'}");
+}
+
+sub unlock {
+    my $self = shift;
+    unlink $self->{'lockfile'};
+    close $self->{'lockfh'};
+    ::debug("run", "unlocked\n");
+}
+
+sub mkdir_or_die {
+    # If dir is not writable: die
+    my $dir = shift;
+    my @dir_parts = split(m:/:,$dir);
+    my ($ddir,$part);
+    while(defined ($part = shift @dir_parts)) {
+	$part eq "" and next;
+	$ddir .= "/".$part;
+	-d $ddir and next;
+	mkdir $ddir;
+    }
+    if(not -w $dir) {
+	::error("Cannot write to $dir: $!\n");
+	::wait_and_exit(255);
+    }
+}
+
+# Keep perl -w happy
+$opt::x = $Semaphore::timeout = $Semaphore::wait =
+$Job::file_descriptor_warning_printed = 0;
diff --git a/src/rocksdb/build_tools/make_package.sh b/src/rocksdb/build_tools/make_package.sh
new file mode 100755
index 000000000..68a5d8a72
--- /dev/null
+++ b/src/rocksdb/build_tools/make_package.sh
@@ -0,0 +1,129 @@
+# shellcheck disable=SC1113
+#/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+set -e
+
+function log() {
+  echo "[+] $1"
+}
+
+function fatal() {
+  echo "[!] $1"
+  exit 1
+}
+
+function platform() {
+  local  __resultvar=$1
+  if [[ -f "/etc/yum.conf" ]]; then
+    eval $__resultvar="centos"
+  elif [[ -f "/etc/dpkg/dpkg.cfg" ]]; then
+    eval $__resultvar="ubuntu"
+  else
+    fatal "Unknwon operating system"
+  fi
+}
+platform OS
+
+function package() {
+  if [[ $OS = "ubuntu" ]]; then
+    if dpkg --get-selections | grep --quiet $1; then
+      log "$1 is already installed. skipping."
+    else
+      # shellcheck disable=SC2068
+      apt-get install $@ -y
+    fi
+  elif [[ $OS = "centos" ]]; then
+    if rpm -qa | grep --quiet $1; then
+      log "$1 is already installed. skipping."
+    else
+      # shellcheck disable=SC2068
+      yum install $@ -y
+    fi
+  fi
+}
+
+function detect_fpm_output() {
+  if [[ $OS = "ubuntu" ]]; then
+    export FPM_OUTPUT=deb
+  elif [[ $OS = "centos" ]]; then
+    export FPM_OUTPUT=rpm
+  fi
+}
+detect_fpm_output
+
+function gem_install() {
+  if gem list | grep --quiet $1; then
+    log "$1 is already installed. skipping."
+  else
+    # shellcheck disable=SC2068
+    gem install $@
+  fi
+}
+
+function main() {
+  if [[ $# -ne 1 ]]; then
+    fatal "Usage: $0 <rocksdb_version>"
+  else
+    log "using rocksdb version: $1"
+  fi
+
+  if [[ -d /vagrant ]]; then
+    if [[ $OS = "ubuntu" ]]; then
+      package g++-4.8
+      export CXX=g++-4.8
+
+      # the deb would depend on libgflags2, but the static lib is the only thing
+      # installed by make install
+      package libgflags-dev
+
+      package ruby-all-dev
+    elif [[ $OS = "centos" ]]; then
+      pushd /etc/yum.repos.d
+      if [[ ! -f /etc/yum.repos.d/devtools-1.1.repo ]]; then
+        wget http://people.centos.org/tru/devtools-1.1/devtools-1.1.repo
+      fi
+      package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6
+      package devtoolset-1.1-gcc-c++ --enablerepo=testing-1.1-devtools-6
+      export CC=/opt/centos/devtoolset-1.1/root/usr/bin/gcc
+      export CPP=/opt/centos/devtoolset-1.1/root/usr/bin/cpp
+      export CXX=/opt/centos/devtoolset-1.1/root/usr/bin/c++
+      export PATH=$PATH:/opt/centos/devtoolset-1.1/root/usr/bin
+      popd
+      if ! rpm -qa | grep --quiet gflags; then
+        rpm -i https://github.com/schuhschuh/gflags/releases/download/v2.1.0/gflags-devel-2.1.0-1.amd64.rpm
+      fi
+
+      package ruby
+      package ruby-devel
+      package rubygems
+      package rpm-build
+    fi
+  fi
+  gem_install fpm
+
+  make static_lib
+  LIBDIR=/usr/lib
+  if [[ $FPM_OUTPUT = "rpm" ]]; then
+      LIBDIR=$(rpm --eval '%_libdir')
+  fi
+
+  rm -rf package
+  make install DESTDIR=package PREFIX=/usr LIBDIR=$LIBDIR
+
+  fpm \
+    -s dir \
+    -t $FPM_OUTPUT \
+    -C package \
+    -n rocksdb \
+    -v $1 \
+    --url http://rocksdb.org/ \
+    -m rocksdb@fb.com \
+    --license BSD \
+    --vendor Facebook \
+    --description "RocksDB is an embeddable persistent key-value store for fast storage." \
+    usr
+}
+
+# shellcheck disable=SC2068
+main $@
diff --git a/src/rocksdb/build_tools/ps_with_stack b/src/rocksdb/build_tools/ps_with_stack
new file mode 100755
index 000000000..ee4256965
--- /dev/null
+++ b/src/rocksdb/build_tools/ps_with_stack
@@ -0,0 +1,38 @@
+#!/usr/bin/env perl
+
+use strict;
+
+open(my $ps, "-|", "ps -wwf");
+my $cols_known = 0;
+my $cmd_col = 0;
+my $pid_col = 0;
+while (<$ps>) {
+  print;
+  my @cols = split(/\s+/);
+
+  if (!$cols_known && /CMD/) {
+    # Parse relevant ps column headers
+    for (my $i = 0; $i <= $#cols; $i++) {
+      if ($cols[$i] eq "CMD") {
+        $cmd_col = $i;
+      }
+      if ($cols[$i] eq "PID") {
+        $pid_col = $i;
+      }
+    }
+    $cols_known = 1;
+  } else {
+    my $pid = $cols[$pid_col];
+    my $cmd = $cols[$cmd_col];
+    # Match numeric PID and relative path command
+    # -> The intention is only to dump stack traces for hangs in code under
+    # test, which means we probably just built it and are executing by
+    # relative path (e.g. ./my_test or foo/bar_test) rather then by absolute
+    # path (e.g. /usr/bin/time) or PATH search (e.g. grep).
+    if ($pid =~ /^[0-9]+$/ && $cmd =~ /^[^\/ ]+[\/]/) {
+      print "Dumping stacks for $pid...\n";
+      system("pstack $pid || gdb -batch -p $pid -ex 'thread apply all bt'");
+    }
+  }
+}
+close $ps;
diff --git a/src/rocksdb/build_tools/regression_build_test.sh b/src/rocksdb/build_tools/regression_build_test.sh
new file mode 100755
index 000000000..0baeca983
--- /dev/null
+++ b/src/rocksdb/build_tools/regression_build_test.sh
@@ -0,0 +1,396 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+set -e
+
+NUM=10000000
+
+if [ $# -eq 1 ];then
+  DATA_DIR=$1
+elif [ $# -eq 2 ];then
+  DATA_DIR=$1
+  STAT_FILE=$2
+fi
+
+# On the production build servers, set data and stat
+# files/directories not in /tmp or else the tempdir cleaning
+# scripts will make you very unhappy.
+DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
+STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
+
+function cleanup {
+  rm -rf $DATA_DIR
+  rm -f $STAT_FILE.*
+}
+
+trap cleanup EXIT
+
+make release
+
+# measure fillseq + fill up the DB for overwrite benchmark
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq
+
+# measure overwrite performance
+./db_bench \
+    --benchmarks=overwrite \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$((NUM / 10)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6  \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite
+
+# fill up the db for readrandom benchmark (1GB total size)
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# measure readrandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom
+
+# measure readrandom with 6GB block cache and tailing iterator
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --use_tailing_iterator=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandomtailing
+
+# measure readrandom with 100MB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=104857600 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandomsmallblockcache
+
+# measure readrandom with 8k data in memtable
+./db_bench \
+    --benchmarks=overwrite,readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --writes=512 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_mem_sst
+
+
+# fill up the db for readrandom benchmark with filluniquerandom (1GB total size)
+./db_bench \
+    --benchmarks=filluniquerandom \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --writes=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# dummy test just to compact the data
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 1000)) \
+    --reads=$((NUM / 1000)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > /dev/null
+
+# measure readrandom after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_auto_compactions=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom
+
+# measure readwhilewriting after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readwhilewriting \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --benchmark_write_rate_limit=$(( 110 * 1024 )) \
+    --write_buffer_size=100000000 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readwhilewriting
+
+# measure memtable performance -- none of the data gets flushed to disk
+./db_bench \
+    --benchmarks=fillrandom,readrandom, \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --num=$((NUM / 10)) \
+    --reads=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --value_size=10 \
+    --threads=16 > ${STAT_FILE}.memtablefillreadrandom
+
+common_in_mem_args="--db=/dev/shm/rocksdb \
+    --num_levels=6 \
+    --key_size=20 \
+    --prefix_size=12 \
+    --keys_per_prefix=10 \
+    --value_size=100 \
+    --compression_type=none \
+    --compression_ratio=1 \
+    --write_buffer_size=134217728 \
+    --max_write_buffer_number=4 \
+    --level0_file_num_compaction_trigger=8 \
+    --level0_slowdown_writes_trigger=16 \
+    --level0_stop_writes_trigger=24 \
+    --target_file_size_base=134217728 \
+    --max_bytes_for_level_base=1073741824 \
+    --disable_wal=0 \
+    --wal_dir=/dev/shm/rocksdb \
+    --sync=0 \
+    --verify_checksum=1 \
+    --delete_obsolete_files_period_micros=314572800 \
+    --use_plain_table=1 \
+    --open_files=-1 \
+    --mmap_read=1 \
+    --mmap_write=0 \
+    --bloom_bits=10 \
+    --bloom_locality=1 \
+    --perf_level=0"
+
+# prepare a in-memory DB with 50M keys, total DB size is ~6G
+./db_bench \
+    $common_in_mem_args \
+    --statistics=0 \
+    --max_background_compactions=16 \
+    --max_background_flushes=16 \
+    --benchmarks=filluniquerandom \
+    --use_existing_db=0 \
+    --num=52428800 \
+    --threads=1 > /dev/null
+
+# Readwhilewriting
+./db_bench \
+    $common_in_mem_args \
+    --statistics=1 \
+    --max_background_compactions=4 \
+    --max_background_flushes=0 \
+    --benchmarks=readwhilewriting\
+    --use_existing_db=1 \
+    --duration=600 \
+    --threads=32 \
+    --benchmark_write_rate_limit=9502720 > ${STAT_FILE}.readwhilewriting_in_ram
+
+# Seekrandomwhilewriting
+./db_bench \
+    $common_in_mem_args \
+    --statistics=1 \
+    --max_background_compactions=4 \
+    --max_background_flushes=0 \
+    --benchmarks=seekrandomwhilewriting \
+    --use_existing_db=1 \
+    --use_tailing_iterator=1 \
+    --duration=600 \
+    --threads=32 \
+    --benchmark_write_rate_limit=9502720 > ${STAT_FILE}.seekwhilewriting_in_ram
+
+# measure fillseq with bunch of column families
+./db_bench \
+    --benchmarks=fillseq \
+    --num_column_families=500 \
+    --write_buffer_size=1048576 \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --num=$NUM \
+    --writes=$NUM \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq_lots_column_families
+
+# measure overwrite performance with bunch of column families
+./db_bench \
+    --benchmarks=overwrite \
+    --num_column_families=500 \
+    --write_buffer_size=1048576 \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --num=$NUM \
+    --writes=$((NUM / 10)) \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite_lots_column_families
+
+# send data to ods
+function send_to_ods {
+  key="$1"
+  value="$2"
+
+  if [ -z $JENKINS_HOME ]; then
+    # running on devbox, just print out the values
+    echo $1 $2
+    return
+  fi
+
+  if [ -z "$value" ];then
+    echo >&2 "ERROR: Key $key doesn't have a value."
+    return
+  fi
+  curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
+    --connect-timeout 60
+}
+
+function send_benchmark_to_ods {
+  bench="$1"
+  bench_key="$2"
+  file="$3"
+
+  QPS=$(grep $bench $file | awk '{print $5}')
+  P50_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $3}' )
+  P75_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $5}' )
+  P99_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $7}' )
+
+  send_to_ods rocksdb.build.$bench_key.qps $QPS
+  send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS
+  send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS
+  send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS
+}
+
+send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite
+send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq
+send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom
+send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing
+send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache
+send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst
+send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom
+send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
+send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram
+send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram
+send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families
+send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families
diff --git a/src/rocksdb/build_tools/run_ci_db_test.ps1 b/src/rocksdb/build_tools/run_ci_db_test.ps1
new file mode 100644
index 000000000..f20d3213f
--- /dev/null
+++ b/src/rocksdb/build_tools/run_ci_db_test.ps1
@@ -0,0 +1,493 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# This script enables you running RocksDB tests by running
+# All the tests concurrently and utilizing all the cores
+Param(
+  [switch]$EnableJE = $false,  # Look for and use test executable, append _je to listed exclusions
+  [switch]$RunAll = $false,    # Will attempt discover all *_test[_je].exe binaries and run all
+                               # of them as Google suites. I.e. It will run test cases concurrently
+                               # except those mentioned as $Run, those will run as individual test cases
+                               # And any execlued with $ExcludeExes or $ExcludeCases
+                               # It will also not run any individual test cases
+                               # excluded but $ExcludeCasese
+  [switch]$RunAllExe = $false, # Look for and use test exdcutables, append _je to exclusions automatically
+                               # It will attempt to run them in parallel w/o breaking them up on individual
+                               # test cases. Those listed with $ExcludeExes will be excluded
+  [string]$SuiteRun = "",      # Split test suites in test cases and run in parallel, not compatible with $RunAll
+  [string]$Run = "",           # Run specified executables in parallel but do not split to test cases
+  [string]$ExcludeCases = "",  # Exclude test cases, expects a comma separated list, no spaces
+                               # Takes effect when $RunAll or $SuiteRun is specified. Must have full
+                               # Test cases name including a group and a parameter if any
+  [string]$ExcludeExes = "",   # Exclude exes from consideration, expects a comma separated list,
+                               # no spaces. Takes effect only when $RunAll is specified
+  [string]$WorkFolder = "",    # Direct tests to use that folder. SSD or Ram drive are better options.
+   # Number of async tasks that would run concurrently. Recommend a number below 64.
+   # However, CPU utlization really depends on the storage media. Recommend ram based disk.
+   # a value of 1 will run everything serially
+  [int]$Concurrency = 8,
+  [int]$Limit = -1 # -1 means do not limit for test purposes
+)
+
+# Folders and commands must be fullpath to run assuming
+# the current folder is at the root of the git enlistment
+$StartDate = (Get-Date)
+$StartDate
+
+
+$DebugPreference = "Continue"
+
+# These tests are not google test suites and we should guard
+# Against running them as suites
+$RunOnly = New-Object System.Collections.Generic.HashSet[string]
+$RunOnly.Add("c_test") | Out-Null
+$RunOnly.Add("compact_on_deletion_collector_test") | Out-Null
+$RunOnly.Add("merge_test") | Out-Null
+$RunOnly.Add("stringappend_test") | Out-Null # Apparently incorrectly written
+$RunOnly.Add("backup_engine_test") | Out-Null # Disabled
+$RunOnly.Add("timer_queue_test") | Out-Null # Not a gtest
+
+if($RunAll -and $SuiteRun -ne "") {
+    Write-Error "$RunAll and $SuiteRun are not compatible"
+    exit 1
+}
+
+if($RunAllExe -and $Run -ne "") {
+    Write-Error "$RunAllExe and $Run are not compatible"
+    exit 1
+}
+
+# If running under Appveyor assume that root
+[string]$Appveyor = $Env:APPVEYOR_BUILD_FOLDER
+if($Appveyor -ne "") {
+    $RootFolder = $Appveyor
+} else {
+    $RootFolder = $PSScriptRoot -replace '\\build_tools', ''
+}
+
+$LogFolder = -Join($RootFolder, "\db_logs\")
+$BinariesFolder = -Join($RootFolder, "\build\Debug\")
+
+if($WorkFolder -eq "") {
+
+    # If TEST_TMPDIR is set use it
+    [string]$var = $Env:TEST_TMPDIR
+    if($var -eq "") {
+        $WorkFolder = -Join($RootFolder, "\db_tests\")
+        $Env:TEST_TMPDIR = $WorkFolder
+    } else {
+        $WorkFolder = $var
+    }
+} else {
+# Override from a command line
+  $Env:TEST_TMPDIR = $WorkFolder
+}
+
+Write-Output "Root: $RootFolder, WorkFolder: $WorkFolder"
+Write-Output "BinariesFolder: $BinariesFolder, LogFolder: $LogFolder"
+
+# Create test directories in the current folder
+md -Path $WorkFolder -ErrorAction Ignore | Out-Null
+md -Path $LogFolder -ErrorAction Ignore | Out-Null
+
+
+$ExcludeCasesSet = New-Object System.Collections.Generic.HashSet[string]
+if($ExcludeCases -ne "") {
+    Write-Host "ExcludeCases: $ExcludeCases"
+    $l = $ExcludeCases -split ' '
+    ForEach($t in $l) {
+      $ExcludeCasesSet.Add($t) | Out-Null
+    }
+}
+
+$ExcludeExesSet = New-Object System.Collections.Generic.HashSet[string]
+if($ExcludeExes -ne "") {
+    Write-Host "ExcludeExe: $ExcludeExes"
+    $l = $ExcludeExes -split ' '
+    ForEach($t in $l) {
+      $ExcludeExesSet.Add($t) | Out-Null
+    }
+}
+
+
+# Extract the names of its tests by running db_test with --gtest_list_tests.
+# This filter removes the "#"-introduced comments, and expands to
+# fully-qualified names by changing input like this:
+#
+#   DBTest.
+#     Empty
+#     WriteEmptyBatch
+#   MultiThreaded/MultiThreadedDBTest.
+#     MultiThreaded/0  # GetParam() = 0
+#     MultiThreaded/1  # GetParam() = 1
+#   RibbonTypeParamTest/0.  # TypeParam = struct DefaultTypesAndSettings
+#     CompactnessAndBacktrackAndFpRate
+#     Extremes
+#     FindOccupancyForSuccessRate
+#
+# into this:
+#
+#   DBTest.Empty
+#   DBTest.WriteEmptyBatch
+#   MultiThreaded/MultiThreadedDBTest.MultiThreaded/0
+#   MultiThreaded/MultiThreadedDBTest.MultiThreaded/1
+#   RibbonTypeParamTest/0.CompactnessAndBacktrackAndFpRate
+#   RibbonTypeParamTest/0.Extremes
+#   RibbonTypeParamTest/0.FindOccupancyForSuccessRate
+#
+# Output into the parameter in a form TestName -> Log File Name
+function ExtractTestCases([string]$GTestExe, $HashTable) {
+
+    $Tests = @()
+# Run db_test to get a list of tests and store it into $a array
+    &$GTestExe --gtest_list_tests | tee -Variable Tests | Out-Null
+
+    # Current group
+    $Group=""
+
+    ForEach( $l in $Tests) {
+
+      # remove trailing comment if any
+      $l = $l -replace '\s+\#.*',''
+      # Leading whitespace is fine
+      $l = $l -replace '^\s+',''
+      # Trailing dot is a test group but no whitespace
+      if ($l -match "\.$" -and $l -notmatch "\s+") {
+        $Group = $l
+      }  else {
+        # Otherwise it is a test name, remove leading space
+        $test = $l
+        # create a log name
+        $test = "$Group$test"
+
+        if($ExcludeCasesSet.Contains($test)) {
+            Write-Warning "$test case is excluded"
+            continue
+        }
+
+        $test_log = $test -replace '[\./]','_'
+        $test_log += ".log"
+        $log_path = -join ($LogFolder, $test_log)
+
+        # Add to a hashtable
+        $HashTable.Add($test, $log_path);
+      }
+    }
+}
+
+# The function removes trailing .exe siffix if any,
+# creates a name for the log file
+# Then adds the test name if it was not excluded into
+# a HashTable in a form of test_name -> log_path
+function MakeAndAdd([string]$token, $HashTable) {
+
+    $test_name = $token -replace '.exe$', ''
+    $log_name =  -join ($test_name, ".log")
+    $log_path = -join ($LogFolder, $log_name)
+    $HashTable.Add($test_name, $log_path)
+}
+
+# This function takes a list of Suites to run
+# Lists all the test cases in each of the suite
+# and populates HashOfHashes
+# Ordered by suite(exe) @{ Exe = @{ TestCase = LogName }}
+function ProcessSuites($ListOfSuites, $HashOfHashes) {
+
+  $suite_list = $ListOfSuites
+  # Problem: if you run --gtest_list_tests on
+  # a non Google Test executable then it will start executing
+  # and we will get nowhere
+  ForEach($suite in $suite_list) {
+
+    if($RunOnly.Contains($suite)) {
+      Write-Warning "$suite is excluded from running as Google test suite"
+      continue
+    }
+
+    if($EnableJE) {
+      $suite += "_je"
+    }
+
+    $Cases = [ordered]@{}
+    $Cases.Clear()
+    $suite_exe = -Join ($BinariesFolder, $suite)
+    ExtractTestCases -GTestExe $suite_exe -HashTable $Cases
+    if($Cases.Count -gt 0) {
+      $HashOfHashes.Add($suite, $Cases);
+    }
+  }
+
+  # Make logs and run
+  if($CasesToRun.Count -lt 1) {
+     Write-Error "Failed to extract tests from $SuiteRun"
+     exit 1
+  }
+
+}
+
+# This will contain all test executables to run
+
+# Hash table that contains all non suite
+# Test executable to run
+$TestExes = [ordered]@{}
+
+# Check for test exe that are not
+# Google Test Suites
+# Since this is explicitely mentioned it is not subject
+# for exclusions
+if($Run -ne "") {
+
+  $test_list = $Run -split ' '
+  ForEach($t in $test_list) {
+
+    if($EnableJE) {
+      $t += "_je"
+    }
+    MakeAndAdd -token $t -HashTable $TestExes
+  }
+
+  if($TestExes.Count -lt 1) {
+     Write-Error "Failed to extract tests from $Run"
+     exit 1
+  }
+} elseif($RunAllExe) {
+  # Discover all the test binaries
+  if($EnableJE) {
+    $pattern = "*_test_je.exe"
+  } else {
+    $pattern = "*_test.exe"
+  }
+
+  $search_path = -join ($BinariesFolder, $pattern)
+  Write-Host "Binaries Search Path: $search_path"
+
+  $DiscoveredExe = @()
+  dir -Path $search_path | ForEach-Object {
+     $DiscoveredExe += ($_.Name)
+  }
+
+  # Remove exclusions
+  ForEach($e in $DiscoveredExe) {
+    $e = $e -replace '.exe$', ''
+    $bare_name = $e -replace '_je$', ''
+
+    if($ExcludeExesSet.Contains($bare_name)) {
+      Write-Warning "Test $e is excluded"
+      continue
+    }
+    MakeAndAdd -token $e -HashTable $TestExes
+  }
+
+  if($TestExes.Count -lt 1) {
+     Write-Error "Failed to discover test executables"
+     exit 1
+  }
+}
+
+# Ordered by exe @{ Exe = @{ TestCase = LogName }}
+$CasesToRun = [ordered]@{}
+
+if($SuiteRun -ne "") {
+  $suite_list = $SuiteRun -split ' '
+  ProcessSuites -ListOfSuites $suite_list -HashOfHashes $CasesToRun
+} elseif ($RunAll) {
+# Discover all the test binaries
+  if($EnableJE) {
+    $pattern = "*_test_je.exe"
+  } else {
+    $pattern = "*_test.exe"
+  }
+
+  $search_path = -join ($BinariesFolder, $pattern)
+  Write-Host "Binaries Search Path: $search_path"
+
+  $ListOfExe = @()
+  dir -Path $search_path | ForEach-Object {
+     $ListOfExe += ($_.Name)
+  }
+
+  # Exclude those in RunOnly from running as suites
+  $ListOfSuites = @()
+  ForEach($e in $ListOfExe) {
+
+    $e = $e -replace '.exe$', ''
+    $bare_name = $e -replace '_je$', ''
+
+    if($ExcludeExesSet.Contains($bare_name)) {
+      Write-Warning "Test $e is excluded"
+      continue
+    }
+
+    if($RunOnly.Contains($bare_name)) {
+      MakeAndAdd -token $e -HashTable $TestExes
+    } else {
+      $ListOfSuites += $bare_name
+    }
+  }
+
+  ProcessSuites -ListOfSuites $ListOfSuites -HashOfHashes $CasesToRun
+}
+
+
+# Invoke a test with a filter and redirect all output
+$InvokeTestCase = {
+    param($exe, $test, $log);
+    &$exe --gtest_filter=$test > $log 2>&1
+}
+
+# Invoke all tests and redirect output
+$InvokeTestAsync = {
+    param($exe, $log)
+    &$exe > $log 2>&1
+}
+
+# Hash that contains tests to rerun if any failed
+# Those tests will be rerun sequentially
+# $Rerun = [ordered]@{}
+# Test limiting factor here
+[int]$count = 0
+# Overall status
+[bool]$script:success = $true;
+
+function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal)
+{
+    # Array to wait for any of the running jobs
+    $jobs = @()
+    # Hash JobToLog
+    $JobToLog = @{}
+
+    # Wait for all to finish and get the results
+    while(($JobToLog.Count -gt 0) -or
+          ($TestCmds.Count -gt 0) -or
+           ($Suites.Count -gt 0)) {
+
+        # Make sure we have maximum concurrent jobs running if anything
+        # and the $Limit either not set or allows to proceed
+        while(($JobToLog.Count -lt $ConcurrencyVal) -and
+              ((($TestCmds.Count -gt 0) -or ($Suites.Count -gt 0)) -and
+              (($Limit -lt 0) -or ($count -lt $Limit)))) {
+
+            # We always favore suites to run if available
+            [string]$exe_name = ""
+            [string]$log_path = ""
+            $Cases = @{}
+
+            if($Suites.Count -gt 0) {
+              # Will the first one
+              ForEach($e in $Suites.Keys) {
+                $exe_name = $e
+                $Cases = $Suites[$e]
+                break
+              }
+              [string]$test_case = ""
+              [string]$log_path = ""
+              ForEach($c in $Cases.Keys) {
+                 $test_case = $c
+                 $log_path = $Cases[$c]
+                 break
+              }
+
+              Write-Host "Starting $exe_name::$test_case"
+              [string]$Exe =  -Join ($BinariesFolder, $exe_name)
+              $job = Start-Job -Name "$exe_name::$test_case" -ArgumentList @($Exe,$test_case,$log_path) -ScriptBlock $InvokeTestCase
+              $JobToLog.Add($job, $log_path)
+
+              $Cases.Remove($test_case)
+              if($Cases.Count -lt 1) {
+                $Suites.Remove($exe_name)
+              }
+
+            } elseif ($TestCmds.Count -gt 0) {
+
+               ForEach($e in $TestCmds.Keys) {
+                 $exe_name = $e
+                 $log_path = $TestCmds[$e]
+                 break
+               }
+
+              Write-Host "Starting $exe_name"
+              [string]$Exe =  -Join ($BinariesFolder, $exe_name)
+              $job = Start-Job -Name $exe_name -ScriptBlock $InvokeTestAsync -ArgumentList @($Exe,$log_path)
+              $JobToLog.Add($job, $log_path)
+
+              $TestCmds.Remove($exe_name)
+
+            } else {
+                Write-Error "In the job loop but nothing to run"
+                exit 1
+            }
+
+            ++$count
+        } # End of Job starting loop
+
+        if($JobToLog.Count -lt 1) {
+          break
+        }
+
+        $jobs = @()
+        foreach($k in $JobToLog.Keys) { $jobs += $k }
+
+        $completed = Wait-Job -Job $jobs -Any
+        $log = $JobToLog[$completed]
+        $JobToLog.Remove($completed)
+
+        $message = -join @($completed.Name, " State: ", ($completed.State))
+
+        $log_content = @(Get-Content $log)
+
+        if($completed.State -ne "Completed") {
+            $script:success = $false
+            Write-Warning $message
+            $log_content | Write-Warning
+        } else {
+            # Scan the log. If we find PASSED and no occurrence of FAILED
+            # then it is a success
+            [bool]$pass_found = $false
+            ForEach($l in $log_content) {
+
+                if(($l -match "^\[\s+FAILED") -or
+                   ($l -match "Assertion failed:")) {
+                    $pass_found = $false
+                    break
+                }
+
+                if(($l -match "^\[\s+PASSED") -or
+                   ($l -match " : PASSED$") -or
+                    ($l -match "^PASS$") -or   # Special c_test case
+                    ($l -match "Passed all tests!") ) {
+                    $pass_found = $true
+                }
+            }
+
+            if(!$pass_found) {
+                $script:success = $false;
+                Write-Warning $message
+                $log_content | Write-Warning
+            } else {
+                Write-Host $message
+            }
+        }
+
+        # Remove cached job info from the system
+        # Should be no output
+        Receive-Job -Job $completed | Out-Null
+    }
+}
+
+RunJobs -Suites $CasesToRun -TestCmds $TestExes -ConcurrencyVal $Concurrency
+
+$EndDate = (Get-Date)
+
+New-TimeSpan -Start $StartDate -End $EndDate |
+  ForEach-Object {
+    "Elapsed time: {0:g}" -f $_
+  }
+
+
+if(!$script:success) {
+# This does not succeed killing off jobs quick
+# So we simply exit
+#    Remove-Job -Job $jobs -Force
+# indicate failure using this exit code
+    exit 1
+ }
+
+ exit 0
diff --git a/src/rocksdb/build_tools/setup_centos7.sh b/src/rocksdb/build_tools/setup_centos7.sh
new file mode 100755
index 000000000..474d91a3d
--- /dev/null
+++ b/src/rocksdb/build_tools/setup_centos7.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+set -ex
+
+ROCKSDB_VERSION="6.7.3"
+ZSTD_VERSION="1.4.4"
+
+echo "This script configures CentOS with everything needed to build and run RocksDB"
+
+yum update -y && yum install epel-release -y
+
+yum install -y \
+  wget \
+  gcc-c++ \
+  snappy snappy-devel \
+  zlib zlib-devel \
+  bzip2 bzip2-devel \
+  lz4-devel \
+  libasan \
+  gflags
+
+mkdir -pv /usr/local/rocksdb-${ROCKSDB_VERSION}
+ln -sfT /usr/local/rocksdb-${ROCKSDB_VERSION} /usr/local/rocksdb
+
+wget -qO /tmp/zstd-${ZSTD_VERSION}.tar.gz https://github.com/facebook/zstd/archive/v${ZSTD_VERSION}.tar.gz
+wget -qO /tmp/rocksdb-${ROCKSDB_VERSION}.tar.gz https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz
+
+cd /tmp
+
+tar xzvf zstd-${ZSTD_VERSION}.tar.gz
+tar xzvf rocksdb-${ROCKSDB_VERSION}.tar.gz -C /usr/local/
+
+echo "Installing ZSTD..."
+pushd zstd-${ZSTD_VERSION}
+make && make install
+popd
+
+echo "Compiling RocksDB..."
+cd /usr/local/rocksdb
+chown -R vagrant:vagrant /usr/local/rocksdb/
+sudo -u vagrant make static_lib
+cd examples/
+sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ make all
+sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ ./c_simple_example
+
diff --git a/src/rocksdb/build_tools/ubuntu20_image/Dockerfile b/src/rocksdb/build_tools/ubuntu20_image/Dockerfile
new file mode 100644
index 000000000..d81a5e4b2
--- /dev/null
+++ b/src/rocksdb/build_tools/ubuntu20_image/Dockerfile
@@ -0,0 +1,57 @@
+# from official ubuntu 20.04
+FROM ubuntu:20.04
+# update system
+RUN apt-get update && apt-get upgrade -y
+# install basic tools
+RUN apt-get install -y vim wget curl
+# install tzdata noninteractive
+RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
+# install git and default compilers
+RUN apt-get install -y git gcc g++ clang clang-tools
+# install basic package
+RUN apt-get install -y lsb-release software-properties-common gnupg
+# install gflags, tbb
+RUN apt-get install -y libgflags-dev libtbb-dev
+# install compression libs
+RUN apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev
+# install cmake
+RUN apt-get install -y cmake
+RUN apt-get install -y libssl-dev
+# install clang-13
+WORKDIR /root
+RUN wget https://apt.llvm.org/llvm.sh
+RUN chmod +x llvm.sh
+RUN ./llvm.sh 13 all
+# install gcc-7, 8, 10, 11, default is 9
+RUN apt-get install -y gcc-7 g++-7
+RUN apt-get install -y gcc-8 g++-8
+RUN apt-get install -y gcc-10 g++-10
+RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
+RUN apt-get install -y gcc-11 g++-11
+# install apt-get install -y valgrind
+RUN apt-get install -y valgrind
+# install folly depencencies
+RUN apt-get install -y libgoogle-glog-dev
+# install openjdk 8
+RUN apt-get install -y openjdk-8-jdk
+ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
+# install mingw
+RUN apt-get install -y mingw-w64
+
+# install gtest-parallel package
+RUN git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel
+ENV PATH $PATH:/root/gtest-parallel
+
+# install libprotobuf for fuzzers test
+RUN apt-get install -y ninja-build binutils liblzma-dev libz-dev pkg-config autoconf libtool
+RUN git clone --branch v1.0 https://github.com/google/libprotobuf-mutator.git ~/libprotobuf-mutator && cd ~/libprotobuf-mutator && git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f && mkdir build && cd build && cmake .. -GNinja -DCMAKE_C_COMPILER=clang-13 -DCMAKE_CXX_COMPILER=clang++-13 -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON && ninja && ninja install
+ENV PKG_CONFIG_PATH /usr/local/OFF/:/root/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/
+ENV PROTOC_BIN /root/libprotobuf-mutator/build/external.protobuf/bin/protoc
+
+# install the latest google benchmark
+RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark
+RUN cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install
+
+# clean up
+RUN rm -rf /var/lib/apt/lists/*
+RUN rm -rf /root/benchmark
diff --git a/src/rocksdb/build_tools/update_dependencies.sh b/src/rocksdb/build_tools/update_dependencies.sh
new file mode 100755
index 000000000..a2fdcd0ee
--- /dev/null
+++ b/src/rocksdb/build_tools/update_dependencies.sh
@@ -0,0 +1,149 @@
+#!/bin/sh
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# Update dependencies.sh file with the latest avaliable versions
+
+BASEDIR=$(dirname $0)
+OUTPUT=""
+
+function log_header()
+{
+  echo "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved." >> "$OUTPUT"
+  echo "# The file is generated using update_dependencies.sh." >> "$OUTPUT"
+}
+
+
+function log_variable()
+{
+  echo "$1=${!1}" >> "$OUTPUT"
+}
+
+
+TP2_LATEST="/data/users/$USER/fbsource/fbcode/third-party2/"
+## $1 => lib name
+## $2 => lib version (if not provided, will try to pick latest)
+## $3 => platform (if not provided, will try to pick latest gcc)
+##
+## get_lib_base will set a variable named ${LIB_NAME}_BASE to the lib location
+function get_lib_base()
+{
+  local lib_name=$1
+  local lib_version=$2
+  local lib_platform=$3
+
+  local result="$TP2_LATEST/$lib_name/"
+  
+  # Lib Version
+  if [ -z "$lib_version" ] || [ "$lib_version" = "LATEST" ]; then
+    # version is not provided, use latest
+    result=`ls -dr1v $result/*/ | head -n1`
+  else
+    result="$result/$lib_version/"
+  fi
+  
+  # Lib Platform
+  if [ -z "$lib_platform" ]; then
+    # platform is not provided, use latest gcc
+    result=`ls -dr1v $result/gcc-*[^fb]/ | head -n1`
+  else
+    echo $lib_platform
+    result="$result/$lib_platform/"
+  fi
+  
+  result=`ls -1d $result/*/ | head -n1`
+
+  echo Finding link $result
+  
+  # lib_name => LIB_NAME_BASE
+  local __res_var=${lib_name^^}"_BASE"
+  __res_var=`echo $__res_var | tr - _`
+  # LIB_NAME_BASE=$result
+  eval $__res_var=`readlink -f $result`
+  
+  log_variable $__res_var
+}
+
+###########################################################
+#                platform010 dependencies                 #
+###########################################################
+
+OUTPUT="$BASEDIR/dependencies_platform010.sh"
+
+rm -f "$OUTPUT"
+touch "$OUTPUT"
+
+echo "Writing dependencies to $OUTPUT"
+
+# Compilers locations
+GCC_BASE=`readlink -f $TP2_LATEST/gcc/11.x/centos7-native/*/`
+CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/12/platform010/*/`
+
+log_header
+log_variable GCC_BASE
+log_variable CLANG_BASE
+
+# Libraries locations
+get_lib_base libgcc     11.x    platform010
+get_lib_base glibc      2.34    platform010
+get_lib_base snappy     LATEST  platform010
+get_lib_base zlib       LATEST  platform010
+get_lib_base bzip2      LATEST  platform010
+get_lib_base lz4        LATEST  platform010
+get_lib_base zstd       LATEST  platform010
+get_lib_base gflags     LATEST  platform010
+get_lib_base jemalloc   LATEST  platform010
+get_lib_base numa       LATEST  platform010
+get_lib_base libunwind  LATEST  platform010
+get_lib_base tbb        2018_U5 platform010
+get_lib_base liburing   LATEST  platform010
+get_lib_base benchmark  LATEST  platform010
+
+get_lib_base kernel-headers fb platform010
+get_lib_base binutils   LATEST centos7-native
+get_lib_base valgrind   LATEST platform010
+get_lib_base lua        5.3.4  platform010
+
+git diff $OUTPUT
+
+
+###########################################################
+#                platform009 dependencies                 #
+###########################################################
+
+OUTPUT="$BASEDIR/dependencies_platform009.sh"
+
+rm -f "$OUTPUT"
+touch "$OUTPUT"
+
+echo "Writing dependencies to $OUTPUT"
+
+# Compilers locations
+GCC_BASE=`readlink -f $TP2_LATEST/gcc/9.x/centos7-native/*/`
+CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/9.0.0/platform009/*/`
+
+log_header
+log_variable GCC_BASE
+log_variable CLANG_BASE
+
+# Libraries locations
+get_lib_base libgcc     9.x     platform009
+get_lib_base glibc      2.30    platform009
+get_lib_base snappy     LATEST  platform009
+get_lib_base zlib       LATEST  platform009
+get_lib_base bzip2      LATEST  platform009
+get_lib_base lz4        LATEST  platform009
+get_lib_base zstd       LATEST  platform009
+get_lib_base gflags     LATEST  platform009
+get_lib_base jemalloc   LATEST  platform009
+get_lib_base numa       LATEST  platform009
+get_lib_base libunwind  LATEST  platform009
+get_lib_base tbb        2018_U5 platform009
+get_lib_base liburing   LATEST  platform009
+get_lib_base benchmark  LATEST  platform009
+
+get_lib_base kernel-headers fb platform009
+get_lib_base binutils   LATEST centos7-native
+get_lib_base valgrind   LATEST platform009
+get_lib_base lua        5.3.4  platform009
+
+git diff $OUTPUT
diff --git a/src/rocksdb/build_tools/version.sh b/src/rocksdb/build_tools/version.sh
new file mode 100755
index 000000000..dbc1a9296
--- /dev/null
+++ b/src/rocksdb/build_tools/version.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+if [ "$#" = "0" ]; then
+  echo "Usage: $0 major|minor|patch|full"
+  exit 1
+fi
+
+if [ "$1" = "major" ]; then
+  cat include/rocksdb/version.h  | grep MAJOR | head -n1 | awk '{print $3}'
+fi
+if [ "$1" = "minor" ]; then
+  cat include/rocksdb/version.h  | grep MINOR | head -n1 | awk '{print $3}'
+fi
+if [ "$1" = "patch" ]; then
+  cat include/rocksdb/version.h  | grep PATCH | head -n1 | awk '{print $3}'
+fi
+if [ "$1" = "full" ]; then
+  awk '/#define ROCKSDB/ { env[$2] = $3 }
+       END { printf "%s.%s.%s\n", env["ROCKSDB_MAJOR"],
+                                  env["ROCKSDB_MINOR"],
+                                  env["ROCKSDB_PATCH"] }'  \
+      include/rocksdb/version.h
+fi
diff --git a/src/rocksdb/cache/cache.cc b/src/rocksdb/cache/cache.cc
new file mode 100644
index 000000000..7d23fb757
--- /dev/null
+++ b/src/rocksdb/cache/cache.cc
@@ -0,0 +1,129 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cache.h"
+
+#include "cache/lru_cache.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+static std::unordered_map<std::string, OptionTypeInfo>
+    lru_cache_options_type_info = {
+        {"capacity",
+         {offsetof(struct LRUCacheOptions, capacity), OptionType::kSizeT,
+          OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+        {"num_shard_bits",
+         {offsetof(struct LRUCacheOptions, num_shard_bits), OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+        {"strict_capacity_limit",
+         {offsetof(struct LRUCacheOptions, strict_capacity_limit),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"high_pri_pool_ratio",
+         {offsetof(struct LRUCacheOptions, high_pri_pool_ratio),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"low_pri_pool_ratio",
+         {offsetof(struct LRUCacheOptions, low_pri_pool_ratio),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    comp_sec_cache_options_type_info = {
+        {"capacity",
+         {offsetof(struct CompressedSecondaryCacheOptions, capacity),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"num_shard_bits",
+         {offsetof(struct CompressedSecondaryCacheOptions, num_shard_bits),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"compression_type",
+         {offsetof(struct CompressedSecondaryCacheOptions, compression_type),
+          OptionType::kCompressionType, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"compress_format_version",
+         {offsetof(struct CompressedSecondaryCacheOptions,
+                   compress_format_version),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"enable_custom_split_merge",
+         {offsetof(struct CompressedSecondaryCacheOptions,
+                   enable_custom_split_merge),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+};
+#endif  // ROCKSDB_LITE
+
+Status SecondaryCache::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<SecondaryCache>* result) {
+  if (value.find("compressed_secondary_cache://") == 0) {
+    std::string args = value;
+    args.erase(0, std::strlen("compressed_secondary_cache://"));
+    Status status;
+    std::shared_ptr<SecondaryCache> sec_cache;
+
+#ifndef ROCKSDB_LITE
+    CompressedSecondaryCacheOptions sec_cache_opts;
+    status = OptionTypeInfo::ParseStruct(config_options, "",
+                                         &comp_sec_cache_options_type_info, "",
+                                         args, &sec_cache_opts);
+    if (status.ok()) {
+      sec_cache = NewCompressedSecondaryCache(sec_cache_opts);
+    }
+
+#else
+    (void)config_options;
+    status = Status::NotSupported(
+        "Cannot load compressed secondary cache in LITE mode ", args);
+#endif  //! ROCKSDB_LITE
+
+    if (status.ok()) {
+      result->swap(sec_cache);
+    }
+    return status;
+  } else {
+    return LoadSharedObject<SecondaryCache>(config_options, value, nullptr,
+                                            result);
+  }
+}
+
+Status Cache::CreateFromString(const ConfigOptions& config_options,
+                               const std::string& value,
+                               std::shared_ptr<Cache>* result) {
+  Status status;
+  std::shared_ptr<Cache> cache;
+  if (value.find('=') == std::string::npos) {
+    cache = NewLRUCache(ParseSizeT(value));
+  } else {
+#ifndef ROCKSDB_LITE
+    LRUCacheOptions cache_opts;
+    status = OptionTypeInfo::ParseStruct(config_options, "",
+                                         &lru_cache_options_type_info, "",
+                                         value, &cache_opts);
+    if (status.ok()) {
+      cache = NewLRUCache(cache_opts);
+    }
+#else
+    (void)config_options;
+    status = Status::NotSupported("Cannot load cache in LITE mode ", value);
+#endif  //! ROCKSDB_LITE
+  }
+  if (status.ok()) {
+    result->swap(cache);
+  }
+  return status;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_bench.cc b/src/rocksdb/cache/cache_bench.cc
new file mode 100644
index 000000000..f836939a3
--- /dev/null
+++ b/src/rocksdb/cache/cache_bench.cc
@@ -0,0 +1,20 @@
+//  Copyright (c) 2013-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+#include "rocksdb/cache_bench_tool.h"
+int main(int argc, char** argv) {
+  return ROCKSDB_NAMESPACE::cache_bench_tool(argc, argv);
+}
+#endif  // GFLAGS
diff --git a/src/rocksdb/cache/cache_bench_tool.cc b/src/rocksdb/cache/cache_bench_tool.cc
new file mode 100644
index 000000000..73360f414
--- /dev/null
+++ b/src/rocksdb/cache/cache_bench_tool.cc
@@ -0,0 +1,973 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache_key.h"
+#ifdef GFLAGS
+#include <cinttypes>
+#include <cstddef>
+#include <cstdio>
+#include <limits>
+#include <memory>
+#include <set>
+#include <sstream>
+
+#include "db/db_impl/db_impl.h"
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/coding.h"
+#include "util/distributed_mutex.h"
+#include "util/gflags_compat.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+static constexpr uint32_t KiB = uint32_t{1} << 10;
+static constexpr uint32_t MiB = KiB << 10;
+static constexpr uint64_t GiB = MiB << 10;
+
+DEFINE_uint32(threads, 16, "Number of concurrent threads to run.");
+DEFINE_uint64(cache_size, 1 * GiB,
+              "Number of bytes to use as a cache of uncompressed data.");
+DEFINE_uint32(num_shard_bits, 6, "shard_bits.");
+
+DEFINE_double(resident_ratio, 0.25,
+              "Ratio of keys fitting in cache to keyspace.");
+DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread.");
+DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
+
+DEFINE_uint32(skew, 5, "Degree of skew in key selection");
+DEFINE_bool(populate_cache, true, "Populate cache before operations");
+
+DEFINE_uint32(lookup_insert_percent, 87,
+              "Ratio of lookup (+ insert on not found) to total workload "
+              "(expressed as a percentage)");
+DEFINE_uint32(insert_percent, 2,
+              "Ratio of insert to total workload (expressed as a percentage)");
+DEFINE_uint32(lookup_percent, 10,
+              "Ratio of lookup to total workload (expressed as a percentage)");
+DEFINE_uint32(erase_percent, 1,
+              "Ratio of erase to total workload (expressed as a percentage)");
+DEFINE_bool(gather_stats, false,
+            "Whether to periodically simulate gathering block cache stats, "
+            "using one more thread.");
+DEFINE_uint32(
+    gather_stats_sleep_ms, 1000,
+    "How many milliseconds to sleep between each gathering of stats.");
+
+DEFINE_uint32(gather_stats_entries_per_lock, 256,
+              "For Cache::ApplyToAllEntries");
+DEFINE_bool(skewed, false, "If true, skew the key access distribution");
+
+DEFINE_bool(lean, false,
+            "If true, no additional computation is performed besides cache "
+            "operations.");
+
+#ifndef ROCKSDB_LITE
+DEFINE_string(secondary_cache_uri, "",
+              "Full URI for creating a custom secondary cache object");
+static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
+#endif  // ROCKSDB_LITE
+
+DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
+
+// ## BEGIN stress_cache_key sub-tool options ##
+// See class StressCacheKey below.
+DEFINE_bool(stress_cache_key, false,
+            "If true, run cache key stress test instead");
+DEFINE_uint32(
+    sck_files_per_day, 2500000,
+    "(-stress_cache_key) Simulated files generated per simulated day");
+// NOTE: Giving each run a specified lifetime, rather than e.g. "until
+// first collision" ensures equal skew from start-up, when collisions are
+// less likely.
+DEFINE_uint32(sck_days_per_run, 90,
+              "(-stress_cache_key) Number of days to simulate in each run");
+// NOTE: The number of observed collisions directly affects the relative
+// accuracy of the predicted probabilities. 15 observations should be well
+// within factor-of-2 accuracy.
+DEFINE_uint32(
+    sck_min_collision, 15,
+    "(-stress_cache_key) Keep running until this many collisions seen");
+// sck_file_size_mb can be thought of as average file size. The simulation is
+// not precise enough to care about the distribution of file sizes; other
+// simulations (https://github.com/pdillinger/unique_id/tree/main/monte_carlo)
+// indicate the distribution only makes a small difference (e.g. < 2x factor)
+DEFINE_uint32(
+    sck_file_size_mb, 32,
+    "(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
+DEFINE_uint32(sck_reopen_nfiles, 100,
+              "(-stress_cache_key) Simulate DB re-open average every n files");
+DEFINE_uint32(sck_newdb_nreopen, 1000,
+              "(-stress_cache_key) Simulate new DB average every n re-opens");
+DEFINE_uint32(sck_restarts_per_day, 24,
+              "(-stress_cache_key) Average simulated process restarts per day "
+              "(across DBs)");
+DEFINE_uint32(
+    sck_db_count, 100,
+    "(-stress_cache_key) Parallel DBs in simulation sharing a block cache");
+DEFINE_uint32(
+    sck_table_bits, 20,
+    "(-stress_cache_key) Log2 number of tracked (live) files (across DBs)");
+// sck_keep_bits being well below full 128 bits amplifies the collision
+// probability so that the true probability can be estimated through observed
+// collisions. (More explanation below.)
+DEFINE_uint32(
+    sck_keep_bits, 50,
+    "(-stress_cache_key) Number of bits to keep from each cache key (<= 64)");
+// sck_randomize is used to validate whether cache key is performing "better
+// than random." Even with this setting, file offsets are not randomized.
+DEFINE_bool(sck_randomize, false,
+            "(-stress_cache_key) Randomize (hash) cache key");
+// See https://github.com/facebook/rocksdb/pull/9058
+DEFINE_bool(sck_footer_unique_id, false,
+            "(-stress_cache_key) Simulate using proposed footer unique id");
+// ## END stress_cache_key sub-tool options ##
+
+namespace ROCKSDB_NAMESPACE {
+
+class CacheBench;
+namespace {
+// State shared by all concurrent executions of the same benchmark.
+class SharedState {
+ public:
+  explicit SharedState(CacheBench* cache_bench)
+      : cv_(&mu_),
+        num_initialized_(0),
+        start_(false),
+        num_done_(0),
+        cache_bench_(cache_bench) {}
+
+  ~SharedState() {}
+
+  port::Mutex* GetMutex() { return &mu_; }
+
+  port::CondVar* GetCondVar() { return &cv_; }
+
+  CacheBench* GetCacheBench() const { return cache_bench_; }
+
+  void IncInitialized() { num_initialized_++; }
+
+  void IncDone() { num_done_++; }
+
+  bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; }
+
+  bool AllDone() const { return num_done_ >= FLAGS_threads; }
+
+  void SetStart() { start_ = true; }
+
+  bool Started() const { return start_; }
+
+ private:
+  port::Mutex mu_;
+  port::CondVar cv_;
+
+  uint64_t num_initialized_;
+  bool start_;
+  uint64_t num_done_;
+
+  CacheBench* cache_bench_;
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  uint32_t tid;
+  Random64 rnd;
+  SharedState* shared;
+  HistogramImpl latency_ns_hist;
+  uint64_t duration_us = 0;
+
+  ThreadState(uint32_t index, SharedState* _shared)
+      : tid(index), rnd(1000 + index), shared(_shared) {}
+};
+
+struct KeyGen {
+  char key_data[27];
+
+  Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) {
+    uint64_t key = 0;
+    if (!FLAGS_skewed) {
+      uint64_t raw = rnd.Next();
+      // Skew according to setting
+      for (uint32_t i = 0; i < FLAGS_skew; ++i) {
+        raw = std::min(raw, rnd.Next());
+      }
+      key = FastRange64(raw, max_key);
+    } else {
+      key = rnd.Skewed(max_log);
+      if (key > max_key) {
+        key -= max_key;
+      }
+    }
+    // Variable size and alignment
+    size_t off = key % 8;
+    key_data[0] = char{42};
+    EncodeFixed64(key_data + 1, key);
+    key_data[9] = char{11};
+    EncodeFixed64(key_data + 10, key);
+    key_data[18] = char{4};
+    EncodeFixed64(key_data + 19, key);
+    assert(27 >= kCacheKeySize);
+    return Slice(&key_data[off], kCacheKeySize);
+  }
+};
+
+char* createValue(Random64& rnd) {
+  char* rv = new char[FLAGS_value_bytes];
+  // Fill with some filler data, and take some CPU time
+  for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
+    EncodeFixed64(rv + i, rnd.Next());
+  }
+  return rv;
+}
+
+// Callbacks for secondary cache
+size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; }
+
+Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) {
+  memcpy(out, obj, size);
+  return Status::OK();
+}
+
+// Different deleters to simulate using deleter to gather
+// stats on the code origin and kind of cache entries.
+void deleter1(const Slice& /*key*/, void* value) {
+  delete[] static_cast<char*>(value);
+}
+void deleter2(const Slice& /*key*/, void* value) {
+  delete[] static_cast<char*>(value);
+}
+void deleter3(const Slice& /*key*/, void* value) {
+  delete[] static_cast<char*>(value);
+}
+
+Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1);
+Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2);
+Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3);
+}  // namespace
+
+class CacheBench {
+  static constexpr uint64_t kHundredthUint64 =
+      std::numeric_limits<uint64_t>::max() / 100U;
+
+ public:
+  CacheBench()
+      : max_key_(static_cast<uint64_t>(FLAGS_cache_size / FLAGS_resident_ratio /
+                                       FLAGS_value_bytes)),
+        lookup_insert_threshold_(kHundredthUint64 *
+                                 FLAGS_lookup_insert_percent),
+        insert_threshold_(lookup_insert_threshold_ +
+                          kHundredthUint64 * FLAGS_insert_percent),
+        lookup_threshold_(insert_threshold_ +
+                          kHundredthUint64 * FLAGS_lookup_percent),
+        erase_threshold_(lookup_threshold_ +
+                         kHundredthUint64 * FLAGS_erase_percent),
+        skewed_(FLAGS_skewed) {
+    if (erase_threshold_ != 100U * kHundredthUint64) {
+      fprintf(stderr, "Percentages must add to 100.\n");
+      exit(1);
+    }
+
+    max_log_ = 0;
+    if (skewed_) {
+      uint64_t max_key = max_key_;
+      while (max_key >>= 1) max_log_++;
+      if (max_key > (static_cast<uint64_t>(1) << max_log_)) max_log_++;
+    }
+
+    if (FLAGS_cache_type == "clock_cache") {
+      fprintf(stderr, "Old clock cache implementation has been removed.\n");
+      exit(1);
+    } else if (FLAGS_cache_type == "hyper_clock_cache") {
+      cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes,
+                                      FLAGS_num_shard_bits)
+                   .MakeSharedCache();
+    } else if (FLAGS_cache_type == "lru_cache") {
+      LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
+                           false /* strict_capacity_limit */,
+                           0.5 /* high_pri_pool_ratio */);
+#ifndef ROCKSDB_LITE
+      if (!FLAGS_secondary_cache_uri.empty()) {
+        Status s = SecondaryCache::CreateFromString(
+            ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
+        if (secondary_cache == nullptr) {
+          fprintf(
+              stderr,
+              "No secondary cache registered matching string: %s status=%s\n",
+              FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
+          exit(1);
+        }
+        opts.secondary_cache = secondary_cache;
+      }
+#endif  // ROCKSDB_LITE
+
+      cache_ = NewLRUCache(opts);
+    } else {
+      fprintf(stderr, "Cache type not supported.");
+      exit(1);
+    }
+  }
+
+  ~CacheBench() {}
+
+  void PopulateCache() {
+    Random64 rnd(1);
+    KeyGen keygen;
+    for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
+      Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_),
+                                createValue(rnd), &helper1, FLAGS_value_bytes);
+      assert(s.ok());
+    }
+  }
+
+  bool Run() {
+    const auto clock = SystemClock::Default().get();
+
+    PrintEnv();
+    SharedState shared(this);
+    std::vector<std::unique_ptr<ThreadState> > threads(FLAGS_threads);
+    for (uint32_t i = 0; i < FLAGS_threads; i++) {
+      threads[i].reset(new ThreadState(i, &shared));
+      std::thread(ThreadBody, threads[i].get()).detach();
+    }
+
+    HistogramImpl stats_hist;
+    std::string stats_report;
+    std::thread stats_thread(StatsBody, &shared, &stats_hist, &stats_report);
+
+    uint64_t start_time;
+    {
+      MutexLock l(shared.GetMutex());
+      while (!shared.AllInitialized()) {
+        shared.GetCondVar()->Wait();
+      }
+      // Record start time
+      start_time = clock->NowMicros();
+
+      // Start all threads
+      shared.SetStart();
+      shared.GetCondVar()->SignalAll();
+
+      // Wait threads to complete
+      while (!shared.AllDone()) {
+        shared.GetCondVar()->Wait();
+      }
+    }
+
+    // Stats gathering is considered background work. This time measurement
+    // is for foreground work, and not really ideal for that. See below.
+    uint64_t end_time = clock->NowMicros();
+    stats_thread.join();
+
+    // Wall clock time - includes idle time if threads
+    // finish at different times (not ideal).
+    double elapsed_secs = static_cast<double>(end_time - start_time) * 1e-6;
+    uint32_t ops_per_sec = static_cast<uint32_t>(
+        1.0 * FLAGS_threads * FLAGS_ops_per_thread / elapsed_secs);
+    printf("Complete in %.3f s; Rough parallel ops/sec = %u\n", elapsed_secs,
+           ops_per_sec);
+
+    // Total time in each thread (more accurate throughput measure)
+    elapsed_secs = 0;
+    for (uint32_t i = 0; i < FLAGS_threads; i++) {
+      elapsed_secs += threads[i]->duration_us * 1e-6;
+    }
+    ops_per_sec = static_cast<uint32_t>(1.0 * FLAGS_threads *
+                                        FLAGS_ops_per_thread / elapsed_secs);
+    printf("Thread ops/sec = %u\n", ops_per_sec);
+
+    printf("\nOperation latency (ns):\n");
+    HistogramImpl combined;
+    for (uint32_t i = 0; i < FLAGS_threads; i++) {
+      combined.Merge(threads[i]->latency_ns_hist);
+    }
+    printf("%s", combined.ToString().c_str());
+
+    if (FLAGS_gather_stats) {
+      printf("\nGather stats latency (us):\n");
+      printf("%s", stats_hist.ToString().c_str());
+    }
+
+    printf("\n%s", stats_report.c_str());
+
+    return true;
+  }
+
+ private:
+  std::shared_ptr<Cache> cache_;
+  const uint64_t max_key_;
+  // Cumulative thresholds in the space of a random uint64_t
+  const uint64_t lookup_insert_threshold_;
+  const uint64_t insert_threshold_;
+  const uint64_t lookup_threshold_;
+  const uint64_t erase_threshold_;
+  const bool skewed_;
+  int max_log_;
+
+  // A benchmark version of gathering stats on an active block cache by
+  // iterating over it. The primary purpose is to measure the impact of
+  // gathering stats with ApplyToAllEntries on throughput- and
+  // latency-sensitive Cache users. Performance of stats gathering is
+  // also reported. The last set of gathered stats is also reported, for
+  // manual sanity checking for logical errors or other unexpected
+  // behavior of cache_bench or the underlying Cache.
+  static void StatsBody(SharedState* shared, HistogramImpl* stats_hist,
+                        std::string* stats_report) {
+    if (!FLAGS_gather_stats) {
+      return;
+    }
+    const auto clock = SystemClock::Default().get();
+    uint64_t total_key_size = 0;
+    uint64_t total_charge = 0;
+    uint64_t total_entry_count = 0;
+    uint64_t table_occupancy = 0;
+    uint64_t table_size = 0;
+    std::set<Cache::DeleterFn> deleters;
+    StopWatchNano timer(clock);
+
+    for (;;) {
+      uint64_t time;
+      time = clock->NowMicros();
+      uint64_t deadline = time + uint64_t{FLAGS_gather_stats_sleep_ms} * 1000;
+
+      {
+        MutexLock l(shared->GetMutex());
+        for (;;) {
+          if (shared->AllDone()) {
+            std::ostringstream ostr;
+            ostr << "Most recent cache entry stats:\n"
+                 << "Number of entries: " << total_entry_count << "\n"
+                 << "Table occupancy: " << table_occupancy << " / "
+                 << table_size << " = "
+                 << (100.0 * table_occupancy / table_size) << "%\n"
+                 << "Total charge: " << BytesToHumanString(total_charge) << "\n"
+                 << "Average key size: "
+                 << (1.0 * total_key_size / total_entry_count) << "\n"
+                 << "Average charge: "
+                 << BytesToHumanString(static_cast<uint64_t>(
+                        1.0 * total_charge / total_entry_count))
+                 << "\n"
+                 << "Unique deleters: " << deleters.size() << "\n";
+            *stats_report = ostr.str();
+            return;
+          }
+          if (clock->NowMicros() >= deadline) {
+            break;
+          }
+          uint64_t diff = deadline - std::min(clock->NowMicros(), deadline);
+          shared->GetCondVar()->TimedWait(diff + 1);
+        }
+      }
+
+      // Now gather stats, outside of mutex
+      total_key_size = 0;
+      total_charge = 0;
+      total_entry_count = 0;
+      deleters.clear();
+      auto fn = [&](const Slice& key, void* /*value*/, size_t charge,
+                    Cache::DeleterFn deleter) {
+        total_key_size += key.size();
+        total_charge += charge;
+        ++total_entry_count;
+        // Something slightly more expensive as in (future) stats by category
+        deleters.insert(deleter);
+      };
+      timer.Start();
+      Cache::ApplyToAllEntriesOptions opts;
+      opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock;
+      shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts);
+      table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount();
+      table_size = shared->GetCacheBench()->cache_->GetTableAddressCount();
+      stats_hist->Add(timer.ElapsedNanos() / 1000);
+    }
+  }
+
+  static void ThreadBody(ThreadState* thread) {
+    SharedState* shared = thread->shared;
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncInitialized();
+      if (shared->AllInitialized()) {
+        shared->GetCondVar()->SignalAll();
+      }
+      while (!shared->Started()) {
+        shared->GetCondVar()->Wait();
+      }
+    }
+    thread->shared->GetCacheBench()->OperateCache(thread);
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncDone();
+      if (shared->AllDone()) {
+        shared->GetCondVar()->SignalAll();
+      }
+    }
+  }
+
+  void OperateCache(ThreadState* thread) {
+    // To use looked-up values
+    uint64_t result = 0;
+    // To hold handles for a non-trivial amount of time
+    Cache::Handle* handle = nullptr;
+    KeyGen gen;
+    const auto clock = SystemClock::Default().get();
+    uint64_t start_time = clock->NowMicros();
+    StopWatchNano timer(clock);
+
+    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
+      Slice key = gen.GetRand(thread->rnd, max_key_, max_log_);
+      uint64_t random_op = thread->rnd.Next();
+      Cache::CreateCallback create_cb = [](const void* buf, size_t size,
+                                           void** out_obj,
+                                           size_t* charge) -> Status {
+        *out_obj = reinterpret_cast<void*>(new char[size]);
+        memcpy(*out_obj, buf, size);
+        *charge = size;
+        return Status::OK();
+      };
+
+      timer.Start();
+
+      if (random_op < lookup_insert_threshold_) {
+        if (handle) {
+          cache_->Release(handle);
+          handle = nullptr;
+        }
+        // do lookup
+        handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
+                                true);
+        if (handle) {
+          if (!FLAGS_lean) {
+            // do something with the data
+            result += NPHash64(static_cast<char*>(cache_->Value(handle)),
+                               FLAGS_value_bytes);
+          }
+        } else {
+          // do insert
+          Status s = cache_->Insert(key, createValue(thread->rnd), &helper2,
+                                    FLAGS_value_bytes, &handle);
+          assert(s.ok());
+        }
+      } else if (random_op < insert_threshold_) {
+        if (handle) {
+          cache_->Release(handle);
+          handle = nullptr;
+        }
+        // do insert
+        Status s = cache_->Insert(key, createValue(thread->rnd), &helper3,
+                                  FLAGS_value_bytes, &handle);
+        assert(s.ok());
+      } else if (random_op < lookup_threshold_) {
+        if (handle) {
+          cache_->Release(handle);
+          handle = nullptr;
+        }
+        // do lookup
+        handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
+                                true);
+        if (handle) {
+          if (!FLAGS_lean) {
+            // do something with the data
+            result += NPHash64(static_cast<char*>(cache_->Value(handle)),
+                               FLAGS_value_bytes);
+          }
+        }
+      } else if (random_op < erase_threshold_) {
+        // do erase
+        cache_->Erase(key);
+      } else {
+        // Should be extremely unlikely (noop)
+        assert(random_op >= kHundredthUint64 * 100U);
+      }
+      thread->latency_ns_hist.Add(timer.ElapsedNanos());
+    }
+    if (handle) {
+      cache_->Release(handle);
+      handle = nullptr;
+    }
+    // Ensure computations on `result` are not optimized away.
+    if (result == 1) {
+      printf("You are extremely unlucky(2). Try again.\n");
+      exit(1);
+    }
+    thread->duration_us = clock->NowMicros() - start_time;
+  }
+
+  void PrintEnv() const {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+    printf(
+        "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
+#endif
+#ifndef NDEBUG
+    printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
+    printf("DMutex impl name    : %s\n", DMutex::kName());
+    printf("Number of threads   : %u\n", FLAGS_threads);
+    printf("Ops per thread      : %" PRIu64 "\n", FLAGS_ops_per_thread);
+    printf("Cache size          : %s\n",
+           BytesToHumanString(FLAGS_cache_size).c_str());
+    printf("Num shard bits      : %u\n", FLAGS_num_shard_bits);
+    printf("Max key             : %" PRIu64 "\n", max_key_);
+    printf("Resident ratio      : %g\n", FLAGS_resident_ratio);
+    printf("Skew degree         : %u\n", FLAGS_skew);
+    printf("Populate cache      : %d\n", int{FLAGS_populate_cache});
+    printf("Lookup+Insert pct   : %u%%\n", FLAGS_lookup_insert_percent);
+    printf("Insert percentage   : %u%%\n", FLAGS_insert_percent);
+    printf("Lookup percentage   : %u%%\n", FLAGS_lookup_percent);
+    printf("Erase percentage    : %u%%\n", FLAGS_erase_percent);
+    std::ostringstream stats;
+    if (FLAGS_gather_stats) {
+      stats << "enabled (" << FLAGS_gather_stats_sleep_ms << "ms, "
+            << FLAGS_gather_stats_entries_per_lock << "/lock)";
+    } else {
+      stats << "disabled";
+    }
+    printf("Gather stats        : %s\n", stats.str().c_str());
+    printf("----------------------------\n");
+  }
+};
+
+// cache_bench -stress_cache_key is an independent embedded tool for
+// estimating the probability of CacheKey collisions through simulation.
+// At a high level, it simulates generating SST files over many months,
+// keeping them in the DB and/or cache for some lifetime while staying
+// under resource caps, and checking for any cache key collisions that
+// arise among the set of live files. For efficient simulation, we make
+// some simplifying "pessimistic" assumptions (that only increase the
+// chance of the simulation reporting a collision relative to the chance
+// of collision in practice):
+// * Every generated file has a cache entry for every byte offset in the
+// file (contiguous range of cache keys)
+// * All of every file is cached for its entire lifetime. (Here "lifetime"
+// is technically the union of DB and Cache lifetime, though we only
+// model a generous DB lifetime, where space usage is always maximized.
+// In a effective Cache, lifetime in cache can only substantially exceed
+// lifetime in DB if there is little cache activity; cache activity is
+// required to hit cache key collisions.)
+//
+// It would be possible to track an exact set of cache key ranges for the
+// set of live files, but we would have no hope of observing collisions
+// (overlap in live files) in our simulation. We need to employ some way
+// of amplifying collision probability that allows us to predict the real
+// collision probability by extrapolation from observed collisions. Our
+// basic approach is to reduce each cache key range down to some smaller
+// number of bits, and limiting to bits that are shared over the whole
+// range.  Now we can observe collisions using a set of smaller stripped-down
+// (reduced) cache keys. Let's do some case analysis to understand why this
+// works:
+// * No collision in reduced key - because the reduction is a pure function
+// this implies no collision in the full keys
+// * Collision detected between two reduced keys - either
+//   * The reduction has dropped some structured uniqueness info (from one of
+// session counter or file number; file offsets are never materialized here).
+// This can only artificially inflate the observed and extrapolated collision
+// probabilities. We only have to worry about this in designing the reduction.
+//   * The reduction has preserved all the structured uniqueness in the cache
+// key, which means either
+//     * REJECTED: We have a uniqueness bug in generating cache keys, where
+// structured uniqueness info should have been different but isn't. In such a
+// case, increasing by 1 the number of bits kept after reduction would not
+// reduce observed probabilities by half. (In our observations, the
+// probabilities are reduced approximately by half.)
+//     * ACCEPTED: The lost unstructured uniqueness in the key determines the
+// probability that an observed collision would imply an overlap in ranges.
+// In short, dropping n bits from key would increase collision probability by
+// 2**n, assuming those n bits have full entropy in unstructured uniqueness.
+//
+// But we also have to account for the key ranges based on file size. If file
+// sizes are roughly 2**b offsets, using XOR in 128-bit cache keys for
+// "ranges", we know from other simulations (see
+// https://github.com/pdillinger/unique_id/) that that's roughly equivalent to
+// (less than 2x higher collision probability) using a cache key of size
+// 128 - b bits for the whole file. (This is the only place we make an
+// "optimistic" assumption, which is more than offset by the real
+// implementation stripping off 2 lower bits from block byte offsets for cache
+// keys. The simulation assumes byte offsets, which is net pessimistic.)
+//
+// So to accept the extrapolation as valid, we need to be confident that all
+// "lost" bits, excluding those covered by file offset, are full entropy.
+// Recall that we have assumed (verifiably, safely) that other structured data
+// (file number and session counter) are kept, not lost. Based on the
+// implementation comments for OffsetableCacheKey, the only potential hole here
+// is that we only have ~103 bits of entropy in "all new" session IDs, and in
+// extreme cases, there might be only 1 DB ID. However, because the upper ~39
+// bits of session ID are hashed, the combination of file number and file
+// offset only has to add to 25 bits (or more) to ensure full entropy in
+// unstructured uniqueness lost in the reduction. Typical file size of 32MB
+// suffices (at least for simulation purposes where we assume each file offset
+// occupies a cache key).
+//
+// Example results in comments on OffsetableCacheKey.
+class StressCacheKey {
+ public:
+  void Run() {
+    if (FLAGS_sck_footer_unique_id) {
+      // Proposed footer unique IDs are DB-independent and session-independent
+      // (but process-dependent) which is most easily simulated here by
+      // assuming 1 DB and (later below) no session resets without process
+      // reset.
+      FLAGS_sck_db_count = 1;
+    }
+
+    // Describe the simulated workload
+    uint64_t mb_per_day =
+        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb;
+    printf("Total cache or DBs size: %gTiB  Writing %g MiB/s or %gTiB/day\n",
+           FLAGS_sck_file_size_mb / 1024.0 / 1024.0 *
+               std::pow(2.0, FLAGS_sck_table_bits),
+           mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0);
+    // For extrapolating probability of any collisions from a number of
+    // observed collisions
+    multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) /
+                  (FLAGS_sck_file_size_mb * 1024.0 * 1024.0);
+    printf(
+        "Multiply by %g to correct for simulation losses (but still assume "
+        "whole file cached)\n",
+        multiplier_);
+    restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day;
+    double without_ejection =
+        std::pow(1.414214, FLAGS_sck_keep_bits) / FLAGS_sck_files_per_day;
+    // This should be a lower bound for -sck_randomize, usually a terribly
+    // rough lower bound.
+    // If observation is worse than this, then something has gone wrong.
+    printf(
+        "Without ejection, expect random collision after %g days (%g "
+        "corrected)\n",
+        without_ejection, without_ejection * multiplier_);
+    double with_full_table =
+        std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) /
+        FLAGS_sck_files_per_day;
+    // This is an alternate lower bound for -sck_randomize, usually pretty
+    // accurate. Our cache keys should usually perform "better than random"
+    // but always no worse. (If observation is substantially worse than this,
+    // then something has gone wrong.)
+    printf(
+        "With ejection and full table, expect random collision after %g "
+        "days (%g corrected)\n",
+        with_full_table, with_full_table * multiplier_);
+    collisions_ = 0;
+
+    // Run until sufficient number of observed collisions.
+    for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) {
+      RunOnce();
+      if (collisions_ == 0) {
+        printf(
+            "No collisions after %d x %u days                              "
+            "                   \n",
+            i, FLAGS_sck_days_per_run);
+      } else {
+        double est = 1.0 * i * FLAGS_sck_days_per_run / collisions_;
+        printf("%" PRIu64
+               " collisions after %d x %u days, est %g days between (%g "
+               "corrected)        \n",
+               collisions_, i, FLAGS_sck_days_per_run, est, est * multiplier_);
+      }
+    }
+  }
+
+  void RunOnce() {
+    // Re-initialized simulated state
+    const size_t db_count = std::max(size_t{FLAGS_sck_db_count}, size_t{1});
+    dbs_.reset(new TableProperties[db_count]{});
+    const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1;
+    table_.reset(new uint64_t[table_mask + 1]{});
+    if (FLAGS_sck_keep_bits > 64) {
+      FLAGS_sck_keep_bits = 64;
+    }
+
+    // Details of which bits are dropped in reduction
+    uint32_t shift_away = 64 - FLAGS_sck_keep_bits;
+    // Shift away fewer potential file number bits (b) than potential
+    // session counter bits (a).
+    uint32_t shift_away_b = shift_away / 3;
+    uint32_t shift_away_a = shift_away - shift_away_b;
+
+    process_count_ = 0;
+    session_count_ = 0;
+    newdb_count_ = 0;
+    ResetProcess(/*newdbs*/ true);
+
+    Random64 r{std::random_device{}()};
+
+    uint64_t max_file_count =
+        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
+    uint32_t report_count = 0;
+    uint32_t collisions_this_run = 0;
+    size_t db_i = 0;
+
+    for (uint64_t file_count = 1; file_count <= max_file_count;
+         ++file_count, ++db_i) {
+      // Round-robin through DBs (this faster than %)
+      if (db_i >= db_count) {
+        db_i = 0;
+      }
+      // Any other periodic actions before simulating next file
+      if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) {
+        ResetSession(db_i, /*newdb*/ r.OneIn(FLAGS_sck_newdb_nreopen));
+      } else if (r.OneIn(restart_nfiles_)) {
+        ResetProcess(/*newdbs*/ false);
+      }
+      // Simulate next file
+      OffsetableCacheKey ock;
+      dbs_[db_i].orig_file_number += 1;
+      // skip some file numbers for other file kinds, except in footer unique
+      // ID, orig_file_number here tracks process-wide generated SST file
+      // count.
+      if (!FLAGS_sck_footer_unique_id) {
+        dbs_[db_i].orig_file_number += (r.Next() & 3);
+      }
+      bool is_stable;
+      BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
+                                         /* ignored */ 42, &ock, &is_stable);
+      assert(is_stable);
+      // Get a representative cache key, which later we analytically generalize
+      // to a range.
+      CacheKey ck = ock.WithOffset(0);
+      uint64_t reduced_key;
+      if (FLAGS_sck_randomize) {
+        reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
+      } else if (FLAGS_sck_footer_unique_id) {
+        // Special case: keep only file number, not session counter
+        reduced_key = DecodeFixed64(ck.AsSlice().data()) >> shift_away;
+      } else {
+        // Try to keep file number and session counter (shift away other bits)
+        uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
+        uint32_t b = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_b;
+        reduced_key = (uint64_t{a} << 32) + b;
+      }
+      if (reduced_key == 0) {
+        // Unlikely, but we need to exclude tracking this value because we
+        // use it to mean "empty" in table. This case is OK as long as we
+        // don't hit it often.
+        printf("Hit Zero!                                                  \n");
+        file_count--;
+        continue;
+      }
+      uint64_t h =
+          NPHash64(reinterpret_cast<char*>(&reduced_key), sizeof(reduced_key));
+      // Skew expected lifetimes, for high variance (super-Poisson) variance
+      // in actual lifetimes.
+      size_t pos =
+          std::min(Lower32of64(h) & table_mask, Upper32of64(h) & table_mask);
+      if (table_[pos] == reduced_key) {
+        collisions_this_run++;
+        // Our goal is to predict probability of no collisions, not expected
+        // number of collisions. To make the distinction, we have to get rid
+        // of observing correlated collisions, which this takes care of:
+        ResetProcess(/*newdbs*/ false);
+      } else {
+        // Replace (end of lifetime for file that was in this slot)
+        table_[pos] = reduced_key;
+      }
+
+      if (++report_count == FLAGS_sck_files_per_day) {
+        report_count = 0;
+        // Estimate fill %
+        size_t incr = table_mask / 1000;
+        size_t sampled_count = 0;
+        for (size_t i = 0; i <= table_mask; i += incr) {
+          if (table_[i] != 0) {
+            sampled_count++;
+          }
+        }
+        // Report
+        printf(
+            "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64 " sess, %" PRIu64
+            " newdb, %u coll, occ %g%%, ejected %g%%      \r",
+            file_count / FLAGS_sck_files_per_day, process_count_,
+            session_count_, newdb_count_ - FLAGS_sck_db_count,
+            collisions_this_run, 100.0 * sampled_count / 1000.0,
+            100.0 * (1.0 - sampled_count / 1000.0 * table_mask / file_count));
+        fflush(stdout);
+      }
+    }
+    collisions_ += collisions_this_run;
+  }
+
+  void ResetSession(size_t i, bool newdb) {
+    dbs_[i].db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+    if (newdb) {
+      ++newdb_count_;
+      if (FLAGS_sck_footer_unique_id) {
+        // Simulate how footer id would behave
+        dbs_[i].db_id = "none";
+      } else {
+        // db_id might be ignored, depending on the implementation details
+        dbs_[i].db_id = std::to_string(newdb_count_);
+        dbs_[i].orig_file_number = 0;
+      }
+    }
+    session_count_++;
+  }
+
+  void ResetProcess(bool newdbs) {
+    process_count_++;
+    DBImpl::TEST_ResetDbSessionIdGen();
+    for (size_t i = 0; i < FLAGS_sck_db_count; ++i) {
+      ResetSession(i, newdbs);
+    }
+    if (FLAGS_sck_footer_unique_id) {
+      // For footer unique ID, this tracks process-wide generated SST file
+      // count.
+      dbs_[0].orig_file_number = 0;
+    }
+  }
+
+ private:
+  // Use db_session_id and orig_file_number from TableProperties
+  std::unique_ptr<TableProperties[]> dbs_;
+  std::unique_ptr<uint64_t[]> table_;
+  uint64_t process_count_ = 0;
+  uint64_t session_count_ = 0;
+  uint64_t newdb_count_ = 0;
+  uint64_t collisions_ = 0;
+  uint32_t restart_nfiles_ = 0;
+  double multiplier_ = 0.0;
+};
+
+int cache_bench_tool(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_stress_cache_key) {
+    // Alternate tool
+    StressCacheKey().Run();
+    return 0;
+  }
+
+  if (FLAGS_threads <= 0) {
+    fprintf(stderr, "threads number <= 0\n");
+    exit(1);
+  }
+
+  ROCKSDB_NAMESPACE::CacheBench bench;
+  if (FLAGS_populate_cache) {
+    bench.PopulateCache();
+    printf("Population complete\n");
+    printf("----------------------------\n");
+  }
+  if (bench.Run()) {
+    return 0;
+  } else {
+    return 1;
+  }
+}  // namespace ROCKSDB_NAMESPACE
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/cache/cache_entry_roles.cc b/src/rocksdb/cache/cache_entry_roles.cc
new file mode 100644
index 000000000..b27349554
--- /dev/null
+++ b/src/rocksdb/cache/cache_entry_roles.cc
@@ -0,0 +1,134 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/cache_entry_roles.h"
+
+#include <mutex>
+
+#include "port/lang.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::array<std::string, kNumCacheEntryRoles> kCacheEntryRoleToCamelString{{
+    "DataBlock",
+    "FilterBlock",
+    "FilterMetaBlock",
+    "DeprecatedFilterBlock",
+    "IndexBlock",
+    "OtherBlock",
+    "WriteBuffer",
+    "CompressionDictionaryBuildingBuffer",
+    "FilterConstruction",
+    "BlockBasedTableReader",
+    "FileMetadata",
+    "BlobValue",
+    "BlobCache",
+    "Misc",
+}};
+
+std::array<std::string, kNumCacheEntryRoles> kCacheEntryRoleToHyphenString{{
+    "data-block",
+    "filter-block",
+    "filter-meta-block",
+    "deprecated-filter-block",
+    "index-block",
+    "other-block",
+    "write-buffer",
+    "compression-dictionary-building-buffer",
+    "filter-construction",
+    "block-based-table-reader",
+    "file-metadata",
+    "blob-value",
+    "blob-cache",
+    "misc",
+}};
+
+const std::string& GetCacheEntryRoleName(CacheEntryRole role) {
+  return kCacheEntryRoleToHyphenString[static_cast<size_t>(role)];
+}
+
+const std::string& BlockCacheEntryStatsMapKeys::CacheId() {
+  static const std::string kCacheId = "id";
+  return kCacheId;
+}
+
+const std::string& BlockCacheEntryStatsMapKeys::CacheCapacityBytes() {
+  static const std::string kCacheCapacityBytes = "capacity";
+  return kCacheCapacityBytes;
+}
+
+const std::string&
+BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds() {
+  static const std::string kLastCollectionDurationSeconds =
+      "secs_for_last_collection";
+  return kLastCollectionDurationSeconds;
+}
+
+const std::string& BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds() {
+  static const std::string kLastCollectionAgeSeconds =
+      "secs_since_last_collection";
+  return kLastCollectionAgeSeconds;
+}
+
+namespace {
+
+std::string GetPrefixedCacheEntryRoleName(const std::string& prefix,
+                                          CacheEntryRole role) {
+  const std::string& role_name = GetCacheEntryRoleName(role);
+  std::string prefixed_role_name;
+  prefixed_role_name.reserve(prefix.size() + role_name.size());
+  prefixed_role_name.append(prefix);
+  prefixed_role_name.append(role_name);
+  return prefixed_role_name;
+}
+
+}  // namespace
+
+std::string BlockCacheEntryStatsMapKeys::EntryCount(CacheEntryRole role) {
+  const static std::string kPrefix = "count.";
+  return GetPrefixedCacheEntryRoleName(kPrefix, role);
+}
+
+std::string BlockCacheEntryStatsMapKeys::UsedBytes(CacheEntryRole role) {
+  const static std::string kPrefix = "bytes.";
+  return GetPrefixedCacheEntryRoleName(kPrefix, role);
+}
+
+std::string BlockCacheEntryStatsMapKeys::UsedPercent(CacheEntryRole role) {
+  const static std::string kPrefix = "percent.";
+  return GetPrefixedCacheEntryRoleName(kPrefix, role);
+}
+
+namespace {
+
+struct Registry {
+  std::mutex mutex;
+  UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map;
+  void Register(Cache::DeleterFn fn, CacheEntryRole role) {
+    std::lock_guard<std::mutex> lock(mutex);
+    role_map[fn] = role;
+  }
+  UnorderedMap<Cache::DeleterFn, CacheEntryRole> Copy() {
+    std::lock_guard<std::mutex> lock(mutex);
+    return role_map;
+  }
+};
+
+Registry& GetRegistry() {
+  STATIC_AVOID_DESTRUCTION(Registry, registry);
+  return registry;
+}
+
+}  // namespace
+
+void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) {
+  GetRegistry().Register(fn, role);
+}
+
+UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap() {
+  return GetRegistry().Copy();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_entry_roles.h b/src/rocksdb/cache/cache_entry_roles.h
new file mode 100644
index 000000000..5a49fdfd4
--- /dev/null
+++ b/src/rocksdb/cache/cache_entry_roles.h
@@ -0,0 +1,103 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "rocksdb/cache.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern std::array<std::string, kNumCacheEntryRoles>
+    kCacheEntryRoleToCamelString;
+extern std::array<std::string, kNumCacheEntryRoles>
+    kCacheEntryRoleToHyphenString;
+
+// To associate cache entries with their role, we use a hack on the
+// existing Cache interface. Because the deleter of an entry can authenticate
+// the code origin of an entry, we can elaborate the choice of deleter to
+// also encode role information, without inferring false role information
+// from entries not choosing to encode a role.
+//
+// The rest of this file is for handling mappings between deleters and
+// roles.
+
+// To infer a role from a deleter, the deleter must be registered. This
+// can be done "manually" with this function. This function is thread-safe,
+// and the registration mappings go into private but static storage. (Note
+// that DeleterFn is a function pointer, not std::function. Registrations
+// should not be too many.)
+void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role);
+
+// Gets a copy of the registered deleter -> role mappings. This is the only
+// function for reading the mappings made with RegisterCacheDeleterRole.
+// Why only this interface for reading?
+// * This function has to be thread safe, which could incur substantial
+// overhead. We should not pay this overhead for every deleter look-up.
+// * This is suitable for preparing for batch operations, like with
+// CacheEntryStatsCollector.
+// * The number of mappings should be sufficiently small (dozens).
+UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap();
+
+// ************************************************************** //
+// An automatic registration infrastructure. This enables code
+// to simply ask for a deleter associated with a particular type
+// and role, and registration is automatic. In a sense, this is
+// a small dependency injection infrastructure, because linking
+// in new deleter instantiations is essentially sufficient for
+// making stats collection (using CopyCacheDeleterRoleMap) aware
+// of them.
+
+namespace cache_entry_roles_detail {
+
+template <typename T, CacheEntryRole R>
+struct RegisteredDeleter {
+  RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); }
+
+  // These have global linkage to help ensure compiler optimizations do not
+  // break uniqueness for each <T,R>
+  static void Delete(const Slice& /* key */, void* value) {
+    // Supports T == Something[], unlike delete operator
+    std::default_delete<T>()(
+        static_cast<typename std::remove_extent<T>::type*>(value));
+  }
+};
+
+template <CacheEntryRole R>
+struct RegisteredNoopDeleter {
+  RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); }
+
+  static void Delete(const Slice& /* key */, void* /* value */) {
+    // Here was `assert(value == nullptr);` but we can also put pointers
+    // to static data in Cache, for testing at least.
+  }
+};
+
+}  // namespace cache_entry_roles_detail
+
+// Get an automatically registered deleter for value type T and role R.
+// Based on C++ semantics, registration is invoked exactly once in a
+// thread-safe way on first call to this function, for each <T, R>.
+template <typename T, CacheEntryRole R>
+Cache::DeleterFn GetCacheEntryDeleterForRole() {
+  static cache_entry_roles_detail::RegisteredDeleter<T, R> reg;
+  return reg.Delete;
+}
+
+// Get an automatically registered no-op deleter (value should be nullptr)
+// and associated with role R. This is used for Cache "reservation" entries
+// such as for WriteBufferManager.
+template <CacheEntryRole R>
+Cache::DeleterFn GetNoopDeleterForRole() {
+  static cache_entry_roles_detail::RegisteredNoopDeleter<R> reg;
+  return reg.Delete;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_entry_stats.h b/src/rocksdb/cache/cache_entry_stats.h
new file mode 100644
index 000000000..63b12735b
--- /dev/null
+++ b/src/rocksdb/cache/cache_entry_stats.h
@@ -0,0 +1,183 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+
+#include "cache/cache_helpers.h"
+#include "cache/cache_key.h"
+#include "port/lang.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/coding_lean.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A generic helper object for gathering stats about cache entries by
+// iterating over them with ApplyToAllEntries. This class essentially
+// solves the problem of slowing down a Cache with too many stats
+// collectors that could be sharing stat results, such as from multiple
+// column families or multiple DBs sharing a Cache. We employ a few
+// mitigations:
+// * Only one collector for a particular kind of Stats is alive
+// for each Cache. This is guaranteed using the Cache itself to hold
+// the collector.
+// * A mutex ensures only one thread is gathering stats for this
+// collector.
+// * The most recent gathered stats are saved and simply copied to
+// satisfy requests within a time window (default: 3 minutes) of
+// completion of the most recent stat gathering.
+//
+// Template parameter Stats must be copyable and trivially constructable,
+// as well as...
+// concept Stats {
+//   // Notification before applying callback to all entries
+//   void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
+//   // Get the callback to apply to all entries. `callback`
+//   // type must be compatible with Cache::ApplyToAllEntries
+//   callback GetEntryCallback();
+//   // Notification after applying callback to all entries
+//   void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
+//   // Notification that a collection was skipped because of
+//   // sufficiently recent saved results.
+//   void SkippedCollection();
+// }
+template <class Stats>
+class CacheEntryStatsCollector {
+ public:
+  // Gather and save stats if saved stats are too old. (Use GetStats() to
+  // read saved stats.)
+  //
+  // Maximum allowed age for a "hit" on saved results is determined by the
+  // two interval parameters. Both set to 0 forces a re-scan. For example
+  // with min_interval_seconds=300 and min_interval_factor=100, if the last
+  // scan took 10s, we would only rescan ("miss") if the age in seconds of
+  // the saved results is > max(300, 100*10).
+  // Justification: scans can vary wildly in duration, e.g. from 0.02 sec
+  // to as much as 20 seconds, so we want to be able to cap the absolute
+  // and relative frequency of scans.
+  void CollectStats(int min_interval_seconds, int min_interval_factor) {
+    // Waits for any pending reader or writer (collector)
+    std::lock_guard<std::mutex> lock(working_mutex_);
+
+    uint64_t max_age_micros =
+        static_cast<uint64_t>(std::max(min_interval_seconds, 0)) * 1000000U;
+
+    if (last_end_time_micros_ > last_start_time_micros_ &&
+        min_interval_factor > 0) {
+      max_age_micros = std::max(
+          max_age_micros, min_interval_factor * (last_end_time_micros_ -
+                                                 last_start_time_micros_));
+    }
+
+    uint64_t start_time_micros = clock_->NowMicros();
+    if ((start_time_micros - last_end_time_micros_) > max_age_micros) {
+      last_start_time_micros_ = start_time_micros;
+      working_stats_.BeginCollection(cache_, clock_, start_time_micros);
+
+      cache_->ApplyToAllEntries(working_stats_.GetEntryCallback(), {});
+      TEST_SYNC_POINT_CALLBACK(
+          "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr);
+
+      uint64_t end_time_micros = clock_->NowMicros();
+      last_end_time_micros_ = end_time_micros;
+      working_stats_.EndCollection(cache_, clock_, end_time_micros);
+    } else {
+      working_stats_.SkippedCollection();
+    }
+
+    // Save so that we don't need to wait for an outstanding collection in
+    // order to make of copy of the last saved stats
+    std::lock_guard<std::mutex> lock2(saved_mutex_);
+    saved_stats_ = working_stats_;
+  }
+
+  // Gets saved stats, regardless of age
+  void GetStats(Stats *stats) {
+    std::lock_guard<std::mutex> lock(saved_mutex_);
+    *stats = saved_stats_;
+  }
+
+  Cache *GetCache() const { return cache_; }
+
+  // Gets or creates a shared instance of CacheEntryStatsCollector in the
+  // cache itself, and saves into `ptr`. This shared_ptr will hold the
+  // entry in cache until all refs are destroyed.
+  static Status GetShared(Cache *cache, SystemClock *clock,
+                          std::shared_ptr<CacheEntryStatsCollector> *ptr) {
+    const Slice &cache_key = GetCacheKey();
+
+    Cache::Handle *h = cache->Lookup(cache_key);
+    if (h == nullptr) {
+      // Not yet in cache, but Cache doesn't provide a built-in way to
+      // avoid racing insert. So we double-check under a shared mutex,
+      // inspired by TableCache.
+      STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex);
+      std::lock_guard<std::mutex> lock(static_mutex);
+
+      h = cache->Lookup(cache_key);
+      if (h == nullptr) {
+        auto new_ptr = new CacheEntryStatsCollector(cache, clock);
+        // TODO: non-zero charge causes some tests that count block cache
+        // usage to go flaky. Fix the problem somehow so we can use an
+        // accurate charge.
+        size_t charge = 0;
+        Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h,
+                                 Cache::Priority::HIGH);
+        if (!s.ok()) {
+          assert(h == nullptr);
+          delete new_ptr;
+          return s;
+        }
+      }
+    }
+    // If we reach here, shared entry is in cache with handle `h`.
+    assert(cache->GetDeleter(h) == Deleter);
+
+    // Build an aliasing shared_ptr that keeps `ptr` in cache while there
+    // are references.
+    *ptr = MakeSharedCacheHandleGuard<CacheEntryStatsCollector>(cache, h);
+    return Status::OK();
+  }
+
+ private:
+  explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock)
+      : saved_stats_(),
+        working_stats_(),
+        last_start_time_micros_(0),
+        last_end_time_micros_(/*pessimistic*/ 10000000),
+        cache_(cache),
+        clock_(clock) {}
+
+  static void Deleter(const Slice &, void *value) {
+    delete static_cast<CacheEntryStatsCollector *>(value);
+  }
+
+  static const Slice &GetCacheKey() {
+    // For each template instantiation
+    static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime();
+    static Slice ckey_slice = ckey.AsSlice();
+    return ckey_slice;
+  }
+
+  std::mutex saved_mutex_;
+  Stats saved_stats_;
+
+  std::mutex working_mutex_;
+  Stats working_stats_;
+  uint64_t last_start_time_micros_;
+  uint64_t last_end_time_micros_;
+
+  Cache *const cache_;
+  SystemClock *const clock_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_helpers.h b/src/rocksdb/cache/cache_helpers.h
new file mode 100644
index 000000000..7ea2365b8
--- /dev/null
+++ b/src/rocksdb/cache/cache_helpers.h
@@ -0,0 +1,145 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Returns the cached value given a cache handle.
+template <typename T>
+T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) {
+  assert(cache);
+  assert(handle);
+
+  return static_cast<T*>(cache->Value(handle));
+}
+
+// Simple generic deleter for Cache (to be used with Cache::Insert).
+template <typename T>
+void DeleteCacheEntry(const Slice& /* key */, void* value) {
+  delete static_cast<T*>(value);
+}
+
+// Turns a T* into a Slice so it can be used as a key with Cache.
+template <typename T>
+Slice GetSlice(const T* t) {
+  return Slice(reinterpret_cast<const char*>(t), sizeof(T));
+}
+
+// Generic resource management object for cache handles that releases the handle
+// when destroyed. Has unique ownership of the handle, so copying it is not
+// allowed, while moving it transfers ownership.
+template <typename T>
+class CacheHandleGuard {
+ public:
+  CacheHandleGuard() = default;
+
+  CacheHandleGuard(Cache* cache, Cache::Handle* handle)
+      : cache_(cache),
+        handle_(handle),
+        value_(GetFromCacheHandle<T>(cache, handle)) {
+    assert(cache_ && handle_ && value_);
+  }
+
+  CacheHandleGuard(const CacheHandleGuard&) = delete;
+  CacheHandleGuard& operator=(const CacheHandleGuard&) = delete;
+
+  CacheHandleGuard(CacheHandleGuard&& rhs) noexcept
+      : cache_(rhs.cache_), handle_(rhs.handle_), value_(rhs.value_) {
+    assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
+
+    rhs.ResetFields();
+  }
+
+  CacheHandleGuard& operator=(CacheHandleGuard&& rhs) noexcept {
+    if (this == &rhs) {
+      return *this;
+    }
+
+    ReleaseHandle();
+
+    cache_ = rhs.cache_;
+    handle_ = rhs.handle_;
+    value_ = rhs.value_;
+
+    assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
+
+    rhs.ResetFields();
+
+    return *this;
+  }
+
+  ~CacheHandleGuard() { ReleaseHandle(); }
+
+  bool IsEmpty() const { return !handle_; }
+
+  Cache* GetCache() const { return cache_; }
+  Cache::Handle* GetCacheHandle() const { return handle_; }
+  T* GetValue() const { return value_; }
+
+  void TransferTo(Cleanable* cleanable) {
+    if (cleanable) {
+      if (handle_ != nullptr) {
+        assert(cache_);
+        cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, handle_);
+      }
+    }
+    ResetFields();
+  }
+
+  void Reset() {
+    ReleaseHandle();
+    ResetFields();
+  }
+
+ private:
+  void ReleaseHandle() {
+    if (IsEmpty()) {
+      return;
+    }
+
+    assert(cache_);
+    cache_->Release(handle_);
+  }
+
+  void ResetFields() {
+    cache_ = nullptr;
+    handle_ = nullptr;
+    value_ = nullptr;
+  }
+
+  static void ReleaseCacheHandle(void* arg1, void* arg2) {
+    Cache* const cache = static_cast<Cache*>(arg1);
+    assert(cache);
+
+    Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
+    assert(cache_handle);
+
+    cache->Release(cache_handle);
+  }
+
+ private:
+  Cache* cache_ = nullptr;
+  Cache::Handle* handle_ = nullptr;
+  T* value_ = nullptr;
+};
+
+// Build an aliasing shared_ptr that keeps `handle` in cache while there
+// are references, but the pointer is to the value for that cache entry,
+// which must be of type T. This is copyable, unlike CacheHandleGuard, but
+// does not provide access to caching details.
+template <typename T>
+std::shared_ptr<T> MakeSharedCacheHandleGuard(Cache* cache,
+                                              Cache::Handle* handle) {
+  auto wrapper = std::make_shared<CacheHandleGuard<T>>(cache, handle);
+  return std::shared_ptr<T>(wrapper, static_cast<T*>(cache->Value(handle)));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_key.cc b/src/rocksdb/cache/cache_key.cc
new file mode 100644
index 000000000..a79328972
--- /dev/null
+++ b/src/rocksdb/cache/cache_key.cc
@@ -0,0 +1,364 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/cache_key.h"
+
+#include <algorithm>
+#include <atomic>
+
+#include "rocksdb/cache.h"
+#include "table/unique_id_impl.h"
+#include "util/hash.h"
+#include "util/math.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Value space plan for CacheKey:
+//
+// file_num_etc64_ | offset_etc64_ | Only generated by
+// ---------------+---------------+------------------------------------------
+//              0 |             0 | Reserved for "empty" CacheKey()
+//              0 |  > 0, < 1<<63 | CreateUniqueForCacheLifetime
+//              0 |      >= 1<<63 | CreateUniqueForProcessLifetime
+//            > 0 |           any | OffsetableCacheKey.WithOffset
+
+CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
+  // +1 so that we can reserve all zeros for "unset" cache key
+  uint64_t id = cache->NewId() + 1;
+  // Ensure we don't collide with CreateUniqueForProcessLifetime
+  assert((id >> 63) == 0U);
+  return CacheKey(0, id);
+}
+
+CacheKey CacheKey::CreateUniqueForProcessLifetime() {
+  // To avoid colliding with CreateUniqueForCacheLifetime, assuming
+  // Cache::NewId counts up from zero, here we count down from UINT64_MAX.
+  // If this ever becomes a point of contention, we could sub-divide the
+  // space and use CoreLocalArray.
+  static std::atomic<uint64_t> counter{UINT64_MAX};
+  uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed);
+  // Ensure we don't collide with CreateUniqueForCacheLifetime
+  assert((id >> 63) == 1U);
+  return CacheKey(0, id);
+}
+
+// How we generate CacheKeys and base OffsetableCacheKey, assuming that
+// db_session_ids are generated from a base_session_id and
+// session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId
+// in DBImpl::GenerateDbSessionId):
+//
+// Conceptual inputs:
+//   db_id                   (unstructured, from GenerateRawUniqueId or equiv)
+//                           * could be shared between cloned DBs but rare
+//                           * could be constant, if session id suffices
+//   base_session_id         (unstructured, from GenerateRawUniqueId)
+//   session_id_counter      (structured)
+//                           * usually much smaller than 2**24
+//   orig_file_number        (structured)
+//                           * usually smaller than 2**24
+//   offset_in_file          (structured, might skip lots of values)
+//                           * usually smaller than 2**32
+//
+// Overall approach (see https://github.com/pdillinger/unique_id for
+// background):
+//
+// First, we have three "structured" values, up to 64 bits each, that we
+// need to fit, without losses, into 128 bits. In practice, the values will
+// be small enough that they should fit. For example, applications generating
+// large SST files (large offsets) will naturally produce fewer files (small
+// file numbers). But we don't know ahead of time what bounds the values will
+// have.
+//
+// Second, we have unstructured inputs that enable distinct RocksDB processes
+// to pick a random point in space, likely very different from others. Xoring
+// the structured with the unstructured give us a cache key that is
+// structurally distinct between related keys (e.g. same file or same RocksDB
+// process) and distinct with high probability between unrelated keys.
+//
+// The problem of packing three structured values into the space for two is
+// complicated by the fact that we want to derive cache keys from SST unique
+// IDs, which have already combined structured and unstructured inputs in a
+// practically inseparable way. And we want a base cache key that works
+// with an offset of any size. So basically, we need to encode these three
+// structured values, each up to 64 bits, into 128 bits without knowing any
+// of their sizes. The DownwardInvolution() function gives us a mechanism to
+// accomplish this. (See its properties in math.h.) Specifically, for inputs
+// a, b, and c:
+//   lower64 = DownwardInvolution(a) ^ ReverseBits(b);
+//   upper64 = c ^ ReverseBits(a);
+// The 128-bit output is unique assuming there exist some i, j, and k
+// where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and
+// i + j + k <= 128. In other words, as long as there exist some bounds
+// that would allow us to pack the bits of a, b, and c into the output
+// if we know the bound, we can generate unique outputs without knowing
+// those bounds. To validate this claim, the inversion function (given
+// the bounds) has been implemented in CacheKeyDecoder in
+// db_block_cache_test.cc.
+//
+// With that in mind, the outputs in terms of the conceptual inputs look
+// like this, using bitwise-xor of the constituent pieces, low bits on left:
+//
+// |------------------------- file_num_etc64 -------------------------|
+// | +++++++++ base_session_id (lower 64 bits, involution) +++++++++ |
+// |-----------------------------------------------------------------|
+// | session_id_counter (involution) ..... |                         |
+// |-----------------------------------------------------------------|
+// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
+// |  * base_session_id (upper ~39 bits)                             |
+// |  * db_id (~122 bits entropy)                                    |
+// |-----------------------------------------------------------------|
+// |                             | ..... orig_file_number (reversed) |
+// |-----------------------------------------------------------------|
+//
+//
+// |------------------------- offset_etc64 --------------------------|
+// | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++ |
+// |-----------------------------------------------------------------|
+// |                           | ..... session_id_counter (reversed) |
+// |-----------------------------------------------------------------|
+// | offset_in_file ............... |                                |
+// |-----------------------------------------------------------------|
+//
+// Some oddities or inconveniences of this layout are due to deriving
+// the "base" cache key (without offset) from the SST unique ID (see
+// GetSstInternalUniqueId). Specifically,
+// * Lower 64 of base_session_id occurs in both output words (ok but
+//   weird)
+// * The inclusion of db_id is bad for the conditions under which we
+//   can guarantee uniqueness, but could be useful in some cases with
+//   few small files per process, to make up for db session id only having
+//   ~103 bits of entropy.
+//
+// In fact, if DB ids were not involved, we would be guaranteed unique
+// cache keys for files generated in a single process until total bits for
+// biggest session_id_counter, orig_file_number, and offset_in_file
+// reach 128 bits.
+//
+// With the DB id limitation, we only have nice guaranteed unique cache
+// keys for files generated in a single process until biggest
+// session_id_counter and offset_in_file reach combined 64 bits. This
+// is quite good in practice because we can have millions of DB Opens
+// with terabyte size SST files, or billions of DB Opens with gigabyte
+// size SST files.
+//
+// One of the considerations in the translation between existing SST unique
+// IDs and base cache keys is supporting better SST unique IDs in a future
+// format_version. If we use a process-wide file counter instead of
+// session counter and file numbers, we only need to combine two 64-bit values
+// instead of three. But we don't want to track unique ID versions in the
+// manifest, so we want to keep the same translation layer between SST unique
+// IDs and base cache keys, even with updated SST unique IDs. If the new
+// unique IDs put the file counter where the orig_file_number was, and
+// use no structured field where session_id_counter was, then our translation
+// layer works fine for two structured fields as well as three (for
+// compatibility). The small computation for the translation (one
+// DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep)
+// is negligible for computing as part of SST file reader open.
+//
+// More on how https://github.com/pdillinger/unique_id applies here:
+// Every bit of output always includes "unstructured" uniqueness bits and
+// often combines with "structured" uniqueness bits. The "unstructured" bits
+// change infrequently: only when we cannot guarantee our state tracking for
+// "structured" uniqueness hasn't been cloned. Using a static
+// SemiStructuredUniqueIdGen for db_session_ids, this means we only get an
+// "all new" session id when a new process uses RocksDB. (Between processes,
+// we don't know if a DB or other persistent storage has been cloned. We
+// assume that if VM hot cloning is used, subsequently generated SST files
+// do not interact.) Within a process, only the session_lower of the
+// db_session_id changes incrementally ("structured" uniqueness).
+//
+// This basically means that our offsets, counters and file numbers allow us
+// to do somewhat "better than random" (birthday paradox) while in the
+// degenerate case of completely new session for each tiny file, we still
+// have strong uniqueness properties from the birthday paradox, with ~103
+// bit session IDs or up to 128 bits entropy with different DB IDs sharing a
+// cache.
+//
+// More collision probability analysis:
+// Suppose a RocksDB host generates (generously) 2 GB/s (10TB data, 17 DWPD)
+// with average process/session lifetime of (pessimistically) 4 minutes.
+// In 180 days (generous allowable data lifespan), we generate 31 million GB
+// of data, or 2^55 bytes, and 2^16 "all new" session IDs.
+//
+// First, suppose this is in a single DB (lifetime 180 days):
+// 128 bits cache key size
+// - 55 <- ideal size for byte offsets + file numbers
+// -  2 <- bits for offsets and file numbers not exactly powers of two
+// +  2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey
+// ----
+//   73 <- bits remaining for distinguishing session IDs
+// The probability of a collision in 73 bits of session ID data is less than
+// 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
+// data from the last 180 days is in cache for potential collision, and that
+// cache keys under each session id exhaustively cover the remaining 57 bits
+// while in reality they'll only cover a small fraction of it.
+//
+// Although data could be transferred between hosts, each host has its own
+// cache and we are already assuming a high rate of "all new" session ids.
+// So this doesn't really change the collision calculation. Across a fleet
+// of 1 million, each with <1 in a trillion collision possibility,
+// fleetwide collision probability is <1 in a million.
+//
+// Now suppose we have many DBs per host, say 2**10, with same host-wide write
+// rate and process/session lifetime. File numbers will be ~10 bits smaller
+// and we will have 2**10 times as many session IDs because of simultaneous
+// lifetimes. So now collision chance is less than 1 in 2**(83 - (2 * 26)),
+// or roughly 1 in a billion.
+//
+// Suppose instead we generated random or hashed cache keys for each
+// (compressed) block. For 1KB compressed block size, that is 2^45 cache keys
+// in 180 days. Collision probability is more easily estimated at roughly
+// 1 in 2**(128 - (2 * 45)) or roughly 1 in a trillion (assuming all
+// data from the last 180 days is in cache, but NOT the other assumption
+// for the 1 in a trillion estimate above).
+//
+//
+// Collision probability estimation through simulation:
+// A tool ./cache_bench -stress_cache_key broadly simulates host-wide cache
+// activity over many months, by making some pessimistic simplifying
+// assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
+// Here is some sample output with
+// `./cache_bench -stress_cache_key -sck_keep_bits=43`:
+//
+//   Total cache or DBs size: 32TiB  Writing 925.926 MiB/s or 76.2939TiB/day
+//   Multiply by 1.15292e+18 to correct for simulation losses (but still
+//   assume whole file cached)
+//
+// These come from default settings of 2.5M files per day of 32 MB each, and
+// `-sck_keep_bits=43` means that to represent a single file, we are only
+// keeping 43 bits of the 128-bit (base) cache key.  With file size of 2**25
+// contiguous keys (pessimistic), our simulation is about 2\*\*(128-43-25) or
+// about 1 billion billion times more prone to collision than reality.
+//
+// More default assumptions, relatively pessimistic:
+// * 100 DBs in same process (doesn't matter much)
+// * Re-open DB in same process (new session ID related to old session ID) on
+// average every 100 files generated
+// * Restart process (all new session IDs unrelated to old) 24 times per day
+//
+// After enough data, we get a result at the end (-sck_keep_bits=43):
+//
+//   (keep 43 bits)  18 collisions after 2 x 90 days, est 10 days between
+//                   (1.15292e+19 corrected)
+//
+// If we believe the (pessimistic) simulation and the mathematical
+// extrapolation, we would need to run a billion machines all for 11 billion
+// days to expect a cache key collision. To help verify that our extrapolation
+// ("corrected") is robust, we can make our simulation more precise by
+// increasing the "keep" bits, which takes more running time to get enough
+// collision data:
+//
+//   (keep 44 bits)  16 collisions after 5 x 90 days, est 28.125 days between
+//                   (1.6213e+19 corrected)
+//   (keep 45 bits)  15 collisions after 7 x 90 days, est 42 days between
+//                   (1.21057e+19 corrected)
+//   (keep 46 bits)  15 collisions after 17 x 90 days, est 102 days between
+//                   (1.46997e+19 corrected)
+//   (keep 47 bits)  15 collisions after 49 x 90 days, est 294 days between
+//                   (2.11849e+19 corrected)
+//
+// The extrapolated prediction seems to be within noise (sampling error).
+//
+// With the `-sck_randomize` option, we can see that typical workloads like
+// above have lower collision probability than "random" cache keys (note:
+// offsets still non-randomized) by a modest amount (roughly 2-3x less
+// collision prone than random), which should make us reasonably comfortable
+// even in "degenerate" cases (e.g. repeatedly launch a process to generate
+// one file with SstFileWriter):
+//
+//   (rand 43 bits) 22 collisions after 1 x 90 days, est 4.09091 days between
+//                  (4.7165e+18 corrected)
+//
+// We can see that with more frequent process restarts,
+// -sck_restarts_per_day=5000, which means more all-new session IDs, we get
+// closer to the "random" cache key performance:
+//
+// 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected)
+//
+// And with less frequent process restarts and re-opens,
+// -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision
+// probability:
+//
+// 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected)
+//
+// Other tests have been run to validate other conditions behave as expected,
+// never behaving "worse than random" unless we start chopping off structured
+// data.
+//
+// Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
+// that only arise when a new process is started, the chance of any cache key
+// collisions in a giant fleet of machines is negligible. Especially when
+// processes live for hours or days, the chance of a cache key collision is
+// likely more plausibly due to bad hardware than to bad luck in random
+// session ID data. Software defects are surely more likely to cause corruption
+// than both of those.
+//
+// TODO: Nevertheless / regardless, an efficient way to detect (and thus
+// quantify) block cache corruptions, including collisions, should be added.
+OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
+                                       const std::string &db_session_id,
+                                       uint64_t file_number) {
+  UniqueId64x2 internal_id;
+  Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
+                                    &internal_id, /*force=*/true);
+  assert(s.ok());
+  *this = FromInternalUniqueId(&internal_id);
+}
+
+OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) {
+  uint64_t session_lower = id.ptr[0];
+  uint64_t file_num_etc = id.ptr[1];
+
+#ifndef NDEBUG
+  bool is_empty = session_lower == 0 && file_num_etc == 0;
+#endif
+
+  // Although DBImpl guarantees (in recent versions) that session_lower is not
+  // zero, that's not entirely sufficient to guarantee that file_num_etc64_ is
+  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
+  // However, if we are given an "empty" id as input, then we should produce
+  // "empty" as output.
+  // As a consequence, this function is only bijective assuming
+  // id[0] == 0 only if id[1] == 0.
+  if (session_lower == 0U) {
+    session_lower = file_num_etc;
+  }
+
+  // See comments above for how DownwardInvolution and ReverseBits
+  // make this function invertible under various assumptions.
+  OffsetableCacheKey rv;
+  rv.file_num_etc64_ =
+      DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc);
+  rv.offset_etc64_ = ReverseBits(session_lower);
+
+  // Because of these transformations and needing to allow arbitrary
+  // offset (thus, second 64 bits of cache key might be 0), we need to
+  // make some correction to ensure the first 64 bits is not 0.
+  // Fortunately, the transformation ensures the second 64 bits is not 0
+  // for non-empty base key, so we can swap in the case one is 0 without
+  // breaking bijectivity (assuming condition above).
+  assert(is_empty || rv.offset_etc64_ > 0);
+  if (rv.file_num_etc64_ == 0) {
+    std::swap(rv.file_num_etc64_, rv.offset_etc64_);
+  }
+  assert(is_empty || rv.file_num_etc64_ > 0);
+  return rv;
+}
+
+// Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if
+// offset_etc64 == 0)
+UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() {
+  uint64_t a = file_num_etc64_;
+  uint64_t b = offset_etc64_;
+  if (b == 0) {
+    std::swap(a, b);
+  }
+  UniqueId64x2 rv;
+  rv[0] = ReverseBits(b);
+  rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0]));
+  return rv;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_key.h b/src/rocksdb/cache/cache_key.h
new file mode 100644
index 000000000..0b93c6bd9
--- /dev/null
+++ b/src/rocksdb/cache/cache_key.h
@@ -0,0 +1,143 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "table/unique_id_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+
+// A standard holder for fixed-size block cache keys (and for related caches).
+// They are created through one of these, each using its own range of values:
+// * CacheKey::CreateUniqueForCacheLifetime
+// * CacheKey::CreateUniqueForProcessLifetime
+// * Default ctor ("empty" cache key)
+// * OffsetableCacheKey->WithOffset
+//
+// The first two use atomic counters to guarantee uniqueness over the given
+// lifetime and the last uses a form of universally unique identifier for
+// uniqueness with very high probabilty (and guaranteed for files generated
+// during a single process lifetime).
+//
+// CacheKeys are currently used by calling AsSlice() to pass as a key to
+// Cache. For performance, the keys are endianness-dependent (though otherwise
+// portable). (Persistable cache entries are not intended to cross platforms.)
+class CacheKey {
+ public:
+  // For convenience, constructs an "empty" cache key that is never returned
+  // by other means.
+  inline CacheKey() : file_num_etc64_(), offset_etc64_() {}
+
+  inline bool IsEmpty() const {
+    return (file_num_etc64_ == 0) & (offset_etc64_ == 0);
+  }
+
+  // Use this cache key as a Slice (byte order is endianness-dependent)
+  inline Slice AsSlice() const {
+    static_assert(sizeof(*this) == 16, "Standardized on 16-byte cache key");
+    assert(!IsEmpty());
+    return Slice(reinterpret_cast<const char *>(this), sizeof(*this));
+  }
+
+  // Create a CacheKey that is unique among others associated with this Cache
+  // instance. Depends on Cache::NewId. This is useful for block cache
+  // "reservations".
+  static CacheKey CreateUniqueForCacheLifetime(Cache *cache);
+
+  // Create a CacheKey that is unique among others for the lifetime of this
+  // process. This is useful for saving in a static data member so that
+  // different DB instances can agree on a cache key for shared entities,
+  // such as for CacheEntryStatsCollector.
+  static CacheKey CreateUniqueForProcessLifetime();
+
+ protected:
+  friend class OffsetableCacheKey;
+  CacheKey(uint64_t file_num_etc64, uint64_t offset_etc64)
+      : file_num_etc64_(file_num_etc64), offset_etc64_(offset_etc64) {}
+  uint64_t file_num_etc64_;
+  uint64_t offset_etc64_;
+};
+
+constexpr uint8_t kCacheKeySize = static_cast<uint8_t>(sizeof(CacheKey));
+
+// A file-specific generator of cache keys, sometimes referred to as the
+// "base" cache key for a file because all the cache keys for various offsets
+// within the file are computed using simple arithmetic. The basis for the
+// general approach is dicussed here: https://github.com/pdillinger/unique_id
+// Heavily related to GetUniqueIdFromTableProperties.
+//
+// If the db_id, db_session_id, and file_number come from the file's table
+// properties, then the keys will be stable across DB::Open/Close, backup/
+// restore, import/export, etc.
+//
+// This class "is a" CacheKey only privately so that it is not misused as
+// a ready-to-use CacheKey.
+class OffsetableCacheKey : private CacheKey {
+ public:
+  // For convenience, constructs an "empty" cache key that should not be used.
+  inline OffsetableCacheKey() : CacheKey() {}
+
+  // Constructs an OffsetableCacheKey with the given information about a file.
+  // This constructor never generates an "empty" base key.
+  OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id,
+                     uint64_t file_number);
+
+  // Creates an OffsetableCacheKey from an SST unique ID, so that cache keys
+  // can be derived from DB manifest data before reading the file from
+  // storage--so that every part of the file can potentially go in a persistent
+  // cache.
+  //
+  // Calling GetSstInternalUniqueId() on a db_id, db_session_id, and
+  // file_number and passing the result to this function produces the same
+  // base cache key as feeding those inputs directly to the constructor.
+  //
+  // This is a bijective transformation assuming either id is empty or
+  // lower 64 bits is non-zero:
+  // * Empty (all zeros) input -> empty (all zeros) output
+  // * Lower 64 input is non-zero -> lower 64 output (file_num_etc64_) is
+  //   non-zero
+  static OffsetableCacheKey FromInternalUniqueId(UniqueIdPtr id);
+
+  // This is the inverse transformation to the above, assuming either empty
+  // or lower 64 bits (file_num_etc64_) is non-zero. Perhaps only useful for
+  // testing.
+  UniqueId64x2 ToInternalUniqueId();
+
+  inline bool IsEmpty() const {
+    bool result = file_num_etc64_ == 0;
+    assert(!(offset_etc64_ > 0 && result));
+    return result;
+  }
+
+  // Construct a CacheKey for an offset within a file. An offset is not
+  // necessarily a byte offset if a smaller unique identifier of keyable
+  // offsets is used.
+  //
+  // This class was designed to make this hot code extremely fast.
+  inline CacheKey WithOffset(uint64_t offset) const {
+    assert(!IsEmpty());
+    return CacheKey(file_num_etc64_, offset_etc64_ ^ offset);
+  }
+
+  // The "common prefix" is a shared prefix for all the returned CacheKeys.
+  // It is specific to the file but the same for all offsets within the file.
+  static constexpr size_t kCommonPrefixSize = 8;
+  inline Slice CommonPrefixSlice() const {
+    static_assert(sizeof(file_num_etc64_) == kCommonPrefixSize,
+                  "8 byte common prefix expected");
+    assert(!IsEmpty());
+    assert(&this->file_num_etc64_ == static_cast<const void *>(this));
+
+    return Slice(reinterpret_cast<const char *>(this), kCommonPrefixSize);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_reservation_manager.cc b/src/rocksdb/cache/cache_reservation_manager.cc
new file mode 100644
index 000000000..53dee5d79
--- /dev/null
+++ b/src/rocksdb/cache/cache_reservation_manager.cc
@@ -0,0 +1,185 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "cache/cache_reservation_manager.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+
+#include "cache/cache_entry_roles.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "table/block_based/reader_common.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <CacheEntryRole R>
+CacheReservationManagerImpl<R>::CacheReservationHandle::CacheReservationHandle(
+    std::size_t incremental_memory_used,
+    std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr)
+    : incremental_memory_used_(incremental_memory_used) {
+  assert(cache_res_mgr);
+  cache_res_mgr_ = cache_res_mgr;
+}
+
+template <CacheEntryRole R>
+CacheReservationManagerImpl<
+    R>::CacheReservationHandle::~CacheReservationHandle() {
+  Status s = cache_res_mgr_->ReleaseCacheReservation(incremental_memory_used_);
+  s.PermitUncheckedError();
+}
+
+template <CacheEntryRole R>
+CacheReservationManagerImpl<R>::CacheReservationManagerImpl(
+    std::shared_ptr<Cache> cache, bool delayed_decrease)
+    : delayed_decrease_(delayed_decrease),
+      cache_allocated_size_(0),
+      memory_used_(0) {
+  assert(cache != nullptr);
+  cache_ = cache;
+}
+
+template <CacheEntryRole R>
+CacheReservationManagerImpl<R>::~CacheReservationManagerImpl() {
+  for (auto* handle : dummy_handles_) {
+    cache_->Release(handle, true);
+  }
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::UpdateCacheReservation(
+    std::size_t new_mem_used) {
+  memory_used_ = new_mem_used;
+  std::size_t cur_cache_allocated_size =
+      cache_allocated_size_.load(std::memory_order_relaxed);
+  if (new_mem_used == cur_cache_allocated_size) {
+    return Status::OK();
+  } else if (new_mem_used > cur_cache_allocated_size) {
+    Status s = IncreaseCacheReservation(new_mem_used);
+    return s;
+  } else {
+    // In delayed decrease mode, we don't decrease cache reservation
+    // untill the memory usage is less than 3/4 of what we reserve
+    // in the cache.
+    // We do this because
+    // (1) Dummy entry insertion is expensive in block cache
+    // (2) Delayed releasing previously inserted dummy entries can save such
+    // expensive dummy entry insertion on memory increase in the near future,
+    // which is likely to happen when the memory usage is greater than or equal
+    // to 3/4 of what we reserve
+    if (delayed_decrease_ && new_mem_used >= cur_cache_allocated_size / 4 * 3) {
+      return Status::OK();
+    } else {
+      Status s = DecreaseCacheReservation(new_mem_used);
+      return s;
+    }
+  }
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::MakeCacheReservation(
+    std::size_t incremental_memory_used,
+    std::unique_ptr<CacheReservationManager::CacheReservationHandle>* handle) {
+  assert(handle);
+  Status s =
+      UpdateCacheReservation(GetTotalMemoryUsed() + incremental_memory_used);
+  (*handle).reset(new CacheReservationManagerImpl::CacheReservationHandle(
+      incremental_memory_used,
+      std::enable_shared_from_this<
+          CacheReservationManagerImpl<R>>::shared_from_this()));
+  return s;
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::ReleaseCacheReservation(
+    std::size_t incremental_memory_used) {
+  assert(GetTotalMemoryUsed() >= incremental_memory_used);
+  std::size_t updated_total_mem_used =
+      GetTotalMemoryUsed() - incremental_memory_used;
+  Status s = UpdateCacheReservation(updated_total_mem_used);
+  return s;
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::IncreaseCacheReservation(
+    std::size_t new_mem_used) {
+  Status return_status = Status::OK();
+  while (new_mem_used > cache_allocated_size_.load(std::memory_order_relaxed)) {
+    Cache::Handle* handle = nullptr;
+    return_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry,
+                                   GetNoopDeleterForRole<R>(), &handle);
+
+    if (return_status != Status::OK()) {
+      return return_status;
+    }
+
+    dummy_handles_.push_back(handle);
+    cache_allocated_size_ += kSizeDummyEntry;
+  }
+  return return_status;
+}
+
+template <CacheEntryRole R>
+Status CacheReservationManagerImpl<R>::DecreaseCacheReservation(
+    std::size_t new_mem_used) {
+  Status return_status = Status::OK();
+
+  // Decrease to the smallest multiple of kSizeDummyEntry that is greater than
+  // or equal to new_mem_used We do addition instead of new_mem_used <=
+  // cache_allocated_size_.load(std::memory_order_relaxed) - kSizeDummyEntry to
+  // avoid underflow of size_t when cache_allocated_size_ = 0
+  while (new_mem_used + kSizeDummyEntry <=
+         cache_allocated_size_.load(std::memory_order_relaxed)) {
+    assert(!dummy_handles_.empty());
+    auto* handle = dummy_handles_.back();
+    cache_->Release(handle, true);
+    dummy_handles_.pop_back();
+    cache_allocated_size_ -= kSizeDummyEntry;
+  }
+  return return_status;
+}
+
+template <CacheEntryRole R>
+std::size_t CacheReservationManagerImpl<R>::GetTotalReservedCacheSize() {
+  return cache_allocated_size_.load(std::memory_order_relaxed);
+}
+
+template <CacheEntryRole R>
+std::size_t CacheReservationManagerImpl<R>::GetTotalMemoryUsed() {
+  return memory_used_;
+}
+
+template <CacheEntryRole R>
+Slice CacheReservationManagerImpl<R>::GetNextCacheKey() {
+  // Calling this function will have the side-effect of changing the
+  // underlying cache_key_ that is shared among other keys generated from this
+  // fucntion. Therefore please make sure the previous keys are saved/copied
+  // before calling this function.
+  cache_key_ = CacheKey::CreateUniqueForCacheLifetime(cache_.get());
+  return cache_key_.AsSlice();
+}
+
+template <CacheEntryRole R>
+Cache::DeleterFn CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole() {
+  return GetNoopDeleterForRole<R>();
+}
+
+template class CacheReservationManagerImpl<
+    CacheEntryRole::kBlockBasedTableReader>;
+template class CacheReservationManagerImpl<
+    CacheEntryRole::kCompressionDictionaryBuildingBuffer>;
+template class CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>;
+template class CacheReservationManagerImpl<CacheEntryRole::kMisc>;
+template class CacheReservationManagerImpl<CacheEntryRole::kWriteBuffer>;
+template class CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>;
+template class CacheReservationManagerImpl<CacheEntryRole::kBlobCache>;
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_reservation_manager.h b/src/rocksdb/cache/cache_reservation_manager.h
new file mode 100644
index 000000000..147aaa915
--- /dev/null
+++ b/src/rocksdb/cache/cache_reservation_manager.h
@@ -0,0 +1,316 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+// CacheReservationManager is an interface for reserving cache space for the
+// memory used
+class CacheReservationManager {
+ public:
+  // CacheReservationHandle is for managing the lifetime of a cache reservation
+  // for an incremental amount of memory used (i.e, incremental_memory_used)
+  class CacheReservationHandle {
+   public:
+    virtual ~CacheReservationHandle() {}
+  };
+  virtual ~CacheReservationManager() {}
+  virtual Status UpdateCacheReservation(std::size_t new_memory_used) = 0;
+  // TODO(hx235): replace the usage of
+  // `UpdateCacheReservation(memory_used_delta, increase)` with
+  // `UpdateCacheReservation(new_memory_used)` so that we only have one
+  // `UpdateCacheReservation` function
+  virtual Status UpdateCacheReservation(std::size_t memory_used_delta,
+                                        bool increase) = 0;
+  virtual Status MakeCacheReservation(
+      std::size_t incremental_memory_used,
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+          *handle) = 0;
+  virtual std::size_t GetTotalReservedCacheSize() = 0;
+  virtual std::size_t GetTotalMemoryUsed() = 0;
+};
+
+// CacheReservationManagerImpl implements interface CacheReservationManager
+// for reserving cache space for the memory used by inserting/releasing dummy
+// entries in the cache.
+//
+// This class is NOT thread-safe, except that GetTotalReservedCacheSize()
+// can be called without external synchronization.
+template <CacheEntryRole R>
+class CacheReservationManagerImpl
+    : public CacheReservationManager,
+      public std::enable_shared_from_this<CacheReservationManagerImpl<R>> {
+ public:
+  class CacheReservationHandle
+      : public CacheReservationManager::CacheReservationHandle {
+   public:
+    CacheReservationHandle(
+        std::size_t incremental_memory_used,
+        std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr);
+    ~CacheReservationHandle() override;
+
+   private:
+    std::size_t incremental_memory_used_;
+    std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr_;
+  };
+
+  // Construct a CacheReservationManagerImpl
+  // @param cache The cache where dummy entries are inserted and released for
+  // reserving cache space
+  // @param delayed_decrease If set true, then dummy entries won't be released
+  //                         immediately when memory usage decreases.
+  //                         Instead, it will be released when the memory usage
+  //                         decreases to 3/4 of what we have reserved so far.
+  //                         This is for saving some future dummy entry
+  //                         insertion when memory usage increases are likely to
+  //                         happen in the near future.
+  //
+  // REQUIRED: cache is not nullptr
+  explicit CacheReservationManagerImpl(std::shared_ptr<Cache> cache,
+                                       bool delayed_decrease = false);
+
+  // no copy constructor, copy assignment, move constructor, move assignment
+  CacheReservationManagerImpl(const CacheReservationManagerImpl &) = delete;
+  CacheReservationManagerImpl &operator=(const CacheReservationManagerImpl &) =
+      delete;
+  CacheReservationManagerImpl(CacheReservationManagerImpl &&) = delete;
+  CacheReservationManagerImpl &operator=(CacheReservationManagerImpl &&) =
+      delete;
+
+  ~CacheReservationManagerImpl() override;
+
+  // One of the two ways of reserving/releasing cache space,
+  // see MakeCacheReservation() for the other.
+  //
+  // Use ONLY one of these two ways to prevent unexpected behavior.
+  //
+  // Insert and release dummy entries in the cache to
+  // match the size of total dummy entries with the least multiple of
+  // kSizeDummyEntry greater than or equal to new_mem_used
+  //
+  // Insert dummy entries if new_memory_used > cache_allocated_size_;
+  //
+  // Release dummy entries if new_memory_used < cache_allocated_size_
+  // (and new_memory_used < cache_allocated_size_ * 3/4
+  // when delayed_decrease is set true);
+  //
+  // Keey dummy entries the same if (1) new_memory_used == cache_allocated_size_
+  // or (2) new_memory_used is in the interval of
+  // [cache_allocated_size_ * 3/4, cache_allocated_size) when delayed_decrease
+  // is set true.
+  //
+  // @param new_memory_used The number of bytes used by new memory
+  //        The most recent new_memoy_used passed in will be returned
+  //        in GetTotalMemoryUsed() even when the call return non-ok status.
+  //
+  //        Since the class is NOT thread-safe, external synchronization on the
+  //        order of calling UpdateCacheReservation() is needed if you want
+  //        GetTotalMemoryUsed() indeed returns the latest memory used.
+  //
+  // @return On inserting dummy entries, it returns Status::OK() if all dummy
+  //         entry insertions succeed.
+  //         Otherwise, it returns the first non-ok status;
+  //         On releasing dummy entries, it always returns Status::OK().
+  //         On keeping dummy entries the same, it always returns Status::OK().
+  Status UpdateCacheReservation(std::size_t new_memory_used) override;
+
+  Status UpdateCacheReservation(std::size_t /* memory_used_delta */,
+                                bool /* increase */) override {
+    return Status::NotSupported();
+  }
+
+  // One of the two ways of reserving cache space and releasing is done through
+  // destruction of CacheReservationHandle.
+  // See UpdateCacheReservation() for the other way.
+  //
+  // Use ONLY one of these two ways to prevent unexpected behavior.
+  //
+  // Insert dummy entries in the cache for the incremental memory usage
+  // to match the size of total dummy entries with the least multiple of
+  // kSizeDummyEntry greater than or equal to the total memory used.
+  //
+  // A CacheReservationHandle is returned as an output parameter.
+  // The reserved dummy entries are automatically released on the destruction of
+  // this handle, which achieves better RAII per cache reservation.
+  //
+  // WARNING: Deallocate all the handles of the CacheReservationManager object
+  //          before deallocating the object to prevent unexpected behavior.
+  //
+  // @param incremental_memory_used The number of bytes increased in memory
+  //        usage.
+  //
+  //        Calling GetTotalMemoryUsed() afterward will return the total memory
+  //        increased by this number, even when calling MakeCacheReservation()
+  //        returns non-ok status.
+  //
+  //        Since the class is NOT thread-safe, external synchronization in
+  //        calling MakeCacheReservation() is needed if you want
+  //        GetTotalMemoryUsed() indeed returns the latest memory used.
+  //
+  // @param handle An pointer to std::unique_ptr<CacheReservationHandle> that
+  //        manages the lifetime of the cache reservation represented by the
+  //        handle.
+  //
+  // @return It returns Status::OK() if all dummy
+  //         entry insertions succeed.
+  //         Otherwise, it returns the first non-ok status;
+  //
+  // REQUIRES: handle != nullptr
+  Status MakeCacheReservation(
+      std::size_t incremental_memory_used,
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
+      override;
+
+  // Return the size of the cache (which is a multiple of kSizeDummyEntry)
+  // successfully reserved by calling UpdateCacheReservation().
+  //
+  // When UpdateCacheReservation() returns non-ok status,
+  // calling GetTotalReservedCacheSize() after that might return a slightly
+  // smaller number than the actual reserved cache size due to
+  // the returned number will always be a multiple of kSizeDummyEntry
+  // and cache full might happen in the middle of inserting a dummy entry.
+  std::size_t GetTotalReservedCacheSize() override;
+
+  // Return the latest total memory used indicated by the most recent call of
+  // UpdateCacheReservation(std::size_t new_memory_used);
+  std::size_t GetTotalMemoryUsed() override;
+
+  static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; }
+
+  // For testing only - it is to help ensure the NoopDeleterForRole<R>
+  // accessed from CacheReservationManagerImpl and the one accessed from the
+  // test are from the same translation units
+  static Cache::DeleterFn TEST_GetNoopDeleterForRole();
+
+ private:
+  static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
+
+  Slice GetNextCacheKey();
+
+  Status ReleaseCacheReservation(std::size_t incremental_memory_used);
+  Status IncreaseCacheReservation(std::size_t new_mem_used);
+  Status DecreaseCacheReservation(std::size_t new_mem_used);
+
+  std::shared_ptr<Cache> cache_;
+  bool delayed_decrease_;
+  std::atomic<std::size_t> cache_allocated_size_;
+  std::size_t memory_used_;
+  std::vector<Cache::Handle *> dummy_handles_;
+  CacheKey cache_key_;
+};
+
+class ConcurrentCacheReservationManager
+    : public CacheReservationManager,
+      public std::enable_shared_from_this<ConcurrentCacheReservationManager> {
+ public:
+  class CacheReservationHandle
+      : public CacheReservationManager::CacheReservationHandle {
+   public:
+    CacheReservationHandle(
+        std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr,
+        std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+            cache_res_handle) {
+      assert(cache_res_mgr && cache_res_handle);
+      cache_res_mgr_ = cache_res_mgr;
+      cache_res_handle_ = std::move(cache_res_handle);
+    }
+
+    ~CacheReservationHandle() override {
+      std::lock_guard<std::mutex> lock(cache_res_mgr_->cache_res_mgr_mu_);
+      cache_res_handle_.reset();
+    }
+
+   private:
+    std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
+    std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+        cache_res_handle_;
+  };
+
+  explicit ConcurrentCacheReservationManager(
+      std::shared_ptr<CacheReservationManager> cache_res_mgr) {
+    cache_res_mgr_ = std::move(cache_res_mgr);
+  }
+  ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager &) =
+      delete;
+  ConcurrentCacheReservationManager &operator=(
+      const ConcurrentCacheReservationManager &) = delete;
+  ConcurrentCacheReservationManager(ConcurrentCacheReservationManager &&) =
+      delete;
+  ConcurrentCacheReservationManager &operator=(
+      ConcurrentCacheReservationManager &&) = delete;
+
+  ~ConcurrentCacheReservationManager() override {}
+
+  inline Status UpdateCacheReservation(std::size_t new_memory_used) override {
+    std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+    return cache_res_mgr_->UpdateCacheReservation(new_memory_used);
+  }
+
+  inline Status UpdateCacheReservation(std::size_t memory_used_delta,
+                                       bool increase) override {
+    std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+    std::size_t total_mem_used = cache_res_mgr_->GetTotalMemoryUsed();
+    Status s;
+    if (!increase) {
+      assert(total_mem_used >= memory_used_delta);
+      s = cache_res_mgr_->UpdateCacheReservation(total_mem_used -
+                                                 memory_used_delta);
+    } else {
+      s = cache_res_mgr_->UpdateCacheReservation(total_mem_used +
+                                                 memory_used_delta);
+    }
+    return s;
+  }
+
+  inline Status MakeCacheReservation(
+      std::size_t incremental_memory_used,
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
+      override {
+    std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+        wrapped_handle;
+    Status s;
+    {
+      std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+      s = cache_res_mgr_->MakeCacheReservation(incremental_memory_used,
+                                               &wrapped_handle);
+    }
+    (*handle).reset(
+        new ConcurrentCacheReservationManager::CacheReservationHandle(
+            std::enable_shared_from_this<
+                ConcurrentCacheReservationManager>::shared_from_this(),
+            std::move(wrapped_handle)));
+    return s;
+  }
+  inline std::size_t GetTotalReservedCacheSize() override {
+    return cache_res_mgr_->GetTotalReservedCacheSize();
+  }
+  inline std::size_t GetTotalMemoryUsed() override {
+    std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+    return cache_res_mgr_->GetTotalMemoryUsed();
+  }
+
+ private:
+  std::mutex cache_res_mgr_mu_;
+  std::shared_ptr<CacheReservationManager> cache_res_mgr_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/cache_reservation_manager_test.cc b/src/rocksdb/cache/cache_reservation_manager_test.cc
new file mode 100644
index 000000000..2a0c318e0
--- /dev/null
+++ b/src/rocksdb/cache/cache_reservation_manager_test.cc
@@ -0,0 +1,469 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "cache/cache_reservation_manager.h"
+
+#include <cstddef>
+#include <cstring>
+#include <memory>
+
+#include "cache/cache_entry_roles.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+class CacheReservationManagerTest : public ::testing::Test {
+ protected:
+  static constexpr std::size_t kSizeDummyEntry =
+      CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
+  static constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry;
+  static constexpr int kNumShardBits = 0;  // 2^0 shard
+  static constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+  std::shared_ptr<Cache> cache = NewLRUCache(kCacheCapacity, kNumShardBits);
+  std::shared_ptr<CacheReservationManager> test_cache_rev_mng;
+
+  CacheReservationManagerTest() {
+    test_cache_rev_mng =
+        std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+            cache);
+  }
+};
+
+TEST_F(CacheReservationManagerTest, GenerateCacheKey) {
+  std::size_t new_mem_used = 1 * kSizeDummyEntry;
+  Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  // Next unique Cache key
+  CacheKey ckey = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  // Get to the underlying values
+  uint64_t* ckey_data = reinterpret_cast<uint64_t*>(&ckey);
+  // Back it up to the one used by CRM (using CacheKey implementation details)
+  ckey_data[1]--;
+
+  // Specific key (subject to implementation details)
+  EXPECT_EQ(ckey_data[0], 0);
+  EXPECT_EQ(ckey_data[1], 2);
+
+  Cache::Handle* handle = cache->Lookup(ckey.AsSlice());
+  EXPECT_NE(handle, nullptr)
+      << "Failed to generate the cache key for the dummy entry correctly";
+  // Clean up the returned handle from Lookup() to prevent memory leak
+  cache->Release(handle);
+}
+
+TEST_F(CacheReservationManagerTest, KeepCacheReservationTheSame) {
+  std::size_t new_mem_used = 1 * kSizeDummyEntry;
+  Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            1 * kSizeDummyEntry);
+  ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
+  std::size_t initial_pinned_usage = cache->GetPinnedUsage();
+  ASSERT_GE(initial_pinned_usage, 1 * kSizeDummyEntry);
+  ASSERT_LT(initial_pinned_usage,
+            1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to keep cache reservation the same when new_mem_used equals "
+         "to current cache reservation";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            1 * kSizeDummyEntry)
+      << "Failed to bookkeep correctly when new_mem_used equals to current "
+         "cache reservation";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly when new_mem_used "
+         "equals to current cache reservation";
+  EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage)
+      << "Failed to keep underlying dummy entries the same when new_mem_used "
+         "equals to current cache reservation";
+}
+
+TEST_F(CacheReservationManagerTest,
+       IncreaseCacheReservationByMultiplesOfDummyEntrySize) {
+  std::size_t new_mem_used = 2 * kSizeDummyEntry;
+  Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to increase cache reservation correctly";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            2 * kSizeDummyEntry)
+      << "Failed to bookkeep cache reservation increase correctly";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry)
+      << "Failed to increase underlying dummy entries in cache correctly";
+  EXPECT_LT(cache->GetPinnedUsage(),
+            2 * kSizeDummyEntry + kMetaDataChargeOverhead)
+      << "Failed to increase underlying dummy entries in cache correctly";
+}
+
+TEST_F(CacheReservationManagerTest,
+       IncreaseCacheReservationNotByMultiplesOfDummyEntrySize) {
+  std::size_t new_mem_used = 2 * kSizeDummyEntry + kSizeDummyEntry / 2;
+  Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to increase cache reservation correctly";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            3 * kSizeDummyEntry)
+      << "Failed to bookkeep cache reservation increase correctly";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 3 * kSizeDummyEntry)
+      << "Failed to increase underlying dummy entries in cache correctly";
+  EXPECT_LT(cache->GetPinnedUsage(),
+            3 * kSizeDummyEntry + kMetaDataChargeOverhead)
+      << "Failed to increase underlying dummy entries in cache correctly";
+}
+
+TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest,
+     IncreaseCacheReservationOnFullCache) {
+  ;
+  constexpr std::size_t kSizeDummyEntry =
+      CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
+  constexpr std::size_t kSmallCacheCapacity = 4 * kSizeDummyEntry;
+  constexpr std::size_t kBigCacheCapacity = 4096 * kSizeDummyEntry;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+  LRUCacheOptions lo;
+  lo.capacity = kSmallCacheCapacity;
+  lo.num_shard_bits = 0;  // 2^0 shard
+  lo.strict_capacity_limit = true;
+  std::shared_ptr<Cache> cache = NewLRUCache(lo);
+  std::shared_ptr<CacheReservationManager> test_cache_rev_mng =
+      std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+          cache);
+
+  std::size_t new_mem_used = kSmallCacheCapacity + 1;
+  Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::MemoryLimit())
+      << "Failed to return status to indicate failure of dummy entry insertion "
+         "during cache reservation on full cache";
+  EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            1 * kSizeDummyEntry)
+      << "Failed to bookkeep correctly before cache resevation failure happens "
+         "due to full cache";
+  EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            kSmallCacheCapacity)
+      << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy "
+         "entry insertions) when encountering cache resevation failure due to "
+         "full cache";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
+      << "Failed to insert underlying dummy entries correctly when "
+         "encountering cache resevation failure due to full cache";
+  EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity)
+      << "Failed to insert underlying dummy entries correctly when "
+         "encountering cache resevation failure due to full cache";
+
+  new_mem_used = kSmallCacheCapacity / 2;  // 2 dummy entries
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to decrease cache reservation after encountering cache "
+         "reservation failure due to full cache";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            2 * kSizeDummyEntry)
+      << "Failed to bookkeep cache reservation decrease correctly after "
+         "encountering cache reservation due to full cache";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry)
+      << "Failed to release underlying dummy entries correctly on cache "
+         "reservation decrease after encountering cache resevation failure due "
+         "to full cache";
+  EXPECT_LT(cache->GetPinnedUsage(),
+            2 * kSizeDummyEntry + kMetaDataChargeOverhead)
+      << "Failed to release underlying dummy entries correctly on cache "
+         "reservation decrease after encountering cache resevation failure due "
+         "to full cache";
+
+  // Create cache full again for subsequent tests
+  new_mem_used = kSmallCacheCapacity + 1;
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::MemoryLimit())
+      << "Failed to return status to indicate failure of dummy entry insertion "
+         "during cache reservation on full cache";
+  EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            1 * kSizeDummyEntry)
+      << "Failed to bookkeep correctly before cache resevation failure happens "
+         "due to full cache";
+  EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            kSmallCacheCapacity)
+      << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy "
+         "entry insertions) when encountering cache resevation failure due to "
+         "full cache";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
+      << "Failed to insert underlying dummy entries correctly when "
+         "encountering cache resevation failure due to full cache";
+  EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity)
+      << "Failed to insert underlying dummy entries correctly when "
+         "encountering cache resevation failure due to full cache";
+
+  // Increase cache capacity so the previously failed insertion can fully
+  // succeed
+  cache->SetCapacity(kBigCacheCapacity);
+  new_mem_used = kSmallCacheCapacity + 1;
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to increase cache reservation after increasing cache capacity "
+         "and mitigating cache full error";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            5 * kSizeDummyEntry)
+      << "Failed to bookkeep cache reservation increase correctly after "
+         "increasing cache capacity and mitigating cache full error";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 5 * kSizeDummyEntry)
+      << "Failed to insert underlying dummy entries correctly after increasing "
+         "cache capacity and mitigating cache full error";
+  EXPECT_LT(cache->GetPinnedUsage(),
+            5 * kSizeDummyEntry + kMetaDataChargeOverhead)
+      << "Failed to insert underlying dummy entries correctly after increasing "
+         "cache capacity and mitigating cache full error";
+}
+
+TEST_F(CacheReservationManagerTest,
+       DecreaseCacheReservationByMultiplesOfDummyEntrySize) {
+  std::size_t new_mem_used = 2 * kSizeDummyEntry;
+  Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            2 * kSizeDummyEntry);
+  ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
+  ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            2 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  new_mem_used = 1 * kSizeDummyEntry;
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to decrease cache reservation correctly";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            1 * kSizeDummyEntry)
+      << "Failed to bookkeep cache reservation decrease correctly";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
+      << "Failed to decrease underlying dummy entries in cache correctly";
+  EXPECT_LT(cache->GetPinnedUsage(),
+            1 * kSizeDummyEntry + kMetaDataChargeOverhead)
+      << "Failed to decrease underlying dummy entries in cache correctly";
+}
+
+TEST_F(CacheReservationManagerTest,
+       DecreaseCacheReservationNotByMultiplesOfDummyEntrySize) {
+  std::size_t new_mem_used = 2 * kSizeDummyEntry;
+  Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            2 * kSizeDummyEntry);
+  ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
+  ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            2 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  new_mem_used = kSizeDummyEntry / 2;
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to decrease cache reservation correctly";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            1 * kSizeDummyEntry)
+      << "Failed to bookkeep cache reservation decrease correctly";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
+      << "Failed to decrease underlying dummy entries in cache correctly";
+  EXPECT_LT(cache->GetPinnedUsage(),
+            1 * kSizeDummyEntry + kMetaDataChargeOverhead)
+      << "Failed to decrease underlying dummy entries in cache correctly";
+}
+
+TEST(CacheReservationManagerWithDelayedDecreaseTest,
+     DecreaseCacheReservationWithDelayedDecrease) {
+  constexpr std::size_t kSizeDummyEntry =
+      CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
+  constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+  LRUCacheOptions lo;
+  lo.capacity = kCacheCapacity;
+  lo.num_shard_bits = 0;
+  std::shared_ptr<Cache> cache = NewLRUCache(lo);
+  std::shared_ptr<CacheReservationManager> test_cache_rev_mng =
+      std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+          cache, true /* delayed_decrease */);
+
+  std::size_t new_mem_used = 8 * kSizeDummyEntry;
+  Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            8 * kSizeDummyEntry);
+  ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
+  std::size_t initial_pinned_usage = cache->GetPinnedUsage();
+  ASSERT_GE(initial_pinned_usage, 8 * kSizeDummyEntry);
+  ASSERT_LT(initial_pinned_usage,
+            8 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  new_mem_used = 6 * kSizeDummyEntry;
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            8 * kSizeDummyEntry)
+      << "Failed to bookkeep correctly when delaying cache reservation "
+         "decrease";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage)
+      << "Failed to delay decreasing underlying dummy entries in cache";
+
+  new_mem_used = 7 * kSizeDummyEntry;
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            8 * kSizeDummyEntry)
+      << "Failed to bookkeep correctly when delaying cache reservation "
+         "decrease";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage)
+      << "Failed to delay decreasing underlying dummy entries in cache";
+
+  new_mem_used = 6 * kSizeDummyEntry - 1;
+  s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+  EXPECT_EQ(s, Status::OK())
+      << "Failed to decrease cache reservation correctly when new_mem_used < "
+         "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
+            6 * kSizeDummyEntry)
+      << "Failed to bookkeep correctly when new_mem_used < "
+         "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode";
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
+      << "Failed to bookkeep the used memory correctly";
+  EXPECT_GE(cache->GetPinnedUsage(), 6 * kSizeDummyEntry)
+      << "Failed to decrease underlying dummy entries in cache when "
+         "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed "
+         "decrease mode";
+  EXPECT_LT(cache->GetPinnedUsage(),
+            6 * kSizeDummyEntry + kMetaDataChargeOverhead)
+      << "Failed to decrease underlying dummy entries in cache when "
+         "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed "
+         "decrease mode";
+}
+
+TEST(CacheReservationManagerDestructorTest,
+     ReleaseRemainingDummyEntriesOnDestruction) {
+  constexpr std::size_t kSizeDummyEntry =
+      CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
+  constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+  LRUCacheOptions lo;
+  lo.capacity = kCacheCapacity;
+  lo.num_shard_bits = 0;
+  std::shared_ptr<Cache> cache = NewLRUCache(lo);
+  {
+    std::shared_ptr<CacheReservationManager> test_cache_rev_mng =
+        std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+            cache);
+    std::size_t new_mem_used = 1 * kSizeDummyEntry;
+    Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
+    ASSERT_EQ(s, Status::OK());
+    ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
+    ASSERT_LT(cache->GetPinnedUsage(),
+              1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+  }
+  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry)
+      << "Failed to release remaining underlying dummy entries in cache in "
+         "CacheReservationManager's destructor";
+}
+
+TEST(CacheReservationHandleTest, HandleTest) {
+  constexpr std::size_t kOneGigabyte = 1024 * 1024 * 1024;
+  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+  LRUCacheOptions lo;
+  lo.capacity = kOneGigabyte;
+  lo.num_shard_bits = 0;
+  std::shared_ptr<Cache> cache = NewLRUCache(lo);
+
+  std::shared_ptr<CacheReservationManager> test_cache_rev_mng(
+      std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+          cache));
+
+  std::size_t mem_used = 0;
+  const std::size_t incremental_mem_used_handle_1 = 1 * kSizeDummyEntry;
+  const std::size_t incremental_mem_used_handle_2 = 2 * kSizeDummyEntry;
+  std::unique_ptr<CacheReservationManager::CacheReservationHandle> handle_1,
+      handle_2;
+
+  // To test consecutive CacheReservationManager::MakeCacheReservation works
+  // correctly in terms of returning the handle as well as updating cache
+  // reservation and the latest total memory used
+  Status s = test_cache_rev_mng->MakeCacheReservation(
+      incremental_mem_used_handle_1, &handle_1);
+  mem_used = mem_used + incremental_mem_used_handle_1;
+  ASSERT_EQ(s, Status::OK());
+  EXPECT_TRUE(handle_1 != nullptr);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used);
+  EXPECT_GE(cache->GetPinnedUsage(), mem_used);
+  EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
+
+  s = test_cache_rev_mng->MakeCacheReservation(incremental_mem_used_handle_2,
+                                               &handle_2);
+  mem_used = mem_used + incremental_mem_used_handle_2;
+  ASSERT_EQ(s, Status::OK());
+  EXPECT_TRUE(handle_2 != nullptr);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used);
+  EXPECT_GE(cache->GetPinnedUsage(), mem_used);
+  EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
+
+  // To test
+  // CacheReservationManager::CacheReservationHandle::~CacheReservationHandle()
+  // works correctly in releasing the cache reserved for the handle
+  handle_1.reset();
+  EXPECT_TRUE(handle_1 == nullptr);
+  mem_used = mem_used - incremental_mem_used_handle_1;
+  EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used);
+  EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used);
+  EXPECT_GE(cache->GetPinnedUsage(), mem_used);
+  EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
+
+  // To test the actual CacheReservationManager object won't be deallocated
+  // as long as there remain handles pointing to it.
+  // We strongly recommend deallocating CacheReservationManager object only
+  // after all its handles are deallocated to keep things easy to reasonate
+  test_cache_rev_mng.reset();
+  EXPECT_GE(cache->GetPinnedUsage(), mem_used);
+  EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
+
+  handle_2.reset();
+  // The CacheReservationManager object is now deallocated since all the handles
+  // and its original pointer is gone
+  mem_used = mem_used - incremental_mem_used_handle_2;
+  EXPECT_EQ(mem_used, 0);
+  EXPECT_EQ(cache->GetPinnedUsage(), mem_used);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/cache/cache_test.cc b/src/rocksdb/cache/cache_test.cc
new file mode 100644
index 000000000..212d65d96
--- /dev/null
+++ b/src/rocksdb/cache/cache_test.cc
@@ -0,0 +1,1037 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cache.h"
+
+#include <forward_list>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "cache/lru_cache.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+// HyperClockCache only supports 16-byte keys, so some of the tests
+// originally written for LRUCache do not work on the other caches.
+// Those tests were adapted to use 16-byte keys. We kept the original ones.
+// TODO: Remove the original tests if they ever become unused.
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Conversions between numeric keys/values and the types expected by Cache.
+std::string EncodeKey16Bytes(int k) {
+  std::string result;
+  PutFixed32(&result, k);
+  result.append(std::string(12, 'a'));  // Because we need a 16B output, we
+                                        // add a 12-byte padding.
+  return result;
+}
+
+int DecodeKey16Bytes(const Slice& k) {
+  assert(k.size() == 16);
+  return DecodeFixed32(k.data());  // Decodes only the first 4 bytes of k.
+}
+
+std::string EncodeKey32Bits(int k) {
+  std::string result;
+  PutFixed32(&result, k);
+  return result;
+}
+
+int DecodeKey32Bits(const Slice& k) {
+  assert(k.size() == 4);
+  return DecodeFixed32(k.data());
+}
+
+void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
+
+int DecodeValue(void* v) {
+  return static_cast<int>(reinterpret_cast<uintptr_t>(v));
+}
+
+void DumbDeleter(const Slice& /*key*/, void* /*value*/) {}
+
+void EraseDeleter1(const Slice& /*key*/, void* value) {
+  Cache* cache = reinterpret_cast<Cache*>(value);
+  cache->Erase("foo");
+}
+
+void EraseDeleter2(const Slice& /*key*/, void* value) {
+  Cache* cache = reinterpret_cast<Cache*>(value);
+  cache->Erase(EncodeKey16Bytes(1234));
+}
+
+const std::string kLRU = "lru";
+const std::string kHyperClock = "hyper_clock";
+
+}  // anonymous namespace
+
+class CacheTest : public testing::TestWithParam<std::string> {
+ public:
+  static CacheTest* current_;
+  static std::string type_;
+
+  static void Deleter(const Slice& key, void* v) {
+    if (type_ == kHyperClock) {
+      current_->deleted_keys_.push_back(DecodeKey16Bytes(key));
+    } else {
+      current_->deleted_keys_.push_back(DecodeKey32Bits(key));
+    }
+    current_->deleted_values_.push_back(DecodeValue(v));
+  }
+
+  static const int kCacheSize = 1000;
+  static const int kNumShardBits = 4;
+
+  static const int kCacheSize2 = 100;
+  static const int kNumShardBits2 = 2;
+
+  std::vector<int> deleted_keys_;
+  std::vector<int> deleted_values_;
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<Cache> cache2_;
+
+  size_t estimated_value_size_ = 1;
+
+  CacheTest()
+      : cache_(NewCache(kCacheSize, kNumShardBits, false)),
+        cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) {
+    current_ = this;
+    type_ = GetParam();
+  }
+
+  ~CacheTest() override {}
+
+  std::shared_ptr<Cache> NewCache(size_t capacity) {
+    auto type = GetParam();
+    if (type == kLRU) {
+      return NewLRUCache(capacity);
+    }
+    if (type == kHyperClock) {
+      return HyperClockCacheOptions(
+                 capacity, estimated_value_size_ /*estimated_value_size*/)
+          .MakeSharedCache();
+    }
+    return nullptr;
+  }
+
+  std::shared_ptr<Cache> NewCache(
+      size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+      CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) {
+    auto type = GetParam();
+    if (type == kLRU) {
+      LRUCacheOptions co;
+      co.capacity = capacity;
+      co.num_shard_bits = num_shard_bits;
+      co.strict_capacity_limit = strict_capacity_limit;
+      co.high_pri_pool_ratio = 0;
+      co.metadata_charge_policy = charge_policy;
+      return NewLRUCache(co);
+    }
+    if (type == kHyperClock) {
+      return HyperClockCacheOptions(capacity, 1 /*estimated_value_size*/,
+                                    num_shard_bits, strict_capacity_limit,
+                                    nullptr /*allocator*/, charge_policy)
+          .MakeSharedCache();
+    }
+    return nullptr;
+  }
+
+  // These functions encode/decode keys in tests cases that use
+  // int keys.
+  // Currently, HyperClockCache requires keys to be 16B long, whereas
+  // LRUCache doesn't, so the encoding depends on the cache type.
+  std::string EncodeKey(int k) {
+    auto type = GetParam();
+    if (type == kHyperClock) {
+      return EncodeKey16Bytes(k);
+    } else {
+      return EncodeKey32Bits(k);
+    }
+  }
+
+  int DecodeKey(const Slice& k) {
+    auto type = GetParam();
+    if (type == kHyperClock) {
+      return DecodeKey16Bytes(k);
+    } else {
+      return DecodeKey32Bits(k);
+    }
+  }
+
+  int Lookup(std::shared_ptr<Cache> cache, int key) {
+    Cache::Handle* handle = cache->Lookup(EncodeKey(key));
+    const int r = (handle == nullptr) ? -1 : DecodeValue(cache->Value(handle));
+    if (handle != nullptr) {
+      cache->Release(handle);
+    }
+    return r;
+  }
+
+  void Insert(std::shared_ptr<Cache> cache, int key, int value,
+              int charge = 1) {
+    EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), charge,
+                            &CacheTest::Deleter));
+  }
+
+  void Erase(std::shared_ptr<Cache> cache, int key) {
+    cache->Erase(EncodeKey(key));
+  }
+
+  int Lookup(int key) { return Lookup(cache_, key); }
+
+  void Insert(int key, int value, int charge = 1) {
+    Insert(cache_, key, value, charge);
+  }
+
+  void Erase(int key) { Erase(cache_, key); }
+
+  int Lookup2(int key) { return Lookup(cache2_, key); }
+
+  void Insert2(int key, int value, int charge = 1) {
+    Insert(cache2_, key, value, charge);
+  }
+
+  void Erase2(int key) { Erase(cache2_, key); }
+};
+
+CacheTest* CacheTest::current_;
+std::string CacheTest::type_;
+
+class LRUCacheTest : public CacheTest {};
+
+TEST_P(CacheTest, UsageTest) {
+  auto type = GetParam();
+
+  // cache is std::shared_ptr and will be automatically cleaned up.
+  const size_t kCapacity = 100000;
+  auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
+  auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata);
+  ASSERT_EQ(0, cache->GetUsage());
+  size_t baseline_meta_usage = precise_cache->GetUsage();
+  if (type != kHyperClock) {
+    ASSERT_EQ(0, baseline_meta_usage);
+  }
+
+  size_t usage = 0;
+  char value[10] = "abcdef";
+  // make sure everything will be cached
+  for (int i = 1; i < 100; ++i) {
+    std::string key;
+    if (type == kLRU) {
+      key = std::string(i, 'a');
+    } else {
+      key = EncodeKey(i);
+    }
+    auto kv_size = key.size() + 5;
+    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
+                            DumbDeleter));
+    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
+                                    kv_size, DumbDeleter));
+    usage += kv_size;
+    ASSERT_EQ(usage, cache->GetUsage());
+    if (type == kHyperClock) {
+      ASSERT_EQ(baseline_meta_usage + usage, precise_cache->GetUsage());
+    } else {
+      ASSERT_LT(usage, precise_cache->GetUsage());
+    }
+  }
+
+  cache->EraseUnRefEntries();
+  precise_cache->EraseUnRefEntries();
+  ASSERT_EQ(0, cache->GetUsage());
+  ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage());
+
+  // make sure the cache will be overloaded
+  for (size_t i = 1; i < kCapacity; ++i) {
+    std::string key;
+    if (type == kLRU) {
+      key = std::to_string(i);
+    } else {
+      key = EncodeKey(static_cast<int>(1000 + i));
+    }
+    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
+                            DumbDeleter));
+    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
+                                    key.size() + 5, DumbDeleter));
+  }
+
+  // the usage should be close to the capacity
+  ASSERT_GT(kCapacity, cache->GetUsage());
+  ASSERT_GT(kCapacity, precise_cache->GetUsage());
+  ASSERT_LT(kCapacity * 0.95, cache->GetUsage());
+  if (type != kHyperClock) {
+    ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage());
+  } else {
+    // estimated value size of 1 is weird for clock cache, because
+    // almost all of the capacity will be used for metadata, and due to only
+    // using power of 2 table sizes, we might hit strict occupancy limit
+    // before hitting capacity limit.
+    ASSERT_LT(kCapacity * 0.80, precise_cache->GetUsage());
+  }
+}
+
+// TODO: This test takes longer than expected on ClockCache. This is
+// because the values size estimate at construction is too sloppy.
+// Fix this.
+// Why is it so slow? The cache is constructed with an estimate of 1, but
+// then the charge is claimed to be 21. This will cause the hash table
+// to be extremely sparse, which in turn means clock needs to scan too
+// many slots to find victims.
+TEST_P(CacheTest, PinnedUsageTest) {
+  auto type = GetParam();
+
+  // cache is std::shared_ptr and will be automatically cleaned up.
+  const size_t kCapacity = 200000;
+  auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata);
+  auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata);
+  size_t baseline_meta_usage = precise_cache->GetUsage();
+  if (type != kHyperClock) {
+    ASSERT_EQ(0, baseline_meta_usage);
+  }
+
+  size_t pinned_usage = 0;
+  char value[10] = "abcdef";
+
+  std::forward_list<Cache::Handle*> unreleased_handles;
+  std::forward_list<Cache::Handle*> unreleased_handles_in_precise_cache;
+
+  // Add entries. Unpin some of them after insertion. Then, pin some of them
+  // again. Check GetPinnedUsage().
+  for (int i = 1; i < 100; ++i) {
+    std::string key;
+    if (type == kLRU) {
+      key = std::string(i, 'a');
+    } else {
+      key = EncodeKey(i);
+    }
+    auto kv_size = key.size() + 5;
+    Cache::Handle* handle;
+    Cache::Handle* handle_in_precise_cache;
+    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
+                            DumbDeleter, &handle));
+    assert(handle);
+    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
+                                    kv_size, DumbDeleter,
+                                    &handle_in_precise_cache));
+    assert(handle_in_precise_cache);
+    pinned_usage += kv_size;
+    ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+    ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
+    if (i % 2 == 0) {
+      cache->Release(handle);
+      precise_cache->Release(handle_in_precise_cache);
+      pinned_usage -= kv_size;
+      ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+      ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
+    } else {
+      unreleased_handles.push_front(handle);
+      unreleased_handles_in_precise_cache.push_front(handle_in_precise_cache);
+    }
+    if (i % 3 == 0) {
+      unreleased_handles.push_front(cache->Lookup(key));
+      auto x = precise_cache->Lookup(key);
+      assert(x);
+      unreleased_handles_in_precise_cache.push_front(x);
+      // If i % 2 == 0, then the entry was unpinned before Lookup, so pinned
+      // usage increased
+      if (i % 2 == 0) {
+        pinned_usage += kv_size;
+      }
+      ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+      ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage());
+    }
+  }
+  auto precise_cache_pinned_usage = precise_cache->GetPinnedUsage();
+  ASSERT_LT(pinned_usage, precise_cache_pinned_usage);
+
+  // check that overloading the cache does not change the pinned usage
+  for (size_t i = 1; i < 2 * kCapacity; ++i) {
+    std::string key;
+    if (type == kLRU) {
+      key = std::to_string(i);
+    } else {
+      key = EncodeKey(static_cast<int>(1000 + i));
+    }
+    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
+                            DumbDeleter));
+    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
+                                    key.size() + 5, DumbDeleter));
+  }
+  ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+  ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
+
+  cache->EraseUnRefEntries();
+  precise_cache->EraseUnRefEntries();
+  ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+  ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
+
+  // release handles for pinned entries to prevent memory leaks
+  for (auto handle : unreleased_handles) {
+    cache->Release(handle);
+  }
+  for (auto handle : unreleased_handles_in_precise_cache) {
+    precise_cache->Release(handle);
+  }
+  ASSERT_EQ(0, cache->GetPinnedUsage());
+  ASSERT_EQ(0, precise_cache->GetPinnedUsage());
+  cache->EraseUnRefEntries();
+  precise_cache->EraseUnRefEntries();
+  ASSERT_EQ(0, cache->GetUsage());
+  ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage());
+}
+
+TEST_P(CacheTest, HitAndMiss) {
+  ASSERT_EQ(-1, Lookup(100));
+
+  Insert(100, 101);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  Insert(200, 201);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  Insert(100, 102);
+  if (GetParam() == kHyperClock) {
+    // ClockCache usually doesn't overwrite on Insert
+    ASSERT_EQ(101, Lookup(100));
+  } else {
+    ASSERT_EQ(102, Lookup(100));
+  }
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  if (GetParam() == kHyperClock) {
+    ASSERT_EQ(102, deleted_values_[0]);
+  } else {
+    ASSERT_EQ(101, deleted_values_[0]);
+  }
+}
+
+TEST_P(CacheTest, InsertSameKey) {
+  if (GetParam() == kHyperClock) {
+    ROCKSDB_GTEST_BYPASS(
+        "ClockCache doesn't guarantee Insert overwrite same key.");
+    return;
+  }
+  Insert(1, 1);
+  Insert(1, 2);
+  ASSERT_EQ(2, Lookup(1));
+}
+
+TEST_P(CacheTest, Erase) {
+  Erase(200);
+  ASSERT_EQ(0U, deleted_keys_.size());
+
+  Insert(100, 101);
+  Insert(200, 201);
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1U, deleted_keys_.size());
+}
+
+TEST_P(CacheTest, EntriesArePinned) {
+  if (GetParam() == kHyperClock) {
+    ROCKSDB_GTEST_BYPASS(
+        "ClockCache doesn't guarantee Insert overwrite same key.");
+    return;
+  }
+  Insert(100, 101);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+  ASSERT_EQ(1U, cache_->GetUsage());
+
+  Insert(100, 102);
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
+  ASSERT_EQ(0U, deleted_keys_.size());
+  ASSERT_EQ(2U, cache_->GetUsage());
+
+  cache_->Release(h1);
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+  ASSERT_EQ(1U, cache_->GetUsage());
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(1U, cache_->GetUsage());
+
+  cache_->Release(h2);
+  ASSERT_EQ(2U, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[1]);
+  ASSERT_EQ(102, deleted_values_[1]);
+  ASSERT_EQ(0U, cache_->GetUsage());
+}
+
+TEST_P(CacheTest, EvictionPolicy) {
+  Insert(100, 101);
+  Insert(200, 201);
+  // Frequently used entry must be kept around
+  for (int i = 0; i < 2 * kCacheSize; i++) {
+    Insert(1000 + i, 2000 + i);
+    ASSERT_EQ(101, Lookup(100));
+  }
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+}
+
+TEST_P(CacheTest, ExternalRefPinsEntries) {
+  Insert(100, 101);
+  Cache::Handle* h = cache_->Lookup(EncodeKey(100));
+  ASSERT_TRUE(cache_->Ref(h));
+  ASSERT_EQ(101, DecodeValue(cache_->Value(h)));
+  ASSERT_EQ(1U, cache_->GetUsage());
+
+  for (int i = 0; i < 3; ++i) {
+    if (i > 0) {
+      // First release (i == 1) corresponds to Ref(), second release (i == 2)
+      // corresponds to Lookup(). Then, since all external refs are released,
+      // the below insertions should push out the cache entry.
+      cache_->Release(h);
+    }
+    // double cache size because the usage bit in block cache prevents 100 from
+    // being evicted in the first kCacheSize iterations
+    for (int j = 0; j < 2 * kCacheSize + 100; j++) {
+      Insert(1000 + j, 2000 + j);
+    }
+    // Clock cache is even more stateful and needs more churn to evict
+    if (GetParam() == kHyperClock) {
+      for (int j = 0; j < kCacheSize; j++) {
+        Insert(11000 + j, 11000 + j);
+      }
+    }
+    if (i < 2) {
+      ASSERT_EQ(101, Lookup(100));
+    }
+  }
+  ASSERT_EQ(-1, Lookup(100));
+}
+
+TEST_P(CacheTest, EvictionPolicyRef) {
+  Insert(100, 101);
+  Insert(101, 102);
+  Insert(102, 103);
+  Insert(103, 104);
+  Insert(200, 101);
+  Insert(201, 102);
+  Insert(202, 103);
+  Insert(203, 104);
+  Cache::Handle* h201 = cache_->Lookup(EncodeKey(200));
+  Cache::Handle* h202 = cache_->Lookup(EncodeKey(201));
+  Cache::Handle* h203 = cache_->Lookup(EncodeKey(202));
+  Cache::Handle* h204 = cache_->Lookup(EncodeKey(203));
+  Insert(300, 101);
+  Insert(301, 102);
+  Insert(302, 103);
+  Insert(303, 104);
+
+  // Insert entries much more than cache capacity.
+  for (int i = 0; i < 100 * kCacheSize; i++) {
+    Insert(1000 + i, 2000 + i);
+  }
+
+  // Check whether the entries inserted in the beginning
+  // are evicted. Ones without extra ref are evicted and
+  // those with are not.
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(-1, Lookup(101));
+  ASSERT_EQ(-1, Lookup(102));
+  ASSERT_EQ(-1, Lookup(103));
+
+  ASSERT_EQ(-1, Lookup(300));
+  ASSERT_EQ(-1, Lookup(301));
+  ASSERT_EQ(-1, Lookup(302));
+  ASSERT_EQ(-1, Lookup(303));
+
+  ASSERT_EQ(101, Lookup(200));
+  ASSERT_EQ(102, Lookup(201));
+  ASSERT_EQ(103, Lookup(202));
+  ASSERT_EQ(104, Lookup(203));
+
+  // Cleaning up all the handles
+  cache_->Release(h201);
+  cache_->Release(h202);
+  cache_->Release(h203);
+  cache_->Release(h204);
+}
+
+TEST_P(CacheTest, EvictEmptyCache) {
+  auto type = GetParam();
+
+  // Insert item large than capacity to trigger eviction on empty cache.
+  auto cache = NewCache(1, 0, false);
+  if (type == kLRU) {
+    ASSERT_OK(cache->Insert("foo", nullptr, 10, DumbDeleter));
+  } else {
+    ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, 10, DumbDeleter));
+  }
+}
+
+TEST_P(CacheTest, EraseFromDeleter) {
+  auto type = GetParam();
+
+  // Have deleter which will erase item from cache, which will re-enter
+  // the cache at that point.
+  std::shared_ptr<Cache> cache = NewCache(10, 0, false);
+  std::string foo, bar;
+  Cache::DeleterFn erase_deleter;
+  if (type == kLRU) {
+    foo = "foo";
+    bar = "bar";
+    erase_deleter = EraseDeleter1;
+  } else {
+    foo = EncodeKey(1234);
+    bar = EncodeKey(5678);
+    erase_deleter = EraseDeleter2;
+  }
+
+  ASSERT_OK(cache->Insert(foo, nullptr, 1, DumbDeleter));
+  ASSERT_OK(cache->Insert(bar, cache.get(), 1, erase_deleter));
+
+  cache->Erase(bar);
+  ASSERT_EQ(nullptr, cache->Lookup(foo));
+  ASSERT_EQ(nullptr, cache->Lookup(bar));
+}
+
+TEST_P(CacheTest, ErasedHandleState) {
+  // insert a key and get two handles
+  Insert(100, 1000);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(h1, h2);
+  ASSERT_EQ(DecodeValue(cache_->Value(h1)), 1000);
+  ASSERT_EQ(DecodeValue(cache_->Value(h2)), 1000);
+
+  // delete the key from the cache
+  Erase(100);
+  // can no longer find in the cache
+  ASSERT_EQ(-1, Lookup(100));
+
+  // release one handle
+  cache_->Release(h1);
+  // still can't find in cache
+  ASSERT_EQ(-1, Lookup(100));
+
+  cache_->Release(h2);
+}
+
+TEST_P(CacheTest, HeavyEntries) {
+  // Add a bunch of light and heavy entries and then count the combined
+  // size of items still in the cache, which must be approximately the
+  // same as the total capacity.
+  const int kLight = 1;
+  const int kHeavy = 10;
+  int added = 0;
+  int index = 0;
+  while (added < 2 * kCacheSize) {
+    const int weight = (index & 1) ? kLight : kHeavy;
+    Insert(index, 1000 + index, weight);
+    added += weight;
+    index++;
+  }
+
+  int cached_weight = 0;
+  for (int i = 0; i < index; i++) {
+    const int weight = (i & 1 ? kLight : kHeavy);
+    int r = Lookup(i);
+    if (r >= 0) {
+      cached_weight += weight;
+      ASSERT_EQ(1000 + i, r);
+    }
+  }
+  ASSERT_LE(cached_weight, kCacheSize + kCacheSize / 10);
+}
+
+TEST_P(CacheTest, NewId) {
+  uint64_t a = cache_->NewId();
+  uint64_t b = cache_->NewId();
+  ASSERT_NE(a, b);
+}
+
+class Value {
+ public:
+  explicit Value(int v) : v_(v) {}
+
+  int v_;
+};
+
+namespace {
+void deleter(const Slice& /*key*/, void* value) {
+  delete static_cast<Value*>(value);
+}
+}  // namespace
+
+TEST_P(CacheTest, ReleaseAndErase) {
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  Cache::Handle* handle;
+  Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
+                           &CacheTest::Deleter, &handle);
+  ASSERT_TRUE(s.ok());
+  ASSERT_EQ(5U, cache->GetCapacity());
+  ASSERT_EQ(1U, cache->GetUsage());
+  ASSERT_EQ(0U, deleted_keys_.size());
+  auto erased = cache->Release(handle, true);
+  ASSERT_TRUE(erased);
+  // This tests that deleter has been called
+  ASSERT_EQ(1U, deleted_keys_.size());
+}
+
+TEST_P(CacheTest, ReleaseWithoutErase) {
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  Cache::Handle* handle;
+  Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
+                           &CacheTest::Deleter, &handle);
+  ASSERT_TRUE(s.ok());
+  ASSERT_EQ(5U, cache->GetCapacity());
+  ASSERT_EQ(1U, cache->GetUsage());
+  ASSERT_EQ(0U, deleted_keys_.size());
+  auto erased = cache->Release(handle);
+  ASSERT_FALSE(erased);
+  // This tests that deleter is not called. When cache has free capacity it is
+  // not expected to immediately erase the released items.
+  ASSERT_EQ(0U, deleted_keys_.size());
+}
+
+TEST_P(CacheTest, SetCapacity) {
+  auto type = GetParam();
+  if (type == kHyperClock) {
+    ROCKSDB_GTEST_BYPASS(
+        "FastLRUCache and HyperClockCache don't support arbitrary capacity "
+        "adjustments.");
+    return;
+  }
+  // test1: increase capacity
+  // lets create a cache with capacity 5,
+  // then, insert 5 elements, then increase capacity
+  // to 10, returned capacity should be 10, usage=5
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  std::vector<Cache::Handle*> handles(10);
+  // Insert 5 entries, but not releasing.
+  for (int i = 0; i < 5; i++) {
+    std::string key = EncodeKey(i + 1);
+    Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_TRUE(s.ok());
+  }
+  ASSERT_EQ(5U, cache->GetCapacity());
+  ASSERT_EQ(5U, cache->GetUsage());
+  cache->SetCapacity(10);
+  ASSERT_EQ(10U, cache->GetCapacity());
+  ASSERT_EQ(5U, cache->GetUsage());
+
+  // test2: decrease capacity
+  // insert 5 more elements to cache, then release 5,
+  // then decrease capacity to 7, final capacity should be 7
+  // and usage should be 7
+  for (int i = 5; i < 10; i++) {
+    std::string key = EncodeKey(i + 1);
+    Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_TRUE(s.ok());
+  }
+  ASSERT_EQ(10U, cache->GetCapacity());
+  ASSERT_EQ(10U, cache->GetUsage());
+  for (int i = 0; i < 5; i++) {
+    cache->Release(handles[i]);
+  }
+  ASSERT_EQ(10U, cache->GetCapacity());
+  ASSERT_EQ(10U, cache->GetUsage());
+  cache->SetCapacity(7);
+  ASSERT_EQ(7, cache->GetCapacity());
+  ASSERT_EQ(7, cache->GetUsage());
+
+  // release remaining 5 to keep valgrind happy
+  for (int i = 5; i < 10; i++) {
+    cache->Release(handles[i]);
+  }
+
+  // Make sure this doesn't crash or upset ASAN/valgrind
+  cache->DisownData();
+}
+
+TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
+  // test1: set the flag to false. Insert more keys than capacity. See if they
+  // all go through.
+  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
+  std::vector<Cache::Handle*> handles(10);
+  Status s;
+  for (int i = 0; i < 10; i++) {
+    std::string key = EncodeKey(i + 1);
+    s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_OK(s);
+    ASSERT_NE(nullptr, handles[i]);
+  }
+  ASSERT_EQ(10, cache->GetUsage());
+
+  // test2: set the flag to true. Insert and check if it fails.
+  std::string extra_key = EncodeKey(100);
+  Value* extra_value = new Value(0);
+  cache->SetStrictCapacityLimit(true);
+  Cache::Handle* handle;
+  s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle);
+  ASSERT_TRUE(s.IsMemoryLimit());
+  ASSERT_EQ(nullptr, handle);
+  ASSERT_EQ(10, cache->GetUsage());
+
+  for (int i = 0; i < 10; i++) {
+    cache->Release(handles[i]);
+  }
+
+  // test3: init with flag being true.
+  std::shared_ptr<Cache> cache2 = NewCache(5, 0, true);
+  for (int i = 0; i < 5; i++) {
+    std::string key = EncodeKey(i + 1);
+    s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_OK(s);
+    ASSERT_NE(nullptr, handles[i]);
+  }
+  s = cache2->Insert(extra_key, extra_value, 1, &deleter, &handle);
+  ASSERT_TRUE(s.IsMemoryLimit());
+  ASSERT_EQ(nullptr, handle);
+  // test insert without handle
+  s = cache2->Insert(extra_key, extra_value, 1, &deleter);
+  // AS if the key have been inserted into cache but get evicted immediately.
+  ASSERT_OK(s);
+  ASSERT_EQ(5, cache2->GetUsage());
+  ASSERT_EQ(nullptr, cache2->Lookup(extra_key));
+
+  for (int i = 0; i < 5; i++) {
+    cache2->Release(handles[i]);
+  }
+}
+
+TEST_P(CacheTest, OverCapacity) {
+  size_t n = 10;
+
+  // a LRUCache with n entries and one shard only
+  std::shared_ptr<Cache> cache = NewCache(n, 0, false);
+
+  std::vector<Cache::Handle*> handles(n + 1);
+
+  // Insert n+1 entries, but not releasing.
+  for (int i = 0; i < static_cast<int>(n + 1); i++) {
+    std::string key = EncodeKey(i + 1);
+    Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    ASSERT_TRUE(s.ok());
+  }
+
+  // Guess what's in the cache now?
+  for (int i = 0; i < static_cast<int>(n + 1); i++) {
+    std::string key = EncodeKey(i + 1);
+    auto h = cache->Lookup(key);
+    ASSERT_TRUE(h != nullptr);
+    if (h) cache->Release(h);
+  }
+
+  // the cache is over capacity since nothing could be evicted
+  ASSERT_EQ(n + 1U, cache->GetUsage());
+  for (int i = 0; i < static_cast<int>(n + 1); i++) {
+    cache->Release(handles[i]);
+  }
+
+  if (GetParam() == kHyperClock) {
+    // Make sure eviction is triggered.
+    ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0]));
+
+    // cache is under capacity now since elements were released
+    ASSERT_GE(n, cache->GetUsage());
+
+    // clean up
+    cache->Release(handles[0]);
+  } else {
+    // LRUCache checks for over-capacity in Release.
+
+    // cache is exactly at capacity now with minimal eviction
+    ASSERT_EQ(n, cache->GetUsage());
+
+    // element 0 is evicted and the rest is there
+    // This is consistent with the LRU policy since the element 0
+    // was released first
+    for (int i = 0; i < static_cast<int>(n + 1); i++) {
+      std::string key = EncodeKey(i + 1);
+      auto h = cache->Lookup(key);
+      if (h) {
+        ASSERT_NE(static_cast<size_t>(i), 0U);
+        cache->Release(h);
+      } else {
+        ASSERT_EQ(static_cast<size_t>(i), 0U);
+      }
+    }
+  }
+}
+
+namespace {
+std::vector<std::pair<int, int>> legacy_callback_state;
+void legacy_callback(void* value, size_t charge) {
+  legacy_callback_state.push_back(
+      {DecodeValue(value), static_cast<int>(charge)});
+}
+};  // namespace
+
+TEST_P(CacheTest, ApplyToAllCacheEntriesTest) {
+  std::vector<std::pair<int, int>> inserted;
+  legacy_callback_state.clear();
+
+  for (int i = 0; i < 10; ++i) {
+    Insert(i, i * 2, i + 1);
+    inserted.push_back({i * 2, i + 1});
+  }
+  cache_->ApplyToAllCacheEntries(legacy_callback, true);
+
+  std::sort(inserted.begin(), inserted.end());
+  std::sort(legacy_callback_state.begin(), legacy_callback_state.end());
+  ASSERT_EQ(inserted.size(), legacy_callback_state.size());
+  for (int i = 0; i < static_cast<int>(inserted.size()); ++i) {
+    EXPECT_EQ(inserted[i], legacy_callback_state[i]);
+  }
+}
+
+TEST_P(CacheTest, ApplyToAllEntriesTest) {
+  std::vector<std::string> callback_state;
+  const auto callback = [&](const Slice& key, void* value, size_t charge,
+                            Cache::DeleterFn deleter) {
+    callback_state.push_back(std::to_string(DecodeKey(key)) + "," +
+                             std::to_string(DecodeValue(value)) + "," +
+                             std::to_string(charge));
+    assert(deleter == &CacheTest::Deleter);
+  };
+
+  std::vector<std::string> inserted;
+  callback_state.clear();
+
+  for (int i = 0; i < 10; ++i) {
+    Insert(i, i * 2, i + 1);
+    inserted.push_back(std::to_string(i) + "," + std::to_string(i * 2) + "," +
+                       std::to_string(i + 1));
+  }
+  cache_->ApplyToAllEntries(callback, /*opts*/ {});
+
+  std::sort(inserted.begin(), inserted.end());
+  std::sort(callback_state.begin(), callback_state.end());
+  ASSERT_EQ(inserted.size(), callback_state.size());
+  for (int i = 0; i < static_cast<int>(inserted.size()); ++i) {
+    EXPECT_EQ(inserted[i], callback_state[i]);
+  }
+}
+
+TEST_P(CacheTest, ApplyToAllEntriesDuringResize) {
+  // This is a mini-stress test of ApplyToAllEntries, to ensure
+  // items in the cache that are neither added nor removed
+  // during ApplyToAllEntries are counted exactly once.
+
+  // Insert some entries that we expect to be seen exactly once
+  // during iteration.
+  constexpr int kSpecialCharge = 2;
+  constexpr int kNotSpecialCharge = 1;
+  constexpr int kSpecialCount = 100;
+  size_t expected_usage = 0;
+  for (int i = 0; i < kSpecialCount; ++i) {
+    Insert(i, i * 2, kSpecialCharge);
+    expected_usage += kSpecialCharge;
+  }
+
+  // For callback
+  int special_count = 0;
+  const auto callback = [&](const Slice&, void*, size_t charge,
+                            Cache::DeleterFn) {
+    if (charge == static_cast<size_t>(kSpecialCharge)) {
+      ++special_count;
+    }
+  };
+
+  // Start counting
+  std::thread apply_thread([&]() {
+    // Use small average_entries_per_lock to make the problem difficult
+    Cache::ApplyToAllEntriesOptions opts;
+    opts.average_entries_per_lock = 2;
+    cache_->ApplyToAllEntries(callback, opts);
+  });
+
+  // In parallel, add more entries, enough to cause resize but not enough
+  // to cause ejections. (Note: if any cache shard is over capacity, there
+  // will be ejections)
+  for (int i = kSpecialCount * 1; i < kSpecialCount * 5; ++i) {
+    Insert(i, i * 2, kNotSpecialCharge);
+    expected_usage += kNotSpecialCharge;
+  }
+
+  apply_thread.join();
+  // verify no evictions
+  ASSERT_EQ(cache_->GetUsage(), expected_usage);
+  // verify everything seen in ApplyToAllEntries
+  ASSERT_EQ(special_count, kSpecialCount);
+}
+
+TEST_P(CacheTest, DefaultShardBits) {
+  // Prevent excessive allocation (to save time & space)
+  estimated_value_size_ = 100000;
+  // Implementations use different minimum shard sizes
+  size_t min_shard_size =
+      (GetParam() == kHyperClock ? 32U * 1024U : 512U) * 1024U;
+
+  std::shared_ptr<Cache> cache = NewCache(32U * min_shard_size);
+  ShardedCacheBase* sc = dynamic_cast<ShardedCacheBase*>(cache.get());
+  ASSERT_EQ(5, sc->GetNumShardBits());
+
+  cache = NewCache(min_shard_size / 1000U * 999U);
+  sc = dynamic_cast<ShardedCacheBase*>(cache.get());
+  ASSERT_EQ(0, sc->GetNumShardBits());
+
+  cache = NewCache(3U * 1024U * 1024U * 1024U);
+  sc = dynamic_cast<ShardedCacheBase*>(cache.get());
+  // current maximum of 6
+  ASSERT_EQ(6, sc->GetNumShardBits());
+
+  if constexpr (sizeof(size_t) > 4) {
+    cache = NewCache(128U * min_shard_size);
+    sc = dynamic_cast<ShardedCacheBase*>(cache.get());
+    // current maximum of 6
+    ASSERT_EQ(6, sc->GetNumShardBits());
+  }
+}
+
+TEST_P(CacheTest, GetChargeAndDeleter) {
+  Insert(1, 2);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(1));
+  ASSERT_EQ(2, DecodeValue(cache_->Value(h1)));
+  ASSERT_EQ(1, cache_->GetCharge(h1));
+  ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1));
+  cache_->Release(h1);
+}
+
+INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
+                        testing::Values(kLRU, kHyperClock));
+INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/cache/charged_cache.cc b/src/rocksdb/cache/charged_cache.cc
new file mode 100644
index 000000000..a9ff969b8
--- /dev/null
+++ b/src/rocksdb/cache/charged_cache.cc
@@ -0,0 +1,117 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/charged_cache.h"
+
+#include "cache/cache_reservation_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ChargedCache::ChargedCache(std::shared_ptr<Cache> cache,
+                           std::shared_ptr<Cache> block_cache)
+    : cache_(cache),
+      cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
+          std::make_shared<
+              CacheReservationManagerImpl<CacheEntryRole::kBlobCache>>(
+              block_cache))) {}
+
+Status ChargedCache::Insert(const Slice& key, void* value, size_t charge,
+                            DeleterFn deleter, Handle** handle,
+                            Priority priority) {
+  Status s = cache_->Insert(key, value, charge, deleter, handle, priority);
+  if (s.ok()) {
+    // Insert may cause the cache entry eviction if the cache is full. So we
+    // directly call the reservation manager to update the total memory used
+    // in the cache.
+    assert(cache_res_mgr_);
+    cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+        .PermitUncheckedError();
+  }
+  return s;
+}
+
+Status ChargedCache::Insert(const Slice& key, void* value,
+                            const CacheItemHelper* helper, size_t charge,
+                            Handle** handle, Priority priority) {
+  Status s = cache_->Insert(key, value, helper, charge, handle, priority);
+  if (s.ok()) {
+    // Insert may cause the cache entry eviction if the cache is full. So we
+    // directly call the reservation manager to update the total memory used
+    // in the cache.
+    assert(cache_res_mgr_);
+    cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+        .PermitUncheckedError();
+  }
+  return s;
+}
+
+Cache::Handle* ChargedCache::Lookup(const Slice& key, Statistics* stats) {
+  return cache_->Lookup(key, stats);
+}
+
+Cache::Handle* ChargedCache::Lookup(const Slice& key,
+                                    const CacheItemHelper* helper,
+                                    const CreateCallback& create_cb,
+                                    Priority priority, bool wait,
+                                    Statistics* stats) {
+  auto handle = cache_->Lookup(key, helper, create_cb, priority, wait, stats);
+  // Lookup may promote the KV pair from the secondary cache to the primary
+  // cache. So we directly call the reservation manager to update the total
+  // memory used in the cache.
+  assert(cache_res_mgr_);
+  cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+      .PermitUncheckedError();
+  return handle;
+}
+
+bool ChargedCache::Release(Cache::Handle* handle, bool useful,
+                           bool erase_if_last_ref) {
+  size_t memory_used_delta = cache_->GetUsage(handle);
+  bool erased = cache_->Release(handle, useful, erase_if_last_ref);
+  if (erased) {
+    assert(cache_res_mgr_);
+    cache_res_mgr_
+        ->UpdateCacheReservation(memory_used_delta, /* increase */ false)
+        .PermitUncheckedError();
+  }
+  return erased;
+}
+
+bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) {
+  size_t memory_used_delta = cache_->GetUsage(handle);
+  bool erased = cache_->Release(handle, erase_if_last_ref);
+  if (erased) {
+    assert(cache_res_mgr_);
+    cache_res_mgr_
+        ->UpdateCacheReservation(memory_used_delta, /* increase */ false)
+        .PermitUncheckedError();
+  }
+  return erased;
+}
+
+void ChargedCache::Erase(const Slice& key) {
+  cache_->Erase(key);
+  assert(cache_res_mgr_);
+  cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+      .PermitUncheckedError();
+}
+
+void ChargedCache::EraseUnRefEntries() {
+  cache_->EraseUnRefEntries();
+  assert(cache_res_mgr_);
+  cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+      .PermitUncheckedError();
+}
+
+void ChargedCache::SetCapacity(size_t capacity) {
+  cache_->SetCapacity(capacity);
+  // SetCapacity can result in evictions when the cache capacity is decreased,
+  // so we would want to update the cache reservation here as well.
+  assert(cache_res_mgr_);
+  cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+      .PermitUncheckedError();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/charged_cache.h b/src/rocksdb/cache/charged_cache.h
new file mode 100644
index 000000000..1739e4088
--- /dev/null
+++ b/src/rocksdb/cache/charged_cache.h
@@ -0,0 +1,121 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ConcurrentCacheReservationManager;
+
+// A cache interface which wraps around another cache and takes care of
+// reserving space in block cache towards a single global memory limit, and
+// forwards all the calls to the underlying cache.
+class ChargedCache : public Cache {
+ public:
+  ChargedCache(std::shared_ptr<Cache> cache,
+               std::shared_ptr<Cache> block_cache);
+  ~ChargedCache() override = default;
+
+  Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
+                Handle** handle, Priority priority) override;
+  Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
+                size_t charge, Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override;
+
+  Cache::Handle* Lookup(const Slice& key, Statistics* stats) override;
+  Cache::Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
+                        const CreateCallback& create_cb, Priority priority,
+                        bool wait, Statistics* stats = nullptr) override;
+
+  bool Release(Cache::Handle* handle, bool useful,
+               bool erase_if_last_ref = false) override;
+  bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
+
+  void Erase(const Slice& key) override;
+  void EraseUnRefEntries() override;
+
+  static const char* kClassName() { return "ChargedCache"; }
+  const char* Name() const override { return kClassName(); }
+
+  uint64_t NewId() override { return cache_->NewId(); }
+
+  void SetCapacity(size_t capacity) override;
+
+  void SetStrictCapacityLimit(bool strict_capacity_limit) override {
+    cache_->SetStrictCapacityLimit(strict_capacity_limit);
+  }
+
+  bool HasStrictCapacityLimit() const override {
+    return cache_->HasStrictCapacityLimit();
+  }
+
+  void* Value(Cache::Handle* handle) override { return cache_->Value(handle); }
+
+  bool IsReady(Cache::Handle* handle) override {
+    return cache_->IsReady(handle);
+  }
+
+  void Wait(Cache::Handle* handle) override { cache_->Wait(handle); }
+
+  void WaitAll(std::vector<Handle*>& handles) override {
+    cache_->WaitAll(handles);
+  }
+
+  bool Ref(Cache::Handle* handle) override { return cache_->Ref(handle); }
+
+  size_t GetCapacity() const override { return cache_->GetCapacity(); }
+
+  size_t GetUsage() const override { return cache_->GetUsage(); }
+
+  size_t GetUsage(Cache::Handle* handle) const override {
+    return cache_->GetUsage(handle);
+  }
+
+  size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); }
+
+  size_t GetCharge(Cache::Handle* handle) const override {
+    return cache_->GetCharge(handle);
+  }
+
+  Cache::DeleterFn GetDeleter(Cache::Handle* handle) const override {
+    return cache_->GetDeleter(handle);
+  }
+
+  void ApplyToAllEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               Cache::DeleterFn deleter)>& callback,
+      const Cache::ApplyToAllEntriesOptions& opts) override {
+    cache_->ApplyToAllEntries(callback, opts);
+  }
+
+  void ApplyToAllCacheEntries(void (*callback)(void* value, size_t charge),
+                              bool thread_safe) override {
+    cache_->ApplyToAllCacheEntries(callback, thread_safe);
+  }
+
+  std::string GetPrintableOptions() const override {
+    return cache_->GetPrintableOptions();
+  }
+
+  void DisownData() override { return cache_->DisownData(); }
+
+  inline Cache* GetCache() const { return cache_.get(); }
+
+  inline ConcurrentCacheReservationManager* TEST_GetCacheReservationManager()
+      const {
+    return cache_res_mgr_.get();
+  }
+
+ private:
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/clock_cache.cc b/src/rocksdb/cache/clock_cache.cc
new file mode 100644
index 000000000..6c9f18c2f
--- /dev/null
+++ b/src/rocksdb/cache/clock_cache.cc
@@ -0,0 +1,1404 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/clock_cache.h"
+
+#include <cassert>
+#include <functional>
+#include <numeric>
+
+#include "cache/cache_key.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace clock_cache {
+
+namespace {
+inline uint64_t GetRefcount(uint64_t meta) {
+  return ((meta >> ClockHandle::kAcquireCounterShift) -
+          (meta >> ClockHandle::kReleaseCounterShift)) &
+         ClockHandle::kCounterMask;
+}
+
+inline uint64_t GetInitialCountdown(Cache::Priority priority) {
+  // Set initial clock data from priority
+  // TODO: configuration parameters for priority handling and clock cycle
+  // count?
+  switch (priority) {
+    case Cache::Priority::HIGH:
+      return ClockHandle::kHighCountdown;
+    default:
+      assert(false);
+      FALLTHROUGH_INTENDED;
+    case Cache::Priority::LOW:
+      return ClockHandle::kLowCountdown;
+    case Cache::Priority::BOTTOM:
+      return ClockHandle::kBottomCountdown;
+  }
+}
+
+inline void FreeDataMarkEmpty(ClockHandle& h) {
+  // NOTE: in theory there's more room for parallelism if we copy the handle
+  // data and delay actions like this until after marking the entry as empty,
+  // but performance tests only show a regression by copying the few words
+  // of data.
+  h.FreeData();
+
+#ifndef NDEBUG
+  // Mark slot as empty, with assertion
+  uint64_t meta = h.meta.exchange(0, std::memory_order_release);
+  assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction);
+#else
+  // Mark slot as empty
+  h.meta.store(0, std::memory_order_release);
+#endif
+}
+
+inline bool ClockUpdate(ClockHandle& h) {
+  uint64_t meta = h.meta.load(std::memory_order_relaxed);
+
+  uint64_t acquire_count =
+      (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask;
+  uint64_t release_count =
+      (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
+  // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count,
+  // release_count, (unsigned)(meta >> ClockHandle::kStateShift));
+  if (acquire_count != release_count) {
+    // Only clock update entries with no outstanding refs
+    return false;
+  }
+  if (!((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit)) {
+    // Only clock update Shareable entries
+    return false;
+  }
+  if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
+      acquire_count > 0) {
+    // Decrement clock
+    uint64_t new_count =
+        std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1);
+    // Compare-exchange in the decremented clock info, but
+    // not aggressively
+    uint64_t new_meta =
+        (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) |
+        (new_count << ClockHandle::kReleaseCounterShift) |
+        (new_count << ClockHandle::kAcquireCounterShift);
+    h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed);
+    return false;
+  }
+  // Otherwise, remove entry (either unreferenced invisible or
+  // unreferenced and expired visible).
+  if (h.meta.compare_exchange_strong(
+          meta,
+          uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
+          std::memory_order_acquire)) {
+    // Took ownership.
+    return true;
+  } else {
+    // Compare-exchange failing probably
+    // indicates the entry was used, so skip it in that case.
+    return false;
+  }
+}
+
+}  // namespace
+
+void ClockHandleBasicData::FreeData() const {
+  if (deleter) {
+    UniqueId64x2 unhashed;
+    (*deleter)(
+        ClockCacheShard<HyperClockTable>::ReverseHash(hashed_key, &unhashed),
+        value);
+  }
+}
+
+HyperClockTable::HyperClockTable(
+    size_t capacity, bool /*strict_capacity_limit*/,
+    CacheMetadataChargePolicy metadata_charge_policy, const Opts& opts)
+    : length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
+                                metadata_charge_policy)),
+      length_bits_mask_((size_t{1} << length_bits_) - 1),
+      occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) *
+                                           kStrictLoadFactor)),
+      array_(new HandleImpl[size_t{1} << length_bits_]) {
+  if (metadata_charge_policy ==
+      CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
+    usage_ += size_t{GetTableSize()} * sizeof(HandleImpl);
+  }
+
+  static_assert(sizeof(HandleImpl) == 64U,
+                "Expecting size / alignment with common cache line size");
+}
+
+HyperClockTable::~HyperClockTable() {
+  // Assumes there are no references or active operations on any slot/element
+  // in the table.
+  for (size_t i = 0; i < GetTableSize(); i++) {
+    HandleImpl& h = array_[i];
+    switch (h.meta >> ClockHandle::kStateShift) {
+      case ClockHandle::kStateEmpty:
+        // noop
+        break;
+      case ClockHandle::kStateInvisible:  // rare but possible
+      case ClockHandle::kStateVisible:
+        assert(GetRefcount(h.meta) == 0);
+        h.FreeData();
+#ifndef NDEBUG
+        Rollback(h.hashed_key, &h);
+        ReclaimEntryUsage(h.GetTotalCharge());
+#endif
+        break;
+      // otherwise
+      default:
+        assert(false);
+        break;
+    }
+  }
+
+#ifndef NDEBUG
+  for (size_t i = 0; i < GetTableSize(); i++) {
+    assert(array_[i].displacements.load() == 0);
+  }
+#endif
+
+  assert(usage_.load() == 0 ||
+         usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl));
+  assert(occupancy_ == 0);
+}
+
+// If an entry doesn't receive clock updates but is repeatedly referenced &
+// released, the acquire and release counters could overflow without some
+// intervention. This is that intervention, which should be inexpensive
+// because it only incurs a simple, very predictable check. (Applying a bit
+// mask in addition to an increment to every Release likely would be
+// relatively expensive, because it's an extra atomic update.)
+//
+// We do have to assume that we never have many millions of simultaneous
+// references to a cache handle, because we cannot represent so many
+// references with the difference in counters, masked to the number of
+// counter bits. Similarly, we assume there aren't millions of threads
+// holding transient references (which might be "undone" rather than
+// released by the way).
+//
+// Consider these possible states for each counter:
+// low: less than kMaxCountdown
+// medium: kMaxCountdown to half way to overflow + kMaxCountdown
+// high: half way to overflow + kMaxCountdown, or greater
+//
+// And these possible states for the combination of counters:
+// acquire / release
+// -------   -------
+// low       low       - Normal / common, with caveats (see below)
+// medium    low       - Can happen while holding some refs
+// high      low       - Violates assumptions (too many refs)
+// low       medium    - Violates assumptions (refs underflow, etc.)
+// medium    medium    - Normal (very read heavy cache)
+// high      medium    - Can happen while holding some refs
+// low       high      - This function is supposed to prevent
+// medium    high      - Violates assumptions (refs underflow, etc.)
+// high      high      - Needs CorrectNearOverflow
+//
+// Basically, this function detects (high, high) state (inferred from
+// release alone being high) and bumps it back down to (medium, medium)
+// state with the same refcount and the same logical countdown counter
+// (everything > kMaxCountdown is logically the same). Note that bumping
+// down to (low, low) would modify the countdown counter, so is "reserved"
+// in a sense.
+//
+// If near-overflow correction is triggered here, there's no guarantee
+// that another thread hasn't freed the entry and replaced it with another.
+// Therefore, it must be the case that the correction does not affect
+// entries unless they are very old (many millions of acquire-release cycles).
+// (Our bit manipulation is indeed idempotent and only affects entries in
+// exceptional cases.) We assume a pre-empted thread will not stall that long.
+// If it did, the state could be corrupted in the (unlikely) case that the top
+// bit of the acquire counter is set but not the release counter, and thus
+// we only clear the top bit of the acquire counter on resumption. It would
+// then appear that there are too many refs and the entry would be permanently
+// pinned (which is not terrible for an exceptionally rare occurrence), unless
+// it is referenced enough (at least kMaxCountdown more times) for the release
+// counter to reach "high" state again and bumped back to "medium." (This
+// motivates only checking for release counter in high state, not both in high
+// state.)
+inline void CorrectNearOverflow(uint64_t old_meta,
+                                std::atomic<uint64_t>& meta) {
+  // We clear both top-most counter bits at the same time.
+  constexpr uint64_t kCounterTopBit = uint64_t{1}
+                                      << (ClockHandle::kCounterNumBits - 1);
+  constexpr uint64_t kClearBits =
+      (kCounterTopBit << ClockHandle::kAcquireCounterShift) |
+      (kCounterTopBit << ClockHandle::kReleaseCounterShift);
+  // A simple check that allows us to initiate clearing the top bits for
+  // a large portion of the "high" state space on release counter.
+  constexpr uint64_t kCheckBits =
+      (kCounterTopBit | (ClockHandle::kMaxCountdown + 1))
+      << ClockHandle::kReleaseCounterShift;
+
+  if (UNLIKELY(old_meta & kCheckBits)) {
+    meta.fetch_and(~kClearBits, std::memory_order_relaxed);
+  }
+}
+
+inline Status HyperClockTable::ChargeUsageMaybeEvictStrict(
+    size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
+  if (total_charge > capacity) {
+    return Status::MemoryLimit(
+        "Cache entry too large for a single cache shard: " +
+        std::to_string(total_charge) + " > " + std::to_string(capacity));
+  }
+  // Grab any available capacity, and free up any more required.
+  size_t old_usage = usage_.load(std::memory_order_relaxed);
+  size_t new_usage;
+  if (LIKELY(old_usage != capacity)) {
+    do {
+      new_usage = std::min(capacity, old_usage + total_charge);
+    } while (!usage_.compare_exchange_weak(old_usage, new_usage,
+                                           std::memory_order_relaxed));
+  } else {
+    new_usage = old_usage;
+  }
+  // How much do we need to evict then?
+  size_t need_evict_charge = old_usage + total_charge - new_usage;
+  size_t request_evict_charge = need_evict_charge;
+  if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) {
+    // Require at least 1 eviction.
+    request_evict_charge = 1;
+  }
+  if (request_evict_charge > 0) {
+    size_t evicted_charge = 0;
+    size_t evicted_count = 0;
+    Evict(request_evict_charge, &evicted_charge, &evicted_count);
+    occupancy_.fetch_sub(evicted_count, std::memory_order_release);
+    if (LIKELY(evicted_charge > need_evict_charge)) {
+      assert(evicted_count > 0);
+      // Evicted more than enough
+      usage_.fetch_sub(evicted_charge - need_evict_charge,
+                       std::memory_order_relaxed);
+    } else if (evicted_charge < need_evict_charge ||
+               (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) {
+      // Roll back to old usage minus evicted
+      usage_.fetch_sub(evicted_charge + (new_usage - old_usage),
+                       std::memory_order_relaxed);
+      if (evicted_charge < need_evict_charge) {
+        return Status::MemoryLimit(
+            "Insert failed because unable to evict entries to stay within "
+            "capacity limit.");
+      } else {
+        return Status::MemoryLimit(
+            "Insert failed because unable to evict entries to stay within "
+            "table occupancy limit.");
+      }
+    }
+    // If we needed to evict something and we are proceeding, we must have
+    // evicted something.
+    assert(evicted_count > 0);
+  }
+  return Status::OK();
+}
+
+inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict(
+    size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
+  // For simplicity, we consider that either the cache can accept the insert
+  // with no evictions, or we must evict enough to make (at least) enough
+  // space. It could lead to unnecessary failures or excessive evictions in
+  // some extreme cases, but allows a fast, simple protocol. If we allow a
+  // race to get us over capacity, then we might never get back to capacity
+  // limit if the sizes of entries allow each insertion to evict the minimum
+  // charge. Thus, we should evict some extra if it's not a signifcant
+  // portion of the shard capacity. This can have the side benefit of
+  // involving fewer threads in eviction.
+  size_t old_usage = usage_.load(std::memory_order_relaxed);
+  size_t need_evict_charge;
+  // NOTE: if total_charge > old_usage, there isn't yet enough to evict
+  // `total_charge` amount. Even if we only try to evict `old_usage` amount,
+  // there's likely something referenced and we would eat CPU looking for
+  // enough to evict.
+  if (old_usage + total_charge <= capacity || total_charge > old_usage) {
+    // Good enough for me (might run over with a race)
+    need_evict_charge = 0;
+  } else {
+    // Try to evict enough space, and maybe some extra
+    need_evict_charge = total_charge;
+    if (old_usage > capacity) {
+      // Not too much to avoid thundering herd while avoiding strict
+      // synchronization, such as the compare_exchange used with strict
+      // capacity limit.
+      need_evict_charge += std::min(capacity / 1024, total_charge) + 1;
+    }
+  }
+  if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) {
+    // Special case: require at least 1 eviction if we only have to
+    // deal with occupancy
+    need_evict_charge = 1;
+  }
+  size_t evicted_charge = 0;
+  size_t evicted_count = 0;
+  if (need_evict_charge > 0) {
+    Evict(need_evict_charge, &evicted_charge, &evicted_count);
+    // Deal with potential occupancy deficit
+    if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) {
+      assert(evicted_charge == 0);
+      // Can't meet occupancy requirement
+      return false;
+    } else {
+      // Update occupancy for evictions
+      occupancy_.fetch_sub(evicted_count, std::memory_order_release);
+    }
+  }
+  // Track new usage even if we weren't able to evict enough
+  usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed);
+  // No underflow
+  assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
+  // Success
+  return true;
+}
+
+inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert(
+    const ClockHandleBasicData& proto) {
+  // Heap allocated separate from table
+  HandleImpl* h = new HandleImpl();
+  ClockHandleBasicData* h_alias = h;
+  *h_alias = proto;
+  h->SetDetached();
+  // Single reference (detached entries only created if returning a refed
+  // Handle back to user)
+  uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
+                  << ClockHandle::kStateShift;
+  meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;
+  h->meta.store(meta, std::memory_order_release);
+  // Keep track of how much of usage is detached
+  detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed);
+  return h;
+}
+
+Status HyperClockTable::Insert(const ClockHandleBasicData& proto,
+                               HandleImpl** handle, Cache::Priority priority,
+                               size_t capacity, bool strict_capacity_limit) {
+  // Do we have the available occupancy? Optimistically assume we do
+  // and deal with it if we don't.
+  size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
+  auto revert_occupancy_fn = [&]() {
+    occupancy_.fetch_sub(1, std::memory_order_relaxed);
+  };
+  // Whether we over-committed and need an eviction to make up for it
+  bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_;
+
+  // Usage/capacity handling is somewhat different depending on
+  // strict_capacity_limit, but mostly pessimistic.
+  bool use_detached_insert = false;
+  const size_t total_charge = proto.GetTotalCharge();
+  if (strict_capacity_limit) {
+    Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity,
+                                           need_evict_for_occupancy);
+    if (!s.ok()) {
+      revert_occupancy_fn();
+      return s;
+    }
+  } else {
+    // Case strict_capacity_limit == false
+    bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity,
+                                                  need_evict_for_occupancy);
+    if (!success) {
+      revert_occupancy_fn();
+      if (handle == nullptr) {
+        // Don't insert the entry but still return ok, as if the entry
+        // inserted into cache and evicted immediately.
+        proto.FreeData();
+        return Status::OK();
+      } else {
+        // Need to track usage of fallback detached insert
+        usage_.fetch_add(total_charge, std::memory_order_relaxed);
+        use_detached_insert = true;
+      }
+    }
+  }
+  auto revert_usage_fn = [&]() {
+    usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+    // No underflow
+    assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
+  };
+
+  if (!use_detached_insert) {
+    // Attempt a table insert, but abort if we find an existing entry for the
+    // key. If we were to overwrite old entries, we would either
+    // * Have to gain ownership over an existing entry to overwrite it, which
+    // would only work if there are no outstanding (read) references and would
+    // create a small gap in availability of the entry (old or new) to lookups.
+    // * Have to insert into a suboptimal location (more probes) so that the
+    // old entry can be kept around as well.
+
+    uint64_t initial_countdown = GetInitialCountdown(priority);
+    assert(initial_countdown > 0);
+
+    size_t probe = 0;
+    HandleImpl* e = FindSlot(
+        proto.hashed_key,
+        [&](HandleImpl* h) {
+          // Optimistically transition the slot from "empty" to
+          // "under construction" (no effect on other states)
+          uint64_t old_meta =
+              h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit}
+                                   << ClockHandle::kStateShift,
+                               std::memory_order_acq_rel);
+          uint64_t old_state = old_meta >> ClockHandle::kStateShift;
+
+          if (old_state == ClockHandle::kStateEmpty) {
+            // We've started inserting into an available slot, and taken
+            // ownership Save data fields
+            ClockHandleBasicData* h_alias = h;
+            *h_alias = proto;
+
+            // Transition from "under construction" state to "visible" state
+            uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
+                                << ClockHandle::kStateShift;
+
+            // Maybe with an outstanding reference
+            new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
+            new_meta |= (initial_countdown - (handle != nullptr))
+                        << ClockHandle::kReleaseCounterShift;
+
+#ifndef NDEBUG
+            // Save the state transition, with assertion
+            old_meta = h->meta.exchange(new_meta, std::memory_order_release);
+            assert(old_meta >> ClockHandle::kStateShift ==
+                   ClockHandle::kStateConstruction);
+#else
+            // Save the state transition
+            h->meta.store(new_meta, std::memory_order_release);
+#endif
+            return true;
+          } else if (old_state != ClockHandle::kStateVisible) {
+            // Slot not usable / touchable now
+            return false;
+          }
+          // Existing, visible entry, which might be a match.
+          // But first, we need to acquire a ref to read it. In fact, number of
+          // refs for initial countdown, so that we boost the clock state if
+          // this is a match.
+          old_meta = h->meta.fetch_add(
+              ClockHandle::kAcquireIncrement * initial_countdown,
+              std::memory_order_acq_rel);
+          // Like Lookup
+          if ((old_meta >> ClockHandle::kStateShift) ==
+              ClockHandle::kStateVisible) {
+            // Acquired a read reference
+            if (h->hashed_key == proto.hashed_key) {
+              // Match. Release in a way that boosts the clock state
+              old_meta = h->meta.fetch_add(
+                  ClockHandle::kReleaseIncrement * initial_countdown,
+                  std::memory_order_acq_rel);
+              // Correct for possible (but rare) overflow
+              CorrectNearOverflow(old_meta, h->meta);
+              // Insert detached instead (only if return handle needed)
+              use_detached_insert = true;
+              return true;
+            } else {
+              // Mismatch. Pretend we never took the reference
+              old_meta = h->meta.fetch_sub(
+                  ClockHandle::kAcquireIncrement * initial_countdown,
+                  std::memory_order_acq_rel);
+            }
+          } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+                              ClockHandle::kStateInvisible)) {
+            // Pretend we never took the reference
+            // WART: there's a tiny chance we release last ref to invisible
+            // entry here. If that happens, we let eviction take care of it.
+            old_meta = h->meta.fetch_sub(
+                ClockHandle::kAcquireIncrement * initial_countdown,
+                std::memory_order_acq_rel);
+          } else {
+            // For other states, incrementing the acquire counter has no effect
+            // so we don't need to undo it.
+            // Slot not usable / touchable now.
+          }
+          (void)old_meta;
+          return false;
+        },
+        [&](HandleImpl* /*h*/) { return false; },
+        [&](HandleImpl* h) {
+          h->displacements.fetch_add(1, std::memory_order_relaxed);
+        },
+        probe);
+    if (e == nullptr) {
+      // Occupancy check and never abort FindSlot above should generally
+      // prevent this, except it's theoretically possible for other threads
+      // to evict and replace entries in the right order to hit every slot
+      // when it is populated. Assuming random hashing, the chance of that
+      // should be no higher than pow(kStrictLoadFactor, n) for n slots.
+      // That should be infeasible for roughly n >= 256, so if this assertion
+      // fails, that suggests something is going wrong.
+      assert(GetTableSize() < 256);
+      use_detached_insert = true;
+    }
+    if (!use_detached_insert) {
+      // Successfully inserted
+      if (handle) {
+        *handle = e;
+      }
+      return Status::OK();
+    }
+    // Roll back table insertion
+    Rollback(proto.hashed_key, e);
+    revert_occupancy_fn();
+    // Maybe fall back on detached insert
+    if (handle == nullptr) {
+      revert_usage_fn();
+      // As if unrefed entry immdiately evicted
+      proto.FreeData();
+      return Status::OK();
+    }
+  }
+
+  // Run detached insert
+  assert(use_detached_insert);
+
+  *handle = DetachedInsert(proto);
+
+  // The OkOverwritten status is used to count "redundant" insertions into
+  // block cache. This implementation doesn't strictly check for redundant
+  // insertions, but we instead are probably interested in how many insertions
+  // didn't go into the table (instead "detached"), which could be redundant
+  // Insert or some other reason (use_detached_insert reasons above).
+  return Status::OkOverwritten();
+}
+
+HyperClockTable::HandleImpl* HyperClockTable::Lookup(
+    const UniqueId64x2& hashed_key) {
+  size_t probe = 0;
+  HandleImpl* e = FindSlot(
+      hashed_key,
+      [&](HandleImpl* h) {
+        // Mostly branch-free version (similar performance)
+        /*
+        uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                     std::memory_order_acquire);
+        bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
+        bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
+        bool match = (h->key == key) & visible;
+        h->meta.fetch_sub(static_cast<uint64_t>(Shareable & !match) <<
+        ClockHandle::kAcquireCounterShift, std::memory_order_release); return
+        match;
+        */
+        // Optimistic lookup should pay off when the table is relatively
+        // sparse.
+        constexpr bool kOptimisticLookup = true;
+        uint64_t old_meta;
+        if (!kOptimisticLookup) {
+          old_meta = h->meta.load(std::memory_order_acquire);
+          if ((old_meta >> ClockHandle::kStateShift) !=
+              ClockHandle::kStateVisible) {
+            return false;
+          }
+        }
+        // (Optimistically) increment acquire counter
+        old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                     std::memory_order_acquire);
+        // Check if it's an entry visible to lookups
+        if ((old_meta >> ClockHandle::kStateShift) ==
+            ClockHandle::kStateVisible) {
+          // Acquired a read reference
+          if (h->hashed_key == hashed_key) {
+            // Match
+            return true;
+          } else {
+            // Mismatch. Pretend we never took the reference
+            old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                                         std::memory_order_release);
+          }
+        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+                            ClockHandle::kStateInvisible)) {
+          // Pretend we never took the reference
+          // WART: there's a tiny chance we release last ref to invisible
+          // entry here. If that happens, we let eviction take care of it.
+          old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                                       std::memory_order_release);
+        } else {
+          // For other states, incrementing the acquire counter has no effect
+          // so we don't need to undo it. Furthermore, we cannot safely undo
+          // it because we did not acquire a read reference to lock the
+          // entry in a Shareable state.
+        }
+        (void)old_meta;
+        return false;
+      },
+      [&](HandleImpl* h) {
+        return h->displacements.load(std::memory_order_relaxed) == 0;
+      },
+      [&](HandleImpl* /*h*/) {}, probe);
+
+  return e;
+}
+
+bool HyperClockTable::Release(HandleImpl* h, bool useful,
+                              bool erase_if_last_ref) {
+  // In contrast with LRUCache's Release, this function won't delete the handle
+  // when the cache is above capacity and the reference is the last one. Space
+  // is only freed up by EvictFromClock (called by Insert when space is needed)
+  // and Erase. We do this to avoid an extra atomic read of the variable usage_.
+
+  uint64_t old_meta;
+  if (useful) {
+    // Increment release counter to indicate was used
+    old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement,
+                                 std::memory_order_release);
+  } else {
+    // Decrement acquire counter to pretend it never happened
+    old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                                 std::memory_order_release);
+  }
+
+  assert((old_meta >> ClockHandle::kStateShift) &
+         ClockHandle::kStateShareableBit);
+  // No underflow
+  assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
+          ClockHandle::kCounterMask) !=
+         ((old_meta >> ClockHandle::kReleaseCounterShift) &
+          ClockHandle::kCounterMask));
+
+  if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
+                                    ClockHandle::kStateInvisible)) {
+    // Update for last fetch_add op
+    if (useful) {
+      old_meta += ClockHandle::kReleaseIncrement;
+    } else {
+      old_meta -= ClockHandle::kAcquireIncrement;
+    }
+    // Take ownership if no refs
+    do {
+      if (GetRefcount(old_meta) != 0) {
+        // Not last ref at some point in time during this Release call
+        // Correct for possible (but rare) overflow
+        CorrectNearOverflow(old_meta, h->meta);
+        return false;
+      }
+      if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
+                       << ClockHandle::kStateShift)) == 0) {
+        // Someone else took ownership
+        return false;
+      }
+      // Note that there's a small chance that we release, another thread
+      // replaces this entry with another, reaches zero refs, and then we end
+      // up erasing that other entry. That's an acceptable risk / imprecision.
+    } while (!h->meta.compare_exchange_weak(
+        old_meta,
+        uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
+        std::memory_order_acquire));
+    // Took ownership
+    size_t total_charge = h->GetTotalCharge();
+    if (UNLIKELY(h->IsDetached())) {
+      h->FreeData();
+      // Delete detached handle
+      delete h;
+      detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+    } else {
+      Rollback(h->hashed_key, h);
+      FreeDataMarkEmpty(*h);
+      ReclaimEntryUsage(total_charge);
+    }
+    return true;
+  } else {
+    // Correct for possible (but rare) overflow
+    CorrectNearOverflow(old_meta, h->meta);
+    return false;
+  }
+}
+
+void HyperClockTable::Ref(HandleImpl& h) {
+  // Increment acquire counter
+  uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                       std::memory_order_acquire);
+
+  assert((old_meta >> ClockHandle::kStateShift) &
+         ClockHandle::kStateShareableBit);
+  // Must have already had a reference
+  assert(GetRefcount(old_meta) > 0);
+  (void)old_meta;
+}
+
+void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) {
+  // Increment acquire counter
+  uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement,
+                                       std::memory_order_acquire);
+
+  assert((old_meta >> ClockHandle::kStateShift) &
+         ClockHandle::kStateShareableBit);
+  (void)old_meta;
+}
+
+void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
+  if (n > 0) {
+    // Split into n - 1 and 1 steps.
+    uint64_t old_meta = h->meta.fetch_add(
+        (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire);
+    assert((old_meta >> ClockHandle::kStateShift) &
+           ClockHandle::kStateShareableBit);
+    (void)old_meta;
+
+    Release(h, /*useful*/ true, /*erase_if_last_ref*/ false);
+  }
+}
+
+void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
+  size_t probe = 0;
+  (void)FindSlot(
+      hashed_key,
+      [&](HandleImpl* h) {
+        // Could be multiple entries in rare cases. Erase them all.
+        // Optimistically increment acquire counter
+        uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                              std::memory_order_acquire);
+        // Check if it's an entry visible to lookups
+        if ((old_meta >> ClockHandle::kStateShift) ==
+            ClockHandle::kStateVisible) {
+          // Acquired a read reference
+          if (h->hashed_key == hashed_key) {
+            // Match. Set invisible.
+            old_meta =
+                h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
+                                    << ClockHandle::kStateShift),
+                                  std::memory_order_acq_rel);
+            // Apply update to local copy
+            old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
+                          << ClockHandle::kStateShift);
+            for (;;) {
+              uint64_t refcount = GetRefcount(old_meta);
+              assert(refcount > 0);
+              if (refcount > 1) {
+                // Not last ref at some point in time during this Erase call
+                // Pretend we never took the reference
+                h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                                  std::memory_order_release);
+                break;
+              } else if (h->meta.compare_exchange_weak(
+                             old_meta,
+                             uint64_t{ClockHandle::kStateConstruction}
+                                 << ClockHandle::kStateShift,
+                             std::memory_order_acq_rel)) {
+                // Took ownership
+                assert(hashed_key == h->hashed_key);
+                size_t total_charge = h->GetTotalCharge();
+                FreeDataMarkEmpty(*h);
+                ReclaimEntryUsage(total_charge);
+                // We already have a copy of hashed_key in this case, so OK to
+                // delay Rollback until after releasing the entry
+                Rollback(hashed_key, h);
+                break;
+              }
+            }
+          } else {
+            // Mismatch. Pretend we never took the reference
+            h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                              std::memory_order_release);
+          }
+        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
+                            ClockHandle::kStateInvisible)) {
+          // Pretend we never took the reference
+          // WART: there's a tiny chance we release last ref to invisible
+          // entry here. If that happens, we let eviction take care of it.
+          h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                            std::memory_order_release);
+        } else {
+          // For other states, incrementing the acquire counter has no effect
+          // so we don't need to undo it.
+        }
+        return false;
+      },
+      [&](HandleImpl* h) {
+        return h->displacements.load(std::memory_order_relaxed) == 0;
+      },
+      [&](HandleImpl* /*h*/) {}, probe);
+}
+
+void HyperClockTable::ConstApplyToEntriesRange(
+    std::function<void(const HandleImpl&)> func, size_t index_begin,
+    size_t index_end, bool apply_if_will_be_deleted) const {
+  uint64_t check_state_mask = ClockHandle::kStateShareableBit;
+  if (!apply_if_will_be_deleted) {
+    check_state_mask |= ClockHandle::kStateVisibleBit;
+  }
+
+  for (size_t i = index_begin; i < index_end; i++) {
+    HandleImpl& h = array_[i];
+
+    // Note: to avoid using compare_exchange, we have to be extra careful.
+    uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
+    // Check if it's an entry visible to lookups
+    if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
+      // Increment acquire counter. Note: it's possible that the entry has
+      // completely changed since we loaded old_meta, but incrementing acquire
+      // count is always safe. (Similar to optimistic Lookup here.)
+      old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
+                                  std::memory_order_acquire);
+      // Check whether we actually acquired a reference.
+      if ((old_meta >> ClockHandle::kStateShift) &
+          ClockHandle::kStateShareableBit) {
+        // Apply func if appropriate
+        if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
+          func(h);
+        }
+        // Pretend we never took the reference
+        h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
+                         std::memory_order_release);
+        // No net change, so don't need to check for overflow
+      } else {
+        // For other states, incrementing the acquire counter has no effect
+        // so we don't need to undo it. Furthermore, we cannot safely undo
+        // it because we did not acquire a read reference to lock the
+        // entry in a Shareable state.
+      }
+    }
+  }
+}
+
+void HyperClockTable::EraseUnRefEntries() {
+  for (size_t i = 0; i <= this->length_bits_mask_; i++) {
+    HandleImpl& h = array_[i];
+
+    uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
+    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
+                    << ClockHandle::kStateShift) &&
+        GetRefcount(old_meta) == 0 &&
+        h.meta.compare_exchange_strong(old_meta,
+                                       uint64_t{ClockHandle::kStateConstruction}
+                                           << ClockHandle::kStateShift,
+                                       std::memory_order_acquire)) {
+      // Took ownership
+      size_t total_charge = h.GetTotalCharge();
+      Rollback(h.hashed_key, &h);
+      FreeDataMarkEmpty(h);
+      ReclaimEntryUsage(total_charge);
+    }
+  }
+}
+
+inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
+    const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
+    std::function<bool(HandleImpl*)> abort_fn,
+    std::function<void(HandleImpl*)> update_fn, size_t& probe) {
+  // NOTE: upper 32 bits of hashed_key[0] is used for sharding
+  //
+  // We use double-hashing probing. Every probe in the sequence is a
+  // pseudorandom integer, computed as a linear function of two random hashes,
+  // which we call base and increment. Specifically, the i-th probe is base + i
+  // * increment modulo the table size.
+  size_t base = static_cast<size_t>(hashed_key[1]);
+  // We use an odd increment, which is relatively prime with the power-of-two
+  // table size. This implies that we cycle back to the first probe only
+  // after probing every slot exactly once.
+  // TODO: we could also reconsider linear probing, though locality benefits
+  // are limited because each slot is a full cache line
+  size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
+  size_t current = ModTableSize(base + probe * increment);
+  while (probe <= length_bits_mask_) {
+    HandleImpl* h = &array_[current];
+    if (match_fn(h)) {
+      probe++;
+      return h;
+    }
+    if (abort_fn(h)) {
+      return nullptr;
+    }
+    probe++;
+    update_fn(h);
+    current = ModTableSize(current + increment);
+  }
+  // We looped back.
+  return nullptr;
+}
+
+inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key,
+                                      const HandleImpl* h) {
+  size_t current = ModTableSize(hashed_key[1]);
+  size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
+  while (&array_[current] != h) {
+    array_[current].displacements.fetch_sub(1, std::memory_order_relaxed);
+    current = ModTableSize(current + increment);
+  }
+}
+
+inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) {
+  auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release);
+  (void)old_occupancy;
+  // No underflow
+  assert(old_occupancy > 0);
+  auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed);
+  (void)old_usage;
+  // No underflow
+  assert(old_usage >= total_charge);
+}
+
+inline void HyperClockTable::Evict(size_t requested_charge,
+                                   size_t* freed_charge, size_t* freed_count) {
+  // precondition
+  assert(requested_charge > 0);
+
+  // TODO: make a tuning parameter?
+  constexpr size_t step_size = 4;
+
+  // First (concurrent) increment clock pointer
+  uint64_t old_clock_pointer =
+      clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
+
+  // Cap the eviction effort at this thread (along with those operating in
+  // parallel) circling through the whole structure kMaxCountdown times.
+  // In other words, this eviction run must find something/anything that is
+  // unreferenced at start of and during the eviction run that isn't reclaimed
+  // by a concurrent eviction run.
+  uint64_t max_clock_pointer =
+      old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_);
+
+  for (;;) {
+    for (size_t i = 0; i < step_size; i++) {
+      HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
+      bool evicting = ClockUpdate(h);
+      if (evicting) {
+        Rollback(h.hashed_key, &h);
+        *freed_charge += h.GetTotalCharge();
+        *freed_count += 1;
+        FreeDataMarkEmpty(h);
+      }
+    }
+
+    // Loop exit condition
+    if (*freed_charge >= requested_charge) {
+      return;
+    }
+    if (old_clock_pointer >= max_clock_pointer) {
+      return;
+    }
+
+    // Advance clock pointer (concurrently)
+    old_clock_pointer =
+        clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
+  }
+}
+
+template <class Table>
+ClockCacheShard<Table>::ClockCacheShard(
+    size_t capacity, bool strict_capacity_limit,
+    CacheMetadataChargePolicy metadata_charge_policy,
+    const typename Table::Opts& opts)
+    : CacheShardBase(metadata_charge_policy),
+      table_(capacity, strict_capacity_limit, metadata_charge_policy, opts),
+      capacity_(capacity),
+      strict_capacity_limit_(strict_capacity_limit) {
+  // Initial charge metadata should not exceed capacity
+  assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl));
+}
+
+template <class Table>
+void ClockCacheShard<Table>::EraseUnRefEntries() {
+  table_.EraseUnRefEntries();
+}
+
+template <class Table>
+void ClockCacheShard<Table>::ApplyToSomeEntries(
+    const std::function<void(const Slice& key, void* value, size_t charge,
+                             DeleterFn deleter)>& callback,
+    size_t average_entries_per_lock, size_t* state) {
+  // The state is essentially going to be the starting hash, which works
+  // nicely even if we resize between calls because we use upper-most
+  // hash bits for table indexes.
+  size_t length_bits = table_.GetLengthBits();
+  size_t length = table_.GetTableSize();
+
+  assert(average_entries_per_lock > 0);
+  // Assuming we are called with same average_entries_per_lock repeatedly,
+  // this simplifies some logic (index_end will not overflow).
+  assert(average_entries_per_lock < length || *state == 0);
+
+  size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
+  size_t index_end = index_begin + average_entries_per_lock;
+  if (index_end >= length) {
+    // Going to end.
+    index_end = length;
+    *state = SIZE_MAX;
+  } else {
+    *state = index_end << (sizeof(size_t) * 8u - length_bits);
+  }
+
+  table_.ConstApplyToEntriesRange(
+      [callback](const HandleImpl& h) {
+        UniqueId64x2 unhashed;
+        callback(ReverseHash(h.hashed_key, &unhashed), h.value,
+                 h.GetTotalCharge(), h.deleter);
+      },
+      index_begin, index_end, false);
+}
+
+int HyperClockTable::CalcHashBits(
+    size_t capacity, size_t estimated_value_size,
+    CacheMetadataChargePolicy metadata_charge_policy) {
+  double average_slot_charge = estimated_value_size * kLoadFactor;
+  if (metadata_charge_policy == kFullChargeCacheMetadata) {
+    average_slot_charge += sizeof(HandleImpl);
+  }
+  assert(average_slot_charge > 0.0);
+  uint64_t num_slots =
+      static_cast<uint64_t>(capacity / average_slot_charge + 0.999999);
+
+  int hash_bits = FloorLog2((num_slots << 1) - 1);
+  if (metadata_charge_policy == kFullChargeCacheMetadata) {
+    // For very small estimated value sizes, it's possible to overshoot
+    while (hash_bits > 0 &&
+           uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) {
+      hash_bits--;
+    }
+  }
+  return hash_bits;
+}
+
+template <class Table>
+void ClockCacheShard<Table>::SetCapacity(size_t capacity) {
+  capacity_.store(capacity, std::memory_order_relaxed);
+  // next Insert will take care of any necessary evictions
+}
+
+template <class Table>
+void ClockCacheShard<Table>::SetStrictCapacityLimit(
+    bool strict_capacity_limit) {
+  strict_capacity_limit_.store(strict_capacity_limit,
+                               std::memory_order_relaxed);
+  // next Insert will take care of any necessary evictions
+}
+
+template <class Table>
+Status ClockCacheShard<Table>::Insert(const Slice& key,
+                                      const UniqueId64x2& hashed_key,
+                                      void* value, size_t charge,
+                                      Cache::DeleterFn deleter,
+                                      HandleImpl** handle,
+                                      Cache::Priority priority) {
+  if (UNLIKELY(key.size() != kCacheKeySize)) {
+    return Status::NotSupported("ClockCache only supports key size " +
+                                std::to_string(kCacheKeySize) + "B");
+  }
+  ClockHandleBasicData proto;
+  proto.hashed_key = hashed_key;
+  proto.value = value;
+  proto.deleter = deleter;
+  proto.total_charge = charge;
+  Status s = table_.Insert(
+      proto, handle, priority, capacity_.load(std::memory_order_relaxed),
+      strict_capacity_limit_.load(std::memory_order_relaxed));
+  return s;
+}
+
+template <class Table>
+typename ClockCacheShard<Table>::HandleImpl* ClockCacheShard<Table>::Lookup(
+    const Slice& key, const UniqueId64x2& hashed_key) {
+  if (UNLIKELY(key.size() != kCacheKeySize)) {
+    return nullptr;
+  }
+  return table_.Lookup(hashed_key);
+}
+
+template <class Table>
+bool ClockCacheShard<Table>::Ref(HandleImpl* h) {
+  if (h == nullptr) {
+    return false;
+  }
+  table_.Ref(*h);
+  return true;
+}
+
+template <class Table>
+bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful,
+                                     bool erase_if_last_ref) {
+  if (handle == nullptr) {
+    return false;
+  }
+  return table_.Release(handle, useful, erase_if_last_ref);
+}
+
+template <class Table>
+void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) {
+  table_.TEST_RefN(*h, n);
+}
+
+template <class Table>
+void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
+  table_.TEST_ReleaseN(h, n);
+}
+
+template <class Table>
+bool ClockCacheShard<Table>::Release(HandleImpl* handle,
+                                     bool erase_if_last_ref) {
+  return Release(handle, /*useful=*/true, erase_if_last_ref);
+}
+
+template <class Table>
+void ClockCacheShard<Table>::Erase(const Slice& key,
+                                   const UniqueId64x2& hashed_key) {
+  if (UNLIKELY(key.size() != kCacheKeySize)) {
+    return;
+  }
+  table_.Erase(hashed_key);
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetUsage() const {
+  return table_.GetUsage();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetDetachedUsage() const {
+  return table_.GetDetachedUsage();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetCapacity() const {
+  return capacity_;
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetPinnedUsage() const {
+  // Computes the pinned usage by scanning the whole hash table. This
+  // is slow, but avoids keeping an exact counter on the clock usage,
+  // i.e., the number of not externally referenced elements.
+  // Why avoid this counter? Because Lookup removes elements from the clock
+  // list, so it would need to update the pinned usage every time,
+  // which creates additional synchronization costs.
+  size_t table_pinned_usage = 0;
+  const bool charge_metadata =
+      metadata_charge_policy_ == kFullChargeCacheMetadata;
+  table_.ConstApplyToEntriesRange(
+      [&table_pinned_usage, charge_metadata](const HandleImpl& h) {
+        uint64_t meta = h.meta.load(std::memory_order_relaxed);
+        uint64_t refcount = GetRefcount(meta);
+        // Holding one ref for ConstApplyToEntriesRange
+        assert(refcount > 0);
+        if (refcount > 1) {
+          table_pinned_usage += h.GetTotalCharge();
+          if (charge_metadata) {
+            table_pinned_usage += sizeof(HandleImpl);
+          }
+        }
+      },
+      0, table_.GetTableSize(), true);
+
+  return table_pinned_usage + table_.GetDetachedUsage();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetOccupancyCount() const {
+  return table_.GetOccupancy();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetOccupancyLimit() const {
+  return table_.GetOccupancyLimit();
+}
+
+template <class Table>
+size_t ClockCacheShard<Table>::GetTableAddressCount() const {
+  return table_.GetTableSize();
+}
+
+// Explicit instantiation
+template class ClockCacheShard<HyperClockTable>;
+
+HyperClockCache::HyperClockCache(
+    size_t capacity, size_t estimated_value_size, int num_shard_bits,
+    bool strict_capacity_limit,
+    CacheMetadataChargePolicy metadata_charge_policy,
+    std::shared_ptr<MemoryAllocator> memory_allocator)
+    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
+                   std::move(memory_allocator)) {
+  assert(estimated_value_size > 0 ||
+         metadata_charge_policy != kDontChargeCacheMetadata);
+  // TODO: should not need to go through two levels of pointer indirection to
+  // get to table entries
+  size_t per_shard = GetPerShardCapacity();
+  InitShards([=](Shard* cs) {
+    HyperClockTable::Opts opts;
+    opts.estimated_value_size = estimated_value_size;
+    new (cs)
+        Shard(per_shard, strict_capacity_limit, metadata_charge_policy, opts);
+  });
+}
+
+void* HyperClockCache::Value(Handle* handle) {
+  return reinterpret_cast<const HandleImpl*>(handle)->value;
+}
+
+size_t HyperClockCache::GetCharge(Handle* handle) const {
+  return reinterpret_cast<const HandleImpl*>(handle)->GetTotalCharge();
+}
+
+Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const {
+  auto h = reinterpret_cast<const HandleImpl*>(handle);
+  return h->deleter;
+}
+
+namespace {
+
+// For each cache shard, estimate what the table load factor would be if
+// cache filled to capacity with average entries. This is considered
+// indicative of a potential problem if the shard is essentially operating
+// "at limit", which we define as high actual usage (>80% of capacity)
+// or actual occupancy very close to limit (>95% of limit).
+// Also, for each shard compute the recommended estimated_entry_charge,
+// and keep the minimum one for use as overall recommendation.
+void AddShardEvaluation(const HyperClockCache::Shard& shard,
+                        std::vector<double>& predicted_load_factors,
+                        size_t& min_recommendation) {
+  size_t usage = shard.GetUsage() - shard.GetDetachedUsage();
+  size_t capacity = shard.GetCapacity();
+  double usage_ratio = 1.0 * usage / capacity;
+
+  size_t occupancy = shard.GetOccupancyCount();
+  size_t occ_limit = shard.GetOccupancyLimit();
+  double occ_ratio = 1.0 * occupancy / occ_limit;
+  if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) {
+    // Skip as described above
+    return;
+  }
+
+  // If filled to capacity, what would the occupancy ratio be?
+  double ratio = occ_ratio / usage_ratio;
+  // Given max load factor, what that load factor be?
+  double lf = ratio * kStrictLoadFactor;
+  predicted_load_factors.push_back(lf);
+
+  // Update min_recommendation also
+  size_t recommendation = usage / occupancy;
+  min_recommendation = std::min(min_recommendation, recommendation);
+}
+
+}  // namespace
+
+void HyperClockCache::ReportProblems(
+    const std::shared_ptr<Logger>& info_log) const {
+  uint32_t shard_count = GetNumShards();
+  std::vector<double> predicted_load_factors;
+  size_t min_recommendation = SIZE_MAX;
+  const_cast<HyperClockCache*>(this)->ForEachShard(
+      [&](HyperClockCache::Shard* shard) {
+        AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
+      });
+
+  if (predicted_load_factors.empty()) {
+    // None operating "at limit" -> nothing to report
+    return;
+  }
+  std::sort(predicted_load_factors.begin(), predicted_load_factors.end());
+
+  // First, if the average load factor is within spec, we aren't going to
+  // complain about a few shards being out of spec.
+  // NOTE: this is only the average among cache shards operating "at limit,"
+  // which should be representative of what we care about. It it normal, even
+  // desirable, for a cache to operate "at limit" so this should not create
+  // selection bias. See AddShardEvaluation().
+  // TODO: Consider detecting cases where decreasing the number of shards
+  // would be good, e.g. serious imbalance among shards.
+  double average_load_factor =
+      std::accumulate(predicted_load_factors.begin(),
+                      predicted_load_factors.end(), 0.0) /
+      shard_count;
+
+  constexpr double kLowSpecLoadFactor = kLoadFactor / 2;
+  constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414;
+  if (average_load_factor > kLoadFactor) {
+    // Out of spec => Consider reporting load factor too high
+    // Estimate effective overall capacity loss due to enforcing occupancy limit
+    double lost_portion = 0.0;
+    int over_count = 0;
+    for (double lf : predicted_load_factors) {
+      if (lf > kStrictLoadFactor) {
+        ++over_count;
+        lost_portion += (lf - kStrictLoadFactor) / lf / shard_count;
+      }
+    }
+    // >= 20% loss -> error
+    // >= 10% loss -> consistent warning
+    // >= 1% loss -> intermittent warning
+    InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
+    bool report = true;
+    if (lost_portion > 0.2) {
+      level = InfoLogLevel::ERROR_LEVEL;
+    } else if (lost_portion > 0.1) {
+      level = InfoLogLevel::WARN_LEVEL;
+    } else if (lost_portion > 0.01) {
+      int report_percent = static_cast<int>(lost_portion * 100.0);
+      if (Random::GetTLSInstance()->PercentTrue(report_percent)) {
+        level = InfoLogLevel::WARN_LEVEL;
+      }
+    } else {
+      // don't report
+      report = false;
+    }
+    if (report) {
+      ROCKS_LOG_AT_LEVEL(
+          info_log, level,
+          "HyperClockCache@%p unable to use estimated %.1f%% capacity because "
+          "of "
+          "full occupancy in %d/%u cache shards (estimated_entry_charge too "
+          "high). Recommend estimated_entry_charge=%zu",
+          this, lost_portion * 100.0, over_count, (unsigned)shard_count,
+          min_recommendation);
+    }
+  } else if (average_load_factor < kLowSpecLoadFactor) {
+    // Out of spec => Consider reporting load factor too low
+    // But cautiously because low is not as big of a problem.
+
+    // Only report if highest occupancy shard is also below
+    // spec and only if average is substantially out of spec
+    if (predicted_load_factors.back() < kLowSpecLoadFactor &&
+        average_load_factor < kLowSpecLoadFactor / 1.414) {
+      InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
+      if (average_load_factor < kLowSpecLoadFactor / 2) {
+        level = InfoLogLevel::WARN_LEVEL;
+      }
+      ROCKS_LOG_AT_LEVEL(
+          info_log, level,
+          "HyperClockCache@%p table has low occupancy at full capacity. Higher "
+          "estimated_entry_charge (about %.1fx) would likely improve "
+          "performance. Recommend estimated_entry_charge=%zu",
+          this, kMidSpecLoadFactor / average_load_factor, min_recommendation);
+    }
+  }
+}
+
+}  // namespace clock_cache
+
+// DEPRECATED (see public API)
+std::shared_ptr<Cache> NewClockCache(
+    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+    CacheMetadataChargePolicy metadata_charge_policy) {
+  return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
+                     /* high_pri_pool_ratio */ 0.5, nullptr,
+                     kDefaultToAdaptiveMutex, metadata_charge_policy,
+                     /* low_pri_pool_ratio */ 0.0);
+}
+
+std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
+  auto my_num_shard_bits = num_shard_bits;
+  if (my_num_shard_bits >= 20) {
+    return nullptr;  // The cache cannot be sharded into too many fine pieces.
+  }
+  if (my_num_shard_bits < 0) {
+    // Use larger shard size to reduce risk of large entries clustering
+    // or skewing individual shards.
+    constexpr size_t min_shard_size = 32U * 1024U * 1024U;
+    my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size);
+  }
+  return std::make_shared<clock_cache::HyperClockCache>(
+      capacity, estimated_entry_charge, my_num_shard_bits,
+      strict_capacity_limit, metadata_charge_policy, memory_allocator);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/clock_cache.h b/src/rocksdb/cache/clock_cache.h
new file mode 100644
index 000000000..ef1b0ccb7
--- /dev/null
+++ b/src/rocksdb/cache/clock_cache.h
@@ -0,0 +1,701 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "cache/cache_key.h"
+#include "cache/sharded_cache.h"
+#include "port/lang.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/secondary_cache.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace clock_cache {
+
+// Forward declaration of friend class.
+class ClockCacheTest;
+
+// HyperClockCache is an alternative to LRUCache specifically tailored for
+// use as BlockBasedTableOptions::block_cache
+//
+// Benefits
+// --------
+// * Fully lock free (no waits or spins) for efficiency under high concurrency
+// * Optimized for hot path reads. For concurrency control, most Lookup() and
+// essentially all Release() are a single atomic add operation.
+// * Eviction on insertion is fully parallel and lock-free.
+// * Uses a generalized + aging variant of CLOCK eviction that might outperform
+// LRU in some cases. (For background, see
+// https://en.wikipedia.org/wiki/Page_replacement_algorithm)
+//
+// Costs
+// -----
+// * Hash table is not resizable (for lock-free efficiency) so capacity is not
+// dynamically changeable. Rely on an estimated average value (block) size for
+// space+time efficiency. (See estimated_entry_charge option details.)
+// * Insert usually does not (but might) overwrite a previous entry associated
+// with a cache key. This is OK for RocksDB uses of Cache.
+// * Only supports keys of exactly 16 bytes, which is what RocksDB uses for
+// block cache (not row cache or table cache).
+// * SecondaryCache is not supported.
+// * Cache priorities are less aggressively enforced. Unlike LRUCache, enough
+// transient LOW or BOTTOM priority items can evict HIGH priority entries that
+// are not referenced recently (or often) enough.
+// * If pinned entries leave little or nothing eligible for eviction,
+// performance can degrade substantially, because of clock eviction eating
+// CPU looking for evictable entries and because Release does not
+// pro-actively delete unreferenced entries when the cache is over-full.
+// Specifically, this makes this implementation more susceptible to the
+// following combination:
+//   * num_shard_bits is high (e.g. 6)
+//   * capacity small (e.g. some MBs)
+//   * some large individual entries (e.g. non-partitioned filters)
+// where individual entries occupy a large portion of their shard capacity.
+// This should be mostly mitigated by the implementation picking a lower
+// number of cache shards than LRUCache for a given capacity (when
+// num_shard_bits is not overridden; see calls to GetDefaultCacheShardBits()).
+// * With strict_capacity_limit=false, respecting the capacity limit is not as
+// aggressive as LRUCache. The limit might be transiently exceeded by a very
+// small number of entries even when not strictly necessary, and slower to
+// recover after pinning forces limit to be substantially exceeded. (Even with
+// strict_capacity_limit=true, RocksDB will nevertheless transiently allocate
+// memory before discovering it is over the block cache capacity, so this
+// should not be a detectable regression in respecting memory limits, except
+// on exceptionally small caches.)
+// * In some cases, erased or duplicated entries might not be freed
+// immediately. They will eventually be freed by eviction from further Inserts.
+// * Internal metadata can overflow if the number of simultaneous references
+// to a cache handle reaches many millions.
+//
+// High-level eviction algorithm
+// -----------------------------
+// A score (or "countdown") is maintained for each entry, initially determined
+// by priority. The score is incremented on each Lookup, up to a max of 3,
+// though is easily returned to previous state if useful=false with Release.
+// During CLOCK-style eviction iteration, entries with score > 0 are
+// decremented if currently unreferenced and entries with score == 0 are
+// evicted if currently unreferenced. Note that scoring might not be perfect
+// because entries can be referenced transiently within the cache even when
+// there are no outside references to the entry.
+//
+// Cache sharding like LRUCache is used to reduce contention on usage+eviction
+// state, though here the performance improvement from more shards is small,
+// and (as noted above) potentially detrimental if shard capacity is too close
+// to largest entry size. Here cache sharding mostly only affects cache update
+// (Insert / Erase) performance, not read performance.
+//
+// Read efficiency (hot path)
+// --------------------------
+// Mostly to minimize the cost of accessing metadata blocks with
+// cache_index_and_filter_blocks=true, we focus on optimizing Lookup and
+// Release. In terms of concurrency, at a minimum, these operations have
+// to do reference counting (and Lookup has to compare full keys in a safe
+// way). Can we fold in all the other metadata tracking *for free* with
+// Lookup and Release doing a simple atomic fetch_add/fetch_sub? (Assume
+// for the moment that Lookup succeeds on the first probe.)
+//
+// We have a clever way of encoding an entry's reference count and countdown
+// clock so that Lookup and Release are each usually a single atomic addition.
+// In a single metadata word we have both an "acquire" count, incremented by
+// Lookup, and a "release" count, incremented by Release. If useful=false,
+// Release can instead decrement the acquire count. Thus the current ref
+// count is (acquires - releases), and the countdown clock is min(3, acquires).
+// Note that only unreferenced entries (acquires == releases) are eligible
+// for CLOCK manipulation and eviction. We tolerate use of more expensive
+// compare_exchange operations for cache writes (insertions and erasures).
+//
+// In a cache receiving many reads and little or no writes, it is possible
+// for the acquire and release counters to overflow. Assuming the *current*
+// refcount never reaches to many millions, we only have to correct for
+// overflow in both counters in Release, not in Lookup. The overflow check
+// should be only 1-2 CPU cycles per Release because it is a predictable
+// branch on a simple condition on data already in registers.
+//
+// Slot states
+// -----------
+// We encode a state indicator into the same metadata word with the
+// acquire and release counters. This allows bigger state transitions to
+// be atomic. States:
+//
+// * Empty - slot is not in use and unowned. All other metadata and data is
+// in an undefined state.
+// * Construction - slot is exclusively owned by one thread, the thread
+// successfully entering this state, for populating or freeing data.
+// * Shareable (group) - slot holds an entry with counted references for
+// pinning and reading, including
+//   * Visible - slot holds an entry that can be returned by Lookup
+//   * Invisible - slot holds an entry that is not visible to Lookup
+//     (erased by user) but can be read by existing references, and ref count
+//     changed by Ref and Release.
+//
+// A special case is "detached" entries, which are heap-allocated handles
+// not in the table. They are always Invisible and freed on zero refs.
+//
+// State transitions:
+// Empty -> Construction (in Insert): The encoding of state enables Insert to
+// perform an optimistic atomic bitwise-or to take ownership if a slot is
+// empty, or otherwise make no state change.
+//
+// Construction -> Visible (in Insert): This can be a simple assignment to the
+// metadata word because the current thread has exclusive ownership and other
+// metadata is meaningless.
+//
+// Visible -> Invisible (in Erase): This can be a bitwise-and while holding
+// a shared reference, which is safe because the change is idempotent (in case
+// of parallel Erase). By the way, we never go Invisible->Visible.
+//
+// Shareable -> Construction (in Evict part of Insert, in Erase, and in
+// Release if Invisible): This is for starting to freeing/deleting an
+// unreferenced entry. We have to use compare_exchange to ensure we only make
+// this transition when there are zero refs.
+//
+// Construction -> Empty (in same places): This is for completing free/delete
+// of an entry. A "release" atomic store suffices, as we have exclusive
+// ownership of the slot but have to ensure none of the data member reads are
+// re-ordered after committing the state transition.
+//
+// Insert
+// ------
+// If Insert were to guarantee replacing an existing entry for a key, there
+// would be complications for concurrency and efficiency. First, consider how
+// many probes to get to an entry. To ensure Lookup never waits and
+// availability of a key is uninterrupted, we would need to use a different
+// slot for a new entry for the same key. This means it is most likely in a
+// later probing position than the old version, which should soon be removed.
+// (Also, an entry is too big to replace atomically, even if no current refs.)
+//
+// However, overwrite capability is not really needed by RocksDB. Also, we
+// know from our "redundant" stats that overwrites are very rare for the block
+// cache, so we should not spend much to make them effective.
+//
+// So instead we Insert as soon as we find an empty slot in the probing
+// sequence without seeing an existing (visible) entry for the same key. This
+// way we only insert if we can improve the probing performance, and we don't
+// need to probe beyond our insert position, assuming we are willing to let
+// the previous entry for the same key die of old age (eventual eviction from
+// not being used). We can reach a similar state with concurrent insertions,
+// where one will pass over the other while it is "under construction."
+// This temporary duplication is acceptable for RocksDB block cache because
+// we know redundant insertion is rare.
+//
+// Another problem to solve is what to return to the caller when we find an
+// existing entry whose probing position we cannot improve on, or when the
+// table occupancy limit has been reached. If strict_capacity_limit=false,
+// we must never fail Insert, and if a Handle* is provided, we have to return
+// a usable Cache handle on success. The solution to this (typically rare)
+// problem is "detached" handles, which are usable by the caller but not
+// actually available for Lookup in the Cache. Detached handles are allocated
+// independently on the heap and specially marked so that they are freed on
+// the heap when their last reference is released.
+//
+// Usage on capacity
+// -----------------
+// Insert takes different approaches to usage tracking depending on
+// strict_capacity_limit setting. If true, we enforce a kind of strong
+// consistency where compare-exchange is used to ensure the usage number never
+// exceeds its limit, and provide threads with an authoritative signal on how
+// much "usage" they have taken ownership of. With strict_capacity_limit=false,
+// we use a kind of "eventual consistency" where all threads Inserting to the
+// same cache shard might race on reserving the same space, but the
+// over-commitment will be worked out in later insertions. It is kind of a
+// dance because we don't want threads racing each other too much on paying
+// down the over-commitment (with eviction) either.
+//
+// Eviction
+// --------
+// A key part of Insert is evicting some entries currently unreferenced to
+// make room for new entries. The high-level eviction algorithm is described
+// above, but the details are also interesting. A key part is parallelizing
+// eviction with a single CLOCK pointer. This works by each thread working on
+// eviction pre-emptively incrementing the CLOCK pointer, and then CLOCK-
+// updating or evicting the incremented-over slot(s). To reduce contention at
+// the cost of possibly evicting too much, each thread increments the clock
+// pointer by 4, so commits to updating at least 4 slots per batch. As
+// described above, a CLOCK update will decrement the "countdown" of
+// unreferenced entries, or evict unreferenced entries with zero countdown.
+// Referenced entries are not updated, because we (presumably) don't want
+// long-referenced entries to age while referenced. Note however that we
+// cannot distinguish transiently referenced entries from cache user
+// references, so some CLOCK updates might be somewhat arbitrarily skipped.
+// This is OK as long as it is rare enough that eviction order is still
+// pretty good.
+//
+// There is no synchronization on the completion of the CLOCK updates, so it
+// is theoretically possible for another thread to cycle back around and have
+// two threads racing on CLOCK updates to the same slot. Thus, we cannot rely
+// on any implied exclusivity to make the updates or eviction more efficient.
+// These updates use an opportunistic compare-exchange (no loop), where a
+// racing thread might cause the update to be skipped without retry, but in
+// such case the update is likely not needed because the most likely update
+// to an entry is that it has become referenced. (TODO: test efficiency of
+// avoiding compare-exchange loop)
+//
+// Release
+// -------
+// In the common case, Release is a simple atomic increment of the release
+// counter. There is a simple overflow check that only does another atomic
+// update in extremely rare cases, so costs almost nothing.
+//
+// If the Release specifies "not useful", we can instead decrement the
+// acquire counter, which returns to the same CLOCK state as before Lookup
+// or Ref.
+//
+// Adding a check for over-full cache on every release to zero-refs would
+// likely be somewhat expensive, increasing read contention on cache shard
+// metadata. Instead we are less aggressive about deleting entries right
+// away in those cases.
+//
+// However Release tries to immediately delete entries reaching zero refs
+// if (a) erase_if_last_ref is set by the caller, or (b) the entry is already
+// marked invisible. Both of these are checks on values already in CPU
+// registers so do not increase cross-CPU contention when not applicable.
+// When applicable, they use a compare-exchange loop to take exclusive
+// ownership of the slot for freeing the entry. These are rare cases
+// that should not usually affect performance.
+//
+// Erase
+// -----
+// Searches for an entry like Lookup but moves it to Invisible state if found.
+// This state transition is with bit operations so is idempotent and safely
+// done while only holding a shared "read" reference. Like Release, it makes
+// a best effort to immediately release an Invisible entry that reaches zero
+// refs, but there are some corner cases where it will only be freed by the
+// clock eviction process.
+
+// ----------------------------------------------------------------------- //
+
+// The load factor p is a real number in (0, 1) such that at all
+// times at most a fraction p of all slots, without counting tombstones,
+// are occupied by elements. This means that the probability that a random
+// probe hits an occupied slot is at most p, and thus at most 1/p probes
+// are required on average. For example, p = 70% implies that between 1 and 2
+// probes are needed on average (bear in mind that this reasoning doesn't
+// consider the effects of clustering over time, which should be negligible
+// with double hashing).
+// Because the size of the hash table is always rounded up to the next
+// power of 2, p is really an upper bound on the actual load factor---the
+// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
+// but bear in mind that slots only hold metadata, not actual values.
+// Since space cost is dominated by the values (the LSM blocks),
+// overprovisioning the table with metadata only increases the total cache space
+// usage by a tiny fraction.
+constexpr double kLoadFactor = 0.7;
+
+// The user can exceed kLoadFactor if the sizes of the inserted values don't
+// match estimated_value_size, or in some rare cases with
+// strict_capacity_limit == false. To avoid degenerate performance, we set a
+// strict upper bound on the load factor.
+constexpr double kStrictLoadFactor = 0.84;
+
+struct ClockHandleBasicData {
+  void* value = nullptr;
+  Cache::DeleterFn deleter = nullptr;
+  // A lossless, reversible hash of the fixed-size (16 byte) cache key. This
+  // eliminates the need to store a hash separately.
+  UniqueId64x2 hashed_key = kNullUniqueId64x2;
+  size_t total_charge = 0;
+
+  // For total_charge_and_flags
+  // "Detached" means the handle is allocated separately from hash table.
+  static constexpr uint64_t kFlagDetached = uint64_t{1} << 63;
+  // Extract just the total charge
+  static constexpr uint64_t kTotalChargeMask = kFlagDetached - 1;
+
+  inline size_t GetTotalCharge() const { return total_charge; }
+
+  // Calls deleter (if non-null) on cache key and value
+  void FreeData() const;
+
+  // Required by concept HandleImpl
+  const UniqueId64x2& GetHash() const { return hashed_key; }
+};
+
+struct ClockHandle : public ClockHandleBasicData {
+  // Constants for handling the atomic `meta` word, which tracks most of the
+  // state of the handle. The meta word looks like this:
+  // low bits                                                     high bits
+  // -----------------------------------------------------------------------
+  // | acquire counter          | release counter           | state marker |
+  // -----------------------------------------------------------------------
+
+  // For reading or updating counters in meta word.
+  static constexpr uint8_t kCounterNumBits = 30;
+  static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;
+
+  static constexpr uint8_t kAcquireCounterShift = 0;
+  static constexpr uint64_t kAcquireIncrement = uint64_t{1}
+                                                << kAcquireCounterShift;
+  static constexpr uint8_t kReleaseCounterShift = kCounterNumBits;
+  static constexpr uint64_t kReleaseIncrement = uint64_t{1}
+                                                << kReleaseCounterShift;
+
+  // For reading or updating the state marker in meta word
+  static constexpr uint8_t kStateShift = 2U * kCounterNumBits;
+
+  // Bits contribution to state marker.
+  // Occupied means any state other than empty
+  static constexpr uint8_t kStateOccupiedBit = 0b100;
+  // Shareable means the entry is reference counted (visible or invisible)
+  // (only set if also occupied)
+  static constexpr uint8_t kStateShareableBit = 0b010;
+  // Visible is only set if also shareable
+  static constexpr uint8_t kStateVisibleBit = 0b001;
+
+  // Complete state markers (not shifted into full word)
+  static constexpr uint8_t kStateEmpty = 0b000;
+  static constexpr uint8_t kStateConstruction = kStateOccupiedBit;
+  static constexpr uint8_t kStateInvisible =
+      kStateOccupiedBit | kStateShareableBit;
+  static constexpr uint8_t kStateVisible =
+      kStateOccupiedBit | kStateShareableBit | kStateVisibleBit;
+
+  // Constants for initializing the countdown clock. (Countdown clock is only
+  // in effect with zero refs, acquire counter == release counter, and in that
+  // case the countdown clock == both of those counters.)
+  static constexpr uint8_t kHighCountdown = 3;
+  static constexpr uint8_t kLowCountdown = 2;
+  static constexpr uint8_t kBottomCountdown = 1;
+  // During clock update, treat any countdown clock value greater than this
+  // value the same as this value.
+  static constexpr uint8_t kMaxCountdown = kHighCountdown;
+  // TODO: make these coundown values tuning parameters for eviction?
+
+  // See above
+  std::atomic<uint64_t> meta{};
+
+  // Anticipating use for SecondaryCache support
+  void* reserved_for_future_use = nullptr;
+};  // struct ClockHandle
+
+class HyperClockTable {
+ public:
+  // Target size to be exactly a common cache line size (see static_assert in
+  // clock_cache.cc)
+  struct ALIGN_AS(64U) HandleImpl : public ClockHandle {
+    // The number of elements that hash to this slot or a lower one, but wind
+    // up in this slot or a higher one.
+    std::atomic<uint32_t> displacements{};
+
+    // Whether this is a "deteched" handle that is independently allocated
+    // with `new` (so must be deleted with `delete`).
+    // TODO: ideally this would be packed into some other data field, such
+    // as upper bits of total_charge, but that incurs a measurable performance
+    // regression.
+    bool detached = false;
+
+    inline bool IsDetached() const { return detached; }
+
+    inline void SetDetached() { detached = true; }
+  };  // struct HandleImpl
+
+  struct Opts {
+    size_t estimated_value_size;
+  };
+
+  HyperClockTable(size_t capacity, bool strict_capacity_limit,
+                  CacheMetadataChargePolicy metadata_charge_policy,
+                  const Opts& opts);
+  ~HyperClockTable();
+
+  Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle,
+                Cache::Priority priority, size_t capacity,
+                bool strict_capacity_limit);
+
+  HandleImpl* Lookup(const UniqueId64x2& hashed_key);
+
+  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
+
+  void Ref(HandleImpl& handle);
+
+  void Erase(const UniqueId64x2& hashed_key);
+
+  void ConstApplyToEntriesRange(std::function<void(const HandleImpl&)> func,
+                                size_t index_begin, size_t index_end,
+                                bool apply_if_will_be_deleted) const;
+
+  void EraseUnRefEntries();
+
+  size_t GetTableSize() const { return size_t{1} << length_bits_; }
+
+  int GetLengthBits() const { return length_bits_; }
+
+  size_t GetOccupancy() const {
+    return occupancy_.load(std::memory_order_relaxed);
+  }
+
+  size_t GetOccupancyLimit() const { return occupancy_limit_; }
+
+  size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
+
+  size_t GetDetachedUsage() const {
+    return detached_usage_.load(std::memory_order_relaxed);
+  }
+
+  // Acquire/release N references
+  void TEST_RefN(HandleImpl& handle, size_t n);
+  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+
+ private:  // functions
+  // Returns x mod 2^{length_bits_}.
+  inline size_t ModTableSize(uint64_t x) {
+    return static_cast<size_t>(x) & length_bits_mask_;
+  }
+
+  // Runs the clock eviction algorithm trying to reclaim at least
+  // requested_charge. Returns how much is evicted, which could be less
+  // if it appears impossible to evict the requested amount without blocking.
+  inline void Evict(size_t requested_charge, size_t* freed_charge,
+                    size_t* freed_count);
+
+  // Returns the first slot in the probe sequence, starting from the given
+  // probe number, with a handle e such that match(e) is true. At every
+  // step, the function first tests whether match(e) holds. If this is false,
+  // it evaluates abort(e) to decide whether the search should be aborted,
+  // and in the affirmative returns -1. For every handle e probed except
+  // the last one, the function runs update(e).
+  // The probe parameter is modified as follows. We say a probe to a handle
+  // e is aborting if match(e) is false and abort(e) is true. Then the final
+  // value of probe is one more than the last non-aborting probe during the
+  // call. This is so that that the variable can be used to keep track of
+  // progress across consecutive calls to FindSlot.
+  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
+                              std::function<bool(HandleImpl*)> match,
+                              std::function<bool(HandleImpl*)> stop,
+                              std::function<void(HandleImpl*)> update,
+                              size_t& probe);
+
+  // Re-decrement all displacements in probe path starting from beginning
+  // until (not including) the given handle
+  inline void Rollback(const UniqueId64x2& hashed_key, const HandleImpl* h);
+
+  // Subtracts `total_charge` from `usage_` and 1 from `occupancy_`.
+  // Ideally this comes after releasing the entry itself so that we
+  // actually have the available occupancy/usage that is claimed.
+  // However, that means total_charge has to be saved from the handle
+  // before releasing it so that it can be provided to this function.
+  inline void ReclaimEntryUsage(size_t total_charge);
+
+  // Helper for updating `usage_` for new entry with given `total_charge`
+  // and evicting if needed under strict_capacity_limit=true rules. This
+  // means the operation might fail with Status::MemoryLimit. If
+  // `need_evict_for_occupancy`, then eviction of at least one entry is
+  // required, and the operation should fail if not possible.
+  // NOTE: Otherwise, occupancy_ is not managed in this function
+  inline Status ChargeUsageMaybeEvictStrict(size_t total_charge,
+                                            size_t capacity,
+                                            bool need_evict_for_occupancy);
+
+  // Helper for updating `usage_` for new entry with given `total_charge`
+  // and evicting if needed under strict_capacity_limit=false rules. This
+  // means that updating `usage_` always succeeds even if forced to exceed
+  // capacity. If `need_evict_for_occupancy`, then eviction of at least one
+  // entry is required, and the operation should return false if such eviction
+  // is not possible. `usage_` is not updated in that case. Otherwise, returns
+  // true, indicating success.
+  // NOTE: occupancy_ is not managed in this function
+  inline bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
+                                             size_t capacity,
+                                             bool need_evict_for_occupancy);
+
+  // Creates a "detached" handle for returning from an Insert operation that
+  // cannot be completed by actually inserting into the table.
+  // Updates `detached_usage_` but not `usage_` nor `occupancy_`.
+  inline HandleImpl* DetachedInsert(const ClockHandleBasicData& proto);
+
+  // Returns the number of bits used to hash an element in the hash
+  // table.
+  static int CalcHashBits(size_t capacity, size_t estimated_value_size,
+                          CacheMetadataChargePolicy metadata_charge_policy);
+
+ private:  // data
+  // Number of hash bits used for table index.
+  // The size of the table is 1 << length_bits_.
+  const int length_bits_;
+
+  // For faster computation of ModTableSize.
+  const size_t length_bits_mask_;
+
+  // Maximum number of elements the user can store in the table.
+  const size_t occupancy_limit_;
+
+  // Array of slots comprising the hash table.
+  const std::unique_ptr<HandleImpl[]> array_;
+
+  // We partition the following members into different cache lines
+  // to avoid false sharing among Lookup, Release, Erase and Insert
+  // operations in ClockCacheShard.
+
+  ALIGN_AS(CACHE_LINE_SIZE)
+  // Clock algorithm sweep pointer.
+  std::atomic<uint64_t> clock_pointer_{};
+
+  ALIGN_AS(CACHE_LINE_SIZE)
+  // Number of elements in the table.
+  std::atomic<size_t> occupancy_{};
+
+  // Memory usage by entries tracked by the cache (including detached)
+  std::atomic<size_t> usage_{};
+
+  // Part of usage by detached entries (not in table)
+  std::atomic<size_t> detached_usage_{};
+};  // class HyperClockTable
+
+// A single shard of sharded cache.
+template <class Table>
+class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
+ public:
+  ClockCacheShard(size_t capacity, bool strict_capacity_limit,
+                  CacheMetadataChargePolicy metadata_charge_policy,
+                  const typename Table::Opts& opts);
+
+  // For CacheShard concept
+  using HandleImpl = typename Table::HandleImpl;
+  // Hash is lossless hash of 128-bit key
+  using HashVal = UniqueId64x2;
+  using HashCref = const HashVal&;
+  static inline uint32_t HashPieceForSharding(HashCref hash) {
+    return Upper32of64(hash[0]);
+  }
+  static inline HashVal ComputeHash(const Slice& key) {
+    assert(key.size() == kCacheKeySize);
+    HashVal in;
+    HashVal out;
+    // NOTE: endian dependence
+    // TODO: use GetUnaligned?
+    std::memcpy(&in, key.data(), kCacheKeySize);
+    BijectiveHash2x64(in[1], in[0], &out[1], &out[0]);
+    return out;
+  }
+
+  // For reconstructing key from hashed_key. Requires the caller to provide
+  // backing storage for the Slice in `unhashed`
+  static inline Slice ReverseHash(const UniqueId64x2& hashed,
+                                  UniqueId64x2* unhashed) {
+    BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]);
+    // NOTE: endian dependence
+    return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize);
+  }
+
+  // Although capacity is dynamically changeable, the number of table slots is
+  // not, so growing capacity substantially could lead to hitting occupancy
+  // limit.
+  void SetCapacity(size_t capacity);
+
+  void SetStrictCapacityLimit(bool strict_capacity_limit);
+
+  Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
+                size_t charge, Cache::DeleterFn deleter, HandleImpl** handle,
+                Cache::Priority priority);
+
+  HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key);
+
+  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
+
+  bool Release(HandleImpl* handle, bool erase_if_last_ref = false);
+
+  bool Ref(HandleImpl* handle);
+
+  void Erase(const Slice& key, const UniqueId64x2& hashed_key);
+
+  size_t GetCapacity() const;
+
+  size_t GetUsage() const;
+
+  size_t GetDetachedUsage() const;
+
+  size_t GetPinnedUsage() const;
+
+  size_t GetOccupancyCount() const;
+
+  size_t GetOccupancyLimit() const;
+
+  size_t GetTableAddressCount() const;
+
+  void ApplyToSomeEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               DeleterFn deleter)>& callback,
+      size_t average_entries_per_lock, size_t* state);
+
+  void EraseUnRefEntries();
+
+  std::string GetPrintableOptions() const { return std::string{}; }
+
+  // SecondaryCache not yet supported
+  Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
+                const Cache::CacheItemHelper* helper, size_t charge,
+                HandleImpl** handle, Cache::Priority priority) {
+    return Insert(key, hashed_key, value, charge, helper->del_cb, handle,
+                  priority);
+  }
+
+  HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
+                     const Cache::CacheItemHelper* /*helper*/,
+                     const Cache::CreateCallback& /*create_cb*/,
+                     Cache::Priority /*priority*/, bool /*wait*/,
+                     Statistics* /*stats*/) {
+    return Lookup(key, hashed_key);
+  }
+
+  bool IsReady(HandleImpl* /*handle*/) { return true; }
+
+  void Wait(HandleImpl* /*handle*/) {}
+
+  // Acquire/release N references
+  void TEST_RefN(HandleImpl* handle, size_t n);
+  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+
+ private:  // data
+  Table table_;
+
+  // Maximum total charge of all elements stored in the table.
+  std::atomic<size_t> capacity_;
+
+  // Whether to reject insertion if cache reaches its full capacity.
+  std::atomic<bool> strict_capacity_limit_;
+};  // class ClockCacheShard
+
+class HyperClockCache
+#ifdef NDEBUG
+    final
+#endif
+    : public ShardedCache<ClockCacheShard<HyperClockTable>> {
+ public:
+  using Shard = ClockCacheShard<HyperClockTable>;
+
+  HyperClockCache(size_t capacity, size_t estimated_value_size,
+                  int num_shard_bits, bool strict_capacity_limit,
+                  CacheMetadataChargePolicy metadata_charge_policy,
+                  std::shared_ptr<MemoryAllocator> memory_allocator);
+
+  const char* Name() const override { return "HyperClockCache"; }
+
+  void* Value(Handle* handle) override;
+
+  size_t GetCharge(Handle* handle) const override;
+
+  DeleterFn GetDeleter(Handle* handle) const override;
+
+  void ReportProblems(
+      const std::shared_ptr<Logger>& /*info_log*/) const override;
+};  // class HyperClockCache
+
+}  // namespace clock_cache
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/compressed_secondary_cache.cc b/src/rocksdb/cache/compressed_secondary_cache.cc
new file mode 100644
index 000000000..7d1bdc789
--- /dev/null
+++ b/src/rocksdb/cache/compressed_secondary_cache.cc
@@ -0,0 +1,325 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/compressed_secondary_cache.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+
+#include "memory/memory_allocator.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/compression.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+CompressedSecondaryCache::CompressedSecondaryCache(
+    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+    double high_pri_pool_ratio, double low_pri_pool_ratio,
+    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
+    CacheMetadataChargePolicy metadata_charge_policy,
+    CompressionType compression_type, uint32_t compress_format_version,
+    bool enable_custom_split_merge)
+    : cache_options_(capacity, num_shard_bits, strict_capacity_limit,
+                     high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator,
+                     use_adaptive_mutex, metadata_charge_policy,
+                     compression_type, compress_format_version,
+                     enable_custom_split_merge) {
+  cache_ =
+      NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
+                  high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
+                  metadata_charge_policy, low_pri_pool_ratio);
+}
+
+CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); }
+
+std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
+    const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
+    bool advise_erase, bool& is_in_sec_cache) {
+  std::unique_ptr<SecondaryCacheResultHandle> handle;
+  is_in_sec_cache = false;
+  Cache::Handle* lru_handle = cache_->Lookup(key);
+  if (lru_handle == nullptr) {
+    return nullptr;
+  }
+
+  void* handle_value = cache_->Value(lru_handle);
+  if (handle_value == nullptr) {
+    cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
+    return nullptr;
+  }
+
+  CacheAllocationPtr* ptr{nullptr};
+  CacheAllocationPtr merged_value;
+  size_t handle_value_charge{0};
+  if (cache_options_.enable_custom_split_merge) {
+    CacheValueChunk* value_chunk_ptr =
+        reinterpret_cast<CacheValueChunk*>(handle_value);
+    merged_value = MergeChunksIntoValue(value_chunk_ptr, handle_value_charge);
+    ptr = &merged_value;
+  } else {
+    ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
+    handle_value_charge = cache_->GetCharge(lru_handle);
+  }
+
+  Status s;
+  void* value{nullptr};
+  size_t charge{0};
+  if (cache_options_.compression_type == kNoCompression) {
+    s = create_cb(ptr->get(), handle_value_charge, &value, &charge);
+  } else {
+    UncompressionContext uncompression_context(cache_options_.compression_type);
+    UncompressionInfo uncompression_info(uncompression_context,
+                                         UncompressionDict::GetEmptyDict(),
+                                         cache_options_.compression_type);
+
+    size_t uncompressed_size{0};
+    CacheAllocationPtr uncompressed = UncompressData(
+        uncompression_info, (char*)ptr->get(), handle_value_charge,
+        &uncompressed_size, cache_options_.compress_format_version,
+        cache_options_.memory_allocator.get());
+
+    if (!uncompressed) {
+      cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
+      return nullptr;
+    }
+    s = create_cb(uncompressed.get(), uncompressed_size, &value, &charge);
+  }
+
+  if (!s.ok()) {
+    cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
+    return nullptr;
+  }
+
+  if (advise_erase) {
+    cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
+    // Insert a dummy handle.
+    cache_
+        ->Insert(key, /*value=*/nullptr, /*charge=*/0,
+                 GetDeletionCallback(cache_options_.enable_custom_split_merge))
+        .PermitUncheckedError();
+  } else {
+    is_in_sec_cache = true;
+    cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
+  }
+  handle.reset(new CompressedSecondaryCacheResultHandle(value, charge));
+  return handle;
+}
+
+Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
+                                        const Cache::CacheItemHelper* helper) {
+  if (value == nullptr) {
+    return Status::InvalidArgument();
+  }
+
+  Cache::Handle* lru_handle = cache_->Lookup(key);
+  Cache::DeleterFn del_cb =
+      GetDeletionCallback(cache_options_.enable_custom_split_merge);
+  if (lru_handle == nullptr) {
+    PERF_COUNTER_ADD(compressed_sec_cache_insert_dummy_count, 1);
+    // Insert a dummy handle if the handle is evicted for the first time.
+    return cache_->Insert(key, /*value=*/nullptr, /*charge=*/0, del_cb);
+  } else {
+    cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
+  }
+
+  size_t size = (*helper->size_cb)(value);
+  CacheAllocationPtr ptr =
+      AllocateBlock(size, cache_options_.memory_allocator.get());
+
+  Status s = (*helper->saveto_cb)(value, 0, size, ptr.get());
+  if (!s.ok()) {
+    return s;
+  }
+  Slice val(ptr.get(), size);
+
+  std::string compressed_val;
+  if (cache_options_.compression_type != kNoCompression) {
+    PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, size);
+    CompressionOptions compression_opts;
+    CompressionContext compression_context(cache_options_.compression_type);
+    uint64_t sample_for_compression{0};
+    CompressionInfo compression_info(
+        compression_opts, compression_context, CompressionDict::GetEmptyDict(),
+        cache_options_.compression_type, sample_for_compression);
+
+    bool success =
+        CompressData(val, compression_info,
+                     cache_options_.compress_format_version, &compressed_val);
+
+    if (!success) {
+      return Status::Corruption("Error compressing value.");
+    }
+
+    val = Slice(compressed_val);
+    size = compressed_val.size();
+    PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, size);
+
+    if (!cache_options_.enable_custom_split_merge) {
+      ptr = AllocateBlock(size, cache_options_.memory_allocator.get());
+      memcpy(ptr.get(), compressed_val.data(), size);
+    }
+  }
+
+  PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1);
+  if (cache_options_.enable_custom_split_merge) {
+    size_t charge{0};
+    CacheValueChunk* value_chunks_head =
+        SplitValueIntoChunks(val, cache_options_.compression_type, charge);
+    return cache_->Insert(key, value_chunks_head, charge, del_cb);
+  } else {
+    CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
+    return cache_->Insert(key, buf, size, del_cb);
+  }
+}
+
+void CompressedSecondaryCache::Erase(const Slice& key) { cache_->Erase(key); }
+
+Status CompressedSecondaryCache::SetCapacity(size_t capacity) {
+  MutexLock l(&capacity_mutex_);
+  cache_options_.capacity = capacity;
+  cache_->SetCapacity(capacity);
+  return Status::OK();
+}
+
+Status CompressedSecondaryCache::GetCapacity(size_t& capacity) {
+  MutexLock l(&capacity_mutex_);
+  capacity = cache_options_.capacity;
+  return Status::OK();
+}
+
+std::string CompressedSecondaryCache::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize{200};
+  char buffer[kBufferSize];
+  ret.append(cache_->GetPrintableOptions());
+  snprintf(buffer, kBufferSize, "    compression_type : %s\n",
+           CompressionTypeToString(cache_options_.compression_type).c_str());
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    compress_format_version : %d\n",
+           cache_options_.compress_format_version);
+  ret.append(buffer);
+  return ret;
+}
+
+CompressedSecondaryCache::CacheValueChunk*
+CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value,
+                                               CompressionType compression_type,
+                                               size_t& charge) {
+  assert(!value.empty());
+  const char* src_ptr = value.data();
+  size_t src_size{value.size()};
+
+  CacheValueChunk dummy_head = CacheValueChunk();
+  CacheValueChunk* current_chunk = &dummy_head;
+  // Do not split when value size is large or there is no compression.
+  size_t predicted_chunk_size{0};
+  size_t actual_chunk_size{0};
+  size_t tmp_size{0};
+  while (src_size > 0) {
+    predicted_chunk_size = sizeof(CacheValueChunk) - 1 + src_size;
+    auto upper =
+        std::upper_bound(malloc_bin_sizes_.begin(), malloc_bin_sizes_.end(),
+                         predicted_chunk_size);
+    // Do not split when value size is too small, too large, close to a bin
+    // size, or there is no compression.
+    if (upper == malloc_bin_sizes_.begin() ||
+        upper == malloc_bin_sizes_.end() ||
+        *upper - predicted_chunk_size < malloc_bin_sizes_.front() ||
+        compression_type == kNoCompression) {
+      tmp_size = predicted_chunk_size;
+    } else {
+      tmp_size = *(--upper);
+    }
+
+    CacheValueChunk* new_chunk =
+        reinterpret_cast<CacheValueChunk*>(new char[tmp_size]);
+    current_chunk->next = new_chunk;
+    current_chunk = current_chunk->next;
+    actual_chunk_size = tmp_size - sizeof(CacheValueChunk) + 1;
+    memcpy(current_chunk->data, src_ptr, actual_chunk_size);
+    current_chunk->size = actual_chunk_size;
+    src_ptr += actual_chunk_size;
+    src_size -= actual_chunk_size;
+    charge += tmp_size;
+  }
+  current_chunk->next = nullptr;
+
+  return dummy_head.next;
+}
+
+CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue(
+    const void* chunks_head, size_t& charge) {
+  const CacheValueChunk* head =
+      reinterpret_cast<const CacheValueChunk*>(chunks_head);
+  const CacheValueChunk* current_chunk = head;
+  charge = 0;
+  while (current_chunk != nullptr) {
+    charge += current_chunk->size;
+    current_chunk = current_chunk->next;
+  }
+
+  CacheAllocationPtr ptr =
+      AllocateBlock(charge, cache_options_.memory_allocator.get());
+  current_chunk = head;
+  size_t pos{0};
+  while (current_chunk != nullptr) {
+    memcpy(ptr.get() + pos, current_chunk->data, current_chunk->size);
+    pos += current_chunk->size;
+    current_chunk = current_chunk->next;
+  }
+
+  return ptr;
+}
+
+Cache::DeleterFn CompressedSecondaryCache::GetDeletionCallback(
+    bool enable_custom_split_merge) {
+  if (enable_custom_split_merge) {
+    return [](const Slice& /*key*/, void* obj) {
+      CacheValueChunk* chunks_head = reinterpret_cast<CacheValueChunk*>(obj);
+      while (chunks_head != nullptr) {
+        CacheValueChunk* tmp_chunk = chunks_head;
+        chunks_head = chunks_head->next;
+        tmp_chunk->Free();
+        obj = nullptr;
+      };
+    };
+  } else {
+    return [](const Slice& /*key*/, void* obj) {
+      delete reinterpret_cast<CacheAllocationPtr*>(obj);
+      obj = nullptr;
+    };
+  }
+}
+
+std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+    double high_pri_pool_ratio, double low_pri_pool_ratio,
+    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
+    CacheMetadataChargePolicy metadata_charge_policy,
+    CompressionType compression_type, uint32_t compress_format_version,
+    bool enable_custom_split_merge) {
+  return std::make_shared<CompressedSecondaryCache>(
+      capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
+      low_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
+      metadata_charge_policy, compression_type, compress_format_version,
+      enable_custom_split_merge);
+}
+
+std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+    const CompressedSecondaryCacheOptions& opts) {
+  // The secondary_cache is disabled for this LRUCache instance.
+  assert(opts.secondary_cache == nullptr);
+  return NewCompressedSecondaryCache(
+      opts.capacity, opts.num_shard_bits, opts.strict_capacity_limit,
+      opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, opts.memory_allocator,
+      opts.use_adaptive_mutex, opts.metadata_charge_policy,
+      opts.compression_type, opts.compress_format_version,
+      opts.enable_custom_split_merge);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/compressed_secondary_cache.h b/src/rocksdb/cache/compressed_secondary_cache.h
new file mode 100644
index 000000000..4dee38802
--- /dev/null
+++ b/src/rocksdb/cache/compressed_secondary_cache.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <memory>
+
+#include "cache/lru_cache.h"
+#include "memory/memory_allocator.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
+ public:
+  CompressedSecondaryCacheResultHandle(void* value, size_t size)
+      : value_(value), size_(size) {}
+  ~CompressedSecondaryCacheResultHandle() override = default;
+
+  CompressedSecondaryCacheResultHandle(
+      const CompressedSecondaryCacheResultHandle&) = delete;
+  CompressedSecondaryCacheResultHandle& operator=(
+      const CompressedSecondaryCacheResultHandle&) = delete;
+
+  bool IsReady() override { return true; }
+
+  void Wait() override {}
+
+  void* Value() override { return value_; }
+
+  size_t Size() override { return size_; }
+
+ private:
+  void* value_;
+  size_t size_;
+};
+
+// The CompressedSecondaryCache is a concrete implementation of
+// rocksdb::SecondaryCache.
+//
+// When a block is found from CompressedSecondaryCache::Lookup, we check whether
+// there is a dummy block with the same key in the primary cache.
+// 1. If the dummy block exits, we erase the block from
+//    CompressedSecondaryCache and insert it into the primary cache.
+// 2. If not, we just insert a dummy block into the primary cache
+//    (charging the actual size of the block) and don not erase the block from
+//    CompressedSecondaryCache. A standalone handle is returned to the caller.
+//
+// When a block is evicted from the primary cache, we check whether
+// there is a dummy block with the same key in CompressedSecondaryCache.
+// 1. If the dummy block exits, the block is inserted into
+//    CompressedSecondaryCache.
+// 2. If not, we just insert a dummy block (size 0) in CompressedSecondaryCache.
+//
+// Users can also cast a pointer to CompressedSecondaryCache and call methods on
+// it directly, especially custom methods that may be added
+// in the future.  For example -
+// std::unique_ptr<rocksdb::SecondaryCache> cache =
+//      NewCompressedSecondaryCache(opts);
+// static_cast<CompressedSecondaryCache*>(cache.get())->Erase(key);
+
+class CompressedSecondaryCache : public SecondaryCache {
+ public:
+  CompressedSecondaryCache(
+      size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+      double high_pri_pool_ratio, double low_pri_pool_ratio,
+      std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+      bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+      CacheMetadataChargePolicy metadata_charge_policy =
+          kDefaultCacheMetadataChargePolicy,
+      CompressionType compression_type = CompressionType::kLZ4Compression,
+      uint32_t compress_format_version = 2,
+      bool enable_custom_split_merge = false);
+  ~CompressedSecondaryCache() override;
+
+  const char* Name() const override { return "CompressedSecondaryCache"; }
+
+  Status Insert(const Slice& key, void* value,
+                const Cache::CacheItemHelper* helper) override;
+
+  std::unique_ptr<SecondaryCacheResultHandle> Lookup(
+      const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
+      bool advise_erase, bool& is_in_sec_cache) override;
+
+  bool SupportForceErase() const override { return true; }
+
+  void Erase(const Slice& key) override;
+
+  void WaitAll(std::vector<SecondaryCacheResultHandle*> /*handles*/) override {}
+
+  Status SetCapacity(size_t capacity) override;
+
+  Status GetCapacity(size_t& capacity) override;
+
+  std::string GetPrintableOptions() const override;
+
+ private:
+  friend class CompressedSecondaryCacheTest;
+  static constexpr std::array<uint16_t, 8> malloc_bin_sizes_{
+      128, 256, 512, 1024, 2048, 4096, 8192, 16384};
+
+  struct CacheValueChunk {
+    // TODO try "CacheAllocationPtr next;".
+    CacheValueChunk* next;
+    size_t size;
+    // Beginning of the chunk data (MUST BE THE LAST FIELD IN THIS STRUCT!)
+    char data[1];
+
+    void Free() { delete[] reinterpret_cast<char*>(this); }
+  };
+
+  // Split value into chunks to better fit into jemalloc bins. The chunks
+  // are stored in CacheValueChunk and extra charge is needed for each chunk,
+  // so the cache charge is recalculated here.
+  CacheValueChunk* SplitValueIntoChunks(const Slice& value,
+                                        CompressionType compression_type,
+                                        size_t& charge);
+
+  // After merging chunks, the extra charge for each chunk is removed, so
+  // the charge is recalculated.
+  CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
+                                          size_t& charge);
+
+  // An implementation of Cache::DeleterFn.
+  static Cache::DeleterFn GetDeletionCallback(bool enable_custom_split_merge);
+  std::shared_ptr<Cache> cache_;
+  CompressedSecondaryCacheOptions cache_options_;
+  mutable port::Mutex capacity_mutex_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/compressed_secondary_cache_test.cc b/src/rocksdb/cache/compressed_secondary_cache_test.cc
new file mode 100644
index 000000000..574c257a7
--- /dev/null
+++ b/src/rocksdb/cache/compressed_secondary_cache_test.cc
@@ -0,0 +1,1005 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/compressed_secondary_cache.h"
+
+#include <iterator>
+#include <memory>
+#include <tuple>
+
+#include "memory/jemalloc_nodump_allocator.h"
+#include "rocksdb/convenience.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompressedSecondaryCacheTest : public testing::Test {
+ public:
+  CompressedSecondaryCacheTest() : fail_create_(false) {}
+  ~CompressedSecondaryCacheTest() override = default;
+
+ protected:
+  class TestItem {
+   public:
+    TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) {
+      memcpy(buf_.get(), buf, size);
+    }
+    ~TestItem() = default;
+
+    char* Buf() { return buf_.get(); }
+    [[nodiscard]] size_t Size() const { return size_; }
+
+   private:
+    std::unique_ptr<char[]> buf_;
+    size_t size_;
+  };
+
+  static size_t SizeCallback(void* obj) {
+    return reinterpret_cast<TestItem*>(obj)->Size();
+  }
+
+  static Status SaveToCallback(void* from_obj, size_t from_offset,
+                               size_t length, void* out) {
+    auto item = reinterpret_cast<TestItem*>(from_obj);
+    const char* buf = item->Buf();
+    EXPECT_EQ(length, item->Size());
+    EXPECT_EQ(from_offset, 0);
+    memcpy(out, buf, length);
+    return Status::OK();
+  }
+
+  static void DeletionCallback(const Slice& /*key*/, void* obj) {
+    delete reinterpret_cast<TestItem*>(obj);
+    obj = nullptr;
+  }
+
+  static Cache::CacheItemHelper helper_;
+
+  static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/,
+                                   size_t /*size*/, void* /*out*/) {
+    return Status::NotSupported();
+  }
+
+  static Cache::CacheItemHelper helper_fail_;
+
+  Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size,
+                                                void** out_obj,
+                                                size_t* charge) -> Status {
+    if (fail_create_) {
+      return Status::NotSupported();
+    }
+    *out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size));
+    *charge = size;
+    return Status::OK();
+  };
+
+  void SetFailCreate(bool fail) { fail_create_ = fail; }
+
+  void BasicTestHelper(std::shared_ptr<SecondaryCache> sec_cache,
+                       bool sec_cache_is_compressed) {
+    get_perf_context()->Reset();
+    bool is_in_sec_cache{true};
+    // Lookup an non-existent key.
+    std::unique_ptr<SecondaryCacheResultHandle> handle0 = sec_cache->Lookup(
+        "k0", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+    ASSERT_EQ(handle0, nullptr);
+
+    Random rnd(301);
+    // Insert and Lookup the item k1 for the first time.
+    std::string str1(rnd.RandomString(1000));
+    TestItem item1(str1.data(), str1.length());
+    // A dummy handle is inserted if the item is inserted for the first time.
+    ASSERT_OK(sec_cache->Insert("k1", &item1,
+                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+
+    std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
+        "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    ASSERT_EQ(handle1_1, nullptr);
+
+    // Insert and Lookup the item k1 for the second time and advise erasing it.
+    ASSERT_OK(sec_cache->Insert("k1", &item1,
+                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
+
+    std::unique_ptr<SecondaryCacheResultHandle> handle1_2 = sec_cache->Lookup(
+        "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+    ASSERT_NE(handle1_2, nullptr);
+    ASSERT_FALSE(is_in_sec_cache);
+    if (sec_cache_is_compressed) {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+                1000);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                1007);
+    } else {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+    }
+
+    std::unique_ptr<TestItem> val1 =
+        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle1_2->Value()));
+    ASSERT_NE(val1, nullptr);
+    ASSERT_EQ(memcmp(val1->Buf(), item1.Buf(), item1.Size()), 0);
+
+    // Lookup the item k1 again.
+    std::unique_ptr<SecondaryCacheResultHandle> handle1_3 = sec_cache->Lookup(
+        "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+    ASSERT_EQ(handle1_3, nullptr);
+
+    // Insert and Lookup the item k2.
+    std::string str2(rnd.RandomString(1000));
+    TestItem item2(str2.data(), str2.length());
+    ASSERT_OK(sec_cache->Insert("k2", &item2,
+                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
+    std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
+        "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    ASSERT_EQ(handle2_1, nullptr);
+
+    ASSERT_OK(sec_cache->Insert("k2", &item2,
+                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
+    if (sec_cache_is_compressed) {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+                2000);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                2014);
+    } else {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+    }
+    std::unique_ptr<SecondaryCacheResultHandle> handle2_2 = sec_cache->Lookup(
+        "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    ASSERT_NE(handle2_2, nullptr);
+    std::unique_ptr<TestItem> val2 =
+        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2_2->Value()));
+    ASSERT_NE(val2, nullptr);
+    ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
+
+    std::vector<SecondaryCacheResultHandle*> handles = {handle1_2.get(),
+                                                        handle2_2.get()};
+    sec_cache->WaitAll(handles);
+
+    sec_cache.reset();
+  }
+
+  void BasicTest(bool sec_cache_is_compressed, bool use_jemalloc) {
+    CompressedSecondaryCacheOptions opts;
+    opts.capacity = 2048;
+    opts.num_shard_bits = 0;
+
+    if (sec_cache_is_compressed) {
+      if (!LZ4_Supported()) {
+        ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+        opts.compression_type = CompressionType::kNoCompression;
+        sec_cache_is_compressed = false;
+      }
+    } else {
+      opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    if (use_jemalloc) {
+      JemallocAllocatorOptions jopts;
+      std::shared_ptr<MemoryAllocator> allocator;
+      std::string msg;
+      if (JemallocNodumpAllocator::IsSupported(&msg)) {
+        Status s = NewJemallocNodumpAllocator(jopts, &allocator);
+        if (s.ok()) {
+          opts.memory_allocator = allocator;
+        }
+      } else {
+        ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+      }
+    }
+    std::shared_ptr<SecondaryCache> sec_cache =
+        NewCompressedSecondaryCache(opts);
+
+    BasicTestHelper(sec_cache, sec_cache_is_compressed);
+  }
+
+  void FailsTest(bool sec_cache_is_compressed) {
+    CompressedSecondaryCacheOptions secondary_cache_opts;
+    if (sec_cache_is_compressed) {
+      if (!LZ4_Supported()) {
+        ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+        secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+      }
+    } else {
+      secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    secondary_cache_opts.capacity = 1100;
+    secondary_cache_opts.num_shard_bits = 0;
+    std::shared_ptr<SecondaryCache> sec_cache =
+        NewCompressedSecondaryCache(secondary_cache_opts);
+
+    // Insert and Lookup the first item.
+    Random rnd(301);
+    std::string str1(rnd.RandomString(1000));
+    TestItem item1(str1.data(), str1.length());
+    // Insert a dummy handle.
+    ASSERT_OK(sec_cache->Insert("k1", &item1,
+                                &CompressedSecondaryCacheTest::helper_));
+    // Insert k1.
+    ASSERT_OK(sec_cache->Insert("k1", &item1,
+                                &CompressedSecondaryCacheTest::helper_));
+
+    // Insert and Lookup the second item.
+    std::string str2(rnd.RandomString(200));
+    TestItem item2(str2.data(), str2.length());
+    // Insert a dummy handle, k1 is not evicted.
+    ASSERT_OK(sec_cache->Insert("k2", &item2,
+                                &CompressedSecondaryCacheTest::helper_));
+    bool is_in_sec_cache{false};
+    std::unique_ptr<SecondaryCacheResultHandle> handle1 = sec_cache->Lookup(
+        "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    ASSERT_EQ(handle1, nullptr);
+
+    // Insert k2 and k1 is evicted.
+    ASSERT_OK(sec_cache->Insert("k2", &item2,
+                                &CompressedSecondaryCacheTest::helper_));
+    std::unique_ptr<SecondaryCacheResultHandle> handle2 = sec_cache->Lookup(
+        "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    ASSERT_NE(handle2, nullptr);
+    std::unique_ptr<TestItem> val2 =
+        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2->Value()));
+    ASSERT_NE(val2, nullptr);
+    ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
+
+    // Insert k1 again and a dummy handle is inserted.
+    ASSERT_OK(sec_cache->Insert("k1", &item1,
+                                &CompressedSecondaryCacheTest::helper_));
+
+    std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
+        "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    ASSERT_EQ(handle1_1, nullptr);
+
+    // Create Fails.
+    SetFailCreate(true);
+    std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
+        "k2", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+    ASSERT_EQ(handle2_1, nullptr);
+
+    // Save Fails.
+    std::string str3 = rnd.RandomString(10);
+    TestItem item3(str3.data(), str3.length());
+    // The Status is OK because a dummy handle is inserted.
+    ASSERT_OK(sec_cache->Insert("k3", &item3,
+                                &CompressedSecondaryCacheTest::helper_fail_));
+    ASSERT_NOK(sec_cache->Insert("k3", &item3,
+                                 &CompressedSecondaryCacheTest::helper_fail_));
+
+    sec_cache.reset();
+  }
+
+  void BasicIntegrationTest(bool sec_cache_is_compressed,
+                            bool enable_custom_split_merge) {
+    CompressedSecondaryCacheOptions secondary_cache_opts;
+
+    if (sec_cache_is_compressed) {
+      if (!LZ4_Supported()) {
+        ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+        secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+        sec_cache_is_compressed = false;
+      }
+    } else {
+      secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    secondary_cache_opts.capacity = 6000;
+    secondary_cache_opts.num_shard_bits = 0;
+    secondary_cache_opts.enable_custom_split_merge = enable_custom_split_merge;
+    std::shared_ptr<SecondaryCache> secondary_cache =
+        NewCompressedSecondaryCache(secondary_cache_opts);
+    LRUCacheOptions lru_cache_opts(
+        /*_capacity =*/1300, /*_num_shard_bits =*/0,
+        /*_strict_capacity_limit =*/false, /*_high_pri_pool_ratio =*/0.5,
+        /*_memory_allocator =*/nullptr, kDefaultToAdaptiveMutex,
+        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio =*/0.0);
+    lru_cache_opts.secondary_cache = secondary_cache;
+    std::shared_ptr<Cache> cache = NewLRUCache(lru_cache_opts);
+    std::shared_ptr<Statistics> stats = CreateDBStatistics();
+
+    get_perf_context()->Reset();
+    Random rnd(301);
+    std::string str1 = rnd.RandomString(1001);
+    auto item1_1 = new TestItem(str1.data(), str1.length());
+    ASSERT_OK(cache->Insert(
+        "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
+
+    std::string str2 = rnd.RandomString(1012);
+    auto item2_1 = new TestItem(str2.data(), str2.length());
+    // After this Insert, primary cache contains k2 and secondary cache contains
+    // k1's dummy item.
+    ASSERT_OK(cache->Insert(
+        "k2", item2_1, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+
+    std::string str3 = rnd.RandomString(1024);
+    auto item3_1 = new TestItem(str3.data(), str3.length());
+    // After this Insert, primary cache contains k3 and secondary cache contains
+    // k1's dummy item and k2's dummy item.
+    ASSERT_OK(cache->Insert(
+        "k3", item3_1, &CompressedSecondaryCacheTest::helper_, str3.length()));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
+
+    // After this Insert, primary cache contains k1 and secondary cache contains
+    // k1's dummy item, k2's dummy item, and k3's dummy item.
+    auto item1_2 = new TestItem(str1.data(), str1.length());
+    ASSERT_OK(cache->Insert(
+        "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
+
+    // After this Insert, primary cache contains k2 and secondary cache contains
+    // k1's item, k2's dummy item, and k3's dummy item.
+    auto item2_2 = new TestItem(str2.data(), str2.length());
+    ASSERT_OK(cache->Insert(
+        "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
+    if (sec_cache_is_compressed) {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+                str1.length());
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                1008);
+    } else {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+    }
+
+    // After this Insert, primary cache contains k3 and secondary cache contains
+    // k1's item and k2's item.
+    auto item3_2 = new TestItem(str3.data(), str3.length());
+    ASSERT_OK(cache->Insert(
+        "k3", item3_2, &CompressedSecondaryCacheTest::helper_, str3.length()));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
+    if (sec_cache_is_compressed) {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+                str1.length() + str2.length());
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                2027);
+    } else {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+    }
+
+    Cache::Handle* handle;
+    handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true,
+                           stats.get());
+    ASSERT_NE(handle, nullptr);
+    auto val3 = static_cast<TestItem*>(cache->Value(handle));
+    ASSERT_NE(val3, nullptr);
+    ASSERT_EQ(memcmp(val3->Buf(), item3_2->Buf(), item3_2->Size()), 0);
+    cache->Release(handle);
+
+    // Lookup an non-existent key.
+    handle = cache->Lookup("k0", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true,
+                           stats.get());
+    ASSERT_EQ(handle, nullptr);
+
+    // This Lookup should just insert a dummy handle in the primary cache
+    // and the k1 is still in the secondary cache.
+    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true,
+                           stats.get());
+    ASSERT_NE(handle, nullptr);
+    ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
+    auto val1_1 = static_cast<TestItem*>(cache->Value(handle));
+    ASSERT_NE(val1_1, nullptr);
+    ASSERT_EQ(memcmp(val1_1->Buf(), str1.data(), str1.size()), 0);
+    cache->Release(handle);
+
+    // This Lookup should erase k1 from the secondary cache and insert
+    // it into primary cache; then k3 is demoted.
+    // k2 and k3 are in secondary cache.
+    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true,
+                           stats.get());
+    ASSERT_NE(handle, nullptr);
+    ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 3);
+    cache->Release(handle);
+
+    // k2 is still in secondary cache.
+    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true,
+                           stats.get());
+    ASSERT_NE(handle, nullptr);
+    ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 2);
+    cache->Release(handle);
+
+    // Testing SetCapacity().
+    ASSERT_OK(secondary_cache->SetCapacity(0));
+    handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true,
+                           stats.get());
+    ASSERT_EQ(handle, nullptr);
+
+    ASSERT_OK(secondary_cache->SetCapacity(7000));
+    size_t capacity;
+    ASSERT_OK(secondary_cache->GetCapacity(capacity));
+    ASSERT_EQ(capacity, 7000);
+    auto item1_3 = new TestItem(str1.data(), str1.length());
+    // After this Insert, primary cache contains k1.
+    ASSERT_OK(cache->Insert(
+        "k1", item1_3, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 4);
+
+    auto item2_3 = new TestItem(str2.data(), str2.length());
+    // After this Insert, primary cache contains k2 and secondary cache contains
+    // k1's dummy item.
+    ASSERT_OK(cache->Insert(
+        "k2", item2_3, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 4);
+
+    auto item1_4 = new TestItem(str1.data(), str1.length());
+    // After this Insert, primary cache contains k1 and secondary cache contains
+    // k1's dummy item and k2's dummy item.
+    ASSERT_OK(cache->Insert(
+        "k1", item1_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 5);
+
+    auto item2_4 = new TestItem(str2.data(), str2.length());
+    // After this Insert, primary cache contains k2 and secondary cache contains
+    // k1's real item and k2's dummy item.
+    ASSERT_OK(cache->Insert(
+        "k2", item2_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 5);
+    // This Lookup should just insert a dummy handle in the primary cache
+    // and the k1 is still in the secondary cache.
+    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true,
+                           stats.get());
+
+    ASSERT_NE(handle, nullptr);
+    cache->Release(handle);
+    ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 3);
+
+    cache.reset();
+    secondary_cache.reset();
+  }
+
+  void BasicIntegrationFailTest(bool sec_cache_is_compressed) {
+    CompressedSecondaryCacheOptions secondary_cache_opts;
+
+    if (sec_cache_is_compressed) {
+      if (!LZ4_Supported()) {
+        ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+        secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+      }
+    } else {
+      secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    secondary_cache_opts.capacity = 6000;
+    secondary_cache_opts.num_shard_bits = 0;
+    std::shared_ptr<SecondaryCache> secondary_cache =
+        NewCompressedSecondaryCache(secondary_cache_opts);
+
+    LRUCacheOptions opts(
+        /*_capacity=*/1300, /*_num_shard_bits=*/0,
+        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
+        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
+        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
+    opts.secondary_cache = secondary_cache;
+    std::shared_ptr<Cache> cache = NewLRUCache(opts);
+
+    Random rnd(301);
+    std::string str1 = rnd.RandomString(1001);
+    auto item1 = std::make_unique<TestItem>(str1.data(), str1.length());
+    ASSERT_NOK(cache->Insert("k1", item1.get(), nullptr, str1.length()));
+    ASSERT_OK(cache->Insert("k1", item1.get(),
+                            &CompressedSecondaryCacheTest::helper_,
+                            str1.length()));
+    item1.release();  // Appease clang-analyze "potential memory leak"
+
+    Cache::Handle* handle;
+    handle = cache->Lookup("k2", nullptr, test_item_creator,
+                           Cache::Priority::LOW, true);
+    ASSERT_EQ(handle, nullptr);
+    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, false);
+    ASSERT_EQ(handle, nullptr);
+
+    cache.reset();
+    secondary_cache.reset();
+  }
+
+  void IntegrationSaveFailTest(bool sec_cache_is_compressed) {
+    CompressedSecondaryCacheOptions secondary_cache_opts;
+
+    if (sec_cache_is_compressed) {
+      if (!LZ4_Supported()) {
+        ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+        secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+      }
+    } else {
+      secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    secondary_cache_opts.capacity = 6000;
+    secondary_cache_opts.num_shard_bits = 0;
+
+    std::shared_ptr<SecondaryCache> secondary_cache =
+        NewCompressedSecondaryCache(secondary_cache_opts);
+
+    LRUCacheOptions opts(
+        /*_capacity=*/1300, /*_num_shard_bits=*/0,
+        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
+        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
+        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
+    opts.secondary_cache = secondary_cache;
+    std::shared_ptr<Cache> cache = NewLRUCache(opts);
+
+    Random rnd(301);
+    std::string str1 = rnd.RandomString(1001);
+    auto item1 = new TestItem(str1.data(), str1.length());
+    ASSERT_OK(cache->Insert("k1", item1,
+                            &CompressedSecondaryCacheTest::helper_fail_,
+                            str1.length()));
+
+    std::string str2 = rnd.RandomString(1002);
+    auto item2 = new TestItem(str2.data(), str2.length());
+    // k1 should be demoted to the secondary cache.
+    ASSERT_OK(cache->Insert("k2", item2,
+                            &CompressedSecondaryCacheTest::helper_fail_,
+                            str2.length()));
+
+    Cache::Handle* handle;
+    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
+                           test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle, nullptr);
+    cache->Release(handle);
+    // This lookup should fail, since k1 demotion would have failed.
+    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_fail_,
+                           test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_EQ(handle, nullptr);
+    // Since k1 was not promoted, k2 should still be in cache.
+    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
+                           test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle, nullptr);
+    cache->Release(handle);
+
+    cache.reset();
+    secondary_cache.reset();
+  }
+
+  void IntegrationCreateFailTest(bool sec_cache_is_compressed) {
+    CompressedSecondaryCacheOptions secondary_cache_opts;
+
+    if (sec_cache_is_compressed) {
+      if (!LZ4_Supported()) {
+        ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+        secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+      }
+    } else {
+      secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    secondary_cache_opts.capacity = 6000;
+    secondary_cache_opts.num_shard_bits = 0;
+
+    std::shared_ptr<SecondaryCache> secondary_cache =
+        NewCompressedSecondaryCache(secondary_cache_opts);
+
+    LRUCacheOptions opts(
+        /*_capacity=*/1300, /*_num_shard_bits=*/0,
+        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
+        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
+        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
+    opts.secondary_cache = secondary_cache;
+    std::shared_ptr<Cache> cache = NewLRUCache(opts);
+
+    Random rnd(301);
+    std::string str1 = rnd.RandomString(1001);
+    auto item1 = new TestItem(str1.data(), str1.length());
+    ASSERT_OK(cache->Insert("k1", item1, &CompressedSecondaryCacheTest::helper_,
+                            str1.length()));
+
+    std::string str2 = rnd.RandomString(1002);
+    auto item2 = new TestItem(str2.data(), str2.length());
+    // k1 should be demoted to the secondary cache.
+    ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_,
+                            str2.length()));
+
+    Cache::Handle* handle;
+    SetFailCreate(true);
+    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle, nullptr);
+    cache->Release(handle);
+    // This lookup should fail, since k1 creation would have failed
+    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_EQ(handle, nullptr);
+    // Since k1 didn't get promoted, k2 should still be in cache
+    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                           test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle, nullptr);
+    cache->Release(handle);
+
+    cache.reset();
+    secondary_cache.reset();
+  }
+
+  void IntegrationFullCapacityTest(bool sec_cache_is_compressed) {
+    CompressedSecondaryCacheOptions secondary_cache_opts;
+
+    if (sec_cache_is_compressed) {
+      if (!LZ4_Supported()) {
+        ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+        secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+      }
+    } else {
+      secondary_cache_opts.compression_type = CompressionType::kNoCompression;
+    }
+
+    secondary_cache_opts.capacity = 6000;
+    secondary_cache_opts.num_shard_bits = 0;
+
+    std::shared_ptr<SecondaryCache> secondary_cache =
+        NewCompressedSecondaryCache(secondary_cache_opts);
+
+    LRUCacheOptions opts(
+        /*_capacity=*/1300, /*_num_shard_bits=*/0,
+        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
+        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
+        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
+    opts.secondary_cache = secondary_cache;
+    std::shared_ptr<Cache> cache = NewLRUCache(opts);
+
+    Random rnd(301);
+    std::string str1 = rnd.RandomString(1001);
+    auto item1_1 = new TestItem(str1.data(), str1.length());
+    ASSERT_OK(cache->Insert(
+        "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
+
+    std::string str2 = rnd.RandomString(1002);
+    std::string str2_clone{str2};
+    auto item2 = new TestItem(str2.data(), str2.length());
+    // After this Insert, primary cache contains k2 and secondary cache contains
+    // k1's dummy item.
+    ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_,
+                            str2.length()));
+
+    // After this Insert, primary cache contains k1 and secondary cache contains
+    // k1's dummy item and k2's dummy item.
+    auto item1_2 = new TestItem(str1.data(), str1.length());
+    ASSERT_OK(cache->Insert(
+        "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
+
+    auto item2_2 = new TestItem(str2.data(), str2.length());
+    // After this Insert, primary cache contains k2 and secondary cache contains
+    // k1's item and k2's dummy item.
+    ASSERT_OK(cache->Insert(
+        "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
+
+    Cache::Handle* handle2;
+    handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                            test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle2, nullptr);
+    cache->Release(handle2);
+
+    // k1 promotion should fail because cache is at capacity and
+    // strict_capacity_limit is true, but the lookup should still succeed.
+    // A k1's dummy item is inserted into primary cache.
+    Cache::Handle* handle1;
+    handle1 = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
+                            test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle1, nullptr);
+    cache->Release(handle1);
+
+    // Since k1 didn't get inserted, k2 should still be in cache
+    handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
+                            test_item_creator, Cache::Priority::LOW, true);
+    ASSERT_NE(handle2, nullptr);
+    cache->Release(handle2);
+
+    cache.reset();
+    secondary_cache.reset();
+  }
+
+  void SplitValueIntoChunksTest() {
+    JemallocAllocatorOptions jopts;
+    std::shared_ptr<MemoryAllocator> allocator;
+    std::string msg;
+    if (JemallocNodumpAllocator::IsSupported(&msg)) {
+      Status s = NewJemallocNodumpAllocator(jopts, &allocator);
+      if (!s.ok()) {
+        ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+      }
+    } else {
+      ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+    }
+
+    using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
+    std::unique_ptr<CompressedSecondaryCache> sec_cache =
+        std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
+                                                   allocator);
+    Random rnd(301);
+    // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
+    size_t str_size{8500};
+    std::string str = rnd.RandomString(static_cast<int>(str_size));
+    size_t charge{0};
+    CacheValueChunk* chunks_head =
+        sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge);
+    ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1));
+
+    CacheValueChunk* current_chunk = chunks_head;
+    ASSERT_EQ(current_chunk->size, 8192 - sizeof(CacheValueChunk) + 1);
+    current_chunk = current_chunk->next;
+    ASSERT_EQ(current_chunk->size, 256 - sizeof(CacheValueChunk) + 1);
+    current_chunk = current_chunk->next;
+    ASSERT_EQ(current_chunk->size, 98);
+
+    sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
+  }
+
+  void MergeChunksIntoValueTest() {
+    using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
+    Random rnd(301);
+    size_t size1{2048};
+    std::string str1 = rnd.RandomString(static_cast<int>(size1));
+    CacheValueChunk* current_chunk = reinterpret_cast<CacheValueChunk*>(
+        new char[sizeof(CacheValueChunk) - 1 + size1]);
+    CacheValueChunk* chunks_head = current_chunk;
+    memcpy(current_chunk->data, str1.data(), size1);
+    current_chunk->size = size1;
+
+    size_t size2{256};
+    std::string str2 = rnd.RandomString(static_cast<int>(size2));
+    current_chunk->next = reinterpret_cast<CacheValueChunk*>(
+        new char[sizeof(CacheValueChunk) - 1 + size2]);
+    current_chunk = current_chunk->next;
+    memcpy(current_chunk->data, str2.data(), size2);
+    current_chunk->size = size2;
+
+    size_t size3{31};
+    std::string str3 = rnd.RandomString(static_cast<int>(size3));
+    current_chunk->next = reinterpret_cast<CacheValueChunk*>(
+        new char[sizeof(CacheValueChunk) - 1 + size3]);
+    current_chunk = current_chunk->next;
+    memcpy(current_chunk->data, str3.data(), size3);
+    current_chunk->size = size3;
+    current_chunk->next = nullptr;
+
+    std::string str = str1 + str2 + str3;
+
+    std::unique_ptr<CompressedSecondaryCache> sec_cache =
+        std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0);
+    size_t charge{0};
+    CacheAllocationPtr value =
+        sec_cache->MergeChunksIntoValue(chunks_head, charge);
+    ASSERT_EQ(charge, size1 + size2 + size3);
+    std::string value_str{value.get(), charge};
+    ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);
+
+    while (chunks_head != nullptr) {
+      CacheValueChunk* tmp_chunk = chunks_head;
+      chunks_head = chunks_head->next;
+      tmp_chunk->Free();
+    }
+  }
+
+  void SplictValueAndMergeChunksTest() {
+    JemallocAllocatorOptions jopts;
+    std::shared_ptr<MemoryAllocator> allocator;
+    std::string msg;
+    if (JemallocNodumpAllocator::IsSupported(&msg)) {
+      Status s = NewJemallocNodumpAllocator(jopts, &allocator);
+      if (!s.ok()) {
+        ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+      }
+    } else {
+      ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+    }
+
+    using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
+    std::unique_ptr<CompressedSecondaryCache> sec_cache =
+        std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
+                                                   allocator);
+    Random rnd(301);
+    // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
+    size_t str_size{8500};
+    std::string str = rnd.RandomString(static_cast<int>(str_size));
+    size_t charge{0};
+    CacheValueChunk* chunks_head =
+        sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge);
+    ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1));
+
+    CacheAllocationPtr value =
+        sec_cache->MergeChunksIntoValue(chunks_head, charge);
+    ASSERT_EQ(charge, str_size);
+    std::string value_str{value.get(), charge};
+    ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);
+
+    sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
+  }
+
+ private:
+  bool fail_create_;
+};
+
+Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_(
+    CompressedSecondaryCacheTest::SizeCallback,
+    CompressedSecondaryCacheTest::SaveToCallback,
+    CompressedSecondaryCacheTest::DeletionCallback);
+
+Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_fail_(
+    CompressedSecondaryCacheTest::SizeCallback,
+    CompressedSecondaryCacheTest::SaveToCallbackFail,
+    CompressedSecondaryCacheTest::DeletionCallback);
+
+class CompressedSecCacheTestWithCompressAndAllocatorParam
+    : public CompressedSecondaryCacheTest,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  CompressedSecCacheTestWithCompressAndAllocatorParam() {
+    sec_cache_is_compressed_ = std::get<0>(GetParam());
+    use_jemalloc_ = std::get<1>(GetParam());
+  }
+  bool sec_cache_is_compressed_;
+  bool use_jemalloc_;
+};
+
+TEST_P(CompressedSecCacheTestWithCompressAndAllocatorParam, BasicTes) {
+  BasicTest(sec_cache_is_compressed_, use_jemalloc_);
+}
+
+INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
+                        CompressedSecCacheTestWithCompressAndAllocatorParam,
+                        ::testing::Combine(testing::Bool(), testing::Bool()));
+
+class CompressedSecondaryCacheTestWithCompressionParam
+    : public CompressedSecondaryCacheTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  CompressedSecondaryCacheTestWithCompressionParam() {
+    sec_cache_is_compressed_ = GetParam();
+  }
+  bool sec_cache_is_compressed_;
+};
+
+#ifndef ROCKSDB_LITE
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) {
+  std::shared_ptr<SecondaryCache> sec_cache{nullptr};
+  std::string sec_cache_uri;
+  if (sec_cache_is_compressed_) {
+    if (LZ4_Supported()) {
+      sec_cache_uri =
+          "compressed_secondary_cache://"
+          "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;"
+          "compress_format_version=2";
+    } else {
+      ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+      sec_cache_uri =
+          "compressed_secondary_cache://"
+          "capacity=2048;num_shard_bits=0;compression_type=kNoCompression";
+      sec_cache_is_compressed_ = false;
+    }
+    Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri,
+                                                &sec_cache);
+    EXPECT_OK(s);
+  } else {
+    sec_cache_uri =
+        "compressed_secondary_cache://"
+        "capacity=2048;num_shard_bits=0;compression_type=kNoCompression";
+    Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri,
+                                                &sec_cache);
+    EXPECT_OK(s);
+  }
+  BasicTestHelper(sec_cache, sec_cache_is_compressed_);
+}
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+       BasicTestFromStringWithSplit) {
+  std::shared_ptr<SecondaryCache> sec_cache{nullptr};
+  std::string sec_cache_uri;
+  if (sec_cache_is_compressed_) {
+    if (LZ4_Supported()) {
+      sec_cache_uri =
+          "compressed_secondary_cache://"
+          "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;"
+          "compress_format_version=2;enable_custom_split_merge=true";
+    } else {
+      ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+      sec_cache_uri =
+          "compressed_secondary_cache://"
+          "capacity=2048;num_shard_bits=0;compression_type=kNoCompression;"
+          "enable_custom_split_merge=true";
+      sec_cache_is_compressed_ = false;
+    }
+    Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri,
+                                                &sec_cache);
+    EXPECT_OK(s);
+  } else {
+    sec_cache_uri =
+        "compressed_secondary_cache://"
+        "capacity=2048;num_shard_bits=0;compression_type=kNoCompression;"
+        "enable_custom_split_merge=true";
+    Status s = SecondaryCache::CreateFromString(ConfigOptions(), sec_cache_uri,
+                                                &sec_cache);
+    EXPECT_OK(s);
+  }
+  BasicTestHelper(sec_cache, sec_cache_is_compressed_);
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam, FailsTest) {
+  FailsTest(sec_cache_is_compressed_);
+}
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+       BasicIntegrationFailTest) {
+  BasicIntegrationFailTest(sec_cache_is_compressed_);
+}
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+       IntegrationSaveFailTest) {
+  IntegrationSaveFailTest(sec_cache_is_compressed_);
+}
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+       IntegrationCreateFailTest) {
+  IntegrationCreateFailTest(sec_cache_is_compressed_);
+}
+
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
+       IntegrationFullCapacityTest) {
+  IntegrationFullCapacityTest(sec_cache_is_compressed_);
+}
+
+INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
+                        CompressedSecondaryCacheTestWithCompressionParam,
+                        testing::Bool());
+
+class CompressedSecCacheTestWithCompressAndSplitParam
+    : public CompressedSecondaryCacheTest,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  CompressedSecCacheTestWithCompressAndSplitParam() {
+    sec_cache_is_compressed_ = std::get<0>(GetParam());
+    enable_custom_split_merge_ = std::get<1>(GetParam());
+  }
+  bool sec_cache_is_compressed_;
+  bool enable_custom_split_merge_;
+};
+
+TEST_P(CompressedSecCacheTestWithCompressAndSplitParam, BasicIntegrationTest) {
+  BasicIntegrationTest(sec_cache_is_compressed_, enable_custom_split_merge_);
+}
+
+INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
+                        CompressedSecCacheTestWithCompressAndSplitParam,
+                        ::testing::Combine(testing::Bool(), testing::Bool()));
+
+TEST_F(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) {
+  SplitValueIntoChunksTest();
+}
+
+TEST_F(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) {
+  MergeChunksIntoValueTest();
+}
+
+TEST_F(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) {
+  SplictValueAndMergeChunksTest();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/cache/lru_cache.cc b/src/rocksdb/cache/lru_cache.cc
new file mode 100644
index 000000000..c8e4d29ba
--- /dev/null
+++ b/src/rocksdb/cache/lru_cache.cc
@@ -0,0 +1,921 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/lru_cache.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "util/distributed_mutex.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace lru_cache {
+
+// A distinct pointer value for marking "dummy" cache entries
+void* const kDummyValueMarker = const_cast<char*>("kDummyValueMarker");
+
+LRUHandleTable::LRUHandleTable(int max_upper_hash_bits)
+    : length_bits_(/* historical starting size*/ 4),
+      list_(new LRUHandle* [size_t{1} << length_bits_] {}),
+      elems_(0),
+      max_length_bits_(max_upper_hash_bits) {}
+
+LRUHandleTable::~LRUHandleTable() {
+  ApplyToEntriesRange(
+      [](LRUHandle* h) {
+        if (!h->HasRefs()) {
+          h->Free();
+        }
+      },
+      0, size_t{1} << length_bits_);
+}
+
+LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) {
+  return *FindPointer(key, hash);
+}
+
+LRUHandle* LRUHandleTable::Insert(LRUHandle* h) {
+  LRUHandle** ptr = FindPointer(h->key(), h->hash);
+  LRUHandle* old = *ptr;
+  h->next_hash = (old == nullptr ? nullptr : old->next_hash);
+  *ptr = h;
+  if (old == nullptr) {
+    ++elems_;
+    if ((elems_ >> length_bits_) > 0) {  // elems_ >= length
+      // Since each cache entry is fairly large, we aim for a small
+      // average linked list length (<= 1).
+      Resize();
+    }
+  }
+  return old;
+}
+
+LRUHandle* LRUHandleTable::Remove(const Slice& key, uint32_t hash) {
+  LRUHandle** ptr = FindPointer(key, hash);
+  LRUHandle* result = *ptr;
+  if (result != nullptr) {
+    *ptr = result->next_hash;
+    --elems_;
+  }
+  return result;
+}
+
+LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) {
+  LRUHandle** ptr = &list_[hash >> (32 - length_bits_)];
+  while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) {
+    ptr = &(*ptr)->next_hash;
+  }
+  return ptr;
+}
+
+void LRUHandleTable::Resize() {
+  if (length_bits_ >= max_length_bits_) {
+    // Due to reaching limit of hash information, if we made the table bigger,
+    // we would allocate more addresses but only the same number would be used.
+    return;
+  }
+  if (length_bits_ >= 31) {
+    // Avoid undefined behavior shifting uint32_t by 32.
+    return;
+  }
+
+  uint32_t old_length = uint32_t{1} << length_bits_;
+  int new_length_bits = length_bits_ + 1;
+  std::unique_ptr<LRUHandle* []> new_list {
+    new LRUHandle* [size_t{1} << new_length_bits] {}
+  };
+  uint32_t count = 0;
+  for (uint32_t i = 0; i < old_length; i++) {
+    LRUHandle* h = list_[i];
+    while (h != nullptr) {
+      LRUHandle* next = h->next_hash;
+      uint32_t hash = h->hash;
+      LRUHandle** ptr = &new_list[hash >> (32 - new_length_bits)];
+      h->next_hash = *ptr;
+      *ptr = h;
+      h = next;
+      count++;
+    }
+  }
+  assert(elems_ == count);
+  list_ = std::move(new_list);
+  length_bits_ = new_length_bits;
+}
+
+LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
+                             double high_pri_pool_ratio,
+                             double low_pri_pool_ratio, bool use_adaptive_mutex,
+                             CacheMetadataChargePolicy metadata_charge_policy,
+                             int max_upper_hash_bits,
+                             SecondaryCache* secondary_cache)
+    : CacheShardBase(metadata_charge_policy),
+      capacity_(0),
+      high_pri_pool_usage_(0),
+      low_pri_pool_usage_(0),
+      strict_capacity_limit_(strict_capacity_limit),
+      high_pri_pool_ratio_(high_pri_pool_ratio),
+      high_pri_pool_capacity_(0),
+      low_pri_pool_ratio_(low_pri_pool_ratio),
+      low_pri_pool_capacity_(0),
+      table_(max_upper_hash_bits),
+      usage_(0),
+      lru_usage_(0),
+      mutex_(use_adaptive_mutex),
+      secondary_cache_(secondary_cache) {
+  // Make empty circular linked list.
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+  lru_low_pri_ = &lru_;
+  lru_bottom_pri_ = &lru_;
+  SetCapacity(capacity);
+}
+
+void LRUCacheShard::EraseUnRefEntries() {
+  autovector<LRUHandle*> last_reference_list;
+  {
+    DMutexLock l(mutex_);
+    while (lru_.next != &lru_) {
+      LRUHandle* old = lru_.next;
+      // LRU list contains only elements which can be evicted.
+      assert(old->InCache() && !old->HasRefs());
+      LRU_Remove(old);
+      table_.Remove(old->key(), old->hash);
+      old->SetInCache(false);
+      assert(usage_ >= old->total_charge);
+      usage_ -= old->total_charge;
+      last_reference_list.push_back(old);
+    }
+  }
+
+  for (auto entry : last_reference_list) {
+    entry->Free();
+  }
+}
+
+void LRUCacheShard::ApplyToSomeEntries(
+    const std::function<void(const Slice& key, void* value, size_t charge,
+                             DeleterFn deleter)>& callback,
+    size_t average_entries_per_lock, size_t* state) {
+  // The state is essentially going to be the starting hash, which works
+  // nicely even if we resize between calls because we use upper-most
+  // hash bits for table indexes.
+  DMutexLock l(mutex_);
+  int length_bits = table_.GetLengthBits();
+  size_t length = size_t{1} << length_bits;
+
+  assert(average_entries_per_lock > 0);
+  // Assuming we are called with same average_entries_per_lock repeatedly,
+  // this simplifies some logic (index_end will not overflow).
+  assert(average_entries_per_lock < length || *state == 0);
+
+  size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
+  size_t index_end = index_begin + average_entries_per_lock;
+  if (index_end >= length) {
+    // Going to end
+    index_end = length;
+    *state = SIZE_MAX;
+  } else {
+    *state = index_end << (sizeof(size_t) * 8u - length_bits);
+  }
+
+  table_.ApplyToEntriesRange(
+      [callback,
+       metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) {
+        DeleterFn deleter = h->IsSecondaryCacheCompatible()
+                                ? h->info_.helper->del_cb
+                                : h->info_.deleter;
+        callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
+                 deleter);
+      },
+      index_begin, index_end);
+}
+
+void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri,
+                                    LRUHandle** lru_bottom_pri) {
+  DMutexLock l(mutex_);
+  *lru = &lru_;
+  *lru_low_pri = lru_low_pri_;
+  *lru_bottom_pri = lru_bottom_pri_;
+}
+
+size_t LRUCacheShard::TEST_GetLRUSize() {
+  DMutexLock l(mutex_);
+  LRUHandle* lru_handle = lru_.next;
+  size_t lru_size = 0;
+  while (lru_handle != &lru_) {
+    lru_size++;
+    lru_handle = lru_handle->next;
+  }
+  return lru_size;
+}
+
+double LRUCacheShard::GetHighPriPoolRatio() {
+  DMutexLock l(mutex_);
+  return high_pri_pool_ratio_;
+}
+
+double LRUCacheShard::GetLowPriPoolRatio() {
+  DMutexLock l(mutex_);
+  return low_pri_pool_ratio_;
+}
+
+void LRUCacheShard::LRU_Remove(LRUHandle* e) {
+  assert(e->next != nullptr);
+  assert(e->prev != nullptr);
+  if (lru_low_pri_ == e) {
+    lru_low_pri_ = e->prev;
+  }
+  if (lru_bottom_pri_ == e) {
+    lru_bottom_pri_ = e->prev;
+  }
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+  e->prev = e->next = nullptr;
+  assert(lru_usage_ >= e->total_charge);
+  lru_usage_ -= e->total_charge;
+  assert(!e->InHighPriPool() || !e->InLowPriPool());
+  if (e->InHighPriPool()) {
+    assert(high_pri_pool_usage_ >= e->total_charge);
+    high_pri_pool_usage_ -= e->total_charge;
+  } else if (e->InLowPriPool()) {
+    assert(low_pri_pool_usage_ >= e->total_charge);
+    low_pri_pool_usage_ -= e->total_charge;
+  }
+}
+
+void LRUCacheShard::LRU_Insert(LRUHandle* e) {
+  assert(e->next == nullptr);
+  assert(e->prev == nullptr);
+  if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) {
+    // Inset "e" to head of LRU list.
+    e->next = &lru_;
+    e->prev = lru_.prev;
+    e->prev->next = e;
+    e->next->prev = e;
+    e->SetInHighPriPool(true);
+    e->SetInLowPriPool(false);
+    high_pri_pool_usage_ += e->total_charge;
+    MaintainPoolSize();
+  } else if (low_pri_pool_ratio_ > 0 &&
+             (e->IsHighPri() || e->IsLowPri() || e->HasHit())) {
+    // Insert "e" to the head of low-pri pool.
+    e->next = lru_low_pri_->next;
+    e->prev = lru_low_pri_;
+    e->prev->next = e;
+    e->next->prev = e;
+    e->SetInHighPriPool(false);
+    e->SetInLowPriPool(true);
+    low_pri_pool_usage_ += e->total_charge;
+    MaintainPoolSize();
+    lru_low_pri_ = e;
+  } else {
+    // Insert "e" to the head of bottom-pri pool.
+    e->next = lru_bottom_pri_->next;
+    e->prev = lru_bottom_pri_;
+    e->prev->next = e;
+    e->next->prev = e;
+    e->SetInHighPriPool(false);
+    e->SetInLowPriPool(false);
+    // if the low-pri pool is empty, lru_low_pri_ also needs to be updated.
+    if (lru_bottom_pri_ == lru_low_pri_) {
+      lru_low_pri_ = e;
+    }
+    lru_bottom_pri_ = e;
+  }
+  lru_usage_ += e->total_charge;
+}
+
+void LRUCacheShard::MaintainPoolSize() {
+  while (high_pri_pool_usage_ > high_pri_pool_capacity_) {
+    // Overflow last entry in high-pri pool to low-pri pool.
+    lru_low_pri_ = lru_low_pri_->next;
+    assert(lru_low_pri_ != &lru_);
+    lru_low_pri_->SetInHighPriPool(false);
+    lru_low_pri_->SetInLowPriPool(true);
+    assert(high_pri_pool_usage_ >= lru_low_pri_->total_charge);
+    high_pri_pool_usage_ -= lru_low_pri_->total_charge;
+    low_pri_pool_usage_ += lru_low_pri_->total_charge;
+  }
+
+  while (low_pri_pool_usage_ > low_pri_pool_capacity_) {
+    // Overflow last entry in low-pri pool to bottom-pri pool.
+    lru_bottom_pri_ = lru_bottom_pri_->next;
+    assert(lru_bottom_pri_ != &lru_);
+    lru_bottom_pri_->SetInHighPriPool(false);
+    lru_bottom_pri_->SetInLowPriPool(false);
+    assert(low_pri_pool_usage_ >= lru_bottom_pri_->total_charge);
+    low_pri_pool_usage_ -= lru_bottom_pri_->total_charge;
+  }
+}
+
+void LRUCacheShard::EvictFromLRU(size_t charge,
+                                 autovector<LRUHandle*>* deleted) {
+  while ((usage_ + charge) > capacity_ && lru_.next != &lru_) {
+    LRUHandle* old = lru_.next;
+    // LRU list contains only elements which can be evicted.
+    assert(old->InCache() && !old->HasRefs());
+    LRU_Remove(old);
+    table_.Remove(old->key(), old->hash);
+    old->SetInCache(false);
+    assert(usage_ >= old->total_charge);
+    usage_ -= old->total_charge;
+    deleted->push_back(old);
+  }
+}
+
+void LRUCacheShard::TryInsertIntoSecondaryCache(
+    autovector<LRUHandle*> evicted_handles) {
+  for (auto entry : evicted_handles) {
+    if (secondary_cache_ && entry->IsSecondaryCacheCompatible() &&
+        !entry->IsInSecondaryCache()) {
+      secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper)
+          .PermitUncheckedError();
+    }
+    // Free the entries here outside of mutex for performance reasons.
+    entry->Free();
+  }
+}
+
+void LRUCacheShard::SetCapacity(size_t capacity) {
+  autovector<LRUHandle*> last_reference_list;
+  {
+    DMutexLock l(mutex_);
+    capacity_ = capacity;
+    high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
+    low_pri_pool_capacity_ = capacity_ * low_pri_pool_ratio_;
+    EvictFromLRU(0, &last_reference_list);
+  }
+
+  TryInsertIntoSecondaryCache(last_reference_list);
+}
+
+void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
+  DMutexLock l(mutex_);
+  strict_capacity_limit_ = strict_capacity_limit;
+}
+
+Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
+                                 bool free_handle_on_fail) {
+  Status s = Status::OK();
+  autovector<LRUHandle*> last_reference_list;
+
+  {
+    DMutexLock l(mutex_);
+
+    // Free the space following strict LRU policy until enough space
+    // is freed or the lru list is empty.
+    EvictFromLRU(e->total_charge, &last_reference_list);
+
+    if ((usage_ + e->total_charge) > capacity_ &&
+        (strict_capacity_limit_ || handle == nullptr)) {
+      e->SetInCache(false);
+      if (handle == nullptr) {
+        // Don't insert the entry but still return ok, as if the entry inserted
+        // into cache and get evicted immediately.
+        last_reference_list.push_back(e);
+      } else {
+        if (free_handle_on_fail) {
+          free(e);
+          *handle = nullptr;
+        }
+        s = Status::MemoryLimit("Insert failed due to LRU cache being full.");
+      }
+    } else {
+      // Insert into the cache. Note that the cache might get larger than its
+      // capacity if not enough space was freed up.
+      LRUHandle* old = table_.Insert(e);
+      usage_ += e->total_charge;
+      if (old != nullptr) {
+        s = Status::OkOverwritten();
+        assert(old->InCache());
+        old->SetInCache(false);
+        if (!old->HasRefs()) {
+          // old is on LRU because it's in cache and its reference count is 0.
+          LRU_Remove(old);
+          assert(usage_ >= old->total_charge);
+          usage_ -= old->total_charge;
+          last_reference_list.push_back(old);
+        }
+      }
+      if (handle == nullptr) {
+        LRU_Insert(e);
+      } else {
+        // If caller already holds a ref, no need to take one here.
+        if (!e->HasRefs()) {
+          e->Ref();
+        }
+        *handle = e;
+      }
+    }
+  }
+
+  TryInsertIntoSecondaryCache(last_reference_list);
+
+  return s;
+}
+
+void LRUCacheShard::Promote(LRUHandle* e) {
+  SecondaryCacheResultHandle* secondary_handle = e->sec_handle;
+
+  assert(secondary_handle->IsReady());
+  // e is not thread-shared here; OK to modify "immutable" fields as well as
+  // "mutable" (normally requiring mutex)
+  e->SetIsPending(false);
+  e->value = secondary_handle->Value();
+  assert(e->total_charge == 0);
+  size_t value_size = secondary_handle->Size();
+  delete secondary_handle;
+
+  if (e->value) {
+    e->CalcTotalCharge(value_size, metadata_charge_policy_);
+    Status s;
+    if (e->IsStandalone()) {
+      assert(secondary_cache_ && secondary_cache_->SupportForceErase());
+
+      // Insert a dummy handle and return a standalone handle to caller.
+      // Charge the standalone handle.
+      autovector<LRUHandle*> last_reference_list;
+      bool free_standalone_handle{false};
+      {
+        DMutexLock l(mutex_);
+
+        // Free the space following strict LRU policy until enough space
+        // is freed or the lru list is empty.
+        EvictFromLRU(e->total_charge, &last_reference_list);
+
+        if ((usage_ + e->total_charge) > capacity_ && strict_capacity_limit_) {
+          free_standalone_handle = true;
+        } else {
+          usage_ += e->total_charge;
+        }
+      }
+
+      TryInsertIntoSecondaryCache(last_reference_list);
+      if (free_standalone_handle) {
+        e->Unref();
+        e->Free();
+        e = nullptr;
+      } else {
+        PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
+      }
+
+      // Insert a dummy handle into the primary cache. This dummy handle is
+      // not IsSecondaryCacheCompatible().
+      // FIXME? This should not overwrite an existing non-dummy entry in the
+      // rare case that one exists
+      Cache::Priority priority =
+          e->IsHighPri() ? Cache::Priority::HIGH : Cache::Priority::LOW;
+      s = Insert(e->key(), e->hash, kDummyValueMarker, /*charge=*/0,
+                 /*deleter=*/nullptr, /*helper=*/nullptr, /*handle=*/nullptr,
+                 priority);
+    } else {
+      e->SetInCache(true);
+      LRUHandle* handle = e;
+      // This InsertItem() could fail if the cache is over capacity and
+      // strict_capacity_limit_ is true. In such a case, we don't want
+      // InsertItem() to free the handle, since the item is already in memory
+      // and the caller will most likely just read it from disk if we erase it
+      // here.
+      s = InsertItem(e, &handle, /*free_handle_on_fail=*/false);
+      if (s.ok()) {
+        PERF_COUNTER_ADD(block_cache_real_handle_count, 1);
+      }
+    }
+
+    if (!s.ok()) {
+      // Item is in memory, but not accounted against the cache capacity.
+      // When the handle is released, the item should get deleted.
+      assert(!e->InCache());
+    }
+  } else {
+    // Secondary cache lookup failed. The caller will take care of detecting
+    // this and eventually releasing e.
+    assert(!e->value);
+    assert(!e->InCache());
+  }
+}
+
+LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
+                                 const Cache::CacheItemHelper* helper,
+                                 const Cache::CreateCallback& create_cb,
+                                 Cache::Priority priority, bool wait,
+                                 Statistics* stats) {
+  LRUHandle* e = nullptr;
+  bool found_dummy_entry{false};
+  {
+    DMutexLock l(mutex_);
+    e = table_.Lookup(key, hash);
+    if (e != nullptr) {
+      assert(e->InCache());
+      if (e->value == kDummyValueMarker) {
+        // For a dummy handle, if it was retrieved from secondary cache,
+        // it may still exist in secondary cache.
+        // If the handle exists in secondary cache, the value should be
+        // erased from sec cache and be inserted into primary cache.
+        found_dummy_entry = true;
+        // Let the dummy entry be overwritten
+        e = nullptr;
+      } else {
+        if (!e->HasRefs()) {
+          // The entry is in LRU since it's in hash and has no external
+          // references.
+          LRU_Remove(e);
+        }
+        e->Ref();
+        e->SetHit();
+      }
+    }
+  }
+
+  // If handle table lookup failed or the handle is a dummy one, allocate
+  // a handle outside the mutex if we re going to lookup in the secondary cache.
+  //
+  // When a block is firstly Lookup from CompressedSecondaryCache, we just
+  // insert a dummy block into the primary cache (charging the actual size of
+  // the block) and don't erase the block from CompressedSecondaryCache. A
+  // standalone handle is returned to the caller. Only if the block is hit
+  // again, we erase it from CompressedSecondaryCache and add it into the
+  // primary cache.
+  if (!e && secondary_cache_ && helper && helper->saveto_cb) {
+    // For objects from the secondary cache, we expect the caller to provide
+    // a way to create/delete the primary cache object. The only case where
+    // a deleter would not be required is for dummy entries inserted for
+    // accounting purposes, which we won't demote to the secondary cache
+    // anyway.
+    assert(create_cb && helper->del_cb);
+    bool is_in_sec_cache{false};
+    std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
+        secondary_cache_->Lookup(key, create_cb, wait, found_dummy_entry,
+                                 is_in_sec_cache);
+    if (secondary_handle != nullptr) {
+      e = static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
+
+      e->m_flags = 0;
+      e->im_flags = 0;
+      e->SetSecondaryCacheCompatible(true);
+      e->info_.helper = helper;
+      e->key_length = key.size();
+      e->hash = hash;
+      e->refs = 0;
+      e->next = e->prev = nullptr;
+      e->SetPriority(priority);
+      memcpy(e->key_data, key.data(), key.size());
+      e->value = nullptr;
+      e->sec_handle = secondary_handle.release();
+      e->total_charge = 0;
+      e->Ref();
+      e->SetIsInSecondaryCache(is_in_sec_cache);
+      e->SetIsStandalone(secondary_cache_->SupportForceErase() &&
+                         !found_dummy_entry);
+
+      if (wait) {
+        Promote(e);
+        if (e) {
+          if (!e->value) {
+            // The secondary cache returned a handle, but the lookup failed.
+            e->Unref();
+            e->Free();
+            e = nullptr;
+          } else {
+            PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
+            RecordTick(stats, SECONDARY_CACHE_HITS);
+          }
+        }
+      } else {
+        // If wait is false, we always return a handle and let the caller
+        // release the handle after checking for success or failure.
+        e->SetIsPending(true);
+        // This may be slightly inaccurate, if the lookup eventually fails.
+        // But the probability is very low.
+        PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
+        RecordTick(stats, SECONDARY_CACHE_HITS);
+      }
+    } else {
+      // Caller will most likely overwrite the dummy entry with an Insert
+      // after this Lookup fails
+      assert(e == nullptr);
+    }
+  }
+  return e;
+}
+
+bool LRUCacheShard::Ref(LRUHandle* e) {
+  DMutexLock l(mutex_);
+  // To create another reference - entry must be already externally referenced.
+  assert(e->HasRefs());
+  // Pending handles are not for sharing
+  assert(!e->IsPending());
+  e->Ref();
+  return true;
+}
+
+void LRUCacheShard::SetHighPriorityPoolRatio(double high_pri_pool_ratio) {
+  DMutexLock l(mutex_);
+  high_pri_pool_ratio_ = high_pri_pool_ratio;
+  high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
+  MaintainPoolSize();
+}
+
+void LRUCacheShard::SetLowPriorityPoolRatio(double low_pri_pool_ratio) {
+  DMutexLock l(mutex_);
+  low_pri_pool_ratio_ = low_pri_pool_ratio;
+  low_pri_pool_capacity_ = capacity_ * low_pri_pool_ratio_;
+  MaintainPoolSize();
+}
+
+bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
+                            bool erase_if_last_ref) {
+  if (e == nullptr) {
+    return false;
+  }
+  bool last_reference = false;
+  // Must Wait or WaitAll first on pending handles. Otherwise, would leak
+  // a secondary cache handle.
+  assert(!e->IsPending());
+  {
+    DMutexLock l(mutex_);
+    last_reference = e->Unref();
+    if (last_reference && e->InCache()) {
+      // The item is still in cache, and nobody else holds a reference to it.
+      if (usage_ > capacity_ || erase_if_last_ref) {
+        // The LRU list must be empty since the cache is full.
+        assert(lru_.next == &lru_ || erase_if_last_ref);
+        // Take this opportunity and remove the item.
+        table_.Remove(e->key(), e->hash);
+        e->SetInCache(false);
+      } else {
+        // Put the item back on the LRU list, and don't free it.
+        LRU_Insert(e);
+        last_reference = false;
+      }
+    }
+    // If it was the last reference, then decrement the cache usage.
+    if (last_reference) {
+      assert(usage_ >= e->total_charge);
+      usage_ -= e->total_charge;
+    }
+  }
+
+  // Free the entry here outside of mutex for performance reasons.
+  if (last_reference) {
+    e->Free();
+  }
+  return last_reference;
+}
+
+Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
+                             size_t charge,
+                             void (*deleter)(const Slice& key, void* value),
+                             const Cache::CacheItemHelper* helper,
+                             LRUHandle** handle, Cache::Priority priority) {
+  // Allocate the memory here outside of the mutex.
+  // If the cache is full, we'll have to release it.
+  // It shouldn't happen very often though.
+  LRUHandle* e =
+      static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
+
+  e->value = value;
+  e->m_flags = 0;
+  e->im_flags = 0;
+  if (helper) {
+    // Use only one of the two parameters
+    assert(deleter == nullptr);
+    // value == nullptr is reserved for indicating failure for when secondary
+    // cache compatible
+    assert(value != nullptr);
+    e->SetSecondaryCacheCompatible(true);
+    e->info_.helper = helper;
+  } else {
+    e->info_.deleter = deleter;
+  }
+  e->key_length = key.size();
+  e->hash = hash;
+  e->refs = 0;
+  e->next = e->prev = nullptr;
+  e->SetInCache(true);
+  e->SetPriority(priority);
+  memcpy(e->key_data, key.data(), key.size());
+  e->CalcTotalCharge(charge, metadata_charge_policy_);
+
+  return InsertItem(e, handle, /* free_handle_on_fail */ true);
+}
+
+void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
+  LRUHandle* e;
+  bool last_reference = false;
+  {
+    DMutexLock l(mutex_);
+    e = table_.Remove(key, hash);
+    if (e != nullptr) {
+      assert(e->InCache());
+      e->SetInCache(false);
+      if (!e->HasRefs()) {
+        // The entry is in LRU since it's in hash and has no external references
+        LRU_Remove(e);
+        assert(usage_ >= e->total_charge);
+        usage_ -= e->total_charge;
+        last_reference = true;
+      }
+    }
+  }
+
+  // Free the entry here outside of mutex for performance reasons.
+  // last_reference will only be true if e != nullptr.
+  if (last_reference) {
+    e->Free();
+  }
+}
+
+bool LRUCacheShard::IsReady(LRUHandle* e) {
+  bool ready = true;
+  if (e->IsPending()) {
+    assert(secondary_cache_);
+    assert(e->sec_handle);
+    ready = e->sec_handle->IsReady();
+  }
+  return ready;
+}
+
+size_t LRUCacheShard::GetUsage() const {
+  DMutexLock l(mutex_);
+  return usage_;
+}
+
+size_t LRUCacheShard::GetPinnedUsage() const {
+  DMutexLock l(mutex_);
+  assert(usage_ >= lru_usage_);
+  return usage_ - lru_usage_;
+}
+
+size_t LRUCacheShard::GetOccupancyCount() const {
+  DMutexLock l(mutex_);
+  return table_.GetOccupancyCount();
+}
+
+size_t LRUCacheShard::GetTableAddressCount() const {
+  DMutexLock l(mutex_);
+  return size_t{1} << table_.GetLengthBits();
+}
+
+void LRUCacheShard::AppendPrintableOptions(std::string& str) const {
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+  {
+    DMutexLock l(mutex_);
+    snprintf(buffer, kBufferSize, "    high_pri_pool_ratio: %.3lf\n",
+             high_pri_pool_ratio_);
+    snprintf(buffer + strlen(buffer), kBufferSize - strlen(buffer),
+             "    low_pri_pool_ratio: %.3lf\n", low_pri_pool_ratio_);
+  }
+  str.append(buffer);
+}
+
+LRUCache::LRUCache(size_t capacity, int num_shard_bits,
+                   bool strict_capacity_limit, double high_pri_pool_ratio,
+                   double low_pri_pool_ratio,
+                   std::shared_ptr<MemoryAllocator> allocator,
+                   bool use_adaptive_mutex,
+                   CacheMetadataChargePolicy metadata_charge_policy,
+                   std::shared_ptr<SecondaryCache> _secondary_cache)
+    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
+                   std::move(allocator)),
+      secondary_cache_(std::move(_secondary_cache)) {
+  size_t per_shard = GetPerShardCapacity();
+  SecondaryCache* secondary_cache = secondary_cache_.get();
+  InitShards([=](LRUCacheShard* cs) {
+    new (cs) LRUCacheShard(
+        per_shard, strict_capacity_limit, high_pri_pool_ratio,
+        low_pri_pool_ratio, use_adaptive_mutex, metadata_charge_policy,
+        /* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache);
+  });
+}
+
+void* LRUCache::Value(Handle* handle) {
+  auto h = reinterpret_cast<const LRUHandle*>(handle);
+  assert(!h->IsPending() || h->value == nullptr);
+  assert(h->value != kDummyValueMarker);
+  return h->value;
+}
+
+size_t LRUCache::GetCharge(Handle* handle) const {
+  return reinterpret_cast<const LRUHandle*>(handle)->GetCharge(
+      GetShard(0).metadata_charge_policy_);
+}
+
+Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
+  auto h = reinterpret_cast<const LRUHandle*>(handle);
+  if (h->IsSecondaryCacheCompatible()) {
+    return h->info_.helper->del_cb;
+  } else {
+    return h->info_.deleter;
+  }
+}
+
+size_t LRUCache::TEST_GetLRUSize() {
+  return SumOverShards([](LRUCacheShard& cs) { return cs.TEST_GetLRUSize(); });
+}
+
+double LRUCache::GetHighPriPoolRatio() {
+  return GetShard(0).GetHighPriPoolRatio();
+}
+
+void LRUCache::WaitAll(std::vector<Handle*>& handles) {
+  if (secondary_cache_) {
+    std::vector<SecondaryCacheResultHandle*> sec_handles;
+    sec_handles.reserve(handles.size());
+    for (Handle* handle : handles) {
+      if (!handle) {
+        continue;
+      }
+      LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle);
+      if (!lru_handle->IsPending()) {
+        continue;
+      }
+      sec_handles.emplace_back(lru_handle->sec_handle);
+    }
+    secondary_cache_->WaitAll(sec_handles);
+    for (Handle* handle : handles) {
+      if (!handle) {
+        continue;
+      }
+      LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle);
+      if (!lru_handle->IsPending()) {
+        continue;
+      }
+      GetShard(lru_handle->hash).Promote(lru_handle);
+    }
+  }
+}
+
+void LRUCache::AppendPrintableOptions(std::string& str) const {
+  ShardedCache::AppendPrintableOptions(str);  // options from shard
+  if (secondary_cache_) {
+    str.append("  secondary_cache:\n");
+    str.append(secondary_cache_->GetPrintableOptions());
+  }
+}
+
+}  // namespace lru_cache
+
+std::shared_ptr<Cache> NewLRUCache(
+    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+    double high_pri_pool_ratio,
+    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
+    CacheMetadataChargePolicy metadata_charge_policy,
+    const std::shared_ptr<SecondaryCache>& secondary_cache,
+    double low_pri_pool_ratio) {
+  if (num_shard_bits >= 20) {
+    return nullptr;  // The cache cannot be sharded into too many fine pieces.
+  }
+  if (high_pri_pool_ratio < 0.0 || high_pri_pool_ratio > 1.0) {
+    // Invalid high_pri_pool_ratio
+    return nullptr;
+  }
+  if (low_pri_pool_ratio < 0.0 || low_pri_pool_ratio > 1.0) {
+    // Invalid low_pri_pool_ratio
+    return nullptr;
+  }
+  if (low_pri_pool_ratio + high_pri_pool_ratio > 1.0) {
+    // Invalid high_pri_pool_ratio and low_pri_pool_ratio combination
+    return nullptr;
+  }
+  if (num_shard_bits < 0) {
+    num_shard_bits = GetDefaultCacheShardBits(capacity);
+  }
+  return std::make_shared<LRUCache>(
+      capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
+      low_pri_pool_ratio, std::move(memory_allocator), use_adaptive_mutex,
+      metadata_charge_policy, secondary_cache);
+}
+
+std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
+  return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits,
+                     cache_opts.strict_capacity_limit,
+                     cache_opts.high_pri_pool_ratio,
+                     cache_opts.memory_allocator, cache_opts.use_adaptive_mutex,
+                     cache_opts.metadata_charge_policy,
+                     cache_opts.secondary_cache, cache_opts.low_pri_pool_ratio);
+}
+
+std::shared_ptr<Cache> NewLRUCache(
+    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+    double high_pri_pool_ratio,
+    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
+    CacheMetadataChargePolicy metadata_charge_policy,
+    double low_pri_pool_ratio) {
+  return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
+                     high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
+                     metadata_charge_policy, nullptr, low_pri_pool_ratio);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/lru_cache.h b/src/rocksdb/cache/lru_cache.h
new file mode 100644
index 000000000..99b2f2b20
--- /dev/null
+++ b/src/rocksdb/cache/lru_cache.h
@@ -0,0 +1,546 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "cache/sharded_cache.h"
+#include "port/lang.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/secondary_cache.h"
+#include "util/autovector.h"
+#include "util/distributed_mutex.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace lru_cache {
+
+// LRU cache implementation. This class is not thread-safe.
+
+// An entry is a variable length heap-allocated structure.
+// Entries are referenced by cache and/or by any external entity.
+// The cache keeps all its entries in a hash table. Some elements
+// are also stored on LRU list.
+//
+// LRUHandle can be in these states:
+// 1. Referenced externally AND in hash table.
+//    In that case the entry is *not* in the LRU list
+//    (refs >= 1 && in_cache == true)
+// 2. Not referenced externally AND in hash table.
+//    In that case the entry is in the LRU list and can be freed.
+//    (refs == 0 && in_cache == true)
+// 3. Referenced externally AND not in hash table.
+//    In that case the entry is not in the LRU list and not in hash table.
+//    The entry must be freed if refs becomes 0 in this state.
+//    (refs >= 1 && in_cache == false)
+// If you call LRUCacheShard::Release enough times on an entry in state 1, it
+// will go into state 2. To move from state 1 to state 3, either call
+// LRUCacheShard::Erase or LRUCacheShard::Insert with the same key (but
+// possibly different value). To move from state 2 to state 1, use
+// LRUCacheShard::Lookup.
+// While refs > 0, public properties like value and deleter must not change.
+
+struct LRUHandle {
+  void* value;
+  union Info {
+    Info() {}
+    ~Info() {}
+    Cache::DeleterFn deleter;
+    const Cache::CacheItemHelper* helper;
+  } info_;
+  // An entry is not added to the LRUHandleTable until the secondary cache
+  // lookup is complete, so its safe to have this union.
+  union {
+    LRUHandle* next_hash;
+    SecondaryCacheResultHandle* sec_handle;
+  };
+  LRUHandle* next;
+  LRUHandle* prev;
+  size_t total_charge;  // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  // The hash of key(). Used for fast sharding and comparisons.
+  uint32_t hash;
+  // The number of external refs to this entry. The cache itself is not counted.
+  uint32_t refs;
+
+  // Mutable flags - access controlled by mutex
+  // The m_ and M_ prefixes (and im_ and IM_ later) are to hopefully avoid
+  // checking an M_ flag on im_flags or an IM_ flag on m_flags.
+  uint8_t m_flags;
+  enum MFlags : uint8_t {
+    // Whether this entry is referenced by the hash table.
+    M_IN_CACHE = (1 << 0),
+    // Whether this entry has had any lookups (hits).
+    M_HAS_HIT = (1 << 1),
+    // Whether this entry is in high-pri pool.
+    M_IN_HIGH_PRI_POOL = (1 << 2),
+    // Whether this entry is in low-pri pool.
+    M_IN_LOW_PRI_POOL = (1 << 3),
+  };
+
+  // "Immutable" flags - only set in single-threaded context and then
+  // can be accessed without mutex
+  uint8_t im_flags;
+  enum ImFlags : uint8_t {
+    // Whether this entry is high priority entry.
+    IM_IS_HIGH_PRI = (1 << 0),
+    // Whether this entry is low priority entry.
+    IM_IS_LOW_PRI = (1 << 1),
+    // Can this be inserted into the secondary cache.
+    IM_IS_SECONDARY_CACHE_COMPATIBLE = (1 << 2),
+    // Is the handle still being read from a lower tier.
+    IM_IS_PENDING = (1 << 3),
+    // Whether this handle is still in a lower tier
+    IM_IS_IN_SECONDARY_CACHE = (1 << 4),
+    // Marks result handles that should not be inserted into cache
+    IM_IS_STANDALONE = (1 << 5),
+  };
+
+  // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
+  char key_data[1];
+
+  Slice key() const { return Slice(key_data, key_length); }
+
+  // For HandleImpl concept
+  uint32_t GetHash() const { return hash; }
+
+  // Increase the reference count by 1.
+  void Ref() { refs++; }
+
+  // Just reduce the reference count by 1. Return true if it was last reference.
+  bool Unref() {
+    assert(refs > 0);
+    refs--;
+    return refs == 0;
+  }
+
+  // Return true if there are external refs, false otherwise.
+  bool HasRefs() const { return refs > 0; }
+
+  bool InCache() const { return m_flags & M_IN_CACHE; }
+  bool IsHighPri() const { return im_flags & IM_IS_HIGH_PRI; }
+  bool InHighPriPool() const { return m_flags & M_IN_HIGH_PRI_POOL; }
+  bool IsLowPri() const { return im_flags & IM_IS_LOW_PRI; }
+  bool InLowPriPool() const { return m_flags & M_IN_LOW_PRI_POOL; }
+  bool HasHit() const { return m_flags & M_HAS_HIT; }
+  bool IsSecondaryCacheCompatible() const {
+    return im_flags & IM_IS_SECONDARY_CACHE_COMPATIBLE;
+  }
+  bool IsPending() const { return im_flags & IM_IS_PENDING; }
+  bool IsInSecondaryCache() const {
+    return im_flags & IM_IS_IN_SECONDARY_CACHE;
+  }
+  bool IsStandalone() const { return im_flags & IM_IS_STANDALONE; }
+
+  void SetInCache(bool in_cache) {
+    if (in_cache) {
+      m_flags |= M_IN_CACHE;
+    } else {
+      m_flags &= ~M_IN_CACHE;
+    }
+  }
+
+  void SetPriority(Cache::Priority priority) {
+    if (priority == Cache::Priority::HIGH) {
+      im_flags |= IM_IS_HIGH_PRI;
+      im_flags &= ~IM_IS_LOW_PRI;
+    } else if (priority == Cache::Priority::LOW) {
+      im_flags &= ~IM_IS_HIGH_PRI;
+      im_flags |= IM_IS_LOW_PRI;
+    } else {
+      im_flags &= ~IM_IS_HIGH_PRI;
+      im_flags &= ~IM_IS_LOW_PRI;
+    }
+  }
+
+  void SetInHighPriPool(bool in_high_pri_pool) {
+    if (in_high_pri_pool) {
+      m_flags |= M_IN_HIGH_PRI_POOL;
+    } else {
+      m_flags &= ~M_IN_HIGH_PRI_POOL;
+    }
+  }
+
+  void SetInLowPriPool(bool in_low_pri_pool) {
+    if (in_low_pri_pool) {
+      m_flags |= M_IN_LOW_PRI_POOL;
+    } else {
+      m_flags &= ~M_IN_LOW_PRI_POOL;
+    }
+  }
+
+  void SetHit() { m_flags |= M_HAS_HIT; }
+
+  void SetSecondaryCacheCompatible(bool compat) {
+    if (compat) {
+      im_flags |= IM_IS_SECONDARY_CACHE_COMPATIBLE;
+    } else {
+      im_flags &= ~IM_IS_SECONDARY_CACHE_COMPATIBLE;
+    }
+  }
+
+  void SetIsPending(bool pending) {
+    if (pending) {
+      im_flags |= IM_IS_PENDING;
+    } else {
+      im_flags &= ~IM_IS_PENDING;
+    }
+  }
+
+  void SetIsInSecondaryCache(bool is_in_secondary_cache) {
+    if (is_in_secondary_cache) {
+      im_flags |= IM_IS_IN_SECONDARY_CACHE;
+    } else {
+      im_flags &= ~IM_IS_IN_SECONDARY_CACHE;
+    }
+  }
+
+  void SetIsStandalone(bool is_standalone) {
+    if (is_standalone) {
+      im_flags |= IM_IS_STANDALONE;
+    } else {
+      im_flags &= ~IM_IS_STANDALONE;
+    }
+  }
+
+  void Free() {
+    assert(refs == 0);
+
+    if (!IsSecondaryCacheCompatible() && info_.deleter) {
+      (*info_.deleter)(key(), value);
+    } else if (IsSecondaryCacheCompatible()) {
+      if (IsPending()) {
+        assert(sec_handle != nullptr);
+        SecondaryCacheResultHandle* tmp_sec_handle = sec_handle;
+        tmp_sec_handle->Wait();
+        value = tmp_sec_handle->Value();
+        delete tmp_sec_handle;
+      }
+      if (value) {
+        (*info_.helper->del_cb)(key(), value);
+      }
+    }
+
+    free(this);
+  }
+
+  inline size_t CalcuMetaCharge(
+      CacheMetadataChargePolicy metadata_charge_policy) const {
+    if (metadata_charge_policy != kFullChargeCacheMetadata) {
+      return 0;
+    } else {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      return malloc_usable_size(
+          const_cast<void*>(static_cast<const void*>(this)));
+#else
+      // This is the size that is used when a new handle is created.
+      return sizeof(LRUHandle) - 1 + key_length;
+#endif
+    }
+  }
+
+  // Calculate the memory usage by metadata.
+  inline void CalcTotalCharge(
+      size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
+    total_charge = charge + CalcuMetaCharge(metadata_charge_policy);
+  }
+
+  inline size_t GetCharge(
+      CacheMetadataChargePolicy metadata_charge_policy) const {
+    size_t meta_charge = CalcuMetaCharge(metadata_charge_policy);
+    assert(total_charge >= meta_charge);
+    return total_charge - meta_charge;
+  }
+};
+
+// We provide our own simple hash table since it removes a whole bunch
+// of porting hacks and is also faster than some of the built-in hash
+// table implementations in some of the compiler/runtime combinations
+// we have tested.  E.g., readrandom speeds up by ~5% over the g++
+// 4.4.3's builtin hashtable.
+class LRUHandleTable {
+ public:
+  explicit LRUHandleTable(int max_upper_hash_bits);
+  ~LRUHandleTable();
+
+  LRUHandle* Lookup(const Slice& key, uint32_t hash);
+  LRUHandle* Insert(LRUHandle* h);
+  LRUHandle* Remove(const Slice& key, uint32_t hash);
+
+  template <typename T>
+  void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) {
+    for (size_t i = index_begin; i < index_end; i++) {
+      LRUHandle* h = list_[i];
+      while (h != nullptr) {
+        auto n = h->next_hash;
+        assert(h->InCache());
+        func(h);
+        h = n;
+      }
+    }
+  }
+
+  int GetLengthBits() const { return length_bits_; }
+
+  size_t GetOccupancyCount() const { return elems_; }
+
+ private:
+  // Return a pointer to slot that points to a cache entry that
+  // matches key/hash.  If there is no such cache entry, return a
+  // pointer to the trailing slot in the corresponding linked list.
+  LRUHandle** FindPointer(const Slice& key, uint32_t hash);
+
+  void Resize();
+
+  // Number of hash bits (upper because lower bits used for sharding)
+  // used for table index. Length == 1 << length_bits_
+  int length_bits_;
+
+  // The table consists of an array of buckets where each bucket is
+  // a linked list of cache entries that hash into the bucket.
+  std::unique_ptr<LRUHandle*[]> list_;
+
+  // Number of elements currently in the table.
+  uint32_t elems_;
+
+  // Set from max_upper_hash_bits (see constructor).
+  const int max_length_bits_;
+};
+
+// A single shard of sharded cache.
+class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
+ public:
+  LRUCacheShard(size_t capacity, bool strict_capacity_limit,
+                double high_pri_pool_ratio, double low_pri_pool_ratio,
+                bool use_adaptive_mutex,
+                CacheMetadataChargePolicy metadata_charge_policy,
+                int max_upper_hash_bits, SecondaryCache* secondary_cache);
+
+ public:  // Type definitions expected as parameter to ShardedCache
+  using HandleImpl = LRUHandle;
+  using HashVal = uint32_t;
+  using HashCref = uint32_t;
+
+ public:  // Function definitions expected as parameter to ShardedCache
+  static inline HashVal ComputeHash(const Slice& key) {
+    return Lower32of64(GetSliceNPHash64(key));
+  }
+
+  // Separate from constructor so caller can easily make an array of LRUCache
+  // if current usage is more than new capacity, the function will attempt to
+  // free the needed space.
+  void SetCapacity(size_t capacity);
+
+  // Set the flag to reject insertion if cache if full.
+  void SetStrictCapacityLimit(bool strict_capacity_limit);
+
+  // Set percentage of capacity reserved for high-pri cache entries.
+  void SetHighPriorityPoolRatio(double high_pri_pool_ratio);
+
+  // Set percentage of capacity reserved for low-pri cache entries.
+  void SetLowPriorityPoolRatio(double low_pri_pool_ratio);
+
+  // Like Cache methods, but with an extra "hash" parameter.
+  inline Status Insert(const Slice& key, uint32_t hash, void* value,
+                       size_t charge, Cache::DeleterFn deleter,
+                       LRUHandle** handle, Cache::Priority priority) {
+    return Insert(key, hash, value, charge, deleter, nullptr, handle, priority);
+  }
+  inline Status Insert(const Slice& key, uint32_t hash, void* value,
+                       const Cache::CacheItemHelper* helper, size_t charge,
+                       LRUHandle** handle, Cache::Priority priority) {
+    assert(helper);
+    return Insert(key, hash, value, charge, nullptr, helper, handle, priority);
+  }
+  // If helper_cb is null, the values of the following arguments don't matter.
+  LRUHandle* Lookup(const Slice& key, uint32_t hash,
+                    const Cache::CacheItemHelper* helper,
+                    const Cache::CreateCallback& create_cb,
+                    Cache::Priority priority, bool wait, Statistics* stats);
+  inline LRUHandle* Lookup(const Slice& key, uint32_t hash) {
+    return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true,
+                  nullptr);
+  }
+  bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref);
+  bool IsReady(LRUHandle* /*handle*/);
+  void Wait(LRUHandle* /*handle*/) {}
+  bool Ref(LRUHandle* handle);
+  void Erase(const Slice& key, uint32_t hash);
+
+  // Although in some platforms the update of size_t is atomic, to make sure
+  // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
+  // protect them with mutex_.
+
+  size_t GetUsage() const;
+  size_t GetPinnedUsage() const;
+  size_t GetOccupancyCount() const;
+  size_t GetTableAddressCount() const;
+
+  void ApplyToSomeEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               DeleterFn deleter)>& callback,
+      size_t average_entries_per_lock, size_t* state);
+
+  void EraseUnRefEntries();
+
+ public:  // other function definitions
+  void TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri,
+                       LRUHandle** lru_bottom_pri);
+
+  // Retrieves number of elements in LRU, for unit test purpose only.
+  // Not threadsafe.
+  size_t TEST_GetLRUSize();
+
+  // Retrieves high pri pool ratio
+  double GetHighPriPoolRatio();
+
+  // Retrieves low pri pool ratio
+  double GetLowPriPoolRatio();
+
+  void AppendPrintableOptions(std::string& /*str*/) const;
+
+ private:
+  friend class LRUCache;
+  // Insert an item into the hash table and, if handle is null, insert into
+  // the LRU list. Older items are evicted as necessary. If the cache is full
+  // and free_handle_on_fail is true, the item is deleted and handle is set to
+  // nullptr.
+  Status InsertItem(LRUHandle* item, LRUHandle** handle,
+                    bool free_handle_on_fail);
+  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
+                DeleterFn deleter, const Cache::CacheItemHelper* helper,
+                LRUHandle** handle, Cache::Priority priority);
+  // Promote an item looked up from the secondary cache to the LRU cache.
+  // The item may be still in the secondary cache.
+  // It is only inserted into the hash table and not the LRU list, and only
+  // if the cache is not at full capacity, as is the case during Insert.  The
+  // caller should hold a reference on the LRUHandle. When the caller releases
+  // the last reference, the item is added to the LRU list.
+  // The item is promoted to the high pri or low pri pool as specified by the
+  // caller in Lookup.
+  void Promote(LRUHandle* e);
+  void LRU_Remove(LRUHandle* e);
+  void LRU_Insert(LRUHandle* e);
+
+  // Overflow the last entry in high-pri pool to low-pri pool until size of
+  // high-pri pool is no larger than the size specify by high_pri_pool_pct.
+  void MaintainPoolSize();
+
+  // Free some space following strict LRU policy until enough space
+  // to hold (usage_ + charge) is freed or the lru list is empty
+  // This function is not thread safe - it needs to be executed while
+  // holding the mutex_.
+  void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted);
+
+  // Try to insert the evicted handles into the secondary cache.
+  void TryInsertIntoSecondaryCache(autovector<LRUHandle*> evicted_handles);
+
+  // Initialized before use.
+  size_t capacity_;
+
+  // Memory size for entries in high-pri pool.
+  size_t high_pri_pool_usage_;
+
+  // Memory size for entries in low-pri pool.
+  size_t low_pri_pool_usage_;
+
+  // Whether to reject insertion if cache reaches its full capacity.
+  bool strict_capacity_limit_;
+
+  // Ratio of capacity reserved for high priority cache entries.
+  double high_pri_pool_ratio_;
+
+  // High-pri pool size, equals to capacity * high_pri_pool_ratio.
+  // Remember the value to avoid recomputing each time.
+  double high_pri_pool_capacity_;
+
+  // Ratio of capacity reserved for low priority cache entries.
+  double low_pri_pool_ratio_;
+
+  // Low-pri pool size, equals to capacity * low_pri_pool_ratio.
+  // Remember the value to avoid recomputing each time.
+  double low_pri_pool_capacity_;
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
+  // LRU contains items which can be evicted, ie reference only by cache
+  LRUHandle lru_;
+
+  // Pointer to head of low-pri pool in LRU list.
+  LRUHandle* lru_low_pri_;
+
+  // Pointer to head of bottom-pri pool in LRU list.
+  LRUHandle* lru_bottom_pri_;
+
+  // ------------^^^^^^^^^^^^^-----------
+  // Not frequently modified data members
+  // ------------------------------------
+  //
+  // We separate data members that are updated frequently from the ones that
+  // are not frequently updated so that they don't share the same cache line
+  // which will lead into false cache sharing
+  //
+  // ------------------------------------
+  // Frequently modified data members
+  // ------------vvvvvvvvvvvvv-----------
+  LRUHandleTable table_;
+
+  // Memory size for entries residing in the cache.
+  size_t usage_;
+
+  // Memory size for entries residing only in the LRU list.
+  size_t lru_usage_;
+
+  // mutex_ protects the following state.
+  // We don't count mutex_ as the cache's internal state so semantically we
+  // don't mind mutex_ invoking the non-const actions.
+  mutable DMutex mutex_;
+
+  // Owned by LRUCache
+  SecondaryCache* secondary_cache_;
+};
+
+class LRUCache
+#ifdef NDEBUG
+    final
+#endif
+    : public ShardedCache<LRUCacheShard> {
+ public:
+  LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+           double high_pri_pool_ratio, double low_pri_pool_ratio,
+           std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+           bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+           CacheMetadataChargePolicy metadata_charge_policy =
+               kDontChargeCacheMetadata,
+           std::shared_ptr<SecondaryCache> secondary_cache = nullptr);
+  const char* Name() const override { return "LRUCache"; }
+  void* Value(Handle* handle) override;
+  size_t GetCharge(Handle* handle) const override;
+  DeleterFn GetDeleter(Handle* handle) const override;
+  void WaitAll(std::vector<Handle*>& handles) override;
+
+  // Retrieves number of elements in LRU, for unit test purpose only.
+  size_t TEST_GetLRUSize();
+  // Retrieves high pri pool ratio.
+  double GetHighPriPoolRatio();
+
+  void AppendPrintableOptions(std::string& str) const override;
+
+ private:
+  std::shared_ptr<SecondaryCache> secondary_cache_;
+};
+
+}  // namespace lru_cache
+
+using LRUCache = lru_cache::LRUCache;
+using LRUHandle = lru_cache::LRUHandle;
+using LRUCacheShard = lru_cache::LRUCacheShard;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/lru_cache_test.cc b/src/rocksdb/cache/lru_cache_test.cc
new file mode 100644
index 000000000..7904a196d
--- /dev/null
+++ b/src/rocksdb/cache/lru_cache_test.cc
@@ -0,0 +1,2624 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/lru_cache.h"
+
+#include <string>
+#include <vector>
+
+#include "cache/cache_key.h"
+#include "cache/clock_cache.h"
+#include "db/db_test_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/utilities/cache_dump_load.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/random.h"
+#include "utilities/cache_dump_load_impl.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LRUCacheTest : public testing::Test {
+ public:
+  LRUCacheTest() {}
+  ~LRUCacheTest() override { DeleteCache(); }
+
+  void DeleteCache() {
+    if (cache_ != nullptr) {
+      cache_->~LRUCacheShard();
+      port::cacheline_aligned_free(cache_);
+      cache_ = nullptr;
+    }
+  }
+
+  void NewCache(size_t capacity, double high_pri_pool_ratio = 0.0,
+                double low_pri_pool_ratio = 1.0,
+                bool use_adaptive_mutex = kDefaultToAdaptiveMutex) {
+    DeleteCache();
+    cache_ = reinterpret_cast<LRUCacheShard*>(
+        port::cacheline_aligned_alloc(sizeof(LRUCacheShard)));
+    new (cache_) LRUCacheShard(capacity, /*strict_capacity_limit=*/false,
+                               high_pri_pool_ratio, low_pri_pool_ratio,
+                               use_adaptive_mutex, kDontChargeCacheMetadata,
+                               /*max_upper_hash_bits=*/24,
+                               /*secondary_cache=*/nullptr);
+  }
+
+  void Insert(const std::string& key,
+              Cache::Priority priority = Cache::Priority::LOW) {
+    EXPECT_OK(cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/,
+                             nullptr /*deleter*/, nullptr /*handle*/,
+                             priority));
+  }
+
+  void Insert(char key, Cache::Priority priority = Cache::Priority::LOW) {
+    Insert(std::string(1, key), priority);
+  }
+
+  bool Lookup(const std::string& key) {
+    auto handle = cache_->Lookup(key, 0 /*hash*/);
+    if (handle) {
+      cache_->Release(handle, true /*useful*/, false /*erase*/);
+      return true;
+    }
+    return false;
+  }
+
+  bool Lookup(char key) { return Lookup(std::string(1, key)); }
+
+  void Erase(const std::string& key) { cache_->Erase(key, 0 /*hash*/); }
+
+  void ValidateLRUList(std::vector<std::string> keys,
+                       size_t num_high_pri_pool_keys = 0,
+                       size_t num_low_pri_pool_keys = 0,
+                       size_t num_bottom_pri_pool_keys = 0) {
+    LRUHandle* lru;
+    LRUHandle* lru_low_pri;
+    LRUHandle* lru_bottom_pri;
+    cache_->TEST_GetLRUList(&lru, &lru_low_pri, &lru_bottom_pri);
+
+    LRUHandle* iter = lru;
+
+    bool in_low_pri_pool = false;
+    bool in_high_pri_pool = false;
+
+    size_t high_pri_pool_keys = 0;
+    size_t low_pri_pool_keys = 0;
+    size_t bottom_pri_pool_keys = 0;
+
+    if (iter == lru_bottom_pri) {
+      in_low_pri_pool = true;
+      in_high_pri_pool = false;
+    }
+    if (iter == lru_low_pri) {
+      in_low_pri_pool = false;
+      in_high_pri_pool = true;
+    }
+
+    for (const auto& key : keys) {
+      iter = iter->next;
+      ASSERT_NE(lru, iter);
+      ASSERT_EQ(key, iter->key().ToString());
+      ASSERT_EQ(in_high_pri_pool, iter->InHighPriPool());
+      ASSERT_EQ(in_low_pri_pool, iter->InLowPriPool());
+      if (in_high_pri_pool) {
+        ASSERT_FALSE(iter->InLowPriPool());
+        high_pri_pool_keys++;
+      } else if (in_low_pri_pool) {
+        ASSERT_FALSE(iter->InHighPriPool());
+        low_pri_pool_keys++;
+      } else {
+        bottom_pri_pool_keys++;
+      }
+      if (iter == lru_bottom_pri) {
+        ASSERT_FALSE(in_low_pri_pool);
+        ASSERT_FALSE(in_high_pri_pool);
+        in_low_pri_pool = true;
+        in_high_pri_pool = false;
+      }
+      if (iter == lru_low_pri) {
+        ASSERT_TRUE(in_low_pri_pool);
+        ASSERT_FALSE(in_high_pri_pool);
+        in_low_pri_pool = false;
+        in_high_pri_pool = true;
+      }
+    }
+    ASSERT_EQ(lru, iter->next);
+    ASSERT_FALSE(in_low_pri_pool);
+    ASSERT_TRUE(in_high_pri_pool);
+    ASSERT_EQ(num_high_pri_pool_keys, high_pri_pool_keys);
+    ASSERT_EQ(num_low_pri_pool_keys, low_pri_pool_keys);
+    ASSERT_EQ(num_bottom_pri_pool_keys, bottom_pri_pool_keys);
+  }
+
+ private:
+  LRUCacheShard* cache_ = nullptr;
+};
+
+TEST_F(LRUCacheTest, BasicLRU) {
+  NewCache(5);
+  for (char ch = 'a'; ch <= 'e'; ch++) {
+    Insert(ch);
+  }
+  ValidateLRUList({"a", "b", "c", "d", "e"}, 0, 5);
+  for (char ch = 'x'; ch <= 'z'; ch++) {
+    Insert(ch);
+  }
+  ValidateLRUList({"d", "e", "x", "y", "z"}, 0, 5);
+  ASSERT_FALSE(Lookup("b"));
+  ValidateLRUList({"d", "e", "x", "y", "z"}, 0, 5);
+  ASSERT_TRUE(Lookup("e"));
+  ValidateLRUList({"d", "x", "y", "z", "e"}, 0, 5);
+  ASSERT_TRUE(Lookup("z"));
+  ValidateLRUList({"d", "x", "y", "e", "z"}, 0, 5);
+  Erase("x");
+  ValidateLRUList({"d", "y", "e", "z"}, 0, 4);
+  ASSERT_TRUE(Lookup("d"));
+  ValidateLRUList({"y", "e", "z", "d"}, 0, 4);
+  Insert("u");
+  ValidateLRUList({"y", "e", "z", "d", "u"}, 0, 5);
+  Insert("v");
+  ValidateLRUList({"e", "z", "d", "u", "v"}, 0, 5);
+}
+
+TEST_F(LRUCacheTest, LowPriorityMidpointInsertion) {
+  // Allocate 2 cache entries to high-pri pool and 3 to low-pri pool.
+  NewCache(5, /* high_pri_pool_ratio */ 0.40, /* low_pri_pool_ratio */ 0.60);
+
+  Insert("a", Cache::Priority::LOW);
+  Insert("b", Cache::Priority::LOW);
+  Insert("c", Cache::Priority::LOW);
+  Insert("x", Cache::Priority::HIGH);
+  Insert("y", Cache::Priority::HIGH);
+  ValidateLRUList({"a", "b", "c", "x", "y"}, 2, 3);
+
+  // Low-pri entries inserted to the tail of low-pri list (the midpoint).
+  // After lookup, it will move to the tail of the full list.
+  Insert("d", Cache::Priority::LOW);
+  ValidateLRUList({"b", "c", "d", "x", "y"}, 2, 3);
+  ASSERT_TRUE(Lookup("d"));
+  ValidateLRUList({"b", "c", "x", "y", "d"}, 2, 3);
+
+  // High-pri entries will be inserted to the tail of full list.
+  Insert("z", Cache::Priority::HIGH);
+  ValidateLRUList({"c", "x", "y", "d", "z"}, 2, 3);
+}
+
+TEST_F(LRUCacheTest, BottomPriorityMidpointInsertion) {
+  // Allocate 2 cache entries to high-pri pool and 2 to low-pri pool.
+  NewCache(6, /* high_pri_pool_ratio */ 0.35, /* low_pri_pool_ratio */ 0.35);
+
+  Insert("a", Cache::Priority::BOTTOM);
+  Insert("b", Cache::Priority::BOTTOM);
+  Insert("i", Cache::Priority::LOW);
+  Insert("j", Cache::Priority::LOW);
+  Insert("x", Cache::Priority::HIGH);
+  Insert("y", Cache::Priority::HIGH);
+  ValidateLRUList({"a", "b", "i", "j", "x", "y"}, 2, 2, 2);
+
+  // Low-pri entries will be inserted to the tail of low-pri list (the
+  // midpoint). After lookup, 'k' will move to the tail of the full list, and
+  // 'x' will spill over to the low-pri pool.
+  Insert("k", Cache::Priority::LOW);
+  ValidateLRUList({"b", "i", "j", "k", "x", "y"}, 2, 2, 2);
+  ASSERT_TRUE(Lookup("k"));
+  ValidateLRUList({"b", "i", "j", "x", "y", "k"}, 2, 2, 2);
+
+  // High-pri entries will be inserted to the tail of full list. Although y was
+  // inserted with high priority, it got spilled over to the low-pri pool. As
+  // a result, j also got spilled over to the bottom-pri pool.
+  Insert("z", Cache::Priority::HIGH);
+  ValidateLRUList({"i", "j", "x", "y", "k", "z"}, 2, 2, 2);
+  Erase("x");
+  ValidateLRUList({"i", "j", "y", "k", "z"}, 2, 1, 2);
+  Erase("y");
+  ValidateLRUList({"i", "j", "k", "z"}, 2, 0, 2);
+
+  // Bottom-pri entries will be inserted to the tail of bottom-pri list.
+  Insert("c", Cache::Priority::BOTTOM);
+  ValidateLRUList({"i", "j", "c", "k", "z"}, 2, 0, 3);
+  Insert("d", Cache::Priority::BOTTOM);
+  ValidateLRUList({"i", "j", "c", "d", "k", "z"}, 2, 0, 4);
+  Insert("e", Cache::Priority::BOTTOM);
+  ValidateLRUList({"j", "c", "d", "e", "k", "z"}, 2, 0, 4);
+
+  // Low-pri entries will be inserted to the tail of low-pri list (the
+  // midpoint).
+  Insert("l", Cache::Priority::LOW);
+  ValidateLRUList({"c", "d", "e", "l", "k", "z"}, 2, 1, 3);
+  Insert("m", Cache::Priority::LOW);
+  ValidateLRUList({"d", "e", "l", "m", "k", "z"}, 2, 2, 2);
+
+  Erase("k");
+  ValidateLRUList({"d", "e", "l", "m", "z"}, 1, 2, 2);
+  Erase("z");
+  ValidateLRUList({"d", "e", "l", "m"}, 0, 2, 2);
+
+  // Bottom-pri entries will be inserted to the tail of bottom-pri list.
+  Insert("f", Cache::Priority::BOTTOM);
+  ValidateLRUList({"d", "e", "f", "l", "m"}, 0, 2, 3);
+  Insert("g", Cache::Priority::BOTTOM);
+  ValidateLRUList({"d", "e", "f", "g", "l", "m"}, 0, 2, 4);
+
+  // High-pri entries will be inserted to the tail of full list.
+  Insert("o", Cache::Priority::HIGH);
+  ValidateLRUList({"e", "f", "g", "l", "m", "o"}, 1, 2, 3);
+  Insert("p", Cache::Priority::HIGH);
+  ValidateLRUList({"f", "g", "l", "m", "o", "p"}, 2, 2, 2);
+}
+
+TEST_F(LRUCacheTest, EntriesWithPriority) {
+  // Allocate 2 cache entries to high-pri pool and 2 to low-pri pool.
+  NewCache(6, /* high_pri_pool_ratio */ 0.35, /* low_pri_pool_ratio */ 0.35);
+
+  Insert("a", Cache::Priority::LOW);
+  Insert("b", Cache::Priority::LOW);
+  ValidateLRUList({"a", "b"}, 0, 2, 0);
+  // Low-pri entries can overflow to bottom-pri pool.
+  Insert("c", Cache::Priority::LOW);
+  ValidateLRUList({"a", "b", "c"}, 0, 2, 1);
+
+  // Bottom-pri entries can take high-pri pool capacity if available
+  Insert("t", Cache::Priority::LOW);
+  Insert("u", Cache::Priority::LOW);
+  ValidateLRUList({"a", "b", "c", "t", "u"}, 0, 2, 3);
+  Insert("v", Cache::Priority::LOW);
+  ValidateLRUList({"a", "b", "c", "t", "u", "v"}, 0, 2, 4);
+  Insert("w", Cache::Priority::LOW);
+  ValidateLRUList({"b", "c", "t", "u", "v", "w"}, 0, 2, 4);
+
+  Insert("X", Cache::Priority::HIGH);
+  Insert("Y", Cache::Priority::HIGH);
+  ValidateLRUList({"t", "u", "v", "w", "X", "Y"}, 2, 2, 2);
+
+  // After lookup, the high-pri entry 'X' got spilled over to the low-pri pool.
+  // The low-pri entry 'v' got spilled over to the bottom-pri pool.
+  Insert("Z", Cache::Priority::HIGH);
+  ValidateLRUList({"u", "v", "w", "X", "Y", "Z"}, 2, 2, 2);
+
+  // Low-pri entries will be inserted to head of low-pri pool.
+  Insert("a", Cache::Priority::LOW);
+  ValidateLRUList({"v", "w", "X", "a", "Y", "Z"}, 2, 2, 2);
+
+  // After lookup, the high-pri entry 'Y' got spilled over to the low-pri pool.
+  // The low-pri entry 'X' got spilled over to the bottom-pri pool.
+  ASSERT_TRUE(Lookup("v"));
+  ValidateLRUList({"w", "X", "a", "Y", "Z", "v"}, 2, 2, 2);
+
+  // After lookup, the high-pri entry 'Z' got spilled over to the low-pri pool.
+  // The low-pri entry 'a' got spilled over to the bottom-pri pool.
+  ASSERT_TRUE(Lookup("X"));
+  ValidateLRUList({"w", "a", "Y", "Z", "v", "X"}, 2, 2, 2);
+
+  // After lookup, the low pri entry 'Z' got promoted back to high-pri pool. The
+  // high-pri entry 'v' got spilled over to the low-pri pool.
+  ASSERT_TRUE(Lookup("Z"));
+  ValidateLRUList({"w", "a", "Y", "v", "X", "Z"}, 2, 2, 2);
+
+  Erase("Y");
+  ValidateLRUList({"w", "a", "v", "X", "Z"}, 2, 1, 2);
+  Erase("X");
+  ValidateLRUList({"w", "a", "v", "Z"}, 1, 1, 2);
+
+  Insert("d", Cache::Priority::LOW);
+  Insert("e", Cache::Priority::LOW);
+  ValidateLRUList({"w", "a", "v", "d", "e", "Z"}, 1, 2, 3);
+
+  Insert("f", Cache::Priority::LOW);
+  Insert("g", Cache::Priority::LOW);
+  ValidateLRUList({"v", "d", "e", "f", "g", "Z"}, 1, 2, 3);
+  ASSERT_TRUE(Lookup("d"));
+  ValidateLRUList({"v", "e", "f", "g", "Z", "d"}, 2, 2, 2);
+
+  // Erase some entries.
+  Erase("e");
+  Erase("f");
+  Erase("Z");
+  ValidateLRUList({"v", "g", "d"}, 1, 1, 1);
+
+  // Bottom-pri entries can take low- and high-pri pool capacity if available
+  Insert("o", Cache::Priority::BOTTOM);
+  ValidateLRUList({"v", "o", "g", "d"}, 1, 1, 2);
+  Insert("p", Cache::Priority::BOTTOM);
+  ValidateLRUList({"v", "o", "p", "g", "d"}, 1, 1, 3);
+  Insert("q", Cache::Priority::BOTTOM);
+  ValidateLRUList({"v", "o", "p", "q", "g", "d"}, 1, 1, 4);
+
+  // High-pri entries can overflow to low-pri pool, and bottom-pri entries will
+  // be evicted.
+  Insert("x", Cache::Priority::HIGH);
+  ValidateLRUList({"o", "p", "q", "g", "d", "x"}, 2, 1, 3);
+  Insert("y", Cache::Priority::HIGH);
+  ValidateLRUList({"p", "q", "g", "d", "x", "y"}, 2, 2, 2);
+  Insert("z", Cache::Priority::HIGH);
+  ValidateLRUList({"q", "g", "d", "x", "y", "z"}, 2, 2, 2);
+
+  // 'g' is bottom-pri before this lookup, it will be inserted to head of
+  // high-pri pool after lookup.
+  ASSERT_TRUE(Lookup("g"));
+  ValidateLRUList({"q", "d", "x", "y", "z", "g"}, 2, 2, 2);
+
+  // High-pri entries will be inserted to head of high-pri pool after lookup.
+  ASSERT_TRUE(Lookup("z"));
+  ValidateLRUList({"q", "d", "x", "y", "g", "z"}, 2, 2, 2);
+
+  // Bottom-pri entries will be inserted to head of high-pri pool after lookup.
+  ASSERT_TRUE(Lookup("d"));
+  ValidateLRUList({"q", "x", "y", "g", "z", "d"}, 2, 2, 2);
+
+  // Bottom-pri entries will be inserted to the tail of bottom-pri list.
+  Insert("m", Cache::Priority::BOTTOM);
+  ValidateLRUList({"x", "m", "y", "g", "z", "d"}, 2, 2, 2);
+
+  // Bottom-pri entries will be inserted to head of high-pri pool after lookup.
+  ASSERT_TRUE(Lookup("m"));
+  ValidateLRUList({"x", "y", "g", "z", "d", "m"}, 2, 2, 2);
+}
+
+namespace clock_cache {
+
+class ClockCacheTest : public testing::Test {
+ public:
+  using Shard = HyperClockCache::Shard;
+  using Table = HyperClockTable;
+  using HandleImpl = Shard::HandleImpl;
+
+  ClockCacheTest() {}
+  ~ClockCacheTest() override { DeleteShard(); }
+
+  void DeleteShard() {
+    if (shard_ != nullptr) {
+      shard_->~ClockCacheShard();
+      port::cacheline_aligned_free(shard_);
+      shard_ = nullptr;
+    }
+  }
+
+  void NewShard(size_t capacity, bool strict_capacity_limit = true) {
+    DeleteShard();
+    shard_ =
+        reinterpret_cast<Shard*>(port::cacheline_aligned_alloc(sizeof(Shard)));
+
+    Table::Opts opts;
+    opts.estimated_value_size = 1;
+    new (shard_)
+        Shard(capacity, strict_capacity_limit, kDontChargeCacheMetadata, opts);
+  }
+
+  Status Insert(const UniqueId64x2& hashed_key,
+                Cache::Priority priority = Cache::Priority::LOW) {
+    return shard_->Insert(TestKey(hashed_key), hashed_key, nullptr /*value*/,
+                          1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/,
+                          priority);
+  }
+
+  Status Insert(char key, Cache::Priority priority = Cache::Priority::LOW) {
+    return Insert(TestHashedKey(key), priority);
+  }
+
+  Status InsertWithLen(char key, size_t len) {
+    std::string skey(len, key);
+    return shard_->Insert(skey, TestHashedKey(key), nullptr /*value*/,
+                          1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/,
+                          Cache::Priority::LOW);
+  }
+
+  bool Lookup(const Slice& key, const UniqueId64x2& hashed_key,
+              bool useful = true) {
+    auto handle = shard_->Lookup(key, hashed_key);
+    if (handle) {
+      shard_->Release(handle, useful, /*erase_if_last_ref=*/false);
+      return true;
+    }
+    return false;
+  }
+
+  bool Lookup(const UniqueId64x2& hashed_key, bool useful = true) {
+    return Lookup(TestKey(hashed_key), hashed_key, useful);
+  }
+
+  bool Lookup(char key, bool useful = true) {
+    return Lookup(TestHashedKey(key), useful);
+  }
+
+  void Erase(char key) {
+    UniqueId64x2 hashed_key = TestHashedKey(key);
+    shard_->Erase(TestKey(hashed_key), hashed_key);
+  }
+
+  static inline Slice TestKey(const UniqueId64x2& hashed_key) {
+    return Slice(reinterpret_cast<const char*>(&hashed_key), 16U);
+  }
+
+  static inline UniqueId64x2 TestHashedKey(char key) {
+    // For testing hash near-collision behavior, put the variance in
+    // hashed_key in bits that are unlikely to be used as hash bits.
+    return {(static_cast<uint64_t>(key) << 56) + 1234U, 5678U};
+  }
+
+  Shard* shard_ = nullptr;
+};
+
+TEST_F(ClockCacheTest, Misc) {
+  NewShard(3);
+
+  // Key size stuff
+  EXPECT_OK(InsertWithLen('a', 16));
+  EXPECT_NOK(InsertWithLen('b', 15));
+  EXPECT_OK(InsertWithLen('b', 16));
+  EXPECT_NOK(InsertWithLen('c', 17));
+  EXPECT_NOK(InsertWithLen('d', 1000));
+  EXPECT_NOK(InsertWithLen('e', 11));
+  EXPECT_NOK(InsertWithLen('f', 0));
+
+  // Some of this is motivated by code coverage
+  std::string wrong_size_key(15, 'x');
+  EXPECT_FALSE(Lookup(wrong_size_key, TestHashedKey('x')));
+  EXPECT_FALSE(shard_->Ref(nullptr));
+  EXPECT_FALSE(shard_->Release(nullptr));
+  shard_->Erase(wrong_size_key, TestHashedKey('x'));  // no-op
+}
+
+TEST_F(ClockCacheTest, Limits) {
+  constexpr size_t kCapacity = 3;
+  NewShard(kCapacity, false /*strict_capacity_limit*/);
+  for (bool strict_capacity_limit : {false, true, false}) {
+    SCOPED_TRACE("strict_capacity_limit = " +
+                 std::to_string(strict_capacity_limit));
+
+    // Also tests switching between strict limit and not
+    shard_->SetStrictCapacityLimit(strict_capacity_limit);
+
+    UniqueId64x2 hkey = TestHashedKey('x');
+
+    // Single entry charge beyond capacity
+    {
+      Status s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
+                                5 /*charge*/, nullptr /*deleter*/,
+                                nullptr /*handle*/, Cache::Priority::LOW);
+      if (strict_capacity_limit) {
+        EXPECT_TRUE(s.IsMemoryLimit());
+      } else {
+        EXPECT_OK(s);
+      }
+    }
+
+    // Single entry fills capacity
+    {
+      HandleImpl* h;
+      ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
+                               3 /*charge*/, nullptr /*deleter*/, &h,
+                               Cache::Priority::LOW));
+      // Try to insert more
+      Status s = Insert('a');
+      if (strict_capacity_limit) {
+        EXPECT_TRUE(s.IsMemoryLimit());
+      } else {
+        EXPECT_OK(s);
+      }
+      // Release entry filling capacity.
+      // Cover useful = false case.
+      shard_->Release(h, false /*useful*/, false /*erase_if_last_ref*/);
+    }
+
+    // Insert more than table size can handle to exceed occupancy limit.
+    // (Cleverly using mostly zero-charge entries, but some non-zero to
+    // verify usage tracking on detached entries.)
+    {
+      size_t n = shard_->GetTableAddressCount() + 1;
+      std::unique_ptr<HandleImpl* []> ha { new HandleImpl* [n] {} };
+      Status s;
+      for (size_t i = 0; i < n && s.ok(); ++i) {
+        hkey[1] = i;
+        s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
+                           (i + kCapacity < n) ? 0 : 1 /*charge*/,
+                           nullptr /*deleter*/, &ha[i], Cache::Priority::LOW);
+        if (i == 0) {
+          EXPECT_OK(s);
+        }
+      }
+      if (strict_capacity_limit) {
+        EXPECT_TRUE(s.IsMemoryLimit());
+      } else {
+        EXPECT_OK(s);
+      }
+      // Same result if not keeping a reference
+      s = Insert('a');
+      if (strict_capacity_limit) {
+        EXPECT_TRUE(s.IsMemoryLimit());
+      } else {
+        EXPECT_OK(s);
+      }
+
+      // Regardless, we didn't allow table to actually get full
+      EXPECT_LT(shard_->GetOccupancyCount(), shard_->GetTableAddressCount());
+
+      // Release handles
+      for (size_t i = 0; i < n; ++i) {
+        if (ha[i]) {
+          shard_->Release(ha[i]);
+        }
+      }
+    }
+  }
+}
+
+TEST_F(ClockCacheTest, ClockEvictionTest) {
+  for (bool strict_capacity_limit : {false, true}) {
+    SCOPED_TRACE("strict_capacity_limit = " +
+                 std::to_string(strict_capacity_limit));
+
+    NewShard(6, strict_capacity_limit);
+    EXPECT_OK(Insert('a', Cache::Priority::BOTTOM));
+    EXPECT_OK(Insert('b', Cache::Priority::LOW));
+    EXPECT_OK(Insert('c', Cache::Priority::HIGH));
+    EXPECT_OK(Insert('d', Cache::Priority::BOTTOM));
+    EXPECT_OK(Insert('e', Cache::Priority::LOW));
+    EXPECT_OK(Insert('f', Cache::Priority::HIGH));
+
+    EXPECT_TRUE(Lookup('a', /*use*/ false));
+    EXPECT_TRUE(Lookup('b', /*use*/ false));
+    EXPECT_TRUE(Lookup('c', /*use*/ false));
+    EXPECT_TRUE(Lookup('d', /*use*/ false));
+    EXPECT_TRUE(Lookup('e', /*use*/ false));
+    EXPECT_TRUE(Lookup('f', /*use*/ false));
+
+    // Ensure bottom are evicted first, even if new entries are low
+    EXPECT_OK(Insert('g', Cache::Priority::LOW));
+    EXPECT_OK(Insert('h', Cache::Priority::LOW));
+
+    EXPECT_FALSE(Lookup('a', /*use*/ false));
+    EXPECT_TRUE(Lookup('b', /*use*/ false));
+    EXPECT_TRUE(Lookup('c', /*use*/ false));
+    EXPECT_FALSE(Lookup('d', /*use*/ false));
+    EXPECT_TRUE(Lookup('e', /*use*/ false));
+    EXPECT_TRUE(Lookup('f', /*use*/ false));
+    // Mark g & h useful
+    EXPECT_TRUE(Lookup('g', /*use*/ true));
+    EXPECT_TRUE(Lookup('h', /*use*/ true));
+
+    // Then old LOW entries
+    EXPECT_OK(Insert('i', Cache::Priority::LOW));
+    EXPECT_OK(Insert('j', Cache::Priority::LOW));
+
+    EXPECT_FALSE(Lookup('b', /*use*/ false));
+    EXPECT_TRUE(Lookup('c', /*use*/ false));
+    EXPECT_FALSE(Lookup('e', /*use*/ false));
+    EXPECT_TRUE(Lookup('f', /*use*/ false));
+    // Mark g & h useful once again
+    EXPECT_TRUE(Lookup('g', /*use*/ true));
+    EXPECT_TRUE(Lookup('h', /*use*/ true));
+    EXPECT_TRUE(Lookup('i', /*use*/ false));
+    EXPECT_TRUE(Lookup('j', /*use*/ false));
+
+    // Then old HIGH entries
+    EXPECT_OK(Insert('k', Cache::Priority::LOW));
+    EXPECT_OK(Insert('l', Cache::Priority::LOW));
+
+    EXPECT_FALSE(Lookup('c', /*use*/ false));
+    EXPECT_FALSE(Lookup('f', /*use*/ false));
+    EXPECT_TRUE(Lookup('g', /*use*/ false));
+    EXPECT_TRUE(Lookup('h', /*use*/ false));
+    EXPECT_TRUE(Lookup('i', /*use*/ false));
+    EXPECT_TRUE(Lookup('j', /*use*/ false));
+    EXPECT_TRUE(Lookup('k', /*use*/ false));
+    EXPECT_TRUE(Lookup('l', /*use*/ false));
+
+    // Then the (roughly) least recently useful
+    EXPECT_OK(Insert('m', Cache::Priority::HIGH));
+    EXPECT_OK(Insert('n', Cache::Priority::HIGH));
+
+    EXPECT_TRUE(Lookup('g', /*use*/ false));
+    EXPECT_TRUE(Lookup('h', /*use*/ false));
+    EXPECT_FALSE(Lookup('i', /*use*/ false));
+    EXPECT_FALSE(Lookup('j', /*use*/ false));
+    EXPECT_TRUE(Lookup('k', /*use*/ false));
+    EXPECT_TRUE(Lookup('l', /*use*/ false));
+
+    // Now try changing capacity down
+    shard_->SetCapacity(4);
+    // Insert to ensure evictions happen
+    EXPECT_OK(Insert('o', Cache::Priority::LOW));
+    EXPECT_OK(Insert('p', Cache::Priority::LOW));
+
+    EXPECT_FALSE(Lookup('g', /*use*/ false));
+    EXPECT_FALSE(Lookup('h', /*use*/ false));
+    EXPECT_FALSE(Lookup('k', /*use*/ false));
+    EXPECT_FALSE(Lookup('l', /*use*/ false));
+    EXPECT_TRUE(Lookup('m', /*use*/ false));
+    EXPECT_TRUE(Lookup('n', /*use*/ false));
+    EXPECT_TRUE(Lookup('o', /*use*/ false));
+    EXPECT_TRUE(Lookup('p', /*use*/ false));
+
+    // Now try changing capacity up
+    EXPECT_TRUE(Lookup('m', /*use*/ true));
+    EXPECT_TRUE(Lookup('n', /*use*/ true));
+    shard_->SetCapacity(6);
+    EXPECT_OK(Insert('q', Cache::Priority::HIGH));
+    EXPECT_OK(Insert('r', Cache::Priority::HIGH));
+    EXPECT_OK(Insert('s', Cache::Priority::HIGH));
+    EXPECT_OK(Insert('t', Cache::Priority::HIGH));
+
+    EXPECT_FALSE(Lookup('o', /*use*/ false));
+    EXPECT_FALSE(Lookup('p', /*use*/ false));
+    EXPECT_TRUE(Lookup('m', /*use*/ false));
+    EXPECT_TRUE(Lookup('n', /*use*/ false));
+    EXPECT_TRUE(Lookup('q', /*use*/ false));
+    EXPECT_TRUE(Lookup('r', /*use*/ false));
+    EXPECT_TRUE(Lookup('s', /*use*/ false));
+    EXPECT_TRUE(Lookup('t', /*use*/ false));
+  }
+}
+
+void IncrementIntDeleter(const Slice& /*key*/, void* value) {
+  *reinterpret_cast<int*>(value) += 1;
+}
+
+// Testing calls to CorrectNearOverflow in Release
+TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
+  NewShard(6, /*strict_capacity_limit*/ false);
+  HandleImpl* h;
+  int deleted = 0;
+  UniqueId64x2 hkey = TestHashedKey('x');
+  ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &deleted, 1,
+                           IncrementIntDeleter, &h, Cache::Priority::HIGH));
+
+  // Some large number outstanding
+  shard_->TEST_RefN(h, 123456789);
+  // Simulate many lookup/ref + release, plenty to overflow counters
+  for (int i = 0; i < 10000; ++i) {
+    shard_->TEST_RefN(h, 1234567);
+    shard_->TEST_ReleaseN(h, 1234567);
+  }
+  // Mark it invisible (to reach a different CorrectNearOverflow() in Release)
+  shard_->Erase(TestKey(hkey), hkey);
+  // Simulate many more lookup/ref + release (one-by-one would be too
+  // expensive for unit test)
+  for (int i = 0; i < 10000; ++i) {
+    shard_->TEST_RefN(h, 1234567);
+    shard_->TEST_ReleaseN(h, 1234567);
+  }
+  // Free all but last 1
+  shard_->TEST_ReleaseN(h, 123456789);
+  // Still alive
+  ASSERT_EQ(deleted, 0);
+  // Free last ref, which will finalize erasure
+  shard_->Release(h);
+  // Deleted
+  ASSERT_EQ(deleted, 1);
+}
+
+// This test is mostly to exercise some corner case logic, by forcing two
+// keys to have the same hash, and more
+TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
+  NewShard(6, /*strict_capacity_limit*/ false);
+  int deleted = 0;
+  UniqueId64x2 hkey1 = TestHashedKey('x');
+  Slice key1 = TestKey(hkey1);
+  UniqueId64x2 hkey2 = TestHashedKey('y');
+  Slice key2 = TestKey(hkey2);
+  UniqueId64x2 hkey3 = TestHashedKey('z');
+  Slice key3 = TestKey(hkey3);
+  HandleImpl* h1;
+  ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter, &h1,
+                           Cache::Priority::HIGH));
+  HandleImpl* h2;
+  ASSERT_OK(shard_->Insert(key2, hkey2, &deleted, 1, IncrementIntDeleter, &h2,
+                           Cache::Priority::HIGH));
+  HandleImpl* h3;
+  ASSERT_OK(shard_->Insert(key3, hkey3, &deleted, 1, IncrementIntDeleter, &h3,
+                           Cache::Priority::HIGH));
+
+  // Can repeatedly lookup+release despite the hash collision
+  HandleImpl* tmp_h;
+  for (bool erase_if_last_ref : {true, false}) {  // but not last ref
+    tmp_h = shard_->Lookup(key1, hkey1);
+    ASSERT_EQ(h1, tmp_h);
+    ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+
+    tmp_h = shard_->Lookup(key2, hkey2);
+    ASSERT_EQ(h2, tmp_h);
+    ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+
+    tmp_h = shard_->Lookup(key3, hkey3);
+    ASSERT_EQ(h3, tmp_h);
+    ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+  }
+
+  // Make h1 invisible
+  shard_->Erase(key1, hkey1);
+  // Redundant erase
+  shard_->Erase(key1, hkey1);
+
+  // All still alive
+  ASSERT_EQ(deleted, 0);
+
+  // Invisible to Lookup
+  tmp_h = shard_->Lookup(key1, hkey1);
+  ASSERT_EQ(nullptr, tmp_h);
+
+  // Can still find h2, h3
+  for (bool erase_if_last_ref : {true, false}) {  // but not last ref
+    tmp_h = shard_->Lookup(key2, hkey2);
+    ASSERT_EQ(h2, tmp_h);
+    ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+
+    tmp_h = shard_->Lookup(key3, hkey3);
+    ASSERT_EQ(h3, tmp_h);
+    ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+  }
+
+  // Also Insert with invisible entry there
+  ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter,
+                           nullptr, Cache::Priority::HIGH));
+  tmp_h = shard_->Lookup(key1, hkey1);
+  // Found but distinct handle
+  ASSERT_NE(nullptr, tmp_h);
+  ASSERT_NE(h1, tmp_h);
+  ASSERT_TRUE(shard_->Release(tmp_h, /*erase_if_last_ref*/ true));
+
+  // tmp_h deleted
+  ASSERT_EQ(deleted--, 1);
+
+  // Release last ref on h1 (already invisible)
+  ASSERT_TRUE(shard_->Release(h1, /*erase_if_last_ref*/ false));
+
+  // h1 deleted
+  ASSERT_EQ(deleted--, 1);
+  h1 = nullptr;
+
+  // Can still find h2, h3
+  for (bool erase_if_last_ref : {true, false}) {  // but not last ref
+    tmp_h = shard_->Lookup(key2, hkey2);
+    ASSERT_EQ(h2, tmp_h);
+    ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+
+    tmp_h = shard_->Lookup(key3, hkey3);
+    ASSERT_EQ(h3, tmp_h);
+    ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+  }
+
+  // Release last ref on h2
+  ASSERT_FALSE(shard_->Release(h2, /*erase_if_last_ref*/ false));
+
+  // h2 still not deleted (unreferenced in cache)
+  ASSERT_EQ(deleted, 0);
+
+  // Can still find it
+  tmp_h = shard_->Lookup(key2, hkey2);
+  ASSERT_EQ(h2, tmp_h);
+
+  // Release last ref on h2, with erase
+  ASSERT_TRUE(shard_->Release(h2, /*erase_if_last_ref*/ true));
+
+  // h2 deleted
+  ASSERT_EQ(deleted--, 1);
+  tmp_h = shard_->Lookup(key2, hkey2);
+  ASSERT_EQ(nullptr, tmp_h);
+
+  // Can still find h3
+  for (bool erase_if_last_ref : {true, false}) {  // but not last ref
+    tmp_h = shard_->Lookup(key3, hkey3);
+    ASSERT_EQ(h3, tmp_h);
+    ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref));
+  }
+
+  // Release last ref on h3, without erase
+  ASSERT_FALSE(shard_->Release(h3, /*erase_if_last_ref*/ false));
+
+  // h3 still not deleted (unreferenced in cache)
+  ASSERT_EQ(deleted, 0);
+
+  // Explicit erase
+  shard_->Erase(key3, hkey3);
+
+  // h3 deleted
+  ASSERT_EQ(deleted--, 1);
+  tmp_h = shard_->Lookup(key3, hkey3);
+  ASSERT_EQ(nullptr, tmp_h);
+}
+
+// This uses the public API to effectively test CalcHashBits etc.
+TEST_F(ClockCacheTest, TableSizesTest) {
+  for (size_t est_val_size : {1U, 5U, 123U, 2345U, 345678U}) {
+    SCOPED_TRACE("est_val_size = " + std::to_string(est_val_size));
+    for (double est_count : {1.1, 2.2, 511.9, 512.1, 2345.0}) {
+      SCOPED_TRACE("est_count = " + std::to_string(est_count));
+      size_t capacity = static_cast<size_t>(est_val_size * est_count);
+      // kDontChargeCacheMetadata
+      auto cache = HyperClockCacheOptions(
+                       capacity, est_val_size, /*num shard_bits*/ -1,
+                       /*strict_capacity_limit*/ false,
+                       /*memory_allocator*/ nullptr, kDontChargeCacheMetadata)
+                       .MakeSharedCache();
+      // Table sizes are currently only powers of two
+      EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor);
+      EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0);
+      EXPECT_EQ(cache->GetUsage(), 0);
+
+      // kFullChargeMetaData
+      // Because table sizes are currently only powers of two, sizes get
+      // really weird when metadata is a huge portion of capacity. For example,
+      // doubling the table size could cut by 90% the space available to
+      // values. Therefore, we omit those weird cases for now.
+      if (est_val_size >= 512) {
+        cache = HyperClockCacheOptions(
+                    capacity, est_val_size, /*num shard_bits*/ -1,
+                    /*strict_capacity_limit*/ false,
+                    /*memory_allocator*/ nullptr, kFullChargeCacheMetadata)
+                    .MakeSharedCache();
+        double est_count_after_meta =
+            (capacity - cache->GetUsage()) * 1.0 / est_val_size;
+        EXPECT_GE(cache->GetTableAddressCount(),
+                  est_count_after_meta / kLoadFactor);
+        EXPECT_LE(cache->GetTableAddressCount(),
+                  est_count_after_meta / kLoadFactor * 2.0);
+      }
+    }
+  }
+}
+
+}  // namespace clock_cache
+
+class TestSecondaryCache : public SecondaryCache {
+ public:
+  // Specifies what action to take on a lookup for a particular key
+  enum ResultType {
+    SUCCESS,
+    // Fail lookup immediately
+    FAIL,
+    // Defer the result. It will returned after Wait/WaitAll is called
+    DEFER,
+    // Defer the result and eventually return failure
+    DEFER_AND_FAIL
+  };
+
+  using ResultMap = std::unordered_map<std::string, ResultType>;
+
+  explicit TestSecondaryCache(size_t capacity)
+      : num_inserts_(0), num_lookups_(0), inject_failure_(false) {
+    cache_ =
+        NewLRUCache(capacity, 0, false, 0.5 /* high_pri_pool_ratio */, nullptr,
+                    kDefaultToAdaptiveMutex, kDontChargeCacheMetadata);
+  }
+  ~TestSecondaryCache() override { cache_.reset(); }
+
+  const char* Name() const override { return "TestSecondaryCache"; }
+
+  void InjectFailure() { inject_failure_ = true; }
+
+  void ResetInjectFailure() { inject_failure_ = false; }
+
+  Status Insert(const Slice& key, void* value,
+                const Cache::CacheItemHelper* helper) override {
+    if (inject_failure_) {
+      return Status::Corruption("Insertion Data Corrupted");
+    }
+    CheckCacheKeyCommonPrefix(key);
+    size_t size;
+    char* buf;
+    Status s;
+
+    num_inserts_++;
+    size = (*helper->size_cb)(value);
+    buf = new char[size + sizeof(uint64_t)];
+    EncodeFixed64(buf, size);
+    s = (*helper->saveto_cb)(value, 0, size, buf + sizeof(uint64_t));
+    if (!s.ok()) {
+      delete[] buf;
+      return s;
+    }
+    return cache_->Insert(key, buf, size,
+                          [](const Slice& /*key*/, void* val) -> void {
+                            delete[] static_cast<char*>(val);
+                          });
+  }
+
+  std::unique_ptr<SecondaryCacheResultHandle> Lookup(
+      const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
+      bool /*advise_erase*/, bool& is_in_sec_cache) override {
+    std::string key_str = key.ToString();
+    TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str);
+
+    std::unique_ptr<SecondaryCacheResultHandle> secondary_handle;
+    is_in_sec_cache = false;
+    ResultType type = ResultType::SUCCESS;
+    auto iter = result_map_.find(key.ToString());
+    if (iter != result_map_.end()) {
+      type = iter->second;
+    }
+    if (type == ResultType::FAIL) {
+      return secondary_handle;
+    }
+
+    Cache::Handle* handle = cache_->Lookup(key);
+    num_lookups_++;
+    if (handle) {
+      void* value = nullptr;
+      size_t charge = 0;
+      Status s;
+      if (type != ResultType::DEFER_AND_FAIL) {
+        char* ptr = (char*)cache_->Value(handle);
+        size_t size = DecodeFixed64(ptr);
+        ptr += sizeof(uint64_t);
+        s = create_cb(ptr, size, &value, &charge);
+      }
+      if (s.ok()) {
+        secondary_handle.reset(new TestSecondaryCacheResultHandle(
+            cache_.get(), handle, value, charge, type));
+        is_in_sec_cache = true;
+      } else {
+        cache_->Release(handle);
+      }
+    }
+    return secondary_handle;
+  }
+
+  bool SupportForceErase() const override { return false; }
+
+  void Erase(const Slice& /*key*/) override {}
+
+  void WaitAll(std::vector<SecondaryCacheResultHandle*> handles) override {
+    for (SecondaryCacheResultHandle* handle : handles) {
+      TestSecondaryCacheResultHandle* sec_handle =
+          static_cast<TestSecondaryCacheResultHandle*>(handle);
+      sec_handle->SetReady();
+    }
+  }
+
+  std::string GetPrintableOptions() const override { return ""; }
+
+  void SetResultMap(ResultMap&& map) { result_map_ = std::move(map); }
+
+  uint32_t num_inserts() { return num_inserts_; }
+
+  uint32_t num_lookups() { return num_lookups_; }
+
+  void CheckCacheKeyCommonPrefix(const Slice& key) {
+    Slice current_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize);
+    if (ckey_prefix_.empty()) {
+      ckey_prefix_ = current_prefix.ToString();
+    } else {
+      EXPECT_EQ(ckey_prefix_, current_prefix.ToString());
+    }
+  }
+
+ private:
+  class TestSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
+   public:
+    TestSecondaryCacheResultHandle(Cache* cache, Cache::Handle* handle,
+                                   void* value, size_t size, ResultType type)
+        : cache_(cache),
+          handle_(handle),
+          value_(value),
+          size_(size),
+          is_ready_(true) {
+      if (type != ResultType::SUCCESS) {
+        is_ready_ = false;
+      }
+    }
+
+    ~TestSecondaryCacheResultHandle() override { cache_->Release(handle_); }
+
+    bool IsReady() override { return is_ready_; }
+
+    void Wait() override {}
+
+    void* Value() override {
+      assert(is_ready_);
+      return value_;
+    }
+
+    size_t Size() override { return Value() ? size_ : 0; }
+
+    void SetReady() { is_ready_ = true; }
+
+   private:
+    Cache* cache_;
+    Cache::Handle* handle_;
+    void* value_;
+    size_t size_;
+    bool is_ready_;
+  };
+
+  std::shared_ptr<Cache> cache_;
+  uint32_t num_inserts_;
+  uint32_t num_lookups_;
+  bool inject_failure_;
+  std::string ckey_prefix_;
+  ResultMap result_map_;
+};
+
+class DBSecondaryCacheTest : public DBTestBase {
+ public:
+  DBSecondaryCacheTest()
+      : DBTestBase("db_secondary_cache_test", /*env_do_fsync=*/true) {
+    fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem()));
+    fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_));
+  }
+
+  std::shared_ptr<FaultInjectionTestFS> fault_fs_;
+  std::unique_ptr<Env> fault_env_;
+};
+
+class LRUCacheSecondaryCacheTest : public LRUCacheTest {
+ public:
+  LRUCacheSecondaryCacheTest() : fail_create_(false) {}
+  ~LRUCacheSecondaryCacheTest() {}
+
+ protected:
+  class TestItem {
+   public:
+    TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) {
+      memcpy(buf_.get(), buf, size);
+    }
+    ~TestItem() {}
+
+    char* Buf() { return buf_.get(); }
+    size_t Size() { return size_; }
+    std::string ToString() { return std::string(Buf(), Size()); }
+
+   private:
+    std::unique_ptr<char[]> buf_;
+    size_t size_;
+  };
+
+  static size_t SizeCallback(void* obj) {
+    return reinterpret_cast<TestItem*>(obj)->Size();
+  }
+
+  static Status SaveToCallback(void* from_obj, size_t from_offset,
+                               size_t length, void* out) {
+    TestItem* item = reinterpret_cast<TestItem*>(from_obj);
+    char* buf = item->Buf();
+    EXPECT_EQ(length, item->Size());
+    EXPECT_EQ(from_offset, 0);
+    memcpy(out, buf, length);
+    return Status::OK();
+  }
+
+  static void DeletionCallback(const Slice& /*key*/, void* obj) {
+    delete reinterpret_cast<TestItem*>(obj);
+  }
+
+  static Cache::CacheItemHelper helper_;
+
+  static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/,
+                                   size_t /*size*/, void* /*out*/) {
+    return Status::NotSupported();
+  }
+
+  static Cache::CacheItemHelper helper_fail_;
+
+  Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size,
+                                                void** out_obj,
+                                                size_t* charge) -> Status {
+    if (fail_create_) {
+      return Status::NotSupported();
+    }
+    *out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size));
+    *charge = size;
+    return Status::OK();
+  };
+
+  void SetFailCreate(bool fail) { fail_create_ = fail; }
+
+ private:
+  bool fail_create_;
+};
+
+Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_(
+    LRUCacheSecondaryCacheTest::SizeCallback,
+    LRUCacheSecondaryCacheTest::SaveToCallback,
+    LRUCacheSecondaryCacheTest::DeletionCallback);
+
+Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_fail_(
+    LRUCacheSecondaryCacheTest::SizeCallback,
+    LRUCacheSecondaryCacheTest::SaveToCallbackFail,
+    LRUCacheSecondaryCacheTest::DeletionCallback);
+
+TEST_F(LRUCacheSecondaryCacheTest, BasicTest) {
+  LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache =
+      std::make_shared<TestSecondaryCache>(4096);
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  std::shared_ptr<Statistics> stats = CreateDBStatistics();
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k3 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+  Random rnd(301);
+  // Start with warming k3
+  std::string str3 = rnd.RandomString(1021);
+  ASSERT_OK(secondary_cache->InsertSaved(k3.AsSlice(), str3));
+
+  std::string str1 = rnd.RandomString(1020);
+  TestItem* item1 = new TestItem(str1.data(), str1.length());
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+                          &LRUCacheSecondaryCacheTest::helper_, str1.length()));
+  std::string str2 = rnd.RandomString(1021);
+  TestItem* item2 = new TestItem(str2.data(), str2.length());
+  // k1 should be demoted to NVM
+  ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
+                          &LRUCacheSecondaryCacheTest::helper_, str2.length()));
+
+  get_perf_context()->Reset();
+  Cache::Handle* handle;
+  handle =
+      cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                    test_item_creator, Cache::Priority::LOW, true, stats.get());
+  ASSERT_NE(handle, nullptr);
+  ASSERT_EQ(static_cast<TestItem*>(cache->Value(handle))->Size(), str2.size());
+  cache->Release(handle);
+
+  // This lookup should promote k1 and demote k2
+  handle =
+      cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                    test_item_creator, Cache::Priority::LOW, true, stats.get());
+  ASSERT_NE(handle, nullptr);
+  ASSERT_EQ(static_cast<TestItem*>(cache->Value(handle))->Size(), str1.size());
+  cache->Release(handle);
+
+  // This lookup should promote k3 and demote k1
+  handle =
+      cache->Lookup(k3.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                    test_item_creator, Cache::Priority::LOW, true, stats.get());
+  ASSERT_NE(handle, nullptr);
+  ASSERT_EQ(static_cast<TestItem*>(cache->Value(handle))->Size(), str3.size());
+  cache->Release(handle);
+
+  ASSERT_EQ(secondary_cache->num_inserts(), 3u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+  ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_HITS),
+            secondary_cache->num_lookups());
+  PerfContext perf_ctx = *get_perf_context();
+  ASSERT_EQ(perf_ctx.secondary_cache_hit_count, secondary_cache->num_lookups());
+
+  cache.reset();
+  secondary_cache.reset();
+}
+
+TEST_F(LRUCacheSecondaryCacheTest, BasicFailTest) {
+  LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache =
+      std::make_shared<TestSecondaryCache>(2048);
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+  Random rnd(301);
+  std::string str1 = rnd.RandomString(1020);
+  auto item1 = std::make_unique<TestItem>(str1.data(), str1.length());
+  ASSERT_TRUE(cache->Insert(k1.AsSlice(), item1.get(), nullptr, str1.length())
+                  .IsInvalidArgument());
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1.get(),
+                          &LRUCacheSecondaryCacheTest::helper_, str1.length()));
+  item1.release();  // Appease clang-analyze "potential memory leak"
+
+  Cache::Handle* handle;
+  handle = cache->Lookup(k2.AsSlice(), nullptr, test_item_creator,
+                         Cache::Priority::LOW, true);
+  ASSERT_EQ(handle, nullptr);
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                         test_item_creator, Cache::Priority::LOW, false);
+  ASSERT_EQ(handle, nullptr);
+
+  cache.reset();
+  secondary_cache.reset();
+}
+
+TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) {
+  LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache =
+      std::make_shared<TestSecondaryCache>(2048);
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+  Random rnd(301);
+  std::string str1 = rnd.RandomString(1020);
+  TestItem* item1 = new TestItem(str1.data(), str1.length());
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+                          &LRUCacheSecondaryCacheTest::helper_fail_,
+                          str1.length()));
+  std::string str2 = rnd.RandomString(1020);
+  TestItem* item2 = new TestItem(str2.data(), str2.length());
+  // k1 should be demoted to NVM
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
+                          &LRUCacheSecondaryCacheTest::helper_fail_,
+                          str2.length()));
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+
+  Cache::Handle* handle;
+  handle =
+      cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_,
+                    test_item_creator, Cache::Priority::LOW, true);
+  ASSERT_NE(handle, nullptr);
+  cache->Release(handle);
+  // This lookup should fail, since k1 demotion would have failed
+  handle =
+      cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_,
+                    test_item_creator, Cache::Priority::LOW, true);
+  ASSERT_EQ(handle, nullptr);
+  // Since k1 didn't get promoted, k2 should still be in cache
+  handle =
+      cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_,
+                    test_item_creator, Cache::Priority::LOW, true);
+  ASSERT_NE(handle, nullptr);
+  cache->Release(handle);
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+
+  cache.reset();
+  secondary_cache.reset();
+}
+
+TEST_F(LRUCacheSecondaryCacheTest, CreateFailTest) {
+  LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache =
+      std::make_shared<TestSecondaryCache>(2048);
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+  Random rnd(301);
+  std::string str1 = rnd.RandomString(1020);
+  TestItem* item1 = new TestItem(str1.data(), str1.length());
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+                          &LRUCacheSecondaryCacheTest::helper_, str1.length()));
+  std::string str2 = rnd.RandomString(1020);
+  TestItem* item2 = new TestItem(str2.data(), str2.length());
+  // k1 should be demoted to NVM
+  ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
+                          &LRUCacheSecondaryCacheTest::helper_, str2.length()));
+
+  Cache::Handle* handle;
+  SetFailCreate(true);
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                         test_item_creator, Cache::Priority::LOW, true);
+  ASSERT_NE(handle, nullptr);
+  cache->Release(handle);
+  // This lookup should fail, since k1 creation would have failed
+  handle = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                         test_item_creator, Cache::Priority::LOW, true);
+  ASSERT_EQ(handle, nullptr);
+  // Since k1 didn't get promoted, k2 should still be in cache
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                         test_item_creator, Cache::Priority::LOW, true);
+  ASSERT_NE(handle, nullptr);
+  cache->Release(handle);
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+
+  cache.reset();
+  secondary_cache.reset();
+}
+
+TEST_F(LRUCacheSecondaryCacheTest, FullCapacityTest) {
+  LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
+                       true /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache =
+      std::make_shared<TestSecondaryCache>(2048);
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+  CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+  Random rnd(301);
+  std::string str1 = rnd.RandomString(1020);
+  TestItem* item1 = new TestItem(str1.data(), str1.length());
+  ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
+                          &LRUCacheSecondaryCacheTest::helper_, str1.length()));
+  std::string str2 = rnd.RandomString(1020);
+  TestItem* item2 = new TestItem(str2.data(), str2.length());
+  // k1 should be demoted to NVM
+  ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
+                          &LRUCacheSecondaryCacheTest::helper_, str2.length()));
+
+  Cache::Handle* handle;
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                         test_item_creator, Cache::Priority::LOW, true);
+  ASSERT_NE(handle, nullptr);
+  // k1 promotion should fail due to the block cache being at capacity,
+  // but the lookup should still succeed
+  Cache::Handle* handle2;
+  handle2 = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                          test_item_creator, Cache::Priority::LOW, true);
+  ASSERT_NE(handle2, nullptr);
+  // Since k1 didn't get inserted, k2 should still be in cache
+  cache->Release(handle);
+  cache->Release(handle2);
+  handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+                         test_item_creator, Cache::Priority::LOW, true);
+  ASSERT_NE(handle, nullptr);
+  cache->Release(handle);
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+
+  cache.reset();
+  secondary_cache.reset();
+}
+
+// In this test, the block cache size is set to 4096, after insert 6 KV-pairs
+// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta
+// blocks. block_1 size is 4096 and block_2 size is 2056. The total size
+// of the meta blocks are about 900 to 1000. Therefore, in any situation,
+// if we try to insert block_1 to the block cache, it will always fails. Only
+// block_2 will be successfully inserted into the block cache.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) {
+  LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  fault_fs_->SetFailGetUniqueId(true);
+
+  // Set the file paranoid check, so after flush, the file will be read
+  // all the blocks will be accessed.
+  options.paranoid_file_checks = true;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 6;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+
+  ASSERT_OK(Flush());
+  // After Flush is successful, RocksDB will do the paranoid check for the new
+  // SST file. Meta blocks are always cached in the block cache and they
+  // will not be evicted. When block_2 is cache miss and read out, it is
+  // inserted to the block cache. Note that, block_1 is never successfully
+  // inserted to the block cache. Here are 2 lookups in the secondary cache
+  // for block_1 and block_2
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  Compact("a", "z");
+  // Compaction will create the iterator to scan the whole file. So all the
+  // blocks are needed. Meta blocks are always cached. When block_1 is read
+  // out, block_2 is evicted from block cache and inserted to secondary
+  // cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+  std::string v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // The first data block is not in the cache, similarly, trigger the block
+  // cache Lookup and secondary cache lookup for block_1. But block_1 will not
+  // be inserted successfully due to the size. Currently, cache only has
+  // the meta blocks.
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+  // The second data block is not in the cache, similarly, trigger the block
+  // cache Lookup and secondary cache lookup for block_2 and block_2 is found
+  // in the secondary cache. Now block cache has block_2
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+  // block_2 is in the block cache. There is a block cache hit. No need to
+  // lookup or insert the secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+  v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // Lookup the first data block, not in the block cache, so lookup the
+  // secondary cache. Also not in the secondary cache. After Get, still
+  // block_1 is will not be cached.
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 6u);
+
+  v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // Lookup the first data block, not in the block cache, so lookup the
+  // secondary cache. Also not in the secondary cache. After Get, still
+  // block_1 is will not be cached.
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 7u);
+
+  Destroy(options);
+}
+
+// In this test, the block cache size is set to 6100, after insert 6 KV-pairs
+// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta
+// blocks. block_1 size is 4096 and block_2 size is 2056. The total size
+// of the meta blocks are about 900 to 1000. Therefore, we can successfully
+// insert and cache block_1 in the block cache (this is the different place
+// from TestSecondaryCacheCorrectness1)
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) {
+  LRUCacheOptions opts(6100 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.paranoid_file_checks = true;
+  options.env = fault_env_.get();
+  fault_fs_->SetFailGetUniqueId(true);
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 6;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+
+  ASSERT_OK(Flush());
+  // After Flush is successful, RocksDB will do the paranoid check for the new
+  // SST file. Meta blocks are always cached in the block cache and they
+  // will not be evicted. When block_2 is cache miss and read out, it is
+  // inserted to the block cache. Thefore, block_1 is evicted from block
+  // cache and successfully inserted to the secondary cache. Here are 2
+  // lookups in the secondary cache for block_1 and block_2.
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  Compact("a", "z");
+  // Compaction will create the iterator to scan the whole file. So all the
+  // blocks are needed. After Flush, only block_2 is cached in block cache
+  // and block_1 is in the secondary cache. So when read block_1, it is
+  // read out from secondary cache and inserted to block cache. At the same
+  // time, block_2 is inserted to secondary cache. Now, secondary cache has
+  // both block_1 and block_2. After compaction, block_1 is in the cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+  std::string v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // This Get needs to access block_1, since block_1 is cached in block cache
+  // there is no secondary cache lookup.
+  ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+  // This Get needs to access block_2 which is not in the block cache. So
+  // it will lookup the secondary cache for block_2 and cache it in the
+  // block_cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+  // This Get needs to access block_2 which is already in the block cache.
+  // No need to lookup secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+  v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // This Get needs to access block_1, since block_1 is not in block cache
+  // there is one econdary cache lookup. Then, block_1 is cached in the
+  // block cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+  v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // This Get needs to access block_1, since block_1 is cached in block cache
+  // there is no secondary cache lookup.
+  ASSERT_EQ(secondary_cache->num_inserts(), 2u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+  Destroy(options);
+}
+
+// The block cache size is set to 1024*1024, after insert 6 KV-pairs
+// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta
+// blocks. block_1 size is 4096 and block_2 size is 2056. The total size
+// of the meta blocks are about 900 to 1000. Therefore, we can successfully
+// cache all the blocks in the block cache and there is not secondary cache
+// insertion. 2 lookup is needed for the blocks.
+TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) {
+  LRUCacheOptions opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.paranoid_file_checks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  fault_fs_->SetFailGetUniqueId(true);
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 6;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1000);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+
+  ASSERT_OK(Flush());
+  // After Flush is successful, RocksDB will do the paranoid check for the new
+  // SST file. Meta blocks are always cached in the block cache and they
+  // will not be evicted. Now, block cache is large enough, it cache
+  // both block_1 and block_2. When first time read block_1 and block_2
+  // there are cache misses. So 2 secondary cache lookups are needed for
+  // the 2 blocks
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  Compact("a", "z");
+  // Compaction will iterate the whole SST file. Since all the data blocks
+  // are in the block cache. No need to lookup the secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  std::string v = Get(Key(0));
+  ASSERT_EQ(1000, v.size());
+  // Since the block cache is large enough, all the blocks are cached. we
+  // do not need to lookup the seondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  Destroy(options);
+}
+
+TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) {
+  LRUCacheOptions opts(8 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  fault_fs_->SetFailGetUniqueId(true);
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 256;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1000);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+  ASSERT_OK(Flush());
+  Compact("a", "z");
+
+  Random r_index(47);
+  std::string v;
+  for (int i = 0; i < 1000; i++) {
+    uint32_t key_i = r_index.Next() % N;
+    v = Get(Key(key_i));
+  }
+
+  // We have over 200 data blocks there will be multiple insertion
+  // and lookups.
+  ASSERT_GE(secondary_cache->num_inserts(), 1u);
+  ASSERT_GE(secondary_cache->num_lookups(), 1u);
+
+  Destroy(options);
+}
+
+// In this test, the block cache size is set to 4096, after insert 6 KV-pairs
+// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta
+// blocks. block_1 size is 4096 and block_2 size is 2056. The total size
+// of the meta blocks are about 900 to 1000. Therefore, in any situation,
+// if we try to insert block_1 to the block cache, it will always fails. Only
+// block_2 will be successfully inserted into the block cache.
+TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) {
+  LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.paranoid_file_checks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  fault_fs_->SetFailGetUniqueId(true);
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 6;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+
+  ASSERT_OK(Flush());
+  // After Flush is successful, RocksDB will do the paranoid check for the new
+  // SST file. Meta blocks are always cached in the block cache and they
+  // will not be evicted. When block_2 is cache miss and read out, it is
+  // inserted to the block cache. Note that, block_1 is never successfully
+  // inserted to the block cache. Here are 2 lookups in the secondary cache
+  // for block_1 and block_2
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  // Fail the insertion, in LRU cache, the secondary insertion returned status
+  // is not checked, therefore, the DB will not be influenced.
+  secondary_cache->InjectFailure();
+  Compact("a", "z");
+  // Compaction will create the iterator to scan the whole file. So all the
+  // blocks are needed. Meta blocks are always cached. When block_1 is read
+  // out, block_2 is evicted from block cache and inserted to secondary
+  // cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+  std::string v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // The first data block is not in the cache, similarly, trigger the block
+  // cache Lookup and secondary cache lookup for block_1. But block_1 will not
+  // be inserted successfully due to the size. Currently, cache only has
+  // the meta blocks.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+  // The second data block is not in the cache, similarly, trigger the block
+  // cache Lookup and secondary cache lookup for block_2 and block_2 is found
+  // in the secondary cache. Now block cache has block_2
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+  // block_2 is in the block cache. There is a block cache hit. No need to
+  // lookup or insert the secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 5u);
+
+  v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // Lookup the first data block, not in the block cache, so lookup the
+  // secondary cache. Also not in the secondary cache. After Get, still
+  // block_1 is will not be cached.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 6u);
+
+  v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+  // Lookup the first data block, not in the block cache, so lookup the
+  // secondary cache. Also not in the secondary cache. After Get, still
+  // block_1 is will not be cached.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 7u);
+  secondary_cache->ResetInjectFailure();
+
+  Destroy(options);
+}
+
+TEST_F(DBSecondaryCacheTest, TestSecondaryWithCompressedCache) {
+  if (!Snappy_Supported()) {
+    ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support");
+    return;
+  }
+  LRUCacheOptions opts(2000 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache_compressed = cache;
+  table_options.no_block_cache = true;
+  table_options.block_size = 1234;
+  Options options = GetDefaultOptions();
+  options.compression = kSnappyCompression;
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 6;
+  for (int i = 0; i < N; i++) {
+    // Partly compressible
+    std::string p_v = rnd.RandomString(507) + std::string(500, ' ');
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+  ASSERT_OK(Flush());
+  for (int i = 0; i < 2 * N; i++) {
+    std::string v = Get(Key(i % N));
+    ASSERT_EQ(1007, v.size());
+  }
+}
+
+TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) {
+  LRUCacheOptions opts(1024 /* capacity */, 2 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache =
+      std::make_shared<TestSecondaryCache>(32 * 1024);
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  const int num_keys = 32;
+  OffsetableCacheKey ock{"foo", "bar", 1};
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  for (int i = 0; i < num_keys; ++i) {
+    std::string str = rnd.RandomString(1020);
+    values.emplace_back(str);
+    TestItem* item = new TestItem(str.data(), str.length());
+    ASSERT_OK(cache->Insert(ock.WithOffset(i).AsSlice(), item,
+                            &LRUCacheSecondaryCacheTest::helper_,
+                            str.length()));
+  }
+  // Force all entries to be evicted to the secondary cache
+  cache->SetCapacity(0);
+  ASSERT_EQ(secondary_cache->num_inserts(), 32u);
+  cache->SetCapacity(32 * 1024);
+
+  secondary_cache->SetResultMap(
+      {{ock.WithOffset(3).AsSlice().ToString(),
+        TestSecondaryCache::ResultType::DEFER},
+       {ock.WithOffset(4).AsSlice().ToString(),
+        TestSecondaryCache::ResultType::DEFER_AND_FAIL},
+       {ock.WithOffset(5).AsSlice().ToString(),
+        TestSecondaryCache::ResultType::FAIL}});
+  std::vector<Cache::Handle*> results;
+  for (int i = 0; i < 6; ++i) {
+    results.emplace_back(cache->Lookup(
+        ock.WithOffset(i).AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
+        test_item_creator, Cache::Priority::LOW, false));
+  }
+  cache->WaitAll(results);
+  for (int i = 0; i < 6; ++i) {
+    if (i == 4) {
+      ASSERT_EQ(cache->Value(results[i]), nullptr);
+    } else if (i == 5) {
+      ASSERT_EQ(results[i], nullptr);
+      continue;
+    } else {
+      TestItem* item = static_cast<TestItem*>(cache->Value(results[i]));
+      ASSERT_EQ(item->ToString(), values[i]);
+    }
+    cache->Release(results[i]);
+  }
+
+  cache.reset();
+  secondary_cache.reset();
+}
+
+// In this test, we have one KV pair per data block. We indirectly determine
+// the cache key associated with each data block (and thus each KV) by using
+// a sync point callback in TestSecondaryCache::Lookup. We then control the
+// lookup result by setting the ResultMap.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) {
+  LRUCacheOptions opts(1 << 20 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  table_options.cache_index_and_filter_blocks = false;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.paranoid_file_checks = true;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 8;
+  std::vector<std::string> keys;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(4000);
+    keys.emplace_back(p_v);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+
+  ASSERT_OK(Flush());
+  // After Flush is successful, RocksDB does the paranoid check for the new
+  // SST file. This will try to lookup all data blocks in the secondary
+  // cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 8u);
+
+  cache->SetCapacity(0);
+  ASSERT_EQ(secondary_cache->num_inserts(), 8u);
+  cache->SetCapacity(1 << 20);
+
+  std::vector<std::string> cache_keys;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TestSecondaryCache::Lookup", [&cache_keys](void* key) -> void {
+        cache_keys.emplace_back(*(static_cast<std::string*>(key)));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  for (int i = 0; i < N; ++i) {
+    std::string v = Get(Key(i));
+    ASSERT_EQ(4000, v.size());
+    ASSERT_EQ(v, keys[i]);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(secondary_cache->num_lookups(), 16u);
+  cache->SetCapacity(0);
+  cache->SetCapacity(1 << 20);
+
+  ASSERT_EQ(Get(Key(2)), keys[2]);
+  ASSERT_EQ(Get(Key(7)), keys[7]);
+  secondary_cache->SetResultMap(
+      {{cache_keys[3], TestSecondaryCache::ResultType::DEFER},
+       {cache_keys[4], TestSecondaryCache::ResultType::DEFER_AND_FAIL},
+       {cache_keys[5], TestSecondaryCache::ResultType::FAIL}});
+
+  std::vector<std::string> mget_keys(
+      {Key(0), Key(1), Key(2), Key(3), Key(4), Key(5), Key(6), Key(7)});
+  std::vector<PinnableSlice> values(mget_keys.size());
+  std::vector<Status> s(keys.size());
+  std::vector<Slice> key_slices;
+  for (const std::string& key : mget_keys) {
+    key_slices.emplace_back(key);
+  }
+  uint32_t num_lookups = secondary_cache->num_lookups();
+  dbfull()->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(),
+                     key_slices.size(), key_slices.data(), values.data(),
+                     s.data(), false);
+  ASSERT_EQ(secondary_cache->num_lookups(), num_lookups + 5);
+  for (int i = 0; i < N; ++i) {
+    ASSERT_OK(s[i]);
+    ASSERT_EQ(values[i].ToString(), keys[i]);
+    values[i].Reset();
+  }
+  Destroy(options);
+}
+
+class LRUCacheWithStat : public LRUCache {
+ public:
+  LRUCacheWithStat(
+      size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
+      double _high_pri_pool_ratio, double _low_pri_pool_ratio,
+      std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+      bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
+      CacheMetadataChargePolicy _metadata_charge_policy =
+          kDontChargeCacheMetadata,
+      const std::shared_ptr<SecondaryCache>& _secondary_cache = nullptr)
+      : LRUCache(_capacity, _num_shard_bits, _strict_capacity_limit,
+                 _high_pri_pool_ratio, _low_pri_pool_ratio, _memory_allocator,
+                 _use_adaptive_mutex, _metadata_charge_policy,
+                 _secondary_cache) {
+    insert_count_ = 0;
+    lookup_count_ = 0;
+  }
+  ~LRUCacheWithStat() {}
+
+  Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
+                Handle** handle, Priority priority) override {
+    insert_count_++;
+    return LRUCache::Insert(key, value, charge, deleter, handle, priority);
+  }
+  Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
+                size_t charge, Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override {
+    insert_count_++;
+    return LRUCache::Insert(key, value, helper, charge, handle, priority);
+  }
+  Handle* Lookup(const Slice& key, Statistics* stats) override {
+    lookup_count_++;
+    return LRUCache::Lookup(key, stats);
+  }
+  Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
+                 const CreateCallback& create_cb, Priority priority, bool wait,
+                 Statistics* stats = nullptr) override {
+    lookup_count_++;
+    return LRUCache::Lookup(key, helper, create_cb, priority, wait, stats);
+  }
+
+  uint32_t GetInsertCount() { return insert_count_; }
+  uint32_t GetLookupcount() { return lookup_count_; }
+  void ResetCount() {
+    insert_count_ = 0;
+    lookup_count_ = 0;
+  }
+
+ private:
+  uint32_t insert_count_;
+  uint32_t lookup_count_;
+};
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) {
+  LRUCacheOptions cache_opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                             false /* strict_capacity_limit */,
+                             0.5 /* high_pri_pool_ratio */,
+                             nullptr /* memory_allocator */,
+                             kDefaultToAdaptiveMutex, kDontChargeCacheMetadata);
+  LRUCacheWithStat* tmp_cache = new LRUCacheWithStat(
+      cache_opts.capacity, cache_opts.num_shard_bits,
+      cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio,
+      cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator,
+      cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy,
+      cache_opts.secondary_cache);
+  std::shared_ptr<Cache> cache(tmp_cache);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  DestroyAndReopen(options);
+  fault_fs_->SetFailGetUniqueId(true);
+
+  Random rnd(301);
+  const int N = 256;
+  std::vector<std::string> value;
+  char buf[1000];
+  memset(buf, 'a', 1000);
+  value.resize(N);
+  for (int i = 0; i < N; i++) {
+    // std::string p_v = rnd.RandomString(1000);
+    std::string p_v(buf, 1000);
+    value[i] = p_v;
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+  ASSERT_OK(Flush());
+  Compact("a", "z");
+
+  // do th eread for all the key value pairs, so all the blocks should be in
+  // cache
+  uint32_t start_insert = tmp_cache->GetInsertCount();
+  uint32_t start_lookup = tmp_cache->GetLookupcount();
+  std::string v;
+  for (int i = 0; i < N; i++) {
+    v = Get(Key(i));
+    ASSERT_EQ(v, value[i]);
+  }
+  uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert;
+  uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup;
+  ASSERT_EQ(63,
+            static_cast<int>(dump_insert));  // the insert in the block cache
+  ASSERT_EQ(256,
+            static_cast<int>(dump_lookup));  // the lookup in the block cache
+  // We have enough blocks in the block cache
+
+  CacheDumpOptions cd_options;
+  cd_options.clock = fault_env_->GetSystemClock().get();
+  std::string dump_path = db_->GetName() + "/cache_dump";
+  std::unique_ptr<CacheDumpWriter> dump_writer;
+  Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path,
+                                      &dump_writer);
+  ASSERT_OK(s);
+  std::unique_ptr<CacheDumper> cache_dumper;
+  s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer),
+                            &cache_dumper);
+  ASSERT_OK(s);
+  std::vector<DB*> db_list;
+  db_list.push_back(db_);
+  s = cache_dumper->SetDumpFilter(db_list);
+  ASSERT_OK(s);
+  s = cache_dumper->DumpCacheEntriesToWriter();
+  ASSERT_OK(s);
+  cache_dumper.reset();
+
+  // we have a new cache it is empty, then, before we do the Get, we do the
+  // dumpload
+  std::shared_ptr<TestSecondaryCache> secondary_cache =
+      std::make_shared<TestSecondaryCache>(2048 * 1024);
+  cache_opts.secondary_cache = secondary_cache;
+  tmp_cache = new LRUCacheWithStat(
+      cache_opts.capacity, cache_opts.num_shard_bits,
+      cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio,
+      cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator,
+      cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy,
+      cache_opts.secondary_cache);
+  std::shared_ptr<Cache> cache_new(tmp_cache);
+  table_options.block_cache = cache_new;
+  table_options.block_size = 4 * 1024;
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+
+  // start to load the data to new block cache
+  start_insert = secondary_cache->num_inserts();
+  start_lookup = secondary_cache->num_lookups();
+  std::unique_ptr<CacheDumpReader> dump_reader;
+  s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path,
+                                 &dump_reader);
+  ASSERT_OK(s);
+  std::unique_ptr<CacheDumpedLoader> cache_loader;
+  s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache,
+                                  std::move(dump_reader), &cache_loader);
+  ASSERT_OK(s);
+  s = cache_loader->RestoreCacheEntriesToSecondaryCache();
+  ASSERT_OK(s);
+  uint32_t load_insert = secondary_cache->num_inserts() - start_insert;
+  uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup;
+  // check the number we inserted
+  ASSERT_EQ(64, static_cast<int>(load_insert));
+  ASSERT_EQ(0, static_cast<int>(load_lookup));
+  ASSERT_OK(s);
+
+  Reopen(options);
+
+  // After load, we do the Get again
+  start_insert = secondary_cache->num_inserts();
+  start_lookup = secondary_cache->num_lookups();
+  uint32_t cache_insert = tmp_cache->GetInsertCount();
+  uint32_t cache_lookup = tmp_cache->GetLookupcount();
+  for (int i = 0; i < N; i++) {
+    v = Get(Key(i));
+    ASSERT_EQ(v, value[i]);
+  }
+  uint32_t final_insert = secondary_cache->num_inserts() - start_insert;
+  uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup;
+  // no insert to secondary cache
+  ASSERT_EQ(0, static_cast<int>(final_insert));
+  // lookup the secondary to get all blocks
+  ASSERT_EQ(64, static_cast<int>(final_lookup));
+  uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert;
+  uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup;
+  // Check the new block cache insert and lookup, should be no insert since all
+  // blocks are from the secondary cache.
+  ASSERT_EQ(0, static_cast<int>(block_insert));
+  ASSERT_EQ(256, static_cast<int>(block_lookup));
+
+  fault_fs_->SetFailGetUniqueId(false);
+  Destroy(options);
+}
+
+TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {
+  LRUCacheOptions cache_opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                             false /* strict_capacity_limit */,
+                             0.5 /* high_pri_pool_ratio */,
+                             nullptr /* memory_allocator */,
+                             kDefaultToAdaptiveMutex, kDontChargeCacheMetadata);
+  LRUCacheWithStat* tmp_cache = new LRUCacheWithStat(
+      cache_opts.capacity, cache_opts.num_shard_bits,
+      cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio,
+      cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator,
+      cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy,
+      cache_opts.secondary_cache);
+  std::shared_ptr<Cache> cache(tmp_cache);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  std::string dbname1 = test::PerThreadDBPath("db_1");
+  ASSERT_OK(DestroyDB(dbname1, options));
+  DB* db1 = nullptr;
+  ASSERT_OK(DB::Open(options, dbname1, &db1));
+  std::string dbname2 = test::PerThreadDBPath("db_2");
+  ASSERT_OK(DestroyDB(dbname2, options));
+  DB* db2 = nullptr;
+  ASSERT_OK(DB::Open(options, dbname2, &db2));
+  fault_fs_->SetFailGetUniqueId(true);
+
+  // write the KVs to db1
+  Random rnd(301);
+  const int N = 256;
+  std::vector<std::string> value1;
+  WriteOptions wo;
+  char buf[1000];
+  memset(buf, 'a', 1000);
+  value1.resize(N);
+  for (int i = 0; i < N; i++) {
+    std::string p_v(buf, 1000);
+    value1[i] = p_v;
+    ASSERT_OK(db1->Put(wo, Key(i), p_v));
+  }
+  ASSERT_OK(db1->Flush(FlushOptions()));
+  Slice bg("a");
+  Slice ed("b");
+  ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed));
+
+  // Write the KVs to DB2
+  std::vector<std::string> value2;
+  memset(buf, 'b', 1000);
+  value2.resize(N);
+  for (int i = 0; i < N; i++) {
+    std::string p_v(buf, 1000);
+    value2[i] = p_v;
+    ASSERT_OK(db2->Put(wo, Key(i), p_v));
+  }
+  ASSERT_OK(db2->Flush(FlushOptions()));
+  ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed));
+
+  // do th eread for all the key value pairs, so all the blocks should be in
+  // cache
+  uint32_t start_insert = tmp_cache->GetInsertCount();
+  uint32_t start_lookup = tmp_cache->GetLookupcount();
+  ReadOptions ro;
+  std::string v;
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(db1->Get(ro, Key(i), &v));
+    ASSERT_EQ(v, value1[i]);
+  }
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(db2->Get(ro, Key(i), &v));
+    ASSERT_EQ(v, value2[i]);
+  }
+  uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert;
+  uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup;
+  ASSERT_EQ(128,
+            static_cast<int>(dump_insert));  // the insert in the block cache
+  ASSERT_EQ(512,
+            static_cast<int>(dump_lookup));  // the lookup in the block cache
+  // We have enough blocks in the block cache
+
+  CacheDumpOptions cd_options;
+  cd_options.clock = fault_env_->GetSystemClock().get();
+  std::string dump_path = db1->GetName() + "/cache_dump";
+  std::unique_ptr<CacheDumpWriter> dump_writer;
+  Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path,
+                                      &dump_writer);
+  ASSERT_OK(s);
+  std::unique_ptr<CacheDumper> cache_dumper;
+  s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer),
+                            &cache_dumper);
+  ASSERT_OK(s);
+  std::vector<DB*> db_list;
+  db_list.push_back(db1);
+  s = cache_dumper->SetDumpFilter(db_list);
+  ASSERT_OK(s);
+  s = cache_dumper->DumpCacheEntriesToWriter();
+  ASSERT_OK(s);
+  cache_dumper.reset();
+
+  // we have a new cache it is empty, then, before we do the Get, we do the
+  // dumpload
+  std::shared_ptr<TestSecondaryCache> secondary_cache =
+      std::make_shared<TestSecondaryCache>(2048 * 1024);
+  cache_opts.secondary_cache = secondary_cache;
+  tmp_cache = new LRUCacheWithStat(
+      cache_opts.capacity, cache_opts.num_shard_bits,
+      cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio,
+      cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator,
+      cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy,
+      cache_opts.secondary_cache);
+  std::shared_ptr<Cache> cache_new(tmp_cache);
+  table_options.block_cache = cache_new;
+  table_options.block_size = 4 * 1024;
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+
+  // Start the cache loading process
+  start_insert = secondary_cache->num_inserts();
+  start_lookup = secondary_cache->num_lookups();
+  std::unique_ptr<CacheDumpReader> dump_reader;
+  s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path,
+                                 &dump_reader);
+  ASSERT_OK(s);
+  std::unique_ptr<CacheDumpedLoader> cache_loader;
+  s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache,
+                                  std::move(dump_reader), &cache_loader);
+  ASSERT_OK(s);
+  s = cache_loader->RestoreCacheEntriesToSecondaryCache();
+  ASSERT_OK(s);
+  uint32_t load_insert = secondary_cache->num_inserts() - start_insert;
+  uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup;
+  // check the number we inserted
+  ASSERT_EQ(64, static_cast<int>(load_insert));
+  ASSERT_EQ(0, static_cast<int>(load_lookup));
+  ASSERT_OK(s);
+
+  ASSERT_OK(db1->Close());
+  delete db1;
+  ASSERT_OK(DB::Open(options, dbname1, &db1));
+
+  // After load, we do the Get again. To validate the cache, we do not allow any
+  // I/O, so we set the file system to false.
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  fault_fs_->SetFilesystemActive(false, error_msg);
+  start_insert = secondary_cache->num_inserts();
+  start_lookup = secondary_cache->num_lookups();
+  uint32_t cache_insert = tmp_cache->GetInsertCount();
+  uint32_t cache_lookup = tmp_cache->GetLookupcount();
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(db1->Get(ro, Key(i), &v));
+    ASSERT_EQ(v, value1[i]);
+  }
+  uint32_t final_insert = secondary_cache->num_inserts() - start_insert;
+  uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup;
+  // no insert to secondary cache
+  ASSERT_EQ(0, static_cast<int>(final_insert));
+  // lookup the secondary to get all blocks
+  ASSERT_EQ(64, static_cast<int>(final_lookup));
+  uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert;
+  uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup;
+  // Check the new block cache insert and lookup, should be no insert since all
+  // blocks are from the secondary cache.
+  ASSERT_EQ(0, static_cast<int>(block_insert));
+  ASSERT_EQ(256, static_cast<int>(block_lookup));
+  fault_fs_->SetFailGetUniqueId(false);
+  fault_fs_->SetFilesystemActive(true);
+  delete db1;
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname1, options));
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+// Test the option not to use the secondary cache in a certain DB.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) {
+  LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  fault_fs_->SetFailGetUniqueId(true);
+  options.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+  // Set the file paranoid check, so after flush, the file will be read
+  // all the blocks will be accessed.
+  options.paranoid_file_checks = true;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 6;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(Put(Key(i + 70), p_v));
+  }
+
+  ASSERT_OK(Flush());
+
+  // Flush will trigger the paranoid check and read blocks. But only block cache
+  // will be read. No operations for secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  Compact("a", "z");
+
+  // Compaction will also insert and evict blocks, no operations to the block
+  // cache. No operations for secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  std::string v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+
+  // Check the data in first block. Cache miss, direclty read from SST file.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+
+  // Check the second block.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+
+  // block cache hit
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  v = Get(Key(70));
+  ASSERT_EQ(1007, v.size());
+
+  // Check the first block in the second SST file. Cache miss and trigger SST
+  // file read. No operations for secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  v = Get(Key(75));
+  ASSERT_EQ(1007, v.size());
+
+  // Check the second block in the second SST file. Cache miss and trigger SST
+  // file read. No operations for secondary cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  Destroy(options);
+}
+
+// We disable the secondary cache in DBOptions at first. Close and reopen the DB
+// with new options, which set the lowest_used_cache_tier to
+// kNonVolatileBlockTier. So secondary cache will be used.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) {
+  LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  fault_fs_->SetFailGetUniqueId(true);
+  options.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+  // Set the file paranoid check, so after flush, the file will be read
+  // all the blocks will be accessed.
+  options.paranoid_file_checks = true;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  const int N = 6;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(Put(Key(i), p_v));
+  }
+
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(Put(Key(i + 70), p_v));
+  }
+
+  ASSERT_OK(Flush());
+
+  // Flush will trigger the paranoid check and read blocks. But only block cache
+  // will be read.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  Compact("a", "z");
+
+  // Compaction will also insert and evict blocks, no operations to the block
+  // cache.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  std::string v = Get(Key(0));
+  ASSERT_EQ(1007, v.size());
+
+  // Check the data in first block. Cache miss, direclty read from SST file.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+
+  // Check the second block.
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  v = Get(Key(5));
+  ASSERT_EQ(1007, v.size());
+
+  // block cache hit
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+
+  // Change the option to enable secondary cache after we Reopen the DB
+  options.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+  Reopen(options);
+
+  v = Get(Key(70));
+  ASSERT_EQ(1007, v.size());
+
+  // Enable the secondary cache, trigger lookup of the first block in second SST
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+
+  v = Get(Key(75));
+  ASSERT_EQ(1007, v.size());
+
+  // trigger lookup of the second block in second SST
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+  Destroy(options);
+}
+
+// Two DB test. We create 2 DBs sharing the same block cache and secondary
+// cache. We diable the secondary cache option for DB2.
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) {
+  LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */,
+                       false /* strict_capacity_limit */,
+                       0.5 /* high_pri_pool_ratio */,
+                       nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+                       kDontChargeCacheMetadata);
+  std::shared_ptr<TestSecondaryCache> secondary_cache(
+      new TestSecondaryCache(2048 * 1024));
+  opts.secondary_cache = secondary_cache;
+  std::shared_ptr<Cache> cache = NewLRUCache(opts);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  table_options.block_size = 4 * 1024;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = fault_env_.get();
+  options.paranoid_file_checks = true;
+  std::string dbname1 = test::PerThreadDBPath("db_t_1");
+  ASSERT_OK(DestroyDB(dbname1, options));
+  DB* db1 = nullptr;
+  ASSERT_OK(DB::Open(options, dbname1, &db1));
+  std::string dbname2 = test::PerThreadDBPath("db_t_2");
+  ASSERT_OK(DestroyDB(dbname2, options));
+  DB* db2 = nullptr;
+  Options options2 = options;
+  options2.lowest_used_cache_tier = CacheTier::kVolatileTier;
+  ASSERT_OK(DB::Open(options2, dbname2, &db2));
+  fault_fs_->SetFailGetUniqueId(true);
+
+  WriteOptions wo;
+  Random rnd(301);
+  const int N = 6;
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(db1->Put(wo, Key(i), p_v));
+  }
+
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 0u);
+  ASSERT_OK(db1->Flush(FlushOptions()));
+
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  for (int i = 0; i < N; i++) {
+    std::string p_v = rnd.RandomString(1007);
+    ASSERT_OK(db2->Put(wo, Key(i), p_v));
+  }
+
+  // No change in the secondary cache, since it is disabled in DB2
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+  ASSERT_OK(db2->Flush(FlushOptions()));
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  Slice bg("a");
+  Slice ed("b");
+  ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed));
+  ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed));
+
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 2u);
+
+  ReadOptions ro;
+  std::string v;
+  ASSERT_OK(db1->Get(ro, Key(0), &v));
+  ASSERT_EQ(1007, v.size());
+
+  // DB 1 has lookup block 1 and it is miss in block cache, trigger secondary
+  // cache lookup
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 3u);
+
+  ASSERT_OK(db1->Get(ro, Key(5), &v));
+  ASSERT_EQ(1007, v.size());
+
+  // DB 1 lookup the second block and it is miss in block cache, trigger
+  // secondary cache lookup
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+  ASSERT_OK(db2->Get(ro, Key(0), &v));
+  ASSERT_EQ(1007, v.size());
+
+  // For db2, it is not enabled with secondary cache, so no search in the
+  // secondary cache
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+  ASSERT_OK(db2->Get(ro, Key(5), &v));
+  ASSERT_EQ(1007, v.size());
+
+  // For db2, it is not enabled with secondary cache, so no search in the
+  // secondary cache
+  ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+  ASSERT_EQ(secondary_cache->num_lookups(), 4u);
+
+  fault_fs_->SetFailGetUniqueId(false);
+  fault_fs_->SetFilesystemActive(true);
+  delete db1;
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname1, options));
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/cache/secondary_cache.cc b/src/rocksdb/cache/secondary_cache.cc
new file mode 100644
index 000000000..84352db71
--- /dev/null
+++ b/src/rocksdb/cache/secondary_cache.cc
@@ -0,0 +1,32 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/secondary_cache.h"
+
+#include "cache/cache_entry_roles.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+size_t SliceSize(void* obj) { return static_cast<Slice*>(obj)->size(); }
+
+Status SliceSaveTo(void* from_obj, size_t from_offset, size_t length,
+                   void* out) {
+  const Slice& slice = *static_cast<Slice*>(from_obj);
+  std::memcpy(out, slice.data() + from_offset, length);
+  return Status::OK();
+}
+
+}  // namespace
+
+Status SecondaryCache::InsertSaved(const Slice& key, const Slice& saved) {
+  static Cache::CacheItemHelper helper{
+      &SliceSize, &SliceSaveTo, GetNoopDeleterForRole<CacheEntryRole::kMisc>()};
+  // NOTE: depends on Insert() being synchronous, not keeping pointer `&saved`
+  return Insert(key, const_cast<Slice*>(&saved), &helper);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/sharded_cache.cc b/src/rocksdb/cache/sharded_cache.cc
new file mode 100644
index 000000000..9ebca3ba8
--- /dev/null
+++ b/src/rocksdb/cache/sharded_cache.cc
@@ -0,0 +1,100 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "cache/sharded_cache.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits,
+                                   bool strict_capacity_limit,
+                                   std::shared_ptr<MemoryAllocator> allocator)
+    : Cache(std::move(allocator)),
+      last_id_(1),
+      shard_mask_((uint32_t{1} << num_shard_bits) - 1),
+      strict_capacity_limit_(strict_capacity_limit),
+      capacity_(capacity) {}
+
+size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const {
+  uint32_t num_shards = GetNumShards();
+  return (capacity + (num_shards - 1)) / num_shards;
+}
+
+size_t ShardedCacheBase::GetPerShardCapacity() const {
+  return ComputePerShardCapacity(GetCapacity());
+}
+
+uint64_t ShardedCacheBase::NewId() {
+  return last_id_.fetch_add(1, std::memory_order_relaxed);
+}
+
+size_t ShardedCacheBase::GetCapacity() const {
+  MutexLock l(&config_mutex_);
+  return capacity_;
+}
+
+bool ShardedCacheBase::HasStrictCapacityLimit() const {
+  MutexLock l(&config_mutex_);
+  return strict_capacity_limit_;
+}
+
+size_t ShardedCacheBase::GetUsage(Handle* handle) const {
+  return GetCharge(handle);
+}
+
+std::string ShardedCacheBase::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+  {
+    MutexLock l(&config_mutex_);
+    snprintf(buffer, kBufferSize, "    capacity : %" ROCKSDB_PRIszt "\n",
+             capacity_);
+    ret.append(buffer);
+    snprintf(buffer, kBufferSize, "    num_shard_bits : %d\n",
+             GetNumShardBits());
+    ret.append(buffer);
+    snprintf(buffer, kBufferSize, "    strict_capacity_limit : %d\n",
+             strict_capacity_limit_);
+    ret.append(buffer);
+  }
+  snprintf(buffer, kBufferSize, "    memory_allocator : %s\n",
+           memory_allocator() ? memory_allocator()->Name() : "None");
+  ret.append(buffer);
+  AppendPrintableOptions(ret);
+  return ret;
+}
+
+int GetDefaultCacheShardBits(size_t capacity, size_t min_shard_size) {
+  int num_shard_bits = 0;
+  size_t num_shards = capacity / min_shard_size;
+  while (num_shards >>= 1) {
+    if (++num_shard_bits >= 6) {
+      // No more than 6.
+      return num_shard_bits;
+    }
+  }
+  return num_shard_bits;
+}
+
+int ShardedCacheBase::GetNumShardBits() const {
+  return BitsSetToOne(shard_mask_);
+}
+
+uint32_t ShardedCacheBase::GetNumShards() const { return shard_mask_ + 1; }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cache/sharded_cache.h b/src/rocksdb/cache/sharded_cache.h
new file mode 100644
index 000000000..e3271cc7b
--- /dev/null
+++ b/src/rocksdb/cache/sharded_cache.h
@@ -0,0 +1,322 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <string>
+
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Optional base class for classes implementing the CacheShard concept
+class CacheShardBase {
+ public:
+  explicit CacheShardBase(CacheMetadataChargePolicy metadata_charge_policy)
+      : metadata_charge_policy_(metadata_charge_policy) {}
+
+  using DeleterFn = Cache::DeleterFn;
+
+  // Expected by concept CacheShard (TODO with C++20 support)
+  // Some Defaults
+  std::string GetPrintableOptions() const { return ""; }
+  using HashVal = uint64_t;
+  using HashCref = uint64_t;
+  static inline HashVal ComputeHash(const Slice& key) {
+    return GetSliceNPHash64(key);
+  }
+  static inline uint32_t HashPieceForSharding(HashCref hash) {
+    return Lower32of64(hash);
+  }
+  void AppendPrintableOptions(std::string& /*str*/) const {}
+
+  // Must be provided for concept CacheShard (TODO with C++20 support)
+  /*
+  struct HandleImpl {  // for concept HandleImpl
+    HashVal hash;
+    HashCref GetHash() const;
+    ...
+  };
+  Status Insert(const Slice& key, HashCref hash, void* value, size_t charge,
+                DeleterFn deleter, HandleImpl** handle,
+                Cache::Priority priority) = 0;
+  Status Insert(const Slice& key, HashCref hash, void* value,
+                const Cache::CacheItemHelper* helper, size_t charge,
+                HandleImpl** handle, Cache::Priority priority) = 0;
+  HandleImpl* Lookup(const Slice& key, HashCref hash) = 0;
+  HandleImpl* Lookup(const Slice& key, HashCref hash,
+                        const Cache::CacheItemHelper* helper,
+                        const Cache::CreateCallback& create_cb,
+                        Cache::Priority priority, bool wait,
+                        Statistics* stats) = 0;
+  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0;
+  bool IsReady(HandleImpl* handle) = 0;
+  void Wait(HandleImpl* handle) = 0;
+  bool Ref(HandleImpl* handle) = 0;
+  void Erase(const Slice& key, HashCref hash) = 0;
+  void SetCapacity(size_t capacity) = 0;
+  void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
+  size_t GetUsage() const = 0;
+  size_t GetPinnedUsage() const = 0;
+  size_t GetOccupancyCount() const = 0;
+  size_t GetTableAddressCount() const = 0;
+  // Handles iterating over roughly `average_entries_per_lock` entries, using
+  // `state` to somehow record where it last ended up. Caller initially uses
+  // *state == 0 and implementation sets *state = SIZE_MAX to indicate
+  // completion.
+  void ApplyToSomeEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               DeleterFn deleter)>& callback,
+      size_t average_entries_per_lock, size_t* state) = 0;
+  void EraseUnRefEntries() = 0;
+  */
+
+ protected:
+  const CacheMetadataChargePolicy metadata_charge_policy_;
+};
+
+// Portions of ShardedCache that do not depend on the template parameter
+class ShardedCacheBase : public Cache {
+ public:
+  ShardedCacheBase(size_t capacity, int num_shard_bits,
+                   bool strict_capacity_limit,
+                   std::shared_ptr<MemoryAllocator> memory_allocator);
+  virtual ~ShardedCacheBase() = default;
+
+  int GetNumShardBits() const;
+  uint32_t GetNumShards() const;
+
+  uint64_t NewId() override;
+
+  bool HasStrictCapacityLimit() const override;
+  size_t GetCapacity() const override;
+
+  using Cache::GetUsage;
+  size_t GetUsage(Handle* handle) const override;
+  std::string GetPrintableOptions() const override;
+
+ protected:  // fns
+  virtual void AppendPrintableOptions(std::string& str) const = 0;
+  size_t GetPerShardCapacity() const;
+  size_t ComputePerShardCapacity(size_t capacity) const;
+
+ protected:                        // data
+  std::atomic<uint64_t> last_id_;  // For NewId
+  const uint32_t shard_mask_;
+
+  // Dynamic configuration parameters, guarded by config_mutex_
+  bool strict_capacity_limit_;
+  size_t capacity_;
+  mutable port::Mutex config_mutex_;
+};
+
+// Generic cache interface that shards cache by hash of keys. 2^num_shard_bits
+// shards will be created, with capacity split evenly to each of the shards.
+// Keys are typically sharded by the lowest num_shard_bits bits of hash value
+// so that the upper bits of the hash value can keep a stable ordering of
+// table entries even as the table grows (using more upper hash bits).
+// See CacheShardBase above for what is expected of the CacheShard parameter.
+template <class CacheShard>
+class ShardedCache : public ShardedCacheBase {
+ public:
+  using HashVal = typename CacheShard::HashVal;
+  using HashCref = typename CacheShard::HashCref;
+  using HandleImpl = typename CacheShard::HandleImpl;
+
+  ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
+               std::shared_ptr<MemoryAllocator> allocator)
+      : ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit,
+                         allocator),
+        shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc(
+            sizeof(CacheShard) * GetNumShards()))),
+        destroy_shards_in_dtor_(false) {}
+
+  virtual ~ShardedCache() {
+    if (destroy_shards_in_dtor_) {
+      ForEachShard([](CacheShard* cs) { cs->~CacheShard(); });
+    }
+    port::cacheline_aligned_free(shards_);
+  }
+
+  CacheShard& GetShard(HashCref hash) {
+    return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
+  }
+
+  const CacheShard& GetShard(HashCref hash) const {
+    return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
+  }
+
+  void SetCapacity(size_t capacity) override {
+    MutexLock l(&config_mutex_);
+    capacity_ = capacity;
+    auto per_shard = ComputePerShardCapacity(capacity);
+    ForEachShard([=](CacheShard* cs) { cs->SetCapacity(per_shard); });
+  }
+
+  void SetStrictCapacityLimit(bool s_c_l) override {
+    MutexLock l(&config_mutex_);
+    strict_capacity_limit_ = s_c_l;
+    ForEachShard(
+        [s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); });
+  }
+
+  Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
+                Handle** handle, Priority priority) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    auto h_out = reinterpret_cast<HandleImpl**>(handle);
+    return GetShard(hash).Insert(key, hash, value, charge, deleter, h_out,
+                                 priority);
+  }
+  Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
+                size_t charge, Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override {
+    if (!helper) {
+      return Status::InvalidArgument();
+    }
+    HashVal hash = CacheShard::ComputeHash(key);
+    auto h_out = reinterpret_cast<HandleImpl**>(handle);
+    return GetShard(hash).Insert(key, hash, value, helper, charge, h_out,
+                                 priority);
+  }
+
+  Handle* Lookup(const Slice& key, Statistics* /*stats*/) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    HandleImpl* result = GetShard(hash).Lookup(key, hash);
+    return reinterpret_cast<Handle*>(result);
+  }
+  Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
+                 const CreateCallback& create_cb, Priority priority, bool wait,
+                 Statistics* stats = nullptr) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_cb,
+                                               priority, wait, stats);
+    return reinterpret_cast<Handle*>(result);
+  }
+
+  void Erase(const Slice& key) override {
+    HashVal hash = CacheShard::ComputeHash(key);
+    GetShard(hash).Erase(key, hash);
+  }
+
+  bool Release(Handle* handle, bool useful,
+               bool erase_if_last_ref = false) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref);
+  }
+  bool IsReady(Handle* handle) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    return GetShard(h->GetHash()).IsReady(h);
+  }
+  void Wait(Handle* handle) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    GetShard(h->GetHash()).Wait(h);
+  }
+  bool Ref(Handle* handle) override {
+    auto h = reinterpret_cast<HandleImpl*>(handle);
+    return GetShard(h->GetHash()).Ref(h);
+  }
+  bool Release(Handle* handle, bool erase_if_last_ref = false) override {
+    return Release(handle, true /*useful*/, erase_if_last_ref);
+  }
+  using ShardedCacheBase::GetUsage;
+  size_t GetUsage() const override {
+    return SumOverShards2(&CacheShard::GetUsage);
+  }
+  size_t GetPinnedUsage() const override {
+    return SumOverShards2(&CacheShard::GetPinnedUsage);
+  }
+  size_t GetOccupancyCount() const override {
+    return SumOverShards2(&CacheShard::GetPinnedUsage);
+  }
+  size_t GetTableAddressCount() const override {
+    return SumOverShards2(&CacheShard::GetTableAddressCount);
+  }
+  void ApplyToAllEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               DeleterFn deleter)>& callback,
+      const ApplyToAllEntriesOptions& opts) override {
+    uint32_t num_shards = GetNumShards();
+    // Iterate over part of each shard, rotating between shards, to
+    // minimize impact on latency of concurrent operations.
+    std::unique_ptr<size_t[]> states(new size_t[num_shards]{});
+
+    size_t aepl = opts.average_entries_per_lock;
+    aepl = std::min(aepl, size_t{1});
+
+    bool remaining_work;
+    do {
+      remaining_work = false;
+      for (uint32_t i = 0; i < num_shards; i++) {
+        if (states[i] != SIZE_MAX) {
+          shards_[i].ApplyToSomeEntries(callback, aepl, &states[i]);
+          remaining_work |= states[i] != SIZE_MAX;
+        }
+      }
+    } while (remaining_work);
+  }
+
+  virtual void EraseUnRefEntries() override {
+    ForEachShard([](CacheShard* cs) { cs->EraseUnRefEntries(); });
+  }
+
+  void DisownData() override {
+    // Leak data only if that won't generate an ASAN/valgrind warning.
+    if (!kMustFreeHeapAllocations) {
+      destroy_shards_in_dtor_ = false;
+    }
+  }
+
+ protected:
+  inline void ForEachShard(const std::function<void(CacheShard*)>& fn) {
+    uint32_t num_shards = GetNumShards();
+    for (uint32_t i = 0; i < num_shards; i++) {
+      fn(shards_ + i);
+    }
+  }
+
+  inline size_t SumOverShards(
+      const std::function<size_t(CacheShard&)>& fn) const {
+    uint32_t num_shards = GetNumShards();
+    size_t result = 0;
+    for (uint32_t i = 0; i < num_shards; i++) {
+      result += fn(shards_[i]);
+    }
+    return result;
+  }
+
+  inline size_t SumOverShards2(size_t (CacheShard::*fn)() const) const {
+    return SumOverShards([fn](CacheShard& cs) { return (cs.*fn)(); });
+  }
+
+  // Must be called exactly once by derived class constructor
+  void InitShards(const std::function<void(CacheShard*)>& placement_new) {
+    ForEachShard(placement_new);
+    destroy_shards_in_dtor_ = true;
+  }
+
+  void AppendPrintableOptions(std::string& str) const override {
+    shards_[0].AppendPrintableOptions(str);
+  }
+
+ private:
+  CacheShard* const shards_;
+  bool destroy_shards_in_dtor_;
+};
+
+// 512KB is traditional minimum shard size.
+int GetDefaultCacheShardBits(size_t capacity,
+                             size_t min_shard_size = 512U * 1024U);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/cmake/RocksDBConfig.cmake.in b/src/rocksdb/cmake/RocksDBConfig.cmake.in
new file mode 100644
index 000000000..0bd14be11
--- /dev/null
+++ b/src/rocksdb/cmake/RocksDBConfig.cmake.in
@@ -0,0 +1,54 @@
+@PACKAGE_INIT@
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules")
+
+include(CMakeFindDependencyMacro)
+
+set(GFLAGS_USE_TARGET_NAMESPACE @GFLAGS_USE_TARGET_NAMESPACE@)
+
+if(@WITH_JEMALLOC@)
+  find_dependency(JeMalloc)
+endif()
+
+if(@WITH_GFLAGS@)
+  find_dependency(gflags CONFIG)
+  if(NOT gflags_FOUND)
+    find_dependency(gflags)
+  endif()
+endif()
+
+if(@WITH_SNAPPY@)
+  find_dependency(Snappy CONFIG)
+  if(NOT Snappy_FOUND)
+    find_dependency(Snappy)
+  endif()
+endif()
+
+if(@WITH_ZLIB@)
+  find_dependency(ZLIB)
+endif()
+
+if(@WITH_BZ2@)
+  find_dependency(BZip2)
+endif()
+
+if(@WITH_LZ4@)
+  find_dependency(lz4)
+endif()
+
+if(@WITH_ZSTD@)
+  find_dependency(zstd)
+endif()
+
+if(@WITH_NUMA@)
+  find_dependency(NUMA)
+endif()
+
+if(@WITH_TBB@)
+  find_dependency(TBB)
+endif()
+
+find_dependency(Threads)
+
+include("${CMAKE_CURRENT_LIST_DIR}/RocksDBTargets.cmake")
+check_required_components(RocksDB)
diff --git a/src/rocksdb/cmake/modules/CxxFlags.cmake b/src/rocksdb/cmake/modules/CxxFlags.cmake
new file mode 100644
index 000000000..7980cca70
--- /dev/null
+++ b/src/rocksdb/cmake/modules/CxxFlags.cmake
@@ -0,0 +1,7 @@
+macro(get_cxx_std_flags FLAGS_VARIABLE)
+  if( CMAKE_CXX_STANDARD_REQUIRED )
+    set(${FLAGS_VARIABLE} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION})
+  else()
+    set(${FLAGS_VARIABLE} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION})
+  endif()
+endmacro()
diff --git a/src/rocksdb/cmake/modules/FindJeMalloc.cmake b/src/rocksdb/cmake/modules/FindJeMalloc.cmake
new file mode 100644
index 000000000..f695b3ed1
--- /dev/null
+++ b/src/rocksdb/cmake/modules/FindJeMalloc.cmake
@@ -0,0 +1,29 @@
+# - Find JeMalloc library
+# Find the native JeMalloc includes and library
+#
+# JeMalloc_INCLUDE_DIRS - where to find jemalloc.h, etc.
+# JeMalloc_LIBRARIES - List of libraries when using jemalloc.
+# JeMalloc_FOUND - True if jemalloc found.
+
+find_path(JeMalloc_INCLUDE_DIRS
+  NAMES jemalloc/jemalloc.h
+  HINTS ${JEMALLOC_ROOT_DIR}/include)
+
+find_library(JeMalloc_LIBRARIES
+  NAMES jemalloc
+  HINTS ${JEMALLOC_ROOT_DIR}/lib)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(JeMalloc DEFAULT_MSG JeMalloc_LIBRARIES JeMalloc_INCLUDE_DIRS)
+
+mark_as_advanced(
+  JeMalloc_LIBRARIES
+  JeMalloc_INCLUDE_DIRS)
+
+if(JeMalloc_FOUND AND NOT (TARGET JeMalloc::JeMalloc))
+  add_library (JeMalloc::JeMalloc UNKNOWN IMPORTED)
+  set_target_properties(JeMalloc::JeMalloc
+    PROPERTIES
+      IMPORTED_LOCATION ${JeMalloc_LIBRARIES}
+      INTERFACE_INCLUDE_DIRECTORIES ${JeMalloc_INCLUDE_DIRS})
+endif()
diff --git a/src/rocksdb/cmake/modules/FindNUMA.cmake b/src/rocksdb/cmake/modules/FindNUMA.cmake
new file mode 100644
index 000000000..69b95c9b6
--- /dev/null
+++ b/src/rocksdb/cmake/modules/FindNUMA.cmake
@@ -0,0 +1,29 @@
+# - Find NUMA
+# Find the NUMA library and includes
+#
+# NUMA_INCLUDE_DIRS - where to find numa.h, etc.
+# NUMA_LIBRARIES - List of libraries when using NUMA.
+# NUMA_FOUND - True if NUMA found.
+
+find_path(NUMA_INCLUDE_DIRS
+  NAMES numa.h numaif.h
+  HINTS ${NUMA_ROOT_DIR}/include)
+
+find_library(NUMA_LIBRARIES
+  NAMES numa
+  HINTS ${NUMA_ROOT_DIR}/lib)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIRS)
+
+mark_as_advanced(
+  NUMA_LIBRARIES
+  NUMA_INCLUDE_DIRS)
+
+if(NUMA_FOUND AND NOT (TARGET NUMA::NUMA))
+  add_library (NUMA::NUMA UNKNOWN IMPORTED)
+  set_target_properties(NUMA::NUMA
+    PROPERTIES
+      IMPORTED_LOCATION ${NUMA_LIBRARIES}
+      INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS})
+endif()
diff --git a/src/rocksdb/cmake/modules/FindSnappy.cmake b/src/rocksdb/cmake/modules/FindSnappy.cmake
new file mode 100644
index 000000000..0b0dbf861
--- /dev/null
+++ b/src/rocksdb/cmake/modules/FindSnappy.cmake
@@ -0,0 +1,29 @@
+# - Find Snappy
+# Find the snappy compression library and includes
+#
+# Snappy_INCLUDE_DIRS - where to find snappy.h, etc.
+# Snappy_LIBRARIES - List of libraries when using snappy.
+# Snappy_FOUND - True if snappy found.
+
+find_path(Snappy_INCLUDE_DIRS
+  NAMES snappy.h
+  HINTS ${snappy_ROOT_DIR}/include)
+
+find_library(Snappy_LIBRARIES
+  NAMES snappy
+  HINTS ${snappy_ROOT_DIR}/lib)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Snappy DEFAULT_MSG Snappy_LIBRARIES Snappy_INCLUDE_DIRS)
+
+mark_as_advanced(
+  Snappy_LIBRARIES
+  Snappy_INCLUDE_DIRS)
+
+if(Snappy_FOUND AND NOT (TARGET Snappy::snappy))
+  add_library (Snappy::snappy UNKNOWN IMPORTED)
+  set_target_properties(Snappy::snappy
+    PROPERTIES
+      IMPORTED_LOCATION ${Snappy_LIBRARIES}
+      INTERFACE_INCLUDE_DIRECTORIES ${Snappy_INCLUDE_DIRS})
+endif()
diff --git a/src/rocksdb/cmake/modules/FindTBB.cmake b/src/rocksdb/cmake/modules/FindTBB.cmake
new file mode 100644
index 000000000..f6861fa55
--- /dev/null
+++ b/src/rocksdb/cmake/modules/FindTBB.cmake
@@ -0,0 +1,33 @@
+# - Find TBB
+# Find the Thread Building Blocks library and includes
+#
+# TBB_INCLUDE_DIRS - where to find tbb.h, etc.
+# TBB_LIBRARIES - List of libraries when using TBB.
+# TBB_FOUND - True if TBB found.
+
+if(NOT DEFINED TBB_ROOT_DIR)
+  set(TBB_ROOT_DIR "$ENV{TBBROOT}")
+endif()
+
+find_path(TBB_INCLUDE_DIRS
+  NAMES tbb/tbb.h
+  HINTS ${TBB_ROOT_DIR}/include)
+
+find_library(TBB_LIBRARIES
+  NAMES tbb
+  HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIRS)
+
+mark_as_advanced(
+  TBB_LIBRARIES
+  TBB_INCLUDE_DIRS)
+
+if(TBB_FOUND AND NOT (TARGET TBB::TBB))
+  add_library (TBB::TBB UNKNOWN IMPORTED)
+  set_target_properties(TBB::TBB
+    PROPERTIES
+      IMPORTED_LOCATION ${TBB_LIBRARIES}
+      INTERFACE_INCLUDE_DIRECTORIES ${TBB_INCLUDE_DIRS})
+endif()
diff --git a/src/rocksdb/cmake/modules/Findgflags.cmake b/src/rocksdb/cmake/modules/Findgflags.cmake
new file mode 100644
index 000000000..786def27b
--- /dev/null
+++ b/src/rocksdb/cmake/modules/Findgflags.cmake
@@ -0,0 +1,29 @@
+# - Find gflags library
+# Find the gflags includes and library
+#
+# GFLAGS_INCLUDE_DIR - where to find gflags.h.
+# GFLAGS_LIBRARIES - List of libraries when using gflags.
+# gflags_FOUND - True if gflags found.
+
+find_path(GFLAGS_INCLUDE_DIR
+  NAMES gflags/gflags.h)
+
+find_library(GFLAGS_LIBRARIES
+  NAMES gflags)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(gflags
+  DEFAULT_MSG GFLAGS_LIBRARIES GFLAGS_INCLUDE_DIR)
+
+mark_as_advanced(
+  GFLAGS_LIBRARIES
+  GFLAGS_INCLUDE_DIR)
+
+if(gflags_FOUND AND NOT (TARGET gflags::gflags))
+  add_library(gflags::gflags UNKNOWN IMPORTED)
+  set_target_properties(gflags::gflags
+    PROPERTIES
+      IMPORTED_LOCATION ${GFLAGS_LIBRARIES}
+      INTERFACE_INCLUDE_DIRECTORIES ${GFLAGS_INCLUDE_DIR}
+      IMPORTED_LINK_INTERFACE_LANGUAGES "CXX")
+endif()
diff --git a/src/rocksdb/cmake/modules/Findlz4.cmake b/src/rocksdb/cmake/modules/Findlz4.cmake
new file mode 100644
index 000000000..7cf7d7f5f
--- /dev/null
+++ b/src/rocksdb/cmake/modules/Findlz4.cmake
@@ -0,0 +1,29 @@
+# - Find Lz4
+# Find the lz4 compression library and includes
+#
+# lz4_INCLUDE_DIRS - where to find lz4.h, etc.
+# lz4_LIBRARIES - List of libraries when using lz4.
+# lz4_FOUND - True if lz4 found.
+
+find_path(lz4_INCLUDE_DIRS
+  NAMES lz4.h
+  HINTS ${lz4_ROOT_DIR}/include)
+
+find_library(lz4_LIBRARIES
+  NAMES lz4
+  HINTS ${lz4_ROOT_DIR}/lib)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(lz4 DEFAULT_MSG lz4_LIBRARIES lz4_INCLUDE_DIRS)
+
+mark_as_advanced(
+  lz4_LIBRARIES
+  lz4_INCLUDE_DIRS)
+
+if(lz4_FOUND AND NOT (TARGET lz4::lz4))
+  add_library(lz4::lz4 UNKNOWN IMPORTED)
+  set_target_properties(lz4::lz4
+    PROPERTIES
+      IMPORTED_LOCATION ${lz4_LIBRARIES}
+      INTERFACE_INCLUDE_DIRECTORIES ${lz4_INCLUDE_DIRS})
+endif()
diff --git a/src/rocksdb/cmake/modules/Finduring.cmake b/src/rocksdb/cmake/modules/Finduring.cmake
new file mode 100644
index 000000000..8cb14cb27
--- /dev/null
+++ b/src/rocksdb/cmake/modules/Finduring.cmake
@@ -0,0 +1,26 @@
+# - Find liburing
+#
+# uring_INCLUDE_DIR - Where to find liburing.h
+# uring_LIBRARIES - List of libraries when using uring.
+# uring_FOUND - True if uring found.
+
+find_path(uring_INCLUDE_DIR
+  NAMES liburing.h)
+find_library(uring_LIBRARIES
+  NAMES liburing.a liburing)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(uring
+  DEFAULT_MSG uring_LIBRARIES uring_INCLUDE_DIR)
+
+mark_as_advanced(
+  uring_INCLUDE_DIR
+  uring_LIBRARIES)
+
+if(uring_FOUND AND NOT TARGET uring::uring)
+  add_library(uring::uring UNKNOWN IMPORTED)
+  set_target_properties(uring::uring PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${uring_INCLUDE_DIR}"
+    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
+    IMPORTED_LOCATION "${uring_LIBRARIES}")
+endif()
diff --git a/src/rocksdb/cmake/modules/Findzstd.cmake b/src/rocksdb/cmake/modules/Findzstd.cmake
new file mode 100644
index 000000000..9430821df
--- /dev/null
+++ b/src/rocksdb/cmake/modules/Findzstd.cmake
@@ -0,0 +1,29 @@
+# - Find zstd
+# Find the zstd compression library and includes
+#
+# zstd_INCLUDE_DIRS - where to find zstd.h, etc.
+# zstd_LIBRARIES - List of libraries when using zstd.
+# zstd_FOUND - True if zstd found.
+
+find_path(zstd_INCLUDE_DIRS
+  NAMES zstd.h
+  HINTS ${zstd_ROOT_DIR}/include)
+
+find_library(zstd_LIBRARIES
+  NAMES zstd
+  HINTS ${zstd_ROOT_DIR}/lib)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(zstd DEFAULT_MSG zstd_LIBRARIES zstd_INCLUDE_DIRS)
+
+mark_as_advanced(
+  zstd_LIBRARIES
+  zstd_INCLUDE_DIRS)
+
+if(zstd_FOUND AND NOT (TARGET zstd::zstd))
+  add_library (zstd::zstd UNKNOWN IMPORTED)
+  set_target_properties(zstd::zstd
+    PROPERTIES
+      IMPORTED_LOCATION ${zstd_LIBRARIES}
+      INTERFACE_INCLUDE_DIRECTORIES ${zstd_INCLUDE_DIRS})
+endif()
diff --git a/src/rocksdb/cmake/modules/ReadVersion.cmake b/src/rocksdb/cmake/modules/ReadVersion.cmake
new file mode 100644
index 000000000..ebfd7d6f9
--- /dev/null
+++ b/src/rocksdb/cmake/modules/ReadVersion.cmake
@@ -0,0 +1,10 @@
+# Read rocksdb version from version.h header file.
+
+function(get_rocksdb_version version_var)
+  file(READ "${CMAKE_CURRENT_SOURCE_DIR}/include/rocksdb/version.h" version_header_file)
+  foreach(component MAJOR MINOR PATCH)
+    string(REGEX MATCH "#define ROCKSDB_${component} ([0-9]+)" _ ${version_header_file})
+    set(ROCKSDB_VERSION_${component} ${CMAKE_MATCH_1})
+  endforeach()
+  set(${version_var} "${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}" PARENT_SCOPE)
+endfunction()
diff --git a/src/rocksdb/common.mk b/src/rocksdb/common.mk
new file mode 100644
index 000000000..85c99fcec
--- /dev/null
+++ b/src/rocksdb/common.mk
@@ -0,0 +1,30 @@
+ifndef PYTHON
+
+# Default to python3. Some distros like CentOS 8 do not have `python`.
+ifeq ($(origin PYTHON), undefined)
+	PYTHON := $(shell which python3 || which python || echo python3)
+endif
+export PYTHON
+
+endif
+
+# To setup tmp directory, first recognize some old variables for setting
+# test tmp directory or base tmp directory. TEST_TMPDIR is usually read
+# by RocksDB tools though Env/FileSystem::GetTestDirectory.
+ifeq ($(TEST_TMPDIR),)
+TEST_TMPDIR := $(TMPD)
+endif
+ifeq ($(TEST_TMPDIR),)
+ifeq ($(BASE_TMPDIR),)
+BASE_TMPDIR :=$(TMPDIR)
+endif
+ifeq ($(BASE_TMPDIR),)
+BASE_TMPDIR :=/tmp
+endif
+# Use /dev/shm if it has the sticky bit set (otherwise, /tmp or other
+# base dir), and create a randomly-named rocksdb.XXXX directory therein.
+TEST_TMPDIR := $(shell f=/dev/shm; test -k $$f || f=$(BASE_TMPDIR); \
+  perl -le 'use File::Temp "tempdir";'	                            \
+    -e 'print tempdir("'$$f'/rocksdb.XXXX", CLEANUP => 0)')
+endif
+export TEST_TMPDIR
diff --git a/src/rocksdb/coverage/coverage_test.sh b/src/rocksdb/coverage/coverage_test.sh
new file mode 100755
index 000000000..aa5f68c77
--- /dev/null
+++ b/src/rocksdb/coverage/coverage_test.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+# Exit on error.
+set -e
+
+if [ -n "$USE_CLANG" ]; then
+  echo "Error: Coverage test is supported only for gcc."
+  exit 1
+fi
+
+ROOT=".."
+# Fetch right version of gcov
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+  source $ROOT/build_tools/fbcode_config_platform009.sh
+  GCOV=$GCC_BASE/bin/gcov
+else
+  GCOV=$(which gcov)
+fi
+echo -e "Using $GCOV"
+
+COVERAGE_DIR="$PWD/COVERAGE_REPORT"
+mkdir -p $COVERAGE_DIR
+
+# Find all gcno files to generate the coverage report
+
+PYTHON=${1:-`which python3`}
+echo -e "Using $PYTHON"
+GCNO_FILES=`find $ROOT -name "*.gcno"`
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  # Parse the raw gcov report to more human readable form.
+  $PYTHON $ROOT/coverage/parse_gcov_output.py |
+  # Write the output to both stdout and report file.
+  tee $COVERAGE_DIR/coverage_report_all.txt &&
+echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
+
+# TODO: we also need to get the files of the latest commits.
+# Get the most recently committed files.
+LATEST_FILES=`
+  git show --pretty="format:" --name-only HEAD |
+  grep -v "^$" |
+  paste -s -d,`
+RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
+
+echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  $PYTHON $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES |
+  tee -a $RECENT_REPORT &&
+echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
+
+# Unless otherwise specified, we'll not generate html report by default
+if [ -z "$HTML" ]; then
+  exit 0
+fi
+
+# Generate the html report. If we cannot find lcov in this machine, we'll simply
+# skip this step.
+echo "Generating the html coverage report..."
+
+LCOV=$(which lcov || true 2>/dev/null)
+if [ -z $LCOV ]
+then
+  echo "Skip: Cannot find lcov to generate the html report."
+  exit 0
+fi
+
+LCOV_VERSION=$(lcov -v | grep 1.1 || true)
+if [ $LCOV_VERSION ]
+then
+  echo "Not supported lcov version. Expect lcov 1.1."
+  exit 0
+fi
+
+(cd $ROOT; lcov --no-external \
+     --capture  \
+     --directory $PWD \
+     --gcov-tool $GCOV \
+     --output-file $COVERAGE_DIR/coverage.info)
+
+genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
+
+echo "HTML Coverage report is generated in $COVERAGE_DIR"
diff --git a/src/rocksdb/coverage/parse_gcov_output.py b/src/rocksdb/coverage/parse_gcov_output.py
new file mode 100644
index 000000000..b9788ec81
--- /dev/null
+++ b/src/rocksdb/coverage/parse_gcov_output.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from __future__ import print_function
+
+import optparse
+import re
+import sys
+
+# the gcov report follows certain pattern. Each file will have two lines
+# of report, from which we can extract the file name, total lines and coverage
+# percentage.
+def parse_gcov_report(gcov_input):
+    per_file_coverage = {}
+    total_coverage = None
+
+    for line in sys.stdin:
+        line = line.strip()
+
+        # --First line of the coverage report (with file name in it)?
+        match_obj = re.match("^File '(.*)'$", line)
+        if match_obj:
+            # fetch the file name from the first line of the report.
+            current_file = match_obj.group(1)
+            continue
+
+        # -- Second line of the file report (with coverage percentage)
+        match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
+
+        if match_obj:
+            coverage = float(match_obj.group(1))
+            lines = int(match_obj.group(2))
+
+            if current_file is not None:
+                per_file_coverage[current_file] = (coverage, lines)
+                current_file = None
+            else:
+                # If current_file is not set, we reach the last line of report,
+                # which contains the summarized coverage percentage.
+                total_coverage = (coverage, lines)
+            continue
+
+        # If the line's pattern doesn't fall into the above categories. We
+        # can simply ignore them since they're either empty line or doesn't
+        # find executable lines of the given file.
+        current_file = None
+
+    return per_file_coverage, total_coverage
+
+
+def get_option_parser():
+    usage = (
+        "Parse the gcov output and generate more human-readable code "
+        + "coverage report."
+    )
+    parser = optparse.OptionParser(usage)
+
+    parser.add_option(
+        "--interested-files",
+        "-i",
+        dest="filenames",
+        help="Comma separated files names. if specified, we will display "
+        + "the coverage report only for interested source files. "
+        + "Otherwise we will display the coverage report for all "
+        + "source files.",
+    )
+    return parser
+
+
+def display_file_coverage(per_file_coverage, total_coverage):
+    # To print out auto-adjustable column, we need to know the longest
+    # length of file names.
+    max_file_name_length = max(len(fname) for fname in per_file_coverage.keys())
+
+    # -- Print header
+    # size of separator is determined by 3 column sizes:
+    # file name, coverage percentage and lines.
+    header_template = "%" + str(max_file_name_length) + "s\t%s\t%s"
+    separator = "-" * (max_file_name_length + 10 + 20)
+    print(
+        header_template % ("Filename", "Coverage", "Lines")
+    )  # noqa: E999 T25377293 Grandfathered in
+    print(separator)
+
+    # -- Print body
+    # template for printing coverage report for each file.
+    record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
+
+    for fname, coverage_info in per_file_coverage.items():
+        coverage, lines = coverage_info
+        print(record_template % (fname, coverage, lines))
+
+    # -- Print footer
+    if total_coverage:
+        print(separator)
+        print(record_template % ("Total", total_coverage[0], total_coverage[1]))
+
+
+def report_coverage():
+    parser = get_option_parser()
+    (options, args) = parser.parse_args()
+
+    interested_files = set()
+    if options.filenames is not None:
+        interested_files = {f.strip() for f in options.filenames.split(",")}
+
+    # To make things simple, right now we only read gcov report from the input
+    per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
+
+    # Check if we need to display coverage info for interested files.
+    if len(interested_files):
+        per_file_coverage = dict(
+            (fname, per_file_coverage[fname])
+            for fname in interested_files
+            if fname in per_file_coverage
+        )
+        # If we only interested in several files, it makes no sense to report
+        # the total_coverage
+        total_coverage = None
+
+    if not len(per_file_coverage):
+        print("Cannot find coverage info for the given files.", file=sys.stderr)
+        return
+    display_file_coverage(per_file_coverage, total_coverage)
+
+
+if __name__ == "__main__":
+    report_coverage()
diff --git a/src/rocksdb/crash_test.mk b/src/rocksdb/crash_test.mk
new file mode 100644
index 000000000..5e8b3573a
--- /dev/null
+++ b/src/rocksdb/crash_test.mk
@@ -0,0 +1,107 @@
+# This file is used by Meta-internal infrastructure as well as by Makefile
+
+# When included from Makefile, there are rules to build DB_STRESS_CMD. When
+# used directly with `make -f crashtest.mk ...` there will be no rules to
+# build DB_STRESS_CMD so it must exist prior.
+DB_STRESS_CMD?=./db_stress
+
+include common.mk
+
+CRASHTEST_MAKE=$(MAKE) -f crash_test.mk
+CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)'
+
+.PHONY: crash_test crash_test_with_atomic_flush crash_test_with_txn \
+	crash_test_with_best_efforts_recovery crash_test_with_ts \
+	blackbox_crash_test blackbox_crash_test_with_atomic_flush \
+	blackbox_crash_test_with_txn blackbox_crash_test_with_ts \
+	blackbox_crash_test_with_best_efforts_recovery \
+	whitebox_crash_test whitebox_crash_test_with_atomic_flush \
+	whitebox_crash_test_with_txn whitebox_crash_test_with_ts \
+	blackbox_crash_test_with_multiops_wc_txn \
+	blackbox_crash_test_with_multiops_wp_txn \
+	crash_test_with_tiered_storage blackbox_crash_test_with_tiered_storage \
+	whitebox_crash_test_with_tiered_storage \
+
+crash_test: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test
+	$(CRASHTEST_MAKE) blackbox_crash_test
+
+crash_test_with_atomic_flush: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_atomic_flush
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_atomic_flush
+
+crash_test_with_txn: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_txn
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_txn
+
+crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery
+
+crash_test_with_ts: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_ts
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_ts
+
+crash_test_with_tiered_storage: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_tiered_storage
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_tiered_storage
+
+crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wc_txn
+
+crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wp_txn
+
+blackbox_crash_test: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --simple blackbox $(CRASH_TEST_EXT_ARGS)
+	$(CRASHTEST_PY) blackbox $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_atomic_flush: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn blackbox $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_best_efforts_recovery: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_ts: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --enable_ts blackbox $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --test_multiops_txn --write_policy write_committed blackbox $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --test_multiops_txn --write_policy write_prepared blackbox $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --test_tiered_storage blackbox $(CRASH_TEST_EXT_ARGS)
+
+ifeq ($(CRASH_TEST_KILL_ODD),)
+  CRASH_TEST_KILL_ODD=888887
+endif
+
+whitebox_crash_test: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --simple whitebox --random_kill_odd \
+      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+	$(CRASHTEST_PY) whitebox  --random_kill_odd \
+      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+whitebox_crash_test_with_atomic_flush: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --cf_consistency whitebox  --random_kill_odd \
+      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+whitebox_crash_test_with_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn whitebox --random_kill_odd \
+      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+whitebox_crash_test_with_ts: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --enable_ts whitebox --random_kill_odd \
+      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+whitebox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --test_tiered_storage whitebox --random_kill_odd \
+      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.cc b/src/rocksdb/db/arena_wrapped_db_iter.cc
new file mode 100644
index 000000000..607403ccc
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.cc
@@ -0,0 +1,160 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/arena_wrapped_db_iter.h"
+
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
+                                       std::string* prop) {
+  if (prop_name == "rocksdb.iterator.super-version-number") {
+    // First try to pass the value returned from inner iterator.
+    if (!db_iter_->GetProperty(prop_name, prop).ok()) {
+      *prop = std::to_string(sv_number_);
+    }
+    return Status::OK();
+  }
+  return db_iter_->GetProperty(prop_name, prop);
+}
+
+void ArenaWrappedDBIter::Init(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, const Version* version,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
+    uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
+    ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
+  auto mem = arena_.AllocateAligned(sizeof(DBIter));
+  db_iter_ =
+      new (mem) DBIter(env, read_options, ioptions, mutable_cf_options,
+                       ioptions.user_comparator, /* iter */ nullptr, version,
+                       sequence, true, max_sequential_skip_in_iteration,
+                       read_callback, db_impl, cfd, expose_blob_index);
+  sv_number_ = version_number;
+  read_options_ = read_options;
+  allow_refresh_ = allow_refresh;
+  memtable_range_tombstone_iter_ = nullptr;
+}
+
+Status ArenaWrappedDBIter::Refresh() {
+  if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) {
+    return Status::NotSupported("Creating renew iterator is not allowed.");
+  }
+  assert(db_iter_ != nullptr);
+  // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
+  // correct behavior. Will be corrected automatically when we take a snapshot
+  // here for the case of WritePreparedTxnDB.
+  uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
+  TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
+  TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
+  auto reinit_internal_iter = [&]() {
+    Env* env = db_iter_->env();
+    db_iter_->~DBIter();
+    arena_.~Arena();
+    new (&arena_) Arena();
+
+    SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
+    SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+    if (read_callback_) {
+      read_callback_->Refresh(latest_seq);
+    }
+    Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
+         sv->current, latest_seq,
+         sv->mutable_cf_options.max_sequential_skip_in_iterations,
+         cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_,
+         allow_refresh_);
+
+    InternalIterator* internal_iter = db_impl_->NewInternalIterator(
+        read_options_, cfd_, sv, &arena_, latest_seq,
+        /* allow_unprepared_value */ true, /* db_iter */ this);
+    SetIterUnderDBIter(internal_iter);
+  };
+  while (true) {
+    if (sv_number_ != cur_sv_number) {
+      reinit_internal_iter();
+      break;
+    } else {
+      SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+      // Refresh range-tombstones in MemTable
+      if (!read_options_.ignore_range_deletions) {
+        SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
+        TEST_SYNC_POINT_CALLBACK("ArenaWrappedDBIter::Refresh:SV", nullptr);
+        auto t = sv->mem->NewRangeTombstoneIterator(
+            read_options_, latest_seq, false /* immutable_memtable */);
+        if (!t || t->empty()) {
+          // If memtable_range_tombstone_iter_ points to a non-empty tombstone
+          // iterator, then it means sv->mem is not the memtable that
+          // memtable_range_tombstone_iter_ points to, so SV must have changed
+          // after the sv_number_ != cur_sv_number check above. We will fall
+          // back to re-init the InternalIterator, and the tombstone iterator
+          // will be freed during db_iter destruction there.
+          if (memtable_range_tombstone_iter_) {
+            assert(!*memtable_range_tombstone_iter_ ||
+                   sv_number_ != cfd_->GetSuperVersionNumber());
+          }
+          delete t;
+        } else {  // current mutable memtable has range tombstones
+          if (!memtable_range_tombstone_iter_) {
+            delete t;
+            db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
+            // The memtable under DBIter did not have range tombstone before
+            // refresh.
+            reinit_internal_iter();
+            break;
+          } else {
+            delete *memtable_range_tombstone_iter_;
+            *memtable_range_tombstone_iter_ = new TruncatedRangeDelIterator(
+                std::unique_ptr<FragmentedRangeTombstoneIterator>(t),
+                &cfd_->internal_comparator(), nullptr, nullptr);
+          }
+        }
+        db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
+      }
+      // Refresh latest sequence number
+      db_iter_->set_sequence(latest_seq);
+      db_iter_->set_valid(false);
+      // Check again if the latest super version number is changed
+      uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
+      if (latest_sv_number != cur_sv_number) {
+        // If the super version number is changed after refreshing,
+        // fallback to Re-Init the InternalIterator
+        cur_sv_number = latest_sv_number;
+        continue;
+      }
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, const Version* version,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+    uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
+    ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
+  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
+  iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence,
+             max_sequential_skip_in_iterations, version_number, read_callback,
+             db_impl, cfd, expose_blob_index, allow_refresh);
+  if (db_impl != nullptr && cfd != nullptr && allow_refresh) {
+    iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index);
+  }
+
+  return iter;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.h b/src/rocksdb/db/arena_wrapped_db_iter.h
new file mode 100644
index 000000000..f15be306d
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.h
@@ -0,0 +1,127 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class Version;
+
+// A wrapper iterator which wraps DB Iterator and the arena, with which the DB
+// iterator is supposed to be allocated. This class is used as an entry point of
+// a iterator hierarchy whose memory can be allocated inline. In that way,
+// accessing the iterator tree can be more cache friendly. It is also faster
+// to allocate.
+// When using the class's Iterator interface, the behavior is exactly
+// the same as the inner DBIter.
+class ArenaWrappedDBIter : public Iterator {
+ public:
+  ~ArenaWrappedDBIter() override {
+    if (db_iter_ != nullptr) {
+      db_iter_->~DBIter();
+    } else {
+      assert(false);
+    }
+  }
+
+  // Get the arena to be used to allocate memory for DBIter to be wrapped,
+  // as well as child iterators in it.
+  virtual Arena* GetArena() { return &arena_; }
+
+  const ReadOptions& GetReadOptions() { return read_options_; }
+
+  // Set the internal iterator wrapped inside the DB Iterator. Usually it is
+  // a merging iterator.
+  virtual void SetIterUnderDBIter(InternalIterator* iter) {
+    db_iter_->SetIter(iter);
+  }
+
+  void SetMemtableRangetombstoneIter(TruncatedRangeDelIterator** iter) {
+    memtable_range_tombstone_iter_ = iter;
+  }
+
+  bool Valid() const override { return db_iter_->Valid(); }
+  void SeekToFirst() override { db_iter_->SeekToFirst(); }
+  void SeekToLast() override { db_iter_->SeekToLast(); }
+  // 'target' does not contain timestamp, even if user timestamp feature is
+  // enabled.
+  void Seek(const Slice& target) override { db_iter_->Seek(target); }
+  void SeekForPrev(const Slice& target) override {
+    db_iter_->SeekForPrev(target);
+  }
+  void Next() override { db_iter_->Next(); }
+  void Prev() override { db_iter_->Prev(); }
+  Slice key() const override { return db_iter_->key(); }
+  Slice value() const override { return db_iter_->value(); }
+  const WideColumns& columns() const override { return db_iter_->columns(); }
+  Status status() const override { return db_iter_->status(); }
+  Slice timestamp() const override { return db_iter_->timestamp(); }
+  bool IsBlob() const { return db_iter_->IsBlob(); }
+
+  Status GetProperty(std::string prop_name, std::string* prop) override;
+
+  Status Refresh() override;
+
+  void Init(Env* env, const ReadOptions& read_options,
+            const ImmutableOptions& ioptions,
+            const MutableCFOptions& mutable_cf_options, const Version* version,
+            const SequenceNumber& sequence,
+            uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+            ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+            bool expose_blob_index, bool allow_refresh);
+
+  // Store some parameters so we can refresh the iterator at a later point
+  // with these same params
+  void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd,
+                        ReadCallback* read_callback, bool expose_blob_index) {
+    db_impl_ = db_impl;
+    cfd_ = cfd;
+    read_callback_ = read_callback;
+    expose_blob_index_ = expose_blob_index;
+  }
+
+ private:
+  DBIter* db_iter_ = nullptr;
+  Arena arena_;
+  uint64_t sv_number_;
+  ColumnFamilyData* cfd_ = nullptr;
+  DBImpl* db_impl_ = nullptr;
+  ReadOptions read_options_;
+  ReadCallback* read_callback_;
+  bool expose_blob_index_ = false;
+  bool allow_refresh_ = true;
+  // If this is nullptr, it means the mutable memtable does not contain range
+  // tombstone when added under this DBIter.
+  TruncatedRangeDelIterator** memtable_range_tombstone_iter_ = nullptr;
+};
+
+// Generate the arena wrapped iterator class.
+// `db_impl` and `cfd` are used for reneweal. If left null, renewal will not
+// be supported.
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, const Version* version,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+    uint64_t version_number, ReadCallback* read_callback,
+    DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+    bool expose_blob_index = false, bool allow_refresh = true);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_constants.h b/src/rocksdb/db/blob/blob_constants.h
new file mode 100644
index 000000000..a5d09ac76
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_constants.h
@@ -0,0 +1,16 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint64_t kInvalidBlobFileNumber = 0;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_contents.cc b/src/rocksdb/db/blob/blob_contents.cc
new file mode 100644
index 000000000..9015609e7
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_contents.cc
@@ -0,0 +1,90 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_contents.h"
+
+#include <cassert>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_helpers.h"
+#include "port/malloc.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<BlobContents> BlobContents::Create(
+    CacheAllocationPtr&& allocation, size_t size) {
+  return std::unique_ptr<BlobContents>(
+      new BlobContents(std::move(allocation), size));
+}
+
+size_t BlobContents::ApproximateMemoryUsage() const {
+  size_t usage = 0;
+
+  if (allocation_) {
+    MemoryAllocator* const allocator = allocation_.get_deleter().allocator;
+
+    if (allocator) {
+      usage += allocator->UsableSize(allocation_.get(), data_.size());
+    } else {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      usage += malloc_usable_size(allocation_.get());
+#else
+      usage += data_.size();
+#endif
+    }
+  }
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size(const_cast<BlobContents*>(this));
+#else
+  usage += sizeof(*this);
+#endif
+
+  return usage;
+}
+
+size_t BlobContents::SizeCallback(void* obj) {
+  assert(obj);
+
+  return static_cast<const BlobContents*>(obj)->size();
+}
+
+Status BlobContents::SaveToCallback(void* from_obj, size_t from_offset,
+                                    size_t length, void* out) {
+  assert(from_obj);
+
+  const BlobContents* buf = static_cast<const BlobContents*>(from_obj);
+  assert(buf->size() >= from_offset + length);
+
+  memcpy(out, buf->data().data() + from_offset, length);
+
+  return Status::OK();
+}
+
+Cache::CacheItemHelper* BlobContents::GetCacheItemHelper() {
+  static Cache::CacheItemHelper cache_helper(
+      &SizeCallback, &SaveToCallback,
+      GetCacheEntryDeleterForRole<BlobContents, CacheEntryRole::kBlobValue>());
+
+  return &cache_helper;
+}
+
+Status BlobContents::CreateCallback(CacheAllocationPtr&& allocation,
+                                    const void* buf, size_t size,
+                                    void** out_obj, size_t* charge) {
+  assert(allocation);
+
+  memcpy(allocation.get(), buf, size);
+
+  std::unique_ptr<BlobContents> obj = Create(std::move(allocation), size);
+  BlobContents* const contents = obj.release();
+
+  *out_obj = contents;
+  *charge = contents->ApproximateMemoryUsage();
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_contents.h b/src/rocksdb/db/blob/blob_contents.h
new file mode 100644
index 000000000..9b7c5b969
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_contents.h
@@ -0,0 +1,56 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "memory/memory_allocator.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class representing a single uncompressed value read from a blob file.
+class BlobContents {
+ public:
+  static std::unique_ptr<BlobContents> Create(CacheAllocationPtr&& allocation,
+                                              size_t size);
+
+  BlobContents(const BlobContents&) = delete;
+  BlobContents& operator=(const BlobContents&) = delete;
+
+  BlobContents(BlobContents&&) = default;
+  BlobContents& operator=(BlobContents&&) = default;
+
+  ~BlobContents() = default;
+
+  const Slice& data() const { return data_; }
+  size_t size() const { return data_.size(); }
+
+  size_t ApproximateMemoryUsage() const;
+
+  // Callbacks for secondary cache
+  static size_t SizeCallback(void* obj);
+
+  static Status SaveToCallback(void* from_obj, size_t from_offset,
+                               size_t length, void* out);
+
+  static Cache::CacheItemHelper* GetCacheItemHelper();
+
+  static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf,
+                               size_t size, void** out_obj, size_t* charge);
+
+ private:
+  BlobContents(CacheAllocationPtr&& allocation, size_t size)
+      : allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
+
+  CacheAllocationPtr allocation_;
+  Slice data_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_counting_iterator.h b/src/rocksdb/db/blob/blob_counting_iterator.h
new file mode 100644
index 000000000..de549afa2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator.h
@@ -0,0 +1,146 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that passes each key-value encountered to
+// BlobGarbageMeter as inflow in order to measure the total number and size of
+// blobs in the compaction input on a per-blob file basis.
+class BlobCountingIterator : public InternalIterator {
+ public:
+  BlobCountingIterator(InternalIterator* iter,
+                       BlobGarbageMeter* blob_garbage_meter)
+      : iter_(iter), blob_garbage_meter_(blob_garbage_meter) {
+    assert(iter_);
+    assert(blob_garbage_meter_);
+
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  bool Valid() const override { return iter_->Valid() && status_.ok(); }
+
+  void SeekToFirst() override {
+    iter_->SeekToFirst();
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  void SeekToLast() override {
+    iter_->SeekToLast();
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  void Seek(const Slice& target) override {
+    iter_->Seek(target);
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    iter_->SeekForPrev(target);
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  void Next() override {
+    assert(Valid());
+
+    iter_->Next();
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    assert(Valid());
+
+    const bool res = iter_->NextAndGetResult(result);
+    UpdateAndCountBlobIfNeeded();
+    return res;
+  }
+
+  void Prev() override {
+    assert(Valid());
+
+    iter_->Prev();
+    UpdateAndCountBlobIfNeeded();
+  }
+
+  Slice key() const override {
+    assert(Valid());
+    return iter_->key();
+  }
+
+  Slice user_key() const override {
+    assert(Valid());
+    return iter_->user_key();
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    return iter_->value();
+  }
+
+  Status status() const override { return status_; }
+
+  bool PrepareValue() override {
+    assert(Valid());
+    return iter_->PrepareValue();
+  }
+
+  bool MayBeOutOfLowerBound() override {
+    assert(Valid());
+    return iter_->MayBeOutOfLowerBound();
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    assert(Valid());
+    return iter_->UpperBoundCheckResult();
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    iter_->SetPinnedItersMgr(pinned_iters_mgr);
+  }
+
+  bool IsKeyPinned() const override {
+    assert(Valid());
+    return iter_->IsKeyPinned();
+  }
+
+  bool IsValuePinned() const override {
+    assert(Valid());
+    return iter_->IsValuePinned();
+  }
+
+  Status GetProperty(std::string prop_name, std::string* prop) override {
+    return iter_->GetProperty(prop_name, prop);
+  }
+
+ private:
+  void UpdateAndCountBlobIfNeeded() {
+    assert(!iter_->Valid() || iter_->status().ok());
+
+    if (!iter_->Valid()) {
+      status_ = iter_->status();
+      return;
+    }
+
+    TEST_SYNC_POINT(
+        "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow");
+
+    status_ = blob_garbage_meter_->ProcessInFlow(key(), value());
+  }
+
+  InternalIterator* iter_;
+  BlobGarbageMeter* blob_garbage_meter_;
+  Status status_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_counting_iterator_test.cc b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
new file mode 100644
index 000000000..c7bbc8f58
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
@@ -0,0 +1,327 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_counting_iterator.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CheckInFlow(const BlobGarbageMeter& blob_garbage_meter,
+                 uint64_t blob_file_number, uint64_t count, uint64_t bytes) {
+  const auto& flows = blob_garbage_meter.flows();
+
+  const auto it = flows.find(blob_file_number);
+  if (it == flows.end()) {
+    ASSERT_EQ(count, 0);
+    ASSERT_EQ(bytes, 0);
+    return;
+  }
+
+  const auto& in = it->second.GetInFlow();
+
+  ASSERT_EQ(in.GetCount(), count);
+  ASSERT_EQ(in.GetBytes(), bytes);
+}
+
+TEST(BlobCountingIteratorTest, CountBlobs) {
+  // Note: the input consists of three key-values: two are blob references to
+  // different blob files, while the third one is a plain value.
+  constexpr char user_key0[] = "key0";
+  constexpr char user_key1[] = "key1";
+  constexpr char user_key2[] = "key2";
+
+  const std::vector<std::string> keys{
+      test::KeyStr(user_key0, 1, kTypeBlobIndex),
+      test::KeyStr(user_key1, 2, kTypeBlobIndex),
+      test::KeyStr(user_key2, 3, kTypeValue)};
+
+  constexpr uint64_t first_blob_file_number = 4;
+  constexpr uint64_t first_offset = 1000;
+  constexpr uint64_t first_size = 2000;
+
+  std::string first_blob_index;
+  BlobIndex::EncodeBlob(&first_blob_index, first_blob_file_number, first_offset,
+                        first_size, kNoCompression);
+
+  constexpr uint64_t second_blob_file_number = 6;
+  constexpr uint64_t second_offset = 2000;
+  constexpr uint64_t second_size = 4000;
+
+  std::string second_blob_index;
+  BlobIndex::EncodeBlob(&second_blob_index, second_blob_file_number,
+                        second_offset, second_size, kNoCompression);
+
+  const std::vector<std::string> values{first_blob_index, second_blob_index,
+                                        "raw_value"};
+
+  assert(keys.size() == values.size());
+
+  VectorIterator input(keys, values);
+  BlobGarbageMeter blob_garbage_meter;
+
+  BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+  constexpr uint64_t first_expected_bytes =
+      first_size +
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key0) - 1);
+  constexpr uint64_t second_expected_bytes =
+      second_size +
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key1) - 1);
+
+  // Call SeekToFirst and iterate forward
+  blob_counter.SeekToFirst();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+              first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 0, 0);
+
+  blob_counter.Next();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[1]);
+  ASSERT_EQ(blob_counter.user_key(), user_key1);
+  ASSERT_EQ(blob_counter.value(), values[1]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+              first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+              second_expected_bytes);
+
+  blob_counter.Next();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[2]);
+  ASSERT_EQ(blob_counter.user_key(), user_key2);
+  ASSERT_EQ(blob_counter.value(), values[2]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+              first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+              second_expected_bytes);
+
+  blob_counter.Next();
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+              first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+              second_expected_bytes);
+
+  // Do it again using NextAndGetResult
+  blob_counter.SeekToFirst();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+              2 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+              second_expected_bytes);
+
+  {
+    IterateResult result;
+    ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+    ASSERT_EQ(result.key, keys[1]);
+    ASSERT_EQ(blob_counter.user_key(), user_key1);
+    ASSERT_TRUE(blob_counter.Valid());
+    ASSERT_OK(blob_counter.status());
+    ASSERT_EQ(blob_counter.key(), keys[1]);
+    ASSERT_EQ(blob_counter.value(), values[1]);
+    CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+                2 * first_expected_bytes);
+    CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+                2 * second_expected_bytes);
+  }
+
+  {
+    IterateResult result;
+    ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+    ASSERT_EQ(result.key, keys[2]);
+    ASSERT_EQ(blob_counter.user_key(), user_key2);
+    ASSERT_TRUE(blob_counter.Valid());
+    ASSERT_OK(blob_counter.status());
+    ASSERT_EQ(blob_counter.key(), keys[2]);
+    ASSERT_EQ(blob_counter.value(), values[2]);
+    CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+                2 * first_expected_bytes);
+    CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+                2 * second_expected_bytes);
+  }
+
+  {
+    IterateResult result;
+    ASSERT_FALSE(blob_counter.NextAndGetResult(&result));
+    ASSERT_FALSE(blob_counter.Valid());
+    ASSERT_OK(blob_counter.status());
+    CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+                2 * first_expected_bytes);
+    CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+                2 * second_expected_bytes);
+  }
+
+  // Call SeekToLast and iterate backward
+  blob_counter.SeekToLast();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[2]);
+  ASSERT_EQ(blob_counter.user_key(), user_key2);
+  ASSERT_EQ(blob_counter.value(), values[2]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+              2 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+              2 * second_expected_bytes);
+
+  blob_counter.Prev();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[1]);
+  ASSERT_EQ(blob_counter.user_key(), user_key1);
+  ASSERT_EQ(blob_counter.value(), values[1]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+              2 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+              3 * second_expected_bytes);
+
+  blob_counter.Prev();
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+              3 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+              3 * second_expected_bytes);
+
+  blob_counter.Prev();
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+              3 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+              3 * second_expected_bytes);
+
+  // Call Seek for all keys (plus one that's greater than all of them)
+  blob_counter.Seek(keys[0]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+              3 * second_expected_bytes);
+
+  blob_counter.Seek(keys[1]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[1]);
+  ASSERT_EQ(blob_counter.user_key(), user_key1);
+  ASSERT_EQ(blob_counter.value(), values[1]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  blob_counter.Seek(keys[2]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[2]);
+  ASSERT_EQ(blob_counter.user_key(), user_key2);
+  ASSERT_EQ(blob_counter.value(), values[2]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  blob_counter.Seek("zzz");
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  // Call SeekForPrev for all keys (plus one that's less than all of them)
+  blob_counter.SeekForPrev("aaa");
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+              4 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  blob_counter.SeekForPrev(keys[0]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[0]);
+  ASSERT_EQ(blob_counter.user_key(), user_key0);
+  ASSERT_EQ(blob_counter.value(), values[0]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+              5 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+              4 * second_expected_bytes);
+
+  blob_counter.SeekForPrev(keys[1]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[1]);
+  ASSERT_EQ(blob_counter.user_key(), user_key1);
+  ASSERT_EQ(blob_counter.value(), values[1]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+              5 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+              5 * second_expected_bytes);
+
+  blob_counter.SeekForPrev(keys[2]);
+  ASSERT_TRUE(blob_counter.Valid());
+  ASSERT_OK(blob_counter.status());
+  ASSERT_EQ(blob_counter.key(), keys[2]);
+  ASSERT_EQ(blob_counter.user_key(), user_key2);
+  ASSERT_EQ(blob_counter.value(), values[2]);
+  CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+              5 * first_expected_bytes);
+  CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+              5 * second_expected_bytes);
+}
+
+TEST(BlobCountingIteratorTest, CorruptBlobIndex) {
+  const std::vector<std::string> keys{
+      test::KeyStr("user_key", 1, kTypeBlobIndex)};
+  const std::vector<std::string> values{"i_am_not_a_blob_index"};
+
+  assert(keys.size() == values.size());
+
+  VectorIterator input(keys, values);
+  BlobGarbageMeter blob_garbage_meter;
+
+  BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+  blob_counter.SeekToFirst();
+  ASSERT_FALSE(blob_counter.Valid());
+  ASSERT_NOK(blob_counter.status());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_fetcher.cc b/src/rocksdb/db/blob/blob_fetcher.cc
new file mode 100644
index 000000000..124429f93
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.cc
@@ -0,0 +1,34 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_fetcher.h"
+
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+                              const Slice& blob_index_slice,
+                              FilePrefetchBuffer* prefetch_buffer,
+                              PinnableSlice* blob_value,
+                              uint64_t* bytes_read) const {
+  assert(version_);
+
+  return version_->GetBlob(read_options_, user_key, blob_index_slice,
+                           prefetch_buffer, blob_value, bytes_read);
+}
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+                              const BlobIndex& blob_index,
+                              FilePrefetchBuffer* prefetch_buffer,
+                              PinnableSlice* blob_value,
+                              uint64_t* bytes_read) const {
+  assert(version_);
+
+  return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer,
+                           blob_value, bytes_read);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_fetcher.h b/src/rocksdb/db/blob/blob_fetcher.h
new file mode 100644
index 000000000..8aeaf965d
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.h
@@ -0,0 +1,37 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class Slice;
+class FilePrefetchBuffer;
+class PinnableSlice;
+class BlobIndex;
+
+// A thin wrapper around the blob retrieval functionality of Version.
+class BlobFetcher {
+ public:
+  BlobFetcher(const Version* version, const ReadOptions& read_options)
+      : version_(version), read_options_(read_options) {}
+
+  Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice,
+                   FilePrefetchBuffer* prefetch_buffer,
+                   PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+  Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index,
+                   FilePrefetchBuffer* prefetch_buffer,
+                   PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+ private:
+  const Version* version_;
+  ReadOptions read_options_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_addition.cc b/src/rocksdb/db/blob/blob_file_addition.cc
new file mode 100644
index 000000000..71b1bb7fc
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.cc
@@ -0,0 +1,156 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_addition.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tags for custom fields. Note that these get persisted in the manifest,
+// so existing tags should not be modified.
+enum BlobFileAddition::CustomFieldTags : uint32_t {
+  kEndMarker,
+
+  // Add forward compatible fields here
+
+  /////////////////////////////////////////////////////////////////////
+
+  kForwardIncompatibleMask = 1 << 6,
+
+  // Add forward incompatible fields here
+};
+
+void BlobFileAddition::EncodeTo(std::string* output) const {
+  PutVarint64(output, blob_file_number_);
+  PutVarint64(output, total_blob_count_);
+  PutVarint64(output, total_blob_bytes_);
+  PutLengthPrefixedSlice(output, checksum_method_);
+  PutLengthPrefixedSlice(output, checksum_value_);
+
+  // Encode any custom fields here. The format to use is a Varint32 tag (see
+  // CustomFieldTags above) followed by a length prefixed slice. Unknown custom
+  // fields will be ignored during decoding unless they're in the forward
+  // incompatible range.
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileAddition::EncodeTo::CustomFields", output);
+
+  PutVarint32(output, kEndMarker);
+}
+
+Status BlobFileAddition::DecodeFrom(Slice* input) {
+  constexpr char class_name[] = "BlobFileAddition";
+
+  if (!GetVarint64(input, &blob_file_number_)) {
+    return Status::Corruption(class_name, "Error decoding blob file number");
+  }
+
+  if (!GetVarint64(input, &total_blob_count_)) {
+    return Status::Corruption(class_name, "Error decoding total blob count");
+  }
+
+  if (!GetVarint64(input, &total_blob_bytes_)) {
+    return Status::Corruption(class_name, "Error decoding total blob bytes");
+  }
+
+  Slice checksum_method;
+  if (!GetLengthPrefixedSlice(input, &checksum_method)) {
+    return Status::Corruption(class_name, "Error decoding checksum method");
+  }
+  checksum_method_ = checksum_method.ToString();
+
+  Slice checksum_value;
+  if (!GetLengthPrefixedSlice(input, &checksum_value)) {
+    return Status::Corruption(class_name, "Error decoding checksum value");
+  }
+  checksum_value_ = checksum_value.ToString();
+
+  while (true) {
+    uint32_t custom_field_tag = 0;
+    if (!GetVarint32(input, &custom_field_tag)) {
+      return Status::Corruption(class_name, "Error decoding custom field tag");
+    }
+
+    if (custom_field_tag == kEndMarker) {
+      break;
+    }
+
+    if (custom_field_tag & kForwardIncompatibleMask) {
+      return Status::Corruption(
+          class_name, "Forward incompatible custom field encountered");
+    }
+
+    Slice custom_field_value;
+    if (!GetLengthPrefixedSlice(input, &custom_field_value)) {
+      return Status::Corruption(class_name,
+                                "Error decoding custom field value");
+    }
+  }
+
+  return Status::OK();
+}
+
+std::string BlobFileAddition::DebugString() const {
+  std::ostringstream oss;
+
+  oss << *this;
+
+  return oss.str();
+}
+
+std::string BlobFileAddition::DebugJSON() const {
+  JSONWriter jw;
+
+  jw << *this;
+
+  jw.EndObject();
+
+  return jw.Get();
+}
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+  return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() &&
+         lhs.GetTotalBlobCount() == rhs.GetTotalBlobCount() &&
+         lhs.GetTotalBlobBytes() == rhs.GetTotalBlobBytes() &&
+         lhs.GetChecksumMethod() == rhs.GetChecksumMethod() &&
+         lhs.GetChecksumValue() == rhs.GetChecksumValue();
+}
+
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+  return !(lhs == rhs);
+}
+
+std::ostream& operator<<(std::ostream& os,
+                         const BlobFileAddition& blob_file_addition) {
+  os << "blob_file_number: " << blob_file_addition.GetBlobFileNumber()
+     << " total_blob_count: " << blob_file_addition.GetTotalBlobCount()
+     << " total_blob_bytes: " << blob_file_addition.GetTotalBlobBytes()
+     << " checksum_method: " << blob_file_addition.GetChecksumMethod()
+     << " checksum_value: "
+     << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+  return os;
+}
+
+JSONWriter& operator<<(JSONWriter& jw,
+                       const BlobFileAddition& blob_file_addition) {
+  jw << "BlobFileNumber" << blob_file_addition.GetBlobFileNumber()
+     << "TotalBlobCount" << blob_file_addition.GetTotalBlobCount()
+     << "TotalBlobBytes" << blob_file_addition.GetTotalBlobBytes()
+     << "ChecksumMethod" << blob_file_addition.GetChecksumMethod()
+     << "ChecksumValue"
+     << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+  return jw;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_addition.h b/src/rocksdb/db/blob/blob_file_addition.h
new file mode 100644
index 000000000..43b1a0bcb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.h
@@ -0,0 +1,67 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+class BlobFileAddition {
+ public:
+  BlobFileAddition() = default;
+
+  BlobFileAddition(uint64_t blob_file_number, uint64_t total_blob_count,
+                   uint64_t total_blob_bytes, std::string checksum_method,
+                   std::string checksum_value)
+      : blob_file_number_(blob_file_number),
+        total_blob_count_(total_blob_count),
+        total_blob_bytes_(total_blob_bytes),
+        checksum_method_(std::move(checksum_method)),
+        checksum_value_(std::move(checksum_value)) {
+    assert(checksum_method_.empty() == checksum_value_.empty());
+  }
+
+  uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+  uint64_t GetTotalBlobCount() const { return total_blob_count_; }
+  uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; }
+  const std::string& GetChecksumMethod() const { return checksum_method_; }
+  const std::string& GetChecksumValue() const { return checksum_value_; }
+
+  void EncodeTo(std::string* output) const;
+  Status DecodeFrom(Slice* input);
+
+  std::string DebugString() const;
+  std::string DebugJSON() const;
+
+ private:
+  enum CustomFieldTags : uint32_t;
+
+  uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+  uint64_t total_blob_count_ = 0;
+  uint64_t total_blob_bytes_ = 0;
+  std::string checksum_method_;
+  std::string checksum_value_;
+};
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+                         const BlobFileAddition& blob_file_addition);
+JSONWriter& operator<<(JSONWriter& jw,
+                       const BlobFileAddition& blob_file_addition);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_addition_test.cc b/src/rocksdb/db/blob/blob_file_addition_test.cc
new file mode 100644
index 000000000..64cb0a9d6
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition_test.cc
@@ -0,0 +1,211 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_addition.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileAdditionTest : public testing::Test {
+ public:
+  static void TestEncodeDecode(const BlobFileAddition& blob_file_addition) {
+    std::string encoded;
+    blob_file_addition.EncodeTo(&encoded);
+
+    BlobFileAddition decoded;
+    Slice input(encoded);
+    ASSERT_OK(decoded.DecodeFrom(&input));
+
+    ASSERT_EQ(blob_file_addition, decoded);
+  }
+};
+
+TEST_F(BlobFileAdditionTest, Empty) {
+  BlobFileAddition blob_file_addition;
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), kInvalidBlobFileNumber);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 0);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), 0);
+  ASSERT_TRUE(blob_file_addition.GetChecksumMethod().empty());
+  ASSERT_TRUE(blob_file_addition.GetChecksumValue().empty());
+
+  TestEncodeDecode(blob_file_addition);
+}
+
+TEST_F(BlobFileAdditionTest, NonEmpty) {
+  constexpr uint64_t blob_file_number = 123;
+  constexpr uint64_t total_blob_count = 2;
+  constexpr uint64_t total_blob_bytes = 123456;
+  const std::string checksum_method("SHA1");
+  const std::string checksum_value(
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd");
+
+  BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+                                      total_blob_bytes, checksum_method,
+                                      checksum_value);
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), total_blob_count);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), total_blob_bytes);
+  ASSERT_EQ(blob_file_addition.GetChecksumMethod(), checksum_method);
+  ASSERT_EQ(blob_file_addition.GetChecksumValue(), checksum_value);
+
+  TestEncodeDecode(blob_file_addition);
+}
+
+TEST_F(BlobFileAdditionTest, DecodeErrors) {
+  std::string str;
+  Slice slice(str);
+
+  BlobFileAddition blob_file_addition;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "blob file number"));
+  }
+
+  constexpr uint64_t blob_file_number = 123;
+  PutVarint64(&str, blob_file_number);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "total blob count"));
+  }
+
+  constexpr uint64_t total_blob_count = 4567;
+  PutVarint64(&str, total_blob_count);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "total blob bytes"));
+  }
+
+  constexpr uint64_t total_blob_bytes = 12345678;
+  PutVarint64(&str, total_blob_bytes);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "checksum method"));
+  }
+
+  constexpr char checksum_method[] = "SHA1";
+  PutLengthPrefixedSlice(&str, checksum_method);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "checksum value"));
+  }
+
+  constexpr char checksum_value[] =
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd";
+  PutLengthPrefixedSlice(&str, checksum_value);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+  }
+
+  constexpr uint32_t custom_tag = 2;
+  PutVarint32(&str, custom_tag);
+  slice = str;
+
+  {
+    const Status s = blob_file_addition.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+  }
+}
+
+TEST_F(BlobFileAdditionTest, ForwardCompatibleCustomField) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+        std::string* output = static_cast<std::string*>(arg);
+
+        constexpr uint32_t forward_compatible_tag = 2;
+        PutVarint32(output, forward_compatible_tag);
+
+        PutLengthPrefixedSlice(output, "deadbeef");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t blob_file_number = 678;
+  constexpr uint64_t total_blob_count = 9999;
+  constexpr uint64_t total_blob_bytes = 100000000;
+  const std::string checksum_method("CRC32");
+  const std::string checksum_value("\x3d\x87\xff\x57");
+
+  BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+                                      total_blob_bytes, checksum_method,
+                                      checksum_value);
+
+  TestEncodeDecode(blob_file_addition);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileAdditionTest, ForwardIncompatibleCustomField) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+        std::string* output = static_cast<std::string*>(arg);
+
+        constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+        PutVarint32(output, forward_incompatible_tag);
+
+        PutLengthPrefixedSlice(output, "foobar");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t blob_file_number = 456;
+  constexpr uint64_t total_blob_count = 100;
+  constexpr uint64_t total_blob_bytes = 2000000;
+  const std::string checksum_method("CRC32B");
+  const std::string checksum_value("\x6d\xbd\xf2\x3a");
+
+  BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+                                      total_blob_bytes, checksum_method,
+                                      checksum_value);
+
+  std::string encoded;
+  blob_file_addition.EncodeTo(&encoded);
+
+  BlobFileAddition decoded_blob_file_addition;
+  Slice input(encoded);
+  const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_builder.cc b/src/rocksdb/db/blob/blob_file_builder.cc
new file mode 100644
index 000000000..5e0e7f6cb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.cc
@@ -0,0 +1,446 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/event_helpers.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/io_tracer.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobFileBuilder::BlobFileBuilder(
+    VersionSet* versions, FileSystem* fs,
+    const ImmutableOptions* immutable_options,
+    const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+    std::string db_id, std::string db_session_id, int job_id,
+    uint32_t column_family_id, const std::string& column_family_name,
+    Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+    const std::shared_ptr<IOTracer>& io_tracer,
+    BlobFileCompletionCallback* blob_callback,
+    BlobFileCreationReason creation_reason,
+    std::vector<std::string>* blob_file_paths,
+    std::vector<BlobFileAddition>* blob_file_additions)
+    : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs,
+                      immutable_options, mutable_cf_options, file_options,
+                      db_id, db_session_id, job_id, column_family_id,
+                      column_family_name, io_priority, write_hint, io_tracer,
+                      blob_callback, creation_reason, blob_file_paths,
+                      blob_file_additions) {}
+
+BlobFileBuilder::BlobFileBuilder(
+    std::function<uint64_t()> file_number_generator, FileSystem* fs,
+    const ImmutableOptions* immutable_options,
+    const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+    std::string db_id, std::string db_session_id, int job_id,
+    uint32_t column_family_id, const std::string& column_family_name,
+    Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+    const std::shared_ptr<IOTracer>& io_tracer,
+    BlobFileCompletionCallback* blob_callback,
+    BlobFileCreationReason creation_reason,
+    std::vector<std::string>* blob_file_paths,
+    std::vector<BlobFileAddition>* blob_file_additions)
+    : file_number_generator_(std::move(file_number_generator)),
+      fs_(fs),
+      immutable_options_(immutable_options),
+      min_blob_size_(mutable_cf_options->min_blob_size),
+      blob_file_size_(mutable_cf_options->blob_file_size),
+      blob_compression_type_(mutable_cf_options->blob_compression_type),
+      prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache),
+      file_options_(file_options),
+      db_id_(std::move(db_id)),
+      db_session_id_(std::move(db_session_id)),
+      job_id_(job_id),
+      column_family_id_(column_family_id),
+      column_family_name_(column_family_name),
+      io_priority_(io_priority),
+      write_hint_(write_hint),
+      io_tracer_(io_tracer),
+      blob_callback_(blob_callback),
+      creation_reason_(creation_reason),
+      blob_file_paths_(blob_file_paths),
+      blob_file_additions_(blob_file_additions),
+      blob_count_(0),
+      blob_bytes_(0) {
+  assert(file_number_generator_);
+  assert(fs_);
+  assert(immutable_options_);
+  assert(file_options_);
+  assert(blob_file_paths_);
+  assert(blob_file_paths_->empty());
+  assert(blob_file_additions_);
+  assert(blob_file_additions_->empty());
+}
+
+BlobFileBuilder::~BlobFileBuilder() = default;
+
+Status BlobFileBuilder::Add(const Slice& key, const Slice& value,
+                            std::string* blob_index) {
+  assert(blob_index);
+  assert(blob_index->empty());
+
+  if (value.size() < min_blob_size_) {
+    return Status::OK();
+  }
+
+  {
+    const Status s = OpenBlobFileIfNeeded();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  Slice blob = value;
+  std::string compressed_blob;
+
+  {
+    const Status s = CompressBlobIfNeeded(&blob, &compressed_blob);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t blob_file_number = 0;
+  uint64_t blob_offset = 0;
+
+  {
+    const Status s =
+        WriteBlobToFile(key, blob, &blob_file_number, &blob_offset);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  {
+    const Status s = CloseBlobFileIfNeeded();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  {
+    const Status s =
+        PutBlobIntoCacheIfNeeded(value, blob_file_number, blob_offset);
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(immutable_options_->info_log,
+                     "Failed to pre-populate the blob into blob cache: %s",
+                     s.ToString().c_str());
+    }
+  }
+
+  BlobIndex::EncodeBlob(blob_index, blob_file_number, blob_offset, blob.size(),
+                        blob_compression_type_);
+
+  return Status::OK();
+}
+
+Status BlobFileBuilder::Finish() {
+  if (!IsBlobFileOpen()) {
+    return Status::OK();
+  }
+
+  return CloseBlobFile();
+}
+
+bool BlobFileBuilder::IsBlobFileOpen() const { return !!writer_; }
+
+Status BlobFileBuilder::OpenBlobFileIfNeeded() {
+  if (IsBlobFileOpen()) {
+    return Status::OK();
+  }
+
+  assert(!blob_count_);
+  assert(!blob_bytes_);
+
+  assert(file_number_generator_);
+  const uint64_t blob_file_number = file_number_generator_();
+
+  assert(immutable_options_);
+  assert(!immutable_options_->cf_paths.empty());
+  std::string blob_file_path =
+      BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number);
+
+  if (blob_callback_) {
+    blob_callback_->OnBlobFileCreationStarted(
+        blob_file_path, column_family_name_, job_id_, creation_reason_);
+  }
+
+  std::unique_ptr<FSWritableFile> file;
+
+  {
+    assert(file_options_);
+    Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_);
+
+    TEST_SYNC_POINT_CALLBACK(
+        "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s);
+
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Note: files get added to blob_file_paths_ right after the open, so they
+  // can be cleaned up upon failure. Contrast this with blob_file_additions_,
+  // which only contains successfully written files.
+  assert(blob_file_paths_);
+  blob_file_paths_->emplace_back(std::move(blob_file_path));
+
+  assert(file);
+  file->SetIOPriority(io_priority_);
+  file->SetWriteLifeTimeHint(write_hint_);
+  FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types;
+  Statistics* const statistics = immutable_options_->stats;
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(file), blob_file_paths_->back(), *file_options_,
+      immutable_options_->clock, io_tracer_, statistics,
+      immutable_options_->listeners,
+      immutable_options_->file_checksum_gen_factory.get(),
+      tmp_set.Contains(FileType::kBlobFile), false));
+
+  constexpr bool do_flush = false;
+
+  std::unique_ptr<BlobLogWriter> blob_log_writer(new BlobLogWriter(
+      std::move(file_writer), immutable_options_->clock, statistics,
+      blob_file_number, immutable_options_->use_fsync, do_flush));
+
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+
+  BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl,
+                       expiration_range);
+
+  {
+    Status s = blob_log_writer->WriteHeader(header);
+
+    TEST_SYNC_POINT_CALLBACK(
+        "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s);
+
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  writer_ = std::move(blob_log_writer);
+
+  assert(IsBlobFileOpen());
+
+  return Status::OK();
+}
+
+Status BlobFileBuilder::CompressBlobIfNeeded(
+    Slice* blob, std::string* compressed_blob) const {
+  assert(blob);
+  assert(compressed_blob);
+  assert(compressed_blob->empty());
+  assert(immutable_options_);
+
+  if (blob_compression_type_ == kNoCompression) {
+    return Status::OK();
+  }
+
+  CompressionOptions opts;
+  CompressionContext context(blob_compression_type_);
+  constexpr uint64_t sample_for_compression = 0;
+
+  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                       blob_compression_type_, sample_for_compression);
+
+  constexpr uint32_t compression_format_version = 2;
+
+  bool success = false;
+
+  {
+    StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
+                         BLOB_DB_COMPRESSION_MICROS);
+    success =
+        CompressData(*blob, info, compression_format_version, compressed_blob);
+  }
+
+  if (!success) {
+    return Status::Corruption("Error compressing blob");
+  }
+
+  *blob = Slice(*compressed_blob);
+
+  return Status::OK();
+}
+
+Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob,
+                                        uint64_t* blob_file_number,
+                                        uint64_t* blob_offset) {
+  assert(IsBlobFileOpen());
+  assert(blob_file_number);
+  assert(blob_offset);
+
+  uint64_t key_offset = 0;
+
+  Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset);
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  *blob_file_number = writer_->get_log_number();
+
+  ++blob_count_;
+  blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size();
+
+  return Status::OK();
+}
+
+Status BlobFileBuilder::CloseBlobFile() {
+  assert(IsBlobFileOpen());
+
+  BlobLogFooter footer;
+  footer.blob_count = blob_count_;
+
+  std::string checksum_method;
+  std::string checksum_value;
+
+  Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value);
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  const uint64_t blob_file_number = writer_->get_log_number();
+
+  if (blob_callback_) {
+    s = blob_callback_->OnBlobFileCompleted(
+        blob_file_paths_->back(), column_family_name_, job_id_,
+        blob_file_number, creation_reason_, s, checksum_value, checksum_method,
+        blob_count_, blob_bytes_);
+  }
+
+  assert(blob_file_additions_);
+  blob_file_additions_->emplace_back(blob_file_number, blob_count_, blob_bytes_,
+                                     std::move(checksum_method),
+                                     std::move(checksum_value));
+
+  assert(immutable_options_);
+  ROCKS_LOG_INFO(immutable_options_->logger,
+                 "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64
+                 " total blobs, %" PRIu64 " total bytes",
+                 column_family_name_.c_str(), job_id_, blob_file_number,
+                 blob_count_, blob_bytes_);
+
+  writer_.reset();
+  blob_count_ = 0;
+  blob_bytes_ = 0;
+
+  return s;
+}
+
+Status BlobFileBuilder::CloseBlobFileIfNeeded() {
+  assert(IsBlobFileOpen());
+
+  const WritableFileWriter* const file_writer = writer_->file();
+  assert(file_writer);
+
+  if (file_writer->GetFileSize() < blob_file_size_) {
+    return Status::OK();
+  }
+
+  return CloseBlobFile();
+}
+
+void BlobFileBuilder::Abandon(const Status& s) {
+  if (!IsBlobFileOpen()) {
+    return;
+  }
+  if (blob_callback_) {
+    // BlobFileBuilder::Abandon() is called because of error while writing to
+    // Blob files. So we can ignore the below error.
+    blob_callback_
+        ->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_,
+                              job_id_, writer_->get_log_number(),
+                              creation_reason_, s, "", "", blob_count_,
+                              blob_bytes_)
+        .PermitUncheckedError();
+  }
+
+  writer_.reset();
+  blob_count_ = 0;
+  blob_bytes_ = 0;
+}
+
+Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
+                                                 uint64_t blob_file_number,
+                                                 uint64_t blob_offset) const {
+  Status s = Status::OK();
+
+  auto blob_cache = immutable_options_->blob_cache;
+  auto statistics = immutable_options_->statistics.get();
+  bool warm_cache =
+      prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
+      creation_reason_ == BlobFileCreationReason::kFlush;
+
+  if (blob_cache && warm_cache) {
+    const OffsetableCacheKey base_cache_key(db_id_, db_session_id_,
+                                            blob_file_number);
+    const CacheKey cache_key = base_cache_key.WithOffset(blob_offset);
+    const Slice key = cache_key.AsSlice();
+
+    const Cache::Priority priority = Cache::Priority::BOTTOM;
+
+    // Objects to be put into the cache have to be heap-allocated and
+    // self-contained, i.e. own their contents. The Cache has to be able to
+    // take unique ownership of them.
+    CacheAllocationPtr allocation =
+        AllocateBlock(blob.size(), blob_cache->memory_allocator());
+    memcpy(allocation.get(), blob.data(), blob.size());
+    std::unique_ptr<BlobContents> buf =
+        BlobContents::Create(std::move(allocation), blob.size());
+
+    Cache::CacheItemHelper* const cache_item_helper =
+        BlobContents::GetCacheItemHelper();
+    assert(cache_item_helper);
+
+    if (immutable_options_->lowest_used_cache_tier ==
+        CacheTier::kNonVolatileBlockTier) {
+      s = blob_cache->Insert(key, buf.get(), cache_item_helper,
+                             buf->ApproximateMemoryUsage(),
+                             nullptr /* cache_handle */, priority);
+    } else {
+      s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(),
+                             cache_item_helper->del_cb,
+                             nullptr /* cache_handle */, priority);
+    }
+
+    if (s.ok()) {
+      RecordTick(statistics, BLOB_DB_CACHE_ADD);
+      RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size());
+      buf.release();
+    } else {
+      RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
+    }
+  }
+
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_builder.h b/src/rocksdb/db/blob/blob_file_builder.h
new file mode 100644
index 000000000..8e7aab502
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.h
@@ -0,0 +1,112 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cinttypes>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionSet;
+class FileSystem;
+class SystemClock;
+struct ImmutableOptions;
+struct MutableCFOptions;
+struct FileOptions;
+class BlobFileAddition;
+class Status;
+class Slice;
+class BlobLogWriter;
+class IOTracer;
+class BlobFileCompletionCallback;
+
+class BlobFileBuilder {
+ public:
+  BlobFileBuilder(VersionSet* versions, FileSystem* fs,
+                  const ImmutableOptions* immutable_options,
+                  const MutableCFOptions* mutable_cf_options,
+                  const FileOptions* file_options, std::string db_id,
+                  std::string db_session_id, int job_id,
+                  uint32_t column_family_id,
+                  const std::string& column_family_name,
+                  Env::IOPriority io_priority,
+                  Env::WriteLifeTimeHint write_hint,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  BlobFileCompletionCallback* blob_callback,
+                  BlobFileCreationReason creation_reason,
+                  std::vector<std::string>* blob_file_paths,
+                  std::vector<BlobFileAddition>* blob_file_additions);
+
+  BlobFileBuilder(std::function<uint64_t()> file_number_generator,
+                  FileSystem* fs, const ImmutableOptions* immutable_options,
+                  const MutableCFOptions* mutable_cf_options,
+                  const FileOptions* file_options, std::string db_id,
+                  std::string db_session_id, int job_id,
+                  uint32_t column_family_id,
+                  const std::string& column_family_name,
+                  Env::IOPriority io_priority,
+                  Env::WriteLifeTimeHint write_hint,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  BlobFileCompletionCallback* blob_callback,
+                  BlobFileCreationReason creation_reason,
+                  std::vector<std::string>* blob_file_paths,
+                  std::vector<BlobFileAddition>* blob_file_additions);
+
+  BlobFileBuilder(const BlobFileBuilder&) = delete;
+  BlobFileBuilder& operator=(const BlobFileBuilder&) = delete;
+
+  ~BlobFileBuilder();
+
+  Status Add(const Slice& key, const Slice& value, std::string* blob_index);
+  Status Finish();
+  void Abandon(const Status& s);
+
+ private:
+  bool IsBlobFileOpen() const;
+  Status OpenBlobFileIfNeeded();
+  Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const;
+  Status WriteBlobToFile(const Slice& key, const Slice& blob,
+                         uint64_t* blob_file_number, uint64_t* blob_offset);
+  Status CloseBlobFile();
+  Status CloseBlobFileIfNeeded();
+
+  Status PutBlobIntoCacheIfNeeded(const Slice& blob, uint64_t blob_file_number,
+                                  uint64_t blob_offset) const;
+
+  std::function<uint64_t()> file_number_generator_;
+  FileSystem* fs_;
+  const ImmutableOptions* immutable_options_;
+  uint64_t min_blob_size_;
+  uint64_t blob_file_size_;
+  CompressionType blob_compression_type_;
+  PrepopulateBlobCache prepopulate_blob_cache_;
+  const FileOptions* file_options_;
+  const std::string db_id_;
+  const std::string db_session_id_;
+  int job_id_;
+  uint32_t column_family_id_;
+  std::string column_family_name_;
+  Env::IOPriority io_priority_;
+  Env::WriteLifeTimeHint write_hint_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  BlobFileCompletionCallback* blob_callback_;
+  BlobFileCreationReason creation_reason_;
+  std::vector<std::string>* blob_file_paths_;
+  std::vector<BlobFileAddition>* blob_file_additions_;
+  std::unique_ptr<BlobLogWriter> writer_;
+  uint64_t blob_count_;
+  uint64_t blob_bytes_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_builder_test.cc b/src/rocksdb/db/blob/blob_file_builder_test.cc
new file mode 100644
index 000000000..3a0feee45
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder_test.cc
@@ -0,0 +1,680 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_sequential_reader.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFileNumberGenerator {
+ public:
+  uint64_t operator()() { return ++next_file_number_; }
+
+ private:
+  uint64_t next_file_number_ = 1;
+};
+
+class BlobFileBuilderTest : public testing::Test {
+ protected:
+  BlobFileBuilderTest() {
+    mock_env_.reset(MockEnv::Create(Env::Default()));
+    fs_ = mock_env_->GetFileSystem().get();
+    clock_ = mock_env_->GetSystemClock().get();
+  }
+
+  void VerifyBlobFile(uint64_t blob_file_number,
+                      const std::string& blob_file_path,
+                      uint32_t column_family_id,
+                      CompressionType blob_compression_type,
+                      const std::vector<std::pair<std::string, std::string>>&
+                          expected_key_value_pairs,
+                      const std::vector<std::string>& blob_indexes) {
+    assert(expected_key_value_pairs.size() == blob_indexes.size());
+
+    std::unique_ptr<FSRandomAccessFile> file;
+    constexpr IODebugContext* dbg = nullptr;
+    ASSERT_OK(
+        fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg));
+
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(file), blob_file_path, clock_));
+
+    constexpr Statistics* statistics = nullptr;
+    BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_,
+                                            statistics);
+
+    BlobLogHeader header;
+    ASSERT_OK(blob_log_reader.ReadHeader(&header));
+    ASSERT_EQ(header.version, kVersion1);
+    ASSERT_EQ(header.column_family_id, column_family_id);
+    ASSERT_EQ(header.compression, blob_compression_type);
+    ASSERT_FALSE(header.has_ttl);
+    ASSERT_EQ(header.expiration_range, ExpirationRange());
+
+    for (size_t i = 0; i < expected_key_value_pairs.size(); ++i) {
+      BlobLogRecord record;
+      uint64_t blob_offset = 0;
+
+      ASSERT_OK(blob_log_reader.ReadRecord(
+          &record, BlobLogSequentialReader::kReadHeaderKeyBlob, &blob_offset));
+
+      // Check the contents of the blob file
+      const auto& expected_key_value = expected_key_value_pairs[i];
+      const auto& key = expected_key_value.first;
+      const auto& value = expected_key_value.second;
+
+      ASSERT_EQ(record.key_size, key.size());
+      ASSERT_EQ(record.value_size, value.size());
+      ASSERT_EQ(record.expiration, 0);
+      ASSERT_EQ(record.key, key);
+      ASSERT_EQ(record.value, value);
+
+      // Make sure the blob reference returned by the builder points to the
+      // right place
+      BlobIndex blob_index;
+      ASSERT_OK(blob_index.DecodeFrom(blob_indexes[i]));
+      ASSERT_FALSE(blob_index.IsInlined());
+      ASSERT_FALSE(blob_index.HasTTL());
+      ASSERT_EQ(blob_index.file_number(), blob_file_number);
+      ASSERT_EQ(blob_index.offset(), blob_offset);
+      ASSERT_EQ(blob_index.size(), value.size());
+    }
+
+    BlobLogFooter footer;
+    ASSERT_OK(blob_log_reader.ReadFooter(&footer));
+    ASSERT_EQ(footer.blob_count, expected_key_value_pairs.size());
+    ASSERT_EQ(footer.expiration_range, ExpirationRange());
+  }
+
+  std::unique_ptr<Env> mock_env_;
+  FileSystem* fs_;
+  SystemClock* clock_;
+  FileOptions file_options_;
+};
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) {
+  // Build a single blob file
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 4;
+  constexpr size_t value_offset = 1234;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_BuildAndCheckOneFile"),
+      0);
+  options.enable_blob_files = true;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+      number_of_blobs);
+  std::vector<std::string> blob_indexes(number_of_blobs);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    auto& expected_key_value = expected_key_value_pairs[i];
+
+    auto& key = expected_key_value.first;
+    key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    auto& value = expected_key_value.second;
+    value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    auto& blob_index = blob_indexes[i];
+
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_FALSE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), number_of_blobs);
+  ASSERT_EQ(
+      blob_file_addition.GetTotalBlobBytes(),
+      number_of_blobs * (BlobLogRecord::kHeaderSize + key_size + value_size));
+
+  // Verify the contents of the new blob file as well as the blob references
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) {
+  // Build multiple blob files: file size limit is set to the size of a single
+  // value, so each blob ends up in a file of its own
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 10;
+  constexpr size_t value_offset = 1234567890;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_BuildAndCheckMultipleFiles"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_file_size = value_size;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+      number_of_blobs);
+  std::vector<std::string> blob_indexes(number_of_blobs);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    auto& expected_key_value = expected_key_value_pairs[i];
+
+    auto& key = expected_key_value.first;
+    key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    auto& value = expected_key_value.second;
+    value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    auto& blob_index = blob_indexes[i];
+
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_FALSE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  ASSERT_EQ(blob_file_paths.size(), number_of_blobs);
+  ASSERT_EQ(blob_file_additions.size(), number_of_blobs);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    const uint64_t blob_file_number = i + 2;
+
+    ASSERT_EQ(blob_file_paths[i],
+              BlobFileName(immutable_options.cf_paths.front().path,
+                           blob_file_number));
+
+    const auto& blob_file_addition = blob_file_additions[i];
+
+    ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+    ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+    ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+              BlobLogRecord::kHeaderSize + key_size + value_size);
+  }
+
+  // Verify the contents of the new blob files as well as the blob references
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    std::vector<std::pair<std::string, std::string>> expected_key_value_pair{
+        expected_key_value_pairs[i]};
+    std::vector<std::string> blob_index{blob_indexes[i]};
+
+    VerifyBlobFile(i + 2, blob_file_paths[i], column_family_id, kNoCompression,
+                   expected_key_value_pair, blob_index);
+  }
+}
+
+TEST_F(BlobFileBuilderTest, InlinedValues) {
+  // All values are below the min_blob_size threshold; no blob files get written
+  constexpr size_t number_of_blobs = 10;
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 10;
+  constexpr size_t value_offset = 1234567890;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_InlinedValues"),
+      0);
+  options.enable_blob_files = true;
+  options.min_blob_size = 1024;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  for (size_t i = 0; i < number_of_blobs; ++i) {
+    const std::string key = std::to_string(i);
+    assert(key.size() == key_size);
+
+    const std::string value = std::to_string(i + value_offset);
+    assert(value.size() == value_size);
+
+    std::string blob_index;
+    ASSERT_OK(builder.Add(key, value, &blob_index));
+    ASSERT_TRUE(blob_index.empty());
+  }
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  ASSERT_TRUE(blob_file_paths.empty());
+  ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Compression) {
+  // Build a blob file with a compressed blob
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  constexpr size_t key_size = 1;
+  constexpr size_t value_size = 100;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Compression"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_compression_type = kSnappyCompression;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  const std::string key("1");
+  const std::string uncompressed_value(value_size, 'x');
+
+  std::string blob_index;
+
+  ASSERT_OK(builder.Add(key, uncompressed_value, &blob_index));
+  ASSERT_FALSE(blob_index.empty());
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+
+  CompressionOptions opts;
+  CompressionContext context(kSnappyCompression);
+  constexpr uint64_t sample_for_compression = 0;
+
+  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                       kSnappyCompression, sample_for_compression);
+
+  std::string compressed_value;
+  ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(),
+                              uncompressed_value.size(), &compressed_value));
+
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+            BlobLogRecord::kHeaderSize + key_size + compressed_value.size());
+
+  // Verify the contents of the new blob file as well as the blob reference
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+      {key, compressed_value}};
+  std::vector<std::string> blob_indexes{blob_index};
+
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kSnappyCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, CompressionError) {
+  // Simulate an error during compression
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderTest_CompressionError"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_compression_type = kSnappyCompression;
+  options.env = mock_env_.get();
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue",
+                                        [](void* arg) {
+                                          bool* ret = static_cast<bool*>(arg);
+                                          *ret = false;
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr char key[] = "1";
+  constexpr char value[] = "deadbeef";
+
+  std::string blob_index;
+
+  ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+  ASSERT_EQ(
+      blob_file_paths[0],
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Checksum) {
+  // Build a blob file with checksum
+
+  class DummyFileChecksumGenerator : public FileChecksumGenerator {
+   public:
+    void Update(const char* /* data */, size_t /* n */) override {}
+
+    void Finalize() override {}
+
+    std::string GetChecksum() const override { return std::string("dummy"); }
+
+    const char* Name() const override { return "DummyFileChecksum"; }
+  };
+
+  class DummyFileChecksumGenFactory : public FileChecksumGenFactory {
+   public:
+    std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+        const FileChecksumGenContext& /* context */) override {
+      return std::unique_ptr<FileChecksumGenerator>(
+          new DummyFileChecksumGenerator);
+    }
+
+    const char* Name() const override { return "DummyFileChecksumGenFactory"; }
+  };
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Checksum"),
+      0);
+  options.enable_blob_files = true;
+  options.file_checksum_gen_factory =
+      std::make_shared<DummyFileChecksumGenFactory>();
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  const std::string key("1");
+  const std::string value("deadbeef");
+
+  std::string blob_index;
+
+  ASSERT_OK(builder.Add(key, value, &blob_index));
+  ASSERT_FALSE(blob_index.empty());
+
+  ASSERT_OK(builder.Finish());
+
+  // Check the metadata generated
+  constexpr uint64_t blob_file_number = 2;
+
+  ASSERT_EQ(blob_file_paths.size(), 1);
+
+  const std::string& blob_file_path = blob_file_paths[0];
+
+  ASSERT_EQ(
+      blob_file_path,
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+  ASSERT_EQ(blob_file_additions.size(), 1);
+
+  const auto& blob_file_addition = blob_file_additions[0];
+
+  ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+  ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+            BlobLogRecord::kHeaderSize + key.size() + value.size());
+  ASSERT_EQ(blob_file_addition.GetChecksumMethod(), "DummyFileChecksum");
+  ASSERT_EQ(blob_file_addition.GetChecksumValue(), "dummy");
+
+  // Verify the contents of the new blob file as well as the blob reference
+  std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+      {key, value}};
+  std::vector<std::string> blob_indexes{blob_index};
+
+  VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+                 kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+class BlobFileBuilderIOErrorTest
+    : public testing::Test,
+      public testing::WithParamInterface<std::string> {
+ protected:
+  BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) {
+    mock_env_.reset(MockEnv::Create(Env::Default()));
+    fs_ = mock_env_->GetFileSystem().get();
+  }
+
+  std::unique_ptr<Env> mock_env_;
+  FileSystem* fs_;
+  FileOptions file_options_;
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    BlobFileBuilderTest, BlobFileBuilderIOErrorTest,
+    ::testing::ValuesIn(std::vector<std::string>{
+        "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile",
+        "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader",
+        "BlobFileBuilder::WriteBlobToFile:AddRecord",
+        "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(BlobFileBuilderIOErrorTest, IOError) {
+  // Simulate an I/O error during the specified step of Add()
+  // Note: blob_file_size will be set to value_size in order for the first blob
+  // to trigger close
+  constexpr size_t value_size = 8;
+
+  Options options;
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileBuilderIOErrorTest_IOError"),
+      0);
+  options.enable_blob_files = true;
+  options.blob_file_size = value_size;
+  options.env = mock_env_.get();
+
+  ImmutableOptions immutable_options(options);
+  MutableCFOptions mutable_cf_options(options);
+
+  constexpr int job_id = 1;
+  constexpr uint32_t column_family_id = 123;
+  constexpr char column_family_name[] = "foobar";
+  constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+  constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+  std::vector<std::string> blob_file_paths;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  BlobFileBuilder builder(
+      TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+      &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+      column_family_id, column_family_name, io_priority, write_hint,
+      nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+    Status* const s = static_cast<Status*>(arg);
+    assert(s);
+
+    (*s) = Status::IOError(sync_point_);
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr char key[] = "1";
+  constexpr char value[] = "deadbeef";
+
+  std::string blob_index;
+
+  ASSERT_TRUE(builder.Add(key, value, &blob_index).IsIOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  if (sync_point_ == "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile") {
+    ASSERT_TRUE(blob_file_paths.empty());
+  } else {
+    constexpr uint64_t blob_file_number = 2;
+
+    ASSERT_EQ(blob_file_paths.size(), 1);
+    ASSERT_EQ(blob_file_paths[0],
+              BlobFileName(immutable_options.cf_paths.front().path,
+                           blob_file_number));
+  }
+
+  ASSERT_TRUE(blob_file_additions.empty());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_cache.cc b/src/rocksdb/db/blob/blob_file_cache.cc
new file mode 100644
index 000000000..1a6cdf688
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache.cc
@@ -0,0 +1,102 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_cache.h"
+
+#include <cassert>
+#include <memory>
+
+#include "db/blob/blob_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/io_tracer.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobFileCache::BlobFileCache(Cache* cache,
+                             const ImmutableOptions* immutable_options,
+                             const FileOptions* file_options,
+                             uint32_t column_family_id,
+                             HistogramImpl* blob_file_read_hist,
+                             const std::shared_ptr<IOTracer>& io_tracer)
+    : cache_(cache),
+      mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr),
+      immutable_options_(immutable_options),
+      file_options_(file_options),
+      column_family_id_(column_family_id),
+      blob_file_read_hist_(blob_file_read_hist),
+      io_tracer_(io_tracer) {
+  assert(cache_);
+  assert(immutable_options_);
+  assert(file_options_);
+}
+
+Status BlobFileCache::GetBlobFileReader(
+    uint64_t blob_file_number,
+    CacheHandleGuard<BlobFileReader>* blob_file_reader) {
+  assert(blob_file_reader);
+  assert(blob_file_reader->IsEmpty());
+
+  const Slice key = GetSlice(&blob_file_number);
+
+  assert(cache_);
+
+  Cache::Handle* handle = cache_->Lookup(key);
+  if (handle) {
+    *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+    return Status::OK();
+  }
+
+  TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck");
+
+  // Check again while holding mutex
+  MutexLock lock(mutex_.get(key));
+
+  handle = cache_->Lookup(key);
+  if (handle) {
+    *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+    return Status::OK();
+  }
+
+  assert(immutable_options_);
+  Statistics* const statistics = immutable_options_->stats;
+
+  RecordTick(statistics, NO_FILE_OPENS);
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  {
+    assert(file_options_);
+    const Status s = BlobFileReader::Create(
+        *immutable_options_, *file_options_, column_family_id_,
+        blob_file_read_hist_, blob_file_number, io_tracer_, &reader);
+    if (!s.ok()) {
+      RecordTick(statistics, NO_FILE_ERRORS);
+      return s;
+    }
+  }
+
+  {
+    constexpr size_t charge = 1;
+
+    const Status s = cache_->Insert(key, reader.get(), charge,
+                                    &DeleteCacheEntry<BlobFileReader>, &handle);
+    if (!s.ok()) {
+      RecordTick(statistics, NO_FILE_ERRORS);
+      return s;
+    }
+  }
+
+  reader.release();
+
+  *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_cache.h b/src/rocksdb/db/blob/blob_file_cache.h
new file mode 100644
index 000000000..8eec05f18
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache.h
@@ -0,0 +1,52 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "cache/cache_helpers.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+class Status;
+class BlobFileReader;
+class Slice;
+class IOTracer;
+
+class BlobFileCache {
+ public:
+  BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options,
+                const FileOptions* file_options, uint32_t column_family_id,
+                HistogramImpl* blob_file_read_hist,
+                const std::shared_ptr<IOTracer>& io_tracer);
+
+  BlobFileCache(const BlobFileCache&) = delete;
+  BlobFileCache& operator=(const BlobFileCache&) = delete;
+
+  Status GetBlobFileReader(uint64_t blob_file_number,
+                           CacheHandleGuard<BlobFileReader>* blob_file_reader);
+
+ private:
+  Cache* cache_;
+  // Note: mutex_ below is used to guard against multiple threads racing to open
+  // the same file.
+  Striped<port::Mutex, Slice> mutex_;
+  const ImmutableOptions* immutable_options_;
+  const FileOptions* file_options_;
+  uint32_t column_family_id_;
+  HistogramImpl* blob_file_read_hist_;
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  static constexpr size_t kNumberOfMutexStripes = 1 << 7;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_cache_test.cc b/src/rocksdb/db/blob/blob_file_cache_test.cc
new file mode 100644
index 000000000..d3a61b3c5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache_test.cc
@@ -0,0 +1,269 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_cache.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with a single blob in it.
+void WriteBlobFile(uint32_t column_family_id,
+                   const ImmutableOptions& immutable_options,
+                   uint64_t blob_file_number) {
+  assert(!immutable_options.cf_paths.empty());
+
+  const std::string blob_file_path =
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+
+  std::unique_ptr<FSWritableFile> file;
+  ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+                            FileOptions()));
+
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+  constexpr Statistics* statistics = nullptr;
+  constexpr bool use_fsync = false;
+  constexpr bool do_flush = false;
+
+  BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+                                statistics, blob_file_number, use_fsync,
+                                do_flush);
+
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+
+  BlobLogHeader header(column_family_id, kNoCompression, has_ttl,
+                       expiration_range);
+
+  ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  std::string compressed_blob;
+
+  uint64_t key_offset = 0;
+  uint64_t blob_offset = 0;
+
+  ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset));
+
+  BlobLogFooter footer;
+  footer.blob_count = 1;
+  footer.expiration_range = expiration_range;
+
+  std::string checksum_method;
+  std::string checksum_value;
+
+  ASSERT_OK(
+      blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+}  // anonymous namespace
+
+class BlobFileCacheTest : public testing::Test {
+ protected:
+  BlobFileCacheTest() { mock_env_.reset(MockEnv::Create(Env::Default())); }
+
+  std::unique_ptr<Env> mock_env_;
+};
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader) {
+  Options options;
+  options.env = mock_env_.get();
+  options.statistics = CreateDBStatistics();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileCacheTest_GetBlobFileReader"),
+      0);
+  options.enable_blob_files = true;
+
+  constexpr uint32_t column_family_id = 1;
+  ImmutableOptions immutable_options(options);
+  constexpr uint64_t blob_file_number = 123;
+
+  WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+  constexpr size_t capacity = 10;
+  std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+                                &file_options, column_family_id,
+                                blob_file_read_hist, nullptr /*IOTracer*/);
+
+  // First try: reader should be opened and put in cache
+  CacheHandleGuard<BlobFileReader> first;
+
+  ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+  ASSERT_NE(first.GetValue(), nullptr);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+  // Second try: reader should be served from cache
+  CacheHandleGuard<BlobFileReader> second;
+
+  ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+  ASSERT_NE(second.GetValue(), nullptr);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+  ASSERT_EQ(first.GetValue(), second.GetValue());
+}
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) {
+  Options options;
+  options.env = mock_env_.get();
+  options.statistics = CreateDBStatistics();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileCacheTest_GetBlobFileReader_Race"),
+      0);
+  options.enable_blob_files = true;
+
+  constexpr uint32_t column_family_id = 1;
+  ImmutableOptions immutable_options(options);
+  constexpr uint64_t blob_file_number = 123;
+
+  WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+  constexpr size_t capacity = 10;
+  std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+                                &file_options, column_family_id,
+                                blob_file_read_hist, nullptr /*IOTracer*/);
+
+  CacheHandleGuard<BlobFileReader> first;
+  CacheHandleGuard<BlobFileReader> second;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) {
+        // Disabling sync points to prevent infinite recursion
+        SyncPoint::GetInstance()->DisableProcessing();
+
+        ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+        ASSERT_NE(second.GetValue(), nullptr);
+        ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+        ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+  ASSERT_NE(first.GetValue(), nullptr);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+  ASSERT_EQ(first.GetValue(), second.GetValue());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) {
+  Options options;
+  options.env = mock_env_.get();
+  options.statistics = CreateDBStatistics();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileCacheTest_GetBlobFileReader_IOError"),
+      0);
+  options.enable_blob_files = true;
+
+  constexpr size_t capacity = 10;
+  std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+  ImmutableOptions immutable_options(options);
+  FileOptions file_options;
+  constexpr uint32_t column_family_id = 1;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+                                &file_options, column_family_id,
+                                blob_file_read_hist, nullptr /*IOTracer*/);
+
+  // Note: there is no blob file with the below number
+  constexpr uint64_t blob_file_number = 123;
+
+  CacheHandleGuard<BlobFileReader> reader;
+
+  ASSERT_TRUE(
+      blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError());
+  ASSERT_EQ(reader.GetValue(), nullptr);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
+}
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) {
+  Options options;
+  options.env = mock_env_.get();
+  options.statistics = CreateDBStatistics();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileCacheTest_GetBlobFileReader_CacheFull"),
+      0);
+  options.enable_blob_files = true;
+
+  constexpr uint32_t column_family_id = 1;
+  ImmutableOptions immutable_options(options);
+  constexpr uint64_t blob_file_number = 123;
+
+  WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+  constexpr size_t capacity = 0;
+  constexpr int num_shard_bits = -1;  // determined automatically
+  constexpr bool strict_capacity_limit = true;
+  std::shared_ptr<Cache> backing_cache =
+      NewLRUCache(capacity, num_shard_bits, strict_capacity_limit);
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+                                &file_options, column_family_id,
+                                blob_file_read_hist, nullptr /*IOTracer*/);
+
+  // Insert into cache should fail since it has zero capacity and
+  // strict_capacity_limit is set
+  CacheHandleGuard<BlobFileReader> reader;
+
+  ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader)
+                  .IsMemoryLimit());
+  ASSERT_EQ(reader.GetValue(), nullptr);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_completion_callback.h b/src/rocksdb/db/blob/blob_file_completion_callback.h
new file mode 100644
index 000000000..ffe65a0ff
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_completion_callback.h
@@ -0,0 +1,101 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileCompletionCallback {
+ public:
+  BlobFileCompletionCallback(
+      SstFileManager* sst_file_manager, InstrumentedMutex* mutex,
+      ErrorHandler* error_handler, EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& dbname)
+      : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) {
+#ifndef ROCKSDB_LITE
+    sst_file_manager_ = sst_file_manager;
+    mutex_ = mutex;
+    error_handler_ = error_handler;
+#else
+    (void)sst_file_manager;
+    (void)mutex;
+    (void)error_handler;
+#endif  // ROCKSDB_LITE
+  }
+
+  void OnBlobFileCreationStarted(const std::string& file_name,
+                                 const std::string& column_family_name,
+                                 int job_id,
+                                 BlobFileCreationReason creation_reason) {
+#ifndef ROCKSDB_LITE
+    // Notify the listeners.
+    EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_,
+                                                column_family_name, file_name,
+                                                job_id, creation_reason);
+#else
+    (void)file_name;
+    (void)column_family_name;
+    (void)job_id;
+    (void)creation_reason;
+#endif
+  }
+
+  Status OnBlobFileCompleted(const std::string& file_name,
+                             const std::string& column_family_name, int job_id,
+                             uint64_t file_number,
+                             BlobFileCreationReason creation_reason,
+                             const Status& report_status,
+                             const std::string& checksum_value,
+                             const std::string& checksum_method,
+                             uint64_t blob_count, uint64_t blob_bytes) {
+    Status s;
+
+#ifndef ROCKSDB_LITE
+    auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager_);
+    if (sfm) {
+      // Report new blob files to SstFileManagerImpl
+      s = sfm->OnAddFile(file_name);
+      if (sfm->IsMaxAllowedSpaceReached()) {
+        s = Status::SpaceLimit("Max allowed space was reached");
+        TEST_SYNC_POINT(
+            "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached");
+        InstrumentedMutexLock l(mutex_);
+        error_handler_->SetBGError(s, BackgroundErrorReason::kFlush);
+      }
+    }
+#endif  // !ROCKSDB_LITE
+
+    // Notify the listeners.
+    EventHelpers::LogAndNotifyBlobFileCreationFinished(
+        event_logger_, listeners_, dbname_, column_family_name, file_name,
+        job_id, file_number, creation_reason,
+        (!report_status.ok() ? report_status : s),
+        (checksum_value.empty() ? kUnknownFileChecksum : checksum_value),
+        (checksum_method.empty() ? kUnknownFileChecksumFuncName
+                                 : checksum_method),
+        blob_count, blob_bytes);
+    return s;
+  }
+
+ private:
+#ifndef ROCKSDB_LITE
+  SstFileManager* sst_file_manager_;
+  InstrumentedMutex* mutex_;
+  ErrorHandler* error_handler_;
+#endif  // ROCKSDB_LITE
+  EventLogger* event_logger_;
+  std::vector<std::shared_ptr<EventListener>> listeners_;
+  std::string dbname_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage.cc b/src/rocksdb/db/blob/blob_file_garbage.cc
new file mode 100644
index 000000000..52c336f49
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage.cc
@@ -0,0 +1,134 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_garbage.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tags for custom fields. Note that these get persisted in the manifest,
+// so existing tags should not be modified.
+enum BlobFileGarbage::CustomFieldTags : uint32_t {
+  kEndMarker,
+
+  // Add forward compatible fields here
+
+  /////////////////////////////////////////////////////////////////////
+
+  kForwardIncompatibleMask = 1 << 6,
+
+  // Add forward incompatible fields here
+};
+
+void BlobFileGarbage::EncodeTo(std::string* output) const {
+  PutVarint64(output, blob_file_number_);
+  PutVarint64(output, garbage_blob_count_);
+  PutVarint64(output, garbage_blob_bytes_);
+
+  // Encode any custom fields here. The format to use is a Varint32 tag (see
+  // CustomFieldTags above) followed by a length prefixed slice. Unknown custom
+  // fields will be ignored during decoding unless they're in the forward
+  // incompatible range.
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileGarbage::EncodeTo::CustomFields", output);
+
+  PutVarint32(output, kEndMarker);
+}
+
+Status BlobFileGarbage::DecodeFrom(Slice* input) {
+  constexpr char class_name[] = "BlobFileGarbage";
+
+  if (!GetVarint64(input, &blob_file_number_)) {
+    return Status::Corruption(class_name, "Error decoding blob file number");
+  }
+
+  if (!GetVarint64(input, &garbage_blob_count_)) {
+    return Status::Corruption(class_name, "Error decoding garbage blob count");
+  }
+
+  if (!GetVarint64(input, &garbage_blob_bytes_)) {
+    return Status::Corruption(class_name, "Error decoding garbage blob bytes");
+  }
+
+  while (true) {
+    uint32_t custom_field_tag = 0;
+    if (!GetVarint32(input, &custom_field_tag)) {
+      return Status::Corruption(class_name, "Error decoding custom field tag");
+    }
+
+    if (custom_field_tag == kEndMarker) {
+      break;
+    }
+
+    if (custom_field_tag & kForwardIncompatibleMask) {
+      return Status::Corruption(
+          class_name, "Forward incompatible custom field encountered");
+    }
+
+    Slice custom_field_value;
+    if (!GetLengthPrefixedSlice(input, &custom_field_value)) {
+      return Status::Corruption(class_name,
+                                "Error decoding custom field value");
+    }
+  }
+
+  return Status::OK();
+}
+
+std::string BlobFileGarbage::DebugString() const {
+  std::ostringstream oss;
+
+  oss << *this;
+
+  return oss.str();
+}
+
+std::string BlobFileGarbage::DebugJSON() const {
+  JSONWriter jw;
+
+  jw << *this;
+
+  jw.EndObject();
+
+  return jw.Get();
+}
+
+bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) {
+  return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() &&
+         lhs.GetGarbageBlobCount() == rhs.GetGarbageBlobCount() &&
+         lhs.GetGarbageBlobBytes() == rhs.GetGarbageBlobBytes();
+}
+
+bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) {
+  return !(lhs == rhs);
+}
+
+std::ostream& operator<<(std::ostream& os,
+                         const BlobFileGarbage& blob_file_garbage) {
+  os << "blob_file_number: " << blob_file_garbage.GetBlobFileNumber()
+     << " garbage_blob_count: " << blob_file_garbage.GetGarbageBlobCount()
+     << " garbage_blob_bytes: " << blob_file_garbage.GetGarbageBlobBytes();
+
+  return os;
+}
+
+JSONWriter& operator<<(JSONWriter& jw,
+                       const BlobFileGarbage& blob_file_garbage) {
+  jw << "BlobFileNumber" << blob_file_garbage.GetBlobFileNumber()
+     << "GarbageBlobCount" << blob_file_garbage.GetGarbageBlobCount()
+     << "GarbageBlobBytes" << blob_file_garbage.GetGarbageBlobBytes();
+
+  return jw;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage.h b/src/rocksdb/db/blob/blob_file_garbage.h
new file mode 100644
index 000000000..6dc14ddca
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage.h
@@ -0,0 +1,57 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+class BlobFileGarbage {
+ public:
+  BlobFileGarbage() = default;
+
+  BlobFileGarbage(uint64_t blob_file_number, uint64_t garbage_blob_count,
+                  uint64_t garbage_blob_bytes)
+      : blob_file_number_(blob_file_number),
+        garbage_blob_count_(garbage_blob_count),
+        garbage_blob_bytes_(garbage_blob_bytes) {}
+
+  uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+  uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+  uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+  void EncodeTo(std::string* output) const;
+  Status DecodeFrom(Slice* input);
+
+  std::string DebugString() const;
+  std::string DebugJSON() const;
+
+ private:
+  enum CustomFieldTags : uint32_t;
+
+  uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+  uint64_t garbage_blob_count_ = 0;
+  uint64_t garbage_blob_bytes_ = 0;
+};
+
+bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+                         const BlobFileGarbage& blob_file_garbage);
+JSONWriter& operator<<(JSONWriter& jw,
+                       const BlobFileGarbage& blob_file_garbage);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage_test.cc b/src/rocksdb/db/blob/blob_file_garbage_test.cc
new file mode 100644
index 000000000..292a8b38a
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage_test.cc
@@ -0,0 +1,174 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_garbage.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileGarbageTest : public testing::Test {
+ public:
+  static void TestEncodeDecode(const BlobFileGarbage& blob_file_garbage) {
+    std::string encoded;
+    blob_file_garbage.EncodeTo(&encoded);
+
+    BlobFileGarbage decoded;
+    Slice input(encoded);
+    ASSERT_OK(decoded.DecodeFrom(&input));
+
+    ASSERT_EQ(blob_file_garbage, decoded);
+  }
+};
+
+TEST_F(BlobFileGarbageTest, Empty) {
+  BlobFileGarbage blob_file_garbage;
+
+  ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), kInvalidBlobFileNumber);
+  ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), 0);
+  ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), 0);
+
+  TestEncodeDecode(blob_file_garbage);
+}
+
+TEST_F(BlobFileGarbageTest, NonEmpty) {
+  constexpr uint64_t blob_file_number = 123;
+  constexpr uint64_t garbage_blob_count = 1;
+  constexpr uint64_t garbage_blob_bytes = 9876;
+
+  BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+                                    garbage_blob_bytes);
+
+  ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), garbage_blob_count);
+  ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), garbage_blob_bytes);
+
+  TestEncodeDecode(blob_file_garbage);
+}
+
+TEST_F(BlobFileGarbageTest, DecodeErrors) {
+  std::string str;
+  Slice slice(str);
+
+  BlobFileGarbage blob_file_garbage;
+
+  {
+    const Status s = blob_file_garbage.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "blob file number"));
+  }
+
+  constexpr uint64_t blob_file_number = 123;
+  PutVarint64(&str, blob_file_number);
+  slice = str;
+
+  {
+    const Status s = blob_file_garbage.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "garbage blob count"));
+  }
+
+  constexpr uint64_t garbage_blob_count = 4567;
+  PutVarint64(&str, garbage_blob_count);
+  slice = str;
+
+  {
+    const Status s = blob_file_garbage.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "garbage blob bytes"));
+  }
+
+  constexpr uint64_t garbage_blob_bytes = 12345678;
+  PutVarint64(&str, garbage_blob_bytes);
+  slice = str;
+
+  {
+    const Status s = blob_file_garbage.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+  }
+
+  constexpr uint32_t custom_tag = 2;
+  PutVarint32(&str, custom_tag);
+  slice = str;
+
+  {
+    const Status s = blob_file_garbage.DecodeFrom(&slice);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+  }
+}
+
+TEST_F(BlobFileGarbageTest, ForwardCompatibleCustomField) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) {
+        std::string* output = static_cast<std::string*>(arg);
+
+        constexpr uint32_t forward_compatible_tag = 2;
+        PutVarint32(output, forward_compatible_tag);
+
+        PutLengthPrefixedSlice(output, "deadbeef");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t blob_file_number = 678;
+  constexpr uint64_t garbage_blob_count = 9999;
+  constexpr uint64_t garbage_blob_bytes = 100000000;
+
+  BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+                                    garbage_blob_bytes);
+
+  TestEncodeDecode(blob_file_garbage);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileGarbageTest, ForwardIncompatibleCustomField) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) {
+        std::string* output = static_cast<std::string*>(arg);
+
+        constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+        PutVarint32(output, forward_incompatible_tag);
+
+        PutLengthPrefixedSlice(output, "foobar");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t blob_file_number = 456;
+  constexpr uint64_t garbage_blob_count = 100;
+  constexpr uint64_t garbage_blob_bytes = 2000000;
+
+  BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+                                    garbage_blob_bytes);
+
+  std::string encoded;
+  blob_file_garbage.EncodeTo(&encoded);
+
+  BlobFileGarbage decoded_blob_file_addition;
+  Slice input(encoded);
+  const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_meta.cc b/src/rocksdb/db/blob/blob_file_meta.cc
new file mode 100644
index 000000000..4913137e5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_meta.cc
@@ -0,0 +1,62 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_meta.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t SharedBlobFileMetaData::GetBlobFileSize() const {
+  return BlobLogHeader::kSize + total_blob_bytes_ + BlobLogFooter::kSize;
+}
+
+std::string SharedBlobFileMetaData::DebugString() const {
+  std::ostringstream oss;
+  oss << (*this);
+
+  return oss.str();
+}
+
+std::ostream& operator<<(std::ostream& os,
+                         const SharedBlobFileMetaData& shared_meta) {
+  os << "blob_file_number: " << shared_meta.GetBlobFileNumber()
+     << " total_blob_count: " << shared_meta.GetTotalBlobCount()
+     << " total_blob_bytes: " << shared_meta.GetTotalBlobBytes()
+     << " checksum_method: " << shared_meta.GetChecksumMethod()
+     << " checksum_value: "
+     << Slice(shared_meta.GetChecksumValue()).ToString(/* hex */ true);
+
+  return os;
+}
+
+std::string BlobFileMetaData::DebugString() const {
+  std::ostringstream oss;
+  oss << (*this);
+
+  return oss.str();
+}
+
+std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta) {
+  const auto& shared_meta = meta.GetSharedMeta();
+  assert(shared_meta);
+  os << (*shared_meta);
+
+  os << " linked_ssts: {";
+  for (uint64_t file_number : meta.GetLinkedSsts()) {
+    os << ' ' << file_number;
+  }
+  os << " }";
+
+  os << " garbage_blob_count: " << meta.GetGarbageBlobCount()
+     << " garbage_blob_bytes: " << meta.GetGarbageBlobBytes();
+
+  return os;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_meta.h b/src/rocksdb/db/blob/blob_file_meta.h
new file mode 100644
index 000000000..d7c8a1243
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_meta.h
@@ -0,0 +1,170 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <unordered_set>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SharedBlobFileMetaData represents the immutable part of blob files' metadata,
+// like the blob file number, total number and size of blobs, or checksum
+// method and value. There is supposed to be one object of this class per blob
+// file (shared across all versions that include the blob file in question);
+// hence, the type is neither copyable nor movable. A blob file can be marked
+// obsolete when the corresponding SharedBlobFileMetaData object is destroyed.
+
+class SharedBlobFileMetaData {
+ public:
+  static std::shared_ptr<SharedBlobFileMetaData> Create(
+      uint64_t blob_file_number, uint64_t total_blob_count,
+      uint64_t total_blob_bytes, std::string checksum_method,
+      std::string checksum_value) {
+    return std::shared_ptr<SharedBlobFileMetaData>(new SharedBlobFileMetaData(
+        blob_file_number, total_blob_count, total_blob_bytes,
+        std::move(checksum_method), std::move(checksum_value)));
+  }
+
+  template <typename Deleter>
+  static std::shared_ptr<SharedBlobFileMetaData> Create(
+      uint64_t blob_file_number, uint64_t total_blob_count,
+      uint64_t total_blob_bytes, std::string checksum_method,
+      std::string checksum_value, Deleter deleter) {
+    return std::shared_ptr<SharedBlobFileMetaData>(
+        new SharedBlobFileMetaData(blob_file_number, total_blob_count,
+                                   total_blob_bytes, std::move(checksum_method),
+                                   std::move(checksum_value)),
+        deleter);
+  }
+
+  SharedBlobFileMetaData(const SharedBlobFileMetaData&) = delete;
+  SharedBlobFileMetaData& operator=(const SharedBlobFileMetaData&) = delete;
+
+  SharedBlobFileMetaData(SharedBlobFileMetaData&&) = delete;
+  SharedBlobFileMetaData& operator=(SharedBlobFileMetaData&&) = delete;
+
+  uint64_t GetBlobFileSize() const;
+  uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+  uint64_t GetTotalBlobCount() const { return total_blob_count_; }
+  uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; }
+  const std::string& GetChecksumMethod() const { return checksum_method_; }
+  const std::string& GetChecksumValue() const { return checksum_value_; }
+
+  std::string DebugString() const;
+
+ private:
+  SharedBlobFileMetaData(uint64_t blob_file_number, uint64_t total_blob_count,
+                         uint64_t total_blob_bytes, std::string checksum_method,
+                         std::string checksum_value)
+      : blob_file_number_(blob_file_number),
+        total_blob_count_(total_blob_count),
+        total_blob_bytes_(total_blob_bytes),
+        checksum_method_(std::move(checksum_method)),
+        checksum_value_(std::move(checksum_value)) {
+    assert(checksum_method_.empty() == checksum_value_.empty());
+  }
+
+  uint64_t blob_file_number_;
+  uint64_t total_blob_count_;
+  uint64_t total_blob_bytes_;
+  std::string checksum_method_;
+  std::string checksum_value_;
+};
+
+std::ostream& operator<<(std::ostream& os,
+                         const SharedBlobFileMetaData& shared_meta);
+
+// BlobFileMetaData contains the part of the metadata for blob files that can
+// vary across versions, like the amount of garbage in the blob file. In
+// addition, BlobFileMetaData objects point to and share the ownership of the
+// SharedBlobFileMetaData object for the corresponding blob file. Similarly to
+// SharedBlobFileMetaData, BlobFileMetaData are not copyable or movable. They
+// are meant to be jointly owned by the versions in which the blob file has the
+// same (immutable *and* mutable) state.
+
+class BlobFileMetaData {
+ public:
+  using LinkedSsts = std::unordered_set<uint64_t>;
+
+  static std::shared_ptr<BlobFileMetaData> Create(
+      std::shared_ptr<SharedBlobFileMetaData> shared_meta,
+      LinkedSsts linked_ssts, uint64_t garbage_blob_count,
+      uint64_t garbage_blob_bytes) {
+    return std::shared_ptr<BlobFileMetaData>(
+        new BlobFileMetaData(std::move(shared_meta), std::move(linked_ssts),
+                             garbage_blob_count, garbage_blob_bytes));
+  }
+
+  BlobFileMetaData(const BlobFileMetaData&) = delete;
+  BlobFileMetaData& operator=(const BlobFileMetaData&) = delete;
+
+  BlobFileMetaData(BlobFileMetaData&&) = delete;
+  BlobFileMetaData& operator=(BlobFileMetaData&&) = delete;
+
+  const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const {
+    return shared_meta_;
+  }
+
+  uint64_t GetBlobFileSize() const {
+    assert(shared_meta_);
+    return shared_meta_->GetBlobFileSize();
+  }
+
+  uint64_t GetBlobFileNumber() const {
+    assert(shared_meta_);
+    return shared_meta_->GetBlobFileNumber();
+  }
+  uint64_t GetTotalBlobCount() const {
+    assert(shared_meta_);
+    return shared_meta_->GetTotalBlobCount();
+  }
+  uint64_t GetTotalBlobBytes() const {
+    assert(shared_meta_);
+    return shared_meta_->GetTotalBlobBytes();
+  }
+  const std::string& GetChecksumMethod() const {
+    assert(shared_meta_);
+    return shared_meta_->GetChecksumMethod();
+  }
+  const std::string& GetChecksumValue() const {
+    assert(shared_meta_);
+    return shared_meta_->GetChecksumValue();
+  }
+
+  const LinkedSsts& GetLinkedSsts() const { return linked_ssts_; }
+
+  uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+  uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+  std::string DebugString() const;
+
+ private:
+  BlobFileMetaData(std::shared_ptr<SharedBlobFileMetaData> shared_meta,
+                   LinkedSsts linked_ssts, uint64_t garbage_blob_count,
+                   uint64_t garbage_blob_bytes)
+      : shared_meta_(std::move(shared_meta)),
+        linked_ssts_(std::move(linked_ssts)),
+        garbage_blob_count_(garbage_blob_count),
+        garbage_blob_bytes_(garbage_blob_bytes) {
+    assert(shared_meta_);
+    assert(garbage_blob_count_ <= shared_meta_->GetTotalBlobCount());
+    assert(garbage_blob_bytes_ <= shared_meta_->GetTotalBlobBytes());
+  }
+
+  std::shared_ptr<SharedBlobFileMetaData> shared_meta_;
+  LinkedSsts linked_ssts_;
+  uint64_t garbage_blob_count_;
+  uint64_t garbage_blob_bytes_;
+};
+
+std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader.cc b/src/rocksdb/db/blob/blob_file_reader.cc
new file mode 100644
index 000000000..a4eabb605
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader.cc
@@ -0,0 +1,610 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_reader.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_log_format.h"
+#include "file/file_prefetch_buffer.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "table/multiget_context.h"
+#include "test_util/sync_point.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobFileReader::Create(
+    const ImmutableOptions& immutable_options, const FileOptions& file_options,
+    uint32_t column_family_id, HistogramImpl* blob_file_read_hist,
+    uint64_t blob_file_number, const std::shared_ptr<IOTracer>& io_tracer,
+    std::unique_ptr<BlobFileReader>* blob_file_reader) {
+  assert(blob_file_reader);
+  assert(!*blob_file_reader);
+
+  uint64_t file_size = 0;
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+
+  {
+    const Status s =
+        OpenFile(immutable_options, file_options, blob_file_read_hist,
+                 blob_file_number, io_tracer, &file_size, &file_reader);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  assert(file_reader);
+
+  Statistics* const statistics = immutable_options.stats;
+
+  CompressionType compression_type = kNoCompression;
+
+  {
+    const Status s = ReadHeader(file_reader.get(), column_family_id, statistics,
+                                &compression_type);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  {
+    const Status s = ReadFooter(file_reader.get(), file_size, statistics);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  blob_file_reader->reset(
+      new BlobFileReader(std::move(file_reader), file_size, compression_type,
+                         immutable_options.clock, statistics));
+
+  return Status::OK();
+}
+
+Status BlobFileReader::OpenFile(
+    const ImmutableOptions& immutable_options, const FileOptions& file_opts,
+    HistogramImpl* blob_file_read_hist, uint64_t blob_file_number,
+    const std::shared_ptr<IOTracer>& io_tracer, uint64_t* file_size,
+    std::unique_ptr<RandomAccessFileReader>* file_reader) {
+  assert(file_size);
+  assert(file_reader);
+
+  const auto& cf_paths = immutable_options.cf_paths;
+  assert(!cf_paths.empty());
+
+  const std::string blob_file_path =
+      BlobFileName(cf_paths.front().path, blob_file_number);
+
+  FileSystem* const fs = immutable_options.fs.get();
+  assert(fs);
+
+  constexpr IODebugContext* dbg = nullptr;
+
+  {
+    TEST_SYNC_POINT("BlobFileReader::OpenFile:GetFileSize");
+
+    const Status s =
+        fs->GetFileSize(blob_file_path, IOOptions(), file_size, dbg);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (*file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) {
+    return Status::Corruption("Malformed blob file");
+  }
+
+  std::unique_ptr<FSRandomAccessFile> file;
+
+  {
+    TEST_SYNC_POINT("BlobFileReader::OpenFile:NewRandomAccessFile");
+
+    const Status s =
+        fs->NewRandomAccessFile(blob_file_path, file_opts, &file, dbg);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  assert(file);
+
+  if (immutable_options.advise_random_on_open) {
+    file->Hint(FSRandomAccessFile::kRandom);
+  }
+
+  file_reader->reset(new RandomAccessFileReader(
+      std::move(file), blob_file_path, immutable_options.clock, io_tracer,
+      immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS,
+      blob_file_read_hist, immutable_options.rate_limiter.get(),
+      immutable_options.listeners));
+
+  return Status::OK();
+}
+
+Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
+                                  uint32_t column_family_id,
+                                  Statistics* statistics,
+                                  CompressionType* compression_type) {
+  assert(file_reader);
+  assert(compression_type);
+
+  Slice header_slice;
+  Buffer buf;
+  AlignedBuf aligned_buf;
+
+  {
+    TEST_SYNC_POINT("BlobFileReader::ReadHeader:ReadFromFile");
+
+    constexpr uint64_t read_offset = 0;
+    constexpr size_t read_size = BlobLogHeader::kSize;
+
+    // TODO: rate limit reading headers from blob files.
+    const Status s = ReadFromFile(file_reader, read_offset, read_size,
+                                  statistics, &header_slice, &buf, &aligned_buf,
+                                  Env::IO_TOTAL /* rate_limiter_priority */);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadHeader:TamperWithResult",
+                             &header_slice);
+  }
+
+  BlobLogHeader header;
+
+  {
+    const Status s = header.DecodeFrom(header_slice);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  constexpr ExpirationRange no_expiration_range;
+
+  if (header.has_ttl || header.expiration_range != no_expiration_range) {
+    return Status::Corruption("Unexpected TTL blob file");
+  }
+
+  if (header.column_family_id != column_family_id) {
+    return Status::Corruption("Column family ID mismatch");
+  }
+
+  *compression_type = header.compression;
+
+  return Status::OK();
+}
+
+Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
+                                  uint64_t file_size, Statistics* statistics) {
+  assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize);
+  assert(file_reader);
+
+  Slice footer_slice;
+  Buffer buf;
+  AlignedBuf aligned_buf;
+
+  {
+    TEST_SYNC_POINT("BlobFileReader::ReadFooter:ReadFromFile");
+
+    const uint64_t read_offset = file_size - BlobLogFooter::kSize;
+    constexpr size_t read_size = BlobLogFooter::kSize;
+
+    // TODO: rate limit reading footers from blob files.
+    const Status s = ReadFromFile(file_reader, read_offset, read_size,
+                                  statistics, &footer_slice, &buf, &aligned_buf,
+                                  Env::IO_TOTAL /* rate_limiter_priority */);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadFooter:TamperWithResult",
+                             &footer_slice);
+  }
+
+  BlobLogFooter footer;
+
+  {
+    const Status s = footer.DecodeFrom(footer_slice);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  constexpr ExpirationRange no_expiration_range;
+
+  if (footer.expiration_range != no_expiration_range) {
+    return Status::Corruption("Unexpected TTL blob file");
+  }
+
+  return Status::OK();
+}
+
+Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
+                                    uint64_t read_offset, size_t read_size,
+                                    Statistics* statistics, Slice* slice,
+                                    Buffer* buf, AlignedBuf* aligned_buf,
+                                    Env::IOPriority rate_limiter_priority) {
+  assert(slice);
+  assert(buf);
+  assert(aligned_buf);
+
+  assert(file_reader);
+
+  RecordTick(statistics, BLOB_DB_BLOB_FILE_BYTES_READ, read_size);
+
+  Status s;
+
+  if (file_reader->use_direct_io()) {
+    constexpr char* scratch = nullptr;
+
+    s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch,
+                          aligned_buf, rate_limiter_priority);
+  } else {
+    buf->reset(new char[read_size]);
+    constexpr AlignedBuf* aligned_scratch = nullptr;
+
+    s = file_reader->Read(IOOptions(), read_offset, read_size, slice,
+                          buf->get(), aligned_scratch, rate_limiter_priority);
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (slice->size() != read_size) {
+    return Status::Corruption("Failed to read data from blob file");
+  }
+
+  return Status::OK();
+}
+
+BlobFileReader::BlobFileReader(
+    std::unique_ptr<RandomAccessFileReader>&& file_reader, uint64_t file_size,
+    CompressionType compression_type, SystemClock* clock,
+    Statistics* statistics)
+    : file_reader_(std::move(file_reader)),
+      file_size_(file_size),
+      compression_type_(compression_type),
+      clock_(clock),
+      statistics_(statistics) {
+  assert(file_reader_);
+}
+
+BlobFileReader::~BlobFileReader() = default;
+
+Status BlobFileReader::GetBlob(
+    const ReadOptions& read_options, const Slice& user_key, uint64_t offset,
+    uint64_t value_size, CompressionType compression_type,
+    FilePrefetchBuffer* prefetch_buffer, MemoryAllocator* allocator,
+    std::unique_ptr<BlobContents>* result, uint64_t* bytes_read) const {
+  assert(result);
+
+  const uint64_t key_size = user_key.size();
+
+  if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) {
+    return Status::Corruption("Invalid blob offset");
+  }
+
+  if (compression_type != compression_type_) {
+    return Status::Corruption("Compression type mismatch when reading blob");
+  }
+
+  // Note: if verify_checksum is set, we read the entire blob record to be able
+  // to perform the verification; otherwise, we just read the blob itself. Since
+  // the offset in BlobIndex actually points to the blob value, we need to make
+  // an adjustment in the former case.
+  const uint64_t adjustment =
+      read_options.verify_checksums
+          ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
+          : 0;
+  assert(offset >= adjustment);
+
+  const uint64_t record_offset = offset - adjustment;
+  const uint64_t record_size = value_size + adjustment;
+
+  Slice record_slice;
+  Buffer buf;
+  AlignedBuf aligned_buf;
+
+  bool prefetched = false;
+
+  if (prefetch_buffer) {
+    Status s;
+    constexpr bool for_compaction = true;
+
+    prefetched = prefetch_buffer->TryReadFromCache(
+        IOOptions(), file_reader_.get(), record_offset,
+        static_cast<size_t>(record_size), &record_slice, &s,
+        read_options.rate_limiter_priority, for_compaction);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (!prefetched) {
+    TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile");
+    PERF_COUNTER_ADD(blob_read_count, 1);
+    PERF_COUNTER_ADD(blob_read_byte, record_size);
+    PERF_TIMER_GUARD(blob_read_time);
+    const Status s = ReadFromFile(file_reader_.get(), record_offset,
+                                  static_cast<size_t>(record_size), statistics_,
+                                  &record_slice, &buf, &aligned_buf,
+                                  read_options.rate_limiter_priority);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult",
+                           &record_slice);
+
+  if (read_options.verify_checksums) {
+    const Status s = VerifyBlob(record_slice, user_key, value_size);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  const Slice value_slice(record_slice.data() + adjustment, value_size);
+
+  {
+    const Status s = UncompressBlobIfNeeded(
+        value_slice, compression_type, allocator, clock_, statistics_, result);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (bytes_read) {
+    *bytes_read = record_size;
+  }
+
+  return Status::OK();
+}
+
+void BlobFileReader::MultiGetBlob(
+    const ReadOptions& read_options, MemoryAllocator* allocator,
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>&
+        blob_reqs,
+    uint64_t* bytes_read) const {
+  const size_t num_blobs = blob_reqs.size();
+  assert(num_blobs > 0);
+  assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
+
+#ifndef NDEBUG
+  for (size_t i = 0; i < num_blobs - 1; ++i) {
+    assert(blob_reqs[i].first->offset <= blob_reqs[i + 1].first->offset);
+  }
+#endif  // !NDEBUG
+
+  std::vector<FSReadRequest> read_reqs;
+  autovector<uint64_t> adjustments;
+  uint64_t total_len = 0;
+  read_reqs.reserve(num_blobs);
+  for (size_t i = 0; i < num_blobs; ++i) {
+    BlobReadRequest* const req = blob_reqs[i].first;
+    assert(req);
+    assert(req->user_key);
+    assert(req->status);
+
+    const size_t key_size = req->user_key->size();
+    const uint64_t offset = req->offset;
+    const uint64_t value_size = req->len;
+
+    if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) {
+      *req->status = Status::Corruption("Invalid blob offset");
+      continue;
+    }
+    if (req->compression != compression_type_) {
+      *req->status =
+          Status::Corruption("Compression type mismatch when reading a blob");
+      continue;
+    }
+
+    const uint64_t adjustment =
+        read_options.verify_checksums
+            ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
+            : 0;
+    assert(req->offset >= adjustment);
+    adjustments.push_back(adjustment);
+
+    FSReadRequest read_req = {};
+    read_req.offset = req->offset - adjustment;
+    read_req.len = req->len + adjustment;
+    read_reqs.emplace_back(read_req);
+    total_len += read_req.len;
+  }
+
+  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len);
+
+  Buffer buf;
+  AlignedBuf aligned_buf;
+
+  Status s;
+  bool direct_io = file_reader_->use_direct_io();
+  if (direct_io) {
+    for (size_t i = 0; i < read_reqs.size(); ++i) {
+      read_reqs[i].scratch = nullptr;
+    }
+  } else {
+    buf.reset(new char[total_len]);
+    std::ptrdiff_t pos = 0;
+    for (size_t i = 0; i < read_reqs.size(); ++i) {
+      read_reqs[i].scratch = buf.get() + pos;
+      pos += read_reqs[i].len;
+    }
+  }
+  TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile");
+  PERF_COUNTER_ADD(blob_read_count, num_blobs);
+  PERF_COUNTER_ADD(blob_read_byte, total_len);
+  s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(),
+                              direct_io ? &aligned_buf : nullptr,
+                              read_options.rate_limiter_priority);
+  if (!s.ok()) {
+    for (auto& req : read_reqs) {
+      req.status.PermitUncheckedError();
+    }
+    for (auto& blob_req : blob_reqs) {
+      BlobReadRequest* const req = blob_req.first;
+      assert(req);
+      assert(req->status);
+
+      if (!req->status->IsCorruption()) {
+        // Avoid overwriting corruption status.
+        *req->status = s;
+      }
+    }
+    return;
+  }
+
+  assert(s.ok());
+
+  uint64_t total_bytes = 0;
+  for (size_t i = 0, j = 0; i < num_blobs; ++i) {
+    BlobReadRequest* const req = blob_reqs[i].first;
+    assert(req);
+    assert(req->user_key);
+    assert(req->status);
+
+    if (!req->status->ok()) {
+      continue;
+    }
+
+    assert(j < read_reqs.size());
+    auto& read_req = read_reqs[j++];
+    const auto& record_slice = read_req.result;
+    if (read_req.status.ok() && record_slice.size() != read_req.len) {
+      read_req.status =
+          IOStatus::Corruption("Failed to read data from blob file");
+    }
+
+    *req->status = read_req.status;
+    if (!req->status->ok()) {
+      continue;
+    }
+
+    // Verify checksums if enabled
+    if (read_options.verify_checksums) {
+      *req->status = VerifyBlob(record_slice, *req->user_key, req->len);
+      if (!req->status->ok()) {
+        continue;
+      }
+    }
+
+    // Uncompress blob if needed
+    Slice value_slice(record_slice.data() + adjustments[i], req->len);
+    *req->status =
+        UncompressBlobIfNeeded(value_slice, compression_type_, allocator,
+                               clock_, statistics_, &blob_reqs[i].second);
+    if (req->status->ok()) {
+      total_bytes += record_slice.size();
+    }
+  }
+
+  if (bytes_read) {
+    *bytes_read = total_bytes;
+  }
+}
+
+Status BlobFileReader::VerifyBlob(const Slice& record_slice,
+                                  const Slice& user_key, uint64_t value_size) {
+  PERF_TIMER_GUARD(blob_checksum_time);
+
+  BlobLogRecord record;
+
+  const Slice header_slice(record_slice.data(), BlobLogRecord::kHeaderSize);
+
+  {
+    const Status s = record.DecodeHeaderFrom(header_slice);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (record.key_size != user_key.size()) {
+    return Status::Corruption("Key size mismatch when reading blob");
+  }
+
+  if (record.value_size != value_size) {
+    return Status::Corruption("Value size mismatch when reading blob");
+  }
+
+  record.key =
+      Slice(record_slice.data() + BlobLogRecord::kHeaderSize, record.key_size);
+  if (record.key != user_key) {
+    return Status::Corruption("Key mismatch when reading blob");
+  }
+
+  record.value = Slice(record.key.data() + record.key_size, value_size);
+
+  {
+    TEST_SYNC_POINT_CALLBACK("BlobFileReader::VerifyBlob:CheckBlobCRC",
+                             &record);
+
+    const Status s = record.CheckBlobCRC();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status BlobFileReader::UncompressBlobIfNeeded(
+    const Slice& value_slice, CompressionType compression_type,
+    MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics,
+    std::unique_ptr<BlobContents>* result) {
+  assert(result);
+
+  if (compression_type == kNoCompression) {
+    CacheAllocationPtr allocation =
+        AllocateBlock(value_slice.size(), allocator);
+    memcpy(allocation.get(), value_slice.data(), value_slice.size());
+
+    *result = BlobContents::Create(std::move(allocation), value_slice.size());
+
+    return Status::OK();
+  }
+
+  UncompressionContext context(compression_type);
+  UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
+                         compression_type);
+
+  size_t uncompressed_size = 0;
+  constexpr uint32_t compression_format_version = 2;
+
+  CacheAllocationPtr output;
+
+  {
+    PERF_TIMER_GUARD(blob_decompress_time);
+    StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS);
+    output = UncompressData(info, value_slice.data(), value_slice.size(),
+                            &uncompressed_size, compression_format_version,
+                            allocator);
+  }
+
+  TEST_SYNC_POINT_CALLBACK(
+      "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output);
+
+  if (!output) {
+    return Status::Corruption("Unable to uncompress blob");
+  }
+
+  *result = BlobContents::Create(std::move(output), uncompressed_size);
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader.h b/src/rocksdb/db/blob/blob_file_reader.h
new file mode 100644
index 000000000..75b756da1
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader.h
@@ -0,0 +1,108 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "db/blob/blob_read_request.h"
+#include "file/random_access_file_reader.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+struct ReadOptions;
+class Slice;
+class FilePrefetchBuffer;
+class BlobContents;
+class Statistics;
+
+class BlobFileReader {
+ public:
+  static Status Create(const ImmutableOptions& immutable_options,
+                       const FileOptions& file_options,
+                       uint32_t column_family_id,
+                       HistogramImpl* blob_file_read_hist,
+                       uint64_t blob_file_number,
+                       const std::shared_ptr<IOTracer>& io_tracer,
+                       std::unique_ptr<BlobFileReader>* reader);
+
+  BlobFileReader(const BlobFileReader&) = delete;
+  BlobFileReader& operator=(const BlobFileReader&) = delete;
+
+  ~BlobFileReader();
+
+  Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                 uint64_t offset, uint64_t value_size,
+                 CompressionType compression_type,
+                 FilePrefetchBuffer* prefetch_buffer,
+                 MemoryAllocator* allocator,
+                 std::unique_ptr<BlobContents>* result,
+                 uint64_t* bytes_read) const;
+
+  // offsets must be sorted in ascending order by caller.
+  void MultiGetBlob(
+      const ReadOptions& read_options, MemoryAllocator* allocator,
+      autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>&
+          blob_reqs,
+      uint64_t* bytes_read) const;
+
+  CompressionType GetCompressionType() const { return compression_type_; }
+
+  uint64_t GetFileSize() const { return file_size_; }
+
+ private:
+  BlobFileReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
+                 uint64_t file_size, CompressionType compression_type,
+                 SystemClock* clock, Statistics* statistics);
+
+  static Status OpenFile(const ImmutableOptions& immutable_options,
+                         const FileOptions& file_opts,
+                         HistogramImpl* blob_file_read_hist,
+                         uint64_t blob_file_number,
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         uint64_t* file_size,
+                         std::unique_ptr<RandomAccessFileReader>* file_reader);
+
+  static Status ReadHeader(const RandomAccessFileReader* file_reader,
+                           uint32_t column_family_id, Statistics* statistics,
+                           CompressionType* compression_type);
+
+  static Status ReadFooter(const RandomAccessFileReader* file_reader,
+                           uint64_t file_size, Statistics* statistics);
+
+  using Buffer = std::unique_ptr<char[]>;
+
+  static Status ReadFromFile(const RandomAccessFileReader* file_reader,
+                             uint64_t read_offset, size_t read_size,
+                             Statistics* statistics, Slice* slice, Buffer* buf,
+                             AlignedBuf* aligned_buf,
+                             Env::IOPriority rate_limiter_priority);
+
+  static Status VerifyBlob(const Slice& record_slice, const Slice& user_key,
+                           uint64_t value_size);
+
+  static Status UncompressBlobIfNeeded(const Slice& value_slice,
+                                       CompressionType compression_type,
+                                       MemoryAllocator* allocator,
+                                       SystemClock* clock,
+                                       Statistics* statistics,
+                                       std::unique_ptr<BlobContents>* result);
+
+  std::unique_ptr<RandomAccessFileReader> file_reader_;
+  uint64_t file_size_;
+  CompressionType compression_type_;
+  SystemClock* clock_;
+  Statistics* statistics_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader_test.cc b/src/rocksdb/db/blob/blob_file_reader_test.cc
new file mode 100644
index 000000000..03458e2b5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader_test.cc
@@ -0,0 +1,1024 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_reader.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with `num` blobs in it.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+                   uint32_t column_family_id, bool has_ttl,
+                   const ExpirationRange& expiration_range_header,
+                   const ExpirationRange& expiration_range_footer,
+                   uint64_t blob_file_number, const std::vector<Slice>& keys,
+                   const std::vector<Slice>& blobs, CompressionType compression,
+                   std::vector<uint64_t>& blob_offsets,
+                   std::vector<uint64_t>& blob_sizes) {
+  assert(!immutable_options.cf_paths.empty());
+  size_t num = keys.size();
+  assert(num == blobs.size());
+  assert(num == blob_offsets.size());
+  assert(num == blob_sizes.size());
+
+  const std::string blob_file_path =
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+  std::unique_ptr<FSWritableFile> file;
+  ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+                            FileOptions()));
+
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+  constexpr Statistics* statistics = nullptr;
+  constexpr bool use_fsync = false;
+  constexpr bool do_flush = false;
+
+  BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+                                statistics, blob_file_number, use_fsync,
+                                do_flush);
+
+  BlobLogHeader header(column_family_id, compression, has_ttl,
+                       expiration_range_header);
+
+  ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+  std::vector<std::string> compressed_blobs(num);
+  std::vector<Slice> blobs_to_write(num);
+  if (kNoCompression == compression) {
+    for (size_t i = 0; i < num; ++i) {
+      blobs_to_write[i] = blobs[i];
+      blob_sizes[i] = blobs[i].size();
+    }
+  } else {
+    CompressionOptions opts;
+    CompressionContext context(compression);
+    constexpr uint64_t sample_for_compression = 0;
+    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                         compression, sample_for_compression);
+
+    constexpr uint32_t compression_format_version = 2;
+
+    for (size_t i = 0; i < num; ++i) {
+      ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
+                               &compressed_blobs[i]));
+      blobs_to_write[i] = compressed_blobs[i];
+      blob_sizes[i] = compressed_blobs[i].size();
+    }
+  }
+
+  for (size_t i = 0; i < num; ++i) {
+    uint64_t key_offset = 0;
+    ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset,
+                                        &blob_offsets[i]));
+  }
+
+  BlobLogFooter footer;
+  footer.blob_count = num;
+  footer.expiration_range = expiration_range_footer;
+
+  std::string checksum_method;
+  std::string checksum_value;
+  ASSERT_OK(
+      blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+// Creates a test blob file with a single blob in it. Note: this method
+// makes it possible to test various corner cases by allowing the caller
+// to specify the contents of various blob file header/footer fields.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+                   uint32_t column_family_id, bool has_ttl,
+                   const ExpirationRange& expiration_range_header,
+                   const ExpirationRange& expiration_range_footer,
+                   uint64_t blob_file_number, const Slice& key,
+                   const Slice& blob, CompressionType compression,
+                   uint64_t* blob_offset, uint64_t* blob_size) {
+  std::vector<Slice> keys{key};
+  std::vector<Slice> blobs{blob};
+  std::vector<uint64_t> blob_offsets{0};
+  std::vector<uint64_t> blob_sizes{0};
+  WriteBlobFile(immutable_options, column_family_id, has_ttl,
+                expiration_range_header, expiration_range_footer,
+                blob_file_number, keys, blobs, compression, blob_offsets,
+                blob_sizes);
+  if (blob_offset) {
+    *blob_offset = blob_offsets[0];
+  }
+  if (blob_size) {
+    *blob_size = blob_sizes[0];
+  }
+}
+
+}  // anonymous namespace
+
+class BlobFileReaderTest : public testing::Test {
+ protected:
+  BlobFileReaderTest() { mock_env_.reset(MockEnv::Create(Env::Default())); }
+  std::unique_ptr<Env> mock_env_;
+};
+
+TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileReaderTest_CreateReaderAndGetBlob"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr size_t num_blobs = 3;
+  const std::vector<std::string> key_strs = {"key1", "key2", "key3"};
+  const std::vector<std::string> blob_strs = {"blob1", "blob2", "blob3"};
+
+  const std::vector<Slice> keys = {key_strs[0], key_strs[1], key_strs[2]};
+  const std::vector<Slice> blobs = {blob_strs[0], blob_strs[1], blob_strs[2]};
+
+  std::vector<uint64_t> blob_offsets(keys.size());
+  std::vector<uint64_t> blob_sizes(keys.size());
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, keys, blobs, kNoCompression,
+                blob_offsets, blob_sizes);
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  ASSERT_OK(BlobFileReader::Create(
+      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+      blob_file_number, nullptr /*IOTracer*/, &reader));
+
+  // Make sure the blob can be retrieved with and without checksum verification
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+  constexpr MemoryAllocator* allocator = nullptr;
+
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_OK(reader->GetBlob(read_options, keys[0], blob_offsets[0],
+                              blob_sizes[0], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read));
+    ASSERT_NE(value, nullptr);
+    ASSERT_EQ(value->data(), blobs[0]);
+    ASSERT_EQ(bytes_read, blob_sizes[0]);
+
+    // MultiGetBlob
+    bytes_read = 0;
+    size_t total_size = 0;
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<BlobReadRequest, num_blobs> requests_buf;
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      requests_buf[i] =
+          BlobReadRequest(keys[i], blob_offsets[i], blob_sizes[i],
+                          kNoCompression, nullptr, &statuses_buf[i]);
+      blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+    }
+
+    reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      const auto& result = blob_reqs[i].second;
+
+      ASSERT_OK(statuses_buf[i]);
+      ASSERT_NE(result, nullptr);
+      ASSERT_EQ(result->data(), blobs[i]);
+      total_size += blob_sizes[i];
+    }
+    ASSERT_EQ(bytes_read, total_size);
+  }
+
+  read_options.verify_checksums = true;
+
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_OK(reader->GetBlob(read_options, keys[1], blob_offsets[1],
+                              blob_sizes[1], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read));
+    ASSERT_NE(value, nullptr);
+    ASSERT_EQ(value->data(), blobs[1]);
+
+    const uint64_t key_size = keys[1].size();
+    ASSERT_EQ(bytes_read,
+              BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+                  blob_sizes[1]);
+  }
+
+  // Invalid offset (too close to start of file)
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, keys[0], blob_offsets[0] - 1,
+                              blob_sizes[0], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+  }
+
+  // Invalid offset (too close to end of file)
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, keys[2], blob_offsets[2] + 1,
+                              blob_sizes[2], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+  }
+
+  // Incorrect compression type
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, keys[0], blob_offsets[0],
+                              blob_sizes[0], kZSTD, prefetch_buffer, allocator,
+                              &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+  }
+
+  // Incorrect key size
+  {
+    constexpr char shorter_key[] = "k";
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, shorter_key,
+                              blob_offsets[0] -
+                                  (keys[0].size() - sizeof(shorter_key) + 1),
+                              blob_sizes[0], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+
+    // MultiGetBlob
+    autovector<std::reference_wrapper<const Slice>> key_refs;
+    for (const auto& key_ref : keys) {
+      key_refs.emplace_back(std::cref(key_ref));
+    }
+    Slice shorter_key_slice(shorter_key, sizeof(shorter_key) - 1);
+    key_refs[1] = std::cref(shorter_key_slice);
+
+    autovector<uint64_t> offsets{
+        blob_offsets[0],
+        blob_offsets[1] - (keys[1].size() - key_refs[1].get().size()),
+        blob_offsets[2]};
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<BlobReadRequest, num_blobs> requests_buf;
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      requests_buf[i] =
+          BlobReadRequest(key_refs[i], offsets[i], blob_sizes[i],
+                          kNoCompression, nullptr, &statuses_buf[i]);
+      blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+    }
+
+    reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (i == 1) {
+        ASSERT_TRUE(statuses_buf[i].IsCorruption());
+      } else {
+        ASSERT_OK(statuses_buf[i]);
+      }
+    }
+  }
+
+  // Incorrect key
+  {
+    constexpr char incorrect_key[] = "foo1";
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, incorrect_key, blob_offsets[0],
+                              blob_sizes[0], kNoCompression, prefetch_buffer,
+                              allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+
+    // MultiGetBlob
+    autovector<std::reference_wrapper<const Slice>> key_refs;
+    for (const auto& key_ref : keys) {
+      key_refs.emplace_back(std::cref(key_ref));
+    }
+    Slice wrong_key_slice(incorrect_key, sizeof(incorrect_key) - 1);
+    key_refs[2] = std::cref(wrong_key_slice);
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<BlobReadRequest, num_blobs> requests_buf;
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      requests_buf[i] =
+          BlobReadRequest(key_refs[i], blob_offsets[i], blob_sizes[i],
+                          kNoCompression, nullptr, &statuses_buf[i]);
+      blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+    }
+
+    reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (i == num_blobs - 1) {
+        ASSERT_TRUE(statuses_buf[i].IsCorruption());
+      } else {
+        ASSERT_OK(statuses_buf[i]);
+      }
+    }
+  }
+
+  // Incorrect value size
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(read_options, keys[1], blob_offsets[1],
+                              blob_sizes[1] + 1, kNoCompression,
+                              prefetch_buffer, allocator, &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+
+    // MultiGetBlob
+    autovector<std::reference_wrapper<const Slice>> key_refs;
+    for (const auto& key_ref : keys) {
+      key_refs.emplace_back(std::cref(key_ref));
+    }
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<BlobReadRequest, num_blobs> requests_buf;
+
+    requests_buf[0] =
+        BlobReadRequest(key_refs[0], blob_offsets[0], blob_sizes[0],
+                        kNoCompression, nullptr, &statuses_buf[0]);
+    requests_buf[1] =
+        BlobReadRequest(key_refs[1], blob_offsets[1], blob_sizes[1] + 1,
+                        kNoCompression, nullptr, &statuses_buf[1]);
+    requests_buf[2] =
+        BlobReadRequest(key_refs[2], blob_offsets[2], blob_sizes[2],
+                        kNoCompression, nullptr, &statuses_buf[2]);
+
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+    }
+
+    reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (i != 1) {
+        ASSERT_OK(statuses_buf[i]);
+      } else {
+        ASSERT_TRUE(statuses_buf[i].IsCorruption());
+      }
+    }
+  }
+}
+
+TEST_F(BlobFileReaderTest, Malformed) {
+  // Write a blob file consisting of nothing but a header, and make sure we
+  // detect the error when we open it for reading
+
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Malformed"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr uint64_t blob_file_number = 1;
+
+  {
+    constexpr bool has_ttl = false;
+    constexpr ExpirationRange expiration_range;
+
+    const std::string blob_file_path =
+        BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+
+    std::unique_ptr<FSWritableFile> file;
+    ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+                              FileOptions()));
+
+    std::unique_ptr<WritableFileWriter> file_writer(
+        new WritableFileWriter(std::move(file), blob_file_path, FileOptions(),
+                               immutable_options.clock));
+
+    constexpr Statistics* statistics = nullptr;
+    constexpr bool use_fsync = false;
+    constexpr bool do_flush = false;
+
+    BlobLogWriter blob_log_writer(std::move(file_writer),
+                                  immutable_options.clock, statistics,
+                                  blob_file_number, use_fsync, do_flush);
+
+    BlobLogHeader header(column_family_id, kNoCompression, has_ttl,
+                         expiration_range);
+
+    ASSERT_OK(blob_log_writer.WriteHeader(header));
+  }
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+                                     column_family_id, blob_file_read_hist,
+                                     blob_file_number, nullptr /*IOTracer*/,
+                                     &reader)
+                  .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, TTL) {
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_TTL"), 0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = true;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  uint64_t blob_offset = 0;
+  uint64_t blob_size = 0;
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, key, blob, kNoCompression,
+                &blob_offset, &blob_size);
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+                                     column_family_id, blob_file_read_hist,
+                                     blob_file_number, nullptr /*IOTracer*/,
+                                     &reader)
+                  .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) {
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileReaderTest_ExpirationRangeInHeader"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  const ExpirationRange expiration_range_header(
+      1, 2);  // can be made constexpr when we adopt C++14
+  constexpr ExpirationRange expiration_range_footer;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  uint64_t blob_offset = 0;
+  uint64_t blob_size = 0;
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl,
+                expiration_range_header, expiration_range_footer,
+                blob_file_number, key, blob, kNoCompression, &blob_offset,
+                &blob_size);
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+                                     column_family_id, blob_file_read_hist,
+                                     blob_file_number, nullptr /*IOTracer*/,
+                                     &reader)
+                  .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) {
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileReaderTest_ExpirationRangeInFooter"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range_header;
+  const ExpirationRange expiration_range_footer(
+      1, 2);  // can be made constexpr when we adopt C++14
+  constexpr uint64_t blob_file_number = 1;
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  uint64_t blob_offset = 0;
+  uint64_t blob_size = 0;
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl,
+                expiration_range_header, expiration_range_footer,
+                blob_file_number, key, blob, kNoCompression, &blob_offset,
+                &blob_size);
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+                                     column_family_id, blob_file_read_hist,
+                                     blob_file_number, nullptr /*IOTracer*/,
+                                     &reader)
+                  .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, IncorrectColumnFamily) {
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileReaderTest_IncorrectColumnFamily"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  uint64_t blob_offset = 0;
+  uint64_t blob_size = 0;
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, key, blob, kNoCompression,
+                &blob_offset, &blob_size);
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  constexpr uint32_t incorrect_column_family_id = 2;
+
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+                                     incorrect_column_family_id,
+                                     blob_file_read_hist, blob_file_number,
+                                     nullptr /*IOTracer*/, &reader)
+                  .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, BlobCRCError) {
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_BlobCRCError"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  uint64_t blob_offset = 0;
+  uint64_t blob_size = 0;
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, key, blob, kNoCompression,
+                &blob_offset, &blob_size);
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  ASSERT_OK(BlobFileReader::Create(
+      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+      blob_file_number, nullptr /*IOTracer*/, &reader));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) {
+        BlobLogRecord* const record = static_cast<BlobLogRecord*>(arg);
+        assert(record);
+
+        record->blob_crc = 0xfaceb00c;
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+  constexpr MemoryAllocator* allocator = nullptr;
+
+  std::unique_ptr<BlobContents> value;
+  uint64_t bytes_read = 0;
+
+  ASSERT_TRUE(reader
+                  ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+                            kNoCompression, prefetch_buffer, allocator, &value,
+                            &bytes_read)
+                  .IsCorruption());
+  ASSERT_EQ(value, nullptr);
+  ASSERT_EQ(bytes_read, 0);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileReaderTest, Compression) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Compression"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  uint64_t blob_offset = 0;
+  uint64_t blob_size = 0;
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, key, blob,
+                kSnappyCompression, &blob_offset, &blob_size);
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  ASSERT_OK(BlobFileReader::Create(
+      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+      blob_file_number, nullptr /*IOTracer*/, &reader));
+
+  // Make sure the blob can be retrieved with and without checksum verification
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+  constexpr MemoryAllocator* allocator = nullptr;
+
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
+                              kSnappyCompression, prefetch_buffer, allocator,
+                              &value, &bytes_read));
+    ASSERT_NE(value, nullptr);
+    ASSERT_EQ(value->data(), blob);
+    ASSERT_EQ(bytes_read, blob_size);
+  }
+
+  read_options.verify_checksums = true;
+
+  {
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
+                              kSnappyCompression, prefetch_buffer, allocator,
+                              &value, &bytes_read));
+    ASSERT_NE(value, nullptr);
+    ASSERT_EQ(value->data(), blob);
+
+    constexpr uint64_t key_size = sizeof(key) - 1;
+    ASSERT_EQ(bytes_read,
+              BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+                  blob_size);
+  }
+}
+
+TEST_F(BlobFileReaderTest, UncompressionError) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileReaderTest_UncompressionError"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  uint64_t blob_offset = 0;
+  uint64_t blob_size = 0;
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, key, blob,
+                kSnappyCompression, &blob_offset, &blob_size);
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  ASSERT_OK(BlobFileReader::Create(
+      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+      blob_file_number, nullptr /*IOTracer*/, &reader));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) {
+        CacheAllocationPtr* const output =
+            static_cast<CacheAllocationPtr*>(arg);
+        assert(output);
+
+        output->reset();
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+  constexpr MemoryAllocator* allocator = nullptr;
+
+  std::unique_ptr<BlobContents> value;
+  uint64_t bytes_read = 0;
+
+  ASSERT_TRUE(reader
+                  ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+                            kSnappyCompression, prefetch_buffer, allocator,
+                            &value, &bytes_read)
+                  .IsCorruption());
+  ASSERT_EQ(value, nullptr);
+  ASSERT_EQ(bytes_read, 0);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class BlobFileReaderIOErrorTest
+    : public testing::Test,
+      public testing::WithParamInterface<std::string> {
+ protected:
+  BlobFileReaderIOErrorTest() : sync_point_(GetParam()) {
+    mock_env_.reset(MockEnv::Create(Env::Default()));
+    fault_injection_env_.reset(new FaultInjectionTestEnv(mock_env_.get()));
+  }
+
+  std::unique_ptr<Env> mock_env_;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderIOErrorTest,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileReader::OpenFile:GetFileSize",
+                            "BlobFileReader::OpenFile:NewRandomAccessFile",
+                            "BlobFileReader::ReadHeader:ReadFromFile",
+                            "BlobFileReader::ReadFooter:ReadFromFile",
+                            "BlobFileReader::GetBlob:ReadFromFile"}));
+
+TEST_P(BlobFileReaderIOErrorTest, IOError) {
+  // Simulates an I/O error during the specified step
+
+  Options options;
+  options.env = fault_injection_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(fault_injection_env_.get(),
+                            "BlobFileReaderIOErrorTest_IOError"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  uint64_t blob_offset = 0;
+  uint64_t blob_size = 0;
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, key, blob, kNoCompression,
+                &blob_offset, &blob_size);
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+    fault_injection_env_->SetFilesystemActive(false,
+                                              Status::IOError(sync_point_));
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  const Status s = BlobFileReader::Create(
+      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+      blob_file_number, nullptr /*IOTracer*/, &reader);
+
+  const bool fail_during_create =
+      (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile");
+
+  if (fail_during_create) {
+    ASSERT_TRUE(s.IsIOError());
+  } else {
+    ASSERT_OK(s);
+
+    constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+    constexpr MemoryAllocator* allocator = nullptr;
+
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+                              kNoCompression, prefetch_buffer, allocator,
+                              &value, &bytes_read)
+                    .IsIOError());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class BlobFileReaderDecodingErrorTest
+    : public testing::Test,
+      public testing::WithParamInterface<std::string> {
+ protected:
+  BlobFileReaderDecodingErrorTest() : sync_point_(GetParam()) {
+    mock_env_.reset(MockEnv::Create(Env::Default()));
+  }
+
+  std::unique_ptr<Env> mock_env_;
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderDecodingErrorTest,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileReader::ReadHeader:TamperWithResult",
+                            "BlobFileReader::ReadFooter:TamperWithResult",
+                            "BlobFileReader::GetBlob:TamperWithResult"}));
+
+TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) {
+  Options options;
+  options.env = mock_env_.get();
+  options.cf_paths.emplace_back(
+      test::PerThreadDBPath(mock_env_.get(),
+                            "BlobFileReaderDecodingErrorTest_DecodingError"),
+      0);
+  options.enable_blob_files = true;
+
+  ImmutableOptions immutable_options(options);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  uint64_t blob_offset = 0;
+  uint64_t blob_size = 0;
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, key, blob, kNoCompression,
+                &blob_offset, &blob_size);
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) {
+    Slice* const slice = static_cast<Slice*>(arg);
+    assert(slice);
+    assert(!slice->empty());
+
+    slice->remove_prefix(1);
+  });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileReader> reader;
+
+  const Status s = BlobFileReader::Create(
+      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+      blob_file_number, nullptr /*IOTracer*/, &reader);
+
+  const bool fail_during_create =
+      sync_point_ != "BlobFileReader::GetBlob:TamperWithResult";
+
+  if (fail_during_create) {
+    ASSERT_TRUE(s.IsCorruption());
+  } else {
+    ASSERT_OK(s);
+
+    constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+    constexpr MemoryAllocator* allocator = nullptr;
+
+    std::unique_ptr<BlobContents> value;
+    uint64_t bytes_read = 0;
+
+    ASSERT_TRUE(reader
+                    ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+                              kNoCompression, prefetch_buffer, allocator,
+                              &value, &bytes_read)
+                    .IsCorruption());
+    ASSERT_EQ(value, nullptr);
+    ASSERT_EQ(bytes_read, 0);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_garbage_meter.cc b/src/rocksdb/db/blob/blob_garbage_meter.cc
new file mode 100644
index 000000000..d328d7ff4
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter.cc
@@ -0,0 +1,100 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_garbage_meter.h"
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobGarbageMeter::ProcessInFlow(const Slice& key, const Slice& value) {
+  uint64_t blob_file_number = kInvalidBlobFileNumber;
+  uint64_t bytes = 0;
+
+  const Status s = Parse(key, value, &blob_file_number, &bytes);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (blob_file_number == kInvalidBlobFileNumber) {
+    return Status::OK();
+  }
+
+  flows_[blob_file_number].AddInFlow(bytes);
+
+  return Status::OK();
+}
+
+Status BlobGarbageMeter::ProcessOutFlow(const Slice& key, const Slice& value) {
+  uint64_t blob_file_number = kInvalidBlobFileNumber;
+  uint64_t bytes = 0;
+
+  const Status s = Parse(key, value, &blob_file_number, &bytes);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (blob_file_number == kInvalidBlobFileNumber) {
+    return Status::OK();
+  }
+
+  // Note: in order to measure the amount of additional garbage, we only need to
+  // track the outflow for preexisting files, i.e. those that also had inflow.
+  // (Newly written files would only have outflow.)
+  auto it = flows_.find(blob_file_number);
+  if (it == flows_.end()) {
+    return Status::OK();
+  }
+
+  it->second.AddOutFlow(bytes);
+
+  return Status::OK();
+}
+
+Status BlobGarbageMeter::Parse(const Slice& key, const Slice& value,
+                               uint64_t* blob_file_number, uint64_t* bytes) {
+  assert(blob_file_number);
+  assert(*blob_file_number == kInvalidBlobFileNumber);
+  assert(bytes);
+  assert(*bytes == 0);
+
+  ParsedInternalKey ikey;
+
+  {
+    constexpr bool log_err_key = false;
+    const Status s = ParseInternalKey(key, &ikey, log_err_key);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (ikey.type != kTypeBlobIndex) {
+    return Status::OK();
+  }
+
+  BlobIndex blob_index;
+
+  {
+    const Status s = blob_index.DecodeFrom(value);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (blob_index.IsInlined() || blob_index.HasTTL()) {
+    return Status::Corruption("Unexpected TTL/inlined blob index");
+  }
+
+  *blob_file_number = blob_index.file_number();
+  *bytes =
+      blob_index.size() +
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(ikey.user_key.size());
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter.h b/src/rocksdb/db/blob/blob_garbage_meter.h
new file mode 100644
index 000000000..a6c04b0b2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter.h
@@ -0,0 +1,102 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <unordered_map>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// A class that can be used to compute the amount of additional garbage
+// generated by a compaction. It parses the keys and blob references in the
+// input and output of a compaction, and aggregates the "inflow" and "outflow"
+// on a per-blob file basis. The amount of additional garbage for any given blob
+// file can then be computed by subtracting the outflow from the inflow.
+class BlobGarbageMeter {
+ public:
+  // A class to store the number and total size of blobs on a per-blob file
+  // basis.
+  class BlobStats {
+   public:
+    void Add(uint64_t bytes) {
+      ++count_;
+      bytes_ += bytes;
+    }
+    void Add(uint64_t count, uint64_t bytes) {
+      count_ += count;
+      bytes_ += bytes;
+    }
+
+    uint64_t GetCount() const { return count_; }
+    uint64_t GetBytes() const { return bytes_; }
+
+   private:
+    uint64_t count_ = 0;
+    uint64_t bytes_ = 0;
+  };
+
+  // A class to keep track of the "inflow" and the "outflow" and to compute the
+  // amount of additional garbage for a given blob file.
+  class BlobInOutFlow {
+   public:
+    void AddInFlow(uint64_t bytes) {
+      in_flow_.Add(bytes);
+      assert(IsValid());
+    }
+    void AddOutFlow(uint64_t bytes) {
+      out_flow_.Add(bytes);
+      assert(IsValid());
+    }
+
+    const BlobStats& GetInFlow() const { return in_flow_; }
+    const BlobStats& GetOutFlow() const { return out_flow_; }
+
+    bool IsValid() const {
+      return in_flow_.GetCount() >= out_flow_.GetCount() &&
+             in_flow_.GetBytes() >= out_flow_.GetBytes();
+    }
+    bool HasGarbage() const {
+      assert(IsValid());
+      return in_flow_.GetCount() > out_flow_.GetCount();
+    }
+    uint64_t GetGarbageCount() const {
+      assert(IsValid());
+      assert(HasGarbage());
+      return in_flow_.GetCount() - out_flow_.GetCount();
+    }
+    uint64_t GetGarbageBytes() const {
+      assert(IsValid());
+      assert(HasGarbage());
+      return in_flow_.GetBytes() - out_flow_.GetBytes();
+    }
+
+   private:
+    BlobStats in_flow_;
+    BlobStats out_flow_;
+  };
+
+  Status ProcessInFlow(const Slice& key, const Slice& value);
+  Status ProcessOutFlow(const Slice& key, const Slice& value);
+
+  const std::unordered_map<uint64_t, BlobInOutFlow>& flows() const {
+    return flows_;
+  }
+
+ private:
+  static Status Parse(const Slice& key, const Slice& value,
+                      uint64_t* blob_file_number, uint64_t* bytes);
+
+  std::unordered_map<uint64_t, BlobInOutFlow> flows_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter_test.cc b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
new file mode 100644
index 000000000..ba53f06f1
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
@@ -0,0 +1,197 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_garbage_meter.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(BlobGarbageMeterTest, MeasureGarbage) {
+  BlobGarbageMeter blob_garbage_meter;
+
+  struct BlobDescriptor {
+    std::string user_key;
+    uint64_t blob_file_number;
+    uint64_t offset;
+    uint64_t size;
+    CompressionType compression_type;
+    bool has_in_flow;
+    bool has_out_flow;
+
+    uint64_t GetExpectedBytes() const {
+      return size +
+             BlobLogRecord::CalculateAdjustmentForRecordHeader(user_key.size());
+    }
+  };
+
+  // Note: blob file 4 has the same inflow and outflow and hence no additional
+  // garbage. Blob file 5 has less outflow than inflow and thus it does have
+  // additional garbage. Blob file 6 is a newly written file (i.e. no inflow,
+  // only outflow) and is thus not tracked by the meter.
+  std::vector<BlobDescriptor> blobs{
+      {"key", 4, 1234, 555, kLZ4Compression, true, true},
+      {"other_key", 4, 6789, 101010, kLZ4Compression, true, true},
+      {"yet_another_key", 5, 22222, 3456, kLZ4Compression, true, true},
+      {"foo_key", 5, 77777, 8888, kLZ4Compression, true, true},
+      {"bar_key", 5, 999999, 1212, kLZ4Compression, true, false},
+      {"baz_key", 5, 1234567, 890, kLZ4Compression, true, false},
+      {"new_key", 6, 7777, 9999, kNoCompression, false, true}};
+
+  for (const auto& blob : blobs) {
+    constexpr SequenceNumber seq = 123;
+    const InternalKey key(blob.user_key, seq, kTypeBlobIndex);
+    const Slice key_slice = key.Encode();
+
+    std::string value;
+    BlobIndex::EncodeBlob(&value, blob.blob_file_number, blob.offset, blob.size,
+                          blob.compression_type);
+    const Slice value_slice(value);
+
+    if (blob.has_in_flow) {
+      ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+    }
+    if (blob.has_out_flow) {
+      ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+    }
+  }
+
+  const auto& flows = blob_garbage_meter.flows();
+  ASSERT_EQ(flows.size(), 2);
+
+  {
+    const auto it = flows.find(4);
+    ASSERT_NE(it, flows.end());
+
+    const auto& flow = it->second;
+
+    constexpr uint64_t expected_count = 2;
+    const uint64_t expected_bytes =
+        blobs[0].GetExpectedBytes() + blobs[1].GetExpectedBytes();
+
+    const auto& in = flow.GetInFlow();
+    ASSERT_EQ(in.GetCount(), expected_count);
+    ASSERT_EQ(in.GetBytes(), expected_bytes);
+
+    const auto& out = flow.GetOutFlow();
+    ASSERT_EQ(out.GetCount(), expected_count);
+    ASSERT_EQ(out.GetBytes(), expected_bytes);
+
+    ASSERT_TRUE(flow.IsValid());
+    ASSERT_FALSE(flow.HasGarbage());
+  }
+
+  {
+    const auto it = flows.find(5);
+    ASSERT_NE(it, flows.end());
+
+    const auto& flow = it->second;
+
+    const auto& in = flow.GetInFlow();
+
+    constexpr uint64_t expected_in_count = 4;
+    const uint64_t expected_in_bytes =
+        blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes() +
+        blobs[4].GetExpectedBytes() + blobs[5].GetExpectedBytes();
+
+    ASSERT_EQ(in.GetCount(), expected_in_count);
+    ASSERT_EQ(in.GetBytes(), expected_in_bytes);
+
+    const auto& out = flow.GetOutFlow();
+
+    constexpr uint64_t expected_out_count = 2;
+    const uint64_t expected_out_bytes =
+        blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes();
+
+    ASSERT_EQ(out.GetCount(), expected_out_count);
+    ASSERT_EQ(out.GetBytes(), expected_out_bytes);
+
+    ASSERT_TRUE(flow.IsValid());
+    ASSERT_TRUE(flow.HasGarbage());
+    ASSERT_EQ(flow.GetGarbageCount(), expected_in_count - expected_out_count);
+    ASSERT_EQ(flow.GetGarbageBytes(), expected_in_bytes - expected_out_bytes);
+  }
+}
+
+TEST(BlobGarbageMeterTest, PlainValue) {
+  constexpr char user_key[] = "user_key";
+  constexpr SequenceNumber seq = 123;
+
+  const InternalKey key(user_key, seq, kTypeValue);
+  const Slice key_slice = key.Encode();
+
+  constexpr char value[] = "value";
+  const Slice value_slice(value);
+
+  BlobGarbageMeter blob_garbage_meter;
+
+  ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+  ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+  ASSERT_TRUE(blob_garbage_meter.flows().empty());
+}
+
+TEST(BlobGarbageMeterTest, CorruptInternalKey) {
+  constexpr char corrupt_key[] = "i_am_corrupt";
+  const Slice key_slice(corrupt_key);
+
+  constexpr char value[] = "value";
+  const Slice value_slice(value);
+
+  BlobGarbageMeter blob_garbage_meter;
+
+  ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+  ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+TEST(BlobGarbageMeterTest, CorruptBlobIndex) {
+  constexpr char user_key[] = "user_key";
+  constexpr SequenceNumber seq = 123;
+
+  const InternalKey key(user_key, seq, kTypeBlobIndex);
+  const Slice key_slice = key.Encode();
+
+  constexpr char value[] = "i_am_not_a_blob_index";
+  const Slice value_slice(value);
+
+  BlobGarbageMeter blob_garbage_meter;
+
+  ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+  ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+TEST(BlobGarbageMeterTest, InlinedTTLBlobIndex) {
+  constexpr char user_key[] = "user_key";
+  constexpr SequenceNumber seq = 123;
+
+  const InternalKey key(user_key, seq, kTypeBlobIndex);
+  const Slice key_slice = key.Encode();
+
+  constexpr uint64_t expiration = 1234567890;
+  constexpr char inlined_value[] = "inlined";
+
+  std::string value;
+  BlobIndex::EncodeInlinedTTL(&value, expiration, inlined_value);
+
+  const Slice value_slice(value);
+
+  BlobGarbageMeter blob_garbage_meter;
+
+  ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+  ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_index.h b/src/rocksdb/db/blob/blob_index.h
new file mode 100644
index 000000000..e9944d784
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_index.h
@@ -0,0 +1,187 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <sstream>
+#include <string>
+
+#include "rocksdb/compression_type.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// BlobIndex is a pointer to the blob and metadata of the blob. The index is
+// stored in base DB as ValueType::kTypeBlobIndex.
+// There are three types of blob index:
+//
+//    kInlinedTTL:
+//      +------+------------+---------------+
+//      | type | expiration | value         |
+//      +------+------------+---------------+
+//      | char | varint64   | variable size |
+//      +------+------------+---------------+
+//
+//    kBlob:
+//      +------+-------------+----------+----------+-------------+
+//      | type | file number | offset   | size     | compression |
+//      +------+-------------+----------+----------+-------------+
+//      | char | varint64    | varint64 | varint64 | char        |
+//      +------+-------------+----------+----------+-------------+
+//
+//    kBlobTTL:
+//      +------+------------+-------------+----------+----------+-------------+
+//      | type | expiration | file number | offset   | size     | compression |
+//      +------+------------+-------------+----------+----------+-------------+
+//      | char | varint64   | varint64    | varint64 | varint64 | char        |
+//      +------+------------+-------------+----------+----------+-------------+
+//
+// There isn't a kInlined (without TTL) type since we can store it as a plain
+// value (i.e. ValueType::kTypeValue).
+class BlobIndex {
+ public:
+  enum class Type : unsigned char {
+    kInlinedTTL = 0,
+    kBlob = 1,
+    kBlobTTL = 2,
+    kUnknown = 3,
+  };
+
+  BlobIndex() : type_(Type::kUnknown) {}
+
+  BlobIndex(const BlobIndex&) = default;
+  BlobIndex& operator=(const BlobIndex&) = default;
+
+  bool IsInlined() const { return type_ == Type::kInlinedTTL; }
+
+  bool HasTTL() const {
+    return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL;
+  }
+
+  uint64_t expiration() const {
+    assert(HasTTL());
+    return expiration_;
+  }
+
+  const Slice& value() const {
+    assert(IsInlined());
+    return value_;
+  }
+
+  uint64_t file_number() const {
+    assert(!IsInlined());
+    return file_number_;
+  }
+
+  uint64_t offset() const {
+    assert(!IsInlined());
+    return offset_;
+  }
+
+  uint64_t size() const {
+    assert(!IsInlined());
+    return size_;
+  }
+
+  CompressionType compression() const {
+    assert(!IsInlined());
+    return compression_;
+  }
+
+  Status DecodeFrom(Slice slice) {
+    const char* kErrorMessage = "Error while decoding blob index";
+    assert(slice.size() > 0);
+    type_ = static_cast<Type>(*slice.data());
+    if (type_ >= Type::kUnknown) {
+      return Status::Corruption(kErrorMessage,
+                                "Unknown blob index type: " +
+                                    std::to_string(static_cast<char>(type_)));
+    }
+    slice = Slice(slice.data() + 1, slice.size() - 1);
+    if (HasTTL()) {
+      if (!GetVarint64(&slice, &expiration_)) {
+        return Status::Corruption(kErrorMessage, "Corrupted expiration");
+      }
+    }
+    if (IsInlined()) {
+      value_ = slice;
+    } else {
+      if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) &&
+          GetVarint64(&slice, &size_) && slice.size() == 1) {
+        compression_ = static_cast<CompressionType>(*slice.data());
+      } else {
+        return Status::Corruption(kErrorMessage, "Corrupted blob offset");
+      }
+    }
+    return Status::OK();
+  }
+
+  std::string DebugString(bool output_hex) const {
+    std::ostringstream oss;
+
+    if (IsInlined()) {
+      oss << "[inlined blob] value:" << value_.ToString(output_hex);
+    } else {
+      oss << "[blob ref] file:" << file_number_ << " offset:" << offset_
+          << " size:" << size_
+          << " compression: " << CompressionTypeToString(compression_);
+    }
+
+    if (HasTTL()) {
+      oss << " exp:" << expiration_;
+    }
+
+    return oss.str();
+  }
+
+  static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
+                               const Slice& value) {
+    assert(dst != nullptr);
+    dst->clear();
+    dst->reserve(1 + kMaxVarint64Length + value.size());
+    dst->push_back(static_cast<char>(Type::kInlinedTTL));
+    PutVarint64(dst, expiration);
+    dst->append(value.data(), value.size());
+  }
+
+  static void EncodeBlob(std::string* dst, uint64_t file_number,
+                         uint64_t offset, uint64_t size,
+                         CompressionType compression) {
+    assert(dst != nullptr);
+    dst->clear();
+    dst->reserve(kMaxVarint64Length * 3 + 2);
+    dst->push_back(static_cast<char>(Type::kBlob));
+    PutVarint64(dst, file_number);
+    PutVarint64(dst, offset);
+    PutVarint64(dst, size);
+    dst->push_back(static_cast<char>(compression));
+  }
+
+  static void EncodeBlobTTL(std::string* dst, uint64_t expiration,
+                            uint64_t file_number, uint64_t offset,
+                            uint64_t size, CompressionType compression) {
+    assert(dst != nullptr);
+    dst->clear();
+    dst->reserve(kMaxVarint64Length * 4 + 2);
+    dst->push_back(static_cast<char>(Type::kBlobTTL));
+    PutVarint64(dst, expiration);
+    PutVarint64(dst, file_number);
+    PutVarint64(dst, offset);
+    PutVarint64(dst, size);
+    dst->push_back(static_cast<char>(compression));
+  }
+
+ private:
+  Type type_ = Type::kUnknown;
+  uint64_t expiration_ = 0;
+  Slice value_;
+  uint64_t file_number_ = 0;
+  uint64_t offset_ = 0;
+  uint64_t size_ = 0;
+  CompressionType compression_ = kNoCompression;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_format.cc b/src/rocksdb/db/blob/blob_log_format.cc
new file mode 100644
index 000000000..8e26281e3
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_format.cc
@@ -0,0 +1,143 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "db/blob/blob_log_format.h"
+
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void BlobLogHeader::EncodeTo(std::string* dst) {
+  assert(dst != nullptr);
+  dst->clear();
+  dst->reserve(BlobLogHeader::kSize);
+  PutFixed32(dst, kMagicNumber);
+  PutFixed32(dst, version);
+  PutFixed32(dst, column_family_id);
+  unsigned char flags = (has_ttl ? 1 : 0);
+  dst->push_back(flags);
+  dst->push_back(compression);
+  PutFixed64(dst, expiration_range.first);
+  PutFixed64(dst, expiration_range.second);
+}
+
+Status BlobLogHeader::DecodeFrom(Slice src) {
+  const char* kErrorMessage = "Error while decoding blob log header";
+  if (src.size() != BlobLogHeader::kSize) {
+    return Status::Corruption(kErrorMessage,
+                              "Unexpected blob file header size");
+  }
+  uint32_t magic_number;
+  unsigned char flags;
+  if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) ||
+      !GetFixed32(&src, &column_family_id)) {
+    return Status::Corruption(
+        kErrorMessage,
+        "Error decoding magic number, version and column family id");
+  }
+  if (magic_number != kMagicNumber) {
+    return Status::Corruption(kErrorMessage, "Magic number mismatch");
+  }
+  if (version != kVersion1) {
+    return Status::Corruption(kErrorMessage, "Unknown header version");
+  }
+  flags = src.data()[0];
+  compression = static_cast<CompressionType>(src.data()[1]);
+  has_ttl = (flags & 1) == 1;
+  src.remove_prefix(2);
+  if (!GetFixed64(&src, &expiration_range.first) ||
+      !GetFixed64(&src, &expiration_range.second)) {
+    return Status::Corruption(kErrorMessage, "Error decoding expiration range");
+  }
+  return Status::OK();
+}
+
+void BlobLogFooter::EncodeTo(std::string* dst) {
+  assert(dst != nullptr);
+  dst->clear();
+  dst->reserve(BlobLogFooter::kSize);
+  PutFixed32(dst, kMagicNumber);
+  PutFixed64(dst, blob_count);
+  PutFixed64(dst, expiration_range.first);
+  PutFixed64(dst, expiration_range.second);
+  crc = crc32c::Value(dst->c_str(), dst->size());
+  crc = crc32c::Mask(crc);
+  PutFixed32(dst, crc);
+}
+
+Status BlobLogFooter::DecodeFrom(Slice src) {
+  const char* kErrorMessage = "Error while decoding blob log footer";
+  if (src.size() != BlobLogFooter::kSize) {
+    return Status::Corruption(kErrorMessage,
+                              "Unexpected blob file footer size");
+  }
+  uint32_t src_crc = 0;
+  src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - sizeof(uint32_t));
+  src_crc = crc32c::Mask(src_crc);
+  uint32_t magic_number = 0;
+  if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) ||
+      !GetFixed64(&src, &expiration_range.first) ||
+      !GetFixed64(&src, &expiration_range.second) || !GetFixed32(&src, &crc)) {
+    return Status::Corruption(kErrorMessage, "Error decoding content");
+  }
+  if (magic_number != kMagicNumber) {
+    return Status::Corruption(kErrorMessage, "Magic number mismatch");
+  }
+  if (src_crc != crc) {
+    return Status::Corruption(kErrorMessage, "CRC mismatch");
+  }
+  return Status::OK();
+}
+
+void BlobLogRecord::EncodeHeaderTo(std::string* dst) {
+  assert(dst != nullptr);
+  dst->clear();
+  dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size());
+  PutFixed64(dst, key.size());
+  PutFixed64(dst, value.size());
+  PutFixed64(dst, expiration);
+  header_crc = crc32c::Value(dst->c_str(), dst->size());
+  header_crc = crc32c::Mask(header_crc);
+  PutFixed32(dst, header_crc);
+  blob_crc = crc32c::Value(key.data(), key.size());
+  blob_crc = crc32c::Extend(blob_crc, value.data(), value.size());
+  blob_crc = crc32c::Mask(blob_crc);
+  PutFixed32(dst, blob_crc);
+}
+
+Status BlobLogRecord::DecodeHeaderFrom(Slice src) {
+  const char* kErrorMessage = "Error while decoding blob record";
+  if (src.size() != BlobLogRecord::kHeaderSize) {
+    return Status::Corruption(kErrorMessage,
+                              "Unexpected blob record header size");
+  }
+  uint32_t src_crc = 0;
+  src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8);
+  src_crc = crc32c::Mask(src_crc);
+  if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) ||
+      !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) ||
+      !GetFixed32(&src, &blob_crc)) {
+    return Status::Corruption(kErrorMessage, "Error decoding content");
+  }
+  if (src_crc != header_crc) {
+    return Status::Corruption(kErrorMessage, "Header CRC mismatch");
+  }
+  return Status::OK();
+}
+
+Status BlobLogRecord::CheckBlobCRC() const {
+  uint32_t expected_crc = 0;
+  expected_crc = crc32c::Value(key.data(), key.size());
+  expected_crc = crc32c::Extend(expected_crc, value.data(), value.size());
+  expected_crc = crc32c::Mask(expected_crc);
+  if (expected_crc != blob_crc) {
+    return Status::Corruption("Blob CRC mismatch");
+  }
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_format.h b/src/rocksdb/db/blob/blob_log_format.h
new file mode 100644
index 000000000..607db2367
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_format.h
@@ -0,0 +1,164 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Log format information shared by reader and writer.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint32_t kMagicNumber = 2395959;  // 0x00248f37
+constexpr uint32_t kVersion1 = 1;
+
+using ExpirationRange = std::pair<uint64_t, uint64_t>;
+
+// clang-format off
+
+// Format of blob log file header (30 bytes):
+//
+//    +--------------+---------+---------+-------+-------------+-------------------+
+//    | magic number | version |  cf id  | flags | compression | expiration range  |
+//    +--------------+---------+---------+-------+-------------+-------------------+
+//    |   Fixed32    | Fixed32 | Fixed32 | char  |    char     | Fixed64   Fixed64 |
+//    +--------------+---------+---------+-------+-------------+-------------------+
+//
+// List of flags:
+//   has_ttl: Whether the file contain TTL data.
+//
+// Expiration range in the header is a rough range based on
+// blob_db_options.ttl_range_secs.
+
+// clang-format on
+
+struct BlobLogHeader {
+  static constexpr size_t kSize = 30;
+
+  BlobLogHeader() = default;
+  BlobLogHeader(uint32_t _column_family_id, CompressionType _compression,
+                bool _has_ttl, const ExpirationRange& _expiration_range)
+      : column_family_id(_column_family_id),
+        compression(_compression),
+        has_ttl(_has_ttl),
+        expiration_range(_expiration_range) {}
+
+  uint32_t version = kVersion1;
+  uint32_t column_family_id = 0;
+  CompressionType compression = kNoCompression;
+  bool has_ttl = false;
+  ExpirationRange expiration_range;
+
+  void EncodeTo(std::string* dst);
+
+  Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Format of blob log file footer (32 bytes):
+//
+//    +--------------+------------+-------------------+------------+
+//    | magic number | blob count | expiration range  | footer CRC |
+//    +--------------+------------+-------------------+------------+
+//    |   Fixed32    |  Fixed64   | Fixed64 + Fixed64 |   Fixed32  |
+//    +--------------+------------+-------------------+------------+
+//
+// The footer will be presented only when the blob file is properly closed.
+//
+// Unlike the same field in file header, expiration range in the footer is the
+// range of smallest and largest expiration of the data in this file.
+
+// clang-format on
+
+struct BlobLogFooter {
+  static constexpr size_t kSize = 32;
+
+  uint64_t blob_count = 0;
+  ExpirationRange expiration_range = std::make_pair(0, 0);
+  uint32_t crc = 0;
+
+  void EncodeTo(std::string* dst);
+
+  Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Blob record format (32 bytes header + key + value):
+//
+//    +------------+--------------+------------+------------+----------+---------+-----------+
+//    | key length | value length | expiration | header CRC | blob CRC |   key   |   value   |
+//    +------------+--------------+------------+------------+----------+---------+-----------+
+//    |   Fixed64  |   Fixed64    |  Fixed64   |  Fixed32   | Fixed32  | key len | value len |
+//    +------------+--------------+------------+------------+----------+---------+-----------+
+//
+// If file has has_ttl = false, expiration field is always 0, and the blob
+// doesn't has expiration.
+//
+// Also note that if compression is used, value is compressed value and value
+// length is compressed value length.
+//
+// Header CRC is the checksum of (key_len + val_len + expiration), while
+// blob CRC is the checksum of (key + value).
+//
+// We could use variable length encoding (Varint64) to save more space, but it
+// make reader more complicated.
+
+// clang-format on
+
+struct BlobLogRecord {
+  // header include fields up to blob CRC
+  static constexpr size_t kHeaderSize = 32;
+
+  // Note that the offset field of BlobIndex actually points to the blob value
+  // as opposed to the start of the blob record. The following method can
+  // be used to calculate the adjustment needed to read the blob record header.
+  static constexpr uint64_t CalculateAdjustmentForRecordHeader(
+      uint64_t key_size) {
+    return key_size + kHeaderSize;
+  }
+
+  uint64_t key_size = 0;
+  uint64_t value_size = 0;
+  uint64_t expiration = 0;
+  uint32_t header_crc = 0;
+  uint32_t blob_crc = 0;
+  Slice key;
+  Slice value;
+  std::unique_ptr<char[]> key_buf;
+  std::unique_ptr<char[]> value_buf;
+
+  uint64_t record_size() const { return kHeaderSize + key_size + value_size; }
+
+  void EncodeHeaderTo(std::string* dst);
+
+  Status DecodeHeaderFrom(Slice src);
+
+  Status CheckBlobCRC() const;
+};
+
+// Checks whether a blob offset is potentially valid or not.
+inline bool IsValidBlobOffset(uint64_t value_offset, uint64_t key_size,
+                              uint64_t value_size, uint64_t file_size) {
+  if (value_offset <
+      BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key_size) {
+    return false;
+  }
+
+  if (value_offset + value_size + BlobLogFooter::kSize > file_size) {
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.cc b/src/rocksdb/db/blob/blob_log_sequential_reader.cc
new file mode 100644
index 000000000..778725189
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_sequential_reader.cc
@@ -0,0 +1,134 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "db/blob/blob_log_sequential_reader.h"
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/statistics.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobLogSequentialReader::BlobLogSequentialReader(
+    std::unique_ptr<RandomAccessFileReader>&& file_reader, SystemClock* clock,
+    Statistics* statistics)
+    : file_(std::move(file_reader)),
+      clock_(clock),
+      statistics_(statistics),
+      next_byte_(0) {}
+
+BlobLogSequentialReader::~BlobLogSequentialReader() = default;
+
+Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice,
+                                          char* buf) {
+  assert(slice);
+  assert(file_);
+
+  StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
+  // TODO: rate limit `BlobLogSequentialReader` reads (it appears unused?)
+  Status s =
+      file_->Read(IOOptions(), next_byte_, static_cast<size_t>(size), slice,
+                  buf, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+  next_byte_ += size;
+  if (!s.ok()) {
+    return s;
+  }
+  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size());
+  if (slice->size() != size) {
+    return Status::Corruption("EOF reached while reading record");
+  }
+  return s;
+}
+
+Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) {
+  assert(header);
+  assert(next_byte_ == 0);
+
+  static_assert(BlobLogHeader::kSize <= sizeof(header_buf_),
+                "Buffer is smaller than BlobLogHeader::kSize");
+
+  Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (buffer_.size() != BlobLogHeader::kSize) {
+    return Status::Corruption("EOF reached before file header");
+  }
+
+  return header->DecodeFrom(buffer_);
+}
+
+Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record,
+                                           ReadLevel level,
+                                           uint64_t* blob_offset) {
+  assert(record);
+  static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_),
+                "Buffer is smaller than BlobLogRecord::kHeaderSize");
+
+  Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_);
+  if (!s.ok()) {
+    return s;
+  }
+  if (buffer_.size() != BlobLogRecord::kHeaderSize) {
+    return Status::Corruption("EOF reached before record header");
+  }
+
+  s = record->DecodeHeaderFrom(buffer_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  uint64_t kb_size = record->key_size + record->value_size;
+  if (blob_offset != nullptr) {
+    *blob_offset = next_byte_ + record->key_size;
+  }
+
+  switch (level) {
+    case kReadHeader:
+      next_byte_ += kb_size;
+      break;
+
+    case kReadHeaderKey:
+      record->key_buf.reset(new char[record->key_size]);
+      s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
+      next_byte_ += record->value_size;
+      break;
+
+    case kReadHeaderKeyBlob:
+      record->key_buf.reset(new char[record->key_size]);
+      s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
+      if (s.ok()) {
+        record->value_buf.reset(new char[record->value_size]);
+        s = ReadSlice(record->value_size, &record->value,
+                      record->value_buf.get());
+      }
+      if (s.ok()) {
+        s = record->CheckBlobCRC();
+      }
+      break;
+  }
+  return s;
+}
+
+Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) {
+  assert(footer);
+  static_assert(BlobLogFooter::kSize <= sizeof(header_buf_),
+                "Buffer is smaller than BlobLogFooter::kSize");
+
+  Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (buffer_.size() != BlobLogFooter::kSize) {
+    return Status::Corruption("EOF reached before file footer");
+  }
+
+  return footer->DecodeFrom(buffer_);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.h b/src/rocksdb/db/blob/blob_log_sequential_reader.h
new file mode 100644
index 000000000..98afa8518
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_sequential_reader.h
@@ -0,0 +1,83 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <memory>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+#define MAX_HEADER_SIZE(a, b, c) (a > b ? (a > c ? a : c) : (b > c ? b : c))
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReader;
+class Env;
+class Statistics;
+class Status;
+class SystemClock;
+
+/**
+ * BlobLogSequentialReader is a general purpose log stream reader
+ * implementation. The actual job of reading from the device is implemented by
+ * the RandomAccessFileReader interface.
+ *
+ * Please see BlobLogWriter for details on the file and record layout.
+ */
+
+class BlobLogSequentialReader {
+ public:
+  enum ReadLevel {
+    kReadHeader,
+    kReadHeaderKey,
+    kReadHeaderKeyBlob,
+  };
+
+  // Create a reader that will return log records from "*file_reader".
+  BlobLogSequentialReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
+                          SystemClock* clock, Statistics* statistics);
+
+  // No copying allowed
+  BlobLogSequentialReader(const BlobLogSequentialReader&) = delete;
+  BlobLogSequentialReader& operator=(const BlobLogSequentialReader&) = delete;
+
+  ~BlobLogSequentialReader();
+
+  Status ReadHeader(BlobLogHeader* header);
+
+  // Read the next record into *record.  Returns true if read
+  // successfully, false if we hit end of the input. The contents filled in
+  // *record will only be valid until the next mutating operation on this
+  // reader.
+  // If blob_offset is non-null, return offset of the blob through it.
+  Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader,
+                    uint64_t* blob_offset = nullptr);
+
+  Status ReadFooter(BlobLogFooter* footer);
+
+  void ResetNextByte() { next_byte_ = 0; }
+
+  uint64_t GetNextByte() const { return next_byte_; }
+
+ private:
+  Status ReadSlice(uint64_t size, Slice* slice, char* buf);
+
+  const std::unique_ptr<RandomAccessFileReader> file_;
+  SystemClock* clock_;
+
+  Statistics* statistics_;
+
+  Slice buffer_;
+  char header_buf_[MAX_HEADER_SIZE(BlobLogHeader::kSize, BlobLogFooter::kSize,
+                                   BlobLogRecord::kHeaderSize)];
+
+  // which byte to read next
+  uint64_t next_byte_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#undef MAX_HEADER_SIZE
+\ No newline at end of file
diff --git a/src/rocksdb/db/blob/blob_log_writer.cc b/src/rocksdb/db/blob/blob_log_writer.cc
new file mode 100644
index 000000000..9dbac7f25
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_writer.cc
@@ -0,0 +1,178 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_log_writer.h"
+
+#include <cstdint>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobLogWriter::BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest,
+                             SystemClock* clock, Statistics* statistics,
+                             uint64_t log_number, bool use_fs, bool do_flush,
+                             uint64_t boffset)
+    : dest_(std::move(dest)),
+      clock_(clock),
+      statistics_(statistics),
+      log_number_(log_number),
+      block_offset_(boffset),
+      use_fsync_(use_fs),
+      do_flush_(do_flush),
+      last_elem_type_(kEtNone) {}
+
+BlobLogWriter::~BlobLogWriter() = default;
+
+Status BlobLogWriter::Sync() {
+  TEST_SYNC_POINT("BlobLogWriter::Sync");
+
+  StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS);
+  Status s = dest_->Sync(use_fsync_);
+  RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED);
+  return s;
+}
+
+Status BlobLogWriter::WriteHeader(BlobLogHeader& header) {
+  assert(block_offset_ == 0);
+  assert(last_elem_type_ == kEtNone);
+  std::string str;
+  header.EncodeTo(&str);
+
+  Status s = dest_->Append(Slice(str));
+  if (s.ok()) {
+    block_offset_ += str.size();
+    if (do_flush_) {
+      s = dest_->Flush();
+    }
+  }
+  last_elem_type_ = kEtFileHdr;
+  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+             BlobLogHeader::kSize);
+  return s;
+}
+
+Status BlobLogWriter::AppendFooter(BlobLogFooter& footer,
+                                   std::string* checksum_method,
+                                   std::string* checksum_value) {
+  assert(block_offset_ != 0);
+  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+  std::string str;
+  footer.EncodeTo(&str);
+
+  Status s;
+  if (dest_->seen_error()) {
+    s.PermitUncheckedError();
+    return Status::IOError("Seen Error. Skip closing.");
+  } else {
+    s = dest_->Append(Slice(str));
+    if (s.ok()) {
+      block_offset_ += str.size();
+
+      s = Sync();
+
+      if (s.ok()) {
+        s = dest_->Close();
+
+        if (s.ok()) {
+          assert(!!checksum_method == !!checksum_value);
+
+          if (checksum_method) {
+            assert(checksum_method->empty());
+
+            std::string method = dest_->GetFileChecksumFuncName();
+            if (method != kUnknownFileChecksumFuncName) {
+              *checksum_method = std::move(method);
+            }
+          }
+          if (checksum_value) {
+            assert(checksum_value->empty());
+
+            std::string value = dest_->GetFileChecksum();
+            if (value != kUnknownFileChecksum) {
+              *checksum_value = std::move(value);
+            }
+          }
+        }
+      }
+    }
+
+    dest_.reset();
+  }
+
+  last_elem_type_ = kEtFileFooter;
+  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+             BlobLogFooter::kSize);
+  return s;
+}
+
+Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val,
+                                uint64_t expiration, uint64_t* key_offset,
+                                uint64_t* blob_offset) {
+  assert(block_offset_ != 0);
+  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+  std::string buf;
+  ConstructBlobHeader(&buf, key, val, expiration);
+
+  Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+  return s;
+}
+
+Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val,
+                                uint64_t* key_offset, uint64_t* blob_offset) {
+  assert(block_offset_ != 0);
+  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+  std::string buf;
+  ConstructBlobHeader(&buf, key, val, 0);
+
+  Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+  return s;
+}
+
+void BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key,
+                                        const Slice& val, uint64_t expiration) {
+  BlobLogRecord record;
+  record.key = key;
+  record.value = val;
+  record.expiration = expiration;
+  record.EncodeHeaderTo(buf);
+}
+
+Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf,
+                                         const Slice& key, const Slice& val,
+                                         uint64_t* key_offset,
+                                         uint64_t* blob_offset) {
+  StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS);
+  Status s = dest_->Append(Slice(headerbuf));
+  if (s.ok()) {
+    s = dest_->Append(key);
+  }
+  if (s.ok()) {
+    s = dest_->Append(val);
+  }
+  if (do_flush_ && s.ok()) {
+    s = dest_->Flush();
+  }
+
+  *key_offset = block_offset_ + BlobLogRecord::kHeaderSize;
+  *blob_offset = *key_offset + key.size();
+  block_offset_ = *blob_offset + val.size();
+  last_elem_type_ = kEtRecord;
+  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+             BlobLogRecord::kHeaderSize + key.size() + val.size());
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_writer.h b/src/rocksdb/db/blob/blob_log_writer.h
new file mode 100644
index 000000000..c1f9f31ad
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_writer.h
@@ -0,0 +1,83 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+class SystemClock;
+/**
+ * BlobLogWriter is the blob log stream writer. It provides an append-only
+ * abstraction for writing blob data.
+ *
+ *
+ * Look at blob_db_format.h to see the details of the record formats.
+ */
+
+class BlobLogWriter {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this BlobLogWriter is in use.
+  BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest, SystemClock* clock,
+                Statistics* statistics, uint64_t log_number, bool use_fsync,
+                bool do_flush, uint64_t boffset = 0);
+  // No copying allowed
+  BlobLogWriter(const BlobLogWriter&) = delete;
+  BlobLogWriter& operator=(const BlobLogWriter&) = delete;
+
+  ~BlobLogWriter();
+
+  static void ConstructBlobHeader(std::string* buf, const Slice& key,
+                                  const Slice& val, uint64_t expiration);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
+                   uint64_t* blob_offset);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration,
+                   uint64_t* key_offset, uint64_t* blob_offset);
+
+  Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
+                            const Slice& val, uint64_t* key_offset,
+                            uint64_t* blob_offset);
+
+  Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method,
+                      std::string* checksum_value);
+
+  Status WriteHeader(BlobLogHeader& header);
+
+  WritableFileWriter* file() { return dest_.get(); }
+
+  const WritableFileWriter* file() const { return dest_.get(); }
+
+  uint64_t get_log_number() const { return log_number_; }
+
+  Status Sync();
+
+ private:
+  std::unique_ptr<WritableFileWriter> dest_;
+  SystemClock* clock_;
+  Statistics* statistics_;
+  uint64_t log_number_;
+  uint64_t block_offset_;  // Current offset in block
+  bool use_fsync_;
+  bool do_flush_;
+
+ public:
+  enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter };
+  ElemType last_elem_type_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_read_request.h b/src/rocksdb/db/blob/blob_read_request.h
new file mode 100644
index 000000000..f9668ca2e
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_read_request.h
@@ -0,0 +1,58 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A read Blob request structure for use in BlobSource::MultiGetBlob and
+// BlobFileReader::MultiGetBlob.
+struct BlobReadRequest {
+  // User key to lookup the paired blob
+  const Slice* user_key = nullptr;
+
+  // File offset in bytes
+  uint64_t offset = 0;
+
+  // Length to read in bytes
+  size_t len = 0;
+
+  // Blob compression type
+  CompressionType compression = kNoCompression;
+
+  // Output parameter set by MultiGetBlob() to point to the data buffer, and
+  // the number of valid bytes
+  PinnableSlice* result = nullptr;
+
+  // Status of read
+  Status* status = nullptr;
+
+  BlobReadRequest(const Slice& _user_key, uint64_t _offset, size_t _len,
+                  CompressionType _compression, PinnableSlice* _result,
+                  Status* _status)
+      : user_key(&_user_key),
+        offset(_offset),
+        len(_len),
+        compression(_compression),
+        result(_result),
+        status(_status) {}
+
+  BlobReadRequest() = default;
+  BlobReadRequest(const BlobReadRequest& other) = default;
+  BlobReadRequest& operator=(const BlobReadRequest& other) = default;
+};
+
+using BlobFileReadRequests =
+    std::tuple<uint64_t /* file_number */, uint64_t /* file_size */,
+               autovector<BlobReadRequest>>;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_source.cc b/src/rocksdb/db/blob/blob_source.cc
new file mode 100644
index 000000000..bfade2507
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.cc
@@ -0,0 +1,488 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <string>
+
+#include "cache/cache_reservation_manager.h"
+#include "cache/charged_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobSource::BlobSource(const ImmutableOptions* immutable_options,
+                       const std::string& db_id,
+                       const std::string& db_session_id,
+                       BlobFileCache* blob_file_cache)
+    : db_id_(db_id),
+      db_session_id_(db_session_id),
+      statistics_(immutable_options->statistics.get()),
+      blob_file_cache_(blob_file_cache),
+      blob_cache_(immutable_options->blob_cache),
+      lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
+#ifndef ROCKSDB_LITE
+  auto bbto =
+      immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
+  if (bbto &&
+      bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
+              .charged == CacheEntryRoleOptions::Decision::kEnabled) {
+    blob_cache_ = std::make_shared<ChargedCache>(immutable_options->blob_cache,
+                                                 bbto->block_cache);
+  }
+#endif  // ROCKSDB_LITE
+}
+
+BlobSource::~BlobSource() = default;
+
+Status BlobSource::GetBlobFromCache(
+    const Slice& cache_key, CacheHandleGuard<BlobContents>* cached_blob) const {
+  assert(blob_cache_);
+  assert(!cache_key.empty());
+  assert(cached_blob);
+  assert(cached_blob->IsEmpty());
+
+  Cache::Handle* cache_handle = nullptr;
+  cache_handle = GetEntryFromCache(cache_key);
+  if (cache_handle != nullptr) {
+    *cached_blob =
+        CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+    assert(cached_blob->GetValue());
+
+    PERF_COUNTER_ADD(blob_cache_hit_count, 1);
+    RecordTick(statistics_, BLOB_DB_CACHE_HIT);
+    RecordTick(statistics_, BLOB_DB_CACHE_BYTES_READ,
+               cached_blob->GetValue()->size());
+
+    return Status::OK();
+  }
+
+  RecordTick(statistics_, BLOB_DB_CACHE_MISS);
+
+  return Status::NotFound("Blob not found in cache");
+}
+
+Status BlobSource::PutBlobIntoCache(
+    const Slice& cache_key, std::unique_ptr<BlobContents>* blob,
+    CacheHandleGuard<BlobContents>* cached_blob) const {
+  assert(blob_cache_);
+  assert(!cache_key.empty());
+  assert(blob);
+  assert(*blob);
+  assert(cached_blob);
+  assert(cached_blob->IsEmpty());
+
+  Cache::Handle* cache_handle = nullptr;
+  const Status s = InsertEntryIntoCache(cache_key, blob->get(),
+                                        (*blob)->ApproximateMemoryUsage(),
+                                        &cache_handle, Cache::Priority::BOTTOM);
+  if (s.ok()) {
+    blob->release();
+
+    assert(cache_handle != nullptr);
+    *cached_blob =
+        CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+    assert(cached_blob->GetValue());
+
+    RecordTick(statistics_, BLOB_DB_CACHE_ADD);
+    RecordTick(statistics_, BLOB_DB_CACHE_BYTES_WRITE,
+               cached_blob->GetValue()->size());
+
+  } else {
+    RecordTick(statistics_, BLOB_DB_CACHE_ADD_FAILURES);
+  }
+
+  return s;
+}
+
+Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const {
+  Cache::Handle* cache_handle = nullptr;
+
+  if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
+    Cache::CreateCallback create_cb =
+        [allocator = blob_cache_->memory_allocator()](
+            const void* buf, size_t size, void** out_obj,
+            size_t* charge) -> Status {
+      return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf,
+                                          size, out_obj, charge);
+    };
+
+    cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(),
+                                       create_cb, Cache::Priority::BOTTOM,
+                                       true /* wait_for_cache */, statistics_);
+  } else {
+    cache_handle = blob_cache_->Lookup(key, statistics_);
+  }
+
+  return cache_handle;
+}
+
+void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+                               PinnableSlice* value) {
+  assert(cached_blob);
+  assert(cached_blob->GetValue());
+  assert(value);
+
+  // To avoid copying the cached blob into the buffer provided by the
+  // application, we can simply transfer ownership of the cache handle to
+  // the target PinnableSlice. This has the potential to save a lot of
+  // CPU, especially with large blob values.
+
+  value->Reset();
+
+  constexpr Cleanable* cleanable = nullptr;
+  value->PinSlice(cached_blob->GetValue()->data(), cleanable);
+
+  cached_blob->TransferTo(value);
+}
+
+void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+                              PinnableSlice* value) {
+  assert(owned_blob);
+  assert(*owned_blob);
+  assert(value);
+
+  BlobContents* const blob = owned_blob->release();
+  assert(blob);
+
+  value->Reset();
+  value->PinSlice(
+      blob->data(),
+      [](void* arg1, void* /* arg2 */) {
+        delete static_cast<BlobContents*>(arg1);
+      },
+      blob, nullptr);
+}
+
+Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value,
+                                        size_t charge,
+                                        Cache::Handle** cache_handle,
+                                        Cache::Priority priority) const {
+  Status s;
+
+  Cache::CacheItemHelper* const cache_item_helper =
+      BlobContents::GetCacheItemHelper();
+  assert(cache_item_helper);
+
+  if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
+    s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle,
+                            priority);
+  } else {
+    s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb,
+                            cache_handle, priority);
+  }
+
+  return s;
+}
+
+Status BlobSource::GetBlob(const ReadOptions& read_options,
+                           const Slice& user_key, uint64_t file_number,
+                           uint64_t offset, uint64_t file_size,
+                           uint64_t value_size,
+                           CompressionType compression_type,
+                           FilePrefetchBuffer* prefetch_buffer,
+                           PinnableSlice* value, uint64_t* bytes_read) {
+  assert(value);
+
+  Status s;
+
+  const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+
+  CacheHandleGuard<BlobContents> blob_handle;
+
+  // First, try to get the blob from the cache
+  //
+  // If blob cache is enabled, we'll try to read from it.
+  if (blob_cache_) {
+    Slice key = cache_key.AsSlice();
+    s = GetBlobFromCache(key, &blob_handle);
+    if (s.ok()) {
+      PinCachedBlob(&blob_handle, value);
+
+      // For consistency, the size of on-disk (possibly compressed) blob record
+      // is assigned to bytes_read.
+      uint64_t adjustment =
+          read_options.verify_checksums
+              ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
+                    user_key.size())
+              : 0;
+      assert(offset >= adjustment);
+
+      uint64_t record_size = value_size + adjustment;
+      if (bytes_read) {
+        *bytes_read = record_size;
+      }
+      return s;
+    }
+  }
+
+  assert(blob_handle.IsEmpty());
+
+  const bool no_io = read_options.read_tier == kBlockCacheTier;
+  if (no_io) {
+    s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
+    return s;
+  }
+
+  // Can't find the blob from the cache. Since I/O is allowed, read from the
+  // file.
+  std::unique_ptr<BlobContents> blob_contents;
+
+  {
+    CacheHandleGuard<BlobFileReader> blob_file_reader;
+    s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+    if (!s.ok()) {
+      return s;
+    }
+
+    assert(blob_file_reader.GetValue());
+
+    if (compression_type != blob_file_reader.GetValue()->GetCompressionType()) {
+      return Status::Corruption("Compression type mismatch when reading blob");
+    }
+
+    MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
+                                           ? blob_cache_->memory_allocator()
+                                           : nullptr;
+
+    uint64_t read_size = 0;
+    s = blob_file_reader.GetValue()->GetBlob(
+        read_options, user_key, offset, value_size, compression_type,
+        prefetch_buffer, allocator, &blob_contents, &read_size);
+    if (!s.ok()) {
+      return s;
+    }
+    if (bytes_read) {
+      *bytes_read = read_size;
+    }
+  }
+
+  if (blob_cache_ && read_options.fill_cache) {
+    // If filling cache is allowed and a cache is configured, try to put the
+    // blob to the cache.
+    Slice key = cache_key.AsSlice();
+    s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+    if (!s.ok()) {
+      return s;
+    }
+
+    PinCachedBlob(&blob_handle, value);
+  } else {
+    PinOwnedBlob(&blob_contents, value);
+  }
+
+  assert(s.ok());
+  return s;
+}
+
+void BlobSource::MultiGetBlob(const ReadOptions& read_options,
+                              autovector<BlobFileReadRequests>& blob_reqs,
+                              uint64_t* bytes_read) {
+  assert(blob_reqs.size() > 0);
+
+  uint64_t total_bytes_read = 0;
+  uint64_t bytes_read_in_file = 0;
+
+  for (auto& [file_number, file_size, blob_reqs_in_file] : blob_reqs) {
+    // sort blob_reqs_in_file by file offset.
+    std::sort(
+        blob_reqs_in_file.begin(), blob_reqs_in_file.end(),
+        [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool {
+          return lhs.offset < rhs.offset;
+        });
+
+    MultiGetBlobFromOneFile(read_options, file_number, file_size,
+                            blob_reqs_in_file, &bytes_read_in_file);
+
+    total_bytes_read += bytes_read_in_file;
+  }
+
+  if (bytes_read) {
+    *bytes_read = total_bytes_read;
+  }
+}
+
+void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
+                                         uint64_t file_number,
+                                         uint64_t /*file_size*/,
+                                         autovector<BlobReadRequest>& blob_reqs,
+                                         uint64_t* bytes_read) {
+  const size_t num_blobs = blob_reqs.size();
+  assert(num_blobs > 0);
+  assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
+
+#ifndef NDEBUG
+  for (size_t i = 0; i < num_blobs - 1; ++i) {
+    assert(blob_reqs[i].offset <= blob_reqs[i + 1].offset);
+  }
+#endif  // !NDEBUG
+
+  using Mask = uint64_t;
+  Mask cache_hit_mask = 0;
+
+  uint64_t total_bytes = 0;
+  const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+
+  if (blob_cache_) {
+    size_t cached_blob_count = 0;
+    for (size_t i = 0; i < num_blobs; ++i) {
+      auto& req = blob_reqs[i];
+
+      CacheHandleGuard<BlobContents> blob_handle;
+      const CacheKey cache_key = base_cache_key.WithOffset(req.offset);
+      const Slice key = cache_key.AsSlice();
+
+      const Status s = GetBlobFromCache(key, &blob_handle);
+
+      if (s.ok()) {
+        assert(req.status);
+        *req.status = s;
+
+        PinCachedBlob(&blob_handle, req.result);
+
+        // Update the counter for the number of valid blobs read from the cache.
+        ++cached_blob_count;
+
+        // For consistency, the size of each on-disk (possibly compressed) blob
+        // record is accumulated to total_bytes.
+        uint64_t adjustment =
+            read_options.verify_checksums
+                ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
+                      req.user_key->size())
+                : 0;
+        assert(req.offset >= adjustment);
+        total_bytes += req.len + adjustment;
+        cache_hit_mask |= (Mask{1} << i);  // cache hit
+      }
+    }
+
+    // All blobs were read from the cache.
+    if (cached_blob_count == num_blobs) {
+      if (bytes_read) {
+        *bytes_read = total_bytes;
+      }
+      return;
+    }
+  }
+
+  const bool no_io = read_options.read_tier == kBlockCacheTier;
+  if (no_io) {
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (!(cache_hit_mask & (Mask{1} << i))) {
+        BlobReadRequest& req = blob_reqs[i];
+        assert(req.status);
+
+        *req.status =
+            Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
+      }
+    }
+    return;
+  }
+
+  {
+    // Find the rest of blobs from the file since I/O is allowed.
+    autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+        _blob_reqs;
+    uint64_t _bytes_read = 0;
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (!(cache_hit_mask & (Mask{1} << i))) {
+        _blob_reqs.emplace_back(&blob_reqs[i], std::unique_ptr<BlobContents>());
+      }
+    }
+
+    CacheHandleGuard<BlobFileReader> blob_file_reader;
+    Status s =
+        blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+    if (!s.ok()) {
+      for (size_t i = 0; i < _blob_reqs.size(); ++i) {
+        BlobReadRequest* const req = _blob_reqs[i].first;
+        assert(req);
+        assert(req->status);
+
+        *req->status = s;
+      }
+      return;
+    }
+
+    assert(blob_file_reader.GetValue());
+
+    MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
+                                           ? blob_cache_->memory_allocator()
+                                           : nullptr;
+
+    blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
+                                              _blob_reqs, &_bytes_read);
+
+    if (blob_cache_ && read_options.fill_cache) {
+      // If filling cache is allowed and a cache is configured, try to put
+      // the blob(s) to the cache.
+      for (auto& [req, blob_contents] : _blob_reqs) {
+        assert(req);
+
+        if (req->status->ok()) {
+          CacheHandleGuard<BlobContents> blob_handle;
+          const CacheKey cache_key = base_cache_key.WithOffset(req->offset);
+          const Slice key = cache_key.AsSlice();
+          s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+          if (!s.ok()) {
+            *req->status = s;
+          } else {
+            PinCachedBlob(&blob_handle, req->result);
+          }
+        }
+      }
+    } else {
+      for (auto& [req, blob_contents] : _blob_reqs) {
+        assert(req);
+
+        if (req->status->ok()) {
+          PinOwnedBlob(&blob_contents, req->result);
+        }
+      }
+    }
+
+    total_bytes += _bytes_read;
+    if (bytes_read) {
+      *bytes_read = total_bytes;
+    }
+  }
+}
+
+bool BlobSource::TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+                                  uint64_t offset, size_t* charge) const {
+  const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+  const Slice key = cache_key.AsSlice();
+
+  CacheHandleGuard<BlobContents> blob_handle;
+  const Status s = GetBlobFromCache(key, &blob_handle);
+
+  if (s.ok() && blob_handle.GetValue() != nullptr) {
+    if (charge) {
+      const Cache* const cache = blob_handle.GetCache();
+      assert(cache);
+
+      Cache::Handle* const handle = blob_handle.GetCacheHandle();
+      assert(handle);
+
+      *charge = cache->GetUsage(handle);
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_source.h b/src/rocksdb/db/blob/blob_source.h
new file mode 100644
index 000000000..2ed296eeb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.h
@@ -0,0 +1,153 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "cache/cache_helpers.h"
+#include "cache/cache_key.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_read_request.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableOptions;
+class Status;
+class FilePrefetchBuffer;
+class Slice;
+class BlobContents;
+
+// BlobSource is a class that provides universal access to blobs, regardless of
+// whether they are in the blob cache, secondary cache, or (remote) storage.
+// Depending on user settings, it always fetch blobs from multi-tier cache and
+// storage with minimal cost.
+class BlobSource {
+ public:
+  BlobSource(const ImmutableOptions* immutable_options,
+             const std::string& db_id, const std::string& db_session_id,
+             BlobFileCache* blob_file_cache);
+
+  BlobSource(const BlobSource&) = delete;
+  BlobSource& operator=(const BlobSource&) = delete;
+
+  ~BlobSource();
+
+  // Read a blob from the underlying cache or one blob file.
+  //
+  // If successful, returns ok and sets "*value" to the newly retrieved
+  // uncompressed blob. If there was an error while fetching the blob, sets
+  // "*value" to empty and returns a non-ok status.
+  //
+  // Note: For consistency, whether the blob is found in the cache or on disk,
+  // sets "*bytes_read" to the size of on-disk (possibly compressed) blob
+  // record.
+  Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                 uint64_t file_number, uint64_t offset, uint64_t file_size,
+                 uint64_t value_size, CompressionType compression_type,
+                 FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+                 uint64_t* bytes_read);
+
+  // Read multiple blobs from the underlying cache or blob file(s).
+  //
+  // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+  // to the newly retrieved uncompressed blobs. If there was an error while
+  // fetching one of blobs, sets its "result" to empty and sets its
+  // corresponding "status" to a non-ok status.
+  //
+  // Note:
+  //  - The main difference between this function and MultiGetBlobFromOneFile is
+  //    that this function can read multiple blobs from multiple blob files.
+  //
+  //  - For consistency, whether the blob is found in the cache or on disk, sets
+  //  "*bytes_read" to the total size of on-disk (possibly compressed) blob
+  //  records.
+  void MultiGetBlob(const ReadOptions& read_options,
+                    autovector<BlobFileReadRequests>& blob_reqs,
+                    uint64_t* bytes_read);
+
+  // Read multiple blobs from the underlying cache or one blob file.
+  //
+  // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+  // to the newly retrieved uncompressed blobs. If there was an error while
+  // fetching one of blobs, sets its "result" to empty and sets its
+  // corresponding "status" to a non-ok status.
+  //
+  // Note:
+  //  - The main difference between this function and MultiGetBlob is that this
+  //  function is only used for the case where the demanded blobs are stored in
+  //  one blob file. MultiGetBlob will call this function multiple times if the
+  //  demanded blobs are stored in multiple blob files.
+  //
+  //  - For consistency, whether the blob is found in the cache or on disk, sets
+  //  "*bytes_read" to the total size of on-disk (possibly compressed) blob
+  //  records.
+  void MultiGetBlobFromOneFile(const ReadOptions& read_options,
+                               uint64_t file_number, uint64_t file_size,
+                               autovector<BlobReadRequest>& blob_reqs,
+                               uint64_t* bytes_read);
+
+  inline Status GetBlobFileReader(
+      uint64_t blob_file_number,
+      CacheHandleGuard<BlobFileReader>* blob_file_reader) {
+    return blob_file_cache_->GetBlobFileReader(blob_file_number,
+                                               blob_file_reader);
+  }
+
+  inline Cache* GetBlobCache() const { return blob_cache_.get(); }
+
+  bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+                        uint64_t offset, size_t* charge = nullptr) const;
+
+ private:
+  Status GetBlobFromCache(const Slice& cache_key,
+                          CacheHandleGuard<BlobContents>* cached_blob) const;
+
+  Status PutBlobIntoCache(const Slice& cache_key,
+                          std::unique_ptr<BlobContents>* blob,
+                          CacheHandleGuard<BlobContents>* cached_blob) const;
+
+  static void PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+                            PinnableSlice* value);
+
+  static void PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+                           PinnableSlice* value);
+
+  Cache::Handle* GetEntryFromCache(const Slice& key) const;
+
+  Status InsertEntryIntoCache(const Slice& key, BlobContents* value,
+                              size_t charge, Cache::Handle** cache_handle,
+                              Cache::Priority priority) const;
+
+  inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/,
+                              uint64_t offset) const {
+    OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+    return base_cache_key.WithOffset(offset);
+  }
+
+  const std::string& db_id_;
+  const std::string& db_session_id_;
+
+  Statistics* statistics_;
+
+  // A cache to store blob file reader.
+  BlobFileCache* blob_file_cache_;
+
+  // A cache to store uncompressed blobs.
+  std::shared_ptr<Cache> blob_cache_;
+
+  // The control option of how the cache tiers will be used. Currently rocksdb
+  // support block/blob cache (volatile tier) and secondary cache (this tier
+  // isn't strictly speaking a non-volatile tier since the compressed cache in
+  // this tier is in volatile memory).
+  const CacheTier lowest_used_cache_tier_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_source_test.cc b/src/rocksdb/db/blob/blob_source_test.cc
new file mode 100644
index 000000000..a85ed8646
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source_test.cc
@@ -0,0 +1,1624 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "cache/charged_cache.h"
+#include "cache/compressed_secondary_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/db_test_util.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+#include "util/compression.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with `num` blobs in it.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+                   uint32_t column_family_id, bool has_ttl,
+                   const ExpirationRange& expiration_range_header,
+                   const ExpirationRange& expiration_range_footer,
+                   uint64_t blob_file_number, const std::vector<Slice>& keys,
+                   const std::vector<Slice>& blobs, CompressionType compression,
+                   std::vector<uint64_t>& blob_offsets,
+                   std::vector<uint64_t>& blob_sizes) {
+  assert(!immutable_options.cf_paths.empty());
+  size_t num = keys.size();
+  assert(num == blobs.size());
+  assert(num == blob_offsets.size());
+  assert(num == blob_sizes.size());
+
+  const std::string blob_file_path =
+      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+  std::unique_ptr<FSWritableFile> file;
+  ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+                            FileOptions()));
+
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+  constexpr Statistics* statistics = nullptr;
+  constexpr bool use_fsync = false;
+  constexpr bool do_flush = false;
+
+  BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+                                statistics, blob_file_number, use_fsync,
+                                do_flush);
+
+  BlobLogHeader header(column_family_id, compression, has_ttl,
+                       expiration_range_header);
+
+  ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+  std::vector<std::string> compressed_blobs(num);
+  std::vector<Slice> blobs_to_write(num);
+  if (kNoCompression == compression) {
+    for (size_t i = 0; i < num; ++i) {
+      blobs_to_write[i] = blobs[i];
+      blob_sizes[i] = blobs[i].size();
+    }
+  } else {
+    CompressionOptions opts;
+    CompressionContext context(compression);
+    constexpr uint64_t sample_for_compression = 0;
+    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                         compression, sample_for_compression);
+
+    constexpr uint32_t compression_format_version = 2;
+
+    for (size_t i = 0; i < num; ++i) {
+      ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
+                               &compressed_blobs[i]));
+      blobs_to_write[i] = compressed_blobs[i];
+      blob_sizes[i] = compressed_blobs[i].size();
+    }
+  }
+
+  for (size_t i = 0; i < num; ++i) {
+    uint64_t key_offset = 0;
+    ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset,
+                                        &blob_offsets[i]));
+  }
+
+  BlobLogFooter footer;
+  footer.blob_count = num;
+  footer.expiration_range = expiration_range_footer;
+
+  std::string checksum_method;
+  std::string checksum_value;
+  ASSERT_OK(
+      blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+}  // anonymous namespace
+
+class BlobSourceTest : public DBTestBase {
+ protected:
+ public:
+  explicit BlobSourceTest()
+      : DBTestBase("blob_source_test", /*env_do_fsync=*/true) {
+    options_.env = env_;
+    options_.enable_blob_files = true;
+    options_.create_if_missing = true;
+
+    LRUCacheOptions co;
+    co.capacity = 8 << 20;
+    co.num_shard_bits = 2;
+    co.metadata_charge_policy = kDontChargeCacheMetadata;
+    co.high_pri_pool_ratio = 0.2;
+    co.low_pri_pool_ratio = 0.2;
+    options_.blob_cache = NewLRUCache(co);
+    options_.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+    assert(db_->GetDbIdentity(db_id_).ok());
+    assert(db_->GetDbSessionId(db_session_id_).ok());
+  }
+
+  Options options_;
+  std::string db_id_;
+  std::string db_session_id_;
+};
+
+TEST_F(BlobSourceTest, GetBlobsFromCache) {
+  options_.cf_paths.emplace_back(
+      test::PerThreadDBPath(env_, "BlobSourceTest_GetBlobsFromCache"), 0);
+
+  options_.statistics = CreateDBStatistics();
+  Statistics* statistics = options_.statistics.get();
+  assert(statistics);
+
+  DestroyAndReopen(options_);
+
+  ImmutableOptions immutable_options(options_);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr size_t num_blobs = 16;
+
+  std::vector<std::string> key_strs;
+  std::vector<std::string> blob_strs;
+
+  for (size_t i = 0; i < num_blobs; ++i) {
+    key_strs.push_back("key" + std::to_string(i));
+    blob_strs.push_back("blob" + std::to_string(i));
+  }
+
+  std::vector<Slice> keys;
+  std::vector<Slice> blobs;
+
+  uint64_t file_size = BlobLogHeader::kSize;
+  for (size_t i = 0; i < num_blobs; ++i) {
+    keys.push_back({key_strs[i]});
+    blobs.push_back({blob_strs[i]});
+    file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+  }
+  file_size += BlobLogFooter::kSize;
+
+  std::vector<uint64_t> blob_offsets(keys.size());
+  std::vector<uint64_t> blob_sizes(keys.size());
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, keys, blobs, kNoCompression,
+                blob_offsets, blob_sizes);
+
+  constexpr size_t capacity = 1024;
+  std::shared_ptr<Cache> backing_cache =
+      NewLRUCache(capacity);  // Blob file cache
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileCache> blob_file_cache =
+      std::make_unique<BlobFileCache>(
+          backing_cache.get(), &immutable_options, &file_options,
+          column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+  BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+                         blob_file_cache.get());
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+
+  {
+    // GetBlob
+    std::vector<PinnableSlice> values(keys.size());
+    uint64_t bytes_read = 0;
+    uint64_t blob_bytes = 0;
+    uint64_t total_bytes = 0;
+
+    read_options.fill_cache = false;
+    get_perf_context()->Reset();
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                blob_offsets[i]));
+
+      ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+                                    blob_offsets[i], file_size, blob_sizes[i],
+                                    kNoCompression, prefetch_buffer, &values[i],
+                                    &bytes_read));
+      ASSERT_EQ(values[i], blobs[i]);
+      ASSERT_TRUE(values[i].IsPinned());
+      ASSERT_EQ(bytes_read,
+                BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                blob_offsets[i]));
+      total_bytes += bytes_read;
+    }
+
+    // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+    // GetBlob, and TEST_BlobInCache.
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs);
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+    ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+
+    read_options.fill_cache = true;
+    blob_bytes = 0;
+    total_bytes = 0;
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                blob_offsets[i]));
+
+      ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+                                    blob_offsets[i], file_size, blob_sizes[i],
+                                    kNoCompression, prefetch_buffer, &values[i],
+                                    &bytes_read));
+      ASSERT_EQ(values[i], blobs[i]);
+      ASSERT_TRUE(values[i].IsPinned());
+      ASSERT_EQ(bytes_read,
+                BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+      blob_bytes += blob_sizes[i];
+      total_bytes += bytes_read;
+      ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i);
+      ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1);
+      ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                               blob_offsets[i]));
+
+      ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i + 1);
+      ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1);
+      ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+    }
+
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs);
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_blobs);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), blob_bytes);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+              blob_bytes);
+
+    read_options.fill_cache = true;
+    total_bytes = 0;
+    blob_bytes = 0;
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                               blob_offsets[i]));
+
+      ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+                                    blob_offsets[i], file_size, blob_sizes[i],
+                                    kNoCompression, prefetch_buffer, &values[i],
+                                    &bytes_read));
+      ASSERT_EQ(values[i], blobs[i]);
+      ASSERT_TRUE(values[i].IsPinned());
+      ASSERT_EQ(bytes_read,
+                BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                               blob_offsets[i]));
+      total_bytes += bytes_read;    // on-disk blob record size
+      blob_bytes += blob_sizes[i];  // cached blob value size
+    }
+
+    // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+    // GetBlob, and TEST_BlobInCache.
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);  // without i/o
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);   // without i/o
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+              blob_bytes * 3);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+
+    // Cache-only GetBlob
+    read_options.read_tier = ReadTier::kBlockCacheTier;
+    total_bytes = 0;
+    blob_bytes = 0;
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                               blob_offsets[i]));
+
+      ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+                                    blob_offsets[i], file_size, blob_sizes[i],
+                                    kNoCompression, prefetch_buffer, &values[i],
+                                    &bytes_read));
+      ASSERT_EQ(values[i], blobs[i]);
+      ASSERT_TRUE(values[i].IsPinned());
+      ASSERT_EQ(bytes_read,
+                BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                               blob_offsets[i]));
+      total_bytes += bytes_read;
+      blob_bytes += blob_sizes[i];
+    }
+
+    // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+    // GetBlob, and TEST_BlobInCache.
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);  // without i/o
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);   // without i/o
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+              blob_bytes * 3);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+  }
+
+  options_.blob_cache->EraseUnRefEntries();
+
+  {
+    // Cache-only GetBlob
+    std::vector<PinnableSlice> values(keys.size());
+    uint64_t bytes_read = 0;
+
+    read_options.read_tier = ReadTier::kBlockCacheTier;
+    read_options.fill_cache = true;
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                blob_offsets[i]));
+
+      ASSERT_TRUE(blob_source
+                      .GetBlob(read_options, keys[i], blob_file_number,
+                               blob_offsets[i], file_size, blob_sizes[i],
+                               kNoCompression, prefetch_buffer, &values[i],
+                               &bytes_read)
+                      .IsIncomplete());
+      ASSERT_TRUE(values[i].empty());
+      ASSERT_FALSE(values[i].IsPinned());
+      ASSERT_EQ(bytes_read, 0);
+
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                blob_offsets[i]));
+    }
+
+    // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+    // GetBlob, and TEST_BlobInCache.
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+  }
+
+  {
+    // GetBlob from non-existing file
+    std::vector<PinnableSlice> values(keys.size());
+    uint64_t bytes_read = 0;
+    uint64_t file_number = 100;  // non-existing file
+
+    read_options.read_tier = ReadTier::kReadAllTier;
+    read_options.fill_cache = true;
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                                blob_offsets[i]));
+
+      ASSERT_TRUE(blob_source
+                      .GetBlob(read_options, keys[i], file_number,
+                               blob_offsets[i], file_size, blob_sizes[i],
+                               kNoCompression, prefetch_buffer, &values[i],
+                               &bytes_read)
+                      .IsIOError());
+      ASSERT_TRUE(values[i].empty());
+      ASSERT_FALSE(values[i].IsPinned());
+      ASSERT_EQ(bytes_read, 0);
+
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                                blob_offsets[i]));
+    }
+
+    // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+    // GetBlob, and TEST_BlobInCache.
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+  }
+}
+
+TEST_F(BlobSourceTest, GetCompressedBlobs) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  const CompressionType compression = kSnappyCompression;
+
+  options_.cf_paths.emplace_back(
+      test::PerThreadDBPath(env_, "BlobSourceTest_GetCompressedBlobs"), 0);
+
+  DestroyAndReopen(options_);
+
+  ImmutableOptions immutable_options(options_);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr size_t num_blobs = 256;
+
+  std::vector<std::string> key_strs;
+  std::vector<std::string> blob_strs;
+
+  for (size_t i = 0; i < num_blobs; ++i) {
+    key_strs.push_back("key" + std::to_string(i));
+    blob_strs.push_back("blob" + std::to_string(i));
+  }
+
+  std::vector<Slice> keys;
+  std::vector<Slice> blobs;
+
+  for (size_t i = 0; i < num_blobs; ++i) {
+    keys.push_back({key_strs[i]});
+    blobs.push_back({blob_strs[i]});
+  }
+
+  std::vector<uint64_t> blob_offsets(keys.size());
+  std::vector<uint64_t> blob_sizes(keys.size());
+
+  constexpr size_t capacity = 1024;
+  auto backing_cache = NewLRUCache(capacity);  // Blob file cache
+
+  FileOptions file_options;
+  std::unique_ptr<BlobFileCache> blob_file_cache =
+      std::make_unique<BlobFileCache>(
+          backing_cache.get(), &immutable_options, &file_options,
+          column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/);
+
+  BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+                         blob_file_cache.get());
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+
+  uint64_t bytes_read = 0;
+  std::vector<PinnableSlice> values(keys.size());
+
+  {
+    // Snappy Compression
+    const uint64_t file_number = 1;
+
+    read_options.read_tier = ReadTier::kReadAllTier;
+
+    WriteBlobFile(immutable_options, column_family_id, has_ttl,
+                  expiration_range, expiration_range, file_number, keys, blobs,
+                  compression, blob_offsets, blob_sizes);
+
+    CacheHandleGuard<BlobFileReader> blob_file_reader;
+    ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader));
+    ASSERT_NE(blob_file_reader.GetValue(), nullptr);
+
+    const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
+    ASSERT_EQ(blob_file_reader.GetValue()->GetCompressionType(), compression);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_NE(blobs[i].size() /*uncompressed size*/,
+                blob_sizes[i] /*compressed size*/);
+    }
+
+    read_options.fill_cache = true;
+    read_options.read_tier = ReadTier::kReadAllTier;
+    get_perf_context()->Reset();
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                                blob_offsets[i]));
+      ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number,
+                                    blob_offsets[i], file_size, blob_sizes[i],
+                                    compression, nullptr /*prefetch_buffer*/,
+                                    &values[i], &bytes_read));
+      ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/);
+      ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/);
+      ASSERT_EQ(bytes_read,
+                BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                               blob_offsets[i]));
+    }
+
+    ASSERT_GE((int)get_perf_context()->blob_decompress_time, 0);
+
+    read_options.read_tier = ReadTier::kBlockCacheTier;
+    get_perf_context()->Reset();
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                               blob_offsets[i]));
+
+      // Compressed blob size is passed in GetBlob
+      ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number,
+                                    blob_offsets[i], file_size, blob_sizes[i],
+                                    compression, nullptr /*prefetch_buffer*/,
+                                    &values[i], &bytes_read));
+      ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/);
+      ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/);
+      ASSERT_EQ(bytes_read,
+                BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                               blob_offsets[i]));
+    }
+
+    ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+  }
+}
+
+TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) {
+  options_.cf_paths.emplace_back(
+      test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromMultiFiles"),
+      0);
+
+  options_.statistics = CreateDBStatistics();
+  Statistics* statistics = options_.statistics.get();
+  assert(statistics);
+
+  DestroyAndReopen(options_);
+
+  ImmutableOptions immutable_options(options_);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_files = 2;
+  constexpr size_t num_blobs = 32;
+
+  std::vector<std::string> key_strs;
+  std::vector<std::string> blob_strs;
+
+  for (size_t i = 0; i < num_blobs; ++i) {
+    key_strs.push_back("key" + std::to_string(i));
+    blob_strs.push_back("blob" + std::to_string(i));
+  }
+
+  std::vector<Slice> keys;
+  std::vector<Slice> blobs;
+
+  uint64_t file_size = BlobLogHeader::kSize;
+  uint64_t blob_value_bytes = 0;
+  for (size_t i = 0; i < num_blobs; ++i) {
+    keys.push_back({key_strs[i]});
+    blobs.push_back({blob_strs[i]});
+    blob_value_bytes += blobs[i].size();
+    file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+  }
+  file_size += BlobLogFooter::kSize;
+  const uint64_t blob_records_bytes =
+      file_size - BlobLogHeader::kSize - BlobLogFooter::kSize;
+
+  std::vector<uint64_t> blob_offsets(keys.size());
+  std::vector<uint64_t> blob_sizes(keys.size());
+
+  {
+    // Write key/blob pairs to multiple blob files.
+    for (size_t i = 0; i < blob_files; ++i) {
+      const uint64_t file_number = i + 1;
+      WriteBlobFile(immutable_options, column_family_id, has_ttl,
+                    expiration_range, expiration_range, file_number, keys,
+                    blobs, kNoCompression, blob_offsets, blob_sizes);
+    }
+  }
+
+  constexpr size_t capacity = 10;
+  std::shared_ptr<Cache> backing_cache =
+      NewLRUCache(capacity);  // Blob file cache
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileCache> blob_file_cache =
+      std::make_unique<BlobFileCache>(
+          backing_cache.get(), &immutable_options, &file_options,
+          column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+  BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+                         blob_file_cache.get());
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+
+  uint64_t bytes_read = 0;
+
+  {
+    // MultiGetBlob
+    read_options.fill_cache = true;
+    read_options.read_tier = ReadTier::kReadAllTier;
+
+    autovector<BlobFileReadRequests> blob_reqs;
+    std::array<autovector<BlobReadRequest>, blob_files> blob_reqs_in_file;
+    std::array<PinnableSlice, num_blobs * blob_files> value_buf;
+    std::array<Status, num_blobs * blob_files> statuses_buf;
+
+    for (size_t i = 0; i < blob_files; ++i) {
+      const uint64_t file_number = i + 1;
+      for (size_t j = 0; j < num_blobs; ++j) {
+        blob_reqs_in_file[i].emplace_back(
+            keys[j], blob_offsets[j], blob_sizes[j], kNoCompression,
+            &value_buf[i * num_blobs + j], &statuses_buf[i * num_blobs + j]);
+      }
+      blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file[i]);
+    }
+
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < blob_files; ++i) {
+      const uint64_t file_number = i + 1;
+      for (size_t j = 0; j < num_blobs; ++j) {
+        ASSERT_OK(statuses_buf[i * num_blobs + j]);
+        ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]);
+        ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                                 blob_offsets[j]));
+      }
+    }
+
+    // Retrieved all blobs from 2 blob files twice via MultiGetBlob and
+    // TEST_BlobInCache.
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count,
+              num_blobs * blob_files);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count,
+              num_blobs * blob_files);  // blocking i/o
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+              blob_records_bytes * blob_files);  // blocking i/o
+    ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS),
+              num_blobs * blob_files);  // MultiGetBlob
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT),
+              num_blobs * blob_files);  // TEST_BlobInCache
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD),
+              num_blobs * blob_files);  // MultiGetBlob
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+              blob_value_bytes * blob_files);  // TEST_BlobInCache
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+              blob_value_bytes * blob_files);  // MultiGetBlob
+
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    autovector<BlobReadRequest> fake_blob_reqs_in_file;
+    std::array<PinnableSlice, num_blobs> fake_value_buf;
+    std::array<Status, num_blobs> fake_statuses_buf;
+
+    const uint64_t fake_file_number = 100;
+    for (size_t i = 0; i < num_blobs; ++i) {
+      fake_blob_reqs_in_file.emplace_back(
+          keys[i], blob_offsets[i], blob_sizes[i], kNoCompression,
+          &fake_value_buf[i], &fake_statuses_buf[i]);
+    }
+
+    // Add a fake multi-get blob request.
+    blob_reqs.emplace_back(fake_file_number, file_size, fake_blob_reqs_in_file);
+
+    blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read);
+
+    // Check the real blob read requests.
+    for (size_t i = 0; i < blob_files; ++i) {
+      const uint64_t file_number = i + 1;
+      for (size_t j = 0; j < num_blobs; ++j) {
+        ASSERT_OK(statuses_buf[i * num_blobs + j]);
+        ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]);
+        ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                                 blob_offsets[j]));
+      }
+    }
+
+    // Check the fake blob request.
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_TRUE(fake_statuses_buf[i].IsIOError());
+      ASSERT_TRUE(fake_value_buf[i].empty());
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(fake_file_number, file_size,
+                                                blob_offsets[i]));
+    }
+
+    // Retrieved all blobs from 3 blob files (including the fake one) twice
+    // via MultiGetBlob and TEST_BlobInCache.
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count,
+              num_blobs * blob_files * 2);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count,
+              0);  // blocking i/o
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+              0);  // blocking i/o
+    ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+    // Fake blob requests: MultiGetBlob and TEST_BlobInCache
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+    // Real blob requests: MultiGetBlob and TEST_BlobInCache
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT),
+              num_blobs * blob_files * 2);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+    // Real blob requests: MultiGetBlob and TEST_BlobInCache
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+              blob_value_bytes * blob_files * 2);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+  }
+}
+
+TEST_F(BlobSourceTest, MultiGetBlobsFromCache) {
+  options_.cf_paths.emplace_back(
+      test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromCache"), 0);
+
+  options_.statistics = CreateDBStatistics();
+  Statistics* statistics = options_.statistics.get();
+  assert(statistics);
+
+  DestroyAndReopen(options_);
+
+  ImmutableOptions immutable_options(options_);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t blob_file_number = 1;
+  constexpr size_t num_blobs = 16;
+
+  std::vector<std::string> key_strs;
+  std::vector<std::string> blob_strs;
+
+  for (size_t i = 0; i < num_blobs; ++i) {
+    key_strs.push_back("key" + std::to_string(i));
+    blob_strs.push_back("blob" + std::to_string(i));
+  }
+
+  std::vector<Slice> keys;
+  std::vector<Slice> blobs;
+
+  uint64_t file_size = BlobLogHeader::kSize;
+  for (size_t i = 0; i < num_blobs; ++i) {
+    keys.push_back({key_strs[i]});
+    blobs.push_back({blob_strs[i]});
+    file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+  }
+  file_size += BlobLogFooter::kSize;
+
+  std::vector<uint64_t> blob_offsets(keys.size());
+  std::vector<uint64_t> blob_sizes(keys.size());
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, blob_file_number, keys, blobs, kNoCompression,
+                blob_offsets, blob_sizes);
+
+  constexpr size_t capacity = 10;
+  std::shared_ptr<Cache> backing_cache =
+      NewLRUCache(capacity);  // Blob file cache
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileCache> blob_file_cache =
+      std::make_unique<BlobFileCache>(
+          backing_cache.get(), &immutable_options, &file_options,
+          column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+  BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+                         blob_file_cache.get());
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+
+  {
+    // MultiGetBlobFromOneFile
+    uint64_t bytes_read = 0;
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<PinnableSlice, num_blobs> value_buf;
+    autovector<BlobReadRequest> blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; i += 2) {  // even index
+      blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+                             kNoCompression, &value_buf[i], &statuses_buf[i]);
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                blob_offsets[i]));
+    }
+
+    read_options.fill_cache = true;
+    read_options.read_tier = ReadTier::kReadAllTier;
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    // Get half of blobs
+    blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+                                        file_size, blob_reqs, &bytes_read);
+
+    uint64_t fs_read_bytes = 0;
+    uint64_t ca_read_bytes = 0;
+    for (size_t i = 0; i < num_blobs; ++i) {
+      if (i % 2 == 0) {
+        ASSERT_OK(statuses_buf[i]);
+        ASSERT_EQ(value_buf[i], blobs[i]);
+        ASSERT_TRUE(value_buf[i].IsPinned());
+        fs_read_bytes +=
+            blob_sizes[i] + keys[i].size() + BlobLogRecord::kHeaderSize;
+        ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                 blob_offsets[i]));
+        ca_read_bytes += blob_sizes[i];
+      } else {
+        statuses_buf[i].PermitUncheckedError();
+        ASSERT_TRUE(value_buf[i].empty());
+        ASSERT_FALSE(value_buf[i].IsPinned());
+        ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                  blob_offsets[i]));
+      }
+    }
+
+    constexpr int num_even_blobs = num_blobs / 2;
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_even_blobs);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count,
+              num_even_blobs);  // blocking i/o
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+              fs_read_bytes);  // blocking i/o
+    ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_even_blobs);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_even_blobs);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+              ca_read_bytes);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+              ca_read_bytes);
+
+    // Get the rest of blobs
+    for (size_t i = 1; i < num_blobs; i += 2) {  // odd index
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                blob_offsets[i]));
+
+      ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+                                    blob_offsets[i], file_size, blob_sizes[i],
+                                    kNoCompression, prefetch_buffer,
+                                    &value_buf[i], &bytes_read));
+      ASSERT_EQ(value_buf[i], blobs[i]);
+      ASSERT_TRUE(value_buf[i].IsPinned());
+      ASSERT_EQ(bytes_read,
+                BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                               blob_offsets[i]));
+    }
+
+    // Cache-only MultiGetBlobFromOneFile
+    read_options.read_tier = ReadTier::kBlockCacheTier;
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    blob_reqs.clear();
+    for (size_t i = 0; i < num_blobs; ++i) {
+      blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+                             kNoCompression, &value_buf[i], &statuses_buf[i]);
+    }
+
+    blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+                                        file_size, blob_reqs, &bytes_read);
+
+    uint64_t blob_bytes = 0;
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_OK(statuses_buf[i]);
+      ASSERT_EQ(value_buf[i], blobs[i]);
+      ASSERT_TRUE(value_buf[i].IsPinned());
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                               blob_offsets[i]));
+      blob_bytes += blob_sizes[i];
+    }
+
+    // Retrieved the blob cache num_blobs * 2 times via GetBlob and
+    // TEST_BlobInCache.
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 2);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);  // blocking i/o
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);   // blocking i/o
+    ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 2);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+              blob_bytes * 2);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+  }
+
+  options_.blob_cache->EraseUnRefEntries();
+
+  {
+    // Cache-only MultiGetBlobFromOneFile
+    uint64_t bytes_read = 0;
+    read_options.read_tier = ReadTier::kBlockCacheTier;
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<PinnableSlice, num_blobs> value_buf;
+    autovector<BlobReadRequest> blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; i++) {
+      blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+                             kNoCompression, &value_buf[i], &statuses_buf[i]);
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                blob_offsets[i]));
+    }
+
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+                                        file_size, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_TRUE(statuses_buf[i].IsIncomplete());
+      ASSERT_TRUE(value_buf[i].empty());
+      ASSERT_FALSE(value_buf[i].IsPinned());
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+                                                blob_offsets[i]));
+    }
+
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);  // blocking i/o
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);   // blocking i/o
+    ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+  }
+
+  {
+    // MultiGetBlobFromOneFile from non-existing file
+    uint64_t bytes_read = 0;
+    uint64_t non_existing_file_number = 100;
+    read_options.read_tier = ReadTier::kReadAllTier;
+
+    std::array<Status, num_blobs> statuses_buf;
+    std::array<PinnableSlice, num_blobs> value_buf;
+    autovector<BlobReadRequest> blob_reqs;
+
+    for (size_t i = 0; i < num_blobs; i++) {
+      blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+                             kNoCompression, &value_buf[i], &statuses_buf[i]);
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number,
+                                                file_size, blob_offsets[i]));
+    }
+
+    get_perf_context()->Reset();
+    statistics->Reset().PermitUncheckedError();
+
+    blob_source.MultiGetBlobFromOneFile(read_options, non_existing_file_number,
+                                        file_size, blob_reqs, &bytes_read);
+
+    for (size_t i = 0; i < num_blobs; ++i) {
+      ASSERT_TRUE(statuses_buf[i].IsIOError());
+      ASSERT_TRUE(value_buf[i].empty());
+      ASSERT_FALSE(value_buf[i].IsPinned());
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number,
+                                                file_size, blob_offsets[i]));
+    }
+
+    ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);  // blocking i/o
+    ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);   // blocking i/o
+    ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0);
+    ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+    ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+  }
+}
+
+class BlobSecondaryCacheTest : public DBTestBase {
+ protected:
+ public:
+  explicit BlobSecondaryCacheTest()
+      : DBTestBase("blob_secondary_cache_test", /*env_do_fsync=*/true) {
+    options_.env = env_;
+    options_.enable_blob_files = true;
+    options_.create_if_missing = true;
+
+    // Set a small cache capacity to evict entries from the cache, and to test
+    // that secondary cache is used properly.
+    lru_cache_opts_.capacity = 1024;
+    lru_cache_opts_.num_shard_bits = 0;
+    lru_cache_opts_.strict_capacity_limit = true;
+    lru_cache_opts_.metadata_charge_policy = kDontChargeCacheMetadata;
+    lru_cache_opts_.high_pri_pool_ratio = 0.2;
+    lru_cache_opts_.low_pri_pool_ratio = 0.2;
+
+    secondary_cache_opts_.capacity = 8 << 20;  // 8 MB
+    secondary_cache_opts_.num_shard_bits = 0;
+    secondary_cache_opts_.metadata_charge_policy =
+        kDefaultCacheMetadataChargePolicy;
+
+    // Read blobs from the secondary cache if they are not in the primary cache
+    options_.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+    assert(db_->GetDbIdentity(db_id_).ok());
+    assert(db_->GetDbSessionId(db_session_id_).ok());
+  }
+
+  Options options_;
+
+  LRUCacheOptions lru_cache_opts_;
+  CompressedSecondaryCacheOptions secondary_cache_opts_;
+
+  std::string db_id_;
+  std::string db_session_id_;
+};
+
+TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  secondary_cache_opts_.compression_type = kSnappyCompression;
+  lru_cache_opts_.secondary_cache =
+      NewCompressedSecondaryCache(secondary_cache_opts_);
+  options_.blob_cache = NewLRUCache(lru_cache_opts_);
+
+  options_.cf_paths.emplace_back(
+      test::PerThreadDBPath(
+          env_, "BlobSecondaryCacheTest_GetBlobsFromSecondaryCache"),
+      0);
+
+  options_.statistics = CreateDBStatistics();
+  Statistics* statistics = options_.statistics.get();
+  assert(statistics);
+
+  DestroyAndReopen(options_);
+
+  ImmutableOptions immutable_options(options_);
+
+  constexpr uint32_t column_family_id = 1;
+  constexpr bool has_ttl = false;
+  constexpr ExpirationRange expiration_range;
+  constexpr uint64_t file_number = 1;
+
+  Random rnd(301);
+
+  std::vector<std::string> key_strs{"key0", "key1"};
+  std::vector<std::string> blob_strs{rnd.RandomString(512),
+                                     rnd.RandomString(768)};
+
+  std::vector<Slice> keys{key_strs[0], key_strs[1]};
+  std::vector<Slice> blobs{blob_strs[0], blob_strs[1]};
+
+  std::vector<uint64_t> blob_offsets(keys.size());
+  std::vector<uint64_t> blob_sizes(keys.size());
+
+  WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+                expiration_range, file_number, keys, blobs, kNoCompression,
+                blob_offsets, blob_sizes);
+
+  constexpr size_t capacity = 1024;
+  std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileCache> blob_file_cache(new BlobFileCache(
+      backing_cache.get(), &immutable_options, &file_options, column_family_id,
+      blob_file_read_hist, nullptr /*IOTracer*/));
+
+  BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+                         blob_file_cache.get());
+
+  CacheHandleGuard<BlobFileReader> file_reader;
+  ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader));
+  ASSERT_NE(file_reader.GetValue(), nullptr);
+  const uint64_t file_size = file_reader.GetValue()->GetFileSize();
+  ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression);
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+
+  auto blob_cache = options_.blob_cache;
+  auto secondary_cache = lru_cache_opts_.secondary_cache;
+
+  Cache::CreateCallback create_cb = [](const void* buf, size_t size,
+                                       void** out_obj,
+                                       size_t* charge) -> Status {
+    CacheAllocationPtr allocation(new char[size]);
+
+    return BlobContents::CreateCallback(std::move(allocation), buf, size,
+                                        out_obj, charge);
+  };
+
+  {
+    // GetBlob
+    std::vector<PinnableSlice> values(keys.size());
+
+    read_options.fill_cache = true;
+    get_perf_context()->Reset();
+
+    // key0 should be filled to the primary cache from the blob file.
+    ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number,
+                                  blob_offsets[0], file_size, blob_sizes[0],
+                                  kNoCompression, nullptr /* prefetch_buffer */,
+                                  &values[0], nullptr /* bytes_read */));
+    // Release cache handle
+    values[0].Reset();
+
+    // key0 should be evicted and key0's dummy item is inserted into secondary
+    // cache. key1 should be filled to the primary cache from the blob file.
+    ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number,
+                                  blob_offsets[1], file_size, blob_sizes[1],
+                                  kNoCompression, nullptr /* prefetch_buffer */,
+                                  &values[1], nullptr /* bytes_read */));
+
+    // Release cache handle
+    values[1].Reset();
+
+    // key0 should be filled to the primary cache from the blob file. key1
+    // should be evicted and key1's dummy item is inserted into secondary cache.
+    ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number,
+                                  blob_offsets[0], file_size, blob_sizes[0],
+                                  kNoCompression, nullptr /* prefetch_buffer */,
+                                  &values[0], nullptr /* bytes_read */));
+    ASSERT_EQ(values[0], blobs[0]);
+    ASSERT_TRUE(
+        blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[0]));
+
+    // Release cache handle
+    values[0].Reset();
+
+    // key0 should be evicted and is inserted into secondary cache.
+    // key1 should be filled to the primary cache from the blob file.
+    ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number,
+                                  blob_offsets[1], file_size, blob_sizes[1],
+                                  kNoCompression, nullptr /* prefetch_buffer */,
+                                  &values[1], nullptr /* bytes_read */));
+    ASSERT_EQ(values[1], blobs[1]);
+    ASSERT_TRUE(
+        blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[1]));
+
+    // Release cache handle
+    values[1].Reset();
+
+    OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+
+    // blob_cache here only looks at the primary cache since we didn't provide
+    // the cache item helper for the secondary cache. However, since key0 is
+    // demoted to the secondary cache, we shouldn't be able to find it in the
+    // primary cache.
+    {
+      CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]);
+      const Slice key0 = cache_key.AsSlice();
+      auto handle0 = blob_cache->Lookup(key0, statistics);
+      ASSERT_EQ(handle0, nullptr);
+
+      // key0's item should be in the secondary cache.
+      bool is_in_sec_cache = false;
+      auto sec_handle0 =
+          secondary_cache->Lookup(key0, create_cb, true,
+                                  /*advise_erase=*/true, is_in_sec_cache);
+      ASSERT_FALSE(is_in_sec_cache);
+      ASSERT_NE(sec_handle0, nullptr);
+      ASSERT_TRUE(sec_handle0->IsReady());
+      auto value = static_cast<BlobContents*>(sec_handle0->Value());
+      ASSERT_NE(value, nullptr);
+      ASSERT_EQ(value->data(), blobs[0]);
+      delete value;
+
+      // key0 doesn't exist in the blob cache although key0's dummy
+      // item exist in the secondary cache.
+      ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                                blob_offsets[0]));
+    }
+
+    // key1 should exists in the primary cache. key1's dummy item exists
+    // in the secondary cache.
+    {
+      CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]);
+      const Slice key1 = cache_key.AsSlice();
+      auto handle1 = blob_cache->Lookup(key1, statistics);
+      ASSERT_NE(handle1, nullptr);
+      blob_cache->Release(handle1);
+
+      bool is_in_sec_cache = false;
+      auto sec_handle1 =
+          secondary_cache->Lookup(key1, create_cb, true,
+                                  /*advise_erase=*/true, is_in_sec_cache);
+      ASSERT_FALSE(is_in_sec_cache);
+      ASSERT_EQ(sec_handle1, nullptr);
+
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                               blob_offsets[1]));
+    }
+
+    {
+      // fetch key0 from the blob file to the primary cache.
+      // key1 is evicted and inserted into the secondary cache.
+      ASSERT_OK(blob_source.GetBlob(
+          read_options, keys[0], file_number, blob_offsets[0], file_size,
+          blob_sizes[0], kNoCompression, nullptr /* prefetch_buffer */,
+          &values[0], nullptr /* bytes_read */));
+      ASSERT_EQ(values[0], blobs[0]);
+
+      // Release cache handle
+      values[0].Reset();
+
+      // key0 should be in the primary cache.
+      CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]);
+      const Slice key0 = cache_key0.AsSlice();
+      auto handle0 = blob_cache->Lookup(key0, statistics);
+      ASSERT_NE(handle0, nullptr);
+      auto value = static_cast<BlobContents*>(blob_cache->Value(handle0));
+      ASSERT_NE(value, nullptr);
+      ASSERT_EQ(value->data(), blobs[0]);
+      blob_cache->Release(handle0);
+
+      // key1 is not in the primary cache and is in the secondary cache.
+      CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]);
+      const Slice key1 = cache_key1.AsSlice();
+      auto handle1 = blob_cache->Lookup(key1, statistics);
+      ASSERT_EQ(handle1, nullptr);
+
+      // erase key0 from the primary cache.
+      blob_cache->Erase(key0);
+      handle0 = blob_cache->Lookup(key0, statistics);
+      ASSERT_EQ(handle0, nullptr);
+
+      // key1 promotion should succeed due to the primary cache being empty. we
+      // did't call secondary cache's Lookup() here, because it will remove the
+      // key but it won't be able to promote the key to the primary cache.
+      // Instead we use the end-to-end blob source API to read key1.
+      // In function TEST_BlobInCache, key1's dummy item is inserted into the
+      // primary cache and a standalone handle is checked by GetValue().
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                               blob_offsets[1]));
+
+      // key1's dummy handle is in the primary cache and key1's item is still
+      // in the secondary cache. So, the primary cache's Lookup() without
+      // secondary cache support cannot see it. (NOTE: The dummy handle used
+      // to be a leaky abstraction but not anymore.)
+      handle1 = blob_cache->Lookup(key1, statistics);
+      ASSERT_EQ(handle1, nullptr);
+
+      // But after another access, it is promoted to primary cache
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+                                               blob_offsets[1]));
+
+      // And Lookup() can find it (without secondary cache support)
+      handle1 = blob_cache->Lookup(key1, statistics);
+      ASSERT_NE(handle1, nullptr);
+      ASSERT_NE(blob_cache->Value(handle1), nullptr);
+      blob_cache->Release(handle1);
+    }
+  }
+}
+
+class BlobSourceCacheReservationTest : public DBTestBase {
+ public:
+  explicit BlobSourceCacheReservationTest()
+      : DBTestBase("blob_source_cache_reservation_test",
+                   /*env_do_fsync=*/true) {
+    options_.env = env_;
+    options_.enable_blob_files = true;
+    options_.create_if_missing = true;
+
+    LRUCacheOptions co;
+    co.capacity = kCacheCapacity;
+    co.num_shard_bits = kNumShardBits;
+    co.metadata_charge_policy = kDontChargeCacheMetadata;
+
+    co.high_pri_pool_ratio = 0.0;
+    co.low_pri_pool_ratio = 0.0;
+    std::shared_ptr<Cache> blob_cache = NewLRUCache(co);
+
+    co.high_pri_pool_ratio = 0.5;
+    co.low_pri_pool_ratio = 0.5;
+    std::shared_ptr<Cache> block_cache = NewLRUCache(co);
+
+    options_.blob_cache = blob_cache;
+    options_.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+    BlockBasedTableOptions block_based_options;
+    block_based_options.no_block_cache = false;
+    block_based_options.block_cache = block_cache;
+    block_based_options.cache_usage_options.options_overrides.insert(
+        {CacheEntryRole::kBlobCache,
+         {/* charged = */ CacheEntryRoleOptions::Decision::kEnabled}});
+    options_.table_factory.reset(
+        NewBlockBasedTableFactory(block_based_options));
+
+    assert(db_->GetDbIdentity(db_id_).ok());
+    assert(db_->GetDbSessionId(db_session_id_).ok());
+  }
+
+  void GenerateKeysAndBlobs() {
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      key_strs_.push_back("key" + std::to_string(i));
+      blob_strs_.push_back("blob" + std::to_string(i));
+    }
+
+    blob_file_size_ = BlobLogHeader::kSize;
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      keys_.push_back({key_strs_[i]});
+      blobs_.push_back({blob_strs_[i]});
+      blob_file_size_ +=
+          BlobLogRecord::kHeaderSize + keys_[i].size() + blobs_[i].size();
+    }
+    blob_file_size_ += BlobLogFooter::kSize;
+  }
+
+  static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl<
+      CacheEntryRole::kBlobCache>::GetDummyEntrySize();
+  static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry;
+  static constexpr int kNumShardBits = 0;  // 2^0 shard
+
+  static constexpr uint32_t kColumnFamilyId = 1;
+  static constexpr bool kHasTTL = false;
+  static constexpr uint64_t kBlobFileNumber = 1;
+  static constexpr size_t kNumBlobs = 16;
+
+  std::vector<Slice> keys_;
+  std::vector<Slice> blobs_;
+  std::vector<std::string> key_strs_;
+  std::vector<std::string> blob_strs_;
+  uint64_t blob_file_size_;
+
+  Options options_;
+  std::string db_id_;
+  std::string db_session_id_;
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
+  options_.cf_paths.emplace_back(
+      test::PerThreadDBPath(
+          env_, "BlobSourceCacheReservationTest_SimpleCacheReservation"),
+      0);
+
+  GenerateKeysAndBlobs();
+
+  DestroyAndReopen(options_);
+
+  ImmutableOptions immutable_options(options_);
+
+  constexpr ExpirationRange expiration_range;
+
+  std::vector<uint64_t> blob_offsets(keys_.size());
+  std::vector<uint64_t> blob_sizes(keys_.size());
+
+  WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range,
+                expiration_range, kBlobFileNumber, keys_, blobs_,
+                kNoCompression, blob_offsets, blob_sizes);
+
+  constexpr size_t capacity = 10;
+  std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileCache> blob_file_cache =
+      std::make_unique<BlobFileCache>(
+          backing_cache.get(), &immutable_options, &file_options,
+          kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
+
+  BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+                         blob_file_cache.get());
+
+  ConcurrentCacheReservationManager* cache_res_mgr =
+      static_cast<ChargedCache*>(blob_source.GetBlobCache())
+          ->TEST_GetCacheReservationManager();
+  ASSERT_NE(cache_res_mgr, nullptr);
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+
+  {
+    read_options.fill_cache = false;
+
+    std::vector<PinnableSlice> values(keys_.size());
+
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      ASSERT_OK(blob_source.GetBlob(
+          read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+          blob_file_size_, blob_sizes[i], kNoCompression,
+          nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+      ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0);
+    }
+  }
+
+  {
+    read_options.fill_cache = true;
+
+    std::vector<PinnableSlice> values(keys_.size());
+
+    // num_blobs is 16, so the total blob cache usage is less than a single
+    // dummy entry. Therefore, cache reservation manager only reserves one dummy
+    // entry here.
+    uint64_t blob_bytes = 0;
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      ASSERT_OK(blob_source.GetBlob(
+          read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+          blob_file_size_, blob_sizes[i], kNoCompression,
+          nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+
+      size_t charge = 0;
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+                                               blob_offsets[i], &charge));
+
+      blob_bytes += charge;
+      ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+                options_.blob_cache->GetUsage());
+    }
+  }
+
+  {
+    OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber);
+    size_t blob_bytes = options_.blob_cache->GetUsage();
+
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      size_t charge = 0;
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+                                               blob_offsets[i], &charge));
+
+      CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[i]);
+      // We didn't call options_.blob_cache->Erase() here, this is because
+      // the cache wrapper's Erase() method must be called to update the
+      // cache usage after erasing the cache entry.
+      blob_source.GetBlobCache()->Erase(cache_key.AsSlice());
+      if (i == kNumBlobs - 1) {
+        // All the blobs got removed from the cache. cache_res_mgr should not
+        // reserve any space for them.
+        ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+      } else {
+        ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+      }
+      blob_bytes -= charge;
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+                options_.blob_cache->GetUsage());
+    }
+  }
+}
+
+TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
+  options_.cf_paths.emplace_back(
+      test::PerThreadDBPath(
+          env_,
+          "BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"),
+      0);
+
+  GenerateKeysAndBlobs();
+
+  DestroyAndReopen(options_);
+
+  ImmutableOptions immutable_options(options_);
+  constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2);
+  for (size_t i = 0; i < kNumBlobs; ++i) {
+    blob_file_size_ -= blobs_[i].size();  // old blob size
+    blob_strs_[i].resize(blob_size, '@');
+    blobs_[i] = Slice(blob_strs_[i]);
+    blob_file_size_ += blobs_[i].size();  // new blob size
+  }
+
+  std::vector<uint64_t> blob_offsets(keys_.size());
+  std::vector<uint64_t> blob_sizes(keys_.size());
+
+  constexpr ExpirationRange expiration_range;
+  WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range,
+                expiration_range, kBlobFileNumber, keys_, blobs_,
+                kNoCompression, blob_offsets, blob_sizes);
+
+  constexpr size_t capacity = 10;
+  std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+  FileOptions file_options;
+  constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+  std::unique_ptr<BlobFileCache> blob_file_cache =
+      std::make_unique<BlobFileCache>(
+          backing_cache.get(), &immutable_options, &file_options,
+          kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
+
+  BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+                         blob_file_cache.get());
+
+  ConcurrentCacheReservationManager* cache_res_mgr =
+      static_cast<ChargedCache*>(blob_source.GetBlobCache())
+          ->TEST_GetCacheReservationManager();
+  ASSERT_NE(cache_res_mgr, nullptr);
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+
+  {
+    read_options.fill_cache = false;
+
+    std::vector<PinnableSlice> values(keys_.size());
+
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      ASSERT_OK(blob_source.GetBlob(
+          read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+          blob_file_size_, blob_sizes[i], kNoCompression,
+          nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+      ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0);
+    }
+  }
+
+  {
+    read_options.fill_cache = true;
+
+    std::vector<PinnableSlice> values(keys_.size());
+
+    // Since we resized each blob to be kSizeDummyEntry / (num_blobs / 2), we
+    // can't fit all the blobs in the cache at the same time, which means we
+    // should observe cache evictions once we reach the cache's capacity.
+    // Due to the overhead of the cache and the BlobContents objects, as well as
+    // jemalloc bin sizes, this happens after inserting seven blobs.
+    uint64_t blob_bytes = 0;
+    for (size_t i = 0; i < kNumBlobs; ++i) {
+      ASSERT_OK(blob_source.GetBlob(
+          read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+          blob_file_size_, blob_sizes[i], kNoCompression,
+          nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+
+      // Release cache handle
+      values[i].Reset();
+
+      if (i < kNumBlobs / 2 - 1) {
+        size_t charge = 0;
+        ASSERT_TRUE(blob_source.TEST_BlobInCache(
+            kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge));
+
+        blob_bytes += charge;
+      }
+
+      ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+                options_.blob_cache->GetUsage());
+    }
+  }
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_basic_test.cc b/src/rocksdb/db/blob/db_blob_basic_test.cc
new file mode 100644
index 000000000..e6832a2ae
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_basic_test.cc
@@ -0,0 +1,1789 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <array>
+#include <sstream>
+#include <string>
+
+#include "cache/compressed_secondary_cache.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobBasicTest : public DBTestBase {
+ protected:
+  DBBlobBasicTest()
+      : DBTestBase("db_blob_basic_test", /* env_do_fsync */ false) {}
+};
+
+TEST_F(DBBlobBasicTest, GetBlob) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr char key[] = "key";
+  constexpr char blob_value[] = "blob_value";
+
+  ASSERT_OK(Put(key, blob_value));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get(key), blob_value);
+
+  // Try again with no I/O allowed. The table and the necessary blocks should
+  // already be in their respective caches; however, the blob itself can only be
+  // read from the blob file, so the read should return Incomplete.
+  ReadOptions read_options;
+  read_options.read_tier = kBlockCacheTier;
+
+  PinnableSlice result;
+  ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)
+                  .IsIncomplete());
+}
+
+TEST_F(DBBlobBasicTest, GetBlobFromCache) {
+  Options options = GetDefaultOptions();
+
+  LRUCacheOptions co;
+  co.capacity = 2 << 20;  // 2MB
+  co.num_shard_bits = 2;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto backing_cache = NewLRUCache(co);
+
+  options.enable_blob_files = true;
+  options.blob_cache = backing_cache;
+
+  BlockBasedTableOptions block_based_options;
+  block_based_options.no_block_cache = false;
+  block_based_options.block_cache = backing_cache;
+  block_based_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+  Reopen(options);
+
+  constexpr char key[] = "key";
+  constexpr char blob_value[] = "blob_value";
+
+  ASSERT_OK(Put(key, blob_value));
+
+  ASSERT_OK(Flush());
+
+  ReadOptions read_options;
+
+  read_options.fill_cache = false;
+
+  {
+    PinnableSlice result;
+
+    read_options.read_tier = kReadAllTier;
+    ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+    ASSERT_EQ(result, blob_value);
+
+    result.Reset();
+    read_options.read_tier = kBlockCacheTier;
+
+    // Try again with no I/O allowed. Since we didn't re-fill the cache, the
+    // blob itself can only be read from the blob file, so the read should
+    // return Incomplete.
+    ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)
+                    .IsIncomplete());
+    ASSERT_TRUE(result.empty());
+  }
+
+  read_options.fill_cache = true;
+
+  {
+    PinnableSlice result;
+
+    read_options.read_tier = kReadAllTier;
+    ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+    ASSERT_EQ(result, blob_value);
+
+    result.Reset();
+    read_options.read_tier = kBlockCacheTier;
+
+    // Try again with no I/O allowed. The table and the necessary blocks/blobs
+    // should already be in their respective caches.
+    ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+    ASSERT_EQ(result, blob_value);
+  }
+}
+
+TEST_F(DBBlobBasicTest, IterateBlobsFromCache) {
+  Options options = GetDefaultOptions();
+
+  LRUCacheOptions co;
+  co.capacity = 2 << 20;  // 2MB
+  co.num_shard_bits = 2;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto backing_cache = NewLRUCache(co);
+
+  options.enable_blob_files = true;
+  options.blob_cache = backing_cache;
+
+  BlockBasedTableOptions block_based_options;
+  block_based_options.no_block_cache = false;
+  block_based_options.block_cache = backing_cache;
+  block_based_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+  options.statistics = CreateDBStatistics();
+
+  Reopen(options);
+
+  int num_blobs = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> blobs;
+
+  for (int i = 0; i < num_blobs; ++i) {
+    keys.push_back("key" + std::to_string(i));
+    blobs.push_back("blob" + std::to_string(i));
+    ASSERT_OK(Put(keys[i], blobs[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ReadOptions read_options;
+
+  {
+    read_options.fill_cache = false;
+    read_options.read_tier = kReadAllTier;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    int i = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key().ToString(), keys[i]);
+      ASSERT_EQ(iter->value().ToString(), blobs[i]);
+      ++i;
+    }
+    ASSERT_EQ(i, num_blobs);
+    ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+  }
+
+  {
+    read_options.fill_cache = false;
+    read_options.read_tier = kBlockCacheTier;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    // Try again with no I/O allowed. Since we didn't re-fill the cache,
+    // the blob itself can only be read from the blob file, so iter->Valid()
+    // should be false.
+    iter->SeekToFirst();
+    ASSERT_NOK(iter->status());
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+  }
+
+  {
+    read_options.fill_cache = true;
+    read_options.read_tier = kReadAllTier;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    // Read blobs from the file and refill the cache.
+    int i = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key().ToString(), keys[i]);
+      ASSERT_EQ(iter->value().ToString(), blobs[i]);
+      ++i;
+    }
+    ASSERT_EQ(i, num_blobs);
+    ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD),
+              num_blobs);
+  }
+
+  {
+    read_options.fill_cache = false;
+    read_options.read_tier = kBlockCacheTier;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    // Try again with no I/O allowed. The table and the necessary blocks/blobs
+    // should already be in their respective caches.
+    int i = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key().ToString(), keys[i]);
+      ASSERT_EQ(iter->value().ToString(), blobs[i]);
+      ++i;
+    }
+    ASSERT_EQ(i, num_blobs);
+    ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+  }
+}
+
+TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) {
+  constexpr size_t min_blob_size = 6;
+
+  Options options = GetDefaultOptions();
+
+  LRUCacheOptions cache_options;
+  cache_options.capacity = 2048;
+  cache_options.num_shard_bits = 0;
+  cache_options.metadata_charge_policy = kDontChargeCacheMetadata;
+
+  options.blob_cache = NewLRUCache(cache_options);
+  options.enable_blob_files = true;
+  options.min_blob_size = min_blob_size;
+
+  Reopen(options);
+
+  // Put then iterate over three key-values. The second value is below the size
+  // limit and is thus stored inline; the other two are stored separately as
+  // blobs. We expect to have something pinned in the cache iff we are
+  // positioned on a blob.
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "long_value";
+  static_assert(sizeof(first_value) - 1 >= min_blob_size,
+                "first_value too short to be stored as blob");
+
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "short";
+  static_assert(sizeof(second_value) - 1 < min_blob_size,
+                "second_value too long to be inlined");
+
+  ASSERT_OK(Put(second_key, second_value));
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "other_long_value";
+  static_assert(sizeof(third_value) - 1 >= min_blob_size,
+                "third_value too short to be stored as blob");
+
+  ASSERT_OK(Put(third_key, third_value));
+
+  ASSERT_OK(Flush());
+
+  {
+    ReadOptions read_options;
+    read_options.fill_cache = true;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key(), first_key);
+    ASSERT_EQ(iter->value(), first_value);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key(), second_key);
+    ASSERT_EQ(iter->value(), second_value);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key(), third_key);
+    ASSERT_EQ(iter->value(), third_value);
+
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+
+  {
+    ReadOptions read_options;
+    read_options.fill_cache = false;
+    read_options.read_tier = kBlockCacheTier;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key(), first_key);
+    ASSERT_EQ(iter->value(), first_value);
+    ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key(), second_key);
+    ASSERT_EQ(iter->value(), second_value);
+    ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key(), third_key);
+    ASSERT_EQ(iter->value(), third_value);
+    ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+  }
+
+  {
+    ReadOptions read_options;
+    read_options.fill_cache = false;
+    read_options.read_tier = kBlockCacheTier;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key(), third_key);
+    ASSERT_EQ(iter->value(), third_value);
+    ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key(), second_key);
+    ASSERT_EQ(iter->value(), second_value);
+    ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key(), first_key);
+    ASSERT_EQ(iter->value(), first_value);
+    ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+    iter->Prev();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+  }
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlobs) {
+  constexpr size_t min_blob_size = 6;
+
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = min_blob_size;
+
+  Reopen(options);
+
+  // Put then retrieve three key-values. The first value is below the size limit
+  // and is thus stored inline; the other two are stored separately as blobs.
+  constexpr size_t num_keys = 3;
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "short";
+  static_assert(sizeof(first_value) - 1 < min_blob_size,
+                "first_value too long to be inlined");
+
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "long_value";
+  static_assert(sizeof(second_value) - 1 >= min_blob_size,
+                "second_value too short to be stored as blob");
+
+  ASSERT_OK(Put(second_key, second_value));
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "other_long_value";
+  static_assert(sizeof(third_value) - 1 >= min_blob_size,
+                "third_value too short to be stored as blob");
+
+  ASSERT_OK(Put(third_key, third_value));
+
+  ASSERT_OK(Flush());
+
+  ReadOptions read_options;
+
+  std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+
+  {
+    std::array<PinnableSlice, num_keys> values;
+    std::array<Status, num_keys> statuses;
+
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    ASSERT_OK(statuses[0]);
+    ASSERT_EQ(values[0], first_value);
+
+    ASSERT_OK(statuses[1]);
+    ASSERT_EQ(values[1], second_value);
+
+    ASSERT_OK(statuses[2]);
+    ASSERT_EQ(values[2], third_value);
+  }
+
+  // Try again with no I/O allowed. The table and the necessary blocks should
+  // already be in their respective caches. The first (inlined) value should be
+  // successfully read; however, the two blob values could only be read from the
+  // blob file, so for those the read should return Incomplete.
+  read_options.read_tier = kBlockCacheTier;
+
+  {
+    std::array<PinnableSlice, num_keys> values;
+    std::array<Status, num_keys> statuses;
+
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    ASSERT_OK(statuses[0]);
+    ASSERT_EQ(values[0], first_value);
+
+    ASSERT_TRUE(statuses[1].IsIncomplete());
+
+    ASSERT_TRUE(statuses[2].IsIncomplete());
+  }
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) {
+  Options options = GetDefaultOptions();
+
+  LRUCacheOptions co;
+  co.capacity = 2 << 20;  // 2MB
+  co.num_shard_bits = 2;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto backing_cache = NewLRUCache(co);
+
+  constexpr size_t min_blob_size = 6;
+  options.min_blob_size = min_blob_size;
+  options.create_if_missing = true;
+  options.enable_blob_files = true;
+  options.blob_cache = backing_cache;
+
+  BlockBasedTableOptions block_based_options;
+  block_based_options.no_block_cache = false;
+  block_based_options.block_cache = backing_cache;
+  block_based_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+  DestroyAndReopen(options);
+
+  // Put then retrieve three key-values. The first value is below the size limit
+  // and is thus stored inline; the other two are stored separately as blobs.
+  constexpr size_t num_keys = 3;
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "short";
+  static_assert(sizeof(first_value) - 1 < min_blob_size,
+                "first_value too long to be inlined");
+
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "long_value";
+  static_assert(sizeof(second_value) - 1 >= min_blob_size,
+                "second_value too short to be stored as blob");
+
+  ASSERT_OK(Put(second_key, second_value));
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "other_long_value";
+  static_assert(sizeof(third_value) - 1 >= min_blob_size,
+                "third_value too short to be stored as blob");
+
+  ASSERT_OK(Put(third_key, third_value));
+
+  ASSERT_OK(Flush());
+
+  ReadOptions read_options;
+  read_options.fill_cache = false;
+
+  std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+
+  {
+    std::array<PinnableSlice, num_keys> values;
+    std::array<Status, num_keys> statuses;
+
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    ASSERT_OK(statuses[0]);
+    ASSERT_EQ(values[0], first_value);
+
+    ASSERT_OK(statuses[1]);
+    ASSERT_EQ(values[1], second_value);
+
+    ASSERT_OK(statuses[2]);
+    ASSERT_EQ(values[2], third_value);
+  }
+
+  // Try again with no I/O allowed. The first (inlined) value should be
+  // successfully read; however, the two blob values could only be read from the
+  // blob file, so for those the read should return Incomplete.
+  read_options.read_tier = kBlockCacheTier;
+
+  {
+    std::array<PinnableSlice, num_keys> values;
+    std::array<Status, num_keys> statuses;
+
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    ASSERT_OK(statuses[0]);
+    ASSERT_EQ(values[0], first_value);
+
+    ASSERT_TRUE(statuses[1].IsIncomplete());
+
+    ASSERT_TRUE(statuses[2].IsIncomplete());
+  }
+
+  // Fill the cache when reading blobs from the blob file.
+  read_options.read_tier = kReadAllTier;
+  read_options.fill_cache = true;
+
+  {
+    std::array<PinnableSlice, num_keys> values;
+    std::array<Status, num_keys> statuses;
+
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    ASSERT_OK(statuses[0]);
+    ASSERT_EQ(values[0], first_value);
+
+    ASSERT_OK(statuses[1]);
+    ASSERT_EQ(values[1], second_value);
+
+    ASSERT_OK(statuses[2]);
+    ASSERT_EQ(values[2], third_value);
+  }
+
+  // Try again with no I/O allowed. All blobs should be successfully read from
+  // the cache.
+  read_options.read_tier = kBlockCacheTier;
+
+  {
+    std::array<PinnableSlice, num_keys> values;
+    std::array<Status, num_keys> statuses;
+
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    ASSERT_OK(statuses[0]);
+    ASSERT_EQ(values[0], first_value);
+
+    ASSERT_OK(statuses[1]);
+    ASSERT_EQ(values[1], second_value);
+
+    ASSERT_OK(statuses[2]);
+    ASSERT_EQ(values[2], third_value);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
+  Options options = GetDefaultOptions();
+
+  // First, create an external SST file ["b"].
+  const std::string file_path = dbname_ + "/test.sst";
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), GetDefaultOptions());
+    Status s = sst_file_writer.Open(file_path);
+    ASSERT_OK(s);
+    ASSERT_OK(sst_file_writer.Put("b", "b_value"));
+    ASSERT_OK(sst_file_writer.Finish());
+  }
+
+  options.enable_blob_files = true;
+  options.min_blob_size = 1000;
+  options.use_direct_reads = true;
+  options.allow_ingest_behind = true;
+
+  // Open DB with fixed-prefix sst-partitioner so that compaction will cut
+  // new table file when encountering a new key whose 1-byte prefix changes.
+  constexpr size_t key_len = 1;
+  options.sst_partitioner_factory =
+      NewSstPartitionerFixedPrefixFactory(key_len);
+
+  Status s = TryReopen(options);
+  if (s.IsInvalidArgument()) {
+    ROCKSDB_GTEST_SKIP("This test requires direct IO support");
+    return;
+  }
+  ASSERT_OK(s);
+
+  constexpr size_t num_keys = 3;
+  constexpr size_t blob_size = 3000;
+
+  constexpr char first_key[] = "a";
+  const std::string first_blob(blob_size, 'a');
+  ASSERT_OK(Put(first_key, first_blob));
+
+  constexpr char second_key[] = "b";
+  const std::string second_blob(2 * blob_size, 'b');
+  ASSERT_OK(Put(second_key, second_blob));
+
+  constexpr char third_key[] = "d";
+  const std::string third_blob(blob_size, 'd');
+  ASSERT_OK(Put(third_key, third_blob));
+
+  // first_blob, second_blob and third_blob in the same blob file.
+  //      SST                    Blob file
+  // L0  ["a",    "b",    "d"]   |'aaaa', 'bbbb', 'dddd'|
+  //       |       |       |         ^       ^        ^
+  //       |       |       |         |       |        |
+  //       |       |       +---------|-------|--------+
+  //       |       +-----------------|-------+
+  //       +-------------------------+
+  ASSERT_OK(Flush());
+
+  constexpr char fourth_key[] = "c";
+  const std::string fourth_blob(blob_size, 'c');
+  ASSERT_OK(Put(fourth_key, fourth_blob));
+  // fourth_blob in another blob file.
+  //      SST                    Blob file                 SST     Blob file
+  // L0  ["a",    "b",    "d"]   |'aaaa', 'bbbb', 'dddd'|  ["c"]   |'cccc'|
+  //       |       |       |         ^       ^        ^      |       ^
+  //       |       |       |         |       |        |      |       |
+  //       |       |       +---------|-------|--------+      +-------+
+  //       |       +-----------------|-------+
+  //       +-------------------------+
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+
+  // Due to the above sst partitioner, we get 4 L1 files. The blob files are
+  // unchanged.
+  //                             |'aaaa', 'bbbb', 'dddd'|  |'cccc'|
+  //                                 ^       ^     ^         ^
+  //                                 |       |     |         |
+  // L0                              |       |     |         |
+  // L1  ["a"]   ["b"]   ["c"]       |       |   ["d"]       |
+  //       |       |       |         |       |               |
+  //       |       |       +---------|-------|---------------+
+  //       |       +-----------------|-------+
+  //       +-------------------------+
+  ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+
+  {
+    // Ingest the external SST file into bottommost level.
+    std::vector<std::string> ext_files{file_path};
+    IngestExternalFileOptions opts;
+    opts.ingest_behind = true;
+    ASSERT_OK(
+        db_->IngestExternalFile(db_->DefaultColumnFamily(), ext_files, opts));
+  }
+
+  // Now the database becomes as follows.
+  //                             |'aaaa', 'bbbb', 'dddd'|  |'cccc'|
+  //                                 ^       ^     ^         ^
+  //                                 |       |     |         |
+  // L0                              |       |     |         |
+  // L1  ["a"]   ["b"]   ["c"]       |       |   ["d"]       |
+  //       |       |       |         |       |               |
+  //       |       |       +---------|-------|---------------+
+  //       |       +-----------------|-------+
+  //       +-------------------------+
+  //
+  // L6          ["b"]
+
+  {
+    // Compact ["b"] to bottommost level.
+    Slice begin = Slice(second_key);
+    Slice end = Slice(second_key);
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, &begin, &end));
+  }
+
+  //                             |'aaaa', 'bbbb', 'dddd'|  |'cccc'|
+  //                                 ^       ^     ^         ^
+  //                                 |       |     |         |
+  // L0                              |       |     |         |
+  // L1  ["a"]           ["c"]       |       |   ["d"]       |
+  //       |               |         |       |               |
+  //       |               +---------|-------|---------------+
+  //       |       +-----------------|-------+
+  //       +-------|-----------------+
+  //               |
+  // L6          ["b"]
+  ASSERT_EQ(3, NumTableFilesAtLevel(/*level=*/1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(/*level=*/6));
+
+  bool called = false;
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* arg) {
+        auto* aligned_reqs = static_cast<std::vector<FSReadRequest>*>(arg);
+        assert(aligned_reqs);
+        ASSERT_EQ(1, aligned_reqs->size());
+        called = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::array<Slice, num_keys> keys{{first_key, third_key, second_key}};
+
+  {
+    std::array<PinnableSlice, num_keys> values;
+    std::array<Status, num_keys> statuses;
+
+    // The MultiGet(), when constructing the KeyContexts, will process the keys
+    // in such order: a, d, b. The reason is that ["a"] and ["d"] are in L1,
+    // while ["b"] resides in L6.
+    // Consequently, the original FSReadRequest list prepared by
+    // Version::MultiGetblob() will be for "a", "d" and "b". It is unsorted as
+    // follows:
+    //
+    // ["a", offset=30, len=3033],
+    // ["d", offset=9096, len=3033],
+    // ["b", offset=3063, len=6033]
+    //
+    // If we do not sort them before calling MultiRead() in DirectIO, then the
+    // underlying IO merging logic will yield two requests.
+    //
+    // [offset=0, len=4096] (for "a")
+    // [offset=0, len=12288] (result of merging the request for "d" and "b")
+    //
+    // We need to sort them in Version::MultiGetBlob() so that the underlying
+    // IO merging logic in DirectIO mode works as expected. The correct
+    // behavior will be one aligned request:
+    //
+    // [offset=0, len=12288]
+
+    db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    ASSERT_TRUE(called);
+
+    ASSERT_OK(statuses[0]);
+    ASSERT_EQ(values[0], first_blob);
+
+    ASSERT_OK(statuses[1]);
+    ASSERT_EQ(values[1], third_blob);
+
+    ASSERT_OK(statuses[2]);
+    ASSERT_EQ(values[2], second_blob);
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) {
+  Options options = GetDefaultOptions();
+
+  LRUCacheOptions co;
+  co.capacity = 2 << 20;  // 2MB
+  co.num_shard_bits = 2;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto backing_cache = NewLRUCache(co);
+
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+  options.enable_blob_files = true;
+  options.blob_cache = backing_cache;
+
+  BlockBasedTableOptions block_based_options;
+  block_based_options.no_block_cache = false;
+  block_based_options.block_cache = backing_cache;
+  block_based_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+  Reopen(options);
+
+  constexpr size_t kNumBlobFiles = 3;
+  constexpr size_t kNumBlobsPerFile = 3;
+  constexpr size_t kNumKeys = kNumBlobsPerFile * kNumBlobFiles;
+
+  std::vector<std::string> key_strs;
+  std::vector<std::string> value_strs;
+  for (size_t i = 0; i < kNumBlobFiles; ++i) {
+    for (size_t j = 0; j < kNumBlobsPerFile; ++j) {
+      std::string key = "key" + std::to_string(i) + "_" + std::to_string(j);
+      std::string value =
+          "value_as_blob" + std::to_string(i) + "_" + std::to_string(j);
+      ASSERT_OK(Put(key, value));
+      key_strs.push_back(key);
+      value_strs.push_back(value);
+    }
+    ASSERT_OK(Flush());
+  }
+  assert(key_strs.size() == kNumKeys);
+  std::array<Slice, kNumKeys> keys;
+  for (size_t i = 0; i < keys.size(); ++i) {
+    keys[i] = key_strs[i];
+  }
+
+  ReadOptions read_options;
+  read_options.read_tier = kReadAllTier;
+  read_options.fill_cache = false;
+
+  {
+    std::array<PinnableSlice, kNumKeys> values;
+    std::array<Status, kNumKeys> statuses;
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    for (size_t i = 0; i < kNumKeys; ++i) {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ(value_strs[i], values[i]);
+    }
+  }
+
+  read_options.read_tier = kBlockCacheTier;
+
+  {
+    std::array<PinnableSlice, kNumKeys> values;
+    std::array<Status, kNumKeys> statuses;
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    for (size_t i = 0; i < kNumKeys; ++i) {
+      ASSERT_TRUE(statuses[i].IsIncomplete());
+      ASSERT_TRUE(values[i].empty());
+    }
+  }
+
+  read_options.read_tier = kReadAllTier;
+  read_options.fill_cache = true;
+
+  {
+    std::array<PinnableSlice, kNumKeys> values;
+    std::array<Status, kNumKeys> statuses;
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    for (size_t i = 0; i < kNumKeys; ++i) {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ(value_strs[i], values[i]);
+    }
+  }
+
+  read_options.read_tier = kBlockCacheTier;
+
+  {
+    std::array<PinnableSlice, kNumKeys> values;
+    std::array<Status, kNumKeys> statuses;
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    for (size_t i = 0; i < kNumKeys; ++i) {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ(value_strs[i], values[i]);
+    }
+  }
+}
+
+TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  ASSERT_OK(Put(key, blob));
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "Version::Get::TamperWithBlobIndex", [](void* arg) {
+        Slice* const blob_index = static_cast<Slice*>(arg);
+        assert(blob_index);
+        assert(!blob_index->empty());
+        blob_index->remove_prefix(1);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  PinnableSlice result;
+  ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+                  .IsCorruption());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+
+  DestroyAndReopen(options);
+
+  constexpr size_t kNumOfKeys = 3;
+  std::array<std::string, kNumOfKeys> key_strs;
+  std::array<std::string, kNumOfKeys> value_strs;
+  std::array<Slice, kNumOfKeys + 1> keys;
+  for (size_t i = 0; i < kNumOfKeys; ++i) {
+    key_strs[i] = "foo" + std::to_string(i);
+    value_strs[i] = "blob_value" + std::to_string(i);
+    ASSERT_OK(Put(key_strs[i], value_strs[i]));
+    keys[i] = key_strs[i];
+  }
+
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+  ASSERT_OK(Put(key, blob));
+  keys[kNumOfKeys] = key;
+
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "Version::MultiGet::TamperWithBlobIndex", [&key](void* arg) {
+        KeyContext* const key_context = static_cast<KeyContext*>(arg);
+        assert(key_context);
+        assert(key_context->key);
+
+        if (*(key_context->key) == key) {
+          Slice* const blob_index = key_context->value;
+          assert(blob_index);
+          assert(!blob_index->empty());
+          blob_index->remove_prefix(1);
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::array<PinnableSlice, kNumOfKeys + 1> values;
+  std::array<Status, kNumOfKeys + 1> statuses;
+  db_->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), kNumOfKeys + 1,
+                keys.data(), values.data(), statuses.data(),
+                /*sorted_input=*/false);
+  for (size_t i = 0; i < kNumOfKeys + 1; ++i) {
+    if (i != kNumOfKeys) {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ("blob_value" + std::to_string(i), values[i]);
+    } else {
+      ASSERT_TRUE(statuses[i].IsCorruption());
+    }
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlob_ExceedSoftLimit) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr size_t kNumOfKeys = 3;
+  std::array<std::string, kNumOfKeys> key_bufs;
+  std::array<std::string, kNumOfKeys> value_bufs;
+  std::array<Slice, kNumOfKeys> keys;
+  for (size_t i = 0; i < kNumOfKeys; ++i) {
+    key_bufs[i] = "foo" + std::to_string(i);
+    value_bufs[i] = "blob_value" + std::to_string(i);
+    ASSERT_OK(Put(key_bufs[i], value_bufs[i]));
+    keys[i] = key_bufs[i];
+  }
+  ASSERT_OK(Flush());
+
+  std::array<PinnableSlice, kNumOfKeys> values;
+  std::array<Status, kNumOfKeys> statuses;
+  ReadOptions read_opts;
+  read_opts.value_size_soft_limit = 1;
+  db_->MultiGet(read_opts, dbfull()->DefaultColumnFamily(), kNumOfKeys,
+                keys.data(), values.data(), statuses.data(),
+                /*sorted_input=*/true);
+  for (const auto& s : statuses) {
+    ASSERT_TRUE(s.IsAborted());
+  }
+}
+
+TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) {
+  constexpr uint64_t min_blob_size = 10;
+
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = min_blob_size;
+
+  Reopen(options);
+
+  constexpr char key[] = "key";
+  constexpr char blob[] = "short";
+  static_assert(sizeof(short) - 1 < min_blob_size,
+                "Blob too long to be inlined");
+
+  // Fake an inlined TTL blob index.
+  std::string blob_index;
+
+  constexpr uint64_t expiration = 1234567890;
+
+  BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Flush());
+
+  PinnableSlice result;
+  ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+                  .IsCorruption());
+}
+
+TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr char key[] = "key";
+
+  // Fake a blob index referencing a non-existent blob file.
+  std::string blob_index;
+
+  constexpr uint64_t blob_file_number = 1000;
+  constexpr uint64_t offset = 1234;
+  constexpr uint64_t size = 5678;
+
+  BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+                        kNoCompression);
+
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Flush());
+
+  PinnableSlice result;
+  ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+                  .IsCorruption());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, GenerateIOTracing) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  std::string trace_file = dbname_ + "/io_trace_file";
+
+  Reopen(options);
+  {
+    // Create IO trace file
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(
+        NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer));
+    ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer)));
+
+    constexpr char key[] = "key";
+    constexpr char blob_value[] = "blob_value";
+
+    ASSERT_OK(Put(key, blob_value));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(Get(key), blob_value);
+
+    ASSERT_OK(db_->EndIOTrace());
+    ASSERT_OK(env_->FileExists(trace_file));
+  }
+  {
+    // Parse trace file to check file operations related to blob files are
+    // recorded.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(
+        NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader));
+    IOTraceReader reader(std::move(trace_reader));
+
+    IOTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+    ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+
+    // Read records.
+    int blob_files_op_count = 0;
+    Status status;
+    while (true) {
+      IOTraceRecord record;
+      status = reader.ReadIOOp(&record);
+      if (!status.ok()) {
+        break;
+      }
+      if (record.file_name.find("blob") != std::string::npos) {
+        blob_files_op_count++;
+      }
+    }
+    // Assuming blob files will have Append, Close and then Read operations.
+    ASSERT_GT(blob_files_op_count, 2);
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+  Reopen(options);
+
+  ASSERT_OK(dbfull()->DisableFileDeletions());
+  constexpr int kNumTableFiles = 2;
+  for (int i = 0; i < kNumTableFiles; ++i) {
+    for (char ch = 'a'; ch != 'c'; ++ch) {
+      std::string key(1, ch);
+      ASSERT_OK(Put(key, "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  Close();
+
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  std::string blob_file_path;
+  uint64_t max_blob_file_num = kInvalidBlobFileNumber;
+  for (const auto& fname : files) {
+    uint64_t file_num = 0;
+    FileType type;
+    if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) &&
+        type == kBlobFile) {
+      if (file_num > max_blob_file_num) {
+        max_blob_file_num = file_num;
+        blob_file_path = dbname_ + "/" + fname;
+      }
+    }
+  }
+  ASSERT_OK(env_->DeleteFile(blob_file_path));
+
+  options.best_efforts_recovery = true;
+  Reopen(options);
+  std::string value;
+  ASSERT_OK(db_->Get(ReadOptions(), "a", &value));
+  ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value);
+}
+
+TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) {
+  Options options = GetDefaultOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("Key1", "v1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("Key1", "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("Key1", "v3"));
+  ASSERT_OK(Flush());
+
+  std::string value;
+  ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value));
+  ASSERT_EQ(Get("Key1"), "v1,v2,v3");
+}
+
+TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) {
+  constexpr size_t num_keys = 3;
+
+  Options options = GetDefaultOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("Key0", "v0_0"));
+  ASSERT_OK(Put("Key1", "v1_0"));
+  ASSERT_OK(Put("Key2", "v2_0"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("Key0", "v0_1"));
+  ASSERT_OK(Merge("Key1", "v1_1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("Key0", "v0_2"));
+  ASSERT_OK(Flush());
+
+  std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}};
+  std::array<PinnableSlice, num_keys> values;
+  std::array<Status, num_keys> statuses;
+
+  db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+                &values[0], &statuses[0]);
+
+  ASSERT_OK(statuses[0]);
+  ASSERT_EQ(values[0], "v0_0,v0_1,v0_2");
+
+  ASSERT_OK(statuses[1]);
+  ASSERT_EQ(values[1], "v1_0,v1_1");
+
+  ASSERT_OK(statuses[2]);
+  ASSERT_EQ(values[2], "v2_0");
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, Properties) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr char key1[] = "key1";
+  constexpr size_t key1_size = sizeof(key1) - 1;
+
+  constexpr char key2[] = "key2";
+  constexpr size_t key2_size = sizeof(key2) - 1;
+
+  constexpr char key3[] = "key3";
+  constexpr size_t key3_size = sizeof(key3) - 1;
+
+  constexpr char blob[] = "00000000000000";
+  constexpr size_t blob_size = sizeof(blob) - 1;
+
+  constexpr char longer_blob[] = "00000000000000000000";
+  constexpr size_t longer_blob_size = sizeof(longer_blob) - 1;
+
+  ASSERT_OK(Put(key1, blob));
+  ASSERT_OK(Put(key2, longer_blob));
+  ASSERT_OK(Flush());
+
+  constexpr size_t first_blob_file_expected_size =
+      BlobLogHeader::kSize +
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(key1_size) + blob_size +
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
+      longer_blob_size + BlobLogFooter::kSize;
+
+  ASSERT_OK(Put(key3, blob));
+  ASSERT_OK(Flush());
+
+  constexpr size_t second_blob_file_expected_size =
+      BlobLogHeader::kSize +
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(key3_size) + blob_size +
+      BlobLogFooter::kSize;
+
+  constexpr size_t total_expected_size =
+      first_blob_file_expected_size + second_blob_file_expected_size;
+
+  // Number of blob files
+  uint64_t num_blob_files = 0;
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kNumBlobFiles, &num_blob_files));
+  ASSERT_EQ(num_blob_files, 2);
+
+  // Total size of live blob files
+  uint64_t live_blob_file_size = 0;
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileSize,
+                                  &live_blob_file_size));
+  ASSERT_EQ(live_blob_file_size, total_expected_size);
+
+  // Total amount of garbage in live blob files
+  {
+    uint64_t live_blob_file_garbage_size = 0;
+    ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
+                                    &live_blob_file_garbage_size));
+    ASSERT_EQ(live_blob_file_garbage_size, 0);
+  }
+
+  // Total size of all blob files across all versions
+  // Note: this should be the same as above since we only have one
+  // version at this point.
+  uint64_t total_blob_file_size = 0;
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
+                                  &total_blob_file_size));
+  ASSERT_EQ(total_blob_file_size, total_expected_size);
+
+  // Delete key2 to create some garbage
+  ASSERT_OK(Delete(key2));
+  ASSERT_OK(Flush());
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  constexpr size_t expected_garbage_size =
+      BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
+      longer_blob_size;
+
+  constexpr double expected_space_amp =
+      static_cast<double>(total_expected_size) /
+      (total_expected_size - expected_garbage_size);
+
+  // Blob file stats
+  std::string blob_stats;
+  ASSERT_TRUE(db_->GetProperty(DB::Properties::kBlobStats, &blob_stats));
+
+  std::ostringstream oss;
+  oss << "Number of blob files: 2\nTotal size of blob files: "
+      << total_expected_size
+      << "\nTotal size of garbage in blob files: " << expected_garbage_size
+      << "\nBlob file space amplification: " << expected_space_amp << '\n';
+
+  ASSERT_EQ(blob_stats, oss.str());
+
+  // Total amount of garbage in live blob files
+  {
+    uint64_t live_blob_file_garbage_size = 0;
+    ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
+                                    &live_blob_file_garbage_size));
+    ASSERT_EQ(live_blob_file_garbage_size, expected_garbage_size);
+  }
+}
+
+TEST_F(DBBlobBasicTest, PropertiesMultiVersion) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr char key1[] = "key1";
+  constexpr char key2[] = "key2";
+  constexpr char key3[] = "key3";
+
+  constexpr size_t key_size = sizeof(key1) - 1;
+  static_assert(sizeof(key2) - 1 == key_size, "unexpected size: key2");
+  static_assert(sizeof(key3) - 1 == key_size, "unexpected size: key3");
+
+  constexpr char blob[] = "0000000000";
+  constexpr size_t blob_size = sizeof(blob) - 1;
+
+  ASSERT_OK(Put(key1, blob));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(key2, blob));
+  ASSERT_OK(Flush());
+
+  // Create an iterator to keep the current version alive
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  ASSERT_OK(iter->status());
+
+  // Note: the Delete and subsequent compaction results in the first blob file
+  // not making it to the final version. (It is still part of the previous
+  // version kept alive by the iterator though.) On the other hand, the Put
+  // results in a third blob file.
+  ASSERT_OK(Delete(key1));
+  ASSERT_OK(Put(key3, blob));
+  ASSERT_OK(Flush());
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  // Total size of all blob files across all versions: between the two versions,
+  // we should have three blob files of the same size with one blob each.
+  // The version kept alive by the iterator contains the first and the second
+  // blob file, while the final version contains the second and the third blob
+  // file. (The second blob file is thus shared by the two versions but should
+  // be counted only once.)
+  uint64_t total_blob_file_size = 0;
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
+                                  &total_blob_file_size));
+  ASSERT_EQ(total_blob_file_size,
+            3 * (BlobLogHeader::kSize +
+                 BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+                 blob_size + BlobLogFooter::kSize));
+}
+#endif  // !ROCKSDB_LITE
+
+class DBBlobBasicIOErrorTest : public DBBlobBasicTest,
+                               public testing::WithParamInterface<std::string> {
+ protected:
+  DBBlobBasicIOErrorTest() : sync_point_(GetParam()) {
+    fault_injection_env_.reset(new FaultInjectionTestEnv(env_));
+  }
+  ~DBBlobBasicIOErrorTest() { Close(); }
+
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+  std::string sync_point_;
+};
+
+class DBBlobBasicIOErrorMultiGetTest : public DBBlobBasicIOErrorTest {
+ public:
+  DBBlobBasicIOErrorMultiGetTest() : DBBlobBasicIOErrorTest() {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileReader::OpenFile:NewRandomAccessFile",
+                            "BlobFileReader::GetBlob:ReadFromFile"}));
+
+INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorMultiGetTest,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileReader::OpenFile:NewRandomAccessFile",
+                            "BlobFileReader::MultiGetBlob:ReadFromFile"}));
+
+TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) {
+  Options options;
+  options.env = fault_injection_env_.get();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr char key[] = "key";
+  constexpr char blob_value[] = "blob_value";
+
+  ASSERT_OK(Put(key, blob_value));
+
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+    fault_injection_env_->SetFilesystemActive(false,
+                                              Status::IOError(sync_point_));
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  PinnableSlice result;
+  ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+                  .IsIOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) {
+  Options options = GetDefaultOptions();
+  options.env = fault_injection_env_.get();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr size_t num_keys = 2;
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+
+  ASSERT_OK(Put(second_key, second_value));
+
+  ASSERT_OK(Flush());
+
+  std::array<Slice, num_keys> keys{{first_key, second_key}};
+  std::array<PinnableSlice, num_keys> values;
+  std::array<Status, num_keys> statuses;
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+    fault_injection_env_->SetFilesystemActive(false,
+                                              Status::IOError(sync_point_));
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+                &values[0], &statuses[0]);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_TRUE(statuses[0].IsIOError());
+  ASSERT_TRUE(statuses[1].IsIOError());
+}
+
+TEST_P(DBBlobBasicIOErrorMultiGetTest, MultipleBlobFiles) {
+  Options options = GetDefaultOptions();
+  options.env = fault_injection_env_.get();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr size_t num_keys = 2;
+
+  constexpr char key1[] = "key1";
+  constexpr char value1[] = "blob1";
+
+  ASSERT_OK(Put(key1, value1));
+  ASSERT_OK(Flush());
+
+  constexpr char key2[] = "key2";
+  constexpr char value2[] = "blob2";
+
+  ASSERT_OK(Put(key2, value2));
+  ASSERT_OK(Flush());
+
+  std::array<Slice, num_keys> keys{{key1, key2}};
+  std::array<PinnableSlice, num_keys> values;
+  std::array<Status, num_keys> statuses;
+
+  bool first_blob_file = true;
+  SyncPoint::GetInstance()->SetCallBack(
+      sync_point_, [&first_blob_file, this](void* /* arg */) {
+        if (first_blob_file) {
+          first_blob_file = false;
+          return;
+        }
+        fault_injection_env_->SetFilesystemActive(false,
+                                                  Status::IOError(sync_point_));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+                keys.data(), values.data(), statuses.data());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  ASSERT_OK(statuses[0]);
+  ASSERT_EQ(value1, values[0]);
+  ASSERT_TRUE(statuses[1].IsIOError());
+}
+
+namespace {
+
+class ReadBlobCompactionFilter : public CompactionFilter {
+ public:
+  ReadBlobCompactionFilter() = default;
+  const char* Name() const override {
+    return "rocksdb.compaction.filter.read.blob";
+  }
+  CompactionFilter::Decision FilterV2(
+      int /*level*/, const Slice& /*key*/, ValueType value_type,
+      const Slice& existing_value, std::string* new_value,
+      std::string* /*skip_until*/) const override {
+    if (value_type != CompactionFilter::ValueType::kValue) {
+      return CompactionFilter::Decision::kKeep;
+    }
+    assert(new_value);
+    new_value->assign(existing_value.data(), existing_value.size());
+    return CompactionFilter::Decision::kChangeValue;
+  }
+};
+
+}  // anonymous namespace
+
+TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) {
+  Options options = GetDefaultOptions();
+  options.env = fault_injection_env_.get();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new ReadBlobCompactionFilter);
+  options.compaction_filter = compaction_filter_guard.get();
+
+  DestroyAndReopen(options);
+  constexpr char key[] = "foo";
+  constexpr char blob_value[] = "foo_blob_value";
+  ASSERT_OK(Put(key, blob_value));
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+    fault_injection_env_->SetFilesystemActive(false,
+                                              Status::IOError(sync_point_));
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                                /*end=*/nullptr)
+                  .IsIOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
+  Options options = GetDefaultOptions();
+
+  LRUCacheOptions co;
+  co.capacity = 1 << 25;
+  co.num_shard_bits = 2;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto backing_cache = NewLRUCache(co);
+
+  options.blob_cache = backing_cache;
+
+  BlockBasedTableOptions block_based_options;
+  block_based_options.no_block_cache = false;
+  block_based_options.block_cache = backing_cache;
+  block_based_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+  options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  DestroyAndReopen(options);
+
+  constexpr size_t kNumBlobs = 10;
+  constexpr size_t kValueSize = 100;
+
+  std::string value(kValueSize, 'a');
+
+  for (size_t i = 1; i <= kNumBlobs; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Put(std::to_string(i + kNumBlobs), value));  // Add some overlap
+    ASSERT_OK(Flush());
+    ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_MISS));
+    ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_HIT));
+  }
+
+  // Verify compaction not counted
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+  EXPECT_EQ(kNumBlobs * 2,
+            options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
+  Options options = GetDefaultOptions();
+
+  LRUCacheOptions co;
+  co.capacity = 1 << 25;
+  co.num_shard_bits = 2;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto backing_cache = NewLRUCache(co);
+
+  options.blob_cache = backing_cache;
+
+  BlockBasedTableOptions block_based_options;
+  block_based_options.no_block_cache = false;
+  block_based_options.block_cache = backing_cache;
+  block_based_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+  options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  DestroyAndReopen(options);
+
+  constexpr size_t kNumBlobs = 10;
+  constexpr size_t kValueSize = 100;
+
+  std::string value(kValueSize, 'a');
+
+  for (size_t i = 1; i <= 5; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Put(std::to_string(i + kNumBlobs), value));  // Add some overlap
+    ASSERT_OK(Flush());
+    ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+    ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+    ASSERT_EQ(0,
+              options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
+    ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
+  }
+
+  ASSERT_OK(dbfull()->SetOptions({{"prepopulate_blob_cache", "kDisable"}}));
+
+  for (size_t i = 6; i <= kNumBlobs; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Put(std::to_string(i + kNumBlobs), value));  // Add some overlap
+    ASSERT_OK(Flush());
+    ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+    ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+    ASSERT_EQ(2,
+              options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
+    ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
+  }
+
+  // Verify compaction not counted
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+  EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
+  CompressedSecondaryCacheOptions secondary_cache_opts;
+  secondary_cache_opts.capacity = 1 << 20;
+  secondary_cache_opts.num_shard_bits = 0;
+  secondary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+  secondary_cache_opts.compression_type = kNoCompression;
+
+  LRUCacheOptions primary_cache_opts;
+  primary_cache_opts.capacity = 1024;
+  primary_cache_opts.num_shard_bits = 0;
+  primary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+  primary_cache_opts.secondary_cache =
+      NewCompressedSecondaryCache(secondary_cache_opts);
+
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+  options.enable_blob_files = true;
+  options.blob_cache = NewLRUCache(primary_cache_opts);
+  options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+
+  DestroyAndReopen(options);
+
+  // Note: only one of the two blobs fit in the primary cache at any given time.
+  constexpr char first_key[] = "foo";
+  constexpr size_t first_blob_size = 512;
+  const std::string first_blob(first_blob_size, 'a');
+
+  constexpr char second_key[] = "bar";
+  constexpr size_t second_blob_size = 768;
+  const std::string second_blob(second_blob_size, 'b');
+
+  // First blob is inserted into primary cache during flush.
+  ASSERT_OK(Put(first_key, first_blob));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1);
+
+  // Second blob is inserted into primary cache during flush,
+  // First blob is evicted but only a dummy handle is inserted into secondary
+  // cache.
+  ASSERT_OK(Put(second_key, second_blob));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1);
+
+  // First blob is inserted into primary cache.
+  // Second blob is evicted but only a dummy handle is inserted into secondary
+  // cache.
+  ASSERT_EQ(Get(first_key), first_blob);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+            0);
+  // Second blob is inserted into primary cache,
+  // First blob is evicted and is inserted into secondary cache.
+  ASSERT_EQ(Get(second_key), second_blob);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+            0);
+
+  // First blob's dummy item is inserted into primary cache b/c of lookup.
+  // Second blob is still in primary cache.
+  ASSERT_EQ(Get(first_key), first_blob);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+            1);
+
+  // First blob's item is inserted into primary cache b/c of lookup.
+  // Second blob is evicted and inserted into secondary cache.
+  ASSERT_EQ(Get(first_key), first_blob);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1);
+  ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+            1);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_compaction_test.cc b/src/rocksdb/db/blob/db_blob_compaction_test.cc
new file mode 100644
index 000000000..f3fe3c03b
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_compaction_test.cc
@@ -0,0 +1,913 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobCompactionTest : public DBTestBase {
+ public:
+  explicit DBBlobCompactionTest()
+      : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {}
+
+#ifndef ROCKSDB_LITE
+  const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    assert(versions);
+    assert(versions->GetColumnFamilySet());
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    assert(cfd);
+
+    const InternalStats* const internal_stats = cfd->internal_stats();
+    assert(internal_stats);
+
+    return internal_stats->TEST_GetCompactionStats();
+  }
+#endif  // ROCKSDB_LITE
+};
+
+namespace {
+
+class FilterByKeyLength : public CompactionFilter {
+ public:
+  explicit FilterByKeyLength(size_t len) : length_threshold_(len) {}
+  const char* Name() const override {
+    return "rocksdb.compaction.filter.by.key.length";
+  }
+  CompactionFilter::Decision FilterBlobByKey(
+      int /*level*/, const Slice& key, std::string* /*new_value*/,
+      std::string* /*skip_until*/) const override {
+    if (key.size() < length_threshold_) {
+      return CompactionFilter::Decision::kRemove;
+    }
+    return CompactionFilter::Decision::kKeep;
+  }
+
+ private:
+  size_t length_threshold_;
+};
+
+class FilterByValueLength : public CompactionFilter {
+ public:
+  explicit FilterByValueLength(size_t len) : length_threshold_(len) {}
+  const char* Name() const override {
+    return "rocksdb.compaction.filter.by.value.length";
+  }
+  CompactionFilter::Decision FilterV2(
+      int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+      const Slice& existing_value, std::string* /*new_value*/,
+      std::string* /*skip_until*/) const override {
+    if (existing_value.size() < length_threshold_) {
+      return CompactionFilter::Decision::kRemove;
+    }
+    return CompactionFilter::Decision::kKeep;
+  }
+
+ private:
+  size_t length_threshold_;
+};
+
+class BadBlobCompactionFilter : public CompactionFilter {
+ public:
+  explicit BadBlobCompactionFilter(std::string prefix,
+                                   CompactionFilter::Decision filter_by_key,
+                                   CompactionFilter::Decision filter_v2)
+      : prefix_(std::move(prefix)),
+        filter_blob_by_key_(filter_by_key),
+        filter_v2_(filter_v2) {}
+  const char* Name() const override { return "rocksdb.compaction.filter.bad"; }
+  CompactionFilter::Decision FilterBlobByKey(
+      int /*level*/, const Slice& key, std::string* /*new_value*/,
+      std::string* /*skip_until*/) const override {
+    if (key.size() >= prefix_.size() &&
+        0 == strncmp(prefix_.data(), key.data(), prefix_.size())) {
+      return CompactionFilter::Decision::kUndetermined;
+    }
+    return filter_blob_by_key_;
+  }
+  CompactionFilter::Decision FilterV2(
+      int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+      const Slice& /*existing_value*/, std::string* /*new_value*/,
+      std::string* /*skip_until*/) const override {
+    return filter_v2_;
+  }
+
+ private:
+  const std::string prefix_;
+  const CompactionFilter::Decision filter_blob_by_key_;
+  const CompactionFilter::Decision filter_v2_;
+};
+
+class ValueBlindWriteFilter : public CompactionFilter {
+ public:
+  explicit ValueBlindWriteFilter(std::string new_val)
+      : new_value_(std::move(new_val)) {}
+  const char* Name() const override {
+    return "rocksdb.compaction.filter.blind.write";
+  }
+  CompactionFilter::Decision FilterBlobByKey(
+      int level, const Slice& key, std::string* new_value,
+      std::string* skip_until) const override;
+
+ private:
+  const std::string new_value_;
+};
+
+CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey(
+    int /*level*/, const Slice& /*key*/, std::string* new_value,
+    std::string* /*skip_until*/) const {
+  assert(new_value);
+  new_value->assign(new_value_);
+  return CompactionFilter::Decision::kChangeValue;
+}
+
+class ValueMutationFilter : public CompactionFilter {
+ public:
+  explicit ValueMutationFilter(std::string padding)
+      : padding_(std::move(padding)) {}
+  const char* Name() const override {
+    return "rocksdb.compaction.filter.value.mutation";
+  }
+  CompactionFilter::Decision FilterV2(int level, const Slice& key,
+                                      ValueType value_type,
+                                      const Slice& existing_value,
+                                      std::string* new_value,
+                                      std::string* skip_until) const override;
+
+ private:
+  const std::string padding_;
+};
+
+CompactionFilter::Decision ValueMutationFilter::FilterV2(
+    int /*level*/, const Slice& /*key*/, ValueType value_type,
+    const Slice& existing_value, std::string* new_value,
+    std::string* /*skip_until*/) const {
+  assert(CompactionFilter::ValueType::kBlobIndex != value_type);
+  if (CompactionFilter::ValueType::kValue != value_type) {
+    return CompactionFilter::Decision::kKeep;
+  }
+  assert(new_value);
+  new_value->assign(existing_value.data(), existing_value.size());
+  new_value->append(padding_);
+  return CompactionFilter::Decision::kChangeValue;
+}
+
+class AlwaysKeepFilter : public CompactionFilter {
+ public:
+  explicit AlwaysKeepFilter() = default;
+  const char* Name() const override {
+    return "rocksdb.compaction.filter.always.keep";
+  }
+  CompactionFilter::Decision FilterV2(
+      int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+      const Slice& /*existing_value*/, std::string* /*new_value*/,
+      std::string* /*skip_until*/) const override {
+    return CompactionFilter::Decision::kKeep;
+  }
+};
+
+class SkipUntilFilter : public CompactionFilter {
+ public:
+  explicit SkipUntilFilter(std::string skip_until)
+      : skip_until_(std::move(skip_until)) {}
+
+  const char* Name() const override {
+    return "rocksdb.compaction.filter.skip.until";
+  }
+
+  CompactionFilter::Decision FilterV2(int /* level */, const Slice& /* key */,
+                                      ValueType /* value_type */,
+                                      const Slice& /* existing_value */,
+                                      std::string* /* new_value */,
+                                      std::string* skip_until) const override {
+    assert(skip_until);
+    *skip_until = skip_until_;
+
+    return CompactionFilter::Decision::kRemoveAndSkipUntil;
+  }
+
+ private:
+  std::string skip_until_;
+};
+
+}  // anonymous namespace
+
+class DBBlobBadCompactionFilterTest
+    : public DBBlobCompactionTest,
+      public testing::WithParamInterface<
+          std::tuple<std::string, CompactionFilter::Decision,
+                     CompactionFilter::Decision>> {
+ public:
+  explicit DBBlobBadCompactionFilterTest()
+      : compaction_filter_guard_(new BadBlobCompactionFilter(
+            std::get<0>(GetParam()), std::get<1>(GetParam()),
+            std::get<2>(GetParam()))) {}
+
+ protected:
+  std::unique_ptr<CompactionFilter> compaction_filter_guard_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    BadCompactionFilter, DBBlobBadCompactionFilterTest,
+    testing::Combine(
+        testing::Values("a"),
+        testing::Values(CompactionFilter::Decision::kChangeBlobIndex,
+                        CompactionFilter::Decision::kIOError),
+        testing::Values(CompactionFilter::Decision::kUndetermined,
+                        CompactionFilter::Decision::kChangeBlobIndex,
+                        CompactionFilter::Decision::kIOError)));
+
+TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+  constexpr size_t kKeyLength = 2;
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new FilterByKeyLength(kKeyLength));
+  options.compaction_filter = compaction_filter_guard.get();
+
+  constexpr char short_key[] = "a";
+  constexpr char long_key[] = "abc";
+  constexpr char blob_value[] = "value";
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Put(short_key, blob_value));
+  ASSERT_OK(Put(long_key, blob_value));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+  std::string value;
+  ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound());
+  value.clear();
+  ASSERT_OK(db_->Get(ReadOptions(), long_key, &value));
+  ASSERT_EQ("value", value);
+
+#ifndef ROCKSDB_LITE
+  const auto& compaction_stats = GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+
+  // Filter decides between kKeep and kRemove solely based on key;
+  // this involves neither reading nor writing blobs
+  ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+  ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif  // ROCKSDB_LITE
+
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, FilterByValueLength) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 5;
+  options.create_if_missing = true;
+  constexpr size_t kValueLength = 5;
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new FilterByValueLength(kValueLength));
+  options.compaction_filter = compaction_filter_guard.get();
+
+  const std::vector<std::string> short_value_keys = {"a", "e", "j"};
+  constexpr char short_value[] = "val";
+  const std::vector<std::string> long_value_keys = {"b", "f", "k"};
+  constexpr char long_value[] = "valuevalue";
+
+  DestroyAndReopen(options);
+  for (size_t i = 0; i < short_value_keys.size(); ++i) {
+    ASSERT_OK(Put(short_value_keys[i], short_value));
+  }
+  for (size_t i = 0; i < short_value_keys.size(); ++i) {
+    ASSERT_OK(Put(long_value_keys[i], long_value));
+  }
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+  std::string value;
+  for (size_t i = 0; i < short_value_keys.size(); ++i) {
+    ASSERT_TRUE(
+        db_->Get(ReadOptions(), short_value_keys[i], &value).IsNotFound());
+    value.clear();
+  }
+  for (size_t i = 0; i < long_value_keys.size(); ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), long_value_keys[i], &value));
+    ASSERT_EQ(long_value, value);
+  }
+
+#ifndef ROCKSDB_LITE
+  const auto& compaction_stats = GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+
+  // Filter decides between kKeep and kRemove based on value;
+  // this involves reading but not writing blobs
+  ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+  ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif  // ROCKSDB_LITE
+
+  Close();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {
+  Options options = GetDefaultOptions();
+
+  options.enable_blob_files = true;
+  options.min_blob_size = 1000;
+  options.blob_file_starting_level = 5;
+  options.create_if_missing = true;
+
+  // Open DB with fixed-prefix sst-partitioner so that compaction will cut
+  // new table file when encountering a new key whose 1-byte prefix changes.
+  constexpr size_t key_len = 1;
+  options.sst_partitioner_factory =
+      NewSstPartitionerFixedPrefixFactory(key_len);
+
+  ASSERT_OK(TryReopen(options));
+
+  constexpr size_t blob_size = 3000;
+
+  constexpr char first_key[] = "a";
+  const std::string first_blob(blob_size, 'a');
+  ASSERT_OK(Put(first_key, first_blob));
+
+  constexpr char second_key[] = "b";
+  const std::string second_blob(2 * blob_size, 'b');
+  ASSERT_OK(Put(second_key, second_blob));
+
+  constexpr char third_key[] = "d";
+  const std::string third_blob(blob_size, 'd');
+  ASSERT_OK(Put(third_key, third_blob));
+
+  ASSERT_OK(Flush());
+
+  constexpr char fourth_key[] = "c";
+  const std::string fourth_blob(blob_size, 'c');
+  ASSERT_OK(Put(fourth_key, fourth_blob));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(0, GetBlobFileNumbers().size());
+  ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1));
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+
+  // No blob file should be created since blob_file_starting_level is 5.
+  ASSERT_EQ(0, GetBlobFileNumbers().size());
+  ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0));
+  ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+
+  {
+    options.blob_file_starting_level = 1;
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put(first_key, first_blob));
+    ASSERT_OK(Put(second_key, second_blob));
+    ASSERT_OK(Put(third_key, third_blob));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put(fourth_key, fourth_blob));
+    ASSERT_OK(Flush());
+
+    ASSERT_EQ(0, GetBlobFileNumbers().size());
+    ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0));
+    ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1));
+
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                                /*end=*/nullptr));
+    // The compaction's output level equals to blob_file_starting_level.
+    ASSERT_EQ(1, GetBlobFileNumbers().size());
+    ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0));
+    ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+  }
+
+  Close();
+}
+#endif
+
+TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+  constexpr char new_blob_value[] = "new_blob_value";
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new ValueBlindWriteFilter(new_blob_value));
+  options.compaction_filter = compaction_filter_guard.get();
+  DestroyAndReopen(options);
+  const std::vector<std::string> keys = {"a", "b", "c"};
+  const std::vector<std::string> values = {"a_value", "b_value", "c_value"};
+  assert(keys.size() == values.size());
+  for (size_t i = 0; i < keys.size(); ++i) {
+    ASSERT_OK(Put(keys[i], values[i]));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+  for (const auto& key : keys) {
+    ASSERT_EQ(new_blob_value, Get(key));
+  }
+
+#ifndef ROCKSDB_LITE
+  const auto& compaction_stats = GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+
+  // Filter unconditionally changes value in FilterBlobByKey;
+  // this involves writing but not reading blobs
+  ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+  ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+#endif  // ROCKSDB_LITE
+
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, SkipUntilFilter) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new SkipUntilFilter("z"));
+  options.compaction_filter = compaction_filter_guard.get();
+
+  Reopen(options);
+
+  const std::vector<std::string> keys{"a", "b", "c"};
+  const std::vector<std::string> values{"a_value", "b_value", "c_value"};
+  assert(keys.size() == values.size());
+
+  for (size_t i = 0; i < keys.size(); ++i) {
+    ASSERT_OK(Put(keys[i], values[i]));
+  }
+
+  ASSERT_OK(Flush());
+
+  int process_in_flow_called = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow",
+      [&process_in_flow_called](void* /* arg */) { ++process_in_flow_called; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr,
+                              /* end */ nullptr));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  for (const auto& key : keys) {
+    ASSERT_EQ(Get(key), "NOT_FOUND");
+  }
+
+  // Make sure SkipUntil was performed using iteration rather than Seek
+  ASSERT_EQ(process_in_flow_called, keys.size());
+
+  Close();
+}
+
+TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+  options.compaction_filter = compaction_filter_guard_.get();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("b", "value"));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                                /*end=*/nullptr)
+                  .IsNotSupported());
+  Close();
+
+  DestroyAndReopen(options);
+  std::string key(std::get<0>(GetParam()));
+  ASSERT_OK(Put(key, "value"));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                                /*end=*/nullptr)
+                  .IsNotSupported());
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new ValueMutationFilter(""));
+  options.compaction_filter = compaction_filter_guard.get();
+  DestroyAndReopen(options);
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+  // Fake an inlined TTL blob index.
+  std::string blob_index;
+  constexpr uint64_t expiration = 1234567890;
+  BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                                /*end=*/nullptr)
+                  .IsCorruption());
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilter) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  constexpr char padding[] = "_delta";
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new ValueMutationFilter(padding));
+  options.compaction_filter = compaction_filter_guard.get();
+  DestroyAndReopen(options);
+  const std::vector<std::pair<std::string, std::string>> kvs = {
+      {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}};
+  for (const auto& kv : kvs) {
+    ASSERT_OK(Put(kv.first, kv.second));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+  for (const auto& kv : kvs) {
+    ASSERT_EQ(kv.second + std::string(padding), Get(kv.first));
+  }
+
+#ifndef ROCKSDB_LITE
+  const auto& compaction_stats = GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+
+  // Filter changes the value using the previous value in FilterV2;
+  // this involves reading and writing blobs
+  ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+  ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+#endif  // ROCKSDB_LITE
+
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new ValueMutationFilter(""));
+  options.compaction_filter = compaction_filter_guard.get();
+  DestroyAndReopen(options);
+
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  ASSERT_OK(Put(key, blob));
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex",
+      [](void* arg) {
+        Slice* const blob_index = static_cast<Slice*>(arg);
+        assert(blob_index);
+        assert(!blob_index->empty());
+        blob_index->remove_prefix(1);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                                /*end=*/nullptr)
+                  .IsCorruption());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new AlwaysKeepFilter());
+  options.compaction_filter = compaction_filter_guard.get();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "foo_value"));
+  ASSERT_OK(Flush());
+  std::vector<uint64_t> blob_files = GetBlobFileNumbers();
+  ASSERT_EQ(1, blob_files.size());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+  ASSERT_EQ(blob_files, GetBlobFileNumbers());
+
+#ifndef ROCKSDB_LITE
+  const auto& compaction_stats = GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+
+  // Filter decides to keep the existing value in FilterV2;
+  // this involves reading but not writing blobs
+  ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+  ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif  // ROCKSDB_LITE
+
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, TrackGarbage) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+
+  Reopen(options);
+
+  // First table+blob file pair: 4 blobs with different keys
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  constexpr char fourth_key[] = "fourth_key";
+  constexpr char fourth_value[] = "fourth_value";
+
+  ASSERT_OK(Put(first_key, first_value));
+  ASSERT_OK(Put(second_key, second_value));
+  ASSERT_OK(Put(third_key, third_value));
+  ASSERT_OK(Put(fourth_key, fourth_value));
+  ASSERT_OK(Flush());
+
+  // Second table+blob file pair: overwrite 2 existing keys
+  constexpr char new_first_value[] = "new_first_value";
+  constexpr char new_second_value[] = "new_second_value";
+
+  ASSERT_OK(Put(first_key, new_first_value));
+  ASSERT_OK(Put(second_key, new_second_value));
+  ASSERT_OK(Flush());
+
+  // Compact them together. The first blob file should have 2 garbage blobs
+  // corresponding to the 2 overwritten keys.
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+  assert(versions->GetColumnFamilySet());
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  Version* const current = cfd->current();
+  assert(current);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  assert(storage_info);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 2);
+
+  {
+    const auto& meta = blob_files.front();
+    assert(meta);
+
+    constexpr uint64_t first_expected_bytes =
+        sizeof(first_value) - 1 +
+        BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) -
+                                                          1);
+    constexpr uint64_t second_expected_bytes =
+        sizeof(second_value) - 1 +
+        BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) -
+                                                          1);
+    constexpr uint64_t third_expected_bytes =
+        sizeof(third_value) - 1 +
+        BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(third_key) -
+                                                          1);
+    constexpr uint64_t fourth_expected_bytes =
+        sizeof(fourth_value) - 1 +
+        BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(fourth_key) -
+                                                          1);
+
+    ASSERT_EQ(meta->GetTotalBlobCount(), 4);
+    ASSERT_EQ(meta->GetTotalBlobBytes(),
+              first_expected_bytes + second_expected_bytes +
+                  third_expected_bytes + fourth_expected_bytes);
+    ASSERT_EQ(meta->GetGarbageBlobCount(), 2);
+    ASSERT_EQ(meta->GetGarbageBlobBytes(),
+              first_expected_bytes + second_expected_bytes);
+  }
+
+  {
+    const auto& meta = blob_files.back();
+    assert(meta);
+
+    constexpr uint64_t new_first_expected_bytes =
+        sizeof(new_first_value) - 1 +
+        BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) -
+                                                          1);
+    constexpr uint64_t new_second_expected_bytes =
+        sizeof(new_second_value) - 1 +
+        BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) -
+                                                          1);
+
+    ASSERT_EQ(meta->GetTotalBlobCount(), 2);
+    ASSERT_EQ(meta->GetTotalBlobBytes(),
+              new_first_expected_bytes + new_second_expected_bytes);
+    ASSERT_EQ(meta->GetGarbageBlobCount(), 0);
+    ASSERT_EQ(meta->GetGarbageBlobBytes(), 0);
+  }
+}
+
+TEST_F(DBBlobCompactionTest, MergeBlobWithBase) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.disable_auto_compactions = true;
+
+  Reopen(options);
+  ASSERT_OK(Put("Key1", "v1_1"));
+  ASSERT_OK(Put("Key2", "v2_1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Merge("Key1", "v1_2"));
+  ASSERT_OK(Merge("Key2", "v2_2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Merge("Key1", "v1_3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+  ASSERT_EQ(Get("Key1"), "v1_1,v1_2,v1_3");
+  ASSERT_EQ(Get("Key2"), "v2_1,v2_2");
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadGarbageCollection) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+  options.blob_compaction_readahead_size = 1 << 10;
+  options.disable_auto_compactions = true;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("key", "lime"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("key", "pie"));
+  ASSERT_OK(Put("foo", "baz"));
+  ASSERT_OK(Flush());
+
+  size_t num_non_prefetch_reads = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileReader::GetBlob:ReadFromFile",
+      [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(Get("key"), "pie");
+  ASSERT_EQ(Get("foo"), "baz");
+  ASSERT_EQ(num_non_prefetch_reads, 0);
+
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadFilter) {
+  Options options = GetDefaultOptions();
+
+  std::unique_ptr<CompactionFilter> compaction_filter_guard(
+      new ValueMutationFilter("pie"));
+
+  options.compaction_filter = compaction_filter_guard.get();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.blob_compaction_readahead_size = 1 << 10;
+  options.disable_auto_compactions = true;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("key", "lime"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  size_t num_non_prefetch_reads = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileReader::GetBlob:ReadFromFile",
+      [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(Get("key"), "limepie");
+  ASSERT_EQ(Get("foo"), "barpie");
+  ASSERT_EQ(num_non_prefetch_reads, 0);
+
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadMerge) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.blob_compaction_readahead_size = 1 << 10;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.disable_auto_compactions = true;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("key", "lime"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Merge("key", "pie"));
+  ASSERT_OK(Merge("foo", "baz"));
+  ASSERT_OK(Flush());
+
+  size_t num_non_prefetch_reads = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileReader::GetBlob:ReadFromFile",
+      [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(Get("key"), "lime,pie");
+  ASSERT_EQ(Get("foo"), "bar,baz");
+  ASSERT_EQ(num_non_prefetch_reads, 0);
+
+  Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionDoNotFillCache) {
+  Options options = GetDefaultOptions();
+
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+
+  LRUCacheOptions cache_options;
+  cache_options.capacity = 1 << 20;
+  cache_options.metadata_charge_policy = kDontChargeCacheMetadata;
+
+  options.blob_cache = NewLRUCache(cache_options);
+
+  Reopen(options);
+
+  ASSERT_OK(Put("key", "lime"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("key", "pie"));
+  ASSERT_OK(Put("foo", "baz"));
+  ASSERT_OK(Flush());
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  ASSERT_EQ(options.statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+
+  Close();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_corruption_test.cc b/src/rocksdb/db/blob/db_blob_corruption_test.cc
new file mode 100644
index 000000000..7ac7ce3fc
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_corruption_test.cc
@@ -0,0 +1,82 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobCorruptionTest : public DBTestBase {
+ protected:
+  DBBlobCorruptionTest()
+      : DBTestBase("db_blob_corruption_test", /* env_do_fsync */ false) {}
+
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    uint64_t number;
+    FileType type;
+    std::string fname;
+    uint64_t picked_number = kInvalidBlobFileNumber;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) && type == filetype &&
+          number > picked_number) {  // Pick latest file
+        fname = dbname_ + "/" + filenames[i];
+        picked_number = number;
+      }
+    }
+    ASSERT_TRUE(!fname.empty()) << filetype;
+    ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt));
+  }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+  options.file_checksum_gen_factory =
+      ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory();
+  Reopen(options);
+
+  ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1")));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2")));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+  Close();
+
+  Corrupt(kBlobFile, 0, 2);
+
+  ASSERT_OK(TryReopen(options));
+
+  int count{0};
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) {
+        const Status* s = static_cast<Status*>(arg);
+        ASSERT_NE(s, nullptr);
+        ++count;
+        ASSERT_NOK(*s);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption());
+  ASSERT_EQ(1, count);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // !ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_index_test.cc b/src/rocksdb/db/blob/db_blob_index_test.cc
new file mode 100644
index 000000000..64c550894
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_index_test.cc
@@ -0,0 +1,602 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kTypeBlobIndex is a value type used by BlobDB only. The base rocksdb
+// should accept the value type on write, and report not supported value
+// for reads, unless caller request for it explicitly. The base rocksdb
+// doesn't understand format of actual blob index (the value).
+class DBBlobIndexTest : public DBTestBase {
+ public:
+  enum Tier {
+    kMemtable = 0,
+    kImmutableMemtables = 1,
+    kL0SstFile = 2,
+    kLnSstFile = 3,
+  };
+  const std::vector<Tier> kAllTiers = {Tier::kMemtable,
+                                       Tier::kImmutableMemtables,
+                                       Tier::kL0SstFile, Tier::kLnSstFile};
+
+  DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {}
+
+  ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); }
+
+  ColumnFamilyData* cfd() {
+    return static_cast_with_check<ColumnFamilyHandleImpl>(cfh())->cfd();
+  }
+
+  Status PutBlobIndex(WriteBatch* batch, const Slice& key,
+                      const Slice& blob_index) {
+    return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key,
+                                            blob_index);
+  }
+
+  Status Write(WriteBatch* batch) {
+    return dbfull()->Write(WriteOptions(), batch);
+  }
+
+  std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr,
+                      const Snapshot* snapshot = nullptr) {
+    ReadOptions read_options;
+    read_options.snapshot = snapshot;
+    PinnableSlice value;
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = cfh();
+    get_impl_options.value = &value;
+    get_impl_options.is_blob_index = is_blob_index;
+    auto s = dbfull()->GetImpl(read_options, key, get_impl_options);
+    if (s.IsNotFound()) {
+      return "NOT_FOUND";
+    }
+    if (s.IsCorruption()) {
+      return "CORRUPTION";
+    }
+    if (s.IsNotSupported()) {
+      return "NOT_SUPPORTED";
+    }
+    if (!s.ok()) {
+      return s.ToString();
+    }
+    return value.ToString();
+  }
+
+  std::string GetBlobIndex(const Slice& key,
+                           const Snapshot* snapshot = nullptr) {
+    bool is_blob_index = false;
+    std::string value = GetImpl(key, &is_blob_index, snapshot);
+    if (!is_blob_index) {
+      return "NOT_BLOB";
+    }
+    return value;
+  }
+
+  ArenaWrappedDBIter* GetBlobIterator() {
+    return dbfull()->NewIteratorImpl(
+        ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(),
+        nullptr /*read_callback*/, true /*expose_blob_index*/);
+  }
+
+  Options GetTestOptions() {
+    Options options;
+    options.env = CurrentOptions().env;
+    options.create_if_missing = true;
+    options.num_levels = 2;
+    options.disable_auto_compactions = true;
+    // Disable auto flushes.
+    options.max_write_buffer_number = 10;
+    options.min_write_buffer_number_to_merge = 10;
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    return options;
+  }
+
+  void MoveDataTo(Tier tier) {
+    switch (tier) {
+      case Tier::kMemtable:
+        break;
+      case Tier::kImmutableMemtables:
+        ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+        break;
+      case Tier::kL0SstFile:
+        ASSERT_OK(Flush());
+        break;
+      case Tier::kLnSstFile:
+        ASSERT_OK(Flush());
+        ASSERT_OK(Put("a", "dummy"));
+        ASSERT_OK(Put("z", "dummy"));
+        ASSERT_OK(Flush());
+        ASSERT_OK(
+            dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+        ASSERT_EQ("0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+        break;
+    }
+  }
+};
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. We should be able to write kTypeBlobIndex to memtables and
+// SST files.
+TEST_F(DBBlobIndexTest, Write) {
+  for (auto tier : kAllTiers) {
+    DestroyAndReopen(GetTestOptions());
+
+    std::vector<std::pair<std::string, std::string>> key_values;
+
+    constexpr size_t num_key_values = 5;
+
+    key_values.reserve(num_key_values);
+
+    for (size_t i = 1; i <= num_key_values; ++i) {
+      std::string key = "key" + std::to_string(i);
+
+      std::string blob_index;
+      BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210,
+                                  "blob" + std::to_string(i));
+
+      key_values.emplace_back(std::move(key), std::move(blob_index));
+    }
+
+    for (const auto& key_value : key_values) {
+      WriteBatch batch;
+      ASSERT_OK(PutBlobIndex(&batch, key_value.first, key_value.second));
+      ASSERT_OK(Write(&batch));
+    }
+
+    MoveDataTo(tier);
+
+    for (const auto& key_value : key_values) {
+      ASSERT_EQ(GetBlobIndex(key_value.first), key_value.second);
+    }
+  }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should be able to return blob index if is_blob_index is
+// provided, otherwise it should return Status::NotSupported (when reading from
+// memtable) or Status::Corruption (when reading from SST). Reading from SST
+// returns Corruption because we can't differentiate between the application
+// accidentally opening the base DB of a stacked BlobDB and actual corruption
+// when using the integrated BlobDB.
+TEST_F(DBBlobIndexTest, Get) {
+  std::string blob_index;
+  BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+  for (auto tier : kAllTiers) {
+    DestroyAndReopen(GetTestOptions());
+
+    WriteBatch batch;
+    ASSERT_OK(batch.Put("key", "value"));
+    ASSERT_OK(PutBlobIndex(&batch, "blob_key", blob_index));
+    ASSERT_OK(Write(&batch));
+
+    MoveDataTo(tier);
+
+    // Verify normal value
+    bool is_blob_index = false;
+    PinnableSlice value;
+    ASSERT_EQ("value", Get("key"));
+    ASSERT_EQ("value", GetImpl("key"));
+    ASSERT_EQ("value", GetImpl("key", &is_blob_index));
+    ASSERT_FALSE(is_blob_index);
+
+    // Verify blob index
+    if (tier <= kImmutableMemtables) {
+      ASSERT_TRUE(Get("blob_key", &value).IsNotSupported());
+      ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key"));
+    } else {
+      ASSERT_TRUE(Get("blob_key", &value).IsCorruption());
+      ASSERT_EQ("CORRUPTION", GetImpl("blob_key"));
+    }
+    ASSERT_EQ(blob_index, GetImpl("blob_key", &is_blob_index));
+    ASSERT_TRUE(is_blob_index);
+  }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should NOT return Status::NotSupported/Status::Corruption
+// if blob index is updated with a normal value. See the test case above for
+// more details.
+TEST_F(DBBlobIndexTest, Updated) {
+  std::string blob_index;
+  BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+  for (auto tier : kAllTiers) {
+    DestroyAndReopen(GetTestOptions());
+    WriteBatch batch;
+    for (int i = 0; i < 10; i++) {
+      ASSERT_OK(PutBlobIndex(&batch, "key" + std::to_string(i), blob_index));
+    }
+    ASSERT_OK(Write(&batch));
+    // Avoid blob values from being purged.
+    const Snapshot* snapshot = dbfull()->GetSnapshot();
+    ASSERT_OK(Put("key1", "new_value"));
+    ASSERT_OK(Merge("key2", "a"));
+    ASSERT_OK(Merge("key2", "b"));
+    ASSERT_OK(Merge("key2", "c"));
+    ASSERT_OK(Delete("key3"));
+    ASSERT_OK(SingleDelete("key4"));
+    ASSERT_OK(Delete("key5"));
+    ASSERT_OK(Merge("key5", "a"));
+    ASSERT_OK(Merge("key5", "b"));
+    ASSERT_OK(Merge("key5", "c"));
+    ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9"));
+    MoveDataTo(tier);
+    for (int i = 0; i < 10; i++) {
+      ASSERT_EQ(blob_index, GetBlobIndex("key" + std::to_string(i), snapshot));
+    }
+    ASSERT_EQ("new_value", Get("key1"));
+    if (tier <= kImmutableMemtables) {
+      ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2"));
+    } else {
+      ASSERT_EQ("CORRUPTION", GetImpl("key2"));
+    }
+    ASSERT_EQ("NOT_FOUND", Get("key3"));
+    ASSERT_EQ("NOT_FOUND", Get("key4"));
+    ASSERT_EQ("a,b,c", GetImpl("key5"));
+    for (int i = 6; i < 9; i++) {
+      ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
+    }
+    ASSERT_EQ(blob_index, GetBlobIndex("key9"));
+    dbfull()->ReleaseSnapshot(snapshot);
+  }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. When a blob iterator is used, it should set the
+// expose_blob_index flag for the underlying DBIter, and retrieve/return the
+// corresponding blob value. If a regular DBIter is created (i.e.
+// expose_blob_index is not set), it should return Status::Corruption.
+TEST_F(DBBlobIndexTest, Iterate) {
+  const std::vector<std::vector<ValueType>> data = {
+      /*00*/ {kTypeValue},
+      /*01*/ {kTypeBlobIndex},
+      /*02*/ {kTypeValue},
+      /*03*/ {kTypeBlobIndex, kTypeValue},
+      /*04*/ {kTypeValue},
+      /*05*/ {kTypeValue, kTypeBlobIndex},
+      /*06*/ {kTypeValue},
+      /*07*/ {kTypeDeletion, kTypeBlobIndex},
+      /*08*/ {kTypeValue},
+      /*09*/ {kTypeSingleDeletion, kTypeBlobIndex},
+      /*10*/ {kTypeValue},
+      /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex},
+      /*12*/ {kTypeValue},
+      /*13*/
+      {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex},
+      /*14*/ {kTypeValue},
+      /*15*/ {kTypeBlobIndex},
+      /*16*/ {kTypeValue},
+  };
+
+  auto get_key = [](int index) {
+    char buf[20];
+    snprintf(buf, sizeof(buf), "%02d", index);
+    return "key" + std::string(buf);
+  };
+
+  auto get_value = [&](int index, int version) {
+    return get_key(index) + "_value" + std::to_string(version);
+  };
+
+  auto check_iterator = [&](Iterator* iterator, Status::Code expected_status,
+                            const Slice& expected_value) {
+    ASSERT_EQ(expected_status, iterator->status().code());
+    if (expected_status == Status::kOk) {
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ(expected_value, iterator->value());
+    } else {
+      ASSERT_FALSE(iterator->Valid());
+    }
+  };
+
+  auto create_normal_iterator = [&]() -> Iterator* {
+    return dbfull()->NewIterator(ReadOptions());
+  };
+
+  auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); };
+
+  auto check_is_blob = [&](bool is_blob) {
+    return [is_blob](Iterator* iterator) {
+      ASSERT_EQ(is_blob,
+                reinterpret_cast<ArenaWrappedDBIter*>(iterator)->IsBlob());
+    };
+  };
+
+  auto verify = [&](int index, Status::Code expected_status,
+                    const Slice& forward_value, const Slice& backward_value,
+                    std::function<Iterator*()> create_iterator,
+                    std::function<void(Iterator*)> extra_check = nullptr) {
+    // Seek
+    auto* iterator = create_iterator();
+    ASSERT_OK(iterator->status());
+    ASSERT_OK(iterator->Refresh());
+    iterator->Seek(get_key(index));
+    check_iterator(iterator, expected_status, forward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+
+    // Next
+    iterator = create_iterator();
+    ASSERT_OK(iterator->Refresh());
+    iterator->Seek(get_key(index - 1));
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_OK(iterator->status());
+    iterator->Next();
+    check_iterator(iterator, expected_status, forward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+
+    // SeekForPrev
+    iterator = create_iterator();
+    ASSERT_OK(iterator->status());
+    ASSERT_OK(iterator->Refresh());
+    iterator->SeekForPrev(get_key(index));
+    check_iterator(iterator, expected_status, backward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+
+    // Prev
+    iterator = create_iterator();
+    iterator->Seek(get_key(index + 1));
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_OK(iterator->status());
+    iterator->Prev();
+    check_iterator(iterator, expected_status, backward_value);
+    if (extra_check) {
+      extra_check(iterator);
+    }
+    delete iterator;
+  };
+
+  for (auto tier : {Tier::kMemtable} /*kAllTiers*/) {
+    // Avoid values from being purged.
+    std::vector<const Snapshot*> snapshots;
+    DestroyAndReopen(GetTestOptions());
+
+    // fill data
+    for (int i = 0; i < static_cast<int>(data.size()); i++) {
+      for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) {
+        std::string key = get_key(i);
+        std::string value = get_value(i, j);
+        WriteBatch batch;
+        switch (data[i][j]) {
+          case kTypeValue:
+            ASSERT_OK(Put(key, value));
+            break;
+          case kTypeDeletion:
+            ASSERT_OK(Delete(key));
+            break;
+          case kTypeSingleDeletion:
+            ASSERT_OK(SingleDelete(key));
+            break;
+          case kTypeMerge:
+            ASSERT_OK(Merge(key, value));
+            break;
+          case kTypeBlobIndex:
+            ASSERT_OK(PutBlobIndex(&batch, key, value));
+            ASSERT_OK(Write(&batch));
+            break;
+          default:
+            FAIL();
+        };
+      }
+      snapshots.push_back(dbfull()->GetSnapshot());
+    }
+    ASSERT_OK(
+        dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16)));
+    snapshots.push_back(dbfull()->GetSnapshot());
+    MoveDataTo(tier);
+
+    // Normal iterator
+    verify(1, Status::kCorruption, "", "", create_normal_iterator);
+    verify(3, Status::kCorruption, "", "", create_normal_iterator);
+    verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+           create_normal_iterator);
+    verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+           create_normal_iterator);
+    verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+           create_normal_iterator);
+    verify(11, Status::kCorruption, "", "", create_normal_iterator);
+    verify(13, Status::kOk,
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           create_normal_iterator);
+    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+           create_normal_iterator);
+
+    // Iterator with blob support
+    verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+           create_blob_iterator, check_is_blob(false));
+    if (tier <= kImmutableMemtables) {
+      verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+    } else {
+      verify(11, Status::kCorruption, "", "", create_blob_iterator);
+    }
+    verify(13, Status::kOk,
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+           create_blob_iterator, check_is_blob(false));
+
+#ifndef ROCKSDB_LITE
+    // Iterator with blob support and using seek.
+    ASSERT_OK(dbfull()->SetOptions(
+        cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
+    verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+           create_blob_iterator, check_is_blob(true));
+    verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+           create_blob_iterator, check_is_blob(false));
+    if (tier <= kImmutableMemtables) {
+      verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+    } else {
+      verify(11, Status::kCorruption, "", "", create_blob_iterator);
+    }
+    verify(13, Status::kOk,
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+           create_blob_iterator, check_is_blob(false));
+    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+           create_blob_iterator, check_is_blob(false));
+#endif  // !ROCKSDB_LITE
+
+    for (auto* snapshot : snapshots) {
+      dbfull()->ReleaseSnapshot(snapshot);
+    }
+  }
+}
+
+TEST_F(DBBlobIndexTest, IntegratedBlobIterate) {
+  const std::vector<std::vector<std::string>> data = {
+      /*00*/ {"Put"},
+      /*01*/ {"Put", "Merge", "Merge", "Merge"},
+      /*02*/ {"Put"}};
+
+  auto get_key = [](size_t index) { return ("key" + std::to_string(index)); };
+
+  auto get_value = [&](size_t index, size_t version) {
+    return get_key(index) + "_value" + std::to_string(version);
+  };
+
+  auto check_iterator = [&](Iterator* iterator, Status expected_status,
+                            const Slice& expected_value) {
+    ASSERT_EQ(expected_status, iterator->status());
+    if (expected_status.ok()) {
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ(expected_value, iterator->value());
+    } else {
+      ASSERT_FALSE(iterator->Valid());
+    }
+  };
+
+  auto verify = [&](size_t index, Status expected_status,
+                    const Slice& expected_value) {
+    // Seek
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      ASSERT_OK(iterator->status());
+      ASSERT_OK(iterator->Refresh());
+      iterator->Seek(get_key(index));
+      check_iterator(iterator, expected_status, expected_value);
+    }
+    // Next
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      ASSERT_OK(iterator->Refresh());
+      iterator->Seek(get_key(index - 1));
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_OK(iterator->status());
+      iterator->Next();
+      check_iterator(iterator, expected_status, expected_value);
+    }
+    // SeekForPrev
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      ASSERT_OK(iterator->status());
+      ASSERT_OK(iterator->Refresh());
+      iterator->SeekForPrev(get_key(index));
+      check_iterator(iterator, expected_status, expected_value);
+    }
+    // Prev
+    {
+      Iterator* iterator = db_->NewIterator(ReadOptions());
+      std::unique_ptr<Iterator> iterator_guard(iterator);
+      iterator->Seek(get_key(index + 1));
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_OK(iterator->status());
+      iterator->Prev();
+      check_iterator(iterator, expected_status, expected_value);
+    }
+  };
+
+  Options options = GetTestOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  DestroyAndReopen(options);
+
+  // fill data
+  for (size_t i = 0; i < data.size(); i++) {
+    for (size_t j = 0; j < data[i].size(); j++) {
+      std::string key = get_key(i);
+      std::string value = get_value(i, j);
+      if (data[i][j] == "Put") {
+        ASSERT_OK(Put(key, value));
+        ASSERT_OK(Flush());
+      } else if (data[i][j] == "Merge") {
+        ASSERT_OK(Merge(key, value));
+        ASSERT_OK(Flush());
+      }
+    }
+  }
+
+  std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," +
+                               get_value(1, 2) + "," + get_value(1, 3);
+  Status expected_status;
+  verify(1, expected_status, expected_value);
+
+#ifndef ROCKSDB_LITE
+  // Test DBIter::FindValueForCurrentKeyUsingSeek flow.
+  ASSERT_OK(dbfull()->SetOptions(cfh(),
+                                 {{"max_sequential_skip_in_iterations", "0"}}));
+  verify(1, expected_status, expected_value);
+#endif  // !ROCKSDB_LITE
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.cc b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
new file mode 100644
index 000000000..079576f51
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
@@ -0,0 +1,21 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/prefetch_buffer_collection.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer(
+    uint64_t file_number) {
+  auto& prefetch_buffer = prefetch_buffers_[file_number];
+  if (!prefetch_buffer) {
+    prefetch_buffer.reset(
+        new FilePrefetchBuffer(readahead_size_, readahead_size_));
+  }
+
+  return prefetch_buffer.get();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.h b/src/rocksdb/db/blob/prefetch_buffer_collection.h
new file mode 100644
index 000000000..b973eddc0
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.h
@@ -0,0 +1,38 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+
+#include "file/file_prefetch_buffer.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class that owns a collection of FilePrefetchBuffers using the file number
+// as key. Used for implementing compaction readahead for blob files. Designed
+// to be accessed by a single thread only: every (sub)compaction needs its own
+// buffers since they are guaranteed to read different blobs from different
+// positions even when reading the same file.
+class PrefetchBufferCollection {
+ public:
+  explicit PrefetchBufferCollection(uint64_t readahead_size)
+      : readahead_size_(readahead_size) {
+    assert(readahead_size_ > 0);
+  }
+
+  FilePrefetchBuffer* GetOrCreatePrefetchBuffer(uint64_t file_number);
+
+ private:
+  uint64_t readahead_size_;
+  std::unordered_map<uint64_t, std::unique_ptr<FilePrefetchBuffer>>
+      prefetch_buffers_;  // maps file number to prefetch buffer
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc
new file mode 100644
index 000000000..9283ffd64
--- /dev/null
+++ b/src/rocksdb/db/builder.cc
@@ -0,0 +1,434 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include <algorithm>
+#include <deque>
+#include <vector>
+
+#include "db/blob/blob_file_builder.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/event_helpers.h"
+#include "db/internal_stats.h"
+#include "db/merge_helper.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFactory;
+
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
+                              WritableFileWriter* file) {
+  assert((tboptions.column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+         tboptions.column_family_name.empty());
+  return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file);
+}
+
+Status BuildTable(
+    const std::string& dbname, VersionSet* versions,
+    const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
+    const FileOptions& file_options, TableCache* table_cache,
+    InternalIterator* iter,
+    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+        range_del_iters,
+    FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
+    std::vector<SequenceNumber> snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker,
+    bool paranoid_file_checks, InternalStats* internal_stats,
+    IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
+    BlobFileCreationReason blob_creation_reason,
+    const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger,
+    int job_id, const Env::IOPriority io_priority,
+    TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
+    const std::string* full_history_ts_low,
+    BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries,
+    uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) {
+  assert((tboptions.column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+         tboptions.column_family_name.empty());
+  auto& mutable_cf_options = tboptions.moptions;
+  auto& ioptions = tboptions.ioptions;
+  // Reports the IOStats for flush for every following bytes.
+  const size_t kReportFlushIOStatsEvery = 1048576;
+  OutputValidator output_validator(
+      tboptions.internal_comparator,
+      /*enable_order_check=*/
+      mutable_cf_options.check_flush_compaction_key_order,
+      /*enable_hash=*/paranoid_file_checks);
+  Status s;
+  meta->fd.file_size = 0;
+  iter->SeekToFirst();
+  std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
+      new CompactionRangeDelAggregator(&tboptions.internal_comparator,
+                                       snapshots, full_history_ts_low));
+  uint64_t num_unfragmented_tombstones = 0;
+  uint64_t total_tombstone_payload_bytes = 0;
+  for (auto& range_del_iter : range_del_iters) {
+    num_unfragmented_tombstones +=
+        range_del_iter->num_unfragmented_tombstones();
+    total_tombstone_payload_bytes +=
+        range_del_iter->total_tombstone_payload_bytes();
+    range_del_agg->AddTombstones(std::move(range_del_iter));
+  }
+
+  std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
+                                    meta->fd.GetPathId());
+  std::vector<std::string> blob_file_paths;
+  std::string file_checksum = kUnknownFileChecksum;
+  std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+#ifndef ROCKSDB_LITE
+  EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname,
+                                               tboptions.column_family_name,
+                                               fname, job_id, tboptions.reason);
+#endif  // !ROCKSDB_LITE
+  Env* env = db_options.env;
+  assert(env);
+  FileSystem* fs = db_options.fs.get();
+  assert(fs);
+
+  TableProperties tp;
+  bool table_file_created = false;
+  if (iter->Valid() || !range_del_agg->IsEmpty()) {
+    std::unique_ptr<CompactionFilter> compaction_filter;
+    if (ioptions.compaction_filter_factory != nullptr &&
+        ioptions.compaction_filter_factory->ShouldFilterTableFileCreation(
+            tboptions.reason)) {
+      CompactionFilter::Context context;
+      context.is_full_compaction = false;
+      context.is_manual_compaction = false;
+      context.column_family_id = tboptions.column_family_id;
+      context.reason = tboptions.reason;
+      compaction_filter =
+          ioptions.compaction_filter_factory->CreateCompactionFilter(context);
+      if (compaction_filter != nullptr &&
+          !compaction_filter->IgnoreSnapshots()) {
+        s.PermitUncheckedError();
+        return Status::NotSupported(
+            "CompactionFilter::IgnoreSnapshots() = false is not supported "
+            "anymore.");
+      }
+    }
+
+    TableBuilder* builder;
+    std::unique_ptr<WritableFileWriter> file_writer;
+    {
+      std::unique_ptr<FSWritableFile> file;
+#ifndef NDEBUG
+      bool use_direct_writes = file_options.use_direct_writes;
+      TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
+#endif  // !NDEBUG
+      IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
+      assert(s.ok());
+      s = io_s;
+      if (io_status->ok()) {
+        *io_status = io_s;
+      }
+      if (!s.ok()) {
+        EventHelpers::LogAndNotifyTableFileCreationFinished(
+            event_logger, ioptions.listeners, dbname,
+            tboptions.column_family_name, fname, job_id, meta->fd,
+            kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum,
+            file_checksum_func_name);
+        return s;
+      }
+
+      table_file_created = true;
+      FileTypeSet tmp_set = ioptions.checksum_handoff_file_types;
+      file->SetIOPriority(io_priority);
+      file->SetWriteLifeTimeHint(write_hint);
+      file_writer.reset(new WritableFileWriter(
+          std::move(file), fname, file_options, ioptions.clock, io_tracer,
+          ioptions.stats, ioptions.listeners,
+          ioptions.file_checksum_gen_factory.get(),
+          tmp_set.Contains(FileType::kTableFile), false));
+
+      builder = NewTableBuilder(tboptions, file_writer.get());
+    }
+
+    MergeHelper merge(
+        env, tboptions.internal_comparator.user_comparator(),
+        ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger,
+        true /* internal key corruption is not ok */,
+        snapshots.empty() ? 0 : snapshots.back(), snapshot_checker);
+
+    std::unique_ptr<BlobFileBuilder> blob_file_builder(
+        (mutable_cf_options.enable_blob_files &&
+         tboptions.level_at_creation >=
+             mutable_cf_options.blob_file_starting_level &&
+         blob_file_additions)
+            ? new BlobFileBuilder(
+                  versions, fs, &ioptions, &mutable_cf_options, &file_options,
+                  tboptions.db_id, tboptions.db_session_id, job_id,
+                  tboptions.column_family_id, tboptions.column_family_name,
+                  io_priority, write_hint, io_tracer, blob_callback,
+                  blob_creation_reason, &blob_file_paths, blob_file_additions)
+            : nullptr);
+
+    const std::atomic<bool> kManualCompactionCanceledFalse{false};
+    CompactionIterator c_iter(
+        iter, tboptions.internal_comparator.user_comparator(), &merge,
+        kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot,
+        job_snapshot, snapshot_checker, env,
+        ShouldReportDetailedTime(env, ioptions.stats),
+        true /* internal key corruption is not ok */, range_del_agg.get(),
+        blob_file_builder.get(), ioptions.allow_data_in_errors,
+        ioptions.enforce_single_del_contracts,
+        /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+        /*compaction=*/nullptr, compaction_filter.get(),
+        /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low);
+
+    c_iter.SeekToFirst();
+    for (; c_iter.Valid(); c_iter.Next()) {
+      const Slice& key = c_iter.key();
+      const Slice& value = c_iter.value();
+      const ParsedInternalKey& ikey = c_iter.ikey();
+      // Generate a rolling 64-bit hash of the key and values
+      // Note :
+      // Here "key" integrates 'sequence_number'+'kType'+'user key'.
+      s = output_validator.Add(key, value);
+      if (!s.ok()) {
+        break;
+      }
+      builder->Add(key, value);
+
+      s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
+      if (!s.ok()) {
+        break;
+      }
+
+      // TODO(noetzli): Update stats after flush, too.
+      if (io_priority == Env::IO_HIGH &&
+          IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
+        ThreadStatusUtil::SetThreadOperationProperty(
+            ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+      }
+    }
+    if (!s.ok()) {
+      c_iter.status().PermitUncheckedError();
+    } else if (!c_iter.status().ok()) {
+      s = c_iter.status();
+    }
+
+    if (s.ok()) {
+      auto range_del_it = range_del_agg->NewIterator();
+      for (range_del_it->SeekToFirst(); range_del_it->Valid();
+           range_del_it->Next()) {
+        auto tombstone = range_del_it->Tombstone();
+        auto kv = tombstone.Serialize();
+        builder->Add(kv.first.Encode(), kv.second);
+        meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
+                                       tombstone.seq_,
+                                       tboptions.internal_comparator);
+      }
+    }
+
+    TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
+    const bool empty = builder->IsEmpty();
+    if (num_input_entries != nullptr) {
+      *num_input_entries =
+          c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
+    }
+    if (!s.ok() || empty) {
+      builder->Abandon();
+    } else {
+      std::string seqno_time_mapping_str;
+      seqno_to_time_mapping.Encode(
+          seqno_time_mapping_str, meta->fd.smallest_seqno,
+          meta->fd.largest_seqno, meta->file_creation_time);
+      builder->SetSeqnoTimeTableProperties(
+          seqno_time_mapping_str,
+          ioptions.compaction_style == CompactionStyle::kCompactionStyleFIFO
+              ? meta->file_creation_time
+              : meta->oldest_ancester_time);
+      s = builder->Finish();
+    }
+    if (io_status->ok()) {
+      *io_status = builder->io_status();
+    }
+
+    if (s.ok() && !empty) {
+      uint64_t file_size = builder->FileSize();
+      meta->fd.file_size = file_size;
+      meta->marked_for_compaction = builder->NeedCompact();
+      assert(meta->fd.GetFileSize() > 0);
+      tp = builder
+               ->GetTableProperties();  // refresh now that builder is finished
+      if (memtable_payload_bytes != nullptr &&
+          memtable_garbage_bytes != nullptr) {
+        const CompactionIterationStats& ci_stats = c_iter.iter_stats();
+        uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes +
+                                       ci_stats.total_input_raw_value_bytes +
+                                       total_tombstone_payload_bytes;
+        uint64_t total_payload_bytes_written =
+            (tp.raw_key_size + tp.raw_value_size);
+        // Prevent underflow, which may still happen at this point
+        // since we only support inserts, deletes, and deleteRanges.
+        if (total_payload_bytes_written <= total_payload_bytes) {
+          *memtable_payload_bytes = total_payload_bytes;
+          *memtable_garbage_bytes =
+              total_payload_bytes - total_payload_bytes_written;
+        } else {
+          *memtable_payload_bytes = 0;
+          *memtable_garbage_bytes = 0;
+        }
+      }
+      if (table_properties) {
+        *table_properties = tp;
+      }
+    }
+    delete builder;
+
+    // Finish and check for file errors
+    TEST_SYNC_POINT("BuildTable:BeforeSyncTable");
+    if (s.ok() && !empty) {
+      StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS);
+      *io_status = file_writer->Sync(ioptions.use_fsync);
+    }
+    TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile");
+    if (s.ok() && io_status->ok() && !empty) {
+      *io_status = file_writer->Close();
+    }
+    if (s.ok() && io_status->ok() && !empty) {
+      // Add the checksum information to file metadata.
+      meta->file_checksum = file_writer->GetFileChecksum();
+      meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName();
+      file_checksum = meta->file_checksum;
+      file_checksum_func_name = meta->file_checksum_func_name;
+      // Set unique_id only if db_id and db_session_id exist
+      if (!tboptions.db_id.empty() && !tboptions.db_session_id.empty()) {
+        if (!GetSstInternalUniqueId(tboptions.db_id, tboptions.db_session_id,
+                                    meta->fd.GetNumber(), &(meta->unique_id))
+                 .ok()) {
+          // if failed to get unique id, just set it Null
+          meta->unique_id = kNullUniqueId64x2;
+        }
+      }
+    }
+
+    if (s.ok()) {
+      s = *io_status;
+    }
+
+    if (blob_file_builder) {
+      if (s.ok()) {
+        s = blob_file_builder->Finish();
+      } else {
+        blob_file_builder->Abandon(s);
+      }
+      blob_file_builder.reset();
+    }
+
+    // TODO Also check the IO status when create the Iterator.
+
+    TEST_SYNC_POINT("BuildTable:BeforeOutputValidation");
+    if (s.ok() && !empty) {
+      // Verify that the table is usable
+      // We set for_compaction to false and don't OptimizeForCompactionTableRead
+      // here because this is a special case after we finish the table building.
+      // No matter whether use_direct_io_for_flush_and_compaction is true,
+      // the goal is to cache it here for further user reads.
+      ReadOptions read_options;
+      std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
+          read_options, file_options, tboptions.internal_comparator, *meta,
+          nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor,
+          nullptr,
+          (internal_stats == nullptr) ? nullptr
+                                      : internal_stats->GetFileReadHist(0),
+          TableReaderCaller::kFlush, /*arena=*/nullptr,
+          /*skip_filter=*/false, tboptions.level_at_creation,
+          MaxFileSizeForL0MetaPin(mutable_cf_options),
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key*/ nullptr,
+          /*allow_unprepared_value*/ false));
+      s = it->status();
+      if (s.ok() && paranoid_file_checks) {
+        OutputValidator file_validator(tboptions.internal_comparator,
+                                       /*enable_order_check=*/true,
+                                       /*enable_hash=*/true);
+        for (it->SeekToFirst(); it->Valid(); it->Next()) {
+          // Generate a rolling 64-bit hash of the key and values
+          file_validator.Add(it->key(), it->value()).PermitUncheckedError();
+        }
+        s = it->status();
+        if (s.ok() && !output_validator.CompareValidator(file_validator)) {
+          s = Status::Corruption("Paranoid checksums do not match");
+        }
+      }
+    }
+  }
+
+  // Check for input iterator errors
+  if (!iter->status().ok()) {
+    s = iter->status();
+  }
+
+  if (!s.ok() || meta->fd.GetFileSize() == 0) {
+    TEST_SYNC_POINT("BuildTable:BeforeDeleteFile");
+
+    constexpr IODebugContext* dbg = nullptr;
+
+    if (table_file_created) {
+      Status ignored = fs->DeleteFile(fname, IOOptions(), dbg);
+      ignored.PermitUncheckedError();
+    }
+
+    assert(blob_file_additions || blob_file_paths.empty());
+
+    if (blob_file_additions) {
+      for (const std::string& blob_file_path : blob_file_paths) {
+        Status ignored = DeleteDBFile(&db_options, blob_file_path, dbname,
+                                      /*force_bg=*/false, /*force_fg=*/false);
+        ignored.PermitUncheckedError();
+        TEST_SYNC_POINT("BuildTable::AfterDeleteFile");
+      }
+    }
+  }
+
+  Status status_for_listener = s;
+  if (meta->fd.GetFileSize() == 0) {
+    fname = "(nil)";
+    if (s.ok()) {
+      status_for_listener = Status::Aborted("Empty SST file not kept");
+    }
+  }
+  // Output to event logger and fire events.
+  EventHelpers::LogAndNotifyTableFileCreationFinished(
+      event_logger, ioptions.listeners, dbname, tboptions.column_family_name,
+      fname, job_id, meta->fd, meta->oldest_blob_file_number, tp,
+      tboptions.reason, status_for_listener, file_checksum,
+      file_checksum_func_name);
+
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/builder.h b/src/rocksdb/db/builder.h
new file mode 100644
index 000000000..a028fd2ba
--- /dev/null
+++ b/src/rocksdb/db/builder.h
@@ -0,0 +1,77 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/range_tombstone_fragmenter.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/table_properties_collector.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+#include "table/scoped_arena_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FileMetaData;
+
+class VersionSet;
+class BlobFileAddition;
+class SnapshotChecker;
+class TableCache;
+class TableBuilder;
+class WritableFileWriter;
+class InternalStats;
+class BlobFileCompletionCallback;
+
+// Convenience function for NewTableBuilder on the embedded table_factory.
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
+                              WritableFileWriter* file);
+
+// Build a Table file from the contents of *iter.  The generated file
+// will be named according to number specified in meta. On success, the rest of
+// *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+//
+// @param column_family_name Name of the column family that is also identified
+//    by column_family_id, or empty string if unknown.
+extern Status BuildTable(
+    const std::string& dbname, VersionSet* versions,
+    const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
+    const FileOptions& file_options, TableCache* table_cache,
+    InternalIterator* iter,
+    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+        range_del_iters,
+    FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
+    std::vector<SequenceNumber> snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker,
+    bool paranoid_file_checks, InternalStats* internal_stats,
+    IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
+    BlobFileCreationReason blob_creation_reason,
+    const SeqnoToTimeMapping& seqno_to_time_mapping,
+    EventLogger* event_logger = nullptr, int job_id = 0,
+    const Env::IOPriority io_priority = Env::IO_HIGH,
+    TableProperties* table_properties = nullptr,
+    Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
+    const std::string* full_history_ts_low = nullptr,
+    BlobFileCompletionCallback* blob_callback = nullptr,
+    uint64_t* num_input_entries = nullptr,
+    uint64_t* memtable_payload_bytes = nullptr,
+    uint64_t* memtable_garbage_bytes = nullptr);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/c.cc b/src/rocksdb/db/c.cc
new file mode 100644
index 000000000..a7e4360c6
--- /dev/null
+++ b/src/rocksdb/db/c.cc
@@ -0,0 +1,6390 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/c.h"
+
+#include <cstdlib>
+#include <map>
+#include <unordered_set>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_batch.h"
+#include "utilities/merge_operators.h"
+
+using ROCKSDB_NAMESPACE::BackupEngine;
+using ROCKSDB_NAMESPACE::BackupEngineOptions;
+using ROCKSDB_NAMESPACE::BackupID;
+using ROCKSDB_NAMESPACE::BackupInfo;
+using ROCKSDB_NAMESPACE::BatchResult;
+using ROCKSDB_NAMESPACE::BlockBasedTableOptions;
+using ROCKSDB_NAMESPACE::BottommostLevelCompaction;
+using ROCKSDB_NAMESPACE::BytewiseComparator;
+using ROCKSDB_NAMESPACE::Cache;
+using ROCKSDB_NAMESPACE::Checkpoint;
+using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
+using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
+using ROCKSDB_NAMESPACE::ColumnFamilyMetaData;
+using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionFilterFactory;
+using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::Comparator;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::CuckooTableOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DBOptions;
+using ROCKSDB_NAMESPACE::DbPath;
+using ROCKSDB_NAMESPACE::Env;
+using ROCKSDB_NAMESPACE::EnvOptions;
+using ROCKSDB_NAMESPACE::FileLock;
+using ROCKSDB_NAMESPACE::FilterPolicy;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::InfoLogLevel;
+using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::LevelMetaData;
+using ROCKSDB_NAMESPACE::LiveFileMetaData;
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::LRUCacheOptions;
+using ROCKSDB_NAMESPACE::MemoryAllocator;
+using ROCKSDB_NAMESPACE::MemoryUtil;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
+using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory;
+using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
+using ROCKSDB_NAMESPACE::NewLRUCache;
+using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy;
+using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
+using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::PerfContext;
+using ROCKSDB_NAMESPACE::PerfLevel;
+using ROCKSDB_NAMESPACE::PinnableSlice;
+using ROCKSDB_NAMESPACE::PrepopulateBlobCache;
+using ROCKSDB_NAMESPACE::RandomAccessFile;
+using ROCKSDB_NAMESPACE::Range;
+using ROCKSDB_NAMESPACE::RateLimiter;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::RestoreOptions;
+using ROCKSDB_NAMESPACE::SequentialFile;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::SliceParts;
+using ROCKSDB_NAMESPACE::SliceTransform;
+using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::SstFileMetaData;
+using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory;
+using ROCKSDB_NAMESPACE::Transaction;
+using ROCKSDB_NAMESPACE::TransactionDB;
+using ROCKSDB_NAMESPACE::TransactionDBOptions;
+using ROCKSDB_NAMESPACE::TransactionLogIterator;
+using ROCKSDB_NAMESPACE::TransactionOptions;
+using ROCKSDB_NAMESPACE::WALRecoveryMode;
+using ROCKSDB_NAMESPACE::WritableFile;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+using std::unordered_set;
+using std::vector;
+
+extern "C" {
+
+struct rocksdb_t {
+  DB* rep;
+};
+struct rocksdb_backup_engine_t {
+  BackupEngine* rep;
+};
+struct rocksdb_backup_engine_info_t {
+  std::vector<BackupInfo> rep;
+};
+struct rocksdb_restore_options_t {
+  RestoreOptions rep;
+};
+struct rocksdb_iterator_t {
+  Iterator* rep;
+};
+struct rocksdb_writebatch_t {
+  WriteBatch rep;
+};
+struct rocksdb_writebatch_wi_t {
+  WriteBatchWithIndex* rep;
+};
+struct rocksdb_snapshot_t {
+  const Snapshot* rep;
+};
+struct rocksdb_flushoptions_t {
+  FlushOptions rep;
+};
+struct rocksdb_fifo_compaction_options_t {
+  CompactionOptionsFIFO rep;
+};
+struct rocksdb_readoptions_t {
+  ReadOptions rep;
+  // stack variables to set pointers to in ReadOptions
+  Slice upper_bound;
+  Slice lower_bound;
+  Slice timestamp;
+  Slice iter_start_ts;
+};
+struct rocksdb_writeoptions_t {
+  WriteOptions rep;
+};
+struct rocksdb_options_t {
+  Options rep;
+};
+struct rocksdb_compactoptions_t {
+  CompactRangeOptions rep;
+  Slice full_history_ts_low;
+};
+struct rocksdb_block_based_table_options_t {
+  BlockBasedTableOptions rep;
+};
+struct rocksdb_cuckoo_table_options_t {
+  CuckooTableOptions rep;
+};
+struct rocksdb_seqfile_t {
+  SequentialFile* rep;
+};
+struct rocksdb_randomfile_t {
+  RandomAccessFile* rep;
+};
+struct rocksdb_writablefile_t {
+  WritableFile* rep;
+};
+struct rocksdb_wal_iterator_t {
+  TransactionLogIterator* rep;
+};
+struct rocksdb_wal_readoptions_t {
+  TransactionLogIterator::ReadOptions rep;
+};
+struct rocksdb_filelock_t {
+  FileLock* rep;
+};
+struct rocksdb_logger_t {
+  std::shared_ptr<Logger> rep;
+};
+struct rocksdb_lru_cache_options_t {
+  LRUCacheOptions rep;
+};
+struct rocksdb_memory_allocator_t {
+  std::shared_ptr<MemoryAllocator> rep;
+};
+struct rocksdb_cache_t {
+  std::shared_ptr<Cache> rep;
+};
+struct rocksdb_livefiles_t {
+  std::vector<LiveFileMetaData> rep;
+};
+struct rocksdb_column_family_handle_t {
+  ColumnFamilyHandle* rep;
+};
+struct rocksdb_column_family_metadata_t {
+  ColumnFamilyMetaData rep;
+};
+struct rocksdb_level_metadata_t {
+  const LevelMetaData* rep;
+};
+struct rocksdb_sst_file_metadata_t {
+  const SstFileMetaData* rep;
+};
+struct rocksdb_envoptions_t {
+  EnvOptions rep;
+};
+struct rocksdb_ingestexternalfileoptions_t {
+  IngestExternalFileOptions rep;
+};
+struct rocksdb_sstfilewriter_t {
+  SstFileWriter* rep;
+};
+struct rocksdb_ratelimiter_t {
+  std::shared_ptr<RateLimiter> rep;
+};
+struct rocksdb_perfcontext_t {
+  PerfContext* rep;
+};
+struct rocksdb_pinnableslice_t {
+  PinnableSlice rep;
+};
+struct rocksdb_transactiondb_options_t {
+  TransactionDBOptions rep;
+};
+struct rocksdb_transactiondb_t {
+  TransactionDB* rep;
+};
+struct rocksdb_transaction_options_t {
+  TransactionOptions rep;
+};
+struct rocksdb_transaction_t {
+  Transaction* rep;
+};
+struct rocksdb_backup_engine_options_t {
+  BackupEngineOptions rep;
+};
+struct rocksdb_checkpoint_t {
+  Checkpoint* rep;
+};
+struct rocksdb_optimistictransactiondb_t {
+  OptimisticTransactionDB* rep;
+};
+struct rocksdb_optimistictransaction_options_t {
+  OptimisticTransactionOptions rep;
+};
+
+struct rocksdb_compactionfiltercontext_t {
+  CompactionFilter::Context rep;
+};
+
+struct rocksdb_compactionfilter_t : public CompactionFilter {
+  void* state_;
+  void (*destructor_)(void*);
+  unsigned char (*filter_)(void*, int level, const char* key, size_t key_length,
+                           const char* existing_value, size_t value_length,
+                           char** new_value, size_t* new_value_length,
+                           unsigned char* value_changed);
+  const char* (*name_)(void*);
+  unsigned char ignore_snapshots_;
+
+  ~rocksdb_compactionfilter_t() override { (*destructor_)(state_); }
+
+  bool Filter(int level, const Slice& key, const Slice& existing_value,
+              std::string* new_value, bool* value_changed) const override {
+    char* c_new_value = nullptr;
+    size_t new_value_length = 0;
+    unsigned char c_value_changed = 0;
+    unsigned char result =
+        (*filter_)(state_, level, key.data(), key.size(), existing_value.data(),
+                   existing_value.size(), &c_new_value, &new_value_length,
+                   &c_value_changed);
+    if (c_value_changed) {
+      new_value->assign(c_new_value, new_value_length);
+      *value_changed = true;
+    }
+    return result;
+  }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  bool IgnoreSnapshots() const override { return ignore_snapshots_; }
+};
+
+struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
+  void* state_;
+  void (*destructor_)(void*);
+  rocksdb_compactionfilter_t* (*create_compaction_filter_)(
+      void*, rocksdb_compactionfiltercontext_t* context);
+  const char* (*name_)(void*);
+
+  ~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    rocksdb_compactionfiltercontext_t ccontext;
+    ccontext.rep = context;
+    CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext);
+    return std::unique_ptr<CompactionFilter>(cf);
+  }
+
+  const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_comparator_t : public Comparator {
+  void* state_;
+  void (*destructor_)(void*);
+  int (*compare_)(void*, const char* a, size_t alen, const char* b,
+                  size_t blen);
+  const char* (*name_)(void*);
+  int (*compare_ts_)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
+                     size_t b_tslen);
+  int (*compare_without_ts_)(void*, const char* a, size_t alen,
+                             unsigned char a_has_ts, const char* b, size_t blen,
+                             unsigned char b_has_ts);
+
+  rocksdb_comparator_t() : Comparator() {}
+
+  rocksdb_comparator_t(size_t ts_size) : Comparator(ts_size) {}
+
+  ~rocksdb_comparator_t() override { (*destructor_)(state_); }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+  }
+
+  int CompareTimestamp(const Slice& a_ts, const Slice& b_ts) const override {
+    if (compare_ts_ == nullptr) {
+      return 0;
+    }
+    return (*compare_ts_)(state_, a_ts.data(), a_ts.size(), b_ts.data(),
+                          b_ts.size());
+  }
+
+  int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+                              bool b_has_ts) const override {
+    if (compare_without_ts_ == nullptr) {
+      return Compare(a, b);
+    }
+    return (*compare_without_ts_)(state_, a.data(), a.size(), a_has_ts,
+                                  b.data(), b.size(), b_has_ts);
+  }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  // No-ops since the C binding does not support key shortening methods.
+  void FindShortestSeparator(std::string*, const Slice&) const override {}
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+
+  ~rocksdb_filterpolicy_t() override { (*destructor_)(state_); }
+
+  const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_mergeoperator_t : public MergeOperator {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*full_merge_)(void*, const char* key, size_t key_length,
+                       const char* existing_value, size_t existing_value_length,
+                       const char* const* operands_list,
+                       const size_t* operands_list_length, int num_operands,
+                       unsigned char* success, size_t* new_value_length);
+  char* (*partial_merge_)(void*, const char* key, size_t key_length,
+                          const char* const* operands_list,
+                          const size_t* operands_list_length, int num_operands,
+                          unsigned char* success, size_t* new_value_length);
+  void (*delete_value_)(void*, const char* value, size_t value_length);
+
+  ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    size_t n = merge_in.operand_list.size();
+    std::vector<const char*> operand_pointers(n);
+    std::vector<size_t> operand_sizes(n);
+    for (size_t i = 0; i < n; i++) {
+      Slice operand(merge_in.operand_list[i]);
+      operand_pointers[i] = operand.data();
+      operand_sizes[i] = operand.size();
+    }
+
+    const char* existing_value_data = nullptr;
+    size_t existing_value_len = 0;
+    if (merge_in.existing_value != nullptr) {
+      existing_value_data = merge_in.existing_value->data();
+      existing_value_len = merge_in.existing_value->size();
+    }
+
+    unsigned char success;
+    size_t new_value_len;
+    char* tmp_new_value = (*full_merge_)(
+        state_, merge_in.key.data(), merge_in.key.size(), existing_value_data,
+        existing_value_len, &operand_pointers[0], &operand_sizes[0],
+        static_cast<int>(n), &success, &new_value_len);
+    merge_out->new_value.assign(tmp_new_value, new_value_len);
+
+    if (delete_value_ != nullptr) {
+      (*delete_value_)(state_, tmp_new_value, new_value_len);
+    } else {
+      free(tmp_new_value);
+    }
+
+    return success;
+  }
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value,
+                         Logger* /*logger*/) const override {
+    size_t operand_count = operand_list.size();
+    std::vector<const char*> operand_pointers(operand_count);
+    std::vector<size_t> operand_sizes(operand_count);
+    for (size_t i = 0; i < operand_count; ++i) {
+      Slice operand(operand_list[i]);
+      operand_pointers[i] = operand.data();
+      operand_sizes[i] = operand.size();
+    }
+
+    unsigned char success;
+    size_t new_value_len;
+    char* tmp_new_value = (*partial_merge_)(
+        state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
+        static_cast<int>(operand_count), &success, &new_value_len);
+    new_value->assign(tmp_new_value, new_value_len);
+
+    if (delete_value_ != nullptr) {
+      (*delete_value_)(state_, tmp_new_value, new_value_len);
+    } else {
+      free(tmp_new_value);
+    }
+
+    return success;
+  }
+};
+
+struct rocksdb_dbpath_t {
+  DbPath rep;
+};
+
+struct rocksdb_env_t {
+  Env* rep;
+  bool is_default;
+};
+
+struct rocksdb_slicetransform_t : public SliceTransform {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*transform_)(void*, const char* key, size_t length,
+                      size_t* dst_length);
+  unsigned char (*in_domain_)(void*, const char* key, size_t length);
+  unsigned char (*in_range_)(void*, const char* key, size_t length);
+
+  ~rocksdb_slicetransform_t() override { (*destructor_)(state_); }
+
+  const char* Name() const override { return (*name_)(state_); }
+
+  Slice Transform(const Slice& src) const override {
+    size_t len;
+    char* dst = (*transform_)(state_, src.data(), src.size(), &len);
+    return Slice(dst, len);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    return (*in_domain_)(state_, src.data(), src.size());
+  }
+
+  bool InRange(const Slice& src) const override {
+    return (*in_range_)(state_, src.data(), src.size());
+  }
+};
+
+struct rocksdb_universal_compaction_options_t {
+  ROCKSDB_NAMESPACE::CompactionOptionsUniversal* rep;
+};
+
+static bool SaveError(char** errptr, const Status& s) {
+  assert(errptr != nullptr);
+  if (s.ok()) {
+    return false;
+  } else if (*errptr == nullptr) {
+    *errptr = strdup(s.ToString().c_str());
+  } else {
+    // TODO(sanjay): Merge with existing error?
+    // This is a bug if *errptr is not created by malloc()
+    free(*errptr);
+    *errptr = strdup(s.ToString().c_str());
+  }
+  return true;
+}
+
+static char* CopyString(const std::string& str) {
+  char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+  memcpy(result, str.data(), sizeof(char) * str.size());
+  return result;
+}
+
+rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name,
+                        char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_with_ttl(const rocksdb_options_t* options,
+                                 const char* name, int ttl, char** errptr) {
+  ROCKSDB_NAMESPACE::DBWithTTL* db;
+  if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+                            options->rep, std::string(name), &db, ttl))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options,
+                                      const char* name,
+                                      unsigned char error_if_wal_file_exists,
+                                      char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name),
+                                            &db, error_if_wal_file_exists))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options,
+                                     const char* name,
+                                     const char* secondary_path,
+                                     char** errptr) {
+  DB* db;
+  if (SaveError(errptr,
+                DB::OpenAsSecondary(options->rep, std::string(name),
+                                    std::string(secondary_path), &db))) {
+    return nullptr;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+    const rocksdb_options_t* options, const char* path, char** errptr) {
+  BackupEngine* be;
+  if (SaveError(errptr, BackupEngine::Open(
+                            options->rep.env,
+                            BackupEngineOptions(path, nullptr, true,
+                                                options->rep.info_log.get()),
+                            &be))) {
+    return nullptr;
+  }
+  rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+  result->rep = be;
+  return result;
+}
+
+rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts(
+    const rocksdb_backup_engine_options_t* options, rocksdb_env_t* env,
+    char** errptr) {
+  BackupEngine* be;
+  if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) {
+    return nullptr;
+  }
+  rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+  result->rep = be;
+  return result;
+}
+
+void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
+                                             rocksdb_t* db, char** errptr) {
+  SaveError(errptr, be->rep->CreateNewBackup(db->rep));
+}
+
+void rocksdb_backup_engine_create_new_backup_flush(
+    rocksdb_backup_engine_t* be, rocksdb_t* db,
+    unsigned char flush_before_backup, char** errptr) {
+  SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup));
+}
+
+void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be,
+                                             uint32_t num_backups_to_keep,
+                                             char** errptr) {
+  SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep));
+}
+
+rocksdb_restore_options_t* rocksdb_restore_options_create() {
+  return new rocksdb_restore_options_t;
+}
+
+void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
+                                                int v) {
+  opt->rep.keep_log_files = v;
+}
+
+void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
+                                         uint32_t backup_id, char** errptr) {
+  SaveError(errptr, be->rep->VerifyBackup(static_cast<BackupID>(backup_id)));
+}
+
+void rocksdb_backup_engine_restore_db_from_latest_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, char** errptr) {
+  SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
+                                                       std::string(wal_dir),
+                                                       restore_options->rep));
+}
+
+void rocksdb_backup_engine_restore_db_from_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, const uint32_t backup_id,
+    char** errptr) {
+  SaveError(errptr, be->rep->RestoreDBFromBackup(backup_id, std::string(db_dir),
+                                                 std::string(wal_dir),
+                                                 restore_options->rep));
+}
+
+const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+    rocksdb_backup_engine_t* be) {
+  rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
+  be->rep->GetBackupInfo(&result->rep);
+  return result;
+}
+
+int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
+  return static_cast<int>(info->rep.size());
+}
+
+int64_t rocksdb_backup_engine_info_timestamp(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].timestamp;
+}
+
+uint32_t rocksdb_backup_engine_info_backup_id(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].backup_id;
+}
+
+uint64_t rocksdb_backup_engine_info_size(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].size;
+}
+
+uint32_t rocksdb_backup_engine_info_number_files(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].number_files;
+}
+
+void rocksdb_backup_engine_info_destroy(
+    const rocksdb_backup_engine_info_t* info) {
+  delete info;
+}
+
+void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
+  delete be->rep;
+  delete be;
+}
+
+rocksdb_backup_engine_options_t* rocksdb_backup_engine_options_create(
+    const char* backup_dir) {
+  return new rocksdb_backup_engine_options_t{
+      BackupEngineOptions(std::string(backup_dir))};
+}
+
+void rocksdb_backup_engine_options_set_backup_dir(
+    rocksdb_backup_engine_options_t* options, const char* backup_dir) {
+  options->rep.backup_dir = std::string(backup_dir);
+}
+
+void rocksdb_backup_engine_options_set_env(
+    rocksdb_backup_engine_options_t* options, rocksdb_env_t* env) {
+  options->rep.backup_env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_backup_engine_options_set_share_table_files(
+    rocksdb_backup_engine_options_t* options, unsigned char val) {
+  options->rep.share_table_files = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_share_table_files(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.share_table_files;
+}
+
+void rocksdb_backup_engine_options_set_sync(
+    rocksdb_backup_engine_options_t* options, unsigned char val) {
+  options->rep.sync = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_sync(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.sync;
+}
+
+void rocksdb_backup_engine_options_set_destroy_old_data(
+    rocksdb_backup_engine_options_t* options, unsigned char val) {
+  options->rep.destroy_old_data = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_destroy_old_data(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.destroy_old_data;
+}
+
+void rocksdb_backup_engine_options_set_backup_log_files(
+    rocksdb_backup_engine_options_t* options, unsigned char val) {
+  options->rep.backup_log_files = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_backup_log_files(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.backup_log_files;
+}
+
+void rocksdb_backup_engine_options_set_backup_rate_limit(
+    rocksdb_backup_engine_options_t* options, uint64_t limit) {
+  options->rep.backup_rate_limit = limit;
+}
+
+uint64_t rocksdb_backup_engine_options_get_backup_rate_limit(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.backup_rate_limit;
+}
+
+void rocksdb_backup_engine_options_set_restore_rate_limit(
+    rocksdb_backup_engine_options_t* options, uint64_t limit) {
+  options->rep.restore_rate_limit = limit;
+}
+
+uint64_t rocksdb_backup_engine_options_get_restore_rate_limit(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.restore_rate_limit;
+}
+
+void rocksdb_backup_engine_options_set_max_background_operations(
+    rocksdb_backup_engine_options_t* options, int val) {
+  options->rep.max_background_operations = val;
+}
+
+int rocksdb_backup_engine_options_get_max_background_operations(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.max_background_operations;
+}
+
+void rocksdb_backup_engine_options_set_callback_trigger_interval_size(
+    rocksdb_backup_engine_options_t* options, uint64_t size) {
+  options->rep.callback_trigger_interval_size = size;
+}
+
+uint64_t rocksdb_backup_engine_options_get_callback_trigger_interval_size(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.callback_trigger_interval_size;
+}
+
+void rocksdb_backup_engine_options_set_max_valid_backups_to_open(
+    rocksdb_backup_engine_options_t* options, int val) {
+  options->rep.max_valid_backups_to_open = val;
+}
+
+int rocksdb_backup_engine_options_get_max_valid_backups_to_open(
+    rocksdb_backup_engine_options_t* options) {
+  return options->rep.max_valid_backups_to_open;
+}
+
+void rocksdb_backup_engine_options_set_share_files_with_checksum_naming(
+    rocksdb_backup_engine_options_t* options, int val) {
+  options->rep.share_files_with_checksum_naming =
+      static_cast<BackupEngineOptions::ShareFilesNaming>(val);
+}
+
+int rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+    rocksdb_backup_engine_options_t* options) {
+  return static_cast<int>(options->rep.share_files_with_checksum_naming);
+}
+
+void rocksdb_backup_engine_options_destroy(
+    rocksdb_backup_engine_options_t* options) {
+  delete options;
+}
+
+rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
+                                                       char** errptr) {
+  Checkpoint* checkpoint;
+  if (SaveError(errptr, Checkpoint::Create(db->rep, &checkpoint))) {
+    return nullptr;
+  }
+  rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+  result->rep = checkpoint;
+  return result;
+}
+
+void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
+                               const char* checkpoint_dir,
+                               uint64_t log_size_for_flush, char** errptr) {
+  SaveError(errptr, checkpoint->rep->CreateCheckpoint(
+                        std::string(checkpoint_dir), log_size_for_flush));
+}
+
+void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
+  delete checkpoint->rep;
+  delete checkpoint;
+}
+
+void rocksdb_close(rocksdb_t* db) {
+  delete db->rep;
+  delete db;
+}
+
+void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
+  opt->rep.merge_operator =
+      ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator();
+}
+
+rocksdb_t* rocksdb_open_and_trim_history(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char* trim_ts,
+    size_t trim_tslen, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  std::string trim_ts_(trim_ts, trim_tslen);
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::OpenAndTrimHistory(
+                            DBOptions(db_options->rep), std::string(name),
+                            column_families, &handles, &db, trim_ts_))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), std::string(name),
+                                 column_families, &handles, &db))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_column_families_with_ttl(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, const int* ttls,
+    char** errptr) {
+  std::vector<int32_t> ttls_vec;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    ttls_vec.push_back(ttls[i]);
+
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  ROCKSDB_NAMESPACE::DBWithTTL* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+                            DBOptions(db_options->rep), std::string(name),
+                            column_families, &handles, &db, ttls_vec))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles,
+    unsigned char error_if_wal_file_exists, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr,
+                DB::OpenForReadOnly(DBOptions(db_options->rep),
+                                    std::string(name), column_families,
+                                    &handles, &db, error_if_wal_file_exists))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    const char* secondary_path, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i != num_column_families; ++i) {
+    column_families.emplace_back(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep));
+  }
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
+                                            std::string(name),
+                                            std::string(secondary_path),
+                                            column_families, &handles, &db))) {
+    return nullptr;
+  }
+  for (size_t i = 0; i != handles.size(); ++i) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_t* result = new rocksdb_t;
+  result->rep = db;
+  return result;
+}
+
+char** rocksdb_list_column_families(const rocksdb_options_t* options,
+                                    const char* name, size_t* lencfs,
+                                    char** errptr) {
+  std::vector<std::string> fams;
+  SaveError(errptr, DB::ListColumnFamilies(DBOptions(options->rep),
+                                           std::string(name), &fams));
+
+  *lencfs = fams.size();
+  char** column_families =
+      static_cast<char**>(malloc(sizeof(char*) * fams.size()));
+  for (size_t i = 0; i < fams.size(); i++) {
+    column_families[i] = strdup(fams[i].c_str());
+  }
+  return column_families;
+}
+
+void rocksdb_list_column_families_destroy(char** list, size_t len) {
+  for (size_t i = 0; i < len; ++i) {
+    free(list[i]);
+  }
+  free(list);
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family(
+    rocksdb_t* db, const rocksdb_options_t* column_family_options,
+    const char* column_family_name, char** errptr) {
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  SaveError(errptr, db->rep->CreateColumnFamily(
+                        ColumnFamilyOptions(column_family_options->rep),
+                        std::string(column_family_name), &(handle->rep)));
+  return handle;
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl(
+    rocksdb_t* db, const rocksdb_options_t* column_family_options,
+    const char* column_family_name, int ttl, char** errptr) {
+  ROCKSDB_NAMESPACE::DBWithTTL* db_with_ttl =
+      static_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(db->rep);
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  SaveError(errptr, db_with_ttl->CreateColumnFamilyWithTtl(
+                        ColumnFamilyOptions(column_family_options->rep),
+                        std::string(column_family_name), &(handle->rep), ttl));
+  return handle;
+}
+
+void rocksdb_drop_column_family(rocksdb_t* db,
+                                rocksdb_column_family_handle_t* handle,
+                                char** errptr) {
+  SaveError(errptr, db->rep->DropColumnFamily(handle->rep));
+}
+
+uint32_t rocksdb_column_family_handle_get_id(
+    rocksdb_column_family_handle_t* handle) {
+  return handle->rep->GetID();
+}
+
+char* rocksdb_column_family_handle_get_name(
+    rocksdb_column_family_handle_t* handle, size_t* name_len) {
+  auto name = handle->rep->GetName();
+  *name_len = name.size();
+  return CopyString(name);
+}
+
+void rocksdb_column_family_handle_destroy(
+    rocksdb_column_family_handle_t* handle) {
+  delete handle->rep;
+  delete handle;
+}
+
+void rocksdb_put(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                 const char* key, size_t keylen, const char* val, size_t vallen,
+                 char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                    rocksdb_column_family_handle_t* column_family,
+                    const char* key, size_t keylen, const char* val,
+                    size_t vallen, char** errptr) {
+  SaveError(errptr, db->rep->Put(options->rep, column_family->rep,
+                                 Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_with_ts(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                         const char* key, size_t keylen, const char* ts,
+                         size_t tslen, const char* val, size_t vallen,
+                         char** errptr) {
+  SaveError(errptr, db->rep->Put(options->rep, Slice(key, keylen),
+                                 Slice(ts, tslen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf_with_ts(rocksdb_t* db,
+                            const rocksdb_writeoptions_t* options,
+                            rocksdb_column_family_handle_t* column_family,
+                            const char* key, size_t keylen, const char* ts,
+                            size_t tslen, const char* val, size_t vallen,
+                            char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, column_family->rep, Slice(key, keylen),
+                         Slice(ts, tslen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                    const char* key, size_t keylen, char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_delete_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                       rocksdb_column_family_handle_t* column_family,
+                       const char* key, size_t keylen, char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+                                    Slice(key, keylen)));
+}
+
+void rocksdb_delete_with_ts(rocksdb_t* db,
+                            const rocksdb_writeoptions_t* options,
+                            const char* key, size_t keylen, const char* ts,
+                            size_t tslen, char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen),
+                                    Slice(ts, tslen)));
+}
+
+void rocksdb_delete_cf_with_ts(rocksdb_t* db,
+                               const rocksdb_writeoptions_t* options,
+                               rocksdb_column_family_handle_t* column_family,
+                               const char* key, size_t keylen, const char* ts,
+                               size_t tslen, char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+                                    Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_singledelete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                          const char* key, size_t keylen, char** errptr) {
+  SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_singledelete_cf(rocksdb_t* db,
+                             const rocksdb_writeoptions_t* options,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* key, size_t keylen, char** errptr) {
+  SaveError(errptr, db->rep->SingleDelete(options->rep, column_family->rep,
+                                          Slice(key, keylen)));
+}
+
+void rocksdb_singledelete_with_ts(rocksdb_t* db,
+                                  const rocksdb_writeoptions_t* options,
+                                  const char* key, size_t keylen,
+                                  const char* ts, size_t tslen, char** errptr) {
+  SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen),
+                                          Slice(ts, tslen)));
+}
+
+void rocksdb_singledelete_cf_with_ts(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* ts, size_t tslen, char** errptr) {
+  SaveError(errptr,
+            db->rep->SingleDelete(options->rep, column_family->rep,
+                                  Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_increase_full_history_ts_low(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* ts_low, size_t ts_lowlen, char** errptr) {
+  std::string ts(ts_low, ts_lowlen);
+  SaveError(errptr, db->rep->IncreaseFullHistoryTsLow(column_family->rep, ts));
+}
+
+char* rocksdb_get_full_history_ts_low(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    size_t* ts_len, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = db->rep->GetFullHistoryTsLow(column_family->rep, &tmp);
+  if (s.ok()) {
+    *ts_len = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *ts_len = 0;
+    SaveError(errptr, s);
+  }
+  return result;
+}
+
+void rocksdb_delete_range_cf(rocksdb_t* db,
+                             const rocksdb_writeoptions_t* options,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* start_key, size_t start_key_len,
+                             const char* end_key, size_t end_key_len,
+                             char** errptr) {
+  SaveError(errptr, db->rep->DeleteRange(options->rep, column_family->rep,
+                                         Slice(start_key, start_key_len),
+                                         Slice(end_key, end_key_len)));
+}
+
+void rocksdb_merge(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                   const char* key, size_t keylen, const char* val,
+                   size_t vallen, char** errptr) {
+  SaveError(errptr, db->rep->Merge(options->rep, Slice(key, keylen),
+                                   Slice(val, vallen)));
+}
+
+void rocksdb_merge_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                      rocksdb_column_family_handle_t* column_family,
+                      const char* key, size_t keylen, const char* val,
+                      size_t vallen, char** errptr) {
+  SaveError(errptr, db->rep->Merge(options->rep, column_family->rep,
+                                   Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_write(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+                   rocksdb_writebatch_t* batch, char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* rocksdb_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                  const char* key, size_t keylen, size_t* vallen,
+                  char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_cf(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                     rocksdb_column_family_handle_t* column_family,
+                     const char* key, size_t keylen, size_t* vallen,
+                     char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s =
+      db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_with_ts(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                          const char* key, size_t keylen, size_t* vallen,
+                          char** ts, size_t* tslen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp_val;
+  std::string tmp_ts;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp_val, &tmp_ts);
+  if (s.ok()) {
+    *vallen = tmp_val.size();
+    result = CopyString(tmp_val);
+    *tslen = tmp_ts.size();
+    *ts = CopyString(tmp_ts);
+  } else {
+    *vallen = 0;
+    *tslen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_get_cf_with_ts(rocksdb_t* db,
+                             const rocksdb_readoptions_t* options,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* key, size_t keylen, size_t* vallen,
+                             char** ts, size_t* tslen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  std::string tmp_ts;
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &tmp, &tmp_ts);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+    *tslen = tmp_ts.size();
+    *ts = CopyString(tmp_ts);
+  } else {
+    *vallen = 0;
+    *tslen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+void rocksdb_multi_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                       size_t num_keys, const char* const* keys_list,
+                       const size_t* keys_list_sizes, char** values_list,
+                       size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_multi_get_with_ts(rocksdb_t* db,
+                               const rocksdb_readoptions_t* options,
+                               size_t num_keys, const char* const* keys_list,
+                               const size_t* keys_list_sizes,
+                               char** values_list, size_t* values_list_sizes,
+                               char** timestamp_list,
+                               size_t* timestamp_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<std::string> timestamps(num_keys);
+  std::vector<Status> statuses =
+      db->rep->MultiGet(options->rep, keys, &values, &timestamps);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      timestamp_list[i] = CopyString(timestamps[i]);
+      timestamp_list_sizes[i] = timestamps[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      timestamp_list[i] = nullptr;
+      timestamp_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses =
+      db->rep->MultiGet(options->rep, cfs, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_multi_get_cf_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** timestamps_list,
+    size_t* timestamps_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<std::string> timestamps(num_keys);
+  std::vector<Status> statuses =
+      db->rep->MultiGet(options->rep, cfs, keys, &values, &timestamps);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      timestamps_list[i] = CopyString(timestamps[i]);
+      timestamps_list_sizes[i] = timestamps[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      timestamps_list[i] = nullptr;
+      timestamps_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_batched_multi_get_cf(rocksdb_t* db,
+                                  const rocksdb_readoptions_t* options,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  size_t num_keys, const char* const* keys_list,
+                                  const size_t* keys_list_sizes,
+                                  rocksdb_pinnableslice_t** values, char** errs,
+                                  const bool sorted_input) {
+  Slice* key_slices = new Slice[num_keys];
+  PinnableSlice* value_slices = new PinnableSlice[num_keys];
+  Status* statuses = new Status[num_keys];
+  for (size_t i = 0; i < num_keys; ++i) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+
+  db->rep->MultiGet(options->rep, column_family->rep, num_keys, key_slices,
+                    value_slices, statuses, sorted_input);
+
+  for (size_t i = 0; i < num_keys; ++i) {
+    if (statuses[i].ok()) {
+      values[i] = new (rocksdb_pinnableslice_t);
+      values[i]->rep = std::move(value_slices[i]);
+      errs[i] = nullptr;
+    } else {
+      values[i] = nullptr;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+
+  delete[] key_slices;
+  delete[] value_slices;
+  delete[] statuses;
+}
+
+unsigned char rocksdb_key_may_exist(rocksdb_t* db,
+                                    const rocksdb_readoptions_t* options,
+                                    const char* key, size_t key_len,
+                                    char** value, size_t* val_len,
+                                    const char* timestamp, size_t timestamp_len,
+                                    unsigned char* value_found) {
+  std::string tmp;
+  std::string time;
+  if (timestamp) {
+    time.assign(timestamp, timestamp_len);
+  }
+  bool found = false;
+  const bool result = db->rep->KeyMayExist(options->rep, Slice(key, key_len),
+                                           &tmp, timestamp ? &time : nullptr,
+                                           value_found ? &found : nullptr);
+  if (value_found) {
+    *value_found = found;
+    if (found) {
+      *val_len = tmp.size();
+      *value = CopyString(tmp);
+    }
+  }
+  return result;
+}
+
+unsigned char rocksdb_key_may_exist_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t key_len, char** value, size_t* val_len, const char* timestamp,
+    size_t timestamp_len, unsigned char* value_found) {
+  std::string tmp;
+  std::string time;
+  if (timestamp) {
+    time.assign(timestamp, timestamp_len);
+  }
+  bool found = false;
+  const bool result = db->rep->KeyMayExist(
+      options->rep, column_family->rep, Slice(key, key_len), &tmp,
+      timestamp ? &time : nullptr, value_found ? &found : nullptr);
+  if (value_found) {
+    *value_found = found;
+    if (found) {
+      *val_len = tmp.size();
+      *value = CopyString(tmp);
+    }
+  }
+  return result;
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db, const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep);
+  return result;
+}
+
+rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+    rocksdb_t* db, uint64_t seq_number,
+    const rocksdb_wal_readoptions_t* options, char** errptr) {
+  std::unique_ptr<TransactionLogIterator> iter;
+  TransactionLogIterator::ReadOptions ro;
+  if (options != nullptr) {
+    ro = options->rep;
+  }
+  if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) {
+    return nullptr;
+  }
+  rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t;
+  result->rep = iter.release();
+  return result;
+}
+
+void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) { iter->rep->Next(); }
+
+unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void rocksdb_wal_iter_status(const rocksdb_wal_iterator_t* iter,
+                             char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+void rocksdb_wal_iter_destroy(const rocksdb_wal_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+    const rocksdb_wal_iterator_t* iter, uint64_t* seq) {
+  rocksdb_writebatch_t* result = rocksdb_writebatch_create();
+  BatchResult wal_batch = iter->rep->GetBatch();
+  result->rep = std::move(*wal_batch.writeBatchPtr);
+  if (seq != nullptr) {
+    *seq = wal_batch.sequence;
+  }
+  return result;
+}
+
+uint64_t rocksdb_get_latest_sequence_number(rocksdb_t* db) {
+  return db->rep->GetLatestSequenceNumber();
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep, column_family->rep);
+  return result;
+}
+
+void rocksdb_create_iterators(rocksdb_t* db, rocksdb_readoptions_t* opts,
+                              rocksdb_column_family_handle_t** column_families,
+                              rocksdb_iterator_t** iterators, size_t size,
+                              char** errptr) {
+  std::vector<ColumnFamilyHandle*> column_families_vec;
+  for (size_t i = 0; i < size; i++) {
+    column_families_vec.push_back(column_families[i]->rep);
+  }
+
+  std::vector<Iterator*> res;
+  Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res);
+  assert(res.size() == size);
+  if (SaveError(errptr, status)) {
+    return;
+  }
+
+  for (size_t i = 0; i < size; i++) {
+    iterators[i] = new rocksdb_iterator_t;
+    iterators[i]->rep = res[i];
+  }
+}
+
+const rocksdb_snapshot_t* rocksdb_create_snapshot(rocksdb_t* db) {
+  rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+  result->rep = db->rep->GetSnapshot();
+  return result;
+}
+
+void rocksdb_release_snapshot(rocksdb_t* db,
+                              const rocksdb_snapshot_t* snapshot) {
+  db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+char* rocksdb_property_value(rocksdb_t* db, const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+int rocksdb_property_int(rocksdb_t* db, const char* propname,
+                         uint64_t* out_val) {
+  if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+int rocksdb_property_int_cf(rocksdb_t* db,
+                            rocksdb_column_family_handle_t* column_family,
+                            const char* propname, uint64_t* out_val) {
+  if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+char* rocksdb_property_value_cf(rocksdb_t* db,
+                                rocksdb_column_family_handle_t* column_family,
+                                const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges,
+                               const char* const* range_start_key,
+                               const size_t* range_start_key_len,
+                               const char* const* range_limit_key,
+                               const size_t* range_limit_key_len,
+                               uint64_t* sizes, char** errptr) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+  if (!s.ok()) {
+    SaveError(errptr, s);
+  }
+  delete[] ranges;
+}
+
+void rocksdb_approximate_sizes_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    int num_ranges, const char* const* range_start_key,
+    const size_t* range_start_key_len, const char* const* range_limit_key,
+    const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  Status s = db->rep->GetApproximateSizes(column_family->rep, ranges,
+                                          num_ranges, sizes);
+  if (!s.ok()) {
+    SaveError(errptr, s);
+  }
+  delete[] ranges;
+}
+
+void rocksdb_delete_file(rocksdb_t* db, const char* name) {
+  db->rep->DeleteFile(name);
+}
+
+const rocksdb_livefiles_t* rocksdb_livefiles(rocksdb_t* db) {
+  rocksdb_livefiles_t* result = new rocksdb_livefiles_t;
+  db->rep->GetLiveFilesMetaData(&result->rep);
+  return result;
+}
+
+void rocksdb_compact_range(rocksdb_t* db, const char* start_key,
+                           size_t start_key_len, const char* limit_key,
+                           size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      CompactRangeOptions(),
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf(rocksdb_t* db,
+                              rocksdb_column_family_handle_t* column_family,
+                              const char* start_key, size_t start_key_len,
+                              const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      CompactRangeOptions(), column_family->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_suggest_compact_range(rocksdb_t* db, const char* start_key,
+                                   size_t start_key_len, const char* limit_key,
+                                   size_t limit_key_len, char** errptr) {
+  Slice a, b;
+  Status s = ROCKSDB_NAMESPACE::experimental::SuggestCompactRange(
+      db->rep,
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+  SaveError(errptr, s);
+}
+
+void rocksdb_suggest_compact_range_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* limit_key,
+    size_t limit_key_len, char** errptr) {
+  Slice a, b;
+  Status s = db->rep->SuggestCompactRange(
+      column_family->rep,
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+  SaveError(errptr, s);
+}
+
+void rocksdb_compact_range_opt(rocksdb_t* db, rocksdb_compactoptions_t* opt,
+                               const char* start_key, size_t start_key_len,
+                               const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      opt->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf_opt(rocksdb_t* db,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  rocksdb_compactoptions_t* opt,
+                                  const char* start_key, size_t start_key_len,
+                                  const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      opt->rep, column_family->rep,
+      // Pass nullptr Slice if corresponding "const char*" is nullptr
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_flush(rocksdb_t* db, const rocksdb_flushoptions_t* options,
+                   char** errptr) {
+  SaveError(errptr, db->rep->Flush(options->rep));
+}
+
+void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options,
+                      rocksdb_column_family_handle_t* column_family,
+                      char** errptr) {
+  SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
+}
+
+void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) {
+  SaveError(errptr, db->rep->FlushWAL(sync));
+}
+
+void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr) {
+  SaveError(errptr, db->rep->DisableFileDeletions());
+}
+
+void rocksdb_enable_file_deletions(rocksdb_t* db, unsigned char force,
+                                   char** errptr) {
+  SaveError(errptr, db->rep->EnableFileDeletions(force));
+}
+
+void rocksdb_destroy_db(const rocksdb_options_t* options, const char* name,
+                        char** errptr) {
+  SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(const rocksdb_options_t* options, const char* name,
+                       char** errptr) {
+  SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+  iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+  iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k,
+                                size_t klen) {
+  iter->rep->SeekForPrev(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) { iter->rep->Next(); }
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) { iter->rep->Prev(); }
+
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+  Slice s = iter->rep->key();
+  *klen = s.size();
+  return s.data();
+}
+
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+  Slice s = iter->rep->value();
+  *vlen = s.size();
+  return s.data();
+}
+
+const char* rocksdb_iter_timestamp(const rocksdb_iterator_t* iter,
+                                   size_t* tslen) {
+  Slice s = iter->rep->timestamp();
+  *tslen = s.size();
+  return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+  return new rocksdb_writebatch_t;
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+                                                     size_t size) {
+  rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
+  b->rep = WriteBatch(std::string(rep, size));
+  return b;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { delete b; }
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { b->rep.Clear(); }
+
+int rocksdb_writebatch_count(rocksdb_writebatch_t* b) { return b->rep.Count(); }
+
+void rocksdb_writebatch_put(rocksdb_writebatch_t* b, const char* key,
+                            size_t klen, const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf(rocksdb_writebatch_t* b,
+                               rocksdb_column_family_handle_t* column_family,
+                               const char* key, size_t klen, const char* val,
+                               size_t vlen) {
+  b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf_with_ts(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen, const char* val,
+    size_t vlen) {
+  b->rep.Put(column_family->rep, Slice(key, klen), Slice(ts, tslen),
+             Slice(val, vlen));
+}
+
+void rocksdb_writebatch_putv(rocksdb_writebatch_t* b, int num_keys,
+                             const char* const* keys_list,
+                             const size_t* keys_list_sizes, int num_values,
+                             const char* const* values_list,
+                             const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
+                                rocksdb_column_family_handle_t* column_family,
+                                int num_keys, const char* const* keys_list,
+                                const size_t* keys_list_sizes, int num_values,
+                                const char* const* values_list,
+                                const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key,
+                              size_t klen, const char* val, size_t vlen) {
+  b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge_cf(rocksdb_writebatch_t* b,
+                                 rocksdb_column_family_handle_t* column_family,
+                                 const char* key, size_t klen, const char* val,
+                                 size_t vlen) {
+  b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_mergev(rocksdb_writebatch_t* b, int num_keys,
+                               const char* const* keys_list,
+                               const size_t* keys_list_sizes, int num_values,
+                               const char* const* values_list,
+                               const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Merge(SliceParts(key_slices.data(), num_keys),
+               SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  int num_keys, const char* const* keys_list,
+                                  const size_t* keys_list_sizes, int num_values,
+                                  const char* const* values_list,
+                                  const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+               SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_delete(rocksdb_writebatch_t* b, const char* key,
+                               size_t klen) {
+  b->rep.Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_singledelete(rocksdb_writebatch_t* b, const char* key,
+                                     size_t klen) {
+  b->rep.SingleDelete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf(rocksdb_writebatch_t* b,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  const char* key, size_t klen) {
+  b->rep.Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf_with_ts(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen) {
+  b->rep.Delete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
+}
+
+void rocksdb_writebatch_singledelete_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep.SingleDelete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_singledelete_cf_with_ts(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen) {
+  b->rep.SingleDelete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
+}
+
+void rocksdb_writebatch_deletev(rocksdb_writebatch_t* b, int num_keys,
+                                const char* const* keys_list,
+                                const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_deletev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
+                                     const char* start_key,
+                                     size_t start_key_len, const char* end_key,
+                                     size_t end_key_len) {
+  b->rep.DeleteRange(Slice(start_key, start_key_len),
+                     Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_range_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len) {
+  b->rep.DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+                     Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
+                                      const char* const* start_keys_list,
+                                      const size_t* start_keys_list_sizes,
+                                      const char* const* end_keys_list,
+                                      const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+                     SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_rangev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep.DeleteRange(column_family->rep,
+                     SliceParts(start_key_slices.data(), num_keys),
+                     SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_put_log_data(rocksdb_writebatch_t* b, const char* blob,
+                                     size_t len) {
+  b->rep.PutLogData(Slice(blob, len));
+}
+
+class H : public WriteBatch::Handler {
+ public:
+  void* state_;
+  void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+  void (*deleted_)(void*, const char* k, size_t klen);
+  void Put(const Slice& key, const Slice& value) override {
+    (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+  }
+  void Delete(const Slice& key) override {
+    (*deleted_)(state_, key.data(), key.size());
+  }
+};
+
+void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state,
+                                void (*put)(void*, const char* k, size_t klen,
+                                            const char* v, size_t vlen),
+                                void (*deleted)(void*, const char* k,
+                                                size_t klen)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+  *size = b->rep.GetDataSize();
+  return b->rep.Data().c_str();
+}
+
+void rocksdb_writebatch_set_save_point(rocksdb_writebatch_t* b) {
+  b->rep.SetSavePoint();
+}
+
+void rocksdb_writebatch_rollback_to_save_point(rocksdb_writebatch_t* b,
+                                               char** errptr) {
+  SaveError(errptr, b->rep.RollbackToSavePoint());
+}
+
+void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) {
+  SaveError(errptr, b->rep.PopSavePoint());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(
+    size_t reserved_bytes, unsigned char overwrite_key) {
+  rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t;
+  b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes,
+                                   overwrite_key);
+  return b;
+}
+
+void rocksdb_writebatch_wi_destroy(rocksdb_writebatch_wi_t* b) {
+  if (b->rep) {
+    delete b->rep;
+  }
+  delete b;
+}
+
+void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t* b) {
+  b->rep->Clear();
+}
+
+int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) {
+  return b->rep->GetWriteBatch()->Count();
+}
+
+void rocksdb_writebatch_wi_put(rocksdb_writebatch_wi_t* b, const char* key,
+                               size_t klen, const char* val, size_t vlen) {
+  b->rep->Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_put_cf(rocksdb_writebatch_wi_t* b,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  const char* key, size_t klen, const char* val,
+                                  size_t vlen) {
+  b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_putv(rocksdb_writebatch_wi_t* b, int num_keys,
+                                const char* const* keys_list,
+                                const size_t* keys_list_sizes, int num_values,
+                                const char* const* values_list,
+                                const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Put(SliceParts(key_slices.data(), num_keys),
+              SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_putv_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+              SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_merge(rocksdb_writebatch_wi_t* b, const char* key,
+                                 size_t klen, const char* val, size_t vlen) {
+  b->rep->Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_merge_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen) {
+  b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_mergev(rocksdb_writebatch_wi_t* b, int num_keys,
+                                  const char* const* keys_list,
+                                  const size_t* keys_list_sizes, int num_values,
+                                  const char* const* values_list,
+                                  const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Merge(SliceParts(key_slices.data(), num_keys),
+                SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_mergev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+                SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t* b, const char* key,
+                                  size_t klen) {
+  b->rep->Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete(rocksdb_writebatch_wi_t* b,
+                                        const char* key, size_t klen) {
+  b->rep->SingleDelete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_delete_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep->Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen) {
+  b->rep->SingleDelete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_deletev(rocksdb_writebatch_wi_t* b, int num_keys,
+                                   const char* const* keys_list,
+                                   const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep->Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_deletev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
+                                        const char* start_key,
+                                        size_t start_key_len,
+                                        const char* end_key,
+                                        size_t end_key_len) {
+  b->rep->DeleteRange(Slice(start_key, start_key_len),
+                      Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_range_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len) {
+  b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+                      Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b,
+                                         int num_keys,
+                                         const char* const* start_keys_list,
+                                         const size_t* start_keys_list_sizes,
+                                         const char* const* end_keys_list,
+                                         const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+                      SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_rangev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes) {
+  std::vector<Slice> start_key_slices(num_keys);
+  std::vector<Slice> end_key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+    end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+  }
+  b->rep->DeleteRange(column_family->rep,
+                      SliceParts(start_key_slices.data(), num_keys),
+                      SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_put_log_data(rocksdb_writebatch_wi_t* b,
+                                        const char* blob, size_t len) {
+  b->rep->PutLogData(Slice(blob, len));
+}
+
+void rocksdb_writebatch_wi_iterate(
+    rocksdb_writebatch_wi_t* b, void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep->GetWriteBatch()->Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b,
+                                       size_t* size) {
+  WriteBatch* wb = b->rep->GetWriteBatch();
+  *size = wb->GetDataSize();
+  return wb->Data().c_str();
+}
+
+void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) {
+  b->rep->SetSavePoint();
+}
+
+void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b,
+                                                  char** errptr) {
+  SaveError(errptr, b->rep->RollbackToSavePoint());
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep);
+  delete base_iterator;
+  return result;
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep =
+      wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep);
+  delete base_iterator;
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch(rocksdb_writebatch_wi_t* wbwi,
+                                           const rocksdb_options_t* options,
+                                           const char* key, size_t keylen,
+                                           size_t* vallen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_cf(
+    rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep,
+                                     Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+    size_t* vallen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep,
+                                          Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = wbwi->rep->GetFromBatchAndDB(
+      db->rep, options->rep, column_family->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+void rocksdb_write_writebatch_wi(rocksdb_t* db,
+                                 const rocksdb_writeoptions_t* options,
+                                 rocksdb_writebatch_wi_t* wbwi, char** errptr) {
+  WriteBatch* wb = wbwi->rep->GetWriteBatch();
+  SaveError(errptr, db->rep->Write(options->rep, wb));
+}
+
+void rocksdb_load_latest_options(
+    const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options,
+    rocksdb_cache_t* cache, rocksdb_options_t** db_options,
+    size_t* num_column_families, char*** list_column_family_names,
+    rocksdb_options_t*** list_column_family_options, char** errptr) {
+  DBOptions db_opt;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  Status s = LoadLatestOptions(std::string(db_path), env->rep, &db_opt,
+                               &cf_descs, ignore_unknown_options, &cache->rep);
+  if (s.ok()) {
+    char** cf_names = (char**)malloc(cf_descs.size() * sizeof(char*));
+    rocksdb_options_t** cf_options = (rocksdb_options_t**)malloc(
+        cf_descs.size() * sizeof(rocksdb_options_t*));
+    for (size_t i = 0; i < cf_descs.size(); ++i) {
+      cf_names[i] = strdup(cf_descs[i].name.c_str());
+      cf_options[i] = new rocksdb_options_t{
+          Options(DBOptions(), std::move(cf_descs[i].options))};
+    }
+    *num_column_families = cf_descs.size();
+    *db_options = new rocksdb_options_t{
+        Options(std::move(db_opt), ColumnFamilyOptions())};
+    *list_column_family_names = cf_names;
+    *list_column_family_options = cf_options;
+  } else {
+    *num_column_families = 0;
+    *db_options = nullptr;
+    *list_column_family_names = nullptr;
+    *list_column_family_options = nullptr;
+    SaveError(errptr, s);
+  }
+}
+
+void rocksdb_load_latest_options_destroy(
+    rocksdb_options_t* db_options, char** list_column_family_names,
+    rocksdb_options_t** list_column_family_options, size_t len) {
+  rocksdb_options_destroy(db_options);
+  if (list_column_family_names) {
+    for (size_t i = 0; i < len; ++i) {
+      free(list_column_family_names[i]);
+    }
+    free(list_column_family_names);
+  }
+  if (list_column_family_options) {
+    for (size_t i = 0; i < len; ++i) {
+      rocksdb_options_destroy(list_column_family_options[i]);
+    }
+    free(list_column_family_options);
+  }
+}
+
+rocksdb_block_based_table_options_t* rocksdb_block_based_options_create() {
+  return new rocksdb_block_based_table_options_t;
+}
+
+void rocksdb_block_based_options_destroy(
+    rocksdb_block_based_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_block_based_options_set_checksum(
+    rocksdb_block_based_table_options_t* opt, char v) {
+  opt->rep.checksum = static_cast<ROCKSDB_NAMESPACE::ChecksumType>(v);
+}
+
+void rocksdb_block_based_options_set_block_size(
+    rocksdb_block_based_table_options_t* options, size_t block_size) {
+  options->rep.block_size = block_size;
+}
+
+void rocksdb_block_based_options_set_block_size_deviation(
+    rocksdb_block_based_table_options_t* options, int block_size_deviation) {
+  options->rep.block_size_deviation = block_size_deviation;
+}
+
+void rocksdb_block_based_options_set_block_restart_interval(
+    rocksdb_block_based_table_options_t* options, int block_restart_interval) {
+  options->rep.block_restart_interval = block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_index_block_restart_interval(
+    rocksdb_block_based_table_options_t* options,
+    int index_block_restart_interval) {
+  options->rep.index_block_restart_interval = index_block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_metadata_block_size(
+    rocksdb_block_based_table_options_t* options,
+    uint64_t metadata_block_size) {
+  options->rep.metadata_block_size = metadata_block_size;
+}
+
+void rocksdb_block_based_options_set_partition_filters(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char partition_filters) {
+  options->rep.partition_filters = partition_filters;
+}
+
+void rocksdb_block_based_options_set_use_delta_encoding(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char use_delta_encoding) {
+  options->rep.use_delta_encoding = use_delta_encoding;
+}
+
+void rocksdb_block_based_options_set_filter_policy(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_filterpolicy_t* filter_policy) {
+  options->rep.filter_policy.reset(filter_policy);
+}
+
+void rocksdb_block_based_options_set_no_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char no_block_cache) {
+  options->rep.no_block_cache = no_block_cache;
+}
+
+void rocksdb_block_based_options_set_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache) {
+  if (block_cache) {
+    options->rep.block_cache = block_cache->rep;
+  }
+}
+
+void rocksdb_block_based_options_set_block_cache_compressed(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache_compressed) {
+  if (block_cache_compressed) {
+    options->rep.block_cache_compressed = block_cache_compressed->rep;
+  }
+}
+
+void rocksdb_block_based_options_set_whole_key_filtering(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.whole_key_filtering = v;
+}
+
+void rocksdb_block_based_options_set_format_version(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.format_version = v;
+}
+
+void rocksdb_block_based_options_set_index_type(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_index_type(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.data_block_index_type =
+      static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_hash_ratio(
+    rocksdb_block_based_table_options_t* options, double v) {
+  options->rep.data_block_hash_table_util_ratio = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.cache_index_and_filter_blocks = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.cache_index_and_filter_blocks_with_high_priority = v;
+}
+
+void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
+}
+
+void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.pin_top_level_index_and_filter = v;
+}
+
+void rocksdb_options_set_block_based_table_factory(
+    rocksdb_options_t* opt,
+    rocksdb_block_based_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options->rep));
+  }
+}
+
+rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() {
+  return new rocksdb_cuckoo_table_options_t;
+}
+
+void rocksdb_cuckoo_options_destroy(rocksdb_cuckoo_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_cuckoo_options_set_hash_ratio(
+    rocksdb_cuckoo_table_options_t* options, double v) {
+  options->rep.hash_table_ratio = v;
+}
+
+void rocksdb_cuckoo_options_set_max_search_depth(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.max_search_depth = v;
+}
+
+void rocksdb_cuckoo_options_set_cuckoo_block_size(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.cuckoo_block_size = v;
+}
+
+void rocksdb_cuckoo_options_set_identity_as_first_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.identity_as_first_hash = v;
+}
+
+void rocksdb_cuckoo_options_set_use_module_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.use_module_hash = v;
+}
+
+void rocksdb_options_set_cuckoo_table_factory(
+    rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep));
+  }
+}
+
+void rocksdb_set_options(rocksdb_t* db, int count, const char* const keys[],
+                         const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) options_map[keys[i]] = values[i];
+  SaveError(errptr, db->rep->SetOptions(options_map));
+}
+
+void rocksdb_set_options_cf(rocksdb_t* db,
+                            rocksdb_column_family_handle_t* handle, int count,
+                            const char* const keys[],
+                            const char* const values[], char** errptr) {
+  std::unordered_map<std::string, std::string> options_map;
+  for (int i = 0; i < count; i++) options_map[keys[i]] = values[i];
+  SaveError(errptr, db->rep->SetOptions(handle->rep, options_map));
+}
+
+rocksdb_options_t* rocksdb_options_create() { return new rocksdb_options_t; }
+
+void rocksdb_options_destroy(rocksdb_options_t* options) { delete options; }
+
+rocksdb_options_t* rocksdb_options_create_copy(rocksdb_options_t* options) {
+  return new rocksdb_options_t(*options);
+}
+
+void rocksdb_options_increase_parallelism(rocksdb_options_t* opt,
+                                          int total_threads) {
+  opt->rep.IncreaseParallelism(total_threads);
+}
+
+void rocksdb_options_optimize_for_point_lookup(rocksdb_options_t* opt,
+                                               uint64_t block_cache_size_mb) {
+  opt->rep.OptimizeForPointLookup(block_cache_size_mb);
+}
+
+void rocksdb_options_optimize_level_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+  opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_optimize_universal_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+  opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_set_allow_ingest_behind(rocksdb_options_t* opt,
+                                             unsigned char v) {
+  opt->rep.allow_ingest_behind = v;
+}
+
+unsigned char rocksdb_options_get_allow_ingest_behind(rocksdb_options_t* opt) {
+  return opt->rep.allow_ingest_behind;
+}
+
+void rocksdb_options_set_compaction_filter(rocksdb_options_t* opt,
+                                           rocksdb_compactionfilter_t* filter) {
+  opt->rep.compaction_filter = filter;
+}
+
+void rocksdb_options_set_compaction_filter_factory(
+    rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) {
+  opt->rep.compaction_filter_factory =
+      std::shared_ptr<CompactionFilterFactory>(factory);
+}
+
+void rocksdb_options_compaction_readahead_size(rocksdb_options_t* opt,
+                                               size_t s) {
+  opt->rep.compaction_readahead_size = s;
+}
+
+size_t rocksdb_options_get_compaction_readahead_size(rocksdb_options_t* opt) {
+  return opt->rep.compaction_readahead_size;
+}
+
+void rocksdb_options_set_comparator(rocksdb_options_t* opt,
+                                    rocksdb_comparator_t* cmp) {
+  opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_merge_operator(
+    rocksdb_options_t* opt, rocksdb_mergeoperator_t* merge_operator) {
+  opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
+}
+
+void rocksdb_options_set_create_if_missing(rocksdb_options_t* opt,
+                                           unsigned char v) {
+  opt->rep.create_if_missing = v;
+}
+
+unsigned char rocksdb_options_get_create_if_missing(rocksdb_options_t* opt) {
+  return opt->rep.create_if_missing;
+}
+
+void rocksdb_options_set_create_missing_column_families(rocksdb_options_t* opt,
+                                                        unsigned char v) {
+  opt->rep.create_missing_column_families = v;
+}
+
+unsigned char rocksdb_options_get_create_missing_column_families(
+    rocksdb_options_t* opt) {
+  return opt->rep.create_missing_column_families;
+}
+
+void rocksdb_options_set_error_if_exists(rocksdb_options_t* opt,
+                                         unsigned char v) {
+  opt->rep.error_if_exists = v;
+}
+
+unsigned char rocksdb_options_get_error_if_exists(rocksdb_options_t* opt) {
+  return opt->rep.error_if_exists;
+}
+
+void rocksdb_options_set_paranoid_checks(rocksdb_options_t* opt,
+                                         unsigned char v) {
+  opt->rep.paranoid_checks = v;
+}
+
+unsigned char rocksdb_options_get_paranoid_checks(rocksdb_options_t* opt) {
+  return opt->rep.paranoid_checks;
+}
+
+void rocksdb_options_set_db_paths(rocksdb_options_t* opt,
+                                  const rocksdb_dbpath_t** dbpath_values,
+                                  size_t num_paths) {
+  std::vector<DbPath> db_paths(num_paths);
+  for (size_t i = 0; i < num_paths; ++i) {
+    db_paths[i] = dbpath_values[i]->rep;
+  }
+  opt->rep.db_paths = db_paths;
+}
+
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+  opt->rep.env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+  if (l) {
+    opt->rep.info_log = l->rep;
+  }
+}
+
+void rocksdb_options_set_info_log_level(rocksdb_options_t* opt, int v) {
+  opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
+}
+
+int rocksdb_options_get_info_log_level(rocksdb_options_t* opt) {
+  return static_cast<int>(opt->rep.info_log_level);
+}
+
+void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
+                                              size_t s) {
+  opt->rep.db_write_buffer_size = s;
+}
+
+size_t rocksdb_options_get_db_write_buffer_size(rocksdb_options_t* opt) {
+  return opt->rep.db_write_buffer_size;
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+  opt->rep.write_buffer_size = s;
+}
+
+size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) {
+  return opt->rep.write_buffer_size;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+  opt->rep.max_open_files = n;
+}
+
+int rocksdb_options_get_max_open_files(rocksdb_options_t* opt) {
+  return opt->rep.max_open_files;
+}
+
+void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt,
+                                                  int n) {
+  opt->rep.max_file_opening_threads = n;
+}
+
+int rocksdb_options_get_max_file_opening_threads(rocksdb_options_t* opt) {
+  return opt->rep.max_file_opening_threads;
+}
+
+void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt,
+                                            uint64_t n) {
+  opt->rep.max_total_wal_size = n;
+}
+
+uint64_t rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt) {
+  return opt->rep.max_total_wal_size;
+}
+
+void rocksdb_options_set_target_file_size_base(rocksdb_options_t* opt,
+                                               uint64_t n) {
+  opt->rep.target_file_size_base = n;
+}
+
+uint64_t rocksdb_options_get_target_file_size_base(rocksdb_options_t* opt) {
+  return opt->rep.target_file_size_base;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(rocksdb_options_t* opt,
+                                                     int n) {
+  opt->rep.target_file_size_multiplier = n;
+}
+
+int rocksdb_options_get_target_file_size_multiplier(rocksdb_options_t* opt) {
+  return opt->rep.target_file_size_multiplier;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt,
+                                                  uint64_t n) {
+  opt->rep.max_bytes_for_level_base = n;
+}
+
+uint64_t rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t* opt) {
+  return opt->rep.max_bytes_for_level_base;
+}
+
+void rocksdb_options_set_level_compaction_dynamic_level_bytes(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.level_compaction_dynamic_level_bytes = v;
+}
+
+unsigned char rocksdb_options_get_level_compaction_dynamic_level_bytes(
+    rocksdb_options_t* opt) {
+  return opt->rep.level_compaction_dynamic_level_bytes;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt,
+                                                        double n) {
+  opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+double rocksdb_options_get_max_bytes_for_level_multiplier(
+    rocksdb_options_t* opt) {
+  return opt->rep.max_bytes_for_level_multiplier;
+}
+
+void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt,
+                                              uint64_t n) {
+  opt->rep.max_compaction_bytes = n;
+}
+
+uint64_t rocksdb_options_get_max_compaction_bytes(rocksdb_options_t* opt) {
+  return opt->rep.max_compaction_bytes;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+    rocksdb_options_t* opt, int* level_values, size_t num_levels) {
+  opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
+  }
+}
+
+void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
+  opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+}
+
+void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
+                                                      unsigned char val) {
+  opt->rep.skip_stats_update_on_db_open = val;
+}
+
+unsigned char rocksdb_options_get_skip_stats_update_on_db_open(
+    rocksdb_options_t* opt) {
+  return opt->rep.skip_stats_update_on_db_open;
+}
+
+void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
+    rocksdb_options_t* opt, unsigned char val) {
+  opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
+}
+
+unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
+    rocksdb_options_t* opt) {
+  return opt->rep.skip_checking_sst_file_sizes_on_db_open;
+}
+
+/* Blob Options Settings */
+void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt,
+                                           unsigned char val) {
+  opt->rep.enable_blob_files = val;
+}
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files(
+    rocksdb_options_t* opt) {
+  return opt->rep.enable_blob_files;
+}
+
+void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) {
+  opt->rep.min_blob_size = val;
+}
+
+uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) {
+  return opt->rep.min_blob_size;
+}
+
+void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) {
+  opt->rep.blob_file_size = val;
+}
+
+uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) {
+  return opt->rep.blob_file_size;
+}
+
+void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt,
+                                               int val) {
+  opt->rep.blob_compression_type = static_cast<CompressionType>(val);
+}
+
+int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) {
+  return opt->rep.blob_compression_type;
+}
+
+void rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt,
+                                        unsigned char val) {
+  opt->rep.enable_blob_garbage_collection = val;
+}
+
+unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) {
+  return opt->rep.enable_blob_garbage_collection;
+}
+
+void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt,
+                                            double val) {
+  opt->rep.blob_garbage_collection_age_cutoff = val;
+}
+
+double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) {
+  return opt->rep.blob_garbage_collection_age_cutoff;
+}
+
+void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt,
+                                                 double val) {
+  opt->rep.blob_garbage_collection_force_threshold = val;
+}
+
+double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) {
+  return opt->rep.blob_garbage_collection_force_threshold;
+}
+
+void rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt,
+                                                        uint64_t val) {
+  opt->rep.blob_compaction_readahead_size = val;
+}
+
+uint64_t rocksdb_options_get_blob_compaction_readahead_size(
+    rocksdb_options_t* opt) {
+  return opt->rep.blob_compaction_readahead_size;
+}
+
+void rocksdb_options_set_blob_file_starting_level(rocksdb_options_t* opt,
+                                                  int val) {
+  opt->rep.blob_file_starting_level = val;
+}
+
+int rocksdb_options_get_blob_file_starting_level(rocksdb_options_t* opt) {
+  return opt->rep.blob_file_starting_level;
+}
+
+void rocksdb_options_set_blob_cache(rocksdb_options_t* opt,
+                                    rocksdb_cache_t* blob_cache) {
+  opt->rep.blob_cache = blob_cache->rep;
+}
+
+void rocksdb_options_set_prepopulate_blob_cache(rocksdb_options_t* opt, int t) {
+  opt->rep.prepopulate_blob_cache = static_cast<PrepopulateBlobCache>(t);
+}
+
+int rocksdb_options_get_prepopulate_blob_cache(rocksdb_options_t* opt) {
+  return static_cast<int>(opt->rep.prepopulate_blob_cache);
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+  opt->rep.num_levels = n;
+}
+
+int rocksdb_options_get_num_levels(rocksdb_options_t* opt) {
+  return opt->rep.num_levels;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+int rocksdb_options_get_level0_file_num_compaction_trigger(
+    rocksdb_options_t* opt) {
+  return opt->rep.level0_file_num_compaction_trigger;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t* opt,
+                                                        int n) {
+  opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+int rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t* opt) {
+  return opt->rep.level0_slowdown_writes_trigger;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(rocksdb_options_t* opt,
+                                                    int n) {
+  opt->rep.level0_stop_writes_trigger = n;
+}
+
+int rocksdb_options_get_level0_stop_writes_trigger(rocksdb_options_t* opt) {
+  return opt->rep.level0_stop_writes_trigger;
+}
+
+void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt, int mode) {
+  opt->rep.wal_recovery_mode = static_cast<WALRecoveryMode>(mode);
+}
+
+int rocksdb_options_get_wal_recovery_mode(rocksdb_options_t* opt) {
+  return static_cast<int>(opt->rep.wal_recovery_mode);
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+  opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+int rocksdb_options_get_compression(rocksdb_options_t* opt) {
+  return opt->rep.compression;
+}
+
+void rocksdb_options_set_bottommost_compression(rocksdb_options_t* opt, int t) {
+  opt->rep.bottommost_compression = static_cast<CompressionType>(t);
+}
+
+int rocksdb_options_get_bottommost_compression(rocksdb_options_t* opt) {
+  return opt->rep.bottommost_compression;
+}
+
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+                                               const int* level_values,
+                                               size_t num_levels) {
+  opt->rep.compression_per_level.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.compression_per_level[i] =
+        static_cast<CompressionType>(level_values[i]);
+  }
+}
+
+void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt,
+                                                        int w_bits, int level,
+                                                        int strategy,
+                                                        int max_dict_bytes,
+                                                        unsigned char enabled) {
+  opt->rep.bottommost_compression_opts.window_bits = w_bits;
+  opt->rep.bottommost_compression_opts.level = level;
+  opt->rep.bottommost_compression_opts.strategy = strategy;
+  opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes;
+  opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes(
+    rocksdb_options_t* opt, int zstd_max_train_bytes, unsigned char enabled) {
+  opt->rep.bottommost_compression_opts.zstd_max_train_bytes =
+      zstd_max_train_bytes;
+  opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer,
+    unsigned char enabled) {
+  opt->rep.bottommost_compression_opts.use_zstd_dict_trainer =
+      use_zstd_dict_trainer;
+  opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+unsigned char
+rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt) {
+  return opt->rep.bottommost_compression_opts.use_zstd_dict_trainer;
+}
+
+void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes(
+    rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes,
+    unsigned char enabled) {
+  opt->rep.bottommost_compression_opts.max_dict_buffer_bytes =
+      max_dict_buffer_bytes;
+  opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits,
+                                             int level, int strategy,
+                                             int max_dict_bytes) {
+  opt->rep.compression_opts.window_bits = w_bits;
+  opt->rep.compression_opts.level = level;
+  opt->rep.compression_opts.strategy = strategy;
+  opt->rep.compression_opts.max_dict_bytes = max_dict_bytes;
+}
+
+void rocksdb_options_set_compression_options_zstd_max_train_bytes(
+    rocksdb_options_t* opt, int zstd_max_train_bytes) {
+  opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes;
+}
+
+int rocksdb_options_get_compression_options_zstd_max_train_bytes(
+    rocksdb_options_t* opt) {
+  return opt->rep.compression_opts.zstd_max_train_bytes;
+}
+
+void rocksdb_options_set_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer) {
+  opt->rep.compression_opts.use_zstd_dict_trainer = use_zstd_dict_trainer;
+}
+
+unsigned char rocksdb_options_get_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt) {
+  return opt->rep.compression_opts.use_zstd_dict_trainer;
+}
+
+void rocksdb_options_set_compression_options_parallel_threads(
+    rocksdb_options_t* opt, int value) {
+  opt->rep.compression_opts.parallel_threads = value;
+}
+
+int rocksdb_options_get_compression_options_parallel_threads(
+    rocksdb_options_t* opt) {
+  return opt->rep.compression_opts.parallel_threads;
+}
+
+void rocksdb_options_set_compression_options_max_dict_buffer_bytes(
+    rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes) {
+  opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes;
+}
+
+uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes(
+    rocksdb_options_t* opt) {
+  return opt->rep.compression_opts.max_dict_buffer_bytes;
+}
+
+void rocksdb_options_set_prefix_extractor(
+    rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) {
+  opt->rep.prefix_extractor.reset(prefix_extractor);
+}
+
+void rocksdb_options_set_use_fsync(rocksdb_options_t* opt, int use_fsync) {
+  opt->rep.use_fsync = use_fsync;
+}
+
+int rocksdb_options_get_use_fsync(rocksdb_options_t* opt) {
+  return opt->rep.use_fsync;
+}
+
+void rocksdb_options_set_db_log_dir(rocksdb_options_t* opt,
+                                    const char* db_log_dir) {
+  opt->rep.db_log_dir = db_log_dir;
+}
+
+void rocksdb_options_set_wal_dir(rocksdb_options_t* opt, const char* v) {
+  opt->rep.wal_dir = v;
+}
+
+void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
+  opt->rep.WAL_ttl_seconds = ttl;
+}
+
+uint64_t rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t* opt) {
+  return opt->rep.WAL_ttl_seconds;
+}
+
+void rocksdb_options_set_WAL_size_limit_MB(rocksdb_options_t* opt,
+                                           uint64_t limit) {
+  opt->rep.WAL_size_limit_MB = limit;
+}
+
+uint64_t rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t* opt) {
+  return opt->rep.WAL_size_limit_MB;
+}
+
+void rocksdb_options_set_manifest_preallocation_size(rocksdb_options_t* opt,
+                                                     size_t v) {
+  opt->rep.manifest_preallocation_size = v;
+}
+
+size_t rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t* opt) {
+  return opt->rep.manifest_preallocation_size;
+}
+
+void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt,
+                                          unsigned char v) {
+  opt->rep.use_direct_reads = v;
+}
+
+unsigned char rocksdb_options_get_use_direct_reads(rocksdb_options_t* opt) {
+  return opt->rep.use_direct_reads;
+}
+
+void rocksdb_options_set_use_direct_io_for_flush_and_compaction(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.use_direct_io_for_flush_and_compaction = v;
+}
+
+unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction(
+    rocksdb_options_t* opt) {
+  return opt->rep.use_direct_io_for_flush_and_compaction;
+}
+
+void rocksdb_options_set_allow_mmap_reads(rocksdb_options_t* opt,
+                                          unsigned char v) {
+  opt->rep.allow_mmap_reads = v;
+}
+
+unsigned char rocksdb_options_get_allow_mmap_reads(rocksdb_options_t* opt) {
+  return opt->rep.allow_mmap_reads;
+}
+
+void rocksdb_options_set_allow_mmap_writes(rocksdb_options_t* opt,
+                                           unsigned char v) {
+  opt->rep.allow_mmap_writes = v;
+}
+
+unsigned char rocksdb_options_get_allow_mmap_writes(rocksdb_options_t* opt) {
+  return opt->rep.allow_mmap_writes;
+}
+
+void rocksdb_options_set_is_fd_close_on_exec(rocksdb_options_t* opt,
+                                             unsigned char v) {
+  opt->rep.is_fd_close_on_exec = v;
+}
+
+unsigned char rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t* opt) {
+  return opt->rep.is_fd_close_on_exec;
+}
+
+void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt,
+                                               unsigned int v) {
+  opt->rep.stats_dump_period_sec = v;
+}
+
+unsigned int rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t* opt) {
+  return opt->rep.stats_dump_period_sec;
+}
+
+void rocksdb_options_set_stats_persist_period_sec(rocksdb_options_t* opt,
+                                                  unsigned int v) {
+  opt->rep.stats_persist_period_sec = v;
+}
+
+unsigned int rocksdb_options_get_stats_persist_period_sec(
+    rocksdb_options_t* opt) {
+  return opt->rep.stats_persist_period_sec;
+}
+
+void rocksdb_options_set_advise_random_on_open(rocksdb_options_t* opt,
+                                               unsigned char v) {
+  opt->rep.advise_random_on_open = v;
+}
+
+unsigned char rocksdb_options_get_advise_random_on_open(
+    rocksdb_options_t* opt) {
+  return opt->rep.advise_random_on_open;
+}
+
+void rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t* opt,
+                                                         int v) {
+  switch (v) {
+    case 0:
+      opt->rep.access_hint_on_compaction_start =
+          ROCKSDB_NAMESPACE::Options::NONE;
+      break;
+    case 1:
+      opt->rep.access_hint_on_compaction_start =
+          ROCKSDB_NAMESPACE::Options::NORMAL;
+      break;
+    case 2:
+      opt->rep.access_hint_on_compaction_start =
+          ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
+      break;
+    case 3:
+      opt->rep.access_hint_on_compaction_start =
+          ROCKSDB_NAMESPACE::Options::WILLNEED;
+      break;
+    default:
+      assert(0);
+  }
+}
+
+int rocksdb_options_get_access_hint_on_compaction_start(
+    rocksdb_options_t* opt) {
+  return opt->rep.access_hint_on_compaction_start;
+}
+
+void rocksdb_options_set_use_adaptive_mutex(rocksdb_options_t* opt,
+                                            unsigned char v) {
+  opt->rep.use_adaptive_mutex = v;
+}
+
+unsigned char rocksdb_options_get_use_adaptive_mutex(rocksdb_options_t* opt) {
+  return opt->rep.use_adaptive_mutex;
+}
+
+void rocksdb_options_set_wal_bytes_per_sync(rocksdb_options_t* opt,
+                                            uint64_t v) {
+  opt->rep.wal_bytes_per_sync = v;
+}
+
+uint64_t rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t* opt) {
+  return opt->rep.wal_bytes_per_sync;
+}
+
+void rocksdb_options_set_bytes_per_sync(rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.bytes_per_sync = v;
+}
+
+uint64_t rocksdb_options_get_bytes_per_sync(rocksdb_options_t* opt) {
+  return opt->rep.bytes_per_sync;
+}
+
+void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt,
+                                                       uint64_t v) {
+  opt->rep.writable_file_max_buffer_size = static_cast<size_t>(v);
+}
+
+uint64_t rocksdb_options_get_writable_file_max_buffer_size(
+    rocksdb_options_t* opt) {
+  return opt->rep.writable_file_max_buffer_size;
+}
+
+void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt,
+                                                         unsigned char v) {
+  opt->rep.allow_concurrent_memtable_write = v;
+}
+
+unsigned char rocksdb_options_get_allow_concurrent_memtable_write(
+    rocksdb_options_t* opt) {
+  return opt->rep.allow_concurrent_memtable_write;
+}
+
+void rocksdb_options_set_enable_write_thread_adaptive_yield(
+    rocksdb_options_t* opt, unsigned char v) {
+  opt->rep.enable_write_thread_adaptive_yield = v;
+}
+
+unsigned char rocksdb_options_get_enable_write_thread_adaptive_yield(
+    rocksdb_options_t* opt) {
+  return opt->rep.enable_write_thread_adaptive_yield;
+}
+
+void rocksdb_options_set_max_sequential_skip_in_iterations(
+    rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.max_sequential_skip_in_iterations = v;
+}
+
+uint64_t rocksdb_options_get_max_sequential_skip_in_iterations(
+    rocksdb_options_t* opt) {
+  return opt->rep.max_sequential_skip_in_iterations;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt,
+                                                 int n) {
+  opt->rep.max_write_buffer_number = n;
+}
+
+int rocksdb_options_get_max_write_buffer_number(rocksdb_options_t* opt) {
+  return opt->rep.max_write_buffer_number;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+int rocksdb_options_get_min_write_buffer_number_to_merge(
+    rocksdb_options_t* opt) {
+  return opt->rep.min_write_buffer_number_to_merge;
+}
+
+void rocksdb_options_set_max_write_buffer_number_to_maintain(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_write_buffer_number_to_maintain = n;
+}
+
+int rocksdb_options_get_max_write_buffer_number_to_maintain(
+    rocksdb_options_t* opt) {
+  return opt->rep.max_write_buffer_number_to_maintain;
+}
+
+void rocksdb_options_set_max_write_buffer_size_to_maintain(
+    rocksdb_options_t* opt, int64_t n) {
+  opt->rep.max_write_buffer_size_to_maintain = n;
+}
+
+int64_t rocksdb_options_get_max_write_buffer_size_to_maintain(
+    rocksdb_options_t* opt) {
+  return opt->rep.max_write_buffer_size_to_maintain;
+}
+
+void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt,
+                                                unsigned char v) {
+  opt->rep.enable_pipelined_write = v;
+}
+
+unsigned char rocksdb_options_get_enable_pipelined_write(
+    rocksdb_options_t* opt) {
+  return opt->rep.enable_pipelined_write;
+}
+
+void rocksdb_options_set_unordered_write(rocksdb_options_t* opt,
+                                         unsigned char v) {
+  opt->rep.unordered_write = v;
+}
+
+unsigned char rocksdb_options_get_unordered_write(rocksdb_options_t* opt) {
+  return opt->rep.unordered_write;
+}
+
+void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt,
+                                            uint32_t n) {
+  opt->rep.max_subcompactions = n;
+}
+
+uint32_t rocksdb_options_get_max_subcompactions(rocksdb_options_t* opt) {
+  return opt->rep.max_subcompactions;
+}
+
+void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_jobs = n;
+}
+
+int rocksdb_options_get_max_background_jobs(rocksdb_options_t* opt) {
+  return opt->rep.max_background_jobs;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt,
+                                                    int n) {
+  opt->rep.max_background_compactions = n;
+}
+
+int rocksdb_options_get_max_background_compactions(rocksdb_options_t* opt) {
+  return opt->rep.max_background_compactions;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+  opt->rep.max_background_flushes = n;
+}
+
+int rocksdb_options_get_max_background_flushes(rocksdb_options_t* opt) {
+  return opt->rep.max_background_flushes;
+}
+
+void rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t* opt,
+                                                         double v) {
+  opt->rep.experimental_mempurge_threshold = v;
+}
+
+double rocksdb_options_get_experimental_mempurge_threshold(
+    rocksdb_options_t* opt) {
+  return opt->rep.experimental_mempurge_threshold;
+}
+
+void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) {
+  opt->rep.max_log_file_size = v;
+}
+
+size_t rocksdb_options_get_max_log_file_size(rocksdb_options_t* opt) {
+  return opt->rep.max_log_file_size;
+}
+
+void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt,
+                                               size_t v) {
+  opt->rep.log_file_time_to_roll = v;
+}
+
+size_t rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t* opt) {
+  return opt->rep.log_file_time_to_roll;
+}
+
+void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
+  opt->rep.keep_log_file_num = v;
+}
+
+size_t rocksdb_options_get_keep_log_file_num(rocksdb_options_t* opt) {
+  return opt->rep.keep_log_file_num;
+}
+
+void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt,
+                                              size_t v) {
+  opt->rep.recycle_log_file_num = v;
+}
+
+size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) {
+  return opt->rep.recycle_log_file_num;
+}
+
+void rocksdb_options_set_soft_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.soft_pending_compaction_bytes_limit = v;
+}
+
+size_t rocksdb_options_get_soft_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt) {
+  return opt->rep.soft_pending_compaction_bytes_limit;
+}
+
+void rocksdb_options_set_hard_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt, size_t v) {
+  opt->rep.hard_pending_compaction_bytes_limit = v;
+}
+
+size_t rocksdb_options_get_hard_pending_compaction_bytes_limit(
+    rocksdb_options_t* opt) {
+  return opt->rep.hard_pending_compaction_bytes_limit;
+}
+
+void rocksdb_options_set_max_manifest_file_size(rocksdb_options_t* opt,
+                                                size_t v) {
+  opt->rep.max_manifest_file_size = v;
+}
+
+size_t rocksdb_options_get_max_manifest_file_size(rocksdb_options_t* opt) {
+  return opt->rep.max_manifest_file_size;
+}
+
+void rocksdb_options_set_table_cache_numshardbits(rocksdb_options_t* opt,
+                                                  int v) {
+  opt->rep.table_cache_numshardbits = v;
+}
+
+int rocksdb_options_get_table_cache_numshardbits(rocksdb_options_t* opt) {
+  return opt->rep.table_cache_numshardbits;
+}
+
+void rocksdb_options_set_arena_block_size(rocksdb_options_t* opt, size_t v) {
+  opt->rep.arena_block_size = v;
+}
+
+size_t rocksdb_options_get_arena_block_size(rocksdb_options_t* opt) {
+  return opt->rep.arena_block_size;
+}
+
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt,
+                                                  int disable) {
+  opt->rep.disable_auto_compactions = disable;
+}
+
+unsigned char rocksdb_options_get_disable_auto_compactions(
+    rocksdb_options_t* opt) {
+  return opt->rep.disable_auto_compactions;
+}
+
+void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt,
+                                                   int v) {
+  opt->rep.optimize_filters_for_hits = v;
+}
+
+unsigned char rocksdb_options_get_optimize_filters_for_hits(
+    rocksdb_options_t* opt) {
+  return opt->rep.optimize_filters_for_hits;
+}
+
+void rocksdb_options_set_delete_obsolete_files_period_micros(
+    rocksdb_options_t* opt, uint64_t v) {
+  opt->rep.delete_obsolete_files_period_micros = v;
+}
+
+uint64_t rocksdb_options_get_delete_obsolete_files_period_micros(
+    rocksdb_options_t* opt) {
+  return opt->rep.delete_obsolete_files_period_micros;
+}
+
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+  opt->rep.PrepareForBulkLoad();
+}
+
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t* opt) {
+  opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory);
+}
+
+void rocksdb_options_set_memtable_prefix_bloom_size_ratio(
+    rocksdb_options_t* opt, double v) {
+  opt->rep.memtable_prefix_bloom_size_ratio = v;
+}
+
+double rocksdb_options_get_memtable_prefix_bloom_size_ratio(
+    rocksdb_options_t* opt) {
+  return opt->rep.memtable_prefix_bloom_size_ratio;
+}
+
+void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt,
+                                                 size_t v) {
+  opt->rep.memtable_huge_page_size = v;
+}
+
+size_t rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t* opt) {
+  return opt->rep.memtable_huge_page_size;
+}
+
+void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t* opt,
+                                            size_t bucket_count,
+                                            int32_t skiplist_height,
+                                            int32_t skiplist_branching_factor) {
+  ROCKSDB_NAMESPACE::MemTableRepFactory* factory =
+      ROCKSDB_NAMESPACE::NewHashSkipListRepFactory(
+          bucket_count, skiplist_height, skiplist_branching_factor);
+  opt->rep.memtable_factory.reset(factory);
+}
+
+void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt,
+                                            size_t bucket_count) {
+  opt->rep.memtable_factory.reset(
+      ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
+}
+
+void rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt,
+                                             uint32_t user_key_len,
+                                             int bloom_bits_per_key,
+                                             double hash_table_ratio,
+                                             size_t index_sparseness) {
+  ROCKSDB_NAMESPACE::PlainTableOptions options;
+  options.user_key_len = user_key_len;
+  options.bloom_bits_per_key = bloom_bits_per_key;
+  options.hash_table_ratio = hash_table_ratio;
+  options.index_sparseness = index_sparseness;
+
+  ROCKSDB_NAMESPACE::TableFactory* factory =
+      ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
+  opt->rep.table_factory.reset(factory);
+}
+
+void rocksdb_options_set_max_successive_merges(rocksdb_options_t* opt,
+                                               size_t v) {
+  opt->rep.max_successive_merges = v;
+}
+
+size_t rocksdb_options_get_max_successive_merges(rocksdb_options_t* opt) {
+  return opt->rep.max_successive_merges;
+}
+
+void rocksdb_options_set_bloom_locality(rocksdb_options_t* opt, uint32_t v) {
+  opt->rep.bloom_locality = v;
+}
+
+uint32_t rocksdb_options_get_bloom_locality(rocksdb_options_t* opt) {
+  return opt->rep.bloom_locality;
+}
+
+void rocksdb_options_set_inplace_update_support(rocksdb_options_t* opt,
+                                                unsigned char v) {
+  opt->rep.inplace_update_support = v;
+}
+
+unsigned char rocksdb_options_get_inplace_update_support(
+    rocksdb_options_t* opt) {
+  return opt->rep.inplace_update_support;
+}
+
+void rocksdb_options_set_inplace_update_num_locks(rocksdb_options_t* opt,
+                                                  size_t v) {
+  opt->rep.inplace_update_num_locks = v;
+}
+
+size_t rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t* opt) {
+  return opt->rep.inplace_update_num_locks;
+}
+
+void rocksdb_options_set_report_bg_io_stats(rocksdb_options_t* opt, int v) {
+  opt->rep.report_bg_io_stats = v;
+}
+
+unsigned char rocksdb_options_get_report_bg_io_stats(rocksdb_options_t* opt) {
+  return opt->rep.report_bg_io_stats;
+}
+
+void rocksdb_options_set_compaction_style(rocksdb_options_t* opt, int style) {
+  opt->rep.compaction_style =
+      static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(style);
+}
+
+int rocksdb_options_get_compaction_style(rocksdb_options_t* opt) {
+  return opt->rep.compaction_style;
+}
+
+void rocksdb_options_set_universal_compaction_options(
+    rocksdb_options_t* opt, rocksdb_universal_compaction_options_t* uco) {
+  opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+void rocksdb_options_set_fifo_compaction_options(
+    rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo) {
+  opt->rep.compaction_options_fifo = fifo->rep;
+}
+
+char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) {
+  ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
+  if (statistics) {
+    return strdup(statistics->ToString().c_str());
+  }
+  return nullptr;
+}
+
+void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt,
+                                     rocksdb_ratelimiter_t* limiter) {
+  if (limiter) {
+    opt->rep.rate_limiter = limiter->rep;
+  }
+}
+
+void rocksdb_options_set_atomic_flush(rocksdb_options_t* opt,
+                                      unsigned char atomic_flush) {
+  opt->rep.atomic_flush = atomic_flush;
+}
+
+unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) {
+  return opt->rep.atomic_flush;
+}
+
+void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt,
+                                          unsigned char manual_wal_flush) {
+  opt->rep.manual_wal_flush = manual_wal_flush;
+}
+
+unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) {
+  return opt->rep.manual_wal_flush;
+}
+
+void rocksdb_options_set_wal_compression(rocksdb_options_t* opt, int val) {
+  opt->rep.wal_compression = static_cast<CompressionType>(val);
+}
+
+int rocksdb_options_get_wal_compression(rocksdb_options_t* opt) {
+  return opt->rep.wal_compression;
+}
+
+rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(int64_t rate_bytes_per_sec,
+                                                  int64_t refill_period_us,
+                                                  int32_t fairness) {
+  rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t;
+  rate_limiter->rep.reset(
+      NewGenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness));
+  return rate_limiter;
+}
+
+void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t* limiter) {
+  delete limiter;
+}
+
+void rocksdb_options_set_row_cache(rocksdb_options_t* opt,
+                                   rocksdb_cache_t* cache) {
+  if (cache) {
+    opt->rep.row_cache = cache->rep;
+  }
+}
+
+void rocksdb_options_add_compact_on_deletion_collector_factory(
+    rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) {
+  std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory>
+      compact_on_del =
+          NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger);
+  opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
+}
+
+void rocksdb_set_perf_level(int v) {
+  PerfLevel level = static_cast<PerfLevel>(v);
+  SetPerfLevel(level);
+}
+
+rocksdb_perfcontext_t* rocksdb_perfcontext_create() {
+  rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t;
+  context->rep = ROCKSDB_NAMESPACE::get_perf_context();
+  return context;
+}
+
+void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) {
+  context->rep->Reset();
+}
+
+char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context,
+                                 unsigned char exclude_zero_counters) {
+  return strdup(context->rep->ToString(exclude_zero_counters).c_str());
+}
+
+uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
+                                    int metric) {
+  PerfContext* rep = context->rep;
+  switch (metric) {
+    case rocksdb_user_key_comparison_count:
+      return rep->user_key_comparison_count;
+    case rocksdb_block_cache_hit_count:
+      return rep->block_cache_hit_count;
+    case rocksdb_block_read_count:
+      return rep->block_read_count;
+    case rocksdb_block_read_byte:
+      return rep->block_read_byte;
+    case rocksdb_block_read_time:
+      return rep->block_read_time;
+    case rocksdb_block_checksum_time:
+      return rep->block_checksum_time;
+    case rocksdb_block_decompress_time:
+      return rep->block_decompress_time;
+    case rocksdb_get_read_bytes:
+      return rep->get_read_bytes;
+    case rocksdb_multiget_read_bytes:
+      return rep->multiget_read_bytes;
+    case rocksdb_iter_read_bytes:
+      return rep->iter_read_bytes;
+    case rocksdb_internal_key_skipped_count:
+      return rep->internal_key_skipped_count;
+    case rocksdb_internal_delete_skipped_count:
+      return rep->internal_delete_skipped_count;
+    case rocksdb_internal_recent_skipped_count:
+      return rep->internal_recent_skipped_count;
+    case rocksdb_internal_merge_count:
+      return rep->internal_merge_count;
+    case rocksdb_get_snapshot_time:
+      return rep->get_snapshot_time;
+    case rocksdb_get_from_memtable_time:
+      return rep->get_from_memtable_time;
+    case rocksdb_get_from_memtable_count:
+      return rep->get_from_memtable_count;
+    case rocksdb_get_post_process_time:
+      return rep->get_post_process_time;
+    case rocksdb_get_from_output_files_time:
+      return rep->get_from_output_files_time;
+    case rocksdb_seek_on_memtable_time:
+      return rep->seek_on_memtable_time;
+    case rocksdb_seek_on_memtable_count:
+      return rep->seek_on_memtable_count;
+    case rocksdb_next_on_memtable_count:
+      return rep->next_on_memtable_count;
+    case rocksdb_prev_on_memtable_count:
+      return rep->prev_on_memtable_count;
+    case rocksdb_seek_child_seek_time:
+      return rep->seek_child_seek_time;
+    case rocksdb_seek_child_seek_count:
+      return rep->seek_child_seek_count;
+    case rocksdb_seek_min_heap_time:
+      return rep->seek_min_heap_time;
+    case rocksdb_seek_max_heap_time:
+      return rep->seek_max_heap_time;
+    case rocksdb_seek_internal_seek_time:
+      return rep->seek_internal_seek_time;
+    case rocksdb_find_next_user_entry_time:
+      return rep->find_next_user_entry_time;
+    case rocksdb_write_wal_time:
+      return rep->write_wal_time;
+    case rocksdb_write_memtable_time:
+      return rep->write_memtable_time;
+    case rocksdb_write_delay_time:
+      return rep->write_delay_time;
+    case rocksdb_write_pre_and_post_process_time:
+      return rep->write_pre_and_post_process_time;
+    case rocksdb_db_mutex_lock_nanos:
+      return rep->db_mutex_lock_nanos;
+    case rocksdb_db_condition_wait_nanos:
+      return rep->db_condition_wait_nanos;
+    case rocksdb_merge_operator_time_nanos:
+      return rep->merge_operator_time_nanos;
+    case rocksdb_read_index_block_nanos:
+      return rep->read_index_block_nanos;
+    case rocksdb_read_filter_block_nanos:
+      return rep->read_filter_block_nanos;
+    case rocksdb_new_table_block_iter_nanos:
+      return rep->new_table_block_iter_nanos;
+    case rocksdb_new_table_iterator_nanos:
+      return rep->new_table_iterator_nanos;
+    case rocksdb_block_seek_nanos:
+      return rep->block_seek_nanos;
+    case rocksdb_find_table_nanos:
+      return rep->find_table_nanos;
+    case rocksdb_bloom_memtable_hit_count:
+      return rep->bloom_memtable_hit_count;
+    case rocksdb_bloom_memtable_miss_count:
+      return rep->bloom_memtable_miss_count;
+    case rocksdb_bloom_sst_hit_count:
+      return rep->bloom_sst_hit_count;
+    case rocksdb_bloom_sst_miss_count:
+      return rep->bloom_sst_miss_count;
+    case rocksdb_key_lock_wait_time:
+      return rep->key_lock_wait_time;
+    case rocksdb_key_lock_wait_count:
+      return rep->key_lock_wait_count;
+    case rocksdb_env_new_sequential_file_nanos:
+      return rep->env_new_sequential_file_nanos;
+    case rocksdb_env_new_random_access_file_nanos:
+      return rep->env_new_random_access_file_nanos;
+    case rocksdb_env_new_writable_file_nanos:
+      return rep->env_new_writable_file_nanos;
+    case rocksdb_env_reuse_writable_file_nanos:
+      return rep->env_reuse_writable_file_nanos;
+    case rocksdb_env_new_random_rw_file_nanos:
+      return rep->env_new_random_rw_file_nanos;
+    case rocksdb_env_new_directory_nanos:
+      return rep->env_new_directory_nanos;
+    case rocksdb_env_file_exists_nanos:
+      return rep->env_file_exists_nanos;
+    case rocksdb_env_get_children_nanos:
+      return rep->env_get_children_nanos;
+    case rocksdb_env_get_children_file_attributes_nanos:
+      return rep->env_get_children_file_attributes_nanos;
+    case rocksdb_env_delete_file_nanos:
+      return rep->env_delete_file_nanos;
+    case rocksdb_env_create_dir_nanos:
+      return rep->env_create_dir_nanos;
+    case rocksdb_env_create_dir_if_missing_nanos:
+      return rep->env_create_dir_if_missing_nanos;
+    case rocksdb_env_delete_dir_nanos:
+      return rep->env_delete_dir_nanos;
+    case rocksdb_env_get_file_size_nanos:
+      return rep->env_get_file_size_nanos;
+    case rocksdb_env_get_file_modification_time_nanos:
+      return rep->env_get_file_modification_time_nanos;
+    case rocksdb_env_rename_file_nanos:
+      return rep->env_rename_file_nanos;
+    case rocksdb_env_link_file_nanos:
+      return rep->env_link_file_nanos;
+    case rocksdb_env_lock_file_nanos:
+      return rep->env_lock_file_nanos;
+    case rocksdb_env_unlock_file_nanos:
+      return rep->env_unlock_file_nanos;
+    case rocksdb_env_new_logger_nanos:
+      return rep->env_new_logger_nanos;
+    case rocksdb_number_async_seek:
+      return rep->number_async_seek;
+    case rocksdb_blob_cache_hit_count:
+      return rep->blob_cache_hit_count;
+    case rocksdb_blob_read_count:
+      return rep->blob_read_count;
+    case rocksdb_blob_read_byte:
+      return rep->blob_read_byte;
+    case rocksdb_blob_read_time:
+      return rep->blob_read_time;
+    case rocksdb_blob_checksum_time:
+      return rep->blob_checksum_time;
+    case rocksdb_blob_decompress_time:
+      return rep->blob_decompress_time;
+    case rocksdb_internal_range_del_reseek_count:
+      return rep->internal_range_del_reseek_count;
+    default:
+      break;
+  }
+  return 0;
+}
+
+void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) {
+  delete context;
+}
+
+/*
+TODO:
+DB::OpenForReadOnly
+DB::KeyMayExist
+DB::GetOptions
+DB::GetSortedWalFiles
+DB::GetLatestSequenceNumber
+DB::GetUpdatesSince
+DB::GetDbIdentity
+DB::RunManualCompaction
+custom cache
+table_properties_collectors
+*/
+
+rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
+    void* state, void (*destructor)(void*),
+    unsigned char (*filter)(void*, int level, const char* key,
+                            size_t key_length, const char* existing_value,
+                            size_t value_length, char** new_value,
+                            size_t* new_value_length,
+                            unsigned char* value_changed),
+    const char* (*name)(void*)) {
+  rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->filter_ = filter;
+  result->ignore_snapshots_ = true;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_compactionfilter_set_ignore_snapshots(
+    rocksdb_compactionfilter_t* filter, unsigned char whether_ignore) {
+  filter->ignore_snapshots_ = whether_ignore;
+}
+
+void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) {
+  delete filter;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
+    rocksdb_compactionfiltercontext_t* context) {
+  return context->rep.is_full_compaction;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
+    rocksdb_compactionfiltercontext_t* context) {
+  return context->rep.is_manual_compaction;
+}
+
+rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create(
+    void* state, void (*destructor)(void*),
+    rocksdb_compactionfilter_t* (*create_compaction_filter)(
+        void*, rocksdb_compactionfiltercontext_t* context),
+    const char* (*name)(void*)) {
+  rocksdb_compactionfilterfactory_t* result =
+      new rocksdb_compactionfilterfactory_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_compaction_filter_ = create_compaction_filter;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_compactionfilterfactory_destroy(
+    rocksdb_compactionfilterfactory_t* factory) {
+  delete factory;
+}
+
+rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state, void (*destructor)(void*),
+    int (*compare)(void*, const char* a, size_t alen, const char* b,
+                   size_t blen),
+    const char* (*name)(void*)) {
+  rocksdb_comparator_t* result = new rocksdb_comparator_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->compare_ = compare;
+  result->name_ = name;
+  result->compare_ts_ = nullptr;
+  result->compare_without_ts_ = nullptr;
+  return result;
+}
+
+void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { delete cmp; }
+
+rocksdb_comparator_t* rocksdb_comparator_with_ts_create(
+    void* state, void (*destructor)(void*),
+    int (*compare)(void*, const char* a, size_t alen, const char* b,
+                   size_t blen),
+    int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
+                      size_t b_tslen),
+    int (*compare_without_ts)(void*, const char* a, size_t alen,
+                              unsigned char a_has_ts, const char* b,
+                              size_t blen, unsigned char b_has_ts),
+    const char* (*name)(void*), size_t timestamp_size) {
+  rocksdb_comparator_t* result = new rocksdb_comparator_t(timestamp_size);
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->compare_ = compare;
+  result->compare_ts_ = compare_ts;
+  result->compare_without_ts_ = compare_without_ts;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
+  delete filter;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(
+    double bits_per_key, bool original_format) {
+  // Make a rocksdb_filterpolicy_t, but override all of its methods so
+  // they delegate to a NewBloomFilterPolicy() instead of user
+  // supplied C functions.
+  struct Wrapper : public rocksdb_filterpolicy_t {
+    const FilterPolicy* rep_;
+    ~Wrapper() override { delete rep_; }
+    const char* Name() const override { return rep_->Name(); }
+    const char* CompatibilityName() const override {
+      return rep_->CompatibilityName();
+    }
+    // No need to override GetFilterBitsBuilder if this one is overridden
+    ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
+        const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
+        const override {
+      return rep_->GetBuilderWithContext(context);
+    }
+    ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
+        const Slice& contents) const override {
+      return rep_->GetFilterBitsReader(contents);
+    }
+    static void DoNothing(void*) {}
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format);
+  wrapper->state_ = nullptr;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(
+    double bits_per_key) {
+  return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false);
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(double bits_per_key) {
+  return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true);
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format(
+    double bloom_equivalent_bits_per_key, int bloom_before_level) {
+  // Make a rocksdb_filterpolicy_t, but override all of its methods so
+  // they delegate to a NewRibbonFilterPolicy() instead of user
+  // supplied C functions.
+  struct Wrapper : public rocksdb_filterpolicy_t {
+    const FilterPolicy* rep_;
+    ~Wrapper() override { delete rep_; }
+    const char* Name() const override { return rep_->Name(); }
+    const char* CompatibilityName() const override {
+      return rep_->CompatibilityName();
+    }
+    ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
+        const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
+        const override {
+      return rep_->GetBuilderWithContext(context);
+    }
+    ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
+        const Slice& contents) const override {
+      return rep_->GetFilterBitsReader(contents);
+    }
+    static void DoNothing(void*) {}
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ =
+      NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, bloom_before_level);
+  wrapper->state_ = nullptr;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon(
+    double bloom_equivalent_bits_per_key) {
+  return rocksdb_filterpolicy_create_ribbon_format(
+      bloom_equivalent_bits_per_key, /*bloom_before_level = disabled*/ -1);
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_hybrid(
+    double bloom_equivalent_bits_per_key, int bloom_before_level) {
+  return rocksdb_filterpolicy_create_ribbon_format(
+      bloom_equivalent_bits_per_key, bloom_before_level);
+}
+
+rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
+    void* state, void (*destructor)(void*),
+    char* (*full_merge)(void*, const char* key, size_t key_length,
+                        const char* existing_value,
+                        size_t existing_value_length,
+                        const char* const* operands_list,
+                        const size_t* operands_list_length, int num_operands,
+                        unsigned char* success, size_t* new_value_length),
+    char* (*partial_merge)(void*, const char* key, size_t key_length,
+                           const char* const* operands_list,
+                           const size_t* operands_list_length, int num_operands,
+                           unsigned char* success, size_t* new_value_length),
+    void (*delete_value)(void*, const char* value, size_t value_length),
+    const char* (*name)(void*)) {
+  rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->full_merge_ = full_merge;
+  result->partial_merge_ = partial_merge;
+  result->delete_value_ = delete_value;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) {
+  delete merge_operator;
+}
+
+rocksdb_readoptions_t* rocksdb_readoptions_create() {
+  return new rocksdb_readoptions_t;
+}
+
+void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { delete opt; }
+
+void rocksdb_readoptions_set_verify_checksums(rocksdb_readoptions_t* opt,
+                                              unsigned char v) {
+  opt->rep.verify_checksums = v;
+}
+
+unsigned char rocksdb_readoptions_get_verify_checksums(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.verify_checksums;
+}
+
+void rocksdb_readoptions_set_fill_cache(rocksdb_readoptions_t* opt,
+                                        unsigned char v) {
+  opt->rep.fill_cache = v;
+}
+
+unsigned char rocksdb_readoptions_get_fill_cache(rocksdb_readoptions_t* opt) {
+  return opt->rep.fill_cache;
+}
+
+void rocksdb_readoptions_set_snapshot(rocksdb_readoptions_t* opt,
+                                      const rocksdb_snapshot_t* snap) {
+  opt->rep.snapshot = (snap ? snap->rep : nullptr);
+}
+
+void rocksdb_readoptions_set_iterate_upper_bound(rocksdb_readoptions_t* opt,
+                                                 const char* key,
+                                                 size_t keylen) {
+  if (key == nullptr) {
+    opt->upper_bound = Slice();
+    opt->rep.iterate_upper_bound = nullptr;
+
+  } else {
+    opt->upper_bound = Slice(key, keylen);
+    opt->rep.iterate_upper_bound = &opt->upper_bound;
+  }
+}
+
+void rocksdb_readoptions_set_iterate_lower_bound(rocksdb_readoptions_t* opt,
+                                                 const char* key,
+                                                 size_t keylen) {
+  if (key == nullptr) {
+    opt->lower_bound = Slice();
+    opt->rep.iterate_lower_bound = nullptr;
+  } else {
+    opt->lower_bound = Slice(key, keylen);
+    opt->rep.iterate_lower_bound = &opt->lower_bound;
+  }
+}
+
+void rocksdb_readoptions_set_read_tier(rocksdb_readoptions_t* opt, int v) {
+  opt->rep.read_tier = static_cast<ROCKSDB_NAMESPACE::ReadTier>(v);
+}
+
+int rocksdb_readoptions_get_read_tier(rocksdb_readoptions_t* opt) {
+  return static_cast<int>(opt->rep.read_tier);
+}
+
+void rocksdb_readoptions_set_tailing(rocksdb_readoptions_t* opt,
+                                     unsigned char v) {
+  opt->rep.tailing = v;
+}
+
+unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) {
+  return opt->rep.tailing;
+}
+
+void rocksdb_readoptions_set_managed(rocksdb_readoptions_t* opt,
+                                     unsigned char v) {
+  opt->rep.managed = v;
+}
+
+void rocksdb_readoptions_set_readahead_size(rocksdb_readoptions_t* opt,
+                                            size_t v) {
+  opt->rep.readahead_size = v;
+}
+
+size_t rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t* opt) {
+  return opt->rep.readahead_size;
+}
+
+void rocksdb_readoptions_set_prefix_same_as_start(rocksdb_readoptions_t* opt,
+                                                  unsigned char v) {
+  opt->rep.prefix_same_as_start = v;
+}
+
+unsigned char rocksdb_readoptions_get_prefix_same_as_start(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.prefix_same_as_start;
+}
+
+void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt,
+                                      unsigned char v) {
+  opt->rep.pin_data = v;
+}
+
+unsigned char rocksdb_readoptions_get_pin_data(rocksdb_readoptions_t* opt) {
+  return opt->rep.pin_data;
+}
+
+void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt,
+                                              unsigned char v) {
+  opt->rep.total_order_seek = v;
+}
+
+unsigned char rocksdb_readoptions_get_total_order_seek(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.total_order_seek;
+}
+
+void rocksdb_readoptions_set_max_skippable_internal_keys(
+    rocksdb_readoptions_t* opt, uint64_t v) {
+  opt->rep.max_skippable_internal_keys = v;
+}
+
+uint64_t rocksdb_readoptions_get_max_skippable_internal_keys(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.max_skippable_internal_keys;
+}
+
+void rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+    rocksdb_readoptions_t* opt, unsigned char v) {
+  opt->rep.background_purge_on_iterator_cleanup = v;
+}
+
+unsigned char rocksdb_readoptions_get_background_purge_on_iterator_cleanup(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.background_purge_on_iterator_cleanup;
+}
+
+void rocksdb_readoptions_set_ignore_range_deletions(rocksdb_readoptions_t* opt,
+                                                    unsigned char v) {
+  opt->rep.ignore_range_deletions = v;
+}
+
+unsigned char rocksdb_readoptions_get_ignore_range_deletions(
+    rocksdb_readoptions_t* opt) {
+  return opt->rep.ignore_range_deletions;
+}
+
+void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt,
+                                      uint64_t microseconds) {
+  opt->rep.deadline = std::chrono::microseconds(microseconds);
+}
+
+uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) {
+  return opt->rep.deadline.count();
+}
+
+void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt,
+                                        uint64_t microseconds) {
+  opt->rep.io_timeout = std::chrono::microseconds(microseconds);
+}
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) {
+  return opt->rep.io_timeout.count();
+}
+
+void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt,
+                                       const char* ts, size_t tslen) {
+  if (ts == nullptr) {
+    opt->timestamp = Slice();
+    opt->rep.timestamp = nullptr;
+  } else {
+    opt->timestamp = Slice(ts, tslen);
+    opt->rep.timestamp = &opt->timestamp;
+  }
+}
+
+void rocksdb_readoptions_set_iter_start_ts(rocksdb_readoptions_t* opt,
+                                           const char* ts, size_t tslen) {
+  if (ts == nullptr) {
+    opt->iter_start_ts = Slice();
+    opt->rep.iter_start_ts = nullptr;
+  } else {
+    opt->iter_start_ts = Slice(ts, tslen);
+    opt->rep.iter_start_ts = &opt->iter_start_ts;
+  }
+}
+
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+  return new rocksdb_writeoptions_t;
+}
+
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { delete opt; }
+
+void rocksdb_writeoptions_set_sync(rocksdb_writeoptions_t* opt,
+                                   unsigned char v) {
+  opt->rep.sync = v;
+}
+
+unsigned char rocksdb_writeoptions_get_sync(rocksdb_writeoptions_t* opt) {
+  return opt->rep.sync;
+}
+
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt,
+                                      int disable) {
+  opt->rep.disableWAL = disable;
+}
+
+unsigned char rocksdb_writeoptions_get_disable_WAL(
+    rocksdb_writeoptions_t* opt) {
+  return opt->rep.disableWAL;
+}
+
+void rocksdb_writeoptions_set_ignore_missing_column_families(
+    rocksdb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.ignore_missing_column_families = v;
+}
+
+unsigned char rocksdb_writeoptions_get_ignore_missing_column_families(
+    rocksdb_writeoptions_t* opt) {
+  return opt->rep.ignore_missing_column_families;
+}
+
+void rocksdb_writeoptions_set_no_slowdown(rocksdb_writeoptions_t* opt,
+                                          unsigned char v) {
+  opt->rep.no_slowdown = v;
+}
+
+unsigned char rocksdb_writeoptions_get_no_slowdown(
+    rocksdb_writeoptions_t* opt) {
+  return opt->rep.no_slowdown;
+}
+
+void rocksdb_writeoptions_set_low_pri(rocksdb_writeoptions_t* opt,
+                                      unsigned char v) {
+  opt->rep.low_pri = v;
+}
+
+unsigned char rocksdb_writeoptions_get_low_pri(rocksdb_writeoptions_t* opt) {
+  return opt->rep.low_pri;
+}
+
+void rocksdb_writeoptions_set_memtable_insert_hint_per_batch(
+    rocksdb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.memtable_insert_hint_per_batch = v;
+}
+
+unsigned char rocksdb_writeoptions_get_memtable_insert_hint_per_batch(
+    rocksdb_writeoptions_t* opt) {
+  return opt->rep.memtable_insert_hint_per_batch;
+}
+
+rocksdb_compactoptions_t* rocksdb_compactoptions_create() {
+  return new rocksdb_compactoptions_t;
+}
+
+void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_compactoptions_set_bottommost_level_compaction(
+    rocksdb_compactoptions_t* opt, unsigned char v) {
+  opt->rep.bottommost_level_compaction =
+      static_cast<BottommostLevelCompaction>(v);
+}
+
+unsigned char rocksdb_compactoptions_get_bottommost_level_compaction(
+    rocksdb_compactoptions_t* opt) {
+  return static_cast<unsigned char>(opt->rep.bottommost_level_compaction);
+}
+
+void rocksdb_compactoptions_set_exclusive_manual_compaction(
+    rocksdb_compactoptions_t* opt, unsigned char v) {
+  opt->rep.exclusive_manual_compaction = v;
+}
+
+unsigned char rocksdb_compactoptions_get_exclusive_manual_compaction(
+    rocksdb_compactoptions_t* opt) {
+  return opt->rep.exclusive_manual_compaction;
+}
+
+void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt,
+                                             unsigned char v) {
+  opt->rep.change_level = v;
+}
+
+unsigned char rocksdb_compactoptions_get_change_level(
+    rocksdb_compactoptions_t* opt) {
+  return opt->rep.change_level;
+}
+
+void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt,
+                                             int n) {
+  opt->rep.target_level = n;
+}
+
+int rocksdb_compactoptions_get_target_level(rocksdb_compactoptions_t* opt) {
+  return opt->rep.target_level;
+}
+
+void rocksdb_compactoptions_set_full_history_ts_low(
+    rocksdb_compactoptions_t* opt, char* ts, size_t tslen) {
+  if (ts == nullptr) {
+    opt->full_history_ts_low = Slice();
+    opt->rep.full_history_ts_low = nullptr;
+  } else {
+    opt->full_history_ts_low = Slice(ts, tslen);
+    opt->rep.full_history_ts_low = &opt->full_history_ts_low;
+  }
+}
+
+rocksdb_flushoptions_t* rocksdb_flushoptions_create() {
+  return new rocksdb_flushoptions_t;
+}
+
+void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) { delete opt; }
+
+void rocksdb_flushoptions_set_wait(rocksdb_flushoptions_t* opt,
+                                   unsigned char v) {
+  opt->rep.wait = v;
+}
+
+unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) {
+  return opt->rep.wait;
+}
+
+rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create(
+    char** errptr) {
+  rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t;
+  ROCKSDB_NAMESPACE::JemallocAllocatorOptions options;
+  SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator(
+                        options, &allocator->rep));
+  return allocator;
+}
+
+void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) {
+  delete allocator;
+}
+
+rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() {
+  return new rocksdb_lru_cache_options_t;
+}
+
+void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt,
+                                            size_t capacity) {
+  opt->rep.capacity = capacity;
+}
+
+void rocksdb_lru_cache_options_set_num_shard_bits(
+    rocksdb_lru_cache_options_t* opt, int num_shard_bits) {
+  opt->rep.num_shard_bits = num_shard_bits;
+}
+
+void rocksdb_lru_cache_options_set_memory_allocator(
+    rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) {
+  opt->rep.memory_allocator = allocator->rep;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
+  rocksdb_cache_t* c = new rocksdb_cache_t;
+  c->rep = NewLRUCache(capacity);
+  return c;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit(
+    size_t capacity) {
+  rocksdb_cache_t* c = new rocksdb_cache_t;
+  c->rep = NewLRUCache(capacity);
+  c->rep->SetStrictCapacityLimit(true);
+  return c;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru_opts(
+    rocksdb_lru_cache_options_t* opt) {
+  rocksdb_cache_t* c = new rocksdb_cache_t;
+  c->rep = NewLRUCache(opt->rep);
+  return c;
+}
+
+void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; }
+
+void rocksdb_cache_disown_data(rocksdb_cache_t* cache) {
+  cache->rep->DisownData();
+}
+
+void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
+  cache->rep->SetCapacity(capacity);
+}
+
+size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) {
+  return cache->rep->GetCapacity();
+}
+
+size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) {
+  return cache->rep->GetUsage();
+}
+
+size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) {
+  return cache->rep->GetPinnedUsage();
+}
+
+rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path,
+                                        uint64_t target_size) {
+  rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
+  result->rep.path = std::string(path);
+  result->rep.target_size = target_size;
+  return result;
+}
+
+void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) { delete dbpath; }
+
+rocksdb_env_t* rocksdb_create_default_env() {
+  rocksdb_env_t* result = new rocksdb_env_t;
+  result->rep = Env::Default();
+  result->is_default = true;
+  return result;
+}
+
+rocksdb_env_t* rocksdb_create_mem_env() {
+  rocksdb_env_t* result = new rocksdb_env_t;
+  result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default());
+  result->is_default = false;
+  return result;
+}
+
+void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
+  env->rep->SetBackgroundThreads(n);
+}
+
+int rocksdb_env_get_background_threads(rocksdb_env_t* env) {
+  return env->rep->GetBackgroundThreads();
+}
+
+void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env,
+                                                        int n) {
+  env->rep->SetBackgroundThreads(n, Env::BOTTOM);
+}
+
+int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) {
+  return env->rep->GetBackgroundThreads(Env::BOTTOM);
+}
+
+void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env,
+                                                      int n) {
+  env->rep->SetBackgroundThreads(n, Env::HIGH);
+}
+
+int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) {
+  return env->rep->GetBackgroundThreads(Env::HIGH);
+}
+
+void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env,
+                                                     int n) {
+  env->rep->SetBackgroundThreads(n, Env::LOW);
+}
+
+int rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) {
+  return env->rep->GetBackgroundThreads(Env::LOW);
+}
+
+void rocksdb_env_join_all_threads(rocksdb_env_t* env) {
+  env->rep->WaitForJoin();
+}
+
+void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) {
+  env->rep->LowerThreadPoolIOPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_io_priority(
+    rocksdb_env_t* env) {
+  env->rep->LowerThreadPoolIOPriority(Env::HIGH);
+}
+
+void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) {
+  env->rep->LowerThreadPoolCPUPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(
+    rocksdb_env_t* env) {
+  env->rep->LowerThreadPoolCPUPriority(Env::HIGH);
+}
+
+void rocksdb_env_destroy(rocksdb_env_t* env) {
+  if (!env->is_default) delete env->rep;
+  delete env;
+}
+
+rocksdb_envoptions_t* rocksdb_envoptions_create() {
+  rocksdb_envoptions_t* opt = new rocksdb_envoptions_t;
+  return opt;
+}
+
+void rocksdb_envoptions_destroy(rocksdb_envoptions_t* opt) { delete opt; }
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create(
+    const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options) {
+  rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+  writer->rep = new SstFileWriter(env->rep, io_options->rep);
+  return writer;
+}
+
+void rocksdb_create_dir_if_missing(rocksdb_env_t* env, const char* path,
+                                   char** errptr) {
+  SaveError(errptr, env->rep->CreateDirIfMissing(std::string(path)));
+}
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator(
+    const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options,
+    const rocksdb_comparator_t* /*comparator*/) {
+  rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+  writer->rep = new SstFileWriter(env->rep, io_options->rep);
+  return writer;
+}
+
+void rocksdb_sstfilewriter_open(rocksdb_sstfilewriter_t* writer,
+                                const char* name, char** errptr) {
+  SaveError(errptr, writer->rep->Open(std::string(name)));
+}
+
+void rocksdb_sstfilewriter_add(rocksdb_sstfilewriter_t* writer, const char* key,
+                               size_t keylen, const char* val, size_t vallen,
+                               char** errptr) {
+  SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_put(rocksdb_sstfilewriter_t* writer, const char* key,
+                               size_t keylen, const char* val, size_t vallen,
+                               char** errptr) {
+  SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_put_with_ts(rocksdb_sstfilewriter_t* writer,
+                                       const char* key, size_t keylen,
+                                       const char* ts, size_t tslen,
+                                       const char* val, size_t vallen,
+                                       char** errptr) {
+  SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(ts, tslen),
+                                     Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_merge(rocksdb_sstfilewriter_t* writer,
+                                 const char* key, size_t keylen,
+                                 const char* val, size_t vallen,
+                                 char** errptr) {
+  SaveError(errptr, writer->rep->Merge(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer,
+                                  const char* key, size_t keylen,
+                                  char** errptr) {
+  SaveError(errptr, writer->rep->Delete(Slice(key, keylen)));
+}
+
+void rocksdb_sstfilewriter_delete_with_ts(rocksdb_sstfilewriter_t* writer,
+                                          const char* key, size_t keylen,
+                                          const char* ts, size_t tslen,
+                                          char** errptr) {
+  SaveError(errptr, writer->rep->Delete(Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_sstfilewriter_delete_range(rocksdb_sstfilewriter_t* writer,
+                                        const char* begin_key,
+                                        size_t begin_keylen,
+                                        const char* end_key, size_t end_keylen,
+                                        char** errptr) {
+  SaveError(errptr, writer->rep->DeleteRange(Slice(begin_key, begin_keylen),
+                                             Slice(end_key, end_keylen)));
+}
+
+void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer,
+                                  char** errptr) {
+  SaveError(errptr, writer->rep->Finish(nullptr));
+}
+
+void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer,
+                                     uint64_t* file_size) {
+  *file_size = writer->rep->FileSize();
+}
+
+void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) {
+  delete writer->rep;
+  delete writer;
+}
+
+rocksdb_ingestexternalfileoptions_t*
+rocksdb_ingestexternalfileoptions_create() {
+  rocksdb_ingestexternalfileoptions_t* opt =
+      new rocksdb_ingestexternalfileoptions_t;
+  return opt;
+}
+
+void rocksdb_ingestexternalfileoptions_set_move_files(
+    rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files) {
+  opt->rep.move_files = move_files;
+}
+
+void rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char snapshot_consistency) {
+  opt->rep.snapshot_consistency = snapshot_consistency;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char allow_global_seqno) {
+  opt->rep.allow_global_seqno = allow_global_seqno;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char allow_blocking_flush) {
+  opt->rep.allow_blocking_flush = allow_blocking_flush;
+}
+
+void rocksdb_ingestexternalfileoptions_set_ingest_behind(
+    rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind) {
+  opt->rep.ingest_behind = ingest_behind;
+}
+
+void rocksdb_ingestexternalfileoptions_destroy(
+    rocksdb_ingestexternalfileoptions_t* opt) {
+  delete opt;
+}
+
+void rocksdb_ingest_external_file(
+    rocksdb_t* db, const char* const* file_list, const size_t list_len,
+    const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+  std::vector<std::string> files(list_len);
+  for (size_t i = 0; i < list_len; ++i) {
+    files[i] = std::string(file_list[i]);
+  }
+  SaveError(errptr, db->rep->IngestExternalFile(files, opt->rep));
+}
+
+void rocksdb_ingest_external_file_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle,
+    const char* const* file_list, const size_t list_len,
+    const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+  std::vector<std::string> files(list_len);
+  for (size_t i = 0; i < list_len; ++i) {
+    files[i] = std::string(file_list[i]);
+  }
+  SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep));
+}
+
+void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) {
+  SaveError(errptr, db->rep->TryCatchUpWithPrimary());
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+    void* state, void (*destructor)(void*),
+    char* (*transform)(void*, const char* key, size_t length,
+                       size_t* dst_length),
+    unsigned char (*in_domain)(void*, const char* key, size_t length),
+    unsigned char (*in_range)(void*, const char* key, size_t length),
+    const char* (*name)(void*)) {
+  rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->transform_ = transform;
+  result->in_domain_ = in_domain;
+  result->in_range_ = in_range;
+  result->name_ = name;
+  return result;
+}
+
+void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { delete st; }
+
+struct SliceTransformWrapper : public rocksdb_slicetransform_t {
+  const SliceTransform* rep_;
+  ~SliceTransformWrapper() override { delete rep_; }
+  const char* Name() const override { return rep_->Name(); }
+  std::string GetId() const override { return rep_->GetId(); }
+  Slice Transform(const Slice& src) const override {
+    return rep_->Transform(src);
+  }
+  bool InDomain(const Slice& src) const override { return rep_->InDomain(src); }
+  bool InRange(const Slice& src) const override { return rep_->InRange(src); }
+  static void DoNothing(void*) {}
+};
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(
+    size_t prefixLen) {
+  SliceTransformWrapper* wrapper = new SliceTransformWrapper;
+  wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen);
+  wrapper->state_ = nullptr;
+  wrapper->destructor_ = &SliceTransformWrapper::DoNothing;
+  return wrapper;
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() {
+  SliceTransformWrapper* wrapper = new SliceTransformWrapper;
+  wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform();
+  wrapper->state_ = nullptr;
+  wrapper->destructor_ = &SliceTransformWrapper::DoNothing;
+  return wrapper;
+}
+
+rocksdb_universal_compaction_options_t*
+rocksdb_universal_compaction_options_create() {
+  rocksdb_universal_compaction_options_t* result =
+      new rocksdb_universal_compaction_options_t;
+  result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal;
+  return result;
+}
+
+void rocksdb_universal_compaction_options_set_size_ratio(
+    rocksdb_universal_compaction_options_t* uco, int ratio) {
+  uco->rep->size_ratio = ratio;
+}
+
+int rocksdb_universal_compaction_options_get_size_ratio(
+    rocksdb_universal_compaction_options_t* uco) {
+  return uco->rep->size_ratio;
+}
+
+void rocksdb_universal_compaction_options_set_min_merge_width(
+    rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->min_merge_width = w;
+}
+
+int rocksdb_universal_compaction_options_get_min_merge_width(
+    rocksdb_universal_compaction_options_t* uco) {
+  return uco->rep->min_merge_width;
+}
+
+void rocksdb_universal_compaction_options_set_max_merge_width(
+    rocksdb_universal_compaction_options_t* uco, int w) {
+  uco->rep->max_merge_width = w;
+}
+
+int rocksdb_universal_compaction_options_get_max_merge_width(
+    rocksdb_universal_compaction_options_t* uco) {
+  return uco->rep->max_merge_width;
+}
+
+void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+    rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->max_size_amplification_percent = p;
+}
+
+int rocksdb_universal_compaction_options_get_max_size_amplification_percent(
+    rocksdb_universal_compaction_options_t* uco) {
+  return uco->rep->max_size_amplification_percent;
+}
+
+void rocksdb_universal_compaction_options_set_compression_size_percent(
+    rocksdb_universal_compaction_options_t* uco, int p) {
+  uco->rep->compression_size_percent = p;
+}
+
+int rocksdb_universal_compaction_options_get_compression_size_percent(
+    rocksdb_universal_compaction_options_t* uco) {
+  return uco->rep->compression_size_percent;
+}
+
+void rocksdb_universal_compaction_options_set_stop_style(
+    rocksdb_universal_compaction_options_t* uco, int style) {
+  uco->rep->stop_style =
+      static_cast<ROCKSDB_NAMESPACE::CompactionStopStyle>(style);
+}
+
+int rocksdb_universal_compaction_options_get_stop_style(
+    rocksdb_universal_compaction_options_t* uco) {
+  return static_cast<int>(uco->rep->stop_style);
+}
+
+void rocksdb_universal_compaction_options_destroy(
+    rocksdb_universal_compaction_options_t* uco) {
+  delete uco->rep;
+  delete uco;
+}
+
+rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
+  rocksdb_fifo_compaction_options_t* result =
+      new rocksdb_fifo_compaction_options_t;
+  result->rep = CompactionOptionsFIFO();
+  return result;
+}
+
+void rocksdb_fifo_compaction_options_set_max_table_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
+  fifo_opts->rep.max_table_files_size = size;
+}
+
+uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  return fifo_opts->rep.max_table_files_size;
+}
+
+void rocksdb_fifo_compaction_options_destroy(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  delete fifo_opts;
+}
+
+void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt,
+                                               int level) {
+  if (level >= 0) {
+    assert(level <= opt->rep.num_levels);
+    opt->rep.compression_per_level.resize(opt->rep.num_levels);
+    for (int i = 0; i < level; i++) {
+      opt->rep.compression_per_level[i] = ROCKSDB_NAMESPACE::kNoCompression;
+    }
+    for (int i = level; i < opt->rep.num_levels; i++) {
+      opt->rep.compression_per_level[i] = opt->rep.compression;
+    }
+  }
+}
+
+int rocksdb_livefiles_count(const rocksdb_livefiles_t* lf) {
+  return static_cast<int>(lf->rep.size());
+}
+
+const char* rocksdb_livefiles_column_family_name(const rocksdb_livefiles_t* lf,
+                                                 int index) {
+  return lf->rep[index].column_family_name.c_str();
+}
+
+const char* rocksdb_livefiles_name(const rocksdb_livefiles_t* lf, int index) {
+  return lf->rep[index].name.c_str();
+}
+
+int rocksdb_livefiles_level(const rocksdb_livefiles_t* lf, int index) {
+  return lf->rep[index].level;
+}
+
+size_t rocksdb_livefiles_size(const rocksdb_livefiles_t* lf, int index) {
+  return lf->rep[index].size;
+}
+
+const char* rocksdb_livefiles_smallestkey(const rocksdb_livefiles_t* lf,
+                                          int index, size_t* size) {
+  *size = lf->rep[index].smallestkey.size();
+  return lf->rep[index].smallestkey.data();
+}
+
+const char* rocksdb_livefiles_largestkey(const rocksdb_livefiles_t* lf,
+                                         int index, size_t* size) {
+  *size = lf->rep[index].largestkey.size();
+  return lf->rep[index].largestkey.data();
+}
+
+uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t* lf, int index) {
+  return lf->rep[index].num_entries;
+}
+
+uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf, int index) {
+  return lf->rep[index].num_deletions;
+}
+
+extern void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) {
+  delete lf;
+}
+
+void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
+                                     const char* opts_str,
+                                     rocksdb_options_t* new_options,
+                                     char** errptr) {
+  SaveError(errptr,
+            GetOptionsFromString(base_options->rep, std::string(opts_str),
+                                 &new_options->rep));
+}
+
+void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key,
+                                  size_t start_key_len, const char* limit_key,
+                                  size_t limit_key_len, char** errptr) {
+  Slice a, b;
+  SaveError(
+      errptr,
+      DeleteFilesInRange(
+          db->rep, db->rep->DefaultColumnFamily(),
+          (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+          (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+void rocksdb_delete_file_in_range_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* limit_key,
+    size_t limit_key_len, char** errptr) {
+  Slice a, b;
+  SaveError(
+      errptr,
+      DeleteFilesInRange(
+          db->rep, column_family->rep,
+          (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+          (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+/* MetaData */
+
+rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata(
+    rocksdb_t* db) {
+  rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t;
+  db->rep->GetColumnFamilyMetaData(&meta->rep);
+  return meta;
+}
+
+rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family) {
+  rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t;
+  db->rep->GetColumnFamilyMetaData(column_family->rep, &meta->rep);
+  return meta;
+}
+
+void rocksdb_column_family_metadata_destroy(
+    rocksdb_column_family_metadata_t* cf_meta) {
+  delete cf_meta;
+}
+
+uint64_t rocksdb_column_family_metadata_get_size(
+    rocksdb_column_family_metadata_t* cf_meta) {
+  return cf_meta->rep.size;
+}
+
+size_t rocksdb_column_family_metadata_get_file_count(
+    rocksdb_column_family_metadata_t* cf_meta) {
+  return cf_meta->rep.file_count;
+}
+
+char* rocksdb_column_family_metadata_get_name(
+    rocksdb_column_family_metadata_t* cf_meta) {
+  return strdup(cf_meta->rep.name.c_str());
+}
+
+size_t rocksdb_column_family_metadata_get_level_count(
+    rocksdb_column_family_metadata_t* cf_meta) {
+  return cf_meta->rep.levels.size();
+}
+
+rocksdb_level_metadata_t* rocksdb_column_family_metadata_get_level_metadata(
+    rocksdb_column_family_metadata_t* cf_meta, size_t i) {
+  if (i >= cf_meta->rep.levels.size()) {
+    return NULL;
+  }
+  rocksdb_level_metadata_t* level_meta =
+      (rocksdb_level_metadata_t*)malloc(sizeof(rocksdb_level_metadata_t));
+  level_meta->rep = &cf_meta->rep.levels[i];
+
+  return level_meta;
+}
+
+void rocksdb_level_metadata_destroy(rocksdb_level_metadata_t* level_meta) {
+  // Only free the base pointer as its parent rocksdb_column_family_metadata_t
+  // has the ownership of its rep.
+  free(level_meta);
+}
+
+int rocksdb_level_metadata_get_level(rocksdb_level_metadata_t* level_meta) {
+  return level_meta->rep->level;
+}
+
+uint64_t rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta) {
+  return level_meta->rep->size;
+}
+
+size_t rocksdb_level_metadata_get_file_count(
+    rocksdb_level_metadata_t* level_meta) {
+  return level_meta->rep->files.size();
+}
+
+rocksdb_sst_file_metadata_t* rocksdb_level_metadata_get_sst_file_metadata(
+    rocksdb_level_metadata_t* level_meta, size_t i) {
+  if (i >= level_meta->rep->files.size()) {
+    return nullptr;
+  }
+  rocksdb_sst_file_metadata_t* file_meta =
+      (rocksdb_sst_file_metadata_t*)malloc(sizeof(rocksdb_sst_file_metadata_t));
+  file_meta->rep = &level_meta->rep->files[i];
+  return file_meta;
+}
+
+void rocksdb_sst_file_metadata_destroy(rocksdb_sst_file_metadata_t* file_meta) {
+  // Only free the base pointer as its parent rocksdb_level_metadata_t
+  // has the ownership of its rep.
+  free(file_meta);
+}
+
+char* rocksdb_sst_file_metadata_get_relative_filename(
+    rocksdb_sst_file_metadata_t* file_meta) {
+  return strdup(file_meta->rep->relative_filename.c_str());
+}
+
+uint64_t rocksdb_sst_file_metadata_get_size(
+    rocksdb_sst_file_metadata_t* file_meta) {
+  return file_meta->rep->size;
+}
+
+char* rocksdb_sst_file_metadata_get_smallestkey(
+    rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) {
+  *key_len = file_meta->rep->smallestkey.size();
+  return CopyString(file_meta->rep->smallestkey);
+}
+
+char* rocksdb_sst_file_metadata_get_largestkey(
+    rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) {
+  *key_len = file_meta->rep->largestkey.size();
+  return CopyString(file_meta->rep->largestkey);
+}
+
+/* Transactions */
+
+rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() {
+  return new rocksdb_transactiondb_options_t;
+}
+
+void rocksdb_transactiondb_options_destroy(
+    rocksdb_transactiondb_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_transactiondb_options_set_max_num_locks(
+    rocksdb_transactiondb_options_t* opt, int64_t max_num_locks) {
+  opt->rep.max_num_locks = max_num_locks;
+}
+
+void rocksdb_transactiondb_options_set_num_stripes(
+    rocksdb_transactiondb_options_t* opt, size_t num_stripes) {
+  opt->rep.num_stripes = num_stripes;
+}
+
+void rocksdb_transactiondb_options_set_transaction_lock_timeout(
+    rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout) {
+  opt->rep.transaction_lock_timeout = txn_lock_timeout;
+}
+
+void rocksdb_transactiondb_options_set_default_lock_timeout(
+    rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout) {
+  opt->rep.default_lock_timeout = default_lock_timeout;
+}
+
+rocksdb_transaction_options_t* rocksdb_transaction_options_create() {
+  return new rocksdb_transaction_options_t;
+}
+
+void rocksdb_transaction_options_destroy(rocksdb_transaction_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_transaction_options_set_set_snapshot(
+    rocksdb_transaction_options_t* opt, unsigned char v) {
+  opt->rep.set_snapshot = v;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect(
+    rocksdb_transaction_options_t* opt, unsigned char v) {
+  opt->rep.deadlock_detect = v;
+}
+
+void rocksdb_transaction_options_set_lock_timeout(
+    rocksdb_transaction_options_t* opt, int64_t lock_timeout) {
+  opt->rep.lock_timeout = lock_timeout;
+}
+
+void rocksdb_transaction_options_set_expiration(
+    rocksdb_transaction_options_t* opt, int64_t expiration) {
+  opt->rep.expiration = expiration;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect_depth(
+    rocksdb_transaction_options_t* opt, int64_t depth) {
+  opt->rep.deadlock_detect_depth = depth;
+}
+
+void rocksdb_transaction_options_set_max_write_batch_size(
+    rocksdb_transaction_options_t* opt, size_t size) {
+  opt->rep.max_write_batch_size = size;
+}
+
+void rocksdb_transaction_options_set_skip_prepare(
+    rocksdb_transaction_options_t* opt, unsigned char v) {
+  opt->rep.skip_prepare = v;
+}
+
+rocksdb_optimistictransaction_options_t*
+rocksdb_optimistictransaction_options_create() {
+  return new rocksdb_optimistictransaction_options_t;
+}
+
+void rocksdb_optimistictransaction_options_destroy(
+    rocksdb_optimistictransaction_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_optimistictransaction_options_set_set_snapshot(
+    rocksdb_optimistictransaction_options_t* opt, unsigned char v) {
+  opt->rep.set_snapshot = v;
+}
+
+char* rocksdb_optimistictransactiondb_property_value(
+    rocksdb_optimistictransactiondb_t* db, const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+int rocksdb_optimistictransactiondb_property_int(
+    rocksdb_optimistictransactiondb_t* db, const char* propname,
+    uint64_t* out_val) {
+  if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family(
+    rocksdb_transactiondb_t* txn_db,
+    const rocksdb_options_t* column_family_options,
+    const char* column_family_name, char** errptr) {
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  SaveError(errptr, txn_db->rep->CreateColumnFamily(
+                        ColumnFamilyOptions(column_family_options->rep),
+                        std::string(column_family_name), &(handle->rep)));
+  return handle;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+    const rocksdb_options_t* options,
+    const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+    char** errptr) {
+  TransactionDB* txn_db;
+  if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+                                            std::string(name), &txn_db))) {
+    return nullptr;
+  }
+  rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+  result->rep = txn_db;
+  return result;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families(
+    const rocksdb_options_t* options,
+    const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  TransactionDB* txn_db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+                                            std::string(name), column_families,
+                                            &handles, &txn_db))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+  result->rep = txn_db;
+  return result;
+}
+
+const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot(
+    rocksdb_transactiondb_t* txn_db) {
+  rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+  result->rep = txn_db->rep->GetSnapshot();
+  return result;
+}
+
+void rocksdb_transactiondb_release_snapshot(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot) {
+  txn_db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+char* rocksdb_transactiondb_property_value(rocksdb_transactiondb_t* db,
+                                           const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return nullptr;
+  }
+}
+
+int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db,
+                                       const char* propname,
+                                       uint64_t* out_val) {
+  if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+rocksdb_transaction_t* rocksdb_transaction_begin(
+    rocksdb_transactiondb_t* txn_db,
+    const rocksdb_writeoptions_t* write_options,
+    const rocksdb_transaction_options_t* txn_options,
+    rocksdb_transaction_t* old_txn) {
+  if (old_txn == nullptr) {
+    rocksdb_transaction_t* result = new rocksdb_transaction_t;
+    result->rep = txn_db->rep->BeginTransaction(write_options->rep,
+                                                txn_options->rep, nullptr);
+    return result;
+  }
+  old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep,
+                                               txn_options->rep, old_txn->rep);
+  return old_txn;
+}
+
+rocksdb_transaction_t** rocksdb_transactiondb_get_prepared_transactions(
+    rocksdb_transactiondb_t* txn_db, size_t* cnt) {
+  std::vector<Transaction*> txns;
+  txn_db->rep->GetAllPreparedTransactions(&txns);
+  *cnt = txns.size();
+  if (txns.empty()) {
+    return nullptr;
+  } else {
+    rocksdb_transaction_t** buf = (rocksdb_transaction_t**)malloc(
+        txns.size() * sizeof(rocksdb_transaction_t*));
+    for (size_t i = 0; i < txns.size(); i++) {
+      buf[i] = new rocksdb_transaction_t;
+      buf[i]->rep = txns[i];
+    }
+    return buf;
+  }
+}
+
+void rocksdb_transaction_set_name(rocksdb_transaction_t* txn, const char* name,
+                                  size_t name_len, char** errptr) {
+  std::string str = std::string(name, name_len);
+  SaveError(errptr, txn->rep->SetName(str));
+}
+
+char* rocksdb_transaction_get_name(rocksdb_transaction_t* txn,
+                                   size_t* name_len) {
+  auto name = txn->rep->GetName();
+  *name_len = name.size();
+  return CopyString(name);
+}
+
+void rocksdb_transaction_prepare(rocksdb_transaction_t* txn, char** errptr) {
+  SaveError(errptr, txn->rep->Prepare());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_transaction_get_writebatch_wi(
+    rocksdb_transaction_t* txn) {
+  rocksdb_writebatch_wi_t* wi =
+      (rocksdb_writebatch_wi_t*)malloc(sizeof(rocksdb_writebatch_wi_t));
+  wi->rep = txn->rep->GetWriteBatch();
+
+  return wi;
+}
+
+void rocksdb_transaction_rebuild_from_writebatch(
+    rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch,
+    char** errptr) {
+  SaveError(errptr, txn->rep->RebuildFromWriteBatch(&writebatch->rep));
+}
+
+void rocksdb_transaction_rebuild_from_writebatch_wi(rocksdb_transaction_t* txn,
+                                                    rocksdb_writebatch_wi_t* wi,
+                                                    char** errptr) {
+  SaveError(errptr, txn->rep->RebuildFromWriteBatch(wi->rep->GetWriteBatch()));
+}
+
+void rocksdb_transaction_commit(rocksdb_transaction_t* txn, char** errptr) {
+  SaveError(errptr, txn->rep->Commit());
+}
+
+void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) {
+  SaveError(errptr, txn->rep->Rollback());
+}
+
+void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) {
+  txn->rep->SetSavePoint();
+}
+
+void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn,
+                                               char** errptr) {
+  SaveError(errptr, txn->rep->RollbackToSavePoint());
+}
+
+void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) {
+  delete txn->rep;
+  delete txn;
+}
+
+const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot(
+    rocksdb_transaction_t* txn) {
+  // This will be freed later on using free, so use malloc here to avoid a
+  // mismatch
+  rocksdb_snapshot_t* result =
+      (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t));
+  result->rep = txn->rep->GetSnapshot();
+  return result;
+}
+
+// Read a key inside a transaction
+char* rocksdb_transaction_get(rocksdb_transaction_t* txn,
+                              const rocksdb_readoptions_t* options,
+                              const char* key, size_t klen, size_t* vlen,
+                              char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp);
+  if (s.ok()) {
+    *vlen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vlen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    const char* key, size_t klen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = txn->rep->Get(options->rep, Slice(key, klen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn,
+                                 const rocksdb_readoptions_t* options,
+                                 rocksdb_column_family_handle_t* column_family,
+                                 const char* key, size_t klen, size_t* vlen,
+                                 char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s =
+      txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp);
+  if (s.ok()) {
+    *vlen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vlen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+    char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = txn->rep->Get(options->rep, column_family->rep, Slice(key, klen),
+                           &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+// Read a key inside a transaction
+char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn,
+                                         const rocksdb_readoptions_t* options,
+                                         const char* key, size_t klen,
+                                         size_t* vlen, unsigned char exclusive,
+                                         char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s =
+      txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive);
+  if (s.ok()) {
+    *vlen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vlen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    const char* key, size_t klen, unsigned char exclusive, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = txn->rep->GetForUpdate(options->rep, Slice(key, klen),
+                                    v->rep.GetSelf(), exclusive);
+  v->rep.PinSelf();
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+char* rocksdb_transaction_get_for_update_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+    size_t* vlen, unsigned char exclusive, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
+                                    Slice(key, klen), &tmp, exclusive);
+  if (s.ok()) {
+    *vlen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vlen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+    unsigned char exclusive, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
+                                    Slice(key, klen), &v->rep, exclusive);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn,
+                                   const rocksdb_readoptions_t* options,
+                                   size_t num_keys,
+                                   const char* const* keys_list,
+                                   const size_t* keys_list_sizes,
+                                   char** values_list,
+                                   size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses =
+      txn->rep->MultiGet(options->rep, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_transaction_multi_get_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses =
+      txn->rep->MultiGet(options->rep, cfs, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+// Read a key outside a transaction
+char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db,
+                                const rocksdb_readoptions_t* options,
+                                const char* key, size_t klen, size_t* vlen,
+                                char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp);
+  if (s.ok()) {
+    *vlen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vlen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    const char* key, size_t klen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = txn_db->rep->Get(options->rep, txn_db->rep->DefaultColumnFamily(),
+                              Slice(key, klen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+char* rocksdb_transactiondb_get_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr) {
+  char* result = nullptr;
+  std::string tmp;
+  Status s = txn_db->rep->Get(options->rep, column_family->rep,
+                              Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = txn_db->rep->Get(options->rep, column_family->rep,
+                              Slice(key, keylen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+void rocksdb_transactiondb_multi_get(rocksdb_transactiondb_t* txn_db,
+                                     const rocksdb_readoptions_t* options,
+                                     size_t num_keys,
+                                     const char* const* keys_list,
+                                     const size_t* keys_list_sizes,
+                                     char** values_list,
+                                     size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses =
+      txn_db->rep->MultiGet(options->rep, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_transactiondb_multi_get_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses =
+      txn_db->rep->MultiGet(options->rep, cfs, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+// Put a key inside a transaction
+void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key,
+                             size_t klen, const char* val, size_t vlen,
+                             char** errptr) {
+  SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn,
+                                rocksdb_column_family_handle_t* column_family,
+                                const char* key, size_t klen, const char* val,
+                                size_t vlen, char** errptr) {
+  SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen),
+                                  Slice(val, vlen)));
+}
+
+void rocksdb_transaction_set_commit_timestamp(rocksdb_transaction_t* txn,
+                                              uint64_t commit_timestamp) {
+  txn->rep->SetCommitTimestamp(commit_timestamp);
+}
+
+void rocksdb_transaction_set_read_timestamp_for_validation(
+    rocksdb_transaction_t* txn, uint64_t read_timestamp) {
+  txn->rep->SetReadTimestampForValidation(read_timestamp);
+}
+
+// Put a key outside a transaction
+void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db,
+                               const rocksdb_writeoptions_t* options,
+                               const char* key, size_t klen, const char* val,
+                               size_t vlen, char** errptr) {
+  SaveError(errptr,
+            txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db,
+                                  const rocksdb_writeoptions_t* options,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  const char* key, size_t keylen,
+                                  const char* val, size_t vallen,
+                                  char** errptr) {
+  SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep,
+                                     Slice(key, keylen), Slice(val, vallen)));
+}
+
+// Write batch into transaction db
+void rocksdb_transactiondb_write(rocksdb_transactiondb_t* db,
+                                 const rocksdb_writeoptions_t* options,
+                                 rocksdb_writebatch_t* batch, char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+// Merge a key inside a transaction
+void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key,
+                               size_t klen, const char* val, size_t vlen,
+                               char** errptr) {
+  SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  const char* key, size_t klen, const char* val,
+                                  size_t vlen, char** errptr) {
+  SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen),
+                                    Slice(val, vlen)));
+}
+
+// Merge a key outside a transaction
+void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db,
+                                 const rocksdb_writeoptions_t* options,
+                                 const char* key, size_t klen, const char* val,
+                                 size_t vlen, char** errptr) {
+  SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen),
+                                       Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_merge_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+    const char* val, size_t vlen, char** errptr) {
+  SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep,
+                                       Slice(key, klen), Slice(val, vlen)));
+}
+
+// Delete a key inside a transaction
+void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key,
+                                size_t klen, char** errptr) {
+  SaveError(errptr, txn->rep->Delete(Slice(key, klen)));
+}
+
+void rocksdb_transaction_delete_cf(
+    rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, char** errptr) {
+  SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen)));
+}
+
+// Delete a key outside a transaction
+void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db,
+                                  const rocksdb_writeoptions_t* options,
+                                  const char* key, size_t klen, char** errptr) {
+  SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen)));
+}
+
+void rocksdb_transactiondb_delete_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr) {
+  SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep,
+                                        Slice(key, keylen)));
+}
+
+// Create an iterator inside a transaction
+rocksdb_iterator_t* rocksdb_transaction_create_iterator(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = txn->rep->GetIterator(options->rep);
+  return result;
+}
+
+// Create an iterator inside a transaction with column family
+rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = txn->rep->GetIterator(options->rep, column_family->rep);
+  return result;
+}
+
+// Create an iterator outside a transaction
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = txn_db->rep->NewIterator(options->rep);
+  return result;
+}
+
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep);
+  return result;
+}
+
+void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) {
+  delete txn_db->rep;
+  delete txn_db;
+}
+
+void rocksdb_transactiondb_flush_wal(rocksdb_transactiondb_t* txn_db,
+                                     unsigned char sync, char** errptr) {
+  SaveError(errptr, txn_db->rep->FlushWAL(sync));
+}
+
+void rocksdb_transactiondb_flush(rocksdb_transactiondb_t* txn_db,
+                                 const rocksdb_flushoptions_t* options,
+                                 char** errptr) {
+  SaveError(errptr, txn_db->rep->Flush(options->rep));
+}
+
+void rocksdb_transactiondb_flush_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, char** errptr) {
+  SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep));
+}
+
+rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
+    rocksdb_transactiondb_t* txn_db, char** errptr) {
+  Checkpoint* checkpoint;
+  if (SaveError(errptr, Checkpoint::Create(txn_db->rep, &checkpoint))) {
+    return nullptr;
+  }
+  rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+  result->rep = checkpoint;
+  return result;
+}
+
+rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open(
+    const rocksdb_options_t* options, const char* name, char** errptr) {
+  OptimisticTransactionDB* otxn_db;
+  if (SaveError(errptr, OptimisticTransactionDB::Open(
+                            options->rep, std::string(name), &otxn_db))) {
+    return nullptr;
+  }
+  rocksdb_optimistictransactiondb_t* result =
+      new rocksdb_optimistictransactiondb_t;
+  result->rep = otxn_db;
+  return result;
+}
+
+rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open_column_families(
+    const rocksdb_options_t* db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < num_column_families; i++) {
+    column_families.push_back(ColumnFamilyDescriptor(
+        std::string(column_family_names[i]),
+        ColumnFamilyOptions(column_family_options[i]->rep)));
+  }
+
+  OptimisticTransactionDB* otxn_db;
+  std::vector<ColumnFamilyHandle*> handles;
+  if (SaveError(errptr, OptimisticTransactionDB::Open(
+                            DBOptions(db_options->rep), std::string(name),
+                            column_families, &handles, &otxn_db))) {
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < handles.size(); i++) {
+    rocksdb_column_family_handle_t* c_handle =
+        new rocksdb_column_family_handle_t;
+    c_handle->rep = handles[i];
+    column_family_handles[i] = c_handle;
+  }
+  rocksdb_optimistictransactiondb_t* result =
+      new rocksdb_optimistictransactiondb_t;
+  result->rep = otxn_db;
+  return result;
+}
+
+rocksdb_t* rocksdb_optimistictransactiondb_get_base_db(
+    rocksdb_optimistictransactiondb_t* otxn_db) {
+  DB* base_db = otxn_db->rep->GetBaseDB();
+
+  if (base_db != nullptr) {
+    rocksdb_t* result = new rocksdb_t;
+    result->rep = base_db;
+    return result;
+  }
+
+  return nullptr;
+}
+
+void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) {
+  delete base_db;
+}
+
+rocksdb_transaction_t* rocksdb_optimistictransaction_begin(
+    rocksdb_optimistictransactiondb_t* otxn_db,
+    const rocksdb_writeoptions_t* write_options,
+    const rocksdb_optimistictransaction_options_t* otxn_options,
+    rocksdb_transaction_t* old_txn) {
+  if (old_txn == nullptr) {
+    rocksdb_transaction_t* result = new rocksdb_transaction_t;
+    result->rep = otxn_db->rep->BeginTransaction(write_options->rep,
+                                                 otxn_options->rep, nullptr);
+    return result;
+  }
+  old_txn->rep = otxn_db->rep->BeginTransaction(
+      write_options->rep, otxn_options->rep, old_txn->rep);
+  return old_txn;
+}
+
+// Write batch into OptimisticTransactionDB
+void rocksdb_optimistictransactiondb_write(
+    rocksdb_optimistictransactiondb_t* otxn_db,
+    const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch,
+    char** errptr) {
+  SaveError(errptr, otxn_db->rep->Write(options->rep, &batch->rep));
+}
+
+void rocksdb_optimistictransactiondb_close(
+    rocksdb_optimistictransactiondb_t* otxn_db) {
+  delete otxn_db->rep;
+  delete otxn_db;
+}
+
+rocksdb_checkpoint_t* rocksdb_optimistictransactiondb_checkpoint_object_create(
+    rocksdb_optimistictransactiondb_t* otxn_db, char** errptr) {
+  Checkpoint* checkpoint;
+  if (SaveError(errptr, Checkpoint::Create(otxn_db->rep, &checkpoint))) {
+    return nullptr;
+  }
+  rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+  result->rep = checkpoint;
+  return result;
+}
+
+void rocksdb_free(void* ptr) { free(ptr); }
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+                          Slice(key, keylen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &v->rep);
+  if (!s.ok()) {
+    delete v;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
+void rocksdb_pinnableslice_destroy(rocksdb_pinnableslice_t* v) { delete v; }
+
+const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v,
+                                        size_t* vlen) {
+  if (!v) {
+    *vlen = 0;
+    return nullptr;
+  }
+
+  *vlen = v->rep.size();
+  return v->rep.data();
+}
+
+// container to keep databases and caches in order to use
+// ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_consumers_t {
+  std::vector<rocksdb_t*> dbs;
+  std::unordered_set<rocksdb_cache_t*> caches;
+};
+
+// initializes new container of memory consumers
+rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() {
+  return new rocksdb_memory_consumers_t;
+}
+
+// adds datatabase to the container of memory consumers
+void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers,
+                                     rocksdb_t* db) {
+  consumers->dbs.push_back(db);
+}
+
+// adds cache to the container of memory consumers
+void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers,
+                                        rocksdb_cache_t* cache) {
+  consumers->caches.insert(cache);
+}
+
+// deletes container with memory consumers
+void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) {
+  delete consumers;
+}
+
+// contains memory usage statistics provided by ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_usage_t {
+  uint64_t mem_table_total;
+  uint64_t mem_table_unflushed;
+  uint64_t mem_table_readers_total;
+  uint64_t cache_total;
+};
+
+// estimates amount of memory occupied by consumers (dbs and caches)
+rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
+    rocksdb_memory_consumers_t* consumers, char** errptr) {
+  vector<DB*> dbs;
+  for (auto db : consumers->dbs) {
+    dbs.push_back(db->rep);
+  }
+
+  unordered_set<const Cache*> cache_set;
+  for (auto cache : consumers->caches) {
+    cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
+  }
+
+  std::map<ROCKSDB_NAMESPACE::MemoryUtil::UsageType, uint64_t> usage_by_type;
+
+  auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
+                                                            &usage_by_type);
+  if (SaveError(errptr, status)) {
+    return nullptr;
+  }
+
+  auto result = new rocksdb_memory_usage_t;
+  result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal];
+  result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed];
+  result->mem_table_readers_total =
+      usage_by_type[MemoryUtil::kTableReadersTotal];
+  result->cache_total = usage_by_type[MemoryUtil::kCacheTotal];
+  return result;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_total(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->mem_table_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->mem_table_unflushed;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->mem_table_readers_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_cache_total(
+    rocksdb_memory_usage_t* memory_usage) {
+  return memory_usage->cache_total;
+}
+
+void rocksdb_options_set_dump_malloc_stats(rocksdb_options_t* opt,
+                                           unsigned char val) {
+  opt->rep.dump_malloc_stats = val;
+}
+
+void rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t* opt,
+                                                      unsigned char val) {
+  opt->rep.memtable_whole_key_filtering = val;
+}
+
+void rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t* opt,
+                                                       unsigned char val) {
+  opt->rep.avoid_unnecessary_blocking_io = val;
+}
+
+unsigned char rocksdb_options_get_avoid_unnecessary_blocking_io(
+    rocksdb_options_t* opt) {
+  return opt->rep.avoid_unnecessary_blocking_io;
+}
+
+// deletes container with memory usage estimates
+void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) {
+  delete usage;
+}
+
+void rocksdb_cancel_all_background_work(rocksdb_t* db, unsigned char wait) {
+  CancelAllBackgroundWork(db->rep, wait);
+}
+
+void rocksdb_disable_manual_compaction(rocksdb_t* db) {
+  db->rep->DisableManualCompaction();
+}
+
+void rocksdb_enable_manual_compaction(rocksdb_t* db) {
+  db->rep->EnableManualCompaction();
+}
+
+}  // end extern "C"
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/c_test.c b/src/rocksdb/db/c_test.c
new file mode 100644
index 000000000..249ab9023
--- /dev/null
+++ b/src/rocksdb/db/c_test.c
@@ -0,0 +1,3476 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors. */
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <stdio.h>
+
+#ifndef ROCKSDB_LITE  // Lite does not support C API
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "rocksdb/c.h"
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <inttypes.h>
+
+// Can not use port/port.h macros as this is a c file
+#ifdef OS_WIN
+#include <windows.h>
+
+// Ok for uniqueness
+int geteuid() {
+  int result = 0;
+
+  result = ((int)GetCurrentProcessId() << 16);
+  result |= (int)GetCurrentThreadId();
+
+  return result;
+}
+
+#endif
+
+const char* phase = "";
+static char dbname[200];
+static char sstfilename[200];
+static char dbbackupname[200];
+static char dbcheckpointname[200];
+static char dbpathname[200];
+static char secondary_path[200];
+
+static void StartPhase(const char* name) {
+  fprintf(stderr, "=== Test %s\n", name);
+  phase = name;
+}
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4996)  // getenv security warning
+#endif
+static const char* GetTempDir(void) {
+  const char* ret = getenv("TEST_TMPDIR");
+  if (ret == NULL || ret[0] == '\0')
+#ifdef OS_WIN
+    ret = getenv("TEMP");
+#else
+    ret = "/tmp";
+#endif
+  return ret;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#define CheckNoError(err)                                                 \
+  if ((err) != NULL) {                                                    \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+    abort();                                                              \
+  }
+
+#define CheckCondition(cond)                                              \
+  if (!(cond)) {                                                          \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+    abort();                                                              \
+  }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+  if (expected == NULL && v == NULL) {
+    // ok
+  } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+             memcmp(expected, v, n) == 0) {
+    // ok
+    return;
+  } else {
+    fprintf(stderr, "%s: expected '%s', got '%s'\n", phase,
+            (expected ? expected : "(null)"), (v ? v : "(null)"));
+    abort();
+  }
+}
+
+static void Free(char** ptr) {
+  if (*ptr) {
+    free(*ptr);
+    *ptr = NULL;
+  }
+}
+
+static void CheckValue(char* err, const char* expected, char** actual,
+                       size_t actual_length) {
+  CheckNoError(err);
+  CheckEqual(expected, *actual, actual_length);
+  Free(actual);
+}
+
+static void CheckGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                     const char* key, const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                       rocksdb_column_family_handle_t* handle, const char* key,
+                       const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_get_cf(db, options, handle, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckPinGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                        const char* key, const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  const char* val;
+  rocksdb_pinnableslice_t* p;
+  p = rocksdb_get_pinned(db, options, key, strlen(key), &err);
+  CheckNoError(err);
+  val = rocksdb_pinnableslice_value(p, &val_len);
+  CheckEqual(expected, val, val_len);
+  rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckPinGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options,
+                          rocksdb_column_family_handle_t* handle,
+                          const char* key, const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  const char* val;
+  rocksdb_pinnableslice_t* p;
+  p = rocksdb_get_pinned_cf(db, options, handle, key, strlen(key), &err);
+  CheckNoError(err);
+  val = rocksdb_pinnableslice_value(p, &val_len);
+  CheckEqual(expected, val, val_len);
+  rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckMultiGetValues(size_t num_keys, char** values,
+                                size_t* values_sizes, char** errs,
+                                const char** expected) {
+  for (size_t i = 0; i < num_keys; i++) {
+    CheckNoError(errs[i]);
+    CheckEqual(expected[i], values[i], values_sizes[i]);
+    Free(&values[i]);
+  }
+}
+
+static void CheckIter(rocksdb_iterator_t* iter, const char* key,
+                      const char* val) {
+  size_t len;
+  const char* str;
+  str = rocksdb_iter_key(iter, &len);
+  CheckEqual(key, str, len);
+  str = rocksdb_iter_value(iter, &len);
+  CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckPut(void* ptr, const char* k, size_t klen, const char* v,
+                     size_t vlen) {
+  int* state = (int*)ptr;
+  CheckCondition(*state < 2);
+  switch (*state) {
+    case 0:
+      CheckEqual("bar", k, klen);
+      CheckEqual("b", v, vlen);
+      break;
+    case 1:
+      CheckEqual("box", k, klen);
+      CheckEqual("c", v, vlen);
+      break;
+  }
+  (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+  int* state = (int*)ptr;
+  CheckCondition(*state == 2);
+  CheckEqual("bar", k, klen);
+  (*state)++;
+}
+
+static void CmpDestroy(void* arg) { (void)arg; }
+
+static int CmpCompare(void* arg, const char* a, size_t alen, const char* b,
+                      size_t blen) {
+  (void)arg;
+  size_t n = (alen < blen) ? alen : blen;
+  int r = memcmp(a, b, n);
+  if (r == 0) {
+    if (alen < blen)
+      r = -1;
+    else if (alen > blen)
+      r = +1;
+  }
+  return r;
+}
+
+static const char* CmpName(void* arg) {
+  (void)arg;
+  return "foo";
+}
+
+// Custom compaction filter
+static void CFilterDestroy(void* arg) { (void)arg; }
+static const char* CFilterName(void* arg) {
+  (void)arg;
+  return "foo";
+}
+static unsigned char CFilterFilter(void* arg, int level, const char* key,
+                                   size_t key_length,
+                                   const char* existing_value,
+                                   size_t value_length, char** new_value,
+                                   size_t* new_value_length,
+                                   unsigned char* value_changed) {
+  (void)arg;
+  (void)level;
+  (void)existing_value;
+  (void)value_length;
+  if (key_length == 3) {
+    if (memcmp(key, "bar", key_length) == 0) {
+      return 1;
+    } else if (memcmp(key, "baz", key_length) == 0) {
+      *value_changed = 1;
+      *new_value = "newbazvalue";
+      *new_value_length = 11;
+      return 0;
+    }
+  }
+  return 0;
+}
+
+static void CFilterFactoryDestroy(void* arg) { (void)arg; }
+static const char* CFilterFactoryName(void* arg) {
+  (void)arg;
+  return "foo";
+}
+static rocksdb_compactionfilter_t* CFilterCreate(
+    void* arg, rocksdb_compactionfiltercontext_t* context) {
+  (void)arg;
+  (void)context;
+  return rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter,
+                                         CFilterName);
+}
+
+void CheckMetaData(rocksdb_column_family_metadata_t* cf_meta,
+                   const char* expected_cf_name) {
+  char* cf_name = rocksdb_column_family_metadata_get_name(cf_meta);
+  assert(strcmp(cf_name, expected_cf_name) == 0);
+  rocksdb_free(cf_name);
+
+  size_t cf_size = rocksdb_column_family_metadata_get_size(cf_meta);
+  assert(cf_size > 0);
+  size_t cf_file_count = rocksdb_column_family_metadata_get_size(cf_meta);
+  assert(cf_file_count > 0);
+
+  uint64_t total_level_size = 0;
+  size_t total_file_count = 0;
+  size_t level_count = rocksdb_column_family_metadata_get_level_count(cf_meta);
+  assert(level_count > 0);
+  for (size_t l = 0; l < level_count; ++l) {
+    rocksdb_level_metadata_t* level_meta =
+        rocksdb_column_family_metadata_get_level_metadata(cf_meta, l);
+    assert(level_meta);
+    assert(rocksdb_level_metadata_get_level(level_meta) >= (int)l);
+    uint64_t level_size = rocksdb_level_metadata_get_size(level_meta);
+    uint64_t file_size_in_level = 0;
+
+    size_t file_count = rocksdb_level_metadata_get_file_count(level_meta);
+    total_file_count += file_count;
+    for (size_t f = 0; f < file_count; ++f) {
+      rocksdb_sst_file_metadata_t* file_meta =
+          rocksdb_level_metadata_get_sst_file_metadata(level_meta, f);
+      assert(file_meta);
+
+      uint64_t file_size = rocksdb_sst_file_metadata_get_size(file_meta);
+      assert(file_size > 0);
+      file_size_in_level += file_size;
+
+      char* file_name =
+          rocksdb_sst_file_metadata_get_relative_filename(file_meta);
+      assert(file_name);
+      assert(strlen(file_name) > 0);
+      rocksdb_free(file_name);
+
+      size_t smallest_key_len;
+      char* smallest_key = rocksdb_sst_file_metadata_get_smallestkey(
+          file_meta, &smallest_key_len);
+      assert(smallest_key);
+      assert(smallest_key_len > 0);
+      size_t largest_key_len;
+      char* largest_key =
+          rocksdb_sst_file_metadata_get_largestkey(file_meta, &largest_key_len);
+      assert(largest_key);
+      assert(largest_key_len > 0);
+      rocksdb_free(smallest_key);
+      rocksdb_free(largest_key);
+
+      rocksdb_sst_file_metadata_destroy(file_meta);
+    }
+    assert(level_size == file_size_in_level);
+    total_level_size += level_size;
+    rocksdb_level_metadata_destroy(level_meta);
+  }
+  assert(total_file_count > 0);
+  assert(cf_size == total_level_size);
+}
+
+void GetAndCheckMetaData(rocksdb_t* db) {
+  rocksdb_column_family_metadata_t* cf_meta =
+      rocksdb_get_column_family_metadata(db);
+
+  CheckMetaData(cf_meta, "default");
+
+  rocksdb_column_family_metadata_destroy(cf_meta);
+}
+
+void GetAndCheckMetaDataCf(rocksdb_t* db,
+                           rocksdb_column_family_handle_t* handle,
+                           const char* cf_name) {
+  // Compact to make sure we have at least one sst file to obtain datadata.
+  rocksdb_compact_range_cf(db, handle, NULL, 0, NULL, 0);
+
+  rocksdb_column_family_metadata_t* cf_meta =
+      rocksdb_get_column_family_metadata_cf(db, handle);
+
+  CheckMetaData(cf_meta, cf_name);
+
+  rocksdb_column_family_metadata_destroy(cf_meta);
+}
+
+static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options,
+                                  rocksdb_readoptions_t* roptions,
+                                  rocksdb_writeoptions_t* woptions) {
+  char* err = NULL;
+  db = rocksdb_open(options, dbname, &err);
+  CheckNoError(err);
+  rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", "foovalue");
+  rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "bar", "barvalue");
+  rocksdb_put(db, woptions, "baz", 3, "bazvalue", 8, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "baz", "bazvalue");
+
+  // Disable compaction
+  rocksdb_disable_manual_compaction(db);
+  rocksdb_compact_range(db, NULL, 0, NULL, 0);
+  // should not filter anything when disabled
+  CheckGet(db, roptions, "foo", "foovalue");
+  CheckGet(db, roptions, "bar", "barvalue");
+  CheckGet(db, roptions, "baz", "bazvalue");
+  // Reenable compaction
+  rocksdb_enable_manual_compaction(db);
+
+  // Force compaction
+  rocksdb_compact_range(db, NULL, 0, NULL, 0);
+  // should have filtered bar, but not foo
+  CheckGet(db, roptions, "foo", "foovalue");
+  CheckGet(db, roptions, "bar", NULL);
+  CheckGet(db, roptions, "baz", "newbazvalue");
+
+  rocksdb_suggest_compact_range(db, "bar", 3, "foo", 3, &err);
+  GetAndCheckMetaData(db);
+  CheckNoError(err);
+
+  return db;
+}
+
+// Custom merge operator
+static void MergeOperatorDestroy(void* arg) { (void)arg; }
+static const char* MergeOperatorName(void* arg) {
+  (void)arg;
+  return "TestMergeOperator";
+}
+static char* MergeOperatorFullMerge(
+    void* arg, const char* key, size_t key_length, const char* existing_value,
+    size_t existing_value_length, const char* const* operands_list,
+    const size_t* operands_list_length, int num_operands,
+    unsigned char* success, size_t* new_value_length) {
+  (void)arg;
+  (void)key;
+  (void)key_length;
+  (void)existing_value;
+  (void)existing_value_length;
+  (void)operands_list;
+  (void)operands_list_length;
+  (void)num_operands;
+  *new_value_length = 4;
+  *success = 1;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+static char* MergeOperatorPartialMerge(void* arg, const char* key,
+                                       size_t key_length,
+                                       const char* const* operands_list,
+                                       const size_t* operands_list_length,
+                                       int num_operands, unsigned char* success,
+                                       size_t* new_value_length) {
+  (void)arg;
+  (void)key;
+  (void)key_length;
+  (void)operands_list;
+  (void)operands_list_length;
+  (void)num_operands;
+  *new_value_length = 4;
+  *success = 1;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+
+static void CheckTxnGet(rocksdb_transaction_t* txn,
+                        const rocksdb_readoptions_t* options, const char* key,
+                        const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_transaction_get(txn, options, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckTxnGetCF(rocksdb_transaction_t* txn,
+                          const rocksdb_readoptions_t* options,
+                          rocksdb_column_family_handle_t* column_family,
+                          const char* key, const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_transaction_get_cf(txn, options, column_family, key,
+                                   strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckTxnPinGet(rocksdb_transaction_t* txn,
+                           const rocksdb_readoptions_t* options,
+                           const char* key, const char* expected) {
+  rocksdb_pinnableslice_t* p = NULL;
+  const char* val = NULL;
+  char* err = NULL;
+  size_t val_len;
+  p = rocksdb_transaction_get_pinned(txn, options, key, strlen(key), &err);
+  CheckNoError(err);
+  val = rocksdb_pinnableslice_value(p, &val_len);
+  CheckEqual(expected, val, val_len);
+  rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckTxnPinGetCF(rocksdb_transaction_t* txn,
+                             const rocksdb_readoptions_t* options,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* key, const char* expected) {
+  rocksdb_pinnableslice_t* p = NULL;
+  const char* val = NULL;
+  char* err = NULL;
+  size_t val_len;
+  p = rocksdb_transaction_get_pinned_cf(txn, options, column_family, key,
+                                        strlen(key), &err);
+  CheckNoError(err);
+  val = rocksdb_pinnableslice_value(p, &val_len);
+  CheckEqual(expected, val, val_len);
+  rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckTxnDBGet(rocksdb_transactiondb_t* txn_db,
+                          const rocksdb_readoptions_t* options, const char* key,
+                          const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_transactiondb_get(txn_db, options, key, strlen(key), &val_len,
+                                  &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db,
+                            const rocksdb_readoptions_t* options,
+                            rocksdb_column_family_handle_t* column_family,
+                            const char* key, const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = rocksdb_transactiondb_get_cf(txn_db, options, column_family, key,
+                                     strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckTxnDBPinGet(rocksdb_transactiondb_t* txn_db,
+                             const rocksdb_readoptions_t* options,
+                             const char* key, const char* expected) {
+  rocksdb_pinnableslice_t* p = NULL;
+  const char* val = NULL;
+  char* err = NULL;
+  size_t val_len;
+  p = rocksdb_transactiondb_get_pinned(txn_db, options, key, strlen(key), &err);
+  CheckNoError(err);
+  val = rocksdb_pinnableslice_value(p, &val_len);
+  CheckEqual(expected, val, val_len);
+  rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckTxnDBPinGetCF(rocksdb_transactiondb_t* txn_db,
+                               const rocksdb_readoptions_t* options,
+                               rocksdb_column_family_handle_t* column_family,
+                               const char* key, const char* expected) {
+  rocksdb_pinnableslice_t* p = NULL;
+  const char* val = NULL;
+  char* err = NULL;
+  size_t val_len;
+  p = rocksdb_transactiondb_get_pinned_cf(txn_db, options, column_family, key,
+                                          strlen(key), &err);
+  CheckNoError(err);
+  val = rocksdb_pinnableslice_value(p, &val_len);
+  CheckEqual(expected, val, val_len);
+  rocksdb_pinnableslice_destroy(p);
+}
+
+static void LoadAndCheckLatestOptions(const char* db_name, rocksdb_env_t* env,
+                                      bool ignore_unknown_options,
+                                      rocksdb_cache_t* cache,
+                                      rocksdb_comparator_t* cmp,
+                                      const size_t expected_num_column_families,
+                                      const char** expected_cf_names,
+                                      const char* expected_open_err) {
+  rocksdb_options_t* db_options;
+  size_t num_column_families;
+  char** list_column_family_names;
+  rocksdb_options_t** list_column_family_options;
+  char* err = 0;
+
+  // load the latest rocksdb option
+  rocksdb_load_latest_options(db_name, env, ignore_unknown_options, cache,
+                              &db_options, &num_column_families,
+                              &list_column_family_names,
+                              &list_column_family_options, &err);
+  assert(num_column_families == expected_num_column_families);
+  CheckNoError(err);
+
+  // verify the loaded options by opening the db.
+  rocksdb_options_set_error_if_exists(db_options, 0);
+
+  char** list_const_cf_names =
+      (char**)malloc(num_column_families * sizeof(char*));
+  rocksdb_options_t** list_const_cf_options = (rocksdb_options_t**)malloc(
+      num_column_families * sizeof(rocksdb_options_t*));
+  for (size_t i = 0; i < num_column_families; ++i) {
+    assert(strcmp(list_column_family_names[i], expected_cf_names[i]) == 0);
+    list_const_cf_names[i] = list_column_family_names[i];
+    if (cmp) {
+      rocksdb_options_set_comparator(list_column_family_options[i], cmp);
+    }
+    list_const_cf_options[i] = list_column_family_options[i];
+  }
+  rocksdb_column_family_handle_t** handles =
+      (rocksdb_column_family_handle_t**)malloc(
+          num_column_families * sizeof(rocksdb_column_family_handle_t*));
+
+  rocksdb_t* db = rocksdb_open_column_families(
+      db_options, db_name, (int)num_column_families,
+      (const char* const*)list_const_cf_names,
+      (const rocksdb_options_t* const*)list_const_cf_options, handles, &err);
+  if (expected_open_err == NULL) {
+    CheckNoError(err);
+    for (size_t i = 0; i < num_column_families; ++i) {
+      rocksdb_column_family_handle_destroy(handles[i]);
+    }
+    free(handles);
+    rocksdb_close(db);
+  } else {
+    assert(err != NULL);
+    assert(strcmp(err, expected_open_err) == 0);
+    free(handles);
+    free(err);
+  }
+
+  free(list_const_cf_names);
+  free(list_const_cf_options);
+  rocksdb_load_latest_options_destroy(db_options, list_column_family_names,
+                                      list_column_family_options,
+                                      num_column_families);
+}
+
+int main(int argc, char** argv) {
+  (void)argc;
+  (void)argv;
+  rocksdb_t* db;
+  rocksdb_comparator_t* cmp;
+  rocksdb_cache_t* cache;
+  rocksdb_dbpath_t* dbpath;
+  rocksdb_env_t* env;
+  rocksdb_options_t* options;
+  rocksdb_compactoptions_t* coptions;
+  rocksdb_block_based_table_options_t* table_options;
+  rocksdb_readoptions_t* roptions;
+  rocksdb_writeoptions_t* woptions;
+  rocksdb_ratelimiter_t* rate_limiter;
+  rocksdb_transactiondb_t* txn_db;
+  rocksdb_transactiondb_options_t* txn_db_options;
+  rocksdb_transaction_t* txn;
+  rocksdb_transaction_options_t* txn_options;
+  rocksdb_optimistictransactiondb_t* otxn_db;
+  rocksdb_optimistictransaction_options_t* otxn_options;
+  char* err = NULL;
+  int run = -1;
+
+  snprintf(dbname, sizeof(dbname), "%s/rocksdb_c_test-%d", GetTempDir(),
+           ((int)geteuid()));
+
+  snprintf(dbbackupname, sizeof(dbbackupname), "%s/rocksdb_c_test-%d-backup",
+           GetTempDir(), ((int)geteuid()));
+
+  snprintf(dbcheckpointname, sizeof(dbcheckpointname),
+           "%s/rocksdb_c_test-%d-checkpoint", GetTempDir(), ((int)geteuid()));
+
+  snprintf(sstfilename, sizeof(sstfilename), "%s/rocksdb_c_test-%d-sst",
+           GetTempDir(), ((int)geteuid()));
+
+  snprintf(dbpathname, sizeof(dbpathname), "%s/rocksdb_c_test-%d-dbpath",
+           GetTempDir(), ((int)geteuid()));
+
+  StartPhase("create_objects");
+  cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+  dbpath = rocksdb_dbpath_create(dbpathname, 1024 * 1024);
+  env = rocksdb_create_default_env();
+
+  rocksdb_create_dir_if_missing(env, GetTempDir(), &err);
+  CheckNoError(err);
+
+  cache = rocksdb_cache_create_lru(100000);
+
+  options = rocksdb_options_create();
+  rocksdb_options_set_comparator(options, cmp);
+  rocksdb_options_set_error_if_exists(options, 1);
+  rocksdb_options_set_env(options, env);
+  rocksdb_options_set_info_log(options, NULL);
+  rocksdb_options_set_write_buffer_size(options, 100000);
+  rocksdb_options_set_paranoid_checks(options, 1);
+  rocksdb_options_set_max_open_files(options, 10);
+
+  table_options = rocksdb_block_based_options_create();
+  rocksdb_block_based_options_set_block_cache(table_options, cache);
+  rocksdb_block_based_options_set_data_block_index_type(table_options, 1);
+  rocksdb_block_based_options_set_data_block_hash_ratio(table_options, 0.75);
+  rocksdb_options_set_block_based_table_factory(options, table_options);
+
+  rocksdb_options_set_compression(options, rocksdb_no_compression);
+  rocksdb_options_set_compression_options(options, -14, -1, 0, 0);
+  int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+                              rocksdb_no_compression, rocksdb_no_compression};
+  rocksdb_options_set_compression_per_level(options, compression_levels, 4);
+  rate_limiter = rocksdb_ratelimiter_create(1000 * 1024 * 1024, 100 * 1000, 10);
+  rocksdb_options_set_ratelimiter(options, rate_limiter);
+  rocksdb_ratelimiter_destroy(rate_limiter);
+
+  roptions = rocksdb_readoptions_create();
+  rocksdb_readoptions_set_verify_checksums(roptions, 1);
+  rocksdb_readoptions_set_fill_cache(roptions, 1);
+
+  woptions = rocksdb_writeoptions_create();
+  rocksdb_writeoptions_set_sync(woptions, 1);
+
+  coptions = rocksdb_compactoptions_create();
+  rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1);
+
+  rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000,
+                                                            10001);
+
+  StartPhase("destroy");
+  rocksdb_destroy_db(options, dbname, &err);
+  Free(&err);
+
+  StartPhase("open_error");
+  rocksdb_open(options, dbname, &err);
+  CheckCondition(err != NULL);
+  Free(&err);
+
+  StartPhase("open");
+  rocksdb_options_set_create_if_missing(options, 1);
+  db = rocksdb_open(options, dbname, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", NULL);
+
+  StartPhase("put");
+  rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("backup_and_restore");
+  {
+    rocksdb_destroy_db(options, dbbackupname, &err);
+    CheckNoError(err);
+
+    rocksdb_backup_engine_t* be =
+        rocksdb_backup_engine_open(options, dbbackupname, &err);
+    CheckNoError(err);
+
+    rocksdb_backup_engine_create_new_backup(be, db, &err);
+    CheckNoError(err);
+
+    // need a change to trigger a new backup
+    rocksdb_delete(db, woptions, "does-not-exist", 14, &err);
+    CheckNoError(err);
+
+    rocksdb_backup_engine_create_new_backup(be, db, &err);
+    CheckNoError(err);
+
+    const rocksdb_backup_engine_info_t* bei =
+        rocksdb_backup_engine_get_backup_info(be);
+    CheckCondition(rocksdb_backup_engine_info_count(bei) > 1);
+    rocksdb_backup_engine_info_destroy(bei);
+
+    rocksdb_backup_engine_purge_old_backups(be, 1, &err);
+    CheckNoError(err);
+
+    bei = rocksdb_backup_engine_get_backup_info(be);
+    CheckCondition(rocksdb_backup_engine_info_count(bei) == 1);
+    rocksdb_backup_engine_info_destroy(bei);
+
+    rocksdb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+
+    rocksdb_close(db);
+
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_restore_options_t* restore_options =
+        rocksdb_restore_options_create();
+    rocksdb_restore_options_set_keep_log_files(restore_options, 0);
+    rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname,
+                                                        restore_options, &err);
+    CheckNoError(err);
+    rocksdb_restore_options_destroy(restore_options);
+
+    rocksdb_options_set_error_if_exists(options, 0);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_options_set_error_if_exists(options, 1);
+
+    CheckGet(db, roptions, "foo", "hello");
+
+    rocksdb_backup_engine_close(be);
+  }
+
+  StartPhase("checkpoint");
+  {
+    rocksdb_destroy_db(options, dbcheckpointname, &err);
+    CheckNoError(err);
+
+    rocksdb_checkpoint_t* checkpoint =
+        rocksdb_checkpoint_object_create(db, &err);
+    CheckNoError(err);
+
+    rocksdb_checkpoint_create(checkpoint, dbcheckpointname, 0, &err);
+    CheckNoError(err);
+
+    // start a new database from the checkpoint
+    rocksdb_close(db);
+    rocksdb_options_set_error_if_exists(options, 0);
+    db = rocksdb_open(options, dbcheckpointname, &err);
+    CheckNoError(err);
+
+    CheckGet(db, roptions, "foo", "hello");
+
+    rocksdb_checkpoint_object_destroy(checkpoint);
+
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbcheckpointname, &err);
+    CheckNoError(err);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("compactall");
+  rocksdb_compact_range(db, NULL, 0, NULL, 0);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactrange");
+  rocksdb_compact_range(db, "a", 1, "z", 1);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactallopt");
+  rocksdb_compact_range_opt(db, coptions, NULL, 0, NULL, 0);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactrangeopt");
+  rocksdb_compact_range_opt(db, coptions, "a", 1, "z", 1);
+  CheckGet(db, roptions, "foo", "hello");
+
+  // Simple check cache usage
+  StartPhase("cache_usage");
+  {
+    rocksdb_readoptions_set_pin_data(roptions, 1);
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    rocksdb_iter_seek(iter, "foo", 3);
+
+    size_t usage = rocksdb_cache_get_usage(cache);
+    CheckCondition(usage > 0);
+
+    size_t pin_usage = rocksdb_cache_get_pinned_usage(cache);
+    CheckCondition(pin_usage > 0);
+
+    rocksdb_iter_next(iter);
+    rocksdb_iter_destroy(iter);
+    rocksdb_readoptions_set_pin_data(roptions, 0);
+  }
+
+  StartPhase("addfile");
+  {
+    rocksdb_envoptions_t* env_opt = rocksdb_envoptions_create();
+    rocksdb_options_t* io_options = rocksdb_options_create();
+    rocksdb_sstfilewriter_t* writer =
+        rocksdb_sstfilewriter_create(env_opt, io_options);
+
+    remove(sstfilename);
+    rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v2", 2, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v3", 2, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_finish(writer, &err);
+    CheckNoError(err);
+
+    rocksdb_ingestexternalfileoptions_t* ing_opt =
+        rocksdb_ingestexternalfileoptions_create();
+    const char* file_list[1] = {sstfilename};
+    rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "sstk1", "v1");
+    CheckGet(db, roptions, "sstk2", "v2");
+    CheckGet(db, roptions, "sstk3", "v3");
+
+    remove(sstfilename);
+    rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "sstk22", 6, "v5", 2, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v6", 2, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_finish(writer, &err);
+    CheckNoError(err);
+
+    rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "sstk1", "v1");
+    CheckGet(db, roptions, "sstk2", "v4");
+    CheckGet(db, roptions, "sstk22", "v5");
+    CheckGet(db, roptions, "sstk3", "v6");
+
+    rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "abc1", 4, "v7", 2, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "abc2", 4, "v8", 2, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "abc3", 4, "v9", 2, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_put(writer, "abc4", 4, "v10", 3, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_delete_range(writer, "abc1", 4, "abc4", 4, &err);
+    CheckNoError(err);
+    rocksdb_sstfilewriter_finish(writer, &err);
+    CheckNoError(err);
+
+    rocksdb_ingestexternalfileoptions_destroy(ing_opt);
+    rocksdb_sstfilewriter_destroy(writer);
+    rocksdb_options_destroy(io_options);
+    rocksdb_envoptions_destroy(env_opt);
+
+    // Delete all keys we just ingested
+    rocksdb_delete(db, woptions, "sstk1", 5, &err);
+    CheckNoError(err);
+    rocksdb_delete(db, woptions, "sstk2", 5, &err);
+    CheckNoError(err);
+    rocksdb_delete(db, woptions, "sstk22", 6, &err);
+    CheckNoError(err);
+    rocksdb_delete(db, woptions, "sstk3", 5, &err);
+    CheckNoError(err);
+  }
+
+  StartPhase("writebatch");
+  {
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+    rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+    rocksdb_writebatch_delete(wb, "bar", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "hello");
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    int pos = 0;
+    rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+    CheckCondition(pos == 3);
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+    rocksdb_writebatch_put(wb, "bay", 3, "d", 1);
+    rocksdb_writebatch_delete_range(wb, "bar", 3, "bay", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "bay", "d");
+    rocksdb_writebatch_clear(wb);
+    const char* start_list[1] = {"bay"};
+    const size_t start_sizes[1] = {3};
+    const char* end_list[1] = {"baz"};
+    const size_t end_sizes[1] = {3};
+    rocksdb_writebatch_delete_rangev(wb, 1, start_list, start_sizes, end_list,
+                                     end_sizes);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "bay", NULL);
+    rocksdb_writebatch_destroy(wb);
+  }
+
+  StartPhase("writebatch_vectors");
+  {
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    const char* k_list[2] = {"z", "ap"};
+    const size_t k_sizes[2] = {1, 2};
+    const char* v_list[3] = {"x", "y", "z"};
+    const size_t v_sizes[3] = {1, 1, 1};
+    rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "zap", "xyz");
+    rocksdb_writebatch_delete(wb, "zap", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "zap", NULL);
+    rocksdb_writebatch_destroy(wb);
+  }
+
+  StartPhase("writebatch_savepoint");
+  {
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    rocksdb_writebatch_set_save_point(wb);
+    rocksdb_writebatch_set_save_point(wb);
+    const char* k_list[2] = {"z", "ap"};
+    const size_t k_sizes[2] = {1, 2};
+    const char* v_list[3] = {"x", "y", "z"};
+    const size_t v_sizes[3] = {1, 1, 1};
+    rocksdb_writebatch_pop_save_point(wb, &err);
+    CheckNoError(err);
+    rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+    rocksdb_writebatch_rollback_to_save_point(wb, &err);
+    CheckNoError(err);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "zap", NULL);
+    rocksdb_writebatch_destroy(wb);
+  }
+
+  StartPhase("writebatch_rep");
+  {
+    rocksdb_writebatch_t* wb1 = rocksdb_writebatch_create();
+    rocksdb_writebatch_put(wb1, "baz", 3, "d", 1);
+    rocksdb_writebatch_put(wb1, "quux", 4, "e", 1);
+    rocksdb_writebatch_delete(wb1, "quux", 4);
+    size_t repsize1 = 0;
+    const char* rep = rocksdb_writebatch_data(wb1, &repsize1);
+    rocksdb_writebatch_t* wb2 = rocksdb_writebatch_create_from(rep, repsize1);
+    CheckCondition(rocksdb_writebatch_count(wb1) ==
+                   rocksdb_writebatch_count(wb2));
+    size_t repsize2 = 0;
+    CheckCondition(
+        memcmp(rep, rocksdb_writebatch_data(wb2, &repsize2), repsize1) == 0);
+    rocksdb_writebatch_destroy(wb1);
+    rocksdb_writebatch_destroy(wb2);
+  }
+
+  StartPhase("writebatch_wi");
+  {
+    rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+    rocksdb_writebatch_wi_put(wbi, "foo", 3, "a", 1);
+    rocksdb_writebatch_wi_clear(wbi);
+    rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+    rocksdb_writebatch_wi_put(wbi, "box", 3, "c", 1);
+    rocksdb_writebatch_wi_delete(wbi, "bar", 3);
+    int count = rocksdb_writebatch_wi_count(wbi);
+    CheckCondition(count == 3);
+    size_t size;
+    char* value;
+    value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size,
+                                                 &err);
+    CheckValue(err, "c", &value, size);
+    value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "bar", 3, &size,
+                                                 &err);
+    CheckValue(err, NULL, &value, size);
+    value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
+                                                        "foo", 3, &size, &err);
+    CheckValue(err, "hello", &value, size);
+    value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
+                                                        "box", 3, &size, &err);
+    CheckValue(err, "c", &value, size);
+    rocksdb_write_writebatch_wi(db, woptions, wbi, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "hello");
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    int pos = 0;
+    rocksdb_writebatch_wi_iterate(wbi, &pos, CheckPut, CheckDel);
+    CheckCondition(pos == 3);
+    rocksdb_writebatch_wi_clear(wbi);
+    rocksdb_writebatch_wi_destroy(wbi);
+  }
+
+  StartPhase("writebatch_wi_vectors");
+  {
+    rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+    const char* k_list[2] = {"z", "ap"};
+    const size_t k_sizes[2] = {1, 2};
+    const char* v_list[3] = {"x", "y", "z"};
+    const size_t v_sizes[3] = {1, 1, 1};
+    rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+    rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "zap", "xyz");
+    rocksdb_writebatch_wi_delete(wb, "zap", 3);
+    rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "zap", NULL);
+    rocksdb_writebatch_wi_destroy(wb);
+  }
+
+  StartPhase("writebatch_wi_savepoint");
+  {
+    rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+    rocksdb_writebatch_wi_set_save_point(wb);
+    const char* k_list[2] = {"z", "ap"};
+    const size_t k_sizes[2] = {1, 2};
+    const char* v_list[3] = {"x", "y", "z"};
+    const size_t v_sizes[3] = {1, 1, 1};
+    rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+    rocksdb_writebatch_wi_rollback_to_save_point(wb, &err);
+    CheckNoError(err);
+    rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "zap", NULL);
+    rocksdb_writebatch_wi_destroy(wb);
+  }
+
+  StartPhase("iter");
+  {
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_prev(iter);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_prev(iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_last(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_seek_for_prev(iter, "g", 1);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_seek_for_prev(iter, "box", 3);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+  }
+
+  StartPhase("wbwi_iter");
+  {
+    rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions);
+    rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+    rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+    rocksdb_writebatch_wi_delete(wbi, "foo", 3);
+    rocksdb_iterator_t* iter =
+        rocksdb_writebatch_wi_create_iterator_with_base(wbi, base_iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "bar", "b");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_prev(iter);
+    CheckIter(iter, "bar", "b");
+    rocksdb_iter_prev(iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_last(iter);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "bar", "b");
+    rocksdb_iter_seek_for_prev(iter, "c", 1);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_seek_for_prev(iter, "box", 3);
+    CheckIter(iter, "box", "c");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+    rocksdb_writebatch_wi_destroy(wbi);
+  }
+
+  StartPhase("multiget");
+  {
+    const char* keys[3] = {"box", "foo", "notfound"};
+    const size_t keys_sizes[3] = {3, 3, 8};
+    char* vals[3];
+    size_t vals_sizes[3];
+    char* errs[3];
+    const char* expected[3] = {"c", "hello", NULL};
+    rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes,
+                      errs);
+    CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+  }
+
+  StartPhase("pin_get");
+  {
+    CheckPinGet(db, roptions, "box", "c");
+    CheckPinGet(db, roptions, "foo", "hello");
+    CheckPinGet(db, roptions, "notfound", NULL);
+  }
+
+  StartPhase("approximate_sizes");
+  {
+    int i;
+    int n = 20000;
+    char keybuf[100];
+    char valbuf[100];
+    uint64_t sizes[2];
+    const char* start[2] = {"a", "k00000000000000010000"};
+    size_t start_len[2] = {1, 21};
+    const char* limit[2] = {"k00000000000000010000", "z"};
+    size_t limit_len[2] = {21, 1};
+    rocksdb_writeoptions_set_sync(woptions, 0);
+    for (i = 0; i < n; i++) {
+      snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+      snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+      rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+                  &err);
+      CheckNoError(err);
+    }
+    rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes,
+                              &err);
+    CheckNoError(err);
+    CheckCondition(sizes[0] > 0);
+    CheckCondition(sizes[1] > 0);
+  }
+
+  StartPhase("property");
+  {
+    char* prop = rocksdb_property_value(db, "nosuchprop");
+    CheckCondition(prop == NULL);
+    prop = rocksdb_property_value(db, "rocksdb.stats");
+    CheckCondition(prop != NULL);
+    Free(&prop);
+  }
+
+  StartPhase("snapshot");
+  {
+    const rocksdb_snapshot_t* snap;
+    snap = rocksdb_create_snapshot(db);
+    rocksdb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_readoptions_set_snapshot(roptions, snap);
+    CheckGet(db, roptions, "foo", "hello");
+    rocksdb_readoptions_set_snapshot(roptions, NULL);
+    CheckGet(db, roptions, "foo", NULL);
+    rocksdb_release_snapshot(db, snap);
+  }
+  StartPhase("snapshot_with_memtable_inplace_update");
+  {
+    rocksdb_close(db);
+    const rocksdb_snapshot_t* snap = NULL;
+    const char* s_key = "foo_snap";
+    const char* value1 = "hello_s1";
+    const char* value2 = "hello_s2";
+    rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
+    rocksdb_options_set_inplace_update_support(options, 1);
+    rocksdb_options_set_error_if_exists(options, 0);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, s_key, 8, value1, 8, &err);
+    snap = rocksdb_create_snapshot(db);
+    assert(snap != NULL);
+    rocksdb_put(db, woptions, s_key, 8, value2, 8, &err);
+    CheckNoError(err);
+    rocksdb_readoptions_set_snapshot(roptions, snap);
+    CheckGet(db, roptions, "foo", NULL);
+    // snapshot syntax is invalid, because of inplace update supported is set
+    CheckGet(db, roptions, s_key, value2);
+    // restore the data and options
+    rocksdb_delete(db, woptions, s_key, 8, &err);
+    CheckGet(db, roptions, s_key, NULL);
+    rocksdb_release_snapshot(db, snap);
+    rocksdb_readoptions_set_snapshot(roptions, NULL);
+    rocksdb_options_set_inplace_update_support(options, 0);
+    rocksdb_options_set_allow_concurrent_memtable_write(options, 1);
+    rocksdb_options_set_error_if_exists(options, 1);
+  }
+  StartPhase("repair");
+  {
+    // If we do not compact here, then the lazy deletion of
+    // files (https://reviews.facebook.net/D6123) would leave
+    // around deleted files and the repair process will find
+    // those files and put them back into the database.
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+    rocksdb_close(db);
+    rocksdb_options_set_create_if_missing(options, 0);
+    rocksdb_options_set_error_if_exists(options, 0);
+    rocksdb_options_set_wal_recovery_mode(options, 2);
+    rocksdb_repair_db(options, dbname, &err);
+    CheckNoError(err);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", NULL);
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    rocksdb_options_set_create_if_missing(options, 1);
+    rocksdb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("filter");
+  for (run = 1; run <= 4; run++) {
+    // run=0 uses custom filter (not currently supported)
+    // run=1 uses old block-based bloom filter
+    // run=2 run uses full bloom filter
+    // run=3 uses Ribbon
+    // run=4 uses Ribbon-Bloom hybrid configuration
+    CheckNoError(err);
+    rocksdb_filterpolicy_t* policy;
+    if (run == 1) {
+      policy = rocksdb_filterpolicy_create_bloom(8.0);
+    } else if (run == 2) {
+      policy = rocksdb_filterpolicy_create_bloom_full(8.0);
+    } else if (run == 3) {
+      policy = rocksdb_filterpolicy_create_ribbon(8.0);
+    } else {
+      policy = rocksdb_filterpolicy_create_ribbon_hybrid(8.0, 1);
+    }
+    rocksdb_block_based_options_set_filter_policy(table_options, policy);
+
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_block_based_table_factory(options, table_options);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+
+    {
+      // Add enough keys to get just one reasonably populated Bloom filter
+      const int keys_to_add = 1500;
+      int i;
+      char keybuf[100];
+      for (i = 0; i < keys_to_add; i++) {
+        snprintf(keybuf, sizeof(keybuf), "yes%020d", i);
+        rocksdb_put(db, woptions, keybuf, strlen(keybuf), "val", 3, &err);
+        CheckNoError(err);
+      }
+    }
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+    CheckGet(db, roptions, "foo", "foovalue");
+    CheckGet(db, roptions, "bar", "barvalue");
+
+    {
+      // Query some keys not added to identify Bloom filter implementation
+      // from false positive queries, using perfcontext to detect Bloom
+      // filter behavior
+      rocksdb_perfcontext_t* perf = rocksdb_perfcontext_create();
+      rocksdb_perfcontext_reset(perf);
+
+      const int keys_to_query = 10000;
+      int i;
+      char keybuf[100];
+      for (i = 0; i < keys_to_query; i++) {
+        snprintf(keybuf, sizeof(keybuf), "no%020d", i);
+        CheckGet(db, roptions, keybuf, NULL);
+      }
+
+      const int hits =
+          (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_hit_count);
+      if (run == 0) {
+        // Due to half true, half false with fake filter result
+        CheckCondition(hits == keys_to_query / 2);
+      } else if (run == 1 || run == 2 || run == 4) {
+        // For run == 1, block-based Bloom is no longer available in public
+        // API; attempting to enable it enables full Bloom instead.
+        //
+        // Essentially a fingerprint of full Bloom schema, format_version=5
+        CheckCondition(hits == 188);
+      } else {
+        // Essentially a fingerprint of Ribbon schema
+        CheckCondition(hits == 226);
+      }
+      CheckCondition(
+          (keys_to_query - hits) ==
+          (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_miss_count));
+
+      rocksdb_perfcontext_destroy(perf);
+    }
+
+    // Reset the policy
+    rocksdb_block_based_options_set_filter_policy(table_options, NULL);
+    rocksdb_options_set_block_based_table_factory(options, table_options);
+  }
+
+  StartPhase("compaction_filter");
+  {
+    rocksdb_options_t* options_with_filter = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(options_with_filter, 1);
+    rocksdb_compactionfilter_t* cfilter;
+    cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy,
+                                              CFilterFilter, CFilterName);
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options_with_filter, dbname, &err);
+    rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
+    db = CheckCompaction(db, options_with_filter, roptions, woptions);
+
+    rocksdb_options_set_compaction_filter(options_with_filter, NULL);
+    rocksdb_compactionfilter_destroy(cfilter);
+    rocksdb_options_destroy(options_with_filter);
+  }
+
+  StartPhase("compaction_filter_factory");
+  {
+    rocksdb_options_t* options_with_filter_factory = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(options_with_filter_factory, 1);
+    rocksdb_compactionfilterfactory_t* factory;
+    factory = rocksdb_compactionfilterfactory_create(
+        NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options_with_filter_factory, dbname, &err);
+    rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+                                                  factory);
+    db = CheckCompaction(db, options_with_filter_factory, roptions, woptions);
+
+    rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+                                                  NULL);
+    rocksdb_options_destroy(options_with_filter_factory);
+  }
+
+  StartPhase("merge_operator");
+  {
+    rocksdb_mergeoperator_t* merge_operator;
+    merge_operator = rocksdb_mergeoperator_create(
+        NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+        MergeOperatorPartialMerge, NULL, MergeOperatorName);
+    // Create new database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_set_merge_operator(options, merge_operator);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "foovalue");
+    rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "fake");
+
+    // Merge of a non-existing value
+    rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "bar", "fake");
+  }
+
+  StartPhase("columnfamilies");
+  {
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_options_t* db_options = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(db_options, 1);
+    db = rocksdb_open(db_options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_close(db);
+    {
+      const char* expected_cf_names[1] = {"default"};
+      LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1,
+                                expected_cf_names, NULL);
+    }
+
+    rocksdb_options_set_create_if_missing(db_options, 0);
+    db = rocksdb_open(db_options, dbname, &err);
+    rocksdb_column_family_handle_t* cfh;
+    cfh = rocksdb_create_column_family(db, db_options, "cf1", &err);
+    rocksdb_column_family_handle_destroy(cfh);
+    CheckNoError(err);
+    rocksdb_close(db);
+
+    size_t cflen;
+    char** column_fams =
+        rocksdb_list_column_families(db_options, dbname, &cflen, &err);
+    CheckNoError(err);
+    CheckEqual("default", column_fams[0], 7);
+    CheckEqual("cf1", column_fams[1], 3);
+    CheckCondition(cflen == 2);
+    rocksdb_list_column_families_destroy(column_fams, cflen);
+
+    rocksdb_options_t* cf_options = rocksdb_options_create();
+
+    const char* cf_names[2] = {"default", "cf1"};
+    const rocksdb_options_t* cf_opts[2] = {cf_options, cf_options};
+    rocksdb_column_family_handle_t* handles[2];
+
+    LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 2, cf_names,
+                              NULL);
+
+    db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts,
+                                      handles, &err);
+    CheckNoError(err);
+
+    rocksdb_put_cf(db, woptions, handles[1], "foo", 3, "hello", 5, &err);
+    CheckNoError(err);
+
+    rocksdb_put_cf(db, woptions, handles[1], "foobar1", 7, "hello1", 6, &err);
+    CheckNoError(err);
+    rocksdb_put_cf(db, woptions, handles[1], "foobar2", 7, "hello2", 6, &err);
+    CheckNoError(err);
+    rocksdb_put_cf(db, woptions, handles[1], "foobar3", 7, "hello3", 6, &err);
+    CheckNoError(err);
+    rocksdb_put_cf(db, woptions, handles[1], "foobar4", 7, "hello4", 6, &err);
+    CheckNoError(err);
+    rocksdb_suggest_compact_range_cf(db, handles[1], "foo", 3, "foobar9", 7,
+                                     &err);
+    CheckNoError(err);
+
+    rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
+    rocksdb_flushoptions_set_wait(flush_options, 1);
+    rocksdb_flush_cf(db, flush_options, handles[1], &err);
+    CheckNoError(err) rocksdb_flushoptions_destroy(flush_options);
+
+    CheckGetCF(db, roptions, handles[1], "foo", "hello");
+    CheckPinGetCF(db, roptions, handles[1], "foo", "hello");
+
+    rocksdb_delete_cf(db, woptions, handles[1], "foo", 3, &err);
+    CheckNoError(err);
+
+    rocksdb_delete_range_cf(db, woptions, handles[1], "foobar2", 7, "foobar4",
+                            7, &err);
+    CheckNoError(err);
+
+    CheckGetCF(db, roptions, handles[1], "foo", NULL);
+    CheckPinGetCF(db, roptions, handles[1], "foo", NULL);
+
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    rocksdb_writebatch_put_cf(wb, handles[1], "baz", 3, "a", 1);
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put_cf(wb, handles[1], "bar", 3, "b", 1);
+    rocksdb_writebatch_put_cf(wb, handles[1], "box", 3, "c", 1);
+    rocksdb_writebatch_put_cf(wb, handles[1], "buff", 4, "rocksdb", 7);
+    rocksdb_writebatch_delete_cf(wb, handles[1], "bar", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGetCF(db, roptions, handles[1], "baz", NULL);
+    CheckGetCF(db, roptions, handles[1], "bar", NULL);
+    CheckGetCF(db, roptions, handles[1], "box", "c");
+    CheckGetCF(db, roptions, handles[1], "buff", "rocksdb");
+    CheckPinGetCF(db, roptions, handles[1], "baz", NULL);
+    CheckPinGetCF(db, roptions, handles[1], "bar", NULL);
+    CheckPinGetCF(db, roptions, handles[1], "box", "c");
+    CheckPinGetCF(db, roptions, handles[1], "buff", "rocksdb");
+    rocksdb_writebatch_destroy(wb);
+
+    rocksdb_flush_wal(db, 1, &err);
+    CheckNoError(err);
+
+    const char* keys[3] = {"box", "box", "barfooxx"};
+    const rocksdb_column_family_handle_t* get_handles[3] = {
+        handles[0], handles[1], handles[1]};
+    const size_t keys_sizes[3] = {3, 3, 8};
+    char* vals[3];
+    size_t vals_sizes[3];
+    char* errs[3];
+    rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals,
+                         vals_sizes, errs);
+
+    int i;
+    for (i = 0; i < 3; i++) {
+      CheckEqual(NULL, errs[i], 0);
+      switch (i) {
+        case 0:
+          CheckEqual(NULL, vals[i], vals_sizes[i]);  // wrong cf
+          break;
+        case 1:
+          CheckEqual("c", vals[i], vals_sizes[i]);  // bingo
+          break;
+        case 2:
+          CheckEqual(NULL, vals[i], vals_sizes[i]);  // normal not found
+          break;
+      }
+      Free(&vals[i]);
+    }
+
+    {
+      const char* batched_keys[4] = {"box", "buff", "barfooxx", "box"};
+      const size_t batched_keys_sizes[4] = {3, 4, 8, 3};
+      const char* expected_value[4] = {"c", "rocksdb", NULL, "c"};
+      char* batched_errs[4];
+
+      rocksdb_pinnableslice_t* pvals[4];
+      rocksdb_batched_multi_get_cf(db, roptions, handles[1], 4, batched_keys,
+                                   batched_keys_sizes, pvals, batched_errs,
+                                   false);
+      const char* val;
+      size_t val_len;
+      for (i = 0; i < 4; ++i) {
+        val = rocksdb_pinnableslice_value(pvals[i], &val_len);
+        CheckNoError(batched_errs[i]);
+        CheckEqual(expected_value[i], val, val_len);
+        rocksdb_pinnableslice_destroy(pvals[i]);
+      }
+    }
+
+    {
+      unsigned char value_found = 0;
+
+      CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11,
+                                            NULL, NULL, NULL, 0, NULL));
+      CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11,
+                                            &vals[0], &vals_sizes[0], NULL, 0,
+                                            &value_found));
+      if (value_found) {
+        Free(&vals[0]);
+      }
+
+      CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1],
+                                               "invalid_key", 11, NULL, NULL,
+                                               NULL, 0, NULL));
+      CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1],
+                                               "invalid_key", 11, &vals[0],
+                                               &vals_sizes[0], NULL, 0, NULL));
+      if (value_found) {
+        Free(&vals[0]);
+      }
+    }
+
+    rocksdb_iterator_t* iter =
+        rocksdb_create_iterator_cf(db, roptions, handles[1]);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+
+    for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+      i++;
+    }
+    CheckCondition(i == 4);
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+
+    rocksdb_column_family_handle_t* iters_cf_handles[2] = {handles[0],
+                                                           handles[1]};
+    rocksdb_iterator_t* iters_handles[2];
+    rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2,
+                             &err);
+    CheckNoError(err);
+
+    iter = iters_handles[0];
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_destroy(iter);
+
+    iter = iters_handles[1];
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+
+    for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+      i++;
+    }
+    CheckCondition(i == 4);
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+
+    GetAndCheckMetaDataCf(db, handles[1], cf_names[1]);
+
+    rocksdb_drop_column_family(db, handles[1], &err);
+    CheckNoError(err);
+    for (i = 0; i < 2; i++) {
+      rocksdb_column_family_handle_destroy(handles[i]);
+    }
+    rocksdb_close(db);
+    {
+      // As column family has been dropped, we expect only one column family.
+      const char* expected_cf_names[1] = {"default"};
+      LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1,
+                                expected_cf_names, NULL);
+    }
+    rocksdb_destroy_db(options, dbname, &err);
+    rocksdb_options_destroy(db_options);
+    rocksdb_options_destroy(cf_options);
+  }
+
+  StartPhase("prefix");
+  {
+    // Create new database
+    rocksdb_options_set_allow_mmap_reads(options, 1);
+    rocksdb_options_set_prefix_extractor(
+        options, rocksdb_slicetransform_create_fixed_prefix(3));
+    rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+    rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
+    rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
+    CheckNoError(err);
+
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+
+    rocksdb_iter_seek(iter, "bar", 3);
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    CheckCondition(rocksdb_iter_valid(iter));
+
+    CheckIter(iter, "bar1", "bar");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "bar2", "bar");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "bar3", "bar");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+
+    rocksdb_readoptions_set_total_order_seek(roptions, 1);
+    iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+
+    rocksdb_iter_seek(iter, "ba", 2);
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "bar1", "bar");
+
+    rocksdb_iter_destroy(iter);
+    rocksdb_readoptions_set_total_order_seek(roptions, 0);
+
+    rocksdb_close(db);
+
+    {
+      const char* expected_cf_names[1] = {"default"};
+      LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1,
+                                expected_cf_names,
+                                "Invalid argument: leveldb.BytewiseComparator: "
+                                "does not match existing comparator foo");
+      LoadAndCheckLatestOptions(dbname, env, false, cache, cmp, 1,
+                                expected_cf_names, NULL);
+    }
+    rocksdb_destroy_db(options, dbname, &err);
+  }
+
+  // Check memory usage stats
+  StartPhase("approximate_memory_usage");
+  {
+    // Create database
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_memory_consumers_t* consumers;
+    consumers = rocksdb_memory_consumers_create();
+    rocksdb_memory_consumers_add_db(consumers, db);
+    rocksdb_memory_consumers_add_cache(consumers, cache);
+
+    // take memory usage report before write-read operation
+    rocksdb_memory_usage_t* mu1;
+    mu1 = rocksdb_approximate_memory_usage_create(consumers, &err);
+    CheckNoError(err);
+
+    // Put data (this should affect memtables)
+    rocksdb_put(db, woptions, "memory", 6, "test", 4, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "memory", "test");
+
+    // take memory usage report after write-read operation
+    rocksdb_memory_usage_t* mu2;
+    mu2 = rocksdb_approximate_memory_usage_create(consumers, &err);
+    CheckNoError(err);
+
+    // amount of memory used within memtables should grow
+    CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >=
+                   rocksdb_approximate_memory_usage_get_mem_table_total(mu1));
+    CheckCondition(
+        rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >=
+        rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1));
+
+    rocksdb_memory_consumers_destroy(consumers);
+    rocksdb_approximate_memory_usage_destroy(mu1);
+    rocksdb_approximate_memory_usage_destroy(mu2);
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+  }
+
+  StartPhase("cuckoo_options");
+  {
+    rocksdb_cuckoo_table_options_t* cuckoo_options;
+    cuckoo_options = rocksdb_cuckoo_options_create();
+    rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5);
+    rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200);
+    rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10);
+    rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1);
+    rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0);
+    rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_cuckoo_options_destroy(cuckoo_options);
+  }
+
+  StartPhase("options");
+  {
+    rocksdb_options_t* o;
+    o = rocksdb_options_create();
+
+    // Set and check options.
+    rocksdb_options_set_allow_ingest_behind(o, 1);
+    CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o));
+
+    rocksdb_options_compaction_readahead_size(o, 10);
+    CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o));
+
+    rocksdb_options_set_create_if_missing(o, 1);
+    CheckCondition(1 == rocksdb_options_get_create_if_missing(o));
+
+    rocksdb_options_set_create_missing_column_families(o, 1);
+    CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o));
+
+    rocksdb_options_set_error_if_exists(o, 1);
+    CheckCondition(1 == rocksdb_options_get_error_if_exists(o));
+
+    rocksdb_options_set_paranoid_checks(o, 1);
+    CheckCondition(1 == rocksdb_options_get_paranoid_checks(o));
+
+    rocksdb_options_set_info_log_level(o, 3);
+    CheckCondition(3 == rocksdb_options_get_info_log_level(o));
+
+    rocksdb_options_set_write_buffer_size(o, 100);
+    CheckCondition(100 == rocksdb_options_get_write_buffer_size(o));
+
+    rocksdb_options_set_db_write_buffer_size(o, 1000);
+    CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o));
+
+    rocksdb_options_set_max_open_files(o, 21);
+    CheckCondition(21 == rocksdb_options_get_max_open_files(o));
+
+    rocksdb_options_set_max_file_opening_threads(o, 5);
+    CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o));
+
+    rocksdb_options_set_max_total_wal_size(o, 400);
+    CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o));
+
+    rocksdb_options_set_num_levels(o, 7);
+    CheckCondition(7 == rocksdb_options_get_num_levels(o));
+
+    rocksdb_options_set_level0_file_num_compaction_trigger(o, 4);
+    CheckCondition(4 ==
+                   rocksdb_options_get_level0_file_num_compaction_trigger(o));
+
+    rocksdb_options_set_level0_slowdown_writes_trigger(o, 6);
+    CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o));
+
+    rocksdb_options_set_level0_stop_writes_trigger(o, 8);
+    CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o));
+
+    rocksdb_options_set_target_file_size_base(o, 256);
+    CheckCondition(256 == rocksdb_options_get_target_file_size_base(o));
+
+    rocksdb_options_set_target_file_size_multiplier(o, 3);
+    CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o));
+
+    rocksdb_options_set_max_bytes_for_level_base(o, 1024);
+    CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o));
+
+    rocksdb_options_set_level_compaction_dynamic_level_bytes(o, 1);
+    CheckCondition(1 ==
+                   rocksdb_options_get_level_compaction_dynamic_level_bytes(o));
+
+    rocksdb_options_set_max_bytes_for_level_multiplier(o, 2.0);
+    CheckCondition(2.0 ==
+                   rocksdb_options_get_max_bytes_for_level_multiplier(o));
+
+    rocksdb_options_set_skip_stats_update_on_db_open(o, 1);
+    CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
+
+    rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(o, 1);
+    CheckCondition(
+        1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
+
+    rocksdb_options_set_max_write_buffer_number(o, 97);
+    CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
+
+    rocksdb_options_set_min_write_buffer_number_to_merge(o, 23);
+    CheckCondition(23 ==
+                   rocksdb_options_get_min_write_buffer_number_to_merge(o));
+
+    rocksdb_options_set_max_write_buffer_number_to_maintain(o, 64);
+    CheckCondition(64 ==
+                   rocksdb_options_get_max_write_buffer_number_to_maintain(o));
+
+    rocksdb_options_set_max_write_buffer_size_to_maintain(o, 50000);
+    CheckCondition(50000 ==
+                   rocksdb_options_get_max_write_buffer_size_to_maintain(o));
+
+    rocksdb_options_set_enable_pipelined_write(o, 1);
+    CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o));
+
+    rocksdb_options_set_unordered_write(o, 1);
+    CheckCondition(1 == rocksdb_options_get_unordered_write(o));
+
+    rocksdb_options_set_max_subcompactions(o, 123456);
+    CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o));
+
+    rocksdb_options_set_max_background_jobs(o, 2);
+    CheckCondition(2 == rocksdb_options_get_max_background_jobs(o));
+
+    rocksdb_options_set_max_background_compactions(o, 3);
+    CheckCondition(3 == rocksdb_options_get_max_background_compactions(o));
+
+    rocksdb_options_set_max_background_flushes(o, 5);
+    CheckCondition(5 == rocksdb_options_get_max_background_flushes(o));
+
+    rocksdb_options_set_max_log_file_size(o, 6);
+    CheckCondition(6 == rocksdb_options_get_max_log_file_size(o));
+
+    rocksdb_options_set_log_file_time_to_roll(o, 7);
+    CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o));
+
+    rocksdb_options_set_keep_log_file_num(o, 8);
+    CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o));
+
+    rocksdb_options_set_recycle_log_file_num(o, 9);
+    CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o));
+
+    rocksdb_options_set_soft_pending_compaction_bytes_limit(o, 10);
+    CheckCondition(10 ==
+                   rocksdb_options_get_soft_pending_compaction_bytes_limit(o));
+
+    rocksdb_options_set_hard_pending_compaction_bytes_limit(o, 11);
+    CheckCondition(11 ==
+                   rocksdb_options_get_hard_pending_compaction_bytes_limit(o));
+
+    rocksdb_options_set_max_manifest_file_size(o, 12);
+    CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o));
+
+    rocksdb_options_set_table_cache_numshardbits(o, 13);
+    CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o));
+
+    rocksdb_options_set_arena_block_size(o, 14);
+    CheckCondition(14 == rocksdb_options_get_arena_block_size(o));
+
+    rocksdb_options_set_use_fsync(o, 1);
+    CheckCondition(1 == rocksdb_options_get_use_fsync(o));
+
+    rocksdb_options_set_WAL_ttl_seconds(o, 15);
+    CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o));
+
+    rocksdb_options_set_WAL_size_limit_MB(o, 16);
+    CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o));
+
+    rocksdb_options_set_manifest_preallocation_size(o, 17);
+    CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o));
+
+    rocksdb_options_set_allow_mmap_reads(o, 1);
+    CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o));
+
+    rocksdb_options_set_allow_mmap_writes(o, 1);
+    CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o));
+
+    rocksdb_options_set_use_direct_reads(o, 1);
+    CheckCondition(1 == rocksdb_options_get_use_direct_reads(o));
+
+    rocksdb_options_set_use_direct_io_for_flush_and_compaction(o, 1);
+    CheckCondition(
+        1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o));
+
+    rocksdb_options_set_is_fd_close_on_exec(o, 1);
+    CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o));
+
+    rocksdb_options_set_stats_dump_period_sec(o, 18);
+    CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o));
+
+    rocksdb_options_set_stats_persist_period_sec(o, 5);
+    CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o));
+
+    rocksdb_options_set_advise_random_on_open(o, 1);
+    CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o));
+
+    rocksdb_options_set_access_hint_on_compaction_start(o, 3);
+    CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o));
+
+    rocksdb_options_set_use_adaptive_mutex(o, 1);
+    CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o));
+
+    rocksdb_options_set_bytes_per_sync(o, 19);
+    CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o));
+
+    rocksdb_options_set_wal_bytes_per_sync(o, 20);
+    CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o));
+
+    rocksdb_options_set_writable_file_max_buffer_size(o, 21);
+    CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o));
+
+    rocksdb_options_set_allow_concurrent_memtable_write(o, 1);
+    CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o));
+
+    rocksdb_options_set_enable_write_thread_adaptive_yield(o, 1);
+    CheckCondition(1 ==
+                   rocksdb_options_get_enable_write_thread_adaptive_yield(o));
+
+    rocksdb_options_set_max_sequential_skip_in_iterations(o, 22);
+    CheckCondition(22 ==
+                   rocksdb_options_get_max_sequential_skip_in_iterations(o));
+
+    rocksdb_options_set_disable_auto_compactions(o, 1);
+    CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o));
+
+    rocksdb_options_set_optimize_filters_for_hits(o, 1);
+    CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o));
+
+    rocksdb_options_set_delete_obsolete_files_period_micros(o, 23);
+    CheckCondition(23 ==
+                   rocksdb_options_get_delete_obsolete_files_period_micros(o));
+
+    rocksdb_options_set_memtable_prefix_bloom_size_ratio(o, 2.0);
+    CheckCondition(2.0 ==
+                   rocksdb_options_get_memtable_prefix_bloom_size_ratio(o));
+
+    rocksdb_options_set_max_compaction_bytes(o, 24);
+    CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o));
+
+    rocksdb_options_set_memtable_huge_page_size(o, 25);
+    CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o));
+
+    rocksdb_options_set_max_successive_merges(o, 26);
+    CheckCondition(26 == rocksdb_options_get_max_successive_merges(o));
+
+    rocksdb_options_set_bloom_locality(o, 27);
+    CheckCondition(27 == rocksdb_options_get_bloom_locality(o));
+
+    rocksdb_options_set_inplace_update_support(o, 1);
+    CheckCondition(1 == rocksdb_options_get_inplace_update_support(o));
+
+    rocksdb_options_set_inplace_update_num_locks(o, 28);
+    CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o));
+
+    rocksdb_options_set_report_bg_io_stats(o, 1);
+    CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o));
+
+    rocksdb_options_set_wal_recovery_mode(o, 2);
+    CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o));
+
+    rocksdb_options_set_compression(o, 5);
+    CheckCondition(5 == rocksdb_options_get_compression(o));
+
+    rocksdb_options_set_bottommost_compression(o, 4);
+    CheckCondition(4 == rocksdb_options_get_bottommost_compression(o));
+
+    rocksdb_options_set_compaction_style(o, 2);
+    CheckCondition(2 == rocksdb_options_get_compaction_style(o));
+
+    rocksdb_options_set_atomic_flush(o, 1);
+    CheckCondition(1 == rocksdb_options_get_atomic_flush(o));
+
+    rocksdb_options_set_manual_wal_flush(o, 1);
+    CheckCondition(1 == rocksdb_options_get_manual_wal_flush(o));
+
+    rocksdb_options_set_wal_compression(o, 1);
+    CheckCondition(1 == rocksdb_options_get_wal_compression(o));
+
+    rocksdb_options_set_experimental_mempurge_threshold(o, 29.0);
+    CheckCondition(29.0 ==
+                   rocksdb_options_get_experimental_mempurge_threshold(o));
+
+    /* Blob Options */
+    rocksdb_options_set_enable_blob_files(o, 1);
+    CheckCondition(1 == rocksdb_options_get_enable_blob_files(o));
+
+    rocksdb_options_set_min_blob_size(o, 29);
+    CheckCondition(29 == rocksdb_options_get_min_blob_size(o));
+
+    rocksdb_options_set_blob_file_size(o, 30);
+    CheckCondition(30 == rocksdb_options_get_blob_file_size(o));
+
+    rocksdb_options_set_blob_compression_type(o, 4);
+    CheckCondition(4 == rocksdb_options_get_blob_compression_type(o));
+
+    rocksdb_options_set_enable_blob_gc(o, 1);
+    CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o));
+
+    rocksdb_options_set_blob_gc_age_cutoff(o, 0.5);
+    CheckCondition(0.5 == rocksdb_options_get_blob_gc_age_cutoff(o));
+
+    rocksdb_options_set_blob_gc_force_threshold(o, 0.75);
+    CheckCondition(0.75 == rocksdb_options_get_blob_gc_force_threshold(o));
+
+    rocksdb_options_set_blob_compaction_readahead_size(o, 262144);
+    CheckCondition(262144 ==
+                   rocksdb_options_get_blob_compaction_readahead_size(o));
+
+    rocksdb_options_set_blob_file_starting_level(o, 5);
+    CheckCondition(5 == rocksdb_options_get_blob_file_starting_level(o));
+
+    rocksdb_options_set_prepopulate_blob_cache(o, 1 /* flush only */);
+    CheckCondition(1 == rocksdb_options_get_prepopulate_blob_cache(o));
+
+    // Create a copy that should be equal to the original.
+    rocksdb_options_t* copy;
+    copy = rocksdb_options_create_copy(o);
+
+    CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(copy));
+    CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(copy));
+    CheckCondition(1 == rocksdb_options_get_create_if_missing(copy));
+    CheckCondition(1 ==
+                   rocksdb_options_get_create_missing_column_families(copy));
+    CheckCondition(1 == rocksdb_options_get_error_if_exists(copy));
+    CheckCondition(1 == rocksdb_options_get_paranoid_checks(copy));
+    CheckCondition(3 == rocksdb_options_get_info_log_level(copy));
+    CheckCondition(100 == rocksdb_options_get_write_buffer_size(copy));
+    CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(copy));
+    CheckCondition(21 == rocksdb_options_get_max_open_files(copy));
+    CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(copy));
+    CheckCondition(400 == rocksdb_options_get_max_total_wal_size(copy));
+    CheckCondition(7 == rocksdb_options_get_num_levels(copy));
+    CheckCondition(
+        4 == rocksdb_options_get_level0_file_num_compaction_trigger(copy));
+    CheckCondition(6 ==
+                   rocksdb_options_get_level0_slowdown_writes_trigger(copy));
+    CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(copy));
+    CheckCondition(256 == rocksdb_options_get_target_file_size_base(copy));
+    CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(copy));
+    CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(copy));
+    CheckCondition(
+        1 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy));
+    CheckCondition(2.0 ==
+                   rocksdb_options_get_max_bytes_for_level_multiplier(copy));
+    CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
+    CheckCondition(
+        1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
+    CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy));
+    CheckCondition(23 ==
+                   rocksdb_options_get_min_write_buffer_number_to_merge(copy));
+    CheckCondition(
+        64 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
+    CheckCondition(50000 ==
+                   rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
+    CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(copy));
+    CheckCondition(1 == rocksdb_options_get_unordered_write(copy));
+    CheckCondition(123456 == rocksdb_options_get_max_subcompactions(copy));
+    CheckCondition(2 == rocksdb_options_get_max_background_jobs(copy));
+    CheckCondition(3 == rocksdb_options_get_max_background_compactions(copy));
+    CheckCondition(5 == rocksdb_options_get_max_background_flushes(copy));
+    CheckCondition(6 == rocksdb_options_get_max_log_file_size(copy));
+    CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(copy));
+    CheckCondition(8 == rocksdb_options_get_keep_log_file_num(copy));
+    CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(copy));
+    CheckCondition(
+        10 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy));
+    CheckCondition(
+        11 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy));
+    CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(copy));
+    CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(copy));
+    CheckCondition(14 == rocksdb_options_get_arena_block_size(copy));
+    CheckCondition(1 == rocksdb_options_get_use_fsync(copy));
+    CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(copy));
+    CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(copy));
+    CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(copy));
+    CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(copy));
+    CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(copy));
+    CheckCondition(1 == rocksdb_options_get_use_direct_reads(copy));
+    CheckCondition(
+        1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy));
+    CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(copy));
+    CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(copy));
+    CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(copy));
+    CheckCondition(1 == rocksdb_options_get_advise_random_on_open(copy));
+    CheckCondition(3 ==
+                   rocksdb_options_get_access_hint_on_compaction_start(copy));
+    CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(copy));
+    CheckCondition(19 == rocksdb_options_get_bytes_per_sync(copy));
+    CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(copy));
+    CheckCondition(21 ==
+                   rocksdb_options_get_writable_file_max_buffer_size(copy));
+    CheckCondition(1 ==
+                   rocksdb_options_get_allow_concurrent_memtable_write(copy));
+    CheckCondition(
+        1 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy));
+    CheckCondition(22 ==
+                   rocksdb_options_get_max_sequential_skip_in_iterations(copy));
+    CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(copy));
+    CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(copy));
+    CheckCondition(
+        23 == rocksdb_options_get_delete_obsolete_files_period_micros(copy));
+    CheckCondition(2.0 ==
+                   rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy));
+    CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(copy));
+    CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(copy));
+    CheckCondition(26 == rocksdb_options_get_max_successive_merges(copy));
+    CheckCondition(27 == rocksdb_options_get_bloom_locality(copy));
+    CheckCondition(1 == rocksdb_options_get_inplace_update_support(copy));
+    CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(copy));
+    CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(copy));
+    CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(copy));
+    CheckCondition(5 == rocksdb_options_get_compression(copy));
+    CheckCondition(4 == rocksdb_options_get_bottommost_compression(copy));
+    CheckCondition(2 == rocksdb_options_get_compaction_style(copy));
+    CheckCondition(1 == rocksdb_options_get_atomic_flush(copy));
+    CheckCondition(29.0 ==
+                   rocksdb_options_get_experimental_mempurge_threshold(copy));
+
+    // Copies should be independent.
+    rocksdb_options_set_allow_ingest_behind(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_allow_ingest_behind(copy));
+    CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o));
+
+    rocksdb_options_compaction_readahead_size(copy, 20);
+    CheckCondition(20 == rocksdb_options_get_compaction_readahead_size(copy));
+    CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o));
+
+    rocksdb_options_set_create_if_missing(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_create_if_missing(copy));
+    CheckCondition(1 == rocksdb_options_get_create_if_missing(o));
+
+    rocksdb_options_set_create_missing_column_families(copy, 0);
+    CheckCondition(0 ==
+                   rocksdb_options_get_create_missing_column_families(copy));
+    CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o));
+
+    rocksdb_options_set_error_if_exists(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_error_if_exists(copy));
+    CheckCondition(1 == rocksdb_options_get_error_if_exists(o));
+
+    rocksdb_options_set_paranoid_checks(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_paranoid_checks(copy));
+    CheckCondition(1 == rocksdb_options_get_paranoid_checks(o));
+
+    rocksdb_options_set_info_log_level(copy, 2);
+    CheckCondition(2 == rocksdb_options_get_info_log_level(copy));
+    CheckCondition(3 == rocksdb_options_get_info_log_level(o));
+
+    rocksdb_options_set_write_buffer_size(copy, 200);
+    CheckCondition(200 == rocksdb_options_get_write_buffer_size(copy));
+    CheckCondition(100 == rocksdb_options_get_write_buffer_size(o));
+
+    rocksdb_options_set_db_write_buffer_size(copy, 2000);
+    CheckCondition(2000 == rocksdb_options_get_db_write_buffer_size(copy));
+    CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o));
+
+    rocksdb_options_set_max_open_files(copy, 42);
+    CheckCondition(42 == rocksdb_options_get_max_open_files(copy));
+    CheckCondition(21 == rocksdb_options_get_max_open_files(o));
+
+    rocksdb_options_set_max_file_opening_threads(copy, 3);
+    CheckCondition(3 == rocksdb_options_get_max_file_opening_threads(copy));
+    CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o));
+
+    rocksdb_options_set_max_total_wal_size(copy, 4000);
+    CheckCondition(4000 == rocksdb_options_get_max_total_wal_size(copy));
+    CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o));
+
+    rocksdb_options_set_num_levels(copy, 6);
+    CheckCondition(6 == rocksdb_options_get_num_levels(copy));
+    CheckCondition(7 == rocksdb_options_get_num_levels(o));
+
+    rocksdb_options_set_level0_file_num_compaction_trigger(copy, 14);
+    CheckCondition(
+        14 == rocksdb_options_get_level0_file_num_compaction_trigger(copy));
+    CheckCondition(4 ==
+                   rocksdb_options_get_level0_file_num_compaction_trigger(o));
+
+    rocksdb_options_set_level0_slowdown_writes_trigger(copy, 61);
+    CheckCondition(61 ==
+                   rocksdb_options_get_level0_slowdown_writes_trigger(copy));
+    CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o));
+
+    rocksdb_options_set_level0_stop_writes_trigger(copy, 17);
+    CheckCondition(17 == rocksdb_options_get_level0_stop_writes_trigger(copy));
+    CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o));
+
+    rocksdb_options_set_target_file_size_base(copy, 128);
+    CheckCondition(128 == rocksdb_options_get_target_file_size_base(copy));
+    CheckCondition(256 == rocksdb_options_get_target_file_size_base(o));
+
+    rocksdb_options_set_target_file_size_multiplier(copy, 13);
+    CheckCondition(13 == rocksdb_options_get_target_file_size_multiplier(copy));
+    CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o));
+
+    rocksdb_options_set_max_bytes_for_level_base(copy, 900);
+    CheckCondition(900 == rocksdb_options_get_max_bytes_for_level_base(copy));
+    CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o));
+
+    rocksdb_options_set_level_compaction_dynamic_level_bytes(copy, 0);
+    CheckCondition(
+        0 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy));
+    CheckCondition(1 ==
+                   rocksdb_options_get_level_compaction_dynamic_level_bytes(o));
+
+    rocksdb_options_set_max_bytes_for_level_multiplier(copy, 8.0);
+    CheckCondition(8.0 ==
+                   rocksdb_options_get_max_bytes_for_level_multiplier(copy));
+    CheckCondition(2.0 ==
+                   rocksdb_options_get_max_bytes_for_level_multiplier(o));
+
+    rocksdb_options_set_skip_stats_update_on_db_open(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
+    CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
+
+    rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(copy, 0);
+    CheckCondition(
+        0 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
+    CheckCondition(
+        1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
+
+    rocksdb_options_set_max_write_buffer_number(copy, 2000);
+    CheckCondition(2000 == rocksdb_options_get_max_write_buffer_number(copy));
+    CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
+
+    rocksdb_options_set_min_write_buffer_number_to_merge(copy, 146);
+    CheckCondition(146 ==
+                   rocksdb_options_get_min_write_buffer_number_to_merge(copy));
+    CheckCondition(23 ==
+                   rocksdb_options_get_min_write_buffer_number_to_merge(o));
+
+    rocksdb_options_set_max_write_buffer_number_to_maintain(copy, 128);
+    CheckCondition(
+        128 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
+    CheckCondition(64 ==
+                   rocksdb_options_get_max_write_buffer_number_to_maintain(o));
+
+    rocksdb_options_set_max_write_buffer_size_to_maintain(copy, 9000);
+    CheckCondition(9000 ==
+                   rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
+    CheckCondition(50000 ==
+                   rocksdb_options_get_max_write_buffer_size_to_maintain(o));
+
+    rocksdb_options_set_enable_pipelined_write(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_enable_pipelined_write(copy));
+    CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o));
+
+    rocksdb_options_set_unordered_write(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_unordered_write(copy));
+    CheckCondition(1 == rocksdb_options_get_unordered_write(o));
+
+    rocksdb_options_set_max_subcompactions(copy, 90001);
+    CheckCondition(90001 == rocksdb_options_get_max_subcompactions(copy));
+    CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o));
+
+    rocksdb_options_set_max_background_jobs(copy, 12);
+    CheckCondition(12 == rocksdb_options_get_max_background_jobs(copy));
+    CheckCondition(2 == rocksdb_options_get_max_background_jobs(o));
+
+    rocksdb_options_set_max_background_compactions(copy, 13);
+    CheckCondition(13 == rocksdb_options_get_max_background_compactions(copy));
+    CheckCondition(3 == rocksdb_options_get_max_background_compactions(o));
+
+    rocksdb_options_set_max_background_flushes(copy, 15);
+    CheckCondition(15 == rocksdb_options_get_max_background_flushes(copy));
+    CheckCondition(5 == rocksdb_options_get_max_background_flushes(o));
+
+    rocksdb_options_set_max_log_file_size(copy, 16);
+    CheckCondition(16 == rocksdb_options_get_max_log_file_size(copy));
+    CheckCondition(6 == rocksdb_options_get_max_log_file_size(o));
+
+    rocksdb_options_set_log_file_time_to_roll(copy, 17);
+    CheckCondition(17 == rocksdb_options_get_log_file_time_to_roll(copy));
+    CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o));
+
+    rocksdb_options_set_keep_log_file_num(copy, 18);
+    CheckCondition(18 == rocksdb_options_get_keep_log_file_num(copy));
+    CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o));
+
+    rocksdb_options_set_recycle_log_file_num(copy, 19);
+    CheckCondition(19 == rocksdb_options_get_recycle_log_file_num(copy));
+    CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o));
+
+    rocksdb_options_set_soft_pending_compaction_bytes_limit(copy, 110);
+    CheckCondition(
+        110 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy));
+    CheckCondition(10 ==
+                   rocksdb_options_get_soft_pending_compaction_bytes_limit(o));
+
+    rocksdb_options_set_hard_pending_compaction_bytes_limit(copy, 111);
+    CheckCondition(
+        111 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy));
+    CheckCondition(11 ==
+                   rocksdb_options_get_hard_pending_compaction_bytes_limit(o));
+
+    rocksdb_options_set_max_manifest_file_size(copy, 112);
+    CheckCondition(112 == rocksdb_options_get_max_manifest_file_size(copy));
+    CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o));
+
+    rocksdb_options_set_table_cache_numshardbits(copy, 113);
+    CheckCondition(113 == rocksdb_options_get_table_cache_numshardbits(copy));
+    CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o));
+
+    rocksdb_options_set_arena_block_size(copy, 114);
+    CheckCondition(114 == rocksdb_options_get_arena_block_size(copy));
+    CheckCondition(14 == rocksdb_options_get_arena_block_size(o));
+
+    rocksdb_options_set_use_fsync(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_use_fsync(copy));
+    CheckCondition(1 == rocksdb_options_get_use_fsync(o));
+
+    rocksdb_options_set_WAL_ttl_seconds(copy, 115);
+    CheckCondition(115 == rocksdb_options_get_WAL_ttl_seconds(copy));
+    CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o));
+
+    rocksdb_options_set_WAL_size_limit_MB(copy, 116);
+    CheckCondition(116 == rocksdb_options_get_WAL_size_limit_MB(copy));
+    CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o));
+
+    rocksdb_options_set_manifest_preallocation_size(copy, 117);
+    CheckCondition(117 ==
+                   rocksdb_options_get_manifest_preallocation_size(copy));
+    CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o));
+
+    rocksdb_options_set_allow_mmap_reads(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_allow_mmap_reads(copy));
+    CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o));
+
+    rocksdb_options_set_allow_mmap_writes(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_allow_mmap_writes(copy));
+    CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o));
+
+    rocksdb_options_set_use_direct_reads(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_use_direct_reads(copy));
+    CheckCondition(1 == rocksdb_options_get_use_direct_reads(o));
+
+    rocksdb_options_set_use_direct_io_for_flush_and_compaction(copy, 0);
+    CheckCondition(
+        0 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy));
+    CheckCondition(
+        1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o));
+
+    rocksdb_options_set_is_fd_close_on_exec(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_is_fd_close_on_exec(copy));
+    CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o));
+
+    rocksdb_options_set_stats_dump_period_sec(copy, 218);
+    CheckCondition(218 == rocksdb_options_get_stats_dump_period_sec(copy));
+    CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o));
+
+    rocksdb_options_set_stats_persist_period_sec(copy, 600);
+    CheckCondition(600 == rocksdb_options_get_stats_persist_period_sec(copy));
+    CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o));
+
+    rocksdb_options_set_advise_random_on_open(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_advise_random_on_open(copy));
+    CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o));
+
+    rocksdb_options_set_access_hint_on_compaction_start(copy, 2);
+    CheckCondition(2 ==
+                   rocksdb_options_get_access_hint_on_compaction_start(copy));
+    CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o));
+
+    rocksdb_options_set_use_adaptive_mutex(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_use_adaptive_mutex(copy));
+    CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o));
+
+    rocksdb_options_set_bytes_per_sync(copy, 219);
+    CheckCondition(219 == rocksdb_options_get_bytes_per_sync(copy));
+    CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o));
+
+    rocksdb_options_set_wal_bytes_per_sync(copy, 120);
+    CheckCondition(120 == rocksdb_options_get_wal_bytes_per_sync(copy));
+    CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o));
+
+    rocksdb_options_set_writable_file_max_buffer_size(copy, 121);
+    CheckCondition(121 ==
+                   rocksdb_options_get_writable_file_max_buffer_size(copy));
+    CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o));
+
+    rocksdb_options_set_allow_concurrent_memtable_write(copy, 0);
+    CheckCondition(0 ==
+                   rocksdb_options_get_allow_concurrent_memtable_write(copy));
+    CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o));
+
+    rocksdb_options_set_enable_write_thread_adaptive_yield(copy, 0);
+    CheckCondition(
+        0 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy));
+    CheckCondition(1 ==
+                   rocksdb_options_get_enable_write_thread_adaptive_yield(o));
+
+    rocksdb_options_set_max_sequential_skip_in_iterations(copy, 122);
+    CheckCondition(122 ==
+                   rocksdb_options_get_max_sequential_skip_in_iterations(copy));
+    CheckCondition(22 ==
+                   rocksdb_options_get_max_sequential_skip_in_iterations(o));
+
+    rocksdb_options_set_disable_auto_compactions(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_disable_auto_compactions(copy));
+    CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o));
+
+    rocksdb_options_set_optimize_filters_for_hits(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_optimize_filters_for_hits(copy));
+    CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o));
+
+    rocksdb_options_set_delete_obsolete_files_period_micros(copy, 123);
+    CheckCondition(
+        123 == rocksdb_options_get_delete_obsolete_files_period_micros(copy));
+    CheckCondition(23 ==
+                   rocksdb_options_get_delete_obsolete_files_period_micros(o));
+
+    rocksdb_options_set_memtable_prefix_bloom_size_ratio(copy, 4.0);
+    CheckCondition(4.0 ==
+                   rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy));
+    CheckCondition(2.0 ==
+                   rocksdb_options_get_memtable_prefix_bloom_size_ratio(o));
+
+    rocksdb_options_set_max_compaction_bytes(copy, 124);
+    CheckCondition(124 == rocksdb_options_get_max_compaction_bytes(copy));
+    CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o));
+
+    rocksdb_options_set_memtable_huge_page_size(copy, 125);
+    CheckCondition(125 == rocksdb_options_get_memtable_huge_page_size(copy));
+    CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o));
+
+    rocksdb_options_set_max_successive_merges(copy, 126);
+    CheckCondition(126 == rocksdb_options_get_max_successive_merges(copy));
+    CheckCondition(26 == rocksdb_options_get_max_successive_merges(o));
+
+    rocksdb_options_set_bloom_locality(copy, 127);
+    CheckCondition(127 == rocksdb_options_get_bloom_locality(copy));
+    CheckCondition(27 == rocksdb_options_get_bloom_locality(o));
+
+    rocksdb_options_set_inplace_update_support(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_inplace_update_support(copy));
+    CheckCondition(1 == rocksdb_options_get_inplace_update_support(o));
+
+    rocksdb_options_set_inplace_update_num_locks(copy, 128);
+    CheckCondition(128 == rocksdb_options_get_inplace_update_num_locks(copy));
+    CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o));
+
+    rocksdb_options_set_report_bg_io_stats(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_report_bg_io_stats(copy));
+    CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o));
+
+    rocksdb_options_set_wal_recovery_mode(copy, 1);
+    CheckCondition(1 == rocksdb_options_get_wal_recovery_mode(copy));
+    CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o));
+
+    rocksdb_options_set_compression(copy, 4);
+    CheckCondition(4 == rocksdb_options_get_compression(copy));
+    CheckCondition(5 == rocksdb_options_get_compression(o));
+
+    rocksdb_options_set_bottommost_compression(copy, 3);
+    CheckCondition(3 == rocksdb_options_get_bottommost_compression(copy));
+    CheckCondition(4 == rocksdb_options_get_bottommost_compression(o));
+
+    rocksdb_options_set_compaction_style(copy, 1);
+    CheckCondition(1 == rocksdb_options_get_compaction_style(copy));
+    CheckCondition(2 == rocksdb_options_get_compaction_style(o));
+
+    rocksdb_options_set_atomic_flush(copy, 0);
+    CheckCondition(0 == rocksdb_options_get_atomic_flush(copy));
+    CheckCondition(1 == rocksdb_options_get_atomic_flush(o));
+
+    rocksdb_options_set_experimental_mempurge_threshold(copy, 229.0);
+    CheckCondition(229.0 ==
+                   rocksdb_options_get_experimental_mempurge_threshold(copy));
+    CheckCondition(29.0 ==
+                   rocksdb_options_get_experimental_mempurge_threshold(o));
+
+    rocksdb_options_destroy(copy);
+    rocksdb_options_destroy(o);
+  }
+
+  StartPhase("read_options");
+  {
+    rocksdb_readoptions_t* ro;
+    ro = rocksdb_readoptions_create();
+
+    rocksdb_readoptions_set_verify_checksums(ro, 1);
+    CheckCondition(1 == rocksdb_readoptions_get_verify_checksums(ro));
+
+    rocksdb_readoptions_set_fill_cache(ro, 1);
+    CheckCondition(1 == rocksdb_readoptions_get_fill_cache(ro));
+
+    rocksdb_readoptions_set_read_tier(ro, 2);
+    CheckCondition(2 == rocksdb_readoptions_get_read_tier(ro));
+
+    rocksdb_readoptions_set_tailing(ro, 1);
+    CheckCondition(1 == rocksdb_readoptions_get_tailing(ro));
+
+    rocksdb_readoptions_set_readahead_size(ro, 100);
+    CheckCondition(100 == rocksdb_readoptions_get_readahead_size(ro));
+
+    rocksdb_readoptions_set_prefix_same_as_start(ro, 1);
+    CheckCondition(1 == rocksdb_readoptions_get_prefix_same_as_start(ro));
+
+    rocksdb_readoptions_set_pin_data(ro, 1);
+    CheckCondition(1 == rocksdb_readoptions_get_pin_data(ro));
+
+    rocksdb_readoptions_set_total_order_seek(ro, 1);
+    CheckCondition(1 == rocksdb_readoptions_get_total_order_seek(ro));
+
+    rocksdb_readoptions_set_max_skippable_internal_keys(ro, 200);
+    CheckCondition(200 ==
+                   rocksdb_readoptions_get_max_skippable_internal_keys(ro));
+
+    rocksdb_readoptions_set_background_purge_on_iterator_cleanup(ro, 1);
+    CheckCondition(
+        1 == rocksdb_readoptions_get_background_purge_on_iterator_cleanup(ro));
+
+    rocksdb_readoptions_set_ignore_range_deletions(ro, 1);
+    CheckCondition(1 == rocksdb_readoptions_get_ignore_range_deletions(ro));
+
+    rocksdb_readoptions_set_deadline(ro, 300);
+    CheckCondition(300 == rocksdb_readoptions_get_deadline(ro));
+
+    rocksdb_readoptions_set_io_timeout(ro, 400);
+    CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro));
+
+    rocksdb_readoptions_destroy(ro);
+  }
+
+  StartPhase("write_options");
+  {
+    rocksdb_writeoptions_t* wo;
+    wo = rocksdb_writeoptions_create();
+
+    rocksdb_writeoptions_set_sync(wo, 1);
+    CheckCondition(1 == rocksdb_writeoptions_get_sync(wo));
+
+    rocksdb_writeoptions_disable_WAL(wo, 1);
+    CheckCondition(1 == rocksdb_writeoptions_get_disable_WAL(wo));
+
+    rocksdb_writeoptions_set_ignore_missing_column_families(wo, 1);
+    CheckCondition(1 ==
+                   rocksdb_writeoptions_get_ignore_missing_column_families(wo));
+
+    rocksdb_writeoptions_set_no_slowdown(wo, 1);
+    CheckCondition(1 == rocksdb_writeoptions_get_no_slowdown(wo));
+
+    rocksdb_writeoptions_set_low_pri(wo, 1);
+    CheckCondition(1 == rocksdb_writeoptions_get_low_pri(wo));
+
+    rocksdb_writeoptions_set_memtable_insert_hint_per_batch(wo, 1);
+    CheckCondition(1 ==
+                   rocksdb_writeoptions_get_memtable_insert_hint_per_batch(wo));
+
+    rocksdb_writeoptions_destroy(wo);
+  }
+
+  StartPhase("compact_options");
+  {
+    rocksdb_compactoptions_t* co;
+    co = rocksdb_compactoptions_create();
+
+    rocksdb_compactoptions_set_exclusive_manual_compaction(co, 1);
+    CheckCondition(1 ==
+                   rocksdb_compactoptions_get_exclusive_manual_compaction(co));
+
+    rocksdb_compactoptions_set_bottommost_level_compaction(co, 1);
+    CheckCondition(1 ==
+                   rocksdb_compactoptions_get_bottommost_level_compaction(co));
+
+    rocksdb_compactoptions_set_change_level(co, 1);
+    CheckCondition(1 == rocksdb_compactoptions_get_change_level(co));
+
+    rocksdb_compactoptions_set_target_level(co, 1);
+    CheckCondition(1 == rocksdb_compactoptions_get_target_level(co));
+
+    rocksdb_compactoptions_destroy(co);
+  }
+
+  StartPhase("flush_options");
+  {
+    rocksdb_flushoptions_t* fo;
+    fo = rocksdb_flushoptions_create();
+
+    rocksdb_flushoptions_set_wait(fo, 1);
+    CheckCondition(1 == rocksdb_flushoptions_get_wait(fo));
+
+    rocksdb_flushoptions_destroy(fo);
+  }
+
+  StartPhase("cache_options");
+  {
+    rocksdb_cache_t* co;
+    co = rocksdb_cache_create_lru(100);
+    CheckCondition(100 == rocksdb_cache_get_capacity(co));
+
+    rocksdb_cache_set_capacity(co, 200);
+    CheckCondition(200 == rocksdb_cache_get_capacity(co));
+
+    rocksdb_cache_destroy(co);
+  }
+
+  StartPhase("jemalloc_nodump_allocator");
+  {
+    rocksdb_memory_allocator_t* allocator;
+    allocator = rocksdb_jemalloc_nodump_allocator_create(&err);
+    if (err != NULL) {
+      // not supported on all platforms, allow unsupported error
+      const char* ni = "Not implemented: ";
+      size_t ni_len = strlen(ni);
+      size_t err_len = strlen(err);
+
+      CheckCondition(err_len >= ni_len);
+      CheckCondition(memcmp(ni, err, ni_len) == 0);
+      Free(&err);
+    } else {
+      rocksdb_cache_t* co;
+      rocksdb_lru_cache_options_t* copts;
+
+      copts = rocksdb_lru_cache_options_create();
+
+      rocksdb_lru_cache_options_set_capacity(copts, 100);
+      rocksdb_lru_cache_options_set_memory_allocator(copts, allocator);
+
+      co = rocksdb_cache_create_lru_opts(copts);
+      CheckCondition(100 == rocksdb_cache_get_capacity(co));
+
+      rocksdb_cache_destroy(co);
+      rocksdb_lru_cache_options_destroy(copts);
+    }
+    rocksdb_memory_allocator_destroy(allocator);
+  }
+
+  StartPhase("env");
+  {
+    rocksdb_env_t* e;
+    e = rocksdb_create_default_env();
+
+    rocksdb_env_set_background_threads(e, 10);
+    CheckCondition(10 == rocksdb_env_get_background_threads(e));
+
+    rocksdb_env_set_high_priority_background_threads(e, 20);
+    CheckCondition(20 == rocksdb_env_get_high_priority_background_threads(e));
+
+    rocksdb_env_set_low_priority_background_threads(e, 30);
+    CheckCondition(30 == rocksdb_env_get_low_priority_background_threads(e));
+
+    rocksdb_env_set_bottom_priority_background_threads(e, 40);
+    CheckCondition(40 == rocksdb_env_get_bottom_priority_background_threads(e));
+
+    rocksdb_env_destroy(e);
+  }
+
+  StartPhase("universal_compaction_options");
+  {
+    rocksdb_universal_compaction_options_t* uco;
+    uco = rocksdb_universal_compaction_options_create();
+
+    rocksdb_universal_compaction_options_set_size_ratio(uco, 5);
+    CheckCondition(5 ==
+                   rocksdb_universal_compaction_options_get_size_ratio(uco));
+
+    rocksdb_universal_compaction_options_set_min_merge_width(uco, 15);
+    CheckCondition(
+        15 == rocksdb_universal_compaction_options_get_min_merge_width(uco));
+
+    rocksdb_universal_compaction_options_set_max_merge_width(uco, 25);
+    CheckCondition(
+        25 == rocksdb_universal_compaction_options_get_max_merge_width(uco));
+
+    rocksdb_universal_compaction_options_set_max_size_amplification_percent(uco,
+                                                                            35);
+    CheckCondition(
+        35 ==
+        rocksdb_universal_compaction_options_get_max_size_amplification_percent(
+            uco));
+
+    rocksdb_universal_compaction_options_set_compression_size_percent(uco, 45);
+    CheckCondition(
+        45 ==
+        rocksdb_universal_compaction_options_get_compression_size_percent(uco));
+
+    rocksdb_universal_compaction_options_set_stop_style(uco, 1);
+    CheckCondition(1 ==
+                   rocksdb_universal_compaction_options_get_stop_style(uco));
+
+    rocksdb_universal_compaction_options_destroy(uco);
+  }
+
+  StartPhase("fifo_compaction_options");
+  {
+    rocksdb_fifo_compaction_options_t* fco;
+    fco = rocksdb_fifo_compaction_options_create();
+
+    rocksdb_fifo_compaction_options_set_max_table_files_size(fco, 100000);
+    CheckCondition(
+        100000 ==
+        rocksdb_fifo_compaction_options_get_max_table_files_size(fco));
+
+    rocksdb_fifo_compaction_options_destroy(fco);
+  }
+
+  StartPhase("backup_engine_option");
+  {
+    rocksdb_backup_engine_options_t* bdo;
+    bdo = rocksdb_backup_engine_options_create("path");
+
+    rocksdb_backup_engine_options_set_share_table_files(bdo, 1);
+    CheckCondition(1 ==
+                   rocksdb_backup_engine_options_get_share_table_files(bdo));
+
+    rocksdb_backup_engine_options_set_sync(bdo, 1);
+    CheckCondition(1 == rocksdb_backup_engine_options_get_sync(bdo));
+
+    rocksdb_backup_engine_options_set_destroy_old_data(bdo, 1);
+    CheckCondition(1 ==
+                   rocksdb_backup_engine_options_get_destroy_old_data(bdo));
+
+    rocksdb_backup_engine_options_set_backup_log_files(bdo, 1);
+    CheckCondition(1 ==
+                   rocksdb_backup_engine_options_get_backup_log_files(bdo));
+
+    rocksdb_backup_engine_options_set_backup_rate_limit(bdo, 123);
+    CheckCondition(123 ==
+                   rocksdb_backup_engine_options_get_backup_rate_limit(bdo));
+
+    rocksdb_backup_engine_options_set_restore_rate_limit(bdo, 37);
+    CheckCondition(37 ==
+                   rocksdb_backup_engine_options_get_restore_rate_limit(bdo));
+
+    rocksdb_backup_engine_options_set_max_background_operations(bdo, 20);
+    CheckCondition(
+        20 == rocksdb_backup_engine_options_get_max_background_operations(bdo));
+
+    rocksdb_backup_engine_options_set_callback_trigger_interval_size(bdo, 9000);
+    CheckCondition(
+        9000 ==
+        rocksdb_backup_engine_options_get_callback_trigger_interval_size(bdo));
+
+    rocksdb_backup_engine_options_set_max_valid_backups_to_open(bdo, 40);
+    CheckCondition(
+        40 == rocksdb_backup_engine_options_get_max_valid_backups_to_open(bdo));
+
+    rocksdb_backup_engine_options_set_share_files_with_checksum_naming(bdo, 2);
+    CheckCondition(
+        2 == rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+                 bdo));
+
+    rocksdb_backup_engine_options_destroy(bdo);
+  }
+
+  StartPhase("compression_options");
+  {
+    rocksdb_options_t* co;
+    co = rocksdb_options_create();
+
+    rocksdb_options_set_compression_options_zstd_max_train_bytes(co, 100);
+    CheckCondition(
+        100 ==
+        rocksdb_options_get_compression_options_zstd_max_train_bytes(co));
+
+    rocksdb_options_set_compression_options_parallel_threads(co, 2);
+    CheckCondition(
+        2 == rocksdb_options_get_compression_options_parallel_threads(co));
+
+    rocksdb_options_set_compression_options_max_dict_buffer_bytes(co, 200);
+    CheckCondition(
+        200 ==
+        rocksdb_options_get_compression_options_max_dict_buffer_bytes(co));
+
+    rocksdb_options_set_compression_options_use_zstd_dict_trainer(co, 0);
+    CheckCondition(
+        0 == rocksdb_options_get_compression_options_use_zstd_dict_trainer(co));
+    rocksdb_options_destroy(co);
+  }
+
+  StartPhase("iterate_upper_bound");
+  {
+    // Create new empty database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_options_set_prefix_extractor(options, NULL);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_put(db, woptions, "a", 1, "0", 1, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, "g1", 2, "0", 1, &err);
+    CheckNoError(err);
+
+    // testing basic case with no iterate_upper_bound and no prefix_extractor
+    {
+      rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+      rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+      rocksdb_iter_seek(iter, "foo", 3);
+      CheckCondition(rocksdb_iter_valid(iter));
+      CheckIter(iter, "foo", "bar");
+
+      rocksdb_iter_next(iter);
+      CheckCondition(rocksdb_iter_valid(iter));
+      CheckIter(iter, "foo1", "bar1");
+
+      rocksdb_iter_next(iter);
+      CheckCondition(rocksdb_iter_valid(iter));
+      CheckIter(iter, "g1", "0");
+
+      rocksdb_iter_destroy(iter);
+    }
+
+    // testing iterate_upper_bound and forward iterator
+    // to make sure it stops at bound
+    {
+      // iterate_upper_bound points beyond the last expected entry
+      rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
+
+      rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+      rocksdb_iter_seek(iter, "foo", 3);
+      CheckCondition(rocksdb_iter_valid(iter));
+      CheckIter(iter, "foo", "bar");
+
+      rocksdb_iter_next(iter);
+      CheckCondition(rocksdb_iter_valid(iter));
+      CheckIter(iter, "foo1", "bar1");
+
+      rocksdb_iter_next(iter);
+      // should stop here...
+      CheckCondition(!rocksdb_iter_valid(iter));
+
+      rocksdb_iter_destroy(iter);
+      rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+    }
+  }
+
+  StartPhase("transactions");
+  {
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    // open a TransactionDB
+    txn_db_options = rocksdb_transactiondb_options_create();
+    txn_options = rocksdb_transaction_options_create();
+    rocksdb_options_set_create_if_missing(options, 1);
+    txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+    CheckNoError(err);
+
+    // put outside a transaction
+    rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hello", 5, &err);
+    CheckNoError(err);
+    CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+    CheckTxnDBPinGet(txn_db, roptions, "foo", "hello");
+
+    // delete from outside transaction
+    rocksdb_transactiondb_delete(txn_db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+    CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+    CheckTxnDBPinGet(txn_db, roptions, "foo", NULL);
+
+    // write batch into TransactionDB
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+    rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+    rocksdb_writebatch_delete(wb, "bar", 3);
+    rocksdb_transactiondb_write(txn_db, woptions, wb, &err);
+    rocksdb_writebatch_destroy(wb);
+    CheckTxnDBGet(txn_db, roptions, "box", "c");
+    CheckTxnDBPinGet(txn_db, roptions, "box", "c");
+    CheckNoError(err);
+
+    // multi get
+    {
+      const char* keys[3] = {"box", "foo", "notfound"};
+      const size_t keys_sizes[3] = {3, 3, 8};
+      char* vals[3];
+      size_t vals_sizes[3];
+      char* errs[3];
+      const char* expected[3] = {"c", NULL, NULL};
+      rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes,
+                                      vals, vals_sizes, errs);
+      CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+    }
+
+    // begin a transaction
+    txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+    // put
+    rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+    CheckNoError(err);
+    CheckTxnGet(txn, roptions, "foo", "hello");
+    CheckTxnPinGet(txn, roptions, "foo", "hello");
+    {
+      const char* keys[3] = {"box", "foo", "notfound"};
+      const size_t keys_sizes[3] = {3, 3, 8};
+      char* vals[3];
+      size_t vals_sizes[3];
+      char* errs[3];
+      const char* expected[3] = {"c", "hello", NULL};
+      rocksdb_transaction_multi_get(txn, roptions, 3, keys, keys_sizes, vals,
+                                    vals_sizes, errs);
+      CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+    }
+    // delete
+    rocksdb_transaction_delete(txn, "foo", 3, &err);
+    CheckNoError(err);
+    CheckTxnGet(txn, roptions, "foo", NULL);
+    CheckTxnPinGet(txn, roptions, "foo", NULL);
+
+    rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+    CheckNoError(err);
+
+    // read from outside transaction, before commit
+    CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+    CheckTxnDBPinGet(txn_db, roptions, "foo", NULL);
+    {
+      const char* keys[3] = {"box", "foo", "notfound"};
+      const size_t keys_sizes[3] = {3, 3, 8};
+      char* vals[3];
+      size_t vals_sizes[3];
+      char* errs[3];
+      const char* expected[3] = {"c", NULL, NULL};
+      rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes,
+                                      vals, vals_sizes, errs);
+      CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+    }
+
+    // commit
+    rocksdb_transaction_commit(txn, &err);
+    CheckNoError(err);
+
+    // read from outside transaction, after commit
+    CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+    CheckTxnDBPinGet(txn_db, roptions, "foo", "hello");
+    {
+      const char* keys[3] = {"box", "foo", "notfound"};
+      const size_t keys_sizes[3] = {3, 3, 8};
+      char* vals[3];
+      size_t vals_sizes[3];
+      char* errs[3];
+      const char* expected[3] = {"c", "hello", NULL};
+      rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes,
+                                      vals, vals_sizes, errs);
+      CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+    }
+
+    // reuse old transaction
+    txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, txn);
+
+    // snapshot
+    const rocksdb_snapshot_t* snapshot;
+    snapshot = rocksdb_transactiondb_create_snapshot(txn_db);
+    rocksdb_readoptions_set_snapshot(roptions, snapshot);
+
+    rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err);
+    CheckNoError(err);
+
+    CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+    CheckTxnDBPinGet(txn_db, roptions, "foo", "hello");
+    rocksdb_readoptions_set_snapshot(roptions, NULL);
+    rocksdb_transactiondb_release_snapshot(txn_db, snapshot);
+    CheckTxnDBGet(txn_db, roptions, "foo", "hey");
+    CheckTxnDBPinGet(txn_db, roptions, "foo", "hey");
+
+    // iterate
+    rocksdb_transaction_put(txn, "bar", 3, "hi", 2, &err);
+    rocksdb_iterator_t* iter =
+        rocksdb_transaction_create_iterator(txn, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "bar", "hi");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+
+    // rollback
+    rocksdb_transaction_rollback(txn, &err);
+    CheckNoError(err);
+    CheckTxnDBGet(txn_db, roptions, "bar", NULL);
+    CheckTxnDBPinGet(txn_db, roptions, "bar", NULL);
+
+    // save point
+    rocksdb_transaction_put(txn, "foo1", 4, "hi1", 3, &err);
+    rocksdb_transaction_set_savepoint(txn);
+    CheckTxnGet(txn, roptions, "foo1", "hi1");
+    CheckTxnPinGet(txn, roptions, "foo1", "hi1");
+    rocksdb_transaction_put(txn, "foo2", 4, "hi2", 3, &err);
+    CheckTxnGet(txn, roptions, "foo2", "hi2");
+    CheckTxnPinGet(txn, roptions, "foo2", "hi2");
+
+    // rollback to savepoint
+    rocksdb_transaction_rollback_to_savepoint(txn, &err);
+    CheckNoError(err);
+    CheckTxnGet(txn, roptions, "foo2", NULL);
+    CheckTxnGet(txn, roptions, "foo1", "hi1");
+    CheckTxnPinGet(txn, roptions, "foo2", NULL);
+    CheckTxnPinGet(txn, roptions, "foo1", "hi1");
+    CheckTxnDBGet(txn_db, roptions, "foo1", NULL);
+    CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+    CheckTxnDBPinGet(txn_db, roptions, "foo1", NULL);
+    CheckTxnDBPinGet(txn_db, roptions, "foo2", NULL);
+    rocksdb_transaction_commit(txn, &err);
+    CheckNoError(err);
+    CheckTxnDBGet(txn_db, roptions, "foo1", "hi1");
+    CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+    CheckTxnDBPinGet(txn_db, roptions, "foo1", "hi1");
+    CheckTxnDBPinGet(txn_db, roptions, "foo2", NULL);
+
+    // Column families.
+    rocksdb_column_family_handle_t* cfh;
+    cfh = rocksdb_transactiondb_create_column_family(txn_db, options,
+                                                     "txn_db_cf", &err);
+    CheckNoError(err);
+
+    rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello",
+                                 8, &err);
+    CheckNoError(err);
+    CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello");
+    CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello");
+    {
+      const rocksdb_column_family_handle_t* get_handles[2] = {cfh, cfh};
+      const char* keys[2] = {"cf_foo", "notfound"};
+      const size_t keys_sizes[2] = {6, 8};
+      char* vals[2];
+      size_t vals_sizes[2];
+      char* errs[2];
+      const char* expected[2] = {"cf_hello", NULL};
+      rocksdb_transactiondb_multi_get_cf(txn_db, roptions, get_handles, 2, keys,
+                                         keys_sizes, vals, vals_sizes, errs);
+      CheckMultiGetValues(2, vals, vals_sizes, errs, expected);
+    }
+
+    rocksdb_transactiondb_delete_cf(txn_db, woptions, cfh, "cf_foo", 6, &err);
+    CheckNoError(err);
+    CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
+    CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
+
+    // flush
+    rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
+    rocksdb_flushoptions_set_wait(flush_options, 1);
+    rocksdb_transactiondb_flush_wal(txn_db, 1, &err);
+    CheckNoError(err);
+    rocksdb_transactiondb_flush_cf(txn_db, flush_options, cfh, &err);
+    CheckNoError(err);
+    rocksdb_transactiondb_flush(txn_db, flush_options, &err);
+    CheckNoError(err);
+    rocksdb_flushoptions_destroy(flush_options);
+
+    // close and destroy
+    rocksdb_column_family_handle_destroy(cfh);
+    rocksdb_transaction_destroy(txn);
+    rocksdb_transactiondb_close(txn_db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_transaction_options_destroy(txn_options);
+    rocksdb_transactiondb_options_destroy(txn_db_options);
+  }
+
+  StartPhase("two-phase commit");
+  {
+    // open a TransactionDB
+    txn_db_options = rocksdb_transactiondb_options_create();
+    txn_options = rocksdb_transaction_options_create();
+    rocksdb_options_set_create_if_missing(options, 1);
+    txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_transaction_options_set_skip_prepare(txn_options, 0);
+    txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+    rocksdb_transaction_commit(txn, &err);
+    CheckCondition(err != NULL);
+    Free(&err);
+    err = NULL;
+    rocksdb_transaction_prepare(txn, &err);
+    CheckCondition(err != NULL);
+    Free(&err);
+    err = NULL;
+    rocksdb_transaction_set_name(txn, "txn1", 4, &err);
+    CheckNoError(err);
+    rocksdb_transaction_prepare(txn, &err);
+    CheckNoError(err);
+    rocksdb_transaction_commit(txn, &err);
+    CheckNoError(err);
+    rocksdb_transaction_destroy(txn);
+
+    // prepare 2 transactions and close db.
+    rocksdb_transaction_t* txn1 =
+        rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+    rocksdb_transaction_put(txn1, "bar1", 4, "1", 1, &err);
+    CheckNoError(err);
+    rocksdb_transaction_set_name(txn1, "txn1", 4, &err);
+    CheckNoError(err);
+    rocksdb_transaction_prepare(txn1, &err);
+    CheckNoError(err);
+    rocksdb_transaction_t* txn2 =
+        rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+    rocksdb_transaction_put(txn2, "bar2", 4, "2", 1, &err);
+    CheckNoError(err);
+    rocksdb_transaction_set_name(txn2, "txn2", 4, &err);
+    CheckNoError(err);
+    rocksdb_transaction_prepare(txn2, &err);
+    CheckNoError(err);
+    rocksdb_transaction_destroy(txn1);
+    rocksdb_transaction_destroy(txn2);
+    rocksdb_transactiondb_close(txn_db);
+    rocksdb_transaction_options_destroy(txn_options);
+    rocksdb_transactiondb_options_destroy(txn_db_options);
+
+    // reopen db and get all prepared.
+    txn_db_options = rocksdb_transactiondb_options_create();
+    txn_options = rocksdb_transaction_options_create();
+    rocksdb_options_set_error_if_exists(options, 0);
+    txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+    CheckNoError(err);
+    CheckTxnDBPinGet(txn_db, roptions, "bar1", NULL);
+    CheckTxnDBPinGet(txn_db, roptions, "bar2", NULL);
+    size_t cnt;
+    rocksdb_transaction_t** txns =
+        rocksdb_transactiondb_get_prepared_transactions(txn_db, &cnt);
+    CheckCondition(cnt == 2);
+    size_t i;
+    for (i = 0; i < cnt; i++) {
+      txn = txns[i];
+      size_t name_len = 0;
+      char* name = rocksdb_transaction_get_name(txn, &name_len);
+      CheckCondition(name_len == 4);
+      if (strncmp(name, "txn1", name_len) == 0) {
+        rocksdb_transaction_commit(txn, &err);
+      } else if (strncmp(name, "txn2", name_len) == 0) {
+        rocksdb_transaction_rollback(txn, &err);
+      }
+      rocksdb_free(name);
+      CheckNoError(err);
+      rocksdb_transaction_destroy(txn);
+    }
+    rocksdb_free(txns);
+    CheckTxnDBGet(txn_db, roptions, "bar1", "1");
+    CheckTxnDBGet(txn_db, roptions, "bar2", NULL);
+    rocksdb_transactiondb_put(txn_db, woptions, "bar2", 4, "2", 1, &err);
+    CheckNoError(err);
+
+    // close and destroy
+    rocksdb_transactiondb_close(txn_db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_transaction_options_destroy(txn_options);
+    rocksdb_transactiondb_options_destroy(txn_db_options);
+  }
+
+  StartPhase("optimistic_transactions");
+  {
+    rocksdb_options_t* db_options = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(db_options, 1);
+    rocksdb_options_set_allow_concurrent_memtable_write(db_options, 1);
+    otxn_db = rocksdb_optimistictransactiondb_open(db_options, dbname, &err);
+    otxn_options = rocksdb_optimistictransaction_options_create();
+    rocksdb_transaction_t* txn1 = rocksdb_optimistictransaction_begin(
+        otxn_db, woptions, otxn_options, NULL);
+    rocksdb_transaction_t* txn2 = rocksdb_optimistictransaction_begin(
+        otxn_db, woptions, otxn_options, NULL);
+    rocksdb_transaction_put(txn1, "key", 3, "value", 5, &err);
+    CheckNoError(err);
+    rocksdb_transaction_put(txn2, "key1", 4, "value1", 6, &err);
+    CheckNoError(err);
+    CheckTxnGet(txn1, roptions, "key", "value");
+    CheckTxnPinGet(txn1, roptions, "key", "value");
+    rocksdb_transaction_commit(txn1, &err);
+    CheckNoError(err);
+    rocksdb_transaction_commit(txn2, &err);
+    CheckNoError(err);
+    rocksdb_transaction_destroy(txn1);
+    rocksdb_transaction_destroy(txn2);
+
+    // Check column family
+    db = rocksdb_optimistictransactiondb_get_base_db(otxn_db);
+    rocksdb_put(db, woptions, "key", 3, "value", 5, &err);
+    CheckNoError(err);
+    rocksdb_column_family_handle_t *cfh1, *cfh2;
+    cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err);
+    cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err);
+    txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+                                              NULL);
+    rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err);
+    CheckNoError(err);
+    rocksdb_transaction_put_cf(txn, cfh2, "key_cf2", 7, "val_cf2", 7, &err);
+    CheckNoError(err);
+    rocksdb_transaction_commit(txn, &err);
+    CheckNoError(err);
+    txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+                                              txn);
+    CheckGetCF(db, roptions, cfh1, "key_cf1", "val_cf1");
+    CheckTxnGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+    CheckTxnPinGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+    {
+      const rocksdb_column_family_handle_t* get_handles[3] = {cfh1, cfh2, cfh2};
+      const char* keys[3] = {"key_cf1", "key_cf2", "notfound"};
+      const size_t keys_sizes[3] = {7, 7, 8};
+      char* vals[3];
+      size_t vals_sizes[3];
+      char* errs[3];
+      const char* expected[3] = {"val_cf1", "val_cf2", NULL};
+      rocksdb_transaction_multi_get_cf(txn, roptions, get_handles, 3, keys,
+                                       keys_sizes, vals, vals_sizes, errs);
+      CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+    }
+
+    // Check iterator with column family
+    rocksdb_transaction_put_cf(txn, cfh1, "key1_cf", 7, "val1_cf", 7, &err);
+    CheckNoError(err);
+    rocksdb_iterator_t* iter =
+        rocksdb_transaction_create_iterator_cf(txn, roptions, cfh1);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "key1_cf", "val1_cf");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+
+    rocksdb_transaction_destroy(txn);
+    rocksdb_column_family_handle_destroy(cfh1);
+    rocksdb_column_family_handle_destroy(cfh2);
+    rocksdb_optimistictransactiondb_close_base_db(db);
+    rocksdb_optimistictransactiondb_close(otxn_db);
+
+    // Check open optimistic transaction db with column families
+    size_t cf_len;
+    char** column_fams =
+        rocksdb_list_column_families(db_options, dbname, &cf_len, &err);
+    CheckNoError(err);
+    CheckEqual("default", column_fams[0], 7);
+    CheckEqual("txn_db_cf1", column_fams[1], 10);
+    CheckEqual("txn_db_cf2", column_fams[2], 10);
+    CheckCondition(cf_len == 3);
+    rocksdb_list_column_families_destroy(column_fams, cf_len);
+
+    const char* cf_names[3] = {"default", "txn_db_cf1", "txn_db_cf2"};
+    rocksdb_options_t* cf_options = rocksdb_options_create();
+    const rocksdb_options_t* cf_opts[3] = {cf_options, cf_options, cf_options};
+
+    rocksdb_options_set_error_if_exists(cf_options, 0);
+    rocksdb_column_family_handle_t* cf_handles[3];
+    otxn_db = rocksdb_optimistictransactiondb_open_column_families(
+        db_options, dbname, 3, cf_names, cf_opts, cf_handles, &err);
+    CheckNoError(err);
+    rocksdb_transaction_t* txn_cf = rocksdb_optimistictransaction_begin(
+        otxn_db, woptions, otxn_options, NULL);
+    CheckTxnGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+    CheckTxnGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+    CheckTxnGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+    CheckTxnPinGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+    CheckTxnPinGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+    CheckTxnPinGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+    rocksdb_transaction_destroy(txn_cf);
+    rocksdb_options_destroy(cf_options);
+    rocksdb_column_family_handle_destroy(cf_handles[0]);
+    rocksdb_column_family_handle_destroy(cf_handles[1]);
+    rocksdb_column_family_handle_destroy(cf_handles[2]);
+    rocksdb_optimistictransactiondb_close(otxn_db);
+    rocksdb_destroy_db(db_options, dbname, &err);
+    rocksdb_options_destroy(db_options);
+    rocksdb_optimistictransaction_options_destroy(otxn_options);
+    CheckNoError(err);
+  }
+
+  // Simple sanity check that setting memtable rep works.
+  StartPhase("memtable_reps");
+  {
+    // Create database with vector memtable.
+    rocksdb_options_set_memtable_vector_rep(options);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    // Create database with hash skiplist memtable.
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+  }
+
+  // Check that secondary instance works.
+  StartPhase("open_as_secondary");
+  {
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+
+    rocksdb_options_t* db_options = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(db_options, 1);
+    db = rocksdb_open(db_options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_t* db1;
+    rocksdb_options_t* opts = rocksdb_options_create();
+    rocksdb_options_set_max_open_files(opts, -1);
+    rocksdb_options_set_create_if_missing(opts, 1);
+    snprintf(secondary_path, sizeof(secondary_path),
+             "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid()));
+    db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err);
+    CheckNoError(err);
+
+    rocksdb_writeoptions_set_sync(woptions, 0);
+    rocksdb_writeoptions_disable_WAL(woptions, 1);
+    rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err);
+    CheckNoError(err);
+    rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create();
+    rocksdb_flushoptions_set_wait(flush_opts, 1);
+    rocksdb_flush(db, flush_opts, &err);
+    CheckNoError(err);
+    rocksdb_try_catch_up_with_primary(db1, &err);
+    CheckNoError(err);
+    rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+    rocksdb_readoptions_set_verify_checksums(ropts, 1);
+    rocksdb_readoptions_set_snapshot(ropts, NULL);
+    CheckGet(db, ropts, "key0", "value0");
+    CheckGet(db1, ropts, "key0", "value0");
+
+    rocksdb_writeoptions_disable_WAL(woptions, 0);
+    rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err);
+    CheckNoError(err);
+    rocksdb_try_catch_up_with_primary(db1, &err);
+    CheckNoError(err);
+    CheckGet(db1, ropts, "key0", "value0");
+    CheckGet(db1, ropts, "key1", "value1");
+
+    rocksdb_close(db1);
+    rocksdb_destroy_db(opts, secondary_path, &err);
+    CheckNoError(err);
+
+    rocksdb_options_destroy(db_options);
+    rocksdb_options_destroy(opts);
+    rocksdb_readoptions_destroy(ropts);
+    rocksdb_flushoptions_destroy(flush_opts);
+  }
+
+  // Simple sanity check that options setting db_paths work.
+  StartPhase("open_db_paths");
+  {
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+
+    const rocksdb_dbpath_t* paths[1] = {dbpath};
+    rocksdb_options_set_db_paths(options, paths, 1);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+  }
+
+  StartPhase("filter_with_prefix_seek");
+  {
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_options_set_prefix_extractor(
+        options, rocksdb_slicetransform_create_fixed_prefix(1));
+    rocksdb_filterpolicy_t* filter_policy =
+        rocksdb_filterpolicy_create_bloom_full(8.0);
+    rocksdb_block_based_options_set_filter_policy(table_options, filter_policy);
+    rocksdb_options_set_block_based_table_factory(options, table_options);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    int i;
+    for (i = 0; i < 10; ++i) {
+      char key = '0' + (char)i;
+      rocksdb_put(db, woptions, &key, 1, "", 1, &err);
+      CheckNoError(err);
+    }
+
+    // Flush to generate an L0 so that filter will be used later.
+    rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
+    rocksdb_flushoptions_set_wait(flush_options, 1);
+    rocksdb_flush(db, flush_options, &err);
+    rocksdb_flushoptions_destroy(flush_options);
+    CheckNoError(err);
+
+    rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, ropts);
+
+    rocksdb_iter_seek(iter, "0", 1);
+    int cnt = 0;
+    while (rocksdb_iter_valid(iter)) {
+      ++cnt;
+      rocksdb_iter_next(iter);
+    }
+    CheckCondition(10 == cnt);
+
+    rocksdb_iter_destroy(iter);
+    rocksdb_readoptions_destroy(ropts);
+  }
+
+  StartPhase("cancel_all_background_work");
+  rocksdb_cancel_all_background_work(db, 1);
+
+  StartPhase("cleanup");
+  rocksdb_close(db);
+  rocksdb_options_destroy(options);
+  rocksdb_block_based_options_destroy(table_options);
+  rocksdb_readoptions_destroy(roptions);
+  rocksdb_writeoptions_destroy(woptions);
+  rocksdb_compactoptions_destroy(coptions);
+  rocksdb_cache_destroy(cache);
+  rocksdb_comparator_destroy(cmp);
+  rocksdb_dbpath_destroy(dbpath);
+  rocksdb_env_destroy(env);
+
+  fprintf(stderr, "PASS\n");
+  return 0;
+}
+
+#else
+
+int main(void) {
+  fprintf(stderr, "SKIPPED\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/column_family.cc b/src/rocksdb/db/column_family.cc
new file mode 100644
index 000000000..268060ddf
--- /dev/null
+++ b/src/rocksdb/db/column_family.cc
@@ -0,0 +1,1683 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/column_family.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_source.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/db_impl/db_impl.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_properties_collector.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/table.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
+    ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
+    : cfd_(column_family_data), db_(db), mutex_(mutex) {
+  if (cfd_ != nullptr) {
+    cfd_->Ref();
+  }
+}
+
+ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
+  if (cfd_ != nullptr) {
+#ifndef ROCKSDB_LITE
+    for (auto& listener : cfd_->ioptions()->listeners) {
+      listener->OnColumnFamilyHandleDeletionStarted(this);
+    }
+#endif  // ROCKSDB_LITE
+    // Job id == 0 means that this is not our background process, but rather
+    // user thread
+    // Need to hold some shared pointers owned by the initial_cf_options
+    // before final cleaning up finishes.
+    ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options();
+    JobContext job_context(0);
+    mutex_->Lock();
+    bool dropped = cfd_->IsDropped();
+    if (cfd_->UnrefAndTryDelete()) {
+      if (dropped) {
+        db_->FindObsoleteFiles(&job_context, false, true);
+      }
+    }
+    mutex_->Unlock();
+    if (job_context.HaveSomethingToDelete()) {
+      bool defer_purge =
+          db_->immutable_db_options().avoid_unnecessary_blocking_io;
+      db_->PurgeObsoleteFiles(job_context, defer_purge);
+    }
+    job_context.Clean();
+  }
+}
+
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+
+const std::string& ColumnFamilyHandleImpl::GetName() const {
+  return cfd()->GetName();
+}
+
+Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
+#ifndef ROCKSDB_LITE
+  // accessing mutable cf-options requires db mutex.
+  InstrumentedMutexLock l(mutex_);
+  *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
+  return Status::OK();
+#else
+  (void)desc;
+  return Status::NotSupported();
+#endif  // !ROCKSDB_LITE
+}
+
+const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
+  return cfd()->user_comparator();
+}
+
+void GetIntTblPropCollectorFactory(
+    const ImmutableCFOptions& ioptions,
+    IntTblPropCollectorFactories* int_tbl_prop_collector_factories) {
+  assert(int_tbl_prop_collector_factories);
+
+  auto& collector_factories = ioptions.table_properties_collector_factories;
+  for (size_t i = 0; i < ioptions.table_properties_collector_factories.size();
+       ++i) {
+    assert(collector_factories[i]);
+    int_tbl_prop_collector_factories->emplace_back(
+        new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
+  }
+}
+
+Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
+  if (!cf_options.compression_per_level.empty()) {
+    for (size_t level = 0; level < cf_options.compression_per_level.size();
+         ++level) {
+      if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
+        return Status::InvalidArgument(
+            "Compression type " +
+            CompressionTypeToString(cf_options.compression_per_level[level]) +
+            " is not linked with the binary.");
+      }
+    }
+  } else {
+    if (!CompressionTypeSupported(cf_options.compression)) {
+      return Status::InvalidArgument(
+          "Compression type " +
+          CompressionTypeToString(cf_options.compression) +
+          " is not linked with the binary.");
+    }
+  }
+  if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
+    if (cf_options.compression_opts.use_zstd_dict_trainer) {
+      if (!ZSTD_TrainDictionarySupported()) {
+        return Status::InvalidArgument(
+            "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
+            "is not linked with the binary.");
+      }
+    } else if (!ZSTD_FinalizeDictionarySupported()) {
+      return Status::InvalidArgument(
+          "zstd finalizeDictionary cannot be used because ZSTD 1.4.5+ "
+          "is not linked with the binary.");
+    }
+    if (cf_options.compression_opts.max_dict_bytes == 0) {
+      return Status::InvalidArgument(
+          "The dictionary size limit (`CompressionOptions::max_dict_bytes`) "
+          "should be nonzero if we're using zstd's dictionary generator.");
+    }
+  }
+
+  if (!CompressionTypeSupported(cf_options.blob_compression_type)) {
+    std::ostringstream oss;
+    oss << "The specified blob compression type "
+        << CompressionTypeToString(cf_options.blob_compression_type)
+        << " is not available.";
+
+    return Status::InvalidArgument(oss.str());
+  }
+
+  return Status::OK();
+}
+
+Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
+  if (cf_options.inplace_update_support) {
+    return Status::InvalidArgument(
+        "In-place memtable updates (inplace_update_support) is not compatible "
+        "with concurrent writes (allow_concurrent_memtable_write)");
+  }
+  if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
+    return Status::InvalidArgument(
+        "Memtable doesn't concurrent writes (allow_concurrent_memtable_write)");
+  }
+  return Status::OK();
+}
+
+Status CheckCFPathsSupported(const DBOptions& db_options,
+                             const ColumnFamilyOptions& cf_options) {
+  // More than one cf_paths are supported only in universal
+  // and level compaction styles. This function also checks the case
+  // in which cf_paths is not specified, which results in db_paths
+  // being used.
+  if ((cf_options.compaction_style != kCompactionStyleUniversal) &&
+      (cf_options.compaction_style != kCompactionStyleLevel)) {
+    if (cf_options.cf_paths.size() > 1) {
+      return Status::NotSupported(
+          "More than one CF paths are only supported in "
+          "universal and level compaction styles. ");
+    } else if (cf_options.cf_paths.empty() && db_options.db_paths.size() > 1) {
+      return Status::NotSupported(
+          "More than one DB paths are only supported in "
+          "universal and level compaction styles. ");
+    }
+  }
+  return Status::OK();
+}
+
+namespace {
+const uint64_t kDefaultTtl = 0xfffffffffffffffe;
+const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
+}  // anonymous namespace
+
+ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+                                    const ColumnFamilyOptions& src) {
+  ColumnFamilyOptions result = src;
+  size_t clamp_max = std::conditional<
+      sizeof(size_t) == 4, std::integral_constant<size_t, 0xffffffff>,
+      std::integral_constant<uint64_t, 64ull << 30>>::type::value;
+  ClipToRange(&result.write_buffer_size, (static_cast<size_t>(64)) << 10,
+              clamp_max);
+  // if user sets arena_block_size, we trust user to use this value. Otherwise,
+  // calculate a proper value from writer_buffer_size;
+  if (result.arena_block_size <= 0) {
+    result.arena_block_size =
+        std::min(size_t{1024 * 1024}, result.write_buffer_size / 8);
+
+    // Align up to 4k
+    const size_t align = 4 * 1024;
+    result.arena_block_size =
+        ((result.arena_block_size + align - 1) / align) * align;
+  }
+  result.min_write_buffer_number_to_merge =
+      std::min(result.min_write_buffer_number_to_merge,
+               result.max_write_buffer_number - 1);
+  if (result.min_write_buffer_number_to_merge < 1) {
+    result.min_write_buffer_number_to_merge = 1;
+  }
+
+  if (db_options.atomic_flush && result.min_write_buffer_number_to_merge > 1) {
+    ROCKS_LOG_WARN(
+        db_options.logger,
+        "Currently, if atomic_flush is true, then triggering flush for any "
+        "column family internally (non-manual flush) will trigger flushing "
+        "all column families even if the number of memtables is smaller "
+        "min_write_buffer_number_to_merge. Therefore, configuring "
+        "min_write_buffer_number_to_merge > 1 is not compatible and should "
+        "be satinized to 1. Not doing so will lead to data loss and "
+        "inconsistent state across multiple column families when WAL is "
+        "disabled, which is a common setting for atomic flush");
+
+    result.min_write_buffer_number_to_merge = 1;
+  }
+
+  if (result.num_levels < 1) {
+    result.num_levels = 1;
+  }
+  if (result.compaction_style == kCompactionStyleLevel &&
+      result.num_levels < 2) {
+    result.num_levels = 2;
+  }
+
+  if (result.compaction_style == kCompactionStyleUniversal &&
+      db_options.allow_ingest_behind && result.num_levels < 3) {
+    result.num_levels = 3;
+  }
+
+  if (result.max_write_buffer_number < 2) {
+    result.max_write_buffer_number = 2;
+  }
+  // fall back max_write_buffer_number_to_maintain if
+  // max_write_buffer_size_to_maintain is not set
+  if (result.max_write_buffer_size_to_maintain < 0) {
+    result.max_write_buffer_size_to_maintain =
+        result.max_write_buffer_number *
+        static_cast<int64_t>(result.write_buffer_size);
+  } else if (result.max_write_buffer_size_to_maintain == 0 &&
+             result.max_write_buffer_number_to_maintain < 0) {
+    result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
+  }
+  // bloom filter size shouldn't exceed 1/4 of memtable size.
+  if (result.memtable_prefix_bloom_size_ratio > 0.25) {
+    result.memtable_prefix_bloom_size_ratio = 0.25;
+  } else if (result.memtable_prefix_bloom_size_ratio < 0) {
+    result.memtable_prefix_bloom_size_ratio = 0;
+  }
+
+  if (!result.prefix_extractor) {
+    assert(result.memtable_factory);
+    Slice name = result.memtable_factory->Name();
+    if (name.compare("HashSkipListRepFactory") == 0 ||
+        name.compare("HashLinkListRepFactory") == 0) {
+      result.memtable_factory = std::make_shared<SkipListFactory>();
+    }
+  }
+
+  if (result.compaction_style == kCompactionStyleFIFO) {
+    // since we delete level0 files in FIFO compaction when there are too many
+    // of them, these options don't really mean anything
+    result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+    result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+  }
+
+  if (result.max_bytes_for_level_multiplier <= 0) {
+    result.max_bytes_for_level_multiplier = 1;
+  }
+
+  if (result.level0_file_num_compaction_trigger == 0) {
+    ROCKS_LOG_WARN(db_options.logger,
+                   "level0_file_num_compaction_trigger cannot be 0");
+    result.level0_file_num_compaction_trigger = 1;
+  }
+
+  if (result.level0_stop_writes_trigger <
+          result.level0_slowdown_writes_trigger ||
+      result.level0_slowdown_writes_trigger <
+          result.level0_file_num_compaction_trigger) {
+    ROCKS_LOG_WARN(db_options.logger,
+                   "This condition must be satisfied: "
+                   "level0_stop_writes_trigger(%d) >= "
+                   "level0_slowdown_writes_trigger(%d) >= "
+                   "level0_file_num_compaction_trigger(%d)",
+                   result.level0_stop_writes_trigger,
+                   result.level0_slowdown_writes_trigger,
+                   result.level0_file_num_compaction_trigger);
+    if (result.level0_slowdown_writes_trigger <
+        result.level0_file_num_compaction_trigger) {
+      result.level0_slowdown_writes_trigger =
+          result.level0_file_num_compaction_trigger;
+    }
+    if (result.level0_stop_writes_trigger <
+        result.level0_slowdown_writes_trigger) {
+      result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
+    }
+    ROCKS_LOG_WARN(db_options.logger,
+                   "Adjust the value to "
+                   "level0_stop_writes_trigger(%d)"
+                   "level0_slowdown_writes_trigger(%d)"
+                   "level0_file_num_compaction_trigger(%d)",
+                   result.level0_stop_writes_trigger,
+                   result.level0_slowdown_writes_trigger,
+                   result.level0_file_num_compaction_trigger);
+  }
+
+  if (result.soft_pending_compaction_bytes_limit == 0) {
+    result.soft_pending_compaction_bytes_limit =
+        result.hard_pending_compaction_bytes_limit;
+  } else if (result.hard_pending_compaction_bytes_limit > 0 &&
+             result.soft_pending_compaction_bytes_limit >
+                 result.hard_pending_compaction_bytes_limit) {
+    result.soft_pending_compaction_bytes_limit =
+        result.hard_pending_compaction_bytes_limit;
+  }
+
+#ifndef ROCKSDB_LITE
+  // When the DB is stopped, it's possible that there are some .trash files that
+  // were not deleted yet, when we open the DB we will find these .trash files
+  // and schedule them to be deleted (or delete immediately if SstFileManager
+  // was not used)
+  auto sfm =
+      static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get());
+  for (size_t i = 0; i < result.cf_paths.size(); i++) {
+    DeleteScheduler::CleanupDirectory(db_options.env, sfm,
+                                      result.cf_paths[i].path)
+        .PermitUncheckedError();
+  }
+#endif
+
+  if (result.cf_paths.empty()) {
+    result.cf_paths = db_options.db_paths;
+  }
+
+  if (result.level_compaction_dynamic_level_bytes) {
+    if (result.compaction_style != kCompactionStyleLevel) {
+      ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "level_compaction_dynamic_level_bytes only makes sense"
+                     "for level-based compaction");
+      result.level_compaction_dynamic_level_bytes = false;
+    } else if (result.cf_paths.size() > 1U) {
+      // we don't yet know how to make both of this feature and multiple
+      // DB path work.
+      ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "multiple cf_paths/db_paths and"
+                     "level_compaction_dynamic_level_bytes"
+                     "can't be used together");
+      result.level_compaction_dynamic_level_bytes = false;
+    }
+  }
+
+  if (result.max_compaction_bytes == 0) {
+    result.max_compaction_bytes = result.target_file_size_base * 25;
+  }
+
+  bool is_block_based_table = (result.table_factory->IsInstanceOf(
+      TableFactory::kBlockBasedTableName()));
+
+  const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
+  if (result.ttl == kDefaultTtl) {
+    if (is_block_based_table &&
+        result.compaction_style != kCompactionStyleFIFO) {
+      result.ttl = kAdjustedTtl;
+    } else {
+      result.ttl = 0;
+    }
+  }
+
+  const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
+
+  // Turn on periodic compactions and set them to occur once every 30 days if
+  // compaction filters are used and periodic_compaction_seconds is set to the
+  // default value.
+  if (result.compaction_style != kCompactionStyleFIFO) {
+    if ((result.compaction_filter != nullptr ||
+         result.compaction_filter_factory != nullptr) &&
+        result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
+        is_block_based_table) {
+      result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+    }
+  } else {
+    // result.compaction_style == kCompactionStyleFIFO
+    if (result.ttl == 0) {
+      if (is_block_based_table) {
+        if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+          result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+        }
+        result.ttl = result.periodic_compaction_seconds;
+      }
+    } else if (result.periodic_compaction_seconds != 0) {
+      result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
+    }
+  }
+
+  // TTL compactions would work similar to Periodic Compactions in Universal in
+  // most of the cases. So, if ttl is set, execute the periodic compaction
+  // codepath.
+  if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) {
+    if (result.periodic_compaction_seconds != 0) {
+      result.periodic_compaction_seconds =
+          std::min(result.ttl, result.periodic_compaction_seconds);
+    } else {
+      result.periodic_compaction_seconds = result.ttl;
+    }
+  }
+
+  if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+    result.periodic_compaction_seconds = 0;
+  }
+
+  return result;
+}
+
+int SuperVersion::dummy = 0;
+void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
+void* const SuperVersion::kSVObsolete = nullptr;
+
+SuperVersion::~SuperVersion() {
+  for (auto td : to_delete) {
+    delete td;
+  }
+}
+
+SuperVersion* SuperVersion::Ref() {
+  refs.fetch_add(1, std::memory_order_relaxed);
+  return this;
+}
+
+bool SuperVersion::Unref() {
+  // fetch_sub returns the previous value of ref
+  uint32_t previous_refs = refs.fetch_sub(1);
+  assert(previous_refs > 0);
+  return previous_refs == 1;
+}
+
+void SuperVersion::Cleanup() {
+  assert(refs.load(std::memory_order_relaxed) == 0);
+  // Since this SuperVersion object is being deleted,
+  // decrement reference to the immutable MemtableList
+  // this SV object was pointing to.
+  imm->Unref(&to_delete);
+  MemTable* m = mem->Unref();
+  if (m != nullptr) {
+    auto* memory_usage = current->cfd()->imm()->current_memory_usage();
+    assert(*memory_usage >= m->ApproximateMemoryUsage());
+    *memory_usage -= m->ApproximateMemoryUsage();
+    to_delete.push_back(m);
+  }
+  current->Unref();
+  cfd->UnrefAndTryDelete();
+}
+
+void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+                        MemTableListVersion* new_imm, Version* new_current) {
+  cfd = new_cfd;
+  mem = new_mem;
+  imm = new_imm;
+  current = new_current;
+  cfd->Ref();
+  mem->Ref();
+  imm->Ref();
+  current->Ref();
+  refs.store(1, std::memory_order_relaxed);
+}
+
+namespace {
+void SuperVersionUnrefHandle(void* ptr) {
+  // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
+  // destroyed. When the former happens, the thread shouldn't see kSVInUse.
+  // When the latter happens, only super_version_ holds a reference
+  // to ColumnFamilyData, so no further queries are possible.
+  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+  bool was_last_ref __attribute__((__unused__));
+  was_last_ref = sv->Unref();
+  // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_.
+  // This is important because we can't do SuperVersion cleanup here.
+  // That would require locking DB mutex, which would deadlock because
+  // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex.
+  assert(!was_last_ref);
+}
+}  // anonymous namespace
+
+std::vector<std::string> ColumnFamilyData::GetDbPaths() const {
+  std::vector<std::string> paths;
+  paths.reserve(ioptions_.cf_paths.size());
+  for (const DbPath& db_path : ioptions_.cf_paths) {
+    paths.emplace_back(db_path.path);
+  }
+  return paths;
+}
+
+const uint32_t ColumnFamilyData::kDummyColumnFamilyDataId =
+    std::numeric_limits<uint32_t>::max();
+
+ColumnFamilyData::ColumnFamilyData(
+    uint32_t id, const std::string& name, Version* _dummy_versions,
+    Cache* _table_cache, WriteBufferManager* write_buffer_manager,
+    const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options,
+    const FileOptions* file_options, ColumnFamilySet* column_family_set,
+    BlockCacheTracer* const block_cache_tracer,
+    const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
+    const std::string& db_session_id)
+    : id_(id),
+      name_(name),
+      dummy_versions_(_dummy_versions),
+      current_(nullptr),
+      refs_(0),
+      initialized_(false),
+      dropped_(false),
+      internal_comparator_(cf_options.comparator),
+      initial_cf_options_(SanitizeOptions(db_options, cf_options)),
+      ioptions_(db_options, initial_cf_options_),
+      mutable_cf_options_(initial_cf_options_),
+      is_delete_range_supported_(
+          cf_options.table_factory->IsDeleteRangeSupported()),
+      write_buffer_manager_(write_buffer_manager),
+      mem_(nullptr),
+      imm_(ioptions_.min_write_buffer_number_to_merge,
+           ioptions_.max_write_buffer_number_to_maintain,
+           ioptions_.max_write_buffer_size_to_maintain),
+      super_version_(nullptr),
+      super_version_number_(0),
+      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+      next_(nullptr),
+      prev_(nullptr),
+      log_number_(0),
+      flush_reason_(FlushReason::kOthers),
+      column_family_set_(column_family_set),
+      queued_for_flush_(false),
+      queued_for_compaction_(false),
+      prev_compaction_needed_bytes_(0),
+      allow_2pc_(db_options.allow_2pc),
+      last_memtable_id_(0),
+      db_paths_registered_(false),
+      mempurge_used_(false) {
+  if (id_ != kDummyColumnFamilyDataId) {
+    // TODO(cc): RegisterDbPaths can be expensive, considering moving it
+    // outside of this constructor which might be called with db mutex held.
+    // TODO(cc): considering using ioptions_.fs, currently some tests rely on
+    // EnvWrapper, that's the main reason why we use env here.
+    Status s = ioptions_.env->RegisterDbPaths(GetDbPaths());
+    if (s.ok()) {
+      db_paths_registered_ = true;
+    } else {
+      ROCKS_LOG_ERROR(
+          ioptions_.logger,
+          "Failed to register data paths of column family (id: %d, name: %s)",
+          id_, name_.c_str());
+    }
+  }
+  Ref();
+
+  // Convert user defined table properties collector factories to internal ones.
+  GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_);
+
+  // if _dummy_versions is nullptr, then this is a dummy column family.
+  if (_dummy_versions != nullptr) {
+    internal_stats_.reset(
+        new InternalStats(ioptions_.num_levels, ioptions_.clock, this));
+    table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache,
+                                      block_cache_tracer, io_tracer,
+                                      db_session_id));
+    blob_file_cache_.reset(
+        new BlobFileCache(_table_cache, ioptions(), soptions(), id_,
+                          internal_stats_->GetBlobFileReadHist(), io_tracer));
+    blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id,
+                                      blob_file_cache_.get()));
+
+    if (ioptions_.compaction_style == kCompactionStyleLevel) {
+      compaction_picker_.reset(
+          new LevelCompactionPicker(ioptions_, &internal_comparator_));
+#ifndef ROCKSDB_LITE
+    } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+      compaction_picker_.reset(
+          new UniversalCompactionPicker(ioptions_, &internal_comparator_));
+    } else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
+      compaction_picker_.reset(
+          new FIFOCompactionPicker(ioptions_, &internal_comparator_));
+    } else if (ioptions_.compaction_style == kCompactionStyleNone) {
+      compaction_picker_.reset(
+          new NullCompactionPicker(ioptions_, &internal_comparator_));
+      ROCKS_LOG_WARN(ioptions_.logger,
+                     "Column family %s does not use any background compaction. "
+                     "Compactions can only be done via CompactFiles\n",
+                     GetName().c_str());
+#endif  // !ROCKSDB_LITE
+    } else {
+      ROCKS_LOG_ERROR(ioptions_.logger,
+                      "Unable to recognize the specified compaction style %d. "
+                      "Column family %s will use kCompactionStyleLevel.\n",
+                      ioptions_.compaction_style, GetName().c_str());
+      compaction_picker_.reset(
+          new LevelCompactionPicker(ioptions_, &internal_comparator_));
+    }
+
+    if (column_family_set_->NumberOfColumnFamilies() < 10) {
+      ROCKS_LOG_INFO(ioptions_.logger,
+                     "--------------- Options for column family [%s]:\n",
+                     name.c_str());
+      initial_cf_options_.Dump(ioptions_.logger);
+    } else {
+      ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n");
+    }
+  }
+
+  RecalculateWriteStallConditions(mutable_cf_options_);
+
+  if (cf_options.table_factory->IsInstanceOf(
+          TableFactory::kBlockBasedTableName()) &&
+      cf_options.table_factory->GetOptions<BlockBasedTableOptions>()) {
+    const BlockBasedTableOptions* bbto =
+        cf_options.table_factory->GetOptions<BlockBasedTableOptions>();
+    const auto& options_overrides = bbto->cache_usage_options.options_overrides;
+    const auto file_metadata_charged =
+        options_overrides.at(CacheEntryRole::kFileMetadata).charged;
+    if (bbto->block_cache &&
+        file_metadata_charged == CacheEntryRoleOptions::Decision::kEnabled) {
+      // TODO(hx235): Add a `ConcurrentCacheReservationManager` at DB scope
+      // responsible for reservation of `ObsoleteFileInfo` so that we can keep
+      // this `file_metadata_cache_res_mgr_` nonconcurrent
+      file_metadata_cache_res_mgr_.reset(new ConcurrentCacheReservationManager(
+          std::make_shared<
+              CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>>(
+              bbto->block_cache)));
+    }
+  }
+}
+
+// DB mutex held
+ColumnFamilyData::~ColumnFamilyData() {
+  assert(refs_.load(std::memory_order_relaxed) == 0);
+  // remove from linked list
+  auto prev = prev_;
+  auto next = next_;
+  prev->next_ = next;
+  next->prev_ = prev;
+
+  if (!dropped_ && column_family_set_ != nullptr) {
+    // If it's dropped, it's already removed from column family set
+    // If column_family_set_ == nullptr, this is dummy CFD and not in
+    // ColumnFamilySet
+    column_family_set_->RemoveColumnFamily(this);
+  }
+
+  if (current_ != nullptr) {
+    current_->Unref();
+  }
+
+  // It would be wrong if this ColumnFamilyData is in flush_queue_ or
+  // compaction_queue_ and we destroyed it
+  assert(!queued_for_flush_);
+  assert(!queued_for_compaction_);
+  assert(super_version_ == nullptr);
+
+  if (dummy_versions_ != nullptr) {
+    // List must be empty
+    assert(dummy_versions_->Next() == dummy_versions_);
+    bool deleted __attribute__((__unused__));
+    deleted = dummy_versions_->Unref();
+    assert(deleted);
+  }
+
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+  autovector<MemTable*> to_delete;
+  imm_.current()->Unref(&to_delete);
+  for (MemTable* m : to_delete) {
+    delete m;
+  }
+
+  if (db_paths_registered_) {
+    // TODO(cc): considering using ioptions_.fs, currently some tests rely on
+    // EnvWrapper, that's the main reason why we use env here.
+    Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths());
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(
+          ioptions_.logger,
+          "Failed to unregister data paths of column family (id: %d, name: %s)",
+          id_, name_.c_str());
+    }
+  }
+}
+
+bool ColumnFamilyData::UnrefAndTryDelete() {
+  int old_refs = refs_.fetch_sub(1);
+  assert(old_refs > 0);
+
+  if (old_refs == 1) {
+    assert(super_version_ == nullptr);
+    delete this;
+    return true;
+  }
+
+  if (old_refs == 2 && super_version_ != nullptr) {
+    // Only the super_version_ holds me
+    SuperVersion* sv = super_version_;
+    super_version_ = nullptr;
+
+    // Release SuperVersion references kept in ThreadLocalPtr.
+    local_sv_.reset();
+
+    if (sv->Unref()) {
+      // Note: sv will delete this ColumnFamilyData during Cleanup()
+      assert(sv->cfd == this);
+      sv->Cleanup();
+      delete sv;
+      return true;
+    }
+  }
+  return false;
+}
+
+void ColumnFamilyData::SetDropped() {
+  // can't drop default CF
+  assert(id_ != 0);
+  dropped_ = true;
+  write_controller_token_.reset();
+
+  // remove from column_family_set
+  column_family_set_->RemoveColumnFamily(this);
+}
+
+ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const {
+  return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+}
+
+uint64_t ColumnFamilyData::OldestLogToKeep() {
+  auto current_log = GetLogNumber();
+
+  if (allow_2pc_) {
+    auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection();
+    auto mem_prep_log = mem()->GetMinLogContainingPrepSection();
+
+    if (imm_prep_log > 0 && imm_prep_log < current_log) {
+      current_log = imm_prep_log;
+    }
+
+    if (mem_prep_log > 0 && mem_prep_log < current_log) {
+      current_log = mem_prep_log;
+    }
+  }
+
+  return current_log;
+}
+
+const double kIncSlowdownRatio = 0.8;
+const double kDecSlowdownRatio = 1 / kIncSlowdownRatio;
+const double kNearStopSlowdownRatio = 0.6;
+const double kDelayRecoverSlowdownRatio = 1.4;
+
+namespace {
+// If penalize_stop is true, we further reduce slowdown rate.
+std::unique_ptr<WriteControllerToken> SetupDelay(
+    WriteController* write_controller, uint64_t compaction_needed_bytes,
+    uint64_t prev_compaction_need_bytes, bool penalize_stop,
+    bool auto_compactions_disabled) {
+  const uint64_t kMinWriteRate = 16 * 1024u;  // Minimum write rate 16KB/s.
+
+  uint64_t max_write_rate = write_controller->max_delayed_write_rate();
+  uint64_t write_rate = write_controller->delayed_write_rate();
+
+  if (auto_compactions_disabled) {
+    // When auto compaction is disabled, always use the value user gave.
+    write_rate = max_write_rate;
+  } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) {
+    // If user gives rate less than kMinWriteRate, don't adjust it.
+    //
+    // If already delayed, need to adjust based on previous compaction debt.
+    // When there are two or more column families require delay, we always
+    // increase or reduce write rate based on information for one single
+    // column family. It is likely to be OK but we can improve if there is a
+    // problem.
+    // Ignore compaction_needed_bytes = 0 case because compaction_needed_bytes
+    // is only available in level-based compaction
+    //
+    // If the compaction debt stays the same as previously, we also further slow
+    // down. It usually means a mem table is full. It's mainly for the case
+    // where both of flush and compaction are much slower than the speed we
+    // insert to mem tables, so we need to actively slow down before we get
+    // feedback signal from compaction and flushes to avoid the full stop
+    // because of hitting the max write buffer number.
+    //
+    // If DB just falled into the stop condition, we need to further reduce
+    // the write rate to avoid the stop condition.
+    if (penalize_stop) {
+      // Penalize the near stop or stop condition by more aggressive slowdown.
+      // This is to provide the long term slowdown increase signal.
+      // The penalty is more than the reward of recovering to the normal
+      // condition.
+      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+                                         kNearStopSlowdownRatio);
+      if (write_rate < kMinWriteRate) {
+        write_rate = kMinWriteRate;
+      }
+    } else if (prev_compaction_need_bytes > 0 &&
+               prev_compaction_need_bytes <= compaction_needed_bytes) {
+      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+                                         kIncSlowdownRatio);
+      if (write_rate < kMinWriteRate) {
+        write_rate = kMinWriteRate;
+      }
+    } else if (prev_compaction_need_bytes > compaction_needed_bytes) {
+      // We are speeding up by ratio of kSlowdownRatio when we have paid
+      // compaction debt. But we'll never speed up to faster than the write rate
+      // given by users.
+      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+                                         kDecSlowdownRatio);
+      if (write_rate > max_write_rate) {
+        write_rate = max_write_rate;
+      }
+    }
+  }
+  return write_controller->GetDelayToken(write_rate);
+}
+
+int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
+                                    int level0_slowdown_writes_trigger) {
+  // SanitizeOptions() ensures it.
+  assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);
+
+  if (level0_file_num_compaction_trigger < 0) {
+    return std::numeric_limits<int>::max();
+  }
+
+  const int64_t twice_level0_trigger =
+      static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;
+
+  const int64_t one_fourth_trigger_slowdown =
+      static_cast<int64_t>(level0_file_num_compaction_trigger) +
+      ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
+       4);
+
+  assert(twice_level0_trigger >= 0);
+  assert(one_fourth_trigger_slowdown >= 0);
+
+  // 1/4 of the way between L0 compaction trigger threshold and slowdown
+  // condition.
+  // Or twice as compaction trigger, if it is smaller.
+  int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
+  if (res >= std::numeric_limits<int32_t>::max()) {
+    return std::numeric_limits<int32_t>::max();
+  } else {
+    // res fits in int
+    return static_cast<int>(res);
+  }
+}
+}  // anonymous namespace
+
+std::pair<WriteStallCondition, ColumnFamilyData::WriteStallCause>
+ColumnFamilyData::GetWriteStallConditionAndCause(
+    int num_unflushed_memtables, int num_l0_files,
+    uint64_t num_compaction_needed_bytes,
+    const MutableCFOptions& mutable_cf_options,
+    const ImmutableCFOptions& immutable_cf_options) {
+  if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) {
+    return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) {
+    return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+             num_compaction_needed_bytes >=
+                 mutable_cf_options.hard_pending_compaction_bytes_limit) {
+    return {WriteStallCondition::kStopped,
+            WriteStallCause::kPendingCompactionBytes};
+  } else if (mutable_cf_options.max_write_buffer_number > 3 &&
+             num_unflushed_memtables >=
+                 mutable_cf_options.max_write_buffer_number - 1 &&
+             num_unflushed_memtables - 1 >=
+                 immutable_cf_options.min_write_buffer_number_to_merge) {
+    return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
+             num_l0_files >=
+                 mutable_cf_options.level0_slowdown_writes_trigger) {
+    return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit};
+  } else if (!mutable_cf_options.disable_auto_compactions &&
+             mutable_cf_options.soft_pending_compaction_bytes_limit > 0 &&
+             num_compaction_needed_bytes >=
+                 mutable_cf_options.soft_pending_compaction_bytes_limit) {
+    return {WriteStallCondition::kDelayed,
+            WriteStallCause::kPendingCompactionBytes};
+  }
+  return {WriteStallCondition::kNormal, WriteStallCause::kNone};
+}
+
+WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
+    const MutableCFOptions& mutable_cf_options) {
+  auto write_stall_condition = WriteStallCondition::kNormal;
+  if (current_ != nullptr) {
+    auto* vstorage = current_->storage_info();
+    auto write_controller = column_family_set_->write_controller_;
+    uint64_t compaction_needed_bytes =
+        vstorage->estimated_compaction_needed_bytes();
+
+    auto write_stall_condition_and_cause = GetWriteStallConditionAndCause(
+        imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(),
+        vstorage->estimated_compaction_needed_bytes(), mutable_cf_options,
+        *ioptions());
+    write_stall_condition = write_stall_condition_and_cause.first;
+    auto write_stall_cause = write_stall_condition_and_cause.second;
+
+    bool was_stopped = write_controller->IsStopped();
+    bool needed_delay = write_controller->NeedsDelay();
+
+    if (write_stall_condition == WriteStallCondition::kStopped &&
+        write_stall_cause == WriteStallCause::kMemtableLimit) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stopping writes because we have %d immutable memtables "
+          "(waiting for flush), max_write_buffer_number is set to %d",
+          name_.c_str(), imm()->NumNotFlushed(),
+          mutable_cf_options.max_write_buffer_number);
+    } else if (write_stall_condition == WriteStallCondition::kStopped &&
+               write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
+      if (compaction_picker_->IsLevel0CompactionInProgress()) {
+        internal_stats_->AddCFStats(
+            InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1);
+      }
+      ROCKS_LOG_WARN(ioptions_.logger,
+                     "[%s] Stopping writes because we have %d level-0 files",
+                     name_.c_str(), vstorage->l0_delay_trigger_count());
+    } else if (write_stall_condition == WriteStallCondition::kStopped &&
+               write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(
+          InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stopping writes because of estimated pending compaction "
+          "bytes %" PRIu64,
+          name_.c_str(), compaction_needed_bytes);
+    } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+               write_stall_cause == WriteStallCause::kMemtableLimit) {
+      write_controller_token_ =
+          SetupDelay(write_controller, compaction_needed_bytes,
+                     prev_compaction_needed_bytes_, was_stopped,
+                     mutable_cf_options.disable_auto_compactions);
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stalling writes because we have %d immutable memtables "
+          "(waiting for flush), max_write_buffer_number is set to %d "
+          "rate %" PRIu64,
+          name_.c_str(), imm()->NumNotFlushed(),
+          mutable_cf_options.max_write_buffer_number,
+          write_controller->delayed_write_rate());
+    } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+               write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+      // L0 is the last two files from stopping.
+      bool near_stop = vstorage->l0_delay_trigger_count() >=
+                       mutable_cf_options.level0_stop_writes_trigger - 2;
+      write_controller_token_ =
+          SetupDelay(write_controller, compaction_needed_bytes,
+                     prev_compaction_needed_bytes_, was_stopped || near_stop,
+                     mutable_cf_options.disable_auto_compactions);
+      internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+                                  1);
+      if (compaction_picker_->IsLevel0CompactionInProgress()) {
+        internal_stats_->AddCFStats(
+            InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
+      }
+      ROCKS_LOG_WARN(ioptions_.logger,
+                     "[%s] Stalling writes because we have %d level-0 files "
+                     "rate %" PRIu64,
+                     name_.c_str(), vstorage->l0_delay_trigger_count(),
+                     write_controller->delayed_write_rate());
+    } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+               write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+      // If the distance to hard limit is less than 1/4 of the gap between soft
+      // and
+      // hard bytes limit, we think it is near stop and speed up the slowdown.
+      bool near_stop =
+          mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+          (compaction_needed_bytes -
+           mutable_cf_options.soft_pending_compaction_bytes_limit) >
+              3 *
+                  (mutable_cf_options.hard_pending_compaction_bytes_limit -
+                   mutable_cf_options.soft_pending_compaction_bytes_limit) /
+                  4;
+
+      write_controller_token_ =
+          SetupDelay(write_controller, compaction_needed_bytes,
+                     prev_compaction_needed_bytes_, was_stopped || near_stop,
+                     mutable_cf_options.disable_auto_compactions);
+      internal_stats_->AddCFStats(
+          InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1);
+      ROCKS_LOG_WARN(
+          ioptions_.logger,
+          "[%s] Stalling writes because of estimated pending compaction "
+          "bytes %" PRIu64 " rate %" PRIu64,
+          name_.c_str(), vstorage->estimated_compaction_needed_bytes(),
+          write_controller->delayed_write_rate());
+    } else {
+      assert(write_stall_condition == WriteStallCondition::kNormal);
+      if (vstorage->l0_delay_trigger_count() >=
+          GetL0ThresholdSpeedupCompaction(
+              mutable_cf_options.level0_file_num_compaction_trigger,
+              mutable_cf_options.level0_slowdown_writes_trigger)) {
+        write_controller_token_ =
+            write_controller->GetCompactionPressureToken();
+        ROCKS_LOG_INFO(
+            ioptions_.logger,
+            "[%s] Increasing compaction threads because we have %d level-0 "
+            "files ",
+            name_.c_str(), vstorage->l0_delay_trigger_count());
+      } else if (vstorage->estimated_compaction_needed_bytes() >=
+                 mutable_cf_options.soft_pending_compaction_bytes_limit / 4) {
+        // Increase compaction threads if bytes needed for compaction exceeds
+        // 1/4 of threshold for slowing down.
+        // If soft pending compaction byte limit is not set, always speed up
+        // compaction.
+        write_controller_token_ =
+            write_controller->GetCompactionPressureToken();
+        if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) {
+          ROCKS_LOG_INFO(
+              ioptions_.logger,
+              "[%s] Increasing compaction threads because of estimated pending "
+              "compaction "
+              "bytes %" PRIu64,
+              name_.c_str(), vstorage->estimated_compaction_needed_bytes());
+        }
+      } else {
+        write_controller_token_.reset();
+      }
+      // If the DB recovers from delay conditions, we reward with reducing
+      // double the slowdown ratio. This is to balance the long term slowdown
+      // increase signal.
+      if (needed_delay) {
+        uint64_t write_rate = write_controller->delayed_write_rate();
+        write_controller->set_delayed_write_rate(static_cast<uint64_t>(
+            static_cast<double>(write_rate) * kDelayRecoverSlowdownRatio));
+        // Set the low pri limit to be 1/4 the delayed write rate.
+        // Note we don't reset this value even after delay condition is relased.
+        // Low-pri rate will continue to apply if there is a compaction
+        // pressure.
+        write_controller->low_pri_rate_limiter()->SetBytesPerSecond(write_rate /
+                                                                    4);
+      }
+    }
+    prev_compaction_needed_bytes_ = compaction_needed_bytes;
+  }
+  return write_stall_condition;
+}
+
+const FileOptions* ColumnFamilyData::soptions() const {
+  return &(column_family_set_->file_options_);
+}
+
+void ColumnFamilyData::SetCurrent(Version* current_version) {
+  current_ = current_version;
+}
+
+uint64_t ColumnFamilyData::GetNumLiveVersions() const {
+  return VersionSet::GetNumLiveVersions(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalSstFilesSize() const {
+  return VersionSet::GetTotalSstFilesSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalBlobFileSize() const {
+  return VersionSet::GetTotalBlobFileSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
+  return current_->GetSstFilesSize();
+}
+
+MemTable* ColumnFamilyData::ConstructNewMemtable(
+    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+  return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+                      write_buffer_manager_, earliest_seq, id_);
+}
+
+void ColumnFamilyData::CreateNewMemtable(
+    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+  if (mem_ != nullptr) {
+    delete mem_->Unref();
+  }
+  SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
+  mem_->Ref();
+}
+
+bool ColumnFamilyData::NeedsCompaction() const {
+  return !mutable_cf_options_.disable_auto_compactions &&
+         compaction_picker_->NeedsCompaction(current_->storage_info());
+}
+
+Compaction* ColumnFamilyData::PickCompaction(
+    const MutableCFOptions& mutable_options,
+    const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) {
+  SequenceNumber earliest_mem_seqno =
+      std::min(mem_->GetEarliestSequenceNumber(),
+               imm_.current()->GetEarliestSequenceNumber(false));
+  auto* result = compaction_picker_->PickCompaction(
+      GetName(), mutable_options, mutable_db_options, current_->storage_info(),
+      log_buffer, earliest_mem_seqno);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
+  return result;
+}
+
+bool ColumnFamilyData::RangeOverlapWithCompaction(
+    const Slice& smallest_user_key, const Slice& largest_user_key,
+    int level) const {
+  return compaction_picker_->RangeOverlapWithCompaction(
+      smallest_user_key, largest_user_key, level);
+}
+
+Status ColumnFamilyData::RangesOverlapWithMemtables(
+    const autovector<Range>& ranges, SuperVersion* super_version,
+    bool allow_data_in_errors, bool* overlap) {
+  assert(overlap != nullptr);
+  *overlap = false;
+  // Create an InternalIterator over all unflushed memtables
+  Arena arena;
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
+  merge_iter_builder.AddIterator(
+      super_version->mem->NewIterator(read_opts, &arena));
+  super_version->imm->AddIterators(read_opts, &merge_iter_builder,
+                                   false /* add_range_tombstone_iter */);
+  ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
+
+  auto read_seq = super_version->current->version_set()->LastSequence();
+  ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq);
+  auto* active_range_del_iter = super_version->mem->NewRangeTombstoneIterator(
+      read_opts, read_seq, false /* immutable_memtable */);
+  range_del_agg.AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator>(active_range_del_iter));
+  Status status;
+  status = super_version->imm->AddRangeTombstoneIterators(
+      read_opts, nullptr /* arena */, &range_del_agg);
+  // AddRangeTombstoneIterators always return Status::OK.
+  assert(status.ok());
+
+  for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) {
+    auto* vstorage = super_version->current->storage_info();
+    auto* ucmp = vstorage->InternalComparator()->user_comparator();
+    InternalKey range_start(ranges[i].start, kMaxSequenceNumber,
+                            kValueTypeForSeek);
+    memtable_iter->Seek(range_start.Encode());
+    status = memtable_iter->status();
+    ParsedInternalKey seek_result;
+
+    if (status.ok() && memtable_iter->Valid()) {
+      status = ParseInternalKey(memtable_iter->key(), &seek_result,
+                                allow_data_in_errors);
+    }
+
+    if (status.ok()) {
+      if (memtable_iter->Valid() &&
+          ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) {
+        *overlap = true;
+      } else if (range_del_agg.IsRangeOverlapped(ranges[i].start,
+                                                 ranges[i].limit)) {
+        *overlap = true;
+      }
+    }
+  }
+  return status;
+}
+
+const int ColumnFamilyData::kCompactAllLevels = -1;
+const int ColumnFamilyData::kCompactToBaseLevel = -2;
+
+Compaction* ColumnFamilyData::CompactRange(
+    const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, int input_level,
+    int output_level, const CompactRangeOptions& compact_range_options,
+    const InternalKey* begin, const InternalKey* end,
+    InternalKey** compaction_end, bool* conflict,
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+  auto* result = compaction_picker_->CompactRange(
+      GetName(), mutable_cf_options, mutable_db_options,
+      current_->storage_info(), input_level, output_level,
+      compact_range_options, begin, end, compaction_end, conflict,
+      max_file_num_to_ignore, trim_ts);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
+  return result;
+}
+
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) {
+  SuperVersion* sv = GetThreadLocalSuperVersion(db);
+  sv->Ref();
+  if (!ReturnThreadLocalSuperVersion(sv)) {
+    // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion()
+    // when the thread-local pointer was populated. So, the Ref() earlier in
+    // this function still prevents the returned SuperVersion* from being
+    // deleted out from under the caller.
+    sv->Unref();
+  }
+  return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
+  // The SuperVersion is cached in thread local storage to avoid acquiring
+  // mutex when SuperVersion does not change since the last use. When a new
+  // SuperVersion is installed, the compaction or flush thread cleans up
+  // cached SuperVersion in all existing thread local storage. To avoid
+  // acquiring mutex for this operation, we use atomic Swap() on the thread
+  // local pointer to guarantee exclusive access. If the thread local pointer
+  // is being used while a new SuperVersion is installed, the cached
+  // SuperVersion can become stale. In that case, the background thread would
+  // have swapped in kSVObsolete. We re-check the value at when returning
+  // SuperVersion back to thread local, with an atomic compare and swap.
+  // The superversion will need to be released if detected to be stale.
+  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+  // Invariant:
+  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
+  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
+  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
+  // (if no Scrape happens).
+  assert(ptr != SuperVersion::kSVInUse);
+  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+  if (sv == SuperVersion::kSVObsolete ||
+      sv->version_number != super_version_number_.load()) {
+    RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES);
+    SuperVersion* sv_to_delete = nullptr;
+
+    if (sv && sv->Unref()) {
+      RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS);
+      db->mutex()->Lock();
+      // NOTE: underlying resources held by superversion (sst files) might
+      // not be released until the next background job.
+      sv->Cleanup();
+      if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
+        db->AddSuperVersionsToFreeQueue(sv);
+        db->SchedulePurge();
+      } else {
+        sv_to_delete = sv;
+      }
+    } else {
+      db->mutex()->Lock();
+    }
+    sv = super_version_->Ref();
+    db->mutex()->Unlock();
+
+    delete sv_to_delete;
+  }
+  assert(sv != nullptr);
+  return sv;
+}
+
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+  assert(sv != nullptr);
+  // Put the SuperVersion back
+  void* expected = SuperVersion::kSVInUse;
+  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+    // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+    // storage has not been altered and no Scrape has happened. The
+    // SuperVersion is still current.
+    return true;
+  } else {
+    // ThreadLocal scrape happened in the process of this GetImpl call (after
+    // thread local Swap() at the beginning and before CompareAndSwap()).
+    // This means the SuperVersion it holds is obsolete.
+    assert(expected == SuperVersion::kSVObsolete);
+  }
+  return false;
+}
+
+void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context,
+                                           InstrumentedMutex* db_mutex) {
+  db_mutex->AssertHeld();
+  return InstallSuperVersion(sv_context, mutable_cf_options_);
+}
+
+void ColumnFamilyData::InstallSuperVersion(
+    SuperVersionContext* sv_context,
+    const MutableCFOptions& mutable_cf_options) {
+  SuperVersion* new_superversion = sv_context->new_superversion.release();
+  new_superversion->mutable_cf_options = mutable_cf_options;
+  new_superversion->Init(this, mem_, imm_.current(), current_);
+  SuperVersion* old_superversion = super_version_;
+  super_version_ = new_superversion;
+  ++super_version_number_;
+  super_version_->version_number = super_version_number_;
+  if (old_superversion == nullptr || old_superversion->current != current() ||
+      old_superversion->mem != mem_ ||
+      old_superversion->imm != imm_.current()) {
+    // Should not recalculate slow down condition if nothing has changed, since
+    // currently RecalculateWriteStallConditions() treats it as further slowing
+    // down is needed.
+    super_version_->write_stall_condition =
+        RecalculateWriteStallConditions(mutable_cf_options);
+  } else {
+    super_version_->write_stall_condition =
+        old_superversion->write_stall_condition;
+  }
+  if (old_superversion != nullptr) {
+    // Reset SuperVersions cached in thread local storage.
+    // This should be done before old_superversion->Unref(). That's to ensure
+    // that local_sv_ never holds the last reference to SuperVersion, since
+    // it has no means to safely do SuperVersion cleanup.
+    ResetThreadLocalSuperVersions();
+
+    if (old_superversion->mutable_cf_options.write_buffer_size !=
+        mutable_cf_options.write_buffer_size) {
+      mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+    }
+    if (old_superversion->write_stall_condition !=
+        new_superversion->write_stall_condition) {
+      sv_context->PushWriteStallNotification(
+          old_superversion->write_stall_condition,
+          new_superversion->write_stall_condition, GetName(), ioptions());
+    }
+    if (old_superversion->Unref()) {
+      old_superversion->Cleanup();
+      sv_context->superversions_to_free.push_back(old_superversion);
+    }
+  }
+}
+
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+  autovector<void*> sv_ptrs;
+  local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+  for (auto ptr : sv_ptrs) {
+    assert(ptr);
+    if (ptr == SuperVersion::kSVInUse) {
+      continue;
+    }
+    auto sv = static_cast<SuperVersion*>(ptr);
+    bool was_last_ref __attribute__((__unused__));
+    was_last_ref = sv->Unref();
+    // sv couldn't have been the last reference because
+    // ResetThreadLocalSuperVersions() is called before
+    // unref'ing super_version_.
+    assert(!was_last_ref);
+  }
+}
+
+Status ColumnFamilyData::ValidateOptions(
+    const DBOptions& db_options, const ColumnFamilyOptions& cf_options) {
+  Status s;
+  s = CheckCompressionSupported(cf_options);
+  if (s.ok() && db_options.allow_concurrent_memtable_write) {
+    s = CheckConcurrentWritesSupported(cf_options);
+  }
+  if (s.ok() && db_options.unordered_write &&
+      cf_options.max_successive_merges != 0) {
+    s = Status::InvalidArgument(
+        "max_successive_merges > 0 is incompatible with unordered_write");
+  }
+  if (s.ok()) {
+    s = CheckCFPathsSupported(db_options, cf_options);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) {
+    if (!cf_options.table_factory->IsInstanceOf(
+            TableFactory::kBlockBasedTableName())) {
+      return Status::NotSupported(
+          "TTL is only supported in Block-Based Table format. ");
+    }
+  }
+
+  if (cf_options.periodic_compaction_seconds > 0 &&
+      cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
+    if (!cf_options.table_factory->IsInstanceOf(
+            TableFactory::kBlockBasedTableName())) {
+      return Status::NotSupported(
+          "Periodic Compaction is only supported in "
+          "Block-Based Table format. ");
+    }
+  }
+
+  if (cf_options.enable_blob_garbage_collection) {
+    if (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
+        cf_options.blob_garbage_collection_age_cutoff > 1.0) {
+      return Status::InvalidArgument(
+          "The age cutoff for blob garbage collection should be in the range "
+          "[0.0, 1.0].");
+    }
+    if (cf_options.blob_garbage_collection_force_threshold < 0.0 ||
+        cf_options.blob_garbage_collection_force_threshold > 1.0) {
+      return Status::InvalidArgument(
+          "The garbage ratio threshold for forcing blob garbage collection "
+          "should be in the range [0.0, 1.0].");
+    }
+  }
+
+  if (cf_options.compaction_style == kCompactionStyleFIFO &&
+      db_options.max_open_files != -1 && cf_options.ttl > 0) {
+    return Status::NotSupported(
+        "FIFO compaction only supported with max_open_files = -1.");
+  }
+
+  std::vector<uint32_t> supported{0, 1, 2, 4, 8};
+  if (std::find(supported.begin(), supported.end(),
+                cf_options.memtable_protection_bytes_per_key) ==
+      supported.end()) {
+    return Status::NotSupported(
+        "Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
+        "or 8 bytes per key.");
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status ColumnFamilyData::SetOptions(
+    const DBOptions& db_opts,
+    const std::unordered_map<std::string, std::string>& options_map) {
+  ColumnFamilyOptions cf_opts =
+      BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+  ConfigOptions config_opts;
+  config_opts.mutable_options_only = true;
+  Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map,
+                                           &cf_opts);
+  if (s.ok()) {
+    s = ValidateOptions(db_opts, cf_opts);
+  }
+  if (s.ok()) {
+    mutable_cf_options_ = MutableCFOptions(cf_opts);
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+  }
+  return s;
+}
+#endif  // ROCKSDB_LITE
+
+// REQUIRES: DB mutex held
+Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
+  if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
+    return Env::WLTH_NOT_SET;
+  }
+  if (level == 0) {
+    return Env::WLTH_MEDIUM;
+  }
+  int base_level = current_->storage_info()->base_level();
+
+  // L1: medium, L2: long, ...
+  if (level - base_level >= 2) {
+    return Env::WLTH_EXTREME;
+  } else if (level < base_level) {
+    // There is no restriction which prevents level passed in to be smaller
+    // than base_level.
+    return Env::WLTH_MEDIUM;
+  }
+  return static_cast<Env::WriteLifeTimeHint>(
+      level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
+}
+
+Status ColumnFamilyData::AddDirectories(
+    std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) {
+  Status s;
+  assert(created_dirs != nullptr);
+  assert(data_dirs_.empty());
+  for (auto& p : ioptions_.cf_paths) {
+    auto existing_dir = created_dirs->find(p.path);
+
+    if (existing_dir == created_dirs->end()) {
+      std::unique_ptr<FSDirectory> path_directory;
+      s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path,
+                                        &path_directory);
+      if (!s.ok()) {
+        return s;
+      }
+      assert(path_directory != nullptr);
+      data_dirs_.emplace_back(path_directory.release());
+      (*created_dirs)[p.path] = data_dirs_.back();
+    } else {
+      data_dirs_.emplace_back(existing_dir->second);
+    }
+  }
+  assert(data_dirs_.size() == ioptions_.cf_paths.size());
+  return s;
+}
+
+FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
+  if (data_dirs_.empty()) {
+    return nullptr;
+  }
+
+  assert(path_id < data_dirs_.size());
+  return data_dirs_[path_id].get();
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+                                 const ImmutableDBOptions* db_options,
+                                 const FileOptions& file_options,
+                                 Cache* table_cache,
+                                 WriteBufferManager* _write_buffer_manager,
+                                 WriteController* _write_controller,
+                                 BlockCacheTracer* const block_cache_tracer,
+                                 const std::shared_ptr<IOTracer>& io_tracer,
+                                 const std::string& db_id,
+                                 const std::string& db_session_id)
+    : max_column_family_(0),
+      file_options_(file_options),
+      dummy_cfd_(new ColumnFamilyData(
+          ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr,
+          nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr,
+          block_cache_tracer, io_tracer, db_id, db_session_id)),
+      default_cfd_cache_(nullptr),
+      db_name_(dbname),
+      db_options_(db_options),
+      table_cache_(table_cache),
+      write_buffer_manager_(_write_buffer_manager),
+      write_controller_(_write_controller),
+      block_cache_tracer_(block_cache_tracer),
+      io_tracer_(io_tracer),
+      db_id_(db_id),
+      db_session_id_(db_session_id) {
+  // initialize linked list
+  dummy_cfd_->prev_ = dummy_cfd_;
+  dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+  while (column_family_data_.size() > 0) {
+    // cfd destructor will delete itself from column_family_data_
+    auto cfd = column_family_data_.begin()->second;
+    bool last_ref __attribute__((__unused__));
+    last_ref = cfd->UnrefAndTryDelete();
+    assert(last_ref);
+  }
+  bool dummy_last_ref __attribute__((__unused__));
+  dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
+  assert(dummy_last_ref);
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+  assert(default_cfd_cache_ != nullptr);
+  return default_cfd_cache_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+  auto cfd_iter = column_family_data_.find(id);
+  if (cfd_iter != column_family_data_.end()) {
+    return cfd_iter->second;
+  } else {
+    return nullptr;
+  }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(
+    const std::string& name) const {
+  auto cfd_iter = column_families_.find(name);
+  if (cfd_iter != column_families_.end()) {
+    auto cfd = GetColumnFamily(cfd_iter->second);
+    assert(cfd != nullptr);
+    return cfd;
+  } else {
+    return nullptr;
+  }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+  return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+  max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+  return column_families_.size();
+}
+
+// under a DB mutex AND write thread
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+    const std::string& name, uint32_t id, Version* dummy_versions,
+    const ColumnFamilyOptions& options) {
+  assert(column_families_.find(name) == column_families_.end());
+  ColumnFamilyData* new_cfd = new ColumnFamilyData(
+      id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
+      *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_,
+      db_id_, db_session_id_);
+  column_families_.insert({name, id});
+  column_family_data_.insert({id, new_cfd});
+  max_column_family_ = std::max(max_column_family_, id);
+  // add to linked list
+  new_cfd->next_ = dummy_cfd_;
+  auto prev = dummy_cfd_->prev_;
+  new_cfd->prev_ = prev;
+  prev->next_ = new_cfd;
+  dummy_cfd_->prev_ = new_cfd;
+  if (id == 0) {
+    default_cfd_cache_ = new_cfd;
+  }
+  return new_cfd;
+}
+
+// under a DB mutex AND from a write thread
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+  auto cfd_iter = column_family_data_.find(cfd->GetID());
+  assert(cfd_iter != column_family_data_.end());
+  column_family_data_.erase(cfd_iter);
+  column_families_.erase(cfd->GetName());
+}
+
+// under a DB mutex OR from a write thread
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+  if (column_family_id == 0) {
+    // optimization for common case
+    current_ = column_family_set_->GetDefault();
+  } else {
+    current_ = column_family_set_->GetColumnFamily(column_family_id);
+  }
+  handle_.SetCFD(current_);
+  return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+  assert(current_ != nullptr);
+  return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+  assert(current_ != nullptr);
+  return current_->mem();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+  assert(current_ != nullptr);
+  return &handle_;
+}
+
+uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+  uint32_t column_family_id = 0;
+  if (column_family != nullptr) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    column_family_id = cfh->GetID();
+  }
+  return column_family_id;
+}
+
+const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family) {
+  if (column_family != nullptr) {
+    return column_family->GetComparator();
+  }
+  return nullptr;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family.h b/src/rocksdb/db/column_family.h
new file mode 100644
index 000000000..3e6d01d22
--- /dev/null
+++ b/src/rocksdb/db/column_family.h
@@ -0,0 +1,845 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "cache/cache_reservation_manager.h"
+#include "db/memtable_list.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/write_batch_internal.h"
+#include "db/write_controller.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/hash_containers.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class VersionSet;
+class VersionStorageInfo;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;
+struct SuperVersionContext;
+class BlobFileCache;
+class BlobSource;
+
+extern const double kIncSlowdownRatio;
+// This file contains a list of data structures for managing column family
+// level metadata.
+//
+// The basic relationships among classes declared here are illustrated as
+// following:
+//
+//       +----------------------+    +----------------------+   +--------+
+//   +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 |   | DBImpl |
+//   |   +----------------------+ |  +----------------------+   +----+---+
+//   | +--------------------------+                                  |
+//   | |                               +-----------------------------+
+//   | |                               |
+//   | | +-----------------------------v-------------------------------+
+//   | | |                                                             |
+//   | | |                      ColumnFamilySet                        |
+//   | | |                                                             |
+//   | | +-------------+--------------------------+----------------+---+
+//   | |               |                          |                |
+//   | +-------------------------------------+    |                |
+//   |                 |                     |    |                v
+//   |   +-------------v-------------+ +-----v----v---------+
+//   |   |                           | |                    |
+//   |   |     ColumnFamilyData 1    | | ColumnFamilyData 2 |    ......
+//   |   |                           | |                    |
+//   +--->                           | |                    |
+//       |                 +---------+ |                    |
+//       |                 | MemTable| |                    |
+//       |                 |  List   | |                    |
+//       +--------+---+--+-+----+----+ +--------------------++
+//                |   |  |      |
+//                |   |  |      |
+//                |   |  |      +-----------------------+
+//                |   |  +-----------+                  |
+//                v   +--------+     |                  |
+//       +--------+--------+   |     |                  |
+//       |                 |   |     |       +----------v----------+
+// +---> |SuperVersion 1.a +----------------->                     |
+//       |                 +------+  |       | MemTableListVersion |
+//       +---+-------------+   |  |  |       |                     |
+//           |                 |  |  |       +----+------------+---+
+//           |      current    |  |  |            |            |
+//           |   +-------------+  |  |mem         |            |
+//           |   |                |  |            |            |
+//         +-v---v-------+    +---v--v---+  +-----v----+  +----v-----+
+//         |             |    |          |  |          |  |          |
+//         | Version 1.a |    | memtable |  | memtable |  | memtable |
+//         |             |    |   1.a    |  |   1.b    |  |   1.c    |
+//         +-------------+    |          |  |          |  |          |
+//                            +----------+  +----------+  +----------+
+//
+// DBImpl keeps a ColumnFamilySet, which references to all column families by
+// pointing to respective ColumnFamilyData object of each column family.
+// This is how DBImpl can list and operate on all the column families.
+// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
+// when a user executes a query, it can directly find memtables and Version
+// as well as SuperVersion to the column family, without going through
+// ColumnFamilySet.
+//
+// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
+// and SST files) indirectly, while ongoing operations may hold references
+// to a current or an out-of-date SuperVersion, which in turn points to a
+// point-in-time view of the LSM-tree. This guarantees the memtables and SST
+// files being operated on will not go away, until the SuperVersion is
+// unreferenced to 0 and destoryed.
+//
+// The following graph illustrates a possible referencing relationships:
+//
+// Column       +--------------+      current       +-----------+
+// Family +---->+              +------------------->+           |
+//  Data        | SuperVersion +----------+         | Version A |
+//              |      3       |   imm    |         |           |
+// Iter2 +----->+              |  +-------v------+  +-----------+
+//              +-----+--------+  | MemtableList +----------------> Empty
+//                    |           |   Version r  |  +-----------+
+//                    |           +--------------+  |           |
+//                    +------------------+   current| Version B |
+//              +--------------+         |   +----->+           |
+//              |              |         |   |      +-----+-----+
+// Compaction +>+ SuperVersion +-------------+            ^
+//    Job       |      2       +------+  |                |current
+//              |              +----+ |  |     mem        |    +------------+
+//              +--------------+    | |  +--------------------->            |
+//                                  | +------------------------> MemTable a |
+//                                  |          mem        |    |            |
+//              +--------------+    |                     |    +------------+
+//              |              +--------------------------+
+//  Iter1 +-----> SuperVersion |    |                          +------------+
+//              |      1       +------------------------------>+            |
+//              |              +-+  |        mem               | MemTable b |
+//              +--------------+ |  |                          |            |
+//                               |  |    +--------------+      +-----^------+
+//                               |  |imm | MemtableList |            |
+//                               |  +--->+   Version s  +------------+
+//                               |       +--------------+
+//                               |       +--------------+
+//                               |       | MemtableList |
+//                               +------>+   Version t  +-------->  Empty
+//                                 imm   +--------------+
+//
+// In this example, even if the current LSM-tree consists of Version A and
+// memtable a, which is also referenced by SuperVersion, two older SuperVersion
+// SuperVersion2 and Superversion1 still exist, and are referenced by a
+// compaction job and an old iterator Iter1, respectively. SuperVersion2
+// contains Version B, memtable a and memtable b; SuperVersion1 contains
+// Version B and memtable b (mutable). As a result, Version B and memtable b
+// are prevented from being destroyed or deleted.
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has non-trivial destructor, which gets called when client
+// is done using the column family
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+  // create while holding the mutex
+  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db,
+                         InstrumentedMutex* mutex);
+  // destroy without mutex
+  virtual ~ColumnFamilyHandleImpl();
+  virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+  virtual uint32_t GetID() const override;
+  virtual const std::string& GetName() const override;
+  virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
+  virtual const Comparator* GetComparator() const override;
+
+ private:
+  ColumnFamilyData* cfd_;
+  DBImpl* db_;
+  InstrumentedMutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter need access to
+// ColumnFamilyHandle (same as the client would need). In that case, we feed
+// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+  ColumnFamilyHandleInternal()
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+        internal_cfd_(nullptr) {}
+
+  void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
+  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+  ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+  // Accessing members of this class is not thread-safe and requires external
+  // synchronization (ie db mutex held or on write thread).
+  ColumnFamilyData* cfd;
+  MemTable* mem;
+  MemTableListVersion* imm;
+  Version* current;
+  MutableCFOptions mutable_cf_options;
+  // Version number of the current SuperVersion
+  uint64_t version_number;
+  WriteStallCondition write_stall_condition;
+
+  // should be called outside the mutex
+  SuperVersion() = default;
+  ~SuperVersion();
+  SuperVersion* Ref();
+  // If Unref() returns true, Cleanup() should be called with mutex held
+  // before deleting this SuperVersion.
+  bool Unref();
+
+  // call these two methods with db mutex held
+  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+  // that needs to be deleted in to_delete vector. Unrefing those
+  // objects needs to be done in the mutex
+  void Cleanup();
+  void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+            MemTableListVersion* new_imm, Version* new_current);
+
+  // The value of dummy is not actually used. kSVInUse takes its address as a
+  // mark in the thread local storage to indicate the SuperVersion is in use
+  // by thread. This way, the value of kSVInUse is guaranteed to have no
+  // conflict with SuperVersion object address and portable on different
+  // platform.
+  static int dummy;
+  static void* const kSVInUse;
+  static void* const kSVObsolete;
+
+ private:
+  std::atomic<uint32_t> refs;
+  // We need to_delete because during Cleanup(), imm->Unref() returns
+  // all memtables that we need to free through this vector. We then
+  // delete all those memtables outside of mutex, during destruction
+  autovector<MemTable*> to_delete;
+};
+
+extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
+
+extern Status CheckConcurrentWritesSupported(
+    const ColumnFamilyOptions& cf_options);
+
+extern Status CheckCFPathsSupported(const DBOptions& db_options,
+                                    const ColumnFamilyOptions& cf_options);
+
+extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+                                           const ColumnFamilyOptions& src);
+// Wrap user defined table properties collector factories `from cf_options`
+// into internal ones in int_tbl_prop_collector_factories. Add a system internal
+// one too.
+extern void GetIntTblPropCollectorFactory(
+    const ImmutableCFOptions& ioptions,
+    IntTblPropCollectorFactories* int_tbl_prop_collector_factories);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+  ~ColumnFamilyData();
+
+  // thread-safe
+  uint32_t GetID() const { return id_; }
+  // thread-safe
+  const std::string& GetName() const { return name_; }
+
+  // Ref() can only be called from a context where the caller can guarantee
+  // that ColumnFamilyData is alive (while holding a non-zero ref already,
+  // holding a DB mutex, or as the leader in a write batch group).
+  void Ref() { refs_.fetch_add(1); }
+
+  // UnrefAndTryDelete() decreases the reference count and do free if needed,
+  // return true if this is freed else false, UnrefAndTryDelete() can only
+  // be called while holding a DB mutex, or during single-threaded recovery.
+  bool UnrefAndTryDelete();
+
+  // SetDropped() can only be called under following conditions:
+  // 1) Holding a DB mutex,
+  // 2) from single-threaded write thread, AND
+  // 3) from single-threaded VersionSet::LogAndApply()
+  // After dropping column family no other operation on that column family
+  // will be executed. All the files and memory will be, however, kept around
+  // until client drops the column family handle. That way, client can still
+  // access data from dropped column family.
+  // Column family can be dropped and still alive. In that state:
+  // *) Compaction and flush is not executed on the dropped column family.
+  // *) Client can continue reading from column family. Writes will fail unless
+  // WriteOptions::ignore_missing_column_families is true
+  // When the dropped column family is unreferenced, then we:
+  // *) Remove column family from the linked list maintained by ColumnFamilySet
+  // *) delete all memory associated with that column family
+  // *) delete all the files associated with that column family
+  void SetDropped();
+  bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
+
+  // thread-safe
+  int NumberLevels() const { return ioptions_.num_levels; }
+
+  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  void SetFlushReason(FlushReason flush_reason) {
+    flush_reason_ = flush_reason;
+  }
+  FlushReason GetFlushReason() const { return flush_reason_; }
+  // thread-safe
+  const FileOptions* soptions() const;
+  const ImmutableOptions* ioptions() const { return &ioptions_; }
+  // REQUIRES: DB mutex held
+  // This returns the MutableCFOptions used by current SuperVersion
+  // You should use this API to reference MutableCFOptions most of the time.
+  const MutableCFOptions* GetCurrentMutableCFOptions() const {
+    return &(super_version_->mutable_cf_options);
+  }
+  // REQUIRES: DB mutex held
+  // This returns the latest MutableCFOptions, which may be not in effect yet.
+  const MutableCFOptions* GetLatestMutableCFOptions() const {
+    return &mutable_cf_options_;
+  }
+
+  // REQUIRES: DB mutex held
+  // Build ColumnFamiliesOptions with immutable options and latest mutable
+  // options.
+  ColumnFamilyOptions GetLatestCFOptions() const;
+
+  bool is_delete_range_supported() { return is_delete_range_supported_; }
+
+  // Validate CF options against DB options
+  static Status ValidateOptions(const DBOptions& db_options,
+                                const ColumnFamilyOptions& cf_options);
+#ifndef ROCKSDB_LITE
+  // REQUIRES: DB mutex held
+  Status SetOptions(
+      const DBOptions& db_options,
+      const std::unordered_map<std::string, std::string>& options_map);
+#endif  // ROCKSDB_LITE
+
+  InternalStats* internal_stats() { return internal_stats_.get(); }
+
+  MemTableList* imm() { return &imm_; }
+  MemTable* mem() { return mem_; }
+
+  bool IsEmpty() {
+    return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0;
+  }
+
+  Version* current() { return current_; }
+  Version* dummy_versions() { return dummy_versions_; }
+  void SetCurrent(Version* _current);
+  uint64_t GetNumLiveVersions() const;    // REQUIRE: DB mutex held
+  uint64_t GetTotalSstFilesSize() const;  // REQUIRE: DB mutex held
+  uint64_t GetLiveSstFilesSize() const;   // REQUIRE: DB mutex held
+  uint64_t GetTotalBlobFileSize() const;  // REQUIRE: DB mutex held
+  void SetMemtable(MemTable* new_mem) {
+    uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
+    new_mem->SetID(memtable_id);
+    mem_ = new_mem;
+  }
+
+  // calculate the oldest log needed for the durability of this column family
+  uint64_t OldestLogToKeep();
+
+  // See Memtable constructor for explanation of earliest_seq param.
+  MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
+                                 SequenceNumber earliest_seq);
+  void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
+                         SequenceNumber earliest_seq);
+
+  TableCache* table_cache() const { return table_cache_.get(); }
+  BlobSource* blob_source() const { return blob_source_.get(); }
+
+  // See documentation in compaction_picker.h
+  // REQUIRES: DB mutex held
+  bool NeedsCompaction() const;
+  // REQUIRES: DB mutex held
+  Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+                             const MutableDBOptions& mutable_db_options,
+                             LogBuffer* log_buffer);
+
+  // Check if the passed range overlap with any running compactions.
+  // REQUIRES: DB mutex held
+  bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+                                  const Slice& largest_user_key,
+                                  int level) const;
+
+  // Check if the passed ranges overlap with any unflushed memtables
+  // (immutable or mutable).
+  //
+  // @param super_version A referenced SuperVersion that will be held for the
+  //    duration of this function.
+  //
+  // Thread-safe
+  Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
+                                    SuperVersion* super_version,
+                                    bool allow_data_in_errors, bool* overlap);
+
+  // A flag to tell a manual compaction is to compact all levels together
+  // instead of a specific level.
+  static const int kCompactAllLevels;
+  // A flag to tell a manual compaction's output is base level.
+  static const int kCompactToBaseLevel;
+  // REQUIRES: DB mutex held
+  Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
+                           const MutableDBOptions& mutable_db_options,
+                           int input_level, int output_level,
+                           const CompactRangeOptions& compact_range_options,
+                           const InternalKey* begin, const InternalKey* end,
+                           InternalKey** compaction_end, bool* manual_conflict,
+                           uint64_t max_file_num_to_ignore,
+                           const std::string& trim_ts);
+
+  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+  // thread-safe
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+  // thread-safe
+  const InternalKeyComparator& internal_comparator() const {
+    return internal_comparator_;
+  }
+
+  const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const {
+    return &int_tbl_prop_collector_factories_;
+  }
+
+  SuperVersion* GetSuperVersion() { return super_version_; }
+  // thread-safe
+  // Return a already referenced SuperVersion to be used safely.
+  SuperVersion* GetReferencedSuperVersion(DBImpl* db);
+  // thread-safe
+  // Get SuperVersion stored in thread local storage. If it does not exist,
+  // get a reference from a current SuperVersion.
+  SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
+  // Try to return SuperVersion back to thread local storage. Return true on
+  // success and false on failure. It fails when the thread local storage
+  // contains anything other than SuperVersion::kSVInUse flag.
+  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+  // thread-safe
+  uint64_t GetSuperVersionNumber() const {
+    return super_version_number_.load();
+  }
+  // will return a pointer to SuperVersion* if previous SuperVersion
+  // if its reference count is zero and needs deletion or nullptr if not
+  // As argument takes a pointer to allocated SuperVersion to enable
+  // the clients to allocate SuperVersion outside of mutex.
+  // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+  void InstallSuperVersion(SuperVersionContext* sv_context,
+                           const MutableCFOptions& mutable_cf_options);
+  void InstallSuperVersion(SuperVersionContext* sv_context,
+                           InstrumentedMutex* db_mutex);
+
+  void ResetThreadLocalSuperVersions();
+
+  // Protected by DB mutex
+  void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
+  void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
+  bool queued_for_flush() { return queued_for_flush_; }
+  bool queued_for_compaction() { return queued_for_compaction_; }
+
+  enum class WriteStallCause {
+    kNone,
+    kMemtableLimit,
+    kL0FileCountLimit,
+    kPendingCompactionBytes,
+  };
+  static std::pair<WriteStallCondition, WriteStallCause>
+  GetWriteStallConditionAndCause(
+      int num_unflushed_memtables, int num_l0_files,
+      uint64_t num_compaction_needed_bytes,
+      const MutableCFOptions& mutable_cf_options,
+      const ImmutableCFOptions& immutable_cf_options);
+
+  // Recalculate some stall conditions, which are changed only during
+  // compaction, adding new memtable and/or recalculation of compaction score.
+  WriteStallCondition RecalculateWriteStallConditions(
+      const MutableCFOptions& mutable_cf_options);
+
+  void set_initialized() { initialized_.store(true); }
+
+  bool initialized() const { return initialized_.load(); }
+
+  const ColumnFamilyOptions& initial_cf_options() {
+    return initial_cf_options_;
+  }
+
+  Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
+
+  // created_dirs remembers directory created, so that we don't need to call
+  // the same data creation operation again.
+  Status AddDirectories(
+      std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs);
+
+  FSDirectory* GetDataDir(size_t path_id) const;
+
+  // full_history_ts_low_ can only increase.
+  void SetFullHistoryTsLow(std::string ts_low) {
+    assert(!ts_low.empty());
+    const Comparator* ucmp = user_comparator();
+    assert(ucmp);
+    if (full_history_ts_low_.empty() ||
+        ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) {
+      full_history_ts_low_ = std::move(ts_low);
+    }
+  }
+
+  const std::string& GetFullHistoryTsLow() const {
+    return full_history_ts_low_;
+  }
+
+  ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+  WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
+  std::shared_ptr<CacheReservationManager>
+  GetFileMetadataCacheReservationManager() {
+    return file_metadata_cache_res_mgr_;
+  }
+
+  SequenceNumber GetFirstMemtableSequenceNumber() const;
+
+  static const uint32_t kDummyColumnFamilyDataId;
+
+  // Keep track of whether the mempurge feature was ever used.
+  void SetMempurgeUsed() { mempurge_used_ = true; }
+  bool GetMempurgeUsed() { return mempurge_used_; }
+
+ private:
+  friend class ColumnFamilySet;
+  ColumnFamilyData(uint32_t id, const std::string& name,
+                   Version* dummy_versions, Cache* table_cache,
+                   WriteBufferManager* write_buffer_manager,
+                   const ColumnFamilyOptions& options,
+                   const ImmutableDBOptions& db_options,
+                   const FileOptions* file_options,
+                   ColumnFamilySet* column_family_set,
+                   BlockCacheTracer* const block_cache_tracer,
+                   const std::shared_ptr<IOTracer>& io_tracer,
+                   const std::string& db_id, const std::string& db_session_id);
+
+  std::vector<std::string> GetDbPaths() const;
+
+  uint32_t id_;
+  const std::string name_;
+  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
+  Version* current_;         // == dummy_versions->prev_
+
+  std::atomic<int> refs_;  // outstanding references to ColumnFamilyData
+  std::atomic<bool> initialized_;
+  std::atomic<bool> dropped_;  // true if client dropped it
+
+  const InternalKeyComparator internal_comparator_;
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories_;
+
+  const ColumnFamilyOptions initial_cf_options_;
+  const ImmutableOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+
+  const bool is_delete_range_supported_;
+
+  std::unique_ptr<TableCache> table_cache_;
+  std::unique_ptr<BlobFileCache> blob_file_cache_;
+  std::unique_ptr<BlobSource> blob_source_;
+
+  std::unique_ptr<InternalStats> internal_stats_;
+
+  WriteBufferManager* write_buffer_manager_;
+
+  MemTable* mem_;
+  MemTableList imm_;
+  SuperVersion* super_version_;
+
+  // An ordinal representing the current SuperVersion. Updated by
+  // InstallSuperVersion(), i.e. incremented every time super_version_
+  // changes.
+  std::atomic<uint64_t> super_version_number_;
+
+  // Thread's local copy of SuperVersion pointer
+  // This needs to be destructed before mutex_
+  std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+  // pointers for a circular linked list. we use it to support iterations over
+  // all column families that are alive (note: dropped column families can also
+  // be alive as long as client holds a reference)
+  ColumnFamilyData* next_;
+  ColumnFamilyData* prev_;
+
+  // This is the earliest log file number that contains data from this
+  // Column Family. All earlier log files must be ignored and not
+  // recovered from
+  uint64_t log_number_;
+
+  std::atomic<FlushReason> flush_reason_;
+
+  // An object that keeps all the compaction stats
+  // and picks the next compaction
+  std::unique_ptr<CompactionPicker> compaction_picker_;
+
+  ColumnFamilySet* column_family_set_;
+
+  std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+  // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+  bool queued_for_flush_;
+
+  // If true --> this ColumnFamily is currently present in
+  // DBImpl::compaction_queue_
+  bool queued_for_compaction_;
+
+  uint64_t prev_compaction_needed_bytes_;
+
+  // if the database was opened with 2pc enabled
+  bool allow_2pc_;
+
+  // Memtable id to track flush.
+  std::atomic<uint64_t> last_memtable_id_;
+
+  // Directories corresponding to cf_paths.
+  std::vector<std::shared_ptr<FSDirectory>> data_dirs_;
+
+  bool db_paths_registered_;
+
+  std::string full_history_ts_low_;
+
+  // For charging memory usage of file metadata created for newly added files to
+  // a Version associated with this CFD
+  std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
+  bool mempurge_used_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
+// mutex AND executed in the write thread.
+// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
+// single-threaded write thread. It is also called during Recovery and in
+// DumpManifest().
+// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
+// held and it needs to be executed from the write thread. SetDropped() also
+// guarantees that it will be called only from single-threaded LogAndApply(),
+// but this condition is not that important.
+// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
+// body of the iteration, wrap in a RefedColumnFamilySet.
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+// NumberOfColumnFamilies -- inside of DB mutex
+class ColumnFamilySet {
+ public:
+  // ColumnFamilySet supports iteration
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {}
+    // NOTE: minimum operators for for-loop iteration
+    iterator& operator++() {
+      current_ = current_->next_;
+      return *this;
+    }
+    bool operator!=(const iterator& other) const {
+      return this->current_ != other.current_;
+    }
+    ColumnFamilyData* operator*() { return current_; }
+
+   private:
+    ColumnFamilyData* current_;
+  };
+
+  ColumnFamilySet(const std::string& dbname,
+                  const ImmutableDBOptions* db_options,
+                  const FileOptions& file_options, Cache* table_cache,
+                  WriteBufferManager* _write_buffer_manager,
+                  WriteController* _write_controller,
+                  BlockCacheTracer* const block_cache_tracer,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  const std::string& db_id, const std::string& db_session_id);
+  ~ColumnFamilySet();
+
+  ColumnFamilyData* GetDefault() const;
+  // GetColumnFamily() calls return nullptr if column family is not found
+  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+  // this call will return the next available column family ID. it guarantees
+  // that there is no column family with id greater than or equal to the
+  // returned value in the current running instance or anytime in RocksDB
+  // instance history.
+  uint32_t GetNextColumnFamilyID();
+  uint32_t GetMaxColumnFamily();
+  void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+  size_t NumberOfColumnFamilies() const;
+
+  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+                                       Version* dummy_version,
+                                       const ColumnFamilyOptions& options);
+
+  iterator begin() { return iterator(dummy_cfd_->next_); }
+  iterator end() { return iterator(dummy_cfd_); }
+
+  Cache* get_table_cache() { return table_cache_; }
+
+  WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
+
+  WriteController* write_controller() { return write_controller_; }
+
+ private:
+  friend class ColumnFamilyData;
+  // helper function that gets called from cfd destructor
+  // REQUIRES: DB mutex held
+  void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+  // column_families_ and column_family_data_ need to be protected:
+  // * when mutating both conditions have to be satisfied:
+  // 1. DB mutex locked
+  // 2. thread currently in single-threaded write thread
+  // * when reading, at least one condition needs to be satisfied:
+  // 1. DB mutex locked
+  // 2. accessed from a single-threaded write thread
+  UnorderedMap<std::string, uint32_t> column_families_;
+  UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;
+
+  uint32_t max_column_family_;
+  const FileOptions file_options_;
+
+  ColumnFamilyData* dummy_cfd_;
+  // We don't hold the refcount here, since default column family always exists
+  // We are also not responsible for cleaning up default_cfd_cache_. This is
+  // just a cache that makes common case (accessing default column family)
+  // faster
+  ColumnFamilyData* default_cfd_cache_;
+
+  const std::string db_name_;
+  const ImmutableDBOptions* const db_options_;
+  Cache* table_cache_;
+  WriteBufferManager* write_buffer_manager_;
+  WriteController* write_controller_;
+  BlockCacheTracer* const block_cache_tracer_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  const std::string& db_id_;
+  std::string db_session_id_;
+};
+
+// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
+// iteration over the iterator, because the cfd is Refed and Unrefed during
+// each iteration to prevent concurrent CF drop from destroying it (until
+// Unref).
+class RefedColumnFamilySet {
+ public:
+  explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
+
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
+      MaybeRef(*wrapped_);
+    }
+    ~iterator() { MaybeUnref(*wrapped_); }
+    inline void MaybeRef(ColumnFamilyData* cfd) {
+      if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
+        cfd->Ref();
+      }
+    }
+    inline void MaybeUnref(ColumnFamilyData* cfd) {
+      if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
+        cfd->UnrefAndTryDelete();
+      }
+    }
+    // NOTE: minimum operators for for-loop iteration
+    inline iterator& operator++() {
+      ColumnFamilyData* old = *wrapped_;
+      ++wrapped_;
+      // Can only unref & potentially free cfd after accessing its next_
+      MaybeUnref(old);
+      MaybeRef(*wrapped_);
+      return *this;
+    }
+    inline bool operator!=(const iterator& other) const {
+      return this->wrapped_ != other.wrapped_;
+    }
+    inline ColumnFamilyData* operator*() { return *wrapped_; }
+
+   private:
+    ColumnFamilySet::iterator wrapped_;
+  };
+
+  iterator begin() { return iterator(wrapped_->begin()); }
+  iterator end() { return iterator(wrapped_->end()); }
+
+ private:
+  ColumnFamilySet* wrapped_;
+};
+
+// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
+// memtables of different column families (specified by ID in the write batch)
+class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
+ public:
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
+      : column_family_set_(column_family_set), current_(nullptr) {}
+
+  // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
+  // with the arguments used to construct *orig.
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
+      : column_family_set_(orig->column_family_set_), current_(nullptr) {}
+
+  // sets current_ to ColumnFamilyData with column_family_id
+  // returns false if column family doesn't exist
+  // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
+  //           under a DB mutex OR from a write thread
+  bool Seek(uint32_t column_family_id) override;
+
+  // Returns log number of the selected column family
+  // REQUIRES: under a DB mutex OR from a write thread
+  uint64_t GetLogNumber() const override;
+
+  // REQUIRES: Seek() called first
+  // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
+  //           under a DB mutex OR from a write thread
+  virtual MemTable* GetMemTable() const override;
+
+  // Returns column family handle for the selected column family
+  // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
+  //           under a DB mutex OR from a write thread
+  virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
+
+  // Cannot be called while another thread is calling Seek().
+  // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
+  //           under a DB mutex OR from a write thread
+  virtual ColumnFamilyData* current() override { return current_; }
+
+ private:
+  ColumnFamilySet* column_family_set_;
+  ColumnFamilyData* current_;
+  ColumnFamilyHandleInternal handle_;
+};
+
+extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
+
+extern const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family_test.cc b/src/rocksdb/db/column_family_test.cc
new file mode 100644
index 000000000..d33cbe50a
--- /dev/null
+++ b/src/rocksdb/db/column_family_test.cc
@@ -0,0 +1,3453 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+
+// counts how many operations were performed
+class EnvCounter : public SpecialEnv {
+ public:
+  explicit EnvCounter(Env* base)
+      : SpecialEnv(base), num_new_writable_file_(0) {}
+  int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; }
+  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& soptions) override {
+    ++num_new_writable_file_;
+    return EnvWrapper::NewWritableFile(f, r, soptions);
+  }
+
+ private:
+  std::atomic<int> num_new_writable_file_;
+};
+
+class ColumnFamilyTestBase : public testing::Test {
+ public:
+  explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) {
+    Env* base_env = Env::Default();
+    EXPECT_OK(
+        test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+    EXPECT_NE(nullptr, base_env);
+    env_ = new EnvCounter(base_env);
+    env_->skip_fsync_ = true;
+    dbname_ = test::PerThreadDBPath("column_family_test");
+    db_options_.create_if_missing = true;
+    db_options_.fail_if_options_file_error = true;
+    db_options_.env = env_;
+    EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
+  }
+
+  ~ColumnFamilyTestBase() override {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (auto h : handles_) {
+      ColumnFamilyDescriptor cfdescriptor;
+      Status s = h->GetDescriptor(&cfdescriptor);
+#ifdef ROCKSDB_LITE
+      EXPECT_TRUE(s.IsNotSupported());
+#else
+      EXPECT_OK(s);
+#endif  // ROCKSDB_LITE
+      column_families.push_back(cfdescriptor);
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    Destroy(column_families);
+    delete env_;
+  }
+
+  BlockBasedTableOptions GetBlockBasedTableOptions() {
+    BlockBasedTableOptions options;
+    options.format_version = format_;
+    return options;
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) {
+    if (k == 0) {
+      // Ugh.  Random seed of 0 used to produce no entropy.  This code
+      // preserves the implementation that was in place when all of the
+      // magic values in this file were picked.
+      *storage = std::string(kValueSize, ' ');
+    } else {
+      Random r(k);
+      *storage = r.RandomString(kValueSize);
+    }
+    return Slice(*storage);
+  }
+
+  void Build(int base, int n, int flush_every = 0) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+
+    for (int i = 0; i < n; i++) {
+      if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+        DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+        dbi->TEST_FlushMemTable();
+      }
+
+      int keyi = base + i;
+      Slice key(DBTestBase::Key(keyi));
+
+      batch.Clear();
+      batch.Put(handles_[0], key, Value(keyi, &value_space));
+      batch.Put(handles_[1], key, Value(keyi, &value_space));
+      batch.Put(handles_[2], key, Value(keyi, &value_space));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    }
+  }
+
+  void CheckMissed() {
+    uint64_t next_expected = 0;
+    uint64_t missed = 0;
+    int bad_keys = 0;
+    int bad_values = 0;
+    int correct = 0;
+    std::string value_space;
+    for (int cf = 0; cf < 3; cf++) {
+      next_expected = 0;
+      Iterator* iter = db_->NewIterator(ReadOptions(false, true), handles_[cf]);
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        uint64_t key;
+        Slice in(iter->key());
+        in.remove_prefix(3);
+        if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
+            key < next_expected) {
+          bad_keys++;
+          continue;
+        }
+        missed += (key - next_expected);
+        next_expected = key + 1;
+        if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+          bad_values++;
+        } else {
+          correct++;
+        }
+      }
+      delete iter;
+    }
+
+    ASSERT_EQ(0, bad_keys);
+    ASSERT_EQ(0, bad_values);
+    ASSERT_EQ(0, missed);
+    (void)correct;
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      if (h) {
+        ASSERT_OK(db_->DestroyColumnFamilyHandle(h));
+      }
+    }
+    handles_.clear();
+    names_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  Status TryOpen(std::vector<std::string> cf,
+                 std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    names_.clear();
+    for (size_t i = 0; i < cf.size(); ++i) {
+      column_families.emplace_back(
+          cf[i], options.size() == 0 ? column_family_options_ : options[i]);
+      names_.push_back(cf[i]);
+    }
+    return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status OpenReadOnly(std::vector<std::string> cf,
+                      std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    names_.clear();
+    for (size_t i = 0; i < cf.size(); ++i) {
+      column_families.emplace_back(
+          cf[i], options.size() == 0 ? column_family_options_ : options[i]);
+      names_.push_back(cf[i]);
+    }
+    return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_,
+                               &db_);
+  }
+
+#ifndef ROCKSDB_LITE  // ReadOnlyDB is not supported
+  void AssertOpenReadOnly(std::vector<std::string> cf,
+                          std::vector<ColumnFamilyOptions> options = {}) {
+    ASSERT_OK(OpenReadOnly(cf, options));
+  }
+#endif  // !ROCKSDB_LITE
+
+  void Open(std::vector<std::string> cf,
+            std::vector<ColumnFamilyOptions> options = {}) {
+    ASSERT_OK(TryOpen(cf, options));
+  }
+
+  void Open() { Open({"default"}); }
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+  int GetProperty(int cf, std::string property) {
+    std::string value;
+    EXPECT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
+#ifndef CYGWIN
+    return std::stoi(value);
+#else
+    return std::strtol(value.c_str(), 0 /* off */, 10 /* base */);
+#endif
+  }
+
+  bool IsDbWriteStopped() {
+#ifndef ROCKSDB_LITE
+    uint64_t v;
+    EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v));
+    return (v == 1);
+#else
+    return dbfull()->TEST_write_controler().IsStopped();
+#endif  // !ROCKSDB_LITE
+  }
+
+  uint64_t GetDbDelayedWriteRate() {
+#ifndef ROCKSDB_LITE
+    uint64_t v;
+    EXPECT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v));
+    return v;
+#else
+    if (!dbfull()->TEST_write_controler().NeedsDelay()) {
+      return 0;
+    }
+    return dbfull()->TEST_write_controler().delayed_write_rate();
+#endif  // !ROCKSDB_LITE
+  }
+
+  void Destroy(const std::vector<ColumnFamilyDescriptor>& column_families =
+                   std::vector<ColumnFamilyDescriptor>()) {
+    Close();
+    ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_),
+                        column_families));
+  }
+
+  void CreateColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<ColumnFamilyOptions> options = {}) {
+    int cfi = static_cast<int>(handles_.size());
+    handles_.resize(cfi + cfs.size());
+    names_.resize(cfi + cfs.size());
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      const auto& current_cf_opt =
+          options.size() == 0 ? column_family_options_ : options[i];
+      ASSERT_OK(
+          db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi]));
+      names_[cfi] = cfs[i];
+
+#ifndef ROCKSDB_LITE  // RocksDBLite does not support GetDescriptor
+      // Verify the CF options of the returned CF handle.
+      ColumnFamilyDescriptor desc;
+      ASSERT_OK(handles_[cfi]->GetDescriptor(&desc));
+      // Need to sanitize the default column family options before comparing
+      // them.
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+          ConfigOptions(), desc.options,
+          SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt)));
+#endif  // !ROCKSDB_LITE
+      cfi++;
+    }
+  }
+
+  void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
+    std::vector<std::string> names;
+    for (auto name : names_) {
+      if (name != "") {
+        names.push_back(name);
+      }
+    }
+    Close();
+    assert(options.size() == 0 || names.size() == options.size());
+    Open(names, options);
+  }
+
+  void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
+    CreateColumnFamilies(cfs);
+    Reopen();
+  }
+
+  void DropColumnFamilies(const std::vector<int>& cfs) {
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[cf]));
+      handles_[cf] = nullptr;
+      names_[cf] = "";
+    }
+  }
+
+  void PutRandomData(int cf, int num, int key_value_size, bool save = false) {
+    if (cf >= static_cast<int>(keys_.size())) {
+      keys_.resize(cf + 1);
+    }
+    for (int i = 0; i < num; ++i) {
+      // 10 bytes for key, rest is value
+      if (!save) {
+        ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11),
+                      rnd_.RandomString(key_value_size - 10)));
+      } else {
+        std::string key = test::RandomKey(&rnd_, 11);
+        keys_[cf].insert(key);
+        ASSERT_OK(Put(cf, key, rnd_.RandomString(key_value_size - 10)));
+      }
+    }
+    ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+  }
+
+#ifndef ROCKSDB_LITE  // TEST functions in DB are not supported in lite
+  void WaitForFlush(int cf) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+  }
+
+  void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
+
+  uint64_t MaxTotalInMemoryState() {
+    return dbfull()->TEST_MaxTotalInMemoryState();
+  }
+
+  void AssertMaxTotalInMemoryState(uint64_t value) {
+    ASSERT_EQ(value, MaxTotalInMemoryState());
+  }
+#endif  // !ROCKSDB_LITE
+
+  Status Put(int cf, const std::string& key, const std::string& value) {
+    return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+  }
+  Status Merge(int cf, const std::string& key, const std::string& value) {
+    return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+  }
+  Status Flush(int cf) { return db_->Flush(FlushOptions(), handles_[cf]); }
+
+  std::string Get(int cf, const std::string& key) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], Slice(key), &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  void CompactAll(int cf) {
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
+                                nullptr));
+  }
+
+  void Compact(int cf, const Slice& start, const Slice& limit) {
+    ASSERT_OK(
+        db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+  }
+
+  int NumTableFilesAtLevel(int level, int cf) {
+    return GetProperty(cf,
+                       "rocksdb.num-files-at-level" + std::to_string(level));
+  }
+
+#ifndef ROCKSDB_LITE
+  // Return spread of files per level
+  std::string FilesPerLevel(int cf) {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
+      int f = NumTableFilesAtLevel(level, cf);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = static_cast<int>(result.size());
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+#endif
+
+  void AssertFilesPerLevel(const std::string& value, int cf) {
+#ifndef ROCKSDB_LITE
+    ASSERT_EQ(value, FilesPerLevel(cf));
+#else
+    (void)value;
+    (void)cf;
+#endif
+  }
+
+#ifndef ROCKSDB_LITE  // GetLiveFilesMetaData is not supported
+  int CountLiveFiles() {
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    return static_cast<int>(metadata.size());
+  }
+#endif  // !ROCKSDB_LITE
+
+  void AssertCountLiveFiles(int expected_value) {
+#ifndef ROCKSDB_LITE
+    ASSERT_EQ(expected_value, CountLiveFiles());
+#else
+    (void)expected_value;
+#endif
+  }
+
+  // Do n memtable flushes, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int cf, int n, const std::string& small,
+                  const std::string& large) {
+    for (int i = 0; i < n; i++) {
+      ASSERT_OK(Put(cf, small, "begin"));
+      ASSERT_OK(Put(cf, large, "end"));
+      ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
+    }
+  }
+
+#ifndef ROCKSDB_LITE  // GetSortedWalFiles is not supported
+  int CountLiveLogFiles() {
+    int micros_wait_for_log_deletion = 20000;
+    env_->SleepForMicroseconds(micros_wait_for_log_deletion);
+    int ret = 0;
+    VectorLogPtr wal_files;
+    Status s;
+    // GetSortedWalFiles is a flakey function -- it gets all the wal_dir
+    // children files and then later checks for their existence. if some of the
+    // log files doesn't exist anymore, it reports an error. it does all of this
+    // without DB mutex held, so if a background process deletes the log file
+    // while the function is being executed, it returns an error. We retry the
+    // function 10 times to avoid the error failing the test
+    for (int retries = 0; retries < 10; ++retries) {
+      wal_files.clear();
+      s = db_->GetSortedWalFiles(wal_files);
+      if (s.ok()) {
+        break;
+      }
+    }
+    EXPECT_OK(s);
+    for (const auto& wal : wal_files) {
+      if (wal->Type() == kAliveLogFile) {
+        ++ret;
+      }
+    }
+    return ret;
+    return 0;
+  }
+#endif  // !ROCKSDB_LITE
+
+  void AssertCountLiveLogFiles(int value) {
+#ifndef ROCKSDB_LITE  // GetSortedWalFiles is not supported
+    ASSERT_EQ(value, CountLiveLogFiles());
+#else
+    (void)value;
+#endif  // !ROCKSDB_LITE
+  }
+
+  void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
+    assert(num_per_cf.size() == handles_.size());
+
+#ifndef ROCKSDB_LITE  // GetProperty is not supported in lite
+    for (size_t i = 0; i < num_per_cf.size(); ++i) {
+      ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
+                                           "rocksdb.num-immutable-mem-table"));
+    }
+#endif  // !ROCKSDB_LITE
+  }
+
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0) {
+    const EnvOptions soptions;
+    std::unique_ptr<SequentialFile> srcfile;
+    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+    std::unique_ptr<WritableFile> destfile;
+    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+    if (size == 0) {
+      // default argument means copy everything
+      ASSERT_OK(env_->GetFileSize(source, &size));
+    }
+
+    char buffer[4096];
+    Slice slice;
+    while (size > 0) {
+      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+      ASSERT_OK(srcfile->Read(one, &slice, buffer));
+      ASSERT_OK(destfile->Append(slice));
+      size -= slice.size();
+    }
+    ASSERT_OK(destfile->Close());
+  }
+
+  int GetSstFileCount(std::string path) {
+    std::vector<std::string> files;
+    DBTestBase::GetSstFiles(env_, path, &files);
+    return static_cast<int>(files.size());
+  }
+
+  void RecalculateWriteStallConditions(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options) {
+    // add lock to avoid race condition between
+    // `RecalculateWriteStallConditions` which writes to CFStats and
+    // background `DBImpl::DumpStats()` threads which read CFStats
+    dbfull()->TEST_LockMutex();
+    cfd->RecalculateWriteStallConditions(mutable_cf_options);
+    dbfull()->TEST_UnlockMutex();
+  }
+
+  std::vector<ColumnFamilyHandle*> handles_;
+  std::vector<std::string> names_;
+  std::vector<std::set<std::string>> keys_;
+  ColumnFamilyOptions column_family_options_;
+  DBOptions db_options_;
+  std::string dbname_;
+  DB* db_ = nullptr;
+  EnvCounter* env_;
+  std::shared_ptr<Env> env_guard_;
+  Random rnd_;
+  uint32_t format_;
+};
+
+class ColumnFamilyTest
+    : public ColumnFamilyTestBase,
+      virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+  ColumnFamilyTest() : ColumnFamilyTestBase(GetParam()) {}
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest,
+                        testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest,
+                        testing::Values(kLatestFormatVersion));
+
+TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
+  for (int iter = 0; iter < 3; ++iter) {
+    Open();
+    CreateColumnFamilies({"one", "two", "three"});
+    for (size_t i = 0; i < handles_.size(); ++i) {
+      auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handles_[i]);
+      ASSERT_EQ(i, cfh->GetID());
+    }
+    if (iter == 1) {
+      Reopen();
+    }
+    DropColumnFamilies({3});
+    Reopen();
+    if (iter == 2) {
+      // this tests if max_column_family is correctly persisted with
+      // WriteSnapshot()
+      Reopen();
+    }
+    CreateColumnFamilies({"three2"});
+    // ID 3 that was used for dropped column family "three" should not be
+    // reused
+    auto cfh3 = static_cast_with_check<ColumnFamilyHandleImpl>(handles_[3]);
+    ASSERT_EQ(4U, cfh3->GetID());
+    Close();
+    Destroy();
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
+  Open();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::WriteOptionsFile:1",
+        "ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1"},
+       {"ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2",
+        "DBImpl::WriteOptionsFile:2"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread thread(
+      [&] { CreateColumnFamilies({"one"}); });
+
+  TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1");
+  uint64_t pv;
+  db_->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem, &pv);
+  TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2");
+
+  thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif  // !ROCKSDB_LITE
+
+class FlushEmptyCFTestWithParam
+    : public ColumnFamilyTestBase,
+      virtual public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+  FlushEmptyCFTestWithParam()
+      : ColumnFamilyTestBase(std::get<0>(GetParam())),
+        allow_2pc_(std::get<1>(GetParam())) {}
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  bool allow_2pc_;
+};
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  db_options_.env = fault_env.get();
+  db_options_.allow_2pc = allow_2pc_;
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  // Generate log file A.
+  ASSERT_OK(Put(1, "foo", "v1"));  // seqID 1
+
+  Reopen();
+  // Log file A is not dropped after reopening because default column family's
+  // min log number is 0.
+  // It flushes to SST file X
+  ASSERT_OK(Put(1, "foo", "v1"));  // seqID 2
+  ASSERT_OK(Put(1, "bar", "v2"));  // seqID 3
+  // Current log file is file B now. While flushing, a new log file C is created
+  // and is set to current. Boths' min log number is set to file C in memory, so
+  // after flushing file B is deleted. At the same time, the min log number of
+  // default CF is not written to manifest. Log file A still remains.
+  // Flushed to SST file Y.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Put(1, "bar", "v3"));  // seqID 4
+  ASSERT_OK(Put(1, "foo", "v4"));  // seqID 5
+  ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+
+  // Preserve file system state up to here to simulate a crash condition.
+  fault_env->SetFilesystemActive(false);
+  std::vector<std::string> names;
+  for (auto name : names_) {
+    if (name != "") {
+      names.push_back(name);
+    }
+  }
+
+  Close();
+  fault_env->ResetState();
+
+  // Before opening, there are four files:
+  //   Log file A contains seqID 1
+  //   Log file C contains seqID 4, 5
+  //   SST file X contains seqID 1
+  //   SST file Y contains seqID 2, 3
+  // Min log number:
+  //   default CF: 0
+  //   CF one, two: C
+  // When opening the DB, all the seqID should be preserved.
+  Open(names, {});
+  ASSERT_EQ("v4", Get(1, "foo"));
+  ASSERT_EQ("v3", Get(1, "bar"));
+  Close();
+
+  db_options_.env = env_;
+}
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  db_options_.env = fault_env.get();
+  db_options_.allow_2pc = allow_2pc_;
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  // Generate log file A.
+  ASSERT_OK(Put(1, "foo", "v1"));  // seqID 1
+
+  Reopen();
+  // Log file A is not dropped after reopening because default column family's
+  // min log number is 0.
+  // It flushes to SST file X
+  ASSERT_OK(Put(1, "foo", "v1"));  // seqID 2
+  ASSERT_OK(Put(1, "bar", "v2"));  // seqID 3
+  // Current log file is file B now. While flushing, a new log file C is created
+  // and is set to current. Both CFs' min log number is set to file C so after
+  // flushing file B is deleted. Log file A still remains.
+  // Flushed to SST file Y.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(0, "bar", "v2"));  // seqID 4
+  ASSERT_OK(Put(2, "bar", "v2"));  // seqID 5
+  ASSERT_OK(Put(1, "bar", "v3"));  // seqID 6
+  // Flushing all column families. This forces all CFs' min log to current. This
+  // is written to the manifest file. Log file C is cleared.
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Flush(2));
+  // Write to log file D
+  ASSERT_OK(Put(1, "bar", "v4"));  // seqID 7
+  ASSERT_OK(Put(1, "bar", "v5"));  // seqID 8
+  ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+  // Preserve file system state up to here to simulate a crash condition.
+  fault_env->SetFilesystemActive(false);
+  std::vector<std::string> names;
+  for (auto name : names_) {
+    if (name != "") {
+      names.push_back(name);
+    }
+  }
+
+  Close();
+  fault_env->ResetState();
+  // Before opening, there are two logfiles:
+  //   Log file A contains seqID 1
+  //   Log file D contains seqID 7, 8
+  // Min log number:
+  //   default CF: D
+  //   CF one, two: D
+  // When opening the DB, log file D should be replayed using the seqID
+  // specified in the file.
+  Open(names, {});
+  ASSERT_EQ("v1", Get(1, "foo"));
+  ASSERT_EQ("v5", Get(1, "bar"));
+  Close();
+
+  db_options_.env = env_;
+}
+
+INSTANTIATE_TEST_CASE_P(
+    FormatDef, FlushEmptyCFTestWithParam,
+    testing::Values(std::make_tuple(test::kDefaultFormatVersion, true),
+                    std::make_tuple(test::kDefaultFormatVersion, false)));
+INSTANTIATE_TEST_CASE_P(
+    FormatLatest, FlushEmptyCFTestWithParam,
+    testing::Values(std::make_tuple(kLatestFormatVersion, true),
+                    std::make_tuple(kLatestFormatVersion, false)));
+
+TEST_P(ColumnFamilyTest, AddDrop) {
+  Open();
+  CreateColumnFamilies({"one", "two", "three"});
+  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+  ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
+  DropColumnFamilies({2});
+  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+  CreateColumnFamilies({"four"});
+  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+  Close();
+  ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
+  Open({"default", "one", "three", "four"});
+  DropColumnFamilies({1});
+  Reopen();
+  Close();
+
+  std::vector<std::string> families;
+  ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+  std::sort(families.begin(), families.end());
+  ASSERT_TRUE(families ==
+              std::vector<std::string>({"default", "four", "three"}));
+}
+
+TEST_P(ColumnFamilyTest, BulkAddDrop) {
+  constexpr int kNumCF = 1000;
+  ColumnFamilyOptions cf_options;
+  WriteOptions write_options;
+  Open();
+  std::vector<std::string> cf_names;
+  std::vector<ColumnFamilyHandle*> cf_handles;
+  for (int i = 1; i <= kNumCF; i++) {
+    cf_names.push_back("cf1-" + std::to_string(i));
+  }
+  ASSERT_OK(db_->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+  for (int i = 1; i <= kNumCF; i++) {
+    ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+  }
+  ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+  std::vector<ColumnFamilyDescriptor> cf_descriptors;
+  for (auto* handle : cf_handles) {
+    delete handle;
+  }
+  cf_handles.clear();
+  for (int i = 1; i <= kNumCF; i++) {
+    cf_descriptors.emplace_back("cf2-" + std::to_string(i),
+                                ColumnFamilyOptions());
+  }
+  ASSERT_OK(db_->CreateColumnFamilies(cf_descriptors, &cf_handles));
+  for (int i = 1; i <= kNumCF; i++) {
+    ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+  }
+  ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+  for (auto* handle : cf_handles) {
+    delete handle;
+  }
+  Close();
+  std::vector<std::string> families;
+  ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+  std::sort(families.begin(), families.end());
+  ASSERT_TRUE(families == std::vector<std::string>({"default"}));
+}
+
+TEST_P(ColumnFamilyTest, DropTest) {
+  // first iteration - don't reopen DB before dropping
+  // second iteration - reopen DB before dropping
+  for (int iter = 0; iter < 2; ++iter) {
+    Open({"default"});
+    CreateColumnFamiliesAndReopen({"pikachu"});
+    for (int i = 0; i < 100; ++i) {
+      ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush(1));
+
+    if (iter == 1) {
+      Reopen();
+    }
+    ASSERT_EQ("bar1", Get(1, "1"));
+
+    AssertCountLiveFiles(1);
+    DropColumnFamilies({1});
+    // make sure that all files are deleted when we drop the column family
+    AssertCountLiveFiles(0);
+    Destroy();
+  }
+}
+
+TEST_P(ColumnFamilyTest, WriteBatchFailure) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  WriteBatch batch;
+  ASSERT_OK(batch.Put(handles_[0], Slice("existing"), Slice("column-family")));
+  ASSERT_OK(
+      batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  DropColumnFamilies({1});
+  WriteOptions woptions_ignore_missing_cf;
+  woptions_ignore_missing_cf.ignore_missing_column_families = true;
+  ASSERT_OK(
+      batch.Put(handles_[0], Slice("still here"), Slice("column-family")));
+  ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+  ASSERT_EQ("column-family", Get(0, "still here"));
+  Status s = db_->Write(WriteOptions(), &batch);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, ReadWrite) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+
+  for (int iter = 0; iter <= 3; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
+
+  // delete old files in backup_logs directory
+  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+  std::vector<std::string> old_files;
+  ASSERT_OK(env_->GetChildren(backup_logs, &old_files));
+  for (auto& file : old_files) {
+    ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file));
+  }
+
+  column_family_options_.merge_operator =
+      MergeOperators::CreateUInt64AddOperator();
+  db_options_.wal_dir = dbname_ + "/logs";
+  Destroy();
+  Open();
+  CreateColumnFamilies({"cf1", "cf2"});
+
+  // fill up the DB
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(1, "mirko", one));
+  ASSERT_OK(Merge(0, "foo", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(2, "fodor", one));
+  ASSERT_OK(Merge(0, "bar", one));
+  ASSERT_OK(Merge(2, "bla", one));
+  ASSERT_OK(Merge(1, "mirko", two));
+  ASSERT_OK(Merge(1, "franjo", one));
+
+  // copy the logs to backup
+  std::vector<std::string> logs;
+  ASSERT_OK(env_->GetChildren(db_options_.wal_dir, &logs));
+  for (auto& log : logs) {
+    CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
+  }
+
+  // recover the DB
+  Close();
+
+  // 1. check consistency
+  // 2. copy the logs from backup back to WAL dir. if the recovery happens
+  // again on the same log files, this should lead to incorrect results
+  // due to applying merge operator twice
+  // 3. check consistency
+  for (int iter = 0; iter < 2; ++iter) {
+    // assert consistency
+    Open({"default", "cf1", "cf2"});
+    ASSERT_EQ(two, Get(0, "foo"));
+    ASSERT_EQ(one, Get(0, "bar"));
+    ASSERT_EQ(three, Get(1, "mirko"));
+    ASSERT_EQ(one, Get(1, "franjo"));
+    ASSERT_EQ(one, Get(2, "fodor"));
+    ASSERT_EQ(two, Get(2, "bla"));
+    Close();
+
+    if (iter == 0) {
+      // copy the logs from backup back to wal dir
+      for (auto& log : logs) {
+        CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
+      }
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE  // TEST functions used are not supported
+TEST_P(ColumnFamilyTest, FlushTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+  ASSERT_OK(Put(0, "foo", "v1"));
+  ASSERT_OK(Put(0, "bar", "v2"));
+  ASSERT_OK(Put(1, "mirko", "v3"));
+  ASSERT_OK(Put(0, "foo", "v2"));
+  ASSERT_OK(Put(2, "fodor", "v5"));
+
+  for (int j = 0; j < 2; j++) {
+    ReadOptions ro;
+    std::vector<Iterator*> iterators;
+    // Hold super version.
+    if (j == 0) {
+      ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
+    }
+
+    for (int i = 0; i < 3; ++i) {
+      uint64_t max_total_in_memory_state = MaxTotalInMemoryState();
+      ASSERT_OK(Flush(i));
+      AssertMaxTotalInMemoryState(max_total_in_memory_state);
+    }
+    ASSERT_OK(Put(1, "foofoo", "bar"));
+    ASSERT_OK(Put(0, "foofoo", "bar"));
+
+    for (auto* it : iterators) {
+      ASSERT_OK(it->status());
+      delete it;
+    }
+  }
+  Reopen();
+
+  for (int iter = 0; iter <= 2; ++iter) {
+    ASSERT_EQ("v2", Get(0, "foo"));
+    ASSERT_EQ("v2", Get(0, "bar"));
+    ASSERT_EQ("v3", Get(1, "mirko"));
+    ASSERT_EQ("v5", Get(2, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+    if (iter <= 1) {
+      Reopen();
+    }
+  }
+  Close();
+}
+
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, LogDeletionTest) {
+  db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  column_family_options_.arena_block_size = 4 * 1024;
+  column_family_options_.write_buffer_size = 128000;  // 128KB
+  Open();
+  CreateColumnFamilies({"one", "two", "three", "four"});
+  // Each bracket is one log file. if number is in (), it means
+  // we don't need it anymore (it's been flushed)
+  // []
+  AssertCountLiveLogFiles(0);
+  PutRandomData(0, 1, 128);
+  // [0]
+  PutRandomData(1, 1, 128);
+  // [0, 1]
+  PutRandomData(1, 1000, 128);
+  WaitForFlush(1);
+  // [0, (1)] [1]
+  AssertCountLiveLogFiles(2);
+  PutRandomData(0, 1, 128);
+  // [0, (1)] [0, 1]
+  AssertCountLiveLogFiles(2);
+  PutRandomData(2, 1, 128);
+  // [0, (1)] [0, 1, 2]
+  PutRandomData(2, 1000, 128);
+  WaitForFlush(2);
+  // [0, (1)] [0, 1, (2)] [2]
+  AssertCountLiveLogFiles(3);
+  PutRandomData(2, 1000, 128);
+  WaitForFlush(2);
+  // [0, (1)] [0, 1, (2)] [(2)] [2]
+  AssertCountLiveLogFiles(4);
+  PutRandomData(3, 1, 128);
+  // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
+  PutRandomData(1, 1, 128);
+  // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
+  AssertCountLiveLogFiles(4);
+  PutRandomData(1, 1000, 128);
+  WaitForFlush(1);
+  // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
+  AssertCountLiveLogFiles(5);
+  PutRandomData(0, 1000, 128);
+  WaitForFlush(0);
+  // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
+  // delete obsolete logs -->
+  // [(1), 2, 3] [1, (0)] [0]
+  AssertCountLiveLogFiles(3);
+  PutRandomData(0, 1000, 128);
+  WaitForFlush(0);
+  // [(1), 2, 3] [1, (0)], [(0)] [0]
+  AssertCountLiveLogFiles(4);
+  PutRandomData(1, 1000, 128);
+  WaitForFlush(1);
+  // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
+  AssertCountLiveLogFiles(5);
+  PutRandomData(2, 1000, 128);
+  WaitForFlush(2);
+  // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
+  AssertCountLiveLogFiles(6);
+  PutRandomData(3, 1000, 128);
+  WaitForFlush(3);
+  // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
+  // delete obsolete logs -->
+  // [0, (1)] [1, (2)], [2, (3)] [3]
+  AssertCountLiveLogFiles(4);
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CrashAfterFlush) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  db_options_.env = fault_env.get();
+  Open();
+  CreateColumnFamilies({"one"});
+
+  WriteBatch batch;
+  ASSERT_OK(batch.Put(handles_[0], Slice("foo"), Slice("bar")));
+  ASSERT_OK(batch.Put(handles_[1], Slice("foo"), Slice("bar")));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  ASSERT_OK(Flush(0));
+  fault_env->SetFilesystemActive(false);
+
+  std::vector<std::string> names;
+  for (auto name : names_) {
+    if (name != "") {
+      names.push_back(name);
+    }
+  }
+  Close();
+  ASSERT_OK(fault_env->DropUnsyncedFileData());
+  fault_env->ResetState();
+  Open(names, {});
+
+  // Write batch should be atomic.
+  ASSERT_EQ(Get(0, "foo"), Get(1, "foo"));
+
+  Close();
+  db_options_.env = env_;
+}
+
+TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) {
+  ASSERT_OK(TryOpen({"default"}));
+  Close();
+  ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
+}
+
+#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
+  // disable flushing stale column families
+  db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  Open();
+  CreateColumnFamilies({"one", "two", "three"});
+  ColumnFamilyOptions default_cf, one, two, three;
+  // setup options. all column families have max_write_buffer_number setup to 10
+  // "default" -> 100KB memtable, start flushing immediately
+  // "one" -> 200KB memtable, start flushing with two immutable memtables
+  // "two" -> 1MB memtable, start flushing with three immutable memtables
+  // "three" -> 90KB memtable, start flushing with four immutable memtables
+  default_cf.write_buffer_size = 100000;
+  default_cf.arena_block_size = 4 * 4096;
+  default_cf.max_write_buffer_number = 10;
+  default_cf.min_write_buffer_number_to_merge = 1;
+  default_cf.max_write_buffer_size_to_maintain = 0;
+  one.write_buffer_size = 200000;
+  one.arena_block_size = 4 * 4096;
+  one.max_write_buffer_number = 10;
+  one.min_write_buffer_number_to_merge = 2;
+  one.max_write_buffer_size_to_maintain =
+      static_cast<int>(one.write_buffer_size);
+  two.write_buffer_size = 1000000;
+  two.arena_block_size = 4 * 4096;
+  two.max_write_buffer_number = 10;
+  two.min_write_buffer_number_to_merge = 3;
+  two.max_write_buffer_size_to_maintain =
+      static_cast<int>(two.write_buffer_size);
+  three.write_buffer_size = 4096 * 22;
+  three.arena_block_size = 4096;
+  three.max_write_buffer_number = 10;
+  three.min_write_buffer_number_to_merge = 4;
+  three.max_write_buffer_size_to_maintain =
+      static_cast<int>(three.write_buffer_size);
+
+  Reopen({default_cf, one, two, three});
+
+  int micros_wait_for_flush = 10000;
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(1);
+  PutRandomData(1, 200, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+  AssertCountLiveLogFiles(2);
+  PutRandomData(2, 1000, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 1, 0});
+  AssertCountLiveLogFiles(3);
+  PutRandomData(2, 1000, 1000);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 0});
+  AssertCountLiveLogFiles(4);
+  PutRandomData(3, 93, 990);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 1});
+  AssertCountLiveLogFiles(5);
+  PutRandomData(3, 88, 990);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 2});
+  AssertCountLiveLogFiles(6);
+  PutRandomData(3, 88, 990);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+  AssertCountLiveLogFiles(7);
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+  AssertCountLiveLogFiles(8);
+  PutRandomData(2, 100, 10000);
+  WaitForFlush(2);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 3});
+  AssertCountLiveLogFiles(9);
+  PutRandomData(3, 88, 990);
+  WaitForFlush(3);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+  AssertCountLiveLogFiles(10);
+  PutRandomData(3, 88, 990);
+  env_->SleepForMicroseconds(micros_wait_for_flush);
+  AssertNumberOfImmutableMemtables({0, 1, 0, 1});
+  AssertCountLiveLogFiles(11);
+  PutRandomData(1, 200, 1000);
+  WaitForFlush(1);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 1});
+  AssertCountLiveLogFiles(5);
+  PutRandomData(3, 88 * 3, 990);
+  WaitForFlush(3);
+  PutRandomData(3, 88 * 4, 990);
+  WaitForFlush(3);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(12);
+  PutRandomData(0, 100, 1000);
+  WaitForFlush(0);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(12);
+  PutRandomData(2, 3 * 1000, 1000);
+  WaitForFlush(2);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(12);
+  PutRandomData(1, 2 * 200, 1000);
+  WaitForFlush(1);
+  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+  AssertCountLiveLogFiles(7);
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+// The test is commented out because we want to test that snapshot is
+// not created for memtables not supported it, but There isn't a memtable
+// that doesn't support snapshot right now. If we have one later, we can
+// re-enable the test.
+//
+// #ifndef ROCKSDB_LITE  // Cuckoo is not supported in lite
+//   TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+//   db_options_.allow_concurrent_memtable_write = false;
+//   Open();
+//   auto* s1 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s1 != nullptr);
+//   dbfull()->ReleaseSnapshot(s1);
+
+//   // Add a column family that doesn't support snapshot
+//   ColumnFamilyOptions first;
+//   first.memtable_factory.reset(new DummyMemtableNotSupportingSnapshot());
+//   CreateColumnFamilies({"first"}, {first});
+//   auto* s2 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s2 == nullptr);
+
+//   // Add a column family that supports snapshot. Snapshot stays not
+//   supported. ColumnFamilyOptions second; CreateColumnFamilies({"second"},
+//   {second}); auto* s3 = dbfull()->GetSnapshot(); ASSERT_TRUE(s3 == nullptr);
+//   Close();
+// }
+// #endif  // !ROCKSDB_LITE
+
+class TestComparator : public Comparator {
+  int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
+              const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
+    return 0;
+  }
+  const char* Name() const override { return "Test"; }
+  void FindShortestSeparator(
+      std::string* /*start*/,
+      const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+static TestComparator third_comparator;
+static TestComparator fourth_comparator;
+
+// Test that we can retrieve the comparator from a created CF
+TEST_P(ColumnFamilyTest, GetComparator) {
+  Open();
+  // Add a column family with no comparator specified
+  CreateColumnFamilies({"first"});
+  const Comparator* comp = handles_[0]->GetComparator();
+  ASSERT_EQ(comp, BytewiseComparator());
+
+  // Add three column families - one with no comparator and two
+  // with comparators specified
+  ColumnFamilyOptions second, third, fourth;
+  second.comparator = &third_comparator;
+  third.comparator = &fourth_comparator;
+  CreateColumnFamilies({"second", "third", "fourth"}, {second, third, fourth});
+  ASSERT_EQ(handles_[1]->GetComparator(), BytewiseComparator());
+  ASSERT_EQ(handles_[2]->GetComparator(), &third_comparator);
+  ASSERT_EQ(handles_[3]->GetComparator(), &fourth_comparator);
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
+  Open();
+  CreateColumnFamilies({"first", "second"});
+  ColumnFamilyOptions default_cf, first, second;
+  first.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  second.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen({default_cf, first, second});
+
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+
+  ASSERT_OK(Put(0, "foo", two));
+  ASSERT_OK(Put(0, "foo", one));
+  ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
+  ASSERT_EQ(Get(0, "foo"), one);
+
+  ASSERT_OK(Put(1, "foo", two));
+  ASSERT_OK(Put(1, "foo", one));
+  ASSERT_OK(Merge(1, "foo", two));
+  ASSERT_EQ(Get(1, "foo"), three);
+
+  ASSERT_OK(Put(2, "foo", two));
+  ASSERT_OK(Put(2, "foo", one));
+  ASSERT_OK(Merge(2, "foo", two));
+  ASSERT_EQ(Get(2, "foo"), one + "," + two);
+  Close();
+}
+
+#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = static_cast<uint64_t>(1) << 60;
+
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
+    PutRandomData(1, 10, 12000);
+    PutRandomData(1, 1, 10);
+    WaitForFlush(1);
+    AssertFilesPerLevel(std::to_string(i + 1), 1);
+  }
+
+  // SETUP column family "two" -- level style with 4 levels
+  for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
+    PutRandomData(2, 10, 12000);
+    PutRandomData(2, 1, 10);
+    WaitForFlush(2);
+    AssertFilesPerLevel(std::to_string(i + 1), 2);
+  }
+
+  // TRIGGER compaction "one"
+  PutRandomData(1, 10, 12000);
+  PutRandomData(1, 1, 10);
+
+  // TRIGGER compaction "two"
+  PutRandomData(2, 10, 12000);
+  PutRandomData(2, 1, 10);
+
+  // WAIT for compactions
+  WaitForCompaction();
+
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+
+  // VERIFY compaction "two"
+  AssertFilesPerLevel("0,1", 2);
+  CompactAll(2);
+  AssertFilesPerLevel("0,1", 2);
+
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+// Sync points not supported in RocksDB Lite
+
+TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(std::to_string(i + 1), 1);
+  }
+  std::atomic_bool cf_1_1{true};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::MultiManual:4", "ColumnFamilyTest::MultiManual:1"},
+       {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:5"},
+       {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:4");
+          TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:3");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  std::vector<port::Thread> threads;
+  threads.emplace_back([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  // SETUP column family "two" -- level style with 4 levels
+  for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(2, 10, 12000);
+    PutRandomData(2, 1, 10);
+    WaitForFlush(2);
+    AssertFilesPerLevel(std::to_string(i + 1), 2);
+  }
+  threads.emplace_back([&] {
+    TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:1");
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+    TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:2");
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:5");
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+
+  // VERIFY compaction "two"
+  AssertFilesPerLevel("0,1", 2);
+  CompactAll(2);
+  AssertFilesPerLevel("0,1", 2);
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  ;
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  std::atomic_bool cf_1_1{true};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:1"},
+       {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:5"},
+       {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(std::to_string(i + 1), 1);
+  }
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+  // SETUP column family "two" -- level style with 4 levels
+  for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(2, 10, 12000);
+    PutRandomData(2, 1, 10);
+    WaitForFlush(2);
+    AssertFilesPerLevel(std::to_string(i + 1), 2);
+  }
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+    TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+  threads.join();
+
+  // WAIT for compactions
+  WaitForCompaction();
+
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+
+  // VERIFY compaction "two"
+  AssertFilesPerLevel("0,1", 2);
+  CompactAll(2);
+  AssertFilesPerLevel("0,1", 2);
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  ;
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  two.compaction_style = kCompactionStyleLevel;
+  two.num_levels = 4;
+  two.level0_file_num_compaction_trigger = 3;
+  two.write_buffer_size = 100000;
+
+  Reopen({default_cf, one, two});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(std::to_string(i + 1), 1);
+  }
+  std::atomic_bool cf_1_1{true};
+  std::atomic_bool cf_1_2{true};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:1"},
+       {"ColumnFamilyTest::ManualAuto:5", "ColumnFamilyTest::ManualAuto:2"},
+       {"ColumnFamilyTest::ManualAuto:2", "ColumnFamilyTest::ManualAuto:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+        } else if (cf_1_2.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+  // SETUP column family "two" -- level style with 4 levels
+  for (int i = 0; i < two.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(2, 10, 12000);
+    PutRandomData(2, 1, 10);
+    WaitForFlush(2);
+    AssertFilesPerLevel(std::to_string(i + 1), 2);
+  }
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+  threads.join();
+
+  // WAIT for compactions
+  WaitForCompaction();
+
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+
+  // VERIFY compaction "two"
+  AssertFilesPerLevel("0,1", 2);
+  CompactAll(2);
+  AssertFilesPerLevel("0,1", 2);
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualManualCompactions) {
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyOptions default_cf, one;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  ;
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  Reopen({default_cf, one});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(std::to_string(i + 1), 1);
+  }
+  std::atomic_bool cf_1_1{true};
+  std::atomic_bool cf_1_2{true};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:2"},
+       {"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:5"},
+       {"ColumnFamilyTest::ManualManual:1", "ColumnFamilyTest::ManualManual:2"},
+       {"ColumnFamilyTest::ManualManual:1",
+        "ColumnFamilyTest::ManualManual:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:4");
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:3");
+        } else if (cf_1_2.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:2");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = true;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:5");
+
+  WaitForFlush(1);
+
+  // Add more L0 files and force another manual compaction
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(
+        std::to_string(one.level0_file_num_compaction_trigger + i), 1);
+  }
+
+  ROCKSDB_NAMESPACE::port::Thread threads1([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:1");
+
+  threads.join();
+  threads1.join();
+  WaitForCompaction();
+  // VERIFY compaction "one"
+  ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactions) {
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyOptions default_cf, one;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  ;
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  Reopen({default_cf, one});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(std::to_string(i + 1), 1);
+  }
+  std::atomic_bool cf_1_1{true};
+  std::atomic_bool cf_1_2{true};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+       {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+       {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:2"},
+       {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+        } else if (cf_1_2.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+  WaitForFlush(1);
+
+  // Add more L0 files and force automatic compaction
+  for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(
+        std::to_string(one.level0_file_num_compaction_trigger + i), 1);
+  }
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+  threads.join();
+  WaitForCompaction();
+  // VERIFY compaction "one"
+  ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) {
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyOptions default_cf, one;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  ;
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleLevel;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 3;
+  one.write_buffer_size = 120000;
+
+  Reopen({default_cf, one});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  // SETUP column family "one" -- level style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(std::to_string(i + 1), 1);
+  }
+  std::atomic_bool cf_1_1{true};
+  std::atomic_bool cf_1_2{true};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+       {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+       {"ColumnFamilyTest::ManualAuto:3", "ColumnFamilyTest::ManualAuto:2"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "ColumnFamilyTest::ManualAuto:3"},
+       {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+        } else if (cf_1_2.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = false;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+  // Add more L0 files and force automatic compaction
+  for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(
+        std::to_string(one.level0_file_num_compaction_trigger + i), 1);
+  }
+
+  TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+  threads.join();
+  WaitForCompaction();
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("0,1", 1);
+
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In this test, we generate enough files to trigger automatic compactions.
+// The automatic compaction waits in NonTrivial:AfterRun
+// We generate more files and then trigger an automatic compaction
+// This will wait because the automatic compaction has files it needs.
+// Once the conflict is hit, the automatic compaction starts and ends
+// Then the manual will run and end.
+TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyOptions default_cf, one;
+  db_options_.max_open_files = 20;  // only 10 files in file cache
+  db_options_.max_background_compactions = 3;
+
+  default_cf.compaction_style = kCompactionStyleLevel;
+  default_cf.num_levels = 3;
+  default_cf.write_buffer_size = 64 << 10;  // 64KB
+  default_cf.target_file_size_base = 30 << 10;
+  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  ;
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  one.compaction_style = kCompactionStyleUniversal;
+
+  one.num_levels = 1;
+  // trigger compaction if there are >= 4 files
+  one.level0_file_num_compaction_trigger = 4;
+  one.write_buffer_size = 120000;
+
+  Reopen({default_cf, one});
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  std::atomic_bool cf_1_1{true};
+  std::atomic_bool cf_1_2{true};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:2"},
+       {"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:5"},
+       {"CompactionPicker::CompactRange:Conflict",
+        "ColumnFamilyTest::AutoManual:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (cf_1_1.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+        } else if (cf_1_2.exchange(false)) {
+          TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // SETUP column family "one" -- universal style
+  for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+    AssertFilesPerLevel(std::to_string(i + 1), 1);
+  }
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+
+  // Add another L0 file and force automatic compaction
+  for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+    PutRandomData(1, 10, 12000, true);
+    PutRandomData(1, 1, 10, true);
+    WaitForFlush(1);
+  }
+
+  CompactRangeOptions compact_options;
+  compact_options.exclusive_manual_compaction = false;
+  ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+  TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+  WaitForCompaction();
+  // VERIFY compaction "one"
+  AssertFilesPerLevel("1", 1);
+  // Compare against saved keys
+  std::set<std::string>::iterator key_iter = keys_[1].begin();
+  while (key_iter != keys_[1].end()) {
+    ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+    key_iter++;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // Tailing iterator not supported
+namespace {
+std::string IterStatus(Iterator* iter) {
+  std::string result;
+  if (iter->Valid()) {
+    result = iter->key().ToString() + "->" + iter->value().ToString();
+  } else {
+    EXPECT_OK(iter->status());
+    result = "(invalid)";
+  }
+  return result;
+}
+}  // anonymous namespace
+
+TEST_P(ColumnFamilyTest, NewIteratorsTest) {
+  // iter == 0 -- no tailing
+  // iter == 2 -- tailing
+  for (int iter = 0; iter < 2; ++iter) {
+    Open();
+    CreateColumnFamiliesAndReopen({"one", "two"});
+    ASSERT_OK(Put(0, "a", "b"));
+    ASSERT_OK(Put(1, "b", "a"));
+    ASSERT_OK(Put(2, "c", "m"));
+    ASSERT_OK(Put(2, "v", "t"));
+    std::vector<Iterator*> iterators;
+    ReadOptions options;
+    options.tailing = (iter == 1);
+    ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
+
+    for (auto it : iterators) {
+      it->SeekToFirst();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+    ASSERT_EQ(IterStatus(iterators[1]), "b->a");
+    ASSERT_EQ(IterStatus(iterators[2]), "c->m");
+
+    ASSERT_OK(Put(1, "x", "x"));
+
+    for (auto it : iterators) {
+      it->Next();
+    }
+
+    ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+    if (iter == 0) {
+      // no tailing
+      ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+    } else {
+      // tailing
+      ASSERT_EQ(IterStatus(iterators[1]), "x->x");
+    }
+    ASSERT_EQ(IterStatus(iterators[2]), "v->t");
+
+    for (auto it : iterators) {
+      delete it;
+    }
+    Destroy();
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // ReadOnlyDB is not supported
+TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+  ASSERT_OK(Put(0, "a", "b"));
+  ASSERT_OK(Put(1, "foo", "bla"));
+  ASSERT_OK(Put(2, "foo", "blabla"));
+  ASSERT_OK(Put(3, "foo", "blablabla"));
+  ASSERT_OK(Put(4, "foo", "blablablabla"));
+
+  DropColumnFamilies({2});
+  Close();
+  // open only a subset of column families
+  AssertOpenReadOnly({"default", "one", "four"});
+  ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+  ASSERT_EQ("bla", Get(1, "foo"));
+  ASSERT_EQ("blablablabla", Get(2, "foo"));
+
+  // test newiterators
+  {
+    std::vector<Iterator*> iterators;
+    ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators));
+    for (auto it : iterators) {
+      it->SeekToFirst();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+    ASSERT_EQ(IterStatus(iterators[1]), "foo->bla");
+    ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla");
+    for (auto it : iterators) {
+      it->Next();
+    }
+    ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+    ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+    ASSERT_EQ(IterStatus(iterators[2]), "(invalid)");
+
+    for (auto it : iterators) {
+      delete it;
+    }
+  }
+
+  Close();
+  // can't open dropped column family
+  Status s = OpenReadOnly({"default", "one", "two"});
+  ASSERT_TRUE(!s.ok());
+
+  // Can't open without specifying default column family
+  s = OpenReadOnly({"one", "four"});
+  ASSERT_TRUE(!s.ok());
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  //  WaitForFlush() is not supported in lite
+TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+
+  for (size_t i = 0; i < handles_.size(); ++i) {
+    PutRandomData(static_cast<int>(i), 10, 100);
+  }
+  int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
+  // this will trigger the flushes
+  for (int i = 0; i <= 4; ++i) {
+    ASSERT_OK(Flush(i));
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    WaitForFlush(i);
+  }
+  int total_new_writable_files =
+      env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
+  ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  //  WaitForCompaction() is not supported in lite
+TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
+  Open();
+  CreateColumnFamilies({"one", "two"});
+  ColumnFamilyOptions default_cf, one, two;
+  default_cf.write_buffer_size = 100000;  // small write buffer size
+  default_cf.arena_block_size = 4096;
+  default_cf.disable_auto_compactions = true;
+  one.disable_auto_compactions = true;
+  two.disable_auto_compactions = true;
+  db_options_.max_total_wal_size = 210000;
+
+  Reopen({default_cf, one, two});
+
+  PutRandomData(2, 1, 10);  // 10 bytes
+  for (int i = 0; i < 2; ++i) {
+    PutRandomData(0, 100, 1000);  // flush
+    WaitForFlush(0);
+
+    AssertCountLiveFiles(i + 1);
+  }
+  // third flush. now, CF [two] should be detected as stale and flushed
+  // column family 1 should not be flushed since it's empty
+  PutRandomData(0, 100, 1000);  // flush
+  WaitForFlush(0);
+  WaitForFlush(2);
+  // at least 3 files for default column families, 1 file for column family
+  // [two], zero files for column family [one], because it's empty
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_GE(metadata.size(), 4);
+  bool has_cf1_sst = false;
+  bool has_cf2_sst = false;
+  for (const auto& file : metadata) {
+    if (file.column_family_name == "one") {
+      has_cf1_sst = true;
+    } else if (file.column_family_name == "two") {
+      has_cf2_sst = true;
+    }
+  }
+  ASSERT_FALSE(has_cf1_sst);
+  ASSERT_TRUE(has_cf2_sst);
+
+  ASSERT_OK(Flush(0));
+  ASSERT_EQ(0, dbfull()->TEST_total_log_size());
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
+  Status s = TryOpen({"one", "two"});
+  ASSERT_TRUE(!s.ok());
+  db_options_.create_missing_column_families = true;
+  s = TryOpen({"default", "one", "two"});
+  ASSERT_TRUE(s.ok());
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, SanitizeOptions) {
+  DBOptions db_options;
+  for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) {
+    for (int l = 0; l <= 2; l++) {
+      for (int i = 1; i <= 3; i++) {
+        for (int j = 1; j <= 3; j++) {
+          for (int k = 1; k <= 3; k++) {
+            ColumnFamilyOptions original;
+            original.compaction_style = static_cast<CompactionStyle>(s);
+            original.num_levels = l;
+            original.level0_stop_writes_trigger = i;
+            original.level0_slowdown_writes_trigger = j;
+            original.level0_file_num_compaction_trigger = k;
+            original.write_buffer_size =
+                l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
+
+            ColumnFamilyOptions result =
+                SanitizeOptions(ImmutableDBOptions(db_options), original);
+            ASSERT_TRUE(result.level0_stop_writes_trigger >=
+                        result.level0_slowdown_writes_trigger);
+            ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
+                        result.level0_file_num_compaction_trigger);
+            ASSERT_TRUE(result.level0_file_num_compaction_trigger ==
+                        original.level0_file_num_compaction_trigger);
+            if (s == kCompactionStyleLevel) {
+              ASSERT_GE(result.num_levels, 2);
+            } else {
+              ASSERT_GE(result.num_levels, 1);
+              if (original.num_levels >= 1) {
+                ASSERT_EQ(result.num_levels, original.num_levels);
+              }
+            }
+
+            // Make sure Sanitize options sets arena_block_size to 1/8 of
+            // the write_buffer_size, rounded up to a multiple of 4k.
+            size_t expected_arena_block_size =
+                l * 4 * 1024 * 1024 / 8 + i * 1024 * 1024 / 8;
+            if (j + k != 0) {
+              // not a multiple of 4k, round up 4k
+              expected_arena_block_size += 4 * 1024;
+            }
+            expected_arena_block_size =
+                std::min(size_t{1024 * 1024}, expected_arena_block_size);
+            ASSERT_EQ(expected_arena_block_size, result.arena_block_size);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(ColumnFamilyTest, ReadDroppedColumnFamily) {
+  // iter 0 -- drop CF, don't reopen
+  // iter 1 -- delete CF, reopen
+  for (int iter = 0; iter < 2; ++iter) {
+    db_options_.create_missing_column_families = true;
+    db_options_.max_open_files = 20;
+    // delete obsolete files always
+    db_options_.delete_obsolete_files_period_micros = 0;
+    Open({"default", "one", "two"});
+    ColumnFamilyOptions options;
+    options.level0_file_num_compaction_trigger = 100;
+    options.level0_slowdown_writes_trigger = 200;
+    options.level0_stop_writes_trigger = 200;
+    options.write_buffer_size = 100000;  // small write buffer size
+    Reopen({options, options, options});
+
+    // 1MB should create ~10 files for each CF
+    int kKeysNum = 10000;
+    PutRandomData(0, kKeysNum, 100);
+    PutRandomData(1, kKeysNum, 100);
+    PutRandomData(2, kKeysNum, 100);
+
+    {
+      std::unique_ptr<Iterator> iterator(
+          db_->NewIterator(ReadOptions(), handles_[2]));
+      iterator->SeekToFirst();
+
+      if (iter == 0) {
+        // Drop CF two
+        ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+      } else {
+        // delete CF two
+        ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[2]));
+        handles_[2] = nullptr;
+      }
+      // Make sure iterator created can still be used.
+      int count = 0;
+      for (; iterator->Valid(); iterator->Next()) {
+        ASSERT_OK(iterator->status());
+        ++count;
+      }
+      ASSERT_OK(iterator->status());
+      ASSERT_EQ(count, kKeysNum);
+    }
+
+    // Add bunch more data to other CFs
+    PutRandomData(0, kKeysNum, 100);
+    PutRandomData(1, kKeysNum, 100);
+
+    if (iter == 1) {
+      Reopen();
+    }
+
+    // Since we didn't delete CF handle, RocksDB's contract guarantees that
+    // we're still able to read dropped CF
+    for (int i = 0; i < 3; ++i) {
+      std::unique_ptr<Iterator> iterator(
+          db_->NewIterator(ReadOptions(), handles_[i]));
+      int count = 0;
+      for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+        ASSERT_OK(iterator->status());
+        ++count;
+      }
+      ASSERT_OK(iterator->status());
+      ASSERT_EQ(count, kKeysNum * ((i == 2) ? 1 : 2));
+    }
+
+    Close();
+    Destroy();
+  }
+}
+
+TEST_P(ColumnFamilyTest, LiveIteratorWithDroppedColumnFamily) {
+  db_options_.create_missing_column_families = true;
+  db_options_.max_open_files = 20;
+  // delete obsolete files always
+  db_options_.delete_obsolete_files_period_micros = 0;
+  Open({"default", "one", "two"});
+  ColumnFamilyOptions options;
+  options.level0_file_num_compaction_trigger = 100;
+  options.level0_slowdown_writes_trigger = 200;
+  options.level0_stop_writes_trigger = 200;
+  options.write_buffer_size = 100000;  // small write buffer size
+  Reopen({options, options, options});
+
+  // 1MB should create ~10 files for each CF
+  int kKeysNum = 10000;
+  PutRandomData(1, kKeysNum, 100);
+  {
+    std::unique_ptr<Iterator> iterator(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    iterator->SeekToFirst();
+
+    DropColumnFamilies({1});
+
+    // Make sure iterator created can still be used.
+    int count = 0;
+    for (; iterator->Valid(); iterator->Next()) {
+      ASSERT_OK(iterator->status());
+      ++count;
+    }
+    ASSERT_OK(iterator->status());
+    ASSERT_EQ(count, kKeysNum);
+  }
+
+  Reopen();
+  Close();
+  Destroy();
+}
+
+TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
+  db_options_.create_missing_column_families = true;
+  Open({"default", "one"});
+  ColumnFamilyOptions options;
+  options.level0_file_num_compaction_trigger = 100;
+  options.level0_slowdown_writes_trigger = 200;
+  options.level0_stop_writes_trigger = 200;
+  options.max_write_buffer_number = 20;
+  options.write_buffer_size = 100000;  // small write buffer size
+  Reopen({options, options});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"VersionSet::LogAndApply::ColumnFamilyDrop:0",
+        "FlushJob::WriteLevel0Table"},
+       {"VersionSet::LogAndApply::ColumnFamilyDrop:1",
+        "FlushJob::InstallResults"},
+       {"FlushJob::InstallResults",
+        "VersionSet::LogAndApply::ColumnFamilyDrop:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  test::SleepingBackgroundTask sleeping_task;
+
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+  // Make sure the task is sleeping. Otherwise, it might start to execute
+  // after sleeping_task.WaitUntilDone() and cause TSAN warning.
+  sleeping_task.WaitUntilSleeping();
+
+  // 1MB should create ~10 files for each CF
+  int kKeysNum = 10000;
+  PutRandomData(1, kKeysNum, 100);
+
+  std::vector<port::Thread> threads;
+  threads.emplace_back([&] { ASSERT_OK(db_->DropColumnFamily(handles_[1])); });
+
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+  sleeping_task.Reset();
+  // now we sleep again. this is just so we're certain that flush job finished
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+  // Make sure the task is sleeping. Otherwise, it might start to execute
+  // after sleeping_task.WaitUntilDone() and cause TSAN warning.
+  sleeping_task.WaitUntilSleeping();
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+
+  {
+    // Since we didn't delete CF handle, RocksDB's contract guarantees that
+    // we're still able to read dropped CF
+    std::unique_ptr<Iterator> iterator(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    int count = 0;
+    for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+      ASSERT_OK(iterator->status());
+      ++count;
+    }
+    ASSERT_OK(iterator->status());
+    ASSERT_EQ(count, kKeysNum);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  Close();
+  Destroy();
+}
+
+#ifndef ROCKSDB_LITE
+// skipped as persisting options is not supported in ROCKSDB_LITE
+namespace {
+std::atomic<int> test_stage(0);
+std::atomic<bool> ordered_by_writethread(false);
+const int kMainThreadStartPersistingOptionsFile = 1;
+const int kChildThreadFinishDroppingColumnFamily = 2;
+void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id,
+                            std::vector<Comparator*>* comparators) {
+  while (test_stage < kMainThreadStartPersistingOptionsFile &&
+         !ordered_by_writethread) {
+    Env::Default()->SleepForMicroseconds(100);
+  }
+  cf_test->DropColumnFamilies({cf_id});
+  if ((*comparators)[cf_id]) {
+    delete (*comparators)[cf_id];
+    (*comparators)[cf_id] = nullptr;
+  }
+  test_stage = kChildThreadFinishDroppingColumnFamily;
+}
+}  // anonymous namespace
+
+TEST_P(ColumnFamilyTest, CreateAndDropRace) {
+  const int kCfCount = 5;
+  std::vector<ColumnFamilyOptions> cf_opts;
+  std::vector<Comparator*> comparators;
+  for (int i = 0; i < kCfCount; ++i) {
+    cf_opts.emplace_back();
+    comparators.push_back(new test::SimpleSuffixReverseComparator());
+    cf_opts.back().comparator = comparators.back();
+  }
+  db_options_.create_if_missing = true;
+  db_options_.create_missing_column_families = true;
+
+  auto main_thread_id = std::this_thread::get_id();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PersistRocksDBOptions:start", [&](void* /*arg*/) {
+        auto current_thread_id = std::this_thread::get_id();
+        // If it's the main thread hitting this sync-point, then it
+        // will be blocked until some other thread update the test_stage.
+        if (main_thread_id == current_thread_id) {
+          test_stage = kMainThreadStartPersistingOptionsFile;
+          while (test_stage < kChildThreadFinishDroppingColumnFamily &&
+                 !ordered_by_writethread) {
+            Env::Default()->SleepForMicroseconds(100);
+          }
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::EnterUnbatched:Wait", [&](void* /*arg*/) {
+        // This means a thread doing DropColumnFamily() is waiting for
+        // other thread to finish persisting options.
+        // In such case, we update the test_stage to unblock the main thread.
+        ordered_by_writethread = true;
+      });
+
+  // Create a database with four column families
+  Open({"default", "one", "two", "three"},
+       {cf_opts[0], cf_opts[1], cf_opts[2], cf_opts[3]});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Start a thread that will drop the first column family
+  // and its comparator
+  ROCKSDB_NAMESPACE::port::Thread drop_cf_thread(DropSingleColumnFamily, this,
+                                                 1, &comparators);
+
+  DropColumnFamilies({2});
+
+  drop_cf_thread.join();
+  Close();
+  Destroy();
+  for (auto* comparator : comparators) {
+    if (comparator) {
+      delete comparator;
+    }
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) {
+  const uint64_t kBaseRate = 800000u;
+  db_options_.delayed_write_rate = kBaseRate;
+  db_options_.max_background_compactions = 6;
+
+  Open({"default"});
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+  VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+  MutableCFOptions mutable_cf_options(column_family_options_);
+
+  mutable_cf_options.level0_slowdown_writes_trigger = 20;
+  mutable_cf_options.level0_stop_writes_trigger = 10000;
+  mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+  mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+  mutable_cf_options.disable_auto_compactions = false;
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(400);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(450);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(205);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(202);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(198);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(399);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(599);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(2001);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(3001);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(390);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(100);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->set_l0_delay_trigger_count(100);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(101);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->set_l0_delay_trigger_count(0);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->set_l0_delay_trigger_count(101);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(200);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->set_l0_delay_trigger_count(0);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(0);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  mutable_cf_options.disable_auto_compactions = true;
+  dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage->set_l0_delay_trigger_count(50);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(0, GetDbDelayedWriteRate());
+  ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+  vstorage->set_l0_delay_trigger_count(60);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(0, GetDbDelayedWriteRate());
+  ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+  mutable_cf_options.disable_auto_compactions = false;
+  vstorage->set_l0_delay_trigger_count(70);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->set_l0_delay_trigger_count(71);
+  vstorage->TEST_set_estimated_compaction_needed_bytes(501);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) {
+  db_options_.max_background_compactions = 6;
+  Open({"default"});
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+  VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+  MutableCFOptions mutable_cf_options(column_family_options_);
+
+  // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+  mutable_cf_options.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options.level0_slowdown_writes_trigger = 36;
+  mutable_cf_options.level0_stop_writes_trigger = 50;
+  // Speedup threshold = 200 / 4 = 50
+  mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+  mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(45);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(7);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(9);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(6);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  // Speed up threshold = min(4 * 2, 4 + (12 - 4)/4) = 6
+  mutable_cf_options.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options.level0_slowdown_writes_trigger = 16;
+  mutable_cf_options.level0_stop_writes_trigger = 30;
+
+  vstorage->set_l0_delay_trigger_count(5);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(7);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(3);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, WriteStallTwoColumnFamilies) {
+  const uint64_t kBaseRate = 810000u;
+  db_options_.delayed_write_rate = kBaseRate;
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+  VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+  ColumnFamilyData* cfd1 =
+      static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+  VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+  MutableCFOptions mutable_cf_options(column_family_options_);
+  mutable_cf_options.level0_slowdown_writes_trigger = 20;
+  mutable_cf_options.level0_stop_writes_trigger = 10000;
+  mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+  mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+  MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+  mutable_cf_options1.soft_pending_compaction_bytes_limit = 500;
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(201);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(70);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(800);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(700);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_TRUE(!IsDbWriteStopped());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) {
+  db_options_.max_background_compactions = 6;
+  column_family_options_.soft_pending_compaction_bytes_limit = 200;
+  column_family_options_.hard_pending_compaction_bytes_limit = 2000;
+  Open();
+  CreateColumnFamilies({"one"});
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+  VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+  ColumnFamilyData* cfd1 =
+      static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+  VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+  MutableCFOptions mutable_cf_options(column_family_options_);
+  // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+  mutable_cf_options.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options.level0_slowdown_writes_trigger = 36;
+  mutable_cf_options.level0_stop_writes_trigger = 30;
+  // Speedup threshold = 200 / 4 = 50
+  mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+  mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+  MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+  mutable_cf_options1.level0_slowdown_writes_trigger = 16;
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(60);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(30);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(70);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->TEST_set_estimated_compaction_needed_bytes(20);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage1->TEST_set_estimated_compaction_needed_bytes(3);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(9);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage1->set_l0_delay_trigger_count(2);
+  RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+  ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+  vstorage->set_l0_delay_trigger_count(0);
+  RecalculateWriteStallConditions(cfd, mutable_cf_options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, CreateAndDestroyOptions) {
+  std::unique_ptr<ColumnFamilyOptions> cfo(new ColumnFamilyOptions());
+  ColumnFamilyHandle* cfh;
+  Open();
+  ASSERT_OK(db_->CreateColumnFamily(*(cfo.get()), "yoyo", &cfh));
+  cfo.reset();
+  ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+  ASSERT_OK(db_->DropColumnFamily(cfh));
+  ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, CreateDropAndDestroy) {
+  ColumnFamilyHandle* cfh;
+  Open();
+  ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+  ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+  ASSERT_OK(db_->DropColumnFamily(cfh));
+  ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) {
+  ColumnFamilyHandle* cfh;
+  Open();
+  ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+  ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+  ASSERT_OK(db_->DisableFileDeletions());
+  ASSERT_OK(db_->DropColumnFamily(cfh));
+  ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, FlushCloseWALFiles) {
+  SpecialEnv env(Env::Default());
+  db_options_.env = &env;
+  db_options_.max_background_flushes = 1;
+  column_family_options_.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(2));
+  Open();
+  CreateColumnFamilies({"one"});
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(0, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::BGWorkFlush:done", "FlushCloseWALFiles:0"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Block flush jobs from running
+  test::SleepingBackgroundTask sleeping_task;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+  // Make sure the task is sleeping. Otherwise, it might start to execute
+  // after sleeping_task.WaitUntilDone() and cause TSAN warning.
+  sleeping_task.WaitUntilSleeping();
+
+  WriteOptions wo;
+  wo.sync = true;
+  ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+  ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+  TEST_SYNC_POINT("FlushCloseWALFiles:0");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+  Reopen();
+  ASSERT_EQ("mirko", Get(0, "fodor"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  db_options_.env = env_;
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) {
+  SpecialEnv env(Env::Default());
+  db_options_.env = &env;
+  db_options_.max_background_flushes = 1;
+  column_family_options_.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(2));
+  Open();
+  CreateColumnFamilies({"one"});
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  // Create an iterator holding the current super version.
+  Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
+  ASSERT_OK(it->status());
+  // A flush will make `it` hold the last reference of its super version.
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(0, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+
+  // Flush jobs will close previous WAL files after finishing. By
+  // block flush jobs from running, we trigger a condition where
+  // the iterator destructor should close the WAL files.
+  test::SleepingBackgroundTask sleeping_task;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+  // Make sure the task is sleeping. Otherwise, it might start to execute
+  // after sleeping_task.WaitUntilDone() and cause TSAN warning.
+  sleeping_task.WaitUntilSleeping();
+
+  WriteOptions wo;
+  wo.sync = true;
+  ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+  ASSERT_EQ(2, env.num_open_wal_file_.load());
+  // Deleting the iterator will clear its super version, triggering
+  // closing all files
+  delete it;
+  ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+  WaitForFlush(1);
+
+  Reopen();
+  ASSERT_EQ("mirko", Get(0, "fodor"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  db_options_.env = env_;
+  Close();
+}
+
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) {
+  SpecialEnv env(Env::Default());
+  // Allow both of flush and purge job to schedule.
+  env.SetBackgroundThreads(2, Env::HIGH);
+  db_options_.env = &env;
+  db_options_.max_background_flushes = 1;
+  column_family_options_.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(2));
+  Open();
+  CreateColumnFamilies({"one"});
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  // Create an iterator holding the current super version.
+  ReadOptions ro;
+  ro.background_purge_on_iterator_cleanup = true;
+  Iterator* it = db_->NewIterator(ro, handles_[1]);
+  ASSERT_OK(it->status());
+  // A flush will make `it` hold the last reference of its super version.
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(0, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+       "DBImpl::BGWorkPurge:start"},
+      {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+       "DBImpl::BackgroundCallFlush:start"},
+      {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = true;
+  ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+  ASSERT_EQ(2, env.num_open_wal_file_.load());
+  // Deleting the iterator will clear its super version, triggering
+  // closing all files
+  delete it;
+  ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+  TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+  TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+  ASSERT_EQ(1, env.num_open_wal_file_.load());
+  TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+  WaitForFlush(1);
+  ASSERT_EQ(1, env.num_open_wal_file_.load());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  Reopen();
+  ASSERT_EQ("mirko", Get(0, "fodor"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  db_options_.env = env_;
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // TEST functions are not supported in lite
+TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
+  SpecialEnv env(Env::Default());
+  // Allow both of flush and purge job to schedule.
+  env.SetBackgroundThreads(2, Env::HIGH);
+  db_options_.env = &env;
+  db_options_.max_background_flushes = 1;
+  column_family_options_.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(3));
+  column_family_options_.level0_file_num_compaction_trigger = 2;
+  Open();
+  CreateColumnFamilies({"one"});
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodar2", "mirko"));
+  ASSERT_OK(Flush(1));
+
+  // Create an iterator holding the current super version, as well as
+  // the SST file just flushed.
+  ReadOptions ro;
+  ro.tailing = true;
+  ro.background_purge_on_iterator_cleanup = true;
+  Iterator* it = db_->NewIterator(ro, handles_[1]);
+  // A flush will make `it` hold the last reference of its super version.
+
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodar2", "mirko"));
+  ASSERT_OK(Flush(1));
+
+  WaitForCompaction();
+
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+  ASSERT_OK(Put(0, "fodor", "mirko"));
+  ASSERT_OK(Put(1, "fodor", "mirko"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+       "DBImpl::BGWorkPurge:start"},
+      {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+       "DBImpl::BackgroundCallFlush:start"},
+      {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = true;
+  ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+  env.delete_count_.store(0);
+  ASSERT_EQ(2, env.num_open_wal_file_.load());
+  // Deleting the iterator will clear its super version, triggering
+  // closing all files
+  it->Seek("");
+  ASSERT_OK(it->status());
+
+  ASSERT_EQ(2, env.num_open_wal_file_.load());
+  ASSERT_EQ(0, env.delete_count_.load());
+
+  TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+  TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+  ASSERT_EQ(1, env.num_open_wal_file_.load());
+  ASSERT_EQ(1, env.delete_count_.load());
+  TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+  WaitForFlush(1);
+  ASSERT_EQ(1, env.num_open_wal_file_.load());
+  ASSERT_EQ(1, env.delete_count_.load());
+
+  delete it;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  Reopen();
+  ASSERT_EQ("mirko", Get(0, "fodor"));
+  ASSERT_EQ("mirko", Get(1, "fodor"));
+  db_options_.env = env_;
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+// Disable on windows because SyncWAL requires env->IsSyncThreadSafe()
+// to return true which is not so in unbuffered mode.
+#ifndef OS_WIN
+TEST_P(ColumnFamilyTest, LogSyncConflictFlush) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+
+  ASSERT_OK(Put(0, "", ""));
+  ASSERT_OK(Put(1, "foo", "bar"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1",
+        "ColumnFamilyTest::LogSyncConflictFlush:1"},
+       {"ColumnFamilyTest::LogSyncConflictFlush:2",
+        "DBImpl::SyncWAL:BeforeMarkLogsSynced:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread thread([&] { ASSERT_OK(db_->SyncWAL()); });
+
+  TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1");
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Flush(1));
+
+  TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2");
+
+  thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  Close();
+}
+#endif
+
+// this test is placed here, because the infrastructure for Column Family
+// test is being used to ensure a roll of wal files.
+// Basic idea is to test that WAL truncation is being detected and not
+// ignored
+TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) {
+  Open();
+  CreateColumnFamiliesAndReopen({"one", "two"});
+
+  Build(0, 100);
+
+  // Flush the 0th column family to force a roll of the wal log
+  ASSERT_OK(Flush(0));
+
+  // Add some more entries
+  Build(100, 100);
+
+  std::vector<std::string> filenames;
+  ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+
+  // collect wal files
+  std::vector<std::string> logfs;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    uint64_t number;
+    FileType type;
+    if (!(ParseFileName(filenames[i], &number, &type))) continue;
+
+    if (type != kWalFile) continue;
+
+    logfs.push_back(filenames[i]);
+  }
+
+  std::sort(logfs.begin(), logfs.end());
+  ASSERT_GE(logfs.size(), 2);
+
+  // Take the last but one file, and truncate it
+  std::string fpath = dbname_ + "/" + logfs[logfs.size() - 2];
+  std::vector<std::string> names_save = names_;
+
+  uint64_t fsize;
+  ASSERT_OK(env_->GetFileSize(fpath, &fsize));
+  ASSERT_GT(fsize, 0);
+
+  Close();
+
+  std::string backup_logs = dbname_ + "/backup_logs";
+  std::string t_fpath = backup_logs + "/" + logfs[logfs.size() - 2];
+
+  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+  // Not sure how easy it is to make this data driven.
+  // need to read back the WAL file and truncate last 10
+  // entries
+  CopyFile(fpath, t_fpath, fsize - 9180);
+
+  ASSERT_OK(env_->DeleteFile(fpath));
+  ASSERT_OK(env_->RenameFile(t_fpath, fpath));
+
+  db_options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+  OpenReadOnly(names_save);
+
+  CheckMissed();
+
+  Close();
+
+  Open(names_save);
+
+  CheckMissed();
+
+  Close();
+
+  // cleanup
+  ASSERT_OK(env_->DeleteDir(backup_logs));
+}
+
+TEST_P(ColumnFamilyTest, DefaultCfPathsTest) {
+  Open();
+  // Leave cf_paths for one column families to be empty.
+  // Files should be generated according to db_paths for that
+  // column family.
+  ColumnFamilyOptions cf_opt1, cf_opt2;
+  cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+                                std::numeric_limits<uint64_t>::max());
+  CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+  Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+  // Fill Column family 1.
+  PutRandomData(1, 100, 100);
+  ASSERT_OK(Flush(1));
+
+  ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // Fill column family 2
+  PutRandomData(2, 100, 100);
+  ASSERT_OK(Flush(2));
+
+  // SST from Column family 2 should be generated in
+  // db_paths which is dbname_ in this case.
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+}
+
+TEST_P(ColumnFamilyTest, MultipleCFPathsTest) {
+  Open();
+  // Configure Column family specific paths.
+  ColumnFamilyOptions cf_opt1, cf_opt2;
+  cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+                                std::numeric_limits<uint64_t>::max());
+  cf_opt2.cf_paths.emplace_back(dbname_ + "_two_1",
+                                std::numeric_limits<uint64_t>::max());
+  CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+  Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+  PutRandomData(1, 100, 100, true /* save */);
+  ASSERT_OK(Flush(1));
+
+  // Check that files are generated in appropriate paths.
+  ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  PutRandomData(2, 100, 100, true /* save */);
+  ASSERT_OK(Flush(2));
+
+  ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // Re-open and verify the keys.
+  Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  for (int cf = 1; cf != 3; ++cf) {
+    ReadOptions read_options;
+    read_options.readahead_size = 0;
+    auto it = dbi->NewIterator(read_options, handles_[cf]);
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      ASSERT_OK(it->status());
+      Slice key(it->key());
+      ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString()));
+    }
+    ASSERT_OK(it->status());
+    delete it;
+
+    for (const auto& key : keys_[cf]) {
+      ASSERT_NE("NOT_FOUND", Get(cf, key));
+    }
+  }
+}
+
+TEST(ColumnFamilyTest, ValidateBlobGCCutoff) {
+  DBOptions db_options;
+
+  ColumnFamilyOptions cf_options;
+  cf_options.enable_blob_garbage_collection = true;
+
+  cf_options.blob_garbage_collection_age_cutoff = -0.5;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsInvalidArgument());
+
+  cf_options.blob_garbage_collection_age_cutoff = 0.0;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.blob_garbage_collection_age_cutoff = 0.5;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.blob_garbage_collection_age_cutoff = 1.0;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.blob_garbage_collection_age_cutoff = 1.5;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsInvalidArgument());
+}
+
+TEST(ColumnFamilyTest, ValidateBlobGCForceThreshold) {
+  DBOptions db_options;
+
+  ColumnFamilyOptions cf_options;
+  cf_options.enable_blob_garbage_collection = true;
+
+  cf_options.blob_garbage_collection_force_threshold = -0.5;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsInvalidArgument());
+
+  cf_options.blob_garbage_collection_force_threshold = 0.0;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.blob_garbage_collection_force_threshold = 0.5;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.blob_garbage_collection_force_threshold = 1.0;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.blob_garbage_collection_force_threshold = 1.5;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsInvalidArgument());
+}
+
+TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) {
+  DBOptions db_options;
+
+  ColumnFamilyOptions cf_options;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.memtable_protection_bytes_per_key = 5;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsNotSupported());
+
+  cf_options.memtable_protection_bytes_per_key = 1;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+  cf_options.memtable_protection_bytes_per_key = 16;
+  ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+                  .IsNotSupported());
+
+  cf_options.memtable_protection_bytes_per_key = 0;
+  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compact_files_test.cc b/src/rocksdb/db/compact_files_test.cc
new file mode 100644
index 000000000..ef38946f7
--- /dev/null
+++ b/src/rocksdb/db/compact_files_test.cc
@@ -0,0 +1,502 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactFilesTest : public testing::Test {
+ public:
+  CompactFilesTest() {
+    env_ = Env::Default();
+    db_name_ = test::PerThreadDBPath("compact_files_test");
+  }
+
+  std::string db_name_;
+  Env* env_;
+};
+
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (auto fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+  void ClearFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.clear();
+  }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+
+TEST_F(CompactFilesTest, L0ConflictsFiles) {
+  Options options;
+  // to trigger compaction more easily
+  const int kWriteBufferSize = 10000;
+  const int kLevel0Trigger = 2;
+  options.create_if_missing = true;
+  options.compaction_style = kCompactionStyleLevel;
+  // Small slowdown and stop trigger for experimental purpose.
+  options.level0_slowdown_writes_trigger = 20;
+  options.level0_stop_writes_trigger = 20;
+  options.level0_stop_writes_trigger = 20;
+  options.write_buffer_size = kWriteBufferSize;
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  options.compression = kNoCompression;
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  assert(s.ok());
+  assert(db);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:0", "BackgroundCallCompaction:0"},
+      {"BackgroundCallCompaction:1", "CompactFilesImpl:1"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // create couple files
+  // Background compaction starts and waits in BackgroundCallCompaction:0
+  for (int i = 0; i < kLevel0Trigger * 4; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), ""));
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(100 - i), ""));
+    ASSERT_OK(db->Flush(FlushOptions()));
+  }
+
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  std::string file1;
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    if (file1 == "") {
+      file1 = file.db_path + "/" + file.name;
+    } else {
+      std::string file2 = file.db_path + "/" + file.name;
+      // Another thread starts a compact files and creates an L0 compaction
+      // The background compaction then notices that there is an L0 compaction
+      // already in progress and doesn't do an L0 compaction
+      // Once the background compaction finishes, the compact files finishes
+      ASSERT_OK(db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+                                 {file1, file2}, 0));
+      break;
+    }
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  delete db;
+}
+
+TEST_F(CompactFilesTest, MultipleLevel) {
+  Options options;
+  options.create_if_missing = true;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = 6;
+  // Add listener
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+
+  // create couple files in L0, L3, L4 and L5
+  for (int i = 5; i > 2; --i) {
+    collector->ClearFlushedFiles();
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), ""));
+    ASSERT_OK(db->Flush(FlushOptions()));
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
+    ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+    auto l0_files = collector->GetFlushedFiles();
+    ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i));
+
+    std::string prop;
+    ASSERT_TRUE(db->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(i), &prop));
+    ASSERT_EQ("1", prop);
+  }
+  ASSERT_OK(db->Put(WriteOptions(), std::to_string(0), ""));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  // Compact files except the file in L3
+  std::vector<std::string> files;
+  for (int i = 0; i < 6; ++i) {
+    if (i == 3) continue;
+    for (auto& file : meta.levels[i].files) {
+      files.push_back(file.db_path + "/" + file.name);
+    }
+  }
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"CompactionJob::Run():Start", "CompactFilesTest.MultipleLevel:0"},
+      {"CompactFilesTest.MultipleLevel:1", "CompactFilesImpl:3"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::thread thread([&] {
+    TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:0");
+    ASSERT_OK(db->Put(WriteOptions(), "bar", "v2"));
+    ASSERT_OK(db->Put(WriteOptions(), "foo", "v2"));
+    ASSERT_OK(db->Flush(FlushOptions()));
+    TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:1");
+  });
+
+  // Compaction cannot move up the data to higher level
+  // here we have input file from level 5, so the output level has to be >= 5
+  for (int invalid_output_level = 0; invalid_output_level < 5;
+       invalid_output_level++) {
+    s = db->CompactFiles(CompactionOptions(), files, invalid_output_level);
+    std::cout << s.ToString() << std::endl;
+    ASSERT_TRUE(s.IsInvalidArgument());
+  }
+
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5));
+  SyncPoint::GetInstance()->DisableProcessing();
+  thread.join();
+
+  delete db;
+}
+
+TEST_F(CompactFilesTest, ObsoleteFiles) {
+  Options options;
+  // to trigger compaction more easily
+  const int kWriteBufferSize = 65536;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.write_buffer_size = kWriteBufferSize;
+  options.max_write_buffer_number = 2;
+  options.compression = kNoCompression;
+
+  // Add listener
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+
+  // create couple files
+  for (int i = 1000; i < 2000; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+                      std::string(kWriteBufferSize / 10, 'a' + (i % 26))));
+  }
+
+  auto l0_files = collector->GetFlushedFiles();
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1));
+  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForCompact());
+
+  // verify all compaction input files are deleted
+  for (auto fname : l0_files) {
+    ASSERT_EQ(Status::NotFound(), env_->FileExists(fname));
+  }
+  delete db;
+}
+
+TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
+  Options options;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  options.level0_slowdown_writes_trigger = 1000;
+  options.level0_stop_writes_trigger = 1000;
+  options.write_buffer_size = 65536;
+  options.max_write_buffer_number = 2;
+  options.compression = kNoCompression;
+  options.max_compaction_bytes = 5000;
+
+  // Add listener
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  assert(s.ok());
+  assert(db);
+
+  // create couple files
+  for (int i = 0; i < 500; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+                      std::string(1000, 'a' + (i % 26))));
+  }
+  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  auto l0_files_1 = collector->GetFlushedFiles();
+  collector->ClearFlushedFiles();
+  for (int i = 0; i < 500; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+                      std::string(1000, 'a' + (i % 26))));
+  }
+  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  auto l0_files_2 = collector->GetFlushedFiles();
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0));
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0));
+  // no assertion failure
+  delete db;
+}
+
+TEST_F(CompactFilesTest, CapturingPendingFiles) {
+  Options options;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  // Always do full scans for obsolete files (needed to reproduce the issue).
+  options.delete_obsolete_files_period_micros = 0;
+
+  // Add listener.
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  assert(db);
+
+  // Create 5 files.
+  for (int i = 0; i < 5; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), "key" + std::to_string(i), "value"));
+    ASSERT_OK(db->Flush(FlushOptions()));
+  }
+
+  // Ensure background work is fully finished including listener callbacks
+  // before accessing listener state.
+  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+  auto l0_files = collector->GetFlushedFiles();
+  EXPECT_EQ(5, l0_files.size());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:2", "CompactFilesTest.CapturingPendingFiles:0"},
+      {"CompactFilesTest.CapturingPendingFiles:1", "CompactFilesImpl:3"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Start compacting files.
+  ROCKSDB_NAMESPACE::port::Thread compaction_thread(
+      [&] { EXPECT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); });
+
+  // In the meantime flush another file.
+  TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0");
+  ASSERT_OK(db->Put(WriteOptions(), "key5", "value"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+  TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1");
+
+  compaction_thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  delete db;
+
+  // Make sure we can reopen the DB.
+  s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  assert(db);
+  delete db;
+}
+
+TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
+  class FilterWithGet : public CompactionFilter {
+   public:
+    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+                std::string* /*new_value*/,
+                bool* /*value_changed*/) const override {
+      if (db_ == nullptr) {
+        return true;
+      }
+      std::string res;
+      db_->Get(ReadOptions(), "", &res);
+      return true;
+    }
+
+    void SetDB(DB* db) { db_ = db; }
+
+    const char* Name() const override { return "FilterWithGet"; }
+
+   private:
+    DB* db_;
+  };
+
+  std::shared_ptr<FilterWithGet> cf(new FilterWithGet());
+
+  Options options;
+  options.create_if_missing = true;
+  options.compaction_filter = cf.get();
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+
+  cf->SetDB(db);
+
+  // Write one L0 file
+  ASSERT_OK(db->Put(WriteOptions(), "K1", "V1"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Compact all L0 files using CompactFiles
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    std::string fname = file.db_path + "/" + file.name;
+    ASSERT_OK(
+        db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0));
+  }
+
+  delete db;
+}
+
+TEST_F(CompactFilesTest, SentinelCompressionType) {
+  if (!Zlib_Supported()) {
+    fprintf(stderr, "zlib compression not supported, skip this test\n");
+    return;
+  }
+  if (!Snappy_Supported()) {
+    fprintf(stderr, "snappy compression not supported, skip this test\n");
+    return;
+  }
+  // Check that passing `CompressionType::kDisableCompressionOption` to
+  // `CompactFiles` causes it to use the column family compression options.
+  for (auto compaction_style : {CompactionStyle::kCompactionStyleLevel,
+                                CompactionStyle::kCompactionStyleUniversal,
+                                CompactionStyle::kCompactionStyleNone}) {
+    ASSERT_OK(DestroyDB(db_name_, Options()));
+    Options options;
+    options.compaction_style = compaction_style;
+    // L0: Snappy, L1: ZSTD, L2: Snappy
+    options.compression_per_level = {CompressionType::kSnappyCompression,
+                                     CompressionType::kZlibCompression,
+                                     CompressionType::kSnappyCompression};
+    options.create_if_missing = true;
+    FlushedFileCollector* collector = new FlushedFileCollector();
+    options.listeners.emplace_back(collector);
+    DB* db = nullptr;
+    ASSERT_OK(DB::Open(options, db_name_, &db));
+
+    ASSERT_OK(db->Put(WriteOptions(), "key", "val"));
+    ASSERT_OK(db->Flush(FlushOptions()));
+
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
+    ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+    auto l0_files = collector->GetFlushedFiles();
+    ASSERT_EQ(1, l0_files.size());
+
+    // L0->L1 compaction, so output should be ZSTD-compressed
+    CompactionOptions compaction_opts;
+    compaction_opts.compression = CompressionType::kDisableCompressionOption;
+    ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1));
+
+    ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props;
+    ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props));
+    for (const auto& name_and_table_props : all_tables_props) {
+      ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression),
+                name_and_table_props.second->compression_name);
+    }
+    delete db;
+  }
+}
+
+TEST_F(CompactFilesTest, GetCompactionJobInfo) {
+  Options options;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = kCompactionStyleNone;
+  options.level0_slowdown_writes_trigger = 1000;
+  options.level0_stop_writes_trigger = 1000;
+  options.write_buffer_size = 65536;
+  options.max_write_buffer_number = 2;
+  options.compression = kNoCompression;
+  options.max_compaction_bytes = 5000;
+
+  // Add listener
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  assert(db);
+
+  // create couple files
+  for (int i = 0; i < 500; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+                      std::string(1000, 'a' + (i % 26))));
+  }
+  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  auto l0_files_1 = collector->GetFlushedFiles();
+  CompactionOptions co;
+  co.compression = CompressionType::kLZ4Compression;
+  CompactionJobInfo compaction_job_info{};
+  ASSERT_OK(
+      db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info));
+  ASSERT_EQ(compaction_job_info.base_input_level, 0);
+  ASSERT_EQ(compaction_job_info.cf_id, db->DefaultColumnFamily()->GetID());
+  ASSERT_EQ(compaction_job_info.cf_name, db->DefaultColumnFamily()->GetName());
+  ASSERT_EQ(compaction_job_info.compaction_reason,
+            CompactionReason::kManualCompaction);
+  ASSERT_EQ(compaction_job_info.compression, CompressionType::kLZ4Compression);
+  ASSERT_EQ(compaction_job_info.output_level, 0);
+  ASSERT_OK(compaction_job_info.status);
+  // no assertion failure
+  delete db;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/clipping_iterator.h b/src/rocksdb/db/compaction/clipping_iterator.h
new file mode 100644
index 000000000..1ed465c2c
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator.h
@@ -0,0 +1,276 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "rocksdb/comparator.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that wraps another one and ensures that any keys
+// returned are strictly within a range [start, end). If the underlying
+// iterator has already performed the bounds checking, it relies on that result;
+// otherwise, it performs the necessary key comparisons itself. Both bounds
+// are optional.
+class ClippingIterator : public InternalIterator {
+ public:
+  ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end,
+                   const CompareInterface* cmp)
+      : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) {
+    assert(iter_);
+    assert(cmp_);
+    assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0);
+
+    UpdateAndEnforceBounds();
+  }
+
+  bool Valid() const override { return valid_; }
+
+  void SeekToFirst() override {
+    if (start_) {
+      iter_->Seek(*start_);
+    } else {
+      iter_->SeekToFirst();
+    }
+
+    UpdateAndEnforceUpperBound();
+  }
+
+  void SeekToLast() override {
+    if (end_) {
+      iter_->SeekForPrev(*end_);
+
+      // Upper bound is exclusive, so we need a key which is strictly smaller
+      if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+        iter_->Prev();
+      }
+    } else {
+      iter_->SeekToLast();
+    }
+
+    UpdateAndEnforceLowerBound();
+  }
+
+  void Seek(const Slice& target) override {
+    if (start_ && cmp_->Compare(target, *start_) < 0) {
+      iter_->Seek(*start_);
+      UpdateAndEnforceUpperBound();
+      return;
+    }
+
+    if (end_ && cmp_->Compare(target, *end_) >= 0) {
+      valid_ = false;
+      return;
+    }
+
+    iter_->Seek(target);
+    UpdateAndEnforceUpperBound();
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    if (start_ && cmp_->Compare(target, *start_) < 0) {
+      valid_ = false;
+      return;
+    }
+
+    if (end_ && cmp_->Compare(target, *end_) >= 0) {
+      iter_->SeekForPrev(*end_);
+
+      // Upper bound is exclusive, so we need a key which is strictly smaller
+      if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+        iter_->Prev();
+      }
+
+      UpdateAndEnforceLowerBound();
+      return;
+    }
+
+    iter_->SeekForPrev(target);
+    UpdateAndEnforceLowerBound();
+  }
+
+  void Next() override {
+    assert(valid_);
+    iter_->Next();
+    UpdateAndEnforceUpperBound();
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    assert(valid_);
+    assert(result);
+
+    IterateResult res;
+    valid_ = iter_->NextAndGetResult(&res);
+
+    if (!valid_) {
+      return false;
+    }
+
+    if (end_) {
+      EnforceUpperBoundImpl(res.bound_check_result);
+
+      if (!valid_) {
+        return false;
+      }
+    }
+
+    res.bound_check_result = IterBoundCheck::kInbound;
+    *result = res;
+
+    return true;
+  }
+
+  void Prev() override {
+    assert(valid_);
+    iter_->Prev();
+    UpdateAndEnforceLowerBound();
+  }
+
+  Slice key() const override {
+    assert(valid_);
+    return iter_->key();
+  }
+
+  Slice user_key() const override {
+    assert(valid_);
+    return iter_->user_key();
+  }
+
+  Slice value() const override {
+    assert(valid_);
+    return iter_->value();
+  }
+
+  Status status() const override { return iter_->status(); }
+
+  bool PrepareValue() override {
+    assert(valid_);
+
+    if (iter_->PrepareValue()) {
+      return true;
+    }
+
+    assert(!iter_->Valid());
+    valid_ = false;
+    return false;
+  }
+
+  bool MayBeOutOfLowerBound() override {
+    assert(valid_);
+    return false;
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    assert(valid_);
+    return IterBoundCheck::kInbound;
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    iter_->SetPinnedItersMgr(pinned_iters_mgr);
+  }
+
+  bool IsKeyPinned() const override {
+    assert(valid_);
+    return iter_->IsKeyPinned();
+  }
+
+  bool IsValuePinned() const override {
+    assert(valid_);
+    return iter_->IsValuePinned();
+  }
+
+  Status GetProperty(std::string prop_name, std::string* prop) override {
+    return iter_->GetProperty(prop_name, prop);
+  }
+
+ private:
+  void UpdateValid() {
+    assert(!iter_->Valid() || iter_->status().ok());
+
+    valid_ = iter_->Valid();
+  }
+
+  void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) {
+    if (bound_check_result == IterBoundCheck::kInbound) {
+      return;
+    }
+
+    if (bound_check_result == IterBoundCheck::kOutOfBound) {
+      valid_ = false;
+      return;
+    }
+
+    assert(bound_check_result == IterBoundCheck::kUnknown);
+
+    if (cmp_->Compare(key(), *end_) >= 0) {
+      valid_ = false;
+    }
+  }
+
+  void EnforceUpperBound() {
+    if (!valid_) {
+      return;
+    }
+
+    if (!end_) {
+      return;
+    }
+
+    EnforceUpperBoundImpl(iter_->UpperBoundCheckResult());
+  }
+
+  void EnforceLowerBound() {
+    if (!valid_) {
+      return;
+    }
+
+    if (!start_) {
+      return;
+    }
+
+    if (!iter_->MayBeOutOfLowerBound()) {
+      return;
+    }
+
+    if (cmp_->Compare(key(), *start_) < 0) {
+      valid_ = false;
+    }
+  }
+
+  void AssertBounds() {
+    assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0);
+    assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0);
+  }
+
+  void UpdateAndEnforceBounds() {
+    UpdateValid();
+    EnforceUpperBound();
+    EnforceLowerBound();
+    AssertBounds();
+  }
+
+  void UpdateAndEnforceUpperBound() {
+    UpdateValid();
+    EnforceUpperBound();
+    AssertBounds();
+  }
+
+  void UpdateAndEnforceLowerBound() {
+    UpdateValid();
+    EnforceLowerBound();
+    AssertBounds();
+  }
+
+  InternalIterator* iter_;
+  const Slice* start_;
+  const Slice* end_;
+  const CompareInterface* cmp_;
+  bool valid_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/clipping_iterator_test.cc b/src/rocksdb/db/compaction/clipping_iterator_test.cc
new file mode 100644
index 000000000..b2b167048
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator_test.cc
@@ -0,0 +1,259 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/clipping_iterator.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A vector iterator which does its own bounds checking. This is for testing the
+// optimizations in the clipping iterator where we bypass the bounds checking if
+// the input iterator has already performed it.
+class BoundsCheckingVectorIterator : public VectorIterator {
+ public:
+  BoundsCheckingVectorIterator(const std::vector<std::string>& keys,
+                               const std::vector<std::string>& values,
+                               const Slice* start, const Slice* end,
+                               const Comparator* cmp)
+      : VectorIterator(keys, values, cmp), start_(start), end_(end), cmp_(cmp) {
+    assert(cmp_);
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    assert(Valid());
+    assert(result);
+
+    Next();
+
+    if (!Valid()) {
+      return false;
+    }
+
+    result->key = key();
+    result->bound_check_result = UpperBoundCheckResult();
+    result->value_prepared = true;
+
+    return true;
+  }
+
+  bool MayBeOutOfLowerBound() override {
+    assert(Valid());
+
+    if (!start_) {
+      return false;
+    }
+
+    return cmp_->Compare(key(), *start_) < 0;
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    assert(Valid());
+
+    if (!end_) {
+      return IterBoundCheck::kInbound;
+    }
+
+    return cmp_->Compare(key(), *end_) >= 0 ? IterBoundCheck::kOutOfBound
+                                            : IterBoundCheck::kInbound;
+  }
+
+ private:
+  const Slice* start_;
+  const Slice* end_;
+  const Comparator* cmp_;
+};
+
+class ClippingIteratorTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<std::tuple<bool, size_t, size_t>> {};
+
+TEST_P(ClippingIteratorTest, Clip) {
+  const std::vector<std::string> keys{"key0", "key1", "key2", "key3", "key4",
+                                      "key5", "key6", "key7", "key8", "key9"};
+  const std::vector<std::string> values{
+      "unused0", "value1",  "value2",  "value3",  "unused4",
+      "unused5", "unused6", "unused7", "unused8", "unused9"};
+
+  assert(keys.size() == values.size());
+
+  // Note: the input always contains key1, key2, and key3; however, the clipping
+  // window is based on the test parameters: its left edge is a value in the
+  // range [0, 4], and its size is a value in the range [0, 5]
+  const std::vector<std::string> input_keys{keys[1], keys[2], keys[3]};
+  const std::vector<std::string> input_values{values[1], values[2], values[3]};
+
+  const bool use_bounds_checking_vec_it = std::get<0>(GetParam());
+
+  const size_t clip_start_idx = std::get<1>(GetParam());
+  const size_t clip_window_size = std::get<2>(GetParam());
+  const size_t clip_end_idx = clip_start_idx + clip_window_size;
+
+  const Slice start(keys[clip_start_idx]);
+  const Slice end(keys[clip_end_idx]);
+
+  std::unique_ptr<InternalIterator> input(
+      use_bounds_checking_vec_it
+          ? new BoundsCheckingVectorIterator(input_keys, input_values, &start,
+                                             &end, BytewiseComparator())
+          : new VectorIterator(input_keys, input_values, BytewiseComparator()));
+
+  ClippingIterator clip(input.get(), &start, &end, BytewiseComparator());
+
+  // The range the clipping iterator should return values from. This is
+  // essentially the intersection of the input range [1, 4) and the clipping
+  // window [clip_start_idx, clip_end_idx)
+  const size_t data_start_idx =
+      std::max(clip_start_idx, static_cast<size_t>(1));
+  const size_t data_end_idx = std::min(clip_end_idx, static_cast<size_t>(4));
+
+  // Range is empty; all Seeks should fail
+  if (data_start_idx >= data_end_idx) {
+    clip.SeekToFirst();
+    ASSERT_FALSE(clip.Valid());
+
+    clip.SeekToLast();
+    ASSERT_FALSE(clip.Valid());
+
+    for (size_t i = 0; i < keys.size(); ++i) {
+      clip.Seek(keys[i]);
+      ASSERT_FALSE(clip.Valid());
+
+      clip.SeekForPrev(keys[i]);
+      ASSERT_FALSE(clip.Valid());
+    }
+
+    return;
+  }
+
+  // Range is non-empty; call SeekToFirst and iterate forward
+  clip.SeekToFirst();
+  ASSERT_TRUE(clip.Valid());
+  ASSERT_EQ(clip.key(), keys[data_start_idx]);
+  ASSERT_EQ(clip.value(), values[data_start_idx]);
+  ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+  ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+  for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+    clip.Next();
+    ASSERT_TRUE(clip.Valid());
+    ASSERT_EQ(clip.key(), keys[i]);
+    ASSERT_EQ(clip.value(), values[i]);
+    ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+    ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+  }
+
+  clip.Next();
+  ASSERT_FALSE(clip.Valid());
+
+  // Do it again using NextAndGetResult
+  clip.SeekToFirst();
+  ASSERT_TRUE(clip.Valid());
+  ASSERT_EQ(clip.key(), keys[data_start_idx]);
+  ASSERT_EQ(clip.value(), values[data_start_idx]);
+  ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+  ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+  for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+    IterateResult result;
+    ASSERT_TRUE(clip.NextAndGetResult(&result));
+    ASSERT_EQ(result.key, keys[i]);
+    ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound);
+    ASSERT_TRUE(clip.Valid());
+    ASSERT_EQ(clip.key(), keys[i]);
+    ASSERT_EQ(clip.value(), values[i]);
+    ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+    ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+  }
+
+  IterateResult result;
+  ASSERT_FALSE(clip.NextAndGetResult(&result));
+  ASSERT_FALSE(clip.Valid());
+
+  // Call SeekToLast and iterate backward
+  clip.SeekToLast();
+  ASSERT_TRUE(clip.Valid());
+  ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+  ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+  ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+  ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+  for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) {
+    clip.Prev();
+    ASSERT_TRUE(clip.Valid());
+    ASSERT_EQ(clip.key(), keys[i]);
+    ASSERT_EQ(clip.value(), values[i]);
+    ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+    ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+  }
+
+  clip.Prev();
+  ASSERT_FALSE(clip.Valid());
+
+  // Call Seek/SeekForPrev for all keys; Seek should return the smallest key
+  // which is >= the target; SeekForPrev should return the largest key which is
+  // <= the target
+  for (size_t i = 0; i < keys.size(); ++i) {
+    clip.Seek(keys[i]);
+
+    if (i < data_start_idx) {
+      ASSERT_TRUE(clip.Valid());
+      ASSERT_EQ(clip.key(), keys[data_start_idx]);
+      ASSERT_EQ(clip.value(), values[data_start_idx]);
+      ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+      ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+    } else if (i < data_end_idx) {
+      ASSERT_TRUE(clip.Valid());
+      ASSERT_EQ(clip.key(), keys[i]);
+      ASSERT_EQ(clip.value(), values[i]);
+      ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+      ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+    } else {
+      ASSERT_FALSE(clip.Valid());
+    }
+
+    clip.SeekForPrev(keys[i]);
+
+    if (i < data_start_idx) {
+      ASSERT_FALSE(clip.Valid());
+    } else if (i < data_end_idx) {
+      ASSERT_TRUE(clip.Valid());
+      ASSERT_EQ(clip.key(), keys[i]);
+      ASSERT_EQ(clip.value(), values[i]);
+      ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+      ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+    } else {
+      ASSERT_TRUE(clip.Valid());
+      ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+      ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+      ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+      ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    ClippingIteratorTest, ClippingIteratorTest,
+    ::testing::Combine(
+        ::testing::Bool(),
+        ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(5)),
+        ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(6))));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc
new file mode 100644
index 000000000..a32b529f7
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.cc
@@ -0,0 +1,855 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/sst_partitioner.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b) {
+  auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
+  if (c != 0) {
+    return c;
+  }
+  auto a_footer = ExtractInternalKeyFooter(a.Encode());
+  auto b_footer = ExtractInternalKeyFooter(b.Encode());
+  if (a_footer == kRangeTombstoneSentinel) {
+    if (b_footer != kRangeTombstoneSentinel) {
+      return -1;
+    }
+  } else if (b_footer == kRangeTombstoneSentinel) {
+    return 1;
+  }
+  return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b) {
+  if (a == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b) {
+  if (b == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, a, *b);
+}
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  uint64_t sum = 0;
+  for (size_t i = 0; i < files.size() && files[i]; i++) {
+    sum += files[i]->fd.GetFileSize();
+  }
+  return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+  input_version_ = _input_version;
+  cfd_ = input_version_->cfd();
+
+  cfd_->Ref();
+  input_version_->Ref();
+  edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+    Slice* largest_user_key, int exclude_level) {
+  bool initialized = false;
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    if (inputs[i].files.empty() || inputs[i].level == exclude_level) {
+      continue;
+    }
+    if (inputs[i].level == 0) {
+      // we need to consider all files on level 0
+      for (const auto* f : inputs[i].files) {
+        const Slice& start_user_key = f->smallest.user_key();
+        if (!initialized ||
+            ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+          *smallest_user_key = start_user_key;
+        }
+        const Slice& end_user_key = f->largest.user_key();
+        if (!initialized ||
+            ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+          *largest_user_key = end_user_key;
+        }
+        initialized = true;
+      }
+    } else {
+      // we only need to consider the first and last file
+      const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+      if (!initialized ||
+          ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+        *smallest_user_key = start_user_key;
+      }
+      const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+      if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+        *largest_user_key = end_user_key;
+      }
+      initialized = true;
+    }
+  }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+    VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i].level == 0 || inputs[i].files.empty()) {
+      continue;
+    }
+    inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
+    AtomicCompactionUnitBoundary cur_boundary;
+    size_t first_atomic_idx = 0;
+    auto add_unit_boundary = [&](size_t to) {
+      if (first_atomic_idx == to) return;
+      for (size_t k = first_atomic_idx; k < to; k++) {
+        inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+      }
+      first_atomic_idx = to;
+    };
+    for (size_t j = 0; j < inputs[i].files.size(); j++) {
+      const auto* f = inputs[i].files[j];
+      if (j == 0) {
+        // First file in a level.
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+                 0) {
+        // SSTs overlap but the end key of the previous file was not
+        // artificially extended by a range tombstone. Extend the current
+        // boundary.
+        cur_boundary.largest = &f->largest;
+      } else {
+        // Atomic compaction unit has ended.
+        add_unit_boundary(j);
+        cur_boundary.smallest = &f->smallest;
+        cur_boundary.largest = &f->largest;
+      }
+    }
+    add_unit_boundary(inputs[i].files.size());
+    assert(inputs[i].files.size() ==
+           inputs[i].atomic_compaction_unit_boundaries.size());
+  }
+  return inputs;
+}
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  int output_l0_idx;
+  if (output_level == 0) {
+    output_l0_idx = 0;
+    for (const auto* file : vstorage->LevelFiles(0)) {
+      if (inputs[0].files.back() == file) {
+        break;
+      }
+      ++output_l0_idx;
+    }
+    assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
+  } else {
+    output_l0_idx = -1;
+  }
+  Slice smallest_key, largest_key;
+  GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+  return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+                                                  output_level, output_l0_idx);
+}
+
+// test function to validate the functionality of IsBottommostLevel()
+// function -- determines if compaction with inputs and storage is bottommost
+bool Compaction::TEST_IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  size_t num_files_in_compaction = 0;
+  size_t total_num_files = 0;
+  for (int l = 0; l < vstorage->num_levels(); l++) {
+    total_num_files += vstorage->NumLevelFiles(l);
+  }
+  for (size_t i = 0; i < inputs.size(); i++) {
+    num_files_in_compaction += inputs[i].size();
+  }
+  return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(
+    VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
+    const MutableCFOptions& _mutable_cf_options,
+    const MutableDBOptions& _mutable_db_options,
+    std::vector<CompactionInputFiles> _inputs, int _output_level,
+    uint64_t _target_file_size, uint64_t _max_compaction_bytes,
+    uint32_t _output_path_id, CompressionType _compression,
+    CompressionOptions _compression_opts, Temperature _output_temperature,
+    uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
+    bool _manual_compaction, const std::string& _trim_ts, double _score,
+    bool _deletion_compaction, bool l0_files_might_overlap,
+    CompactionReason _compaction_reason,
+    BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
+    double _blob_garbage_collection_age_cutoff)
+    : input_vstorage_(vstorage),
+      start_level_(_inputs[0].level),
+      output_level_(_output_level),
+      target_output_file_size_(_target_file_size),
+      max_compaction_bytes_(_max_compaction_bytes),
+      max_subcompactions_(_max_subcompactions),
+      immutable_options_(_immutable_options),
+      mutable_cf_options_(_mutable_cf_options),
+      input_version_(nullptr),
+      number_levels_(vstorage->num_levels()),
+      cfd_(nullptr),
+      output_path_id_(_output_path_id),
+      output_compression_(_compression),
+      output_compression_opts_(_compression_opts),
+      output_temperature_(_output_temperature),
+      deletion_compaction_(_deletion_compaction),
+      l0_files_might_overlap_(l0_files_might_overlap),
+      inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
+      grandparents_(std::move(_grandparents)),
+      score_(_score),
+      bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
+      is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+      is_manual_compaction_(_manual_compaction),
+      trim_ts_(_trim_ts),
+      is_trivial_move_(false),
+
+      compaction_reason_(_compaction_reason),
+      notify_on_compaction_completion_(false),
+      enable_blob_garbage_collection_(
+          _blob_garbage_collection_policy == BlobGarbageCollectionPolicy::kForce
+              ? true
+              : (_blob_garbage_collection_policy ==
+                         BlobGarbageCollectionPolicy::kDisable
+                     ? false
+                     : mutable_cf_options()->enable_blob_garbage_collection)),
+      blob_garbage_collection_age_cutoff_(
+          _blob_garbage_collection_age_cutoff < 0 ||
+                  _blob_garbage_collection_age_cutoff > 1
+              ? mutable_cf_options()->blob_garbage_collection_age_cutoff
+              : _blob_garbage_collection_age_cutoff),
+      penultimate_level_(EvaluatePenultimateLevel(
+          vstorage, immutable_options_, start_level_, output_level_)) {
+  MarkFilesBeingCompacted(true);
+  if (is_manual_compaction_) {
+    compaction_reason_ = CompactionReason::kManualCompaction;
+  }
+  if (max_subcompactions_ == 0) {
+    max_subcompactions_ = _mutable_db_options.max_subcompactions;
+  }
+
+  // for the non-bottommost levels, it tries to build files match the target
+  // file size, but not guaranteed. It could be 2x the size of the target size.
+  max_output_file_size_ =
+      bottommost_level_ || grandparents_.empty() ||
+              !_immutable_options.level_compaction_dynamic_file_size
+          ? target_output_file_size_
+          : 2 * target_output_file_size_;
+
+#ifndef NDEBUG
+  for (size_t i = 1; i < inputs_.size(); ++i) {
+    assert(inputs_[i].level > inputs_[i - 1].level);
+  }
+#endif
+
+  // setup input_levels_
+  {
+    input_levels_.resize(num_input_levels());
+    for (size_t which = 0; which < num_input_levels(); which++) {
+      DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+                                &arena_);
+    }
+  }
+
+  GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
+
+  // Every compaction regardless of any compaction reason may respect the
+  // existing compact cursor in the output level to split output files
+  output_split_key_ = nullptr;
+  if (immutable_options_.compaction_style == kCompactionStyleLevel &&
+      immutable_options_.compaction_pri == kRoundRobin) {
+    const InternalKey* cursor =
+        &input_vstorage_->GetCompactCursors()[output_level_];
+    if (cursor->size() != 0) {
+      const Slice& cursor_user_key = ExtractUserKey(cursor->Encode());
+      auto ucmp = vstorage->InternalComparator()->user_comparator();
+      // May split output files according to the cursor if it in the user-key
+      // range
+      if (ucmp->CompareWithoutTimestamp(cursor_user_key, smallest_user_key_) >
+              0 &&
+          ucmp->CompareWithoutTimestamp(cursor_user_key, largest_user_key_) <=
+              0) {
+        output_split_key_ = cursor;
+      }
+    }
+  }
+
+  PopulatePenultimateLevelOutputRange();
+}
+
+void Compaction::PopulatePenultimateLevelOutputRange() {
+  if (!SupportsPerKeyPlacement()) {
+    return;
+  }
+
+  // exclude the last level, the range of all input levels is the safe range
+  // of keys that can be moved up.
+  int exclude_level = number_levels_ - 1;
+  penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange;
+
+  // For universal compaction, the penultimate_output_range could be extended if
+  // all penultimate level files are included in the compaction (which includes
+  // the case that the penultimate level is empty).
+  if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
+    exclude_level = kInvalidLevel;
+    std::set<uint64_t> penultimate_inputs;
+    for (const auto& input_lvl : inputs_) {
+      if (input_lvl.level == penultimate_level_) {
+        for (const auto& file : input_lvl.files) {
+          penultimate_inputs.emplace(file->fd.GetNumber());
+        }
+      }
+    }
+    auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+    for (const auto& file : penultimate_files) {
+      if (penultimate_inputs.find(file->fd.GetNumber()) ==
+          penultimate_inputs.end()) {
+        exclude_level = number_levels_ - 1;
+        penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
+        break;
+      }
+    }
+  }
+
+  GetBoundaryKeys(input_vstorage_, inputs_,
+                  &penultimate_level_smallest_user_key_,
+                  &penultimate_level_largest_user_key_, exclude_level);
+
+  // If there's a case that the penultimate level output range is overlapping
+  // with the existing files, disable the penultimate level output by setting
+  // the range to empty. One example is the range delete could have overlap
+  // boundary with the next file. (which is actually a false overlap)
+  // TODO: Exclude such false overlap, so it won't disable the penultimate
+  //  output.
+  std::set<uint64_t> penultimate_inputs;
+  for (const auto& input_lvl : inputs_) {
+    if (input_lvl.level == penultimate_level_) {
+      for (const auto& file : input_lvl.files) {
+        penultimate_inputs.emplace(file->fd.GetNumber());
+      }
+    }
+  }
+
+  auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+  for (const auto& file : penultimate_files) {
+    if (penultimate_inputs.find(file->fd.GetNumber()) ==
+            penultimate_inputs.end() &&
+        OverlapPenultimateLevelOutputRange(file->smallest.user_key(),
+                                           file->largest.user_key())) {
+      // basically disable the penultimate range output. which should be rare
+      // or a false overlap caused by range del
+      penultimate_level_smallest_user_key_ = "";
+      penultimate_level_largest_user_key_ = "";
+      penultimate_output_range_type_ = PenultimateOutputRangeType::kDisabled;
+    }
+  }
+}
+
+Compaction::~Compaction() {
+  if (input_version_ != nullptr) {
+    input_version_->Unref();
+  }
+  if (cfd_ != nullptr) {
+    cfd_->UnrefAndTryDelete();
+  }
+}
+
+bool Compaction::SupportsPerKeyPlacement() const {
+  return penultimate_level_ != kInvalidLevel;
+}
+
+int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
+
+// smallest_key and largest_key include timestamps if user-defined timestamp is
+// enabled.
+bool Compaction::OverlapPenultimateLevelOutputRange(
+    const Slice& smallest_key, const Slice& largest_key) const {
+  if (!SupportsPerKeyPlacement()) {
+    return false;
+  }
+  const Comparator* ucmp =
+      input_vstorage_->InternalComparator()->user_comparator();
+
+  return ucmp->CompareWithoutTimestamp(
+             smallest_key, penultimate_level_largest_user_key_) <= 0 &&
+         ucmp->CompareWithoutTimestamp(
+             largest_key, penultimate_level_smallest_user_key_) >= 0;
+}
+
+// key includes timestamp if user-defined timestamp is enabled.
+bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
+  if (!SupportsPerKeyPlacement()) {
+    return false;
+  }
+
+  if (penultimate_level_smallest_user_key_.empty() ||
+      penultimate_level_largest_user_key_.empty()) {
+    return false;
+  }
+
+  const Comparator* ucmp =
+      input_vstorage_->InternalComparator()->user_comparator();
+
+  return ucmp->CompareWithoutTimestamp(
+             key, penultimate_level_smallest_user_key_) >= 0 &&
+         ucmp->CompareWithoutTimestamp(
+             key, penultimate_level_largest_user_key_) <= 0;
+}
+
+bool Compaction::InputCompressionMatchesOutput() const {
+  int base_level = input_vstorage_->base_level();
+  bool matches =
+      (GetCompressionType(input_vstorage_, mutable_cf_options_, start_level_,
+                          base_level) == output_compression_);
+  if (matches) {
+    TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+    return true;
+  }
+  TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+  return matches;
+}
+
+bool Compaction::IsTrivialMove() const {
+  // Avoid a move if there is lots of overlapping grandparent data.
+  // Otherwise, the move could create a parent file that will require
+  // a very expensive merge later on.
+  // If start_level_== output_level_, the purpose is to force compaction
+  // filter to be applied to that level, and thus cannot be a trivial move.
+
+  // Check if start level have files with overlapping ranges
+  if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false &&
+      l0_files_might_overlap_) {
+    // We cannot move files from L0 to L1 if the L0 files in the LSM-tree are
+    // overlapping, unless we are sure that files picked in L0 don't overlap.
+    return false;
+  }
+
+  if (is_manual_compaction_ &&
+      (immutable_options_.compaction_filter != nullptr ||
+       immutable_options_.compaction_filter_factory != nullptr)) {
+    // This is a manual compaction and we have a compaction filter that should
+    // be executed, we cannot do a trivial move
+    return false;
+  }
+
+  if (start_level_ == output_level_) {
+    // It doesn't make sense if compaction picker picks files just to trivial
+    // move to the same level.
+    return false;
+  }
+
+  // Used in universal compaction, where trivial move can be done if the
+  // input files are non overlapping
+  if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
+      (output_level_ != 0) &&
+      (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal)) {
+    return is_trivial_move_;
+  }
+
+  if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
+        input(0, 0)->fd.GetPathId() == output_path_id() &&
+        InputCompressionMatchesOutput())) {
+    return false;
+  }
+
+  // assert inputs_.size() == 1
+
+  std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
+
+  for (const auto& file : inputs_.front().files) {
+    std::vector<FileMetaData*> file_grand_parents;
+    if (output_level_ + 1 >= number_levels_) {
+      continue;
+    }
+    input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+                                          &file->largest, &file_grand_parents);
+    const auto compaction_size =
+        file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+    if (compaction_size > max_compaction_bytes_) {
+      return false;
+    }
+
+    if (partitioner.get() != nullptr) {
+      if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
+                                         file->largest.user_key())) {
+        return false;
+      }
+    }
+  }
+
+  // PerKeyPlacement compaction should never be trivial move.
+  if (SupportsPerKeyPlacement()) {
+    return false;
+  }
+
+  return true;
+}
+
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+  for (size_t which = 0; which < num_input_levels(); which++) {
+    for (size_t i = 0; i < inputs_[which].size(); i++) {
+      out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+    }
+  }
+}
+
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+    const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+  assert(input_version_ != nullptr);
+  assert(level_ptrs != nullptr);
+  assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+  if (bottommost_level_) {
+    return true;
+  } else if (output_level_ != 0 &&
+             cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    // Maybe use binary search to find right entry instead of linear search?
+    const Comparator* user_cmp = cfd_->user_comparator();
+    for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+      const std::vector<FileMetaData*>& files =
+          input_vstorage_->LevelFiles(lvl);
+      for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+        auto* f = files[level_ptrs->at(lvl)];
+        if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+          // We've advanced far enough
+          // In the presence of user-defined timestamp, we may need to handle
+          // the case in which f->smallest.user_key() (including ts) has the
+          // same user key, but the ts part is smaller. If so,
+          // Compare(user_key, f->smallest.user_key()) returns -1.
+          // That's why we need CompareWithoutTimestamp().
+          if (user_cmp->CompareWithoutTimestamp(user_key,
+                                                f->smallest.user_key()) >= 0) {
+            // Key falls in this file's range, so it may
+            // exist beyond output level
+            return false;
+          }
+          break;
+        }
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+  for (size_t i = 0; i < num_input_levels(); i++) {
+    for (size_t j = 0; j < inputs_[i].size(); j++) {
+      assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
+                               : inputs_[i][j]->being_compacted);
+      inputs_[i][j]->being_compacted = mark_as_compacted;
+    }
+  }
+}
+
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3@0 + 2@3 + 1@4 files to L5"
+const char* Compaction::InputLevelSummary(
+    InputLevelSummaryBuffer* scratch) const {
+  int len = 0;
+  bool is_first = true;
+  for (auto& input_level : inputs_) {
+    if (input_level.empty()) {
+      continue;
+    }
+    if (!is_first) {
+      len +=
+          snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+      len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+    } else {
+      is_first = false;
+    }
+    len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+                    "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+                    input_level.level);
+    len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+           " files to L%d", output_level());
+
+  return scratch->buffer;
+}
+
+uint64_t Compaction::CalculateTotalInputSize() const {
+  uint64_t size = 0;
+  for (auto& input_level : inputs_) {
+    for (auto f : input_level.files) {
+      size += f->fd.GetFileSize();
+    }
+  }
+  return size;
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+  MarkFilesBeingCompacted(false);
+  cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+  assert(input_version_ != nullptr);
+  input_vstorage_->ResetNextCompactionIndex(start_level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+                 int len) {
+  *output = '\0';
+  int write = 0;
+  for (size_t i = 0; i < files.size(); i++) {
+    int sz = len - write;
+    int ret;
+    char sztxt[16];
+    AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+    ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+                   files.at(i)->fd.GetNumber(), sztxt);
+    if (ret < 0 || ret >= sz) break;
+    write += ret;
+  }
+  // if files.size() is non-zero, overwrite the last space
+  return write - !!files.size();
+}
+}  // namespace
+
+void Compaction::Summary(char* output, int len) {
+  int write =
+      snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
+               input_version_->GetVersionNumber(), start_level_);
+  if (write < 0 || write >= len) {
+    return;
+  }
+
+  for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+    if (level_iter > 0) {
+      write += snprintf(output + write, len - write, "], [");
+      if (write < 0 || write >= len) {
+        return;
+      }
+    }
+    write +=
+        InputSummary(inputs_[level_iter].files, output + write, len - write);
+    if (write < 0 || write >= len) {
+      return;
+    }
+  }
+
+  snprintf(output + write, len - write, "]");
+}
+
+uint64_t Compaction::OutputFilePreallocationSize() const {
+  uint64_t preallocation_size = 0;
+
+  for (const auto& level_files : inputs_) {
+    for (const auto& file : level_files.files) {
+      preallocation_size += file->fd.GetFileSize();
+    }
+  }
+
+  if (max_output_file_size_ != std::numeric_limits<uint64_t>::max() &&
+      (immutable_options_.compaction_style == kCompactionStyleLevel ||
+       output_level() > 0)) {
+    preallocation_size = std::min(max_output_file_size_, preallocation_size);
+  }
+
+  // Over-estimate slightly so we don't end up just barely crossing
+  // the threshold
+  // No point to preallocate more than 1GB.
+  return std::min(uint64_t{1073741824},
+                  preallocation_size + (preallocation_size / 10));
+}
+
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+  if (!cfd_->ioptions()->compaction_filter_factory) {
+    return nullptr;
+  }
+
+  if (!cfd_->ioptions()
+           ->compaction_filter_factory->ShouldFilterTableFileCreation(
+               TableFileCreationReason::kCompaction)) {
+    return nullptr;
+  }
+
+  CompactionFilter::Context context;
+  context.is_full_compaction = is_full_compaction_;
+  context.is_manual_compaction = is_manual_compaction_;
+  context.column_family_id = cfd_->GetID();
+  context.reason = TableFileCreationReason::kCompaction;
+  return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+      context);
+}
+
+std::unique_ptr<SstPartitioner> Compaction::CreateSstPartitioner() const {
+  if (!immutable_options_.sst_partitioner_factory) {
+    return nullptr;
+  }
+
+  SstPartitioner::Context context;
+  context.is_full_compaction = is_full_compaction_;
+  context.is_manual_compaction = is_manual_compaction_;
+  context.output_level = output_level_;
+  context.smallest_user_key = smallest_user_key_;
+  context.largest_user_key = largest_user_key_;
+  return immutable_options_.sst_partitioner_factory->CreatePartitioner(context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+  return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+  if (cfd_ == nullptr) {
+    return false;
+  }
+
+  // Round-Robin pri under leveled compaction allows subcompactions by default
+  // and the number of subcompactions can be larger than max_subcompactions_
+  if (cfd_->ioptions()->compaction_pri == kRoundRobin &&
+      cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    return output_level_ > 0;
+  }
+
+  if (max_subcompactions_ <= 1) {
+    return false;
+  }
+
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0;
+  } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+    return number_levels_ > 1 && output_level_ > 0;
+  } else {
+    return false;
+  }
+}
+
+bool Compaction::DoesInputReferenceBlobFiles() const {
+  assert(input_version_);
+
+  const VersionStorageInfo* storage_info = input_version_->storage_info();
+  assert(storage_info);
+
+  if (storage_info->GetBlobFiles().empty()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    for (const FileMetaData* meta : inputs_[i].files) {
+      assert(meta);
+
+      if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+uint64_t Compaction::MinInputFileOldestAncesterTime(
+    const InternalKey* start, const InternalKey* end) const {
+  uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max();
+  const InternalKeyComparator& icmp =
+      column_family_data()->internal_comparator();
+  for (const auto& level_files : inputs_) {
+    for (const auto& file : level_files.files) {
+      if (start != nullptr && icmp.Compare(file->largest, *start) < 0) {
+        continue;
+      }
+      if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) {
+        continue;
+      }
+      uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+      if (oldest_ancester_time != 0) {
+        min_oldest_ancester_time =
+            std::min(min_oldest_ancester_time, oldest_ancester_time);
+      }
+    }
+  }
+  return min_oldest_ancester_time;
+}
+
+int Compaction::EvaluatePenultimateLevel(
+    const VersionStorageInfo* vstorage,
+    const ImmutableOptions& immutable_options, const int start_level,
+    const int output_level) {
+  // TODO: currently per_key_placement feature only support level and universal
+  //  compaction
+  if (immutable_options.compaction_style != kCompactionStyleLevel &&
+      immutable_options.compaction_style != kCompactionStyleUniversal) {
+    return kInvalidLevel;
+  }
+  if (output_level != immutable_options.num_levels - 1) {
+    return kInvalidLevel;
+  }
+
+  int penultimate_level = output_level - 1;
+  assert(penultimate_level < immutable_options.num_levels);
+  if (penultimate_level <= 0) {
+    return kInvalidLevel;
+  }
+
+  // If the penultimate level is not within input level -> output level range
+  // check if the penultimate output level is empty, if it's empty, it could
+  // also be locked for the penultimate output.
+  // TODO: ideally, it only needs to check if there's a file within the
+  //  compaction output key range. For simplicity, it just check if there's any
+  //  file on the penultimate level.
+  if (start_level == immutable_options.num_levels - 1 &&
+      (immutable_options.compaction_style != kCompactionStyleUniversal ||
+       !vstorage->LevelFiles(penultimate_level).empty())) {
+    return kInvalidLevel;
+  }
+
+  bool supports_per_key_placement =
+      immutable_options.preclude_last_level_data_seconds > 0;
+
+  // it could be overridden by unittest
+  TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
+                           &supports_per_key_placement);
+  if (!supports_per_key_placement) {
+    return kInvalidLevel;
+  }
+
+  return penultimate_level;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h
new file mode 100644
index 000000000..21d1190ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.h
@@ -0,0 +1,559 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/sst_partitioner.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The file contains class Compaction, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                      const InternalKey* b);
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one ore more neighbouring SSTs on the same
+// level. Every pair of  SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+  const InternalKey* smallest = nullptr;
+  const InternalKey* largest = nullptr;
+};
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+  int level;
+  std::vector<FileMetaData*> files;
+  std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+  inline bool empty() const { return files.empty(); }
+  inline size_t size() const { return files.size(); }
+  inline void clear() { files.clear(); }
+  inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+  Compaction(VersionStorageInfo* input_version,
+             const ImmutableOptions& immutable_options,
+             const MutableCFOptions& mutable_cf_options,
+             const MutableDBOptions& mutable_db_options,
+             std::vector<CompactionInputFiles> inputs, int output_level,
+             uint64_t target_file_size, uint64_t max_compaction_bytes,
+             uint32_t output_path_id, CompressionType compression,
+             CompressionOptions compression_opts,
+             Temperature output_temperature, uint32_t max_subcompactions,
+             std::vector<FileMetaData*> grandparents,
+             bool manual_compaction = false, const std::string& trim_ts = "",
+             double score = -1, bool deletion_compaction = false,
+             bool l0_files_might_overlap = true,
+             CompactionReason compaction_reason = CompactionReason::kUnknown,
+             BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+                 BlobGarbageCollectionPolicy::kUseDefault,
+             double blob_garbage_collection_age_cutoff = -1);
+
+  // The type of the penultimate level output range
+  enum class PenultimateOutputRangeType : int {
+    kNotSupported,  // it cannot output to the penultimate level
+    kFullRange,     // any data could be output to the penultimate level
+    kNonLastRange,  // only the keys within non_last_level compaction inputs can
+                    // be outputted to the penultimate level
+    kDisabled,      // no data can be outputted to the penultimate level
+  };
+
+  // No copying allowed
+  Compaction(const Compaction&) = delete;
+  void operator=(const Compaction&) = delete;
+
+  ~Compaction();
+
+  // Returns the level associated to the specified compaction input level.
+  // If compaction_input_level is not specified, then input_level is set to 0.
+  int level(size_t compaction_input_level = 0) const {
+    return inputs_[compaction_input_level].level;
+  }
+
+  int start_level() const { return start_level_; }
+
+  // Outputs will go to this level
+  int output_level() const { return output_level_; }
+
+  // Returns the number of input levels in this compaction.
+  size_t num_input_levels() const { return inputs_.size(); }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return &edit_; }
+
+  // Returns the number of input files associated to the specified
+  // compaction input level.
+  // The function will return 0 if when "compaction_input_level" < 0
+  // or "compaction_input_level" >= "num_input_levels()".
+  size_t num_input_files(size_t compaction_input_level) const {
+    if (compaction_input_level < inputs_.size()) {
+      return inputs_[compaction_input_level].size();
+    }
+    return 0;
+  }
+
+  // Returns input version of the compaction
+  Version* input_version() const { return input_version_; }
+
+  // Returns the ColumnFamilyData associated with the compaction.
+  ColumnFamilyData* column_family_data() const { return cfd_; }
+
+  // Returns the file meta data of the 'i'th input file at the
+  // specified compaction input level.
+  // REQUIREMENT: "compaction_input_level" must be >= 0 and
+  //              < "input_levels()"
+  FileMetaData* input(size_t compaction_input_level, size_t i) const {
+    assert(compaction_input_level < inputs_.size());
+    return inputs_[compaction_input_level][i];
+  }
+
+  const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+      size_t compaction_input_level) const {
+    assert(compaction_input_level < inputs_.size());
+    return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+  }
+
+  // Returns the list of file meta data of the specified compaction
+  // input level.
+  // REQUIREMENT: "compaction_input_level" must be >= 0 and
+  //              < "input_levels()"
+  const std::vector<FileMetaData*>* inputs(
+      size_t compaction_input_level) const {
+    assert(compaction_input_level < inputs_.size());
+    return &inputs_[compaction_input_level].files;
+  }
+
+  const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+  // Returns the LevelFilesBrief of the specified compaction input level.
+  const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+    return &input_levels_[compaction_input_level];
+  }
+
+  // Maximum size of files to build during this compaction.
+  uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+  // Target output file size for this compaction
+  uint64_t target_output_file_size() const { return target_output_file_size_; }
+
+  // What compression for output
+  CompressionType output_compression() const { return output_compression_; }
+
+  // What compression options for output
+  const CompressionOptions& output_compression_opts() const {
+    return output_compression_opts_;
+  }
+
+  // Whether need to write output file to second DB path.
+  uint32_t output_path_id() const { return output_path_id_; }
+
+  // Is this a trivial compaction that can be implemented by just
+  // moving a single input file to the next level (no merging or splitting)
+  bool IsTrivialMove() const;
+
+  // The split user key in the output level if this compaction is required to
+  // split the output files according to the existing cursor in the output
+  // level under round-robin compaction policy. Empty indicates no required
+  // splitting key
+  const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
+
+  // If true, then the compaction can be done by simply deleting input files.
+  bool deletion_compaction() const { return deletion_compaction_; }
+
+  // Add all inputs to this compaction as delete operations to *edit.
+  void AddInputDeletions(VersionEdit* edit);
+
+  // Returns true if the available information we have guarantees that
+  // the input "user_key" does not exist in any level beyond "output_level()".
+  bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+                                     std::vector<size_t>* level_ptrs) const;
+
+  // Clear all files to indicate that they are not being compacted
+  // Delete this compaction from the list of running compactions.
+  //
+  // Requirement: DB mutex held
+  void ReleaseCompactionFiles(Status status);
+
+  // Returns the summary of the compaction in "output" with maximum "len"
+  // in bytes.  The caller is responsible for the memory management of
+  // "output".
+  void Summary(char* output, int len);
+
+  // Return the score that was used to pick this compaction run.
+  double score() const { return score_; }
+
+  // Is this compaction creating a file in the bottom most level?
+  bool bottommost_level() const { return bottommost_level_; }
+
+  // Is the compaction compact to the last level
+  bool is_last_level() const {
+    return output_level_ == immutable_options_.num_levels - 1;
+  }
+
+  // Does this compaction include all sst files?
+  bool is_full_compaction() const { return is_full_compaction_; }
+
+  // Was this compaction triggered manually by the client?
+  bool is_manual_compaction() const { return is_manual_compaction_; }
+
+  std::string trim_ts() const { return trim_ts_; }
+
+  // Used when allow_trivial_move option is set in
+  // Universal compaction. If all the input files are
+  // non overlapping, then is_trivial_move_ variable
+  // will be set true, else false
+  void set_is_trivial_move(bool trivial_move) {
+    is_trivial_move_ = trivial_move;
+  }
+
+  // Used when allow_trivial_move option is set in
+  // Universal compaction. Returns true, if the input files
+  // are non-overlapping and can be trivially moved.
+  bool is_trivial_move() const { return is_trivial_move_; }
+
+  // How many total levels are there?
+  int number_levels() const { return number_levels_; }
+
+  // Return the ImmutableOptions that should be used throughout the compaction
+  // procedure
+  const ImmutableOptions* immutable_options() const {
+    return &immutable_options_;
+  }
+
+  // Return the MutableCFOptions that should be used throughout the compaction
+  // procedure
+  const MutableCFOptions* mutable_cf_options() const {
+    return &mutable_cf_options_;
+  }
+
+  // Returns the size in bytes that the output file should be preallocated to.
+  // In level compaction, that is max_file_size_. In universal compaction, that
+  // is the sum of all input file sizes.
+  uint64_t OutputFilePreallocationSize() const;
+
+  void SetInputVersion(Version* input_version);
+
+  struct InputLevelSummaryBuffer {
+    char buffer[128];
+  };
+
+  const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+  uint64_t CalculateTotalInputSize() const;
+
+  // In case of compaction error, reset the nextIndex that is used
+  // to pick up the next file to be compacted from files_by_size_
+  void ResetNextCompactionIndex();
+
+  // Create a CompactionFilter from compaction_filter_factory
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+  // Create a SstPartitioner from sst_partitioner_factory
+  std::unique_ptr<SstPartitioner> CreateSstPartitioner() const;
+
+  // Is the input level corresponding to output_level_ empty?
+  bool IsOutputLevelEmpty() const;
+
+  // Should this compaction be broken up into smaller ones run in parallel?
+  bool ShouldFormSubcompactions() const;
+
+  // Returns true iff at least one input file references a blob file.
+  //
+  // PRE: input version has been set.
+  bool DoesInputReferenceBlobFiles() const;
+
+  // test function to validate the functionality of IsBottommostLevel()
+  // function -- determines if compaction with inputs and storage is bottommost
+  static bool TEST_IsBottommostLevel(
+      int output_level, VersionStorageInfo* vstorage,
+      const std::vector<CompactionInputFiles>& inputs);
+
+  TablePropertiesCollection GetOutputTableProperties() const {
+    return output_table_properties_;
+  }
+
+  void SetOutputTableProperties(TablePropertiesCollection tp) {
+    output_table_properties_ = std::move(tp);
+  }
+
+  Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+  Slice GetLargestUserKey() const { return largest_user_key_; }
+
+  Slice GetPenultimateLevelSmallestUserKey() const {
+    return penultimate_level_smallest_user_key_;
+  }
+
+  Slice GetPenultimateLevelLargestUserKey() const {
+    return penultimate_level_largest_user_key_;
+  }
+
+  PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
+    return penultimate_output_range_type_;
+  }
+
+  // Return true if the compaction supports per_key_placement
+  bool SupportsPerKeyPlacement() const;
+
+  // Get per_key_placement penultimate output level, which is `last_level - 1`
+  // if per_key_placement feature is supported. Otherwise, return -1.
+  int GetPenultimateLevel() const;
+
+  // Return true if the given range is overlap with penultimate level output
+  // range.
+  // Both smallest_key and largest_key include timestamps if user-defined
+  // timestamp is enabled.
+  bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
+                                          const Slice& largest_key) const;
+
+  // Return true if the key is within penultimate level output range for
+  // per_key_placement feature, which is safe to place the key to the
+  // penultimate level. different compaction strategy has different rules.
+  // If per_key_placement is not supported, always return false.
+  // TODO: currently it doesn't support moving data from the last level to the
+  //  penultimate level
+  //  key includes timestamp if user-defined timestamp is enabled.
+  bool WithinPenultimateLevelOutputRange(const Slice& key) const;
+
+  CompactionReason compaction_reason() const { return compaction_reason_; }
+
+  const std::vector<FileMetaData*>& grandparents() const {
+    return grandparents_;
+  }
+
+  uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+  Temperature output_temperature() const { return output_temperature_; }
+
+  uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+  bool enable_blob_garbage_collection() const {
+    return enable_blob_garbage_collection_;
+  }
+
+  double blob_garbage_collection_age_cutoff() const {
+    return blob_garbage_collection_age_cutoff_;
+  }
+
+  // start and end are sub compact range. Null if no boundary.
+  // This is used to filter out some input files' ancester's time range.
+  uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
+                                          const InternalKey* end) const;
+
+  // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
+  // compaction begin and compaction completion callbacks match.
+  void SetNotifyOnCompactionCompleted() {
+    notify_on_compaction_completion_ = true;
+  }
+
+  bool ShouldNotifyOnCompactionCompleted() const {
+    return notify_on_compaction_completion_;
+  }
+
+  static constexpr int kInvalidLevel = -1;
+
+  // Evaluate penultimate output level. If the compaction supports
+  // per_key_placement feature, it returns the penultimate level number.
+  // Otherwise, it's set to kInvalidLevel (-1), which means
+  // output_to_penultimate_level is not supported.
+  // Note: even the penultimate level output is supported (PenultimateLevel !=
+  // kInvalidLevel), some key range maybe unsafe to be outputted to the
+  // penultimate level. The safe key range is populated by
+  // `PopulatePenultimateLevelOutputRange()`.
+  // Which could potentially disable all penultimate level output.
+  static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage,
+                                      const ImmutableOptions& immutable_options,
+                                      const int start_level,
+                                      const int output_level);
+
+ private:
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+  // get the smallest and largest key present in files to be compacted
+  static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+                              const std::vector<CompactionInputFiles>& inputs,
+                              Slice* smallest_key, Slice* largest_key,
+                              int exclude_level = -1);
+
+  // populate penultimate level output range, which will be used to determine if
+  // a key is safe to output to the penultimate level (details see
+  // `Compaction::WithinPenultimateLevelOutputRange()`.
+  void PopulatePenultimateLevelOutputRange();
+
+  // Get the atomic file boundaries for all files in the compaction. Necessary
+  // in order to avoid the scenario described in
+  // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
+  // plumb down appropriate key boundaries to RangeDelAggregator during
+  // compaction.
+  static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+      VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
+  // helper function to determine if compaction with inputs and storage is
+  // bottommost
+  static bool IsBottommostLevel(
+      int output_level, VersionStorageInfo* vstorage,
+      const std::vector<CompactionInputFiles>& inputs);
+
+  static bool IsFullCompaction(VersionStorageInfo* vstorage,
+                               const std::vector<CompactionInputFiles>& inputs);
+
+  VersionStorageInfo* input_vstorage_;
+
+  const int start_level_;   // the lowest level to be compacted
+  const int output_level_;  // levels to which output files are stored
+  uint64_t target_output_file_size_;
+  uint64_t max_output_file_size_;
+  uint64_t max_compaction_bytes_;
+  uint32_t max_subcompactions_;
+  const ImmutableOptions immutable_options_;
+  const MutableCFOptions mutable_cf_options_;
+  Version* input_version_;
+  VersionEdit edit_;
+  const int number_levels_;
+  ColumnFamilyData* cfd_;
+  Arena arena_;  // Arena used to allocate space for file_levels_
+
+  const uint32_t output_path_id_;
+  CompressionType output_compression_;
+  CompressionOptions output_compression_opts_;
+  Temperature output_temperature_;
+  // If true, then the compaction can be done by simply deleting input files.
+  const bool deletion_compaction_;
+  // should it split the output file using the compact cursor?
+  const InternalKey* output_split_key_;
+
+  // L0 files in LSM-tree might be overlapping. But the compaction picking
+  // logic might pick a subset of the files that aren't overlapping. if
+  // that is the case, set the value to false. Otherwise, set it true.
+  bool l0_files_might_overlap_;
+
+  // Compaction input files organized by level. Constant after construction
+  const std::vector<CompactionInputFiles> inputs_;
+
+  // A copy of inputs_, organized more closely in memory
+  autovector<LevelFilesBrief, 2> input_levels_;
+
+  // State used to check for number of overlapping grandparent files
+  // (grandparent == "output_level_ + 1")
+  std::vector<FileMetaData*> grandparents_;
+  const double score_;  // score that was used to pick this compaction.
+
+  // Is this compaction creating a file in the bottom most level?
+  const bool bottommost_level_;
+  // Does this compaction include all sst files?
+  const bool is_full_compaction_;
+
+  // Is this compaction requested by the client?
+  const bool is_manual_compaction_;
+
+  // The data with timestamp > trim_ts_ will be removed
+  const std::string trim_ts_;
+
+  // True if we can do trivial move in Universal multi level
+  // compaction
+  bool is_trivial_move_;
+
+  // Does input compression match the output compression?
+  bool InputCompressionMatchesOutput() const;
+
+  // table properties of output files
+  TablePropertiesCollection output_table_properties_;
+
+  // smallest user keys in compaction
+  // includes timestamp if user-defined timestamp is enabled.
+  Slice smallest_user_key_;
+
+  // largest user keys in compaction
+  // includes timestamp if user-defined timestamp is enabled.
+  Slice largest_user_key_;
+
+  // Reason for compaction
+  CompactionReason compaction_reason_;
+
+  // Notify on compaction completion only if listener was notified on compaction
+  // begin.
+  bool notify_on_compaction_completion_;
+
+  // Enable/disable GC collection for blobs during compaction.
+  bool enable_blob_garbage_collection_;
+
+  // Blob garbage collection age cutoff.
+  double blob_garbage_collection_age_cutoff_;
+
+  // only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
+  // means not supported.
+  const int penultimate_level_;
+
+  // Key range for penultimate level output
+  // includes timestamp if user-defined timestamp is enabled.
+  // penultimate_output_range_type_ shows the range type
+  Slice penultimate_level_smallest_user_key_;
+  Slice penultimate_level_largest_user_key_;
+  PenultimateOutputRangeType penultimate_output_range_type_ =
+      PenultimateOutputRangeType::kNotSupported;
+};
+
+#ifndef NDEBUG
+// Helper struct only for tests, which contains the data to decide if a key
+// should be output to the penultimate level.
+// TODO: remove this when the public feature knob is available
+struct PerKeyPlacementContext {
+  const int level;
+  const Slice key;
+  const Slice value;
+  const SequenceNumber seq_num;
+
+  bool output_to_penultimate_level;
+
+  PerKeyPlacementContext(int _level, Slice _key, Slice _value,
+                         SequenceNumber _seq_num)
+      : level(_level), key(_key), value(_value), seq_num(_seq_num) {
+    output_to_penultimate_level = false;
+  }
+};
+#endif /* !NDEBUG */
+
+// Return sum of sizes of all files in `files`.
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h
new file mode 100644
index 000000000..1b1c28b57
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h
@@ -0,0 +1,49 @@
+//  Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct CompactionIterationStats {
+  // Compaction statistics
+
+  // Doesn't include records skipped because of
+  // CompactionFilter::Decision::kRemoveAndSkipUntil.
+  int64_t num_record_drop_user = 0;
+
+  int64_t num_record_drop_hidden = 0;
+  int64_t num_record_drop_obsolete = 0;
+  int64_t num_record_drop_range_del = 0;
+  int64_t num_range_del_drop_obsolete = 0;
+  // Deletions obsoleted before bottom level due to file gap optimization.
+  int64_t num_optimized_del_drop_obsolete = 0;
+  uint64_t total_filter_time = 0;
+
+  // Input statistics
+  // TODO(noetzli): The stats are incomplete. They are lacking everything
+  // consumed by MergeHelper.
+  uint64_t num_input_records = 0;
+  uint64_t num_input_deletion_records = 0;
+  uint64_t num_input_corrupt_records = 0;
+  uint64_t total_input_raw_key_bytes = 0;
+  uint64_t total_input_raw_value_bytes = 0;
+
+  // Single-Delete diagnostics for exceptional situations
+  uint64_t num_single_del_fallthru = 0;
+  uint64_t num_single_del_mismatch = 0;
+
+  // Blob related statistics
+  uint64_t num_blobs_read = 0;
+  uint64_t total_blob_bytes_read = 0;
+  uint64_t num_blobs_relocated = 0;
+  uint64_t total_blob_bytes_relocated = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc
new file mode 100644
index 000000000..9f54f7813
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.cc
@@ -0,0 +1,1338 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <iterator>
+#include <limits>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/snapshot_checker.h"
+#include "logging/logging.h"
+#include "port/likely.h"
+#include "rocksdb/listener.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+CompactionIterator::CompactionIterator(
+    InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+    SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    CompactionRangeDelAggregator* range_del_agg,
+    BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+    bool enforce_single_del_contracts,
+    const std::atomic<bool>& manual_compaction_canceled,
+    const Compaction* compaction, const CompactionFilter* compaction_filter,
+    const std::atomic<bool>* shutting_down,
+    const std::shared_ptr<Logger> info_log,
+    const std::string* full_history_ts_low,
+    const SequenceNumber preserve_time_min_seqno,
+    const SequenceNumber preclude_last_level_min_seqno)
+    : CompactionIterator(
+          input, cmp, merge_helper, last_sequence, snapshots,
+          earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
+          report_detailed_time, expect_valid_internal_key, range_del_agg,
+          blob_file_builder, allow_data_in_errors, enforce_single_del_contracts,
+          manual_compaction_canceled,
+          std::unique_ptr<CompactionProxy>(
+              compaction ? new RealCompaction(compaction) : nullptr),
+          compaction_filter, shutting_down, info_log, full_history_ts_low,
+          preserve_time_min_seqno, preclude_last_level_min_seqno) {}
+
+CompactionIterator::CompactionIterator(
+    InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+    SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    CompactionRangeDelAggregator* range_del_agg,
+    BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+    bool enforce_single_del_contracts,
+    const std::atomic<bool>& manual_compaction_canceled,
+    std::unique_ptr<CompactionProxy> compaction,
+    const CompactionFilter* compaction_filter,
+    const std::atomic<bool>* shutting_down,
+    const std::shared_ptr<Logger> info_log,
+    const std::string* full_history_ts_low,
+    const SequenceNumber preserve_time_min_seqno,
+    const SequenceNumber preclude_last_level_min_seqno)
+    : input_(input, cmp,
+             !compaction || compaction->DoesInputReferenceBlobFiles()),
+      cmp_(cmp),
+      merge_helper_(merge_helper),
+      snapshots_(snapshots),
+      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+      job_snapshot_(job_snapshot),
+      snapshot_checker_(snapshot_checker),
+      env_(env),
+      clock_(env_->GetSystemClock().get()),
+      report_detailed_time_(report_detailed_time),
+      expect_valid_internal_key_(expect_valid_internal_key),
+      range_del_agg_(range_del_agg),
+      blob_file_builder_(blob_file_builder),
+      compaction_(std::move(compaction)),
+      compaction_filter_(compaction_filter),
+      shutting_down_(shutting_down),
+      manual_compaction_canceled_(manual_compaction_canceled),
+      bottommost_level_(!compaction_ ? false
+                                     : compaction_->bottommost_level() &&
+                                           !compaction_->allow_ingest_behind()),
+      // snapshots_ cannot be nullptr, but we will assert later in the body of
+      // the constructor.
+      visible_at_tip_(snapshots_ ? snapshots_->empty() : false),
+      earliest_snapshot_(!snapshots_ || snapshots_->empty()
+                             ? kMaxSequenceNumber
+                             : snapshots_->at(0)),
+      info_log_(info_log),
+      allow_data_in_errors_(allow_data_in_errors),
+      enforce_single_del_contracts_(enforce_single_del_contracts),
+      timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0),
+      full_history_ts_low_(full_history_ts_low),
+      current_user_key_sequence_(0),
+      current_user_key_snapshot_(0),
+      merge_out_iter_(merge_helper_),
+      blob_garbage_collection_cutoff_file_number_(
+          ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())),
+      blob_fetcher_(CreateBlobFetcherIfNeeded(compaction_.get())),
+      prefetch_buffers_(
+          CreatePrefetchBufferCollectionIfNeeded(compaction_.get())),
+      current_key_committed_(false),
+      cmp_with_history_ts_low_(0),
+      level_(compaction_ == nullptr ? 0 : compaction_->level()),
+      preserve_time_min_seqno_(preserve_time_min_seqno),
+      preclude_last_level_min_seqno_(preclude_last_level_min_seqno) {
+  assert(snapshots_ != nullptr);
+  assert(preserve_time_min_seqno_ <= preclude_last_level_min_seqno_);
+
+  if (compaction_ != nullptr) {
+    level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+  }
+#ifndef NDEBUG
+  // findEarliestVisibleSnapshot assumes this ordering.
+  for (size_t i = 1; i < snapshots_->size(); ++i) {
+    assert(snapshots_->at(i - 1) < snapshots_->at(i));
+  }
+  assert(timestamp_size_ == 0 || !full_history_ts_low_ ||
+         timestamp_size_ == full_history_ts_low_->size());
+#endif
+  input_.SetPinnedItersMgr(&pinned_iters_mgr_);
+  TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
+}
+
+CompactionIterator::~CompactionIterator() {
+  // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime
+  input_.SetPinnedItersMgr(nullptr);
+}
+
+void CompactionIterator::ResetRecordCounts() {
+  iter_stats_.num_record_drop_user = 0;
+  iter_stats_.num_record_drop_hidden = 0;
+  iter_stats_.num_record_drop_obsolete = 0;
+  iter_stats_.num_record_drop_range_del = 0;
+  iter_stats_.num_range_del_drop_obsolete = 0;
+  iter_stats_.num_optimized_del_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+  NextFromInput();
+  PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+  // If there is a merge output, return it before continuing to process the
+  // input.
+  if (merge_out_iter_.Valid()) {
+    merge_out_iter_.Next();
+
+    // Check if we returned all records of the merge output.
+    if (merge_out_iter_.Valid()) {
+      key_ = merge_out_iter_.key();
+      value_ = merge_out_iter_.value();
+      Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+      // MergeUntil stops when it encounters a corrupt key and does not
+      // include them in the result, so we expect the keys here to be valid.
+      if (!s.ok()) {
+        ROCKS_LOG_FATAL(
+            info_log_, "Invalid ikey %s in compaction. %s",
+            allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+            s.getState());
+        assert(false);
+      }
+
+      // Keep current_key_ in sync.
+      if (0 == timestamp_size_) {
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      } else {
+        Slice ts = ikey_.GetTimestamp(timestamp_size_);
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type, &ts);
+      }
+      key_ = current_key_.GetInternalKey();
+      ikey_.user_key = current_key_.GetUserKey();
+      validity_info_.SetValid(ValidContext::kMerge1);
+    } else {
+      // We consumed all pinned merge operands, release pinned iterators
+      pinned_iters_mgr_.ReleasePinnedData();
+      // MergeHelper moves the iterator to the first record after the merged
+      // records, so even though we reached the end of the merge output, we do
+      // not want to advance the iterator.
+      NextFromInput();
+    }
+  } else {
+    // Only advance the input iterator if there is no merge output and the
+    // iterator is not already at the next record.
+    if (!at_next_) {
+      AdvanceInputIter();
+    }
+    NextFromInput();
+  }
+
+  if (Valid()) {
+    // Record that we've outputted a record for the current key.
+    has_outputted_key_ = true;
+  }
+
+  PrepareOutput();
+}
+
+bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
+                                              Slice* skip_until) {
+  // TODO: support compaction filter for wide-column entities
+  if (!compaction_filter_ ||
+      (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) {
+    return true;
+  }
+  bool error = false;
+  // If the user has specified a compaction filter and the sequence
+  // number is greater than any external snapshot, then invoke the
+  // filter. If the return value of the compaction filter is true,
+  // replace the entry with a deletion marker.
+  CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined;
+  compaction_filter_value_.clear();
+  compaction_filter_skip_until_.Clear();
+  CompactionFilter::ValueType value_type =
+      ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
+                               : CompactionFilter::ValueType::kBlobIndex;
+  // Hack: pass internal key to BlobIndexCompactionFilter since it needs
+  // to get sequence number.
+  assert(compaction_filter_);
+  Slice& filter_key =
+      (ikey_.type == kTypeValue ||
+       !compaction_filter_->IsStackedBlobDbInternalCompactionFilter())
+          ? ikey_.user_key
+          : key_;
+  {
+    StopWatchNano timer(clock_, report_detailed_time_);
+    if (kTypeBlobIndex == ikey_.type) {
+      filter = compaction_filter_->FilterBlobByKey(
+          level_, filter_key, &compaction_filter_value_,
+          compaction_filter_skip_until_.rep());
+      if (CompactionFilter::Decision::kUndetermined == filter &&
+          !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+        if (compaction_ == nullptr) {
+          status_ =
+              Status::Corruption("Unexpected blob index outside of compaction");
+          validity_info_.Invalidate();
+          return false;
+        }
+
+        TEST_SYNC_POINT_CALLBACK(
+            "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex",
+            &value_);
+
+        // For integrated BlobDB impl, CompactionIterator reads blob value.
+        // For Stacked BlobDB impl, the corresponding CompactionFilter's
+        // FilterV2 method should read the blob value.
+        BlobIndex blob_index;
+        Status s = blob_index.DecodeFrom(value_);
+        if (!s.ok()) {
+          status_ = s;
+          validity_info_.Invalidate();
+          return false;
+        }
+
+        FilePrefetchBuffer* prefetch_buffer =
+            prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+                                    blob_index.file_number())
+                              : nullptr;
+
+        uint64_t bytes_read = 0;
+
+        assert(blob_fetcher_);
+
+        s = blob_fetcher_->FetchBlob(ikey_.user_key, blob_index,
+                                     prefetch_buffer, &blob_value_,
+                                     &bytes_read);
+        if (!s.ok()) {
+          status_ = s;
+          validity_info_.Invalidate();
+          return false;
+        }
+
+        ++iter_stats_.num_blobs_read;
+        iter_stats_.total_blob_bytes_read += bytes_read;
+
+        value_type = CompactionFilter::ValueType::kValue;
+      }
+    }
+    if (CompactionFilter::Decision::kUndetermined == filter) {
+      filter = compaction_filter_->FilterV2(
+          level_, filter_key, value_type,
+          blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_,
+          compaction_filter_skip_until_.rep());
+    }
+    iter_stats_.total_filter_time +=
+        env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
+  }
+
+  if (CompactionFilter::Decision::kUndetermined == filter) {
+    // Should not reach here, since FilterV2 should never return kUndetermined.
+    status_ =
+        Status::NotSupported("FilterV2() should never return kUndetermined");
+    validity_info_.Invalidate();
+    return false;
+  }
+
+  if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
+      cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
+          0) {
+    // Can't skip to a key smaller than the current one.
+    // Keep the key as per FilterV2 documentation.
+    filter = CompactionFilter::Decision::kKeep;
+  }
+
+  if (filter == CompactionFilter::Decision::kRemove) {
+    // convert the current key to a delete; key_ is pointing into
+    // current_key_ at this point, so updating current_key_ updates key()
+    ikey_.type = kTypeDeletion;
+    current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
+    // no value associated with delete
+    value_.clear();
+    iter_stats_.num_record_drop_user++;
+  } else if (filter == CompactionFilter::Decision::kPurge) {
+    // convert the current key to a single delete; key_ is pointing into
+    // current_key_ at this point, so updating current_key_ updates key()
+    ikey_.type = kTypeSingleDeletion;
+    current_key_.UpdateInternalKey(ikey_.sequence, kTypeSingleDeletion);
+    // no value associated with single delete
+    value_.clear();
+    iter_stats_.num_record_drop_user++;
+  } else if (filter == CompactionFilter::Decision::kChangeValue) {
+    if (ikey_.type == kTypeBlobIndex) {
+      // value transfer from blob file to inlined data
+      ikey_.type = kTypeValue;
+      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+    }
+    value_ = compaction_filter_value_;
+  } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+    *need_skip = true;
+    compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+                                                     kValueTypeForSeek);
+    *skip_until = compaction_filter_skip_until_.Encode();
+  } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) {
+    // Only the StackableDB-based BlobDB impl's compaction filter should return
+    // kChangeBlobIndex. Decision about rewriting blob and changing blob index
+    // in the integrated BlobDB impl is made in subsequent call to
+    // PrepareOutput() and its callees.
+    if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+      status_ = Status::NotSupported(
+          "Only stacked BlobDB's internal compaction filter can return "
+          "kChangeBlobIndex.");
+      validity_info_.Invalidate();
+      return false;
+    }
+    if (ikey_.type == kTypeValue) {
+      // value transfer from inlined data to blob file
+      ikey_.type = kTypeBlobIndex;
+      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+    }
+    value_ = compaction_filter_value_;
+  } else if (filter == CompactionFilter::Decision::kIOError) {
+    if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+      status_ = Status::NotSupported(
+          "CompactionFilter for integrated BlobDB should not return kIOError");
+      validity_info_.Invalidate();
+      return false;
+    }
+    status_ = Status::IOError("Failed to access blob during compaction filter");
+    error = true;
+  }
+  return !error;
+}
+
+void CompactionIterator::NextFromInput() {
+  at_next_ = false;
+  validity_info_.Invalidate();
+
+  while (!Valid() && input_.Valid() && !IsPausingManualCompaction() &&
+         !IsShuttingDown()) {
+    key_ = input_.key();
+    value_ = input_.value();
+    blob_value_.Reset();
+    iter_stats_.num_input_records++;
+
+    Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+    if (!pik_status.ok()) {
+      iter_stats_.num_input_corrupt_records++;
+
+      // If `expect_valid_internal_key_` is false, return the corrupted key
+      // and let the caller decide what to do with it.
+      if (expect_valid_internal_key_) {
+        status_ = pik_status;
+        return;
+      }
+      key_ = current_key_.SetInternalKey(key_);
+      has_current_user_key_ = false;
+      current_user_key_sequence_ = kMaxSequenceNumber;
+      current_user_key_snapshot_ = 0;
+      validity_info_.SetValid(ValidContext::kParseKeyError);
+      break;
+    }
+    TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
+
+    // Update input statistics
+    if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion ||
+        ikey_.type == kTypeDeletionWithTimestamp) {
+      iter_stats_.num_input_deletion_records++;
+    }
+    iter_stats_.total_input_raw_key_bytes += key_.size();
+    iter_stats_.total_input_raw_value_bytes += value_.size();
+
+    // If need_skip is true, we should seek the input iterator
+    // to internal key skip_until and continue from there.
+    bool need_skip = false;
+    // Points either into compaction_filter_skip_until_ or into
+    // merge_helper_->compaction_filter_skip_until_.
+    Slice skip_until;
+
+    bool user_key_equal_without_ts = false;
+    int cmp_ts = 0;
+    if (has_current_user_key_) {
+      user_key_equal_without_ts =
+          cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_);
+      // if timestamp_size_ > 0, then curr_ts_ has been initialized by a
+      // previous key.
+      cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp(
+                                     ExtractTimestampFromUserKey(
+                                         ikey_.user_key, timestamp_size_),
+                                     curr_ts_)
+                               : 0;
+    }
+
+    // Check whether the user key changed. After this if statement current_key_
+    // is a copy of the current input key (maybe converted to a delete by the
+    // compaction filter). ikey_.user_key is pointing to the copy.
+    if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) {
+      // First occurrence of this user key
+      // Copy key for output
+      key_ = current_key_.SetInternalKey(key_, &ikey_);
+
+      int prev_cmp_with_ts_low =
+          !full_history_ts_low_ ? 0
+          : curr_ts_.empty()
+              ? 0
+              : cmp_->CompareTimestamp(curr_ts_, *full_history_ts_low_);
+
+      // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for the use
+      // in next iteration to compare with the timestamp of next key.
+      UpdateTimestampAndCompareWithFullHistoryLow();
+
+      // If
+      // (1) !has_current_user_key_, OR
+      // (2) timestamp is disabled, OR
+      // (3) all history will be preserved, OR
+      // (4) user key (excluding timestamp) is different from previous key, OR
+      // (5) timestamp is NO older than *full_history_ts_low_, OR
+      // (6) timestamp is the largest one older than full_history_ts_low_,
+      // then current_user_key_ must be treated as a different user key.
+      // This means, if a user key (excluding ts) is the same as the previous
+      // user key, and its ts is older than *full_history_ts_low_, then we
+      // consider this key for GC, e.g. it may be dropped if certain conditions
+      // match.
+      if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ ||
+          !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0 ||
+          prev_cmp_with_ts_low >= 0) {
+        // Initialize for future comparison for rule (A) and etc.
+        current_user_key_sequence_ = kMaxSequenceNumber;
+        current_user_key_snapshot_ = 0;
+        has_current_user_key_ = true;
+      }
+      current_user_key_ = ikey_.user_key;
+
+      has_outputted_key_ = false;
+
+      last_key_seq_zeroed_ = false;
+
+      current_key_committed_ = KeyCommitted(ikey_.sequence);
+
+      // Apply the compaction filter to the first committed version of the user
+      // key.
+      if (current_key_committed_ &&
+          !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+        break;
+      }
+    } else {
+      // Update the current key to reflect the new sequence number/type without
+      // copying the user key.
+      // TODO(rven): Compaction filter does not process keys in this path
+      // Need to have the compaction filter process multiple versions
+      // if we have versions on both sides of a snapshot
+      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      key_ = current_key_.GetInternalKey();
+      ikey_.user_key = current_key_.GetUserKey();
+
+      // Note that newer version of a key is ordered before older versions. If a
+      // newer version of a key is committed, so as the older version. No need
+      // to query snapshot_checker_ in that case.
+      if (UNLIKELY(!current_key_committed_)) {
+        assert(snapshot_checker_ != nullptr);
+        current_key_committed_ = KeyCommitted(ikey_.sequence);
+        // Apply the compaction filter to the first committed version of the
+        // user key.
+        if (current_key_committed_ &&
+            !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+          break;
+        }
+      }
+    }
+
+    if (UNLIKELY(!current_key_committed_)) {
+      assert(snapshot_checker_ != nullptr);
+      validity_info_.SetValid(ValidContext::kCurrentKeyUncommitted);
+      break;
+    }
+
+    // If there are no snapshots, then this kv affect visibility at tip.
+    // Otherwise, search though all existing snapshots to find the earliest
+    // snapshot that is affected by this kv.
+    SequenceNumber last_sequence = current_user_key_sequence_;
+    current_user_key_sequence_ = ikey_.sequence;
+    SequenceNumber last_snapshot = current_user_key_snapshot_;
+    SequenceNumber prev_snapshot = 0;  // 0 means no previous snapshot
+    current_user_key_snapshot_ =
+        visible_at_tip_
+            ? earliest_snapshot_
+            : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);
+
+    if (need_skip) {
+      // This case is handled below.
+    } else if (clear_and_output_next_key_) {
+      // In the previous iteration we encountered a single delete that we could
+      // not compact out.  We will keep this Put, but can drop it's data.
+      // (See Optimization 3, below.)
+      if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex &&
+          ikey_.type != kTypeWideColumnEntity) {
+        ROCKS_LOG_FATAL(info_log_, "Unexpected key %s for compaction output",
+                        ikey_.DebugString(allow_data_in_errors_, true).c_str());
+        assert(false);
+      }
+      if (current_user_key_snapshot_ < last_snapshot) {
+        ROCKS_LOG_FATAL(info_log_,
+                        "key %s, current_user_key_snapshot_ (%" PRIu64
+                        ") < last_snapshot (%" PRIu64 ")",
+                        ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+                        current_user_key_snapshot_, last_snapshot);
+        assert(false);
+      }
+
+      if (ikey_.type == kTypeBlobIndex || ikey_.type == kTypeWideColumnEntity) {
+        ikey_.type = kTypeValue;
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      }
+
+      value_.clear();
+      validity_info_.SetValid(ValidContext::kKeepSDAndClearPut);
+      clear_and_output_next_key_ = false;
+    } else if (ikey_.type == kTypeSingleDeletion) {
+      // We can compact out a SingleDelete if:
+      // 1) We encounter the corresponding PUT -OR- we know that this key
+      //    doesn't appear past this output level
+      // =AND=
+      // 2) We've already returned a record in this snapshot -OR-
+      //    there are no earlier earliest_write_conflict_snapshot.
+      //
+      // A note about 2) above:
+      // we try to determine whether there is any earlier write conflict
+      // checking snapshot by calling DefinitelyInSnapshot() with seq and
+      // earliest_write_conflict_snapshot as arguments. For write-prepared
+      // and write-unprepared transactions, if earliest_write_conflict_snapshot
+      // is evicted from WritePreparedTxnDB::commit_cache, then
+      // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns
+      // false, even if the seq is actually visible within
+      // earliest_write_conflict_snapshot. Consequently, CompactionIterator
+      // may try to zero out its sequence number, thus hitting assertion error
+      // in debug mode or cause incorrect DBIter return result.
+      // We observe that earliest_write_conflict_snapshot >= earliest_snapshot,
+      // and the seq zeroing logic depends on
+      // DefinitelyInSnapshot(seq, earliest_snapshot). Therefore, if we cannot
+      // determine whether seq is **definitely** in
+      // earliest_write_conflict_snapshot, then we can additionally check if
+      // seq is definitely in earliest_snapshot. If the latter holds, then the
+      // former holds too.
+      //
+      // Rule 1 is needed for SingleDelete correctness.  Rule 2 is needed to
+      // allow Transactions to do write-conflict checking (if we compacted away
+      // all keys, then we wouldn't know that a write happened in this
+      // snapshot).  If there is no earlier snapshot, then we know that there
+      // are no active transactions that need to know about any writes.
+      //
+      // Optimization 3:
+      // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
+      // true, then we must output a SingleDelete.  In this case, we will decide
+      // to also output the PUT.  While we are compacting less by outputting the
+      // PUT now, hopefully this will lead to better compaction in the future
+      // when Rule 2 is later true (Ie, We are hoping we can later compact out
+      // both the SingleDelete and the Put, while we couldn't if we only
+      // outputted the SingleDelete now).
+      // In this case, we can save space by removing the PUT's value as it will
+      // never be read.
+      //
+      // Deletes and Merges are not supported on the same key that has a
+      // SingleDelete as it is not possible to correctly do any partial
+      // compaction of such a combination of operations.  The result of mixing
+      // those operations for a given key is documented as being undefined.  So
+      // we can choose how to handle such a combinations of operations.  We will
+      // try to compact out as much as we can in these cases.
+      // We will report counts on these anomalous cases.
+      //
+      // Note: If timestamp is enabled, then record will be eligible for
+      // deletion, only if, along with above conditions (Rule 1 and Rule 2)
+      // full_history_ts_low_ is specified and timestamp for that key is less
+      // than *full_history_ts_low_. If it's not eligible for deletion, then we
+      // will output the SingleDelete. For Optimization 3 also, if
+      // full_history_ts_low_ is specified and timestamp for the key is less
+      // than *full_history_ts_low_ then only optimization will be applied.
+
+      // The easiest way to process a SingleDelete during iteration is to peek
+      // ahead at the next key.
+      const bool is_timestamp_eligible_for_gc =
+          (timestamp_size_ == 0 ||
+           (full_history_ts_low_ && cmp_with_history_ts_low_ < 0));
+
+      ParsedInternalKey next_ikey;
+      AdvanceInputIter();
+
+      // Check whether the next key exists, is not corrupt, and is the same key
+      // as the single delete.
+      if (input_.Valid() &&
+          ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+              .ok() &&
+          cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+#ifndef NDEBUG
+        const Compaction* c =
+            compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+        TEST_SYNC_POINT_CALLBACK(
+            "CompactionIterator::NextFromInput:SingleDelete:1",
+            const_cast<Compaction*>(c));
+        if (last_key_seq_zeroed_) {
+          ++iter_stats_.num_record_drop_hidden;
+          ++iter_stats_.num_record_drop_obsolete;
+          assert(bottommost_level_);
+          AdvanceInputIter();
+        } else if (prev_snapshot == 0 ||
+                   DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot)) {
+          // Check whether the next key belongs to the same snapshot as the
+          // SingleDelete.
+
+          TEST_SYNC_POINT_CALLBACK(
+              "CompactionIterator::NextFromInput:SingleDelete:2", nullptr);
+          if (next_ikey.type == kTypeSingleDeletion) {
+            // We encountered two SingleDeletes for same key in a row. This
+            // could be due to unexpected user input. If write-(un)prepared
+            // transaction is used, this could also be due to releasing an old
+            // snapshot between a Put and its matching SingleDelete.
+            // Skip the first SingleDelete and let the next iteration decide
+            // how to handle the second SingleDelete.
+
+            // First SingleDelete has been skipped since we already called
+            // input_.Next().
+            ++iter_stats_.num_record_drop_obsolete;
+            ++iter_stats_.num_single_del_mismatch;
+          } else if (next_ikey.type == kTypeDeletion) {
+            std::ostringstream oss;
+            oss << "Found SD and type: " << static_cast<int>(next_ikey.type)
+                << " on the same key, violating the contract "
+                   "of SingleDelete. Check your application to make sure the "
+                   "application does not mix SingleDelete and Delete for "
+                   "the same key. If you are using "
+                   "write-prepared/write-unprepared transactions, and use "
+                   "SingleDelete to delete certain keys, then make sure "
+                   "TransactionDBOptions::rollback_deletion_type_callback is "
+                   "configured properly. Mixing SD and DEL can lead to "
+                   "undefined behaviors";
+            ++iter_stats_.num_record_drop_obsolete;
+            ++iter_stats_.num_single_del_mismatch;
+            if (enforce_single_del_contracts_) {
+              ROCKS_LOG_ERROR(info_log_, "%s", oss.str().c_str());
+              validity_info_.Invalidate();
+              status_ = Status::Corruption(oss.str());
+              return;
+            }
+            ROCKS_LOG_WARN(info_log_, "%s", oss.str().c_str());
+          } else if (!is_timestamp_eligible_for_gc) {
+            // We cannot drop the SingleDelete as timestamp is enabled, and
+            // timestamp of this key is greater than or equal to
+            // *full_history_ts_low_. We will output the SingleDelete.
+            validity_info_.SetValid(ValidContext::kKeepTsHistory);
+          } else if (has_outputted_key_ ||
+                     DefinitelyInSnapshot(ikey_.sequence,
+                                          earliest_write_conflict_snapshot_) ||
+                     (earliest_snapshot_ < earliest_write_conflict_snapshot_ &&
+                      DefinitelyInSnapshot(ikey_.sequence,
+                                           earliest_snapshot_))) {
+            // Found a matching value, we can drop the single delete and the
+            // value.  It is safe to drop both records since we've already
+            // outputted a key in this snapshot, or there is no earlier
+            // snapshot (Rule 2 above).
+
+            // Note: it doesn't matter whether the second key is a Put or if it
+            // is an unexpected Merge or Delete.  We will compact it out
+            // either way. We will maintain counts of how many mismatches
+            // happened
+            if (next_ikey.type != kTypeValue &&
+                next_ikey.type != kTypeBlobIndex &&
+                next_ikey.type != kTypeWideColumnEntity) {
+              ++iter_stats_.num_single_del_mismatch;
+            }
+
+            ++iter_stats_.num_record_drop_hidden;
+            ++iter_stats_.num_record_drop_obsolete;
+            // Already called input_.Next() once.  Call it a second time to
+            // skip past the second key.
+            AdvanceInputIter();
+          } else {
+            // Found a matching value, but we cannot drop both keys since
+            // there is an earlier snapshot and we need to leave behind a record
+            // to know that a write happened in this snapshot (Rule 2 above).
+            // Clear the value and output the SingleDelete. (The value will be
+            // outputted on the next iteration.)
+
+            // Setting valid_ to true will output the current SingleDelete
+            validity_info_.SetValid(ValidContext::kKeepSDForConflictCheck);
+
+            // Set up the Put to be outputted in the next iteration.
+            // (Optimization 3).
+            clear_and_output_next_key_ = true;
+            TEST_SYNC_POINT_CALLBACK(
+                "CompactionIterator::NextFromInput:KeepSDForWW",
+                /*arg=*/nullptr);
+          }
+        } else {
+          // We hit the next snapshot without hitting a put, so the iterator
+          // returns the single delete.
+          validity_info_.SetValid(ValidContext::kKeepSDForSnapshot);
+          TEST_SYNC_POINT_CALLBACK(
+              "CompactionIterator::NextFromInput:SingleDelete:3",
+              const_cast<Compaction*>(c));
+        }
+      } else {
+        // We are at the end of the input, could not parse the next key, or hit
+        // a different key. The iterator returns the single delete if the key
+        // possibly exists beyond the current output level.  We set
+        // has_current_user_key to false so that if the iterator is at the next
+        // key, we do not compare it again against the previous key at the next
+        // iteration. If the next key is corrupt, we return before the
+        // comparison, so the value of has_current_user_key does not matter.
+        has_current_user_key_ = false;
+        if (compaction_ != nullptr &&
+            DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+            compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                       &level_ptrs_) &&
+            is_timestamp_eligible_for_gc) {
+          // Key doesn't exist outside of this range.
+          // Can compact out this SingleDelete.
+          ++iter_stats_.num_record_drop_obsolete;
+          ++iter_stats_.num_single_del_fallthru;
+          if (!bottommost_level_) {
+            ++iter_stats_.num_optimized_del_drop_obsolete;
+          }
+        } else if (last_key_seq_zeroed_) {
+          // Skip.
+          ++iter_stats_.num_record_drop_hidden;
+          ++iter_stats_.num_record_drop_obsolete;
+          assert(bottommost_level_);
+        } else {
+          // Output SingleDelete
+          validity_info_.SetValid(ValidContext::kKeepSD);
+        }
+      }
+
+      if (Valid()) {
+        at_next_ = true;
+      }
+    } else if (last_snapshot == current_user_key_snapshot_ ||
+               (last_snapshot > 0 &&
+                last_snapshot < current_user_key_snapshot_)) {
+      // If the earliest snapshot is which this key is visible in
+      // is the same as the visibility of a previous instance of the
+      // same key, then this kv is not visible in any snapshot.
+      // Hidden by an newer entry for same user key
+      //
+      // Note: Dropping this key will not affect TransactionDB write-conflict
+      // checking since there has already been a record returned for this key
+      // in this snapshot.
+      if (last_sequence < current_user_key_sequence_) {
+        ROCKS_LOG_FATAL(info_log_,
+                        "key %s, last_sequence (%" PRIu64
+                        ") < current_user_key_sequence_ (%" PRIu64 ")",
+                        ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+                        last_sequence, current_user_key_sequence_);
+        assert(false);
+      }
+
+      ++iter_stats_.num_record_drop_hidden;  // rule (A)
+      AdvanceInputIter();
+    } else if (compaction_ != nullptr &&
+               (ikey_.type == kTypeDeletion ||
+                (ikey_.type == kTypeDeletionWithTimestamp &&
+                 cmp_with_history_ts_low_ < 0)) &&
+               DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+               compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                          &level_ptrs_)) {
+      // TODO(noetzli): This is the only place where we use compaction_
+      // (besides the constructor). We should probably get rid of this
+      // dependency and find a way to do similar filtering during flushes.
+      //
+      // For this user key:
+      // (1) there is no data in higher levels
+      // (2) data in lower levels will have larger sequence numbers
+      // (3) data in layers that are being compacted here and have
+      //     smaller sequence numbers will be dropped in the next
+      //     few iterations of this loop (by rule (A) above).
+      // Therefore this deletion marker is obsolete and can be dropped.
+      //
+      // Note:  Dropping this Delete will not affect TransactionDB
+      // write-conflict checking since it is earlier than any snapshot.
+      //
+      // It seems that we can also drop deletion later than earliest snapshot
+      // given that:
+      // (1) The deletion is earlier than earliest_write_conflict_snapshot, and
+      // (2) No value exist earlier than the deletion.
+      //
+      // Note also that a deletion marker of type kTypeDeletionWithTimestamp
+      // will be treated as a different user key unless the timestamp is older
+      // than *full_history_ts_low_.
+      ++iter_stats_.num_record_drop_obsolete;
+      if (!bottommost_level_) {
+        ++iter_stats_.num_optimized_del_drop_obsolete;
+      }
+      AdvanceInputIter();
+    } else if ((ikey_.type == kTypeDeletion ||
+                (ikey_.type == kTypeDeletionWithTimestamp &&
+                 cmp_with_history_ts_low_ < 0)) &&
+               bottommost_level_) {
+      // Handle the case where we have a delete key at the bottom most level
+      // We can skip outputting the key iff there are no subsequent puts for
+      // this key
+      assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
+                                 ikey_.user_key, &level_ptrs_));
+      ParsedInternalKey next_ikey;
+      AdvanceInputIter();
+#ifndef NDEBUG
+      const Compaction* c =
+          compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+      TEST_SYNC_POINT_CALLBACK(
+          "CompactionIterator::NextFromInput:BottommostDelete:1",
+          const_cast<Compaction*>(c));
+      // Skip over all versions of this key that happen to occur in the same
+      // snapshot range as the delete.
+      //
+      // Note that a deletion marker of type kTypeDeletionWithTimestamp will be
+      // considered to have a different user key unless the timestamp is older
+      // than *full_history_ts_low_.
+      while (!IsPausingManualCompaction() && !IsShuttingDown() &&
+             input_.Valid() &&
+             (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+                  .ok()) &&
+             cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) &&
+             (prev_snapshot == 0 ||
+              DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) {
+        AdvanceInputIter();
+      }
+      // If you find you still need to output a row with this key, we need to
+      // output the delete too
+      if (input_.Valid() &&
+          (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+               .ok()) &&
+          cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+        validity_info_.SetValid(ValidContext::kKeepDel);
+        at_next_ = true;
+      }
+    } else if (ikey_.type == kTypeMerge) {
+      if (!merge_helper_->HasOperator()) {
+        status_ = Status::InvalidArgument(
+            "merge_operator is not properly initialized.");
+        return;
+      }
+
+      pinned_iters_mgr_.StartPinning();
+
+      // We know the merge type entry is not hidden, otherwise we would
+      // have hit (A)
+      // We encapsulate the merge related state machine in a different
+      // object to minimize change to the existing flow.
+      Status s = merge_helper_->MergeUntil(
+          &input_, range_del_agg_, prev_snapshot, bottommost_level_,
+          allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_,
+          prefetch_buffers_.get(), &iter_stats_);
+      merge_out_iter_.SeekToFirst();
+
+      if (!s.ok() && !s.IsMergeInProgress()) {
+        status_ = s;
+        return;
+      } else if (merge_out_iter_.Valid()) {
+        // NOTE: key, value, and ikey_ refer to old entries.
+        //       These will be correctly set below.
+        key_ = merge_out_iter_.key();
+        value_ = merge_out_iter_.value();
+        pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+        // MergeUntil stops when it encounters a corrupt key and does not
+        // include them in the result, so we expect the keys here to valid.
+        if (!pik_status.ok()) {
+          ROCKS_LOG_FATAL(
+              info_log_, "Invalid key %s in compaction. %s",
+              allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+              pik_status.getState());
+          assert(false);
+        }
+        // Keep current_key_ in sync.
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+        key_ = current_key_.GetInternalKey();
+        ikey_.user_key = current_key_.GetUserKey();
+        validity_info_.SetValid(ValidContext::kMerge2);
+      } else {
+        // all merge operands were filtered out. reset the user key, since the
+        // batch consumed by the merge operator should not shadow any keys
+        // coming after the merges
+        has_current_user_key_ = false;
+        pinned_iters_mgr_.ReleasePinnedData();
+
+        if (merge_helper_->FilteredUntil(&skip_until)) {
+          need_skip = true;
+        }
+      }
+    } else {
+      // 1. new user key -OR-
+      // 2. different snapshot stripe
+      // If user-defined timestamp is enabled, we consider keys for GC if they
+      // are below history_ts_low_. CompactionRangeDelAggregator::ShouldDelete()
+      // only considers range deletions that are at or below history_ts_low_ and
+      // trim_ts_. We drop keys here that are below history_ts_low_ and are
+      // covered by a range tombstone that is at or below history_ts_low_ and
+      // trim_ts.
+      bool should_delete = false;
+      if (!timestamp_size_ || cmp_with_history_ts_low_ < 0) {
+        should_delete = range_del_agg_->ShouldDelete(
+            key_, RangeDelPositioningMode::kForwardTraversal);
+      }
+      if (should_delete) {
+        ++iter_stats_.num_record_drop_hidden;
+        ++iter_stats_.num_record_drop_range_del;
+        AdvanceInputIter();
+      } else {
+        validity_info_.SetValid(ValidContext::kNewUserKey);
+      }
+    }
+
+    if (need_skip) {
+      SkipUntil(skip_until);
+    }
+  }
+
+  if (!Valid() && IsShuttingDown()) {
+    status_ = Status::ShutdownInProgress();
+  }
+
+  if (IsPausingManualCompaction()) {
+    status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  // Propagate corruption status from memtable itereator
+  if (!input_.Valid() && input_.status().IsCorruption()) {
+    status_ = input_.status();
+  }
+}
+
+bool CompactionIterator::ExtractLargeValueIfNeededImpl() {
+  if (!blob_file_builder_) {
+    return false;
+  }
+
+  blob_index_.clear();
+  const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_);
+
+  if (!s.ok()) {
+    status_ = s;
+    validity_info_.Invalidate();
+
+    return false;
+  }
+
+  if (blob_index_.empty()) {
+    return false;
+  }
+
+  value_ = blob_index_;
+
+  return true;
+}
+
+void CompactionIterator::ExtractLargeValueIfNeeded() {
+  assert(ikey_.type == kTypeValue);
+
+  if (!ExtractLargeValueIfNeededImpl()) {
+    return;
+  }
+
+  ikey_.type = kTypeBlobIndex;
+  current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+}
+
+void CompactionIterator::GarbageCollectBlobIfNeeded() {
+  assert(ikey_.type == kTypeBlobIndex);
+
+  if (!compaction_) {
+    return;
+  }
+
+  // GC for integrated BlobDB
+  if (compaction_->enable_blob_garbage_collection()) {
+    TEST_SYNC_POINT_CALLBACK(
+        "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
+        &value_);
+
+    BlobIndex blob_index;
+
+    {
+      const Status s = blob_index.DecodeFrom(value_);
+
+      if (!s.ok()) {
+        status_ = s;
+        validity_info_.Invalidate();
+
+        return;
+      }
+    }
+
+    if (blob_index.file_number() >=
+        blob_garbage_collection_cutoff_file_number_) {
+      return;
+    }
+
+    FilePrefetchBuffer* prefetch_buffer =
+        prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+                                blob_index.file_number())
+                          : nullptr;
+
+    uint64_t bytes_read = 0;
+
+    {
+      assert(blob_fetcher_);
+
+      const Status s = blob_fetcher_->FetchBlob(
+          user_key(), blob_index, prefetch_buffer, &blob_value_, &bytes_read);
+
+      if (!s.ok()) {
+        status_ = s;
+        validity_info_.Invalidate();
+
+        return;
+      }
+    }
+
+    ++iter_stats_.num_blobs_read;
+    iter_stats_.total_blob_bytes_read += bytes_read;
+
+    ++iter_stats_.num_blobs_relocated;
+    iter_stats_.total_blob_bytes_relocated += blob_index.size();
+
+    value_ = blob_value_;
+
+    if (ExtractLargeValueIfNeededImpl()) {
+      return;
+    }
+
+    ikey_.type = kTypeValue;
+    current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+
+    return;
+  }
+
+  // GC for stacked BlobDB
+  if (compaction_filter_ &&
+      compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+    const auto blob_decision = compaction_filter_->PrepareBlobOutput(
+        user_key(), value_, &compaction_filter_value_);
+
+    if (blob_decision == CompactionFilter::BlobDecision::kCorruption) {
+      status_ =
+          Status::Corruption("Corrupted blob reference encountered during GC");
+      validity_info_.Invalidate();
+
+      return;
+    }
+
+    if (blob_decision == CompactionFilter::BlobDecision::kIOError) {
+      status_ = Status::IOError("Could not relocate blob during GC");
+      validity_info_.Invalidate();
+
+      return;
+    }
+
+    if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) {
+      value_ = compaction_filter_value_;
+
+      return;
+    }
+  }
+}
+
+void CompactionIterator::DecideOutputLevel() {
+  assert(compaction_->SupportsPerKeyPlacement());
+#ifndef NDEBUG
+  // Could be overridden by unittest
+  PerKeyPlacementContext context(level_, ikey_.user_key, value_,
+                                 ikey_.sequence);
+  TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
+                           &context);
+  output_to_penultimate_level_ = context.output_to_penultimate_level;
+#else
+  output_to_penultimate_level_ = false;
+#endif  // NDEBUG
+
+  // if the key is newer than the cutoff sequence or within the earliest
+  // snapshot, it should output to the penultimate level.
+  if (ikey_.sequence > preclude_last_level_min_seqno_ ||
+      ikey_.sequence > earliest_snapshot_) {
+    output_to_penultimate_level_ = true;
+  }
+
+  if (output_to_penultimate_level_) {
+    // If it's decided to output to the penultimate level, but unsafe to do so,
+    // still output to the last level. For example, moving the data from a lower
+    // level to a higher level outside of the higher-level input key range is
+    // considered unsafe, because the key may conflict with higher-level SSTs
+    // not from this compaction.
+    // TODO: add statistic for declined output_to_penultimate_level
+    bool safe_to_penultimate_level =
+        compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
+    if (!safe_to_penultimate_level) {
+      output_to_penultimate_level_ = false;
+      // It could happen when disable/enable `last_level_temperature` while
+      // holding a snapshot. When `last_level_temperature` is not set
+      // (==kUnknown), the data newer than any snapshot is pushed to the last
+      // level, but when the per_key_placement feature is enabled on the fly,
+      // the data later than the snapshot has to be moved to the penultimate
+      // level, which may or may not be safe. So the user needs to make sure all
+      // snapshot is released before enabling `last_level_temperature` feature
+      // We will migrate the feature to `last_level_temperature` and maybe make
+      // it not dynamically changeable.
+      if (ikey_.sequence > earliest_snapshot_) {
+        status_ = Status::Corruption(
+            "Unsafe to store Seq later than snapshot in the last level if "
+            "per_key_placement is enabled");
+      }
+    }
+  }
+}
+
+void CompactionIterator::PrepareOutput() {
+  if (Valid()) {
+    if (ikey_.type == kTypeValue) {
+      ExtractLargeValueIfNeeded();
+    } else if (ikey_.type == kTypeBlobIndex) {
+      GarbageCollectBlobIfNeeded();
+    }
+
+    if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
+      DecideOutputLevel();
+    }
+
+    // Zeroing out the sequence number leads to better compression.
+    // If this is the bottommost level (no files in lower levels)
+    // and the earliest snapshot is larger than this seqno
+    // and the userkey differs from the last userkey in compaction
+    // then we can squash the seqno to zero.
+    //
+    // This is safe for TransactionDB write-conflict checking since transactions
+    // only care about sequence number larger than any active snapshots.
+    //
+    // Can we do the same for levels above bottom level as long as
+    // KeyNotExistsBeyondOutputLevel() return true?
+    if (Valid() && compaction_ != nullptr &&
+        !compaction_->allow_ingest_behind() && bottommost_level_ &&
+        DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+        ikey_.type != kTypeMerge && current_key_committed_ &&
+        !output_to_penultimate_level_ &&
+        ikey_.sequence < preserve_time_min_seqno_) {
+      if (ikey_.type == kTypeDeletion ||
+          (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
+        ROCKS_LOG_FATAL(
+            info_log_,
+            "Unexpected key %s for seq-zero optimization. "
+            "earliest_snapshot %" PRIu64
+            ", earliest_write_conflict_snapshot %" PRIu64
+            " job_snapshot %" PRIu64
+            ". timestamp_size: %d full_history_ts_low_ %s. validity %x",
+            ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+            earliest_snapshot_, earliest_write_conflict_snapshot_,
+            job_snapshot_, static_cast<int>(timestamp_size_),
+            full_history_ts_low_ != nullptr
+                ? Slice(*full_history_ts_low_).ToString(true).c_str()
+                : "null",
+            validity_info_.rep);
+        assert(false);
+      }
+      ikey_.sequence = 0;
+      last_key_seq_zeroed_ = true;
+      TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq",
+                               &ikey_);
+      if (!timestamp_size_) {
+        current_key_.UpdateInternalKey(0, ikey_.type);
+      } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) {
+        // We can also zero out timestamp for better compression.
+        // For the same user key (excluding timestamp), the timestamp-based
+        // history can be collapsed to save some space if the timestamp is
+        // older than *full_history_ts_low_.
+        const std::string kTsMin(timestamp_size_, static_cast<char>(0));
+        const Slice ts_slice = kTsMin;
+        ikey_.SetTimestamp(ts_slice);
+        current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice);
+      }
+    }
+  }
+}
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+    SequenceNumber in, SequenceNumber* prev_snapshot) {
+  assert(snapshots_->size());
+  if (snapshots_->size() == 0) {
+    ROCKS_LOG_FATAL(info_log_,
+                    "No snapshot left in findEarliestVisibleSnapshot");
+  }
+  auto snapshots_iter =
+      std::lower_bound(snapshots_->begin(), snapshots_->end(), in);
+  assert(prev_snapshot != nullptr);
+  if (snapshots_iter == snapshots_->begin()) {
+    *prev_snapshot = 0;
+  } else {
+    *prev_snapshot = *std::prev(snapshots_iter);
+    if (*prev_snapshot >= in) {
+      ROCKS_LOG_FATAL(info_log_,
+                      "*prev_snapshot (%" PRIu64 ") >= in (%" PRIu64
+                      ") in findEarliestVisibleSnapshot",
+                      *prev_snapshot, in);
+      assert(false);
+    }
+  }
+  if (snapshot_checker_ == nullptr) {
+    return snapshots_iter != snapshots_->end() ? *snapshots_iter
+                                               : kMaxSequenceNumber;
+  }
+  bool has_released_snapshot = !released_snapshots_.empty();
+  for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+    auto cur = *snapshots_iter;
+    if (in > cur) {
+      ROCKS_LOG_FATAL(info_log_,
+                      "in (%" PRIu64 ") > cur (%" PRIu64
+                      ") in findEarliestVisibleSnapshot",
+                      in, cur);
+      assert(false);
+    }
+    // Skip if cur is in released_snapshots.
+    if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+      continue;
+    }
+    auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+    if (res == SnapshotCheckerResult::kInSnapshot) {
+      return cur;
+    } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+      released_snapshots_.insert(cur);
+    }
+    *prev_snapshot = cur;
+  }
+  return kMaxSequenceNumber;
+}
+
+uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber(
+    const CompactionProxy* compaction) {
+  if (!compaction) {
+    return 0;
+  }
+
+  if (!compaction->enable_blob_garbage_collection()) {
+    return 0;
+  }
+
+  const Version* const version = compaction->input_version();
+  assert(version);
+
+  const VersionStorageInfo* const storage_info = version->storage_info();
+  assert(storage_info);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+
+  const size_t cutoff_index = static_cast<size_t>(
+      compaction->blob_garbage_collection_age_cutoff() * blob_files.size());
+
+  if (cutoff_index >= blob_files.size()) {
+    return std::numeric_limits<uint64_t>::max();
+  }
+
+  const auto& meta = blob_files[cutoff_index];
+  assert(meta);
+
+  return meta->GetBlobFileNumber();
+}
+
+std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
+    const CompactionProxy* compaction) {
+  if (!compaction) {
+    return nullptr;
+  }
+
+  const Version* const version = compaction->input_version();
+  if (!version) {
+    return nullptr;
+  }
+
+  ReadOptions read_options;
+  read_options.fill_cache = false;
+
+  return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));
+}
+
+std::unique_ptr<PrefetchBufferCollection>
+CompactionIterator::CreatePrefetchBufferCollectionIfNeeded(
+    const CompactionProxy* compaction) {
+  if (!compaction) {
+    return nullptr;
+  }
+
+  if (!compaction->input_version()) {
+    return nullptr;
+  }
+
+  if (compaction->allow_mmap_reads()) {
+    return nullptr;
+  }
+
+  const uint64_t readahead_size = compaction->blob_compaction_readahead_size();
+  if (!readahead_size) {
+    return nullptr;
+  }
+
+  return std::unique_ptr<PrefetchBufferCollection>(
+      new PrefetchBufferCollection(readahead_size));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h
new file mode 100644
index 000000000..c215d2bbb
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.h
@@ -0,0 +1,513 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileBuilder;
+class BlobFetcher;
+class PrefetchBufferCollection;
+
+// A wrapper of internal iterator whose purpose is to count how
+// many entries there are in the iterator.
+class SequenceIterWrapper : public InternalIterator {
+ public:
+  SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
+                      bool need_count_entries)
+      : icmp_(cmp),
+        inner_iter_(iter),
+        need_count_entries_(need_count_entries) {}
+  bool Valid() const override { return inner_iter_->Valid(); }
+  Status status() const override { return inner_iter_->status(); }
+  void Next() override {
+    num_itered_++;
+    inner_iter_->Next();
+  }
+  void Seek(const Slice& target) override {
+    if (!need_count_entries_) {
+      inner_iter_->Seek(target);
+    } else {
+      // For flush cases, we need to count total number of entries, so we
+      // do Next() rather than Seek().
+      while (inner_iter_->Valid() &&
+             icmp_.Compare(inner_iter_->key(), target) < 0) {
+        Next();
+      }
+    }
+  }
+  Slice key() const override { return inner_iter_->key(); }
+  Slice value() const override { return inner_iter_->value(); }
+
+  // Unused InternalIterator methods
+  void SeekToFirst() override { assert(false); }
+  void Prev() override { assert(false); }
+  void SeekForPrev(const Slice& /* target */) override { assert(false); }
+  void SeekToLast() override { assert(false); }
+
+  uint64_t num_itered() const { return num_itered_; }
+
+ private:
+  InternalKeyComparator icmp_;
+  InternalIterator* inner_iter_;  // not owned
+  uint64_t num_itered_ = 0;
+  bool need_count_entries_;
+};
+
+class CompactionIterator {
+ public:
+  // A wrapper around Compaction. Has a much smaller interface, only what
+  // CompactionIterator uses. Tests can override it.
+  class CompactionProxy {
+   public:
+    virtual ~CompactionProxy() = default;
+
+    virtual int level() const = 0;
+
+    virtual bool KeyNotExistsBeyondOutputLevel(
+        const Slice& user_key, std::vector<size_t>* level_ptrs) const = 0;
+
+    virtual bool bottommost_level() const = 0;
+
+    virtual int number_levels() const = 0;
+
+    // Result includes timestamp if user-defined timestamp is enabled.
+    virtual Slice GetLargestUserKey() const = 0;
+
+    virtual bool allow_ingest_behind() const = 0;
+
+    virtual bool allow_mmap_reads() const = 0;
+
+    virtual bool enable_blob_garbage_collection() const = 0;
+
+    virtual double blob_garbage_collection_age_cutoff() const = 0;
+
+    virtual uint64_t blob_compaction_readahead_size() const = 0;
+
+    virtual const Version* input_version() const = 0;
+
+    virtual bool DoesInputReferenceBlobFiles() const = 0;
+
+    virtual const Compaction* real_compaction() const = 0;
+
+    virtual bool SupportsPerKeyPlacement() const = 0;
+
+    // `key` includes timestamp if user-defined timestamp is enabled.
+    virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
+  };
+
+  class RealCompaction : public CompactionProxy {
+   public:
+    explicit RealCompaction(const Compaction* compaction)
+        : compaction_(compaction) {
+      assert(compaction_);
+      assert(compaction_->immutable_options());
+      assert(compaction_->mutable_cf_options());
+    }
+
+    int level() const override { return compaction_->level(); }
+
+    bool KeyNotExistsBeyondOutputLevel(
+        const Slice& user_key, std::vector<size_t>* level_ptrs) const override {
+      return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs);
+    }
+
+    bool bottommost_level() const override {
+      return compaction_->bottommost_level();
+    }
+
+    int number_levels() const override { return compaction_->number_levels(); }
+
+    // Result includes timestamp if user-defined timestamp is enabled.
+    Slice GetLargestUserKey() const override {
+      return compaction_->GetLargestUserKey();
+    }
+
+    bool allow_ingest_behind() const override {
+      return compaction_->immutable_options()->allow_ingest_behind;
+    }
+
+    bool allow_mmap_reads() const override {
+      return compaction_->immutable_options()->allow_mmap_reads;
+    }
+
+    bool enable_blob_garbage_collection() const override {
+      return compaction_->enable_blob_garbage_collection();
+    }
+
+    double blob_garbage_collection_age_cutoff() const override {
+      return compaction_->blob_garbage_collection_age_cutoff();
+    }
+
+    uint64_t blob_compaction_readahead_size() const override {
+      return compaction_->mutable_cf_options()->blob_compaction_readahead_size;
+    }
+
+    const Version* input_version() const override {
+      return compaction_->input_version();
+    }
+
+    bool DoesInputReferenceBlobFiles() const override {
+      return compaction_->DoesInputReferenceBlobFiles();
+    }
+
+    const Compaction* real_compaction() const override { return compaction_; }
+
+    bool SupportsPerKeyPlacement() const override {
+      return compaction_->SupportsPerKeyPlacement();
+    }
+
+    // Check if key is within penultimate level output range, to see if it's
+    // safe to output to the penultimate level for per_key_placement feature.
+    // `key` includes timestamp if user-defined timestamp is enabled.
+    bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+      return compaction_->WithinPenultimateLevelOutputRange(key);
+    }
+
+   private:
+    const Compaction* compaction_;
+  };
+
+  CompactionIterator(
+      InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+      SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+      SequenceNumber earliest_write_conflict_snapshot,
+      SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+      Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+      CompactionRangeDelAggregator* range_del_agg,
+      BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+      bool enforce_single_del_contracts,
+      const std::atomic<bool>& manual_compaction_canceled,
+      const Compaction* compaction = nullptr,
+      const CompactionFilter* compaction_filter = nullptr,
+      const std::atomic<bool>* shutting_down = nullptr,
+      const std::shared_ptr<Logger> info_log = nullptr,
+      const std::string* full_history_ts_low = nullptr,
+      const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+      const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+  // Constructor with custom CompactionProxy, used for tests.
+  CompactionIterator(
+      InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+      SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+      SequenceNumber earliest_write_conflict_snapshot,
+      SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+      Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+      CompactionRangeDelAggregator* range_del_agg,
+      BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+      bool enforce_single_del_contracts,
+      const std::atomic<bool>& manual_compaction_canceled,
+      std::unique_ptr<CompactionProxy> compaction,
+      const CompactionFilter* compaction_filter = nullptr,
+      const std::atomic<bool>* shutting_down = nullptr,
+      const std::shared_ptr<Logger> info_log = nullptr,
+      const std::string* full_history_ts_low = nullptr,
+      const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+      const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+  ~CompactionIterator();
+
+  void ResetRecordCounts();
+
+  // Seek to the beginning of the compaction iterator output.
+  //
+  // REQUIRED: Call only once.
+  void SeekToFirst();
+
+  // Produces the next record in the compaction.
+  //
+  // REQUIRED: SeekToFirst() has been called.
+  void Next();
+
+  // Getters
+  const Slice& key() const { return key_; }
+  const Slice& value() const { return value_; }
+  const Status& status() const { return status_; }
+  const ParsedInternalKey& ikey() const { return ikey_; }
+  inline bool Valid() const { return validity_info_.IsValid(); }
+  const Slice& user_key() const { return current_user_key_; }
+  const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+  uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
+  // If the current key should be placed on penultimate level, only valid if
+  // per_key_placement is supported
+  bool output_to_penultimate_level() const {
+    return output_to_penultimate_level_;
+  }
+  Status InputStatus() const { return input_.status(); }
+
+ private:
+  // Processes the input stream to find the next output
+  void NextFromInput();
+
+  // Do final preparations before presenting the output to the callee.
+  void PrepareOutput();
+
+  // Decide the current key should be output to the last level or penultimate
+  // level, only call for compaction supports per key placement
+  void DecideOutputLevel();
+
+  // Passes the output value to the blob file builder (if any), and replaces it
+  // with the corresponding blob reference if it has been actually written to a
+  // blob file (i.e. if it passed the value size check). Returns true if the
+  // value got extracted to a blob file, false otherwise.
+  bool ExtractLargeValueIfNeededImpl();
+
+  // Extracts large values as described above, and updates the internal key's
+  // type to kTypeBlobIndex if the value got extracted. Should only be called
+  // for regular values (kTypeValue).
+  void ExtractLargeValueIfNeeded();
+
+  // Relocates valid blobs residing in the oldest blob files if garbage
+  // collection is enabled. Relocated blobs are written to new blob files or
+  // inlined in the LSM tree depending on the current settings (i.e.
+  // enable_blob_files and min_blob_size). Should only be called for blob
+  // references (kTypeBlobIndex).
+  //
+  // Note: the stacked BlobDB implementation's compaction filter based GC
+  // algorithm is also called from here.
+  void GarbageCollectBlobIfNeeded();
+
+  // Invoke compaction filter if needed.
+  // Return true on success, false on failures (e.g.: kIOError).
+  bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
+
+  // Given a sequence number, return the sequence number of the
+  // earliest snapshot that this sequence number is visible in.
+  // The snapshots themselves are arranged in ascending order of
+  // sequence numbers.
+  // Employ a sequential search because the total number of
+  // snapshots are typically small.
+  inline SequenceNumber findEarliestVisibleSnapshot(
+      SequenceNumber in, SequenceNumber* prev_snapshot);
+
+  inline bool KeyCommitted(SequenceNumber sequence) {
+    return snapshot_checker_ == nullptr ||
+           snapshot_checker_->CheckInSnapshot(sequence, job_snapshot_) ==
+               SnapshotCheckerResult::kInSnapshot;
+  }
+
+  bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+  bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+  // Extract user-defined timestamp from user key if possible and compare it
+  // with *full_history_ts_low_ if applicable.
+  inline void UpdateTimestampAndCompareWithFullHistoryLow() {
+    if (!timestamp_size_) {
+      return;
+    }
+    Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_);
+    curr_ts_.assign(ts.data(), ts.size());
+    if (full_history_ts_low_) {
+      cmp_with_history_ts_low_ =
+          cmp_->CompareTimestamp(ts, *full_history_ts_low_);
+    }
+  }
+
+  static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber(
+      const CompactionProxy* compaction);
+  static std::unique_ptr<BlobFetcher> CreateBlobFetcherIfNeeded(
+      const CompactionProxy* compaction);
+  static std::unique_ptr<PrefetchBufferCollection>
+  CreatePrefetchBufferCollectionIfNeeded(const CompactionProxy* compaction);
+
+  SequenceIterWrapper input_;
+  const Comparator* cmp_;
+  MergeHelper* merge_helper_;
+  const std::vector<SequenceNumber>* snapshots_;
+  // List of snapshots released during compaction.
+  // findEarliestVisibleSnapshot() find them out from return of
+  // snapshot_checker, and make sure they will not be returned as
+  // earliest visible snapshot of an older value.
+  // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3.
+  std::unordered_set<SequenceNumber> released_snapshots_;
+  const SequenceNumber earliest_write_conflict_snapshot_;
+  const SequenceNumber job_snapshot_;
+  const SnapshotChecker* const snapshot_checker_;
+  Env* env_;
+  SystemClock* clock_;
+  const bool report_detailed_time_;
+  const bool expect_valid_internal_key_;
+  CompactionRangeDelAggregator* range_del_agg_;
+  BlobFileBuilder* blob_file_builder_;
+  std::unique_ptr<CompactionProxy> compaction_;
+  const CompactionFilter* compaction_filter_;
+  const std::atomic<bool>* shutting_down_;
+  const std::atomic<bool>& manual_compaction_canceled_;
+  const bool bottommost_level_;
+  const bool visible_at_tip_;
+  const SequenceNumber earliest_snapshot_;
+
+  std::shared_ptr<Logger> info_log_;
+
+  const bool allow_data_in_errors_;
+
+  const bool enforce_single_del_contracts_;
+
+  // Comes from comparator.
+  const size_t timestamp_size_;
+
+  // Lower bound timestamp to retain full history in terms of user-defined
+  // timestamp. If a key's timestamp is older than full_history_ts_low_, then
+  // the key *may* be eligible for garbage collection (GC). The skipping logic
+  // is in `NextFromInput()` and `PrepareOutput()`.
+  // If nullptr, NO GC will be performed and all history will be preserved.
+  const std::string* const full_history_ts_low_;
+
+  // State
+  //
+  enum ValidContext : uint8_t {
+    kMerge1 = 0,
+    kMerge2 = 1,
+    kParseKeyError = 2,
+    kCurrentKeyUncommitted = 3,
+    kKeepSDAndClearPut = 4,
+    kKeepTsHistory = 5,
+    kKeepSDForConflictCheck = 6,
+    kKeepSDForSnapshot = 7,
+    kKeepSD = 8,
+    kKeepDel = 9,
+    kNewUserKey = 10,
+  };
+
+  struct ValidityInfo {
+    inline bool IsValid() const { return rep & 1; }
+    ValidContext GetContext() const {
+      return static_cast<ValidContext>(rep >> 1);
+    }
+    inline void SetValid(uint8_t ctx) { rep = (ctx << 1) | 1; }
+    inline void Invalidate() { rep = 0; }
+
+    uint8_t rep{0};
+  } validity_info_;
+
+  // Points to a copy of the current compaction iterator output (current_key_)
+  // if valid.
+  Slice key_;
+  // Points to the value in the underlying iterator that corresponds to the
+  // current output.
+  Slice value_;
+  // The status is OK unless compaction iterator encounters a merge operand
+  // while not having a merge operator defined.
+  Status status_;
+  // Stores the user key, sequence number and type of the current compaction
+  // iterator output (or current key in the underlying iterator during
+  // NextFromInput()).
+  ParsedInternalKey ikey_;
+  // Stores whether ikey_.user_key is valid. If set to false, the user key is
+  // not compared against the current key in the underlying iterator.
+  bool has_current_user_key_ = false;
+  // If false, the iterator holds a copy of the current compaction iterator
+  // output (or current key in the underlying iterator during NextFromInput()).
+  bool at_next_ = false;
+
+  IterKey current_key_;
+  Slice current_user_key_;
+  std::string curr_ts_;
+  SequenceNumber current_user_key_sequence_;
+  SequenceNumber current_user_key_snapshot_;
+
+  // True if the iterator has already returned a record for the current key.
+  bool has_outputted_key_ = false;
+
+  // truncated the value of the next key and output it without applying any
+  // compaction rules.  This is used for outputting a put after a single delete.
+  bool clear_and_output_next_key_ = false;
+
+  MergeOutputIterator merge_out_iter_;
+  // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
+  // merge operands and then releasing them after consuming them.
+  PinnedIteratorsManager pinned_iters_mgr_;
+
+  uint64_t blob_garbage_collection_cutoff_file_number_;
+
+  std::unique_ptr<BlobFetcher> blob_fetcher_;
+  std::unique_ptr<PrefetchBufferCollection> prefetch_buffers_;
+
+  std::string blob_index_;
+  PinnableSlice blob_value_;
+  std::string compaction_filter_value_;
+  InternalKey compaction_filter_skip_until_;
+  // "level_ptrs" holds indices that remember which file of an associated
+  // level we were last checking during the last call to compaction->
+  // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
+  // to pick off where it left off since each subcompaction's key range is
+  // increasing so a later call to the function must be looking for a key that
+  // is in or beyond the last file checked during the previous call
+  std::vector<size_t> level_ptrs_;
+  CompactionIterationStats iter_stats_;
+
+  // Used to avoid purging uncommitted values. The application can specify
+  // uncommitted values by providing a SnapshotChecker object.
+  bool current_key_committed_;
+
+  // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_)
+  int cmp_with_history_ts_low_;
+
+  const int level_;
+
+  // True if the previous internal key (same user key)'s sequence number has
+  // just been zeroed out during bottommost compaction.
+  bool last_key_seq_zeroed_{false};
+
+  // True if the current key should be output to the penultimate level if
+  // possible, compaction logic makes the final decision on which level to
+  // output to.
+  bool output_to_penultimate_level_{false};
+
+  // min seqno for preserving the time information.
+  const SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+  // min seqno to preclude the data from the last level, if the key seqno larger
+  // than this, it will be output to penultimate level
+  const SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+  void AdvanceInputIter() { input_.Next(); }
+
+  void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
+
+  bool IsShuttingDown() {
+    // This is a best-effort facility, so memory_order_relaxed is sufficient.
+    return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+  }
+
+  bool IsPausingManualCompaction() {
+    // This is a best-effort facility, so memory_order_relaxed is sufficient.
+    return manual_compaction_canceled_.load(std::memory_order_relaxed);
+  }
+};
+
+inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq,
+                                                     SequenceNumber snapshot) {
+  return ((seq) <= (snapshot) &&
+          (snapshot_checker_ == nullptr ||
+           LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+                  SnapshotCheckerResult::kInSnapshot)));
+}
+
+inline bool CompactionIterator::DefinitelyNotInSnapshot(
+    SequenceNumber seq, SequenceNumber snapshot) {
+  return ((seq) > (snapshot) ||
+          (snapshot_checker_ != nullptr &&
+           UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+                    SnapshotCheckerResult::kNotInSnapshot)));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc
new file mode 100644
index 000000000..81362d792
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc
@@ -0,0 +1,1618 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "util/vector_iterator.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Expects no merging attempts.
+class NoMergingMergeOp : public MergeOperator {
+ public:
+  bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                   MergeOperationOutput* /*merge_out*/) const override {
+    ADD_FAILURE();
+    return false;
+  }
+  bool PartialMergeMulti(const Slice& /*key*/,
+                         const std::deque<Slice>& /*operand_list*/,
+                         std::string* /*new_value*/,
+                         Logger* /*logger*/) const override {
+    ADD_FAILURE();
+    return false;
+  }
+  const char* Name() const override {
+    return "CompactionIteratorTest NoMergingMergeOp";
+  }
+};
+
+// Compaction filter that gets stuck when it sees a particular key,
+// then gets unstuck when told to.
+// Always returns Decision::kRemove.
+class StallingFilter : public CompactionFilter {
+ public:
+  Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/,
+                    const Slice& /*existing_value*/, std::string* /*new_value*/,
+                    std::string* /*skip_until*/) const override {
+    int k = std::atoi(key.ToString().c_str());
+    last_seen.store(k);
+    while (k >= stall_at.load()) {
+      std::this_thread::yield();
+    }
+    return Decision::kRemove;
+  }
+
+  const char* Name() const override {
+    return "CompactionIteratorTest StallingFilter";
+  }
+
+  // Wait until the filter sees a key >= k and stalls at that key.
+  // If `exact`, asserts that the seen key is equal to k.
+  void WaitForStall(int k, bool exact = true) {
+    stall_at.store(k);
+    while (last_seen.load() < k) {
+      std::this_thread::yield();
+    }
+    if (exact) {
+      EXPECT_EQ(k, last_seen.load());
+    }
+  }
+
+  // Filter will stall on key >= stall_at. Advance stall_at to unstall.
+  mutable std::atomic<int> stall_at{0};
+  // Last key the filter was called with.
+  mutable std::atomic<int> last_seen{0};
+};
+
+// Compaction filter that filter out all keys.
+class FilterAllKeysCompactionFilter : public CompactionFilter {
+ public:
+  Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/,
+                    const Slice& /*existing_value*/, std::string* /*new_value*/,
+                    std::string* /*skip_until*/) const override {
+    return Decision::kRemove;
+  }
+
+  const char* Name() const override { return "AllKeysCompactionFilter"; }
+};
+
+class LoggingForwardVectorIterator : public VectorIterator {
+ public:
+  struct Action {
+    enum class Type {
+      SEEK_TO_FIRST,
+      SEEK,
+      NEXT,
+    };
+
+    Type type;
+    std::string arg;
+
+    explicit Action(Type _type, std::string _arg = "")
+        : type(_type), arg(_arg) {}
+
+    bool operator==(const Action& rhs) const {
+      return std::tie(type, arg) == std::tie(rhs.type, rhs.arg);
+    }
+  };
+
+  LoggingForwardVectorIterator(const std::vector<std::string>& keys,
+                               const std::vector<std::string>& values)
+      : VectorIterator(keys, values) {
+    current_ = keys_.size();
+  }
+
+  void SeekToFirst() override {
+    log.emplace_back(Action::Type::SEEK_TO_FIRST);
+    VectorIterator::SeekToFirst();
+  }
+  void SeekToLast() override { assert(false); }
+
+  void Seek(const Slice& target) override {
+    log.emplace_back(Action::Type::SEEK, target.ToString());
+    VectorIterator::Seek(target);
+  }
+
+  void SeekForPrev(const Slice& /*target*/) override { assert(false); }
+
+  void Next() override {
+    assert(Valid());
+    log.emplace_back(Action::Type::NEXT);
+    VectorIterator::Next();
+  }
+  void Prev() override { assert(false); }
+
+  Slice key() const override {
+    assert(Valid());
+    return VectorIterator::key();
+  }
+  Slice value() const override {
+    assert(Valid());
+    return VectorIterator::value();
+  }
+
+  std::vector<Action> log;
+};
+
+class FakeCompaction : public CompactionIterator::CompactionProxy {
+ public:
+  int level() const override { return 0; }
+
+  bool KeyNotExistsBeyondOutputLevel(
+      const Slice& /*user_key*/,
+      std::vector<size_t>* /*level_ptrs*/) const override {
+    return is_bottommost_level || key_not_exists_beyond_output_level;
+  }
+
+  bool bottommost_level() const override { return is_bottommost_level; }
+
+  int number_levels() const override { return 1; }
+
+  Slice GetLargestUserKey() const override {
+    return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+  }
+
+  bool allow_ingest_behind() const override { return is_allow_ingest_behind; }
+
+  bool allow_mmap_reads() const override { return false; }
+
+  bool enable_blob_garbage_collection() const override { return false; }
+
+  double blob_garbage_collection_age_cutoff() const override { return 0.0; }
+
+  uint64_t blob_compaction_readahead_size() const override { return 0; }
+
+  const Version* input_version() const override { return nullptr; }
+
+  bool DoesInputReferenceBlobFiles() const override { return false; }
+
+  const Compaction* real_compaction() const override { return nullptr; }
+
+  bool SupportsPerKeyPlacement() const override {
+    return supports_per_key_placement;
+  }
+
+  bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+    return (!key.starts_with("unsafe_pb"));
+  }
+
+  bool key_not_exists_beyond_output_level = false;
+
+  bool is_bottommost_level = false;
+
+  bool is_allow_ingest_behind = false;
+
+  bool supports_per_key_placement = false;
+};
+
+// A simplified snapshot checker which assumes each snapshot has a global
+// last visible sequence.
+class TestSnapshotChecker : public SnapshotChecker {
+ public:
+  explicit TestSnapshotChecker(
+      SequenceNumber last_committed_sequence,
+      const std::unordered_map<SequenceNumber, SequenceNumber>& snapshots =
+          {{}})
+      : last_committed_sequence_(last_committed_sequence),
+        snapshots_(snapshots) {}
+
+  SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+    if (snapshot_seq == kMaxSequenceNumber) {
+      return seq <= last_committed_sequence_
+                 ? SnapshotCheckerResult::kInSnapshot
+                 : SnapshotCheckerResult::kNotInSnapshot;
+    }
+    assert(snapshots_.count(snapshot_seq) > 0);
+    return seq <= snapshots_.at(snapshot_seq)
+               ? SnapshotCheckerResult::kInSnapshot
+               : SnapshotCheckerResult::kNotInSnapshot;
+  }
+
+ private:
+  SequenceNumber last_committed_sequence_;
+  // A map of valid snapshot to last visible sequence to the snapshot.
+  std::unordered_map<SequenceNumber, SequenceNumber> snapshots_;
+};
+
+// Test param:
+//   bool: whether to pass snapshot_checker to compaction iterator.
+class CompactionIteratorTest : public testing::TestWithParam<bool> {
+ public:
+  CompactionIteratorTest()
+      : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {}
+
+  explicit CompactionIteratorTest(const Comparator* ucmp)
+      : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {}
+
+  void InitIterators(
+      const std::vector<std::string>& ks, const std::vector<std::string>& vs,
+      const std::vector<std::string>& range_del_ks,
+      const std::vector<std::string>& range_del_vs,
+      SequenceNumber last_sequence,
+      SequenceNumber last_committed_sequence = kMaxSequenceNumber,
+      MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr,
+      bool bottommost_level = false,
+      SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+      bool key_not_exists_beyond_output_level = false,
+      const std::string* full_history_ts_low = nullptr) {
+    std::unique_ptr<InternalIterator> unfragmented_range_del_iter(
+        new VectorIterator(range_del_ks, range_del_vs, &icmp_));
+    auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>(
+        std::move(unfragmented_range_del_iter), icmp_);
+    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+        new FragmentedRangeTombstoneIterator(tombstone_list, icmp_,
+                                             kMaxSequenceNumber));
+    range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_));
+    range_del_agg_->AddTombstones(std::move(range_del_iter));
+
+    std::unique_ptr<CompactionIterator::CompactionProxy> compaction;
+    if (filter || bottommost_level || key_not_exists_beyond_output_level) {
+      compaction_proxy_ = new FakeCompaction();
+      compaction_proxy_->is_bottommost_level = bottommost_level;
+      compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind();
+      compaction_proxy_->key_not_exists_beyond_output_level =
+          key_not_exists_beyond_output_level;
+      compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement();
+      compaction.reset(compaction_proxy_);
+    }
+    bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
+    if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) {
+      snapshot_checker_.reset(
+          new TestSnapshotChecker(last_committed_sequence, snapshot_map_));
+    }
+    merge_helper_.reset(
+        new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false,
+                        0 /*latest_snapshot*/, snapshot_checker_.get(),
+                        0 /*level*/, nullptr /*statistics*/, &shutting_down_));
+
+    if (c_iter_) {
+      // Since iter_ is still used in ~CompactionIterator(), we call
+      // ~CompactionIterator() first.
+      c_iter_.reset();
+    }
+    iter_.reset(new LoggingForwardVectorIterator(ks, vs));
+    iter_->SeekToFirst();
+    c_iter_.reset(new CompactionIterator(
+        iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_,
+        earliest_write_conflict_snapshot, kMaxSequenceNumber,
+        snapshot_checker_.get(), Env::Default(),
+        false /* report_detailed_time */, false, range_del_agg_.get(),
+        nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
+        true /*enforce_single_del_contracts*/,
+        /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
+        std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr,
+        full_history_ts_low));
+  }
+
+  void AddSnapshot(SequenceNumber snapshot,
+                   SequenceNumber last_visible_seq = kMaxSequenceNumber) {
+    snapshots_.push_back(snapshot);
+    snapshot_map_[snapshot] = last_visible_seq;
+  }
+
+  virtual bool UseSnapshotChecker() const { return false; }
+
+  virtual bool AllowIngestBehind() const { return false; }
+
+  virtual bool SupportsPerKeyPlacement() const { return false; }
+
+  void RunTest(
+      const std::vector<std::string>& input_keys,
+      const std::vector<std::string>& input_values,
+      const std::vector<std::string>& expected_keys,
+      const std::vector<std::string>& expected_values,
+      SequenceNumber last_committed_seq = kMaxSequenceNumber,
+      MergeOperator* merge_operator = nullptr,
+      CompactionFilter* compaction_filter = nullptr,
+      bool bottommost_level = false,
+      SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+      bool key_not_exists_beyond_output_level = false,
+      const std::string* full_history_ts_low = nullptr) {
+    InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber,
+                  last_committed_seq, merge_operator, compaction_filter,
+                  bottommost_level, earliest_write_conflict_snapshot,
+                  key_not_exists_beyond_output_level, full_history_ts_low);
+    c_iter_->SeekToFirst();
+    for (size_t i = 0; i < expected_keys.size(); i++) {
+      std::string info = "i = " + std::to_string(i);
+      ASSERT_TRUE(c_iter_->Valid()) << info;
+      ASSERT_OK(c_iter_->status()) << info;
+      ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info;
+      ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info;
+      c_iter_->Next();
+    }
+    ASSERT_OK(c_iter_->status());
+    ASSERT_FALSE(c_iter_->Valid());
+  }
+
+  void ClearSnapshots() {
+    snapshots_.clear();
+    snapshot_map_.clear();
+  }
+
+  const Comparator* cmp_;
+  const InternalKeyComparator icmp_;
+  std::vector<SequenceNumber> snapshots_;
+  // A map of valid snapshot to last visible sequence to the snapshot.
+  std::unordered_map<SequenceNumber, SequenceNumber> snapshot_map_;
+  std::unique_ptr<MergeHelper> merge_helper_;
+  std::unique_ptr<LoggingForwardVectorIterator> iter_;
+  std::unique_ptr<CompactionIterator> c_iter_;
+  std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
+  std::unique_ptr<SnapshotChecker> snapshot_checker_;
+  std::atomic<bool> shutting_down_{false};
+  const std::atomic<bool> kManualCompactionCanceledFalse_{false};
+  FakeCompaction* compaction_proxy_;
+};
+
+// It is possible that the output of the compaction iterator is empty even if
+// the input is not.
+TEST_P(CompactionIteratorTest, EmptyResult) {
+  InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+                 test::KeyStr("a", 3, kTypeValue)},
+                {"", "val"}, {}, {}, 5);
+  c_iter_->SeekToFirst();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+// If there is a corruption after a single deletion, the corrupted key should
+// be preserved.
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
+  InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+                 test::KeyStr("a", 3, kTypeValue, true),
+                 test::KeyStr("b", 10, kTypeValue)},
+                {"", "val", "val2"}, {}, {}, 10);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
+            c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, SimpleRangeDeletion) {
+  InitIterators({test::KeyStr("morning", 5, kTypeValue),
+                 test::KeyStr("morning", 2, kTypeValue),
+                 test::KeyStr("night", 3, kTypeValue)},
+                {"zao", "zao", "wan"},
+                {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) {
+  AddSnapshot(10);
+  std::vector<std::string> ks1;
+  ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion));
+  std::vector<std::string> vs1{"mz"};
+  std::vector<std::string> ks2{test::KeyStr("morning", 15, kTypeValue),
+                               test::KeyStr("morning", 5, kTypeValue),
+                               test::KeyStr("night", 40, kTypeValue),
+                               test::KeyStr("night", 20, kTypeValue)};
+  std::vector<std::string> vs2{"zao 15", "zao 5", "wan 40", "wan 20"};
+  InitIterators(ks2, vs2, ks1, vs1, 40);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) {
+  class Filter : public CompactionFilter {
+    Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+                      const Slice& existing_value, std::string* /*new_value*/,
+                      std::string* skip_until) const override {
+      std::string k = key.ToString();
+      std::string v = existing_value.ToString();
+      // See InitIterators() call below for the sequence of keys and their
+      // filtering decisions. Here we closely assert that compaction filter is
+      // called with the expected keys and only them, and with the right values.
+      if (k == "a") {
+        EXPECT_EQ(ValueType::kValue, t);
+        EXPECT_EQ("av50", v);
+        return Decision::kKeep;
+      }
+      if (k == "b") {
+        EXPECT_EQ(ValueType::kValue, t);
+        EXPECT_EQ("bv60", v);
+        *skip_until = "d+";
+        return Decision::kRemoveAndSkipUntil;
+      }
+      if (k == "e") {
+        EXPECT_EQ(ValueType::kMergeOperand, t);
+        EXPECT_EQ("em71", v);
+        return Decision::kKeep;
+      }
+      if (k == "f") {
+        if (v == "fm65") {
+          EXPECT_EQ(ValueType::kMergeOperand, t);
+          *skip_until = "f";
+        } else {
+          EXPECT_EQ("fm30", v);
+          EXPECT_EQ(ValueType::kMergeOperand, t);
+          *skip_until = "g+";
+        }
+        return Decision::kRemoveAndSkipUntil;
+      }
+      if (k == "h") {
+        EXPECT_EQ(ValueType::kValue, t);
+        EXPECT_EQ("hv91", v);
+        return Decision::kKeep;
+      }
+      if (k == "i") {
+        EXPECT_EQ(ValueType::kMergeOperand, t);
+        EXPECT_EQ("im95", v);
+        *skip_until = "z";
+        return Decision::kRemoveAndSkipUntil;
+      }
+      ADD_FAILURE();
+      return Decision::kKeep;
+    }
+
+    const char* Name() const override {
+      return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter";
+    }
+  };
+
+  NoMergingMergeOp merge_op;
+  Filter filter;
+  InitIterators(
+      {test::KeyStr("a", 50, kTypeValue),  // keep
+       test::KeyStr("a", 45, kTypeMerge),
+       test::KeyStr("b", 60, kTypeValue),  // skip to "d+"
+       test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue),
+       test::KeyStr("d", 70, kTypeMerge),
+       test::KeyStr("e", 71, kTypeMerge),  // keep
+       test::KeyStr("f", 65, kTypeMerge),  // skip to "f", aka keep
+       test::KeyStr("f", 30, kTypeMerge),  // skip to "g+"
+       test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue),
+       test::KeyStr("h", 91, kTypeValue),  // keep
+       test::KeyStr("i", 95, kTypeMerge),  // skip to "z"
+       test::KeyStr("j", 99, kTypeValue)},
+      {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30",
+       "fv25", "gv90", "hv91", "im95", "jv99"},
+      {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter);
+
+  // Compaction should output just "a", "e" and "h" keys.
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString());
+  ASSERT_EQ("av50", c_iter_->value().ToString());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString());
+  ASSERT_EQ("em71", c_iter_->value().ToString());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString());
+  ASSERT_EQ("hv91", c_iter_->value().ToString());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+
+  // Check that the compaction iterator did the correct sequence of calls on
+  // the underlying iterator.
+  using A = LoggingForwardVectorIterator::Action;
+  using T = A::Type;
+  std::vector<A> expected_actions = {
+      A(T::SEEK_TO_FIRST),
+      A(T::NEXT),
+      A(T::NEXT),
+      A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)),
+      A(T::NEXT),
+      A(T::NEXT),
+      A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)),
+      A(T::NEXT),
+      A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))};
+  ASSERT_EQ(expected_actions, iter_->log);
+}
+
+TEST_P(CompactionIteratorTest, ShuttingDownInFilter) {
+  NoMergingMergeOp merge_op;
+  StallingFilter filter;
+  InitIterators(
+      {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue),
+       test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)},
+      {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+      &merge_op, &filter);
+  // Don't leave tombstones (kTypeDeletion) for filtered keys.
+  compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+  std::atomic<bool> seek_done{false};
+  ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+    c_iter_->SeekToFirst();
+    EXPECT_FALSE(c_iter_->Valid());
+    EXPECT_TRUE(c_iter_->status().IsShutdownInProgress());
+    seek_done.store(true);
+  });
+
+  // Let key 1 through.
+  filter.WaitForStall(1);
+
+  // Shutdown during compaction filter call for key 2.
+  filter.WaitForStall(2);
+  shutting_down_.store(true);
+  EXPECT_FALSE(seek_done.load());
+
+  // Unstall filter and wait for SeekToFirst() to return.
+  filter.stall_at.store(3);
+  compaction_thread.join();
+  assert(seek_done.load());
+
+  // Check that filter was never called again.
+  EXPECT_EQ(2, filter.last_seen.load());
+}
+
+// Same as ShuttingDownInFilter, but shutdown happens during filter call for
+// a merge operand, not for a value.
+TEST_P(CompactionIteratorTest, ShuttingDownInMerge) {
+  NoMergingMergeOp merge_op;
+  StallingFilter filter;
+  InitIterators(
+      {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge),
+       test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)},
+      {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+      &merge_op, &filter);
+  compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+  std::atomic<bool> seek_done{false};
+  ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+    c_iter_->SeekToFirst();
+    ASSERT_FALSE(c_iter_->Valid());
+    ASSERT_TRUE(c_iter_->status().IsShutdownInProgress());
+    seek_done.store(true);
+  });
+
+  // Let key 1 through.
+  filter.WaitForStall(1);
+
+  // Shutdown during compaction filter call for key 2.
+  filter.WaitForStall(2);
+  shutting_down_.store(true);
+  EXPECT_FALSE(seek_done.load());
+
+  // Unstall filter and wait for SeekToFirst() to return.
+  filter.stall_at.store(3);
+  compaction_thread.join();
+  assert(seek_done.load());
+
+  // Check that filter was never called again.
+  EXPECT_EQ(2, filter.last_seen.load());
+}
+
+TEST_P(CompactionIteratorTest, SingleMergeOperand) {
+  class Filter : public CompactionFilter {
+    Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+                      const Slice& existing_value, std::string* /*new_value*/,
+                      std::string* /*skip_until*/) const override {
+      std::string k = key.ToString();
+      std::string v = existing_value.ToString();
+
+      // See InitIterators() call below for the sequence of keys and their
+      // filtering decisions. Here we closely assert that compaction filter is
+      // called with the expected keys and only them, and with the right values.
+      if (k == "a") {
+        EXPECT_EQ(ValueType::kMergeOperand, t);
+        EXPECT_EQ("av1", v);
+        return Decision::kKeep;
+      } else if (k == "b") {
+        EXPECT_EQ(ValueType::kMergeOperand, t);
+        return Decision::kKeep;
+      } else if (k == "c") {
+        return Decision::kKeep;
+      }
+
+      ADD_FAILURE();
+      return Decision::kKeep;
+    }
+
+    const char* Name() const override {
+      return "CompactionIteratorTest.SingleMergeOperand::Filter";
+    }
+  };
+
+  class SingleMergeOp : public MergeOperator {
+   public:
+    bool FullMergeV2(const MergeOperationInput& merge_in,
+                     MergeOperationOutput* merge_out) const override {
+      // See InitIterators() call below for why "c" is the only key for which
+      // FullMergeV2 should be called.
+      EXPECT_EQ("c", merge_in.key.ToString());
+
+      std::string temp_value;
+      if (merge_in.existing_value != nullptr) {
+        temp_value = merge_in.existing_value->ToString();
+      }
+
+      for (auto& operand : merge_in.operand_list) {
+        temp_value.append(operand.ToString());
+      }
+      merge_out->new_value = temp_value;
+
+      return true;
+    }
+
+    bool PartialMergeMulti(const Slice& key,
+                           const std::deque<Slice>& operand_list,
+                           std::string* new_value,
+                           Logger* /*logger*/) const override {
+      std::string string_key = key.ToString();
+      EXPECT_TRUE(string_key == "a" || string_key == "b");
+
+      if (string_key == "a") {
+        EXPECT_EQ(1, operand_list.size());
+      } else if (string_key == "b") {
+        EXPECT_EQ(2, operand_list.size());
+      }
+
+      std::string temp_value;
+      for (auto& operand : operand_list) {
+        temp_value.append(operand.ToString());
+      }
+      swap(temp_value, *new_value);
+
+      return true;
+    }
+
+    const char* Name() const override {
+      return "CompactionIteratorTest SingleMergeOp";
+    }
+
+    bool AllowSingleOperand() const override { return true; }
+  };
+
+  SingleMergeOp merge_op;
+  Filter filter;
+  InitIterators(
+      // a should invoke PartialMergeMulti with a single merge operand.
+      {test::KeyStr("a", 50, kTypeMerge),
+       // b should invoke PartialMergeMulti with two operands.
+       test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge),
+       // c should invoke FullMerge due to kTypeValue at the beginning.
+       test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)},
+      {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber,
+      kMaxSequenceNumber, &merge_op, &filter);
+
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString());
+  ASSERT_EQ("av1", c_iter_->value().ToString());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ("bv1bv2", c_iter_->value().ToString());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_EQ("cv1cv2", c_iter_->value().ToString());
+}
+
+// In bottommost level, values earlier than earliest snapshot can be output
+// with sequence = 0.
+TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) {
+  AddSnapshot(1);
+  RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+          {"v1", "v2"},
+          {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+          {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/,
+          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+// In bottommost level, deletions earlier than earliest snapshot can be removed
+// permanently.
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) {
+  AddSnapshot(1);
+  RunTest(
+      {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion),
+       test::KeyStr("b", 1, kTypeValue)},
+      {"", "", ""},
+      {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)},
+      {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+      nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+      true /*bottommost_level*/);
+}
+
+// In bottommost level, single deletions earlier than earliest snapshot can be
+// removed permanently.
+TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) {
+  AddSnapshot(1);
+  RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+           test::KeyStr("b", 2, kTypeSingleDeletion)},
+          {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""},
+          kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/,
+          nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+           test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+          {"a4", "a3", "a2", "b1"},
+          {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 0, kTypeValue)},
+          {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+          merge_op.get(), nullptr /*compaction_filter*/,
+          true /*bottomost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
+                        testing::Values(true, false));
+
+class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
+ public:
+  bool SupportsPerKeyPlacement() const override { return true; }
+};
+
+TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  latest_cold_seq = 5;
+
+  InitIterators(
+      {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
+       test::KeyStr("c", 5, kTypeValue)},
+      {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+      nullptr, nullptr, true);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+
+  // the first 2 keys are hot, which should has
+  // `output_to_penultimate_level()==true` and seq num not zeroed out
+  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  // `a` is cold data, which should be output to bottommost
+  ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString());
+  ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
+  AddSnapshot(5);
+
+  InitIterators(
+      {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
+       test::KeyStr("b", 5, kTypeValue)},
+      {"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+      nullptr, nullptr, true);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+
+  // The first key and the tombstone are within snapshot, which should output
+  // to the penultimate level (and seq num cannot be zeroed out).
+  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  // `a` is not protected by the snapshot, the sequence number is zero out and
+  // should output bottommost
+  ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
+  ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+  c_iter_->Next();
+  ASSERT_OK(c_iter_->status());
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  latest_cold_seq = 6;
+
+  AddSnapshot(5);
+
+  InitIterators({test::KeyStr("a", 7, kTypeValue),
+                 test::KeyStr("unsafe_pb", 6, kTypeValue),
+                 test::KeyStr("c", 5, kTypeValue)},
+                {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
+                kMaxSequenceNumber, nullptr, nullptr, true);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+
+  ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+  ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+  // the 2nd key is unsafe to output_to_penultimate_level, but it's within
+  // snapshot so for per_key_placement feature it has to be outputted to the
+  // penultimate level. which is a corruption. We should never see
+  // such case as the data with seq num (within snapshot) should always come
+  // from higher compaction input level, which makes it safe to
+  // output_to_penultimate_level.
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->status().IsCorruption());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
+                        PerKeyPlacementCompIteratorTest,
+                        testing::Values(true, false));
+
+// Tests how CompactionIterator work together with SnapshotChecker.
+class CompactionIteratorWithSnapshotCheckerTest
+    : public CompactionIteratorTest {
+ public:
+  bool UseSnapshotChecker() const override { return true; }
+};
+
+// Uncommitted keys (keys with seq > last_committed_seq) should be output as-is
+// while committed version of these keys should get compacted as usual.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       PreserveUncommittedKeys_Value) {
+  RunTest(
+      {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue),
+       test::KeyStr("foo", 1, kTypeValue)},
+      {"v3", "v2", "v1"},
+      {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)},
+      {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       PreserveUncommittedKeys_Deletion) {
+  RunTest({test::KeyStr("foo", 2, kTypeDeletion),
+           test::KeyStr("foo", 1, kTypeValue)},
+          {"", "v1"},
+          {test::KeyStr("foo", 2, kTypeDeletion),
+           test::KeyStr("foo", 1, kTypeValue)},
+          {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       PreserveUncommittedKeys_Merge) {
+  auto merge_op = MergeOperators::CreateStringAppendOperator();
+  RunTest(
+      {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+       test::KeyStr("foo", 1, kTypeValue)},
+      {"v3", "v2", "v1"},
+      {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)},
+      {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       PreserveUncommittedKeys_SingleDelete) {
+  RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion),
+           test::KeyStr("foo", 1, kTypeValue)},
+          {"", "v1"},
+          {test::KeyStr("foo", 2, kTypeSingleDeletion),
+           test::KeyStr("foo", 1, kTypeValue)},
+          {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       PreserveUncommittedKeys_BlobIndex) {
+  RunTest({test::KeyStr("foo", 3, kTypeBlobIndex),
+           test::KeyStr("foo", 2, kTypeBlobIndex),
+           test::KeyStr("foo", 1, kTypeBlobIndex)},
+          {"v3", "v2", "v1"},
+          {test::KeyStr("foo", 3, kTypeBlobIndex),
+           test::KeyStr("foo", 2, kTypeBlobIndex)},
+          {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+// Test compaction iterator dedup keys visible to the same snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) {
+  AddSnapshot(2, 1);
+  RunTest(
+      {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+       test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+      {"v4", "v3", "v2", "v1"},
+      {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+       test::KeyStr("foo", 1, kTypeValue)},
+      {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) {
+  AddSnapshot(2, 1);
+  RunTest(
+      {test::KeyStr("foo", 4, kTypeValue),
+       test::KeyStr("foo", 3, kTypeDeletion),
+       test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+      {"v4", "", "v2", "v1"},
+      {test::KeyStr("foo", 4, kTypeValue),
+       test::KeyStr("foo", 3, kTypeDeletion),
+       test::KeyStr("foo", 1, kTypeValue)},
+      {"v4", "", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) {
+  AddSnapshot(2, 1);
+  AddSnapshot(4, 3);
+  auto merge_op = MergeOperators::CreateStringAppendOperator();
+  RunTest(
+      {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+       test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+       test::KeyStr("foo", 1, kTypeValue)},
+      {"v5", "v4", "v3", "v2", "v1"},
+      {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+       test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)},
+      {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       DedupSameSnapshot_SingleDeletion) {
+  AddSnapshot(2, 1);
+  RunTest(
+      {test::KeyStr("foo", 4, kTypeValue),
+       test::KeyStr("foo", 3, kTypeSingleDeletion),
+       test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+      {"v4", "", "v2", "v1"},
+      {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+      {"v4", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) {
+  AddSnapshot(2, 1);
+  RunTest({test::KeyStr("foo", 4, kTypeBlobIndex),
+           test::KeyStr("foo", 3, kTypeBlobIndex),
+           test::KeyStr("foo", 2, kTypeBlobIndex),
+           test::KeyStr("foo", 1, kTypeBlobIndex)},
+          {"v4", "v3", "v2", "v1"},
+          {test::KeyStr("foo", 4, kTypeBlobIndex),
+           test::KeyStr("foo", 3, kTypeBlobIndex),
+           test::KeyStr("foo", 1, kTypeBlobIndex)},
+          {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+// At bottom level, sequence numbers can be zero out, and deletions can be
+// removed, but only when they are visible to earliest snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) {
+  AddSnapshot(2, 1);
+  RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+           test::KeyStr("c", 3, kTypeValue)},
+          {"v1", "v2", "v3"},
+          {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+           test::KeyStr("c", 3, kTypeValue)},
+          {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/,
+          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       NotRemoveDeletionIfNotVisibleToEarliestSnapshot) {
+  AddSnapshot(2, 1);
+  RunTest(
+      {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion),
+       test::KeyStr("c", 3, kTypeDeletion)},
+      {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+      nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+      true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       NotRemoveDeletionIfValuePresentToEarlierSnapshot) {
+  AddSnapshot(2, 1);
+  RunTest({test::KeyStr("a", 4, kTypeDeletion),
+           test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
+          {"", "", ""},
+          {test::KeyStr("a", 4, kTypeDeletion),
+           test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
+          {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) {
+  AddSnapshot(2, 1);
+  RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+           test::KeyStr("b", 2, kTypeSingleDeletion),
+           test::KeyStr("c", 3, kTypeSingleDeletion)},
+          {"", "", ""},
+          {test::KeyStr("b", 2, kTypeSingleDeletion),
+           test::KeyStr("c", 3, kTypeSingleDeletion)},
+          {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+          true /*bottommost_level*/);
+}
+
+// Single delete should not cancel out values that not visible to the
+// same set of snapshots
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       SingleDeleteAcrossSnapshotBoundary) {
+  AddSnapshot(2, 1);
+  RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", "v1"},
+          {test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", "v1"}, 2 /*last_committed_seq*/);
+}
+
+// Single delete should be kept in case it is not visible to the
+// earliest write conflict snapshot. If a single delete is kept for this reason,
+// corresponding value can be trimmed to save space.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       KeepSingleDeletionForWriteConflictChecking) {
+  AddSnapshot(2, 0);
+  RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", "v1"},
+          {test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+          nullptr /*compaction_filter*/, false /*bottommost_level*/,
+          2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Same as above but with a blob index. In addition to the value getting
+// trimmed, the type of the KV is changed to kTypeValue.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       KeepSingleDeletionForWriteConflictChecking_BlobIndex) {
+  AddSnapshot(2, 0);
+  RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeBlobIndex)},
+          {"", "fake_blob_index"},
+          {test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+          nullptr /*compaction_filter*/, false /*bottommost_level*/,
+          2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Same as above but with a wide-column entity. In addition to the value getting
+// trimmed, the type of the KV is changed to kTypeValue.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       KeepSingleDeletionForWriteConflictChecking_WideColumnEntity) {
+  AddSnapshot(2, 0);
+  RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeWideColumnEntity)},
+          {"", "fake_entity"},
+          {test::KeyStr("a", 2, kTypeSingleDeletion),
+           test::KeyStr("a", 1, kTypeValue)},
+          {"", ""}, 2 /* last_committed_seq */, nullptr /* merge_operator */,
+          nullptr /* compaction_filter */, false /* bottommost_level */,
+          2 /* earliest_write_conflict_snapshot */);
+}
+
+// Compaction filter should keep uncommitted key as-is, and
+//   * Convert the latest value to deletion, and/or
+//   * if latest value is a merge, apply filter to all subsequent merges.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
+  std::unique_ptr<CompactionFilter> compaction_filter(
+      new FilterAllKeysCompactionFilter());
+  RunTest(
+      {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue),
+       test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)},
+      {"v2", "v1", "v3", "v4"},
+      {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion),
+       test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)},
+      {"v2", "", "v3", ""}, 1 /*last_committed_seq*/,
+      nullptr /*merge_operator*/, compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) {
+  std::unique_ptr<CompactionFilter> compaction_filter(
+      new FilterAllKeysCompactionFilter());
+  RunTest(
+      {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)},
+      {"", "v1"},
+      {test::KeyStr("a", 2, kTypeDeletion),
+       test::KeyStr("a", 1, kTypeDeletion)},
+      {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/,
+      compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+       CompactionFilter_PartialMerge) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  std::unique_ptr<CompactionFilter> compaction_filter(
+      new FilterAllKeysCompactionFilter());
+  RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+           test::KeyStr("a", 1, kTypeMerge)},
+          {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"},
+          2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  std::unique_ptr<CompactionFilter> compaction_filter(
+      new FilterAllKeysCompactionFilter());
+  RunTest(
+      {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+       test::KeyStr("a", 1, kTypeValue)},
+      {"v3", "v2", "v1"},
+      {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)},
+      {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
+      compaction_filter.get());
+}
+
+// Tests how CompactionIterator work together with AllowIngestBehind.
+class CompactionIteratorWithAllowIngestBehindTest
+    : public CompactionIteratorTest {
+ public:
+  bool AllowIngestBehind() const override { return true; }
+};
+
+// When allow_ingest_behind is set, compaction iterator is not targeting
+// the bottommost level since there is no guarantee there won't be further
+// data ingested under the compaction output in future.
+TEST_P(CompactionIteratorWithAllowIngestBehindTest, NoConvertToPutAtBottom) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+           test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+          {"a4", "a3", "a2", "b1"},
+          {test::KeyStr("a", 4, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+          {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+          merge_op.get(), nullptr /*compaction_filter*/,
+          true /*bottomost_level*/);
+}
+
+TEST_P(CompactionIteratorWithAllowIngestBehindTest,
+       MergeToPutIfEncounteredPutAtBottom) {
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendOperator();
+  RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+           test::KeyStr("a", 2, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+          {"a4", "a3", "a2", "b1"},
+          {test::KeyStr("a", 4, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+          {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+          merge_op.get(), nullptr /*compaction_filter*/,
+          true /*bottomost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance,
+                        CompactionIteratorWithAllowIngestBehindTest,
+                        testing::Values(true, false));
+
+class CompactionIteratorTsGcTest : public CompactionIteratorTest {
+ public:
+  CompactionIteratorTsGcTest()
+      : CompactionIteratorTest(test::BytewiseComparatorWithU64TsWrapper()) {}
+};
+
+TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) {
+  constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue),
+      test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+  const std::vector<std::string> input_values = {"a3", "", "b2"};
+  std::string full_history_ts_low;
+  // All keys' timestamps are newer than or equal to 102, thus none of them
+  // will be eligible for GC.
+  PutFixed64(&full_history_ts_low, 102);
+  const std::vector<std::string>& expected_keys = input_keys;
+  const std::vector<std::string>& expected_values = input_values;
+  const std::vector<std::pair<bool, bool>> params = {
+      {false, false}, {false, true}, {true, true}};
+  for (const std::pair<bool, bool>& param : params) {
+    const bool bottommost_level = param.first;
+    const bool key_not_exists_beyond_output_level = param.second;
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            bottommost_level,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            key_not_exists_beyond_output_level, &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NoMergeEligibleForGc) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(10002, user_key, 102, kTypeMerge),
+      test::KeyStr(10001, user_key, 101, kTypeMerge),
+      test::KeyStr(10000, user_key, 100, kTypeValue)};
+  const std::vector<std::string> input_values = {"2", "1", "a0"};
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendTESTOperator();
+  const std::vector<std::string>& expected_keys = input_keys;
+  const std::vector<std::string>& expected_values = input_values;
+  const std::vector<std::pair<bool, bool>> params = {
+      {false, false}, {false, true}, {true, true}};
+  for (const auto& param : params) {
+    const bool bottommost_level = param.first;
+    const bool key_not_exists_beyond_output_level = param.second;
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+            /*compaction_filter=*/nullptr, bottommost_level,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            key_not_exists_beyond_output_level,
+            /*full_history_ts_low=*/nullptr);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) {
+  constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4,
+                   kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue),
+      test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "a1", "b5"};
+  std::string full_history_ts_low;
+  PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+  {
+    // With a snapshot at seq 3, both the deletion marker and the key at 3 must
+    // be preserved.
+    AddSnapshot(3);
+    const std::vector<std::string> expected_keys = {
+        input_keys[0], input_keys[1], input_keys[3]};
+    const std::vector<std::string> expected_values = {"", "a2", "b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+    ClearSnapshots();
+  }
+  {
+    // No snapshot, the deletion marker should be preserved because the user
+    // key may appear beyond output level.
+    const std::vector<std::string> expected_keys = {input_keys[0],
+                                                    input_keys[3]};
+    const std::vector<std::string> expected_values = {"", "b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+  }
+  {
+    // No snapshot, the deletion marker can be dropped because the user key
+    // does not appear in higher levels.
+    const std::vector<std::string> expected_keys = {input_keys[3]};
+    const std::vector<std::string> expected_values = {"b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SomeMergesOlderThanThreshold) {
+  constexpr char user_key[][2] = {"a", "f"};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+      test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+      test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+      test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+      test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+      test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+      test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+                   kTypeDeletionWithTimestamp)};
+  const std::vector<std::string> input_values = {"25", "19", "18", "16",
+                                                 "19", "17", ""};
+  std::shared_ptr<MergeOperator> merge_op =
+      MergeOperators::CreateStringAppendTESTOperator();
+  std::string full_history_ts_low;
+  PutFixed64(&full_history_ts_low, 20000);
+
+  const std::vector<std::pair<bool, bool>> params = {
+      {false, false}, {false, true}, {true, true}};
+
+  {
+    AddSnapshot(1600);
+    AddSnapshot(1900);
+    const std::vector<std::string> expected_keys = {
+        test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+        test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+        test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+        test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+        test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+        test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+        test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+                     kTypeDeletionWithTimestamp)};
+    const std::vector<std::string> expected_values = {"25", "19", "18", "16",
+                                                      "19", "17", ""};
+    for (const auto& param : params) {
+      const bool bottommost_level = param.first;
+      const bool key_not_exists_beyond_output_level = param.second;
+      auto expected_keys_copy = expected_keys;
+      auto expected_values_copy = expected_values;
+      if (bottommost_level || key_not_exists_beyond_output_level) {
+        // the kTypeDeletionWithTimestamp will be dropped
+        expected_keys_copy.pop_back();
+        expected_values_copy.pop_back();
+        if (bottommost_level) {
+          // seq zero
+          expected_keys_copy[3] =
+              test::KeyStr(/*ts=*/0, user_key[0], /*seq=*/0, kTypeValue);
+        }
+      }
+      RunTest(input_keys, input_values, expected_keys_copy,
+              expected_values_copy,
+              /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+              /*compaction_filter=*/nullptr, bottommost_level,
+              /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+              key_not_exists_beyond_output_level, &full_history_ts_low);
+    }
+    ClearSnapshots();
+  }
+
+  // No snapshots
+  {
+    const std::vector<std::string> expected_keys = {
+        test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeValue),
+        test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeValue)};
+    const std::vector<std::string> expected_values = {"16,18,19,25", "17,19"};
+    for (const auto& param : params) {
+      const bool bottommost_level = param.first;
+      const bool key_not_exists_beyond_output_level = param.second;
+      auto expected_keys_copy = expected_keys;
+      auto expected_values_copy = expected_values;
+      if (bottommost_level) {
+        expected_keys_copy[1] =
+            test::KeyStr(/*ts=*/0, user_key[1], /*seq=*/0, kTypeValue);
+      }
+      RunTest(input_keys, input_values, expected_keys_copy,
+              expected_values_copy,
+              /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+              /*compaction_filter=*/nullptr, bottommost_level,
+              /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+              key_not_exists_beyond_output_level, &full_history_ts_low);
+    }
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue),
+      test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "a1", "a0"};
+  {
+    std::string full_history_ts_low;
+    // Keys whose timestamps larger than or equal to 102 will be preserved.
+    PutFixed64(&full_history_ts_low, 102);
+    const std::vector<std::string> expected_keys = {
+        input_keys[0], input_keys[1], input_keys[2]};
+    const std::vector<std::string> expected_values = {"", input_values[1],
+                                                      input_values[2]};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, DropTombstones) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+  const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]};
+  const std::vector<std::string> expected_values = {"", "a2"};
+
+  // Take a snapshot at seq 2.
+  AddSnapshot(2);
+
+  {
+    // Non-bottommost level, but key does not exist beyond output level.
+    std::string full_history_ts_low;
+    PutFixed64(&full_history_ts_low, 102);
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_sequence=*/kMaxSequenceNumber,
+            /*merge_op=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+  }
+  {
+    // Bottommost level
+    std::string full_history_ts_low;
+    PutFixed64(&full_history_ts_low, 102);
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/true,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, RewriteTs) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp),
+      test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+  const std::vector<std::string> expected_keys = {
+      input_keys[0], input_keys[1], input_keys[2],
+      test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)};
+  const std::vector<std::string> expected_values = {"", "a2", "", "a0"};
+
+  AddSnapshot(1);
+  AddSnapshot(2);
+
+  {
+    // Bottommost level and need to rewrite both ts and seq.
+    std::string full_history_ts_low;
+    PutFixed64(&full_history_ts_low, 102);
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/true,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteNoKeyEligibleForGC) {
+  constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/104, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+      test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/102, user_key[1], /*seq=*/2, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a3", "b2"};
+  std::string full_history_ts_low;
+  // All keys' timestamps are newer than or equal to 102, thus none of them
+  // will be eligible for GC.
+  PutFixed64(&full_history_ts_low, 102);
+  const std::vector<std::string>& expected_keys = input_keys;
+  const std::vector<std::string>& expected_values = input_values;
+  const std::vector<std::pair<bool, bool>> params = {
+      {false, false}, {false, true}, {true, true}};
+  for (const std::pair<bool, bool>& param : params) {
+    const bool bottommost_level = param.first;
+    const bool key_not_exists_beyond_output_level = param.second;
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            bottommost_level,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            key_not_exists_beyond_output_level, &full_history_ts_low);
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteDropTombstones) {
+  constexpr char user_key[] = "a";
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeSingleDeletion),
+      test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeSingleDeletion),
+      test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+  const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]};
+  const std::vector<std::string> expected_values = {"", "a2"};
+
+  // Take a snapshot at seq 2.
+  AddSnapshot(2);
+  {
+    const std::vector<std::pair<bool, bool>> params = {
+        {false, false}, {false, true}, {true, true}};
+    for (const std::pair<bool, bool>& param : params) {
+      const bool bottommost_level = param.first;
+      const bool key_not_exists_beyond_output_level = param.second;
+      std::string full_history_ts_low;
+      PutFixed64(&full_history_ts_low, 102);
+      RunTest(input_keys, input_values, expected_keys, expected_values,
+              /*last_committed_seq=*/kMaxSequenceNumber,
+              /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+              bottommost_level,
+              /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+              key_not_exists_beyond_output_level, &full_history_ts_low);
+    }
+  }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteAllKeysOlderThanThreshold) {
+  constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+  const std::vector<std::string> input_keys = {
+      test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+      test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+      test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+  const std::vector<std::string> input_values = {"", "a2", "b5"};
+  std::string full_history_ts_low;
+  PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+  {
+    // With a snapshot at seq 3, both the deletion marker and the key at 3 must
+    // be preserved.
+    AddSnapshot(3);
+    const std::vector<std::string> expected_keys = {
+        input_keys[0], input_keys[1], input_keys[2]};
+    const std::vector<std::string> expected_values = {"", "a2", "b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+    ClearSnapshots();
+  }
+  {
+    // No snapshot.
+    const std::vector<std::string> expected_keys = {input_keys[2]};
+    const std::vector<std::string> expected_values = {"b5"};
+    RunTest(input_keys, input_values, expected_keys, expected_values,
+            /*last_committed_seq=*/kMaxSequenceNumber,
+            /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+            /*bottommost_level=*/false,
+            /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+            /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance,
+                        CompactionIteratorTsGcTest,
+                        testing::Values(true, false));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc
new file mode 100644
index 000000000..1da1bcda8
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.cc
@@ -0,0 +1,2060 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <optional>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_counting_iterator.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/builder.h"
+#include "db/compaction/clipping_iterator.h"
+#include "db/compaction/compaction_state.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/history_trimming_iterator.h"
+#include "db/log_writer.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/configurable_helper.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetCompactionReasonString(CompactionReason compaction_reason) {
+  switch (compaction_reason) {
+    case CompactionReason::kUnknown:
+      return "Unknown";
+    case CompactionReason::kLevelL0FilesNum:
+      return "LevelL0FilesNum";
+    case CompactionReason::kLevelMaxLevelSize:
+      return "LevelMaxLevelSize";
+    case CompactionReason::kUniversalSizeAmplification:
+      return "UniversalSizeAmplification";
+    case CompactionReason::kUniversalSizeRatio:
+      return "UniversalSizeRatio";
+    case CompactionReason::kUniversalSortedRunNum:
+      return "UniversalSortedRunNum";
+    case CompactionReason::kFIFOMaxSize:
+      return "FIFOMaxSize";
+    case CompactionReason::kFIFOReduceNumFiles:
+      return "FIFOReduceNumFiles";
+    case CompactionReason::kFIFOTtl:
+      return "FIFOTtl";
+    case CompactionReason::kManualCompaction:
+      return "ManualCompaction";
+    case CompactionReason::kFilesMarkedForCompaction:
+      return "FilesMarkedForCompaction";
+    case CompactionReason::kBottommostFiles:
+      return "BottommostFiles";
+    case CompactionReason::kTtl:
+      return "Ttl";
+    case CompactionReason::kFlush:
+      return "Flush";
+    case CompactionReason::kExternalSstIngestion:
+      return "ExternalSstIngestion";
+    case CompactionReason::kPeriodicCompaction:
+      return "PeriodicCompaction";
+    case CompactionReason::kChangeTemperature:
+      return "ChangeTemperature";
+    case CompactionReason::kForcedBlobGC:
+      return "ForcedBlobGC";
+    case CompactionReason::kRoundRobinTtl:
+      return "RoundRobinTtl";
+    case CompactionReason::kNumOfReasons:
+      // fall through
+    default:
+      assert(false);
+      return "Invalid";
+  }
+}
+
+const char* GetCompactionPenultimateOutputRangeTypeString(
+    Compaction::PenultimateOutputRangeType range_type) {
+  switch (range_type) {
+    case Compaction::PenultimateOutputRangeType::kNotSupported:
+      return "NotSupported";
+    case Compaction::PenultimateOutputRangeType::kFullRange:
+      return "FullRange";
+    case Compaction::PenultimateOutputRangeType::kNonLastRange:
+      return "NonLastRange";
+    case Compaction::PenultimateOutputRangeType::kDisabled:
+      return "Disabled";
+    default:
+      assert(false);
+      return "Invalid";
+  }
+}
+
+CompactionJob::CompactionJob(
+    int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+    const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
+    VersionSet* versions, const std::atomic<bool>* shutting_down,
+    LogBuffer* log_buffer, FSDirectory* db_directory,
+    FSDirectory* output_directory, FSDirectory* blob_output_directory,
+    Statistics* stats, InstrumentedMutex* db_mutex,
+    ErrorHandler* db_error_handler,
+    std::vector<SequenceNumber> existing_snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    const SnapshotChecker* snapshot_checker, JobContext* job_context,
+    std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+    bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname,
+    CompactionJobStats* compaction_job_stats, Env::Priority thread_pri,
+    const std::shared_ptr<IOTracer>& io_tracer,
+    const std::atomic<bool>& manual_compaction_canceled,
+    const std::string& db_id, const std::string& db_session_id,
+    std::string full_history_ts_low, std::string trim_ts,
+    BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
+    int* bg_bottom_compaction_scheduled)
+    : compact_(new CompactionState(compaction)),
+      compaction_stats_(compaction->compaction_reason(), 1),
+      db_options_(db_options),
+      mutable_db_options_copy_(mutable_db_options),
+      log_buffer_(log_buffer),
+      output_directory_(output_directory),
+      stats_(stats),
+      bottommost_level_(false),
+      write_hint_(Env::WLTH_NOT_SET),
+      compaction_job_stats_(compaction_job_stats),
+      job_id_(job_id),
+      dbname_(dbname),
+      db_id_(db_id),
+      db_session_id_(db_session_id),
+      file_options_(file_options),
+      env_(db_options.env),
+      io_tracer_(io_tracer),
+      fs_(db_options.fs, io_tracer),
+      file_options_for_read_(
+          fs_->OptimizeForCompactionTableRead(file_options, db_options_)),
+      versions_(versions),
+      shutting_down_(shutting_down),
+      manual_compaction_canceled_(manual_compaction_canceled),
+      db_directory_(db_directory),
+      blob_output_directory_(blob_output_directory),
+      db_mutex_(db_mutex),
+      db_error_handler_(db_error_handler),
+      existing_snapshots_(std::move(existing_snapshots)),
+      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+      snapshot_checker_(snapshot_checker),
+      job_context_(job_context),
+      table_cache_(std::move(table_cache)),
+      event_logger_(event_logger),
+      paranoid_file_checks_(paranoid_file_checks),
+      measure_io_stats_(measure_io_stats),
+      thread_pri_(thread_pri),
+      full_history_ts_low_(std::move(full_history_ts_low)),
+      trim_ts_(std::move(trim_ts)),
+      blob_callback_(blob_callback),
+      extra_num_subcompaction_threads_reserved_(0),
+      bg_compaction_scheduled_(bg_compaction_scheduled),
+      bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
+  assert(compaction_job_stats_ != nullptr);
+  assert(log_buffer_ != nullptr);
+
+  const auto* cfd = compact_->compaction->column_family_data();
+  ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+                                    db_options_.enable_thread_tracking);
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+  ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+  assert(compact_ == nullptr);
+  ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
+  const auto* cfd = compact_->compaction->column_family_data();
+  ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+                                    db_options_.enable_thread_tracking);
+
+  ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+                                               job_id_);
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+      (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+          compact_->compaction->output_level());
+
+  // In the current design, a CompactionJob is always created
+  // for non-trivial compaction.
+  assert(compaction->IsTrivialMove() == false ||
+         compaction->is_manual_compaction() == true);
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_PROP_FLAGS,
+      compaction->is_manual_compaction() +
+          (compaction->deletion_compaction() << 1));
+
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+      compaction->CalculateTotalInputSize());
+
+  IOSTATS_RESET(bytes_written);
+  IOSTATS_RESET(bytes_read);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+  ThreadStatusUtil::SetThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+  // Set the thread operation after operation properties
+  // to ensure GetThreadList() can always show them all together.
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+  compaction_job_stats_->is_manual_compaction =
+      compaction->is_manual_compaction();
+  compaction_job_stats_->is_full_compaction = compaction->is_full_compaction();
+}
+
+void CompactionJob::Prepare() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+  // Generate file_levels_ for compaction before making Iterator
+  auto* c = compact_->compaction;
+  ColumnFamilyData* cfd = c->column_family_data();
+  assert(cfd != nullptr);
+  assert(cfd->current()->storage_info()->NumLevelFiles(
+             compact_->compaction->level()) > 0);
+
+  write_hint_ = cfd->CalculateSSTWriteHint(c->output_level());
+  bottommost_level_ = c->bottommost_level();
+
+  if (c->ShouldFormSubcompactions()) {
+    StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
+    GenSubcompactionBoundaries();
+  }
+  if (boundaries_.size() > 1) {
+    for (size_t i = 0; i <= boundaries_.size(); i++) {
+      compact_->sub_compact_states.emplace_back(
+          c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
+          (i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i])
+                                    : std::nullopt,
+          static_cast<uint32_t>(i));
+      // assert to validate that boundaries don't have same user keys (without
+      // timestamp part).
+      assert(i == 0 || i == boundaries_.size() ||
+             cfd->user_comparator()->CompareWithoutTimestamp(
+                 boundaries_[i - 1], boundaries_[i]) < 0);
+    }
+    RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+                      compact_->sub_compact_states.size());
+  } else {
+    compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt,
+                                              /*sub_job_id*/ 0);
+  }
+
+  // collect all seqno->time information from the input files which will be used
+  // to encode seqno->time to the output files.
+  uint64_t preserve_time_duration =
+      std::max(c->immutable_options()->preserve_internal_time_seconds,
+               c->immutable_options()->preclude_last_level_data_seconds);
+
+  if (preserve_time_duration > 0) {
+    // setup seqno_time_mapping_
+    seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration);
+    for (const auto& each_level : *c->inputs()) {
+      for (const auto& fmd : each_level.files) {
+        std::shared_ptr<const TableProperties> tp;
+        Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr);
+        if (s.ok()) {
+          seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
+              .PermitUncheckedError();
+          seqno_time_mapping_.Add(fmd->fd.smallest_seqno,
+                                  fmd->oldest_ancester_time);
+        }
+      }
+    }
+
+    auto status = seqno_time_mapping_.Sort();
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Invalid sequence number to time mapping: Status: %s",
+                     status.ToString().c_str());
+    }
+    int64_t _current_time = 0;
+    status = db_options_.clock->GetCurrentTime(&_current_time);
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to get current time in compaction: Status: %s",
+                     status.ToString().c_str());
+      // preserve all time information
+      preserve_time_min_seqno_ = 0;
+      preclude_last_level_min_seqno_ = 0;
+    } else {
+      seqno_time_mapping_.TruncateOldEntries(_current_time);
+      uint64_t preserve_time =
+          static_cast<uint64_t>(_current_time) > preserve_time_duration
+              ? _current_time - preserve_time_duration
+              : 0;
+      preserve_time_min_seqno_ =
+          seqno_time_mapping_.GetOldestSequenceNum(preserve_time);
+      if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
+        uint64_t preclude_last_level_time =
+            static_cast<uint64_t>(_current_time) >
+                    c->immutable_options()->preclude_last_level_data_seconds
+                ? _current_time -
+                      c->immutable_options()->preclude_last_level_data_seconds
+                : 0;
+        preclude_last_level_min_seqno_ =
+            seqno_time_mapping_.GetOldestSequenceNum(preclude_last_level_time);
+      }
+    }
+  }
+}
+
+uint64_t CompactionJob::GetSubcompactionsLimit() {
+  return extra_num_subcompaction_threads_reserved_ +
+         std::max(
+             std::uint64_t(1),
+             static_cast<uint64_t>(compact_->compaction->max_subcompactions()));
+}
+
+void CompactionJob::AcquireSubcompactionResources(
+    int num_extra_required_subcompactions) {
+  TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
+  TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
+  int max_db_compactions =
+      DBImpl::GetBGJobLimits(
+          mutable_db_options_copy_.max_background_flushes,
+          mutable_db_options_copy_.max_background_compactions,
+          mutable_db_options_copy_.max_background_jobs,
+          versions_->GetColumnFamilySet()
+              ->write_controller()
+              ->NeedSpeedupCompaction())
+          .max_compactions;
+  InstrumentedMutexLock l(db_mutex_);
+  // Apply min function first since We need to compute the extra subcompaction
+  // against compaction limits. And then try to reserve threads for extra
+  // subcompactions. The actual number of reserved threads could be less than
+  // the desired number.
+  int available_bg_compactions_against_db_limit =
+      std::max(max_db_compactions - *bg_compaction_scheduled_ -
+                   *bg_bottom_compaction_scheduled_,
+               0);
+  // Reservation only supports backgrdoun threads of which the priority is
+  // between BOTTOM and HIGH. Need to degrade the priority to HIGH if the
+  // origin thread_pri_ is higher than that. Similar to ReleaseThreads().
+  extra_num_subcompaction_threads_reserved_ =
+      env_->ReserveThreads(std::min(num_extra_required_subcompactions,
+                                    available_bg_compactions_against_db_limit),
+                           std::min(thread_pri_, Env::Priority::HIGH));
+
+  // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on if this compaction has the bottommost priority
+  if (thread_pri_ == Env::Priority::BOTTOM) {
+    *bg_bottom_compaction_scheduled_ +=
+        extra_num_subcompaction_threads_reserved_;
+  } else {
+    *bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
+  }
+}
+
+void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
+  // Do nothing when we have zero resources to shrink
+  if (num_extra_resources == 0) return;
+  db_mutex_->Lock();
+  // We cannot release threads more than what we reserved before
+  int extra_num_subcompaction_threads_released = env_->ReleaseThreads(
+      (int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH));
+  // Update the number of reserved threads and the number of background
+  // scheduled compactions for this compaction job
+  extra_num_subcompaction_threads_reserved_ -=
+      extra_num_subcompaction_threads_released;
+  // TODO (zichen): design a test case with new subcompaction partitioning
+  // when the number of actual partitions is less than the number of planned
+  // partitions
+  assert(extra_num_subcompaction_threads_released == (int)num_extra_resources);
+  // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on if this compaction has the bottommost priority
+  if (thread_pri_ == Env::Priority::BOTTOM) {
+    *bg_bottom_compaction_scheduled_ -=
+        extra_num_subcompaction_threads_released;
+  } else {
+    *bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released;
+  }
+  db_mutex_->Unlock();
+  TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
+}
+
+void CompactionJob::ReleaseSubcompactionResources() {
+  if (extra_num_subcompaction_threads_reserved_ == 0) {
+    return;
+  }
+  {
+    InstrumentedMutexLock l(db_mutex_);
+    // The number of reserved threads becomes larger than 0 only if the
+    // compaction prioity is round robin and there is no sufficient
+    // sub-compactions available
+
+    // The scheduled compaction must be no less than 1 + extra number
+    // subcompactions using acquired resources since this compaction job has not
+    // finished yet
+    assert(*bg_bottom_compaction_scheduled_ >=
+               1 + extra_num_subcompaction_threads_reserved_ ||
+           *bg_compaction_scheduled_ >=
+               1 + extra_num_subcompaction_threads_reserved_);
+  }
+  ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
+}
+
+struct RangeWithSize {
+  Range range;
+  uint64_t size;
+
+  RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+      : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+  // The goal is to find some boundary keys so that we can evenly partition
+  // the compaction input data into max_subcompactions ranges.
+  // For every input file, we ask TableReader to estimate 128 anchor points
+  // that evenly partition the input file into 128 ranges and the range
+  // sizes. This can be calculated by scanning index blocks of the file.
+  // Once we have the anchor points for all the input files, we merge them
+  // together and try to find keys dividing ranges evenly.
+  // For example, if we have two input files, and each returns following
+  // ranges:
+  //   File1: (a1, 1000), (b1, 1200), (c1, 1100)
+  //   File2: (a2, 1100), (b2, 1000), (c2, 1000)
+  // We total sort the keys to following:
+  //  (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
+  // We calculate the total size by adding up all ranges' size, which is 6400.
+  // If we would like to partition into 2 subcompactions, the target of the
+  // range size is 3200. Based on the size, we take "b1" as the partition key
+  // since the first three ranges would hit 3200.
+  //
+  // Note that the ranges are actually overlapping. For example, in the example
+  // above, the range ending with "b1" is overlapping with the range ending with
+  // "b2". So the size 1000+1100+1200 is an underestimation of data size up to
+  // "b1". In extreme cases where we only compact N L0 files, a range can
+  // overlap with N-1 other ranges. Since we requested a relatively large number
+  // (128) of ranges from each input files, even N range overlapping would
+  // cause relatively small inaccuracy.
+
+  auto* c = compact_->compaction;
+  if (c->max_subcompactions() <= 1 &&
+      !(c->immutable_options()->compaction_pri == kRoundRobin &&
+        c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
+    return;
+  }
+  auto* cfd = c->column_family_data();
+  const Comparator* cfd_comparator = cfd->user_comparator();
+  const InternalKeyComparator& icomp = cfd->internal_comparator();
+
+  auto* v = compact_->compaction->input_version();
+  int base_level = v->storage_info()->base_level();
+  InstrumentedMutexUnlock unlock_guard(db_mutex_);
+
+  uint64_t total_size = 0;
+  std::vector<TableReader::Anchor> all_anchors;
+  int start_lvl = c->start_level();
+  int out_lvl = c->output_level();
+
+  for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+    int lvl = c->level(lvl_idx);
+    if (lvl >= start_lvl && lvl <= out_lvl) {
+      const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+      size_t num_files = flevel->num_files;
+
+      if (num_files == 0) {
+        continue;
+      }
+
+      for (size_t i = 0; i < num_files; i++) {
+        FileMetaData* f = flevel->files[i].file_metadata;
+        std::vector<TableReader::Anchor> my_anchors;
+        Status s = cfd->table_cache()->ApproximateKeyAnchors(
+            ReadOptions(), icomp, *f, my_anchors);
+        if (!s.ok() || my_anchors.empty()) {
+          my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
+        }
+        for (auto& ac : my_anchors) {
+          // Can be optimize to avoid this loop.
+          total_size += ac.range_size;
+        }
+
+        all_anchors.insert(all_anchors.end(), my_anchors.begin(),
+                           my_anchors.end());
+      }
+    }
+  }
+  // Here we total sort all the anchor points across all files and go through
+  // them in the sorted order to find partitioning boundaries.
+  // Not the most efficient implementation. A much more efficient algorithm
+  // probably exists. But they are more complex. If performance turns out to
+  // be a problem, we can optimize.
+  std::sort(
+      all_anchors.begin(), all_anchors.end(),
+      [cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
+        return cfd_comparator->CompareWithoutTimestamp(a.user_key, b.user_key) <
+               0;
+      });
+
+  // Remove duplicated entries from boundaries.
+  all_anchors.erase(
+      std::unique(all_anchors.begin(), all_anchors.end(),
+                  [cfd_comparator](TableReader::Anchor& a,
+                                   TableReader::Anchor& b) -> bool {
+                    return cfd_comparator->CompareWithoutTimestamp(
+                               a.user_key, b.user_key) == 0;
+                  }),
+      all_anchors.end());
+
+  // Get the number of planned subcompactions, may update reserve threads
+  // and update extra_num_subcompaction_threads_reserved_ for round-robin
+  uint64_t num_planned_subcompactions;
+  if (c->immutable_options()->compaction_pri == kRoundRobin &&
+      c->immutable_options()->compaction_style == kCompactionStyleLevel) {
+    // For round-robin compaction prioity, we need to employ more
+    // subcompactions (may exceed the max_subcompaction limit). The extra
+    // subcompactions will be executed using reserved threads and taken into
+    // account bg_compaction_scheduled or bg_bottom_compaction_scheduled.
+
+    // Initialized by the number of input files
+    num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0));
+    uint64_t max_subcompactions_limit = GetSubcompactionsLimit();
+    if (max_subcompactions_limit < num_planned_subcompactions) {
+      // Assert two pointers are not empty so that we can use extra
+      // subcompactions against db compaction limits
+      assert(bg_bottom_compaction_scheduled_ != nullptr);
+      assert(bg_compaction_scheduled_ != nullptr);
+      // Reserve resources when max_subcompaction is not sufficient
+      AcquireSubcompactionResources(
+          (int)(num_planned_subcompactions - max_subcompactions_limit));
+      // Subcompactions limit changes after acquiring additional resources.
+      // Need to call GetSubcompactionsLimit() again to update the number
+      // of planned subcompactions
+      num_planned_subcompactions =
+          std::min(num_planned_subcompactions, GetSubcompactionsLimit());
+    } else {
+      num_planned_subcompactions = max_subcompactions_limit;
+    }
+  } else {
+    num_planned_subcompactions = GetSubcompactionsLimit();
+  }
+
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
+                           &num_planned_subcompactions);
+  if (num_planned_subcompactions == 1) return;
+
+  // Group the ranges into subcompactions
+  uint64_t target_range_size = std::max(
+      total_size / num_planned_subcompactions,
+      MaxFileSizeForLevel(
+          *(c->mutable_cf_options()), out_lvl,
+          c->immutable_options()->compaction_style, base_level,
+          c->immutable_options()->level_compaction_dynamic_level_bytes));
+
+  if (target_range_size >= total_size) {
+    return;
+  }
+
+  uint64_t next_threshold = target_range_size;
+  uint64_t cumulative_size = 0;
+  uint64_t num_actual_subcompactions = 1U;
+  for (TableReader::Anchor& anchor : all_anchors) {
+    cumulative_size += anchor.range_size;
+    if (cumulative_size > next_threshold) {
+      next_threshold += target_range_size;
+      num_actual_subcompactions++;
+      boundaries_.push_back(anchor.user_key);
+    }
+    if (num_actual_subcompactions == num_planned_subcompactions) {
+      break;
+    }
+  }
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1",
+                           &num_actual_subcompactions);
+  // Shrink extra subcompactions resources when extra resrouces are acquired
+  ShrinkSubcompactionResources(
+      std::min((int)(num_planned_subcompactions - num_actual_subcompactions),
+               extra_num_subcompaction_threads_reserved_));
+}
+
+Status CompactionJob::Run() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_RUN);
+  TEST_SYNC_POINT("CompactionJob::Run():Start");
+  log_buffer_->FlushBufferToLog();
+  LogCompaction();
+
+  const size_t num_threads = compact_->sub_compact_states.size();
+  assert(num_threads > 0);
+  const uint64_t start_micros = db_options_.clock->NowMicros();
+
+  // Launch a thread for each of subcompactions 1...num_threads-1
+  std::vector<port::Thread> thread_pool;
+  thread_pool.reserve(num_threads - 1);
+  for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+    thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+                             &compact_->sub_compact_states[i]);
+  }
+
+  // Always schedule the first subcompaction (whether or not there are also
+  // others) in the current thread to be efficient with resources
+  ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
+
+  // Wait for all other threads (if there are any) to finish execution
+  for (auto& thread : thread_pool) {
+    thread.join();
+  }
+
+  compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
+
+  for (auto& state : compact_->sub_compact_states) {
+    compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
+    state.RemoveLastEmptyOutput();
+  }
+
+  RecordTimeToHistogram(stats_, COMPACTION_TIME,
+                        compaction_stats_.stats.micros);
+  RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+                        compaction_stats_.stats.cpu_micros);
+
+  TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+  // Check if any thread encountered an error during execution
+  Status status;
+  IOStatus io_s;
+  bool wrote_new_blob_files = false;
+
+  for (const auto& state : compact_->sub_compact_states) {
+    if (!state.status.ok()) {
+      status = state.status;
+      io_s = state.io_status;
+      break;
+    }
+
+    if (state.Current().HasBlobFileAdditions()) {
+      wrote_new_blob_files = true;
+    }
+  }
+
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+  if (status.ok()) {
+    constexpr IODebugContext* dbg = nullptr;
+
+    if (output_directory_) {
+      io_s = output_directory_->FsyncWithDirOptions(
+          IOOptions(), dbg,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+
+    if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
+        blob_output_directory_ != output_directory_) {
+      io_s = blob_output_directory_->FsyncWithDirOptions(
+          IOOptions(), dbg,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+  }
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+  if (status.ok()) {
+    status = io_s;
+  }
+  if (status.ok()) {
+    thread_pool.clear();
+    std::vector<const CompactionOutputs::Output*> files_output;
+    for (const auto& state : compact_->sub_compact_states) {
+      for (const auto& output : state.GetOutputs()) {
+        files_output.emplace_back(&output);
+      }
+    }
+    ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+    auto& prefix_extractor =
+        compact_->compaction->mutable_cf_options()->prefix_extractor;
+    std::atomic<size_t> next_file_idx(0);
+    auto verify_table = [&](Status& output_status) {
+      while (true) {
+        size_t file_idx = next_file_idx.fetch_add(1);
+        if (file_idx >= files_output.size()) {
+          break;
+        }
+        // Verify that the table is usable
+        // We set for_compaction to false and don't
+        // OptimizeForCompactionTableRead here because this is a special case
+        // after we finish the table building No matter whether
+        // use_direct_io_for_flush_and_compaction is true, we will regard this
+        // verification as user reads since the goal is to cache it here for
+        // further user reads
+        ReadOptions read_options;
+        InternalIterator* iter = cfd->table_cache()->NewIterator(
+            read_options, file_options_, cfd->internal_comparator(),
+            files_output[file_idx]->meta, /*range_del_agg=*/nullptr,
+            prefix_extractor,
+            /*table_reader_ptr=*/nullptr,
+            cfd->internal_stats()->GetFileReadHist(
+                compact_->compaction->output_level()),
+            TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+            /*skip_filters=*/false, compact_->compaction->output_level(),
+            MaxFileSizeForL0MetaPin(
+                *compact_->compaction->mutable_cf_options()),
+            /*smallest_compaction_key=*/nullptr,
+            /*largest_compaction_key=*/nullptr,
+            /*allow_unprepared_value=*/false);
+        auto s = iter->status();
+
+        if (s.ok() && paranoid_file_checks_) {
+          OutputValidator validator(cfd->internal_comparator(),
+                                    /*_enable_order_check=*/true,
+                                    /*_enable_hash=*/true);
+          for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+            s = validator.Add(iter->key(), iter->value());
+            if (!s.ok()) {
+              break;
+            }
+          }
+          if (s.ok()) {
+            s = iter->status();
+          }
+          if (s.ok() &&
+              !validator.CompareValidator(files_output[file_idx]->validator)) {
+            s = Status::Corruption("Paranoid checksums do not match");
+          }
+        }
+
+        delete iter;
+
+        if (!s.ok()) {
+          output_status = s;
+          break;
+        }
+      }
+    };
+    for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+      thread_pool.emplace_back(
+          verify_table, std::ref(compact_->sub_compact_states[i].status));
+    }
+    verify_table(compact_->sub_compact_states[0].status);
+    for (auto& thread : thread_pool) {
+      thread.join();
+    }
+
+    for (const auto& state : compact_->sub_compact_states) {
+      if (!state.status.ok()) {
+        status = state.status;
+        break;
+      }
+    }
+  }
+
+  ReleaseSubcompactionResources();
+  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
+  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
+
+  TablePropertiesCollection tp;
+  for (const auto& state : compact_->sub_compact_states) {
+    for (const auto& output : state.GetOutputs()) {
+      auto fn =
+          TableFileName(state.compaction->immutable_options()->cf_paths,
+                        output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
+      tp[fn] = output.table_properties;
+    }
+  }
+  compact_->compaction->SetOutputTableProperties(std::move(tp));
+
+  // Finish up all book-keeping to unify the subcompaction results
+  compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+  UpdateCompactionStats();
+
+  RecordCompactionIOStats();
+  LogFlush(db_options_.info_log);
+  TEST_SYNC_POINT("CompactionJob::Run():End");
+
+  compact_->status = status;
+  return status;
+}
+
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
+  assert(compact_);
+
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_INSTALL);
+  db_mutex_->AssertHeld();
+  Status status = compact_->status;
+
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+  assert(cfd);
+
+  int output_level = compact_->compaction->output_level();
+  cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_,
+                                            compaction_stats_);
+
+  if (status.ok()) {
+    status = InstallCompactionResults(mutable_cf_options);
+  }
+  if (!versions_->io_status().ok()) {
+    io_status_ = versions_->io_status();
+  }
+
+  VersionStorageInfo::LevelSummaryStorage tmp;
+  auto vstorage = cfd->current()->storage_info();
+  const auto& stats = compaction_stats_.stats;
+
+  double read_write_amp = 0.0;
+  double write_amp = 0.0;
+  double bytes_read_per_sec = 0;
+  double bytes_written_per_sec = 0;
+
+  const uint64_t bytes_read_non_output_and_blob =
+      stats.bytes_read_non_output_levels + stats.bytes_read_blob;
+  const uint64_t bytes_read_all =
+      stats.bytes_read_output_level + bytes_read_non_output_and_blob;
+  const uint64_t bytes_written_all =
+      stats.bytes_written + stats.bytes_written_blob;
+
+  if (bytes_read_non_output_and_blob > 0) {
+    read_write_amp = (bytes_written_all + bytes_read_all) /
+                     static_cast<double>(bytes_read_non_output_and_blob);
+    write_amp =
+        bytes_written_all / static_cast<double>(bytes_read_non_output_and_blob);
+  }
+  if (stats.micros > 0) {
+    bytes_read_per_sec = bytes_read_all / static_cast<double>(stats.micros);
+    bytes_written_per_sec =
+        bytes_written_all / static_cast<double>(stats.micros);
+  }
+
+  const std::string& column_family_name = cfd->GetName();
+
+  constexpr double kMB = 1048576.0;
+
+  ROCKS_LOG_BUFFER(
+      log_buffer_,
+      "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+      "files in(%d, %d) out(%d +%d blob) "
+      "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), "
+      "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64
+      ", records dropped: %" PRIu64 " output_compression: %s\n",
+      column_family_name.c_str(), vstorage->LevelSummary(&tmp),
+      bytes_read_per_sec, bytes_written_per_sec,
+      compact_->compaction->output_level(),
+      stats.num_input_files_in_non_output_levels,
+      stats.num_input_files_in_output_level, stats.num_output_files,
+      stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB,
+      stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB,
+      stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp,
+      write_amp, status.ToString().c_str(), stats.num_input_records,
+      stats.num_dropped_records,
+      CompressionTypeToString(compact_->compaction->output_compression())
+          .c_str());
+
+  const auto& blob_files = vstorage->GetBlobFiles();
+  if (!blob_files.empty()) {
+    assert(blob_files.front());
+    assert(blob_files.back());
+
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+        column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+        blob_files.back()->GetBlobFileNumber());
+  }
+
+  if (compaction_stats_.has_penultimate_level_output) {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] has Penultimate Level output: %" PRIu64
+        ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64,
+        column_family_name.c_str(),
+        compaction_stats_.penultimate_level_stats.bytes_written,
+        compact_->compaction->GetPenultimateLevel(),
+        compaction_stats_.penultimate_level_stats.num_output_files,
+        compaction_stats_.penultimate_level_stats.num_output_records);
+  }
+
+  UpdateCompactionJobStats(stats);
+
+  auto stream = event_logger_->LogToBuffer(log_buffer_, 8192);
+  stream << "job" << job_id_ << "event"
+         << "compaction_finished"
+         << "compaction_time_micros" << stats.micros
+         << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level"
+         << compact_->compaction->output_level() << "num_output_files"
+         << stats.num_output_files << "total_output_size"
+         << stats.bytes_written;
+
+  if (stats.num_output_files_blob > 0) {
+    stream << "num_blob_output_files" << stats.num_output_files_blob
+           << "total_blob_output_size" << stats.bytes_written_blob;
+  }
+
+  stream << "num_input_records" << stats.num_input_records
+         << "num_output_records" << stats.num_output_records
+         << "num_subcompactions" << compact_->sub_compact_states.size()
+         << "output_compression"
+         << CompressionTypeToString(compact_->compaction->output_compression());
+
+  stream << "num_single_delete_mismatches"
+         << compaction_job_stats_->num_single_del_mismatch;
+  stream << "num_single_delete_fallthrough"
+         << compaction_job_stats_->num_single_del_fallthru;
+
+  if (measure_io_stats_) {
+    stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
+    stream << "file_range_sync_nanos"
+           << compaction_job_stats_->file_range_sync_nanos;
+    stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+    stream << "file_prepare_write_nanos"
+           << compaction_job_stats_->file_prepare_write_nanos;
+  }
+
+  stream << "lsm_state";
+  stream.StartArray();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+
+  if (!blob_files.empty()) {
+    assert(blob_files.front());
+    stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber();
+
+    assert(blob_files.back());
+    stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
+  }
+
+  if (compaction_stats_.has_penultimate_level_output) {
+    InternalStats::CompactionStats& pl_stats =
+        compaction_stats_.penultimate_level_stats;
+    stream << "penultimate_level_num_output_files" << pl_stats.num_output_files;
+    stream << "penultimate_level_bytes_written" << pl_stats.bytes_written;
+    stream << "penultimate_level_num_output_records"
+           << pl_stats.num_output_records;
+    stream << "penultimate_level_num_output_files_blob"
+           << pl_stats.num_output_files_blob;
+    stream << "penultimate_level_bytes_written_blob"
+           << pl_stats.bytes_written_blob;
+  }
+
+  CleanupCompaction();
+  return status;
+}
+
+void CompactionJob::NotifyOnSubcompactionBegin(
+    SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+  Compaction* c = compact_->compaction;
+
+  if (db_options_.listeners.empty()) {
+    return;
+  }
+  if (shutting_down_->load(std::memory_order_acquire)) {
+    return;
+  }
+  if (c->is_manual_compaction() &&
+      manual_compaction_canceled_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  sub_compact->notify_on_subcompaction_completion = true;
+
+  SubcompactionJobInfo info{};
+  sub_compact->BuildSubcompactionJobInfo(info);
+  info.job_id = static_cast<int>(job_id_);
+  info.thread_id = env_->GetThreadID();
+
+  for (const auto& listener : db_options_.listeners) {
+    listener->OnSubcompactionBegin(info);
+  }
+  info.status.PermitUncheckedError();
+
+#else
+  (void)sub_compact;
+#endif  // ROCKSDB_LITE
+}
+
+void CompactionJob::NotifyOnSubcompactionCompleted(
+    SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+
+  if (db_options_.listeners.empty()) {
+    return;
+  }
+  if (shutting_down_->load(std::memory_order_acquire)) {
+    return;
+  }
+
+  if (sub_compact->notify_on_subcompaction_completion == false) {
+    return;
+  }
+
+  SubcompactionJobInfo info{};
+  sub_compact->BuildSubcompactionJobInfo(info);
+  info.job_id = static_cast<int>(job_id_);
+  info.thread_id = env_->GetThreadID();
+
+  for (const auto& listener : db_options_.listeners) {
+    listener->OnSubcompactionCompleted(info);
+  }
+#else
+  (void)sub_compact;
+#endif  // ROCKSDB_LITE
+}
+
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+  assert(sub_compact);
+  assert(sub_compact->compaction);
+
+#ifndef ROCKSDB_LITE
+  if (db_options_.compaction_service) {
+    CompactionServiceJobStatus comp_status =
+        ProcessKeyValueCompactionWithCompactionService(sub_compact);
+    if (comp_status == CompactionServiceJobStatus::kSuccess ||
+        comp_status == CompactionServiceJobStatus::kFailure) {
+      return;
+    }
+    // fallback to local compaction
+    assert(comp_status == CompactionServiceJobStatus::kUseLocal);
+  }
+#endif  // !ROCKSDB_LITE
+
+  uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
+
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+
+  // Create compaction filter and fail the compaction if
+  // IgnoreSnapshots() = false because it is not supported anymore
+  const CompactionFilter* compaction_filter =
+      cfd->ioptions()->compaction_filter;
+  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+  if (compaction_filter == nullptr) {
+    compaction_filter_from_factory =
+        sub_compact->compaction->CreateCompactionFilter();
+    compaction_filter = compaction_filter_from_factory.get();
+  }
+  if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
+    sub_compact->status = Status::NotSupported(
+        "CompactionFilter::IgnoreSnapshots() = false is not supported "
+        "anymore.");
+    return;
+  }
+
+  NotifyOnSubcompactionBegin(sub_compact);
+
+  auto range_del_agg = std::make_unique<CompactionRangeDelAggregator>(
+      &cfd->internal_comparator(), existing_snapshots_, &full_history_ts_low_,
+      &trim_ts_);
+
+  // TODO: since we already use C++17, should use
+  // std::optional<const Slice> instead.
+  const std::optional<Slice> start = sub_compact->start;
+  const std::optional<Slice> end = sub_compact->end;
+
+  std::optional<Slice> start_without_ts;
+  std::optional<Slice> end_without_ts;
+
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+  read_options.fill_cache = false;
+  read_options.rate_limiter_priority = GetRateLimiterPriority();
+  // Compaction iterators shouldn't be confined to a single prefix.
+  // Compactions use Seek() for
+  // (a) concurrent compactions,
+  // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
+  read_options.total_order_seek = true;
+
+  // Remove the timestamps from boundaries because boundaries created in
+  // GenSubcompactionBoundaries doesn't strip away the timestamp.
+  size_t ts_sz = cfd->user_comparator()->timestamp_size();
+  if (start.has_value()) {
+    read_options.iterate_lower_bound = &start.value();
+    if (ts_sz > 0) {
+      start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz);
+      read_options.iterate_lower_bound = &start_without_ts.value();
+    }
+  }
+  if (end.has_value()) {
+    read_options.iterate_upper_bound = &end.value();
+    if (ts_sz > 0) {
+      end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz);
+      read_options.iterate_upper_bound = &end_without_ts.value();
+    }
+  }
+
+  // Although the v2 aggregator is what the level iterator(s) know about,
+  // the AddTombstones calls will be propagated down to the v1 aggregator.
+  std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
+      read_options, sub_compact->compaction, range_del_agg.get(),
+      file_options_for_read_, start, end));
+  InternalIterator* input = raw_input.get();
+
+  IterKey start_ikey;
+  IterKey end_ikey;
+  Slice start_slice;
+  Slice end_slice;
+
+  static constexpr char kMaxTs[] =
+      "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+  Slice ts_slice;
+  std::string max_ts;
+  if (ts_sz > 0) {
+    if (ts_sz <= strlen(kMaxTs)) {
+      ts_slice = Slice(kMaxTs, ts_sz);
+    } else {
+      max_ts = std::string(ts_sz, '\xff');
+      ts_slice = Slice(max_ts);
+    }
+  }
+
+  if (start.has_value()) {
+    start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
+                              kValueTypeForSeek);
+    if (ts_sz > 0) {
+      start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+                                   &ts_slice);
+    }
+    start_slice = start_ikey.GetInternalKey();
+  }
+  if (end.has_value()) {
+    end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
+    if (ts_sz > 0) {
+      end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+                                 &ts_slice);
+    }
+    end_slice = end_ikey.GetInternalKey();
+  }
+
+  std::unique_ptr<InternalIterator> clip;
+  if (start.has_value() || end.has_value()) {
+    clip = std::make_unique<ClippingIterator>(
+        raw_input.get(), start.has_value() ? &start_slice : nullptr,
+        end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
+    input = clip.get();
+  }
+
+  std::unique_ptr<InternalIterator> blob_counter;
+
+  if (sub_compact->compaction->DoesInputReferenceBlobFiles()) {
+    BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter();
+    blob_counter = std::make_unique<BlobCountingIterator>(input, meter);
+    input = blob_counter.get();
+  }
+
+  std::unique_ptr<InternalIterator> trim_history_iter;
+  if (ts_sz > 0 && !trim_ts_.empty()) {
+    trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
+        input, cfd->user_comparator(), trim_ts_);
+    input = trim_history_iter.get();
+  }
+
+  input->SeekToFirst();
+
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+  // I/O measurement variables
+  PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+  const uint64_t kRecordStatsEvery = 1000;
+  uint64_t prev_write_nanos = 0;
+  uint64_t prev_fsync_nanos = 0;
+  uint64_t prev_range_sync_nanos = 0;
+  uint64_t prev_prepare_write_nanos = 0;
+  uint64_t prev_cpu_write_nanos = 0;
+  uint64_t prev_cpu_read_nanos = 0;
+  if (measure_io_stats_) {
+    prev_perf_level = GetPerfLevel();
+    SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+    prev_write_nanos = IOSTATS(write_nanos);
+    prev_fsync_nanos = IOSTATS(fsync_nanos);
+    prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+    prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+    prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+    prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+  }
+
+  MergeHelper merge(
+      env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(),
+      compaction_filter, db_options_.info_log.get(),
+      false /* internal key corruption is expected */,
+      existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+      snapshot_checker_, compact_->compaction->level(), db_options_.stats);
+
+  const MutableCFOptions* mutable_cf_options =
+      sub_compact->compaction->mutable_cf_options();
+  assert(mutable_cf_options);
+
+  std::vector<std::string> blob_file_paths;
+
+  // TODO: BlobDB to support output_to_penultimate_level compaction, which needs
+  //  2 builders, so may need to move to `CompactionOutputs`
+  std::unique_ptr<BlobFileBuilder> blob_file_builder(
+      (mutable_cf_options->enable_blob_files &&
+       sub_compact->compaction->output_level() >=
+           mutable_cf_options->blob_file_starting_level)
+          ? new BlobFileBuilder(
+                versions_, fs_.get(),
+                sub_compact->compaction->immutable_options(),
+                mutable_cf_options, &file_options_, db_id_, db_session_id_,
+                job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW,
+                write_hint_, io_tracer_, blob_callback_,
+                BlobFileCreationReason::kCompaction, &blob_file_paths,
+                sub_compact->Current().GetBlobFileAdditionsPtr())
+          : nullptr);
+
+  TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionJob::Run():PausingManualCompaction:1",
+      reinterpret_cast<void*>(
+          const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+
+  const std::string* const full_history_ts_low =
+      full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
+  const SequenceNumber job_snapshot_seq =
+      job_context_ ? job_context_->GetJobSnapshotSequence()
+                   : kMaxSequenceNumber;
+
+  auto c_iter = std::make_unique<CompactionIterator>(
+      input, cfd->user_comparator(), &merge, versions_->LastSequence(),
+      &existing_snapshots_, earliest_write_conflict_snapshot_, job_snapshot_seq,
+      snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_),
+      /*expect_valid_internal_key=*/true, range_del_agg.get(),
+      blob_file_builder.get(), db_options_.allow_data_in_errors,
+      db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
+      sub_compact->compaction, compaction_filter, shutting_down_,
+      db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
+      preclude_last_level_min_seqno_);
+  c_iter->SeekToFirst();
+
+  // Assign range delete aggregator to the target output level, which makes sure
+  // it only output to single level
+  sub_compact->AssignRangeDelAggregator(std::move(range_del_agg));
+
+  const auto& c_iter_stats = c_iter->iter_stats();
+
+  // define the open and close functions for the compaction files, which will be
+  // used open/close output files when needed.
+  const CompactionFileOpenFunc open_file_func =
+      [this, sub_compact](CompactionOutputs& outputs) {
+        return this->OpenCompactionOutputFile(sub_compact, outputs);
+      };
+  const CompactionFileCloseFunc close_file_func =
+      [this, sub_compact](CompactionOutputs& outputs, const Status& status,
+                          const Slice& next_table_min_key) {
+        return this->FinishCompactionOutputFile(status, sub_compact, outputs,
+                                                next_table_min_key);
+      };
+
+  Status status;
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionJob::ProcessKeyValueCompaction()::Processing",
+      reinterpret_cast<void*>(
+          const_cast<Compaction*>(sub_compact->compaction)));
+  while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
+    // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+    // returns true.
+
+    assert(!end.has_value() || cfd->user_comparator()->Compare(
+                                   c_iter->user_key(), end.value()) < 0);
+
+    if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+        kRecordStatsEvery - 1) {
+      RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+      c_iter->ResetRecordCounts();
+      RecordCompactionIOStats();
+    }
+
+    // Add current compaction_iterator key to target compaction output, if the
+    // output file needs to be close or open, it will call the `open_file_func`
+    // and `close_file_func`.
+    // TODO: it would be better to have the compaction file open/close moved
+    // into `CompactionOutputs` which has the output file information.
+    status = sub_compact->AddToOutput(*c_iter, open_file_func, close_file_func);
+    if (!status.ok()) {
+      break;
+    }
+
+    TEST_SYNC_POINT_CALLBACK(
+        "CompactionJob::Run():PausingManualCompaction:2",
+        reinterpret_cast<void*>(
+            const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+    c_iter->Next();
+    if (c_iter->status().IsManualCompactionPaused()) {
+      break;
+    }
+  }
+
+  sub_compact->compaction_job_stats.num_blobs_read =
+      c_iter_stats.num_blobs_read;
+  sub_compact->compaction_job_stats.total_blob_bytes_read =
+      c_iter_stats.total_blob_bytes_read;
+  sub_compact->compaction_job_stats.num_input_deletion_records =
+      c_iter_stats.num_input_deletion_records;
+  sub_compact->compaction_job_stats.num_corrupt_keys =
+      c_iter_stats.num_input_corrupt_records;
+  sub_compact->compaction_job_stats.num_single_del_fallthru =
+      c_iter_stats.num_single_del_fallthru;
+  sub_compact->compaction_job_stats.num_single_del_mismatch =
+      c_iter_stats.num_single_del_mismatch;
+  sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
+      c_iter_stats.total_input_raw_key_bytes;
+  sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
+      c_iter_stats.total_input_raw_value_bytes;
+
+  RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
+             c_iter_stats.total_filter_time);
+
+  if (c_iter_stats.num_blobs_relocated > 0) {
+    RecordTick(stats_, BLOB_DB_GC_NUM_KEYS_RELOCATED,
+               c_iter_stats.num_blobs_relocated);
+  }
+  if (c_iter_stats.total_blob_bytes_relocated > 0) {
+    RecordTick(stats_, BLOB_DB_GC_BYTES_RELOCATED,
+               c_iter_stats.total_blob_bytes_relocated);
+  }
+
+  RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+  RecordCompactionIOStats();
+
+  if (status.ok() && cfd->IsDropped()) {
+    status =
+        Status::ColumnFamilyDropped("Column family dropped during compaction");
+  }
+  if ((status.ok() || status.IsColumnFamilyDropped()) &&
+      shutting_down_->load(std::memory_order_relaxed)) {
+    status = Status::ShutdownInProgress("Database shutdown");
+  }
+  if ((status.ok() || status.IsColumnFamilyDropped()) &&
+      (manual_compaction_canceled_.load(std::memory_order_relaxed))) {
+    status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+  if (status.ok()) {
+    status = input->status();
+  }
+  if (status.ok()) {
+    status = c_iter->status();
+  }
+
+  // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+  // close the output files. Open file function is also passed, in case there's
+  // only range-dels, no file was opened, to save the range-dels, it need to
+  // create a new output file.
+  status = sub_compact->CloseCompactionFiles(status, open_file_func,
+                                             close_file_func);
+
+  if (blob_file_builder) {
+    if (status.ok()) {
+      status = blob_file_builder->Finish();
+    } else {
+      blob_file_builder->Abandon(status);
+    }
+    blob_file_builder.reset();
+    sub_compact->Current().UpdateBlobStats();
+  }
+
+  sub_compact->compaction_job_stats.cpu_micros =
+      db_options_.clock->CPUMicros() - prev_cpu_micros;
+
+  if (measure_io_stats_) {
+    sub_compact->compaction_job_stats.file_write_nanos +=
+        IOSTATS(write_nanos) - prev_write_nanos;
+    sub_compact->compaction_job_stats.file_fsync_nanos +=
+        IOSTATS(fsync_nanos) - prev_fsync_nanos;
+    sub_compact->compaction_job_stats.file_range_sync_nanos +=
+        IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
+    sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+        IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
+    sub_compact->compaction_job_stats.cpu_micros -=
+        (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
+         IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
+        1000;
+    if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+      SetPerfLevel(prev_perf_level);
+    }
+  }
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  if (!status.ok()) {
+    if (c_iter) {
+      c_iter->status().PermitUncheckedError();
+    }
+    if (input) {
+      input->status().PermitUncheckedError();
+    }
+  }
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+
+  blob_counter.reset();
+  clip.reset();
+  raw_input.reset();
+  sub_compact->status = status;
+  NotifyOnSubcompactionCompleted(sub_compact);
+}
+
+uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const {
+  return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id;
+}
+
+void CompactionJob::RecordDroppedKeys(
+    const CompactionIterationStats& c_iter_stats,
+    CompactionJobStats* compaction_job_stats) {
+  if (c_iter_stats.num_record_drop_user > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_USER,
+               c_iter_stats.num_record_drop_user);
+  }
+  if (c_iter_stats.num_record_drop_hidden > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+               c_iter_stats.num_record_drop_hidden);
+    if (compaction_job_stats) {
+      compaction_job_stats->num_records_replaced +=
+          c_iter_stats.num_record_drop_hidden;
+    }
+  }
+  if (c_iter_stats.num_record_drop_obsolete > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
+               c_iter_stats.num_record_drop_obsolete);
+    if (compaction_job_stats) {
+      compaction_job_stats->num_expired_deletion_records +=
+          c_iter_stats.num_record_drop_obsolete;
+    }
+  }
+  if (c_iter_stats.num_record_drop_range_del > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL,
+               c_iter_stats.num_record_drop_range_del);
+  }
+  if (c_iter_stats.num_range_del_drop_obsolete > 0) {
+    RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+               c_iter_stats.num_range_del_drop_obsolete);
+  }
+  if (c_iter_stats.num_optimized_del_drop_obsolete > 0) {
+    RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+               c_iter_stats.num_optimized_del_drop_obsolete);
+  }
+}
+
+Status CompactionJob::FinishCompactionOutputFile(
+    const Status& input_status, SubcompactionState* sub_compact,
+    CompactionOutputs& outputs, const Slice& next_table_min_key) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
+  assert(sub_compact != nullptr);
+  assert(outputs.HasBuilder());
+
+  FileMetaData* meta = outputs.GetMetaData();
+  uint64_t output_number = meta->fd.GetNumber();
+  assert(output_number != 0);
+
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+  std::string file_checksum = kUnknownFileChecksum;
+  std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+  // Check for iterator errors
+  Status s = input_status;
+
+  // Add range tombstones
+  auto earliest_snapshot = kMaxSequenceNumber;
+  if (existing_snapshots_.size() > 0) {
+    earliest_snapshot = existing_snapshots_[0];
+  }
+  if (s.ok()) {
+    CompactionIterationStats range_del_out_stats;
+    // if the compaction supports per_key_placement, only output range dels to
+    // the penultimate level.
+    // Note: Use `bottommost_level_ = true` for both bottommost and
+    // output_to_penultimate_level compaction here, as it's only used to decide
+    // if range dels could be dropped.
+    if (outputs.HasRangeDel()) {
+      s = outputs.AddRangeDels(
+          sub_compact->start.has_value() ? &(sub_compact->start.value())
+                                         : nullptr,
+          sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
+          range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
+          earliest_snapshot, next_table_min_key, full_history_ts_low_);
+    }
+    RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
+    TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
+  }
+
+  const uint64_t current_entries = outputs.NumEntries();
+
+  s = outputs.Finish(s, seqno_time_mapping_);
+
+  if (s.ok()) {
+    // With accurate smallest and largest key, we can get a slightly more
+    // accurate oldest ancester time.
+    // This makes oldest ancester time in manifest more accurate than in
+    // table properties. Not sure how to resolve it.
+    if (meta->smallest.size() > 0 && meta->largest.size() > 0) {
+      uint64_t refined_oldest_ancester_time;
+      Slice new_smallest = meta->smallest.user_key();
+      Slice new_largest = meta->largest.user_key();
+      if (!new_largest.empty() && !new_smallest.empty()) {
+        refined_oldest_ancester_time =
+            sub_compact->compaction->MinInputFileOldestAncesterTime(
+                &(meta->smallest), &(meta->largest));
+        if (refined_oldest_ancester_time !=
+            std::numeric_limits<uint64_t>::max()) {
+          meta->oldest_ancester_time = refined_oldest_ancester_time;
+        }
+      }
+    }
+  }
+
+  // Finish and check for file errors
+  IOStatus io_s = outputs.WriterSyncClose(s, db_options_.clock, stats_,
+                                          db_options_.use_fsync);
+
+  if (s.ok() && io_s.ok()) {
+    file_checksum = meta->file_checksum;
+    file_checksum_func_name = meta->file_checksum_func_name;
+  }
+
+  if (s.ok()) {
+    s = io_s;
+  }
+  if (sub_compact->io_status.ok()) {
+    sub_compact->io_status = io_s;
+    // Since this error is really a copy of the
+    // "normal" status, it does not also need to be checked
+    sub_compact->io_status.PermitUncheckedError();
+  }
+
+  TableProperties tp;
+  if (s.ok()) {
+    tp = outputs.GetTableProperties();
+  }
+
+  if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+    // If there is nothing to output, no necessary to generate a sst file.
+    // This happens when the output level is bottom level, at the same time
+    // the sub_compact output nothing.
+    std::string fname =
+        TableFileName(sub_compact->compaction->immutable_options()->cf_paths,
+                      meta->fd.GetNumber(), meta->fd.GetPathId());
+
+    // TODO(AR) it is not clear if there are any larger implications if
+    // DeleteFile fails here
+    Status ds = env_->DeleteFile(fname);
+    if (!ds.ok()) {
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64
+          " at bottom level%s",
+          cfd->GetName().c_str(), job_id_, output_number,
+          meta->marked_for_compaction ? " (need compaction)" : "");
+    }
+
+    // Also need to remove the file from outputs, or it will be added to the
+    // VersionEdit.
+    outputs.RemoveLastOutput();
+    meta = nullptr;
+  }
+
+  if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
+    // Output to event logger and fire events.
+    outputs.UpdateTableProperties();
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+                   " keys, %" PRIu64 " bytes%s, temperature: %s",
+                   cfd->GetName().c_str(), job_id_, output_number,
+                   current_entries, meta->fd.file_size,
+                   meta->marked_for_compaction ? " (need compaction)" : "",
+                   temperature_to_string[meta->temperature].c_str());
+  }
+  std::string fname;
+  FileDescriptor output_fd;
+  uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+  Status status_for_listener = s;
+  if (meta != nullptr) {
+    fname = GetTableFileName(meta->fd.GetNumber());
+    output_fd = meta->fd;
+    oldest_blob_file_number = meta->oldest_blob_file_number;
+  } else {
+    fname = "(nil)";
+    if (s.ok()) {
+      status_for_listener = Status::Aborted("Empty SST file not kept");
+    }
+  }
+  EventHelpers::LogAndNotifyTableFileCreationFinished(
+      event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
+      job_id_, output_fd, oldest_blob_file_number, tp,
+      TableFileCreationReason::kCompaction, status_for_listener, file_checksum,
+      file_checksum_func_name);
+
+#ifndef ROCKSDB_LITE
+  // Report new file to SstFileManagerImpl
+  auto sfm =
+      static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+  if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
+    Status add_s = sfm->OnAddFile(fname);
+    if (!add_s.ok() && s.ok()) {
+      s = add_s;
+    }
+    if (sfm->IsMaxAllowedSpaceReached()) {
+      // TODO(ajkr): should we return OK() if max space was reached by the final
+      // compaction output file (similarly to how flush works when full)?
+      s = Status::SpaceLimit("Max allowed space was reached");
+      TEST_SYNC_POINT(
+          "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached");
+      InstrumentedMutexLock l(db_mutex_);
+      db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
+    }
+  }
+#endif
+
+  outputs.ResetBuilder();
+  return s;
+}
+
+Status CompactionJob::InstallCompactionResults(
+    const MutableCFOptions& mutable_cf_options) {
+  assert(compact_);
+
+  db_mutex_->AssertHeld();
+
+  auto* compaction = compact_->compaction;
+  assert(compaction);
+
+  {
+    Compaction::InputLevelSummaryBuffer inputs_summary;
+    if (compaction_stats_.has_penultimate_level_output) {
+      ROCKS_LOG_BUFFER(
+          log_buffer_,
+          "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64
+          " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes",
+          compaction->column_family_data()->GetName().c_str(), job_id_,
+          compaction->InputLevelSummary(&inputs_summary),
+          compaction_stats_.penultimate_level_stats.bytes_written,
+          compaction_stats_.stats.bytes_written,
+          compaction_stats_.TotalBytesWritten());
+    } else {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
+                       compaction->column_family_data()->GetName().c_str(),
+                       job_id_, compaction->InputLevelSummary(&inputs_summary),
+                       compaction_stats_.TotalBytesWritten());
+    }
+  }
+
+  VersionEdit* const edit = compaction->edit();
+  assert(edit);
+
+  // Add compaction inputs
+  compaction->AddInputDeletions(edit);
+
+  std::unordered_map<uint64_t, BlobGarbageMeter::BlobStats> blob_total_garbage;
+
+  for (const auto& sub_compact : compact_->sub_compact_states) {
+    sub_compact.AddOutputsEdit(edit);
+
+    for (const auto& blob : sub_compact.Current().GetBlobFileAdditions()) {
+      edit->AddBlobFile(blob);
+    }
+
+    if (sub_compact.Current().GetBlobGarbageMeter()) {
+      const auto& flows = sub_compact.Current().GetBlobGarbageMeter()->flows();
+
+      for (const auto& pair : flows) {
+        const uint64_t blob_file_number = pair.first;
+        const BlobGarbageMeter::BlobInOutFlow& flow = pair.second;
+
+        assert(flow.IsValid());
+        if (flow.HasGarbage()) {
+          blob_total_garbage[blob_file_number].Add(flow.GetGarbageCount(),
+                                                   flow.GetGarbageBytes());
+        }
+      }
+    }
+  }
+
+  for (const auto& pair : blob_total_garbage) {
+    const uint64_t blob_file_number = pair.first;
+    const BlobGarbageMeter::BlobStats& stats = pair.second;
+
+    edit->AddBlobFileGarbage(blob_file_number, stats.GetCount(),
+                             stats.GetBytes());
+  }
+
+  if ((compaction->compaction_reason() ==
+           CompactionReason::kLevelMaxLevelSize ||
+       compaction->compaction_reason() == CompactionReason::kRoundRobinTtl) &&
+      compaction->immutable_options()->compaction_pri == kRoundRobin) {
+    int start_level = compaction->start_level();
+    if (start_level > 0) {
+      auto vstorage = compaction->input_version()->storage_info();
+      edit->AddCompactCursor(start_level,
+                             vstorage->GetNextCompactCursor(
+                                 start_level, compaction->num_input_files(0)));
+    }
+  }
+
+  return versions_->LogAndApply(compaction->column_family_data(),
+                                mutable_cf_options, edit, db_mutex_,
+                                db_directory_);
+}
+
+void CompactionJob::RecordCompactionIOStats() {
+  RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
+  RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
+  CompactionReason compaction_reason =
+      compact_->compaction->compaction_reason();
+  if (compaction_reason == CompactionReason::kFilesMarkedForCompaction) {
+    RecordTick(stats_, COMPACT_READ_BYTES_MARKED, IOSTATS(bytes_read));
+    RecordTick(stats_, COMPACT_WRITE_BYTES_MARKED, IOSTATS(bytes_written));
+  } else if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+    RecordTick(stats_, COMPACT_READ_BYTES_PERIODIC, IOSTATS(bytes_read));
+    RecordTick(stats_, COMPACT_WRITE_BYTES_PERIODIC, IOSTATS(bytes_written));
+  } else if (compaction_reason == CompactionReason::kTtl) {
+    RecordTick(stats_, COMPACT_READ_BYTES_TTL, IOSTATS(bytes_read));
+    RecordTick(stats_, COMPACT_WRITE_BYTES_TTL, IOSTATS(bytes_written));
+  }
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
+  IOSTATS_RESET(bytes_read);
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
+  IOSTATS_RESET(bytes_written);
+}
+
+Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
+                                               CompactionOutputs& outputs) {
+  assert(sub_compact != nullptr);
+
+  // no need to lock because VersionSet::next_file_number_ is atomic
+  uint64_t file_number = versions_->NewFileNumber();
+  std::string fname = GetTableFileName(file_number);
+  // Fire events.
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+#ifndef ROCKSDB_LITE
+  EventHelpers::NotifyTableFileCreationStarted(
+      cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
+      TableFileCreationReason::kCompaction);
+#endif  // !ROCKSDB_LITE
+  // Make the output file
+  std::unique_ptr<FSWritableFile> writable_file;
+#ifndef NDEBUG
+  bool syncpoint_arg = file_options_.use_direct_writes;
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
+                           &syncpoint_arg);
+#endif
+
+  // Pass temperature of the last level files to FileSystem.
+  FileOptions fo_copy = file_options_;
+  Temperature temperature = sub_compact->compaction->output_temperature();
+  // only set for the last level compaction and also it's not output to
+  // penultimate level (when preclude_last_level feature is enabled)
+  if (temperature == Temperature::kUnknown &&
+      sub_compact->compaction->is_last_level() &&
+      !sub_compact->IsCurrentPenultimateLevel()) {
+    temperature =
+        sub_compact->compaction->mutable_cf_options()->last_level_temperature;
+  }
+  fo_copy.temperature = temperature;
+
+  Status s;
+  IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy);
+  s = io_s;
+  if (sub_compact->io_status.ok()) {
+    sub_compact->io_status = io_s;
+    // Since this error is really a copy of the io_s that is checked below as s,
+    // it does not also need to be checked.
+    sub_compact->io_status.PermitUncheckedError();
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
+        " fails at NewWritableFile with status %s",
+        sub_compact->compaction->column_family_data()->GetName().c_str(),
+        job_id_, file_number, s.ToString().c_str());
+    LogFlush(db_options_.info_log);
+    EventHelpers::LogAndNotifyTableFileCreationFinished(
+        event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
+        fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber,
+        TableProperties(), TableFileCreationReason::kCompaction, s,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+    return s;
+  }
+
+  // Try to figure out the output file's oldest ancester time.
+  int64_t temp_current_time = 0;
+  auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time);
+  // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+  if (!get_time_status.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to get current time. Status: %s",
+                   get_time_status.ToString().c_str());
+  }
+  uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+  InternalKey tmp_start, tmp_end;
+  if (sub_compact->start.has_value()) {
+    tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
+  }
+  if (sub_compact->end.has_value()) {
+    tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
+  }
+  uint64_t oldest_ancester_time =
+      sub_compact->compaction->MinInputFileOldestAncesterTime(
+          sub_compact->start.has_value() ? &tmp_start : nullptr,
+          sub_compact->end.has_value() ? &tmp_end : nullptr);
+  if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
+    oldest_ancester_time = current_time;
+  }
+
+  // Initialize a SubcompactionState::Output and add it to sub_compact->outputs
+  {
+    FileMetaData meta;
+    meta.fd = FileDescriptor(file_number,
+                             sub_compact->compaction->output_path_id(), 0);
+    meta.oldest_ancester_time = oldest_ancester_time;
+    meta.file_creation_time = current_time;
+    meta.temperature = temperature;
+    assert(!db_id_.empty());
+    assert(!db_session_id_.empty());
+    s = GetSstInternalUniqueId(db_id_, db_session_id_, meta.fd.GetNumber(),
+                               &meta.unique_id);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "[%s] [JOB %d] file #%" PRIu64
+                      " failed to generate unique id: %s.",
+                      cfd->GetName().c_str(), job_id_, meta.fd.GetNumber(),
+                      s.ToString().c_str());
+      return s;
+    }
+
+    outputs.AddOutput(std::move(meta), cfd->internal_comparator(),
+                      sub_compact->compaction->mutable_cf_options()
+                          ->check_flush_compaction_key_order,
+                      paranoid_file_checks_);
+  }
+
+  writable_file->SetIOPriority(GetRateLimiterPriority());
+  writable_file->SetWriteLifeTimeHint(write_hint_);
+  FileTypeSet tmp_set = db_options_.checksum_handoff_file_types;
+  writable_file->SetPreallocationBlockSize(static_cast<size_t>(
+      sub_compact->compaction->OutputFilePreallocationSize()));
+  const auto& listeners =
+      sub_compact->compaction->immutable_options()->listeners;
+  outputs.AssignFileWriter(new WritableFileWriter(
+      std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_,
+      db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(),
+      tmp_set.Contains(FileType::kTableFile), false));
+
+  TableBuilderOptions tboptions(
+      *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()),
+      cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+      sub_compact->compaction->output_compression(),
+      sub_compact->compaction->output_compression_opts(), cfd->GetID(),
+      cfd->GetName(), sub_compact->compaction->output_level(),
+      bottommost_level_, TableFileCreationReason::kCompaction,
+      0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
+      sub_compact->compaction->max_output_file_size(), file_number);
+
+  outputs.NewBuilder(tboptions);
+
+  LogFlush(db_options_.info_log);
+  return s;
+}
+
+void CompactionJob::CleanupCompaction() {
+  for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+    sub_compact.Cleanup(table_cache_.get());
+  }
+  delete compact_;
+  compact_ = nullptr;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+  assert(prefix_length > 0);
+  size_t length = src.size() > prefix_length ? prefix_length : src.size();
+  dst->assign(src.data(), length);
+}
+}  // namespace
+
+#endif  // !ROCKSDB_LITE
+
+void CompactionJob::UpdateCompactionStats() {
+  assert(compact_);
+
+  Compaction* compaction = compact_->compaction;
+  compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
+  compaction_stats_.stats.num_input_files_in_output_level = 0;
+  for (int input_level = 0;
+       input_level < static_cast<int>(compaction->num_input_levels());
+       ++input_level) {
+    if (compaction->level(input_level) != compaction->output_level()) {
+      UpdateCompactionInputStatsHelper(
+          &compaction_stats_.stats.num_input_files_in_non_output_levels,
+          &compaction_stats_.stats.bytes_read_non_output_levels, input_level);
+    } else {
+      UpdateCompactionInputStatsHelper(
+          &compaction_stats_.stats.num_input_files_in_output_level,
+          &compaction_stats_.stats.bytes_read_output_level, input_level);
+    }
+  }
+
+  assert(compaction_job_stats_);
+  compaction_stats_.stats.bytes_read_blob =
+      compaction_job_stats_->total_blob_bytes_read;
+
+  compaction_stats_.stats.num_dropped_records =
+      compaction_stats_.DroppedRecords();
+}
+
+void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
+                                                     uint64_t* bytes_read,
+                                                     int input_level) {
+  const Compaction* compaction = compact_->compaction;
+  auto num_input_files = compaction->num_input_files(input_level);
+  *num_files += static_cast<int>(num_input_files);
+
+  for (size_t i = 0; i < num_input_files; ++i) {
+    const auto* file_meta = compaction->input(input_level, i);
+    *bytes_read += file_meta->fd.GetFileSize();
+    compaction_stats_.stats.num_input_records +=
+        static_cast<uint64_t>(file_meta->num_entries);
+  }
+}
+
+void CompactionJob::UpdateCompactionJobStats(
+    const InternalStats::CompactionStats& stats) const {
+#ifndef ROCKSDB_LITE
+  compaction_job_stats_->elapsed_micros = stats.micros;
+
+  // input information
+  compaction_job_stats_->total_input_bytes =
+      stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+  compaction_job_stats_->num_input_records = stats.num_input_records;
+  compaction_job_stats_->num_input_files =
+      stats.num_input_files_in_non_output_levels +
+      stats.num_input_files_in_output_level;
+  compaction_job_stats_->num_input_files_at_output_level =
+      stats.num_input_files_in_output_level;
+
+  // output information
+  compaction_job_stats_->total_output_bytes = stats.bytes_written;
+  compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
+  compaction_job_stats_->num_output_records = stats.num_output_records;
+  compaction_job_stats_->num_output_files = stats.num_output_files;
+  compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
+
+  if (stats.num_output_files > 0) {
+    CopyPrefix(compact_->SmallestUserKey(),
+               CompactionJobStats::kMaxPrefixLength,
+               &compaction_job_stats_->smallest_output_key_prefix);
+    CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
+               &compaction_job_stats_->largest_output_key_prefix);
+  }
+#else
+  (void)stats;
+#endif  // !ROCKSDB_LITE
+}
+
+void CompactionJob::LogCompaction() {
+  Compaction* compaction = compact_->compaction;
+  ColumnFamilyData* cfd = compaction->column_family_data();
+
+  // Let's check if anything will get logged. Don't prepare all the info if
+  // we're not logging
+  if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+    Compaction::InputLevelSummaryBuffer inputs_summary;
+    ROCKS_LOG_INFO(
+        db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
+        cfd->GetName().c_str(), job_id_,
+        compaction->InputLevelSummary(&inputs_summary), compaction->score());
+    char scratch[2345];
+    compaction->Summary(scratch, sizeof(scratch));
+    ROCKS_LOG_INFO(db_options_.info_log, "[%s]: Compaction start summary: %s\n",
+                   cfd->GetName().c_str(), scratch);
+    // build event logger report
+    auto stream = event_logger_->Log();
+    stream << "job" << job_id_ << "event"
+           << "compaction_started"
+           << "compaction_reason"
+           << GetCompactionReasonString(compaction->compaction_reason());
+    for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+      stream << ("files_L" + std::to_string(compaction->level(i)));
+      stream.StartArray();
+      for (auto f : *compaction->inputs(i)) {
+        stream << f->fd.GetNumber();
+      }
+      stream.EndArray();
+    }
+    stream << "score" << compaction->score() << "input_data_size"
+           << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno"
+           << (existing_snapshots_.empty()
+                   ? int64_t{-1}  // Use -1 for "none"
+                   : static_cast<int64_t>(existing_snapshots_[0]));
+    if (compaction->SupportsPerKeyPlacement()) {
+      stream << "preclude_last_level_min_seqno"
+             << preclude_last_level_min_seqno_;
+      stream << "penultimate_output_level" << compaction->GetPenultimateLevel();
+      stream << "penultimate_output_range"
+             << GetCompactionPenultimateOutputRangeTypeString(
+                    compaction->GetPenultimateOutputRangeType());
+
+      if (compaction->GetPenultimateOutputRangeType() ==
+          Compaction::PenultimateOutputRangeType::kDisabled) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "[%s] [JOB %d] Penultimate level output is disabled, likely "
+            "because of the range conflict in the penultimate level",
+            cfd->GetName().c_str(), job_id_);
+      }
+    }
+  }
+}
+
+std::string CompactionJob::GetTableFileName(uint64_t file_number) {
+  return TableFileName(compact_->compaction->immutable_options()->cf_paths,
+                       file_number, compact_->compaction->output_path_id());
+}
+
+Env::IOPriority CompactionJob::GetRateLimiterPriority() {
+  if (versions_ && versions_->GetColumnFamilySet() &&
+      versions_->GetColumnFamilySet()->write_controller()) {
+    WriteController* write_controller =
+        versions_->GetColumnFamilySet()->write_controller();
+    if (write_controller->NeedsDelay() || write_controller->IsStopped()) {
+      return Env::IO_USER;
+    }
+  }
+
+  return Env::IO_LOW;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h
new file mode 100644
index 000000000..bfbce1011
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.h
@@ -0,0 +1,500 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/memtable_list.h"
+#include "db/range_del_aggregator.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class CompactionState;
+class ErrorHandler;
+class MemTable;
+class SnapshotChecker;
+class SystemClock;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class SubcompactionState;
+
+// CompactionJob is responsible for executing the compaction. Each (manual or
+// automated) compaction corresponds to a CompactionJob object, and usually
+// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
+// will divide the compaction into subcompactions and execute them in parallel
+// if needed.
+//
+// CompactionJob has 2 main stats:
+// 1. CompactionJobStats compaction_job_stats_
+//    CompactionJobStats is a public data structure which is part of Compaction
+//    event listener that rocksdb share the job stats with the user.
+//    Internally it's an aggregation of all the compaction_job_stats from each
+//    `SubcompactionState`:
+//                                           +------------------------+
+//                                           | SubcompactionState     |
+//                                           |                        |
+//                                +--------->|   compaction_job_stats |
+//                                |          |                        |
+//                                |          +------------------------+
+// +------------------------+     |
+// | CompactionJob          |     |          +------------------------+
+// |                        |     |          | SubcompactionState     |
+// |   compaction_job_stats +-----+          |                        |
+// |                        |     +--------->|   compaction_job_stats |
+// |                        |     |          |                        |
+// +------------------------+     |          +------------------------+
+//                                |
+//                                |          +------------------------+
+//                                |          | SubcompactionState     |
+//                                |          |                        |
+//                                +--------->+   compaction_job_stats |
+//                                |          |                        |
+//                                |          +------------------------+
+//                                |
+//                                |          +------------------------+
+//                                |          |       ...              |
+//                                +--------->+                        |
+//                                           +------------------------+
+//
+// 2. CompactionStatsFull compaction_stats_
+//    `CompactionStatsFull` is an internal stats about the compaction, which
+//    is eventually sent to `ColumnFamilyData::internal_stats_` and used for
+//    logging and public metrics.
+//    Internally, it's an aggregation of stats_ from each `SubcompactionState`.
+//    It has 2 parts, normal stats about the main compaction information and
+//    the penultimate level output stats.
+//    `SubcompactionState` maintains the CompactionOutputs for normal output and
+//    the penultimate level output if exists, the per_level stats is
+//    stored with the outputs.
+//                                                +---------------------------+
+//                                                | SubcompactionState        |
+//                                                |                           |
+//                                                | +----------------------+  |
+//                                                | | CompactionOutputs    |  |
+//                                                | | (normal output)      |  |
+//                                            +---->|   stats_             |  |
+//                                            |   | +----------------------+  |
+//                                            |   |                           |
+//                                            |   | +----------------------+  |
+// +--------------------------------+         |   | | CompactionOutputs    |  |
+// | CompactionJob                  |         |   | | (penultimate_level)  |  |
+// |                                |    +--------->|   stats_             |  |
+// |   compaction_stats_            |    |    |   | +----------------------+  |
+// |    +-------------------------+ |    |    |   |                           |
+// |    |stats (normal)           |------|----+   +---------------------------+
+// |    +-------------------------+ |    |    |
+// |                                |    |    |
+// |    +-------------------------+ |    |    |   +---------------------------+
+// |    |penultimate_level_stats  +------+    |   | SubcompactionState        |
+// |    +-------------------------+ |    |    |   |                           |
+// |                                |    |    |   | +----------------------+  |
+// |                                |    |    |   | | CompactionOutputs    |  |
+// +--------------------------------+    |    |   | | (normal output)      |  |
+//                                       |    +---->|   stats_             |  |
+//                                       |        | +----------------------+  |
+//                                       |        |                           |
+//                                       |        | +----------------------+  |
+//                                       |        | | CompactionOutputs    |  |
+//                                       |        | | (penultimate_level)  |  |
+//                                       +--------->|   stats_             |  |
+//                                                | +----------------------+  |
+//                                                |                           |
+//                                                +---------------------------+
+
+class CompactionJob {
+ public:
+  CompactionJob(
+      int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+      const MutableDBOptions& mutable_db_options,
+      const FileOptions& file_options, VersionSet* versions,
+      const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+      FSDirectory* db_directory, FSDirectory* output_directory,
+      FSDirectory* blob_output_directory, Statistics* stats,
+      InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+      std::vector<SequenceNumber> existing_snapshots,
+      SequenceNumber earliest_write_conflict_snapshot,
+      const SnapshotChecker* snapshot_checker, JobContext* job_context,
+      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+      bool paranoid_file_checks, bool measure_io_stats,
+      const std::string& dbname, CompactionJobStats* compaction_job_stats,
+      Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+      const std::atomic<bool>& manual_compaction_canceled,
+      const std::string& db_id = "", const std::string& db_session_id = "",
+      std::string full_history_ts_low = "", std::string trim_ts = "",
+      BlobFileCompletionCallback* blob_callback = nullptr,
+      int* bg_compaction_scheduled = nullptr,
+      int* bg_bottom_compaction_scheduled = nullptr);
+
+  virtual ~CompactionJob();
+
+  // no copy/move
+  CompactionJob(CompactionJob&& job) = delete;
+  CompactionJob(const CompactionJob& job) = delete;
+  CompactionJob& operator=(const CompactionJob& job) = delete;
+
+  // REQUIRED: mutex held
+  // Prepare for the compaction by setting up boundaries for each subcompaction
+  void Prepare();
+  // REQUIRED mutex not held
+  // Launch threads for each subcompaction and wait for them to finish. After
+  // that, verify table is usable and finally do bookkeeping to unify
+  // subcompaction results
+  Status Run();
+
+  // REQUIRED: mutex held
+  // Add compaction input/output to the current version
+  Status Install(const MutableCFOptions& mutable_cf_options);
+
+  // Return the IO status
+  IOStatus io_status() const { return io_status_; }
+
+ protected:
+  void UpdateCompactionStats();
+  void LogCompaction();
+  virtual void RecordCompactionIOStats();
+  void CleanupCompaction();
+
+  // Call compaction filter. Then iterate through input and compact the
+  // kv-pairs
+  void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
+
+  CompactionState* compact_;
+  InternalStats::CompactionStatsFull compaction_stats_;
+  const ImmutableDBOptions& db_options_;
+  const MutableDBOptions mutable_db_options_copy_;
+  LogBuffer* log_buffer_;
+  FSDirectory* output_directory_;
+  Statistics* stats_;
+  // Is this compaction creating a file in the bottom most level?
+  bool bottommost_level_;
+
+  Env::WriteLifeTimeHint write_hint_;
+
+  IOStatus io_status_;
+
+  CompactionJobStats* compaction_job_stats_;
+
+ private:
+  friend class CompactionJobTestBase;
+
+  // Generates a histogram representing potential divisions of key ranges from
+  // the input. It adds the starting and/or ending keys of certain input files
+  // to the working set and then finds the approximate size of data in between
+  // each consecutive pair of slices. Then it divides these ranges into
+  // consecutive groups such that each group has a similar size.
+  void GenSubcompactionBoundaries();
+
+  // Get the number of planned subcompactions based on max_subcompactions and
+  // extra reserved resources
+  uint64_t GetSubcompactionsLimit();
+
+  // Additional reserved threads are reserved and the number is stored in
+  // extra_num_subcompaction_threads_reserved__. For now, this happens only if
+  // the compaction priority is round-robin and max_subcompactions is not
+  // sufficient (extra resources may be needed)
+  void AcquireSubcompactionResources(int num_extra_required_subcompactions);
+
+  // Additional threads may be reserved during IncreaseSubcompactionResources()
+  // if num_actual_subcompactions is less than num_planned_subcompactions.
+  // Additional threads will be released and the bg_compaction_scheduled_ or
+  // bg_bottom_compaction_scheduled_ will be updated if they are used.
+  // DB Mutex lock is required.
+  void ShrinkSubcompactionResources(uint64_t num_extra_resources);
+
+  // Release all reserved threads and update the compaction limits.
+  void ReleaseSubcompactionResources();
+
+  CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
+      SubcompactionState* sub_compact);
+
+  // update the thread status for starting a compaction.
+  void ReportStartedCompaction(Compaction* compaction);
+
+  Status FinishCompactionOutputFile(const Status& input_status,
+                                    SubcompactionState* sub_compact,
+                                    CompactionOutputs& outputs,
+                                    const Slice& next_table_min_key);
+  Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
+  Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
+                                  CompactionOutputs& outputs);
+  void UpdateCompactionJobStats(
+      const InternalStats::CompactionStats& stats) const;
+  void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
+                         CompactionJobStats* compaction_job_stats = nullptr);
+
+  void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read,
+                                        int input_level);
+
+  void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
+
+  void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
+
+  uint32_t job_id_;
+
+  // DBImpl state
+  const std::string& dbname_;
+  const std::string db_id_;
+  const std::string db_session_id_;
+  const FileOptions file_options_;
+
+  Env* env_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  FileSystemPtr fs_;
+  // env_option optimized for compaction table reads
+  FileOptions file_options_for_read_;
+  VersionSet* versions_;
+  const std::atomic<bool>* shutting_down_;
+  const std::atomic<bool>& manual_compaction_canceled_;
+  FSDirectory* db_directory_;
+  FSDirectory* blob_output_directory_;
+  InstrumentedMutex* db_mutex_;
+  ErrorHandler* db_error_handler_;
+  // If there were two snapshots with seq numbers s1 and
+  // s2 and s1 < s2, and if we find two instances of a key k1 then lies
+  // entirely within s1 and s2, then the earlier version of k1 can be safely
+  // deleted because that version is not visible in any snapshot.
+  std::vector<SequenceNumber> existing_snapshots_;
+
+  // This is the earliest snapshot that could be used for write-conflict
+  // checking by a transaction.  For any user-key newer than this snapshot, we
+  // should make sure not to remove evidence that a write occurred.
+  SequenceNumber earliest_write_conflict_snapshot_;
+
+  const SnapshotChecker* const snapshot_checker_;
+
+  JobContext* job_context_;
+
+  std::shared_ptr<Cache> table_cache_;
+
+  EventLogger* event_logger_;
+
+  bool paranoid_file_checks_;
+  bool measure_io_stats_;
+  // Stores the Slices that designate the boundaries for each subcompaction
+  std::vector<std::string> boundaries_;
+  Env::Priority thread_pri_;
+  std::string full_history_ts_low_;
+  std::string trim_ts_;
+  BlobFileCompletionCallback* blob_callback_;
+
+  uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
+  // Stores the number of reserved threads in shared env_ for the number of
+  // extra subcompaction in kRoundRobin compaction priority
+  int extra_num_subcompaction_threads_reserved_;
+
+  // Stores the pointer to bg_compaction_scheduled_,
+  // bg_bottom_compaction_scheduled_ in DBImpl. Mutex is required when accessing
+  // or updating it.
+  int* bg_compaction_scheduled_;
+  int* bg_bottom_compaction_scheduled_;
+
+  // Stores the sequence number to time mapping gathered from all input files
+  // it also collects the smallest_seqno -> oldest_ancester_time from the SST.
+  SeqnoToTimeMapping seqno_time_mapping_;
+
+  // Minimal sequence number for preserving the time information. The time info
+  // older than this sequence number won't be preserved after the compaction and
+  // if it's bottommost compaction, the seq num will be zeroed out.
+  SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+  // Minimal sequence number to preclude the data from the last level. If the
+  // key has bigger (newer) sequence number than this, it will be precluded from
+  // the last level (output to penultimate level).
+  SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+  // Get table file name in where it's outputting to, which should also be in
+  // `output_directory_`.
+  virtual std::string GetTableFileName(uint64_t file_number);
+  // The rate limiter priority (io_priority) is determined dynamically here.
+  // The Compaction Read and Write priorities are the same for different
+  // scenarios, such as write stalled.
+  Env::IOPriority GetRateLimiterPriority();
+};
+
+// CompactionServiceInput is used the pass compaction information between two
+// db instances. It contains the information needed to do a compaction. It
+// doesn't contain the LSM tree information, which is passed though MANIFEST
+// file.
+struct CompactionServiceInput {
+  ColumnFamilyDescriptor column_family;
+
+  DBOptions db_options;
+
+  std::vector<SequenceNumber> snapshots;
+
+  // SST files for compaction, it should already be expended to include all the
+  // files needed for this compaction, for both input level files and output
+  // level files.
+  std::vector<std::string> input_files;
+  int output_level;
+
+  // db_id is used to generate unique id of sst on the remote compactor
+  std::string db_id;
+
+  // information for subcompaction
+  bool has_begin = false;
+  std::string begin;
+  bool has_end = false;
+  std::string end;
+
+  // serialization interface to read and write the object
+  static Status Read(const std::string& data_str, CompactionServiceInput* obj);
+  Status Write(std::string* output);
+
+  // Initialize a dummy ColumnFamilyDescriptor
+  CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {}
+
+#ifndef NDEBUG
+  bool TEST_Equals(CompactionServiceInput* other);
+  bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch);
+#endif  // NDEBUG
+};
+
+// CompactionServiceOutputFile is the metadata for the output SST file
+struct CompactionServiceOutputFile {
+  std::string file_name;
+  SequenceNumber smallest_seqno;
+  SequenceNumber largest_seqno;
+  std::string smallest_internal_key;
+  std::string largest_internal_key;
+  uint64_t oldest_ancester_time;
+  uint64_t file_creation_time;
+  uint64_t paranoid_hash;
+  bool marked_for_compaction;
+  UniqueId64x2 unique_id;
+
+  CompactionServiceOutputFile() = default;
+  CompactionServiceOutputFile(
+      const std::string& name, SequenceNumber smallest, SequenceNumber largest,
+      std::string _smallest_internal_key, std::string _largest_internal_key,
+      uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+      uint64_t _paranoid_hash, bool _marked_for_compaction,
+      UniqueId64x2 _unique_id)
+      : file_name(name),
+        smallest_seqno(smallest),
+        largest_seqno(largest),
+        smallest_internal_key(std::move(_smallest_internal_key)),
+        largest_internal_key(std::move(_largest_internal_key)),
+        oldest_ancester_time(_oldest_ancester_time),
+        file_creation_time(_file_creation_time),
+        paranoid_hash(_paranoid_hash),
+        marked_for_compaction(_marked_for_compaction),
+        unique_id(std::move(_unique_id)) {}
+};
+
+// CompactionServiceResult contains the compaction result from a different db
+// instance, with these information, the primary db instance with write
+// permission is able to install the result to the DB.
+struct CompactionServiceResult {
+  Status status;
+  std::vector<CompactionServiceOutputFile> output_files;
+  int output_level;
+
+  // location of the output files
+  std::string output_path;
+
+  // some statistics about the compaction
+  uint64_t num_output_records = 0;
+  uint64_t total_bytes = 0;
+  uint64_t bytes_read = 0;
+  uint64_t bytes_written = 0;
+  CompactionJobStats stats;
+
+  // serialization interface to read and write the object
+  static Status Read(const std::string& data_str, CompactionServiceResult* obj);
+  Status Write(std::string* output);
+
+#ifndef NDEBUG
+  bool TEST_Equals(CompactionServiceResult* other);
+  bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch);
+#endif  // NDEBUG
+};
+
+// CompactionServiceCompactionJob is an read-only compaction job, it takes
+// input information from `compaction_service_input` and put result information
+// in `compaction_service_result`, the SST files are generated to `output_path`.
+class CompactionServiceCompactionJob : private CompactionJob {
+ public:
+  CompactionServiceCompactionJob(
+      int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+      const MutableDBOptions& mutable_db_options,
+      const FileOptions& file_options, VersionSet* versions,
+      const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+      FSDirectory* output_directory, Statistics* stats,
+      InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+      std::vector<SequenceNumber> existing_snapshots,
+      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+      const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+      const std::atomic<bool>& manual_compaction_canceled,
+      const std::string& db_id, const std::string& db_session_id,
+      std::string output_path,
+      const CompactionServiceInput& compaction_service_input,
+      CompactionServiceResult* compaction_service_result);
+
+  // Run the compaction in current thread and return the result
+  Status Run();
+
+  void CleanupCompaction();
+
+  IOStatus io_status() const { return CompactionJob::io_status(); }
+
+ protected:
+  void RecordCompactionIOStats() override;
+
+ private:
+  // Get table file name in output_path
+  std::string GetTableFileName(uint64_t file_number) override;
+  // Specific the compaction output path, otherwise it uses default DB path
+  const std::string output_path_;
+
+  // Compaction job input
+  const CompactionServiceInput& compaction_input_;
+
+  // Compaction job result
+  CompactionServiceResult* compaction_result_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
new file mode 100644
index 000000000..930270778
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
@@ -0,0 +1,975 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#if !defined(IOS_CROSS_COMPILE)
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random* rnd, int len, double ratio) {
+  std::string r;
+  test::CompressibleString(rnd, ratio, len, &r);
+  return r;
+}
+
+std::string Key(uint64_t key, int length) {
+  const int kBufSize = 1000;
+  char buf[kBufSize];
+  if (length > kBufSize) {
+    length = kBufSize;
+  }
+  snprintf(buf, kBufSize, "%0*" PRIu64, length, key);
+  return std::string(buf);
+}
+
+class CompactionJobStatsTest : public testing::Test,
+                               public testing::WithParamInterface<bool> {
+ public:
+  std::string dbname_;
+  std::string alternative_wal_dir_;
+  Env* env_;
+  DB* db_;
+  std::vector<ColumnFamilyHandle*> handles_;
+  uint32_t max_subcompactions_;
+
+  Options last_options_;
+
+  CompactionJobStatsTest() : env_(Env::Default()) {
+    env_->SetBackgroundThreads(1, Env::LOW);
+    env_->SetBackgroundThreads(1, Env::HIGH);
+    dbname_ = test::PerThreadDBPath("compaction_job_stats_test");
+    alternative_wal_dir_ = dbname_ + "/wal";
+    Options options;
+    options.create_if_missing = true;
+    max_subcompactions_ = GetParam();
+    options.max_subcompactions = max_subcompactions_;
+    auto delete_options = options;
+    delete_options.wal_dir = alternative_wal_dir_;
+    EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy it for not alternative WAL dir is used.
+    EXPECT_OK(DestroyDB(dbname_, options));
+    db_ = nullptr;
+    Reopen(options);
+  }
+
+  ~CompactionJobStatsTest() override {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    Close();
+    Options options;
+    options.db_paths.emplace_back(dbname_, 0);
+    options.db_paths.emplace_back(dbname_ + "_2", 0);
+    options.db_paths.emplace_back(dbname_ + "_3", 0);
+    options.db_paths.emplace_back(dbname_ + "_4", 0);
+    EXPECT_OK(DestroyDB(dbname_, options));
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const Options& options) {
+    ColumnFamilyOptions cf_opts(options);
+    size_t cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+    }
+  }
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options& options) {
+    CreateColumnFamilies(cfs, options);
+    std::vector<std::string> cfs_plus_default = cfs;
+    cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+    ReopenWithColumnFamilies(cfs_plus_default, options);
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const std::vector<Options>& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const std::vector<Options>& options) {
+    Close();
+    EXPECT_EQ(cfs.size(), options.size());
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+    }
+    DBOptions db_opts = DBOptions(options[0]);
+    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options& options) {
+    Close();
+    std::vector<Options> v_opts(cfs.size(), options);
+    return TryReopenWithColumnFamilies(cfs, v_opts);
+  }
+
+  void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(const Options& options) {
+    // Destroy using last options
+    Destroy(last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(const Options& options) {
+    Close();
+    ASSERT_OK(DestroyDB(dbname_, options));
+  }
+
+  Status ReadOnlyReopen(const Options& options) {
+    return DB::OpenForReadOnly(options, dbname_, &db_);
+  }
+
+  Status TryReopen(const Options& options) {
+    Close();
+    last_options_ = options;
+    return DB::Open(options, dbname_, &db_);
+  }
+
+  Status Flush(int cf = 0) {
+    if (cf == 0) {
+      return db_->Flush(FlushOptions());
+    } else {
+      return db_->Flush(FlushOptions(), handles_[cf]);
+    }
+  }
+
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, k, v);
+  }
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+
+  Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+  Status Delete(int cf, const std::string& k) {
+    return db_->Delete(WriteOptions(), handles_[cf], k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  std::string Get(int cf, const std::string& k,
+                  const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level, int cf = 0) {
+    std::string property;
+    if (cf == 0) {
+      // default cfd
+      EXPECT_TRUE(db_->GetProperty(
+          "rocksdb.num-files-at-level" + std::to_string(level), &property));
+    } else {
+      EXPECT_TRUE(db_->GetProperty(
+          handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
+          &property));
+    }
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel(int cf = 0) {
+    int num_levels =
+        (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+    std::string result;
+    size_t last_non_zero_offset = 0;
+    for (int level = 0; level < num_levels; level++) {
+      int f = NumTableFilesAtLevel(level, cf);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  Status Size(uint64_t* size, const Slice& start, const Slice& limit,
+              int cf = 0) {
+    Range r(start, limit);
+    if (cf == 0) {
+      return db_->GetApproximateSizes(&r, 1, size);
+    } else {
+      return db_->GetApproximateSizes(handles_[1], &r, 1, size);
+    }
+  }
+
+  void Compact(int cf, const Slice& start, const Slice& limit,
+               uint32_t target_path_id) {
+    CompactRangeOptions compact_options;
+    compact_options.target_path_id = target_path_id;
+    ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+  }
+
+  void Compact(int cf, const Slice& start, const Slice& limit) {
+    ASSERT_OK(
+        db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+  }
+
+  void Compact(const Slice& start, const Slice& limit) {
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+  }
+
+  void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) {
+    ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf],
+                                          true /* disallow trivial move */));
+  }
+
+  // Do n memtable compactions, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int n, const std::string& small, const std::string& large,
+                  int cf = 0) {
+    for (int i = 0; i < n; i++) {
+      ASSERT_OK(Put(cf, small, "begin"));
+      ASSERT_OK(Put(cf, large, "end"));
+      ASSERT_OK(Flush(cf));
+    }
+  }
+
+  static void SetDeletionCompactionStats(CompactionJobStats* stats,
+                                         uint64_t input_deletions,
+                                         uint64_t expired_deletions,
+                                         uint64_t records_replaced) {
+    stats->num_input_deletion_records = input_deletions;
+    stats->num_expired_deletion_records = expired_deletions;
+    stats->num_records_replaced = records_replaced;
+  }
+
+  void MakeTableWithKeyValues(Random* rnd, uint64_t smallest, uint64_t largest,
+                              int key_size, int value_size, uint64_t interval,
+                              double ratio, int cf = 0) {
+    for (auto key = smallest; key < largest; key += interval) {
+      ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+                    Slice(RandomString(rnd, value_size, ratio))));
+    }
+    ASSERT_OK(Flush(cf));
+  }
+
+  // This function behaves with the implicit understanding that two
+  // rounds of keys are inserted into the database, as per the behavior
+  // of the DeletionStatsTest.
+  void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+                             uint64_t interval, int deletion_interval,
+                             int key_size, uint64_t cutoff_key_num,
+                             CompactionJobStats* stats, int cf = 0) {
+    // interval needs to be >= 2 so that deletion entries can be inserted
+    // that are intended to not result in an actual key deletion by using
+    // an offset of 1 from another existing key
+    ASSERT_GE(interval, 2);
+
+    uint64_t ctr = 1;
+    uint32_t deletions_made = 0;
+    uint32_t num_deleted = 0;
+    uint32_t num_expired = 0;
+    for (auto key = smallest; key <= largest; key += interval, ctr++) {
+      if (ctr % deletion_interval == 0) {
+        ASSERT_OK(Delete(cf, Key(key, key_size)));
+        deletions_made++;
+        num_deleted++;
+
+        if (key > cutoff_key_num) {
+          num_expired++;
+        }
+      }
+    }
+
+    // Insert some deletions for keys that don't exist that
+    // are both in and out of the key range
+    ASSERT_OK(Delete(cf, Key(smallest + 1, key_size)));
+    deletions_made++;
+
+    ASSERT_OK(Delete(cf, Key(smallest - 1, key_size)));
+    deletions_made++;
+    num_expired++;
+
+    ASSERT_OK(Delete(cf, Key(smallest - 9, key_size)));
+    deletions_made++;
+    num_expired++;
+
+    ASSERT_OK(Flush(cf));
+    SetDeletionCompactionStats(stats, deletions_made, num_expired, num_deleted);
+  }
+};
+
+// An EventListener which helps verify the compaction results in
+// test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+  CompactionJobStatsChecker()
+      : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+  size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+  void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+  // Once a compaction completed, this function will verify the returned
+  // CompactionJobInfo with the oldest CompactionJobInfo added earlier
+  // in "expected_stats_" which has not yet being used for verification.
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    if (verify_next_comp_io_stats_) {
+      ASSERT_GT(ci.stats.file_write_nanos, 0);
+      ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+      ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+      ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+      verify_next_comp_io_stats_ = false;
+    }
+
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (expected_stats_.size()) {
+      Verify(ci.stats, expected_stats_.front());
+      expected_stats_.pop();
+    }
+  }
+
+  // A helper function which verifies whether two CompactionJobStats
+  // match.  The verification of all compaction stats are done by
+  // ASSERT_EQ except for the total input / output bytes, which we
+  // use ASSERT_GE and ASSERT_LE with a reasonable bias ---
+  // 10% in uncompressed case and 20% when compression is used.
+  virtual void Verify(const CompactionJobStats& current_stats,
+                      const CompactionJobStats& stats) {
+    // time
+    ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+    ASSERT_EQ(current_stats.num_input_records, stats.num_input_records);
+    ASSERT_EQ(current_stats.num_input_files, stats.num_input_files);
+    ASSERT_EQ(current_stats.num_input_files_at_output_level,
+              stats.num_input_files_at_output_level);
+
+    ASSERT_EQ(current_stats.num_output_records, stats.num_output_records);
+    ASSERT_EQ(current_stats.num_output_files, stats.num_output_files);
+
+    ASSERT_EQ(current_stats.is_full_compaction, stats.is_full_compaction);
+    ASSERT_EQ(current_stats.is_manual_compaction, stats.is_manual_compaction);
+
+    // file size
+    double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+    ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+              stats.total_input_bytes);
+    ASSERT_LE(current_stats.total_input_bytes,
+              stats.total_input_bytes * (1.00 + kFileSizeBias));
+    ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+              stats.total_output_bytes);
+    ASSERT_LE(current_stats.total_output_bytes,
+              stats.total_output_bytes * (1.00 + kFileSizeBias));
+    ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+              stats.total_input_raw_key_bytes);
+    ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+              stats.total_input_raw_value_bytes);
+
+    ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced);
+
+    ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys);
+
+    ASSERT_EQ(std::string(current_stats.smallest_output_key_prefix),
+              std::string(stats.smallest_output_key_prefix));
+    ASSERT_EQ(std::string(current_stats.largest_output_key_prefix),
+              std::string(stats.largest_output_key_prefix));
+  }
+
+  // Add an expected compaction stats, which will be used to
+  // verify the CompactionJobStats returned by the OnCompactionCompleted()
+  // callback.
+  void AddExpectedStats(const CompactionJobStats& stats) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    expected_stats_.push(stats);
+  }
+
+  void EnableCompression(bool flag) { compression_enabled_ = flag; }
+
+  bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+  std::mutex mutex_;
+  std::queue<CompactionJobStats> expected_stats_;
+  bool compression_enabled_;
+  bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+  // Verifies whether two CompactionJobStats match.
+  void Verify(const CompactionJobStats& current_stats,
+              const CompactionJobStats& stats) override {
+    ASSERT_EQ(current_stats.num_input_deletion_records,
+              stats.num_input_deletion_records);
+    ASSERT_EQ(current_stats.num_expired_deletion_records,
+              stats.num_expired_deletion_records);
+    ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced);
+
+    ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys);
+  }
+};
+
+namespace {
+
+uint64_t EstimatedFileSize(uint64_t num_records, size_t key_size,
+                           size_t value_size, double compression_ratio = 1.0,
+                           size_t block_size = 4096,
+                           int bloom_bits_per_key = 10) {
+  const size_t kPerKeyOverhead = 8;
+  const size_t kFooterSize = 512;
+
+  uint64_t data_size = static_cast<uint64_t>(
+      num_records *
+      (key_size + value_size * compression_ratio + kPerKeyOverhead));
+
+  return data_size + kFooterSize +
+         num_records * bloom_bits_per_key / 8        // filter block
+         + data_size * (key_size + 8) / block_size;  // index block
+}
+
+namespace {
+
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+  assert(prefix_length > 0);
+  size_t length = src.size() > prefix_length ? prefix_length : src.size();
+  dst->assign(src.data(), length);
+}
+
+}  // namespace
+
+CompactionJobStats NewManualCompactionJobStats(
+    const std::string& smallest_key, const std::string& largest_key,
+    size_t num_input_files, size_t num_input_files_at_output_level,
+    uint64_t num_input_records, size_t key_size, size_t value_size,
+    size_t num_output_files, uint64_t num_output_records,
+    double compression_ratio, uint64_t num_records_replaced,
+    bool is_full = false, bool is_manual = true) {
+  CompactionJobStats stats;
+  stats.Reset();
+
+  stats.num_input_records = num_input_records;
+  stats.num_input_files = num_input_files;
+  stats.num_input_files_at_output_level = num_input_files_at_output_level;
+
+  stats.num_output_records = num_output_records;
+  stats.num_output_files = num_output_files;
+
+  stats.total_input_bytes =
+      EstimatedFileSize(num_input_records / num_input_files, key_size,
+                        value_size, compression_ratio) *
+      num_input_files;
+  stats.total_output_bytes =
+      EstimatedFileSize(num_output_records / num_output_files, key_size,
+                        value_size, compression_ratio) *
+      num_output_files;
+  stats.total_input_raw_key_bytes = num_input_records * (key_size + 8);
+  stats.total_input_raw_value_bytes = num_input_records * value_size;
+
+  stats.is_full_compaction = is_full;
+  stats.is_manual_compaction = is_manual;
+
+  stats.num_records_replaced = num_records_replaced;
+
+  CopyPrefix(smallest_key, CompactionJobStats::kMaxPrefixLength,
+             &stats.smallest_output_key_prefix);
+  CopyPrefix(largest_key, CompactionJobStats::kMaxPrefixLength,
+             &stats.largest_output_key_prefix);
+
+  return stats;
+}
+
+CompressionType GetAnyCompression() {
+  if (Snappy_Supported()) {
+    return kSnappyCompression;
+  } else if (Zlib_Supported()) {
+    return kZlibCompression;
+  } else if (BZip2_Supported()) {
+    return kBZip2Compression;
+  } else if (LZ4_Supported()) {
+    return kLZ4Compression;
+  } else if (XPRESS_Supported()) {
+    return kXpressCompression;
+  }
+
+  return kNoCompression;
+}
+
+}  // namespace
+
+TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
+  Random rnd(301);
+  const int kBufSize = 100;
+  char buf[kBufSize];
+  uint64_t key_base = 100000000l;
+  // Note: key_base must be multiple of num_keys_per_L0_file
+  int num_keys_per_L0_file = 100;
+  const int kTestScale = 8;
+  const int kKeySize = 10;
+  const int kValueSize = 1000;
+  const double kCompressionRatio = 0.5;
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_L0_file;
+
+  // Whenever a compaction completes, this listener will try to
+  // verify whether the returned CompactionJobStats matches
+  // what we expect.  The expected CompactionJobStats is added
+  // via AddExpectedStats().
+  auto* stats_checker = new CompactionJobStatsChecker();
+  Options options;
+  options.listeners.emplace_back(stats_checker);
+  options.create_if_missing = true;
+  // just enough setting to hold off auto-compaction.
+  options.level0_file_num_compaction_trigger = kTestScale + 1;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.max_subcompactions = max_subcompactions_;
+  options.bytes_per_sync = 512 * 1024;
+
+  options.report_bg_io_stats = true;
+  for (int test = 0; test < 2; ++test) {
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // 1st Phase: generate "num_L0_files" L0 files.
+    int num_L0_files = 0;
+    for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+         start_key += key_base) {
+      MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1,
+                             kKeySize, kValueSize, key_interval,
+                             compression_ratio, 1);
+      snprintf(buf, kBufSize, "%d", ++num_L0_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+    ASSERT_EQ(std::to_string(num_L0_files), FilesPerLevel(1));
+
+    // 2nd Phase: perform L0 -> L1 compaction.
+    int L0_compaction_count = 6;
+    int count = 1;
+    std::string smallest_key;
+    std::string largest_key;
+    for (uint64_t start_key = key_base;
+         start_key <= key_base * L0_compaction_count;
+         start_key += key_base, count++) {
+      smallest_key = Key(start_key, 10);
+      largest_key = Key(start_key + key_base - key_interval, 10);
+      stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+          smallest_key, largest_key, 1, 0, num_keys_per_L0_file, kKeySize,
+          kValueSize, 1, num_keys_per_L0_file, compression_ratio, 0));
+      ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+      TEST_Compact(0, 1, smallest_key, largest_key);
+      snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // compact two files into one in the last L0 -> L1 compaction
+    int num_remaining_L0 = num_L0_files - L0_compaction_count;
+    smallest_key = Key(key_base * (L0_compaction_count + 1), 10);
+    largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+    stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+        smallest_key, largest_key, num_remaining_L0, 0,
+        num_keys_per_L0_file * num_remaining_L0, kKeySize, kValueSize, 1,
+        num_keys_per_L0_file * num_remaining_L0, compression_ratio, 0));
+    ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+    TEST_Compact(0, 1, smallest_key, largest_key);
+
+    int num_L1_files = num_L0_files - num_remaining_L0 + 1;
+    num_L0_files = 0;
+    snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files);
+    ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+
+    // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys)
+    int sparseness = 2;
+    for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+         start_key += key_base * sparseness) {
+      MakeTableWithKeyValues(
+          &rnd, start_key, start_key + key_base * sparseness - 1, kKeySize,
+          kValueSize, key_base * sparseness / num_keys_per_L0_file,
+          compression_ratio, 1);
+      snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // 4th Phase: perform L0 -> L1 compaction again, expect higher write amp
+    // When subcompactions are enabled, the number of output files increases
+    // by 1 because multiple threads are consuming the input and generating
+    // output files without coordinating to see if the output could fit into
+    // a smaller number of files like it does when it runs sequentially
+    int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+    for (uint64_t start_key = key_base; num_L0_files > 1;
+         start_key += key_base * sparseness) {
+      smallest_key = Key(start_key, 10);
+      largest_key = Key(start_key + key_base * sparseness - key_interval, 10);
+      stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+          smallest_key, largest_key, 3, 2, num_keys_per_L0_file * 3, kKeySize,
+          kValueSize, num_output_files,
+          num_keys_per_L0_file * 2,  // 1/3 of the data will be updated.
+          compression_ratio, num_keys_per_L0_file));
+      ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+      Compact(1, smallest_key, largest_key);
+      if (options.max_subcompactions == 1) {
+        --num_L1_files;
+      }
+      snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // 5th Phase: Do a full compaction, which involves in two sub-compactions.
+    // Here we expect to have 1 L0 files and 4 L1 files
+    // In the first sub-compaction, we expect L0 compaction.
+    smallest_key = Key(key_base, 10);
+    largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+    stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+        Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key, 2, 1,
+        num_keys_per_L0_file * 3, kKeySize, kValueSize, 1,
+        num_keys_per_L0_file * 2, compression_ratio, num_keys_per_L0_file));
+    ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+    Compact(1, smallest_key, largest_key);
+
+    num_L1_files = options.max_subcompactions > 1 ? 7 : 4;
+    char L1_buf[4];
+    snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files);
+    std::string L1_files(L1_buf);
+    ASSERT_EQ(L1_files, FilesPerLevel(1));
+    options.compression = GetAnyCompression();
+    if (options.compression == kNoCompression) {
+      break;
+    }
+    stats_checker->EnableCompression(true);
+    compression_ratio = kCompressionRatio;
+
+    for (int i = 0; i < 5; i++) {
+      ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)),
+                    Slice(RandomString(&rnd, 512 * 1024, 1))));
+    }
+
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+
+    stats_checker->set_verify_next_comp_io_stats(true);
+    std::atomic<bool> first_prepare_write(true);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) {
+          if (first_prepare_write.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_prepare_write.store(false);
+          }
+        });
+
+    std::atomic<bool> first_flush(true);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) {
+          if (first_flush.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_flush.store(false);
+          }
+        });
+
+    std::atomic<bool> first_sync(true);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) {
+          if (first_sync.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_sync.store(false);
+          }
+        });
+
+    std::atomic<bool> first_range_sync(true);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) {
+          if (first_range_sync.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_range_sync.store(false);
+          }
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    Compact(1, smallest_key, largest_key);
+
+    ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats());
+    ASSERT_TRUE(!first_prepare_write.load());
+    ASSERT_TRUE(!first_flush.load());
+    ASSERT_TRUE(!first_sync.load());
+    ASSERT_TRUE(!first_range_sync.load());
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+  ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
+  Random rnd(301);
+  uint64_t key_base = 100000l;
+  // Note: key_base must be multiple of num_keys_per_L0_file
+  int num_keys_per_L0_file = 20;
+  const int kTestScale = 8;  // make sure this is even
+  const int kKeySize = 10;
+  const int kValueSize = 100;
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_L0_file;
+  uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval;
+  uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval;
+  const std::string smallest_key = Key(key_base - 10, kKeySize);
+  const std::string largest_key = Key(largest_key_num + 10, kKeySize);
+
+  // Whenever a compaction completes, this listener will try to
+  // verify whether the returned CompactionJobStats matches
+  // what we expect.
+  auto* stats_checker = new CompactionJobDeletionStatsChecker();
+  Options options;
+  options.listeners.emplace_back(stats_checker);
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = kTestScale + 1;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_multiplier = 2;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Stage 1: Generate several L0 files and then send them to L2 by
+  // using CompactRangeOptions and CompactRange(). These files will
+  // have a strict subset of the keys from the full key-range
+  for (uint64_t start_key = key_base; start_key <= key_base * kTestScale / 2;
+       start_key += key_base) {
+    MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+                           kValueSize, key_interval, compression_ratio, 1);
+  }
+
+  CompactRangeOptions cr_options;
+  cr_options.change_level = true;
+  cr_options.target_level = 2;
+  ASSERT_OK(db_->CompactRange(cr_options, handles_[1], nullptr, nullptr));
+  ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+  // Stage 2: Generate files including keys from the entire key range
+  for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+       start_key += key_base) {
+    MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+                           kValueSize, key_interval, compression_ratio, 1);
+  }
+
+  // Send these L0 files to L1
+  TEST_Compact(0, 1, smallest_key, largest_key);
+  ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+  // Add a new record and flush so now there is a L0 file
+  // with a value too (not just deletions from the next step)
+  ASSERT_OK(Put(1, Key(key_base - 6, kKeySize), "test"));
+  ASSERT_OK(Flush(1));
+
+  // Stage 3: Generate L0 files with some deletions so now
+  // there are files with the same key range in L0, L1, and L2
+  int deletion_interval = 3;
+  CompactionJobStats first_compaction_stats;
+  SelectivelyDeleteKeys(key_base, largest_key_num, key_interval,
+                        deletion_interval, kKeySize, cutoff_key_num,
+                        &first_compaction_stats, 1);
+
+  stats_checker->AddExpectedStats(first_compaction_stats);
+
+  // Stage 4: Trigger compaction and verify the stats
+  TEST_Compact(0, 1, smallest_key, largest_key);
+}
+
+namespace {
+int GetUniversalCompactionInputUnits(uint32_t num_flushes) {
+  uint32_t compaction_input_units;
+  for (compaction_input_units = 1; num_flushes >= compaction_input_units;
+       compaction_input_units *= 2) {
+    if ((num_flushes & compaction_input_units) != 0) {
+      return compaction_input_units > 1 ? compaction_input_units : 0;
+    }
+  }
+  return 0;
+}
+}  // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+  Random rnd(301);
+  uint64_t key_base = 100000000l;
+  // Note: key_base must be multiple of num_keys_per_L0_file
+  int num_keys_per_table = 100;
+  const uint32_t kTestScale = 6;
+  const int kKeySize = 10;
+  const int kValueSize = 900;
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_table;
+
+  auto* stats_checker = new CompactionJobStatsChecker();
+  Options options;
+  options.listeners.emplace_back(stats_checker);
+  options.create_if_missing = true;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = num_keys_per_table * 1000;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 1;
+  options.compaction_options_universal.max_size_amplification_percent = 1000;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Generates the expected CompactionJobStats for each compaction
+  for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+    // Here we treat one newly flushed file as an unit.
+    //
+    // For example, if a newly flushed file is 100k, and a compaction has
+    // 4 input units, then this compaction inputs 400k.
+    uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+    if (num_input_units == 0) {
+      continue;
+    }
+    // A full compaction only happens when the number of flushes equals to
+    // the number of compaction input runs.
+    bool is_full = num_flushes == num_input_units;
+    // The following statement determines the expected smallest key
+    // based on whether it is a full compaction.
+    uint64_t smallest_key = is_full ? key_base : key_base * (num_flushes - 1);
+
+    stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+        Key(smallest_key, 10),
+        Key(smallest_key + key_base * num_input_units - key_interval, 10),
+        num_input_units, num_input_units > 2 ? num_input_units / 2 : 0,
+        num_keys_per_table * num_input_units, kKeySize, kValueSize,
+        num_input_units, num_keys_per_table * num_input_units, 1.0, 0, is_full,
+        false));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U);
+
+  for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+       start_key += key_base) {
+    MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+                           kValueSize, key_interval, compression_ratio, 1);
+    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+  }
+  ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest,
+                        ::testing::Values(1, 4));
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
+
+#else
+
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif  // !defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc
new file mode 100644
index 000000000..c87871100
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_test.cc
@@ -0,0 +1,2451 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_job.h"
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <tuple>
+
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/version_set.h"
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/options_helper.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void VerifyInitializationOfCompactionJobStats(
+    const CompactionJobStats& compaction_job_stats) {
+#if !defined(IOS_CROSS_COMPILE)
+  ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
+  ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
+  ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
+  ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
+
+  ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
+
+  ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
+  ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
+
+  ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
+  ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U);
+
+  ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0);
+  ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0);
+
+  ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U);
+  ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U);
+#endif  // !defined(IOS_CROSS_COMPILE)
+}
+
+// Mock FSWritableFile for testing io priority.
+// Only override the essential functions for testing compaction io priority.
+class MockTestWritableFile : public FSWritableFileOwnerWrapper {
+ public:
+  MockTestWritableFile(std::unique_ptr<FSWritableFile>&& file,
+                       Env::IOPriority io_priority)
+      : FSWritableFileOwnerWrapper(std::move(file)),
+        write_io_priority_(io_priority) {}
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    return target()->Append(data, options, dbg);
+  }
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& verification_info,
+                  IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    return target()->Append(data, options, verification_info, dbg);
+  }
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    return target()->Close(options, dbg);
+  }
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    return target()->Flush(options, dbg);
+  }
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    return target()->Sync(options, dbg);
+  }
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    return target()->Fsync(options, dbg);
+  }
+  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    return target()->GetFileSize(options, dbg);
+  }
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    return target()->RangeSync(offset, nbytes, options, dbg);
+  }
+
+  void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    target()->PrepareWrite(offset, len, options, dbg);
+  }
+
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+    return target()->Allocate(offset, len, options, dbg);
+  }
+
+ private:
+  Env::IOPriority write_io_priority_;
+};
+
+// Mock FSRandomAccessFile for testing io priority.
+// Only override the essential functions for testing compaction io priority.
+class MockTestRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+  MockTestRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
+                           Env::IOPriority io_priority)
+      : FSRandomAccessFileOwnerWrapper(std::move(file)),
+        read_io_priority_(io_priority) {}
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    EXPECT_EQ(options.rate_limiter_priority, read_io_priority_);
+    return target()->Read(offset, n, options, result, scratch, dbg);
+  }
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    EXPECT_EQ(options.rate_limiter_priority, read_io_priority_);
+    return target()->Prefetch(offset, n, options, dbg);
+  }
+
+ private:
+  Env::IOPriority read_io_priority_;
+};
+
+// Mock FileSystem for testing io priority.
+class MockTestFileSystem : public FileSystemWrapper {
+ public:
+  explicit MockTestFileSystem(const std::shared_ptr<FileSystem>& base,
+                              Env::IOPriority read_io_priority,
+                              Env::IOPriority write_io_priority)
+      : FileSystemWrapper(base),
+        read_io_priority_(read_io_priority),
+        write_io_priority_(write_io_priority) {}
+
+  static const char* kClassName() { return "MockTestFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
+    EXPECT_OK(s);
+    result->reset(
+        new MockTestRandomAccessFile(std::move(*result), read_io_priority_));
+    return s;
+  }
+  IOStatus NewWritableFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override {
+    IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
+    EXPECT_OK(s);
+    result->reset(
+        new MockTestWritableFile(std::move(*result), write_io_priority_));
+    return s;
+  }
+
+ private:
+  Env::IOPriority read_io_priority_;
+  Env::IOPriority write_io_priority_;
+};
+
+enum TableTypeForTest : uint8_t { kMockTable = 0, kBlockBasedTable = 1 };
+
+}  // namespace
+
+class CompactionJobTestBase : public testing::Test {
+ protected:
+  CompactionJobTestBase(std::string dbname, const Comparator* ucmp,
+                        std::function<std::string(uint64_t)> encode_u64_ts,
+                        bool test_io_priority, TableTypeForTest table_type)
+      : dbname_(std::move(dbname)),
+        ucmp_(ucmp),
+        db_options_(),
+        mutable_cf_options_(cf_options_),
+        mutable_db_options_(),
+        table_cache_(NewLRUCache(50000, 16)),
+        write_buffer_manager_(db_options_.db_write_buffer_size),
+        versions_(new VersionSet(
+            dbname_, &db_options_, env_options_, table_cache_.get(),
+            &write_buffer_manager_, &write_controller_,
+            /*block_cache_tracer=*/nullptr,
+            /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ "")),
+        shutting_down_(false),
+        mock_table_factory_(new mock::MockTableFactory()),
+        error_handler_(nullptr, db_options_, &mutex_),
+        encode_u64_ts_(std::move(encode_u64_ts)),
+        test_io_priority_(test_io_priority),
+        table_type_(table_type) {
+    Env* base_env = Env::Default();
+    EXPECT_OK(
+        test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+    env_ = base_env;
+    fs_ = env_->GetFileSystem();
+    // set default for the tests
+    mutable_cf_options_.target_file_size_base = 1024 * 1024;
+    mutable_cf_options_.max_compaction_bytes = 10 * 1024 * 1024;
+  }
+
+  void SetUp() override {
+    EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+    db_options_.env = env_;
+    db_options_.fs = fs_;
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    cf_options_.comparator = ucmp_;
+    if (table_type_ == TableTypeForTest::kBlockBasedTable) {
+      BlockBasedTableOptions table_options;
+      cf_options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    } else if (table_type_ == TableTypeForTest::kMockTable) {
+      cf_options_.table_factory = mock_table_factory_;
+    } else {
+      assert(false);
+    }
+  }
+
+  std::string GenerateFileName(uint64_t file_number) {
+    FileMetaData meta;
+    std::vector<DbPath> db_paths;
+    db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+    meta.fd = FileDescriptor(file_number, 0, 0);
+    return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+  }
+
+  std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num,
+                     const ValueType t, uint64_t ts = 0) {
+    std::string user_key_with_ts = user_key + encode_u64_ts_(ts);
+    return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString();
+  }
+
+  static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+                             uint64_t size) {
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+                          kNoCompression);
+    return blob_index;
+  }
+
+  static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
+                                uint64_t size, uint64_t expiration) {
+    std::string blob_index;
+    BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+                             size, kNoCompression);
+    return blob_index;
+  }
+
+  static std::string BlobStrInlinedTTL(const Slice& value,
+                                       uint64_t expiration) {
+    std::string blob_index;
+    BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+    return blob_index;
+  }
+
+  // Creates a table with the specificied key value pairs.
+  void CreateTable(const std::string& table_name,
+                   const mock::KVVector& contents, uint64_t& file_size) {
+    std::unique_ptr<WritableFileWriter> file_writer;
+    Status s = WritableFileWriter::Create(fs_, table_name, FileOptions(),
+                                          &file_writer, nullptr);
+    ASSERT_OK(s);
+    std::unique_ptr<TableBuilder> table_builder(
+        cf_options_.table_factory->NewTableBuilder(
+            TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_,
+                                cfd_->internal_comparator(),
+                                cfd_->int_tbl_prop_collector_factories(),
+                                CompressionType::kNoCompression,
+                                CompressionOptions(), 0 /* column_family_id */,
+                                kDefaultColumnFamilyName, -1 /* level */),
+            file_writer.get()));
+    // Build table.
+    for (auto kv : contents) {
+      std::string key;
+      std::string value;
+      std::tie(key, value) = kv;
+      table_builder->Add(key, value);
+    }
+    ASSERT_OK(table_builder->Finish());
+    file_size = table_builder->FileSize();
+  }
+
+  void AddMockFile(const mock::KVVector& contents, int level = 0) {
+    assert(contents.size() > 0);
+
+    bool first_key = true;
+    std::string smallest, largest;
+    InternalKey smallest_key, largest_key;
+    SequenceNumber smallest_seqno = kMaxSequenceNumber;
+    SequenceNumber largest_seqno = 0;
+    uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+    for (auto kv : contents) {
+      ParsedInternalKey key;
+      std::string skey;
+      std::string value;
+      std::tie(skey, value) = kv;
+      const Status pik_status =
+          ParseInternalKey(skey, &key, true /* log_err_key */);
+
+      smallest_seqno = std::min(smallest_seqno, key.sequence);
+      largest_seqno = std::max(largest_seqno, key.sequence);
+
+      if (first_key ||
+          cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) {
+        smallest.assign(key.user_key.data(), key.user_key.size());
+        smallest_key.DecodeFrom(skey);
+      }
+      if (first_key ||
+          cfd_->user_comparator()->Compare(key.user_key, largest) > 0) {
+        largest.assign(key.user_key.data(), key.user_key.size());
+        largest_key.DecodeFrom(skey);
+      }
+
+      first_key = false;
+
+      if (pik_status.ok() && key.type == kTypeBlobIndex) {
+        BlobIndex blob_index;
+        const Status s = blob_index.DecodeFrom(value);
+        if (!s.ok()) {
+          continue;
+        }
+
+        if (blob_index.IsInlined() || blob_index.HasTTL() ||
+            blob_index.file_number() == kInvalidBlobFileNumber) {
+          continue;
+        }
+
+        if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+            oldest_blob_file_number > blob_index.file_number()) {
+          oldest_blob_file_number = blob_index.file_number();
+        }
+      }
+    }
+
+    uint64_t file_number = versions_->NewFileNumber();
+
+    uint64_t file_size = 0;
+    if (table_type_ == TableTypeForTest::kBlockBasedTable) {
+      CreateTable(GenerateFileName(file_number), contents, file_size);
+    } else if (table_type_ == TableTypeForTest::kMockTable) {
+      file_size = 10;
+      EXPECT_OK(mock_table_factory_->CreateMockTable(
+          env_, GenerateFileName(file_number), std::move(contents)));
+    } else {
+      assert(false);
+    }
+
+    VersionEdit edit;
+    edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key,
+                 smallest_seqno, largest_seqno, false, Temperature::kUnknown,
+                 oldest_blob_file_number, kUnknownOldestAncesterTime,
+                 kUnknownFileCreationTime, kUnknownFileChecksum,
+                 kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+    mutex_.Lock();
+    EXPECT_OK(
+        versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                               mutable_cf_options_, &edit, &mutex_, nullptr));
+    mutex_.Unlock();
+  }
+
+  void VerifyTables(int output_level,
+                    const std::vector<mock::KVVector>& expected_results,
+                    std::vector<uint64_t> expected_oldest_blob_file_numbers) {
+    if (expected_results.empty()) {
+      ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+      return;
+    }
+    int expected_output_file_num = 0;
+    for (const auto& e : expected_results) {
+      if (!e.empty()) {
+        ++expected_output_file_num;
+      }
+    }
+    ASSERT_EQ(expected_output_file_num, compaction_job_stats_.num_output_files);
+    if (expected_output_file_num == 0) {
+      return;
+    }
+
+    if (expected_oldest_blob_file_numbers.empty()) {
+      expected_oldest_blob_file_numbers.resize(expected_output_file_num,
+                                               kInvalidBlobFileNumber);
+    }
+
+    auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+    if (table_type_ == TableTypeForTest::kMockTable) {
+      ASSERT_EQ(compaction_job_stats_.num_output_files,
+                expected_results.size());
+      mock_table_factory_->AssertLatestFiles(expected_results);
+    } else {
+      assert(table_type_ == TableTypeForTest::kBlockBasedTable);
+    }
+
+    auto output_files =
+        cfd->current()->storage_info()->LevelFiles(output_level);
+    ASSERT_EQ(expected_output_file_num, output_files.size());
+
+    if (table_type_ == TableTypeForTest::kMockTable) {
+      assert(output_files.size() ==
+             static_cast<size_t>(expected_output_file_num));
+      const FileMetaData* const output_file = output_files[0];
+      ASSERT_EQ(output_file->oldest_blob_file_number,
+                expected_oldest_blob_file_numbers[0]);
+      return;
+    }
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+      const FileMetaData* const output_file = output_files[i];
+      std::string file_name = GenerateFileName(output_file->fd.GetNumber());
+      const auto& fs = env_->GetFileSystem();
+      std::unique_ptr<RandomAccessFileReader> freader;
+      IOStatus ios = RandomAccessFileReader::Create(
+          fs, file_name, FileOptions(), &freader, nullptr);
+      ASSERT_OK(ios);
+      std::unique_ptr<TableReader> table_reader;
+      uint64_t file_size = output_file->fd.GetFileSize();
+      ReadOptions read_opts;
+      Status s = cf_options_.table_factory->NewTableReader(
+          read_opts,
+          TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
+                             cfd_->internal_comparator()),
+          std::move(freader), file_size, &table_reader, false);
+      ASSERT_OK(s);
+      assert(table_reader);
+      std::unique_ptr<InternalIterator> iiter(
+          table_reader->NewIterator(read_opts, nullptr, nullptr, true,
+                                    TableReaderCaller::kUncategorized));
+      assert(iiter);
+
+      mock::KVVector from_db;
+      for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) {
+        const Slice key = iiter->key();
+        const Slice value = iiter->value();
+        from_db.emplace_back(
+            make_pair(key.ToString(false), value.ToString(false)));
+      }
+      ASSERT_EQ(expected_results[i], from_db);
+    }
+  }
+
+  void SetLastSequence(const SequenceNumber sequence_number) {
+    versions_->SetLastAllocatedSequence(sequence_number + 1);
+    versions_->SetLastPublishedSequence(sequence_number + 1);
+    versions_->SetLastSequence(sequence_number + 1);
+  }
+
+  // returns expected result after compaction
+  mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) {
+    stl_wrappers::KVMap expected_results;
+    constexpr int kKeysPerFile = 10000;
+    constexpr int kCorruptKeysPerFile = 200;
+    constexpr int kMatchingKeys = kKeysPerFile / 2;
+    SequenceNumber sequence_number = 0;
+
+    auto corrupt_id = [&](int id) {
+      return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile;
+    };
+
+    for (int i = 0; i < 2; ++i) {
+      auto contents = mock::MakeMockFile();
+      for (int k = 0; k < kKeysPerFile; ++k) {
+        auto key = std::to_string(i * kMatchingKeys + k);
+        auto value = std::to_string(i * kKeysPerFile + k);
+        InternalKey internal_key(key, ++sequence_number, kTypeValue);
+
+        // This is how the key will look like once it's written in bottommost
+        // file
+        InternalKey bottommost_internal_key(key, 0, kTypeValue);
+
+        if (corrupt_id(k)) {
+          test::CorruptKeyType(&internal_key);
+          test::CorruptKeyType(&bottommost_internal_key);
+        }
+        contents.push_back({internal_key.Encode().ToString(), value});
+        if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) {
+          expected_results.insert(
+              {bottommost_internal_key.Encode().ToString(), value});
+        }
+      }
+      mock::SortKVVector(&contents, ucmp_);
+
+      AddMockFile(contents);
+    }
+
+    SetLastSequence(sequence_number);
+
+    mock::KVVector expected_results_kvvector;
+    for (auto& kv : expected_results) {
+      expected_results_kvvector.push_back({kv.first, kv.second});
+    }
+
+    return expected_results_kvvector;
+  }
+
+  void NewDB() {
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+    EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+
+    std::shared_ptr<Logger> info_log;
+    DBOptions db_opts = BuildDBOptions(db_options_, mutable_db_options_);
+    Status s = CreateLoggerFromOptions(dbname_, db_opts, &info_log);
+    ASSERT_OK(s);
+    db_options_.info_log = info_log;
+
+    versions_.reset(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    compaction_job_stats_.Reset();
+    ASSERT_OK(SetIdentityFile(env_, dbname_));
+
+    VersionEdit new_db;
+    new_db.SetLogNumber(0);
+    new_db.SetNextFile(2);
+    new_db.SetLastSequence(0);
+
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    std::unique_ptr<WritableFileWriter> file_writer;
+    const auto& fs = env_->GetFileSystem();
+    s = WritableFileWriter::Create(fs, manifest,
+                                   fs->OptimizeForManifestWrite(env_options_),
+                                   &file_writer, nullptr);
+
+    ASSERT_OK(s);
+    {
+      log::Writer log(std::move(file_writer), 0, false);
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = log.AddRecord(record);
+    }
+    ASSERT_OK(s);
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+
+    ASSERT_OK(s);
+
+    cf_options_.merge_operator = merge_op_;
+    cf_options_.compaction_filter = compaction_filter_.get();
+    std::vector<ColumnFamilyDescriptor> column_families;
+    column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+    ASSERT_OK(versions_->Recover(column_families, false));
+    cfd_ = versions_->GetColumnFamilySet()->GetDefault();
+  }
+
+  // input_files[i] on input_levels[i]
+  void RunLastLevelCompaction(
+      const std::vector<std::vector<FileMetaData*>>& input_files,
+      const std::vector<int> input_levels,
+      std::function<void(Compaction& comp)>&& verify_func,
+      const std::vector<SequenceNumber>& snapshots = {}) {
+    const int kLastLevel = cf_options_.num_levels - 1;
+    verify_per_key_placement_ = std::move(verify_func);
+    mock::KVVector empty_map;
+    RunCompaction(input_files, input_levels, {empty_map}, snapshots,
+                  kMaxSequenceNumber, kLastLevel, false);
+  }
+
+  // input_files[i] on input_levels[i]
+  void RunCompaction(
+      const std::vector<std::vector<FileMetaData*>>& input_files,
+      const std::vector<int>& input_levels,
+      const std::vector<mock::KVVector>& expected_results,
+      const std::vector<SequenceNumber>& snapshots = {},
+      SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+      int output_level = 1, bool verify = true,
+      std::vector<uint64_t> expected_oldest_blob_file_numbers = {},
+      bool check_get_priority = false,
+      Env::IOPriority read_io_priority = Env::IO_TOTAL,
+      Env::IOPriority write_io_priority = Env::IO_TOTAL,
+      int max_subcompactions = 0) {
+    // For compaction, set fs as MockTestFileSystem to check the io_priority.
+    if (test_io_priority_) {
+      db_options_.fs.reset(
+          new MockTestFileSystem(fs_, read_io_priority, write_io_priority));
+    }
+
+    auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+    size_t num_input_files = 0;
+    std::vector<CompactionInputFiles> compaction_input_files;
+    for (size_t i = 0; i < input_files.size(); ++i) {
+      auto level_files = input_files[i];
+      CompactionInputFiles compaction_level;
+      compaction_level.level = input_levels[i];
+      compaction_level.files.insert(compaction_level.files.end(),
+                                    level_files.begin(), level_files.end());
+      compaction_input_files.push_back(compaction_level);
+      num_input_files += level_files.size();
+    }
+
+    std::vector<FileMetaData*> grandparents;
+    // it should actually be the next non-empty level
+    const int kGrandparentsLevel = output_level + 1;
+    if (kGrandparentsLevel < cf_options_.num_levels) {
+      grandparents =
+          cfd_->current()->storage_info()->LevelFiles(kGrandparentsLevel);
+    }
+
+    Compaction compaction(
+        cfd->current()->storage_info(), *cfd->ioptions(),
+        *cfd->GetLatestMutableCFOptions(), mutable_db_options_,
+        compaction_input_files, output_level,
+        mutable_cf_options_.target_file_size_base,
+        mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
+        cfd->GetLatestMutableCFOptions()->compression_opts,
+        Temperature::kUnknown, max_subcompactions, grandparents, true);
+    compaction.SetInputVersion(cfd->current());
+
+    assert(db_options_.info_log);
+    LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+    mutex_.Lock();
+    EventLogger event_logger(db_options_.info_log.get());
+    // TODO(yiwu) add a mock snapshot checker and add test for it.
+    SnapshotChecker* snapshot_checker = nullptr;
+    ASSERT_TRUE(full_history_ts_low_.empty() ||
+                ucmp_->timestamp_size() == full_history_ts_low_.size());
+    const std::atomic<bool> kManualCompactionCanceledFalse{false};
+    CompactionJob compaction_job(
+        0, &compaction, db_options_, mutable_db_options_, env_options_,
+        versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
+        nullptr, nullptr, &mutex_, &error_handler_, snapshots,
+        earliest_write_conflict_snapshot, snapshot_checker, nullptr,
+        table_cache_, &event_logger, false, false, dbname_,
+        &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
+        /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+        env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr),
+        full_history_ts_low_);
+    VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
+
+    compaction_job.Prepare();
+    mutex_.Unlock();
+    Status s = compaction_job.Run();
+    ASSERT_OK(s);
+    ASSERT_OK(compaction_job.io_status());
+    mutex_.Lock();
+    ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions()));
+    ASSERT_OK(compaction_job.io_status());
+    mutex_.Unlock();
+    log_buffer.FlushBufferToLog();
+
+    if (verify) {
+      ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+      ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+
+      VerifyTables(output_level, expected_results,
+                   expected_oldest_blob_file_numbers);
+    }
+
+    if (check_get_priority) {
+      CheckGetRateLimiterPriority(compaction_job);
+    }
+
+    if (verify_per_key_placement_) {
+      // Verify per_key_placement compaction
+      assert(compaction.SupportsPerKeyPlacement());
+      verify_per_key_placement_(compaction);
+    }
+  }
+
+  void CheckGetRateLimiterPriority(CompactionJob& compaction_job) {
+    // When the state from WriteController is normal.
+    ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_LOW);
+
+    WriteController* write_controller =
+        compaction_job.versions_->GetColumnFamilySet()->write_controller();
+
+    {
+      // When the state from WriteController is Delayed.
+      std::unique_ptr<WriteControllerToken> delay_token =
+          write_controller->GetDelayToken(1000000);
+      ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
+    }
+
+    {
+      // When the state from WriteController is Stopped.
+      std::unique_ptr<WriteControllerToken> stop_token =
+          write_controller->GetStopToken();
+      ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
+    }
+  }
+
+  std::shared_ptr<Env> env_guard_;
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  std::string dbname_;
+  const Comparator* const ucmp_;
+  EnvOptions env_options_;
+  ImmutableDBOptions db_options_;
+  ColumnFamilyOptions cf_options_;
+  MutableCFOptions mutable_cf_options_;
+  MutableDBOptions mutable_db_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  WriteBufferManager write_buffer_manager_;
+  std::unique_ptr<VersionSet> versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+  CompactionJobStats compaction_job_stats_;
+  ColumnFamilyData* cfd_;
+  std::unique_ptr<CompactionFilter> compaction_filter_;
+  std::shared_ptr<MergeOperator> merge_op_;
+  ErrorHandler error_handler_;
+  std::string full_history_ts_low_;
+  const std::function<std::string(uint64_t)> encode_u64_ts_;
+  const bool test_io_priority_;
+  std::function<void(Compaction& comp)> verify_per_key_placement_;
+  const TableTypeForTest table_type_ = kMockTable;
+};
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest : public CompactionJobTestBase {
+ public:
+  CompactionJobTest()
+      : CompactionJobTestBase(
+            test::PerThreadDBPath("compaction_job_test"), BytewiseComparator(),
+            [](uint64_t /*ts*/) { return ""; }, /*test_io_priority=*/false,
+            TableTypeForTest::kMockTable) {}
+};
+
+TEST_F(CompactionJobTest, Simple) {
+  NewDB();
+
+  auto expected_results = CreateTwoFiles(false);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  constexpr int input_level = 0;
+  auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+  ASSERT_EQ(2U, files.size());
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, DISABLED_SimpleCorrupted) {
+  NewDB();
+
+  auto expected_results = CreateTwoFiles(true);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  constexpr int input_level = 0;
+  auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+  ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
+}
+
+TEST_F(CompactionJobTest, SimpleDeletion) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""},
+                                   {KeyStr("c", 3U, kTypeValue), "val"}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"},
+                                   {KeyStr("b", 1U, kTypeValue), "val"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}});
+
+  SetLastSequence(4U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, OutputNothing) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}});
+
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}});
+
+  AddMockFile(file2);
+
+  auto expected_results = mock::MakeMockFile();
+
+  SetLastSequence(4U);
+
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleOverwrite) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 3U, kTypeValue), "val2"},
+      {KeyStr("b", 4U, kTypeValue), "val3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+                                   {KeyStr("b", 2U, kTypeValue), "val"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"},
+                          {KeyStr("b", 0U, kTypeValue), "val3"}});
+
+  SetLastSequence(4U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleNonLastLevel) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeValue), "val2"},
+      {KeyStr("b", 6U, kTypeValue), "val3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+                                   {KeyStr("b", 4U, kTypeValue), "val"}});
+  AddMockFile(file2, 1);
+
+  auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+                                   {KeyStr("b", 2U, kTypeValue), "val"}});
+  AddMockFile(file3, 2);
+
+  // Because level 1 is not the last level, the sequence numbers of a and b
+  // cannot be set to 0
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+                          {KeyStr("b", 6U, kTypeValue), "val3"}});
+
+  SetLastSequence(6U);
+  const std::vector<int> input_levels = {0, 1};
+  auto lvl0_files =
+      cfd_->current()->storage_info()->LevelFiles(input_levels[0]);
+  auto lvl1_files =
+      cfd_->current()->storage_info()->LevelFiles(input_levels[1]);
+  RunCompaction({lvl0_files, lvl1_files}, input_levels, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleMerge) {
+  merge_op_ = MergeOperators::CreateStringAppendOperator();
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeMerge), "5"},
+      {KeyStr("a", 4U, kTypeMerge), "4"},
+      {KeyStr("a", 3U, kTypeValue), "3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile(
+      {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+                          {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+  SetLastSequence(5U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, NonAssocMerge) {
+  merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeMerge), "5"},
+      {KeyStr("a", 4U, kTypeMerge), "4"},
+      {KeyStr("a", 3U, kTypeMerge), "3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile(
+      {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+                          {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+  SetLastSequence(5U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+// Filters merge operands with value 10.
+TEST_F(CompactionJobTest, MergeOperandFilter) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  compaction_filter_.reset(new test::FilterNumber(10U));
+  NewDB();
+
+  auto file1 = mock::MakeMockFile(
+      {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+       {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)},  // Filtered
+       {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
+      {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}  // Filtered
+  });
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
+                          {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}});
+
+  SetLastSequence(5U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  compaction_filter_.reset(new test::FilterNumber(10U));
+  NewDB();
+
+  auto file1 = mock::MakeMockFile(
+      {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+       {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)},  // Filtered
+       {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
+       {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
+                          {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
+                          {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
+  AddMockFile(file2);
+
+  auto file3 =
+      mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
+  AddMockFile(file3, 2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
+      {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
+      {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
+      // b does not appear because the operands are filtered
+  });
+
+  SetLastSequence(5U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+// Test where all operands/merge results are filtered out.
+TEST_F(CompactionJobTest, FilterAllMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  compaction_filter_.reset(new test::FilterNumber(10U));
+  NewDB();
+
+  auto file1 =
+      mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file2);
+
+  auto file3 =
+      mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file3, 2);
+
+  SetLastSequence(11U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+  mock::KVVector empty_map;
+  RunCompaction({files}, {input_level}, {empty_map});
+}
+
+TEST_F(CompactionJobTest, SimpleSingleDelete) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeDeletion), ""},
+      {KeyStr("b", 6U, kTypeSingleDeletion), ""},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+                                   {KeyStr("b", 4U, kTypeValue), "val"}});
+  AddMockFile(file2);
+
+  auto file3 = mock::MakeMockFile({
+      {KeyStr("a", 1U, kTypeValue), "val"},
+  });
+  AddMockFile(file3, 2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}});
+
+  SetLastSequence(6U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SingleDeleteSnapshots) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+      {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+      {KeyStr("d", 9U, kTypeSingleDeletion), ""},
+      {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+      {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("j", 9U, kTypeSingleDeletion), ""},
+      {KeyStr("k", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("l", 3U, kTypeSingleDeletion), ""},
+      {KeyStr("l", 2U, kTypeSingleDeletion), ""},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("0", 2U, kTypeSingleDeletion), ""},
+      {KeyStr("a", 11U, kTypeValue), "val1"},
+      {KeyStr("b", 11U, kTypeValue), "val2"},
+      {KeyStr("c", 21U, kTypeValue), "val3"},
+      {KeyStr("d", 8U, kTypeValue), "val4"},
+      {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+      {KeyStr("f", 1U, kTypeValue), "val1"},
+      {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("h", 2U, kTypeSingleDeletion), ""},
+      {KeyStr("m", 12U, kTypeValue), "val1"},
+      {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("m", 8U, kTypeValue), "val2"},
+  });
+  AddMockFile(file2);
+
+  auto file3 = mock::MakeMockFile({
+      {KeyStr("A", 1U, kTypeValue), "val"},
+      {KeyStr("e", 1U, kTypeValue), "val"},
+  });
+  AddMockFile(file3, 2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("a", 11U, kTypeValue), ""},
+      {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+      {KeyStr("b", 11U, kTypeValue), "val2"},
+      {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+      {KeyStr("c", 21U, kTypeValue), ""},
+      {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+      {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+      {KeyStr("f", 1U, kTypeValue), "val1"},
+      {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("m", 12U, kTypeValue), "val1"},
+      {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("m", 8U, kTypeValue), "val2"},
+  });
+
+  SetLastSequence(22U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U}, 10U);
+}
+
+TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) {
+  NewDB();
+
+  // Test multiple snapshots where the earliest snapshot is not a
+  // write-conflic-snapshot.
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 23U, kTypeValue), "val"},
+      {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 23U, kTypeValue), "val"},
+      {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 31U, kTypeValue), "val"},
+      {KeyStr("G", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 23U, kTypeValue), "val2"},
+      {KeyStr("H", 31U, kTypeValue), "val"},
+      {KeyStr("H", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("H", 23U, kTypeValue), "val"},
+      {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 34U, kTypeValue), "val2"},
+      {KeyStr("I", 33U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 32U, kTypeValue), "val3"},
+      {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 34U, kTypeValue), "val"},
+      {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 25U, kTypeValue), "val2"},
+      {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 13U, kTypeValue), "val2"},
+      {KeyStr("C", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("C", 13U, kTypeValue), "val"},
+      {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+      {KeyStr("F", 3U, kTypeValue), "val"},
+      {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 13U, kTypeValue), "val3"},
+      {KeyStr("H", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("H", 13U, kTypeValue), "val2"},
+      {KeyStr("I", 13U, kTypeValue), "val4"},
+      {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 11U, kTypeValue), "val5"},
+      {KeyStr("J", 15U, kTypeValue), "val3"},
+      {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+  });
+  AddMockFile(file2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 23U, kTypeValue), ""},
+      {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 23U, kTypeValue), ""},
+      {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 31U, kTypeValue), ""},
+      {KeyStr("H", 31U, kTypeValue), "val"},
+      {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 34U, kTypeValue), ""},
+      {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 13U, kTypeValue), "val4"},
+      {KeyStr("J", 34U, kTypeValue), "val"},
+      {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 25U, kTypeValue), "val2"},
+      {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 15U, kTypeValue), "val3"},
+      {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+  });
+
+  SetLastSequence(24U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U, 30U},
+                20U);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteZeroSeq) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("A", 10U, kTypeSingleDeletion), ""},
+      {KeyStr("dummy", 5U, kTypeValue), "val2"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("A", 0U, kTypeValue), "val"},
+  });
+  AddMockFile(file2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("dummy", 0U, kTypeValue), "val2"},
+  });
+
+  SetLastSequence(22U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results}, {});
+}
+
+TEST_F(CompactionJobTest, MultiSingleDelete) {
+  // Tests three scenarios involving multiple single delete/put pairs:
+  //
+  // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel
+  // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot
+  // C: SDel Put SDel Snapshot Put -> Snapshot Put
+  // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel
+  // E: Put SDel Snapshot Put SDel -> Snapshot SDel
+  // F: Put SDel Put Sdel Snapshot -> removed
+  // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel
+  // H: (Put) Put SDel Put Sdel Snapshot -> Removed
+  // I: (Put) Snapshot Put SDel Put SDel -> SDel
+  // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put
+  //      -> Snapshot Put
+  // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel
+  //      -> Snapshot Put Snapshot SDel
+  // L: SDel Put SDel Put SDel Snapshot SDel Put SDel SDel Put SDel
+  //      -> Snapshot SDel Put SDel
+  // M: (Put) SDel Put SDel Put SDel Snapshot Put SDel SDel Put SDel SDel
+  //      -> SDel Snapshot Put SDel
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 13U, kTypeValue), "val5"},
+      {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 13U, kTypeValue), "val2"},
+      {KeyStr("C", 14U, kTypeValue), "val3"},
+      {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("D", 11U, kTypeValue), "val4"},
+      {KeyStr("G", 15U, kTypeValue), "val"},
+      {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("G", 13U, kTypeValue), "val"},
+      {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 13U, kTypeValue), "val"},
+      {KeyStr("J", 15U, kTypeValue), "val"},
+      {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 13U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 12U, kTypeValue), "val"},
+      {KeyStr("J", 11U, kTypeValue), "val"},
+      {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+      {KeyStr("K", 15U, kTypeValue), "val1"},
+      {KeyStr("K", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("K", 13U, kTypeSingleDeletion), ""},
+      {KeyStr("K", 12U, kTypeValue), "val2"},
+      {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 15U, kTypeValue), "val"},
+      {KeyStr("L", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 13U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 12U, kTypeValue), "val"},
+      {KeyStr("L", 11U, kTypeSingleDeletion), ""},
+      {KeyStr("M", 16U, kTypeSingleDeletion), ""},
+      {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+      {KeyStr("M", 14U, kTypeValue), "val"},
+      {KeyStr("M", 13U, kTypeSingleDeletion), ""},
+      {KeyStr("M", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("M", 11U, kTypeValue), "val"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("A", 10U, kTypeValue), "val"},
+      {KeyStr("B", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 11U, kTypeValue), "val2"},
+      {KeyStr("C", 10U, kTypeSingleDeletion), ""},
+      {KeyStr("C", 9U, kTypeValue), "val6"},
+      {KeyStr("C", 8U, kTypeSingleDeletion), ""},
+      {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+      {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("E", 11U, kTypeValue), "val"},
+      {KeyStr("E", 5U, kTypeSingleDeletion), ""},
+      {KeyStr("E", 4U, kTypeValue), "val"},
+      {KeyStr("F", 6U, kTypeSingleDeletion), ""},
+      {KeyStr("F", 5U, kTypeValue), "val"},
+      {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+      {KeyStr("F", 3U, kTypeValue), "val"},
+      {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("H", 6U, kTypeSingleDeletion), ""},
+      {KeyStr("H", 5U, kTypeValue), "val"},
+      {KeyStr("H", 4U, kTypeSingleDeletion), ""},
+      {KeyStr("H", 3U, kTypeValue), "val"},
+      {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("I", 11U, kTypeValue), "val"},
+      {KeyStr("J", 6U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 5U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 4U, kTypeValue), "val"},
+      {KeyStr("J", 3U, kTypeSingleDeletion), ""},
+      {KeyStr("J", 2U, kTypeValue), "val"},
+      {KeyStr("K", 8U, kTypeValue), "val3"},
+      {KeyStr("K", 7U, kTypeValue), "val4"},
+      {KeyStr("K", 6U, kTypeSingleDeletion), ""},
+      {KeyStr("K", 5U, kTypeValue), "val5"},
+      {KeyStr("K", 2U, kTypeSingleDeletion), ""},
+      {KeyStr("K", 1U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 5U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 4U, kTypeValue), "val"},
+      {KeyStr("L", 3U, kTypeSingleDeletion), ""},
+      {KeyStr("L", 2U, kTypeValue), "val"},
+      {KeyStr("L", 1U, kTypeSingleDeletion), ""},
+      {KeyStr("M", 10U, kTypeSingleDeletion), ""},
+      {KeyStr("M", 7U, kTypeValue), "val"},
+      {KeyStr("M", 5U, kTypeSingleDeletion), ""},
+      {KeyStr("M", 4U, kTypeValue), "val"},
+      {KeyStr("M", 3U, kTypeSingleDeletion), ""},
+  });
+  AddMockFile(file2);
+
+  auto file3 = mock::MakeMockFile({
+      {KeyStr("D", 1U, kTypeValue), "val"},
+      {KeyStr("H", 1U, kTypeValue), "val"},
+      {KeyStr("I", 2U, kTypeValue), "val"},
+  });
+  AddMockFile(file3, 2);
+
+  auto file4 = mock::MakeMockFile({
+      {KeyStr("M", 1U, kTypeValue), "val"},
+  });
+  AddMockFile(file4, 2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""},
+                          {KeyStr("A", 13U, kTypeValue), ""},
+                          {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+                          {KeyStr("A", 10U, kTypeValue), "val"},
+                          {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+                          {KeyStr("B", 13U, kTypeValue), ""},
+                          {KeyStr("C", 14U, kTypeValue), "val3"},
+                          {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+                          {KeyStr("D", 11U, kTypeValue), ""},
+                          {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+                          {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+                          {KeyStr("E", 11U, kTypeValue), ""},
+                          {KeyStr("G", 15U, kTypeValue), "val"},
+                          {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+                          {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+                          {KeyStr("I", 13U, kTypeValue), ""},
+                          {KeyStr("J", 15U, kTypeValue), "val"},
+                          {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+                          {KeyStr("K", 15U, kTypeValue), ""},
+                          {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+                          {KeyStr("K", 8U, kTypeValue), "val3"},
+                          {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+                          {KeyStr("L", 15U, kTypeValue), ""},
+                          {KeyStr("L", 11U, kTypeSingleDeletion), ""},
+                          {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+                          {KeyStr("M", 14U, kTypeValue), ""},
+                          {KeyStr("M", 3U, kTypeSingleDeletion), ""}});
+
+  SetLastSequence(22U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results}, {10U}, 10U);
+}
+
+// This test documents the behavior where a corrupt key follows a deletion or a
+// single deletion and the (single) deletion gets removed while the corrupt key
+// gets written out. TODO(noetzli): We probably want a better way to treat
+// corrupt keys.
+TEST_F(CompactionJobTest, DISABLED_CorruptionAfterDeletion) {
+  NewDB();
+
+  auto file1 =
+      mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"},
+                          {test::KeyStr("a", 5U, kTypeDeletion), ""},
+                          {test::KeyStr("a", 4U, kTypeValue, true), "val"}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""},
+                          {test::KeyStr("b", 2U, kTypeValue, true), "val"},
+                          {test::KeyStr("c", 1U, kTypeValue), "val2"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"},
+                          {test::KeyStr("a", 0U, kTypeValue, true), "val"},
+                          {test::KeyStr("b", 0U, kTypeValue, true), "val"},
+                          {test::KeyStr("c", 0U, kTypeValue), "val2"}});
+
+  SetLastSequence(6U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, OldestBlobFileNumber) {
+  NewDB();
+
+  // Note: blob1 is inlined TTL, so it will not be considered for the purposes
+  // of identifying the oldest referenced blob file. Similarly, blob6 will be
+  // ignored because it has TTL and hence refers to a TTL blob file.
+  const stl_wrappers::KVMap::value_type blob1(
+      KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL));
+  const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex),
+                                              BlobStr(59, 123456, 999));
+  const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex),
+                                              BlobStr(138, 1000, 1 << 8));
+  auto file1 = mock::MakeMockFile({blob1, blob2, blob3});
+  AddMockFile(file1);
+
+  const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex),
+                                              BlobStr(199, 3 << 10, 1 << 20));
+  const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex),
+                                              BlobStr(19, 6789, 333));
+  const stl_wrappers::KVMap::value_type blob6(
+      KeyStr("f", 6U, kTypeBlobIndex),
+      BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL));
+  auto file2 = mock::MakeMockFile({blob4, blob5, blob6});
+  AddMockFile(file2);
+
+  const stl_wrappers::KVMap::value_type expected_blob1(
+      KeyStr("a", 0U, kTypeBlobIndex), blob1.second);
+  const stl_wrappers::KVMap::value_type expected_blob2(
+      KeyStr("b", 0U, kTypeBlobIndex), blob2.second);
+  const stl_wrappers::KVMap::value_type expected_blob3(
+      KeyStr("c", 0U, kTypeBlobIndex), blob3.second);
+  const stl_wrappers::KVMap::value_type expected_blob4(
+      KeyStr("d", 0U, kTypeBlobIndex), blob4.second);
+  const stl_wrappers::KVMap::value_type expected_blob5(
+      KeyStr("e", 0U, kTypeBlobIndex), blob5.second);
+  const stl_wrappers::KVMap::value_type expected_blob6(
+      KeyStr("f", 0U, kTypeBlobIndex), blob6.second);
+  auto expected_results =
+      mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3,
+                          expected_blob4, expected_blob5, expected_blob6});
+
+  SetLastSequence(6U);
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results},
+                std::vector<SequenceNumber>(), kMaxSequenceNumber,
+                /* output_level */ 1, /* verify */ true,
+                /* expected_oldest_blob_file_numbers */ {19});
+}
+
+TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
+  cf_options_.bottommost_temperature = Temperature::kCold;
+  SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+        auto supports_per_key_placement = static_cast<bool*>(arg);
+        *supports_per_key_placement = true;
+      });
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  NewDB();
+
+  // Add files on different levels that may overlap
+  auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}});
+  AddMockFile(file0_1);
+
+  auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"},
+                                     {KeyStr("f", 11U, kTypeValue), "val"}});
+  AddMockFile(file1_1, 1);
+  auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"},
+                                     {KeyStr("k", 13U, kTypeValue), "val"}});
+  AddMockFile(file1_2, 1);
+  auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"},
+                                     {KeyStr("u", 15U, kTypeValue), "val"}});
+  AddMockFile(file1_3, 1);
+
+  auto file2_1 = mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"},
+                                     {KeyStr("h", 9U, kTypeValue), "val"}});
+  AddMockFile(file2_1, 2);
+  auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"},
+                                     {KeyStr("p", 7U, kTypeValue), "val"}});
+  AddMockFile(file2_2, 2);
+
+  auto file3_1 = mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"},
+                                     {KeyStr("k", 3U, kTypeValue), "val"}});
+  AddMockFile(file3_1, 3);
+  auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"},
+                                     {KeyStr("x", 5U, kTypeValue), "val"}});
+  AddMockFile(file3_2, 3);
+
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  const std::vector<int> input_levels = {0, 1, 2, 3};
+  auto files0 = cfd->current()->storage_info()->LevelFiles(input_levels[0]);
+  auto files1 = cfd->current()->storage_info()->LevelFiles(input_levels[1]);
+  auto files2 = cfd->current()->storage_info()->LevelFiles(input_levels[2]);
+  auto files3 = cfd->current()->storage_info()->LevelFiles(input_levels[3]);
+
+  RunLastLevelCompaction(
+      {files0, files1, files2, files3}, input_levels,
+      /*verify_func=*/[&](Compaction& comp) {
+        for (char c = 'a'; c <= 'z'; c++) {
+          std::string c_str;
+          c_str = c;
+          const Slice key(c_str);
+          if (c == 'a') {
+            ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key));
+          } else {
+            ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key));
+          }
+        }
+      });
+}
+
+TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) {
+  db_options_.enforce_single_del_contracts = false;
+  NewDB();
+
+  auto file =
+      mock::MakeMockFile({{KeyStr("a", 4U, kTypeSingleDeletion), ""},
+                          {KeyStr("a", 3U, kTypeDeletion), "dontcare"}});
+  AddMockFile(file);
+  SetLastSequence(4U);
+
+  auto expected_results = mock::MakeMockFile();
+  constexpr int input_level = 0;
+  auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, InputSerialization) {
+  // Setup a random CompactionServiceInput
+  CompactionServiceInput input;
+  const int kStrMaxLen = 1000;
+  Random rnd(static_cast<uint32_t>(time(nullptr)));
+  Random64 rnd64(time(nullptr));
+  input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen));
+  input.column_family.options.comparator = ReverseBytewiseComparator();
+  input.column_family.options.max_bytes_for_level_base =
+      rnd64.Uniform(UINT64_MAX);
+  input.column_family.options.disable_auto_compactions = rnd.OneIn(2);
+  input.column_family.options.compression = kZSTD;
+  input.column_family.options.compression_opts.level = 4;
+  input.db_options.max_background_flushes = 10;
+  input.db_options.paranoid_checks = rnd.OneIn(2);
+  input.db_options.statistics = CreateDBStatistics();
+  input.db_options.env = env_;
+  while (!rnd.OneIn(10)) {
+    input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX));
+  }
+  while (!rnd.OneIn(10)) {
+    input.input_files.emplace_back(rnd.RandomString(
+        rnd.Uniform(kStrMaxLen - 1) +
+        1));  // input file name should have at least one character
+  }
+  input.output_level = 4;
+  input.has_begin = rnd.OneIn(2);
+  if (input.has_begin) {
+    input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
+  }
+  input.has_end = rnd.OneIn(2);
+  if (input.has_end) {
+    input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
+  }
+
+  std::string output;
+  ASSERT_OK(input.Write(&output));
+
+  // Test deserialization
+  CompactionServiceInput deserialized1;
+  ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1));
+  ASSERT_TRUE(deserialized1.TEST_Equals(&input));
+
+  // Test mismatch
+  deserialized1.db_options.max_background_flushes += 10;
+  std::string mismatch;
+  ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch));
+  ASSERT_EQ(mismatch, "db_options.max_background_flushes");
+
+  // Test unknown field
+  CompactionServiceInput deserialized2;
+  output.clear();
+  ASSERT_OK(input.Write(&output));
+  output.append("new_field=123;");
+
+  ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2));
+  ASSERT_TRUE(deserialized2.TEST_Equals(&input));
+
+  // Test missing field
+  CompactionServiceInput deserialized3;
+  deserialized3.output_level = 0;
+  std::string to_remove = "output_level=4;";
+  size_t pos = output.find(to_remove);
+  ASSERT_TRUE(pos != std::string::npos);
+  output.erase(pos, to_remove.length());
+  ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3));
+  mismatch.clear();
+  ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch));
+  ASSERT_EQ(mismatch, "output_level");
+
+  // manually set the value back, should match the original structure
+  deserialized3.output_level = 4;
+  ASSERT_TRUE(deserialized3.TEST_Equals(&input));
+
+  // Test invalid version
+  output.clear();
+  ASSERT_OK(input.Write(&output));
+
+  uint32_t data_version = DecodeFixed32(output.data());
+  const size_t kDataVersionSize = sizeof(data_version);
+  ASSERT_EQ(data_version,
+            1U);  // Update once the default data version is changed
+  char buf[kDataVersionSize];
+  EncodeFixed32(buf, data_version + 10);  // make sure it's not valid
+  output.replace(0, kDataVersionSize, buf, kDataVersionSize);
+  Status s = CompactionServiceInput::Read(output, &deserialized3);
+  ASSERT_TRUE(s.IsNotSupported());
+}
+
+TEST_F(CompactionJobTest, ResultSerialization) {
+  // Setup a random CompactionServiceResult
+  CompactionServiceResult result;
+  const int kStrMaxLen = 1000;
+  Random rnd(static_cast<uint32_t>(time(nullptr)));
+  Random64 rnd64(time(nullptr));
+  std::vector<Status> status_list = {
+      Status::OK(),
+      Status::InvalidArgument("invalid option"),
+      Status::Aborted("failed to run"),
+      Status::NotSupported("not supported option"),
+  };
+  result.status =
+      status_list.at(rnd.Uniform(static_cast<int>(status_list.size())));
+  while (!rnd.OneIn(10)) {
+    UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
+    result.output_files.emplace_back(
+        rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX),
+        rnd64.Uniform(UINT64_MAX),
+        rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
+        rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
+        rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX),
+        rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
+  }
+  result.output_level = rnd.Uniform(10);
+  result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
+  result.num_output_records = rnd64.Uniform(UINT64_MAX);
+  result.total_bytes = rnd64.Uniform(UINT64_MAX);
+  result.bytes_read = 123;
+  result.bytes_written = rnd64.Uniform(UINT64_MAX);
+  result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX);
+  result.stats.num_output_files = rnd.Uniform(1000);
+  result.stats.is_full_compaction = rnd.OneIn(2);
+  result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX);
+  result.stats.num_input_files = 9;
+
+  std::string output;
+  ASSERT_OK(result.Write(&output));
+
+  // Test deserialization
+  CompactionServiceResult deserialized1;
+  ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1));
+  ASSERT_TRUE(deserialized1.TEST_Equals(&result));
+
+  // Test mismatch
+  deserialized1.stats.num_input_files += 10;
+  std::string mismatch;
+  ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch));
+  ASSERT_EQ(mismatch, "stats.num_input_files");
+
+  // Test unique id mismatch
+  if (!result.output_files.empty()) {
+    CompactionServiceResult deserialized_tmp;
+    ASSERT_OK(CompactionServiceResult::Read(output, &deserialized_tmp));
+    deserialized_tmp.output_files[0].unique_id[0] += 1;
+    ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch));
+    ASSERT_EQ(mismatch, "output_files.unique_id");
+    deserialized_tmp.status.PermitUncheckedError();
+  }
+
+  // Test unknown field
+  CompactionServiceResult deserialized2;
+  output.clear();
+  ASSERT_OK(result.Write(&output));
+  output.append("new_field=123;");
+
+  ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2));
+  ASSERT_TRUE(deserialized2.TEST_Equals(&result));
+
+  // Test missing field
+  CompactionServiceResult deserialized3;
+  deserialized3.bytes_read = 0;
+  std::string to_remove = "bytes_read=123;";
+  size_t pos = output.find(to_remove);
+  ASSERT_TRUE(pos != std::string::npos);
+  output.erase(pos, to_remove.length());
+  ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3));
+  mismatch.clear();
+  ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch));
+  ASSERT_EQ(mismatch, "bytes_read");
+
+  deserialized3.bytes_read = 123;
+  ASSERT_TRUE(deserialized3.TEST_Equals(&result));
+
+  // Test invalid version
+  output.clear();
+  ASSERT_OK(result.Write(&output));
+
+  uint32_t data_version = DecodeFixed32(output.data());
+  const size_t kDataVersionSize = sizeof(data_version);
+  ASSERT_EQ(data_version,
+            1U);  // Update once the default data version is changed
+  char buf[kDataVersionSize];
+  EncodeFixed32(buf, data_version + 10);  // make sure it's not valid
+  output.replace(0, kDataVersionSize, buf, kDataVersionSize);
+  Status s = CompactionServiceResult::Read(output, &deserialized3);
+  ASSERT_TRUE(s.IsNotSupported());
+  for (const auto& item : status_list) {
+    item.PermitUncheckedError();
+  }
+}
+
+class CompactionJobDynamicFileSizeTest
+    : public CompactionJobTestBase,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  CompactionJobDynamicFileSizeTest()
+      : CompactionJobTestBase(
+            test::PerThreadDBPath("compaction_job_dynamic_file_size_test"),
+            BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
+            /*test_io_priority=*/false, TableTypeForTest::kMockTable) {}
+};
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) {
+  // dynamic_file_size option should have no impact on cutting for max
+  // compaction bytes.
+  bool enable_dyanmic_file_size = GetParam();
+  cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size;
+
+  NewDB();
+  mutable_cf_options_.target_file_size_base = 80;
+  mutable_cf_options_.max_compaction_bytes = 21;
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("c", 5U, kTypeValue), "val2"},
+      {KeyStr("n", 6U, kTypeValue), "val3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("h", 3U, kTypeValue), "val"},
+                                   {KeyStr("j", 4U, kTypeValue), "val"}});
+  AddMockFile(file2, 1);
+
+  // Create three L2 files, each size 10.
+  // max_compaction_bytes 21 means the compaction output in L1 will
+  // be cut to at least two files.
+  auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"},
+                                   {KeyStr("c", 1U, kTypeValue), "val"},
+                                   {KeyStr("c1", 1U, kTypeValue), "val"},
+                                   {KeyStr("c2", 1U, kTypeValue), "val"},
+                                   {KeyStr("c3", 1U, kTypeValue), "val"},
+                                   {KeyStr("c4", 1U, kTypeValue), "val"},
+                                   {KeyStr("d", 1U, kTypeValue), "val"},
+                                   {KeyStr("e", 2U, kTypeValue), "val"}});
+  AddMockFile(file3, 2);
+
+  auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+                                   {KeyStr("i", 1U, kTypeValue), "val"},
+                                   {KeyStr("i1", 1U, kTypeValue), "val"},
+                                   {KeyStr("i2", 1U, kTypeValue), "val"},
+                                   {KeyStr("i3", 1U, kTypeValue), "val"},
+                                   {KeyStr("i4", 1U, kTypeValue), "val"},
+                                   {KeyStr("j", 1U, kTypeValue), "val"},
+                                   {KeyStr("k", 2U, kTypeValue), "val"}});
+  AddMockFile(file4, 2);
+
+  auto file5 = mock::MakeMockFile({{KeyStr("l", 1U, kTypeValue), "val"},
+                                   {KeyStr("m", 1U, kTypeValue), "val"},
+                                   {KeyStr("m1", 1U, kTypeValue), "val"},
+                                   {KeyStr("m2", 1U, kTypeValue), "val"},
+                                   {KeyStr("m3", 1U, kTypeValue), "val"},
+                                   {KeyStr("m4", 1U, kTypeValue), "val"},
+                                   {KeyStr("n", 1U, kTypeValue), "val"},
+                                   {KeyStr("o", 2U, kTypeValue), "val"}});
+  AddMockFile(file5, 2);
+
+  // The expected output should be:
+  //  L1:   [c,   h,  j]        [n]
+  //  L2: [b ... e] [h ... k] [l ... o]
+  // It's better to have "j" in the first file, because anyway it's overlapping
+  // with the second file on L2.
+  // (Note: before this PR, it was cut at "h" because it's using the internal
+  // comparator which think L1 "h" with seqno 3 is smaller than L2 "h" with
+  // seqno 1, but actually they're overlapped with the compaction picker).
+
+  auto expected_file1 =
+      mock::MakeMockFile({{KeyStr("c", 5U, kTypeValue), "val2"},
+                          {KeyStr("h", 3U, kTypeValue), "val"},
+                          {KeyStr("j", 4U, kTypeValue), "val"}});
+  auto expected_file2 =
+      mock::MakeMockFile({{KeyStr("n", 6U, kTypeValue), "val3"}});
+
+  SetLastSequence(6U);
+
+  const std::vector<int> input_levels = {0, 1};
+  auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+  auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+  RunCompaction({lvl0_files, lvl1_files}, input_levels,
+                {expected_file1, expected_file2});
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToSkipGrandparentFile) {
+  bool enable_dyanmic_file_size = GetParam();
+  cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size;
+
+  NewDB();
+  // Make sure the grandparent level file size (10) qualifies skipping.
+  // Currently, it has to be > 1/8 of target file size.
+  mutable_cf_options_.target_file_size_base = 70;
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeValue), "val2"},
+      {KeyStr("z", 6U, kTypeValue), "val3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("c", 3U, kTypeValue), "val"},
+                                   {KeyStr("x", 4U, kTypeValue), "val"}});
+  AddMockFile(file2, 1);
+
+  auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"},
+                                   {KeyStr("d", 2U, kTypeValue), "val"}});
+  AddMockFile(file3, 2);
+
+  auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+                                   {KeyStr("i", 2U, kTypeValue), "val"}});
+  AddMockFile(file4, 2);
+
+  auto file5 = mock::MakeMockFile({{KeyStr("v", 1U, kTypeValue), "val"},
+                                   {KeyStr("y", 2U, kTypeValue), "val"}});
+  AddMockFile(file5, 2);
+
+  auto expected_file1 =
+      mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+                          {KeyStr("c", 3U, kTypeValue), "val"}});
+  auto expected_file2 =
+      mock::MakeMockFile({{KeyStr("x", 4U, kTypeValue), "val"},
+                          {KeyStr("z", 6U, kTypeValue), "val3"}});
+
+  auto expected_file_disable_dynamic_file_size =
+      mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+                          {KeyStr("c", 3U, kTypeValue), "val"},
+                          {KeyStr("x", 4U, kTypeValue), "val"},
+                          {KeyStr("z", 6U, kTypeValue), "val3"}});
+
+  SetLastSequence(6U);
+  const std::vector<int> input_levels = {0, 1};
+  auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+  auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+  if (enable_dyanmic_file_size) {
+    RunCompaction({lvl0_files, lvl1_files}, input_levels,
+                  {expected_file1, expected_file2});
+  } else {
+    RunCompaction({lvl0_files, lvl1_files}, input_levels,
+                  {expected_file_disable_dynamic_file_size});
+  }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) {
+  bool enable_dyanmic_file_size = GetParam();
+  cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size;
+  NewDB();
+
+  // MockTable has 1 byte per entry by default and each file is 10 bytes.
+  // When the file size is smaller than 100, it won't cut file earlier to align
+  // with its grandparent boundary.
+  const size_t kKeyValueSize = 10000;
+  mock_table_factory_->SetKeyValueSize(kKeyValueSize);
+
+  mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize;
+
+  mock::KVVector file1;
+  char ch = 'd';
+  // Add value from d -> o
+  for (char i = 0; i < 12; i++) {
+    file1.emplace_back(KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+                       "val" + std::to_string(i));
+  }
+
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("e", 3U, kTypeValue), "val"},
+                                   {KeyStr("s", 4U, kTypeValue), "val"}});
+  AddMockFile(file2, 1);
+
+  // the 1st grandparent file should be skipped
+  auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+                                   {KeyStr("b", 2U, kTypeValue), "val"}});
+  AddMockFile(file3, 2);
+
+  auto file4 = mock::MakeMockFile({{KeyStr("c", 1U, kTypeValue), "val"},
+                                   {KeyStr("e", 2U, kTypeValue), "val"}});
+  AddMockFile(file4, 2);
+
+  auto file5 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+                                   {KeyStr("j", 2U, kTypeValue), "val"}});
+  AddMockFile(file5, 2);
+
+  auto file6 = mock::MakeMockFile({{KeyStr("k", 1U, kTypeValue), "val"},
+                                   {KeyStr("n", 2U, kTypeValue), "val"}});
+  AddMockFile(file6, 2);
+
+  auto file7 = mock::MakeMockFile({{KeyStr("q", 1U, kTypeValue), "val"},
+                                   {KeyStr("t", 2U, kTypeValue), "val"}});
+  AddMockFile(file7, 2);
+
+  // The expected outputs are:
+  //  L1:         [d,e,f,g,h,i,j] [k,l,m,n,o,s]
+  //  L2: [a, b] [c,  e]   [h, j] [k, n]  [q, t]
+  // The first output cut earlier at "j", so it could be aligned with L2 files.
+  // If dynamic_file_size is not enabled, it will be cut based on the
+  // target_file_size
+  mock::KVVector expected_file1;
+  for (char i = 0; i < 7; i++) {
+    expected_file1.emplace_back(
+        KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+        "val" + std::to_string(i));
+  }
+
+  mock::KVVector expected_file2;
+  for (char i = 7; i < 12; i++) {
+    expected_file2.emplace_back(
+        KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+        "val" + std::to_string(i));
+  }
+  expected_file2.emplace_back(KeyStr("s", 4U, kTypeValue), "val");
+
+  mock::KVVector expected_file_disable_dynamic_file_size1;
+  for (char i = 0; i < 10; i++) {
+    expected_file_disable_dynamic_file_size1.emplace_back(
+        KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+        "val" + std::to_string(i));
+  }
+
+  mock::KVVector expected_file_disable_dynamic_file_size2;
+  for (char i = 10; i < 12; i++) {
+    expected_file_disable_dynamic_file_size2.emplace_back(
+        KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+        "val" + std::to_string(i));
+  }
+
+  expected_file_disable_dynamic_file_size2.emplace_back(
+      KeyStr("s", 4U, kTypeValue), "val");
+
+  SetLastSequence(22U);
+  const std::vector<int> input_levels = {0, 1};
+  auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+  auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+  if (enable_dyanmic_file_size) {
+    RunCompaction({lvl0_files, lvl1_files}, input_levels,
+                  {expected_file1, expected_file2});
+  } else {
+    RunCompaction({lvl0_files, lvl1_files}, input_levels,
+                  {expected_file_disable_dynamic_file_size1,
+                   expected_file_disable_dynamic_file_size2});
+  }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) {
+  bool enable_dyanmic_file_size = GetParam();
+  cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size;
+  NewDB();
+
+  // MockTable has 1 byte per entry by default and each file is 10 bytes.
+  // When the file size is smaller than 100, it won't cut file earlier to align
+  // with its grandparent boundary.
+  const size_t kKeyValueSize = 10000;
+  mock_table_factory_->SetKeyValueSize(kKeyValueSize);
+
+  mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize;
+
+  mock::KVVector file1;
+  for (int i = 0; i < 7; i++) {
+    file1.emplace_back(KeyStr("a", 100 - i, kTypeValue),
+                       "val" + std::to_string(100 - i));
+  }
+  file1.emplace_back(KeyStr("b", 90, kTypeValue), "valb");
+
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 93U, kTypeValue), "val93"},
+                                   {KeyStr("b", 90U, kTypeValue), "valb"}});
+  AddMockFile(file2, 1);
+
+  auto file3 = mock::MakeMockFile({{KeyStr("a", 89U, kTypeValue), "val"},
+                                   {KeyStr("a", 88U, kTypeValue), "val"}});
+  AddMockFile(file3, 2);
+
+  auto file4 = mock::MakeMockFile({{KeyStr("a", 87U, kTypeValue), "val"},
+                                   {KeyStr("a", 86U, kTypeValue), "val"}});
+  AddMockFile(file4, 2);
+
+  auto file5 = mock::MakeMockFile({{KeyStr("b", 85U, kTypeValue), "val"},
+                                   {KeyStr("b", 84U, kTypeValue), "val"}});
+  AddMockFile(file5, 2);
+
+  mock::KVVector expected_file1;
+  mock::KVVector expected_file_disable_dynamic_file_size;
+
+  for (int i = 0; i < 8; i++) {
+    expected_file1.emplace_back(KeyStr("a", 100 - i, kTypeValue),
+                                "val" + std::to_string(100 - i));
+    expected_file_disable_dynamic_file_size.emplace_back(
+        KeyStr("a", 100 - i, kTypeValue), "val" + std::to_string(100 - i));
+  }
+
+  // make sure `b` is cut in a separated file (so internally it's not using
+  // internal comparator, which will think the "b:90" (seqno 90) here is smaller
+  // than "b:85" on L2.)
+  auto expected_file2 =
+      mock::MakeMockFile({{KeyStr("b", 90U, kTypeValue), "valb"}});
+
+  expected_file_disable_dynamic_file_size.emplace_back(
+      KeyStr("b", 90U, kTypeValue), "valb");
+
+  SetLastSequence(122U);
+  const std::vector<int> input_levels = {0, 1};
+  auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+  auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+  // Just keep all the history
+  std::vector<SequenceNumber> snapshots;
+  for (int i = 80; i <= 100; i++) {
+    snapshots.emplace_back(i);
+  }
+  if (enable_dyanmic_file_size) {
+    RunCompaction({lvl0_files, lvl1_files}, input_levels,
+                  {expected_file1, expected_file2}, snapshots);
+  } else {
+    RunCompaction({lvl0_files, lvl1_files}, input_levels,
+                  {expected_file_disable_dynamic_file_size}, snapshots);
+  }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytesSameKey) {
+  // dynamic_file_size option should have no impact on cutting for max
+  // compaction bytes.
+  bool enable_dyanmic_file_size = GetParam();
+  cf_options_.level_compaction_dynamic_file_size = enable_dyanmic_file_size;
+
+  NewDB();
+  mutable_cf_options_.target_file_size_base = 80;
+  mutable_cf_options_.max_compaction_bytes = 20;
+
+  auto file1 = mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"},
+                                   {KeyStr("b", 103U, kTypeValue), "val"}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 102U, kTypeValue), "val2"},
+                                   {KeyStr("c", 101U, kTypeValue), "val"}});
+  AddMockFile(file2, 1);
+
+  for (int i = 0; i < 10; i++) {
+    auto file =
+        mock::MakeMockFile({{KeyStr("a", 100 - (i * 2), kTypeValue), "val"},
+                            {KeyStr("a", 99 - (i * 2), kTypeValue), "val"}});
+    AddMockFile(file, 2);
+  }
+
+  for (int i = 0; i < 10; i++) {
+    auto file =
+        mock::MakeMockFile({{KeyStr("b", 80 - (i * 2), kTypeValue), "val"},
+                            {KeyStr("b", 79 - (i * 2), kTypeValue), "val"}});
+    AddMockFile(file, 2);
+  }
+
+  auto file5 = mock::MakeMockFile({{KeyStr("c", 60U, kTypeValue), "valc"},
+                                   {KeyStr("c", 59U, kTypeValue), "valc"}});
+
+  // "a" has 10 overlapped grandparent files (each size 10), which is far
+  // exceeded the `max_compaction_bytes`, but make sure 2 "a" are not separated,
+  // as splitting them won't help reducing the compaction size.
+  // also make sure "b" and "c" are cut separately.
+  mock::KVVector expected_file1 =
+      mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"},
+                          {KeyStr("a", 102U, kTypeValue), "val2"}});
+  mock::KVVector expected_file2 =
+      mock::MakeMockFile({{KeyStr("b", 103U, kTypeValue), "val"}});
+  mock::KVVector expected_file3 =
+      mock::MakeMockFile({{KeyStr("c", 101U, kTypeValue), "val"}});
+
+  SetLastSequence(122U);
+  const std::vector<int> input_levels = {0, 1};
+  auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+  auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+  // Just keep all the history
+  std::vector<SequenceNumber> snapshots;
+  for (int i = 80; i <= 105; i++) {
+    snapshots.emplace_back(i);
+  }
+  RunCompaction({lvl0_files, lvl1_files}, input_levels,
+                {expected_file1, expected_file2, expected_file3}, snapshots);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobDynamicFileSizeTest,
+                        CompactionJobDynamicFileSizeTest, testing::Bool());
+
+class CompactionJobTimestampTest : public CompactionJobTestBase {
+ public:
+  CompactionJobTimestampTest()
+      : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"),
+                              test::BytewiseComparatorWithU64TsWrapper(),
+                              test::EncodeInt, /*test_io_priority=*/false,
+                              TableTypeForTest::kMockTable) {}
+};
+
+TEST_F(CompactionJobTimestampTest, GCDisabled) {
+  NewDB();
+
+  auto file1 =
+      mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"},
+                          {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"},
+                          {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"},
+                          {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}});
+
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile(
+      {{KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""},
+       {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+       {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"},
+       {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}});
+  AddMockFile(file2);
+
+  SetLastSequence(10);
+
+  auto expected_results = mock::MakeMockFile(
+      {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"},
+       {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"},
+       {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"},
+       {KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""},
+       {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+       {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"},
+       {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"},
+       {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}});
+  constexpr int input_level = 0;
+  const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, NoKeyExpired) {
+  NewDB();
+
+  auto file1 =
+      mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"},
+                          {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"},
+                          {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"},
+                          {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}});
+  AddMockFile(file2);
+
+  SetLastSequence(101);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"},
+                          {KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"},
+                          {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"},
+                          {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"},
+                          {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}});
+  constexpr int input_level = 0;
+  const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+  full_history_ts_low_ = encode_u64_ts_(0);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, AllKeysExpired) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile(
+      {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""},
+       {KeyStr("b", 6, ValueType::kTypeSingleDeletion, 99), ""},
+       {KeyStr("c", 7, ValueType::kTypeValue, 98), "c7"}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile(
+      {{KeyStr("a", 4, ValueType::kTypeValue, 97), "a4"},
+       {KeyStr("b", 3, ValueType::kTypeValue, 96), "b3"},
+       {KeyStr("c", 2, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+       {KeyStr("c", 1, ValueType::kTypeValue, 94), "c1"}});
+  AddMockFile(file2);
+
+  SetLastSequence(7);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("c", 0, ValueType::kTypeValue, 0), "c7"}});
+  constexpr int input_level = 0;
+  const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+  full_history_ts_low_ = encode_u64_ts_(std::numeric_limits<uint64_t>::max());
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, SomeKeysExpired) {
+  NewDB();
+
+  auto file1 =
+      mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"},
+                          {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile(
+      {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"},
+       {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"},
+       {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}});
+  AddMockFile(file2);
+
+  SetLastSequence(6);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"},
+                          {KeyStr("a", 0, ValueType::kTypeValue, 0), "a3"},
+                          {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}});
+  constexpr int input_level = 0;
+  const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+  full_history_ts_low_ = encode_u64_ts_(49);
+  RunCompaction({files}, {input_level}, {expected_results});
+}
+
+class CompactionJobTimestampTestWithBbTable : public CompactionJobTestBase {
+ public:
+  // Block-based table is needed if we want to test subcompaction partitioning
+  // with anchors.
+  explicit CompactionJobTimestampTestWithBbTable()
+      : CompactionJobTestBase(
+            test::PerThreadDBPath("compaction_job_ts_bbt_test"),
+            test::BytewiseComparatorWithU64TsWrapper(), test::EncodeInt,
+            /*test_io_priority=*/false, TableTypeForTest::kBlockBasedTable) {}
+};
+
+TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionAnchorL1) {
+  cf_options_.target_file_size_base = 20;
+  mutable_cf_options_.target_file_size_base = 20;
+  NewDB();
+
+  const std::vector<std::string> keys = {
+      KeyStr("a", 20, ValueType::kTypeValue, 200),
+      KeyStr("b", 21, ValueType::kTypeValue, 210),
+      KeyStr("b", 20, ValueType::kTypeValue, 200),
+      KeyStr("b", 18, ValueType::kTypeValue, 180),
+      KeyStr("c", 17, ValueType::kTypeValue, 170),
+      KeyStr("c", 16, ValueType::kTypeValue, 160),
+      KeyStr("c", 15, ValueType::kTypeValue, 150)};
+  const std::vector<std::string> values = {"a20", "b21", "b20", "b18",
+                                           "c17", "c16", "c15"};
+
+  constexpr int input_level = 1;
+
+  auto file1 = mock::MakeMockFile(
+      {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}});
+  AddMockFile(file1, input_level);
+
+  auto file2 = mock::MakeMockFile(
+      {{keys[3], values[3]}, {keys[4], values[4]}, {keys[5], values[5]}});
+  AddMockFile(file2, input_level);
+
+  auto file3 = mock::MakeMockFile({{keys[6], values[6]}});
+  AddMockFile(file3, input_level);
+
+  SetLastSequence(20);
+
+  auto output1 = mock::MakeMockFile({{keys[0], values[0]}});
+  auto output2 = mock::MakeMockFile(
+      {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}});
+  auto output3 = mock::MakeMockFile(
+      {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}});
+
+  auto expected_results =
+      std::vector<mock::KVVector>{output1, output2, output3};
+  const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+  constexpr int output_level = 2;
+  constexpr int max_subcompactions = 4;
+  RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{},
+                /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+                output_level, /*verify=*/true, {kInvalidBlobFileNumber},
+                /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL,
+                max_subcompactions);
+}
+
+TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionL0) {
+  cf_options_.target_file_size_base = 20;
+  mutable_cf_options_.target_file_size_base = 20;
+  NewDB();
+
+  const std::vector<std::string> keys = {
+      KeyStr("a", 20, ValueType::kTypeValue, 200),
+      KeyStr("b", 20, ValueType::kTypeValue, 200),
+      KeyStr("b", 19, ValueType::kTypeValue, 190),
+      KeyStr("b", 18, ValueType::kTypeValue, 180),
+      KeyStr("c", 17, ValueType::kTypeValue, 170),
+      KeyStr("c", 16, ValueType::kTypeValue, 160),
+      KeyStr("c", 15, ValueType::kTypeValue, 150)};
+  const std::vector<std::string> values = {"a20", "b20", "b19", "b18",
+                                           "c17", "c16", "c15"};
+
+  constexpr int input_level = 0;
+
+  auto file1 = mock::MakeMockFile({{keys[5], values[5]}, {keys[6], values[6]}});
+  AddMockFile(file1, input_level);
+
+  auto file2 = mock::MakeMockFile({{keys[3], values[3]}, {keys[4], values[4]}});
+  AddMockFile(file2, input_level);
+
+  auto file3 = mock::MakeMockFile(
+      {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}});
+  AddMockFile(file3, input_level);
+
+  SetLastSequence(20);
+
+  auto output1 = mock::MakeMockFile({{keys[0], values[0]}});
+  auto output2 = mock::MakeMockFile(
+      {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}});
+  auto output3 = mock::MakeMockFile(
+      {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}});
+
+  auto expected_results =
+      std::vector<mock::KVVector>{output1, output2, output3};
+  const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+  constexpr int output_level = 1;
+  constexpr int max_subcompactions = 4;
+  RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{},
+                /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+                output_level, /*verify=*/true, {kInvalidBlobFileNumber},
+                /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL,
+                max_subcompactions);
+}
+
+// The io priority of the compaction reads and writes are different from
+// other DB reads and writes. To prepare the compaction input files, use the
+// default filesystem from Env. To test the io priority of the compaction
+// reads and writes, db_options_.fs is set as MockTestFileSystem.
+class CompactionJobIOPriorityTest : public CompactionJobTestBase {
+ public:
+  CompactionJobIOPriorityTest()
+      : CompactionJobTestBase(
+            test::PerThreadDBPath("compaction_job_io_priority_test"),
+            BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
+            /*test_io_priority=*/true, TableTypeForTest::kBlockBasedTable) {}
+};
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateNormal) {
+  // When the state from WriteController is normal.
+  NewDB();
+  mock::KVVector expected_results = CreateTwoFiles(false);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  constexpr int input_level = 0;
+  auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+  ASSERT_EQ(2U, files.size());
+  RunCompaction({files}, {input_level}, {expected_results}, {},
+                kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+                Env::IO_LOW, Env::IO_LOW);
+}
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateDelayed) {
+  // When the state from WriteController is Delayed.
+  NewDB();
+  mock::KVVector expected_results = CreateTwoFiles(false);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  constexpr int input_level = 0;
+  auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+  ASSERT_EQ(2U, files.size());
+  {
+    std::unique_ptr<WriteControllerToken> delay_token =
+        write_controller_.GetDelayToken(1000000);
+    RunCompaction({files}, {input_level}, {expected_results}, {},
+                  kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+                  Env::IO_USER, Env::IO_USER);
+  }
+}
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateStalled) {
+  // When the state from WriteController is Stalled.
+  NewDB();
+  mock::KVVector expected_results = CreateTwoFiles(false);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  constexpr int input_level = 0;
+  auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+  ASSERT_EQ(2U, files.size());
+  {
+    std::unique_ptr<WriteControllerToken> stop_token =
+        write_controller_.GetStopToken();
+    RunCompaction({files}, {input_level}, {expected_results}, {},
+                  kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+                  Env::IO_USER, Env::IO_USER);
+  }
+}
+
+TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) {
+  NewDB();
+  mock::KVVector expected_results = CreateTwoFiles(false);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  constexpr int input_level = 0;
+  auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+  ASSERT_EQ(2U, files.size());
+  RunCompaction({files}, {input_level}, {expected_results}, {},
+                kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, true,
+                Env::IO_LOW, Env::IO_LOW);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_outputs.cc b/src/rocksdb/db/compaction/compaction_outputs.cc
new file mode 100644
index 000000000..e74378e2a
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_outputs.cc
@@ -0,0 +1,646 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_outputs.h"
+
+#include "db/builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) {
+  builder_.reset(NewTableBuilder(tboptions, file_writer_.get()));
+}
+
+Status CompactionOutputs::Finish(const Status& intput_status,
+                                 const SeqnoToTimeMapping& seqno_time_mapping) {
+  FileMetaData* meta = GetMetaData();
+  assert(meta != nullptr);
+  Status s = intput_status;
+  if (s.ok()) {
+    std::string seqno_time_mapping_str;
+    seqno_time_mapping.Encode(seqno_time_mapping_str, meta->fd.smallest_seqno,
+                              meta->fd.largest_seqno, meta->file_creation_time);
+    builder_->SetSeqnoTimeTableProperties(seqno_time_mapping_str,
+                                          meta->oldest_ancester_time);
+    s = builder_->Finish();
+
+  } else {
+    builder_->Abandon();
+  }
+  Status io_s = builder_->io_status();
+  if (s.ok()) {
+    s = io_s;
+  } else {
+    io_s.PermitUncheckedError();
+  }
+  const uint64_t current_bytes = builder_->FileSize();
+  if (s.ok()) {
+    meta->fd.file_size = current_bytes;
+    meta->marked_for_compaction = builder_->NeedCompact();
+  }
+  current_output().finished = true;
+  stats_.bytes_written += current_bytes;
+  stats_.num_output_files = outputs_.size();
+
+  return s;
+}
+
+IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
+                                            SystemClock* clock,
+                                            Statistics* statistics,
+                                            bool use_fsync) {
+  IOStatus io_s;
+  if (input_status.ok()) {
+    StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS);
+    io_s = file_writer_->Sync(use_fsync);
+  }
+  if (input_status.ok() && io_s.ok()) {
+    io_s = file_writer_->Close();
+  }
+
+  if (input_status.ok() && io_s.ok()) {
+    FileMetaData* meta = GetMetaData();
+    meta->file_checksum = file_writer_->GetFileChecksum();
+    meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName();
+  }
+
+  file_writer_.reset();
+
+  return io_s;
+}
+
+size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
+    const Slice& internal_key) {
+  size_t curr_key_boundary_switched_num = 0;
+  const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+
+  if (grandparents.empty()) {
+    return curr_key_boundary_switched_num;
+  }
+  assert(!internal_key.empty());
+  InternalKey ikey;
+  ikey.DecodeFrom(internal_key);
+  assert(ikey.Valid());
+
+  const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+
+  // Move the grandparent_index_ to the file containing the current user_key.
+  // If there are multiple files containing the same user_key, make sure the
+  // index points to the last file containing the key.
+  while (grandparent_index_ < grandparents.size()) {
+    if (being_grandparent_gap_) {
+      if (sstableKeyCompare(ucmp, ikey,
+                            grandparents[grandparent_index_]->smallest) < 0) {
+        break;
+      }
+      if (seen_key_) {
+        curr_key_boundary_switched_num++;
+        grandparent_overlapped_bytes_ +=
+            grandparents[grandparent_index_]->fd.GetFileSize();
+        grandparent_boundary_switched_num_++;
+      }
+      being_grandparent_gap_ = false;
+    } else {
+      int cmp_result = sstableKeyCompare(
+          ucmp, ikey, grandparents[grandparent_index_]->largest);
+      // If it's same key, make sure grandparent_index_ is pointing to the last
+      // one.
+      if (cmp_result < 0 ||
+          (cmp_result == 0 &&
+           (grandparent_index_ == grandparents.size() - 1 ||
+            sstableKeyCompare(ucmp, ikey,
+                              grandparents[grandparent_index_ + 1]->smallest) <
+                0))) {
+        break;
+      }
+      if (seen_key_) {
+        curr_key_boundary_switched_num++;
+        grandparent_boundary_switched_num_++;
+      }
+      being_grandparent_gap_ = true;
+      grandparent_index_++;
+    }
+  }
+
+  // If the first key is in the middle of a grandparent file, adding it to the
+  // overlap
+  if (!seen_key_ && !being_grandparent_gap_) {
+    assert(grandparent_overlapped_bytes_ == 0);
+    grandparent_overlapped_bytes_ =
+        GetCurrentKeyGrandparentOverlappedBytes(internal_key);
+  }
+
+  seen_key_ = true;
+  return curr_key_boundary_switched_num;
+}
+
+uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
+    const Slice& internal_key) const {
+  // no overlap with any grandparent file
+  if (being_grandparent_gap_) {
+    return 0;
+  }
+  uint64_t overlapped_bytes = 0;
+
+  const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+  const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+  InternalKey ikey;
+  ikey.DecodeFrom(internal_key);
+#ifndef NDEBUG
+  // make sure the grandparent_index_ is pointing to the last files containing
+  // the current key.
+  int cmp_result =
+      sstableKeyCompare(ucmp, ikey, grandparents[grandparent_index_]->largest);
+  assert(
+      cmp_result < 0 ||
+      (cmp_result == 0 &&
+       (grandparent_index_ == grandparents.size() - 1 ||
+        sstableKeyCompare(
+            ucmp, ikey, grandparents[grandparent_index_ + 1]->smallest) < 0)));
+  assert(sstableKeyCompare(ucmp, ikey,
+                           grandparents[grandparent_index_]->smallest) >= 0);
+#endif
+  overlapped_bytes += grandparents[grandparent_index_]->fd.GetFileSize();
+
+  // go backwards to find all overlapped files, one key can overlap multiple
+  // files. In the following example, if the current output key is `c`, and one
+  // compaction file was cut before `c`, current `c` can overlap with 3 files:
+  //  [a b]               [c...
+  // [b, b] [c, c] [c, c] [c, d]
+  for (int64_t i = static_cast<int64_t>(grandparent_index_) - 1;
+       i >= 0 && sstableKeyCompare(ucmp, ikey, grandparents[i]->largest) == 0;
+       i--) {
+    overlapped_bytes += grandparents[i]->fd.GetFileSize();
+  }
+
+  return overlapped_bytes;
+}
+
+bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
+  assert(c_iter.Valid());
+
+  // always update grandparent information like overlapped file number, size
+  // etc.
+  const Slice& internal_key = c_iter.key();
+  const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
+  size_t num_grandparent_boundaries_crossed =
+      UpdateGrandparentBoundaryInfo(internal_key);
+
+  if (!HasBuilder()) {
+    return false;
+  }
+
+  // If there's user defined partitioner, check that first
+  if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
+                          last_key_for_partitioner_, c_iter.user_key(),
+                          current_output_file_size_)) == kRequired) {
+    return true;
+  }
+
+  // files output to Level 0 won't be split
+  if (compaction_->output_level() == 0) {
+    return false;
+  }
+
+  // reach the max file size
+  if (current_output_file_size_ >= compaction_->max_output_file_size()) {
+    return true;
+  }
+
+  const InternalKeyComparator* icmp =
+      &compaction_->column_family_data()->internal_comparator();
+
+  // Check if it needs to split for RoundRobin
+  // Invalid local_output_split_key indicates that we do not need to split
+  if (local_output_split_key_ != nullptr && !is_split_) {
+    // Split occurs when the next key is larger than/equal to the cursor
+    if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) {
+      is_split_ = true;
+      return true;
+    }
+  }
+
+  // only check if the current key is going to cross the grandparents file
+  // boundary (either the file beginning or ending).
+  if (num_grandparent_boundaries_crossed > 0) {
+    // Cut the file before the current key if the size of the current output
+    // file + its overlapped grandparent files is bigger than
+    // max_compaction_bytes. Which is to prevent future bigger than
+    // max_compaction_bytes compaction from the current output level.
+    if (grandparent_overlapped_bytes_ + current_output_file_size_ >
+        compaction_->max_compaction_bytes()) {
+      return true;
+    }
+
+    // Cut the file if including the key is going to add a skippable file on
+    // the grandparent level AND its size is reasonably big (1/8 of target file
+    // size). For example, if it's compacting the files L0 + L1:
+    //  L0:  [1,   21]
+    //  L1:    [3,   23]
+    //  L2: [2, 4] [11, 15] [22, 24]
+    // Without this break, it will output as:
+    //  L1: [1,3, 21,23]
+    // With this break, it will output as (assuming [11, 15] at L2 is bigger
+    // than 1/8 of target size):
+    //  L1: [1,3] [21,23]
+    // Then for the future compactions, [11,15] won't be included.
+    // For random datasets (either evenly distributed or skewed), it rarely
+    // triggers this condition, but if the user is adding 2 different datasets
+    // without any overlap, it may likely happen.
+    // More details, check PR #1963
+    const size_t num_skippable_boundaries_crossed =
+        being_grandparent_gap_ ? 2 : 3;
+    if (compaction_->immutable_options()->compaction_style ==
+            kCompactionStyleLevel &&
+        compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+        num_grandparent_boundaries_crossed >=
+            num_skippable_boundaries_crossed &&
+        grandparent_overlapped_bytes_ - previous_overlapped_bytes >
+            compaction_->target_output_file_size() / 8) {
+      return true;
+    }
+
+    // Pre-cut the output file if it's reaching a certain size AND it's at the
+    // boundary of a grandparent file. It can reduce the future compaction size,
+    // the cost is having smaller files.
+    // The pre-cut size threshold is based on how many grandparent boundaries
+    // it has seen before. Basically, if it has seen no boundary at all, then it
+    // will pre-cut at 50% target file size. Every boundary it has seen
+    // increases the threshold by 5%, max at 90%, which it will always cut.
+    // The idea is based on if it has seen more boundaries before, it will more
+    // likely to see another boundary (file cutting opportunity) before the
+    // target file size. The test shows it can generate larger files than a
+    // static threshold like 75% and has a similar write amplification
+    // improvement.
+    if (compaction_->immutable_options()->compaction_style ==
+            kCompactionStyleLevel &&
+        compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+        current_output_file_size_ >=
+            ((compaction_->target_output_file_size() + 99) / 100) *
+                (50 + std::min(grandparent_boundary_switched_num_ * 5,
+                               size_t{40}))) {
+      return true;
+    }
+  }
+
+  // check ttl file boundaries if there's any
+  if (!files_to_cut_for_ttl_.empty()) {
+    if (cur_files_to_cut_for_ttl_ != -1) {
+      // Previous key is inside the range of a file
+      if (icmp->Compare(internal_key,
+                        files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
+                            ->largest.Encode()) > 0) {
+        next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
+        cur_files_to_cut_for_ttl_ = -1;
+        return true;
+      }
+    } else {
+      // Look for the key position
+      while (next_files_to_cut_for_ttl_ <
+             static_cast<int>(files_to_cut_for_ttl_.size())) {
+        if (icmp->Compare(internal_key,
+                          files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                              ->smallest.Encode()) >= 0) {
+          if (icmp->Compare(internal_key,
+                            files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                                ->largest.Encode()) <= 0) {
+            // With in the current file
+            cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
+            return true;
+          }
+          // Beyond the current file
+          next_files_to_cut_for_ttl_++;
+        } else {
+          // Still fall into the gap
+          break;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+Status CompactionOutputs::AddToOutput(
+    const CompactionIterator& c_iter,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func) {
+  Status s;
+  const Slice& key = c_iter.key();
+
+  if (ShouldStopBefore(c_iter) && HasBuilder()) {
+    s = close_file_func(*this, c_iter.InputStatus(), key);
+    if (!s.ok()) {
+      return s;
+    }
+    // reset grandparent information
+    grandparent_boundary_switched_num_ = 0;
+    grandparent_overlapped_bytes_ =
+        GetCurrentKeyGrandparentOverlappedBytes(key);
+  }
+
+  // Open output file if necessary
+  if (!HasBuilder()) {
+    s = open_file_func(*this);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  assert(builder_ != nullptr);
+  const Slice& value = c_iter.value();
+  s = current_output().validator.Add(key, value);
+  if (!s.ok()) {
+    return s;
+  }
+  builder_->Add(key, value);
+
+  stats_.num_output_records++;
+  current_output_file_size_ = builder_->EstimatedFileSize();
+
+  if (blob_garbage_meter_) {
+    s = blob_garbage_meter_->ProcessOutFlow(key, value);
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  const ParsedInternalKey& ikey = c_iter.ikey();
+  s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
+                                             ikey.type);
+
+  if (partitioner_) {
+    last_key_for_partitioner_.assign(c_iter.user_key().data_,
+                                     c_iter.user_key().size_);
+  }
+
+  return s;
+}
+
+Status CompactionOutputs::AddRangeDels(
+    const Slice* comp_start_user_key, const Slice* comp_end_user_key,
+    CompactionIterationStats& range_del_out_stats, bool bottommost_level,
+    const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
+    const Slice& next_table_min_key, const std::string& full_history_ts_low) {
+  assert(HasRangeDel());
+  FileMetaData& meta = current_output().meta;
+  const Comparator* ucmp = icmp.user_comparator();
+
+  Slice lower_bound_guard, upper_bound_guard;
+  std::string smallest_user_key;
+  const Slice *lower_bound, *upper_bound;
+  bool lower_bound_from_sub_compact = false;
+
+  size_t output_size = outputs_.size();
+  if (output_size == 1) {
+    // For the first output table, include range tombstones before the min
+    // key but after the subcompaction boundary.
+    lower_bound = comp_start_user_key;
+    lower_bound_from_sub_compact = true;
+  } else if (meta.smallest.size() > 0) {
+    // For subsequent output tables, only include range tombstones from min
+    // key onwards since the previous file was extended to contain range
+    // tombstones falling before min key.
+    smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
+    lower_bound_guard = Slice(smallest_user_key);
+    lower_bound = &lower_bound_guard;
+  } else {
+    lower_bound = nullptr;
+  }
+  if (!next_table_min_key.empty()) {
+    // This may be the last file in the subcompaction in some cases, so we
+    // need to compare the end key of subcompaction with the next file start
+    // key. When the end key is chosen by the subcompaction, we know that
+    // it must be the biggest key in output file. Therefore, it is safe to
+    // use the smaller key as the upper bound of the output file, to ensure
+    // that there is no overlapping between different output files.
+    upper_bound_guard = ExtractUserKey(next_table_min_key);
+    if (comp_end_user_key != nullptr &&
+        ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >=
+            0) {
+      upper_bound = comp_end_user_key;
+    } else {
+      upper_bound = &upper_bound_guard;
+    }
+  } else {
+    // This is the last file in the subcompaction, so extend until the
+    // subcompaction ends.
+    upper_bound = comp_end_user_key;
+  }
+  bool has_overlapping_endpoints;
+  if (upper_bound != nullptr && meta.largest.size() > 0) {
+    has_overlapping_endpoints = ucmp->CompareWithoutTimestamp(
+                                    meta.largest.user_key(), *upper_bound) == 0;
+  } else {
+    has_overlapping_endpoints = false;
+  }
+
+  // The end key of the subcompaction must be bigger or equal to the upper
+  // bound. If the end of subcompaction is null or the upper bound is null,
+  // it means that this file is the last file in the compaction. So there
+  // will be no overlapping between this file and others.
+  assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
+         ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0);
+  auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
+                                        has_overlapping_endpoints);
+  // Position the range tombstone output iterator. There may be tombstone
+  // fragments that are entirely out of range, so make sure that we do not
+  // include those.
+  if (lower_bound != nullptr) {
+    it->Seek(*lower_bound);
+  } else {
+    it->SeekToFirst();
+  }
+  for (; it->Valid(); it->Next()) {
+    auto tombstone = it->Tombstone();
+    if (upper_bound != nullptr) {
+      int cmp =
+          ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_);
+      if ((has_overlapping_endpoints && cmp < 0) ||
+          (!has_overlapping_endpoints && cmp <= 0)) {
+        // Tombstones starting after upper_bound only need to be included in
+        // the next table. If the current SST ends before upper_bound, i.e.,
+        // `has_overlapping_endpoints == false`, we can also skip over range
+        // tombstones that start exactly at upper_bound. Such range
+        // tombstones will be included in the next file and are not relevant
+        // to the point keys or endpoints of the current file.
+        break;
+      }
+    }
+
+    const size_t ts_sz = ucmp->timestamp_size();
+    // Garbage collection for range tombstones.
+    // If user-defined timestamp is enabled, range tombstones are dropped if
+    // they are at bottommost_level, below full_history_ts_low and not visible
+    // in any snapshot. trim_ts_ is passed to the constructor for
+    // range_del_agg_, and range_del_agg_ internally drops tombstones above
+    // trim_ts_.
+    if (bottommost_level && tombstone.seq_ <= earliest_snapshot &&
+        (ts_sz == 0 ||
+         (!full_history_ts_low.empty() &&
+          ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) {
+      // TODO(andrewkr): tombstones that span multiple output files are
+      // counted for each compaction output file, so lots of double
+      // counting.
+      range_del_out_stats.num_range_del_drop_obsolete++;
+      range_del_out_stats.num_record_drop_obsolete++;
+      continue;
+    }
+
+    auto kv = tombstone.Serialize();
+    assert(lower_bound == nullptr ||
+           ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
+    // Range tombstone is not supported by output validator yet.
+    builder_->Add(kv.first.Encode(), kv.second);
+    InternalKey smallest_candidate = std::move(kv.first);
+    if (lower_bound != nullptr &&
+        ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
+                                      *lower_bound) <= 0) {
+      // Pretend the smallest key has the same user key as lower_bound
+      // (the max key in the previous table or subcompaction) in order for
+      // files to appear key-space partitioned.
+      //
+      // When lower_bound is chosen by a subcompaction, we know that
+      // subcompactions over smaller keys cannot contain any keys at
+      // lower_bound. We also know that smaller subcompactions exist,
+      // because otherwise the subcompaction woud be unbounded on the left.
+      // As a result, we know that no other files on the output level will
+      // contain actual keys at lower_bound (an output file may have a
+      // largest key of lower_bound@kMaxSequenceNumber, but this only
+      // indicates a large range tombstone was truncated). Therefore, it is
+      // safe to use the tombstone's sequence number, to ensure that keys at
+      // lower_bound at lower levels are covered by truncated tombstones.
+      //
+      // If lower_bound was chosen by the smallest data key in the file,
+      // choose lowest seqnum so this file's smallest internal key comes
+      // after the previous file's largest. The fake seqnum is OK because
+      // the read path's file-picking code only considers user key.
+      if (lower_bound_from_sub_compact) {
+        if (ts_sz) {
+          assert(tombstone.ts_.size() == ts_sz);
+          smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
+                                           kTypeRangeDeletion, tombstone.ts_);
+        } else {
+          smallest_candidate =
+              InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
+        }
+      } else {
+        smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
+      }
+    }
+    InternalKey largest_candidate = tombstone.SerializeEndKey();
+    if (upper_bound != nullptr &&
+        ucmp->CompareWithoutTimestamp(*upper_bound,
+                                      largest_candidate.user_key()) <= 0) {
+      // Pretend the largest key has the same user key as upper_bound (the
+      // min key in the following table or subcompaction) in order for files
+      // to appear key-space partitioned.
+      //
+      // Choose highest seqnum so this file's largest internal key comes
+      // before the next file's/subcompaction's smallest. The fake seqnum is
+      // OK because the read path's file-picking code only considers the
+      // user key portion.
+      //
+      // Note Seek() also creates InternalKey with (user_key,
+      // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+      // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+      // Seek() key in InternalKey's ordering. So Seek() will look in the
+      // next file for the user key
+      if (ts_sz) {
+        static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+        if (ts_sz <= strlen(kTsMax)) {
+          largest_candidate =
+              InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
+                          Slice(kTsMax, ts_sz));
+        } else {
+          largest_candidate =
+              InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
+                          std::string(ts_sz, '\xff'));
+        }
+      } else {
+        largest_candidate =
+            InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
+      }
+    }
+#ifndef NDEBUG
+    SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
+    if (meta.smallest.size() > 0) {
+      smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
+    }
+#endif
+    meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+                                  tombstone.seq_, icmp);
+    // The smallest key in a file is used for range tombstone truncation, so
+    // it cannot have a seqnum of 0 (unless the smallest data key in a file
+    // has a seqnum of 0). Otherwise, the truncated tombstone may expose
+    // deleted keys at lower levels.
+    assert(smallest_ikey_seqnum == 0 ||
+           ExtractInternalKeyFooter(meta.smallest.Encode()) !=
+               PackSequenceAndType(0, kTypeRangeDeletion));
+  }
+  return Status::OK();
+}
+
+void CompactionOutputs::FillFilesToCutForTtl() {
+  if (compaction_->immutable_options()->compaction_style !=
+          kCompactionStyleLevel ||
+      compaction_->immutable_options()->compaction_pri !=
+          kMinOverlappingRatio ||
+      compaction_->mutable_cf_options()->ttl == 0 ||
+      compaction_->num_input_levels() < 2 || compaction_->bottommost_level()) {
+    return;
+  }
+
+  // We define new file with the oldest ancestor time to be younger than 1/4
+  // TTL, and an old one to be older than 1/2 TTL time.
+  int64_t temp_current_time;
+  auto get_time_status =
+      compaction_->immutable_options()->clock->GetCurrentTime(
+          &temp_current_time);
+  if (!get_time_status.ok()) {
+    return;
+  }
+
+  auto current_time = static_cast<uint64_t>(temp_current_time);
+  if (current_time < compaction_->mutable_cf_options()->ttl) {
+    return;
+  }
+
+  uint64_t old_age_thres =
+      current_time - compaction_->mutable_cf_options()->ttl / 2;
+  const std::vector<FileMetaData*>& olevel =
+      *(compaction_->inputs(compaction_->num_input_levels() - 1));
+  for (FileMetaData* file : olevel) {
+    // Worth filtering out by start and end?
+    uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+    // We put old files if they are not too small to prevent a flood
+    // of small files.
+    if (oldest_ancester_time < old_age_thres &&
+        file->fd.GetFileSize() >
+            compaction_->mutable_cf_options()->target_file_size_base / 2) {
+      files_to_cut_for_ttl_.push_back(file);
+    }
+  }
+}
+
+CompactionOutputs::CompactionOutputs(const Compaction* compaction,
+                                     const bool is_penultimate_level)
+    : compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
+  partitioner_ = compaction->output_level() == 0
+                     ? nullptr
+                     : compaction->CreateSstPartitioner();
+
+  if (compaction->output_level() != 0) {
+    FillFilesToCutForTtl();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_outputs.h b/src/rocksdb/db/compaction/compaction_outputs.h
new file mode 100644
index 000000000..f40aa8215
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_outputs.h
@@ -0,0 +1,385 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionOutputs;
+using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
+using CompactionFileCloseFunc =
+    std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
+
+// Files produced by subcompaction, most of the functions are used by
+// compaction_job Open/Close compaction file functions.
+class CompactionOutputs {
+ public:
+  // compaction output file
+  struct Output {
+    Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
+           bool _enable_order_check, bool _enable_hash, bool _finished,
+           uint64_t precalculated_hash)
+        : meta(std::move(_meta)),
+          validator(_icmp, _enable_order_check, _enable_hash,
+                    precalculated_hash),
+          finished(_finished) {}
+    FileMetaData meta;
+    OutputValidator validator;
+    bool finished;
+    std::shared_ptr<const TableProperties> table_properties;
+  };
+
+  CompactionOutputs() = delete;
+
+  explicit CompactionOutputs(const Compaction* compaction,
+                             const bool is_penultimate_level);
+
+  // Add generated output to the list
+  void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
+                 bool enable_order_check, bool enable_hash,
+                 bool finished = false, uint64_t precalculated_hash = 0) {
+    outputs_.emplace_back(std::move(meta), icmp, enable_order_check,
+                          enable_hash, finished, precalculated_hash);
+  }
+
+  // Set new table builder for the current output
+  void NewBuilder(const TableBuilderOptions& tboptions);
+
+  // Assign a new WritableFileWriter to the current output
+  void AssignFileWriter(WritableFileWriter* writer) {
+    file_writer_.reset(writer);
+  }
+
+  // TODO: Remove it when remote compaction support tiered compaction
+  void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
+  void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
+
+  // TODO: Move the BlobDB builder into CompactionOutputs
+  const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
+    if (is_penultimate_level_) {
+      assert(blob_file_additions_.empty());
+    }
+    return blob_file_additions_;
+  }
+
+  std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
+    assert(!is_penultimate_level_);
+    return &blob_file_additions_;
+  }
+
+  bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
+
+  BlobGarbageMeter* CreateBlobGarbageMeter() {
+    assert(!is_penultimate_level_);
+    blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
+    return blob_garbage_meter_.get();
+  }
+
+  BlobGarbageMeter* GetBlobGarbageMeter() const {
+    if (is_penultimate_level_) {
+      // blobdb doesn't support per_key_placement yet
+      assert(blob_garbage_meter_ == nullptr);
+      return nullptr;
+    }
+    return blob_garbage_meter_.get();
+  }
+
+  void UpdateBlobStats() {
+    assert(!is_penultimate_level_);
+    stats_.num_output_files_blob = blob_file_additions_.size();
+    for (const auto& blob : blob_file_additions_) {
+      stats_.bytes_written_blob += blob.GetTotalBlobBytes();
+    }
+  }
+
+  // Finish the current output file
+  Status Finish(const Status& intput_status,
+                const SeqnoToTimeMapping& seqno_time_mapping);
+
+  // Update output table properties from table builder
+  void UpdateTableProperties() {
+    current_output().table_properties =
+        std::make_shared<TableProperties>(GetTableProperties());
+  }
+
+  IOStatus WriterSyncClose(const Status& intput_status, SystemClock* clock,
+                           Statistics* statistics, bool use_fsync);
+
+  TableProperties GetTableProperties() {
+    return builder_->GetTableProperties();
+  }
+
+  Slice SmallestUserKey() const {
+    if (!outputs_.empty() && outputs_[0].finished) {
+      return outputs_[0].meta.smallest.user_key();
+    } else {
+      return Slice{nullptr, 0};
+    }
+  }
+
+  Slice LargestUserKey() const {
+    if (!outputs_.empty() && outputs_.back().finished) {
+      return outputs_.back().meta.largest.user_key();
+    } else {
+      return Slice{nullptr, 0};
+    }
+  }
+
+  // In case the last output file is empty, which doesn't need to keep.
+  void RemoveLastEmptyOutput() {
+    if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
+      // An error occurred, so ignore the last output.
+      outputs_.pop_back();
+    }
+  }
+
+  // Remove the last output, for example the last output doesn't have data (no
+  // entry and no range-dels), but file_size might not be 0, as it has SST
+  // metadata.
+  void RemoveLastOutput() {
+    assert(!outputs_.empty());
+    outputs_.pop_back();
+  }
+
+  bool HasBuilder() const { return builder_ != nullptr; }
+
+  FileMetaData* GetMetaData() { return &current_output().meta; }
+
+  bool HasOutput() const { return !outputs_.empty(); }
+
+  uint64_t NumEntries() const { return builder_->NumEntries(); }
+
+  void ResetBuilder() {
+    builder_.reset();
+    current_output_file_size_ = 0;
+  }
+
+  // Add range-dels from the aggregator to the current output file
+  // @param comp_start_user_key and comp_end_user_key include timestamp if
+  // user-defined timestamp is enabled.
+  // @param full_history_ts_low used for range tombstone garbage collection.
+  Status AddRangeDels(const Slice* comp_start_user_key,
+                      const Slice* comp_end_user_key,
+                      CompactionIterationStats& range_del_out_stats,
+                      bool bottommost_level, const InternalKeyComparator& icmp,
+                      SequenceNumber earliest_snapshot,
+                      const Slice& next_table_min_key,
+                      const std::string& full_history_ts_low);
+
+  // if the outputs have range delete, range delete is also data
+  bool HasRangeDel() const {
+    return range_del_agg_ && !range_del_agg_->IsEmpty();
+  }
+
+ private:
+  friend class SubcompactionState;
+
+  void FillFilesToCutForTtl();
+
+  void SetOutputSlitKey(const std::optional<Slice> start,
+                        const std::optional<Slice> end) {
+    const InternalKeyComparator* icmp =
+        &compaction_->column_family_data()->internal_comparator();
+
+    const InternalKey* output_split_key = compaction_->GetOutputSplitKey();
+    // Invalid output_split_key indicates that we do not need to split
+    if (output_split_key != nullptr) {
+      // We may only split the output when the cursor is in the range. Split
+      if ((!end.has_value() ||
+           icmp->user_comparator()->Compare(
+               ExtractUserKey(output_split_key->Encode()), end.value()) < 0) &&
+          (!start.has_value() || icmp->user_comparator()->Compare(
+                                     ExtractUserKey(output_split_key->Encode()),
+                                     start.value()) > 0)) {
+        local_output_split_key_ = output_split_key;
+      }
+    }
+  }
+
+  // Returns true iff we should stop building the current output
+  // before processing the current key in compaction iterator.
+  bool ShouldStopBefore(const CompactionIterator& c_iter);
+
+  void Cleanup() {
+    if (builder_ != nullptr) {
+      // May happen if we get a shutdown call in the middle of compaction
+      builder_->Abandon();
+      builder_.reset();
+    }
+  }
+
+  // update tracked grandparents information like grandparent index, if it's
+  // in the gap between 2 grandparent files, accumulated grandparent files size
+  // etc.
+  // It returns how many boundaries it crosses by including current key.
+  size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key);
+
+  // helper function to get the overlapped grandparent files size, it's only
+  // used for calculating the first key's overlap.
+  uint64_t GetCurrentKeyGrandparentOverlappedBytes(
+      const Slice& internal_key) const;
+
+  // Add current key from compaction_iterator to the output file. If needed
+  // close and open new compaction output with the functions provided.
+  Status AddToOutput(const CompactionIterator& c_iter,
+                     const CompactionFileOpenFunc& open_file_func,
+                     const CompactionFileCloseFunc& close_file_func);
+
+  // Close the current output. `open_file_func` is needed for creating new file
+  // for range-dels only output file.
+  Status CloseOutput(const Status& curr_status,
+                     const CompactionFileOpenFunc& open_file_func,
+                     const CompactionFileCloseFunc& close_file_func) {
+    Status status = curr_status;
+    // handle subcompaction containing only range deletions
+    if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) {
+      status = open_file_func(*this);
+    }
+    if (HasBuilder()) {
+      const Slice empty_key{};
+      Status s = close_file_func(*this, status, empty_key);
+      if (!s.ok() && status.ok()) {
+        status = s;
+      }
+    }
+
+    return status;
+  }
+
+  // This subcompaction's output could be empty if compaction was aborted before
+  // this subcompaction had a chance to generate any output files. When
+  // subcompactions are executed sequentially this is more likely and will be
+  // particularly likely for the later subcompactions to be empty. Once they are
+  // run in parallel however it should be much rarer.
+  // It's caller's responsibility to make sure it's not empty.
+  Output& current_output() {
+    assert(!outputs_.empty());
+    return outputs_.back();
+  }
+
+  // Assign the range_del_agg to the target output level. There's only one
+  // range-del-aggregator per compaction outputs, for
+  // output_to_penultimate_level compaction it is only assigned to the
+  // penultimate level.
+  void AssignRangeDelAggregator(
+      std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+    assert(range_del_agg_ == nullptr);
+    range_del_agg_ = std::move(range_del_agg);
+  }
+
+  const Compaction* compaction_;
+
+  // current output builder and writer
+  std::unique_ptr<TableBuilder> builder_;
+  std::unique_ptr<WritableFileWriter> file_writer_;
+  uint64_t current_output_file_size_ = 0;
+
+  // all the compaction outputs so far
+  std::vector<Output> outputs_;
+
+  // BlobDB info
+  std::vector<BlobFileAddition> blob_file_additions_;
+  std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
+
+  // Basic compaction output stats for this level's outputs
+  InternalStats::CompactionOutputsStats stats_;
+
+  // indicate if this CompactionOutputs obj for penultimate_level, should always
+  // be false if per_key_placement feature is not enabled.
+  const bool is_penultimate_level_;
+  std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr;
+
+  // partitioner information
+  std::string last_key_for_partitioner_;
+  std::unique_ptr<SstPartitioner> partitioner_;
+
+  // A flag determines if this subcompaction has been split by the cursor
+  bool is_split_ = false;
+
+  // We also maintain the output split key for each subcompaction to avoid
+  // repetitive comparison in ShouldStopBefore()
+  const InternalKey* local_output_split_key_ = nullptr;
+
+  // Some identified files with old oldest ancester time and the range should be
+  // isolated out so that the output file(s) in that range can be merged down
+  // for TTL and clear the timestamps for the range.
+  std::vector<FileMetaData*> files_to_cut_for_ttl_;
+  int cur_files_to_cut_for_ttl_ = -1;
+  int next_files_to_cut_for_ttl_ = 0;
+
+  // An index that used to speed up ShouldStopBefore().
+  size_t grandparent_index_ = 0;
+
+  // if the output key is being grandparent files gap, so:
+  //  key > grandparents[grandparent_index_ - 1].largest &&
+  //  key < grandparents[grandparent_index_].smallest
+  bool being_grandparent_gap_ = true;
+
+  // The number of bytes overlapping between the current output and
+  // grandparent files used in ShouldStopBefore().
+  uint64_t grandparent_overlapped_bytes_ = 0;
+
+  // A flag determines whether the key has been seen in ShouldStopBefore()
+  bool seen_key_ = false;
+
+  // for the current output file, how many file boundaries has it crossed,
+  // basically number of files overlapped * 2
+  size_t grandparent_boundary_switched_num_ = 0;
+};
+
+// helper struct to concatenate the last level and penultimate level outputs
+// which could be replaced by std::ranges::join_view() in c++20
+struct OutputIterator {
+ public:
+  explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a,
+                          const std::vector<CompactionOutputs::Output>& b)
+      : a_(a), b_(b) {
+    within_a = !a_.empty();
+    idx_ = 0;
+  }
+
+  OutputIterator begin() { return *this; }
+
+  OutputIterator end() { return *this; }
+
+  size_t size() { return a_.size() + b_.size(); }
+
+  const CompactionOutputs::Output& operator*() const {
+    return within_a ? a_[idx_] : b_[idx_];
+  }
+
+  OutputIterator& operator++() {
+    idx_++;
+    if (within_a && idx_ >= a_.size()) {
+      within_a = false;
+      idx_ = 0;
+    }
+    assert(within_a || idx_ <= b_.size());
+    return *this;
+  }
+
+  bool operator!=(const OutputIterator& /*rhs*/) const {
+    return within_a || idx_ < b_.size();
+  }
+
+ private:
+  const std::vector<CompactionOutputs::Output>& a_;
+  const std::vector<CompactionOutputs::Output>& b_;
+  bool within_a;
+  size_t idx_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc
new file mode 100644
index 000000000..abdecca9f
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.cc
@@ -0,0 +1,1234 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+                           size_t min_files_to_compact,
+                           uint64_t max_compact_bytes_per_del_file,
+                           uint64_t max_compaction_bytes,
+                           CompactionInputFiles* comp_inputs,
+                           SequenceNumber earliest_mem_seqno) {
+  // Do not pick ingested file when there is at least one memtable not flushed
+  // which of seqno is overlap with the sst.
+  TEST_SYNC_POINT("FindIntraL0Compaction");
+  size_t start = 0;
+  for (; start < level_files.size(); start++) {
+    if (level_files[start]->being_compacted) {
+      return false;
+    }
+    // If there is no data in memtable, the earliest sequence number would the
+    // largest sequence number in last memtable.
+    // Because all files are sorted in descending order by largest_seqno, so we
+    // only need to check the first one.
+    if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
+      break;
+    }
+  }
+  if (start >= level_files.size()) {
+    return false;
+  }
+  size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+  size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max();
+  // Compaction range will be [start, limit).
+  size_t limit;
+  // Pull in files until the amount of compaction work per deleted file begins
+  // increasing or maximum total compaction size is reached.
+  size_t new_compact_bytes_per_del_file = 0;
+  for (limit = start + 1; limit < level_files.size(); ++limit) {
+    compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+    new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+    if (level_files[limit]->being_compacted ||
+        new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+        compact_bytes > max_compaction_bytes) {
+      break;
+    }
+    compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+  }
+
+  if ((limit - start) >= min_files_to_compact &&
+      compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+    assert(comp_inputs != nullptr);
+    comp_inputs->level = 0;
+    for (size_t i = start; i < limit; ++i) {
+      comp_inputs->files.push_back(level_files[i]);
+    }
+    return true;
+  }
+  return false;
+}
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level, int base_level,
+                                   const bool enable_compression) {
+  if (!enable_compression) {
+    // disable compression
+    return kNoCompression;
+  }
+
+  // If bottommost_compression is set and we are compacting to the
+  // bottommost level then we should use it.
+  if (mutable_cf_options.bottommost_compression != kDisableCompressionOption &&
+      level >= (vstorage->num_non_empty_levels() - 1)) {
+    return mutable_cf_options.bottommost_compression;
+  }
+  // If the user has specified a different compression level for each level,
+  // then pick the compression for that level.
+  if (!mutable_cf_options.compression_per_level.empty()) {
+    assert(level == 0 || level >= base_level);
+    int idx = (level == 0) ? 0 : level - base_level + 1;
+
+    const int n =
+        static_cast<int>(mutable_cf_options.compression_per_level.size()) - 1;
+    // It is possible for level_ to be -1; in that case, we use level
+    // 0's compression.  This occurs mostly in backwards compatibility
+    // situations when the builder doesn't know what level the file
+    // belongs to.  Likewise, if level is beyond the end of the
+    // specified compression levels, use the last value.
+    return mutable_cf_options
+        .compression_per_level[std::max(0, std::min(idx, n))];
+  } else {
+    return mutable_cf_options.compression;
+  }
+}
+
+CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options,
+                                         const VersionStorageInfo* vstorage,
+                                         int level,
+                                         const bool enable_compression) {
+  if (!enable_compression) {
+    return cf_options.compression_opts;
+  }
+  // If bottommost_compression_opts is enabled and we are compacting to the
+  // bottommost level then we should use the specified compression options.
+  if (level >= (vstorage->num_non_empty_levels() - 1) &&
+      cf_options.bottommost_compression_opts.enabled) {
+    return cf_options.bottommost_compression_opts;
+  }
+  return cf_options.compression_opts;
+}
+
+CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions,
+                                   const InternalKeyComparator* icmp)
+    : ioptions_(ioptions), icmp_(icmp) {}
+
+CompactionPicker::~CompactionPicker() {}
+
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+  UnregisterCompaction(c);
+  if (!status.ok()) {
+    c->ResetNextCompactionIndex();
+  }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs,
+                                InternalKey* smallest,
+                                InternalKey* largest) const {
+  const int level = inputs.level;
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+
+  if (level == 0) {
+    for (size_t i = 0; i < inputs.size(); i++) {
+      FileMetaData* f = inputs[i];
+      if (i == 0) {
+        *smallest = f->smallest;
+        *largest = f->largest;
+      } else {
+        if (icmp_->Compare(f->smallest, *smallest) < 0) {
+          *smallest = f->smallest;
+        }
+        if (icmp_->Compare(f->largest, *largest) > 0) {
+          *largest = f->largest;
+        }
+      }
+    }
+  } else {
+    *smallest = inputs[0]->smallest;
+    *largest = inputs[inputs.size() - 1]->largest;
+  }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
+                                const CompactionInputFiles& inputs2,
+                                InternalKey* smallest,
+                                InternalKey* largest) const {
+  assert(!inputs1.empty() || !inputs2.empty());
+  if (inputs1.empty()) {
+    GetRange(inputs2, smallest, largest);
+  } else if (inputs2.empty()) {
+    GetRange(inputs1, smallest, largest);
+  } else {
+    InternalKey smallest1, smallest2, largest1, largest2;
+    GetRange(inputs1, &smallest1, &largest1);
+    GetRange(inputs2, &smallest2, &largest2);
+    *smallest =
+        icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2;
+    *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1;
+  }
+}
+
+void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs,
+                                InternalKey* smallest, InternalKey* largest,
+                                int exclude_level) const {
+  InternalKey current_smallest;
+  InternalKey current_largest;
+  bool initialized = false;
+  for (const auto& in : inputs) {
+    if (in.empty() || in.level == exclude_level) {
+      continue;
+    }
+    GetRange(in, &current_smallest, &current_largest);
+    if (!initialized) {
+      *smallest = current_smallest;
+      *largest = current_largest;
+      initialized = true;
+    } else {
+      if (icmp_->Compare(current_smallest, *smallest) < 0) {
+        *smallest = current_smallest;
+      }
+      if (icmp_->Compare(current_largest, *largest) > 0) {
+        *largest = current_largest;
+      }
+    }
+  }
+  assert(initialized);
+}
+
+bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
+                                              VersionStorageInfo* vstorage,
+                                              CompactionInputFiles* inputs,
+                                              InternalKey** next_smallest) {
+  // This isn't good compaction
+  assert(!inputs->empty());
+
+  const int level = inputs->level;
+  // GetOverlappingInputs will always do the right thing for level-0.
+  // So we don't need to do any expansion if level == 0.
+  if (level == 0) {
+    return true;
+  }
+
+  InternalKey smallest, largest;
+
+  // Keep expanding inputs until we are sure that there is a "clean cut"
+  // boundary between the files in input and the surrounding files.
+  // This will ensure that no parts of a key are lost during compaction.
+  int hint_index = -1;
+  size_t old_size;
+  do {
+    old_size = inputs->size();
+    GetRange(*inputs, &smallest, &largest);
+    inputs->clear();
+    vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+                                   hint_index, &hint_index, true,
+                                   next_smallest);
+  } while (inputs->size() > old_size);
+
+  // we started off with inputs non-empty and the previous loop only grew
+  // inputs. thus, inputs should be non-empty here
+  assert(!inputs->empty());
+
+  // If, after the expansion, there are files that are already under
+  // compaction, then we must drop/cancel this compaction.
+  if (AreFilesInCompaction(inputs->files)) {
+    return false;
+  }
+  return true;
+}
+
+bool CompactionPicker::RangeOverlapWithCompaction(
+    const Slice& smallest_user_key, const Slice& largest_user_key,
+    int level) const {
+  const Comparator* ucmp = icmp_->user_comparator();
+  for (Compaction* c : compactions_in_progress_) {
+    if (c->output_level() == level &&
+        ucmp->CompareWithoutTimestamp(smallest_user_key,
+                                      c->GetLargestUserKey()) <= 0 &&
+        ucmp->CompareWithoutTimestamp(largest_user_key,
+                                      c->GetSmallestUserKey()) >= 0) {
+      // Overlap
+      return true;
+    }
+    if (c->SupportsPerKeyPlacement()) {
+      if (c->OverlapPenultimateLevelOutputRange(smallest_user_key,
+                                                largest_user_key)) {
+        return true;
+      }
+    }
+  }
+  // Did not overlap with any running compaction in level `level`
+  return false;
+}
+
+bool CompactionPicker::FilesRangeOverlapWithCompaction(
+    const std::vector<CompactionInputFiles>& inputs, int level,
+    int penultimate_level) const {
+  bool is_empty = true;
+  for (auto& in : inputs) {
+    if (!in.empty()) {
+      is_empty = false;
+      break;
+    }
+  }
+  if (is_empty) {
+    // No files in inputs
+    return false;
+  }
+
+  // TODO: Intra L0 compactions can have the ranges overlapped, but the input
+  //  files cannot be overlapped in the order of L0 files.
+  InternalKey smallest, largest;
+  GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel);
+  if (penultimate_level != Compaction::kInvalidLevel) {
+    if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+      if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+                                     penultimate_level)) {
+        return true;
+      }
+    } else {
+      InternalKey penultimate_smallest, penultimate_largest;
+      GetRange(inputs, &penultimate_smallest, &penultimate_largest, level);
+      if (RangeOverlapWithCompaction(penultimate_smallest.user_key(),
+                                     penultimate_largest.user_key(),
+                                     penultimate_level)) {
+        return true;
+      }
+    }
+  }
+
+  return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+                                    level);
+}
+
+// Returns true if any one of specified files are being compacted
+bool CompactionPicker::AreFilesInCompaction(
+    const std::vector<FileMetaData*>& files) {
+  for (size_t i = 0; i < files.size(); i++) {
+    if (files[i]->being_compacted) {
+      return true;
+    }
+  }
+  return false;
+}
+
+Compaction* CompactionPicker::CompactFiles(
+    const CompactionOptions& compact_options,
+    const std::vector<CompactionInputFiles>& input_files, int output_level,
+    VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, uint32_t output_path_id) {
+#ifndef NDEBUG
+  assert(input_files.size());
+  // This compaction output should not overlap with a running compaction as
+  // `SanitizeCompactionInputFiles` should've checked earlier and db mutex
+  // shouldn't have been released since.
+  int start_level = Compaction::kInvalidLevel;
+  for (const auto& in : input_files) {
+    // input_files should already be sorted by level
+    if (!in.empty()) {
+      start_level = in.level;
+      break;
+    }
+  }
+  assert(output_level == 0 ||
+         !FilesRangeOverlapWithCompaction(
+             input_files, output_level,
+             Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+                                                  start_level, output_level)));
+#endif /* !NDEBUG */
+
+  CompressionType compression_type;
+  if (compact_options.compression == kDisableCompressionOption) {
+    int base_level;
+    if (ioptions_.compaction_style == kCompactionStyleLevel) {
+      base_level = vstorage->base_level();
+    } else {
+      base_level = 1;
+    }
+    compression_type = GetCompressionType(vstorage, mutable_cf_options,
+                                          output_level, base_level);
+  } else {
+    // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType`
+    // without configurable `CompressionOptions`, which is inconsistent.
+    compression_type = compact_options.compression;
+  }
+  auto c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files,
+      output_level, compact_options.output_file_size_limit,
+      mutable_cf_options.max_compaction_bytes, output_path_id, compression_type,
+      GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+      Temperature::kUnknown, compact_options.max_subcompactions,
+      /* grandparents */ {}, true);
+  RegisterCompaction(c);
+  return c;
+}
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+    std::vector<CompactionInputFiles>* input_files,
+    std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage,
+    const CompactionOptions& /*compact_options*/) const {
+  if (input_set->size() == 0U) {
+    return Status::InvalidArgument(
+        "Compaction must include at least one file.");
+  }
+  assert(input_files);
+
+  std::vector<CompactionInputFiles> matched_input_files;
+  matched_input_files.resize(vstorage->num_levels());
+  int first_non_empty_level = -1;
+  int last_non_empty_level = -1;
+  // TODO(yhchiang): use a lazy-initialized mapping from
+  //                 file_number to FileMetaData in Version.
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    for (auto file : vstorage->LevelFiles(level)) {
+      auto iter = input_set->find(file->fd.GetNumber());
+      if (iter != input_set->end()) {
+        matched_input_files[level].files.push_back(file);
+        input_set->erase(iter);
+        last_non_empty_level = level;
+        if (first_non_empty_level == -1) {
+          first_non_empty_level = level;
+        }
+      }
+    }
+  }
+
+  if (!input_set->empty()) {
+    std::string message(
+        "Cannot find matched SST files for the following file numbers:");
+    for (auto fn : *input_set) {
+      message += " ";
+      message += std::to_string(fn);
+    }
+    return Status::InvalidArgument(message);
+  }
+
+  for (int level = first_non_empty_level; level <= last_non_empty_level;
+       ++level) {
+    matched_input_files[level].level = level;
+    input_files->emplace_back(std::move(matched_input_files[level]));
+  }
+
+  return Status::OK();
+}
+
+// Returns true if any one of the parent files are being compacted
+bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage,
+                                           const InternalKey* smallest,
+                                           const InternalKey* largest,
+                                           int level, int* level_index) {
+  std::vector<FileMetaData*> inputs;
+  assert(level < NumberLevels());
+
+  vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+                                 level_index ? *level_index : 0, level_index);
+  return AreFilesInCompaction(inputs);
+}
+
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// Now we assume all levels except start level and output level are empty.
+// Will also attempt to expand "start level" if that doesn't expand
+// "output level" or cause "level" to include a file for compaction that has an
+// overlapping user-key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on parent level are currently in compaction, which
+// means that we can't compact them
+bool CompactionPicker::SetupOtherInputs(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+    CompactionInputFiles* output_level_inputs, int* parent_index,
+    int base_index, bool only_expand_towards_right) {
+  assert(!inputs->empty());
+  assert(output_level_inputs->empty());
+  const int input_level = inputs->level;
+  const int output_level = output_level_inputs->level;
+  if (input_level == output_level) {
+    // no possibility of conflict
+    return true;
+  }
+
+  // For now, we only support merging two levels, start level and output level.
+  // We need to assert other levels are empty.
+  for (int l = input_level + 1; l < output_level; l++) {
+    assert(vstorage->NumLevelFiles(l) == 0);
+  }
+
+  InternalKey smallest, largest;
+
+  // Get the range one last time.
+  GetRange(*inputs, &smallest, &largest);
+
+  // Populate the set of next-level files (inputs_GetOutputLevelInputs()) to
+  // include in compaction
+  vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+                                 &output_level_inputs->files, *parent_index,
+                                 parent_index);
+  if (AreFilesInCompaction(output_level_inputs->files)) {
+    return false;
+  }
+  if (!output_level_inputs->empty()) {
+    if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) {
+      return false;
+    }
+  }
+
+  // See if we can further grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up. We also choose NOT
+  // to expand if this would cause "level" to include some entries for some
+  // user key, while excluding other entries for the same user key. This
+  // can happen when one user key spans multiple files.
+  if (!output_level_inputs->empty()) {
+    const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+    const uint64_t output_level_inputs_size =
+        TotalFileSize(output_level_inputs->files);
+    const uint64_t inputs_size = TotalFileSize(inputs->files);
+    bool expand_inputs = false;
+
+    CompactionInputFiles expanded_inputs;
+    expanded_inputs.level = input_level;
+    // Get closed interval of output level
+    InternalKey all_start, all_limit;
+    GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+    bool try_overlapping_inputs = true;
+    if (only_expand_towards_right) {
+      // Round-robin compaction only allows expansion towards the larger side.
+      vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit,
+                                     &expanded_inputs.files, base_index,
+                                     nullptr);
+    } else {
+      vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+                                     &expanded_inputs.files, base_index,
+                                     nullptr);
+    }
+    uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+    if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
+      try_overlapping_inputs = false;
+    }
+    if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() &&
+        (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+         output_level_inputs_size + expanded_inputs_size < limit) &&
+        !AreFilesInCompaction(expanded_inputs.files)) {
+      InternalKey new_start, new_limit;
+      GetRange(expanded_inputs, &new_start, &new_limit);
+      CompactionInputFiles expanded_output_level_inputs;
+      expanded_output_level_inputs.level = output_level;
+      vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+                                     &expanded_output_level_inputs.files,
+                                     *parent_index, parent_index);
+      assert(!expanded_output_level_inputs.empty());
+      if (!AreFilesInCompaction(expanded_output_level_inputs.files) &&
+          ExpandInputsToCleanCut(cf_name, vstorage,
+                                 &expanded_output_level_inputs) &&
+          expanded_output_level_inputs.size() == output_level_inputs->size()) {
+        expand_inputs = true;
+      }
+    }
+    if (!expand_inputs) {
+      vstorage->GetCleanInputsWithinInterval(input_level, &all_start,
+                                             &all_limit, &expanded_inputs.files,
+                                             base_index, nullptr);
+      expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+      if (expanded_inputs.size() > inputs->size() &&
+          (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+           output_level_inputs_size + expanded_inputs_size < limit) &&
+          !AreFilesInCompaction(expanded_inputs.files)) {
+        expand_inputs = true;
+      }
+    }
+    if (expand_inputs) {
+      ROCKS_LOG_INFO(ioptions_.logger,
+                     "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+                     "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt
+                     "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n",
+                     cf_name.c_str(), input_level, inputs->size(),
+                     output_level_inputs->size(), inputs_size,
+                     output_level_inputs_size, expanded_inputs.size(),
+                     output_level_inputs->size(), expanded_inputs_size,
+                     output_level_inputs_size);
+      inputs->files = expanded_inputs.files;
+    }
+  } else {
+    // Likely to be trivial move. Expand files if they are still trivial moves,
+    // but limit to mutable_cf_options.max_compaction_bytes or 8 files so that
+    // we don't create too much compaction pressure for the next level.
+  }
+  return true;
+}
+
+void CompactionPicker::GetGrandparents(
+    VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+    const CompactionInputFiles& output_level_inputs,
+    std::vector<FileMetaData*>* grandparents) {
+  InternalKey start, limit;
+  GetRange(inputs, output_level_inputs, &start, &limit);
+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2 or the first
+  // level after that has overlapping files)
+  for (int level = output_level_inputs.level + 1; level < NumberLevels();
+       level++) {
+    vstorage->GetOverlappingInputs(level, &start, &limit, grandparents);
+    if (!grandparents->empty()) {
+      break;
+    }
+  }
+}
+
+Compaction* CompactionPicker::CompactRange(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    int input_level, int output_level,
+    const CompactRangeOptions& compact_range_options, const InternalKey* begin,
+    const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+  // CompactionPickerFIFO has its own implementation of compact range
+  assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+  if (input_level == ColumnFamilyData::kCompactAllLevels) {
+    assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+    // Universal compaction with more than one level always compacts all the
+    // files together to the last level.
+    assert(vstorage->num_levels() > 1);
+    // DBImpl::CompactRange() set output level to be the last level
+    if (ioptions_.allow_ingest_behind) {
+      assert(output_level == vstorage->num_levels() - 2);
+    } else {
+      assert(output_level == vstorage->num_levels() - 1);
+    }
+    // DBImpl::RunManualCompaction will make full range for universal compaction
+    assert(begin == nullptr);
+    assert(end == nullptr);
+    *compaction_end = nullptr;
+
+    int start_level = 0;
+    for (; start_level < vstorage->num_levels() &&
+           vstorage->NumLevelFiles(start_level) == 0;
+         start_level++) {
+    }
+    if (start_level == vstorage->num_levels()) {
+      return nullptr;
+    }
+
+    if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) {
+      *manual_conflict = true;
+      // Only one level 0 compaction allowed
+      return nullptr;
+    }
+
+    std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+                                             start_level);
+    for (int level = start_level; level < vstorage->num_levels(); level++) {
+      inputs[level - start_level].level = level;
+      auto& files = inputs[level - start_level].files;
+      for (FileMetaData* f : vstorage->LevelFiles(level)) {
+        files.push_back(f);
+      }
+      if (AreFilesInCompaction(files)) {
+        *manual_conflict = true;
+        return nullptr;
+      }
+    }
+
+    // 2 non-exclusive manual compactions could run at the same time producing
+    // overlaping outputs in the same level.
+    if (FilesRangeOverlapWithCompaction(
+            inputs, output_level,
+            Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+                                                 start_level, output_level))) {
+      // This compaction output could potentially conflict with the output
+      // of a currently running compaction, we cannot run it.
+      *manual_conflict = true;
+      return nullptr;
+    }
+
+    Compaction* c = new Compaction(
+        vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+        std::move(inputs), output_level,
+        MaxFileSizeForLevel(mutable_cf_options, output_level,
+                            ioptions_.compaction_style),
+        /* max_compaction_bytes */ LLONG_MAX,
+        compact_range_options.target_path_id,
+        GetCompressionType(vstorage, mutable_cf_options, output_level, 1),
+        GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+        Temperature::kUnknown, compact_range_options.max_subcompactions,
+        /* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1,
+        /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+        CompactionReason::kUnknown,
+        compact_range_options.blob_garbage_collection_policy,
+        compact_range_options.blob_garbage_collection_age_cutoff);
+
+    RegisterCompaction(c);
+    vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+    return c;
+  }
+
+  CompactionInputFiles inputs;
+  inputs.level = input_level;
+  bool covering_the_whole_range = true;
+
+  // All files are 'overlapping' in universal style compaction.
+  // We have to compact the entire range in one shot.
+  if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+    begin = nullptr;
+    end = nullptr;
+  }
+
+  vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
+  if (inputs.empty()) {
+    return nullptr;
+  }
+
+  if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) {
+    // Only one level 0 compaction allowed
+    TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict");
+    *manual_conflict = true;
+    return nullptr;
+  }
+
+  // Avoid compacting too much in one shot in case the range is large.
+  // But we cannot do this for level-0 since level-0 files can overlap
+  // and we must not pick one file and drop another older file if the
+  // two files overlap.
+  if (input_level > 0) {
+    const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+    uint64_t input_level_total = 0;
+    int hint_index = -1;
+    InternalKey* smallest = nullptr;
+    InternalKey* largest = nullptr;
+    for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+      if (!smallest) {
+        smallest = &inputs[i]->smallest;
+      }
+      largest = &inputs[i]->largest;
+
+      uint64_t input_file_size = inputs[i]->fd.GetFileSize();
+      uint64_t output_level_total = 0;
+      if (output_level < vstorage->num_non_empty_levels()) {
+        std::vector<FileMetaData*> files;
+        vstorage->GetOverlappingInputsRangeBinarySearch(
+            output_level, smallest, largest, &files, hint_index, &hint_index);
+        for (const auto& file : files) {
+          output_level_total += file->fd.GetFileSize();
+        }
+      }
+
+      input_level_total += input_file_size;
+
+      if (input_level_total + output_level_total >= limit) {
+        covering_the_whole_range = false;
+        // still include the current file, so the compaction could be larger
+        // than max_compaction_bytes, which is also to make sure the compaction
+        // can make progress even `max_compaction_bytes` is small (e.g. smaller
+        // than an SST file).
+        inputs.files.resize(i + 1);
+        break;
+      }
+    }
+  }
+
+  assert(compact_range_options.target_path_id <
+         static_cast<uint32_t>(ioptions_.cf_paths.size()));
+
+  // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
+  // files that are created during the current compaction.
+  if (compact_range_options.bottommost_level_compaction ==
+          BottommostLevelCompaction::kForceOptimized &&
+      max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
+    assert(input_level == output_level);
+    // inputs_shrunk holds a continuous subset of input files which were all
+    // created before the current manual compaction
+    std::vector<FileMetaData*> inputs_shrunk;
+    size_t skip_input_index = inputs.size();
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+        inputs_shrunk.push_back(inputs[i]);
+      } else if (!inputs_shrunk.empty()) {
+        // inputs[i] was created during the current manual compaction and
+        // need to be skipped
+        skip_input_index = i;
+        break;
+      }
+    }
+    if (inputs_shrunk.empty()) {
+      return nullptr;
+    }
+    if (inputs.size() != inputs_shrunk.size()) {
+      inputs.files.swap(inputs_shrunk);
+    }
+    // set covering_the_whole_range to false if there is any file that need to
+    // be compacted in the range of inputs[skip_input_index+1, inputs.size())
+    for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) {
+      if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+        covering_the_whole_range = false;
+      }
+    }
+  }
+
+  InternalKey key_storage;
+  InternalKey* next_smallest = &key_storage;
+  if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) ==
+      false) {
+    // manual compaction is now multi-threaded, so it can
+    // happen that ExpandWhileOverlapping fails
+    // we handle it higher in RunManualCompaction
+    *manual_conflict = true;
+    return nullptr;
+  }
+
+  if (covering_the_whole_range || !next_smallest) {
+    *compaction_end = nullptr;
+  } else {
+    **compaction_end = *next_smallest;
+  }
+
+  CompactionInputFiles output_level_inputs;
+  if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+    assert(input_level == 0);
+    output_level = vstorage->base_level();
+    assert(output_level > 0);
+  }
+  output_level_inputs.level = output_level;
+  if (input_level != output_level) {
+    int parent_index = -1;
+    if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+                          &output_level_inputs, &parent_index, -1)) {
+      // manual compaction is now multi-threaded, so it can
+      // happen that SetupOtherInputs fails
+      // we handle it higher in RunManualCompaction
+      *manual_conflict = true;
+      return nullptr;
+    }
+  }
+
+  std::vector<CompactionInputFiles> compaction_inputs({inputs});
+  if (!output_level_inputs.empty()) {
+    compaction_inputs.push_back(output_level_inputs);
+  }
+  for (size_t i = 0; i < compaction_inputs.size(); i++) {
+    if (AreFilesInCompaction(compaction_inputs[i].files)) {
+      *manual_conflict = true;
+      return nullptr;
+    }
+  }
+
+  // 2 non-exclusive manual compactions could run at the same time producing
+  // overlaping outputs in the same level.
+  if (FilesRangeOverlapWithCompaction(
+          compaction_inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, input_level,
+                                               output_level))) {
+    // This compaction output could potentially conflict with the output
+    // of a currently running compaction, we cannot run it.
+    *manual_conflict = true;
+    return nullptr;
+  }
+
+  std::vector<FileMetaData*> grandparents;
+  GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+  Compaction* compaction = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+      std::move(compaction_inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options, output_level,
+                          ioptions_.compaction_style, vstorage->base_level(),
+                          ioptions_.level_compaction_dynamic_level_bytes),
+      mutable_cf_options.max_compaction_bytes,
+      compact_range_options.target_path_id,
+      GetCompressionType(vstorage, mutable_cf_options, output_level,
+                         vstorage->base_level()),
+      GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+      Temperature::kUnknown, compact_range_options.max_subcompactions,
+      std::move(grandparents), /* is manual */ true, trim_ts, /* score */ -1,
+      /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+      CompactionReason::kUnknown,
+      compact_range_options.blob_garbage_collection_policy,
+      compact_range_options.blob_garbage_collection_age_cutoff);
+
+  TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction);
+  RegisterCompaction(compaction);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed compaction score, we recalculate it
+  // here
+  vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+
+  return compaction;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key-ranges.
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+                              const SstFileMetaData& b) {
+  if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+    if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+      // b.smallestkey <= a.smallestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+    // a.smallestkey < b.smallestkey <= a.largestkey
+    return true;
+  }
+  if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+    if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+      // b.smallestkey <= a.largestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+    // a.smallestkey <= b.largestkey < a.largestkey
+    return true;
+  }
+  return false;
+}
+}  // namespace
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+    std::unordered_set<uint64_t>* input_files,
+    const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+  auto& levels = cf_meta.levels;
+  auto comparator = icmp_->user_comparator();
+
+  // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+  // the smallest and largest key of the current compaction input
+  std::string smallestkey;
+  std::string largestkey;
+  // a flag for initializing smallest and largest key
+  bool is_first = false;
+  const int kNotFound = -1;
+
+  // For each level, it does the following things:
+  // 1. Find the first and the last compaction input files
+  //    in the current level.
+  // 2. Include all files between the first and the last
+  //    compaction input files.
+  // 3. Update the compaction key-range.
+  // 4. For all remaining levels, include files that have
+  //    overlapping key-range with the compaction key-range.
+  for (int l = 0; l <= output_level; ++l) {
+    auto& current_files = levels[l].files;
+    int first_included = static_cast<int>(current_files.size());
+    int last_included = kNotFound;
+
+    // identify the first and the last compaction input files
+    // in the current level.
+    for (size_t f = 0; f < current_files.size(); ++f) {
+      const uint64_t file_number = TableFileNameToNumber(current_files[f].name);
+      if (input_files->find(file_number) == input_files->end()) {
+        continue;
+      }
+      first_included = std::min(first_included, static_cast<int>(f));
+      last_included = std::max(last_included, static_cast<int>(f));
+      if (is_first == false) {
+        smallestkey = current_files[f].smallestkey;
+        largestkey = current_files[f].largestkey;
+        is_first = true;
+      }
+    }
+    if (last_included == kNotFound) {
+      continue;
+    }
+
+    if (l != 0) {
+      // expand the compaction input of the current level if it
+      // has overlapping key-range with other non-compaction input
+      // files in the same level.
+      while (first_included > 0) {
+        if (comparator->CompareWithoutTimestamp(
+                current_files[first_included - 1].largestkey,
+                current_files[first_included].smallestkey) < 0) {
+          break;
+        }
+        first_included--;
+      }
+
+      while (last_included < static_cast<int>(current_files.size()) - 1) {
+        if (comparator->CompareWithoutTimestamp(
+                current_files[last_included + 1].smallestkey,
+                current_files[last_included].largestkey) > 0) {
+          break;
+        }
+        last_included++;
+      }
+    } else if (output_level > 0) {
+      last_included = static_cast<int>(current_files.size() - 1);
+    }
+
+    // include all files between the first and the last compaction input files.
+    for (int f = first_included; f <= last_included; ++f) {
+      if (current_files[f].being_compacted) {
+        return Status::Aborted("Necessary compaction input file " +
+                               current_files[f].name +
+                               " is currently being compacted.");
+      }
+      input_files->insert(TableFileNameToNumber(current_files[f].name));
+    }
+
+    // update smallest and largest key
+    if (l == 0) {
+      for (int f = first_included; f <= last_included; ++f) {
+        if (comparator->CompareWithoutTimestamp(
+                smallestkey, current_files[f].smallestkey) > 0) {
+          smallestkey = current_files[f].smallestkey;
+        }
+        if (comparator->CompareWithoutTimestamp(
+                largestkey, current_files[f].largestkey) < 0) {
+          largestkey = current_files[f].largestkey;
+        }
+      }
+    } else {
+      if (comparator->CompareWithoutTimestamp(
+              smallestkey, current_files[first_included].smallestkey) > 0) {
+        smallestkey = current_files[first_included].smallestkey;
+      }
+      if (comparator->CompareWithoutTimestamp(
+              largestkey, current_files[last_included].largestkey) < 0) {
+        largestkey = current_files[last_included].largestkey;
+      }
+    }
+
+    SstFileMetaData aggregated_file_meta;
+    aggregated_file_meta.smallestkey = smallestkey;
+    aggregated_file_meta.largestkey = largestkey;
+
+    // For all lower levels, include all overlapping files.
+    // We need to add overlapping files from the current level too because even
+    // if there no input_files in level l, we would still need to add files
+    // which overlap with the range containing the input_files in levels 0 to l
+    // Level 0 doesn't need to be handled this way because files are sorted by
+    // time and not by key
+    for (int m = std::max(l, 1); m <= output_level; ++m) {
+      for (auto& next_lv_file : levels[m].files) {
+        if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta,
+                                     next_lv_file)) {
+          if (next_lv_file.being_compacted) {
+            return Status::Aborted(
+                "File " + next_lv_file.name +
+                " that has overlapping key range with one of the compaction "
+                " input file is currently being compacted.");
+          }
+          input_files->insert(TableFileNameToNumber(next_lv_file.name));
+        }
+      }
+    }
+  }
+  if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) {
+    return Status::Aborted(
+        "A running compaction is writing to the same output level in an "
+        "overlapping key range");
+  }
+  return Status::OK();
+}
+
+Status CompactionPicker::SanitizeCompactionInputFiles(
+    std::unordered_set<uint64_t>* input_files,
+    const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+  assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+         cf_meta.levels[cf_meta.levels.size() - 1].level);
+  if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+    return Status::InvalidArgument(
+        "Output level for column family " + cf_meta.name +
+        " must between [0, " +
+        std::to_string(cf_meta.levels[cf_meta.levels.size() - 1].level) + "].");
+  }
+
+  if (output_level > MaxOutputLevel()) {
+    return Status::InvalidArgument(
+        "Exceed the maximum output level defined by "
+        "the current compaction algorithm --- " +
+        std::to_string(MaxOutputLevel()));
+  }
+
+  if (output_level < 0) {
+    return Status::InvalidArgument("Output level cannot be negative.");
+  }
+
+  if (input_files->size() == 0) {
+    return Status::InvalidArgument(
+        "A compaction must contain at least one file.");
+  }
+
+  Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
+                                                      output_level);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // for all input files, check whether the file number matches
+  // any currently-existing files.
+  for (auto file_num : *input_files) {
+    bool found = false;
+    int input_file_level = -1;
+    for (const auto& level_meta : cf_meta.levels) {
+      for (const auto& file_meta : level_meta.files) {
+        if (file_num == TableFileNameToNumber(file_meta.name)) {
+          if (file_meta.being_compacted) {
+            return Status::Aborted("Specified compaction input file " +
+                                   MakeTableFileName("", file_num) +
+                                   " is already being compacted.");
+          }
+          found = true;
+          input_file_level = level_meta.level;
+          break;
+        }
+      }
+      if (found) {
+        break;
+      }
+    }
+    if (!found) {
+      return Status::InvalidArgument(
+          "Specified compaction input file " + MakeTableFileName("", file_num) +
+          " does not exist in column family " + cf_meta.name + ".");
+    }
+    if (input_file_level > output_level) {
+      return Status::InvalidArgument(
+          "Cannot compact file to up level, input file: " +
+          MakeTableFileName("", file_num) + " level " +
+          std::to_string(input_file_level) + " > output level " +
+          std::to_string(output_level));
+    }
+  }
+
+  return Status::OK();
+}
+#endif  // !ROCKSDB_LITE
+
+void CompactionPicker::RegisterCompaction(Compaction* c) {
+  if (c == nullptr) {
+    return;
+  }
+  assert(ioptions_.compaction_style != kCompactionStyleLevel ||
+         c->output_level() == 0 ||
+         !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(),
+                                          c->GetPenultimateLevel()));
+  if (c->start_level() == 0 ||
+      ioptions_.compaction_style == kCompactionStyleUniversal) {
+    level0_compactions_in_progress_.insert(c);
+  }
+  compactions_in_progress_.insert(c);
+  TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered",
+                           c);
+}
+
+void CompactionPicker::UnregisterCompaction(Compaction* c) {
+  if (c == nullptr) {
+    return;
+  }
+  if (c->start_level() == 0 ||
+      ioptions_.compaction_style == kCompactionStyleUniversal) {
+    level0_compactions_in_progress_.erase(c);
+  }
+  compactions_in_progress_.erase(c);
+}
+
+void CompactionPicker::PickFilesMarkedForCompaction(
+    const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
+    int* output_level, CompactionInputFiles* start_level_inputs) {
+  if (vstorage->FilesMarkedForCompaction().empty()) {
+    return;
+  }
+
+  auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted it has nothing to do here.
+    // If this assert() fails that means that some function marked some
+    // files as being_compacted, but didn't call ComputeCompactionScore()
+    assert(!level_file.second->being_compacted);
+    *start_level = level_file.first;
+    *output_level =
+        (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
+
+    if (*start_level == 0 && !level0_compactions_in_progress()->empty()) {
+      return false;
+    }
+
+    start_level_inputs->files = {level_file.second};
+    start_level_inputs->level = *start_level;
+    return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs);
+  };
+
+  // take a chance on a random file first
+  Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+  size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+      static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+  TEST_SYNC_POINT_CALLBACK("CompactionPicker::PickFilesMarkedForCompaction",
+                           &random_file_index);
+
+  if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+    // found the compaction!
+    return;
+  }
+
+  for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+    if (continuation(level_file)) {
+      // found the compaction!
+      return;
+    }
+  }
+  start_level_inputs->files.clear();
+}
+
+bool CompactionPicker::GetOverlappingL0Files(
+    VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+    int output_level, int* parent_index) {
+  // Two level 0 compaction won't run at the same time, so don't need to worry
+  // about files on level 0 being compacted.
+  assert(level0_compactions_in_progress()->empty());
+  InternalKey smallest, largest;
+  GetRange(*start_level_inputs, &smallest, &largest);
+  // Note that the next call will discard the file we placed in
+  // c->inputs_[0] earlier and replace it with an overlapping set
+  // which will include the picked file.
+  start_level_inputs->files.clear();
+  vstorage->GetOverlappingInputs(0, &smallest, &largest,
+                                 &(start_level_inputs->files));
+
+  // If we include more L0 files in the same compaction run it can
+  // cause the 'smallest' and 'largest' key to get extended to a
+  // larger range. So, re-invoke GetRange to get the new key range
+  GetRange(*start_level_inputs, &smallest, &largest);
+  if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+                          parent_index)) {
+    return false;
+  }
+  assert(!start_level_inputs->files.empty());
+
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h
new file mode 100644
index 000000000..7739dd96b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.h
@@ -0,0 +1,323 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file contains an abstract class CompactionPicker, and its two
+// sub-classes LevelCompactionPicker and NullCompactionPicker, as
+// well as some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits the class and implement the
+// interface to form automatic compactions. If NeedCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction style specific logic for them.
+class CompactionPicker {
+ public:
+  CompactionPicker(const ImmutableOptions& ioptions,
+                   const InternalKeyComparator* icmp);
+  virtual ~CompactionPicker();
+
+  // Pick level and inputs for a new compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction.  Caller should delete the result.
+  virtual Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer,
+      SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level.  Returns nullptr if there is nothing in that
+  // level that overlaps the specified range.  Caller should delete
+  // the result.
+  //
+  // The returned Compaction might not include the whole requested range.
+  // In that case, compaction_end will be set to the next key that needs
+  // compacting. In case the compaction will compact the whole range,
+  // compaction_end will be set to nullptr.
+  // Client is responsible for compaction_end storage -- when called,
+  // *compaction_end should point to valid InternalKey!
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      int input_level, int output_level,
+      const CompactRangeOptions& compact_range_options,
+      const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end, bool* manual_conflict,
+      uint64_t max_file_num_to_ignore, const std::string& trim_ts);
+
+  // The maximum allowed output level.  Default value is NumberLevels() - 1.
+  virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+// Sanitize the input set of compaction input files.
+// When the input parameters do not describe a valid compaction, the
+// function will try to fix the input_files by adding necessary
+// files.  If it's not possible to conver an invalid input_files
+// into a valid one by adding more files, the function will return a
+// non-ok status with specific reason.
+#ifndef ROCKSDB_LITE
+  Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
+                                      const ColumnFamilyMetaData& cf_meta,
+                                      const int output_level) const;
+#endif  // ROCKSDB_LITE
+
+  // Free up the files that participated in a compaction
+  //
+  // Requirement: DB mutex held
+  void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Returns true if any one of the specified files are being compacted
+  bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+  // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+  // object.
+  //
+  // Caller must provide a set of input files that has been passed through
+  // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+  // between that call and this one.
+  Compaction* CompactFiles(const CompactionOptions& compact_options,
+                           const std::vector<CompactionInputFiles>& input_files,
+                           int output_level, VersionStorageInfo* vstorage,
+                           const MutableCFOptions& mutable_cf_options,
+                           const MutableDBOptions& mutable_db_options,
+                           uint32_t output_path_id);
+
+  // Converts a set of compaction input file numbers into
+  // a list of CompactionInputFiles.
+  Status GetCompactionInputsFromFileNumbers(
+      std::vector<CompactionInputFiles>* input_files,
+      std::unordered_set<uint64_t>* input_set,
+      const VersionStorageInfo* vstorage,
+      const CompactionOptions& compact_options) const;
+
+  // Is there currently a compaction involving level 0 taking place
+  bool IsLevel0CompactionInProgress() const {
+    return !level0_compactions_in_progress_.empty();
+  }
+
+  // Return true if the passed key range overlap with a compaction output
+  // that is currently running.
+  bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+                                  const Slice& largest_user_key,
+                                  int level) const;
+
+  // Stores the minimal range that covers all entries in inputs in
+  // *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+                InternalKey* largest) const;
+
+  // Stores the minimal range that covers all entries in inputs1 and inputs2
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty
+  void GetRange(const CompactionInputFiles& inputs1,
+                const CompactionInputFiles& inputs2, InternalKey* smallest,
+                InternalKey* largest) const;
+
+  // Stores the minimal range that covers all entries in inputs
+  // in *smallest, *largest.
+  // REQUIRES: inputs is not empty (at least on entry have one file)
+  void GetRange(const std::vector<CompactionInputFiles>& inputs,
+                InternalKey* smallest, InternalKey* largest,
+                int exclude_level) const;
+
+  int NumberLevels() const { return ioptions_.num_levels; }
+
+  // Add more files to the inputs on "level" to make sure that
+  // no newer version of a key is compacted to "level+1" while leaving an older
+  // version in a "level". Otherwise, any Get() will search "level" first,
+  // and will likely return an old/stale value for the key, since it always
+  // searches in increasing order of level to find the value. This could
+  // also scramble the order of merge operands. This function should be
+  // called any time a new Compaction is created, and its inputs_[0] are
+  // populated.
+  //
+  // Will return false if it is impossible to apply this compaction.
+  bool ExpandInputsToCleanCut(const std::string& cf_name,
+                              VersionStorageInfo* vstorage,
+                              CompactionInputFiles* inputs,
+                              InternalKey** next_smallest = nullptr);
+
+  // Returns true if any one of the parent files are being compacted
+  bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+                           const InternalKey* smallest,
+                           const InternalKey* largest, int level, int* index);
+
+  // Returns true if the key range that `inputs` files cover overlap with the
+  // key range of a currently running compaction.
+  bool FilesRangeOverlapWithCompaction(
+      const std::vector<CompactionInputFiles>& inputs, int level,
+      int penultimate_level) const;
+
+  bool SetupOtherInputs(const std::string& cf_name,
+                        const MutableCFOptions& mutable_cf_options,
+                        VersionStorageInfo* vstorage,
+                        CompactionInputFiles* inputs,
+                        CompactionInputFiles* output_level_inputs,
+                        int* parent_index, int base_index,
+                        bool only_expand_towards_right = false);
+
+  void GetGrandparents(VersionStorageInfo* vstorage,
+                       const CompactionInputFiles& inputs,
+                       const CompactionInputFiles& output_level_inputs,
+                       std::vector<FileMetaData*>* grandparents);
+
+  void PickFilesMarkedForCompaction(const std::string& cf_name,
+                                    VersionStorageInfo* vstorage,
+                                    int* start_level, int* output_level,
+                                    CompactionInputFiles* start_level_inputs);
+
+  bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+                             CompactionInputFiles* start_level_inputs,
+                             int output_level, int* parent_index);
+
+  // Register this compaction in the set of running compactions
+  void RegisterCompaction(Compaction* c);
+
+  // Remove this compaction from the set of running compactions
+  void UnregisterCompaction(Compaction* c);
+
+  std::set<Compaction*>* level0_compactions_in_progress() {
+    return &level0_compactions_in_progress_;
+  }
+  std::unordered_set<Compaction*>* compactions_in_progress() {
+    return &compactions_in_progress_;
+  }
+
+  const InternalKeyComparator* icmp() const { return icmp_; }
+
+ protected:
+  const ImmutableOptions& ioptions_;
+
+// A helper function to SanitizeCompactionInputFiles() that
+// sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+  virtual Status SanitizeCompactionInputFilesForAllLevels(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif  // ROCKSDB_LITE
+
+  // Keeps track of all compactions that are running on Level0.
+  // Protected by DB mutex
+  std::set<Compaction*> level0_compactions_in_progress_;
+
+  // Keeps track of all compactions that are running.
+  // Protected by DB mutex
+  std::unordered_set<Compaction*> compactions_in_progress_;
+
+  const InternalKeyComparator* const icmp_;
+};
+
+#ifndef ROCKSDB_LITE
+// A dummy compaction that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+  NullCompactionPicker(const ImmutableOptions& ioptions,
+                       const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual ~NullCompactionPicker() {}
+
+  // Always return "nullptr"
+  Compaction* PickCompaction(
+      const std::string& /*cf_name*/,
+      const MutableCFOptions& /*mutable_cf_options*/,
+      const MutableDBOptions& /*mutable_db_options*/,
+      VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+      SequenceNumber /* earliest_memtable_seqno */) override {
+    return nullptr;
+  }
+
+  // Always return "nullptr"
+  Compaction* CompactRange(const std::string& /*cf_name*/,
+                           const MutableCFOptions& /*mutable_cf_options*/,
+                           const MutableDBOptions& /*mutable_db_options*/,
+                           VersionStorageInfo* /*vstorage*/,
+                           int /*input_level*/, int /*output_level*/,
+                           const CompactRangeOptions& /*compact_range_options*/,
+                           const InternalKey* /*begin*/,
+                           const InternalKey* /*end*/,
+                           InternalKey** /*compaction_end*/,
+                           bool* /*manual_conflict*/,
+                           uint64_t /*max_file_num_to_ignore*/,
+                           const std::string& /*trim_ts*/) override {
+    return nullptr;
+  }
+
+  // Always returns false.
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* /*vstorage*/) const override {
+    return false;
+  }
+};
+#endif  // !ROCKSDB_LITE
+
+// Attempts to find an intra L0 compaction conforming to the given parameters.
+//
+// @param level_files                     Metadata for L0 files.
+// @param min_files_to_compact            Minimum number of files required to
+//                                        do the compaction.
+// @param max_compact_bytes_per_del_file  Maximum average size in bytes per
+//                                        file that is going to get deleted by
+//                                        the compaction.
+// @param max_compaction_bytes            Maximum total size in bytes (in terms
+//                                        of compensated file size) for files
+//                                        to be compacted.
+// @param [out] comp_inputs               If a compaction was found, will be
+//                                        initialized with corresponding input
+//                                        files. Cannot be nullptr.
+//
+// @return                                true iff compaction was found.
+bool FindIntraL0Compaction(
+    const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+    uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+    CompactionInputFiles* comp_inputs,
+    SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
+
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level, int base_level,
+                                   const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(
+    const MutableCFOptions& mutable_cf_options,
+    const VersionStorageInfo* vstorage, int level,
+    const bool enable_compression = true);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..1f875e3e1
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,433 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+  uint64_t total_size = 0;
+  for (const auto& f : files) {
+    total_size += f->fd.file_size;
+  }
+  return total_size;
+}
+}  // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  assert(mutable_cf_options.ttl > 0);
+
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+  uint64_t total_size = GetTotalFilesSize(level_files);
+
+  int64_t _current_time;
+  auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+  if (!status.ok()) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: Couldn't get current time: %s. "
+                     "Not doing compactions based on TTL. ",
+                     cf_name.c_str(), status.ToString().c_str());
+    return nullptr;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+  if (!level0_compactions_in_progress_.empty()) {
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Already executing compaction. No need "
+        "to run parallel compactions since compactions are very fast",
+        cf_name.c_str());
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = 0;
+
+  // avoid underflow
+  if (current_time > mutable_cf_options.ttl) {
+    for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+      FileMetaData* f = *ritr;
+      assert(f);
+      if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+        uint64_t creation_time =
+            f->fd.table_reader->GetTableProperties()->creation_time;
+        if (creation_time == 0 ||
+            creation_time >= (current_time - mutable_cf_options.ttl)) {
+          break;
+        }
+      }
+      total_size -= f->fd.file_size;
+      inputs[0].files.push_back(f);
+    }
+  }
+
+  // Return a nullptr and proceed to size-based FIFO compaction if:
+  // 1. there are no files older than ttl OR
+  // 2. there are a few files older than ttl, but deleting them will not bring
+  //    the total size to be less than max_table_files_size threshold.
+  if (inputs[0].files.empty() ||
+      total_size >
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+    return nullptr;
+  }
+
+  for (const auto& f : inputs[0].files) {
+    uint64_t creation_time = 0;
+    assert(f);
+    if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+      creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
+    }
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: picking file %" PRIu64
+                     " with creation time %" PRIu64 " for deletion",
+                     cf_name.c_str(), f->fd.GetNumber(), creation_time);
+  }
+
+  Compaction* c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+      std::move(inputs), 0, 0, 0, 0, kNoCompression,
+      mutable_cf_options.compression_opts, Temperature::kUnknown,
+      /* max_subcompactions */ 0, {}, /* is manual */ false,
+      /* trim_ts */ "", vstorage->CompactionScore(0),
+      /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
+      CompactionReason::kFIFOTtl);
+  return c;
+}
+
+// The size-based compaction picker for FIFO.
+//
+// When the entire column family size exceeds max_table_files_size, FIFO will
+// try to delete the oldest sst file(s) until the resulting column family size
+// is smaller than max_table_files_size.
+//
+// This function also takes care the case where a DB is migrating from level /
+// universal compaction to FIFO compaction.  During the migration, the column
+// family will also have non-L0 files while FIFO can only create L0 files.
+// In this case, this function will first purge the sst files in the bottom-
+// most non-empty level first, and the DB will eventually converge to the
+// regular FIFO case where there're only L0 files.  Note that during the
+// migration case, the purge order will only be an approximation of "FIFO"
+// as entries inside lower-level files might sometimes be newer than some
+// entries inside upper-level files.
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  // compute the total size and identify the last non-empty level
+  int last_level = 0;
+  uint64_t total_size = 0;
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    auto level_size = GetTotalFilesSize(vstorage->LevelFiles(level));
+    total_size += level_size;
+    if (level_size > 0) {
+      last_level = level;
+    }
+  }
+  const std::vector<FileMetaData*>& last_level_files =
+      vstorage->LevelFiles(last_level);
+
+  if (last_level == 0 &&
+      total_size <=
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+    // total size not exceeded, try to find intra level 0 compaction if enabled
+    const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
+    if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+        level0_files.size() > 0) {
+      CompactionInputFiles comp_inputs;
+      // try to prevent same files from being compacted multiple times, which
+      // could produce large files that may never TTL-expire. Achieve this by
+      // disallowing compactions with files larger than memtable (inflate its
+      // size by 10% to account for uncompressed L0 files that may have size
+      // slightly greater than memtable size limit).
+      size_t max_compact_bytes_per_del_file =
+          static_cast<size_t>(MultiplyCheckOverflow(
+              static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
+              1.1));
+      if (FindIntraL0Compaction(
+              level0_files,
+              mutable_cf_options
+                  .level0_file_num_compaction_trigger /* min_files_to_compact */
+              ,
+              max_compact_bytes_per_del_file,
+              mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+        Compaction* c = new Compaction(
+            vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+            {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
+            0 /* max compaction bytes, not applicable */,
+            0 /* output path ID */, mutable_cf_options.compression,
+            mutable_cf_options.compression_opts, Temperature::kUnknown,
+            0 /* max_subcompactions */, {}, /* is manual */ false,
+            /* trim_ts */ "", vstorage->CompactionScore(0),
+            /* is deletion compaction */ false,
+            /* l0_files_might_overlap */ true,
+            CompactionReason::kFIFOReduceNumFiles);
+        return c;
+      }
+    }
+
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+        ", max size %" PRIu64 "\n",
+        cf_name.c_str(), total_size,
+        mutable_cf_options.compaction_options_fifo.max_table_files_size);
+    return nullptr;
+  }
+
+  if (!level0_compactions_in_progress_.empty()) {
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Already executing compaction. No need "
+        "to run parallel compactions since compactions are very fast",
+        cf_name.c_str());
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = last_level;
+
+  if (last_level == 0) {
+    // In L0, right-most files are the oldest files.
+    for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend();
+         ++ritr) {
+      auto f = *ritr;
+      total_size -= f->fd.file_size;
+      inputs[0].files.push_back(f);
+      char tmp_fsize[16];
+      AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO compaction: picking file %" PRIu64
+                       " with size %s for deletion",
+                       cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+      if (total_size <=
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+        break;
+      }
+    }
+  } else {
+    // If the last level is non-L0, we actually don't know which file is
+    // logically the oldest since the file creation time only represents
+    // when this file was compacted to this level, which is independent
+    // to when the entries in this file were first inserted.
+    //
+    // As a result, we delete files from the left instead.  This means the sst
+    // file with the smallest key will be deleted first.  This design decision
+    // better serves a major type of FIFO use cases where smaller keys are
+    // associated with older data.
+    for (const auto& f : last_level_files) {
+      total_size -= f->fd.file_size;
+      inputs[0].files.push_back(f);
+      char tmp_fsize[16];
+      AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO compaction: picking file %" PRIu64
+                       " with size %s for deletion",
+                       cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+      if (total_size <=
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+        break;
+      }
+    }
+  }
+
+  Compaction* c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+      std::move(inputs), last_level,
+      /* target_file_size */ 0,
+      /* max_compaction_bytes */ 0,
+      /* output_path_id */ 0, kNoCompression,
+      mutable_cf_options.compression_opts, Temperature::kUnknown,
+      /* max_subcompactions */ 0, {}, /* is manual */ false,
+      /* trim_ts */ "", vstorage->CompactionScore(0),
+      /* is deletion compaction */ true,
+      /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
+  return c;
+}
+
+Compaction* FIFOCompactionPicker::PickCompactionToWarm(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) {
+    return nullptr;
+  }
+
+  // PickCompactionToWarm is only triggered if there is no non-L0 files.
+  for (int level = 1; level < vstorage->num_levels(); ++level) {
+    if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) {
+      return nullptr;
+    }
+  }
+
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+
+  int64_t _current_time;
+  auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+  if (!status.ok()) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: Couldn't get current time: %s. "
+                     "Not doing compactions based on warm threshold. ",
+                     cf_name.c_str(), status.ToString().c_str());
+    return nullptr;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+  if (!level0_compactions_in_progress_.empty()) {
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Already executing compaction. Parallel "
+        "compactions are not supported",
+        cf_name.c_str());
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = 0;
+
+  // avoid underflow
+  if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) {
+    uint64_t create_time_threshold =
+        current_time - mutable_cf_options.compaction_options_fifo.age_for_warm;
+    uint64_t compaction_size = 0;
+    // We will ideally identify a file qualifying for warm tier by knowing
+    // the timestamp for the youngest entry in the file. However, right now
+    // we don't have the information. We infer it by looking at timestamp
+    // of the next file's (which is just younger) oldest entry's timestamp.
+    FileMetaData* prev_file = nullptr;
+    for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+      FileMetaData* f = *ritr;
+      assert(f);
+      if (f->being_compacted) {
+        // Right now this probably won't happen as we never try to schedule
+        // two compactions in parallel, so here we just simply don't schedule
+        // anything.
+        return nullptr;
+      }
+      uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+      if (oldest_ancester_time == kUnknownOldestAncesterTime) {
+        // Older files might not have enough information. It is possible to
+        // handle these files by looking at newer files, but maintaining the
+        // logic isn't worth it.
+        break;
+      }
+      if (oldest_ancester_time > create_time_threshold) {
+        // The previous file (which has slightly older data) doesn't qualify
+        // for warm tier.
+        break;
+      }
+      if (prev_file != nullptr) {
+        compaction_size += prev_file->fd.GetFileSize();
+        if (compaction_size > mutable_cf_options.max_compaction_bytes) {
+          break;
+        }
+        inputs[0].files.push_back(prev_file);
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "[%s] FIFO compaction: picking file %" PRIu64
+                         " with next file's oldest time %" PRIu64 " for warm",
+                         cf_name.c_str(), prev_file->fd.GetNumber(),
+                         oldest_ancester_time);
+      }
+      if (f->temperature == Temperature::kUnknown ||
+          f->temperature == Temperature::kHot) {
+        prev_file = f;
+      } else if (!inputs[0].files.empty()) {
+        // A warm file newer than files picked.
+        break;
+      } else {
+        assert(prev_file == nullptr);
+      }
+    }
+  }
+
+  if (inputs[0].files.empty()) {
+    return nullptr;
+  }
+
+  Compaction* c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+      std::move(inputs), 0, 0 /* output file size limit */,
+      0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
+      mutable_cf_options.compression, mutable_cf_options.compression_opts,
+      Temperature::kWarm,
+      /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
+      vstorage->CompactionScore(0),
+      /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
+      CompactionReason::kChangeTemperature);
+  return c;
+}
+
+Compaction* FIFOCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) {
+  Compaction* c = nullptr;
+  if (mutable_cf_options.ttl > 0) {
+    c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options,
+                          vstorage, log_buffer);
+  }
+  if (c == nullptr) {
+    c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options,
+                           vstorage, log_buffer);
+  }
+  if (c == nullptr) {
+    c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options,
+                             vstorage, log_buffer);
+  }
+  RegisterCompaction(c);
+  return c;
+}
+
+Compaction* FIFOCompactionPicker::CompactRange(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    int input_level, int output_level,
+    const CompactRangeOptions& /*compact_range_options*/,
+    const InternalKey* /*begin*/, const InternalKey* /*end*/,
+    InternalKey** compaction_end, bool* /*manual_conflict*/,
+    uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) {
+#ifdef NDEBUG
+  (void)input_level;
+  (void)output_level;
+#endif
+  assert(input_level == 0);
+  assert(output_level == 0);
+  *compaction_end = nullptr;
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
+  Compaction* c = PickCompaction(cf_name, mutable_cf_options,
+                                 mutable_db_options, vstorage, &log_buffer);
+  log_buffer.FlushBufferToLog();
+  return c;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h
new file mode 100644
index 000000000..544259f38
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h
@@ -0,0 +1,63 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+  FIFOCompactionPicker(const ImmutableOptions& ioptions,
+                       const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+
+  virtual Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* version,
+      LogBuffer* log_buffer,
+      SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      int input_level, int output_level,
+      const CompactRangeOptions& compact_range_options,
+      const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end, bool* manual_conflict,
+      uint64_t max_file_num_to_ignore, const std::string& trim_ts) override;
+
+  // The maximum allowed output level.  Always returns 0.
+  virtual int MaxOutputLevel() const override { return 0; }
+
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* vstorage) const override;
+
+ private:
+  Compaction* PickTTLCompaction(const std::string& cf_name,
+                                const MutableCFOptions& mutable_cf_options,
+                                const MutableDBOptions& mutable_db_options,
+                                VersionStorageInfo* version,
+                                LogBuffer* log_buffer);
+
+  Compaction* PickSizeCompaction(const std::string& cf_name,
+                                 const MutableCFOptions& mutable_cf_options,
+                                 const MutableDBOptions& mutable_db_options,
+                                 VersionStorageInfo* version,
+                                 LogBuffer* log_buffer);
+
+  Compaction* PickCompactionToWarm(const std::string& cf_name,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   const MutableDBOptions& mutable_db_options,
+                                   VersionStorageInfo* version,
+                                   LogBuffer* log_buffer);
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc
new file mode 100644
index 000000000..b689b6add
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.cc
@@ -0,0 +1,841 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_level.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  if (!vstorage->ExpiredTtlFiles().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
+    return true;
+  }
+  for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+    if (vstorage->CompactionScore(i) >= 1) {
+      return true;
+    }
+  }
+  return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+  LevelCompactionBuilder(const std::string& cf_name,
+                         VersionStorageInfo* vstorage,
+                         SequenceNumber earliest_mem_seqno,
+                         CompactionPicker* compaction_picker,
+                         LogBuffer* log_buffer,
+                         const MutableCFOptions& mutable_cf_options,
+                         const ImmutableOptions& ioptions,
+                         const MutableDBOptions& mutable_db_options)
+      : cf_name_(cf_name),
+        vstorage_(vstorage),
+        earliest_mem_seqno_(earliest_mem_seqno),
+        compaction_picker_(compaction_picker),
+        log_buffer_(log_buffer),
+        mutable_cf_options_(mutable_cf_options),
+        ioptions_(ioptions),
+        mutable_db_options_(mutable_db_options) {}
+
+  // Pick and return a compaction.
+  Compaction* PickCompaction();
+
+  // Pick the initial files to compact to the next level. (or together
+  // in Intra-L0 compactions)
+  void SetupInitialFiles();
+
+  // If the initial files are from L0 level, pick other L0
+  // files if needed.
+  bool SetupOtherL0FilesIfNeeded();
+
+  // Compaction with round-robin compaction priority allows more files to be
+  // picked to form a large compaction
+  void SetupOtherFilesWithRoundRobinExpansion();
+  // Based on initial files, setup other files need to be compacted
+  // in this compaction, accordingly.
+  bool SetupOtherInputsIfNeeded();
+
+  Compaction* GetCompaction();
+
+  // For the specfied level, pick a file that we want to compact.
+  // Returns false if there is no file to compact.
+  // If it returns true, inputs->files.size() will be exactly one for
+  // all compaction priorities except round-robin. For round-robin,
+  // multiple consecutive files may be put into inputs->files.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return false.
+  bool PickFileToCompact();
+
+  // Return true if a L0 trivial move is picked up.
+  bool TryPickL0TrivialMove();
+
+  // For L0->L0, picks the longest span of files that aren't currently
+  // undergoing compaction for which work-per-deleted-file decreases. The span
+  // always starts from the newest L0 file.
+  //
+  // Intra-L0 compaction is independent of all other files, so it can be
+  // performed even when L0->base_level compactions are blocked.
+  //
+  // Returns true if `inputs` is populated with a span of files to be compacted;
+  // otherwise, returns false.
+  bool PickIntraL0Compaction();
+
+  // Return true if TrivialMove is extended. `start_index` is the index of
+  // the intiial file picked, which should already be in `start_level_inputs_`.
+  bool TryExtendNonL0TrivialMove(int start_index);
+
+  // Picks a file from level_files to compact.
+  // level_files is a vector of (level, file metadata) in ascending order of
+  // level. If compact_to_next_level is true, compact the file to the next
+  // level, otherwise, compact to the same level as the input file.
+  void PickFileToCompact(
+      const autovector<std::pair<int, FileMetaData*>>& level_files,
+      bool compact_to_next_level);
+
+  const std::string& cf_name_;
+  VersionStorageInfo* vstorage_;
+  SequenceNumber earliest_mem_seqno_;
+  CompactionPicker* compaction_picker_;
+  LogBuffer* log_buffer_;
+  int start_level_ = -1;
+  int output_level_ = -1;
+  int parent_index_ = -1;
+  int base_index_ = -1;
+  double start_level_score_ = 0;
+  bool is_manual_ = false;
+  bool is_l0_trivial_move_ = false;
+  CompactionInputFiles start_level_inputs_;
+  std::vector<CompactionInputFiles> compaction_inputs_;
+  CompactionInputFiles output_level_inputs_;
+  std::vector<FileMetaData*> grandparents_;
+  CompactionReason compaction_reason_ = CompactionReason::kUnknown;
+
+  const MutableCFOptions& mutable_cf_options_;
+  const ImmutableOptions& ioptions_;
+  const MutableDBOptions& mutable_db_options_;
+  // Pick a path ID to place a newly generated file, with its level
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            const MutableCFOptions& mutable_cf_options,
+                            int level);
+
+  static const int kMinFilesForIntraL0Compaction = 4;
+};
+
+void LevelCompactionBuilder::PickFileToCompact(
+    const autovector<std::pair<int, FileMetaData*>>& level_files,
+    bool compact_to_next_level) {
+  for (auto& level_file : level_files) {
+    // If it's being compacted it has nothing to do here.
+    // If this assert() fails that means that some function marked some
+    // files as being_compacted, but didn't call ComputeCompactionScore()
+    assert(!level_file.second->being_compacted);
+    start_level_ = level_file.first;
+    if ((compact_to_next_level &&
+         start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+        (start_level_ == 0 &&
+         !compaction_picker_->level0_compactions_in_progress()->empty())) {
+      continue;
+    }
+    if (compact_to_next_level) {
+      output_level_ =
+          (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+    } else {
+      output_level_ = start_level_;
+    }
+    start_level_inputs_.files = {level_file.second};
+    start_level_inputs_.level = start_level_;
+    if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                   &start_level_inputs_)) {
+      return;
+    }
+  }
+  start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+  // Find the compactions by size on all levels.
+  bool skipped_l0_to_base = false;
+  for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+    start_level_score_ = vstorage_->CompactionScore(i);
+    start_level_ = vstorage_->CompactionScoreLevel(i);
+    assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+    if (start_level_score_ >= 1) {
+      if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+        // If L0->base_level compaction is pending, don't schedule further
+        // compaction from base level. Otherwise L0->base_level compaction
+        // may starve.
+        continue;
+      }
+      output_level_ =
+          (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+      if (PickFileToCompact()) {
+        // found the compaction!
+        if (start_level_ == 0) {
+          // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
+          compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+        } else {
+          // L1+ score = `Level files size` / `MaxBytesForLevel`
+          compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
+        }
+        break;
+      } else {
+        // didn't find the compaction, clear the inputs
+        start_level_inputs_.clear();
+        if (start_level_ == 0) {
+          skipped_l0_to_base = true;
+          // L0->base_level may be blocked due to ongoing L0->base_level
+          // compactions. It may also be blocked by an ongoing compaction from
+          // base_level downwards.
+          //
+          // In these cases, to reduce L0 file count and thus reduce likelihood
+          // of write stalls, we can attempt compacting a span of files within
+          // L0.
+          if (PickIntraL0Compaction()) {
+            output_level_ = 0;
+            compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+            break;
+          }
+        }
+      }
+    } else {
+      // Compaction scores are sorted in descending order, no further scores
+      // will be >= 1.
+      break;
+    }
+  }
+  if (!start_level_inputs_.empty()) {
+    return;
+  }
+
+  // if we didn't find a compaction, check if there are any files marked for
+  // compaction
+  parent_index_ = base_index_ = -1;
+
+  compaction_picker_->PickFilesMarkedForCompaction(
+      cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
+    return;
+  }
+
+  // Bottommost Files Compaction on deleting tombstones
+  PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kBottommostFiles;
+    return;
+  }
+
+  // TTL Compaction
+  if (ioptions_.compaction_pri == kRoundRobin &&
+      !vstorage_->ExpiredTtlFiles().empty()) {
+    auto expired_files = vstorage_->ExpiredTtlFiles();
+    // the expired files list should already be sorted by level
+    start_level_ = expired_files.front().first;
+#ifndef NDEBUG
+    for (const auto& file : expired_files) {
+      assert(start_level_ <= file.first);
+    }
+#endif
+    if (start_level_ > 0) {
+      output_level_ = start_level_ + 1;
+      if (PickFileToCompact()) {
+        compaction_reason_ = CompactionReason::kRoundRobinTtl;
+        return;
+      }
+    }
+  }
+
+  PickFileToCompact(vstorage_->ExpiredTtlFiles(), true);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kTtl;
+    return;
+  }
+
+  // Periodic Compaction
+  PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kPeriodicCompaction;
+    return;
+  }
+
+  // Forced blob garbage collection
+  PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false);
+  if (!start_level_inputs_.empty()) {
+    compaction_reason_ = CompactionReason::kForcedBlobGC;
+    return;
+  }
+}
+
+bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
+  if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) {
+    return compaction_picker_->GetOverlappingL0Files(
+        vstorage_, &start_level_inputs_, output_level_, &parent_index_);
+  }
+  return true;
+}
+
+void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
+  // We only expand when the start level is not L0 under round robin
+  assert(start_level_ >= 1);
+
+  // For round-robin compaction priority, we have 3 constraints when picking
+  // multiple files.
+  // Constraint 1: We can only pick consecutive files
+  //  -> Constraint 1a: When a file is being compacted (or some input files
+  //                    are being compacted after expanding, we cannot
+  //                    choose it and have to stop choosing more files
+  //  -> Constraint 1b: When we reach the last file (with largest keys), we
+  //                    cannot choose more files (the next file will be the
+  //                    first one)
+  // Constraint 2: We should ensure the total compaction bytes (including the
+  //               overlapped files from the next level) is no more than
+  //               mutable_cf_options_.max_compaction_bytes
+  // Constraint 3: We try our best to pick as many files as possible so that
+  //               the post-compaction level size is less than
+  //               MaxBytesForLevel(start_level_)
+  // Constraint 4: We do not expand if it is possible to apply a trivial move
+  // Constraint 5 (TODO): Try to pick minimal files to split into the target
+  //               number of subcompactions
+  TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin");
+
+  // Only expand the inputs when we have selected a file in start_level_inputs_
+  if (start_level_inputs_.size() == 0) return;
+
+  uint64_t start_lvl_bytes_no_compacting = 0;
+  uint64_t curr_bytes_to_compact = 0;
+  uint64_t start_lvl_max_bytes_to_compact = 0;
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(start_level_);
+  // Constraint 3 (pre-calculate the ideal max bytes to compact)
+  for (auto f : level_files) {
+    if (!f->being_compacted) {
+      start_lvl_bytes_no_compacting += f->fd.GetFileSize();
+    }
+  }
+  if (start_lvl_bytes_no_compacting >
+      vstorage_->MaxBytesForLevel(start_level_)) {
+    start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting -
+                                     vstorage_->MaxBytesForLevel(start_level_);
+  }
+
+  size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0];
+  InternalKey smallest, largest;
+  // Constraint 4 (No need to check again later)
+  compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+  CompactionInputFiles output_level_inputs;
+  output_level_inputs.level = output_level_;
+  vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                  &output_level_inputs.files);
+  if (output_level_inputs.empty()) {
+    if (TryExtendNonL0TrivialMove((int)start_index)) {
+      return;
+    }
+  }
+  // Constraint 3
+  if (start_level_inputs_[0]->fd.GetFileSize() >=
+      start_lvl_max_bytes_to_compact) {
+    return;
+  }
+  CompactionInputFiles tmp_start_level_inputs;
+  tmp_start_level_inputs = start_level_inputs_;
+  // TODO (zichen): Future parallel round-robin may also need to update this
+  // Constraint 1b (only expand till the end)
+  for (size_t i = start_index + 1; i < level_files.size(); i++) {
+    auto* f = level_files[i];
+    if (f->being_compacted) {
+      // Constraint 1a
+      return;
+    }
+
+    tmp_start_level_inputs.files.push_back(f);
+    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &tmp_start_level_inputs) ||
+        compaction_picker_->FilesRangeOverlapWithCompaction(
+            {tmp_start_level_inputs}, output_level_,
+            Compaction::EvaluatePenultimateLevel(
+                vstorage_, ioptions_, start_level_, output_level_))) {
+      // Constraint 1a
+      tmp_start_level_inputs.clear();
+      return;
+    }
+
+    curr_bytes_to_compact = 0;
+    for (auto start_lvl_f : tmp_start_level_inputs.files) {
+      curr_bytes_to_compact += start_lvl_f->fd.GetFileSize();
+    }
+
+    // Check whether any output level files are locked
+    compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest);
+    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                    &output_level_inputs.files);
+    if (!output_level_inputs.empty() &&
+        !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &output_level_inputs)) {
+      // Constraint 1a
+      tmp_start_level_inputs.clear();
+      return;
+    }
+
+    uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact;
+    for (auto output_lvl_f : output_level_inputs.files) {
+      curr_bytes_to_compact += output_lvl_f->fd.GetFileSize();
+    }
+    if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) {
+      // Constraint 2
+      tmp_start_level_inputs.clear();
+      return;
+    }
+
+    start_level_inputs_.files = tmp_start_level_inputs.files;
+    // Constraint 3
+    if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) {
+      return;
+    }
+  }
+}
+
+bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
+  // Setup input files from output level. For output to L0, we only compact
+  // spans of files that do not interact with any pending compactions, so don't
+  // need to consider other levels.
+  if (output_level_ != 0) {
+    output_level_inputs_.level = output_level_;
+    bool round_robin_expanding =
+        ioptions_.compaction_pri == kRoundRobin &&
+        compaction_reason_ == CompactionReason::kLevelMaxLevelSize;
+    if (round_robin_expanding) {
+      SetupOtherFilesWithRoundRobinExpansion();
+    }
+    if (!is_l0_trivial_move_ &&
+        !compaction_picker_->SetupOtherInputs(
+            cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
+            &output_level_inputs_, &parent_index_, base_index_,
+            round_robin_expanding)) {
+      return false;
+    }
+
+    compaction_inputs_.push_back(start_level_inputs_);
+    if (!output_level_inputs_.empty()) {
+      compaction_inputs_.push_back(output_level_inputs_);
+    }
+
+    if (!is_l0_trivial_move_) {
+      // In some edge cases we could pick a compaction that will be compacting
+      // a key range that overlap with another running compaction, and both
+      // of them have the same output level. This could happen if
+      // (1) we are running a non-exclusive manual compaction
+      // (2) AddFile ingest a new file into the LSM tree
+      // We need to disallow this from happening.
+      if (compaction_picker_->FilesRangeOverlapWithCompaction(
+              compaction_inputs_, output_level_,
+              Compaction::EvaluatePenultimateLevel(
+                  vstorage_, ioptions_, start_level_, output_level_))) {
+        // This compaction output could potentially conflict with the output
+        // of a currently running compaction, we cannot run it.
+        return false;
+      }
+      compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
+                                          output_level_inputs_, &grandparents_);
+    }
+  } else {
+    compaction_inputs_.push_back(start_level_inputs_);
+  }
+  return true;
+}
+
+Compaction* LevelCompactionBuilder::PickCompaction() {
+  // Pick up the first file to start compaction. It may have been extended
+  // to a clean cut.
+  SetupInitialFiles();
+  if (start_level_inputs_.empty()) {
+    return nullptr;
+  }
+  assert(start_level_ >= 0 && output_level_ >= 0);
+
+  // If it is a L0 -> base level compaction, we need to set up other L0
+  // files if needed.
+  if (!SetupOtherL0FilesIfNeeded()) {
+    return nullptr;
+  }
+
+  // Pick files in the output level and expand more files in the start level
+  // if needed.
+  if (!SetupOtherInputsIfNeeded()) {
+    return nullptr;
+  }
+
+  // Form a compaction object containing the files we picked.
+  Compaction* c = GetCompaction();
+
+  TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
+
+  return c;
+}
+
+Compaction* LevelCompactionBuilder::GetCompaction() {
+  auto c = new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(compaction_inputs_), output_level_,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level_,
+                          ioptions_.compaction_style, vstorage_->base_level(),
+                          ioptions_.level_compaction_dynamic_level_bytes),
+      mutable_cf_options_.max_compaction_bytes,
+      GetPathId(ioptions_, mutable_cf_options_, output_level_),
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
+                         vstorage_->base_level()),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
+      /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_,
+      compaction_reason_);
+
+  // If it's level 0 compaction, make sure we don't execute any other level 0
+  // compactions in parallel
+  compaction_picker_->RegisterCompaction(c);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed compaction score, we recalculate it
+  // here
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  return c;
+}
+
+/*
+ * Find the optimal path to place a file
+ * Given a level, finds the path where levels up to it will fit in levels
+ * up to and including this path
+ */
+uint32_t LevelCompactionBuilder::GetPathId(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, int level) {
+  uint32_t p = 0;
+  assert(!ioptions.cf_paths.empty());
+
+  // size remaining in the most recent path
+  uint64_t current_path_size = ioptions.cf_paths[0].target_size;
+
+  uint64_t level_size;
+  int cur_level = 0;
+
+  // max_bytes_for_level_base denotes L1 size.
+  // We estimate L0 size to be the same as L1.
+  level_size = mutable_cf_options.max_bytes_for_level_base;
+
+  // Last path is the fallback
+  while (p < ioptions.cf_paths.size() - 1) {
+    if (level_size <= current_path_size) {
+      if (cur_level == level) {
+        // Does desired level fit in this path?
+        return p;
+      } else {
+        current_path_size -= level_size;
+        if (cur_level > 0) {
+          if (ioptions.level_compaction_dynamic_level_bytes) {
+            // Currently, level_compaction_dynamic_level_bytes is ignored when
+            // multiple db paths are specified. https://github.com/facebook/
+            // rocksdb/blob/main/db/column_family.cc.
+            // Still, adding this check to avoid accidentally using
+            // max_bytes_for_level_multiplier_additional
+            level_size = static_cast<uint64_t>(
+                level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+          } else {
+            level_size = static_cast<uint64_t>(
+                level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+                mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+          }
+        }
+        cur_level++;
+        continue;
+      }
+    }
+    p++;
+    current_path_size = ioptions.cf_paths[p].target_size;
+  }
+  return p;
+}
+
+bool LevelCompactionBuilder::TryPickL0TrivialMove() {
+  if (vstorage_->base_level() <= 0) {
+    return false;
+  }
+  if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
+      !vstorage_->LevelFiles(output_level_).empty() &&
+      ioptions_.db_paths.size() <= 1) {
+    // Try to pick trivial move from L0 to L1. We start from the oldest
+    // file. We keep expanding to newer files if it would form a
+    // trivial move.
+    // For now we don't support it with
+    // mutable_cf_options_.compression_per_level to prevent the logic
+    // of determining whether L0 can be trivial moved to the next level.
+    // We skip the case where output level is empty, since in this case, at
+    // least the oldest file would qualify for trivial move, and this would
+    // be a surprising behavior with few benefits.
+
+    // We search from the oldest file from the newest. In theory, there are
+    // files in the middle can form trivial move too, but it is probably
+    // uncommon and we ignore these cases for simplicity.
+    const std::vector<FileMetaData*>& level_files =
+        vstorage_->LevelFiles(start_level_);
+
+    InternalKey my_smallest, my_largest;
+    for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
+      CompactionInputFiles output_level_inputs;
+      output_level_inputs.level = output_level_;
+      FileMetaData* file = *it;
+      if (it == level_files.rbegin()) {
+        my_smallest = file->smallest;
+        my_largest = file->largest;
+      } else {
+        if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
+            0) {
+          my_smallest = file->smallest;
+        } else if (compaction_picker_->icmp()->Compare(file->smallest,
+                                                       my_largest) > 0) {
+          my_largest = file->largest;
+        } else {
+          break;
+        }
+      }
+      vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
+                                      &output_level_inputs.files);
+      if (output_level_inputs.empty()) {
+        assert(!file->being_compacted);
+        start_level_inputs_.files.push_back(file);
+      } else {
+        break;
+      }
+    }
+  }
+
+  if (!start_level_inputs_.empty()) {
+    // Sort files by key range. Not sure it's 100% necessary but it's cleaner
+    // to always keep files sorted by key the key ranges don't overlap.
+    std::sort(start_level_inputs_.files.begin(),
+              start_level_inputs_.files.end(),
+              [icmp = compaction_picker_->icmp()](FileMetaData* f1,
+                                                  FileMetaData* f2) -> bool {
+                return (icmp->Compare(f1->smallest, f2->smallest) < 0);
+              });
+
+    is_l0_trivial_move_ = true;
+    return true;
+  }
+  return false;
+}
+
+bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
+  if (start_level_inputs_.size() == 1 &&
+      (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
+      (mutable_cf_options_.compression_per_level.empty())) {
+    // Only file of `index`, and it is likely a trivial move. Try to
+    // expand if it is still a trivial move, but not beyond
+    // max_compaction_bytes or 4 files, so that we don't create too
+    // much compaction pressure for the next level.
+    // Ignore if there are more than one DB path, as it would be hard
+    // to predict whether it is a trivial move.
+    const std::vector<FileMetaData*>& level_files =
+        vstorage_->LevelFiles(start_level_);
+    const size_t kMaxMultiTrivialMove = 4;
+    FileMetaData* initial_file = start_level_inputs_.files[0];
+    size_t total_size = initial_file->fd.GetFileSize();
+    CompactionInputFiles output_level_inputs;
+    output_level_inputs.level = output_level_;
+    for (int i = start_index + 1;
+         i < static_cast<int>(level_files.size()) &&
+         start_level_inputs_.size() < kMaxMultiTrivialMove;
+         i++) {
+      FileMetaData* next_file = level_files[i];
+      if (next_file->being_compacted) {
+        break;
+      }
+      vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
+                                      &(next_file->largest),
+                                      &output_level_inputs.files);
+      if (!output_level_inputs.empty()) {
+        break;
+      }
+      if (i < static_cast<int>(level_files.size()) - 1 &&
+          compaction_picker_->icmp()
+                  ->user_comparator()
+                  ->CompareWithoutTimestamp(
+                      next_file->largest.user_key(),
+                      level_files[i + 1]->smallest.user_key()) == 0) {
+        TEST_SYNC_POINT_CALLBACK(
+            "LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
+            nullptr);
+        // Not a clean up after adding the next file. Skip.
+        break;
+      }
+      total_size += next_file->fd.GetFileSize();
+      if (total_size > mutable_cf_options_.max_compaction_bytes) {
+        break;
+      }
+      start_level_inputs_.files.push_back(next_file);
+    }
+    return start_level_inputs_.size() > 1;
+  }
+  return false;
+}
+
+bool LevelCompactionBuilder::PickFileToCompact() {
+  // level 0 files are overlapping. So we cannot pick more
+  // than one concurrent compactions at this level. This
+  // could be made better by looking at key-ranges that are
+  // being compacted at level 0.
+  if (start_level_ == 0 &&
+      !compaction_picker_->level0_compactions_in_progress()->empty()) {
+    TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+    return false;
+  }
+
+  start_level_inputs_.clear();
+  start_level_inputs_.level = start_level_;
+
+  assert(start_level_ >= 0);
+
+  if (TryPickL0TrivialMove()) {
+    return true;
+  }
+
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(start_level_);
+
+  // Pick the file with the highest score in this level that is not already
+  // being compacted.
+  const std::vector<int>& file_scores =
+      vstorage_->FilesByCompactionPri(start_level_);
+
+  unsigned int cmp_idx;
+  for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+       cmp_idx < file_scores.size(); cmp_idx++) {
+    int index = file_scores[cmp_idx];
+    auto* f = level_files[index];
+
+    // do not pick a file to compact if it is being compacted
+    // from n-1 level.
+    if (f->being_compacted) {
+      if (ioptions_.compaction_pri == kRoundRobin) {
+        // TODO(zichen): this file may be involved in one compaction from
+        // an upper level, cannot advance the cursor for round-robin policy.
+        // Currently, we do not pick any file to compact in this case. We
+        // should fix this later to ensure a compaction is picked but the
+        // cursor shall not be advanced.
+        return false;
+      }
+      continue;
+    }
+
+    start_level_inputs_.files.push_back(f);
+    if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                    &start_level_inputs_) ||
+        compaction_picker_->FilesRangeOverlapWithCompaction(
+            {start_level_inputs_}, output_level_,
+            Compaction::EvaluatePenultimateLevel(
+                vstorage_, ioptions_, start_level_, output_level_))) {
+      // A locked (pending compaction) input-level file was pulled in due to
+      // user-key overlap.
+      start_level_inputs_.clear();
+
+      if (ioptions_.compaction_pri == kRoundRobin) {
+        return false;
+      }
+      continue;
+    }
+
+    // Now that input level is fully expanded, we check whether any output
+    // files are locked due to pending compaction.
+    //
+    // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+    // level files are locked, not just the extra ones pulled in for user-key
+    // overlap.
+    InternalKey smallest, largest;
+    compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+    CompactionInputFiles output_level_inputs;
+    output_level_inputs.level = output_level_;
+    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+                                    &output_level_inputs.files);
+    if (output_level_inputs.empty()) {
+      if (TryExtendNonL0TrivialMove(index)) {
+        break;
+      }
+    } else {
+      if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                                      &output_level_inputs)) {
+        start_level_inputs_.clear();
+        if (ioptions_.compaction_pri == kRoundRobin) {
+          return false;
+        }
+        continue;
+      }
+    }
+
+    base_index_ = index;
+    break;
+  }
+
+  // store where to start the iteration in the next call to PickCompaction
+  if (ioptions_.compaction_pri != kRoundRobin) {
+    vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+  }
+  return start_level_inputs_.size() > 0;
+}
+
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+  start_level_inputs_.clear();
+  const std::vector<FileMetaData*>& level_files =
+      vstorage_->LevelFiles(0 /* level */);
+  if (level_files.size() <
+          static_cast<size_t>(
+              mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+      level_files[0]->being_compacted) {
+    // If L0 isn't accumulating much files beyond the regular trigger, don't
+    // resort to L0->L0 compaction yet.
+    return false;
+  }
+  return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
+                               std::numeric_limits<uint64_t>::max(),
+                               mutable_cf_options_.max_compaction_bytes,
+                               &start_level_inputs_, earliest_mem_seqno_);
+}
+}  // namespace
+
+Compaction* LevelCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) {
+  LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
+                                 log_buffer, mutable_cf_options, ioptions_,
+                                 mutable_db_options);
+  return builder.PickCompaction();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h
new file mode 100644
index 000000000..42a9b60a6
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.h
@@ -0,0 +1,33 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Picking compactions for leveled compaction. See wiki page
+// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+// for description of Leveled compaction.
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+  LevelCompactionPicker(const ImmutableOptions& ioptions,
+                        const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer,
+      SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* vstorage) const override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc
new file mode 100644
index 000000000..2e2e566c0
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_test.cc
@@ -0,0 +1,3964 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/compaction/file_pri.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CountingLogger : public Logger {
+ public:
+  using Logger::Logv;
+  void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+  size_t log_count;
+};
+
+class CompactionPickerTestBase : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  MutableDBOptions mutable_db_options_;
+  LevelCompactionPicker level_compaction_picker;
+  std::string cf_name_;
+  CountingLogger logger_;
+  LogBuffer log_buffer_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::unique_ptr<VersionStorageInfo> vstorage_;
+  std::vector<std::unique_ptr<FileMetaData>> files_;
+  // does not own FileMetaData
+  std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+  // input files to compaction process.
+  std::vector<CompactionInputFiles> input_files_;
+  int compaction_level_start_;
+
+  explicit CompactionPickerTestBase(const Comparator* _ucmp)
+      : ucmp_(_ucmp),
+        icmp_(ucmp_),
+        options_(CreateOptions(ucmp_)),
+        ioptions_(options_),
+        mutable_cf_options_(options_),
+        mutable_db_options_(),
+        level_compaction_picker(ioptions_, &icmp_),
+        cf_name_("dummy"),
+        log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+        file_num_(1),
+        vstorage_(nullptr) {
+    mutable_cf_options_.ttl = 0;
+    mutable_cf_options_.periodic_compaction_seconds = 0;
+    // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of
+    // tests to cover.
+    ioptions_.compaction_pri = kByCompensatedSize;
+    fifo_options_.max_table_files_size = 1;
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    ioptions_.cf_paths.emplace_back("dummy",
+                                    std::numeric_limits<uint64_t>::max());
+  }
+
+  ~CompactionPickerTestBase() override {}
+
+  void NewVersionStorage(int num_levels, CompactionStyle style) {
+    DeleteVersionStorage();
+    options_.num_levels = num_levels;
+    vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
+                                           style, nullptr, false));
+    vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+  }
+
+  // Create a new VersionStorageInfo object so we can add mode files and then
+  // merge it with the existing VersionStorageInfo
+  void AddVersionStorage() {
+    temp_vstorage_.reset(new VersionStorageInfo(
+        &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style,
+        vstorage_.get(), false));
+  }
+
+  void DeleteVersionStorage() {
+    vstorage_.reset();
+    temp_vstorage_.reset();
+    files_.clear();
+    file_map_.clear();
+    input_files_.clear();
+  }
+
+  // REQUIRES: smallest and largest are c-style strings ending with '\0'
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 1, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+           size_t compensated_file_size = 0, bool marked_for_compact = false,
+           Temperature temperature = Temperature::kUnknown,
+           uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime,
+           Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice()) {
+    assert(ts_of_smallest.size() == ucmp_->timestamp_size());
+    assert(ts_of_largest.size() == ucmp_->timestamp_size());
+
+    VersionStorageInfo* vstorage;
+    if (temp_vstorage_) {
+      vstorage = temp_vstorage_.get();
+    } else {
+      vstorage = vstorage_.get();
+    }
+    assert(level < vstorage->num_levels());
+    char* smallest_key_buf = nullptr;
+    char* largest_key_buf = nullptr;
+
+    if (!ts_of_smallest.empty()) {
+      smallest_key_buf = new char[strlen(smallest) + ucmp_->timestamp_size()];
+      memcpy(smallest_key_buf, smallest, strlen(smallest));
+      memcpy(smallest_key_buf + strlen(smallest), ts_of_smallest.data(),
+             ucmp_->timestamp_size());
+      largest_key_buf = new char[strlen(largest) + ucmp_->timestamp_size()];
+      memcpy(largest_key_buf, largest, strlen(largest));
+      memcpy(largest_key_buf + strlen(largest), ts_of_largest.data(),
+             ucmp_->timestamp_size());
+    }
+
+    InternalKey smallest_ikey = InternalKey(
+        smallest_key_buf ? Slice(smallest_key_buf,
+                                 ucmp_->timestamp_size() + strlen(smallest))
+                         : smallest,
+        smallest_seq, kTypeValue);
+    InternalKey largest_ikey = InternalKey(
+        largest_key_buf
+            ? Slice(largest_key_buf, ucmp_->timestamp_size() + strlen(largest))
+            : largest,
+        largest_seq, kTypeValue);
+
+    FileMetaData* f = new FileMetaData(
+        file_number, path_id, file_size, smallest_ikey, largest_ikey,
+        smallest_seq, largest_seq, marked_for_compact, temperature,
+        kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+        kUnknownFileCreationTime, kUnknownFileChecksum,
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    f->compensated_file_size =
+        (compensated_file_size != 0) ? compensated_file_size : file_size;
+    f->oldest_ancester_time = oldest_ancestor_time;
+    vstorage->AddFile(level, f);
+    files_.emplace_back(f);
+    file_map_.insert({file_number, {f, level}});
+
+    delete[] smallest_key_buf;
+    delete[] largest_key_buf;
+  }
+
+  void SetCompactionInputFilesLevels(int level_count, int start_level) {
+    input_files_.resize(level_count);
+    for (int i = 0; i < level_count; ++i) {
+      input_files_[i].level = start_level + i;
+    }
+    compaction_level_start_ = start_level;
+  }
+
+  void AddToCompactionFiles(uint32_t file_number) {
+    auto iter = file_map_.find(file_number);
+    assert(iter != file_map_.end());
+    int level = iter->second.second;
+    assert(level < vstorage_->num_levels());
+    input_files_[level - compaction_level_start_].files.emplace_back(
+        iter->second.first);
+  }
+
+  void UpdateVersionStorageInfo() {
+    if (temp_vstorage_) {
+      VersionBuilder builder(FileOptions(), &ioptions_, nullptr,
+                             vstorage_.get(), nullptr);
+      ASSERT_OK(builder.SaveTo(temp_vstorage_.get()));
+      vstorage_ = std::move(temp_vstorage_);
+    }
+    vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+    vstorage_->SetFinalized();
+  }
+
+ private:
+  Options CreateOptions(const Comparator* ucmp) const {
+    Options opts;
+    opts.comparator = ucmp;
+    return opts;
+  }
+
+  std::unique_ptr<VersionStorageInfo> temp_vstorage_;
+};
+
+class CompactionPickerTest : public CompactionPickerTestBase {
+ public:
+  explicit CompactionPickerTest()
+      : CompactionPickerTestBase(BytewiseComparator()) {}
+
+  ~CompactionPickerTest() override {}
+};
+
+class CompactionPickerU64TsTest : public CompactionPickerTestBase {
+ public:
+  explicit CompactionPickerU64TsTest()
+      : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {}
+
+  ~CompactionPickerU64TsTest() override {}
+};
+
+TEST_F(CompactionPickerTest, Empty) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  UpdateVersionStorageInfo();
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Single) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  Add(0, 1U, "p", "q");
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Level0Trigger) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(1, 66U, "150", "200", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger2) {
+  mutable_cf_options_.target_file_size_base = 10000000000;
+  mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(1, 66U, "150", "200", 1000000001U);
+  Add(1, 88U, "201", "300", 1000000000U);
+  Add(2, 6U, "150", "179", 1000000000U);
+  Add(2, 7U, "180", "220", 1000000000U);
+  Add(2, 8U, "221", "300", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, LevelMaxScore) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.target_file_size_base = 10000000;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+  mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+  Add(0, 1U, "150", "200", 1000000U);
+  // Level 1 score 1.2
+  Add(1, 66U, "150", "200", 6000000U);
+  Add(1, 88U, "201", "300", 6000000U);
+  // Level 2 score 1.8. File 7 is the largest. Should be picked
+  Add(2, 6U, "150", "179", 60000000U);
+  Add(2, 7U, "180", "220", 60000001U);
+  Add(2, 8U, "221", "300", 60000000U);
+  // Level 3 score slightly larger than 1
+  Add(3, 26U, "150", "170", 260000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(mutable_cf_options_.target_file_size_base +
+                mutable_cf_options_.target_file_size_base / 10,
+            compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionLevel) {
+  const int kLevels = 6;
+  const int kFileCount = 20;
+
+  for (int level = 0; level < kLevels - 1; ++level) {
+    NewVersionStorage(kLevels, kCompactionStyleLevel);
+    uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount;
+    for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+      // start a brand new version in each test.
+      NewVersionStorage(kLevels, kCompactionStyleLevel);
+      for (int i = 0; i < file_count; ++i) {
+        Add(level, i, std::to_string((i + 100) * 1000).c_str(),
+            std::to_string((i + 100) * 1000 + 999).c_str(), file_size, 0,
+            i * 100, i * 100 + 99);
+      }
+      UpdateVersionStorageInfo();
+      ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+      ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+                vstorage_->CompactionScore(0) >= 1);
+      // release the version storage
+      DeleteVersionStorage();
+    }
+  }
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(vstorage_->base_level(), num_levels - 2);
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 2, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+  Add(num_levels - 1, 4U, "300", "350", 3000U);
+
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+  Add(num_levels - 1, 3U, "200", "250", 300U);
+  Add(num_levels - 1, 4U, "300", "350", 3000U);
+  Add(num_levels - 3, 5U, "150", "180", 3U);
+  Add(num_levels - 3, 6U, "181", "300", 3U);
+  Add(num_levels - 3, 7U, "400", "450", 3U);
+
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(num_levels - 3, compaction->level(1));
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels()));
+  ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");
+  Add(num_levels - 1, 2U, "200", "250", 300U);
+  Add(num_levels - 1, 3U, "300", "350", 3000U);
+  Add(num_levels - 1, 4U, "400", "450", 3U);
+  Add(num_levels - 2, 5U, "150", "180", 300U);
+  Add(num_levels - 2, 6U, "181", "350", 500U);
+  Add(num_levels - 2, 7U, "400", "450", 200U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(0, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  UpdateVersionStorageInfo();
+  // must return false when there's no files.
+  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+            false);
+
+  // verify the trigger given different number of L0 files.
+  for (int i = 1;
+       i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+    NewVersionStorage(1, kCompactionStyleUniversal);
+    Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+        std::to_string((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+        i * 100 + 99);
+    UpdateVersionStorageInfo();
+    ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+
+TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
+  const uint64_t kFileSize = 100000;
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  ioptions_.allow_ingest_behind = true;
+  ioptions_.num_levels = 3;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  UpdateVersionStorageInfo();
+  // must return false when there's no files.
+  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+            false);
+
+  NewVersionStorage(3, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+  Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // output level should be the one above the bottom-most
+  ASSERT_EQ(1, compaction->output_level());
+}
+// Tests if the files can be trivially moved in multi level
+// universal compaction when allow_trivial_move option is set
+// In this test as the input files overlaps, they cannot
+// be trivially moved.
+
+TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  UpdateVersionStorageInfo();
+  // must return false when there's no files.
+  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+            false);
+
+  NewVersionStorage(3, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+  Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(!compaction->is_trivial_move());
+}
+// Tests if the files can be trivially moved in multi level
+// universal compaction when allow_trivial_move option is set
+// In this test as the input files doesn't overlaps, they should
+// be trivially moved.
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(3, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(1, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(2, 3U, "301", "350", kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(compaction->is_trivial_move());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
+  // The case where universal periodic compaction can be picked
+  // with some newer files being compacted.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[2].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+  // The case where universal periodic compaction does not
+  // pick up only level to compact if it doesn't cover
+  // any file marked as periodic compaction.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[5].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+  // The case where universal periodic compaction does not
+  // pick up only the last sorted run which is an L0 file if it isn't
+  // marked as periodic compaction.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[5].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+  // The case where universal periodic compaction couldn't form
+  // a compaction that includes any file marked for periodic compaction.
+  // Right now we form the compaction anyway if it is more than one
+  // sorted run. Just put the case here to validate that it doesn't
+  // crash.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[2].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(!compaction ||
+              compaction->start_level() != compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
+  // Test single L0 file periodic compaction triggering.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 6U, "150", "200", kFileSize, 0, 500, 550);
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
+  // Test single sorted run non-L0 periodic compaction
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(4, 5U, "150", "200", kFileSize, 0, 500, 550);
+  Add(4, 6U, "350", "400", kFileSize, 0, 500, 550);
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->start_level());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) {
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 555555;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+  Add(3, 5U, "310", "380", kFileSize, 0, 200, 251);
+  Add(3, 6U, "410", "880", kFileSize, 0, 200, 251);
+  Add(3, 7U, "910", "980", 1, 0, 200, 251);
+  Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+  Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+  Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+  Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+  Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+  //  Add(4, 15U, "960", "970", kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(3, compaction->start_level());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+  // ASSERT_EQ(4U, compaction->num_input_files(1));
+  ASSERT_EQ(11U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(12U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(13U, compaction->input(1, 2)->fd.GetNumber());
+  ASSERT_EQ(14U, compaction->input(1, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) {
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 400000;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(1, 2U, "010", "080", kFileSize, 0, 200, 251);
+  Add(2, 5U, "310", "380", kFileSize, 0, 200, 251);
+  Add(2, 6U, "410", "880", kFileSize, 0, 200, 251);
+  Add(2, 7U, "910", "980", kFileSize, 0, 200, 251);
+  Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+  Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+  Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+  Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+  Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(2, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(15U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) {
+  // Test bottom level files falling between gaps between two upper level
+  // files
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 300000;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+  Add(3, 5U, "000", "180", kFileSize, 0, 200, 251);
+  Add(3, 6U, "181", "190", kFileSize, 0, 200, 251);
+  Add(3, 7U, "710", "810", kFileSize, 0, 200, 251);
+  Add(3, 8U, "820", "830", kFileSize, 0, 200, 251);
+  Add(3, 9U, "900", "991", kFileSize, 0, 200, 251);
+  Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+  Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+  Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+  Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+  Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(2, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(0, compaction->num_input_files(2));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) {
+  // Test compaction candidates always cover many files.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 3200000;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+  // Generate files like following:
+  // L3: (1101, 1180) (1201, 1280) ... (7901, 7908)
+  // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+  for (int i = 11; i < 79; i++) {
+    Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+        std::to_string(i * 100 + 80).c_str(), kFileSize, 0, 200, 251);
+    // Add a tie breaker
+    if (i == 66) {
+      Add(3, 10000U, "6690", "6699", kFileSize, 0, 200, 251);
+    }
+
+    Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+        std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+    Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+        std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+  }
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(3, compaction->start_level());
+  ASSERT_EQ(6U, compaction->num_input_files(0));
+  ASSERT_EQ(100 + 62U * 3, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(10000U, compaction->input(0, 5)->fd.GetNumber());
+  ASSERT_EQ(11, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) {
+  // Test compaction candidates always cover many files with some single
+  // files larger than size threshold.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.max_compaction_bytes = 3200000;
+  mutable_cf_options_.compaction_options_universal.incremental = true;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 30;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+  // Generate files like following:
+  // L3: (1101, 1180) (1201, 1280) ... (7901, 7908)
+  // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+  for (int i = 11; i < 70; i++) {
+    Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+        std::to_string(i * 100 + 80).c_str(),
+        i % 10 == 9 ? kFileSize * 100 : kFileSize, 0, 200, 251);
+
+    Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+        std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+    Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+        std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+  }
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(3, compaction->start_level());
+  ASSERT_EQ(6U, compaction->num_input_files(0));
+  ASSERT_EQ(100 + 14 * 3, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(100 + 19 * 3, compaction->input(0, 5)->fd.GetNumber());
+  ASSERT_EQ(13, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const int kFileCount =
+      mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+  UpdateVersionStorageInfo();
+  // must return false when there's no files.
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+  // verify whether compaction is needed based on the current
+  // size of L0 files.
+  for (int i = 1; i <= kFileCount; ++i) {
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+        std::to_string((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100,
+        i * 100 + 99);
+    UpdateVersionStorageInfo();
+    ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm1) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm2) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+      Temperature::kUnknown, threshold_time - 5000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+      Temperature::kWarm, threshold_time - 5000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kUnknown, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+      Temperature::kWarm, threshold_time - 5000);
+  file_map_[2].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  // Stop if a file is being compacted
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * 100000;
+  uint64_t kWarmThreshold = 2000;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  fifo_options_.age_for_warm = kWarmThreshold;
+  mutable_cf_options_.compaction_options_fifo = fifo_options_;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  int64_t current_time = 0;
+  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+  uint64_t threshold_time =
+      static_cast<uint64_t>(current_time) - kWarmThreshold;
+  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+      Temperature::kUnknown, threshold_time + 100);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+      Temperature::kUnknown, threshold_time - 2000);
+  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+      Temperature::kWarm, threshold_time - 3000);
+  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+      Temperature::kUnknown, threshold_time - 4000);
+  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+      Temperature::kWarm, threshold_time - 5000);
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  // Stop if a file is being compacted
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.target_file_size_base = 100000000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+  mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+  Add(2, 6U, "150", "179", 50000000U);
+  Add(2, 7U, "180", "220", 50000000U);
+  Add(2, 8U, "321", "400", 50000000U);  // File not overlapping
+  Add(2, 9U, "721", "800", 50000000U);
+
+  Add(3, 26U, "150", "170", 260000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 260000000U);
+  Add(3, 30U, "750", "900", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Pick file 8 because it overlaps with 0 files on level 3.
+  ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+  // Compaction input size * 1.1
+  ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.target_file_size_base = 10000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+  Add(2, 6U, "150", "175",
+      60000000U);  // Overlaps with file 26, 27, total size 521M
+  Add(2, 7U, "176", "200", 60000000U);  // Overlaps with file 27, 28, total size
+                                        // 520M, the smallest overlapping
+  Add(2, 8U, "201", "300",
+      60000000U);  // Overlaps with file 28, 29, total size 521M
+
+  Add(3, 25U, "100", "110", 261000000U);
+  Add(3, 26U, "150", "170", 261000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 261000000U);
+  Add(3, 30U, "321", "400", 261000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 7 because overlapping ratio is the biggest.
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.max_bytes_for_level_base = 10000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // file 7 and 8 over lap with the same file, but file 8 is smaller so
+  // it will be picked.
+  Add(2, 6U, "150", "167", 60000000U);  // Overlaps with file 26, 27
+  Add(2, 7U, "168", "169", 60000000U);  // Overlaps with file 27
+  Add(2, 8U, "201", "300", 61000000U);  // Overlaps with file 28, but the file
+                                        // itself is larger. Should be picked.
+
+  Add(3, 26U, "160", "165", 260000000U);
+  Add(3, 27U, "166", "170", 260000000U);
+  Add(3, 28U, "180", "400", 260000000U);
+  Add(3, 29U, "401", "500", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 8 because overlapping ratio is the biggest.
+  ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.max_bytes_for_level_base = 10000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+
+  // file 7 and 8 over lap with the same file, but file 8 is smaller so
+  // it will be picked.
+  // Overlaps with file 26, 27. And the file is compensated so will be
+  // picked up.
+  Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U);
+  Add(2, 7U, "168", "169", 60000000U);  // Overlaps with file 27
+  Add(2, 8U, "201", "300", 61000000U);  // Overlaps with file 28
+
+  Add(3, 26U, "160", "165", 60000000U);
+  // Boosted file size in output level is not considered.
+  Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U);
+  Add(3, 28U, "180", "400", 60000000U);
+  Add(3, 29U, "401", "500", 60000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 8 because overlapping ratio is the biggest.
+  ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriRoundRobin) {
+  std::vector<InternalKey> test_cursors = {InternalKey("249", 100, kTypeValue),
+                                           InternalKey("600", 100, kTypeValue),
+                                           InternalKey()};
+  std::vector<uint32_t> selected_files = {8U, 6U, 6U};
+
+  ioptions_.compaction_pri = kRoundRobin;
+  mutable_cf_options_.max_bytes_for_level_base = 12000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  for (size_t i = 0; i < test_cursors.size(); i++) {
+    // start a brand new version in each test.
+    NewVersionStorage(6, kCompactionStyleLevel);
+    vstorage_->ResizeCompactCursors(6);
+    // Set the cursor
+    vstorage_->AddCursorForOneLevel(2, test_cursors[i]);
+    Add(2, 6U, "150", "199", 50000000U);  // Overlap with 26U, 27U
+    Add(2, 7U, "200", "249", 50000000U);  // File not overlapping
+    Add(2, 8U, "300", "600", 50000000U);  // Overlap with 28U, 29U
+
+    Add(3, 26U, "130", "165", 60000000U);
+    Add(3, 27U, "166", "170", 60000000U);
+    Add(3, 28U, "270", "340", 60000000U);
+    Add(3, 29U, "401", "500", 60000000U);
+    UpdateVersionStorageInfo();
+    LevelCompactionPicker local_level_compaction_picker =
+        LevelCompactionPicker(ioptions_, &icmp_);
+    std::unique_ptr<Compaction> compaction(
+        local_level_compaction_picker.PickCompaction(
+            cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+            &log_buffer_));
+    ASSERT_TRUE(compaction.get() != nullptr);
+    // Since the max bytes for level 2 is 120M, picking one file to compact
+    // makes the post-compaction level size less than 120M, there is exactly one
+    // file picked for round-robin compaction
+    ASSERT_EQ(1U, compaction->num_input_files(0));
+    ASSERT_EQ(selected_files[i], compaction->input(0, 0)->fd.GetNumber());
+    // release the version storage
+    DeleteVersionStorage();
+  }
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) {
+  ioptions_.compaction_pri = kRoundRobin;
+  mutable_cf_options_.max_compaction_bytes = 100000000u;
+  mutable_cf_options_.max_bytes_for_level_base = 120;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  // start a brand new version in each test.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  vstorage_->ResizeCompactCursors(6);
+  // Set the cursor (file picking should start with 7U)
+  vstorage_->AddCursorForOneLevel(2, InternalKey("199", 100, kTypeValue));
+  Add(2, 6U, "150", "199", 500U);
+  Add(2, 7U, "200", "249", 500U);
+  Add(2, 8U, "300", "600", 500U);
+  Add(2, 9U, "700", "800", 500U);
+  Add(2, 10U, "850", "950", 500U);
+
+  Add(3, 26U, "130", "165", 600U);
+  Add(3, 27U, "166", "170", 600U);
+  Add(3, 28U, "270", "340", 600U);
+  Add(3, 29U, "401", "500", 600U);
+  Add(3, 30U, "601", "800", 600U);
+  Add(3, 31U, "830", "890", 600U);
+  UpdateVersionStorageInfo();
+  LevelCompactionPicker local_level_compaction_picker =
+      LevelCompactionPicker(ioptions_, &icmp_);
+  std::unique_ptr<Compaction> compaction(
+      local_level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+
+  // The maximum compaction bytes is very large in this case so we can igore its
+  // constraint in this test case. The maximum bytes for level 2 is 1200
+  // bytes, and thus at least 3 files should be picked so that the bytes in
+  // level 2 is less than the maximum
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(8U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(9U, compaction->input(0, 2)->fd.GetNumber());
+  // release the version storage
+  DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) {
+  ioptions_.compaction_pri = kRoundRobin;
+  mutable_cf_options_.max_compaction_bytes = 2500u;
+  mutable_cf_options_.max_bytes_for_level_base = 120;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  // start a brand new version in each test.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  vstorage_->ResizeCompactCursors(6);
+  // Set the cursor (file picking should start with 6U)
+  vstorage_->AddCursorForOneLevel(2, InternalKey("1000", 100, kTypeValue));
+  Add(2, 6U, "150", "199", 500U);  // Overlap with 26U, 27U
+  Add(2, 7U, "200", "249", 500U);  // Overlap with 27U
+  Add(2, 8U, "300", "600", 500U);  // Overlap with 28U, 29U
+  Add(2, 9U, "700", "800", 500U);
+  Add(2, 10U, "850", "950", 500U);
+
+  Add(3, 26U, "130", "165", 600U);
+  Add(3, 27U, "166", "230", 600U);
+  Add(3, 28U, "270", "340", 600U);
+  Add(3, 29U, "401", "500", 600U);
+  Add(3, 30U, "601", "800", 600U);
+  Add(3, 31U, "830", "890", 600U);
+  UpdateVersionStorageInfo();
+  LevelCompactionPicker local_level_compaction_picker =
+      LevelCompactionPicker(ioptions_, &icmp_);
+  std::unique_ptr<Compaction> compaction(
+      local_level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+
+  // The maximum compaction bytes is only 2500 bytes now. Even though we are
+  // required to choose 3 files so that the post-compaction level size is less
+  // than 1200 bytes. We cannot pick 3 files to compact since the maximum
+  // compaction size is 2500. After picking files 6U and 7U, the number of
+  // compaction bytes has reached 2200, and thus no more space to add another
+  // input file with 50M bytes.
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(0, 1)->fd.GetNumber());
+  // release the version storage
+  DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) {
+  ioptions_.compaction_pri = kRoundRobin;
+  mutable_cf_options_.max_compaction_bytes = 1000000u;
+  mutable_cf_options_.max_bytes_for_level_base = 120;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  // start a brand new version in each test.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  vstorage_->ResizeCompactCursors(6);
+  // Set the cursor (file picking should start with 9U)
+  vstorage_->AddCursorForOneLevel(2, InternalKey("700", 100, kTypeValue));
+  Add(2, 6U, "150", "199", 500U);
+  Add(2, 7U, "200", "249", 500U);
+  Add(2, 8U, "300", "600", 500U);
+  Add(2, 9U, "700", "800", 500U);
+  Add(2, 10U, "850", "950", 500U);
+
+  Add(3, 26U, "130", "165", 600U);
+  Add(3, 27U, "166", "170", 600U);
+  Add(3, 28U, "270", "340", 600U);
+  Add(3, 29U, "401", "500", 600U);
+  Add(3, 30U, "601", "800", 600U);
+  Add(3, 31U, "830", "890", 600U);
+  UpdateVersionStorageInfo();
+  LevelCompactionPicker local_level_compaction_picker =
+      LevelCompactionPicker(ioptions_, &icmp_);
+  std::unique_ptr<Compaction> compaction(
+      local_level_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+
+  // Cannot pick more files since we reach the last file in level 2
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(9U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(10U, compaction->input(0, 1)->fd.GetNumber());
+  // release the version storage
+  DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  mutable_cf_options_.max_bytes_for_level_base = 15000000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  // file 7 and 8 over lap with the same file, but file 8 is smaller so
+  // it will be picked.
+  Add(2, 13U, "010", "011",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 14U, "020", "021",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 15U, "030", "031",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 16U, "040", "041",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 17U, "050", "051",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 18U, "060", "061",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 19U, "070", "071",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 20U, "080", "081",
+      6100U);  // Overlaps with a large file. Not picked
+
+  Add(2, 6U, "150", "167", 60000000U);  // Overlaps with file 26, 27
+  Add(2, 7U, "168", "169", 60000000U);  // Overlaps with file 27
+  Add(2, 8U, "201", "300", 61000000U);  // Overlaps with file 28, but the file
+                                        // itself is larger. Should be picked.
+  Add(2, 9U, "610", "611",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 10U, "620", "621",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 11U, "630", "631",
+      6100U);  // Overlaps with a large file. Not picked
+  Add(2, 12U, "640", "641",
+      6100U);  // Overlaps with a large file. Not picked
+
+  Add(3, 31U, "001", "100", 260000000U);
+  Add(3, 26U, "160", "165", 260000000U);
+  Add(3, 27U, "166", "170", 260000000U);
+  Add(3, 28U, "180", "400", 260000000U);
+  Add(3, 29U, "401", "500", 260000000U);
+  Add(3, 30U, "601", "700", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  // Picking file 8 because overlapping ratio is the biggest.
+  ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+  int num_levels = ioptions_.num_levels;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 200;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200");       // <- marked for compaction
+  Add(1, 3U, "400", "500", 600);  // <- this one needs compacting
+  Add(2, 4U, "150", "200");
+  Add(2, 5U, "201", "210");
+  Add(2, 6U, "300", "310");
+  Add(2, 7U, "400", "500");  // <- being compacted
+
+  vstorage_->LevelFiles(2)[3]->being_compacted = true;
+  vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+}
+
+// This test checks ExpandWhileOverlapping() by having overlapping user keys
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  ioptions_.compaction_pri = kByCompensatedSize;
+
+  Add(1, 1U, "100", "150", 1U);
+  // Overlapping user keys
+  Add(1, 2U, "200", "400", 1U);
+  Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+  Add(2, 4U, "600", "700", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1000000000U);
+  Add(1, 2U, "400", "500", 1U, 0, 0);
+  Add(2, 3U, "000", "100", 1U);
+  Add(2, 4U, "100", "600", 1U, 0, 0);
+  Add(2, 5U, "600", "700", 1U, 0, 0);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(3U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+  // expand multiple times)
+  Add(1, 1U, "100", "150", 1U);
+  Add(1, 2U, "150", "200", 1U, 0, 0);
+  Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+  Add(1, 4U, "250", "300", 1U, 0, 0);
+  Add(1, 5U, "300", "350", 1U, 0, 0);
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "350", "400", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(5U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_bytes_for_level_base = 1000000;
+
+  Add(1, 1U, "100", "150", 1U);
+  Add(1, 2U, "150", "199", 1U, 0, 0);
+  Add(1, 3U, "200", "250", 1100000U, 0, 0);
+  Add(1, 4U, "251", "300", 1U, 0, 0);
+  Add(1, 5U, "300", "350", 1U, 0, 0);
+
+  Add(2, 6U, "100", "115", 1U);
+  Add(2, 7U, "125", "325", 1U);
+  Add(2, 8U, "350", "400", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1000000000U);
+  Add(1, 2U, "400", "500", 1U, 0, 0);
+  Add(2, 3U, "000", "100", 1U);
+  Add(2, 4U, "100", "600", 1U, 0, 0);
+  Add(2, 5U, "600", "700", 1U, 0, 0);
+
+  vstorage_->LevelFiles(2)[2]->being_compacted = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1U, 0, 0);
+  Add(1, 2U, "401", "500", 1U, 0, 0);
+  Add(2, 3U, "000", "100", 1U);
+  Add(2, 4U, "100", "300", 1U, 0, 0);
+  Add(2, 5U, "305", "450", 1U, 0, 0);
+  Add(2, 6U, "460", "600", 1U, 0, 0);
+  Add(2, 7U, "600", "700", 1U, 0, 0);
+
+  vstorage_->LevelFiles(1)[0]->marked_for_compaction = true;
+  vstorage_->LevelFiles(1)[1]->marked_for_compaction = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(3U, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1U, 0, 0);
+  Add(1, 2U, "401", "500", 1000000000U, 0, 0);
+  Add(2, 3U, "100", "250", 1U);
+  Add(2, 4U, "300", "600", 1U, 0, 0);
+  Add(2, 5U, "600", "800", 1U, 0, 0);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_GE(1U, compaction->num_input_files(0));
+  ASSERT_GE(2U, compaction->num_input_files(1));
+  // File 5 has to be included in the compaction
+  ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+  // grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up
+  // Expand input level as much as possible
+  // no overlapping case
+  Add(1, 1U, "101", "150", 1U);
+  Add(1, 2U, "151", "200", 1U);
+  Add(1, 3U, "201", "300", 1000000000U);
+  Add(1, 4U, "301", "400", 1U);
+  Add(1, 5U, "401", "500", 1U);
+  Add(2, 6U, "150", "200", 1U);
+  Add(2, 7U, "200", "450", 1U, 0, 0);
+  Add(2, 8U, "500", "600", 1U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+  // grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up
+  // Expand input level as much as possible
+  // overlapping case
+  Add(1, 1U, "121", "150", 1U);
+  Add(1, 2U, "151", "200", 1U);
+  Add(1, 3U, "201", "300", 1000000000U);
+  Add(1, 4U, "301", "400", 1U);
+  Add(1, 5U, "401", "500", 1U);
+  Add(2, 6U, "100", "120", 1U);
+  Add(2, 7U, "150", "200", 1U);
+  Add(2, 8U, "200", "450", 1U, 0, 0);
+  Add(2, 9U, "501", "600", 1U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(5U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
+  // Locked file encountered when pulling in extra input-level files with same
+  // user keys. Verify we pick the next-best file from the same input level.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+  // file_number 2U is largest and thus first choice. But it overlaps with
+  // file_number 1U which is being compacted. So instead we pick the next-
+  // biggest file, 3U, which is eligible for compaction.
+  Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+      "150" /* largest */, 1U /* file_size */);
+  file_map_[1U].first->being_compacted = true;
+  Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+      "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */,
+      0 /* largest_seq */);
+  Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+      "250" /* largest */, 900000000U /* file_size */);
+  Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+      "150" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 5U /* file_number */, "151" /* smallest */,
+      "200" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+      "250" /* largest */, 1U /* file_size */);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
+  // Locked file encountered when pulling in extra output-level files with same
+  // user keys. Expected to skip that compaction and pick the next-best choice.
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+  // score(L1) = 3.7
+  // score(L2) = 1.85
+  // There is no eligible file in L1 to compact since both candidates pull in
+  // file_number 5U, which overlaps with a file pending compaction (6U). The
+  // first eligible compaction is from L2->L3.
+  Add(1 /* level */, 2U /* file_number */, "151" /* smallest */,
+      "200" /* largest */, 1000000000U /* file_size */);
+  Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+      "250" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+      "149" /* largest */, 5000000000U /* file_size */);
+  Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+      "201" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+      "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */,
+      0 /* largest_seq */);
+  file_map_[6U].first->being_compacted = true;
+  Add(3 /* level */, 7U /* file_number */, "100" /* smallest */,
+      "149" /* largest */, 1U /* file_size */);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FileTtlBooster) {
+  // Set TTL to 2048
+  // TTL boosting for all levels starts at 1024,
+  // Whole TTL range is 2048 * 31 / 32 - 1024 = 1984 - 1024 = 960.
+  // From second last level (L5), range starts at
+  // 1024 + 480, 1024 + 240, 1024 + 120 (which is L3).
+  // Boosting step 124 / 16 = 7.75 -> 7
+  //
+  const uint64_t kCurrentTime = 1000000;
+  FileMetaData meta;
+
+  {
+    FileTtlBooster booster(kCurrentTime, 2048, 7, 3);
+
+    // Not triggering if the file is younger than ttl/2
+    meta.oldest_ancester_time = kCurrentTime - 1023;
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - 1024;
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime + 10;
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+    // Within one boosting step
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 6);
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+    // One boosting step
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 7);
+    ASSERT_EQ(2, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 8);
+    ASSERT_EQ(2, booster.GetBoostScore(&meta));
+
+    // Multiple boosting steps
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 30);
+    ASSERT_EQ(5, booster.GetBoostScore(&meta));
+
+    // Very high boosting steps
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 700);
+    ASSERT_EQ(101, booster.GetBoostScore(&meta));
+  }
+  {
+    // Test second last level
+    FileTtlBooster booster(kCurrentTime, 2048, 7, 5);
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+    ASSERT_EQ(3, booster.GetBoostScore(&meta));
+  }
+  {
+    // Test last level
+    FileTtlBooster booster(kCurrentTime, 2048, 7, 6);
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+    meta.oldest_ancester_time = kCurrentTime - 3000;
+    ASSERT_EQ(1, booster.GetBoostScore(&meta));
+  }
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+  // 6 L0 files, score 3.
+  Add(0, 1U, "000", "400", 1U);
+  Add(0, 2U, "001", "400", 1U, 0, 0);
+  Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
+  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+  file_map_[4u].first->being_compacted = true;
+  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "300", "400", 1U);
+
+  // No compaction should be scheduled, if L0 has higher priority than L1
+  // but L0->L1 compaction is blocked by a file in L1 being compacted.
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+  // 6 L0 files, score 3.
+  Add(0, 1U, "000", "400", 1U);
+  Add(0, 2U, "001", "400", 1U, 0, 0);
+  Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
+  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "300", "400", 1U);
+
+  // If no file in L1 being compacted, L0->L1 compaction will be scheduled.
+  UpdateVersionStorageInfo();  // being_compacted flag is cleared here.
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+  // 6 L0 files, score 3.
+  Add(0, 1U, "000", "400", 1U);
+  Add(0, 2U, "001", "400", 1U, 0, 0);
+  Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+  Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+  // L1 score more than 6.
+  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+  file_map_[4u].first->being_compacted = true;
+  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+  Add(1, 51U, "351", "400", 6000000000U, 0, 0);
+
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "300", "400", 1U);
+
+  // If score in L1 is larger than L0, L1 compaction goes through despite
+  // there is pending L0 compaction.
+  UpdateVersionStorageInfo();
+  ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+  ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 3U, "150", "200", 200);
+  // Level 1 is over target by 200
+  Add(1, 4U, "400", "500", 600);
+  Add(1, 5U, "600", "700", 600);
+  // Level 2 is less than target 10000 even added size of level 1
+  // Size ratio of L2/L1 is 9600 / 1200 = 8
+  Add(2, 6U, "150", "200", 2500);
+  Add(2, 7U, "201", "210", 2000);
+  Add(2, 8U, "300", "310", 2600);
+  Add(2, 9U, "400", "500", 2500);
+  // Level 3 exceeds target 100,000 of 1000
+  Add(3, 10U, "400", "500", 101000);
+  // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+  // Size ratio L4/L3 is 9.9
+  // After merge from L3, L4 size is 1000900
+  Add(4, 11U, "400", "500", 999900);
+  Add(5, 12U, "400", "500", 8007200);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+            vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 4U, "150", "200", 200);
+  Add(0, 5U, "150", "200", 200);
+  Add(0, 6U, "150", "200", 200);
+  // Level 1 size will be 1400 after merging with L0
+  Add(1, 7U, "400", "500", 200);
+  Add(1, 8U, "600", "700", 200);
+  // Level 2 is less than target 10000 even added size of level 1
+  Add(2, 9U, "150", "200", 9100);
+  // Level 3 over the target, but since level 4 is empty, we assume it will be
+  // a trivial move.
+  Add(3, 10U, "400", "500", 101000);
+
+  UpdateVersionStorageInfo();
+
+  // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0)
+  ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 2000);
+  Add(0, 2U, "150", "200", 2000);
+  Add(0, 4U, "150", "200", 2000);
+  Add(0, 5U, "150", "200", 2000);
+  Add(0, 6U, "150", "200", 1000);
+  // Level 1 size will be 10000 after merging with L0
+  Add(1, 7U, "400", "500", 500);
+  Add(1, 8U, "600", "700", 500);
+
+  Add(2, 9U, "150", "200", 10000);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+  // Set Last level size 50000
+  // num_levels - 1 target 5000
+  // num_levels - 2 is base level with target 1000 (rounded up to
+  // max_bytes_for_level_base).
+  Add(num_levels - 1, 10U, "400", "500", 50000);
+
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 4U, "150", "200", 200);
+  Add(0, 5U, "150", "200", 200);
+  Add(0, 6U, "150", "200", 200);
+  // num_levels - 3 is over target by 100 + 1000
+  Add(num_levels - 3, 7U, "400", "500", 550);
+  Add(num_levels - 3, 8U, "600", "700", 550);
+  // num_levels - 2 is over target by 1100 + 200
+  Add(num_levels - 2, 9U, "150", "200", 5200);
+
+  UpdateVersionStorageInfo();
+
+  // Merging to the second last level: (5200 / 2100 + 1) * 1100
+  // Merging to the last level: (50000 / 6300 + 1) * 1300
+  ASSERT_EQ(2100u + 3823u + 11617u,
+            vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
+  // case 1: Higher levels are empty
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  bool result =
+      Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_TRUE(result);
+
+  // case 2: Higher levels have no overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  Add(3, 7U, "k", "p");
+  Add(3, 8U, "t", "w");
+  Add(4, 9U, "a", "b");
+  Add(5, 10U, "c", "cc");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_TRUE(result);
+
+  // case 3.1: Higher levels (level 3) have overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  Add(3, 7U, "e", "g");
+  Add(3, 8U, "h", "k");
+  Add(4, 9U, "a", "b");
+  Add(5, 10U, "c", "cc");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  // case 3.2: Higher levels (level 5) have overlap
+  DeleteVersionStorage();
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  Add(3, 7U, "j", "k");
+  Add(3, 8U, "l", "m");
+  Add(4, 9U, "a", "b");
+  Add(5, 10U, "c", "cc");
+  Add(5, 11U, "h", "k");
+  Add(5, 12U, "y", "yy");
+  Add(5, 13U, "z", "zz");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
+  // one key ("d")
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  Add(3, 7U, "j", "k");
+  Add(3, 8U, "l", "m");
+  Add(4, 9U, "a", "b");
+  Add(5, 10U, "c", "cc");
+  Add(5, 11U, "ccc", "d");
+  Add(5, 12U, "y", "yy");
+  Add(5, 13U, "z", "zz");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  // Level 0 files overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "s", "t");
+  Add(0, 2U, "a", "m");
+  Add(0, 3U, "b", "z");
+  Add(0, 4U, "e", "f");
+  Add(5, 10U, "y", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(1, 0);
+  AddToCompactionFiles(1U);
+  AddToCompactionFiles(2U);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(4U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  // Level 0 files don't overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "s", "t");
+  Add(0, 2U, "a", "m");
+  Add(0, 3U, "b", "k");
+  Add(0, 4U, "e", "f");
+  Add(5, 10U, "y", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(1, 0);
+  AddToCompactionFiles(1U);
+  AddToCompactionFiles(2U);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(4U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_TRUE(result);
+
+  // Level 1 files overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "s", "t");
+  Add(0, 2U, "a", "m");
+  Add(0, 3U, "b", "k");
+  Add(0, 4U, "e", "f");
+  Add(1, 5U, "a", "m");
+  Add(1, 6U, "n", "o");
+  Add(1, 7U, "w", "y");
+  Add(5, 10U, "y", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 0);
+  AddToCompactionFiles(1U);
+  AddToCompactionFiles(2U);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(4U);
+  AddToCompactionFiles(5U);
+  AddToCompactionFiles(6U);
+  AddToCompactionFiles(7U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesHit) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+  mutable_cf_options_.max_compaction_bytes = 800000u;
+  mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick file 2 and 5.
+  // It can expand because adding file 1 and 3, the compaction size will
+  // exceed mutable_cf_options_.max_bytes_for_level_base.
+  Add(1, 1U, "100", "150", 300000U);
+  Add(1, 2U, "151", "200", 300001U, 0, 0);
+  Add(1, 3U, "201", "250", 300000U, 0, 0);
+  Add(1, 4U, "251", "300", 300000U, 0, 0);
+  Add(2, 5U, "100", "256", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) {
+  mutable_cf_options_.max_bytes_for_level_base = 800000u;
+  mutable_cf_options_.max_compaction_bytes = 1000000u;
+  mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick file 2 and 5.
+  // and it expands to file 1 and 3 too.
+  Add(1, 1U, "100", "150", 300000U);
+  Add(1, 2U, "151", "200", 300001U, 0, 0);
+  Add(1, 3U, "201", "250", 300000U, 0, 0);
+  Add(1, 4U, "251", "300", 300000U, 0, 0);
+  Add(2, 5U, "000", "251", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
+  mutable_cf_options_.max_bytes_for_level_base = 10000u;
+  mutable_cf_options_.max_compaction_bytes = 10001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick file 2
+  Add(1, 1U, "100", "150", 3000U);
+  Add(1, 2U, "151", "200", 3001U);
+  Add(1, 3U, "201", "250", 3000U);
+  Add(1, 4U, "251", "300", 3000U);
+
+  Add(3, 5U, "120", "130", 7000U);
+  Add(3, 6U, "170", "180", 7000U);
+  Add(3, 7U, "220", "230", 7000U);
+  Add(3, 8U, "270", "280", 7000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMove1) {
+  mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options_.max_compaction_bytes = 10000000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  Add(0, 1U, "100", "150", 3000U, 0, 710, 800);
+  Add(0, 2U, "151", "200", 3001U, 0, 610, 700);
+  Add(0, 3U, "301", "350", 3000U, 0, 510, 600);
+  Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+  Add(1, 5U, "120", "130", 7000U);
+  Add(1, 6U, "170", "180", 7000U);
+  Add(1, 7U, "220", "230", 7000U);
+  Add(1, 8U, "270", "280", 7000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(2, compaction->num_input_files(0));
+  ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) {
+  mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options_.max_compaction_bytes = 10000000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  Add(0, 1U, "100", "150", 3000U, 0, 710, 800);
+  Add(0, 2U, "551", "600", 3001U, 0, 610, 700);
+  Add(0, 3U, "101", "150", 3000U, 0, 510, 600);
+  Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+  Add(1, 5U, "120", "130", 7000U);
+  Add(1, 6U, "170", "180", 7000U);
+  Add(1, 7U, "220", "230", 7000U);
+  Add(1, 8U, "270", "280", 7000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(1, compaction->num_input_files(0));
+  ASSERT_EQ(4, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
+  mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options_.max_compaction_bytes = 10000000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  Add(0, 1U, "300", "350", 3000U, 0, 710, 800);
+  Add(0, 2U, "651", "600", 3001U, 0, 610, 700);
+  Add(0, 3U, "501", "550", 3000U, 0, 510, 600);
+  Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+  Add(1, 5U, "120", "130", 7000U);
+  Add(1, 6U, "970", "980", 7000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(4, compaction->num_input_files(0));
+  ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(2, compaction->input(0, 3)->fd.GetNumber());
+  ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
+  mutable_cf_options_.max_bytes_for_level_base = 10000u;
+  mutable_cf_options_.max_compaction_bytes = 10001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  ioptions_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(1);
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick file 2
+  Add(1, 1U, "100", "150", 3000U);
+  Add(1, 2U, "151", "200", 3001U);
+  Add(1, 3U, "201", "250", 3000U);
+  Add(1, 4U, "251", "300", 3000U);
+
+  Add(3, 5U, "120", "130", 7000U);
+  Add(3, 6U, "170", "180", 7000U);
+  Add(3, 7U, "220", "230", 7000U);
+  Add(3, 8U, "270", "280", 7000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  // No trivial move, because partitioning is applied
+  ASSERT_TRUE(!compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+  mutable_cf_options_.max_compaction_bytes = 10000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // A compaction should be triggered and pick all files from level 1
+  Add(1, 1U, "100", "150", 300000U, 0, 0);
+  Add(1, 2U, "150", "200", 300000U, 0, 0);
+  Add(1, 3U, "200", "250", 300000U, 0, 0);
+  Add(1, 4U, "250", "300", 300000U, 0, 0);
+
+  Add(3, 5U, "120", "130", 6000U);
+  Add(3, 6U, "140", "150", 6000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_FALSE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000u;
+  mutable_cf_options_.max_compaction_bytes = 10000001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  Add(2, 1U, "100", "150", 3000U);
+  Add(2, 2U, "151", "200", 3001U);
+  Add(2, 3U, "301", "350", 3000U);
+  Add(2, 4U, "451", "400", 3000U);
+  Add(2, 5U, "551", "500", 3000U);
+  Add(2, 6U, "651", "600", 3000U);
+  Add(2, 7U, "751", "700", 3000U);
+  Add(2, 8U, "851", "900", 3000U);
+
+  Add(3, 15U, "120", "130", 700U);
+  Add(3, 16U, "170", "180", 700U);
+  Add(3, 17U, "220", "230", 700U);
+  Add(3, 18U, "870", "880", 700U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_TRUE(compaction->IsTrivialMove());
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(4, compaction->num_input_files(0));
+  ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(5, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(6, compaction->input(0, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000u;
+  mutable_cf_options_.max_compaction_bytes = 10000001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  Add(2, 1U, "100", "150", 3000U);
+  Add(2, 2U, "151", "160", 3001U);
+  Add(2, 3U, "161", "179", 3000U);
+  Add(2, 4U, "220", "400", 3000U);
+  Add(2, 5U, "551", "500", 3000U);
+  Add(2, 6U, "651", "600", 3000U);
+  Add(2, 7U, "751", "700", 3000U);
+  Add(2, 8U, "851", "900", 3000U);
+
+  Add(3, 15U, "120", "130", 700U);
+  Add(3, 17U, "220", "230", 700U);
+  Add(3, 18U, "870", "880", 700U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_TRUE(compaction->IsTrivialMove());
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(2, compaction->num_input_files(0));
+  ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000u;
+  mutable_cf_options_.max_compaction_bytes = 10000001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // Even if consecutive files can be trivial moved, we don't pick them
+  // since in case trivial move can't be issued for a reason, we cannot
+  // fall back to normal compactions.
+  Add(2, 1U, "100", "150", 3000U);
+  Add(2, 2U, "151", "160", 3001U);
+  Add(2, 5U, "551", "500", 3000U);
+  Add(2, 6U, "651", "600", 3000U);
+  Add(2, 7U, "751", "700", 3000U);
+  Add(2, 8U, "851", "900", 3000U);
+
+  Add(3, 15U, "120", "130", 700U);
+  Add(3, 17U, "220", "230", 700U);
+  Add(3, 18U, "870", "880", 700U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_TRUE(compaction->IsTrivialMove());
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(1, compaction->num_input_files(0));
+  ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000u;
+  mutable_cf_options_.max_compaction_bytes = 10000001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  Add(2, 1U, "100", "150", 4000U);
+  Add(2, 2U, "151", "160", 4001U);
+  Add(2, 3U, "161", "179", 4000U);
+
+  Add(3, 15U, "120", "130", 700U);
+  Add(3, 17U, "220", "230", 700U);
+  Add(3, 18U, "870", "880", 700U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_TRUE(compaction->IsTrivialMove());
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(2, compaction->num_input_files(0));
+  ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000u;
+  mutable_cf_options_.max_compaction_bytes = 10000001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // File 4 and 5 aren't clean cut, so only 2 and 3 are picked.
+  Add(2, 1U, "100", "150", 4000U);
+  Add(2, 2U, "151", "160", 4001U);
+  Add(2, 3U, "161", "179", 4000U);
+  Add(2, 4U, "180", "185", 4000U);
+  Add(2, 5U, "185", "190", 4000U);
+
+  Add(3, 15U, "120", "130", 700U);
+  Add(3, 17U, "220", "230", 700U);
+  Add(3, 18U, "870", "880", 700U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_TRUE(compaction->IsTrivialMove());
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(2, compaction->num_input_files(0));
+  ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) {
+  mutable_cf_options_.max_bytes_for_level_base = 1000u;
+  mutable_cf_options_.max_compaction_bytes = 10000001u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  ioptions_.compaction_pri = kMinOverlappingRatio;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  Add(2, 1U, "100", "150", 3000U);
+  Add(2, 2U, "151", "200", 3001U);
+  Add(2, 3U, "301", "350", 3000U);
+  Add(2, 4U, "451", "400", 3000U);
+  Add(2, 5U, "551", "500", 3000U);
+  file_map_[5U].first->being_compacted = true;
+  Add(2, 6U, "651", "600", 3000U);
+  Add(2, 7U, "751", "700", 3000U);
+  Add(2, 8U, "851", "900", 3000U);
+
+  Add(3, 15U, "120", "130", 700U);
+  Add(3, 16U, "170", "180", 700U);
+  Add(3, 17U, "220", "230", 700U);
+  Add(3, 18U, "870", "880", 700U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_TRUE(compaction->IsTrivialMove());
+  ASSERT_EQ(1, compaction->num_input_levels());
+  // Since the next file is being compacted. Stopping at 3 and 4.
+  ASSERT_EQ(2, compaction->num_input_files(0));
+  ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+  Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+      "149" /* largest */, 1000000000U /* file_size */);
+  file_map_[1U].first->being_compacted = true;
+  Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+      "199" /* largest */, 900000000U /* file_size */);
+  Add(1 /* level */, 3U /* file_number */, "200" /* smallest */,
+      "249" /* largest */, 800000000U /* file_size */);
+  Add(1 /* level */, 4U /* file_number */, "250" /* smallest */,
+      "299" /* largest */, 700000000U /* file_size */);
+  Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+      "199" /* largest */, 100U /* file_size */);
+  Add(2 /* level */, 6U /* file_number */, "200" /* smallest */,
+      "240" /* largest */, 1U /* file_size */);
+  Add(2 /* level */, 7U /* file_number */, "260" /* smallest */,
+      "270" /* largest */, 1U /* file_size */);
+  file_map_[5U].first->being_compacted = true;
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));
+
+  compaction.reset(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */));
+
+  compaction.reset(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+  ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
+  // Intra L0 compaction triggers only if there are at least
+  // level0_file_num_compaction_trigger + 2 L0 files.
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_compaction_bytes = 1000000u;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // All 5 L0 files will be picked for intra L0 compaction. The one L1 file
+  // spans entire L0 key range and is marked as being compacted to avoid
+  // L0->L1 compaction.
+  Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+  Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+  Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+  Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+  Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+  Add(1, 6U, "100", "350", 200000U, 0, 110, 111);
+  vstorage_->LevelFiles(1)[0]->being_compacted = true;
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(5U, compaction->num_input_files(0));
+  ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+            compaction->compaction_reason());
+  ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
+  // Intra L0 compaction triggers only if there are at least
+  // level0_file_num_compaction_trigger + 2 L0 files.
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_compaction_bytes = 999999u;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // 4 out of 5 L0 files will be picked for intra L0 compaction due to
+  // max_compaction_bytes limit (the minimum number of files for triggering
+  // intra L0 compaction is 4). The one L1 file spans entire L0 key range and
+  // is marked as being compacted to avoid L0->L1 compaction.
+  Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+  Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+  Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+  Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+  Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+  Add(1, 6U, "100", "350", 200000U, 0, 109, 110);
+  vstorage_->LevelFiles(1)[0]->being_compacted = true;
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(4U, compaction->num_input_files(0));
+  ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+            compaction->compaction_reason());
+  ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) {
+  // Intra L0 compaction triggers only if there are at least
+  // level0_file_num_compaction_trigger + 2 L0 files.
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_compaction_bytes = 999999u;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // 4 out of 6 L0 files will be picked for intra L0 compaction due to
+  // being_compact limit. And the latest one L0 will be skipped due to earliest
+  // seqno. The one L1 file spans entire L0 key range and is marked as being
+  // compacted to avoid L0->L1 compaction.
+  Add(1, 1U, "100", "350", 200000U, 0, 110, 111);
+  Add(0, 2U, "301", "350", 1U, 0, 108, 109);
+  Add(0, 3U, "251", "300", 1U, 0, 106, 107);
+  Add(0, 4U, "201", "250", 1U, 0, 104, 105);
+  Add(0, 5U, "151", "200", 1U, 0, 102, 103);
+  Add(0, 6U, "100", "150", 1U, 0, 100, 101);
+  Add(0, 7U, "100", "100", 1U, 0, 99, 100);
+  vstorage_->LevelFiles(0)[5]->being_compacted = true;
+  vstorage_->LevelFiles(1)[0]->being_compacted = true;
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_, 107));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(4U, compaction->num_input_files(0));
+  ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+            compaction->compaction_reason());
+  ASSERT_EQ(0, compaction->output_level());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
+  const uint64_t kFileSize = 100000;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  // This test covers the case where a "regular" universal compaction is
+  // scheduled first, followed by a delete triggered compaction. The latter
+  // should fail
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300);
+  Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(compaction);
+  // Validate that its a compaction to reduce sorted runs
+  ASSERT_EQ(CompactionReason::kUniversalSortedRunNum,
+            compaction->compaction_reason());
+  ASSERT_EQ(0, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+
+  AddVersionStorage();
+  // Simulate a flush and mark the file for compaction
+  Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction2(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_FALSE(compaction2);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) {
+  const uint64_t kFileSize = 100000;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  // This test covers the case where a delete triggered compaction is
+  // scheduled first, followed by a "regular" compaction. The latter
+  // should fail
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  // Mark file number 4 for compaction
+  Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+  Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250);
+  Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(compaction);
+  // Validate that its a delete triggered compaction
+  ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+            compaction->compaction_reason());
+  ASSERT_EQ(3, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+
+  AddVersionStorage();
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction2(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_FALSE(compaction2);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) {
+  // The case where universal periodic compaction can be picked
+  // with some newer files being compacted.
+  const uint64_t kFileSize = 100000;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+
+  bool input_level_overlap = false;
+  bool output_level_overlap = false;
+  // Let's mark 2 files in 2 different levels for compaction. The
+  // compaction picker will randomly pick one, so use the sync point to
+  // ensure a deterministic order. Loop until both cases are covered
+  size_t random_index = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionPicker::PickFilesMarkedForCompaction", [&](void* arg) {
+        size_t* index = static_cast<size_t*>(arg);
+        *index = random_index;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  while (!input_level_overlap || !output_level_overlap) {
+    // Ensure that the L0 file gets picked first
+    random_index = !input_level_overlap ? 0 : 1;
+    UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+    NewVersionStorage(5, kCompactionStyleUniversal);
+
+    Add(0, 1U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+    Add(3, 2U, "010", "020", 2 * kFileSize, 0, 201, 248);
+    Add(3, 3U, "250", "270", 2 * kFileSize, 0, 202, 249);
+    Add(3, 4U, "290", "310", 2 * kFileSize, 0, 203, 250);
+    Add(3, 5U, "310", "320", 2 * kFileSize, 0, 204, 251, 0, true);
+    Add(4, 6U, "301", "350", 8 * kFileSize, 0, 101, 150);
+    Add(4, 7U, "501", "750", 8 * kFileSize, 0, 101, 150);
+    UpdateVersionStorageInfo();
+
+    std::unique_ptr<Compaction> compaction(
+        universal_compaction_picker.PickCompaction(
+            cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+            &log_buffer_));
+
+    ASSERT_TRUE(compaction);
+    // Validate that its a delete triggered compaction
+    ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+              compaction->compaction_reason());
+    ASSERT_TRUE(compaction->start_level() == 0 ||
+                compaction->start_level() == 3);
+    if (compaction->start_level() == 0) {
+      // The L0 file was picked. The next compaction will detect an
+      // overlap on its input level
+      input_level_overlap = true;
+      ASSERT_EQ(3, compaction->output_level());
+      ASSERT_EQ(1U, compaction->num_input_files(0));
+      ASSERT_EQ(3U, compaction->num_input_files(1));
+    } else {
+      // The level 3 file was picked. The next compaction will pick
+      // the L0 file and will detect overlap when adding output
+      // level inputs
+      output_level_overlap = true;
+      ASSERT_EQ(4, compaction->output_level());
+      ASSERT_EQ(2U, compaction->num_input_files(0));
+      ASSERT_EQ(1U, compaction->num_input_files(1));
+    }
+
+    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+    // After recomputing the compaction score, only one marked file will remain
+    random_index = 0;
+    std::unique_ptr<Compaction> compaction2(
+        universal_compaction_picker.PickCompaction(
+            cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+            &log_buffer_));
+    ASSERT_FALSE(compaction2);
+    DeleteVersionStorage();
+  }
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) {
+  const uint64_t kFileSize = 100000;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  // This test covers the case where a delete triggered compaction is
+  // scheduled and should result in a full compaction
+  NewVersionStorage(1, kCompactionStyleUniversal);
+
+  // Mark file number 4 for compaction
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+  Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+  Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(compaction);
+  // Validate that its a delete triggered compaction
+  ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+            compaction->compaction_reason());
+  ASSERT_EQ(0, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(4U, compaction->num_input_files(0));
+  ASSERT_TRUE(file_map_[4].first->being_compacted);
+  ASSERT_TRUE(file_map_[5].first->being_compacted);
+  ASSERT_TRUE(file_map_[3].first->being_compacted);
+  ASSERT_TRUE(file_map_[6].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) {
+  const uint64_t kFileSize = 100000;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  // This test covers the case where a file is being compacted, and a
+  // delete triggered compaction is then scheduled. The latter should stop
+  // at the first file being compacted
+  NewVersionStorage(1, kCompactionStyleUniversal);
+
+  // Mark file number 4 for compaction
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+  Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+  Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+  UpdateVersionStorageInfo();
+  file_map_[3].first->being_compacted = true;
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(compaction);
+  // Validate that its a delete triggered compaction
+  ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+            compaction->compaction_reason());
+  ASSERT_EQ(0, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_TRUE(file_map_[4].first->being_compacted);
+  ASSERT_TRUE(file_map_[5].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) {
+  const uint64_t kFileSize = 100000;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  // This test covers the case where a delete triggered compaction is
+  // scheduled first, followed by a "regular" compaction. The latter
+  // should fail
+  NewVersionStorage(1, kCompactionStyleUniversal);
+
+  // Mark file number 5 for compaction
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
+  Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+  Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  ASSERT_TRUE(compaction);
+  // Validate that its a delete triggered compaction
+  ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+            compaction->compaction_reason());
+  ASSERT_EQ(0, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_TRUE(file_map_[5].first->being_compacted);
+  ASSERT_TRUE(file_map_[3].first->being_compacted);
+  ASSERT_TRUE(file_map_[6].first->being_compacted);
+
+  AddVersionStorage();
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction2(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  ASSERT_TRUE(compaction2);
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_TRUE(file_map_[1].first->being_compacted);
+  ASSERT_TRUE(file_map_[2].first->being_compacted);
+  ASSERT_TRUE(file_map_[4].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
+  const uint64_t kFileSize = 100000;
+  const int kNumLevels = 7;
+
+  // This test makes sure the `files_marked_for_compaction_` is updated after
+  // creating manual compaction.
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+
+  // Add 3 files marked for compaction
+  Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, 0, true);
+  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
+  UpdateVersionStorageInfo();
+
+  // All 3 files are marked for compaction
+  ASSERT_EQ(3U, vstorage_->FilesMarkedForCompaction().size());
+
+  bool manual_conflict = false;
+  InternalKey* manual_end = nullptr;
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.CompactRange(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(),
+          nullptr, nullptr, &manual_end, &manual_conflict,
+          std::numeric_limits<uint64_t>::max(), ""));
+
+  ASSERT_TRUE(compaction);
+
+  ASSERT_EQ(CompactionReason::kManualCompaction,
+            compaction->compaction_reason());
+  ASSERT_EQ(kNumLevels - 1, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(3U, compaction->num_input_files(0));
+  ASSERT_TRUE(file_map_[3].first->being_compacted);
+  ASSERT_TRUE(file_map_[4].first->being_compacted);
+  ASSERT_TRUE(file_map_[5].first->being_compacted);
+
+  // After creating the manual compaction, all files should be cleared from
+  // `FilesMarkedForCompaction`. So they won't be picked by others.
+  ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size());
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) {
+  // This test make sure size amplification compaction could still be triggered
+  // if the last sorted run is not the last level.
+  const uint64_t kFileSize = 100000;
+  const int kNumLevels = 7;
+  const int kLastLevel = kNumLevels - 1;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  ioptions_.preclude_last_level_data_seconds = 1000;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 200;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+  Add(0, 100U, "100", "300", 1 * kFileSize);
+  Add(0, 101U, "200", "400", 1 * kFileSize);
+  Add(4, 90U, "100", "600", 4 * kFileSize);
+  Add(5, 80U, "200", "300", 2 * kFileSize);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // Make sure it's a size amp compaction and includes all files
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kUniversalSizeAmplification);
+  ASSERT_EQ(compaction->output_level(), kLastLevel);
+  ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+  ASSERT_EQ(compaction->input_levels(4)->num_files, 1);
+  ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
+  // This test makes sure the size amp calculation skips the last level (L6), so
+  // size amp compaction is not triggered, instead a size ratio compaction is
+  // triggered.
+  const uint64_t kFileSize = 100000;
+  const int kNumLevels = 7;
+  const int kLastLevel = kNumLevels - 1;
+  const int kPenultimateLevel = kLastLevel - 1;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  ioptions_.preclude_last_level_data_seconds = 1000;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 200;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+  Add(0, 100U, "100", "300", 1 * kFileSize);
+  Add(0, 101U, "200", "400", 1 * kFileSize);
+  Add(5, 90U, "100", "600", 4 * kFileSize);
+  Add(6, 80U, "200", "300", 2 * kFileSize);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // Internally, size amp compaction is evaluated before size ratio compaction.
+  // Here to make sure it's size ratio compaction instead of size amp
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kUniversalSizeRatio);
+  ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1);
+  ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+  ASSERT_EQ(compaction->input_levels(5)->num_files, 0);
+  ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) {
+  // Tiered compaction only support level_num > 2 (otherwise the penultimate
+  // level is going to be level 0, which may make thing more complicated), so
+  // when there's only 2 level, still treating level 1 as the last level for
+  // size amp compaction
+  const uint64_t kFileSize = 100000;
+  const int kNumLevels = 2;
+  const int kLastLevel = kNumLevels - 1;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  ioptions_.preclude_last_level_data_seconds = 1000;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 200;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+  Add(0, 100U, "100", "300", 1 * kFileSize);
+  Add(0, 101U, "200", "400", 1 * kFileSize);
+  Add(0, 90U, "100", "600", 4 * kFileSize);
+  Add(1, 80U, "200", "300", 2 * kFileSize);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // size amp compaction is still triggered even preclude_last_level is set
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kUniversalSizeAmplification);
+  ASSERT_EQ(compaction->output_level(), kLastLevel);
+  ASSERT_EQ(compaction->input_levels(0)->num_files, 3);
+  ASSERT_EQ(compaction->input_levels(1)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
+  // This test makes sure the size amp compaction for tiered storage could still
+  // be triggered, but only for non-last-level files
+  const uint64_t kFileSize = 100000;
+  const int kNumLevels = 7;
+  const int kLastLevel = kNumLevels - 1;
+  const int kPenultimateLevel = kLastLevel - 1;
+
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  ioptions_.preclude_last_level_data_seconds = 1000;
+  mutable_cf_options_.compaction_options_universal
+      .max_size_amplification_percent = 200;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+  Add(0, 100U, "100", "300", 3 * kFileSize);
+  Add(0, 101U, "200", "400", 2 * kFileSize);
+  Add(5, 90U, "100", "600", 2 * kFileSize);
+  Add(6, 80U, "200", "300", 2 * kFileSize);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+
+  // It's a Size Amp compaction, but doesn't include the last level file and
+  // output to the penultimate level.
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kUniversalSizeAmplification);
+  ASSERT_EQ(compaction->output_level(), kPenultimateLevel);
+  ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+  ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+  ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
+TEST_F(CompactionPickerU64TsTest, Overlap) {
+  int num_levels = ioptions_.num_levels;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+  constexpr int level = 0;
+  constexpr uint64_t file_number = 20ULL;
+  constexpr char smallest[4] = "500";
+  constexpr char largest[4] = "600";
+  constexpr uint64_t ts_of_smallest = 12345ULL;
+  constexpr uint64_t ts_of_largest = 56789ULL;
+
+  {
+    std::string ts1;
+    PutFixed64(&ts1, ts_of_smallest);
+    std::string ts2;
+    PutFixed64(&ts2, ts_of_largest);
+    Add(level, file_number, smallest, largest,
+        /*file_size=*/1U, /*path_id=*/0,
+        /*smallest_seq=*/100, /*largest_seq=*/100, /*compensated_file_size=*/0,
+        /*marked_for_compact=*/false, /*temperature=*/Temperature::kUnknown,
+        /*oldest_ancestor_time=*/kUnknownOldestAncesterTime, ts1, ts2);
+    UpdateVersionStorageInfo();
+  }
+
+  std::unordered_set<uint64_t> input{file_number};
+
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input, vstorage_.get(), CompactionOptions()));
+  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+      CompactionOptions(), input_files, level, vstorage_.get(),
+      mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
+
+  {
+    // [600, ts=50000] to [600, ts=50000] is the range to check.
+    // ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) > 0, but
+    // ucmp->CompareWithoutTimestamp(smallest_user_key,
+    //                               c->GetLargestUserKey()) == 0.
+    // Should still be considered overlapping.
+    std::string user_key_with_ts1(largest);
+    PutFixed64(&user_key_with_ts1, ts_of_largest - 1);
+    std::string user_key_with_ts2(largest);
+    PutFixed64(&user_key_with_ts2, ts_of_largest - 1);
+    ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction(
+        user_key_with_ts1, user_key_with_ts2, level));
+  }
+  {
+    // [500, ts=60000] to [500, ts=60000] is the range to check.
+    // ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) < 0, but
+    // ucmp->CompareWithoutTimestamp(largest_user_key,
+    //                               c->GetSmallestUserKey()) == 0.
+    // Should still be considered overlapping.
+    std::string user_key_with_ts1(smallest);
+    PutFixed64(&user_key_with_ts1, ts_of_smallest + 1);
+    std::string user_key_with_ts2(smallest);
+    PutFixed64(&user_key_with_ts2, ts_of_smallest + 1);
+    ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction(
+        user_key_with_ts1, user_key_with_ts2, level));
+  }
+}
+
+TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) {
+  constexpr uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  UpdateVersionStorageInfo();
+  // must return false when there's no files.
+  ASSERT_FALSE(universal_compaction_picker.NeedsCompaction(vstorage_.get()));
+
+  std::string ts1;
+  PutFixed64(&ts1, 9000);
+  std::string ts2;
+  PutFixed64(&ts2, 8000);
+  std::string ts3;
+  PutFixed64(&ts3, 7000);
+  std::string ts4;
+  PutFixed64(&ts4, 6000);
+
+  NewVersionStorage(3, kCompactionStyleUniversal);
+  // A compaction should be triggered and pick file 2
+  Add(1, 1U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100,
+      /*largest_seq=*/100, /*compensated_file_size=*/kFileSize,
+      /*marked_for_compact=*/false, Temperature::kUnknown,
+      kUnknownOldestAncesterTime, ts1, ts2);
+  Add(2, 2U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100,
+      /*largest_seq=*/100, /*compensated_file_size=*/kFileSize,
+      /*marked_for_compact=*/false, Temperature::kUnknown,
+      kUnknownOldestAncesterTime, ts3, ts4);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          &log_buffer_));
+  assert(compaction);
+  ASSERT_TRUE(!compaction->is_trivial_move());
+}
+
+class PerKeyPlacementCompactionPickerTest
+    : public CompactionPickerTest,
+      public testing::WithParamInterface<bool> {
+ public:
+  PerKeyPlacementCompactionPickerTest() : CompactionPickerTest() {}
+
+  void SetUp() override { enable_per_key_placement_ = GetParam(); }
+
+ protected:
+  bool enable_per_key_placement_ = false;
+};
+
+TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+        auto supports_per_key_placement = static_cast<bool*>(arg);
+        *supports_per_key_placement = enable_per_key_placement_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_levels = ioptions_.num_levels;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+  Add(0, 21U, "100", "150", 60000000U);
+  Add(0, 22U, "300", "350", 60000000U);
+  Add(5, 40U, "200", "250", 60000000U);
+  Add(6, 50U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(40);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(21);
+  input_set.insert(22);
+  input_set.insert(50);
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            level_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 6,
+                Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+                                                     0, 6)));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+        auto supports_per_key_placement = static_cast<bool*>(arg);
+        *supports_per_key_placement = enable_per_key_placement_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_levels = ioptions_.num_levels;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+  Add(0, 21U, "100", "150", 60000000U);
+  Add(0, 22U, "300", "350", 60000000U);
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(6, 50U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(21);
+  input_set.insert(22);
+  input_set.insert(50);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(40);
+  input_set.insert(41);
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            level_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+       OverlapWithNormalCompactionUniveral) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+        auto supports_per_key_placement = static_cast<bool*>(arg);
+        *supports_per_key_placement = enable_per_key_placement_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_levels = ioptions_.num_levels;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  Add(0, 21U, "100", "150", 60000000U);
+  Add(0, 22U, "300", "350", 60000000U);
+  Add(5, 40U, "200", "250", 60000000U);
+  Add(6, 50U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(40);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(21);
+  input_set.insert(22);
+  input_set.insert(50);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 6,
+                Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+                                                     0, 6)));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+        auto supports_per_key_placement = static_cast<bool*>(arg);
+        *supports_per_key_placement = enable_per_key_placement_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_levels = ioptions_.num_levels;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  Add(0, 21U, "100", "150", 60000000U);
+  Add(0, 22U, "300", "350", 60000000U);
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(6, 50U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(21);
+  input_set.insert(22);
+  input_set.insert(50);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(40);
+  input_set.insert(41);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
+  // This test is make sure the Tiered compaction would lock whole range of
+  // both output level and penultimate level
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4:   [200, 220]  [230, 250]       [360, 380]
+  // L5:
+  // L6: [101,                    351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(4, 42U, "360", "380", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  // the existing compaction is the 1st L4 file + L6 file
+  // then compaction of the 2nd L4 file to L5 (penultimate level) is overlapped
+  // when the tiered compaction feature is on.
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(40);
+  input_set.insert(60);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(41);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 5, Compaction::kInvalidLevel));
+
+  // compacting the 3rd L4 file is always safe:
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(42);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+      input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) {
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4:   [200, 220]  [230, 250]       [360, 380]
+  // L5:
+  // L6: [101,                    351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(4, 42U, "360", "380", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(60);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  // cannot compact file 41 if the preclude_last_level feature is on, otherwise
+  // compact file 41 is okay.
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(41);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 5, Compaction::kInvalidLevel));
+
+  // compacting the 3rd L4 file is always safe:
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(42);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+      input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+       LastLevelOnlyFailPenultimateUniversal) {
+  // This is to test last_level only compaction still unable to do the
+  // penultimate level compaction if there's already a file in the penultimate
+  // level.
+  // This should rarely happen in universal compaction, as the non-empty L5
+  // should be included in the compaction.
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4:   [200, 220]
+  // L5:              [230, 250]
+  // L6: [101,                    351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(5, 50U, "230", "250", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(60);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  ASSERT_TRUE(comp1);
+  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+  // As comp1 cannot be output to the penultimate level, compacting file 40 to
+  // L5 is always safe.
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(40);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+      input_files, 5, Compaction::kInvalidLevel));
+
+  std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+  ASSERT_TRUE(comp2);
+  ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+       LastLevelOnlyConflictWithOngoingUniversal) {
+  // This is to test last_level only compaction still unable to do the
+  // penultimate level compaction if there's already an ongoing compaction to
+  // the penultimate level
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4:   [200, 220]  [230, 250]       [360, 380]
+  // L5:
+  // L6: [101,                    351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(4, 42U, "360", "380", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  // create an ongoing compaction to L5 (penultimate level)
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(40);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  ASSERT_TRUE(comp1);
+  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(60);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  ASSERT_EQ(enable_per_key_placement_,
+            universal_compaction_picker.FilesRangeOverlapWithCompaction(
+                input_files, 6,
+                Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+                                                     6, 6)));
+
+  if (!enable_per_key_placement_) {
+    std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+        comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+        mutable_db_options_, 0));
+    ASSERT_TRUE(comp2);
+    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+  }
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+       LastLevelOnlyNoConflictWithOngoingUniversal) {
+  // This is similar to `LastLevelOnlyConflictWithOngoingUniversal`, the only
+  // change is the ongoing compaction to L5 has no overlap with the last level
+  // compaction, so it's safe to move data from the last level to the
+  // penultimate level.
+  if (enable_per_key_placement_) {
+    ioptions_.preclude_last_level_data_seconds = 10000;
+  }
+
+  int num_levels = ioptions_.num_levels;
+  ioptions_.compaction_style = kCompactionStyleUniversal;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+  // L4:   [200, 220]  [230, 250]       [360, 380]
+  // L5:
+  // L6: [101,                    351]
+  Add(4, 40U, "200", "220", 60000000U);
+  Add(4, 41U, "230", "250", 60000000U);
+  Add(4, 42U, "360", "380", 60000000U);
+  Add(6, 60U, "101", "351", 60000000U);
+  UpdateVersionStorageInfo();
+
+  // create an ongoing compaction to L5 (penultimate level)
+  CompactionOptions comp_options;
+  std::unordered_set<uint64_t> input_set;
+  input_set.insert(42);
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+
+  ASSERT_TRUE(comp1);
+  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+  input_set.clear();
+  input_files.clear();
+  input_set.insert(60);
+  ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage_.get(), comp_options));
+
+  // always safe to move data up
+  ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+      input_files, 6,
+      Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, 6, 6)));
+
+  // 2 compactions can be run in parallel
+  std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+      mutable_db_options_, 0));
+  ASSERT_TRUE(comp2);
+  if (enable_per_key_placement_) {
+    ASSERT_NE(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+  } else {
+    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest,
+                        PerKeyPlacementCompactionPickerTest, ::testing::Bool());
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc
new file mode 100644
index 000000000..376e4f60f
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc
@@ -0,0 +1,1450 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_universal.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// A helper class that form universal compactions. The class is used by
+// UniversalCompactionPicker::PickCompaction().
+// The usage is to create the class, and get the compaction object by calling
+// PickCompaction().
+class UniversalCompactionBuilder {
+ public:
+  UniversalCompactionBuilder(
+      const ImmutableOptions& ioptions, const InternalKeyComparator* icmp,
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      UniversalCompactionPicker* picker, LogBuffer* log_buffer)
+      : ioptions_(ioptions),
+        icmp_(icmp),
+        cf_name_(cf_name),
+        mutable_cf_options_(mutable_cf_options),
+        mutable_db_options_(mutable_db_options),
+        vstorage_(vstorage),
+        picker_(picker),
+        log_buffer_(log_buffer) {}
+
+  // Form and return the compaction object. The caller owns return object.
+  Compaction* PickCompaction();
+
+ private:
+  struct SortedRun {
+    SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+              uint64_t _compensated_file_size, bool _being_compacted)
+        : level(_level),
+          file(_file),
+          size(_size),
+          compensated_file_size(_compensated_file_size),
+          being_compacted(_being_compacted) {
+      assert(compensated_file_size > 0);
+      assert(level != 0 || file != nullptr);
+    }
+
+    void Dump(char* out_buf, size_t out_buf_size,
+              bool print_path = false) const;
+
+    // sorted_run_count is added into the string to print
+    void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+                      size_t sorted_run_count) const;
+
+    int level;
+    // `file` Will be null for level > 0. For level = 0, the sorted run is
+    // for this file.
+    FileMetaData* file;
+    // For level > 0, `size` and `compensated_file_size` are sum of sizes all
+    // files in the level. `being_compacted` should be the same for all files
+    // in a non-zero level. Use the value here.
+    uint64_t size;
+    uint64_t compensated_file_size;
+    bool being_compacted;
+  };
+
+  // Pick Universal compaction to limit read amplification
+  Compaction* PickCompactionToReduceSortedRuns(
+      unsigned int ratio, unsigned int max_number_of_files_to_compact);
+
+  // Pick Universal compaction to limit space amplification.
+  Compaction* PickCompactionToReduceSizeAmp();
+
+  // Try to pick incremental compaction to reduce space amplification.
+  // It will return null if it cannot find a fanout within the threshold.
+  // Fanout is defined as
+  //    total size of files to compact at output level
+  //  --------------------------------------------------
+  //    total size of files to compact at other levels
+  Compaction* PickIncrementalForReduceSizeAmp(double fanout_threshold);
+
+  Compaction* PickDeleteTriggeredCompaction();
+
+  // Form a compaction from the sorted run indicated by start_index to the
+  // oldest sorted run.
+  // The caller is responsible for making sure that those files are not in
+  // compaction.
+  Compaction* PickCompactionToOldest(size_t start_index,
+                                     CompactionReason compaction_reason);
+
+  Compaction* PickCompactionWithSortedRunRange(
+      size_t start_index, size_t end_index, CompactionReason compaction_reason);
+
+  // Try to pick periodic compaction. The caller should only call it
+  // if there is at least one file marked for periodic compaction.
+  // null will be returned if no such a compaction can be formed
+  // because some files are being compacted.
+  Compaction* PickPeriodicCompaction();
+
+  // Used in universal compaction when the allow_trivial_move
+  // option is set. Checks whether there are any overlapping files
+  // in the input. Returns true if the input files are non
+  // overlapping.
+  bool IsInputFilesNonOverlapping(Compaction* c);
+
+  uint64_t GetMaxOverlappingBytes() const;
+
+  const ImmutableOptions& ioptions_;
+  const InternalKeyComparator* icmp_;
+  double score_;
+  std::vector<SortedRun> sorted_runs_;
+  const std::string& cf_name_;
+  const MutableCFOptions& mutable_cf_options_;
+  const MutableDBOptions& mutable_db_options_;
+  VersionStorageInfo* vstorage_;
+  UniversalCompactionPicker* picker_;
+  LogBuffer* log_buffer_;
+
+  static std::vector<SortedRun> CalculateSortedRuns(
+      const VersionStorageInfo& vstorage);
+
+  // Pick a path ID to place a newly generated file, with its estimated file
+  // size.
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            const MutableCFOptions& mutable_cf_options,
+                            uint64_t file_size);
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of min heap
+// that contains the file meta data, the level of the file
+// and the index of the file in that level
+
+struct InputFileInfo {
+  InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+  FileMetaData* f;
+  size_t level;
+  size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of min heap
+// based on the smallest key of the file.
+struct SmallestKeyHeapComparator {
+  explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+  bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+    return (ucmp_->CompareWithoutTimestamp(i1.f->smallest.user_key(),
+                                           i2.f->smallest.user_key()) > 0);
+  }
+
+ private:
+  const Comparator* ucmp_;
+};
+
+using SmallestKeyHeap =
+    std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+                        SmallestKeyHeapComparator>;
+
+// This function creates the heap that is used to find if the files are
+// overlapping during universal compaction when the allow_trivial_move
+// is set.
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+  SmallestKeyHeap smallest_key_priority_q =
+      SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+  InputFileInfo input_file;
+
+  for (size_t l = 0; l < c->num_input_levels(); l++) {
+    if (c->num_input_files(l) != 0) {
+      if (l == 0 && c->start_level() == 0) {
+        for (size_t i = 0; i < c->num_input_files(0); i++) {
+          input_file.f = c->input(0, i);
+          input_file.level = 0;
+          input_file.index = i;
+          smallest_key_priority_q.push(std::move(input_file));
+        }
+      } else {
+        input_file.f = c->input(l, 0);
+        input_file.level = l;
+        input_file.index = 0;
+        smallest_key_priority_q.push(std::move(input_file));
+      }
+    }
+  }
+  return smallest_key_priority_q;
+}
+
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff. `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+                             SequenceNumber* smallest_seqno,
+                             SequenceNumber* largest_seqno) {
+  bool is_first = true;
+  for (FileMetaData* f : files) {
+    assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+    if (is_first) {
+      is_first = false;
+      *smallest_seqno = f->fd.smallest_seqno;
+      *largest_seqno = f->fd.largest_seqno;
+    } else {
+      if (f->fd.smallest_seqno < *smallest_seqno) {
+        *smallest_seqno = f->fd.smallest_seqno;
+      }
+      if (f->fd.largest_seqno > *largest_seqno) {
+        *largest_seqno = f->fd.largest_seqno;
+      }
+    }
+  }
+}
+#endif
+}  // namespace
+
+// Algorithm that checks to see if there are any overlapping
+// files in the input
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+  auto comparator = icmp_->user_comparator();
+  int first_iter = 1;
+
+  InputFileInfo prev, curr, next;
+
+  SmallestKeyHeap smallest_key_priority_q =
+      create_level_heap(c, icmp_->user_comparator());
+
+  while (!smallest_key_priority_q.empty()) {
+    curr = smallest_key_priority_q.top();
+    smallest_key_priority_q.pop();
+
+    if (first_iter) {
+      prev = curr;
+      first_iter = 0;
+    } else {
+      if (comparator->CompareWithoutTimestamp(
+              prev.f->largest.user_key(), curr.f->smallest.user_key()) >= 0) {
+        // found overlapping files, return false
+        return false;
+      }
+      assert(comparator->CompareWithoutTimestamp(
+                 curr.f->largest.user_key(), prev.f->largest.user_key()) > 0);
+      prev = curr;
+    }
+
+    next.f = nullptr;
+
+    if (c->level(curr.level) != 0 &&
+        curr.index < c->num_input_files(curr.level) - 1) {
+      next.f = c->input(curr.level, curr.index + 1);
+      next.level = curr.level;
+      next.index = curr.index + 1;
+    }
+
+    if (next.f) {
+      smallest_key_priority_q.push(std::move(next));
+    }
+  }
+  return true;
+}
+
+bool UniversalCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  if (vstorage->CompactionScore(kLevel0) >= 1) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+    return true;
+  }
+  if (!vstorage->FilesMarkedForCompaction().empty()) {
+    return true;
+  }
+  return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) {
+  UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+                                     mutable_cf_options, mutable_db_options,
+                                     vstorage, this, log_buffer);
+  return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+                                                 size_t out_buf_size,
+                                                 bool print_path) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    if (file->fd.GetPathId() == 0 || !print_path) {
+      snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+    } else {
+      snprintf(out_buf, out_buf_size,
+               "file %" PRIu64
+               "(path "
+               "%" PRIu32 ")",
+               file->fd.GetNumber(), file->fd.GetPathId());
+    }
+  } else {
+    snprintf(out_buf, out_buf_size, "level %d", level);
+  }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+    char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+  if (level == 0) {
+    assert(file != nullptr);
+    snprintf(out_buf, out_buf_size,
+             "file %" PRIu64 "[%" ROCKSDB_PRIszt
+             "] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+             file->compensated_file_size);
+  } else {
+    snprintf(out_buf, out_buf_size,
+             "level %d[%" ROCKSDB_PRIszt
+             "] "
+             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+             level, sorted_run_count, size, compensated_file_size);
+  }
+}
+
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+    const VersionStorageInfo& vstorage) {
+  std::vector<UniversalCompactionBuilder::SortedRun> ret;
+  for (FileMetaData* f : vstorage.LevelFiles(0)) {
+    ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+                     f->being_compacted);
+  }
+  for (int level = 1; level < vstorage.num_levels(); level++) {
+    uint64_t total_compensated_size = 0U;
+    uint64_t total_size = 0U;
+    bool being_compacted = false;
+    for (FileMetaData* f : vstorage.LevelFiles(level)) {
+      total_compensated_size += f->compensated_file_size;
+      total_size += f->fd.GetFileSize();
+      // Size amp, read amp and periodic compactions always include all files
+      // for a non-zero level. However, a delete triggered compaction and
+      // a trivial move might pick a subset of files in a sorted run. So
+      // always check all files in a sorted run and mark the entire run as
+      // being compacted if one or more files are being compacted
+      if (f->being_compacted) {
+        being_compacted = f->being_compacted;
+      }
+    }
+    if (total_compensated_size > 0) {
+      ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+                       being_compacted);
+    }
+  }
+  return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+  const int kLevel0 = 0;
+  score_ = vstorage_->CompactionScore(kLevel0);
+  sorted_runs_ = CalculateSortedRuns(*vstorage_);
+
+  if (sorted_runs_.size() == 0 ||
+      (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+       vstorage_->FilesMarkedForCompaction().empty() &&
+       sorted_runs_.size() < (unsigned int)mutable_cf_options_
+                                 .level0_file_num_compaction_trigger)) {
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+                     cf_name_.c_str());
+    TEST_SYNC_POINT_CALLBACK(
+        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+    return nullptr;
+  }
+  VersionStorageInfo::LevelSummaryStorage tmp;
+  ROCKS_LOG_BUFFER_MAX_SZ(
+      log_buffer_, 3072,
+      "[%s] Universal: sorted runs: %" ROCKSDB_PRIszt " files: %s\n",
+      cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
+  Compaction* c = nullptr;
+  // Periodic compaction has higher priority than other type of compaction
+  // because it's a hard requirement.
+  if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+    // Always need to do a full compaction for periodic compaction.
+    c = PickPeriodicCompaction();
+  }
+
+  // Check for size amplification.
+  if (c == nullptr &&
+      sorted_runs_.size() >=
+          static_cast<size_t>(
+              mutable_cf_options_.level0_file_num_compaction_trigger)) {
+    if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
+                       cf_name_.c_str());
+    } else {
+      // Size amplification is within limits. Try reducing read
+      // amplification while maintaining file size ratios.
+      unsigned int ratio =
+          mutable_cf_options_.compaction_options_universal.size_ratio;
+
+      if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+        ROCKS_LOG_BUFFER(log_buffer_,
+                         "[%s] Universal: compacting for size ratio\n",
+                         cf_name_.c_str());
+      } else {
+        // Size amplification and file size ratios are within configured limits.
+        // If max read amplification is exceeding configured limits, then force
+        // compaction without looking at filesize ratios and try to reduce
+        // the number of files to fewer than level0_file_num_compaction_trigger.
+        // This is guaranteed by NeedsCompaction()
+        assert(sorted_runs_.size() >=
+               static_cast<size_t>(
+                   mutable_cf_options_.level0_file_num_compaction_trigger));
+        // Get the total number of sorted runs that are not being compacted
+        int num_sr_not_compacted = 0;
+        for (size_t i = 0; i < sorted_runs_.size(); i++) {
+          if (sorted_runs_[i].being_compacted == false) {
+            num_sr_not_compacted++;
+          }
+        }
+
+        // The number of sorted runs that are not being compacted is greater
+        // than the maximum allowed number of sorted runs
+        if (num_sr_not_compacted >
+            mutable_cf_options_.level0_file_num_compaction_trigger) {
+          unsigned int num_files =
+              num_sr_not_compacted -
+              mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+          if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+              nullptr) {
+            ROCKS_LOG_BUFFER(log_buffer_,
+                             "[%s] Universal: compacting for file num -- %u\n",
+                             cf_name_.c_str(), num_files);
+          }
+        }
+      }
+    }
+  }
+
+  if (c == nullptr) {
+    if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: delete triggered compaction\n",
+                       cf_name_.c_str());
+    }
+  }
+
+  if (c == nullptr) {
+    TEST_SYNC_POINT_CALLBACK(
+        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+    return nullptr;
+  }
+
+  if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+          true &&
+      c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+    c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+  }
+
+// validate that all the chosen files of L0 are non overlapping in time
+#ifndef NDEBUG
+  bool is_first = true;
+
+  size_t level_index = 0U;
+  if (c->start_level() == 0) {
+    for (auto f : *c->inputs(0)) {
+      assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+      if (is_first) {
+        is_first = false;
+      }
+    }
+    level_index = 1U;
+  }
+  for (; level_index < c->num_input_levels(); level_index++) {
+    if (c->num_input_files(level_index) != 0) {
+      SequenceNumber smallest_seqno = 0U;
+      SequenceNumber largest_seqno = 0U;
+      GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+                              &largest_seqno);
+      if (is_first) {
+        is_first = false;
+      }
+    }
+  }
+#endif
+  // update statistics
+  size_t num_files = 0;
+  for (auto& each_level : *c->inputs()) {
+    num_files += each_level.files.size();
+  }
+  RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
+
+  picker_->RegisterCompaction(c);
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+  TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+                           c);
+  return c;
+}
+
+uint32_t UniversalCompactionBuilder::GetPathId(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+  // Two conditions need to be satisfied:
+  // (1) the target path needs to be able to hold the file's size
+  // (2) Total size left in this and previous paths need to be not
+  //     smaller than expected future file size before this new file is
+  //     compacted, which is estimated based on size_ratio.
+  // For example, if now we are compacting files of size (1, 1, 2, 4, 8),
+  // we will make sure the target file, probably with size of 16, will be
+  // placed in a path so that eventually when new files are generated and
+  // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
+  // before the path we chose.
+  //
+  // TODO(sdong): now the case of multiple column families is not
+  // considered in this algorithm. So the target size can be violated in
+  // that case. We need to improve it.
+  uint64_t accumulated_size = 0;
+  uint64_t future_size =
+      file_size *
+      (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100;
+  uint32_t p = 0;
+  assert(!ioptions.cf_paths.empty());
+  for (; p < ioptions.cf_paths.size() - 1; p++) {
+    uint64_t target_size = ioptions.cf_paths[p].target_size;
+    if (target_size > file_size &&
+        accumulated_size + (target_size - file_size) > future_size) {
+      return p;
+    }
+    accumulated_size += target_size;
+  }
+  return p;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
+    unsigned int ratio, unsigned int max_number_of_files_to_compact) {
+  unsigned int min_merge_width =
+      mutable_cf_options_.compaction_options_universal.min_merge_width;
+  unsigned int max_merge_width =
+      mutable_cf_options_.compaction_options_universal.max_merge_width;
+
+  const SortedRun* sr = nullptr;
+  bool done = false;
+  size_t start_index = 0;
+  unsigned int candidate_count = 0;
+
+  unsigned int max_files_to_compact =
+      std::min(max_merge_width, max_number_of_files_to_compact);
+  min_merge_width = std::max(min_merge_width, 2U);
+
+  // Caller checks the size before executing this function. This invariant is
+  // important because otherwise we may have a possible integer underflow when
+  // dealing with unsigned types.
+  assert(sorted_runs_.size() > 0);
+
+  // Considers a candidate file only if it is smaller than the
+  // total size accumulated so far.
+  for (size_t loop = 0; loop < sorted_runs_.size(); loop++) {
+    candidate_count = 0;
+
+    // Skip files that are already being compacted
+    for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
+      sr = &sorted_runs_[loop];
+
+      if (!sr->being_compacted) {
+        candidate_count = 1;
+        break;
+      }
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf));
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: %s"
+                       "[%d] being compacted, skipping",
+                       cf_name_.c_str(), file_num_buf, loop);
+
+      sr = nullptr;
+    }
+
+    // This file is not being compacted. Consider it as the
+    // first candidate to be compacted.
+    uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
+    if (sr != nullptr) {
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: Possible candidate %s[%d].",
+                       cf_name_.c_str(), file_num_buf, loop);
+    }
+
+    // Check if the succeeding files need compaction.
+    for (size_t i = loop + 1;
+         candidate_count < max_files_to_compact && i < sorted_runs_.size();
+         i++) {
+      const SortedRun* succeeding_sr = &sorted_runs_[i];
+      if (succeeding_sr->being_compacted) {
+        break;
+      }
+      // Pick files if the total/last candidate file size (increased by the
+      // specified ratio) is still larger than the next candidate file.
+      // candidate_size is the total size of files picked so far with the
+      // default kCompactionStopStyleTotalSize; with
+      // kCompactionStopStyleSimilarSize, it's simply the size of the last
+      // picked file.
+      double sz = candidate_size * (100.0 + ratio) / 100.0;
+      if (sz < static_cast<double>(succeeding_sr->size)) {
+        break;
+      }
+      if (mutable_cf_options_.compaction_options_universal.stop_style ==
+          kCompactionStopStyleSimilarSize) {
+        // Similar-size stopping rule: also check the last picked file isn't
+        // far larger than the next candidate file.
+        sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+        if (sz < static_cast<double>(candidate_size)) {
+          // If the small file we've encountered begins a run of similar-size
+          // files, we'll pick them up on a future iteration of the outer
+          // loop. If it's some lonely straggler, it'll eventually get picked
+          // by the last-resort read amp strategy which disregards size ratios.
+          break;
+        }
+        candidate_size = succeeding_sr->compensated_file_size;
+      } else {  // default kCompactionStopStyleTotalSize
+        candidate_size += succeeding_sr->compensated_file_size;
+      }
+      candidate_count++;
+    }
+
+    // Found a series of consecutive files that need compaction.
+    if (candidate_count >= (unsigned int)min_merge_width) {
+      start_index = loop;
+      done = true;
+      break;
+    } else {
+      for (size_t i = loop;
+           i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+        const SortedRun* skipping_sr = &sorted_runs_[i];
+        char file_num_buf[256];
+        skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+        ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+                         cf_name_.c_str(), file_num_buf);
+      }
+    }
+  }
+  if (!done || candidate_count <= 1) {
+    return nullptr;
+  }
+  size_t first_index_after = start_index + candidate_count;
+  // Compression is enabled if files compacted earlier already reached
+  // size ratio of compression.
+  bool enable_compression = true;
+  int ratio_to_compress =
+      mutable_cf_options_.compaction_options_universal.compression_size_percent;
+  if (ratio_to_compress >= 0) {
+    uint64_t total_size = 0;
+    for (auto& sorted_run : sorted_runs_) {
+      total_size += sorted_run.compensated_file_size;
+    }
+
+    uint64_t older_file_size = 0;
+    for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+      older_file_size += sorted_runs_[i].size;
+      if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+        enable_compression = false;
+        break;
+      }
+    }
+  }
+
+  uint64_t estimated_total_size = 0;
+  for (unsigned int i = 0; i < first_index_after; i++) {
+    estimated_total_size += sorted_runs_[i].size;
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  int start_level = sorted_runs_[start_index].level;
+  int output_level;
+  if (first_index_after == sorted_runs_.size()) {
+    output_level = vstorage_->num_levels() - 1;
+  } else if (sorted_runs_[first_index_after].level == 0) {
+    output_level = 0;
+  } else {
+    output_level = sorted_runs_[first_index_after].level - 1;
+  }
+
+  // last level is reserved for the files ingested behind
+  if (ioptions_.allow_ingest_behind &&
+      (output_level == vstorage_->num_levels() - 1)) {
+    assert(output_level > 1);
+    output_level--;
+  }
+
+  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
+  for (size_t i = start_index; i < first_index_after; i++) {
+    auto& picking_sr = sorted_runs_[i];
+    if (picking_sr.level == 0) {
+      FileMetaData* picking_file = picking_sr.file;
+      inputs[0].files.push_back(picking_file);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    char file_num_buf[256];
+    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+                     cf_name_.c_str(), file_num_buf);
+  }
+
+  std::vector<FileMetaData*> grandparents;
+  // Include grandparents for potential file cutting in incremental
+  // mode. It is for aligning file cutting boundaries across levels,
+  // so that subsequent compactions can pick files with aligned
+  // buffer.
+  // Single files are only picked up in incremental mode, so that
+  // there is no need for full range.
+  if (mutable_cf_options_.compaction_options_universal.incremental &&
+      first_index_after < sorted_runs_.size() &&
+      sorted_runs_[first_index_after].level > 1) {
+    grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level);
+  }
+
+  if (output_level != 0 &&
+      picker_->FilesRangeOverlapWithCompaction(
+          inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+                                               start_level, output_level))) {
+    return nullptr;
+  }
+  CompactionReason compaction_reason;
+  if (max_number_of_files_to_compact == UINT_MAX) {
+    compaction_reason = CompactionReason::kUniversalSizeRatio;
+  } else {
+    compaction_reason = CompactionReason::kUniversalSortedRunNum;
+  }
+  return new Compaction(vstorage_, ioptions_, mutable_cf_options_,
+                        mutable_db_options_, std::move(inputs), output_level,
+                        MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                                            kCompactionStyleUniversal),
+                        GetMaxOverlappingBytes(), path_id,
+                        GetCompressionType(vstorage_, mutable_cf_options_,
+                                           output_level, 1, enable_compression),
+                        GetCompressionOptions(mutable_cf_options_, vstorage_,
+                                              output_level, enable_compression),
+                        Temperature::kUnknown,
+                        /* max_subcompactions */ 0, grandparents,
+                        /* is manual */ false, /* trim_ts */ "", score_,
+                        false /* deletion_compaction */,
+                        /* l0_files_might_overlap */ true, compaction_reason);
+}
+
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way upto the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
+  // percentage flexibility while reducing size amplification
+  uint64_t ratio = mutable_cf_options_.compaction_options_universal
+                       .max_size_amplification_percent;
+
+  unsigned int candidate_count = 0;
+  uint64_t candidate_size = 0;
+  size_t start_index = 0;
+  const SortedRun* sr = nullptr;
+
+  assert(!sorted_runs_.empty());
+  if (sorted_runs_.back().being_compacted) {
+    return nullptr;
+  }
+
+  // Skip files that are already being compacted
+  for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+    sr = &sorted_runs_[loop];
+    if (!sr->being_compacted) {
+      start_index = loop;  // Consider this as the first candidate.
+      break;
+    }
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] Universal: skipping %s[%d] compacted %s",
+                     cf_name_.c_str(), file_num_buf, loop,
+                     " cannot be a candidate to reduce size amp.\n");
+    sr = nullptr;
+  }
+
+  if (sr == nullptr) {
+    return nullptr;  // no candidate files
+  }
+  {
+    char file_num_buf[kFormatFileNumberBufSize];
+    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+        cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+  }
+
+  // size of the base sorted run for size amp calculation
+  uint64_t base_sr_size = sorted_runs_.back().size;
+  size_t sr_end_idx = sorted_runs_.size() - 1;
+  // If tiered compaction is enabled and the last sorted run is the last level
+  if (ioptions_.preclude_last_level_data_seconds > 0 &&
+      ioptions_.num_levels > 2 &&
+      sorted_runs_.back().level == ioptions_.num_levels - 1 &&
+      sorted_runs_.size() > 1) {
+    sr_end_idx = sorted_runs_.size() - 2;
+    base_sr_size = sorted_runs_[sr_end_idx].size;
+  }
+
+  // keep adding up all the remaining files
+  for (size_t loop = start_index; loop < sr_end_idx; loop++) {
+    sr = &sorted_runs_[loop];
+    if (sr->being_compacted) {
+      // TODO with incremental compaction is supported, we might want to
+      // schedule some incremental compactions in parallel if needed.
+      char file_num_buf[kFormatFileNumberBufSize];
+      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+      ROCKS_LOG_BUFFER(
+          log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+          cf_name_.c_str(), file_num_buf, start_index,
+          " is already being compacted. No size amp reduction possible.\n");
+      return nullptr;
+    }
+    candidate_size += sr->compensated_file_size;
+    candidate_count++;
+  }
+  if (candidate_count == 0) {
+    return nullptr;
+  }
+
+  // size amplification = percentage of additional size
+  if (candidate_size * 100 < ratio * base_sr_size) {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+        " earliest-file-size %" PRIu64,
+        cf_name_.c_str(), candidate_size, base_sr_size);
+    return nullptr;
+  } else {
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+        " earliest-file-size %" PRIu64,
+        cf_name_.c_str(), candidate_size, base_sr_size);
+  }
+  // Since incremental compaction can't include more than second last
+  // level, it can introduce penalty, compared to full compaction. We
+  // hard code the pentalty to be 80%. If we end up with a compaction
+  // fanout higher than 80% of full level compactions, we fall back
+  // to full level compaction.
+  // The 80% threshold is arbitrary and can be adjusted or made
+  // configurable in the future.
+  // This also prevent the case when compaction falls behind and we
+  // need to compact more levels for compactions to catch up.
+  if (mutable_cf_options_.compaction_options_universal.incremental) {
+    double fanout_threshold = static_cast<double>(base_sr_size) /
+                              static_cast<double>(candidate_size) * 1.8;
+    Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold);
+    if (picked != nullptr) {
+      // As the feature is still incremental, picking incremental compaction
+      // might fail and we will fall bck to compacting full level.
+      return picked;
+    }
+  }
+  return PickCompactionWithSortedRunRange(
+      start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification);
+}
+
+Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
+    double fanout_threshold) {
+  // Try find all potential compactions with total size just over
+  // options.max_compaction_size / 2, and take the one with the lowest
+  // fanout (defined in declaration of the function).
+  // This is done by having a sliding window of the files at the second
+  // lowest level, and keep expanding while finding overlapping in the
+  // last level. Once total size exceeds the size threshold, calculate
+  // the fanout value. And then shrinking from the small side of the
+  // window. Keep doing it until the end.
+  // Finally, we try to include upper level files if they fall into
+  // the range.
+  //
+  // Note that it is a similar problem as leveled compaction's
+  // kMinOverlappingRatio priority, but instead of picking single files
+  // we expand to a target compaction size. The reason is that in
+  // leveled compaction, actual fanout value tends to high, e.g. 10, so
+  // even with single file in down merging level, the extra size
+  // compacted in boundary files is at a lower ratio. But here users
+  // often have size of second last level size to be 1/4, 1/3 or even
+  // 1/2 of the bottommost level, so picking single file in second most
+  // level will cause significant waste, which is not desirable.
+  //
+  // This algorithm has lots of room to improve to pick more efficient
+  // compactions.
+  assert(sorted_runs_.size() >= 2);
+  int second_last_level = sorted_runs_[sorted_runs_.size() - 2].level;
+  if (second_last_level == 0) {
+    // Can't split Level 0.
+    return nullptr;
+  }
+  int output_level = sorted_runs_.back().level;
+  const std::vector<FileMetaData*>& bottom_files =
+      vstorage_->LevelFiles(output_level);
+  const std::vector<FileMetaData*>& files =
+      vstorage_->LevelFiles(second_last_level);
+  assert(!bottom_files.empty());
+  assert(!files.empty());
+
+  //  std::unordered_map<uint64_t, uint64_t> file_to_order;
+
+  int picked_start_idx = 0;
+  int picked_end_idx = 0;
+  double picked_fanout = fanout_threshold;
+
+  // Use half target compaction bytes as anchor to stop growing second most
+  // level files, and reserve growing space for more overlapping bottom level,
+  // clean cut, files from other levels, etc.
+  uint64_t comp_thres_size = mutable_cf_options_.max_compaction_bytes / 2;
+  int start_idx = 0;
+  int bottom_end_idx = 0;
+  int bottom_start_idx = 0;
+  uint64_t non_bottom_size = 0;
+  uint64_t bottom_size = 0;
+  bool end_bottom_size_counted = false;
+  for (int end_idx = 0; end_idx < static_cast<int>(files.size()); end_idx++) {
+    FileMetaData* end_file = files[end_idx];
+
+    // Include bottom most level files smaller than the current second
+    // last level file.
+    int num_skipped = 0;
+    while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+           icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+                          end_file->smallest) < 0) {
+      if (!end_bottom_size_counted) {
+        bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+      }
+      bottom_end_idx++;
+      end_bottom_size_counted = false;
+      num_skipped++;
+    }
+
+    if (num_skipped > 1) {
+      // At least a file in the bottom most level falls into the file gap. No
+      // reason to include the file. We cut the range and start a new sliding
+      // window.
+      start_idx = end_idx;
+    }
+
+    if (start_idx == end_idx) {
+      // new sliding window.
+      non_bottom_size = 0;
+      bottom_size = 0;
+      bottom_start_idx = bottom_end_idx;
+      end_bottom_size_counted = false;
+    }
+
+    non_bottom_size += end_file->fd.file_size;
+
+    // Include all overlapping files in bottom level.
+    while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+           icmp_->Compare(bottom_files[bottom_end_idx]->smallest,
+                          end_file->largest) < 0) {
+      if (!end_bottom_size_counted) {
+        bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+        end_bottom_size_counted = true;
+      }
+      if (icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+                         end_file->largest) > 0) {
+        // next level file cross large boundary of current file.
+        break;
+      }
+      bottom_end_idx++;
+      end_bottom_size_counted = false;
+    }
+
+    if ((non_bottom_size + bottom_size > comp_thres_size ||
+         end_idx == static_cast<int>(files.size()) - 1) &&
+        non_bottom_size > 0) {  // Do we alow 0 size file at all?
+      // If it is a better compaction, remember it in picked* variables.
+      double fanout = static_cast<double>(bottom_size) /
+                      static_cast<double>(non_bottom_size);
+      if (fanout < picked_fanout) {
+        picked_start_idx = start_idx;
+        picked_end_idx = end_idx;
+        picked_fanout = fanout;
+      }
+      // Shrink from the start end to under comp_thres_size
+      while (non_bottom_size + bottom_size > comp_thres_size &&
+             start_idx <= end_idx) {
+        non_bottom_size -= files[start_idx]->fd.file_size;
+        start_idx++;
+        if (start_idx < static_cast<int>(files.size())) {
+          while (bottom_start_idx <= bottom_end_idx &&
+                 icmp_->Compare(bottom_files[bottom_start_idx]->largest,
+                                files[start_idx]->smallest) < 0) {
+            bottom_size -= bottom_files[bottom_start_idx]->fd.file_size;
+            bottom_start_idx++;
+          }
+        }
+      }
+    }
+  }
+
+  if (picked_fanout >= fanout_threshold) {
+    assert(picked_fanout == fanout_threshold);
+    return nullptr;
+  }
+
+  std::vector<CompactionInputFiles> inputs;
+  CompactionInputFiles bottom_level_inputs;
+  CompactionInputFiles second_last_level_inputs;
+  second_last_level_inputs.level = second_last_level;
+  bottom_level_inputs.level = output_level;
+  for (int i = picked_start_idx; i <= picked_end_idx; i++) {
+    if (files[i]->being_compacted) {
+      return nullptr;
+    }
+    second_last_level_inputs.files.push_back(files[i]);
+  }
+  assert(!second_last_level_inputs.empty());
+  if (!picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+                                       &second_last_level_inputs,
+                                       /*next_smallest=*/nullptr)) {
+    return nullptr;
+  }
+  // We might be able to avoid this binary search if we save and expand
+  // from bottom_start_idx and bottom_end_idx, but for now, we use
+  // SetupOtherInputs() for simplicity.
+  int parent_index = -1;  // Create and use bottom_start_idx?
+  if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+                                 &second_last_level_inputs,
+                                 &bottom_level_inputs, &parent_index,
+                                 /*base_index=*/-1)) {
+    return nullptr;
+  }
+
+  // Try to include files in upper levels if they fall into the range.
+  // Since we need to go from lower level up and this is in the reverse
+  // order, compared to level order, we first write to an reversed
+  // data structure and finally copy them to compaction inputs.
+  InternalKey smallest, largest;
+  picker_->GetRange(second_last_level_inputs, &smallest, &largest);
+  std::vector<CompactionInputFiles> inputs_reverse;
+  for (auto it = ++(++sorted_runs_.rbegin()); it != sorted_runs_.rend(); it++) {
+    SortedRun& sr = *it;
+    if (sr.level == 0) {
+      break;
+    }
+    std::vector<FileMetaData*> level_inputs;
+    vstorage_->GetCleanInputsWithinInterval(sr.level, &smallest, &largest,
+                                            &level_inputs);
+    if (!level_inputs.empty()) {
+      inputs_reverse.push_back({});
+      inputs_reverse.back().level = sr.level;
+      inputs_reverse.back().files = level_inputs;
+      picker_->GetRange(inputs_reverse.back(), &smallest, &largest);
+    }
+  }
+  for (auto it = inputs_reverse.rbegin(); it != inputs_reverse.rend(); it++) {
+    inputs.push_back(*it);
+  }
+
+  inputs.push_back(second_last_level_inputs);
+  inputs.push_back(bottom_level_inputs);
+
+  int start_level = Compaction::kInvalidLevel;
+  for (const auto& in : inputs) {
+    if (!in.empty()) {
+      // inputs should already be sorted by level
+      start_level = in.level;
+      break;
+    }
+  }
+
+  // intra L0 compactions outputs could have overlap
+  if (output_level != 0 &&
+      picker_->FilesRangeOverlapWithCompaction(
+          inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+                                               start_level, output_level))) {
+    return nullptr;
+  }
+
+  // TODO support multi paths?
+  uint32_t path_id = 0;
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      GetMaxOverlappingBytes(), path_id,
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+                         true /* enable_compression */),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+                            true /* enable_compression */),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      /* trim_ts */ "", score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ true,
+      CompactionReason::kUniversalSizeAmplification);
+}
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeleteCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+  CompactionInputFiles start_level_inputs;
+  int output_level;
+  std::vector<CompactionInputFiles> inputs;
+  std::vector<FileMetaData*> grandparents;
+
+  if (vstorage_->num_levels() == 1) {
+    // This is single level universal. Since we're basically trying to reclaim
+    // space by processing files marked for compaction due to high tombstone
+    // density, let's do the same thing as compaction to reduce size amp which
+    // has the same goals.
+    int start_index = -1;
+
+    start_level_inputs.level = 0;
+    start_level_inputs.files.clear();
+    output_level = 0;
+    // Find the first file marked for compaction. Ignore the last file
+    for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+      SortedRun* sr = &sorted_runs_[loop];
+      if (sr->being_compacted) {
+        continue;
+      }
+      FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+      if (f->marked_for_compaction) {
+        start_level_inputs.files.push_back(f);
+        start_index =
+            static_cast<int>(loop);  // Consider this as the first candidate.
+        break;
+      }
+    }
+    if (start_index < 0) {
+      // Either no file marked, or they're already being compacted
+      return nullptr;
+    }
+
+    for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) {
+      SortedRun* sr = &sorted_runs_[loop];
+      if (sr->being_compacted) {
+        break;
+      }
+
+      FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+      start_level_inputs.files.push_back(f);
+    }
+    if (start_level_inputs.size() <= 1) {
+      // If only the last file in L0 is marked for compaction, ignore it
+      return nullptr;
+    }
+    inputs.push_back(start_level_inputs);
+  } else {
+    int start_level;
+
+    // For multi-level universal, the strategy is to make this look more like
+    // leveled. We pick one of the files marked for compaction and compact with
+    // overlapping files in the adjacent level.
+    picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
+                                          &output_level, &start_level_inputs);
+    if (start_level_inputs.empty()) {
+      return nullptr;
+    }
+
+    // Pick the first non-empty level after the start_level
+    for (output_level = start_level + 1; output_level < vstorage_->num_levels();
+         output_level++) {
+      if (vstorage_->NumLevelFiles(output_level) != 0) {
+        break;
+      }
+    }
+
+    // If all higher levels are empty, pick the highest level as output level
+    if (output_level == vstorage_->num_levels()) {
+      if (start_level == 0) {
+        output_level = vstorage_->num_levels() - 1;
+      } else {
+        // If start level is non-zero and all higher levels are empty, this
+        // compaction will translate into a trivial move. Since the idea is
+        // to reclaim space and trivial move doesn't help with that, we
+        // skip compaction in this case and return nullptr
+        return nullptr;
+      }
+    }
+    if (ioptions_.allow_ingest_behind &&
+        output_level == vstorage_->num_levels() - 1) {
+      assert(output_level > 1);
+      output_level--;
+    }
+
+    if (output_level != 0) {
+      if (start_level == 0) {
+        if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
+                                            output_level, nullptr)) {
+          return nullptr;
+        }
+      }
+
+      CompactionInputFiles output_level_inputs;
+      int parent_index = -1;
+
+      output_level_inputs.level = output_level;
+      if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+                                     &start_level_inputs, &output_level_inputs,
+                                     &parent_index, -1)) {
+        return nullptr;
+      }
+      inputs.push_back(start_level_inputs);
+      if (!output_level_inputs.empty()) {
+        inputs.push_back(output_level_inputs);
+      }
+      if (picker_->FilesRangeOverlapWithCompaction(
+              inputs, output_level,
+              Compaction::EvaluatePenultimateLevel(
+                  vstorage_, ioptions_, start_level, output_level))) {
+        return nullptr;
+      }
+
+      picker_->GetGrandparents(vstorage_, start_level_inputs,
+                               output_level_inputs, &grandparents);
+    } else {
+      inputs.push_back(start_level_inputs);
+    }
+  }
+
+  uint64_t estimated_total_size = 0;
+  // Use size of the output level as estimated file size
+  for (FileMetaData* f : vstorage_->LevelFiles(output_level)) {
+    estimated_total_size += f->fd.GetFileSize();
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id,
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, grandparents, /* is manual */ false,
+      /* trim_ts */ "", score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ true,
+      CompactionReason::kFilesMarkedForCompaction);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
+    size_t start_index, CompactionReason compaction_reason) {
+  return PickCompactionWithSortedRunRange(start_index, sorted_runs_.size() - 1,
+                                          compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
+    size_t start_index, size_t end_index, CompactionReason compaction_reason) {
+  assert(start_index < sorted_runs_.size());
+
+  // Estimate total file size
+  uint64_t estimated_total_size = 0;
+  for (size_t loop = start_index; loop <= end_index; loop++) {
+    estimated_total_size += sorted_runs_[loop].size;
+  }
+  uint32_t path_id =
+      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+  int start_level = sorted_runs_[start_index].level;
+
+  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs[i].level = start_level + static_cast<int>(i);
+  }
+  for (size_t loop = start_index; loop <= end_index; loop++) {
+    auto& picking_sr = sorted_runs_[loop];
+    if (picking_sr.level == 0) {
+      FileMetaData* f = picking_sr.file;
+      inputs[0].files.push_back(f);
+    } else {
+      auto& files = inputs[picking_sr.level - start_level].files;
+      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+        files.push_back(f);
+      }
+    }
+    std::string comp_reason_print_string;
+    if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+      comp_reason_print_string = "periodic compaction";
+    } else if (compaction_reason ==
+               CompactionReason::kUniversalSizeAmplification) {
+      comp_reason_print_string = "size amp";
+    } else {
+      assert(false);
+      comp_reason_print_string = "unknown: ";
+      comp_reason_print_string.append(
+          std::to_string(static_cast<int>(compaction_reason)));
+    }
+
+    char file_num_buf[256];
+    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+                     cf_name_.c_str(), comp_reason_print_string.c_str(),
+                     file_num_buf);
+  }
+
+  int output_level;
+  if (end_index == sorted_runs_.size() - 1) {
+    // output files at the last level, unless it's reserved
+    output_level = vstorage_->num_levels() - 1;
+    // last level is reserved for the files ingested behind
+    if (ioptions_.allow_ingest_behind) {
+      assert(output_level > 1);
+      output_level--;
+    }
+  } else {
+    // if it's not including all sorted_runs, it can only output to the level
+    // above the `end_index + 1` sorted_run.
+    output_level = sorted_runs_[end_index + 1].level - 1;
+  }
+
+  // intra L0 compactions outputs could have overlap
+  if (output_level != 0 &&
+      picker_->FilesRangeOverlapWithCompaction(
+          inputs, output_level,
+          Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+                                               start_level, output_level))) {
+    return nullptr;
+  }
+
+  // We never check size for
+  // compaction_options_universal.compression_size_percent,
+  // because we always compact all the files, so always compress.
+  return new Compaction(
+      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+      std::move(inputs), output_level,
+      MaxFileSizeForLevel(mutable_cf_options_, output_level,
+                          kCompactionStyleUniversal),
+      GetMaxOverlappingBytes(), path_id,
+      GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+                         true /* enable_compression */),
+      GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+                            true /* enable_compression */),
+      Temperature::kUnknown,
+      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      /* trim_ts */ "", score_, false /* deletion_compaction */,
+      /* l0_files_might_overlap */ true, compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+  ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+                   cf_name_.c_str());
+
+  // In universal compaction, sorted runs contain older data are almost always
+  // generated earlier too. To simplify the problem, we just try to trigger
+  // a full compaction. We start from the oldest sorted run and include
+  // all sorted runs, until we hit a sorted already being compacted.
+  // Since usually the largest (which is usually the oldest) sorted run is
+  // included anyway, doing a full compaction won't increase write
+  // amplification much.
+
+  // Get some information from marked files to check whether a file is
+  // included in the compaction.
+
+  size_t start_index = sorted_runs_.size();
+  while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+    start_index--;
+  }
+  if (start_index == sorted_runs_.size()) {
+    return nullptr;
+  }
+
+  // There is a rare corner case where we can't pick up all the files
+  // because some files are being compacted and we end up with picking files
+  // but none of them need periodic compaction. Unless we simply recompact
+  // the last sorted run (either the last level or last L0 file), we would just
+  // execute the compaction, in order to simplify  the logic.
+  if (start_index == sorted_runs_.size() - 1) {
+    bool included_file_marked = false;
+    int start_level = sorted_runs_[start_index].level;
+    FileMetaData* start_file = sorted_runs_[start_index].file;
+    for (const std::pair<int, FileMetaData*>& level_file_pair :
+         vstorage_->FilesMarkedForPeriodicCompaction()) {
+      if (start_level != 0) {
+        // Last sorted run is a level
+        if (start_level == level_file_pair.first) {
+          included_file_marked = true;
+          break;
+        }
+      } else {
+        // Last sorted run is a L0 file.
+        if (start_file == level_file_pair.second) {
+          included_file_marked = true;
+          break;
+        }
+      }
+    }
+    if (!included_file_marked) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: Cannot form a compaction covering file "
+                       "marked for periodic compaction",
+                       cf_name_.c_str());
+      return nullptr;
+    }
+  }
+
+  Compaction* c = PickCompactionToOldest(start_index,
+                                         CompactionReason::kPeriodicCompaction);
+
+  TEST_SYNC_POINT_CALLBACK(
+      "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);
+
+  return c;
+}
+
+uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const {
+  if (!mutable_cf_options_.compaction_options_universal.incremental) {
+    return std::numeric_limits<uint64_t>::max();
+  } else {
+    // Try to align cutting boundary with files at the next level if the
+    // file isn't end up with 1/2 of target size, or it would overlap
+    // with two full size files at the next level.
+    return mutable_cf_options_.target_file_size_base / 2 * 3;
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h
new file mode 100644
index 000000000..5f897cc9b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.h
@@ -0,0 +1,32 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+  UniversalCompactionPicker(const ImmutableOptions& ioptions,
+                            const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer,
+      SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+  virtual int MaxOutputLevel() const override { return NumberLevels() - 1; }
+
+  virtual bool NeedsCompaction(
+      const VersionStorageInfo* vstorage) const override;
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_service_job.cc b/src/rocksdb/db/compaction/compaction_service_job.cc
new file mode 100644
index 000000000..1d2e99d99
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_service_job.cc
@@ -0,0 +1,829 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_job.h"
+#include "db/compaction/compaction_state.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/utilities/options_type.h"
+
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+class SubcompactionState;
+
+CompactionServiceJobStatus
+CompactionJob::ProcessKeyValueCompactionWithCompactionService(
+    SubcompactionState* sub_compact) {
+  assert(sub_compact);
+  assert(sub_compact->compaction);
+  assert(db_options_.compaction_service);
+
+  const Compaction* compaction = sub_compact->compaction;
+  CompactionServiceInput compaction_input;
+  compaction_input.output_level = compaction->output_level();
+  compaction_input.db_id = db_id_;
+
+  const std::vector<CompactionInputFiles>& inputs =
+      *(compact_->compaction->inputs());
+  for (const auto& files_per_level : inputs) {
+    for (const auto& file : files_per_level.files) {
+      compaction_input.input_files.emplace_back(
+          MakeTableFileName(file->fd.GetNumber()));
+    }
+  }
+  compaction_input.column_family.name =
+      compaction->column_family_data()->GetName();
+  compaction_input.column_family.options =
+      compaction->column_family_data()->GetLatestCFOptions();
+  compaction_input.db_options =
+      BuildDBOptions(db_options_, mutable_db_options_copy_);
+  compaction_input.snapshots = existing_snapshots_;
+  compaction_input.has_begin = sub_compact->start.has_value();
+  compaction_input.begin =
+      compaction_input.has_begin ? sub_compact->start->ToString() : "";
+  compaction_input.has_end = sub_compact->end.has_value();
+  compaction_input.end =
+      compaction_input.has_end ? sub_compact->end->ToString() : "";
+
+  std::string compaction_input_binary;
+  Status s = compaction_input.Write(&compaction_input_binary);
+  if (!s.ok()) {
+    sub_compact->status = s;
+    return CompactionServiceJobStatus::kFailure;
+  }
+
+  std::ostringstream input_files_oss;
+  bool is_first_one = true;
+  for (const auto& file : compaction_input.input_files) {
+    input_files_oss << (is_first_one ? "" : ", ") << file;
+    is_first_one = false;
+  }
+
+  ROCKS_LOG_INFO(
+      db_options_.info_log,
+      "[%s] [JOB %d] Starting remote compaction (output level: %d): %s",
+      compaction_input.column_family.name.c_str(), job_id_,
+      compaction_input.output_level, input_files_oss.str().c_str());
+  CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_,
+                                GetCompactionId(sub_compact), thread_pri_);
+  CompactionServiceJobStatus compaction_status =
+      db_options_.compaction_service->StartV2(info, compaction_input_binary);
+  switch (compaction_status) {
+    case CompactionServiceJobStatus::kSuccess:
+      break;
+    case CompactionServiceJobStatus::kFailure:
+      sub_compact->status = Status::Incomplete(
+          "CompactionService failed to start compaction job.");
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "[%s] [JOB %d] Remote compaction failed to start.",
+                     compaction_input.column_family.name.c_str(), job_id_);
+      return compaction_status;
+    case CompactionServiceJobStatus::kUseLocal:
+      ROCKS_LOG_INFO(
+          db_options_.info_log,
+          "[%s] [JOB %d] Remote compaction fallback to local by API Start.",
+          compaction_input.column_family.name.c_str(), job_id_);
+      return compaction_status;
+    default:
+      assert(false);  // unknown status
+      break;
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "[%s] [JOB %d] Waiting for remote compaction...",
+                 compaction_input.column_family.name.c_str(), job_id_);
+  std::string compaction_result_binary;
+  compaction_status = db_options_.compaction_service->WaitForCompleteV2(
+      info, &compaction_result_binary);
+
+  if (compaction_status == CompactionServiceJobStatus::kUseLocal) {
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "[%s] [JOB %d] Remote compaction fallback to local by API "
+                   "WaitForComplete.",
+                   compaction_input.column_family.name.c_str(), job_id_);
+    return compaction_status;
+  }
+
+  CompactionServiceResult compaction_result;
+  s = CompactionServiceResult::Read(compaction_result_binary,
+                                    &compaction_result);
+
+  if (compaction_status == CompactionServiceJobStatus::kFailure) {
+    if (s.ok()) {
+      if (compaction_result.status.ok()) {
+        sub_compact->status = Status::Incomplete(
+            "CompactionService failed to run the compaction job (even though "
+            "the internal status is okay).");
+      } else {
+        // set the current sub compaction status with the status returned from
+        // remote
+        sub_compact->status = compaction_result.status;
+      }
+    } else {
+      sub_compact->status = Status::Incomplete(
+          "CompactionService failed to run the compaction job (and no valid "
+          "result is returned).");
+      compaction_result.status.PermitUncheckedError();
+    }
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "[%s] [JOB %d] Remote compaction failed.",
+                   compaction_input.column_family.name.c_str(), job_id_);
+    return compaction_status;
+  }
+
+  if (!s.ok()) {
+    sub_compact->status = s;
+    compaction_result.status.PermitUncheckedError();
+    return CompactionServiceJobStatus::kFailure;
+  }
+  sub_compact->status = compaction_result.status;
+
+  std::ostringstream output_files_oss;
+  is_first_one = true;
+  for (const auto& file : compaction_result.output_files) {
+    output_files_oss << (is_first_one ? "" : ", ") << file.file_name;
+    is_first_one = false;
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "[%s] [JOB %d] Receive remote compaction result, output path: "
+                 "%s, files: %s",
+                 compaction_input.column_family.name.c_str(), job_id_,
+                 compaction_result.output_path.c_str(),
+                 output_files_oss.str().c_str());
+
+  if (!s.ok()) {
+    sub_compact->status = s;
+    return CompactionServiceJobStatus::kFailure;
+  }
+
+  for (const auto& file : compaction_result.output_files) {
+    uint64_t file_num = versions_->NewFileNumber();
+    auto src_file = compaction_result.output_path + "/" + file.file_name;
+    auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths,
+                                  file_num, compaction->output_path_id());
+    s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
+    if (!s.ok()) {
+      sub_compact->status = s;
+      return CompactionServiceJobStatus::kFailure;
+    }
+
+    FileMetaData meta;
+    uint64_t file_size;
+    s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
+    if (!s.ok()) {
+      sub_compact->status = s;
+      return CompactionServiceJobStatus::kFailure;
+    }
+    meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
+                             file.smallest_seqno, file.largest_seqno);
+    meta.smallest.DecodeFrom(file.smallest_internal_key);
+    meta.largest.DecodeFrom(file.largest_internal_key);
+    meta.oldest_ancester_time = file.oldest_ancester_time;
+    meta.file_creation_time = file.file_creation_time;
+    meta.marked_for_compaction = file.marked_for_compaction;
+    meta.unique_id = file.unique_id;
+
+    auto cfd = compaction->column_family_data();
+    sub_compact->Current().AddOutput(std::move(meta),
+                                     cfd->internal_comparator(), false, false,
+                                     true, file.paranoid_hash);
+  }
+  sub_compact->compaction_job_stats = compaction_result.stats;
+  sub_compact->Current().SetNumOutputRecords(
+      compaction_result.num_output_records);
+  sub_compact->Current().SetTotalBytes(compaction_result.total_bytes);
+  RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
+  RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
+             compaction_result.bytes_written);
+  return CompactionServiceJobStatus::kSuccess;
+}
+
+std::string CompactionServiceCompactionJob::GetTableFileName(
+    uint64_t file_number) {
+  return MakeTableFileName(output_path_, file_number);
+}
+
+void CompactionServiceCompactionJob::RecordCompactionIOStats() {
+  compaction_result_->bytes_read += IOSTATS(bytes_read);
+  compaction_result_->bytes_written += IOSTATS(bytes_written);
+  CompactionJob::RecordCompactionIOStats();
+}
+
+CompactionServiceCompactionJob::CompactionServiceCompactionJob(
+    int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+    const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
+    VersionSet* versions, const std::atomic<bool>* shutting_down,
+    LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats,
+    InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+    std::vector<SequenceNumber> existing_snapshots,
+    std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+    const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+    const std::atomic<bool>& manual_compaction_canceled,
+    const std::string& db_id, const std::string& db_session_id,
+    std::string output_path,
+    const CompactionServiceInput& compaction_service_input,
+    CompactionServiceResult* compaction_service_result)
+    : CompactionJob(
+          job_id, compaction, db_options, mutable_db_options, file_options,
+          versions, shutting_down, log_buffer, nullptr, output_directory,
+          nullptr, stats, db_mutex, db_error_handler,
+          std::move(existing_snapshots), kMaxSequenceNumber, nullptr, nullptr,
+          std::move(table_cache), event_logger,
+          compaction->mutable_cf_options()->paranoid_file_checks,
+          compaction->mutable_cf_options()->report_bg_io_stats, dbname,
+          &(compaction_service_result->stats), Env::Priority::USER, io_tracer,
+          manual_compaction_canceled, db_id, db_session_id,
+          compaction->column_family_data()->GetFullHistoryTsLow()),
+      output_path_(std::move(output_path)),
+      compaction_input_(compaction_service_input),
+      compaction_result_(compaction_service_result) {}
+
+Status CompactionServiceCompactionJob::Run() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_RUN);
+
+  auto* c = compact_->compaction;
+  assert(c->column_family_data() != nullptr);
+  assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+             compact_->compaction->level()) > 0);
+
+  write_hint_ =
+      c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+  bottommost_level_ = c->bottommost_level();
+
+  Slice begin = compaction_input_.begin;
+  Slice end = compaction_input_.end;
+  compact_->sub_compact_states.emplace_back(
+      c,
+      compaction_input_.has_begin ? std::optional<Slice>(begin)
+                                  : std::optional<Slice>(),
+      compaction_input_.has_end ? std::optional<Slice>(end)
+                                : std::optional<Slice>(),
+      /*sub_job_id*/ 0);
+
+  log_buffer_->FlushBufferToLog();
+  LogCompaction();
+  const uint64_t start_micros = db_options_.clock->NowMicros();
+  // Pick the only sub-compaction we should have
+  assert(compact_->sub_compact_states.size() == 1);
+  SubcompactionState* sub_compact = compact_->sub_compact_states.data();
+
+  ProcessKeyValueCompaction(sub_compact);
+
+  compaction_stats_.stats.micros =
+      db_options_.clock->NowMicros() - start_micros;
+  compaction_stats_.stats.cpu_micros =
+      sub_compact->compaction_job_stats.cpu_micros;
+
+  RecordTimeToHistogram(stats_, COMPACTION_TIME,
+                        compaction_stats_.stats.micros);
+  RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+                        compaction_stats_.stats.cpu_micros);
+
+  Status status = sub_compact->status;
+  IOStatus io_s = sub_compact->io_status;
+
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+
+  if (status.ok()) {
+    constexpr IODebugContext* dbg = nullptr;
+
+    if (output_directory_) {
+      io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg,
+                                                    DirFsyncOptions());
+    }
+  }
+  if (io_status_.ok()) {
+    io_status_ = io_s;
+  }
+  if (status.ok()) {
+    status = io_s;
+  }
+  if (status.ok()) {
+    // TODO: Add verify_table()
+  }
+
+  // Finish up all book-keeping to unify the subcompaction results
+  compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+  UpdateCompactionStats();
+  RecordCompactionIOStats();
+
+  LogFlush(db_options_.info_log);
+  compact_->status = status;
+  compact_->status.PermitUncheckedError();
+
+  // Build compaction result
+  compaction_result_->output_level = compact_->compaction->output_level();
+  compaction_result_->output_path = output_path_;
+  for (const auto& output_file : sub_compact->GetOutputs()) {
+    auto& meta = output_file.meta;
+    compaction_result_->output_files.emplace_back(
+        MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
+        meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
+        meta.largest.Encode().ToString(), meta.oldest_ancester_time,
+        meta.file_creation_time, output_file.validator.GetHash(),
+        meta.marked_for_compaction, meta.unique_id);
+  }
+  InternalStats::CompactionStatsFull compaction_stats;
+  sub_compact->AggregateCompactionStats(compaction_stats);
+  compaction_result_->num_output_records =
+      compaction_stats.stats.num_output_records;
+  compaction_result_->total_bytes = compaction_stats.TotalBytesWritten();
+
+  return status;
+}
+
+void CompactionServiceCompactionJob::CleanupCompaction() {
+  CompactionJob::CleanupCompaction();
+}
+
+// Internal binary format for the input and result data
+enum BinaryFormatVersion : uint32_t {
+  kOptionsString = 1,  // Use string format similar to Option string format
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = {
+    {"name",
+     {offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"options",
+     {offsetof(struct ColumnFamilyDescriptor, options),
+      OptionType::kConfigurable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto cf_options = static_cast<ColumnFamilyOptions*>(addr);
+        return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(),
+                                                value, cf_options);
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto cf_options = static_cast<const ColumnFamilyOptions*>(addr);
+        std::string result;
+        auto status =
+            GetStringFromColumnFamilyOptions(opts, *cf_options, &result);
+        *value = "{" + result + "}";
+        return status;
+      },
+      [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+         const void* addr2, std::string* mismatch) {
+        const auto this_one = static_cast<const ColumnFamilyOptions*>(addr1);
+        const auto that_one = static_cast<const ColumnFamilyOptions*>(addr2);
+        auto this_conf = CFOptionsAsConfigurable(*this_one);
+        auto that_conf = CFOptionsAsConfigurable(*that_one);
+        std::string mismatch_opt;
+        bool result =
+            this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+        if (!result) {
+          *mismatch = name + "." + mismatch_opt;
+        }
+        return result;
+      }}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
+    {"column_family",
+     OptionTypeInfo::Struct(
+         "column_family", &cfd_type_info,
+         offsetof(struct CompactionServiceInput, column_family),
+         OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+    {"db_options",
+     {offsetof(struct CompactionServiceInput, db_options),
+      OptionType::kConfigurable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto options = static_cast<DBOptions*>(addr);
+        return GetDBOptionsFromString(opts, DBOptions(), value, options);
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto options = static_cast<const DBOptions*>(addr);
+        std::string result;
+        auto status = GetStringFromDBOptions(opts, *options, &result);
+        *value = "{" + result + "}";
+        return status;
+      },
+      [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+         const void* addr2, std::string* mismatch) {
+        const auto this_one = static_cast<const DBOptions*>(addr1);
+        const auto that_one = static_cast<const DBOptions*>(addr2);
+        auto this_conf = DBOptionsAsConfigurable(*this_one);
+        auto that_conf = DBOptionsAsConfigurable(*that_one);
+        std::string mismatch_opt;
+        bool result =
+            this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+        if (!result) {
+          *mismatch = name + "." + mismatch_opt;
+        }
+        return result;
+      }}},
+    {"snapshots", OptionTypeInfo::Vector<uint64_t>(
+                      offsetof(struct CompactionServiceInput, snapshots),
+                      OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+                      {0, OptionType::kUInt64T})},
+    {"input_files", OptionTypeInfo::Vector<std::string>(
+                        offsetof(struct CompactionServiceInput, input_files),
+                        OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+                        {0, OptionType::kEncodedString})},
+    {"output_level",
+     {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"db_id",
+     {offsetof(struct CompactionServiceInput, db_id),
+      OptionType::kEncodedString}},
+    {"has_begin",
+     {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"begin",
+     {offsetof(struct CompactionServiceInput, begin),
+      OptionType::kEncodedString, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"has_end",
+     {offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"end",
+     {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    cs_output_file_type_info = {
+        {"file_name",
+         {offsetof(struct CompactionServiceOutputFile, file_name),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"smallest_seqno",
+         {offsetof(struct CompactionServiceOutputFile, smallest_seqno),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"largest_seqno",
+         {offsetof(struct CompactionServiceOutputFile, largest_seqno),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"smallest_internal_key",
+         {offsetof(struct CompactionServiceOutputFile, smallest_internal_key),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"largest_internal_key",
+         {offsetof(struct CompactionServiceOutputFile, largest_internal_key),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"oldest_ancester_time",
+         {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_creation_time",
+         {offsetof(struct CompactionServiceOutputFile, file_creation_time),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"paranoid_hash",
+         {offsetof(struct CompactionServiceOutputFile, paranoid_hash),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"marked_for_compaction",
+         {offsetof(struct CompactionServiceOutputFile, marked_for_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"unique_id",
+         OptionTypeInfo::Array<uint64_t, 2>(
+             offsetof(struct CompactionServiceOutputFile, unique_id),
+             OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+             {0, OptionType::kUInt64T})},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    compaction_job_stats_type_info = {
+        {"elapsed_micros",
+         {offsetof(struct CompactionJobStats, elapsed_micros),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"cpu_micros",
+         {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+        {"num_input_records",
+         {offsetof(struct CompactionJobStats, num_input_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_blobs_read",
+         {offsetof(struct CompactionJobStats, num_blobs_read),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_files",
+         {offsetof(struct CompactionJobStats, num_input_files),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_files_at_output_level",
+         {offsetof(struct CompactionJobStats, num_input_files_at_output_level),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_records",
+         {offsetof(struct CompactionJobStats, num_output_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_files",
+         {offsetof(struct CompactionJobStats, num_output_files),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_files_blob",
+         {offsetof(struct CompactionJobStats, num_output_files_blob),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"is_full_compaction",
+         {offsetof(struct CompactionJobStats, is_full_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"is_manual_compaction",
+         {offsetof(struct CompactionJobStats, is_manual_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_input_bytes",
+         {offsetof(struct CompactionJobStats, total_input_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_blob_bytes_read",
+         {offsetof(struct CompactionJobStats, total_blob_bytes_read),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_output_bytes",
+         {offsetof(struct CompactionJobStats, total_output_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_output_bytes_blob",
+         {offsetof(struct CompactionJobStats, total_output_bytes_blob),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_records_replaced",
+         {offsetof(struct CompactionJobStats, num_records_replaced),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_input_raw_key_bytes",
+         {offsetof(struct CompactionJobStats, total_input_raw_key_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"total_input_raw_value_bytes",
+         {offsetof(struct CompactionJobStats, total_input_raw_value_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_deletion_records",
+         {offsetof(struct CompactionJobStats, num_input_deletion_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_expired_deletion_records",
+         {offsetof(struct CompactionJobStats, num_expired_deletion_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_corrupt_keys",
+         {offsetof(struct CompactionJobStats, num_corrupt_keys),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_write_nanos",
+         {offsetof(struct CompactionJobStats, file_write_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_range_sync_nanos",
+         {offsetof(struct CompactionJobStats, file_range_sync_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_fsync_nanos",
+         {offsetof(struct CompactionJobStats, file_fsync_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_prepare_write_nanos",
+         {offsetof(struct CompactionJobStats, file_prepare_write_nanos),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"smallest_output_key_prefix",
+         {offsetof(struct CompactionJobStats, smallest_output_key_prefix),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"largest_output_key_prefix",
+         {offsetof(struct CompactionJobStats, largest_output_key_prefix),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_single_del_fallthru",
+         {offsetof(struct CompactionJobStats, num_single_del_fallthru),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_single_del_mismatch",
+         {offsetof(struct CompactionJobStats, num_single_del_mismatch),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+};
+
+namespace {
+// this is a helper struct to serialize and deserialize class Status, because
+// Status's members are not public.
+struct StatusSerializationAdapter {
+  uint8_t code;
+  uint8_t subcode;
+  uint8_t severity;
+  std::string message;
+
+  StatusSerializationAdapter() = default;
+  explicit StatusSerializationAdapter(const Status& s) {
+    code = s.code();
+    subcode = s.subcode();
+    severity = s.severity();
+    auto msg = s.getState();
+    message = msg ? msg : "";
+  }
+
+  Status GetStatus() const {
+    return Status{static_cast<Status::Code>(code),
+                  static_cast<Status::SubCode>(subcode),
+                  static_cast<Status::Severity>(severity), message};
+  }
+};
+}  // namespace
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    status_adapter_type_info = {
+        {"code",
+         {offsetof(struct StatusSerializationAdapter, code),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"subcode",
+         {offsetof(struct StatusSerializationAdapter, subcode),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"severity",
+         {offsetof(struct StatusSerializationAdapter, severity),
+          OptionType::kUInt8T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"message",
+         {offsetof(struct StatusSerializationAdapter, message),
+          OptionType::kEncodedString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
+    {"status",
+     {offsetof(struct CompactionServiceResult, status),
+      OptionType::kCustomizable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone,
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto status_obj = static_cast<Status*>(addr);
+        StatusSerializationAdapter adapter;
+        Status s = OptionTypeInfo::ParseType(
+            opts, value, status_adapter_type_info, &adapter);
+        *status_obj = adapter.GetStatus();
+        return s;
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto status_obj = static_cast<const Status*>(addr);
+        StatusSerializationAdapter adapter(*status_obj);
+        std::string result;
+        Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info,
+                                                 &adapter, &result);
+        *value = "{" + result + "}";
+        return s;
+      },
+      [](const ConfigOptions& opts, const std::string& /*name*/,
+         const void* addr1, const void* addr2, std::string* mismatch) {
+        const auto status1 = static_cast<const Status*>(addr1);
+        const auto status2 = static_cast<const Status*>(addr2);
+
+        StatusSerializationAdapter adatper1(*status1);
+        StatusSerializationAdapter adapter2(*status2);
+        return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info,
+                                             &adatper1, &adapter2, mismatch);
+      }}},
+    {"output_files",
+     OptionTypeInfo::Vector<CompactionServiceOutputFile>(
+         offsetof(struct CompactionServiceResult, output_files),
+         OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+         OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0,
+                                OptionVerificationType::kNormal,
+                                OptionTypeFlags::kNone))},
+    {"output_level",
+     {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"output_path",
+     {offsetof(struct CompactionServiceResult, output_path),
+      OptionType::kEncodedString, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"num_output_records",
+     {offsetof(struct CompactionServiceResult, num_output_records),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"total_bytes",
+     {offsetof(struct CompactionServiceResult, total_bytes),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"bytes_read",
+     {offsetof(struct CompactionServiceResult, bytes_read),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"bytes_written",
+     {offsetof(struct CompactionServiceResult, bytes_written),
+      OptionType::kUInt64T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"stats", OptionTypeInfo::Struct(
+                  "stats", &compaction_job_stats_type_info,
+                  offsetof(struct CompactionServiceResult, stats),
+                  OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+};
+
+Status CompactionServiceInput::Read(const std::string& data_str,
+                                    CompactionServiceInput* obj) {
+  if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+    return Status::InvalidArgument("Invalid CompactionServiceInput string");
+  }
+  auto format_version = DecodeFixed32(data_str.data());
+  if (format_version == kOptionsString) {
+    ConfigOptions cf;
+    cf.invoke_prepare_options = false;
+    cf.ignore_unknown_options = true;
+    return OptionTypeInfo::ParseType(
+        cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info,
+        obj);
+  } else {
+    return Status::NotSupported(
+        "Compaction Service Input data version not supported: " +
+        std::to_string(format_version));
+  }
+}
+
+Status CompactionServiceInput::Write(std::string* output) {
+  char buf[sizeof(BinaryFormatVersion)];
+  EncodeFixed32(buf, kOptionsString);
+  output->append(buf, sizeof(BinaryFormatVersion));
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output);
+}
+
+Status CompactionServiceResult::Read(const std::string& data_str,
+                                     CompactionServiceResult* obj) {
+  if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+    return Status::InvalidArgument("Invalid CompactionServiceResult string");
+  }
+  auto format_version = DecodeFixed32(data_str.data());
+  if (format_version == kOptionsString) {
+    ConfigOptions cf;
+    cf.invoke_prepare_options = false;
+    cf.ignore_unknown_options = true;
+    return OptionTypeInfo::ParseType(
+        cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info,
+        obj);
+  } else {
+    return Status::NotSupported(
+        "Compaction Service Result data version not supported: " +
+        std::to_string(format_version));
+  }
+}
+
+Status CompactionServiceResult::Write(std::string* output) {
+  char buf[sizeof(BinaryFormatVersion)];
+  EncodeFixed32(buf, kOptionsString);
+  output->append(buf, sizeof(BinaryFormatVersion));
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output);
+}
+
+#ifndef NDEBUG
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) {
+  std::string mismatch;
+  return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other,
+                                          std::string* mismatch) {
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other,
+                                       mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) {
+  std::string mismatch;
+  return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
+                                         std::string* mismatch) {
+  ConfigOptions cf;
+  cf.invoke_prepare_options = false;
+  return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other,
+                                       mismatch);
+}
+#endif  // NDEBUG
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_service_test.cc b/src/rocksdb/db/compaction/compaction_service_test.cc
new file mode 100644
index 000000000..c475c4e3b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_service_test.cc
@@ -0,0 +1,966 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "table/unique_id_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MyTestCompactionService : public CompactionService {
+ public:
+  MyTestCompactionService(
+      std::string db_path, Options& options,
+      std::shared_ptr<Statistics>& statistics,
+      std::vector<std::shared_ptr<EventListener>>& listeners,
+      std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+          table_properties_collector_factories)
+      : db_path_(std::move(db_path)),
+        options_(options),
+        statistics_(statistics),
+        start_info_("na", "na", "na", 0, Env::TOTAL),
+        wait_info_("na", "na", "na", 0, Env::TOTAL),
+        listeners_(listeners),
+        table_properties_collector_factories_(
+            std::move(table_properties_collector_factories)) {}
+
+  static const char* kClassName() { return "MyTestCompactionService"; }
+
+  const char* Name() const override { return kClassName(); }
+
+  CompactionServiceJobStatus StartV2(
+      const CompactionServiceJobInfo& info,
+      const std::string& compaction_service_input) override {
+    InstrumentedMutexLock l(&mutex_);
+    start_info_ = info;
+    assert(info.db_name == db_path_);
+    jobs_.emplace(info.job_id, compaction_service_input);
+    CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess;
+    if (is_override_start_status_) {
+      return override_start_status_;
+    }
+    return s;
+  }
+
+  CompactionServiceJobStatus WaitForCompleteV2(
+      const CompactionServiceJobInfo& info,
+      std::string* compaction_service_result) override {
+    std::string compaction_input;
+    assert(info.db_name == db_path_);
+    {
+      InstrumentedMutexLock l(&mutex_);
+      wait_info_ = info;
+      auto i = jobs_.find(info.job_id);
+      if (i == jobs_.end()) {
+        return CompactionServiceJobStatus::kFailure;
+      }
+      compaction_input = std::move(i->second);
+      jobs_.erase(i);
+    }
+
+    if (is_override_wait_status_) {
+      return override_wait_status_;
+    }
+
+    CompactionServiceOptionsOverride options_override;
+    options_override.env = options_.env;
+    options_override.file_checksum_gen_factory =
+        options_.file_checksum_gen_factory;
+    options_override.comparator = options_.comparator;
+    options_override.merge_operator = options_.merge_operator;
+    options_override.compaction_filter = options_.compaction_filter;
+    options_override.compaction_filter_factory =
+        options_.compaction_filter_factory;
+    options_override.prefix_extractor = options_.prefix_extractor;
+    options_override.table_factory = options_.table_factory;
+    options_override.sst_partitioner_factory = options_.sst_partitioner_factory;
+    options_override.statistics = statistics_;
+    if (!listeners_.empty()) {
+      options_override.listeners = listeners_;
+    }
+
+    if (!table_properties_collector_factories_.empty()) {
+      options_override.table_properties_collector_factories =
+          table_properties_collector_factories_;
+    }
+
+    OpenAndCompactOptions options;
+    options.canceled = &canceled_;
+
+    Status s = DB::OpenAndCompact(
+        options, db_path_, db_path_ + "/" + std::to_string(info.job_id),
+        compaction_input, compaction_service_result, options_override);
+    if (is_override_wait_result_) {
+      *compaction_service_result = override_wait_result_;
+    }
+    compaction_num_.fetch_add(1);
+    if (s.ok()) {
+      return CompactionServiceJobStatus::kSuccess;
+    } else {
+      return CompactionServiceJobStatus::kFailure;
+    }
+  }
+
+  int GetCompactionNum() { return compaction_num_.load(); }
+
+  CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
+  CompactionServiceJobInfo GetCompactionInfoForWait() { return wait_info_; }
+
+  void OverrideStartStatus(CompactionServiceJobStatus s) {
+    is_override_start_status_ = true;
+    override_start_status_ = s;
+  }
+
+  void OverrideWaitStatus(CompactionServiceJobStatus s) {
+    is_override_wait_status_ = true;
+    override_wait_status_ = s;
+  }
+
+  void OverrideWaitResult(std::string str) {
+    is_override_wait_result_ = true;
+    override_wait_result_ = std::move(str);
+  }
+
+  void ResetOverride() {
+    is_override_wait_result_ = false;
+    is_override_start_status_ = false;
+    is_override_wait_status_ = false;
+  }
+
+  void SetCanceled(bool canceled) { canceled_ = canceled; }
+
+ private:
+  InstrumentedMutex mutex_;
+  std::atomic_int compaction_num_{0};
+  std::map<uint64_t, std::string> jobs_;
+  const std::string db_path_;
+  Options options_;
+  std::shared_ptr<Statistics> statistics_;
+  CompactionServiceJobInfo start_info_;
+  CompactionServiceJobInfo wait_info_;
+  bool is_override_start_status_ = false;
+  CompactionServiceJobStatus override_start_status_ =
+      CompactionServiceJobStatus::kFailure;
+  bool is_override_wait_status_ = false;
+  CompactionServiceJobStatus override_wait_status_ =
+      CompactionServiceJobStatus::kFailure;
+  bool is_override_wait_result_ = false;
+  std::string override_wait_result_;
+  std::vector<std::shared_ptr<EventListener>> listeners_;
+  std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+      table_properties_collector_factories_;
+  std::atomic_bool canceled_{false};
+};
+
+class CompactionServiceTest : public DBTestBase {
+ public:
+  explicit CompactionServiceTest()
+      : DBTestBase("compaction_service_test", true) {}
+
+ protected:
+  void ReopenWithCompactionService(Options* options) {
+    options->env = env_;
+    primary_statistics_ = CreateDBStatistics();
+    options->statistics = primary_statistics_;
+    compactor_statistics_ = CreateDBStatistics();
+
+    compaction_service_ = std::make_shared<MyTestCompactionService>(
+        dbname_, *options, compactor_statistics_, remote_listeners,
+        remote_table_properties_collector_factories);
+    options->compaction_service = compaction_service_;
+    DestroyAndReopen(*options);
+  }
+
+  Statistics* GetCompactorStatistics() { return compactor_statistics_.get(); }
+
+  Statistics* GetPrimaryStatistics() { return primary_statistics_.get(); }
+
+  MyTestCompactionService* GetCompactionService() {
+    CompactionService* cs = compaction_service_.get();
+    return static_cast_with_check<MyTestCompactionService>(cs);
+  }
+
+  void GenerateTestData() {
+    // Generate 20 files @ L2
+    for (int i = 0; i < 20; i++) {
+      for (int j = 0; j < 10; j++) {
+        int key_id = i * 10 + j;
+        ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(2);
+
+    // Generate 10 files @ L1 overlap with all 20 files @ L2
+    for (int i = 0; i < 10; i++) {
+      for (int j = 0; j < 10; j++) {
+        int key_id = i * 20 + j * 2;
+        ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(1);
+    ASSERT_EQ(FilesPerLevel(), "0,10,20");
+  }
+
+  void VerifyTestData() {
+    for (int i = 0; i < 200; i++) {
+      auto result = Get(Key(i));
+      if (i % 2) {
+        ASSERT_EQ(result, "value" + std::to_string(i));
+      } else {
+        ASSERT_EQ(result, "value_new" + std::to_string(i));
+      }
+    }
+  }
+
+  std::vector<std::shared_ptr<EventListener>> remote_listeners;
+  std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+      remote_table_properties_collector_factories;
+
+ private:
+  std::shared_ptr<Statistics> compactor_statistics_;
+  std::shared_ptr<Statistics> primary_statistics_;
+  std::shared_ptr<CompactionService> compaction_service_;
+};
+
+TEST_F(CompactionServiceTest, BasicCompactions) {
+  Options options = CurrentOptions();
+  ReopenWithCompactionService(&options);
+
+  Statistics* primary_statistics = GetPrimaryStatistics();
+  Statistics* compactor_statistics = GetCompactorStatistics();
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // verify result
+  for (int i = 0; i < 200; i++) {
+    auto result = Get(Key(i));
+    if (i % 2) {
+      ASSERT_EQ(result, "value" + std::to_string(i));
+    } else {
+      ASSERT_EQ(result, "value_new" + std::to_string(i));
+    }
+  }
+  auto my_cs = GetCompactionService();
+  ASSERT_GE(my_cs->GetCompactionNum(), 1);
+
+  // make sure the compaction statistics is only recorded on the remote side
+  ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
+  ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+  ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0);
+  // even with remote compaction, primary host still needs to read SST files to
+  // `verify_table()`.
+  ASSERT_GE(primary_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+  // all the compaction write happens on the remote side
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+            compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES));
+  ASSERT_GE(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 1);
+  ASSERT_GT(primary_statistics->getTickerCount(COMPACT_READ_BYTES),
+            primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES));
+  // compactor is already the remote side, which doesn't have remote
+  ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+  ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+            0);
+
+  // Test failed compaction
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) {
+        // override job status
+        auto s = static_cast<Status*>(status);
+        *s = Status::Aborted("MyTestCompactionService failed to compact!");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s;
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      s = Put(Key(key_id), "value_new" + std::to_string(key_id));
+      if (s.IsAborted()) {
+        break;
+      }
+    }
+    if (s.IsAborted()) {
+      break;
+    }
+    s = Flush();
+    if (s.IsAborted()) {
+      break;
+    }
+    s = dbfull()->TEST_WaitForCompact();
+    if (s.IsAborted()) {
+      break;
+    }
+  }
+  ASSERT_TRUE(s.IsAborted());
+
+  // Test re-open and successful unique id verification
+  std::atomic_int verify_passed{0};
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) {
+        // override job status
+        auto id = static_cast<UniqueId64x2*>(arg);
+        assert(*id != kNullUniqueId64x2);
+        verify_passed++;
+      });
+  Reopen(options);
+  ASSERT_GT(verify_passed, 0);
+  Close();
+}
+
+TEST_F(CompactionServiceTest, ManualCompaction) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  uint64_t comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  VerifyTestData();
+
+  start_str = Key(120);
+  start = start_str;
+  comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  VerifyTestData();
+
+  end_str = Key(92);
+  end = end_str;
+  comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  VerifyTestData();
+
+  comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  uint64_t comp_num = my_cs->GetCompactionNum();
+
+  // Test cancel compaction at the beginning
+  my_cs->SetCanceled(true);
+  auto s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_TRUE(s.IsIncomplete());
+  // compaction number is not increased
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+  VerifyTestData();
+
+  // Test cancel compaction in progress
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+  my_cs = GetCompactionService();
+  my_cs->SetCanceled(false);
+
+  std::atomic_bool cancel_issued{false};
+  SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Inprogress",
+                                        [&](void* /*arg*/) {
+                                          cancel_issued = true;
+                                          my_cs->SetCanceled(true);
+                                        });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(cancel_issued);
+  // compaction number is not increased
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+  VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, FailedToStart) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+  my_cs->OverrideStartStatus(CompactionServiceJobStatus::kFailure);
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_TRUE(s.IsIncomplete());
+}
+
+TEST_F(CompactionServiceTest, InvalidResult) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+  my_cs->OverrideWaitResult("Invalid Str");
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_FALSE(s.ok());
+}
+
+TEST_F(CompactionServiceTest, SubCompaction) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = 10;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+  VerifyTestData();
+
+  auto my_cs = GetCompactionService();
+  int compaction_num_before = my_cs->GetCompactionNum();
+
+  auto cro = CompactRangeOptions();
+  cro.max_subcompactions = 10;
+  Status s = db_->CompactRange(cro, nullptr, nullptr);
+  ASSERT_OK(s);
+  VerifyTestData();
+  int compaction_num = my_cs->GetCompactionNum() - compaction_num_before;
+  // make sure there's sub-compaction by checking the compaction number
+  ASSERT_GE(compaction_num, 2);
+}
+
+class PartialDeleteCompactionFilter : public CompactionFilter {
+ public:
+  CompactionFilter::Decision FilterV2(
+      int /*level*/, const Slice& key, ValueType /*value_type*/,
+      const Slice& /*existing_value*/, std::string* /*new_value*/,
+      std::string* /*skip_until*/) const override {
+    int i = std::stoi(key.ToString().substr(3));
+    if (i > 5 && i <= 105) {
+      return CompactionFilter::Decision::kRemove;
+    }
+    return CompactionFilter::Decision::kKeep;
+  }
+
+  const char* Name() const override { return "PartialDeleteCompactionFilter"; }
+};
+
+TEST_F(CompactionServiceTest, CompactionFilter) {
+  Options options = CurrentOptions();
+  std::unique_ptr<CompactionFilter> delete_comp_filter(
+      new PartialDeleteCompactionFilter());
+  options.compaction_filter = delete_comp_filter.get();
+  ReopenWithCompactionService(&options);
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // verify result
+  for (int i = 0; i < 200; i++) {
+    auto result = Get(Key(i));
+    if (i > 5 && i <= 105) {
+      ASSERT_EQ(result, "NOT_FOUND");
+    } else if (i % 2) {
+      ASSERT_EQ(result, "value" + std::to_string(i));
+    } else {
+      ASSERT_EQ(result, "value_new" + std::to_string(i));
+    }
+  }
+  auto my_cs = GetCompactionService();
+  ASSERT_GE(my_cs->GetCompactionNum(), 1);
+}
+
+TEST_F(CompactionServiceTest, Snapshot) {
+  Options options = CurrentOptions();
+  ReopenWithCompactionService(&options);
+
+  ASSERT_OK(Put(Key(1), "value1"));
+  ASSERT_OK(Put(Key(2), "value1"));
+  const Snapshot* s1 = db_->GetSnapshot();
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(1), "value2"));
+  ASSERT_OK(Put(Key(3), "value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  auto my_cs = GetCompactionService();
+  ASSERT_GE(my_cs->GetCompactionNum(), 1);
+  ASSERT_EQ("value1", Get(Key(1), s1));
+  ASSERT_EQ("value2", Get(Key(1)));
+  db_->ReleaseSnapshot(s1);
+}
+
+TEST_F(CompactionServiceTest, ConcurrentCompaction) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 100;
+  options.max_background_jobs = 20;
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+
+  std::vector<std::thread> threads;
+  for (const auto& file : meta.levels[1].files) {
+    threads.emplace_back(std::thread([&]() {
+      std::string fname = file.db_path + "/" + file.name;
+      ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2));
+    }));
+  }
+
+  for (auto& thread : threads) {
+    thread.join();
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // verify result
+  for (int i = 0; i < 200; i++) {
+    auto result = Get(Key(i));
+    if (i % 2) {
+      ASSERT_EQ(result, "value" + std::to_string(i));
+    } else {
+      ASSERT_EQ(result, "value_new" + std::to_string(i));
+    }
+  }
+  auto my_cs = GetCompactionService();
+  ASSERT_EQ(my_cs->GetCompactionNum(), 10);
+  ASSERT_EQ(FilesPerLevel(), "0,0,10");
+}
+
+TEST_F(CompactionServiceTest, CompactionInfo) {
+  Options options = CurrentOptions();
+  ReopenWithCompactionService(&options);
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  auto my_cs =
+      static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+  uint64_t comp_num = my_cs->GetCompactionNum();
+  ASSERT_GE(comp_num, 1);
+
+  CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart();
+  ASSERT_EQ(dbname_, info.db_name);
+  std::string db_id, db_session_id;
+  ASSERT_OK(db_->GetDbIdentity(db_id));
+  ASSERT_EQ(db_id, info.db_id);
+  ASSERT_OK(db_->GetDbSessionId(db_session_id));
+  ASSERT_EQ(db_session_id, info.db_session_id);
+  ASSERT_EQ(Env::LOW, info.priority);
+  info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(dbname_, info.db_name);
+  ASSERT_EQ(db_id, info.db_id);
+  ASSERT_EQ(db_session_id, info.db_session_id);
+  ASSERT_EQ(Env::LOW, info.priority);
+
+  // Test priority USER
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  SstFileMetaData file = meta.levels[1].files[0];
+  ASSERT_OK(db_->CompactFiles(CompactionOptions(),
+                              {file.db_path + "/" + file.name}, 2));
+  info = my_cs->GetCompactionInfoForStart();
+  ASSERT_EQ(Env::USER, info.priority);
+  info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(Env::USER, info.priority);
+
+  // Test priority BOTTOM
+  env_->SetBackgroundThreads(1, Env::BOTTOM);
+  options.num_levels = 2;
+  ReopenWithCompactionService(&options);
+  my_cs =
+      static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  info = my_cs->GetCompactionInfoForStart();
+  ASSERT_EQ(Env::BOTTOM, info.priority);
+  info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(Env::BOTTOM, info.priority);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalAuto) {
+  Options options = CurrentOptions();
+  ReopenWithCompactionService(&options);
+
+  auto my_cs = GetCompactionService();
+  Statistics* compactor_statistics = GetCompactorStatistics();
+  Statistics* primary_statistics = GetPrimaryStatistics();
+  uint64_t compactor_write_bytes =
+      compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+  uint64_t primary_write_bytes =
+      primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+  my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal);
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // verify result
+  for (int i = 0; i < 200; i++) {
+    auto result = Get(Key(i));
+    if (i % 2) {
+      ASSERT_EQ(result, "value" + std::to_string(i));
+    } else {
+      ASSERT_EQ(result, "value_new" + std::to_string(i));
+    }
+  }
+
+  ASSERT_EQ(my_cs->GetCompactionNum(), 0);
+
+  // make sure the compaction statistics is only recorded on the local side
+  ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+            compactor_write_bytes);
+  ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+            primary_write_bytes);
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalManual) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+  VerifyTestData();
+
+  auto my_cs = GetCompactionService();
+  Statistics* compactor_statistics = GetCompactorStatistics();
+  Statistics* primary_statistics = GetPrimaryStatistics();
+  uint64_t compactor_write_bytes =
+      compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+  uint64_t primary_write_bytes =
+      primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+  // re-enable remote compaction
+  my_cs->ResetOverride();
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  uint64_t comp_num = my_cs->GetCompactionNum();
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  // make sure the compaction statistics is only recorded on the remote side
+  ASSERT_GT(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+            compactor_write_bytes);
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+            compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES));
+  ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+            primary_write_bytes);
+
+  // return run local again with API WaitForComplete
+  my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kUseLocal);
+  start_str = Key(120);
+  start = start_str;
+  comp_num = my_cs->GetCompactionNum();
+  compactor_write_bytes =
+      compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+  primary_write_bytes = primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr));
+  ASSERT_EQ(my_cs->GetCompactionNum(),
+            comp_num);  // no remote compaction is run
+  // make sure the compaction statistics is only recorded on the local side
+  ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+            compactor_write_bytes);
+  ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+            primary_write_bytes);
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+            compactor_write_bytes);
+
+  // verify result after 2 manual compactions
+  VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, RemoteEventListener) {
+  class RemoteEventListenerTest : public EventListener {
+   public:
+    const char* Name() const override { return "RemoteEventListenerTest"; }
+
+    void OnSubcompactionBegin(const SubcompactionJobInfo& info) override {
+      auto result = on_going_compactions.emplace(info.job_id);
+      ASSERT_TRUE(result.second);  // make sure there's no duplication
+      compaction_num++;
+      EventListener::OnSubcompactionBegin(info);
+    }
+    void OnSubcompactionCompleted(const SubcompactionJobInfo& info) override {
+      auto num = on_going_compactions.erase(info.job_id);
+      ASSERT_TRUE(num == 1);  // make sure the compaction id exists
+      EventListener::OnSubcompactionCompleted(info);
+    }
+    void OnTableFileCreated(const TableFileCreationInfo& info) override {
+      ASSERT_EQ(on_going_compactions.count(info.job_id), 1);
+      file_created++;
+      EventListener::OnTableFileCreated(info);
+    }
+    void OnTableFileCreationStarted(
+        const TableFileCreationBriefInfo& info) override {
+      ASSERT_EQ(on_going_compactions.count(info.job_id), 1);
+      file_creation_started++;
+      EventListener::OnTableFileCreationStarted(info);
+    }
+
+    bool ShouldBeNotifiedOnFileIO() override {
+      file_io_notified++;
+      return EventListener::ShouldBeNotifiedOnFileIO();
+    }
+
+    std::atomic_uint64_t file_io_notified{0};
+    std::atomic_uint64_t file_creation_started{0};
+    std::atomic_uint64_t file_created{0};
+
+    std::set<int> on_going_compactions;  // store the job_id
+    std::atomic_uint64_t compaction_num{0};
+  };
+
+  auto listener = new RemoteEventListenerTest();
+  remote_listeners.emplace_back(listener);
+
+  Options options = CurrentOptions();
+  ReopenWithCompactionService(&options);
+
+  for (int i = 0; i < 20; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // check the events are triggered
+  ASSERT_TRUE(listener->file_io_notified > 0);
+  ASSERT_TRUE(listener->file_creation_started > 0);
+  ASSERT_TRUE(listener->file_created > 0);
+  ASSERT_TRUE(listener->compaction_num > 0);
+  ASSERT_TRUE(listener->on_going_compactions.empty());
+
+  // verify result
+  for (int i = 0; i < 200; i++) {
+    auto result = Get(Key(i));
+    if (i % 2) {
+      ASSERT_EQ(result, "value" + std::to_string(i));
+    } else {
+      ASSERT_EQ(result, "value_new" + std::to_string(i));
+    }
+  }
+}
+
+TEST_F(CompactionServiceTest, TablePropertiesCollector) {
+  const static std::string kUserPropertyName = "TestCount";
+
+  class TablePropertiesCollectorTest : public TablePropertiesCollector {
+   public:
+    Status Finish(UserCollectedProperties* properties) override {
+      *properties = UserCollectedProperties{
+          {kUserPropertyName, std::to_string(count_)},
+      };
+      return Status::OK();
+    }
+
+    UserCollectedProperties GetReadableProperties() const override {
+      return UserCollectedProperties();
+    }
+
+    const char* Name() const override { return "TablePropertiesCollectorTest"; }
+
+    Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+                      EntryType /*type*/, SequenceNumber /*seq*/,
+                      uint64_t /*file_size*/) override {
+      count_++;
+      return Status::OK();
+    }
+
+   private:
+    uint32_t count_ = 0;
+  };
+
+  class TablePropertiesCollectorFactoryTest
+      : public TablePropertiesCollectorFactory {
+   public:
+    TablePropertiesCollector* CreateTablePropertiesCollector(
+        TablePropertiesCollectorFactory::Context /*context*/) override {
+      return new TablePropertiesCollectorTest();
+    }
+
+    const char* Name() const override {
+      return "TablePropertiesCollectorFactoryTest";
+    }
+  };
+
+  auto factory = new TablePropertiesCollectorFactoryTest();
+  remote_table_properties_collector_factories.emplace_back(factory);
+
+  const int kNumSst = 3;
+  const int kLevel0Trigger = 4;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  ReopenWithCompactionService(&options);
+
+  // generate a few SSTs locally which should not have user property
+  for (int i = 0; i < kNumSst; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  TablePropertiesCollection fname_to_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
+  for (const auto& file_props : fname_to_props) {
+    auto properties = file_props.second->user_collected_properties;
+    auto it = properties.find(kUserPropertyName);
+    ASSERT_EQ(it, properties.end());
+  }
+
+  // trigger compaction
+  for (int i = kNumSst; i < kLevel0Trigger; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
+
+  bool has_user_property = false;
+  for (const auto& file_props : fname_to_props) {
+    auto properties = file_props.second->user_collected_properties;
+    auto it = properties.find(kUserPropertyName);
+    if (it != properties.end()) {
+      has_user_property = true;
+      ASSERT_GT(std::stoi(it->second), 0);
+    }
+  }
+  ASSERT_TRUE(has_user_property);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_state.cc b/src/rocksdb/db/compaction/compaction_state.cc
new file mode 100644
index 000000000..ee4b0c189
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.cc
@@ -0,0 +1,46 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_state.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Slice CompactionState::SmallestUserKey() {
+  for (const auto& sub_compact_state : sub_compact_states) {
+    Slice smallest = sub_compact_state.SmallestUserKey();
+    if (!smallest.empty()) {
+      return smallest;
+    }
+  }
+  // If there is no finished output, return an empty slice.
+  return Slice{nullptr, 0};
+}
+
+Slice CompactionState::LargestUserKey() {
+  for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+       ++it) {
+    Slice largest = it->LargestUserKey();
+    if (!largest.empty()) {
+      return largest;
+    }
+  }
+  // If there is no finished output, return an empty slice.
+  return Slice{nullptr, 0};
+}
+
+void CompactionState::AggregateCompactionStats(
+    InternalStats::CompactionStatsFull& compaction_stats,
+    CompactionJobStats& compaction_job_stats) {
+  for (const auto& sc : sub_compact_states) {
+    sc.AggregateCompactionStats(compaction_stats);
+    compaction_job_stats.Add(sc.compaction_job_stats);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_state.h b/src/rocksdb/db/compaction/compaction_state.h
new file mode 100644
index 000000000..cc5b66c68
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.h
@@ -0,0 +1,42 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/subcompaction_state.h"
+#include "db/internal_stats.h"
+
+// Data structures used for compaction_job and compaction_service_job which has
+// the list of sub_compact_states and the aggregated information for the
+// compaction.
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state for the entire compaction
+class CompactionState {
+ public:
+  Compaction* const compaction;
+
+  // REQUIRED: subcompaction states are stored in order of increasing key-range
+  std::vector<SubcompactionState> sub_compact_states;
+  Status status;
+
+  void AggregateCompactionStats(
+      InternalStats::CompactionStatsFull& compaction_stats,
+      CompactionJobStats& compaction_job_stats);
+
+  explicit CompactionState(Compaction* c) : compaction(c) {}
+
+  Slice SmallestUserKey();
+
+  Slice LargestUserKey();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/file_pri.h b/src/rocksdb/db/compaction/file_pri.h
new file mode 100644
index 000000000..82dddcf93
--- /dev/null
+++ b/src/rocksdb/db/compaction/file_pri.h
@@ -0,0 +1,92 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+#include <algorithm>
+
+#include "db/version_edit.h"
+
+namespace ROCKSDB_NAMESPACE {
+// We boost files that are closer to TTL limit. This boosting could be
+// through FileMetaData.compensated_file_size but this compensated size
+// is widely used as something similar to file size so dramatically boost
+// the value might cause unintended consequences.
+//
+// This boosting algorithm can go very fancy, but here we use a simple
+// formula which can satisify:
+// (1) Different levels are triggered slightly differently to avoid
+//     too many cascading cases
+// (2) Files in the same level get boosting more when TTL gets closer.
+//
+// Don't do any boosting before TTL has past by half. This is to make
+// sure lower write amp for most of the case. And all levels should be
+// fully boosted when total TTL compaction threshold triggers.
+// Differientiate boosting ranges of each level by 1/2. This will make
+// range for each level exponentially increasing. We could do it by
+// having them to be equal, or go even fancier. We can adjust it after
+// we observe the behavior in production.
+// The threshold starting boosting:
+// +------------------------------------------------------------------ +
+// ^                            ^   ^     ^       ^                 ^
+// Age 0                        ... |     |    second last level    thresold
+//                                  |     |
+//                                  |  third last level
+//                                  |
+//                            forth last level
+//
+// We arbitrarily set with 0 when a file is aged boost_age_start and
+// grow linearly. The ratio is arbitrarily set so that when the next level
+// starts to boost, the previous level's boosting amount is 16.
+class FileTtlBooster {
+ public:
+  FileTtlBooster(uint64_t current_time, uint64_t ttl, int num_non_empty_levels,
+                 int level)
+      : current_time_(current_time) {
+    if (ttl == 0 || level == 0 || level >= num_non_empty_levels - 1) {
+      enabled_ = false;
+      boost_age_start_ = 0;
+      boost_step_ = 1;
+    } else {
+      enabled_ = true;
+      uint64_t all_boost_start_age = ttl / 2;
+      uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age;
+      uint64_t boost_age_range =
+          all_boost_age_range >> (num_non_empty_levels - level - 1);
+      boost_age_start_ = all_boost_start_age + boost_age_range;
+      const uint64_t kBoostRatio = 16;
+      // prevent 0 value to avoid divide 0 error.
+      boost_step_ = std::max(boost_age_range / kBoostRatio, uint64_t{1});
+    }
+  }
+
+  uint64_t GetBoostScore(FileMetaData* f) {
+    if (!enabled_) {
+      return 1;
+    }
+    uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+    if (oldest_ancester_time >= current_time_) {
+      return 1;
+    }
+    uint64_t age = current_time_ - oldest_ancester_time;
+    if (age > boost_age_start_) {
+      // Use integer just for convenience.
+      // We could make all file_to_order double if we want.
+      // Technically this can overflow if users override timing and
+      // give a very high current time. Ignore the case for simplicity.
+      // Boosting is addition to current value, so +1. This will effectively
+      // make boosting to kick in after the first boost_step_ is reached.
+      return (age - boost_age_start_) / boost_step_ + 1;
+    }
+    return 1;
+  }
+
+ private:
+  bool enabled_;
+  uint64_t current_time_;
+  uint64_t boost_age_start_;
+  uint64_t boost_step_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/sst_partitioner.cc b/src/rocksdb/db/compaction/sst_partitioner.cc
new file mode 100644
index 000000000..9e7f9fa89
--- /dev/null
+++ b/src/rocksdb/db/compaction/sst_partitioner.cc
@@ -0,0 +1,90 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "rocksdb/sst_partitioner.h"
+
+#include <algorithm>
+
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, OptionTypeInfo>
+    sst_fixed_prefix_type_info = {
+#ifndef ROCKSDB_LITE
+        {"length",
+         {0, OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+
+SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len)
+    : len_(len) {
+  RegisterOptions("Length", &len_, &sst_fixed_prefix_type_info);
+}
+
+PartitionerResult SstPartitionerFixedPrefix::ShouldPartition(
+    const PartitionerRequest& request) {
+  Slice last_key_fixed(*request.prev_user_key);
+  if (last_key_fixed.size() > len_) {
+    last_key_fixed.size_ = len_;
+  }
+  Slice current_key_fixed(*request.current_user_key);
+  if (current_key_fixed.size() > len_) {
+    current_key_fixed.size_ = len_;
+  }
+  return last_key_fixed.compare(current_key_fixed) != 0 ? kRequired
+                                                        : kNotRequired;
+}
+
+bool SstPartitionerFixedPrefix::CanDoTrivialMove(
+    const Slice& smallest_user_key, const Slice& largest_user_key) {
+  return ShouldPartition(PartitionerRequest(smallest_user_key, largest_user_key,
+                                            0)) == kNotRequired;
+}
+
+std::unique_ptr<SstPartitioner>
+SstPartitionerFixedPrefixFactory::CreatePartitioner(
+    const SstPartitioner::Context& /* context */) const {
+  return std::unique_ptr<SstPartitioner>(new SstPartitionerFixedPrefix(len_));
+}
+
+std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory(
+    size_t prefix_len) {
+  return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len);
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+static int RegisterSstPartitionerFactories(ObjectLibrary& library,
+                                           const std::string& /*arg*/) {
+  library.AddFactory<SstPartitionerFactory>(
+      SstPartitionerFixedPrefixFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<SstPartitionerFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new SstPartitionerFixedPrefixFactory(0));
+        return guard->get();
+      });
+  return 1;
+}
+}  // namespace
+#endif  // ROCKSDB_LITE
+
+Status SstPartitionerFactory::CreateFromString(
+    const ConfigOptions& options, const std::string& value,
+    std::shared_ptr<SstPartitionerFactory>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  return LoadSharedObject<SstPartitionerFactory>(options, value, nullptr,
+                                                 result);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.cc b/src/rocksdb/db/compaction/subcompaction_state.cc
new file mode 100644
index 000000000..0c56471e9
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.cc
@@ -0,0 +1,106 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/subcompaction_state.h"
+
+#include "rocksdb/sst_partitioner.h"
+
+namespace ROCKSDB_NAMESPACE {
+void SubcompactionState::AggregateCompactionStats(
+    InternalStats::CompactionStatsFull& compaction_stats) const {
+  compaction_stats.stats.Add(compaction_outputs_.stats_);
+  if (HasPenultimateLevelOutputs()) {
+    compaction_stats.has_penultimate_level_output = true;
+    compaction_stats.penultimate_level_stats.Add(
+        penultimate_level_outputs_.stats_);
+  }
+}
+
+OutputIterator SubcompactionState::GetOutputs() const {
+  return OutputIterator(penultimate_level_outputs_.outputs_,
+                        compaction_outputs_.outputs_);
+}
+
+void SubcompactionState::Cleanup(Cache* cache) {
+  penultimate_level_outputs_.Cleanup();
+  compaction_outputs_.Cleanup();
+
+  if (!status.ok()) {
+    for (const auto& out : GetOutputs()) {
+      // If this file was inserted into the table cache then remove
+      // them here because this compaction was not committed.
+      TableCache::Evict(cache, out.meta.fd.GetNumber());
+    }
+  }
+  // TODO: sub_compact.io_status is not checked like status. Not sure if thats
+  // intentional. So ignoring the io_status as of now.
+  io_status.PermitUncheckedError();
+}
+
+Slice SubcompactionState::SmallestUserKey() const {
+  if (has_penultimate_level_outputs_) {
+    Slice a = compaction_outputs_.SmallestUserKey();
+    Slice b = penultimate_level_outputs_.SmallestUserKey();
+    if (a.empty()) {
+      return b;
+    }
+    if (b.empty()) {
+      return a;
+    }
+    const Comparator* user_cmp =
+        compaction->column_family_data()->user_comparator();
+    if (user_cmp->Compare(a, b) > 0) {
+      return b;
+    } else {
+      return a;
+    }
+  } else {
+    return compaction_outputs_.SmallestUserKey();
+  }
+}
+
+Slice SubcompactionState::LargestUserKey() const {
+  if (has_penultimate_level_outputs_) {
+    Slice a = compaction_outputs_.LargestUserKey();
+    Slice b = penultimate_level_outputs_.LargestUserKey();
+    if (a.empty()) {
+      return b;
+    }
+    if (b.empty()) {
+      return a;
+    }
+    const Comparator* user_cmp =
+        compaction->column_family_data()->user_comparator();
+    if (user_cmp->Compare(a, b) < 0) {
+      return b;
+    } else {
+      return a;
+    }
+  } else {
+    return compaction_outputs_.LargestUserKey();
+  }
+}
+
+Status SubcompactionState::AddToOutput(
+    const CompactionIterator& iter,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func) {
+  // update target output first
+  is_current_penultimate_level_ = iter.output_to_penultimate_level();
+  current_outputs_ = is_current_penultimate_level_ ? &penultimate_level_outputs_
+                                                   : &compaction_outputs_;
+  if (is_current_penultimate_level_) {
+    has_penultimate_level_outputs_ = true;
+  }
+
+  return Current().AddToOutput(iter, open_file_func, close_file_func);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.h b/src/rocksdb/db/compaction/subcompaction_state.h
new file mode 100644
index 000000000..13e63120f
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.h
@@ -0,0 +1,214 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <optional>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state and outputs for each sub-compaction
+// It contains 2 `CompactionOutputs`:
+//  1. one for the normal output files
+//  2. another for the penultimate level outputs
+// a `current` pointer maintains the current output group, when calling
+// `AddToOutput()`, it checks the output of the current compaction_iterator key
+// and point `current` to the target output group. By default, it just points to
+// normal compaction_outputs, if the compaction_iterator key should be placed on
+// the penultimate level, `current` is changed to point to
+// `penultimate_level_outputs`.
+// The later operations uses `Current()` to get the target group.
+//
+// +----------+          +-----------------------------+      +---------+
+// | *current |--------> | compaction_outputs          |----->| output  |
+// +----------+          +-----------------------------+      +---------+
+//       |                                                    | output  |
+//       |                                                    +---------+
+//       |                                                    |  ...    |
+//       |
+//       |               +-----------------------------+      +---------+
+//       +-------------> | penultimate_level_outputs   |----->| output  |
+//                       +-----------------------------+      +---------+
+//                                                            |  ...    |
+
+class SubcompactionState {
+ public:
+  const Compaction* compaction;
+
+  // The boundaries of the key-range this compaction is interested in. No two
+  // sub-compactions may have overlapping key-ranges.
+  // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
+  const std::optional<Slice> start, end;
+
+  // The return status of this sub-compaction
+  Status status;
+
+  // The return IO Status of this sub-compaction
+  IOStatus io_status;
+
+  // Notify on sub-compaction completion only if listener was notified on
+  // sub-compaction begin.
+  bool notify_on_subcompaction_completion = false;
+
+  // compaction job stats for this sub-compaction
+  CompactionJobStats compaction_job_stats;
+
+  // sub-compaction job id, which is used to identify different sub-compaction
+  // within the same compaction job.
+  const uint32_t sub_job_id;
+
+  Slice SmallestUserKey() const;
+
+  Slice LargestUserKey() const;
+
+  // Get all outputs from the subcompaction. For per_key_placement compaction,
+  // it returns both the last level outputs and penultimate level outputs.
+  OutputIterator GetOutputs() const;
+
+  // Assign range dels aggregator, for each range_del, it can only be assigned
+  // to one output level, for per_key_placement, it's going to be the
+  // penultimate level.
+  void AssignRangeDelAggregator(
+      std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+    if (compaction->SupportsPerKeyPlacement()) {
+      penultimate_level_outputs_.AssignRangeDelAggregator(
+          std::move(range_del_agg));
+    } else {
+      compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg));
+    }
+  }
+
+  void RemoveLastEmptyOutput() {
+    compaction_outputs_.RemoveLastEmptyOutput();
+    penultimate_level_outputs_.RemoveLastEmptyOutput();
+  }
+
+#ifndef ROCKSDB_LITE
+  void BuildSubcompactionJobInfo(
+      SubcompactionJobInfo& subcompaction_job_info) const {
+    const Compaction* c = compaction;
+    const ColumnFamilyData* cfd = c->column_family_data();
+
+    subcompaction_job_info.cf_id = cfd->GetID();
+    subcompaction_job_info.cf_name = cfd->GetName();
+    subcompaction_job_info.status = status;
+    subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
+    subcompaction_job_info.base_input_level = c->start_level();
+    subcompaction_job_info.output_level = c->output_level();
+    subcompaction_job_info.stats = compaction_job_stats;
+  }
+#endif  // !ROCKSDB_LITE
+
+  SubcompactionState() = delete;
+  SubcompactionState(const SubcompactionState&) = delete;
+  SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+  SubcompactionState(Compaction* c, const std::optional<Slice> _start,
+                     const std::optional<Slice> _end, uint32_t _sub_job_id)
+      : compaction(c),
+        start(_start),
+        end(_end),
+        sub_job_id(_sub_job_id),
+        compaction_outputs_(c, /*is_penultimate_level=*/false),
+        penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
+    assert(compaction != nullptr);
+    // Set output split key (used for RoundRobin feature) only for normal
+    // compaction_outputs, output to penultimate_level feature doesn't support
+    // RoundRobin feature (and may never going to be supported, because for
+    // RoundRobin, the data time is mostly naturally sorted, no need to have
+    // per-key placement with output_to_penultimate_level).
+    compaction_outputs_.SetOutputSlitKey(start, end);
+  }
+
+  SubcompactionState(SubcompactionState&& state) noexcept
+      : compaction(state.compaction),
+        start(state.start),
+        end(state.end),
+        status(std::move(state.status)),
+        io_status(std::move(state.io_status)),
+        notify_on_subcompaction_completion(
+            state.notify_on_subcompaction_completion),
+        compaction_job_stats(std::move(state.compaction_job_stats)),
+        sub_job_id(state.sub_job_id),
+        compaction_outputs_(std::move(state.compaction_outputs_)),
+        penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)),
+        is_current_penultimate_level_(state.is_current_penultimate_level_),
+        has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) {
+    current_outputs_ = is_current_penultimate_level_
+                           ? &penultimate_level_outputs_
+                           : &compaction_outputs_;
+  }
+
+  bool HasPenultimateLevelOutputs() const {
+    return has_penultimate_level_outputs_ ||
+           penultimate_level_outputs_.HasRangeDel();
+  }
+
+  bool IsCurrentPenultimateLevel() const {
+    return is_current_penultimate_level_;
+  }
+
+  // Add all the new files from this compaction to version_edit
+  void AddOutputsEdit(VersionEdit* out_edit) const {
+    for (const auto& file : penultimate_level_outputs_.outputs_) {
+      out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta);
+    }
+    for (const auto& file : compaction_outputs_.outputs_) {
+      out_edit->AddFile(compaction->output_level(), file.meta);
+    }
+  }
+
+  void Cleanup(Cache* cache);
+
+  void AggregateCompactionStats(
+      InternalStats::CompactionStatsFull& compaction_stats) const;
+
+  CompactionOutputs& Current() const {
+    assert(current_outputs_);
+    return *current_outputs_;
+  }
+
+  // Add compaction_iterator key/value to the `Current` output group.
+  Status AddToOutput(const CompactionIterator& iter,
+                     const CompactionFileOpenFunc& open_file_func,
+                     const CompactionFileCloseFunc& close_file_func);
+
+  // Close all compaction output files, both output_to_penultimate_level outputs
+  // and normal outputs.
+  Status CloseCompactionFiles(const Status& curr_status,
+                              const CompactionFileOpenFunc& open_file_func,
+                              const CompactionFileCloseFunc& close_file_func) {
+    // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+    // close the output file.
+    Status s = penultimate_level_outputs_.CloseOutput(
+        curr_status, open_file_func, close_file_func);
+    s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func);
+    return s;
+  }
+
+ private:
+  // State kept for output being generated
+  CompactionOutputs compaction_outputs_;
+  CompactionOutputs penultimate_level_outputs_;
+  CompactionOutputs* current_outputs_ = &compaction_outputs_;
+  bool is_current_penultimate_level_ = false;
+  bool has_penultimate_level_outputs_ = false;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/tiered_compaction_test.cc b/src/rocksdb/db/compaction/tiered_compaction_test.cc
new file mode 100644
index 000000000..aaebcfd94
--- /dev/null
+++ b/src/rocksdb/db/compaction/tiered_compaction_test.cc
@@ -0,0 +1,2028 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/utilities/debug.h"
+#include "test_util/mock_time_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#if !defined(ROCKSDB_LITE)
+
+class TieredCompactionTest : public DBTestBase,
+                             public testing::WithParamInterface<bool> {
+ public:
+  TieredCompactionTest()
+      : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true),
+        kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1),
+        kBasicPerKeyPlacementCompStats(
+            CompactionReason::kUniversalSizeAmplification, 1),
+        kBasicFlushStats(CompactionReason::kFlush, 1) {
+    kBasicCompStats.micros = kHasValue;
+    kBasicCompStats.cpu_micros = kHasValue;
+    kBasicCompStats.bytes_read_non_output_levels = kHasValue;
+    kBasicCompStats.num_input_files_in_non_output_levels = kHasValue;
+    kBasicCompStats.num_input_records = kHasValue;
+    kBasicCompStats.num_dropped_records = kHasValue;
+
+    kBasicPerLevelStats.num_output_records = kHasValue;
+    kBasicPerLevelStats.bytes_written = kHasValue;
+    kBasicPerLevelStats.num_output_files = kHasValue;
+
+    kBasicPerKeyPlacementCompStats.micros = kHasValue;
+    kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue;
+    kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats);
+
+    kBasicFlushStats.micros = kHasValue;
+    kBasicFlushStats.cpu_micros = kHasValue;
+    kBasicFlushStats.bytes_written = kHasValue;
+    kBasicFlushStats.num_output_files = kHasValue;
+  }
+
+ protected:
+  static constexpr uint8_t kHasValue = 1;
+
+  InternalStats::CompactionStats kBasicCompStats;
+  InternalStats::CompactionStats kBasicPerKeyPlacementCompStats;
+  InternalStats::CompactionOutputsStats kBasicPerLevelStats;
+  InternalStats::CompactionStats kBasicFlushStats;
+
+  std::atomic_bool enable_per_key_placement = true;
+
+  void SetUp() override {
+    SyncPoint::GetInstance()->SetCallBack(
+        "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+          auto supports_per_key_placement = static_cast<bool*>(arg);
+          *supports_per_key_placement = enable_per_key_placement;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+  const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    assert(versions);
+    assert(versions->GetColumnFamilySet());
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    assert(cfd);
+
+    const InternalStats* const internal_stats = cfd->internal_stats();
+    assert(internal_stats);
+
+    return internal_stats->TEST_GetCompactionStats();
+  }
+
+  const InternalStats::CompactionStats& GetPerKeyPlacementCompactionStats() {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    assert(versions);
+    assert(versions->GetColumnFamilySet());
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    assert(cfd);
+
+    const InternalStats* const internal_stats = cfd->internal_stats();
+    assert(internal_stats);
+
+    return internal_stats->TEST_GetPerKeyPlacementCompactionStats();
+  }
+
+  // Verify the compaction stats, the stats are roughly compared
+  void VerifyCompactionStats(
+      const std::vector<InternalStats::CompactionStats>& expect_stats,
+      const InternalStats::CompactionStats& expect_pl_stats) {
+    const std::vector<InternalStats::CompactionStats>& stats =
+        GetCompactionStats();
+    const size_t kLevels = expect_stats.size();
+    ASSERT_EQ(kLevels, stats.size());
+
+    for (auto it = stats.begin(), expect = expect_stats.begin();
+         it != stats.end(); it++, expect++) {
+      VerifyCompactionStats(*it, *expect);
+    }
+
+    const InternalStats::CompactionStats& pl_stats =
+        GetPerKeyPlacementCompactionStats();
+    VerifyCompactionStats(pl_stats, expect_pl_stats);
+  }
+
+  void ResetAllStats(std::vector<InternalStats::CompactionStats>& stats,
+                     InternalStats::CompactionStats& pl_stats) {
+    ASSERT_OK(dbfull()->ResetStats());
+    for (auto& level_stats : stats) {
+      level_stats.Clear();
+    }
+    pl_stats.Clear();
+  }
+
+  // bottommost_temperature is renaming to last_level_temperature, set either
+  // of them should have the same effect.
+  void SetColdTemperature(Options& options) {
+    if (GetParam()) {
+      options.bottommost_temperature = Temperature::kCold;
+    } else {
+      options.last_level_temperature = Temperature::kCold;
+    }
+  }
+
+ private:
+  void CompareStats(uint64_t val, uint64_t expect) {
+    if (expect > 0) {
+      ASSERT_TRUE(val > 0);
+    } else {
+      ASSERT_EQ(val, 0);
+    }
+  }
+
+  void VerifyCompactionStats(
+      const InternalStats::CompactionStats& stats,
+      const InternalStats::CompactionStats& expect_stats) {
+    CompareStats(stats.micros, expect_stats.micros);
+    CompareStats(stats.cpu_micros, expect_stats.cpu_micros);
+    CompareStats(stats.bytes_read_non_output_levels,
+                 expect_stats.bytes_read_non_output_levels);
+    CompareStats(stats.bytes_read_output_level,
+                 expect_stats.bytes_read_output_level);
+    CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob);
+    CompareStats(stats.bytes_written, expect_stats.bytes_written);
+    CompareStats(stats.bytes_moved, expect_stats.bytes_moved);
+    CompareStats(stats.num_input_files_in_non_output_levels,
+                 expect_stats.num_input_files_in_non_output_levels);
+    CompareStats(stats.num_input_files_in_output_level,
+                 expect_stats.num_input_files_in_output_level);
+    CompareStats(stats.num_output_files, expect_stats.num_output_files);
+    CompareStats(stats.num_output_files_blob,
+                 expect_stats.num_output_files_blob);
+    CompareStats(stats.num_input_records, expect_stats.num_input_records);
+    CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records);
+    CompareStats(stats.num_output_records, expect_stats.num_output_records);
+    ASSERT_EQ(stats.count, expect_stats.count);
+    for (int i = 0; i < static_cast<int>(CompactionReason::kNumOfReasons);
+         i++) {
+      ASSERT_EQ(stats.counts[i], expect_stats.counts[i]);
+    }
+  }
+};
+
+TEST_P(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kLastLevel = kNumLevels - 1;
+
+  auto options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  SetColdTemperature(options);
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+  std::vector<SequenceNumber> seq_history;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+  InternalStats::CompactionStats expect_pl_stats;
+
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+    seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+    expect_stats[0].Add(kBasicFlushStats);
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // the penultimate level file temperature is not cold, all data are output to
+  // the penultimate level.
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // basic compaction stats are still counted to the last level
+  expect_stats[kLastLevel].Add(kBasicCompStats);
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  // move forward the cold_seq to split the file into 2 levels, so should have
+  // both the last level stats and the output_to_penultimate_level stats
+  latest_cold_seq = seq_history[0];
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  last_stats.Add(kBasicPerLevelStats);
+  last_stats.num_dropped_records = 0;
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // delete all cold data, so all data will be on penultimate level
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  last_stats.bytes_read_output_level = kHasValue;
+  last_stats.num_input_files_in_output_level = kHasValue;
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // move forward the cold_seq again with range delete, take a snapshot to keep
+  // the range dels in both cold and hot SSTs
+  auto snap = db_->GetSnapshot();
+  latest_cold_seq = seq_history[2];
+  std::string start = Key(25), end = Key(35);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.Add(kBasicPerLevelStats);
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // verify data
+  std::string value;
+  for (int i = 0; i < kNumKeys; i++) {
+    if (i < 10 || (i >= 25 && i < 35)) {
+      ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+    } else {
+      ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    }
+  }
+
+  // range delete all hot data
+  start = Key(30);
+  end = Key(130);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // no range del is dropped because of snapshot
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      0);
+
+  // release the snapshot and do compaction again should remove all hot data
+  db_->ReleaseSnapshot(snap);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // 2 range dels are dropped
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      3);
+
+  // move backward the cold_seq, for example the user may change the setting of
+  // hot/cold data, but it won't impact the existing cold data, as the sequence
+  // number is zeroed out.
+  latest_cold_seq = seq_history[1];
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kLastLevel = kNumLevels - 1;
+
+  auto options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  SetColdTemperature(options);
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+  auto cmp = options.comparator;
+
+  port::Mutex mutex;
+  std::string hot_start = Key(10);
+  std::string hot_end = Key(50);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        MutexLock l(&mutex);
+        context->output_to_penultimate_level =
+            cmp->Compare(context->key, hot_start) >= 0 &&
+            cmp->Compare(context->key, hot_end) < 0;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+  InternalStats::CompactionStats expect_pl_stats;
+
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+    }
+    ASSERT_OK(Flush());
+    expect_stats[0].Add(kBasicFlushStats);
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.Add(kBasicPerLevelStats);
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  // change to all cold, no output_to_penultimate_level output
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(100);
+    hot_end = Key(200);
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  last_stats.Add(kBasicPerLevelStats);
+  last_stats.num_dropped_records = 0;
+  last_stats.bytes_read_output_level = kHasValue;
+  last_stats.num_input_files_in_output_level = kHasValue;
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // change to all hot, universal compaction support moving data to up level if
+  // it's within compaction level range.
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(0);
+    hot_end = Key(100);
+  }
+
+  // No data is moved from cold tier to hot tier because no input files from L5
+  // or higher, it's not safe to move data to output_to_penultimate_level level.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+
+  // Add 2 keys in higher level, but in separated files, all keys can be moved
+  // up if it's hot
+  ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(50), "value" + std::to_string(0)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to only 1 key cold, to test compaction could stop even it matches
+  // size amp compaction threshold
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(1);
+    hot_end = Key(1000);
+  }
+
+  // generate files just enough to trigger compaction
+  for (int i = 0; i < kNumTrigger - 1; i++) {
+    for (int j = 0; j < 1000; j++) {
+      ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(
+      true));  // make sure the compaction is able to finish
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  auto opts = db_->GetOptions();
+  auto max_size_amp =
+      opts.compaction_options_universal.max_size_amplification_percent / 100;
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown),
+            GetSstSizeHelper(Temperature::kCold) * max_size_amp);
+
+  // delete all cold data
+  ASSERT_OK(Delete(Key(0)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // range delete overlap with both hot/cold data, with a snapshot to make sure
+  // the range del is saved
+  auto snap = db_->GetSnapshot();
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(50);
+    hot_end = Key(100);
+  }
+  std::string start = Key(1), end = Key(70);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // no range del is dropped until snapshot is released
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      0);
+
+  // verify data
+  std::string value;
+  for (int i = 0; i < kNumKeys; i++) {
+    if (i < 70) {
+      ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+    } else {
+      ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    }
+  }
+
+  db_->ReleaseSnapshot(snap);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // range del is dropped
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      1);
+}
+
+TEST_P(TieredCompactionTest, LevelColdRangeDelete) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kLastLevel = kNumLevels - 1;
+
+  auto options = CurrentOptions();
+  SetColdTemperature(options);
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,1",
+            FilesPerLevel());  // bottommost but not last level file is hot
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // explicitly move the data to the last level
+  MoveFilesToLevel(kLastLevel);
+
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  auto snap = db_->GetSnapshot();
+
+  std::string start = Key(10);
+  std::string end = Key(50);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+
+  // 20->30 will be marked as cold data, but it cannot be placed to cold tier
+  // (bottommost) otherwise, it will be "deleted" by the range del in
+  // output_to_penultimate_level level verify that these data will be able to
+  // queried
+  for (int i = 20; i < 30; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+  // make the range tombstone and data after that cold
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+
+  // add home hot data, just for test
+  for (int i = 30; i < 40; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  std::string value;
+  for (int i = 0; i < kNumKeys; i++) {
+    auto s = db_->Get(ReadOptions(), Key(i), &value);
+    if ((i >= 10 && i < 20) || (i >= 40 && i < 50)) {
+      ASSERT_TRUE(s.IsNotFound());
+    } else {
+      ASSERT_OK(s);
+    }
+  }
+
+  db_->ReleaseSnapshot(snap);
+}
+
+// Test SST partitioner cut after every single key
+class SingleKeySstPartitioner : public SstPartitioner {
+ public:
+  const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+  PartitionerResult ShouldPartition(
+      const PartitionerRequest& /*request*/) override {
+    return kRequired;
+  }
+
+  bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+                        const Slice& /*largest_user_key*/) override {
+    return false;
+  }
+};
+
+class SingleKeySstPartitionerFactory : public SstPartitionerFactory {
+ public:
+  static const char* kClassName() { return "SingleKeySstPartitionerFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /* context */) const override {
+    return std::unique_ptr<SstPartitioner>(new SingleKeySstPartitioner());
+  }
+};
+
+TEST_P(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 3;
+  const int kNumKeys = 10;
+
+  auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+  auto options = CurrentOptions();
+  SetColdTemperature(options);
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.sst_partitioner_factory = factory;
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(kNumLevels - 1);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ("0,0,10", FilesPerLevel());
+
+  auto snap = db_->GetSnapshot();
+
+  // only range delete
+  std::string start = Key(3);
+  std::string end = Key(5);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown),
+            0);  // tombstone has no size, even it's in hot tier
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ("0,1,10",
+            FilesPerLevel());  // one file is at the penultimate level which
+                               // only contains a range delete
+
+  // Add 2 hot keys, each is a new SST, they will be placed in the same level as
+  // range del, but they don't have overlap with range del, make sure the range
+  // del will still be placed there
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Put(Key(0), "new value" + std::to_string(0)));
+  auto snap2 = db_->GetSnapshot();
+  ASSERT_OK(Put(Key(6), "new value" + std::to_string(6)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,2,10",
+            FilesPerLevel());  // one file is at the penultimate level
+                               // which only contains a range delete
+  std::vector<LiveFileMetaData> live_file_meta;
+  db_->GetLiveFilesMetaData(&live_file_meta);
+  bool found_sst_with_del = false;
+  uint64_t sst_with_del_num = 0;
+  for (const auto& meta : live_file_meta) {
+    if (meta.num_deletions > 0) {
+      // found SST with del, which has 2 entries, one for data one for range del
+      ASSERT_EQ(meta.level,
+                kNumLevels - 2);  // output to penultimate level
+      ASSERT_EQ(meta.num_entries, 2);
+      ASSERT_EQ(meta.num_deletions, 1);
+      found_sst_with_del = true;
+      sst_with_del_num = meta.file_number;
+    }
+  }
+  ASSERT_TRUE(found_sst_with_del);
+
+  // release the first snapshot and compact, which should compact the range del
+  // but new inserted key `0` and `6` are still hot data which will be placed on
+  // the penultimate level
+  db_->ReleaseSnapshot(snap);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,2,7", FilesPerLevel());
+  db_->GetLiveFilesMetaData(&live_file_meta);
+  found_sst_with_del = false;
+  for (const auto& meta : live_file_meta) {
+    // check new SST with del (the old one may not yet be deleted after
+    // compaction)
+    if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+      found_sst_with_del = true;
+    }
+  }
+  ASSERT_FALSE(found_sst_with_del);
+
+  // Now make all data cold, key 0 will be moved to the last level, but key 6 is
+  // still in snap2, so it will be kept at the penultimate level
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,1,8", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  db_->ReleaseSnapshot(snap2);
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,8", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, UniversalRangeDelete) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 10;
+
+  auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+
+  auto options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  SetColdTemperature(options);
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.statistics = CreateDBStatistics();
+  options.sst_partitioner_factory = factory;
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  // compact to the penultimate level with 10 files
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ("0,0,0,0,0,10", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // make all data cold
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,10", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // range del which considered as hot data, but it will be merged and deleted
+  // with the last level data
+  std::string start = Key(3);
+  std::string end = Key(5);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel());
+
+  // range del with snapshot should be preserved in the penultimate level
+  auto snap = db_->GetSnapshot();
+
+  start = Key(6);
+  end = Key(8);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,8", FilesPerLevel());
+
+  // Add 2 hot keys, each is a new SST, they will be placed in the same level as
+  // range del, but no overlap with range del.
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Put(Key(4), "new value" + std::to_string(0)));
+  auto snap2 = db_->GetSnapshot();
+  ASSERT_OK(Put(Key(9), "new value" + std::to_string(6)));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,2,8", FilesPerLevel());
+  // find the SST with range del
+  std::vector<LiveFileMetaData> live_file_meta;
+  db_->GetLiveFilesMetaData(&live_file_meta);
+  bool found_sst_with_del = false;
+  uint64_t sst_with_del_num = 0;
+  for (const auto& meta : live_file_meta) {
+    if (meta.num_deletions > 0) {
+      // found SST with del, which has 2 entries, one for data one for range del
+      ASSERT_EQ(meta.level,
+                kNumLevels - 2);  // output_to_penultimate_level level
+      ASSERT_EQ(meta.num_entries, 2);
+      ASSERT_EQ(meta.num_deletions, 1);
+      found_sst_with_del = true;
+      sst_with_del_num = meta.file_number;
+    }
+  }
+  ASSERT_TRUE(found_sst_with_del);
+
+  // release the first snapshot which should compact the range del, but data on
+  // the same level is still hot
+  db_->ReleaseSnapshot(snap);
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,2,6", FilesPerLevel());
+  db_->GetLiveFilesMetaData(&live_file_meta);
+  // no range del should be found in SST
+  found_sst_with_del = false;
+  for (const auto& meta : live_file_meta) {
+    // check new SST with del (the old one may not yet be deleted after
+    // compaction)
+    if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+      found_sst_with_del = true;
+    }
+  }
+  ASSERT_FALSE(found_sst_with_del);
+
+  // make all data to cold, but key 6 is still protected by snap2
+  latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,7", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  db_->ReleaseSnapshot(snap2);
+
+  // release snapshot, everything go to bottommost
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,7", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kLastLevel = kNumLevels - 1;
+
+  auto options = CurrentOptions();
+  SetColdTemperature(options);
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t latest_cold_seq = 0;
+  std::vector<SequenceNumber> seq_history;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        context->output_to_penultimate_level =
+            context->seq_num > latest_cold_seq;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+  InternalStats::CompactionStats expect_pl_stats;
+
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+    expect_stats[0].Add(kBasicFlushStats);
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // non last level is hot
+  ASSERT_EQ("0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  expect_stats[1].Add(kBasicCompStats);
+  expect_stats[1].Add(kBasicPerLevelStats);
+  expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // move all data to the last level
+  MoveFilesToLevel(kLastLevel);
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  // The compaction won't move the data up
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.Add(kBasicPerLevelStats);
+  last_stats.num_dropped_records = 0;
+  last_stats.bytes_read_non_output_levels = 0;
+  last_stats.num_input_files_in_non_output_levels = 0;
+  last_stats.bytes_read_output_level = kHasValue;
+  last_stats.num_input_files_in_output_level = kHasValue;
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Add new data, which is all hot and overriding all existing data
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+    seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+  ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  ResetAllStats(expect_stats, expect_pl_stats);
+
+  // after compaction, all data are hot
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  for (int level = 2; level < kNumLevels - 1; level++) {
+    expect_stats[level].bytes_moved = kHasValue;
+  }
+
+  last_stats.Add(kBasicCompStats);
+  last_stats.bytes_read_output_level = kHasValue;
+  last_stats.num_input_files_in_output_level = kHasValue;
+  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+  VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // move forward the cold_seq, try to split the data into cold and hot, but in
+  // this case it's unsafe to split the data
+  // because it's non-last-level but bottommost file, the sequence number will
+  // be zeroed out and lost the time information (with
+  // `level_compaction_dynamic_level_bytes` or Universal Compaction, it should
+  // be rare.)
+  // TODO(zjay): ideally we should avoid zero out non-last-level bottommost file
+  latest_cold_seq = seq_history[1];
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  seq_history.clear();
+
+  // manually move all data (cold) to last level
+  MoveFilesToLevel(kLastLevel);
+  seq_history.clear();
+  // Add new data once again
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+    seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  latest_cold_seq = seq_history[0];
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // delete all cold data
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  latest_cold_seq = seq_history[2];
+
+  MoveFilesToLevel(kLastLevel);
+
+  // move forward the cold_seq again with range delete, take a snapshot to keep
+  // the range dels in bottommost
+  auto snap = db_->GetSnapshot();
+
+  std::string start = Key(25), end = Key(35);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  // add one small key and large key in the input level, to make sure it's able
+  // to move hot data to input level within that range
+  ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+  ASSERT_OK(Put(Key(100), "value" + std::to_string(0)));
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // verify data
+  std::string value;
+  for (int i = 1; i < 130; i++) {
+    if (i < 10 || (i >= 25 && i < 35)) {
+      ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+    } else {
+      ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    }
+  }
+
+  // delete all hot data
+  ASSERT_OK(Delete(Key(0)));
+  start = Key(30);
+  end = Key(101);  // range [101, 130] is cold, because it's not in input range
+                   // in previous compaction
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // no range del is dropped because of snapshot
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      0);
+
+  db_->ReleaseSnapshot(snap);
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // 3 range dels dropped, the first one is double counted as expected, which is
+  // spread into 2 SST files
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      3);
+
+  // move backward of cold_seq, which might happen when the user change the
+  // setting. the hot data won't move up, just to make sure it still runs
+  // fine, which is because:
+  // 1. sequence number is zeroed out, so no time information
+  // 2. leveled compaction only support move data up within the higher level
+  // input range
+  latest_cold_seq = seq_history[1];
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+
+  auto options = CurrentOptions();
+  SetColdTemperature(options);
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.max_subcompactions = 10;
+  DestroyAndReopen(options);
+  auto cmp = options.comparator;
+
+  port::Mutex mutex;
+  std::string hot_start = Key(10);
+  std::string hot_end = Key(50);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+        auto context = static_cast<PerKeyPlacementContext*>(arg);
+        MutexLock l(&mutex);
+        context->output_to_penultimate_level =
+            cmp->Compare(context->key, hot_start) >= 0 &&
+            cmp->Compare(context->key, hot_end) < 0;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kNumTrigger; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to all cold
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(100);
+    hot_end = Key(200);
+  }
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to all hot, but level compaction only support move cold to hot
+  // within it's higher level input range.
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(0);
+    hot_end = Key(100);
+  }
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // with mixed hot/cold data
+  {
+    MutexLock l(&mutex);
+    hot_start = Key(50);
+    hot_end = Key(100);
+  }
+  ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+  ASSERT_OK(Put(Key(100), "value" + std::to_string(100)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // delete all hot data, but with snapshot to keep the range del
+  auto snap = db_->GetSnapshot();
+  std::string start = Key(50);
+  std::string end = Key(100);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // no range del is dropped because of snapshot
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      0);
+
+  // release the snapshot and do compaction again should remove all hot data
+  db_->ReleaseSnapshot(snap);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  ASSERT_EQ(
+      options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+      1);
+}
+
+INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest,
+                        testing::Bool());
+
+class PrecludeLastLevelTest : public DBTestBase {
+ public:
+  PrecludeLastLevelTest()
+      : DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) {
+    mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+    mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
+  }
+
+ protected:
+  std::unique_ptr<Env> mock_env_;
+  std::shared_ptr<MockSystemClock> mock_clock_;
+
+  void SetUp() override {
+    mock_clock_->InstallTimedWaitFixCallback();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+          auto periodic_task_scheduler_ptr =
+              reinterpret_cast<PeriodicTaskScheduler*>(arg);
+          periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+        });
+    mock_clock_->SetCurrentTime(0);
+  }
+};
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kKeyPerSec = 10;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.preserve_internal_time_seconds = 10000;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the first a few keys write time are going
+  // to be zero, and internally zero has special meaning: kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+  int sst_num = 0;
+  // Write files that are overlap and enough to trigger compaction
+  for (; sst_num < kNumTrigger; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // all data is pushed to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  // enable preclude feature
+  options.preclude_last_level_data_seconds = 10000;
+  options.last_level_temperature = Temperature::kCold;
+  Reopen(options);
+
+  // all data is hot, even they're in the last level
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // Generate a sstable and trigger manual compaction
+  ASSERT_OK(Put(Key(10), "value"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // all data is moved up to the penultimate level
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // close explicitly, because the env is local variable which will be released
+  // first.
+  Close();
+}
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kKeyPerSec = 10;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.preserve_internal_time_seconds = 10000;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the first a few keys write time are going
+  // to be zero, and internally zero has special meaning: kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+  int sst_num = 0;
+  // Write files that are overlap and enough to trigger compaction
+  for (; sst_num < kNumTrigger; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // all data is pushed to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  // enable preclude feature
+  options.preclude_last_level_data_seconds = 10000;
+  options.last_level_temperature = Temperature::kCold;
+  // make sure it won't trigger Size Amp compaction, unlike normal Size Amp
+  // compaction which is typically a last level compaction, when tiered Storage
+  // ("preclude_last_level") is enabled, size amp won't include the last level.
+  // As the last level would be in cold tier and the size would not be a
+  // problem, which also avoid frequent hot to cold storage compaction.
+  options.compaction_options_universal.max_size_amplification_percent = 400;
+  Reopen(options);
+
+  // all data is hot, even they're in the last level
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // Write more data, but still all hot until the 10th SST, as:
+  // write a key every 10 seconds, 100 keys per SST, each SST takes 1000 seconds
+  // The preclude_last_level_data_seconds is 10k
+  Random rnd(301);
+  for (; sst_num < kNumTrigger * 2 - 1; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      // the value needs to be big enough to trigger full compaction
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100)));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->WaitForCompact(true));
+  }
+
+  // all data is moved up to the penultimate level
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // close explicitly, because the env is local variable which will be released
+  // first.
+  Close();
+}
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kKeyPerSec = 10;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.preserve_internal_time_seconds = 2000;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the first a few keys write time are going
+  // to be zero, and internally zero has special meaning: kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+  int sst_num = 0;
+  // Write files that are overlap and enough to trigger compaction
+  for (; sst_num < kNumTrigger; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // all data is pushed to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  std::vector<KeyVersion> key_versions;
+  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+
+  // make sure there're more than 300 keys and first 100 keys are having seqno
+  // zeroed out, the last 100 key seqno not zeroed out
+  ASSERT_GT(key_versions.size(), 300);
+  for (int i = 0; i < 100; i++) {
+    ASSERT_EQ(key_versions[i].sequence, 0);
+  }
+  auto rit = key_versions.rbegin();
+  for (int i = 0; i < 100; i++) {
+    ASSERT_GT(rit->sequence, 0);
+    rit++;
+  }
+
+  // enable preclude feature
+  options.preclude_last_level_data_seconds = 2000;
+  options.last_level_temperature = Temperature::kCold;
+  Reopen(options);
+
+  // Generate a sstable and trigger manual compaction
+  ASSERT_OK(Put(Key(10), "value"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // some data are moved up, some are not
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  Close();
+}
+
+TEST_F(PrecludeLastLevelTest, SmallPrecludeTime) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.preclude_last_level_data_seconds = 60;
+  options.preserve_internal_time_seconds = 0;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  options.last_level_temperature = Temperature::kCold;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  dbfull()->TEST_WaitForPeridicTaskRun([&] {
+    mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(10) + 1));
+  });
+
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+    dbfull()->TEST_WaitForPeridicTaskRun([&] {
+      mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(2)));
+    });
+  }
+  ASSERT_OK(Flush());
+
+  TablePropertiesCollection tables_props;
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  ASSERT_FALSE(tables_props.begin()->second->seqno_to_time_mapping.empty());
+  SeqnoToTimeMapping tp_mapping;
+  ASSERT_OK(
+      tp_mapping.Add(tables_props.begin()->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  ASSERT_FALSE(tp_mapping.Empty());
+  auto seqs = tp_mapping.TEST_GetInternalMapping();
+  ASSERT_FALSE(seqs.empty());
+
+  // Wait more than preclude_last_level time, then make sure all the data is
+  // compacted to the last level even there's no write (no seqno -> time
+  // information was flushed to any SST).
+  mock_clock_->MockSleepForSeconds(100);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  Close();
+}
+
+TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kKeyPerSec = 10;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.preserve_internal_time_seconds = 2000;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the first a few keys write time are going
+  // to be zero, and internally zero has special meaning: kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+  int sst_num = 0;
+  // Write files that are overlap and enough to trigger compaction
+  for (; sst_num < kNumTrigger; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // all data is pushed to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  // enable preclude feature
+  options.preclude_last_level_data_seconds = 2000;
+  options.last_level_temperature = Temperature::kCold;
+  Reopen(options);
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // some data are moved up, some are not
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  std::vector<KeyVersion> key_versions;
+  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+
+  // make sure there're more than 300 keys and first 100 keys are having seqno
+  // zeroed out, the last 100 key seqno not zeroed out
+  ASSERT_GT(key_versions.size(), 300);
+  for (int i = 0; i < 100; i++) {
+    ASSERT_EQ(key_versions[i].sequence, 0);
+  }
+  auto rit = key_versions.rbegin();
+  for (int i = 0; i < 100; i++) {
+    ASSERT_GT(rit->sequence, 0);
+    rit++;
+  }
+
+  Close();
+}
+
+class PrecludeLastLevelTestWithParms
+    : public PrecludeLastLevelTest,
+      public testing::WithParamInterface<bool> {
+ public:
+  PrecludeLastLevelTestWithParms() : PrecludeLastLevelTest() {}
+};
+
+TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kKeyPerSec = 10;
+
+  bool enable_preclude_last_level = GetParam();
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.preserve_internal_time_seconds = 2000;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the first a few keys write time are going
+  // to be zero, and internally zero has special meaning: kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+  Random rnd(301);
+  int sst_num = 0;
+  // Write files that are overlap and enough to trigger compaction
+  for (; sst_num < kNumTrigger; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100)));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // all data is pushed to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  std::atomic_bool is_manual_compaction_running = false;
+  std::atomic_bool verified_compaction_order = false;
+
+  // Make sure the manual compaction is in progress and try to trigger a
+  // SizeRatio compaction by flushing 4 files to L0. The compaction will try to
+  // compact 4 files at L0 to L5 (the last empty level).
+  // If the preclude_last_feature is enabled, the auto triggered compaction
+  // cannot be picked. Otherwise, the auto triggered compaction can run in
+  // parallel with the last level compaction.
+  // L0: [a] [b] [c] [d]
+  // L5:     (locked if preclude_last_level is enabled)
+  // L6: [z] (locked: manual compaction in progress)
+  // TODO: in this case, L0 files should just be compacted to L4, so the 2
+  //  compactions won't be overlapped.
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
+        auto compaction = static_cast<Compaction*>(arg);
+        if (compaction->is_manual_compaction()) {
+          is_manual_compaction_running = true;
+          TEST_SYNC_POINT(
+              "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+              "ManualCompaction1");
+          TEST_SYNC_POINT(
+              "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+              "ManualCompaction2");
+          is_manual_compaction_running = false;
+        }
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        auto compaction = static_cast<Compaction*>(arg);
+        if (enable_preclude_last_level && is_manual_compaction_running) {
+          ASSERT_TRUE(compaction == nullptr);
+          verified_compaction_order = true;
+        } else {
+          ASSERT_TRUE(compaction != nullptr);
+          verified_compaction_order = true;
+        }
+        if (!compaction || !compaction->is_manual_compaction()) {
+          TEST_SYNC_POINT(
+              "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+              "AutoCompactionPicked");
+        }
+      });
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+       "ManualCompaction1",
+       "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite"},
+      {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+       "AutoCompactionPicked",
+       "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+       "ManualCompaction2"},
+  });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // only enable if the Parameter is true
+  if (enable_preclude_last_level) {
+    options.preclude_last_level_data_seconds = 2000;
+  }
+  options.max_background_jobs = 8;
+  options.last_level_temperature = Temperature::kCold;
+  Reopen(options);
+
+  auto manual_compaction_thread = port::Thread([this]() {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    cro.exclusive_manual_compaction = false;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT(
+      "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite");
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  for (; sst_num < kNumTrigger * 2; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      // the value needs to be big enough to trigger full compaction
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+  }
+
+  manual_compaction_thread.join();
+
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  if (enable_preclude_last_level) {
+    ASSERT_NE("0,0,0,0,0,1,1", FilesPerLevel());
+  } else {
+    ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+  }
+  ASSERT_TRUE(verified_compaction_order);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  stop_token.reset();
+
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(PrecludeLastLevelTestWithParms,
+                        PrecludeLastLevelTestWithParms, testing::Bool());
+
+// partition the SST into 3 ranges [0, 19] [20, 39] [40, ...]
+class ThreeRangesPartitioner : public SstPartitioner {
+ public:
+  const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+  PartitionerResult ShouldPartition(
+      const PartitionerRequest& request) override {
+    if ((cmp->CompareWithoutTimestamp(*request.current_user_key,
+                                      DBTestBase::Key(20)) >= 0 &&
+         cmp->CompareWithoutTimestamp(*request.prev_user_key,
+                                      DBTestBase::Key(20)) < 0) ||
+        (cmp->CompareWithoutTimestamp(*request.current_user_key,
+                                      DBTestBase::Key(40)) >= 0 &&
+         cmp->CompareWithoutTimestamp(*request.prev_user_key,
+                                      DBTestBase::Key(40)) < 0)) {
+      return kRequired;
+    } else {
+      return kNotRequired;
+    }
+  }
+
+  bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+                        const Slice& /*largest_user_key*/) override {
+    return false;
+  }
+
+  const Comparator* cmp = BytewiseComparator();
+};
+
+class ThreeRangesPartitionerFactory : public SstPartitionerFactory {
+ public:
+  static const char* kClassName() {
+    return "TombstoneTestSstPartitionerFactory";
+  }
+  const char* Name() const override { return kClassName(); }
+
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /* context */) const override {
+    return std::unique_ptr<SstPartitioner>(new ThreeRangesPartitioner());
+  }
+};
+
+TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kKeyPerSec = 10;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.preserve_internal_time_seconds = 10000;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the first a few keys write time are going
+  // to be zero, and internally zero has special meaning: kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+  Random rnd(301);
+
+  for (int i = 0; i < 300; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+  }
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // make sure all data is compacted to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  // Create 3 L5 files
+  auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+  options.sst_partitioner_factory = factory;
+
+  Reopen(options);
+
+  for (int i = 0; i < kNumTrigger - 1; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // L5: [0,19] [20,39] [40,299]
+  // L6: [0,                299]
+  ASSERT_EQ("0,0,0,0,0,3,1", FilesPerLevel());
+
+  // enable tiered storage feature
+  options.preclude_last_level_data_seconds = 10000;
+  options.last_level_temperature = Temperature::kCold;
+  options.statistics = CreateDBStatistics();
+  Reopen(options);
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[5].files.size(), 3);
+  ASSERT_EQ(meta.levels[6].files.size(), 1);
+  ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0));
+  ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(299));
+
+  std::string file_path = meta.levels[5].files[1].db_path;
+  std::vector<std::string> files;
+  // pick 3rd file @L5 + file@L6 for compaction
+  files.push_back(file_path + "/" + meta.levels[5].files[2].name);
+  files.push_back(file_path + "/" + meta.levels[6].files[0].name);
+  ASSERT_OK(db_->CompactFiles(CompactionOptions(), files, 6));
+
+  // The compaction only moved partial of the hot data to hot tier, range[0,39]
+  // is unsafe to move up, otherwise, they will be overlapped with the existing
+  // files@L5.
+  // The output should be:
+  //  L5: [0,19] [20,39] [40,299]    <-- Temperature::kUnknown
+  //  L6: [0,19] [20,39]             <-- Temperature::kCold
+  // L6 file is split because of the customized partitioner
+  ASSERT_EQ("0,0,0,0,0,3,2", FilesPerLevel());
+
+  // even all the data is hot, but not all data are moved to the hot tier
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  db_->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[5].files.size(), 3);
+  ASSERT_EQ(meta.levels[6].files.size(), 2);
+  for (const auto& file : meta.levels[5].files) {
+    ASSERT_EQ(file.temperature, Temperature::kUnknown);
+  }
+  for (const auto& file : meta.levels[6].files) {
+    ASSERT_EQ(file.temperature, Temperature::kCold);
+  }
+  ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0));
+  ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(19));
+  ASSERT_EQ(meta.levels[6].files[1].smallestkey, Key(20));
+  ASSERT_EQ(meta.levels[6].files[1].largestkey, Key(39));
+
+  Close();
+}
+
+struct TestPropertiesCollector : public TablePropertiesCollector {
+  Status AddUserKey(const Slice& key, const Slice& /*value*/,
+                    EntryType /*type*/, SequenceNumber /*seq*/,
+                    uint64_t /*file_size*/) override {
+    if (cmp->Compare(key, DBTestBase::Key(100)) == 0) {
+      has_key_100 = true;
+    }
+    if (cmp->Compare(key, DBTestBase::Key(200)) == 0) {
+      has_key_200 = true;
+    }
+
+    return Status::OK();
+  }
+
+  const char* Name() const override { return "TestTablePropertiesCollector"; }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    UserCollectedProperties ret;
+    return ret;
+  }
+
+  Status Finish(UserCollectedProperties* /*properties*/) override {
+    // The LSM tree would be like:
+    // L5: [0,19] [20,39] [40,299]
+    // L6: [0,                299]
+    // the 3rd file @L5 has both 100 and 200, which will be marked for
+    // compaction
+    // Also avoid marking flushed SST for compaction, which won't have both 100
+    // and 200
+    if (has_key_100 && has_key_200) {
+      need_compact_ = true;
+    } else {
+      need_compact_ = false;
+    }
+    has_key_100 = false;
+    has_key_200 = false;
+    return Status::OK();
+  }
+
+  bool NeedCompact() const override { return need_compact_; }
+
+  const Comparator* cmp = BytewiseComparator();
+
+ private:
+  bool has_key_100 = false;
+  bool has_key_200 = false;
+
+  bool need_compact_ = false;
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    return new TestPropertiesCollector;
+  }
+  const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
+TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompactionWithRangeDel) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kKeyPerSec = 10;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.preserve_internal_time_seconds = 10000;
+  options.num_levels = kNumLevels;
+  // set a small max_compaction_bytes to avoid input level expansion
+  options.max_compaction_bytes = 30000;
+  options.ignore_max_compaction_bytes_for_input = false;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the first a few keys write time are going
+  // to be zero, and internally zero has special meaning: kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+  Random rnd(301);
+
+  for (int i = 0; i < 300; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+  }
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // make sure all data is compacted to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  // Create 3 L5 files
+  auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+  options.sst_partitioner_factory = factory;
+
+  // the user defined properties_collector will mark the 3rd file for compaction
+  auto collector_factory = std::make_shared<TestPropertiesCollectorFactory>();
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+  // enable tiered storage feature
+  options.preclude_last_level_data_seconds = 10000;
+  options.last_level_temperature = Temperature::kCold;
+  Reopen(options);
+
+  for (int i = 0; i < kNumTrigger - 2; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // make sure there is one and only one compaction supports per-key placement
+  // but has the penultimate level output disabled.
+  std::atomic_int per_key_comp_num = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        auto compaction = static_cast<Compaction*>(arg);
+        if (compaction->SupportsPerKeyPlacement()) {
+          ASSERT_EQ(compaction->GetPenultimateOutputRangeType(),
+                    Compaction::PenultimateOutputRangeType::kDisabled);
+          per_key_comp_num++;
+        }
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int j = 0; j < 100; j++) {
+    ASSERT_OK(Put(Key(200 + j), rnd.RandomString(10)));
+  }
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             Key(32), Key(40)));
+  ASSERT_OK(Flush());
+
+  // Before the per-key placement compaction, the LSM tress should be like:
+  // L5: [0,19] [20,40] [40,299]
+  // L6: [0,                299]
+  // The 2nd file @L5 has the largest key 40 because of range del
+
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  ASSERT_EQ(per_key_comp_num, 1);
+
+  // the compaction won't move any data to the penultimate level
+  ASSERT_EQ("0,0,0,0,0,2,3", FilesPerLevel());
+
+  Close();
+}
+
+#endif  // !defined(ROCKSDB_LITE)
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/comparator_db_test.cc b/src/rocksdb/db/comparator_db_test.cc
new file mode 100644
index 000000000..e5e3493b3
--- /dev/null
+++ b/src/rocksdb/db/comparator_db_test.cc
@@ -0,0 +1,678 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include <array>
+#include <map>
+#include <string>
+
+#include "memtable/stl_wrappers.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/kv_map.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+static const Comparator* kTestComparator = nullptr;
+
+class KVIter : public Iterator {
+ public:
+  explicit KVIter(const stl_wrappers::KVMap* map)
+      : map_(map), iter_(map_->end()) {}
+  bool Valid() const override { return iter_ != map_->end(); }
+  void SeekToFirst() override { iter_ = map_->begin(); }
+  void SeekToLast() override {
+    if (map_->empty()) {
+      iter_ = map_->end();
+    } else {
+      iter_ = map_->find(map_->rbegin()->first);
+    }
+  }
+  void Seek(const Slice& k) override {
+    iter_ = map_->lower_bound(k.ToString());
+  }
+  void SeekForPrev(const Slice& k) override {
+    iter_ = map_->upper_bound(k.ToString());
+    Prev();
+  }
+  void Next() override { ++iter_; }
+  void Prev() override {
+    if (iter_ == map_->begin()) {
+      iter_ = map_->end();
+      return;
+    }
+    --iter_;
+  }
+
+  Slice key() const override { return iter_->first; }
+  Slice value() const override { return iter_->second; }
+  Status status() const override { return Status::OK(); }
+
+ private:
+  const stl_wrappers::KVMap* const map_;
+  stl_wrappers::KVMap::const_iterator iter_;
+};
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+  ASSERT_EQ(iter1->Valid(), iter2->Valid());
+  if (iter1->Valid()) {
+    ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+    ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+  }
+}
+
+// Measuring operations on DB (expect to be empty).
+// source_strings are candidate keys
+void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
+                            Random* rnd, int num_writes, int num_iter_ops,
+                            int num_trigger_flush) {
+  stl_wrappers::KVMap map((stl_wrappers::LessOfComparator(kTestComparator)));
+
+  for (int i = 0; i < num_writes; i++) {
+    if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
+      db->Flush(FlushOptions());
+    }
+
+    int type = rnd->Uniform(2);
+    int index = rnd->Uniform(static_cast<int>(source_strings.size()));
+    auto& key = source_strings[index];
+    switch (type) {
+      case 0:
+        // put
+        map[key] = key;
+        ASSERT_OK(db->Put(WriteOptions(), key, key));
+        break;
+      case 1:
+        // delete
+        if (map.find(key) != map.end()) {
+          map.erase(key);
+        }
+        ASSERT_OK(db->Delete(WriteOptions(), key));
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+  std::unique_ptr<Iterator> result_iter(new KVIter(&map));
+
+  bool is_valid = false;
+  for (int i = 0; i < num_iter_ops; i++) {
+    // Random walk and make sure iter and result_iter returns the
+    // same key and value
+    int type = rnd->Uniform(6);
+    ASSERT_OK(iter->status());
+    switch (type) {
+      case 0:
+        // Seek to First
+        iter->SeekToFirst();
+        result_iter->SeekToFirst();
+        break;
+      case 1:
+        // Seek to last
+        iter->SeekToLast();
+        result_iter->SeekToLast();
+        break;
+      case 2: {
+        // Seek to random key
+        auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+        auto key = source_strings[key_idx];
+        iter->Seek(key);
+        result_iter->Seek(key);
+        break;
+      }
+      case 3:
+        // Next
+        if (is_valid) {
+          iter->Next();
+          result_iter->Next();
+        } else {
+          continue;
+        }
+        break;
+      case 4:
+        // Prev
+        if (is_valid) {
+          iter->Prev();
+          result_iter->Prev();
+        } else {
+          continue;
+        }
+        break;
+      default: {
+        assert(type == 5);
+        auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+        auto key = source_strings[key_idx];
+        std::string result;
+        auto status = db->Get(ReadOptions(), key, &result);
+        if (map.find(key) == map.end()) {
+          ASSERT_TRUE(status.IsNotFound());
+        } else {
+          ASSERT_EQ(map[key], result);
+        }
+        break;
+      }
+    }
+    AssertItersEqual(iter.get(), result_iter.get());
+    is_valid = iter->Valid();
+  }
+}
+
+class DoubleComparator : public Comparator {
+ public:
+  DoubleComparator() {}
+
+  const char* Name() const override { return "DoubleComparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+#ifndef CYGWIN
+    double da = std::stod(a.ToString());
+    double db = std::stod(b.ToString());
+#else
+    double da = std::strtod(a.ToString().c_str(), 0 /* endptr */);
+    double db = std::strtod(a.ToString().c_str(), 0 /* endptr */);
+#endif
+    if (da == db) {
+      return a.compare(b);
+    } else if (da > db) {
+      return 1;
+    } else {
+      return -1;
+    }
+  }
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class HashComparator : public Comparator {
+ public:
+  HashComparator() {}
+
+  const char* Name() const override { return "HashComparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    uint32_t ha = Hash(a.data(), a.size(), 66);
+    uint32_t hb = Hash(b.data(), b.size(), 66);
+    if (ha == hb) {
+      return a.compare(b);
+    } else if (ha > hb) {
+      return 1;
+    } else {
+      return -1;
+    }
+  }
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class TwoStrComparator : public Comparator {
+ public:
+  TwoStrComparator() {}
+
+  const char* Name() const override { return "TwoStrComparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    assert(a.size() >= 2);
+    assert(b.size() >= 2);
+    size_t size_a1 = static_cast<size_t>(a[0]);
+    size_t size_b1 = static_cast<size_t>(b[0]);
+    size_t size_a2 = static_cast<size_t>(a[1]);
+    size_t size_b2 = static_cast<size_t>(b[1]);
+    assert(size_a1 + size_a2 + 2 == a.size());
+    assert(size_b1 + size_b2 + 2 == b.size());
+
+    Slice a1 = Slice(a.data() + 2, size_a1);
+    Slice b1 = Slice(b.data() + 2, size_b1);
+    Slice a2 = Slice(a.data() + 2 + size_a1, size_a2);
+    Slice b2 = Slice(b.data() + 2 + size_b1, size_b2);
+
+    if (a1 != b1) {
+      return a1.compare(b1);
+    }
+    return a2.compare(b2);
+  }
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+}  // anonymous namespace
+
+class ComparatorDBTest
+    : public testing::Test,
+      virtual public ::testing::WithParamInterface<uint32_t> {
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+  Options last_options_;
+  std::unique_ptr<const Comparator> comparator_guard;
+
+ public:
+  ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
+    kTestComparator = BytewiseComparator();
+    dbname_ = test::PerThreadDBPath("comparator_db_test");
+    BlockBasedTableOptions toptions;
+    toptions.format_version = GetParam();
+    last_options_.table_factory.reset(
+        ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(toptions));
+    EXPECT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  ~ComparatorDBTest() override {
+    delete db_;
+    EXPECT_OK(DestroyDB(dbname_, last_options_));
+    kTestComparator = BytewiseComparator();
+  }
+
+  DB* GetDB() { return db_; }
+
+  void SetOwnedComparator(const Comparator* cmp, bool owner = true) {
+    if (owner) {
+      comparator_guard.reset(cmp);
+    } else {
+      comparator_guard.reset();
+    }
+    kTestComparator = cmp;
+    last_options_.comparator = cmp;
+  }
+
+  // Return the current option configuration.
+  Options* GetOptions() { return &last_options_; }
+
+  void DestroyAndReopen() {
+    // Destroy using last options
+    Destroy();
+    ASSERT_OK(TryReopen());
+  }
+
+  void Destroy() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  Status TryReopen() {
+    delete db_;
+    db_ = nullptr;
+    last_options_.create_if_missing = true;
+
+    return DB::Open(last_options_, dbname_, &db_);
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest,
+                        testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest,
+                        testing::Values(kLatestFormatVersion));
+
+TEST_P(ComparatorDBTest, Bytewise) {
+  for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
+    DestroyAndReopen();
+    Random rnd(rand_seed);
+    DoRandomIteraratorTest(GetDB(),
+                           {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd,
+                           8, 100, 3);
+  }
+}
+
+TEST_P(ComparatorDBTest, SimpleSuffixReverseComparator) {
+  SetOwnedComparator(new test::SimpleSuffixReverseComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = kTestComparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    std::vector<std::string> source_prefixes;
+    // Randomly generate 5 prefixes
+    for (int i = 0; i < 5; i++) {
+      source_prefixes.push_back(rnd.HumanReadableString(8));
+    }
+    for (int j = 0; j < 20; j++) {
+      int prefix_index = rnd.Uniform(static_cast<int>(source_prefixes.size()));
+      std::string key = source_prefixes[prefix_index] +
+                        rnd.HumanReadableString(rnd.Uniform(8));
+      source_strings.push_back(key);
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66);
+  }
+}
+
+TEST_P(ComparatorDBTest, Uint64Comparator) {
+  SetOwnedComparator(test::Uint64Comparator(), false /* owner */);
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = kTestComparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+    Random64 rnd64(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      uint64_t r = rnd64.Next();
+      std::string str;
+      str.resize(8);
+      memcpy(&str[0], static_cast<void*>(&r), 8);
+      source_strings.push_back(str);
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+TEST_P(ComparatorDBTest, DoubleComparator) {
+  SetOwnedComparator(new DoubleComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = kTestComparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      uint32_t r = rnd.Next();
+      uint32_t divide_order = rnd.Uniform(8);
+      double to_divide = 1.0;
+      for (uint32_t j = 0; j < divide_order; j++) {
+        to_divide *= 10.0;
+      }
+      source_strings.push_back(std::to_string(r / to_divide));
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+TEST_P(ComparatorDBTest, HashComparator) {
+  SetOwnedComparator(new HashComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = kTestComparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      source_strings.push_back(test::RandomKey(&rnd, 8));
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+TEST_P(ComparatorDBTest, TwoStrComparator) {
+  SetOwnedComparator(new TwoStrComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = kTestComparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      std::string str;
+      uint32_t size1 = rnd.Uniform(8);
+      uint32_t size2 = rnd.Uniform(8);
+      str.append(1, static_cast<char>(size1));
+      str.append(1, static_cast<char>(size2));
+      str.append(test::RandomKey(&rnd, size1));
+      str.append(test::RandomKey(&rnd, size2));
+      source_strings.push_back(str);
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+namespace {
+void VerifyNotSuccessor(const Slice& s, const Slice& t) {
+  auto bc = BytewiseComparator();
+  auto rbc = ReverseBytewiseComparator();
+  ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(s, t));
+  ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t));
+  ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s));
+  ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s));
+}
+
+void VerifySuccessor(const Slice& s, const Slice& t) {
+  auto bc = BytewiseComparator();
+  auto rbc = ReverseBytewiseComparator();
+  ASSERT_TRUE(bc->IsSameLengthImmediateSuccessor(s, t));
+  ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t));
+  ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s));
+  // Should be true but that increases exposure to a design bug in
+  // auto_prefix_mode, so currently set to FALSE
+  ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s));
+}
+
+}  // anonymous namespace
+
+TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) {
+  {
+    // different length
+    Slice s("abcxy");
+    Slice t("abcxyz");
+    VerifyNotSuccessor(s, t);
+  }
+  {
+    Slice s("abcxyz");
+    Slice t("abcxy");
+    VerifyNotSuccessor(s, t);
+  }
+  {
+    // not last byte different
+    Slice s("abc1xyz");
+    Slice t("abc2xyz");
+    VerifyNotSuccessor(s, t);
+  }
+  {
+    // same string
+    Slice s("abcxyz");
+    Slice t("abcxyz");
+    VerifyNotSuccessor(s, t);
+  }
+  {
+    Slice s("abcxy");
+    Slice t("abcxz");
+    VerifySuccessor(s, t);
+  }
+  {
+    const char s_array[] = "\x50\x8a\xac";
+    const char t_array[] = "\x50\x8a\xad";
+    Slice s(s_array);
+    Slice t(t_array);
+    VerifySuccessor(s, t);
+  }
+  {
+    const char s_array[] = "\x50\x8a\xff";
+    const char t_array[] = "\x50\x8b\x00";
+    Slice s(s_array, 3);
+    Slice t(t_array, 3);
+    VerifySuccessor(s, t);
+  }
+  {
+    const char s_array[] = "\x50\x8a\xff\xff";
+    const char t_array[] = "\x50\x8b\x00\x00";
+    Slice s(s_array, 4);
+    Slice t(t_array, 4);
+    VerifySuccessor(s, t);
+  }
+  {
+    const char s_array[] = "\x50\x8a\xff\xff";
+    const char t_array[] = "\x50\x8b\x00\x01";
+    Slice s(s_array, 4);
+    Slice t(t_array, 4);
+    VerifyNotSuccessor(s, t);
+  }
+}
+
+TEST_P(ComparatorDBTest, FindShortestSeparator) {
+  std::string s1 = "abc1xyz";
+  std::string s2 = "abc3xy";
+
+  BytewiseComparator()->FindShortestSeparator(&s1, s2);
+  ASSERT_EQ("abc2", s1);
+
+  s1 = "abc5xyztt";
+
+  ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+  ASSERT_EQ("abc5", s1);
+
+  s1 = "abc3";
+  s2 = "abc2xy";
+  ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+  ASSERT_EQ("abc3", s1);
+
+  s1 = "abc3xyz";
+  s2 = "abc2xy";
+  ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+  ASSERT_EQ("abc3", s1);
+
+  s1 = "abc3xyz";
+  s2 = "abc2";
+  ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+  ASSERT_EQ("abc3", s1);
+
+  std::string old_s1 = s1 = "abc2xy";
+  s2 = "abc2";
+  ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+  ASSERT_TRUE(old_s1 >= s1);
+  ASSERT_TRUE(s1 > s2);
+}
+
+TEST_P(ComparatorDBTest, SeparatorSuccessorRandomizeTest) {
+  // Char list for boundary cases.
+  std::array<unsigned char, 6> char_list{{0, 1, 2, 253, 254, 255}};
+  Random rnd(301);
+
+  for (int attempts = 0; attempts < 1000; attempts++) {
+    uint32_t size1 = rnd.Skewed(4);
+    uint32_t size2;
+
+    if (rnd.OneIn(2)) {
+      // size2 to be random size
+      size2 = rnd.Skewed(4);
+    } else {
+      // size1 is within [-2, +2] of size1
+      int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+      int tmp_size2 = static_cast<int>(size1) + diff;
+      if (tmp_size2 < 0) {
+        tmp_size2 = 0;
+      }
+      size2 = static_cast<uint32_t>(tmp_size2);
+    }
+
+    std::string s1;
+    std::string s2;
+    for (uint32_t i = 0; i < size1; i++) {
+      if (rnd.OneIn(2)) {
+        // Use random byte
+        s1 += static_cast<char>(rnd.Uniform(256));
+      } else {
+        // Use one byte in char_list
+        char c = static_cast<char>(char_list[rnd.Uniform(sizeof(char_list))]);
+        s1 += c;
+      }
+    }
+
+    // First set s2 to be the same as s1, and then modify s2.
+    s2 = s1;
+    s2.resize(size2);
+    // We start from the back of the string
+    if (size2 > 0) {
+      uint32_t pos = size2 - 1;
+      do {
+        if (pos >= size1 || rnd.OneIn(4)) {
+          // For 1/4 chance, use random byte
+          s2[pos] = static_cast<char>(rnd.Uniform(256));
+        } else if (rnd.OneIn(4)) {
+          // In 1/4 chance, stop here.
+          break;
+        } else {
+          // Create a char within [-2, +2] of the matching char of s1.
+          int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+          // char may be signed or unsigned based on platform.
+          int s1_char = static_cast<int>(static_cast<unsigned char>(s1[pos]));
+          int s2_char = s1_char + diff;
+          if (s2_char < 0) {
+            s2_char = 0;
+          }
+          if (s2_char > 255) {
+            s2_char = 255;
+          }
+          s2[pos] = static_cast<char>(s2_char);
+        }
+      } while (pos-- != 0);
+    }
+
+    // Test separators
+    for (int rev = 0; rev < 2; rev++) {
+      if (rev == 1) {
+        // switch s1 and s2
+        std::string t = s1;
+        s1 = s2;
+        s2 = t;
+      }
+      std::string separator = s1;
+      BytewiseComparator()->FindShortestSeparator(&separator, s2);
+      std::string rev_separator = s1;
+      ReverseBytewiseComparator()->FindShortestSeparator(&rev_separator, s2);
+
+      if (s1 == s2) {
+        ASSERT_EQ(s1, separator);
+        ASSERT_EQ(s2, rev_separator);
+      } else if (s1 < s2) {
+        ASSERT_TRUE(s1 <= separator);
+        ASSERT_TRUE(s2 > separator);
+        ASSERT_LE(separator.size(), std::max(s1.size(), s2.size()));
+        ASSERT_EQ(s1, rev_separator);
+      } else {
+        ASSERT_TRUE(s1 >= rev_separator);
+        ASSERT_TRUE(s2 < rev_separator);
+        ASSERT_LE(rev_separator.size(), std::max(s1.size(), s2.size()));
+        ASSERT_EQ(s1, separator);
+      }
+    }
+
+    // Test successors
+    std::string succ = s1;
+    BytewiseComparator()->FindShortSuccessor(&succ);
+    ASSERT_TRUE(succ >= s1);
+
+    succ = s1;
+    ReverseBytewiseComparator()->FindShortSuccessor(&succ);
+    ASSERT_TRUE(succ <= s1);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/convenience.cc b/src/rocksdb/db/convenience.cc
new file mode 100644
index 000000000..6344d356d
--- /dev/null
+++ b/src/rocksdb/db/convenience.cc
@@ -0,0 +1,81 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/convenience.h"
+
+#include "db/db_impl/db_impl.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CancelAllBackgroundWork(DB* db, bool wait) {
+  (static_cast_with_check<DBImpl>(db->GetRootDB()))
+      ->CancelAllBackgroundWork(wait);
+}
+
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+                          const Slice* begin, const Slice* end,
+                          bool include_end) {
+  RangePtr range(begin, end);
+  return DeleteFilesInRanges(db, column_family, &range, 1, include_end);
+}
+
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangePtr* ranges, size_t n, bool include_end) {
+  return (static_cast_with_check<DBImpl>(db->GetRootDB()))
+      ->DeleteFilesInRanges(column_family, ranges, n, include_end);
+}
+
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const std::string& file_path) {
+  return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path);
+}
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const ReadOptions& read_options,
+                             const std::string& file_path,
+                             const SequenceNumber& largest_seqno) {
+  std::unique_ptr<FSRandomAccessFile> file;
+  uint64_t file_size;
+  InternalKeyComparator internal_comparator(options.comparator);
+  ImmutableOptions ioptions(options);
+
+  Status s = ioptions.fs->NewRandomAccessFile(
+      file_path, FileOptions(env_options), &file, nullptr);
+  if (s.ok()) {
+    s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+  } else {
+    return s;
+  }
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(
+          std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */,
+          nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
+          ioptions.rate_limiter.get()));
+  const bool kImmortal = true;
+  auto reader_options = TableReaderOptions(
+      ioptions, options.prefix_extractor, env_options, internal_comparator,
+      false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */,
+      -1 /* level */);
+  reader_options.largest_seqno = largest_seqno;
+  s = ioptions.table_factory->NewTableReader(
+      reader_options, std::move(file_reader), file_size, &table_reader,
+      false /* prefetch_index_and_filter_in_cache */);
+  if (!s.ok()) {
+    return s;
+  }
+  s = table_reader->VerifyChecksum(read_options,
+                                   TableReaderCaller::kUserVerifyChecksum);
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/corruption_test.cc b/src/rocksdb/db/corruption_test.cc
new file mode 100644
index 000000000..8ccac6130
--- /dev/null
+++ b/src/rocksdb/db/corruption_test.cc
@@ -0,0 +1,1587 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/options.h"
+#ifndef ROCKSDB_LITE
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <cinttypes>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/write_batch.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/meta_blocks.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static constexpr int kValueSize = 1000;
+namespace {
+// A wrapper that allows injection of errors.
+class ErrorEnv : public EnvWrapper {
+ public:
+  bool writable_file_error_;
+  int num_writable_file_errors_;
+
+  explicit ErrorEnv(Env* _target)
+      : EnvWrapper(_target),
+        writable_file_error_(false),
+        num_writable_file_errors_(0) {}
+  const char* Name() const override { return "ErrorEnv"; }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 std::unique_ptr<WritableFile>* result,
+                                 const EnvOptions& soptions) override {
+    result->reset();
+    if (writable_file_error_) {
+      ++num_writable_file_errors_;
+      return Status::IOError(fname, "fake error");
+    }
+    return target()->NewWritableFile(fname, result, soptions);
+  }
+};
+}  // anonymous namespace
+class CorruptionTest : public testing::Test {
+ public:
+  std::shared_ptr<Env> env_guard_;
+  ErrorEnv* env_;
+  std::string dbname_;
+  std::shared_ptr<Cache> tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  CorruptionTest() {
+    // If LRU cache shard bit is smaller than 2 (or -1 which will automatically
+    // set it to 0), test SequenceNumberRecovery will fail, likely because of a
+    // bug in recovery code. Keep it 4 for now to make the test passes.
+    tiny_cache_ = NewLRUCache(100, 4);
+    Env* base_env = Env::Default();
+    EXPECT_OK(
+        test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+    EXPECT_NE(base_env, nullptr);
+    env_ = new ErrorEnv(base_env);
+    options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+    options_.env = env_;
+    dbname_ = test::PerThreadDBPath(env_, "corruption_test");
+    Status s = DestroyDB(dbname_, options_);
+    EXPECT_OK(s);
+
+    db_ = nullptr;
+    options_.create_if_missing = true;
+    BlockBasedTableOptions table_options;
+    table_options.block_size_deviation = 0;  // make unit test pass for now
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen();
+    options_.create_if_missing = false;
+  }
+
+  ~CorruptionTest() override {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->LoadDependency({});
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    delete db_;
+    db_ = nullptr;
+    if (getenv("KEEP_DB")) {
+      fprintf(stdout, "db is still at %s\n", dbname_.c_str());
+    } else {
+      Options opts;
+      opts.env = env_->target();
+      EXPECT_OK(DestroyDB(dbname_, opts));
+    }
+    delete env_;
+  }
+
+  void CloseDb() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opt = (options ? *options : options_);
+    if (opt.env == Options().env) {
+      // If env is not overridden, replace it with ErrorEnv.
+      // Otherwise, the test already uses a non-default Env.
+      opt.env = env_;
+    }
+    opt.arena_block_size = 4096;
+    BlockBasedTableOptions table_options;
+    table_options.block_cache = tiny_cache_;
+    table_options.block_size_deviation = 0;
+    opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    return DB::Open(opt, dbname_, &db_);
+  }
+
+  void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
+
+  void RepairDB() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_));
+  }
+
+  void Build(int n, int start, int flush_every) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = 0; i < n; i++) {
+      if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+        DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+        ASSERT_OK(dbi->TEST_FlushMemTable());
+      }
+      // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+      Slice key = Key(i + start, &key_space);
+      batch.Clear();
+      ASSERT_OK(batch.Put(key, Value(i + start, &value_space)));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    }
+  }
+
+  void Build(int n, int flush_every = 0) { Build(n, 0, flush_every); }
+
+  void Check(int min_expected, int max_expected) {
+    uint64_t next_expected = 0;
+    uint64_t missed = 0;
+    int bad_keys = 0;
+    int bad_values = 0;
+    int correct = 0;
+    std::string value_space;
+    // Do not verify checksums. If we verify checksums then the
+    // db itself will raise errors because data is corrupted.
+    // Instead, we want the reads to be successful and this test
+    // will detect whether the appropriate corruptions have
+    // occurred.
+    Iterator* iter = db_->NewIterator(ReadOptions(false, true));
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      uint64_t key;
+      Slice in(iter->key());
+      if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
+          key < next_expected) {
+        bad_keys++;
+        continue;
+      }
+      missed += (key - next_expected);
+      next_expected = key + 1;
+      if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+        bad_values++;
+      } else {
+        correct++;
+      }
+    }
+    iter->status().PermitUncheckedError();
+    delete iter;
+
+    fprintf(
+        stderr,
+        "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n",
+        min_expected, max_expected, correct, bad_keys, bad_values,
+        static_cast<unsigned long long>(missed));
+    ASSERT_LE(min_expected, correct);
+    ASSERT_GE(max_expected, correct);
+  }
+
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    uint64_t number;
+    FileType type;
+    std::string fname;
+    int picked_number = -1;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) && type == filetype &&
+          static_cast<int>(number) > picked_number) {  // Pick latest file
+        fname = dbname_ + "/" + filenames[i];
+        picked_number = static_cast<int>(number);
+      }
+    }
+    ASSERT_TRUE(!fname.empty()) << filetype;
+
+    ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt));
+  }
+
+  // corrupts exactly one file at level `level`. if no file found at level,
+  // asserts
+  void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    for (const auto& m : metadata) {
+      if (m.level == level) {
+        ASSERT_OK(test::CorruptFile(env_, dbname_ + "/" + m.name, offset,
+                                    bytes_to_corrupt));
+        return;
+      }
+    }
+    FAIL() << "no file found at level";
+  }
+
+  int Property(const std::string& name) {
+    std::string property;
+    int result;
+    if (db_->GetProperty(name, &property) &&
+        sscanf(property.c_str(), "%d", &result) == 1) {
+      return result;
+    } else {
+      return -1;
+    }
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) {
+    if (k == 0) {
+      // Ugh.  Random seed of 0 used to produce no entropy.  This code
+      // preserves the implementation that was in place when all of the
+      // magic values in this file were picked.
+      *storage = std::string(kValueSize, ' ');
+    } else {
+      Random r(k);
+      *storage = r.RandomString(kValueSize);
+    }
+    return Slice(*storage);
+  }
+
+  void GetSortedWalFiles(std::vector<uint64_t>& file_nums) {
+    std::vector<std::string> tmp_files;
+    ASSERT_OK(env_->GetChildren(dbname_, &tmp_files));
+    FileType type = kWalFile;
+    for (const auto& file : tmp_files) {
+      uint64_t number = 0;
+      if (ParseFileName(file, &number, &type) && type == kWalFile) {
+        file_nums.push_back(number);
+      }
+    }
+    std::sort(file_nums.begin(), file_nums.end());
+  }
+
+  void CorruptFileWithTruncation(FileType file, uint64_t number,
+                                 uint64_t bytes_to_truncate = 0) {
+    std::string path;
+    switch (file) {
+      case FileType::kWalFile:
+        path = LogFileName(dbname_, number);
+        break;
+      // TODO: Add other file types as this method is being used for those file
+      // types.
+      default:
+        return;
+    }
+    uint64_t old_size = 0;
+    ASSERT_OK(env_->GetFileSize(path, &old_size));
+    assert(old_size > bytes_to_truncate);
+    uint64_t new_size = old_size - bytes_to_truncate;
+    // If bytes_to_truncate == 0, it will do full truncation.
+    if (bytes_to_truncate == 0) {
+      new_size = 0;
+    }
+    ASSERT_OK(test::TruncateFile(env_, path, new_size));
+  }
+};
+
+TEST_F(CorruptionTest, Recovery) {
+  Build(100);
+  Check(100, 100);
+#ifdef OS_WIN
+  // On Wndows OS Disk cache does not behave properly
+  // We do not call FlushBuffers on every Flush. If we do not close
+  // the log file prior to the corruption we end up with the first
+  // block not corrupted but only the second. However, under the debugger
+  // things work just fine but never pass when running normally
+  // For that reason people may want to run with unbuffered I/O. That option
+  // is not available for WAL though.
+  CloseDb();
+#endif
+  Corrupt(kWalFile, 19, 1);  // WriteBatch tag for first record
+  Corrupt(kWalFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
+  ASSERT_TRUE(!TryReopen().ok());
+  options_.paranoid_checks = false;
+  Reopen(&options_);
+
+  // The 64 records in the first two log blocks are completely lost.
+  Check(36, 36);
+}
+
+TEST_F(CorruptionTest, PostPITRCorruptionWALsRetained) {
+  // Repro for bug where WALs following the point-in-time recovery were not
+  // retained leading to the next recovery failing.
+  CloseDb();
+
+  options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+  const std::string test_cf_name = "test_cf";
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+  cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions());
+
+  uint64_t log_num;
+  {
+    options_.create_missing_column_families = true;
+    std::vector<ColumnFamilyHandle*> cfhs;
+    ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v"));
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k", "v"));
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2"));
+    std::vector<uint64_t> file_nums;
+    GetSortedWalFiles(file_nums);
+    log_num = file_nums.back();
+    for (auto* cfh : cfhs) {
+      delete cfh;
+    }
+    CloseDb();
+  }
+
+  CorruptFileWithTruncation(FileType::kWalFile, log_num,
+                            /*bytes_to_truncate=*/1);
+
+  {
+    // Recover "k" -> "v" for both CFs. "k2" -> "v2" is lost due to truncation.
+    options_.avoid_flush_during_recovery = true;
+    std::vector<ColumnFamilyHandle*> cfhs;
+    ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+
+    // Flush one but not both CFs and write some data so there's a seqno gap
+    // between the PITR corruption and the next DB session's first WAL.
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k2", "v2"));
+    ASSERT_OK(db_->Flush(FlushOptions(), cfhs[1]));
+
+    for (auto* cfh : cfhs) {
+      delete cfh;
+    }
+    CloseDb();
+  }
+
+  // With the bug, this DB open would remove the WALs following the PITR
+  // corruption. Then, the next recovery would fail.
+  for (int i = 0; i < 2; ++i) {
+    std::vector<ColumnFamilyHandle*> cfhs;
+    ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+
+    for (auto* cfh : cfhs) {
+      delete cfh;
+    }
+    CloseDb();
+  }
+}
+
+TEST_F(CorruptionTest, RecoverWriteError) {
+  env_->writable_file_error_ = true;
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+}
+
+TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
+  // Do enough writing to force minor compaction
+  env_->writable_file_error_ = true;
+  const int num =
+      static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
+  std::string value_storage;
+  Status s;
+  bool failed = false;
+  for (int i = 0; i < num; i++) {
+    WriteBatch batch;
+    ASSERT_OK(batch.Put("a", Value(100, &value_storage)));
+    s = db_->Write(WriteOptions(), &batch);
+    if (!s.ok()) {
+      failed = true;
+    }
+    ASSERT_TRUE(!failed || !s.ok());
+  }
+  ASSERT_TRUE(!s.ok());
+  ASSERT_GE(env_->num_writable_file_errors_, 1);
+  env_->writable_file_error_ = false;
+  Reopen();
+}
+
+TEST_F(CorruptionTest, TableFile) {
+  Build(100);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(99, 99);
+  ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, VerifyChecksumReadahead) {
+  Options options;
+  SpecialEnv senv(env_->target());
+  options.env = &senv;
+  // Disable block cache as we are going to check checksum for
+  // the same file twice and measure number of reads.
+  BlockBasedTableOptions table_options_no_bc;
+  table_options_no_bc.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc));
+
+  Reopen(&options);
+
+  Build(10000);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
+
+  senv.count_random_reads_ = true;
+  senv.random_read_counter_.Reset();
+  ASSERT_OK(dbi->VerifyChecksum());
+
+  // Make sure the counter is enabled.
+  ASSERT_GT(senv.random_read_counter_.Read(), 0);
+
+  // The SST file is about 10MB. Default readahead size is 256KB.
+  // Give a conservative 20 reads for metadata blocks, The number
+  // of random reads should be within 10 MB / 256KB + 20 = 60.
+  ASSERT_LT(senv.random_read_counter_.Read(), 60);
+
+  senv.random_read_bytes_counter_ = 0;
+  ReadOptions ro;
+  ro.readahead_size = size_t{32 * 1024};
+  ASSERT_OK(dbi->VerifyChecksum(ro));
+  // The SST file is about 10MB. We set readahead size to 32KB.
+  // Give 0 to 20 reads for metadata blocks, and allow real read
+  // to range from 24KB to 48KB. The lower bound would be:
+  //   10MB / 48KB + 0 = 213
+  // The higher bound is
+  //   10MB / 24KB + 20 = 447.
+  ASSERT_GE(senv.random_read_counter_.Read(), 213);
+  ASSERT_LE(senv.random_read_counter_.Read(), 447);
+
+  // Test readahead shouldn't break mmap mode (where it should be
+  // disabled).
+  options.allow_mmap_reads = true;
+  Reopen(&options);
+  dbi = static_cast<DBImpl*>(db_);
+  ASSERT_OK(dbi->VerifyChecksum(ro));
+
+  CloseDb();
+}
+
+TEST_F(CorruptionTest, TableFileIndexData) {
+  Options options;
+  // very big, we'll trigger flushes manually
+  options.write_buffer_size = 100 * 1024 * 1024;
+  Reopen(&options);
+  // build 2 tables, flush at 5000
+  Build(10000, 5000);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  // corrupt an index block of an entire file
+  Corrupt(kTableFile, -2000, 500);
+  options.paranoid_checks = false;
+  Reopen(&options);
+  dbi = static_cast_with_check<DBImpl>(db_);
+  // one full file may be readable, since only one was corrupted
+  // the other file should be fully non-readable, since index was corrupted
+  Check(0, 5000);
+  ASSERT_NOK(dbi->VerifyChecksum());
+
+  // In paranoid mode, the db cannot be opened due to the corrupted file.
+  ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, MissingDescriptor) {
+  Build(1000);
+  RepairDB();
+  Reopen();
+  Check(1000, 1000);
+}
+
+TEST_F(CorruptionTest, SequenceNumberRecovery) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v5", v);
+  // Write something.  If sequence number was not recovered properly,
+  // it will be hidden by an earlier write.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+  Reopen();
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+}
+
+TEST_F(CorruptionTest, CorruptedDescriptor) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(
+      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+
+  Corrupt(kDescriptorFile, 0, 1000);
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("hello", v);
+}
+
+TEST_F(CorruptionTest, CompactionInputError) {
+  Options options;
+  options.env = env_;
+  Reopen(&options);
+  Build(10);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+  ASSERT_NOK(dbi->VerifyChecksum());
+
+  // Force compactions by writing lots of values
+  Build(10000);
+  Check(10000, 10000);
+  ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
+  Options options;
+  options.env = env_;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 131072;
+  options.max_write_buffer_number = 2;
+  Reopen(&options);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+
+  // Fill levels >= 1
+  for (int level = 1; level < dbi->NumberLevels(); level++) {
+    ASSERT_OK(dbi->Put(WriteOptions(), "", "begin"));
+    ASSERT_OK(dbi->Put(WriteOptions(), "~", "end"));
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
+         ++comp_level) {
+      ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr));
+    }
+  }
+
+  Reopen(&options);
+
+  dbi = static_cast_with_check<DBImpl>(db_);
+  Build(10);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbi->TEST_WaitForCompact());
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
+
+  CorruptTableFileAtLevel(0, 100, 1);
+  Check(9, 9);
+  ASSERT_NOK(dbi->VerifyChecksum());
+
+  // Write must eventually fail because of corrupted table
+  Status s;
+  std::string tmp1, tmp2;
+  bool failed = false;
+  for (int i = 0; i < 10000; i++) {
+    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+    if (!s.ok()) {
+      failed = true;
+    }
+    // if one write failed, every subsequent write must fail, too
+    ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
+  }
+  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST_F(CorruptionTest, UnrelatedKeys) {
+  Build(10);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  Corrupt(kTableFile, 100, 1);
+  ASSERT_NOK(dbi->VerifyChecksum());
+
+  std::string tmp1, tmp2;
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+TEST_F(CorruptionTest, RangeDeletionCorrupted) {
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(static_cast<size_t>(1), metadata.size());
+  std::string filename = dbname_ + metadata[0].name;
+
+  FileOptions file_opts;
+  const auto& fs = options_.env->GetFileSystem();
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  ASSERT_OK(RandomAccessFileReader::Create(fs, filename, file_opts,
+                                           &file_reader, nullptr));
+
+  uint64_t file_size;
+  ASSERT_OK(
+      fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr));
+
+  BlockHandle range_del_handle;
+  ASSERT_OK(FindMetaBlockInFile(
+      file_reader.get(), file_size, kBlockBasedTableMagicNumber,
+      ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle));
+
+  ASSERT_OK(TryReopen());
+  ASSERT_OK(test::CorruptFile(env_, filename,
+                              static_cast<int>(range_del_handle.offset()), 1));
+  ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, FileSystemStateCorrupted) {
+  for (int iter = 0; iter < 2; ++iter) {
+    Options options;
+    options.env = env_;
+    options.paranoid_checks = true;
+    options.create_if_missing = true;
+    Reopen(&options);
+    Build(10);
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+    std::vector<LiveFileMetaData> metadata;
+    dbi->GetLiveFilesMetaData(&metadata);
+    ASSERT_GT(metadata.size(), 0);
+    std::string filename = dbname_ + metadata[0].name;
+
+    delete db_;
+    db_ = nullptr;
+
+    if (iter == 0) {  // corrupt file size
+      std::unique_ptr<WritableFile> file;
+      ASSERT_OK(env_->NewWritableFile(filename, &file, EnvOptions()));
+      ASSERT_OK(file->Append(Slice("corrupted sst")));
+      file.reset();
+      Status x = TryReopen(&options);
+      ASSERT_TRUE(x.IsCorruption());
+    } else {  // delete the file
+      ASSERT_OK(env_->DeleteFile(filename));
+      Status x = TryReopen(&options);
+      ASSERT_TRUE(x.IsCorruption());
+    }
+
+    ASSERT_OK(DestroyDB(dbname_, options_));
+  }
+}
+
+static const auto& corruption_modes = {
+    mock::MockTableFactory::kCorruptNone, mock::MockTableFactory::kCorruptKey,
+    mock::MockTableFactory::kCorruptValue,
+    mock::MockTableFactory::kCorruptReorderKey};
+
+TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) {
+  Options options;
+  options.env = env_;
+  options.check_flush_compaction_key_order = false;
+  options.paranoid_file_checks = true;
+  options.create_if_missing = true;
+  Status s;
+  for (const auto& mode : corruption_modes) {
+    delete db_;
+    db_ = nullptr;
+    s = DestroyDB(dbname_, options);
+    ASSERT_OK(s);
+    std::shared_ptr<mock::MockTableFactory> mock =
+        std::make_shared<mock::MockTableFactory>();
+    options.table_factory = mock;
+    mock->SetCorruptionMode(mode);
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+    Build(10);
+    s = db_->Flush(FlushOptions());
+    if (mode == mock::MockTableFactory::kCorruptNone) {
+      ASSERT_OK(s);
+    } else {
+      ASSERT_NOK(s);
+    }
+  }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
+  Options options;
+  options.env = env_;
+  options.paranoid_file_checks = true;
+  options.create_if_missing = true;
+  options.check_flush_compaction_key_order = false;
+  Status s;
+  for (const auto& mode : corruption_modes) {
+    delete db_;
+    db_ = nullptr;
+    s = DestroyDB(dbname_, options);
+    ASSERT_OK(s);
+    std::shared_ptr<mock::MockTableFactory> mock =
+        std::make_shared<mock::MockTableFactory>();
+    options.table_factory = mock;
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+    Build(100, 2);
+    // ASSERT_OK(db_->Flush(FlushOptions()));
+    DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    mock->SetCorruptionMode(mode);
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    s = dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+    if (mode == mock::MockTableFactory::kCorruptNone) {
+      ASSERT_OK(s);
+    } else {
+      ASSERT_NOK(s);
+    }
+  }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) {
+  Options options;
+  options.env = env_;
+  options.check_flush_compaction_key_order = false;
+  options.paranoid_file_checks = true;
+  options.create_if_missing = true;
+  for (bool do_flush : {true, false}) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options));
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    std::string start, end;
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(3, &start), Key(7, &end)));
+    auto snap = db_->GetSnapshot();
+    ASSERT_NE(snap, nullptr);
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(8, &start), Key(9, &end)));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(2, &start), Key(5, &end)));
+    Build(10);
+    if (do_flush) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    } else {
+      DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+      ASSERT_OK(dbi->TEST_FlushMemTable());
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(
+          dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+    }
+    db_->ReleaseSnapshot(snap);
+  }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) {
+  Options options;
+  options.env = env_;
+  options.check_flush_compaction_key_order = false;
+  options.paranoid_file_checks = true;
+  options.create_if_missing = true;
+  for (bool do_flush : {true, false}) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options));
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+    Build(10, 0, 0);
+    std::string start, end;
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(5, &start), Key(15, &end)));
+    auto snap = db_->GetSnapshot();
+    ASSERT_NE(snap, nullptr);
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(8, &start), Key(9, &end)));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(12, &start), Key(17, &end)));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(2, &start), Key(4, &end)));
+    Build(10, 10, 0);
+    if (do_flush) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    } else {
+      DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+      ASSERT_OK(dbi->TEST_FlushMemTable());
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(
+          dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+    }
+    db_->ReleaseSnapshot(snap);
+  }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) {
+  Options options;
+  options.env = env_;
+  options.check_flush_compaction_key_order = false;
+  options.paranoid_file_checks = true;
+  options.create_if_missing = true;
+  for (bool do_flush : {true, false}) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options));
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+    std::string start, end;
+    Build(10);
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(3, &start), Key(7, &end)));
+    auto snap = db_->GetSnapshot();
+    ASSERT_NE(snap, nullptr);
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(6, &start), Key(8, &end)));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(2, &start), Key(5, &end)));
+    if (do_flush) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    } else {
+      DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+      ASSERT_OK(dbi->TEST_FlushMemTable());
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(
+          dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+    }
+    db_->ReleaseSnapshot(snap);
+  }
+}
+
+TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.allow_data_in_errors = true;
+  auto mode = mock::MockTableFactory::kCorruptKey;
+  delete db_;
+  db_ = nullptr;
+  ASSERT_OK(DestroyDB(dbname_, options));
+
+  std::shared_ptr<mock::MockTableFactory> mock =
+      std::make_shared<mock::MockTableFactory>();
+  mock->SetCorruptionMode(mode);
+  options.table_factory = mock;
+
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+  assert(db_ != nullptr);  // suppress false clang-analyze report
+  Build(100, 2);
+
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  Status s =
+      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(CorruptionTest, CompactionKeyOrderCheck) {
+  Options options;
+  options.env = env_;
+  options.paranoid_file_checks = false;
+  options.create_if_missing = true;
+  options.check_flush_compaction_key_order = false;
+  delete db_;
+  db_ = nullptr;
+  ASSERT_OK(DestroyDB(dbname_, options));
+  std::shared_ptr<mock::MockTableFactory> mock =
+      std::make_shared<mock::MockTableFactory>();
+  options.table_factory = mock;
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+  assert(db_ != nullptr);  // suppress false clang-analyze report
+  mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey);
+  Build(100, 2);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone);
+  ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}}));
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_NOK(
+      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+}
+
+TEST_F(CorruptionTest, FlushKeyOrderCheck) {
+  Options options;
+  options.env = env_;
+  options.paranoid_file_checks = false;
+  options.create_if_missing = true;
+  ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}}));
+
+  ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1"));
+
+  int cnt = 0;
+  // Generate some out of order keys from the memtable
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTableIterator::Next:0", [&](void* arg) {
+        MemTableRep::Iterator* mem_iter =
+            static_cast<MemTableRep::Iterator*>(arg);
+        if (++cnt == 3) {
+          mem_iter->Prev();
+          mem_iter->Prev();
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  Status s = static_cast_with_check<DBImpl>(db_)->TEST_FlushMemTable();
+  ASSERT_NOK(s);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(CorruptionTest, DisableKeyOrderCheck) {
+  ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "false"}}));
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "OutputValidator::Add:order_check",
+      [&](void* /*arg*/) { ASSERT_TRUE(false); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1"));
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1"));
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(
+      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(CorruptionTest, VerifyWholeTableChecksum) {
+  CloseDb();
+  Options options;
+  options.env = env_;
+  ASSERT_OK(DestroyDB(dbname_, options));
+  options.create_if_missing = true;
+  options.file_checksum_gen_factory =
+      ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory();
+  Reopen(&options);
+
+  Build(10, 5);
+
+  ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+  CloseDb();
+
+  // Corrupt the first byte of each table file, this must be data block.
+  Corrupt(kTableFile, 0, 1);
+
+  ASSERT_OK(TryReopen(&options));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  int count{0};
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) {
+        auto* s = reinterpret_cast<Status*>(arg);
+        ASSERT_NE(s, nullptr);
+        ++count;
+        ASSERT_NOK(*s);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption());
+  ASSERT_EQ(1, count);
+}
+
+class CrashDuringRecoveryWithCorruptionTest
+    : public CorruptionTest,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  explicit CrashDuringRecoveryWithCorruptionTest()
+      : CorruptionTest(),
+        avoid_flush_during_recovery_(std::get<0>(GetParam())),
+        track_and_verify_wals_in_manifest_(std::get<1>(GetParam())) {}
+
+ protected:
+  const bool avoid_flush_during_recovery_;
+  const bool track_and_verify_wals_in_manifest_;
+};
+
+INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest,
+                        ::testing::Values(std::make_tuple(true, false),
+                                          std::make_tuple(false, false),
+                                          std::make_tuple(true, true),
+                                          std::make_tuple(false, true)));
+
+// In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB
+// won't flush the data from WAL to L0 for all column families if possible. As a
+// result, not all column families can increase their log_numbers, and
+// min_log_number_to_keep won't change.
+// It may prematurely persist a new MANIFEST even before we can declare the DB
+// is in consistent state after recovery (this is when the new WAL is synced)
+// and advances log_numbers for some column families.
+//
+// If there is power failure before we sync the new WAL, we will end up in
+// a situation in which after persisting the MANIFEST, RocksDB will see some
+// column families' log_numbers larger than the corrupted wal, and
+// "Column family inconsistency: SST file contains data beyond the point of
+// corruption" error will be hit, causing recovery to fail.
+//
+// After adding the fix, only after new WAL is synced, RocksDB persist a new
+// MANIFEST with column families to ensure RocksDB is in consistent state.
+// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
+// synced immediately afterwards. The sequence number of the sentinel
+// WriteBatch will be the next sequence number immediately after the largest
+// sequence number recovered from previous WALs and MANIFEST because of which DB
+// will be in consistent state.
+// If a future recovery starts from the new MANIFEST, then it means the new WAL
+// is successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
+// If future recovery starts from the old MANIFEST, it means the writing the new
+// MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
+  CloseDb();
+  Options options;
+  options.track_and_verify_wals_in_manifest =
+      track_and_verify_wals_in_manifest_;
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  options.avoid_flush_during_recovery = false;
+  options.env = env_;
+  ASSERT_OK(DestroyDB(dbname_, options));
+  options.create_if_missing = true;
+  options.max_write_buffer_number = 8;
+
+  Reopen(&options);
+  Status s;
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyHandle* cfh = nullptr;
+  s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+  ASSERT_OK(s);
+  delete cfh;
+  CloseDb();
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, options);
+  std::vector<ColumnFamilyHandle*> handles;
+
+  // 1. Open and populate the DB. Write and flush default_cf several times to
+  // advance wal number so that some column families have advanced log_number
+  // while other don't.
+  {
+    ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+    auto* dbimpl = static_cast_with_check<DBImpl>(db_);
+    assert(dbimpl);
+
+    // Write one key to test_cf.
+    ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
+    ASSERT_OK(db_->Flush(FlushOptions(), handles[1]));
+
+    // Write to default_cf and flush this cf several times to advance wal
+    // number. TEST_SwitchMemtable makes sure WALs are not synced and test can
+    // corrupt un-sync WAL.
+    for (int i = 0; i < 2; ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
+                         "value" + std::to_string(i)));
+      ASSERT_OK(dbimpl->TEST_SwitchMemtable());
+    }
+
+    for (auto* h : handles) {
+      delete h;
+    }
+    handles.clear();
+    CloseDb();
+  }
+
+  // 2. Corrupt second last un-syned wal file to emulate power reset which
+  // caused the DB to lose the un-synced WAL.
+  {
+    std::vector<uint64_t> file_nums;
+    GetSortedWalFiles(file_nums);
+    size_t size = file_nums.size();
+    assert(size >= 2);
+    uint64_t log_num = file_nums[size - 2];
+    CorruptFileWithTruncation(FileType::kWalFile, log_num,
+                              /*bytes_to_truncate=*/8);
+  }
+
+  // 3. After first crash reopen the DB which contains corrupted WAL. Default
+  // family has higher log number than corrupted wal number.
+  //
+  // Case1: If avoid_flush_during_recovery = true, RocksDB won't flush the data
+  // from WAL to L0 for all column families (test_cf_name in this case). As a
+  // result, not all column families can increase their log_numbers, and
+  // min_log_number_to_keep won't change.
+  //
+  // Case2: If avoid_flush_during_recovery = false, all column families have
+  // flushed their data from WAL to L0 during recovery, and none of them will
+  // ever need to read the WALs again.
+
+  // 4. Fault is injected to fail the recovery.
+  {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
+          auto* tmp_s = reinterpret_cast<Status*>(arg);
+          assert(tmp_s);
+          *tmp_s = Status::IOError("Injected");
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    handles.clear();
+    options.avoid_flush_during_recovery = true;
+    s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
+    ASSERT_TRUE(s.IsIOError());
+    ASSERT_EQ("IO error: Injected", s.ToString());
+    for (auto* h : handles) {
+      delete h;
+    }
+    CloseDb();
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+
+  // 5. After second crash reopen the db with second corruption. Default family
+  // has higher log number than corrupted wal number.
+  //
+  // Case1: If avoid_flush_during_recovery = true, we persist a new
+  // MANIFEST with advanced log_numbers for some column families only after
+  // syncing the WAL. So during second crash, RocksDB will skip the corrupted
+  // WAL files as they have been moved to different folder. Since newly synced
+  // WAL file's sequence number (sentinel WriteBatch) will be the next
+  // sequence number immediately after the largest sequence number recovered
+  // from previous WALs and MANIFEST, db will be in consistent state and opens
+  // successfully.
+  //
+  // Case2: If avoid_flush_during_recovery = false, the corrupted WAL is below
+  // this number. So during a second crash after persisting the new MANIFEST,
+  // RocksDB will skip the corrupted WAL(s) because they are all below this
+  // bound. Therefore, we won't hit the "column family inconsistency" error
+  // message.
+  {
+    options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
+    ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+    // Verify that data is not lost.
+    {
+      std::string v;
+      ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
+      ASSERT_EQ("dontcare", v);
+
+      v.clear();
+      ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(0), &v));
+      ASSERT_EQ("value" + std::to_string(0), v);
+
+      // Since  it's corrupting second last wal, below key is not found.
+      v.clear();
+      ASSERT_EQ(db_->Get(ReadOptions(), "key" + std::to_string(1), &v),
+                Status::NotFound());
+    }
+
+    for (auto* h : handles) {
+      delete h;
+    }
+    handles.clear();
+    CloseDb();
+  }
+}
+
+// In case of TransactionDB, it enables two-phase-commit. The prepare section of
+// an uncommitted transaction always need to be kept. Even if we perform flush
+// during recovery, we may still need to hold an old WAL. The
+// min_log_number_to_keep won't change, and "Column family inconsistency: SST
+// file contains data beyond the point of corruption" error will be hit, causing
+// recovery to fail.
+//
+// After adding the fix, only after new WAL is synced, RocksDB persist a new
+// MANIFEST with column families to ensure RocksDB is in consistent state.
+// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
+// synced immediately afterwards. The sequence number of the sentinel
+// WriteBatch will be the next sequence number immediately after the largest
+// sequence number recovered from previous WALs and MANIFEST because of which DB
+// will be in consistent state.
+// If a future recovery starts from the new MANIFEST, then it means the new WAL
+// is successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
+// If future recovery starts from the old MANIFEST, it means the writing the new
+// MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
+  CloseDb();
+  Options options;
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  options.track_and_verify_wals_in_manifest =
+      track_and_verify_wals_in_manifest_;
+  options.avoid_flush_during_recovery = false;
+  options.env = env_;
+  ASSERT_OK(DestroyDB(dbname_, options));
+  options.create_if_missing = true;
+  options.max_write_buffer_number = 3;
+  Reopen(&options);
+
+  // Create cf test_cf_name.
+  ColumnFamilyHandle* cfh = nullptr;
+  const std::string test_cf_name = "test_cf";
+  Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+  ASSERT_OK(s);
+  delete cfh;
+  CloseDb();
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, options);
+  std::vector<ColumnFamilyHandle*> handles;
+
+  TransactionDB* txn_db = nullptr;
+  TransactionDBOptions txn_db_opts;
+
+  // 1. Open and populate the DB. Write and flush default_cf several times to
+  // advance wal number so that some column families have advanced log_number
+  // while other don't.
+  {
+    ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
+                                  &handles, &txn_db));
+
+    auto* txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
+    // Put cf1
+    ASSERT_OK(txn->Put(handles[1], "foo", "value"));
+    ASSERT_OK(txn->SetName("txn0"));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn_db->Flush(FlushOptions()));
+
+    delete txn;
+    txn = nullptr;
+
+    auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB());
+    assert(dbimpl);
+
+    // Put and flush cf0
+    for (int i = 0; i < 2; ++i) {
+      ASSERT_OK(txn_db->Put(WriteOptions(), "key" + std::to_string(i),
+                            "value" + std::to_string(i)));
+      ASSERT_OK(dbimpl->TEST_SwitchMemtable());
+    }
+
+    // Put cf1
+    txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
+    ASSERT_OK(txn->Put(handles[1], "foo1", "value1"));
+    ASSERT_OK(txn->Commit());
+
+    delete txn;
+    txn = nullptr;
+
+    for (auto* h : handles) {
+      delete h;
+    }
+    handles.clear();
+    delete txn_db;
+  }
+
+  // 2. Corrupt second last wal to emulate power reset which caused the DB to
+  // lose the un-synced WAL.
+  {
+    std::vector<uint64_t> file_nums;
+    GetSortedWalFiles(file_nums);
+    size_t size = file_nums.size();
+    assert(size >= 2);
+    uint64_t log_num = file_nums[size - 2];
+    CorruptFileWithTruncation(FileType::kWalFile, log_num,
+                              /*bytes_to_truncate=*/8);
+  }
+
+  // 3. After first crash reopen the DB which contains corrupted WAL. Default
+  // family has higher log number than corrupted wal number. There may be old
+  // WAL files that it must not delete because they can contain data of
+  // uncommitted transactions. As a result, min_log_number_to_keep won't change.
+
+  {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::Open::BeforeSyncWAL", [&](void* arg) {
+          auto* tmp_s = reinterpret_cast<Status*>(arg);
+          assert(tmp_s);
+          *tmp_s = Status::IOError("Injected");
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    handles.clear();
+    s = TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles,
+                            &txn_db);
+    ASSERT_TRUE(s.IsIOError());
+    ASSERT_EQ("IO error: Injected", s.ToString());
+    for (auto* h : handles) {
+      delete h;
+    }
+    CloseDb();
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+
+  // 4. Corrupt max_wal_num.
+  {
+    std::vector<uint64_t> file_nums;
+    GetSortedWalFiles(file_nums);
+    size_t size = file_nums.size();
+    uint64_t log_num = file_nums[size - 1];
+    CorruptFileWithTruncation(FileType::kWalFile, log_num);
+  }
+
+  // 5. After second crash reopen the db with second corruption. Default family
+  // has higher log number than corrupted wal number.
+  // We persist a new MANIFEST with advanced log_numbers for some column
+  // families only after syncing the WAL. So during second crash, RocksDB will
+  // skip the corrupted WAL files as they have been moved to different folder.
+  // Since newly synced WAL file's sequence number (sentinel WriteBatch) will be
+  // the next sequence number immediately after the largest sequence number
+  // recovered from previous WALs and MANIFEST, db will be in consistent state
+  // and opens successfully.
+  {
+    ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
+                                  &handles, &txn_db));
+
+    // Verify that data is not lost.
+    {
+      std::string v;
+      // Key not visible since it's not committed.
+      ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo", &v),
+                Status::NotFound());
+
+      v.clear();
+      ASSERT_OK(txn_db->Get(ReadOptions(), "key" + std::to_string(0), &v));
+      ASSERT_EQ("value" + std::to_string(0), v);
+
+      // Last WAL is corrupted which contains two keys below.
+      v.clear();
+      ASSERT_EQ(txn_db->Get(ReadOptions(), "key" + std::to_string(1), &v),
+                Status::NotFound());
+      v.clear();
+      ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo1", &v),
+                Status::NotFound());
+    }
+
+    for (auto* h : handles) {
+      delete h;
+    }
+    delete txn_db;
+  }
+}
+
+// This test is similar to
+// CrashDuringRecoveryWithCorruptionTest.CrashDuringRecovery except it calls
+// flush and corrupts Last WAL. It calls flush to sync some of the WALs and
+// remaining are unsyned one of which is then corrupted to simulate crash.
+//
+// In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB
+// won't flush the data from WAL to L0 for all column families if possible. As a
+// result, not all column families can increase their log_numbers, and
+// min_log_number_to_keep won't change.
+// It may prematurely persist a new MANIFEST even before we can declare the DB
+// is in consistent state after recovery (this is when the new WAL is synced)
+// and advances log_numbers for some column families.
+//
+// If there is power failure before we sync the new WAL, we will end up in
+// a situation in which after persisting the MANIFEST, RocksDB will see some
+// column families' log_numbers larger than the corrupted wal, and
+// "Column family inconsistency: SST file contains data beyond the point of
+// corruption" error will be hit, causing recovery to fail.
+//
+// After adding the fix, only after new WAL is synced, RocksDB persist a new
+// MANIFEST with column families to ensure RocksDB is in consistent state.
+// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
+// synced immediately afterwards. The sequence number of the sentinel
+// WriteBatch will be the next sequence number immediately after the largest
+// sequence number recovered from previous WALs and MANIFEST because of which DB
+// will be in consistent state.
+// If a future recovery starts from the new MANIFEST, then it means the new WAL
+// is successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
+// If future recovery starts from the old MANIFEST, it means the writing the new
+// MANIFEST failed. It won't have the "SST ahead of WAL" error.
+
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecoveryWithFlush) {
+  CloseDb();
+  Options options;
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  options.avoid_flush_during_recovery = false;
+  options.env = env_;
+  options.create_if_missing = true;
+
+  ASSERT_OK(DestroyDB(dbname_, options));
+  Reopen(&options);
+
+  ColumnFamilyHandle* cfh = nullptr;
+  const std::string test_cf_name = "test_cf";
+  Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+  ASSERT_OK(s);
+  delete cfh;
+
+  CloseDb();
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, options);
+  std::vector<ColumnFamilyHandle*> handles;
+
+  {
+    ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+    // Write one key to test_cf.
+    ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
+
+    // Write to default_cf and flush this cf several times to advance wal
+    // number.
+    for (int i = 0; i < 2; ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
+                         "value" + std::to_string(i)));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+
+    ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare"));
+    for (auto* h : handles) {
+      delete h;
+    }
+    handles.clear();
+    CloseDb();
+  }
+
+  // Corrupt second last un-syned wal file to emulate power reset which
+  // caused the DB to lose the un-synced WAL.
+  {
+    std::vector<uint64_t> file_nums;
+    GetSortedWalFiles(file_nums);
+    size_t size = file_nums.size();
+    uint64_t log_num = file_nums[size - 1];
+    CorruptFileWithTruncation(FileType::kWalFile, log_num,
+                              /*bytes_to_truncate=*/8);
+  }
+
+  // Fault is injected to fail the recovery.
+  {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
+          auto* tmp_s = reinterpret_cast<Status*>(arg);
+          assert(tmp_s);
+          *tmp_s = Status::IOError("Injected");
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    handles.clear();
+    options.avoid_flush_during_recovery = true;
+    s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
+    ASSERT_TRUE(s.IsIOError());
+    ASSERT_EQ("IO error: Injected", s.ToString());
+    for (auto* h : handles) {
+      delete h;
+    }
+    CloseDb();
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+
+  // Reopen db again
+  {
+    options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
+    ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+    // Verify that data is not lost.
+    {
+      std::string v;
+      ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
+      ASSERT_EQ("dontcare", v);
+
+      for (int i = 0; i < 2; ++i) {
+        v.clear();
+        ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(i), &v));
+        ASSERT_EQ("value" + std::to_string(i), v);
+      }
+
+      // Since it's corrupting last wal after Flush, below key is not found.
+      v.clear();
+      ASSERT_EQ(db_->Get(ReadOptions(), handles[1], "dontcare", &v),
+                Status::NotFound());
+    }
+
+    for (auto* h : handles) {
+      delete h;
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/cuckoo_table_db_test.cc b/src/rocksdb/db/cuckoo_table_db_test.cc
new file mode 100644
index 000000000..868b798ea
--- /dev/null
+++ b/src/rocksdb/db/cuckoo_table_db_test.cc
@@ -0,0 +1,361 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableDBTest : public testing::Test {
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+ public:
+  CuckooTableDBTest() : env_(Env::Default()) {
+    dbname_ = test::PerThreadDBPath("cuckoo_table_db_test");
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~CuckooTableDBTest() override {
+    delete db_;
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  Options CurrentOptions() {
+    Options options;
+    options.table_factory.reset(NewCuckooTableFactory());
+    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+    options.allow_mmap_reads = true;
+    options.create_if_missing = true;
+    options.allow_concurrent_memtable_write = false;
+    return options;
+  }
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+  // The following util methods are copied from plain_table_db_test.
+  void Reopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    ASSERT_OK(DB::Open(opts, dbname_, &db_));
+  }
+
+  void DestroyAndReopen(Options* options) {
+    assert(options);
+    ASSERT_OK(db_->Close());
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+    Reopen(options);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+  std::string Get(const std::string& k) {
+    ReadOptions options;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(level), &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    size_t last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+};
+
+TEST_F(CuckooTableDBTest, Flush) {
+  // Try with empty DB first.
+  ASSERT_TRUE(dbfull() != nullptr);
+  ASSERT_EQ("NOT_FOUND", Get("key2"));
+
+  // Add some values to db.
+  Options options = CurrentOptions();
+  Reopen(&options);
+
+  ASSERT_OK(Put("key1", "v1"));
+  ASSERT_OK(Put("key2", "v2"));
+  ASSERT_OK(Put("key3", "v3"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  TablePropertiesCollection ptc;
+  ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  VerifySstUniqueIds(ptc);
+  ASSERT_EQ(1U, ptc.size());
+  ASSERT_EQ(3U, ptc.begin()->second->num_entries);
+  ASSERT_EQ("1", FilesPerLevel());
+
+  ASSERT_EQ("v1", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("NOT_FOUND", Get("key4"));
+
+  // Now add more keys and flush.
+  ASSERT_OK(Put("key4", "v4"));
+  ASSERT_OK(Put("key5", "v5"));
+  ASSERT_OK(Put("key6", "v6"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  VerifySstUniqueIds(ptc);
+  ASSERT_EQ(2U, ptc.size());
+  auto row = ptc.begin();
+  ASSERT_EQ(3U, row->second->num_entries);
+  ASSERT_EQ(3U, (++row)->second->num_entries);
+  ASSERT_EQ("2", FilesPerLevel());
+  ASSERT_EQ("v1", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("v4", Get("key4"));
+  ASSERT_EQ("v5", Get("key5"));
+  ASSERT_EQ("v6", Get("key6"));
+
+  ASSERT_OK(Delete("key6"));
+  ASSERT_OK(Delete("key5"));
+  ASSERT_OK(Delete("key4"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  VerifySstUniqueIds(ptc);
+  ASSERT_EQ(3U, ptc.size());
+  row = ptc.begin();
+  ASSERT_EQ(3U, row->second->num_entries);
+  ASSERT_EQ(3U, (++row)->second->num_entries);
+  ASSERT_EQ(3U, (++row)->second->num_entries);
+  ASSERT_EQ("3", FilesPerLevel());
+  ASSERT_EQ("v1", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("NOT_FOUND", Get("key4"));
+  ASSERT_EQ("NOT_FOUND", Get("key5"));
+  ASSERT_EQ("NOT_FOUND", Get("key6"));
+}
+
+TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
+  Options options = CurrentOptions();
+  Reopen(&options);
+  ASSERT_OK(Put("key1", "v1"));
+  ASSERT_OK(Put("key2", "v2"));
+  ASSERT_OK(Put("key1", "v3"));  // Duplicate
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  TablePropertiesCollection ptc;
+  ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  VerifySstUniqueIds(ptc);
+  ASSERT_EQ(1U, ptc.size());
+  ASSERT_EQ(2U, ptc.begin()->second->num_entries);
+  ASSERT_EQ("1", FilesPerLevel());
+  ASSERT_EQ("v3", Get("key1"));
+  ASSERT_EQ("v2", Get("key2"));
+}
+
+namespace {
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key_______%06d", i);
+  return std::string(buf);
+}
+static std::string Uint64Key(uint64_t i) {
+  std::string str;
+  str.resize(8);
+  memcpy(&str[0], static_cast<void*>(&i), 8);
+  return str;
+}
+}  // namespace.
+
+TEST_F(CuckooTableDBTest, Uint64Comparator) {
+  Options options = CurrentOptions();
+  options.comparator = test::Uint64Comparator();
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put(Uint64Key(1), "v1"));
+  ASSERT_OK(Put(Uint64Key(2), "v2"));
+  ASSERT_OK(Put(Uint64Key(3), "v3"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  ASSERT_EQ("v1", Get(Uint64Key(1)));
+  ASSERT_EQ("v2", Get(Uint64Key(2)));
+  ASSERT_EQ("v3", Get(Uint64Key(3)));
+  ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4)));
+
+  // Add more keys.
+  ASSERT_OK(Delete(Uint64Key(2)));  // Delete.
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  ASSERT_OK(Put(Uint64Key(3), "v0"));  // Update.
+  ASSERT_OK(Put(Uint64Key(4), "v4"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  ASSERT_EQ("v1", Get(Uint64Key(1)));
+  ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2)));
+  ASSERT_EQ("v0", Get(Uint64Key(3)));
+  ASSERT_EQ("v4", Get(Uint64Key(4)));
+}
+
+TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) {
+  // Create a big L0 file and check it compacts into multiple files in L1.
+  Options options = CurrentOptions();
+  options.write_buffer_size = 270 << 10;
+  // Two SST files should be created, each containing 14 keys.
+  // Number of buckets will be 16. Total size ~156 KB.
+  options.target_file_size_base = 160 << 10;
+  Reopen(&options);
+
+  // Write 28 values, each 10016 B ~ 10KB
+  for (int idx = 0; idx < 28; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ("1", FilesPerLevel());
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow trivial move */));
+  ASSERT_EQ("0,2", FilesPerLevel());
+  for (int idx = 0; idx < 28; ++idx) {
+    ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+  }
+}
+
+TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
+  // Insert same key twice so that they go to different SST files. Then wait for
+  // compaction and check if the latest value is stored and old value removed.
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 2;
+  Reopen(&options);
+
+  // Write 11 values, each 10016 B
+  for (int idx = 0; idx < 11; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a')));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ("1", FilesPerLevel());
+
+  // Generate one more file in level-0, and should trigger level-0 compaction
+  for (int idx = 0; idx < 11; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+
+  ASSERT_EQ("0,1", FilesPerLevel());
+  for (int idx = 0; idx < 11; ++idx) {
+    ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+  }
+}
+
+TEST_F(CuckooTableDBTest, AdaptiveTable) {
+  Options options = CurrentOptions();
+
+  // Ensure options compatible with PlainTable
+  options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+
+  // Write some keys using cuckoo table.
+  options.table_factory.reset(NewCuckooTableFactory());
+  Reopen(&options);
+
+  ASSERT_OK(Put("key1", "v1"));
+  ASSERT_OK(Put("key2", "v2"));
+  ASSERT_OK(Put("key3", "v3"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  // Write some keys using plain table.
+  std::shared_ptr<TableFactory> block_based_factory(
+      NewBlockBasedTableFactory());
+  std::shared_ptr<TableFactory> plain_table_factory(NewPlainTableFactory());
+  std::shared_ptr<TableFactory> cuckoo_table_factory(NewCuckooTableFactory());
+  options.create_if_missing = false;
+  options.table_factory.reset(
+      NewAdaptiveTableFactory(plain_table_factory, block_based_factory,
+                              plain_table_factory, cuckoo_table_factory));
+  Reopen(&options);
+  ASSERT_OK(Put("key4", "v4"));
+  ASSERT_OK(Put("key1", "v5"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  // Write some keys using block based table.
+  options.table_factory.reset(
+      NewAdaptiveTableFactory(block_based_factory, block_based_factory,
+                              plain_table_factory, cuckoo_table_factory));
+  Reopen(&options);
+  ASSERT_OK(Put("key5", "v6"));
+  ASSERT_OK(Put("key2", "v7"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  ASSERT_EQ("v5", Get("key1"));
+  ASSERT_EQ("v7", Get("key2"));
+  ASSERT_EQ("v3", Get("key3"));
+  ASSERT_EQ("v4", Get("key4"));
+  ASSERT_EQ("v6", Get("key5"));
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  if (ROCKSDB_NAMESPACE::port::kLittleEndian) {
+    ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+  } else {
+    fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n");
+    return 0;
+  }
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_basic_test.cc b/src/rocksdb/db/db_basic_test.cc
new file mode 100644
index 000000000..a28ac2b88
--- /dev/null
+++ b/src/rocksdb/db/db_basic_test.cc
@@ -0,0 +1,4643 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "util/file_checksum_helper.h"
+#include "util/random.h"
+#include "utilities/counted_fs.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBasicTest : public DBTestBase {
+ public:
+  DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {}
+};
+
+TEST_F(DBBasicTest, OpenWhenOpen) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  DB* db2 = nullptr;
+  Status s = DB::Open(options, dbname_, &db2);
+  ASSERT_NOK(s) << [db2]() {
+    delete db2;
+    return "db2 open: ok";
+  }();
+  ASSERT_EQ(Status::Code::kIOError, s.code());
+  ASSERT_EQ(Status::SubCode::kNone, s.subcode());
+  ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr);
+
+  delete db2;
+}
+
+TEST_F(DBBasicTest, EnableDirectIOWithZeroBuf) {
+  if (!IsDirectIOSupported()) {
+    ROCKSDB_GTEST_BYPASS("Direct IO not supported");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.use_direct_io_for_flush_and_compaction = true;
+  options.writable_file_max_buffer_size = 0;
+  ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+
+  options.writable_file_max_buffer_size = 1024;
+  Reopen(options);
+  const std::unordered_map<std::string, std::string> new_db_opts = {
+      {"writable_file_max_buffer_size", "0"}};
+  ASSERT_TRUE(db_->SetDBOptions(new_db_opts).IsInvalidArgument());
+}
+
+TEST_F(DBBasicTest, UniqueSession) {
+  Options options = CurrentOptions();
+  std::string sid1, sid2, sid3, sid4;
+
+  ASSERT_OK(db_->GetDbSessionId(sid1));
+  Reopen(options);
+  ASSERT_OK(db_->GetDbSessionId(sid2));
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(db_->GetDbSessionId(sid4));
+  Reopen(options);
+  ASSERT_OK(db_->GetDbSessionId(sid3));
+
+  ASSERT_NE(sid1, sid2);
+  ASSERT_NE(sid1, sid3);
+  ASSERT_NE(sid2, sid3);
+
+  ASSERT_EQ(sid2, sid4);
+
+  // Expected compact format for session ids (see notes in implementation)
+  TestRegex expected("[0-9A-Z]{20}");
+  EXPECT_MATCHES_REGEX(sid1, expected);
+  EXPECT_MATCHES_REGEX(sid2, expected);
+  EXPECT_MATCHES_REGEX(sid3, expected);
+
+#ifndef ROCKSDB_LITE
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(db_->GetDbSessionId(sid1));
+  // Test uniqueness between readonly open (sid1) and regular open (sid3)
+  ASSERT_NE(sid1, sid3);
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(db_->GetDbSessionId(sid2));
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_OK(db_->GetDbSessionId(sid3));
+
+  ASSERT_NE(sid1, sid2);
+
+  ASSERT_EQ(sid2, sid3);
+#endif  // ROCKSDB_LITE
+
+  CreateAndReopenWithCF({"goku"}, options);
+  ASSERT_OK(db_->GetDbSessionId(sid1));
+  ASSERT_OK(Put("bar", "e1"));
+  ASSERT_OK(db_->GetDbSessionId(sid2));
+  ASSERT_EQ("e1", Get("bar"));
+  ASSERT_OK(db_->GetDbSessionId(sid3));
+  ReopenWithColumnFamilies({"default", "goku"}, options);
+  ASSERT_OK(db_->GetDbSessionId(sid4));
+
+  ASSERT_EQ(sid1, sid2);
+  ASSERT_EQ(sid2, sid3);
+
+  ASSERT_NE(sid1, sid4);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, ReadOnlyDB) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  Close();
+
+  auto verify_one_iter = [&](Iterator* iter) {
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      ++count;
+    }
+    // Always expect two keys: "foo" and "bar"
+    ASSERT_EQ(count, 2);
+  };
+
+  auto verify_all_iters = [&]() {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    verify_one_iter(iter);
+    delete iter;
+
+    std::vector<Iterator*> iters;
+    ASSERT_OK(db_->NewIterators(ReadOptions(),
+                                {dbfull()->DefaultColumnFamily()}, &iters));
+    ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+    verify_one_iter(iters[0]);
+    delete iters[0];
+  };
+
+  auto options = CurrentOptions();
+  assert(options.env == env_);
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  verify_all_iters();
+  Close();
+
+  // Reopen and flush memtable.
+  Reopen(options);
+  ASSERT_OK(Flush());
+  Close();
+  // Now check keys in read only mode.
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  verify_all_iters();
+  ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+}
+
+// TODO akanksha: Update the test to check that combination
+// does not actually write to FS (use open read-only with
+// CompositeEnvWrapper+ReadOnlyFileSystem).
+TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  Close();
+
+  auto options = CurrentOptions();
+  options.write_dbid_to_manifest = true;
+  assert(options.env == env_);
+  ASSERT_OK(ReadOnlyReopen(options));
+  std::string db_id1;
+  ASSERT_OK(db_->GetDbIdentity(db_id1));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
+  ASSERT_EQ(count, 2);
+  delete iter;
+  Close();
+
+  // Reopen and flush memtable.
+  Reopen(options);
+  ASSERT_OK(Flush());
+  Close();
+  // Now check keys in read only mode.
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+  std::string db_id2;
+  ASSERT_OK(db_->GetDbIdentity(db_id2));
+  ASSERT_EQ(db_id1, db_id2);
+}
+
+TEST_F(DBBasicTest, CompactedDB) {
+  const uint64_t kFileSize = 1 << 20;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = kFileSize;
+  options.target_file_size_base = kFileSize;
+  options.max_bytes_for_level_base = 1 << 30;
+  options.compression = kNoCompression;
+  Reopen(options);
+  // 1 L0 file, use CompactedDB if max_open_files = -1
+  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
+  ASSERT_OK(Flush());
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Status s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported in compacted db mode.");
+  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+  Close();
+  Reopen(options);
+  // Add more L0 files
+  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
+  ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("something_not_flushed", "x"));
+  Close();
+
+  ASSERT_OK(ReadOnlyReopen(options));
+  // Fallback to read-only DB
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+
+  // TODO: validate that other write ops return NotImplemented
+  // (DBImplReadOnly is missing some overrides)
+
+  // Ensure no deadlock on flush triggered by another API function
+  // (Old deadlock bug depends on something_not_flushed above.)
+  std::vector<std::string> files;
+  uint64_t manifest_file_size;
+  ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
+  LiveFilesStorageInfoOptions lfsi_opts;
+  lfsi_opts.wal_size_for_flush = 0;  // always
+  std::vector<LiveFileStorageInfo> files2;
+  ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
+
+  Close();
+
+  // Full compaction
+  Reopen(options);
+  // Add more keys
+  ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+  ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
+  ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
+  ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+  Close();
+
+  // CompactedDB
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported in compacted db mode.");
+  ASSERT_EQ("NOT_FOUND", Get("abc"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
+  ASSERT_EQ("NOT_FOUND", Get("ccc"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
+  ASSERT_EQ("NOT_FOUND", Get("ggg"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
+  ASSERT_EQ("NOT_FOUND", Get("kkk"));
+
+  // TODO: validate that other write ops return NotImplemented
+  // (CompactedDB is missing some overrides)
+
+  // Ensure no deadlock on flush triggered by another API function
+  ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
+  ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
+
+  // MultiGet
+  std::vector<std::string> values;
+  std::vector<Status> status_list = dbfull()->MultiGet(
+      ReadOptions(),
+      std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
+                          Slice("ggg"), Slice("iii"), Slice("kkk")}),
+      &values);
+  ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
+  ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
+  ASSERT_OK(status_list[0]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
+  ASSERT_TRUE(status_list[1].IsNotFound());
+  ASSERT_OK(status_list[2]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
+  ASSERT_TRUE(status_list[3].IsNotFound());
+  ASSERT_OK(status_list[4]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
+  ASSERT_TRUE(status_list[5].IsNotFound());
+
+  Reopen(options);
+  // Add a key
+  ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+}
+
+TEST_F(DBBasicTest, LevelLimitReopen) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const std::string value(1024 * 1024, ' ');
+  int i = 0;
+  while (NumTableFilesAtLevel(2, 1) == 0) {
+    ASSERT_OK(Put(1, Key(i++), value));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  options.num_levels = 1;
+  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ(s.IsInvalidArgument(), true);
+  ASSERT_EQ(s.ToString(),
+            "Invalid argument: db has more levels than options.num_levels");
+
+  options.num_levels = 10;
+  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, PutDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+TEST_F(DBBasicTest, PutSingleDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo2", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo2"));
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Ski FIFO and universal compaction because they do not apply to the test
+    // case. Skip MergePut because single delete does not get removed when it
+    // encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, EmptyFlush) {
+  // It is possible to produce empty flushes when using single deletes. Tests
+  // whether empty flushes cause issues.
+  do {
+    Random rnd(301);
+
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_OK(Put(1, "a", Slice()));
+    ASSERT_OK(SingleDelete(1, "a"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+    // Skip FIFO and  universal compaction as they do not apply to the test
+    // case. Skip MergePut because merges cannot be combined with single
+    // deletions.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, GetFromVersions) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+  } while (ChangeOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, GetSnapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    // Try with both a short key and a long key
+    for (int i = 0; i < 2; i++) {
+      std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+      ASSERT_OK(Put(1, key, "v1"));
+      const Snapshot* s1 = db_->GetSnapshot();
+      ASSERT_OK(Put(1, key, "v2"));
+      ASSERT_EQ("v2", Get(1, key));
+      ASSERT_EQ("v1", Get(1, key, s1));
+      ASSERT_OK(Flush(1));
+      ASSERT_EQ("v2", Get(1, key));
+      ASSERT_EQ("v1", Get(1, key, s1));
+      db_->ReleaseSnapshot(s1);
+    }
+  } while (ChangeOptions());
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, CheckLock) {
+  do {
+    DB* localdb = nullptr;
+    Options options = CurrentOptions();
+    ASSERT_OK(TryReopen(options));
+
+    // second open should fail
+    Status s = DB::Open(options, dbname_, &localdb);
+    ASSERT_NOK(s) << [localdb]() {
+      delete localdb;
+      return "localdb open: ok";
+    }();
+#ifdef OS_LINUX
+    ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos);
+#endif  // OS_LINUX
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushMultipleMemtable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.max_write_buffer_size_to_maintain = -1;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+    ASSERT_OK(Flush(1));
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushEmptyColumnFamily) {
+  // Block flush thread and disable compaction thread
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
+
+  Options options = CurrentOptions();
+  // disable compaction
+  options.disable_auto_compactions = true;
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.max_write_buffer_number = 2;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain =
+      static_cast<int64_t>(options.write_buffer_size);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Compaction can still go through even if no thread can flush the
+  // mem table.
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
+
+  // Insert can go through
+  ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1"));
+  ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+  ASSERT_EQ("v1", Get(0, "foo"));
+  ASSERT_EQ("v1", Get(1, "bar"));
+
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+
+  // Flush can still go through.
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBBasicTest, Flush) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    SetPerfLevel(kEnableTime);
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    // this will now also flush the last 2 writes
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    get_perf_context()->Reset();
+    Get(1, "foo");
+    ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+    ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes);
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+    ASSERT_OK(Flush(1));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v2", Get(1, "bar"));
+    get_perf_context()->Reset();
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+    ASSERT_OK(Flush(1));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // 'foo' should be there because its put
+    // has WAL enabled.
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_EQ("v3", Get(1, "bar"));
+
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, ManifestRollOver) {
+  do {
+    Options options;
+    options.max_manifest_file_size = 10;  // 10 bytes
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    {
+      ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
+      ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
+      ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
+      uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
+      uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_GT(manifest_after_flush, manifest_before_flush);
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+      // check if a new manifest file got inserted or not.
+      ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+      ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+      ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts) {
+  constexpr size_t kMinIdSize = 10;
+  do {
+    for (bool with_manifest : {false, true}) {
+      std::string idfilename = IdentityFileName(dbname_);
+      std::string id1, tmp;
+      ASSERT_OK(db_->GetDbIdentity(id1));
+      ASSERT_GE(id1.size(), kMinIdSize);
+
+      Options options = CurrentOptions();
+      options.write_dbid_to_manifest = with_manifest;
+      Reopen(options);
+      std::string id2;
+      ASSERT_OK(db_->GetDbIdentity(id2));
+      // id2 should match id1 because identity was not regenerated
+      ASSERT_EQ(id1, id2);
+      ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+      ASSERT_EQ(tmp, id2);
+
+      // Recover from deleted/missing IDENTITY
+      ASSERT_OK(env_->DeleteFile(idfilename));
+      Reopen(options);
+      std::string id3;
+      ASSERT_OK(db_->GetDbIdentity(id3));
+      if (with_manifest) {
+        // id3 should match id1 because identity was restored from manifest
+        ASSERT_EQ(id1, id3);
+      } else {
+        // id3 should NOT match id1 because identity was regenerated
+        ASSERT_NE(id1, id3);
+        ASSERT_GE(id3.size(), kMinIdSize);
+      }
+      ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+      ASSERT_EQ(tmp, id3);
+
+      // Recover from truncated IDENTITY
+      {
+        std::unique_ptr<WritableFile> w;
+        ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
+        ASSERT_OK(w->Close());
+      }
+      Reopen(options);
+      std::string id4;
+      ASSERT_OK(db_->GetDbIdentity(id4));
+      if (with_manifest) {
+        // id4 should match id1 because identity was restored from manifest
+        ASSERT_EQ(id1, id4);
+      } else {
+        // id4 should NOT match id1 because identity was regenerated
+        ASSERT_NE(id1, id4);
+        ASSERT_GE(id4.size(), kMinIdSize);
+      }
+      ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+      ASSERT_EQ(tmp, id4);
+
+      // Recover from overwritten IDENTITY
+      std::string silly_id = "asdf123456789";
+      {
+        std::unique_ptr<WritableFile> w;
+        ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
+        ASSERT_OK(w->Append(silly_id));
+        ASSERT_OK(w->Close());
+      }
+      Reopen(options);
+      std::string id5;
+      ASSERT_OK(db_->GetDbIdentity(id5));
+      if (with_manifest) {
+        // id4 should match id1 because identity was restored from manifest
+        ASSERT_EQ(id1, id5);
+      } else {
+        ASSERT_EQ(id5, silly_id);
+      }
+      ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+      ASSERT_EQ(tmp, id5);
+    }
+  } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, Snapshot) {
+  env_->SetMockSleep();
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    ASSERT_OK(Put(0, "foo", "0v1"));
+    ASSERT_OK(Put(1, "foo", "1v1"));
+
+    const Snapshot* s1 = db_->GetSnapshot();
+    ASSERT_EQ(1U, GetNumSnapshots());
+    uint64_t time_snap1 = GetTimeOldestSnapshots();
+    ASSERT_GT(time_snap1, 0U);
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_OK(Put(0, "foo", "0v2"));
+    ASSERT_OK(Put(1, "foo", "1v2"));
+
+    env_->MockSleepForSeconds(1);
+
+    const Snapshot* s2 = db_->GetSnapshot();
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_OK(Put(0, "foo", "0v3"));
+    ASSERT_OK(Put(1, "foo", "1v3"));
+
+    {
+      ManagedSnapshot s3(db_);
+      ASSERT_EQ(3U, GetNumSnapshots());
+      ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+      ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+
+      ASSERT_OK(Put(0, "foo", "0v4"));
+      ASSERT_OK(Put(1, "foo", "1v4"));
+      ASSERT_EQ("0v1", Get(0, "foo", s1));
+      ASSERT_EQ("1v1", Get(1, "foo", s1));
+      ASSERT_EQ("0v2", Get(0, "foo", s2));
+      ASSERT_EQ("1v2", Get(1, "foo", s2));
+      ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+      ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+      ASSERT_EQ("0v4", Get(0, "foo"));
+      ASSERT_EQ("1v4", Get(1, "foo"));
+    }
+
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_EQ("0v1", Get(0, "foo", s1));
+    ASSERT_EQ("1v1", Get(1, "foo", s1));
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    db_->ReleaseSnapshot(s1);
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+    ASSERT_EQ(1U, GetNumSnapshots());
+    ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+
+    db_->ReleaseSnapshot(s2);
+    ASSERT_EQ(0U, GetNumSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+#endif  // ROCKSDB_LITE
+
+class DBBasicMultiConfigs : public DBBasicTest,
+                            public ::testing::WithParamInterface<int> {
+ public:
+  DBBasicMultiConfigs() { option_config_ = GetParam(); }
+
+  static std::vector<int> GenerateOptionConfigs() {
+    std::vector<int> option_configs;
+    for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+      if (!ShouldSkipOptions(option_config, kSkipFIFOCompaction)) {
+        option_configs.push_back(option_config);
+      }
+    }
+    return option_configs;
+  }
+};
+
+TEST_P(DBBasicMultiConfigs, CompactBetweenSnapshots) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  FillLevels("a", "z", 1);
+
+  ASSERT_OK(Put(1, "foo", "first"));
+  const Snapshot* snapshot1 = db_->GetSnapshot();
+  ASSERT_OK(Put(1, "foo", "second"));
+  ASSERT_OK(Put(1, "foo", "third"));
+  ASSERT_OK(Put(1, "foo", "fourth"));
+  const Snapshot* snapshot2 = db_->GetSnapshot();
+  ASSERT_OK(Put(1, "foo", "fifth"));
+  ASSERT_OK(Put(1, "foo", "sixth"));
+
+  // All entries (including duplicates) exist
+  // before any compaction or flush is triggered.
+  ASSERT_EQ(AllEntriesFor("foo", 1),
+            "[ sixth, fifth, fourth, third, second, first ]");
+  ASSERT_EQ("sixth", Get(1, "foo"));
+  ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+  ASSERT_EQ("first", Get(1, "foo", snapshot1));
+
+  // After a flush, "second", "third" and "fifth" should
+  // be removed
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
+
+  // after we release the snapshot1, only two values left
+  db_->ReleaseSnapshot(snapshot1);
+  FillLevels("a", "z", 1);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                                   nullptr));
+
+  // We have only one valid snapshot snapshot2. Since snapshot1 is
+  // not valid anymore, "first" should be removed by a compaction.
+  ASSERT_EQ("sixth", Get(1, "foo"));
+  ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
+
+  // after we release the snapshot2, only one value should be left
+  db_->ReleaseSnapshot(snapshot2);
+  FillLevels("a", "z", 1);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                                   nullptr));
+  ASSERT_EQ("sixth", Get(1, "foo"));
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DBBasicMultiConfigs, DBBasicMultiConfigs,
+    ::testing::ValuesIn(DBBasicMultiConfigs::GenerateOptionConfigs()));
+
+TEST_F(DBBasicTest, DBOpen_Options) {
+  Options options = CurrentOptions();
+  Close();
+  Destroy(options);
+
+  // Does not exist, and create_if_missing == false: error
+  DB* db = nullptr;
+  options.create_if_missing = false;
+  Status s = DB::Open(options, dbname_, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does not exist, and create_if_missing == true: OK
+  options.create_if_missing = true;
+  s = DB::Open(options, dbname_, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+
+  // Does exist, and error_if_exists == true: error
+  options.create_if_missing = false;
+  options.error_if_exists = true;
+  s = DB::Open(options, dbname_, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does exist, and error_if_exists == false: OK
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  s = DB::Open(options, dbname_, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+}
+
+TEST_F(DBBasicTest, CompactOnFlush) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    Options options = CurrentOptions(options_override);
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
+
+    // Write two new keys
+    ASSERT_OK(Put(1, "a", "begin"));
+    ASSERT_OK(Put(1, "z", "end"));
+    ASSERT_OK(Flush(1));
+
+    // Case1: Delete followed by a put
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+
+    // After the current memtable is flushed, the DEL should
+    // have been removed
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+
+    // Case 2: Delete followed by another delete
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 3: Put followed by a delete
+    ASSERT_OK(Put(1, "foo", "v3"));
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 4: Put followed by another Put
+    ASSERT_OK(Put(1, "foo", "v4"));
+    ASSERT_OK(Put(1, "foo", "v5"));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+
+    // clear database
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 5: Put followed by snapshot followed by another Put
+    // Both puts should remain.
+    ASSERT_OK(Put(1, "foo", "v6"));
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ASSERT_OK(Put(1, "foo", "v7"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+    db_->ReleaseSnapshot(snapshot);
+
+    // clear database
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+    // Case 5: snapshot followed by a put followed by another Put
+    // Only the last put should remain.
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    ASSERT_OK(Put(1, "foo", "v8"));
+    ASSERT_OK(Put(1, "foo", "v9"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+    db_->ReleaseSnapshot(snapshot1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushOneColumnFamily) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+                         "alyosha", "popovich"},
+                        options);
+
+  ASSERT_OK(Put(0, "Default", "Default"));
+  ASSERT_OK(Put(1, "pikachu", "pikachu"));
+  ASSERT_OK(Put(2, "ilya", "ilya"));
+  ASSERT_OK(Put(3, "muromec", "muromec"));
+  ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
+  ASSERT_OK(Put(5, "nikitich", "nikitich"));
+  ASSERT_OK(Put(6, "alyosha", "alyosha"));
+  ASSERT_OK(Put(7, "popovich", "popovich"));
+
+  for (int i = 0; i < 8; ++i) {
+    ASSERT_OK(Flush(i));
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), i + 1U);
+  }
+}
+
+TEST_F(DBBasicTest, MultiGetSimple) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    SetPerfLevel(kEnableCount);
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
+
+    std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+
+    std::vector<std::string> values(20, "Temporary data to be overwritten");
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+    get_perf_context()->Reset();
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(values[0], "v1");
+    ASSERT_EQ(values[1], "v2");
+    ASSERT_EQ(values[2], "v3");
+    ASSERT_EQ(values[4], "v5");
+    // four kv pairs * two bytes per value
+    ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_OK(s[0]);
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_TRUE(s[3].IsNotFound());
+    ASSERT_OK(s[4]);
+    ASSERT_TRUE(s[5].IsNotFound());
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetEmpty) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    // Empty Key Set
+    std::vector<Slice> keys;
+    std::vector<std::string> values;
+    std::vector<ColumnFamilyHandle*> cfs;
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Empty Key Set
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
+
+    // Empty Database, Search for Keys
+    keys.resize(2);
+    keys[0] = "a";
+    keys[1] = "b";
+    cfs.push_back(handles_[0]);
+    cfs.push_back(handles_[1]);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(static_cast<int>(s.size()), 2);
+    ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+  } while (ChangeCompactOptions());
+}
+
+class DBBlockChecksumTest : public DBBasicTest,
+                            public testing::WithParamInterface<uint32_t> {};
+
+INSTANTIATE_TEST_CASE_P(FormatVersions, DBBlockChecksumTest,
+                        testing::ValuesIn(test::kFooterFormatVersionsToTest));
+
+TEST_P(DBBlockChecksumTest, BlockChecksumTest) {
+  BlockBasedTableOptions table_options;
+  table_options.format_version = GetParam();
+  Options options = CurrentOptions();
+  const int kNumPerFile = 2;
+
+  const auto algs = GetSupportedChecksums();
+  const int algs_size = static_cast<int>(algs.size());
+
+  // generate one table with each type of checksum
+  for (int i = 0; i < algs_size; ++i) {
+    table_options.checksum = algs[i];
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+    for (int j = 0; j < kNumPerFile; ++j) {
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // with each valid checksum type setting...
+  for (int i = 0; i < algs_size; ++i) {
+    table_options.checksum = algs[i];
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+    // verify every type of checksum (should be regardless of that setting)
+    for (int j = 0; j < algs_size * kNumPerFile; ++j) {
+      ASSERT_EQ(Key(j), Get(Key(j)));
+    }
+  }
+
+  // Now test invalid checksum type
+  table_options.checksum = static_cast<ChecksumType>(123);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+}
+
+// On Windows you can have either memory mapped file or a file
+// with unbuffered access. So this asserts and does not make
+// sense to run
+#ifndef OS_WIN
+TEST_F(DBBasicTest, MmapAndBufferOptions) {
+  if (!IsMemoryMappedAccessSupported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+
+  options.use_direct_reads = true;
+  options.allow_mmap_reads = true;
+  ASSERT_NOK(TryReopen(options));
+
+  // All other combinations are acceptable
+  options.use_direct_reads = false;
+  ASSERT_OK(TryReopen(options));
+
+  if (IsDirectIOSupported()) {
+    options.use_direct_reads = true;
+    options.allow_mmap_reads = false;
+    ASSERT_OK(TryReopen(options));
+  }
+
+  options.use_direct_reads = false;
+  ASSERT_OK(TryReopen(options));
+}
+#endif
+
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {}
+  static const char* kClassName() { return "TestEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  class TestLogger : public Logger {
+   public:
+    using Logger::Logv;
+    explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+    ~TestLogger() override {
+      if (!closed_) {
+        CloseHelper().PermitUncheckedError();
+      }
+    }
+    void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+   protected:
+    Status CloseImpl() override { return CloseHelper(); }
+
+   private:
+    Status CloseHelper() {
+      env->CloseCountInc();
+      ;
+      return Status::IOError();
+    }
+    TestEnv* env;
+  };
+
+  void CloseCountInc() { close_count++; }
+
+  int GetCloseCount() { return close_count; }
+
+  Status NewLogger(const std::string& /*fname*/,
+                   std::shared_ptr<Logger>* result) override {
+    result->reset(new TestLogger(this));
+    return Status::OK();
+  }
+
+ private:
+  int close_count;
+};
+
+TEST_F(DBBasicTest, DBClose) {
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("db_close_test");
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  DB* db = nullptr;
+  TestEnv* env = new TestEnv(env_);
+  std::unique_ptr<TestEnv> local_env_guard(env);
+  options.create_if_missing = true;
+  options.env = env;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  s = db->Close();
+  ASSERT_EQ(env->GetCloseCount(), 1);
+  ASSERT_EQ(s, Status::IOError());
+
+  delete db;
+  ASSERT_EQ(env->GetCloseCount(), 1);
+
+  // Do not call DB::Close() and ensure our logger Close() still gets called
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  delete db;
+  ASSERT_EQ(env->GetCloseCount(), 2);
+
+  // Provide our own logger and ensure DB::Close() does not close it
+  options.info_log.reset(new TestEnv::TestLogger(env));
+  options.create_if_missing = false;
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  s = db->Close();
+  ASSERT_EQ(s, Status::OK());
+  delete db;
+  ASSERT_EQ(env->GetCloseCount(), 2);
+  options.info_log.reset();
+  ASSERT_EQ(env->GetCloseCount(), 3);
+}
+
+TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) {
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("db_close_all_dir_fds_test");
+  // Configure a specific WAL directory
+  options.wal_dir = dbname + "_wal_dir";
+  // Configure 3 different data directories
+  options.db_paths.emplace_back(dbname + "_1", 512 * 1024);
+  options.db_paths.emplace_back(dbname + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname + "_3", 1024 * 1024 * 1024);
+
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  DB* db = nullptr;
+  std::unique_ptr<Env> env = NewCompositeEnv(
+      std::make_shared<CountedFileSystem>(FileSystem::Default()));
+  options.create_if_missing = true;
+  options.env = env.get();
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  // Explicitly close the database to ensure the open and close counter for
+  // directories are equivalent
+  s = db->Close();
+  auto* counted_fs =
+      options.env->GetFileSystem()->CheckedCast<CountedFileSystem>();
+  ASSERT_TRUE(counted_fs != nullptr);
+  ASSERT_EQ(counted_fs->counters()->dir_opens,
+            counted_fs->counters()->dir_closes);
+  ASSERT_OK(s);
+  delete db;
+}
+
+TEST_F(DBBasicTest, DBCloseFlushError) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.manual_wal_flush = true;
+  options.write_buffer_size = 100;
+  options.env = fault_injection_env.get();
+
+  Reopen(options);
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+  ASSERT_OK(Put("key3", "value3"));
+  fault_injection_env->SetFilesystemActive(false);
+  Status s = dbfull()->Close();
+  ASSERT_NE(s, Status::OK());
+  // retry should return the same error
+  s = dbfull()->Close();
+  ASSERT_NE(s, Status::OK());
+  fault_injection_env->SetFilesystemActive(true);
+  // retry close() is no-op even the system is back. Could be improved if
+  // Close() is retry-able: #9029
+  s = dbfull()->Close();
+  ASSERT_NE(s, Status::OK());
+  Destroy(options);
+}
+
+class DBMultiGetTestWithParam
+    : public DBBasicTest,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {};
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+                         "alyosha", "popovich"},
+                        options);
+  // <CF, key, value> tuples
+  std::vector<std::tuple<int, std::string, std::string>> cf_kv_vec;
+  static const int num_keys = 24;
+  cf_kv_vec.reserve(num_keys);
+
+  for (int i = 0; i < num_keys; ++i) {
+    int cf = i / 3;
+    int cf_key = 1 % 3;
+    cf_kv_vec.emplace_back(std::make_tuple(
+        cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key),
+        "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)));
+    ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+                  std::get<2>(cf_kv_vec[i])));
+  }
+
+  int get_sv_count = 0;
+  ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+        if (++get_sv_count == 2) {
+          // After MultiGet refs a couple of CFs, flush all CFs so MultiGet
+          // is forced to repeat the process
+          for (int i = 0; i < num_keys; ++i) {
+            int cf = i / 3;
+            int cf_key = i % 8;
+            if (cf_key == 0) {
+              ASSERT_OK(Flush(cf));
+            }
+            ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+                          std::get<2>(cf_kv_vec[i]) + "_2"));
+          }
+        }
+        if (get_sv_count == 11) {
+          for (int i = 0; i < 8; ++i) {
+            auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                            db->GetColumnFamilyHandle(i))
+                            ->cfd();
+            ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<int> cfs;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  for (int i = 0; i < num_keys; ++i) {
+    cfs.push_back(std::get<0>(cf_kv_vec[i]));
+    keys.push_back(std::get<1>(cf_kv_vec[i]));
+  }
+
+  values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+                    std::get<1>(GetParam()));
+  ASSERT_EQ(values.size(), num_keys);
+  for (unsigned int j = 0; j < values.size(); ++j) {
+    ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
+  }
+
+  keys.clear();
+  cfs.clear();
+  cfs.push_back(std::get<0>(cf_kv_vec[0]));
+  keys.push_back(std::get<1>(cf_kv_vec[0]));
+  cfs.push_back(std::get<0>(cf_kv_vec[3]));
+  keys.push_back(std::get<1>(cf_kv_vec[3]));
+  cfs.push_back(std::get<0>(cf_kv_vec[4]));
+  keys.push_back(std::get<1>(cf_kv_vec[4]));
+  values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+                    std::get<1>(GetParam()));
+  ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
+  ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
+  ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2");
+
+  keys.clear();
+  cfs.clear();
+  cfs.push_back(std::get<0>(cf_kv_vec[7]));
+  keys.push_back(std::get<1>(cf_kv_vec[7]));
+  cfs.push_back(std::get<0>(cf_kv_vec[6]));
+  keys.push_back(std::get<1>(cf_kv_vec[6]));
+  cfs.push_back(std::get<0>(cf_kv_vec[1]));
+  keys.push_back(std::get<1>(cf_kv_vec[1]));
+  values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+                    std::get<1>(GetParam()));
+  ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2");
+  ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2");
+  ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
+
+  for (int cf = 0; cf < 8; ++cf) {
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(
+            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(cf))
+            ->cfd();
+    ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+    ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
+  }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+                         "alyosha", "popovich"},
+                        options);
+
+  for (int i = 0; i < 8; ++i) {
+    ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+                  "cf" + std::to_string(i) + "_val"));
+  }
+
+  int get_sv_count = 0;
+  int retries = 0;
+  bool last_try = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) {
+        last_try = true;
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+        if (last_try) {
+          return;
+        }
+        if (++get_sv_count == 2) {
+          ++retries;
+          get_sv_count = 0;
+          for (int i = 0; i < 8; ++i) {
+            ASSERT_OK(Flush(i));
+            ASSERT_OK(Put(
+                i, "cf" + std::to_string(i) + "_key",
+                "cf" + std::to_string(i) + "_val" + std::to_string(retries)));
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<int> cfs;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  for (int i = 0; i < 8; ++i) {
+    cfs.push_back(i);
+    keys.push_back("cf" + std::to_string(i) + "_key");
+  }
+
+  values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+                    std::get<1>(GetParam()));
+  ASSERT_TRUE(last_try);
+  ASSERT_EQ(values.size(), 8);
+  for (unsigned int j = 0; j < values.size(); ++j) {
+    ASSERT_EQ(values[j],
+              "cf" + std::to_string(j) + "_val" + std::to_string(retries));
+  }
+  for (int i = 0; i < 8; ++i) {
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(
+            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
+            ->cfd();
+    ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+  }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+                         "alyosha", "popovich"},
+                        options);
+
+  for (int i = 0; i < 8; ++i) {
+    ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+                  "cf" + std::to_string(i) + "_val"));
+  }
+
+  int get_sv_count = 0;
+  ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+        if (++get_sv_count == 2) {
+          for (int i = 0; i < 8; ++i) {
+            ASSERT_OK(Flush(i));
+            ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+                          "cf" + std::to_string(i) + "_val2"));
+          }
+        }
+        if (get_sv_count == 8) {
+          for (int i = 0; i < 8; ++i) {
+            auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                            db->GetColumnFamilyHandle(i))
+                            ->cfd();
+            ASSERT_TRUE(
+                (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) ||
+                (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVObsolete));
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<int> cfs;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  for (int i = 0; i < 8; ++i) {
+    cfs.push_back(i);
+    keys.push_back("cf" + std::to_string(i) + "_key");
+  }
+
+  const Snapshot* snapshot = db_->GetSnapshot();
+  values = MultiGet(cfs, keys, snapshot, std::get<0>(GetParam()),
+                    std::get<1>(GetParam()));
+  db_->ReleaseSnapshot(snapshot);
+  ASSERT_EQ(values.size(), 8);
+  for (unsigned int j = 0; j < values.size(); ++j) {
+    ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val");
+  }
+  for (int i = 0; i < 8; ++i) {
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(
+            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
+            ->cfd();
+    ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+  }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"one", "two"}, options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(2, "baz", "xyz"));
+  ASSERT_OK(Put(1, "abc", "def"));
+
+  // Note: keys for the same CF do not form a consecutive range
+  std::vector<int> cfs{1, 2, 1};
+  std::vector<std::string> keys{"foo", "baz", "abc"};
+  std::vector<std::string> values;
+
+  values = MultiGet(cfs, keys, /* snapshot */ nullptr,
+                    /* batched */ std::get<0>(GetParam()),
+                    /* async */ std::get<1>(GetParam()));
+
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_EQ(values[0], "bar");
+  ASSERT_EQ(values[1], "xyz");
+  ASSERT_EQ(values[2], "def");
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSimpleUnsorted) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+    return;
+  }
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    SetPerfLevel(kEnableCount);
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
+
+    get_perf_context()->Reset();
+
+    std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k2", "k1"});
+    std::vector<PinnableSlice> values(keys.size());
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+    std::vector<Status> s(keys.size());
+
+    ReadOptions ro;
+    ro.async_io = std::get<1>(GetParam());
+    db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+                  s.data(), false);
+
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+    ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v2");
+    ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // four kv pairs * two bytes per value
+    ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_TRUE(s[0].IsNotFound());
+    ASSERT_OK(s[1]);
+    ASSERT_TRUE(s[2].IsNotFound());
+    ASSERT_OK(s[3]);
+    ASSERT_OK(s[4]);
+    ASSERT_OK(s[5]);
+
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSortedMultiFile) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+    return;
+  }
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    SetPerfLevel(kEnableCount);
+    // To expand the power of this test, generate > 1 table file and
+    // mix with memtable
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
+
+    get_perf_context()->Reset();
+
+    std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+    std::vector<PinnableSlice> values(keys.size());
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+    std::vector<Status> s(keys.size());
+
+    ReadOptions ro;
+    ro.async_io = std::get<1>(GetParam());
+    db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+                  s.data(), true);
+
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v2");
+    ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+    ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v5");
+    // four kv pairs * two bytes per value
+    ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_OK(s[0]);
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_TRUE(s[3].IsNotFound());
+    ASSERT_OK(s[4]);
+    ASSERT_TRUE(s[5].IsNotFound());
+
+    SetPerfLevel(kDisable);
+  } while (ChangeOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedDuplicateKeys) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+    return;
+  }
+  Options opts = CurrentOptions();
+  opts.merge_operator = MergeOperators::CreateStringAppendOperator();
+  CreateAndReopenWithCF({"pikachu"}, opts);
+  SetPerfLevel(kEnableCount);
+  // To expand the power of this test, generate > 1 table file and
+  // mix with memtable
+  ASSERT_OK(Merge(1, "k1", "v1"));
+  ASSERT_OK(Merge(1, "k2", "v2"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(2, 1);
+  ASSERT_OK(Merge(1, "k3", "v3"));
+  ASSERT_OK(Merge(1, "k4", "v4"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(2, 1);
+  ASSERT_OK(Merge(1, "k4", "v4_2"));
+  ASSERT_OK(Merge(1, "k6", "v6"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(2, 1);
+  ASSERT_OK(Merge(1, "k7", "v7"));
+  ASSERT_OK(Merge(1, "k8", "v8"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(2, 1);
+
+  get_perf_context()->Reset();
+
+  std::vector<Slice> keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"});
+  std::vector<PinnableSlice> values(keys.size());
+  std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+  std::vector<Status> s(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = std::get<1>(GetParam());
+  db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+                s.data(), false);
+
+  ASSERT_EQ(values.size(), keys.size());
+  ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8");
+  ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8");
+  ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8");
+  ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2");
+  ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2");
+  ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+  ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3");
+  ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes);
+
+  for (Status& status : s) {
+    ASSERT_OK(status);
+  }
+
+  SetPerfLevel(kDisable);
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevel) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+    return;
+  }
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  int num_keys = 0;
+
+  for (int i = 0; i < 128; ++i) {
+    ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 128; i += 3) {
+    ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  MoveFilesToLevel(1);
+
+  for (int i = 0; i < 128; i += 5) {
+    ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  ASSERT_EQ(0, num_keys);
+
+  for (int i = 0; i < 128; i += 9) {
+    ASSERT_OK(Put("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+  }
+
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  for (int i = 64; i < 80; ++i) {
+    keys.push_back("key_" + std::to_string(i));
+  }
+
+  values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
+  ASSERT_EQ(values.size(), 16);
+  for (unsigned int j = 0; j < values.size(); ++j) {
+    int key = j + 64;
+    if (key % 9 == 0) {
+      ASSERT_EQ(values[j], "val_mem_" + std::to_string(key));
+    } else if (key % 5 == 0) {
+      ASSERT_EQ(values[j], "val_l0_" + std::to_string(key));
+    } else if (key % 3 == 0) {
+      ASSERT_EQ(values[j], "val_l1_" + std::to_string(key));
+    } else {
+      ASSERT_EQ(values[j], "val_l2_" + std::to_string(key));
+    }
+  }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevelMerge) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+    return;
+  }
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+  int num_keys = 0;
+
+  for (int i = 0; i < 128; ++i) {
+    ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 128; i += 3) {
+    ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  MoveFilesToLevel(1);
+
+  for (int i = 0; i < 128; i += 5) {
+    ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  ASSERT_EQ(0, num_keys);
+
+  for (int i = 0; i < 128; i += 9) {
+    ASSERT_OK(
+        Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+  }
+
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  for (int i = 32; i < 80; ++i) {
+    keys.push_back("key_" + std::to_string(i));
+  }
+
+  values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
+  ASSERT_EQ(values.size(), keys.size());
+  for (unsigned int j = 0; j < 48; ++j) {
+    int key = j + 32;
+    std::string value;
+    value.append("val_l2_" + std::to_string(key));
+    if (key % 3 == 0) {
+      value.append(",");
+      value.append("val_l1_" + std::to_string(key));
+    }
+    if (key % 5 == 0) {
+      value.append(",");
+      value.append("val_l0_" + std::to_string(key));
+    }
+    if (key % 9 == 0) {
+      value.append(",");
+      value.append("val_mem_" + std::to_string(key));
+    }
+    ASSERT_EQ(values[j], value);
+  }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeInMemory) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+    return;
+  }
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  SetPerfLevel(kEnableCount);
+  ASSERT_OK(Put(1, "k1", "v_1"));
+  ASSERT_OK(Put(1, "k2", "v_2"));
+  ASSERT_OK(Put(1, "k3", "v_3"));
+  ASSERT_OK(Put(1, "k4", "v_4"));
+  ASSERT_OK(Put(1, "k5", "v_5"));
+  ASSERT_OK(Put(1, "k6", "v_6"));
+  std::vector<Slice> keys = {"k1", "k2", "k3", "k4", "k5", "k6"};
+  std::vector<PinnableSlice> values(keys.size());
+  std::vector<Status> s(keys.size());
+  std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+  get_perf_context()->Reset();
+  ReadOptions ro;
+  ro.value_size_soft_limit = 11;
+  ro.async_io = std::get<1>(GetParam());
+  db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+                s.data(), false);
+
+  ASSERT_EQ(values.size(), keys.size());
+  for (unsigned int i = 0; i < 4; i++) {
+    ASSERT_EQ(std::string(values[i].data(), values[i].size()),
+              "v_" + std::to_string(i + 1));
+  }
+
+  for (unsigned int i = 4; i < 6; i++) {
+    ASSERT_TRUE(s[i].IsAborted());
+  }
+
+  ASSERT_EQ(12, (int)get_perf_context()->multiget_read_bytes);
+  SetPerfLevel(kDisable);
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSize) {
+#ifndef USE_COROUTINES
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    return;
+  }
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    SetPerfLevel(kEnableCount);
+
+    ASSERT_OK(Put(1, "k6", "v6"));
+    ASSERT_OK(Put(1, "k7", "v7_"));
+    ASSERT_OK(Put(1, "k3", "v3_"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k11", "v11"));
+    ASSERT_OK(Delete(1, "no_key"));
+    ASSERT_OK(Put(1, "k8", "v8_"));
+    ASSERT_OK(Put(1, "k13", "v13"));
+    ASSERT_OK(Put(1, "k14", "v14"));
+    ASSERT_OK(Put(1, "k15", "v15"));
+    ASSERT_OK(Put(1, "k16", "v16"));
+    ASSERT_OK(Put(1, "k17", "v17"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_OK(Put(1, "k1", "v1_"));
+    ASSERT_OK(Put(1, "k2", "v2_"));
+    ASSERT_OK(Put(1, "k5", "v5_"));
+    ASSERT_OK(Put(1, "k9", "v9_"));
+    ASSERT_OK(Put(1, "k10", "v10"));
+    ASSERT_OK(Delete(1, "k2"));
+    ASSERT_OK(Delete(1, "k6"));
+
+    get_perf_context()->Reset();
+
+    std::vector<Slice> keys({"k1", "k10", "k11", "k12", "k13", "k14", "k15",
+                             "k16", "k17", "k2", "k3", "k4", "k5", "k6", "k7",
+                             "k8", "k9", "no_key"});
+    std::vector<PinnableSlice> values(keys.size());
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+    std::vector<Status> s(keys.size());
+
+    ReadOptions ro;
+    ro.value_size_soft_limit = 20;
+    ro.async_io = std::get<1>(GetParam());
+    db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+                  s.data(), false);
+
+    ASSERT_EQ(values.size(), keys.size());
+
+    // In memory keys
+    ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1_");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v10");
+    ASSERT_TRUE(s[9].IsNotFound());  // k2
+    ASSERT_EQ(std::string(values[12].data(), values[12].size()), "v5_");
+    ASSERT_TRUE(s[13].IsNotFound());  // k6
+    ASSERT_EQ(std::string(values[16].data(), values[16].size()), "v9_");
+
+    // In sst files
+    ASSERT_EQ(std::string(values[2].data(), values[1].size()), "v11");
+    ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v13");
+    ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v14");
+
+    // Remaining aborted after value_size exceeds.
+    ASSERT_TRUE(s[3].IsAborted());
+    ASSERT_TRUE(s[6].IsAborted());
+    ASSERT_TRUE(s[7].IsAborted());
+    ASSERT_TRUE(s[8].IsAborted());
+    ASSERT_TRUE(s[10].IsAborted());
+    ASSERT_TRUE(s[11].IsAborted());
+    ASSERT_TRUE(s[14].IsAborted());
+    ASSERT_TRUE(s[15].IsAborted());
+    ASSERT_TRUE(s[17].IsAborted());
+
+    // 6 kv pairs * 3 bytes per value (i.e. 18)
+    ASSERT_EQ(21, (int)get_perf_context()->multiget_read_bytes);
+    SetPerfLevel(kDisable);
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeMultiLevelMerge) {
+  if (std::get<1>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test needs to be fixed for async IO");
+    return;
+  }
+  // Skip for unbatched MultiGet
+  if (!std::get<0>(GetParam())) {
+    ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+    return;
+  }
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+  int num_keys = 0;
+
+  for (int i = 0; i < 64; ++i) {
+    ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 64; i += 3) {
+    ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  MoveFilesToLevel(1);
+
+  for (int i = 0; i < 64; i += 5) {
+    ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+    num_keys++;
+    if (num_keys == 8) {
+      ASSERT_OK(Flush());
+      num_keys = 0;
+    }
+  }
+  if (num_keys > 0) {
+    ASSERT_OK(Flush());
+    num_keys = 0;
+  }
+  ASSERT_EQ(0, num_keys);
+
+  for (int i = 0; i < 64; i += 9) {
+    ASSERT_OK(
+        Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+  }
+
+  std::vector<std::string> keys_str;
+  for (int i = 10; i < 50; ++i) {
+    keys_str.push_back("key_" + std::to_string(i));
+  }
+
+  std::vector<Slice> keys(keys_str.size());
+  for (int i = 0; i < 40; i++) {
+    keys[i] = Slice(keys_str[i]);
+  }
+
+  std::vector<PinnableSlice> values(keys_str.size());
+  std::vector<Status> statuses(keys_str.size());
+  ReadOptions read_options;
+  read_options.verify_checksums = true;
+  read_options.value_size_soft_limit = 380;
+  read_options.async_io = std::get<1>(GetParam());
+  db_->MultiGet(read_options, dbfull()->DefaultColumnFamily(), keys.size(),
+                keys.data(), values.data(), statuses.data());
+
+  ASSERT_EQ(values.size(), keys.size());
+
+  for (unsigned int j = 0; j < 26; ++j) {
+    int key = j + 10;
+    std::string value;
+    value.append("val_l2_" + std::to_string(key));
+    if (key % 3 == 0) {
+      value.append(",");
+      value.append("val_l1_" + std::to_string(key));
+    }
+    if (key % 5 == 0) {
+      value.append(",");
+      value.append("val_l0_" + std::to_string(key));
+    }
+    if (key % 9 == 0) {
+      value.append(",");
+      value.append("val_mem_" + std::to_string(key));
+    }
+    ASSERT_EQ(values[j], value);
+    ASSERT_OK(statuses[j]);
+  }
+
+  // All remaning keys status is set Status::Abort
+  for (unsigned int j = 26; j < 40; j++) {
+    ASSERT_TRUE(statuses[j].IsAborted());
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
+                        testing::Combine(testing::Bool(), testing::Bool()));
+
+#if USE_COROUTINES
+class DBMultiGetAsyncIOTest : public DBBasicTest,
+                              public ::testing::WithParamInterface<bool> {
+ public:
+  DBMultiGetAsyncIOTest()
+      : DBBasicTest(), statistics_(ROCKSDB_NAMESPACE::CreateDBStatistics()) {
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+    options_ = CurrentOptions();
+    options_.disable_auto_compactions = true;
+    options_.statistics = statistics_;
+    options_.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options_);
+    int num_keys = 0;
+
+    // Put all keys in the bottommost level, and overwrite some keys
+    // in L0 and L1
+    for (int i = 0; i < 256; ++i) {
+      EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i)));
+      num_keys++;
+      if (num_keys == 8) {
+        EXPECT_OK(Flush());
+        num_keys = 0;
+      }
+    }
+    if (num_keys > 0) {
+      EXPECT_OK(Flush());
+      num_keys = 0;
+    }
+    MoveFilesToLevel(2);
+
+    for (int i = 0; i < 128; i += 3) {
+      EXPECT_OK(Put(Key(i), "val_l1_" + std::to_string(i)));
+      num_keys++;
+      if (num_keys == 8) {
+        EXPECT_OK(Flush());
+        num_keys = 0;
+      }
+    }
+    if (num_keys > 0) {
+      EXPECT_OK(Flush());
+      num_keys = 0;
+    }
+    // Put some range deletes in L1
+    for (int i = 128; i < 256; i += 32) {
+      std::string range_begin = Key(i);
+      std::string range_end = Key(i + 16);
+      EXPECT_OK(dbfull()->DeleteRange(WriteOptions(),
+                                      dbfull()->DefaultColumnFamily(),
+                                      range_begin, range_end));
+      // Also do some Puts to force creation of bloom filter
+      for (int j = i + 16; j < i + 32; ++j) {
+        if (j % 3 == 0) {
+          EXPECT_OK(Put(Key(j), "val_l1_" + std::to_string(j)));
+        }
+      }
+      EXPECT_OK(Flush());
+    }
+    MoveFilesToLevel(1);
+
+    for (int i = 0; i < 128; i += 5) {
+      EXPECT_OK(Put(Key(i), "val_l0_" + std::to_string(i)));
+      num_keys++;
+      if (num_keys == 8) {
+        EXPECT_OK(Flush());
+        num_keys = 0;
+      }
+    }
+    if (num_keys > 0) {
+      EXPECT_OK(Flush());
+      num_keys = 0;
+    }
+    EXPECT_EQ(0, num_keys);
+  }
+
+  const std::shared_ptr<Statistics>& statistics() { return statistics_; }
+
+ protected:
+  void ReopenDB() { Reopen(options_); }
+
+ private:
+  std::shared_ptr<Statistics> statistics_;
+  Options options_;
+};
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL0) {
+  // All 3 keys in L0. The L0 files should be read serially.
+  std::vector<std::string> key_strs{Key(0), Key(40), Key(80)};
+  std::vector<Slice> keys{key_strs[0], key_strs[1], key_strs[2]};
+  std::vector<PinnableSlice> values(key_strs.size());
+  std::vector<Status> statuses(key_strs.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_OK(statuses[0]);
+  ASSERT_OK(statuses[1]);
+  ASSERT_OK(statuses[2]);
+  ASSERT_EQ(values[0], "val_l0_" + std::to_string(0));
+  ASSERT_EQ(values[1], "val_l0_" + std::to_string(40));
+  ASSERT_EQ(values[2], "val_l0_" + std::to_string(80));
+
+  HistogramData multiget_io_batch_size;
+
+  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // With async IO, lookups will happen in parallel for each key
+  if (GetParam()) {
+    ASSERT_EQ(multiget_io_batch_size.count, 1);
+    ASSERT_EQ(multiget_io_batch_size.max, 3);
+    ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
+  } else {
+    // Without Async IO, MultiGet will call MultiRead 3 times, once for each
+    // L0 file
+    ASSERT_EQ(multiget_io_batch_size.count, 3);
+  }
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  key_strs.push_back(Key(33));
+  key_strs.push_back(Key(54));
+  key_strs.push_back(Key(102));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  keys.push_back(key_strs[2]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(statuses[2], Status::OK());
+  ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
+  ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
+  ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+  HistogramData multiget_io_batch_size;
+
+  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // A batch of 3 async IOs is expected, one for each overlapping file in L1
+  ASSERT_EQ(multiget_io_batch_size.count, 1);
+  ASSERT_EQ(multiget_io_batch_size.max, 3);
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1Error) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  key_strs.push_back(Key(33));
+  key_strs.push_back(Key(54));
+  key_strs.push_back(Key(102));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  keys.push_back(key_strs[2]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::GetTableReader:BeforeOpenFile", [&](void* status) {
+        static int count = 0;
+        count++;
+        // Fail the last table reader open, which is the 6th SST file
+        // since 3 overlapping L0 files + 3 L1 files containing the keys
+        if (count == 6) {
+          Status* s = static_cast<Status*>(status);
+          *s = Status::IOError();
+        }
+      });
+  // DB open will create table readers unless we reduce the table cache
+  // capacity.
+  // SanitizeOptions will set max_open_files to minimum of 20. Table cache
+  // is allocated with max_open_files - 10 as capacity. So override
+  // max_open_files to 11 so table cache capacity will become 1. This will
+  // prevent file open during DB open and force the file to be opened
+  // during MultiGet
+  SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = (int*)arg;
+        *max_open_files = 11;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ReopenDB();
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(statuses[2], Status::IOError());
+
+  HistogramData multiget_io_batch_size;
+
+  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // A batch of 3 async IOs is expected, one for each overlapping file in L1
+  ASSERT_EQ(multiget_io_batch_size.count, 1);
+  ASSERT_EQ(multiget_io_batch_size.max, 2);
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 21 is the last key in the first L1 file
+  key_strs.push_back(Key(21));
+  key_strs.push_back(Key(54));
+  key_strs.push_back(Key(102));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  keys.push_back(key_strs[2]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(statuses[2], Status::OK());
+  ASSERT_EQ(values[0], "val_l1_" + std::to_string(21));
+  ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
+  ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+  HistogramData multiget_io_batch_size;
+
+  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // Since the first MultiGet key is the last key in a file, the MultiGet is
+  // expected to lookup in that file first, before moving on to other files.
+  // So the first file lookup will issue one async read, and the next lookup
+  // will lookup 2 files in parallel and issue 2 async reads
+  ASSERT_EQ(multiget_io_batch_size.count, 2);
+  ASSERT_EQ(multiget_io_batch_size.max, 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 33 and 102 are in L1, and 56 is in L2
+  key_strs.push_back(Key(33));
+  key_strs.push_back(Key(56));
+  key_strs.push_back(Key(102));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  keys.push_back(key_strs[2]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(statuses[2], Status::OK());
+  ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
+  ASSERT_EQ(values[1], "val_l2_" + std::to_string(56));
+  ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+  HistogramData multiget_io_batch_size;
+
+  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // There are 2 keys in L1 in twp separate files, and 1 in L2. With
+  // optimize_multiget_for_io, all three lookups will happen in parallel.
+  // Otherwise, the L2 lookup will happen after L1.
+  ASSERT_EQ(multiget_io_batch_size.count, GetParam() ? 1 : 2);
+  ASSERT_EQ(multiget_io_batch_size.max, GetParam() ? 3 : 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 19 and 26 are in L2, but overlap with L0 and L1 file ranges
+  key_strs.push_back(Key(19));
+  key_strs.push_back(Key(26));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 2);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(values[0], "val_l2_" + std::to_string(19));
+  ASSERT_EQ(values[1], "val_l2_" + std::to_string(26));
+
+  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with a range deletes in L1
+  key_strs.push_back(Key(139));
+  key_strs.push_back(Key(163));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), 2);
+  ASSERT_EQ(statuses[0], Status::NotFound());
+  ASSERT_EQ(statuses[1], Status::NotFound());
+
+  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2WithRangeDelInL1) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with a range deletes in L1
+  key_strs.push_back(Key(139));
+  key_strs.push_back(Key(144));
+  key_strs.push_back(Key(163));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  keys.push_back(key_strs[2]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  ASSERT_EQ(values.size(), keys.size());
+  ASSERT_EQ(statuses[0], Status::NotFound());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(values[1], "val_l1_" + std::to_string(144));
+  ASSERT_EQ(statuses[2], Status::NotFound());
+
+  // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, DBMultiGetAsyncIOTest,
+                        testing::Bool());
+#endif  // USE_COROUTINES
+
+TEST_F(DBBasicTest, MultiGetStats) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  table_options.partition_filters = true;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  int total_keys = 2000;
+  std::vector<std::string> keys_str(total_keys);
+  std::vector<Slice> keys(total_keys);
+  static size_t kMultiGetBatchSize = 100;
+  std::vector<PinnableSlice> values(kMultiGetBatchSize);
+  std::vector<Status> s(kMultiGetBatchSize);
+  ReadOptions read_opts;
+
+  Random rnd(309);
+  // Create Multiple SST files at multiple levels.
+  for (int i = 0; i < 500; ++i) {
+    keys_str[i] = "k" + std::to_string(i);
+    keys[i] = Slice(keys_str[i]);
+    ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+    if (i % 100 == 0) {
+      ASSERT_OK(Flush(1));
+    }
+  }
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(2, 1);
+
+  for (int i = 501; i < 1000; ++i) {
+    keys_str[i] = "k" + std::to_string(i);
+    keys[i] = Slice(keys_str[i]);
+    ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+    if (i % 100 == 0) {
+      ASSERT_OK(Flush(1));
+    }
+  }
+
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(2, 1);
+
+  for (int i = 1001; i < total_keys; ++i) {
+    keys_str[i] = "k" + std::to_string(i);
+    keys[i] = Slice(keys_str[i]);
+    ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+    if (i % 100 == 0) {
+      ASSERT_OK(Flush(1));
+    }
+  }
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(1, 1);
+  Close();
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_OK(options.statistics->Reset());
+
+  db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250],
+                values.data(), s.data(), false);
+
+  ASSERT_EQ(values.size(), kMultiGetBatchSize);
+  HistogramData hist_level;
+  HistogramData hist_index_and_filter_blocks;
+  HistogramData hist_sst;
+
+  options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
+  options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                                    &hist_index_and_filter_blocks);
+  options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst);
+
+  // Maximum number of blocks read from a file system in a level.
+  ASSERT_EQ(hist_level.max, 1);
+  ASSERT_GT(hist_index_and_filter_blocks.max, 0);
+  // Maximum number of sst files read from file system in a level.
+  ASSERT_EQ(hist_sst.max, 2);
+
+  // Minimun number of blocks read in a level.
+  ASSERT_EQ(hist_level.min, 1);
+  ASSERT_GT(hist_index_and_filter_blocks.min, 0);
+  // Minimun number of sst files read in a level.
+  ASSERT_EQ(hist_sst.min, 1);
+
+  for (PinnableSlice& value : values) {
+    value.Reset();
+  }
+  for (Status& status : s) {
+    status = Status::OK();
+  }
+  db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[950],
+                values.data(), s.data(), false);
+  options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
+  ASSERT_EQ(hist_level.max, 2);
+}
+
+// Test class for batched MultiGet with prefix extractor
+// Param bool - If true, use partitioned filters
+//              If false, use full filter block
+class MultiGetPrefixExtractorTest : public DBBasicTest,
+                                    public ::testing::WithParamInterface<bool> {
+};
+
+TEST_P(MultiGetPrefixExtractorTest, Batched) {
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+  options.memtable_prefix_bloom_size_ratio = 10;
+  BlockBasedTableOptions bbto;
+  if (GetParam()) {
+    bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    bbto.partition_filters = true;
+  }
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  bbto.cache_index_and_filter_blocks = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  SetPerfLevel(kEnableCount);
+  get_perf_context()->Reset();
+
+  ASSERT_OK(Put("k", "v0"));
+  ASSERT_OK(Put("kk1", "v1"));
+  ASSERT_OK(Put("kk2", "v2"));
+  ASSERT_OK(Put("kk3", "v3"));
+  ASSERT_OK(Put("kk4", "v4"));
+  std::vector<std::string> keys(
+      {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"});
+  std::vector<std::string> expected(
+      {"v0", "v1", "v2", "v3", "v4", "NOT_FOUND", "NOT_FOUND"});
+  std::vector<std::string> values;
+  values = MultiGet(keys, nullptr);
+  ASSERT_EQ(values, expected);
+  // One key ("k") is not queried against the filter because it is outside
+  // the prefix_extractor domain, leaving 6 keys with queried prefixes.
+  ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2);
+  ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 4);
+  ASSERT_OK(Flush());
+
+  get_perf_context()->Reset();
+  values = MultiGet(keys, nullptr);
+  ASSERT_EQ(values, expected);
+  ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
+  ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+
+  // Also check Get stat
+  get_perf_context()->Reset();
+  for (size_t i = 0; i < keys.size(); ++i) {
+    values[i] = Get(keys[i]);
+  }
+  ASSERT_EQ(values, expected);
+  ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
+  ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+}
+
+INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest,
+                        ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+class DBMultiGetRowCacheTest : public DBBasicTest,
+                               public ::testing::WithParamInterface<bool> {};
+
+TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) {
+  do {
+    option_config_ = kRowCache;
+    Options options = CurrentOptions();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    CreateAndReopenWithCF({"pikachu"}, options);
+    SetPerfLevel(kEnableCount);
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    const Snapshot* snap1 = dbfull()->GetSnapshot();
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Flush(1));
+    const Snapshot* snap2 = dbfull()->GetSnapshot();
+
+    get_perf_context()->Reset();
+
+    std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k1"});
+    std::vector<PinnableSlice> values(keys.size());
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+    std::vector<Status> s(keys.size());
+
+    ReadOptions ro;
+    bool use_snapshots = GetParam();
+    if (use_snapshots) {
+      ro.snapshot = snap2;
+    }
+    db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+                  s.data(), false);
+
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1");
+    ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // four kv pairs * two bytes per value
+    ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_TRUE(s[0].IsNotFound());
+    ASSERT_OK(s[1]);
+    ASSERT_TRUE(s[2].IsNotFound());
+    ASSERT_OK(s[3]);
+    ASSERT_OK(s[4]);
+
+    // Call MultiGet() again with some intersection with the previous set of
+    // keys. Those should already be in the row cache.
+    keys.assign({"no_key", "k5", "k3", "k2"});
+    for (size_t i = 0; i < keys.size(); ++i) {
+      values[i].Reset();
+      s[i] = Status::OK();
+    }
+    get_perf_context()->Reset();
+
+    if (use_snapshots) {
+      ro.snapshot = snap1;
+    }
+    db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+                  values.data(), s.data(), false);
+
+    ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2");
+    ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+    ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // four kv pairs * two bytes per value
+    ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+    ASSERT_TRUE(s[0].IsNotFound());
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_OK(s[3]);
+    if (use_snapshots) {
+      // Only reads from the first SST file would have been cached, since
+      // snapshot seq no is > fd.largest_seqno
+      ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT));
+    } else {
+      ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT));
+    }
+
+    SetPerfLevel(kDisable);
+    dbfull()->ReleaseSnapshot(snap1);
+    dbfull()->ReleaseSnapshot(snap2);
+  } while (ChangeCompactOptions());
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest,
+                        testing::Values(true, false));
+
+TEST_F(DBBasicTest, GetAllKeyVersions) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+  const size_t kNumInserts = 4;
+  const size_t kNumDeletes = 4;
+  const size_t kNumUpdates = 4;
+
+  // Check default column family
+  for (size_t i = 0; i != kNumInserts; ++i) {
+    ASSERT_OK(Put(std::to_string(i), "value"));
+  }
+  for (size_t i = 0; i != kNumUpdates; ++i) {
+    ASSERT_OK(Put(std::to_string(i), "value1"));
+  }
+  for (size_t i = 0; i != kNumDeletes; ++i) {
+    ASSERT_OK(Delete(std::to_string(i)));
+  }
+  std::vector<KeyVersion> key_versions;
+  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+  for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) {
+    if (i % 3 == 0) {
+      ASSERT_EQ(key_versions[i].GetTypeName(), "TypeDeletion");
+    } else {
+      ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue");
+    }
+  }
+  ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+
+  // Check non-default column family
+  for (size_t i = 0; i + 1 != kNumInserts; ++i) {
+    ASSERT_OK(Put(1, std::to_string(i), "value"));
+  }
+  for (size_t i = 0; i + 1 != kNumUpdates; ++i) {
+    ASSERT_OK(Put(1, std::to_string(i), "value1"));
+  }
+  for (size_t i = 0; i + 1 != kNumDeletes; ++i) {
+    ASSERT_OK(Delete(1, std::to_string(i)));
+  }
+  ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
+}
+
+TEST_F(DBBasicTest, ValueTypeString) {
+  KeyVersion key_version;
+  // when adding new type, please also update `value_type_string_map`
+  for (unsigned char i = ValueType::kTypeDeletion; i < ValueType::kTypeMaxValid;
+       i++) {
+    key_version.type = i;
+    ASSERT_TRUE(key_version.GetTypeName() != "Invalid");
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
+  Options options = CurrentOptions();
+  Random rnd(301);
+  BlockBasedTableOptions table_options;
+  table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+  table_options.block_size = 16 * 1024;
+  ASSERT_TRUE(table_options.block_size >
+              BlockBasedTable::kMultiGetReadStackBufSize);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  std::string zero_str(128, '\0');
+  for (int i = 0; i < 100; ++i) {
+    // Make the value compressible. A purely random string doesn't compress
+    // and the resultant data block will not be compressed
+    std::string value(rnd.RandomString(128) + zero_str);
+    assert(Put(Key(i), value) == Status::OK());
+  }
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> key_data(10);
+  std::vector<Slice> keys;
+  // We cannot resize a PinnableSlice vector, so just set initial size to
+  // largest we think we will need
+  std::vector<PinnableSlice> values(10);
+  std::vector<Status> statuses;
+  ReadOptions ro;
+
+  // Warm up the cache first
+  key_data.emplace_back(Key(0));
+  keys.emplace_back(Slice(key_data.back()));
+  key_data.emplace_back(Key(50));
+  keys.emplace_back(Slice(key_data.back()));
+  statuses.resize(keys.size());
+
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+}
+
+TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions write_opts;
+  write_opts.disableWAL = true;
+  for (size_t cf = 0; cf != num_cfs; ++cf) {
+    for (size_t i = 0; i != 10000; ++i) {
+      std::string key_str = Key(static_cast<int>(i));
+      std::string value_str = std::to_string(cf) + "_" + std::to_string(i);
+
+      ASSERT_OK(Put(static_cast<int>(cf), key_str, value_str));
+      if (0 == (i % 1000)) {
+        ASSERT_OK(Flush(static_cast<int>(cf)));
+      }
+    }
+  }
+  for (size_t cf = 0; cf != num_cfs; ++cf) {
+    ASSERT_OK(Flush(static_cast<int>(cf)));
+  }
+  Close();
+  options.best_efforts_recovery = true;
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+                           options);
+  num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  for (size_t cf = 0; cf != num_cfs; ++cf) {
+    for (int i = 0; i != 10000; ++i) {
+      std::string key_str = Key(static_cast<int>(i));
+      std::string expected_value_str =
+          std::to_string(cf) + "_" + std::to_string(i);
+      ASSERT_EQ(expected_value_str, Get(static_cast<int>(cf), key_str));
+    }
+  }
+}
+
+TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value"));
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        *(reinterpret_cast<Status*>(arg)) =
+            Status::Corruption("Inject corruption");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.best_efforts_recovery = true;
+  Status s = TryReopen(options);
+  ASSERT_TRUE(s.IsCorruption());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+class TableFileListener : public EventListener {
+ public:
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    InstrumentedMutexLock lock(&mutex_);
+    cf_to_paths_[info.cf_name].push_back(info.file_path);
+  }
+  std::vector<std::string>& GetFiles(const std::string& cf_name) {
+    InstrumentedMutexLock lock(&mutex_);
+    return cf_to_paths_[cf_name];
+  }
+
+ private:
+  InstrumentedMutex mutex_;
+  std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
+};
+}  // anonymous namespace
+
+TEST_F(DBBasicTest, LastSstFileNotInManifest) {
+  // If the last sst file is not tracked in MANIFEST,
+  // or the VersionEdit for the last sst file is not synced,
+  // on recovery, the last sst file should be deleted,
+  // and new sst files shouldn't reuse its file number.
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  Close();
+
+  // Manually add a sst file.
+  constexpr uint64_t kSstFileNumber = 100;
+  const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber);
+  ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content",
+                              /* fname = */ kSstFile,
+                              /* should_sync = */ true));
+  ASSERT_OK(env_->FileExists(kSstFile));
+
+  TableFileListener* listener = new TableFileListener();
+  options.listeners.emplace_back(listener);
+  Reopen(options);
+  // kSstFile should already be deleted.
+  ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound());
+
+  ASSERT_OK(Put("k", "v"));
+  ASSERT_OK(Flush());
+  // New sst file should have file number > kSstFileNumber.
+  std::vector<std::string>& files =
+      listener->GetFiles(kDefaultColumnFamilyName);
+  ASSERT_EQ(files.size(), 1);
+  const std::string fname = files[0].erase(0, (dbname_ + "/").size());
+  uint64_t number = 0;
+  FileType type = kTableFile;
+  ASSERT_TRUE(ParseFileName(fname, &number, &type));
+  ASSERT_EQ(type, kTableFile);
+  ASSERT_GT(number, kSstFileNumber);
+}
+
+TEST_F(DBBasicTest, RecoverWithMissingFiles) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  TableFileListener* listener = new TableFileListener();
+  // Disable auto compaction to simplify SST file name tracking.
+  options.disable_auto_compactions = true;
+  options.listeners.emplace_back(listener);
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
+                                           "eevee"};
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  for (size_t cf = 0; cf != num_cfs; ++cf) {
+    ASSERT_OK(Put(static_cast<int>(cf), "a", "0_value"));
+    ASSERT_OK(Flush(static_cast<int>(cf)));
+    ASSERT_OK(Put(static_cast<int>(cf), "b", "0_value"));
+    ASSERT_OK(Flush(static_cast<int>(cf)));
+    ASSERT_OK(Put(static_cast<int>(cf), "c", "0_value"));
+    ASSERT_OK(Flush(static_cast<int>(cf)));
+  }
+
+  // Delete and corrupt files
+  for (size_t i = 0; i < all_cf_names.size(); ++i) {
+    std::vector<std::string>& files = listener->GetFiles(all_cf_names[i]);
+    ASSERT_EQ(3, files.size());
+    std::string corrupted_data;
+    ASSERT_OK(ReadFileToString(env_, files[files.size() - 1], &corrupted_data));
+    ASSERT_OK(WriteStringToFile(
+        env_, corrupted_data.substr(0, corrupted_data.size() - 2),
+        files[files.size() - 1], /*should_sync=*/true));
+    for (int j = static_cast<int>(files.size() - 2); j >= static_cast<int>(i);
+         --j) {
+      ASSERT_OK(env_->DeleteFile(files[j]));
+    }
+  }
+  options.best_efforts_recovery = true;
+  ReopenWithColumnFamilies(all_cf_names, options);
+  // Verify data
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
+    iter->SeekToFirst();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+    iter.reset(db_->NewIterator(read_opts, handles_[1]));
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a", iter->key());
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+    iter.reset(db_->NewIterator(read_opts, handles_[2]));
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a", iter->key());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("b", iter->key());
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+}
+
+TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value0"));
+  ASSERT_OK(Flush());
+  Close();
+  {
+    // Hack by adding a new MANIFEST with high file number
+    std::string garbage(10, '\0');
+    ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/MANIFEST-001000",
+                                /*should_sync=*/true));
+  }
+  {
+    // Hack by adding a corrupted SST not referenced by any MANIFEST
+    std::string garbage(10, '\0');
+    ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/001001.sst",
+                                /*should_sync=*/true));
+  }
+
+  options.best_efforts_recovery = true;
+
+  Reopen(options);
+  ASSERT_OK(Put("bar", "value"));
+}
+
+TEST_F(DBBasicTest, RecoverWithNoCurrentFile) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  options.best_efforts_recovery = true;
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+  ASSERT_OK(Put("foo", "value"));
+  ASSERT_OK(Put(1, "bar", "value"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Flush(1));
+  Close();
+  ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_)));
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  std::vector<std::string> cf_names;
+  ASSERT_OK(DB::ListColumnFamilies(DBOptions(options), dbname_, &cf_names));
+  ASSERT_EQ(2, cf_names.size());
+  for (const auto& name : cf_names) {
+    ASSERT_TRUE(name == kDefaultColumnFamilyName || name == "pikachu");
+  }
+}
+
+TEST_F(DBBasicTest, RecoverWithNoManifest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value"));
+  ASSERT_OK(Flush());
+  Close();
+  {
+    // Delete all MANIFEST.
+    std::vector<std::string> files;
+    ASSERT_OK(env_->GetChildren(dbname_, &files));
+    for (const auto& file : files) {
+      uint64_t number = 0;
+      FileType type = kWalFile;
+      if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
+        ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file));
+      }
+    }
+  }
+  options.best_efforts_recovery = true;
+  options.create_if_missing = false;
+  Status s = TryReopen(options);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  options.create_if_missing = true;
+  Reopen(options);
+  // Since no MANIFEST exists, best-efforts recovery creates a new, empty db.
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  TableFileListener* listener = new TableFileListener();
+  options.listeners.emplace_back(listener);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<std::string> kAllCfNames = {kDefaultColumnFamilyName, "pikachu"};
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  for (int cf = 0; cf < static_cast<int>(kAllCfNames.size()); ++cf) {
+    ASSERT_OK(Put(cf, "a", "0_value"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_OK(Put(cf, "b", "0_value"));
+  }
+  // Delete files
+  for (size_t i = 0; i < kAllCfNames.size(); ++i) {
+    std::vector<std::string>& files = listener->GetFiles(kAllCfNames[i]);
+    ASSERT_EQ(1, files.size());
+    for (int j = static_cast<int>(files.size() - 1); j >= static_cast<int>(i);
+         --j) {
+      ASSERT_OK(env_->DeleteFile(files[j]));
+    }
+  }
+  options.best_efforts_recovery = true;
+  ReopenWithColumnFamilies(kAllCfNames, options);
+  // Verify WAL is not applied
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
+  iter->SeekToFirst();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  iter.reset(db_->NewIterator(read_opts, handles_[1]));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("a", iter->key());
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
+TEST_F(DBBasicTest, DisableTrackWal) {
+  // If WAL tracking was enabled, and then disabled during reopen,
+  // the previously tracked WALs should be removed from MANIFEST.
+
+  Options options = CurrentOptions();
+  options.track_and_verify_wals_in_manifest = true;
+  // extremely small write buffer size,
+  // so that new WALs are created more frequently.
+  options.write_buffer_size = 100;
+  options.env = env_;
+  DestroyAndReopen(options);
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+  ASSERT_OK(db_->SyncWAL());
+  // Some WALs are tracked.
+  ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+  Close();
+
+  // Disable WAL tracking.
+  options.track_and_verify_wals_in_manifest = false;
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+  // Previously tracked WALs are cleared.
+  ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+  Close();
+
+  // Re-enable WAL tracking again.
+  options.track_and_verify_wals_in_manifest = true;
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+  ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBBasicTest, ManifestChecksumMismatch) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("bar", "value"));
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", [&](void* arg) {
+        auto* crc = reinterpret_cast<uint32_t*>(arg);
+        *crc = *crc + 1;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions write_opts;
+  write_opts.disableWAL = true;
+  Status s = db_->Put(write_opts, "foo", "value");
+  ASSERT_OK(s);
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  ASSERT_OK(Put("foo", "value1"));
+  ASSERT_OK(Flush());
+  s = TryReopen(options);
+  ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBBasicTest, ConcurrentlyCloseDB) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  std::vector<std::thread> workers;
+  for (int i = 0; i < 10; i++) {
+    workers.push_back(std::thread([&]() {
+      auto s = db_->Close();
+      ASSERT_OK(s);
+    }));
+  }
+  for (auto& w : workers) {
+    w.join();
+  }
+}
+
+#ifndef ROCKSDB_LITE
+class DBBasicTestTrackWal : public DBTestBase,
+                            public testing::WithParamInterface<bool> {
+ public:
+  DBBasicTestTrackWal()
+      : DBTestBase("db_basic_test_track_wal", /*env_do_fsync=*/false) {}
+
+  int CountWalFiles() {
+    VectorLogPtr log_files;
+    EXPECT_OK(dbfull()->GetSortedWalFiles(log_files));
+    return static_cast<int>(log_files.size());
+  };
+};
+
+TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) {
+  // If a WAL becomes obsolete after flushing, but is not deleted from disk yet,
+  // then if SyncWAL is called afterwards, the obsolete WAL should not be
+  // tracked in MANIFEST.
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.track_and_verify_wals_in_manifest = true;
+  options.atomic_flush = GetParam();
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"cf"}, options);
+  ASSERT_EQ(handles_.size(), 2);  // default, cf
+  // Do not delete WALs.
+  ASSERT_OK(db_->DisableFileDeletions());
+  constexpr int n = 10;
+  std::vector<std::unique_ptr<LogFile>> wals(n);
+  for (size_t i = 0; i < n; i++) {
+    // Generate a new WAL for each key-value.
+    const int cf = i % 2;
+    ASSERT_OK(db_->GetCurrentWalFile(&wals[i]));
+    ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i)));
+    ASSERT_OK(Flush({0, 1}));
+  }
+  ASSERT_EQ(CountWalFiles(), n);
+  // Since all WALs are obsolete, no WAL should be tracked in MANIFEST.
+  ASSERT_OK(db_->SyncWAL());
+
+  // Manually delete all WALs.
+  Close();
+  for (const auto& wal : wals) {
+    ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
+  }
+
+  // If SyncWAL tracks the obsolete WALs in MANIFEST,
+  // reopen will fail because the WALs are missing from disk.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options));
+  Destroy(options);
+}
+
+INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal,
+                        testing::Bool());
+#endif  // ROCKSDB_LITE
+
+class DBBasicTestMultiGet : public DBTestBase {
+ public:
+  DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool compressed_cache,
+                      bool uncompressed_cache, bool _compression_enabled,
+                      bool _fill_cache, uint32_t compression_parallel_threads)
+      : DBTestBase(test_dir, /*env_do_fsync=*/false) {
+    compression_enabled_ = _compression_enabled;
+    fill_cache_ = _fill_cache;
+
+    if (compressed_cache) {
+      std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+      compressed_cache_ = std::make_shared<MyBlockCache>(cache);
+    }
+    if (uncompressed_cache) {
+      std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+      uncompressed_cache_ = std::make_shared<MyBlockCache>(cache);
+    }
+
+    env_->count_random_reads_ = true;
+
+    Options options = CurrentOptions();
+    Random rnd(301);
+    BlockBasedTableOptions table_options;
+
+#ifndef ROCKSDB_LITE
+    if (compression_enabled_) {
+      std::vector<CompressionType> compression_types;
+      compression_types = GetSupportedCompressions();
+      // Not every platform may have compression libraries available, so
+      // dynamically pick based on what's available
+      CompressionType tmp_type = kNoCompression;
+      for (auto c_type : compression_types) {
+        if (c_type != kNoCompression) {
+          tmp_type = c_type;
+          break;
+        }
+      }
+      if (tmp_type != kNoCompression) {
+        options.compression = tmp_type;
+      } else {
+        compression_enabled_ = false;
+      }
+    }
+#else
+    // GetSupportedCompressions() is not available in LITE build
+    if (!Snappy_Supported()) {
+      compression_enabled_ = false;
+    }
+#endif  // ROCKSDB_LITE
+
+    table_options.block_cache = uncompressed_cache_;
+    if (table_options.block_cache == nullptr) {
+      table_options.no_block_cache = true;
+    } else {
+      table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+    }
+    table_options.block_cache_compressed = compressed_cache_;
+    table_options.flush_block_policy_factory.reset(
+        new MyFlushBlockPolicyFactory());
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    if (!compression_enabled_) {
+      options.compression = kNoCompression;
+    } else {
+      options.compression_opts.parallel_threads = compression_parallel_threads;
+    }
+    options_ = options;
+    Reopen(options);
+
+    if (num_cfs > 1) {
+      for (int cf = 0; cf < num_cfs; ++cf) {
+        cf_names_.emplace_back("cf" + std::to_string(cf));
+      }
+      CreateColumnFamilies(cf_names_, options);
+      cf_names_.emplace_back("default");
+    }
+
+    std::string zero_str(128, '\0');
+    for (int cf = 0; cf < num_cfs; ++cf) {
+      for (int i = 0; i < 100; ++i) {
+        // Make the value compressible. A purely random string doesn't compress
+        // and the resultant data block will not be compressed
+        values_.emplace_back(rnd.RandomString(128) + zero_str);
+        assert(((num_cfs == 1) ? Put(Key(i), values_[i])
+                               : Put(cf, Key(i), values_[i])) == Status::OK());
+      }
+      if (num_cfs == 1) {
+        EXPECT_OK(Flush());
+      } else {
+        EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
+      }
+
+      for (int i = 0; i < 100; ++i) {
+        // block cannot gain space by compression
+        uncompressable_values_.emplace_back(rnd.RandomString(256) + '\0');
+        std::string tmp_key = "a" + Key(i);
+        assert(((num_cfs == 1) ? Put(tmp_key, uncompressable_values_[i])
+                               : Put(cf, tmp_key, uncompressable_values_[i])) ==
+               Status::OK());
+      }
+      if (num_cfs == 1) {
+        EXPECT_OK(Flush());
+      } else {
+        EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
+      }
+    }
+    // Clear compressed cache, which is always pre-populated
+    if (compressed_cache_) {
+      compressed_cache_->SetCapacity(0);
+      compressed_cache_->SetCapacity(1048576);
+    }
+  }
+
+  bool CheckValue(int i, const std::string& value) {
+    if (values_[i].compare(value) == 0) {
+      return true;
+    }
+    return false;
+  }
+
+  bool CheckUncompressableValue(int i, const std::string& value) {
+    if (uncompressable_values_[i].compare(value) == 0) {
+      return true;
+    }
+    return false;
+  }
+
+  const std::vector<std::string>& GetCFNames() const { return cf_names_; }
+
+  int num_lookups() { return uncompressed_cache_->num_lookups(); }
+  int num_found() { return uncompressed_cache_->num_found(); }
+  int num_inserts() { return uncompressed_cache_->num_inserts(); }
+
+  int num_lookups_compressed() { return compressed_cache_->num_lookups(); }
+  int num_found_compressed() { return compressed_cache_->num_found(); }
+  int num_inserts_compressed() { return compressed_cache_->num_inserts(); }
+
+  bool fill_cache() { return fill_cache_; }
+  bool compression_enabled() { return compression_enabled_; }
+  bool has_compressed_cache() { return compressed_cache_ != nullptr; }
+  bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; }
+  Options get_options() { return options_; }
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+ protected:
+  class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+   public:
+    MyFlushBlockPolicyFactory() {}
+
+    virtual const char* Name() const override {
+      return "MyFlushBlockPolicyFactory";
+    }
+
+    virtual FlushBlockPolicy* NewFlushBlockPolicy(
+        const BlockBasedTableOptions& /*table_options*/,
+        const BlockBuilder& data_block_builder) const override {
+      return new MyFlushBlockPolicy(data_block_builder);
+    }
+  };
+
+  class MyFlushBlockPolicy : public FlushBlockPolicy {
+   public:
+    explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder)
+        : num_keys_(0), data_block_builder_(data_block_builder) {}
+
+    bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+      if (data_block_builder_.empty()) {
+        // First key in this block
+        num_keys_ = 1;
+        return false;
+      }
+      // Flush every 10 keys
+      if (num_keys_ == 10) {
+        num_keys_ = 1;
+        return true;
+      }
+      num_keys_++;
+      return false;
+    }
+
+   private:
+    int num_keys_;
+    const BlockBuilder& data_block_builder_;
+  };
+
+  class MyBlockCache : public CacheWrapper {
+   public:
+    explicit MyBlockCache(std::shared_ptr<Cache> target)
+        : CacheWrapper(target),
+          num_lookups_(0),
+          num_found_(0),
+          num_inserts_(0) {}
+
+    const char* Name() const override { return "MyBlockCache"; }
+
+    using Cache::Insert;
+    Status Insert(const Slice& key, void* value, size_t charge,
+                  void (*deleter)(const Slice& key, void* value),
+                  Handle** handle = nullptr,
+                  Priority priority = Priority::LOW) override {
+      num_inserts_++;
+      return target_->Insert(key, value, charge, deleter, handle, priority);
+    }
+
+    using Cache::Lookup;
+    Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
+      num_lookups_++;
+      Handle* handle = target_->Lookup(key, stats);
+      if (handle != nullptr) {
+        num_found_++;
+      }
+      return handle;
+    }
+    int num_lookups() { return num_lookups_; }
+
+    int num_found() { return num_found_; }
+
+    int num_inserts() { return num_inserts_; }
+
+   private:
+    int num_lookups_;
+    int num_found_;
+    int num_inserts_;
+  };
+
+  std::shared_ptr<MyBlockCache> compressed_cache_;
+  std::shared_ptr<MyBlockCache> uncompressed_cache_;
+  Options options_;
+  bool compression_enabled_;
+  std::vector<std::string> values_;
+  std::vector<std::string> uncompressable_values_;
+  bool fill_cache_;
+  std::vector<std::string> cf_names_;
+};
+
+class DBBasicTestWithParallelIO
+    : public DBBasicTestMultiGet,
+      public testing::WithParamInterface<
+          std::tuple<bool, bool, bool, bool, uint32_t>> {
+ public:
+  DBBasicTestWithParallelIO()
+      : DBBasicTestMultiGet("/db_basic_test_with_parallel_io", 1,
+                            std::get<0>(GetParam()), std::get<1>(GetParam()),
+                            std::get<2>(GetParam()), std::get<3>(GetParam()),
+                            std::get<4>(GetParam())) {}
+};
+
+TEST_P(DBBasicTestWithParallelIO, MultiGet) {
+  std::vector<std::string> key_data(10);
+  std::vector<Slice> keys;
+  // We cannot resize a PinnableSlice vector, so just set initial size to
+  // largest we think we will need
+  std::vector<PinnableSlice> values(10);
+  std::vector<Status> statuses;
+  ReadOptions ro;
+  ro.fill_cache = fill_cache();
+
+  // Warm up the cache first
+  key_data.emplace_back(Key(0));
+  keys.emplace_back(Slice(key_data.back()));
+  key_data.emplace_back(Key(50));
+  keys.emplace_back(Slice(key_data.back()));
+  statuses.resize(keys.size());
+
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+  ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+  int random_reads = env_->random_read_counter_.Read();
+  key_data[0] = Key(1);
+  key_data[1] = Key(51);
+  keys[0] = Slice(key_data[0]);
+  keys[1] = Slice(key_data[1]);
+  values[0].Reset();
+  values[1].Reset();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+  ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+  bool read_from_cache = false;
+  if (fill_cache()) {
+    if (has_uncompressed_cache()) {
+      read_from_cache = true;
+    } else if (has_compressed_cache() && compression_enabled()) {
+      read_from_cache = true;
+    }
+  }
+
+  int expected_reads = random_reads + (read_from_cache ? 0 : 2);
+  ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+  keys.resize(10);
+  statuses.resize(10);
+  std::vector<int> key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+  for (size_t i = 0; i < key_ints.size(); ++i) {
+    key_data[i] = Key(key_ints[i]);
+    keys[i] = Slice(key_data[i]);
+    statuses[i] = Status::OK();
+    values[i].Reset();
+  }
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  for (size_t i = 0; i < key_ints.size(); ++i) {
+    ASSERT_OK(statuses[i]);
+    ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString()));
+  }
+  if (compression_enabled() && !has_compressed_cache()) {
+    expected_reads += (read_from_cache ? 2 : 3);
+  } else {
+    expected_reads += (read_from_cache ? 2 : 4);
+  }
+  ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+  keys.resize(10);
+  statuses.resize(10);
+  std::vector<int> key_uncmp{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+  for (size_t i = 0; i < key_uncmp.size(); ++i) {
+    key_data[i] = "a" + Key(key_uncmp[i]);
+    keys[i] = Slice(key_data[i]);
+    statuses[i] = Status::OK();
+    values[i].Reset();
+  }
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  for (size_t i = 0; i < key_uncmp.size(); ++i) {
+    ASSERT_OK(statuses[i]);
+    ASSERT_TRUE(CheckUncompressableValue(key_uncmp[i], values[i].ToString()));
+  }
+  if (compression_enabled() && !has_compressed_cache()) {
+    expected_reads += (read_from_cache ? 3 : 3);
+  } else {
+    expected_reads += (read_from_cache ? 4 : 4);
+  }
+  ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+  keys.resize(5);
+  statuses.resize(5);
+  std::vector<int> key_tr{1, 2, 15, 16, 55};
+  for (size_t i = 0; i < key_tr.size(); ++i) {
+    key_data[i] = "a" + Key(key_tr[i]);
+    keys[i] = Slice(key_data[i]);
+    statuses[i] = Status::OK();
+    values[i].Reset();
+  }
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  for (size_t i = 0; i < key_tr.size(); ++i) {
+    ASSERT_OK(statuses[i]);
+    ASSERT_TRUE(CheckUncompressableValue(key_tr[i], values[i].ToString()));
+  }
+  if (compression_enabled() && !has_compressed_cache()) {
+    expected_reads += (read_from_cache ? 0 : 2);
+    ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+  } else {
+    if (has_uncompressed_cache()) {
+      expected_reads += (read_from_cache ? 0 : 3);
+      ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+    } else {
+      // A rare case, even we enable the block compression but some of data
+      // blocks are not compressed due to content. If user only enable the
+      // compressed cache, the uncompressed blocks will not tbe cached, and
+      // block reads will be triggered. The number of reads is related to
+      // the compression algorithm.
+      ASSERT_TRUE(env_->random_read_counter_.Read() >= expected_reads);
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) {
+  class FakeDirectIOEnv : public EnvWrapper {
+    class FakeDirectIOSequentialFile;
+    class FakeDirectIORandomAccessFile;
+
+   public:
+    FakeDirectIOEnv(Env* env) : EnvWrapper(env) {}
+    static const char* kClassName() { return "FakeDirectIOEnv"; }
+    const char* Name() const override { return kClassName(); }
+
+    Status NewRandomAccessFile(const std::string& fname,
+                               std::unique_ptr<RandomAccessFile>* result,
+                               const EnvOptions& options) override {
+      std::unique_ptr<RandomAccessFile> file;
+      assert(options.use_direct_reads);
+      EnvOptions opts = options;
+      opts.use_direct_reads = false;
+      Status s = target()->NewRandomAccessFile(fname, &file, opts);
+      if (!s.ok()) {
+        return s;
+      }
+      result->reset(new FakeDirectIORandomAccessFile(std::move(file)));
+      return s;
+    }
+
+   private:
+    class FakeDirectIOSequentialFile : public SequentialFileWrapper {
+     public:
+      FakeDirectIOSequentialFile(std::unique_ptr<SequentialFile>&& file)
+          : SequentialFileWrapper(file.get()), file_(std::move(file)) {}
+      ~FakeDirectIOSequentialFile() {}
+
+      bool use_direct_io() const override { return true; }
+      size_t GetRequiredBufferAlignment() const override { return 1; }
+
+     private:
+      std::unique_ptr<SequentialFile> file_;
+    };
+
+    class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper {
+     public:
+      FakeDirectIORandomAccessFile(std::unique_ptr<RandomAccessFile>&& file)
+          : RandomAccessFileWrapper(file.get()), file_(std::move(file)) {}
+      ~FakeDirectIORandomAccessFile() {}
+
+      bool use_direct_io() const override { return true; }
+      size_t GetRequiredBufferAlignment() const override { return 1; }
+
+     private:
+      std::unique_ptr<RandomAccessFile> file_;
+    };
+  };
+
+  std::unique_ptr<FakeDirectIOEnv> env(new FakeDirectIOEnv(env_));
+  Options opts = get_options();
+  opts.env = env.get();
+  opts.use_direct_reads = true;
+  Reopen(opts);
+
+  std::vector<std::string> key_data(10);
+  std::vector<Slice> keys;
+  // We cannot resize a PinnableSlice vector, so just set initial size to
+  // largest we think we will need
+  std::vector<PinnableSlice> values(10);
+  std::vector<Status> statuses;
+  ReadOptions ro;
+  ro.fill_cache = fill_cache();
+
+  // Warm up the cache first
+  key_data.emplace_back(Key(0));
+  keys.emplace_back(Slice(key_data.back()));
+  key_data.emplace_back(Key(50));
+  keys.emplace_back(Slice(key_data.back()));
+  statuses.resize(keys.size());
+
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+  ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+  int random_reads = env_->random_read_counter_.Read();
+  key_data[0] = Key(1);
+  key_data[1] = Key(51);
+  keys[0] = Slice(key_data[0]);
+  keys[1] = Slice(key_data[1]);
+  values[0].Reset();
+  values[1].Reset();
+  if (uncompressed_cache_) {
+    uncompressed_cache_->SetCapacity(0);
+    uncompressed_cache_->SetCapacity(1048576);
+  }
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+  ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+  bool read_from_cache = false;
+  if (fill_cache()) {
+    if (has_uncompressed_cache()) {
+      read_from_cache = true;
+    } else if (has_compressed_cache() && compression_enabled()) {
+      read_from_cache = true;
+    }
+  }
+
+  int expected_reads = random_reads;
+  if (!compression_enabled() || !has_compressed_cache()) {
+    expected_reads += 2;
+  } else {
+    expected_reads += (read_from_cache ? 0 : 2);
+  }
+  if (env_->random_read_counter_.Read() != expected_reads) {
+    ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+  }
+  Close();
+}
+#endif  // ROCKSDB_LITE
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) {
+  std::vector<std::string> key_data(10);
+  std::vector<Slice> keys;
+  // We cannot resize a PinnableSlice vector, so just set initial size to
+  // largest we think we will need
+  std::vector<PinnableSlice> values(10);
+  std::vector<Status> statuses;
+  int read_count = 0;
+  ReadOptions ro;
+  ro.fill_cache = fill_cache();
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "RetrieveMultipleBlocks:VerifyChecksum", [&](void* status) {
+        Status* s = static_cast<Status*>(status);
+        read_count++;
+        if (read_count == 2) {
+          *s = Status::Corruption();
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Warm up the cache first
+  key_data.emplace_back(Key(0));
+  keys.emplace_back(Slice(key_data.back()));
+  key_data.emplace_back(Key(50));
+  keys.emplace_back(Slice(key_data.back()));
+  statuses.resize(keys.size());
+
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+  // ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::Corruption());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) {
+  std::vector<std::string> key_data(10);
+  std::vector<Slice> keys;
+  // We cannot resize a PinnableSlice vector, so just set initial size to
+  // largest we think we will need
+  std::vector<PinnableSlice> values(10);
+  std::vector<Status> statuses;
+  ReadOptions ro;
+  ro.fill_cache = fill_cache();
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::MultiGet:FindTable", [&](void* status) {
+        Status* s = static_cast<Status*>(status);
+        *s = Status::IOError();
+      });
+  // DB open will create table readers unless we reduce the table cache
+  // capacity.
+  // SanitizeOptions will set max_open_files to minimum of 20. Table cache
+  // is allocated with max_open_files - 10 as capacity. So override
+  // max_open_files to 11 so table cache capacity will become 1. This will
+  // prevent file open during DB open and force the file to be opened
+  // during MultiGet
+  SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = (int*)arg;
+        *max_open_files = 11;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(CurrentOptions());
+
+  // Warm up the cache first
+  key_data.emplace_back(Key(0));
+  keys.emplace_back(Slice(key_data.back()));
+  key_data.emplace_back(Key(50));
+  keys.emplace_back(Slice(key_data.back()));
+  statuses.resize(keys.size());
+
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data(), true);
+  ASSERT_EQ(statuses[0], Status::IOError());
+  ASSERT_EQ(statuses[1], Status::IOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(ParallelIO, DBBasicTestWithParallelIO,
+                        // Params are as follows -
+                        // Param 0 - Compressed cache enabled
+                        // Param 1 - Uncompressed cache enabled
+                        // Param 2 - Data compression enabled
+                        // Param 3 - ReadOptions::fill_cache
+                        // Param 4 - CompressionOptions::parallel_threads
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Values(1, 4)));
+
+// Forward declaration
+class DeadlineFS;
+
+class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+  DeadlineRandomAccessFile(DeadlineFS& fs,
+                           std::unique_ptr<FSRandomAccessFile>& file)
+      : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
+
+  IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override;
+
+ private:
+  DeadlineFS& fs_;
+  std::unique_ptr<FSRandomAccessFile> file_;
+};
+
+class DeadlineFS : public FileSystemWrapper {
+ public:
+  // The error_on_delay parameter specifies whether a IOStatus::TimedOut()
+  // status should be returned after delaying the IO to exceed the timeout,
+  // or to simply delay but return success anyway. The latter mimics the
+  // behavior of PosixFileSystem, which does not enforce any timeout
+  explicit DeadlineFS(SpecialEnv* env, bool error_on_delay)
+      : FileSystemWrapper(env->GetFileSystem()),
+        deadline_(std::chrono::microseconds::zero()),
+        io_timeout_(std::chrono::microseconds::zero()),
+        env_(env),
+        timedout_(false),
+        ignore_deadline_(false),
+        error_on_delay_(error_on_delay) {}
+
+  static const char* kClassName() { return "DeadlineFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    std::unique_ptr<FSRandomAccessFile> file;
+    IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+    EXPECT_OK(s);
+    result->reset(new DeadlineRandomAccessFile(*this, file));
+
+    const std::chrono::microseconds deadline = GetDeadline();
+    const std::chrono::microseconds io_timeout = GetIOTimeout();
+    if (deadline.count() || io_timeout.count()) {
+      AssertDeadline(deadline, io_timeout, opts.io_options);
+    }
+    return ShouldDelay(opts.io_options);
+  }
+
+  // Set a vector of {IO counter, delay in microseconds, return status} tuples
+  // that control when to inject a delay and duration of the delay
+  void SetDelayTrigger(const std::chrono::microseconds deadline,
+                       const std::chrono::microseconds io_timeout,
+                       const int trigger) {
+    delay_trigger_ = trigger;
+    io_count_ = 0;
+    deadline_ = deadline;
+    io_timeout_ = io_timeout;
+    timedout_ = false;
+  }
+
+  // Increment the IO counter and return a delay in microseconds
+  IOStatus ShouldDelay(const IOOptions& opts) {
+    if (timedout_) {
+      return IOStatus::TimedOut();
+    } else if (!deadline_.count() && !io_timeout_.count()) {
+      return IOStatus::OK();
+    }
+    if (!ignore_deadline_ && delay_trigger_ == io_count_++) {
+      env_->SleepForMicroseconds(static_cast<int>(opts.timeout.count() + 1));
+      timedout_ = true;
+      if (error_on_delay_) {
+        return IOStatus::TimedOut();
+      }
+    }
+    return IOStatus::OK();
+  }
+
+  const std::chrono::microseconds GetDeadline() {
+    return ignore_deadline_ ? std::chrono::microseconds::zero() : deadline_;
+  }
+
+  const std::chrono::microseconds GetIOTimeout() {
+    return ignore_deadline_ ? std::chrono::microseconds::zero() : io_timeout_;
+  }
+
+  bool TimedOut() { return timedout_; }
+
+  void IgnoreDeadline(bool ignore) { ignore_deadline_ = ignore; }
+
+  void AssertDeadline(const std::chrono::microseconds deadline,
+                      const std::chrono::microseconds io_timeout,
+                      const IOOptions& opts) const {
+    // Give a leeway of +- 10us as it can take some time for the Get/
+    // MultiGet call to reach here, in order to avoid false alarms
+    std::chrono::microseconds now =
+        std::chrono::microseconds(env_->NowMicros());
+    std::chrono::microseconds timeout;
+    if (deadline.count()) {
+      timeout = deadline - now;
+      if (io_timeout.count()) {
+        timeout = std::min(timeout, io_timeout);
+      }
+    } else {
+      timeout = io_timeout;
+    }
+    if (opts.timeout != timeout) {
+      ASSERT_EQ(timeout, opts.timeout);
+    }
+  }
+
+ private:
+  // The number of IOs to trigger the delay after
+  int delay_trigger_;
+  // Current IO count
+  int io_count_;
+  // ReadOptions deadline for the Get/MultiGet/Iterator
+  std::chrono::microseconds deadline_;
+  // ReadOptions io_timeout for the Get/MultiGet/Iterator
+  std::chrono::microseconds io_timeout_;
+  SpecialEnv* env_;
+  // Flag to indicate whether we injected a delay
+  bool timedout_;
+  // Temporarily ignore deadlines/timeouts
+  bool ignore_deadline_;
+  // Return IOStatus::TimedOut() or IOStatus::OK()
+  bool error_on_delay_;
+};
+
+IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len,
+                                        const IOOptions& opts, Slice* result,
+                                        char* scratch,
+                                        IODebugContext* dbg) const {
+  const std::chrono::microseconds deadline = fs_.GetDeadline();
+  const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+  IOStatus s;
+  if (deadline.count() || io_timeout.count()) {
+    fs_.AssertDeadline(deadline, io_timeout, opts);
+  }
+  if (s.ok()) {
+    s = FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch,
+                                        dbg);
+  }
+  if (s.ok()) {
+    s = fs_.ShouldDelay(opts);
+  }
+  return s;
+}
+
+IOStatus DeadlineRandomAccessFile::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+  const std::chrono::microseconds deadline = fs_.GetDeadline();
+  const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+  IOStatus s;
+  if (deadline.count() || io_timeout.count()) {
+    fs_.AssertDeadline(deadline, io_timeout, opts);
+  }
+  if (s.ok()) {
+    s = FSRandomAccessFileWrapper::ReadAsync(req, opts, cb, cb_arg, io_handle,
+                                             del_fn, dbg);
+  }
+  if (s.ok()) {
+    s = fs_.ShouldDelay(opts);
+  }
+  return s;
+}
+
+IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs,
+                                             size_t num_reqs,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  const std::chrono::microseconds deadline = fs_.GetDeadline();
+  const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+  IOStatus s;
+  if (deadline.count() || io_timeout.count()) {
+    fs_.AssertDeadline(deadline, io_timeout, options);
+  }
+  if (s.ok()) {
+    s = FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg);
+  }
+  if (s.ok()) {
+    s = fs_.ShouldDelay(options);
+  }
+  return s;
+}
+
+// A test class for intercepting random reads and injecting artificial
+// delays. Used for testing the MultiGet deadline feature
+class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet,
+                                    public testing::WithParamInterface<bool> {
+ public:
+  DBBasicTestMultiGetDeadline()
+      : DBBasicTestMultiGet(
+            "db_basic_test_multiget_deadline" /*Test dir*/,
+            10 /*# of column families*/, false /*compressed cache enabled*/,
+            true /*uncompressed cache enabled*/, true /*compression enabled*/,
+            true /*ReadOptions.fill_cache*/,
+            1 /*# of parallel compression threads*/) {}
+
+  inline void CheckStatus(std::vector<Status>& statuses, size_t num_ok) {
+    for (size_t i = 0; i < statuses.size(); ++i) {
+      if (i < num_ok) {
+        EXPECT_OK(statuses[i]);
+      } else {
+        if (statuses[i] != Status::TimedOut()) {
+          EXPECT_EQ(statuses[i], Status::TimedOut());
+        }
+      }
+    }
+  }
+};
+
+TEST_P(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
+#ifndef USE_COROUTINES
+  if (GetParam()) {
+    ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+    return;
+  }
+#endif  // USE_COROUTINES
+  std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  Options options = CurrentOptions();
+
+  std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.env = env.get();
+  SetTimeElapseOnlySleepOnReopen(&options);
+  ReopenWithColumnFamilies(GetCFNames(), options);
+
+  // Test the non-batched version of MultiGet with multiple column
+  // families
+  std::vector<std::string> key_str;
+  size_t i;
+  for (i = 0; i < 5; ++i) {
+    key_str.emplace_back(Key(static_cast<int>(i)));
+  }
+  std::vector<ColumnFamilyHandle*> cfs(key_str.size());
+  ;
+  std::vector<Slice> keys(key_str.size());
+  std::vector<std::string> values(key_str.size());
+  for (i = 0; i < key_str.size(); ++i) {
+    cfs[i] = handles_[i];
+    keys[i] = Slice(key_str[i].data(), key_str[i].size());
+  }
+
+  ReadOptions ro;
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  ro.async_io = GetParam();
+  // Delay the first IO
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
+
+  std::vector<Status> statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
+  // The first key is successful because we check after the lookup, but
+  // subsequent keys fail due to deadline exceeded
+  CheckStatus(statuses, 1);
+
+  // Clear the cache
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  // Test non-batched Multiget with multiple column families and
+  // introducing an IO delay in one of the middle CFs
+  key_str.clear();
+  for (i = 0; i < 10; ++i) {
+    key_str.emplace_back(Key(static_cast<int>(i)));
+  }
+  cfs.resize(key_str.size());
+  keys.resize(key_str.size());
+  values.resize(key_str.size());
+  for (i = 0; i < key_str.size(); ++i) {
+    // 2 keys per CF
+    cfs[i] = handles_[i / 2];
+    keys[i] = Slice(key_str[i].data(), key_str[i].size());
+  }
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
+  statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
+  CheckStatus(statuses, 3);
+
+  // Test batched MultiGet with an IO delay in the first data block read.
+  // Both keys in the first CF should succeed as they're in the same data
+  // block and would form one batch, and we check for deadline between
+  // batches.
+  std::vector<PinnableSlice> pin_values(keys.size());
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  statuses.clear();
+  statuses.resize(keys.size());
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
+  dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+                     pin_values.data(), statuses.data());
+  CheckStatus(statuses, 2);
+
+  // Similar to the previous one, but an IO delay in the third CF data block
+  // read
+  for (PinnableSlice& value : pin_values) {
+    value.Reset();
+  }
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  statuses.clear();
+  statuses.resize(keys.size());
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 2);
+  dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+                     pin_values.data(), statuses.data());
+  CheckStatus(statuses, 6);
+
+  // Similar to the previous one, but an IO delay in the last but one CF
+  for (PinnableSlice& value : pin_values) {
+    value.Reset();
+  }
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  statuses.clear();
+  statuses.resize(keys.size());
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 3);
+  dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+                     pin_values.data(), statuses.data());
+  CheckStatus(statuses, 8);
+
+  // Test batched MultiGet with single CF and lots of keys. Inject delay
+  // into the second batch of keys. As each batch is 32, the first 64 keys,
+  // i.e first two batches, should succeed and the rest should time out
+  for (PinnableSlice& value : pin_values) {
+    value.Reset();
+  }
+  cache->SetCapacity(0);
+  cache->SetCapacity(1048576);
+  key_str.clear();
+  for (i = 0; i < 100; ++i) {
+    key_str.emplace_back(Key(static_cast<int>(i)));
+  }
+  keys.resize(key_str.size());
+  pin_values.clear();
+  pin_values.resize(key_str.size());
+  for (i = 0; i < key_str.size(); ++i) {
+    keys[i] = Slice(key_str[i].data(), key_str[i].size());
+  }
+  statuses.clear();
+  statuses.resize(keys.size());
+  ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+  fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
+  dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(),
+                     pin_values.data(), statuses.data());
+  CheckStatus(statuses, 64);
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DeadlineIO, DBBasicTestMultiGetDeadline,
+                        ::testing::Bool());
+
+TEST_F(DBBasicTest, ManifestWriteFailure) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        auto* s = reinterpret_cast<Status*>(arg);
+        ASSERT_OK(*s);
+        // Manually overwrite return status
+        *s = Status::IOError();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("key", "value"));
+  ASSERT_NOK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+}
+
+TEST_F(DBBasicTest, DestroyDefaultCfHandle) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  for (const auto* h : handles_) {
+    ASSERT_NE(db_->DefaultColumnFamily(), h);
+  }
+
+  // We have two handles to the default column family. The two handles point to
+  // different ColumnFamilyHandle objects.
+  assert(db_->DefaultColumnFamily());
+  ASSERT_EQ(0U, db_->DefaultColumnFamily()->GetID());
+  assert(handles_[0]);
+  ASSERT_EQ(0U, handles_[0]->GetID());
+
+  // You can destroy handles_[...].
+  for (auto* h : handles_) {
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(h));
+  }
+  handles_.clear();
+
+  // But you should not destroy db_->DefaultColumnFamily(), since it's going to
+  // be deleted in `DBImpl::CloseHelper()`. Before that, it may be used
+  // elsewhere internally too.
+  ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+  ASSERT_TRUE(db_->DestroyColumnFamilyHandle(default_cf).IsInvalidArgument());
+}
+
+TEST_F(DBBasicTest, FailOpenIfLoggerCreationFail) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) {
+        auto* s = reinterpret_cast<Status*>(arg);
+        assert(s);
+        *s = Status::IOError("Injected");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+  ASSERT_EQ(nullptr, options.info_log);
+  ASSERT_TRUE(s.IsIOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, VerifyFileChecksums) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "value"));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
+
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  Reopen(options);
+  ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+
+  // Write an L0 with checksum computed.
+  ASSERT_OK(Put("b", "value"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+
+  // Does the right thing but with the wrong name -- using it should lead to an
+  // error.
+  class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c {
+   public:
+    MisnamedFileChecksumGenerator(const FileChecksumGenContext& context)
+        : FileChecksumGenCrc32c(context) {}
+
+    const char* Name() const override { return "sha1"; }
+  };
+
+  class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory {
+   public:
+    std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+        const FileChecksumGenContext& context) override {
+      return std::unique_ptr<FileChecksumGenerator>(
+          new MisnamedFileChecksumGenerator(context));
+    }
+  };
+
+  options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory());
+  Reopen(options);
+  ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
+}
+
+// TODO: re-enable after we provide finer-grained control for WAL tracking to
+// meet the needs of different use cases, durability levels and recovery modes.
+TEST_F(DBBasicTest, DISABLED_ManualWalSync) {
+  Options options = CurrentOptions();
+  options.track_and_verify_wals_in_manifest = true;
+  options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x", "y"));
+  // This does not create a new WAL.
+  ASSERT_OK(db_->SyncWAL());
+  EXPECT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+
+  std::unique_ptr<LogFile> wal;
+  Status s = db_->GetCurrentWalFile(&wal);
+  ASSERT_OK(s);
+  Close();
+
+  EXPECT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
+
+  ASSERT_TRUE(TryReopen(options).IsCorruption());
+}
+#endif  // !ROCKSDB_LITE
+
+// A test class for intercepting random reads and injecting artificial
+// delays. Used for testing the deadline/timeout feature
+class DBBasicTestDeadline
+    : public DBBasicTest,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {};
+
+TEST_P(DBBasicTestDeadline, PointLookupDeadline) {
+  std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  bool set_deadline = std::get<0>(GetParam());
+  bool set_timeout = std::get<1>(GetParam());
+
+  for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+    if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
+      continue;
+    }
+    option_config_ = option_config;
+    Options options = CurrentOptions();
+    if (options.use_direct_reads) {
+      continue;
+    }
+    options.env = env.get();
+    options.disable_auto_compactions = true;
+    Cache* block_cache = nullptr;
+    // Fileter block reads currently don't cause the request to get
+    // aborted on a read timeout, so its possible those block reads
+    // may get issued even if the deadline is past
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTable::Get:BeforeFilterMatch",
+        [&](void* /*arg*/) { fs->IgnoreDeadline(true); });
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTable::Get:AfterFilterMatch",
+        [&](void* /*arg*/) { fs->IgnoreDeadline(false); });
+    // DB open will create table readers unless we reduce the table cache
+    // capacity.
+    // SanitizeOptions will set max_open_files to minimum of 20. Table cache
+    // is allocated with max_open_files - 10 as capacity. So override
+    // max_open_files to 11 so table cache capacity will become 1. This will
+    // prevent file open during DB open and force the file to be opened
+    // during MultiGet
+    SyncPoint::GetInstance()->SetCallBack(
+        "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+          int* max_open_files = (int*)arg;
+          *max_open_files = 11;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    SetTimeElapseOnlySleepOnReopen(&options);
+    Reopen(options);
+
+    if (options.table_factory) {
+      block_cache = options.table_factory->GetOptions<Cache>(
+          TableFactory::kBlockCacheOpts());
+    }
+
+    Random rnd(301);
+    for (int i = 0; i < 400; ++i) {
+      std::string key = "k" + std::to_string(i);
+      ASSERT_OK(Put(key, rnd.RandomString(100)));
+    }
+    ASSERT_OK(Flush());
+
+    bool timedout = true;
+    // A timeout will be forced when the IO counter reaches this value
+    int io_deadline_trigger = 0;
+    // Keep incrementing io_deadline_trigger and call Get() until there is an
+    // iteration that doesn't cause a timeout. This ensures that we cover
+    // all file reads in the point lookup path that can potentially timeout
+    // and cause the Get() to fail.
+    while (timedout) {
+      ReadOptions ro;
+      if (set_deadline) {
+        ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+      }
+      if (set_timeout) {
+        ro.io_timeout = std::chrono::microseconds{5000};
+      }
+      fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
+
+      block_cache->SetCapacity(0);
+      block_cache->SetCapacity(1048576);
+
+      std::string value;
+      Status s = dbfull()->Get(ro, "k50", &value);
+      if (fs->TimedOut()) {
+        ASSERT_EQ(s, Status::TimedOut());
+      } else {
+        timedout = false;
+        ASSERT_OK(s);
+      }
+      io_deadline_trigger++;
+    }
+    // Reset the delay sequence in order to avoid false alarms during Reopen
+    fs->SetDelayTrigger(std::chrono::microseconds::zero(),
+                        std::chrono::microseconds::zero(), 0);
+  }
+  Close();
+}
+
+TEST_P(DBBasicTestDeadline, IteratorDeadline) {
+  std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  bool set_deadline = std::get<0>(GetParam());
+  bool set_timeout = std::get<1>(GetParam());
+
+  for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+    if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
+      continue;
+    }
+    Options options = CurrentOptions();
+    if (options.use_direct_reads) {
+      continue;
+    }
+    options.env = env.get();
+    options.disable_auto_compactions = true;
+    Cache* block_cache = nullptr;
+    // DB open will create table readers unless we reduce the table cache
+    // capacity.
+    // SanitizeOptions will set max_open_files to minimum of 20. Table cache
+    // is allocated with max_open_files - 10 as capacity. So override
+    // max_open_files to 11 so table cache capacity will become 1. This will
+    // prevent file open during DB open and force the file to be opened
+    // during MultiGet
+    SyncPoint::GetInstance()->SetCallBack(
+        "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+          int* max_open_files = (int*)arg;
+          *max_open_files = 11;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    SetTimeElapseOnlySleepOnReopen(&options);
+    Reopen(options);
+
+    if (options.table_factory) {
+      block_cache = options.table_factory->GetOptions<Cache>(
+          TableFactory::kBlockCacheOpts());
+    }
+
+    Random rnd(301);
+    for (int i = 0; i < 400; ++i) {
+      std::string key = "k" + std::to_string(i);
+      ASSERT_OK(Put(key, rnd.RandomString(100)));
+    }
+    ASSERT_OK(Flush());
+
+    bool timedout = true;
+    // A timeout will be forced when the IO counter reaches this value
+    int io_deadline_trigger = 0;
+    // Keep incrementing io_deadline_trigger and call Get() until there is an
+    // iteration that doesn't cause a timeout. This ensures that we cover
+    // all file reads in the point lookup path that can potentially timeout
+    while (timedout) {
+      ReadOptions ro;
+      if (set_deadline) {
+        ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+      }
+      if (set_timeout) {
+        ro.io_timeout = std::chrono::microseconds{5000};
+      }
+      fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
+
+      block_cache->SetCapacity(0);
+      block_cache->SetCapacity(1048576);
+
+      Iterator* iter = dbfull()->NewIterator(ro);
+      int count = 0;
+      iter->Seek("k50");
+      while (iter->Valid() && count++ < 100) {
+        iter->Next();
+      }
+      if (fs->TimedOut()) {
+        ASSERT_FALSE(iter->Valid());
+        ASSERT_EQ(iter->status(), Status::TimedOut());
+      } else {
+        timedout = false;
+        ASSERT_OK(iter->status());
+      }
+      delete iter;
+      io_deadline_trigger++;
+    }
+    // Reset the delay sequence in order to avoid false alarms during Reopen
+    fs->SetDelayTrigger(std::chrono::microseconds::zero(),
+                        std::chrono::microseconds::zero(), 0);
+  }
+  Close();
+}
+
+// Param 0: If true, set read_options.deadline
+// Param 1: If true, set read_options.io_timeout
+INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline,
+                        ::testing::Values(std::make_tuple(true, false),
+                                          std::make_tuple(false, true),
+                                          std::make_tuple(true, true)));
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_block_cache_test.cc b/src/rocksdb/db/db_block_cache_test.cc
new file mode 100644
index 000000000..db80b82cb
--- /dev/null
+++ b/src/rocksdb/db/db_block_cache_test.cc
@@ -0,0 +1,2313 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cstdlib>
+#include <functional>
+#include <memory>
+#include <unordered_set>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
+#include "cache/lru_cache.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/unique_id_gen.h"
+#include "port/stack_trace.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/unique_id_impl.h"
+#include "util/compression.h"
+#include "util/defer.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/random.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlockCacheTest : public DBTestBase {
+ private:
+  size_t miss_count_ = 0;
+  size_t hit_count_ = 0;
+  size_t insert_count_ = 0;
+  size_t failure_count_ = 0;
+  size_t compression_dict_miss_count_ = 0;
+  size_t compression_dict_hit_count_ = 0;
+  size_t compression_dict_insert_count_ = 0;
+  size_t compressed_miss_count_ = 0;
+  size_t compressed_hit_count_ = 0;
+  size_t compressed_insert_count_ = 0;
+  size_t compressed_failure_count_ = 0;
+
+ public:
+  const size_t kNumBlocks = 10;
+  const size_t kValueSize = 100;
+
+  DBBlockCacheTest()
+      : DBTestBase("db_block_cache_test", /*env_do_fsync=*/true) {}
+
+  BlockBasedTableOptions GetTableOptions() {
+    BlockBasedTableOptions table_options;
+    // Set a small enough block size so that each key-value get its own block.
+    table_options.block_size = 1;
+    return table_options;
+  }
+
+  Options GetOptions(const BlockBasedTableOptions& table_options) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.avoid_flush_during_recovery = false;
+    // options.compression = kNoCompression;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    return options;
+  }
+
+  void InitTable(const Options& /*options*/) {
+    std::string value(kValueSize, 'a');
+    for (size_t i = 0; i < kNumBlocks; i++) {
+      ASSERT_OK(Put(std::to_string(i), value.c_str()));
+    }
+  }
+
+  void RecordCacheCounters(const Options& options) {
+    miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+    hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+    insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+    compressed_miss_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+    compressed_hit_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+    compressed_insert_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+    compressed_failure_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+  }
+
+  void RecordCacheCountersForCompressionDict(const Options& options) {
+    compression_dict_miss_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+    compression_dict_hit_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+    compression_dict_insert_count_ =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+  }
+
+  void CheckCacheCounters(const Options& options, size_t expected_misses,
+                          size_t expected_hits, size_t expected_inserts,
+                          size_t expected_failures) {
+    size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+    size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+    size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    size_t new_failure_count =
+        TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+    ASSERT_EQ(miss_count_ + expected_misses, new_miss_count);
+    ASSERT_EQ(hit_count_ + expected_hits, new_hit_count);
+    ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count);
+    ASSERT_EQ(failure_count_ + expected_failures, new_failure_count);
+    miss_count_ = new_miss_count;
+    hit_count_ = new_hit_count;
+    insert_count_ = new_insert_count;
+    failure_count_ = new_failure_count;
+  }
+
+  void CheckCacheCountersForCompressionDict(
+      const Options& options, size_t expected_compression_dict_misses,
+      size_t expected_compression_dict_hits,
+      size_t expected_compression_dict_inserts) {
+    size_t new_compression_dict_miss_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+    size_t new_compression_dict_hit_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+    size_t new_compression_dict_insert_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+    ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses,
+              new_compression_dict_miss_count);
+    ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits,
+              new_compression_dict_hit_count);
+    ASSERT_EQ(
+        compression_dict_insert_count_ + expected_compression_dict_inserts,
+        new_compression_dict_insert_count);
+    compression_dict_miss_count_ = new_compression_dict_miss_count;
+    compression_dict_hit_count_ = new_compression_dict_hit_count;
+    compression_dict_insert_count_ = new_compression_dict_insert_count;
+  }
+
+  void CheckCompressedCacheCounters(const Options& options,
+                                    size_t expected_misses,
+                                    size_t expected_hits,
+                                    size_t expected_inserts,
+                                    size_t expected_failures) {
+    size_t new_miss_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+    size_t new_hit_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+    size_t new_insert_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+    size_t new_failure_count =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+    ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count);
+    ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count);
+    ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count);
+    ASSERT_EQ(compressed_failure_count_ + expected_failures, new_failure_count);
+    compressed_miss_count_ = new_miss_count;
+    compressed_hit_count_ = new_hit_count;
+    compressed_insert_count_ = new_insert_count;
+    compressed_failure_count_ = new_failure_count;
+  }
+
+#ifndef ROCKSDB_LITE
+  const std::array<size_t, kNumCacheEntryRoles> GetCacheEntryRoleCountsBg() {
+    // Verify in cache entry role stats
+    std::array<size_t, kNumCacheEntryRoles> cache_entry_role_counts;
+    std::map<std::string, std::string> values;
+    EXPECT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                    &values));
+    for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+      auto role = static_cast<CacheEntryRole>(i);
+      cache_entry_role_counts[i] =
+          ParseSizeT(values[BlockCacheEntryStatsMapKeys::EntryCount(role)]);
+    }
+    return cache_entry_role_counts;
+  }
+#endif  // ROCKSDB_LITE
+};
+
+TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) {
+  ReadOptions read_options;
+  read_options.fill_cache = false;
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  InitTable(options);
+
+  LRUCacheOptions co;
+  co.capacity = 0;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  // Needed not to count entry stats collector
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  RecordCacheCounters(options);
+
+  std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+  Iterator* iter = nullptr;
+
+  ASSERT_EQ(0, cache->GetUsage());
+  iter = db_->NewIterator(read_options);
+  iter->Seek(std::to_string(0));
+  ASSERT_LT(0, cache->GetUsage());
+  delete iter;
+  iter = nullptr;
+  ASSERT_EQ(0, cache->GetUsage());
+}
+
+TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) {
+  ReadOptions read_options;
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  InitTable(options);
+
+  LRUCacheOptions co;
+  co.capacity = 0;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  // Needed not to count entry stats collector
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  RecordCacheCounters(options);
+
+  std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+  Iterator* iter = nullptr;
+
+  // Load blocks into cache.
+  for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+    iter = db_->NewIterator(read_options);
+    iter->Seek(std::to_string(i));
+    ASSERT_OK(iter->status());
+    CheckCacheCounters(options, 1, 0, 1, 0);
+    iterators[i].reset(iter);
+  }
+  size_t usage = cache->GetUsage();
+  ASSERT_LT(0, usage);
+  cache->SetCapacity(usage);
+  ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+  // Test with strict capacity limit.
+  cache->SetStrictCapacityLimit(true);
+  iter = db_->NewIterator(read_options);
+  iter->Seek(std::to_string(kNumBlocks - 1));
+  ASSERT_TRUE(iter->status().IsMemoryLimit());
+  CheckCacheCounters(options, 1, 0, 0, 1);
+  delete iter;
+  iter = nullptr;
+
+  // Release iterators and access cache again.
+  for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+    iterators[i].reset();
+    CheckCacheCounters(options, 0, 0, 0, 0);
+  }
+  ASSERT_EQ(0, cache->GetPinnedUsage());
+  for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+    iter = db_->NewIterator(read_options);
+    iter->Seek(std::to_string(i));
+    ASSERT_OK(iter->status());
+    CheckCacheCounters(options, 0, 1, 0, 0);
+    iterators[i].reset(iter);
+  }
+}
+
+#ifdef SNAPPY
+TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.block_cache_compressed = nullptr;
+  table_options.block_size = 1;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  table_options.cache_index_and_filter_blocks = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.compression = CompressionType::kSnappyCompression;
+
+  DestroyAndReopen(options);
+
+  std::string value(kValueSize, 'a');
+  for (size_t i = 0; i < kNumBlocks; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+  }
+
+  ReadOptions read_options;
+  std::shared_ptr<Cache> compressed_cache = NewLRUCache(1 << 25, 0, false);
+  LRUCacheOptions co;
+  co.capacity = 0;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  // Needed not to count entry stats collector
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  table_options.block_cache = cache;
+  table_options.no_block_cache = false;
+  table_options.block_cache_compressed = compressed_cache;
+  table_options.max_auto_readahead_size = 0;
+  table_options.cache_index_and_filter_blocks = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  RecordCacheCounters(options);
+
+  // Load blocks into cache.
+  for (size_t i = 0; i < kNumBlocks - 1; i++) {
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    CheckCacheCounters(options, 1, 0, 1, 0);
+    CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+  }
+
+  size_t usage = cache->GetUsage();
+  ASSERT_EQ(0, usage);
+  ASSERT_EQ(usage, cache->GetPinnedUsage());
+  size_t compressed_usage = compressed_cache->GetUsage();
+  ASSERT_LT(0, compressed_usage);
+  // Compressed block cache cannot be pinned.
+  ASSERT_EQ(0, compressed_cache->GetPinnedUsage());
+
+  // Set strict capacity limit flag. Now block will only load into compressed
+  // block cache.
+  cache->SetCapacity(usage);
+  cache->SetStrictCapacityLimit(true);
+  ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+  // Load last key block.
+  ASSERT_EQ(
+      "Operation aborted: Memory limit reached: Insert failed due to LRU cache "
+      "being full.",
+      Get(std::to_string(kNumBlocks - 1)));
+  // Failure will also record the miss counter.
+  CheckCacheCounters(options, 1, 0, 0, 1);
+  CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+
+  // Clear strict capacity limit flag. This time we shall hit compressed block
+  // cache and load into block cache.
+  cache->SetStrictCapacityLimit(false);
+  // Load last key block.
+  ASSERT_EQ(value, Get(std::to_string(kNumBlocks - 1)));
+  CheckCacheCounters(options, 1, 0, 1, 0);
+  CheckCompressedCacheCounters(options, 0, 1, 0, 0);
+}
+
+namespace {
+class PersistentCacheFromCache : public PersistentCache {
+ public:
+  PersistentCacheFromCache(std::shared_ptr<Cache> cache, bool read_only)
+      : cache_(cache), read_only_(read_only) {}
+
+  Status Insert(const Slice& key, const char* data,
+                const size_t size) override {
+    if (read_only_) {
+      return Status::NotSupported();
+    }
+    std::unique_ptr<char[]> copy{new char[size]};
+    std::copy_n(data, size, copy.get());
+    Status s = cache_->Insert(
+        key, copy.get(), size,
+        GetCacheEntryDeleterForRole<char[], CacheEntryRole::kMisc>());
+    if (s.ok()) {
+      copy.release();
+    }
+    return s;
+  }
+
+  Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+                size_t* size) override {
+    auto handle = cache_->Lookup(key);
+    if (handle) {
+      char* ptr = static_cast<char*>(cache_->Value(handle));
+      *size = cache_->GetCharge(handle);
+      data->reset(new char[*size]);
+      std::copy_n(ptr, *size, data->get());
+      cache_->Release(handle);
+      return Status::OK();
+    } else {
+      return Status::NotFound();
+    }
+  }
+
+  bool IsCompressed() override { return false; }
+
+  StatsType Stats() override { return StatsType(); }
+
+  std::string GetPrintableOptions() const override { return ""; }
+
+  uint64_t NewId() override { return cache_->NewId(); }
+
+ private:
+  std::shared_ptr<Cache> cache_;
+  bool read_only_;
+};
+
+class ReadOnlyCacheWrapper : public CacheWrapper {
+  using CacheWrapper::CacheWrapper;
+
+  using Cache::Insert;
+  Status Insert(const Slice& /*key*/, void* /*value*/, size_t /*charge*/,
+                void (*)(const Slice& key, void* value) /*deleter*/,
+                Handle** /*handle*/, Priority /*priority*/) override {
+    return Status::NotSupported();
+  }
+};
+
+}  // anonymous namespace
+
+TEST_F(DBBlockCacheTest, TestWithSameCompressed) {
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  InitTable(options);
+
+  std::shared_ptr<Cache> rw_cache{NewLRUCache(1000000)};
+  std::shared_ptr<PersistentCacheFromCache> rw_pcache{
+      new PersistentCacheFromCache(rw_cache, /*read_only*/ false)};
+  // Exercise some obscure behavior with read-only wrappers
+  std::shared_ptr<Cache> ro_cache{new ReadOnlyCacheWrapper(rw_cache)};
+  std::shared_ptr<PersistentCacheFromCache> ro_pcache{
+      new PersistentCacheFromCache(rw_cache, /*read_only*/ true)};
+
+  // Simple same pointer
+  table_options.block_cache = rw_cache;
+  table_options.block_cache_compressed = rw_cache;
+  table_options.persistent_cache.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_EQ(TryReopen(options).ToString(),
+            "Invalid argument: block_cache same as block_cache_compressed not "
+            "currently supported, and would be bad for performance anyway");
+
+  // Other cases
+  table_options.block_cache = ro_cache;
+  table_options.block_cache_compressed = rw_cache;
+  table_options.persistent_cache.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_EQ(TryReopen(options).ToString(),
+            "Invalid argument: block_cache and block_cache_compressed share "
+            "the same key space, which is not supported");
+
+  table_options.block_cache = rw_cache;
+  table_options.block_cache_compressed = ro_cache;
+  table_options.persistent_cache.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_EQ(TryReopen(options).ToString(),
+            "Invalid argument: block_cache_compressed and block_cache share "
+            "the same key space, which is not supported");
+
+  table_options.block_cache = ro_cache;
+  table_options.block_cache_compressed.reset();
+  table_options.persistent_cache = rw_pcache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_EQ(TryReopen(options).ToString(),
+            "Invalid argument: block_cache and persistent_cache share the same "
+            "key space, which is not supported");
+
+  table_options.block_cache = rw_cache;
+  table_options.block_cache_compressed.reset();
+  table_options.persistent_cache = ro_pcache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_EQ(TryReopen(options).ToString(),
+            "Invalid argument: persistent_cache and block_cache share the same "
+            "key space, which is not supported");
+
+  table_options.block_cache.reset();
+  table_options.no_block_cache = true;
+  table_options.block_cache_compressed = ro_cache;
+  table_options.persistent_cache = rw_pcache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_EQ(TryReopen(options).ToString(),
+            "Invalid argument: block_cache_compressed and persistent_cache "
+            "share the same key space, which is not supported");
+
+  table_options.block_cache.reset();
+  table_options.no_block_cache = true;
+  table_options.block_cache_compressed = rw_cache;
+  table_options.persistent_cache = ro_pcache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_EQ(TryReopen(options).ToString(),
+            "Invalid argument: persistent_cache and block_cache_compressed "
+            "share the same key space, which is not supported");
+}
+#endif  // SNAPPY
+
+#ifndef ROCKSDB_LITE
+
+// Make sure that when options.block_cache is set, after a new table is
+// created its index/filter blocks are added to block cache.
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+
+  // index/filter blocks added to block cache right after table creation.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, /* only index/filter were added */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+  uint64_t int_num;
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
+  // Make sure filter block is in cache.
+  std::string value;
+  ReadOptions ropt;
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+
+  // Miss count should remain the same.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  // Make sure index block is in cache.
+  auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(index_block_hit + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(index_block_hit + 2,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+// With fill_cache = false, fills up the cache, then iterates over the entire
+// db, verify dummy entries inserted in `BlockBasedTable::NewDataBlockIterator`
+// does not cause heap-use-after-free errors in COMPILE_WITH_ASAN=1 runs
+TEST_F(DBBlockCacheTest, FillCacheAndIterateDB) {
+  ReadOptions read_options;
+  read_options.fill_cache = false;
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  InitTable(options);
+
+  std::shared_ptr<Cache> cache = NewLRUCache(10, 0, true);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_OK(Put("key1", "val1"));
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key3", "val3"));
+  ASSERT_OK(Put("key4", "val4"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key5", "val5"));
+  ASSERT_OK(Put("key6", "val6"));
+  ASSERT_OK(Flush());
+
+  Iterator* iter = nullptr;
+
+  iter = db_->NewIterator(read_options);
+  iter->Seek(std::to_string(0));
+  while (iter->Valid()) {
+    iter->Next();
+  }
+  delete iter;
+  iter = nullptr;
+}
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  LRUCacheOptions co;
+  // 500 bytes are enough to hold the first two blocks
+  co.capacity = 500;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  table_options.block_cache = cache;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "longer_key", "val"));
+  // Create a new table
+  ASSERT_OK(Flush(1));
+  size_t index_bytes_insert =
+      TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT);
+  size_t filter_bytes_insert =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT);
+  ASSERT_GT(index_bytes_insert, 0);
+  ASSERT_GT(filter_bytes_insert, 0);
+  ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert);
+  // set the cache capacity to the current usage
+  cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
+  // The index and filter eviction statistics were broken by the refactoring
+  // that moved the readers out of the block cache. Disabling these until we can
+  // bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
+  // Note that the second key needs to be no longer than the first one.
+  // Otherwise the second index block may not fit in cache.
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table
+  ASSERT_OK(Flush(1));
+  // cache evicted old index and block entries
+  ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT),
+            index_bytes_insert);
+  ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT),
+            filter_bytes_insert);
+  // The index and filter eviction statistics were broken by the refactoring
+  // that moved the readers out of the block cache. Disabling these until we can
+  // bring the stats back.
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
+  //           index_bytes_insert);
+  // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
+  //           filter_bytes_insert);
+}
+
+#if (defined OS_LINUX || defined OS_WIN)
+TEST_F(DBBlockCacheTest, WarmCacheWithDataBlocksDuringFlush) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.prepopulate_block_cache =
+      BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  std::string value(kValueSize, 'a');
+  for (size_t i = 1; i <= kNumBlocks; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
+  }
+  // Verify compaction not counted
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                              /*end=*/nullptr));
+  EXPECT_EQ(kNumBlocks,
+            options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+}
+
+// This test cache data, index and filter blocks during flush.
+class DBBlockCacheTest1 : public DBTestBase,
+                          public ::testing::WithParamInterface<uint32_t> {
+ public:
+  const size_t kNumBlocks = 10;
+  const size_t kValueSize = 100;
+  DBBlockCacheTest1() : DBTestBase("db_block_cache_test1", true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBBlockCacheTest1, DBBlockCacheTest1,
+                        ::testing::Values(1, 2));
+
+TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+
+  uint32_t filter_type = GetParam();
+  switch (filter_type) {
+    case 1:  // partition_filter
+      table_options.partition_filters = true;
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+      break;
+    case 2:  // full filter
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+      break;
+    default:
+      assert(false);
+  }
+
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.prepopulate_block_cache =
+      BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  std::string value(kValueSize, 'a');
+  for (size_t i = 1; i <= kNumBlocks; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+    if (filter_type == 1) {
+      ASSERT_EQ(2 * i,
+                options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+      ASSERT_EQ(2 * i,
+                options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+    } else {
+      ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+      ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+    }
+    ASSERT_EQ(value, Get(std::to_string(i)));
+
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(i * 3, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT));
+    if (filter_type == 1) {
+      ASSERT_EQ(i * 3,
+                options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+    } else {
+      ASSERT_EQ(i * 2,
+                options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+    }
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+  }
+
+  // Verify compaction not counted
+  CompactRangeOptions cro;
+  // Ensure files are rewritten, not just trivially moved.
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+  EXPECT_EQ(kNumBlocks,
+            options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+  // Index and filter blocks are automatically warmed when the new table file
+  // is automatically opened at the end of compaction. This is not easily
+  // disabled so results in the new index and filter blocks being warmed.
+  if (filter_type == 1) {
+    EXPECT_EQ(2 * (1 + kNumBlocks),
+              options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+    EXPECT_EQ(2 * (1 + kNumBlocks),
+              options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+  } else {
+    EXPECT_EQ(1 + kNumBlocks,
+              options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+    EXPECT_EQ(1 + kNumBlocks,
+              options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+  }
+}
+
+TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.prepopulate_block_cache =
+      BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  std::string value(kValueSize, 'a');
+
+  for (size_t i = 1; i <= 5; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(1,
+              options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    ASSERT_EQ(0,
+              options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+    ASSERT_EQ(
+        0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_EQ(1,
+              options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
+  }
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}}));
+
+  for (size_t i = 6; i <= kNumBlocks; i++) {
+    ASSERT_OK(Put(std::to_string(i), value));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(0,
+              options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+
+    ASSERT_EQ(value, Get(std::to_string(i)));
+    ASSERT_EQ(1,
+              options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+    ASSERT_EQ(
+        1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_EQ(0,
+              options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
+  }
+}
+#endif
+
+namespace {
+
+// A mock cache wraps LRUCache, and record how many entries have been
+// inserted for each priority.
+class MockCache : public LRUCache {
+ public:
+  static uint32_t high_pri_insert_count;
+  static uint32_t low_pri_insert_count;
+
+  MockCache()
+      : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/,
+                 false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/,
+                 0.0 /*low_pri_pool_ratio*/) {}
+
+  using ShardedCache::Insert;
+
+  Status Insert(const Slice& key, void* value,
+                const Cache::CacheItemHelper* helper_cb, size_t charge,
+                Handle** handle, Priority priority) override {
+    DeleterFn delete_cb = helper_cb->del_cb;
+    if (priority == Priority::LOW) {
+      low_pri_insert_count++;
+    } else {
+      high_pri_insert_count++;
+    }
+    return LRUCache::Insert(key, value, charge, delete_cb, handle, priority);
+  }
+};
+
+uint32_t MockCache::high_pri_insert_count = 0;
+uint32_t MockCache::low_pri_insert_count = 0;
+
+}  // anonymous namespace
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) {
+  for (auto priority : {Cache::Priority::LOW, Cache::Priority::HIGH}) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.block_cache.reset(new MockCache());
+    table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+    table_options.cache_index_and_filter_blocks_with_high_priority =
+        priority == Cache::Priority::HIGH ? true : false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    MockCache::high_pri_insert_count = 0;
+    MockCache::low_pri_insert_count = 0;
+
+    // Create a new table.
+    ASSERT_OK(Put("foo", "value"));
+    ASSERT_OK(Put("bar", "value"));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+    // index/filter blocks added to block cache right after table creation.
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(2, /* only index/filter were added */
+              TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+    if (priority == Cache::Priority::LOW) {
+      ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+      ASSERT_EQ(2u, MockCache::low_pri_insert_count);
+    } else {
+      ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+      ASSERT_EQ(0u, MockCache::low_pri_insert_count);
+    }
+
+    // Access data block.
+    ASSERT_EQ("value", Get("foo"));
+
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(3, /*adding data block*/
+              TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+    // Data block should be inserted with low priority.
+    if (priority == Cache::Priority::LOW) {
+      ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+      ASSERT_EQ(3u, MockCache::low_pri_insert_count);
+    } else {
+      ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+      ASSERT_EQ(1u, MockCache::low_pri_insert_count);
+    }
+  }
+}
+
+namespace {
+
+// An LRUCache wrapper that can falsely report "not found" on Lookup.
+// This allows us to manipulate BlockBasedTableReader into thinking
+// another thread inserted the data in between Lookup and Insert,
+// while mostly preserving the LRUCache interface/behavior.
+class LookupLiarCache : public CacheWrapper {
+  int nth_lookup_not_found_ = 0;
+
+ public:
+  explicit LookupLiarCache(std::shared_ptr<Cache> target)
+      : CacheWrapper(std::move(target)) {}
+
+  using Cache::Lookup;
+  Handle* Lookup(const Slice& key, Statistics* stats) override {
+    if (nth_lookup_not_found_ == 1) {
+      nth_lookup_not_found_ = 0;
+      return nullptr;
+    }
+    if (nth_lookup_not_found_ > 1) {
+      --nth_lookup_not_found_;
+    }
+    return CacheWrapper::Lookup(key, stats);
+  }
+
+  // 1 == next lookup, 2 == after next, etc.
+  void SetNthLookupNotFound(int n) { nth_lookup_not_found_ = n; }
+};
+
+}  // anonymous namespace
+
+TEST_F(DBBlockCacheTest, AddRedundantStats) {
+  const size_t capacity = size_t{1} << 25;
+  const int num_shard_bits = 0;  // 1 shard
+  int iterations_tested = 0;
+  for (std::shared_ptr<Cache> base_cache :
+       {NewLRUCache(capacity, num_shard_bits),
+        HyperClockCacheOptions(
+            capacity,
+            BlockBasedTableOptions().block_size /*estimated_value_size*/,
+            num_shard_bits)
+            .MakeSharedCache()}) {
+    if (!base_cache) {
+      // Skip clock cache when not supported
+      continue;
+    }
+    ++iterations_tested;
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    std::shared_ptr<LookupLiarCache> cache =
+        std::make_shared<LookupLiarCache>(base_cache);
+
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.block_cache = cache;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(50));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    // Create a new table.
+    ASSERT_OK(Put("foo", "value"));
+    ASSERT_OK(Put("bar", "value"));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+    // Normal access filter+index+data.
+    ASSERT_EQ("value", Get("foo"));
+
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+    // --------
+    ASSERT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+    // --------
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+    // Againt access filter+index+data, but force redundant load+insert on index
+    cache->SetNthLookupNotFound(2);
+    ASSERT_EQ("value", Get("bar"));
+
+    ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+    // --------
+    ASSERT_EQ(4, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+    ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+    // --------
+    ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+    // Access just filter (with high probability), and force redundant
+    // load+insert
+    cache->SetNthLookupNotFound(1);
+    ASSERT_EQ("NOT_FOUND", Get("this key was not added"));
+
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+    // --------
+    EXPECT_EQ(5, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+    EXPECT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+    // --------
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+    // Access just data, forcing redundant load+insert
+    ReadOptions read_options;
+    std::unique_ptr<Iterator> iter{db_->NewIterator(read_options)};
+    cache->SetNthLookupNotFound(1);
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), "bar");
+
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+    EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+    // --------
+    EXPECT_EQ(6, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+    EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+    // --------
+    EXPECT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+  }
+  EXPECT_GE(iterations_tested, 1);
+}
+
+TEST_F(DBBlockCacheTest, ParanoidFileChecks) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.level0_file_num_compaction_trigger = 2;
+  options.paranoid_file_checks = true;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "1_key", "val"));
+  ASSERT_OK(Put(1, "9_key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(1, /* read and cache data block */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  ASSERT_OK(Put(1, "1_key2", "val2"));
+  ASSERT_OK(Put(1, "9_key2", "val2"));
+  // Create a new SST file. This will further trigger a compaction
+  // and generate another file.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(3, /* Totally 3 files created up to now */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  // After disabling options.paranoid_file_checks. NO further block
+  // is added after generating a new file.
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+
+  ASSERT_OK(Put(1, "1_key3", "val3"));
+  ASSERT_OK(Put(1, "9_key3", "val3"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "1_key4", "val4"));
+  ASSERT_OK(Put(1, "9_key4", "val4"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(3, /* Totally 3 files created up to now */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
+
+TEST_F(DBBlockCacheTest, CompressedCache) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  int num_iter = 80;
+
+  // Run this test three iterations.
+  // Iteration 1: only a uncompressed block cache
+  // Iteration 2: only a compressed block cache
+  // Iteration 3: both block cache and compressed cache
+  // Iteration 4: both block cache and compressed cache, but DB is not
+  // compressed
+  for (int iter = 0; iter < 4; iter++) {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 64 * 1024;  // small write buffer
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    BlockBasedTableOptions table_options;
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        table_options.block_cache = NewLRUCache(8 * 1024);
+        table_options.block_cache_compressed = nullptr;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        table_options.no_block_cache = true;
+        table_options.block_cache = nullptr;
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        table_options.block_cache = NewLRUCache(1024);
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 3:
+        // both block cache and compressed cache, but DB is not compressed
+        // also, make block cache sizes bigger, to trigger block cache hits
+        table_options.block_cache = NewLRUCache(1024 * 1024);
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.compression = kNoCompression;
+        break;
+      default:
+        FAIL();
+    }
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // default column family doesn't have block cache
+    Options no_block_cache_opts;
+    no_block_cache_opts.statistics = options.statistics;
+    no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+    BlockBasedTableOptions table_options_no_bc;
+    table_options_no_bc.no_block_cache = true;
+    no_block_cache_opts.table_factory.reset(
+        NewBlockBasedTableFactory(table_options_no_bc));
+    ReopenWithColumnFamilies(
+        {"default", "pikachu"},
+        std::vector<Options>({no_block_cache_opts, options}));
+
+    Random rnd(301);
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    std::string str;
+    for (int i = 0; i < num_iter; i++) {
+      if (i % 4 == 0) {  // high compression ratio
+        str = rnd.RandomString(1000);
+      }
+      values.push_back(str);
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
+
+    // flush all data from memtable so that reads are from block cache
+    ASSERT_OK(Flush(1));
+
+    for (int i = 0; i < num_iter; i++) {
+      ASSERT_EQ(Get(1, Key(i)), values[i]);
+    }
+
+    // check that we triggered the appropriate code paths in the cache
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 3:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        // compressed doesn't have any hits since blocks are not compressed on
+        // storage
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
+        break;
+      default:
+        FAIL();
+    }
+
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+  }
+}
+
+TEST_F(DBBlockCacheTest, CacheCompressionDict) {
+  const int kNumFiles = 4;
+  const int kNumEntriesPerFile = 128;
+  const int kNumBytesPerEntry = 1024;
+
+  // Try all the available libraries that support dictionary compression
+  std::vector<CompressionType> compression_types;
+  if (Zlib_Supported()) {
+    compression_types.push_back(kZlibCompression);
+  }
+  if (LZ4_Supported()) {
+    compression_types.push_back(kLZ4Compression);
+    compression_types.push_back(kLZ4HCCompression);
+  }
+  if (ZSTD_Supported()) {
+    compression_types.push_back(kZSTD);
+  } else if (ZSTDNotFinal_Supported()) {
+    compression_types.push_back(kZSTDNotFinalCompression);
+  }
+  Random rnd(301);
+  for (auto compression_type : compression_types) {
+    Options options = CurrentOptions();
+    options.bottommost_compression = compression_type;
+    options.bottommost_compression_opts.max_dict_bytes = 4096;
+    options.bottommost_compression_opts.enabled = true;
+    options.create_if_missing = true;
+    options.num_levels = 2;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.block_cache.reset(new MockCache());
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    RecordCacheCountersForCompressionDict(options);
+
+    for (int i = 0; i < kNumFiles; ++i) {
+      ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
+      for (int j = 0; j < kNumEntriesPerFile; ++j) {
+        std::string value = rnd.RandomString(kNumBytesPerEntry);
+        ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+      }
+      ASSERT_OK(Flush());
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
+
+    // Compression dictionary blocks are preloaded.
+    CheckCacheCountersForCompressionDict(
+        options, kNumFiles /* expected_compression_dict_misses */,
+        0 /* expected_compression_dict_hits */,
+        kNumFiles /* expected_compression_dict_inserts */);
+
+    // Seek to a key in a file. It should cause the SST's dictionary meta-block
+    // to be read.
+    RecordCacheCounters(options);
+    RecordCacheCountersForCompressionDict(options);
+    ReadOptions read_options;
+    ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
+    // Two block hits: index and dictionary since they are prefetched
+    // One block missed/added: data block
+    CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */,
+                       1 /* expected_inserts */, 0 /* expected_failures */);
+    CheckCacheCountersForCompressionDict(
+        options, 0 /* expected_compression_dict_misses */,
+        1 /* expected_compression_dict_hits */,
+        0 /* expected_compression_dict_inserts */);
+  }
+}
+
+static void ClearCache(Cache* cache) {
+  auto roles = CopyCacheDeleterRoleMap();
+  std::deque<std::string> keys;
+  Cache::ApplyToAllEntriesOptions opts;
+  auto callback = [&](const Slice& key, void* /*value*/, size_t /*charge*/,
+                      Cache::DeleterFn deleter) {
+    if (roles.find(deleter) == roles.end()) {
+      // Keep the stats collector
+      return;
+    }
+    keys.push_back(key.ToString());
+  };
+  cache->ApplyToAllEntries(callback, opts);
+  for (auto& k : keys) {
+    cache->Erase(k);
+  }
+}
+
+TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
+  const size_t capacity = size_t{1} << 25;
+  int iterations_tested = 0;
+  for (bool partition : {false, true}) {
+    for (std::shared_ptr<Cache> cache :
+         {NewLRUCache(capacity),
+          HyperClockCacheOptions(
+              capacity,
+              BlockBasedTableOptions().block_size /*estimated_value_size*/)
+              .MakeSharedCache()}) {
+      ++iterations_tested;
+
+      Options options = CurrentOptions();
+      SetTimeElapseOnlySleepOnReopen(&options);
+      options.create_if_missing = true;
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.max_open_files = 13;
+      options.table_cache_numshardbits = 0;
+      // If this wakes up, it could interfere with test
+      options.stats_dump_period_sec = 0;
+
+      BlockBasedTableOptions table_options;
+      table_options.block_cache = cache;
+      table_options.cache_index_and_filter_blocks = true;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(50));
+      if (partition) {
+        table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+        table_options.partition_filters = true;
+      }
+      table_options.metadata_cache_options.top_level_index_pinning =
+          PinningTier::kNone;
+      table_options.metadata_cache_options.partition_pinning =
+          PinningTier::kNone;
+      table_options.metadata_cache_options.unpartitioned_pinning =
+          PinningTier::kNone;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      DestroyAndReopen(options);
+
+      // Create a new table.
+      ASSERT_OK(Put("foo", "value"));
+      ASSERT_OK(Put("bar", "value"));
+      ASSERT_OK(Flush());
+
+      ASSERT_OK(Put("zfoo", "value"));
+      ASSERT_OK(Put("zbar", "value"));
+      ASSERT_OK(Flush());
+
+      ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+      // Fresh cache
+      ClearCache(cache.get());
+
+      std::array<size_t, kNumCacheEntryRoles> expected{};
+      // For CacheEntryStatsCollector
+      expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+      std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;
+
+      // First access only filters
+      ASSERT_EQ("NOT_FOUND", Get("different from any key added"));
+      expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)] += 2;
+      if (partition) {
+        expected[static_cast<size_t>(CacheEntryRole::kFilterMetaBlock)] += 2;
+      }
+      // Within some time window, we will get cached entry stats
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // Not enough to force a miss
+      env_->MockSleepForSeconds(45);
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // Enough to force a miss
+      env_->MockSleepForSeconds(601);
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+      // Now access index and data block
+      ASSERT_EQ("value", Get("foo"));
+      expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+      if (partition) {
+        // top-level
+        expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+      }
+      expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+      // Enough to force a miss
+      env_->MockSleepForSeconds(601);
+      // But inject a simulated long scan so that we need a longer
+      // interval to force a miss next time.
+      SyncPoint::GetInstance()->SetCallBack(
+          "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+          [this](void*) {
+            // To spend no more than 0.2% of time scanning, we would need
+            // interval of at least 10000s
+            env_->MockSleepForSeconds(20);
+          });
+      SyncPoint::GetInstance()->EnableProcessing();
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+      prev_expected = expected;
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+
+      // The same for other file
+      ASSERT_EQ("value", Get("zfoo"));
+      expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+      if (partition) {
+        // top-level
+        expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+      }
+      expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+      // Because of the simulated long scan, this is not enough to force
+      // a miss
+      env_->MockSleepForSeconds(601);
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // But this is enough
+      env_->MockSleepForSeconds(10000);
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+      prev_expected = expected;
+
+      // Also check the GetProperty interface
+      std::map<std::string, std::string> values;
+      ASSERT_TRUE(
+          db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+
+      for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+        auto role = static_cast<CacheEntryRole>(i);
+        EXPECT_EQ(std::to_string(expected[i]),
+                  values[BlockCacheEntryStatsMapKeys::EntryCount(role)]);
+      }
+
+      // Add one for kWriteBuffer
+      {
+        WriteBufferManager wbm(size_t{1} << 20, cache);
+        wbm.ReserveMem(1024);
+        expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]++;
+        // Now we check that the GetProperty interface is more agressive about
+        // re-scanning stats, but not totally aggressive.
+        // Within some time window, we will get cached entry stats
+        env_->MockSleepForSeconds(1);
+        EXPECT_EQ(std::to_string(prev_expected[static_cast<size_t>(
+                      CacheEntryRole::kWriteBuffer)]),
+                  values[BlockCacheEntryStatsMapKeys::EntryCount(
+                      CacheEntryRole::kWriteBuffer)]);
+        // Not enough for a "background" miss but enough for a "foreground" miss
+        env_->MockSleepForSeconds(45);
+
+        ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats,
+                                        &values));
+        EXPECT_EQ(
+            std::to_string(
+                expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
+            values[BlockCacheEntryStatsMapKeys::EntryCount(
+                CacheEntryRole::kWriteBuffer)]);
+      }
+      prev_expected = expected;
+
+      // With collector pinned in cache, we should be able to hit
+      // even if the cache is full
+      ClearCache(cache.get());
+      Cache::Handle* h = nullptr;
+      if (strcmp(cache->Name(), "LRUCache") == 0) {
+        ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1,
+                                GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+                                &h, Cache::Priority::HIGH));
+      } else {
+        // For ClockCache we use a 16-byte key.
+        ASSERT_OK(cache->Insert("Fill-it-up-xxxxx", nullptr, capacity + 1,
+                                GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+                                &h, Cache::Priority::HIGH));
+      }
+      ASSERT_GT(cache->GetUsage(), cache->GetCapacity());
+      expected = {};
+      // For CacheEntryStatsCollector
+      expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+      // For Fill-it-up
+      expected[static_cast<size_t>(CacheEntryRole::kMisc)]++;
+      // Still able to hit on saved stats
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+      // Enough to force a miss
+      env_->MockSleepForSeconds(1000);
+      EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+      cache->Release(h);
+
+      // Now we test that the DB mutex is not held during scans, for the ways
+      // we know how to (possibly) trigger them. Without a better good way to
+      // check this, we simply inject an acquire & release of the DB mutex
+      // deep in the stat collection code. If we were already holding the
+      // mutex, that is UB that would at least be found by TSAN.
+      int scan_count = 0;
+      SyncPoint::GetInstance()->SetCallBack(
+          "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+          [this, &scan_count](void*) {
+            dbfull()->TEST_LockMutex();
+            dbfull()->TEST_UnlockMutex();
+            ++scan_count;
+          });
+      SyncPoint::GetInstance()->EnableProcessing();
+
+      // Different things that might trigger a scan, with mock sleeps to
+      // force a miss.
+      env_->MockSleepForSeconds(10000);
+      dbfull()->DumpStats();
+      ASSERT_EQ(scan_count, 1);
+
+      env_->MockSleepForSeconds(60);
+      ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                      &values));
+      ASSERT_EQ(scan_count, 1);
+      ASSERT_TRUE(
+          db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+      ASSERT_EQ(scan_count, 2);
+
+      env_->MockSleepForSeconds(10000);
+      ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                      &values));
+      ASSERT_EQ(scan_count, 3);
+
+      env_->MockSleepForSeconds(60);
+      std::string value_str;
+      ASSERT_TRUE(db_->GetProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                   &value_str));
+      ASSERT_EQ(scan_count, 3);
+      ASSERT_TRUE(
+          db_->GetProperty(DB::Properties::kBlockCacheEntryStats, &value_str));
+      ASSERT_EQ(scan_count, 4);
+
+      env_->MockSleepForSeconds(10000);
+      ASSERT_TRUE(db_->GetProperty(DB::Properties::kFastBlockCacheEntryStats,
+                                   &value_str));
+      ASSERT_EQ(scan_count, 5);
+
+      ASSERT_TRUE(db_->GetProperty(DB::Properties::kCFStats, &value_str));
+      // To match historical speed, querying this property no longer triggers
+      // a scan, even if results are old. But periodic dump stats should keep
+      // things reasonably updated.
+      ASSERT_EQ(scan_count, /*unchanged*/ 5);
+
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+    }
+    EXPECT_GE(iterations_tested, 1);
+  }
+}
+
+namespace {
+
+void DummyFillCache(Cache& cache, size_t entry_size,
+                    std::vector<CacheHandleGuard<void>>& handles) {
+  // fprintf(stderr, "Entry size: %zu\n", entry_size);
+  handles.clear();
+  cache.EraseUnRefEntries();
+  void* fake_value = &cache;
+  size_t capacity = cache.GetCapacity();
+  OffsetableCacheKey ck{"abc", "abc", 42};
+  for (size_t my_usage = 0; my_usage < capacity;) {
+    size_t charge = std::min(entry_size, capacity - my_usage);
+    Cache::Handle* handle;
+    Status st = cache.Insert(ck.WithOffset(my_usage).AsSlice(), fake_value,
+                             charge, /*deleter*/ nullptr, &handle);
+    ASSERT_OK(st);
+    handles.emplace_back(&cache, handle);
+    my_usage += charge;
+  }
+}
+
+class CountingLogger : public Logger {
+ public:
+  ~CountingLogger() override {}
+  using Logger::Logv;
+  void Logv(const InfoLogLevel log_level, const char* format,
+            va_list /*ap*/) override {
+    if (std::strstr(format, "HyperClockCache") == nullptr) {
+      // Not a match
+      return;
+    }
+    // static StderrLogger debug;
+    // debug.Logv(log_level, format, ap);
+    if (log_level == InfoLogLevel::INFO_LEVEL) {
+      ++info_count_;
+    } else if (log_level == InfoLogLevel::WARN_LEVEL) {
+      ++warn_count_;
+    } else if (log_level == InfoLogLevel::ERROR_LEVEL) {
+      ++error_count_;
+    }
+  }
+
+  std::array<int, 3> PopCounts() {
+    std::array<int, 3> rv{{info_count_, warn_count_, error_count_}};
+    info_count_ = warn_count_ = error_count_ = 0;
+    return rv;
+  }
+
+ private:
+  int info_count_{};
+  int warn_count_{};
+  int error_count_{};
+};
+
+}  // namespace
+
+TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) {
+  size_t capacity = 1024 * 1024;
+  size_t value_size_est = 8 * 1024;
+  HyperClockCacheOptions hcc_opts{capacity, value_size_est};
+  hcc_opts.num_shard_bits = 2;  // 4 shards
+  hcc_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = hcc_opts.MakeSharedCache();
+  std::shared_ptr<CountingLogger> logger = std::make_shared<CountingLogger>();
+
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.info_log = logger;
+  // Going to sample more directly
+  options.stats_dump_period_sec = 0;
+  Reopen(options);
+
+  std::vector<CacheHandleGuard<void>> handles;
+
+  // Clear anything from DB startup
+  logger->PopCounts();
+
+  // Fill cache based on expected size and check that when we
+  // don't report anything relevant in periodic stats dump
+  DummyFillCache(*cache, value_size_est, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+  // Same, within reasonable bounds
+  DummyFillCache(*cache, value_size_est - value_size_est / 4, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+  DummyFillCache(*cache, value_size_est + value_size_est / 3, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+  // Estimate too high (value size too low) eventually reports ERROR
+  DummyFillCache(*cache, value_size_est / 2, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+  DummyFillCache(*cache, value_size_est / 3, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 1}}));
+
+  // Estimate too low (value size too high) starts with INFO
+  // and is only WARNING in the worst case
+  DummyFillCache(*cache, value_size_est * 2, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{1, 0, 0}}));
+
+  DummyFillCache(*cache, value_size_est * 3, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+  DummyFillCache(*cache, value_size_est * 20, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+}
+
+#endif  // ROCKSDB_LITE
+
+class DBBlockCacheKeyTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  DBBlockCacheKeyTest()
+      : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {}
+
+  void SetUp() override {
+    use_compressed_cache_ = std::get<0>(GetParam());
+    exclude_file_numbers_ = std::get<1>(GetParam());
+  }
+
+  bool use_compressed_cache_;
+  bool exclude_file_numbers_;
+};
+
+// Disable LinkFile so that we can physically copy a DB using Checkpoint.
+// Disable file GetUniqueId to enable stable cache keys.
+class StableCacheKeyTestFS : public FaultInjectionTestFS {
+ public:
+  explicit StableCacheKeyTestFS(const std::shared_ptr<FileSystem>& base)
+      : FaultInjectionTestFS(base) {
+    SetFailGetUniqueId(true);
+  }
+
+  virtual ~StableCacheKeyTestFS() override {}
+
+  IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
+                    IODebugContext*) override {
+    return IOStatus::NotSupported("Disabled");
+  }
+};
+
+TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
+  std::shared_ptr<StableCacheKeyTestFS> test_fs{
+      new StableCacheKeyTestFS(env_->GetFileSystem())};
+  std::unique_ptr<CompositeEnvWrapper> test_env{
+      new CompositeEnvWrapper(env_, test_fs)};
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.env = test_env.get();
+
+  // Corrupting the table properties corrupts the unique id.
+  // Ignore the unique id recorded in the manifest.
+  options.verify_sst_unique_id_in_manifest = false;
+
+  BlockBasedTableOptions table_options;
+
+  int key_count = 0;
+  uint64_t expected_stat = 0;
+
+  std::function<void()> verify_stats;
+  if (use_compressed_cache_) {
+    if (!Snappy_Supported()) {
+      ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support");
+      return;
+    }
+    options.compression = CompressionType::kSnappyCompression;
+    table_options.no_block_cache = true;
+    table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false);
+    verify_stats = [&options, &expected_stat] {
+      // One for ordinary SST file and one for external SST file
+      ASSERT_EQ(expected_stat,
+                options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD));
+    };
+  } else {
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+    verify_stats = [&options, &expected_stat] {
+      ASSERT_EQ(expected_stat,
+                options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+      ASSERT_EQ(expected_stat,
+                options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+      ASSERT_EQ(expected_stat,
+                options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+    };
+  }
+
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"koko"}, options);
+
+  if (exclude_file_numbers_) {
+    // Simulate something like old behavior without file numbers in properties.
+    // This is a "control" side of the test that also ensures safely degraded
+    // behavior on old files.
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey",
+        [&](void* arg) {
+          TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+          props->orig_file_number = 0;
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+  std::function<void()> perform_gets = [&key_count, &expected_stat, this]() {
+    if (exclude_file_numbers_) {
+      // No cache key reuse should happen, because we can't rely on current
+      // file number being stable
+      expected_stat += key_count;
+    } else {
+      // Cache keys should be stable
+      expected_stat = key_count;
+    }
+    for (int i = 0; i < key_count; ++i) {
+      ASSERT_EQ(Get(1, Key(i)), "abc");
+    }
+  };
+
+  // Ordinary SST files with same session id
+  const std::string something_compressible(500U, 'x');
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_OK(Put(1, Key(key_count), "abc"));
+    ASSERT_OK(Put(1, Key(key_count) + "a", something_compressible));
+    ASSERT_OK(Flush(1));
+    ++key_count;
+  }
+
+#ifndef ROCKSDB_LITE
+  // Save an export of those ordinary SST files for later
+  std::string export_files_dir = dbname_ + "/exported";
+  ExportImportFilesMetaData* metadata_ptr_ = nullptr;
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir,
+                                           &metadata_ptr_));
+  ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;
+  checkpoint = nullptr;
+
+  // External SST files with same session id
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  std::vector<std::string> external;
+  for (int i = 0; i < 2; ++i) {
+    std::string f = dbname_ + "/external" + std::to_string(i) + ".sst";
+    external.push_back(f);
+    ASSERT_OK(sst_file_writer.Open(f));
+    ASSERT_OK(sst_file_writer.Put(Key(key_count), "abc"));
+    ASSERT_OK(
+        sst_file_writer.Put(Key(key_count) + "a", something_compressible));
+    ++key_count;
+    ExternalSstFileInfo external_info;
+    ASSERT_OK(sst_file_writer.Finish(&external_info));
+    IngestExternalFileOptions ingest_opts;
+    ASSERT_OK(db_->IngestExternalFile(handles_[1], {f}, ingest_opts));
+  }
+
+  if (exclude_file_numbers_) {
+    // FIXME(peterd): figure out where these extra ADDs are coming from
+    options.statistics->recordTick(BLOCK_CACHE_COMPRESSED_ADD,
+                                   uint64_t{0} - uint64_t{2});
+  }
+#endif
+
+  perform_gets();
+  verify_stats();
+
+  // Make sure we can cache hit after re-open
+  ReopenWithColumnFamilies({"default", "koko"}, options);
+
+  perform_gets();
+  verify_stats();
+
+  // Make sure we can cache hit even on a full copy of the DB. Using
+  // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link.
+  // (Checkpoint  not available in LITE mode to test this.)
+#ifndef ROCKSDB_LITE
+  auto db_copy_name = dbname_ + "-copy";
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name));
+  delete checkpoint;
+
+  Close();
+  Destroy(options);
+
+  // Switch to the DB copy
+  SaveAndRestore<std::string> save_dbname(&dbname_, db_copy_name);
+  ReopenWithColumnFamilies({"default", "koko"}, options);
+
+  perform_gets();
+  verify_stats();
+
+  // And ensure that re-importing + ingesting the same files into a
+  // different DB uses same cache keys
+  DestroyAndReopen(options);
+
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+                                              ImportColumnFamilyOptions(),
+                                              *metadata_ptr_, &cfh));
+  ASSERT_NE(cfh, nullptr);
+  delete cfh;
+  cfh = nullptr;
+  delete metadata_ptr_;
+  metadata_ptr_ = nullptr;
+
+  ASSERT_OK(DestroyDB(export_files_dir, options));
+
+  ReopenWithColumnFamilies({"default", "yoyo"}, options);
+
+  IngestExternalFileOptions ingest_opts;
+  ASSERT_OK(db_->IngestExternalFile(handles_[1], {external}, ingest_opts));
+
+  perform_gets();
+  verify_stats();
+#endif  // !ROCKSDB_LITE
+
+  Close();
+  Destroy(options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class CacheKeyTest : public testing::Test {
+ public:
+  CacheKey GetBaseCacheKey() {
+    CacheKey rv = GetOffsetableCacheKey(0, /*min file_number*/ 1).WithOffset(0);
+    // Correct for file_number_ == 1
+    *reinterpret_cast<uint64_t*>(&rv) ^= ReverseBits(uint64_t{1});
+    return rv;
+  }
+  CacheKey GetCacheKey(uint64_t session_counter, uint64_t file_number,
+                       uint64_t offset) {
+    OffsetableCacheKey offsetable =
+        GetOffsetableCacheKey(session_counter, file_number);
+    // * 4 to counteract optimization that strips lower 2 bits in encoding
+    // the offset in BlockBasedTable::GetCacheKey (which we prefer to include
+    // in unit tests to maximize functional coverage).
+    EXPECT_GE(offset * 4, offset);  // no overflow
+    return BlockBasedTable::GetCacheKey(offsetable,
+                                        BlockHandle(offset * 4, /*size*/ 5));
+  }
+
+ protected:
+  OffsetableCacheKey GetOffsetableCacheKey(uint64_t session_counter,
+                                           uint64_t file_number) {
+    // Like SemiStructuredUniqueIdGen::GenerateNext
+    tp_.db_session_id = EncodeSessionId(base_session_upper_,
+                                        base_session_lower_ ^ session_counter);
+    tp_.db_id = std::to_string(db_id_);
+    tp_.orig_file_number = file_number;
+    bool is_stable;
+    std::string cur_session_id = "";  // ignored
+    uint64_t cur_file_number = 42;    // ignored
+    OffsetableCacheKey rv;
+    BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number,
+                                       &rv, &is_stable);
+    EXPECT_TRUE(is_stable);
+    EXPECT_TRUE(!rv.IsEmpty());
+    // BEGIN some assertions in relation to SST unique IDs
+    std::string external_unique_id_str;
+    EXPECT_OK(GetUniqueIdFromTableProperties(tp_, &external_unique_id_str));
+    UniqueId64x2 sst_unique_id = {};
+    EXPECT_OK(DecodeUniqueIdBytes(external_unique_id_str, &sst_unique_id));
+    ExternalUniqueIdToInternal(&sst_unique_id);
+    OffsetableCacheKey ock =
+        OffsetableCacheKey::FromInternalUniqueId(&sst_unique_id);
+    EXPECT_EQ(rv.WithOffset(0).AsSlice(), ock.WithOffset(0).AsSlice());
+    EXPECT_EQ(ock.ToInternalUniqueId(), sst_unique_id);
+    // END some assertions in relation to SST unique IDs
+    return rv;
+  }
+
+  TableProperties tp_;
+  uint64_t base_session_upper_ = 0;
+  uint64_t base_session_lower_ = 0;
+  uint64_t db_id_ = 0;
+};
+
+TEST_F(CacheKeyTest, DBImplSessionIdStructure) {
+  // We have to generate our own session IDs for simulation purposes in other
+  // tests. Here we verify that the DBImpl implementation seems to match
+  // our construction here, by using lowest XORed-in bits for "session
+  // counter."
+  std::string session_id1 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+  std::string session_id2 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+  uint64_t upper1, upper2, lower1, lower2;
+  ASSERT_OK(DecodeSessionId(session_id1, &upper1, &lower1));
+  ASSERT_OK(DecodeSessionId(session_id2, &upper2, &lower2));
+  // Because generated in same process
+  ASSERT_EQ(upper1, upper2);
+  // Unless we generate > 4 billion session IDs in this process...
+  ASSERT_EQ(Upper32of64(lower1), Upper32of64(lower2));
+  // But they must be different somewhere
+  ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2));
+}
+
+namespace {
+// Deconstruct cache key, based on knowledge of implementation details.
+void DeconstructNonemptyCacheKey(const CacheKey& key, uint64_t* file_num_etc64,
+                                 uint64_t* offset_etc64) {
+  *file_num_etc64 = *reinterpret_cast<const uint64_t*>(key.AsSlice().data());
+  *offset_etc64 = *reinterpret_cast<const uint64_t*>(key.AsSlice().data() + 8);
+  assert(*file_num_etc64 != 0);
+  if (*offset_etc64 == 0) {
+    std::swap(*file_num_etc64, *offset_etc64);
+  }
+  assert(*offset_etc64 != 0);
+}
+
+// Make a bit mask of 0 to 64 bits
+uint64_t MakeMask64(int bits) {
+  if (bits >= 64) {
+    return uint64_t{0} - 1;
+  } else {
+    return (uint64_t{1} << bits) - 1;
+  }
+}
+
+// See CacheKeyTest::Encodings
+struct CacheKeyDecoder {
+  // Inputs
+  uint64_t base_file_num_etc64, base_offset_etc64;
+  int session_counter_bits, file_number_bits, offset_bits;
+
+  // Derived
+  uint64_t session_counter_mask, file_number_mask, offset_mask;
+
+  // Outputs
+  uint64_t decoded_session_counter, decoded_file_num, decoded_offset;
+
+  void SetBaseCacheKey(const CacheKey& base) {
+    DeconstructNonemptyCacheKey(base, &base_file_num_etc64, &base_offset_etc64);
+  }
+
+  void SetRanges(int _session_counter_bits, int _file_number_bits,
+                 int _offset_bits) {
+    session_counter_bits = _session_counter_bits;
+    session_counter_mask = MakeMask64(session_counter_bits);
+    file_number_bits = _file_number_bits;
+    file_number_mask = MakeMask64(file_number_bits);
+    offset_bits = _offset_bits;
+    offset_mask = MakeMask64(offset_bits);
+  }
+
+  void Decode(const CacheKey& key) {
+    uint64_t file_num_etc64, offset_etc64;
+    DeconstructNonemptyCacheKey(key, &file_num_etc64, &offset_etc64);
+
+    // First decode session counter
+    if (offset_bits + session_counter_bits <= 64) {
+      // fully recoverable from offset_etc64
+      decoded_session_counter =
+          ReverseBits((offset_etc64 ^ base_offset_etc64)) &
+          session_counter_mask;
+    } else if (file_number_bits + session_counter_bits <= 64) {
+      // fully recoverable from file_num_etc64
+      decoded_session_counter = DownwardInvolution(
+          (file_num_etc64 ^ base_file_num_etc64) & session_counter_mask);
+    } else {
+      // Need to combine parts from each word.
+      // Piece1 will contain some correct prefix of the bottom bits of
+      // session counter.
+      uint64_t piece1 =
+          ReverseBits((offset_etc64 ^ base_offset_etc64) & ~offset_mask);
+      int piece1_bits = 64 - offset_bits;
+      // Piece2 will contain involuded bits that we can combine with piece1
+      // to infer rest of session counter
+      int piece2_bits = std::min(64 - file_number_bits, 64 - piece1_bits);
+      ASSERT_LT(piece2_bits, 64);
+      uint64_t piece2_mask = MakeMask64(piece2_bits);
+      uint64_t piece2 = (file_num_etc64 ^ base_file_num_etc64) & piece2_mask;
+
+      // Cancel out the part of piece2 that we can infer from piece1
+      // (DownwardInvolution distributes over xor)
+      piece2 ^= DownwardInvolution(piece1) & piece2_mask;
+
+      // Now we need to solve for the unknown original bits in higher
+      // positions than piece1 provides. We use Gaussian elimination
+      // because we know that a piece2_bits X piece2_bits submatrix of
+      // the matrix underlying DownwardInvolution times the vector of
+      // unknown original bits equals piece2.
+      //
+      // Build an augmented row matrix for that submatrix, built column by
+      // column.
+      std::array<uint64_t, 64> aug_rows{};
+      for (int i = 0; i < piece2_bits; ++i) {  // over columns
+        uint64_t col_i = DownwardInvolution(uint64_t{1} << piece1_bits << i);
+        ASSERT_NE(col_i & 1U, 0);
+        for (int j = 0; j < piece2_bits; ++j) {  // over rows
+          aug_rows[j] |= (col_i & 1U) << i;
+          col_i >>= 1;
+        }
+      }
+      // Augment with right hand side
+      for (int j = 0; j < piece2_bits; ++j) {  // over rows
+        aug_rows[j] |= (piece2 & 1U) << piece2_bits;
+        piece2 >>= 1;
+      }
+      // Run Gaussian elimination
+      for (int i = 0; i < piece2_bits; ++i) {  // over columns
+        // Find a row that can be used to cancel others
+        uint64_t canceller = 0;
+        // Note: Rows 0 through i-1 contain 1s in columns already eliminated
+        for (int j = i; j < piece2_bits; ++j) {  // over rows
+          if (aug_rows[j] & (uint64_t{1} << i)) {
+            // Swap into appropriate row
+            std::swap(aug_rows[i], aug_rows[j]);
+            // Keep a handy copy for row reductions
+            canceller = aug_rows[i];
+            break;
+          }
+        }
+        ASSERT_NE(canceller, 0);
+        for (int j = 0; j < piece2_bits; ++j) {  // over rows
+          if (i != j && ((aug_rows[j] >> i) & 1) != 0) {
+            // Row reduction
+            aug_rows[j] ^= canceller;
+          }
+        }
+      }
+      // Extract result
+      decoded_session_counter = piece1;
+      for (int j = 0; j < piece2_bits; ++j) {  // over rows
+        ASSERT_EQ(aug_rows[j] & piece2_mask, uint64_t{1} << j);
+        decoded_session_counter |= aug_rows[j] >> piece2_bits << piece1_bits
+                                                              << j;
+      }
+    }
+
+    decoded_offset =
+        offset_etc64 ^ base_offset_etc64 ^ ReverseBits(decoded_session_counter);
+
+    decoded_file_num = ReverseBits(file_num_etc64 ^ base_file_num_etc64 ^
+                                   DownwardInvolution(decoded_session_counter));
+  }
+};
+}  // anonymous namespace
+
+TEST_F(CacheKeyTest, Encodings) {
+  // This test primarily verifies this claim from cache_key.cc:
+  // // In fact, if DB ids were not involved, we would be guaranteed unique
+  // // cache keys for files generated in a single process until total bits for
+  // // biggest session_id_counter, orig_file_number, and offset_in_file
+  // // reach 128 bits.
+  //
+  // To demonstrate this, CacheKeyDecoder can reconstruct the structured inputs
+  // to the cache key when provided an output cache key, the unstructured
+  // inputs, and bounds on the structured inputs.
+  //
+  // See OffsetableCacheKey comments in cache_key.cc.
+
+  // We are going to randomly initialize some values that *should* not affect
+  // result
+  Random64 r{std::random_device{}()};
+
+  CacheKeyDecoder decoder;
+  db_id_ = r.Next();
+  base_session_upper_ = r.Next();
+  base_session_lower_ = r.Next();
+  if (base_session_lower_ == 0) {
+    base_session_lower_ = 1;
+  }
+
+  decoder.SetBaseCacheKey(GetBaseCacheKey());
+
+  // Loop over configurations and test those
+  for (int session_counter_bits = 0; session_counter_bits <= 64;
+       ++session_counter_bits) {
+    for (int file_number_bits = 1; file_number_bits <= 64; ++file_number_bits) {
+      // 62 bits max because unoptimized offset will be 64 bits in that case
+      for (int offset_bits = 0; offset_bits <= 62; ++offset_bits) {
+        if (session_counter_bits + file_number_bits + offset_bits > 128) {
+          break;
+        }
+
+        decoder.SetRanges(session_counter_bits, file_number_bits, offset_bits);
+
+        uint64_t session_counter = r.Next() & decoder.session_counter_mask;
+        uint64_t file_number = r.Next() & decoder.file_number_mask;
+        if (file_number == 0) {
+          // Minimum
+          file_number = 1;
+        }
+        uint64_t offset = r.Next() & decoder.offset_mask;
+        decoder.Decode(GetCacheKey(session_counter, file_number, offset));
+
+        EXPECT_EQ(decoder.decoded_session_counter, session_counter);
+        EXPECT_EQ(decoder.decoded_file_num, file_number);
+        EXPECT_EQ(decoder.decoded_offset, offset);
+      }
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest,
+                        ::testing::Combine(::testing::Bool(),
+                                           ::testing::Bool()));
+
+class DBBlockCachePinningTest
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<bool, PinningTier, PinningTier, PinningTier>> {
+ public:
+  DBBlockCachePinningTest()
+      : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {}
+
+  void SetUp() override {
+    partition_index_and_filters_ = std::get<0>(GetParam());
+    top_level_index_pinning_ = std::get<1>(GetParam());
+    partition_pinning_ = std::get<2>(GetParam());
+    unpartitioned_pinning_ = std::get<3>(GetParam());
+  }
+
+  bool partition_index_and_filters_;
+  PinningTier top_level_index_pinning_;
+  PinningTier partition_pinning_;
+  PinningTier unpartitioned_pinning_;
+};
+
+TEST_P(DBBlockCachePinningTest, TwoLevelDB) {
+  // Creates one file in L0 and one file in L1. Both files have enough data that
+  // their index and filter blocks are partitioned. The L1 file will also have
+  // a compression dictionary (those are trained only during compaction), which
+  // must be unpartitioned.
+  const int kKeySize = 32;
+  const int kBlockSize = 128;
+  const int kNumBlocksPerFile = 128;
+  const int kNumKeysPerFile = kBlockSize * kNumBlocksPerFile / kKeySize;
+
+  Options options = CurrentOptions();
+  // `kNoCompression` makes the unit test more portable. But it relies on the
+  // current behavior of persisting/accessing dictionary even when there's no
+  // (de)compression happening, which seems fairly likely to change over time.
+  options.compression = kNoCompression;
+  options.compression_opts.max_dict_bytes = 4 << 10;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(1 << 20 /* capacity */);
+  table_options.block_size = kBlockSize;
+  table_options.metadata_block_size = kBlockSize;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.metadata_cache_options.top_level_index_pinning =
+      top_level_index_pinning_;
+  table_options.metadata_cache_options.partition_pinning = partition_pinning_;
+  table_options.metadata_cache_options.unpartitioned_pinning =
+      unpartitioned_pinning_;
+  table_options.filter_policy.reset(
+      NewBloomFilterPolicy(10 /* bits_per_key */));
+  if (partition_index_and_filters_) {
+    table_options.index_type =
+        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    table_options.partition_filters = true;
+  }
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kKeySize)));
+    }
+    ASSERT_OK(Flush());
+    if (i == 0) {
+      // Prevent trivial move so file will be rewritten with dictionary and
+      // reopened with L1's pinning settings.
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    }
+  }
+
+  // Clear all unpinned blocks so unpinned blocks will show up as cache misses
+  // when reading a key from a file.
+  table_options.block_cache->EraseUnRefEntries();
+
+  // Get base cache values
+  uint64_t filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  uint64_t index_misses = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+  uint64_t compression_dict_misses =
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+
+  // Read a key from the L0 file
+  Get(Key(kNumKeysPerFile));
+  uint64_t expected_filter_misses = filter_misses;
+  uint64_t expected_index_misses = index_misses;
+  uint64_t expected_compression_dict_misses = compression_dict_misses;
+  if (partition_index_and_filters_) {
+    if (top_level_index_pinning_ == PinningTier::kNone) {
+      ++expected_filter_misses;
+      ++expected_index_misses;
+    }
+    if (partition_pinning_ == PinningTier::kNone) {
+      ++expected_filter_misses;
+      ++expected_index_misses;
+    }
+  } else {
+    if (unpartitioned_pinning_ == PinningTier::kNone) {
+      ++expected_filter_misses;
+      ++expected_index_misses;
+    }
+  }
+  if (unpartitioned_pinning_ == PinningTier::kNone) {
+    ++expected_compression_dict_misses;
+  }
+  ASSERT_EQ(expected_filter_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(expected_index_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(expected_compression_dict_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS));
+
+  // Clear all unpinned blocks so unpinned blocks will show up as cache misses
+  // when reading a key from a file.
+  table_options.block_cache->EraseUnRefEntries();
+
+  // Read a key from the L1 file
+  Get(Key(0));
+  if (partition_index_and_filters_) {
+    if (top_level_index_pinning_ == PinningTier::kNone ||
+        top_level_index_pinning_ == PinningTier::kFlushedAndSimilar) {
+      ++expected_filter_misses;
+      ++expected_index_misses;
+    }
+    if (partition_pinning_ == PinningTier::kNone ||
+        partition_pinning_ == PinningTier::kFlushedAndSimilar) {
+      ++expected_filter_misses;
+      ++expected_index_misses;
+    }
+  } else {
+    if (unpartitioned_pinning_ == PinningTier::kNone ||
+        unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) {
+      ++expected_filter_misses;
+      ++expected_index_misses;
+    }
+  }
+  if (unpartitioned_pinning_ == PinningTier::kNone ||
+      unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) {
+    ++expected_compression_dict_misses;
+  }
+  ASSERT_EQ(expected_filter_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(expected_index_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(expected_compression_dict_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DBBlockCachePinningTest, DBBlockCachePinningTest,
+    ::testing::Combine(
+        ::testing::Bool(),
+        ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar,
+                          PinningTier::kAll),
+        ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar,
+                          PinningTier::kAll),
+        ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar,
+                          PinningTier::kAll)));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_bloom_filter_test.cc b/src/rocksdb/db/db_bloom_filter_test.cc
new file mode 100644
index 000000000..d68ab6115
--- /dev/null
+++ b/src/rocksdb/db/db_bloom_filter_test.cc
@@ -0,0 +1,3498 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+#include <iomanip>
+#include <sstream>
+#include <string>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/format.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+std::shared_ptr<const FilterPolicy> Create(double bits_per_key,
+                                           const std::string& name) {
+  return BloomLikeFilterPolicy::Create(name, bits_per_key);
+}
+const std::string kLegacyBloom = test::LegacyBloomFilterPolicy::kClassName();
+const std::string kFastLocalBloom =
+    test::FastLocalBloomFilterPolicy::kClassName();
+const std::string kStandard128Ribbon =
+    test::Standard128RibbonFilterPolicy::kClassName();
+const std::string kAutoBloom = BloomFilterPolicy::kClassName();
+const std::string kAutoRibbon = RibbonFilterPolicy::kClassName();
+}  // anonymous namespace
+
+// DB tests related to bloom filter.
+
+class DBBloomFilterTest : public DBTestBase {
+ public:
+  DBBloomFilterTest()
+      : DBTestBase("db_bloom_filter_test", /*env_do_fsync=*/true) {}
+};
+
+class DBBloomFilterTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<std::string, bool, uint32_t>> {
+  //                             public testing::WithParamInterface<bool> {
+ protected:
+  std::string bfp_impl_;
+  bool partition_filters_;
+  uint32_t format_version_;
+
+ public:
+  DBBloomFilterTestWithParam()
+      : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}
+
+  ~DBBloomFilterTestWithParam() override {}
+
+  void SetUp() override {
+    bfp_impl_ = std::get<0>(GetParam());
+    partition_filters_ = std::get<1>(GetParam());
+    format_version_ = std::get<2>(GetParam());
+  }
+};
+
+class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {};
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+  const char* Name() const override {
+    return "SliceTransformLimitedDomainGeneric";
+  }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 5);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    // prefix will be x????
+    return src.size() >= 5;
+  }
+
+  bool InRange(const Slice& dst) const override {
+    // prefix will be x????
+    return dst.size() == 5;
+  }
+};
+
+// KeyMayExist can lead to a few false positives, but not false negatives.
+// To make test deterministic, use a much larger number of bits per key-20 than
+// bits in the key, so that false positives are eliminated
+TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
+  do {
+    ReadOptions ropts;
+    std::string value;
+    anon::OptionsOverride options_override;
+    options_override.filter_policy = Create(20, bfp_impl_);
+    options_override.partition_filters = partition_filters_;
+    options_override.metadata_block_size = 32;
+    options_override.full_block_cache = true;
+    Options options = CurrentOptions(options_override);
+    if (partition_filters_) {
+      auto* table_options =
+          options.table_factory->GetOptions<BlockBasedTableOptions>();
+      if (table_options != nullptr &&
+          table_options->index_type !=
+              BlockBasedTableOptions::kTwoLevelIndexSearch) {
+        // In the current implementation partitioned filters depend on
+        // partitioned indexes
+        continue;
+      }
+    }
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+
+    ASSERT_OK(Put(1, "a", "b"));
+    bool value_found = false;
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(value_found);
+    ASSERT_EQ("b", value);
+
+    ASSERT_OK(Flush(1));
+    value.clear();
+
+    uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(!value_found);
+    // assert that no new files were opened and no new blocks were
+    // read into block cache.
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "a"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                          true /* disallow trivial move */));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    ASSERT_OK(Delete(1, "c"));
+
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+    // KeyMayExist function only checks data in block caches, which is not used
+    // by plain table format.
+  } while (
+      ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) {
+  for (bool partition_filters : {true, false}) {
+    Options options = last_options_;
+    options.prefix_extractor =
+        std::make_shared<SliceTransformLimitedDomainGeneric>();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+    if (partition_filters) {
+      bbto.partition_filters = true;
+      bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+
+    WriteOptions wo;
+    ReadOptions ro;
+    FlushOptions fo;
+    fo.wait = true;
+    std::string value;
+
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+    ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+    ASSERT_OK(dbfull()->Flush(fo));
+
+    ASSERT_EQ("foo", Get("barbarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ(
+        0,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+    ASSERT_EQ("foo2", Get("barbarbar2"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ(
+        0,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+    ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ(
+        0,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ(
+        1,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+    ASSERT_EQ(
+        2,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    ro.total_order_seek = true;
+    // NOTE: total_order_seek no longer affects Get()
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    // No bloom on extractor changed
+#ifndef ROCKSDB_LITE
+    ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}}));
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+#endif  // ROCKSDB_LITE
+
+    // No bloom on extractor changed, after re-open
+    options.prefix_extractor.reset(NewCappedPrefixTransform(10));
+    Reopen(options);
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    get_perf_context()->Reset();
+  }
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
+  for (bool partition_filters : {true, false}) {
+    Options options = last_options_;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+    if (partition_filters) {
+      bbto.partition_filters = true;
+      bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+
+    WriteOptions wo;
+    ReadOptions ro;
+    FlushOptions fo;
+    fo.wait = true;
+    std::string value;
+
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+    ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+    ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+    ASSERT_OK(dbfull()->Flush(fo));
+
+    ASSERT_EQ("foo", Get("barbarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ("foo2", Get("barbarbar2"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+    ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+
+    ro.total_order_seek = true;
+    // NOTE: total_order_seek no longer affects Get()
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+    // No bloom on extractor changed
+#ifndef ROCKSDB_LITE
+    ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}}));
+    ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ(
+        3,
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+#endif  // ROCKSDB_LITE
+
+    get_perf_context()->Reset();
+  }
+}
+
+TEST_F(DBBloomFilterTest, WholeKeyFilterProp) {
+  for (bool partition_filters : {true, false}) {
+    Options options = last_options_;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+    bbto.whole_key_filtering = false;
+    if (partition_filters) {
+      bbto.partition_filters = true;
+      bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+
+    WriteOptions wo;
+    ReadOptions ro;
+    FlushOptions fo;
+    fo.wait = true;
+    std::string value;
+
+    ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Needs insert some keys to make sure files are not filtered out by key
+    // ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    ASSERT_OK(dbfull()->Flush(fo));
+
+    Reopen(options);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+    // Reopen with whole key filtering enabled and prefix extractor
+    // NULL. Bloom filter should be off for both of whole key and
+    // prefix bloom.
+    bbto.whole_key_filtering = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    options.prefix_extractor.reset();
+    Reopen(options);
+
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    // Write DB with only full key filtering.
+    ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Needs insert some keys to make sure files are not filtered out by key
+    // ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    // Reopen with both of whole key off and prefix extractor enabled.
+    // Still no bloom filter should be used.
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+    // Try to create a DB with mixed files:
+    ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+    // Needs insert some keys to make sure files are not filtered out by key
+    // ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    options.prefix_extractor.reset();
+    bbto.whole_key_filtering = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+
+    // Try to create a DB with mixed files.
+    ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
+    // In this case needs insert some keys to make sure files are
+    // not filtered out by key ranges.
+    ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+    ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+    ASSERT_OK(Flush());
+
+    // Now we have two files:
+    // File 1: An older file with prefix bloom.
+    // File 2: A newer file with whole bloom filter.
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+
+    // Reopen with the same setting: only whole key is used
+    Reopen(options);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+
+    // Restart with both filters are allowed
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    bbto.whole_key_filtering = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+    // File 1 will has it filtered out.
+    // File 2 will not, as prefix `foo` exists in the file.
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+
+    // Restart with only prefix bloom is allowed.
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Reopen(options);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+    ASSERT_EQ("NOT_FOUND", Get("bar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+    ASSERT_EQ("foo", Get("foobar"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+    ASSERT_EQ("bar", Get("barfoo"));
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+    uint64_t bloom_filter_useful_all_levels = 0;
+    for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+      if (kv.second.bloom_filter_useful > 0) {
+        bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+      }
+    }
+    ASSERT_EQ(12, bloom_filter_useful_all_levels);
+    get_perf_context()->Reset();
+  }
+}
+
+TEST_P(DBBloomFilterTestWithParam, BloomFilter) {
+  do {
+    Options options = CurrentOptions();
+    env_->count_random_reads_ = true;
+    options.env = env_;
+    // ChangeCompactOptions() only changes compaction style, which does not
+    // trigger reset of table_factory
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = true;
+    table_options.filter_policy = Create(10, bfp_impl_);
+    table_options.partition_filters = partition_filters_;
+    if (partition_filters_) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    table_options.format_version = format_version_;
+    if (format_version_ >= 4) {
+      // value delta encoding challenged more with index interval > 1
+      table_options.index_block_restart_interval = 8;
+    }
+    table_options.metadata_block_size = 32;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Populate multiple layers
+    const int N = 10000;
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Compact(1, "a", "z");
+    for (int i = 0; i < N; i += 100) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    ASSERT_OK(Flush(1));
+
+    // Prevent auto compactions triggered by seeks
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
+
+    // Lookup present keys.  Should rarely read from small sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    int reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d present => %d reads\n", N, reads);
+    ASSERT_GE(reads, N);
+    if (partition_filters_) {
+      // Without block cache, we read an extra partition filter per each
+      // level*read and a partition index per each read
+      ASSERT_LE(reads, 4 * N + 2 * N / 100);
+    } else {
+      ASSERT_LE(reads, N + 2 * N / 100);
+    }
+
+    // Lookup present keys.  Should rarely read from either sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+    }
+    reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d missing => %d reads\n", N, reads);
+    if (partition_filters_) {
+      // With partitioned filter we read one extra filter per level per each
+      // missed read.
+      ASSERT_LE(reads, 2 * N + 3 * N / 100);
+    } else {
+      ASSERT_LE(reads, 3 * N / 100);
+    }
+
+#ifndef ROCKSDB_LITE
+    // Sanity check some table properties
+    std::map<std::string, std::string> props;
+    ASSERT_TRUE(db_->GetMapProperty(
+        handles_[1], DB::Properties::kAggregatedTableProperties, &props));
+    uint64_t nkeys = N + N / 100;
+    uint64_t filter_size = ParseUint64(props["filter_size"]);
+    EXPECT_LE(filter_size,
+              (partition_filters_ ? 12 : 11) * nkeys / /*bits / byte*/ 8);
+    if (bfp_impl_ == kAutoRibbon) {
+      // Sometimes using Ribbon filter which is more space-efficient
+      EXPECT_GE(filter_size, 7 * nkeys / /*bits / byte*/ 8);
+    } else {
+      // Always Bloom
+      EXPECT_GE(filter_size, 10 * nkeys / /*bits / byte*/ 8);
+    }
+
+    uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]);
+    EXPECT_EQ(num_filter_entries, nkeys);
+#endif  // ROCKSDB_LITE
+
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
+    Close();
+  } while (ChangeCompactOptions());
+}
+
+namespace {
+
+class AlwaysTrueBitsBuilder : public FilterBitsBuilder {
+ public:
+  void AddKey(const Slice&) override {}
+  size_t EstimateEntriesAdded() override { return 0U; }
+  Slice Finish(std::unique_ptr<const char[]>* /* buf */) override {
+    // Interpreted as "always true" filter (0 probes over 1 byte of
+    // payload, 5 bytes metadata)
+    return Slice("\0\0\0\0\0\0", 6);
+  }
+  using FilterBitsBuilder::Finish;
+  size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; }
+};
+
+class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy {
+ public:
+  explicit AlwaysTrueFilterPolicy(bool skip) : skip_(skip) {}
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const override {
+    if (skip_) {
+      return nullptr;
+    } else {
+      return new AlwaysTrueBitsBuilder();
+    }
+  }
+
+ private:
+  bool skip_;
+};
+
+}  // anonymous namespace
+
+TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) {
+  constexpr int maxKey = 10;
+  auto PutFn = [&]() {
+    int i;
+    // Put
+    for (i = 0; i < maxKey; i++) {
+      ASSERT_OK(Put(Key(i), Key(i)));
+    }
+    Flush();
+  };
+  auto GetFn = [&]() {
+    int i;
+    // Get OK
+    for (i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(Key(i)));
+    }
+    // Get NotFound
+    for (; i < maxKey * 2; i++) {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  };
+  auto PutAndGetFn = [&]() {
+    PutFn();
+    GetFn();
+  };
+#ifndef ROCKSDB_LITE
+  std::map<std::string, std::string> props;
+  const auto& kAggTableProps = DB::Properties::kAggregatedTableProperties;
+#endif  // ROCKSDB_LITE
+
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.partition_filters = partition_filters_;
+  if (partition_filters_) {
+    table_options.index_type =
+        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  }
+  table_options.format_version = format_version_;
+
+  // Test 1: bits per key < 0.5 means skip filters -> no filter
+  // constructed or read.
+  table_options.filter_policy = Create(0.4, bfp_impl_);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  PutAndGetFn();
+
+  // Verify no filter access nor contruction
+  EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+  EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+  props.clear();
+  ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+  EXPECT_EQ(props["filter_size"], "0");
+#endif  // ROCKSDB_LITE
+
+  // Test 2: use custom API to skip filters -> no filter constructed
+  // or read.
+  table_options.filter_policy.reset(
+      new AlwaysTrueFilterPolicy(/* skip */ true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  PutAndGetFn();
+
+  // Verify no filter access nor construction
+  EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+  EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+  props.clear();
+  ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+  EXPECT_EQ(props["filter_size"], "0");
+#endif  // ROCKSDB_LITE
+
+  // Control test: using an actual filter with 100% FP rate -> the filter
+  // is constructed and checked on read.
+  table_options.filter_policy.reset(
+      new AlwaysTrueFilterPolicy(/* skip */ false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  PutAndGetFn();
+
+  // Verify filter is accessed (and constructed)
+  EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+            maxKey * 2);
+  EXPECT_EQ(
+      TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+      maxKey);
+#ifndef ROCKSDB_LITE
+  props.clear();
+  ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+  EXPECT_NE(props["filter_size"], "0");
+#endif  // ROCKSDB_LITE
+
+  // Test 3 (options test): Able to read existing filters with longstanding
+  // generated options file entry `filter_policy=rocksdb.BuiltinBloomFilter`
+  ASSERT_OK(FilterPolicy::CreateFromString(ConfigOptions(),
+                                           "rocksdb.BuiltinBloomFilter",
+                                           &table_options.filter_policy));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  GetFn();
+
+  // Verify filter is accessed
+  EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+            maxKey * 2);
+  EXPECT_EQ(
+      TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+      maxKey);
+
+  // But new filters are not generated (configuration details unknown)
+  DestroyAndReopen(options);
+  PutAndGetFn();
+
+  // Verify no filter access nor construction
+  EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+  EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+  props.clear();
+  ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+  EXPECT_EQ(props["filter_size"], "0");
+#endif  // ROCKSDB_LITE
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+    FormatDef, DBBloomFilterTestDefFormatVersion,
+    ::testing::Values(
+        std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion),
+        std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion),
+        std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+    FormatDef, DBBloomFilterTestWithParam,
+    ::testing::Values(
+        std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion),
+        std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion),
+        std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+    FormatLatest, DBBloomFilterTestWithParam,
+    ::testing::Values(std::make_tuple(kAutoBloom, true, kLatestFormatVersion),
+                      std::make_tuple(kAutoBloom, false, kLatestFormatVersion),
+                      std::make_tuple(kAutoRibbon, false,
+                                      kLatestFormatVersion)));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBBloomFilterTest, BloomFilterRate) {
+  while (ChangeFilterOptions()) {
+    Options options = CurrentOptions();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    get_perf_context()->EnablePerLevelPerfContext();
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    const int maxKey = 10000;
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    // Add a large key to make the file contain wide range
+    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+    Flush(1);
+
+    // Check if they can be found
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+    // Check if filter is useful
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+    }
+    ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+    ASSERT_GE(
+        (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful,
+        maxKey * 0.98);
+    get_perf_context()->Reset();
+  }
+}
+
+namespace {
+struct CompatibilityConfig {
+  std::shared_ptr<const FilterPolicy> policy;
+  bool partitioned;
+  uint32_t format_version;
+
+  void SetInTableOptions(BlockBasedTableOptions* table_options) {
+    table_options->filter_policy = policy;
+    table_options->partition_filters = partitioned;
+    if (partitioned) {
+      table_options->index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    } else {
+      table_options->index_type =
+          BlockBasedTableOptions::IndexType::kBinarySearch;
+    }
+    table_options->format_version = format_version;
+  }
+};
+// High bits per key -> almost no FPs
+std::shared_ptr<const FilterPolicy> kCompatibilityBloomPolicy{
+    NewBloomFilterPolicy(20)};
+// bloom_before_level=-1 -> always use Ribbon
+std::shared_ptr<const FilterPolicy> kCompatibilityRibbonPolicy{
+    NewRibbonFilterPolicy(20, -1)};
+
+std::vector<CompatibilityConfig> kCompatibilityConfigs = {
+    {kCompatibilityBloomPolicy, false, BlockBasedTableOptions().format_version},
+    {kCompatibilityBloomPolicy, true, BlockBasedTableOptions().format_version},
+    {kCompatibilityBloomPolicy, false, /* legacy Bloom */ 4U},
+    {kCompatibilityRibbonPolicy, false,
+     BlockBasedTableOptions().format_version},
+    {kCompatibilityRibbonPolicy, true, BlockBasedTableOptions().format_version},
+};
+}  // anonymous namespace
+
+TEST_F(DBBloomFilterTest, BloomFilterCompatibility) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.level0_file_num_compaction_trigger =
+      static_cast<int>(kCompatibilityConfigs.size()) + 1;
+  options.max_open_files = -1;
+
+  Close();
+
+  // Create one file for each kind of filter. Each file covers a distinct key
+  // range.
+  for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) {
+    BlockBasedTableOptions table_options;
+    kCompatibilityConfigs[i].SetInTableOptions(&table_options);
+    ASSERT_TRUE(table_options.filter_policy != nullptr);
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+
+    std::string prefix = std::to_string(i) + "_";
+    ASSERT_OK(Put(prefix + "A", "val"));
+    ASSERT_OK(Put(prefix + "Z", "val"));
+    ASSERT_OK(Flush());
+  }
+
+  // Test filter is used between each pair of {reader,writer} configurations,
+  // because any built-in FilterPolicy should be able to read filters from any
+  // other built-in FilterPolicy
+  for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) {
+    BlockBasedTableOptions table_options;
+    kCompatibilityConfigs[i].SetInTableOptions(&table_options);
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    Reopen(options);
+    for (size_t j = 0; j < kCompatibilityConfigs.size(); ++j) {
+      std::string prefix = std::to_string(j) + "_";
+      ASSERT_EQ("val", Get(prefix + "A"));  // Filter positive
+      ASSERT_EQ("val", Get(prefix + "Z"));  // Filter positive
+      // Filter negative, with high probability
+      ASSERT_EQ("NOT_FOUND", Get(prefix + "Q"));
+      EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+                2);
+      EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+    }
+  }
+}
+
+// To align with the type of hash entry being reserved in implementation.
+using FilterConstructionReserveMemoryHash = uint64_t;
+
+class ChargeFilterConstructionTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<
+          CacheEntryRoleOptions::Decision, std::string, bool, bool>> {
+ public:
+  ChargeFilterConstructionTestWithParam()
+      : DBTestBase("db_bloom_filter_tests",
+                   /*env_do_fsync=*/true),
+        num_key_(0),
+        charge_filter_construction_(std::get<0>(GetParam())),
+        policy_(std::get<1>(GetParam())),
+        partition_filters_(std::get<2>(GetParam())),
+        detect_filter_construct_corruption_(std::get<3>(GetParam())) {
+    if (charge_filter_construction_ ==
+            CacheEntryRoleOptions::Decision::kDisabled ||
+        policy_ == kLegacyBloom) {
+      // For these cases, we only interested in whether filter construction
+      // cache charging happens instead of its accuracy. Therefore we don't
+      // need many keys.
+      num_key_ = 5;
+    } else if (partition_filters_) {
+      // For PartitionFilter case, since we set
+      // table_options.metadata_block_size big enough such that each partition
+      // trigger at least 1 dummy entry reservation each for hash entries and
+      // final filter, we need a large number of keys to ensure we have at least
+      // two partitions.
+      num_key_ = 18 *
+                 CacheReservationManagerImpl<
+                     CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    } else if (policy_ == kFastLocalBloom) {
+      // For Bloom Filter + FullFilter case, since we design the num_key_ to
+      // make hash entry cache charging be a multiple of dummy entries, the
+      // correct behavior of charging final filter on top of it will trigger at
+      // least another dummy entry insertion. Therefore we can assert that
+      // behavior and we don't need a large number of keys to verify we
+      // indeed charge the final filter for in cache, even though final
+      // filter is a lot smaller than hash entries.
+      num_key_ = 1 *
+                 CacheReservationManagerImpl<
+                     CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    } else {
+      // For Ribbon Filter + FullFilter case, we need a large enough number of
+      // keys so that charging final filter after releasing the hash entries
+      // reservation will trigger at least another dummy entry (or equivalently
+      // to saying, causing another peak in cache charging) as banding
+      // reservation might not be a multiple of dummy entry.
+      num_key_ = 12 *
+                 CacheReservationManagerImpl<
+                     CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    }
+  }
+
+  BlockBasedTableOptions GetBlockBasedTableOptions() {
+    BlockBasedTableOptions table_options;
+
+    // We set cache capacity big enough to prevent cache full for convenience in
+    // calculation.
+    constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024;
+
+    table_options.cache_usage_options.options_overrides.insert(
+        {CacheEntryRole::kFilterConstruction,
+         {/*.charged = */ charge_filter_construction_}});
+    table_options.filter_policy = Create(10, policy_);
+    table_options.partition_filters = partition_filters_;
+    if (table_options.partition_filters) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      // We set table_options.metadata_block_size big enough so that each
+      // partition trigger at least 1 dummy entry insertion each for hash
+      // entries and final filter.
+      table_options.metadata_block_size = 409000;
+    }
+    table_options.detect_filter_construct_corruption =
+        detect_filter_construct_corruption_;
+
+    LRUCacheOptions lo;
+    lo.capacity = kCacheCapacity;
+    lo.num_shard_bits = 0;  // 2^0 shard
+    lo.strict_capacity_limit = true;
+    cache_ = std::make_shared<
+        TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>(
+        (NewLRUCache(lo)));
+    table_options.block_cache = cache_;
+
+    return table_options;
+  }
+
+  std::size_t GetNumKey() { return num_key_; }
+
+  CacheEntryRoleOptions::Decision ChargeFilterConstructMemory() {
+    return charge_filter_construction_;
+  }
+
+  std::string GetFilterPolicy() { return policy_; }
+
+  bool PartitionFilters() { return partition_filters_; }
+
+  std::shared_ptr<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+  GetCache() {
+    return cache_;
+  }
+
+ private:
+  std::size_t num_key_;
+  CacheEntryRoleOptions::Decision charge_filter_construction_;
+  std::string policy_;
+  bool partition_filters_;
+  std::shared_ptr<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+      cache_;
+  bool detect_filter_construct_corruption_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    ChargeFilterConstructionTestWithParam,
+    ChargeFilterConstructionTestWithParam,
+    ::testing::Values(
+        std::make_tuple(CacheEntryRoleOptions::Decision::kDisabled,
+                        kFastLocalBloom, false, false),
+
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kFastLocalBloom, false, false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kFastLocalBloom, false, true),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kFastLocalBloom, true, false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kFastLocalBloom, true, true),
+
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kStandard128Ribbon, false, false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kStandard128Ribbon, false, true),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kStandard128Ribbon, true, false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+                        kStandard128Ribbon, true, true),
+
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kLegacyBloom,
+                        false, false)));
+
+// TODO: Speed up this test, and reduce disk space usage (~700MB)
+// The current test inserts many keys (on the scale of dummy entry size)
+// in order to make small memory user (e.g, final filter, partitioned hash
+// entries/filter/banding) , which is proportional to the number of
+// keys, big enough so that its cache charging triggers dummy entry insertion
+// and becomes observable in the test.
+//
+// However, inserting that many keys slows down this test and leaves future
+// developers an opportunity to speed it up.
+//
+// Possible approaches & challenges:
+// 1. Use sync point during cache charging of filter construction
+//
+// Benefit: It does not rely on triggering dummy entry insertion
+// but the sync point to verify small memory user is charged correctly.
+//
+// Challenge: this approach is intrusive.
+//
+// 2. Make dummy entry size configurable and set it small in the test
+//
+// Benefit: It increases the precision of cache charging and therefore
+// small memory usage can still trigger insertion of dummy entry.
+//
+// Challenge: change CacheReservationManager related APIs and a hack
+// might be needed to control the size of dummmy entry of
+// CacheReservationManager used in filter construction for testing
+// since CacheReservationManager is not exposed at the high level.
+//
+TEST_P(ChargeFilterConstructionTestWithParam, Basic) {
+  Options options = CurrentOptions();
+  // We set write_buffer_size big enough so that in the case where there is
+  // filter construction cache charging, flush won't be triggered before we
+  // manually trigger it for clean testing
+  options.write_buffer_size = 640 << 20;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  std::shared_ptr<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+      cache = GetCache();
+  options.create_if_missing = true;
+  // Disable auto compaction to prevent its unexpected side effect
+  // to the number of keys per partition designed by us in the test
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  int num_key = static_cast<int>(GetNumKey());
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  ASSERT_EQ(cache->GetChargedCacheIncrementSum(), 0)
+      << "Flush was triggered too early in the test case with filter "
+         "construction cache charging - please make sure no flush triggered "
+         "during the key insertions above";
+
+  ASSERT_OK(Flush());
+
+  bool charge_filter_construction = (ChargeFilterConstructMemory() ==
+                                     CacheEntryRoleOptions::Decision::kEnabled);
+  std::string policy = GetFilterPolicy();
+  bool partition_filters = PartitionFilters();
+  bool detect_filter_construct_corruption =
+      table_options.detect_filter_construct_corruption;
+
+  std::deque<std::size_t> filter_construction_cache_res_peaks =
+      cache->GetChargedCachePeaks();
+  std::size_t filter_construction_cache_res_increments_sum =
+      cache->GetChargedCacheIncrementSum();
+
+  if (!charge_filter_construction) {
+    EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0);
+    return;
+  }
+
+  if (policy == kLegacyBloom) {
+    EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0)
+        << "There shouldn't be filter construction cache charging as this "
+           "feature does not support kLegacyBloom";
+    return;
+  }
+
+  const std::size_t kDummyEntrySize = CacheReservationManagerImpl<
+      CacheEntryRole::kFilterConstruction>::GetDummyEntrySize();
+
+  const std::size_t predicted_hash_entries_cache_res =
+      num_key * sizeof(FilterConstructionReserveMemoryHash);
+  ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0)
+      << "It's by this test's design that predicted_hash_entries_cache_res is "
+         "a multipe of dummy entry";
+
+  const std::size_t predicted_hash_entries_cache_res_dummy_entry_num =
+      predicted_hash_entries_cache_res / kDummyEntrySize;
+  const std::size_t predicted_final_filter_cache_res =
+      static_cast<std::size_t>(
+          std::ceil(1.0 * predicted_hash_entries_cache_res_dummy_entry_num / 6 *
+                    (policy == kStandard128Ribbon ? 0.7 : 1))) *
+      kDummyEntrySize;
+  const std::size_t predicted_banding_cache_res =
+      static_cast<std::size_t>(
+          std::ceil(predicted_hash_entries_cache_res_dummy_entry_num * 2.5)) *
+      kDummyEntrySize;
+
+  if (policy == kFastLocalBloom) {
+    /* kFastLocalBloom + FullFilter
+     *        p0
+     *       /  \
+     *    b /    \
+     *     /      \
+     *    /        \
+     *  0/          \
+     *  hash entries = b - 0, final filter = p0 - b
+     *  p0 = hash entries + final filter
+     *
+     *  The test is designed in a way such that the reservation for b is a
+     *  multiple of dummy entries so that reservation for (p0 - b)
+     *  will trigger at least another dummy entry insertion.
+     *
+     * kFastLocalBloom + FullFilter +
+     * detect_filter_construct_corruption
+     *  The peak p0 stays the same as
+     *  (kFastLocalBloom + FullFilter) but just lasts
+     *  longer since we release hash entries reservation later.
+     *
+     * kFastLocalBloom + PartitionedFilter
+     *                   p1
+     *                  /  \
+     *        p0     b'/    \
+     *       /  \     /      \
+     *    b /    \   /        \
+     *     /      \ /          \
+     *    /        a            \
+     *  0/                       \
+     *  partitioned hash entries1 = b - 0, partitioned hash entries1 = b' - a
+     *  parittioned final filter1 = p0 - b, parittioned final filter2 = p1 - b'
+     *
+     *  (increment p0 - 0) + (increment p1 - a)
+     *  = partitioned hash entries1 + partitioned hash entries2
+     *  + parittioned final filter1 + parittioned final filter2
+     *  = hash entries + final filter
+     *
+     * kFastLocalBloom + PartitionedFilter +
+     * detect_filter_construct_corruption
+     *  The peak p0, p1 stay the same as
+     *  (kFastLocalBloom + PartitionedFilter) but just
+     *  last longer since we release hash entries reservation later.
+     *
+     */
+    if (!partition_filters) {
+      EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+          << "Filter construction cache charging should have only 1 peak in "
+             "case: kFastLocalBloom + FullFilter";
+      std::size_t filter_construction_cache_res_peak =
+          filter_construction_cache_res_peaks[0];
+      EXPECT_GT(filter_construction_cache_res_peak,
+                predicted_hash_entries_cache_res)
+          << "The testing number of hash entries is designed to make hash "
+             "entries cache charging be multiples of dummy entries"
+             " so the correct behavior of charging final filter on top of it"
+             " should've triggered at least another dummy entry insertion";
+
+      std::size_t predicted_filter_construction_cache_res_peak =
+          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 0.9);
+      EXPECT_LE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 1.1);
+      return;
+    } else {
+      EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+          << "Filter construction cache charging should have multiple peaks "
+             "in case: kFastLocalBloom + "
+             "PartitionedFilter";
+      std::size_t predicted_filter_construction_cache_res_increments_sum =
+          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 0.9);
+      EXPECT_LE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 1.1);
+      return;
+    }
+  }
+
+  if (policy == kStandard128Ribbon) {
+    /* kStandard128Ribbon + FullFilter
+     *        p0
+     *       /  \  p1
+     *      /    \/\
+     *   b /     b' \
+     *    /          \
+     *  0/            \
+     *  hash entries = b - 0, banding = p0 - b, final filter = p1 - b'
+     *  p0 = hash entries + banding
+     *
+     *  The test is designed in a way such that the reservation for (p1 - b')
+     *  will trigger at least another dummy entry insertion
+     *  (or equivelantly to saying, creating another peak).
+     *
+     * kStandard128Ribbon + FullFilter +
+     * detect_filter_construct_corruption
+     *
+     *         new p0
+     *          /  \
+     *         /    \
+     *     pre p0    \
+     *       /        \
+     *      /          \
+     *   b /            \
+     *    /              \
+     *  0/                \
+     *  hash entries = b - 0, banding = pre p0 - b,
+     *  final filter = new p0 - pre p0
+     *  new p0 =  hash entries + banding + final filter
+     *
+     *  The previous p0 will no longer be a peak since under
+     *  detect_filter_construct_corruption == true, we do not release hash
+     *  entries reserveration (like p0 - b' previously) until after final filter
+     *  creation and post-verification
+     *
+     * kStandard128Ribbon + PartitionedFilter
+     *                     p3
+     *        p0           /\  p4
+     *       /  \ p1      /  \ /\
+     *      /    \/\  b''/    a' \
+     *   b /     b' \   /         \
+     *    /          \ /           \
+     *  0/            a             \
+     *  partitioned hash entries1 = b - 0, partitioned hash entries2 = b'' - a
+     *  partitioned banding1 = p0 - b, partitioned banding2 = p3 - b''
+     *  parittioned final filter1 = p1 - b',parittioned final filter2 = p4 - a'
+     *
+     *  (increment p0 - 0) + (increment p1 - b')
+     *  + (increment p3 - a) + (increment p4 - a')
+     *  = partitioned hash entries1 + partitioned hash entries2
+     *  + parittioned banding1 + parittioned banding2
+     *  + parittioned final filter1 + parittioned final filter2
+     *  = hash entries + banding + final filter
+     *
+     * kStandard128Ribbon + PartitionedFilter +
+     * detect_filter_construct_corruption
+     *
+     *                          new p3
+     *                          /    \
+     *                        pre p3  \
+     *        new p0          /        \
+     *         /  \          /          \
+     *      pre p0 \        /            \
+     *       /      \    b'/              \
+     *      /        \    /                \
+     *   b /          \  /                  \
+     *    /            \a                    \
+     *  0/                                    \
+     *  partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
+     *  partitioned banding1 = pre p0 - b, partitioned banding2 = pre p3 - b'
+     *  parittioned final filter1 = new p0 - pre p0,
+     *  parittioned final filter2 = new p3 - pre p3
+     *
+     *  The previous p0 and p3 will no longer be a peak since under
+     *  detect_filter_construct_corruption == true, we do not release hash
+     *  entries reserveration (like p0 - b', p3 - a' previously) until after
+     *  parittioned final filter creation and post-verification
+     *
+     *  However, increments sum stay the same as shown below:
+     *    (increment new p0 - 0) + (increment new p3 - a)
+     *  = partitioned hash entries1 + partitioned hash entries2
+     *  + parittioned banding1 + parittioned banding2
+     *  + parittioned final filter1 + parittioned final filter2
+     *  = hash entries + banding + final filter
+     *
+     */
+    if (!partition_filters) {
+      ASSERT_GE(
+          std::floor(
+              1.0 * predicted_final_filter_cache_res /
+              CacheReservationManagerImpl<
+                  CacheEntryRole::kFilterConstruction>::GetDummyEntrySize()),
+          1)
+          << "Final filter cache charging too small for this test - please "
+             "increase the number of keys";
+      if (!detect_filter_construct_corruption) {
+        EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2)
+            << "Filter construction cache charging should have 2 peaks in "
+               "case: kStandard128Ribbon + "
+               "FullFilter. "
+               "The second peak is resulted from charging the final filter "
+               "after "
+               "decreasing the hash entry reservation since the testing final "
+               "filter reservation is designed to be at least 1 dummy entry "
+               "size";
+
+        std::size_t filter_construction_cache_res_peak =
+            filter_construction_cache_res_peaks[0];
+        std::size_t predicted_filter_construction_cache_res_peak =
+            predicted_hash_entries_cache_res + predicted_banding_cache_res;
+        EXPECT_GE(filter_construction_cache_res_peak,
+                  predicted_filter_construction_cache_res_peak * 0.9);
+        EXPECT_LE(filter_construction_cache_res_peak,
+                  predicted_filter_construction_cache_res_peak * 1.1);
+      } else {
+        EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+            << "Filter construction cache charging should have 1 peaks in "
+               "case: kStandard128Ribbon + FullFilter "
+               "+ detect_filter_construct_corruption. "
+               "The previous second peak now disappears since we don't "
+               "decrease the hash entry reservation"
+               "until after final filter reservation and post-verification";
+
+        std::size_t filter_construction_cache_res_peak =
+            filter_construction_cache_res_peaks[0];
+        std::size_t predicted_filter_construction_cache_res_peak =
+            predicted_hash_entries_cache_res + predicted_banding_cache_res +
+            predicted_final_filter_cache_res;
+        EXPECT_GE(filter_construction_cache_res_peak,
+                  predicted_filter_construction_cache_res_peak * 0.9);
+        EXPECT_LE(filter_construction_cache_res_peak,
+                  predicted_filter_construction_cache_res_peak * 1.1);
+      }
+      return;
+    } else {
+      if (!detect_filter_construct_corruption) {
+        EXPECT_GE(filter_construction_cache_res_peaks.size(), 3)
+            << "Filter construction cache charging should have more than 3 "
+               "peaks "
+               "in case: kStandard128Ribbon + "
+               "PartitionedFilter";
+      } else {
+        EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+            << "Filter construction cache charging should have more than 2 "
+               "peaks "
+               "in case: kStandard128Ribbon + "
+               "PartitionedFilter + detect_filter_construct_corruption";
+      }
+      std::size_t predicted_filter_construction_cache_res_increments_sum =
+          predicted_hash_entries_cache_res + predicted_banding_cache_res +
+          predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 0.9);
+      EXPECT_LE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 1.1);
+      return;
+    }
+  }
+}
+
+class DBFilterConstructionCorruptionTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<bool /* detect_filter_construct_corruption */, std::string,
+                     bool /* partition_filters */>> {
+ public:
+  DBFilterConstructionCorruptionTestWithParam()
+      : DBTestBase("db_bloom_filter_tests",
+                   /*env_do_fsync=*/true) {}
+
+  BlockBasedTableOptions GetBlockBasedTableOptions() {
+    BlockBasedTableOptions table_options;
+    table_options.detect_filter_construct_corruption = std::get<0>(GetParam());
+    table_options.filter_policy = Create(10, std::get<1>(GetParam()));
+    table_options.partition_filters = std::get<2>(GetParam());
+    if (table_options.partition_filters) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      // We set table_options.metadata_block_size small enough so we can
+      // trigger filter partitioning with GetNumKey() amount of keys
+      table_options.metadata_block_size = 10;
+    }
+
+    return table_options;
+  }
+
+  // Return an appropriate amount of keys for testing
+  // to generate a long filter (i.e, size >= 8 + kMetadataLen)
+  std::size_t GetNumKey() { return 5000; }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBFilterConstructionCorruptionTestWithParam,
+    DBFilterConstructionCorruptionTestWithParam,
+    ::testing::Values(std::make_tuple(false, kFastLocalBloom, false),
+                      std::make_tuple(true, kFastLocalBloom, false),
+                      std::make_tuple(true, kFastLocalBloom, true),
+                      std::make_tuple(true, kStandard128Ribbon, false),
+                      std::make_tuple(true, kStandard128Ribbon, true)));
+
+TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  int num_key = static_cast<int>(GetNumKey());
+  Status s;
+
+  // Case 1: No corruption in filter construction
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+  s = Flush();
+  EXPECT_TRUE(s.ok());
+
+  // Case 2: Corruption of hash entries in filter construction
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+        std::deque<uint64_t>* hash_entries_to_corrupt =
+            (std::deque<uint64_t>*)arg;
+        assert(!hash_entries_to_corrupt->empty());
+        *(hash_entries_to_corrupt->begin()) =
+            *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = Flush();
+
+  if (table_options.detect_filter_construct_corruption) {
+    EXPECT_TRUE(s.IsCorruption());
+    EXPECT_TRUE(
+        s.ToString().find("Filter's hash entries checksum mismatched") !=
+        std::string::npos);
+  } else {
+    EXPECT_TRUE(s.ok());
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearCallBack(
+      "XXPH3FilterBitsBuilder::Finish::"
+      "TamperHashEntries");
+
+  // Case 3: Corruption of filter content in filter construction
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "XXPH3FilterBitsBuilder::Finish::TamperFilter", [&](void* arg) {
+        std::pair<std::unique_ptr<char[]>*, std::size_t>* TEST_arg_pair =
+            (std::pair<std::unique_ptr<char[]>*, std::size_t>*)arg;
+        std::size_t filter_size = TEST_arg_pair->second;
+        // 5 is the kMetadataLen and
+        assert(filter_size >= 8 + 5);
+        std::unique_ptr<char[]>* filter_content_to_corrupt =
+            TEST_arg_pair->first;
+        std::memset(filter_content_to_corrupt->get(), '\0', 8);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = Flush();
+
+  if (table_options.detect_filter_construct_corruption) {
+    EXPECT_TRUE(s.IsCorruption());
+    EXPECT_TRUE(s.ToString().find("Corrupted filter content") !=
+                std::string::npos);
+  } else {
+    EXPECT_TRUE(s.ok());
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearCallBack(
+      "XXPH3FilterBitsBuilder::Finish::"
+      "TamperFilter");
+}
+
+// RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+TEST_P(DBFilterConstructionCorruptionTestWithParam,
+       DynamicallyTurnOnAndOffDetectConstructCorruption) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  // We intend to turn on
+  // table_options.detect_filter_construct_corruption dynamically
+  // therefore we override this test parmater's value
+  table_options.detect_filter_construct_corruption = false;
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.create_if_missing = true;
+
+  int num_key = static_cast<int>(GetNumKey());
+  Status s;
+
+  DestroyAndReopen(options);
+
+  // Case 1: !table_options.detect_filter_construct_corruption
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+        std::deque<uint64_t>* hash_entries_to_corrupt =
+            (std::deque<uint64_t>*)arg;
+        assert(!hash_entries_to_corrupt->empty());
+        *(hash_entries_to_corrupt->begin()) =
+            *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = Flush();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearCallBack(
+      "XXPH3FilterBitsBuilder::Finish::"
+      "TamperHashEntries");
+
+  ASSERT_FALSE(table_options.detect_filter_construct_corruption);
+  EXPECT_TRUE(s.ok());
+
+  // Case 2: dynamically turn on
+  // table_options.detect_filter_construct_corruption
+  ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                              "{detect_filter_construct_corruption=true;}"}}));
+
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+        std::deque<uint64_t>* hash_entries_to_corrupt =
+            (std::deque<uint64_t>*)arg;
+        assert(!hash_entries_to_corrupt->empty());
+        *(hash_entries_to_corrupt->begin()) =
+            *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = Flush();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearCallBack(
+      "XXPH3FilterBitsBuilder::Finish::"
+      "TamperHashEntries");
+
+  auto updated_table_options =
+      db_->GetOptions().table_factory->GetOptions<BlockBasedTableOptions>();
+  EXPECT_TRUE(updated_table_options->detect_filter_construct_corruption);
+  EXPECT_TRUE(s.IsCorruption());
+  EXPECT_TRUE(s.ToString().find("Filter's hash entries checksum mismatched") !=
+              std::string::npos);
+
+  // Case 3: dynamically turn off
+  // table_options.detect_filter_construct_corruption
+  ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                              "{detect_filter_construct_corruption=false;}"}}));
+  updated_table_options =
+      db_->GetOptions().table_factory->GetOptions<BlockBasedTableOptions>();
+  EXPECT_FALSE(updated_table_options->detect_filter_construct_corruption);
+}
+#endif  // ROCKSDB_LITE
+
+namespace {
+// NOTE: This class is referenced by HISTORY.md as a model for a wrapper
+// FilterPolicy selecting among configurations based on context.
+class LevelAndStyleCustomFilterPolicy : public FilterPolicy {
+ public:
+  explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+                                           int bpk_otherwise)
+      : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)),
+        policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)),
+        policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {}
+
+  const char* Name() const override {
+    return "LevelAndStyleCustomFilterPolicy";
+  }
+
+  // OK to use built-in policy name because we are deferring to a
+  // built-in builder. We aren't changing the serialized format.
+  const char* CompatibilityName() const override {
+    return policy_fifo_->CompatibilityName();
+  }
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext& context) const override {
+    if (context.compaction_style == kCompactionStyleFIFO) {
+      return policy_fifo_->GetBuilderWithContext(context);
+    } else if (context.level_at_creation == 0) {
+      return policy_l0_other_->GetBuilderWithContext(context);
+    } else {
+      return policy_otherwise_->GetBuilderWithContext(context);
+    }
+  }
+
+  FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+    // OK to defer to any of them; they all can parse built-in filters
+    // from any settings.
+    return policy_fifo_->GetFilterBitsReader(contents);
+  }
+
+ private:
+  const std::unique_ptr<const FilterPolicy> policy_fifo_;
+  const std::unique_ptr<const FilterPolicy> policy_l0_other_;
+  const std::unique_ptr<const FilterPolicy> policy_otherwise_;
+};
+
+static std::map<TableFileCreationReason, std::string>
+    table_file_creation_reason_to_string{
+        {TableFileCreationReason::kCompaction, "kCompaction"},
+        {TableFileCreationReason::kFlush, "kFlush"},
+        {TableFileCreationReason::kMisc, "kMisc"},
+        {TableFileCreationReason::kRecovery, "kRecovery"},
+    };
+
+class TestingContextCustomFilterPolicy
+    : public LevelAndStyleCustomFilterPolicy {
+ public:
+  explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+                                            int bpk_otherwise)
+      : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, bpk_otherwise) {
+  }
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext& context) const override {
+    test_report_ += "cf=";
+    test_report_ += context.column_family_name;
+    test_report_ += ",s=";
+    test_report_ +=
+        OptionsHelper::compaction_style_to_string[context.compaction_style];
+    test_report_ += ",n=";
+    test_report_ += std::to_string(context.num_levels);
+    test_report_ += ",l=";
+    test_report_ += std::to_string(context.level_at_creation);
+    test_report_ += ",b=";
+    test_report_ += std::to_string(int{context.is_bottommost});
+    test_report_ += ",r=";
+    test_report_ += table_file_creation_reason_to_string[context.reason];
+    test_report_ += "\n";
+
+    return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context);
+  }
+
+  std::string DumpTestReport() {
+    std::string rv;
+    std::swap(rv, test_report_);
+    return rv;
+  }
+
+ private:
+  mutable std::string test_report_;
+};
+}  // anonymous namespace
+
+TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) {
+  auto policy = std::make_shared<TestingContextCustomFilterPolicy>(15, 8, 5);
+  Options options;
+  for (bool fifo : {true, false}) {
+    options = CurrentOptions();
+    options.max_open_files = fifo ? -1 : options.max_open_files;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.compaction_style =
+        fifo ? kCompactionStyleFIFO : kCompactionStyleLevel;
+
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy = policy;
+    table_options.format_version = 5;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    TryReopen(options);
+    CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options);
+
+    const int maxKey = 10000;
+    for (int i = 0; i < maxKey / 2; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    // Add a large key to make the file contain wide range
+    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+    Flush(1);
+    EXPECT_EQ(policy->DumpTestReport(),
+              fifo ? "cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n"
+                   : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n");
+
+    for (int i = maxKey / 2; i < maxKey; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Flush(1);
+    EXPECT_EQ(policy->DumpTestReport(),
+              fifo ? "cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n"
+                   : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n");
+
+    // Check that they can be found
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    // Since we have two tables / two filters, we might have Bloom checks on
+    // our queries, but no more than one "useful" per query on a found key.
+    EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey);
+
+    // Check that we have two filters, each about
+    // fifo: 0.12% FP rate (15 bits per key)
+    // level: 2.3% FP rate (8 bits per key)
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+    }
+    {
+      auto useful_count =
+          TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+      EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975));
+      EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 0.9995 : 0.98));
+    }
+
+    if (!fifo) {  // FIFO doesn't fully support CompactRange
+      // Full compaction
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                                  nullptr));
+      EXPECT_EQ(policy->DumpTestReport(),
+                "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n");
+
+      // Check that we now have one filter, about 9.2% FP rate (5 bits per key)
+      for (int i = 0; i < maxKey; i++) {
+        ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+      }
+      {
+        auto useful_count =
+            TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+        EXPECT_GE(useful_count, maxKey * 0.90);
+        EXPECT_LE(useful_count, maxKey * 0.91);
+      }
+    } else {
+#ifndef ROCKSDB_LITE
+      // Also try external SST file
+      {
+        std::string file_path = dbname_ + "/external.sst";
+        SstFileWriter sst_file_writer(EnvOptions(), options, handles_[1]);
+        ASSERT_OK(sst_file_writer.Open(file_path));
+        ASSERT_OK(sst_file_writer.Put("key", "value"));
+        ASSERT_OK(sst_file_writer.Finish());
+      }
+      // Note: kCompactionStyleLevel is default, ignored if num_levels == -1
+      EXPECT_EQ(policy->DumpTestReport(),
+                "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n");
+#endif
+    }
+
+    // Destroy
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+    ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+    handles_[1] = nullptr;
+  }
+}
+
+class SliceTransformLimitedDomain : public SliceTransform {
+  const char* Name() const override { return "SliceTransformLimitedDomain"; }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 5);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    // prefix will be x????
+    return src.size() >= 5 && src[0] == 'x';
+  }
+
+  bool InRange(const Slice& dst) const override {
+    // prefix will be x????
+    return dst.size() == 5 && dst[0] == 'x';
+  }
+};
+
+TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter1) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x1111_AAAA", "val1"));
+  ASSERT_OK(Put("x1112_AAAA", "val2"));
+  ASSERT_OK(Put("x1113_AAAA", "val3"));
+  ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, wont be added to filter
+  ASSERT_OK(Put("zzzzz_AAAA", "val5"));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("x1111_AAAA"), "val1");
+  ASSERT_EQ(Get("x1112_AAAA"), "val2");
+  ASSERT_EQ(Get("x1113_AAAA"), "val3");
+  ASSERT_EQ(Get("x1114_AAAA"), "val4");
+  // Was not added to filter but rocksdb will try to read it from the filter
+  ASSERT_EQ(Get("zzzzz_AAAA"), "val5");
+}
+
+TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter2) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x1113_AAAA", "val3"));
+  ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, wont be added to filter
+  ASSERT_OK(Put("zzzzz_AAAA", "val1"));
+  ASSERT_OK(Put("zzzzz_AAAB", "val2"));
+  ASSERT_OK(Put("zzzzz_AAAC", "val3"));
+  ASSERT_OK(Put("zzzzz_AAAD", "val4"));
+
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> iter_res;
+  auto iter = db_->NewIterator(ReadOptions());
+  // Seek to a key that was not in Domain
+  for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
+    iter_res.emplace_back(iter->value().ToString());
+  }
+
+  std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"};
+  ASSERT_EQ(iter_res, expected_res);
+  delete iter;
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
+  // regression test for #2743. the range delete tombstones in memtable should
+  // be added even when Get() skips searching due to its prefix bloom filter
+  const int kMemtableSize = 1 << 20;              // 1MB
+  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
+  const int kPrefixLen = 4;
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio =
+      static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+  options.prefix_extractor.reset(
+      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+  options.write_buffer_size = kMemtableSize;
+  options.memtable_whole_key_filtering = false;
+  Reopen(options);
+  std::string key1("AAAABBBB");
+  std::string key2("AAAACCCC");  // not in DB
+  std::string key3("AAAADDDD");
+  std::string key4("AAAAEEEE");
+  std::string value1("Value1");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+
+  // check memtable bloom stats
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+  // same prefix, bloom filter false positive
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+  // enable whole key bloom filter
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  // check memtable bloom stats
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  // whole key bloom filter kicks in and determines it's a miss
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+  // verify whole key filtering does not depend on prefix_extractor
+  options.prefix_extractor.reset();
+  Reopen(options);
+  // check memtable bloom stats
+  ASSERT_OK(Put(key4, value4, WriteOptions()));
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  // whole key bloom filter kicks in and determines it's a miss
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio = 0.015;
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  std::string key1("AA");
+  std::string key2("BB");
+  std::string key3("CC");
+  std::string key4("DD");
+  std::string key_not("EE");
+  std::string value1("Value1");
+  std::string value2("Value2");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key2, value2, WriteOptions()));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Put(key4, value4, WriteOptions()));
+
+  // Delete key2 and key3
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));
+
+  // Read without snapshot
+  auto results = MultiGet({key_not, key1, key2, key3, key4});
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], "NOT_FOUND");
+  ASSERT_EQ(results[3], "NOT_FOUND");
+  ASSERT_EQ(results[4], value4);
+
+  // Also check Get
+  ASSERT_EQ(Get(key1), value1);
+  ASSERT_EQ(Get(key2), "NOT_FOUND");
+  ASSERT_EQ(Get(key3), "NOT_FOUND");
+  ASSERT_EQ(Get(key4), value4);
+
+  // Read with snapshot
+  results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], value2);
+  ASSERT_EQ(results[3], value3);
+  ASSERT_EQ(results[4], "NOT_FOUND");
+
+  // Also check Get
+  ASSERT_EQ(Get(key1, snapshot), value1);
+  ASSERT_EQ(Get(key2, snapshot), value2);
+  ASSERT_EQ(Get(key3, snapshot), value3);
+  ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
+  constexpr size_t kPrefixSize = 8;
+  const std::string kKey = "key";
+  assert(kKey.size() < kPrefixSize);
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+  options.memtable_prefix_bloom_size_ratio = 0.25;
+  Reopen(options);
+  ASSERT_OK(Put(kKey, "v"));
+  ASSERT_EQ("v", Get(kKey));
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+  iter->Seek(kKey);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(kKey, iter->key());
+  iter->SeekForPrev(kKey);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(kKey, iter->key());
+}
+
+class DBBloomFilterTestVaryPrefixAndFormatVer
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, uint32_t>> {
+ protected:
+  bool use_prefix_;
+  uint32_t format_version_;
+
+ public:
+  DBBloomFilterTestVaryPrefixAndFormatVer()
+      : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}
+
+  ~DBBloomFilterTestVaryPrefixAndFormatVer() override {}
+
+  void SetUp() override {
+    use_prefix_ = std::get<0>(GetParam());
+    format_version_ = std::get<1>(GetParam());
+  }
+
+  static std::string UKey(uint32_t i) { return Key(static_cast<int>(i)); }
+};
+
+TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) {
+  Options options = CurrentOptions();
+  if (use_prefix_) {
+    // Entire key from UKey()
+    options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+  }
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(20));
+  bbto.partition_filters = true;
+  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  bbto.whole_key_filtering = !use_prefix_;
+  if (use_prefix_) {  // (not related to prefix, just alternating between)
+    // Make sure code appropriately deals with metadata block size setting
+    // that is "too small" (smaller than minimum size for filter builder)
+    bbto.metadata_block_size = 63;
+  } else {
+    // Make sure the test will work even on platforms with large minimum
+    // filter size, due to large cache line size.
+    // (Largest cache line size + 10+% overhead.)
+    bbto.metadata_block_size = 290;
+  }
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+  ReadOptions ropts;
+
+  constexpr uint32_t N = 12000;
+  // Add N/2 evens
+  for (uint32_t i = 0; i < N; i += 2) {
+    ASSERT_OK(Put(UKey(i), UKey(i)));
+  }
+  ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(TotalTableFiles(), 1);
+#endif
+
+  constexpr uint32_t Q = 29;
+  // MultiGet In
+  std::array<std::string, Q> keys;
+  std::array<Slice, Q> key_slices;
+  std::array<ColumnFamilyHandle*, Q> column_families;
+  // MultiGet Out
+  std::array<Status, Q> statuses;
+  std::array<PinnableSlice, Q> values;
+
+  TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE);
+  TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE);
+
+  // Check that initial clump of keys only loads one partition filter from
+  // block cache.
+  // And that spread out keys load many partition filters.
+  // In both cases, mix present vs. not present keys.
+  for (uint32_t stride : {uint32_t{1}, (N / Q) | 1}) {
+    for (uint32_t i = 0; i < Q; ++i) {
+      keys[i] = UKey(i * stride);
+      key_slices[i] = Slice(keys[i]);
+      column_families[i] = db_->DefaultColumnFamily();
+      statuses[i] = Status();
+      values[i] = PinnableSlice();
+    }
+
+    db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0],
+                  /*timestamps=*/nullptr, &statuses[0], true);
+
+    // Confirm correct status results
+    uint32_t number_not_found = 0;
+    for (uint32_t i = 0; i < Q; ++i) {
+      if ((i * stride % 2) == 0) {
+        ASSERT_OK(statuses[i]);
+      } else {
+        ASSERT_TRUE(statuses[i].IsNotFound());
+        ++number_not_found;
+      }
+    }
+
+    // Confirm correct Bloom stats (no FPs)
+    uint64_t filter_useful = TestGetAndResetTickerCount(
+        options,
+        use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL);
+    uint64_t filter_checked =
+        TestGetAndResetTickerCount(options, use_prefix_
+                                                ? BLOOM_FILTER_PREFIX_CHECKED
+                                                : BLOOM_FILTER_FULL_POSITIVE) +
+        (use_prefix_ ? 0 : filter_useful);
+    EXPECT_EQ(filter_useful, number_not_found);
+    EXPECT_EQ(filter_checked, Q);
+    if (!use_prefix_) {
+      EXPECT_EQ(
+          TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+          Q - number_not_found);
+    }
+
+    // Confirm no duplicate loading same filter partition
+    uint64_t filter_accesses =
+        TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) +
+        TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+    if (stride == 1) {
+      EXPECT_EQ(filter_accesses, 1);
+    } else {
+      // for large stride
+      EXPECT_GE(filter_accesses, Q / 2 + 1);
+    }
+  }
+
+  // Check that a clump of keys (present and not) works when spanning
+  // two partitions
+  int found_spanning = 0;
+  for (uint32_t start = 0; start < N / 2;) {
+    for (uint32_t i = 0; i < Q; ++i) {
+      keys[i] = UKey(start + i);
+      key_slices[i] = Slice(keys[i]);
+      column_families[i] = db_->DefaultColumnFamily();
+      statuses[i] = Status();
+      values[i] = PinnableSlice();
+    }
+
+    db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0],
+                  /*timestamps=*/nullptr, &statuses[0], true);
+
+    // Confirm correct status results
+    uint32_t number_not_found = 0;
+    for (uint32_t i = 0; i < Q; ++i) {
+      if (((start + i) % 2) == 0) {
+        ASSERT_OK(statuses[i]);
+      } else {
+        ASSERT_TRUE(statuses[i].IsNotFound());
+        ++number_not_found;
+      }
+    }
+
+    // Confirm correct Bloom stats (might see some FPs)
+    uint64_t filter_useful = TestGetAndResetTickerCount(
+        options,
+        use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL);
+    uint64_t filter_checked =
+        TestGetAndResetTickerCount(options, use_prefix_
+                                                ? BLOOM_FILTER_PREFIX_CHECKED
+                                                : BLOOM_FILTER_FULL_POSITIVE) +
+        (use_prefix_ ? 0 : filter_useful);
+    EXPECT_GE(filter_useful, number_not_found - 2);  // possible FP
+    EXPECT_EQ(filter_checked, Q);
+    if (!use_prefix_) {
+      EXPECT_EQ(
+          TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+          Q - number_not_found);
+    }
+
+    // Confirm no duplicate loading of same filter partition
+    uint64_t filter_accesses =
+        TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) +
+        TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+    if (filter_accesses == 2) {
+      // Spanned across partitions.
+      ++found_spanning;
+      if (found_spanning >= 2) {
+        break;
+      } else {
+        // Ensure that at least once we have at least one present and
+        // one non-present key on both sides of partition boundary.
+        start += 2;
+      }
+    } else {
+      EXPECT_EQ(filter_accesses, 1);
+      // See explanation at "start += 2"
+      start += Q - 4;
+    }
+  }
+  EXPECT_TRUE(found_spanning >= 2);
+}
+
+INSTANTIATE_TEST_CASE_P(DBBloomFilterTestVaryPrefixAndFormatVer,
+                        DBBloomFilterTestVaryPrefixAndFormatVer,
+                        ::testing::Values(
+                            // (use_prefix, format_version)
+                            std::make_tuple(false, 2),
+                            std::make_tuple(false, 3),
+                            std::make_tuple(false, 4),
+                            std::make_tuple(false, 5), std::make_tuple(true, 2),
+                            std::make_tuple(true, 3), std::make_tuple(true, 4),
+                            std::make_tuple(true, 5)));
+
+#ifndef ROCKSDB_LITE
+namespace {
+static const std::string kPlainTable = "test_PlainTableBloom";
+}  // anonymous namespace
+
+class BloomStatsTestWithParam
+    : public DBBloomFilterTest,
+      public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+  BloomStatsTestWithParam() {
+    bfp_impl_ = std::get<0>(GetParam());
+    partition_filters_ = std::get<1>(GetParam());
+
+    options_.create_if_missing = true;
+    options_.prefix_extractor.reset(
+        ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4));
+    options_.memtable_prefix_bloom_size_ratio =
+        8.0 * 1024.0 / static_cast<double>(options_.write_buffer_size);
+    if (bfp_impl_ == kPlainTable) {
+      assert(!partition_filters_);  // not supported in plain table
+      PlainTableOptions table_options;
+      options_.table_factory.reset(NewPlainTableFactory(table_options));
+    } else {
+      BlockBasedTableOptions table_options;
+      if (partition_filters_) {
+        table_options.partition_filters = partition_filters_;
+        table_options.index_type =
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      }
+      table_options.filter_policy = Create(10, bfp_impl_);
+      options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    }
+    options_.env = env_;
+
+    get_perf_context()->Reset();
+    DestroyAndReopen(options_);
+  }
+
+  ~BloomStatsTestWithParam() override {
+    get_perf_context()->Reset();
+    Destroy(options_);
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  std::string bfp_impl_;
+  bool partition_filters_;
+  Options options_;
+};
+
+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expext memtable bloom hit stat to be 2
+// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1
+// 4 Call Flush() to create SST
+// 5 Call Get() for both keys - expext SST bloom hit stat to be 2
+// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1
+// Test both: block and plain SST
+TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  // check memtable bloom stats
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+  // sanity checks
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+  Flush();
+
+  // sanity checks
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+  // check SST bloom stats
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+}
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+  // check memtable bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+  Flush();
+
+  iter.reset(dbfull()->NewIterator(ReadOptions()));
+
+  // Check SST bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  uint64_t expected_hits = 2;
+  ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+  ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    BloomStatsTestWithParam, BloomStatsTestWithParam,
+    ::testing::Values(std::make_tuple(kLegacyBloom, false),
+                      std::make_tuple(kLegacyBloom, true),
+                      std::make_tuple(kFastLocalBloom, false),
+                      std::make_tuple(kFastLocalBloom, true),
+                      std::make_tuple(kPlainTable, false)));
+
+namespace {
+void PrefixScanInit(DBBloomFilterTest* dbtest) {
+  char buf[100];
+  std::string keystr;
+  const int small_range_sstfiles = 5;
+  const int big_range_sstfiles = 5;
+
+  // Generate 11 sst files with the following prefix ranges.
+  // GROUP 0: [0,10]                              (level 1)
+  // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6]  (level 0)
+  // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10]  (level 0)
+  //
+  // A seek with the previous API would do 11 random I/Os (to all the
+  // files).  With the new API and a prefix filter enabled, we should
+  // only do 2 random I/O, to the 2 files containing the key.
+
+  // GROUP 0
+  snprintf(buf, sizeof(buf), "%02d______:start", 0);
+  keystr = std::string(buf);
+  ASSERT_OK(dbtest->Put(keystr, keystr));
+  snprintf(buf, sizeof(buf), "%02d______:end", 10);
+  keystr = std::string(buf);
+  ASSERT_OK(dbtest->Put(keystr, keystr));
+  ASSERT_OK(dbtest->Flush());
+  ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr,
+                                           nullptr));  // move to level 1
+
+  // GROUP 1
+  for (int i = 1; i <= small_range_sstfiles; i++) {
+    snprintf(buf, sizeof(buf), "%02d______:start", i);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    snprintf(buf, sizeof(buf), "%02d______:end", i + 1);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    dbtest->Flush();
+  }
+
+  // GROUP 2
+  for (int i = 1; i <= big_range_sstfiles; i++) {
+    snprintf(buf, sizeof(buf), "%02d______:start", 0);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    dbtest->Flush();
+  }
+}
+}  // anonymous namespace
+
+TEST_F(DBBloomFilterTest, PrefixScan) {
+  while (ChangeFilterOptions()) {
+    int count;
+    Slice prefix;
+    Slice key;
+    char buf[100];
+    Iterator* iter;
+    snprintf(buf, sizeof(buf), "03______:");
+    prefix = Slice(buf, 8);
+    key = Slice(buf, 9);
+    ASSERT_EQ(key.difference_offset(prefix), 8);
+    ASSERT_EQ(prefix.difference_offset(key), 8);
+    // db configs
+    env_->count_random_reads_ = true;
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.disable_auto_compactions = true;
+    options.max_background_compactions = 2;
+    options.create_if_missing = true;
+    options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+    assert(!options.unordered_write);
+    // It is incompatible with allow_concurrent_memtable_write=false
+    options.allow_concurrent_memtable_write = false;
+
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = true;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+    table_options.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    // 11 RAND I/Os
+    DestroyAndReopen(options);
+    PrefixScanInit(this);
+    count = 0;
+    env_->random_read_counter_.Reset();
+    iter = db_->NewIterator(ReadOptions());
+    for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+      if (!iter->key().starts_with(prefix)) {
+        break;
+      }
+      count++;
+    }
+    ASSERT_OK(iter->status());
+    delete iter;
+    ASSERT_EQ(count, 2);
+    ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+    Close();
+  }  // end of while
+}
+
+TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 64 * 1024;
+  options.arena_block_size = 4 * 1024;
+  options.target_file_size_base = 64 * 1024;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 4;
+  options.max_bytes_for_level_base = 256 * 1024;
+  options.max_write_buffer_number = 2;
+  options.max_background_compactions = 8;
+  options.max_background_flushes = 8;
+  options.compression = kNoCompression;
+  options.compaction_style = kCompactionStyleLevel;
+  options.level_compaction_dynamic_level_bytes = true;
+  BlockBasedTableOptions bbto;
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.optimize_filters_for_hits = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  get_perf_context()->Reset();
+  get_perf_context()->EnablePerLevelPerfContext();
+  CreateAndReopenWithCF({"mypikachu"}, options);
+
+  int numkeys = 200000;
+
+  // Generate randomly shuffled keys, so the updates are almost
+  // random.
+  std::vector<int> keys;
+  keys.reserve(numkeys);
+  for (int i = 0; i < numkeys; i += 2) {
+    keys.push_back(i);
+  }
+  RandomShuffle(std::begin(keys), std::end(keys), /*seed*/ 42);
+  int num_inserted = 0;
+  for (int key : keys) {
+    ASSERT_OK(Put(1, Key(key), "val"));
+    if (++num_inserted % 1000 == 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+  ASSERT_OK(Put(1, Key(0), "val"));
+  ASSERT_OK(Put(1, Key(numkeys), "val"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  if (NumTableFilesAtLevel(0, 1) == 0) {
+    // No Level 0 file. Create one.
+    ASSERT_OK(Put(1, Key(0), "val"));
+    ASSERT_OK(Put(1, Key(numkeys), "val"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  for (int i = 1; i < numkeys; i += 2) {
+    ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
+  }
+
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  // Now we have three sorted run, L0, L5 and L6 with most files in L6 have
+  // no bloom filter. Most keys be checked bloom filters twice.
+  ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2);
+  ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2);
+  uint64_t bloom_filter_useful_all_levels = 0;
+  for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+    if (kv.second.bloom_filter_useful > 0) {
+      bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+    }
+  }
+  ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2);
+  ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2);
+
+  for (int i = 0; i < numkeys; i += 2) {
+    ASSERT_EQ(Get(1, Key(i)), "val");
+  }
+
+  // Part 2 (read path): rewrite last level with blooms, then verify they get
+  // cached only if !optimize_filters_for_hits
+  options.disable_auto_compactions = true;
+  options.num_levels = 9;
+  options.optimize_filters_for_hits = false;
+  options.statistics = CreateDBStatistics();
+  bbto.block_cache.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+  MoveFilesToLevel(7 /* level */, 1 /* column family index */);
+
+  std::string value = Get(1, Key(0));
+  uint64_t prev_cache_filter_hits =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  value = Get(1, Key(0));
+  ASSERT_EQ(prev_cache_filter_hits + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  // Now that we know the filter blocks exist in the last level files, see if
+  // filter caching is skipped for this optimization
+  options.optimize_filters_for_hits = true;
+  options.statistics = CreateDBStatistics();
+  bbto.block_cache.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  value = Get(1, Key(0));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(2 /* index and data block */,
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+  // Check filter block ignored for files preloaded during DB::Open()
+  options.max_open_files = -1;
+  options.statistics = CreateDBStatistics();
+  bbto.block_cache.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  uint64_t prev_cache_filter_misses =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  Get(1, Key(0));
+  ASSERT_EQ(prev_cache_filter_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(prev_cache_filter_hits,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+  // Check filter block ignored for file trivially-moved to bottom level
+  bbto.block_cache.reset();
+  options.max_open_files = 100;  // setting > -1 makes it not preload all files
+  options.statistics = CreateDBStatistics();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  ASSERT_OK(Put(1, Key(numkeys + 1), "val"));
+  ASSERT_OK(Flush(1));
+
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CompactRangeOptions compact_options;
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kSkip;
+  compact_options.change_level = true;
+  compact_options.target_level = 7;
+  ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+                  .IsNotSupported());
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  prev_cache_filter_misses =
+      TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  value = Get(1, Key(numkeys + 1));
+  ASSERT_EQ(prev_cache_filter_hits,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(prev_cache_filter_misses,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+
+  // Check filter block not cached for iterator
+  bbto.block_cache.reset();
+  options.statistics = CreateDBStatistics();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions(), handles_[1]));
+  iter->SeekToFirst();
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(2 /* index and data block */,
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  get_perf_context()->Reset();
+}
+
+int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) {
+  int count = 0;
+  for (iter->Seek(key); iter->Valid(); iter->Next()) {
+    count++;
+  }
+  EXPECT_OK(iter->status());
+  return count;
+}
+
+// use iterate_upper_bound to hint compatiability of existing bloom filters.
+// The BF is considered compatible if 1) upper bound and seek key transform
+// into the same string, or 2) the transformed seek key is of the same length
+// as the upper bound and two keys are adjacent according to the comparator.
+TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
+  for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+    Options options;
+    options.create_if_missing = true;
+    options.env = CurrentOptions().env;
+    options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+    options.disable_auto_compactions = true;
+    options.statistics = CreateDBStatistics();
+    // Enable prefix bloom for SST files
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.filter_policy = Create(10, bfp_impl);
+    table_options.index_shortening = BlockBasedTableOptions::
+        IndexShorteningMode::kShortenSeparatorsAndSuccessor;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("abcdxxx0", "val1"));
+    ASSERT_OK(Put("abcdxxx1", "val2"));
+    ASSERT_OK(Put("abcdxxx2", "val3"));
+    ASSERT_OK(Put("abcdxxx3", "val4"));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+    {
+      // prefix_extractor has not changed, BF will always be read
+      Slice upper_bound("abce");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+    }
+    {
+      Slice upper_bound("abcdzzzz");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}}));
+    ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+              "rocksdb.FixedPrefix.5");
+    {
+      // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read
+      Slice upper_bound("abce");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcdxx00"), 4);
+      // should check bloom filter since upper bound meets requirement
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    {
+      // [abcdxx01, abcey) is not valid bound since upper bound is too long for
+      // the BF in SST (capped:4)
+      Slice upper_bound("abcey");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcdxx01"), 4);
+      // should skip bloom filter since upper bound is too long
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    {
+      // [abcdxx02, abcdy) is a valid bound since the prefix is the same
+      Slice upper_bound("abcdy");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abcdxx02"), 4);
+      // should check bloom filter since upper bound matches transformed seek
+      // key
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    {
+      // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the
+      // same prefix, 2) the prefixes are not consecutive
+      Slice upper_bound("abce");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0);
+      // should skip bloom filter since mismatch is found
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}}));
+    {
+      // [abc, abd) is not a valid bound since the upper bound is too short
+      // for BF (capped:4)
+      Slice upper_bound("abd");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abc"), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    // Same with re-open
+    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+    Reopen(options);
+    {
+      Slice upper_bound("abd");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abc"), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    // Set back to capped:4 and verify BF is always read
+    options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+    Reopen(options);
+    {
+      Slice upper_bound("abd");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abc"), 0);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+    }
+    // Same if there's a problem initally loading prefix transform
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+        [&](void* arg) { *static_cast<bool*>(arg) = true; });
+    SyncPoint::GetInstance()->EnableProcessing();
+    Reopen(options);
+    {
+      Slice upper_bound("abd");
+      ReadOptions read_options;
+      read_options.prefix_same_as_start = true;
+      read_options.iterate_upper_bound = &upper_bound;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "abc"), 0);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+    }
+    SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+// Create multiple SST files each with a different prefix_extractor config,
+// verify iterators can read all SST files using the latest config.
+TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
+  for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+    Options options;
+    options.env = CurrentOptions().env;
+    options.create_if_missing = true;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+    options.disable_auto_compactions = true;
+    options.statistics = CreateDBStatistics();
+    // Enable prefix bloom for SST files
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy = Create(10, bfp_impl);
+    table_options.cache_index_and_filter_blocks = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    Slice upper_bound("foz90000");
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+
+    // first SST with fixed:1 BF
+    ASSERT_OK(Put("foo2", "bar2"));
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Put("foq1", "bar1"));
+    ASSERT_OK(Put("fpa", "0"));
+    dbfull()->Flush(FlushOptions());
+    std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1);
+
+    ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+    ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+              "rocksdb.CappedPrefix.3");
+    read_options.iterate_upper_bound = &upper_bound;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "foo"), 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+    ASSERT_EQ(CountIter(iter, "gpk"), 0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+    // second SST with capped:3 BF
+    ASSERT_OK(Put("foo3", "bar3"));
+    ASSERT_OK(Put("foo4", "bar4"));
+    ASSERT_OK(Put("foq5", "bar5"));
+    ASSERT_OK(Put("fpb", "1"));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+    {
+      // BF is cappped:3 now
+      std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter_tmp, "foo"), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+      ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+      // both counters are incremented because BF is "not changed" for 1 of the
+      // 2 SST files, so filter is checked once and found no match.
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+    }
+
+    ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+    ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+              "rocksdb.FixedPrefix.2");
+    // third SST with fixed:2 BF
+    ASSERT_OK(Put("foo6", "bar6"));
+    ASSERT_OK(Put("foo7", "bar7"));
+    ASSERT_OK(Put("foq8", "bar8"));
+    ASSERT_OK(Put("fpc", "2"));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+    {
+      // BF is fixed:2 now
+      std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter_tmp, "foo"), 9);
+      // the first and last BF are checked
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 7);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+      ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+      // only last BF is checked and not found
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 8);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+    }
+
+    // iter_old can only see the first SST, so checked plus 1
+    ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
+    // iter was created after the first setoptions call so only full filter
+    // will check the filter
+    ASSERT_EQ(CountIter(iter, "foo"), 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 10);
+
+    {
+      // keys in all three SSTs are visible to iterator
+      // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2)
+      // so +2 for checked counter
+      std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter_all, "foo"), 9);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+      ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 13);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+    }
+    ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+    ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+              "rocksdb.CappedPrefix.3");
+    {
+      std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter_all, "foo"), 6);
+      // all three SST are checked because the current options has the same as
+      // the remaining SST (capped:3)
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 16);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+      ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 17);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4);
+    }
+    // TODO(Zhongyi): Maybe also need to add Get calls to test point look up?
+  }
+}
+
+// Create a new column family in a running DB, change prefix_extractor
+// dynamically, verify the iterator created on the new column family behaves
+// as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
+  int iteration = 0;
+  for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+    options.disable_auto_compactions = true;
+    options.statistics = CreateDBStatistics();
+    // Enable prefix bloom for SST files
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.filter_policy = Create(10, bfp_impl);
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options);
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    // create a new CF and set prefix_extractor dynamically
+    options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+    CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options);
+    ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(),
+              "rocksdb.CappedPrefix.3");
+    ASSERT_OK(Put(2, "foo3", "bar3"));
+    ASSERT_OK(Put(2, "foo4", "bar4"));
+    ASSERT_OK(Put(2, "foo5", "bar5"));
+    ASSERT_OK(Put(2, "foq6", "bar6"));
+    ASSERT_OK(Put(2, "fpq7", "bar7"));
+    dbfull()->Flush(FlushOptions());
+    {
+      std::unique_ptr<Iterator> iter(
+          db_->NewIterator(read_options, handles_[2]));
+      ASSERT_EQ(CountIter(iter, "foo"), 3);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    ASSERT_OK(
+        dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}}));
+    ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(),
+              "rocksdb.FixedPrefix.2");
+    {
+      std::unique_ptr<Iterator> iter(
+          db_->NewIterator(read_options, handles_[2]));
+      ASSERT_EQ(CountIter(iter, "foo"), 4);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[2]));
+    ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2]));
+    handles_[2] = nullptr;
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+    ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+    handles_[1] = nullptr;
+    iteration++;
+  }
+}
+
+// Verify it's possible to change prefix_extractor at runtime and iterators
+// behaves as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
+  for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+    Options options;
+    options.env = CurrentOptions().env;
+    options.create_if_missing = true;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+    options.disable_auto_compactions = true;
+    options.statistics = CreateDBStatistics();
+    // Enable prefix bloom for SST files
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.filter_policy = Create(10, bfp_impl);
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("foo2", "bar2"));
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Put("foo1", "bar1"));
+    ASSERT_OK(Put("fpa", "0"));
+    dbfull()->Flush(FlushOptions());
+    ASSERT_OK(Put("foo3", "bar3"));
+    ASSERT_OK(Put("foo4", "bar4"));
+    ASSERT_OK(Put("foo5", "bar5"));
+    ASSERT_OK(Put("fpb", "1"));
+    dbfull()->Flush(FlushOptions());
+    ASSERT_OK(Put("foo6", "bar6"));
+    ASSERT_OK(Put("foo7", "bar7"));
+    ASSERT_OK(Put("foo8", "bar8"));
+    ASSERT_OK(Put("fpc", "2"));
+    dbfull()->Flush(FlushOptions());
+
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    {
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      ASSERT_EQ(CountIter(iter, "foo"), 12);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+    std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+    ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+    ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+              "rocksdb.CappedPrefix.3");
+    {
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      // "fp*" should be skipped
+      ASSERT_EQ(CountIter(iter, "foo"), 9);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+      ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    }
+
+    // iterator created before should not be affected and see all keys
+    ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    ASSERT_EQ(CountIter(iter_old, "abc"), 0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+  }
+}
+
+TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) {
+  Options options = CurrentOptions();
+  constexpr size_t kNumKeys = 10000;
+  static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000");
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeys + 10));
+  options.create_if_missing = true;
+  constexpr size_t kPrefixLength = 4;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
+  options.compression = kNoCompression;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(50));
+  bbto.index_shortening =
+      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+  bbto.block_size = 128;
+  bbto.metadata_block_size = 128;
+  bbto.partition_filters = true;
+  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  const std::string value(64, '\0');
+
+  WriteOptions write_opts;
+  write_opts.disableWAL = true;
+  for (size_t i = 0; i < kNumKeys; ++i) {
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(4) << std::fixed << i;
+    ASSERT_OK(db_->Put(write_opts, oss.str(), value));
+  }
+  ASSERT_OK(Flush());
+
+  ReadOptions read_opts;
+  // Use legacy, implicit prefix seek
+  read_opts.total_order_seek = false;
+  read_opts.auto_prefix_mode = false;
+  std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+  for (size_t i = 0; i < kNumKeys; ++i) {
+    // Seek with a key after each one added but with same prefix. One will
+    // surely cross a partition boundary.
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
+    it->SeekForPrev(oss.str());
+    ASSERT_OK(it->status());
+    ASSERT_TRUE(it->Valid());
+  }
+  it.reset();
+}
+
+namespace {
+class BackwardBytewiseComparator : public Comparator {
+ public:
+  const char* Name() const override { return "BackwardBytewiseComparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    int min_size_neg = -static_cast<int>(std::min(a.size(), b.size()));
+    const char* a_end = a.data() + a.size();
+    const char* b_end = b.data() + b.size();
+    for (int i = -1; i >= min_size_neg; --i) {
+      if (a_end[i] != b_end[i]) {
+        if (static_cast<unsigned char>(a_end[i]) <
+            static_cast<unsigned char>(b_end[i])) {
+          return -1;
+        } else {
+          return 1;
+        }
+      }
+    }
+    return static_cast<int>(a.size()) - static_cast<int>(b.size());
+  }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+const BackwardBytewiseComparator kBackwardBytewiseComparator{};
+
+class FixedSuffix4Transform : public SliceTransform {
+  const char* Name() const override { return "FixedSuffixTransform"; }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data() + src.size() - 4, 4);
+  }
+
+  bool InDomain(const Slice& src) const override { return src.size() >= 4; }
+};
+
+std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
+  if (sst) {
+    return {
+        options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_CHECKED),
+        options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_USEFUL)};
+  } else {
+    auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
+    auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
+    return {hit + miss, miss};
+  }
+}
+
+std::pair<uint64_t, uint64_t> CheckedAndUseful(uint64_t checked,
+                                               uint64_t useful) {
+  return {checked, useful};
+}
+}  // anonymous namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * key.starts_with(prefix(key))
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.comparator = &kBackwardBytewiseComparator;
+  options.prefix_extractor = std::make_shared<FixedSuffix4Transform>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.statistics = CreateDBStatistics();
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("321aaaa", "val1"));
+  ASSERT_OK(Put("112aaaa", "val2"));
+  ASSERT_OK(Put("009aaaa", "val3"));
+  ASSERT_OK(Put("baa", "val4"));  // out of domain
+  ASSERT_OK(Put("321abaa", "val5"));
+  ASSERT_OK(Put("zzz", "val6"));  // out of domain
+
+  for (auto flushed : {false, true}) {
+    SCOPED_TRACE("flushed=" + std::to_string(flushed));
+    if (flushed) {
+      ASSERT_OK(Flush());
+    }
+    ReadOptions read_options;
+    if (flushed) {  // TODO: support auto_prefix_mode in memtable?
+      read_options.auto_prefix_mode = true;
+    }
+    EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+    {
+      Slice ub("999aaaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("999abaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "abaa"), 1);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("999acaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "acaa"), 0);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+    }
+    {
+      Slice ub("zzzz");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "baa"), 3);
+      if (flushed) {  // TODO: fix memtable case
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+    }
+  }
+}
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * Compare(prefix(key), key) <= 0
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.comparator = ReverseBytewiseComparator();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.statistics = CreateDBStatistics();
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("aaaa123", "val1"));
+  ASSERT_OK(Put("aaaa211", "val2"));
+  ASSERT_OK(Put("aaaa900", "val3"));
+  ASSERT_OK(Put("aab", "val4"));  // out of domain
+  ASSERT_OK(Put("aaba123", "val5"));
+  ASSERT_OK(Put("qqqq123", "val7"));
+  ASSERT_OK(Put("qqqq", "val8"));
+  ASSERT_OK(Put("zzz", "val8"));  // out of domain
+
+  for (auto flushed : {false, true}) {
+    SCOPED_TRACE("flushed=" + std::to_string(flushed));
+    if (flushed) {
+      ASSERT_OK(Flush());
+    }
+    ReadOptions read_options;
+    if (flushed) {  // TODO: support auto_prefix_mode in memtable?
+      read_options.auto_prefix_mode = true;
+    } else {
+      // TODO: why needed?
+      get_perf_context()->bloom_memtable_hit_count = 0;
+      get_perf_context()->bloom_memtable_miss_count = 0;
+    }
+    EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+    {
+      Slice ub("aaaa000");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      // Note: prefix does work as upper bound
+      Slice ub("aaaa");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      // Note: prefix does not work here as seek key
+      Slice ub("aaaa500");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaaa"), 0);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("aaba000");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaba999"), 1);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+    {
+      Slice ub("aaca000");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "aaca999"), 0);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+    }
+    {
+      Slice ub("aaaz");
+      read_options.iterate_upper_bound = &ub;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "zzz"), 5);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+    }
+    {
+      // Note: prefix does work here as seek key, but only finds key equal
+      // to prefix (others with same prefix are less)
+      read_options.auto_prefix_mode = false;
+      read_options.iterate_upper_bound = nullptr;
+      read_options.prefix_same_as_start = true;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+      EXPECT_EQ(CountIter(iter, "qqqq"), 1);
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+    }
+  }
+}
+
+namespace {
+// A weird comparator that in combination with NonIdempotentFixed4Transform
+// breaks an old axiom of prefix filtering.
+class WeirdComparator : public Comparator {
+ public:
+  const char* Name() const override { return "WeirdComparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    bool a_in = a.size() >= 5;
+    bool b_in = b.size() >= 5;
+    if (a_in != b_in) {
+      // Order keys after prefixes
+      return a_in - b_in;
+    }
+    if (a_in) {
+      return BytewiseComparator()->Compare(a, b);
+    } else {
+      // Different ordering on the prefixes
+      return ReverseBytewiseComparator()->Compare(a, b);
+    }
+  }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+const WeirdComparator kWeirdComparator{};
+
+// Non-idempotentent because prefix is always 4 bytes, but this is
+// out-of-domain for keys to be assigned prefixes (>= 5 bytes)
+class NonIdempotentFixed4Transform : public SliceTransform {
+  const char* Name() const override { return "NonIdempotentFixed4Transform"; }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 4);
+  }
+
+  bool InDomain(const Slice& src) const override { return src.size() >= 5; }
+};
+}  // anonymous namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// two of the old obsolete, unnecessary axioms of prefix extraction:
+// * prefix(prefix(key)) == prefix(key)
+// * If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) {
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<NonIdempotentFixed4Transform>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.statistics = CreateDBStatistics();
+
+  for (auto weird_comparator : {false, true}) {
+    if (weird_comparator) {
+      options.comparator = &kWeirdComparator;
+    }
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("aaaa123", "val1"));
+    ASSERT_OK(Put("aaaa211", "val2"));
+    ASSERT_OK(Put("aaaa900", "val3"));
+    ASSERT_OK(Put("aab", "val4"));  // out of domain
+    ASSERT_OK(Put("aaba123", "val5"));
+    ASSERT_OK(Put("qqqq123", "val7"));
+    ASSERT_OK(Put("qqqq", "val8"));  // out of domain
+    ASSERT_OK(Put("zzzz", "val8"));  // out of domain
+
+    for (auto flushed : {false, true}) {
+      SCOPED_TRACE("flushed=" + std::to_string(flushed));
+      if (flushed) {
+        ASSERT_OK(Flush());
+      }
+      ReadOptions read_options;
+      if (flushed) {  // TODO: support auto_prefix_mode in memtable?
+        read_options.auto_prefix_mode = true;
+      } else {
+        // TODO: why needed?
+        get_perf_context()->bloom_memtable_hit_count = 0;
+        get_perf_context()->bloom_memtable_miss_count = 0;
+      }
+      EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      {
+        Slice ub("aaaa999");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaaa000"), 3);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+      }
+      {
+        // Note: prefix as seek key is not bloom-optimized
+        // Note: the count works with weird_comparator because "aaaa" is
+        // ordered as the last of the prefixes
+        Slice ub("aaaa999");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+      {
+        Slice ub("aaba9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaba0"), 1);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+      }
+      {
+        Slice ub("aaca9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aaca0"), 0);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+      }
+      {
+        Slice ub("qqqq9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "qqqq0"), 1);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+      }
+      {
+        // Note: prefix as seek key is not bloom-optimized
+        Slice ub("qqqq9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "qqqq"), weird_comparator ? 7 : 2);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+      {
+        // Note: prefix as seek key is not bloom-optimized
+        Slice ub("zzzz9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "zzzz"), weird_comparator ? 8 : 1);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+      {
+        Slice ub("zzzz9");
+        read_options.iterate_upper_bound = &ub;
+        std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+        EXPECT_EQ(CountIter(iter, "aab"), weird_comparator ? 6 : 5);
+        EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+      }
+    }
+  }
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_filter_test.cc b/src/rocksdb/db/db_compaction_filter_test.cc
new file mode 100644
index 000000000..be863d4f6
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_filter_test.cc
@@ -0,0 +1,1036 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static int cfilter_count = 0;
+static int cfilter_skips = 0;
+
+// This is a static filter used for filtering
+// kvs during the compaction process.
+static std::string NEW_VALUE = "NewValue";
+
+class DBTestCompactionFilter : public DBTestBase {
+ public:
+  DBTestCompactionFilter()
+      : DBTestBase("db_compaction_filter_test", /*env_do_fsync=*/true) {}
+};
+
+// Param variant of DBTestBase::ChangeCompactOptions
+class DBTestCompactionFilterWithCompactParam
+    : public DBTestCompactionFilter,
+      public ::testing::WithParamInterface<DBTestBase::OptionConfig> {
+ public:
+  DBTestCompactionFilterWithCompactParam() : DBTestCompactionFilter() {
+    option_config_ = GetParam();
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    if (option_config_ == kDefault || option_config_ == kUniversalCompaction ||
+        option_config_ == kUniversalCompactionMultiLevel) {
+      options.create_if_missing = true;
+    }
+    if (option_config_ == kLevelSubcompactions ||
+        option_config_ == kUniversalSubcompactions) {
+      assert(options.max_subcompactions > 1);
+    }
+    Reopen(options);
+  }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+    CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam,
+    ::testing::Values(DBTestBase::OptionConfig::kDefault,
+                      DBTestBase::OptionConfig::kUniversalCompaction,
+                      DBTestBase::OptionConfig::kUniversalCompactionMultiLevel,
+                      DBTestBase::OptionConfig::kLevelSubcompactions,
+                      DBTestBase::OptionConfig::kUniversalSubcompactions));
+#else
+// Run fewer cases in non-full valgrind to save time.
+INSTANTIATE_TEST_CASE_P(CompactionFilterWithOption,
+                        DBTestCompactionFilterWithCompactParam,
+                        ::testing::Values(DBTestBase::OptionConfig::kDefault));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class KeepFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    cfilter_count++;
+    return false;
+  }
+
+  const char* Name() const override { return "KeepFilter"; }
+};
+
+class DeleteFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    cfilter_count++;
+    return true;
+  }
+
+  bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
+                          const Slice& /*operand*/) const override {
+    return true;
+  }
+
+  const char* Name() const override { return "DeleteFilter"; }
+};
+
+class DeleteISFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    cfilter_count++;
+    int i = std::stoi(key.ToString());
+    if (i > 5 && i <= 105) {
+      return true;
+    }
+    return false;
+  }
+
+  bool IgnoreSnapshots() const override { return true; }
+
+  const char* Name() const override { return "DeleteFilter"; }
+};
+
+// Skip x if floor(x/10) is even, use range skips. Requires that keys are
+// zero-padded to length 10.
+class SkipEvenFilter : public CompactionFilter {
+ public:
+  Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+                    const Slice& /*existing_value*/, std::string* /*new_value*/,
+                    std::string* skip_until) const override {
+    cfilter_count++;
+    int i = std::stoi(key.ToString());
+    if (i / 10 % 2 == 0) {
+      char key_str[100];
+      snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10);
+      *skip_until = key_str;
+      ++cfilter_skips;
+      return Decision::kRemoveAndSkipUntil;
+    }
+    return Decision::kKeep;
+  }
+
+  bool IgnoreSnapshots() const override { return true; }
+
+  const char* Name() const override { return "DeleteFilter"; }
+};
+
+class ConditionalFilter : public CompactionFilter {
+ public:
+  explicit ConditionalFilter(const std::string* filtered_value)
+      : filtered_value_(filtered_value) {}
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return value.ToString() == *filtered_value_;
+  }
+
+  const char* Name() const override { return "ConditionalFilter"; }
+
+ private:
+  const std::string* filtered_value_;
+};
+
+class ChangeFilter : public CompactionFilter {
+ public:
+  explicit ChangeFilter() {}
+
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* new_value, bool* value_changed) const override {
+    assert(new_value != nullptr);
+    *new_value = NEW_VALUE;
+    *value_changed = true;
+    return false;
+  }
+
+  const char* Name() const override { return "ChangeFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false,
+                             bool check_context_cf_id = false)
+      : check_context_(check_context),
+        check_context_cf_id_(check_context_cf_id),
+        compaction_filter_created_(false) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    if (check_context_cf_id_) {
+      EXPECT_EQ(expect_cf_id_.load(), context.column_family_id);
+    }
+    compaction_filter_created_ = true;
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
+
+  bool compaction_filter_created() const { return compaction_filter_created_; }
+
+  const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  bool check_context_cf_id_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+  std::atomic<uint32_t> expect_cf_id_;
+  bool compaction_filter_created_;
+};
+
+// This filter factory is configured with a `TableFileCreationReason`. Only
+// table files created for that reason will undergo filtering. This
+// configurability makes it useful to tests for filtering non-compaction table
+// files, such as "CompactionFilterFlush" and "CompactionFilterRecovery".
+class DeleteFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit DeleteFilterFactory(TableFileCreationReason reason)
+      : reason_(reason) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    EXPECT_EQ(reason_, context.reason);
+    if (context.reason == TableFileCreationReason::kCompaction &&
+        !context.is_manual_compaction) {
+      // Table files created by automatic compaction do not undergo filtering.
+      // Presumably some tests rely on this.
+      return std::unique_ptr<CompactionFilter>(nullptr);
+    }
+    return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+  }
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    return reason_ == reason;
+  }
+
+  const char* Name() const override { return "DeleteFilterFactory"; }
+
+ private:
+  const TableFileCreationReason reason_;
+};
+
+// Delete Filter Factory which ignores snapshots
+class DeleteISFilterFactory : public CompactionFilterFactory {
+ public:
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (context.is_manual_compaction) {
+      return std::unique_ptr<CompactionFilter>(new DeleteISFilter());
+    } else {
+      return std::unique_ptr<CompactionFilter>(nullptr);
+    }
+  }
+
+  const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+class SkipEvenFilterFactory : public CompactionFilterFactory {
+ public:
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (context.is_manual_compaction) {
+      return std::unique_ptr<CompactionFilter>(new SkipEvenFilter());
+    } else {
+      return std::unique_ptr<CompactionFilter>(nullptr);
+    }
+  }
+
+  const char* Name() const override { return "SkipEvenFilterFactory"; }
+};
+
+class ConditionalFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ConditionalFilterFactory(const Slice& filtered_value)
+      : filtered_value_(filtered_value.ToString()) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>(
+        new ConditionalFilter(&filtered_value_));
+  }
+
+  const char* Name() const override { return "ConditionalFilterFactory"; }
+
+ private:
+  std::string filtered_value_;
+};
+
+class ChangeFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ChangeFilterFactory() {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>(new ChangeFilter());
+  }
+
+  const char* Name() const override { return "ChangeFilterFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilter) {
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  options.num_levels = 3;
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write 100K keys, these are written to a few files in L0.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+
+  // Push all files to the highest level L2. Verify that
+  // the compaction is each level invokes the filter for
+  // all the keys in that level.
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+  cfilter_count = 0;
+
+  // All the files are in the lowest level.
+  // Verify that all but the 100001st record
+  // has sequence number zero. The 100001st record
+  // is at the tip of this snapshot and cannot
+  // be zeroed out.
+  int count = 0;
+  int total = 0;
+  Arena arena;
+  {
+    InternalKeyComparator icmp(options.comparator);
+    ReadOptions read_options;
+    ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+        read_options, &arena, kMaxSequenceNumber, handles_[1]));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+      total++;
+      if (ikey.sequence != 0) {
+        count++;
+      }
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+  }
+  ASSERT_EQ(total, 100000);
+  ASSERT_EQ(count, 0);
+
+  // overwrite all the 100K keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+
+  // push all files to the highest level L2. This
+  // means that all keys should pass at least once
+  // via the compaction filter
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+  // create a new database with the compaction
+  // filter in such a way that it deletes all keys
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+      TableFileCreationReason::kCompaction);
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // write all the keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
+
+  // Push all files to the highest level L2. This
+  // triggers the compaction filter to delete all keys,
+  // verify that at the end of the compaction process,
+  // nothing is left.
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  ASSERT_EQ(cfilter_count, 0);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+  {
+    // Scan the entire database to ensure that nothing is left
+    std::unique_ptr<Iterator> iter(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    iter->SeekToFirst();
+    count = 0;
+    while (iter->Valid()) {
+      count++;
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(count, 0);
+  }
+
+  // The sequence number of the remaining record
+  // is not zeroed out even though it is at the
+  // level Lmax because this record is at the tip
+  count = 0;
+  {
+    InternalKeyComparator icmp(options.comparator);
+    ReadOptions read_options;
+    ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+        read_options, &arena, kMaxSequenceNumber, handles_[1]));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+      ASSERT_NE(ikey.sequence, (unsigned)0);
+      count++;
+      iter->Next();
+    }
+    ASSERT_EQ(count, 0);
+  }
+}
+
+// Tests the edge case where compaction does not produce any output -- all
+// entries are deleted. The compaction should create bunch of 'DeleteFile'
+// entries in VersionEdit, but none of the 'AddFile's.
+TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+      TableFileCreationReason::kCompaction);
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // put some data
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // this will produce empty file (delete compaction filter)
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
+
+  Reopen(options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  ASSERT_OK(itr->status());
+  // empty db
+  ASSERT_TRUE(!itr->Valid());
+
+  delete itr;
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterFlush) {
+  // Tests a `CompactionFilterFactory` that filters when table file is created
+  // by flush.
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<DeleteFilterFactory>(TableFileCreationReason::kFlush);
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen(options);
+
+  // Puts and Merges are purged in flush.
+  ASSERT_OK(Put("a", "v"));
+  ASSERT_OK(Merge("b", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("NOT_FOUND", Get("a"));
+  ASSERT_EQ("NOT_FOUND", Get("b"));
+
+  // However, Puts and Merges are preserved by recovery.
+  ASSERT_OK(Put("a", "v"));
+  ASSERT_OK(Merge("b", "v"));
+  Reopen(options);
+  ASSERT_EQ("v", Get("a"));
+  ASSERT_EQ("v", Get("b"));
+
+  // Likewise, compaction does not apply filtering.
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("v", Get("a"));
+  ASSERT_EQ("v", Get("b"));
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterRecovery) {
+  // Tests a `CompactionFilterFactory` that filters when table file is created
+  // by recovery.
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<DeleteFilterFactory>(TableFileCreationReason::kRecovery);
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen(options);
+
+  // Puts and Merges are purged in recovery.
+  ASSERT_OK(Put("a", "v"));
+  ASSERT_OK(Merge("b", "v"));
+  Reopen(options);
+  ASSERT_EQ("NOT_FOUND", Get("a"));
+  ASSERT_EQ("NOT_FOUND", Get("b"));
+
+  // However, Puts and Merges are preserved by flush.
+  ASSERT_OK(Put("a", "v"));
+  ASSERT_OK(Merge("b", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("v", Get("a"));
+  ASSERT_EQ("v", Get("b"));
+
+  // Likewise, compaction does not apply filtering.
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("v", Get("a"));
+  ASSERT_EQ("v", Get("b"));
+}
+
+TEST_P(DBTestCompactionFilterWithCompactParam,
+       CompactionFilterWithValueChange) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.compaction_filter_factory = std::make_shared<ChangeFilterFactory>();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write 100K+1 keys, these are written to a few files
+  // in L0. We do this so that the current snapshot points
+  // to the 100001 key.The compaction filter is  not invoked
+  // on keys that are visible via a snapshot because we
+  // anyways cannot delete it.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+
+  // push all files to  lower levels
+  ASSERT_OK(Flush(1));
+  if (option_config_ != kUniversalCompactionMultiLevel &&
+      option_config_ != kUniversalSubcompactions) {
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  } else {
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+  }
+
+  // re-write all data again
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+
+  // push all files to  lower levels. This should
+  // invoke the compaction filter for all 100000 keys.
+  ASSERT_OK(Flush(1));
+  if (option_config_ != kUniversalCompactionMultiLevel &&
+      option_config_ != kUniversalSubcompactions) {
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+  } else {
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+  }
+
+  // verify that all keys now have the new value that
+  // was set by the compaction process.
+  for (int i = 0; i < 100001; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    std::string newvalue = Get(1, key);
+    ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  }
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) {
+  std::string one, two, three, four;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+  PutFixed64(&four, 4);
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  options.num_levels = 3;
+  // Filter out keys with value is 2.
+  options.compaction_filter_factory =
+      std::make_shared<ConditionalFilterFactory>(two);
+  DestroyAndReopen(options);
+
+  // In the same compaction, a value type needs to be deleted based on
+  // compaction filter, and there is a merge type for the key. compaction
+  // filter result is ignored.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", one));
+  ASSERT_OK(Flush());
+  std::string newvalue = Get("foo");
+  ASSERT_EQ(newvalue, three);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  newvalue = Get("foo");
+  ASSERT_EQ(newvalue, three);
+
+  // value key can be deleted based on compaction filter, leaving only
+  // merge keys.
+  ASSERT_OK(db_->Put(WriteOptions(), "bar", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  newvalue = Get("bar");
+  ASSERT_EQ("NOT_FOUND", newvalue);
+  ASSERT_OK(db_->Merge(WriteOptions(), "bar", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  newvalue = Get("bar");
+  ASSERT_EQ(two, two);
+
+  // Compaction filter never applies to merge keys.
+  ASSERT_OK(db_->Put(WriteOptions(), "foobar", one));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two));
+  ASSERT_OK(Flush());
+  newvalue = Get("foobar");
+  ASSERT_EQ(newvalue, three);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  newvalue = Get("foobar");
+  ASSERT_EQ(newvalue, three);
+
+  // In the same compaction, both of value type and merge type keys need to be
+  // deleted based on compaction filter, and there is a merge type for the key.
+  // For both keys, compaction filter results are ignored.
+  ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two));
+  ASSERT_OK(Flush());
+  newvalue = Get("barfoo");
+  ASSERT_EQ(newvalue, four);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  newvalue = Get("barfoo");
+  ASSERT_EQ(newvalue, four);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
+  KeepFilterFactory* filter = new KeepFilterFactory(true, true);
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_filter_factory.reset(filter);
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 8;
+  Reopen(options);
+  int num_keys_per_file = 400;
+  for (int j = 0; j < 3; j++) {
+    // Write several keys.
+    const std::string value(10, 'x');
+    for (int i = 0; i < num_keys_per_file; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%08d%02d", i, j);
+      ASSERT_OK(Put(key, value));
+    }
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    // Make sure next file is much smaller so automatic compaction will not
+    // be triggered.
+    num_keys_per_file /= 2;
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Force a manual compaction
+  cfilter_count = 0;
+  filter->expect_manual_compaction_.store(true);
+  filter->expect_full_compaction_.store(true);
+  filter->expect_cf_id_.store(0);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(cfilter_count, 700);
+  ASSERT_EQ(NumSortedRuns(0), 1);
+  ASSERT_TRUE(filter->compaction_filter_created());
+
+  // Verify total number of keys is correct after manual compaction.
+  {
+    int count = 0;
+    int total = 0;
+    Arena arena;
+    InternalKeyComparator icmp(options.comparator);
+    ReadOptions read_options;
+    ScopedArenaIterator iter(dbfull()->NewInternalIterator(read_options, &arena,
+                                                           kMaxSequenceNumber));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+      total++;
+      if (ikey.sequence != 0) {
+        count++;
+      }
+      iter->Next();
+    }
+    ASSERT_EQ(total, 700);
+    ASSERT_EQ(count, 0);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
+  KeepFilterFactory* filter = new KeepFilterFactory(false, true);
+  filter->expect_cf_id_.store(1);
+
+  Options options = CurrentOptions();
+  options.compaction_filter_factory.reset(filter);
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 2;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  int num_keys_per_file = 400;
+  for (int j = 0; j < 3; j++) {
+    // Write several keys.
+    const std::string value(10, 'x');
+    for (int i = 0; i < num_keys_per_file; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%08d%02d", i, j);
+      ASSERT_OK(Put(1, key, value));
+    }
+    ASSERT_OK(Flush(1));
+    // Make sure next file is much smaller so automatic compaction will not
+    // be triggered.
+    num_keys_per_file /= 2;
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_TRUE(filter->compaction_filter_created());
+}
+
+#ifndef ROCKSDB_LITE
+// Compaction filters aplies to all records, regardless snapshots.
+TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
+  std::string five = std::to_string(5);
+  Options options = CurrentOptions();
+  options.compaction_filter_factory = std::make_shared<DeleteISFilterFactory>();
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // Put some data.
+  const Snapshot* snapshot = nullptr;
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush());
+
+    if (table == 0) {
+      snapshot = db_->GetSnapshot();
+    }
+  }
+  assert(snapshot != nullptr);
+
+  cfilter_count = 0;
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // The filter should delete 40 records.
+  ASSERT_EQ(40, cfilter_count);
+
+  {
+    // Scan the entire database as of the snapshot to ensure
+    // that nothing is left
+    ReadOptions read_options;
+    read_options.snapshot = snapshot;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    int count = 0;
+    while (iter->Valid()) {
+      count++;
+      iter->Next();
+    }
+    ASSERT_EQ(count, 6);
+    read_options.snapshot = nullptr;
+    std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options));
+    ASSERT_OK(iter1->status());
+    iter1->SeekToFirst();
+    count = 0;
+    while (iter1->Valid()) {
+      count++;
+      iter1->Next();
+    }
+    // We have deleted 10 keys from 40 using the compaction filter
+    //  Keys 6-9 before the snapshot and 100-105 after the snapshot
+    ASSERT_EQ(count, 30);
+  }
+
+  // Release the snapshot and compact again -> now all records should be
+  // removed.
+  db_->ReleaseSnapshot(snapshot);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, SkipUntil) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // Write 100K keys, these are written to a few files in L0.
+  for (int table = 0; table < 4; ++table) {
+    // Key ranges in tables are [0, 38], [106, 149], [212, 260], [318, 371].
+    for (int i = table * 6; i < 39 + table * 11; ++i) {
+      char key[100];
+      snprintf(key, sizeof(key), "%010d", table * 100 + i);
+      ASSERT_OK(Put(key, std::to_string(table * 1000 + i)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  cfilter_skips = 0;
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Number of skips in tables: 2, 3, 3, 3.
+  ASSERT_EQ(11, cfilter_skips);
+
+  for (int table = 0; table < 4; ++table) {
+    for (int i = table * 6; i < 39 + table * 11; ++i) {
+      int k = table * 100 + i;
+      char key[100];
+      snprintf(key, sizeof(key), "%010d", table * 100 + i);
+      auto expected = std::to_string(table * 1000 + i);
+      std::string val;
+      Status s = db_->Get(ReadOptions(), key, &val);
+      if (k / 10 % 2 == 0) {
+        ASSERT_TRUE(s.IsNotFound());
+      } else {
+        ASSERT_OK(s);
+        ASSERT_EQ(expected, val);
+      }
+    }
+  }
+}
+
+TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) {
+  BlockBasedTableOptions table_options;
+  table_options.whole_key_filtering = false;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(100, false));
+
+  Options options = CurrentOptions();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+  options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("0000000010", "v10"));
+  ASSERT_OK(Put("0000000020", "v20"));  // skipped
+  ASSERT_OK(Put("0000000050", "v50"));
+  ASSERT_OK(Flush());
+
+  cfilter_skips = 0;
+  EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  EXPECT_EQ(1, cfilter_skips);
+
+  Status s;
+  std::string val;
+
+  s = db_->Get(ReadOptions(), "0000000010", &val);
+  ASSERT_OK(s);
+  EXPECT_EQ("v10", val);
+
+  s = db_->Get(ReadOptions(), "0000000020", &val);
+  EXPECT_TRUE(s.IsNotFound());
+
+  s = db_->Get(ReadOptions(), "0000000050", &val);
+  ASSERT_OK(s);
+  EXPECT_EQ("v50", val);
+}
+
+class TestNotSupportedFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return true;
+  }
+
+  const char* Name() const override { return "NotSupported"; }
+  bool IgnoreSnapshots() const override { return false; }
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) {
+  Options options = CurrentOptions();
+  options.compaction_filter = new TestNotSupportedFilter();
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_OK(Put("z", "v20"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_OK(Put("z", "v20"));
+  ASSERT_OK(Flush());
+
+  // Comapction should fail because IgnoreSnapshots() = false
+  EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+                  .IsNotSupported());
+
+  delete options.compaction_filter;
+}
+
+class TestNotSupportedFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit TestNotSupportedFilterFactory(TableFileCreationReason reason)
+      : reason_(reason) {}
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    return reason_ == reason;
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /* context */) override {
+    return std::unique_ptr<CompactionFilter>(new TestNotSupportedFilter());
+  }
+
+  const char* Name() const override { return "TestNotSupportedFilterFactory"; }
+
+ private:
+  const TableFileCreationReason reason_;
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseDuringFlush) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<TestNotSupportedFilterFactory>(
+          TableFileCreationReason::kFlush);
+  Reopen(options);
+
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_TRUE(Flush().IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<TestNotSupportedFilterFactory>(
+          TableFileCreationReason::kRecovery);
+  Reopen(options);
+
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, DropKeyWithSingleDelete) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("a", "v0"));
+  ASSERT_OK(Put("b", "v0"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  ASSERT_OK(SingleDelete("b"));
+  ASSERT_OK(Flush());
+
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = options.num_levels - 1;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+  Close();
+
+  class DeleteFilterV2 : public CompactionFilter {
+   public:
+    Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+                      const Slice& /*existing_value*/,
+                      std::string* /*new_value*/,
+                      std::string* /*skip_until*/) const override {
+      if (key.starts_with("b")) {
+        return Decision::kPurge;
+      }
+      return Decision::kRemove;
+    }
+
+    const char* Name() const override { return "DeleteFilterV2"; }
+  } delete_filter_v2;
+
+  options.compaction_filter = &delete_filter_v2;
+  options.level0_file_num_compaction_trigger = 2;
+  Reopen(options);
+
+  ASSERT_OK(Put("b", "v1"));
+  ASSERT_OK(Put("x", "v1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("r", "v1"));
+  ASSERT_OK(Put("z", "v1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  Close();
+
+  options.compaction_filter = nullptr;
+  Reopen(options);
+  ASSERT_OK(SingleDelete("b"));
+  ASSERT_OK(Flush());
+  {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc
new file mode 100644
index 000000000..ba9c50b9a
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_test.cc
@@ -0,0 +1,8227 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <tuple>
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/utilities/convenience.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/concurrent_task_limiter_impl.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SYNC_POINT is not supported in released Windows mode.
+#if !defined(ROCKSDB_LITE)
+
+class CompactionStatsCollector : public EventListener {
+ public:
+  CompactionStatsCollector()
+      : compaction_completed_(
+            static_cast<int>(CompactionReason::kNumOfReasons)) {
+    for (auto& v : compaction_completed_) {
+      v.store(0);
+    }
+  }
+
+  ~CompactionStatsCollector() override {}
+
+  void OnCompactionCompleted(DB* /* db */,
+                             const CompactionJobInfo& info) override {
+    int k = static_cast<int>(info.compaction_reason);
+    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+    assert(k >= 0 && k < num_of_reasons);
+    compaction_completed_[k]++;
+  }
+
+  void OnExternalFileIngested(
+      DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
+    int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
+    compaction_completed_[k]++;
+  }
+
+  void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
+    int k = static_cast<int>(CompactionReason::kFlush);
+    compaction_completed_[k]++;
+  }
+
+  int NumberOfCompactions(CompactionReason reason) const {
+    int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+    int k = static_cast<int>(reason);
+    assert(k >= 0 && k < num_of_reasons);
+    return compaction_completed_.at(k).load();
+  }
+
+ private:
+  std::vector<std::atomic<int>> compaction_completed_;
+};
+
+class DBCompactionTest : public DBTestBase {
+ public:
+  DBCompactionTest()
+      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
+
+ protected:
+  /*
+   * Verifies compaction stats of cfd are valid.
+   *
+   * For each level of cfd, its compaction stats are valid if
+   * 1) sum(stat.counts) == stat.count, and
+   * 2) stat.counts[i] == collector.NumberOfCompactions(i)
+   */
+  void VerifyCompactionStats(ColumnFamilyData& cfd,
+                             const CompactionStatsCollector& collector) {
+#ifndef NDEBUG
+    InternalStats* internal_stats_ptr = cfd.internal_stats();
+    ASSERT_NE(internal_stats_ptr, nullptr);
+    const std::vector<InternalStats::CompactionStats>& comp_stats =
+        internal_stats_ptr->TEST_GetCompactionStats();
+    const int num_of_reasons =
+        static_cast<int>(CompactionReason::kNumOfReasons);
+    std::vector<int> counts(num_of_reasons, 0);
+    // Count the number of compactions caused by each CompactionReason across
+    // all levels.
+    for (const auto& stat : comp_stats) {
+      int sum = 0;
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] += stat.counts[i];
+        sum += stat.counts[i];
+      }
+      ASSERT_EQ(sum, stat.count);
+    }
+    // Verify InternalStats bookkeeping matches that of
+    // CompactionStatsCollector, assuming that all compactions complete.
+    for (int i = 0; i < num_of_reasons; i++) {
+      ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)),
+                counts[i]);
+    }
+#endif /* NDEBUG */
+  }
+};
+
+class DBCompactionTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+  DBCompactionTestWithParam()
+      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+    max_subcompactions_ = std::get<0>(GetParam());
+    exclusive_manual_compaction_ = std::get<1>(GetParam());
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  uint32_t max_subcompactions_;
+  bool exclusive_manual_compaction_;
+};
+
+class DBCompactionTestWithBottommostParam
+    : public DBTestBase,
+      public testing::WithParamInterface<BottommostLevelCompaction> {
+ public:
+  DBCompactionTestWithBottommostParam()
+      : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+    bottommost_level_compaction_ = GetParam();
+  }
+
+  BottommostLevelCompaction bottommost_level_compaction_;
+};
+
+class DBCompactionDirectIOTest : public DBCompactionTest,
+                                 public ::testing::WithParamInterface<bool> {
+ public:
+  DBCompactionDirectIOTest() : DBCompactionTest() {}
+};
+
+// Param = true : target level is non-empty
+// Param = false: level between target level and source level
+//  is not empty.
+class ChangeLevelConflictsWithAuto
+    : public DBCompactionTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  ChangeLevelConflictsWithAuto() : DBCompactionTest() {}
+};
+
+// Param = true: grab the compaction pressure token (enable
+// parallel compactions)
+// Param = false: Not grab the token (no parallel compactions)
+class RoundRobinSubcompactionsAgainstPressureToken
+    : public DBCompactionTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  RoundRobinSubcompactionsAgainstPressureToken() {
+    grab_pressure_token_ = GetParam();
+  }
+  bool grab_pressure_token_;
+};
+
+class RoundRobinSubcompactionsAgainstResources
+    : public DBCompactionTest,
+      public ::testing::WithParamInterface<std::tuple<int, int>> {
+ public:
+  RoundRobinSubcompactionsAgainstResources() {
+    total_low_pri_threads_ = std::get<0>(GetParam());
+    max_compaction_limits_ = std::get<1>(GetParam());
+  }
+  int total_low_pri_threads_;
+  int max_compaction_limits_;
+};
+
+namespace {
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (auto fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+
+  void ClearFlushedFiles() { flushed_files_.clear(); }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+
+class SstStatsCollector : public EventListener {
+ public:
+  SstStatsCollector() : num_ssts_creation_started_(0) {}
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /* info */) override {
+    ++num_ssts_creation_started_;
+  }
+
+  int num_ssts_creation_started() { return num_ssts_creation_started_; }
+
+ private:
+  std::atomic<int> num_ssts_creation_started_;
+};
+
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
+Options DeletionTriggerOptions(Options options) {
+  options.compression = kNoCompression;
+  options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain = 0;
+  options.num_levels = kCDTNumLevels;
+  options.level0_file_num_compaction_trigger = 1;
+  options.target_file_size_base = options.write_buffer_size * 2;
+  options.target_file_size_multiplier = 2;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * options.target_file_size_multiplier;
+  options.max_bytes_for_level_multiplier = 2;
+  options.disable_auto_compactions = false;
+  options.compaction_options_universal.max_size_amplification_percent = 100;
+  return options;
+}
+
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+                              const SstFileMetaData& b) {
+  if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+    if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+      // b.smallestkey <= a.smallestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+    // a.smallestkey < b.smallestkey <= a.largestkey
+    return true;
+  }
+  if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+    if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+      // b.smallestkey <= a.largestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+    // a.smallestkey <= b.largestkey < a.largestkey
+    return true;
+  }
+  return false;
+}
+
+// Identifies all files between level "min_level" and "max_level"
+// which has overlapping key range with "input_file_meta".
+void GetOverlappingFileNumbersForLevelCompaction(
+    const ColumnFamilyMetaData& cf_meta, const Comparator* comparator,
+    int min_level, int max_level, const SstFileMetaData* input_file_meta,
+    std::set<std::string>* overlapping_file_names) {
+  std::set<const SstFileMetaData*> overlapping_files;
+  overlapping_files.insert(input_file_meta);
+  for (int m = min_level; m <= max_level; ++m) {
+    for (auto& file : cf_meta.levels[m].files) {
+      for (auto* included_file : overlapping_files) {
+        if (HaveOverlappingKeyRanges(comparator, *included_file, file)) {
+          overlapping_files.insert(&file);
+          overlapping_file_names->insert(file.name);
+          break;
+        }
+      }
+    }
+  }
+}
+
+void VerifyCompactionResult(
+    const ColumnFamilyMetaData& cf_meta,
+    const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+  for (auto& level : cf_meta.levels) {
+    for (auto& file : level.files) {
+      assert(overlapping_file_numbers.find(file.name) ==
+             overlapping_file_numbers.end());
+    }
+  }
+#endif
+}
+
+const SstFileMetaData* PickFileRandomly(const ColumnFamilyMetaData& cf_meta,
+                                        Random* rand, int* level = nullptr) {
+  auto file_id = rand->Uniform(static_cast<int>(cf_meta.file_count)) + 1;
+  for (auto& level_meta : cf_meta.levels) {
+    if (file_id <= level_meta.files.size()) {
+      if (level != nullptr) {
+        *level = level_meta.level;
+      }
+      auto result = rand->Uniform(file_id);
+      return &(level_meta.files[result]);
+    }
+    file_id -= static_cast<uint32_t>(level_meta.files.size());
+  }
+  assert(false);
+  return nullptr;
+}
+}  // anonymous namespace
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// All the TEST_P tests run once with sub_compactions disabled (i.e.
+// options.max_subcompactions = 1) and once with it enabled
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
+  for (int tid = 0; tid < 3; ++tid) {
+    uint64_t db_size[2];
+    Options options = DeletionTriggerOptions(CurrentOptions());
+    options.max_subcompactions = max_subcompactions_;
+
+    if (tid == 1) {
+      // the following only disable stats update in DB::Open()
+      // and should not affect the result of this test.
+      options.skip_stats_update_on_db_open = true;
+    } else if (tid == 2) {
+      // third pass with universal compaction
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+    }
+
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    const int kTestSize = kCDTKeysPerBuffer * 1024;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(rnd.RandomString(kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+
+    if (options.compaction_style == kCompactionStyleUniversal) {
+      // Claim: in universal compaction none of the original data will remain
+      // once compactions settle.
+      //
+      // Proof: The compensated size of the file containing the most tombstones
+      // is enough on its own to trigger size amp compaction. Size amp
+      // compaction is a full compaction, so all tombstones meet the obsolete
+      // keys they cover.
+      ASSERT_EQ(0, db_size[1]);
+    } else {
+      // Claim: in level compaction at most `db_size[0] / 2` of the original
+      // data will remain once compactions settle.
+      //
+      // Proof: Assume the original data is all in the bottom level. If it were
+      // not, it would meet its tombstone sooner. The original data size is
+      // large enough to require fanout to bottom level to be greater than
+      // `max_bytes_for_level_multiplier == 2`. In the level just above,
+      // tombstones must cover less than `db_size[0] / 4` bytes since fanout >=
+      // 2 and file size is compensated by doubling the size of values we expect
+      // are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in
+      // levels above must cover less than `db_size[0] / 8` bytes of original
+      // data, `db_size[0] / 16`, and so on.
+      ASSERT_GT(db_size[0] / 2, db_size[1]);
+    }
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+  // This test verify UpdateAccumulatedStats is not on
+  // if options.skip_stats_update_on_db_open = true
+  // The test will need to be updated if the internal behavior changes.
+
+  Options options = DeletionTriggerOptions(CurrentOptions());
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  const int kTestSize = kCDTKeysPerBuffer * 512;
+  std::vector<std::string> values;
+  for (int k = 0; k < kTestSize; ++k) {
+    values.push_back(rnd.RandomString(kCDTValueSize));
+    ASSERT_OK(Put(Key(k), values[k]));
+  }
+
+  ASSERT_OK(Flush());
+
+  Close();
+
+  int update_acc_stats_called = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionStorageInfo::UpdateAccumulatedStats",
+      [&](void* /* arg */) { ++update_acc_stats_called; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Reopen the DB with stats-update disabled
+  options.skip_stats_update_on_db_open = true;
+  options.max_open_files = 20;
+  Reopen(options);
+
+  ASSERT_EQ(update_acc_stats_called, 0);
+
+  // Repeat the reopen process, but this time we enable
+  // stats-update.
+  options.skip_stats_update_on_db_open = false;
+  Reopen(options);
+
+  ASSERT_GT(update_acc_stats_called, 0);
+
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.max_open_files = 20;
+  options.level0_file_num_compaction_trigger = 3;
+  // Avoid many shards with small max_open_files, where as little as
+  // two table insertions could lead to an LRU eviction, depending on
+  // hash values.
+  options.table_cache_numshardbits = 2;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  int num_table_cache_lookup = 0;
+  int num_new_table_reader = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::FindTable:0", [&](void* arg) {
+        assert(arg != nullptr);
+        bool no_io = *(reinterpret_cast<bool*>(arg));
+        if (!no_io) {
+          // filter out cases for table properties queries.
+          num_table_cache_lookup++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::GetTableReader:0",
+      [&](void* /*arg*/) { num_new_table_reader++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+    ASSERT_OK(Put(Key(10 - k), "bar"));
+    if (k < options.level0_file_num_compaction_trigger - 1) {
+      num_table_cache_lookup = 0;
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      // preloading iterator issues one table cache lookup and create
+      // a new table reader, if not preloaded.
+      int old_num_table_cache_lookup = num_table_cache_lookup;
+      ASSERT_GE(num_table_cache_lookup, 1);
+      ASSERT_EQ(num_new_table_reader, 1);
+
+      num_table_cache_lookup = 0;
+      num_new_table_reader = 0;
+      ASSERT_EQ(Key(k), Get(Key(k)));
+      // lookup iterator from table cache and no need to create a new one.
+      ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2);
+      ASSERT_EQ(num_new_table_reader, 0);
+    }
+  }
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Preloading iterator issues one table cache lookup and creates
+  // a new table reader. One file is created for flush and one for compaction.
+  // Compaction inputs make no table cache look-up for data/range deletion
+  // iterators
+  // May preload table cache too.
+  ASSERT_GE(num_table_cache_lookup, 2);
+  int old_num_table_cache_lookup2 = num_table_cache_lookup;
+
+  // Create new iterator for:
+  // (1) 1 for verifying flush results
+  // (2) 1 for verifying compaction results.
+  // (3) New TableReaders will not be created for compaction inputs
+  ASSERT_EQ(num_new_table_reader, 2);
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  ASSERT_EQ(Key(1), Get(Key(1)));
+  ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5);
+  ASSERT_EQ(num_new_table_reader, 0);
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // Only verifying compaction outputs issues one table cache lookup
+  // for both data block and range deletion block).
+  // May preload table cache too.
+  ASSERT_GE(num_table_cache_lookup, 1);
+  old_num_table_cache_lookup2 = num_table_cache_lookup;
+  // One for verifying compaction results.
+  // No new iterator created for compaction.
+  ASSERT_EQ(num_new_table_reader, 1);
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  ASSERT_EQ(Key(1), Get(Key(1)));
+  ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
+  ASSERT_EQ(num_new_table_reader, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) {
+  for (int tid = 0; tid < 2; ++tid) {
+    uint64_t db_size[3];
+    Options options = DeletionTriggerOptions(CurrentOptions());
+    options.max_subcompactions = max_subcompactions_;
+
+    if (tid == 1) {
+      // second pass with universal compaction
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+    }
+
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(rnd.RandomString(kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+    Close();
+    // as auto_compaction is off, we shouldn't see any reduction in db size.
+    ASSERT_LE(db_size[0], db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still work.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    // insert relatively small amount of data to trigger auto compaction.
+    for (int k = 0; k < kTestSize / 10; ++k) {
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
+    // this time we're expecting significant drop in size.
+    //
+    // See "CompactionDeletionTrigger" test for proof that at most
+    // `db_size[0] / 2` of the original data remains. In addition to that, this
+    // test inserts `db_size[0] / 10` to push the tombstones into SST files and
+    // then through automatic compactions. So in total `3 * db_size[0] / 5` of
+    // the original data may remain.
+    ASSERT_GT(3 * db_size[0] / 5, db_size[2]);
+  }
+}
+
+TEST_F(DBCompactionTest, CompactRangeBottomPri) {
+  ASSERT_OK(Put(Key(50), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(100), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(200), ""));
+  ASSERT_OK(Flush());
+
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  }
+  ASSERT_EQ("0,0,3", FilesPerLevel(0));
+
+  ASSERT_OK(Put(Key(1), ""));
+  ASSERT_OK(Put(Key(199), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(2), ""));
+  ASSERT_OK(Put(Key(199), ""));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("2,0,3", FilesPerLevel(0));
+
+  // Now we have 2 L0 files, and 3 L2 files, and a manual compaction will
+  // be triggered.
+  // Two compaction jobs will run. One compacts 2 L0 files in Low Pri Pool
+  // and one compact to L2 in bottom pri pool.
+  int low_pri_count = 0;
+  int bottom_pri_count = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "ThreadPoolImpl::Impl::BGThread:BeforeRun", [&](void* arg) {
+        Env::Priority* pri = reinterpret_cast<Env::Priority*>(arg);
+        // First time is low pri pool in the test case.
+        if (low_pri_count == 0 && bottom_pri_count == 0) {
+          ASSERT_EQ(Env::Priority::LOW, *pri);
+        }
+        if (*pri == Env::Priority::LOW) {
+          low_pri_count++;
+        } else {
+          bottom_pri_count++;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(1, low_pri_count);
+  ASSERT_EQ(1, bottom_pri_count);
+  ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  // Recompact bottom most level uses bottom pool
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ(1, low_pri_count);
+  ASSERT_EQ(2, bottom_pri_count);
+
+  env_->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  // Low pri pool is used if bottom pool has size 0.
+  ASSERT_EQ(2, low_pri_count);
+  ASSERT_EQ(2, bottom_pri_count);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+  uint64_t db_size[3];
+  for (int test = 0; test < 2; ++test) {
+    Options options = DeletionTriggerOptions(CurrentOptions());
+    options.skip_stats_update_on_db_open = (test == 0);
+
+    env_->random_read_counter_.Reset();
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(rnd.RandomString(kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // L1 and L2 can fit deletions iff size compensation does not take effect,
+    // i.e., when `skip_stats_update_on_db_open == true`. Move any remaining
+    // files at or above L2 down to L3 to ensure obsolete data does not
+    // accidentally meet its tombstone above L3. This makes the final size more
+    // deterministic and easy to see whether size compensation for deletions
+    // took effect.
+    MoveFilesToLevel(3 /* level */);
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+
+    env_->random_read_counter_.Reset();
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+    Close();
+    // as auto_compaction is off, we shouldn't see any reduction in db size.
+    ASSERT_LE(db_size[0], db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still work.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
+
+    if (options.skip_stats_update_on_db_open) {
+      // If update stats on DB::Open is disable, we don't expect
+      // deletion entries taking effect.
+      //
+      // The deletions are small enough to fit in L1 and L2, and obsolete keys
+      // were moved to L3+, so none of the original data should have been
+      // dropped.
+      ASSERT_LE(db_size[0], db_size[2]);
+    } else {
+      // Otherwise, we should see a significant drop in db size.
+      //
+      // See "CompactionDeletionTrigger" test for proof that at most
+      // `db_size[0] / 2` of the original data remains.
+      ASSERT_GT(db_size[0] / 2, db_size[2]);
+    }
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionTrigger) {
+  const int kNumKeysPerFile = 100;
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_subcompactions = max_subcompactions_;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < kNumKeysPerFile; i++) {
+      values.push_back(rnd.RandomString(990));
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
+    // put extra key to trigger flush
+    ASSERT_OK(Put(1, "", ""));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+  }
+
+  // generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < kNumKeysPerFile; i++) {
+    values.push_back(rnd.RandomString(990));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+  // put extra key to trigger flush
+  ASSERT_OK(Put(1, "", ""));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+TEST_F(DBCompactionTest, BGCompactionsAllowed) {
+  // Create several column families. Make compaction triggers in all of them
+  // and see number of compactions scheduled to be less than allowed.
+  const int kNumKeysPerFile = 100;
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.num_levels = 3;
+  // Should speed up compaction when there are 4 files.
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 20;
+  options.soft_pending_compaction_bytes_limit = 1 << 30;  // Infinitely large
+  options.max_background_compactions = 3;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+
+  // Block all threads in thread pool.
+  const size_t kTotalTasks = 4;
+  env_->SetBackgroundThreads(4, Env::LOW);
+  test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+  for (size_t i = 0; i < kTotalTasks; i++) {
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_tasks[i], Env::Priority::LOW);
+    sleeping_tasks[i].WaitUntilSleeping();
+  }
+
+  CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+  Random rnd(301);
+  for (int cf = 0; cf < 4; cf++) {
+    for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+      for (int i = 0; i < kNumKeysPerFile; i++) {
+        ASSERT_OK(Put(cf, Key(i), ""));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put(cf, "", ""));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+      ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+    }
+  }
+
+  // Now all column families qualify compaction but only one should be
+  // scheduled, because no column family hits speed up condition.
+  ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  // Create two more files for one column family, which triggers speed up
+  // condition, three compactions will be scheduled.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    for (int i = 0; i < kNumKeysPerFile; i++) {
+      ASSERT_OK(Put(2, Key(i), ""));
+    }
+    // put extra key to trigger flush
+    ASSERT_OK(Put(2, "", ""));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+    ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+              NumTableFilesAtLevel(0, 2));
+  }
+  ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  // Unblock all threads to unblock all compactions.
+  for (size_t i = 0; i < kTotalTasks; i++) {
+    sleeping_tasks[i].WakeUp();
+    sleeping_tasks[i].WaitUntilDone();
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify number of compactions allowed will come back to 1.
+
+  for (size_t i = 0; i < kTotalTasks; i++) {
+    sleeping_tasks[i].Reset();
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_tasks[i], Env::Priority::LOW);
+    sleeping_tasks[i].WaitUntilSleeping();
+  }
+  for (int cf = 0; cf < 4; cf++) {
+    for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+      for (int i = 0; i < kNumKeysPerFile; i++) {
+        ASSERT_OK(Put(cf, Key(i), ""));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put(cf, "", ""));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+      ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+    }
+  }
+
+  // Now all column families qualify compaction but only one should be
+  // scheduled, because no column family hits speed up condition.
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  for (size_t i = 0; i < kTotalTasks; i++) {
+    sleeping_tasks[i].WakeUp();
+    sleeping_tasks[i].WaitUntilDone();
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;  // Large write buffer
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+
+  // Write 8MB (80 values, each 100K)
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  std::vector<std::string> values;
+  for (int i = 0; i < 80; i++) {
+    values.push_back(rnd.RandomString(100000));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+
+  // Reopening moves updates to level-0
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                        true /* disallow trivial move */));
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+  for (int i = 0; i < 80; i++) {
+    ASSERT_EQ(Get(1, Key(i)), values[i]);
+  }
+}
+
+TEST_F(DBCompactionTest, MinorCompactionsHappen) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 10000;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    const int N = 500;
+
+    int starting_num_tables = TotalTableFiles(1);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+    }
+    int ending_num_tables = TotalTableFiles(1);
+    ASSERT_GT(ending_num_tables, starting_num_tables);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile1) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+
+  DestroyAndReopen(options);
+
+  // create first file and flush to l0
+  ASSERT_OK(Put("4", "A"));
+  ASSERT_OK(Put("3", "A"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  ASSERT_OK(Put("2", "A"));
+  ASSERT_OK(Delete("3"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+
+  // move both files down to l1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put("2", "B"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile2) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+
+  DestroyAndReopen(options);
+
+  // create first file and flush to l0
+  ASSERT_OK(Put("4", "A"));
+  ASSERT_OK(Put("3", "A"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  ASSERT_OK(Put("2", "A"));
+  ASSERT_OK(SingleDelete("3"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+
+  // move both files down to l1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put("2", "B"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, CompactionSstPartitioner) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+  std::shared_ptr<SstPartitionerFactory> factory(
+      NewSstPartitionerFixedPrefixFactory(4));
+  options.sst_partitioner_factory = factory;
+
+  DestroyAndReopen(options);
+
+  // create first file and flush to l0
+  ASSERT_OK(Put("aaaa1", "A"));
+  ASSERT_OK(Put("bbbb1", "B"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  ASSERT_OK(Put("aaaa1", "A2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // move both files down to l1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  std::vector<LiveFileMetaData> files;
+  dbfull()->GetLiveFilesMetaData(&files);
+  ASSERT_EQ(2, files.size());
+  ASSERT_EQ("A2", Get("aaaa1"));
+  ASSERT_EQ("B", Get("bbbb1"));
+}
+
+TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 1;
+  std::shared_ptr<SstPartitionerFactory> factory(
+      NewSstPartitionerFixedPrefixFactory(4));
+  options.sst_partitioner_factory = factory;
+
+  DestroyAndReopen(options);
+
+  // create first file and flush to l0
+  ASSERT_OK(Put("aaaa1", "A"));
+  ASSERT_OK(Put("bbbb1", "B"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  std::vector<LiveFileMetaData> files;
+  dbfull()->GetLiveFilesMetaData(&files);
+  ASSERT_EQ(2, files.size());
+  ASSERT_EQ("A", Get("aaaa1"));
+  ASSERT_EQ("B", Get("bbbb1"));
+}
+
+TEST_F(DBCompactionTest, ZeroSeqIdCompaction) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  // compaction options
+  CompactionOptions compact_opt;
+  compact_opt.compression = kNoCompression;
+  compact_opt.output_file_size_limit = 4096;
+  const size_t key_len =
+      static_cast<size_t>(compact_opt.output_file_size_limit) / 5;
+
+  DestroyAndReopen(options);
+
+  std::vector<const Snapshot*> snaps;
+
+  // create first file and flush to l0
+  for (auto& key : {"1", "2", "3", "3", "3", "3"}) {
+    ASSERT_OK(Put(key, std::string(key_len, 'A')));
+    snaps.push_back(dbfull()->GetSnapshot());
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // create second file and flush to l0
+  for (auto& key : {"3", "4", "5", "6", "7", "8"}) {
+    ASSERT_OK(Put(key, std::string(key_len, 'A')));
+    snaps.push_back(dbfull()->GetSnapshot());
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // move both files down to l1
+  ASSERT_OK(
+      dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1));
+
+  // release snap so that first instance of key(3) can have seqId=0
+  for (auto snap : snaps) {
+    dbfull()->ReleaseSnapshot(snap);
+  }
+
+  // create 3 files in l0 so to trigger compaction
+  for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
+    ASSERT_OK(Put("2", std::string(1, 'A')));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_OK(Put("", ""));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) {
+  // github issue #2249
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+  DestroyAndReopen(options);
+
+  // create two files in l1 that we can compact
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) {
+      ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A')));
+      ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A')));
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
+
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+  ASSERT_EQ(2, cf_meta.levels[1].files.size());
+  std::vector<std::string> input_filenames;
+  for (const auto& sst_file : cf_meta.levels[1].files) {
+    input_filenames.push_back(sst_file.name);
+  }
+
+  // note CompactionOptions::output_file_size_limit is unset.
+  CompactionOptions compact_opt;
+  compact_opt.compression = kNoCompression;
+  ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1));
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Trigger a long memtable compaction and reopen the database during it
+    ASSERT_OK(Put(1, "foo", "v1"));  // Goes to 1st log file
+    ASSERT_OK(Put(1, "big1", std::string(10000000, 'x')));  // Fills memtable
+    ASSERT_OK(Put(1, "big2", std::string(1000, 'y')));  // Triggers compaction
+    ASSERT_OK(Put(1, "bar", "v2"));                     // Goes to new log file
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+    ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+  } while (ChangeOptions());
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
+  int32_t trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  int32_t num_keys = 80;
+  int32_t value_size = 100 * 1024;  // 100 KB
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  for (int i = 0; i < num_keys; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+
+  // Reopening moves updates to L0
+  Reopen(options);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1);  // 1 file in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // 0 files in L1
+
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+  LiveFileMetaData level0_file = metadata[0];  // L0 file meta
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+  // Compaction will initiate a trivial move from L0 to L1
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // File moved From L0 to L1
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);  // 0 files in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1);  // 1 file in L1
+
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+  ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name);
+  ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size);
+
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+
+  ASSERT_EQ(trivial_move, 1);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  // non overlapping ranges
+  std::vector<std::pair<int32_t, int32_t>> ranges = {
+      {100, 199}, {300, 399}, {0, 99},    {200, 299},
+      {600, 699}, {400, 499}, {500, 550}, {551, 599},
+  };
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  int32_t level0_files = NumTableFilesAtLevel(0, 0);
+  ASSERT_EQ(level0_files, ranges.size());    // Multiple files in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+  // Since data is non-overlapping we expect compaction to initiate
+  // a trivial move
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // We expect that all the files were trivially moved from L0 to L1
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files);
+
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      ASSERT_EQ(Get(Key(j)), values[j]);
+    }
+  }
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  trivial_move = 0;
+  non_trivial_move = 0;
+  values.clear();
+  DestroyAndReopen(options);
+  // Same ranges as above but overlapping
+  ranges = {
+      {100, 199},
+      {300, 399},
+      {0, 99},
+      {200, 299},
+      {600, 699},
+      {400, 499},
+      {500, 560},  // this range overlap with the next
+                   // one
+      {551, 599},
+  };
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      ASSERT_EQ(Get(Key(j)), values[j]);
+    }
+  }
+  ASSERT_EQ(trivial_move, 0);
+  ASSERT_EQ(non_trivial_move, 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.num_levels = 7;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  // Add 2 non-overlapping files
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file 1 [0 => 300]
+  for (int32_t i = 0; i <= 300; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 2 [600 => 700]
+  for (int32_t i = 600; i <= 700; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L0
+  ASSERT_EQ("2", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 6;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  // 2 files in L6
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  for (int32_t i = 0; i <= 300; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+  for (int32_t i = 600; i <= 700; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialOverlappingL0) {
+  class SubCompactionEventListener : public EventListener {
+   public:
+    void OnSubcompactionCompleted(const SubcompactionJobInfo&) override {
+      sub_compaction_finished_++;
+    }
+    std::atomic<int> sub_compaction_finished_{0};
+  };
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+  SubCompactionEventListener* listener = new SubCompactionEventListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  // For subcompactino to trigger, output level needs to be non-empty.
+  ASSERT_OK(Put("key", ""));
+  ASSERT_OK(Put("kez", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key", ""));
+  ASSERT_OK(Put("kez", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Ranges that are only briefly overlapping so that they won't be trivially
+  // moved but subcompaction ranges would only contain a subset of files.
+  std::vector<std::pair<int32_t, int32_t>> ranges = {
+      {100, 199}, {198, 399}, {397, 600}, {598, 800}, {799, 900}, {895, 999},
+  };
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  int32_t level0_files = NumTableFilesAtLevel(0, 0);
+  ASSERT_EQ(level0_files, ranges.size());    // Multiple files in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1);  // One file in L1
+
+  listener->sub_compaction_finished_ = 0;
+  ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  if (max_subcompactions_ > 3) {
+    // RocksDB might not generate the exact number of sub compactions.
+    // Here we validate that at least subcompaction happened.
+    ASSERT_GT(listener->sub_compaction_finished_.load(), 2);
+  }
+
+  // We expect that all the files were compacted to L1
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1, 0), 1);
+
+  for (size_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      ASSERT_EQ(Get(Key(j)), values[j]);
+    }
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  bool first = true;
+  // Purpose of dependencies:
+  // 4 -> 1: ensure the order of two non-trivial compactions
+  // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions
+  // are installed
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"},
+       {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"},
+       {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (first) {
+          first = false;
+          TEST_SYNC_POINT("DBCompaction::ManualPartial:4");
+          TEST_SYNC_POINT("DBCompaction::ManualPartial:3");
+        } else {  // second non-trivial compaction
+          TEST_SYNC_POINT("DBCompaction::ManualPartial:2");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.num_levels = 7;
+  options.max_subcompactions = max_subcompactions_;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+  options.target_file_size_base = 1 << 23;  // 8 MB
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  // Add 2 non-overlapping files
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file 1 [0 => 100]
+  for (int32_t i = 0; i < 100; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 2 [100 => 300]
+  for (int32_t i = 100; i < 300; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L0
+  ASSERT_EQ("2", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 6;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  // Trivial move the two non-overlapping files to level 6
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  // 2 files in L6
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // file 3 [ 0 => 200]
+  for (int32_t i = 0; i < 200; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 1 files in L0
+  ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false));
+  ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false));
+  ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false));
+  ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false));
+  // 2 files in L6, 1 file in L5
+  ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 6);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    compact_options.change_level = false;
+    compact_options.exclusive_manual_compaction = false;
+    std::string begin_string = Key(0);
+    std::string end_string = Key(199);
+    Slice begin(begin_string);
+    Slice end(end_string);
+    // First non-trivial compaction is triggered
+    ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  });
+
+  TEST_SYNC_POINT("DBCompaction::ManualPartial:1");
+  // file 4 [300 => 400)
+  for (int32_t i = 300; i <= 400; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 5 [400 => 500)
+  for (int32_t i = 400; i <= 500; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 6 [500 => 600)
+  for (int32_t i = 500; i <= 600; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  // Second non-trivial compaction is triggered
+  ASSERT_OK(Flush());
+
+  // Before two non-trivial compactions are installed, there are 3 files in L0
+  ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0));
+  TEST_SYNC_POINT("DBCompaction::ManualPartial:5");
+
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // After two non-trivial compactions are installed, there is 1 file in L6, and
+  // 1 file in L1
+  ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0));
+  threads.join();
+
+  for (int32_t i = 0; i < 600; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+// Disable as the test is flaky.
+TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  bool first = true;
+  bool second = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"},
+       {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+        if (first) {
+          TEST_SYNC_POINT("DBCompaction::PartialFill:4");
+          first = false;
+          TEST_SYNC_POINT("DBCompaction::PartialFill:3");
+        } else if (second) {
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+
+  DestroyAndReopen(options);
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  // Add 2 non-overlapping files
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file 1 [0 => 100]
+  for (int32_t i = 0; i < 100; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 2 [100 => 300]
+  for (int32_t i = 100; i < 300; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L0
+  ASSERT_EQ("2", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  // 2 files in L2
+  ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // file 3 [ 0 => 200]
+  for (int32_t i = 0; i < 200; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L2, 1 in L0
+  ASSERT_EQ("1,0,2", FilesPerLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+  // 2 files in L2, 1 in L1
+  ASSERT_EQ("0,1,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 2);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  ROCKSDB_NAMESPACE::port::Thread threads([&] {
+    compact_options.change_level = false;
+    compact_options.exclusive_manual_compaction = false;
+    std::string begin_string = Key(0);
+    std::string end_string = Key(199);
+    Slice begin(begin_string);
+    Slice end(end_string);
+    ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  });
+
+  TEST_SYNC_POINT("DBCompaction::PartialFill:1");
+  // Many files 4 [300 => 4300)
+  for (int32_t i = 0; i <= 5; i++) {
+    for (int32_t j = 300; j < 4300; j++) {
+      if (j == 2300) {
+        ASSERT_OK(Flush());
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+  }
+
+  // Verify level sizes
+  uint64_t target_size = 4 * options.max_bytes_for_level_base;
+  for (int32_t i = 1; i < options.num_levels; i++) {
+    ASSERT_LE(SizeAtLevel(i), target_size);
+    target_size = static_cast<uint64_t>(target_size *
+                                        options.max_bytes_for_level_multiplier);
+  }
+
+  TEST_SYNC_POINT("DBCompaction::PartialFill:2");
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  threads.join();
+
+  for (int32_t i = 0; i < 4300; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+        "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"},
+       {"DBImpl::WaitForPendingWrites:BeforeBlock",
+        "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+
+  Options options = CurrentOptions();
+  options.unordered_write = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("bar", "v1"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer([&]() { ASSERT_OK(Put("foo", "v2")); });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL");
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  writer.join();
+  ASSERT_EQ(Get("foo"), "v2");
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Reopen(options);
+  ASSERT_EQ(Get("foo"), "v2");
+}
+
+TEST_F(DBCompactionTest, DeleteFileRange) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  // Add 2 non-overlapping files
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file 1 [0 => 100]
+  for (int32_t i = 0; i < 100; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 2 [100 => 300]
+  for (int32_t i = 100; i < 300; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L0
+  ASSERT_EQ("2", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  // 2 files in L2
+  ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  // file 3 [ 0 => 200]
+  for (int32_t i = 0; i < 200; i++) {
+    values[i] = rnd.RandomString(value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // Many files 4 [300 => 4300)
+  for (int32_t i = 0; i <= 5; i++) {
+    for (int32_t j = 300; j < 4300; j++) {
+      if (j == 2300) {
+        ASSERT_OK(Flush());
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify level sizes
+  uint64_t target_size = 4 * options.max_bytes_for_level_base;
+  for (int32_t i = 1; i < options.num_levels; i++) {
+    ASSERT_LE(SizeAtLevel(i), target_size);
+    target_size = static_cast<uint64_t>(target_size *
+                                        options.max_bytes_for_level_multiplier);
+  }
+
+  const size_t old_num_files = CountFiles();
+  std::string begin_string = Key(1000);
+  std::string end_string = Key(2000);
+  Slice begin(begin_string);
+  Slice end(end_string);
+  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+
+  int32_t deleted_count = 0;
+  for (int32_t i = 0; i < 4300; i++) {
+    if (i < 1000 || i > 2000) {
+      ASSERT_EQ(Get(Key(i)), values[i]);
+    } else {
+      ReadOptions roptions;
+      std::string result;
+      Status s = db_->Get(roptions, Key(i), &result);
+      ASSERT_TRUE(s.IsNotFound() || s.ok());
+      if (s.IsNotFound()) {
+        deleted_count++;
+      }
+    }
+  }
+  ASSERT_GT(deleted_count, 0);
+  begin_string = Key(5000);
+  end_string = Key(6000);
+  Slice begin1(begin_string);
+  Slice end1(end_string);
+  // Try deleting files in range which contain no keys
+  ASSERT_OK(
+      DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
+
+  // Push data from level 0 to level 1 to force all data to be deleted
+  // Note that we don't delete level 0 files
+  compact_options.change_level = true;
+  compact_options.target_level = 1;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_OK(
+      DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
+
+  int32_t deleted_count2 = 0;
+  for (int32_t i = 0; i < 4300; i++) {
+    ReadOptions roptions;
+    std::string result;
+    ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound());
+    deleted_count2++;
+  }
+  ASSERT_GT(deleted_count2, deleted_count);
+  const size_t new_num_files = CountFiles();
+  ASSERT_GT(old_num_files, new_num_files);
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRanges) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 4;
+  options.max_background_compactions = 3;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file [0 => 100), [100 => 200), ... [900, 1000)
+  for (auto i = 0; i < 10; i++) {
+    for (auto j = 0; j < 100; j++) {
+      auto k = i * 100 + j;
+      values[k] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("10", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,10", FilesPerLevel(0));
+
+  // file [0 => 100), [200 => 300), ... [800, 900)
+  for (auto i = 0; i < 10; i += 2) {
+    for (auto j = 0; j < 100; j++) {
+      auto k = i * 100 + j;
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("5,0,10", FilesPerLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_EQ("0,5,10", FilesPerLevel(0));
+
+  // Delete files in range [0, 299] (inclusive)
+  {
+    auto begin_str1 = Key(0), end_str1 = Key(100);
+    auto begin_str2 = Key(100), end_str2 = Key(200);
+    auto begin_str3 = Key(200), end_str3 = Key(299);
+    Slice begin1(begin_str1), end1(end_str1);
+    Slice begin2(begin_str2), end2(end_str2);
+    Slice begin3(begin_str3), end3(end_str3);
+    std::vector<RangePtr> ranges;
+    ranges.push_back(RangePtr(&begin1, &end1));
+    ranges.push_back(RangePtr(&begin2, &end2));
+    ranges.push_back(RangePtr(&begin3, &end3));
+    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+                                  ranges.data(), ranges.size()));
+    ASSERT_EQ("0,3,7", FilesPerLevel(0));
+
+    // Keys [0, 300) should not exist.
+    for (auto i = 0; i < 300; i++) {
+      ReadOptions ropts;
+      std::string result;
+      auto s = db_->Get(ropts, Key(i), &result);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+    for (auto i = 300; i < 1000; i++) {
+      ASSERT_EQ(Get(Key(i)), values[i]);
+    }
+  }
+
+  // Delete files in range [600, 999) (exclusive)
+  {
+    auto begin_str1 = Key(600), end_str1 = Key(800);
+    auto begin_str2 = Key(700), end_str2 = Key(900);
+    auto begin_str3 = Key(800), end_str3 = Key(999);
+    Slice begin1(begin_str1), end1(end_str1);
+    Slice begin2(begin_str2), end2(end_str2);
+    Slice begin3(begin_str3), end3(end_str3);
+    std::vector<RangePtr> ranges;
+    ranges.push_back(RangePtr(&begin1, &end1));
+    ranges.push_back(RangePtr(&begin2, &end2));
+    ranges.push_back(RangePtr(&begin3, &end3));
+    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+                                  ranges.data(), ranges.size(), false));
+    ASSERT_EQ("0,1,4", FilesPerLevel(0));
+
+    // Keys [600, 900) should not exist.
+    for (auto i = 600; i < 900; i++) {
+      ReadOptions ropts;
+      std::string result;
+      auto s = db_->Get(ropts, Key(i), &result);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+    for (auto i = 300; i < 600; i++) {
+      ASSERT_EQ(Get(Key(i)), values[i]);
+    }
+    for (auto i = 900; i < 1000; i++) {
+      ASSERT_EQ(Get(Key(i)), values[i]);
+    }
+  }
+
+  // Delete all files.
+  {
+    RangePtr range;
+    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
+    ASSERT_EQ("", FilesPerLevel(0));
+
+    for (auto i = 0; i < 1000; i++) {
+      ReadOptions ropts;
+      std::string result;
+      auto s = db_->Get(ropts, Key(i), &result);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
+  // regression test for #2833: groups of files whose user-keys overlap at the
+  // endpoints could be split by `DeleteFilesInRange`. This caused old data to
+  // reappear, either because a new version of the key was removed, or a range
+  // deletion was partially dropped. It could also cause non-overlapping
+  // invariant to be violated if the files dropped by DeleteFilesInRange were
+  // a subset of files that a range deletion spans.
+  const int kNumL0Files = 2;
+  const int kValSize = 8 << 10;  // 8KB
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  DestroyAndReopen(options);
+
+  // The snapshot prevents key 1 from having its old version dropped. The low
+  // `target_file_size_base` ensures two keys will be in each output file.
+  const Snapshot* snapshot = nullptr;
+  Random rnd(301);
+  // The value indicates which flush the key belonged to, which is enough
+  // for us to determine the keys' relative ages. After L0 flushes finish,
+  // files look like:
+  //
+  // File 0: 0 -> vals[0], 1 -> vals[0]
+  // File 1:               1 -> vals[1], 2 -> vals[1]
+  //
+  // Then L0->L1 compaction happens, which outputs keys as follows:
+  //
+  // File 0: 0 -> vals[0], 1 -> vals[1]
+  // File 1:               1 -> vals[0], 2 -> vals[1]
+  //
+  // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
+  // would cause `1 -> vals[0]` (an older key) to reappear.
+  std::string vals[kNumL0Files];
+  for (int i = 0; i < kNumL0Files; ++i) {
+    vals[i] = rnd.RandomString(kValSize);
+    ASSERT_OK(Put(Key(i), vals[i]));
+    ASSERT_OK(Put(Key(i + 1), vals[i]));
+    ASSERT_OK(Flush());
+    if (i == 0) {
+      snapshot = db_->GetSnapshot();
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify `DeleteFilesInRange` can't drop only file 0 which would cause
+  // "1 -> vals[0]" to reappear.
+  std::string begin_str = Key(0), end_str = Key(1);
+  Slice begin = begin_str, end = end_str;
+  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_EQ(vals[1], Get(Key(1)));
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  // File with keys [ 0 => 99 ]
+  for (int i = 0; i < 100; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 3;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // File with keys [ 100 => 199 ]
+  for (int i = 100; i < 200; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+  // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 4);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  for (int i = 0; i < 200; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to 400K file to fill up first path
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 1)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,1", FilesPerLevel(0));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 2)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,2", FilesPerLevel(0));
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 3)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,3", FilesPerLevel(0));
+  ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 5)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,5", FilesPerLevel(0));
+  ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 6)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,6", FilesPerLevel(0));
+  ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 7)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,7", FilesPerLevel(0));
+  ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+  ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Always gets compacted into 1 Level1 file,
+  // 0/1 Level 0 file
+  for (int num = 0; num < 3; num++) {
+    key_idx = 0;
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+
+  std::vector<Options> option_vector;
+  option_vector.emplace_back(options);
+  ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+  // Configure CF1 specific paths.
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024);
+  option_vector.emplace_back(DBOptions(options), cf_opt1);
+  CreateColumnFamilies({"one"}, option_vector[1]);
+
+  // Configure CF2 specific paths.
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024);
+  option_vector.emplace_back(DBOptions(options), cf_opt2);
+  CreateColumnFamilies({"two"}, option_vector[2]);
+
+  ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+  Random rnd(301);
+  int key_idx = 0;
+  int key_idx1 = 0;
+  int key_idx2 = 0;
+
+  auto generate_file = [&]() {
+    GenerateNewFile(0, &rnd, &key_idx);
+    GenerateNewFile(1, &rnd, &key_idx1);
+    GenerateNewFile(2, &rnd, &key_idx2);
+  };
+
+  auto check_sstfilecount = [&](int path_id, int expected) {
+    ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+    ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+    ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+  };
+
+  auto check_filesperlevel = [&](const std::string& expected) {
+    ASSERT_EQ(expected, FilesPerLevel(0));
+    ASSERT_EQ(expected, FilesPerLevel(1));
+    ASSERT_EQ(expected, FilesPerLevel(2));
+  };
+
+  auto check_getvalues = [&]() {
+    for (int i = 0; i < key_idx; i++) {
+      auto v = Get(0, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+
+    for (int i = 0; i < key_idx1; i++) {
+      auto v = Get(1, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+
+    for (int i = 0; i < key_idx2; i++) {
+      auto v = Get(2, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+  };
+
+  // Check that default column family uses db_paths.
+  // And Column family "one" uses cf_paths.
+
+  // The compaction in level0 outputs the sst files in level1.
+  // The first path cannot hold level1's data(400KB+400KB > 500KB),
+  // so every compaction move a sst file to second path. Please
+  // refer to LevelCompactionBuilder::GetPathId.
+  for (int num = 0; num < 3; num++) {
+    generate_file();
+  }
+  check_sstfilecount(0, 1);
+  check_sstfilecount(1, 2);
+
+  generate_file();
+  check_sstfilecount(1, 3);
+
+  // (1, 4)
+  generate_file();
+  check_filesperlevel("1,4");
+  check_sstfilecount(1, 4);
+  check_sstfilecount(0, 1);
+
+  // (1, 4, 1)
+  generate_file();
+  check_filesperlevel("1,4,1");
+  check_sstfilecount(2, 1);
+  check_sstfilecount(1, 4);
+  check_sstfilecount(0, 1);
+
+  // (1, 4, 2)
+  generate_file();
+  check_filesperlevel("1,4,2");
+  check_sstfilecount(2, 2);
+  check_sstfilecount(1, 4);
+  check_sstfilecount(0, 1);
+
+  check_getvalues();
+
+  {  // Also verify GetLiveFilesStorageInfo with db_paths / cf_paths
+    std::vector<LiveFileStorageInfo> new_infos;
+    LiveFilesStorageInfoOptions lfsio;
+    lfsio.wal_size_for_flush = UINT64_MAX;  // no flush
+    ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsio, &new_infos));
+    std::unordered_map<std::string, int> live_sst_by_dir;
+    for (auto& info : new_infos) {
+      if (info.file_type == kTableFile) {
+        live_sst_by_dir[info.directory]++;
+        // Verify file on disk (no directory confusion)
+        uint64_t size;
+        ASSERT_OK(env_->GetFileSize(
+            info.directory + "/" + info.relative_filename, &size));
+        ASSERT_EQ(info.size, size);
+      }
+    }
+    ASSERT_EQ(3U * 3U, live_sst_by_dir.size());
+    for (auto& paths : {options.db_paths, cf_opt1.cf_paths, cf_opt2.cf_paths}) {
+      ASSERT_EQ(1, live_sst_by_dir[paths[0].path]);
+      ASSERT_EQ(4, live_sst_by_dir[paths[1].path]);
+      ASSERT_EQ(2, live_sst_by_dir[paths[2].path]);
+    }
+  }
+
+  ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+  check_getvalues();
+
+  Destroy(options, true);
+}
+
+TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) {
+  Random rnd(301);
+  int max_key_level_insert = 200;
+  int max_key_universal_insert = 600;
+
+  // Stage 1: generate a db with level compaction
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_bytes_for_level_base = 500 << 10;  // 500KB
+  options.max_bytes_for_level_multiplier = 1;
+  options.target_file_size_base = 200 << 10;  // 200KB
+  options.target_file_size_multiplier = 1;
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i <= max_key_level_insert; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(TotalTableFiles(1, 4), 1);
+  int non_level0_num_files = 0;
+  for (int i = 1; i < options.num_levels; i++) {
+    non_level0_num_files += NumTableFilesAtLevel(i, 1);
+  }
+  ASSERT_GT(non_level0_num_files, 0);
+
+  // Stage 2: reopen with universal compaction - should fail
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Stage 3: compact into a single file and move the file to level 0
+  options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = INT_MAX;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = INT_MAX;
+  options.max_bytes_for_level_multiplier = 1;
+  options.num_levels = 4;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 0;
+  // cannot use kForceOptimized here because the compaction here is expected
+  // to generate one output file
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForce;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(
+      dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+  // Only 1 file in L0
+  ASSERT_EQ("1", FilesPerLevel(1));
+
+  // Stage 4: re-open in universal compaction style and do some db operations
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 4;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 3;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  options.num_levels = 1;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+    ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+  }
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  for (int i = 1; i < options.num_levels; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+
+  // verify keys inserted in both level compaction style and universal
+  // compaction style
+  std::string keys_in_db;
+  Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+  ASSERT_OK(iter->status());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    keys_in_db.append(iter->key().ToString());
+    keys_in_db.push_back(',');
+  }
+  delete iter;
+
+  std::string expected_keys;
+  for (int i = 0; i <= max_key_universal_insert; i++) {
+    expected_keys.append(Key(i));
+    expected_keys.push_back(',');
+  }
+
+  ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "b", "v"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Delete(1, "b"));
+    ASSERT_OK(Delete(1, "a"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Delete(1, "a"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "a", "v"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("(a->v)", Contents(1));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(a->v)", Contents(1));
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "", ""));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Delete(1, "e"));
+    ASSERT_OK(Put(1, "", ""));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "c", "cv"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "", ""));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "", ""));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "d", "dv"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "", ""));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Delete(1, "d"));
+    ASSERT_OK(Delete(1, "b"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, ManualAutoRace) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
+       {"DBImpl::RunManualCompaction:WaitScheduled",
+        "BackgroundCallCompaction:0"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(1, "foo", ""));
+  ASSERT_OK(Put(1, "bar", ""));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "foo", ""));
+  ASSERT_OK(Put(1, "bar", ""));
+  // Generate four files in CF 0, which should trigger an auto compaction
+  ASSERT_OK(Put("foo", ""));
+  ASSERT_OK(Put("bar", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", ""));
+  ASSERT_OK(Put("bar", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", ""));
+  ASSERT_OK(Put("bar", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", ""));
+  ASSERT_OK(Put("bar", ""));
+  ASSERT_OK(Flush());
+
+  // The auto compaction is scheduled but waited until here
+  TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
+  // The auto compaction will wait until the manual compaction is registerd
+  // before processing so that it will be cancelled.
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = true;
+  ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(1));
+
+  // Eventually the cancelled compaction will be rescheduled and executed.
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompaction) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = max_subcompactions_;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q", 1);
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p", "q");
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    // Populate a different range
+    MakeTables(3, "c", "e", 1);
+    ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f");
+    ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+    // Compact all
+    MakeTables(1, "a", "z", 1);
+    ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+    uint64_t prev_block_cache_add =
+        options.statistics->getTickerCount(BLOCK_CACHE_ADD);
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+    ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
+    // Verify manual compaction doesn't fill block cache
+    ASSERT_EQ(prev_block_cache_add,
+              options.statistics->getTickerCount(BLOCK_CACHE_ADD));
+
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    if (iter == 0) {
+      options = CurrentOptions();
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"pikachu"}, options);
+    }
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+  options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+  options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    for (int i = 0; i < 3; ++i) {
+      ASSERT_OK(Put(1, "p", "begin"));
+      ASSERT_OK(Put(1, "q", "end"));
+      ASSERT_OK(Flush(1));
+    }
+    ASSERT_EQ("3", FilesPerLevel(1));
+    ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("3", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("3", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p", "q", 1);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ("0,1", FilesPerLevel(1));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Populate a different range
+    for (int i = 0; i < 3; ++i) {
+      ASSERT_OK(Put(1, "c", "begin"));
+      ASSERT_OK(Put(1, "e", "end"));
+      ASSERT_OK(Flush(1));
+    }
+    ASSERT_EQ("3,1", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f", 1);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ("0,2", FilesPerLevel(1));
+    ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Compact all
+    ASSERT_OK(Put(1, "a", "begin"));
+    ASSERT_OK(Put(1, "z", "end"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("1,2", FilesPerLevel(1));
+    ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+    CompactRangeOptions compact_options;
+    compact_options.target_path_id = 1;
+    compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+    ASSERT_OK(
+        db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ("0,1", FilesPerLevel(1));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    if (iter == 0) {
+      DestroyAndReopen(options);
+      options = CurrentOptions();
+      options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+      options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+      options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+      options.max_background_flushes = 1;
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      CreateAndReopenWithCF({"pikachu"}, options);
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v2"));
+    Compact(1, "a", "z");
+    const size_t num_files = CountLiveFiles();
+    for (int i = 0; i < 10; i++) {
+      ASSERT_OK(Put(1, "foo", "v2"));
+      Compact(1, "a", "z");
+    }
+    ASSERT_EQ(CountLiveFiles(), num_files);
+  } while (ChangeCompactOptions());
+}
+
+// Check level comapction with compact files
+TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.level0_stop_writes_trigger = 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize)));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
+  for (int file_picked = 5; file_picked > 0; --file_picked) {
+    std::set<std::string> overlapping_file_names;
+    std::vector<std::string> compaction_input_file_names;
+    for (int f = 0; f < file_picked; ++f) {
+      int level = 0;
+      auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
+      compaction_input_file_names.push_back(file_meta->name);
+      GetOverlappingFileNumbersForLevelCompaction(
+          cf_meta, options.comparator, level, output_level, file_meta,
+          &overlapping_file_names);
+    }
+
+    ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+                                     compaction_input_file_names,
+                                     output_level));
+
+    // Make sure all overlapping files do not exist after compaction
+    dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+    VerifyCompactionResult(cf_meta, overlapping_file_names);
+  }
+
+  // make sure all key-values are still there.
+  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_NE(Get(1, std::to_string(key)), "NOT_FOUND");
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) {
+  Options options;
+  const int kKeySize = 16;
+  const int kKvSize = 1000;
+  const int kKeysPerBuffer = 100;
+  const int kNumL1Files = 5;
+  options.create_if_missing = true;
+  options.write_buffer_size = kKeysPerBuffer * kKvSize;
+  options.max_write_buffer_number = 2;
+  options.target_file_size_base =
+      options.write_buffer_size * (options.max_write_buffer_number - 1);
+  options.level0_file_num_compaction_trigger = kNumL1Files;
+  options.max_bytes_for_level_base =
+      options.level0_file_num_compaction_trigger *
+      options.target_file_size_base;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options.max_subcompactions = max_subcompactions_;
+
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  // stop the compaction thread until we simulate the file creation failure.
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  options.env = env_;
+
+  DestroyAndReopen(options);
+
+  const int kNumInsertedKeys = options.level0_file_num_compaction_trigger *
+                               (options.max_write_buffer_number - 1) *
+                               kKeysPerBuffer;
+
+  Random rnd(301);
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    keys.emplace_back(rnd.RandomString(kKeySize));
+    values.emplace_back(rnd.RandomString(kKvSize - kKeySize));
+    ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  // Make sure the number of L0 files can trigger compaction.
+  ASSERT_GE(NumTableFilesAtLevel(0),
+            options.level0_file_num_compaction_trigger);
+
+  auto previous_num_level0_files = NumTableFilesAtLevel(0);
+
+  // Fail the first file creation.
+  env_->non_writable_count_ = 1;
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  // Expect compaction to fail here as one file will fail its
+  // creation.
+  ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+
+  // Verify L0 -> L1 compaction does fail.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  // Verify all L0 files are still there.
+  ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+
+  // All key-values must exist after compaction fails.
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    ASSERT_EQ(values[k], Get(keys[k]));
+  }
+
+  env_->non_writable_count_ = 0;
+
+  // Make sure RocksDB will not get into corrupted state.
+  Reopen(options);
+
+  // Verify again after reopen.
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    ASSERT_EQ(values[k], Get(keys[k]));
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) {
+  // iter 1 -- delete_obsolete_files_period_micros == 0
+  for (int iter = 0; iter < 2; ++iter) {
+    // This test triggers move compaction and verifies that the file is not
+    // deleted when it's part of move compaction
+    Options options = CurrentOptions();
+    options.env = env_;
+    if (iter == 1) {
+      options.delete_obsolete_files_period_micros = 0;
+    }
+    options.create_if_missing = true;
+    options.level0_file_num_compaction_trigger =
+        2;  // trigger compaction when we have 2 files
+    OnFileDeletionListener* listener = new OnFileDeletionListener();
+    options.listeners.emplace_back(listener);
+    options.max_subcompactions = max_subcompactions_;
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    // Create two 1MB sst files
+    for (int i = 0; i < 2; ++i) {
+      // Create 1MB sst file
+      for (int j = 0; j < 100; ++j) {
+        ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // this should execute L0->L1
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+
+    // block compactions
+    test::SleepingBackgroundTask sleeping_task;
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                   Env::Priority::LOW);
+
+    options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+    Reopen(options);
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+    // let compactions go
+    sleeping_task.WakeUp();
+    sleeping_task.WaitUntilDone();
+
+    // this should execute L1->L2 (move)
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    ASSERT_EQ(metadata.size(), 1U);
+    auto moved_file_name = metadata[0].name;
+
+    // Create two more 1MB sst files
+    for (int i = 0; i < 2; ++i) {
+      // Create 1MB sst file
+      for (int j = 0; j < 100; ++j) {
+        ASSERT_OK(Put(Key(i * 50 + j + 100), rnd.RandomString(10 * 1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // this should execute both L0->L1 and L1->L2 (merge with previous file)
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+    // iterator is holding the file
+    ASSERT_OK(env_->FileExists(dbname_ + moved_file_name));
+
+    listener->SetExpectedFileName(dbname_ + moved_file_name);
+    ASSERT_OK(iterator->status());
+    iterator.reset();
+
+    // this file should have been compacted away
+    ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name));
+    listener->VerifyMatchedCount(1);
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) {
+  if (!Zlib_Supported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+  // First two levels have no compression, so that a trivial move between
+  // them will be allowed. Level 2 has Zlib compression so that a trivial
+  // move to level 3 will not be allowed
+  options.compression_per_level = {kNoCompression, kNoCompression,
+                                   kZlibCompression};
+  int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::InputCompressionMatchesOutput:Matches",
+      [&](void* /*arg*/) { matches++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::InputCompressionMatchesOutput:DidntMatch",
+      [&](void* /*arg*/) { didnt_match++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are going to level 0
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to 400K file to fill up level 0
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(4, GetSstFileCount(dbname_));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+
+  // (1, 4, 1)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,1", FilesPerLevel(0));
+
+  // (1, 4, 2)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,2", FilesPerLevel(0));
+
+  // (1, 4, 3)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,3", FilesPerLevel(0));
+
+  // (1, 4, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+  // (1, 4, 5)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,5", FilesPerLevel(0));
+
+  // (1, 4, 6)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,6", FilesPerLevel(0));
+
+  // (1, 4, 7)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,7", FilesPerLevel(0));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+  ASSERT_EQ(matches, 12);
+  // Currently, the test relies on the number of calls to
+  // InputCompressionMatchesOutput() per compaction.
+  const int kCallsToInputCompressionMatch = 2;
+  ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
+  ASSERT_EQ(trivial_move, 12);
+  ASSERT_EQ(non_trivial, 8);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) {
+  Options options = CurrentOptions();
+  options.max_background_compactions = 5;
+  options.soft_pending_compaction_bytes_limit = 0;
+  options.hard_pending_compaction_bytes_limit = 100;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit);
+
+  options.max_background_compactions = 3;
+  options.soft_pending_compaction_bytes_limit = 200;
+  options.hard_pending_compaction_bytes_limit = 150;
+  DestroyAndReopen(options);
+  ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit);
+}
+
+// This tests for a bug that could cause two level0 compactions running
+// concurrently
+// TODO(aekmekji): Make sure that the reason this fails when run with
+// max_subcompactions > 1 is not a correctness issue but just inherent to
+// running parallel L0-L1 compactions
+TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_write_buffer_number = 2;
+  options.max_background_compactions = 2;
+
+  DestroyAndReopen(options);
+
+  // fill up the DB
+  Random rnd(301);
+  for (int num = 0; num < 10; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"CompactionJob::Run():Start",
+        "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
+       {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
+        "CompactionJob::Run():End"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // trigger L0 compaction
+  for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+       num++) {
+    GenerateNewRandomFile(&rnd, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
+
+  GenerateNewRandomFile(&rnd, /* nowait */ true);
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+       num++) {
+    GenerateNewRandomFile(&rnd, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+static std::string ShortKey(int i) {
+  assert(i < 10000);
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key%04d", i);
+  return std::string(buf);
+}
+
+TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // The key size is guaranteed to be <= 8
+  class ShortKeyComparator : public Comparator {
+    int Compare(const ROCKSDB_NAMESPACE::Slice& a,
+                const ROCKSDB_NAMESPACE::Slice& b) const override {
+      assert(a.size() <= 8);
+      assert(b.size() <= 8);
+      return BytewiseComparator()->Compare(a, b);
+    }
+    const char* Name() const override { return "ShortKeyComparator"; }
+    void FindShortestSeparator(
+        std::string* start,
+        const ROCKSDB_NAMESPACE::Slice& limit) const override {
+      return BytewiseComparator()->FindShortestSeparator(start, limit);
+    }
+    void FindShortSuccessor(std::string* key) const override {
+      return BytewiseComparator()->FindShortSuccessor(key);
+    }
+  } short_key_cmp;
+  Options options = CurrentOptions();
+  options.target_file_size_base = 100000000;
+  options.write_buffer_size = 100000000;
+  options.max_subcompactions = max_subcompactions_;
+  options.comparator = &short_key_cmp;
+  DestroyAndReopen(options);
+
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  // File with keys [ 0 => 99 ]
+  for (int i = 0; i < 100; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(ShortKey(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 3;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // File with keys [ 100 => 199 ]
+  for (int i = 100; i < 200; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(ShortKey(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+  // then compacte the bottommost level L3=>L3 (non trivial move)
+  compact_options = CompactRangeOptions();
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 4);
+  ASSERT_EQ(non_trivial_move, 1);
+
+  // File with keys [ 200 => 299 ]
+  for (int i = 200; i < 300; i++) {
+    values.push_back(rnd.RandomString(value_size));
+    ASSERT_OK(Put(ShortKey(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+  trivial_move = 0;
+  non_trivial_move = 0;
+  compact_options = CompactRangeOptions();
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kSkip;
+  // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+  // and will skip bottommost level compaction
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 3);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  for (int i = 0; i < 300; i++) {
+    ASSERT_EQ(Get(ShortKey(i)), values[i]);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0Compaction) {
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.write_buffer_size = 2 << 20;  // 2MB
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache = NewLRUCache(64 << 20);  // 64MB
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::string value(rnd.RandomString(kValueSize));
+
+  // The L0->L1 must be picked before we begin flushing files to trigger
+  // intra-L0 compaction, and must not finish until after an intra-L0
+  // compaction has been picked.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"LevelCompactionPicker::PickCompaction:Return",
+        "DBCompactionTest::IntraL0Compaction:L0ToL1Ready"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // index:   0   1   2   3   4   5   6   7   8   9
+  // size:  1MB 1MB 1MB 1MB 1MB 2MB 1MB 1MB 1MB 1MB
+  // score:                     1.5 1.3 1.5 2.0 inf
+  //
+  // Files 0-4 will be included in an L0->L1 compaction.
+  //
+  // L0->L0 will be triggered since the sync points guarantee compaction to base
+  // level is still blocked when files 5-9 trigger another compaction.
+  //
+  // Files 6-9 are the longest span of available files for which
+  // work-per-deleted-file decreases (see "score" row above).
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(0), ""));  // prevents trivial move
+    if (i == 5) {
+      TEST_SYNC_POINT("DBCompactionTest::IntraL0Compaction:L0ToL1Ready");
+      ASSERT_OK(Put(Key(i + 1), value + value));
+    } else {
+      ASSERT_OK(Put(Key(i + 1), value));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_GE(level_to_files.size(), 2);  // at least L0 and L1
+  // L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0)
+  ASSERT_EQ(2, level_to_files[0].size());
+  ASSERT_GT(level_to_files[1].size(), 0);
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21);
+  }
+
+  // The index/filter in the file produced by intra-L0 should not be pinned.
+  // That means clearing unref'd entries in block cache and re-accessing the
+  // file produced by intra-L0 should bump the index block miss count.
+  uint64_t prev_index_misses =
+      TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+  table_options.block_cache->EraseUnRefEntries();
+  ASSERT_EQ("", Get(Key(0)));
+  ASSERT_EQ(prev_index_misses + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
+  // regression test for issue #2722: L0->L0 compaction can resurrect deleted
+  // keys from older L0 files if L1+ files' key-ranges do not include the key.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::string value(rnd.RandomString(kValueSize));
+
+  // The L0->L1 must be picked before we begin flushing files to trigger
+  // intra-L0 compaction, and must not finish until after an intra-L0
+  // compaction has been picked.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"LevelCompactionPicker::PickCompaction:Return",
+        "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
+        "L0ToL1Ready"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // index:   0   1   2   3   4    5    6   7   8   9
+  // size:  1MB 1MB 1MB 1MB 1MB  1MB  1MB 1MB 1MB 1MB
+  // score:                     1.25 1.33 1.5 2.0 inf
+  //
+  // Files 0-4 will be included in an L0->L1 compaction.
+  //
+  // L0->L0 will be triggered since the sync points guarantee compaction to base
+  // level is still blocked when files 5-9 trigger another compaction. All files
+  // 5-9 are included in the L0->L0 due to work-per-deleted file decreasing.
+  //
+  // Put a key-value in files 0-4. Delete that key in files 5-9. Verify the
+  // L0->L0 preserves the deletion such that the key remains deleted.
+  for (int i = 0; i < 10; ++i) {
+    // key 0 serves both to prevent trivial move and as the key we want to
+    // verify is not resurrected by L0->L0 compaction.
+    if (i < 5) {
+      ASSERT_OK(Put(Key(0), ""));
+    } else {
+      ASSERT_OK(Delete(Key(0)));
+    }
+    if (i == 5) {
+      TEST_SYNC_POINT(
+          "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
+          "L0ToL1Ready");
+    }
+    ASSERT_OK(Put(Key(i + 1), value));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_GE(level_to_files.size(), 2);  // at least L0 and L1
+  // L0 has a single output file from L0->L0
+  ASSERT_EQ(1, level_to_files[0].size());
+  ASSERT_GT(level_to_files[1].size(), 0);
+  ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22);
+
+  ReadOptions roptions;
+  std::string result;
+  ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound());
+}
+
+TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
+  const int kNumFilesTrigger = 3;
+  Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+  for (bool use_universal_compaction : {false, true}) {
+    Options options = CurrentOptions();
+    if (use_universal_compaction) {
+      options.compaction_style = kCompactionStyleUniversal;
+    } else {
+      options.compaction_style = kCompactionStyleLevel;
+      options.level_compaction_dynamic_level_bytes = true;
+    }
+    options.num_levels = 4;
+    options.write_buffer_size = 100 << 10;     // 100KB
+    options.target_file_size_base = 32 << 10;  // 32KB
+    options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+    // Trigger compaction if size amplification exceeds 110%
+    options.compaction_options_universal.max_size_amplification_percent = 110;
+    DestroyAndReopen(options);
+
+    int num_bottom_pri_compactions = 0;
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::BGWorkBottomCompaction",
+        [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    Random rnd(301);
+    for (int num = 0; num < kNumFilesTrigger; num++) {
+      ASSERT_EQ(NumSortedRuns(), num);
+      int key_idx = 0;
+      GenerateNewFile(&rnd, &key_idx);
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ(1, num_bottom_pri_compactions);
+
+    // Verify that size amplification did occur
+    ASSERT_EQ(NumSortedRuns(), 1);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+  Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) {
+  // Deletions can be dropped when compacted to non-last level if they fall
+  // outside the lower-level files' key-ranges.
+  const int kNumL0Files = 4;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  // put key 1 and 3 in separate L1, L2 files.
+  // So key 0, 2, and 4+ fall outside these levels' key-ranges.
+  for (int level = 2; level >= 1; --level) {
+    for (int i = 0; i < 2; ++i) {
+      ASSERT_OK(Put(Key(2 * i + 1), "val"));
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(level);
+    ASSERT_EQ(2, NumTableFilesAtLevel(level));
+  }
+
+  // Delete keys in range [1, 4]. These L0 files will be compacted with L1:
+  // - Tombstones for keys 2 and 4 can be dropped early.
+  // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges.
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put(Key(0), "val"));  // sentinel to prevent trivial move
+    ASSERT_OK(Delete(Key(i + 1)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  for (int i = 0; i < kNumL0Files; ++i) {
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound());
+  }
+  ASSERT_EQ(2, options.statistics->getTickerCount(
+                   COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE));
+  ASSERT_EQ(2,
+            options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE));
+}
+
+TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) {
+  // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/
+  // CompactFiles() had a bug where it failed to pick a compaction when an L0
+  // compaction existed, but marked it as scheduled anyways. It'd never be
+  // unmarked as scheduled, so future compactions or DB close could hang.
+  const int kNumL0Files = 5;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files - 1;
+  options.max_background_compactions = 2;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"LevelCompactionPicker::PickCompaction:Return",
+        "DBCompactionTest::CompactFilesPendingL0Bug:Picked"},
+       {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted",
+        "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  auto schedule_multi_compaction_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  // Files 0-3 will be included in an L0->L1 compaction.
+  //
+  // File 4 will be included in a call to CompactFiles() while the first
+  // compaction is running.
+  for (int i = 0; i < kNumL0Files - 1; ++i) {
+    ASSERT_OK(Put(Key(0), "val"));  // sentinel to prevent trivial move
+    ASSERT_OK(Put(Key(i + 1), "val"));
+    ASSERT_OK(Flush());
+  }
+  TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked");
+  // file 4 flushed after 0-3 picked
+  ASSERT_OK(Put(Key(kNumL0Files), "val"));
+  ASSERT_OK(Flush());
+
+  // previously DB close would hang forever as this situation caused scheduled
+  // compactions count to never decrement to zero.
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+  ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size());
+  std::vector<std::string> input_filenames;
+  input_filenames.push_back(cf_meta.levels[0].files.front().name);
+  ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+                                   0 /* output_level */));
+  TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) {
+  // Regression test for bug of not pulling in L0 files that overlap the user-
+  // specified input files in time- and key-ranges.
+  ASSERT_OK(Put(Key(0), "old_val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(0), "new_val"));
+  ASSERT_OK(Flush());
+
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+  ASSERT_GE(cf_meta.levels.size(), 2);
+  ASSERT_EQ(2, cf_meta.levels[0].files.size());
+
+  // Compacting {new L0 file, L1 file} should pull in the old L0 file since it
+  // overlaps in key-range and time-range.
+  std::vector<std::string> input_filenames;
+  input_filenames.push_back(cf_meta.levels[0].files.front().name);
+  ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+                                   1 /* output_level */));
+  ASSERT_EQ("new_val", Get(Key(0)));
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  const Snapshot* snapshot = nullptr;
+  const int kMaxKey = 10;
+
+  for (int i = 0; i < kMaxKey; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+    ASSERT_OK(Delete(Key(i)));
+    if (!snapshot) {
+      snapshot = db_->GetSnapshot();
+    }
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey)));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // test DeleteFilesInRange() deletes the files already picked for compaction
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"VersionSet::LogAndApply:WriteManifestStart",
+        "BackgroundCallCompaction:0"},
+       {"DBImpl::BackgroundCompaction:Finish",
+        "VersionSet::LogAndApply:WriteManifestDone"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // release snapshot which mark bottommost file for compaction
+  db_->ReleaseSnapshot(snapshot);
+  std::string begin_string = Key(0);
+  std::string end_string = Key(kMaxKey + 1);
+  Slice begin(begin_string);
+  Slice end(end_string);
+  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
+  // bottom-level files may contain deletions due to snapshots protecting the
+  // deleted keys. Once the snapshot is released, we should see files with many
+  // such deletions undergo single-file compactions.
+  const int kNumKeysPerFile = 1024;
+  const int kNumLevelFiles = 4;
+  const int kValueSize = 128;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumLevelFiles;
+  // inflate it a bit to account for key/metadata overhead
+  options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+  CreateAndReopenWithCF({"one"}, options);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    if (i == kNumLevelFiles - 1) {
+      snapshot = db_->GetSnapshot();
+      // delete every other key after grabbing a snapshot, so these deletions
+      // and the keys they cover can't be dropped until after the snapshot is
+      // released.
+      for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+        ASSERT_OK(Delete(Key(j)));
+      }
+    }
+    ASSERT_OK(Flush());
+    if (i < kNumLevelFiles - 1) {
+      ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+  db_->GetLiveFilesMetaData(&pre_release_metadata);
+  // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+  // files does not need to be preserved in case of a future snapshot.
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+  // release snapshot and wait for compactions to finish. Single-file
+  // compactions should be triggered, which reduce the size of each bottom-level
+  // file without changing file count.
+  db_->ReleaseSnapshot(snapshot);
+  ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->compaction_reason() ==
+                    CompactionReason::kBottommostFiles);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  db_->GetLiveFilesMetaData(&post_release_metadata);
+  ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+
+  for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+    const auto& pre_file = pre_release_metadata[i];
+    const auto& post_file = post_release_metadata[i];
+    ASSERT_EQ(1, pre_file.level);
+    ASSERT_EQ(1, post_file.level);
+    // each file is smaller than it was before as it was rewritten without
+    // deletion markers/deleted keys.
+    ASSERT_LT(post_file.size, pre_file.size);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) {
+  // bottom-level files may contain deletions due to snapshots protecting the
+  // deleted keys. Once the snapshot is released, we should see files with many
+  // such deletions undergo single-file compactions. But when disabling auto
+  // compactions, it shouldn't be triggered which may causing too many
+  // background jobs.
+  const int kNumKeysPerFile = 1024;
+  const int kNumLevelFiles = 4;
+  const int kValueSize = 128;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumLevelFiles;
+  // inflate it a bit to account for key/metadata overhead
+  options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+  Reopen(options);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    if (i == kNumLevelFiles - 1) {
+      snapshot = db_->GetSnapshot();
+      // delete every other key after grabbing a snapshot, so these deletions
+      // and the keys they cover can't be dropped until after the snapshot is
+      // released.
+      for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+        ASSERT_OK(Delete(Key(j)));
+      }
+    }
+    ASSERT_OK(Flush());
+    if (i < kNumLevelFiles - 1) {
+      ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr));
+  ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+  db_->GetLiveFilesMetaData(&pre_release_metadata);
+  // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+  // files does not need to be preserved in case of a future snapshot.
+  ASSERT_OK(Put(Key(0), "val"));
+
+  // release snapshot and no compaction should be triggered.
+  std::atomic<int> num_compactions{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Start",
+      [&](void* /*arg*/) { num_compactions.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  db_->ReleaseSnapshot(snapshot);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, num_compactions);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  db_->GetLiveFilesMetaData(&post_release_metadata);
+  ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+  for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+    const auto& pre_file = pre_release_metadata[i];
+    const auto& post_file = post_release_metadata[i];
+    ASSERT_EQ(1, pre_file.level);
+    ASSERT_EQ(1, post_file.level);
+    // each file is same as before with deletion markers/deleted keys.
+    ASSERT_EQ(post_file.size, pre_file.size);
+  }
+}
+
+TEST_F(DBCompactionTest, RoundRobinTtlCompactionNormal) {
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 20;
+  options.ttl = 24 * 60 * 60;  // 24 hours
+  options.compaction_pri = kRoundRobin;
+  env_->now_cpu_count_.store(0);
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // add a small second for each wait time, to make sure the file is expired
+  int small_seconds = 1;
+
+  std::atomic_int ttl_compactions{0};
+  std::atomic_int round_robin_ttl_compactions{0};
+  std::atomic_int other_compactions{0};
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        auto compaction_reason = compaction->compaction_reason();
+        if (compaction_reason == CompactionReason::kTtl) {
+          ttl_compactions++;
+        } else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
+          round_robin_ttl_compactions++;
+        } else {
+          other_compactions++;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+
+  // Setup the files from lower level to up level, each file is 1 hour's older
+  // than the next one.
+  // create 10 files on the last level (L6)
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);  // generate 1 file per hour
+  }
+  MoveFilesToLevel(6);
+
+  // create 5 files on L5
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 200; j++) {
+      ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);
+  }
+  MoveFilesToLevel(5);
+
+  // create 3 files on L4
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 300; j++) {
+      ASSERT_OK(Put(Key(i * 300 + j), "value" + std::to_string(i * 300 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);
+  }
+  MoveFilesToLevel(4);
+
+  // The LSM tree should be like:
+  // L4: [0,            299], [300,      599], [600,     899]
+  // L5: [0,  199]      [200,         399]...............[800, 999]
+  // L6: [0,99][100,199][200,299][300,399]...............[800,899][900,999]
+  ASSERT_EQ("0,0,0,0,3,5,10", FilesPerLevel());
+
+  // make sure the first L5 file is expired
+  env_->MockSleepForSeconds(16 * 60 * 60 + small_seconds++);
+
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(4), "value" + std::to_string(1)));
+  ASSERT_OK(Put(Key(5), "value" + std::to_string(1)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // verify there's a RoundRobin TTL compaction
+  ASSERT_EQ(1, round_robin_ttl_compactions);
+  round_robin_ttl_compactions = 0;
+
+  // expire 2 more files
+  env_->MockSleepForSeconds(2 * 60 * 60 + small_seconds++);
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(4), "value" + std::to_string(2)));
+  ASSERT_OK(Put(Key(5), "value" + std::to_string(2)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(2, round_robin_ttl_compactions);
+  round_robin_ttl_compactions = 0;
+
+  // expire 4 more files, 2 out of 3 files on L4 are expired
+  env_->MockSleepForSeconds(4 * 60 * 60 + small_seconds++);
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(6), "value" + std::to_string(3)));
+  ASSERT_OK(Put(Key(7), "value" + std::to_string(3)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(1, NumTableFilesAtLevel(4));
+  ASSERT_EQ(0, NumTableFilesAtLevel(5));
+
+  ASSERT_GT(round_robin_ttl_compactions, 0);
+  round_robin_ttl_compactions = 0;
+
+  // make the first L0 file expired, which triggers a normal TTL compaction
+  // instead of roundrobin TTL compaction, it will also include an extra file
+  // from L0 because of overlap
+  ASSERT_EQ(0, ttl_compactions);
+  env_->MockSleepForSeconds(19 * 60 * 60 + small_seconds++);
+
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+  ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // L0 -> L1 compaction is normal TTL compaction, L1 -> next levels compactions
+  // are RoundRobin TTL compaction.
+  ASSERT_GT(ttl_compactions, 0);
+  ttl_compactions = 0;
+  ASSERT_GT(round_robin_ttl_compactions, 0);
+  round_robin_ttl_compactions = 0;
+
+  // All files are expired, so only the last level has data
+  env_->MockSleepForSeconds(24 * 60 * 60);
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+  ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+  ASSERT_GT(ttl_compactions, 0);
+  ttl_compactions = 0;
+  ASSERT_GT(round_robin_ttl_compactions, 0);
+  round_robin_ttl_compactions = 0;
+
+  ASSERT_EQ(0, other_compactions);
+}
+
+TEST_F(DBCompactionTest, RoundRobinTtlCompactionUnsortedTime) {
+  // This is to test the case that the RoundRobin compaction cursor not pointing
+  // to the oldest file, RoundRobin compaction should still compact the file
+  // after cursor until all expired files are compacted.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 20;
+  options.ttl = 24 * 60 * 60;  // 24 hours
+  options.compaction_pri = kRoundRobin;
+  env_->now_cpu_count_.store(0);
+  env_->SetMockSleep();
+  options.env = env_;
+
+  std::atomic_int ttl_compactions{0};
+  std::atomic_int round_robin_ttl_compactions{0};
+  std::atomic_int other_compactions{0};
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        auto compaction_reason = compaction->compaction_reason();
+        if (compaction_reason == CompactionReason::kTtl) {
+          ttl_compactions++;
+        } else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
+          round_robin_ttl_compactions++;
+        } else {
+          other_compactions++;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+
+  // create 10 files on the last level (L6)
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);  // generate 1 file per hour
+  }
+  MoveFilesToLevel(6);
+
+  // create 5 files on L5
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 200; j++) {
+      ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
+    }
+    ASSERT_OK(Flush());
+    env_->MockSleepForSeconds(60 * 60);  // 1 hour
+  }
+  MoveFilesToLevel(5);
+
+  // The LSM tree should be like:
+  // L5: [0,  199]      [200,         399] [400,599] [600,799] [800, 999]
+  // L6: [0,99][100,199][200,299][300,399]....................[800,899][900,999]
+  ASSERT_EQ("0,0,0,0,0,5,10", FilesPerLevel());
+
+  // point the compaction cursor to the 4th file on L5
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+  VersionStorageInfo* storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+  const InternalKey split_cursor = InternalKey(Key(600), 100000, kTypeValue);
+  storage_info->AddCursorForOneLevel(5, split_cursor);
+
+  // make the first file on L5 expired, there should be 3 TTL compactions:
+  // 4th one, 5th one, then 1st one.
+  env_->MockSleepForSeconds(19 * 60 * 60 + 1);
+  // trigger TTL compaction
+  ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+  ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(2, NumTableFilesAtLevel(5));
+
+  ASSERT_EQ(3, round_robin_ttl_compactions);
+  ASSERT_EQ(0, ttl_compactions);
+  ASSERT_EQ(0, other_compactions);
+}
+
+TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 1024;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.ttl = 24 * 60 * 60;  // 24 hours
+  options.max_open_files = -1;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  MoveFilesToLevel(3);
+  ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+  // Delete previously written keys.
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("2,0,0,2", FilesPerLevel());
+  MoveFilesToLevel(1);
+  ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+  env_->MockSleepForSeconds(36 * 60 * 60);  // 36 hours
+  ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+  // Just do a simple write + flush so that the Ttl expired files get
+  // compacted.
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Flush());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // All non-L0 files are deleted, as they contained only deleted data.
+  ASSERT_EQ("1", FilesPerLevel());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Test dynamically changing ttl.
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  MoveFilesToLevel(3);
+  ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+  // Delete previously written keys.
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("2,0,0,2", FilesPerLevel());
+  MoveFilesToLevel(1);
+  ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+  // Move time forward by 12 hours, and make sure that compaction still doesn't
+  // trigger as ttl is set to 24 hours.
+  env_->MockSleepForSeconds(12 * 60 * 60);
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("1,2,0,2", FilesPerLevel());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Dynamically change ttl to 10 hours.
+  // This should trigger a ttl compaction, as 12 hours have already passed.
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // All non-L0 files are deleted, as they contained only deleted data.
+  ASSERT_EQ("1", FilesPerLevel());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
+  env_->SetMockSleep();
+  const int kValueSize = 100;
+
+  for (bool if_restart : {false, true}) {
+    for (bool if_open_all_files : {false, true}) {
+      Options options = CurrentOptions();
+      options.compression = kNoCompression;
+      options.ttl = 24 * 60 * 60;  // 24 hours
+      if (if_open_all_files) {
+        options.max_open_files = -1;
+      } else {
+        options.max_open_files = 20;
+      }
+      // RocksDB sanitize max open files to at least 20. Modify it back.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+            int* max_open_files = static_cast<int*>(arg);
+            *max_open_files = 2;
+          });
+      // In the case where all files are opened and doing DB restart
+      // forcing the oldest ancester time in manifest file to be 0 to
+      // simulate the case of reading from an old version.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) {
+            if (if_restart && if_open_all_files) {
+              std::string* encoded_fieled = static_cast<std::string*>(arg);
+              *encoded_fieled = "";
+              PutVarint64(encoded_fieled, 0);
+            }
+          });
+
+      options.env = env_;
+
+      // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+      DestroyAndReopen(options);
+
+      int ttl_compactions = 0;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+            Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+            auto compaction_reason = compaction->compaction_reason();
+            if (compaction_reason == CompactionReason::kTtl) {
+              ttl_compactions++;
+            }
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      // Add two L6 files with key ranges: [1 .. 100], [101 .. 200].
+      Random rnd(301);
+      for (int i = 1; i <= 100; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      // Get the first file's creation time. This will be the oldest file in the
+      // DB. Compactions inolving this file's descendents should keep getting
+      // this time.
+      std::vector<std::vector<FileMetaData>> level_to_files;
+      dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                      &level_to_files);
+      uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time;
+      // Add 1 hour and do another flush.
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      for (int i = 101; i <= 200; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      MoveFilesToLevel(6);
+      ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      // Add two L4 files with key ranges: [1 .. 50], [51 .. 150].
+      for (int i = 1; i <= 50; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      for (int i = 51; i <= 150; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      MoveFilesToLevel(4);
+      ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel());
+
+      env_->MockSleepForSeconds(1 * 60 * 60);
+      // Add one L1 file with key range: [26, 75].
+      for (int i = 26; i <= 75; ++i) {
+        ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      MoveFilesToLevel(1);
+      ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel());
+
+      // LSM tree:
+      // L1:         [26 .. 75]
+      // L4:     [1 .. 50][51 ..... 150]
+      // L6:     [1 ........ 100][101 .... 200]
+      //
+      // On TTL expiry, TTL compaction should be initiated on L1 file, and the
+      // compactions should keep going on until the key range hits bottom level.
+      // In other words: the compaction on this data range "cascasdes" until
+      // reaching the bottom level.
+      //
+      // Order of events on TTL expiry:
+      // 1. L1 file falls to L3 via 2 trivial moves which are initiated by the
+      // ttl
+      //    compaction.
+      // 2. A TTL compaction happens between L3 and L4 files. Output file in L4.
+      // 3. The new output file from L4 falls to L5 via 1 trival move initiated
+      //    by the ttl compaction.
+      // 4. A TTL compaction happens between L5 and L6 files. Ouptut in L6.
+
+      // Add 25 hours and do a write
+      env_->MockSleepForSeconds(25 * 60 * 60);
+
+      ASSERT_OK(Put(Key(1), "1"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+      ASSERT_EQ(5, ttl_compactions);
+
+      dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                      &level_to_files);
+      ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time);
+
+      env_->MockSleepForSeconds(25 * 60 * 60);
+      ASSERT_OK(Put(Key(2), "1"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+      ASSERT_GE(ttl_compactions, 6);
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
+  env_->SetMockSleep();
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  for (bool if_restart : {false, true}) {
+    for (bool if_open_all_files : {false, true}) {
+      Options options = CurrentOptions();
+      options.periodic_compaction_seconds = 48 * 60 * 60;  // 2 days
+      if (if_open_all_files) {
+        options.max_open_files = -1;  // needed for ttl compaction
+      } else {
+        options.max_open_files = 20;
+      }
+      // RocksDB sanitize max open files to at least 20. Modify it back.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+            int* max_open_files = static_cast<int*>(arg);
+            *max_open_files = 0;
+          });
+      // In the case where all files are opened and doing DB restart
+      // forcing the file creation time in manifest file to be 0 to
+      // simulate the case of reading from an old version.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) {
+            if (if_restart && if_open_all_files) {
+              std::string* encoded_fieled = static_cast<std::string*>(arg);
+              *encoded_fieled = "";
+              PutVarint64(encoded_fieled, 0);
+            }
+          });
+
+      options.env = env_;
+
+      // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+      DestroyAndReopen(options);
+
+      int periodic_compactions = 0;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+            Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+            auto compaction_reason = compaction->compaction_reason();
+            if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+              periodic_compactions++;
+            }
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      Random rnd(301);
+      for (int i = 0; i < kNumLevelFiles; ++i) {
+        for (int j = 0; j < kNumKeysPerFile; ++j) {
+          ASSERT_OK(
+              Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+        }
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+      ASSERT_EQ("2", FilesPerLevel());
+      ASSERT_EQ(0, periodic_compactions);
+
+      // Add 50 hours and do a write
+      env_->MockSleepForSeconds(50 * 60 * 60);
+      ASSERT_OK(Put("a", "1"));
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      // Assert that the files stay in the same level
+      ASSERT_EQ("3", FilesPerLevel());
+      // The two old files go through the periodic compaction process
+      ASSERT_EQ(2, periodic_compactions);
+
+      MoveFilesToLevel(1);
+      ASSERT_EQ("0,3", FilesPerLevel());
+
+      // Add another 50 hours and do another write
+      env_->MockSleepForSeconds(50 * 60 * 60);
+      ASSERT_OK(Put("b", "2"));
+      if (if_restart) {
+        Reopen(options);
+      } else {
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("1,3", FilesPerLevel());
+      // The three old files now go through the periodic compaction process. 2
+      // + 3.
+      ASSERT_EQ(5, periodic_compactions);
+
+      // Add another 50 hours and do another write
+      env_->MockSleepForSeconds(50 * 60 * 60);
+      ASSERT_OK(Put("c", "3"));
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ("2,3", FilesPerLevel());
+      // The four old files now go through the periodic compaction process. 5
+      // + 4.
+      ASSERT_EQ(9, periodic_compactions);
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) {
+  // This test makes sure that periodic compactions are working with a DB
+  // where file_creation_time of some files is 0.
+  // After compactions the new files are created with a valid file_creation_time
+
+  const int kNumKeysPerFile = 32;
+  const int kNumFiles = 4;
+  const int kValueSize = 100;
+
+  Options options = CurrentOptions();
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  int periodic_compactions = 0;
+  bool set_file_creation_time_to_zero = true;
+  bool set_creation_time_to_zero = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        auto compaction_reason = compaction->compaction_reason();
+        if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+          periodic_compactions++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+        TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+        if (set_file_creation_time_to_zero) {
+          props->file_creation_time = 0;
+        }
+        if (set_creation_time_to_zero) {
+          props->creation_time = 0;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+    // Move the first two files to L2.
+    if (i == 1) {
+      MoveFilesToLevel(2);
+      set_creation_time_to_zero = false;
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ("2,0,2", FilesPerLevel());
+  ASSERT_EQ(0, periodic_compactions);
+
+  Close();
+
+  set_file_creation_time_to_zero = false;
+  // Forward the clock by 2 days.
+  env_->MockSleepForSeconds(2 * 24 * 60 * 60);
+  options.periodic_compaction_seconds = 1 * 24 * 60 * 60;  // 1 day
+
+  Reopen(options);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("2,0,2", FilesPerLevel());
+  // Make sure that all files go through periodic compaction.
+  ASSERT_EQ(kNumFiles, periodic_compactions);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  Options options = CurrentOptions();
+  options.ttl = 10 * 60 * 60;                          // 10 hours
+  options.periodic_compaction_seconds = 48 * 60 * 60;  // 2 days
+  options.max_open_files = -1;  // needed for both periodic and ttl compactions
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  int periodic_compactions = 0;
+  int ttl_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        auto compaction_reason = compaction->compaction_reason();
+        if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+          periodic_compactions++;
+        } else if (compaction_reason == CompactionReason::kTtl) {
+          ttl_compactions++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  MoveFilesToLevel(3);
+
+  ASSERT_EQ("0,0,0,2", FilesPerLevel());
+  ASSERT_EQ(0, periodic_compactions);
+  ASSERT_EQ(0, ttl_compactions);
+
+  // Add some time greater than periodic_compaction_time.
+  env_->MockSleepForSeconds(50 * 60 * 60);
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Files in the bottom level go through periodic compactions.
+  ASSERT_EQ("1,0,0,2", FilesPerLevel());
+  ASSERT_EQ(2, periodic_compactions);
+  ASSERT_EQ(0, ttl_compactions);
+
+  // Add a little more time than ttl
+  env_->MockSleepForSeconds(11 * 60 * 60);
+  ASSERT_OK(Put("b", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Notice that the previous file in level 1 falls down to the bottom level
+  // due to ttl compactions, one level at a time.
+  // And bottom level files don't get picked up for ttl compactions.
+  ASSERT_EQ("1,0,0,3", FilesPerLevel());
+  ASSERT_EQ(2, periodic_compactions);
+  ASSERT_EQ(3, ttl_compactions);
+
+  // Add some time greater than periodic_compaction_time.
+  env_->MockSleepForSeconds(50 * 60 * 60);
+  ASSERT_OK(Put("c", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Previous L0 file falls one level at a time to bottom level due to ttl.
+  // And all 4 bottom files go through periodic compactions.
+  ASSERT_EQ("1,0,0,4", FilesPerLevel());
+  ASSERT_EQ(6, periodic_compactions);
+  ASSERT_EQ(6, ttl_compactions);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlBooster) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 3;
+  const int kValueSize = 1000;
+
+  Options options = CurrentOptions();
+  options.ttl = 10 * 60 * 60;                           // 10 hours
+  options.periodic_compaction_seconds = 480 * 60 * 60;  // very long
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 5 * uint64_t{kNumKeysPerFile * kValueSize};
+  options.max_open_files = -1;  // needed for both periodic and ttl compactions
+  options.compaction_pri = CompactionPri::kMinOverlappingRatio;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  MoveFilesToLevel(2);
+
+  ASSERT_EQ("0,0,3", FilesPerLevel());
+
+  // Create some files for L1
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(2 * j + i), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  ASSERT_EQ("0,1,3", FilesPerLevel());
+
+  // Make the new L0 files qualify TTL boosting and generate one more to trigger
+  // L1 -> L2 compaction. Old files will be picked even if their priority is
+  // lower without boosting.
+  env_->MockSleepForSeconds(8 * 60 * 60);
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(kNumKeysPerFile * 2 + 2 * j + i),
+                    rnd.RandomString(kValueSize * 2)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  // Force files to be compacted to L1
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "1"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,1,2", FilesPerLevel());
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
+
+  ASSERT_GT(SizeAtLevel(1), kNumKeysPerFile * 4 * kValueSize);
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) {
+  class TestCompactionFilter : public CompactionFilter {
+    const char* Name() const override { return "TestCompactionFilter"; }
+  };
+  class TestCompactionFilterFactory : public CompactionFilterFactory {
+    const char* Name() const override { return "TestCompactionFilterFactory"; }
+    std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context& /*context*/) override {
+      return std::unique_ptr<CompactionFilter>(new TestCompactionFilter());
+    }
+  };
+
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  Random rnd(301);
+
+  Options options = CurrentOptions();
+  TestCompactionFilter test_compaction_filter;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  enum CompactionFilterType {
+    kUseCompactionFilter,
+    kUseCompactionFilterFactory
+  };
+
+  for (CompactionFilterType comp_filter_type :
+       {kUseCompactionFilter, kUseCompactionFilterFactory}) {
+    // Assert that periodic compactions are not enabled.
+    ASSERT_EQ(std::numeric_limits<uint64_t>::max() - 1,
+              options.periodic_compaction_seconds);
+
+    if (comp_filter_type == kUseCompactionFilter) {
+      options.compaction_filter = &test_compaction_filter;
+      options.compaction_filter_factory.reset();
+    } else if (comp_filter_type == kUseCompactionFilterFactory) {
+      options.compaction_filter = nullptr;
+      options.compaction_filter_factory.reset(
+          new TestCompactionFilterFactory());
+    }
+    DestroyAndReopen(options);
+
+    // periodic_compaction_seconds should be set to the sanitized value when
+    // a compaction filter or a compaction filter factory is used.
+    ASSERT_EQ(30 * 24 * 60 * 60,
+              dbfull()->GetOptions().periodic_compaction_seconds);
+
+    int periodic_compactions = 0;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+          Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+          auto compaction_reason = compaction->compaction_reason();
+          if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+            periodic_compactions++;
+          }
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    for (int i = 0; i < kNumLevelFiles; ++i) {
+      for (int j = 0; j < kNumKeysPerFile; ++j) {
+        ASSERT_OK(
+            Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+      }
+      ASSERT_OK(Flush());
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ("2", FilesPerLevel());
+    ASSERT_EQ(0, periodic_compactions);
+
+    // Add 31 days and do a write
+    env_->MockSleepForSeconds(31 * 24 * 60 * 60);
+    ASSERT_OK(Put("a", "1"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Assert that the files stay in the same level
+    ASSERT_EQ("3", FilesPerLevel());
+    // The two old files go through the periodic compaction process
+    ASSERT_EQ(2, periodic_compactions);
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) {
+  // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+  // compaction only triggers flush after it's sure stall won't be triggered for
+  // L0 file count going too high.
+  const int kNumL0FilesTrigger = 4;
+  const int kNumL0FilesLimit = 8;
+  // i == 0: verifies normal case where stall is avoided by delay
+  // i == 1: verifies no delay in edge case where stall trigger is same as
+  //         compaction trigger, so stall can't be avoided
+  for (int i = 0; i < 2; ++i) {
+    Options options = CurrentOptions();
+    options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+    if (i == 0) {
+      options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+    } else {
+      options.level0_file_num_compaction_trigger = kNumL0FilesLimit;
+    }
+    Reopen(options);
+
+    if (i == 0) {
+      // ensure the auto compaction doesn't finish until manual compaction has
+      // had a chance to be delayed.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+            "CompactionJob::Run():End"}});
+    } else {
+      // ensure the auto-compaction doesn't finish until manual compaction has
+      // continued without delay.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::FlushMemTable:StallWaitDone",
+            "CompactionJob::Run():End"}});
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    Random rnd(301);
+    for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+      for (int k = 0; k < 2; ++k) {
+        ASSERT_OK(Put(Key(k), rnd.RandomString(1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    auto manual_compaction_thread = port::Thread([this]() {
+      CompactRangeOptions cro;
+      cro.allow_write_stall = false;
+      ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    });
+
+    manual_compaction_thread.join();
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_GT(NumTableFilesAtLevel(1), 0);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) {
+  // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+  // compaction only triggers flush after it's sure stall won't be triggered for
+  // immutable memtable count going too high.
+  const int kNumImmMemTableLimit = 8;
+  // i == 0: verifies normal case where stall is avoided by delay
+  // i == 1: verifies no delay in edge case where stall trigger is same as flush
+  //         trigger, so stall can't be avoided
+  for (int i = 0; i < 2; ++i) {
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    // the delay limit is one less than the stop limit. This test focuses on
+    // avoiding delay limit, but this option sets stop limit, so add one.
+    options.max_write_buffer_number = kNumImmMemTableLimit + 1;
+    if (i == 1) {
+      options.min_write_buffer_number_to_merge = kNumImmMemTableLimit;
+    }
+    Reopen(options);
+
+    if (i == 0) {
+      // ensure the flush doesn't finish until manual compaction has had a
+      // chance to be delayed.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+            "FlushJob::WriteLevel0Table"}});
+    } else {
+      // ensure the flush doesn't finish until manual compaction has continued
+      // without delay.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::FlushMemTable:StallWaitDone",
+            "FlushJob::WriteLevel0Table"}});
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    Random rnd(301);
+    for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) {
+      ASSERT_OK(Put(Key(0), rnd.RandomString(1024)));
+      FlushOptions flush_opts;
+      flush_opts.wait = false;
+      flush_opts.allow_write_stall = true;
+      ASSERT_OK(dbfull()->Flush(flush_opts));
+    }
+
+    auto manual_compaction_thread = port::Thread([this]() {
+      CompactRangeOptions cro;
+      cro.allow_write_stall = false;
+      ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    });
+
+    manual_compaction_thread.join();
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_GT(NumTableFilesAtLevel(1), 0);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) {
+  // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay
+  // does not hang if CF is dropped or DB is closed
+  const int kNumL0FilesTrigger = 4;
+  const int kNumL0FilesLimit = 8;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+  options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+  // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it
+  // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to
+  //         simulate what happens during Close as we can't call Close (it
+  //         blocks on the auto-compaction, making a cycle).
+  for (int i = 0; i < 2; ++i) {
+    CreateAndReopenWithCF({"one"}, options);
+    // The calls to close CF/DB wait until the manual compaction stalls.
+    // The auto-compaction waits until the manual compaction finishes to ensure
+    // the signal comes from closing CF/DB, not from compaction making progress.
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+        {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+          "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"},
+         {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual",
+          "CompactionJob::Run():End"}});
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    Random rnd(301);
+    for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+      for (int k = 0; k < 2; ++k) {
+        ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024)));
+      }
+      ASSERT_OK(Flush(1));
+    }
+    auto manual_compaction_thread = port::Thread([this, i]() {
+      CompactRangeOptions cro;
+      cro.allow_write_stall = false;
+      if (i == 0) {
+        ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+                        .IsColumnFamilyDropped());
+      } else {
+        ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+                        .IsShutdownInProgress());
+      }
+    });
+
+    TEST_SYNC_POINT(
+        "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown");
+    if (i == 0) {
+      ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+    } else {
+      dbfull()->CancelAllBackgroundWork(false /* wait */);
+    }
+    manual_compaction_thread.join();
+    TEST_SYNC_POINT(
+        "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual");
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) {
+  // Verify that, when `CompactRangeOptions::allow_write_stall == false`,
+  // CompactRange skips its flush if the delay is long enough that the memtables
+  // existing at the beginning of the call have already been flushed.
+  const int kNumL0FilesTrigger = 4;
+  const int kNumL0FilesLimit = 8;
+  Options options = CurrentOptions();
+  options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+  options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+  Reopen(options);
+
+  Random rnd(301);
+  // The manual flush includes the memtable that was active when CompactRange
+  // began. So it unblocks CompactRange and precludes its flush. Throughout the
+  // test, stall conditions are upheld via high L0 file count.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+        "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"},
+       {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush",
+        "DBImpl::FlushMemTable:StallWaitDone"},
+       {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // used for the delayable flushes
+  FlushOptions flush_opts;
+  flush_opts.allow_write_stall = true;
+  for (int i = 0; i < kNumL0FilesLimit - 1; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(dbfull()->Flush(flush_opts));
+  }
+  auto manual_compaction_thread = port::Thread([this]() {
+    CompactRangeOptions cro;
+    cro.allow_write_stall = false;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush");
+  ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024)));
+  ASSERT_OK(dbfull()->Flush(flush_opts));
+  ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024)));
+  TEST_SYNC_POINT(
+      "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush");
+  manual_compaction_thread.join();
+
+  // If CompactRange's flush was skipped, the final Put above will still be
+  // in the active memtable.
+  std::string num_keys_in_memtable;
+  ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable,
+                               &num_keys_in_memtable));
+  ASSERT_EQ(std::to_string(1), num_keys_in_memtable);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) {
+  // Verify memtable only gets flushed if it contains data overlapping the range
+  // provided to `CompactRange`. Tests all kinds of overlap/non-overlap.
+  const int kNumEndpointKeys = 5;
+  std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"};
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  // One extra iteration for nullptr, which means left side of interval is
+  // unbounded.
+  for (int i = 0; i <= kNumEndpointKeys; ++i) {
+    Slice begin;
+    Slice* begin_ptr;
+    if (i == 0) {
+      begin_ptr = nullptr;
+    } else {
+      begin = keys[i - 1];
+      begin_ptr = &begin;
+    }
+    // Start at `i` so right endpoint comes after left endpoint. One extra
+    // iteration for nullptr, which means right side of interval is unbounded.
+    for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) {
+      Slice end;
+      Slice* end_ptr;
+      if (j == kNumEndpointKeys) {
+        end_ptr = nullptr;
+      } else {
+        end = keys[j];
+        end_ptr = &end;
+      }
+      ASSERT_OK(Put("b", "val"));
+      ASSERT_OK(Put("d", "val"));
+      CompactRangeOptions compact_range_opts;
+      ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr));
+
+      uint64_t get_prop_tmp, num_memtable_entries = 0;
+      ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables,
+                                      &get_prop_tmp));
+      num_memtable_entries += get_prop_tmp;
+      ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                      &get_prop_tmp));
+      num_memtable_entries += get_prop_tmp;
+      if (begin_ptr == nullptr || end_ptr == nullptr ||
+          (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) {
+        // In this case `CompactRange`'s range overlapped in some way with the
+        // memtable's range, so flush should've happened. Then "b" and "d" won't
+        // be in the memtable.
+        ASSERT_EQ(0, num_memtable_entries);
+      } else {
+        ASSERT_EQ(2, num_memtable_entries);
+        // flush anyways to prepare for next iteration
+        ASSERT_OK(db_->Flush(FlushOptions()));
+      }
+    }
+  }
+}
+
+TEST_F(DBCompactionTest, CompactionStatsTest) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  CompactionStatsCollector* collector = new CompactionStatsCollector();
+  options.listeners.emplace_back(collector);
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+
+  VerifyCompactionStats(*cfd, *collector);
+}
+
+TEST_F(DBCompactionTest, SubcompactionEvent) {
+  class SubCompactionEventListener : public EventListener {
+   public:
+    void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+      InstrumentedMutexLock l(&mutex_);
+      ASSERT_EQ(running_compactions_.find(ci.job_id),
+                running_compactions_.end());
+      running_compactions_.emplace(ci.job_id, std::unordered_set<int>());
+    }
+
+    void OnCompactionCompleted(DB* /*db*/,
+                               const CompactionJobInfo& ci) override {
+      InstrumentedMutexLock l(&mutex_);
+      auto it = running_compactions_.find(ci.job_id);
+      ASSERT_NE(it, running_compactions_.end());
+      ASSERT_EQ(it->second.size(), 0);
+      running_compactions_.erase(it);
+    }
+
+    void OnSubcompactionBegin(const SubcompactionJobInfo& si) override {
+      InstrumentedMutexLock l(&mutex_);
+      auto it = running_compactions_.find(si.job_id);
+      ASSERT_NE(it, running_compactions_.end());
+      auto r = it->second.insert(si.subcompaction_job_id);
+      ASSERT_TRUE(r.second);  // each subcompaction_job_id should be different
+      total_subcompaction_cnt_++;
+    }
+
+    void OnSubcompactionCompleted(const SubcompactionJobInfo& si) override {
+      InstrumentedMutexLock l(&mutex_);
+      auto it = running_compactions_.find(si.job_id);
+      ASSERT_NE(it, running_compactions_.end());
+      auto r = it->second.erase(si.subcompaction_job_id);
+      ASSERT_EQ(r, 1);
+    }
+
+    size_t GetRunningCompactionCount() {
+      InstrumentedMutexLock l(&mutex_);
+      return running_compactions_.size();
+    }
+
+    size_t GetTotalSubcompactionCount() {
+      InstrumentedMutexLock l(&mutex_);
+      return total_subcompaction_cnt_;
+    }
+
+   private:
+    InstrumentedMutex mutex_;
+    std::unordered_map<int, std::unordered_set<int>> running_compactions_;
+    size_t total_subcompaction_cnt_ = 0;
+  };
+
+  Options options = CurrentOptions();
+  options.target_file_size_base = 1024;
+  options.level0_file_num_compaction_trigger = 10;
+  auto* listener = new SubCompactionEventListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  // generate 4 files @ L2
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 10 + j;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(2);
+
+  // generate 2 files @ L1 which overlaps with L2 files
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 10; j++) {
+      int key_id = i * 20 + j * 2;
+      ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+    }
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(1);
+  ASSERT_EQ(FilesPerLevel(), "0,2,4");
+
+  CompactRangeOptions comp_opts;
+  comp_opts.max_subcompactions = 4;
+  Status s = dbfull()->CompactRange(comp_opts, nullptr, nullptr);
+  ASSERT_OK(s);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // make sure there's no running compaction
+  ASSERT_EQ(listener->GetRunningCompactionCount(), 0);
+  // and sub compaction is triggered
+  ASSERT_GT(listener->GetTotalSubcompactionCount(), 0);
+}
+
+TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
+  // LSM setup:
+  // L1:      [ba bz]
+  // L2: [a b]       [c d]
+  // L3: [a b]       [c d]
+  //
+  // Thread 1:                        Thread 2:
+  // Begin compacting all L2->L3
+  //                                  Compact [ba bz] L1->L3
+  // End compacting all L2->L3
+  //
+  // The compaction operation in thread 2 should be disallowed because the range
+  // overlaps with the compaction in thread 1, which also covers that range in
+  // L3.
+  Options options = CurrentOptions();
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+  Reopen(options);
+
+  for (int level = 3; level >= 2; --level) {
+    ASSERT_OK(Put("a", "val"));
+    ASSERT_OK(Put("b", "val"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put("c", "val"));
+    ASSERT_OK(Put("d", "val"));
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(level);
+  }
+  ASSERT_OK(Put("ba", "val"));
+  ASSERT_OK(Put("bz", "val"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:0",
+       "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"},
+      {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End",
+       "CompactFilesImpl:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto bg_thread = port::Thread([&]() {
+    // Thread 1
+    std::vector<std::string> filenames = collector->GetFlushedFiles();
+    filenames.pop_back();
+    ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames,
+                                3 /* output_level */));
+  });
+
+  // Thread 2
+  TEST_SYNC_POINT(
+      "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin");
+  std::string filename = collector->GetFlushedFiles().back();
+  ASSERT_FALSE(
+      db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */)
+          .ok());
+  TEST_SYNC_POINT(
+      "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End");
+
+  bg_thread.join();
+}
+
+TEST_F(DBCompactionTest, CompactionHasEmptyOutput) {
+  Options options = CurrentOptions();
+  SstStatsCollector* collector = new SstStatsCollector();
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(collector);
+  Reopen(options);
+
+  // Make sure the L0 files overlap to prevent trivial move.
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_OK(Put("b", "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Delete("a"));
+  ASSERT_OK(Delete("b"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  // Expect one file creation to start for each flush, and zero for compaction
+  // since no keys are written.
+  ASSERT_EQ(2, collector->num_ssts_creation_started());
+}
+
+TEST_F(DBCompactionTest, CompactionLimiter) {
+  const int kNumKeysPerFile = 10;
+  const int kMaxBackgroundThreads = 64;
+
+  struct CompactionLimiter {
+    std::string name;
+    int limit_tasks;
+    int max_tasks;
+    int tasks;
+    std::shared_ptr<ConcurrentTaskLimiter> limiter;
+  };
+
+  std::vector<CompactionLimiter> limiter_settings;
+  limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr});
+  limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr});
+  limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr});
+
+  for (auto& ls : limiter_settings) {
+    ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks));
+  }
+
+  std::shared_ptr<ConcurrentTaskLimiter> unique_limiter(
+      NewConcurrentTaskLimiter("unique_limiter", -1));
+
+  const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", "6", "7",
+                            "8",       "9", "a", "b", "c", "d", "e", "f"};
+  const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0];
+
+  std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter;
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 * 1024;  // 110KB
+  options.arena_block_size = 4096;
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 64;
+  options.level0_stop_writes_trigger = 64;
+  options.max_background_jobs = kMaxBackgroundThreads;  // Enough threads
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  options.max_write_buffer_number = 10;  // Enough memtables
+  DestroyAndReopen(options);
+
+  std::vector<Options> option_vector;
+  option_vector.reserve(cf_count);
+
+  for (unsigned int cf = 0; cf < cf_count; cf++) {
+    ColumnFamilyOptions cf_opt(options);
+    if (cf == 0) {
+      // "Default" CF does't use compaction limiter
+      cf_opt.compaction_thread_limiter = nullptr;
+    } else if (cf == 1) {
+      // "1" CF uses bypass compaction limiter
+      unique_limiter->SetMaxOutstandingTask(-1);
+      cf_opt.compaction_thread_limiter = unique_limiter;
+    } else {
+      // Assign limiter by mod
+      auto& ls = limiter_settings[cf % 3];
+      cf_opt.compaction_thread_limiter = ls.limiter;
+      cf_to_limiter[cf_names[cf]] = &ls;
+    }
+    option_vector.emplace_back(DBOptions(options), cf_opt);
+  }
+
+  for (unsigned int cf = 1; cf < cf_count; cf++) {
+    CreateColumnFamilies({cf_names[cf]}, option_vector[cf]);
+  }
+
+  ReopenWithColumnFamilies(
+      std::vector<std::string>(cf_names, cf_names + cf_count), option_vector);
+
+  port::Mutex mutex;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) {
+        const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+        auto iter = cf_to_limiter.find(cf_name);
+        if (iter != cf_to_limiter.end()) {
+          MutexLock l(&mutex);
+          ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks);
+          iter->second->max_tasks =
+              std::max(iter->second->max_tasks, iter->second->limit_tasks);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) {
+        const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+        auto iter = cf_to_limiter.find(cf_name);
+        if (iter != cf_to_limiter.end()) {
+          MutexLock l(&mutex);
+          ASSERT_GE(--iter->second->tasks, 0);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Block all compact threads in thread pool.
+  const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4;
+  const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks;
+  env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH);
+  env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW);
+
+  test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks];
+
+  // Block all compaction threads in thread pool.
+  for (size_t i = 0; i < kTotalCompactTasks; i++) {
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_compact_tasks[i], Env::LOW);
+    sleeping_compact_tasks[i].WaitUntilSleeping();
+  }
+
+  int keyIndex = 0;
+
+  for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) {
+    for (unsigned int cf = 0; cf < cf_count; cf++) {
+      for (int i = 0; i < kNumKeysPerFile; i++) {
+        ASSERT_OK(Put(cf, Key(keyIndex++), ""));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put(cf, "", ""));
+    }
+
+    for (unsigned int cf = 0; cf < cf_count; cf++) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+    }
+  }
+
+  // Enough L0 files to trigger compaction
+  for (unsigned int cf = 0; cf < cf_count; cf++) {
+    ASSERT_EQ(NumTableFilesAtLevel(0, cf),
+              options.level0_file_num_compaction_trigger);
+  }
+
+  // Create more files for one column family, which triggers speed up
+  // condition, all compactions will be scheduled.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    for (int i = 0; i < kNumKeysPerFile; i++) {
+      ASSERT_OK(Put(0, Key(i), ""));
+    }
+    // put extra key to trigger flush
+    ASSERT_OK(Put(0, "", ""));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+    ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+              NumTableFilesAtLevel(0, 0));
+  }
+
+  // All CFs are pending compaction
+  ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW));
+
+  // Unblock all compaction threads
+  for (size_t i = 0; i < kTotalCompactTasks; i++) {
+    sleeping_compact_tasks[i].WakeUp();
+    sleeping_compact_tasks[i].WaitUntilDone();
+  }
+
+  for (unsigned int cf = 0; cf < cf_count; cf++) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Max outstanding compact tasks reached limit
+  for (auto& ls : limiter_settings) {
+    ASSERT_EQ(ls.limit_tasks, ls.max_tasks);
+    ASSERT_EQ(0, ls.limiter->GetOutstandingTask());
+  }
+
+  // test manual compaction under a fully throttled limiter
+  int cf_test = 1;
+  unique_limiter->SetMaxOutstandingTask(0);
+
+  // flush one more file to cf 1
+  for (int i = 0; i < kNumKeysPerFile; i++) {
+    ASSERT_OK(Put(cf_test, Key(keyIndex++), ""));
+  }
+  // put extra key to trigger flush
+  ASSERT_OK(Put(cf_test, "", ""));
+
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test));
+
+  Compact(cf_test, Key(0), Key(keyIndex));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
+                        ::testing::Values(std::make_tuple(1, true),
+                                          std::make_tuple(1, false),
+                                          std::make_tuple(4, true),
+                                          std::make_tuple(4, false)));
+
+TEST_P(DBCompactionDirectIOTest, DirectIO) {
+  Options options = CurrentOptions();
+  Destroy(options);
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.use_direct_io_for_flush_and_compaction = GetParam();
+  options.env = MockEnv::Create(Env::Default());
+  Reopen(options);
+  bool readahead = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::OpenCompactionOutputFile", [&](void* arg) {
+        bool* use_direct_writes = static_cast<bool*>(arg);
+        ASSERT_EQ(*use_direct_writes,
+                  options.use_direct_io_for_flush_and_compaction);
+      });
+  if (options.use_direct_io_for_flush_and_compaction) {
+    SyncPoint::GetInstance()->SetCallBack(
+        "SanitizeOptions:direct_io", [&](void* /*arg*/) { readahead = true; });
+  }
+  SyncPoint::GetInstance()->EnableProcessing();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  MakeTables(3, "p", "q", 1);
+  ASSERT_EQ("1,1,1", FilesPerLevel(1));
+  Compact(1, "p", "q");
+  ASSERT_EQ(readahead, options.use_direct_reads);
+  ASSERT_EQ("0,0,1", FilesPerLevel(1));
+  Destroy(options);
+  delete options.env;
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
+                        testing::Bool());
+
+class CompactionPriTest : public DBTestBase,
+                          public testing::WithParamInterface<uint32_t> {
+ public:
+  CompactionPriTest()
+      : DBTestBase("compaction_pri_test", /*env_do_fsync=*/true) {
+    compaction_pri_ = GetParam();
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  uint32_t compaction_pri_;
+};
+
+TEST_P(CompactionPriTest, Test) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 16 * 1024;
+  options.compaction_pri = static_cast<CompactionPri>(compaction_pri_);
+  options.hard_pending_compaction_bytes_limit = 256 * 1024;
+  options.max_bytes_for_level_base = 64 * 1024;
+  options.max_bytes_for_level_multiplier = 4;
+  options.compression = kNoCompression;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  const int kNKeys = 5000;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+
+  for (int i = 0; i < kNKeys; i++) {
+    ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102)));
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  for (int i = 0; i < kNKeys; i++) {
+    ASSERT_NE("NOT_FOUND", Get(Key(i)));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    CompactionPriTest, CompactionPriTest,
+    ::testing::Values(CompactionPri::kByCompensatedSize,
+                      CompactionPri::kOldestLargestSeqFirst,
+                      CompactionPri::kOldestSmallestSeqFirst,
+                      CompactionPri::kMinOverlappingRatio,
+                      CompactionPri::kRoundRobin));
+
+TEST_F(DBCompactionTest, PersistRoundRobinCompactCursor) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 16 * 1024;
+  options.max_bytes_for_level_base = 128 * 1024;
+  options.target_file_size_base = 64 * 1024;
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_pri = CompactionPri::kRoundRobin;
+  options.max_bytes_for_level_multiplier = 4;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  // 30 Files in L0 to trigger compactions between L1 and L2
+  for (int i = 0; i < 30; i++) {
+    for (int j = 0; j < 16; j++) {
+      ASSERT_OK(Put(rnd.RandomString(24), rnd.RandomString(1000)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+
+  const std::vector<InternalKey> compact_cursors =
+      storage_info->GetCompactCursors();
+
+  Reopen(options);
+
+  VersionSet* const reopened_versions = dbfull()->GetVersionSet();
+  assert(reopened_versions);
+
+  ColumnFamilyData* const reopened_cfd =
+      reopened_versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(reopened_cfd, nullptr);
+
+  Version* const reopened_current = reopened_cfd->current();
+  ASSERT_NE(reopened_current, nullptr);
+
+  const VersionStorageInfo* const reopened_storage_info =
+      reopened_current->storage_info();
+  ASSERT_NE(reopened_storage_info, nullptr);
+
+  const std::vector<InternalKey> reopened_compact_cursors =
+      reopened_storage_info->GetCompactCursors();
+  const auto icmp = reopened_storage_info->InternalComparator();
+  ASSERT_EQ(compact_cursors.size(), reopened_compact_cursors.size());
+  for (size_t i = 0; i < compact_cursors.size(); i++) {
+    if (compact_cursors[i].Valid()) {
+      ASSERT_EQ(0,
+                icmp->Compare(compact_cursors[i], reopened_compact_cursors[i]));
+    } else {
+      ASSERT_TRUE(!reopened_compact_cursors[i].Valid());
+    }
+  }
+}
+
+TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) {
+  const int kKeysPerBuffer = 100;
+  Options options = CurrentOptions();
+  options.num_levels = 4;
+  options.max_bytes_for_level_multiplier = 2;
+  options.level0_file_num_compaction_trigger = 4;
+  options.target_file_size_base = kKeysPerBuffer * 1024;
+  options.compaction_pri = CompactionPri::kRoundRobin;
+  options.max_bytes_for_level_base = 8 * kKeysPerBuffer * 1024;
+  options.disable_auto_compactions = true;
+  // Setup 7 threads but limited subcompactions so that
+  // RoundRobin requires extra compactions from reserved threads
+  options.max_subcompactions = 1;
+  options.max_background_compactions = 7;
+  options.max_compaction_bytes = 100000000;
+  DestroyAndReopen(options);
+  env_->SetBackgroundThreads(7, Env::LOW);
+
+  Random rnd(301);
+  const std::vector<int> files_per_level = {0, 15, 25};
+  for (int lvl = 2; lvl > 0; lvl--) {
+    for (int i = 0; i < files_per_level[lvl]; i++) {
+      for (int j = 0; j < kKeysPerBuffer; j++) {
+        // Add (lvl-1) to ensure nearly equivallent number of files
+        // in L2 are overlapped with fils selected to compact from
+        // L1
+        ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+                      rnd.RandomString(1010)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(lvl);
+    ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+  }
+  // 15 files in L1; 25 files in L2
+
+  // This is a variable for making sure the following callback is called
+  // and the assertions in it are indeed excuted.
+  bool num_planned_subcompactions_verified = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+        uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+        if (grab_pressure_token_) {
+          // 7 files are selected for round-robin under auto
+          // compaction. The number of planned subcompaction is restricted by
+          // the limited number of max_background_compactions
+          ASSERT_EQ(num_planned_subcompactions, 7);
+        } else {
+          ASSERT_EQ(num_planned_subcompactions, 1);
+        }
+        num_planned_subcompactions_verified = true;
+      });
+
+  // The following 3 dependencies have to be added to ensure the auto
+  // compaction and the pressure token is correctly enabled. Same for
+  // RoundRobinSubcompactionsUsingResources and
+  // DBCompactionTest.RoundRobinSubcompactionsShrinkResources
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"RoundRobinSubcompactionsAgainstPressureToken:0",
+        "BackgroundCallCompaction:0"},
+       {"CompactionJob::AcquireSubcompactionResources:0",
+        "RoundRobinSubcompactionsAgainstPressureToken:1"},
+       {"RoundRobinSubcompactionsAgainstPressureToken:2",
+        "CompactionJob::AcquireSubcompactionResources:1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:0");
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:1");
+  std::unique_ptr<WriteControllerToken> pressure_token;
+  if (grab_pressure_token_) {
+    pressure_token =
+        dbfull()->TEST_write_controler().GetCompactionPressureToken();
+  }
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:2");
+
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_TRUE(num_planned_subcompactions_verified);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken,
+                        RoundRobinSubcompactionsAgainstPressureToken,
+                        testing::Bool());
+
+TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
+  const int kKeysPerBuffer = 200;
+  Options options = CurrentOptions();
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.target_file_size_base = kKeysPerBuffer * 1024;
+  options.compaction_pri = CompactionPri::kRoundRobin;
+  options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
+  options.disable_auto_compactions = true;
+  options.max_subcompactions = 1;
+  options.max_background_compactions = max_compaction_limits_;
+  // Set a large number for max_compaction_bytes so that one round-robin
+  // compaction is enough to make post-compaction L1 size less than
+  // the maximum size (this test assumes only one round-robin compaction
+  // is triggered by kLevelMaxLevelSize)
+  options.max_compaction_bytes = 100000000;
+
+  DestroyAndReopen(options);
+  env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW);
+
+  Random rnd(301);
+  const std::vector<int> files_per_level = {0, 40, 100};
+  for (int lvl = 2; lvl > 0; lvl--) {
+    for (int i = 0; i < files_per_level[lvl]; i++) {
+      for (int j = 0; j < kKeysPerBuffer; j++) {
+        // Add (lvl-1) to ensure nearly equivallent number of files
+        // in L2 are overlapped with fils selected to compact from
+        // L1
+        ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+                      rnd.RandomString(1010)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(lvl);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+  }
+
+  // 40 files in L1; 100 files in L2
+  // This is a variable for making sure the following callback is called
+  // and the assertions in it are indeed excuted.
+  bool num_planned_subcompactions_verified = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+        uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+        // More than 10 files are selected for round-robin under auto
+        // compaction. The number of planned subcompaction is restricted by
+        // the minimum number between available threads and compaction limits
+        ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions,
+                  std::min(total_low_pri_threads_, max_compaction_limits_) - 1);
+        num_planned_subcompactions_verified = true;
+      });
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"RoundRobinSubcompactionsAgainstResources:0",
+        "BackgroundCallCompaction:0"},
+       {"CompactionJob::AcquireSubcompactionResources:0",
+        "RoundRobinSubcompactionsAgainstResources:1"},
+       {"RoundRobinSubcompactionsAgainstResources:2",
+        "CompactionJob::AcquireSubcompactionResources:1"},
+       {"CompactionJob::ReleaseSubcompactionResources:0",
+        "RoundRobinSubcompactionsAgainstResources:3"},
+       {"RoundRobinSubcompactionsAgainstResources:4",
+        "CompactionJob::ReleaseSubcompactionResources:1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0");
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1");
+  auto pressure_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2");
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3");
+  // We can reserve more threads now except one is being used
+  ASSERT_EQ(total_low_pri_threads_ - 1,
+            env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW));
+  ASSERT_EQ(
+      total_low_pri_threads_ - 1,
+      env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW));
+  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4");
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_TRUE(num_planned_subcompactions_verified);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstResources,
+                        RoundRobinSubcompactionsAgainstResources,
+                        ::testing::Values(std::make_tuple(1, 5),
+                                          std::make_tuple(5, 1),
+                                          std::make_tuple(10, 5),
+                                          std::make_tuple(5, 10),
+                                          std::make_tuple(10, 10)));
+
+TEST_P(DBCompactionTestWithParam, RoundRobinWithoutAdditionalResources) {
+  const int kKeysPerBuffer = 200;
+  Options options = CurrentOptions();
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.target_file_size_base = kKeysPerBuffer * 1024;
+  options.compaction_pri = CompactionPri::kRoundRobin;
+  options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
+  options.disable_auto_compactions = true;
+  options.max_subcompactions = max_subcompactions_;
+  options.max_background_compactions = 1;
+  options.max_compaction_bytes = 100000000;
+  // Similar experiment setting as above except the max_subcompactions
+  // is given by max_subcompactions_ (1 or 4), and we fix the
+  // additional resources as (1, 1) and thus no more extra resources
+  // can be used
+  DestroyAndReopen(options);
+  env_->SetBackgroundThreads(1, Env::LOW);
+
+  Random rnd(301);
+  const std::vector<int> files_per_level = {0, 33, 100};
+  for (int lvl = 2; lvl > 0; lvl--) {
+    for (int i = 0; i < files_per_level[lvl]; i++) {
+      for (int j = 0; j < kKeysPerBuffer; j++) {
+        // Add (lvl-1) to ensure nearly equivallent number of files
+        // in L2 are overlapped with fils selected to compact from
+        // L1
+        ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+                      rnd.RandomString(1010)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(lvl);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+  }
+
+  // 33 files in L1; 100 files in L2
+  // This is a variable for making sure the following callback is called
+  // and the assertions in it are indeed excuted.
+  bool num_planned_subcompactions_verified = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+        uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+        // At most 4 files are selected for round-robin under auto
+        // compaction. The number of planned subcompaction is restricted by
+        // the max_subcompactions since no extra resources can be used
+        ASSERT_EQ(num_planned_subcompactions, options.max_subcompactions);
+        num_planned_subcompactions_verified = true;
+      });
+  // No need to setup dependency for pressure token since
+  // AcquireSubcompactionResources may not be called and it anyway cannot
+  // reserve any additional resources
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBCompactionTest::RoundRobinWithoutAdditionalResources:0",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+  TEST_SYNC_POINT("DBCompactionTest::RoundRobinWithoutAdditionalResources:0");
+
+  ASSERT_OK(dbfull()->WaitForCompact());
+  ASSERT_TRUE(num_planned_subcompactions_verified);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, RoundRobinCutOutputAtCompactCursor) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 4 * 1024;
+  options.max_bytes_for_level_base = 64 * 1024;
+  options.max_bytes_for_level_multiplier = 4;
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_pri = CompactionPri::kRoundRobin;
+
+  DestroyAndReopen(options);
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+
+  VersionStorageInfo* storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+
+  const InternalKey split_cursor = InternalKey(Key(600), 100, kTypeValue);
+  storage_info->AddCursorForOneLevel(2, split_cursor);
+
+  Random rnd(301);
+
+  for (int i = 0; i < 50; i++) {
+    for (int j = 0; j < 50; j++) {
+      ASSERT_OK(Put(Key(j * 2 + i * 100), rnd.RandomString(102)));
+    }
+  }
+  // Add more overlapping files (avoid trivial move) to trigger compaction that
+  // output files in L2. Note that trivial move does not trigger compaction and
+  // in that case the cursor is not necessarily the boundary of file.
+  for (int i = 0; i < 50; i++) {
+    for (int j = 0; j < 50; j++) {
+      ASSERT_OK(Put(Key(j * 2 + 1 + i * 100), rnd.RandomString(1014)));
+    }
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  const auto icmp = cfd->current()->storage_info()->InternalComparator();
+  // Files in level 2 should be split by the cursor
+  for (const auto& file : level_to_files[2]) {
+    ASSERT_TRUE(
+        icmp->Compare(file.smallest.Encode(), split_cursor.Encode()) >= 0 ||
+        icmp->Compare(file.largest.Encode(), split_cursor.Encode()) < 0);
+  }
+}
+
+class NoopMergeOperator : public MergeOperator {
+ public:
+  NoopMergeOperator() {}
+
+  bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                   MergeOperationOutput* merge_out) const override {
+    std::string val("bar");
+    merge_out->new_value = val;
+    return true;
+  }
+
+  const char* Name() const override { return "Noop"; }
+};
+
+TEST_F(DBCompactionTest, PartialManualCompaction) {
+  Options opts = CurrentOptions();
+  opts.num_levels = 3;
+  opts.level0_file_num_compaction_trigger = 10;
+  opts.compression = kNoCompression;
+  opts.merge_operator.reset(new NoopMergeOperator());
+  opts.target_file_size_base = 10240;
+  DestroyAndReopen(opts);
+
+  Random rnd(301);
+  for (auto i = 0; i < 8; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(Merge("foo", rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  MoveFilesToLevel(2);
+
+  std::string prop;
+  EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop));
+  uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2;
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}}));
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
+  // Regression test for bug where manual compaction hangs forever when the DB
+  // is in read-only mode. Verify it now at least returns, despite failing.
+  const int kNumL0Files = 4;
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(env_));
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.env = mock_env.get();
+  DestroyAndReopen(opts);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumL0Files; ++i) {
+    // Make sure files are overlapping in key-range to prevent trivial move.
+    ASSERT_OK(Put("key1", rnd.RandomString(1024)));
+    ASSERT_OK(Put("key2", rnd.RandomString(1024)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0));
+
+  // Enter read-only mode by failing a write.
+  mock_env->SetFilesystemActive(false);
+  // Make sure this is outside `CompactRange`'s range so that it doesn't fail
+  // early trying to flush memtable.
+  ASSERT_NOK(Put("key3", rnd.RandomString(1024)));
+
+  // In the bug scenario, the first manual compaction would fail and forget to
+  // unregister itself, causing the second one to hang forever due to conflict
+  // with a non-running compaction.
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = false;
+  Slice begin_key("key1");
+  Slice end_key("key2");
+  ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+  ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+
+  // Close before mock_env destruct.
+  Close();
+}
+
+// ManualCompactionBottomLevelOptimization tests the bottom level manual
+// compaction optimization to skip recompacting files created by Ln-1 to Ln
+// compaction
+TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) {
+  Options opts = CurrentOptions();
+  opts.num_levels = 3;
+  opts.level0_file_num_compaction_trigger = 5;
+  opts.compression = kNoCompression;
+  opts.merge_operator.reset(new NoopMergeOperator());
+  opts.target_file_size_base = 1024;
+  opts.max_bytes_for_level_multiplier = 2;
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  InternalStats* internal_stats_ptr = cfd->internal_stats();
+  ASSERT_NE(internal_stats_ptr, nullptr);
+
+  Random rnd(301);
+  for (auto i = 0; i < 8; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(
+          Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  MoveFilesToLevel(2);
+
+  for (auto i = 0; i < 8; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(
+          Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  const std::vector<InternalStats::CompactionStats>& comp_stats =
+      internal_stats_ptr->TEST_GetCompactionStats();
+  int num = comp_stats[2].num_input_files_in_output_level;
+  ASSERT_EQ(num, 0);
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  const std::vector<InternalStats::CompactionStats>& comp_stats2 =
+      internal_stats_ptr->TEST_GetCompactionStats();
+  num = comp_stats2[2].num_input_files_in_output_level;
+  ASSERT_EQ(num, 0);
+}
+
+TEST_F(DBCompactionTest, ManualCompactionMax) {
+  uint64_t l1_avg_size = 0, l2_avg_size = 0;
+  auto generate_sst_func = [&]() {
+    Random rnd(301);
+    for (auto i = 0; i < 100; i++) {
+      for (auto j = 0; j < 10; j++) {
+        ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(2);
+
+    for (auto i = 0; i < 10; i++) {
+      for (auto j = 0; j < 10; j++) {
+        ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(1);
+
+    std::vector<std::vector<FileMetaData>> level_to_files;
+    dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                    &level_to_files);
+
+    uint64_t total = 0;
+    for (const auto& file : level_to_files[1]) {
+      total += file.compensated_file_size;
+    }
+    l1_avg_size = total / level_to_files[1].size();
+
+    total = 0;
+    for (const auto& file : level_to_files[2]) {
+      total += file.compensated_file_size;
+    }
+    l2_avg_size = total / level_to_files[2].size();
+  };
+
+  std::atomic_int num_compactions(0);
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+
+  // with default setting (1.6G by default), it should cover all files in 1
+  // compaction
+  DestroyAndReopen(opts);
+  generate_sst_func();
+  num_compactions.store(0);
+  CompactRangeOptions cro;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(num_compactions.load() == 1);
+
+  // split the compaction to 5
+  int num_split = 5;
+  DestroyAndReopen(opts);
+  generate_sst_func();
+  uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
+  opts.max_compaction_bytes = total_size / num_split;
+  opts.target_file_size_base = total_size / num_split;
+  Reopen(opts);
+  num_compactions.store(0);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(num_compactions.load() == num_split);
+
+  // very small max_compaction_bytes, it should still move forward
+  opts.max_compaction_bytes = l1_avg_size / 2;
+  opts.target_file_size_base = l1_avg_size / 2;
+  DestroyAndReopen(opts);
+  generate_sst_func();
+  num_compactions.store(0);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(num_compactions.load() > 10);
+
+  // dynamically set the option
+  num_split = 2;
+  opts.max_compaction_bytes = 0;
+  DestroyAndReopen(opts);
+  generate_sst_func();
+  total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
+  Status s = db_->SetOptions(
+      {{"max_compaction_bytes", std::to_string(total_size / num_split)},
+       {"target_file_size_base", std::to_string(total_size / num_split)}});
+  ASSERT_OK(s);
+
+  num_compactions.store(0);
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_TRUE(num_compactions.load() == num_split);
+}
+
+TEST_F(DBCompactionTest, CompactionDuringShutdown) {
+  Options opts = CurrentOptions();
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  InternalStats* internal_stats_ptr = cfd->internal_stats();
+  ASSERT_NE(internal_stats_ptr, nullptr);
+
+  Random rnd(301);
+  for (auto i = 0; i < 2; ++i) {
+    for (auto j = 0; j < 10; ++j) {
+      ASSERT_OK(
+          Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+      [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_TRUE(s.ok() || s.IsShutdownInProgress());
+  ASSERT_OK(dbfull()->error_handler_.GetBGError());
+}
+
+// FixFileIngestionCompactionDeadlock tests and verifies that compaction and
+// file ingestion do not cause deadlock in the event of write stall triggered
+// by number of L0 files reaching level0_stop_writes_trigger.
+TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
+  const int kNumKeysPerFile = 100;
+  // Generate SST files.
+  Options options = CurrentOptions();
+
+  // Generate an external SST file containing a single key, i.e. 99
+  std::string sst_files_dir = dbname_ + "/sst_files/";
+  ASSERT_OK(DestroyDir(env_, sst_files_dir));
+  ASSERT_OK(env_->CreateDir(sst_files_dir));
+  SstFileWriter sst_writer(EnvOptions(), options);
+  const std::string sst_file_path = sst_files_dir + "test.sst";
+  ASSERT_OK(sst_writer.Open(sst_file_path));
+  ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value"));
+  ASSERT_OK(sst_writer.Finish());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+       "BackgroundCallCompaction:0"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.level0_file_num_compaction_trigger =
+      options.level0_stop_writes_trigger;
+  options.max_subcompactions = max_subcompactions_;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Generate level0_stop_writes_trigger L0 files to trigger write stop
+  for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+    for (int j = 0; j != kNumKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(990)));
+    }
+    if (i > 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i);
+    }
+  }
+  // When we reach this point, there will be level0_stop_writes_trigger L0
+  // files and one extra key (99) in memory, which overlaps with the external
+  // SST file. Write stall triggers, and can be cleared only after compaction
+  // reduces the number of L0 files.
+
+  // Compaction will also be triggered since we have reached the threshold for
+  // auto compaction. Note that compaction may begin after the following file
+  // ingestion thread and waits for ingestion to finish.
+
+  // Thread to ingest file with overlapping key range with the current
+  // memtable. Consequently ingestion will trigger a flush. The flush MUST
+  // proceed without waiting for the write stall condition to clear, otherwise
+  // deadlock can happen.
+  port::Thread ingestion_thr([&]() {
+    IngestExternalFileOptions ifo;
+    Status s = db_->IngestExternalFile({sst_file_path}, ifo);
+    ASSERT_OK(s);
+  });
+
+  // More write to trigger write stop
+  ingestion_thr.join();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Close();
+}
+
+TEST_F(DBCompactionTest, ConsistencyFailTest) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistency0", [&](void* arg) {
+        auto p =
+            reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+        // just swap the two FileMetaData so that we hit error
+        // in CheckConsistency funcion
+        FileMetaData* temp = *(p->first);
+        *(p->first) = *(p->second);
+        *(p->second) = temp;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int k = 0; k < 2; ++k) {
+    ASSERT_OK(Put("foo", "bar"));
+    Status s = Flush();
+    if (k < 1) {
+      ASSERT_OK(s);
+    } else {
+      ASSERT_TRUE(s.IsCorruption());
+    }
+  }
+
+  ASSERT_NOK(Put("foo", "bar"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, ConsistencyFailTest2) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  options.target_file_size_base = 1000;
+  options.level0_file_num_compaction_trigger = 2;
+  BlockBasedTableOptions bbto;
+  bbto.block_size = 400;  // small block size
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistency1", [&](void* arg) {
+        auto p =
+            reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+        // just swap the two FileMetaData so that we hit error
+        // in CheckConsistency funcion
+        FileMetaData* temp = *(p->first);
+        *(p->first) = *(p->second);
+        *(p->second) = temp;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  std::string value = rnd.RandomString(1000);
+
+  ASSERT_OK(Put("foo1", value));
+  ASSERT_OK(Put("z", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo2", value));
+  ASSERT_OK(Put("z", ""));
+  Status s = Flush();
+  ASSERT_TRUE(s.ok() || s.IsCorruption());
+
+  // This probably returns non-OK, but we rely on the next Put()
+  // to determine the DB is frozen.
+  ASSERT_NOK(dbfull()->TEST_WaitForCompact());
+  ASSERT_NOK(Put("foo", "bar"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+void IngestOneKeyValue(DBImpl* db, const std::string& key,
+                       const std::string& value, const Options& options) {
+  ExternalSstFileInfo info;
+  std::string f = test::PerThreadDBPath("sst_file" + key);
+  EnvOptions env;
+  ROCKSDB_NAMESPACE::SstFileWriter writer(env, options);
+  auto s = writer.Open(f);
+  ASSERT_OK(s);
+  // ASSERT_OK(writer.Put(Key(), ""));
+  ASSERT_OK(writer.Put(key, value));
+
+  ASSERT_OK(writer.Finish(&info));
+  IngestExternalFileOptions ingest_opt;
+
+  ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
+}
+
+TEST_P(DBCompactionTestWithParam,
+       FlushAfterIntraL0CompactionCheckConsistencyFail) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::atomic<int> pick_intra_l0_count(0);
+  std::string value(rnd.RandomString(kValueSize));
+
+  // The L0->L1 must be picked before we begin ingesting files to trigger
+  // intra-L0 compaction, and must not finish until after an intra-L0
+  // compaction has been picked.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"LevelCompactionPicker::PickCompaction:Return",
+        "DBCompactionTestWithParam::"
+        "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FindIntraL0Compaction",
+      [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // prevents trivial move
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));  // prevents trivial move
+  }
+  ASSERT_OK(Flush());
+  Compact("", Key(99));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  // Flush 5 L0 sst.
+  for (int i = 0; i < 5; ++i) {
+    ASSERT_OK(Put(Key(i + 1), value));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Put one key, to make smallest log sequence number in this memtable is less
+  // than sst which would be ingested in next step.
+  ASSERT_OK(Put(Key(0), "a"));
+
+  ASSERT_EQ(5, NumTableFilesAtLevel(0));
+  TEST_SYNC_POINT(
+      "DBCompactionTestWithParam::"
+      "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready");
+
+  // Ingest 5 L0 sst. And this files would trigger PickIntraL0Compaction.
+  for (int i = 5; i < 10; i++) {
+    ASSERT_EQ(i, NumTableFilesAtLevel(0));
+    IngestOneKeyValue(dbfull(), Key(i), value, options);
+  }
+
+  // Put one key, to make biggest log sequence number in this memtable is bigger
+  // than sst which would be ingested in next step.
+  ASSERT_OK(Put(Key(2), "b"));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_GT(level_to_files[0].size(), 0);
+  ASSERT_GT(pick_intra_l0_count.load(), 0);
+
+  ASSERT_OK(Flush());
+}
+
+TEST_P(DBCompactionTestWithParam,
+       IntraL0CompactionAfterFlushCheckConsistencyFail) {
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 5;
+  options.max_background_compactions = 2;
+  options.max_subcompactions = max_subcompactions_;
+  options.write_buffer_size = 2 << 20;
+  options.max_write_buffer_number = 6;
+  DestroyAndReopen(options);
+
+  const size_t kValueSize = 1 << 20;
+  Random rnd(301);
+  std::string value(rnd.RandomString(kValueSize));
+  std::string value2(rnd.RandomString(kValueSize));
+  std::string bigvalue = value + value;
+
+  // prevents trivial move
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));  // prevents trivial move
+  }
+  ASSERT_OK(Flush());
+  Compact("", Key(99));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  std::atomic<int> pick_intra_l0_count(0);
+  // The L0->L1 must be picked before we begin ingesting files to trigger
+  // intra-L0 compaction, and must not finish until after an intra-L0
+  // compaction has been picked.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"LevelCompactionPicker::PickCompaction:Return",
+        "DBCompactionTestWithParam::"
+        "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"},
+       {"LevelCompactionPicker::PickCompactionBySize:0",
+        "CompactionJob::Run():Start"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FindIntraL0Compaction",
+      [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Make 6 L0 sst.
+  for (int i = 0; i < 6; ++i) {
+    if (i % 2 == 0) {
+      IngestOneKeyValue(dbfull(), Key(i), value, options);
+    } else {
+      ASSERT_OK(Put(Key(i), value));
+      ASSERT_OK(Flush());
+    }
+  }
+
+  ASSERT_EQ(6, NumTableFilesAtLevel(0));
+
+  // Stop run flush job
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  test::SleepingBackgroundTask sleeping_tasks;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks,
+                 Env::Priority::HIGH);
+  sleeping_tasks.WaitUntilSleeping();
+
+  // Put many keys to make memtable request to flush
+  for (int i = 0; i < 6; ++i) {
+    ASSERT_OK(Put(Key(i), bigvalue));
+  }
+
+  ASSERT_EQ(6, NumTableFilesAtLevel(0));
+  TEST_SYNC_POINT(
+      "DBCompactionTestWithParam::"
+      "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready");
+  // ingest file to trigger IntraL0Compaction
+  for (int i = 6; i < 10; ++i) {
+    ASSERT_EQ(i, NumTableFilesAtLevel(0));
+    IngestOneKeyValue(dbfull(), Key(i), value2, options);
+  }
+
+  // Wake up flush job
+  sleeping_tasks.WakeUp();
+  sleeping_tasks.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  uint64_t error_count = 0;
+  db_->GetIntProperty("rocksdb.background-errors", &error_count);
+  ASSERT_EQ(error_count, 0);
+  ASSERT_GT(pick_intra_l0_count.load(), 0);
+  for (int i = 0; i < 6; ++i) {
+    ASSERT_EQ(bigvalue, Get(Key(i)));
+  }
+  for (int i = 6; i < 10; ++i) {
+    ASSERT_EQ(value2, Get(Key(i)));
+  }
+}
+
+TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {
+  constexpr int kSstNum = 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Generate some sst files on level 0 with sequence keys (no overlap)
+  for (int i = 0; i < kSstNum; i++) {
+    for (int j = 1; j < UCHAR_MAX; j++) {
+      auto key = std::string(kSstNum, '\0');
+      key[kSstNum - i] += static_cast<char>(j);
+      ASSERT_OK(Put(key, std::string(i % 1000, 'A')));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  ASSERT_EQ(std::to_string(kSstNum), FilesPerLevel(0));
+
+  auto cro = CompactRangeOptions();
+  cro.bottommost_level_compaction = bottommost_level_compaction_;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce ||
+      bottommost_level_compaction_ ==
+          BottommostLevelCompaction::kForceOptimized) {
+    // Real compaction to compact all sst files from level 0 to 1 file on level
+    // 1
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+  } else {
+    // Just trivial move from level 0 -> 1
+    ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam,
+    ::testing::Values(BottommostLevelCompaction::kSkip,
+                      BottommostLevelCompaction::kIfHaveCompactionFilter,
+                      BottommostLevelCompaction::kForce,
+                      BottommostLevelCompaction::kForceOptimized));
+
+TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = 10;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  DestroyAndReopen(options);
+
+  bool has_compaction = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->max_subcompactions() == 10);
+        has_compaction = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 10);
+  // Trigger compaction
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(has_compaction);
+
+  has_compaction = false;
+  ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
+  ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->max_subcompactions() == 2);
+        has_compaction = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger compaction
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(has_compaction);
+}
+
+TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = 10;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  DestroyAndReopen(options);
+
+  bool has_compaction = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->max_subcompactions() == 10);
+        has_compaction = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger compaction
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(has_compaction);
+  has_compaction = false;
+
+  ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
+  ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(compaction->max_subcompactions() == 2);
+        has_compaction = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger compaction
+  for (int i = 0; i < 32; i++) {
+    for (int j = 0; j < 5000; j++) {
+      ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(has_compaction);
+}
+
+TEST_P(ChangeLevelConflictsWithAuto, TestConflict) {
+  // A `CompactRange()` may race with an automatic compaction, we'll need
+  // to make sure it doesn't corrupte the data.
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  }
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // Run a qury to refitting to level 1 while another thread writing to
+  // the same level.
+  SyncPoint::GetInstance()->LoadDependency({
+      // The first two dependencies ensure the foreground creates an L0 file
+      // between the background compaction's L0->L1 and its L1->L2.
+      {
+          "DBImpl::CompactRange:BeforeRefit:1",
+          "AutoCompactionFinished1",
+      },
+      {
+          "AutoCompactionFinished2",
+          "DBImpl::CompactRange:BeforeRefit:2",
+      },
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::thread auto_comp([&] {
+    TEST_SYNC_POINT("AutoCompactionFinished1");
+    ASSERT_OK(Put("bar", "v2"));
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put("bar", "v3"));
+    ASSERT_OK(Put("foo", "v3"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    TEST_SYNC_POINT("AutoCompactionFinished2");
+  });
+
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = GetParam() ? 1 : 0;
+    // This should return non-OK, but it's more important for the test to
+    // make sure that the DB is not corrupted.
+    ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  }
+  auto_comp.join();
+  // Refitting didn't happen.
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Write something to DB just make sure that consistency check didn't
+  // fail and make the DB readable.
+}
+
+INSTANTIATE_TEST_CASE_P(ChangeLevelConflictsWithAuto,
+                        ChangeLevelConflictsWithAuto, testing::Bool());
+
+TEST_F(DBCompactionTest, ChangeLevelCompactRangeConflictsWithManual) {
+  // A `CompactRange()` with `change_level == true` needs to execute its final
+  // step, `ReFitLevel()`, in isolation. Previously there was a bug where
+  // refitting could target the same level as an ongoing manual compaction,
+  // leading to overlapping files in that level.
+  //
+  // This test ensures that case is not possible by verifying any manual
+  // compaction issued during the `ReFitLevel()` phase fails with
+  // `Status::Incomplete`.
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  Reopen(options);
+
+  // Setup an LSM with three levels populated.
+  Random rnd(301);
+  int key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  }
+  ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  GenerateNewFile(&rnd, &key_idx);
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1,2", FilesPerLevel(0));
+
+  // The background thread will refit L2->L1 while the
+  // foreground thread will try to simultaneously compact L0->L1.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      // The first two dependencies ensure the foreground creates an L0 file
+      // between the background compaction's L0->L1 and its L1->L2.
+      {
+          "DBImpl::RunManualCompaction()::1",
+          "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+          "PutFG",
+      },
+      {
+          "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+          "FlushedFG",
+          "DBImpl::RunManualCompaction()::2",
+      },
+      // The next two dependencies ensure the foreground invokes
+      // `CompactRange()` while the background is refitting. The
+      // foreground's `CompactRange()` is guaranteed to attempt an L0->L1
+      // as we set it up with an empty memtable and a new L0 file.
+      {
+          "DBImpl::CompactRange:PreRefitLevel",
+          "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+          "CompactFG",
+      },
+      {
+          "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+          "CompactedFG",
+          "DBImpl::CompactRange:PostRefitLevel",
+      },
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 1;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:PutFG");
+  // Make sure we have something new to compact in the foreground.
+  // Note key 1 is carefully chosen as it ensures the file we create here
+  // overlaps with one of the files being refitted L2->L1 in the background.
+  // If we chose key 0, the file created here would not overlap.
+  ASSERT_OK(Put(Key(1), "val"));
+  ASSERT_OK(Flush());
+  TEST_SYNC_POINT(
+      "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:FlushedFG");
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:CompactFG");
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+                  .IsIncomplete());
+  TEST_SYNC_POINT(
+      "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+      "CompactedFG");
+  refit_level_thread.join();
+}
+
+TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
+  // This test is added to ensure that RefitLevel() error paths are clearing
+  // internal flags and to test that subsequent valid RefitLevel() calls
+  // succeeds
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  Reopen(options);
+
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  // Setup an LSM with three levels populated.
+  Random rnd(301);
+  int key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1", FilesPerLevel(0));
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  }
+  ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  auto start_idx = key_idx;
+  GenerateNewFile(&rnd, &key_idx);
+  GenerateNewFile(&rnd, &key_idx);
+  auto end_idx = key_idx - 1;
+  ASSERT_EQ("1,1,2", FilesPerLevel(0));
+
+  // Next two CompactRange() calls are used to test exercise error paths within
+  // RefitLevel() before triggering a valid RefitLevel() call
+
+  // Trigger a refit to L1 first
+  {
+    std::string begin_string = Key(start_idx);
+    std::string end_string = Key(end_idx);
+    Slice begin(begin_string);
+    Slice end(end_string);
+
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 1;
+    ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end));
+  }
+  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+
+  // Try a refit from L2->L1 - this should fail and exercise error paths in
+  // RefitLevel()
+  {
+    // Select key range that matches the bottom most level (L2)
+    std::string begin_string = Key(0);
+    std::string end_string = Key(start_idx - 1);
+    Slice begin(begin_string);
+    Slice end(end_string);
+
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 1;
+    ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end));
+  }
+  ASSERT_EQ("0,3,2", FilesPerLevel(0));
+
+  // Try a valid Refit request to ensure, the path is still working
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 1;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  }
+  ASSERT_EQ("0,5", FilesPerLevel(0));
+}
+
+TEST_F(DBCompactionTest, CompactionWithBlob) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char second_key[] = "second_key";
+  constexpr char first_value[] = "first_value";
+  constexpr char second_value[] = "second_value";
+  constexpr char third_value[] = "third_value";
+
+  ASSERT_OK(Put(first_key, first_value));
+  ASSERT_OK(Put(second_key, first_value));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(first_key, second_value));
+  ASSERT_OK(Put(second_key, second_value));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(first_key, third_value));
+  ASSERT_OK(Put(second_key, third_value));
+  ASSERT_OK(Flush());
+
+  options.enable_blob_files = true;
+
+  Reopen(options);
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  ASSERT_EQ(Get(first_key), third_value);
+  ASSERT_EQ(Get(second_key), third_value);
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+
+  const auto& l1_files = storage_info->LevelFiles(1);
+  ASSERT_EQ(l1_files.size(), 1);
+
+  const FileMetaData* const table_file = l1_files[0];
+  ASSERT_NE(table_file, nullptr);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 1);
+
+  const auto& blob_file = blob_files.front();
+  ASSERT_NE(blob_file, nullptr);
+
+  ASSERT_EQ(table_file->smallest.user_key(), first_key);
+  ASSERT_EQ(table_file->largest.user_key(), second_key);
+  ASSERT_EQ(table_file->oldest_blob_file_number,
+            blob_file->GetBlobFileNumber());
+
+  ASSERT_EQ(blob_file->GetTotalBlobCount(), 2);
+
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  ASSERT_NE(internal_stats, nullptr);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+  ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+  ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize());
+  ASSERT_EQ(compaction_stats[1].bytes_written_blob,
+            blob_file->GetTotalBlobBytes());
+  ASSERT_EQ(compaction_stats[1].num_output_files, 1);
+  ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1);
+}
+
+class DBCompactionTestBlobError
+    : public DBCompactionTest,
+      public testing::WithParamInterface<std::string> {
+ public:
+  DBCompactionTestBlobError() : sync_point_(GetParam()) {}
+
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileBuilder::WriteBlobToFile:AddRecord",
+                            "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBCompactionTestBlobError, CompactionError) {
+  Options options;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char second_key[] = "second_key";
+  constexpr char first_value[] = "first_value";
+  constexpr char second_value[] = "second_value";
+  constexpr char third_value[] = "third_value";
+
+  ASSERT_OK(Put(first_key, first_value));
+  ASSERT_OK(Put(second_key, first_value));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(first_key, second_value));
+  ASSERT_OK(Put(second_key, second_value));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(first_key, third_value));
+  ASSERT_OK(Put(second_key, third_value));
+  ASSERT_OK(Flush());
+
+  options.enable_blob_files = true;
+
+  Reopen(options);
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+    Status* const s = static_cast<Status*>(arg);
+    assert(s);
+
+    (*s) = Status::IOError(sync_point_);
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+
+  const auto& l1_files = storage_info->LevelFiles(1);
+  ASSERT_TRUE(l1_files.empty());
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_TRUE(blob_files.empty());
+
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  ASSERT_NE(internal_stats, nullptr);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+
+  if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") {
+    ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+    ASSERT_EQ(compaction_stats[1].num_output_files, 0);
+    ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
+  } else {
+    // SST file writing succeeded; blob file writing failed (during Finish)
+    ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+    ASSERT_GT(compaction_stats[1].bytes_written, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+    ASSERT_EQ(compaction_stats[1].num_output_files, 1);
+    ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
+  }
+}
+
+class DBCompactionTestBlobGC
+    : public DBCompactionTest,
+      public testing::WithParamInterface<std::tuple<double, bool>> {
+ public:
+  DBCompactionTestBlobGC()
+      : blob_gc_age_cutoff_(std::get<0>(GetParam())),
+        updated_enable_blob_files_(std::get<1>(GetParam())) {}
+
+  double blob_gc_age_cutoff_;
+  bool updated_enable_blob_files_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC,
+                        ::testing::Combine(::testing::Values(0.0, 0.5, 1.0),
+                                           ::testing::Bool()));
+
+TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGCOverrides) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.blob_file_size = 32;  // one blob per file
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0;
+
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 128; i += 2) {
+    ASSERT_OK(Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(
+        Put("key" + std::to_string(i + 1), "value" + std::to_string(i + 1)));
+    ASSERT_OK(Flush());
+  }
+
+  std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+  ASSERT_EQ(original_blob_files.size(), 128);
+
+  // Note: turning off enable_blob_files before the compaction results in
+  // garbage collected values getting inlined.
+  ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+
+  CompactRangeOptions cro;
+  cro.blob_garbage_collection_policy = BlobGarbageCollectionPolicy::kForce;
+  cro.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_;
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Check that the GC stats are correct
+  {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    assert(versions);
+    assert(versions->GetColumnFamilySet());
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    assert(cfd);
+
+    const InternalStats* const internal_stats = cfd->internal_stats();
+    assert(internal_stats);
+
+    const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+    ASSERT_GE(compaction_stats.size(), 2);
+
+    ASSERT_GE(compaction_stats[1].bytes_read_blob, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+  }
+
+  const size_t cutoff_index = static_cast<size_t>(
+      cro.blob_garbage_collection_age_cutoff * original_blob_files.size());
+  const size_t expected_num_files = original_blob_files.size() - cutoff_index;
+
+  const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+  ASSERT_EQ(new_blob_files.size(), expected_num_files);
+
+  // Original blob files below the cutoff should be gone, original blob files
+  // at or above the cutoff should be still there
+  for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+    ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+  }
+
+  for (size_t i = 0; i < 128; ++i) {
+    ASSERT_EQ(Get("key" + std::to_string(i)), "value" + std::to_string(i));
+  }
+}
+
+TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.blob_file_size = 32;  // one blob per file
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+
+  ASSERT_OK(Put(first_key, first_value));
+  ASSERT_OK(Put(second_key, second_value));
+  ASSERT_OK(Flush());
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  constexpr char fourth_key[] = "fourth_key";
+  constexpr char fourth_value[] = "fourth_value";
+
+  ASSERT_OK(Put(third_key, third_value));
+  ASSERT_OK(Put(fourth_key, fourth_value));
+  ASSERT_OK(Flush());
+
+  const std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+
+  ASSERT_EQ(original_blob_files.size(), 4);
+
+  const size_t cutoff_index = static_cast<size_t>(
+      options.blob_garbage_collection_age_cutoff * original_blob_files.size());
+
+  // Note: turning off enable_blob_files before the compaction results in
+  // garbage collected values getting inlined.
+  size_t expected_number_of_files = original_blob_files.size();
+
+  if (!updated_enable_blob_files_) {
+    ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+
+    expected_number_of_files -= cutoff_index;
+  }
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  ASSERT_EQ(Get(first_key), first_value);
+  ASSERT_EQ(Get(second_key), second_value);
+  ASSERT_EQ(Get(third_key), third_value);
+  ASSERT_EQ(Get(fourth_key), fourth_value);
+
+  const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+  ASSERT_EQ(new_blob_files.size(), expected_number_of_files);
+
+  // Original blob files below the cutoff should be gone, original blob files at
+  // or above the cutoff should be still there
+  for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+    ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+  }
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+  assert(versions->GetColumnFamilySet());
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  assert(internal_stats);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_GE(compaction_stats.size(), 2);
+
+  if (blob_gc_age_cutoff_ > 0.0) {
+    ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+
+    if (updated_enable_blob_files_) {
+      // GC relocated some blobs to new blob files
+      ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+      ASSERT_EQ(compaction_stats[1].bytes_read_blob,
+                compaction_stats[1].bytes_written_blob);
+    } else {
+      // GC moved some blobs back to the LSM, no new blob files
+      ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+    }
+  } else {
+    ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+    ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+  }
+}
+
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_CorruptIndex) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+  ASSERT_OK(Put(second_key, second_value));
+
+  ASSERT_OK(Flush());
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  ASSERT_OK(Put(third_key, third_value));
+
+  constexpr char fourth_key[] = "fourth_key";
+  constexpr char fourth_value[] = "fourth_value";
+  ASSERT_OK(Put(fourth_key, fourth_value));
+
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
+      [](void* arg) {
+        Slice* const blob_index = static_cast<Slice*>(arg);
+        assert(blob_index);
+        assert(!blob_index->empty());
+        blob_index->remove_prefix(1);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_TRUE(
+      db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) {
+  constexpr uint64_t min_blob_size = 10;
+
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = min_blob_size;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+  ASSERT_OK(Put(second_key, second_value));
+
+  ASSERT_OK(Flush());
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  ASSERT_OK(Put(third_key, third_value));
+
+  constexpr char fourth_key[] = "fourth_key";
+  constexpr char blob[] = "short";
+  static_assert(sizeof(short) - 1 < min_blob_size,
+                "Blob too long to be inlined");
+
+  // Fake an inlined TTL blob index.
+  std::string blob_index;
+
+  constexpr uint64_t expiration = 1234567890;
+
+  BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+
+  WriteBatch batch;
+  ASSERT_OK(
+      WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Flush());
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_TRUE(
+      db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+}
+
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;
+
+  Reopen(options);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  ASSERT_OK(Put(first_key, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+  ASSERT_OK(Put(second_key, second_value));
+
+  ASSERT_OK(Flush());
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  ASSERT_OK(Put(third_key, third_value));
+
+  constexpr char fourth_key[] = "fourth_key";
+
+  // Fake a blob index referencing a non-existent blob file.
+  std::string blob_index;
+
+  constexpr uint64_t blob_file_number = 1000;
+  constexpr uint64_t offset = 1234;
+  constexpr uint64_t size = 5678;
+
+  BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+                        kNoCompression);
+
+  WriteBatch batch;
+  ASSERT_OK(
+      WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Flush());
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_TRUE(
+      db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  options.env = fault_fs_env.get();
+  options.create_if_missing = true;
+  options.checksum_handoff_file_types.Add(FileType::kTableFile);
+  Status s;
+  Reopen(options);
+
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+  Destroy(options);
+  Reopen(options);
+
+  // The hash does not match, compaction write fails
+  // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) {
+        fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(),
+            ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+
+  // Each write will be similated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0",
+      [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(),
+            ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  options.env = fault_fs_env.get();
+  options.create_if_missing = true;
+  Status s;
+  Reopen(options);
+
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+  Destroy(options);
+  Reopen(options);
+
+  // options is not set, the checksum handoff will not be triggered
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) {
+        fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+
+  // options is not set, the checksum handoff will not be triggered
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0",
+      [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  options.env = fault_fs_env.get();
+  options.create_if_missing = true;
+  options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+  Status s;
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  Reopen(options);
+
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+  Destroy(options);
+  Reopen(options);
+
+  // The hash does not match, compaction write fails
+  // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  // Since the file system returns IOStatus::Corruption, it is mapped to
+  // kFatalError error.
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) {
+        fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 3;
+  options.env = fault_fs_env.get();
+  options.create_if_missing = true;
+  options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+  Status s;
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s, Status::OK());
+
+  // Each write will be similated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is mapped to
+  // kFatalError error.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put(Key(0), "value1"));
+  ASSERT_OK(Put(Key(2), "value2"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0",
+      [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(Key(1), "value3"));
+  s = Flush();
+  ASSERT_EQ(s, Status::OK());
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, FIFOWarm) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleFIFO;
+  options.num_levels = 1;
+  options.max_open_files = -1;
+  options.level0_file_num_compaction_trigger = 2;
+  options.create_if_missing = true;
+  CompactionOptionsFIFO fifo_options;
+  fifo_options.age_for_warm = 1000;
+  fifo_options.max_table_files_size = 100000000;
+  options.compaction_options_fifo = fifo_options;
+  env_->SetMockSleep();
+  Reopen(options);
+
+  int total_warm = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewWritableFile::FileOptions.temperature", [&](void* arg) {
+        Temperature temperature = *(static_cast<Temperature*>(arg));
+        if (temperature == Temperature::kWarm) {
+          total_warm++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  ASSERT_OK(Put(Key(0), "value1"));
+  env_->MockSleepForSeconds(800);
+  ASSERT_OK(Put(Key(2), "value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(0), "value1"));
+  env_->MockSleepForSeconds(800);
+  ASSERT_OK(Put(Key(2), "value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(0), "value1"));
+  env_->MockSleepForSeconds(800);
+  ASSERT_OK(Put(Key(2), "value2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_OK(Put(Key(0), "value1"));
+  env_->MockSleepForSeconds(800);
+  ASSERT_OK(Put(Key(2), "value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(4, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature);
+  ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[2].temperature);
+  ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[3].temperature);
+  ASSERT_EQ(2, total_warm);
+
+  Destroy(options);
+}
+
+TEST_F(DBCompactionTest, DisableMultiManualCompaction) {
+  const int kNumL0Files = 10;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Generate 2 levels of file to make sure the manual compaction is not skipped
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    if (i % 2) {
+      ASSERT_OK(Flush());
+    }
+  }
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    if (i % 2) {
+      ASSERT_OK(Flush());
+    }
+  }
+  MoveFilesToLevel(1);
+
+  // Block compaction queue
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  port::Thread compact_thread1([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = false;
+    std::string begin_str = Key(0);
+    std::string end_str = Key(3);
+    Slice b = begin_str;
+    Slice e = end_str;
+    auto s = db_->CompactRange(cro, &b, &e);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  port::Thread compact_thread2([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = false;
+    std::string begin_str = Key(4);
+    std::string end_str = Key(7);
+    Slice b = begin_str;
+    Slice e = end_str;
+    auto s = db_->CompactRange(cro, &b, &e);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  // Disable manual compaction should cancel both manual compactions and both
+  // compaction should return incomplete.
+  db_->DisableManualCompaction();
+
+  compact_thread1.join();
+  compact_thread2.join();
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+}
+
+TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
+  const int kNumL0Files = 4;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // generate files, but avoid trigger auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  // make sure the manual compaction background is started but not yet set the
+  // status to in_progress, then cancel the manual compaction, which should not
+  // result in segfault
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction",
+        "DBCompactionTest::DisableJustStartedManualCompaction:"
+        "PreDisableManualCompaction"},
+       {"DBImpl::RunManualCompaction:Unscheduled",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableJustStartedManualCompaction:"
+      "PreDisableManualCompaction");
+  db_->DisableManualCompaction();
+
+  compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableInProgressManualCompaction) {
+  const int kNumL0Files = 4;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction:InProgress",
+        "DBCompactionTest::DisableInProgressManualCompaction:"
+        "PreDisableManualCompaction"},
+       {"DBImpl::RunManualCompaction:Unscheduled",
+        "CompactionJob::Run():Start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // generate files, but avoid trigger auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableInProgressManualCompaction:"
+      "PreDisableManualCompaction");
+  db_->DisableManualCompaction();
+
+  compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) {
+  const int kNumL0Files = 4;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::RunManualCompaction:Scheduled",
+        "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+        "PreDisableManualCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Block compaction queue
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  // generate files, but avoid trigger auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+      "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction which is scheduled after
+  // manual compaction. Has to generate 4 more files because existing files are
+  // pending compaction
+  for (int i = 0; i < kNumL0Files; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+  db_->DisableManualCompaction();
+
+  // CompactRange should return before the compaction has the chance to run
+  compact_thread.join();
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) {
+  const int kNumL0Files = 4;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::RunManualCompaction:Scheduled",
+        "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+        "PreDisableManualCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Block compaction queue
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  // generate files, but avoid trigger auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+      "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction which is scheduled after
+  // manual compaction. Has to generate 4 more files because existing files are
+  // pending compaction
+  for (int i = 0; i < kNumL0Files; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+  db_->DisableManualCompaction();
+
+  // CompactRange should return before the compaction has the chance to run
+  compact_thread.join();
+
+  // Try close DB while manual compaction is canceled but still in the queue.
+  // And an auto-triggered compaction is also in the queue.
+  auto s = db_->Close();
+  ASSERT_OK(s);
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBCompactionTest, DBCloseWithManualCompaction) {
+  const int kNumL0Files = 4;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::RunManualCompaction:Scheduled",
+        "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+        "PreDisableManualCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Block compaction queue
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  // generate files, but avoid trigger auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+      "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction which is scheduled after
+  // manual compaction. Has to generate 4 more files because existing files are
+  // pending compaction
+  for (int i = 0; i < kNumL0Files; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+  // Close DB with manual compaction and auto triggered compaction in the queue.
+  auto s = db_->Close();
+  ASSERT_OK(s);
+
+  // manual compaction thread should return with Incomplete().
+  compact_thread.join();
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBCompactionTest,
+       DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) {
+  // When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait
+  // for automatic compactions to drain before starting the manual compaction.
+  // This test verifies `DisableManualCompaction()` can cancel such a compaction
+  // without waiting for the drain to complete.
+  const int kNumL0Files = 4;
+
+  // Enforces manual compaction enters wait loop due to pending automatic
+  // compaction.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction", "DBImpl::RunManualCompaction:NotScheduled"},
+       {"DBImpl::RunManualCompaction:WaitScheduled",
+        "BackgroundCallCompaction:0"}});
+  // The automatic compaction will cancel the waiting manual compaction.
+  // Completing this implies the cancellation did not wait on automatic
+  // compactions to finish.
+  bool callback_completed = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+        db_->DisableManualCompaction();
+        callback_completed = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = true;
+  ASSERT_TRUE(db_->CompactRange(cro, nullptr, nullptr).IsIncomplete());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(callback_completed);
+}
+
+TEST_F(DBCompactionTest, ChangeLevelConflictsWithManual) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  Reopen(options);
+
+  // Setup an LSM with L2 populated.
+  Random rnd(301);
+  ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
+  ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  }
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // The background thread will refit L2->L1 while the foreground thread will
+  // attempt to run a compaction on new data. The following dependencies
+  // ensure the background manual compaction's refitting phase disables manual
+  // compaction immediately before the foreground manual compaction can register
+  // itself. Manual compaction is kept disabled until the foreground manual
+  // checks for the failure once.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      // Only do Put()s for foreground CompactRange() once the background
+      // CompactRange() has reached the refitting phase.
+      {
+          "DBImpl::CompactRange:BeforeRefit:1",
+          "DBCompactionTest::ChangeLevelConflictsWithManual:"
+          "PreForegroundCompactRange",
+      },
+      // Right before we register the manual compaction, proceed with
+      // the refitting phase so manual compactions are disabled. Stay in
+      // the refitting phase with manual compactions disabled until it is
+      // noticed.
+      {
+          "DBImpl::RunManualCompaction:0",
+          "DBImpl::CompactRange:BeforeRefit:2",
+      },
+      {
+          "DBImpl::CompactRange:PreRefitLevel",
+          "DBImpl::RunManualCompaction:1",
+      },
+      {
+          "DBImpl::RunManualCompaction:PausedAtStart",
+          "DBImpl::CompactRange:PostRefitLevel",
+      },
+      // If compaction somehow were scheduled, let's let it run after reenabling
+      // manual compactions. This dependency is not expected to be hit but is
+      // here for speculatively coercing future bugs.
+      {
+          "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled",
+          "BackgroundCallCompaction:0",
+      },
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 1;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::ChangeLevelConflictsWithManual:"
+      "PreForegroundCompactRange");
+  ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
+  ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+                  .IsIncomplete());
+
+  refit_level_thread.join();
+}
+
+TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
+  // Flushes several files to trigger compaction while lock is released during
+  // a bottom-pri compaction. Verifies it does not get scheduled to thread pool
+  // because per-DB limit for compaction parallelism is one (default).
+  const int kNumL0Files = 4;
+  const int kNumLevels = 3;
+
+  env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // Setup last level to be non-empty since it's a bit unclear whether
+  // compaction to an empty level would be considered "bottommost".
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(kNumLevels - 1);
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkBottomCompaction",
+        "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+        "PreTriggerCompaction"},
+       {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+        "PostTriggerCompaction",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread compact_range_thread([&] {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    cro.exclusive_manual_compaction = false;
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+  });
+
+  // Sleep in the low-pri thread so any newly scheduled compaction will be
+  // queued. Otherwise it might finish before we check its existence.
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+      "PreTriggerCompaction");
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put(Key(0), "val"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  TEST_SYNC_POINT(
+      "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+      "PostTriggerCompaction");
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  compact_range_thread.join();
+}
+
+TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
+  // allow_ingest_behind prevents seqnum zeroing, and could cause
+  // compaction loop with reason kBottommostFiles.
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.compaction_style = kCompactionStyleLevel;
+  options.allow_ingest_behind = true;
+  options.comparator = BytewiseComparator();
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  ASSERT_OK(db_->Put(write_opts, "infinite", "compaction loop"));
+  ASSERT_OK(db_->Put(write_opts, "infinite", "loop"));
+
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_OK(db_->Put(write_opts, "bumpseqnum", ""));
+  ASSERT_OK(Flush());
+  auto snapshot = db_->GetSnapshot();
+  // Bump up oldest_snapshot_seqnum_ in VersionStorageInfo.
+  db_->ReleaseSnapshot(snapshot);
+  bool compacted = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* /* arg */) {
+        // There should not be a compaction.
+        compacted = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Wait for compaction to be scheduled.
+  env_->SleepForMicroseconds(2000000);
+  ASSERT_FALSE(compacted);
+  // The following assert can be used to check for compaction loop:
+  // it used to wait forever before the fix.
+  // ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */));
+}
+
+#endif  // !defined(ROCKSDB_LITE)
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_dynamic_level_test.cc b/src/rocksdb/db/db_dynamic_level_test.cc
new file mode 100644
index 000000000..17fa67cb2
--- /dev/null
+++ b/src/rocksdb/db/db_dynamic_level_test.cc
@@ -0,0 +1,507 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this test
+// in Release build.
+// which is a pity, it is a good test
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/env.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBTestDynamicLevel : public DBTestBase {
+ public:
+  DBTestDynamicLevel()
+      : DBTestBase("db_dynamic_level_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) {
+  if (!Snappy_Supported() || !LZ4_Supported()) {
+    return;
+  }
+  // Use InMemoryEnv, or it would be too slow.
+  std::unique_ptr<Env> env(NewMemEnv(env_));
+
+  const int kNKeys = 1000;
+  int keys[kNKeys];
+
+  auto verify_func = [&]() {
+    for (int i = 0; i < kNKeys; i++) {
+      ASSERT_NE("NOT_FOUND", Get(Key(i)));
+      ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i)));
+      if (i < kNKeys / 10) {
+        ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+      } else {
+        ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+      }
+    }
+  };
+
+  Random rnd(301);
+  for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) {
+    for (int i = 0; i < kNKeys; i++) {
+      keys[i] = i;
+    }
+    if (ordered_insert == 0) {
+      RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+    }
+    for (int max_background_compactions = 1; max_background_compactions < 4;
+         max_background_compactions += 2) {
+      Options options;
+      options.env = env.get();
+      options.create_if_missing = true;
+      options.write_buffer_size = 2048;
+      options.max_write_buffer_number = 2;
+      options.level0_file_num_compaction_trigger = 2;
+      options.level0_slowdown_writes_trigger = 2;
+      options.level0_stop_writes_trigger = 2;
+      options.target_file_size_base = 2048;
+      options.level_compaction_dynamic_level_bytes = true;
+      options.max_bytes_for_level_base = 10240;
+      options.max_bytes_for_level_multiplier = 4;
+      options.max_background_compactions = max_background_compactions;
+      options.num_levels = 5;
+
+      options.compression_per_level.resize(3);
+      options.compression_per_level[0] = kNoCompression;
+      options.compression_per_level[1] = kLZ4Compression;
+      options.compression_per_level[2] = kSnappyCompression;
+      options.env = env_;
+
+      DestroyAndReopen(options);
+
+      for (int i = 0; i < kNKeys; i++) {
+        int key = keys[i];
+        ASSERT_OK(Put(Key(kNKeys + key), rnd.RandomString(102)));
+        ASSERT_OK(Put(Key(key), rnd.RandomString(102)));
+        ASSERT_OK(Put(Key(kNKeys * 2 + key), rnd.RandomString(102)));
+        ASSERT_OK(Delete(Key(kNKeys + keys[i / 10])));
+        env_->SleepForMicroseconds(5000);
+      }
+
+      uint64_t int_prop;
+      ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop));
+      ASSERT_EQ(0U, int_prop);
+
+      // Verify DB
+      for (int j = 0; j < 2; j++) {
+        verify_func();
+        if (j == 0) {
+          Reopen(options);
+        }
+      }
+
+      // Test compact range works
+      ASSERT_OK(
+          dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      // All data should be in the last level.
+      ColumnFamilyMetaData cf_meta;
+      db_->GetColumnFamilyMetaData(&cf_meta);
+      ASSERT_EQ(5U, cf_meta.levels.size());
+      for (int i = 0; i < 4; i++) {
+        ASSERT_EQ(0U, cf_meta.levels[i].files.size());
+      }
+      ASSERT_GT(cf_meta.levels[4U].files.size(), 0U);
+      verify_func();
+
+      Close();
+    }
+  }
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) {
+  Random rnd(301);
+  int kMaxKey = 1000000;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.write_buffer_size = 20480;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 9999;
+  options.level0_stop_writes_trigger = 9999;
+  options.target_file_size_base = 9102;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 40960;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 2;
+  options.num_levels = 5;
+  options.max_compaction_bytes = 0;  // Force not expanding in compactions
+  options.db_host_id = "";  // Setting this messes up the file size calculation
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+
+  uint64_t int_prop;
+  std::string str_prop;
+
+  // Initial base level is the last level
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Put about 28K to L0
+  for (int i = 0; i < 70; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Insert extra about 28K to L0. After they are compacted to L4, the base
+  // level should be changed to L3.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  for (int i = 0; i < 70; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+  ASSERT_EQ("0", str_prop);
+
+  // Write even more data while leaving the base level at L3.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  // Write about 40K more
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+
+  // Fill up L0, and then run an (auto) L0->Lmax compaction to raise the base
+  // level to 2.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  // Write about 650K more.
+  // Each file is about 11KB, with 9KB of data.
+  for (int i = 0; i < 1300; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+
+  // Make sure that the compaction starts before the last bit of data is
+  // flushed, so that the base level isn't raised to L1.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+
+  TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0");
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(2U, int_prop);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Write more data until the base level changes to L1. There will be
+  // a manual compaction going on at the same time.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:1"},
+      {"DynamicLevelMaxBytesBase2:2", "CompactionJob::Run():End"},
+      {"DynamicLevelMaxBytesBase2:compact_range_finish",
+       "FlushJob::WriteLevel0Table"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread thread([this] {
+    TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_start");
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_finish");
+  });
+
+  TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1");
+  for (int i = 0; i < 2; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  rnd.RandomString(380)));
+  }
+  TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2");
+
+  ASSERT_OK(Flush());
+
+  thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(1U, int_prop);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) {
+  Random rnd(301);
+  int kMaxKey = 1000000;
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 9999;
+  options.level0_stop_writes_trigger = 9999;
+  options.target_file_size_base = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  const int kNumLevels = 5;
+  options.num_levels = kNumLevels;
+  options.max_compaction_bytes = 1;  // Force not expanding in compactions
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  // Compact against empty DB
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  uint64_t int_prop;
+  std::string str_prop;
+
+  // Initial base level is the last level
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Put about 7K to L0
+  for (int i = 0; i < 140; i++) {
+    ASSERT_OK(
+        Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))), rnd.RandomString(80)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  if (NumTableFilesAtLevel(0) == 0) {
+    // Make sure level 0 is not empty
+    ASSERT_OK(
+        Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))), rnd.RandomString(80)));
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+  ASSERT_EQ("0", str_prop);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  std::set<int> output_levels;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionPicker::CompactRange:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        output_levels.insert(compaction->output_level());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(output_levels.size(), 2);
+  ASSERT_TRUE(output_levels.find(3) != output_levels.end());
+  ASSERT_TRUE(output_levels.find(4) != output_levels.end());
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  // Base level is still level 3.
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+}
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 2;
+  options.num_levels = 5;
+  options.max_compaction_bytes = 100000000;
+
+  DestroyAndReopen(options);
+
+  int non_trivial = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* /*arg*/) { non_trivial++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  const int total_keys = 3000;
+  const int random_part_size = 100;
+  for (int i = 0; i < total_keys; i++) {
+    std::string value = rnd.RandomString(random_part_size);
+    PutFixed32(&value, static_cast<uint32_t>(i));
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_EQ(non_trivial, 0);
+
+  for (int i = 0; i < total_keys; i++) {
+    std::string value = Get(Key(i));
+    ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size),
+              static_cast<uint32_t>(i));
+  }
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) {
+  Random rnd(301);
+  const int kMaxKey = 2000;
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 8;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.num_levels = 8;
+
+  DestroyAndReopen(options);
+
+  auto verify_func = [&](int num_keys, bool if_sleep) {
+    for (int i = 0; i < num_keys; i++) {
+      ASSERT_NE("NOT_FOUND", Get(Key(kMaxKey + i)));
+      if (i < num_keys / 10) {
+        ASSERT_EQ("NOT_FOUND", Get(Key(i)));
+      } else {
+        ASSERT_NE("NOT_FOUND", Get(Key(i)));
+      }
+      if (if_sleep && i % 1000 == 0) {
+        // Without it, valgrind may choose not to give another
+        // thread a chance to run before finishing the function,
+        // causing the test to be extremely slow.
+        env_->SleepForMicroseconds(1);
+      }
+    }
+  };
+
+  int total_keys = 1000;
+  for (int i = 0; i < total_keys; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+    ASSERT_OK(Delete(Key(i / 10)));
+  }
+  verify_func(total_keys, false);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  options.level_compaction_dynamic_level_bytes = true;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  verify_func(total_keys, false);
+
+  std::atomic_bool compaction_finished;
+  compaction_finished = false;
+  // Issue manual compaction in one thread and still verify DB state
+  // in main thread.
+  ROCKSDB_NAMESPACE::port::Thread t([&]() {
+    CompactRangeOptions compact_options;
+    compact_options.change_level = true;
+    compact_options.target_level = options.num_levels - 1;
+    ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+    compaction_finished.store(true);
+  });
+  do {
+    verify_func(total_keys, true);
+  } while (!compaction_finished.load());
+  t.join();
+
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+
+  int total_keys2 = 2000;
+  for (int i = total_keys; i < total_keys2; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+    ASSERT_OK(Delete(Key(i / 10)));
+  }
+
+  verify_func(total_keys2, false);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  verify_func(total_keys2, false);
+
+  // Base level is not level 1
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_encryption_test.cc b/src/rocksdb/db/db_encryption_test.cc
new file mode 100644
index 000000000..73e89d158
--- /dev/null
+++ b/src/rocksdb/db/db_encryption_test.cc
@@ -0,0 +1,130 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include <iostream>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBEncryptionTest : public DBTestBase {
+ public:
+  DBEncryptionTest()
+      : DBTestBase("db_encryption_test", /*env_do_fsync=*/true) {}
+  Env* GetTargetEnv() {
+    if (encrypted_env_ != nullptr) {
+      return (static_cast<EnvWrapper*>(encrypted_env_))->target();
+    } else {
+      return env_;
+    }
+  }
+};
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBEncryptionTest, CheckEncrypted) {
+  ASSERT_OK(Put("foo567", "v1.fetdq"));
+  ASSERT_OK(Put("bar123", "v2.dfgkjdfghsd"));
+  Close();
+
+  // Open all files and look for the values we've put in there.
+  // They should not be found if encrypted, otherwise
+  // they should be found.
+  std::vector<std::string> fileNames;
+  auto status = env_->GetChildren(dbname_, &fileNames);
+  ASSERT_OK(status);
+
+  Env* target = GetTargetEnv();
+  int hits = 0;
+  for (auto it = fileNames.begin(); it != fileNames.end(); ++it) {
+    if (*it == "LOCK") {
+      continue;
+    }
+    auto filePath = dbname_ + "/" + *it;
+    std::unique_ptr<SequentialFile> seqFile;
+    auto envOptions = EnvOptions(CurrentOptions());
+    status = target->NewSequentialFile(filePath, &seqFile, envOptions);
+    ASSERT_OK(status);
+
+    uint64_t fileSize;
+    status = target->GetFileSize(filePath, &fileSize);
+    ASSERT_OK(status);
+
+    std::string scratch;
+    scratch.reserve(fileSize);
+    Slice data;
+    status = seqFile->Read(fileSize, &data, (char*)scratch.data());
+    ASSERT_OK(status);
+
+    if (data.ToString().find("foo567") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+    if (data.ToString().find("v1.fetdq") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+    if (data.ToString().find("bar123") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+    if (data.ToString().find("v2.dfgkjdfghsd") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+    if (data.ToString().find("dfgk") != std::string::npos) {
+      hits++;
+      // std::cout << "Hit in " << filePath << "\n";
+    }
+  }
+  if (encrypted_env_) {
+    ASSERT_EQ(hits, 0);
+  } else {
+    ASSERT_GE(hits, 4);
+  }
+}
+
+TEST_F(DBEncryptionTest, ReadEmptyFile) {
+  auto defaultEnv = GetTargetEnv();
+
+  // create empty file for reading it back in later
+  auto envOptions = EnvOptions(CurrentOptions());
+  auto filePath = dbname_ + "/empty.empty";
+
+  Status status;
+  {
+    std::unique_ptr<WritableFile> writableFile;
+    status = defaultEnv->NewWritableFile(filePath, &writableFile, envOptions);
+    ASSERT_OK(status);
+  }
+
+  std::unique_ptr<SequentialFile> seqFile;
+  status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions);
+  ASSERT_OK(status);
+
+  std::string scratch;
+  Slice data;
+  // reading back 16 bytes from the empty file shouldn't trigger an assertion.
+  // it should just work and return an empty string
+  status = seqFile->Read(16, &data, (char*)scratch.data());
+  ASSERT_OK(status);
+
+  ASSERT_TRUE(data.empty());
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_filesnapshot.cc b/src/rocksdb/db/db_filesnapshot.cc
new file mode 100644
index 000000000..aa9bd738a
--- /dev/null
+++ b/src/rocksdb/db/db_filesnapshot.cc
@@ -0,0 +1,442 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/types.h"
+#include "test_util/sync_point.h"
+#include "util/file_checksum_helper.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status DBImpl::FlushForGetLiveFiles() {
+  mutex_.AssertHeld();
+
+  // flush all dirty data to disk.
+  Status status;
+  if (immutable_db_options_.atomic_flush) {
+    autovector<ColumnFamilyData*> cfds;
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+    mutex_.Unlock();
+    status =
+        AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles);
+    if (status.IsColumnFamilyDropped()) {
+      status = Status::OK();
+    }
+    mutex_.Lock();
+  } else {
+    for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      mutex_.Unlock();
+      status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
+      TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
+      TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
+      mutex_.Lock();
+      if (!status.ok() && !status.IsColumnFamilyDropped()) {
+        break;
+      } else if (status.IsColumnFamilyDropped()) {
+        status = Status::OK();
+      }
+    }
+  }
+  return status;
+}
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                            uint64_t* manifest_file_size, bool flush_memtable) {
+  *manifest_file_size = 0;
+
+  mutex_.Lock();
+
+  if (flush_memtable) {
+    Status status = FlushForGetLiveFiles();
+    if (!status.ok()) {
+      mutex_.Unlock();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+                      status.ToString().c_str());
+      return status;
+    }
+  }
+
+  // Make a set of all of the live table and blob files
+  std::vector<uint64_t> live_table_files;
+  std::vector<uint64_t> live_blob_files;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files);
+  }
+
+  ret.clear();
+  ret.reserve(live_table_files.size() + live_blob_files.size() +
+              3);  // for CURRENT + MANIFEST + OPTIONS
+
+  // create names of the live files. The names are not absolute
+  // paths, instead they are relative to dbname_.
+  for (const auto& table_file_number : live_table_files) {
+    ret.emplace_back(MakeTableFileName("", table_file_number));
+  }
+
+  for (const auto& blob_file_number : live_blob_files) {
+    ret.emplace_back(BlobFileName("", blob_file_number));
+  }
+
+  ret.emplace_back(CurrentFileName(""));
+  ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
+  // The OPTIONS file number is zero in read-write mode when OPTIONS file
+  // writing failed and the DB was configured with
+  // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
+  // number is zero when no OPTIONS file exist at all. In those cases we do not
+  // record any OPTIONS file in the live file list.
+  if (versions_->options_file_number() != 0) {
+    ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
+  }
+
+  // find length of manifest file while holding the mutex lock
+  *manifest_file_size = versions_->manifest_file_size();
+
+  mutex_.Unlock();
+  return Status::OK();
+}
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+  // Record tracked WALs as a (minimum) cross-check for directory scan
+  std::vector<uint64_t> required_by_manifest;
+
+  // If caller disabled deletions, this function should return files that are
+  // guaranteed not to be deleted until deletions are re-enabled. We need to
+  // wait for pending purges to finish since WalManager doesn't know which
+  // files are going to be purged. Additional purges won't be scheduled as
+  // long as deletions are disabled (so the below loop must terminate).
+  // Also note that we disable deletions anyway to avoid the case where a
+  // file is deleted in the middle of the scan, causing IO error.
+  Status deletions_disabled = DisableFileDeletions();
+  {
+    InstrumentedMutexLock l(&mutex_);
+    while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
+      bg_cv_.Wait();
+    }
+
+    // Record tracked WALs as a (minimum) cross-check for directory scan
+    const auto& manifest_wals = versions_->GetWalSet().GetWals();
+    required_by_manifest.reserve(manifest_wals.size());
+    for (const auto& wal : manifest_wals) {
+      required_by_manifest.push_back(wal.first);
+    }
+  }
+
+  Status s = wal_manager_.GetSortedWalFiles(files);
+
+  // DisableFileDeletions / EnableFileDeletions not supported in read-only DB
+  if (deletions_disabled.ok()) {
+    Status s2 = EnableFileDeletions(/*force*/ false);
+    assert(s2.ok());
+    s2.PermitUncheckedError();
+  } else {
+    assert(deletions_disabled.IsNotSupported());
+  }
+
+  if (s.ok()) {
+    // Verify includes those required by manifest (one sorted list is superset
+    // of the other)
+    auto required = required_by_manifest.begin();
+    auto included = files.begin();
+
+    while (required != required_by_manifest.end()) {
+      if (included == files.end() || *required < (*included)->LogNumber()) {
+        // FAIL - did not find
+        return Status::Corruption(
+            "WAL file " + std::to_string(*required) +
+            " required by manifest but not in directory list");
+      }
+      if (*required == (*included)->LogNumber()) {
+        ++required;
+        ++included;
+      } else {
+        assert(*required > (*included)->LogNumber());
+        ++included;
+      }
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) {
+  uint64_t current_logfile_number;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    current_logfile_number = logfile_number_;
+  }
+
+  return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
+}
+
+Status DBImpl::GetLiveFilesStorageInfo(
+    const LiveFilesStorageInfoOptions& opts,
+    std::vector<LiveFileStorageInfo>* files) {
+  // To avoid returning partial results, only move results to files on success.
+  assert(files);
+  files->clear();
+  std::vector<LiveFileStorageInfo> results;
+
+  // NOTE: This implementation was largely migrated from Checkpoint.
+
+  Status s;
+  VectorLogPtr live_wal_files;
+  bool flush_memtable = true;
+  if (!immutable_db_options_.allow_2pc) {
+    if (opts.wal_size_for_flush == std::numeric_limits<uint64_t>::max()) {
+      flush_memtable = false;
+    } else if (opts.wal_size_for_flush > 0) {
+      // If the outstanding log files are small, we skip the flush.
+      s = GetSortedWalFiles(live_wal_files);
+
+      if (!s.ok()) {
+        return s;
+      }
+
+      // Don't flush column families if total log size is smaller than
+      // log_size_for_flush. We copy the log files instead.
+      // We may be able to cover 2PC case too.
+      uint64_t total_wal_size = 0;
+      for (auto& wal : live_wal_files) {
+        total_wal_size += wal->SizeFileBytes();
+      }
+      if (total_wal_size < opts.wal_size_for_flush) {
+        flush_memtable = false;
+      }
+      live_wal_files.clear();
+    }
+  }
+
+  // This is a modified version of GetLiveFiles, to get access to more
+  // metadata.
+  mutex_.Lock();
+  if (flush_memtable) {
+    Status status = FlushForGetLiveFiles();
+    if (!status.ok()) {
+      mutex_.Unlock();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+                      status.ToString().c_str());
+      return status;
+    }
+  }
+
+  // Make a set of all of the live table and blob files
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    VersionStorageInfo& vsi = *cfd->current()->storage_info();
+    auto& cf_paths = cfd->ioptions()->cf_paths;
+
+    auto GetDir = [&](size_t path_id) {
+      // Matching TableFileName() behavior
+      if (path_id >= cf_paths.size()) {
+        assert(false);
+        return cf_paths.back().path;
+      } else {
+        return cf_paths[path_id].path;
+      }
+    };
+
+    for (int level = 0; level < vsi.num_levels(); ++level) {
+      const auto& level_files = vsi.LevelFiles(level);
+      for (const auto& meta : level_files) {
+        assert(meta);
+
+        results.emplace_back();
+        LiveFileStorageInfo& info = results.back();
+
+        info.relative_filename = MakeTableFileName(meta->fd.GetNumber());
+        info.directory = GetDir(meta->fd.GetPathId());
+        info.file_number = meta->fd.GetNumber();
+        info.file_type = kTableFile;
+        info.size = meta->fd.GetFileSize();
+        if (opts.include_checksum_info) {
+          info.file_checksum_func_name = meta->file_checksum_func_name;
+          info.file_checksum = meta->file_checksum;
+          if (info.file_checksum_func_name.empty()) {
+            info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+            info.file_checksum = kUnknownFileChecksum;
+          }
+        }
+        info.temperature = meta->temperature;
+      }
+    }
+    const auto& blob_files = vsi.GetBlobFiles();
+    for (const auto& meta : blob_files) {
+      assert(meta);
+
+      results.emplace_back();
+      LiveFileStorageInfo& info = results.back();
+
+      info.relative_filename = BlobFileName(meta->GetBlobFileNumber());
+      info.directory = GetDir(/* path_id */ 0);
+      info.file_number = meta->GetBlobFileNumber();
+      info.file_type = kBlobFile;
+      info.size = meta->GetBlobFileSize();
+      if (opts.include_checksum_info) {
+        info.file_checksum_func_name = meta->GetChecksumMethod();
+        info.file_checksum = meta->GetChecksumValue();
+        if (info.file_checksum_func_name.empty()) {
+          info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+          info.file_checksum = kUnknownFileChecksum;
+        }
+      }
+      // TODO?: info.temperature
+    }
+  }
+
+  // Capture some final info before releasing mutex
+  const uint64_t manifest_number = versions_->manifest_file_number();
+  const uint64_t manifest_size = versions_->manifest_file_size();
+  const uint64_t options_number = versions_->options_file_number();
+  const uint64_t options_size = versions_->options_file_size_;
+  const uint64_t min_log_num = MinLogNumberToKeep();
+
+  mutex_.Unlock();
+
+  std::string manifest_fname = DescriptorFileName(manifest_number);
+  {  // MANIFEST
+    results.emplace_back();
+    LiveFileStorageInfo& info = results.back();
+
+    info.relative_filename = manifest_fname;
+    info.directory = GetName();
+    info.file_number = manifest_number;
+    info.file_type = kDescriptorFile;
+    info.size = manifest_size;
+    info.trim_to_size = true;
+    if (opts.include_checksum_info) {
+      info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+      info.file_checksum = kUnknownFileChecksum;
+    }
+  }
+
+  {  // CURRENT
+    results.emplace_back();
+    LiveFileStorageInfo& info = results.back();
+
+    info.relative_filename = kCurrentFileName;
+    info.directory = GetName();
+    info.file_type = kCurrentFile;
+    // CURRENT could be replaced so we have to record the contents as needed.
+    info.replacement_contents = manifest_fname + "\n";
+    info.size = manifest_fname.size() + 1;
+    if (opts.include_checksum_info) {
+      info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+      info.file_checksum = kUnknownFileChecksum;
+    }
+  }
+
+  // The OPTIONS file number is zero in read-write mode when OPTIONS file
+  // writing failed and the DB was configured with
+  // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
+  // number is zero when no OPTIONS file exist at all. In those cases we do not
+  // record any OPTIONS file in the live file list.
+  if (options_number != 0) {
+    results.emplace_back();
+    LiveFileStorageInfo& info = results.back();
+
+    info.relative_filename = OptionsFileName(options_number);
+    info.directory = GetName();
+    info.file_number = options_number;
+    info.file_type = kOptionsFile;
+    info.size = options_size;
+    if (opts.include_checksum_info) {
+      info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+      info.file_checksum = kUnknownFileChecksum;
+    }
+  }
+
+  // Some legacy testing stuff  TODO: carefully clean up obsolete parts
+  TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone");
+
+  TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
+  TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
+
+  if (s.ok()) {
+    // To maximize the effectiveness of track_and_verify_wals_in_manifest,
+    // sync WAL when it is enabled.
+    s = FlushWAL(
+        immutable_db_options_.track_and_verify_wals_in_manifest /* sync */);
+    if (s.IsNotSupported()) {  // read-only DB or similar
+      s = Status::OK();
+    }
+  }
+
+  TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
+  TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");
+
+  // If we have more than one column family, we also need to get WAL files.
+  if (s.ok()) {
+    s = GetSortedWalFiles(live_wal_files);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  size_t wal_size = live_wal_files.size();
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "Number of log files %" ROCKSDB_PRIszt, live_wal_files.size());
+
+  // Link WAL files. Copy exact size of last one because it is the only one
+  // that has changes after the last flush.
+  auto wal_dir = immutable_db_options_.GetWalDir();
+  for (size_t i = 0; s.ok() && i < wal_size; ++i) {
+    if ((live_wal_files[i]->Type() == kAliveLogFile) &&
+        (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) {
+      results.emplace_back();
+      LiveFileStorageInfo& info = results.back();
+      auto f = live_wal_files[i]->PathName();
+      assert(!f.empty() && f[0] == '/');
+      info.relative_filename = f.substr(1);
+      info.directory = wal_dir;
+      info.file_number = live_wal_files[i]->LogNumber();
+      info.file_type = kWalFile;
+      info.size = live_wal_files[i]->SizeFileBytes();
+      // Only last should need to be trimmed
+      info.trim_to_size = (i + 1 == wal_size);
+      if (opts.include_checksum_info) {
+        info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+        info.file_checksum = kUnknownFileChecksum;
+      }
+    }
+  }
+
+  if (s.ok()) {
+    // Only move results to output on success.
+    *files = std::move(results);
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_flush_test.cc b/src/rocksdb/db/db_flush_test.cc
new file mode 100644
index 000000000..3b3f7e183
--- /dev/null
+++ b/src/rocksdb/db/db_flush_test.cc
@@ -0,0 +1,3084 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+#include <limits>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is a static filter used for filtering
+// kvs during the compaction process.
+static std::string NEW_VALUE = "NewValue";
+
+class DBFlushTest : public DBTestBase {
+ public:
+  DBFlushTest() : DBTestBase("db_flush_test", /*env_do_fsync=*/true) {}
+};
+
+class DBFlushDirectIOTest : public DBFlushTest,
+                            public ::testing::WithParamInterface<bool> {
+ public:
+  DBFlushDirectIOTest() : DBFlushTest() {}
+};
+
+class DBAtomicFlushTest : public DBFlushTest,
+                          public ::testing::WithParamInterface<bool> {
+ public:
+  DBAtomicFlushTest() : DBFlushTest() {}
+};
+
+// We had issue when two background threads trying to flush at the same time,
+// only one of them get committed. The test verifies the issue is fixed.
+TEST_F(DBFlushTest, FlushWhileWritingManifest) {
+  Options options;
+  options.disable_auto_compactions = true;
+  options.max_background_flushes = 2;
+  options.env = env_;
+  Reopen(options);
+  FlushOptions no_wait;
+  no_wait.wait = false;
+  no_wait.allow_write_stall = true;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"VersionSet::LogAndApply:WriteManifest",
+        "DBFlushTest::FlushWhileWritingManifest:1"},
+       {"MemTableList::TryInstallMemtableFlushResults:InProgress",
+        "VersionSet::LogAndApply:WriteManifestDone"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("foo", "v"));
+  ASSERT_OK(dbfull()->Flush(no_wait));
+  TEST_SYNC_POINT("DBFlushTest::FlushWhileWritingManifest:1");
+  ASSERT_OK(Put("bar", "v"));
+  ASSERT_OK(dbfull()->Flush(no_wait));
+  // If the issue is hit we will wait here forever.
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(2, TotalTableFiles());
+#endif  // ROCKSDB_LITE
+}
+
+// Disable this test temporarily on Travis as it fails intermittently.
+// Github issue: #4151
+TEST_F(DBFlushTest, SyncFail) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options;
+  options.disable_auto_compactions = true;
+  options.env = fault_injection_env.get();
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"},
+       {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put("key", "value"));
+  FlushOptions flush_options;
+  flush_options.wait = false;
+  ASSERT_OK(dbfull()->Flush(flush_options));
+  // Flush installs a new super-version. Get the ref count after that.
+  fault_injection_env->SetFilesystemActive(false);
+  TEST_SYNC_POINT("DBFlushTest::SyncFail:1");
+  TEST_SYNC_POINT("DBFlushTest::SyncFail:2");
+  fault_injection_env->SetFilesystemActive(true);
+  // Now the background job will do the flush; wait for it.
+  // Returns the IO error happend during flush.
+  ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("", FilesPerLevel());  // flush failed.
+#endif                             // ROCKSDB_LITE
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, SyncSkip) {
+  Options options = CurrentOptions();
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBFlushTest::SyncSkip:1", "DBImpl::SyncClosedLogs:Skip"},
+       {"DBImpl::SyncClosedLogs:Skip", "DBFlushTest::SyncSkip:2"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+  ASSERT_OK(Put("key", "value"));
+
+  FlushOptions flush_options;
+  flush_options.wait = false;
+  ASSERT_OK(dbfull()->Flush(flush_options));
+
+  TEST_SYNC_POINT("DBFlushTest::SyncSkip:1");
+  TEST_SYNC_POINT("DBFlushTest::SyncSkip:2");
+
+  // Now the background job will do the flush; wait for it.
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushInLowPriThreadPool) {
+  // Verify setting an empty high-pri (flush) thread pool causes flushes to be
+  // scheduled in the low-pri (compaction) thread pool.
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+  Reopen(options);
+  env_->SetBackgroundThreads(0, Env::HIGH);
+
+  std::thread::id tid;
+  int num_flushes = 0, num_compactions = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BGWorkFlush", [&](void* /*arg*/) {
+        if (tid == std::thread::id()) {
+          tid = std::this_thread::get_id();
+        } else {
+          ASSERT_EQ(tid, std::this_thread::get_id());
+        }
+        ++num_flushes;
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BGWorkCompaction", [&](void* /*arg*/) {
+        ASSERT_EQ(tid, std::this_thread::get_id());
+        ++num_compactions;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("key", "val"));
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put("key", "val"));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(4, num_flushes);
+  ASSERT_EQ(1, num_compactions);
+}
+
+// Test when flush job is submitted to low priority thread pool and when DB is
+// closed in the meanwhile, CloseHelper doesn't hang.
+TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) {
+  Options options = CurrentOptions();
+  options.max_background_flushes = 1;
+  options.max_total_wal_size = 8192;
+
+  DestroyAndReopen(options);
+  CreateColumnFamilies({"cf1", "cf2"}, options);
+
+  env_->SetBackgroundThreads(0, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  int num_flushes = 0;
+
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::BGWorkFlush",
+                                        [&](void* /*arg*/) { ++num_flushes; });
+
+  int num_low_flush_unscheduled = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::UnscheduleLowFlushCallback", [&](void* /*arg*/) {
+        num_low_flush_unscheduled++;
+        // There should be one flush job in low pool that needs to be
+        // unscheduled
+        ASSERT_EQ(num_low_flush_unscheduled, 1);
+      });
+
+  int num_high_flush_unscheduled = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::UnscheduleHighFlushCallback", [&](void* /*arg*/) {
+        num_high_flush_unscheduled++;
+        // There should be no flush job in high pool
+        ASSERT_EQ(num_high_flush_unscheduled, 0);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(0, "key1", DummyString(8192)));
+  // Block thread so that flush cannot be run and can be removed from the queue
+  // when called Unschedule.
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Trigger flush and flush job will be scheduled to LOW priority thread.
+  ASSERT_OK(Put(0, "key2", DummyString(8192)));
+
+  // Close DB and flush job in low priority queue will be removed without
+  // running.
+  Close();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  ASSERT_EQ(0, num_flushes);
+
+  TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options);
+  ASSERT_OK(Put(0, "key3", DummyString(8192)));
+  ASSERT_OK(Flush(0));
+  ASSERT_EQ(1, num_flushes);
+}
+
+TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  Reopen(options);
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkFlush",
+        "DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"},
+       {"DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2",
+        "FlushJob::WriteLevel0Table"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("key1", "value1"));
+
+  port::Thread t([&]() {
+    // The call wait for flush to finish, i.e. with flush_options.wait = true.
+    ASSERT_OK(Flush());
+  });
+
+  // Wait for flush start.
+  TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1");
+  // Insert a second memtable before the manual flush finish.
+  // At the end of the manual flush job, it will check if further flush
+  // is needed, but it will not trigger flush of the second memtable because
+  // min_write_buffer_number_to_merge is not reached.
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+  TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2");
+
+  // Manual flush should return, without waiting for flush indefinitely.
+  t.join();
+}
+
+TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) {
+  Options options = CurrentOptions();
+  Reopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  int called = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        auto unscheduled_flushes = *reinterpret_cast<int*>(arg);
+        ASSERT_EQ(0, unscheduled_flushes);
+        ++called;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("a", "foo"));
+  FlushOptions flush_opts;
+  ASSERT_OK(dbfull()->Flush(flush_opts));
+  ASSERT_EQ(1, called);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// The following 3 tests are designed for testing garbage statistics at flush
+// time.
+//
+// ======= General Information ======= (from GitHub Wiki).
+// There are three scenarios where memtable flush can be triggered:
+//
+// 1 - Memtable size exceeds ColumnFamilyOptions::write_buffer_size
+//     after a write.
+// 2 - Total memtable size across all column families exceeds
+// DBOptions::db_write_buffer_size,
+//     or DBOptions::write_buffer_manager signals a flush. In this scenario
+//     the largest memtable will be flushed.
+// 3 - Total WAL file size exceeds DBOptions::max_total_wal_size.
+//     In this scenario the memtable with the oldest data will be flushed,
+//     in order to allow the WAL file with data from this memtable to be
+//     purged.
+//
+// As a result, a memtable can be flushed before it is full. This is one
+// reason the generated SST file can be smaller than the corresponding
+// memtable. Compression is another factor to make SST file smaller than
+// corresponding memtable, since data in memtable is uncompressed.
+
+TEST_F(DBFlushTest, StatisticsGarbageBasic) {
+  Options options = CurrentOptions();
+
+  // The following options are used to enforce several values that
+  // may already exist as default values to make this test resilient
+  // to default value updates in the future.
+  options.statistics = CreateDBStatistics();
+
+  // Record all statistics.
+  options.statistics->set_stats_level(StatsLevel::kAll);
+
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // Useful for now as we are trying to compare uncompressed data savings on
+  // flush().
+  options.compression = kNoCompression;
+
+  // Prevent memtable in place updates. Should already be disabled
+  // (from Wiki:
+  //  In place updates can be enabled by toggling on the bool
+  //  inplace_update_support flag. However, this flag is by default set to
+  //  false
+  //  because this thread-safe in-place update support is not compatible
+  //  with concurrent memtable writes. Note that the bool
+  //  allow_concurrent_memtable_write is set to true by default )
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
+  options.write_buffer_size = 64 << 20;
+
+  ASSERT_OK(TryReopen(options));
+
+  // Put multiple times the same key-values.
+  // The encoded length of a db entry in the memtable is
+  // defined in db/memtable.cc (MemTable::Add) as the variable:
+  // encoded_len=  VarintLength(internal_key_size)  --> =
+  // log_256(internal_key).
+  // Min # of bytes
+  //                                                       necessary to
+  //                                                       store
+  //                                                       internal_key_size.
+  //             + internal_key_size                --> = actual key string,
+  //             (size key_size: w/o term null char)
+  //                                                      + 8 bytes for
+  //                                                      fixed uint64 "seq
+  //                                                      number
+  // +
+  //                                                      insertion type"
+  //             + VarintLength(val_size)           --> = min # of bytes to
+  //             store val_size
+  //             + val_size                         --> = actual value
+  //             string
+  // For example, in our situation, "key1" : size 4, "value1" : size 6
+  // (the terminating null characters are not copied over to the memtable).
+  // And therefore encoded_len = 1 + (4+8) + 1 + 6 = 20 bytes per entry.
+  // However in terms of raw data contained in the memtable, and written
+  // over to the SSTable, we only count internal_key_size and val_size,
+  // because this is the only raw chunk of bytes that contains everything
+  // necessary to reconstruct a user entry: sequence number, insertion type,
+  // key, and value.
+
+  // To test the relevance of our Memtable garbage statistics,
+  // namely MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+  // we insert K-V pairs with 3 distinct keys (of length 4),
+  // and random values of arbitrary length RAND_VALUES_LENGTH,
+  // and we repeat this step NUM_REPEAT times total.
+  // At the end, we insert 3 final K-V pairs with the same 3 keys
+  // and known values (these will be the final values, of length 6).
+  // I chose NUM_REPEAT=2,000 such that no automatic flush is
+  // triggered (the number of bytes in the memtable is therefore
+  // well below any meaningful heuristic for a memtable of size 64MB).
+  // As a result, since each K-V pair is inserted as a payload
+  // of N meaningful bytes (sequence number, insertion type,
+  // key, and value = 8 + 4 + RAND_VALUE_LENGTH),
+  // MEMTABLE_GARBAGE_BYTES_AT_FLUSH should be equal to 2,000 * N bytes
+  // and MEMTABLE_PAYLAOD_BYTES_AT_FLUSH = MEMTABLE_GARBAGE_BYTES_AT_FLUSH +
+  // (3*(8 + 4 + 6)) bytes. For RAND_VALUE_LENGTH = 172 (arbitrary value), we
+  // expect:
+  //      N = 8 + 4 + 172 = 184 bytes
+  //      MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 2,000 * 184 = 368,000 bytes.
+  //      MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 368,000 + 3*18 = 368,054 bytes.
+
+  const size_t NUM_REPEAT = 2000;
+  const size_t RAND_VALUES_LENGTH = 172;
+  const std::string KEY1 = "key1";
+  const std::string KEY2 = "key2";
+  const std::string KEY3 = "key3";
+  const std::string VALUE1 = "value1";
+  const std::string VALUE2 = "value2";
+  const std::string VALUE3 = "value3";
+  uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+  uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+  Random rnd(301);
+  // Insertion of of K-V pairs, multiple times.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + p_v1.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY2.size() + p_v2.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY3.size() + p_v3.size() + sizeof(uint64_t);
+  }
+
+  // The memtable data bytes includes the "garbage"
+  // bytes along with the useful payload.
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+      EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  ASSERT_OK(Put(KEY1, VALUE1));
+  ASSERT_OK(Put(KEY2, VALUE2));
+  ASSERT_OK(Put(KEY3, VALUE3));
+
+  // Add useful payload to the memtable data bytes:
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      KEY1.size() + VALUE1.size() + KEY2.size() + VALUE2.size() + KEY3.size() +
+      VALUE3.size() + 3 * sizeof(uint64_t);
+
+  // We assert that the last K-V pairs have been successfully inserted,
+  // and that the valid values are VALUE1, VALUE2, VALUE3.
+  PinnableSlice value;
+  ASSERT_OK(Get(KEY1, &value));
+  ASSERT_EQ(value.ToString(), VALUE1);
+  ASSERT_OK(Get(KEY2, &value));
+  ASSERT_EQ(value.ToString(), VALUE2);
+  ASSERT_OK(Get(KEY3, &value));
+  ASSERT_EQ(value.ToString(), VALUE3);
+
+  // Force flush to SST. Increments the statistics counter.
+  ASSERT_OK(Flush());
+
+  // Collect statistics.
+  uint64_t mem_data_bytes =
+      TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  uint64_t mem_garbage_bytes =
+      TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) {
+  Options options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+  options.write_buffer_size = 67108864;
+
+  ASSERT_OK(TryReopen(options));
+
+  const size_t NUM_REPEAT = 2000;
+  const size_t RAND_VALUES_LENGTH = 37;
+  const std::string KEY1 = "key1";
+  const std::string KEY2 = "key2";
+  const std::string KEY3 = "key3";
+  const std::string KEY4 = "key4";
+  const std::string KEY5 = "key5";
+  const std::string KEY6 = "key6";
+
+  uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+  uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+  WriteBatch batch;
+
+  Random rnd(301);
+  // Insertion of of K-V pairs, multiple times.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + p_v1.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY2.size() + p_v2.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY3.size() + p_v3.size() + sizeof(uint64_t);
+    ASSERT_OK(Delete(KEY1));
+    ASSERT_OK(Delete(KEY2));
+    ASSERT_OK(Delete(KEY3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+  }
+
+  // The memtable data bytes includes the "garbage"
+  // bytes along with the useful payload.
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+      EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note : one set of delete for KEY1, KEY2, KEY3 is written to
+  // SSTable to propagate the delete operations to K-V pairs
+  // that could have been inserted into the database during past Flush
+  // opeartions.
+  EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+      KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+
+  // Additional useful paylaod.
+  ASSERT_OK(Delete(KEY4));
+  ASSERT_OK(Delete(KEY5));
+  ASSERT_OK(Delete(KEY6));
+
+  // // Add useful payload to the memtable data bytes:
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      KEY4.size() + KEY5.size() + KEY6.size() + 3 * sizeof(uint64_t);
+
+  // We assert that the K-V pairs have been successfully deleted.
+  PinnableSlice value;
+  ASSERT_NOK(Get(KEY1, &value));
+  ASSERT_NOK(Get(KEY2, &value));
+  ASSERT_NOK(Get(KEY3, &value));
+
+  // Force flush to SST. Increments the statistics counter.
+  ASSERT_OK(Flush());
+
+  // Collect statistics.
+  uint64_t mem_data_bytes =
+      TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  uint64_t mem_garbage_bytes =
+      TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) {
+  Options options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+  options.write_buffer_size = 67108864;
+
+  ASSERT_OK(TryReopen(options));
+
+  const size_t NUM_REPEAT = 1000;
+  const size_t RAND_VALUES_LENGTH = 42;
+  const std::string KEY1 = "key1";
+  const std::string KEY2 = "key2";
+  const std::string KEY3 = "key3";
+  const std::string KEY4 = "key4";
+  const std::string KEY5 = "key5";
+  const std::string KEY6 = "key6";
+  const std::string VALUE3 = "value3";
+
+  uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+  uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+  Random rnd(301);
+  // Insertion of of K-V pairs, multiple times.
+  // Also insert DeleteRange
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + p_v1.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY2.size() + p_v2.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY3.size() + p_v3.size() + sizeof(uint64_t);
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1,
+                               KEY2));
+    // Note: DeleteRange have an exclusive upper bound, e.g. here: [KEY2,KEY3)
+    // is deleted.
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2,
+                               KEY3));
+    // Delete ranges are stored as a regular K-V pair, with key=STARTKEY,
+    // value=ENDKEY.
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
+        (KEY2.size() + KEY3.size() + sizeof(uint64_t));
+  }
+
+  // The memtable data bytes includes the "garbage"
+  // bytes along with the useful payload.
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+      EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note : one set of deleteRange for (KEY1, KEY2) and (KEY2, KEY3) is written
+  // to SSTable to propagate the deleteRange operations to K-V pairs that could
+  // have been inserted into the database during past Flush opeartions.
+  EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+      (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
+      (KEY2.size() + KEY3.size() + sizeof(uint64_t));
+
+  // Overwrite KEY3 with known value (VALUE3)
+  // Note that during the whole time KEY3 has never been deleted
+  // by the RangeDeletes.
+  ASSERT_OK(Put(KEY3, VALUE3));
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      KEY3.size() + VALUE3.size() + sizeof(uint64_t);
+
+  // Additional useful paylaod.
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY4, KEY5));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY5, KEY6));
+
+  // Add useful payload to the memtable data bytes:
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      (KEY4.size() + KEY5.size() + sizeof(uint64_t)) +
+      (KEY5.size() + KEY6.size() + sizeof(uint64_t));
+
+  // We assert that the K-V pairs have been successfully deleted.
+  PinnableSlice value;
+  ASSERT_NOK(Get(KEY1, &value));
+  ASSERT_NOK(Get(KEY2, &value));
+  // And that KEY3's value is correct.
+  ASSERT_OK(Get(KEY3, &value));
+  ASSERT_EQ(value, VALUE3);
+
+  // Force flush to SST. Increments the statistics counter.
+  ASSERT_OK(Flush());
+
+  // Collect statistics.
+  uint64_t mem_data_bytes =
+      TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  uint64_t mem_garbage_bytes =
+      TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  Close();
+}
+
+#ifndef ROCKSDB_LITE
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+  TestFlushListener(Env* env, DBFlushTest* test)
+      : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+    db_closed = false;
+  }
+
+  ~TestFlushListener() override {
+    prev_fc_info_.status.PermitUncheckedError();  // Ignore the status
+  }
+
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    // remember the info for later checking the FlushJobInfo.
+    prev_fc_info_ = info;
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+    ASSERT_GT(info.table_properties.data_size, 0U);
+    ASSERT_GT(info.table_properties.raw_key_size, 0U);
+    ASSERT_GT(info.table_properties.raw_value_size, 0U);
+    ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+    ASSERT_GT(info.table_properties.num_entries, 0U);
+    ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+    ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+  }
+
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    flushed_dbs_.push_back(db);
+    flushed_column_family_names_.push_back(info.cf_name);
+    if (info.triggered_writes_slowdown) {
+      slowdown_count++;
+    }
+    if (info.triggered_writes_stop) {
+      stop_count++;
+    }
+    // verify whether the previously created file matches the flushed file.
+    ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+    ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+    ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+    ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+    ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+    // Note: the following chunk relies on the notification pertaining to the
+    // database pointed to by DBTestBase::db_, and is thus bypassed when
+    // that assumption does not hold (see the test case MultiDBMultiListeners
+    // below).
+    ASSERT_TRUE(test_);
+    if (db == test_->db_) {
+      std::vector<std::vector<FileMetaData>> files_by_level;
+      test_->dbfull()->TEST_GetFilesMetaData(db->DefaultColumnFamily(),
+                                             &files_by_level);
+
+      ASSERT_FALSE(files_by_level.empty());
+      auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(),
+                             [&](const FileMetaData& meta) {
+                               return meta.fd.GetNumber() == info.file_number;
+                             });
+      ASSERT_NE(it, files_by_level[0].end());
+      ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+    }
+
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+    ASSERT_GT(info.thread_id, 0U);
+  }
+
+  std::vector<std::string> flushed_column_family_names_;
+  std::vector<DB*> flushed_dbs_;
+  int slowdown_count;
+  int stop_count;
+  bool db_closing;
+  std::atomic_bool db_closed;
+  TableFileCreationInfo prev_fc_info_;
+
+ protected:
+  Env* env_;
+  DBFlushTest* test_;
+};
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBFlushTest, MemPurgeBasic) {
+  Options options = CurrentOptions();
+
+  // The following options are used to enforce several values that
+  // may already exist as default values to make this test resilient
+  // to default value updates in the future.
+  options.statistics = CreateDBStatistics();
+
+  // Record all statistics.
+  options.statistics->set_stats_level(StatsLevel::kAll);
+
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // Useful for now as we are trying to compare uncompressed data savings on
+  // flush().
+  options.compression = kNoCompression;
+
+  // Prevent memtable in place updates. Should already be disabled
+  // (from Wiki:
+  //  In place updates can be enabled by toggling on the bool
+  //  inplace_update_support flag. However, this flag is by default set to
+  //  false
+  //  because this thread-safe in-place update support is not compatible
+  //  with concurrent memtable writes. Note that the bool
+  //  allow_concurrent_memtable_write is set to true by default )
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
+  options.write_buffer_size = 1 << 20;
+#ifndef ROCKSDB_LITE
+  // Initially deactivate the MemPurge prototype.
+  options.experimental_mempurge_threshold = 0.0;
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+#else
+  // Activate directly the MemPurge prototype.
+  // (RocksDB lite does not support dynamic options)
+  options.experimental_mempurge_threshold = 1.0;
+#endif  // !ROCKSDB_LITE
+  ASSERT_OK(TryReopen(options));
+
+  // RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+  // Dynamically activate the MemPurge prototype without restarting the DB.
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  ASSERT_OK(db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "1.0"}}));
+#endif
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::string KEY1 = "IamKey1";
+  std::string KEY2 = "IamKey2";
+  std::string KEY3 = "IamKey3";
+  std::string KEY4 = "IamKey4";
+  std::string KEY5 = "IamKey5";
+  std::string KEY6 = "IamKey6";
+  std::string KEY7 = "IamKey7";
+  std::string KEY8 = "IamKey8";
+  std::string KEY9 = "IamKey9";
+  std::string RNDKEY1, RNDKEY2, RNDKEY3;
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  // Heavy overwrite workload,
+  // more than would fit in maximum allowed memtables.
+  Random rnd(719);
+  const size_t NUM_REPEAT = 100;
+  const size_t RAND_KEYS_LENGTH = 57;
+  const size_t RAND_VALUES_LENGTH = 10240;
+  std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9, p_rv1,
+      p_rv2, p_rv3;
+
+  // Insert a very first set of keys that will be
+  // mempurged at least once.
+  p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+  ASSERT_OK(Put(KEY1, p_v1));
+  ASSERT_OK(Put(KEY2, p_v2));
+  ASSERT_OK(Put(KEY3, p_v3));
+  ASSERT_OK(Put(KEY4, p_v4));
+  ASSERT_EQ(Get(KEY1), p_v1);
+  ASSERT_EQ(Get(KEY2), p_v2);
+  ASSERT_EQ(Get(KEY3), p_v3);
+  ASSERT_EQ(Get(KEY4), p_v4);
+
+  // Insertion of of K-V pairs, multiple times (overwrites).
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
+
+    ASSERT_OK(Put(KEY5, p_v5));
+    ASSERT_OK(Put(KEY6, p_v6));
+    ASSERT_OK(Put(KEY7, p_v7));
+    ASSERT_OK(Put(KEY8, p_v8));
+    ASSERT_OK(Put(KEY9, p_v9));
+
+    ASSERT_EQ(Get(KEY1), p_v1);
+    ASSERT_EQ(Get(KEY2), p_v2);
+    ASSERT_EQ(Get(KEY3), p_v3);
+    ASSERT_EQ(Get(KEY4), p_v4);
+    ASSERT_EQ(Get(KEY5), p_v5);
+    ASSERT_EQ(Get(KEY6), p_v6);
+    ASSERT_EQ(Get(KEY7), p_v7);
+    ASSERT_EQ(Get(KEY8), p_v8);
+    ASSERT_EQ(Get(KEY9), p_v9);
+  }
+
+  // Check that there was at least one mempurge
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there was no SST files created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 0;
+
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Insertion of of K-V pairs, no overwrites.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    RNDKEY1 = rnd.RandomString(RAND_KEYS_LENGTH);
+    RNDKEY2 = rnd.RandomString(RAND_KEYS_LENGTH);
+    RNDKEY3 = rnd.RandomString(RAND_KEYS_LENGTH);
+    p_rv1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_rv2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_rv3 = rnd.RandomString(RAND_VALUES_LENGTH);
+
+    ASSERT_OK(Put(RNDKEY1, p_rv1));
+    ASSERT_OK(Put(RNDKEY2, p_rv2));
+    ASSERT_OK(Put(RNDKEY3, p_rv3));
+
+    ASSERT_EQ(Get(KEY1), p_v1);
+    ASSERT_EQ(Get(KEY2), p_v2);
+    ASSERT_EQ(Get(KEY3), p_v3);
+    ASSERT_EQ(Get(KEY4), p_v4);
+    ASSERT_EQ(Get(KEY5), p_v5);
+    ASSERT_EQ(Get(KEY6), p_v6);
+    ASSERT_EQ(Get(KEY7), p_v7);
+    ASSERT_EQ(Get(KEY8), p_v8);
+    ASSERT_EQ(Get(KEY9), p_v9);
+    ASSERT_EQ(Get(RNDKEY1), p_rv1);
+    ASSERT_EQ(Get(RNDKEY2), p_rv2);
+    ASSERT_EQ(Get(RNDKEY3), p_rv3);
+  }
+
+  // Assert that at least one flush to storage has been performed
+  EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+  // (which will consequently increase the number of mempurges recorded too).
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+
+  // Assert that there is no data corruption, even with
+  // a flush to storage.
+  ASSERT_EQ(Get(KEY1), p_v1);
+  ASSERT_EQ(Get(KEY2), p_v2);
+  ASSERT_EQ(Get(KEY3), p_v3);
+  ASSERT_EQ(Get(KEY4), p_v4);
+  ASSERT_EQ(Get(KEY5), p_v5);
+  ASSERT_EQ(Get(KEY6), p_v6);
+  ASSERT_EQ(Get(KEY7), p_v7);
+  ASSERT_EQ(Get(KEY8), p_v8);
+  ASSERT_EQ(Get(KEY9), p_v9);
+  ASSERT_EQ(Get(RNDKEY1), p_rv1);
+  ASSERT_EQ(Get(RNDKEY2), p_rv2);
+  ASSERT_EQ(Get(RNDKEY3), p_rv3);
+
+  Close();
+}
+
+// RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, MemPurgeBasicToggle) {
+  Options options = CurrentOptions();
+
+  // The following options are used to enforce several values that
+  // may already exist as default values to make this test resilient
+  // to default value updates in the future.
+  options.statistics = CreateDBStatistics();
+
+  // Record all statistics.
+  options.statistics->set_stats_level(StatsLevel::kAll);
+
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // Useful for now as we are trying to compare uncompressed data savings on
+  // flush().
+  options.compression = kNoCompression;
+
+  // Prevent memtable in place updates. Should already be disabled
+  // (from Wiki:
+  //  In place updates can be enabled by toggling on the bool
+  //  inplace_update_support flag. However, this flag is by default set to
+  //  false
+  //  because this thread-safe in-place update support is not compatible
+  //  with concurrent memtable writes. Note that the bool
+  //  allow_concurrent_memtable_write is set to true by default )
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Initially deactivate the MemPurge prototype.
+  // (negative values are equivalent to 0.0).
+  options.experimental_mempurge_threshold = -25.3;
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+
+  ASSERT_OK(TryReopen(options));
+  // Dynamically activate the MemPurge prototype without restarting the DB.
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  // Values greater than 1.0 are equivalent to 1.0
+  ASSERT_OK(
+      db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "3.7898"}}));
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  const size_t KVSIZE = 3;
+  std::vector<std::string> KEYS(KVSIZE);
+  for (size_t k = 0; k < KVSIZE; k++) {
+    KEYS[k] = "IamKey" + std::to_string(k);
+  }
+
+  std::vector<std::string> RNDVALS(KVSIZE);
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  // Heavy overwrite workload,
+  // more than would fit in maximum allowed memtables.
+  Random rnd(719);
+  const size_t NUM_REPEAT = 100;
+  const size_t RAND_VALUES_LENGTH = 10240;
+
+  // Insertion of of K-V pairs, multiple times (overwrites).
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+    }
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+    }
+  }
+
+  // Check that there was at least one mempurge
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there was no SST files created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 0;
+
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Dynamically deactivate MemPurge.
+  ASSERT_OK(
+      db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "-1023.0"}}));
+
+  // Insertion of of K-V pairs, multiple times (overwrites).
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+    }
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+    }
+  }
+
+  // Check that there was at least one mempurge
+  const uint32_t ZERO = 0;
+  // Assert that at least one flush to storage has been performed
+  EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+  // The mempurge count is expected to be set to 0 when the options are updated.
+  // We expect no mempurge at all.
+  EXPECT_EQ(mempurge_count.exchange(0), ZERO);
+
+  Close();
+}
+// Closes the "#ifndef ROCKSDB_LITE"
+// End of MemPurgeBasicToggle, which is not
+// supported with RocksDB LITE because it
+// relies on dynamically changing the option
+// flag experimental_mempurge_threshold.
+#endif
+
+// At the moment, MemPurge feature is deactivated
+// when atomic_flush is enabled. This is because the level
+// of garbage between Column Families is not guaranteed to
+// be consistent, therefore a CF could hypothetically
+// trigger a MemPurge while another CF would trigger
+// a regular Flush.
+TEST_F(DBFlushTest, MemPurgeWithAtomicFlush) {
+  Options options = CurrentOptions();
+
+  // The following options are used to enforce several values that
+  // may already exist as default values to make this test resilient
+  // to default value updates in the future.
+  options.statistics = CreateDBStatistics();
+
+  // Record all statistics.
+  options.statistics->set_stats_level(StatsLevel::kAll);
+
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // Useful for now as we are trying to compare uncompressed data savings on
+  // flush().
+  options.compression = kNoCompression;
+
+  // Prevent memtable in place updates. Should already be disabled
+  // (from Wiki:
+  //  In place updates can be enabled by toggling on the bool
+  //  inplace_update_support flag. However, this flag is by default set to
+  //  false
+  //  because this thread-safe in-place update support is not compatible
+  //  with concurrent memtable writes. Note that the bool
+  //  allow_concurrent_memtable_write is set to true by default )
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 64KB (64KB = 65,536 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Activate the MemPurge prototype.
+  options.experimental_mempurge_threshold = 153.245;
+  // Activate atomic_flush.
+  options.atomic_flush = true;
+
+  const std::vector<std::string> new_cf_names = {"pikachu", "eevie"};
+  CreateColumnFamilies(new_cf_names, options);
+
+  Close();
+
+  // 3 CFs: default will be filled with overwrites (would normally trigger
+  // mempurge)
+  //        new_cf_names[1] will be filled with random values (would trigger
+  //        flush) new_cf_names[2] not filled with anything.
+  ReopenWithColumnFamilies(
+      {kDefaultColumnFamilyName, new_cf_names[0], new_cf_names[1]}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(2, "bar", "baz"));
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  const size_t KVSIZE = 3;
+  std::vector<std::string> KEYS(KVSIZE);
+  for (size_t k = 0; k < KVSIZE; k++) {
+    KEYS[k] = "IamKey" + std::to_string(k);
+  }
+
+  std::string RNDKEY;
+  std::vector<std::string> RNDVALS(KVSIZE);
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  // Heavy overwrite workload,
+  // more than would fit in maximum allowed memtables.
+  Random rnd(106);
+  const size_t NUM_REPEAT = 100;
+  const size_t RAND_KEY_LENGTH = 128;
+  const size_t RAND_VALUES_LENGTH = 10240;
+
+  // Insertion of of K-V pairs, multiple times (overwrites).
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    for (size_t j = 0; j < KEYS.size(); j++) {
+      RNDKEY = rnd.RandomString(RAND_KEY_LENGTH);
+      RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+      ASSERT_OK(Put(1, RNDKEY, RNDVALS[j]));
+      ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+      ASSERT_EQ(Get(1, RNDKEY), RNDVALS[j]);
+    }
+  }
+
+  // Check that there was no mempurge because atomic_flush option is true.
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 0;
+  // Check that there was at least one SST files created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 1;
+
+  EXPECT_EQ(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_GE(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  Close();
+}
+
+TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
+  Options options = CurrentOptions();
+
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+#ifndef ROCKSDB_LITE
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+#endif  // !ROCKSDB_LITE
+  // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Activate the MemPurge prototype.
+  options.experimental_mempurge_threshold = 15.0;
+
+  ASSERT_OK(TryReopen(options));
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::string KEY1 = "ThisIsKey1";
+  std::string KEY2 = "ThisIsKey2";
+  std::string KEY3 = "ThisIsKey3";
+  std::string KEY4 = "ThisIsKey4";
+  std::string KEY5 = "ThisIsKey5";
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  Random rnd(117);
+  const size_t NUM_REPEAT = 100;
+  const size_t RAND_VALUES_LENGTH = 10240;
+
+  std::string key, value, p_v1, p_v2, p_v3, p_v3b, p_v4, p_v5;
+  int count = 0;
+  const int EXPECTED_COUNT_FORLOOP = 3;
+  const int EXPECTED_COUNT_END = 4;
+
+  ReadOptions ropt;
+  ropt.pin_data = true;
+  ropt.total_order_seek = true;
+  Iterator* iter = nullptr;
+
+  // Insertion of of K-V pairs, multiple times.
+  // Also insert DeleteRange
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v3b = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    ASSERT_OK(Put(KEY4, p_v4));
+    ASSERT_OK(Put(KEY5, p_v5));
+    ASSERT_OK(Delete(KEY2));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2,
+                               KEY4));
+    ASSERT_OK(Put(KEY3, p_v3b));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1,
+                               KEY3));
+    ASSERT_OK(Delete(KEY1));
+
+    ASSERT_EQ(Get(KEY1), NOT_FOUND);
+    ASSERT_EQ(Get(KEY2), NOT_FOUND);
+    ASSERT_EQ(Get(KEY3), p_v3b);
+    ASSERT_EQ(Get(KEY4), p_v4);
+    ASSERT_EQ(Get(KEY5), p_v5);
+
+    iter = db_->NewIterator(ropt);
+    iter->SeekToFirst();
+    count = 0;
+    for (; iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      key = (iter->key()).ToString(false);
+      value = (iter->value()).ToString(false);
+      if (key.compare(KEY3) == 0)
+        ASSERT_EQ(value, p_v3b);
+      else if (key.compare(KEY4) == 0)
+        ASSERT_EQ(value, p_v4);
+      else if (key.compare(KEY5) == 0)
+        ASSERT_EQ(value, p_v5);
+      else
+        ASSERT_EQ(value, NOT_FOUND);
+      count++;
+    }
+
+    // Expected count here is 3: KEY3, KEY4, KEY5.
+    ASSERT_EQ(count, EXPECTED_COUNT_FORLOOP);
+    if (iter) {
+      delete iter;
+    }
+  }
+
+  // Check that there was at least one mempurge
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there was no SST files created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 0;
+
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Additional test for the iterator+memPurge.
+  ASSERT_OK(Put(KEY2, p_v2));
+  iter = db_->NewIterator(ropt);
+  iter->SeekToFirst();
+  ASSERT_OK(Put(KEY4, p_v4));
+  count = 0;
+  for (; iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    key = (iter->key()).ToString(false);
+    value = (iter->value()).ToString(false);
+    if (key.compare(KEY2) == 0)
+      ASSERT_EQ(value, p_v2);
+    else if (key.compare(KEY3) == 0)
+      ASSERT_EQ(value, p_v3b);
+    else if (key.compare(KEY4) == 0)
+      ASSERT_EQ(value, p_v4);
+    else if (key.compare(KEY5) == 0)
+      ASSERT_EQ(value, p_v5);
+    else
+      ASSERT_EQ(value, NOT_FOUND);
+    count++;
+  }
+
+  // Expected count here is 4: KEY2, KEY3, KEY4, KEY5.
+  ASSERT_EQ(count, EXPECTED_COUNT_END);
+  if (iter) delete iter;
+
+  Close();
+}
+
+// Create a Compaction Fitler that will be invoked
+// at flush time and will update the value of a KV pair
+// if the key string is "lower" than the filter_key_ string.
+class ConditionalUpdateFilter : public CompactionFilter {
+ public:
+  explicit ConditionalUpdateFilter(const std::string* filtered_key)
+      : filtered_key_(filtered_key) {}
+  bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+              std::string* new_value, bool* value_changed) const override {
+    // If key<filtered_key_, update the value of the KV-pair.
+    if (key.compare(*filtered_key_) < 0) {
+      assert(new_value != nullptr);
+      *new_value = NEW_VALUE;
+      *value_changed = true;
+    }
+    return false /*do not remove this KV-pair*/;
+  }
+
+  const char* Name() const override { return "ConditionalUpdateFilter"; }
+
+ private:
+  const std::string* filtered_key_;
+};
+
+class ConditionalUpdateFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ConditionalUpdateFilterFactory(const Slice& filtered_key)
+      : filtered_key_(filtered_key.ToString()) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>(
+        new ConditionalUpdateFilter(&filtered_key_));
+  }
+
+  const char* Name() const override { return "ConditionalUpdateFilterFactory"; }
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    // This compaction filter will be invoked
+    // at flush time (and therefore at MemPurge time).
+    return (reason == TableFileCreationReason::kFlush);
+  }
+
+ private:
+  std::string filtered_key_;
+};
+
+TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) {
+  Options options = CurrentOptions();
+
+  std::string KEY1 = "ThisIsKey1";
+  std::string KEY2 = "ThisIsKey2";
+  std::string KEY3 = "ThisIsKey3";
+  std::string KEY4 = "ThisIsKey4";
+  std::string KEY5 = "ThisIsKey5";
+  std::string KEY6 = "ThisIsKey6";
+  std::string KEY7 = "ThisIsKey7";
+  std::string KEY8 = "ThisIsKey8";
+  std::string KEY9 = "ThisIsKey9";
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+#ifndef ROCKSDB_LITE
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+#endif  // !ROCKSDB_LITE
+  // Create a ConditionalUpdate compaction filter
+  // that will update all the values of the KV pairs
+  // where the keys are "lower" than KEY4.
+  options.compaction_filter_factory =
+      std::make_shared<ConditionalUpdateFilterFactory>(KEY4);
+
+  // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Activate the MemPurge prototype.
+  options.experimental_mempurge_threshold = 26.55;
+
+  ASSERT_OK(TryReopen(options));
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(53);
+  const size_t NUM_REPEAT = 1000;
+  const size_t RAND_VALUES_LENGTH = 10240;
+  std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9;
+
+  p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+  p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+  ASSERT_OK(Put(KEY1, p_v1));
+  ASSERT_OK(Put(KEY2, p_v2));
+  ASSERT_OK(Put(KEY3, p_v3));
+  ASSERT_OK(Put(KEY4, p_v4));
+  ASSERT_OK(Put(KEY5, p_v5));
+  ASSERT_OK(Delete(KEY1));
+
+  // Insertion of of K-V pairs, multiple times.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary
+    // length RAND_VALUES_LENGTH bytes.
+    p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
+    p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY6, p_v6));
+    ASSERT_OK(Put(KEY7, p_v7));
+    ASSERT_OK(Put(KEY8, p_v8));
+    ASSERT_OK(Put(KEY9, p_v9));
+
+    ASSERT_OK(Delete(KEY7));
+  }
+
+  // Check that there was at least one mempurge
+  const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there was no SST files created during flush.
+  const uint32_t EXPECTED_SST_COUNT = 0;
+
+  EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+  EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Verify that the ConditionalUpdateCompactionFilter
+  // updated the values of KEY2 and KEY3, and not KEY4 and KEY5.
+  ASSERT_EQ(Get(KEY1), NOT_FOUND);
+  ASSERT_EQ(Get(KEY2), NEW_VALUE);
+  ASSERT_EQ(Get(KEY3), NEW_VALUE);
+  ASSERT_EQ(Get(KEY4), p_v4);
+  ASSERT_EQ(Get(KEY5), p_v5);
+}
+
+TEST_F(DBFlushTest, DISABLED_MemPurgeWALSupport) {
+  Options options = CurrentOptions();
+
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 128KB.
+  options.write_buffer_size = 128 << 10;
+  // Activate the MemPurge prototype
+  // (values >1.0 are equivalent to 1.0).
+  options.experimental_mempurge_threshold = 2.5;
+
+  ASSERT_OK(TryReopen(options));
+
+  const size_t KVSIZE = 10;
+
+  do {
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "baz", "v5"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v1", Get(1, "foo"));
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+    ASSERT_OK(Put(0, "bar", "v2"));
+    ASSERT_OK(Put(1, "bar", "v2"));
+    ASSERT_OK(Put(1, "foo", "v3"));
+    std::atomic<uint32_t> mempurge_count{0};
+    std::atomic<uint32_t> sst_count{0};
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::FlushJob:MemPurgeSuccessful",
+        [&](void* /*arg*/) { mempurge_count++; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    std::vector<std::string> keys;
+    for (size_t k = 0; k < KVSIZE; k++) {
+      keys.push_back("IamKey" + std::to_string(k));
+    }
+
+    std::string RNDKEY, RNDVALUE;
+    const std::string NOT_FOUND = "NOT_FOUND";
+
+    // Heavy overwrite workload,
+    // more than would fit in maximum allowed memtables.
+    Random rnd(719);
+    const size_t NUM_REPEAT = 100;
+    const size_t RAND_KEY_LENGTH = 4096;
+    const size_t RAND_VALUES_LENGTH = 1024;
+    std::vector<std::string> values_default(KVSIZE), values_pikachu(KVSIZE);
+
+    // Insert a very first set of keys that will be
+    // mempurged at least once.
+    for (size_t k = 0; k < KVSIZE / 2; k++) {
+      values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+      values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+    }
+
+    // Insert keys[0:KVSIZE/2] to
+    // both 'default' and 'pikachu' CFs.
+    for (size_t k = 0; k < KVSIZE / 2; k++) {
+      ASSERT_OK(Put(0, keys[k], values_default[k]));
+      ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
+    }
+
+    // Check that the insertion was seamless.
+    for (size_t k = 0; k < KVSIZE / 2; k++) {
+      ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+      ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+    }
+
+    // Insertion of of K-V pairs, multiple times (overwrites)
+    // into 'default' CF. Will trigger mempurge.
+    for (size_t j = 0; j < NUM_REPEAT; j++) {
+      // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+        values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+      }
+
+      // Insert K-V into default CF.
+      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+        ASSERT_OK(Put(0, keys[k], values_default[k]));
+      }
+
+      // Check key validity, for all keys, both in
+      // default and pikachu CFs.
+      for (size_t k = 0; k < KVSIZE; k++) {
+        ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+      }
+      // Note that at this point, only keys[0:KVSIZE/2]
+      // have been inserted into Pikachu.
+      for (size_t k = 0; k < KVSIZE / 2; k++) {
+        ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+      }
+    }
+
+    // Insertion of of K-V pairs, multiple times (overwrites)
+    // into 'pikachu' CF. Will trigger mempurge.
+    // Check that we keep the older logs for 'default' imm().
+    for (size_t j = 0; j < NUM_REPEAT; j++) {
+      // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+        values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+      }
+
+      // Insert K-V into pikachu CF.
+      for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+        ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
+      }
+
+      // Check key validity, for all keys,
+      // both in default and pikachu.
+      for (size_t k = 0; k < KVSIZE; k++) {
+        ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+        ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+      }
+    }
+
+    // Check that there was at least one mempurge
+    const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+    // Check that there was no SST files created during flush.
+    const uint32_t EXPECTED_SST_COUNT = 0;
+
+    EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+    if (options.experimental_mempurge_threshold ==
+        std::numeric_limits<double>::max()) {
+      EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+    }
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Check that there was no data corruption anywhere,
+    // not in 'default' nor in 'Pikachu' CFs.
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v4"));
+    ASSERT_EQ("v4", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+    // Check keys in 'Default' and 'Pikachu'.
+    // keys[0:KVSIZE/2] were for sure contained
+    // in the imm() at Reopen/recovery time.
+    for (size_t k = 0; k < KVSIZE; k++) {
+      ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+      ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+    }
+    // Insertion of random K-V pairs to trigger
+    // a flush in the Pikachu CF.
+    for (size_t j = 0; j < NUM_REPEAT; j++) {
+      RNDKEY = rnd.RandomString(RAND_KEY_LENGTH);
+      RNDVALUE = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(1, RNDKEY, RNDVALUE));
+    }
+    // ASsert than there was at least one flush to storage.
+    EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v4", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+    // Since values in default are held in mutable mem()
+    // and imm(), check if the flush in pikachu didn't
+    // affect these values.
+    for (size_t k = 0; k < KVSIZE; k++) {
+      ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+      ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+    }
+    ASSERT_EQ(Get(1, RNDKEY), RNDVALUE);
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBFlushTest, MemPurgeCorrectLogNumberAndSSTFileCreation) {
+  // Before our bug fix, we noticed that when 2 memtables were
+  // being flushed (with one memtable being the output of a
+  // previous MemPurge and one memtable being a newly-sealed memtable),
+  // the SST file created was not properly added to the DB version
+  // (via the VersionEdit obj), leading to data loss (the SST file
+  // was later being purged as an obsolete file).
+  // Therefore, we reproduce this scenario to test our fix.
+  Options options = CurrentOptions();
+
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (64MB = 1048576 bytes).
+  options.write_buffer_size = 1 << 20;
+  // Activate the MemPurge prototype.
+  options.experimental_mempurge_threshold = 1.0;
+
+  // Force to have more than one memtable to trigger a flush.
+  // For some reason this option does not seem to be enforced,
+  // so the following test is designed to make sure that we
+  // are testing the correct test case.
+  options.min_write_buffer_number_to_merge = 3;
+  options.max_write_buffer_number = 5;
+  options.max_write_buffer_size_to_maintain = 2 * (options.write_buffer_size);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(TryReopen(options));
+
+  std::atomic<uint32_t> mempurge_count{0};
+  std::atomic<uint32_t> sst_count{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:MemPurgeSuccessful",
+      [&](void* /*arg*/) { mempurge_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Dummy variable used for the following callback function.
+  uint64_t ZERO = 0;
+  // We will first execute mempurge operations exclusively.
+  // Therefore, when the first flush is triggered, we want to make
+  // sure there is at least 2 memtables being flushed: one output
+  // from a previous mempurge, and one newly sealed memtable.
+  // This is when we observed in the past that some SST files created
+  // were not properly added to the DB version (via the VersionEdit obj).
+  std::atomic<uint64_t> num_memtable_at_first_flush(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:num_memtables", [&](void* arg) {
+        uint64_t* mems_size = reinterpret_cast<uint64_t*>(arg);
+        // atomic_compare_exchange_strong sometimes updates the value
+        // of ZERO (the "expected" object), so we make sure ZERO is indeed...
+        // zero.
+        ZERO = 0;
+        std::atomic_compare_exchange_strong(&num_memtable_at_first_flush, &ZERO,
+                                            *mems_size);
+      });
+
+  const std::vector<std::string> KEYS = {
+      "ThisIsKey1", "ThisIsKey2", "ThisIsKey3", "ThisIsKey4", "ThisIsKey5",
+      "ThisIsKey6", "ThisIsKey7", "ThisIsKey8", "ThisIsKey9"};
+  const std::string NOT_FOUND = "NOT_FOUND";
+
+  Random rnd(117);
+  const uint64_t NUM_REPEAT_OVERWRITES = 100;
+  const uint64_t NUM_RAND_INSERTS = 500;
+  const uint64_t RAND_VALUES_LENGTH = 10240;
+
+  std::string key, value;
+  std::vector<std::string> values(9, "");
+
+  // Keys used to check that no SST file disappeared.
+  for (uint64_t k = 0; k < 5; k++) {
+    values[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEYS[k], values[k]));
+  }
+
+  // Insertion of of K-V pairs, multiple times.
+  // Trigger at least one mempurge and no SST file creation.
+  for (size_t i = 0; i < NUM_REPEAT_OVERWRITES; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    for (uint64_t k = 5; k < values.size(); k++) {
+      values[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+      ASSERT_OK(Put(KEYS[k], values[k]));
+    }
+    // Check database consistency.
+    for (uint64_t k = 0; k < values.size(); k++) {
+      ASSERT_EQ(Get(KEYS[k]), values[k]);
+    }
+  }
+
+  // Check that there was at least one mempurge
+  uint32_t expected_min_mempurge_count = 1;
+  // Check that there was no SST files created during flush.
+  uint32_t expected_sst_count = 0;
+  EXPECT_GE(mempurge_count.load(), expected_min_mempurge_count);
+  EXPECT_EQ(sst_count.load(), expected_sst_count);
+
+  // Trigger an SST file creation and no mempurge.
+  for (size_t i = 0; i < NUM_RAND_INSERTS; i++) {
+    key = rnd.RandomString(RAND_VALUES_LENGTH);
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    value = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(key, value));
+    // Check database consistency.
+    for (uint64_t k = 0; k < values.size(); k++) {
+      ASSERT_EQ(Get(KEYS[k]), values[k]);
+    }
+    ASSERT_EQ(Get(key), value);
+  }
+
+  // Check that there was at least one SST files created during flush.
+  expected_sst_count = 1;
+  EXPECT_GE(sst_count.load(), expected_sst_count);
+
+  // Oddly enough, num_memtable_at_first_flush is not enforced to be
+  // equal to min_write_buffer_number_to_merge. So by asserting that
+  // the first SST file creation comes from one output memtable
+  // from a previous mempurge, and one newly sealed memtable. This
+  // is the scenario where we observed that some SST files created
+  // were not properly added to the DB version before our bug fix.
+  ASSERT_GE(num_memtable_at_first_flush.load(), 2);
+
+  // Check that no data was lost after SST file creation.
+  for (uint64_t k = 0; k < values.size(); k++) {
+    ASSERT_EQ(Get(KEYS[k]), values[k]);
+  }
+  // Extra check of database consistency.
+  ASSERT_EQ(Get(key), value);
+
+  Close();
+}
+
+TEST_P(DBFlushDirectIOTest, DirectIO) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.max_background_flushes = 2;
+  options.use_direct_io_for_flush_and_compaction = GetParam();
+  options.env = MockEnv::Create(Env::Default());
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:create_file", [&](void* arg) {
+        bool* use_direct_writes = static_cast<bool*>(arg);
+        ASSERT_EQ(*use_direct_writes,
+                  options.use_direct_io_for_flush_and_compaction);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+  ASSERT_OK(Put("foo", "v"));
+  FlushOptions flush_options;
+  flush_options.wait = true;
+  ASSERT_OK(dbfull()->Flush(flush_options));
+  Destroy(options);
+  delete options.env;
+}
+
+TEST_F(DBFlushTest, FlushError) {
+  Options options;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  options.disable_auto_compactions = true;
+  options.env = fault_injection_env.get();
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  fault_injection_env->SetFilesystemActive(false);
+  Status s = dbfull()->TEST_SwitchMemtable();
+  fault_injection_env->SetFilesystemActive(true);
+  Destroy(options);
+  ASSERT_NE(s, Status::OK());
+}
+
+TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) {
+  // Regression test for bug where manual flush hangs forever when the DB
+  // is in read-only mode. Verify it now at least returns, despite failing.
+  Options options;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_injection_env.get();
+  options.max_write_buffer_number = 2;
+  Reopen(options);
+
+  // Trigger a first flush but don't let it run
+  ASSERT_OK(db_->PauseBackgroundWork());
+  ASSERT_OK(Put("key1", "value1"));
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(db_->Flush(flush_opts));
+
+  // Write a key to the second memtable so we have something to flush later
+  // after the DB is in read-only mode.
+  ASSERT_OK(Put("key2", "value2"));
+
+  // Let the first flush continue, hit an error, and put the DB in read-only
+  // mode.
+  fault_injection_env->SetFilesystemActive(false);
+  ASSERT_OK(db_->ContinueBackgroundWork());
+  // We ingested the error to env, so the returned status is not OK.
+  ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+  uint64_t num_bg_errors;
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBackgroundErrors, &num_bg_errors));
+  ASSERT_GT(num_bg_errors, 0);
+#endif  // ROCKSDB_LITE
+
+  // In the bug scenario, triggering another flush would cause the second flush
+  // to hang forever. After the fix we expect it to return an error.
+  ASSERT_NOK(db_->Flush(FlushOptions()));
+
+  Close();
+}
+
+TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:AfterScheduleFlush",
+        "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+       {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+        "DBImpl::BackgroundCallFlush:start"},
+       {"DBImpl::BackgroundCallFlush:start",
+        "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_EQ(2, handles_.size());
+  ASSERT_OK(Put(1, "key", "value"));
+  auto* cfd = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+  port::Thread drop_cf_thr([&]() {
+    TEST_SYNC_POINT(
+        "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+    ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+    handles_.resize(1);
+    TEST_SYNC_POINT(
+        "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+  });
+  FlushOptions flush_opts;
+  flush_opts.allow_write_stall = true;
+  ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts));
+  drop_cf_thr.join();
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) {
+  class TestListener : public EventListener {
+   public:
+    void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+      // There's only one key in each flush.
+      ASSERT_EQ(info.smallest_seqno, info.largest_seqno);
+      ASSERT_NE(0, info.smallest_seqno);
+      if (info.smallest_seqno == seq1) {
+        // First flush completed
+        ASSERT_FALSE(completed1);
+        completed1 = true;
+        CheckFlushResultCommitted(db, seq1);
+      } else {
+        // Second flush completed
+        ASSERT_FALSE(completed2);
+        completed2 = true;
+        ASSERT_EQ(info.smallest_seqno, seq2);
+        CheckFlushResultCommitted(db, seq2);
+      }
+    }
+
+    void CheckFlushResultCommitted(DB* db, SequenceNumber seq) {
+      DBImpl* db_impl = static_cast_with_check<DBImpl>(db);
+      InstrumentedMutex* mutex = db_impl->mutex();
+      mutex->Lock();
+      auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                      db->DefaultColumnFamily())
+                      ->cfd();
+      ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber());
+      mutex->Unlock();
+    }
+
+    std::atomic<SequenceNumber> seq1{0};
+    std::atomic<SequenceNumber> seq2{0};
+    std::atomic<bool> completed1{false};
+    std::atomic<bool> completed2{false};
+  };
+  std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables",
+        "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"},
+       {"DBImpl::FlushMemTableToOutputFile:Finish",
+        "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table", [&listener](void* arg) {
+        // Wait for the second flush finished, out of mutex.
+        auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
+        if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) {
+          TEST_SYNC_POINT(
+              "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:"
+              "WaitSecond");
+        }
+      });
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.listeners.push_back(listener);
+  // Setting max_flush_jobs = max_background_jobs / 4 = 2.
+  options.max_background_jobs = 8;
+  // Allow 2 immutable memtables.
+  options.max_write_buffer_number = 3;
+  Reopen(options);
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("foo", "v"));
+  listener->seq1 = db_->GetLatestSequenceNumber();
+  // t1 will wait for the second flush complete before committing flush result.
+  auto t1 = port::Thread([&]() {
+    // flush_opts.wait = true
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  });
+  // Wait for first flush started.
+  TEST_SYNC_POINT(
+      "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst");
+  // The second flush will exit early without commit its result. The work
+  // is delegated to the first flush.
+  ASSERT_OK(Put("bar", "v"));
+  listener->seq2 = db_->GetLatestSequenceNumber();
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(db_->Flush(flush_opts));
+  t1.join();
+  // Ensure background work is fully finished including listener callbacks
+  // before accessing listener state.
+  ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  ASSERT_TRUE(listener->completed1);
+  ASSERT_TRUE(listener->completed2);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBFlushTest, FlushWithBlob) {
+  constexpr uint64_t min_blob_size = 10;
+
+  Options options;
+  options.enable_blob_files = true;
+  options.min_blob_size = min_blob_size;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  Reopen(options);
+
+  constexpr char short_value[] = "short";
+  static_assert(sizeof(short_value) - 1 < min_blob_size,
+                "short_value too long");
+
+  constexpr char long_value[] = "long_value";
+  static_assert(sizeof(long_value) - 1 >= min_blob_size,
+                "long_value too short");
+
+  ASSERT_OK(Put("key1", short_value));
+  ASSERT_OK(Put("key2", long_value));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("key1"), short_value);
+  ASSERT_EQ(Get("key2"), long_value);
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  Version* const current = cfd->current();
+  assert(current);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  assert(storage_info);
+
+  const auto& l0_files = storage_info->LevelFiles(0);
+  ASSERT_EQ(l0_files.size(), 1);
+
+  const FileMetaData* const table_file = l0_files[0];
+  assert(table_file);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 1);
+
+  const auto& blob_file = blob_files.front();
+  assert(blob_file);
+
+  ASSERT_EQ(table_file->smallest.user_key(), "key1");
+  ASSERT_EQ(table_file->largest.user_key(), "key2");
+  ASSERT_EQ(table_file->fd.smallest_seqno, 1);
+  ASSERT_EQ(table_file->fd.largest_seqno, 2);
+  ASSERT_EQ(table_file->oldest_blob_file_number,
+            blob_file->GetBlobFileNumber());
+
+  ASSERT_EQ(blob_file->GetTotalBlobCount(), 1);
+
+#ifndef ROCKSDB_LITE
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  assert(internal_stats);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_FALSE(compaction_stats.empty());
+  ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize());
+  ASSERT_EQ(compaction_stats[0].bytes_written_blob,
+            blob_file->GetTotalBlobBytes());
+  ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+  ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1);
+
+  const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue();
+  ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+            compaction_stats[0].bytes_written +
+                compaction_stats[0].bytes_written_blob);
+#endif  // ROCKSDB_LITE
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoff1) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  options.disable_auto_compactions = true;
+  options.env = fault_fs_env.get();
+  options.checksum_handoff_file_types.Add(FileType::kTableFile);
+  Reopen(options);
+
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+  // The hash does not match, write fails
+  // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  });
+  ASSERT_OK(Put("key3", "value3"));
+  ASSERT_OK(Put("key4", "value4"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = Flush();
+  ASSERT_EQ(s.severity(),
+            ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  ASSERT_OK(Put("key5", "value5"));
+  ASSERT_OK(Put("key6", "value6"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+  // Each write will be similated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs->IngestDataCorruptionBeforeWrite();
+  });
+  ASSERT_OK(Put("key7", "value7"));
+  ASSERT_OK(Put("key8", "value8"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(),
+            ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoff2) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  options.disable_auto_compactions = true;
+  options.env = fault_fs_env.get();
+  Reopen(options);
+
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(Flush());
+
+  // options is not set, the checksum handoff will not be triggered
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  });
+  ASSERT_OK(Put("key3", "value3"));
+  ASSERT_OK(Put("key4", "value4"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+  Reopen(options);
+
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  ASSERT_OK(Put("key5", "value5"));
+  ASSERT_OK(Put("key6", "value6"));
+  ASSERT_OK(Flush());
+
+  // options is not set, the checksum handoff will not be triggered
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs->IngestDataCorruptionBeforeWrite();
+  });
+  ASSERT_OK(Put("key7", "value7"));
+  ASSERT_OK(Put("key8", "value8"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest1) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  options.disable_auto_compactions = true;
+  options.env = fault_fs_env.get();
+  options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Put("key2", "value2"));
+  ASSERT_OK(Flush());
+
+  // The hash does not match, write fails
+  // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+  // Since the file system returns IOStatus::Corruption, it is mapped to
+  // kFatalError error.
+  ASSERT_OK(Put("key3", "value3"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+      });
+  ASSERT_OK(Put("key3", "value3"));
+  ASSERT_OK(Put("key4", "value4"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest2) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 3;
+  options.disable_auto_compactions = true;
+  options.env = fault_fs_env.get();
+  options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+  Reopen(options);
+  // The file system does not support checksum handoff. The check
+  // will be ignored.
+  ASSERT_OK(Put("key5", "value5"));
+  ASSERT_OK(Put("key6", "value6"));
+  ASSERT_OK(Flush());
+
+  // Each write will be similated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is mapped to
+  // kFatalError error.
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+  ASSERT_OK(Put("key7", "value7"));
+  ASSERT_OK(Put("key8", "value8"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Destroy(options);
+}
+
+TEST_F(DBFlushTest, PickRightMemtables) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  options.create_if_missing = true;
+
+  const std::string test_cf_name = "test_cf";
+  options.max_write_buffer_number = 128;
+  CreateColumnFamilies({test_cf_name}, options);
+
+  Close();
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, test_cf_name}, options);
+
+  ASSERT_OK(db_->Put(WriteOptions(), "key", "value"));
+
+  ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "key", "value"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncClosedLogs:BeforeReLock", [&](void* /*arg*/) {
+        ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "what", "v"));
+        auto* cfhi =
+            static_cast_with_check<ColumnFamilyHandleImpl>(handles_[1]);
+        assert(cfhi);
+        ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfhi->cfd()));
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", [&](void* arg) {
+        auto* job = reinterpret_cast<FlushJob*>(arg);
+        assert(job);
+        const auto& mems = job->GetMemTables();
+        assert(mems.size() == 1);
+        assert(mems[0]);
+        ASSERT_EQ(1, mems[0]->GetID());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class DBFlushTestBlobError : public DBFlushTest,
+                             public testing::WithParamInterface<std::string> {
+ public:
+  DBFlushTestBlobError() : sync_point_(GetParam()) {}
+
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBFlushTestBlobError, DBFlushTestBlobError,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileBuilder::WriteBlobToFile:AddRecord",
+                            "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBFlushTestBlobError, FlushError) {
+  Options options;
+  options.enable_blob_files = true;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("key", "blob"));
+
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+    Status* const s = static_cast<Status*>(arg);
+    assert(s);
+
+    (*s) = Status::IOError(sync_point_);
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_NOK(Flush());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  Version* const current = cfd->current();
+  assert(current);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  assert(storage_info);
+
+  const auto& l0_files = storage_info->LevelFiles(0);
+  ASSERT_TRUE(l0_files.empty());
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_TRUE(blob_files.empty());
+
+  // Make sure the files generated by the failed job have been deleted
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  for (const auto& file : files) {
+    uint64_t number = 0;
+    FileType type = kTableFile;
+
+    if (!ParseFileName(file, &number, &type)) {
+      continue;
+    }
+
+    ASSERT_NE(type, kTableFile);
+    ASSERT_NE(type, kBlobFile);
+  }
+
+#ifndef ROCKSDB_LITE
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  assert(internal_stats);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_FALSE(compaction_stats.empty());
+
+  if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") {
+    ASSERT_EQ(compaction_stats[0].bytes_written, 0);
+    ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0);
+    ASSERT_EQ(compaction_stats[0].num_output_files, 0);
+    ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0);
+  } else {
+    // SST file writing succeeded; blob file writing failed (during Finish)
+    ASSERT_GT(compaction_stats[0].bytes_written, 0);
+    ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0);
+    ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+    ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0);
+  }
+
+  const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue();
+  ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+            compaction_stats[0].bytes_written +
+                compaction_stats[0].bytes_written_blob);
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) {
+  class SimpleTestFlushListener : public EventListener {
+   public:
+    explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {}
+    ~SimpleTestFlushListener() override {}
+
+    void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
+      ASSERT_EQ(static_cast<uint32_t>(0), info.cf_id);
+
+      ASSERT_OK(db->Delete(WriteOptions(), "foo"));
+      snapshot_ = db->GetSnapshot();
+      ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+
+      auto* dbimpl = static_cast_with_check<DBImpl>(db);
+      assert(dbimpl);
+
+      ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+      auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+      assert(cfhi);
+      ASSERT_OK(dbimpl->TEST_SwitchMemtable(cfhi->cfd()));
+    }
+
+    DBFlushTest* test_ = nullptr;
+    const Snapshot* snapshot_ = nullptr;
+  };
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  auto* listener = new SimpleTestFlushListener(this);
+  options.listeners.emplace_back(listener);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0"));
+
+  ManagedSnapshot snapshot_guard(db_);
+
+  ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+  ASSERT_OK(db_->Flush(FlushOptions(), default_cf));
+
+  const Snapshot* snapshot = listener->snapshot_;
+  assert(snapshot);
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+
+  // Using snapshot should not see "foo".
+  {
+    std::string value;
+    Status s = db_->Get(read_opts, "foo", &value);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.allow_2pc = true;
+  options.atomic_flush = GetParam();
+  // 64MB so that memtable flush won't be trigger by the small writes.
+  options.write_buffer_size = (static_cast<size_t>(64) << 20);
+
+  // Destroy the DB to recreate as a TransactionDB.
+  Close();
+  Destroy(options, true);
+
+  // Create a TransactionDB.
+  TransactionDB* txn_db = nullptr;
+  TransactionDBOptions txn_db_opts;
+  txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+  ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
+  ASSERT_NE(txn_db, nullptr);
+  db_ = txn_db;
+
+  // Create two more columns other than default CF.
+  std::vector<std::string> cfs = {"puppy", "kitty"};
+  CreateColumnFamilies(cfs, options);
+  ASSERT_EQ(handles_.size(), 2);
+  ASSERT_EQ(handles_[0]->GetName(), cfs[0]);
+  ASSERT_EQ(handles_[1]->GetName(), cfs[1]);
+  const size_t kNumCfToFlush = options.atomic_flush ? 2 : 1;
+
+  WriteOptions wopts;
+  TransactionOptions txn_opts;
+  // txn1 only prepare, but does not commit.
+  // The WAL containing the prepared but uncommitted data must be kept.
+  Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  // txn2 not only prepare, but also commit.
+  Transaction* txn2 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_NE(txn1, nullptr);
+  ASSERT_NE(txn2, nullptr);
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    ASSERT_OK(txn1->Put(handles_[i], "k1", "v1"));
+    ASSERT_OK(txn2->Put(handles_[i], "k2", "v2"));
+  }
+  // A txn must be named before prepare.
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn2->SetName("txn2"));
+  // Prepare writes to WAL, but not to memtable. (WriteCommitted)
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn2->Prepare());
+  // Commit writes to memtable.
+  ASSERT_OK(txn2->Commit());
+  delete txn1;
+  delete txn2;
+
+  // There are still data in memtable not flushed.
+  // But since data is small enough to reside in the active memtable,
+  // there are no immutable memtable.
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+  }
+
+  // Atomic flush memtables,
+  // the min log with prepared data should be written to MANIFEST.
+  std::vector<ColumnFamilyHandle*> cfs_to_flush(kNumCfToFlush);
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    cfs_to_flush[i] = handles_[i];
+  }
+  ASSERT_OK(txn_db->Flush(FlushOptions(), cfs_to_flush));
+
+  // There are no remaining data in memtable after flush.
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+    ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+  }
+
+  // The recovered min log number with prepared data should be non-zero.
+  // In 2pc mode, MinLogNumberToKeep returns the
+  // VersionSet::min_log_number_to_keep recovered from MANIFEST, if it's 0,
+  // it means atomic flush didn't write the min_log_number_to_keep to MANIFEST.
+  cfs.push_back(kDefaultColumnFamilyName);
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
+  ASSERT_TRUE(db_impl->allow_2pc());
+  ASSERT_NE(db_impl->MinLogNumberToKeep(), 0);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = GetParam();
+  options.write_buffer_size = (static_cast<size_t>(64) << 20);
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+  }
+
+  for (size_t i = 0; i != num_cfs; ++i) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+  }
+
+  std::vector<int> cf_ids;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    cf_ids.emplace_back(static_cast<int>(i));
+  }
+  ASSERT_OK(Flush(cf_ids));
+
+  for (size_t i = 0; i != num_cfs; ++i) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+  }
+}
+
+TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = GetParam();
+  options.write_buffer_size = (static_cast<size_t>(64) << 20);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const size_t num_cfs = handles_.size();
+  ASSERT_EQ(num_cfs, 2);
+  WriteOptions wopts;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+  }
+
+  {
+    // Flush the default CF only.
+    std::vector<int> cf_ids{0};
+    ASSERT_OK(Flush(cf_ids));
+
+    autovector<ColumnFamilyData*> flushed_cfds;
+    autovector<autovector<VersionEdit*>> flush_edits;
+    auto flushed_cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[0]);
+    flushed_cfds.push_back(flushed_cfh->cfd());
+    flush_edits.push_back({});
+    auto unflushed_cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[1]);
+
+    ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(),
+                                                 flushed_cfds, flush_edits),
+              unflushed_cfh->cfd()->GetLogNumber());
+  }
+
+  {
+    // Flush all CFs.
+    std::vector<int> cf_ids;
+    for (size_t i = 0; i != num_cfs; ++i) {
+      cf_ids.emplace_back(static_cast<int>(i));
+    }
+    ASSERT_OK(Flush(cf_ids));
+    uint64_t log_num_after_flush = dbfull()->TEST_GetCurrentLogNumber();
+
+    uint64_t min_log_number_to_keep = std::numeric_limits<uint64_t>::max();
+    autovector<ColumnFamilyData*> flushed_cfds;
+    autovector<autovector<VersionEdit*>> flush_edits;
+    for (size_t i = 0; i != num_cfs; ++i) {
+      auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      flushed_cfds.push_back(cfh->cfd());
+      flush_edits.push_back({});
+      min_log_number_to_keep =
+          std::min(min_log_number_to_keep, cfh->cfd()->GetLogNumber());
+    }
+    ASSERT_EQ(min_log_number_to_keep, log_num_after_flush);
+    ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(),
+                                                 flushed_cfds, flush_edits),
+              min_log_number_to_keep);
+  }
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = GetParam();
+  // 4KB so that we can easily trigger auto flush.
+  options.write_buffer_size = 4096;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCallFlush:FlushFinish:0",
+        "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+  }
+  // Keep writing to one of them column families to trigger auto flush.
+  for (int i = 0; i != 4000; ++i) {
+    ASSERT_OK(Put(static_cast<int>(num_cfs) - 1 /*cf*/,
+                  "key" + std::to_string(i), "value" + std::to_string(i),
+                  wopts));
+  }
+
+  TEST_SYNC_POINT(
+      "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck");
+  if (options.atomic_flush) {
+    for (size_t i = 0; i + 1 != num_cfs; ++i) {
+      auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+      ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+    }
+  } else {
+    for (size_t i = 0; i + 1 != num_cfs; ++i) {
+      auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+      ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+      ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+    }
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  options.env = fault_injection_env.get();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1",
+        "DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"},
+       {"DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2",
+        "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_OK(Put(cf_id, "key", "value", wopts));
+  }
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+  TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1");
+  fault_injection_env->SetFilesystemActive(false);
+  TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2");
+  for (auto* cfh : handles_) {
+    // Returns the IO error happend during flush.
+    ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable(cfh));
+  }
+  for (size_t i = 0; i != num_cfs; ++i) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(1, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+  }
+  fault_injection_env->SetFilesystemActive(true);
+  Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  std::vector<int> cf_ids;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_OK(Put(cf_id, "key", "value", wopts));
+    cf_ids.push_back(cf_id);
+  }
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped());
+  Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest,
+       FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+        "DBAtomicFlushTest::BeforeDropCF"},
+       {"DBAtomicFlushTest::AfterDropCF",
+        "DBImpl::BackgroundCallFlush:start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(3, num_cfs);
+  WriteOptions wopts;
+  wopts.disableWAL = true;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_OK(Put(cf_id, "key", "value", wopts));
+  }
+  port::Thread user_thread([&]() {
+    TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF");
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+    TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF");
+  });
+  FlushOptions flush_opts;
+  flush_opts.wait = true;
+  ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+  user_thread.join();
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_EQ("value", Get(cf_id, "key"));
+  }
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options);
+  num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  for (size_t i = 0; i != num_cfs; ++i) {
+    int cf_id = static_cast<int>(i);
+    ASSERT_EQ("value", Get(cf_id, "key"));
+  }
+  Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  const int kNumKeysTriggerFlush = 4;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysTriggerFlush));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i != kNumKeysTriggerFlush; ++i) {
+    ASSERT_OK(Put(0, "key" + std::to_string(i), "value" + std::to_string(i)));
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put(0, "key", "value"));
+  Close();
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ("value", Get(0, "key"));
+}
+
+TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) {
+  bool atomic_flush = GetParam();
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  options.max_write_buffer_number = 4;
+  // Set min_write_buffer_number_to_merge to be greater than 1, so that
+  // a column family with one memtable in the imm will not cause IsFlushPending
+  // to return true when flush_requested_ is false.
+  options.min_write_buffer_number_to_merge = 2;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+  ASSERT_OK(dbfull()->PauseBackgroundWork());
+  ASSERT_OK(Put(0, "key00", "value00"));
+  ASSERT_OK(Put(1, "key10", "value10"));
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+  ASSERT_OK(Put(0, "key01", "value01"));
+  // Since max_write_buffer_number is 4, the following flush won't cause write
+  // stall.
+  ASSERT_OK(dbfull()->Flush(flush_opts));
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+  handles_[1] = nullptr;
+  ASSERT_OK(dbfull()->ContinueBackgroundWork());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+  delete handles_[0];
+  handles_.clear();
+}
+
+TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+        "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+       {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+        "DBImpl::BackgroundCallFlush:start"},
+       {"DBImpl::BackgroundCallFlush:start",
+        "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_EQ(2, handles_.size());
+  ASSERT_OK(Put(0, "key", "value"));
+  ASSERT_OK(Put(1, "key", "value"));
+  auto* cfd_default =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily())
+          ->cfd();
+  auto* cfd_pikachu = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+  port::Thread drop_cf_thr([&]() {
+    TEST_SYNC_POINT(
+        "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+    ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+    delete handles_[1];
+    handles_.resize(1);
+    TEST_SYNC_POINT(
+        "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+  });
+  FlushOptions flush_opts;
+  flush_opts.allow_write_stall = true;
+  ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu},
+                                                flush_opts));
+  drop_cf_thr.join();
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, RollbackAfterFailToInstallResults) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+  Options options = CurrentOptions();
+  options.env = fault_injection_env.get();
+  options.create_if_missing = true;
+  options.atomic_flush = atomic_flush;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+  for (size_t cf = 0; cf < handles_.size(); ++cf) {
+    ASSERT_OK(Put(static_cast<int>(cf), "a", "value"));
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+      [&](void* /*arg*/) { fault_injection_env->SetFilesystemActive(false); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  FlushOptions flush_opts;
+  Status s = db_->Flush(flush_opts, handles_);
+  ASSERT_NOK(s);
+  fault_injection_env->SetFilesystemActive(true);
+  Close();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In atomic flush, concurrent bg flush threads commit to the MANIFEST in
+// serial, in the order of their picked memtables for each column family.
+// Only when a bg flush thread finds out that its memtables are the earliest
+// unflushed ones for all the included column families will this bg flush
+// thread continue to commit to MANIFEST.
+// This unit test uses sync point to coordinate the execution of two bg threads
+// executing the same sequence of functions. The interleaving are as follows.
+// time            bg1                            bg2
+//  |   pick memtables to flush
+//  |   flush memtables cf1_m1, cf2_m1
+//  |   join MANIFEST write queue
+//  |                                     pick memtabls to flush
+//  |                                     flush memtables cf1_(m1+1)
+//  |                                     join MANIFEST write queue
+//  |                                     wait to write MANIFEST
+//  |   write MANIFEST
+//  |   IO error
+//  |                                     detect IO error and stop waiting
+//  V
+TEST_P(DBAtomicFlushTest, BgThreadNoWaitAfterManifestError) {
+  bool atomic_flush = GetParam();
+  if (!atomic_flush) {
+    return;
+  }
+  auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = true;
+  options.env = fault_injection_env.get();
+  // Set a larger value than default so that RocksDB can schedule concurrent
+  // background flush threads.
+  options.max_background_jobs = 8;
+  options.max_write_buffer_number = 8;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  assert(2 == handles_.size());
+
+  WriteOptions write_opts;
+  write_opts.disableWAL = true;
+
+  ASSERT_OK(Put(0, "a", "v_0_a", write_opts));
+  ASSERT_OK(Put(1, "a", "v_1_a", write_opts));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"BgFlushThr2:WaitToCommit", "BgFlushThr1:BeforeWriteManifest"},
+  });
+
+  std::thread::id bg_flush_thr1, bg_flush_thr2;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCallFlush:start", [&](void*) {
+        if (bg_flush_thr1 == std::thread::id()) {
+          bg_flush_thr1 = std::this_thread::get_id();
+        } else if (bg_flush_thr2 == std::thread::id()) {
+          bg_flush_thr2 = std::this_thread::get_id();
+        }
+      });
+
+  int called = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", [&](void* arg) {
+        if (std::this_thread::get_id() == bg_flush_thr2) {
+          const auto* ptr = reinterpret_cast<std::pair<Status, bool>*>(arg);
+          assert(ptr);
+          if (0 == called) {
+            // When bg flush thread 2 reaches here for the first time.
+            ASSERT_OK(ptr->first);
+            ASSERT_TRUE(ptr->second);
+          } else if (1 == called) {
+            // When bg flush thread 2 reaches here for the second time.
+            ASSERT_TRUE(ptr->first.IsIOError());
+            ASSERT_FALSE(ptr->second);
+          }
+          ++called;
+          TEST_SYNC_POINT("BgFlushThr2:WaitToCommit");
+        }
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+      [&](void*) {
+        if (std::this_thread::get_id() == bg_flush_thr1) {
+          TEST_SYNC_POINT("BgFlushThr1:BeforeWriteManifest");
+        }
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        if (std::this_thread::get_id() != bg_flush_thr1) {
+          return;
+        }
+        ASSERT_OK(db_->Put(write_opts, "b", "v_1_b"));
+
+        FlushOptions flush_opts;
+        flush_opts.wait = false;
+        std::vector<ColumnFamilyHandle*> cfhs(1, db_->DefaultColumnFamily());
+        ASSERT_OK(dbfull()->Flush(flush_opts, cfhs));
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) {
+        auto* ptr = reinterpret_cast<IOStatus*>(arg);
+        assert(ptr);
+        *ptr = IOStatus::IOError("Injected failure");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(dbfull()->Flush(FlushOptions(), handles_).IsIOError());
+
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBAtomicFlushTest, NoWaitWhenWritesStopped) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = GetParam();
+  options.max_write_buffer_number = 2;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+
+  Reopen(options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::DelayWrite:Start",
+        "DBAtomicFlushTest::NoWaitWhenWritesStopped:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->PauseBackgroundWork());
+  for (int i = 0; i < options.max_write_buffer_number; ++i) {
+    ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i)));
+  }
+  std::thread stalled_writer([&]() { ASSERT_OK(Put("k", "v")); });
+
+  TEST_SYNC_POINT("DBAtomicFlushTest::NoWaitWhenWritesStopped:0");
+
+  {
+    FlushOptions flush_opts;
+    flush_opts.wait = false;
+    flush_opts.allow_write_stall = true;
+    ASSERT_TRUE(db_->Flush(flush_opts).IsTryAgain());
+  }
+
+  ASSERT_OK(dbfull()->ContinueBackgroundWork());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  stalled_writer.join();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,
+                        testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_impl/compacted_db_impl.cc b/src/rocksdb/db/db_impl/compacted_db_impl.cc
new file mode 100644
index 000000000..f18ee0d72
--- /dev/null
+++ b/src/rocksdb/db/db_impl/compacted_db_impl.cc
@@ -0,0 +1,257 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/db_impl/compacted_db_impl.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "table/get_context.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+                      const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
+                                 const std::string& dbname)
+    : DBImpl(options, dbname, /*seq_per_batch*/ false, +/*batch_per_txn*/ true,
+             /*read_only*/ true),
+      cfd_(nullptr),
+      version_(nullptr),
+      user_comparator_(nullptr) {}
+
+CompactedDBImpl::~CompactedDBImpl() {}
+
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+  size_t right = files_.num_files - 1;
+  auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+    return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
+  };
+  return static_cast<size_t>(
+      std::lower_bound(files_.files, files_.files + right, key, cmp) -
+      files_.files);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+                            const Slice& key, PinnableSlice* value) {
+  return Get(options, /*column_family*/ nullptr, key, value,
+             /*timestamp*/ nullptr);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+                            const Slice& key, PinnableSlice* value,
+                            std::string* timestamp) {
+  assert(user_comparator_);
+  if (options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        DefaultColumnFamily(), *(options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s = FailIfCfHasTs(DefaultColumnFamily());
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between tombstone or key that has never been written
+  if (timestamp) {
+    timestamp->clear();
+  }
+
+  GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+  std::string* ts =
+      user_comparator_->timestamp_size() > 0 ? timestamp : nullptr;
+  LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, lkey.user_key(), value,
+                         /*columns=*/nullptr, ts, nullptr, nullptr, true,
+                         nullptr, nullptr, nullptr, nullptr, &read_cb);
+
+  const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+  if (user_comparator_->CompareWithoutTimestamp(
+          key, /*a_has_ts=*/false,
+          ExtractUserKeyAndStripTimestamp(f.smallest_key,
+                                          user_comparator_->timestamp_size()),
+          /*b_has_ts=*/false) < 0) {
+    return Status::NotFound();
+  }
+  Status s = f.fd.table_reader->Get(options, lkey.internal_key(), &get_context,
+                                    nullptr);
+  if (!s.ok() && !s.IsNotFound()) {
+    return s;
+  }
+  if (get_context.State() == GetContext::kFound) {
+    return Status::OK();
+  }
+  return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  return MultiGet(options, keys, values, /*timestamps*/ nullptr);
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+    const std::vector<Slice>& keys, std::vector<std::string>* values,
+    std::vector<std::string>* timestamps) {
+  assert(user_comparator_);
+  size_t num_keys = keys.size();
+
+  if (options.timestamp) {
+    Status s = FailIfTsMismatchCf(DefaultColumnFamily(), *(options.timestamp),
+                                  /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return std::vector<Status>(num_keys, s);
+    }
+  } else {
+    Status s = FailIfCfHasTs(DefaultColumnFamily());
+    if (!s.ok()) {
+      return std::vector<Status>(num_keys, s);
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between tombstone or key that has never been written
+  if (timestamps) {
+    for (auto& ts : *timestamps) {
+      ts.clear();
+    }
+  }
+
+  GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+  autovector<TableReader*, 16> reader_list;
+  for (const auto& key : keys) {
+    LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+    const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+    if (user_comparator_->CompareWithoutTimestamp(
+            key, /*a_has_ts=*/false,
+            ExtractUserKeyAndStripTimestamp(f.smallest_key,
+                                            user_comparator_->timestamp_size()),
+            /*b_has_ts=*/false) < 0) {
+      reader_list.push_back(nullptr);
+    } else {
+      f.fd.table_reader->Prepare(lkey.internal_key());
+      reader_list.push_back(f.fd.table_reader);
+    }
+  }
+  std::vector<Status> statuses(num_keys, Status::NotFound());
+  values->resize(num_keys);
+  if (timestamps) {
+    timestamps->resize(num_keys);
+  }
+  int idx = 0;
+  for (auto* r : reader_list) {
+    if (r != nullptr) {
+      PinnableSlice pinnable_val;
+      std::string& value = (*values)[idx];
+      LookupKey lkey(keys[idx], kMaxSequenceNumber, options.timestamp);
+      std::string* timestamp = timestamps ? &(*timestamps)[idx] : nullptr;
+      GetContext get_context(
+          user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound,
+          lkey.user_key(), &pinnable_val, /*columns=*/nullptr,
+          user_comparator_->timestamp_size() > 0 ? timestamp : nullptr, nullptr,
+          nullptr, true, nullptr, nullptr, nullptr, nullptr, &read_cb);
+      Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr);
+      assert(static_cast<size_t>(idx) < statuses.size());
+      if (!s.ok() && !s.IsNotFound()) {
+        statuses[idx] = s;
+      } else {
+        value.assign(pinnable_val.data(), pinnable_val.size());
+        if (get_context.State() == GetContext::kFound) {
+          statuses[idx] = Status::OK();
+        }
+      }
+    }
+    ++idx;
+  }
+  return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  mutex_.Lock();
+  ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+                            ColumnFamilyOptions(options));
+  Status s = Recover({cf}, true /* read only */, false, true);
+  if (s.ok()) {
+    cfd_ = static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+               ->cfd();
+    cfd_->InstallSuperVersion(&sv_context, &mutex_);
+  }
+  mutex_.Unlock();
+  sv_context.Clean();
+  if (!s.ok()) {
+    return s;
+  }
+  NewThreadStatusCfInfo(cfd_);
+  version_ = cfd_->GetSuperVersion()->current;
+  user_comparator_ = cfd_->user_comparator();
+  auto* vstorage = version_->storage_info();
+  if (vstorage->num_non_empty_levels() == 0) {
+    return Status::NotSupported("no file exists");
+  }
+  const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+  // L0 should not have files
+  if (l0.num_files > 1) {
+    return Status::NotSupported("L0 contain more than 1 file");
+  }
+  if (l0.num_files == 1) {
+    if (vstorage->num_non_empty_levels() > 1) {
+      return Status::NotSupported("Both L0 and other level contain files");
+    }
+    files_ = l0;
+    return Status::OK();
+  }
+
+  for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+    if (vstorage->LevelFilesBrief(i).num_files > 0) {
+      return Status::NotSupported("Other levels also contain files");
+    }
+  }
+
+  int level = vstorage->num_non_empty_levels() - 1;
+  if (vstorage->LevelFilesBrief(level).num_files > 0) {
+    files_ = vstorage->LevelFilesBrief(level);
+    return Status::OK();
+  }
+  return Status::NotSupported("no file exists");
+}
+
+Status CompactedDBImpl::Open(const Options& options, const std::string& dbname,
+                             DB** dbptr) {
+  *dbptr = nullptr;
+
+  if (options.max_open_files != -1) {
+    return Status::InvalidArgument("require max_open_files = -1");
+  }
+  if (options.merge_operator.get() != nullptr) {
+    return Status::InvalidArgument("merge operator is not supported");
+  }
+  DBOptions db_options(options);
+  std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+  Status s = db->Init(options);
+  if (s.ok()) {
+    s = db->StartPeriodicTaskScheduler();
+  }
+  if (s.ok()) {
+    ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
+                   "Opened the db as fully compacted mode");
+    LogFlush(db->immutable_db_options_.info_log);
+    *dbptr = db.release();
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/compacted_db_impl.h b/src/rocksdb/db/db_impl/compacted_db_impl.h
new file mode 100644
index 000000000..eb458b85d
--- /dev/null
+++ b/src/rocksdb/db/db_impl/compacted_db_impl.h
@@ -0,0 +1,154 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: Share common structure with DBImplSecondary and DBImplReadOnly
+class CompactedDBImpl : public DBImpl {
+ public:
+  CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+  // No copying allowed
+  CompactedDBImpl(const CompactedDBImpl&) = delete;
+  void operator=(const CompactedDBImpl&) = delete;
+
+  ~CompactedDBImpl() override;
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DB** dbptr);
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value,
+             std::string* timestamp) override;
+
+  using DB::MultiGet;
+  // Note that CompactedDBImpl::MultiGet is not the optimized version of
+  // MultiGet to use.
+  // TODO: optimize CompactedDBImpl::MultiGet, see DBImpl::MultiGet for details.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  std::vector<Status> MultiGet(const ReadOptions& options,
+                               const std::vector<ColumnFamilyHandle*>&,
+                               const std::vector<Slice>& keys,
+                               std::vector<std::string>* values,
+                               std::vector<std::string>* timestamps) override;
+
+  using DBImpl::Put;
+  virtual Status Put(const WriteOptions& /*options*/,
+                     ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DBImpl::PutEntity;
+  Status PutEntity(const WriteOptions& /* options */,
+                   ColumnFamilyHandle* /* column_family */,
+                   const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DBImpl::Merge;
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DBImpl::Delete;
+  virtual Status Delete(const WriteOptions& /*options*/,
+                        ColumnFamilyHandle* /*column_family*/,
+                        const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status Write(const WriteOptions& /*options*/,
+                       WriteBatch* /*updates*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+                              ColumnFamilyHandle* /*column_family*/,
+                              const Slice* /*begin*/,
+                              const Slice* /*end*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status EnableFileDeletions(bool /*force*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>& ret,
+                              uint64_t* manifest_file_size,
+                              bool /*flush_memtable*/) override {
+    return DBImpl::GetLiveFiles(ret, manifest_file_size,
+                                false /* flush_memtable */);
+  }
+  using DBImpl::Flush;
+  virtual Status Flush(const FlushOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  virtual Status SyncWAL() override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  using DB::IngestExternalFile;
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*ingestion_options*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& /*options*/,
+      const std::string& /*column_family_name*/,
+      const ImportColumnFamilyOptions& /*import_options*/,
+      const ExportImportFilesMetaData& /*metadata*/,
+      ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  // FIXME: some missing overrides for more "write" functions
+  // Share with DBImplReadOnly?
+
+ protected:
+#ifndef ROCKSDB_LITE
+  Status FlushForGetLiveFiles() override {
+    // No-op for read-only DB
+    return Status::OK();
+  }
+#endif  // !ROCKSDB_LITE
+
+ private:
+  friend class DB;
+  inline size_t FindFile(const Slice& key);
+  Status Init(const Options& options);
+
+  ColumnFamilyData* cfd_;
+  Version* version_;
+  const Comparator* user_comparator_;
+  LevelFilesBrief files_;
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl.cc b/src/rocksdb/db/db_impl/db_impl.cc
new file mode 100644
index 000000000..a431111d4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.cc
@@ -0,0 +1,5918 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <stdint.h>
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <map>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_info_dumper.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/forward_iterator.h"
+#include "db/import_column_family_job.h"
+#include "db/job_context.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/malloc_stats.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/transaction_log_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "db/write_callback.h"
+#include "env/unique_id_gen.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/auto_roll_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/in_memory_stats_history.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/stats_history.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/version.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/get_context.h"
+#include "table/merging_iterator.h"
+#include "table/multiget_context.h"
+#include "table/sst_file_dumper.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/trace_replay.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/defer.h"
+#include "util/distributed_mutex.h"
+#include "util/hash_containers.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/trace/replayer_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kDefaultColumnFamilyName("default");
+const std::string kPersistentStatsColumnFamilyName(
+    "___rocksdb_stats_history___");
+void DumpRocksDBBuildVersion(Logger* log);
+
+CompressionType GetCompressionFlush(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options) {
+  // Compressing memtable flushes might not help unless the sequential load
+  // optimization is used for leveled compaction. Otherwise the CPU and
+  // latency overhead is not offset by saving much space.
+  if (ioptions.compaction_style == kCompactionStyleUniversal &&
+      mutable_cf_options.compaction_options_universal
+              .compression_size_percent >= 0) {
+    return kNoCompression;
+  }
+  if (mutable_cf_options.compression_per_level.empty()) {
+    return mutable_cf_options.compression;
+  } else {
+    // For leveled compress when min_level_to_compress != 0.
+    return mutable_cf_options.compression_per_level[0];
+  }
+}
+
+namespace {
+void DumpSupportInfo(Logger* logger) {
+  ROCKS_LOG_HEADER(logger, "Compression algorithms supported:");
+  for (auto& compression : OptionsHelper::compression_type_string_map) {
+    if (compression.second != kNoCompression &&
+        compression.second != kDisableCompressionOption) {
+      ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(),
+                       CompressionTypeSupported(compression.second));
+    }
+  }
+  ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
+                   crc32c::IsFastCrc32Supported().c_str());
+
+  ROCKS_LOG_HEADER(logger, "DMutex implementation: %s", DMutex::kName());
+}
+}  // namespace
+
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
+               const bool seq_per_batch, const bool batch_per_txn,
+               bool read_only)
+    : dbname_(dbname),
+      own_info_log_(options.info_log == nullptr),
+      init_logger_creation_s_(),
+      initial_db_options_(SanitizeOptions(dbname, options, read_only,
+                                          &init_logger_creation_s_)),
+      env_(initial_db_options_.env),
+      io_tracer_(std::make_shared<IOTracer>()),
+      immutable_db_options_(initial_db_options_),
+      fs_(immutable_db_options_.fs, io_tracer_),
+      mutable_db_options_(initial_db_options_),
+      stats_(immutable_db_options_.stats),
+#ifdef COERCE_CONTEXT_SWITCH
+      mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, &bg_cv_,
+             immutable_db_options_.use_adaptive_mutex),
+#else   // COERCE_CONTEXT_SWITCH
+      mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
+             immutable_db_options_.use_adaptive_mutex),
+#endif  // COERCE_CONTEXT_SWITCH
+      default_cf_handle_(nullptr),
+      error_handler_(this, immutable_db_options_, &mutex_),
+      event_logger_(immutable_db_options_.info_log.get()),
+      max_total_in_memory_state_(0),
+      file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+      file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
+          file_options_, immutable_db_options_)),
+      seq_per_batch_(seq_per_batch),
+      batch_per_txn_(batch_per_txn),
+      next_job_id_(1),
+      shutting_down_(false),
+      db_lock_(nullptr),
+      manual_compaction_paused_(false),
+      bg_cv_(&mutex_),
+      logfile_number_(0),
+      log_dir_synced_(false),
+      log_empty_(true),
+      persist_stats_cf_handle_(nullptr),
+      log_sync_cv_(&log_write_mutex_),
+      total_log_size_(0),
+      is_snapshot_supported_(true),
+      write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
+      write_thread_(immutable_db_options_),
+      nonmem_write_thread_(immutable_db_options_),
+      write_controller_(mutable_db_options_.delayed_write_rate),
+      last_batch_group_size_(0),
+      unscheduled_flushes_(0),
+      unscheduled_compactions_(0),
+      bg_bottom_compaction_scheduled_(0),
+      bg_compaction_scheduled_(0),
+      num_running_compactions_(0),
+      bg_flush_scheduled_(0),
+      num_running_flushes_(0),
+      bg_purge_scheduled_(0),
+      disable_delete_obsolete_files_(0),
+      pending_purge_obsolete_files_(0),
+      delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
+      last_stats_dump_time_microsec_(0),
+      has_unpersisted_data_(false),
+      unable_to_release_oldest_log_(false),
+      num_running_ingest_file_(0),
+#ifndef ROCKSDB_LITE
+      wal_manager_(immutable_db_options_, file_options_, io_tracer_,
+                   seq_per_batch),
+#endif  // ROCKSDB_LITE
+      bg_work_paused_(0),
+      bg_compaction_paused_(0),
+      refitting_level_(false),
+      opened_successfully_(false),
+#ifndef ROCKSDB_LITE
+      periodic_task_scheduler_(),
+#endif  // ROCKSDB_LITE
+      two_write_queues_(options.two_write_queues),
+      manual_wal_flush_(options.manual_wal_flush),
+      // last_sequencee_ is always maintained by the main queue that also writes
+      // to the memtable. When two_write_queues_ is disabled last seq in
+      // memtable is the same as last seq published to the readers. When it is
+      // enabled but seq_per_batch_ is disabled, last seq in memtable still
+      // indicates last published seq since wal-only writes that go to the 2nd
+      // queue do not consume a sequence number. Otherwise writes performed by
+      // the 2nd queue could change what is visible to the readers. In this
+      // cases, last_seq_same_as_publish_seq_==false, the 2nd queue maintains a
+      // separate variable to indicate the last published sequence.
+      last_seq_same_as_publish_seq_(
+          !(seq_per_batch && options.two_write_queues)),
+      // Since seq_per_batch_ is currently set only by WritePreparedTxn which
+      // requires a custom gc for compaction, we use that to set use_custom_gc_
+      // as well.
+      use_custom_gc_(seq_per_batch),
+      shutdown_initiated_(false),
+      own_sfm_(options.sst_file_manager == nullptr),
+      closed_(false),
+      atomic_flush_install_cv_(&mutex_),
+      blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
+                     &error_handler_, &event_logger_,
+                     immutable_db_options_.listeners, dbname_) {
+  // !batch_per_trx_ implies seq_per_batch_ because it is only unset for
+  // WriteUnprepared, which should use seq_per_batch_.
+  assert(batch_per_txn_ || seq_per_batch_);
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  // Give a large number for setting of "infinite" open files.
+  const int table_cache_size = (mutable_db_options_.max_open_files == -1)
+                                   ? TableCache::kInfiniteCapacity
+                                   : mutable_db_options_.max_open_files - 10;
+  LRUCacheOptions co;
+  co.capacity = table_cache_size;
+  co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  table_cache_ = NewLRUCache(co);
+  SetDbSessionId();
+  assert(!db_session_id_.empty());
+
+#ifndef ROCKSDB_LITE
+  periodic_task_functions_.emplace(PeriodicTaskType::kDumpStats,
+                                   [this]() { this->DumpStats(); });
+  periodic_task_functions_.emplace(PeriodicTaskType::kPersistStats,
+                                   [this]() { this->PersistStats(); });
+  periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
+                                   [this]() { this->FlushInfoLog(); });
+  periodic_task_functions_.emplace(
+      PeriodicTaskType::kRecordSeqnoTime,
+      [this]() { this->RecordSeqnoToTimeMapping(); });
+#endif  // ROCKSDB_LITE
+
+  versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
+                                 table_cache_.get(), write_buffer_manager_,
+                                 &write_controller_, &block_cache_tracer_,
+                                 io_tracer_, db_id_, db_session_id_));
+  column_family_memtables_.reset(
+      new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+  DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
+  DumpDBFileSummary(immutable_db_options_, dbname_, db_session_id_);
+  immutable_db_options_.Dump(immutable_db_options_.info_log.get());
+  mutable_db_options_.Dump(immutable_db_options_.info_log.get());
+  DumpSupportInfo(immutable_db_options_.info_log.get());
+
+  max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
+                            std::memory_order_relaxed);
+  if (write_buffer_manager_) {
+    wbm_stall_.reset(new WBMStallInterface());
+  }
+}
+
+Status DBImpl::Resume() {
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
+
+  InstrumentedMutexLock db_mutex(&mutex_);
+
+  if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
+    // Nothing to do
+    return Status::OK();
+  }
+
+  if (error_handler_.IsRecoveryInProgress()) {
+    // Don't allow a mix of manual and automatic recovery
+    return Status::Busy();
+  }
+
+  mutex_.Unlock();
+  Status s = error_handler_.RecoverFromBGError(true);
+  mutex_.Lock();
+  return s;
+}
+
+// This function implements the guts of recovery from a background error. It
+// is eventually called for both manual as well as automatic recovery. It does
+// the following -
+// 1. Wait for currently scheduled background flush/compaction to exit, in
+//    order to inadvertently causing an error and thinking recovery failed
+// 2. Flush memtables if there's any data for all the CFs. This may result
+//    another error, which will be saved by error_handler_ and reported later
+//    as the recovery status
+// 3. Find and delete any obsolete files
+// 4. Schedule compactions if needed for all the CFs. This is needed as the
+//    flush in the prior step might have been a no-op for some CFs, which
+//    means a new super version wouldn't have been installed
+Status DBImpl::ResumeImpl(DBRecoverContext context) {
+  mutex_.AssertHeld();
+  WaitForBackgroundWork();
+
+  Status s;
+  if (shutdown_initiated_) {
+    // Returning shutdown status to SFM during auto recovery will cause it
+    // to abort the recovery and allow the shutdown to progress
+    s = Status::ShutdownInProgress();
+  }
+
+  if (s.ok()) {
+    Status bg_error = error_handler_.GetBGError();
+    if (bg_error.severity() > Status::Severity::kHardError) {
+      ROCKS_LOG_INFO(
+          immutable_db_options_.info_log,
+          "DB resume requested but failed due to Fatal/Unrecoverable error");
+      s = bg_error;
+    }
+  }
+
+  // Make sure the IO Status stored in version set is set to OK.
+  bool file_deletion_disabled = !IsFileDeletionsEnabled();
+  if (s.ok()) {
+    IOStatus io_s = versions_->io_status();
+    if (io_s.IsIOError()) {
+      // If resuming from IOError resulted from MANIFEST write, then assert
+      // that we must have already set the MANIFEST writer to nullptr during
+      // clean-up phase MANIFEST writing. We must have also disabled file
+      // deletions.
+      assert(!versions_->descriptor_log_);
+      assert(file_deletion_disabled);
+      // Since we are trying to recover from MANIFEST write error, we need to
+      // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted.
+      // Therefore, force writing a dummy version edit because we do not know
+      // whether there are flush jobs with non-empty data to flush, triggering
+      // appends to MANIFEST.
+      VersionEdit edit;
+      auto cfh =
+          static_cast_with_check<ColumnFamilyHandleImpl>(default_cf_handle_);
+      assert(cfh);
+      ColumnFamilyData* cfd = cfh->cfd();
+      const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions();
+      s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_,
+                                 directories_.GetDbDir());
+      if (!s.ok()) {
+        io_s = versions_->io_status();
+        if (!io_s.ok()) {
+          s = error_handler_.SetBGError(io_s,
+                                        BackgroundErrorReason::kManifestWrite);
+        }
+      }
+    }
+  }
+
+  // We cannot guarantee consistency of the WAL. So force flush Memtables of
+  // all the column families
+  if (s.ok()) {
+    FlushOptions flush_opts;
+    // We allow flush to stall write since we are trying to resume from error.
+    flush_opts.allow_write_stall = true;
+    if (immutable_db_options_.atomic_flush) {
+      autovector<ColumnFamilyData*> cfds;
+      SelectColumnFamiliesForAtomicFlush(&cfds);
+      mutex_.Unlock();
+      s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
+      mutex_.Lock();
+    } else {
+      for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+        if (cfd->IsDropped()) {
+          continue;
+        }
+        InstrumentedMutexUnlock u(&mutex_);
+        s = FlushMemTable(cfd, flush_opts, context.flush_reason);
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+    if (!s.ok()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "DB resume requested but failed due to Flush failure [%s]",
+                     s.ToString().c_str());
+    }
+  }
+
+  JobContext job_context(0);
+  FindObsoleteFiles(&job_context, true);
+  mutex_.Unlock();
+
+  job_context.manifest_file_number = 1;
+  if (job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+
+  if (s.ok()) {
+    assert(versions_->io_status().ok());
+    // If we reach here, we should re-enable file deletions if it was disabled
+    // during previous error handling.
+    if (file_deletion_disabled) {
+      // Always return ok
+      s = EnableFileDeletions(/*force=*/true);
+      if (!s.ok()) {
+        ROCKS_LOG_INFO(
+            immutable_db_options_.info_log,
+            "DB resume requested but could not enable file deletions [%s]",
+            s.ToString().c_str());
+        assert(false);
+      }
+    }
+  }
+
+  mutex_.Lock();
+  if (s.ok()) {
+    // This will notify and unblock threads waiting for error recovery to
+    // finish. Those previouly waiting threads can now proceed, which may
+    // include closing the db.
+    s = error_handler_.ClearBGError();
+  } else {
+    // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
+    // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
+    // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
+    error_handler_.GetRecoveryError().PermitUncheckedError();
+  }
+
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+  } else {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]",
+                   s.ToString().c_str());
+  }
+
+  // Check for shutdown again before scheduling further compactions,
+  // since we released and re-acquired the lock above
+  if (shutdown_initiated_) {
+    s = Status::ShutdownInProgress();
+  }
+  if (s.ok()) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      SchedulePendingCompaction(cfd);
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  // Wake up any waiters - in this case, it could be the shutdown thread
+  bg_cv_.SignalAll();
+
+  // No need to check BGError again. If something happened, event listener would
+  // be notified and the operation causing it would have failed
+  return s;
+}
+
+void DBImpl::WaitForBackgroundWork() {
+  // Wait for background work to finish
+  while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+         bg_flush_scheduled_) {
+    bg_cv_.Wait();
+  }
+}
+
+// Will lock the mutex_,  will wait for completion if wait is true
+void DBImpl::CancelAllBackgroundWork(bool wait) {
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "Shutdown: canceling all background work");
+
+#ifndef ROCKSDB_LITE
+  for (uint8_t task_type = 0;
+       task_type < static_cast<uint8_t>(PeriodicTaskType::kMax); task_type++) {
+    Status s = periodic_task_scheduler_.Unregister(
+        static_cast<PeriodicTaskType>(task_type));
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Failed to unregister periodic task %d, status: %s",
+                     task_type, s.ToString().c_str());
+    }
+  }
+#endif  // !ROCKSDB_LITE
+
+  InstrumentedMutexLock l(&mutex_);
+  if (!shutting_down_.load(std::memory_order_acquire) &&
+      has_unpersisted_data_.load(std::memory_order_relaxed) &&
+      !mutable_db_options_.avoid_flush_during_shutdown) {
+    if (immutable_db_options_.atomic_flush) {
+      autovector<ColumnFamilyData*> cfds;
+      SelectColumnFamiliesForAtomicFlush(&cfds);
+      mutex_.Unlock();
+      Status s =
+          AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown);
+      s.PermitUncheckedError();  //**TODO: What to do on error?
+      mutex_.Lock();
+    } else {
+      for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+        if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
+          InstrumentedMutexUnlock u(&mutex_);
+          Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
+          s.PermitUncheckedError();  //**TODO: What to do on error?
+        }
+      }
+    }
+  }
+
+  shutting_down_.store(true, std::memory_order_release);
+  bg_cv_.SignalAll();
+  if (!wait) {
+    return;
+  }
+  WaitForBackgroundWork();
+}
+
+Status DBImpl::MaybeReleaseTimestampedSnapshotsAndCheck() {
+  size_t num_snapshots = 0;
+  ReleaseTimestampedSnapshotsOlderThan(std::numeric_limits<uint64_t>::max(),
+                                       &num_snapshots);
+
+  // If there is unreleased snapshot, fail the close call
+  if (num_snapshots > 0) {
+    return Status::Aborted("Cannot close DB with unreleased snapshot.");
+  }
+
+  return Status::OK();
+}
+
+Status DBImpl::CloseHelper() {
+  // Guarantee that there is no background error recovery in progress before
+  // continuing with the shutdown
+  mutex_.Lock();
+  shutdown_initiated_ = true;
+  error_handler_.CancelErrorRecovery();
+  while (error_handler_.IsRecoveryInProgress()) {
+    bg_cv_.Wait();
+  }
+  mutex_.Unlock();
+
+  // Below check is added as recovery_error_ is not checked and it causes crash
+  // in DBSSTTest.DBWithMaxSpaceAllowedWithBlobFiles when space limit is
+  // reached.
+  error_handler_.GetRecoveryError().PermitUncheckedError();
+
+  // CancelAllBackgroundWork called with false means we just set the shutdown
+  // marker. After this we do a variant of the waiting and unschedule work
+  // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
+  CancelAllBackgroundWork(false);
+
+  // Cancel manual compaction if there's any
+  if (HasPendingManualCompaction()) {
+    DisableManualCompaction();
+  }
+  mutex_.Lock();
+  // Unschedule all tasks for this DB
+  for (uint8_t i = 0; i < static_cast<uint8_t>(TaskType::kCount); i++) {
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM);
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW);
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH);
+  }
+
+  Status ret = Status::OK();
+
+  // Wait for background work to finish
+  while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+         bg_flush_scheduled_ || bg_purge_scheduled_ ||
+         pending_purge_obsolete_files_ ||
+         error_handler_.IsRecoveryInProgress()) {
+    TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
+    bg_cv_.Wait();
+  }
+  TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished",
+                           &files_grabbed_for_purge_);
+  EraseThreadStatusDbInfo();
+  flush_scheduler_.Clear();
+  trim_history_scheduler_.Clear();
+
+  while (!flush_queue_.empty()) {
+    const FlushRequest& flush_req = PopFirstFromFlushQueue();
+    for (const auto& iter : flush_req) {
+      iter.first->UnrefAndTryDelete();
+    }
+  }
+
+  while (!compaction_queue_.empty()) {
+    auto cfd = PopFirstFromCompactionQueue();
+    cfd->UnrefAndTryDelete();
+  }
+
+  if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
+    // we need to delete handle outside of lock because it does its own locking
+    mutex_.Unlock();
+    if (default_cf_handle_) {
+      delete default_cf_handle_;
+      default_cf_handle_ = nullptr;
+    }
+    if (persist_stats_cf_handle_) {
+      delete persist_stats_cf_handle_;
+      persist_stats_cf_handle_ = nullptr;
+    }
+    mutex_.Lock();
+  }
+
+  // Clean up obsolete files due to SuperVersion release.
+  // (1) Need to delete to obsolete files before closing because RepairDB()
+  // scans all existing files in the file system and builds manifest file.
+  // Keeping obsolete files confuses the repair process.
+  // (2) Need to check if we Open()/Recover() the DB successfully before
+  // deleting because if VersionSet recover fails (may be due to corrupted
+  // manifest file), it is not able to identify live files correctly. As a
+  // result, all "live" files can get deleted by accident. However, corrupted
+  // manifest is recoverable by RepairDB().
+  if (opened_successfully_) {
+    JobContext job_context(next_job_id_.fetch_add(1));
+    FindObsoleteFiles(&job_context, true);
+
+    mutex_.Unlock();
+    // manifest number starting from 2
+    job_context.manifest_file_number = 1;
+    if (job_context.HaveSomethingToDelete()) {
+      PurgeObsoleteFiles(job_context);
+    }
+    job_context.Clean();
+    mutex_.Lock();
+  }
+  {
+    InstrumentedMutexLock lock(&log_write_mutex_);
+    for (auto l : logs_to_free_) {
+      delete l;
+    }
+    for (auto& log : logs_) {
+      uint64_t log_number = log.writer->get_log_number();
+      Status s = log.ClearWriter();
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Unable to Sync WAL file %s with error -- %s",
+            LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
+            s.ToString().c_str());
+        // Retain the first error
+        if (ret.ok()) {
+          ret = s;
+        }
+      }
+    }
+    logs_.clear();
+  }
+
+  // Table cache may have table handles holding blocks from the block cache.
+  // We need to release them before the block cache is destroyed. The block
+  // cache may be destroyed inside versions_.reset(), when column family data
+  // list is destroyed, so leaving handles in table cache after
+  // versions_.reset() may cause issues.
+  // Here we clean all unreferenced handles in table cache.
+  // Now we assume all user queries have finished, so only version set itself
+  // can possibly hold the blocks from block cache. After releasing unreferenced
+  // handles here, only handles held by version set left and inside
+  // versions_.reset(), we will release them. There, we need to make sure every
+  // time a handle is released, we erase it from the cache too. By doing that,
+  // we can guarantee that after versions_.reset(), table cache is empty
+  // so the cache can be safely destroyed.
+  table_cache_->EraseUnRefEntries();
+
+  for (auto& txn_entry : recovered_transactions_) {
+    delete txn_entry.second;
+  }
+
+  // versions need to be destroyed before table_cache since it can hold
+  // references to table_cache.
+  versions_.reset();
+  mutex_.Unlock();
+  if (db_lock_ != nullptr) {
+    // TODO: Check for unlock error
+    env_->UnlockFile(db_lock_).PermitUncheckedError();
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
+  LogFlush(immutable_db_options_.info_log);
+
+#ifndef ROCKSDB_LITE
+  // If the sst_file_manager was allocated by us during DB::Open(), ccall
+  // Close() on it before closing the info_log. Otherwise, background thread
+  // in SstFileManagerImpl might try to log something
+  if (immutable_db_options_.sst_file_manager && own_sfm_) {
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    sfm->Close();
+  }
+#endif  // ROCKSDB_LITE
+
+  if (immutable_db_options_.info_log && own_info_log_) {
+    Status s = immutable_db_options_.info_log->Close();
+    if (!s.ok() && !s.IsNotSupported() && ret.ok()) {
+      ret = s;
+    }
+  }
+
+  if (write_buffer_manager_ && wbm_stall_) {
+    write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get());
+  }
+
+  IOStatus io_s = directories_.Close(IOOptions(), nullptr /* dbg */);
+  if (!io_s.ok()) {
+    ret = io_s;
+  }
+  if (ret.IsAborted()) {
+    // Reserve IsAborted() error for those where users didn't release
+    // certain resource and they can release them and come back and
+    // retry. In this case, we wrap this exception to something else.
+    return Status::Incomplete(ret.ToString());
+  }
+
+  return ret;
+}
+
+Status DBImpl::CloseImpl() { return CloseHelper(); }
+
+DBImpl::~DBImpl() {
+  // TODO: remove this.
+  init_logger_creation_s_.PermitUncheckedError();
+
+  InstrumentedMutexLock closing_lock_guard(&closing_mutex_);
+  if (closed_) {
+    return;
+  }
+
+  closed_ = true;
+
+  {
+    const Status s = MaybeReleaseTimestampedSnapshotsAndCheck();
+    s.PermitUncheckedError();
+  }
+
+  closing_status_ = CloseImpl();
+  closing_status_.PermitUncheckedError();
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  if (s->ok() || immutable_db_options_.paranoid_checks) {
+    // No change needed
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s",
+                   s->ToString().c_str());
+    *s = Status::OK();
+  }
+}
+
+const Status DBImpl::CreateArchivalDirectory() {
+  if (immutable_db_options_.WAL_ttl_seconds > 0 ||
+      immutable_db_options_.WAL_size_limit_MB > 0) {
+    std::string archivalPath =
+        ArchivalDirectory(immutable_db_options_.GetWalDir());
+    return env_->CreateDirIfMissing(archivalPath);
+  }
+  return Status::OK();
+}
+
+void DBImpl::PrintStatistics() {
+  auto dbstats = immutable_db_options_.stats;
+  if (dbstats) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s",
+                   dbstats->ToString().c_str());
+  }
+}
+
+Status DBImpl::StartPeriodicTaskScheduler() {
+#ifndef ROCKSDB_LITE
+
+#ifndef NDEBUG
+  // It only used by test to disable scheduler
+  bool disable_scheduler = false;
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImpl::StartPeriodicTaskScheduler:DisableScheduler",
+      &disable_scheduler);
+  if (disable_scheduler) {
+    return Status::OK();
+  }
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    TEST_SYNC_POINT_CALLBACK("DBImpl::StartPeriodicTaskScheduler:Init",
+                             &periodic_task_scheduler_);
+  }
+
+#endif  // !NDEBUG
+  if (mutable_db_options_.stats_dump_period_sec > 0) {
+    Status s = periodic_task_scheduler_.Register(
+        PeriodicTaskType::kDumpStats,
+        periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
+        mutable_db_options_.stats_dump_period_sec);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  if (mutable_db_options_.stats_persist_period_sec > 0) {
+    Status s = periodic_task_scheduler_.Register(
+        PeriodicTaskType::kPersistStats,
+        periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
+        mutable_db_options_.stats_persist_period_sec);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  Status s = periodic_task_scheduler_.Register(
+      PeriodicTaskType::kFlushInfoLog,
+      periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog));
+
+  return s;
+#else
+  return Status::OK();
+#endif  // !ROCKSDB_LITE
+}
+
+Status DBImpl::RegisterRecordSeqnoTimeWorker() {
+#ifndef ROCKSDB_LITE
+  uint64_t min_time_duration = std::numeric_limits<uint64_t>::max();
+  uint64_t max_time_duration = std::numeric_limits<uint64_t>::min();
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      // preserve time is the max of 2 options.
+      uint64_t preserve_time_duration =
+          std::max(cfd->ioptions()->preserve_internal_time_seconds,
+                   cfd->ioptions()->preclude_last_level_data_seconds);
+      if (!cfd->IsDropped() && preserve_time_duration > 0) {
+        min_time_duration = std::min(preserve_time_duration, min_time_duration);
+        max_time_duration = std::max(preserve_time_duration, max_time_duration);
+      }
+    }
+    if (min_time_duration == std::numeric_limits<uint64_t>::max()) {
+      seqno_time_mapping_.Resize(0, 0);
+    } else {
+      seqno_time_mapping_.Resize(min_time_duration, max_time_duration);
+    }
+  }
+
+  uint64_t seqno_time_cadence = 0;
+  if (min_time_duration != std::numeric_limits<uint64_t>::max()) {
+    // round up to 1 when the time_duration is smaller than
+    // kMaxSeqnoTimePairsPerCF
+    seqno_time_cadence =
+        (min_time_duration + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) /
+        SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF;
+  }
+
+  Status s;
+  if (seqno_time_cadence == 0) {
+    s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime);
+  } else {
+    s = periodic_task_scheduler_.Register(
+        PeriodicTaskType::kRecordSeqnoTime,
+        periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
+        seqno_time_cadence);
+  }
+
+  return s;
+#else
+  return Status::OK();
+#endif  // !ROCKSDB_LITE
+}
+
+// esitmate the total size of stats_history_
+size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
+  size_t size_total =
+      sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
+  if (stats_history_.size() == 0) return size_total;
+  size_t size_per_slice =
+      sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
+  // non-empty map, stats_history_.begin() guaranteed to exist
+  for (const auto& pairs : stats_history_.begin()->second) {
+    size_per_slice +=
+        pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
+  }
+  size_total = size_per_slice * stats_history_.size();
+  return size_total;
+}
+
+void DBImpl::PersistStats() {
+  TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
+#ifndef ROCKSDB_LITE
+  if (shutdown_initiated_) {
+    return;
+  }
+  TEST_SYNC_POINT("DBImpl::PersistStats:StartRunning");
+  uint64_t now_seconds =
+      immutable_db_options_.clock->NowMicros() / kMicrosInSecond;
+
+  Statistics* statistics = immutable_db_options_.stats;
+  if (!statistics) {
+    return;
+  }
+  size_t stats_history_size_limit = 0;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
+  }
+
+  std::map<std::string, uint64_t> stats_map;
+  if (!statistics->getTickerMap(&stats_map)) {
+    return;
+  }
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "------- PERSISTING STATS -------");
+
+  if (immutable_db_options_.persist_stats_to_disk) {
+    WriteBatch batch;
+    Status s = Status::OK();
+    if (stats_slice_initialized_) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Reading %" ROCKSDB_PRIszt " stats from statistics\n",
+                     stats_slice_.size());
+      for (const auto& stat : stats_map) {
+        if (s.ok()) {
+          char key[100];
+          int length =
+              EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
+          // calculate the delta from last time
+          if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+            uint64_t delta = stat.second - stats_slice_[stat.first];
+            s = batch.Put(persist_stats_cf_handle_,
+                          Slice(key, std::min(100, length)),
+                          std::to_string(delta));
+          }
+        }
+      }
+    }
+    stats_slice_initialized_ = true;
+    std::swap(stats_slice_, stats_map);
+    if (s.ok()) {
+      WriteOptions wo;
+      wo.low_pri = true;
+      wo.no_slowdown = true;
+      wo.sync = false;
+      s = Write(wo, &batch);
+    }
+    if (!s.ok()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Writing to persistent stats CF failed -- %s",
+                     s.ToString().c_str());
+    } else {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+                     " to persistent stats CF succeeded",
+                     stats_slice_.size(), now_seconds);
+    }
+    // TODO(Zhongyi): add purging for persisted data
+  } else {
+    InstrumentedMutexLock l(&stats_history_mutex_);
+    // calculate the delta from last time
+    if (stats_slice_initialized_) {
+      std::map<std::string, uint64_t> stats_delta;
+      for (const auto& stat : stats_map) {
+        if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+          stats_delta[stat.first] = stat.second - stats_slice_[stat.first];
+        }
+      }
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+                     " to in-memory stats history",
+                     stats_slice_.size(), now_seconds);
+      stats_history_[now_seconds] = stats_delta;
+    }
+    stats_slice_initialized_ = true;
+    std::swap(stats_slice_, stats_map);
+    TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");
+
+    // delete older stats snapshots to control memory consumption
+    size_t stats_history_size = EstimateInMemoryStatsHistorySize();
+    bool purge_needed = stats_history_size > stats_history_size_limit;
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+                   " bytes, slice count: %" ROCKSDB_PRIszt,
+                   stats_history_size, stats_history_.size());
+    while (purge_needed && !stats_history_.empty()) {
+      stats_history_.erase(stats_history_.begin());
+      purge_needed =
+          EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
+    }
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+                   " bytes, slice count: %" ROCKSDB_PRIszt,
+                   stats_history_size, stats_history_.size());
+  }
+  TEST_SYNC_POINT("DBImpl::PersistStats:End");
+#endif  // !ROCKSDB_LITE
+}
+
+bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time,
+                             uint64_t* new_time,
+                             std::map<std::string, uint64_t>* stats_map) {
+  assert(new_time);
+  assert(stats_map);
+  if (!new_time || !stats_map) return false;
+  // lock when search for start_time
+  {
+    InstrumentedMutexLock l(&stats_history_mutex_);
+    auto it = stats_history_.lower_bound(start_time);
+    if (it != stats_history_.end() && it->first < end_time) {
+      // make a copy for timestamp and stats_map
+      *new_time = it->first;
+      *stats_map = it->second;
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
+
+Status DBImpl::GetStatsHistory(
+    uint64_t start_time, uint64_t end_time,
+    std::unique_ptr<StatsHistoryIterator>* stats_iterator) {
+  if (!stats_iterator) {
+    return Status::InvalidArgument("stats_iterator not preallocated.");
+  }
+  if (immutable_db_options_.persist_stats_to_disk) {
+    stats_iterator->reset(
+        new PersistentStatsHistoryIterator(start_time, end_time, this));
+  } else {
+    stats_iterator->reset(
+        new InMemoryStatsHistoryIterator(start_time, end_time, this));
+  }
+  return (*stats_iterator)->status();
+}
+
+void DBImpl::DumpStats() {
+  TEST_SYNC_POINT("DBImpl::DumpStats:1");
+#ifndef ROCKSDB_LITE
+  std::string stats;
+  if (shutdown_initiated_) {
+    return;
+  }
+
+  // Also probe block cache(s) for problems, dump to info log
+  UnorderedSet<Cache*> probed_caches;
+  TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+      if (!cfd->initialized()) {
+        continue;
+      }
+
+      // Release DB mutex for gathering cache entry stats. Pass over all
+      // column families for this first so that other stats are dumped
+      // near-atomically.
+      InstrumentedMutexUnlock u(&mutex_);
+      cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
+
+      // Probe block cache for problems (if not already via another CF)
+      if (immutable_db_options_.info_log) {
+        auto* table_factory = cfd->ioptions()->table_factory.get();
+        assert(table_factory != nullptr);
+        Cache* cache =
+            table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+        if (cache && probed_caches.insert(cache).second) {
+          cache->ReportProblems(immutable_db_options_.info_log);
+        }
+      }
+    }
+
+    const std::string* property = &DB::Properties::kDBStats;
+    const DBPropertyInfo* property_info = GetPropertyInfo(*property);
+    assert(property_info != nullptr);
+    assert(!property_info->need_out_of_mutex);
+    default_cf_internal_stats_->GetStringProperty(*property_info, *property,
+                                                  &stats);
+
+    property = &InternalStats::kPeriodicCFStats;
+    property_info = GetPropertyInfo(*property);
+    assert(property_info != nullptr);
+    assert(!property_info->need_out_of_mutex);
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->initialized()) {
+        cfd->internal_stats()->GetStringProperty(*property_info, *property,
+                                                 &stats);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::DumpStats:2");
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "------- DUMPING STATS -------");
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+  if (immutable_db_options_.dump_malloc_stats) {
+    stats.clear();
+    DumpMallocStats(&stats);
+    if (!stats.empty()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "------- Malloc STATS -------");
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+    }
+  }
+#endif  // !ROCKSDB_LITE
+
+  PrintStatistics();
+}
+
+// Periodically flush info log out of application buffer at a low frequency.
+// This improves debuggability in case of RocksDB hanging since it ensures the
+// log messages leading up to the hang will eventually become visible in the
+// log.
+void DBImpl::FlushInfoLog() {
+  if (shutdown_initiated_) {
+    return;
+  }
+  TEST_SYNC_POINT("DBImpl::FlushInfoLog:StartRunning");
+  LogFlush(immutable_db_options_.info_log);
+}
+
+Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+                                           int max_entries_to_print,
+                                           std::string* out_str) {
+  auto* cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+
+  SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+  Version* version = super_version->current;
+
+  Status s =
+      version->TablesRangeTombstoneSummary(max_entries_to_print, out_str);
+
+  CleanupSuperVersion(super_version);
+  return s;
+}
+
+void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
+  mutex_.AssertHeld();
+  if (!job_context->logs_to_free.empty()) {
+    for (auto l : job_context->logs_to_free) {
+      AddToLogsToFreeQueue(l);
+    }
+    job_context->logs_to_free.clear();
+  }
+}
+
+FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
+  assert(cfd);
+  FSDirectory* ret_dir = cfd->GetDataDir(path_id);
+  if (ret_dir == nullptr) {
+    return directories_.GetDataDir(path_id);
+  }
+  return ret_dir;
+}
+
+Status DBImpl::SetOptions(
+    ColumnFamilyHandle* column_family,
+    const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+  (void)column_family;
+  (void)options_map;
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  if (options_map.empty()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "SetOptions() on column family [%s], empty input",
+                   cfd->GetName().c_str());
+    return Status::InvalidArgument("empty input");
+  }
+
+  MutableCFOptions new_options;
+  Status s;
+  Status persist_options_status;
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  {
+    auto db_options = GetDBOptions();
+    InstrumentedMutexLock l(&mutex_);
+    s = cfd->SetOptions(db_options, options_map);
+    if (s.ok()) {
+      new_options = *cfd->GetLatestMutableCFOptions();
+      // Append new version to recompute compaction score.
+      VersionEdit dummy_edit;
+      s = versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_,
+                                 directories_.GetDbDir());
+      // Trigger possible flush/compactions. This has to be before we persist
+      // options to file, otherwise there will be a deadlock with writer
+      // thread.
+      InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options);
+
+      persist_options_status = WriteOptionsFile(
+          false /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+      bg_cv_.SignalAll();
+    }
+  }
+  sv_context.Clean();
+
+  ROCKS_LOG_INFO(
+      immutable_db_options_.info_log,
+      "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
+  for (const auto& o : options_map) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+                   o.second.c_str());
+  }
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[%s] SetOptions() succeeded", cfd->GetName().c_str());
+    new_options.Dump(immutable_db_options_.info_log.get());
+    if (!persist_options_status.ok()) {
+      // NOTE: WriteOptionsFile already logs on failure
+      s = persist_options_status;
+    }
+  } else {
+    persist_options_status.PermitUncheckedError();  // less important
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
+                   cfd->GetName().c_str());
+  }
+  LogFlush(immutable_db_options_.info_log);
+  return s;
+#endif  // ROCKSDB_LITE
+}
+
+Status DBImpl::SetDBOptions(
+    const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+  (void)options_map;
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  if (options_map.empty()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "SetDBOptions(), empty input.");
+    return Status::InvalidArgument("empty input");
+  }
+
+  MutableDBOptions new_options;
+  Status s;
+  Status persist_options_status = Status::OK();
+  bool wal_changed = false;
+  WriteContext write_context;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
+                                       &new_options);
+
+    if (new_options.bytes_per_sync == 0) {
+      new_options.bytes_per_sync = 1024 * 1024;
+    }
+
+    if (MutableDBOptionsAreEqual(mutable_db_options_, new_options)) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "SetDBOptions(), input option value is not changed, "
+                     "skipping updating.");
+      persist_options_status.PermitUncheckedError();
+      return s;
+    }
+
+    DBOptions new_db_options =
+        BuildDBOptions(immutable_db_options_, new_options);
+    if (s.ok()) {
+      s = ValidateOptions(new_db_options);
+    }
+    if (s.ok()) {
+      for (auto c : *versions_->GetColumnFamilySet()) {
+        if (!c->IsDropped()) {
+          auto cf_options = c->GetLatestCFOptions();
+          s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options);
+          if (!s.ok()) {
+            break;
+          }
+        }
+      }
+    }
+    if (s.ok()) {
+      const BGJobLimits current_bg_job_limits =
+          GetBGJobLimits(mutable_db_options_.max_background_flushes,
+                         mutable_db_options_.max_background_compactions,
+                         mutable_db_options_.max_background_jobs,
+                         /* parallelize_compactions */ true);
+      const BGJobLimits new_bg_job_limits = GetBGJobLimits(
+          new_options.max_background_flushes,
+          new_options.max_background_compactions,
+          new_options.max_background_jobs, /* parallelize_compactions */ true);
+
+      const bool max_flushes_increased =
+          new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes;
+      const bool max_compactions_increased =
+          new_bg_job_limits.max_compactions >
+          current_bg_job_limits.max_compactions;
+
+      if (max_flushes_increased || max_compactions_increased) {
+        if (max_flushes_increased) {
+          env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes,
+                                             Env::Priority::HIGH);
+        }
+
+        if (max_compactions_increased) {
+          env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions,
+                                             Env::Priority::LOW);
+        }
+
+        MaybeScheduleFlushOrCompaction();
+      }
+
+      mutex_.Unlock();
+      if (new_options.stats_dump_period_sec == 0) {
+        s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kDumpStats);
+      } else {
+        s = periodic_task_scheduler_.Register(
+            PeriodicTaskType::kDumpStats,
+            periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
+            new_options.stats_dump_period_sec);
+      }
+      if (new_options.max_total_wal_size !=
+          mutable_db_options_.max_total_wal_size) {
+        max_total_wal_size_.store(new_options.max_total_wal_size,
+                                  std::memory_order_release);
+      }
+      if (s.ok()) {
+        if (new_options.stats_persist_period_sec == 0) {
+          s = periodic_task_scheduler_.Unregister(
+              PeriodicTaskType::kPersistStats);
+        } else {
+          s = periodic_task_scheduler_.Register(
+              PeriodicTaskType::kPersistStats,
+              periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
+              new_options.stats_persist_period_sec);
+        }
+      }
+      mutex_.Lock();
+      if (!s.ok()) {
+        return s;
+      }
+
+      write_controller_.set_max_delayed_write_rate(
+          new_options.delayed_write_rate);
+      table_cache_.get()->SetCapacity(new_options.max_open_files == -1
+                                          ? TableCache::kInfiniteCapacity
+                                          : new_options.max_open_files - 10);
+      wal_changed = mutable_db_options_.wal_bytes_per_sync !=
+                    new_options.wal_bytes_per_sync;
+      mutable_db_options_ = new_options;
+      file_options_for_compaction_ = FileOptions(new_db_options);
+      file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
+          file_options_for_compaction_, immutable_db_options_);
+      versions_->ChangeFileOptions(mutable_db_options_);
+      // TODO(xiez): clarify why apply optimize for read to write options
+      file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
+          file_options_for_compaction_, immutable_db_options_);
+      file_options_for_compaction_.compaction_readahead_size =
+          mutable_db_options_.compaction_readahead_size;
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
+        Status purge_wal_status = SwitchWAL(&write_context);
+        if (!purge_wal_status.ok()) {
+          ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                         "Unable to purge WAL files in SetDBOptions() -- %s",
+                         purge_wal_status.ToString().c_str());
+        }
+      }
+      persist_options_status = WriteOptionsFile(
+          false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+      write_thread_.ExitUnbatched(&w);
+    } else {
+      // To get here, we must have had invalid options and will not attempt to
+      // persist the options, which means the status is "OK/Uninitialized.
+      persist_options_status.PermitUncheckedError();
+    }
+  }
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:");
+  for (const auto& o : options_map) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+                   o.second.c_str());
+  }
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
+    new_options.Dump(immutable_db_options_.info_log.get());
+    if (!persist_options_status.ok()) {
+      if (immutable_db_options_.fail_if_options_file_error) {
+        s = Status::IOError(
+            "SetDBOptions() succeeded, but unable to persist options",
+            persist_options_status.ToString());
+      }
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Unable to persist options in SetDBOptions() -- %s",
+                     persist_options_status.ToString().c_str());
+    }
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
+  }
+  LogFlush(immutable_db_options_.info_log);
+  return s;
+#endif  // ROCKSDB_LITE
+}
+
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(
+    ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/,
+    int level) {
+  mutex_.AssertHeld();
+  const auto* vstorage = cfd->current()->storage_info();
+  int minimum_level = level;
+  for (int i = level - 1; i > 0; --i) {
+    // stop if level i is not empty
+    if (vstorage->NumLevelFiles(i) > 0) break;
+    // stop if level i is too small (cannot fit the level files)
+    if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
+      break;
+    }
+
+    minimum_level = i;
+  }
+  return minimum_level;
+}
+
+Status DBImpl::FlushWAL(bool sync) {
+  if (manual_wal_flush_) {
+    IOStatus io_s;
+    {
+      // We need to lock log_write_mutex_ since logs_ might change concurrently
+      InstrumentedMutexLock wl(&log_write_mutex_);
+      log::Writer* cur_log_writer = logs_.back().writer;
+      io_s = cur_log_writer->WriteBuffer();
+    }
+    if (!io_s.ok()) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+                      io_s.ToString().c_str());
+      // In case there is a fs error we should set it globally to prevent the
+      // future writes
+      IOStatusCheck(io_s);
+      // whether sync or not, we should abort the rest of function upon error
+      return static_cast<Status>(io_s);
+    }
+    if (!sync) {
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false");
+      return static_cast<Status>(io_s);
+    }
+  }
+  if (!sync) {
+    return Status::OK();
+  }
+  // sync = true
+  ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true");
+  return SyncWAL();
+}
+
+bool DBImpl::WALBufferIsEmpty(bool lock) {
+  if (lock) {
+    log_write_mutex_.Lock();
+  }
+  log::Writer* cur_log_writer = logs_.back().writer;
+  auto res = cur_log_writer->BufferIsEmpty();
+  if (lock) {
+    log_write_mutex_.Unlock();
+  }
+  return res;
+}
+
+Status DBImpl::SyncWAL() {
+  TEST_SYNC_POINT("DBImpl::SyncWAL:Begin");
+  autovector<log::Writer*, 1> logs_to_sync;
+  bool need_log_dir_sync;
+  uint64_t current_log_number;
+
+  {
+    InstrumentedMutexLock l(&log_write_mutex_);
+    assert(!logs_.empty());
+
+    // This SyncWAL() call only cares about logs up to this number.
+    current_log_number = logfile_number_;
+
+    while (logs_.front().number <= current_log_number &&
+           logs_.front().IsSyncing()) {
+      log_sync_cv_.Wait();
+    }
+    // First check that logs are safe to sync in background.
+    for (auto it = logs_.begin();
+         it != logs_.end() && it->number <= current_log_number; ++it) {
+      if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
+        return Status::NotSupported(
+            "SyncWAL() is not supported for this implementation of WAL file",
+            immutable_db_options_.allow_mmap_writes
+                ? "try setting Options::allow_mmap_writes to false"
+                : Slice());
+      }
+    }
+    for (auto it = logs_.begin();
+         it != logs_.end() && it->number <= current_log_number; ++it) {
+      auto& log = *it;
+      log.PrepareForSync();
+      logs_to_sync.push_back(log.writer);
+    }
+
+    need_log_dir_sync = !log_dir_synced_;
+  }
+
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+  RecordTick(stats_, WAL_FILE_SYNCED);
+  Status status;
+  IOStatus io_s;
+  for (log::Writer* log : logs_to_sync) {
+    io_s = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync);
+    if (!io_s.ok()) {
+      status = io_s;
+      break;
+    }
+  }
+  if (!io_s.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL Sync error %s",
+                    io_s.ToString().c_str());
+    // In case there is a fs error we should set it globally to prevent the
+    // future writes
+    IOStatusCheck(io_s);
+  }
+  if (status.ok() && need_log_dir_sync) {
+    status = directories_.GetWalDir()->FsyncWithDirOptions(
+        IOOptions(), nullptr,
+        DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+  }
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
+  VersionEdit synced_wals;
+  {
+    InstrumentedMutexLock l(&log_write_mutex_);
+    if (status.ok()) {
+      MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals);
+    } else {
+      MarkLogsNotSynced(current_log_number);
+    }
+  }
+  if (status.ok() && synced_wals.IsWalAddition()) {
+    InstrumentedMutexLock l(&mutex_);
+    status = ApplyWALToManifest(&synced_wals);
+  }
+
+  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
+
+  return status;
+}
+
+Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
+  // not empty, write to MANIFEST.
+  mutex_.AssertHeld();
+  Status status = versions_->LogAndApplyToDefaultColumnFamily(
+      synced_wals, &mutex_, directories_.GetDbDir());
+  if (!status.ok() && versions_->io_status().IsIOError()) {
+    status = error_handler_.SetBGError(versions_->io_status(),
+                                       BackgroundErrorReason::kManifestWrite);
+  }
+  return status;
+}
+
+Status DBImpl::LockWAL() {
+  log_write_mutex_.Lock();
+  auto cur_log_writer = logs_.back().writer;
+  IOStatus status = cur_log_writer->WriteBuffer();
+  if (!status.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+                    status.ToString().c_str());
+    // In case there is a fs error we should set it globally to prevent the
+    // future writes
+    WriteStatusCheck(status);
+  }
+  return static_cast<Status>(status);
+}
+
+Status DBImpl::UnlockWAL() {
+  log_write_mutex_.Unlock();
+  return Status::OK();
+}
+
+void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
+                            VersionEdit* synced_wals) {
+  log_write_mutex_.AssertHeld();
+  if (synced_dir && logfile_number_ == up_to) {
+    log_dir_synced_ = true;
+  }
+  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
+    auto& wal = *it;
+    assert(wal.IsSyncing());
+
+    if (wal.number < logs_.back().number) {
+      // Inactive WAL
+      if (immutable_db_options_.track_and_verify_wals_in_manifest &&
+          wal.GetPreSyncSize() > 0) {
+        synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
+      }
+      if (wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize()) {
+        // Fully synced
+        logs_to_free_.push_back(wal.ReleaseWriter());
+        it = logs_.erase(it);
+      } else {
+        assert(wal.GetPreSyncSize() < wal.writer->file()->GetFlushedSize());
+        wal.FinishSync();
+        ++it;
+      }
+    } else {
+      assert(wal.number == logs_.back().number);
+      // Active WAL
+      wal.FinishSync();
+      ++it;
+    }
+  }
+  log_sync_cv_.SignalAll();
+}
+
+void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
+  log_write_mutex_.AssertHeld();
+  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
+       ++it) {
+    auto& wal = *it;
+    wal.FinishSync();
+  }
+  log_sync_cv_.SignalAll();
+}
+
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+  return versions_->LastSequence();
+}
+
+void DBImpl::SetLastPublishedSequence(SequenceNumber seq) {
+  versions_->SetLastPublishedSequence(seq);
+}
+
+Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                   std::string* ts_low) {
+  if (ts_low == nullptr) {
+    return Status::InvalidArgument("ts_low is nullptr");
+  }
+  ColumnFamilyData* cfd = nullptr;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    assert(cfh != nullptr);
+    cfd = cfh->cfd();
+  }
+  assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+  if (cfd->user_comparator()->timestamp_size() == 0) {
+    return Status::InvalidArgument(
+        "Timestamp is not enabled in this column family");
+  }
+  InstrumentedMutexLock l(&mutex_);
+  *ts_low = cfd->GetFullHistoryTsLow();
+  assert(cfd->user_comparator()->timestamp_size() == ts_low->size());
+  return Status::OK();
+}
+
+InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
+                                              Arena* arena,
+                                              SequenceNumber sequence,
+                                              ColumnFamilyHandle* column_family,
+                                              bool allow_unprepared_value) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+
+  mutex_.Lock();
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  mutex_.Unlock();
+  return NewInternalIterator(read_options, cfd, super_version, arena, sequence,
+                             allow_unprepared_value);
+}
+
+void DBImpl::SchedulePurge() {
+  mutex_.AssertHeld();
+  assert(opened_successfully_);
+
+  // Purge operations are put into High priority queue
+  bg_purge_scheduled_++;
+  env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
+}
+
+void DBImpl::BackgroundCallPurge() {
+  mutex_.Lock();
+
+  while (!logs_to_free_queue_.empty()) {
+    assert(!logs_to_free_queue_.empty());
+    log::Writer* log_writer = *(logs_to_free_queue_.begin());
+    logs_to_free_queue_.pop_front();
+    mutex_.Unlock();
+    delete log_writer;
+    mutex_.Lock();
+  }
+  while (!superversions_to_free_queue_.empty()) {
+    assert(!superversions_to_free_queue_.empty());
+    SuperVersion* sv = superversions_to_free_queue_.front();
+    superversions_to_free_queue_.pop_front();
+    mutex_.Unlock();
+    delete sv;
+    mutex_.Lock();
+  }
+
+  assert(bg_purge_scheduled_ > 0);
+
+  // Can't use iterator to go over purge_files_ because inside the loop we're
+  // unlocking the mutex that protects purge_files_.
+  while (!purge_files_.empty()) {
+    auto it = purge_files_.begin();
+    // Need to make a copy of the PurgeFilesInfo before unlocking the mutex.
+    PurgeFileInfo purge_file = it->second;
+
+    const std::string& fname = purge_file.fname;
+    const std::string& dir_to_sync = purge_file.dir_to_sync;
+    FileType type = purge_file.type;
+    uint64_t number = purge_file.number;
+    int job_id = purge_file.job_id;
+
+    purge_files_.erase(it);
+
+    mutex_.Unlock();
+    DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
+    mutex_.Lock();
+  }
+
+  bg_purge_scheduled_--;
+
+  bg_cv_.SignalAll();
+  // IMPORTANT:there should be no code after calling SignalAll. This call may
+  // signal the DB destructor that it's OK to proceed with destruction. In
+  // that case, all DB variables will be dealloacated and referencing them
+  // will cause trouble.
+  mutex_.Unlock();
+}
+
+namespace {
+
+// A `SuperVersionHandle` holds a non-null `SuperVersion*` pointing at a
+// `SuperVersion` referenced once for this object. It also contains the state
+// needed to clean up the `SuperVersion` reference from outside of `DBImpl`
+// using `CleanupSuperVersionHandle()`.
+struct SuperVersionHandle {
+  // `_super_version` must be non-nullptr and `Ref()`'d once as long as the
+  // `SuperVersionHandle` may use it.
+  SuperVersionHandle(DBImpl* _db, InstrumentedMutex* _mu,
+                     SuperVersion* _super_version, bool _background_purge)
+      : db(_db),
+        mu(_mu),
+        super_version(_super_version),
+        background_purge(_background_purge) {}
+
+  DBImpl* db;
+  InstrumentedMutex* mu;
+  SuperVersion* super_version;
+  bool background_purge;
+};
+
+static void CleanupSuperVersionHandle(void* arg1, void* /*arg2*/) {
+  SuperVersionHandle* sv_handle = reinterpret_cast<SuperVersionHandle*>(arg1);
+
+  if (sv_handle->super_version->Unref()) {
+    // Job id == 0 means that this is not our background process, but rather
+    // user thread
+    JobContext job_context(0);
+
+    sv_handle->mu->Lock();
+    sv_handle->super_version->Cleanup();
+    sv_handle->db->FindObsoleteFiles(&job_context, false, true);
+    if (sv_handle->background_purge) {
+      sv_handle->db->ScheduleBgLogWriterClose(&job_context);
+      sv_handle->db->AddSuperVersionsToFreeQueue(sv_handle->super_version);
+      sv_handle->db->SchedulePurge();
+    }
+    sv_handle->mu->Unlock();
+
+    if (!sv_handle->background_purge) {
+      delete sv_handle->super_version;
+    }
+    if (job_context.HaveSomethingToDelete()) {
+      sv_handle->db->PurgeObsoleteFiles(job_context,
+                                        sv_handle->background_purge);
+    }
+    job_context.Clean();
+  }
+
+  delete sv_handle;
+}
+
+struct GetMergeOperandsState {
+  MergeContext merge_context;
+  PinnedIteratorsManager pinned_iters_mgr;
+  SuperVersionHandle* sv_handle;
+};
+
+static void CleanupGetMergeOperandsState(void* arg1, void* /*arg2*/) {
+  GetMergeOperandsState* state = static_cast<GetMergeOperandsState*>(arg1);
+  CleanupSuperVersionHandle(state->sv_handle /* arg1 */, nullptr /* arg2 */);
+  delete state;
+}
+
+}  // namespace
+
+InternalIterator* DBImpl::NewInternalIterator(
+    const ReadOptions& read_options, ColumnFamilyData* cfd,
+    SuperVersion* super_version, Arena* arena, SequenceNumber sequence,
+    bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
+  InternalIterator* internal_iter;
+  assert(arena != nullptr);
+  // Need to create internal iterator from the arena.
+  MergeIteratorBuilder merge_iter_builder(
+      &cfd->internal_comparator(), arena,
+      !read_options.total_order_seek &&
+          super_version->mutable_cf_options.prefix_extractor != nullptr,
+      read_options.iterate_upper_bound);
+  // Collect iterator for mutable memtable
+  auto mem_iter = super_version->mem->NewIterator(read_options, arena);
+  Status s;
+  if (!read_options.ignore_range_deletions) {
+    TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
+    auto range_del_iter = super_version->mem->NewRangeTombstoneIterator(
+        read_options, sequence, false /* immutable_memtable */);
+    if (range_del_iter == nullptr || range_del_iter->empty()) {
+      delete range_del_iter;
+    } else {
+      mem_tombstone_iter = new TruncatedRangeDelIterator(
+          std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
+          &cfd->ioptions()->internal_comparator, nullptr /* smallest */,
+          nullptr /* largest */);
+    }
+    merge_iter_builder.AddPointAndTombstoneIterator(mem_iter,
+                                                    mem_tombstone_iter);
+  } else {
+    merge_iter_builder.AddIterator(mem_iter);
+  }
+
+  // Collect all needed child iterators for immutable memtables
+  if (s.ok()) {
+    super_version->imm->AddIterators(read_options, &merge_iter_builder,
+                                     !read_options.ignore_range_deletions);
+  }
+  TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
+  if (s.ok()) {
+    // Collect iterators for files in L0 - Ln
+    if (read_options.read_tier != kMemtableTier) {
+      super_version->current->AddIterators(read_options, file_options_,
+                                           &merge_iter_builder,
+                                           allow_unprepared_value);
+    }
+    internal_iter = merge_iter_builder.Finish(
+        read_options.ignore_range_deletions ? nullptr : db_iter);
+    SuperVersionHandle* cleanup = new SuperVersionHandle(
+        this, &mutex_, super_version,
+        read_options.background_purge_on_iterator_cleanup ||
+            immutable_db_options_.avoid_unnecessary_blocking_io);
+    internal_iter->RegisterCleanup(CleanupSuperVersionHandle, cleanup, nullptr);
+
+    return internal_iter;
+  } else {
+    CleanupSuperVersion(super_version);
+  }
+  return NewErrorInternalIterator<Slice>(s, arena);
+}
+
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+  return default_cf_handle_;
+}
+
+ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
+  return persist_stats_cf_handle_;
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableSlice* value) {
+  return Get(read_options, column_family, key, value, /*timestamp=*/nullptr);
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableSlice* value, std::string* timestamp) {
+  assert(value != nullptr);
+  value->Reset();
+  GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = value;
+  get_impl_options.timestamp = timestamp;
+  Status s = GetImpl(read_options, key, get_impl_options);
+  return s;
+}
+
+Status DBImpl::GetEntity(const ReadOptions& read_options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         PinnableWideColumns* columns) {
+  if (!column_family) {
+    return Status::InvalidArgument(
+        "Cannot call GetEntity without a column family handle");
+  }
+
+  if (!columns) {
+    return Status::InvalidArgument(
+        "Cannot call GetEntity without a PinnableWideColumns object");
+  }
+
+  columns->Reset();
+
+  GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.columns = columns;
+
+  return GetImpl(read_options, key, get_impl_options);
+}
+
+bool DBImpl::ShouldReferenceSuperVersion(const MergeContext& merge_context) {
+  // If both thresholds are reached, a function returning merge operands as
+  // `PinnableSlice`s should reference the `SuperVersion` to avoid large and/or
+  // numerous `memcpy()`s.
+  //
+  // The below constants enable the optimization conservatively. They are
+  // verified to not regress `GetMergeOperands()` latency in the following
+  // scenarios.
+  //
+  // - CPU: two socket Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz
+  // - `GetMergeOperands()` threads: 1 - 32
+  // - Entry size: 32 bytes - 4KB
+  // - Merges per key: 1 - 16K
+  // - LSM component: memtable
+  //
+  // TODO(ajkr): expand measurement to SST files.
+  static const size_t kNumBytesForSvRef = 32768;
+  static const size_t kLog2AvgBytesForSvRef = 8;  // 256 bytes
+
+  size_t num_bytes = 0;
+  for (const Slice& sl : merge_context.GetOperands()) {
+    num_bytes += sl.size();
+  }
+  return num_bytes >= kNumBytesForSvRef &&
+         (num_bytes >> kLog2AvgBytesForSvRef) >=
+             merge_context.GetOperands().size();
+}
+
+Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
+                       GetImplOptions& get_impl_options) {
+  assert(get_impl_options.value != nullptr ||
+         get_impl_options.merge_operands != nullptr ||
+         get_impl_options.columns != nullptr);
+
+  assert(get_impl_options.column_family);
+
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(get_impl_options.column_family,
+                                        *(read_options.timestamp),
+                                        /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s = FailIfCfHasTs(get_impl_options.column_family);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between tombstone or key that has never been written
+  if (get_impl_options.timestamp) {
+    get_impl_options.timestamp->clear();
+  }
+
+  GetWithTimestampReadCallback read_cb(0);  // Will call Refresh
+
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+  StopWatch sw(immutable_db_options_.clock, stats_, DB_GET);
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+      get_impl_options.column_family);
+  auto cfd = cfh->cfd();
+
+  if (tracer_) {
+    // TODO: This mutex should be removed later, to improve performance when
+    // tracing is enabled.
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      // TODO: maybe handle the tracing status?
+      tracer_->Get(get_impl_options.column_family, key).PermitUncheckedError();
+    }
+  }
+
+  if (get_impl_options.get_merge_operands_options != nullptr) {
+    for (int i = 0; i < get_impl_options.get_merge_operands_options
+                            ->expected_max_number_of_operands;
+         ++i) {
+      get_impl_options.merge_operands[i].Reset();
+    }
+  }
+
+  // Acquire SuperVersion
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+  TEST_SYNC_POINT("DBImpl::GetImpl:1");
+  TEST_SYNC_POINT("DBImpl::GetImpl:2");
+
+  SequenceNumber snapshot;
+  if (read_options.snapshot != nullptr) {
+    if (get_impl_options.callback) {
+      // Already calculated based on read_options.snapshot
+      snapshot = get_impl_options.callback->max_visible_seq();
+    } else {
+      snapshot =
+          reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+    }
+  } else {
+    // Note that the snapshot is assigned AFTER referencing the super
+    // version because otherwise a flush happening in between may compact away
+    // data for the snapshot, so the reader would see neither data that was be
+    // visible to the snapshot before compaction nor the newer data inserted
+    // afterwards.
+    snapshot = GetLastPublishedSequence();
+    if (get_impl_options.callback) {
+      // The unprep_seqs are not published for write unprepared, so it could be
+      // that max_visible_seq is larger. Seek to the std::max of the two.
+      // However, we still want our callback to contain the actual snapshot so
+      // that it can do the correct visibility filtering.
+      get_impl_options.callback->Refresh(snapshot);
+
+      // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+      // max_visible_seq = max(max_visible_seq, snapshot)
+      //
+      // Currently, the commented out assert is broken by
+      // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+      // the regular transaction flow, then this special read callback would not
+      // be needed.
+      //
+      // assert(callback->max_visible_seq() >= snapshot);
+      snapshot = get_impl_options.callback->max_visible_seq();
+    }
+  }
+  // If timestamp is used, we use read callback to ensure <key,t,s> is returned
+  // only if t <= read_opts.timestamp and s <= snapshot.
+  // HACK: temporarily overwrite input struct field but restore
+  SaveAndRestore<ReadCallback*> restore_callback(&get_impl_options.callback);
+  const Comparator* ucmp = get_impl_options.column_family->GetComparator();
+  assert(ucmp);
+  if (ucmp->timestamp_size() > 0) {
+    assert(!get_impl_options
+                .callback);  // timestamp with callback is not supported
+    read_cb.Refresh(snapshot);
+    get_impl_options.callback = &read_cb;
+  }
+  TEST_SYNC_POINT("DBImpl::GetImpl:3");
+  TEST_SYNC_POINT("DBImpl::GetImpl:4");
+
+  // Prepare to store a list of merge operations if merge occurs.
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0;
+
+  Status s;
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  LookupKey lkey(key, snapshot, read_options.timestamp);
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  bool skip_memtable = (read_options.read_tier == kPersistedTier &&
+                        has_unpersisted_data_.load(std::memory_order_relaxed));
+  bool done = false;
+  std::string* timestamp =
+      ucmp->timestamp_size() > 0 ? get_impl_options.timestamp : nullptr;
+  if (!skip_memtable) {
+    // Get value associated with key
+    if (get_impl_options.get_value) {
+      if (sv->mem->Get(
+              lkey,
+              get_impl_options.value ? get_impl_options.value->GetSelf()
+                                     : nullptr,
+              get_impl_options.columns, timestamp, &s, &merge_context,
+              &max_covering_tombstone_seq, read_options,
+              false /* immutable_memtable */, get_impl_options.callback,
+              get_impl_options.is_blob_index)) {
+        done = true;
+
+        if (get_impl_options.value) {
+          get_impl_options.value->PinSelf();
+        }
+
+        RecordTick(stats_, MEMTABLE_HIT);
+      } else if ((s.ok() || s.IsMergeInProgress()) &&
+                 sv->imm->Get(lkey,
+                              get_impl_options.value
+                                  ? get_impl_options.value->GetSelf()
+                                  : nullptr,
+                              get_impl_options.columns, timestamp, &s,
+                              &merge_context, &max_covering_tombstone_seq,
+                              read_options, get_impl_options.callback,
+                              get_impl_options.is_blob_index)) {
+        done = true;
+
+        if (get_impl_options.value) {
+          get_impl_options.value->PinSelf();
+        }
+
+        RecordTick(stats_, MEMTABLE_HIT);
+      }
+    } else {
+      // Get Merge Operands associated with key, Merge Operands should not be
+      // merged and raw values should be returned to the user.
+      if (sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr,
+                       /*timestamp=*/nullptr, &s, &merge_context,
+                       &max_covering_tombstone_seq, read_options,
+                       false /* immutable_memtable */, nullptr, nullptr,
+                       false)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      } else if ((s.ok() || s.IsMergeInProgress()) &&
+                 sv->imm->GetMergeOperands(lkey, &s, &merge_context,
+                                           &max_covering_tombstone_seq,
+                                           read_options)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      }
+    }
+    if (!done && !s.ok() && !s.IsMergeInProgress()) {
+      ReturnAndCleanupSuperVersion(cfd, sv);
+      return s;
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:0");
+  TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:1");
+  PinnedIteratorsManager pinned_iters_mgr;
+  if (!done) {
+    PERF_TIMER_GUARD(get_from_output_files_time);
+    sv->current->Get(
+        read_options, lkey, get_impl_options.value, get_impl_options.columns,
+        timestamp, &s, &merge_context, &max_covering_tombstone_seq,
+        &pinned_iters_mgr,
+        get_impl_options.get_value ? get_impl_options.value_found : nullptr,
+        nullptr, nullptr,
+        get_impl_options.get_value ? get_impl_options.callback : nullptr,
+        get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr,
+        get_impl_options.get_value);
+    RecordTick(stats_, MEMTABLE_MISS);
+  }
+
+  {
+    PERF_TIMER_GUARD(get_post_process_time);
+
+    RecordTick(stats_, NUMBER_KEYS_READ);
+    size_t size = 0;
+    if (s.ok()) {
+      if (get_impl_options.get_value) {
+        if (get_impl_options.value) {
+          size = get_impl_options.value->size();
+        } else if (get_impl_options.columns) {
+          size = get_impl_options.columns->serialized_size();
+        }
+      } else {
+        // Return all merge operands for get_impl_options.key
+        *get_impl_options.number_of_operands =
+            static_cast<int>(merge_context.GetNumOperands());
+        if (*get_impl_options.number_of_operands >
+            get_impl_options.get_merge_operands_options
+                ->expected_max_number_of_operands) {
+          s = Status::Incomplete(
+              Status::SubCode::KMergeOperandsInsufficientCapacity);
+        } else {
+          // Each operand depends on one of the following resources: `sv`,
+          // `pinned_iters_mgr`, or `merge_context`. It would be crazy expensive
+          // to reference `sv` for each operand relying on it because `sv` is
+          // (un)ref'd in all threads using the DB. Furthermore, we do not track
+          // on which resource each operand depends.
+          //
+          // To solve this, we bundle the resources in a `GetMergeOperandsState`
+          // and manage them with a `SharedCleanablePtr` shared among the
+          // `PinnableSlice`s we return. This bundle includes one `sv` reference
+          // and ownership of the `merge_context` and `pinned_iters_mgr`
+          // objects.
+          bool ref_sv = ShouldReferenceSuperVersion(merge_context);
+          if (ref_sv) {
+            assert(!merge_context.GetOperands().empty());
+            SharedCleanablePtr shared_cleanable;
+            GetMergeOperandsState* state = nullptr;
+            state = new GetMergeOperandsState();
+            state->merge_context = std::move(merge_context);
+            state->pinned_iters_mgr = std::move(pinned_iters_mgr);
+
+            sv->Ref();
+
+            state->sv_handle = new SuperVersionHandle(
+                this, &mutex_, sv,
+                immutable_db_options_.avoid_unnecessary_blocking_io);
+
+            shared_cleanable.Allocate();
+            shared_cleanable->RegisterCleanup(CleanupGetMergeOperandsState,
+                                              state /* arg1 */,
+                                              nullptr /* arg2 */);
+            for (size_t i = 0; i < state->merge_context.GetOperands().size();
+                 ++i) {
+              const Slice& sl = state->merge_context.GetOperands()[i];
+              size += sl.size();
+
+              get_impl_options.merge_operands->PinSlice(
+                  sl, nullptr /* cleanable */);
+              if (i == state->merge_context.GetOperands().size() - 1) {
+                shared_cleanable.MoveAsCleanupTo(
+                    get_impl_options.merge_operands);
+              } else {
+                shared_cleanable.RegisterCopyWith(
+                    get_impl_options.merge_operands);
+              }
+              get_impl_options.merge_operands++;
+            }
+          } else {
+            for (const Slice& sl : merge_context.GetOperands()) {
+              size += sl.size();
+              get_impl_options.merge_operands->PinSelf(sl);
+              get_impl_options.merge_operands++;
+            }
+          }
+        }
+      }
+      RecordTick(stats_, BYTES_READ, size);
+      PERF_COUNTER_ADD(get_read_bytes, size);
+    }
+
+    ReturnAndCleanupSuperVersion(cfd, sv);
+
+    RecordInHistogram(stats_, BYTES_PER_READ, size);
+  }
+  return s;
+}
+
+std::vector<Status> DBImpl::MultiGet(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  return MultiGet(read_options, column_family, keys, values,
+                  /*timestamps=*/nullptr);
+}
+
+std::vector<Status> DBImpl::MultiGet(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values,
+    std::vector<std::string>* timestamps) {
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+  StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  size_t num_keys = keys.size();
+  assert(column_family.size() == num_keys);
+  std::vector<Status> stat_list(num_keys);
+
+  bool should_fail = false;
+  for (size_t i = 0; i < num_keys; ++i) {
+    assert(column_family[i]);
+    if (read_options.timestamp) {
+      stat_list[i] = FailIfTsMismatchCf(
+          column_family[i], *(read_options.timestamp), /*ts_for_read=*/true);
+      if (!stat_list[i].ok()) {
+        should_fail = true;
+      }
+    } else {
+      stat_list[i] = FailIfCfHasTs(column_family[i]);
+      if (!stat_list[i].ok()) {
+        should_fail = true;
+      }
+    }
+  }
+
+  if (should_fail) {
+    for (auto& s : stat_list) {
+      if (s.ok()) {
+        s = Status::Incomplete(
+            "DB not queried due to invalid argument(s) in the same MultiGet");
+      }
+    }
+    return stat_list;
+  }
+
+  if (tracer_) {
+    // TODO: This mutex should be removed later, to improve performance when
+    // tracing is enabled.
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      // TODO: maybe handle the tracing status?
+      tracer_->MultiGet(column_family, keys).PermitUncheckedError();
+    }
+  }
+
+  SequenceNumber consistent_seqnum;
+
+  UnorderedMap<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
+      column_family.size());
+  for (auto cf : column_family) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(cf);
+    auto cfd = cfh->cfd();
+    if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+      multiget_cf_data.emplace(cfd->GetID(),
+                               MultiGetColumnFamilyData(cfh, nullptr));
+    }
+  }
+
+  std::function<MultiGetColumnFamilyData*(
+      UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&)>
+      iter_deref_lambda =
+          [](UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&
+                 cf_iter) { return &cf_iter->second; };
+
+  bool unref_only =
+      MultiCFSnapshot<UnorderedMap<uint32_t, MultiGetColumnFamilyData>>(
+          read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+          &consistent_seqnum);
+
+  TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum1");
+  TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum2");
+
+  // Contain a list of merge operations if merge occurs.
+  MergeContext merge_context;
+
+  // Note: this always resizes the values array
+  values->resize(num_keys);
+  if (timestamps) {
+    timestamps->resize(num_keys);
+  }
+
+  // Keep track of bytes that we read for statistics-recording later
+  uint64_t bytes_read = 0;
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  size_t num_found = 0;
+  size_t keys_read;
+  uint64_t curr_value_size = 0;
+
+  GetWithTimestampReadCallback timestamp_read_callback(0);
+  ReadCallback* read_callback = nullptr;
+  if (read_options.timestamp && read_options.timestamp->size() > 0) {
+    timestamp_read_callback.Refresh(consistent_seqnum);
+    read_callback = &timestamp_read_callback;
+  }
+
+  for (keys_read = 0; keys_read < num_keys; ++keys_read) {
+    merge_context.Clear();
+    Status& s = stat_list[keys_read];
+    std::string* value = &(*values)[keys_read];
+    std::string* timestamp = timestamps ? &(*timestamps)[keys_read] : nullptr;
+
+    LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp);
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+        column_family[keys_read]);
+    SequenceNumber max_covering_tombstone_seq = 0;
+    auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
+    assert(mgd_iter != multiget_cf_data.end());
+    auto mgd = mgd_iter->second;
+    auto super_version = mgd.super_version;
+    bool skip_memtable =
+        (read_options.read_tier == kPersistedTier &&
+         has_unpersisted_data_.load(std::memory_order_relaxed));
+    bool done = false;
+    if (!skip_memtable) {
+      if (super_version->mem->Get(
+              lkey, value, /*columns=*/nullptr, timestamp, &s, &merge_context,
+              &max_covering_tombstone_seq, read_options,
+              false /* immutable_memtable */, read_callback)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      } else if (super_version->imm->Get(lkey, value, /*columns=*/nullptr,
+                                         timestamp, &s, &merge_context,
+                                         &max_covering_tombstone_seq,
+                                         read_options, read_callback)) {
+        done = true;
+        RecordTick(stats_, MEMTABLE_HIT);
+      }
+    }
+    if (!done) {
+      PinnableSlice pinnable_val;
+      PERF_TIMER_GUARD(get_from_output_files_time);
+      PinnedIteratorsManager pinned_iters_mgr;
+      super_version->current->Get(read_options, lkey, &pinnable_val,
+                                  /*columns=*/nullptr, timestamp, &s,
+                                  &merge_context, &max_covering_tombstone_seq,
+                                  &pinned_iters_mgr, /*value_found=*/nullptr,
+                                  /*key_exists=*/nullptr,
+                                  /*seq=*/nullptr, read_callback);
+      value->assign(pinnable_val.data(), pinnable_val.size());
+      RecordTick(stats_, MEMTABLE_MISS);
+    }
+
+    if (s.ok()) {
+      bytes_read += value->size();
+      num_found++;
+      curr_value_size += value->size();
+      if (curr_value_size > read_options.value_size_soft_limit) {
+        while (++keys_read < num_keys) {
+          stat_list[keys_read] = Status::Aborted();
+        }
+        break;
+      }
+    }
+    if (read_options.deadline.count() &&
+        immutable_db_options_.clock->NowMicros() >
+            static_cast<uint64_t>(read_options.deadline.count())) {
+      break;
+    }
+  }
+
+  if (keys_read < num_keys) {
+    // The only reason to break out of the loop is when the deadline is
+    // exceeded
+    assert(immutable_db_options_.clock->NowMicros() >
+           static_cast<uint64_t>(read_options.deadline.count()));
+    for (++keys_read; keys_read < num_keys; ++keys_read) {
+      stat_list[keys_read] = Status::TimedOut();
+    }
+  }
+
+  // Post processing (decrement reference counts and record statistics)
+  PERF_TIMER_GUARD(get_post_process_time);
+  autovector<SuperVersion*> superversions_to_delete;
+
+  for (auto mgd_iter : multiget_cf_data) {
+    auto mgd = mgd_iter.second;
+    if (!unref_only) {
+      ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
+    } else {
+      mgd.cfd->GetSuperVersion()->Unref();
+    }
+  }
+  RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+  RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+  RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+  PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+  PERF_TIMER_STOP(get_post_process_time);
+
+  return stat_list;
+}
+
+template <class T>
+bool DBImpl::MultiCFSnapshot(
+    const ReadOptions& read_options, ReadCallback* callback,
+    std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+        iter_deref_func,
+    T* cf_list, SequenceNumber* snapshot) {
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  bool last_try = false;
+  if (cf_list->size() == 1) {
+    // Fast path for a single column family. We can simply get the thread loca
+    // super version
+    auto cf_iter = cf_list->begin();
+    auto node = iter_deref_func(cf_iter);
+    node->super_version = GetAndRefSuperVersion(node->cfd);
+    if (read_options.snapshot != nullptr) {
+      // Note: In WritePrepared txns this is not necessary but not harmful
+      // either.  Because prep_seq > snapshot => commit_seq > snapshot so if
+      // a snapshot is specified we should be fine with skipping seq numbers
+      // that are greater than that.
+      //
+      // In WriteUnprepared, we cannot set snapshot in the lookup key because we
+      // may skip uncommitted data that should be visible to the transaction for
+      // reading own writes.
+      *snapshot =
+          static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+      if (callback) {
+        *snapshot = std::max(*snapshot, callback->max_visible_seq());
+      }
+    } else {
+      // Since we get and reference the super version before getting
+      // the snapshot number, without a mutex protection, it is possible
+      // that a memtable switch happened in the middle and not all the
+      // data for this snapshot is available. But it will contain all
+      // the data available in the super version we have, which is also
+      // a valid snapshot to read from.
+      // We shouldn't get snapshot before finding and referencing the super
+      // version because a flush happening in between may compact away data for
+      // the snapshot, but the snapshot is earlier than the data overwriting it,
+      // so users may see wrong results.
+      *snapshot = GetLastPublishedSequence();
+    }
+  } else {
+    // If we end up with the same issue of memtable geting sealed during 2
+    // consecutive retries, it means the write rate is very high. In that case
+    // its probably ok to take the mutex on the 3rd try so we can succeed for
+    // sure
+    constexpr int num_retries = 3;
+    for (int i = 0; i < num_retries; ++i) {
+      last_try = (i == num_retries - 1);
+      bool retry = false;
+
+      if (i > 0) {
+        for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+             ++cf_iter) {
+          auto node = iter_deref_func(cf_iter);
+          SuperVersion* super_version = node->super_version;
+          ColumnFamilyData* cfd = node->cfd;
+          if (super_version != nullptr) {
+            ReturnAndCleanupSuperVersion(cfd, super_version);
+          }
+          node->super_version = nullptr;
+        }
+      }
+      if (read_options.snapshot == nullptr) {
+        if (last_try) {
+          TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
+          // We're close to max number of retries. For the last retry,
+          // acquire the lock so we're sure to succeed
+          mutex_.Lock();
+        }
+        *snapshot = GetLastPublishedSequence();
+      } else {
+        *snapshot =
+            static_cast_with_check<const SnapshotImpl>(read_options.snapshot)
+                ->number_;
+      }
+      for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+           ++cf_iter) {
+        auto node = iter_deref_func(cf_iter);
+        if (!last_try) {
+          node->super_version = GetAndRefSuperVersion(node->cfd);
+        } else {
+          node->super_version = node->cfd->GetSuperVersion()->Ref();
+        }
+        TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
+        if (read_options.snapshot != nullptr || last_try) {
+          // If user passed a snapshot, then we don't care if a memtable is
+          // sealed or compaction happens because the snapshot would ensure
+          // that older key versions are kept around. If this is the last
+          // retry, then we have the lock so nothing bad can happen
+          continue;
+        }
+        // We could get the earliest sequence number for the whole list of
+        // memtables, which will include immutable memtables as well, but that
+        // might be tricky to maintain in case we decide, in future, to do
+        // memtable compaction.
+        if (!last_try) {
+          SequenceNumber seq =
+              node->super_version->mem->GetEarliestSequenceNumber();
+          if (seq > *snapshot) {
+            retry = true;
+            break;
+          }
+        }
+      }
+      if (!retry) {
+        if (last_try) {
+          mutex_.Unlock();
+        }
+        break;
+      }
+    }
+  }
+
+  // Keep track of bytes that we read for statistics-recording later
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  return last_try;
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+                      ColumnFamilyHandle** column_families, const Slice* keys,
+                      PinnableSlice* values, Status* statuses,
+                      const bool sorted_input) {
+  return MultiGet(read_options, num_keys, column_families, keys, values,
+                  /*timestamps=*/nullptr, statuses, sorted_input);
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+                      ColumnFamilyHandle** column_families, const Slice* keys,
+                      PinnableSlice* values, std::string* timestamps,
+                      Status* statuses, const bool sorted_input) {
+  if (num_keys == 0) {
+    return;
+  }
+
+  bool should_fail = false;
+  for (size_t i = 0; i < num_keys; ++i) {
+    ColumnFamilyHandle* cfh = column_families[i];
+    assert(cfh);
+    if (read_options.timestamp) {
+      statuses[i] = FailIfTsMismatchCf(cfh, *(read_options.timestamp),
+                                       /*ts_for_read=*/true);
+      if (!statuses[i].ok()) {
+        should_fail = true;
+      }
+    } else {
+      statuses[i] = FailIfCfHasTs(cfh);
+      if (!statuses[i].ok()) {
+        should_fail = true;
+      }
+    }
+  }
+  if (should_fail) {
+    for (size_t i = 0; i < num_keys; ++i) {
+      if (statuses[i].ok()) {
+        statuses[i] = Status::Incomplete(
+            "DB not queried due to invalid argument(s) in the same MultiGet");
+      }
+    }
+    return;
+  }
+
+  if (tracer_) {
+    // TODO: This mutex should be removed later, to improve performance when
+    // tracing is enabled.
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      // TODO: maybe handle the tracing status?
+      tracer_->MultiGet(num_keys, column_families, keys).PermitUncheckedError();
+    }
+  }
+
+  autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+  sorted_keys.resize(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    values[i].Reset();
+    key_context.emplace_back(column_families[i], keys[i], &values[i],
+                             timestamps ? &timestamps[i] : nullptr,
+                             &statuses[i]);
+  }
+  for (size_t i = 0; i < num_keys; ++i) {
+    sorted_keys[i] = &key_context[i];
+  }
+  PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+
+  autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
+      multiget_cf_data;
+  size_t cf_start = 0;
+  ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
+
+  for (size_t i = 0; i < num_keys; ++i) {
+    KeyContext* key_ctx = sorted_keys[i];
+    if (key_ctx->column_family != cf) {
+      multiget_cf_data.emplace_back(cf, cf_start, i - cf_start, nullptr);
+      cf_start = i;
+      cf = key_ctx->column_family;
+    }
+  }
+
+  multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
+
+  std::function<MultiGetColumnFamilyData*(
+      autovector<MultiGetColumnFamilyData,
+                 MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
+      iter_deref_lambda =
+          [](autovector<MultiGetColumnFamilyData,
+                        MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
+            return &(*cf_iter);
+          };
+
+  SequenceNumber consistent_seqnum;
+  bool unref_only = MultiCFSnapshot<
+      autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
+      read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+      &consistent_seqnum);
+
+  GetWithTimestampReadCallback timestamp_read_callback(0);
+  ReadCallback* read_callback = nullptr;
+  if (read_options.timestamp && read_options.timestamp->size() > 0) {
+    timestamp_read_callback.Refresh(consistent_seqnum);
+    read_callback = &timestamp_read_callback;
+  }
+
+  Status s;
+  auto cf_iter = multiget_cf_data.begin();
+  for (; cf_iter != multiget_cf_data.end(); ++cf_iter) {
+    s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys,
+                     &sorted_keys, cf_iter->super_version, consistent_seqnum,
+                     read_callback);
+    if (!s.ok()) {
+      break;
+    }
+  }
+  if (!s.ok()) {
+    assert(s.IsTimedOut() || s.IsAborted());
+    for (++cf_iter; cf_iter != multiget_cf_data.end(); ++cf_iter) {
+      for (size_t i = cf_iter->start; i < cf_iter->start + cf_iter->num_keys;
+           ++i) {
+        *sorted_keys[i]->s = s;
+      }
+    }
+  }
+
+  for (const auto& iter : multiget_cf_data) {
+    if (!unref_only) {
+      ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version);
+    } else {
+      iter.cfd->GetSuperVersion()->Unref();
+    }
+  }
+}
+
+namespace {
+// Order keys by CF ID, followed by key contents
+struct CompareKeyContext {
+  inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
+    ColumnFamilyHandleImpl* cfh =
+        static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+    uint32_t cfd_id1 = cfh->cfd()->GetID();
+    const Comparator* comparator = cfh->cfd()->user_comparator();
+    cfh = static_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+    uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+    if (cfd_id1 < cfd_id2) {
+      return true;
+    } else if (cfd_id1 > cfd_id2) {
+      return false;
+    }
+
+    // Both keys are from the same column family
+    int cmp = comparator->CompareWithoutTimestamp(
+        *(lhs->key), /*a_has_ts=*/false, *(rhs->key), /*b_has_ts=*/false);
+    if (cmp < 0) {
+      return true;
+    }
+    return false;
+  }
+};
+
+}  // anonymous namespace
+
+void DBImpl::PrepareMultiGetKeys(
+    size_t num_keys, bool sorted_input,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+  if (sorted_input) {
+#ifndef NDEBUG
+    assert(std::is_sorted(sorted_keys->begin(), sorted_keys->end(),
+                          CompareKeyContext()));
+#endif
+    return;
+  }
+
+  std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
+            CompareKeyContext());
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+                      ColumnFamilyHandle* column_family, const size_t num_keys,
+                      const Slice* keys, PinnableSlice* values,
+                      Status* statuses, const bool sorted_input) {
+  return MultiGet(read_options, column_family, num_keys, keys, values,
+                  /*timestamp=*/nullptr, statuses, sorted_input);
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+                      ColumnFamilyHandle* column_family, const size_t num_keys,
+                      const Slice* keys, PinnableSlice* values,
+                      std::string* timestamps, Status* statuses,
+                      const bool sorted_input) {
+  if (tracer_) {
+    // TODO: This mutex should be removed later, to improve performance when
+    // tracing is enabled.
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      // TODO: maybe handle the tracing status?
+      tracer_->MultiGet(num_keys, column_family, keys).PermitUncheckedError();
+    }
+  }
+  autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+  sorted_keys.resize(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    values[i].Reset();
+    key_context.emplace_back(column_family, keys[i], &values[i],
+                             timestamps ? &timestamps[i] : nullptr,
+                             &statuses[i]);
+  }
+  for (size_t i = 0; i < num_keys; ++i) {
+    sorted_keys[i] = &key_context[i];
+  }
+  PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+  MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
+}
+
+void DBImpl::MultiGetWithCallback(
+    const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+    ReadCallback* callback,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+  std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
+  multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
+  std::function<MultiGetColumnFamilyData*(
+      std::array<MultiGetColumnFamilyData, 1>::iterator&)>
+      iter_deref_lambda =
+          [](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
+            return &(*cf_iter);
+          };
+
+  size_t num_keys = sorted_keys->size();
+  SequenceNumber consistent_seqnum;
+  bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
+      read_options, callback, iter_deref_lambda, &multiget_cf_data,
+      &consistent_seqnum);
+#ifndef NDEBUG
+  assert(!unref_only);
+#else
+  // Silence unused variable warning
+  (void)unref_only;
+#endif  // NDEBUG
+
+  if (callback && read_options.snapshot == nullptr) {
+    // The unprep_seqs are not published for write unprepared, so it could be
+    // that max_visible_seq is larger. Seek to the std::max of the two.
+    // However, we still want our callback to contain the actual snapshot so
+    // that it can do the correct visibility filtering.
+    callback->Refresh(consistent_seqnum);
+
+    // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+    // max_visible_seq = max(max_visible_seq, snapshot)
+    //
+    // Currently, the commented out assert is broken by
+    // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+    // the regular transaction flow, then this special read callback would not
+    // be needed.
+    //
+    // assert(callback->max_visible_seq() >= snapshot);
+    consistent_seqnum = callback->max_visible_seq();
+  }
+
+  GetWithTimestampReadCallback timestamp_read_callback(0);
+  ReadCallback* read_callback = callback;
+  if (read_options.timestamp && read_options.timestamp->size() > 0) {
+    assert(!read_callback);  // timestamp with callback is not supported
+    timestamp_read_callback.Refresh(consistent_seqnum);
+    read_callback = &timestamp_read_callback;
+  }
+
+  Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys,
+                          multiget_cf_data[0].super_version, consistent_seqnum,
+                          read_callback);
+  assert(s.ok() || s.IsTimedOut() || s.IsAborted());
+  ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
+                               multiget_cf_data[0].super_version);
+}
+
+// The actual implementation of batched MultiGet. Parameters -
+// start_key - Index in the sorted_keys vector to start processing from
+// num_keys - Number of keys to lookup, starting with sorted_keys[start_key]
+// sorted_keys - The entire batch of sorted keys for this CF
+//
+// The per key status is returned in the KeyContext structures pointed to by
+// sorted_keys. An overall Status is also returned, with the only possible
+// values being Status::OK() and Status::TimedOut(). The latter indicates
+// that the call exceeded read_options.deadline
+Status DBImpl::MultiGetImpl(
+    const ReadOptions& read_options, size_t start_key, size_t num_keys,
+    autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+    SuperVersion* super_version, SequenceNumber snapshot,
+    ReadCallback* callback) {
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+  StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
+
+  assert(sorted_keys);
+  // Clear the timestamps for returning results so that we can distinguish
+  // between tombstone or key that has never been written
+  for (auto* kctx : *sorted_keys) {
+    assert(kctx);
+    if (kctx->timestamp) {
+      kctx->timestamp->clear();
+    }
+  }
+
+  // For each of the given keys, apply the entire "get" process as follows:
+  // First look in the memtable, then in the immutable memtable (if any).
+  // s is both in/out. When in, s could either be OK or MergeInProgress.
+  // merge_operands will contain the sequence of merges in the latter case.
+  size_t keys_left = num_keys;
+  Status s;
+  uint64_t curr_value_size = 0;
+  while (keys_left) {
+    if (read_options.deadline.count() &&
+        immutable_db_options_.clock->NowMicros() >
+            static_cast<uint64_t>(read_options.deadline.count())) {
+      s = Status::TimedOut();
+      break;
+    }
+
+    size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
+                            ? MultiGetContext::MAX_BATCH_SIZE
+                            : keys_left;
+    MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
+                        batch_size, snapshot, read_options, GetFileSystem(),
+                        stats_);
+    MultiGetRange range = ctx.GetMultiGetRange();
+    range.AddValueSize(curr_value_size);
+    bool lookup_current = false;
+
+    keys_left -= batch_size;
+    for (auto mget_iter = range.begin(); mget_iter != range.end();
+         ++mget_iter) {
+      mget_iter->merge_context.Clear();
+      *mget_iter->s = Status::OK();
+    }
+
+    bool skip_memtable =
+        (read_options.read_tier == kPersistedTier &&
+         has_unpersisted_data_.load(std::memory_order_relaxed));
+    if (!skip_memtable) {
+      super_version->mem->MultiGet(read_options, &range, callback,
+                                   false /* immutable_memtable */);
+      if (!range.empty()) {
+        super_version->imm->MultiGet(read_options, &range, callback);
+      }
+      if (!range.empty()) {
+        lookup_current = true;
+        uint64_t left = range.KeysLeft();
+        RecordTick(stats_, MEMTABLE_MISS, left);
+      }
+    }
+    if (lookup_current) {
+      PERF_TIMER_GUARD(get_from_output_files_time);
+      super_version->current->MultiGet(read_options, &range, callback);
+    }
+    curr_value_size = range.GetValueSize();
+    if (curr_value_size > read_options.value_size_soft_limit) {
+      s = Status::Aborted();
+      break;
+    }
+  }
+
+  // Post processing (decrement reference counts and record statistics)
+  PERF_TIMER_GUARD(get_post_process_time);
+  size_t num_found = 0;
+  uint64_t bytes_read = 0;
+  for (size_t i = start_key; i < start_key + num_keys - keys_left; ++i) {
+    KeyContext* key = (*sorted_keys)[i];
+    if (key->s->ok()) {
+      bytes_read += key->value->size();
+      num_found++;
+    }
+  }
+  if (keys_left) {
+    assert(s.IsTimedOut() || s.IsAborted());
+    for (size_t i = start_key + num_keys - keys_left; i < start_key + num_keys;
+         ++i) {
+      KeyContext* key = (*sorted_keys)[i];
+      *key->s = s;
+    }
+  }
+
+  RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+  RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+  RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+  RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+  PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+  PERF_TIMER_STOP(get_post_process_time);
+
+  return s;
+}
+
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                  const std::string& column_family,
+                                  ColumnFamilyHandle** handle) {
+  assert(handle != nullptr);
+  Status s = CreateColumnFamilyImpl(cf_options, column_family, handle);
+  if (s.ok()) {
+    s = WriteOptionsFile(true /*need_mutex_lock*/,
+                         true /*need_enter_write_thread*/);
+  }
+  return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+    const ColumnFamilyOptions& cf_options,
+    const std::vector<std::string>& column_family_names,
+    std::vector<ColumnFamilyHandle*>* handles) {
+  assert(handles != nullptr);
+  handles->clear();
+  size_t num_cf = column_family_names.size();
+  Status s;
+  bool success_once = false;
+  for (size_t i = 0; i < num_cf; i++) {
+    ColumnFamilyHandle* handle;
+    s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle);
+    if (!s.ok()) {
+      break;
+    }
+    handles->push_back(handle);
+    success_once = true;
+  }
+  if (success_once) {
+    Status persist_options_status = WriteOptionsFile(
+        true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+    if (s.ok() && !persist_options_status.ok()) {
+      s = persist_options_status;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles) {
+  assert(handles != nullptr);
+  handles->clear();
+  size_t num_cf = column_families.size();
+  Status s;
+  bool success_once = false;
+  for (size_t i = 0; i < num_cf; i++) {
+    ColumnFamilyHandle* handle;
+    s = CreateColumnFamilyImpl(column_families[i].options,
+                               column_families[i].name, &handle);
+    if (!s.ok()) {
+      break;
+    }
+    handles->push_back(handle);
+    success_once = true;
+  }
+  if (success_once) {
+    Status persist_options_status = WriteOptionsFile(
+        true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+    if (s.ok() && !persist_options_status.ok()) {
+      s = persist_options_status;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+                                      const std::string& column_family_name,
+                                      ColumnFamilyHandle** handle) {
+  Status s;
+  *handle = nullptr;
+
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  s = ColumnFamilyData::ValidateOptions(db_options, cf_options);
+  if (s.ok()) {
+    for (auto& cf_path : cf_options.cf_paths) {
+      s = env_->CreateDirIfMissing(cf_path.path);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+        nullptr) {
+      return Status::InvalidArgument("Column family already exists");
+    }
+    VersionEdit edit;
+    edit.AddColumnFamily(column_family_name);
+    uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+    edit.SetColumnFamily(new_id);
+    edit.SetLogNumber(logfile_number_);
+    edit.SetComparatorName(cf_options.comparator->Name());
+
+    // LogAndApply will both write the creation in MANIFEST and create
+    // ColumnFamilyData object
+    {  // write thread
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      // LogAndApply will both write the creation in MANIFEST and create
+      // ColumnFamilyData object
+      s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit,
+                                 &mutex_, directories_.GetDbDir(), false,
+                                 &cf_options);
+      write_thread_.ExitUnbatched(&w);
+    }
+    if (s.ok()) {
+      auto* cfd =
+          versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+      assert(cfd != nullptr);
+      std::map<std::string, std::shared_ptr<FSDirectory>> dummy_created_dirs;
+      s = cfd->AddDirectories(&dummy_created_dirs);
+    }
+    if (s.ok()) {
+      auto* cfd =
+          versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+      assert(cfd != nullptr);
+      InstallSuperVersionAndScheduleWork(cfd, &sv_context,
+                                         *cfd->GetLatestMutableCFOptions());
+
+      if (!cfd->mem()->IsSnapshotSupported()) {
+        is_snapshot_supported_ = false;
+      }
+
+      cfd->set_initialized();
+
+      *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Created column family [%s] (ID %u)",
+                     column_family_name.c_str(), (unsigned)cfd->GetID());
+    } else {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Creating column family [%s] FAILED -- %s",
+                      column_family_name.c_str(), s.ToString().c_str());
+    }
+  }  // InstrumentedMutexLock l(&mutex_)
+
+  if (cf_options.preserve_internal_time_seconds > 0 ||
+      cf_options.preclude_last_level_data_seconds > 0) {
+    s = RegisterRecordSeqnoTimeWorker();
+  }
+  sv_context.Clean();
+  // this is outside the mutex
+  if (s.ok()) {
+    NewThreadStatusCfInfo(
+        static_cast_with_check<ColumnFamilyHandleImpl>(*handle)->cfd());
+  }
+  return s;
+}
+
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+  assert(column_family != nullptr);
+  Status s = DropColumnFamilyImpl(column_family);
+  if (s.ok()) {
+    s = WriteOptionsFile(true /*need_mutex_lock*/,
+                         true /*need_enter_write_thread*/);
+  }
+  return s;
+}
+
+Status DBImpl::DropColumnFamilies(
+    const std::vector<ColumnFamilyHandle*>& column_families) {
+  Status s;
+  bool success_once = false;
+  for (auto* handle : column_families) {
+    s = DropColumnFamilyImpl(handle);
+    if (!s.ok()) {
+      break;
+    }
+    success_once = true;
+  }
+  if (success_once) {
+    Status persist_options_status = WriteOptionsFile(
+        true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+    if (s.ok() && !persist_options_status.ok()) {
+      s = persist_options_status;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  if (cfd->GetID() == 0) {
+    return Status::InvalidArgument("Can't drop default column family");
+  }
+
+  bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
+
+  VersionEdit edit;
+  edit.DropColumnFamily();
+  edit.SetColumnFamily(cfd->GetID());
+
+  Status s;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    if (cfd->IsDropped()) {
+      s = Status::InvalidArgument("Column family already dropped!\n");
+    }
+    if (s.ok()) {
+      // we drop column family from a single write thread
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
+                                 &mutex_, directories_.GetDbDir());
+      write_thread_.ExitUnbatched(&w);
+    }
+    if (s.ok()) {
+      auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+      max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
+                                    mutable_cf_options->max_write_buffer_number;
+    }
+
+    if (!cf_support_snapshot) {
+      // Dropped Column Family doesn't support snapshot. Need to recalculate
+      // is_snapshot_supported_.
+      bool new_is_snapshot_supported = true;
+      for (auto c : *versions_->GetColumnFamilySet()) {
+        if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
+          new_is_snapshot_supported = false;
+          break;
+        }
+      }
+      is_snapshot_supported_ = new_is_snapshot_supported;
+    }
+    bg_cv_.SignalAll();
+  }
+
+  if (cfd->ioptions()->preserve_internal_time_seconds > 0 ||
+      cfd->ioptions()->preclude_last_level_data_seconds > 0) {
+    s = RegisterRecordSeqnoTimeWorker();
+  }
+
+  if (s.ok()) {
+    // Note that here we erase the associated cf_info of the to-be-dropped
+    // cfd before its ref-count goes to zero to avoid having to erase cf_info
+    // later inside db_mutex.
+    EraseThreadStatusCfInfo(cfd);
+    assert(cfd->IsDropped());
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Dropped column family with id %u\n", cfd->GetID());
+  } else {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Dropping column family with id %u FAILED -- %s\n",
+                    cfd->GetID(), s.ToString().c_str());
+  }
+
+  return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         std::string* value, std::string* timestamp,
+                         bool* value_found) {
+  assert(value != nullptr);
+  if (value_found != nullptr) {
+    // falsify later if key-may-exist but can't fetch value
+    *value_found = true;
+  }
+  ReadOptions roptions = read_options;
+  roptions.read_tier = kBlockCacheTier;  // read from block cache only
+  PinnableSlice pinnable_val;
+  GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = &pinnable_val;
+  get_impl_options.value_found = value_found;
+  get_impl_options.timestamp = timestamp;
+  auto s = GetImpl(roptions, key, get_impl_options);
+  value->assign(pinnable_val.data(), pinnable_val.size());
+
+  // If block_cache is enabled and the index block of the table didn't
+  // not present in block_cache, the return value will be Status::Incomplete.
+  // In this case, key may still exist in the table.
+  return s.ok() || s.IsIncomplete();
+}
+
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
+                              ColumnFamilyHandle* column_family) {
+  if (read_options.managed) {
+    return NewErrorIterator(
+        Status::NotSupported("Managed iterator is not supported anymore."));
+  }
+  Iterator* result = nullptr;
+  if (read_options.read_tier == kPersistedTier) {
+    return NewErrorIterator(Status::NotSupported(
+        "ReadTier::kPersistedData is not yet supported in iterators."));
+  }
+
+  assert(column_family);
+
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  } else {
+    const Status s = FailIfCfHasTs(column_family);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  }
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+  assert(cfd != nullptr);
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+    // not supported in lite version
+    result = nullptr;
+
+#else
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+    auto iter = new ForwardIterator(this, read_options, cfd, sv,
+                                    /* allow_unprepared_value */ true);
+    result = NewDBIterator(
+        env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+        cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
+        sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback,
+        this, cfd);
+#endif
+  } else {
+    // Note: no need to consider the special case of
+    // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
+    // WritePreparedTxnDB
+    result = NewIteratorImpl(read_options, cfd,
+                             (read_options.snapshot != nullptr)
+                                 ? read_options.snapshot->GetSequenceNumber()
+                                 : kMaxSequenceNumber,
+                             read_callback);
+  }
+  return result;
+}
+
+ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
+                                            ColumnFamilyData* cfd,
+                                            SequenceNumber snapshot,
+                                            ReadCallback* read_callback,
+                                            bool expose_blob_index,
+                                            bool allow_refresh) {
+  SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+
+  TEST_SYNC_POINT("DBImpl::NewIterator:1");
+  TEST_SYNC_POINT("DBImpl::NewIterator:2");
+
+  if (snapshot == kMaxSequenceNumber) {
+    // Note that the snapshot is assigned AFTER referencing the super
+    // version because otherwise a flush happening in between may compact away
+    // data for the snapshot, so the reader would see neither data that was be
+    // visible to the snapshot before compaction nor the newer data inserted
+    // afterwards.
+    // Note that the super version might not contain all the data available
+    // to this snapshot, but in that case it can see all the data in the
+    // super version, which is a valid consistent state after the user
+    // calls NewIterator().
+    snapshot = versions_->LastSequence();
+    TEST_SYNC_POINT("DBImpl::NewIterator:3");
+    TEST_SYNC_POINT("DBImpl::NewIterator:4");
+  }
+
+  // Try to generate a DB iterator tree in continuous memory area to be
+  // cache friendly. Here is an example of result:
+  // +-------------------------------+
+  // |                               |
+  // | ArenaWrappedDBIter            |
+  // |  +                            |
+  // |  +---> Inner Iterator   ------------+
+  // |  |                            |     |
+  // |  |    +-- -- -- -- -- -- -- --+     |
+  // |  +--- | Arena                 |     |
+  // |       |                       |     |
+  // |          Allocated Memory:    |     |
+  // |       |   +-------------------+     |
+  // |       |   | DBIter            | <---+
+  // |           |  +                |
+  // |       |   |  +-> iter_  ------------+
+  // |       |   |                   |     |
+  // |       |   +-------------------+     |
+  // |       |   | MergingIterator   | <---+
+  // |           |  +                |
+  // |       |   |  +->child iter1  ------------+
+  // |       |   |  |                |          |
+  // |           |  +->child iter2  ----------+ |
+  // |       |   |  |                |        | |
+  // |       |   |  +->child iter3  --------+ | |
+  // |           |                   |      | | |
+  // |       |   +-------------------+      | | |
+  // |       |   | Iterator1         | <--------+
+  // |       |   +-------------------+      | |
+  // |       |   | Iterator2         | <------+
+  // |       |   +-------------------+      |
+  // |       |   | Iterator3         | <----+
+  // |       |   +-------------------+
+  // |       |                       |
+  // +-------+-----------------------+
+  //
+  // ArenaWrappedDBIter inlines an arena area where all the iterators in
+  // the iterator tree are allocated in the order of being accessed when
+  // querying.
+  // Laying out the iterators in the order of being accessed makes it more
+  // likely that any iterator pointer is close to the iterator it points to so
+  // that they are likely to be in the same cache line and/or page.
+  ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
+      env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current,
+      snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations,
+      sv->version_number, read_callback, this, cfd, expose_blob_index,
+      read_options.snapshot != nullptr ? false : allow_refresh);
+
+  InternalIterator* internal_iter = NewInternalIterator(
+      db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), snapshot,
+      /* allow_unprepared_value */ true, db_iter);
+  db_iter->SetIterUnderDBIter(internal_iter);
+
+  return db_iter;
+}
+
+Status DBImpl::NewIterators(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  if (read_options.managed) {
+    return Status::NotSupported("Managed iterator is not supported anymore.");
+  }
+  if (read_options.read_tier == kPersistedTier) {
+    return Status::NotSupported(
+        "ReadTier::kPersistedData is not yet supported in iterators.");
+  }
+
+  if (read_options.timestamp) {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+                                          /*ts_for_read=*/true);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  } else {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfCfHasTs(cf);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+    return Status::InvalidArgument(
+        "Tailing iterator not supported in RocksDB lite");
+#else
+    for (auto cfh : column_families) {
+      auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+      SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+      auto iter = new ForwardIterator(this, read_options, cfd, sv,
+                                      /* allow_unprepared_value */ true);
+      iterators->push_back(NewDBIterator(
+          env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+          cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
+          sv->mutable_cf_options.max_sequential_skip_in_iterations,
+          read_callback, this, cfd));
+    }
+#endif
+  } else {
+    // Note: no need to consider the special case of
+    // last_seq_same_as_publish_seq_==false since NewIterators is overridden in
+    // WritePreparedTxnDB
+    auto snapshot = read_options.snapshot != nullptr
+                        ? read_options.snapshot->GetSequenceNumber()
+                        : versions_->LastSequence();
+    for (size_t i = 0; i < column_families.size(); ++i) {
+      auto* cfd =
+          static_cast_with_check<ColumnFamilyHandleImpl>(column_families[i])
+              ->cfd();
+      iterators->push_back(
+          NewIteratorImpl(read_options, cfd, snapshot, read_callback));
+    }
+  }
+
+  return Status::OK();
+}
+
+const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); }
+
+#ifndef ROCKSDB_LITE
+const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() {
+  return GetSnapshotImpl(true);
+}
+#endif  // ROCKSDB_LITE
+
+std::pair<Status, std::shared_ptr<const Snapshot>>
+DBImpl::CreateTimestampedSnapshot(SequenceNumber snapshot_seq, uint64_t ts) {
+  assert(ts != std::numeric_limits<uint64_t>::max());
+
+  auto ret = CreateTimestampedSnapshotImpl(snapshot_seq, ts, /*lock=*/true);
+  return ret;
+}
+
+std::shared_ptr<const SnapshotImpl> DBImpl::GetTimestampedSnapshot(
+    uint64_t ts) const {
+  InstrumentedMutexLock lock_guard(&mutex_);
+  return timestamped_snapshots_.GetSnapshot(ts);
+}
+
+void DBImpl::ReleaseTimestampedSnapshotsOlderThan(uint64_t ts,
+                                                  size_t* remaining_total_ss) {
+  autovector<std::shared_ptr<const SnapshotImpl>> snapshots_to_release;
+  {
+    InstrumentedMutexLock lock_guard(&mutex_);
+    timestamped_snapshots_.ReleaseSnapshotsOlderThan(ts, snapshots_to_release);
+  }
+  snapshots_to_release.clear();
+
+  if (remaining_total_ss) {
+    InstrumentedMutexLock lock_guard(&mutex_);
+    *remaining_total_ss = static_cast<size_t>(snapshots_.count());
+  }
+}
+
+Status DBImpl::GetTimestampedSnapshots(
+    uint64_t ts_lb, uint64_t ts_ub,
+    std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots) const {
+  if (ts_lb >= ts_ub) {
+    return Status::InvalidArgument(
+        "timestamp lower bound must be smaller than upper bound");
+  }
+  timestamped_snapshots.clear();
+  InstrumentedMutexLock lock_guard(&mutex_);
+  timestamped_snapshots_.GetSnapshots(ts_lb, ts_ub, timestamped_snapshots);
+  return Status::OK();
+}
+
+SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
+                                      bool lock) {
+  int64_t unix_time = 0;
+  immutable_db_options_.clock->GetCurrentTime(&unix_time)
+      .PermitUncheckedError();  // Ignore error
+  SnapshotImpl* s = new SnapshotImpl;
+
+  if (lock) {
+    mutex_.Lock();
+  } else {
+    mutex_.AssertHeld();
+  }
+  // returns null if the underlying memtable does not support snapshot.
+  if (!is_snapshot_supported_) {
+    if (lock) {
+      mutex_.Unlock();
+    }
+    delete s;
+    return nullptr;
+  }
+  auto snapshot_seq = GetLastPublishedSequence();
+  SnapshotImpl* snapshot =
+      snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
+  if (lock) {
+    mutex_.Unlock();
+  }
+  return snapshot;
+}
+
+std::pair<Status, std::shared_ptr<const SnapshotImpl>>
+DBImpl::CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
+                                      bool lock) {
+  int64_t unix_time = 0;
+  immutable_db_options_.clock->GetCurrentTime(&unix_time)
+      .PermitUncheckedError();  // Ignore error
+  SnapshotImpl* s = new SnapshotImpl;
+
+  const bool need_update_seq = (snapshot_seq != kMaxSequenceNumber);
+
+  if (lock) {
+    mutex_.Lock();
+  } else {
+    mutex_.AssertHeld();
+  }
+  // returns null if the underlying memtable does not support snapshot.
+  if (!is_snapshot_supported_) {
+    if (lock) {
+      mutex_.Unlock();
+    }
+    delete s;
+    return std::make_pair(
+        Status::NotSupported("Memtable does not support snapshot"), nullptr);
+  }
+
+  // Caller is not write thread, thus didn't provide a valid snapshot_seq.
+  // Obtain seq from db.
+  if (!need_update_seq) {
+    snapshot_seq = GetLastPublishedSequence();
+  }
+
+  std::shared_ptr<const SnapshotImpl> latest =
+      timestamped_snapshots_.GetSnapshot(std::numeric_limits<uint64_t>::max());
+
+  // If there is already a latest timestamped snapshot, then we need to do some
+  // checks.
+  if (latest) {
+    uint64_t latest_snap_ts = latest->GetTimestamp();
+    SequenceNumber latest_snap_seq = latest->GetSequenceNumber();
+    assert(latest_snap_seq <= snapshot_seq);
+    bool needs_create_snap = true;
+    Status status;
+    std::shared_ptr<const SnapshotImpl> ret;
+    if (latest_snap_ts > ts) {
+      // A snapshot created later cannot have smaller timestamp than a previous
+      // timestamped snapshot.
+      needs_create_snap = false;
+      std::ostringstream oss;
+      oss << "snapshot exists with larger timestamp " << latest_snap_ts << " > "
+          << ts;
+      status = Status::InvalidArgument(oss.str());
+    } else if (latest_snap_ts == ts) {
+      if (latest_snap_seq == snapshot_seq) {
+        // We are requesting the same sequence number and timestamp, thus can
+        // safely reuse (share) the current latest timestamped snapshot.
+        needs_create_snap = false;
+        ret = latest;
+      } else if (latest_snap_seq < snapshot_seq) {
+        // There may have been writes to the database since the latest
+        // timestamped snapshot, yet we are still requesting the same
+        // timestamp. In this case, we cannot create the new timestamped
+        // snapshot.
+        needs_create_snap = false;
+        std::ostringstream oss;
+        oss << "Allocated seq is " << snapshot_seq
+            << ", while snapshot exists with smaller seq " << latest_snap_seq
+            << " but same timestamp " << ts;
+        status = Status::InvalidArgument(oss.str());
+      }
+    }
+    if (!needs_create_snap) {
+      if (lock) {
+        mutex_.Unlock();
+      }
+      delete s;
+      return std::make_pair(status, ret);
+    } else {
+      status.PermitUncheckedError();
+    }
+  }
+
+  SnapshotImpl* snapshot =
+      snapshots_.New(s, snapshot_seq, unix_time,
+                     /*is_write_conflict_boundary=*/true, ts);
+
+  std::shared_ptr<const SnapshotImpl> ret(
+      snapshot,
+      std::bind(&DBImpl::ReleaseSnapshot, this, std::placeholders::_1));
+  timestamped_snapshots_.AddSnapshot(ret);
+
+  // Caller is from write thread, and we need to update database's sequence
+  // number.
+  if (need_update_seq) {
+    assert(versions_);
+    if (last_seq_same_as_publish_seq_) {
+      versions_->SetLastSequence(snapshot_seq);
+    } else {
+      // TODO: support write-prepared/write-unprepared transactions with two
+      // write queues.
+      assert(false);
+    }
+  }
+
+  if (lock) {
+    mutex_.Unlock();
+  }
+  return std::make_pair(Status::OK(), ret);
+}
+
+namespace {
+using CfdList = autovector<ColumnFamilyData*, 2>;
+bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
+  for (const ColumnFamilyData* t : list) {
+    if (t == cfd) {
+      return true;
+    }
+  }
+  return false;
+}
+}  //  namespace
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  if (s == nullptr) {
+    // DBImpl::GetSnapshot() can return nullptr when snapshot
+    // not supported by specifying the condition:
+    // inplace_update_support enabled.
+    return;
+  }
+  const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    snapshots_.Delete(casted_s);
+    uint64_t oldest_snapshot;
+    if (snapshots_.empty()) {
+      oldest_snapshot = GetLastPublishedSequence();
+    } else {
+      oldest_snapshot = snapshots_.oldest()->number_;
+    }
+    // Avoid to go through every column family by checking a global threshold
+    // first.
+    if (oldest_snapshot > bottommost_files_mark_threshold_) {
+      CfdList cf_scheduled;
+      for (auto* cfd : *versions_->GetColumnFamilySet()) {
+        if (!cfd->ioptions()->allow_ingest_behind) {
+          cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
+          if (!cfd->current()
+                   ->storage_info()
+                   ->BottommostFilesMarkedForCompaction()
+                   .empty()) {
+            SchedulePendingCompaction(cfd);
+            MaybeScheduleFlushOrCompaction();
+            cf_scheduled.push_back(cfd);
+          }
+        }
+      }
+
+      // Calculate a new threshold, skipping those CFs where compactions are
+      // scheduled. We do not do the same pass as the previous loop because
+      // mutex might be unlocked during the loop, making the result inaccurate.
+      SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
+      for (auto* cfd : *versions_->GetColumnFamilySet()) {
+        if (CfdListContains(cf_scheduled, cfd) ||
+            cfd->ioptions()->allow_ingest_behind) {
+          continue;
+        }
+        new_bottommost_files_mark_threshold = std::min(
+            new_bottommost_files_mark_threshold,
+            cfd->current()->storage_info()->bottommost_files_mark_threshold());
+      }
+      bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
+    }
+  }
+  delete casted_s;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                        TablePropertiesCollection* props) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+
+  // Increment the ref count
+  mutex_.Lock();
+  auto version = cfd->current();
+  version->Ref();
+  mutex_.Unlock();
+
+  auto s = version->GetPropertiesOfAllTables(props);
+
+  // Decrement the ref count
+  mutex_.Lock();
+  version->Unref();
+  mutex_.Unlock();
+
+  return s;
+}
+
+Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
+                                            const Range* range, std::size_t n,
+                                            TablePropertiesCollection* props) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+
+  // Increment the ref count
+  mutex_.Lock();
+  auto version = cfd->current();
+  version->Ref();
+  mutex_.Unlock();
+
+  auto s = version->GetPropertiesOfTablesInRange(range, n, props);
+
+  // Decrement the ref count
+  mutex_.Lock();
+  version->Unref();
+  mutex_.Unlock();
+
+  return s;
+}
+
+#endif  // ROCKSDB_LITE
+
+const std::string& DBImpl::GetName() const { return dbname_; }
+
+Env* DBImpl::GetEnv() const { return env_; }
+
+FileSystem* DB::GetFileSystem() const {
+  const auto& fs = GetEnv()->GetFileSystem();
+  return fs.get();
+}
+
+FileSystem* DBImpl::GetFileSystem() const {
+  return immutable_db_options_.fs.get();
+}
+
+SystemClock* DBImpl::GetSystemClock() const {
+  return immutable_db_options_.clock;
+}
+
+#ifndef ROCKSDB_LITE
+
+Status DBImpl::StartIOTrace(const TraceOptions& trace_options,
+                            std::unique_ptr<TraceWriter>&& trace_writer) {
+  assert(trace_writer != nullptr);
+  return io_tracer_->StartIOTrace(GetSystemClock(), trace_options,
+                                  std::move(trace_writer));
+}
+
+Status DBImpl::EndIOTrace() {
+  io_tracer_->EndIOTrace();
+  return Status::OK();
+}
+
+#endif  // ROCKSDB_LITE
+
+Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
+  InstrumentedMutexLock l(&mutex_);
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+                 cfh->cfd()->GetLatestCFOptions());
+}
+
+DBOptions DBImpl::GetDBOptions() const {
+  InstrumentedMutexLock l(&mutex_);
+  return BuildDBOptions(immutable_db_options_, mutable_db_options_);
+}
+
+bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
+                         const Slice& property, std::string* value) {
+  const DBPropertyInfo* property_info = GetPropertyInfo(property);
+  value->clear();
+  auto cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  if (property_info == nullptr) {
+    return false;
+  } else if (property_info->handle_int) {
+    uint64_t int_value;
+    bool ret_value =
+        GetIntPropertyInternal(cfd, *property_info, false, &int_value);
+    if (ret_value) {
+      *value = std::to_string(int_value);
+    }
+    return ret_value;
+  } else if (property_info->handle_string) {
+    if (property_info->need_out_of_mutex) {
+      return cfd->internal_stats()->GetStringProperty(*property_info, property,
+                                                      value);
+    } else {
+      InstrumentedMutexLock l(&mutex_);
+      return cfd->internal_stats()->GetStringProperty(*property_info, property,
+                                                      value);
+    }
+  } else if (property_info->handle_string_dbimpl) {
+    if (property_info->need_out_of_mutex) {
+      return (this->*(property_info->handle_string_dbimpl))(value);
+    } else {
+      InstrumentedMutexLock l(&mutex_);
+      return (this->*(property_info->handle_string_dbimpl))(value);
+    }
+  }
+  // Shouldn't reach here since exactly one of handle_string and handle_int
+  // should be non-nullptr.
+  assert(false);
+  return false;
+}
+
+bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family,
+                            const Slice& property,
+                            std::map<std::string, std::string>* value) {
+  const DBPropertyInfo* property_info = GetPropertyInfo(property);
+  value->clear();
+  auto cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  if (property_info == nullptr) {
+    return false;
+  } else if (property_info->handle_map) {
+    if (property_info->need_out_of_mutex) {
+      return cfd->internal_stats()->GetMapProperty(*property_info, property,
+                                                   value);
+    } else {
+      InstrumentedMutexLock l(&mutex_);
+      return cfd->internal_stats()->GetMapProperty(*property_info, property,
+                                                   value);
+    }
+  }
+  // If we reach this point it means that handle_map is not provided for the
+  // requested property
+  return false;
+}
+
+bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
+                            const Slice& property, uint64_t* value) {
+  const DBPropertyInfo* property_info = GetPropertyInfo(property);
+  if (property_info == nullptr || property_info->handle_int == nullptr) {
+    return false;
+  }
+  auto cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  return GetIntPropertyInternal(cfd, *property_info, false, value);
+}
+
+bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
+                                    const DBPropertyInfo& property_info,
+                                    bool is_locked, uint64_t* value) {
+  assert(property_info.handle_int != nullptr);
+  if (!property_info.need_out_of_mutex) {
+    if (is_locked) {
+      mutex_.AssertHeld();
+      return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+    } else {
+      InstrumentedMutexLock l(&mutex_);
+      return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+    }
+  } else {
+    SuperVersion* sv = nullptr;
+    if (is_locked) {
+      mutex_.Unlock();
+    }
+    sv = GetAndRefSuperVersion(cfd);
+
+    bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
+        property_info, sv->current, value);
+
+    ReturnAndCleanupSuperVersion(cfd, sv);
+    if (is_locked) {
+      mutex_.Lock();
+    }
+
+    return ret;
+  }
+}
+
+bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) {
+  assert(value != nullptr);
+  Statistics* statistics = immutable_db_options_.stats;
+  if (!statistics) {
+    return false;
+  }
+  *value = statistics->ToString();
+  return true;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::ResetStats() {
+  InstrumentedMutexLock l(&mutex_);
+  for (auto* cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->initialized()) {
+      cfd->internal_stats()->Clear();
+    }
+  }
+  return Status::OK();
+}
+#endif  // ROCKSDB_LITE
+
+bool DBImpl::GetAggregatedIntProperty(const Slice& property,
+                                      uint64_t* aggregated_value) {
+  const DBPropertyInfo* property_info = GetPropertyInfo(property);
+  if (property_info == nullptr || property_info->handle_int == nullptr) {
+    return false;
+  }
+
+  uint64_t sum = 0;
+  bool ret = true;
+  {
+    // Needs mutex to protect the list of column families.
+    InstrumentedMutexLock l(&mutex_);
+    uint64_t value;
+    for (auto* cfd : versions_->GetRefedColumnFamilySet()) {
+      if (!cfd->initialized()) {
+        continue;
+      }
+      ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
+      // GetIntPropertyInternal may release db mutex and re-acquire it.
+      mutex_.AssertHeld();
+      if (ret) {
+        sum += value;
+      } else {
+        ret = false;
+        break;
+      }
+    }
+  }
+  *aggregated_value = sum;
+  return ret;
+}
+
+SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
+  // TODO(ljin): consider using GetReferencedSuperVersion() directly
+  return cfd->GetThreadLocalSuperVersion(this);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
+  auto column_family_set = versions_->GetColumnFamilySet();
+  auto cfd = column_family_set->GetColumnFamily(column_family_id);
+  if (!cfd) {
+    return nullptr;
+  }
+
+  return GetAndRefSuperVersion(cfd);
+}
+
+void DBImpl::CleanupSuperVersion(SuperVersion* sv) {
+  // Release SuperVersion
+  if (sv->Unref()) {
+    bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+    {
+      InstrumentedMutexLock l(&mutex_);
+      sv->Cleanup();
+      if (defer_purge) {
+        AddSuperVersionsToFreeQueue(sv);
+        SchedulePurge();
+      }
+    }
+    if (!defer_purge) {
+      delete sv;
+    }
+    RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
+  }
+  RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
+}
+
+void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
+                                          SuperVersion* sv) {
+  if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
+    CleanupSuperVersion(sv);
+  }
+}
+
+// REQUIRED: this function should only be called on the write thread.
+void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+                                          SuperVersion* sv) {
+  auto column_family_set = versions_->GetColumnFamilySet();
+  auto cfd = column_family_set->GetColumnFamily(column_family_id);
+
+  // If SuperVersion is held, and we successfully fetched a cfd using
+  // GetAndRefSuperVersion(), it must still exist.
+  assert(cfd != nullptr);
+  ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
+  ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+  if (!cf_memtables->Seek(column_family_id)) {
+    return nullptr;
+  }
+
+  return cf_memtables->GetColumnFamilyHandle();
+}
+
+// REQUIRED: mutex is NOT held.
+std::unique_ptr<ColumnFamilyHandle> DBImpl::GetColumnFamilyHandleUnlocked(
+    uint32_t column_family_id) {
+  InstrumentedMutexLock l(&mutex_);
+
+  auto* cfd =
+      versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
+  if (cfd == nullptr) {
+    return nullptr;
+  }
+
+  return std::unique_ptr<ColumnFamilyHandleImpl>(
+      new ColumnFamilyHandleImpl(cfd, this, &mutex_));
+}
+
+void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+                                         const Range& range,
+                                         uint64_t* const count,
+                                         uint64_t* const size) {
+  ColumnFamilyHandleImpl* cfh =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+  // Convert user_key into a corresponding internal key.
+  InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek);
+  InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek);
+  MemTable::MemTableStats memStats =
+      sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
+  MemTable::MemTableStats immStats =
+      sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
+  *count = memStats.count + immStats.count;
+  *size = memStats.size + immStats.size;
+
+  ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
+                                   ColumnFamilyHandle* column_family,
+                                   const Range* range, int n, uint64_t* sizes) {
+  if (!options.include_memtables && !options.include_files) {
+    return Status::InvalidArgument("Invalid options");
+  }
+
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+
+  Version* v;
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+  v = sv->current;
+
+  for (int i = 0; i < n; i++) {
+    Slice start = range[i].start;
+    Slice limit = range[i].limit;
+
+    // Add timestamp if needed
+    std::string start_with_ts, limit_with_ts;
+    if (ts_sz > 0) {
+      // Maximum timestamp means including all key with any timestamp
+      AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz);
+      // Append a maximum timestamp as the range limit is exclusive:
+      // [start, limit)
+      AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz);
+      start = start_with_ts;
+      limit = limit_with_ts;
+    }
+    // Convert user_key into a corresponding internal key.
+    InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek);
+    sizes[i] = 0;
+    if (options.include_files) {
+      sizes[i] += versions_->ApproximateSize(
+          options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
+          /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
+    }
+    if (options.include_memtables) {
+      sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
+      sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
+    }
+  }
+
+  ReturnAndCleanupSuperVersion(cfd, sv);
+  return Status::OK();
+}
+
+std::list<uint64_t>::iterator
+DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
+  // We need to remember the iterator of our insert, because after the
+  // background job is done, we need to remove that element from
+  // pending_outputs_.
+  pending_outputs_.push_back(versions_->current_next_file_number());
+  auto pending_outputs_inserted_elem = pending_outputs_.end();
+  --pending_outputs_inserted_elem;
+  return pending_outputs_inserted_elem;
+}
+
+void DBImpl::ReleaseFileNumberFromPendingOutputs(
+    std::unique_ptr<std::list<uint64_t>::iterator>& v) {
+  if (v.get() != nullptr) {
+    pending_outputs_.erase(*v.get());
+    v.reset();
+  }
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetUpdatesSince(
+    SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions& read_options) {
+  RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
+  if (seq_per_batch_) {
+    return Status::NotSupported(
+        "This API is not yet compatible with write-prepared/write-unprepared "
+        "transactions");
+  }
+  if (seq > versions_->LastSequence()) {
+    return Status::NotFound("Requested sequence not yet written in the db");
+  }
+  return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
+}
+
+Status DBImpl::DeleteFile(std::string name) {
+  uint64_t number;
+  FileType type;
+  WalFileType log_type;
+  if (!ParseFileName(name, &number, &type, &log_type) ||
+      (type != kTableFile && type != kWalFile)) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
+                    name.c_str());
+    return Status::InvalidArgument("Invalid file name");
+  }
+
+  if (type == kWalFile) {
+    // Only allow deleting archived log files
+    if (log_type != kArchivedLogFile) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "DeleteFile %s failed - not archived log.\n",
+                      name.c_str());
+      return Status::NotSupported("Delete only supported for archived logs");
+    }
+    Status status = wal_manager_.DeleteFile(name, number);
+    if (!status.ok()) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "DeleteFile %s failed -- %s.\n", name.c_str(),
+                      status.ToString().c_str());
+    }
+    return status;
+  }
+
+  Status status;
+  int level;
+  FileMetaData* metadata;
+  ColumnFamilyData* cfd;
+  VersionEdit edit;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "DeleteFile %s failed. File not found\n", name.c_str());
+      job_context.Clean();
+      return Status::InvalidArgument("File not found");
+    }
+    assert(level < cfd->NumberLevels());
+
+    // If the file is being compacted no need to delete.
+    if (metadata->being_compacted) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "DeleteFile %s Skipped. File about to be compacted\n",
+                     name.c_str());
+      job_context.Clean();
+      return Status::OK();
+    }
+
+    // Only the files in the last level can be deleted externally.
+    // This is to make sure that any deletion tombstones are not
+    // lost. Check that the level passed is the last level.
+    auto* vstoreage = cfd->current()->storage_info();
+    for (int i = level + 1; i < cfd->NumberLevels(); i++) {
+      if (vstoreage->NumLevelFiles(i) != 0) {
+        ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                       "DeleteFile %s FAILED. File not in last level\n",
+                       name.c_str());
+        job_context.Clean();
+        return Status::InvalidArgument("File not in last level");
+      }
+    }
+    // if level == 0, it has to be the oldest file
+    if (level == 0 &&
+        vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "DeleteFile %s failed ---"
+                     " target file in level 0 must be the oldest.",
+                     name.c_str());
+      job_context.Clean();
+      return Status::InvalidArgument("File in level 0, but not oldest");
+    }
+    edit.SetColumnFamily(cfd->GetID());
+    edit.DeleteFile(level, number);
+    status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(cfd,
+                                         &job_context.superversion_contexts[0],
+                                         *cfd->GetLatestMutableCFOptions());
+    }
+    FindObsoleteFiles(&job_context, false);
+  }  // lock released here
+
+  LogFlush(immutable_db_options_.info_log);
+  // remove files outside the db-lock
+  if (job_context.HaveSomethingToDelete()) {
+    // Call PurgeObsoleteFiles() without holding mutex.
+    PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+  return status;
+}
+
+Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+                                   const RangePtr* ranges, size_t n,
+                                   bool include_end) {
+  Status status = Status::OK();
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+  VersionEdit edit;
+  std::set<FileMetaData*> deleted_files;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    Version* input_version = cfd->current();
+
+    auto* vstorage = input_version->storage_info();
+    for (size_t r = 0; r < n; r++) {
+      auto begin = ranges[r].start, end = ranges[r].limit;
+      for (int i = 1; i < cfd->NumberLevels(); i++) {
+        if (vstorage->LevelFiles(i).empty() ||
+            !vstorage->OverlapInLevel(i, begin, end)) {
+          continue;
+        }
+        std::vector<FileMetaData*> level_files;
+        InternalKey begin_storage, end_storage, *begin_key, *end_key;
+        if (begin == nullptr) {
+          begin_key = nullptr;
+        } else {
+          begin_storage.SetMinPossibleForUserKey(*begin);
+          begin_key = &begin_storage;
+        }
+        if (end == nullptr) {
+          end_key = nullptr;
+        } else {
+          end_storage.SetMaxPossibleForUserKey(*end);
+          end_key = &end_storage;
+        }
+
+        vstorage->GetCleanInputsWithinInterval(
+            i, begin_key, end_key, &level_files, -1 /* hint_index */,
+            nullptr /* file_index */);
+        FileMetaData* level_file;
+        for (uint32_t j = 0; j < level_files.size(); j++) {
+          level_file = level_files[j];
+          if (level_file->being_compacted) {
+            continue;
+          }
+          if (deleted_files.find(level_file) != deleted_files.end()) {
+            continue;
+          }
+          if (!include_end && end != nullptr &&
+              cfd->user_comparator()->Compare(level_file->largest.user_key(),
+                                              *end) == 0) {
+            continue;
+          }
+          edit.SetColumnFamily(cfd->GetID());
+          edit.DeleteFile(i, level_file->fd.GetNumber());
+          deleted_files.insert(level_file);
+          level_file->being_compacted = true;
+        }
+        vstorage->ComputeCompactionScore(*cfd->ioptions(),
+                                         *cfd->GetLatestMutableCFOptions());
+      }
+    }
+    if (edit.GetDeletedFiles().empty()) {
+      job_context.Clean();
+      return status;
+    }
+    input_version->Ref();
+    status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(cfd,
+                                         &job_context.superversion_contexts[0],
+                                         *cfd->GetLatestMutableCFOptions());
+    }
+    for (auto* deleted_file : deleted_files) {
+      deleted_file->being_compacted = false;
+    }
+    input_version->Unref();
+    FindObsoleteFiles(&job_context, false);
+  }  // lock released here
+
+  LogFlush(immutable_db_options_.info_log);
+  // remove files outside the db-lock
+  if (job_context.HaveSomethingToDelete()) {
+    // Call PurgeObsoleteFiles() without holding mutex.
+    PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+  return status;
+}
+
+void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+  InstrumentedMutexLock l(&mutex_);
+  versions_->GetLiveFilesMetaData(metadata);
+}
+
+Status DBImpl::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+  InstrumentedMutexLock l(&mutex_);
+  return versions_->GetLiveFilesChecksumInfo(checksum_list);
+}
+
+void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+                                     ColumnFamilyMetaData* cf_meta) {
+  assert(column_family);
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  auto* sv = GetAndRefSuperVersion(cfd);
+  {
+    // Without mutex, Version::GetColumnFamilyMetaData will have data race with
+    // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but
+    // this may cause regression. An alternative is to make
+    // FileMetaData::being_compacted atomic, but it will make FileMetaData
+    // non-copy-able. Another option is to separate these variables from
+    // original FileMetaData struct, and this requires re-organization of data
+    // structures. For now, we take the easy approach. If
+    // DB::GetColumnFamilyMetaData is not called frequently, the regression
+    // should not be big. We still need to keep an eye on it.
+    InstrumentedMutexLock l(&mutex_);
+    sv->current->GetColumnFamilyMetaData(cf_meta);
+  }
+  ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+void DBImpl::GetAllColumnFamilyMetaData(
+    std::vector<ColumnFamilyMetaData>* metadata) {
+  InstrumentedMutexLock l(&mutex_);
+  for (auto cfd : *(versions_->GetColumnFamilySet())) {
+    {
+      metadata->emplace_back();
+      cfd->current()->GetColumnFamilyMetaData(&metadata->back());
+    }
+  }
+}
+
+#endif  // ROCKSDB_LITE
+
+Status DBImpl::CheckConsistency() {
+  mutex_.AssertHeld();
+  std::vector<LiveFileMetaData> metadata;
+  versions_->GetLiveFilesMetaData(&metadata);
+  TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
+
+  std::string corruption_messages;
+
+  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+    // Instead of calling GetFileSize() for each expected file, call
+    // GetChildren() for the DB directory and check that all expected files
+    // are listed, without checking their sizes.
+    // Since sst files might be in different directories, do it for each
+    // directory separately.
+    std::map<std::string, std::vector<std::string>> files_by_directory;
+    for (const auto& md : metadata) {
+      // md.name has a leading "/". Remove it.
+      std::string fname = md.name;
+      if (!fname.empty() && fname[0] == '/') {
+        fname = fname.substr(1);
+      }
+      files_by_directory[md.db_path].push_back(fname);
+    }
+
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    for (const auto& dir_files : files_by_directory) {
+      std::string directory = dir_files.first;
+      std::vector<std::string> existing_files;
+      Status s = fs_->GetChildren(directory, io_opts, &existing_files,
+                                  /*IODebugContext*=*/nullptr);
+      if (!s.ok()) {
+        corruption_messages +=
+            "Can't list files in " + directory + ": " + s.ToString() + "\n";
+        continue;
+      }
+      std::sort(existing_files.begin(), existing_files.end());
+
+      for (const std::string& fname : dir_files.second) {
+        if (!std::binary_search(existing_files.begin(), existing_files.end(),
+                                fname) &&
+            !std::binary_search(existing_files.begin(), existing_files.end(),
+                                Rocks2LevelTableFileName(fname))) {
+          corruption_messages +=
+              "Missing sst file " + fname + " in " + directory + "\n";
+        }
+      }
+    }
+  } else {
+    for (const auto& md : metadata) {
+      // md.name has a leading "/".
+      std::string file_path = md.db_path + md.name;
+
+      uint64_t fsize = 0;
+      TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
+      Status s = env_->GetFileSize(file_path, &fsize);
+      if (!s.ok() &&
+          env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
+        s = Status::OK();
+      }
+      if (!s.ok()) {
+        corruption_messages +=
+            "Can't access " + md.name + ": " + s.ToString() + "\n";
+      } else if (fsize != md.size) {
+        corruption_messages += "Sst file size mismatch: " + file_path +
+                               ". Size recorded in manifest " +
+                               std::to_string(md.size) + ", actual size " +
+                               std::to_string(fsize) + "\n";
+      }
+    }
+  }
+
+  if (corruption_messages.size() == 0) {
+    return Status::OK();
+  } else {
+    return Status::Corruption(corruption_messages);
+  }
+}
+
+Status DBImpl::GetDbIdentity(std::string& identity) const {
+  identity.assign(db_id_);
+  return Status::OK();
+}
+
+Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const {
+  std::string idfilename = IdentityFileName(dbname_);
+  const FileOptions soptions;
+
+  Status s = ReadFileToString(fs_.get(), idfilename, identity);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // If last character is '\n' remove it from identity. (Old implementations
+  // of Env::GenerateUniqueId() would include a trailing '\n'.)
+  if (identity->size() > 0 && identity->back() == '\n') {
+    identity->pop_back();
+  }
+  return s;
+}
+
+Status DBImpl::GetDbSessionId(std::string& session_id) const {
+  session_id.assign(db_session_id_);
+  return Status::OK();
+}
+
+namespace {
+SemiStructuredUniqueIdGen* DbSessionIdGen() {
+  static SemiStructuredUniqueIdGen gen;
+  return &gen;
+}
+}  // namespace
+
+void DBImpl::TEST_ResetDbSessionIdGen() { DbSessionIdGen()->Reset(); }
+
+std::string DBImpl::GenerateDbSessionId(Env*) {
+  // See SemiStructuredUniqueIdGen for its desirable properties.
+  auto gen = DbSessionIdGen();
+
+  uint64_t lo, hi;
+  gen->GenerateNext(&hi, &lo);
+  if (lo == 0) {
+    // Avoid emitting session ID with lo==0, so that SST unique
+    // IDs can be more easily ensured non-zero
+    gen->GenerateNext(&hi, &lo);
+    assert(lo != 0);
+  }
+  return EncodeSessionId(hi, lo);
+}
+
+void DBImpl::SetDbSessionId() {
+  db_session_id_ = GenerateDbSessionId(env_);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::SetDbSessionId", &db_session_id_);
+}
+
+// Default implementation -- returns not supported status
+Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
+                              const std::string& /*column_family_name*/,
+                              ColumnFamilyHandle** /*handle*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+    const ColumnFamilyOptions& /*cf_options*/,
+    const std::vector<std::string>& /*column_family_names*/,
+    std::vector<ColumnFamilyHandle*>* /*handles*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+    const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+    std::vector<ColumnFamilyHandle*>* /*handles*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamilies(
+    const std::vector<ColumnFamilyHandle*>& /*column_families*/) {
+  return Status::NotSupported("");
+}
+
+Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) {
+  if (DefaultColumnFamily() == column_family) {
+    return Status::InvalidArgument(
+        "Cannot destroy the handle returned by DefaultColumnFamily()");
+  }
+  delete column_family;
+  return Status::OK();
+}
+
+DB::~DB() {}
+
+Status DBImpl::Close() {
+  InstrumentedMutexLock closing_lock_guard(&closing_mutex_);
+  if (closed_) {
+    return closing_status_;
+  }
+
+  {
+    const Status s = MaybeReleaseTimestampedSnapshotsAndCheck();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  closing_status_ = CloseImpl();
+  closed_ = true;
+  return closing_status_;
+}
+
+Status DB::ListColumnFamilies(const DBOptions& db_options,
+                              const std::string& name,
+                              std::vector<std::string>* column_families) {
+  const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
+  return VersionSet::ListColumnFamilies(column_families, name, fs.get());
+}
+
+Snapshot::~Snapshot() {}
+
+Status DestroyDB(const std::string& dbname, const Options& options,
+                 const std::vector<ColumnFamilyDescriptor>& column_families) {
+  ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+  Env* env = soptions.env;
+  std::vector<std::string> filenames;
+  bool wal_in_db_path = soptions.IsWalDirSameAsDBPath();
+
+  // Reset the logger because it holds a handle to the
+  // log file and prevents cleanup and directory removal
+  soptions.info_log.reset();
+  IOOptions io_opts;
+  // Ignore error in case directory does not exist
+  soptions.fs
+      ->GetChildren(dbname, io_opts, &filenames,
+                    /*IODebugContext*=*/nullptr)
+      .PermitUncheckedError();
+
+  FileLock* lock;
+  const std::string lockname = LockFileName(dbname);
+  Status result = env->LockFile(lockname, &lock);
+  if (result.ok()) {
+    uint64_t number;
+    FileType type;
+    InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname);
+    for (const auto& fname : filenames) {
+      if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) &&
+          type != kDBLockFile) {  // Lock file will be deleted at end
+        Status del;
+        std::string path_to_delete = dbname + "/" + fname;
+        if (type == kMetaDatabase) {
+          del = DestroyDB(path_to_delete, options);
+        } else if (type == kTableFile || type == kWalFile ||
+                   type == kBlobFile) {
+          del = DeleteDBFile(
+              &soptions, path_to_delete, dbname,
+              /*force_bg=*/false,
+              /*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false);
+        } else {
+          del = env->DeleteFile(path_to_delete);
+        }
+        if (!del.ok() && result.ok()) {
+          result = del;
+        }
+      }
+    }
+
+    std::set<std::string> paths;
+    for (const DbPath& db_path : options.db_paths) {
+      paths.insert(db_path.path);
+    }
+    for (const ColumnFamilyDescriptor& cf : column_families) {
+      for (const DbPath& cf_path : cf.options.cf_paths) {
+        paths.insert(cf_path.path);
+      }
+    }
+
+    for (const auto& path : paths) {
+      if (soptions.fs
+              ->GetChildren(path, io_opts, &filenames,
+                            /*IODebugContext*=*/nullptr)
+              .ok()) {
+        for (const auto& fname : filenames) {
+          if (ParseFileName(fname, &number, &type) &&
+              (type == kTableFile ||
+               type == kBlobFile)) {  // Lock file will be deleted at end
+            std::string file_path = path + "/" + fname;
+            Status del = DeleteDBFile(&soptions, file_path, dbname,
+                                      /*force_bg=*/false, /*force_fg=*/false);
+            if (!del.ok() && result.ok()) {
+              result = del;
+            }
+          }
+        }
+        // TODO: Should we return an error if we cannot delete the directory?
+        env->DeleteDir(path).PermitUncheckedError();
+      }
+    }
+
+    std::vector<std::string> walDirFiles;
+    std::string archivedir = ArchivalDirectory(dbname);
+    bool wal_dir_exists = false;
+    if (!soptions.IsWalDirSameAsDBPath(dbname)) {
+      wal_dir_exists =
+          soptions.fs
+              ->GetChildren(soptions.wal_dir, io_opts, &walDirFiles,
+                            /*IODebugContext*=*/nullptr)
+              .ok();
+      archivedir = ArchivalDirectory(soptions.wal_dir);
+    }
+
+    // Archive dir may be inside wal dir or dbname and should be
+    // processed and removed before those otherwise we have issues
+    // removing them
+    std::vector<std::string> archiveFiles;
+    if (soptions.fs
+            ->GetChildren(archivedir, io_opts, &archiveFiles,
+                          /*IODebugContext*=*/nullptr)
+            .ok()) {
+      // Delete archival files.
+      for (const auto& file : archiveFiles) {
+        if (ParseFileName(file, &number, &type) && type == kWalFile) {
+          Status del =
+              DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
+                           /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+          if (!del.ok() && result.ok()) {
+            result = del;
+          }
+        }
+      }
+      // Ignore error in case dir contains other files
+      env->DeleteDir(archivedir).PermitUncheckedError();
+    }
+
+    // Delete log files in the WAL dir
+    if (wal_dir_exists) {
+      for (const auto& file : walDirFiles) {
+        if (ParseFileName(file, &number, &type) && type == kWalFile) {
+          Status del =
+              DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
+                           soptions.wal_dir, /*force_bg=*/false,
+                           /*force_fg=*/!wal_in_db_path);
+          if (!del.ok() && result.ok()) {
+            result = del;
+          }
+        }
+      }
+      // Ignore error in case dir contains other files
+      env->DeleteDir(soptions.wal_dir).PermitUncheckedError();
+    }
+
+    // Ignore error since state is already gone
+    env->UnlockFile(lock).PermitUncheckedError();
+    env->DeleteFile(lockname).PermitUncheckedError();
+
+    // sst_file_manager holds a ref to the logger. Make sure the logger is
+    // gone before trying to remove the directory.
+    soptions.sst_file_manager.reset();
+
+    // Ignore error in case dir contains other files
+    env->DeleteDir(dbname).PermitUncheckedError();
+    ;
+  }
+  return result;
+}
+
+Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
+                                bool need_enter_write_thread) {
+#ifndef ROCKSDB_LITE
+  WriteThread::Writer w;
+  if (need_mutex_lock) {
+    mutex_.Lock();
+  } else {
+    mutex_.AssertHeld();
+  }
+  if (need_enter_write_thread) {
+    write_thread_.EnterUnbatched(&w, &mutex_);
+  }
+
+  std::vector<std::string> cf_names;
+  std::vector<ColumnFamilyOptions> cf_opts;
+
+  // This part requires mutex to protect the column family options
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    cf_names.push_back(cfd->GetName());
+    cf_opts.push_back(cfd->GetLatestCFOptions());
+  }
+
+  // Unlock during expensive operations.  New writes cannot get here
+  // because the single write thread ensures all new writes get queued.
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  mutex_.Unlock();
+
+  TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
+  TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
+  TEST_SYNC_POINT_CALLBACK("DBImpl::WriteOptionsFile:PersistOptions",
+                           &db_options);
+
+  std::string file_name =
+      TempOptionsFileName(GetName(), versions_->NewFileNumber());
+  Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
+                                   fs_.get());
+
+  if (s.ok()) {
+    s = RenameTempFileToOptionsFile(file_name);
+  }
+  // restore lock
+  if (!need_mutex_lock) {
+    mutex_.Lock();
+  }
+  if (need_enter_write_thread) {
+    write_thread_.ExitUnbatched(&w);
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Unnable to persist options -- %s", s.ToString().c_str());
+    if (immutable_db_options_.fail_if_options_file_error) {
+      return Status::IOError("Unable to persist options.",
+                             s.ToString().c_str());
+    }
+  }
+#else
+  (void)need_mutex_lock;
+  (void)need_enter_write_thread;
+#endif  // !ROCKSDB_LITE
+  return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
+                              const size_t num_files_to_keep,
+                              const std::shared_ptr<Logger>& info_log,
+                              Env* env) {
+  if (filenames.size() <= num_files_to_keep) {
+    return;
+  }
+  for (auto iter = std::next(filenames.begin(), num_files_to_keep);
+       iter != filenames.end(); ++iter) {
+    if (!env->DeleteFile(iter->second).ok()) {
+      ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
+                     iter->second.c_str());
+    }
+  }
+}
+}  // namespace
+#endif  // !ROCKSDB_LITE
+
+Status DBImpl::DeleteObsoleteOptionsFiles() {
+#ifndef ROCKSDB_LITE
+  std::vector<std::string> filenames;
+  // use ordered map to store keep the filenames sorted from the newest
+  // to the oldest.
+  std::map<uint64_t, std::string> options_filenames;
+  Status s;
+  IOOptions io_opts;
+  io_opts.do_not_recurse = true;
+  s = fs_->GetChildren(GetName(), io_opts, &filenames,
+                       /*IODebugContext*=*/nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  for (auto& filename : filenames) {
+    uint64_t file_number;
+    FileType type;
+    if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
+      options_filenames.insert(
+          {std::numeric_limits<uint64_t>::max() - file_number,
+           GetName() + "/" + filename});
+    }
+  }
+
+  // Keeps the latest 2 Options file
+  const size_t kNumOptionsFilesKept = 2;
+  DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
+                           immutable_db_options_.info_log, GetEnv());
+  return Status::OK();
+#else
+  return Status::OK();
+#endif  // !ROCKSDB_LITE
+}
+
+Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
+#ifndef ROCKSDB_LITE
+  Status s;
+
+  uint64_t options_file_number = versions_->NewFileNumber();
+  std::string options_file_name =
+      OptionsFileName(GetName(), options_file_number);
+  uint64_t options_file_size = 0;
+  s = GetEnv()->GetFileSize(file_name, &options_file_size);
+  if (s.ok()) {
+    // Retry if the file name happen to conflict with an existing one.
+    s = GetEnv()->RenameFile(file_name, options_file_name);
+    std::unique_ptr<FSDirectory> dir_obj;
+    if (s.ok()) {
+      s = fs_->NewDirectory(GetName(), IOOptions(), &dir_obj, nullptr);
+    }
+    if (s.ok()) {
+      s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
+                                       DirFsyncOptions(options_file_name));
+    }
+    if (s.ok()) {
+      Status temp_s = dir_obj->Close(IOOptions(), nullptr);
+      // The default Close() could return "NotSupproted" and we bypass it
+      // if it is not impelmented. Detailed explanations can be found in
+      // db/db_impl/db_impl.h
+      if (!temp_s.ok()) {
+        if (temp_s.IsNotSupported()) {
+          temp_s.PermitUncheckedError();
+        } else {
+          s = temp_s;
+        }
+      }
+    }
+  }
+  if (s.ok()) {
+    InstrumentedMutexLock l(&mutex_);
+    versions_->options_file_number_ = options_file_number;
+    versions_->options_file_size_ = options_file_size;
+  }
+
+  if (0 == disable_delete_obsolete_files_) {
+    // TODO: Should we check for errors here?
+    DeleteObsoleteOptionsFiles().PermitUncheckedError();
+  }
+  return s;
+#else
+  (void)file_name;
+  return Status::OK();
+#endif  // !ROCKSDB_LITE
+}
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+  if (immutable_db_options_.enable_thread_tracking) {
+    ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
+                                          cfd->ioptions()->env);
+  }
+}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+  if (immutable_db_options_.enable_thread_tracking) {
+    ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
+  }
+}
+
+void DBImpl::EraseThreadStatusDbInfo() const {
+  if (immutable_db_options_.enable_thread_tracking) {
+    ThreadStatusUtil::EraseDatabaseInfo(this);
+  }
+}
+
+#else
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusDbInfo() const {}
+#endif  // ROCKSDB_USING_THREAD_STATUS
+
+//
+// A global method that can dump out the build version
+void DumpRocksDBBuildVersion(Logger* log) {
+  ROCKS_LOG_HEADER(log, "RocksDB version: %s\n",
+                   GetRocksVersionAsString().c_str());
+  const auto& props = GetRocksBuildProperties();
+  const auto& sha = props.find("rocksdb_build_git_sha");
+  if (sha != props.end()) {
+    ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str());
+  }
+  const auto date = props.find("rocksdb_build_date");
+  if (date != props.end()) {
+    ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str());
+  }
+}
+
+#ifndef ROCKSDB_LITE
+SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+                                                         bool include_history) {
+  // Find the earliest sequence number that we know we can rely on reading
+  // from the memtable without needing to check sst files.
+  SequenceNumber earliest_seq =
+      sv->imm->GetEarliestSequenceNumber(include_history);
+  if (earliest_seq == kMaxSequenceNumber) {
+    earliest_seq = sv->mem->GetEarliestSequenceNumber();
+  }
+  assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
+
+  return earliest_seq;
+}
+
+Status DBImpl::GetLatestSequenceForKey(
+    SuperVersion* sv, const Slice& key, bool cache_only,
+    SequenceNumber lower_bound_seq, SequenceNumber* seq, std::string* timestamp,
+    bool* found_record_for_key, bool* is_blob_index) {
+  Status s;
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0;
+
+  ReadOptions read_options;
+  SequenceNumber current_seq = versions_->LastSequence();
+
+  ColumnFamilyData* cfd = sv->cfd;
+  assert(cfd);
+  const Comparator* const ucmp = cfd->user_comparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  std::string ts_buf;
+  if (ts_sz > 0) {
+    assert(timestamp);
+    ts_buf.assign(ts_sz, '\xff');
+  } else {
+    assert(!timestamp);
+  }
+  Slice ts(ts_buf);
+
+  LookupKey lkey(key, current_seq, ts_sz == 0 ? nullptr : &ts);
+
+  *seq = kMaxSequenceNumber;
+  *found_record_for_key = false;
+
+  // Check if there is a record for this key in the latest memtable
+  sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s,
+               &merge_context, &max_covering_tombstone_seq, seq, read_options,
+               false /* immutable_memtable */, nullptr /*read_callback*/,
+               is_blob_index);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Unexpected status returned from MemTable::Get: %s\n",
+                    s.ToString().c_str());
+
+    return s;
+  }
+  assert(!ts_sz ||
+         (*seq != kMaxSequenceNumber &&
+          *timestamp != std::string(ts_sz, '\xff')) ||
+         (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+  TEST_SYNC_POINT_CALLBACK("DBImpl::GetLatestSequenceForKey:mem", timestamp);
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check immutable memtables
+    *found_record_for_key = true;
+    return Status::OK();
+  }
+
+  SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber();
+  if (lower_bound_in_mem != kMaxSequenceNumber &&
+      lower_bound_in_mem < lower_bound_seq) {
+    *found_record_for_key = false;
+    return Status::OK();
+  }
+
+  // Check if there is a record for this key in the immutable memtables
+  sv->imm->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s,
+               &merge_context, &max_covering_tombstone_seq, seq, read_options,
+               nullptr /*read_callback*/, is_blob_index);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Unexpected status returned from MemTableList::Get: %s\n",
+                    s.ToString().c_str());
+
+    return s;
+  }
+
+  assert(!ts_sz ||
+         (*seq != kMaxSequenceNumber &&
+          *timestamp != std::string(ts_sz, '\xff')) ||
+         (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check memtable history
+    *found_record_for_key = true;
+    return Status::OK();
+  }
+
+  SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+  if (lower_bound_in_imm != kMaxSequenceNumber &&
+      lower_bound_in_imm < lower_bound_seq) {
+    *found_record_for_key = false;
+    return Status::OK();
+  }
+
+  // Check if there is a record for this key in the immutable memtables
+  sv->imm->GetFromHistory(lkey, /*value=*/nullptr, /*columns=*/nullptr,
+                          timestamp, &s, &merge_context,
+                          &max_covering_tombstone_seq, seq, read_options,
+                          is_blob_index);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    ROCKS_LOG_ERROR(
+        immutable_db_options_.info_log,
+        "Unexpected status returned from MemTableList::GetFromHistory: %s\n",
+        s.ToString().c_str());
+
+    return s;
+  }
+
+  assert(!ts_sz ||
+         (*seq != kMaxSequenceNumber &&
+          *timestamp != std::string(ts_sz, '\xff')) ||
+         (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check SST files
+    assert(0 == ts_sz || *timestamp != std::string(ts_sz, '\xff'));
+    *found_record_for_key = true;
+    return Status::OK();
+  }
+
+  // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+  // check here to skip the history if possible. But currently the caller
+  // already does that. Maybe we should move the logic here later.
+
+  // TODO(agiardullo): possible optimization: consider checking cached
+  // SST files if cache_only=true?
+  if (!cache_only) {
+    // Check tables
+    PinnedIteratorsManager pinned_iters_mgr;
+    sv->current->Get(read_options, lkey, /*value=*/nullptr, /*columns=*/nullptr,
+                     timestamp, &s, &merge_context, &max_covering_tombstone_seq,
+                     &pinned_iters_mgr, nullptr /* value_found */,
+                     found_record_for_key, seq, nullptr /*read_callback*/,
+                     is_blob_index);
+
+    if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+      // unexpected error reading SST files
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Unexpected status returned from Version::Get: %s\n",
+                      s.ToString().c_str());
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::IngestExternalFile(
+    ColumnFamilyHandle* column_family,
+    const std::vector<std::string>& external_files,
+    const IngestExternalFileOptions& ingestion_options) {
+  IngestExternalFileArg arg;
+  arg.column_family = column_family;
+  arg.external_files = external_files;
+  arg.options = ingestion_options;
+  return IngestExternalFiles({arg});
+}
+
+Status DBImpl::IngestExternalFiles(
+    const std::vector<IngestExternalFileArg>& args) {
+  if (args.empty()) {
+    return Status::InvalidArgument("ingestion arg list is empty");
+  }
+  {
+    std::unordered_set<ColumnFamilyHandle*> unique_cfhs;
+    for (const auto& arg : args) {
+      if (arg.column_family == nullptr) {
+        return Status::InvalidArgument("column family handle is null");
+      } else if (unique_cfhs.count(arg.column_family) > 0) {
+        return Status::InvalidArgument(
+            "ingestion args have duplicate column families");
+      }
+      unique_cfhs.insert(arg.column_family);
+    }
+  }
+  // Ingest multiple external SST files atomically.
+  const size_t num_cfs = args.size();
+  for (size_t i = 0; i != num_cfs; ++i) {
+    if (args[i].external_files.empty()) {
+      char err_msg[128] = {0};
+      snprintf(err_msg, 128, "external_files[%zu] is empty", i);
+      return Status::InvalidArgument(err_msg);
+    }
+  }
+  for (const auto& arg : args) {
+    const IngestExternalFileOptions& ingest_opts = arg.options;
+    if (ingest_opts.ingest_behind &&
+        !immutable_db_options_.allow_ingest_behind) {
+      return Status::InvalidArgument(
+          "can't ingest_behind file in DB with allow_ingest_behind=false");
+    }
+  }
+
+  // TODO (yanqin) maybe handle the case in which column_families have
+  // duplicates
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+  size_t total = 0;
+  for (const auto& arg : args) {
+    total += arg.external_files.size();
+  }
+  uint64_t next_file_number = 0;
+  Status status = ReserveFileNumbersBeforeIngestion(
+      static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(), total,
+      pending_output_elem, &next_file_number);
+  if (!status.ok()) {
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+    return status;
+  }
+
+  std::vector<ExternalSstFileIngestionJob> ingestion_jobs;
+  for (const auto& arg : args) {
+    auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd();
+    ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_,
+                                file_options_, &snapshots_, arg.options,
+                                &directories_, &event_logger_, io_tracer_);
+  }
+
+  // TODO(yanqin) maybe make jobs run in parallel
+  uint64_t start_file_number = next_file_number;
+  for (size_t i = 1; i != num_cfs; ++i) {
+    start_file_number += args[i - 1].external_files.size();
+    auto* cfd =
+        static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    Status es = ingestion_jobs[i].Prepare(
+        args[i].external_files, args[i].files_checksums,
+        args[i].files_checksum_func_names, args[i].file_temperature,
+        start_file_number, super_version);
+    // capture first error only
+    if (!es.ok() && status.ok()) {
+      status = es;
+    }
+    CleanupSuperVersion(super_version);
+  }
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
+  {
+    auto* cfd =
+        static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    Status es = ingestion_jobs[0].Prepare(
+        args[0].external_files, args[0].files_checksums,
+        args[0].files_checksum_func_names, args[0].file_temperature,
+        next_file_number, super_version);
+    if (!es.ok()) {
+      status = es;
+    }
+    CleanupSuperVersion(super_version);
+  }
+  if (!status.ok()) {
+    for (size_t i = 0; i != num_cfs; ++i) {
+      ingestion_jobs[i].Cleanup(status);
+    }
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+    return status;
+  }
+
+  std::vector<SuperVersionContext> sv_ctxs;
+  for (size_t i = 0; i != num_cfs; ++i) {
+    sv_ctxs.emplace_back(true /* create_superversion */);
+  }
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0");
+  TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1");
+  TEST_SYNC_POINT("DBImpl::AddFile:Start");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
+
+    // Stop writes to the DB by entering both write threads
+    WriteThread::Writer w;
+    write_thread_.EnterUnbatched(&w, &mutex_);
+    WriteThread::Writer nonmem_w;
+    if (two_write_queues_) {
+      nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+    }
+
+    // When unordered_write is enabled, the keys are writing to memtable in an
+    // unordered way. If the ingestion job checks memtable key range before the
+    // key landing in memtable, the ingestion job may skip the necessary
+    // memtable flush.
+    // So wait here to ensure there is no pending write to memtable.
+    WaitForPendingWrites();
+
+    num_running_ingest_file_ += static_cast<int>(num_cfs);
+    TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
+
+    bool at_least_one_cf_need_flush = false;
+    std::vector<bool> need_flush(num_cfs, false);
+    for (size_t i = 0; i != num_cfs; ++i) {
+      auto* cfd =
+          static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+      if (cfd->IsDropped()) {
+        // TODO (yanqin) investigate whether we should abort ingestion or
+        // proceed with other non-dropped column families.
+        status = Status::InvalidArgument(
+            "cannot ingest an external file into a dropped CF");
+        break;
+      }
+      bool tmp = false;
+      status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion());
+      need_flush[i] = tmp;
+      at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp);
+      if (!status.ok()) {
+        break;
+      }
+    }
+    TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush",
+                             &at_least_one_cf_need_flush);
+
+    if (status.ok() && at_least_one_cf_need_flush) {
+      FlushOptions flush_opts;
+      flush_opts.allow_write_stall = true;
+      if (immutable_db_options_.atomic_flush) {
+        autovector<ColumnFamilyData*> cfds_to_flush;
+        SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
+        mutex_.Unlock();
+        status = AtomicFlushMemTables(cfds_to_flush, flush_opts,
+                                      FlushReason::kExternalFileIngestion,
+                                      true /* entered_write_thread */);
+        mutex_.Lock();
+      } else {
+        for (size_t i = 0; i != num_cfs; ++i) {
+          if (need_flush[i]) {
+            mutex_.Unlock();
+            auto* cfd =
+                static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
+                    ->cfd();
+            status = FlushMemTable(cfd, flush_opts,
+                                   FlushReason::kExternalFileIngestion,
+                                   true /* entered_write_thread */);
+            mutex_.Lock();
+            if (!status.ok()) {
+              break;
+            }
+          }
+        }
+      }
+    }
+    // Run ingestion jobs.
+    if (status.ok()) {
+      for (size_t i = 0; i != num_cfs; ++i) {
+        status = ingestion_jobs[i].Run();
+        if (!status.ok()) {
+          break;
+        }
+      }
+    }
+    if (status.ok()) {
+      autovector<ColumnFamilyData*> cfds_to_commit;
+      autovector<const MutableCFOptions*> mutable_cf_options_list;
+      autovector<autovector<VersionEdit*>> edit_lists;
+      uint32_t num_entries = 0;
+      for (size_t i = 0; i != num_cfs; ++i) {
+        auto* cfd =
+            static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+        if (cfd->IsDropped()) {
+          continue;
+        }
+        cfds_to_commit.push_back(cfd);
+        mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+        autovector<VersionEdit*> edit_list;
+        edit_list.push_back(ingestion_jobs[i].edit());
+        edit_lists.push_back(edit_list);
+        ++num_entries;
+      }
+      // Mark the version edits as an atomic group if the number of version
+      // edits exceeds 1.
+      if (cfds_to_commit.size() > 1) {
+        for (auto& edits : edit_lists) {
+          assert(edits.size() == 1);
+          edits[0]->MarkAtomicGroup(--num_entries);
+        }
+        assert(0 == num_entries);
+      }
+      status =
+          versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list,
+                                 edit_lists, &mutex_, directories_.GetDbDir());
+      // It is safe to update VersionSet last seqno here after LogAndApply since
+      // LogAndApply persists last sequence number from VersionEdits,
+      // which are from file's largest seqno and not from VersionSet.
+      //
+      // It is necessary to update last seqno here since LogAndApply releases
+      // mutex when persisting MANIFEST file, and the snapshots taken during
+      // that period will not be stable if VersionSet last seqno is updated
+      // before LogAndApply.
+      int consumed_seqno_count =
+          ingestion_jobs[0].ConsumedSequenceNumbersCount();
+      for (size_t i = 1; i != num_cfs; ++i) {
+        consumed_seqno_count =
+            std::max(consumed_seqno_count,
+                     ingestion_jobs[i].ConsumedSequenceNumbersCount());
+      }
+      if (consumed_seqno_count > 0) {
+        const SequenceNumber last_seqno = versions_->LastSequence();
+        versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
+        versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
+        versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+      }
+    }
+
+    if (status.ok()) {
+      for (size_t i = 0; i != num_cfs; ++i) {
+        auto* cfd =
+            static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+        if (!cfd->IsDropped()) {
+          InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
+                                             *cfd->GetLatestMutableCFOptions());
+#ifndef NDEBUG
+          if (0 == i && num_cfs > 1) {
+            TEST_SYNC_POINT(
+                "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
+            TEST_SYNC_POINT(
+                "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
+          }
+#endif  // !NDEBUG
+        }
+      }
+    } else if (versions_->io_status().IsIOError()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try write to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      const IOStatus& io_s = versions_->io_status();
+      // Should handle return error?
+      error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite);
+    }
+
+    // Resume writes to the DB
+    if (two_write_queues_) {
+      nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+    }
+    write_thread_.ExitUnbatched(&w);
+
+    if (status.ok()) {
+      for (auto& job : ingestion_jobs) {
+        job.UpdateStats();
+      }
+    }
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+    num_running_ingest_file_ -= static_cast<int>(num_cfs);
+    if (0 == num_running_ingest_file_) {
+      bg_cv_.SignalAll();
+    }
+    TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
+  }
+  // mutex_ is unlocked here
+
+  // Cleanup
+  for (size_t i = 0; i != num_cfs; ++i) {
+    sv_ctxs[i].Clean();
+    // This may rollback jobs that have completed successfully. This is
+    // intended for atomicity.
+    ingestion_jobs[i].Cleanup(status);
+  }
+  if (status.ok()) {
+    for (size_t i = 0; i != num_cfs; ++i) {
+      auto* cfd =
+          static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+      if (!cfd->IsDropped()) {
+        NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
+      }
+    }
+  }
+  return status;
+}
+
+Status DBImpl::CreateColumnFamilyWithImport(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    const ImportColumnFamilyOptions& import_options,
+    const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) {
+  assert(handle != nullptr);
+  assert(*handle == nullptr);
+  std::string cf_comparator_name = options.comparator->Name();
+  if (cf_comparator_name != metadata.db_comparator_name) {
+    return Status::InvalidArgument("Comparator name mismatch");
+  }
+
+  // Create column family.
+  auto status = CreateColumnFamily(options, column_family_name, handle);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Import sst files from metadata.
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(*handle);
+  auto cfd = cfh->cfd();
+  ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_,
+                                   file_options_, import_options,
+                                   metadata.files, io_tracer_);
+
+  SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+  VersionEdit dummy_edit;
+  uint64_t next_file_number = 0;
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+  {
+    // Lock db mutex
+    InstrumentedMutexLock l(&mutex_);
+    if (error_handler_.IsDBStopped()) {
+      // Don't import files when there is a bg_error
+      status = error_handler_.GetBGError();
+    }
+
+    // Make sure that bg cleanup wont delete the files that we are importing
+    pending_output_elem.reset(new std::list<uint64_t>::iterator(
+        CaptureCurrentFileNumberInPendingOutputs()));
+
+    if (status.ok()) {
+      // If crash happen after a hard link established, Recover function may
+      // reuse the file number that has already assigned to the internal file,
+      // and this will overwrite the external file. To protect the external
+      // file, we have to make sure the file number will never being reused.
+      next_file_number = versions_->FetchAddFileNumber(metadata.files.size());
+      auto cf_options = cfd->GetLatestMutableCFOptions();
+      status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+                                      directories_.GetDbDir());
+      if (status.ok()) {
+        InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+      }
+    }
+  }
+  dummy_sv_ctx.Clean();
+
+  if (status.ok()) {
+    SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+    status = import_job.Prepare(next_file_number, sv);
+    CleanupSuperVersion(sv);
+  }
+
+  if (status.ok()) {
+    SuperVersionContext sv_context(true /*create_superversion*/);
+    {
+      // Lock db mutex
+      InstrumentedMutexLock l(&mutex_);
+
+      // Stop writes to the DB by entering both write threads
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      WriteThread::Writer nonmem_w;
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+
+      num_running_ingest_file_++;
+      assert(!cfd->IsDropped());
+      status = import_job.Run();
+
+      // Install job edit [Mutex will be unlocked here]
+      if (status.ok()) {
+        auto cf_options = cfd->GetLatestMutableCFOptions();
+        status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
+                                        &mutex_, directories_.GetDbDir());
+        if (status.ok()) {
+          InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
+        }
+      }
+
+      // Resume writes to the DB
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+      write_thread_.ExitUnbatched(&w);
+
+      num_running_ingest_file_--;
+      if (num_running_ingest_file_ == 0) {
+        bg_cv_.SignalAll();
+      }
+    }
+    // mutex_ is unlocked here
+
+    sv_context.Clean();
+  }
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+  }
+
+  import_job.Cleanup(status);
+  if (!status.ok()) {
+    Status temp_s = DropColumnFamily(*handle);
+    if (!temp_s.ok()) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "DropColumnFamily failed with error %s",
+                      temp_s.ToString().c_str());
+    }
+    // Always returns Status::OK()
+    temp_s = DestroyColumnFamilyHandle(*handle);
+    assert(temp_s.ok());
+    *handle = nullptr;
+  }
+  return status;
+}
+
+Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) {
+  return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true);
+}
+
+Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
+  return VerifyChecksumInternal(read_options, /*use_file_checksum=*/false);
+}
+
+Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
+                                      bool use_file_checksum) {
+  // `bytes_read` stat is enabled based on compile-time support and cannot
+  // be dynamically toggled. So we do not need to worry about `PerfLevel`
+  // here, unlike many other `IOStatsContext` / `PerfContext` stats.
+  uint64_t prev_bytes_read = IOSTATS(bytes_read);
+
+  Status s;
+
+  if (use_file_checksum) {
+    FileChecksumGenFactory* const file_checksum_gen_factory =
+        immutable_db_options_.file_checksum_gen_factory.get();
+    if (!file_checksum_gen_factory) {
+      s = Status::InvalidArgument(
+          "Cannot verify file checksum if options.file_checksum_gen_factory is "
+          "null");
+      return s;
+    }
+  }
+
+  // TODO: simplify using GetRefedColumnFamilySet?
+  std::vector<ColumnFamilyData*> cfd_list;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (!cfd->IsDropped() && cfd->initialized()) {
+        cfd->Ref();
+        cfd_list.push_back(cfd);
+      }
+    }
+  }
+  std::vector<SuperVersion*> sv_list;
+  for (auto cfd : cfd_list) {
+    sv_list.push_back(cfd->GetReferencedSuperVersion(this));
+  }
+
+  for (auto& sv : sv_list) {
+    VersionStorageInfo* vstorage = sv->current->storage_info();
+    ColumnFamilyData* cfd = sv->current->cfd();
+    Options opts;
+    if (!use_file_checksum) {
+      InstrumentedMutexLock l(&mutex_);
+      opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+                     cfd->GetLatestCFOptions());
+    }
+    for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
+      for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
+           j++) {
+        const auto& fd_with_krange = vstorage->LevelFilesBrief(i).files[j];
+        const auto& fd = fd_with_krange.fd;
+        const FileMetaData* fmeta = fd_with_krange.file_metadata;
+        assert(fmeta);
+        std::string fname = TableFileName(cfd->ioptions()->cf_paths,
+                                          fd.GetNumber(), fd.GetPathId());
+        if (use_file_checksum) {
+          s = VerifyFullFileChecksum(fmeta->file_checksum,
+                                     fmeta->file_checksum_func_name, fname,
+                                     read_options);
+        } else {
+          s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(
+              opts, file_options_, read_options, fname, fd.largest_seqno);
+        }
+        RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+                   IOSTATS(bytes_read) - prev_bytes_read);
+        prev_bytes_read = IOSTATS(bytes_read);
+      }
+    }
+
+    if (s.ok() && use_file_checksum) {
+      const auto& blob_files = vstorage->GetBlobFiles();
+      for (const auto& meta : blob_files) {
+        assert(meta);
+
+        const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+        const std::string blob_file_name = BlobFileName(
+            cfd->ioptions()->cf_paths.front().path, blob_file_number);
+        s = VerifyFullFileChecksum(meta->GetChecksumValue(),
+                                   meta->GetChecksumMethod(), blob_file_name,
+                                   read_options);
+        RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+                   IOSTATS(bytes_read) - prev_bytes_read);
+        prev_bytes_read = IOSTATS(bytes_read);
+        if (!s.ok()) {
+          break;
+        }
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto sv : sv_list) {
+      if (sv && sv->Unref()) {
+        sv->Cleanup();
+        if (defer_purge) {
+          AddSuperVersionsToFreeQueue(sv);
+        } else {
+          delete sv;
+        }
+      }
+    }
+    if (defer_purge) {
+      SchedulePurge();
+    }
+    for (auto cfd : cfd_list) {
+      cfd->UnrefAndTryDelete();
+    }
+  }
+  RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+             IOSTATS(bytes_read) - prev_bytes_read);
+  return s;
+}
+
+Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected,
+                                      const std::string& func_name_expected,
+                                      const std::string& fname,
+                                      const ReadOptions& read_options) {
+  Status s;
+  if (file_checksum_expected == kUnknownFileChecksum) {
+    return s;
+  }
+  std::string file_checksum;
+  std::string func_name;
+  s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum(
+      fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(),
+      func_name_expected, &file_checksum, &func_name,
+      read_options.readahead_size, immutable_db_options_.allow_mmap_reads,
+      io_tracer_, immutable_db_options_.rate_limiter.get(),
+      read_options.rate_limiter_priority);
+  if (s.ok()) {
+    assert(func_name_expected == func_name);
+    if (file_checksum != file_checksum_expected) {
+      std::ostringstream oss;
+      oss << fname << " file checksum mismatch, ";
+      oss << "expecting "
+          << Slice(file_checksum_expected).ToString(/*hex=*/true);
+      oss << ", but actual " << Slice(file_checksum).ToString(/*hex=*/true);
+      s = Status::Corruption(oss.str());
+      TEST_SYNC_POINT_CALLBACK("DBImpl::VerifyFullFileChecksum:mismatch", &s);
+    }
+  }
+  return s;
+}
+
+void DBImpl::NotifyOnExternalFileIngested(
+    ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
+  if (immutable_db_options_.listeners.empty()) {
+    return;
+  }
+
+  for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
+    ExternalFileIngestionInfo info;
+    info.cf_name = cfd->GetName();
+    info.external_file_path = f.external_file_path;
+    info.internal_file_path = f.internal_file_path;
+    info.global_seqno = f.assigned_seqno;
+    info.table_properties = f.table_properties;
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnExternalFileIngested(this, info);
+    }
+  }
+}
+
+void DBImpl::WaitForIngestFile() {
+  mutex_.AssertHeld();
+  while (num_running_ingest_file_ > 0) {
+    bg_cv_.Wait();
+  }
+}
+
+Status DBImpl::StartTrace(const TraceOptions& trace_options,
+                          std::unique_ptr<TraceWriter>&& trace_writer) {
+  InstrumentedMutexLock lock(&trace_mutex_);
+  tracer_.reset(new Tracer(immutable_db_options_.clock, trace_options,
+                           std::move(trace_writer)));
+  return Status::OK();
+}
+
+Status DBImpl::EndTrace() {
+  InstrumentedMutexLock lock(&trace_mutex_);
+  Status s;
+  if (tracer_ != nullptr) {
+    s = tracer_->Close();
+    tracer_.reset();
+  } else {
+    s = Status::IOError("No trace file to close");
+  }
+  return s;
+}
+
+Status DBImpl::NewDefaultReplayer(
+    const std::vector<ColumnFamilyHandle*>& handles,
+    std::unique_ptr<TraceReader>&& reader,
+    std::unique_ptr<Replayer>* replayer) {
+  replayer->reset(new ReplayerImpl(this, handles, std::move(reader)));
+  return Status::OK();
+}
+
+Status DBImpl::StartBlockCacheTrace(
+    const TraceOptions& trace_options,
+    std::unique_ptr<TraceWriter>&& trace_writer) {
+  BlockCacheTraceOptions block_trace_opts;
+  block_trace_opts.sampling_frequency = trace_options.sampling_frequency;
+
+  BlockCacheTraceWriterOptions trace_writer_opt;
+  trace_writer_opt.max_trace_file_size = trace_options.max_trace_file_size;
+
+  std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+      NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+                               std::move(trace_writer));
+
+  return block_cache_tracer_.StartTrace(block_trace_opts,
+                                        std::move(block_cache_trace_writer));
+}
+
+Status DBImpl::StartBlockCacheTrace(
+    const BlockCacheTraceOptions& trace_options,
+    std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) {
+  return block_cache_tracer_.StartTrace(trace_options, std::move(trace_writer));
+}
+
+Status DBImpl::EndBlockCacheTrace() {
+  block_cache_tracer_.EndTrace();
+  return Status::OK();
+}
+
+Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
+                                 const Slice& lower_bound,
+                                 const Slice upper_bound) {
+  Status s;
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      s = tracer_->IteratorSeek(cf_id, key, lower_bound, upper_bound);
+    }
+  }
+  return s;
+}
+
+Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
+                                        const Slice& lower_bound,
+                                        const Slice upper_bound) {
+  Status s;
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      s = tracer_->IteratorSeekForPrev(cf_id, key, lower_bound, upper_bound);
+    }
+  }
+  return s;
+}
+
+Status DBImpl::ReserveFileNumbersBeforeIngestion(
+    ColumnFamilyData* cfd, uint64_t num,
+    std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+    uint64_t* next_file_number) {
+  Status s;
+  SuperVersionContext dummy_sv_ctx(true /* create_superversion */);
+  assert(nullptr != next_file_number);
+  InstrumentedMutexLock l(&mutex_);
+  if (error_handler_.IsDBStopped()) {
+    // Do not ingest files when there is a bg_error
+    return error_handler_.GetBGError();
+  }
+  pending_output_elem.reset(new std::list<uint64_t>::iterator(
+      CaptureCurrentFileNumberInPendingOutputs()));
+  *next_file_number = versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
+  auto cf_options = cfd->GetLatestMutableCFOptions();
+  VersionEdit dummy_edit;
+  // If crash happen after a hard link established, Recover function may
+  // reuse the file number that has already assigned to the internal file,
+  // and this will overwrite the external file. To protect the external
+  // file, we have to make sure the file number will never being reused.
+  s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+                             directories_.GetDbDir());
+  if (s.ok()) {
+    InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+  }
+  dummy_sv_ctx.Clean();
+  return s;
+}
+
+Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+  if (mutable_db_options_.max_open_files == -1) {
+    uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (!cfd->IsDropped()) {
+        uint64_t ctime;
+        {
+          SuperVersion* sv = GetAndRefSuperVersion(cfd);
+          Version* version = sv->current;
+          version->GetCreationTimeOfOldestFile(&ctime);
+          ReturnAndCleanupSuperVersion(cfd, sv);
+        }
+
+        if (ctime < oldest_time) {
+          oldest_time = ctime;
+        }
+        if (oldest_time == 0) {
+          break;
+        }
+      }
+    }
+    *creation_time = oldest_time;
+    return Status::OK();
+  } else {
+    return Status::NotSupported("This API only works if max_open_files = -1");
+  }
+}
+
+void DBImpl::RecordSeqnoToTimeMapping() {
+  // Get time first then sequence number, so the actual time of seqno is <=
+  // unix_time recorded
+  int64_t unix_time = 0;
+  immutable_db_options_.clock->GetCurrentTime(&unix_time)
+      .PermitUncheckedError();  // Ignore error
+  SequenceNumber seqno = GetLatestSequenceNumber();
+  bool appended = false;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    appended = seqno_time_mapping_.Append(seqno, unix_time);
+  }
+  if (!appended) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Failed to insert sequence number to time entry: %" PRIu64
+                   " -> %" PRIu64,
+                   seqno, unix_time);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl.h b/src/rocksdb/db/db_impl/db_impl.h
new file mode 100644
index 000000000..725e77c18
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.h
@@ -0,0 +1,2804 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_job.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
+#include "db/internal_stats.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/snapshot_checker.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#ifndef ROCKSDB_LITE
+#include "rocksdb/trace_reader_writer.h"
+#endif  // ROCKSDB_LITE
+#include "rocksdb/transaction_log.h"
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/replayer.h"
+#endif  // ROCKSDB_LITE
+#include "rocksdb/write_buffer_manager.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/repeatable_thread.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InMemoryStatsHistoryIterator;
+class MemTable;
+class PersistentStatsHistoryIterator;
+class TableCache;
+class TaskLimiterToken;
+class Version;
+class VersionEdit;
+class VersionSet;
+class WriteCallback;
+struct JobContext;
+struct ExternalSstFileInfo;
+struct MemTableInfo;
+
+// Class to maintain directories for all database paths other than main one.
+class Directories {
+ public:
+  IOStatus SetDirectories(FileSystem* fs, const std::string& dbname,
+                          const std::string& wal_dir,
+                          const std::vector<DbPath>& data_paths);
+
+  FSDirectory* GetDataDir(size_t path_id) const {
+    assert(path_id < data_dirs_.size());
+    FSDirectory* ret_dir = data_dirs_[path_id].get();
+    if (ret_dir == nullptr) {
+      // Should use db_dir_
+      return db_dir_.get();
+    }
+    return ret_dir;
+  }
+
+  FSDirectory* GetWalDir() {
+    if (wal_dir_) {
+      return wal_dir_.get();
+    }
+    return db_dir_.get();
+  }
+
+  FSDirectory* GetDbDir() { return db_dir_.get(); }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) {
+    // close all directories for all database paths
+    IOStatus s = IOStatus::OK();
+
+    // The default implementation for Close() in Directory/FSDirectory class
+    // "NotSupported" status, the upper level interface should be able to
+    // handle this error so that Close() does not fail after upgrading when
+    // run on FileSystems that have not implemented `Directory::Close()` or
+    // `FSDirectory::Close()` yet
+
+    if (db_dir_) {
+      IOStatus temp_s = db_dir_->Close(options, dbg);
+      if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+        s = std::move(temp_s);
+      }
+    }
+
+    // Attempt to close everything even if one fails
+    s.PermitUncheckedError();
+
+    if (wal_dir_) {
+      IOStatus temp_s = wal_dir_->Close(options, dbg);
+      if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+        s = std::move(temp_s);
+      }
+    }
+
+    s.PermitUncheckedError();
+
+    for (auto& data_dir_ptr : data_dirs_) {
+      if (data_dir_ptr) {
+        IOStatus temp_s = data_dir_ptr->Close(options, dbg);
+        if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+          s = std::move(temp_s);
+        }
+      }
+    }
+
+    // Ready for caller
+    s.MustCheck();
+    return s;
+  }
+
+ private:
+  std::unique_ptr<FSDirectory> db_dir_;
+  std::vector<std::unique_ptr<FSDirectory>> data_dirs_;
+  std::unique_ptr<FSDirectory> wal_dir_;
+};
+
+// While DB is the public interface of RocksDB, and DBImpl is the actual
+// class implementing it. It's the entrance of the core RocksdB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
+// DBImpl internally.
+// Other than functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it's a very large class, the definition of the functions is
+// divided in several db_impl_*.cc files, besides db_impl.cc.
+class DBImpl : public DB {
+ public:
+  DBImpl(const DBOptions& options, const std::string& dbname,
+         const bool seq_per_batch = false, const bool batch_per_txn = true,
+         bool read_only = false);
+  // No copying allowed
+  DBImpl(const DBImpl&) = delete;
+  void operator=(const DBImpl&) = delete;
+
+  virtual ~DBImpl();
+
+  // ---- Implementations of the DB interface ----
+
+  using DB::Resume;
+  Status Resume() override;
+
+  using DB::Put;
+  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, const Slice& value) override;
+  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, const Slice& ts, const Slice& value) override;
+
+  using DB::PutEntity;
+  Status PutEntity(const WriteOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   const WideColumns& columns) override;
+
+  using DB::Merge;
+  Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& value) override;
+  Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& ts, const Slice& value) override;
+
+  using DB::Delete;
+  Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
+                const Slice& key) override;
+  Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
+                const Slice& key, const Slice& ts) override;
+
+  using DB::SingleDelete;
+  Status SingleDelete(const WriteOptions& options,
+                      ColumnFamilyHandle* column_family,
+                      const Slice& key) override;
+  Status SingleDelete(const WriteOptions& options,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& ts) override;
+
+  using DB::DeleteRange;
+  Status DeleteRange(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& begin_key,
+                     const Slice& end_key) override;
+  Status DeleteRange(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& begin_key,
+                     const Slice& end_key, const Slice& ts) override;
+
+  using DB::Write;
+  virtual Status Write(const WriteOptions& options,
+                       WriteBatch* updates) override;
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value, std::string* timestamp) override;
+
+  using DB::GetEntity;
+  Status GetEntity(const ReadOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableWideColumns* columns) override;
+
+  using DB::GetMergeOperands;
+  Status GetMergeOperands(const ReadOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          PinnableSlice* merge_operands,
+                          GetMergeOperandsOptions* get_merge_operands_options,
+                          int* number_of_operands) override {
+    GetImplOptions get_impl_options;
+    get_impl_options.column_family = column_family;
+    get_impl_options.merge_operands = merge_operands;
+    get_impl_options.get_merge_operands_options = get_merge_operands_options;
+    get_impl_options.number_of_operands = number_of_operands;
+    get_impl_options.get_value = false;
+    return GetImpl(options, key, get_impl_options);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values,
+      std::vector<std::string>* timestamps) override;
+
+  // This MultiGet is a batched version, which may be faster than calling Get
+  // multiple times, especially if the keys have some spatial locality that
+  // enables them to be queried in the same SST files/set of files. The larger
+  // the batch size, the more scope for batching and performance improvement
+  // The values and statuses parameters are arrays with number of elements
+  // equal to keys.size(). This allows the storage for those to be alloacted
+  // by the caller on the stack for small batches
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool sorted_input = false) override;
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, std::string* timestamps,
+                        Status* statuses,
+                        const bool sorted_input = false) override;
+
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool sorted_input = false) override;
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families, const Slice* keys,
+                        PinnableSlice* values, std::string* timestamps,
+                        Status* statuses,
+                        const bool sorted_input = false) override;
+
+  virtual void MultiGetWithCallback(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      ReadCallback* callback,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                    const std::string& column_family,
+                                    ColumnFamilyHandle** handle) override;
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& cf_options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+  // Returns false if key doesn't exist in the database and true if it may.
+  // If value_found is not passed in as null, then return the value if found in
+  // memory. On return, if value was found, then value_found will be set to true
+  // , otherwise false.
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, std::string* timestamp,
+                           bool* value_found = nullptr) override;
+
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override;
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) override;
+
+  virtual const Snapshot* GetSnapshot() override;
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
+  // Create a timestamped snapshot. This snapshot can be shared by multiple
+  // readers. If any of them uses it for write conflict checking, then
+  // is_write_conflict_boundary is true. For simplicity, set it to true by
+  // default.
+  std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot(
+      SequenceNumber snapshot_seq, uint64_t ts);
+  std::shared_ptr<const SnapshotImpl> GetTimestampedSnapshot(uint64_t ts) const;
+  void ReleaseTimestampedSnapshotsOlderThan(
+      uint64_t ts, size_t* remaining_total_ss = nullptr);
+  Status GetTimestampedSnapshots(uint64_t ts_lb, uint64_t ts_ub,
+                                 std::vector<std::shared_ptr<const Snapshot>>&
+                                     timestamped_snapshots) const;
+
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) override;
+  using DB::GetMapProperty;
+  virtual bool GetMapProperty(
+      ColumnFamilyHandle* column_family, const Slice& property,
+      std::map<std::string, std::string>* value) override;
+  using DB::GetIntProperty;
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) override;
+  using DB::GetAggregatedIntProperty;
+  virtual bool GetAggregatedIntProperty(const Slice& property,
+                                        uint64_t* aggregated_value) override;
+  using DB::GetApproximateSizes;
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* range, int n,
+                                     uint64_t* sizes) override;
+  using DB::GetApproximateMemTableStats;
+  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+                                           const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) override;
+  using DB::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) override;
+
+  using DB::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) override;
+
+  virtual Status PauseBackgroundWork() override;
+  virtual Status ContinueBackgroundWork() override;
+
+  virtual Status EnableAutoCompaction(
+      const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
+
+  virtual void EnableManualCompaction() override;
+  virtual void DisableManualCompaction() override;
+
+  using DB::SetOptions;
+  Status SetOptions(
+      ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& options_map) override;
+
+  virtual Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& options_map) override;
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(
+      ColumnFamilyHandle* column_family) override;
+  virtual const std::string& GetName() const override;
+  virtual Env* GetEnv() const override;
+  virtual FileSystem* GetFileSystem() const override;
+  using DB::GetOptions;
+  virtual Options GetOptions(ColumnFamilyHandle* column_family) const override;
+  using DB::GetDBOptions;
+  virtual DBOptions GetDBOptions() const override;
+  using DB::Flush;
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) override;
+  virtual Status Flush(
+      const FlushOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families) override;
+  virtual Status FlushWAL(bool sync) override;
+  bool WALBufferIsEmpty(bool lock = true);
+  virtual Status SyncWAL() override;
+  virtual Status LockWAL() override;
+  virtual Status UnlockWAL() override;
+
+  virtual SequenceNumber GetLatestSequenceNumber() const override;
+
+  // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire
+  // and release db_mutex
+  Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                  std::string ts_low) override;
+
+  // GetFullHistoryTsLow(ColumnFamilyHandle*, std::string*) will acquire and
+  // release db_mutex
+  Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                             std::string* ts_low) override;
+
+  virtual Status GetDbIdentity(std::string& identity) const override;
+
+  virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
+
+  virtual Status GetDbSessionId(std::string& session_id) const override;
+
+  ColumnFamilyHandle* DefaultColumnFamily() const override;
+
+  ColumnFamilyHandle* PersistentStatsColumnFamily() const;
+
+  virtual Status Close() override;
+
+  virtual Status DisableFileDeletions() override;
+
+  virtual Status EnableFileDeletions(bool force) override;
+
+  virtual bool IsFileDeletionsEnabled() const;
+
+  Status GetStatsHistory(
+      uint64_t start_time, uint64_t end_time,
+      std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
+
+#ifndef ROCKSDB_LITE
+  using DB::ResetStats;
+  virtual Status ResetStats() override;
+  // All the returned filenames start with "/"
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) override;
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
+  virtual Status GetCurrentWalFile(
+      std::unique_ptr<LogFile>* current_log_file) override;
+  virtual Status GetCreationTimeOfOldestFile(
+      uint64_t* creation_time) override;
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options =
+          TransactionLogIterator::ReadOptions()) override;
+  virtual Status DeleteFile(std::string name) override;
+  Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+                             const RangePtr* ranges, size_t n,
+                             bool include_end = true);
+
+  virtual void GetLiveFilesMetaData(
+      std::vector<LiveFileMetaData>* metadata) override;
+
+  virtual Status GetLiveFilesChecksumInfo(
+      FileChecksumList* checksum_list) override;
+
+  virtual Status GetLiveFilesStorageInfo(
+      const LiveFilesStorageInfoOptions& opts,
+      std::vector<LiveFileStorageInfo>* files) override;
+
+  // Obtains the meta data of the specified column family of the DB.
+  // TODO(yhchiang): output parameter is placed in the end in this codebase.
+  virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+                                       ColumnFamilyMetaData* metadata) override;
+
+  void GetAllColumnFamilyMetaData(
+      std::vector<ColumnFamilyMetaData>* metadata) override;
+
+  Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+                             const Slice* begin, const Slice* end) override;
+
+  Status PromoteL0(ColumnFamilyHandle* column_family,
+                   int target_level) override;
+
+  using DB::IngestExternalFile;
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& ingestion_options) override;
+
+  using DB::IngestExternalFiles;
+  virtual Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& args) override;
+
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) override;
+
+  using DB::VerifyFileChecksums;
+  Status VerifyFileChecksums(const ReadOptions& read_options) override;
+
+  using DB::VerifyChecksum;
+  virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
+  // Verify the checksums of files in db. Currently only tables are checked.
+  //
+  // read_options: controls file I/O behavior, e.g. read ahead size while
+  //               reading all the live table files.
+  //
+  // use_file_checksum: if false, verify the block checksums of all live table
+  //                    in db. Otherwise, obtain the file checksums and compare
+  //                    with the MANIFEST. Currently, file checksums are
+  //                    recomputed by reading all table files.
+  //
+  // Returns: OK if there is no file whose file or block checksum mismatches.
+  Status VerifyChecksumInternal(const ReadOptions& read_options,
+                                bool use_file_checksum);
+
+  Status VerifyFullFileChecksum(const std::string& file_checksum_expected,
+                                const std::string& func_name_expected,
+                                const std::string& fpath,
+                                const ReadOptions& read_options);
+
+  using DB::StartTrace;
+  virtual Status StartTrace(
+      const TraceOptions& options,
+      std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+  using DB::EndTrace;
+  virtual Status EndTrace() override;
+
+  using DB::NewDefaultReplayer;
+  virtual Status NewDefaultReplayer(
+      const std::vector<ColumnFamilyHandle*>& handles,
+      std::unique_ptr<TraceReader>&& reader,
+      std::unique_ptr<Replayer>* replayer) override;
+
+  using DB::StartBlockCacheTrace;
+  Status StartBlockCacheTrace(
+      const TraceOptions& trace_options,
+      std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+  Status StartBlockCacheTrace(
+      const BlockCacheTraceOptions& options,
+      std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override;
+
+  using DB::EndBlockCacheTrace;
+  Status EndBlockCacheTrace() override;
+
+  using DB::StartIOTrace;
+  Status StartIOTrace(const TraceOptions& options,
+                      std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+  using DB::EndIOTrace;
+  Status EndIOTrace() override;
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* column_family,
+      TablePropertiesCollection* props) override;
+  virtual Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+      TablePropertiesCollection* props) override;
+
+#endif  // ROCKSDB_LITE
+
+  // ---- End of implementations of the DB interface ----
+  SystemClock* GetSystemClock() const;
+
+  struct GetImplOptions {
+    ColumnFamilyHandle* column_family = nullptr;
+    PinnableSlice* value = nullptr;
+    PinnableWideColumns* columns = nullptr;
+    std::string* timestamp = nullptr;
+    bool* value_found = nullptr;
+    ReadCallback* callback = nullptr;
+    bool* is_blob_index = nullptr;
+    // If true return value associated with key via value pointer else return
+    // all merge operands for key via merge_operands pointer
+    bool get_value = true;
+    // Pointer to an array of size
+    // get_merge_operands_options.expected_max_number_of_operands allocated by
+    // user
+    PinnableSlice* merge_operands = nullptr;
+    GetMergeOperandsOptions* get_merge_operands_options = nullptr;
+    int* number_of_operands = nullptr;
+  };
+
+  // Function that Get and KeyMayExist call with no_io true or false
+  // Note: 'value_found' from KeyMayExist propagates here
+  // This function is also called by GetMergeOperands
+  // If get_impl_options.get_value = true get value associated with
+  // get_impl_options.key via get_impl_options.value
+  // If get_impl_options.get_value = false get merge operands associated with
+  // get_impl_options.key via get_impl_options.merge_operands
+  Status GetImpl(const ReadOptions& options, const Slice& key,
+                 GetImplOptions& get_impl_options);
+
+  // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file.
+  ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
+                                      ColumnFamilyData* cfd,
+                                      SequenceNumber snapshot,
+                                      ReadCallback* read_callback,
+                                      bool expose_blob_index = false,
+                                      bool allow_refresh = true);
+
+  virtual SequenceNumber GetLastPublishedSequence() const {
+    if (last_seq_same_as_publish_seq_) {
+      return versions_->LastSequence();
+    } else {
+      return versions_->LastPublishedSequence();
+    }
+  }
+
+  // REQUIRES: joined the main write queue if two_write_queues is disabled, and
+  // the second write queue otherwise.
+  virtual void SetLastPublishedSequence(SequenceNumber seq);
+  // Returns LastSequence in last_seq_same_as_publish_seq_
+  // mode and LastAllocatedSequence otherwise. This is useful when visiblility
+  // depends also on data written to the WAL but not to the memtable.
+  SequenceNumber TEST_GetLastVisibleSequence() const;
+
+#ifndef ROCKSDB_LITE
+  // Similar to Write() but will call the callback once on the single write
+  // thread to determine whether it is safe to perform the write.
+  virtual Status WriteWithCallback(const WriteOptions& write_options,
+                                   WriteBatch* my_batch,
+                                   WriteCallback* callback);
+
+  // Returns the sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into the current
+  // memtables. It can then be assumed that any write with a larger(or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  //
+  // If the earliest sequence number could not be determined,
+  // kMaxSequenceNumber will be returned.
+  //
+  // If include_history=true, will also search Memtables in MemTableList
+  // History.
+  SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+                                                   bool include_history);
+
+  // For a given key, check to see if there are any records for this key
+  // in the memtables, including memtable history.  If cache_only is false,
+  // SST files will also be checked.
+  //
+  // `key` should NOT have user-defined timestamp appended to user key even if
+  // timestamp is enabled.
+  //
+  // If a key is found, *found_record_for_key will be set to true and
+  // *seq will be set to the stored sequence number for the latest
+  // operation on this key or kMaxSequenceNumber if unknown. If user-defined
+  // timestamp is enabled for this column family and timestamp is not nullptr,
+  // then *timestamp will be set to the stored timestamp for the latest
+  // operation on this key.
+  // If no key is found, *found_record_for_key will be set to false.
+  //
+  // Note: If cache_only=false, it is possible for *seq to be set to 0 if
+  // the sequence number has been cleared from the record.  If the caller is
+  // holding an active db snapshot, we know the missing sequence must be less
+  // than the snapshot's sequence number (sequence numbers are only cleared
+  // when there are no earlier active snapshots).
+  //
+  // If NotFound is returned and found_record_for_key is set to false, then no
+  // record for this key was found.  If the caller is holding an active db
+  // snapshot, we know that no key could have existing after this snapshot
+  // (since we do not compact keys that have an earlier snapshot).
+  //
+  // Only records newer than or at `lower_bound_seq` are guaranteed to be
+  // returned. Memtables and files may not be checked if it only contains data
+  // older than `lower_bound_seq`.
+  //
+  // Returns OK or NotFound on success,
+  // other status on unexpected error.
+  // TODO(andrewkr): this API need to be aware of range deletion operations
+  Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+                                 bool cache_only,
+                                 SequenceNumber lower_bound_seq,
+                                 SequenceNumber* seq, std::string* timestamp,
+                                 bool* found_record_for_key,
+                                 bool* is_blob_index);
+
+  Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
+                           const Slice& lower_bound, const Slice upper_bound);
+  Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
+                                  const Slice& lower_bound,
+                                  const Slice upper_bound);
+#endif  // ROCKSDB_LITE
+
+  // Similar to GetSnapshot(), but also lets the db know that this snapshot
+  // will be used for transaction write-conflict checking.  The DB can then
+  // make sure not to compact any keys that would prevent a write-conflict from
+  // being detected.
+  const Snapshot* GetSnapshotForWriteConflictBoundary();
+
+  // checks if all live files exist on file system and that their file sizes
+  // match to our in-memory records
+  virtual Status CheckConsistency();
+
+  // max_file_num_to_ignore allows bottom level compaction to filter out newly
+  // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
+  // disable the filtering
+  Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+                             int output_level,
+                             const CompactRangeOptions& compact_range_options,
+                             const Slice* begin, const Slice* end,
+                             bool exclusive, bool disallow_trivial_move,
+                             uint64_t max_file_num_to_ignore,
+                             const std::string& trim_ts);
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  // If allow_unprepared_value is true, the returned iterator may defer reading
+  // the value and so will require PrepareValue() to be called before value();
+  // allow_unprepared_value = false is convenient when this optimization is not
+  // useful, e.g. when reading the whole column family.
+  //
+  // read_options.ignore_range_deletions determines whether range tombstones are
+  // processed in the returned interator internally, i.e., whether range
+  // tombstone covered keys are in this iterator's output.
+  // @param read_options Must outlive the returned iterator.
+  InternalIterator* NewInternalIterator(
+      const ReadOptions& read_options, Arena* arena, SequenceNumber sequence,
+      ColumnFamilyHandle* column_family = nullptr,
+      bool allow_unprepared_value = false);
+
+  // Note: to support DB iterator refresh, memtable range tombstones in the
+  // underlying merging iterator needs to be refreshed. If db_iter is not
+  // nullptr, db_iter->SetMemtableRangetombstoneIter() is called with the
+  // memtable range tombstone iterator used by the underlying merging iterator.
+  // This range tombstone iterator can be refreshed later by db_iter.
+  // @param read_options Must outlive the returned iterator.
+  InternalIterator* NewInternalIterator(const ReadOptions& read_options,
+                                        ColumnFamilyData* cfd,
+                                        SuperVersion* super_version,
+                                        Arena* arena, SequenceNumber sequence,
+                                        bool allow_unprepared_value,
+                                        ArenaWrappedDBIter* db_iter = nullptr);
+
+  LogsWithPrepTracker* logs_with_prep_tracker() {
+    return &logs_with_prep_tracker_;
+  }
+
+  struct BGJobLimits {
+    int max_flushes;
+    int max_compactions;
+  };
+  // Returns maximum background flushes and compactions allowed to be scheduled
+  BGJobLimits GetBGJobLimits() const;
+  // Need a static version that can be called during SanitizeOptions().
+  static BGJobLimits GetBGJobLimits(int max_background_flushes,
+                                    int max_background_compactions,
+                                    int max_background_jobs,
+                                    bool parallelize_compactions);
+
+  // move logs pending closing from job_context to the DB queue and
+  // schedule a purge
+  void ScheduleBgLogWriterClose(JobContext* job_context);
+
+  uint64_t MinLogNumberToKeep();
+
+  // Returns the lower bound file number for SSTs that won't be deleted, even if
+  // they're obsolete. This lower bound is used internally to prevent newly
+  // created flush/compaction output files from being deleted before they're
+  // installed. This technique avoids the need for tracking the exact numbers of
+  // files pending creation, although it prevents more files than necessary from
+  // being deleted.
+  uint64_t MinObsoleteSstNumberToKeep();
+
+  // Returns the list of live files in 'live' and the list
+  // of all files in the filesystem in 'candidate_files'.
+  // If force == false and the last call was less than
+  // db_options_.delete_obsolete_files_period_micros microseconds ago,
+  // it will not fill up the job_context
+  void FindObsoleteFiles(JobContext* job_context, bool force,
+                         bool no_full_scan = false);
+
+  // Diffs the files listed in filenames and those that do not
+  // belong to live files are possibly removed. Also, removes all the
+  // files in sst_delete_files and log_delete_files.
+  // It is not necessary to hold the mutex when invoking this method.
+  // If FindObsoleteFiles() was run, we need to also run
+  // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+  void PurgeObsoleteFiles(JobContext& background_contet,
+                          bool schedule_only = false);
+
+  // Schedule a background job to actually delete obsolete files.
+  void SchedulePurge();
+
+  const SnapshotList& snapshots() const { return snapshots_; }
+
+  // load list of snapshots to `snap_vector` that is no newer than `max_seq`
+  // in ascending order.
+  // `oldest_write_conflict_snapshot` is filled with the oldest snapshot
+  // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true.
+  void LoadSnapshots(std::vector<SequenceNumber>* snap_vector,
+                     SequenceNumber* oldest_write_conflict_snapshot,
+                     const SequenceNumber& max_seq) const {
+    InstrumentedMutexLock l(mutex());
+    snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
+  }
+
+  const ImmutableDBOptions& immutable_db_options() const {
+    return immutable_db_options_;
+  }
+
+  // Cancel all background jobs, including flush, compaction, background
+  // purging, stats dumping threads, etc. If `wait` = true, wait for the
+  // running jobs to abort or finish before returning. Otherwise, only
+  // sends the signals.
+  void CancelAllBackgroundWork(bool wait);
+
+  // Find Super version and reference it. Based on options, it might return
+  // the thread local cached one.
+  // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
+  SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+  // Similar to the previous function but looks up based on a column family id.
+  // nullptr will be returned if this column family no longer exists.
+  // REQUIRED: this function should only be called on the write thread or if the
+  // mutex is held.
+  SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
+
+  // Un-reference the super version and clean it up if it is the last reference.
+  void CleanupSuperVersion(SuperVersion* sv);
+
+  // Un-reference the super version and return it to thread local cache if
+  // needed. If it is the last reference of the super version. Clean it up
+  // after un-referencing it.
+  void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
+
+  // Similar to the previous function but looks up based on a column family id.
+  // nullptr will be returned if this column family no longer exists.
+  // REQUIRED: this function should only be called on the write thread.
+  void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv);
+
+  // REQUIRED: this function should only be called on the write thread or if the
+  // mutex is held.  Return value only valid until next call to this function or
+  // mutex is released.
+  ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
+
+  // Same as above, should called without mutex held and not on write thread.
+  std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
+      uint32_t column_family_id);
+
+  // Returns the number of currently running flushes.
+  // REQUIREMENT: mutex_ must be held when calling this function.
+  int num_running_flushes() {
+    mutex_.AssertHeld();
+    return num_running_flushes_;
+  }
+
+  // Returns the number of currently running compactions.
+  // REQUIREMENT: mutex_ must be held when calling this function.
+  int num_running_compactions() {
+    mutex_.AssertHeld();
+    return num_running_compactions_;
+  }
+
+  const WriteController& write_controller() { return write_controller_; }
+
+  // hollow transactions shell used for recovery.
+  // these will then be passed to TransactionDB so that
+  // locks can be reacquired before writing can resume.
+  struct RecoveredTransaction {
+    std::string name_;
+    bool unprepared_;
+
+    struct BatchInfo {
+      uint64_t log_number_;
+      // TODO(lth): For unprepared, the memory usage here can be big for
+      // unprepared transactions. This is only useful for rollbacks, and we
+      // can in theory just keep keyset for that.
+      WriteBatch* batch_;
+      // Number of sub-batches. A new sub-batch is created if txn attempts to
+      // insert a duplicate key,seq to memtable. This is currently used in
+      // WritePreparedTxn/WriteUnpreparedTxn.
+      size_t batch_cnt_;
+    };
+
+    // This maps the seq of the first key in the batch to BatchInfo, which
+    // contains WriteBatch and other information relevant to the batch.
+    //
+    // For WriteUnprepared, batches_ can have size greater than 1, but for
+    // other write policies, it must be of size 1.
+    std::map<SequenceNumber, BatchInfo> batches_;
+
+    explicit RecoveredTransaction(const uint64_t log, const std::string& name,
+                                  WriteBatch* batch, SequenceNumber seq,
+                                  size_t batch_cnt, bool unprepared)
+        : name_(name), unprepared_(unprepared) {
+      batches_[seq] = {log, batch, batch_cnt};
+    }
+
+    ~RecoveredTransaction() {
+      for (auto& it : batches_) {
+        delete it.second.batch_;
+      }
+    }
+
+    void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
+                  size_t batch_cnt, bool unprepared) {
+      assert(batches_.count(seq) == 0);
+      batches_[seq] = {log_number, batch, batch_cnt};
+      // Prior state must be unprepared, since the prepare batch must be the
+      // last batch.
+      assert(unprepared_);
+      unprepared_ = unprepared;
+    }
+  };
+
+  bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
+
+  std::unordered_map<std::string, RecoveredTransaction*>
+  recovered_transactions() {
+    return recovered_transactions_;
+  }
+
+  RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
+    auto it = recovered_transactions_.find(name);
+    if (it == recovered_transactions_.end()) {
+      return nullptr;
+    } else {
+      return it->second;
+    }
+  }
+
+  void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
+                                  WriteBatch* batch, SequenceNumber seq,
+                                  size_t batch_cnt, bool unprepared_batch) {
+    // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
+    // times for every unprepared batch encountered during recovery.
+    //
+    // If the transaction is prepared, then the last call to
+    // InsertRecoveredTransaction will have unprepared_batch = false.
+    auto rtxn = recovered_transactions_.find(name);
+    if (rtxn == recovered_transactions_.end()) {
+      recovered_transactions_[name] = new RecoveredTransaction(
+          log, name, batch, seq, batch_cnt, unprepared_batch);
+    } else {
+      rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
+    }
+    logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
+  }
+
+  void DeleteRecoveredTransaction(const std::string& name) {
+    auto it = recovered_transactions_.find(name);
+    assert(it != recovered_transactions_.end());
+    auto* trx = it->second;
+    recovered_transactions_.erase(it);
+    for (const auto& info : trx->batches_) {
+      logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
+          info.second.log_number_);
+    }
+    delete trx;
+  }
+
+  void DeleteAllRecoveredTransactions() {
+    for (auto it = recovered_transactions_.begin();
+         it != recovered_transactions_.end(); ++it) {
+      delete it->second;
+    }
+    recovered_transactions_.clear();
+  }
+
+  void AddToLogsToFreeQueue(log::Writer* log_writer) {
+    mutex_.AssertHeld();
+    logs_to_free_queue_.push_back(log_writer);
+  }
+
+  void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
+    superversions_to_free_queue_.push_back(sv);
+  }
+
+  void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
+
+  // Fill JobContext with snapshot information needed by flush and compaction.
+  void GetSnapshotContext(JobContext* job_context,
+                          std::vector<SequenceNumber>* snapshot_seqs,
+                          SequenceNumber* earliest_write_conflict_snapshot,
+                          SnapshotChecker** snapshot_checker);
+
+  // Not thread-safe.
+  void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
+
+  InstrumentedMutex* mutex() const { return &mutex_; }
+
+  // Initialize a brand new DB. The DB directory is expected to be empty before
+  // calling it. Push new manifest file name into `new_filenames`.
+  Status NewDB(std::vector<std::string>* new_filenames);
+
+  // This is to be used only by internal rocksdb classes.
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+                     const bool seq_per_batch, const bool batch_per_txn);
+
+  static IOStatus CreateAndNewDirectory(
+      FileSystem* fs, const std::string& dirname,
+      std::unique_ptr<FSDirectory>* directory);
+
+  // find stats map from stats_history_ with smallest timestamp in
+  // the range of [start_time, end_time)
+  bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
+                       uint64_t* new_time,
+                       std::map<std::string, uint64_t>* stats_map);
+
+  // Print information of all tombstones of all iterators to the std::string
+  // This is only used by ldb. The output might be capped. Tombstones
+  // printed out are not guaranteed to be in any order.
+  Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+                                     int max_entries_to_print,
+                                     std::string* out_str);
+
+  VersionSet* GetVersionSet() const { return versions_.get(); }
+
+  // Wait for any compaction
+  // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this
+  // is only for the special test of CancelledCompactions
+  Status WaitForCompact(bool waitUnscheduled = false);
+
+#ifndef NDEBUG
+  // Compact any files in the named level that overlap [*begin, *end]
+  Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+                           ColumnFamilyHandle* column_family = nullptr,
+                           bool disallow_trivial_move = false);
+
+  Status TEST_SwitchWAL();
+
+  bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
+
+  bool TEST_IsLogGettingFlushed() {
+    return alive_log_files_.begin()->getting_flushed;
+  }
+
+  Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
+
+  // Force current memtable contents to be flushed.
+  Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
+                            ColumnFamilyHandle* cfh = nullptr);
+
+  Status TEST_FlushMemTable(ColumnFamilyData* cfd,
+                            const FlushOptions& flush_opts);
+
+  // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
+  // is because in certain cases, we can flush column families, wait for the
+  // flush to complete, but delete the column family handle before the wait
+  // finishes. For example in CompactRange.
+  Status TEST_AtomicFlushMemTables(const autovector<ColumnFamilyData*>& cfds,
+                                   const FlushOptions& flush_opts);
+
+  // Wait for background threads to complete scheduled work.
+  Status TEST_WaitForBackgroundWork();
+
+  // Wait for memtable compaction
+  Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
+
+  // Wait for any compaction
+  // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this
+  // is only for the special test of CancelledCompactions
+  Status TEST_WaitForCompact(bool waitUnscheduled = false);
+
+  // Wait for any background purge
+  Status TEST_WaitForPurge();
+
+  // Get the background error status
+  Status TEST_GetBGError();
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  uint64_t TEST_MaxNextLevelOverlappingBytes(
+      ColumnFamilyHandle* column_family = nullptr);
+
+  // Return the current manifest file no.
+  uint64_t TEST_Current_Manifest_FileNo();
+
+  // Returns the number that'll be assigned to the next file that's created.
+  uint64_t TEST_Current_Next_FileNo();
+
+  // get total level0 file size. Only for testing.
+  uint64_t TEST_GetLevel0TotalSize();
+
+  void TEST_GetFilesMetaData(
+      ColumnFamilyHandle* column_family,
+      std::vector<std::vector<FileMetaData>>* metadata,
+      std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata = nullptr);
+
+  void TEST_LockMutex();
+
+  void TEST_UnlockMutex();
+
+  // REQUIRES: mutex locked
+  void* TEST_BeginWrite();
+
+  // REQUIRES: mutex locked
+  // pass the pointer that you got from TEST_BeginWrite()
+  void TEST_EndWrite(void* w);
+
+  uint64_t TEST_MaxTotalInMemoryState() const {
+    return max_total_in_memory_state_;
+  }
+
+  size_t TEST_LogsToFreeSize();
+
+  uint64_t TEST_LogfileNumber();
+
+  uint64_t TEST_total_log_size() const { return total_log_size_; }
+
+  // Returns column family name to ImmutableCFOptions map.
+  Status TEST_GetAllImmutableCFOptions(
+      std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
+
+  // Return the lastest MutableCFOptions of a column family
+  Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
+                                        MutableCFOptions* mutable_cf_options);
+
+  Cache* TEST_table_cache() { return table_cache_.get(); }
+
+  WriteController& TEST_write_controler() { return write_controller_; }
+
+  uint64_t TEST_FindMinLogContainingOutstandingPrep();
+  uint64_t TEST_FindMinPrepLogReferencedByMemTable();
+  size_t TEST_PreparedSectionCompletedSize();
+  size_t TEST_LogsWithPrepSize();
+
+  int TEST_BGCompactionsAllowed() const;
+  int TEST_BGFlushesAllowed() const;
+  size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+  void TEST_WaitForPeridicTaskRun(std::function<void()> callback) const;
+  SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const;
+  size_t TEST_EstimateInMemoryStatsHistorySize() const;
+
+  uint64_t TEST_GetCurrentLogNumber() const {
+    InstrumentedMutexLock l(mutex());
+    assert(!logs_.empty());
+    return logs_.back().number;
+  }
+
+  const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
+    return files_grabbed_for_purge_;
+  }
+
+#ifndef ROCKSDB_LITE
+  const PeriodicTaskScheduler& TEST_GetPeriodicTaskScheduler() const;
+#endif  // !ROCKSDB_LITE
+
+#endif  // NDEBUG
+
+  // persist stats to column family "_persistent_stats"
+  void PersistStats();
+
+  // dump rocksdb.stats to LOG
+  void DumpStats();
+
+  // flush LOG out of application buffer
+  void FlushInfoLog();
+
+  // record current sequence number to time mapping
+  void RecordSeqnoToTimeMapping();
+
+  // Interface to block and signal the DB in case of stalling writes by
+  // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface.
+  // When DB needs to be blocked or signalled by WriteBufferManager,
+  // state_ is changed accordingly.
+  class WBMStallInterface : public StallInterface {
+   public:
+    enum State {
+      BLOCKED = 0,
+      RUNNING,
+    };
+
+    WBMStallInterface() : state_cv_(&state_mutex_) {
+      MutexLock lock(&state_mutex_);
+      state_ = State::RUNNING;
+    }
+
+    void SetState(State state) {
+      MutexLock lock(&state_mutex_);
+      state_ = state;
+    }
+
+    // Change the state_ to State::BLOCKED and wait until its state is
+    // changed by WriteBufferManager. When stall is cleared, Signal() is
+    // called to change the state and unblock the DB.
+    void Block() override {
+      MutexLock lock(&state_mutex_);
+      while (state_ == State::BLOCKED) {
+        TEST_SYNC_POINT("WBMStallInterface::BlockDB");
+        state_cv_.Wait();
+      }
+    }
+
+    // Called from WriteBufferManager. This function changes the state_
+    // to State::RUNNING indicating the stall is cleared and DB can proceed.
+    void Signal() override {
+      {
+        MutexLock lock(&state_mutex_);
+        state_ = State::RUNNING;
+      }
+      state_cv_.Signal();
+    }
+
+   private:
+    // Conditional variable and mutex to block and
+    // signal the DB during stalling process.
+    port::Mutex state_mutex_;
+    port::CondVar state_cv_;
+    // state represting whether DB is running or blocked because of stall by
+    // WriteBufferManager.
+    State state_;
+  };
+
+  static void TEST_ResetDbSessionIdGen();
+  static std::string GenerateDbSessionId(Env* env);
+
+  bool seq_per_batch() const { return seq_per_batch_; }
+
+ protected:
+  const std::string dbname_;
+  // TODO(peterd): unify with VersionSet::db_id_
+  std::string db_id_;
+  // db_session_id_ is an identifier that gets reset
+  // every time the DB is opened
+  std::string db_session_id_;
+  std::unique_ptr<VersionSet> versions_;
+  // Flag to check whether we allocated and own the info log file
+  bool own_info_log_;
+  Status init_logger_creation_s_;
+  const DBOptions initial_db_options_;
+  Env* const env_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  const ImmutableDBOptions immutable_db_options_;
+  FileSystemPtr fs_;
+  MutableDBOptions mutable_db_options_;
+  Statistics* stats_;
+  std::unordered_map<std::string, RecoveredTransaction*>
+      recovered_transactions_;
+  std::unique_ptr<Tracer> tracer_;
+  InstrumentedMutex trace_mutex_;
+  BlockCacheTracer block_cache_tracer_;
+
+  // constant false canceled flag, used when the compaction is not manual
+  const std::atomic<bool> kManualCompactionCanceledFalse_{false};
+
+  // State below is protected by mutex_
+  // With two_write_queues enabled, some of the variables that accessed during
+  // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
+  // logs_, logfile_number_. Refer to the definition of each variable below for
+  // more description.
+  //
+  // `mutex_` can be a hot lock in some workloads, so it deserves dedicated
+  // cachelines.
+  mutable CacheAlignedInstrumentedMutex mutex_;
+
+  ColumnFamilyHandleImpl* default_cf_handle_;
+  InternalStats* default_cf_internal_stats_;
+
+  // table_cache_ provides its own synchronization
+  std::shared_ptr<Cache> table_cache_;
+
+  ErrorHandler error_handler_;
+
+  // Unified interface for logging events
+  EventLogger event_logger_;
+
+  // only used for dynamically adjusting max_total_wal_size. it is a sum of
+  // [write_buffer_size * max_write_buffer_number] over all column families
+  std::atomic<uint64_t> max_total_in_memory_state_;
+
+  // The options to access storage files
+  const FileOptions file_options_;
+
+  // Additonal options for compaction and flush
+  FileOptions file_options_for_compaction_;
+
+  std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+
+  // Increase the sequence number after writing each batch, whether memtable is
+  // disabled for that or not. Otherwise the sequence number is increased after
+  // writing each key into memtable. This implies that when disable_memtable is
+  // set, the seq is not increased at all.
+  //
+  // Default: false
+  const bool seq_per_batch_;
+  // This determines during recovery whether we expect one writebatch per
+  // recovered transaction, or potentially multiple writebatches per
+  // transaction. For WriteUnprepared, this is set to false, since multiple
+  // batches can exist per transaction.
+  //
+  // Default: true
+  const bool batch_per_txn_;
+
+  // Each flush or compaction gets its own job id. this counter makes sure
+  // they're unique
+  std::atomic<int> next_job_id_;
+
+  std::atomic<bool> shutting_down_;
+
+  // RecoveryContext struct stores the context about version edits along
+  // with corresponding column_family_data and column_family_options.
+  class RecoveryContext {
+   public:
+    ~RecoveryContext() {
+      for (auto& edit_list : edit_lists_) {
+        for (auto* edit : edit_list) {
+          delete edit;
+        }
+      }
+    }
+
+    void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
+      assert(cfd != nullptr);
+      if (map_.find(cfd->GetID()) == map_.end()) {
+        uint32_t size = static_cast<uint32_t>(map_.size());
+        map_.emplace(cfd->GetID(), size);
+        cfds_.emplace_back(cfd);
+        mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
+        edit_lists_.emplace_back(autovector<VersionEdit*>());
+      }
+      uint32_t i = map_[cfd->GetID()];
+      edit_lists_[i].emplace_back(new VersionEdit(edit));
+    }
+
+    std::unordered_map<uint32_t, uint32_t> map_;  // cf_id to index;
+    autovector<ColumnFamilyData*> cfds_;
+    autovector<const MutableCFOptions*> mutable_cf_opts_;
+    autovector<autovector<VersionEdit*>> edit_lists_;
+    // files_to_delete_ contains sst files
+    std::unordered_set<std::string> files_to_delete_;
+  };
+
+  // Except in DB::Open(), WriteOptionsFile can only be called when:
+  // Persist options to options file.
+  // If need_mutex_lock = false, the method will lock DB mutex.
+  // If need_enter_write_thread = false, the method will enter write thread.
+  Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
+
+  Status CompactRangeInternal(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end,
+                              const std::string& trim_ts);
+
+  // The following two functions can only be called when:
+  // 1. WriteThread::Writer::EnterUnbatched() is used.
+  // 2. db_mutex is NOT held
+  Status RenameTempFileToOptionsFile(const std::string& file_name);
+  Status DeleteObsoleteOptionsFiles();
+
+  void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+                          const MutableCFOptions& mutable_cf_options,
+                          int job_id);
+
+  void NotifyOnFlushCompleted(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
+
+  void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+                               const Status& st,
+                               const CompactionJobStats& job_stats, int job_id);
+
+  void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
+                                   const Status& st,
+                                   const CompactionJobStats& job_stats,
+                                   int job_id);
+  void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+                              const MemTableInfo& mem_table_info);
+
+#ifndef ROCKSDB_LITE
+  void NotifyOnExternalFileIngested(
+      ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
+
+  virtual Status FlushForGetLiveFiles();
+#endif  // !ROCKSDB_LITE
+
+  void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusDbInfo() const;
+
+  // If disable_memtable is set the application logic must guarantee that the
+  // batch will still be skipped from memtable during the recovery. An excption
+  // to this is seq_per_batch_ mode, in which since each batch already takes one
+  // seq, it is ok for the batch to write to memtable during recovery as long as
+  // it only takes one sequence number: i.e., no duplicate keys.
+  // In WriteCommitted it is guarnateed since disable_memtable is used for
+  // prepare batch which will be written to memtable later during the commit,
+  // and in WritePrepared it is guaranteed since it will be used only for WAL
+  // markers which will never be written to memtable. If the commit marker is
+  // accompanied with CommitTimeWriteBatch that is not written to memtable as
+  // long as it has no duplicate keys, it does not violate the one-seq-per-batch
+  // policy.
+  // batch_cnt is expected to be non-zero in seq_per_batch mode and
+  // indicates the number of sub-patches. A sub-patch is a subset of the write
+  // batch that does not have duplicate keys.
+  Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+                   WriteCallback* callback = nullptr,
+                   uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+                   bool disable_memtable = false, uint64_t* seq_used = nullptr,
+                   size_t batch_cnt = 0,
+                   PreReleaseCallback* pre_release_callback = nullptr,
+                   PostMemTableCallback* post_memtable_callback = nullptr);
+
+  Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
+                            WriteCallback* callback = nullptr,
+                            uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+                            bool disable_memtable = false,
+                            uint64_t* seq_used = nullptr);
+
+  // Write only to memtables without joining any write queue
+  Status UnorderedWriteMemtable(const WriteOptions& write_options,
+                                WriteBatch* my_batch, WriteCallback* callback,
+                                uint64_t log_ref, SequenceNumber seq,
+                                const size_t sub_batch_cnt);
+
+  // Whether the batch requires to be assigned with an order
+  enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+  // Whether it requires publishing last sequence or not
+  enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+  // Join the write_thread to write the batch only to the WAL. It is the
+  // responsibility of the caller to also write the write batch to the memtable
+  // if it required.
+  //
+  // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder
+  // indicating the number of sub-batches in my_batch. A sub-patch is a subset
+  // of the write batch that does not have duplicate keys. When seq_per_batch is
+  // not set, each key is a separate sub_batch. Otherwise each duplicate key
+  // marks start of a new sub-batch.
+  Status WriteImplWALOnly(
+      WriteThread* write_thread, const WriteOptions& options,
+      WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+      const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+      PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+      const PublishLastSeq publish_last_seq, const bool disable_memtable);
+
+  // write cached_recoverable_state_ to memtable if it is not empty
+  // The writer must be the leader in write_thread_ and holding mutex_
+  Status WriteRecoverableState();
+
+  // Actual implementation of Close()
+  Status CloseImpl();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
+  // skipped.
+  // recovery_ctx stores the context about version edits and all those
+  // edits are persisted to new Manifest after successfully syncing the new WAL.
+  virtual Status Recover(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      bool read_only = false, bool error_if_wal_file_exists = false,
+      bool error_if_data_exists_in_wals = false,
+      uint64_t* recovered_seq = nullptr,
+      RecoveryContext* recovery_ctx = nullptr);
+
+  virtual bool OwnTablesAndLogs() const { return true; }
+
+  // Setup DB identity file, and write DB ID to manifest if necessary.
+  Status SetupDBId(bool read_only, RecoveryContext* recovery_ctx);
+  // Assign db_id_ and write DB ID to manifest if necessary.
+  void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx);
+
+  // REQUIRES: db mutex held when calling this function, but the db mutex can
+  // be released and re-acquired. Db mutex will be held when the function
+  // returns.
+  // After recovery, there may be SST files in db/cf paths that are
+  // not referenced in the MANIFEST (e.g.
+  // 1. It's best effort recovery;
+  // 2. The VersionEdits referencing the SST files are appended to
+  // RecoveryContext, DB crashes when syncing the MANIFEST, the VersionEdits are
+  // still not synced to MANIFEST during recovery.)
+  // It stores the SST files to be deleted in RecoveryContext. In the
+  // meantime, we find out the largest file number present in the paths, and
+  // bump up the version set's next_file_number_ to be 1 + largest_file_number.
+  // recovery_ctx stores the context about version edits and files to be
+  // deleted. All those edits are persisted to new Manifest after successfully
+  // syncing the new WAL.
+  Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
+
+  // SetDbSessionId() should be called in the constuctor DBImpl()
+  // to ensure that db_session_id_ gets updated every time the DB is opened
+  void SetDbSessionId();
+
+  Status FailIfCfHasTs(const ColumnFamilyHandle* column_family) const;
+  Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family, const Slice& ts,
+                            bool ts_for_read) const;
+
+  // recovery_ctx stores the context about version edits and
+  // LogAndApplyForRecovery persist all those edits to new Manifest after
+  // successfully syncing new WAL.
+  // LogAndApplyForRecovery should be called only once during recovery and it
+  // should be called when RocksDB writes to a first new MANIFEST since this
+  // recovery.
+  Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
+
+  void InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
+
+  // Return true to proceed with current WAL record whose content is stored in
+  // `batch`. Return false to skip current WAL record.
+  bool InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
+                                          const std::string& wal_fname,
+                                          log::Reader::Reporter& reporter,
+                                          Status& status, bool& stop_replay,
+                                          WriteBatch& batch);
+
+ private:
+  friend class DB;
+  friend class ErrorHandler;
+  friend class InternalStats;
+  friend class PessimisticTransaction;
+  friend class TransactionBaseImpl;
+  friend class WriteCommittedTxn;
+  friend class WritePreparedTxn;
+  friend class WritePreparedTxnDB;
+  friend class WriteBatchWithIndex;
+  friend class WriteUnpreparedTxnDB;
+  friend class WriteUnpreparedTxn;
+
+#ifndef ROCKSDB_LITE
+  friend class ForwardIterator;
+#endif
+  friend struct SuperVersion;
+  friend class CompactedDBImpl;
+  friend class DBTest_ConcurrentFlushWAL_Test;
+  friend class DBTest_MixedSlowdownOptionsStop_Test;
+  friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
+  friend class DBCompactionTest_CompactionDuringShutdown_Test;
+  friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
+#ifndef NDEBUG
+  friend class DBTest2_ReadCallbackTest_Test;
+  friend class WriteCallbackPTest_WriteWithCallbackTest_Test;
+  friend class XFTransactionWriteHandler;
+  friend class DBBlobIndexTest;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+#endif
+
+  struct CompactionState;
+  struct PrepickedCompaction;
+  struct PurgeFileInfo;
+
+  struct WriteContext {
+    SuperVersionContext superversion_context;
+    autovector<MemTable*> memtables_to_free_;
+
+    explicit WriteContext(bool create_superversion = false)
+        : superversion_context(create_superversion) {}
+
+    ~WriteContext() {
+      superversion_context.Clean();
+      for (auto& m : memtables_to_free_) {
+        delete m;
+      }
+    }
+  };
+
+  struct LogFileNumberSize {
+    explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
+    LogFileNumberSize() {}
+    void AddSize(uint64_t new_size) { size += new_size; }
+    uint64_t number;
+    uint64_t size = 0;
+    bool getting_flushed = false;
+  };
+
+  struct LogWriterNumber {
+    // pass ownership of _writer
+    LogWriterNumber(uint64_t _number, log::Writer* _writer)
+        : number(_number), writer(_writer) {}
+
+    log::Writer* ReleaseWriter() {
+      auto* w = writer;
+      writer = nullptr;
+      return w;
+    }
+    Status ClearWriter() {
+      Status s = writer->WriteBuffer();
+      delete writer;
+      writer = nullptr;
+      return s;
+    }
+
+    bool IsSyncing() { return getting_synced; }
+
+    uint64_t GetPreSyncSize() {
+      assert(getting_synced);
+      return pre_sync_size;
+    }
+
+    void PrepareForSync() {
+      assert(!getting_synced);
+      // Size is expected to be monotonically increasing.
+      assert(writer->file()->GetFlushedSize() >= pre_sync_size);
+      getting_synced = true;
+      pre_sync_size = writer->file()->GetFlushedSize();
+    }
+
+    void FinishSync() {
+      assert(getting_synced);
+      getting_synced = false;
+    }
+
+    uint64_t number;
+    // Visual Studio doesn't support deque's member to be noncopyable because
+    // of a std::unique_ptr as a member.
+    log::Writer* writer;  // own
+
+   private:
+    // true for some prefix of logs_
+    bool getting_synced = false;
+    // The size of the file before the sync happens. This amount is guaranteed
+    // to be persisted even if appends happen during sync so it can be used for
+    // tracking the synced size in MANIFEST.
+    uint64_t pre_sync_size = 0;
+  };
+
+  struct LogContext {
+    explicit LogContext(bool need_sync = false)
+        : need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
+    bool need_log_sync = false;
+    bool need_log_dir_sync = false;
+    log::Writer* writer = nullptr;
+    LogFileNumberSize* log_file_number_size = nullptr;
+  };
+
+  // PurgeFileInfo is a structure to hold information of files to be deleted in
+  // purge_files_
+  struct PurgeFileInfo {
+    std::string fname;
+    std::string dir_to_sync;
+    FileType type;
+    uint64_t number;
+    int job_id;
+    PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
+                  int jid)
+        : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
+  };
+
+  // Argument required by background flush thread.
+  struct BGFlushArg {
+    BGFlushArg()
+        : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
+    BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
+               SuperVersionContext* superversion_context)
+        : cfd_(cfd),
+          max_memtable_id_(max_memtable_id),
+          superversion_context_(superversion_context) {}
+
+    // Column family to flush.
+    ColumnFamilyData* cfd_;
+    // Maximum ID of memtable to flush. In this column family, memtables with
+    // IDs smaller than this value must be flushed before this flush completes.
+    uint64_t max_memtable_id_;
+    // Pointer to a SuperVersionContext object. After flush completes, RocksDB
+    // installs a new superversion for the column family. This operation
+    // requires a SuperVersionContext object (currently embedded in JobContext).
+    SuperVersionContext* superversion_context_;
+  };
+
+  // Argument passed to flush thread.
+  struct FlushThreadArg {
+    DBImpl* db_;
+
+    Env::Priority thread_pri_;
+  };
+
+  // Information for a manual compaction
+  struct ManualCompactionState {
+    ManualCompactionState(ColumnFamilyData* _cfd, int _input_level,
+                          int _output_level, uint32_t _output_path_id,
+                          bool _exclusive, bool _disallow_trivial_move,
+                          std::atomic<bool>* _canceled)
+        : cfd(_cfd),
+          input_level(_input_level),
+          output_level(_output_level),
+          output_path_id(_output_path_id),
+          exclusive(_exclusive),
+          disallow_trivial_move(_disallow_trivial_move),
+          canceled(_canceled ? *_canceled : canceled_internal_storage) {}
+    // When _canceled is not provided by ther user, we assign the reference of
+    // canceled_internal_storage to it to consolidate canceled and
+    // manual_compaction_paused since DisableManualCompaction() might be
+    // called
+
+    ColumnFamilyData* cfd;
+    int input_level;
+    int output_level;
+    uint32_t output_path_id;
+    Status status;
+    bool done = false;
+    bool in_progress = false;    // compaction request being processed?
+    bool incomplete = false;     // only part of requested range compacted
+    bool exclusive;              // current behavior of only one manual
+    bool disallow_trivial_move;  // Force actual compaction to run
+    const InternalKey* begin = nullptr;  // nullptr means beginning of key range
+    const InternalKey* end = nullptr;    // nullptr means end of key range
+    InternalKey* manual_end = nullptr;   // how far we are compacting
+    InternalKey tmp_storage;      // Used to keep track of compaction progress
+    InternalKey tmp_storage1;     // Used to keep track of compaction progress
+
+    // When the user provides a canceled pointer in CompactRangeOptions, the
+    // above varaibe is the reference of the user-provided
+    // `canceled`, otherwise, it is the reference of canceled_internal_storage
+    std::atomic<bool> canceled_internal_storage = false;
+    std::atomic<bool>& canceled;  // Compaction canceled pointer reference
+  };
+  struct PrepickedCompaction {
+    // background compaction takes ownership of `compaction`.
+    Compaction* compaction;
+    // caller retains ownership of `manual_compaction_state` as it is reused
+    // across background compactions.
+    ManualCompactionState* manual_compaction_state;  // nullptr if non-manual
+    // task limiter token is requested during compaction picking.
+    std::unique_ptr<TaskLimiterToken> task_token;
+  };
+
+  struct CompactionArg {
+    // caller retains ownership of `db`.
+    DBImpl* db;
+    // background compaction takes ownership of `prepicked_compaction`.
+    PrepickedCompaction* prepicked_compaction;
+    Env::Priority compaction_pri_;
+  };
+
+  // Initialize the built-in column family for persistent stats. Depending on
+  // whether on-disk persistent stats have been enabled before, it may either
+  // create a new column family and column family handle or just a column family
+  // handle.
+  // Required: DB mutex held
+  Status InitPersistStatsColumnFamily();
+
+  // Persistent Stats column family has two format version key which are used
+  // for compatibility check. Write format version if it's created for the
+  // first time, read format version and check compatibility if recovering
+  // from disk. This function requires DB mutex held at entrance but may
+  // release and re-acquire DB mutex in the process.
+  // Required: DB mutex held
+  Status PersistentStatsProcessFormatVersion();
+
+  Status ResumeImpl(DBRecoverContext context);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  const Status CreateArchivalDirectory();
+
+  Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+                                const std::string& cf_name,
+                                ColumnFamilyHandle** handle);
+
+  Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+  // Delete obsolete files and log status and information of file deletion
+  void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+                              const std::string& path_to_sync, FileType type,
+                              uint64_t number);
+
+  // Background process needs to call
+  //     auto x = CaptureCurrentFileNumberInPendingOutputs()
+  //     auto file_num = versions_->NewFileNumber();
+  //     <do something>
+  //     ReleaseFileNumberFromPendingOutputs(x)
+  // This will protect any file with number `file_num` or greater from being
+  // deleted while <do something> is running.
+  // -----------
+  // This function will capture current file number and append it to
+  // pending_outputs_. This will prevent any background process to delete any
+  // file created after this point.
+  std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+  // This function should be called with the result of
+  // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+  // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+  // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+  // and blocked by any other pending_outputs_ calls)
+  void ReleaseFileNumberFromPendingOutputs(
+      std::unique_ptr<std::list<uint64_t>::iterator>& v);
+
+  IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals);
+
+  // Flush the in-memory write buffer to storage.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful. Then
+  // installs a new super version for the column family.
+  Status FlushMemTableToOutputFile(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      bool* madeProgress, JobContext* job_context,
+      SuperVersionContext* superversion_context,
+      std::vector<SequenceNumber>& snapshot_seqs,
+      SequenceNumber earliest_write_conflict_snapshot,
+      SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+      Env::Priority thread_pri);
+
+  // Flush the memtables of (multiple) column families to multiple files on
+  // persistent storage.
+  Status FlushMemTablesToOutputFiles(
+      const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+      JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+  Status AtomicFlushMemTablesToOutputFiles(
+      const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+      JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+  // REQUIRES: log_numbers are sorted in ascending order
+  // corrupted_log_found is set to true if we recover from a corrupted log file.
+  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                         SequenceNumber* next_sequence, bool read_only,
+                         bool* corrupted_log_found,
+                         RecoveryContext* recovery_ctx);
+
+  // The following two methods are used to flush a memtable to
+  // storage. The first one is used at database RecoveryTime (when the
+  // database is opened) and is heavyweight because it holds the mutex
+  // for the entire period. The second method WriteLevel0Table supports
+  // concurrent flush memtables to storage.
+  Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+                                     MemTable* mem, VersionEdit* edit);
+
+  // Get the size of a log file and, if truncate is true, truncate the
+  // log file to its actual size, thereby freeing preallocated space.
+  // Return success even if truncate fails
+  Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+                                    LogFileNumberSize* log);
+
+  // Restore alive_log_files_ and total_log_size_ after recovery.
+  // It needs to run only when there's no flush during recovery
+  // (e.g. avoid_flush_during_recovery=true). May also trigger flush
+  // in case total_log_size > max_total_wal_size.
+  Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
+
+  // num_bytes: for slowdown case, delay time is calculated based on
+  //            `num_bytes` going through.
+  Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);
+
+  // Begin stalling of writes when memory usage increases beyond a certain
+  // threshold.
+  void WriteBufferManagerStallWrites();
+
+  Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+                                      WriteBatch* my_batch);
+
+  // REQUIRES: mutex locked and in write thread.
+  Status ScheduleFlushes(WriteContext* context);
+
+  void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
+
+  Status TrimMemtableHistory(WriteContext* context);
+
+  Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
+
+  void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
+
+  // Force current memtable contents to be flushed.
+  Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
+                       FlushReason flush_reason,
+                       bool entered_write_thread = false);
+
+  Status AtomicFlushMemTables(
+      const autovector<ColumnFamilyData*>& column_family_datas,
+      const FlushOptions& options, FlushReason flush_reason,
+      bool entered_write_thread = false);
+
+  // Wait until flushing this column family won't stall writes
+  Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+                                           bool* flush_needed);
+
+  // Wait for memtable flushed.
+  // If flush_memtable_id is non-null, wait until the memtable with the ID
+  // gets flush. Otherwise, wait until the column family don't have any
+  // memtable pending flush.
+  // resuming_from_bg_err indicates whether the caller is attempting to resume
+  // from background error.
+  Status WaitForFlushMemTable(ColumnFamilyData* cfd,
+                              const uint64_t* flush_memtable_id = nullptr,
+                              bool resuming_from_bg_err = false) {
+    return WaitForFlushMemTables({cfd}, {flush_memtable_id},
+                                 resuming_from_bg_err);
+  }
+  // Wait for memtables to be flushed for multiple column families.
+  Status WaitForFlushMemTables(
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const uint64_t*>& flush_memtable_ids,
+      bool resuming_from_bg_err);
+
+  inline void WaitForPendingWrites() {
+    mutex_.AssertHeld();
+    TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
+    // In case of pipelined write is enabled, wait for all pending memtable
+    // writers.
+    if (immutable_db_options_.enable_pipelined_write) {
+      // Memtable writers may call DB::Get in case max_successive_merges > 0,
+      // which may lock mutex. Unlocking mutex here to avoid deadlock.
+      mutex_.Unlock();
+      write_thread_.WaitForMemTableWriters();
+      mutex_.Lock();
+    }
+
+    if (!immutable_db_options_.unordered_write) {
+      // Then the writes are finished before the next write group starts
+      return;
+    }
+
+    // Wait for the ones who already wrote to the WAL to finish their
+    // memtable write.
+    if (pending_memtable_writes_.load() != 0) {
+      std::unique_lock<std::mutex> guard(switch_mutex_);
+      switch_cv_.wait(guard,
+                      [&] { return pending_memtable_writes_.load() == 0; });
+    }
+  }
+
+  // TaskType is used to identify tasks in thread-pool, currently only
+  // differentiate manual compaction, which could be unscheduled from the
+  // thread-pool.
+  enum class TaskType : uint8_t {
+    kDefault = 0,
+    kManualCompaction = 1,
+    kCount = 2,
+  };
+
+  // Task tag is used to identity tasks in thread-pool, which is
+  // dbImpl obj address + type
+  inline void* GetTaskTag(TaskType type) {
+    return GetTaskTag(static_cast<uint8_t>(type));
+  }
+
+  inline void* GetTaskTag(uint8_t type) {
+    return static_cast<uint8_t*>(static_cast<void*>(this)) + type;
+  }
+
+  // REQUIRES: mutex locked and in write thread.
+  void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
+
+  // REQUIRES: mutex locked and in write thread.
+  Status SwitchWAL(WriteContext* write_context);
+
+  // REQUIRES: mutex locked and in write thread.
+  Status HandleWriteBufferManagerFlush(WriteContext* write_context);
+
+  // REQUIRES: mutex locked
+  Status PreprocessWrite(const WriteOptions& write_options,
+                         LogContext* log_context, WriteContext* write_context);
+
+  // Merge write batches in the write group into merged_batch.
+  // Returns OK if merge is successful.
+  // Returns Corruption if corruption in write batch is detected.
+  Status MergeBatch(const WriteThread::WriteGroup& write_group,
+                    WriteBatch* tmp_batch, WriteBatch** merged_batch,
+                    size_t* write_with_wal, WriteBatch** to_be_cached_state);
+
+  // rate_limiter_priority is used to charge `DBOptions::rate_limiter`
+  // for automatic WAL flush (`Options::manual_wal_flush` == false)
+  // associated with this WriteToWAL
+  IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
+                      uint64_t* log_used, uint64_t* log_size,
+                      Env::IOPriority rate_limiter_priority,
+                      LogFileNumberSize& log_file_number_size);
+
+  IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
+                      log::Writer* log_writer, uint64_t* log_used,
+                      bool need_log_sync, bool need_log_dir_sync,
+                      SequenceNumber sequence,
+                      LogFileNumberSize& log_file_number_size);
+
+  IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+                                uint64_t* log_used,
+                                SequenceNumber* last_sequence, size_t seq_inc);
+
+  // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+  // Caller must hold mutex_.
+  void WriteStatusCheckOnLocked(const Status& status);
+
+  // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+  void WriteStatusCheck(const Status& status);
+
+  // Used by WriteImpl to update bg_error_ when IO error happens, e.g., write
+  // WAL, sync WAL fails, if paranoid check is enabled.
+  void IOStatusCheck(const IOStatus& status);
+
+  // Used by WriteImpl to update bg_error_ in case of memtable insert error.
+  void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+
+#ifndef ROCKSDB_LITE
+  Status CompactFilesImpl(const CompactionOptions& compact_options,
+                          ColumnFamilyData* cfd, Version* version,
+                          const std::vector<std::string>& input_file_names,
+                          std::vector<std::string>* const output_file_names,
+                          const int output_level, int output_path_id,
+                          JobContext* job_context, LogBuffer* log_buffer,
+                          CompactionJobInfo* compaction_job_info);
+
+  // Wait for current IngestExternalFile() calls to finish.
+  // REQUIRES: mutex_ held
+  void WaitForIngestFile();
+#else
+  // IngestExternalFile is not supported in ROCKSDB_LITE so this function
+  // will be no-op
+  void WaitForIngestFile() {}
+#endif  // ROCKSDB_LITE
+
+  ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
+  void MaybeScheduleFlushOrCompaction();
+
+  // A flush request specifies the column families to flush as well as the
+  // largest memtable id to persist for each column family. Once all the
+  // memtables whose IDs are smaller than or equal to this per-column-family
+  // specified value, this flush request is considered to have completed its
+  // work of flushing this column family. After completing the work for all
+  // column families in this request, this flush is considered complete.
+  using FlushRequest = std::vector<std::pair<ColumnFamilyData*, uint64_t>>;
+
+  void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+                            FlushRequest* req);
+
+  void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);
+
+  void SchedulePendingCompaction(ColumnFamilyData* cfd);
+  void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+                            FileType type, uint64_t number, int job_id);
+  static void BGWorkCompaction(void* arg);
+  // Runs a pre-chosen universal compaction involving bottom level in a
+  // separate, bottom-pri thread pool.
+  static void BGWorkBottomCompaction(void* arg);
+  static void BGWorkFlush(void* arg);
+  static void BGWorkPurge(void* arg);
+  static void UnscheduleCompactionCallback(void* arg);
+  static void UnscheduleFlushCallback(void* arg);
+  void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+                                Env::Priority thread_pri);
+  void BackgroundCallFlush(Env::Priority thread_pri);
+  void BackgroundCallPurge();
+  Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
+                              LogBuffer* log_buffer,
+                              PrepickedCompaction* prepicked_compaction,
+                              Env::Priority thread_pri);
+  Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
+                         LogBuffer* log_buffer, FlushReason* reason,
+                         Env::Priority thread_pri);
+
+  bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+                               const std::vector<CompactionInputFiles>& inputs,
+                               bool* sfm_bookkeeping, LogBuffer* log_buffer);
+
+  // Request compaction tasks token from compaction thread limiter.
+  // It always succeeds if force = true or limiter is disable.
+  bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+                              std::unique_ptr<TaskLimiterToken>* token,
+                              LogBuffer* log_buffer);
+
+  // Schedule background tasks
+  Status StartPeriodicTaskScheduler();
+
+  Status RegisterRecordSeqnoTimeWorker();
+
+  void PrintStatistics();
+
+  size_t EstimateInMemoryStatsHistorySize() const;
+
+  // Return the minimum empty level that could hold the total data in the
+  // input level. Return the input level, if such level could not be found.
+  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   int level);
+
+  // Move the files in the input level to the target level.
+  // If target_level < 0, automatically calculate the minimum level that could
+  // hold the data set.
+  Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+  // helper functions for adding and removing from flush & compaction queues
+  void AddToCompactionQueue(ColumnFamilyData* cfd);
+  ColumnFamilyData* PopFirstFromCompactionQueue();
+  FlushRequest PopFirstFromFlushQueue();
+
+  // Pick the first unthrottled compaction with task token from queue.
+  ColumnFamilyData* PickCompactionFromQueue(
+      std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
+
+  // helper function to call after some of the logs_ were synced
+  void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
+  Status ApplyWALToManifest(VersionEdit* edit);
+  // WALs with log number up to up_to are not synced successfully.
+  void MarkLogsNotSynced(uint64_t up_to);
+
+  SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
+                                bool lock = true);
+
+  // If snapshot_seq != kMaxSequenceNumber, then this function can only be
+  // called from the write thread that publishes sequence numbers to readers.
+  // For 1) write-committed, or 2) write-prepared + one-write-queue, this will
+  // be the write thread performing memtable writes. For write-prepared with
+  // two write queues, this will be the write thread writing commit marker to
+  // the WAL.
+  // If snapshot_seq == kMaxSequenceNumber, this function is called by a caller
+  // ensuring no writes to the database.
+  std::pair<Status, std::shared_ptr<const SnapshotImpl>>
+  CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
+                                bool lock = true);
+
+  uint64_t GetMaxTotalWalSize() const;
+
+  FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
+
+  Status MaybeReleaseTimestampedSnapshotsAndCheck();
+
+  Status CloseHelper();
+
+  void WaitForBackgroundWork();
+
+  // Background threads call this function, which is just a wrapper around
+  // the InstallSuperVersion() function. Background threads carry
+  // sv_context which can have new_superversion already
+  // allocated.
+  // All ColumnFamily state changes go through this function. Here we analyze
+  // the new state and we schedule background work if we detect that the new
+  // state needs flush or compaction.
+  void InstallSuperVersionAndScheduleWork(
+      ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+      const MutableCFOptions& mutable_cf_options);
+
+  bool GetIntPropertyInternal(ColumnFamilyData* cfd,
+                              const DBPropertyInfo& property_info,
+                              bool is_locked, uint64_t* value);
+  bool GetPropertyHandleOptionsStatistics(std::string* value);
+
+  bool HasPendingManualCompaction();
+  bool HasExclusiveManualCompaction();
+  void AddManualCompaction(ManualCompactionState* m);
+  void RemoveManualCompaction(ManualCompactionState* m);
+  bool ShouldntRunManualCompaction(ManualCompactionState* m);
+  bool HaveManualCompaction(ColumnFamilyData* cfd);
+  bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
+#ifndef ROCKSDB_LITE
+  void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
+                              const Status& st,
+                              const CompactionJobStats& compaction_job_stats,
+                              const int job_id, const Version* current,
+                              CompactionJobInfo* compaction_job_info) const;
+  // Reserve the next 'num' file numbers for to-be-ingested external SST files,
+  // and return the current file_number in 'next_file_number'.
+  // Write a version edit to the MANIFEST.
+  Status ReserveFileNumbersBeforeIngestion(
+      ColumnFamilyData* cfd, uint64_t num,
+      std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+      uint64_t* next_file_number);
+#endif  //! ROCKSDB_LITE
+
+  bool ShouldPurge(uint64_t file_number) const;
+  void MarkAsGrabbedForPurge(uint64_t file_number);
+
+  size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+  Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
+
+  IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+                     size_t preallocate_block_size, log::Writer** new_log);
+
+  // Validate self-consistency of DB options
+  static Status ValidateOptions(const DBOptions& db_options);
+  // Validate self-consistency of DB options and its consistency with cf options
+  static Status ValidateOptions(
+      const DBOptions& db_options,
+      const std::vector<ColumnFamilyDescriptor>& column_families);
+
+  // Utility function to do some debug validation and sort the given vector
+  // of MultiGet keys
+  void PrepareMultiGetKeys(
+      const size_t num_keys, bool sorted,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
+
+  // A structure to hold the information required to process MultiGet of keys
+  // belonging to one column family. For a multi column family MultiGet, there
+  // will be a container of these objects.
+  struct MultiGetColumnFamilyData {
+    ColumnFamilyHandle* cf;
+    ColumnFamilyData* cfd;
+
+    // For the batched MultiGet which relies on sorted keys, start specifies
+    // the index of first key belonging to this column family in the sorted
+    // list.
+    size_t start;
+
+    // For the batched MultiGet case, num_keys specifies the number of keys
+    // belonging to this column family in the sorted list
+    size_t num_keys;
+
+    // SuperVersion for the column family obtained in a manner that ensures a
+    // consistent view across all column families in the DB
+    SuperVersion* super_version;
+    MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
+                             SuperVersion* sv)
+        : cf(column_family),
+          cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+          start(0),
+          num_keys(0),
+          super_version(sv) {}
+
+    MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
+                             size_t count, SuperVersion* sv)
+        : cf(column_family),
+          cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+          start(first),
+          num_keys(count),
+          super_version(sv) {}
+
+    MultiGetColumnFamilyData() = default;
+  };
+
+  // A common function to obtain a consistent snapshot, which can be implicit
+  // if the user doesn't specify a snapshot in read_options, across
+  // multiple column families for MultiGet. It will attempt to get an implicit
+  // snapshot without acquiring the db_mutes, but will give up after a few
+  // tries and acquire the mutex if a memtable flush happens. The template
+  // allows both the batched and non-batched MultiGet to call this with
+  // either an std::unordered_map or autovector of column families.
+  //
+  // If callback is non-null, the callback is refreshed with the snapshot
+  // sequence number
+  //
+  // A return value of true indicates that the SuperVersions were obtained
+  // from the ColumnFamilyData, whereas false indicates they are thread
+  // local
+  template <class T>
+  bool MultiCFSnapshot(
+      const ReadOptions& read_options, ReadCallback* callback,
+      std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+          iter_deref_func,
+      T* cf_list, SequenceNumber* snapshot);
+
+  // The actual implementation of the batching MultiGet. The caller is expected
+  // to have acquired the SuperVersion and pass in a snapshot sequence number
+  // in order to construct the LookupKeys. The start_key and num_keys specify
+  // the range of keys in the sorted_keys vector for a single column family.
+  Status MultiGetImpl(
+      const ReadOptions& read_options, size_t start_key, size_t num_keys,
+      autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+      SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback);
+
+  Status DisableFileDeletionsWithLock();
+
+  Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
+                                      std::string ts_low);
+
+  bool ShouldReferenceSuperVersion(const MergeContext& merge_context);
+
+  // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
+  FileLock* db_lock_;
+
+  // In addition to mutex_, log_write_mutex_ protected writes to stats_history_
+  InstrumentedMutex stats_history_mutex_;
+  // In addition to mutex_, log_write_mutex_ protected writes to logs_ and
+  // logfile_number_. With two_write_queues it also protects alive_log_files_,
+  // and log_empty_. Refer to the definition of each variable below for more
+  // details.
+  // Note: to avoid dealock, if needed to acquire both log_write_mutex_ and
+  // mutex_, the order should be first mutex_ and then log_write_mutex_.
+  InstrumentedMutex log_write_mutex_;
+
+  // If zero, manual compactions are allowed to proceed. If non-zero, manual
+  // compactions may still be running, but will quickly fail with
+  // `Status::Incomplete`. The value indicates how many threads have paused
+  // manual compactions. It is accessed in read mode outside the DB mutex in
+  // compaction code paths.
+  std::atomic<int> manual_compaction_paused_;
+
+  // This condition variable is signaled on these conditions:
+  // * whenever bg_compaction_scheduled_ goes down to 0
+  // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
+  // made any progress
+  // * whenever a compaction made any progress
+  // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
+  // (i.e. whenever a flush is done, even if it didn't make any progress)
+  // * whenever there is an error in background purge, flush or compaction
+  // * whenever num_running_ingest_file_ goes to 0.
+  // * whenever pending_purge_obsolete_files_ goes to 0.
+  // * whenever disable_delete_obsolete_files_ goes to 0.
+  // * whenever SetOptions successfully updates options.
+  // * whenever a column family is dropped.
+  InstrumentedCondVar bg_cv_;
+  // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
+  // must be under either mutex_ or log_write_mutex_. Since after ::Open,
+  // logfile_number_ is currently updated only in write_thread_, it can be read
+  // from the same write_thread_ without any locks.
+  uint64_t logfile_number_;
+  // Log files that we can recycle. Must be protected by db mutex_.
+  std::deque<uint64_t> log_recycle_files_;
+  // Protected by log_write_mutex_.
+  bool log_dir_synced_;
+  // Without two_write_queues, read and writes to log_empty_ are protected by
+  // mutex_. Since it is currently updated/read only in write_thread_, it can be
+  // accessed from the same write_thread_ without any locks. With
+  // two_write_queues writes, where it can be updated in different threads,
+  // read and writes are protected by log_write_mutex_ instead. This is to avoid
+  // expensive mutex_ lock during WAL write, which update log_empty_.
+  bool log_empty_;
+
+  ColumnFamilyHandleImpl* persist_stats_cf_handle_;
+
+  bool persistent_stats_cfd_exists_ = true;
+
+  // alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
+  // as follows:
+  // 1. read by FindObsoleteFiles() which can be called in either application
+  //    thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
+  //    held.
+  // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
+  //    are held.
+  // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
+  //    (actually called by Open()), only mutex_ is held because at this point,
+  //    the DB::Open() call has not returned success to application, and the
+  //    only other thread(s) that can conflict are bg threads calling
+  //    FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
+  //    are held when accessing alive_log_files_.
+  // 4. read by DBImpl::Open() is protected by mutex_.
+  // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
+  //    held. This is done by the write group leader. Note that in the case of
+  //    two-write-queues, another WAL-only write thread can be writing to the
+  //    WAL concurrently. See 9.
+  // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
+  //    done by write group leader.
+  // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
+  //    two-write-queues. Only log_write_mutex_ is held to protect concurrent
+  //    pop_front() by FindObsoleteFiles().
+  // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
+  //    is held to protect the data structure from concurrent pop_front() by
+  //    FindObsoleteFiles().
+  // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
+  //    of two-write-queues. Only log_write_mutex_ is held. This suffices to
+  //    protect the data structure from concurrent push_back() by current
+  //    write group leader as well as pop_front() by FindObsoleteFiles().
+  std::deque<LogFileNumberSize> alive_log_files_;
+
+  // Log files that aren't fully synced, and the current log file.
+  // Synchronization:
+  // 1. read by FindObsoleteFiles() which can be called either in application
+  //    thread or RocksDB bg threads. log_write_mutex_ is always held, while
+  //    some reads are performed without mutex_.
+  // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
+  // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
+  // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex.
+  //    Note that at this point, DB::Open() has not returned success to
+  //    application, thus the only other thread(s) that can conflict are bg
+  //    threads calling FindObsoleteFiles(). See 1.
+  // 5. iteration and clear() from CloseHelper() always hold log_write_mutex
+  //    and mutex_.
+  // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
+  //    log_write_mutex_. These two can be called by application threads after
+  //    DB::Open() returns success to applications.
+  // 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
+  // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
+  //    log_write_mutex_.
+  // 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
+  // 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
+  //     happen in bg flush threads after DB::Open() returns success to
+  //     applications.
+  // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
+  //     holds only the log_write_mutex_. This is done by the write group
+  //     leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
+  //     can happen concurrently. This is fine because log_write_mutex_ is used
+  //     by all parties. See 2, 5, 9.
+  // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
+  //     log_write_mutex_. This happens in the write group leader.
+  // 13. emplace_back() by SwitchMemtable() hold both mutex_ and
+  //     log_write_mutex_. This happens in the write group leader. Can conflict
+  //     with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
+  //     SyncClosedLogs(), etc. as well as application threads calling
+  //     FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
+  //     require at least log_write_mutex_.
+  // 14. iteration called in WriteToWAL(write_group) protected by
+  //     log_write_mutex_. This is done by write group leader when
+  //     two-write-queues is disabled and write needs to sync logs.
+  // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
+  //     This can be done by the write group leader if two-write-queues is
+  //     enabled. It can also be done by another WAL-only write thread.
+  //
+  // Other observations:
+  //  - back() and items with getting_synced=true are not popped,
+  //  - The same thread that sets getting_synced=true will reset it.
+  //  - it follows that the object referred by back() can be safely read from
+  //  the write_thread_ without using mutex. Note that calling back() without
+  //  mutex may be unsafe because different implementations of deque::back() may
+  //  access other member variables of deque, causing undefined behaviors.
+  //  Generally, do not access stl containers without proper synchronization.
+  //  - it follows that the items with getting_synced=true can be safely read
+  //  from the same thread that has set getting_synced=true
+  std::deque<LogWriterNumber> logs_;
+
+  // Signaled when getting_synced becomes false for some of the logs_.
+  InstrumentedCondVar log_sync_cv_;
+  // This is the app-level state that is written to the WAL but will be used
+  // only during recovery. Using this feature enables not writing the state to
+  // memtable on normal writes and hence improving the throughput. Each new
+  // write of the state will replace the previous state entirely even if the
+  // keys in the two consecutive states do not overlap.
+  // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+  // Otherwise only the heaad of write_thread_ can access it.
+  WriteBatch cached_recoverable_state_;
+  std::atomic<bool> cached_recoverable_state_empty_ = {true};
+  std::atomic<uint64_t> total_log_size_;
+
+  // If this is non-empty, we need to delete these log files in background
+  // threads. Protected by log_write_mutex_.
+  autovector<log::Writer*> logs_to_free_;
+
+  bool is_snapshot_supported_;
+
+  std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
+
+  std::map<std::string, uint64_t> stats_slice_;
+
+  bool stats_slice_initialized_ = false;
+
+  Directories directories_;
+
+  WriteBufferManager* write_buffer_manager_;
+
+  WriteThread write_thread_;
+  WriteBatch tmp_batch_;
+  // The write thread when the writers have no memtable write. This will be used
+  // in 2PC to batch the prepares separately from the serial commit.
+  WriteThread nonmem_write_thread_;
+
+  WriteController write_controller_;
+
+  // Size of the last batch group. In slowdown mode, next write needs to
+  // sleep if it uses up the quota.
+  // Note: This is to protect memtable and compaction. If the batch only writes
+  // to the WAL its size need not to be included in this.
+  uint64_t last_batch_group_size_;
+
+  FlushScheduler flush_scheduler_;
+
+  TrimHistoryScheduler trim_history_scheduler_;
+
+  SnapshotList snapshots_;
+
+  TimestampedSnapshotList timestamped_snapshots_;
+
+  // For each background job, pending_outputs_ keeps the current file number at
+  // the time that background job started.
+  // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+  // number bigger than any of the file number in pending_outputs_. Since file
+  // numbers grow monotonically, this also means that pending_outputs_ is always
+  // sorted. After a background job is done executing, its file number is
+  // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
+  // it up.
+  // State is protected with db mutex.
+  std::list<uint64_t> pending_outputs_;
+
+  // flush_queue_ and compaction_queue_ hold column families that we need to
+  // flush and compact, respectively.
+  // A column family is inserted into flush_queue_ when it satisfies condition
+  // cfd->imm()->IsFlushPending()
+  // A column family is inserted into compaction_queue_ when it satisfied
+  // condition cfd->NeedsCompaction()
+  // Column families in this list are all Ref()-erenced
+  // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+  // do RAII on ColumnFamilyData
+  // Column families are in this queue when they need to be flushed or
+  // compacted. Consumers of these queues are flush and compaction threads. When
+  // column family is put on this queue, we increase unscheduled_flushes_ and
+  // unscheduled_compactions_. When these variables are bigger than zero, that
+  // means we need to schedule background threads for flush and compaction.
+  // Once the background threads are scheduled, we decrease unscheduled_flushes_
+  // and unscheduled_compactions_. That way we keep track of number of
+  // compaction and flush threads we need to schedule. This scheduling is done
+  // in MaybeScheduleFlushOrCompaction()
+  // invariant(column family present in flush_queue_ <==>
+  // ColumnFamilyData::pending_flush_ == true)
+  std::deque<FlushRequest> flush_queue_;
+  // invariant(column family present in compaction_queue_ <==>
+  // ColumnFamilyData::pending_compaction_ == true)
+  std::deque<ColumnFamilyData*> compaction_queue_;
+
+  // A map to store file numbers and filenames of the files to be purged
+  std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
+
+  // A vector to store the file numbers that have been assigned to certain
+  // JobContext. Current implementation tracks table and blob files only.
+  std::unordered_set<uint64_t> files_grabbed_for_purge_;
+
+  // A queue to store log writers to close. Protected by db mutex_.
+  std::deque<log::Writer*> logs_to_free_queue_;
+
+  std::deque<SuperVersion*> superversions_to_free_queue_;
+
+  int unscheduled_flushes_;
+
+  int unscheduled_compactions_;
+
+  // count how many background compactions are running or have been scheduled in
+  // the BOTTOM pool
+  int bg_bottom_compaction_scheduled_;
+
+  // count how many background compactions are running or have been scheduled
+  int bg_compaction_scheduled_;
+
+  // stores the number of compactions are currently running
+  int num_running_compactions_;
+
+  // number of background memtable flush jobs, submitted to the HIGH pool
+  int bg_flush_scheduled_;
+
+  // stores the number of flushes are currently running
+  int num_running_flushes_;
+
+  // number of background obsolete file purge jobs, submitted to the HIGH pool
+  int bg_purge_scheduled_;
+
+  std::deque<ManualCompactionState*> manual_compaction_dequeue_;
+
+  // shall we disable deletion of obsolete files
+  // if 0 the deletion is enabled.
+  // if non-zero, files will not be getting deleted
+  // This enables two different threads to call
+  // EnableFileDeletions() and DisableFileDeletions()
+  // without any synchronization
+  int disable_delete_obsolete_files_;
+
+  // Number of times FindObsoleteFiles has found deletable files and the
+  // corresponding call to PurgeObsoleteFiles has not yet finished.
+  int pending_purge_obsolete_files_;
+
+  // last time when DeleteObsoleteFiles with full scan was executed. Originally
+  // initialized with startup time.
+  uint64_t delete_obsolete_files_last_run_;
+
+  // last time stats were dumped to LOG
+  std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+  // The thread that wants to switch memtable, can wait on this cv until the
+  // pending writes to memtable finishes.
+  std::condition_variable switch_cv_;
+  // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+  std::mutex switch_mutex_;
+  // Number of threads intending to write to memtable
+  std::atomic<size_t> pending_memtable_writes_ = {};
+
+  // A flag indicating whether the current rocksdb database has any
+  // data that is not yet persisted into either WAL or SST file.
+  // Used when disableWAL is true.
+  std::atomic<bool> has_unpersisted_data_;
+
+  // if an attempt was made to flush all column families that
+  // the oldest log depends on but uncommitted data in the oldest
+  // log prevents the log from being released.
+  // We must attempt to free the dependent memtables again
+  // at a later time after the transaction in the oldest
+  // log is fully commited.
+  bool unable_to_release_oldest_log_;
+
+  // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+  // calls.
+  // REQUIRES: mutex held
+  int num_running_ingest_file_;
+
+#ifndef ROCKSDB_LITE
+  WalManager wal_manager_;
+#endif  // ROCKSDB_LITE
+
+  // A value of > 0 temporarily disables scheduling of background work
+  int bg_work_paused_;
+
+  // A value of > 0 temporarily disables scheduling of background compaction
+  int bg_compaction_paused_;
+
+  // Guard against multiple concurrent refitting
+  bool refitting_level_;
+
+  // Indicate DB was opened successfully
+  bool opened_successfully_;
+
+  // The min threshold to triggere bottommost compaction for removing
+  // garbages, among all column families.
+  SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+  LogsWithPrepTracker logs_with_prep_tracker_;
+
+  // Callback for compaction to check if a key is visible to a snapshot.
+  // REQUIRES: mutex held
+  std::unique_ptr<SnapshotChecker> snapshot_checker_;
+
+  // Callback for when the cached_recoverable_state_ is written to memtable
+  // Only to be set during initialization
+  std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
+
+#ifndef ROCKSDB_LITE
+  // Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog().
+  // Currently, internally it has a global timer instance for running the tasks.
+  PeriodicTaskScheduler periodic_task_scheduler_;
+
+  // It contains the implementations for each periodic task.
+  std::map<PeriodicTaskType, const PeriodicTaskFunc> periodic_task_functions_;
+#endif
+
+  // When set, we use a separate queue for writes that don't write to memtable.
+  // In 2PC these are the writes at Prepare phase.
+  const bool two_write_queues_;
+  const bool manual_wal_flush_;
+
+  // LastSequence also indicates last published sequence visibile to the
+  // readers. Otherwise LastPublishedSequence should be used.
+  const bool last_seq_same_as_publish_seq_;
+  // It indicates that a customized gc algorithm must be used for
+  // flush/compaction and if it is not provided vis SnapshotChecker, we should
+  // disable gc to be safe.
+  const bool use_custom_gc_;
+  // Flag to indicate that the DB instance shutdown has been initiated. This
+  // different from shutting_down_ atomic in that it is set at the beginning
+  // of shutdown sequence, specifically in order to prevent any background
+  // error recovery from going on in parallel. The latter, shutting_down_,
+  // is set a little later during the shutdown after scheduling memtable
+  // flushes
+  std::atomic<bool> shutdown_initiated_;
+  // Flag to indicate whether sst_file_manager object was allocated in
+  // DB::Open() or passed to us
+  bool own_sfm_;
+
+  // Flag to check whether Close() has been called on this DB
+  bool closed_;
+  // save the closing status, for re-calling the close()
+  Status closing_status_;
+  // mutex for DB::Close()
+  InstrumentedMutex closing_mutex_;
+
+  // Conditional variable to coordinate installation of atomic flush results.
+  // With atomic flush, each bg thread installs the result of flushing multiple
+  // column families, and different threads can flush different column
+  // families. It's difficult to rely on one thread to perform batch
+  // installation for all threads. This is different from the non-atomic flush
+  // case.
+  // atomic_flush_install_cv_ makes sure that threads install atomic flush
+  // results sequentially. Flush results of memtables with lower IDs get
+  // installed to MANIFEST first.
+  InstrumentedCondVar atomic_flush_install_cv_;
+
+  bool wal_in_db_path_;
+  std::atomic<uint64_t> max_total_wal_size_;
+
+  BlobFileCompletionCallback blob_callback_;
+
+  // Pointer to WriteBufferManager stalling interface.
+  std::unique_ptr<StallInterface> wbm_stall_;
+
+  // seqno_time_mapping_ stores the sequence number to time mapping, it's not
+  // thread safe, both read and write need db mutex hold.
+  SeqnoToTimeMapping seqno_time_mapping_;
+};
+
+class GetWithTimestampReadCallback : public ReadCallback {
+ public:
+  explicit GetWithTimestampReadCallback(SequenceNumber seq)
+      : ReadCallback(seq) {}
+  bool IsVisibleFullCheck(SequenceNumber seq) override {
+    return seq <= max_visible_seq_;
+  }
+};
+
+extern Options SanitizeOptions(const std::string& db, const Options& src,
+                               bool read_only = false,
+                               Status* logger_creation_s = nullptr);
+
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
+                                 bool read_only = false,
+                                 Status* logger_creation_s = nullptr);
+
+extern CompressionType GetCompressionFlush(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options);
+
+// Return the earliest log file to keep after the memtable flush is
+// finalized.
+// `cfd_to_flush` is the column family whose memtable (specified in
+// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
+// file.
+// The function is only applicable to 2pc mode.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list,
+    const autovector<MemTable*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker);
+
+// In non-2PC mode, WALs with log number < the returned number can be
+// deleted after the cfd_to_flush column family is flushed successfully.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists);
+
+// `cfd_to_flush` is the column family whose memtable will be flushed and thus
+// will not depend on any WAL file. nullptr means no memtable is being flushed.
+// The function is only applicable to 2pc mode.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
+// For atomic flush.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush);
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+
+inline Status DBImpl::FailIfCfHasTs(
+    const ColumnFamilyHandle* column_family) const {
+  column_family = column_family ? column_family : DefaultColumnFamily();
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  if (ucmp->timestamp_size() > 0) {
+    std::ostringstream oss;
+    oss << "cannot call this method on column family "
+        << column_family->GetName() << " that enables timestamp";
+    return Status::InvalidArgument(oss.str());
+  }
+  return Status::OK();
+}
+
+inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
+                                         const Slice& ts,
+                                         bool ts_for_read) const {
+  if (!column_family) {
+    return Status::InvalidArgument("column family handle cannot be null");
+  }
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  if (0 == ucmp->timestamp_size()) {
+    std::stringstream oss;
+    oss << "cannot call this method on column family "
+        << column_family->GetName() << " that does not enable timestamp";
+    return Status::InvalidArgument(oss.str());
+  }
+  const size_t ts_sz = ts.size();
+  if (ts_sz != ucmp->timestamp_size()) {
+    std::stringstream oss;
+    oss << "Timestamp sizes mismatch: expect " << ucmp->timestamp_size() << ", "
+        << ts_sz << " given";
+    return Status::InvalidArgument(oss.str());
+  }
+  if (ts_for_read) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    auto cfd = cfh->cfd();
+    std::string current_ts_low = cfd->GetFullHistoryTsLow();
+    if (!current_ts_low.empty() &&
+        ucmp->CompareTimestamp(ts, current_ts_low) < 0) {
+      std::stringstream oss;
+      oss << "Read timestamp: " << ts.ToString(true)
+          << " is smaller than full_history_ts_low: "
+          << Slice(current_ts_low).ToString(true) << std::endl;
+      return Status::InvalidArgument(oss.str());
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
new file mode 100644
index 000000000..a605fac87
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
@@ -0,0 +1,3857 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+#include <deque>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DBImpl::EnoughRoomForCompaction(
+    ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+    bool* sfm_reserved_compact_space, LogBuffer* log_buffer) {
+  // Check if we have enough room to do the compaction
+  bool enough_room = true;
+#ifndef ROCKSDB_LITE
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      immutable_db_options_.sst_file_manager.get());
+  if (sfm) {
+    // Pass the current bg_error_ to SFM so it can decide what checks to
+    // perform. If this DB instance hasn't seen any error yet, the SFM can be
+    // optimistic and not do disk space checks
+    Status bg_error = error_handler_.GetBGError();
+    enough_room = sfm->EnoughRoomForCompaction(cfd, inputs, bg_error);
+    bg_error.PermitUncheckedError();  // bg_error is just a copy of the Status
+                                      // from the error_handler_
+    if (enough_room) {
+      *sfm_reserved_compact_space = true;
+    }
+  }
+#else
+  (void)cfd;
+  (void)inputs;
+  (void)sfm_reserved_compact_space;
+#endif  // ROCKSDB_LITE
+  if (!enough_room) {
+    // Just in case tests want to change the value of enough_room
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room);
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "Cancelled compaction because not enough room");
+    RecordTick(stats_, COMPACTION_CANCELLED, 1);
+  }
+  return enough_room;
+}
+
+bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+                                    std::unique_ptr<TaskLimiterToken>* token,
+                                    LogBuffer* log_buffer) {
+  assert(*token == nullptr);
+  auto limiter = static_cast<ConcurrentTaskLimiterImpl*>(
+      cfd->ioptions()->compaction_thread_limiter.get());
+  if (limiter == nullptr) {
+    return true;
+  }
+  *token = limiter->GetToken(force);
+  if (*token != nullptr) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "Thread limiter [%s] increase [%s] compaction task, "
+                     "force: %s, tasks after: %d",
+                     limiter->GetName().c_str(), cfd->GetName().c_str(),
+                     force ? "true" : "false", limiter->GetOutstandingTask());
+    return true;
+  }
+  return false;
+}
+
+IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
+                                VersionEdit* synced_wals) {
+  TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
+  InstrumentedMutexLock l(&log_write_mutex_);
+  autovector<log::Writer*, 1> logs_to_sync;
+  uint64_t current_log_number = logfile_number_;
+  while (logs_.front().number < current_log_number &&
+         logs_.front().IsSyncing()) {
+    log_sync_cv_.Wait();
+  }
+  for (auto it = logs_.begin();
+       it != logs_.end() && it->number < current_log_number; ++it) {
+    auto& log = *it;
+    log.PrepareForSync();
+    logs_to_sync.push_back(log.writer);
+  }
+
+  IOStatus io_s;
+  if (!logs_to_sync.empty()) {
+    log_write_mutex_.Unlock();
+
+    assert(job_context);
+
+    for (log::Writer* log : logs_to_sync) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
+                     log->get_log_number());
+      if (error_handler_.IsRecoveryInProgress()) {
+        log->file()->reset_seen_error();
+      }
+      io_s = log->file()->Sync(immutable_db_options_.use_fsync);
+      if (!io_s.ok()) {
+        break;
+      }
+
+      if (immutable_db_options_.recycle_log_file_num > 0) {
+        if (error_handler_.IsRecoveryInProgress()) {
+          log->file()->reset_seen_error();
+        }
+        io_s = log->Close();
+        if (!io_s.ok()) {
+          break;
+        }
+      }
+    }
+    if (io_s.ok()) {
+      io_s = directories_.GetWalDir()->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+
+    TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock",
+                             /*arg=*/nullptr);
+    log_write_mutex_.Lock();
+
+    // "number <= current_log_number - 1" is equivalent to
+    // "number < current_log_number".
+    if (io_s.ok()) {
+      MarkLogsSynced(current_log_number - 1, true, synced_wals);
+    } else {
+      MarkLogsNotSynced(current_log_number - 1);
+    }
+    if (!io_s.ok()) {
+      TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
+      return io_s;
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end");
+  return io_s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    bool* made_progress, JobContext* job_context,
+    SuperVersionContext* superversion_context,
+    std::vector<SequenceNumber>& snapshot_seqs,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+    Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+  assert(cfd);
+  assert(cfd->imm());
+  assert(cfd->imm()->NumNotFlushed() != 0);
+  assert(cfd->imm()->IsFlushPending());
+  assert(versions_);
+  assert(versions_->GetColumnFamilySet());
+  // If there are more than one column families, we need to make sure that
+  // all the log files except the most recent one are synced. Otherwise if
+  // the host crashes after flushing and before WAL is persistent, the
+  // flushed SST may contain data from write batches whose updates to
+  // other (unflushed) column families are missing.
+  const bool needs_to_sync_closed_wals =
+      logfile_number_ > 0 &&
+      versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1;
+
+  // If needs_to_sync_closed_wals is true, we need to record the current
+  // maximum memtable ID of this column family so that a later PickMemtables()
+  // call will not pick memtables whose IDs are higher. This is due to the fact
+  // that SyncClosedLogs() may release the db mutex, and memtable switch can
+  // happen for this column family in the meantime. The newly created memtables
+  // have their data backed by unsynced WALs, thus they cannot be included in
+  // this flush job.
+  // Another reason why we must record the current maximum memtable ID of this
+  // column family: SyncClosedLogs() may release db mutex, thus it's possible
+  // for application to continue to insert into memtables increasing db's
+  // sequence number. The application may take a snapshot, but this snapshot is
+  // not included in `snapshot_seqs` which will be passed to flush job because
+  // `snapshot_seqs` has already been computed before this function starts.
+  // Recording the max memtable ID ensures that the flush job does not flush
+  // a memtable without knowing such snapshot(s).
+  uint64_t max_memtable_id = needs_to_sync_closed_wals
+                                 ? cfd->imm()->GetLatestMemTableID()
+                                 : std::numeric_limits<uint64_t>::max();
+
+  // If needs_to_sync_closed_wals is false, then the flush job will pick ALL
+  // existing memtables of the column family when PickMemTable() is called
+  // later. Although we won't call SyncClosedLogs() in this case, we may still
+  // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also
+  // releases and re-acquires the db mutex. In the meantime, the application
+  // can still insert into the memtables and increase the db's sequence number.
+  // The application can take a snapshot, hoping that the latest visible state
+  // to this snapshto is preserved. This is hard to guarantee since db mutex
+  // not held. This newly-created snapshot is not included in `snapshot_seqs`
+  // and the flush job is unaware of its presence. Consequently, the flush job
+  // may drop certain keys when generating the L0, causing incorrect data to be
+  // returned for snapshot read using this snapshot.
+  // To address this, we make sure NotifyOnFlushBegin() executes after memtable
+  // picking so that no new snapshot can be taken between the two functions.
+
+  FlushJob flush_job(
+      dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
+      file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
+      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+      job_context, log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U),
+      GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+      &event_logger_, mutable_cf_options.report_bg_io_stats,
+      true /* sync_output_directory */, true /* write_manifest */, thread_pri,
+      io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+      cfd->GetFullHistoryTsLow(), &blob_callback_);
+  FileMetaData file_meta;
+
+  Status s;
+  bool need_cancel = false;
+  IOStatus log_io_s = IOStatus::OK();
+  if (needs_to_sync_closed_wals) {
+    // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
+    // times.
+    VersionEdit synced_wals;
+    mutex_.Unlock();
+    log_io_s = SyncClosedLogs(job_context, &synced_wals);
+    mutex_.Lock();
+    if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+      log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
+      TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1",
+                               nullptr);
+    }
+
+    if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+        !log_io_s.IsColumnFamilyDropped()) {
+      error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+    }
+  } else {
+    TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
+  }
+  s = log_io_s;
+
+  // If the log sync failed, we do not need to pick memtable. Otherwise,
+  // num_flush_not_started_ needs to be rollback.
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
+  if (s.ok()) {
+    flush_job.PickMemTable();
+    need_cancel = true;
+  }
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job);
+
+#ifndef ROCKSDB_LITE
+  // may temporarily unlock and lock the mutex.
+  NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+#endif  // ROCKSDB_LITE
+
+  bool switched_to_mempurge = false;
+  // Within flush_job.Run, rocksdb may call event listener to notify
+  // file creation and deletion.
+  //
+  // Note that flush_job.Run will unlock and lock the db_mutex,
+  // and EventListener callback will be called when the db_mutex
+  // is unlocked by the current thread.
+  if (s.ok()) {
+    s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
+                      &switched_to_mempurge);
+    need_cancel = false;
+  }
+
+  if (!s.ok() && need_cancel) {
+    flush_job.Cancel();
+  }
+
+  if (s.ok()) {
+    InstallSuperVersionAndScheduleWork(cfd, superversion_context,
+                                       mutable_cf_options);
+    if (made_progress) {
+      *made_progress = true;
+    }
+
+    const std::string& column_family_name = cfd->GetName();
+
+    Version* const current = cfd->current();
+    assert(current);
+
+    const VersionStorageInfo* const storage_info = current->storage_info();
+    assert(storage_info);
+
+    VersionStorageInfo::LevelSummaryStorage tmp;
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+                     column_family_name.c_str(),
+                     storage_info->LevelSummary(&tmp));
+
+    const auto& blob_files = storage_info->GetBlobFiles();
+    if (!blob_files.empty()) {
+      assert(blob_files.front());
+      assert(blob_files.back());
+
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+          column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+          blob_files.back()->GetBlobFileNumber());
+    }
+  }
+
+  if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+    if (log_io_s.ok()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try write to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      if (!versions_->io_status().ok()) {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the Manifest write will be map to soft error.
+        // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is
+        // needed.
+        error_handler_.SetBGError(s,
+                                  BackgroundErrorReason::kManifestWriteNoWAL);
+      } else {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the other SST file write errors will be set as
+        // kFlushNoWAL.
+        error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL);
+      }
+    } else {
+      assert(s == log_io_s);
+      Status new_bg_error = s;
+      error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+    }
+  }
+  // If flush ran smoothly and no mempurge happened
+  // install new SST file path.
+  if (s.ok() && (!switched_to_mempurge)) {
+#ifndef ROCKSDB_LITE
+    // may temporarily unlock and lock the mutex.
+    NotifyOnFlushCompleted(cfd, mutable_cf_options,
+                           flush_job.GetCommittedFlushJobsInfo());
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    if (sfm) {
+      // Notify sst_file_manager that a new file was added
+      std::string file_path = MakeTableFileName(
+          cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+      // TODO (PR7798).  We should only add the file to the FileManager if it
+      // exists. Otherwise, some tests may fail.  Ignore the error in the
+      // interim.
+      sfm->OnAddFile(file_path).PermitUncheckedError();
+      if (sfm->IsMaxAllowedSpaceReached()) {
+        Status new_bg_error =
+            Status::SpaceLimit("Max allowed space was reached");
+        TEST_SYNC_POINT_CALLBACK(
+            "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+            &new_bg_error);
+        error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+      }
+    }
+#endif  // ROCKSDB_LITE
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+  return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  if (immutable_db_options_.atomic_flush) {
+    return AtomicFlushMemTablesToOutputFiles(
+        bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+  }
+  assert(bg_flush_args.size() == 1);
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+  const auto& bg_flush_arg = bg_flush_args[0];
+  ColumnFamilyData* cfd = bg_flush_arg.cfd_;
+  // intentional infrequent copy for each flush
+  MutableCFOptions mutable_cf_options_copy = *cfd->GetLatestMutableCFOptions();
+  SuperVersionContext* superversion_context =
+      bg_flush_arg.superversion_context_;
+  Status s = FlushMemTableToOutputFile(
+      cfd, mutable_cf_options_copy, made_progress, job_context,
+      superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+      snapshot_checker, log_buffer, thread_pri);
+  return s;
+}
+
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
+ */
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+    const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+    JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+
+  autovector<ColumnFamilyData*> cfds;
+  for (const auto& arg : bg_flush_args) {
+    cfds.emplace_back(arg.cfd_);
+  }
+
+#ifndef NDEBUG
+  for (const auto cfd : cfds) {
+    assert(cfd->imm()->NumNotFlushed() != 0);
+    assert(cfd->imm()->IsFlushPending());
+    assert(cfd->GetFlushReason() == cfds[0]->GetFlushReason());
+  }
+#endif /* !NDEBUG */
+
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+
+  autovector<FSDirectory*> distinct_output_dirs;
+  autovector<std::string> distinct_output_dir_paths;
+  std::vector<std::unique_ptr<FlushJob>> jobs;
+  std::vector<MutableCFOptions> all_mutable_cf_options;
+  int num_cfs = static_cast<int>(cfds.size());
+  all_mutable_cf_options.reserve(num_cfs);
+  for (int i = 0; i < num_cfs; ++i) {
+    auto cfd = cfds[i];
+    FSDirectory* data_dir = GetDataDir(cfd, 0U);
+    const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+    // Add to distinct output directories if eligible. Use linear search. Since
+    // the number of elements in the vector is not large, performance should be
+    // tolerable.
+    bool found = false;
+    for (const auto& path : distinct_output_dir_paths) {
+      if (path == curr_path) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      distinct_output_dir_paths.emplace_back(curr_path);
+      distinct_output_dirs.emplace_back(data_dir);
+    }
+
+    all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+    uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_;
+    jobs.emplace_back(new FlushJob(
+        dbname_, cfd, immutable_db_options_, mutable_cf_options,
+        max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
+        &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+        snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+        data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+        stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
+        false /* sync_output_directory */, false /* write_manifest */,
+        thread_pri, io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+        cfd->GetFullHistoryTsLow(), &blob_callback_));
+  }
+
+  std::vector<FileMetaData> file_meta(num_cfs);
+  // Use of deque<bool> because vector<bool>
+  // is specific and doesn't allow &v[i].
+  std::deque<bool> switched_to_mempurge(num_cfs, false);
+  Status s;
+  IOStatus log_io_s = IOStatus::OK();
+  assert(num_cfs == static_cast<int>(jobs.size()));
+
+#ifndef ROCKSDB_LITE
+  for (int i = 0; i != num_cfs; ++i) {
+    const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+    // may temporarily unlock and lock the mutex.
+    NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+                       job_context->job_id);
+  }
+#endif /* !ROCKSDB_LITE */
+
+  if (logfile_number_ > 0) {
+    // TODO (yanqin) investigate whether we should sync the closed logs for
+    // single column family case.
+    VersionEdit synced_wals;
+    mutex_.Unlock();
+    log_io_s = SyncClosedLogs(job_context, &synced_wals);
+    mutex_.Lock();
+    if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+      log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
+    }
+
+    if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+        !log_io_s.IsColumnFamilyDropped()) {
+      if (total_log_size_ > 0) {
+        error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+      } else {
+        // If the WAL is empty, we use different error reason
+        error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlushNoWAL);
+      }
+    }
+  }
+  s = log_io_s;
+
+  // exec_status stores the execution status of flush_jobs as
+  // <bool /* executed */, Status /* status code */>
+  autovector<std::pair<bool, Status>> exec_status;
+  std::vector<bool> pick_status;
+  for (int i = 0; i != num_cfs; ++i) {
+    // Initially all jobs are not executed, with status OK.
+    exec_status.emplace_back(false, Status::OK());
+    pick_status.push_back(false);
+  }
+
+  if (s.ok()) {
+    for (int i = 0; i != num_cfs; ++i) {
+      jobs[i]->PickMemTable();
+      pick_status[i] = true;
+    }
+  }
+
+  if (s.ok()) {
+    assert(switched_to_mempurge.size() ==
+           static_cast<long unsigned int>(num_cfs));
+    // TODO (yanqin): parallelize jobs with threads.
+    for (int i = 1; i != num_cfs; ++i) {
+      exec_status[i].second =
+          jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i],
+                       &(switched_to_mempurge.at(i)));
+      exec_status[i].first = true;
+    }
+    if (num_cfs > 1) {
+      TEST_SYNC_POINT(
+          "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1");
+      TEST_SYNC_POINT(
+          "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2");
+    }
+    assert(exec_status.size() > 0);
+    assert(!file_meta.empty());
+    exec_status[0].second = jobs[0]->Run(
+        &logs_with_prep_tracker_, file_meta.data() /* &file_meta[0] */,
+        switched_to_mempurge.empty() ? nullptr : &(switched_to_mempurge.at(0)));
+    exec_status[0].first = true;
+
+    Status error_status;
+    for (const auto& e : exec_status) {
+      if (!e.second.ok()) {
+        s = e.second;
+        if (!e.second.IsShutdownInProgress() &&
+            !e.second.IsColumnFamilyDropped()) {
+          // If a flush job did not return OK, and the CF is not dropped, and
+          // the DB is not shutting down, then we have to return this result to
+          // caller later.
+          error_status = e.second;
+        }
+      }
+    }
+
+    s = error_status.ok() ? s : error_status;
+  }
+
+  if (s.IsColumnFamilyDropped()) {
+    s = Status::OK();
+  }
+
+  if (s.ok() || s.IsShutdownInProgress()) {
+    // Sync on all distinct output directories.
+    for (auto dir : distinct_output_dirs) {
+      if (dir != nullptr) {
+        Status error_status = dir->FsyncWithDirOptions(
+            IOOptions(), nullptr,
+            DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+        if (!error_status.ok()) {
+          s = error_status;
+          break;
+        }
+      }
+    }
+  } else {
+    // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+    // it is not because of CF drop.
+    // Have to cancel the flush jobs that have NOT executed because we need to
+    // unref the versions.
+    for (int i = 0; i != num_cfs; ++i) {
+      if (pick_status[i] && !exec_status[i].first) {
+        jobs[i]->Cancel();
+      }
+    }
+    for (int i = 0; i != num_cfs; ++i) {
+      if (exec_status[i].second.ok() && exec_status[i].first) {
+        auto& mems = jobs[i]->GetMemTables();
+        cfds[i]->imm()->RollbackMemtableFlush(mems,
+                                              file_meta[i].fd.GetNumber());
+      }
+    }
+  }
+
+  if (s.ok()) {
+    const auto wait_to_install_func =
+        [&]() -> std::pair<Status, bool /*continue to wait*/> {
+      if (!versions_->io_status().ok()) {
+        // Something went wrong elsewhere, we cannot count on waiting for our
+        // turn to write/sync to MANIFEST or CURRENT. Just return.
+        return std::make_pair(versions_->io_status(), false);
+      } else if (shutting_down_.load(std::memory_order_acquire)) {
+        return std::make_pair(Status::ShutdownInProgress(), false);
+      }
+      bool ready = true;
+      for (size_t i = 0; i != cfds.size(); ++i) {
+        const auto& mems = jobs[i]->GetMemTables();
+        if (cfds[i]->IsDropped()) {
+          // If the column family is dropped, then do not wait.
+          continue;
+        } else if (!mems.empty() &&
+                   cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) {
+          // If a flush job needs to install the flush result for mems and
+          // mems[0] is not the earliest memtable, it means another thread must
+          // be installing flush results for the same column family, then the
+          // current thread needs to wait.
+          ready = false;
+          break;
+        } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <=
+                                       bg_flush_args[i].max_memtable_id_) {
+          // If a flush job does not need to install flush results, then it has
+          // to wait until all memtables up to max_memtable_id_ (inclusive) are
+          // installed.
+          ready = false;
+          break;
+        }
+      }
+      return std::make_pair(Status::OK(), !ready);
+    };
+
+    bool resuming_from_bg_err =
+        error_handler_.IsDBStopped() ||
+        (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery ||
+         cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush);
+    while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) {
+      std::pair<Status, bool> res = wait_to_install_func();
+
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", &res);
+
+      if (!res.first.ok()) {
+        s = res.first;
+        break;
+      } else if (!res.second) {
+        break;
+      }
+      atomic_flush_install_cv_.Wait();
+
+      resuming_from_bg_err =
+          error_handler_.IsDBStopped() ||
+          (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery ||
+           cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush);
+    }
+
+    if (!resuming_from_bg_err) {
+      // If not resuming from bg err, then we determine future action based on
+      // whether we hit background error.
+      if (s.ok()) {
+        s = error_handler_.GetBGError();
+      }
+    } else if (s.ok()) {
+      // If resuming from bg err, we still rely on wait_to_install_func()'s
+      // result to determine future action. If wait_to_install_func() returns
+      // non-ok already, then we should not proceed to flush result
+      // installation.
+      s = error_handler_.GetRecoveryError();
+    }
+  }
+
+  if (s.ok()) {
+    autovector<ColumnFamilyData*> tmp_cfds;
+    autovector<const autovector<MemTable*>*> mems_list;
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    autovector<FileMetaData*> tmp_file_meta;
+    autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+        committed_flush_jobs_info;
+    for (int i = 0; i != num_cfs; ++i) {
+      const auto& mems = jobs[i]->GetMemTables();
+      if (!cfds[i]->IsDropped() && !mems.empty()) {
+        tmp_cfds.emplace_back(cfds[i]);
+        mems_list.emplace_back(&mems);
+        mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
+        tmp_file_meta.emplace_back(&file_meta[i]);
+#ifndef ROCKSDB_LITE
+        committed_flush_jobs_info.emplace_back(
+            jobs[i]->GetCommittedFlushJobsInfo());
+#endif  //! ROCKSDB_LITE
+      }
+    }
+
+    s = InstallMemtableAtomicFlushResults(
+        nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
+        versions_.get(), &logs_with_prep_tracker_, &mutex_, tmp_file_meta,
+        committed_flush_jobs_info, &job_context->memtables_to_free,
+        directories_.GetDbDir(), log_buffer);
+  }
+
+  if (s.ok()) {
+    assert(num_cfs ==
+           static_cast<int>(job_context->superversion_contexts.size()));
+    for (int i = 0; i != num_cfs; ++i) {
+      assert(cfds[i]);
+
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      InstallSuperVersionAndScheduleWork(cfds[i],
+                                         &job_context->superversion_contexts[i],
+                                         all_mutable_cf_options[i]);
+
+      const std::string& column_family_name = cfds[i]->GetName();
+
+      Version* const current = cfds[i]->current();
+      assert(current);
+
+      const VersionStorageInfo* const storage_info = current->storage_info();
+      assert(storage_info);
+
+      VersionStorageInfo::LevelSummaryStorage tmp;
+      ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+                       column_family_name.c_str(),
+                       storage_info->LevelSummary(&tmp));
+
+      const auto& blob_files = storage_info->GetBlobFiles();
+      if (!blob_files.empty()) {
+        assert(blob_files.front());
+        assert(blob_files.back());
+
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+            column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+            blob_files.back()->GetBlobFileNumber());
+      }
+    }
+    if (made_progress) {
+      *made_progress = true;
+    }
+#ifndef ROCKSDB_LITE
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    assert(all_mutable_cf_options.size() == static_cast<size_t>(num_cfs));
+    for (int i = 0; s.ok() && i != num_cfs; ++i) {
+      // If mempurge happened instead of Flush,
+      // no NotifyOnFlushCompleted call (no SST file created).
+      if (switched_to_mempurge[i]) {
+        continue;
+      }
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i],
+                             jobs[i]->GetCommittedFlushJobsInfo());
+      if (sfm) {
+        std::string file_path = MakeTableFileName(
+            cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber());
+        // TODO (PR7798).  We should only add the file to the FileManager if it
+        // exists. Otherwise, some tests may fail.  Ignore the error in the
+        // interim.
+        sfm->OnAddFile(file_path).PermitUncheckedError();
+        if (sfm->IsMaxAllowedSpaceReached() &&
+            error_handler_.GetBGError().ok()) {
+          Status new_bg_error =
+              Status::SpaceLimit("Max allowed space was reached");
+          error_handler_.SetBGError(new_bg_error,
+                                    BackgroundErrorReason::kFlush);
+        }
+      }
+    }
+#endif  // ROCKSDB_LITE
+  }
+
+  // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+  // it is not because of CF drop.
+  if (!s.ok() && !s.IsColumnFamilyDropped()) {
+    if (log_io_s.ok()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try write to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      if (!versions_->io_status().ok()) {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the Manifest write will be map to soft error.
+        // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor
+        // is needed.
+        error_handler_.SetBGError(s,
+                                  BackgroundErrorReason::kManifestWriteNoWAL);
+      } else {
+        // If WAL sync is successful (either WAL size is 0 or there is no IO
+        // error), all the other SST file write errors will be set as
+        // kFlushNoWAL.
+        error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL);
+      }
+    } else {
+      assert(s == log_io_s);
+      Status new_bg_error = s;
+      error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+    }
+  }
+
+  return s;
+}
+
+void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+                                const MutableCFOptions& mutable_cf_options,
+                                int job_id) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  bool triggered_writes_slowdown =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_slowdown_writes_trigger);
+  bool triggered_writes_stop =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_stop_writes_trigger);
+  // release lock while notifying events
+  mutex_.Unlock();
+  {
+    FlushJobInfo info{};
+    info.cf_id = cfd->GetID();
+    info.cf_name = cfd->GetName();
+    // TODO(yhchiang): make db_paths dynamic in case flush does not
+    //                 go to L0 in the future.
+    const uint64_t file_number = file_meta->fd.GetNumber();
+    info.file_path =
+        MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number);
+    info.file_number = file_number;
+    info.thread_id = env_->GetThreadID();
+    info.job_id = job_id;
+    info.triggered_writes_slowdown = triggered_writes_slowdown;
+    info.triggered_writes_stop = triggered_writes_stop;
+    info.smallest_seqno = file_meta->fd.smallest_seqno;
+    info.largest_seqno = file_meta->fd.largest_seqno;
+    info.flush_reason = cfd->GetFlushReason();
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnFlushBegin(this, info);
+    }
+  }
+  mutex_.Lock();
+// no need to signal bg_cv_ as it will be signaled at the end of the
+// flush process.
+#else
+  (void)cfd;
+  (void)file_meta;
+  (void)mutable_cf_options;
+  (void)job_id;
+#endif  // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnFlushCompleted(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info) {
+#ifndef ROCKSDB_LITE
+  assert(flush_jobs_info != nullptr);
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  bool triggered_writes_slowdown =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_slowdown_writes_trigger);
+  bool triggered_writes_stop =
+      (cfd->current()->storage_info()->NumLevelFiles(0) >=
+       mutable_cf_options.level0_stop_writes_trigger);
+  // release lock while notifying events
+  mutex_.Unlock();
+  {
+    for (auto& info : *flush_jobs_info) {
+      info->triggered_writes_slowdown = triggered_writes_slowdown;
+      info->triggered_writes_stop = triggered_writes_stop;
+      for (auto listener : immutable_db_options_.listeners) {
+        listener->OnFlushCompleted(this, *info);
+      }
+      TEST_SYNC_POINT(
+          "DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted");
+    }
+    flush_jobs_info->clear();
+  }
+  mutex_.Lock();
+  // no need to signal bg_cv_ as it will be signaled at the end of the
+  // flush process.
+#else
+  (void)cfd;
+  (void)mutable_cf_options;
+  (void)flush_jobs_info;
+#endif  // ROCKSDB_LITE
+}
+
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+                            ColumnFamilyHandle* column_family,
+                            const Slice* begin_without_ts,
+                            const Slice* end_without_ts) {
+  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (ts_sz == 0) {
+    return CompactRangeInternal(options, column_family, begin_without_ts,
+                                end_without_ts, "" /*trim_ts*/);
+  }
+
+  std::string begin_str;
+  std::string end_str;
+
+  // CompactRange compact all keys: [begin, end] inclusively. Add maximum
+  // timestamp to include all `begin` keys, and add minimal timestamp to include
+  // all `end` keys.
+  if (begin_without_ts != nullptr) {
+    AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz);
+  }
+  if (end_without_ts != nullptr) {
+    AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz);
+  }
+  Slice begin(begin_str);
+  Slice end(end_str);
+
+  Slice* begin_with_ts = begin_without_ts ? &begin : nullptr;
+  Slice* end_with_ts = end_without_ts ? &end : nullptr;
+
+  return CompactRangeInternal(options, column_family, begin_with_ts,
+                              end_with_ts, "" /*trim_ts*/);
+}
+
+Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                        std::string ts_low) {
+  ColumnFamilyData* cfd = nullptr;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    assert(cfh != nullptr);
+    cfd = cfh->cfd();
+  }
+  assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+  if (cfd->user_comparator()->timestamp_size() == 0) {
+    return Status::InvalidArgument(
+        "Timestamp is not enabled in this column family");
+  }
+  if (cfd->user_comparator()->timestamp_size() != ts_low.size()) {
+    return Status::InvalidArgument("ts_low size mismatch");
+  }
+  return IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+}
+
+Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
+                                            std::string ts_low) {
+  VersionEdit edit;
+  edit.SetColumnFamily(cfd->GetID());
+  edit.SetFullHistoryTsLow(ts_low);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
+                           &edit);
+
+  InstrumentedMutexLock l(&mutex_);
+  std::string current_ts_low = cfd->GetFullHistoryTsLow();
+  const Comparator* ucmp = cfd->user_comparator();
+  assert(ucmp->timestamp_size() == ts_low.size() && !ts_low.empty());
+  if (!current_ts_low.empty() &&
+      ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) {
+    return Status::InvalidArgument("Cannot decrease full_history_ts_low");
+  }
+
+  Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
+  if (!s.ok()) {
+    return s;
+  }
+  current_ts_low = cfd->GetFullHistoryTsLow();
+  if (!current_ts_low.empty() &&
+      ucmp->CompareTimestamp(current_ts_low, ts_low) > 0) {
+    std::stringstream oss;
+    oss << "full_history_ts_low: " << Slice(current_ts_low).ToString(true)
+        << " is set to be higher than the requested "
+           "timestamp: "
+        << Slice(ts_low).ToString(true) << std::endl;
+    return Status::TryAgain(oss.str());
+  }
+  return Status::OK();
+}
+
+Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
+                                    ColumnFamilyHandle* column_family,
+                                    const Slice* begin, const Slice* end,
+                                    const std::string& trim_ts) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+
+  if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) {
+    return Status::InvalidArgument("Invalid target path ID");
+  }
+
+  bool flush_needed = true;
+
+  // Update full_history_ts_low if it's set
+  if (options.full_history_ts_low != nullptr &&
+      !options.full_history_ts_low->empty()) {
+    std::string ts_low = options.full_history_ts_low->ToString();
+    if (begin != nullptr || end != nullptr) {
+      return Status::InvalidArgument(
+          "Cannot specify compaction range with full_history_ts_low");
+    }
+    Status s = IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+    if (!s.ok()) {
+      LogFlush(immutable_db_options_.info_log);
+      return s;
+    }
+  }
+
+  Status s;
+  if (begin != nullptr && end != nullptr) {
+    // TODO(ajkr): We could also optimize away the flush in certain cases where
+    // one/both sides of the interval are unbounded. But it requires more
+    // changes to RangesOverlapWithMemtables.
+    Range range(*begin, *end);
+    SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+    s = cfd->RangesOverlapWithMemtables(
+        {range}, super_version, immutable_db_options_.allow_data_in_errors,
+        &flush_needed);
+    CleanupSuperVersion(super_version);
+  }
+
+  if (s.ok() && flush_needed) {
+    FlushOptions fo;
+    fo.allow_write_stall = options.allow_write_stall;
+    if (immutable_db_options_.atomic_flush) {
+      autovector<ColumnFamilyData*> cfds;
+      mutex_.Lock();
+      SelectColumnFamiliesForAtomicFlush(&cfds);
+      mutex_.Unlock();
+      s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
+                               false /* entered_write_thread */);
+    } else {
+      s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction,
+                        false /* entered_write_thread */);
+    }
+    if (!s.ok()) {
+      LogFlush(immutable_db_options_.info_log);
+      return s;
+    }
+  }
+
+  constexpr int kInvalidLevel = -1;
+  int final_output_level = kInvalidLevel;
+  bool exclusive = options.exclusive_manual_compaction;
+  if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+      cfd->NumberLevels() > 1) {
+    // Always compact all files together.
+    final_output_level = cfd->NumberLevels() - 1;
+    // if bottom most level is reserved
+    if (immutable_db_options_.allow_ingest_behind) {
+      final_output_level--;
+    }
+    s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+                            final_output_level, options, begin, end, exclusive,
+                            false, std::numeric_limits<uint64_t>::max(),
+                            trim_ts);
+  } else {
+    int first_overlapped_level = kInvalidLevel;
+    int max_overlapped_level = kInvalidLevel;
+    {
+      SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+      Version* current_version = super_version->current;
+      ReadOptions ro;
+      ro.total_order_seek = true;
+      bool overlap;
+      for (int level = 0;
+           level < current_version->storage_info()->num_non_empty_levels();
+           level++) {
+        overlap = true;
+        if (begin != nullptr && end != nullptr) {
+          Status status = current_version->OverlapWithLevelIterator(
+              ro, file_options_, *begin, *end, level, &overlap);
+          if (!status.ok()) {
+            overlap = current_version->storage_info()->OverlapInLevel(
+                level, begin, end);
+          }
+        } else {
+          overlap = current_version->storage_info()->OverlapInLevel(level,
+                                                                    begin, end);
+        }
+        if (overlap) {
+          if (first_overlapped_level == kInvalidLevel) {
+            first_overlapped_level = level;
+          }
+          max_overlapped_level = level;
+        }
+      }
+      CleanupSuperVersion(super_version);
+    }
+    if (s.ok() && first_overlapped_level != kInvalidLevel) {
+      // max_file_num_to_ignore can be used to filter out newly created SST
+      // files, useful for bottom level compaction in a manual compaction
+      uint64_t max_file_num_to_ignore = std::numeric_limits<uint64_t>::max();
+      uint64_t next_file_number = versions_->current_next_file_number();
+      final_output_level = max_overlapped_level;
+      int output_level;
+      for (int level = first_overlapped_level; level <= max_overlapped_level;
+           level++) {
+        bool disallow_trivial_move = false;
+        // in case the compaction is universal or if we're compacting the
+        // bottom-most level, the output level will be the same as input one.
+        // level 0 can never be the bottommost level (i.e. if all files are in
+        // level 0, we will compact to level 1)
+        if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+            cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+          output_level = level;
+        } else if (level == max_overlapped_level && level > 0) {
+          if (options.bottommost_level_compaction ==
+              BottommostLevelCompaction::kSkip) {
+            // Skip bottommost level compaction
+            continue;
+          } else if (options.bottommost_level_compaction ==
+                         BottommostLevelCompaction::kIfHaveCompactionFilter &&
+                     cfd->ioptions()->compaction_filter == nullptr &&
+                     cfd->ioptions()->compaction_filter_factory == nullptr) {
+            // Skip bottommost level compaction since we don't have a compaction
+            // filter
+            continue;
+          }
+          output_level = level;
+          // update max_file_num_to_ignore only for bottom level compaction
+          // because data in newly compacted files in middle levels may still
+          // need to be pushed down
+          max_file_num_to_ignore = next_file_number;
+        } else {
+          output_level = level + 1;
+          if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
+              cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+              level == 0) {
+            output_level = ColumnFamilyData::kCompactToBaseLevel;
+          }
+          // if it's a BottommostLevel compaction and `kForce*` compaction is
+          // set, disallow trivial move
+          if (level == max_overlapped_level &&
+              (options.bottommost_level_compaction ==
+                   BottommostLevelCompaction::kForce ||
+               options.bottommost_level_compaction ==
+                   BottommostLevelCompaction::kForceOptimized)) {
+            disallow_trivial_move = true;
+          }
+        }
+        // trim_ts need real compaction to remove latest record
+        if (!trim_ts.empty()) {
+          disallow_trivial_move = true;
+        }
+        s = RunManualCompaction(cfd, level, output_level, options, begin, end,
+                                exclusive, disallow_trivial_move,
+                                max_file_num_to_ignore, trim_ts);
+        if (!s.ok()) {
+          break;
+        }
+        if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+          final_output_level = cfd->NumberLevels() - 1;
+        } else if (output_level > final_output_level) {
+          final_output_level = output_level;
+        }
+        TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+        TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
+      }
+    }
+  }
+  if (!s.ok() || final_output_level == kInvalidLevel) {
+    LogFlush(immutable_db_options_.info_log);
+    return s;
+  }
+
+  if (options.change_level) {
+    TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:1");
+    TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:2");
+
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[RefitLevel] waiting for background threads to stop");
+    DisableManualCompaction();
+    s = PauseBackgroundWork();
+    if (s.ok()) {
+      TEST_SYNC_POINT("DBImpl::CompactRange:PreRefitLevel");
+      s = ReFitLevel(cfd, final_output_level, options.target_level);
+      TEST_SYNC_POINT("DBImpl::CompactRange:PostRefitLevel");
+      // ContinueBackgroundWork always return Status::OK().
+      Status temp_s = ContinueBackgroundWork();
+      assert(temp_s.ok());
+    }
+    EnableManualCompaction();
+    TEST_SYNC_POINT(
+        "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled");
+  }
+  LogFlush(immutable_db_options_.info_log);
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // an automatic compaction that has been scheduled might have been
+    // preempted by the manual compactions. Need to schedule it back.
+    MaybeScheduleFlushOrCompaction();
+  }
+
+  return s;
+}
+
+Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
+                            ColumnFamilyHandle* column_family,
+                            const std::vector<std::string>& input_file_names,
+                            const int output_level, const int output_path_id,
+                            std::vector<std::string>* const output_file_names,
+                            CompactionJobInfo* compaction_job_info) {
+#ifdef ROCKSDB_LITE
+  (void)compact_options;
+  (void)column_family;
+  (void)input_file_names;
+  (void)output_level;
+  (void)output_path_id;
+  (void)output_file_names;
+  (void)compaction_job_info;
+  // not supported in lite version
+  return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+  if (column_family == nullptr) {
+    return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+  }
+
+  auto cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  assert(cfd);
+
+  Status s;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+
+  // Perform CompactFiles
+  TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
+  TEST_SYNC_POINT_CALLBACK(
+      "TestCompactFiles:PausingManualCompaction:3",
+      reinterpret_cast<void*>(
+          const_cast<std::atomic<int>*>(&manual_compaction_paused_)));
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    // This call will unlock/lock the mutex to wait for current running
+    // IngestExternalFile() calls to finish.
+    WaitForIngestFile();
+
+    // We need to get current after `WaitForIngestFile`, because
+    // `IngestExternalFile` may add files that overlap with `input_file_names`
+    auto* current = cfd->current();
+    current->Ref();
+
+    s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+                         output_file_names, output_level, output_path_id,
+                         &job_context, &log_buffer, compaction_job_info);
+
+    current->Unref();
+  }
+
+  // Find and delete obsolete files
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // If !s.ok(), this means that Compaction failed. In that case, we want
+    // to delete all obsolete files we might have created and we force
+    // FindObsoleteFiles(). This is because job_context does not
+    // catch all created files if compaction failed.
+    FindObsoleteFiles(&job_context, !s.ok());
+  }  // release the mutex
+
+  // delete unnecessary files if any, this is done outside the mutex
+  if (job_context.HaveSomethingToClean() ||
+      job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    // Have to flush the info logs before bg_compaction_scheduled_--
+    // because if bg_flush_scheduled_ becomes 0 and the lock is
+    // released, the deconstructor of DB can kick in and destroy all the
+    // states of DB so info_log might not be available after that point.
+    // It also applies to access other states that DB owns.
+    log_buffer.FlushBufferToLog();
+    if (job_context.HaveSomethingToDelete()) {
+      // no mutex is locked here.  No need to Unlock() and Lock() here.
+      PurgeObsoleteFiles(job_context);
+    }
+    job_context.Clean();
+  }
+
+  return s;
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::CompactFilesImpl(
+    const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+    Version* version, const std::vector<std::string>& input_file_names,
+    std::vector<std::string>* const output_file_names, const int output_level,
+    int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+    CompactionJobInfo* compaction_job_info) {
+  mutex_.AssertHeld();
+
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return Status::ShutdownInProgress();
+  }
+  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  std::unordered_set<uint64_t> input_set;
+  for (const auto& file_name : input_file_names) {
+    input_set.insert(TableFileNameToNumber(file_name));
+  }
+
+  ColumnFamilyMetaData cf_meta;
+  // TODO(yhchiang): can directly use version here if none of the
+  // following functions call is pluggable to external developers.
+  version->GetColumnFamilyMetaData(&cf_meta);
+
+  if (output_path_id < 0) {
+    if (cfd->ioptions()->cf_paths.size() == 1U) {
+      output_path_id = 0;
+    } else {
+      return Status::NotSupported(
+          "Automatic output path selection is not "
+          "yet supported in CompactFiles()");
+    }
+  }
+
+  Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+      &input_set, cf_meta, output_level);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<CompactionInputFiles> input_files;
+  s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, version->storage_info(), compact_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (const auto& inputs : input_files) {
+    if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) {
+      return Status::Aborted(
+          "Some of the necessary compaction input "
+          "files are already being compacted");
+    }
+  }
+  bool sfm_reserved_compact_space = false;
+  // First check if we have enough room to do the compaction
+  bool enough_room = EnoughRoomForCompaction(
+      cfd, input_files, &sfm_reserved_compact_space, log_buffer);
+
+  if (!enough_room) {
+    // m's vars will get set properly at the end of this function,
+    // as long as status == CompactionTooLarge
+    return Status::CompactionTooLarge();
+  }
+
+  // At this point, CompactFiles will be run.
+  bg_compaction_scheduled_++;
+
+  std::unique_ptr<Compaction> c;
+  assert(cfd->compaction_picker());
+  c.reset(cfd->compaction_picker()->CompactFiles(
+      compact_options, input_files, output_level, version->storage_info(),
+      *cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id));
+  // we already sanitized the set of input files and checked for conflicts
+  // without releasing the lock, so we're guaranteed a compaction can be formed.
+  assert(c != nullptr);
+
+  c->SetInputVersion(version);
+  // deletion compaction currently not allowed in CompactFiles.
+  assert(!c->deletion_compaction());
+
+  std::vector<SequenceNumber> snapshot_seqs;
+  SequenceNumber earliest_write_conflict_snapshot;
+  SnapshotChecker* snapshot_checker;
+  GetSnapshotContext(job_context, &snapshot_seqs,
+                     &earliest_write_conflict_snapshot, &snapshot_checker);
+
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+      new std::list<uint64_t>::iterator(
+          CaptureCurrentFileNumberInPendingOutputs()));
+
+  assert(is_snapshot_supported_ || snapshots_.empty());
+  CompactionJobStats compaction_job_stats;
+  CompactionJob compaction_job(
+      job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_,
+      file_options_for_compaction_, versions_.get(), &shutting_down_,
+      log_buffer, directories_.GetDbDir(),
+      GetDataDir(c->column_family_data(), c->output_path_id()),
+      GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_,
+      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+      job_context, table_cache_, &event_logger_,
+      c->mutable_cf_options()->paranoid_file_checks,
+      c->mutable_cf_options()->report_bg_io_stats, dbname_,
+      &compaction_job_stats, Env::Priority::USER, io_tracer_,
+      kManualCompactionCanceledFalse_, db_id_, db_session_id_,
+      c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
+      &blob_callback_, &bg_compaction_scheduled_,
+      &bg_bottom_compaction_scheduled_);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed compaction score, we recalculate it
+  // here.
+  version->storage_info()->ComputeCompactionScore(*cfd->ioptions(),
+                                                  *c->mutable_cf_options());
+
+  compaction_job.Prepare();
+
+  mutex_.Unlock();
+  TEST_SYNC_POINT("CompactFilesImpl:0");
+  TEST_SYNC_POINT("CompactFilesImpl:1");
+  // Ignore the status here, as it will be checked in the Install down below...
+  compaction_job.Run().PermitUncheckedError();
+  TEST_SYNC_POINT("CompactFilesImpl:2");
+  TEST_SYNC_POINT("CompactFilesImpl:3");
+  mutex_.Lock();
+
+  Status status = compaction_job.Install(*c->mutable_cf_options());
+  if (status.ok()) {
+    assert(compaction_job.io_status().ok());
+    InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                       &job_context->superversion_contexts[0],
+                                       *c->mutable_cf_options());
+  }
+  // status above captures any error during compaction_job.Install, so its ok
+  // not check compaction_job.io_status() explicitly if we're not calling
+  // SetBGError
+  compaction_job.io_status().PermitUncheckedError();
+  c->ReleaseCompactionFiles(s);
+#ifndef ROCKSDB_LITE
+  // Need to make sure SstFileManager does its bookkeeping
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      immutable_db_options_.sst_file_manager.get());
+  if (sfm && sfm_reserved_compact_space) {
+    sfm->OnCompactionCompletion(c.get());
+  }
+#endif  // ROCKSDB_LITE
+
+  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+  if (compaction_job_info != nullptr) {
+    BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
+                           job_context->job_id, version, compaction_job_info);
+  }
+
+  if (status.ok()) {
+    // Done
+  } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+    // Ignore compaction errors found during shutting down
+  } else if (status.IsManualCompactionPaused()) {
+    // Don't report stopping manual compaction as error
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[%s] [JOB %d] Stopping manual compaction",
+                   c->column_family_data()->GetName().c_str(),
+                   job_context->job_id);
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "[%s] [JOB %d] Compaction error: %s",
+                   c->column_family_data()->GetName().c_str(),
+                   job_context->job_id, status.ToString().c_str());
+    IOStatus io_s = compaction_job.io_status();
+    if (!io_s.ok()) {
+      error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction);
+    } else {
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+  }
+
+  if (output_file_names != nullptr) {
+    for (const auto& newf : c->edit()->GetNewFiles()) {
+      output_file_names->push_back(TableFileName(
+          c->immutable_options()->cf_paths, newf.second.fd.GetNumber(),
+          newf.second.fd.GetPathId()));
+    }
+
+    for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+      output_file_names->push_back(
+          BlobFileName(c->immutable_options()->cf_paths.front().path,
+                       blob_file.GetBlobFileNumber()));
+    }
+  }
+
+  c.reset();
+
+  bg_compaction_scheduled_--;
+  if (bg_compaction_scheduled_ == 0) {
+    bg_cv_.SignalAll();
+  }
+  MaybeScheduleFlushOrCompaction();
+  TEST_SYNC_POINT("CompactFilesImpl:End");
+
+  return status;
+}
+#endif  // ROCKSDB_LITE
+
+Status DBImpl::PauseBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  bg_compaction_paused_++;
+  while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+         bg_flush_scheduled_ > 0) {
+    bg_cv_.Wait();
+  }
+  bg_work_paused_++;
+  return Status::OK();
+}
+
+Status DBImpl::ContinueBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  if (bg_work_paused_ == 0) {
+    return Status::InvalidArgument();
+  }
+  assert(bg_work_paused_ > 0);
+  assert(bg_compaction_paused_ > 0);
+  bg_compaction_paused_--;
+  bg_work_paused_--;
+  // It's sufficient to check just bg_work_paused_ here since
+  // bg_work_paused_ is always no greater than bg_compaction_paused_
+  if (bg_work_paused_ == 0) {
+    MaybeScheduleFlushOrCompaction();
+  }
+  return Status::OK();
+}
+
+void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+                                     const Status& st,
+                                     const CompactionJobStats& job_stats,
+                                     int job_id) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.listeners.empty()) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+  if (c->is_manual_compaction() &&
+      manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+    return;
+  }
+
+  c->SetNotifyOnCompactionCompleted();
+  Version* current = cfd->current();
+  current->Ref();
+  // release lock while notifying events
+  mutex_.Unlock();
+  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
+  {
+    CompactionJobInfo info{};
+    BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info);
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnCompactionBegin(this, info);
+    }
+    info.status.PermitUncheckedError();
+  }
+  mutex_.Lock();
+  current->Unref();
+#else
+  (void)cfd;
+  (void)c;
+  (void)st;
+  (void)job_stats;
+  (void)job_id;
+#endif  // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnCompactionCompleted(
+    ColumnFamilyData* cfd, Compaction* c, const Status& st,
+    const CompactionJobStats& compaction_job_stats, const int job_id) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  mutex_.AssertHeld();
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  if (c->ShouldNotifyOnCompactionCompleted() == false) {
+    return;
+  }
+
+  Version* current = cfd->current();
+  current->Ref();
+  // release lock while notifying events
+  mutex_.Unlock();
+  TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
+  {
+    CompactionJobInfo info{};
+    BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
+                           &info);
+    for (auto listener : immutable_db_options_.listeners) {
+      listener->OnCompactionCompleted(this, info);
+    }
+  }
+  mutex_.Lock();
+  current->Unref();
+  // no need to signal bg_cv_ as it will be signaled at the end of the
+  // flush process.
+#else
+  (void)cfd;
+  (void)c;
+  (void)st;
+  (void)compaction_job_stats;
+  (void)job_id;
+#endif  // ROCKSDB_LITE
+}
+
+// REQUIREMENT: block all background work by calling PauseBackgroundWork()
+// before calling this function
+Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+  assert(level < cfd->NumberLevels());
+  if (target_level >= cfd->NumberLevels()) {
+    return Status::InvalidArgument("Target level exceeds number of levels");
+  }
+
+  SuperVersionContext sv_context(/* create_superversion */ true);
+
+  InstrumentedMutexLock guard_lock(&mutex_);
+
+  // only allow one thread refitting
+  if (refitting_level_) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[ReFitLevel] another thread is refitting");
+    return Status::NotSupported("another thread is refitting");
+  }
+  refitting_level_ = true;
+
+  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+  // move to a smaller level
+  int to_level = target_level;
+  if (target_level < 0) {
+    to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
+  }
+
+  auto* vstorage = cfd->current()->storage_info();
+  if (to_level != level) {
+    if (to_level > level) {
+      if (level == 0) {
+        refitting_level_ = false;
+        return Status::NotSupported(
+            "Cannot change from level 0 to other levels.");
+      }
+      // Check levels are empty for a trivial move
+      for (int l = level + 1; l <= to_level; l++) {
+        if (vstorage->NumLevelFiles(l) > 0) {
+          refitting_level_ = false;
+          return Status::NotSupported(
+              "Levels between source and target are not empty for a move.");
+        }
+      }
+    } else {
+      // to_level < level
+      // Check levels are empty for a trivial move
+      for (int l = to_level; l < level; l++) {
+        if (vstorage->NumLevelFiles(l) > 0) {
+          refitting_level_ = false;
+          return Status::NotSupported(
+              "Levels between source and target are not empty for a move.");
+        }
+      }
+    }
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+                    cfd->current()->DebugString().data());
+
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+    for (const auto& f : vstorage->LevelFiles(level)) {
+      edit.DeleteFile(level, f->fd.GetNumber());
+      edit.AddFile(
+          to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
+          f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno,
+          f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
+          f->oldest_ancester_time, f->file_creation_time, f->file_checksum,
+          f->file_checksum_func_name, f->unique_id);
+    }
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
+                    edit.DebugString().data());
+
+    Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit,
+                                           &mutex_, directories_.GetDbDir());
+
+    InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options);
+
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n",
+                    cfd->GetName().c_str(), status.ToString().data());
+
+    if (status.ok()) {
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                      "[%s] After refitting:\n%s", cfd->GetName().c_str(),
+                      cfd->current()->DebugString().data());
+    }
+    sv_context.Clean();
+    refitting_level_ = false;
+
+    return status;
+  }
+
+  refitting_level_ = false;
+  return Status::OK();
+}
+
+int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  return cfh->cfd()->NumberLevels();
+}
+
+int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
+  return 0;
+}
+
+int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  InstrumentedMutexLock l(&mutex_);
+  return cfh->cfd()
+      ->GetSuperVersion()
+      ->mutable_cf_options.level0_stop_writes_trigger;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+                     ColumnFamilyHandle* column_family) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.",
+                 cfh->GetName().c_str());
+  Status s;
+  if (immutable_db_options_.atomic_flush) {
+    s = AtomicFlushMemTables({cfh->cfd()}, flush_options,
+                             FlushReason::kManualFlush);
+  } else {
+    s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush);
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "[%s] Manual flush finished, status: %s\n",
+                 cfh->GetName().c_str(), s.ToString().c_str());
+  return s;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+                     const std::vector<ColumnFamilyHandle*>& column_families) {
+  Status s;
+  if (!immutable_db_options_.atomic_flush) {
+    for (auto cfh : column_families) {
+      s = Flush(flush_options, cfh);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  } else {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Manual atomic flush start.\n"
+                   "=====Column families:=====");
+    for (auto cfh : column_families) {
+      auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+                     cfhi->GetName().c_str());
+    }
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "=====End of column families list=====");
+    autovector<ColumnFamilyData*> cfds;
+    std::for_each(column_families.begin(), column_families.end(),
+                  [&cfds](ColumnFamilyHandle* elem) {
+                    auto cfh = static_cast<ColumnFamilyHandleImpl*>(elem);
+                    cfds.emplace_back(cfh->cfd());
+                  });
+    s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush);
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Manual atomic flush finished, status: %s\n"
+                   "=====Column families:=====",
+                   s.ToString().c_str());
+    for (auto cfh : column_families) {
+      auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+                     cfhi->GetName().c_str());
+    }
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "=====End of column families list=====");
+  }
+  return s;
+}
+
+Status DBImpl::RunManualCompaction(
+    ColumnFamilyData* cfd, int input_level, int output_level,
+    const CompactRangeOptions& compact_range_options, const Slice* begin,
+    const Slice* end, bool exclusive, bool disallow_trivial_move,
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+  assert(input_level == ColumnFamilyData::kCompactAllLevels ||
+         input_level >= 0);
+
+  InternalKey begin_storage, end_storage;
+  CompactionArg* ca = nullptr;
+
+  bool scheduled = false;
+  bool unscheduled = false;
+  Env::Priority thread_pool_priority = Env::Priority::TOTAL;
+  bool manual_conflict = false;
+
+  ManualCompactionState manual(
+      cfd, input_level, output_level, compact_range_options.target_path_id,
+      exclusive, disallow_trivial_move, compact_range_options.canceled);
+  // For universal compaction, we enforce every manual compaction to compact
+  // all files.
+  if (begin == nullptr ||
+      cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+      cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+    manual.begin = nullptr;
+  } else {
+    begin_storage.SetMinPossibleForUserKey(*begin);
+    manual.begin = &begin_storage;
+  }
+  if (end == nullptr ||
+      cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+      cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+    manual.end = nullptr;
+  } else {
+    end_storage.SetMaxPossibleForUserKey(*end);
+    manual.end = &end_storage;
+  }
+
+  TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
+  TEST_SYNC_POINT("DBImpl::RunManualCompaction:1");
+  InstrumentedMutexLock l(&mutex_);
+
+  if (manual_compaction_paused_ > 0) {
+    // Does not make sense to `AddManualCompaction()` in this scenario since
+    // `DisableManualCompaction()` just waited for the manual compaction queue
+    // to drain. So return immediately.
+    TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart");
+    manual.status =
+        Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    manual.done = true;
+    return manual.status;
+  }
+
+  // When a manual compaction arrives, temporarily disable scheduling of
+  // non-manual compactions and wait until the number of scheduled compaction
+  // jobs drops to zero. This used to be needed to ensure that this manual
+  // compaction can compact any range of keys/files. Now it is optional
+  // (see `CompactRangeOptions::exclusive_manual_compaction`). The use case for
+  // `exclusive_manual_compaction=true` is unclear beyond not trusting the code.
+  //
+  // HasPendingManualCompaction() is true when at least one thread is inside
+  // RunManualCompaction(), i.e. during that time no other compaction will
+  // get scheduled (see MaybeScheduleFlushOrCompaction).
+  //
+  // Note that the following loop doesn't stop more that one thread calling
+  // RunManualCompaction() from getting to the second while loop below.
+  // However, only one of them will actually schedule compaction, while
+  // others will wait on a condition variable until it completes.
+
+  AddManualCompaction(&manual);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
+  if (exclusive) {
+    // Limitation: there's no way to wake up the below loop when user sets
+    // `*manual.canceled`. So `CompactRangeOptions::exclusive_manual_compaction`
+    // and `CompactRangeOptions::canceled` might not work well together.
+    while (bg_bottom_compaction_scheduled_ > 0 ||
+           bg_compaction_scheduled_ > 0) {
+      if (manual_compaction_paused_ > 0 || manual.canceled == true) {
+        // Pretend the error came from compaction so the below cleanup/error
+        // handling code can process it.
+        manual.done = true;
+        manual.status =
+            Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+        break;
+      }
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled");
+      ROCKS_LOG_INFO(
+          immutable_db_options_.info_log,
+          "[%s] Manual compaction waiting for all other scheduled background "
+          "compactions to finish",
+          cfd->GetName().c_str());
+      bg_cv_.Wait();
+    }
+  }
+
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+
+  ROCKS_LOG_BUFFER(&log_buffer, "[%s] Manual compaction starting",
+                   cfd->GetName().c_str());
+
+  // We don't check bg_error_ here, because if we get the error in compaction,
+  // the compaction will set manual.status to bg_error_ and set manual.done to
+  // true.
+  while (!manual.done) {
+    assert(HasPendingManualCompaction());
+    manual_conflict = false;
+    Compaction* compaction = nullptr;
+    if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) ||
+        scheduled ||
+        (((manual.manual_end = &manual.tmp_storage1) != nullptr) &&
+         ((compaction = manual.cfd->CompactRange(
+               *manual.cfd->GetLatestMutableCFOptions(), mutable_db_options_,
+               manual.input_level, manual.output_level, compact_range_options,
+               manual.begin, manual.end, &manual.manual_end, &manual_conflict,
+               max_file_num_to_ignore, trim_ts)) == nullptr &&
+          manual_conflict))) {
+      // exclusive manual compactions should not see a conflict during
+      // CompactRange
+      assert(!exclusive || !manual_conflict);
+      // Running either this or some other manual compaction
+      bg_cv_.Wait();
+      if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) {
+        assert(thread_pool_priority != Env::Priority::TOTAL);
+        // unschedule all manual compactions
+        auto unscheduled_task_num = env_->UnSchedule(
+            GetTaskTag(TaskType::kManualCompaction), thread_pool_priority);
+        if (unscheduled_task_num > 0) {
+          ROCKS_LOG_INFO(
+              immutable_db_options_.info_log,
+              "[%s] Unscheduled %d number of manual compactions from the "
+              "thread-pool",
+              cfd->GetName().c_str(), unscheduled_task_num);
+          // it may unschedule other manual compactions, notify others.
+          bg_cv_.SignalAll();
+        }
+        unscheduled = true;
+        TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled");
+      }
+      if (scheduled && manual.incomplete == true) {
+        assert(!manual.in_progress);
+        scheduled = false;
+        manual.incomplete = false;
+      }
+    } else if (!scheduled) {
+      if (compaction == nullptr) {
+        manual.done = true;
+        bg_cv_.SignalAll();
+        continue;
+      }
+      ca = new CompactionArg;
+      ca->db = this;
+      ca->prepicked_compaction = new PrepickedCompaction;
+      ca->prepicked_compaction->manual_compaction_state = &manual;
+      ca->prepicked_compaction->compaction = compaction;
+      if (!RequestCompactionToken(
+              cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) {
+        // Don't throttle manual compaction, only count outstanding tasks.
+        assert(false);
+      }
+      manual.incomplete = false;
+      if (compaction->bottommost_level() &&
+          env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+        bg_bottom_compaction_scheduled_++;
+        ca->compaction_pri_ = Env::Priority::BOTTOM;
+        env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca,
+                       Env::Priority::BOTTOM,
+                       GetTaskTag(TaskType::kManualCompaction),
+                       &DBImpl::UnscheduleCompactionCallback);
+        thread_pool_priority = Env::Priority::BOTTOM;
+      } else {
+        bg_compaction_scheduled_++;
+        ca->compaction_pri_ = Env::Priority::LOW;
+        env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW,
+                       GetTaskTag(TaskType::kManualCompaction),
+                       &DBImpl::UnscheduleCompactionCallback);
+        thread_pool_priority = Env::Priority::LOW;
+      }
+      scheduled = true;
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled");
+    }
+  }
+
+  log_buffer.FlushBufferToLog();
+  assert(!manual.in_progress);
+  assert(HasPendingManualCompaction());
+  RemoveManualCompaction(&manual);
+  // if the manual job is unscheduled, try schedule other jobs in case there's
+  // any unscheduled compaction job which was blocked by exclusive manual
+  // compaction.
+  if (manual.status.IsIncomplete() &&
+      manual.status.subcode() == Status::SubCode::kManualCompactionPaused) {
+    MaybeScheduleFlushOrCompaction();
+  }
+  bg_cv_.SignalAll();
+  return manual.status;
+}
+
+void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+                                  FlushRequest* req) {
+  assert(req != nullptr);
+  req->reserve(cfds.size());
+  for (const auto cfd : cfds) {
+    if (nullptr == cfd) {
+      // cfd may be null, see DBImpl::ScheduleFlushes
+      continue;
+    }
+    uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID();
+    req->emplace_back(cfd, max_memtable_id);
+  }
+}
+
+Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+                             const FlushOptions& flush_options,
+                             FlushReason flush_reason,
+                             bool entered_write_thread) {
+  // This method should not be called if atomic_flush is true.
+  assert(!immutable_db_options_.atomic_flush);
+  if (!flush_options.wait && write_controller_.IsStopped()) {
+    std::ostringstream oss;
+    oss << "Writes have been stopped, thus unable to perform manual flush. "
+           "Please try again later after writes are resumed";
+    return Status::TryAgain(oss.str());
+  }
+  Status s;
+  if (!flush_options.allow_write_stall) {
+    bool flush_needed = true;
+    s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+    TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone");
+    if (!s.ok() || !flush_needed) {
+      return s;
+    }
+  }
+
+  const bool needs_to_join_write_thread = !entered_write_thread;
+  autovector<FlushRequest> flush_reqs;
+  autovector<uint64_t> memtable_ids_to_wait;
+  {
+    WriteContext context;
+    InstrumentedMutexLock guard_lock(&mutex_);
+
+    WriteThread::Writer w;
+    WriteThread::Writer nonmem_w;
+    if (needs_to_join_write_thread) {
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+    }
+    WaitForPendingWrites();
+
+    if (flush_reason != FlushReason::kErrorRecoveryRetryFlush &&
+        (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) {
+      // Note that, when flush reason is kErrorRecoveryRetryFlush, during the
+      // auto retry resume, we want to avoid creating new small memtables.
+      // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl
+      // will iterate through all the CFs and call FlushMemtable during auto
+      // retry resume, it is possible that in some CFs,
+      // cfd->imm()->NumNotFlushed() = 0. In this case, so no flush request will
+      // be created and scheduled, status::OK() will be returned.
+      s = SwitchMemtable(cfd, &context);
+    }
+    const uint64_t flush_memtable_id = std::numeric_limits<uint64_t>::max();
+    if (s.ok()) {
+      if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+          !cached_recoverable_state_empty_.load()) {
+        FlushRequest req{{cfd, flush_memtable_id}};
+        flush_reqs.emplace_back(std::move(req));
+        memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID());
+      }
+      if (immutable_db_options_.persist_stats_to_disk &&
+          flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
+        ColumnFamilyData* cfd_stats =
+            versions_->GetColumnFamilySet()->GetColumnFamily(
+                kPersistentStatsColumnFamilyName);
+        if (cfd_stats != nullptr && cfd_stats != cfd &&
+            !cfd_stats->mem()->IsEmpty()) {
+          // only force flush stats CF when it will be the only CF lagging
+          // behind after the current flush
+          bool stats_cf_flush_needed = true;
+          for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+            if (loop_cfd == cfd_stats || loop_cfd == cfd) {
+              continue;
+            }
+            if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+              stats_cf_flush_needed = false;
+            }
+          }
+          if (stats_cf_flush_needed) {
+            ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                           "Force flushing stats CF with manual flush of %s "
+                           "to avoid holding old logs",
+                           cfd->GetName().c_str());
+            s = SwitchMemtable(cfd_stats, &context);
+            FlushRequest req{{cfd_stats, flush_memtable_id}};
+            flush_reqs.emplace_back(std::move(req));
+            memtable_ids_to_wait.emplace_back(
+                cfd->imm()->GetLatestMemTableID());
+          }
+        }
+      }
+    }
+
+    if (s.ok() && !flush_reqs.empty()) {
+      for (const auto& req : flush_reqs) {
+        assert(req.size() == 1);
+        ColumnFamilyData* loop_cfd = req[0].first;
+        loop_cfd->imm()->FlushRequested();
+      }
+      // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be free'ed by
+      // other threads which may drop the column family concurrently.
+      // Therefore, we increase the cfd's ref count.
+      if (flush_options.wait) {
+        for (const auto& req : flush_reqs) {
+          assert(req.size() == 1);
+          ColumnFamilyData* loop_cfd = req[0].first;
+          loop_cfd->Ref();
+        }
+      }
+      for (const auto& req : flush_reqs) {
+        SchedulePendingFlush(req, flush_reason);
+      }
+      MaybeScheduleFlushOrCompaction();
+    }
+
+    if (needs_to_join_write_thread) {
+      write_thread_.ExitUnbatched(&w);
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
+  TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush");
+  if (s.ok() && flush_options.wait) {
+    autovector<ColumnFamilyData*> cfds;
+    autovector<const uint64_t*> flush_memtable_ids;
+    assert(flush_reqs.size() == memtable_ids_to_wait.size());
+    for (size_t i = 0; i < flush_reqs.size(); ++i) {
+      assert(flush_reqs[i].size() == 1);
+      cfds.push_back(flush_reqs[i][0].first);
+      flush_memtable_ids.push_back(&(memtable_ids_to_wait[i]));
+    }
+    s = WaitForFlushMemTables(
+        cfds, flush_memtable_ids,
+        (flush_reason == FlushReason::kErrorRecovery ||
+         flush_reason == FlushReason::kErrorRecoveryRetryFlush));
+    InstrumentedMutexLock lock_guard(&mutex_);
+    for (auto* tmp_cfd : cfds) {
+      tmp_cfd->UnrefAndTryDelete();
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished");
+  return s;
+}
+
+// Flush all elements in 'column_family_datas'
+// and atomically record the result to the MANIFEST.
+Status DBImpl::AtomicFlushMemTables(
+    const autovector<ColumnFamilyData*>& column_family_datas,
+    const FlushOptions& flush_options, FlushReason flush_reason,
+    bool entered_write_thread) {
+  assert(immutable_db_options_.atomic_flush);
+  if (!flush_options.wait && write_controller_.IsStopped()) {
+    std::ostringstream oss;
+    oss << "Writes have been stopped, thus unable to perform manual flush. "
+           "Please try again later after writes are resumed";
+    return Status::TryAgain(oss.str());
+  }
+  Status s;
+  if (!flush_options.allow_write_stall) {
+    int num_cfs_to_flush = 0;
+    for (auto cfd : column_family_datas) {
+      bool flush_needed = true;
+      s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+      if (!s.ok()) {
+        return s;
+      } else if (flush_needed) {
+        ++num_cfs_to_flush;
+      }
+    }
+    if (0 == num_cfs_to_flush) {
+      return s;
+    }
+  }
+  const bool needs_to_join_write_thread = !entered_write_thread;
+  FlushRequest flush_req;
+  autovector<ColumnFamilyData*> cfds;
+  {
+    WriteContext context;
+    InstrumentedMutexLock guard_lock(&mutex_);
+
+    WriteThread::Writer w;
+    WriteThread::Writer nonmem_w;
+    if (needs_to_join_write_thread) {
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      if (two_write_queues_) {
+        nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+      }
+    }
+    WaitForPendingWrites();
+
+    for (auto cfd : column_family_datas) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+          !cached_recoverable_state_empty_.load()) {
+        cfds.emplace_back(cfd);
+      }
+    }
+    for (auto cfd : cfds) {
+      if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) ||
+          flush_reason == FlushReason::kErrorRecoveryRetryFlush) {
+        continue;
+      }
+      cfd->Ref();
+      s = SwitchMemtable(cfd, &context);
+      cfd->UnrefAndTryDelete();
+      if (!s.ok()) {
+        break;
+      }
+    }
+    if (s.ok()) {
+      AssignAtomicFlushSeq(cfds);
+      for (auto cfd : cfds) {
+        cfd->imm()->FlushRequested();
+      }
+      // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be free'ed by
+      // other threads which may drop the column family concurrently.
+      // Therefore, we increase the cfd's ref count.
+      if (flush_options.wait) {
+        for (auto cfd : cfds) {
+          cfd->Ref();
+        }
+      }
+      GenerateFlushRequest(cfds, &flush_req);
+      SchedulePendingFlush(flush_req, flush_reason);
+      MaybeScheduleFlushOrCompaction();
+    }
+
+    if (needs_to_join_write_thread) {
+      write_thread_.ExitUnbatched(&w);
+      if (two_write_queues_) {
+        nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
+  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
+  if (s.ok() && flush_options.wait) {
+    autovector<const uint64_t*> flush_memtable_ids;
+    for (auto& iter : flush_req) {
+      flush_memtable_ids.push_back(&(iter.second));
+    }
+    s = WaitForFlushMemTables(
+        cfds, flush_memtable_ids,
+        (flush_reason == FlushReason::kErrorRecovery ||
+         flush_reason == FlushReason::kErrorRecoveryRetryFlush));
+    InstrumentedMutexLock lock_guard(&mutex_);
+    for (auto* cfd : cfds) {
+      cfd->UnrefAndTryDelete();
+    }
+  }
+  return s;
+}
+
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
+// cause write stall, for example if one memtable is being flushed already.
+// This method tries to avoid write stall (similar to CompactRange() behavior)
+// it emulates how the SuperVersion / LSM would change if flush happens, checks
+// it against various constrains and delays flush if it'd cause write stall.
+// Caller should check status and flush_needed to see if flush already happened.
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+                                                 bool* flush_needed) {
+  {
+    *flush_needed = true;
+    InstrumentedMutexLock l(&mutex_);
+    uint64_t orig_active_memtable_id = cfd->mem()->GetID();
+    WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
+    do {
+      if (write_stall_condition != WriteStallCondition::kNormal) {
+        // Same error handling as user writes: Don't wait if there's a
+        // background error, even if it's a soft error. We might wait here
+        // indefinitely as the pending flushes/compactions may never finish
+        // successfully, resulting in the stall condition lasting indefinitely
+        if (error_handler_.IsBGWorkStopped()) {
+          return error_handler_.GetBGError();
+        }
+
+        TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait");
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "[%s] WaitUntilFlushWouldNotStallWrites"
+                       " waiting on stall conditions to clear",
+                       cfd->GetName().c_str());
+        bg_cv_.Wait();
+      }
+      if (cfd->IsDropped()) {
+        return Status::ColumnFamilyDropped();
+      }
+      if (shutting_down_.load(std::memory_order_acquire)) {
+        return Status::ShutdownInProgress();
+      }
+
+      uint64_t earliest_memtable_id =
+          std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID());
+      if (earliest_memtable_id > orig_active_memtable_id) {
+        // We waited so long that the memtable we were originally waiting on was
+        // flushed.
+        *flush_needed = false;
+        return Status::OK();
+      }
+
+      const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+      const auto* vstorage = cfd->current()->storage_info();
+
+      // Skip stalling check if we're below auto-flush and auto-compaction
+      // triggers. If it stalled in these conditions, that'd mean the stall
+      // triggers are so low that stalling is needed for any background work. In
+      // that case we shouldn't wait since background work won't be scheduled.
+      if (cfd->imm()->NumNotFlushed() <
+              cfd->ioptions()->min_write_buffer_number_to_merge &&
+          vstorage->l0_delay_trigger_count() <
+              mutable_cf_options.level0_file_num_compaction_trigger) {
+        break;
+      }
+
+      // check whether one extra immutable memtable or an extra L0 file would
+      // cause write stalling mode to be entered. It could still enter stall
+      // mode due to pending compaction bytes, but that's less common
+      write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause(
+                                  cfd->imm()->NumNotFlushed() + 1,
+                                  vstorage->l0_delay_trigger_count() + 1,
+                                  vstorage->estimated_compaction_needed_bytes(),
+                                  mutable_cf_options, *cfd->ioptions())
+                                  .first;
+    } while (write_stall_condition != WriteStallCondition::kNormal);
+  }
+  return Status::OK();
+}
+
+// Wait for memtables to be flushed for multiple column families.
+// let N = cfds.size()
+// for i in [0, N),
+//  1) if flush_memtable_ids[i] is not null, then the memtables with lower IDs
+//     have to be flushed for THIS column family;
+//  2) if flush_memtable_ids[i] is null, then all memtables in THIS column
+//     family have to be flushed.
+// Finish waiting when ALL column families finish flushing memtables.
+// resuming_from_bg_err indicates whether the caller is trying to resume from
+// background error or in normal processing.
+Status DBImpl::WaitForFlushMemTables(
+    const autovector<ColumnFamilyData*>& cfds,
+    const autovector<const uint64_t*>& flush_memtable_ids,
+    bool resuming_from_bg_err) {
+  int num = static_cast<int>(cfds.size());
+  // Wait until the compaction completes
+  InstrumentedMutexLock l(&mutex_);
+  Status s;
+  // If the caller is trying to resume from bg error, then
+  // error_handler_.IsDBStopped() is true.
+  while (resuming_from_bg_err || !error_handler_.IsDBStopped()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      s = Status::ShutdownInProgress();
+      return s;
+    }
+    // If an error has occurred during resumption, then no need to wait.
+    // But flush operation may fail because of this error, so need to
+    // return the status.
+    if (!error_handler_.GetRecoveryError().ok()) {
+      s = error_handler_.GetRecoveryError();
+      break;
+    }
+    // If BGWorkStopped, which indicate that there is a BG error and
+    // 1) soft error but requires no BG work, 2) no in auto_recovery_
+    if (!resuming_from_bg_err && error_handler_.IsBGWorkStopped() &&
+        error_handler_.GetBGError().severity() < Status::Severity::kHardError) {
+      s = error_handler_.GetBGError();
+      return s;
+    }
+
+    // Number of column families that have been dropped.
+    int num_dropped = 0;
+    // Number of column families that have finished flush.
+    int num_finished = 0;
+    for (int i = 0; i < num; ++i) {
+      if (cfds[i]->IsDropped()) {
+        ++num_dropped;
+      } else if (cfds[i]->imm()->NumNotFlushed() == 0 ||
+                 (flush_memtable_ids[i] != nullptr &&
+                  cfds[i]->imm()->GetEarliestMemTableID() >
+                      *flush_memtable_ids[i])) {
+        ++num_finished;
+      }
+    }
+    if (1 == num_dropped && 1 == num) {
+      s = Status::ColumnFamilyDropped();
+      return s;
+    }
+    // Column families involved in this flush request have either been dropped
+    // or finished flush. Then it's time to finish waiting.
+    if (num_dropped + num_finished == num) {
+      break;
+    }
+    bg_cv_.Wait();
+  }
+  // If not resuming from bg error, and an error has caused the DB to stop,
+  // then report the bg error to caller.
+  if (!resuming_from_bg_err && error_handler_.IsDBStopped()) {
+    s = error_handler_.GetBGError();
+  }
+  return s;
+}
+
+Status DBImpl::EnableAutoCompaction(
+    const std::vector<ColumnFamilyHandle*>& column_family_handles) {
+  Status s;
+  for (auto cf_ptr : column_family_handles) {
+    Status status =
+        this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
+    if (!status.ok()) {
+      s = status;
+    }
+  }
+
+  return s;
+}
+
+// NOTE: Calling DisableManualCompaction() may overwrite the
+// user-provided canceled variable in CompactRangeOptions
+void DBImpl::DisableManualCompaction() {
+  InstrumentedMutexLock l(&mutex_);
+  manual_compaction_paused_.fetch_add(1, std::memory_order_release);
+
+  // Mark the canceled as true when the cancellation is triggered by
+  // manual_compaction_paused (may overwrite user-provided `canceled`)
+  for (const auto& manual_compaction : manual_compaction_dequeue_) {
+    manual_compaction->canceled = true;
+  }
+
+  // Wake up manual compactions waiting to start.
+  bg_cv_.SignalAll();
+
+  // Wait for any pending manual compactions to finish (typically through
+  // failing with `Status::Incomplete`) prior to returning. This way we are
+  // guaranteed no pending manual compaction will commit while manual
+  // compactions are "disabled".
+  while (HasPendingManualCompaction()) {
+    bg_cv_.Wait();
+  }
+}
+
+// NOTE: In contrast to DisableManualCompaction(), calling
+// EnableManualCompaction() does NOT overwrite the user-provided *canceled
+// variable to be false since there is NO CHANCE a canceled compaction
+// is uncanceled. In other words, a canceled compaction must have been
+// dropped out of the manual compaction queue, when we disable it.
+void DBImpl::EnableManualCompaction() {
+  InstrumentedMutexLock l(&mutex_);
+  assert(manual_compaction_paused_ > 0);
+  manual_compaction_paused_.fetch_sub(1, std::memory_order_release);
+}
+
+void DBImpl::MaybeScheduleFlushOrCompaction() {
+  mutex_.AssertHeld();
+  if (!opened_successfully_) {
+    // Compaction may introduce data race to DB open
+    return;
+  }
+  if (bg_work_paused_ > 0) {
+    // we paused the background work
+    return;
+  } else if (error_handler_.IsBGWorkStopped() &&
+             !error_handler_.IsRecoveryInProgress()) {
+    // There has been a hard error and this call is not part of the recovery
+    // sequence. Bail out here so we don't get into an endless loop of
+    // scheduling BG work which will again call this function
+    return;
+  } else if (shutting_down_.load(std::memory_order_acquire)) {
+    // DB is being deleted; no more background compactions
+    return;
+  }
+  auto bg_job_limits = GetBGJobLimits();
+  bool is_flush_pool_empty =
+      env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
+  while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
+         bg_flush_scheduled_ < bg_job_limits.max_flushes) {
+    bg_flush_scheduled_++;
+    FlushThreadArg* fta = new FlushThreadArg;
+    fta->db_ = this;
+    fta->thread_pri_ = Env::Priority::HIGH;
+    env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this,
+                   &DBImpl::UnscheduleFlushCallback);
+    --unscheduled_flushes_;
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0",
+        &unscheduled_flushes_);
+  }
+
+  // special case -- if high-pri (flush) thread pool is empty, then schedule
+  // flushes in low-pri (compaction) thread pool.
+  if (is_flush_pool_empty) {
+    while (unscheduled_flushes_ > 0 &&
+           bg_flush_scheduled_ + bg_compaction_scheduled_ <
+               bg_job_limits.max_flushes) {
+      bg_flush_scheduled_++;
+      FlushThreadArg* fta = new FlushThreadArg;
+      fta->db_ = this;
+      fta->thread_pri_ = Env::Priority::LOW;
+      env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this,
+                     &DBImpl::UnscheduleFlushCallback);
+      --unscheduled_flushes_;
+    }
+  }
+
+  if (bg_compaction_paused_ > 0) {
+    // we paused the background compaction
+    return;
+  } else if (error_handler_.IsBGWorkStopped()) {
+    // Compaction is not part of the recovery sequence from a hard error. We
+    // might get here because recovery might do a flush and install a new
+    // super version, which will try to schedule pending compactions. Bail
+    // out here and let the higher level recovery handle compactions
+    return;
+  }
+
+  if (HasExclusiveManualCompaction()) {
+    // only manual compactions are allowed to run. don't schedule automatic
+    // compactions
+    TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict");
+    return;
+  }
+
+  while (bg_compaction_scheduled_ + bg_bottom_compaction_scheduled_ <
+             bg_job_limits.max_compactions &&
+         unscheduled_compactions_ > 0) {
+    CompactionArg* ca = new CompactionArg;
+    ca->db = this;
+    ca->compaction_pri_ = Env::Priority::LOW;
+    ca->prepicked_compaction = nullptr;
+    bg_compaction_scheduled_++;
+    unscheduled_compactions_--;
+    env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+                   &DBImpl::UnscheduleCompactionCallback);
+  }
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const {
+  mutex_.AssertHeld();
+  return GetBGJobLimits(mutable_db_options_.max_background_flushes,
+                        mutable_db_options_.max_background_compactions,
+                        mutable_db_options_.max_background_jobs,
+                        write_controller_.NeedSpeedupCompaction());
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes,
+                                           int max_background_compactions,
+                                           int max_background_jobs,
+                                           bool parallelize_compactions) {
+  BGJobLimits res;
+  if (max_background_flushes == -1 && max_background_compactions == -1) {
+    // for our first stab implementing max_background_jobs, simply allocate a
+    // quarter of the threads to flushes.
+    res.max_flushes = std::max(1, max_background_jobs / 4);
+    res.max_compactions = std::max(1, max_background_jobs - res.max_flushes);
+  } else {
+    // compatibility code in case users haven't migrated to max_background_jobs,
+    // which automatically computes flush/compaction limits
+    res.max_flushes = std::max(1, max_background_flushes);
+    res.max_compactions = std::max(1, max_background_compactions);
+  }
+  if (!parallelize_compactions) {
+    // throttle background compactions until we deem necessary
+    res.max_compactions = 1;
+  }
+  return res;
+}
+
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+  assert(!cfd->queued_for_compaction());
+  cfd->Ref();
+  compaction_queue_.push_back(cfd);
+  cfd->set_queued_for_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+  assert(!compaction_queue_.empty());
+  auto cfd = *compaction_queue_.begin();
+  compaction_queue_.pop_front();
+  assert(cfd->queued_for_compaction());
+  cfd->set_queued_for_compaction(false);
+  return cfd;
+}
+
+DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
+  assert(!flush_queue_.empty());
+  FlushRequest flush_req = flush_queue_.front();
+  flush_queue_.pop_front();
+  if (!immutable_db_options_.atomic_flush) {
+    assert(flush_req.size() == 1);
+  }
+  for (const auto& elem : flush_req) {
+    if (!immutable_db_options_.atomic_flush) {
+      ColumnFamilyData* cfd = elem.first;
+      assert(cfd);
+      assert(cfd->queued_for_flush());
+      cfd->set_queued_for_flush(false);
+    }
+  }
+  // TODO: need to unset flush reason?
+  return flush_req;
+}
+
+ColumnFamilyData* DBImpl::PickCompactionFromQueue(
+    std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer) {
+  assert(!compaction_queue_.empty());
+  assert(*token == nullptr);
+  autovector<ColumnFamilyData*> throttled_candidates;
+  ColumnFamilyData* cfd = nullptr;
+  while (!compaction_queue_.empty()) {
+    auto first_cfd = *compaction_queue_.begin();
+    compaction_queue_.pop_front();
+    assert(first_cfd->queued_for_compaction());
+    if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) {
+      throttled_candidates.push_back(first_cfd);
+      continue;
+    }
+    cfd = first_cfd;
+    cfd->set_queued_for_compaction(false);
+    break;
+  }
+  // Add throttled compaction candidates back to queue in the original order.
+  for (auto iter = throttled_candidates.rbegin();
+       iter != throttled_candidates.rend(); ++iter) {
+    compaction_queue_.push_front(*iter);
+  }
+  return cfd;
+}
+
+void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
+                                  FlushReason flush_reason) {
+  mutex_.AssertHeld();
+  if (flush_req.empty()) {
+    return;
+  }
+  if (!immutable_db_options_.atomic_flush) {
+    // For the non-atomic flush case, we never schedule multiple column
+    // families in the same flush request.
+    assert(flush_req.size() == 1);
+    ColumnFamilyData* cfd = flush_req[0].first;
+    assert(cfd);
+
+    if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) {
+      cfd->Ref();
+      cfd->set_queued_for_flush(true);
+      cfd->SetFlushReason(flush_reason);
+      ++unscheduled_flushes_;
+      flush_queue_.push_back(flush_req);
+    }
+  } else {
+    for (auto& iter : flush_req) {
+      ColumnFamilyData* cfd = iter.first;
+      cfd->Ref();
+      cfd->SetFlushReason(flush_reason);
+    }
+    ++unscheduled_flushes_;
+    flush_queue_.push_back(flush_req);
+  }
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+  mutex_.AssertHeld();
+  if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
+    AddToCompactionQueue(cfd);
+    ++unscheduled_compactions_;
+  }
+}
+
+void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+                                  FileType type, uint64_t number, int job_id) {
+  mutex_.AssertHeld();
+  PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
+  purge_files_.insert({{number, std::move(file_info)}});
+}
+
+void DBImpl::BGWorkFlush(void* arg) {
+  FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
+  delete reinterpret_cast<FlushThreadArg*>(arg);
+
+  IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush");
+  static_cast_with_check<DBImpl>(fta.db_)->BackgroundCallFlush(fta.thread_pri_);
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
+}
+
+void DBImpl::BGWorkCompaction(void* arg) {
+  CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+  delete reinterpret_cast<CompactionArg*>(arg);
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+  TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
+  auto prepicked_compaction =
+      static_cast<PrepickedCompaction*>(ca.prepicked_compaction);
+  static_cast_with_check<DBImpl>(ca.db)->BackgroundCallCompaction(
+      prepicked_compaction, Env::Priority::LOW);
+  delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkBottomCompaction(void* arg) {
+  CompactionArg ca = *(static_cast<CompactionArg*>(arg));
+  delete static_cast<CompactionArg*>(arg);
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM);
+  TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction");
+  auto* prepicked_compaction = ca.prepicked_compaction;
+  assert(prepicked_compaction && prepicked_compaction->compaction);
+  ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM);
+  delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkPurge(void* db) {
+  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+  TEST_SYNC_POINT("DBImpl::BGWorkPurge:start");
+  reinterpret_cast<DBImpl*>(db)->BackgroundCallPurge();
+  TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
+}
+
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
+  CompactionArg* ca_ptr = reinterpret_cast<CompactionArg*>(arg);
+  Env::Priority compaction_pri = ca_ptr->compaction_pri_;
+  if (Env::Priority::BOTTOM == compaction_pri) {
+    // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM
+    ca_ptr->db->bg_bottom_compaction_scheduled_--;
+  } else if (Env::Priority::LOW == compaction_pri) {
+    // Decrement bg_compaction_scheduled_ if priority is LOW
+    ca_ptr->db->bg_compaction_scheduled_--;
+  }
+  CompactionArg ca = *(ca_ptr);
+  delete reinterpret_cast<CompactionArg*>(arg);
+  if (ca.prepicked_compaction != nullptr) {
+    // if it's a manual compaction, set status to ManualCompactionPaused
+    if (ca.prepicked_compaction->manual_compaction_state) {
+      ca.prepicked_compaction->manual_compaction_state->done = true;
+      ca.prepicked_compaction->manual_compaction_state->status =
+          Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    }
+    if (ca.prepicked_compaction->compaction != nullptr) {
+      ca.prepicked_compaction->compaction->ReleaseCompactionFiles(
+          Status::Incomplete(Status::SubCode::kManualCompactionPaused));
+      delete ca.prepicked_compaction->compaction;
+    }
+    delete ca.prepicked_compaction;
+  }
+  TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback");
+}
+
+void DBImpl::UnscheduleFlushCallback(void* arg) {
+  // Decrement bg_flush_scheduled_ in flush callback
+  reinterpret_cast<FlushThreadArg*>(arg)->db_->bg_flush_scheduled_--;
+  Env::Priority flush_pri = reinterpret_cast<FlushThreadArg*>(arg)->thread_pri_;
+  if (Env::Priority::LOW == flush_pri) {
+    TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback");
+  } else if (Env::Priority::HIGH == flush_pri) {
+    TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback");
+  }
+  delete reinterpret_cast<FlushThreadArg*>(arg);
+  TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback");
+}
+
+Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
+                               LogBuffer* log_buffer, FlushReason* reason,
+                               Env::Priority thread_pri) {
+  mutex_.AssertHeld();
+
+  Status status;
+  *reason = FlushReason::kOthers;
+  // If BG work is stopped due to an error, but a recovery is in progress,
+  // that means this flush is part of the recovery. So allow it to go through
+  if (!error_handler_.IsBGWorkStopped()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      status = Status::ShutdownInProgress();
+    }
+  } else if (!error_handler_.IsRecoveryInProgress()) {
+    status = error_handler_.GetBGError();
+  }
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  autovector<BGFlushArg> bg_flush_args;
+  std::vector<SuperVersionContext>& superversion_contexts =
+      job_context->superversion_contexts;
+  autovector<ColumnFamilyData*> column_families_not_to_flush;
+  while (!flush_queue_.empty()) {
+    // This cfd is already referenced
+    const FlushRequest& flush_req = PopFirstFromFlushQueue();
+    superversion_contexts.clear();
+    superversion_contexts.reserve(flush_req.size());
+
+    for (const auto& iter : flush_req) {
+      ColumnFamilyData* cfd = iter.first;
+      if (cfd->GetMempurgeUsed()) {
+        // If imm() contains silent memtables (e.g.: because
+        // MemPurge was activated), requesting a flush will
+        // mark the imm_needed as true.
+        cfd->imm()->FlushRequested();
+      }
+
+      if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
+        // can't flush this CF, try next one
+        column_families_not_to_flush.push_back(cfd);
+        continue;
+      }
+      superversion_contexts.emplace_back(SuperVersionContext(true));
+      bg_flush_args.emplace_back(cfd, iter.second,
+                                 &(superversion_contexts.back()));
+    }
+    if (!bg_flush_args.empty()) {
+      break;
+    }
+  }
+
+  if (!bg_flush_args.empty()) {
+    auto bg_job_limits = GetBGJobLimits();
+    for (const auto& arg : bg_flush_args) {
+      ColumnFamilyData* cfd = arg.cfd_;
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "Calling FlushMemTableToOutputFile with column "
+          "family [%s], flush slots available %d, compaction slots available "
+          "%d, "
+          "flush slots scheduled %d, compaction slots scheduled %d",
+          cfd->GetName().c_str(), bg_job_limits.max_flushes,
+          bg_job_limits.max_compactions, bg_flush_scheduled_,
+          bg_compaction_scheduled_);
+    }
+    status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress,
+                                         job_context, log_buffer, thread_pri);
+    TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush");
+    // All the CFDs in the FlushReq must have the same flush reason, so just
+    // grab the first one
+    *reason = bg_flush_args[0].cfd_->GetFlushReason();
+    for (auto& arg : bg_flush_args) {
+      ColumnFamilyData* cfd = arg.cfd_;
+      if (cfd->UnrefAndTryDelete()) {
+        arg.cfd_ = nullptr;
+      }
+    }
+  }
+  for (auto cfd : column_families_not_to_flush) {
+    cfd->UnrefAndTryDelete();
+  }
+  return status;
+}
+
+void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
+  bool made_progress = false;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+
+  TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCallFlush:start", nullptr);
+
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1");
+  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    assert(bg_flush_scheduled_);
+    num_running_flushes_++;
+
+    std::unique_ptr<std::list<uint64_t>::iterator>
+        pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+            CaptureCurrentFileNumberInPendingOutputs()));
+    FlushReason reason;
+
+    Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
+                               &reason, thread_pri);
+    if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
+        reason != FlushReason::kErrorRecovery) {
+      // Wait a little bit before retrying background flush in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed flushes for the duration of
+      // the problem.
+      uint64_t error_cnt =
+          default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Waiting after background flush error: %s"
+                      "Accumulated background error counts: %" PRIu64,
+                      s.ToString().c_str(), error_cnt);
+      log_buffer.FlushBufferToLog();
+      LogFlush(immutable_db_options_.info_log);
+      immutable_db_options_.clock->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    }
+
+    TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+    // If flush failed, we want to delete all temporary files that we might have
+    // created. Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+                                        !s.IsColumnFamilyDropped());
+    // delete unnecessary files if any, this is done outside the mutex
+    if (job_context.HaveSomethingToClean() ||
+        job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
+      // Have to flush the info logs before bg_flush_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the deconstructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
+      }
+      job_context.Clean();
+      mutex_.Lock();
+    }
+    TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
+
+    assert(num_running_flushes_ > 0);
+    num_running_flushes_--;
+    bg_flush_scheduled_--;
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+    atomic_flush_install_cv_.SignalAll();
+    bg_cv_.SignalAll();
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be dealloacated and referencing them
+    // will cause trouble.
+  }
+}
+
+void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+                                      Env::Priority bg_thread_pri) {
+  bool made_progress = false;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  TEST_SYNC_POINT("BackgroundCallCompaction:0");
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+  {
+    InstrumentedMutexLock l(&mutex_);
+
+    // This call will unlock/lock the mutex to wait for current running
+    // IngestExternalFile() calls to finish.
+    WaitForIngestFile();
+
+    num_running_compactions_++;
+
+    std::unique_ptr<std::list<uint64_t>::iterator>
+        pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+            CaptureCurrentFileNumberInPendingOutputs()));
+
+    assert((bg_thread_pri == Env::Priority::BOTTOM &&
+            bg_bottom_compaction_scheduled_) ||
+           (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
+    Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
+                                    prepicked_compaction, bg_thread_pri);
+    TEST_SYNC_POINT("BackgroundCallCompaction:1");
+    if (s.IsBusy()) {
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      immutable_db_options_.clock->SleepForMicroseconds(
+          10000);  // prevent hot loop
+      mutex_.Lock();
+    } else if (!s.ok() && !s.IsShutdownInProgress() &&
+               !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+      // Wait a little bit before retrying background compaction in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed compactions for the duration of
+      // the problem.
+      uint64_t error_cnt =
+          default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      log_buffer.FlushBufferToLog();
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Waiting after background compaction error: %s, "
+                      "Accumulated background error counts: %" PRIu64,
+                      s.ToString().c_str(), error_cnt);
+      LogFlush(immutable_db_options_.info_log);
+      immutable_db_options_.clock->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    } else if (s.IsManualCompactionPaused()) {
+      assert(prepicked_compaction);
+      ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
+      assert(m);
+      ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
+                       m->cfd->GetName().c_str(), job_context.job_id);
+    }
+
+    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+    // If compaction failed, we want to delete all temporary files that we
+    // might have created (they might not be all recorded in job_context in
+    // case of a failure). Thus, we force full scan in FindObsoleteFiles()
+    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+                                        !s.IsManualCompactionPaused() &&
+                                        !s.IsColumnFamilyDropped() &&
+                                        !s.IsBusy());
+    TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
+
+    // delete unnecessary files if any, this is done outside the mutex
+    if (job_context.HaveSomethingToClean() ||
+        job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+      mutex_.Unlock();
+      // Have to flush the info logs before bg_compaction_scheduled_--
+      // because if bg_flush_scheduled_ becomes 0 and the lock is
+      // released, the deconstructor of DB can kick in and destroy all the
+      // states of DB so info_log might not be available after that point.
+      // It also applies to access other states that DB owns.
+      log_buffer.FlushBufferToLog();
+      if (job_context.HaveSomethingToDelete()) {
+        PurgeObsoleteFiles(job_context);
+        TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles");
+      }
+      job_context.Clean();
+      mutex_.Lock();
+    }
+
+    assert(num_running_compactions_ > 0);
+    num_running_compactions_--;
+
+    if (bg_thread_pri == Env::Priority::LOW) {
+      bg_compaction_scheduled_--;
+    } else {
+      assert(bg_thread_pri == Env::Priority::BOTTOM);
+      bg_bottom_compaction_scheduled_--;
+    }
+
+    // See if there's more work to be done
+    MaybeScheduleFlushOrCompaction();
+
+    if (prepicked_compaction != nullptr &&
+        prepicked_compaction->task_token != nullptr) {
+      // Releasing task tokens affects (and asserts on) the DB state, so
+      // must be done before we potentially signal the DB close process to
+      // proceed below.
+      prepicked_compaction->task_token.reset();
+    }
+
+    if (made_progress ||
+        (bg_compaction_scheduled_ == 0 &&
+         bg_bottom_compaction_scheduled_ == 0) ||
+        HasPendingManualCompaction() || unscheduled_compactions_ == 0) {
+      // signal if
+      // * made_progress -- need to wakeup DelayWrite
+      // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
+      // * HasPendingManualCompaction -- need to wakeup RunManualCompaction
+      // If none of this is true, there is no need to signal since nobody is
+      // waiting for it
+      bg_cv_.SignalAll();
+    }
+    // IMPORTANT: there should be no code after calling SignalAll. This call may
+    // signal the DB destructor that it's OK to proceed with destruction. In
+    // that case, all DB variables will be dealloacated and referencing them
+    // will cause trouble.
+  }
+}
+
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+                                    JobContext* job_context,
+                                    LogBuffer* log_buffer,
+                                    PrepickedCompaction* prepicked_compaction,
+                                    Env::Priority thread_pri) {
+  ManualCompactionState* manual_compaction =
+      prepicked_compaction == nullptr
+          ? nullptr
+          : prepicked_compaction->manual_compaction_state;
+  *made_progress = false;
+  mutex_.AssertHeld();
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start");
+
+  bool is_manual = (manual_compaction != nullptr);
+  std::unique_ptr<Compaction> c;
+  if (prepicked_compaction != nullptr &&
+      prepicked_compaction->compaction != nullptr) {
+    c.reset(prepicked_compaction->compaction);
+  }
+  bool is_prepicked = is_manual || c;
+
+  // (manual_compaction->in_progress == false);
+  bool trivial_move_disallowed =
+      is_manual && manual_compaction->disallow_trivial_move;
+
+  CompactionJobStats compaction_job_stats;
+  Status status;
+  if (!error_handler_.IsBGWorkStopped()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      status = Status::ShutdownInProgress();
+    } else if (is_manual &&
+               manual_compaction->canceled.load(std::memory_order_acquire)) {
+      status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    }
+  } else {
+    status = error_handler_.GetBGError();
+    // If we get here, it means a hard error happened after this compaction
+    // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got
+    // a chance to execute. Since we didn't pop a cfd from the compaction
+    // queue, increment unscheduled_compactions_
+    unscheduled_compactions_++;
+  }
+
+  if (!status.ok()) {
+    if (is_manual) {
+      manual_compaction->status = status;
+      manual_compaction->done = true;
+      manual_compaction->in_progress = false;
+      manual_compaction = nullptr;
+    }
+    if (c) {
+      c->ReleaseCompactionFiles(status);
+      c.reset();
+    }
+    return status;
+  }
+
+  if (is_manual) {
+    // another thread cannot pick up the same work
+    manual_compaction->in_progress = true;
+  }
+
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress");
+
+  std::unique_ptr<TaskLimiterToken> task_token;
+
+  // InternalKey manual_end_storage;
+  // InternalKey* manual_end = &manual_end_storage;
+  bool sfm_reserved_compact_space = false;
+  if (is_manual) {
+    ManualCompactionState* m = manual_compaction;
+    assert(m->in_progress);
+    if (!c) {
+      m->done = true;
+      m->manual_end = nullptr;
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Manual compaction from level-%d from %s .. "
+          "%s; nothing to do\n",
+          m->cfd->GetName().c_str(), m->input_level,
+          (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+          (m->end ? m->end->DebugString(true).c_str() : "(end)"));
+    } else {
+      // First check if we have enough room to do the compaction
+      bool enough_room = EnoughRoomForCompaction(
+          m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+      if (!enough_room) {
+        // Then don't do the compaction
+        c->ReleaseCompactionFiles(status);
+        c.reset();
+        // m's vars will get set properly at the end of this function,
+        // as long as status == CompactionTooLarge
+        status = Status::CompactionTooLarge();
+      } else {
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Manual compaction from level-%d to level-%d from %s .. "
+            "%s; will stop at %s\n",
+            m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+            (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+            (m->end ? m->end->DebugString(true).c_str() : "(end)"),
+            ((m->done || m->manual_end == nullptr)
+                 ? "(end)"
+                 : m->manual_end->DebugString(true).c_str()));
+      }
+    }
+  } else if (!is_prepicked && !compaction_queue_.empty()) {
+    if (HasExclusiveManualCompaction()) {
+      // Can't compact right now, but try again later
+      TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
+
+      // Stay in the compaction queue.
+      unscheduled_compactions_++;
+
+      return Status::OK();
+    }
+
+    auto cfd = PickCompactionFromQueue(&task_token, log_buffer);
+    if (cfd == nullptr) {
+      // Can't find any executable task from the compaction queue.
+      // All tasks have been throttled by compaction thread limiter.
+      ++unscheduled_compactions_;
+      return Status::Busy();
+    }
+
+    // We unreference here because the following code will take a Ref() on
+    // this cfd if it is going to use it (Compaction class holds a
+    // reference).
+    // This will all happen under a mutex so we don't have to be afraid of
+    // somebody else deleting it.
+    if (cfd->UnrefAndTryDelete()) {
+      // This was the last reference of the column family, so no need to
+      // compact.
+      return Status::OK();
+    }
+
+    // Pick up latest mutable CF Options and use it throughout the
+    // compaction job
+    // Compaction makes a copy of the latest MutableCFOptions. It should be used
+    // throughout the compaction procedure to make sure consistency. It will
+    // eventually be installed into SuperVersion
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) {
+      // NOTE: try to avoid unnecessary copy of MutableCFOptions if
+      // compaction is not necessary. Need to make sure mutex is held
+      // until we make a copy in the following code
+      TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
+      c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_,
+                                  log_buffer));
+      TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+
+      if (c != nullptr) {
+        bool enough_room = EnoughRoomForCompaction(
+            cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+        if (!enough_room) {
+          // Then don't do the compaction
+          c->ReleaseCompactionFiles(status);
+          c->column_family_data()
+              ->current()
+              ->storage_info()
+              ->ComputeCompactionScore(*(c->immutable_options()),
+                                       *(c->mutable_cf_options()));
+          AddToCompactionQueue(cfd);
+          ++unscheduled_compactions_;
+
+          c.reset();
+          // Don't need to sleep here, because BackgroundCallCompaction
+          // will sleep if !s.ok()
+          status = Status::CompactionTooLarge();
+        } else {
+          // update statistics
+          size_t num_files = 0;
+          for (auto& each_level : *c->inputs()) {
+            num_files += each_level.files.size();
+          }
+          RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
+
+          // There are three things that can change compaction score:
+          // 1) When flush or compaction finish. This case is covered by
+          // InstallSuperVersionAndScheduleWork
+          // 2) When MutableCFOptions changes. This case is also covered by
+          // InstallSuperVersionAndScheduleWork, because this is when the new
+          // options take effect.
+          // 3) When we Pick a new compaction, we "remove" those files being
+          // compacted from the calculation, which then influences compaction
+          // score. Here we check if we need the new compaction even without the
+          // files that are currently being compacted. If we need another
+          // compaction, we might be able to execute it in parallel, so we add
+          // it to the queue and schedule a new thread.
+          if (cfd->NeedsCompaction()) {
+            // Yes, we need more compactions!
+            AddToCompactionQueue(cfd);
+            ++unscheduled_compactions_;
+            MaybeScheduleFlushOrCompaction();
+          }
+        }
+      }
+    }
+  }
+
+  IOStatus io_s;
+  if (!c) {
+    // Nothing to do
+    ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
+  } else if (c->deletion_compaction()) {
+    // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+    // file if there is alive snapshot pointing to it
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+                             c->column_family_data());
+    assert(c->num_input_files(1) == 0);
+    assert(c->column_family_data()->ioptions()->compaction_style ==
+           kCompactionStyleFIFO);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+                            compaction_job_stats, job_context->job_id);
+
+    for (const auto& f : *c->inputs(0)) {
+      c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+    }
+    status = versions_->LogAndApply(c->column_family_data(),
+                                    *c->mutable_cf_options(), c->edit(),
+                                    &mutex_, directories_.GetDbDir());
+    io_s = versions_->io_status();
+    InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                       &job_context->superversion_contexts[0],
+                                       *c->mutable_cf_options());
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
+                     c->column_family_data()->GetName().c_str(),
+                     c->num_input_files(0));
+    *made_progress = true;
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+                             c->column_family_data());
+  } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+                             c->column_family_data());
+    // Instrument for event update
+    // TODO(yhchiang): add op details for showing trivial-move.
+    ThreadStatusUtil::SetColumnFamily(
+        c->column_family_data(), c->column_family_data()->ioptions()->env,
+        immutable_db_options_.enable_thread_tracking);
+    ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+                            compaction_job_stats, job_context->job_id);
+
+    // Move files to next level
+    int32_t moved_files = 0;
+    int64_t moved_bytes = 0;
+    for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+      if (c->level(l) == c->output_level()) {
+        continue;
+      }
+      for (size_t i = 0; i < c->num_input_files(l); i++) {
+        FileMetaData* f = c->input(l, i);
+        c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
+        c->edit()->AddFile(
+            c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+            f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
+            f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
+            f->oldest_blob_file_number, f->oldest_ancester_time,
+            f->file_creation_time, f->file_checksum, f->file_checksum_func_name,
+            f->unique_id);
+
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+            c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+            c->output_level(), f->fd.GetFileSize());
+        ++moved_files;
+        moved_bytes += f->fd.GetFileSize();
+      }
+    }
+    if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize &&
+        c->immutable_options()->compaction_pri == kRoundRobin) {
+      int start_level = c->start_level();
+      if (start_level > 0) {
+        auto vstorage = c->input_version()->storage_info();
+        c->edit()->AddCompactCursor(
+            start_level,
+            vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
+      }
+    }
+    status = versions_->LogAndApply(c->column_family_data(),
+                                    *c->mutable_cf_options(), c->edit(),
+                                    &mutex_, directories_.GetDbDir());
+    io_s = versions_->io_status();
+    // Use latest MutableCFOptions
+    InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                       &job_context->superversion_contexts[0],
+                                       *c->mutable_cf_options());
+
+    VersionStorageInfo::LevelSummaryStorage tmp;
+    c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
+                                                             moved_bytes);
+    {
+      event_logger_.LogToBuffer(log_buffer)
+          << "job" << job_context->job_id << "event"
+          << "trivial_move"
+          << "destination_level" << c->output_level() << "files" << moved_files
+          << "total_files_size" << moved_bytes;
+    }
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+        c->column_family_data()->GetName().c_str(), moved_files,
+        c->output_level(), moved_bytes, status.ToString().c_str(),
+        c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
+    *made_progress = true;
+
+    // Clear Instrument
+    ThreadStatusUtil::ResetThreadStatus();
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+                             c->column_family_data());
+  } else if (!is_prepicked && c->output_level() > 0 &&
+             c->output_level() ==
+                 c->column_family_data()
+                     ->current()
+                     ->storage_info()
+                     ->MaxOutputLevel(
+                         immutable_db_options_.allow_ingest_behind) &&
+             env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+    // Forward compactions involving last level to the bottom pool if it exists,
+    // such that compactions unlikely to contribute to write stalls can be
+    // delayed or deprioritized.
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool");
+    CompactionArg* ca = new CompactionArg;
+    ca->db = this;
+    ca->compaction_pri_ = Env::Priority::BOTTOM;
+    ca->prepicked_compaction = new PrepickedCompaction;
+    ca->prepicked_compaction->compaction = c.release();
+    ca->prepicked_compaction->manual_compaction_state = nullptr;
+    // Transfer requested token, so it doesn't need to do it again.
+    ca->prepicked_compaction->task_token = std::move(task_token);
+    ++bg_bottom_compaction_scheduled_;
+    env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM,
+                   this, &DBImpl::UnscheduleCompactionCallback);
+  } else {
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+                             c->column_family_data());
+    int output_level __attribute__((__unused__));
+    output_level = c->output_level();
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
+                             &output_level);
+    std::vector<SequenceNumber> snapshot_seqs;
+    SequenceNumber earliest_write_conflict_snapshot;
+    SnapshotChecker* snapshot_checker;
+    GetSnapshotContext(job_context, &snapshot_seqs,
+                       &earliest_write_conflict_snapshot, &snapshot_checker);
+    assert(is_snapshot_supported_ || snapshots_.empty());
+
+    CompactionJob compaction_job(
+        job_context->job_id, c.get(), immutable_db_options_,
+        mutable_db_options_, file_options_for_compaction_, versions_.get(),
+        &shutting_down_, log_buffer, directories_.GetDbDir(),
+        GetDataDir(c->column_family_data(), c->output_path_id()),
+        GetDataDir(c->column_family_data(), 0), stats_, &mutex_,
+        &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
+        snapshot_checker, job_context, table_cache_, &event_logger_,
+        c->mutable_cf_options()->paranoid_file_checks,
+        c->mutable_cf_options()->report_bg_io_stats, dbname_,
+        &compaction_job_stats, thread_pri, io_tracer_,
+        is_manual ? manual_compaction->canceled
+                  : kManualCompactionCanceledFalse_,
+        db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
+        c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
+        &bg_bottom_compaction_scheduled_);
+    compaction_job.Prepare();
+
+    NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+                            compaction_job_stats, job_context->job_id);
+    mutex_.Unlock();
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+    // Should handle erorr?
+    compaction_job.Run().PermitUncheckedError();
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
+    mutex_.Lock();
+
+    status = compaction_job.Install(*c->mutable_cf_options());
+    io_s = compaction_job.io_status();
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(c->column_family_data(),
+                                         &job_context->superversion_contexts[0],
+                                         *c->mutable_cf_options());
+    }
+    *made_progress = true;
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+                             c->column_family_data());
+  }
+
+  if (status.ok() && !io_s.ok()) {
+    status = io_s;
+  } else {
+    io_s.PermitUncheckedError();
+  }
+
+  if (c != nullptr) {
+    c->ReleaseCompactionFiles(status);
+    *made_progress = true;
+
+#ifndef ROCKSDB_LITE
+    // Need to make sure SstFileManager does its bookkeeping
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    if (sfm && sfm_reserved_compact_space) {
+      sfm->OnCompactionCompletion(c.get());
+    }
+#endif  // ROCKSDB_LITE
+
+    NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status,
+                                compaction_job_stats, job_context->job_id);
+  }
+
+  if (status.ok() || status.IsCompactionTooLarge() ||
+      status.IsManualCompactionPaused()) {
+    // Done
+  } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
+                   status.ToString().c_str());
+    if (!io_s.ok()) {
+      // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try write to a new MANIFEST.
+      // TODO: distinguish between MANIFEST write and CURRENT renaming
+      auto err_reason = versions_->io_status().ok()
+                            ? BackgroundErrorReason::kCompaction
+                            : BackgroundErrorReason::kManifestWrite;
+      error_handler_.SetBGError(io_s, err_reason);
+    } else {
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+    if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) {
+      // Put this cfd back in the compaction queue so we can retry after some
+      // time
+      auto cfd = c->column_family_data();
+      assert(cfd != nullptr);
+      // Since this compaction failed, we need to recompute the score so it
+      // takes the original input files into account
+      c->column_family_data()
+          ->current()
+          ->storage_info()
+          ->ComputeCompactionScore(*(c->immutable_options()),
+                                   *(c->mutable_cf_options()));
+      if (!cfd->queued_for_compaction()) {
+        AddToCompactionQueue(cfd);
+        ++unscheduled_compactions_;
+      }
+    }
+  }
+  // this will unref its input_version and column_family_data
+  c.reset();
+
+  if (is_manual) {
+    ManualCompactionState* m = manual_compaction;
+    if (!status.ok()) {
+      m->status = status;
+      m->done = true;
+    }
+    // For universal compaction:
+    //   Because universal compaction always happens at level 0, so one
+    //   compaction will pick up all overlapped files. No files will be
+    //   filtered out due to size limit and left for a successive compaction.
+    //   So we can safely conclude the current compaction.
+    //
+    //   Also note that, if we don't stop here, then the current compaction
+    //   writes a new file back to level 0, which will be used in successive
+    //   compaction. Hence the manual compaction will never finish.
+    //
+    // Stop the compaction if manual_end points to nullptr -- this means
+    // that we compacted the whole range. manual_end should always point
+    // to nullptr in case of universal compaction
+    if (m->manual_end == nullptr) {
+      m->done = true;
+    }
+    if (!m->done) {
+      // We only compacted part of the requested range.  Update *m
+      // to the range that is left to be compacted.
+      // Universal and FIFO compactions should always compact the whole range
+      assert(m->cfd->ioptions()->compaction_style !=
+                 kCompactionStyleUniversal ||
+             m->cfd->ioptions()->num_levels > 1);
+      assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
+      m->tmp_storage = *m->manual_end;
+      m->begin = &m->tmp_storage;
+      m->incomplete = true;
+    }
+    m->in_progress = false;  // not being processed anymore
+  }
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish");
+  return status;
+}
+
+bool DBImpl::HasPendingManualCompaction() {
+  return (!manual_compaction_dequeue_.empty());
+}
+
+void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) {
+  assert(manual_compaction_paused_ == 0);
+  manual_compaction_dequeue_.push_back(m);
+}
+
+void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) {
+  // Remove from queue
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if (m == (*it)) {
+      it = manual_compaction_dequeue_.erase(it);
+      return;
+    }
+    ++it;
+  }
+  assert(false);
+  return;
+}
+
+bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) {
+  if (num_running_ingest_file_ > 0) {
+    // We need to wait for other IngestExternalFile() calls to finish
+    // before running a manual compaction.
+    return true;
+  }
+  if (m->exclusive) {
+    return (bg_bottom_compaction_scheduled_ > 0 ||
+            bg_compaction_scheduled_ > 0);
+  }
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  bool seen = false;
+  while (it != manual_compaction_dequeue_.end()) {
+    if (m == (*it)) {
+      ++it;
+      seen = true;
+      continue;
+    } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
+      // Consider the other manual compaction *it, conflicts if:
+      // overlaps with m
+      // and (*it) is ahead in the queue and is not yet in progress
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
+  // Remove from priority queue
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if ((*it)->exclusive) {
+      return true;
+    }
+    if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
+      // Allow automatic compaction if manual compaction is
+      // in progress
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+bool DBImpl::HasExclusiveManualCompaction() {
+  // Remove from priority queue
+  std::deque<ManualCompactionState*>::iterator it =
+      manual_compaction_dequeue_.begin();
+  while (it != manual_compaction_dequeue_.end()) {
+    if ((*it)->exclusive) {
+      return true;
+    }
+    ++it;
+  }
+  return false;
+}
+
+bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
+  if ((m->exclusive) || (m1->exclusive)) {
+    return true;
+  }
+  if (m->cfd != m1->cfd) {
+    return false;
+  }
+  return false;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::BuildCompactionJobInfo(
+    const ColumnFamilyData* cfd, Compaction* c, const Status& st,
+    const CompactionJobStats& compaction_job_stats, const int job_id,
+    const Version* current, CompactionJobInfo* compaction_job_info) const {
+  assert(compaction_job_info != nullptr);
+  compaction_job_info->cf_id = cfd->GetID();
+  compaction_job_info->cf_name = cfd->GetName();
+  compaction_job_info->status = st;
+  compaction_job_info->thread_id = env_->GetThreadID();
+  compaction_job_info->job_id = job_id;
+  compaction_job_info->base_input_level = c->start_level();
+  compaction_job_info->output_level = c->output_level();
+  compaction_job_info->stats = compaction_job_stats;
+  compaction_job_info->table_properties = c->GetOutputTableProperties();
+  compaction_job_info->compaction_reason = c->compaction_reason();
+  compaction_job_info->compression = c->output_compression();
+  for (size_t i = 0; i < c->num_input_levels(); ++i) {
+    for (const auto fmd : *c->inputs(i)) {
+      const FileDescriptor& desc = fmd->fd;
+      const uint64_t file_number = desc.GetNumber();
+      auto fn = TableFileName(c->immutable_options()->cf_paths, file_number,
+                              desc.GetPathId());
+      compaction_job_info->input_files.push_back(fn);
+      compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
+          static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+      if (compaction_job_info->table_properties.count(fn) == 0) {
+        std::shared_ptr<const TableProperties> tp;
+        auto s = current->GetTableProperties(&tp, fmd, &fn);
+        if (s.ok()) {
+          compaction_job_info->table_properties[fn] = tp;
+        }
+      }
+    }
+  }
+  for (const auto& newf : c->edit()->GetNewFiles()) {
+    const FileMetaData& meta = newf.second;
+    const FileDescriptor& desc = meta.fd;
+    const uint64_t file_number = desc.GetNumber();
+    compaction_job_info->output_files.push_back(TableFileName(
+        c->immutable_options()->cf_paths, file_number, desc.GetPathId()));
+    compaction_job_info->output_file_infos.push_back(CompactionFileInfo{
+        newf.first, file_number, meta.oldest_blob_file_number});
+  }
+  compaction_job_info->blob_compression_type =
+      c->mutable_cf_options()->blob_compression_type;
+
+  // Update BlobFilesInfo.
+  for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+    BlobFileAdditionInfo blob_file_addition_info(
+        BlobFileName(c->immutable_options()->cf_paths.front().path,
+                     blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+        blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(),
+        blob_file.GetTotalBlobBytes());
+    compaction_job_info->blob_file_addition_infos.emplace_back(
+        std::move(blob_file_addition_info));
+  }
+
+  // Update BlobFilesGarbageInfo.
+  for (const auto& blob_file : c->edit()->GetBlobFileGarbages()) {
+    BlobFileGarbageInfo blob_file_garbage_info(
+        BlobFileName(c->immutable_options()->cf_paths.front().path,
+                     blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+        blob_file.GetBlobFileNumber(), blob_file.GetGarbageBlobCount(),
+        blob_file.GetGarbageBlobBytes());
+    compaction_job_info->blob_file_garbage_infos.emplace_back(
+        std::move(blob_file_garbage_info));
+  }
+}
+#endif
+
+// SuperVersionContext gets created and destructed outside of the lock --
+// we use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same sv_context, we can't reuse the SuperVersion() that got
+// malloced because
+// first call already used it. In that rare case, we take a hit and create a
+// new SuperVersion() inside of the mutex. We do similar thing
+// for superversion_to_free
+
+void DBImpl::InstallSuperVersionAndScheduleWork(
+    ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+    const MutableCFOptions& mutable_cf_options) {
+  mutex_.AssertHeld();
+
+  // Update max_total_in_memory_state_
+  size_t old_memtable_size = 0;
+  auto* old_sv = cfd->GetSuperVersion();
+  if (old_sv) {
+    old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+                        old_sv->mutable_cf_options.max_write_buffer_number;
+  }
+
+  // this branch is unlikely to step in
+  if (UNLIKELY(sv_context->new_superversion == nullptr)) {
+    sv_context->NewSuperVersion();
+  }
+  cfd->InstallSuperVersion(sv_context, mutable_cf_options);
+
+  // There may be a small data race here. The snapshot tricking bottommost
+  // compaction may already be released here. But assuming there will always be
+  // newer snapshot created and released frequently, the compaction will be
+  // triggered soon anyway.
+  bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+  for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+    if (!my_cfd->ioptions()->allow_ingest_behind) {
+      bottommost_files_mark_threshold_ = std::min(
+          bottommost_files_mark_threshold_,
+          my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+    }
+  }
+
+  // Whenever we install new SuperVersion, we might need to issue new flushes or
+  // compactions.
+  SchedulePendingCompaction(cfd);
+  MaybeScheduleFlushOrCompaction();
+
+  // Update max_total_in_memory_state_
+  max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
+                               mutable_cf_options.write_buffer_size *
+                                   mutable_cf_options.max_write_buffer_number;
+}
+
+// ShouldPurge is called by FindObsoleteFiles when doing a full scan,
+// and db mutex (mutex_) should already be held.
+// Actually, the current implementation of FindObsoleteFiles with
+// full_scan=true can issue I/O requests to obtain list of files in
+// directories, e.g. env_->getChildren while holding db mutex.
+bool DBImpl::ShouldPurge(uint64_t file_number) const {
+  return files_grabbed_for_purge_.find(file_number) ==
+             files_grabbed_for_purge_.end() &&
+         purge_files_.find(file_number) == purge_files_.end();
+}
+
+// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
+// (mutex_) should already be held.
+void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
+  files_grabbed_for_purge_.insert(file_number);
+}
+
+void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
+  InstrumentedMutexLock l(&mutex_);
+  // snapshot_checker_ should only set once. If we need to set it multiple
+  // times, we need to make sure the old one is not deleted while it is still
+  // using by a compaction job.
+  assert(!snapshot_checker_);
+  snapshot_checker_.reset(snapshot_checker);
+}
+
+void DBImpl::GetSnapshotContext(
+    JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
+    SequenceNumber* earliest_write_conflict_snapshot,
+    SnapshotChecker** snapshot_checker_ptr) {
+  mutex_.AssertHeld();
+  assert(job_context != nullptr);
+  assert(snapshot_seqs != nullptr);
+  assert(earliest_write_conflict_snapshot != nullptr);
+  assert(snapshot_checker_ptr != nullptr);
+
+  *snapshot_checker_ptr = snapshot_checker_.get();
+  if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
+    *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+  }
+  if (*snapshot_checker_ptr != nullptr) {
+    // If snapshot_checker is used, that means the flush/compaction may
+    // contain values not visible to snapshot taken after
+    // flush/compaction job starts. Take a snapshot and it will appear
+    // in snapshot_seqs and force compaction iterator to consider such
+    // snapshots.
+    const Snapshot* job_snapshot =
+        GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
+    job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
+  }
+  *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+}
+
+Status DBImpl::WaitForCompact(bool wait_unscheduled) {
+  // Wait until the compaction completes
+  InstrumentedMutexLock l(&mutex_);
+  while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+          bg_flush_scheduled_ ||
+          (wait_unscheduled && unscheduled_compactions_)) &&
+         (error_handler_.GetBGError().ok())) {
+    bg_cv_.Wait();
+  }
+  return error_handler_.GetBGError();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_debug.cc b/src/rocksdb/db/db_impl/db_impl_debug.cc
new file mode 100644
index 000000000..7054b0669
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_debug.cc
@@ -0,0 +1,312 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef NDEBUG
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+  InstrumentedMutexLock l(&mutex_);
+  return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
+}
+
+Status DBImpl::TEST_SwitchWAL() {
+  WriteContext write_context;
+  InstrumentedMutexLock l(&mutex_);
+  void* writer = TEST_BeginWrite();
+  auto s = SwitchWAL(&write_context);
+  TEST_EndWrite(writer);
+  return s;
+}
+
+uint64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+    ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+  InstrumentedMutexLock l(&mutex_);
+  return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+    ColumnFamilyHandle* column_family,
+    std::vector<std::vector<FileMetaData>>* metadata,
+    std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata) {
+  assert(metadata);
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  assert(cfh);
+
+  auto cfd = cfh->cfd();
+  assert(cfd);
+
+  InstrumentedMutexLock l(&mutex_);
+
+  const auto* current = cfd->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  metadata->resize(NumberLevels());
+
+  for (int level = 0; level < NumberLevels(); ++level) {
+    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(level);
+
+    (*metadata)[level].clear();
+    (*metadata)[level].reserve(files.size());
+
+    for (const auto& f : files) {
+      (*metadata)[level].push_back(*f);
+    }
+  }
+
+  if (blob_metadata) {
+    *blob_metadata = vstorage->GetBlobFiles();
+  }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+  return versions_->manifest_file_number();
+}
+
+uint64_t DBImpl::TEST_Current_Next_FileNo() {
+  return versions_->current_next_file_number();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+                                 const Slice* end,
+                                 ColumnFamilyHandle* column_family,
+                                 bool disallow_trivial_move) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+  int output_level =
+      (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+       cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
+          ? level
+          : level + 1;
+  return RunManualCompaction(
+      cfd, level, output_level, CompactRangeOptions(), begin, end, true,
+      disallow_trivial_move,
+      std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+      "" /*trim_ts*/);
+}
+
+Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
+  WriteContext write_context;
+  InstrumentedMutexLock l(&mutex_);
+  if (cfd == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  }
+
+  Status s;
+  void* writer = TEST_BeginWrite();
+  if (two_write_queues_) {
+    WriteThread::Writer nonmem_w;
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+    s = SwitchMemtable(cfd, &write_context);
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  } else {
+    s = SwitchMemtable(cfd, &write_context);
+  }
+  TEST_EndWrite(writer);
+  return s;
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
+                                  ColumnFamilyHandle* cfh) {
+  FlushOptions fo;
+  fo.wait = wait;
+  fo.allow_write_stall = allow_write_stall;
+  ColumnFamilyData* cfd;
+  if (cfh == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+    cfd = cfhi->cfd();
+  }
+  return FlushMemTable(cfd, fo, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
+                                  const FlushOptions& flush_opts) {
+  return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_AtomicFlushMemTables(
+    const autovector<ColumnFamilyData*>& cfds, const FlushOptions& flush_opts) {
+  return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_WaitForBackgroundWork() {
+  InstrumentedMutexLock l(&mutex_);
+  WaitForBackgroundWork();
+  return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+  ColumnFamilyData* cfd;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    cfd = cfh->cfd();
+  }
+  return WaitForFlushMemTable(cfd, nullptr, false);
+}
+
+Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) {
+  // Wait until the compaction completes
+  return WaitForCompact(wait_unscheduled);
+}
+
+Status DBImpl::TEST_WaitForPurge() {
+  InstrumentedMutexLock l(&mutex_);
+  while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) {
+    bg_cv_.Wait();
+  }
+  return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_GetBGError() {
+  InstrumentedMutexLock l(&mutex_);
+  return error_handler_.GetBGError();
+}
+
+void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
+
+void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
+
+void* DBImpl::TEST_BeginWrite() {
+  auto w = new WriteThread::Writer();
+  write_thread_.EnterUnbatched(w, &mutex_);
+  return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+  auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+  write_thread_.ExitUnbatched(writer);
+  delete writer;
+}
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+  InstrumentedMutexLock l(&log_write_mutex_);
+  return logs_to_free_.size();
+}
+
+uint64_t DBImpl::TEST_LogfileNumber() {
+  InstrumentedMutexLock l(&mutex_);
+  return logfile_number_;
+}
+
+Status DBImpl::TEST_GetAllImmutableCFOptions(
+    std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
+  std::vector<std::string> cf_names;
+  std::vector<const ImmutableCFOptions*> iopts;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      cf_names.push_back(cfd->GetName());
+      iopts.push_back(cfd->ioptions());
+    }
+  }
+  iopts_map->clear();
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    iopts_map->insert({cf_names[i], iopts[i]});
+  }
+
+  return Status::OK();
+}
+
+uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
+  return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+}
+
+size_t DBImpl::TEST_PreparedSectionCompletedSize() {
+  return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
+}
+
+size_t DBImpl::TEST_LogsWithPrepSize() {
+  return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
+}
+
+uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
+  autovector<MemTable*> empty_list;
+  return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
+}
+
+Status DBImpl::TEST_GetLatestMutableCFOptions(
+    ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
+  InstrumentedMutexLock l(&mutex_);
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
+  return Status::OK();
+}
+
+int DBImpl::TEST_BGCompactionsAllowed() const {
+  InstrumentedMutexLock l(&mutex_);
+  return GetBGJobLimits().max_compactions;
+}
+
+int DBImpl::TEST_BGFlushesAllowed() const {
+  InstrumentedMutexLock l(&mutex_);
+  return GetBGJobLimits().max_flushes;
+}
+
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+  if (last_seq_same_as_publish_seq_) {
+    return versions_->LastSequence();
+  } else {
+    return versions_->LastAllocatedSequence();
+  }
+}
+
+size_t DBImpl::TEST_GetWalPreallocateBlockSize(
+    uint64_t write_buffer_size) const {
+  InstrumentedMutexLock l(&mutex_);
+  return GetWalPreallocateBlockSize(write_buffer_size);
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::TEST_WaitForPeridicTaskRun(std::function<void()> callback) const {
+  periodic_task_scheduler_.TEST_WaitForRun(callback);
+}
+
+const PeriodicTaskScheduler& DBImpl::TEST_GetPeriodicTaskScheduler() const {
+  return periodic_task_scheduler_;
+}
+
+SeqnoToTimeMapping DBImpl::TEST_GetSeqnoToTimeMapping() const {
+  InstrumentedMutexLock l(&mutex_);
+  return seqno_time_mapping_;
+}
+
+#endif  // !ROCKSDB_LITE
+
+size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
+  return EstimateInMemoryStatsHistorySize();
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // NDEBUG
diff --git a/src/rocksdb/db/db_impl/db_impl_experimental.cc b/src/rocksdb/db/db_impl/db_impl_experimental.cc
new file mode 100644
index 000000000..c1b1e4137
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_experimental.cc
@@ -0,0 +1,158 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "rocksdb/status.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+                                   const Slice* begin, const Slice* end) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  InternalKey start_key, end_key;
+  if (begin != nullptr) {
+    start_key.SetMinPossibleForUserKey(*begin);
+  }
+  if (end != nullptr) {
+    end_key.SetMaxPossibleForUserKey(*end);
+  }
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto vstorage = cfd->current()->storage_info();
+    for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+      std::vector<FileMetaData*> inputs;
+      vstorage->GetOverlappingInputs(
+          level, begin == nullptr ? nullptr : &start_key,
+          end == nullptr ? nullptr : &end_key, &inputs);
+      for (auto f : inputs) {
+        f->marked_for_compaction = true;
+      }
+    }
+    // Since we have some more files to compact, we should also recompute
+    // compaction score
+    vstorage->ComputeCompactionScore(*cfd->ioptions(),
+                                     *cfd->GetLatestMutableCFOptions());
+    SchedulePendingCompaction(cfd);
+    MaybeScheduleFlushOrCompaction();
+  }
+  return Status::OK();
+}
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+  assert(column_family);
+
+  if (target_level < 1) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+    return Status::InvalidArgument("Invalid target level");
+  }
+
+  Status status;
+  VersionEdit edit;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+    const auto* vstorage = cfd->current()->storage_info();
+
+    if (target_level >= vstorage->num_levels()) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "PromoteL0 FAILED. Target level %d does not exist\n",
+                     target_level);
+      job_context.Clean();
+      status = Status::InvalidArgument("Target level does not exist");
+      return status;
+    }
+
+    // Sort L0 files by range.
+    const InternalKeyComparator* icmp = &cfd->internal_comparator();
+    auto l0_files = vstorage->LevelFiles(0);
+    std::sort(l0_files.begin(), l0_files.end(),
+              [icmp](FileMetaData* f1, FileMetaData* f2) {
+                return icmp->Compare(f1->largest, f2->largest) < 0;
+              });
+
+    // Check that no L0 file is being compacted and that they have
+    // non-overlapping ranges.
+    for (size_t i = 0; i < l0_files.size(); ++i) {
+      auto f = l0_files[i];
+      if (f->being_compacted) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "PromoteL0 FAILED. File %" PRIu64 " being compacted\n",
+                       f->fd.GetNumber());
+        job_context.Clean();
+        status =
+            Status::InvalidArgument("PromoteL0 called during L0 compaction");
+        return status;
+      }
+
+      if (i == 0) continue;
+      auto prev_f = l0_files[i - 1];
+      if (icmp->Compare(prev_f->largest, f->smallest) >= 0) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64
+                       " have overlapping ranges\n",
+                       prev_f->fd.GetNumber(), f->fd.GetNumber());
+        job_context.Clean();
+        status = Status::InvalidArgument("L0 has overlapping files");
+        return status;
+      }
+    }
+
+    // Check that all levels up to target_level are empty.
+    for (int level = 1; level <= target_level; ++level) {
+      if (vstorage->NumLevelFiles(level) > 0) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "PromoteL0 FAILED. Level %d not empty\n", level);
+        job_context.Clean();
+        status = Status::InvalidArgument(
+            "All levels up to target_level "
+            "must be empty");
+        return status;
+      }
+    }
+
+    edit.SetColumnFamily(cfd->GetID());
+    for (const auto& f : l0_files) {
+      edit.DeleteFile(0, f->fd.GetNumber());
+      edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
+                   f->fd.GetFileSize(), f->smallest, f->largest,
+                   f->fd.smallest_seqno, f->fd.largest_seqno,
+                   f->marked_for_compaction, f->temperature,
+                   f->oldest_blob_file_number, f->oldest_ancester_time,
+                   f->file_creation_time, f->file_checksum,
+                   f->file_checksum_func_name, f->unique_id);
+    }
+
+    status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                    &edit, &mutex_, directories_.GetDbDir());
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(cfd,
+                                         &job_context.superversion_contexts[0],
+                                         *cfd->GetLatestMutableCFOptions());
+    }
+  }  // lock released here
+  LogFlush(immutable_db_options_.info_log);
+  job_context.Clean();
+
+  return status;
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_files.cc b/src/rocksdb/db/db_impl/db_impl_files.cc
new file mode 100644
index 000000000..058df4da7
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_files.cc
@@ -0,0 +1,1013 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+#include <set>
+#include <unordered_set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "db/memtable_list.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "util/autovector.h"
+#include "util/defer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t DBImpl::MinLogNumberToKeep() {
+  return versions_->min_log_number_to_keep();
+}
+
+uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
+  mutex_.AssertHeld();
+  if (!pending_outputs_.empty()) {
+    return *pending_outputs_.begin();
+  }
+  return std::numeric_limits<uint64_t>::max();
+}
+
+Status DBImpl::DisableFileDeletions() {
+  Status s;
+  int my_disable_delete_obsolete_files;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    s = DisableFileDeletionsWithLock();
+    my_disable_delete_obsolete_files = disable_delete_obsolete_files_;
+  }
+  if (my_disable_delete_obsolete_files == 1) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled");
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "File Deletions Disabled, but already disabled. Counter: %d",
+                   my_disable_delete_obsolete_files);
+  }
+  return s;
+}
+
+// FIXME: can be inconsistent with DisableFileDeletions in cases like
+// DBImplReadOnly
+Status DBImpl::DisableFileDeletionsWithLock() {
+  mutex_.AssertHeld();
+  ++disable_delete_obsolete_files_;
+  return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions(bool force) {
+  // Job id == 0 means that this is not our background process, but rather
+  // user thread
+  JobContext job_context(0);
+  int saved_counter;  // initialize on all paths
+  {
+    InstrumentedMutexLock l(&mutex_);
+    if (force) {
+      // if force, we need to enable file deletions right away
+      disable_delete_obsolete_files_ = 0;
+    } else if (disable_delete_obsolete_files_ > 0) {
+      --disable_delete_obsolete_files_;
+    }
+    saved_counter = disable_delete_obsolete_files_;
+    if (saved_counter == 0) {
+      FindObsoleteFiles(&job_context, true);
+      bg_cv_.SignalAll();
+    }
+  }
+  if (saved_counter == 0) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled");
+    if (job_context.HaveSomethingToDelete()) {
+      PurgeObsoleteFiles(job_context);
+    }
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "File Deletions Enable, but not really enabled. Counter: %d",
+                   saved_counter);
+  }
+  job_context.Clean();
+  LogFlush(immutable_db_options_.info_log);
+  return Status::OK();
+}
+
+bool DBImpl::IsFileDeletionsEnabled() const {
+  return 0 == disable_delete_obsolete_files_;
+}
+
+// * Returns the list of live files in 'sst_live' and 'blob_live'.
+// If it's doing full scan:
+// * Returns the list of all files in the filesystem in
+// 'full_scan_candidate_files'.
+// Otherwise, gets obsolete files from VersionSet.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except every
+//  mutable_db_options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
+                               bool no_full_scan) {
+  mutex_.AssertHeld();
+
+  // if deletion is disabled, do nothing
+  if (disable_delete_obsolete_files_ > 0) {
+    return;
+  }
+
+  bool doing_the_full_scan = false;
+
+  // logic for figuring out if we're doing the full scan
+  if (no_full_scan) {
+    doing_the_full_scan = false;
+  } else if (force ||
+             mutable_db_options_.delete_obsolete_files_period_micros == 0) {
+    doing_the_full_scan = true;
+  } else {
+    const uint64_t now_micros = immutable_db_options_.clock->NowMicros();
+    if ((delete_obsolete_files_last_run_ +
+         mutable_db_options_.delete_obsolete_files_period_micros) <
+        now_micros) {
+      doing_the_full_scan = true;
+      delete_obsolete_files_last_run_ = now_micros;
+    }
+  }
+
+  // don't delete files that might be currently written to from compaction
+  // threads
+  // Since job_context->min_pending_output is set, until file scan finishes,
+  // mutex_ cannot be released. Otherwise, we might see no min_pending_output
+  // here but later find newer generated unfinalized files while scanning.
+  job_context->min_pending_output = MinObsoleteSstNumberToKeep();
+
+  // Get obsolete files.  This function will also update the list of
+  // pending files in VersionSet().
+  versions_->GetObsoleteFiles(
+      &job_context->sst_delete_files, &job_context->blob_delete_files,
+      &job_context->manifest_delete_files, job_context->min_pending_output);
+
+  // Mark the elements in job_context->sst_delete_files and
+  // job_context->blob_delete_files as "grabbed for purge" so that other threads
+  // calling FindObsoleteFiles with full_scan=true will not add these files to
+  // candidate list for purge.
+  for (const auto& sst_to_del : job_context->sst_delete_files) {
+    MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber());
+  }
+
+  for (const auto& blob_file : job_context->blob_delete_files) {
+    MarkAsGrabbedForPurge(blob_file.GetBlobFileNumber());
+  }
+
+  // store the current filenum, lognum, etc
+  job_context->manifest_file_number = versions_->manifest_file_number();
+  job_context->pending_manifest_file_number =
+      versions_->pending_manifest_file_number();
+  job_context->log_number = MinLogNumberToKeep();
+  job_context->prev_log_number = versions_->prev_log_number();
+
+  if (doing_the_full_scan) {
+    versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live);
+    InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+                                  dbname_);
+    std::set<std::string> paths;
+    for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
+         path_id++) {
+      paths.insert(immutable_db_options_.db_paths[path_id].path);
+    }
+
+    // Note that if cf_paths is not specified in the ColumnFamilyOptions
+    // of a particular column family, we use db_paths as the cf_paths
+    // setting. Hence, there can be multiple duplicates of files from db_paths
+    // in the following code. The duplicate are removed while identifying
+    // unique files in PurgeObsoleteFiles.
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size();
+           path_id++) {
+        auto& path = cfd->ioptions()->cf_paths[path_id].path;
+
+        if (paths.find(path) == paths.end()) {
+          paths.insert(path);
+        }
+      }
+    }
+
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    for (auto& path : paths) {
+      // set of all files in the directory. We'll exclude files that are still
+      // alive in the subsequent processings.
+      std::vector<std::string> files;
+      Status s = immutable_db_options_.fs->GetChildren(
+          path, io_opts, &files, /*IODebugContext*=*/nullptr);
+      s.PermitUncheckedError();  // TODO: What should we do on error?
+      for (const std::string& file : files) {
+        uint64_t number;
+        FileType type;
+        // 1. If we cannot parse the file name, we skip;
+        // 2. If the file with file_number equals number has already been
+        // grabbed for purge by another compaction job, or it has already been
+        // schedule for purge, we also skip it if we
+        // are doing full scan in order to avoid double deletion of the same
+        // file under race conditions. See
+        // https://github.com/facebook/rocksdb/issues/3573
+        if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) ||
+            !ShouldPurge(number)) {
+          continue;
+        }
+
+        // TODO(icanadi) clean up this mess to avoid having one-off "/"
+        // prefixes
+        job_context->full_scan_candidate_files.emplace_back("/" + file, path);
+      }
+    }
+
+    // Add log files in wal_dir
+    if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) {
+      std::vector<std::string> log_files;
+      Status s = immutable_db_options_.fs->GetChildren(
+          immutable_db_options_.wal_dir, io_opts, &log_files,
+          /*IODebugContext*=*/nullptr);
+      s.PermitUncheckedError();  // TODO: What should we do on error?
+      for (const std::string& log_file : log_files) {
+        job_context->full_scan_candidate_files.emplace_back(
+            log_file, immutable_db_options_.wal_dir);
+      }
+    }
+
+    // Add info log files in db_log_dir
+    if (!immutable_db_options_.db_log_dir.empty() &&
+        immutable_db_options_.db_log_dir != dbname_) {
+      std::vector<std::string> info_log_files;
+      Status s = immutable_db_options_.fs->GetChildren(
+          immutable_db_options_.db_log_dir, io_opts, &info_log_files,
+          /*IODebugContext*=*/nullptr);
+      s.PermitUncheckedError();  // TODO: What should we do on error?
+      for (std::string& log_file : info_log_files) {
+        job_context->full_scan_candidate_files.emplace_back(
+            log_file, immutable_db_options_.db_log_dir);
+      }
+    }
+  } else {
+    // Instead of filling ob_context->sst_live and job_context->blob_live,
+    // directly remove files that show up in any Version. This is because
+    // candidate files tend to be a small percentage of all files, so it is
+    // usually cheaper to check them against every version, compared to
+    // building a map for all files.
+    versions_->RemoveLiveFiles(job_context->sst_delete_files,
+                               job_context->blob_delete_files);
+  }
+
+  // Before potentially releasing mutex and waiting on condvar, increment
+  // pending_purge_obsolete_files_ so that another thread executing
+  // `GetSortedWals` will wait until this thread finishes execution since the
+  // other thread will be waiting for `pending_purge_obsolete_files_`.
+  // pending_purge_obsolete_files_ MUST be decremented if there is nothing to
+  // delete.
+  ++pending_purge_obsolete_files_;
+
+  Defer cleanup([job_context, this]() {
+    assert(job_context != nullptr);
+    if (!job_context->HaveSomethingToDelete()) {
+      mutex_.AssertHeld();
+      --pending_purge_obsolete_files_;
+    }
+  });
+
+  // logs_ is empty when called during recovery, in which case there can't yet
+  // be any tracked obsolete logs
+  log_write_mutex_.Lock();
+
+  if (alive_log_files_.empty() || logs_.empty()) {
+    mutex_.AssertHeld();
+    // We may reach here if the db is DBImplSecondary
+    log_write_mutex_.Unlock();
+    return;
+  }
+
+  if (!alive_log_files_.empty() && !logs_.empty()) {
+    uint64_t min_log_number = job_context->log_number;
+    size_t num_alive_log_files = alive_log_files_.size();
+    // find newly obsoleted log files
+    while (alive_log_files_.begin()->number < min_log_number) {
+      auto& earliest = *alive_log_files_.begin();
+      if (immutable_db_options_.recycle_log_file_num >
+          log_recycle_files_.size()) {
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "adding log %" PRIu64 " to recycle list\n",
+                       earliest.number);
+        log_recycle_files_.push_back(earliest.number);
+      } else {
+        job_context->log_delete_files.push_back(earliest.number);
+      }
+      if (job_context->size_log_to_delete == 0) {
+        job_context->prev_total_log_size = total_log_size_;
+        job_context->num_alive_log_files = num_alive_log_files;
+      }
+      job_context->size_log_to_delete += earliest.size;
+      total_log_size_ -= earliest.size;
+      alive_log_files_.pop_front();
+
+      // Current log should always stay alive since it can't have
+      // number < MinLogNumber().
+      assert(alive_log_files_.size());
+    }
+    log_write_mutex_.Unlock();
+    mutex_.Unlock();
+    log_write_mutex_.Lock();
+    while (!logs_.empty() && logs_.front().number < min_log_number) {
+      auto& log = logs_.front();
+      if (log.IsSyncing()) {
+        log_sync_cv_.Wait();
+        // logs_ could have changed while we were waiting.
+        continue;
+      }
+      logs_to_free_.push_back(log.ReleaseWriter());
+      logs_.pop_front();
+    }
+    // Current log cannot be obsolete.
+    assert(!logs_.empty());
+  }
+
+  // We're just cleaning up for DB::Write().
+  assert(job_context->logs_to_free.empty());
+  job_context->logs_to_free = logs_to_free_;
+
+  logs_to_free_.clear();
+  log_write_mutex_.Unlock();
+  mutex_.Lock();
+  job_context->log_recycle_files.assign(log_recycle_files_.begin(),
+                                        log_recycle_files_.end());
+}
+
+// Delete obsolete files and log status and information of file deletion
+void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+                                    const std::string& path_to_sync,
+                                    FileType type, uint64_t number) {
+  TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion",
+                           const_cast<std::string*>(&fname));
+
+  Status file_deletion_status;
+  if (type == kTableFile || type == kBlobFile || type == kWalFile) {
+    // Rate limit WAL deletion only if its in the DB dir
+    file_deletion_status = DeleteDBFile(
+        &immutable_db_options_, fname, path_to_sync,
+        /*force_bg=*/false,
+        /*force_fg=*/(type == kWalFile) ? !wal_in_db_path_ : false);
+  } else {
+    file_deletion_status = env_->DeleteFile(fname);
+  }
+  TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion",
+                           &file_deletion_status);
+  if (file_deletion_status.ok()) {
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
+                    fname.c_str(), type, number,
+                    file_deletion_status.ToString().c_str());
+  } else if (env_->FileExists(fname).IsNotFound()) {
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log,
+        "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
+        " -- %s\n",
+        job_id, fname.c_str(), type, number,
+        file_deletion_status.ToString().c_str());
+  } else {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
+                    job_id, fname.c_str(), type, number,
+                    file_deletion_status.ToString().c_str());
+  }
+  if (type == kTableFile) {
+    EventHelpers::LogAndNotifyTableFileDeletion(
+        &event_logger_, job_id, number, fname, file_deletion_status, GetName(),
+        immutable_db_options_.listeners);
+  }
+  if (type == kBlobFile) {
+    EventHelpers::LogAndNotifyBlobFileDeletion(
+        &event_logger_, immutable_db_options_.listeners, job_id, number, fname,
+        file_deletion_status, GetName());
+  }
+}
+
+// Diffs the files listed in filenames and those that do not
+// belong to live files are possibly removed. Also, removes all the
+// files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
+  TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin");
+  // we'd better have sth to delete
+  assert(state.HaveSomethingToDelete());
+
+  // FindObsoleteFiles() should've populated this so nonzero
+  assert(state.manifest_file_number != 0);
+
+  // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow.
+  std::unordered_set<uint64_t> sst_live_set(state.sst_live.begin(),
+                                            state.sst_live.end());
+  std::unordered_set<uint64_t> blob_live_set(state.blob_live.begin(),
+                                             state.blob_live.end());
+  std::unordered_set<uint64_t> log_recycle_files_set(
+      state.log_recycle_files.begin(), state.log_recycle_files.end());
+
+  auto candidate_files = state.full_scan_candidate_files;
+  candidate_files.reserve(
+      candidate_files.size() + state.sst_delete_files.size() +
+      state.blob_delete_files.size() + state.log_delete_files.size() +
+      state.manifest_delete_files.size());
+  // We may ignore the dbname when generating the file names.
+  for (auto& file : state.sst_delete_files) {
+    if (!file.only_delete_metadata) {
+      candidate_files.emplace_back(
+          MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
+    }
+    if (file.metadata->table_reader_handle) {
+      table_cache_->Release(file.metadata->table_reader_handle);
+    }
+    file.DeleteMetadata();
+  }
+
+  for (const auto& blob_file : state.blob_delete_files) {
+    candidate_files.emplace_back(BlobFileName(blob_file.GetBlobFileNumber()),
+                                 blob_file.GetPath());
+  }
+
+  auto wal_dir = immutable_db_options_.GetWalDir();
+  for (auto file_num : state.log_delete_files) {
+    if (file_num > 0) {
+      candidate_files.emplace_back(LogFileName(file_num), wal_dir);
+    }
+  }
+  for (const auto& filename : state.manifest_delete_files) {
+    candidate_files.emplace_back(filename, dbname_);
+  }
+
+  // dedup state.candidate_files so we don't try to delete the same
+  // file twice
+  std::sort(candidate_files.begin(), candidate_files.end(),
+            [](const JobContext::CandidateFileInfo& lhs,
+               const JobContext::CandidateFileInfo& rhs) {
+              if (lhs.file_name > rhs.file_name) {
+                return true;
+              } else if (lhs.file_name < rhs.file_name) {
+                return false;
+              } else {
+                return (lhs.file_path > rhs.file_path);
+              }
+            });
+  candidate_files.erase(
+      std::unique(candidate_files.begin(), candidate_files.end()),
+      candidate_files.end());
+
+  if (state.prev_total_log_size > 0) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "[JOB %d] Try to delete WAL files size %" PRIu64
+                   ", prev total WAL file size %" PRIu64
+                   ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
+                   state.job_id, state.size_log_to_delete,
+                   state.prev_total_log_size, state.num_alive_log_files);
+  }
+
+  std::vector<std::string> old_info_log_files;
+  InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+                                dbname_);
+
+  // File numbers of most recent two OPTIONS file in candidate_files (found in
+  // previos FindObsoleteFiles(full_scan=true))
+  // At this point, there must not be any duplicate file numbers in
+  // candidate_files.
+  uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
+  uint64_t optsfile_num2 = std::numeric_limits<uint64_t>::min();
+  for (const auto& candidate_file : candidate_files) {
+    const std::string& fname = candidate_file.file_name;
+    uint64_t number;
+    FileType type;
+    if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) ||
+        type != kOptionsFile) {
+      continue;
+    }
+    if (number > optsfile_num1) {
+      optsfile_num2 = optsfile_num1;
+      optsfile_num1 = number;
+    } else if (number > optsfile_num2) {
+      optsfile_num2 = number;
+    }
+  }
+
+  // Close WALs before trying to delete them.
+  for (const auto w : state.logs_to_free) {
+    // TODO: maybe check the return value of Close.
+    auto s = w->Close();
+    s.PermitUncheckedError();
+  }
+
+  bool own_files = OwnTablesAndLogs();
+  std::unordered_set<uint64_t> files_to_del;
+  for (const auto& candidate_file : candidate_files) {
+    const std::string& to_delete = candidate_file.file_name;
+    uint64_t number;
+    FileType type;
+    // Ignore file if we cannot recognize it.
+    if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
+      continue;
+    }
+
+    bool keep = true;
+    switch (type) {
+      case kWalFile:
+        keep = ((number >= state.log_number) ||
+                (number == state.prev_log_number) ||
+                (log_recycle_files_set.find(number) !=
+                 log_recycle_files_set.end()));
+        break;
+      case kDescriptorFile:
+        // Keep my manifest file, and any newer incarnations'
+        // (can happen during manifest roll)
+        keep = (number >= state.manifest_file_number);
+        break;
+      case kTableFile:
+        // If the second condition is not there, this makes
+        // DontDeletePendingOutputs fail
+        keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+               number >= state.min_pending_output;
+        if (!keep) {
+          files_to_del.insert(number);
+        }
+        break;
+      case kBlobFile:
+        keep = number >= state.min_pending_output ||
+               (blob_live_set.find(number) != blob_live_set.end());
+        if (!keep) {
+          files_to_del.insert(number);
+        }
+        break;
+      case kTempFile:
+        // Any temp files that are currently being written to must
+        // be recorded in pending_outputs_, which is inserted into "live".
+        // Also, SetCurrentFile creates a temp file when writing out new
+        // manifest, which is equal to state.pending_manifest_file_number. We
+        // should not delete that file
+        //
+        // TODO(yhchiang): carefully modify the third condition to safely
+        //                 remove the temp options files.
+        keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+               (blob_live_set.find(number) != blob_live_set.end()) ||
+               (number == state.pending_manifest_file_number) ||
+               (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
+        break;
+      case kInfoLogFile:
+        keep = true;
+        if (number != 0) {
+          old_info_log_files.push_back(to_delete);
+        }
+        break;
+      case kOptionsFile:
+        keep = (number >= optsfile_num2);
+        break;
+      case kCurrentFile:
+      case kDBLockFile:
+      case kIdentityFile:
+      case kMetaDatabase:
+        keep = true;
+        break;
+    }
+
+    if (keep) {
+      continue;
+    }
+
+    std::string fname;
+    std::string dir_to_sync;
+    if (type == kTableFile) {
+      // evict from cache
+      TableCache::Evict(table_cache_.get(), number);
+      fname = MakeTableFileName(candidate_file.file_path, number);
+      dir_to_sync = candidate_file.file_path;
+    } else if (type == kBlobFile) {
+      fname = BlobFileName(candidate_file.file_path, number);
+      dir_to_sync = candidate_file.file_path;
+    } else {
+      dir_to_sync = (type == kWalFile) ? wal_dir : dbname_;
+      fname = dir_to_sync +
+              ((!dir_to_sync.empty() && dir_to_sync.back() == '/') ||
+                       (!to_delete.empty() && to_delete.front() == '/')
+                   ? ""
+                   : "/") +
+              to_delete;
+    }
+
+#ifndef ROCKSDB_LITE
+    if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 ||
+                             immutable_db_options_.WAL_size_limit_MB > 0)) {
+      wal_manager_.ArchiveWALFile(fname, number);
+      continue;
+    }
+#endif  // !ROCKSDB_LITE
+
+    // If I do not own these files, e.g. secondary instance with max_open_files
+    // = -1, then no need to delete or schedule delete these files since they
+    // will be removed by their owner, e.g. the primary instance.
+    if (!own_files) {
+      continue;
+    }
+    if (schedule_only) {
+      InstrumentedMutexLock guard_lock(&mutex_);
+      SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id);
+    } else {
+      DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number);
+    }
+  }
+
+  {
+    // After purging obsolete files, remove them from files_grabbed_for_purge_.
+    InstrumentedMutexLock guard_lock(&mutex_);
+    autovector<uint64_t> to_be_removed;
+    for (auto fn : files_grabbed_for_purge_) {
+      if (files_to_del.count(fn) != 0) {
+        to_be_removed.emplace_back(fn);
+      }
+    }
+    for (auto fn : to_be_removed) {
+      files_grabbed_for_purge_.erase(fn);
+    }
+  }
+
+  // Delete old info log files.
+  size_t old_info_log_file_count = old_info_log_files.size();
+  if (old_info_log_file_count != 0 &&
+      old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
+    std::sort(old_info_log_files.begin(), old_info_log_files.end());
+    size_t end =
+        old_info_log_file_count - immutable_db_options_.keep_log_file_num;
+    for (unsigned int i = 0; i <= end; i++) {
+      std::string& to_delete = old_info_log_files.at(i);
+      std::string full_path_to_delete =
+          (immutable_db_options_.db_log_dir.empty()
+               ? dbname_
+               : immutable_db_options_.db_log_dir) +
+          "/" + to_delete;
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "[JOB %d] Delete info log file %s\n", state.job_id,
+                     full_path_to_delete.c_str());
+      Status s = env_->DeleteFile(full_path_to_delete);
+      if (!s.ok()) {
+        if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+          ROCKS_LOG_INFO(
+              immutable_db_options_.info_log,
+              "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+              "-- %s\n",
+              state.job_id, to_delete.c_str(), s.ToString().c_str());
+        } else {
+          ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                          "[JOB %d] Delete info log file %s FAILED -- %s\n",
+                          state.job_id, to_delete.c_str(),
+                          s.ToString().c_str());
+        }
+      }
+    }
+  }
+#ifndef ROCKSDB_LITE
+  wal_manager_.PurgeObsoleteWALFiles();
+#endif  // ROCKSDB_LITE
+  LogFlush(immutable_db_options_.info_log);
+  InstrumentedMutexLock l(&mutex_);
+  --pending_purge_obsolete_files_;
+  assert(pending_purge_obsolete_files_ >= 0);
+  if (schedule_only) {
+    // Must change from pending_purge_obsolete_files_ to bg_purge_scheduled_
+    // while holding mutex (for GetSortedWalFiles() etc.)
+    SchedulePurge();
+  }
+  if (pending_purge_obsolete_files_ == 0) {
+    bg_cv_.SignalAll();
+  }
+  TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+  mutex_.AssertHeld();
+  JobContext job_context(next_job_id_.fetch_add(1));
+  FindObsoleteFiles(&job_context, true);
+
+  mutex_.Unlock();
+  if (job_context.HaveSomethingToDelete()) {
+    bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io;
+    PurgeObsoleteFiles(job_context, defer_purge);
+  }
+  job_context.Clean();
+  mutex_.Lock();
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
+  uint64_t min_log = 0;
+
+  // we must look through the memtables for two phase transactions
+  // that have been committed but not yet flushed
+  std::unordered_set<MemTable*> memtables_to_flush_set(
+      memtables_to_flush.begin(), memtables_to_flush.end());
+  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+    if (loop_cfd->IsDropped()) {
+      continue;
+    }
+
+    auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+        &memtables_to_flush_set);
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+
+    log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+  }
+
+  return min_log;
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush) {
+  uint64_t min_log = 0;
+
+  std::unordered_set<MemTable*> memtables_to_flush_set;
+  for (const autovector<MemTable*>* memtables : memtables_to_flush) {
+    memtables_to_flush_set.insert(memtables->begin(), memtables->end());
+  }
+  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+    if (loop_cfd->IsDropped()) {
+      continue;
+    }
+
+    auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+        &memtables_to_flush_set);
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+
+    log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+  }
+
+  return min_log;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list) {
+  assert(vset != nullptr);
+
+  // Precompute the min log number containing unflushed data for the column
+  // family being flushed (`cfd_to_flush`).
+  uint64_t cf_min_log_number_to_keep = 0;
+  for (auto& e : edit_list) {
+    if (e->HasLogNumber()) {
+      cf_min_log_number_to_keep =
+          std::max(cf_min_log_number_to_keep, e->GetLogNumber());
+    }
+  }
+  if (cf_min_log_number_to_keep == 0) {
+    // No version edit contains information on log number. The log number
+    // for this column family should stay the same as it is.
+    cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+  }
+
+  // Get min log number containing unflushed data for other column families.
+  uint64_t min_log_number_to_keep =
+      vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+  if (cf_min_log_number_to_keep != 0) {
+    min_log_number_to_keep =
+        std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+  }
+  return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists) {
+  assert(vset != nullptr);
+  assert(!cfds_to_flush.empty());
+  assert(cfds_to_flush.size() == edit_lists.size());
+
+  uint64_t min_log_number_to_keep = std::numeric_limits<uint64_t>::max();
+  for (const auto& edit_list : edit_lists) {
+    uint64_t log = 0;
+    for (const auto& e : edit_list) {
+      if (e->HasLogNumber()) {
+        log = std::max(log, e->GetLogNumber());
+      }
+    }
+    if (log != 0) {
+      min_log_number_to_keep = std::min(min_log_number_to_keep, log);
+    }
+  }
+  if (min_log_number_to_keep == std::numeric_limits<uint64_t>::max()) {
+    min_log_number_to_keep = cfds_to_flush[0]->GetLogNumber();
+    for (size_t i = 1; i < cfds_to_flush.size(); i++) {
+      min_log_number_to_keep =
+          std::min(min_log_number_to_keep, cfds_to_flush[i]->GetLogNumber());
+    }
+  }
+
+  std::unordered_set<const ColumnFamilyData*> flushed_cfds(
+      cfds_to_flush.begin(), cfds_to_flush.end());
+  min_log_number_to_keep =
+      std::min(min_log_number_to_keep,
+               vset->PreComputeMinLogNumberWithUnflushedData(flushed_cfds));
+
+  return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    const autovector<VersionEdit*>& edit_list,
+    const autovector<MemTable*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker) {
+  assert(vset != nullptr);
+  assert(prep_tracker != nullptr);
+  // Calculate updated min_log_number_to_keep
+  // Since the function should only be called in 2pc mode, log number in
+  // the version edit should be sufficient.
+
+  uint64_t min_log_number_to_keep =
+      PrecomputeMinLogNumberToKeepNon2PC(vset, cfd_to_flush, edit_list);
+
+  // if are 2pc we must consider logs containing prepared
+  // sections of outstanding transactions.
+  //
+  // We must check min logs with outstanding prep before we check
+  // logs references by memtables because a log referenced by the
+  // first data structure could transition to the second under us.
+  //
+  // TODO: iterating over all column families under db mutex.
+  // should find more optimal solution
+  auto min_log_in_prep_heap =
+      prep_tracker->FindMinLogContainingOutstandingPrep();
+
+  if (min_log_in_prep_heap != 0 &&
+      min_log_in_prep_heap < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_in_prep_heap;
+  }
+
+  uint64_t min_log_refed_by_mem =
+      FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+  if (min_log_refed_by_mem != 0 &&
+      min_log_refed_by_mem < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_refed_by_mem;
+  }
+  return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker) {
+  assert(vset != nullptr);
+  assert(prep_tracker != nullptr);
+  assert(cfds_to_flush.size() == edit_lists.size());
+  assert(cfds_to_flush.size() == memtables_to_flush.size());
+
+  uint64_t min_log_number_to_keep =
+      PrecomputeMinLogNumberToKeepNon2PC(vset, cfds_to_flush, edit_lists);
+
+  uint64_t min_log_in_prep_heap =
+      prep_tracker->FindMinLogContainingOutstandingPrep();
+
+  if (min_log_in_prep_heap != 0 &&
+      min_log_in_prep_heap < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_in_prep_heap;
+  }
+
+  uint64_t min_log_refed_by_mem =
+      FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+  if (min_log_refed_by_mem != 0 &&
+      min_log_refed_by_mem < min_log_number_to_keep) {
+    min_log_number_to_keep = min_log_refed_by_mem;
+  }
+
+  return min_log_number_to_keep;
+}
+
+void DBImpl::SetDBId(std::string&& id, bool read_only,
+                     RecoveryContext* recovery_ctx) {
+  assert(db_id_.empty());
+  assert(!id.empty());
+  db_id_ = std::move(id);
+  if (!read_only && immutable_db_options_.write_dbid_to_manifest) {
+    assert(recovery_ctx != nullptr);
+    assert(versions_->GetColumnFamilySet() != nullptr);
+    VersionEdit edit;
+    edit.SetDBId(db_id_);
+    versions_->db_id_ = db_id_;
+    recovery_ctx->UpdateVersionEdits(
+        versions_->GetColumnFamilySet()->GetDefault(), edit);
+  }
+}
+
+Status DBImpl::SetupDBId(bool read_only, RecoveryContext* recovery_ctx) {
+  Status s;
+  // Check for the IDENTITY file and create it if not there or
+  // broken or not matching manifest
+  std::string db_id_in_file;
+  s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
+  if (s.ok()) {
+    s = GetDbIdentityFromIdentityFile(&db_id_in_file);
+    if (s.ok() && !db_id_in_file.empty()) {
+      if (db_id_.empty()) {
+        // Loaded from file and wasn't already known from manifest
+        SetDBId(std::move(db_id_in_file), read_only, recovery_ctx);
+        return s;
+      } else if (db_id_ == db_id_in_file) {
+        // Loaded from file and matches manifest
+        return s;
+      }
+    }
+  }
+  if (s.IsNotFound()) {
+    s = Status::OK();
+  }
+  if (!s.ok()) {
+    assert(s.IsIOError());
+    return s;
+  }
+  // Otherwise IDENTITY file is missing or no good.
+  // Generate new id if needed
+  if (db_id_.empty()) {
+    SetDBId(env_->GenerateUniqueId(), read_only, recovery_ctx);
+  }
+  // Persist it to IDENTITY file if allowed
+  if (!read_only) {
+    s = SetIdentityFile(env_, dbname_, db_id_);
+  }
+  return s;
+}
+
+Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) {
+  mutex_.AssertHeld();
+  std::vector<std::string> paths;
+  paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
+  for (const auto& db_path : immutable_db_options_.db_paths) {
+    paths.push_back(
+        NormalizePath(db_path.path + std::string(1, kFilePathSeparator)));
+  }
+  for (const auto* cfd : *versions_->GetColumnFamilySet()) {
+    for (const auto& cf_path : cfd->ioptions()->cf_paths) {
+      paths.push_back(
+          NormalizePath(cf_path.path + std::string(1, kFilePathSeparator)));
+    }
+  }
+  // Dedup paths
+  std::sort(paths.begin(), paths.end());
+  paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+
+  uint64_t next_file_number = versions_->current_next_file_number();
+  uint64_t largest_file_number = next_file_number;
+  Status s;
+  for (const auto& path : paths) {
+    std::vector<std::string> files;
+    s = env_->GetChildren(path, &files);
+    if (!s.ok()) {
+      break;
+    }
+    for (const auto& fname : files) {
+      uint64_t number = 0;
+      FileType type;
+      if (!ParseFileName(fname, &number, &type)) {
+        continue;
+      }
+      // path ends with '/' or '\\'
+      const std::string normalized_fpath = path + fname;
+      largest_file_number = std::max(largest_file_number, number);
+      if (type == kTableFile && number >= next_file_number &&
+          recovery_ctx->files_to_delete_.find(normalized_fpath) ==
+              recovery_ctx->files_to_delete_.end()) {
+        recovery_ctx->files_to_delete_.emplace(normalized_fpath);
+      }
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (largest_file_number >= next_file_number) {
+    versions_->next_file_number_.store(largest_file_number + 1);
+  }
+
+  VersionEdit edit;
+  edit.SetNextFile(versions_->next_file_number_.load());
+  assert(versions_->GetColumnFamilySet());
+  ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
+  assert(default_cfd);
+  recovery_ctx->UpdateVersionEdits(default_cfd, edit);
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_open.cc b/src/rocksdb/db/db_impl/db_impl_open.cc
new file mode 100644
index 000000000..40ffa2e85
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_open.cc
@@ -0,0 +1,2106 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/table.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/sync_point.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Options SanitizeOptions(const std::string& dbname, const Options& src,
+                        bool read_only, Status* logger_creation_s) {
+  auto db_options =
+      SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
+  ImmutableDBOptions immutable_db_options(db_options);
+  auto cf_options =
+      SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+  return Options(db_options, cf_options);
+}
+
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
+                          bool read_only, Status* logger_creation_s) {
+  DBOptions result(src);
+
+  if (result.env == nullptr) {
+    result.env = Env::Default();
+  }
+
+  // result.max_open_files means an "infinite" open files.
+  if (result.max_open_files != -1) {
+    int max_max_open_files = port::GetMaxOpenFiles();
+    if (max_max_open_files == -1) {
+      max_max_open_files = 0x400000;
+    }
+    ClipToRange(&result.max_open_files, 20, max_max_open_files);
+    TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles",
+                             &result.max_open_files);
+  }
+
+  if (result.info_log == nullptr && !read_only) {
+    Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
+    if (!s.ok()) {
+      // No place suitable for logging
+      result.info_log = nullptr;
+      if (logger_creation_s) {
+        *logger_creation_s = s;
+      }
+    }
+  }
+
+  if (!result.write_buffer_manager) {
+    result.write_buffer_manager.reset(
+        new WriteBufferManager(result.db_write_buffer_size));
+  }
+  auto bg_job_limits = DBImpl::GetBGJobLimits(
+      result.max_background_flushes, result.max_background_compactions,
+      result.max_background_jobs, true /* parallelize_compactions */);
+  result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
+                                           Env::Priority::LOW);
+  result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
+                                           Env::Priority::HIGH);
+
+  if (result.rate_limiter.get() != nullptr) {
+    if (result.bytes_per_sync == 0) {
+      result.bytes_per_sync = 1024 * 1024;
+    }
+  }
+
+  if (result.delayed_write_rate == 0) {
+    if (result.rate_limiter.get() != nullptr) {
+      result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond();
+    }
+    if (result.delayed_write_rate == 0) {
+      result.delayed_write_rate = 16 * 1024 * 1024;
+    }
+  }
+
+  if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
+    result.recycle_log_file_num = false;
+  }
+
+  if (result.recycle_log_file_num &&
+      (result.wal_recovery_mode ==
+           WALRecoveryMode::kTolerateCorruptedTailRecords ||
+       result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery ||
+       result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
+    // - kTolerateCorruptedTailRecords is inconsistent with recycle log file
+    //   feature. WAL recycling expects recovery success upon encountering a
+    //   corrupt record at the point where new data ends and recycled data
+    //   remains at the tail. However, `kTolerateCorruptedTailRecords` must fail
+    //   upon encountering any such corrupt record, as it cannot differentiate
+    //   between this and a real corruption, which would cause committed updates
+    //   to be truncated -- a violation of the recovery guarantee.
+    // - kPointInTimeRecovery and kAbsoluteConsistency are incompatible with
+    //   recycle log file feature temporarily due to a bug found introducing a
+    //   hole in the recovered data
+    //   (https://github.com/facebook/rocksdb/pull/7252#issuecomment-673766236).
+    //   Besides this bug, we believe the features are fundamentally compatible.
+    result.recycle_log_file_num = 0;
+  }
+
+  if (result.db_paths.size() == 0) {
+    result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
+  } else if (result.wal_dir.empty()) {
+    // Use dbname as default
+    result.wal_dir = dbname;
+  }
+  if (!result.wal_dir.empty()) {
+    // If there is a wal_dir already set, check to see if the wal_dir is the
+    // same as the dbname AND the same as the db_path[0] (which must exist from
+    // a few lines ago). If the wal_dir matches both of these values, then clear
+    // the wal_dir value, which will make wal_dir == dbname.  Most likely this
+    // condition was the result of reading an old options file where we forced
+    // wal_dir to be set (to dbname).
+    auto npath = NormalizePath(dbname + "/");
+    if (npath == NormalizePath(result.wal_dir + "/") &&
+        npath == NormalizePath(result.db_paths[0].path + "/")) {
+      result.wal_dir.clear();
+    }
+  }
+
+  if (!result.wal_dir.empty() && result.wal_dir.back() == '/') {
+    result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+  }
+
+  if (result.use_direct_reads && result.compaction_readahead_size == 0) {
+    TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
+    result.compaction_readahead_size = 1024 * 1024 * 2;
+  }
+
+  // Force flush on DB open if 2PC is enabled, since with 2PC we have no
+  // guarantee that consecutive log files have consecutive sequence id, which
+  // make recovery complicated.
+  if (result.allow_2pc) {
+    result.avoid_flush_during_recovery = false;
+  }
+
+#ifndef ROCKSDB_LITE
+  ImmutableDBOptions immutable_db_options(result);
+  if (!immutable_db_options.IsWalDirSameAsDBPath()) {
+    // Either the WAL dir and db_paths[0]/db_name are not the same, or we
+    // cannot tell for sure. In either case, assume they're different and
+    // explicitly cleanup the trash log files (bypass DeleteScheduler)
+    // Do this first so even if we end up calling
+    // DeleteScheduler::CleanupDirectory on the same dir later, it will be
+    // safe
+    std::vector<std::string> filenames;
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    auto wal_dir = immutable_db_options.GetWalDir();
+    Status s = immutable_db_options.fs->GetChildren(
+        wal_dir, io_opts, &filenames, /*IODebugContext*=*/nullptr);
+    s.PermitUncheckedError();  //**TODO: What to do on error?
+    for (std::string& filename : filenames) {
+      if (filename.find(".log.trash", filename.length() -
+                                          std::string(".log.trash").length()) !=
+          std::string::npos) {
+        std::string trash_file = wal_dir + "/" + filename;
+        result.env->DeleteFile(trash_file).PermitUncheckedError();
+      }
+    }
+  }
+  // When the DB is stopped, it's possible that there are some .trash files that
+  // were not deleted yet, when we open the DB we will find these .trash files
+  // and schedule them to be deleted (or delete immediately if SstFileManager
+  // was not used)
+  auto sfm = static_cast<SstFileManagerImpl*>(result.sst_file_manager.get());
+  for (size_t i = 0; i < result.db_paths.size(); i++) {
+    DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path)
+        .PermitUncheckedError();
+  }
+
+  // Create a default SstFileManager for purposes of tracking compaction size
+  // and facilitating recovery from out of space errors.
+  if (result.sst_file_manager.get() == nullptr) {
+    std::shared_ptr<SstFileManager> sst_file_manager(
+        NewSstFileManager(result.env, result.info_log));
+    result.sst_file_manager = sst_file_manager;
+  }
+#endif  // !ROCKSDB_LITE
+
+  // Supported wal compression types
+  if (!StreamingCompressionTypeSupported(result.wal_compression)) {
+    result.wal_compression = kNoCompression;
+    ROCKS_LOG_WARN(result.info_log,
+                   "wal_compression is disabled since only zstd is supported");
+  }
+
+  if (!result.paranoid_checks) {
+    result.skip_checking_sst_file_sizes_on_db_open = true;
+    ROCKS_LOG_INFO(result.info_log,
+                   "file size check will be skipped during open.");
+  }
+
+  return result;
+}
+
+namespace {
+Status ValidateOptionsByTable(
+    const DBOptions& db_opts,
+    const std::vector<ColumnFamilyDescriptor>& column_families) {
+  Status s;
+  for (auto cf : column_families) {
+    s = ValidateOptions(db_opts, cf.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
+Status DBImpl::ValidateOptions(
+    const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& column_families) {
+  Status s;
+  for (auto& cfd : column_families) {
+    s = ColumnFamilyData::ValidateOptions(db_options, cfd.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  s = ValidateOptions(db_options);
+  return s;
+}
+
+Status DBImpl::ValidateOptions(const DBOptions& db_options) {
+  if (db_options.db_paths.size() > 4) {
+    return Status::NotSupported(
+        "More than four DB paths are not supported yet. ");
+  }
+
+  if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
+    // Protect against assert in PosixMMapReadableFile constructor
+    return Status::NotSupported(
+        "If memory mapped reads (allow_mmap_reads) are enabled "
+        "then direct I/O reads (use_direct_reads) must be disabled. ");
+  }
+
+  if (db_options.allow_mmap_writes &&
+      db_options.use_direct_io_for_flush_and_compaction) {
+    return Status::NotSupported(
+        "If memory mapped writes (allow_mmap_writes) are enabled "
+        "then direct I/O writes (use_direct_io_for_flush_and_compaction) must "
+        "be disabled. ");
+  }
+
+  if (db_options.keep_log_file_num == 0) {
+    return Status::InvalidArgument("keep_log_file_num must be greater than 0");
+  }
+
+  if (db_options.unordered_write &&
+      !db_options.allow_concurrent_memtable_write) {
+    return Status::InvalidArgument(
+        "unordered_write is incompatible with "
+        "!allow_concurrent_memtable_write");
+  }
+
+  if (db_options.unordered_write && db_options.enable_pipelined_write) {
+    return Status::InvalidArgument(
+        "unordered_write is incompatible with enable_pipelined_write");
+  }
+
+  if (db_options.atomic_flush && db_options.enable_pipelined_write) {
+    return Status::InvalidArgument(
+        "atomic_flush is incompatible with enable_pipelined_write");
+  }
+
+  // TODO remove this restriction
+  if (db_options.atomic_flush && db_options.best_efforts_recovery) {
+    return Status::InvalidArgument(
+        "atomic_flush is currently incompatible with best-efforts recovery");
+  }
+
+  if (db_options.use_direct_io_for_flush_and_compaction &&
+      0 == db_options.writable_file_max_buffer_size) {
+    return Status::InvalidArgument(
+        "writes in direct IO require writable_file_max_buffer_size > 0");
+  }
+
+  return Status::OK();
+}
+
+Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
+  VersionEdit new_db;
+  Status s = SetIdentityFile(env_, dbname_);
+  if (!s.ok()) {
+    return s;
+  }
+  if (immutable_db_options_.write_dbid_to_manifest) {
+    std::string temp_db_id;
+    GetDbIdentityFromIdentityFile(&temp_db_id);
+    new_db.SetDBId(temp_db_id);
+  }
+  new_db.SetLogNumber(0);
+  new_db.SetNextFile(2);
+  new_db.SetLastSequence(0);
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  {
+    if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) {
+      fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
+    }
+    std::unique_ptr<FSWritableFile> file;
+    FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
+    s = NewWritableFile(fs_.get(), manifest, &file, file_options);
+    if (!s.ok()) {
+      return s;
+    }
+    FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+    file->SetPreallocationBlockSize(
+        immutable_db_options_.manifest_preallocation_size);
+    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+        std::move(file), manifest, file_options, immutable_db_options_.clock,
+        io_tracer_, nullptr /* stats */, immutable_db_options_.listeners,
+        nullptr, tmp_set.Contains(FileType::kDescriptorFile),
+        tmp_set.Contains(FileType::kDescriptorFile)));
+    log::Writer log(std::move(file_writer), 0, false);
+    std::string record;
+    new_db.EncodeTo(&record);
+    s = log.AddRecord(record);
+    if (s.ok()) {
+      s = SyncManifest(&immutable_db_options_, log.file());
+    }
+  }
+  if (s.ok()) {
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(fs_.get(), dbname_, 1, directories_.GetDbDir());
+    if (new_filenames) {
+      new_filenames->emplace_back(
+          manifest.substr(manifest.find_last_of("/\\") + 1));
+    }
+  } else {
+    fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
+  }
+  return s;
+}
+
+IOStatus DBImpl::CreateAndNewDirectory(
+    FileSystem* fs, const std::string& dirname,
+    std::unique_ptr<FSDirectory>* directory) {
+  // We call CreateDirIfMissing() as the directory may already exist (if we
+  // are reopening a DB), when this happens we don't want creating the
+  // directory to cause an error. However, we need to check if creating the
+  // directory fails or else we may get an obscure message about the lock
+  // file not existing. One real-world example of this occurring is if
+  // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+  // when dbname_ is "dir/db" but when "dir" doesn't exist.
+  IOStatus io_s = fs->CreateDirIfMissing(dirname, IOOptions(), nullptr);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  return fs->NewDirectory(dirname, IOOptions(), directory, nullptr);
+}
+
+IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
+                                     const std::string& wal_dir,
+                                     const std::vector<DbPath>& data_paths) {
+  IOStatus io_s = DBImpl::CreateAndNewDirectory(fs, dbname, &db_dir_);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  if (!wal_dir.empty() && dbname != wal_dir) {
+    io_s = DBImpl::CreateAndNewDirectory(fs, wal_dir, &wal_dir_);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  data_dirs_.clear();
+  for (auto& p : data_paths) {
+    const std::string db_path = p.path;
+    if (db_path == dbname) {
+      data_dirs_.emplace_back(nullptr);
+    } else {
+      std::unique_ptr<FSDirectory> path_directory;
+      io_s = DBImpl::CreateAndNewDirectory(fs, db_path, &path_directory);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      data_dirs_.emplace_back(path_directory.release());
+    }
+  }
+  assert(data_dirs_.size() == data_paths.size());
+  return IOStatus::OK();
+}
+
+Status DBImpl::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
+    uint64_t* recovered_seq, RecoveryContext* recovery_ctx) {
+  mutex_.AssertHeld();
+
+  bool is_new_db = false;
+  assert(db_lock_ == nullptr);
+  std::vector<std::string> files_in_dbname;
+  if (!read_only) {
+    Status s = directories_.SetDirectories(fs_.get(), dbname_,
+                                           immutable_db_options_.wal_dir,
+                                           immutable_db_options_.db_paths);
+    if (!s.ok()) {
+      return s;
+    }
+
+    s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    std::string current_fname = CurrentFileName(dbname_);
+    // Path to any MANIFEST file in the db dir. It does not matter which one.
+    // Since best-efforts recovery ignores CURRENT file, existence of a
+    // MANIFEST indicates the recovery to recover existing db. If no MANIFEST
+    // can be found, a new db will be created.
+    std::string manifest_path;
+    if (!immutable_db_options_.best_efforts_recovery) {
+      s = env_->FileExists(current_fname);
+    } else {
+      s = Status::NotFound();
+      IOOptions io_opts;
+      io_opts.do_not_recurse = true;
+      Status io_s = immutable_db_options_.fs->GetChildren(
+          dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+      if (!io_s.ok()) {
+        s = io_s;
+        files_in_dbname.clear();
+      }
+      for (const std::string& file : files_in_dbname) {
+        uint64_t number = 0;
+        FileType type = kWalFile;  // initialize
+        if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
+          uint64_t bytes;
+          s = env_->GetFileSize(DescriptorFileName(dbname_, number), &bytes);
+          if (s.ok() && bytes != 0) {
+            // Found non-empty MANIFEST (descriptor log), thus best-efforts
+            // recovery does not have to treat the db as empty.
+            manifest_path = dbname_ + "/" + file;
+            break;
+          }
+        }
+      }
+    }
+    if (s.IsNotFound()) {
+      if (immutable_db_options_.create_if_missing) {
+        s = NewDB(&files_in_dbname);
+        is_new_db = true;
+        if (!s.ok()) {
+          return s;
+        }
+      } else {
+        return Status::InvalidArgument(
+            current_fname, "does not exist (create_if_missing is false)");
+      }
+    } else if (s.ok()) {
+      if (immutable_db_options_.error_if_exists) {
+        return Status::InvalidArgument(dbname_,
+                                       "exists (error_if_exists is true)");
+      }
+    } else {
+      // Unexpected error reading file
+      assert(s.IsIOError());
+      return s;
+    }
+    // Verify compatibility of file_options_ and filesystem
+    {
+      std::unique_ptr<FSRandomAccessFile> idfile;
+      FileOptions customized_fs(file_options_);
+      customized_fs.use_direct_reads |=
+          immutable_db_options_.use_direct_io_for_flush_and_compaction;
+      const std::string& fname =
+          manifest_path.empty() ? current_fname : manifest_path;
+      s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+      if (!s.ok()) {
+        std::string error_str = s.ToString();
+        // Check if unsupported Direct I/O is the root cause
+        customized_fs.use_direct_reads = false;
+        s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+        if (s.ok()) {
+          return Status::InvalidArgument(
+              "Direct I/O is not supported by the specified DB.");
+        } else {
+          return Status::InvalidArgument(
+              "Found options incompatible with filesystem", error_str.c_str());
+        }
+      }
+    }
+  } else if (immutable_db_options_.best_efforts_recovery) {
+    assert(files_in_dbname.empty());
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    Status s = immutable_db_options_.fs->GetChildren(
+        dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+    if (s.IsNotFound()) {
+      return Status::InvalidArgument(dbname_,
+                                     "does not exist (open for read only)");
+    } else if (s.IsIOError()) {
+      return s;
+    }
+    assert(s.ok());
+  }
+  assert(db_id_.empty());
+  Status s;
+  bool missing_table_file = false;
+  if (!immutable_db_options_.best_efforts_recovery) {
+    s = versions_->Recover(column_families, read_only, &db_id_);
+  } else {
+    assert(!files_in_dbname.empty());
+    s = versions_->TryRecover(column_families, read_only, files_in_dbname,
+                              &db_id_, &missing_table_file);
+    if (s.ok()) {
+      // TryRecover may delete previous column_family_set_.
+      column_family_memtables_.reset(
+          new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  s = SetupDBId(read_only, recovery_ctx);
+  ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
+  if (s.ok() && !read_only) {
+    s = DeleteUnreferencedSstFiles(recovery_ctx);
+  }
+
+  if (immutable_db_options_.paranoid_checks && s.ok()) {
+    s = CheckConsistency();
+  }
+  if (s.ok() && !read_only) {
+    // TODO: share file descriptors (FSDirectory) with SetDirectories above
+    std::map<std::string, std::shared_ptr<FSDirectory>> created_dirs;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      s = cfd->AddDirectories(&created_dirs);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  std::vector<std::string> files_in_wal_dir;
+  if (s.ok()) {
+    // Initial max_total_in_memory_state_ before recovery wals. Log recovery
+    // may check this value to decide whether to flush.
+    max_total_in_memory_state_ = 0;
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+      max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+                                    mutable_cf_options->max_write_buffer_number;
+    }
+
+    SequenceNumber next_sequence(kMaxSequenceNumber);
+    default_cf_handle_ = new ColumnFamilyHandleImpl(
+        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+
+    // Recover from all newer log files than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that prev_log_number() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of rocksdb.
+    auto wal_dir = immutable_db_options_.GetWalDir();
+    if (!immutable_db_options_.best_efforts_recovery) {
+      IOOptions io_opts;
+      io_opts.do_not_recurse = true;
+      s = immutable_db_options_.fs->GetChildren(
+          wal_dir, io_opts, &files_in_wal_dir, /*IODebugContext*=*/nullptr);
+    }
+    if (s.IsNotFound()) {
+      return Status::InvalidArgument("wal_dir not found", wal_dir);
+    } else if (!s.ok()) {
+      return s;
+    }
+
+    std::unordered_map<uint64_t, std::string> wal_files;
+    for (const auto& file : files_in_wal_dir) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type) && type == kWalFile) {
+        if (is_new_db) {
+          return Status::Corruption(
+              "While creating a new Db, wal_dir contains "
+              "existing log file: ",
+              file);
+        } else {
+          wal_files[number] = LogFileName(wal_dir, number);
+        }
+      }
+    }
+
+    if (immutable_db_options_.track_and_verify_wals_in_manifest) {
+      if (!immutable_db_options_.best_efforts_recovery) {
+        // Verify WALs in MANIFEST.
+        s = versions_->GetWalSet().CheckWals(env_, wal_files);
+      }  // else since best effort recovery does not recover from WALs, no need
+         // to check WALs.
+    } else if (!versions_->GetWalSet().GetWals().empty()) {
+      // Tracking is disabled, clear previously tracked WALs from MANIFEST,
+      // otherwise, in the future, if WAL tracking is enabled again,
+      // since the WALs deleted when WAL tracking is disabled are not persisted
+      // into MANIFEST, WAL check may fail.
+      VersionEdit edit;
+      WalNumber max_wal_number =
+          versions_->GetWalSet().GetWals().rbegin()->first;
+      edit.DeleteWalsBefore(max_wal_number + 1);
+      assert(recovery_ctx != nullptr);
+      assert(versions_->GetColumnFamilySet() != nullptr);
+      recovery_ctx->UpdateVersionEdits(
+          versions_->GetColumnFamilySet()->GetDefault(), edit);
+    }
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (!wal_files.empty()) {
+      if (error_if_wal_file_exists) {
+        return Status::Corruption(
+            "The db was opened in readonly mode with error_if_wal_file_exists"
+            "flag but a WAL file already exists");
+      } else if (error_if_data_exists_in_wals) {
+        for (auto& wal_file : wal_files) {
+          uint64_t bytes;
+          s = env_->GetFileSize(wal_file.second, &bytes);
+          if (s.ok()) {
+            if (bytes > 0) {
+              return Status::Corruption(
+                  "error_if_data_exists_in_wals is set but there are data "
+                  " in WAL files.");
+            }
+          }
+        }
+      }
+    }
+
+    if (!wal_files.empty()) {
+      // Recover in the order in which the wals were generated
+      std::vector<uint64_t> wals;
+      wals.reserve(wal_files.size());
+      for (const auto& wal_file : wal_files) {
+        wals.push_back(wal_file.first);
+      }
+      std::sort(wals.begin(), wals.end());
+
+      bool corrupted_wal_found = false;
+      s = RecoverLogFiles(wals, &next_sequence, read_only, &corrupted_wal_found,
+                          recovery_ctx);
+      if (corrupted_wal_found && recovered_seq != nullptr) {
+        *recovered_seq = next_sequence;
+      }
+      if (!s.ok()) {
+        // Clear memtables if recovery failed
+        for (auto cfd : *versions_->GetColumnFamilySet()) {
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 kMaxSequenceNumber);
+        }
+      }
+    }
+  }
+
+  if (read_only) {
+    // If we are opening as read-only, we need to update options_file_number_
+    // to reflect the most recent OPTIONS file. It does not matter for regular
+    // read-write db instance because options_file_number_ will later be
+    // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
+    std::vector<std::string> filenames;
+    if (s.ok()) {
+      const std::string normalized_dbname = NormalizePath(dbname_);
+      const std::string normalized_wal_dir =
+          NormalizePath(immutable_db_options_.GetWalDir());
+      if (immutable_db_options_.best_efforts_recovery) {
+        filenames = std::move(files_in_dbname);
+      } else if (normalized_dbname == normalized_wal_dir) {
+        filenames = std::move(files_in_wal_dir);
+      } else {
+        IOOptions io_opts;
+        io_opts.do_not_recurse = true;
+        s = immutable_db_options_.fs->GetChildren(
+            GetName(), io_opts, &filenames, /*IODebugContext*=*/nullptr);
+      }
+    }
+    if (s.ok()) {
+      uint64_t number = 0;
+      uint64_t options_file_number = 0;
+      FileType type;
+      for (const auto& fname : filenames) {
+        if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
+          options_file_number = std::max(number, options_file_number);
+        }
+      }
+      versions_->options_file_number_ = options_file_number;
+      uint64_t options_file_size = 0;
+      if (options_file_number > 0) {
+        s = env_->GetFileSize(OptionsFileName(GetName(), options_file_number),
+                              &options_file_size);
+      }
+      versions_->options_file_size_ = options_file_size;
+    }
+  }
+  return s;
+}
+
+Status DBImpl::PersistentStatsProcessFormatVersion() {
+  mutex_.AssertHeld();
+  Status s;
+  // persist version when stats CF doesn't exist
+  bool should_persist_format_version = !persistent_stats_cfd_exists_;
+  mutex_.Unlock();
+  if (persistent_stats_cfd_exists_) {
+    // Check persistent stats format version compatibility. Drop and recreate
+    // persistent stats CF if format version is incompatible
+    uint64_t format_version_recovered = 0;
+    Status s_format = DecodePersistentStatsVersionNumber(
+        this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
+    uint64_t compatible_version_recovered = 0;
+    Status s_compatible = DecodePersistentStatsVersionNumber(
+        this, StatsVersionKeyType::kCompatibleVersion,
+        &compatible_version_recovered);
+    // abort reading from existing stats CF if any of following is true:
+    // 1. failed to read format version or compatible version from disk
+    // 2. sst's format version is greater than current format version, meaning
+    // this sst is encoded with a newer RocksDB release, and current compatible
+    // version is below the sst's compatible version
+    if (!s_format.ok() || !s_compatible.ok() ||
+        (kStatsCFCurrentFormatVersion < format_version_recovered &&
+         kStatsCFCompatibleFormatVersion < compatible_version_recovered)) {
+      if (!s_format.ok() || !s_compatible.ok()) {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Recreating persistent stats column family since reading "
+            "persistent stats version key failed. Format key: %s, compatible "
+            "key: %s",
+            s_format.ToString().c_str(), s_compatible.ToString().c_str());
+      } else {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Recreating persistent stats column family due to corrupted or "
+            "incompatible format version. Recovered format: %" PRIu64
+            "; recovered format compatible since: %" PRIu64 "\n",
+            format_version_recovered, compatible_version_recovered);
+      }
+      s = DropColumnFamily(persist_stats_cf_handle_);
+      if (s.ok()) {
+        s = DestroyColumnFamilyHandle(persist_stats_cf_handle_);
+      }
+      ColumnFamilyHandle* handle = nullptr;
+      if (s.ok()) {
+        ColumnFamilyOptions cfo;
+        OptimizeForPersistentStats(&cfo);
+        s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+      }
+      if (s.ok()) {
+        persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+        // should also persist version here because old stats CF is discarded
+        should_persist_format_version = true;
+      }
+    }
+  }
+  if (should_persist_format_version) {
+    // Persistent stats CF being created for the first time, need to write
+    // format version key
+    WriteBatch batch;
+    if (s.ok()) {
+      s = batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString,
+                    std::to_string(kStatsCFCurrentFormatVersion));
+    }
+    if (s.ok()) {
+      s = batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString,
+                    std::to_string(kStatsCFCompatibleFormatVersion));
+    }
+    if (s.ok()) {
+      WriteOptions wo;
+      wo.low_pri = true;
+      wo.no_slowdown = true;
+      wo.sync = false;
+      s = Write(wo, &batch);
+    }
+  }
+  mutex_.Lock();
+  return s;
+}
+
+Status DBImpl::InitPersistStatsColumnFamily() {
+  mutex_.AssertHeld();
+  assert(!persist_stats_cf_handle_);
+  ColumnFamilyData* persistent_stats_cfd =
+      versions_->GetColumnFamilySet()->GetColumnFamily(
+          kPersistentStatsColumnFamilyName);
+  persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr;
+
+  Status s;
+  if (persistent_stats_cfd != nullptr) {
+    // We are recovering from a DB which already contains persistent stats CF,
+    // the CF is already created in VersionSet::ApplyOneVersionEdit, but
+    // column family handle was not. Need to explicitly create handle here.
+    persist_stats_cf_handle_ =
+        new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_);
+  } else {
+    mutex_.Unlock();
+    ColumnFamilyHandle* handle = nullptr;
+    ColumnFamilyOptions cfo;
+    OptimizeForPersistentStats(&cfo);
+    s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+    persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+    mutex_.Lock();
+  }
+  return s;
+}
+
+Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
+  mutex_.AssertHeld();
+  assert(versions_->descriptor_log_ == nullptr);
+  Status s = versions_->LogAndApply(
+      recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_,
+      recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir());
+  if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) {
+    mutex_.Unlock();
+    for (const auto& fname : recovery_ctx.files_to_delete_) {
+      s = env_->DeleteFile(fname);
+      if (!s.ok()) {
+        break;
+      }
+    }
+    mutex_.Lock();
+  }
+  return s;
+}
+
+void DBImpl::InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap() {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.wal_filter == nullptr) {
+    return;
+  }
+  assert(immutable_db_options_.wal_filter != nullptr);
+  WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
+
+  std::map<std::string, uint32_t> cf_name_id_map;
+  std::map<uint32_t, uint64_t> cf_lognumber_map;
+  assert(versions_);
+  assert(versions_->GetColumnFamilySet());
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    assert(cfd);
+    cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID()));
+    cf_lognumber_map.insert(std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
+  }
+
+  wal_filter.ColumnFamilyLogNumberMap(cf_lognumber_map, cf_name_id_map);
+#endif  // !ROCKSDB_LITE
+}
+
+bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
+                                                const std::string& wal_fname,
+                                                log::Reader::Reporter& reporter,
+                                                Status& status,
+                                                bool& stop_replay,
+                                                WriteBatch& batch) {
+#ifndef ROCKSDB_LITE
+  if (immutable_db_options_.wal_filter == nullptr) {
+    return true;
+  }
+  assert(immutable_db_options_.wal_filter != nullptr);
+  WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
+
+  WriteBatch new_batch;
+  bool batch_changed = false;
+
+  bool process_current_record = true;
+
+  WalFilter::WalProcessingOption wal_processing_option =
+      wal_filter.LogRecordFound(wal_number, wal_fname, batch, &new_batch,
+                                &batch_changed);
+
+  switch (wal_processing_option) {
+    case WalFilter::WalProcessingOption::kContinueProcessing:
+      // do nothing, proceeed normally
+      break;
+    case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
+      // skip current record
+      process_current_record = false;
+      break;
+    case WalFilter::WalProcessingOption::kStopReplay:
+      // skip current record and stop replay
+      process_current_record = false;
+      stop_replay = true;
+      break;
+    case WalFilter::WalProcessingOption::kCorruptedRecord: {
+      status = Status::Corruption("Corruption reported by Wal Filter ",
+                                  wal_filter.Name());
+      MaybeIgnoreError(&status);
+      if (!status.ok()) {
+        process_current_record = false;
+        reporter.Corruption(batch.GetDataSize(), status);
+      }
+      break;
+    }
+    default: {
+      // logical error which should not happen. If RocksDB throws, we would
+      // just do `throw std::logic_error`.
+      assert(false);
+      status = Status::NotSupported(
+          "Unknown WalProcessingOption returned by Wal Filter ",
+          wal_filter.Name());
+      MaybeIgnoreError(&status);
+      if (!status.ok()) {
+        // Ignore the error with current record processing.
+        stop_replay = true;
+      }
+      break;
+    }
+  }
+
+  if (!process_current_record) {
+    return false;
+  }
+
+  if (batch_changed) {
+    // Make sure that the count in the new batch is
+    // within the orignal count.
+    int new_count = WriteBatchInternal::Count(&new_batch);
+    int original_count = WriteBatchInternal::Count(&batch);
+    if (new_count > original_count) {
+      ROCKS_LOG_FATAL(
+          immutable_db_options_.info_log,
+          "Recovering log #%" PRIu64
+          " mode %d log filter %s returned "
+          "more records (%d) than original (%d) which is not allowed. "
+          "Aborting recovery.",
+          wal_number, static_cast<int>(immutable_db_options_.wal_recovery_mode),
+          wal_filter.Name(), new_count, original_count);
+      status = Status::NotSupported(
+          "More than original # of records "
+          "returned by Wal Filter ",
+          wal_filter.Name());
+      return false;
+    }
+    // Set the same sequence number in the new_batch
+    // as the original batch.
+    WriteBatchInternal::SetSequence(&new_batch,
+                                    WriteBatchInternal::Sequence(&batch));
+    batch = new_batch;
+  }
+  return true;
+#else   // !ROCKSDB_LITE
+  (void)wal_number;
+  (void)wal_fname;
+  (void)reporter;
+  (void)status;
+  (void)stop_replay;
+  (void)batch;
+  return true;
+#endif  // ROCKSDB_LITE
+}
+
+// REQUIRES: wal_numbers are sorted in ascending order
+Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
+                               SequenceNumber* next_sequence, bool read_only,
+                               bool* corrupted_wal_found,
+                               RecoveryContext* recovery_ctx) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+    Status* status;  // nullptr if immutable_db_options_.paranoid_checks==false
+    void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+                     (status == nullptr ? "(ignoring error) " : ""), fname,
+                     static_cast<int>(bytes), s.ToString().c_str());
+      if (status != nullptr && status->ok()) {
+        *status = s;
+      }
+    }
+  };
+
+  mutex_.AssertHeld();
+  Status status;
+  std::unordered_map<int, VersionEdit> version_edits;
+  // no need to refcount because iteration is under mutex
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+    version_edits.insert({cfd->GetID(), edit});
+  }
+  int job_id = next_job_id_.fetch_add(1);
+  {
+    auto stream = event_logger_.Log();
+    stream << "job" << job_id << "event"
+           << "recovery_started";
+    stream << "wal_files";
+    stream.StartArray();
+    for (auto wal_number : wal_numbers) {
+      stream << wal_number;
+    }
+    stream.EndArray();
+  }
+
+  // No-op for immutable_db_options_.wal_filter == nullptr.
+  InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
+
+  bool stop_replay_by_wal_filter = false;
+  bool stop_replay_for_corruption = false;
+  bool flushed = false;
+  uint64_t corrupted_wal_number = kMaxSequenceNumber;
+  uint64_t min_wal_number = MinLogNumberToKeep();
+  if (!allow_2pc()) {
+    // In non-2pc mode, we skip WALs that do not back unflushed data.
+    min_wal_number =
+        std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData());
+  }
+  for (auto wal_number : wal_numbers) {
+    if (wal_number < min_wal_number) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Skipping log #%" PRIu64
+                     " since it is older than min log to keep #%" PRIu64,
+                     wal_number, min_wal_number);
+      continue;
+    }
+    // The previous incarnation may not have written any MANIFEST
+    // records after allocating this log number.  So we manually
+    // update the file number allocation counter in VersionSet.
+    versions_->MarkFileNumberUsed(wal_number);
+    // Open the log file
+    std::string fname =
+        LogFileName(immutable_db_options_.GetWalDir(), wal_number);
+
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Recovering log #%" PRIu64 " mode %d", wal_number,
+                   static_cast<int>(immutable_db_options_.wal_recovery_mode));
+    auto logFileDropped = [this, &fname]() {
+      uint64_t bytes;
+      if (env_->GetFileSize(fname, &bytes).ok()) {
+        auto info_log = immutable_db_options_.info_log.get();
+        ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
+                       static_cast<int>(bytes));
+      }
+    };
+    if (stop_replay_by_wal_filter) {
+      logFileDropped();
+      continue;
+    }
+
+    std::unique_ptr<SequentialFileReader> file_reader;
+    {
+      std::unique_ptr<FSSequentialFile> file;
+      status = fs_->NewSequentialFile(
+          fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+      if (!status.ok()) {
+        MaybeIgnoreError(&status);
+        if (!status.ok()) {
+          return status;
+        } else {
+          // Fail with one log file, but that's ok.
+          // Try next one.
+          continue;
+        }
+      }
+      file_reader.reset(new SequentialFileReader(
+          std::move(file), fname, immutable_db_options_.log_readahead_size,
+          io_tracer_));
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = immutable_db_options_.info_log.get();
+    reporter.fname = fname.c_str();
+    if (!immutable_db_options_.paranoid_checks ||
+        immutable_db_options_.wal_recovery_mode ==
+            WALRecoveryMode::kSkipAnyCorruptedRecords) {
+      reporter.status = nullptr;
+    } else {
+      reporter.status = &status;
+    }
+    // We intentially make log::Reader do checksumming even if
+    // paranoid_checks==false so that corruptions cause entire commits
+    // to be skipped instead of propagating bad information (like overly
+    // large sequence numbers).
+    log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
+                       &reporter, true /*checksum*/, wal_number);
+
+    // Determine if we should tolerate incomplete records at the tail end of the
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+
+    TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
+                             /*arg=*/nullptr);
+    uint64_t record_checksum;
+    while (!stop_replay_by_wal_filter &&
+           reader.ReadRecord(&record, &scratch,
+                             immutable_db_options_.wal_recovery_mode,
+                             &record_checksum) &&
+           status.ok()) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reporter.Corruption(record.size(),
+                            Status::Corruption("log record too small"));
+        continue;
+      }
+
+      // We create a new batch and initialize with a valid prot_info_ to store
+      // the data checksums
+      WriteBatch batch;
+
+      status = WriteBatchInternal::SetContents(&batch, record);
+      if (!status.ok()) {
+        return status;
+      }
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", &batch);
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
+          &record_checksum);
+      status = WriteBatchInternal::UpdateProtectionInfo(
+          &batch, 8 /* bytes_per_key */, &record_checksum);
+      if (!status.ok()) {
+        return status;
+      }
+
+      SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
+
+      if (immutable_db_options_.wal_recovery_mode ==
+          WALRecoveryMode::kPointInTimeRecovery) {
+        // In point-in-time recovery mode, if sequence id of log files are
+        // consecutive, we continue recovery despite corruption. This could
+        // happen when we open and write to a corrupted DB, where sequence id
+        // will start from the last sequence id we recovered.
+        if (sequence == *next_sequence) {
+          stop_replay_for_corruption = false;
+        }
+        if (stop_replay_for_corruption) {
+          logFileDropped();
+          break;
+        }
+      }
+
+      // For the default case of wal_filter == nullptr, always performs no-op
+      // and returns true.
+      if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, reporter,
+                                              status, stop_replay_by_wal_filter,
+                                              batch)) {
+        continue;
+      }
+
+      // If column family was not found, it might mean that the WAL write
+      // batch references to the column family that was dropped after the
+      // insert. We don't want to fail the whole write batch in that case --
+      // we just ignore the update.
+      // That's why we set ignore missing column families to true
+      bool has_valid_writes = false;
+      status = WriteBatchInternal::InsertInto(
+          &batch, column_family_memtables_.get(), &flush_scheduler_,
+          &trim_history_scheduler_, true, wal_number, this,
+          false /* concurrent_memtable_writes */, next_sequence,
+          &has_valid_writes, seq_per_batch_, batch_per_txn_);
+      MaybeIgnoreError(&status);
+      if (!status.ok()) {
+        // We are treating this as a failure while reading since we read valid
+        // blocks that do not form coherent data
+        reporter.Corruption(record.size(), status);
+        continue;
+      }
+
+      if (has_valid_writes && !read_only) {
+        // we can do this because this is called before client has access to the
+        // DB and there is only a single thread operating on DB
+        ColumnFamilyData* cfd;
+
+        while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+          cfd->UnrefAndTryDelete();
+          // If this asserts, it means that InsertInto failed in
+          // filtering updates to already-flushed column families
+          assert(cfd->GetLogNumber() <= wal_number);
+          auto iter = version_edits.find(cfd->GetID());
+          assert(iter != version_edits.end());
+          VersionEdit* edit = &iter->second;
+          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+          if (!status.ok()) {
+            // Reflect errors immediately so that conditions like full
+            // file-systems cause the DB::Open() to fail.
+            return status;
+          }
+          flushed = true;
+
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 *next_sequence);
+        }
+      }
+    }
+
+    if (!status.ok()) {
+      if (status.IsNotSupported()) {
+        // We should not treat NotSupported as corruption. It is rather a clear
+        // sign that we are processing a WAL that is produced by an incompatible
+        // version of the code.
+        return status;
+      }
+      if (immutable_db_options_.wal_recovery_mode ==
+          WALRecoveryMode::kSkipAnyCorruptedRecords) {
+        // We should ignore all errors unconditionally
+        status = Status::OK();
+      } else if (immutable_db_options_.wal_recovery_mode ==
+                 WALRecoveryMode::kPointInTimeRecovery) {
+        if (status.IsIOError()) {
+          ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                          "IOError during point-in-time reading log #%" PRIu64
+                          " seq #%" PRIu64
+                          ". %s. This likely mean loss of synced WAL, "
+                          "thus recovery fails.",
+                          wal_number, *next_sequence,
+                          status.ToString().c_str());
+          return status;
+        }
+        // We should ignore the error but not continue replaying
+        status = Status::OK();
+        stop_replay_for_corruption = true;
+        corrupted_wal_number = wal_number;
+        if (corrupted_wal_found != nullptr) {
+          *corrupted_wal_found = true;
+        }
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "Point in time recovered to log #%" PRIu64
+                       " seq #%" PRIu64,
+                       wal_number, *next_sequence);
+      } else {
+        assert(immutable_db_options_.wal_recovery_mode ==
+                   WALRecoveryMode::kTolerateCorruptedTailRecords ||
+               immutable_db_options_.wal_recovery_mode ==
+                   WALRecoveryMode::kAbsoluteConsistency);
+        return status;
+      }
+    }
+
+    flush_scheduler_.Clear();
+    trim_history_scheduler_.Clear();
+    auto last_sequence = *next_sequence - 1;
+    if ((*next_sequence != kMaxSequenceNumber) &&
+        (versions_->LastSequence() <= last_sequence)) {
+      versions_->SetLastAllocatedSequence(last_sequence);
+      versions_->SetLastPublishedSequence(last_sequence);
+      versions_->SetLastSequence(last_sequence);
+    }
+  }
+  // Compare the corrupted log number to all columnfamily's current log number.
+  // Abort Open() if any column family's log number is greater than
+  // the corrupted log number, which means CF contains data beyond the point of
+  // corruption. This could during PIT recovery when the WAL is corrupted and
+  // some (but not all) CFs are flushed
+  // Exclude the PIT case where no log is dropped after the corruption point.
+  // This is to cover the case for empty wals after corrupted log, in which we
+  // don't reset stop_replay_for_corruption.
+  if (stop_replay_for_corruption == true &&
+      (immutable_db_options_.wal_recovery_mode ==
+           WALRecoveryMode::kPointInTimeRecovery ||
+       immutable_db_options_.wal_recovery_mode ==
+           WALRecoveryMode::kTolerateCorruptedTailRecords)) {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      // One special case cause cfd->GetLogNumber() > corrupted_wal_number but
+      // the CF is still consistent: If a new column family is created during
+      // the flush and the WAL sync fails at the same time, the new CF points to
+      // the new WAL but the old WAL is curropted. Since the new CF is empty, it
+      // is still consistent. We add the check of CF sst file size to avoid the
+      // false positive alert.
+
+      // Note that, the check of (cfd->GetLiveSstFilesSize() > 0) may leads to
+      // the ignorance of a very rare inconsistency case caused in data
+      // canclation. One CF is empty due to KV deletion. But those operations
+      // are in the WAL. If the WAL is corrupted, the status of this CF might
+      // not be consistent with others. However, the consistency check will be
+      // bypassed due to empty CF.
+      // TODO: a better and complete implementation is needed to ensure strict
+      // consistency check in WAL recovery including hanlding the tailing
+      // issues.
+      if (cfd->GetLogNumber() > corrupted_wal_number &&
+          cfd->GetLiveSstFilesSize() > 0) {
+        ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                        "Column family inconsistency: SST file contains data"
+                        " beyond the point of corruption.");
+        return Status::Corruption("SST file is ahead of WALs in CF " +
+                                  cfd->GetName());
+      }
+    }
+  }
+
+  // True if there's any data in the WALs; if not, we can skip re-processing
+  // them later
+  bool data_seen = false;
+  if (!read_only) {
+    // no need to refcount since client still doesn't have access
+    // to the DB and can not drop column families while we iterate
+    const WalNumber max_wal_number = wal_numbers.back();
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      auto iter = version_edits.find(cfd->GetID());
+      assert(iter != version_edits.end());
+      VersionEdit* edit = &iter->second;
+
+      if (cfd->GetLogNumber() > max_wal_number) {
+        // Column family cfd has already flushed the data
+        // from all wals. Memtable has to be empty because
+        // we filter the updates based on wal_number
+        // (in WriteBatch::InsertInto)
+        assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+        assert(edit->NumEntries() == 0);
+        continue;
+      }
+
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr);
+
+      // flush the final memtable (if non-empty)
+      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+        // If flush happened in the middle of recovery (e.g. due to memtable
+        // being full), we flush at the end. Otherwise we'll need to record
+        // where we were on last flush, which make the logic complicated.
+        if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
+          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+          if (!status.ok()) {
+            // Recovery failed
+            break;
+          }
+          flushed = true;
+
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 versions_->LastSequence());
+        }
+        data_seen = true;
+      }
+
+      // Update the log number info in the version edit corresponding to this
+      // column family. Note that the version edits will be written to MANIFEST
+      // together later.
+      // writing wal_number in the manifest means that any log file
+      // with number strongly less than (wal_number + 1) is already
+      // recovered and should be ignored on next reincarnation.
+      // Since we already recovered max_wal_number, we want all wals
+      // with numbers `<= max_wal_number` (includes this one) to be ignored
+      if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
+        edit->SetLogNumber(max_wal_number + 1);
+      }
+    }
+    if (status.ok()) {
+      // we must mark the next log number as used, even though it's
+      // not actually used. that is because VersionSet assumes
+      // VersionSet::next_file_number_ always to be strictly greater than any
+      // log number
+      versions_->MarkFileNumberUsed(max_wal_number + 1);
+      assert(recovery_ctx != nullptr);
+
+      for (auto* cfd : *versions_->GetColumnFamilySet()) {
+        auto iter = version_edits.find(cfd->GetID());
+        assert(iter != version_edits.end());
+        recovery_ctx->UpdateVersionEdits(cfd, iter->second);
+      }
+
+      if (flushed) {
+        VersionEdit wal_deletion;
+        if (immutable_db_options_.track_and_verify_wals_in_manifest) {
+          wal_deletion.DeleteWalsBefore(max_wal_number + 1);
+        }
+        if (!allow_2pc()) {
+          // In non-2pc mode, flushing the memtables of the column families
+          // means we can advance min_log_number_to_keep.
+          wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1);
+        }
+        assert(versions_->GetColumnFamilySet() != nullptr);
+        recovery_ctx->UpdateVersionEdits(
+            versions_->GetColumnFamilySet()->GetDefault(), wal_deletion);
+      }
+    }
+  }
+
+  if (status.ok()) {
+    if (data_seen && !flushed) {
+      status = RestoreAliveLogFiles(wal_numbers);
+    } else if (!wal_numbers.empty()) {  // If there's no data in the WAL, or we
+                                        // flushed all the data, still
+      // truncate the log file. If the process goes into a crash loop before
+      // the file is deleted, the preallocated space will never get freed.
+      const bool truncate = !read_only;
+      GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr)
+          .PermitUncheckedError();
+    }
+  }
+
+  event_logger_.Log() << "job" << job_id << "event"
+                      << "recovery_finished";
+
+  return status;
+}
+
+Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+                                          LogFileNumberSize* log_ptr) {
+  LogFileNumberSize log(wal_number);
+  std::string fname =
+      LogFileName(immutable_db_options_.GetWalDir(), wal_number);
+  Status s;
+  // This gets the appear size of the wals, not including preallocated space.
+  s = env_->GetFileSize(fname, &log.size);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s);
+  if (s.ok() && truncate) {
+    std::unique_ptr<FSWritableFile> last_log;
+    Status truncate_status = fs_->ReopenWritableFile(
+        fname,
+        fs_->OptimizeForLogWrite(
+            file_options_,
+            BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+        &last_log, nullptr);
+    if (truncate_status.ok()) {
+      truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
+    }
+    if (truncate_status.ok()) {
+      truncate_status = last_log->Close(IOOptions(), nullptr);
+    }
+    // Not a critical error if fail to truncate.
+    if (!truncate_status.ok() && !truncate_status.IsNotSupported()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Failed to truncate log #%" PRIu64 ": %s", wal_number,
+                     truncate_status.ToString().c_str());
+    }
+  }
+  if (log_ptr) {
+    *log_ptr = log;
+  }
+  return s;
+}
+
+Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
+  if (wal_numbers.empty()) {
+    return Status::OK();
+  }
+  Status s;
+  mutex_.AssertHeld();
+  assert(immutable_db_options_.avoid_flush_during_recovery);
+  // Mark these as alive so they'll be considered for deletion later by
+  // FindObsoleteFiles()
+  total_log_size_ = 0;
+  log_empty_ = false;
+  uint64_t min_wal_with_unflushed_data =
+      versions_->MinLogNumberWithUnflushedData();
+  for (auto wal_number : wal_numbers) {
+    if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
+      // In non-2pc mode, the WAL files not backing unflushed data are not
+      // alive, thus should not be added to the alive_log_files_.
+      continue;
+    }
+    // We preallocate space for wals, but then after a crash and restart, those
+    // preallocated space are not needed anymore. It is likely only the last
+    // log has such preallocated space, so we only truncate for the last log.
+    LogFileNumberSize log;
+    s = GetLogSizeAndMaybeTruncate(
+        wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
+    if (!s.ok()) {
+      break;
+    }
+    total_log_size_ += log.size;
+    alive_log_files_.push_back(log);
+  }
+  return s;
+}
+
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+                                           MemTable* mem, VersionEdit* edit) {
+  mutex_.AssertHeld();
+  assert(cfd);
+  assert(cfd->imm());
+  // The immutable memtable list must be empty.
+  assert(std::numeric_limits<uint64_t>::max() ==
+         cfd->imm()->GetEarliestMemTableID());
+
+  const uint64_t start_micros = immutable_db_options_.clock->NowMicros();
+
+  FileMetaData meta;
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+      new std::list<uint64_t>::iterator(
+          CaptureCurrentFileNumberInPendingOutputs()));
+  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  Arena arena;
+  Status s;
+  TableProperties table_properties;
+  {
+    ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+    ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                    "[%s] [WriteLevel0TableForRecovery]"
+                    " Level-0 table #%" PRIu64 ": started",
+                    cfd->GetName().c_str(), meta.fd.GetNumber());
+
+    // Get the latest mutable cf options while the mutex is still locked
+    const MutableCFOptions mutable_cf_options =
+        *cfd->GetLatestMutableCFOptions();
+    bool paranoid_file_checks =
+        cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+
+    int64_t _current_time = 0;
+    immutable_db_options_.clock->GetCurrentTime(&_current_time)
+        .PermitUncheckedError();  // ignore error
+    const uint64_t current_time = static_cast<uint64_t>(_current_time);
+    meta.oldest_ancester_time = current_time;
+
+    {
+      auto write_hint = cfd->CalculateSSTWriteHint(0);
+      mutex_.Unlock();
+
+      SequenceNumber earliest_write_conflict_snapshot;
+      std::vector<SequenceNumber> snapshot_seqs =
+          snapshots_.GetAll(&earliest_write_conflict_snapshot);
+      auto snapshot_checker = snapshot_checker_.get();
+      if (use_custom_gc_ && snapshot_checker == nullptr) {
+        snapshot_checker = DisableGCSnapshotChecker::Instance();
+      }
+      std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+          range_del_iters;
+      auto range_del_iter =
+          // This is called during recovery, where a live memtable is flushed
+          // directly. In this case, no fragmented tombstone list is cached in
+          // this memtable yet.
+          mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
+                                         false /* immutable_memtable */);
+      if (range_del_iter != nullptr) {
+        range_del_iters.emplace_back(range_del_iter);
+      }
+
+      IOStatus io_s;
+      TableBuilderOptions tboptions(
+          *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(),
+          cfd->int_tbl_prop_collector_factories(),
+          GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+          mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
+          0 /* level */, false /* is_bottommost */,
+          TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
+          0 /* file_creation_time */, db_id_, db_session_id_,
+          0 /* target_file_size */, meta.fd.GetNumber());
+      SeqnoToTimeMapping empty_seqno_time_mapping;
+      s = BuildTable(
+          dbname_, versions_.get(), immutable_db_options_, tboptions,
+          file_options_for_compaction_, cfd->table_cache(), iter.get(),
+          std::move(range_del_iters), &meta, &blob_file_additions,
+          snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber,
+          snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s,
+          io_tracer_, BlobFileCreationReason::kRecovery,
+          empty_seqno_time_mapping, &event_logger_, job_id, Env::IO_HIGH,
+          nullptr /* table_properties */, write_hint,
+          nullptr /*full_history_ts_low*/, &blob_callback_);
+      LogFlush(immutable_db_options_.info_log);
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                      "[%s] [WriteLevel0TableForRecovery]"
+                      " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+                      cfd->GetName().c_str(), meta.fd.GetNumber(),
+                      meta.fd.GetFileSize(), s.ToString().c_str());
+      mutex_.Lock();
+
+      // TODO(AR) is this ok?
+      if (!io_s.ok() && s.ok()) {
+        s = io_s;
+      }
+    }
+  }
+  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  const bool has_output = meta.fd.GetFileSize() > 0;
+
+  constexpr int level = 0;
+
+  if (s.ok() && has_output) {
+    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
+                  meta.fd.smallest_seqno, meta.fd.largest_seqno,
+                  meta.marked_for_compaction, meta.temperature,
+                  meta.oldest_blob_file_number, meta.oldest_ancester_time,
+                  meta.file_creation_time, meta.file_checksum,
+                  meta.file_checksum_func_name, meta.unique_id);
+
+    for (const auto& blob : blob_file_additions) {
+      edit->AddBlobFile(blob);
+    }
+  }
+
+  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+  stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
+
+  if (has_output) {
+    stats.bytes_written = meta.fd.GetFileSize();
+    stats.num_output_files = 1;
+  }
+
+  const auto& blobs = edit->GetBlobFileAdditions();
+  for (const auto& blob : blobs) {
+    stats.bytes_written_blob += blob.GetTotalBlobBytes();
+  }
+
+  stats.num_output_files_blob = static_cast<int>(blobs.size());
+
+  cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
+  cfd->internal_stats()->AddCFStats(
+      InternalStats::BYTES_FLUSHED,
+      stats.bytes_written + stats.bytes_written_blob);
+  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+  return s;
+}
+
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  if (db_options.persist_stats_to_disk) {
+    column_families.push_back(
+        ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options));
+  }
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    if (db_options.persist_stats_to_disk) {
+      assert(handles.size() == 2);
+    } else {
+      assert(handles.size() == 1);
+    }
+    // i can delete the handle since DBImpl is always holding a reference to
+    // default column family
+    if (db_options.persist_stats_to_disk && handles[1] != nullptr) {
+      delete handles[1];
+    }
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::Open(const DBOptions& db_options, const std::string& dbname,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+  const bool kSeqPerBatch = true;
+  const bool kBatchPerTxn = true;
+  return DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
+                      !kSeqPerBatch, kBatchPerTxn);
+}
+
+// TODO: Implement the trimming in flush code path.
+// TODO: Perform trimming before inserting into memtable during recovery.
+// TODO: Pick files with max_timestamp > trim_ts by each file's timestamp meta
+// info, and handle only these files to reduce io.
+Status DB::OpenAndTrimHistory(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    std::string trim_ts) {
+  assert(dbptr != nullptr);
+  assert(handles != nullptr);
+  auto validate_options = [&db_options] {
+    if (db_options.avoid_flush_during_recovery) {
+      return Status::InvalidArgument(
+          "avoid_flush_during_recovery incompatible with "
+          "OpenAndTrimHistory");
+    }
+    return Status::OK();
+  };
+  auto s = validate_options();
+  if (!s.ok()) {
+    return s;
+  }
+
+  DB* db = nullptr;
+  s = DB::Open(db_options, dbname, column_families, handles, &db);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(db);
+  CompactRangeOptions options;
+  options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  auto db_impl = static_cast_with_check<DBImpl>(db);
+  for (auto handle : *handles) {
+    assert(handle != nullptr);
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
+    auto cfd = cfh->cfd();
+    assert(cfd != nullptr);
+    // Only compact column families with timestamp enabled
+    if (cfd->user_comparator() != nullptr &&
+        cfd->user_comparator()->timestamp_size() > 0) {
+      s = db_impl->CompactRangeInternal(options, handle, nullptr, nullptr,
+                                        trim_ts);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  auto clean_op = [&handles, &db] {
+    for (auto handle : *handles) {
+      auto temp_s = db->DestroyColumnFamilyHandle(handle);
+      assert(temp_s.ok());
+    }
+    handles->clear();
+    delete db;
+  };
+  if (!s.ok()) {
+    clean_op();
+    return s;
+  }
+
+  *dbptr = db;
+  return s;
+}
+
+IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+                           size_t preallocate_block_size,
+                           log::Writer** new_log) {
+  IOStatus io_s;
+  std::unique_ptr<FSWritableFile> lfile;
+
+  DBOptions db_options =
+      BuildDBOptions(immutable_db_options_, mutable_db_options_);
+  FileOptions opt_file_options =
+      fs_->OptimizeForLogWrite(file_options_, db_options);
+  std::string wal_dir = immutable_db_options_.GetWalDir();
+  std::string log_fname = LogFileName(wal_dir, log_file_num);
+
+  if (recycle_log_number) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "reusing log %" PRIu64 " from recycle list\n",
+                   recycle_log_number);
+    std::string old_log_fname = LogFileName(wal_dir, recycle_log_number);
+    TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1");
+    TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2");
+    io_s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options,
+                                  &lfile, /*dbg=*/nullptr);
+  } else {
+    io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
+  }
+
+  if (io_s.ok()) {
+    lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
+    lfile->SetPreallocationBlockSize(preallocate_block_size);
+
+    const auto& listeners = immutable_db_options_.listeners;
+    FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+        std::move(lfile), log_fname, opt_file_options,
+        immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners,
+        nullptr, tmp_set.Contains(FileType::kWalFile),
+        tmp_set.Contains(FileType::kWalFile)));
+    *new_log = new log::Writer(std::move(file_writer), log_file_num,
+                               immutable_db_options_.recycle_log_file_num > 0,
+                               immutable_db_options_.manual_wal_flush,
+                               immutable_db_options_.wal_compression);
+    io_s = (*new_log)->AddCompressionTypeRecord();
+  }
+  return io_s;
+}
+
+Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
+                    const std::vector<ColumnFamilyDescriptor>& column_families,
+                    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+                    const bool seq_per_batch, const bool batch_per_txn) {
+  Status s = ValidateOptionsByTable(db_options, column_families);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = ValidateOptions(db_options, column_families);
+  if (!s.ok()) {
+    return s;
+  }
+
+  *dbptr = nullptr;
+  assert(handles);
+  handles->clear();
+
+  size_t max_write_buffer_size = 0;
+  for (auto cf : column_families) {
+    max_write_buffer_size =
+        std::max(max_write_buffer_size, cf.options.write_buffer_size);
+  }
+
+  DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn);
+  if (!impl->immutable_db_options_.info_log) {
+    s = impl->init_logger_creation_s_;
+    delete impl;
+    return s;
+  } else {
+    assert(impl->init_logger_creation_s_.ok());
+  }
+  s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir());
+  if (s.ok()) {
+    std::vector<std::string> paths;
+    for (auto& db_path : impl->immutable_db_options_.db_paths) {
+      paths.emplace_back(db_path.path);
+    }
+    for (auto& cf : column_families) {
+      for (auto& cf_path : cf.options.cf_paths) {
+        paths.emplace_back(cf_path.path);
+      }
+    }
+    for (auto& path : paths) {
+      s = impl->env_->CreateDirIfMissing(path);
+      if (!s.ok()) {
+        break;
+      }
+    }
+
+    // For recovery from NoSpace() error, we can only handle
+    // the case where the database is stored in a single path
+    if (paths.size() <= 1) {
+      impl->error_handler_.EnableAutoRecovery();
+    }
+  }
+  if (s.ok()) {
+    s = impl->CreateArchivalDirectory();
+  }
+  if (!s.ok()) {
+    delete impl;
+    return s;
+  }
+
+  impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
+  RecoveryContext recovery_ctx;
+  impl->mutex_.Lock();
+
+  // Handles create_if_missing, error_if_exists
+  uint64_t recovered_seq(kMaxSequenceNumber);
+  s = impl->Recover(column_families, false, false, false, &recovered_seq,
+                    &recovery_ctx);
+  if (s.ok()) {
+    uint64_t new_log_number = impl->versions_->NewFileNumber();
+    log::Writer* new_log = nullptr;
+    const size_t preallocate_block_size =
+        impl->GetWalPreallocateBlockSize(max_write_buffer_size);
+    s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/,
+                        preallocate_block_size, &new_log);
+    if (s.ok()) {
+      InstrumentedMutexLock wl(&impl->log_write_mutex_);
+      impl->logfile_number_ = new_log_number;
+      assert(new_log != nullptr);
+      assert(impl->logs_.empty());
+      impl->logs_.emplace_back(new_log_number, new_log);
+    }
+
+    if (s.ok()) {
+      impl->alive_log_files_.push_back(
+          DBImpl::LogFileNumberSize(impl->logfile_number_));
+      // In WritePrepared there could be gap in sequence numbers. This breaks
+      // the trick we use in kPointInTimeRecovery which assumes the first seq in
+      // the log right after the corrupted log is one larger than the last seq
+      // we read from the wals. To let this trick keep working, we add a dummy
+      // entry with the expected sequence to the first log right after recovery.
+      // In non-WritePrepared case also the new log after recovery could be
+      // empty, and thus missing the consecutive seq hint to distinguish
+      // middle-log corruption to corrupted-log-remained-after-recovery. This
+      // case also will be addressed by a dummy write.
+      if (recovered_seq != kMaxSequenceNumber) {
+        WriteBatch empty_batch;
+        WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
+        WriteOptions write_options;
+        uint64_t log_used, log_size;
+        log::Writer* log_writer = impl->logs_.back().writer;
+        LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back();
+
+        assert(log_writer->get_log_number() == log_file_number_size.number);
+        impl->mutex_.AssertHeld();
+        s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size,
+                             Env::IO_TOTAL, log_file_number_size);
+        if (s.ok()) {
+          // Need to fsync, otherwise it might get lost after a power reset.
+          s = impl->FlushWAL(false);
+          TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s);
+          if (s.ok()) {
+            s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+          }
+        }
+      }
+    }
+  }
+  if (s.ok()) {
+    s = impl->LogAndApplyForRecovery(recovery_ctx);
+  }
+
+  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+    impl->mutex_.AssertHeld();
+    s = impl->InitPersistStatsColumnFamily();
+  }
+
+  if (s.ok()) {
+    // set column family handles
+    for (auto cf : column_families) {
+      auto cfd =
+          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+      if (cfd != nullptr) {
+        handles->push_back(
+            new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+        impl->NewThreadStatusCfInfo(cfd);
+      } else {
+        if (db_options.create_missing_column_families) {
+          // missing column family, create it
+          ColumnFamilyHandle* handle = nullptr;
+          impl->mutex_.Unlock();
+          s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
+          impl->mutex_.Lock();
+          if (s.ok()) {
+            handles->push_back(handle);
+          } else {
+            break;
+          }
+        } else {
+          s = Status::InvalidArgument("Column family not found", cf.name);
+          break;
+        }
+      }
+    }
+  }
+
+  if (s.ok()) {
+    SuperVersionContext sv_context(/* create_superversion */ true);
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      impl->InstallSuperVersionAndScheduleWork(
+          cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
+    }
+    sv_context.Clean();
+  }
+
+  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+    // try to read format version
+    s = impl->PersistentStatsProcessFormatVersion();
+  }
+
+  if (s.ok()) {
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      if (!cfd->mem()->IsSnapshotSupported()) {
+        impl->is_snapshot_supported_ = false;
+      }
+      if (cfd->ioptions()->merge_operator != nullptr &&
+          !cfd->mem()->IsMergeOperatorSupported()) {
+        s = Status::InvalidArgument(
+            "The memtable of column family %s does not support merge operator "
+            "its options.merge_operator is non-null",
+            cfd->GetName().c_str());
+      }
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::Open:Opened");
+  Status persist_options_status;
+  if (s.ok()) {
+    // Persist RocksDB Options before scheduling the compaction.
+    // The WriteOptionsFile() will release and lock the mutex internally.
+    persist_options_status = impl->WriteOptionsFile(
+        false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+
+    *dbptr = impl;
+    impl->opened_successfully_ = true;
+    impl->DeleteObsoleteFiles();
+    TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles");
+    impl->MaybeScheduleFlushOrCompaction();
+  } else {
+    persist_options_status.PermitUncheckedError();
+  }
+  impl->mutex_.Unlock();
+
+#ifndef ROCKSDB_LITE
+  auto sfm = static_cast<SstFileManagerImpl*>(
+      impl->immutable_db_options_.sst_file_manager.get());
+  if (s.ok() && sfm) {
+    // Set Statistics ptr for SstFileManager to dump the stats of
+    // DeleteScheduler.
+    sfm->SetStatisticsPtr(impl->immutable_db_options_.statistics);
+    ROCKS_LOG_INFO(impl->immutable_db_options_.info_log,
+                   "SstFileManager instance %p", sfm);
+
+    // Notify SstFileManager about all sst files that already exist in
+    // db_paths[0] and cf_paths[0] when the DB is opened.
+
+    // SstFileManagerImpl needs to know sizes of the files. For files whose size
+    // we already know (sst files that appear in manifest - typically that's the
+    // vast majority of all files), we'll pass the size to SstFileManager.
+    // For all other files SstFileManager will query the size from filesystem.
+
+    std::vector<ColumnFamilyMetaData> metadata;
+    impl->GetAllColumnFamilyMetaData(&metadata);
+
+    std::unordered_map<std::string, uint64_t> known_file_sizes;
+    for (const auto& md : metadata) {
+      for (const auto& lmd : md.levels) {
+        for (const auto& fmd : lmd.files) {
+          known_file_sizes[fmd.relative_filename] = fmd.size;
+        }
+      }
+      for (const auto& bmd : md.blob_files) {
+        std::string name = bmd.blob_file_name;
+        // The BlobMetaData.blob_file_name may start with "/".
+        if (!name.empty() && name[0] == '/') {
+          name = name.substr(1);
+        }
+        known_file_sizes[name] = bmd.blob_file_size;
+      }
+    }
+
+    std::vector<std::string> paths;
+    paths.emplace_back(impl->immutable_db_options_.db_paths[0].path);
+    for (auto& cf : column_families) {
+      if (!cf.options.cf_paths.empty()) {
+        paths.emplace_back(cf.options.cf_paths[0].path);
+      }
+    }
+    // Remove duplicate paths.
+    std::sort(paths.begin(), paths.end());
+    paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+    IOOptions io_opts;
+    io_opts.do_not_recurse = true;
+    for (auto& path : paths) {
+      std::vector<std::string> existing_files;
+      impl->immutable_db_options_.fs
+          ->GetChildren(path, io_opts, &existing_files,
+                        /*IODebugContext*=*/nullptr)
+          .PermitUncheckedError();  //**TODO: What do to on error?
+      for (auto& file_name : existing_files) {
+        uint64_t file_number;
+        FileType file_type;
+        std::string file_path = path + "/" + file_name;
+        if (ParseFileName(file_name, &file_number, &file_type) &&
+            (file_type == kTableFile || file_type == kBlobFile)) {
+          // TODO: Check for errors from OnAddFile?
+          if (known_file_sizes.count(file_name)) {
+            // We're assuming that each sst file name exists in at most one of
+            // the paths.
+            sfm->OnAddFile(file_path, known_file_sizes.at(file_name))
+                .PermitUncheckedError();
+          } else {
+            sfm->OnAddFile(file_path).PermitUncheckedError();
+          }
+        }
+      }
+    }
+
+    // Reserve some disk buffer space. This is a heuristic - when we run out
+    // of disk space, this ensures that there is atleast write_buffer_size
+    // amount of free space before we resume DB writes. In low disk space
+    // conditions, we want to avoid a lot of small L0 files due to frequent
+    // WAL write failures and resultant forced flushes
+    sfm->ReserveDiskBuffer(max_write_buffer_size,
+                           impl->immutable_db_options_.db_paths[0].path);
+  }
+
+#endif  // !ROCKSDB_LITE
+
+  if (s.ok()) {
+    ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
+                     impl);
+    LogFlush(impl->immutable_db_options_.info_log);
+    if (!impl->WALBufferIsEmpty()) {
+      s = impl->FlushWAL(false);
+      if (s.ok()) {
+        // Sync is needed otherwise WAL buffered data might get lost after a
+        // power reset.
+        log::Writer* log_writer = impl->logs_.back().writer;
+        s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+      }
+    }
+    if (s.ok() && !persist_options_status.ok()) {
+      s = Status::IOError(
+          "DB::Open() failed --- Unable to persist Options file",
+          persist_options_status.ToString());
+    }
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(impl->immutable_db_options_.info_log,
+                   "DB::Open() failed: %s", s.ToString().c_str());
+  }
+  if (s.ok()) {
+    s = impl->StartPeriodicTaskScheduler();
+  }
+
+  if (s.ok()) {
+    s = impl->RegisterRecordSeqnoTimeWorker();
+  }
+  if (!s.ok()) {
+    for (auto* h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+    *dbptr = nullptr;
+  }
+  return s;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.cc b/src/rocksdb/db/db_impl/db_impl_readonly.cc
new file mode 100644
index 000000000..0f10baf24
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.cc
@@ -0,0 +1,341 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_readonly.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_impl/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/merge_context.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
+                               const std::string& dbname)
+    : DBImpl(db_options, dbname, /*seq_per_batch*/ false,
+             /*batch_per_txn*/ true, /*read_only*/ true) {
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "Opening the db in read only mode");
+  LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplReadOnly::~DBImplReadOnly() {}
+
+// Implementations of the DB interface
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* pinnable_val) {
+  return Get(read_options, column_family, key, pinnable_val,
+             /*timestamp*/ nullptr);
+}
+
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* pinnable_val,
+                           std::string* timestamp) {
+  assert(pinnable_val != nullptr);
+  // TODO: stopwatch DB_GET needed?, perf timer needed?
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  assert(column_family);
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s = FailIfCfHasTs(column_family);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clear the timestamps for returning results so that we can distinguish
+  // between tombstone or key that has never been written
+  if (timestamp) {
+    timestamp->clear();
+  }
+
+  const Comparator* ucmp = column_family->GetComparator();
+  assert(ucmp);
+  std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr;
+
+  Status s;
+  SequenceNumber snapshot = versions_->LastSequence();
+  GetWithTimestampReadCallback read_cb(snapshot);
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      tracer_->Get(column_family, key);
+    }
+  }
+  SuperVersion* super_version = cfd->GetSuperVersion();
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0;
+  LookupKey lkey(key, snapshot, read_options.timestamp);
+  PERF_TIMER_STOP(get_snapshot_time);
+  if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
+                              /*columns=*/nullptr, ts, &s, &merge_context,
+                              &max_covering_tombstone_seq, read_options,
+                              false /* immutable_memtable */, &read_cb)) {
+    pinnable_val->PinSelf();
+    RecordTick(stats_, MEMTABLE_HIT);
+  } else {
+    PERF_TIMER_GUARD(get_from_output_files_time);
+    PinnedIteratorsManager pinned_iters_mgr;
+    super_version->current->Get(
+        read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s,
+        &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
+        /*value_found*/ nullptr,
+        /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb,
+        /*is_blob*/ nullptr,
+        /*do_merge*/ true);
+    RecordTick(stats_, MEMTABLE_MISS);
+  }
+  RecordTick(stats_, NUMBER_KEYS_READ);
+  size_t size = pinnable_val->size();
+  RecordTick(stats_, BYTES_READ, size);
+  RecordInHistogram(stats_, BYTES_PER_READ, size);
+  PERF_COUNTER_ADD(get_read_bytes, size);
+  return s;
+}
+
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
+                                      ColumnFamilyHandle* column_family) {
+  assert(column_family);
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  } else {
+    const Status s = FailIfCfHasTs(column_family);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  }
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+  SequenceNumber latest_snapshot = versions_->LastSequence();
+  SequenceNumber read_seq =
+      read_options.snapshot != nullptr
+          ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+                ->number_
+          : latest_snapshot;
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  auto db_iter = NewArenaWrappedDbIterator(
+      env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+      super_version->current, read_seq,
+      super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+      super_version->version_number, read_callback);
+  auto internal_iter = NewInternalIterator(
+      db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(),
+      read_seq, /* allow_unprepared_value */ true, db_iter);
+  db_iter->SetIterUnderDBIter(internal_iter);
+  return db_iter;
+}
+
+Status DBImplReadOnly::NewIterators(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  if (read_options.timestamp) {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+                                          /*ts_for_read=*/true);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  } else {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfCfHasTs(cf);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  if (iterators == nullptr) {
+    return Status::InvalidArgument("iterators not allowed to be nullptr");
+  }
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  SequenceNumber latest_snapshot = versions_->LastSequence();
+  SequenceNumber read_seq =
+      read_options.snapshot != nullptr
+          ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+                ->number_
+          : latest_snapshot;
+
+  for (auto cfh : column_families) {
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+    auto* sv = cfd->GetSuperVersion()->Ref();
+    auto* db_iter = NewArenaWrappedDbIterator(
+        env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+        sv->current, read_seq,
+        sv->mutable_cf_options.max_sequential_skip_in_iterations,
+        sv->version_number, read_callback);
+    auto* internal_iter = NewInternalIterator(
+        db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), read_seq,
+        /* allow_unprepared_value */ true, db_iter);
+    db_iter->SetIterUnderDBIter(internal_iter);
+    iterators->push_back(db_iter);
+  }
+
+  return Status::OK();
+}
+
+namespace {
+// Return OK if dbname exists in the file system or create it if
+// create_if_missing
+Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
+                                     const std::string& dbname) {
+  Status s;
+  if (!db_options.create_if_missing) {
+    // Attempt to read "CURRENT" file
+    const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
+    std::string manifest_path;
+    uint64_t manifest_file_number;
+    s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path,
+                                           &manifest_file_number);
+  } else {
+    // Historic behavior that doesn't necessarily make sense
+    s = db_options.env->CreateDirIfMissing(dbname);
+  }
+  return s;
+}
+}  // namespace
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+                           DB** dbptr, bool /*error_if_wal_file_exists*/) {
+  Status s = OpenForReadOnlyCheckExistence(options, dbname);
+  if (!s.ok()) {
+    return s;
+  }
+
+  *dbptr = nullptr;
+
+  // Try to first open DB as fully compacted DB
+  s = CompactedDBImpl::Open(options, dbname, dbptr);
+  if (s.ok()) {
+    return s;
+  }
+
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+
+  s = DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+      db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a
+    // reference to default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::OpenForReadOnly(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    bool error_if_wal_file_exists) {
+  // If dbname does not exist in the file system, should not do anything
+  Status s = OpenForReadOnlyCheckExistence(db_options, dbname);
+  if (!s.ok()) {
+    return s;
+  }
+
+  return DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+      db_options, dbname, column_families, handles, dbptr,
+      error_if_wal_file_exists);
+}
+
+Status DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    bool error_if_wal_file_exists) {
+  *dbptr = nullptr;
+  handles->clear();
+
+  SuperVersionContext sv_context(/* create_superversion */ true);
+  DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
+  impl->mutex_.Lock();
+  Status s = impl->Recover(column_families, true /* read only */,
+                           error_if_wal_file_exists);
+  if (s.ok()) {
+    // set column family handles
+    for (auto cf : column_families) {
+      auto cfd =
+          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+      if (cfd == nullptr) {
+        s = Status::InvalidArgument("Column family not found", cf.name);
+        break;
+      }
+      handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+    }
+  }
+  if (s.ok()) {
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      sv_context.NewSuperVersion();
+      cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+    }
+  }
+  impl->mutex_.Unlock();
+  sv_context.Clean();
+  if (s.ok()) {
+    *dbptr = impl;
+    for (auto* h : *handles) {
+      impl->NewThreadStatusCfInfo(
+          static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
+    }
+  } else {
+    for (auto h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+  }
+  return s;
+}
+
+#else   // !ROCKSDB_LITE
+
+Status DB::OpenForReadOnly(const Options& /*options*/,
+                           const std::string& /*dbname*/, DB** /*dbptr*/,
+                           bool /*error_if_wal_file_exists*/) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenForReadOnly(
+    const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+    const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+    std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/,
+    bool /*error_if_wal_file_exists*/) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.h b/src/rocksdb/db/db_impl/db_impl_readonly.h
new file mode 100644
index 000000000..b876a0fda
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.h
@@ -0,0 +1,170 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: Share common structure with CompactedDBImpl and DBImplSecondary
+class DBImplReadOnly : public DBImpl {
+ public:
+  DBImplReadOnly(const DBOptions& options, const std::string& dbname);
+  // No copying allowed
+  DBImplReadOnly(const DBImplReadOnly&) = delete;
+  void operator=(const DBImplReadOnly&) = delete;
+
+  virtual ~DBImplReadOnly();
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value,
+             std::string* timestamp) override;
+
+  // TODO: Implement ReadOnly MultiGet?
+
+  using DBImpl::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions&,
+                                ColumnFamilyHandle* column_family) override;
+
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) override;
+
+  using DBImpl::Put;
+  virtual Status Put(const WriteOptions& /*options*/,
+                     ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  using DBImpl::PutEntity;
+  Status PutEntity(const WriteOptions& /* options */,
+                   ColumnFamilyHandle* /* column_family */,
+                   const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  using DBImpl::Merge;
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::Delete;
+  virtual Status Delete(const WriteOptions& /*options*/,
+                        ColumnFamilyHandle* /*column_family*/,
+                        const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& /*options*/,
+                              ColumnFamilyHandle* /*column_family*/,
+                              const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status Write(const WriteOptions& /*options*/,
+                       WriteBatch* /*updates*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  using DBImpl::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+                              ColumnFamilyHandle* /*column_family*/,
+                              const Slice* /*begin*/,
+                              const Slice* /*end*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  using DBImpl::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& /*compact_options*/,
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*input_file_names*/,
+      const int /*output_level*/, const int /*output_path_id*/ = -1,
+      std::vector<std::string>* const /*output_file_names*/ = nullptr,
+      CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  virtual Status EnableFileDeletions(bool /*force*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>& ret,
+                              uint64_t* manifest_file_size,
+                              bool /*flush_memtable*/) override {
+    return DBImpl::GetLiveFiles(ret, manifest_file_size,
+                                false /* flush_memtable */);
+  }
+
+  using DBImpl::Flush;
+  virtual Status Flush(const FlushOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  using DBImpl::SyncWAL;
+  virtual Status SyncWAL() override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  using DB::IngestExternalFile;
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*ingestion_options*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& /*options*/,
+      const std::string& /*column_family_name*/,
+      const ImportColumnFamilyOptions& /*import_options*/,
+      const ExportImportFilesMetaData& /*metadata*/,
+      ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  // FIXME: some missing overrides for more "write" functions
+
+ protected:
+#ifndef ROCKSDB_LITE
+  Status FlushForGetLiveFiles() override {
+    // No-op for read-only DB
+    return Status::OK();
+  }
+#endif  // !ROCKSDB_LITE
+
+ private:
+  // A "helper" function for DB::OpenForReadOnly without column families
+  // to reduce unnecessary I/O
+  // It has the same functionality as DB::OpenForReadOnly with column families
+  // but does not check the existence of dbname in the file system
+  static Status OpenForReadOnlyWithoutCheck(
+      const DBOptions& db_options, const std::string& dbname,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      bool error_if_wal_file_exists = false);
+  friend class DB;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.cc b/src/rocksdb/db/db_impl/db_impl_secondary.cc
new file mode 100644
index 000000000..5189d17d9
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.cc
@@ -0,0 +1,967 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_secondary.h"
+
+#include <cinttypes>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/merge_context.h"
+#include "logging/auto_roll_logger.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/configurable.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+DBImplSecondary::DBImplSecondary(const DBOptions& db_options,
+                                 const std::string& dbname,
+                                 std::string secondary_path)
+    : DBImpl(db_options, dbname, false, true, true),
+      secondary_path_(std::move(secondary_path)) {
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "Opening the db in secondary mode");
+  LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplSecondary::~DBImplSecondary() {}
+
+Status DBImplSecondary::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    bool /*readonly*/, bool /*error_if_wal_file_exists*/,
+    bool /*error_if_data_exists_in_wals*/, uint64_t*,
+    RecoveryContext* /*recovery_ctx*/) {
+  mutex_.AssertHeld();
+
+  JobContext job_context(0);
+  Status s;
+  s = static_cast<ReactiveVersionSet*>(versions_.get())
+          ->Recover(column_families, &manifest_reader_, &manifest_reporter_,
+                    &manifest_reader_status_);
+  if (!s.ok()) {
+    if (manifest_reader_status_) {
+      manifest_reader_status_->PermitUncheckedError();
+    }
+    return s;
+  }
+  if (immutable_db_options_.paranoid_checks && s.ok()) {
+    s = CheckConsistency();
+  }
+  // Initial max_total_in_memory_state_ before recovery logs.
+  max_total_in_memory_state_ = 0;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+    max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+                                  mutable_cf_options->max_write_buffer_number;
+  }
+  if (s.ok()) {
+    default_cf_handle_ = new ColumnFamilyHandleImpl(
+        versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+    default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+
+    std::unordered_set<ColumnFamilyData*> cfds_changed;
+    s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+  }
+
+  if (s.IsPathNotFound()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Secondary tries to read WAL, but WAL file(s) have already "
+                   "been purged by primary.");
+    s = Status::OK();
+  }
+  // TODO: update options_file_number_ needed?
+
+  job_context.Clean();
+  return s;
+}
+
+// find new WAL and apply them in order to the secondary instance
+Status DBImplSecondary::FindAndRecoverLogFiles(
+    std::unordered_set<ColumnFamilyData*>* cfds_changed,
+    JobContext* job_context) {
+  assert(nullptr != cfds_changed);
+  assert(nullptr != job_context);
+  Status s;
+  std::vector<uint64_t> logs;
+  s = FindNewLogNumbers(&logs);
+  if (s.ok() && !logs.empty()) {
+    SequenceNumber next_sequence(kMaxSequenceNumber);
+    s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context);
+  }
+  return s;
+}
+
+// List wal_dir and find all new WALs, return these log numbers
+Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) {
+  assert(logs != nullptr);
+  std::vector<std::string> filenames;
+  Status s;
+  IOOptions io_opts;
+  io_opts.do_not_recurse = true;
+  s = immutable_db_options_.fs->GetChildren(immutable_db_options_.GetWalDir(),
+                                            io_opts, &filenames,
+                                            /*IODebugContext*=*/nullptr);
+  if (s.IsNotFound()) {
+    return Status::InvalidArgument("Failed to open wal_dir",
+                                   immutable_db_options_.GetWalDir());
+  } else if (!s.ok()) {
+    return s;
+  }
+
+  // if log_readers_ is non-empty, it means we have applied all logs with log
+  // numbers smaller than the smallest log in log_readers_, so there is no
+  // need to pass these logs to RecoverLogFiles
+  uint64_t log_number_min = 0;
+  if (!log_readers_.empty()) {
+    log_number_min = log_readers_.begin()->first;
+  }
+  for (size_t i = 0; i < filenames.size(); i++) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(filenames[i], &number, &type) && type == kWalFile &&
+        number >= log_number_min) {
+      logs->push_back(number);
+    }
+  }
+  // Recover logs in the order that they were generated
+  if (!logs->empty()) {
+    std::sort(logs->begin(), logs->end());
+  }
+  return s;
+}
+
+Status DBImplSecondary::MaybeInitLogReader(
+    uint64_t log_number, log::FragmentBufferedReader** log_reader) {
+  auto iter = log_readers_.find(log_number);
+  // make sure the log file is still present
+  if (iter == log_readers_.end() ||
+      iter->second->reader_->GetLogNumber() != log_number) {
+    // delete the obsolete log reader if log number mismatch
+    if (iter != log_readers_.end()) {
+      log_readers_.erase(iter);
+    }
+    // initialize log reader from log_number
+    // TODO: min_log_number_to_keep_2pc check needed?
+    // Open the log file
+    std::string fname =
+        LogFileName(immutable_db_options_.GetWalDir(), log_number);
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Recovering log #%" PRIu64 " mode %d", log_number,
+                   static_cast<int>(immutable_db_options_.wal_recovery_mode));
+
+    std::unique_ptr<SequentialFileReader> file_reader;
+    {
+      std::unique_ptr<FSSequentialFile> file;
+      Status status = fs_->NewSequentialFile(
+          fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+      if (!status.ok()) {
+        *log_reader = nullptr;
+        return status;
+      }
+      file_reader.reset(new SequentialFileReader(
+          std::move(file), fname, immutable_db_options_.log_readahead_size,
+          io_tracer_));
+    }
+
+    // Create the log reader.
+    LogReaderContainer* log_reader_container = new LogReaderContainer(
+        env_, immutable_db_options_.info_log, std::move(fname),
+        std::move(file_reader), log_number);
+    log_readers_.insert(std::make_pair(
+        log_number, std::unique_ptr<LogReaderContainer>(log_reader_container)));
+  }
+  iter = log_readers_.find(log_number);
+  assert(iter != log_readers_.end());
+  *log_reader = iter->second->reader_;
+  return Status::OK();
+}
+
+// After manifest recovery, replay WALs and refresh log_readers_ if necessary
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImplSecondary::RecoverLogFiles(
+    const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
+    std::unordered_set<ColumnFamilyData*>* cfds_changed,
+    JobContext* job_context) {
+  assert(nullptr != cfds_changed);
+  assert(nullptr != job_context);
+  mutex_.AssertHeld();
+  Status status;
+  for (auto log_number : log_numbers) {
+    log::FragmentBufferedReader* reader = nullptr;
+    status = MaybeInitLogReader(log_number, &reader);
+    if (!status.ok()) {
+      return status;
+    }
+    assert(reader != nullptr);
+  }
+  for (auto log_number : log_numbers) {
+    auto it = log_readers_.find(log_number);
+    assert(it != log_readers_.end());
+    log::FragmentBufferedReader* reader = it->second->reader_;
+    Status* wal_read_status = it->second->status_;
+    assert(wal_read_status);
+    // Manually update the file number allocation counter in VersionSet.
+    versions_->MarkFileNumberUsed(log_number);
+
+    // Determine if we should tolerate incomplete records at the tail end of the
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+
+    while (reader->ReadRecord(&record, &scratch,
+                              immutable_db_options_.wal_recovery_mode) &&
+           wal_read_status->ok() && status.ok()) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reader->GetReporter()->Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      status = WriteBatchInternal::SetContents(&batch, record);
+      if (!status.ok()) {
+        break;
+      }
+      SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch);
+      std::vector<uint32_t> column_family_ids;
+      status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
+      if (status.ok()) {
+        for (const auto id : column_family_ids) {
+          ColumnFamilyData* cfd =
+              versions_->GetColumnFamilySet()->GetColumnFamily(id);
+          if (cfd == nullptr) {
+            continue;
+          }
+          if (cfds_changed->count(cfd) == 0) {
+            cfds_changed->insert(cfd);
+          }
+          const std::vector<FileMetaData*>& l0_files =
+              cfd->current()->storage_info()->LevelFiles(0);
+          SequenceNumber seq =
+              l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno;
+          // If the write batch's sequence number is smaller than the last
+          // sequence number of the largest sequence persisted for this column
+          // family, then its data must reside in an SST that has already been
+          // added in the prior MANIFEST replay.
+          if (seq_of_batch <= seq) {
+            continue;
+          }
+          auto curr_log_num = std::numeric_limits<uint64_t>::max();
+          if (cfd_to_current_log_.count(cfd) > 0) {
+            curr_log_num = cfd_to_current_log_[cfd];
+          }
+          // If the active memtable contains records added by replaying an
+          // earlier WAL, then we need to seal the memtable, add it to the
+          // immutable memtable list and create a new active memtable.
+          if (!cfd->mem()->IsEmpty() &&
+              (curr_log_num == std::numeric_limits<uint64_t>::max() ||
+               curr_log_num != log_number)) {
+            const MutableCFOptions mutable_cf_options =
+                *cfd->GetLatestMutableCFOptions();
+            MemTable* new_mem =
+                cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch);
+            cfd->mem()->SetNextLogNumber(log_number);
+            cfd->mem()->ConstructFragmentedRangeTombstones();
+            cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free);
+            new_mem->Ref();
+            cfd->SetMemtable(new_mem);
+          }
+        }
+        bool has_valid_writes = false;
+        status = WriteBatchInternal::InsertInto(
+            &batch, column_family_memtables_.get(),
+            nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/,
+            true, log_number, this, false /* concurrent_memtable_writes */,
+            next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_);
+      }
+      // If column family was not found, it might mean that the WAL write
+      // batch references to the column family that was dropped after the
+      // insert. We don't want to fail the whole write batch in that case --
+      // we just ignore the update.
+      // That's why we set ignore missing column families to true
+      // passing null flush_scheduler will disable memtable flushing which is
+      // needed for secondary instances
+      if (status.ok()) {
+        for (const auto id : column_family_ids) {
+          ColumnFamilyData* cfd =
+              versions_->GetColumnFamilySet()->GetColumnFamily(id);
+          if (cfd == nullptr) {
+            continue;
+          }
+          std::unordered_map<ColumnFamilyData*, uint64_t>::iterator iter =
+              cfd_to_current_log_.find(cfd);
+          if (iter == cfd_to_current_log_.end()) {
+            cfd_to_current_log_.insert({cfd, log_number});
+          } else if (log_number > iter->second) {
+            iter->second = log_number;
+          }
+        }
+        auto last_sequence = *next_sequence - 1;
+        if ((*next_sequence != kMaxSequenceNumber) &&
+            (versions_->LastSequence() <= last_sequence)) {
+          versions_->SetLastAllocatedSequence(last_sequence);
+          versions_->SetLastPublishedSequence(last_sequence);
+          versions_->SetLastSequence(last_sequence);
+        }
+      } else {
+        // We are treating this as a failure while reading since we read valid
+        // blocks that do not form coherent data
+        reader->GetReporter()->Corruption(record.size(), status);
+      }
+    }
+    if (status.ok() && !wal_read_status->ok()) {
+      status = *wal_read_status;
+    }
+    if (!status.ok()) {
+      return status;
+    }
+  }
+  // remove logreaders from map after successfully recovering the WAL
+  if (log_readers_.size() > 1) {
+    auto erase_iter = log_readers_.begin();
+    std::advance(erase_iter, log_readers_.size() - 1);
+    log_readers_.erase(log_readers_.begin(), erase_iter);
+  }
+  return status;
+}
+
+// Implementation of the DB interface
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            PinnableSlice* value) {
+  return GetImpl(read_options, column_family, key, value,
+                 /*timestamp*/ nullptr);
+}
+
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            PinnableSlice* value, std::string* timestamp) {
+  return GetImpl(read_options, column_family, key, value, timestamp);
+}
+
+Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
+                                ColumnFamilyHandle* column_family,
+                                const Slice& key, PinnableSlice* pinnable_val,
+                                std::string* timestamp) {
+  assert(pinnable_val != nullptr);
+  PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+  StopWatch sw(immutable_db_options_.clock, stats_, DB_GET);
+  PERF_TIMER_GUARD(get_snapshot_time);
+
+  assert(column_family);
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s = FailIfCfHasTs(column_family);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clear the timestamp for returning results so that we can distinguish
+  // between tombstone or key that has never been written later.
+  if (timestamp) {
+    timestamp->clear();
+  }
+
+  auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_) {
+      tracer_->Get(column_family, key);
+    }
+  }
+  // Acquire SuperVersion
+  SuperVersion* super_version = GetAndRefSuperVersion(cfd);
+  SequenceNumber snapshot = versions_->LastSequence();
+  GetWithTimestampReadCallback read_cb(snapshot);
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0;
+  Status s;
+  LookupKey lkey(key, snapshot, read_options.timestamp);
+  PERF_TIMER_STOP(get_snapshot_time);
+
+  bool done = false;
+  const Comparator* ucmp = column_family->GetComparator();
+  assert(ucmp);
+  std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr;
+  if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
+                              /*columns=*/nullptr, ts, &s, &merge_context,
+                              &max_covering_tombstone_seq, read_options,
+                              false /* immutable_memtable */, &read_cb)) {
+    done = true;
+    pinnable_val->PinSelf();
+    RecordTick(stats_, MEMTABLE_HIT);
+  } else if ((s.ok() || s.IsMergeInProgress()) &&
+             super_version->imm->Get(
+                 lkey, pinnable_val->GetSelf(), /*columns=*/nullptr, ts, &s,
+                 &merge_context, &max_covering_tombstone_seq, read_options,
+                 &read_cb)) {
+    done = true;
+    pinnable_val->PinSelf();
+    RecordTick(stats_, MEMTABLE_HIT);
+  }
+  if (!done && !s.ok() && !s.IsMergeInProgress()) {
+    ReturnAndCleanupSuperVersion(cfd, super_version);
+    return s;
+  }
+  if (!done) {
+    PERF_TIMER_GUARD(get_from_output_files_time);
+    PinnedIteratorsManager pinned_iters_mgr;
+    super_version->current->Get(
+        read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s,
+        &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
+        /*value_found*/ nullptr,
+        /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb, /*is_blob*/ nullptr,
+        /*do_merge*/ true);
+    RecordTick(stats_, MEMTABLE_MISS);
+  }
+  {
+    PERF_TIMER_GUARD(get_post_process_time);
+    ReturnAndCleanupSuperVersion(cfd, super_version);
+    RecordTick(stats_, NUMBER_KEYS_READ);
+    size_t size = pinnable_val->size();
+    RecordTick(stats_, BYTES_READ, size);
+    RecordTimeToHistogram(stats_, BYTES_PER_READ, size);
+    PERF_COUNTER_ADD(get_read_bytes, size);
+  }
+  return s;
+}
+
+Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options,
+                                       ColumnFamilyHandle* column_family) {
+  if (read_options.managed) {
+    return NewErrorIterator(
+        Status::NotSupported("Managed iterator is not supported anymore."));
+  }
+  if (read_options.read_tier == kPersistedTier) {
+    return NewErrorIterator(Status::NotSupported(
+        "ReadTier::kPersistedData is not yet supported in iterators."));
+  }
+
+  assert(column_family);
+  if (read_options.timestamp) {
+    const Status s = FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  } else {
+    const Status s = FailIfCfHasTs(column_family);
+    if (!s.ok()) {
+      return NewErrorIterator(s);
+    }
+  }
+
+  Iterator* result = nullptr;
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  if (read_options.tailing) {
+    return NewErrorIterator(Status::NotSupported(
+        "tailing iterator not supported in secondary mode"));
+  } else if (read_options.snapshot != nullptr) {
+    // TODO (yanqin) support snapshot.
+    return NewErrorIterator(
+        Status::NotSupported("snapshot not supported in secondary mode"));
+  } else {
+    SequenceNumber snapshot(kMaxSequenceNumber);
+    result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+  }
+  return result;
+}
+
+ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl(
+    const ReadOptions& read_options, ColumnFamilyData* cfd,
+    SequenceNumber snapshot, ReadCallback* read_callback,
+    bool expose_blob_index, bool allow_refresh) {
+  assert(nullptr != cfd);
+  SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+  assert(snapshot == kMaxSequenceNumber);
+  snapshot = versions_->LastSequence();
+  assert(snapshot != kMaxSequenceNumber);
+  auto db_iter = NewArenaWrappedDbIterator(
+      env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+      super_version->current, snapshot,
+      super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+      super_version->version_number, read_callback, this, cfd,
+      expose_blob_index, read_options.snapshot ? false : allow_refresh);
+  auto internal_iter = NewInternalIterator(
+      db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(),
+      snapshot, /* allow_unprepared_value */ true, db_iter);
+  db_iter->SetIterUnderDBIter(internal_iter);
+  return db_iter;
+}
+
+Status DBImplSecondary::NewIterators(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  if (read_options.managed) {
+    return Status::NotSupported("Managed iterator is not supported anymore.");
+  }
+  if (read_options.read_tier == kPersistedTier) {
+    return Status::NotSupported(
+        "ReadTier::kPersistedData is not yet supported in iterators.");
+  }
+  ReadCallback* read_callback = nullptr;  // No read callback provided.
+  if (iterators == nullptr) {
+    return Status::InvalidArgument("iterators not allowed to be nullptr");
+  }
+
+  if (read_options.timestamp) {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+                                          /*ts_for_read=*/true);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  } else {
+    for (auto* cf : column_families) {
+      assert(cf);
+      const Status s = FailIfCfHasTs(cf);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  if (read_options.tailing) {
+    return Status::NotSupported(
+        "tailing iterator not supported in secondary mode");
+  } else if (read_options.snapshot != nullptr) {
+    // TODO (yanqin) support snapshot.
+    return Status::NotSupported("snapshot not supported in secondary mode");
+  } else {
+    SequenceNumber read_seq(kMaxSequenceNumber);
+    for (auto cfh : column_families) {
+      ColumnFamilyData* cfd = static_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+      iterators->push_back(
+          NewIteratorImpl(read_options, cfd, read_seq, read_callback));
+    }
+  }
+  return Status::OK();
+}
+
+Status DBImplSecondary::CheckConsistency() {
+  mutex_.AssertHeld();
+  Status s = DBImpl::CheckConsistency();
+  // If DBImpl::CheckConsistency() which is stricter returns success, then we
+  // do not need to give a second chance.
+  if (s.ok()) {
+    return s;
+  }
+  // It's possible that DBImpl::CheckConssitency() can fail because the primary
+  // may have removed certain files, causing the GetFileSize(name) call to
+  // fail and returning a PathNotFound. In this case, we take a best-effort
+  // approach and just proceed.
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
+
+  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+    return Status::OK();
+  }
+
+  std::vector<LiveFileMetaData> metadata;
+  versions_->GetLiveFilesMetaData(&metadata);
+
+  std::string corruption_messages;
+  for (const auto& md : metadata) {
+    // md.name has a leading "/".
+    std::string file_path = md.db_path + md.name;
+
+    uint64_t fsize = 0;
+    s = env_->GetFileSize(file_path, &fsize);
+    if (!s.ok() &&
+        (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
+         s.IsPathNotFound())) {
+      s = Status::OK();
+    }
+    if (!s.ok()) {
+      corruption_messages +=
+          "Can't access " + md.name + ": " + s.ToString() + "\n";
+    }
+  }
+  return corruption_messages.empty() ? Status::OK()
+                                     : Status::Corruption(corruption_messages);
+}
+
+Status DBImplSecondary::TryCatchUpWithPrimary() {
+  assert(versions_.get() != nullptr);
+  assert(manifest_reader_.get() != nullptr);
+  Status s;
+  // read the manifest and apply new changes to the secondary instance
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  JobContext job_context(0, true /*create_superversion*/);
+  {
+    InstrumentedMutexLock lock_guard(&mutex_);
+    s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
+            ->ReadAndApply(&mutex_, &manifest_reader_,
+                           manifest_reader_status_.get(), &cfds_changed);
+
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
+                   static_cast<uint64_t>(versions_->LastSequence()));
+    for (ColumnFamilyData* cfd : cfds_changed) {
+      if (cfd->IsDropped()) {
+        ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
+                        cfd->GetName().c_str());
+        continue;
+      }
+      VersionStorageInfo::LevelSummaryStorage tmp;
+      ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                      "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+                      cfd->current()->storage_info()->LevelSummary(&tmp));
+    }
+
+    // list wal_dir to discover new WALs and apply new changes to the secondary
+    // instance
+    if (s.ok()) {
+      s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+    }
+    if (s.IsPathNotFound()) {
+      ROCKS_LOG_INFO(
+          immutable_db_options_.info_log,
+          "Secondary tries to read WAL, but WAL file(s) have already "
+          "been purged by primary.");
+      s = Status::OK();
+    }
+    if (s.ok()) {
+      for (auto cfd : cfds_changed) {
+        cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
+                                       &job_context.memtables_to_free);
+        auto& sv_context = job_context.superversion_contexts.back();
+        cfd->InstallSuperVersion(&sv_context, &mutex_);
+        sv_context.NewSuperVersion();
+      }
+    }
+  }
+  job_context.Clean();
+
+  // Cleanup unused, obsolete files.
+  JobContext purge_files_job_context(0);
+  {
+    InstrumentedMutexLock lock_guard(&mutex_);
+    // Currently, secondary instance does not own the database files, thus it
+    // is unnecessary for the secondary to force full scan.
+    FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
+  }
+  if (purge_files_job_context.HaveSomethingToDelete()) {
+    PurgeObsoleteFiles(purge_files_job_context);
+  }
+  purge_files_job_context.Clean();
+  return s;
+}
+
+Status DB::OpenAsSecondary(const Options& options, const std::string& dbname,
+                           const std::string& secondary_path, DB** dbptr) {
+  *dbptr = nullptr;
+
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
+  std::vector<ColumnFamilyHandle*> handles;
+
+  Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path,
+                                 column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DB::OpenAsSecondary(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::string& secondary_path,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+  *dbptr = nullptr;
+
+  DBOptions tmp_opts(db_options);
+  Status s;
+  if (nullptr == tmp_opts.info_log) {
+    s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log);
+    if (!s.ok()) {
+      tmp_opts.info_log = nullptr;
+      return s;
+    }
+  }
+
+  assert(tmp_opts.info_log != nullptr);
+  if (db_options.max_open_files != -1) {
+    std::ostringstream oss;
+    oss << "The primary instance may delete all types of files after they "
+           "become obsolete. The application can coordinate the primary and "
+           "secondary so that primary does not delete/rename files that are "
+           "currently being used by the secondary. Alternatively, a custom "
+           "Env/FS can be provided such that files become inaccessible only "
+           "after all primary and secondaries indicate that they are obsolete "
+           "and deleted. If the above two are not possible, you can open the "
+           "secondary instance with `max_open_files==-1` so that secondary "
+           "will eagerly keep all table files open. Even if a file is deleted, "
+           "its content can still be accessed via a prior open file "
+           "descriptor. This is a hacky workaround for only table files. If "
+           "none of the above is done, then point lookup or "
+           "range scan via the secondary instance can result in IOError: file "
+           "not found. This can be resolved by retrying "
+           "TryCatchUpWithPrimary().";
+    ROCKS_LOG_WARN(tmp_opts.info_log, "%s", oss.str().c_str());
+  }
+
+  handles->clear();
+  DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path);
+  impl->versions_.reset(new ReactiveVersionSet(
+      dbname, &impl->immutable_db_options_, impl->file_options_,
+      impl->table_cache_.get(), impl->write_buffer_manager_,
+      &impl->write_controller_, impl->io_tracer_));
+  impl->column_family_memtables_.reset(
+      new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
+  impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
+
+  impl->mutex_.Lock();
+  s = impl->Recover(column_families, true, false, false);
+  if (s.ok()) {
+    for (auto cf : column_families) {
+      auto cfd =
+          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+      if (nullptr == cfd) {
+        s = Status::InvalidArgument("Column family not found", cf.name);
+        break;
+      }
+      handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+    }
+  }
+  SuperVersionContext sv_context(true /* create_superversion */);
+  if (s.ok()) {
+    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+      sv_context.NewSuperVersion();
+      cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+    }
+  }
+  impl->mutex_.Unlock();
+  sv_context.Clean();
+  if (s.ok()) {
+    *dbptr = impl;
+    for (auto h : *handles) {
+      impl->NewThreadStatusCfInfo(
+          static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
+    }
+  } else {
+    for (auto h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete impl;
+  }
+  return s;
+}
+
+Status DBImplSecondary::CompactWithoutInstallation(
+    const OpenAndCompactOptions& options, ColumnFamilyHandle* cfh,
+    const CompactionServiceInput& input, CompactionServiceResult* result) {
+  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+  InstrumentedMutexLock l(&mutex_);
+  auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+  if (!cfd) {
+    return Status::InvalidArgument("Cannot find column family" +
+                                   cfh->GetName());
+  }
+
+  std::unordered_set<uint64_t> input_set;
+  for (const auto& file_name : input.input_files) {
+    input_set.insert(TableFileNameToNumber(file_name));
+  }
+
+  auto* version = cfd->current();
+
+  ColumnFamilyMetaData cf_meta;
+  version->GetColumnFamilyMetaData(&cf_meta);
+
+  const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+  ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions();
+  VersionStorageInfo* vstorage = version->storage_info();
+
+  // Use comp_options to reuse some CompactFiles functions
+  CompactionOptions comp_options;
+  comp_options.compression = kDisableCompressionOption;
+  comp_options.output_file_size_limit = MaxFileSizeForLevel(
+      *mutable_cf_options, input.output_level, cf_options.compaction_style,
+      vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes);
+
+  std::vector<CompactionInputFiles> input_files;
+  Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+      &input_files, &input_set, vstorage, comp_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<Compaction> c;
+  assert(cfd->compaction_picker());
+  c.reset(cfd->compaction_picker()->CompactFiles(
+      comp_options, input_files, input.output_level, vstorage,
+      *mutable_cf_options, mutable_db_options_, 0));
+  assert(c != nullptr);
+
+  c->SetInputVersion(version);
+
+  // Create output directory if it's not existed yet
+  std::unique_ptr<FSDirectory> output_dir;
+  s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir);
+  if (!s.ok()) {
+    return s;
+  }
+
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+                       immutable_db_options_.info_log.get());
+
+  const int job_id = next_job_id_.fetch_add(1);
+
+  // use primary host's db_id for running the compaction, but db_session_id is
+  // using the local one, which is to make sure the unique id is unique from
+  // the remote compactors. Because the id is generated from db_id,
+  // db_session_id and orig_file_number, unlike the local compaction, remote
+  // compaction cannot guarantee the uniqueness of orig_file_number, the file
+  // number is only assigned when compaction is done.
+  CompactionServiceCompactionJob compaction_job(
+      job_id, c.get(), immutable_db_options_, mutable_db_options_,
+      file_options_for_compaction_, versions_.get(), &shutting_down_,
+      &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_,
+      input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_,
+      options.canceled ? *options.canceled : kManualCompactionCanceledFalse_,
+      input.db_id, db_session_id_, secondary_path_, input, result);
+
+  mutex_.Unlock();
+  s = compaction_job.Run();
+  mutex_.Lock();
+
+  // clean up
+  compaction_job.io_status().PermitUncheckedError();
+  compaction_job.CleanupCompaction();
+  c->ReleaseCompactionFiles(s);
+  c.reset();
+
+  TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End",
+                           &s);
+  result->status = s;
+  return s;
+}
+
+Status DB::OpenAndCompact(
+    const OpenAndCompactOptions& options, const std::string& name,
+    const std::string& output_directory, const std::string& input,
+    std::string* output,
+    const CompactionServiceOptionsOverride& override_options) {
+  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+  CompactionServiceInput compaction_input;
+  Status s = CompactionServiceInput::Read(input, &compaction_input);
+  if (!s.ok()) {
+    return s;
+  }
+
+  compaction_input.db_options.max_open_files = -1;
+  compaction_input.db_options.compaction_service = nullptr;
+  if (compaction_input.db_options.statistics) {
+    compaction_input.db_options.statistics.reset();
+  }
+  compaction_input.db_options.env = override_options.env;
+  compaction_input.db_options.file_checksum_gen_factory =
+      override_options.file_checksum_gen_factory;
+  compaction_input.db_options.statistics = override_options.statistics;
+  compaction_input.column_family.options.comparator =
+      override_options.comparator;
+  compaction_input.column_family.options.merge_operator =
+      override_options.merge_operator;
+  compaction_input.column_family.options.compaction_filter =
+      override_options.compaction_filter;
+  compaction_input.column_family.options.compaction_filter_factory =
+      override_options.compaction_filter_factory;
+  compaction_input.column_family.options.prefix_extractor =
+      override_options.prefix_extractor;
+  compaction_input.column_family.options.table_factory =
+      override_options.table_factory;
+  compaction_input.column_family.options.sst_partitioner_factory =
+      override_options.sst_partitioner_factory;
+  compaction_input.column_family.options.table_properties_collector_factories =
+      override_options.table_properties_collector_factories;
+  compaction_input.db_options.listeners = override_options.listeners;
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(compaction_input.column_family);
+  // TODO: we have to open default CF, because of an implementation limitation,
+  // currently we just use the same CF option from input, which is not collect
+  // and open may fail.
+  if (compaction_input.column_family.name != kDefaultColumnFamilyName) {
+    column_families.emplace_back(kDefaultColumnFamilyName,
+                                 compaction_input.column_family.options);
+  }
+
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+
+  s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory,
+                          column_families, &handles, &db);
+  if (!s.ok()) {
+    return s;
+  }
+
+  CompactionServiceResult compaction_result;
+  DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db);
+  assert(handles.size() > 0);
+  s = db_secondary->CompactWithoutInstallation(
+      options, handles[0], compaction_input, &compaction_result);
+
+  Status serialization_status = compaction_result.Write(output);
+
+  for (auto& handle : handles) {
+    delete handle;
+  }
+  delete db;
+  if (s.ok()) {
+    return serialization_status;
+  }
+  return s;
+}
+
+Status DB::OpenAndCompact(
+    const std::string& name, const std::string& output_directory,
+    const std::string& input, std::string* output,
+    const CompactionServiceOptionsOverride& override_options) {
+  return OpenAndCompact(OpenAndCompactOptions(), name, output_directory, input,
+                        output, override_options);
+}
+
+#else   // !ROCKSDB_LITE
+
+Status DB::OpenAsSecondary(const Options& /*options*/,
+                           const std::string& /*name*/,
+                           const std::string& /*secondary_path*/,
+                           DB** /*dbptr*/) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenAsSecondary(
+    const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+    const std::string& /*secondary_path*/,
+    const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+    std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.h b/src/rocksdb/db/db_impl/db_impl_secondary.h
new file mode 100644
index 000000000..eb9361875
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.h
@@ -0,0 +1,410 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper class to hold log reader, log reporter, log status.
+class LogReaderContainer {
+ public:
+  LogReaderContainer()
+      : reader_(nullptr), reporter_(nullptr), status_(nullptr) {}
+  LogReaderContainer(Env* env, std::shared_ptr<Logger> info_log,
+                     std::string fname,
+                     std::unique_ptr<SequentialFileReader>&& file_reader,
+                     uint64_t log_number) {
+    LogReporter* reporter = new LogReporter();
+    status_ = new Status();
+    reporter->env = env;
+    reporter->info_log = info_log.get();
+    reporter->fname = std::move(fname);
+    reporter->status = status_;
+    reporter_ = reporter;
+    // We intentially make log::Reader do checksumming even if
+    // paranoid_checks==false so that corruptions cause entire commits
+    // to be skipped instead of propagating bad information (like overly
+    // large sequence numbers).
+    reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader),
+                                              reporter, true /*checksum*/,
+                                              log_number);
+  }
+  log::FragmentBufferedReader* reader_;
+  log::Reader::Reporter* reporter_;
+  Status* status_;
+  ~LogReaderContainer() {
+    delete reader_;
+    delete reporter_;
+    delete status_;
+  }
+
+ private:
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    std::string fname;
+    Status* status;  // nullptr if immutable_db_options_.paranoid_checks==false
+    void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+                     (this->status == nullptr ? "(ignoring error) " : ""),
+                     fname.c_str(), static_cast<int>(bytes),
+                     s.ToString().c_str());
+      if (this->status != nullptr && this->status->ok()) {
+        *this->status = s;
+      }
+    }
+  };
+};
+
+// The secondary instance shares access to the storage as the primary.
+// The secondary is able to read and replay changes described in both the
+// MANIFEST and the WAL files without coordination with the primary.
+// The secondary instance can be opened using `DB::OpenAsSecondary`. After
+// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best
+// effort attempts to catch up with the primary.
+// TODO: Share common structure with CompactedDBImpl and DBImplReadOnly
+class DBImplSecondary : public DBImpl {
+ public:
+  DBImplSecondary(const DBOptions& options, const std::string& dbname,
+                  std::string secondary_path);
+  ~DBImplSecondary() override;
+
+  // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
+  // and log_readers_ to facilitate future operations.
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only, bool error_if_wal_file_exists,
+                 bool error_if_data_exists_in_wals, uint64_t* = nullptr,
+                 RecoveryContext* recovery_ctx = nullptr) override;
+
+  // Implementations of the DB interface.
+  using DB::Get;
+  // Can return IOError due to files being deleted by the primary. To avoid
+  // IOError in this case, application can coordinate between primary and
+  // secondaries so that primary will not delete files that are currently being
+  // used by the secondaries. The application can also provide a custom FS/Env
+  // implementation so that files will remain present until all primary and
+  // secondaries indicate that they can be deleted. As a partial hacky
+  // workaround, the secondaries can be opened with `max_open_files=-1` so that
+  // it eagerly keeps all talbe files open and is able to access the contents of
+  // deleted files via prior open fd.
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value) override;
+
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value,
+             std::string* timestamp) override;
+
+  Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+                 const Slice& key, PinnableSlice* value,
+                 std::string* timestamp);
+
+  using DBImpl::NewIterator;
+  // Operations on the created iterators can return IOError due to files being
+  // deleted by the primary. To avoid IOError in this case, application can
+  // coordinate between primary and secondaries so that primary will not delete
+  // files that are currently being used by the secondaries. The application can
+  // also provide a custom FS/Env implementation so that files will remain
+  // present until all primary and secondaries indicate that they can be
+  // deleted. As a partial hacky workaround, the secondaries can be opened with
+  // `max_open_files=-1` so that it eagerly keeps all talbe files open and is
+  // able to access the contents of deleted files via prior open fd.
+  Iterator* NewIterator(const ReadOptions&,
+                        ColumnFamilyHandle* column_family) override;
+
+  ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options,
+                                      ColumnFamilyData* cfd,
+                                      SequenceNumber snapshot,
+                                      ReadCallback* read_callback,
+                                      bool expose_blob_index = false,
+                                      bool allow_refresh = true);
+
+  Status NewIterators(const ReadOptions& options,
+                      const std::vector<ColumnFamilyHandle*>& column_families,
+                      std::vector<Iterator*>* iterators) override;
+
+  using DBImpl::Put;
+  Status Put(const WriteOptions& /*options*/,
+             ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+             const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::PutEntity;
+  Status PutEntity(const WriteOptions& /* options */,
+                   ColumnFamilyHandle* /* column_family */,
+                   const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::Merge;
+  Status Merge(const WriteOptions& /*options*/,
+               ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+               const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::Delete;
+  Status Delete(const WriteOptions& /*options*/,
+                ColumnFamilyHandle* /*column_family*/,
+                const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SingleDelete;
+  Status SingleDelete(const WriteOptions& /*options*/,
+                      ColumnFamilyHandle* /*column_family*/,
+                      const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status Write(const WriteOptions& /*options*/,
+               WriteBatch* /*updates*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::CompactRange;
+  Status CompactRange(const CompactRangeOptions& /*options*/,
+                      ColumnFamilyHandle* /*column_family*/,
+                      const Slice* /*begin*/, const Slice* /*end*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::CompactFiles;
+  Status CompactFiles(
+      const CompactionOptions& /*compact_options*/,
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*input_file_names*/,
+      const int /*output_level*/, const int /*output_path_id*/ = -1,
+      std::vector<std::string>* const /*output_file_names*/ = nullptr,
+      CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status EnableFileDeletions(bool /*force*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  Status GetLiveFiles(std::vector<std::string>&,
+                      uint64_t* /*manifest_file_size*/,
+                      bool /*flush_memtable*/ = true) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::Flush;
+  Status Flush(const FlushOptions& /*options*/,
+               ColumnFamilyHandle* /*column_family*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SetDBOptions;
+  Status SetDBOptions(const std::unordered_map<std::string, std::string>&
+                      /*options_map*/) override {
+    // Currently not supported because changing certain options may cause
+    // flush/compaction.
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SetOptions;
+  Status SetOptions(
+      ColumnFamilyHandle* /*cfd*/,
+      const std::unordered_map<std::string, std::string>& /*options_map*/)
+      override {
+    // Currently not supported because changing certain options may cause
+    // flush/compaction and/or write to MANIFEST.
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DBImpl::SyncWAL;
+  Status SyncWAL() override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  using DB::IngestExternalFile;
+  Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*ingestion_options*/) override {
+    return Status::NotSupported("Not supported operation in secondary mode.");
+  }
+
+  // Try to catch up with the primary by reading as much as possible from the
+  // log files until there is nothing more to read or encounters an error. If
+  // the amount of information in the log files to process is huge, this
+  // method can take long time due to all the I/O and CPU costs.
+  Status TryCatchUpWithPrimary() override;
+
+  // Try to find log reader using log_number from log_readers_ map, initialize
+  // if it doesn't exist
+  Status MaybeInitLogReader(uint64_t log_number,
+                            log::FragmentBufferedReader** log_reader);
+
+  // Check if all live files exist on file system and that their file sizes
+  // matche to the in-memory records. It is possible that some live files may
+  // have been deleted by the primary. In this case, CheckConsistency() does
+  // not flag the missing file as inconsistency.
+  Status CheckConsistency() override;
+
+#ifndef NDEBUG
+  Status TEST_CompactWithoutInstallation(const OpenAndCompactOptions& options,
+                                         ColumnFamilyHandle* cfh,
+                                         const CompactionServiceInput& input,
+                                         CompactionServiceResult* result) {
+    return CompactWithoutInstallation(options, cfh, input, result);
+  }
+#endif  // NDEBUG
+
+ protected:
+#ifndef ROCKSDB_LITE
+  Status FlushForGetLiveFiles() override {
+    // No-op for read-only DB
+    return Status::OK();
+  }
+#endif  // !ROCKSDB_LITE
+
+  // ColumnFamilyCollector is a write batch handler which does nothing
+  // except recording unique column family IDs
+  class ColumnFamilyCollector : public WriteBatch::Handler {
+    std::unordered_set<uint32_t> column_family_ids_;
+
+    Status AddColumnFamilyId(uint32_t column_family_id) {
+      if (column_family_ids_.find(column_family_id) ==
+          column_family_ids_.end()) {
+        column_family_ids_.insert(column_family_id);
+      }
+      return Status::OK();
+    }
+
+   public:
+    explicit ColumnFamilyCollector() {}
+
+    ~ColumnFamilyCollector() override {}
+
+    Status PutCF(uint32_t column_family_id, const Slice&,
+                 const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status DeleteCF(uint32_t column_family_id, const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status DeleteRangeCF(uint32_t column_family_id, const Slice&,
+                         const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status MergeCF(uint32_t column_family_id, const Slice&,
+                   const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status PutBlobIndexCF(uint32_t column_family_id, const Slice&,
+                          const Slice&) override {
+      return AddColumnFamilyId(column_family_id);
+    }
+
+    Status MarkBeginPrepare(bool) override { return Status::OK(); }
+
+    Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+
+    Status MarkRollback(const Slice&) override { return Status::OK(); }
+
+    Status MarkCommit(const Slice&) override { return Status::OK(); }
+
+    Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+      return Status::OK();
+    }
+
+    Status MarkNoop(bool) override { return Status::OK(); }
+
+    const std::unordered_set<uint32_t>& column_families() const {
+      return column_family_ids_;
+    }
+  };
+
+  Status CollectColumnFamilyIdsFromWriteBatch(
+      const WriteBatch& batch, std::vector<uint32_t>* column_family_ids) {
+    assert(column_family_ids != nullptr);
+    column_family_ids->clear();
+    ColumnFamilyCollector handler;
+    Status s = batch.Iterate(&handler);
+    if (s.ok()) {
+      for (const auto& cf : handler.column_families()) {
+        column_family_ids->push_back(cf);
+      }
+    }
+    return s;
+  }
+
+  bool OwnTablesAndLogs() const override {
+    // Currently, the secondary instance does not own the database files. It
+    // simply opens the files of the primary instance and tracks their file
+    // descriptors until they become obsolete. In the future, the secondary may
+    // create links to database files. OwnTablesAndLogs will return true then.
+    return false;
+  }
+
+ private:
+  friend class DB;
+
+  // No copying allowed
+  DBImplSecondary(const DBImplSecondary&);
+  void operator=(const DBImplSecondary&);
+
+  using DBImpl::Recover;
+
+  Status FindAndRecoverLogFiles(
+      std::unordered_set<ColumnFamilyData*>* cfds_changed,
+      JobContext* job_context);
+  Status FindNewLogNumbers(std::vector<uint64_t>* logs);
+  // After manifest recovery, replay WALs and refresh log_readers_ if necessary
+  // REQUIRES: log_numbers are sorted in ascending order
+  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                         SequenceNumber* next_sequence,
+                         std::unordered_set<ColumnFamilyData*>* cfds_changed,
+                         JobContext* job_context);
+
+  // Run compaction without installation, the output files will be placed in the
+  // secondary DB path. The LSM tree won't be changed, the secondary DB is still
+  // in read-only mode.
+  Status CompactWithoutInstallation(const OpenAndCompactOptions& options,
+                                    ColumnFamilyHandle* cfh,
+                                    const CompactionServiceInput& input,
+                                    CompactionServiceResult* result);
+
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader_;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter_;
+  std::unique_ptr<Status> manifest_reader_status_;
+
+  // Cache log readers for each log number, used for continue WAL replay
+  // after recovery
+  std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
+
+  // Current WAL number replayed for each column family.
+  std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;
+
+  const std::string secondary_path_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_write.cc b/src/rocksdb/db/db_impl/db_impl_write.cc
new file mode 100644
index 000000000..a597c168d
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_write.cc
@@ -0,0 +1,2435 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "options/options_helper.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+                   const Slice& key, const Slice& val) {
+  const Status s = FailIfCfHasTs(column_family);
+  if (!s.ok()) {
+    return s;
+  }
+  return DB::Put(o, column_family, key, val);
+}
+
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+                   const Slice& key, const Slice& ts, const Slice& val) {
+  const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+  if (!s.ok()) {
+    return s;
+  }
+  return DB::Put(o, column_family, key, ts, val);
+}
+
+Status DBImpl::PutEntity(const WriteOptions& options,
+                         ColumnFamilyHandle* column_family, const Slice& key,
+                         const WideColumns& columns) {
+  const Status s = FailIfCfHasTs(column_family);
+  if (!s.ok()) {
+    return s;
+  }
+
+  return DB::PutEntity(options, column_family, key, columns);
+}
+
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+                     const Slice& key, const Slice& val) {
+  const Status s = FailIfCfHasTs(column_family);
+  if (!s.ok()) {
+    return s;
+  }
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  if (!cfh->cfd()->ioptions()->merge_operator) {
+    return Status::NotSupported("Provide a merge_operator when opening DB");
+  } else {
+    return DB::Merge(o, column_family, key, val);
+  }
+}
+
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+                     const Slice& key, const Slice& ts, const Slice& val) {
+  const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+  if (!s.ok()) {
+    return s;
+  }
+  return DB::Merge(o, column_family, key, ts, val);
+}
+
+Status DBImpl::Delete(const WriteOptions& write_options,
+                      ColumnFamilyHandle* column_family, const Slice& key) {
+  const Status s = FailIfCfHasTs(column_family);
+  if (!s.ok()) {
+    return s;
+  }
+  return DB::Delete(write_options, column_family, key);
+}
+
+Status DBImpl::Delete(const WriteOptions& write_options,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& ts) {
+  const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+  if (!s.ok()) {
+    return s;
+  }
+  return DB::Delete(write_options, column_family, key, ts);
+}
+
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+                            ColumnFamilyHandle* column_family,
+                            const Slice& key) {
+  const Status s = FailIfCfHasTs(column_family);
+  if (!s.ok()) {
+    return s;
+  }
+  return DB::SingleDelete(write_options, column_family, key);
+}
+
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            const Slice& ts) {
+  const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+  if (!s.ok()) {
+    return s;
+  }
+  return DB::SingleDelete(write_options, column_family, key, ts);
+}
+
+Status DBImpl::DeleteRange(const WriteOptions& write_options,
+                           ColumnFamilyHandle* column_family,
+                           const Slice& begin_key, const Slice& end_key) {
+  const Status s = FailIfCfHasTs(column_family);
+  if (!s.ok()) {
+    return s;
+  }
+  return DB::DeleteRange(write_options, column_family, begin_key, end_key);
+}
+
+Status DBImpl::DeleteRange(const WriteOptions& write_options,
+                           ColumnFamilyHandle* column_family,
+                           const Slice& begin_key, const Slice& end_key,
+                           const Slice& ts) {
+  const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+  if (!s.ok()) {
+    return s;
+  }
+  return DB::DeleteRange(write_options, column_family, begin_key, end_key, ts);
+}
+
+void DBImpl::SetRecoverableStatePreReleaseCallback(
+    PreReleaseCallback* callback) {
+  recoverable_state_pre_release_callback_.reset(callback);
+}
+
+Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
+  Status s;
+  if (write_options.protection_bytes_per_key > 0) {
+    s = WriteBatchInternal::UpdateProtectionInfo(
+        my_batch, write_options.protection_bytes_per_key);
+  }
+  if (s.ok()) {
+    s = WriteImpl(write_options, my_batch, /*callback=*/nullptr,
+                  /*log_used=*/nullptr);
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
+                                 WriteBatch* my_batch,
+                                 WriteCallback* callback) {
+  Status s;
+  if (write_options.protection_bytes_per_key > 0) {
+    s = WriteBatchInternal::UpdateProtectionInfo(
+        my_batch, write_options.protection_bytes_per_key);
+  }
+  if (s.ok()) {
+    s = WriteImpl(write_options, my_batch, callback, nullptr);
+  }
+  return s;
+}
+#endif  // ROCKSDB_LITE
+
+// The main write queue. This is the only write queue that updates LastSequence.
+// When using one write queue, the same sequence also indicates the last
+// published sequence.
+Status DBImpl::WriteImpl(const WriteOptions& write_options,
+                         WriteBatch* my_batch, WriteCallback* callback,
+                         uint64_t* log_used, uint64_t log_ref,
+                         bool disable_memtable, uint64_t* seq_used,
+                         size_t batch_cnt,
+                         PreReleaseCallback* pre_release_callback,
+                         PostMemTableCallback* post_memtable_callback) {
+  assert(!seq_per_batch_ || batch_cnt != 0);
+  assert(my_batch == nullptr || my_batch->Count() == 0 ||
+         write_options.protection_bytes_per_key == 0 ||
+         write_options.protection_bytes_per_key ==
+             my_batch->GetProtectionBytesPerKey());
+  if (my_batch == nullptr) {
+    return Status::InvalidArgument("Batch is nullptr!");
+  } else if (!disable_memtable &&
+             WriteBatchInternal::TimestampsUpdateNeeded(*my_batch)) {
+    // If writing to memtable, then we require the caller to set/update the
+    // timestamps for the keys in the write batch.
+    // Otherwise, it means we are just writing to the WAL, and we allow
+    // timestamps unset for the keys in the write batch. This can happen if we
+    // use TransactionDB with write-committed policy, and we currently do not
+    // support user-defined timestamp with other policies.
+    // In the prepare phase, a transaction can write the batch to the WAL
+    // without inserting to memtable. The keys in the batch do not have to be
+    // assigned timestamps because they will be used only during recovery if
+    // there is a commit marker which includes their commit timestamp.
+    return Status::InvalidArgument("write batch must have timestamp(s) set");
+  } else if (write_options.rate_limiter_priority != Env::IO_TOTAL &&
+             write_options.rate_limiter_priority != Env::IO_USER) {
+    return Status::InvalidArgument(
+        "WriteOptions::rate_limiter_priority only allows "
+        "Env::IO_TOTAL and Env::IO_USER due to implementation constraints");
+  } else if (write_options.rate_limiter_priority != Env::IO_TOTAL &&
+             (write_options.disableWAL || manual_wal_flush_)) {
+    return Status::InvalidArgument(
+        "WriteOptions::rate_limiter_priority currently only supports "
+        "rate-limiting automatic WAL flush, which requires "
+        "`WriteOptions::disableWAL` and "
+        "`DBOptions::manual_wal_flush` both set to false");
+  } else if (write_options.protection_bytes_per_key != 0 &&
+             write_options.protection_bytes_per_key != 8) {
+    return Status::InvalidArgument(
+        "`WriteOptions::protection_bytes_per_key` must be zero or eight");
+  }
+  // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+  // grabs but does not seem thread-safe.
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_ && !tracer_->IsWriteOrderPreserved()) {
+      // We don't have to preserve write order so can trace anywhere. It's more
+      // efficient to trace here than to add latency to a phase of the log/apply
+      // pipeline.
+      // TODO: maybe handle the tracing status?
+      tracer_->Write(my_batch).PermitUncheckedError();
+    }
+  }
+  if (write_options.sync && write_options.disableWAL) {
+    return Status::InvalidArgument("Sync writes has to enable WAL.");
+  }
+  if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) {
+    return Status::NotSupported(
+        "pipelined_writes is not compatible with concurrent prepares");
+  }
+  if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
+    // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt
+    return Status::NotSupported(
+        "pipelined_writes is not compatible with seq_per_batch");
+  }
+  if (immutable_db_options_.unordered_write &&
+      immutable_db_options_.enable_pipelined_write) {
+    return Status::NotSupported(
+        "pipelined_writes is not compatible with unordered_write");
+  }
+  if (immutable_db_options_.enable_pipelined_write &&
+      post_memtable_callback != nullptr) {
+    return Status::NotSupported(
+        "pipelined write currently does not honor post_memtable_callback");
+  }
+  if (seq_per_batch_ && post_memtable_callback != nullptr) {
+    return Status::NotSupported(
+        "seq_per_batch currently does not honor post_memtable_callback");
+  }
+  // Otherwise IsLatestPersistentState optimization does not make sense
+  assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
+         disable_memtable);
+
+  if (write_options.low_pri) {
+    Status s = ThrottleLowPriWritesIfNeeded(write_options, my_batch);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (two_write_queues_ && disable_memtable) {
+    AssignOrder assign_order =
+        seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder;
+    // Otherwise it is WAL-only Prepare batches in WriteCommitted policy and
+    // they don't consume sequence.
+    return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch,
+                            callback, log_used, log_ref, seq_used, batch_cnt,
+                            pre_release_callback, assign_order,
+                            kDontPublishLastSeq, disable_memtable);
+  }
+
+  if (immutable_db_options_.unordered_write) {
+    const size_t sub_batch_cnt = batch_cnt != 0
+                                     ? batch_cnt
+                                     // every key is a sub-batch consuming a seq
+                                     : WriteBatchInternal::Count(my_batch);
+    uint64_t seq = 0;
+    // Use a write thread to i) optimize for WAL write, ii) publish last
+    // sequence in in increasing order, iii) call pre_release_callback serially
+    Status status = WriteImplWALOnly(
+        &write_thread_, write_options, my_batch, callback, log_used, log_ref,
+        &seq, sub_batch_cnt, pre_release_callback, kDoAssignOrder,
+        kDoPublishLastSeq, disable_memtable);
+    TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
+    if (!status.ok()) {
+      return status;
+    }
+    if (seq_used) {
+      *seq_used = seq;
+    }
+    if (!disable_memtable) {
+      TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable");
+      status = UnorderedWriteMemtable(write_options, my_batch, callback,
+                                      log_ref, seq, sub_batch_cnt);
+    }
+    return status;
+  }
+
+  if (immutable_db_options_.enable_pipelined_write) {
+    return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
+                              log_ref, disable_memtable, seq_used);
+  }
+
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+                        disable_memtable, batch_cnt, pre_release_callback,
+                        post_memtable_callback);
+  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+  write_thread_.JoinBatchGroup(&w);
+  if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+    // we are a non-leader in a parallel group
+
+    if (w.ShouldWriteToMemtable()) {
+      PERF_TIMER_STOP(write_pre_and_post_process_time);
+      PERF_TIMER_GUARD(write_memtable_time);
+
+      ColumnFamilyMemTablesImpl column_family_memtables(
+          versions_->GetColumnFamilySet());
+      w.status = WriteBatchInternal::InsertInto(
+          &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+          &trim_history_scheduler_,
+          write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+          true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt,
+          batch_per_txn_, write_options.memtable_insert_hint_per_batch);
+
+      PERF_TIMER_START(write_pre_and_post_process_time);
+    }
+
+    if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+      // we're responsible for exit batch group
+      // TODO(myabandeh): propagate status to write_group
+      auto last_sequence = w.write_group->last_sequence;
+      for (auto* tmp_w : *(w.write_group)) {
+        assert(tmp_w);
+        if (tmp_w->post_memtable_callback) {
+          Status tmp_s =
+              (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+          // TODO: propagate the execution status of post_memtable_callback to
+          // caller.
+          assert(tmp_s.ok());
+        }
+      }
+      versions_->SetLastSequence(last_sequence);
+      MemTableInsertStatusCheck(w.status);
+      write_thread_.ExitAsBatchGroupFollower(&w);
+    }
+    assert(w.state == WriteThread::STATE_COMPLETED);
+    // STATE_COMPLETED conditional below handles exit
+  }
+  if (w.state == WriteThread::STATE_COMPLETED) {
+    if (log_used != nullptr) {
+      *log_used = w.log_used;
+    }
+    if (seq_used != nullptr) {
+      *seq_used = w.sequence;
+    }
+    // write is complete and leader has updated sequence
+    return w.FinalStatus();
+  }
+  // else we are the leader of the write batch group
+  assert(w.state == WriteThread::STATE_GROUP_LEADER);
+  Status status;
+  // Once reaches this point, the current writer "w" will try to do its write
+  // job.  It may also pick up some of the remaining writers in the "writers_"
+  // when it finds suitable, and finish them in the same write batch.
+  // This is how a write job could be done by the other writer.
+  WriteContext write_context;
+  LogContext log_context(write_options.sync);
+  WriteThread::WriteGroup write_group;
+  bool in_parallel_group = false;
+  uint64_t last_sequence = kMaxSequenceNumber;
+
+  assert(!two_write_queues_ || !disable_memtable);
+  {
+    // With concurrent writes we do preprocess only in the write thread that
+    // also does write to memtable to avoid sync issue on shared data structure
+    // with the other thread
+
+    // PreprocessWrite does its own perf timing.
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+    status = PreprocessWrite(write_options, &log_context, &write_context);
+    if (!two_write_queues_) {
+      // Assign it after ::PreprocessWrite since the sequence might advance
+      // inside it by WriteRecoverableState
+      last_sequence = versions_->LastSequence();
+    }
+
+    PERF_TIMER_START(write_pre_and_post_process_time);
+  }
+
+  // Add to log and apply to memtable.  We can release the lock
+  // during this phase since &w is currently responsible for logging
+  // and protects against concurrent loggers and concurrent writes
+  // into memtables
+
+  TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
+  last_batch_group_size_ =
+      write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
+
+  IOStatus io_s;
+  Status pre_release_cb_status;
+  if (status.ok()) {
+    // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+    // grabs but does not seem thread-safe.
+    if (tracer_) {
+      InstrumentedMutexLock lock(&trace_mutex_);
+      if (tracer_ && tracer_->IsWriteOrderPreserved()) {
+        for (auto* writer : write_group) {
+          // TODO: maybe handle the tracing status?
+          tracer_->Write(writer->batch).PermitUncheckedError();
+        }
+      }
+    }
+    // Rules for when we can update the memtable concurrently
+    // 1. supported by memtable
+    // 2. Puts are not okay if inplace_update_support
+    // 3. Merges are not okay
+    //
+    // Rules 1..2 are enforced by checking the options
+    // during startup (CheckConcurrentWritesSupported), so if
+    // options.allow_concurrent_memtable_write is true then they can be
+    // assumed to be true.  Rule 3 is checked for each batch.  We could
+    // relax rules 2 if we could prevent write batches from referring
+    // more than once to a particular key.
+    bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
+                    write_group.size > 1;
+    size_t total_count = 0;
+    size_t valid_batches = 0;
+    size_t total_byte_size = 0;
+    size_t pre_release_callback_cnt = 0;
+    for (auto* writer : write_group) {
+      assert(writer);
+      if (writer->CheckCallback(this)) {
+        valid_batches += writer->batch_cnt;
+        if (writer->ShouldWriteToMemtable()) {
+          total_count += WriteBatchInternal::Count(writer->batch);
+          parallel = parallel && !writer->batch->HasMerge();
+        }
+        total_byte_size = WriteBatchInternal::AppendedByteSize(
+            total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+        if (writer->pre_release_callback) {
+          pre_release_callback_cnt++;
+        }
+      }
+    }
+    // Note about seq_per_batch_: either disableWAL is set for the entire write
+    // group or not. In either case we inc seq for each write batch with no
+    // failed callback. This means that there could be a batch with
+    // disalbe_memtable in between; although we do not write this batch to
+    // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
+    // the seq per valid written key to mem.
+    size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
+
+    const bool concurrent_update = two_write_queues_;
+    // Update stats while we are an exclusive group leader, so we know
+    // that nobody else can be writing to these particular stats.
+    // We're optimistic, updating the stats before we successfully
+    // commit.  That lets us release our leader status early.
+    auto stats = default_cf_internal_stats_;
+    stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count,
+                      concurrent_update);
+    RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+    stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+                      concurrent_update);
+    RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+    stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+                      concurrent_update);
+    RecordTick(stats_, WRITE_DONE_BY_SELF);
+    auto write_done_by_other = write_group.size - 1;
+    if (write_done_by_other > 0) {
+      stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+                        write_done_by_other, concurrent_update);
+      RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+    }
+    RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+    if (write_options.disableWAL) {
+      has_unpersisted_data_.store(true, std::memory_order_relaxed);
+    }
+
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+    if (!two_write_queues_) {
+      if (status.ok() && !write_options.disableWAL) {
+        assert(log_context.log_file_number_size);
+        LogFileNumberSize& log_file_number_size =
+            *(log_context.log_file_number_size);
+        PERF_TIMER_GUARD(write_wal_time);
+        io_s =
+            WriteToWAL(write_group, log_context.writer, log_used,
+                       log_context.need_log_sync, log_context.need_log_dir_sync,
+                       last_sequence + 1, log_file_number_size);
+      }
+    } else {
+      if (status.ok() && !write_options.disableWAL) {
+        PERF_TIMER_GUARD(write_wal_time);
+        // LastAllocatedSequence is increased inside WriteToWAL under
+        // wal_write_mutex_ to ensure ordered events in WAL
+        io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
+                                    seq_inc);
+      } else {
+        // Otherwise we inc seq number for memtable writes
+        last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+      }
+    }
+    status = io_s;
+    assert(last_sequence != kMaxSequenceNumber);
+    const SequenceNumber current_sequence = last_sequence + 1;
+    last_sequence += seq_inc;
+
+    // PreReleaseCallback is called after WAL write and before memtable write
+    if (status.ok()) {
+      SequenceNumber next_sequence = current_sequence;
+      size_t index = 0;
+      // Note: the logic for advancing seq here must be consistent with the
+      // logic in WriteBatchInternal::InsertInto(write_group...) as well as
+      // with WriteBatchInternal::InsertInto(write_batch...) that is called on
+      // the merged batch during recovery from the WAL.
+      for (auto* writer : write_group) {
+        if (writer->CallbackFailed()) {
+          continue;
+        }
+        writer->sequence = next_sequence;
+        if (writer->pre_release_callback) {
+          Status ws = writer->pre_release_callback->Callback(
+              writer->sequence, disable_memtable, writer->log_used, index++,
+              pre_release_callback_cnt);
+          if (!ws.ok()) {
+            status = pre_release_cb_status = ws;
+            break;
+          }
+        }
+        if (seq_per_batch_) {
+          assert(writer->batch_cnt);
+          next_sequence += writer->batch_cnt;
+        } else if (writer->ShouldWriteToMemtable()) {
+          next_sequence += WriteBatchInternal::Count(writer->batch);
+        }
+      }
+    }
+
+    if (status.ok()) {
+      PERF_TIMER_GUARD(write_memtable_time);
+
+      if (!parallel) {
+        // w.sequence will be set inside InsertInto
+        w.status = WriteBatchInternal::InsertInto(
+            write_group, current_sequence, column_family_memtables_.get(),
+            &flush_scheduler_, &trim_history_scheduler_,
+            write_options.ignore_missing_column_families,
+            0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
+            batch_per_txn_);
+      } else {
+        write_group.last_sequence = last_sequence;
+        write_thread_.LaunchParallelMemTableWriters(&write_group);
+        in_parallel_group = true;
+
+        // Each parallel follower is doing each own writes. The leader should
+        // also do its own.
+        if (w.ShouldWriteToMemtable()) {
+          ColumnFamilyMemTablesImpl column_family_memtables(
+              versions_->GetColumnFamilySet());
+          assert(w.sequence == current_sequence);
+          w.status = WriteBatchInternal::InsertInto(
+              &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+              &trim_history_scheduler_,
+              write_options.ignore_missing_column_families, 0 /*log_number*/,
+              this, true /*concurrent_memtable_writes*/, seq_per_batch_,
+              w.batch_cnt, batch_per_txn_,
+              write_options.memtable_insert_hint_per_batch);
+        }
+      }
+      if (seq_used != nullptr) {
+        *seq_used = w.sequence;
+      }
+    }
+  }
+  PERF_TIMER_START(write_pre_and_post_process_time);
+
+  if (!io_s.ok()) {
+    // Check WriteToWAL status
+    IOStatusCheck(io_s);
+  }
+  if (!w.CallbackFailed()) {
+    if (!io_s.ok()) {
+      assert(pre_release_cb_status.ok());
+    } else {
+      WriteStatusCheck(pre_release_cb_status);
+    }
+  } else {
+    assert(pre_release_cb_status.ok());
+  }
+
+  if (log_context.need_log_sync) {
+    VersionEdit synced_wals;
+    log_write_mutex_.Lock();
+    if (status.ok()) {
+      MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+                     &synced_wals);
+    } else {
+      MarkLogsNotSynced(logfile_number_);
+    }
+    log_write_mutex_.Unlock();
+    if (status.ok() && synced_wals.IsWalAddition()) {
+      InstrumentedMutexLock l(&mutex_);
+      status = ApplyWALToManifest(&synced_wals);
+    }
+
+    // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
+    if (two_write_queues_) {
+      if (manual_wal_flush_) {
+        status = FlushWAL(true);
+      } else {
+        status = SyncWAL();
+      }
+    }
+  }
+
+  bool should_exit_batch_group = true;
+  if (in_parallel_group) {
+    // CompleteParallelWorker returns true if this thread should
+    // handle exit, false means somebody else did
+    should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
+  }
+  if (should_exit_batch_group) {
+    if (status.ok()) {
+      for (auto* tmp_w : write_group) {
+        assert(tmp_w);
+        if (tmp_w->post_memtable_callback) {
+          Status tmp_s =
+              (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+          // TODO: propagate the execution status of post_memtable_callback to
+          // caller.
+          assert(tmp_s.ok());
+        }
+      }
+      // Note: if we are to resume after non-OK statuses we need to revisit how
+      // we reacts to non-OK statuses here.
+      versions_->SetLastSequence(last_sequence);
+    }
+    MemTableInsertStatusCheck(w.status);
+    write_thread_.ExitAsBatchGroupLeader(write_group, status);
+  }
+
+  if (status.ok()) {
+    status = w.FinalStatus();
+  }
+  return status;
+}
+
+Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
+                                  WriteBatch* my_batch, WriteCallback* callback,
+                                  uint64_t* log_used, uint64_t log_ref,
+                                  bool disable_memtable, uint64_t* seq_used) {
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+  WriteContext write_context;
+
+  WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+                        disable_memtable, /*_batch_cnt=*/0,
+                        /*_pre_release_callback=*/nullptr);
+  write_thread_.JoinBatchGroup(&w);
+  TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup");
+  if (w.state == WriteThread::STATE_GROUP_LEADER) {
+    WriteThread::WriteGroup wal_write_group;
+    if (w.callback && !w.callback->AllowWriteBatching()) {
+      write_thread_.WaitForMemTableWriters();
+    }
+    LogContext log_context(!write_options.disableWAL && write_options.sync);
+    // PreprocessWrite does its own perf timing.
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+    w.status = PreprocessWrite(write_options, &log_context, &write_context);
+    PERF_TIMER_START(write_pre_and_post_process_time);
+
+    // This can set non-OK status if callback fail.
+    last_batch_group_size_ =
+        write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
+    const SequenceNumber current_sequence =
+        write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
+    size_t total_count = 0;
+    size_t total_byte_size = 0;
+
+    if (w.status.ok()) {
+      // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+      // grabs but does not seem thread-safe.
+      if (tracer_) {
+        InstrumentedMutexLock lock(&trace_mutex_);
+        if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+          for (auto* writer : wal_write_group) {
+            // TODO: maybe handle the tracing status?
+            tracer_->Write(writer->batch).PermitUncheckedError();
+          }
+        }
+      }
+      SequenceNumber next_sequence = current_sequence;
+      for (auto* writer : wal_write_group) {
+        assert(writer);
+        if (writer->CheckCallback(this)) {
+          if (writer->ShouldWriteToMemtable()) {
+            writer->sequence = next_sequence;
+            size_t count = WriteBatchInternal::Count(writer->batch);
+            next_sequence += count;
+            total_count += count;
+          }
+          total_byte_size = WriteBatchInternal::AppendedByteSize(
+              total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+        }
+      }
+      if (w.disable_wal) {
+        has_unpersisted_data_.store(true, std::memory_order_relaxed);
+      }
+      write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
+    }
+
+    auto stats = default_cf_internal_stats_;
+    stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+    RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+    stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
+    RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+    RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+    IOStatus io_s;
+    io_s.PermitUncheckedError();  // Allow io_s to be uninitialized
+
+    if (w.status.ok() && !write_options.disableWAL) {
+      PERF_TIMER_GUARD(write_wal_time);
+      stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
+      RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
+      if (wal_write_group.size > 1) {
+        stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+                          wal_write_group.size - 1);
+        RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
+      }
+      assert(log_context.log_file_number_size);
+      LogFileNumberSize& log_file_number_size =
+          *(log_context.log_file_number_size);
+      io_s =
+          WriteToWAL(wal_write_group, log_context.writer, log_used,
+                     log_context.need_log_sync, log_context.need_log_dir_sync,
+                     current_sequence, log_file_number_size);
+      w.status = io_s;
+    }
+
+    if (!io_s.ok()) {
+      // Check WriteToWAL status
+      IOStatusCheck(io_s);
+    } else if (!w.CallbackFailed()) {
+      WriteStatusCheck(w.status);
+    }
+
+    VersionEdit synced_wals;
+    if (log_context.need_log_sync) {
+      InstrumentedMutexLock l(&log_write_mutex_);
+      if (w.status.ok()) {
+        MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+                       &synced_wals);
+      } else {
+        MarkLogsNotSynced(logfile_number_);
+      }
+    }
+    if (w.status.ok() && synced_wals.IsWalAddition()) {
+      InstrumentedMutexLock l(&mutex_);
+      w.status = ApplyWALToManifest(&synced_wals);
+    }
+    write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
+  }
+
+  // NOTE: the memtable_write_group is declared before the following
+  // `if` statement because its lifetime needs to be longer
+  // that the inner context  of the `if` as a reference to it
+  // may be used further below within the outer _write_thread
+  WriteThread::WriteGroup memtable_write_group;
+
+  if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
+    PERF_TIMER_GUARD(write_memtable_time);
+    assert(w.ShouldWriteToMemtable());
+    write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
+    if (memtable_write_group.size > 1 &&
+        immutable_db_options_.allow_concurrent_memtable_write) {
+      write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
+    } else {
+      memtable_write_group.status = WriteBatchInternal::InsertInto(
+          memtable_write_group, w.sequence, column_family_memtables_.get(),
+          &flush_scheduler_, &trim_history_scheduler_,
+          write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+          false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
+      versions_->SetLastSequence(memtable_write_group.last_sequence);
+      write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
+    }
+  } else {
+    // NOTE: the memtable_write_group is never really used,
+    // so we need to set its status to pass ASSERT_STATUS_CHECKED
+    memtable_write_group.status.PermitUncheckedError();
+  }
+
+  if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+    assert(w.ShouldWriteToMemtable());
+    ColumnFamilyMemTablesImpl column_family_memtables(
+        versions_->GetColumnFamilySet());
+    w.status = WriteBatchInternal::InsertInto(
+        &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+        &trim_history_scheduler_, write_options.ignore_missing_column_families,
+        0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+        false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/,
+        write_options.memtable_insert_hint_per_batch);
+    if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+      MemTableInsertStatusCheck(w.status);
+      versions_->SetLastSequence(w.write_group->last_sequence);
+      write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
+    }
+  }
+  if (seq_used != nullptr) {
+    *seq_used = w.sequence;
+  }
+
+  assert(w.state == WriteThread::STATE_COMPLETED);
+  return w.FinalStatus();
+}
+
+Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
+                                      WriteBatch* my_batch,
+                                      WriteCallback* callback, uint64_t log_ref,
+                                      SequenceNumber seq,
+                                      const size_t sub_batch_cnt) {
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+  WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+                        false /*disable_memtable*/);
+
+  if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) {
+    w.sequence = seq;
+    size_t total_count = WriteBatchInternal::Count(my_batch);
+    InternalStats* stats = default_cf_internal_stats_;
+    stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+    RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+
+    ColumnFamilyMemTablesImpl column_family_memtables(
+        versions_->GetColumnFamilySet());
+    w.status = WriteBatchInternal::InsertInto(
+        &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+        &trim_history_scheduler_, write_options.ignore_missing_column_families,
+        0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+        seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/,
+        write_options.memtable_insert_hint_per_batch);
+    if (write_options.disableWAL) {
+      has_unpersisted_data_.store(true, std::memory_order_relaxed);
+    }
+  }
+
+  size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1;
+  if (pending_cnt == 0) {
+    // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex
+    // before notify ensures that cv is in waiting state when it is notified
+    // thus not missing the update to pending_memtable_writes_ even though it is
+    // not modified under the mutex.
+    std::lock_guard<std::mutex> lck(switch_mutex_);
+    switch_cv_.notify_all();
+  }
+  WriteStatusCheck(w.status);
+
+  if (!w.FinalStatus().ok()) {
+    return w.FinalStatus();
+  }
+  return Status::OK();
+}
+
+// The 2nd write queue. If enabled it will be used only for WAL-only writes.
+// This is the only queue that updates LastPublishedSequence which is only
+// applicable in a two-queue setting.
+Status DBImpl::WriteImplWALOnly(
+    WriteThread* write_thread, const WriteOptions& write_options,
+    WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used,
+    const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+    PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+    const PublishLastSeq publish_last_seq, const bool disable_memtable) {
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+  WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+                        disable_memtable, sub_batch_cnt, pre_release_callback);
+  StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+  write_thread->JoinBatchGroup(&w);
+  assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
+  if (w.state == WriteThread::STATE_COMPLETED) {
+    if (log_used != nullptr) {
+      *log_used = w.log_used;
+    }
+    if (seq_used != nullptr) {
+      *seq_used = w.sequence;
+    }
+    return w.FinalStatus();
+  }
+  // else we are the leader of the write batch group
+  assert(w.state == WriteThread::STATE_GROUP_LEADER);
+
+  if (publish_last_seq == kDoPublishLastSeq) {
+    Status status;
+
+    // Currently we only use kDoPublishLastSeq in unordered_write
+    assert(immutable_db_options_.unordered_write);
+    WriteContext write_context;
+    if (error_handler_.IsDBStopped()) {
+      status = error_handler_.GetBGError();
+    }
+    // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
+    // without paying the cost of obtaining the mutex.
+    if (status.ok()) {
+      LogContext log_context;
+      status = PreprocessWrite(write_options, &log_context, &write_context);
+      WriteStatusCheckOnLocked(status);
+    }
+    if (!status.ok()) {
+      WriteThread::WriteGroup write_group;
+      write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+      write_thread->ExitAsBatchGroupLeader(write_group, status);
+      return status;
+    }
+  }
+
+  WriteThread::WriteGroup write_group;
+  uint64_t last_sequence;
+  write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+  // Note: no need to update last_batch_group_size_ here since the batch writes
+  // to WAL only
+  // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+  // grabs but does not seem thread-safe.
+  if (tracer_) {
+    InstrumentedMutexLock lock(&trace_mutex_);
+    if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+      for (auto* writer : write_group) {
+        // TODO: maybe handle the tracing status?
+        tracer_->Write(writer->batch).PermitUncheckedError();
+      }
+    }
+  }
+
+  size_t pre_release_callback_cnt = 0;
+  size_t total_byte_size = 0;
+  for (auto* writer : write_group) {
+    assert(writer);
+    if (writer->CheckCallback(this)) {
+      total_byte_size = WriteBatchInternal::AppendedByteSize(
+          total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+      if (writer->pre_release_callback) {
+        pre_release_callback_cnt++;
+      }
+    }
+  }
+
+  const bool concurrent_update = true;
+  // Update stats while we are an exclusive group leader, so we know
+  // that nobody else can be writing to these particular stats.
+  // We're optimistic, updating the stats before we successfully
+  // commit.  That lets us release our leader status early.
+  auto stats = default_cf_internal_stats_;
+  stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+                    concurrent_update);
+  RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+  stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+                    concurrent_update);
+  RecordTick(stats_, WRITE_DONE_BY_SELF);
+  auto write_done_by_other = write_group.size - 1;
+  if (write_done_by_other > 0) {
+    stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+                      write_done_by_other, concurrent_update);
+    RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+  }
+  RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+  PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+  PERF_TIMER_GUARD(write_wal_time);
+  // LastAllocatedSequence is increased inside WriteToWAL under
+  // wal_write_mutex_ to ensure ordered events in WAL
+  size_t seq_inc = 0 /* total_count */;
+  if (assign_order == kDoAssignOrder) {
+    size_t total_batch_cnt = 0;
+    for (auto* writer : write_group) {
+      assert(writer->batch_cnt || !seq_per_batch_);
+      if (!writer->CallbackFailed()) {
+        total_batch_cnt += writer->batch_cnt;
+      }
+    }
+    seq_inc = total_batch_cnt;
+  }
+  Status status;
+  if (!write_options.disableWAL) {
+    IOStatus io_s =
+        ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+    status = io_s;
+    // last_sequence may not be set if there is an error
+    // This error checking and return is moved up to avoid using uninitialized
+    // last_sequence.
+    if (!io_s.ok()) {
+      IOStatusCheck(io_s);
+      write_thread->ExitAsBatchGroupLeader(write_group, status);
+      return status;
+    }
+  } else {
+    // Otherwise we inc seq number to do solely the seq allocation
+    last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+  }
+
+  size_t memtable_write_cnt = 0;
+  auto curr_seq = last_sequence + 1;
+  for (auto* writer : write_group) {
+    if (writer->CallbackFailed()) {
+      continue;
+    }
+    writer->sequence = curr_seq;
+    if (assign_order == kDoAssignOrder) {
+      assert(writer->batch_cnt || !seq_per_batch_);
+      curr_seq += writer->batch_cnt;
+    }
+    if (!writer->disable_memtable) {
+      memtable_write_cnt++;
+    }
+    // else seq advances only by memtable writes
+  }
+  if (status.ok() && write_options.sync) {
+    assert(!write_options.disableWAL);
+    // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hance provide a simple implementation that is not necessarily efficient.
+    if (manual_wal_flush_) {
+      status = FlushWAL(true);
+    } else {
+      status = SyncWAL();
+    }
+  }
+  PERF_TIMER_START(write_pre_and_post_process_time);
+
+  if (!w.CallbackFailed()) {
+    WriteStatusCheck(status);
+  }
+  if (status.ok()) {
+    size_t index = 0;
+    for (auto* writer : write_group) {
+      if (!writer->CallbackFailed() && writer->pre_release_callback) {
+        assert(writer->sequence != kMaxSequenceNumber);
+        Status ws = writer->pre_release_callback->Callback(
+            writer->sequence, disable_memtable, writer->log_used, index++,
+            pre_release_callback_cnt);
+        if (!ws.ok()) {
+          status = ws;
+          break;
+        }
+      }
+    }
+  }
+  if (publish_last_seq == kDoPublishLastSeq) {
+    versions_->SetLastSequence(last_sequence + seq_inc);
+    // Currently we only use kDoPublishLastSeq in unordered_write
+    assert(immutable_db_options_.unordered_write);
+  }
+  if (immutable_db_options_.unordered_write && status.ok()) {
+    pending_memtable_writes_ += memtable_write_cnt;
+  }
+  write_thread->ExitAsBatchGroupLeader(write_group, status);
+  if (status.ok()) {
+    status = w.FinalStatus();
+  }
+  if (seq_used != nullptr) {
+    *seq_used = w.sequence;
+  }
+  return status;
+}
+
+void DBImpl::WriteStatusCheckOnLocked(const Status& status) {
+  // Is setting bg_error_ enough here?  This will at least stop
+  // compaction and fail any further writes.
+  InstrumentedMutexLock l(&mutex_);
+  assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+  if (immutable_db_options_.paranoid_checks && !status.ok() &&
+      !status.IsBusy() && !status.IsIncomplete()) {
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+  }
+}
+
+void DBImpl::WriteStatusCheck(const Status& status) {
+  // Is setting bg_error_ enough here?  This will at least stop
+  // compaction and fail any further writes.
+  assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+  if (immutable_db_options_.paranoid_checks && !status.ok() &&
+      !status.IsBusy() && !status.IsIncomplete()) {
+    mutex_.Lock();
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+    mutex_.Unlock();
+  }
+}
+
+void DBImpl::IOStatusCheck(const IOStatus& io_status) {
+  // Is setting bg_error_ enough here?  This will at least stop
+  // compaction and fail any further writes.
+  if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
+       !io_status.IsBusy() && !io_status.IsIncomplete()) ||
+      io_status.IsIOFenced()) {
+    mutex_.Lock();
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
+    mutex_.Unlock();
+  } else {
+    // Force writable file to be continue writable.
+    logs_.back().writer->file()->reset_seen_error();
+  }
+}
+
+void DBImpl::MemTableInsertStatusCheck(const Status& status) {
+  // A non-OK status here indicates that the state implied by the
+  // WAL has diverged from the in-memory state.  This could be
+  // because of a corrupt write_batch (very bad), or because the
+  // client specified an invalid column family and didn't specify
+  // ignore_missing_column_families.
+  if (!status.ok()) {
+    mutex_.Lock();
+    assert(!error_handler_.IsBGWorkStopped());
+    // Maybe change the return status to void?
+    error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable)
+        .PermitUncheckedError();
+    mutex_.Unlock();
+  }
+}
+
+Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
+                               LogContext* log_context,
+                               WriteContext* write_context) {
+  assert(write_context != nullptr && log_context != nullptr);
+  Status status;
+
+  if (error_handler_.IsDBStopped()) {
+    InstrumentedMutexLock l(&mutex_);
+    status = error_handler_.GetBGError();
+  }
+
+  PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
+
+  if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
+    assert(versions_);
+    InstrumentedMutexLock l(&mutex_);
+    const ColumnFamilySet* const column_families =
+        versions_->GetColumnFamilySet();
+    assert(column_families);
+    size_t num_cfs = column_families->NumberOfColumnFamilies();
+    assert(num_cfs >= 1);
+    if (num_cfs > 1) {
+      WaitForPendingWrites();
+      status = SwitchWAL(write_context);
+    }
+  }
+
+  if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
+    // Before a new memtable is added in SwitchMemtable(),
+    // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+    // thread is writing to another DB with the same write buffer, they may also
+    // be flushed. We may end up with flushing much more DBs than needed. It's
+    // suboptimal but still correct.
+    InstrumentedMutexLock l(&mutex_);
+    WaitForPendingWrites();
+    status = HandleWriteBufferManagerFlush(write_context);
+  }
+
+  if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
+    InstrumentedMutexLock l(&mutex_);
+    status = TrimMemtableHistory(write_context);
+  }
+
+  if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+    InstrumentedMutexLock l(&mutex_);
+    WaitForPendingWrites();
+    status = ScheduleFlushes(write_context);
+  }
+
+  PERF_TIMER_STOP(write_scheduling_flushes_compactions_time);
+  PERF_TIMER_GUARD(write_pre_and_post_process_time);
+
+  if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+                               write_controller_.NeedsDelay()))) {
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+    PERF_TIMER_GUARD(write_delay_time);
+    // We don't know size of curent batch so that we always use the size
+    // for previous one. It might create a fairness issue that expiration
+    // might happen for smaller writes but larger writes can go through.
+    // Can optimize it if it is an issue.
+    InstrumentedMutexLock l(&mutex_);
+    status = DelayWrite(last_batch_group_size_, write_options);
+    PERF_TIMER_START(write_pre_and_post_process_time);
+  }
+
+  // If memory usage exceeded beyond a certain threshold,
+  // write_buffer_manager_->ShouldStall() returns true to all threads writing to
+  // all DBs and writers will be stalled.
+  // It does soft checking because WriteBufferManager::buffer_limit_ has already
+  // exceeded at this point so no new write (including current one) will go
+  // through until memory usage is decreased.
+  if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) {
+    if (write_options.no_slowdown) {
+      status = Status::Incomplete("Write stall");
+    } else {
+      InstrumentedMutexLock l(&mutex_);
+      WriteBufferManagerStallWrites();
+    }
+  }
+  InstrumentedMutexLock l(&log_write_mutex_);
+  if (status.ok() && log_context->need_log_sync) {
+    // Wait until the parallel syncs are finished. Any sync process has to sync
+    // the front log too so it is enough to check the status of front()
+    // We do a while loop since log_sync_cv_ is signalled when any sync is
+    // finished
+    // Note: there does not seem to be a reason to wait for parallel sync at
+    // this early step but it is not important since parallel sync (SyncWAL) and
+    // need_log_sync are usually not used together.
+    while (logs_.front().IsSyncing()) {
+      log_sync_cv_.Wait();
+    }
+    for (auto& log : logs_) {
+      // This is just to prevent the logs to be synced by a parallel SyncWAL
+      // call. We will do the actual syncing later after we will write to the
+      // WAL.
+      // Note: there does not seem to be a reason to set this early before we
+      // actually write to the WAL
+      log.PrepareForSync();
+    }
+  } else {
+    log_context->need_log_sync = false;
+  }
+  log_context->writer = logs_.back().writer;
+  log_context->need_log_dir_sync =
+      log_context->need_log_dir_sync && !log_dir_synced_;
+  log_context->log_file_number_size = std::addressof(alive_log_files_.back());
+
+  return status;
+}
+
+Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
+                          WriteBatch* tmp_batch, WriteBatch** merged_batch,
+                          size_t* write_with_wal,
+                          WriteBatch** to_be_cached_state) {
+  assert(write_with_wal != nullptr);
+  assert(tmp_batch != nullptr);
+  assert(*to_be_cached_state == nullptr);
+  *write_with_wal = 0;
+  auto* leader = write_group.leader;
+  assert(!leader->disable_wal);  // Same holds for all in the batch group
+  if (write_group.size == 1 && !leader->CallbackFailed() &&
+      leader->batch->GetWalTerminationPoint().is_cleared()) {
+    // we simply write the first WriteBatch to WAL if the group only
+    // contains one batch, that batch should be written to the WAL,
+    // and the batch is not wanting to be truncated
+    *merged_batch = leader->batch;
+    if (WriteBatchInternal::IsLatestPersistentState(*merged_batch)) {
+      *to_be_cached_state = *merged_batch;
+    }
+    *write_with_wal = 1;
+  } else {
+    // WAL needs all of the batches flattened into a single batch.
+    // We could avoid copying here with an iov-like AddRecord
+    // interface
+    *merged_batch = tmp_batch;
+    for (auto writer : write_group) {
+      if (!writer->CallbackFailed()) {
+        Status s = WriteBatchInternal::Append(*merged_batch, writer->batch,
+                                              /*WAL_only*/ true);
+        if (!s.ok()) {
+          tmp_batch->Clear();
+          return s;
+        }
+        if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) {
+          // We only need to cache the last of such write batch
+          *to_be_cached_state = writer->batch;
+        }
+        (*write_with_wal)++;
+      }
+    }
+  }
+  // return merged_batch;
+  return Status::OK();
+}
+
+// When two_write_queues_ is disabled, this function is called from the only
+// write thread. Otherwise this must be called holding log_write_mutex_.
+IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
+                            log::Writer* log_writer, uint64_t* log_used,
+                            uint64_t* log_size,
+                            Env::IOPriority rate_limiter_priority,
+                            LogFileNumberSize& log_file_number_size) {
+  assert(log_size != nullptr);
+
+  Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
+  TEST_SYNC_POINT_CALLBACK("DBImpl::WriteToWAL:log_entry", &log_entry);
+  auto s = merged_batch.VerifyChecksum();
+  if (!s.ok()) {
+    return status_to_io_status(std::move(s));
+  }
+  *log_size = log_entry.size();
+  // When two_write_queues_ WriteToWAL has to be protected from concurretn calls
+  // from the two queues anyway and log_write_mutex_ is already held. Otherwise
+  // if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
+  // from possible concurrent calls via the FlushWAL by the application.
+  const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+  // Due to performance cocerns of missed branch prediction penalize the new
+  // manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
+  // when we do not need any locking.
+  if (UNLIKELY(needs_locking)) {
+    log_write_mutex_.Lock();
+  }
+  IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority);
+
+  if (UNLIKELY(needs_locking)) {
+    log_write_mutex_.Unlock();
+  }
+  if (log_used != nullptr) {
+    *log_used = logfile_number_;
+  }
+  total_log_size_ += log_entry.size();
+  log_file_number_size.AddSize(*log_size);
+  log_empty_ = false;
+  return io_s;
+}
+
+IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
+                            log::Writer* log_writer, uint64_t* log_used,
+                            bool need_log_sync, bool need_log_dir_sync,
+                            SequenceNumber sequence,
+                            LogFileNumberSize& log_file_number_size) {
+  IOStatus io_s;
+  assert(!two_write_queues_);
+  assert(!write_group.leader->disable_wal);
+  // Same holds for all in the batch group
+  size_t write_with_wal = 0;
+  WriteBatch* to_be_cached_state = nullptr;
+  WriteBatch* merged_batch;
+  io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch_, &merged_batch,
+                                        &write_with_wal, &to_be_cached_state));
+  if (UNLIKELY(!io_s.ok())) {
+    return io_s;
+  }
+
+  if (merged_batch == write_group.leader->batch) {
+    write_group.leader->log_used = logfile_number_;
+  } else if (write_with_wal > 1) {
+    for (auto writer : write_group) {
+      writer->log_used = logfile_number_;
+    }
+  }
+
+  WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+  uint64_t log_size;
+  io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+                    write_group.leader->rate_limiter_priority,
+                    log_file_number_size);
+  if (to_be_cached_state) {
+    cached_recoverable_state_ = *to_be_cached_state;
+    cached_recoverable_state_empty_ = false;
+  }
+
+  if (io_s.ok() && need_log_sync) {
+    StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
+    // It's safe to access logs_ with unlocked mutex_ here because:
+    //  - we've set getting_synced=true for all logs,
+    //    so other threads won't pop from logs_ while we're here,
+    //  - only writer thread can push to logs_, and we're in
+    //    writer thread, so no one will push to logs_,
+    //  - as long as other threads don't modify it, it's safe to read
+    //    from std::deque from multiple threads concurrently.
+    //
+    // Sync operation should work with locked log_write_mutex_, because:
+    //   when DBOptions.manual_wal_flush_ is set,
+    //   FlushWAL function will be invoked by another thread.
+    //   if without locked log_write_mutex_, the log file may get data
+    //   corruption
+
+    const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+    if (UNLIKELY(needs_locking)) {
+      log_write_mutex_.Lock();
+    }
+
+    for (auto& log : logs_) {
+      io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+      if (!io_s.ok()) {
+        break;
+      }
+    }
+
+    if (UNLIKELY(needs_locking)) {
+      log_write_mutex_.Unlock();
+    }
+
+    if (io_s.ok() && need_log_dir_sync) {
+      // We only sync WAL directory the first time WAL syncing is
+      // requested, so that in case users never turn on WAL sync,
+      // we can avoid the disk I/O in the write code path.
+      io_s = directories_.GetWalDir()->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+  }
+
+  if (merged_batch == &tmp_batch_) {
+    tmp_batch_.Clear();
+  }
+  if (io_s.ok()) {
+    auto stats = default_cf_internal_stats_;
+    if (need_log_sync) {
+      stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
+      RecordTick(stats_, WAL_FILE_SYNCED);
+    }
+    stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size);
+    RecordTick(stats_, WAL_FILE_BYTES, log_size);
+    stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal);
+    RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+  }
+  return io_s;
+}
+
+IOStatus DBImpl::ConcurrentWriteToWAL(
+    const WriteThread::WriteGroup& write_group, uint64_t* log_used,
+    SequenceNumber* last_sequence, size_t seq_inc) {
+  IOStatus io_s;
+
+  assert(two_write_queues_ || immutable_db_options_.unordered_write);
+  assert(!write_group.leader->disable_wal);
+  // Same holds for all in the batch group
+  WriteBatch tmp_batch;
+  size_t write_with_wal = 0;
+  WriteBatch* to_be_cached_state = nullptr;
+  WriteBatch* merged_batch;
+  io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch, &merged_batch,
+                                        &write_with_wal, &to_be_cached_state));
+  if (UNLIKELY(!io_s.ok())) {
+    return io_s;
+  }
+
+  // We need to lock log_write_mutex_ since logs_ and alive_log_files might be
+  // pushed back concurrently
+  log_write_mutex_.Lock();
+  if (merged_batch == write_group.leader->batch) {
+    write_group.leader->log_used = logfile_number_;
+  } else if (write_with_wal > 1) {
+    for (auto writer : write_group) {
+      writer->log_used = logfile_number_;
+    }
+  }
+  *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+  auto sequence = *last_sequence + 1;
+  WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+  log::Writer* log_writer = logs_.back().writer;
+  LogFileNumberSize& log_file_number_size = alive_log_files_.back();
+
+  assert(log_writer->get_log_number() == log_file_number_size.number);
+
+  uint64_t log_size;
+  io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+                    write_group.leader->rate_limiter_priority,
+                    log_file_number_size);
+  if (to_be_cached_state) {
+    cached_recoverable_state_ = *to_be_cached_state;
+    cached_recoverable_state_empty_ = false;
+  }
+  log_write_mutex_.Unlock();
+
+  if (io_s.ok()) {
+    const bool concurrent = true;
+    auto stats = default_cf_internal_stats_;
+    stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size,
+                      concurrent);
+    RecordTick(stats_, WAL_FILE_BYTES, log_size);
+    stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal,
+                      concurrent);
+    RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+  }
+  return io_s;
+}
+
+Status DBImpl::WriteRecoverableState() {
+  mutex_.AssertHeld();
+  if (!cached_recoverable_state_empty_) {
+    bool dont_care_bool;
+    SequenceNumber next_seq;
+    if (two_write_queues_) {
+      log_write_mutex_.Lock();
+    }
+    SequenceNumber seq;
+    if (two_write_queues_) {
+      seq = versions_->FetchAddLastAllocatedSequence(0);
+    } else {
+      seq = versions_->LastSequence();
+    }
+    WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1);
+    auto status = WriteBatchInternal::InsertInto(
+        &cached_recoverable_state_, column_family_memtables_.get(),
+        &flush_scheduler_, &trim_history_scheduler_, true,
+        0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
+        &next_seq, &dont_care_bool, seq_per_batch_);
+    auto last_seq = next_seq - 1;
+    if (two_write_queues_) {
+      versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+      versions_->SetLastPublishedSequence(last_seq);
+    }
+    versions_->SetLastSequence(last_seq);
+    if (two_write_queues_) {
+      log_write_mutex_.Unlock();
+    }
+    if (status.ok() && recoverable_state_pre_release_callback_) {
+      const bool DISABLE_MEMTABLE = true;
+      for (uint64_t sub_batch_seq = seq + 1;
+           sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
+        uint64_t const no_log_num = 0;
+        // Unlock it since the callback might end up locking mutex. e.g.,
+        // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+        mutex_.Unlock();
+        status = recoverable_state_pre_release_callback_->Callback(
+            sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1);
+        mutex_.Lock();
+      }
+    }
+    if (status.ok()) {
+      cached_recoverable_state_.Clear();
+      cached_recoverable_state_empty_ = true;
+    }
+    return status;
+  }
+  return Status::OK();
+}
+
+void DBImpl::SelectColumnFamiliesForAtomicFlush(
+    autovector<ColumnFamilyData*>* cfds) {
+  for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+        !cached_recoverable_state_empty_.load()) {
+      cfds->push_back(cfd);
+    }
+  }
+}
+
+// Assign sequence number for atomic flush.
+void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
+  assert(immutable_db_options_.atomic_flush);
+  auto seq = versions_->LastSequence();
+  for (auto cfd : cfds) {
+    cfd->imm()->AssignAtomicFlushSeq(seq);
+  }
+}
+
+Status DBImpl::SwitchWAL(WriteContext* write_context) {
+  mutex_.AssertHeld();
+  assert(write_context != nullptr);
+  Status status;
+
+  if (alive_log_files_.begin()->getting_flushed) {
+    return status;
+  }
+
+  auto oldest_alive_log = alive_log_files_.begin()->number;
+  bool flush_wont_release_oldest_log = false;
+  if (allow_2pc()) {
+    auto oldest_log_with_uncommitted_prep =
+        logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+
+    assert(oldest_log_with_uncommitted_prep == 0 ||
+           oldest_log_with_uncommitted_prep >= oldest_alive_log);
+    if (oldest_log_with_uncommitted_prep > 0 &&
+        oldest_log_with_uncommitted_prep == oldest_alive_log) {
+      if (unable_to_release_oldest_log_) {
+        // we already attempted to flush all column families dependent on
+        // the oldest alive log but the log still contained uncommitted
+        // transactions so there is still nothing that we can do.
+        return status;
+      } else {
+        ROCKS_LOG_WARN(
+            immutable_db_options_.info_log,
+            "Unable to release oldest log due to uncommitted transaction");
+        unable_to_release_oldest_log_ = true;
+        flush_wont_release_oldest_log = true;
+      }
+    }
+  }
+  if (!flush_wont_release_oldest_log) {
+    // we only mark this log as getting flushed if we have successfully
+    // flushed all data in this log. If this log contains outstanding prepared
+    // transactions then we cannot flush this log until those transactions are
+    // commited.
+    unable_to_release_oldest_log_ = false;
+    alive_log_files_.begin()->getting_flushed = true;
+  }
+
+  ROCKS_LOG_INFO(
+      immutable_db_options_.info_log,
+      "Flushing all column families with data in WAL number %" PRIu64
+      ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
+      oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+  // no need to refcount because drop is happening in write thread, so can't
+  // happen while we're in the write thread
+  autovector<ColumnFamilyData*> cfds;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+  } else {
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->OldestLogToKeep() <= oldest_alive_log) {
+        cfds.push_back(cfd);
+      }
+    }
+    MaybeFlushStatsCF(&cfds);
+  }
+  WriteThread::Writer nonmem_w;
+  if (two_write_queues_) {
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+  }
+
+  for (const auto cfd : cfds) {
+    cfd->Ref();
+    status = SwitchMemtable(cfd, write_context);
+    cfd->UnrefAndTryDelete();
+    if (!status.ok()) {
+      break;
+    }
+  }
+  if (two_write_queues_) {
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  }
+
+  if (status.ok()) {
+    if (immutable_db_options_.atomic_flush) {
+      AssignAtomicFlushSeq(cfds);
+    }
+    for (auto cfd : cfds) {
+      cfd->imm()->FlushRequested();
+      if (!immutable_db_options_.atomic_flush) {
+        FlushRequest flush_req;
+        GenerateFlushRequest({cfd}, &flush_req);
+        SchedulePendingFlush(flush_req, FlushReason::kWalFull);
+      }
+    }
+    if (immutable_db_options_.atomic_flush) {
+      FlushRequest flush_req;
+      GenerateFlushRequest(cfds, &flush_req);
+      SchedulePendingFlush(flush_req, FlushReason::kWalFull);
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+  return status;
+}
+
+Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
+  mutex_.AssertHeld();
+  assert(write_context != nullptr);
+  Status status;
+
+  // Before a new memtable is added in SwitchMemtable(),
+  // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+  // thread is writing to another DB with the same write buffer, they may also
+  // be flushed. We may end up with flushing much more DBs than needed. It's
+  // suboptimal but still correct.
+  // no need to refcount because drop is happening in write thread, so can't
+  // happen while we're in the write thread
+  autovector<ColumnFamilyData*> cfds;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+  } else {
+    ColumnFamilyData* cfd_picked = nullptr;
+    SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber;
+
+    for (auto cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (!cfd->mem()->IsEmpty() && !cfd->imm()->IsFlushPendingOrRunning()) {
+        // We only consider flush on CFs with bytes in the mutable memtable,
+        // and no immutable memtables for which flush has yet to finish. If
+        // we triggered flush on CFs already trying to flush, we would risk
+        // creating too many immutable memtables leading to write stalls.
+        uint64_t seq = cfd->mem()->GetCreationSeq();
+        if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) {
+          cfd_picked = cfd;
+          seq_num_for_cf_picked = seq;
+        }
+      }
+    }
+    if (cfd_picked != nullptr) {
+      cfds.push_back(cfd_picked);
+    }
+    MaybeFlushStatsCF(&cfds);
+  }
+  if (!cfds.empty()) {
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log,
+        "Flushing triggered to alleviate write buffer memory usage. Write "
+        "buffer is using %" ROCKSDB_PRIszt
+        " bytes out of a total of %" ROCKSDB_PRIszt ".",
+        write_buffer_manager_->memory_usage(),
+        write_buffer_manager_->buffer_size());
+  }
+
+  WriteThread::Writer nonmem_w;
+  if (two_write_queues_) {
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+  }
+  for (const auto cfd : cfds) {
+    if (cfd->mem()->IsEmpty()) {
+      continue;
+    }
+    cfd->Ref();
+    status = SwitchMemtable(cfd, write_context);
+    cfd->UnrefAndTryDelete();
+    if (!status.ok()) {
+      break;
+    }
+  }
+  if (two_write_queues_) {
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  }
+
+  if (status.ok()) {
+    if (immutable_db_options_.atomic_flush) {
+      AssignAtomicFlushSeq(cfds);
+    }
+    for (const auto cfd : cfds) {
+      cfd->imm()->FlushRequested();
+      if (!immutable_db_options_.atomic_flush) {
+        FlushRequest flush_req;
+        GenerateFlushRequest({cfd}, &flush_req);
+        SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
+      }
+    }
+    if (immutable_db_options_.atomic_flush) {
+      FlushRequest flush_req;
+      GenerateFlushRequest(cfds, &flush_req);
+      SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+  return status;
+}
+
+uint64_t DBImpl::GetMaxTotalWalSize() const {
+  uint64_t max_total_wal_size =
+      max_total_wal_size_.load(std::memory_order_acquire);
+  if (max_total_wal_size > 0) {
+    return max_total_wal_size;
+  }
+  return 4 * max_total_in_memory_state_.load(std::memory_order_acquire);
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::DelayWrite(uint64_t num_bytes,
+                          const WriteOptions& write_options) {
+  uint64_t time_delayed = 0;
+  bool delayed = false;
+  {
+    StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL,
+                 &time_delayed);
+    uint64_t delay =
+        write_controller_.GetDelay(immutable_db_options_.clock, num_bytes);
+    TEST_SYNC_POINT("DBImpl::DelayWrite:Start");
+    if (delay > 0) {
+      if (write_options.no_slowdown) {
+        return Status::Incomplete("Write stall");
+      }
+      TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
+
+      // Notify write_thread_ about the stall so it can setup a barrier and
+      // fail any pending writers with no_slowdown
+      write_thread_.BeginWriteStall();
+      mutex_.Unlock();
+      TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone");
+      // We will delay the write until we have slept for `delay` microseconds
+      // or we don't need a delay anymore. We check for cancellation every 1ms
+      // (slightly longer because WriteController minimum delay is 1ms, in
+      // case of sleep imprecision, rounding, etc.)
+      const uint64_t kDelayInterval = 1001;
+      uint64_t stall_end = sw.start_time() + delay;
+      while (write_controller_.NeedsDelay()) {
+        if (immutable_db_options_.clock->NowMicros() >= stall_end) {
+          // We already delayed this write `delay` microseconds
+          break;
+        }
+
+        delayed = true;
+        // Sleep for 0.001 seconds
+        immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval);
+      }
+      mutex_.Lock();
+      write_thread_.EndWriteStall();
+    }
+
+    // Don't wait if there's a background error, even if its a soft error. We
+    // might wait here indefinitely as the background compaction may never
+    // finish successfully, resulting in the stall condition lasting
+    // indefinitely
+    while (error_handler_.GetBGError().ok() && write_controller_.IsStopped() &&
+           !shutting_down_.load(std::memory_order_relaxed)) {
+      if (write_options.no_slowdown) {
+        return Status::Incomplete("Write stall");
+      }
+      delayed = true;
+
+      // Notify write_thread_ about the stall so it can setup a barrier and
+      // fail any pending writers with no_slowdown
+      write_thread_.BeginWriteStall();
+      TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+      bg_cv_.Wait();
+      write_thread_.EndWriteStall();
+    }
+  }
+  assert(!delayed || !write_options.no_slowdown);
+  if (delayed) {
+    default_cf_internal_stats_->AddDBStats(
+        InternalStats::kIntStatsWriteStallMicros, time_delayed);
+    RecordTick(stats_, STALL_MICROS, time_delayed);
+  }
+
+  // If DB is not in read-only mode and write_controller is not stopping
+  // writes, we can ignore any background errors and allow the write to
+  // proceed
+  Status s;
+  if (write_controller_.IsStopped()) {
+    if (!shutting_down_.load(std::memory_order_relaxed)) {
+      // If writes are still stopped and db not shutdown, it means we bailed
+      // due to a background error
+      s = Status::Incomplete(error_handler_.GetBGError().ToString());
+    } else {
+      s = Status::ShutdownInProgress("stalled writes");
+    }
+  }
+  if (error_handler_.IsDBStopped()) {
+    s = error_handler_.GetBGError();
+  }
+  return s;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+void DBImpl::WriteBufferManagerStallWrites() {
+  mutex_.AssertHeld();
+  // First block future writer threads who want to add themselves to the queue
+  // of WriteThread.
+  write_thread_.BeginWriteStall();
+  mutex_.Unlock();
+
+  // Change the state to State::Blocked.
+  static_cast<WBMStallInterface*>(wbm_stall_.get())
+      ->SetState(WBMStallInterface::State::BLOCKED);
+  // Then WriteBufferManager will add DB instance to its queue
+  // and block this thread by calling WBMStallInterface::Block().
+  write_buffer_manager_->BeginWriteStall(wbm_stall_.get());
+  wbm_stall_->Block();
+
+  mutex_.Lock();
+  // Stall has ended. Signal writer threads so that they can add
+  // themselves to the WriteThread queue for writes.
+  write_thread_.EndWriteStall();
+}
+
+Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+                                            WriteBatch* my_batch) {
+  assert(write_options.low_pri);
+  // This is called outside the DB mutex. Although it is safe to make the call,
+  // the consistency condition is not guaranteed to hold. It's OK to live with
+  // it in this case.
+  // If we need to speed compaction, it means the compaction is left behind
+  // and we start to limit low pri writes to a limit.
+  if (write_controller_.NeedSpeedupCompaction()) {
+    if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) {
+      // For 2PC, we only rate limit prepare, not commit.
+      return Status::OK();
+    }
+    if (write_options.no_slowdown) {
+      return Status::Incomplete("Low priority write stall");
+    } else {
+      assert(my_batch != nullptr);
+      // Rate limit those writes. The reason that we don't completely wait
+      // is that in case the write is heavy, low pri writes may never have
+      // a chance to run. Now we guarantee we are still slowly making
+      // progress.
+      PERF_TIMER_GUARD(write_delay_time);
+      write_controller_.low_pri_rate_limiter()->Request(
+          my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */,
+          RateLimiter::OpType::kWrite);
+    }
+  }
+  return Status::OK();
+}
+
+void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) {
+  assert(cfds != nullptr);
+  if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) {
+    ColumnFamilyData* cfd_stats =
+        versions_->GetColumnFamilySet()->GetColumnFamily(
+            kPersistentStatsColumnFamilyName);
+    if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) {
+      for (ColumnFamilyData* cfd : *cfds) {
+        if (cfd == cfd_stats) {
+          // stats CF already included in cfds
+          return;
+        }
+      }
+      // force flush stats CF when its log number is less than all other CF's
+      // log numbers
+      bool force_flush_stats_cf = true;
+      for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+        if (loop_cfd == cfd_stats) {
+          continue;
+        }
+        if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+          force_flush_stats_cf = false;
+        }
+      }
+      if (force_flush_stats_cf) {
+        cfds->push_back(cfd_stats);
+        ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                       "Force flushing stats CF with automated flush "
+                       "to avoid holding old logs");
+      }
+    }
+  }
+}
+
+Status DBImpl::TrimMemtableHistory(WriteContext* context) {
+  autovector<ColumnFamilyData*> cfds;
+  ColumnFamilyData* tmp_cfd;
+  while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) !=
+         nullptr) {
+    cfds.push_back(tmp_cfd);
+  }
+  for (auto& cfd : cfds) {
+    autovector<MemTable*> to_delete;
+    bool trimmed = cfd->imm()->TrimHistory(&context->memtables_to_free_,
+                                           cfd->mem()->MemoryAllocatedBytes());
+    if (trimmed) {
+      context->superversion_context.NewSuperVersion();
+      assert(context->superversion_context.new_superversion.get() != nullptr);
+      cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
+    }
+
+    if (cfd->UnrefAndTryDelete()) {
+      cfd = nullptr;
+    }
+  }
+  return Status::OK();
+}
+
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+  autovector<ColumnFamilyData*> cfds;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds);
+    for (auto cfd : cfds) {
+      cfd->Ref();
+    }
+    flush_scheduler_.Clear();
+  } else {
+    ColumnFamilyData* tmp_cfd;
+    while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+      cfds.push_back(tmp_cfd);
+    }
+    MaybeFlushStatsCF(&cfds);
+  }
+  Status status;
+  WriteThread::Writer nonmem_w;
+  if (two_write_queues_) {
+    nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+  }
+
+  for (auto& cfd : cfds) {
+    if (!cfd->mem()->IsEmpty()) {
+      status = SwitchMemtable(cfd, context);
+    }
+    if (cfd->UnrefAndTryDelete()) {
+      cfd = nullptr;
+    }
+    if (!status.ok()) {
+      break;
+    }
+  }
+
+  if (two_write_queues_) {
+    nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+  }
+
+  if (status.ok()) {
+    if (immutable_db_options_.atomic_flush) {
+      AssignAtomicFlushSeq(cfds);
+      FlushRequest flush_req;
+      GenerateFlushRequest(cfds, &flush_req);
+      SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+    } else {
+      for (auto* cfd : cfds) {
+        FlushRequest flush_req;
+        GenerateFlushRequest({cfd}, &flush_req);
+        SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+      }
+    }
+    MaybeScheduleFlushOrCompaction();
+  }
+  return status;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
+                                    const MemTableInfo& mem_table_info) {
+  if (immutable_db_options_.listeners.size() == 0U) {
+    return;
+  }
+  if (shutting_down_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  mutex_.Unlock();
+  for (auto listener : immutable_db_options_.listeners) {
+    listener->OnMemTableSealed(mem_table_info);
+  }
+  mutex_.Lock();
+}
+#endif  // ROCKSDB_LITE
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+// REQUIRES: this thread is currently at the front of the 2nd writer queue if
+// two_write_queues_ is true (This is to simplify the reasoning.)
+Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
+  mutex_.AssertHeld();
+  log::Writer* new_log = nullptr;
+  MemTable* new_mem = nullptr;
+  IOStatus io_s;
+
+  // Recoverable state is persisted in WAL. After memtable switch, WAL might
+  // be deleted, so we write the state to memtable to be persisted as well.
+  Status s = WriteRecoverableState();
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Attempt to switch to a new memtable and trigger flush of old.
+  // Do this without holding the dbmutex lock.
+  assert(versions_->prev_log_number() == 0);
+  if (two_write_queues_) {
+    log_write_mutex_.Lock();
+  }
+  bool creating_new_log = !log_empty_;
+  if (two_write_queues_) {
+    log_write_mutex_.Unlock();
+  }
+  uint64_t recycle_log_number = 0;
+  if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
+      !log_recycle_files_.empty()) {
+    recycle_log_number = log_recycle_files_.front();
+  }
+  uint64_t new_log_number =
+      creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+
+  // Set memtable_info for memtable sealed callback
+#ifndef ROCKSDB_LITE
+  MemTableInfo memtable_info;
+  memtable_info.cf_name = cfd->GetName();
+  memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
+  memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
+  memtable_info.num_entries = cfd->mem()->num_entries();
+  memtable_info.num_deletes = cfd->mem()->num_deletes();
+#endif  // ROCKSDB_LITE
+  // Log this later after lock release. It may be outdated, e.g., if background
+  // flush happens before logging, but that should be ok.
+  int num_imm_unflushed = cfd->imm()->NumNotFlushed();
+  const auto preallocate_block_size =
+      GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
+  mutex_.Unlock();
+  if (creating_new_log) {
+    // TODO: Write buffer size passed in should be max of all CF's instead
+    // of mutable_cf_options.write_buffer_size.
+    io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size,
+                     &new_log);
+    if (s.ok()) {
+      s = io_s;
+    }
+  }
+  if (s.ok()) {
+    SequenceNumber seq = versions_->LastSequence();
+    new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
+    context->superversion_context.NewSuperVersion();
+  }
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "[%s] New memtable created with log file: #%" PRIu64
+                 ". Immutable memtables: %d.\n",
+                 cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
+  // There should be no concurrent write as the thread is at the front of
+  // writer queue
+  cfd->mem()->ConstructFragmentedRangeTombstones();
+
+  mutex_.Lock();
+  if (recycle_log_number != 0) {
+    // Since renaming the file is done outside DB mutex, we need to ensure
+    // concurrent full purges don't delete the file while we're recycling it.
+    // To achieve that we hold the old log number in the recyclable list until
+    // after it has been renamed.
+    assert(log_recycle_files_.front() == recycle_log_number);
+    log_recycle_files_.pop_front();
+  }
+  if (s.ok() && creating_new_log) {
+    InstrumentedMutexLock l(&log_write_mutex_);
+    assert(new_log != nullptr);
+    if (!logs_.empty()) {
+      // Alway flush the buffer of the last log before switching to a new one
+      log::Writer* cur_log_writer = logs_.back().writer;
+      if (error_handler_.IsRecoveryInProgress()) {
+        // In recovery path, we force another try of writing WAL buffer.
+        cur_log_writer->file()->reset_seen_error();
+      }
+      io_s = cur_log_writer->WriteBuffer();
+      if (s.ok()) {
+        s = io_s;
+      }
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                       "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
+                       "  WAL file\n",
+                       cfd->GetName().c_str(), cur_log_writer->get_log_number(),
+                       new_log_number);
+      }
+    }
+    if (s.ok()) {
+      logfile_number_ = new_log_number;
+      log_empty_ = true;
+      log_dir_synced_ = false;
+      logs_.emplace_back(logfile_number_, new_log);
+      alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+    }
+  }
+
+  if (!s.ok()) {
+    // how do we fail if we're not creating new log?
+    assert(creating_new_log);
+    delete new_mem;
+    delete new_log;
+    context->superversion_context.new_superversion.reset();
+    // We may have lost data from the WritableFileBuffer in-memory buffer for
+    // the current log, so treat it as a fatal error and set bg_error
+    if (!io_s.ok()) {
+      error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable);
+    } else {
+      error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
+    }
+    // Read back bg_error in order to get the right severity
+    s = error_handler_.GetBGError();
+    return s;
+  }
+
+  bool empty_cf_updated = false;
+  if (immutable_db_options_.track_and_verify_wals_in_manifest &&
+      !immutable_db_options_.allow_2pc && creating_new_log) {
+    // In non-2pc mode, WALs become obsolete if they do not contain unflushed
+    // data. Updating the empty CF's log number might cause some WALs to become
+    // obsolete. So we should track the WAL obsoletion event before actually
+    // updating the empty CF's log number.
+    uint64_t min_wal_number_to_keep =
+        versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
+    if (min_wal_number_to_keep >
+        versions_->GetWalSet().GetMinWalNumberToKeep()) {
+      // Get a snapshot of the empty column families.
+      // LogAndApply may release and reacquire db
+      // mutex, during that period, column family may become empty (e.g. its
+      // flush succeeds), then it affects the computed min_log_number_to_keep,
+      // so we take a snapshot for consistency of column family data
+      // status. If a column family becomes non-empty afterwards, its active log
+      // should still be the created new log, so the min_log_number_to_keep is
+      // not affected.
+      autovector<ColumnFamilyData*> empty_cfs;
+      for (auto cf : *versions_->GetColumnFamilySet()) {
+        if (cf->IsEmpty()) {
+          empty_cfs.push_back(cf);
+        }
+      }
+
+      VersionEdit wal_deletion;
+      wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+      s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_,
+                                                      directories_.GetDbDir());
+      if (!s.ok() && versions_->io_status().IsIOError()) {
+        s = error_handler_.SetBGError(versions_->io_status(),
+                                      BackgroundErrorReason::kManifestWrite);
+      }
+      if (!s.ok()) {
+        return s;
+      }
+
+      for (auto cf : empty_cfs) {
+        if (cf->IsEmpty()) {
+          cf->SetLogNumber(logfile_number_);
+          // MEMPURGE: No need to change this, because new adds
+          // should still receive new sequence numbers.
+          cf->mem()->SetCreationSeq(versions_->LastSequence());
+        }  // cf may become non-empty.
+      }
+      empty_cf_updated = true;
+    }
+  }
+  if (!empty_cf_updated) {
+    for (auto cf : *versions_->GetColumnFamilySet()) {
+      // all this is just optimization to delete logs that
+      // are no longer needed -- if CF is empty, that means it
+      // doesn't need that particular log to stay alive, so we just
+      // advance the log number. no need to persist this in the manifest
+      if (cf->IsEmpty()) {
+        if (creating_new_log) {
+          cf->SetLogNumber(logfile_number_);
+        }
+        cf->mem()->SetCreationSeq(versions_->LastSequence());
+      }
+    }
+  }
+
+  cfd->mem()->SetNextLogNumber(logfile_number_);
+  assert(new_mem != nullptr);
+  cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+  new_mem->Ref();
+  cfd->SetMemtable(new_mem);
+  InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
+                                     mutable_cf_options);
+
+#ifndef ROCKSDB_LITE
+  // Notify client that memtable is sealed, now that we have successfully
+  // installed a new memtable
+  NotifyOnMemTableSealed(cfd, memtable_info);
+#endif  // ROCKSDB_LITE
+  // It is possible that we got here without checking the value of i_os, but
+  // that is okay.  If we did, it most likely means that s was already an error.
+  // In any case, ignore any unchecked error for i_os here.
+  io_s.PermitUncheckedError();
+  return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+  mutex_.AssertHeld();
+  size_t bsize =
+      static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
+  // Some users might set very high write_buffer_size and rely on
+  // max_total_wal_size or other parameters to control the WAL size.
+  if (mutable_db_options_.max_total_wal_size > 0) {
+    bsize = std::min<size_t>(
+        bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+  }
+  if (immutable_db_options_.db_write_buffer_size > 0) {
+    bsize = std::min<size_t>(bsize, immutable_db_options_.db_write_buffer_size);
+  }
+  if (immutable_db_options_.write_buffer_manager &&
+      immutable_db_options_.write_buffer_manager->enabled()) {
+    bsize = std::min<size_t>(
+        bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+  }
+
+  return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& value) {
+  // Pre-allocate size of write batch conservatively.
+  // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+  // and we allocate 11 extra bytes for key length, as well as value length.
+  WriteBatch batch(key.size() + value.size() + 24, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.Put(column_family, key, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& ts, const Slice& value) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.Put(column_family, key, ts, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::PutEntity(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const WideColumns& columns) {
+  const ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+  assert(default_cf);
+
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+
+  WriteBatch batch(/* reserved_bytes */ 0, /* max_bytes */ 0,
+                   options.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+
+  const Status s = batch.PutEntity(column_family, key, columns);
+  if (!s.ok()) {
+    return s;
+  }
+
+  return Write(options, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                  const Slice& key) {
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.Delete(column_family, key);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                  const Slice& key, const Slice& ts) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.Delete(column_family, key, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+                        ColumnFamilyHandle* column_family, const Slice& key) {
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.SingleDelete(column_family, key);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+                        ColumnFamilyHandle* column_family, const Slice& key,
+                        const Slice& ts) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.SingleDelete(column_family, key, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+                       ColumnFamilyHandle* column_family,
+                       const Slice& begin_key, const Slice& end_key) {
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.DeleteRange(column_family, begin_key, end_key);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+                       ColumnFamilyHandle* column_family,
+                       const Slice& begin_key, const Slice& end_key,
+                       const Slice& ts) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.DeleteRange(column_family, begin_key, end_key, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                 const Slice& key, const Slice& value) {
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  Status s = batch.Merge(column_family, key, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+                 const Slice& key, const Slice& ts, const Slice& value) {
+  ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+  assert(default_cf);
+  const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+  assert(default_cf_ucmp);
+  WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                   opt.protection_bytes_per_key,
+                   default_cf_ucmp->timestamp_size());
+  Status s = batch.Merge(column_family, key, ts, value);
+  if (!s.ok()) {
+    return s;
+  }
+  return Write(opt, &batch);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_info_dumper.cc b/src/rocksdb/db/db_info_dumper.cc
new file mode 100644
index 000000000..be8d5bee1
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.cc
@@ -0,0 +1,147 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_info_dumper.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+                       const std::string& dbname,
+                       const std::string& session_id) {
+  if (options.info_log == nullptr) {
+    return;
+  }
+
+  auto* env = options.env;
+  uint64_t number = 0;
+  FileType type = kInfoLogFile;
+
+  std::vector<std::string> files;
+  uint64_t file_num = 0;
+  uint64_t file_size;
+  std::string file_info, wal_info;
+
+  Header(options.info_log, "DB SUMMARY\n");
+  Header(options.info_log, "DB Session ID:  %s\n", session_id.c_str());
+
+  Status s;
+  // Get files in dbname dir
+  s = env->GetChildren(dbname, &files);
+  if (!s.ok()) {
+    Error(options.info_log, "Error when reading %s dir %s\n", dbname.c_str(),
+          s.ToString().c_str());
+  }
+  std::sort(files.begin(), files.end());
+  for (const std::string& file : files) {
+    if (!ParseFileName(file, &number, &type)) {
+      continue;
+    }
+    switch (type) {
+      case kCurrentFile:
+        Header(options.info_log, "CURRENT file:  %s\n", file.c_str());
+        break;
+      case kIdentityFile:
+        Header(options.info_log, "IDENTITY file:  %s\n", file.c_str());
+        break;
+      case kDescriptorFile:
+        s = env->GetFileSize(dbname + "/" + file, &file_size);
+        if (s.ok()) {
+          Header(options.info_log,
+                 "MANIFEST file:  %s size: %" PRIu64 " Bytes\n", file.c_str(),
+                 file_size);
+        } else {
+          Error(options.info_log,
+                "Error when reading MANIFEST file: %s/%s %s\n", dbname.c_str(),
+                file.c_str(), s.ToString().c_str());
+        }
+        break;
+      case kWalFile:
+        s = env->GetFileSize(dbname + "/" + file, &file_size);
+        if (s.ok()) {
+          wal_info.append(file)
+              .append(" size: ")
+              .append(std::to_string(file_size))
+              .append(" ; ");
+        } else {
+          Error(options.info_log, "Error when reading LOG file: %s/%s %s\n",
+                dbname.c_str(), file.c_str(), s.ToString().c_str());
+        }
+        break;
+      case kTableFile:
+        if (++file_num < 10) {
+          file_info.append(file).append(" ");
+        }
+        break;
+      default:
+        break;
+    }
+  }
+
+  // Get sst files in db_path dir
+  for (auto& db_path : options.db_paths) {
+    if (dbname.compare(db_path.path) != 0) {
+      s = env->GetChildren(db_path.path, &files);
+      if (!s.ok()) {
+        Error(options.info_log, "Error when reading %s dir %s\n",
+              db_path.path.c_str(), s.ToString().c_str());
+        continue;
+      }
+      std::sort(files.begin(), files.end());
+      for (const std::string& file : files) {
+        if (ParseFileName(file, &number, &type)) {
+          if (type == kTableFile && ++file_num < 10) {
+            file_info.append(file).append(" ");
+          }
+        }
+      }
+    }
+    Header(options.info_log,
+           "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
+           db_path.path.c_str(), file_num, file_info.c_str());
+    file_num = 0;
+    file_info.clear();
+  }
+
+  // Get wal file in wal_dir
+  const auto& wal_dir = options.GetWalDir(dbname);
+  if (!options.IsWalDirSameAsDBPath(dbname)) {
+    s = env->GetChildren(wal_dir, &files);
+    if (!s.ok()) {
+      Error(options.info_log, "Error when reading %s dir %s\n", wal_dir.c_str(),
+            s.ToString().c_str());
+      return;
+    }
+    wal_info.clear();
+    for (const std::string& file : files) {
+      if (ParseFileName(file, &number, &type)) {
+        if (type == kWalFile) {
+          s = env->GetFileSize(wal_dir + "/" + file, &file_size);
+          if (s.ok()) {
+            wal_info.append(file)
+                .append(" size: ")
+                .append(std::to_string(file_size))
+                .append(" ; ");
+          } else {
+            Error(options.info_log, "Error when reading LOG file %s/%s %s\n",
+                  wal_dir.c_str(), file.c_str(), s.ToString().c_str());
+          }
+        }
+      }
+    }
+  }
+  Header(options.info_log, "Write Ahead Log file in %s: %s\n", wal_dir.c_str(),
+         wal_info.c_str());
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_info_dumper.h b/src/rocksdb/db/db_info_dumper.h
new file mode 100644
index 000000000..f518e840f
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.h
@@ -0,0 +1,15 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "options/db_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+                       const std::string& dbname,
+                       const std::string& session_id = "");
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_inplace_update_test.cc b/src/rocksdb/db/db_inplace_update_test.cc
new file mode 100644
index 000000000..3921a3b00
--- /dev/null
+++ b/src/rocksdb/db/db_inplace_update_test.cc
@@ -0,0 +1,262 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestInPlaceUpdate : public DBTestBase {
+ public:
+  DBTestInPlaceUpdate()
+      : DBTestBase("db_inplace_update_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.allow_concurrent_memtable_write = false;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    for (int i = numValues; i > 0; i--) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.allow_concurrent_memtable_write = false;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of larger size
+    int numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+    }
+
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(numValues, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateEntitySmallerNewValue) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.allow_concurrent_memtable_write = false;
+
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller size
+    constexpr int num_values = 10;
+    for (int i = num_values; i > 0; --i) {
+      constexpr char key[] = "key";
+      const std::string value = DummyString(i, 'a');
+      WideColumns wide_columns{{"attr", value}};
+
+      ASSERT_OK(db_->PutEntity(WriteOptions(), handles_[1], key, wide_columns));
+      // TODO: use Get to check entity once it's supported
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateEntityLargerNewValue) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.allow_concurrent_memtable_write = false;
+
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of larger size
+    constexpr int num_values = 10;
+    for (int i = 0; i < num_values; ++i) {
+      constexpr char key[] = "key";
+      const std::string value = DummyString(i, 'a');
+      WideColumns wide_columns{{"attr", value}};
+
+      ASSERT_OK(db_->PutEntity(WriteOptions(), handles_[1], key, wide_columns));
+      // TODO: use Get to check entity once it's supported
+    }
+
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(num_values, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+        ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerSize;
+    options.allow_concurrent_memtable_write = false;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+        ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerVarintSize;
+    options.allow_concurrent_memtable_write = false;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller varint size
+    int numValues = 265;
+    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(1, 'b'), Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+        ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceLargerSize;
+    options.allow_concurrent_memtable_write = false;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of larger size
+    int numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i, 'c'), Get(1, "key"));
+    }
+
+    // No inplace updates. All updates are puts with new seq number
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(numValues, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+        ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceNoAction;
+    options.allow_concurrent_memtable_write = false;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Callback function requests no actions from db
+    ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
+    ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateAndSnapshot) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.allow_concurrent_memtable_write = false;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller size, and
+    // run GetSnapshot and ReleaseSnapshot
+    int numValues = 2;
+    for (int i = numValues; i > 0; i--) {
+      const Snapshot* s = db_->GetSnapshot();
+      ASSERT_EQ(nullptr, s);
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+      // release s (nullptr)
+      db_->ReleaseSnapshot(s);
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+  } while (ChangeCompactOptions());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_io_failure_test.cc b/src/rocksdb/db/db_io_failure_test.cc
new file mode 100644
index 000000000..2a405fd38
--- /dev/null
+++ b/src/rocksdb/db/db_io_failure_test.cc
@@ -0,0 +1,593 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIOFailureTest : public DBTestBase {
+ public:
+  DBIOFailureTest() : DBTestBase("db_io_failure_test", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+// Check that number of files does not grow when writes are dropped
+TEST_F(DBIOFailureTest, DropWrites) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.paranoid_checks = false;
+    Reopen(options);
+
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_EQ("v1", Get("foo"));
+    Compact("a", "z");
+    const size_t num_files = CountFiles();
+    // Force out-of-space errors
+    env_->drop_writes_.store(true, std::memory_order_release);
+    env_->sleep_counter_.Reset();
+    env_->SetMockSleep();
+    for (int i = 0; i < 5; i++) {
+      if (option_config_ != kUniversalCompactionMultiLevel &&
+          option_config_ != kUniversalSubcompactions) {
+        for (int level = 0; level < dbfull()->NumberLevels(); level++) {
+          if (level > 0 && level == dbfull()->NumberLevels() - 1) {
+            break;
+          }
+          Status s =
+              dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr,
+                                          true /* disallow trivial move */);
+          ASSERT_TRUE(s.ok() || s.IsCorruption());
+        }
+      } else {
+        Status s =
+            dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+        ASSERT_TRUE(s.ok() || s.IsCorruption());
+      }
+    }
+
+    std::string property_value;
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("5", property_value);
+
+    env_->drop_writes_.store(false, std::memory_order_release);
+    const size_t count = CountFiles();
+    ASSERT_LT(count, num_files + 3);
+
+    // Check that compaction attempts slept after errors
+    // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler
+    // versions
+    ASSERT_GE(env_->sleep_counter_.Read(), 4);
+  } while (ChangeCompactOptions());
+}
+
+// Check background error counter bumped on flush failures.
+TEST_F(DBIOFailureTest, DropWritesFlush) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.max_background_flushes = 1;
+    Reopen(options);
+
+    ASSERT_OK(Put("foo", "v1"));
+    // Force out-of-space errors
+    env_->drop_writes_.store(true, std::memory_order_release);
+
+    std::string property_value;
+    // Background error count is 0 now.
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("0", property_value);
+
+    // ASSERT file is too short
+    ASSERT_TRUE(dbfull()->TEST_FlushMemTable(true).IsCorruption());
+
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("1", property_value);
+
+    env_->drop_writes_.store(false, std::memory_order_release);
+  } while (ChangeCompactOptions());
+}
+
+// Check that CompactRange() returns failure if there is not enough space left
+// on device
+TEST_F(DBIOFailureTest, NoSpaceCompactRange) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.disable_auto_compactions = true;
+    Reopen(options);
+
+    // generate 5 tables
+    for (int i = 0; i < 5; ++i) {
+      ASSERT_OK(Put(Key(i), Key(i) + "v"));
+      ASSERT_OK(Flush());
+    }
+
+    // Force out-of-space errors
+    env_->no_space_.store(true, std::memory_order_release);
+
+    Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                           true /* disallow trivial move */);
+    ASSERT_TRUE(s.IsIOError());
+    ASSERT_TRUE(s.IsNoSpace());
+
+    env_->no_space_.store(false, std::memory_order_release);
+  } while (ChangeCompactOptions());
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBIOFailureTest, NonWritableFileSystem) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 4096;
+    options.arena_block_size = 4096;
+    options.env = env_;
+    Reopen(options);
+    ASSERT_OK(Put("foo", "v1"));
+    env_->non_writeable_rate_.store(100);
+    std::string big(100000, 'x');
+    int errors = 0;
+    for (int i = 0; i < 20; i++) {
+      if (!Put("foo", big).ok()) {
+        errors++;
+        env_->SleepForMicroseconds(100000);
+      }
+    }
+    ASSERT_GT(errors, 0);
+    env_->non_writeable_rate_.store(0);
+  } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBIOFailureTest, ManifestWriteError) {
+  // Test for the following problem:
+  // (a) Compaction produces file F
+  // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+  // (c) GC deletes F
+  // (d) After reopening DB, reads fail since deleted F is named in log record
+
+  // We iterate twice.  In the second iteration, everything is the
+  // same except the log record never makes it to the MANIFEST file.
+  for (int iter = 0; iter < 2; iter++) {
+    std::atomic<bool>* error_type = (iter == 0) ? &env_->manifest_sync_error_
+                                                : &env_->manifest_write_error_;
+
+    // Insert foo=>bar mapping
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.error_if_exists = false;
+    options.paranoid_checks = true;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Memtable compaction (will succeed)
+    ASSERT_OK(Flush());
+    ASSERT_EQ("bar", Get("foo"));
+    const int last = 2;
+    MoveFilesToLevel(2);
+    ASSERT_EQ(NumTableFilesAtLevel(last), 1);  // foo=>bar is now in last level
+
+    // Merging compaction (will fail)
+    error_type->store(true, std::memory_order_release);
+    ASSERT_NOK(
+        dbfull()->TEST_CompactRange(last, nullptr, nullptr));  // Should fail
+    ASSERT_EQ("bar", Get("foo"));
+
+    error_type->store(false, std::memory_order_release);
+
+    // Since paranoid_checks=true, writes should fail
+    ASSERT_NOK(Put("foo2", "bar2"));
+
+    // Recovery: should not lose data
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Try again with paranoid_checks=false
+    Close();
+    options.paranoid_checks = false;
+    Reopen(options);
+
+    // Merging compaction (will fail)
+    error_type->store(true, std::memory_order_release);
+    Status s =
+        dbfull()->TEST_CompactRange(last, nullptr, nullptr);  // Should fail
+    if (iter == 0) {
+      ASSERT_OK(s);
+    } else {
+      ASSERT_TRUE(s.IsIOError());
+    }
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Recovery: should not lose data
+    error_type->store(false, std::memory_order_release);
+    Reopen(options);
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Since paranoid_checks=false, writes should succeed
+    ASSERT_OK(Put("foo2", "bar2"));
+    ASSERT_EQ("bar", Get("foo"));
+    ASSERT_EQ("bar2", Get("foo2"));
+  }
+}
+
+TEST_F(DBIOFailureTest, PutFailsParanoid) {
+  // Test the following:
+  // (a) A random put fails in paranoid mode (simulate by sync fail)
+  // (b) All other puts have to fail, even if writes would succeed
+  // (c) All of that should happen ONLY if paranoid_checks = true
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  // simulate error
+  env_->log_write_error_.store(true, std::memory_order_release);
+  ASSERT_NOK(Put(1, "foo2", "bar2"));
+  env_->log_write_error_.store(false, std::memory_order_release);
+  // the next put should fail, too
+  ASSERT_NOK(Put(1, "foo3", "bar3"));
+  // but we're still able to read
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  // do the same thing with paranoid checks off
+  options.paranoid_checks = false;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  // simulate error
+  env_->log_write_error_.store(true, std::memory_order_release);
+  ASSERT_NOK(Put(1, "foo2", "bar2"));
+  env_->log_write_error_.store(false, std::memory_order_release);
+  // the next put should NOT fail
+  ASSERT_OK(Put(1, "foo3", "bar3"));
+}
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBIOFailureTest, FlushSstRangeSyncError) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 256 * 1024 * 1024;
+  options.writable_file_max_buffer_size = 128 * 1024;
+  options.bytes_per_sync = 128 * 1024;
+  options.level0_file_num_compaction_trigger = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(10));
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const char* io_error_msg = "range sync dummy error";
+  std::atomic<int> range_sync_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+        if (range_sync_called.fetch_add(1) == 0) {
+          Status* st = static_cast<Status*>(arg);
+          *st = Status::IOError(io_error_msg);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  std::string rnd_str =
+      rnd.RandomString(static_cast<int>(options.bytes_per_sync / 2));
+  std::string rnd_str_512kb = rnd.RandomString(512 * 1024);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  // First 1MB doesn't get range synced
+  ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo1_1", rnd_str));
+  ASSERT_OK(Put(1, "foo1_2", rnd_str));
+  ASSERT_OK(Put(1, "foo1_3", rnd_str));
+  ASSERT_OK(Put(1, "foo2", "bar"));
+  ASSERT_OK(Put(1, "foo3_1", rnd_str));
+  ASSERT_OK(Put(1, "foo3_2", rnd_str));
+  ASSERT_OK(Put(1, "foo3_3", rnd_str));
+  ASSERT_OK(Put(1, "foo4", "bar"));
+  Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_STREQ(s.getState(), io_error_msg);
+
+  // Following writes should fail as flush failed.
+  ASSERT_NOK(Put(1, "foo2", "bar3"));
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_GE(1, range_sync_called.load());
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, CompactSstRangeSyncError) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 256 * 1024 * 1024;
+  options.writable_file_max_buffer_size = 128 * 1024;
+  options.bytes_per_sync = 128 * 1024;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 256 * 1024 * 1024;
+  options.disable_auto_compactions = true;
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  std::string rnd_str =
+      rnd.RandomString(static_cast<int>(options.bytes_per_sync / 2));
+  std::string rnd_str_512kb = rnd.RandomString(512 * 1024);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  // First 1MB doesn't get range synced
+  ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+  ASSERT_OK(Put(1, "foo1_1", rnd_str));
+  ASSERT_OK(Put(1, "foo1_2", rnd_str));
+  ASSERT_OK(Put(1, "foo1_3", rnd_str));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo3_1", rnd_str));
+  ASSERT_OK(Put(1, "foo3_2", rnd_str));
+  ASSERT_OK(Put(1, "foo3_3", rnd_str));
+  ASSERT_OK(Put(1, "foo4", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+  const char* io_error_msg = "range sync dummy error";
+  std::atomic<int> range_sync_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+        if (range_sync_called.fetch_add(1) == 0) {
+          Status* st = static_cast<Status*>(arg);
+          *st = Status::IOError(io_error_msg);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->SetOptions(handles_[1],
+                                 {
+                                     {"disable_auto_compactions", "false"},
+                                 }));
+  Status s = dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_STREQ(s.getState(), io_error_msg);
+
+  // Following writes should fail as flush failed.
+  ASSERT_NOK(Put(1, "foo2", "bar3"));
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_GE(1, range_sync_called.load());
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, FlushSstCloseError) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  options.level0_file_num_compaction_trigger = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const char* io_error_msg = "close dummy error";
+  std::atomic<int> close_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SpecialEnv::SStableFile::Close", [&](void* arg) {
+        if (close_called.fetch_add(1) == 0) {
+          Status* st = static_cast<Status*>(arg);
+          *st = Status::IOError(io_error_msg);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  ASSERT_OK(Put(1, "foo", "bar2"));
+  Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_STREQ(s.getState(), io_error_msg);
+
+  // Following writes should fail as flush failed.
+  ASSERT_NOK(Put(1, "foo2", "bar3"));
+  ASSERT_EQ("bar2", Get(1, "foo"));
+  ASSERT_EQ("bar1", Get(1, "foo1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ("bar2", Get(1, "foo"));
+  ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
+TEST_F(DBIOFailureTest, CompactionSstCloseError) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo2", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "foo", "bar2"));
+  ASSERT_OK(Put(1, "foo2", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "foo", "bar3"));
+  ASSERT_OK(Put(1, "foo2", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  const char* io_error_msg = "close dummy error";
+  std::atomic<int> close_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SpecialEnv::SStableFile::Close", [&](void* arg) {
+        if (close_called.fetch_add(1) == 0) {
+          Status* st = static_cast<Status*>(arg);
+          *st = Status::IOError(io_error_msg);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->SetOptions(handles_[1],
+                                 {
+                                     {"disable_auto_compactions", "false"},
+                                 }));
+  Status s = dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_STREQ(s.getState(), io_error_msg);
+
+  // Following writes should fail as compaction failed.
+  ASSERT_NOK(Put(1, "foo2", "bar3"));
+  ASSERT_EQ("bar3", Get(1, "foo"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ("bar3", Get(1, "foo"));
+}
+
+TEST_F(DBIOFailureTest, FlushSstSyncError) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  options.use_fsync = false;
+  options.level0_file_num_compaction_trigger = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const char* io_error_msg = "sync dummy error";
+  std::atomic<int> sync_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+        if (sync_called.fetch_add(1) == 0) {
+          Status* st = static_cast<Status*>(arg);
+          *st = Status::IOError(io_error_msg);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  ASSERT_OK(Put(1, "foo", "bar2"));
+  Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_STREQ(s.getState(), io_error_msg);
+
+  // Following writes should fail as flush failed.
+  ASSERT_NOK(Put(1, "foo2", "bar3"));
+  ASSERT_EQ("bar2", Get(1, "foo"));
+  ASSERT_EQ("bar1", Get(1, "foo1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ("bar2", Get(1, "foo"));
+  ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
+TEST_F(DBIOFailureTest, CompactionSstSyncError) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.disable_auto_compactions = true;
+  options.use_fsync = false;
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo2", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "foo", "bar2"));
+  ASSERT_OK(Put(1, "foo2", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "foo", "bar3"));
+  ASSERT_OK(Put(1, "foo2", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  const char* io_error_msg = "sync dummy error";
+  std::atomic<int> sync_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+        if (sync_called.fetch_add(1) == 0) {
+          Status* st = static_cast<Status*>(arg);
+          *st = Status::IOError(io_error_msg);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->SetOptions(handles_[1],
+                                 {
+                                     {"disable_auto_compactions", "false"},
+                                 }));
+  Status s = dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_STREQ(s.getState(), io_error_msg);
+
+  // Following writes should fail as compaction failed.
+  ASSERT_NOK(Put(1, "foo2", "bar3"));
+  ASSERT_EQ("bar3", Get(1, "foo"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ("bar3", Get(1, "foo"));
+}
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter.cc b/src/rocksdb/db/db_iter.cc
new file mode 100644
index 000000000..e1375deb7
--- /dev/null
+++ b/src/rocksdb/db/db_iter.cc
@@ -0,0 +1,1708 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/wide/wide_column_serialization.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "trace_replay/trace_replay.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DBIter::DBIter(Env* _env, const ReadOptions& read_options,
+               const ImmutableOptions& ioptions,
+               const MutableCFOptions& mutable_cf_options,
+               const Comparator* cmp, InternalIterator* iter,
+               const Version* version, SequenceNumber s, bool arena_mode,
+               uint64_t max_sequential_skip_in_iterations,
+               ReadCallback* read_callback, DBImpl* db_impl,
+               ColumnFamilyData* cfd, bool expose_blob_index)
+    : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+      env_(_env),
+      clock_(ioptions.clock),
+      logger_(ioptions.logger),
+      user_comparator_(cmp),
+      merge_operator_(ioptions.merge_operator.get()),
+      iter_(iter),
+      version_(version),
+      read_callback_(read_callback),
+      sequence_(s),
+      statistics_(ioptions.stats),
+      max_skip_(max_sequential_skip_in_iterations),
+      max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
+      num_internal_keys_skipped_(0),
+      iterate_lower_bound_(read_options.iterate_lower_bound),
+      iterate_upper_bound_(read_options.iterate_upper_bound),
+      direction_(kForward),
+      valid_(false),
+      current_entry_is_merged_(false),
+      is_key_seqnum_zero_(false),
+      prefix_same_as_start_(mutable_cf_options.prefix_extractor
+                                ? read_options.prefix_same_as_start
+                                : false),
+      pin_thru_lifetime_(read_options.pin_data),
+      expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
+                                     read_options.total_order_seek ||
+                                     read_options.auto_prefix_mode),
+      read_tier_(read_options.read_tier),
+      fill_cache_(read_options.fill_cache),
+      verify_checksums_(read_options.verify_checksums),
+      expose_blob_index_(expose_blob_index),
+      is_blob_(false),
+      arena_mode_(arena_mode),
+      db_impl_(db_impl),
+      cfd_(cfd),
+      timestamp_ub_(read_options.timestamp),
+      timestamp_lb_(read_options.iter_start_ts),
+      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) {
+  RecordTick(statistics_, NO_ITERATOR_CREATED);
+  if (pin_thru_lifetime_) {
+    pinned_iters_mgr_.StartPinning();
+  }
+  if (iter_.iter()) {
+    iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+  }
+  status_.PermitUncheckedError();
+  assert(timestamp_size_ ==
+         user_comparator_.user_comparator()->timestamp_size());
+}
+
+Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
+  if (prop == nullptr) {
+    return Status::InvalidArgument("prop is nullptr");
+  }
+  if (prop_name == "rocksdb.iterator.super-version-number") {
+    // First try to pass the value returned from inner iterator.
+    return iter_.iter()->GetProperty(prop_name, prop);
+  } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
+    if (valid_) {
+      *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
+    } else {
+      *prop = "Iterator is not valid.";
+    }
+    return Status::OK();
+  } else if (prop_name == "rocksdb.iterator.internal-key") {
+    *prop = saved_key_.GetUserKey().ToString();
+    return Status::OK();
+  }
+  return Status::InvalidArgument("Unidentified property.");
+}
+
+bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+  Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */);
+  if (!s.ok()) {
+    status_ = Status::Corruption("In DBIter: ", s.getState());
+    valid_ = false;
+    ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState());
+    return false;
+  } else {
+    return true;
+  }
+}
+
+void DBIter::Next() {
+  assert(valid_);
+  assert(status_.ok());
+
+  PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_);
+  // Release temporarily pinned blocks from last operation
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  local_stats_.skip_count_ += num_internal_keys_skipped_;
+  local_stats_.skip_count_--;
+  num_internal_keys_skipped_ = 0;
+  bool ok = true;
+  if (direction_ == kReverse) {
+    is_key_seqnum_zero_ = false;
+    if (!ReverseToForward()) {
+      ok = false;
+    }
+  } else if (!current_entry_is_merged_) {
+    // If the current value is not a merge, the iter position is the
+    // current key, which is already returned. We can safely issue a
+    // Next() without checking the current key.
+    // If the current key is a merge, very likely iter already points
+    // to the next internal position.
+    assert(iter_.Valid());
+    iter_.Next();
+    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+  }
+
+  local_stats_.next_count_++;
+  if (ok && iter_.Valid()) {
+    ClearSavedValue();
+
+    if (prefix_same_as_start_) {
+      assert(prefix_extractor_ != nullptr);
+      const Slice prefix = prefix_.GetUserKey();
+      FindNextUserEntry(true /* skipping the current user key */, &prefix);
+    } else {
+      FindNextUserEntry(true /* skipping the current user key */, nullptr);
+    }
+  } else {
+    is_key_seqnum_zero_ = false;
+    valid_ = false;
+  }
+  if (statistics_ != nullptr && valid_) {
+    local_stats_.next_found_count_++;
+    local_stats_.bytes_read_ += (key().size() + value().size());
+  }
+}
+
+bool DBIter::SetBlobValueIfNeeded(const Slice& user_key,
+                                  const Slice& blob_index) {
+  assert(!is_blob_);
+  assert(blob_value_.empty());
+
+  if (expose_blob_index_) {  // Stacked BlobDB implementation
+    is_blob_ = true;
+    return true;
+  }
+
+  if (!version_) {
+    status_ = Status::Corruption("Encountered unexpected blob index.");
+    valid_ = false;
+    return false;
+  }
+
+  // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to
+  // avoid having to copy options back and forth.
+  ReadOptions read_options;
+  read_options.read_tier = read_tier_;
+  read_options.fill_cache = fill_cache_;
+  read_options.verify_checksums = verify_checksums_;
+
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+  constexpr uint64_t* bytes_read = nullptr;
+
+  const Status s = version_->GetBlob(read_options, user_key, blob_index,
+                                     prefetch_buffer, &blob_value_, bytes_read);
+
+  if (!s.ok()) {
+    status_ = s;
+    valid_ = false;
+    return false;
+  }
+
+  is_blob_ = true;
+  return true;
+}
+
+bool DBIter::SetValueAndColumnsFromEntity(Slice slice) {
+  assert(value_.empty());
+  assert(wide_columns_.empty());
+
+  const Status s = WideColumnSerialization::Deserialize(slice, wide_columns_);
+
+  if (!s.ok()) {
+    status_ = s;
+    valid_ = false;
+    return false;
+  }
+
+  if (!wide_columns_.empty() &&
+      wide_columns_[0].name() == kDefaultWideColumnName) {
+    value_ = wide_columns_[0].value();
+  }
+
+  return true;
+}
+
+// PRE: saved_key_ has the current user key if skipping_saved_key
+// POST: saved_key_ should have the next user key if valid_,
+//       if the current entry is a result of merge
+//           current_entry_is_merged_ => true
+//           saved_value_             => the merged value
+//
+// NOTE: In between, saved_key_ can point to a user key that has
+//       a delete marker or a sequence number higher than sequence_
+//       saved_key_ MUST have a proper user_key before calling this function
+//
+// The prefix parameter, if not null, indicates that we need to iterate
+// within the prefix, and the iterator needs to be made invalid, if no
+// more entry for the prefix can be found.
+bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) {
+  PERF_TIMER_GUARD(find_next_user_entry_time);
+  return FindNextUserEntryInternal(skipping_saved_key, prefix);
+}
+
+// Actual implementation of DBIter::FindNextUserEntry()
+bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
+                                       const Slice* prefix) {
+  // Loop until we hit an acceptable entry to yield
+  assert(iter_.Valid());
+  assert(status_.ok());
+  assert(direction_ == kForward);
+  current_entry_is_merged_ = false;
+
+  // How many times in a row we have skipped an entry with user key less than
+  // or equal to saved_key_. We could skip these entries either because
+  // sequence numbers were too high or because skipping_saved_key = true.
+  // What saved_key_ contains throughout this method:
+  //  - if skipping_saved_key : saved_key_ contains the key that we need
+  //                            to skip, and we haven't seen any keys greater
+  //                            than that,
+  //  - if num_skipped > 0    : saved_key_ contains the key that we have skipped
+  //                            num_skipped times, and we haven't seen any keys
+  //                            greater than that,
+  //  - none of the above     : saved_key_ can contain anything, it doesn't
+  //                            matter.
+  uint64_t num_skipped = 0;
+  // For write unprepared, the target sequence number in reseek could be larger
+  // than the snapshot, and thus needs to be skipped again. This could result in
+  // an infinite loop of reseeks. To avoid that, we limit the number of reseeks
+  // to one.
+  bool reseek_done = false;
+
+  do {
+    // Will update is_key_seqnum_zero_ as soon as we parsed the current key
+    // but we need to save the previous value to be used in the loop.
+    bool is_prev_key_seqnum_zero = is_key_seqnum_zero_;
+    if (!ParseKey(&ikey_)) {
+      is_key_seqnum_zero_ = false;
+      return false;
+    }
+    Slice user_key_without_ts =
+        StripTimestampFromUserKey(ikey_.user_key, timestamp_size_);
+
+    is_key_seqnum_zero_ = (ikey_.sequence == 0);
+
+    assert(iterate_upper_bound_ == nullptr ||
+           iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound ||
+           user_comparator_.CompareWithoutTimestamp(
+               user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
+               /*b_has_ts=*/false) < 0);
+    if (iterate_upper_bound_ != nullptr &&
+        iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound &&
+        user_comparator_.CompareWithoutTimestamp(
+            user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
+            /*b_has_ts=*/false) >= 0) {
+      break;
+    }
+
+    assert(prefix == nullptr || prefix_extractor_ != nullptr);
+    if (prefix != nullptr &&
+        prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) !=
+            0) {
+      assert(prefix_same_as_start_);
+      break;
+    }
+
+    if (TooManyInternalKeysSkipped()) {
+      return false;
+    }
+
+    assert(ikey_.user_key.size() >= timestamp_size_);
+    Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey(
+                                         ikey_.user_key, timestamp_size_)
+                                   : Slice();
+    bool more_recent = false;
+    if (IsVisible(ikey_.sequence, ts, &more_recent)) {
+      // If the previous entry is of seqnum 0, the current entry will not
+      // possibly be skipped. This condition can potentially be relaxed to
+      // prev_key.seq <= ikey_.sequence. We are cautious because it will be more
+      // prone to bugs causing the same user key with the same sequence number.
+      // Note that with current timestamp implementation, the same user key can
+      // have different timestamps and zero sequence number on the bottommost
+      // level. This may change in the future.
+      if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) &&
+          skipping_saved_key &&
+          CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
+        num_skipped++;  // skip this entry
+        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+      } else {
+        assert(!skipping_saved_key ||
+               CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
+        if (!iter_.PrepareValue()) {
+          assert(!iter_.status().ok());
+          valid_ = false;
+          return false;
+        }
+        num_skipped = 0;
+        reseek_done = false;
+        switch (ikey_.type) {
+          case kTypeDeletion:
+          case kTypeDeletionWithTimestamp:
+          case kTypeSingleDeletion:
+            // Arrange to skip all upcoming entries for this key since
+            // they are hidden by this deletion.
+            if (timestamp_lb_) {
+              saved_key_.SetInternalKey(ikey_);
+              valid_ = true;
+              return true;
+            } else {
+              saved_key_.SetUserKey(
+                  ikey_.user_key, !pin_thru_lifetime_ ||
+                                      !iter_.iter()->IsKeyPinned() /* copy */);
+              skipping_saved_key = true;
+              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+            }
+            break;
+          case kTypeValue:
+          case kTypeBlobIndex:
+          case kTypeWideColumnEntity:
+            if (timestamp_lb_) {
+              saved_key_.SetInternalKey(ikey_);
+            } else {
+              saved_key_.SetUserKey(
+                  ikey_.user_key, !pin_thru_lifetime_ ||
+                                      !iter_.iter()->IsKeyPinned() /* copy */);
+            }
+
+            if (ikey_.type == kTypeBlobIndex) {
+              if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) {
+                return false;
+              }
+
+              SetValueAndColumnsFromPlain(expose_blob_index_ ? iter_.value()
+                                                             : blob_value_);
+            } else if (ikey_.type == kTypeWideColumnEntity) {
+              if (!SetValueAndColumnsFromEntity(iter_.value())) {
+                return false;
+              }
+            } else {
+              assert(ikey_.type == kTypeValue);
+              SetValueAndColumnsFromPlain(iter_.value());
+            }
+
+            valid_ = true;
+            return true;
+            break;
+          case kTypeMerge:
+            saved_key_.SetUserKey(
+                ikey_.user_key,
+                !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
+            // By now, we are sure the current ikey is going to yield a value
+            current_entry_is_merged_ = true;
+            valid_ = true;
+            return MergeValuesNewToOld();  // Go to a different state machine
+            break;
+          default:
+            valid_ = false;
+            status_ = Status::Corruption(
+                "Unknown value type: " +
+                std::to_string(static_cast<unsigned int>(ikey_.type)));
+            return false;
+        }
+      }
+    } else {
+      if (more_recent) {
+        PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+      }
+
+      // This key was inserted after our snapshot was taken or skipped by
+      // timestamp range. If this happens too many times in a row for the same
+      // user key, we want to seek to the target sequence number.
+      int cmp = user_comparator_.CompareWithoutTimestamp(
+          ikey_.user_key, saved_key_.GetUserKey());
+      if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
+        num_skipped++;
+      } else {
+        saved_key_.SetUserKey(
+            ikey_.user_key,
+            !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+        skipping_saved_key = false;
+        num_skipped = 0;
+        reseek_done = false;
+      }
+    }
+
+    // If we have sequentially iterated via numerous equal keys, then it's
+    // better to seek so that we can avoid too many key comparisons.
+    //
+    // To avoid infinite loops, do not reseek if we have already attempted to
+    // reseek previously.
+    //
+    // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
+    // then it does not make sense to reseek as we would actually land further
+    // away from the desired key. There is opportunity for optimization here.
+    if (num_skipped > max_skip_ && !reseek_done) {
+      is_key_seqnum_zero_ = false;
+      num_skipped = 0;
+      reseek_done = true;
+      std::string last_key;
+      if (skipping_saved_key) {
+        // We're looking for the next user-key but all we see are the same
+        // user-key with decreasing sequence numbers. Fast forward to
+        // sequence number 0 and type deletion (the smallest type).
+        if (timestamp_size_ == 0) {
+          AppendInternalKey(
+              &last_key,
+              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion));
+        } else {
+          const std::string kTsMin(timestamp_size_, '\0');
+          AppendInternalKeyWithDifferentTimestamp(
+              &last_key,
+              ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion),
+              kTsMin);
+        }
+        // Don't set skipping_saved_key = false because we may still see more
+        // user-keys equal to saved_key_.
+      } else {
+        // We saw multiple entries with this user key and sequence numbers
+        // higher than sequence_. Fast forward to sequence_.
+        // Note that this only covers a case when a higher key was overwritten
+        // many times since our snapshot was taken, not the case when a lot of
+        // different keys were inserted after our snapshot was taken.
+        if (timestamp_size_ == 0) {
+          AppendInternalKey(
+              &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+                                           kValueTypeForSeek));
+        } else {
+          AppendInternalKeyWithDifferentTimestamp(
+              &last_key,
+              ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+                                kValueTypeForSeek),
+              *timestamp_ub_);
+        }
+      }
+      iter_.Seek(last_key);
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+    } else {
+      iter_.Next();
+    }
+  } while (iter_.Valid());
+
+  valid_ = false;
+  return iter_.status().ok();
+}
+
+// Merge values of the same user key starting from the current iter_ position
+// Scan from the newer entries to older entries.
+// PRE: iter_.key() points to the first merge type entry
+//      saved_key_ stores the user key
+//      iter_.PrepareValue() has been called
+// POST: saved_value_ has the merged value for the user key
+//       iter_ points to the next entry (or invalid)
+bool DBIter::MergeValuesNewToOld() {
+  if (!merge_operator_) {
+    ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
+    status_ = Status::InvalidArgument("merge_operator_ must be set.");
+    valid_ = false;
+    return false;
+  }
+
+  // Temporarily pin the blocks that hold merge operands
+  TempPinData();
+  merge_context_.Clear();
+  // Start the merge process by pushing the first operand
+  merge_context_.PushOperand(
+      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+  TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand");
+
+  ParsedInternalKey ikey;
+  for (iter_.Next(); iter_.Valid(); iter_.Next()) {
+    TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand");
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+
+    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+                                                saved_key_.GetUserKey())) {
+      // hit the next user key, stop right here
+      break;
+    }
+    if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
+        kTypeDeletionWithTimestamp == ikey.type) {
+      // hit a delete with the same user key, stop right here
+      // iter_ is positioned after delete
+      iter_.Next();
+      break;
+    }
+    if (!iter_.PrepareValue()) {
+      valid_ = false;
+      return false;
+    }
+
+    if (kTypeValue == ikey.type) {
+      // hit a put, merge the put value with operands and store the
+      // final result in saved_value_. We are done!
+      const Slice val = iter_.value();
+      if (!Merge(&val, ikey.user_key)) {
+        return false;
+      }
+      // iter_ is positioned after put
+      iter_.Next();
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+      return true;
+    } else if (kTypeMerge == ikey.type) {
+      // hit a merge, add the value as an operand and run associative merge.
+      // when complete, add result to operands and continue.
+      merge_context_.PushOperand(
+          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+      PERF_COUNTER_ADD(internal_merge_count, 1);
+    } else if (kTypeBlobIndex == ikey.type) {
+      if (expose_blob_index_) {
+        status_ =
+            Status::NotSupported("BlobDB does not support merge operator.");
+        valid_ = false;
+        return false;
+      }
+      // hit a put, merge the put value with operands and store the
+      // final result in saved_value_. We are done!
+      if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+        return false;
+      }
+      valid_ = true;
+      if (!Merge(&blob_value_, ikey.user_key)) {
+        return false;
+      }
+
+      ResetBlobValue();
+
+      // iter_ is positioned after put
+      iter_.Next();
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+      return true;
+    } else if (kTypeWideColumnEntity == ikey.type) {
+      if (!MergeEntity(iter_.value(), ikey.user_key)) {
+        return false;
+      }
+
+      // iter_ is positioned after put
+      iter_.Next();
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+
+      return true;
+    } else {
+      valid_ = false;
+      status_ = Status::Corruption(
+          "Unrecognized value type: " +
+          std::to_string(static_cast<unsigned int>(ikey.type)));
+      return false;
+    }
+  }
+
+  if (!iter_.status().ok()) {
+    valid_ = false;
+    return false;
+  }
+
+  // we either exhausted all internal keys under this user key, or hit
+  // a deletion marker.
+  // feed null as the existing value to the merge operator, such that
+  // client can differentiate this scenario and do things accordingly.
+  if (!Merge(nullptr, saved_key_.GetUserKey())) {
+    return false;
+  }
+  assert(status_.ok());
+  return true;
+}
+
+void DBIter::Prev() {
+  assert(valid_);
+  assert(status_.ok());
+
+  PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+  bool ok = true;
+  if (direction_ == kForward) {
+    if (!ReverseToBackward()) {
+      ok = false;
+    }
+  }
+  if (ok) {
+    ClearSavedValue();
+
+    Slice prefix;
+    if (prefix_same_as_start_) {
+      assert(prefix_extractor_ != nullptr);
+      prefix = prefix_.GetUserKey();
+    }
+    PrevInternal(prefix_same_as_start_ ? &prefix : nullptr);
+  }
+
+  if (statistics_ != nullptr) {
+    local_stats_.prev_count_++;
+    if (valid_) {
+      local_stats_.prev_found_count_++;
+      local_stats_.bytes_read_ += (key().size() + value().size());
+    }
+  }
+}
+
+bool DBIter::ReverseToForward() {
+  assert(iter_.status().ok());
+
+  // When moving backwards, iter_ is positioned on _previous_ key, which may
+  // not exist or may have different prefix than the current key().
+  // If that's the case, seek iter_ to current key.
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+    IterKey last_key;
+    ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
+                            kValueTypeForSeek);
+    if (timestamp_size_ > 0) {
+      // TODO: pre-create kTsMax.
+      const std::string kTsMax(timestamp_size_, '\xff');
+      pikey.SetTimestamp(kTsMax);
+    }
+    last_key.SetInternalKey(pikey);
+    iter_.Seek(last_key.GetInternalKey());
+    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+  }
+
+  direction_ = kForward;
+  // Skip keys less than the current key() (a.k.a. saved_key_).
+  while (iter_.Valid()) {
+    ParsedInternalKey ikey;
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+    if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) {
+      return true;
+    }
+    iter_.Next();
+  }
+
+  if (!iter_.status().ok()) {
+    valid_ = false;
+    return false;
+  }
+
+  return true;
+}
+
+// Move iter_ to the key before saved_key_.
+bool DBIter::ReverseToBackward() {
+  assert(iter_.status().ok());
+
+  // When current_entry_is_merged_ is true, iter_ may be positioned on the next
+  // key, which may not exist or may have prefix different from current.
+  // If that's the case, seek to saved_key_.
+  if (current_entry_is_merged_ &&
+      (!expect_total_order_inner_iter() || !iter_.Valid())) {
+    IterKey last_key;
+    // Using kMaxSequenceNumber and kValueTypeForSeek
+    // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
+    // than saved_key_.
+    last_key.SetInternalKey(ParsedInternalKey(
+        saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
+    if (!expect_total_order_inner_iter()) {
+      iter_.SeekForPrev(last_key.GetInternalKey());
+    } else {
+      // Some iterators may not support SeekForPrev(), so we avoid using it
+      // when prefix seek mode is disabled. This is somewhat expensive
+      // (an extra Prev(), as well as an extra change of direction of iter_),
+      // so we may need to reconsider it later.
+      iter_.Seek(last_key.GetInternalKey());
+      if (!iter_.Valid() && iter_.status().ok()) {
+        iter_.SeekToLast();
+      }
+    }
+    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+  }
+
+  direction_ = kReverse;
+  return FindUserKeyBeforeSavedKey();
+}
+
+void DBIter::PrevInternal(const Slice* prefix) {
+  while (iter_.Valid()) {
+    saved_key_.SetUserKey(
+        ExtractUserKey(iter_.key()),
+        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+
+    assert(prefix == nullptr || prefix_extractor_ != nullptr);
+    if (prefix != nullptr &&
+        prefix_extractor_
+                ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(),
+                                                      timestamp_size_))
+                .compare(*prefix) != 0) {
+      assert(prefix_same_as_start_);
+      // Current key does not have the same prefix as start
+      valid_ = false;
+      return;
+    }
+
+    assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
+           user_comparator_.CompareWithoutTimestamp(
+               saved_key_.GetUserKey(), /*a_has_ts=*/true,
+               *iterate_lower_bound_, /*b_has_ts=*/false) >= 0);
+    if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
+        user_comparator_.CompareWithoutTimestamp(
+            saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
+            /*b_has_ts=*/false) < 0) {
+      // We've iterated earlier than the user-specified lower bound.
+      valid_ = false;
+      return;
+    }
+
+    if (!FindValueForCurrentKey()) {  // assigns valid_
+      return;
+    }
+
+    // Whether or not we found a value for current key, we need iter_ to end up
+    // on a smaller key.
+    if (!FindUserKeyBeforeSavedKey()) {
+      return;
+    }
+
+    if (valid_) {
+      // Found the value.
+      return;
+    }
+
+    if (TooManyInternalKeysSkipped(false)) {
+      return;
+    }
+  }
+
+  // We haven't found any key - iterator is not valid
+  valid_ = false;
+}
+
+// Used for backwards iteration.
+// Looks at the entries with user key saved_key_ and finds the most up-to-date
+// value for it, or executes a merge, or determines that the value was deleted.
+// Sets valid_ to true if the value is found and is ready to be presented to
+// the user through value().
+// Sets valid_ to false if the value was deleted, and we should try another key.
+// Returns false if an error occurred, and !status().ok() and !valid_.
+//
+// PRE: iter_ is positioned on the last entry with user key equal to saved_key_.
+// POST: iter_ is positioned on one of the entries equal to saved_key_, or on
+//       the entry just before them, or on the entry just after them.
+bool DBIter::FindValueForCurrentKey() {
+  assert(iter_.Valid());
+  merge_context_.Clear();
+  current_entry_is_merged_ = false;
+  // last entry before merge (could be kTypeDeletion,
+  // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue,
+  // kTypeBlobIndex, or kTypeWideColumnEntity)
+  ValueType last_not_merge_type = kTypeDeletion;
+  ValueType last_key_entry_type = kTypeDeletion;
+
+  // If false, it indicates that we have not seen any valid entry, even though
+  // last_key_entry_type is initialized to kTypeDeletion.
+  bool valid_entry_seen = false;
+
+  // Temporarily pin blocks that hold (merge operands / the value)
+  ReleaseTempPinnedData();
+  TempPinData();
+  size_t num_skipped = 0;
+  while (iter_.Valid()) {
+    ParsedInternalKey ikey;
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+
+    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+                                                saved_key_.GetUserKey())) {
+      // Found a smaller user key, thus we are done with current user key.
+      break;
+    }
+
+    assert(ikey.user_key.size() >= timestamp_size_);
+    Slice ts;
+    if (timestamp_size_ > 0) {
+      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+                 timestamp_size_);
+    }
+
+    bool visible = IsVisible(ikey.sequence, ts);
+    if (!visible &&
+        (timestamp_lb_ == nullptr ||
+         user_comparator_.CompareTimestamp(ts, *timestamp_ub_) > 0)) {
+      // Found an invisible version of the current user key, and it must have
+      // a higher sequence number or timestamp. Therefore, we are done with the
+      // current user key.
+      break;
+    }
+
+    if (!ts.empty()) {
+      saved_timestamp_.assign(ts.data(), ts.size());
+    }
+
+    if (TooManyInternalKeysSkipped()) {
+      return false;
+    }
+
+    // This user key has lots of entries.
+    // We're going from old to new, and it's taking too long. Let's do a Seek()
+    // and go from new to old. This helps when a key was overwritten many times.
+    if (num_skipped >= max_skip_) {
+      return FindValueForCurrentKeyUsingSeek();
+    }
+
+    if (!iter_.PrepareValue()) {
+      valid_ = false;
+      return false;
+    }
+
+    if (timestamp_lb_ != nullptr) {
+      // Only needed when timestamp_lb_ is not null
+      [[maybe_unused]] const bool ret = ParseKey(&ikey_);
+      saved_ikey_.assign(iter_.key().data(), iter_.key().size());
+      // Since the preceding ParseKey(&ikey) succeeds, so must this.
+      assert(ret);
+    }
+
+    valid_entry_seen = true;
+    last_key_entry_type = ikey.type;
+    switch (last_key_entry_type) {
+      case kTypeValue:
+      case kTypeBlobIndex:
+      case kTypeWideColumnEntity:
+        if (iter_.iter()->IsValuePinned()) {
+          pinned_value_ = iter_.value();
+        } else {
+          valid_ = false;
+          status_ = Status::NotSupported(
+              "Backward iteration not supported if underlying iterator's value "
+              "cannot be pinned.");
+        }
+        merge_context_.Clear();
+        last_not_merge_type = last_key_entry_type;
+        if (!status_.ok()) {
+          return false;
+        }
+        break;
+      case kTypeDeletion:
+      case kTypeDeletionWithTimestamp:
+      case kTypeSingleDeletion:
+        merge_context_.Clear();
+        last_not_merge_type = last_key_entry_type;
+        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+        break;
+      case kTypeMerge: {
+        assert(merge_operator_ != nullptr);
+        merge_context_.PushOperandBack(
+            iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+        PERF_COUNTER_ADD(internal_merge_count, 1);
+      } break;
+      default:
+        valid_ = false;
+        status_ = Status::Corruption(
+            "Unknown value type: " +
+            std::to_string(static_cast<unsigned int>(last_key_entry_type)));
+        return false;
+    }
+
+    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+    iter_.Prev();
+    ++num_skipped;
+
+    if (visible && timestamp_lb_ != nullptr) {
+      // If timestamp_lb_ is not nullptr, we do not have to look further for
+      // another internal key. We can return this current internal key. Yet we
+      // still keep the invariant that iter_ is positioned before the returned
+      // key.
+      break;
+    }
+  }
+
+  if (!iter_.status().ok()) {
+    valid_ = false;
+    return false;
+  }
+
+  if (!valid_entry_seen) {
+    // Since we haven't seen any valid entry, last_key_entry_type remains
+    // unchanged and the same as its initial value.
+    assert(last_key_entry_type == kTypeDeletion);
+    assert(last_not_merge_type == kTypeDeletion);
+    valid_ = false;
+    return true;
+  }
+
+  if (timestamp_lb_ != nullptr) {
+    assert(last_key_entry_type == ikey_.type);
+  }
+
+  Status s;
+  s.PermitUncheckedError();
+
+  switch (last_key_entry_type) {
+    case kTypeDeletion:
+    case kTypeDeletionWithTimestamp:
+    case kTypeSingleDeletion:
+      if (timestamp_lb_ == nullptr) {
+        valid_ = false;
+      } else {
+        saved_key_.SetInternalKey(saved_ikey_);
+        valid_ = true;
+      }
+      return true;
+    case kTypeMerge:
+      current_entry_is_merged_ = true;
+      if (last_not_merge_type == kTypeDeletion ||
+          last_not_merge_type == kTypeSingleDeletion ||
+          last_not_merge_type == kTypeDeletionWithTimestamp) {
+        if (!Merge(nullptr, saved_key_.GetUserKey())) {
+          return false;
+        }
+        return true;
+      } else if (last_not_merge_type == kTypeBlobIndex) {
+        if (expose_blob_index_) {
+          status_ =
+              Status::NotSupported("BlobDB does not support merge operator.");
+          valid_ = false;
+          return false;
+        }
+        if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+          return false;
+        }
+        valid_ = true;
+        if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+          return false;
+        }
+
+        ResetBlobValue();
+
+        return true;
+      } else if (last_not_merge_type == kTypeWideColumnEntity) {
+        if (!MergeEntity(pinned_value_, saved_key_.GetUserKey())) {
+          return false;
+        }
+
+        return true;
+      } else {
+        assert(last_not_merge_type == kTypeValue);
+        if (!Merge(&pinned_value_, saved_key_.GetUserKey())) {
+          return false;
+        }
+        return true;
+      }
+      break;
+    case kTypeValue:
+      if (timestamp_lb_ != nullptr) {
+        saved_key_.SetInternalKey(saved_ikey_);
+      }
+
+      SetValueAndColumnsFromPlain(pinned_value_);
+
+      break;
+    case kTypeBlobIndex:
+      if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+        return false;
+      }
+
+      SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
+                                                     : blob_value_);
+
+      break;
+    case kTypeWideColumnEntity:
+      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
+        return false;
+      }
+      break;
+    default:
+      valid_ = false;
+      status_ = Status::Corruption(
+          "Unknown value type: " +
+          std::to_string(static_cast<unsigned int>(last_key_entry_type)));
+      return false;
+  }
+  if (!s.ok()) {
+    valid_ = false;
+    status_ = s;
+    return false;
+  }
+  valid_ = true;
+  return true;
+}
+
+// This function is used in FindValueForCurrentKey.
+// We use Seek() function instead of Prev() to find necessary value
+// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
+//       Would be nice to reuse some code.
+bool DBIter::FindValueForCurrentKeyUsingSeek() {
+  // FindValueForCurrentKey will enable pinning before calling
+  // FindValueForCurrentKeyUsingSeek()
+  assert(pinned_iters_mgr_.PinningEnabled());
+  std::string last_key;
+  if (0 == timestamp_size_) {
+    AppendInternalKey(&last_key,
+                      ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+                                        kValueTypeForSeek));
+  } else {
+    AppendInternalKeyWithDifferentTimestamp(
+        &last_key,
+        ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+                          kValueTypeForSeek),
+        timestamp_lb_ == nullptr ? *timestamp_ub_ : *timestamp_lb_);
+  }
+  iter_.Seek(last_key);
+  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+
+  // In case read_callback presents, the value we seek to may not be visible.
+  // Find the next value that's visible.
+  ParsedInternalKey ikey;
+
+  while (true) {
+    if (!iter_.Valid()) {
+      valid_ = false;
+      return iter_.status().ok();
+    }
+
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+    assert(ikey.user_key.size() >= timestamp_size_);
+    Slice ts;
+    if (timestamp_size_ > 0) {
+      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+                 timestamp_size_);
+    }
+
+    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+                                                saved_key_.GetUserKey())) {
+      // No visible values for this key, even though FindValueForCurrentKey()
+      // has seen some. This is possible if we're using a tailing iterator, and
+      // the entries were discarded in a compaction.
+      valid_ = false;
+      return true;
+    }
+
+    if (IsVisible(ikey.sequence, ts)) {
+      break;
+    }
+
+    iter_.Next();
+  }
+
+  if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+      kTypeDeletionWithTimestamp == ikey.type) {
+    if (timestamp_lb_ == nullptr) {
+      valid_ = false;
+    } else {
+      valid_ = true;
+      saved_key_.SetInternalKey(ikey);
+    }
+    return true;
+  }
+  if (!iter_.PrepareValue()) {
+    valid_ = false;
+    return false;
+  }
+  if (timestamp_size_ > 0) {
+    Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_);
+    saved_timestamp_.assign(ts.data(), ts.size());
+  }
+  if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex ||
+      ikey.type == kTypeWideColumnEntity) {
+    assert(iter_.iter()->IsValuePinned());
+    pinned_value_ = iter_.value();
+    if (ikey.type == kTypeBlobIndex) {
+      if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) {
+        return false;
+      }
+
+      SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
+                                                     : blob_value_);
+    } else if (ikey.type == kTypeWideColumnEntity) {
+      if (!SetValueAndColumnsFromEntity(pinned_value_)) {
+        return false;
+      }
+    } else {
+      assert(ikey.type == kTypeValue);
+      SetValueAndColumnsFromPlain(pinned_value_);
+    }
+
+    if (timestamp_lb_ != nullptr) {
+      saved_key_.SetInternalKey(ikey);
+    }
+
+    valid_ = true;
+    return true;
+  }
+
+  // kTypeMerge. We need to collect all kTypeMerge values and save them
+  // in operands
+  assert(ikey.type == kTypeMerge);
+  current_entry_is_merged_ = true;
+  merge_context_.Clear();
+  merge_context_.PushOperand(
+      iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+  while (true) {
+    iter_.Next();
+
+    if (!iter_.Valid()) {
+      if (!iter_.status().ok()) {
+        valid_ = false;
+        return false;
+      }
+      break;
+    }
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+    if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+                                                saved_key_.GetUserKey())) {
+      break;
+    }
+    if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+        ikey.type == kTypeDeletionWithTimestamp) {
+      break;
+    }
+    if (!iter_.PrepareValue()) {
+      valid_ = false;
+      return false;
+    }
+
+    if (ikey.type == kTypeValue) {
+      const Slice val = iter_.value();
+      if (!Merge(&val, saved_key_.GetUserKey())) {
+        return false;
+      }
+      return true;
+    } else if (ikey.type == kTypeMerge) {
+      merge_context_.PushOperand(
+          iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+      PERF_COUNTER_ADD(internal_merge_count, 1);
+    } else if (ikey.type == kTypeBlobIndex) {
+      if (expose_blob_index_) {
+        status_ =
+            Status::NotSupported("BlobDB does not support merge operator.");
+        valid_ = false;
+        return false;
+      }
+      if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+        return false;
+      }
+      valid_ = true;
+      if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+        return false;
+      }
+
+      ResetBlobValue();
+
+      return true;
+    } else if (ikey.type == kTypeWideColumnEntity) {
+      if (!MergeEntity(iter_.value(), saved_key_.GetUserKey())) {
+        return false;
+      }
+
+      return true;
+    } else {
+      valid_ = false;
+      status_ = Status::Corruption(
+          "Unknown value type: " +
+          std::to_string(static_cast<unsigned int>(ikey.type)));
+      return false;
+    }
+  }
+
+  if (!Merge(nullptr, saved_key_.GetUserKey())) {
+    return false;
+  }
+
+  // Make sure we leave iter_ in a good state. If it's valid and we don't care
+  // about prefixes, that's already good enough. Otherwise it needs to be
+  // seeked to the current key.
+  if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+    if (!expect_total_order_inner_iter()) {
+      iter_.SeekForPrev(last_key);
+    } else {
+      iter_.Seek(last_key);
+      if (!iter_.Valid() && iter_.status().ok()) {
+        iter_.SeekToLast();
+      }
+    }
+    RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+  }
+
+  valid_ = true;
+  return true;
+}
+
+bool DBIter::Merge(const Slice* val, const Slice& user_key) {
+  Status s = MergeHelper::TimedFullMerge(
+      merge_operator_, user_key, val, merge_context_.GetOperands(),
+      &saved_value_, logger_, statistics_, clock_, &pinned_value_,
+      /* update_num_ops_stats */ true);
+  if (!s.ok()) {
+    valid_ = false;
+    status_ = s;
+    return false;
+  }
+
+  SetValueAndColumnsFromPlain(pinned_value_.data() ? pinned_value_
+                                                   : saved_value_);
+
+  valid_ = true;
+  return true;
+}
+
+bool DBIter::MergeEntity(const Slice& entity, const Slice& user_key) {
+  Status s = MergeHelper::TimedFullMergeWithEntity(
+      merge_operator_, user_key, entity, merge_context_.GetOperands(),
+      &saved_value_, logger_, statistics_, clock_,
+      /* update_num_ops_stats */ true);
+  if (!s.ok()) {
+    valid_ = false;
+    status_ = s;
+    return false;
+  }
+
+  if (!SetValueAndColumnsFromEntity(saved_value_)) {
+    return false;
+  }
+
+  valid_ = true;
+  return true;
+}
+
+// Move backwards until the key smaller than saved_key_.
+// Changes valid_ only if return value is false.
+bool DBIter::FindUserKeyBeforeSavedKey() {
+  assert(status_.ok());
+  size_t num_skipped = 0;
+  while (iter_.Valid()) {
+    ParsedInternalKey ikey;
+    if (!ParseKey(&ikey)) {
+      return false;
+    }
+
+    if (CompareKeyForSkip(ikey.user_key, saved_key_.GetUserKey()) < 0) {
+      return true;
+    }
+
+    if (TooManyInternalKeysSkipped()) {
+      return false;
+    }
+
+    assert(ikey.sequence != kMaxSequenceNumber);
+    assert(ikey.user_key.size() >= timestamp_size_);
+    Slice ts;
+    if (timestamp_size_ > 0) {
+      ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+                 timestamp_size_);
+    }
+    if (!IsVisible(ikey.sequence, ts)) {
+      PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+    } else {
+      PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+    }
+
+    if (num_skipped >= max_skip_) {
+      num_skipped = 0;
+      IterKey last_key;
+      ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
+                              kValueTypeForSeek);
+      if (timestamp_size_ > 0) {
+        // TODO: pre-create kTsMax.
+        const std::string kTsMax(timestamp_size_, '\xff');
+        pikey.SetTimestamp(kTsMax);
+      }
+      last_key.SetInternalKey(pikey);
+      // It would be more efficient to use SeekForPrev() here, but some
+      // iterators may not support it.
+      iter_.Seek(last_key.GetInternalKey());
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+      if (!iter_.Valid()) {
+        break;
+      }
+    } else {
+      ++num_skipped;
+    }
+
+    iter_.Prev();
+  }
+
+  if (!iter_.status().ok()) {
+    valid_ = false;
+    return false;
+  }
+
+  return true;
+}
+
+bool DBIter::TooManyInternalKeysSkipped(bool increment) {
+  if ((max_skippable_internal_keys_ > 0) &&
+      (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
+    valid_ = false;
+    status_ = Status::Incomplete("Too many internal keys skipped.");
+    return true;
+  } else if (increment) {
+    num_internal_keys_skipped_++;
+  }
+  return false;
+}
+
+bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts,
+                       bool* more_recent) {
+  // Remember that comparator orders preceding timestamp as larger.
+  // TODO(yanqin): support timestamp in read_callback_.
+  bool visible_by_seq = (read_callback_ == nullptr)
+                            ? sequence <= sequence_
+                            : read_callback_->IsVisible(sequence);
+
+  bool visible_by_ts =
+      (timestamp_ub_ == nullptr ||
+       user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) &&
+      (timestamp_lb_ == nullptr ||
+       user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0);
+
+  if (more_recent) {
+    *more_recent = !visible_by_seq;
+  }
+  return visible_by_seq && visible_by_ts;
+}
+
+void DBIter::SetSavedKeyToSeekTarget(const Slice& target) {
+  is_key_seqnum_zero_ = false;
+  SequenceNumber seq = sequence_;
+  saved_key_.Clear();
+  saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_);
+
+  if (iterate_lower_bound_ != nullptr &&
+      user_comparator_.CompareWithoutTimestamp(
+          saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
+          /*b_has_ts=*/false) < 0) {
+    // Seek key is smaller than the lower bound.
+    saved_key_.Clear();
+    saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek,
+                              timestamp_ub_);
+  }
+}
+
+void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
+  is_key_seqnum_zero_ = false;
+  saved_key_.Clear();
+  // now saved_key is used to store internal key.
+  saved_key_.SetInternalKey(target, 0 /* sequence_number */,
+                            kValueTypeForSeekForPrev, timestamp_ub_);
+
+  if (timestamp_size_ > 0) {
+    const std::string kTsMin(timestamp_size_, '\0');
+    Slice ts = kTsMin;
+    saved_key_.UpdateInternalKey(
+        /*seq=*/0, kValueTypeForSeekForPrev,
+        timestamp_lb_ == nullptr ? &ts : timestamp_lb_);
+  }
+
+  if (iterate_upper_bound_ != nullptr &&
+      user_comparator_.CompareWithoutTimestamp(
+          saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_,
+          /*b_has_ts=*/false) >= 0) {
+    saved_key_.Clear();
+    saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber,
+                              kValueTypeForSeekForPrev, timestamp_ub_);
+    if (timestamp_size_ > 0) {
+      const std::string kTsMax(timestamp_size_, '\xff');
+      Slice ts = kTsMax;
+      saved_key_.UpdateInternalKey(
+          kMaxSequenceNumber, kValueTypeForSeekForPrev,
+          timestamp_lb_ != nullptr ? timestamp_lb_ : &ts);
+    }
+  }
+}
+
+void DBIter::Seek(const Slice& target) {
+  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+  StopWatch sw(clock_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+  if (db_impl_ != nullptr && cfd_ != nullptr) {
+    // TODO: What do we do if this returns an error?
+    Slice lower_bound, upper_bound;
+    if (iterate_lower_bound_ != nullptr) {
+      lower_bound = *iterate_lower_bound_;
+    } else {
+      lower_bound = Slice("");
+    }
+    if (iterate_upper_bound_ != nullptr) {
+      upper_bound = *iterate_upper_bound_;
+    } else {
+      upper_bound = Slice("");
+    }
+    db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound)
+        .PermitUncheckedError();
+  }
+#endif  // ROCKSDB_LITE
+
+  status_ = Status::OK();
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+
+  // Seek the inner iterator based on the target key.
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+
+    SetSavedKeyToSeekTarget(target);
+    iter_.Seek(saved_key_.GetInternalKey());
+
+    RecordTick(statistics_, NUMBER_DB_SEEK);
+  }
+  if (!iter_.Valid()) {
+    valid_ = false;
+    return;
+  }
+  direction_ = kForward;
+
+  // Now the inner iterator is placed to the target position. From there,
+  // we need to find out the next key that is visible to the user.
+  ClearSavedValue();
+  if (prefix_same_as_start_) {
+    // The case where the iterator needs to be invalidated if it has exhausted
+    // keys within the same prefix of the seek key.
+    assert(prefix_extractor_ != nullptr);
+    Slice target_prefix = prefix_extractor_->Transform(target);
+    FindNextUserEntry(false /* not skipping saved_key */,
+                      &target_prefix /* prefix */);
+    if (valid_) {
+      // Remember the prefix of the seek key for the future Next() call to
+      // check.
+      prefix_.SetUserKey(target_prefix);
+    }
+  } else {
+    FindNextUserEntry(false /* not skipping saved_key */, nullptr);
+  }
+  if (!valid_) {
+    return;
+  }
+
+  // Updating stats and perf context counters.
+  if (statistics_ != nullptr) {
+    // Decrement since we don't want to count this key as skipped
+    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+  }
+  PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+}
+
+void DBIter::SeekForPrev(const Slice& target) {
+  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+  StopWatch sw(clock_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+  if (db_impl_ != nullptr && cfd_ != nullptr) {
+    // TODO: What do we do if this returns an error?
+    Slice lower_bound, upper_bound;
+    if (iterate_lower_bound_ != nullptr) {
+      lower_bound = *iterate_lower_bound_;
+    } else {
+      lower_bound = Slice("");
+    }
+    if (iterate_upper_bound_ != nullptr) {
+      upper_bound = *iterate_upper_bound_;
+    } else {
+      upper_bound = Slice("");
+    }
+    db_impl_
+        ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound,
+                                   upper_bound)
+        .PermitUncheckedError();
+  }
+#endif  // ROCKSDB_LITE
+
+  status_ = Status::OK();
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+
+  // Seek the inner iterator based on the target key.
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    SetSavedKeyToSeekForPrevTarget(target);
+    iter_.SeekForPrev(saved_key_.GetInternalKey());
+    RecordTick(statistics_, NUMBER_DB_SEEK);
+  }
+  if (!iter_.Valid()) {
+    valid_ = false;
+    return;
+  }
+  direction_ = kReverse;
+
+  // Now the inner iterator is placed to the target position. From there,
+  // we need to find out the first key that is visible to the user in the
+  // backward direction.
+  ClearSavedValue();
+  if (prefix_same_as_start_) {
+    // The case where the iterator needs to be invalidated if it has exhausted
+    // keys within the same prefix of the seek key.
+    assert(prefix_extractor_ != nullptr);
+    Slice target_prefix = prefix_extractor_->Transform(target);
+    PrevInternal(&target_prefix);
+    if (valid_) {
+      // Remember the prefix of the seek key for the future Prev() call to
+      // check.
+      prefix_.SetUserKey(target_prefix);
+    }
+  } else {
+    PrevInternal(nullptr);
+  }
+
+  // Report stats and perf context.
+  if (statistics_ != nullptr && valid_) {
+    RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+    RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+    PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+  }
+}
+
+void DBIter::SeekToFirst() {
+  if (iterate_lower_bound_ != nullptr) {
+    Seek(*iterate_lower_bound_);
+    return;
+  }
+  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+  // Don't use iter_::Seek() if we set a prefix extractor
+  // because prefix seek will be used.
+  if (!expect_total_order_inner_iter()) {
+    max_skip_ = std::numeric_limits<uint64_t>::max();
+  }
+  status_ = Status::OK();
+  // if iterator is empty, this status_ could be unchecked.
+  status_.PermitUncheckedError();
+  direction_ = kForward;
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+  ClearSavedValue();
+  is_key_seqnum_zero_ = false;
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_.SeekToFirst();
+  }
+
+  RecordTick(statistics_, NUMBER_DB_SEEK);
+  if (iter_.Valid()) {
+    saved_key_.SetUserKey(
+        ExtractUserKey(iter_.key()),
+        !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+    FindNextUserEntry(false /* not skipping saved_key */,
+                      nullptr /* no prefix check */);
+    if (statistics_ != nullptr) {
+      if (valid_) {
+        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+        PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+      }
+    }
+  } else {
+    valid_ = false;
+  }
+  if (valid_ && prefix_same_as_start_) {
+    assert(prefix_extractor_ != nullptr);
+    prefix_.SetUserKey(prefix_extractor_->Transform(
+        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+  }
+}
+
+void DBIter::SeekToLast() {
+  if (iterate_upper_bound_ != nullptr) {
+    // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
+    SeekForPrev(*iterate_upper_bound_);
+    const bool is_ikey = (timestamp_size_ > 0 && timestamp_lb_ != nullptr);
+    Slice k = Valid() ? key() : Slice();
+    if (is_ikey && Valid()) {
+      k.remove_suffix(kNumInternalBytes + timestamp_size_);
+    }
+    while (Valid() && 0 == user_comparator_.CompareWithoutTimestamp(
+                               *iterate_upper_bound_, /*a_has_ts=*/false, k,
+                               /*b_has_ts=*/false)) {
+      ReleaseTempPinnedData();
+      ResetBlobValue();
+      ResetValueAndColumns();
+      PrevInternal(nullptr);
+
+      k = key();
+      if (is_ikey) {
+        k.remove_suffix(kNumInternalBytes + timestamp_size_);
+      }
+    }
+    return;
+  }
+
+  PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+  // Don't use iter_::Seek() if we set a prefix extractor
+  // because prefix seek will be used.
+  if (!expect_total_order_inner_iter()) {
+    max_skip_ = std::numeric_limits<uint64_t>::max();
+  }
+  status_ = Status::OK();
+  // if iterator is empty, this status_ could be unchecked.
+  status_.PermitUncheckedError();
+  direction_ = kReverse;
+  ReleaseTempPinnedData();
+  ResetBlobValue();
+  ResetValueAndColumns();
+  ResetInternalKeysSkippedCounter();
+  ClearSavedValue();
+  is_key_seqnum_zero_ = false;
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_.SeekToLast();
+  }
+  PrevInternal(nullptr);
+  if (statistics_ != nullptr) {
+    RecordTick(statistics_, NUMBER_DB_SEEK);
+    if (valid_) {
+      RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+      PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+    }
+  }
+  if (valid_ && prefix_same_as_start_) {
+    assert(prefix_extractor_ != nullptr);
+    prefix_.SetUserKey(prefix_extractor_->Transform(
+        StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+  }
+}
+
+Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
+                        const ImmutableOptions& ioptions,
+                        const MutableCFOptions& mutable_cf_options,
+                        const Comparator* user_key_comparator,
+                        InternalIterator* internal_iter, const Version* version,
+                        const SequenceNumber& sequence,
+                        uint64_t max_sequential_skip_in_iterations,
+                        ReadCallback* read_callback, DBImpl* db_impl,
+                        ColumnFamilyData* cfd, bool expose_blob_index) {
+  DBIter* db_iter =
+      new DBIter(env, read_options, ioptions, mutable_cf_options,
+                 user_key_comparator, internal_iter, version, sequence, false,
+                 max_sequential_skip_in_iterations, read_callback, db_impl, cfd,
+                 expose_blob_index);
+  return db_iter;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter.h b/src/rocksdb/db/db_iter.h
new file mode 100644
index 000000000..e87c2b4c9
--- /dev/null
+++ b/src/rocksdb/db/db_iter.h
@@ -0,0 +1,420 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/wide_columns.h"
+#include "table/iterator_wrapper.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Version;
+
+// This file declares the factory functions of DBIter, in its original form
+// or a wrapped form with class ArenaWrappedDBIter, which is defined here.
+// Class DBIter, which is declared and implemented inside db_iter.cc, is
+// an iterator that converts internal keys (yielded by an InternalIterator)
+// that were live at the specified sequence number into appropriate user
+// keys.
+// Each internal key consists of a user key, a sequence number, and a value
+// type. DBIter deals with multiple key versions, tombstones, merge operands,
+// etc, and exposes an Iterator.
+// For example, DBIter may wrap following InternalIterator:
+//    user key: AAA  value: v3   seqno: 100    type: Put
+//    user key: AAA  value: v2   seqno: 97     type: Put
+//    user key: AAA  value: v1   seqno: 95     type: Put
+//    user key: BBB  value: v1   seqno: 90     type: Put
+//    user key: BBC  value: N/A  seqno: 98     type: Delete
+//    user key: BBC  value: v1   seqno: 95     type: Put
+// If the snapshot passed in is 102, then the DBIter is expected to
+// expose the following iterator:
+//    key: AAA  value: v3
+//    key: BBB  value: v1
+// If the snapshot passed in is 96, then it should expose:
+//    key: AAA  value: v1
+//    key: BBB  value: v1
+//    key: BBC  value: v1
+//
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries.  DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter final : public Iterator {
+ public:
+  // The following is grossly complicated. TODO: clean it up
+  // Which direction is the iterator currently moving?
+  // (1) When moving forward:
+  //   (1a) if current_entry_is_merged_ = false, the internal iterator is
+  //        positioned at the exact entry that yields this->key(), this->value()
+  //   (1b) if current_entry_is_merged_ = true, the internal iterator is
+  //        positioned immediately after the last entry that contributed to the
+  //        current this->value(). That entry may or may not have key equal to
+  //        this->key().
+  // (2) When moving backwards, the internal iterator is positioned
+  //     just before all entries whose user key == this->key().
+  enum Direction : uint8_t { kForward, kReverse };
+
+  // LocalStatistics contain Statistics counters that will be aggregated per
+  // each iterator instance and then will be sent to the global statistics when
+  // the iterator is destroyed.
+  //
+  // The purpose of this approach is to avoid perf regression happening
+  // when multiple threads bump the atomic counters from a DBIter::Next().
+  struct LocalStatistics {
+    explicit LocalStatistics() { ResetCounters(); }
+
+    void ResetCounters() {
+      next_count_ = 0;
+      next_found_count_ = 0;
+      prev_count_ = 0;
+      prev_found_count_ = 0;
+      bytes_read_ = 0;
+      skip_count_ = 0;
+    }
+
+    void BumpGlobalStatistics(Statistics* global_statistics) {
+      RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_);
+      RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_);
+      RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_);
+      RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_);
+      RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_);
+      RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_);
+      PERF_COUNTER_ADD(iter_read_bytes, bytes_read_);
+      ResetCounters();
+    }
+
+    // Map to Tickers::NUMBER_DB_NEXT
+    uint64_t next_count_;
+    // Map to Tickers::NUMBER_DB_NEXT_FOUND
+    uint64_t next_found_count_;
+    // Map to Tickers::NUMBER_DB_PREV
+    uint64_t prev_count_;
+    // Map to Tickers::NUMBER_DB_PREV_FOUND
+    uint64_t prev_found_count_;
+    // Map to Tickers::ITER_BYTES_READ
+    uint64_t bytes_read_;
+    // Map to Tickers::NUMBER_ITER_SKIP
+    uint64_t skip_count_;
+  };
+
+  DBIter(Env* _env, const ReadOptions& read_options,
+         const ImmutableOptions& ioptions,
+         const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
+         InternalIterator* iter, const Version* version, SequenceNumber s,
+         bool arena_mode, uint64_t max_sequential_skip_in_iterations,
+         ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+         bool expose_blob_index);
+
+  // No copying allowed
+  DBIter(const DBIter&) = delete;
+  void operator=(const DBIter&) = delete;
+
+  ~DBIter() override {
+    // Release pinned data if any
+    if (pinned_iters_mgr_.PinningEnabled()) {
+      pinned_iters_mgr_.ReleasePinnedData();
+    }
+    RecordTick(statistics_, NO_ITERATOR_DELETED);
+    ResetInternalKeysSkippedCounter();
+    local_stats_.BumpGlobalStatistics(statistics_);
+    iter_.DeleteIter(arena_mode_);
+  }
+  void SetIter(InternalIterator* iter) {
+    assert(iter_.iter() == nullptr);
+    iter_.Set(iter);
+    iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+  }
+
+  bool Valid() const override {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    if (valid_) {
+      status_.PermitUncheckedError();
+    }
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+    return valid_;
+  }
+  Slice key() const override {
+    assert(valid_);
+    if (timestamp_lb_) {
+      return saved_key_.GetInternalKey();
+    } else {
+      const Slice ukey_and_ts = saved_key_.GetUserKey();
+      return Slice(ukey_and_ts.data(), ukey_and_ts.size() - timestamp_size_);
+    }
+  }
+  Slice value() const override {
+    assert(valid_);
+
+    return value_;
+  }
+
+  const WideColumns& columns() const override {
+    assert(valid_);
+
+    return wide_columns_;
+  }
+
+  Status status() const override {
+    if (status_.ok()) {
+      return iter_.status();
+    } else {
+      assert(!valid_);
+      return status_;
+    }
+  }
+  Slice timestamp() const override {
+    assert(valid_);
+    assert(timestamp_size_ > 0);
+    if (direction_ == kReverse) {
+      return saved_timestamp_;
+    }
+    const Slice ukey_and_ts = saved_key_.GetUserKey();
+    assert(timestamp_size_ < ukey_and_ts.size());
+    return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_);
+  }
+  bool IsBlob() const {
+    assert(valid_);
+    return is_blob_;
+  }
+
+  Status GetProperty(std::string prop_name, std::string* prop) override;
+
+  void Next() final override;
+  void Prev() final override;
+  // 'target' does not contain timestamp, even if user timestamp feature is
+  // enabled.
+  void Seek(const Slice& target) final override;
+  void SeekForPrev(const Slice& target) final override;
+  void SeekToFirst() final override;
+  void SeekToLast() final override;
+  Env* env() const { return env_; }
+  void set_sequence(uint64_t s) {
+    sequence_ = s;
+    if (read_callback_) {
+      read_callback_->Refresh(s);
+    }
+  }
+  void set_valid(bool v) { valid_ = v; }
+
+ private:
+  // For all methods in this block:
+  // PRE: iter_->Valid() && status_.ok()
+  // Return false if there was an error, and status() is non-ok, valid_ = false;
+  // in this case callers would usually stop what they were doing and return.
+  bool ReverseToForward();
+  bool ReverseToBackward();
+  // Set saved_key_ to the seek key to target, with proper sequence number set.
+  // It might get adjusted if the seek key is smaller than iterator lower bound.
+  // target does not have timestamp.
+  void SetSavedKeyToSeekTarget(const Slice& target);
+  // Set saved_key_ to the seek key to target, with proper sequence number set.
+  // It might get adjusted if the seek key is larger than iterator upper bound.
+  // target does not have timestamp.
+  void SetSavedKeyToSeekForPrevTarget(const Slice& target);
+  bool FindValueForCurrentKey();
+  bool FindValueForCurrentKeyUsingSeek();
+  bool FindUserKeyBeforeSavedKey();
+  // If `skipping_saved_key` is true, the function will keep iterating until it
+  // finds a user key that is larger than `saved_key_`.
+  // If `prefix` is not null, the iterator needs to stop when all keys for the
+  // prefix are exhausted and the iterator is set to invalid.
+  bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix);
+  // Internal implementation of FindNextUserEntry().
+  bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix);
+  bool ParseKey(ParsedInternalKey* key);
+  bool MergeValuesNewToOld();
+
+  // If prefix is not null, we need to set the iterator to invalid if no more
+  // entry can be found within the prefix.
+  void PrevInternal(const Slice* prefix);
+  bool TooManyInternalKeysSkipped(bool increment = true);
+  bool IsVisible(SequenceNumber sequence, const Slice& ts,
+                 bool* more_recent = nullptr);
+
+  // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
+  // is called
+  void TempPinData() {
+    if (!pin_thru_lifetime_) {
+      pinned_iters_mgr_.StartPinning();
+    }
+  }
+
+  // Release blocks pinned by TempPinData()
+  void ReleaseTempPinnedData() {
+    if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) {
+      pinned_iters_mgr_.ReleasePinnedData();
+    }
+  }
+
+  inline void ClearSavedValue() {
+    if (saved_value_.capacity() > 1048576) {
+      std::string empty;
+      swap(empty, saved_value_);
+    } else {
+      saved_value_.clear();
+    }
+  }
+
+  inline void ResetInternalKeysSkippedCounter() {
+    local_stats_.skip_count_ += num_internal_keys_skipped_;
+    if (valid_) {
+      local_stats_.skip_count_--;
+    }
+    num_internal_keys_skipped_ = 0;
+  }
+
+  bool expect_total_order_inner_iter() {
+    assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
+    return expect_total_order_inner_iter_;
+  }
+
+  // If lower bound of timestamp is given by ReadOptions.iter_start_ts, we need
+  // to return versions of the same key. We cannot just skip if the key value
+  // is the same but timestamps are different but fall in timestamp range.
+  inline int CompareKeyForSkip(const Slice& a, const Slice& b) {
+    return timestamp_lb_ != nullptr
+               ? user_comparator_.Compare(a, b)
+               : user_comparator_.CompareWithoutTimestamp(a, b);
+  }
+
+  // Retrieves the blob value for the specified user key using the given blob
+  // index when using the integrated BlobDB implementation.
+  bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index);
+
+  void ResetBlobValue() {
+    is_blob_ = false;
+    blob_value_.Reset();
+  }
+
+  void SetValueAndColumnsFromPlain(const Slice& slice) {
+    assert(value_.empty());
+    assert(wide_columns_.empty());
+
+    value_ = slice;
+    wide_columns_.emplace_back(kDefaultWideColumnName, slice);
+  }
+
+  bool SetValueAndColumnsFromEntity(Slice slice);
+
+  void ResetValueAndColumns() {
+    value_.clear();
+    wide_columns_.clear();
+  }
+
+  // If user-defined timestamp is enabled, `user_key` includes timestamp.
+  bool Merge(const Slice* val, const Slice& user_key);
+  bool MergeEntity(const Slice& entity, const Slice& user_key);
+
+  const SliceTransform* prefix_extractor_;
+  Env* const env_;
+  SystemClock* clock_;
+  Logger* logger_;
+  UserComparatorWrapper user_comparator_;
+  const MergeOperator* const merge_operator_;
+  IteratorWrapper iter_;
+  const Version* version_;
+  ReadCallback* read_callback_;
+  // Max visible sequence number. It is normally the snapshot seq unless we have
+  // uncommitted data in db as in WriteUnCommitted.
+  SequenceNumber sequence_;
+
+  IterKey saved_key_;
+  // Reusable internal key data structure. This is only used inside one function
+  // and should not be used across functions. Reusing this object can reduce
+  // overhead of calling construction of the function if creating it each time.
+  ParsedInternalKey ikey_;
+  std::string saved_value_;
+  Slice pinned_value_;
+  // for prefix seek mode to support prev()
+  PinnableSlice blob_value_;
+  // Value of the default column
+  Slice value_;
+  // All columns (i.e. name-value pairs)
+  WideColumns wide_columns_;
+  Statistics* statistics_;
+  uint64_t max_skip_;
+  uint64_t max_skippable_internal_keys_;
+  uint64_t num_internal_keys_skipped_;
+  const Slice* iterate_lower_bound_;
+  const Slice* iterate_upper_bound_;
+
+  // The prefix of the seek key. It is only used when prefix_same_as_start_
+  // is true and prefix extractor is not null. In Next() or Prev(), current keys
+  // will be checked against this prefix, so that the iterator can be
+  // invalidated if the keys in this prefix has been exhausted. Set it using
+  // SetUserKey() and use it using GetUserKey().
+  IterKey prefix_;
+
+  Status status_;
+  Direction direction_;
+  bool valid_;
+  bool current_entry_is_merged_;
+  // True if we know that the current entry's seqnum is 0.
+  // This information is used as that the next entry will be for another
+  // user key.
+  bool is_key_seqnum_zero_;
+  const bool prefix_same_as_start_;
+  // Means that we will pin all data blocks we read as long the Iterator
+  // is not deleted, will be true if ReadOptions::pin_data is true
+  const bool pin_thru_lifetime_;
+  // Expect the inner iterator to maintain a total order.
+  // prefix_extractor_ must be non-NULL if the value is false.
+  const bool expect_total_order_inner_iter_;
+  ReadTier read_tier_;
+  bool fill_cache_;
+  bool verify_checksums_;
+  // Whether the iterator is allowed to expose blob references. Set to true when
+  // the stacked BlobDB implementation is used, false otherwise.
+  bool expose_blob_index_;
+  bool is_blob_;
+  bool arena_mode_;
+  // List of operands for merge operator.
+  MergeContext merge_context_;
+  LocalStatistics local_stats_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+#ifdef ROCKSDB_LITE
+  ROCKSDB_FIELD_UNUSED
+#endif
+  DBImpl* db_impl_;
+#ifdef ROCKSDB_LITE
+  ROCKSDB_FIELD_UNUSED
+#endif
+  ColumnFamilyData* cfd_;
+  const Slice* const timestamp_ub_;
+  const Slice* const timestamp_lb_;
+  const size_t timestamp_size_;
+  std::string saved_timestamp_;
+
+  // Used only if timestamp_lb_ is not nullptr.
+  std::string saved_ikey_;
+};
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified `sequence` number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options,
+    const Comparator* user_key_comparator, InternalIterator* internal_iter,
+    const Version* version, const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback,
+    DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+    bool expose_blob_index = false);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter_stress_test.cc b/src/rocksdb/db/db_iter_stress_test.cc
new file mode 100644
index 000000000..872f7e6bd
--- /dev/null
+++ b/src/rocksdb/db/db_iter_stress_test.cc
@@ -0,0 +1,658 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifdef GFLAGS
+
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(verbose, false,
+            "Print huge, detailed trace. Intended for debugging failures.");
+
+#else
+
+void ParseCommandLineFlags(int*, char***, bool) {}
+bool FLAGS_verbose = false;
+
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIteratorStressTest : public testing::Test {
+ public:
+  Env* env_;
+
+  DBIteratorStressTest() : env_(Env::Default()) {}
+};
+
+namespace {
+
+struct Entry {
+  std::string key;
+  ValueType type;  // kTypeValue, kTypeDeletion, kTypeMerge
+  uint64_t sequence;
+  std::string ikey;  // internal key, made from `key`, `sequence` and `type`
+  std::string value;
+  // If false, we'll pretend that this entry doesn't exist.
+  bool visible = true;
+
+  bool operator<(const Entry& e) const {
+    if (key != e.key) return key < e.key;
+    return std::tie(sequence, type) > std::tie(e.sequence, e.type);
+  }
+};
+
+struct Data {
+  std::vector<Entry> entries;
+
+  // Indices in `entries` with `visible` = false.
+  std::vector<size_t> hidden;
+  // Keys of entries whose `visible` changed since the last seek of iterators.
+  std::set<std::string> recently_touched_keys;
+};
+
+struct StressTestIterator : public InternalIterator {
+  Data* data;
+  Random64* rnd;
+  InternalKeyComparator cmp;
+
+  // Each operation will return error with this probability...
+  double error_probability = 0;
+  // ... and add/remove entries with this probability.
+  double mutation_probability = 0;
+  // The probability of adding vs removing entries will be chosen so that the
+  // amount of removed entries stays somewhat close to this number.
+  double target_hidden_fraction = 0;
+  // If true, print all mutations to stdout for debugging.
+  bool trace = false;
+
+  int iter = -1;
+  Status status_;
+
+  StressTestIterator(Data* _data, Random64* _rnd, const Comparator* _cmp)
+      : data(_data), rnd(_rnd), cmp(_cmp) {}
+
+  bool Valid() const override {
+    if (iter >= 0 && iter < (int)data->entries.size()) {
+      assert(status_.ok());
+      return true;
+    }
+    return false;
+  }
+
+  Status status() const override { return status_; }
+
+  bool MaybeFail() {
+    if (rnd->Next() >=
+        static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+            error_probability) {
+      return false;
+    }
+    if (rnd->Next() % 2) {
+      status_ = Status::Incomplete("test");
+    } else {
+      status_ = Status::IOError("test");
+    }
+    if (trace) {
+      std::cout << "injecting " << status_.ToString() << std::endl;
+    }
+    iter = -1;
+    return true;
+  }
+
+  void MaybeMutate() {
+    if (rnd->Next() >=
+        static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+            mutation_probability) {
+      return;
+    }
+    do {
+      // If too many entries are hidden, hide less, otherwise hide more.
+      double hide_probability =
+          data->hidden.size() > data->entries.size() * target_hidden_fraction
+              ? 1. / 3
+              : 2. / 3;
+      if (data->hidden.empty()) {
+        hide_probability = 1;
+      }
+      bool do_hide = rnd->Next() <
+                     static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+                         hide_probability;
+      if (do_hide) {
+        // Hide a random entry.
+        size_t idx = rnd->Next() % data->entries.size();
+        Entry& e = data->entries[idx];
+        if (e.visible) {
+          if (trace) {
+            std::cout << "hiding idx " << idx << std::endl;
+          }
+          e.visible = false;
+          data->hidden.push_back(idx);
+          data->recently_touched_keys.insert(e.key);
+        } else {
+          // Already hidden. Let's go unhide something instead, just because
+          // it's easy and it doesn't really matter what we do.
+          do_hide = false;
+        }
+      }
+      if (!do_hide) {
+        // Unhide a random entry.
+        size_t hi = rnd->Next() % data->hidden.size();
+        size_t idx = data->hidden[hi];
+        if (trace) {
+          std::cout << "unhiding idx " << idx << std::endl;
+        }
+        Entry& e = data->entries[idx];
+        assert(!e.visible);
+        e.visible = true;
+        data->hidden[hi] = data->hidden.back();
+        data->hidden.pop_back();
+        data->recently_touched_keys.insert(e.key);
+      }
+    } while (rnd->Next() % 3 != 0);  // do 3 mutations on average
+  }
+
+  void SkipForward() {
+    while (iter < (int)data->entries.size() && !data->entries[iter].visible) {
+      ++iter;
+    }
+  }
+  void SkipBackward() {
+    while (iter >= 0 && !data->entries[iter].visible) {
+      --iter;
+    }
+  }
+
+  void SeekToFirst() override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    iter = 0;
+    SkipForward();
+  }
+  void SeekToLast() override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    iter = (int)data->entries.size() - 1;
+    SkipBackward();
+  }
+
+  void Seek(const Slice& target) override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    // Binary search.
+    auto it = std::partition_point(
+        data->entries.begin(), data->entries.end(),
+        [&](const Entry& e) { return cmp.Compare(e.ikey, target) < 0; });
+    iter = (int)(it - data->entries.begin());
+    SkipForward();
+  }
+  void SeekForPrev(const Slice& target) override {
+    if (MaybeFail()) return;
+    MaybeMutate();
+
+    status_ = Status::OK();
+    // Binary search.
+    auto it = std::partition_point(
+        data->entries.begin(), data->entries.end(),
+        [&](const Entry& e) { return cmp.Compare(e.ikey, target) <= 0; });
+    iter = (int)(it - data->entries.begin());
+    --iter;
+    SkipBackward();
+  }
+
+  void Next() override {
+    assert(Valid());
+    if (MaybeFail()) return;
+    MaybeMutate();
+    ++iter;
+    SkipForward();
+  }
+  void Prev() override {
+    assert(Valid());
+    if (MaybeFail()) return;
+    MaybeMutate();
+    --iter;
+    SkipBackward();
+  }
+
+  Slice key() const override {
+    assert(Valid());
+    return data->entries[iter].ikey;
+  }
+  Slice value() const override {
+    assert(Valid());
+    return data->entries[iter].value;
+  }
+
+  bool IsKeyPinned() const override { return true; }
+  bool IsValuePinned() const override { return true; }
+};
+
+// A small reimplementation of DBIter, supporting only some of the features,
+// and doing everything in O(log n).
+// Skips all keys that are in recently_touched_keys.
+struct ReferenceIterator {
+  Data* data;
+  uint64_t sequence;  // ignore entries with sequence number below this
+
+  bool valid = false;
+  std::string key;
+  std::string value;
+
+  ReferenceIterator(Data* _data, uint64_t _sequence)
+      : data(_data), sequence(_sequence) {}
+
+  bool Valid() const { return valid; }
+
+  // Finds the first entry with key
+  // greater/less/greater-or-equal/less-or-equal than `key`, depending on
+  // arguments: if `skip`, inequality is strict; if `forward`, it's
+  // greater/greater-or-equal, otherwise less/less-or-equal.
+  // Sets `key` to the result.
+  // If no such key exists, returns false. Doesn't check `visible`.
+  bool FindNextKey(bool skip, bool forward) {
+    valid = false;
+    auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+                                   [&](const Entry& e) {
+                                     if (forward != skip) {
+                                       return e.key < key;
+                                     } else {
+                                       return e.key <= key;
+                                     }
+                                   });
+    if (forward) {
+      if (it != data->entries.end()) {
+        key = it->key;
+        return true;
+      }
+    } else {
+      if (it != data->entries.begin()) {
+        --it;
+        key = it->key;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool FindValueForCurrentKey() {
+    if (data->recently_touched_keys.count(key)) {
+      return false;
+    }
+
+    // Find the first entry for the key. The caller promises that it exists.
+    auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+                                   [&](const Entry& e) {
+                                     if (e.key != key) {
+                                       return e.key < key;
+                                     }
+                                     return e.sequence > sequence;
+                                   });
+
+    // Find the first visible entry.
+    for (;; ++it) {
+      if (it == data->entries.end()) {
+        return false;
+      }
+      Entry& e = *it;
+      if (e.key != key) {
+        return false;
+      }
+      assert(e.sequence <= sequence);
+      if (!e.visible) continue;
+      if (e.type == kTypeDeletion) {
+        return false;
+      }
+      if (e.type == kTypeValue) {
+        value = e.value;
+        valid = true;
+        return true;
+      }
+      assert(e.type == kTypeMerge);
+      break;
+    }
+
+    // Collect merge operands.
+    std::vector<Slice> operands;
+    for (; it != data->entries.end(); ++it) {
+      Entry& e = *it;
+      if (e.key != key) {
+        break;
+      }
+      assert(e.sequence <= sequence);
+      if (!e.visible) continue;
+      if (e.type == kTypeDeletion) {
+        break;
+      }
+      operands.push_back(e.value);
+      if (e.type == kTypeValue) {
+        break;
+      }
+    }
+
+    // Do a merge.
+    value = operands.back().ToString();
+    for (int i = (int)operands.size() - 2; i >= 0; --i) {
+      value.append(",");
+      value.append(operands[i].data(), operands[i].size());
+    }
+
+    valid = true;
+    return true;
+  }
+
+  // Start at `key` and move until we encounter a valid value.
+  // `forward` defines the direction of movement.
+  // If `skip` is true, we're looking for key not equal to `key`.
+  void DoTheThing(bool skip, bool forward) {
+    while (FindNextKey(skip, forward) && !FindValueForCurrentKey()) {
+      skip = true;
+    }
+  }
+
+  void Seek(const Slice& target) {
+    key = target.ToString();
+    DoTheThing(false, true);
+  }
+  void SeekForPrev(const Slice& target) {
+    key = target.ToString();
+    DoTheThing(false, false);
+  }
+  void SeekToFirst() { Seek(""); }
+  void SeekToLast() {
+    key = data->entries.back().key;
+    DoTheThing(false, false);
+  }
+  void Next() {
+    assert(Valid());
+    DoTheThing(true, true);
+  }
+  void Prev() {
+    assert(Valid());
+    DoTheThing(true, false);
+  }
+};
+
+}  // anonymous namespace
+
+// Use an internal iterator that sometimes returns errors and sometimes
+// adds/removes entries on the fly. Do random operations on a DBIter and
+// check results.
+// TODO: can be improved for more coverage:
+//   * Override IsKeyPinned() and IsValuePinned() to actually use
+//     PinnedIteratorManager and check that there's no use-after free.
+//   * Try different combinations of prefix_extractor, total_order_seek,
+//     prefix_same_as_start, iterate_lower_bound, iterate_upper_bound.
+TEST_F(DBIteratorStressTest, StressTest) {
+  // We use a deterministic RNG, and everything happens in a single thread.
+  Random64 rnd(826909345792864532ll);
+
+  auto gen_key = [&](int max_key) {
+    assert(max_key > 0);
+    int len = 0;
+    int a = max_key;
+    while (a) {
+      a /= 10;
+      ++len;
+    }
+    std::string s = std::to_string(rnd.Next() % static_cast<uint64_t>(max_key));
+    s.insert(0, len - (int)s.size(), '0');
+    return s;
+  };
+
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ReadOptions ropt;
+
+  size_t num_matching = 0;
+  size_t num_at_end = 0;
+  size_t num_not_ok = 0;
+  size_t num_recently_removed = 0;
+
+  // Number of iterations for each combination of parameters
+  // (there are ~250 of those).
+  // Tweak this to change the test run time.
+  // As of the time of writing, the test takes ~4 seconds for value of 5000.
+  const int num_iterations = 5000;
+  // Enable this to print all the operations for debugging.
+  bool trace = FLAGS_verbose;
+
+  for (int num_entries : {5, 10, 100}) {
+    for (double key_space : {0.1, 1.0, 3.0}) {
+      for (ValueType prevalent_entry_type :
+           {kTypeValue, kTypeDeletion, kTypeMerge}) {
+        for (double error_probability : {0.01, 0.1}) {
+          for (double mutation_probability : {0.01, 0.5}) {
+            for (double target_hidden_fraction : {0.1, 0.5}) {
+              std::string trace_str =
+                  "entries: " + std::to_string(num_entries) +
+                  ", key_space: " + std::to_string(key_space) +
+                  ", error_probability: " + std::to_string(error_probability) +
+                  ", mutation_probability: " +
+                  std::to_string(mutation_probability) +
+                  ", target_hidden_fraction: " +
+                  std::to_string(target_hidden_fraction);
+              SCOPED_TRACE(trace_str);
+              if (trace) {
+                std::cout << trace_str << std::endl;
+              }
+
+              // Generate data.
+              Data data;
+              int max_key = (int)(num_entries * key_space) + 1;
+              for (int i = 0; i < num_entries; ++i) {
+                Entry e;
+                e.key = gen_key(max_key);
+                if (rnd.Next() % 10 != 0) {
+                  e.type = prevalent_entry_type;
+                } else {
+                  const ValueType types[] = {kTypeValue, kTypeDeletion,
+                                             kTypeMerge};
+                  e.type =
+                      types[rnd.Next() % (sizeof(types) / sizeof(types[0]))];
+                }
+                e.sequence = i;
+                e.value = "v" + std::to_string(i);
+                ParsedInternalKey internal_key(e.key, e.sequence, e.type);
+                AppendInternalKey(&e.ikey, internal_key);
+
+                data.entries.push_back(e);
+              }
+              std::sort(data.entries.begin(), data.entries.end());
+              if (trace) {
+                std::cout << "entries:";
+                for (size_t i = 0; i < data.entries.size(); ++i) {
+                  Entry& e = data.entries[i];
+                  std::cout << "\n  idx " << i << ": \"" << e.key << "\": \""
+                            << e.value << "\" seq: " << e.sequence << " type: "
+                            << (e.type == kTypeValue      ? "val"
+                                : e.type == kTypeDeletion ? "del"
+                                                          : "merge");
+                }
+                std::cout << std::endl;
+              }
+
+              std::unique_ptr<Iterator> db_iter;
+              std::unique_ptr<ReferenceIterator> ref_iter;
+              for (int iteration = 0; iteration < num_iterations; ++iteration) {
+                SCOPED_TRACE(iteration);
+                // Create a new iterator every ~30 operations.
+                if (db_iter == nullptr || rnd.Next() % 30 == 0) {
+                  uint64_t sequence = rnd.Next() % (data.entries.size() + 2);
+                  ref_iter.reset(new ReferenceIterator(&data, sequence));
+                  if (trace) {
+                    std::cout << "new iterator, seq: " << sequence << std::endl;
+                  }
+
+                  auto internal_iter =
+                      new StressTestIterator(&data, &rnd, BytewiseComparator());
+                  internal_iter->error_probability = error_probability;
+                  internal_iter->mutation_probability = mutation_probability;
+                  internal_iter->target_hidden_fraction =
+                      target_hidden_fraction;
+                  internal_iter->trace = trace;
+                  db_iter.reset(NewDBIterator(
+                      env_, ropt, ImmutableOptions(options),
+                      MutableCFOptions(options), BytewiseComparator(),
+                      internal_iter, nullptr /* version */, sequence,
+                      options.max_sequential_skip_in_iterations,
+                      nullptr /*read_callback*/));
+                }
+
+                // Do a random operation. It's important to do it on ref_it
+                // later than on db_iter to make sure ref_it sees the correct
+                // recently_touched_keys.
+                std::string old_key;
+                bool forward = rnd.Next() % 2 > 0;
+                // Do Next()/Prev() ~90% of the time.
+                bool seek = !ref_iter->Valid() || rnd.Next() % 10 == 0;
+                if (trace) {
+                  std::cout << iteration << ": ";
+                }
+
+                if (!seek) {
+                  assert(db_iter->Valid());
+                  old_key = ref_iter->key;
+                  if (trace) {
+                    std::cout << (forward ? "Next" : "Prev") << std::endl;
+                  }
+
+                  if (forward) {
+                    db_iter->Next();
+                    ref_iter->Next();
+                  } else {
+                    db_iter->Prev();
+                    ref_iter->Prev();
+                  }
+                } else {
+                  data.recently_touched_keys.clear();
+                  // Do SeekToFirst less often than Seek.
+                  if (rnd.Next() % 4 == 0) {
+                    if (trace) {
+                      std::cout << (forward ? "SeekToFirst" : "SeekToLast")
+                                << std::endl;
+                    }
+
+                    if (forward) {
+                      old_key = "";
+                      db_iter->SeekToFirst();
+                      ref_iter->SeekToFirst();
+                    } else {
+                      old_key = data.entries.back().key;
+                      db_iter->SeekToLast();
+                      ref_iter->SeekToLast();
+                    }
+                  } else {
+                    old_key = gen_key(max_key);
+                    if (trace) {
+                      std::cout << (forward ? "Seek" : "SeekForPrev") << " \""
+                                << old_key << '"' << std::endl;
+                    }
+                    if (forward) {
+                      db_iter->Seek(old_key);
+                      ref_iter->Seek(old_key);
+                    } else {
+                      db_iter->SeekForPrev(old_key);
+                      ref_iter->SeekForPrev(old_key);
+                    }
+                  }
+                }
+
+                // Check the result.
+                if (db_iter->Valid()) {
+                  ASSERT_TRUE(db_iter->status().ok());
+                  if (data.recently_touched_keys.count(
+                          db_iter->key().ToString())) {
+                    // Ended on a key that may have been mutated during the
+                    // operation. Reference iterator skips such keys, so we
+                    // can't check the exact result.
+
+                    // Check that the key moved in the right direction.
+                    if (forward) {
+                      if (seek)
+                        ASSERT_GE(db_iter->key().ToString(), old_key);
+                      else
+                        ASSERT_GT(db_iter->key().ToString(), old_key);
+                    } else {
+                      if (seek)
+                        ASSERT_LE(db_iter->key().ToString(), old_key);
+                      else
+                        ASSERT_LT(db_iter->key().ToString(), old_key);
+                    }
+
+                    if (ref_iter->Valid()) {
+                      // Check that DBIter didn't miss any non-mutated key.
+                      if (forward) {
+                        ASSERT_LT(db_iter->key().ToString(), ref_iter->key);
+                      } else {
+                        ASSERT_GT(db_iter->key().ToString(), ref_iter->key);
+                      }
+                    }
+                    // Tell the next iteration of the loop to reseek the
+                    // iterators.
+                    ref_iter->valid = false;
+
+                    ++num_recently_removed;
+                  } else {
+                    ASSERT_TRUE(ref_iter->Valid());
+                    ASSERT_EQ(ref_iter->key, db_iter->key().ToString());
+                    ASSERT_EQ(ref_iter->value, db_iter->value());
+                    ++num_matching;
+                  }
+                } else if (db_iter->status().ok()) {
+                  ASSERT_FALSE(ref_iter->Valid());
+                  ++num_at_end;
+                } else {
+                  // Non-ok status. Nothing to check here.
+                  // Tell the next iteration of the loop to reseek the
+                  // iterators.
+                  ref_iter->valid = false;
+                  ++num_not_ok;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Check that all cases were hit many times.
+  EXPECT_GT(num_matching, 10000);
+  EXPECT_GT(num_at_end, 10000);
+  EXPECT_GT(num_not_ok, 10000);
+  EXPECT_GT(num_recently_removed, 10000);
+
+  std::cout << "stats:\n  exact matches: " << num_matching
+            << "\n  end reached: " << num_at_end
+            << "\n  non-ok status: " << num_not_ok
+            << "\n  mutated on the fly: " << num_recently_removed << std::endl;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter_test.cc b/src/rocksdb/db/db_iter_test.cc
new file mode 100644
index 000000000..65290bfad
--- /dev/null
+++ b/src/rocksdb/db/db_iter_test.cc
@@ -0,0 +1,3195 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_iter.h"
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static uint64_t TestGetTickerCount(const Options& options,
+                                   Tickers ticker_type) {
+  return options.statistics->getTickerCount(ticker_type);
+}
+
+class TestIterator : public InternalIterator {
+ public:
+  explicit TestIterator(const Comparator* comparator)
+      : initialized_(false),
+        valid_(false),
+        sequence_number_(0),
+        iter_(0),
+        cmp(comparator) {
+    data_.reserve(16);
+  }
+
+  void AddPut(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeValue, argvalue);
+  }
+
+  void AddDeletion(std::string argkey) {
+    Add(argkey, kTypeDeletion, std::string());
+  }
+
+  void AddSingleDeletion(std::string argkey) {
+    Add(argkey, kTypeSingleDeletion, std::string());
+  }
+
+  void AddMerge(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeMerge, argvalue);
+  }
+
+  void Add(std::string argkey, ValueType type, std::string argvalue) {
+    Add(argkey, type, argvalue, sequence_number_++);
+  }
+
+  void Add(std::string argkey, ValueType type, std::string argvalue,
+           size_t seq_num, bool update_iter = false) {
+    valid_ = true;
+    ParsedInternalKey internal_key(argkey, seq_num, type);
+    data_.push_back(
+        std::pair<std::string, std::string>(std::string(), argvalue));
+    AppendInternalKey(&data_.back().first, internal_key);
+    if (update_iter && valid_ && cmp.Compare(data_.back().first, key()) < 0) {
+      // insert a key smaller than current key
+      Finish();
+      // data_[iter_] is not anymore the current element of the iterator.
+      // Increment it to reposition it to the right position.
+      iter_++;
+    }
+  }
+
+  // should be called before operations with iterator
+  void Finish() {
+    initialized_ = true;
+    std::sort(data_.begin(), data_.end(),
+              [this](std::pair<std::string, std::string> a,
+                     std::pair<std::string, std::string> b) {
+                return (cmp.Compare(a.first, b.first) < 0);
+              });
+  }
+
+  // Removes the key from the set of keys over which this iterator iterates.
+  // Not to be confused with AddDeletion().
+  // If the iterator is currently positioned on this key, the deletion will
+  // apply next time the iterator moves.
+  // Used for simulating ForwardIterator updating to a new version that doesn't
+  // have some of the keys (e.g. after compaction with a filter).
+  void Vanish(std::string _key) {
+    if (valid_ && data_[iter_].first == _key) {
+      delete_current_ = true;
+      return;
+    }
+    for (auto it = data_.begin(); it != data_.end(); ++it) {
+      ParsedInternalKey ikey;
+      Status pik_status =
+          ParseInternalKey(it->first, &ikey, true /* log_err_key */);
+      pik_status.PermitUncheckedError();
+      assert(pik_status.ok());
+      if (!pik_status.ok() || ikey.user_key != _key) {
+        continue;
+      }
+      if (valid_ && data_.begin() + iter_ > it) {
+        --iter_;
+      }
+      data_.erase(it);
+      return;
+    }
+    assert(false);
+  }
+
+  // Number of operations done on this iterator since construction.
+  size_t steps() const { return steps_; }
+
+  bool Valid() const override {
+    assert(initialized_);
+    return valid_;
+  }
+
+  void SeekToFirst() override {
+    assert(initialized_);
+    ++steps_;
+    DeleteCurrentIfNeeded();
+    valid_ = (data_.size() > 0);
+    iter_ = 0;
+  }
+
+  void SeekToLast() override {
+    assert(initialized_);
+    ++steps_;
+    DeleteCurrentIfNeeded();
+    valid_ = (data_.size() > 0);
+    iter_ = data_.size() - 1;
+  }
+
+  void Seek(const Slice& target) override {
+    assert(initialized_);
+    SeekToFirst();
+    ++steps_;
+    if (!valid_) {
+      return;
+    }
+    while (iter_ < data_.size() &&
+           (cmp.Compare(data_[iter_].first, target) < 0)) {
+      ++iter_;
+    }
+
+    if (iter_ == data_.size()) {
+      valid_ = false;
+    }
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    assert(initialized_);
+    DeleteCurrentIfNeeded();
+    SeekForPrevImpl(target, &cmp);
+  }
+
+  void Next() override {
+    assert(initialized_);
+    assert(valid_);
+    assert(iter_ < data_.size());
+
+    ++steps_;
+    if (delete_current_) {
+      DeleteCurrentIfNeeded();
+    } else {
+      ++iter_;
+    }
+    valid_ = iter_ < data_.size();
+  }
+
+  void Prev() override {
+    assert(initialized_);
+    assert(valid_);
+    assert(iter_ < data_.size());
+
+    ++steps_;
+    DeleteCurrentIfNeeded();
+    if (iter_ == 0) {
+      valid_ = false;
+    } else {
+      --iter_;
+    }
+  }
+
+  Slice key() const override {
+    assert(initialized_);
+    return data_[iter_].first;
+  }
+
+  Slice value() const override {
+    assert(initialized_);
+    return data_[iter_].second;
+  }
+
+  Status status() const override {
+    assert(initialized_);
+    return Status::OK();
+  }
+
+  bool IsKeyPinned() const override { return true; }
+  bool IsValuePinned() const override { return true; }
+
+ private:
+  bool initialized_;
+  bool valid_;
+  size_t sequence_number_;
+  size_t iter_;
+  size_t steps_ = 0;
+
+  InternalKeyComparator cmp;
+  std::vector<std::pair<std::string, std::string>> data_;
+  bool delete_current_ = false;
+
+  void DeleteCurrentIfNeeded() {
+    if (!delete_current_) {
+      return;
+    }
+    data_.erase(data_.begin() + iter_);
+    delete_current_ = false;
+  }
+};
+
+class DBIteratorTest : public testing::Test {
+ public:
+  Env* env_;
+
+  DBIteratorTest() : env_(Env::Default()) {}
+};
+
+TEST_F(DBIteratorTest, DBIteratorPrevNext) {
+  Options options;
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+  // Test to check the SeekToLast() with iterate_upper_bound not set
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+
+  // Test to check the SeekToLast() with iterate_upper_bound set
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->AddPut("f", "val_f");
+    internal_iter->Finish();
+
+    Slice prefix("d");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+  // Test to check the SeekToLast() iterate_upper_bound set to a key that
+  // is not Put yet
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->Finish();
+
+    Slice prefix("z");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+  // Test to check the SeekToLast() with iterate_upper_bound set to the
+  // first key
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    Slice prefix("a");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+  // Test case to check SeekToLast with iterate_upper_bound set
+  // (same key put may times - SeekToLast should start with the
+  // maximum sequence id of the upper bound)
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 7 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    SetPerfLevel(kEnableCount);
+    ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+    get_perf_context()->Reset();
+    db_iter->SeekToLast();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(static_cast<int>(get_perf_context()->internal_key_skipped_count),
+              1);
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+
+    SetPerfLevel(kDisable);
+  }
+  // Test to check the SeekToLast() with the iterate_upper_bound set
+  // (Checking the value of the key which has sequence ids greater than
+  // and less that the iterator's sequence id)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a1");
+    internal_iter->AddPut("a", "val_a2");
+    internal_iter->AddPut("b", "val_b1");
+    internal_iter->AddPut("c", "val_c1");
+    internal_iter->AddPut("c", "val_c2");
+    internal_iter->AddPut("c", "val_c3");
+    internal_iter->AddPut("b", "val_b2");
+    internal_iter->AddPut("d", "val_d1");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 4 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b1");
+  }
+
+  // Test to check the SeekToLast() with the iterate_upper_bound set to the
+  // key that is deleted
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("a");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+  // Test to check the SeekToLast() with the iterate_upper_bound set
+  // (Deletion cases)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+  }
+  // Test to check the SeekToLast() with iterate_upper_bound set
+  // (Deletion cases - Lot of internal keys after the upper_bound
+  // is deleted)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("e");
+    internal_iter->AddDeletion("f");
+    internal_iter->AddDeletion("g");
+    internal_iter->AddDeletion("h");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 7 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    SetPerfLevel(kEnableCount);
+    ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+    get_perf_context()->Reset();
+    db_iter->SeekToLast();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+
+    SetPerfLevel(kDisable);
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 2 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+
+    internal_iter->AddPut("b", "val_b");
+
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorEmpty) {
+  Options options;
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+  ReadOptions ro;
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
+  ReadOptions ro;
+  Options options;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (size_t i = 0; i < 200; ++i) {
+    internal_iter->AddPut("a", "a");
+    internal_iter->AddPut("b", "b");
+    internal_iter->AddPut("c", "c");
+  }
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      2 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "c");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "b");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "a");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+
+  db_iter->Prev();
+  ASSERT_TRUE(!db_iter->Valid());
+  ASSERT_OK(db_iter->status());
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkip) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", std::to_string(k));
+      }
+      internal_iter->Finish();
+
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i + 2 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i + 2 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+
+    {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t i = 0; i < 200; ++i) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, 202 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "200");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddDeletion("c");
+      }
+      internal_iter->AddPut("c", "200");
+      internal_iter->Finish();
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    for (size_t i = 0; i < 200; ++i) {
+      internal_iter->AddDeletion("c");
+    }
+    internal_iter->AddPut("c", "200");
+    internal_iter->Finish();
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 200 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "200");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "200");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "merge_1");
+      internal_iter->AddMerge("a", "merge_2");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("d", std::to_string(k));
+      }
+
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", std::to_string(k));
+      }
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i + 2 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "d");
+      ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+      db_iter->Prev();
+
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+  }
+
+  {
+    for (size_t i = 0; i < 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddMerge("b", "b");
+      internal_iter->AddMerge("a", "a");
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddMerge("c", std::to_string(k));
+      }
+      internal_iter->Finish();
+
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, i + 2 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      std::string merge_result = "0";
+      for (size_t j = 1; j <= i; ++j) {
+        merge_result += "," + std::to_string(j);
+      }
+      ASSERT_EQ(db_iter->value().ToString(), merge_result);
+
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "b");
+      ASSERT_EQ(db_iter->value().ToString(), "b");
+
+      db_iter->Prev();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "a");
+
+      db_iter->Prev();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
+  Options options;
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+  ReadOptions ro;
+
+  // Basic test case ... Make sure explicityly passing the default value works.
+  // Skipping internal keys is disabled by default, when the value is 0.
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 0;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+  }
+
+  // Test to make sure that the request will *not* fail as incomplete if
+  // num_internal_keys_skipped is *equal* to max_skippable_internal_keys
+  // threshold. (It will fail as incomplete only when the threshold is
+  // exceeded.)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().ok());
+  }
+
+  // Fail the request as incomplete when num_internal_keys_skipped >
+  // max_skippable_internal_keys
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test that the num_internal_keys_skipped counter resets after a successful
+  // read.
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Next();  // num_internal_keys_skipped counter resets here.
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test that the num_internal_keys_skipped counter resets after a successful
+  // read.
+  // Reverse direction
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "e");
+    ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+    db_iter->Prev();  // num_internal_keys_skipped counter resets here.
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test that skipping separate keys is handled
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "e");
+    ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test if alternating puts and deletes of the same key are handled correctly.
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->Finish();
+
+    ro.max_skippable_internal_keys = 2;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "e");
+    ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+    ASSERT_TRUE(db_iter->status().IsIncomplete());
+  }
+
+  // Test for large number of skippable internal keys with *default*
+  // max_sequential_skip_in_iterations.
+  {
+    for (size_t i = 1; i <= 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddPut("a", "val_a");
+      for (size_t j = 1; j <= i; ++j) {
+        internal_iter->AddPut("b", "val_b");
+        internal_iter->AddDeletion("b");
+      }
+      internal_iter->AddPut("c", "val_c");
+      internal_iter->Finish();
+
+      ro.max_skippable_internal_keys = i;
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+      db_iter->Next();
+      if ((options.max_sequential_skip_in_iterations + 1) >=
+          ro.max_skippable_internal_keys) {
+        ASSERT_TRUE(!db_iter->Valid());
+        ASSERT_TRUE(db_iter->status().IsIncomplete());
+      } else {
+        ASSERT_TRUE(db_iter->Valid());
+        ASSERT_EQ(db_iter->key().ToString(), "c");
+        ASSERT_EQ(db_iter->value().ToString(), "val_c");
+      }
+
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+      db_iter->Prev();
+      if ((options.max_sequential_skip_in_iterations + 1) >=
+          ro.max_skippable_internal_keys) {
+        ASSERT_TRUE(!db_iter->Valid());
+        ASSERT_TRUE(db_iter->status().IsIncomplete());
+      } else {
+        ASSERT_TRUE(db_iter->Valid());
+        ASSERT_EQ(db_iter->key().ToString(), "a");
+        ASSERT_EQ(db_iter->value().ToString(), "val_a");
+      }
+    }
+  }
+
+  // Test for large number of skippable internal keys with a *non-default*
+  // max_sequential_skip_in_iterations.
+  {
+    for (size_t i = 1; i <= 200; ++i) {
+      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+      internal_iter->AddPut("a", "val_a");
+      for (size_t j = 1; j <= i; ++j) {
+        internal_iter->AddPut("b", "val_b");
+        internal_iter->AddDeletion("b");
+      }
+      internal_iter->AddPut("c", "val_c");
+      internal_iter->Finish();
+
+      options.max_sequential_skip_in_iterations = 1000;
+      ro.max_skippable_internal_keys = i;
+      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+          internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
+          options.max_sequential_skip_in_iterations,
+          nullptr /* read_callback */));
+
+      db_iter->SeekToFirst();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "a");
+      ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+      db_iter->Next();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+      db_iter->SeekToLast();
+      ASSERT_TRUE(db_iter->Valid());
+      ASSERT_EQ(db_iter->key().ToString(), "c");
+      ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+      db_iter->Prev();
+      ASSERT_TRUE(!db_iter->Valid());
+      ASSERT_TRUE(db_iter->status().IsIncomplete());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator1) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      1 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator2) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      0 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator3) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      2 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator4) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      4 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0,1");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+  db_iter->Next();
+  ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator5) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 1 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 2 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 3 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 4 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 5 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddPut("a", "put_1");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 6 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    // put, singledelete, merge
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddSingleDeletion("a");
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 10 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->Seek("b");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator6) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 1 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 2 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 3 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 4 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 5 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("a", "merge_3");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddMerge("a", "merge_4");
+    internal_iter->AddMerge("a", "merge_5");
+    internal_iter->AddMerge("a", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 6 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator7) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  ImmutableOptions ioptions = ImmutableOptions(options);
+  MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 0 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 2 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val,merge_2");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 4 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 5 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+    db_iter->Prev();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 6 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 7 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 9 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 13 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(),
+              "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddPut("b", "val");
+    internal_iter->AddMerge("b", "merge_2");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_3");
+
+    internal_iter->AddMerge("c", "merge_4");
+    internal_iter->AddMerge("c", "merge_5");
+
+    internal_iter->AddDeletion("b");
+    internal_iter->AddMerge("b", "merge_6");
+    internal_iter->AddMerge("b", "merge_7");
+    internal_iter->AddMerge("b", "merge_8");
+    internal_iter->AddMerge("b", "merge_9");
+    internal_iter->AddMerge("b", "merge_10");
+    internal_iter->AddMerge("b", "merge_11");
+
+    internal_iter->AddDeletion("c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+        internal_iter, nullptr /* version */, 14 /* sequence */,
+        options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(),
+              "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+    db_iter->Prev();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+}
+
+TEST_F(DBIteratorTest, DBIterator8) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddDeletion("a");
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+}
+
+// TODO(3.13): fix the issue of Seek() then Prev() which might not necessary
+//             return the biggest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator9) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("b", "merge_3");
+    internal_iter->AddMerge("b", "merge_4");
+    internal_iter->AddMerge("d", "merge_5");
+    internal_iter->AddMerge("d", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+        BytewiseComparator(), internal_iter, nullptr /* version */,
+        10 /* sequence */, options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+    db_iter->Seek("b");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+
+    db_iter->SeekForPrev("b");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+    db_iter->Seek("c");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+
+    db_iter->SeekForPrev("c");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+  }
+}
+
+// TODO(3.13): fix the issue of Seek() then Prev() which might not necessary
+//             return the biggest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator10) {
+  ReadOptions ro;
+  Options options;
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->AddPut("c", "3");
+  internal_iter->AddPut("d", "4");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  db_iter->Seek("c");
+  ASSERT_TRUE(db_iter->Valid());
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+
+  db_iter->SeekForPrev("c");
+  ASSERT_TRUE(db_iter->Valid());
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "d");
+  ASSERT_EQ(db_iter->value().ToString(), "4");
+
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+}
+
+TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "1");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator11) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddSingleDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      1 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator12) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->AddPut("c", "3");
+  internal_iter->AddSingleDeletion("b");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "1");
+  db_iter->Prev();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator13) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  std::string key;
+  key.resize(9);
+  key.assign(9, static_cast<char>(0));
+  key[0] = 'b';
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut(key, "0");
+  internal_iter->AddPut(key, "1");
+  internal_iter->AddPut(key, "2");
+  internal_iter->AddPut(key, "3");
+  internal_iter->AddPut(key, "4");
+  internal_iter->AddPut(key, "5");
+  internal_iter->AddPut(key, "6");
+  internal_iter->AddPut(key, "7");
+  internal_iter->AddPut(key, "8");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      2 /* sequence */, 3 /* max_sequential_skip_in_iterations */,
+      nullptr /* read_callback */));
+  db_iter->Seek("b");
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), key);
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+}
+
+TEST_F(DBIteratorTest, DBIterator14) {
+  ReadOptions ro;
+  Options options;
+  options.merge_operator = nullptr;
+
+  std::string key("b");
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddPut("b", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->AddPut("b", "3");
+  internal_iter->AddPut("a", "4");
+  internal_iter->AddPut("a", "5");
+  internal_iter->AddPut("a", "6");
+  internal_iter->AddPut("c", "7");
+  internal_iter->AddPut("c", "8");
+  internal_iter->AddPut("c", "9");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      4 /* sequence */, 1 /* max_sequential_skip_in_iterations */,
+      nullptr /* read_callback */));
+  db_iter->Seek("b");
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+  db_iter->SeekToFirst();
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "4");
+}
+
+class DBIterWithMergeIterTest : public testing::Test {
+ public:
+  DBIterWithMergeIterTest()
+      : env_(Env::Default()), icomp_(BytewiseComparator()) {
+    options_.merge_operator = nullptr;
+
+    internal_iter1_ = new TestIterator(BytewiseComparator());
+    internal_iter1_->Add("a", kTypeValue, "1", 3u);
+    internal_iter1_->Add("f", kTypeValue, "2", 5u);
+    internal_iter1_->Add("g", kTypeValue, "3", 7u);
+    internal_iter1_->Finish();
+
+    internal_iter2_ = new TestIterator(BytewiseComparator());
+    internal_iter2_->Add("a", kTypeValue, "4", 6u);
+    internal_iter2_->Add("b", kTypeValue, "5", 1u);
+    internal_iter2_->Add("c", kTypeValue, "6", 2u);
+    internal_iter2_->Add("d", kTypeValue, "7", 3u);
+    internal_iter2_->Finish();
+
+    std::vector<InternalIterator*> child_iters;
+    child_iters.push_back(internal_iter1_);
+    child_iters.push_back(internal_iter2_);
+    InternalKeyComparator icomp(BytewiseComparator());
+    InternalIterator* merge_iter =
+        NewMergingIterator(&icomp_, &child_iters[0], 2u);
+
+    db_iter_.reset(NewDBIterator(
+        env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_),
+        BytewiseComparator(), merge_iter, nullptr /* version */,
+        8 /* read data earlier than seqId 8 */,
+        3 /* max iterators before reseek */, nullptr /* read_callback */));
+  }
+
+  Env* env_;
+  ReadOptions ro_;
+  Options options_;
+  TestIterator* internal_iter1_;
+  TestIterator* internal_iter2_;
+  InternalKeyComparator icomp_;
+  Iterator* merge_iter_;
+  std::unique_ptr<Iterator> db_iter_;
+};
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) {
+  db_iter_->SeekToFirst();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+  db_iter_->Next();
+  ASSERT_FALSE(db_iter_->Valid());
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) {
+  // Test Prev() when one child iterator is at its end.
+  db_iter_->SeekForPrev("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) {
+  // Test Prev() when one child iterator is at its end but more rows
+  // are added.
+  db_iter_->Seek("f");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // Test call back inserts a key in the end of the mem table after
+  // MergeIterator::Prev() realized the mem table iterator is at its end
+  // and before an SeekToLast() is called.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev",
+      [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 12u); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace2) {
+  // Test Prev() when one child iterator is at its end but more rows
+  // are added.
+  db_iter_->Seek("f");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // Test call back inserts entries for update a key in the end of the
+  // mem table after MergeIterator::Prev() realized the mem tableiterator is at
+  // its end and before an SeekToLast() is called.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+        internal_iter2_->Add("z", kTypeValue, "7", 12u);
+        internal_iter2_->Add("z", kTypeValue, "7", 11u);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace3) {
+  // Test Prev() when one child iterator is at its end but more rows
+  // are added and max_skipped is triggered.
+  db_iter_->Seek("f");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // Test call back inserts entries for update a key in the end of the
+  // mem table after MergeIterator::Prev() realized the mem table iterator is at
+  // its end and before an SeekToLast() is called.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+        internal_iter2_->Add("z", kTypeValue, "7", 16u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 15u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 14u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 13u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 12u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 11u, true);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace4) {
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // Test call back inserts entries for update a key before "z" in
+  // mem table after MergeIterator::Prev() calls mem table iterator's
+  // Seek() and before calling Prev()
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace5) {
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // Test call back inserts entries for update a key before "z" in
+  // mem table after MergeIterator::Prev() calls mem table iterator's
+  // Seek() and before calling Prev()
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace6) {
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // Test call back inserts an entry for update a key before "z" in
+  // mem table after MergeIterator::Prev() calls mem table iterator's
+  // Seek() and before calling Prev()
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace7) {
+  internal_iter1_->Add("u", kTypeValue, "10", 4u);
+  internal_iter1_->Add("v", kTypeValue, "11", 4u);
+  internal_iter1_->Add("w", kTypeValue, "12", 4u);
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // Test call back inserts entries for update a key before "z" in
+  // mem table after MergeIterator::Prev() calls mem table iterator's
+  // Seek() and before calling Prev()
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) {
+  // internal_iter1_: a, f, g
+  // internal_iter2_: a, b, c, d, adding (z)
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // Test call back inserts two keys before "z" in mem table after
+  // MergeIterator::Prev() calls mem table iterator's Seek() and
+  // before calling Prev()
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("y", kTypeValue, "7", 17u, true);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIteratorTest, SeekPrefixTombstones) {
+  ReadOptions ro;
+  Options options;
+  options.prefix_extractor.reset(NewNoopTransform());
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddDeletion("b");
+  internal_iter->AddDeletion("c");
+  internal_iter->AddDeletion("d");
+  internal_iter->AddDeletion("e");
+  internal_iter->AddDeletion("f");
+  internal_iter->AddDeletion("g");
+  internal_iter->Finish();
+
+  ro.prefix_same_as_start = true;
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  int skipped_keys = 0;
+
+  get_perf_context()->Reset();
+  db_iter->SeekForPrev("z");
+  skipped_keys =
+      static_cast<int>(get_perf_context()->internal_key_skipped_count);
+  ASSERT_EQ(skipped_keys, 0);
+
+  get_perf_context()->Reset();
+  db_iter->Seek("a");
+  skipped_keys =
+      static_cast<int>(get_perf_context()->internal_key_skipped_count);
+  ASSERT_EQ(skipped_keys, 0);
+}
+
+TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
+  const int kNumKeys = 3;
+  for (int i = 0; i < kNumKeys + 2; ++i) {
+    // + 2 for two special cases: lower bound before and lower bound after the
+    // internal iterator's keys
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    for (int j = 1; j <= kNumKeys; ++j) {
+      internal_iter->AddPut(std::to_string(j), "val");
+    }
+    internal_iter->Finish();
+
+    ReadOptions ro;
+    auto lower_bound_str = std::to_string(i);
+    Slice lower_bound(lower_bound_str);
+    ro.iterate_lower_bound = &lower_bound;
+    Options options;
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+        BytewiseComparator(), internal_iter, nullptr /* version */,
+        10 /* sequence */, options.max_sequential_skip_in_iterations,
+        nullptr /* read_callback */));
+
+    db_iter->SeekToFirst();
+    if (i == kNumKeys + 1) {
+      // lower bound was beyond the last key
+      ASSERT_FALSE(db_iter->Valid());
+      ASSERT_OK(db_iter->status());
+    } else {
+      ASSERT_TRUE(db_iter->Valid());
+      int expected;
+      if (i == 0) {
+        // lower bound was before the first key
+        expected = 1;
+      } else {
+        // lower bound was at the ith key
+        expected = i;
+      }
+      ASSERT_EQ(std::to_string(expected), db_iter->key().ToString());
+    }
+  }
+}
+
+TEST_F(DBIteratorTest, PrevLowerBound) {
+  const int kNumKeys = 3;
+  const int kLowerBound = 2;
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (int j = 1; j <= kNumKeys; ++j) {
+    internal_iter->AddPut(std::to_string(j), "val");
+  }
+  internal_iter->Finish();
+
+  ReadOptions ro;
+  auto lower_bound_str = std::to_string(kLowerBound);
+  Slice lower_bound(lower_bound_str);
+  ro.iterate_lower_bound = &lower_bound;
+  Options options;
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  db_iter->SeekToLast();
+  for (int i = kNumKeys; i >= kLowerBound; --i) {
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(std::to_string(i), db_iter->key().ToString());
+    db_iter->Prev();
+  }
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, SeekLessLowerBound) {
+  const int kNumKeys = 3;
+  const int kLowerBound = 2;
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  for (int j = 1; j <= kNumKeys; ++j) {
+    internal_iter->AddPut(std::to_string(j), "val");
+  }
+  internal_iter->Finish();
+
+  ReadOptions ro;
+  auto lower_bound_str = std::to_string(kLowerBound);
+  Slice lower_bound(lower_bound_str);
+  ro.iterate_lower_bound = &lower_bound;
+  Options options;
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  auto before_lower_bound_str = std::to_string(kLowerBound - 1);
+  Slice before_lower_bound(lower_bound_str);
+
+  db_iter->Seek(before_lower_bound);
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(lower_bound_str, db_iter->key().ToString());
+}
+
+TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
+  Options options;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(0));
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "A");
+  internal_iter->AddPut("b", "B");
+  for (int i = 0; i < 100; ++i) {
+    internal_iter->AddPut("c" + std::to_string(i), "");
+  }
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options),
+      BytewiseComparator(), internal_iter, nullptr /* version */,
+      10 /* sequence */, options.max_sequential_skip_in_iterations,
+      nullptr /* read_callback */));
+
+  db_iter->SeekForPrev("a");
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_OK(db_iter->status());
+  ASSERT_EQ("a", db_iter->key().ToString());
+
+  internal_iter->Vanish("a");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_OK(db_iter->status());
+  ASSERT_EQ("b", db_iter->key().ToString());
+
+  // A (sort of) bug used to cause DBIter to pointlessly drag the internal
+  // iterator all the way to the end. But this doesn't really matter at the time
+  // of writing because the only iterator that can see disappearing keys is
+  // ForwardIterator, which doesn't support SeekForPrev().
+  EXPECT_LT(internal_iter->steps(), 20);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iterator_test.cc b/src/rocksdb/db/db_iterator_test.cc
new file mode 100644
index 000000000..aaf1408b4
--- /dev/null
+++ b/src/rocksdb/db/db_iterator_test.cc
@@ -0,0 +1,3265 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/flush_block_policy.h"
+#include "util/random.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A dumb ReadCallback which saying every key is committed.
+class DummyReadCallback : public ReadCallback {
+ public:
+  DummyReadCallback() : ReadCallback(kMaxSequenceNumber) {}
+  bool IsVisibleFullCheck(SequenceNumber /*seq*/) override { return true; }
+  void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; }
+};
+
+// Test param:
+//   bool: whether to pass read_callback to NewIterator().
+class DBIteratorTest : public DBTestBase,
+                       public testing::WithParamInterface<bool> {
+ public:
+  DBIteratorTest() : DBTestBase("db_iterator_test", /*env_do_fsync=*/true) {}
+
+  Iterator* NewIterator(const ReadOptions& read_options,
+                        ColumnFamilyHandle* column_family = nullptr) {
+    if (column_family == nullptr) {
+      column_family = db_->DefaultColumnFamily();
+    }
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+    SequenceNumber seq = read_options.snapshot != nullptr
+                             ? read_options.snapshot->GetSequenceNumber()
+                             : db_->GetLatestSequenceNumber();
+    bool use_read_callback = GetParam();
+    DummyReadCallback* read_callback = nullptr;
+    if (use_read_callback) {
+      read_callback = new DummyReadCallback();
+      read_callback->SetSnapshot(seq);
+      InstrumentedMutexLock lock(&mutex_);
+      read_callbacks_.push_back(
+          std::unique_ptr<DummyReadCallback>(read_callback));
+    }
+    return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback);
+  }
+
+ private:
+  InstrumentedMutex mutex_;
+  std::vector<std::unique_ptr<DummyReadCallback>> read_callbacks_;
+};
+
+TEST_P(DBIteratorTest, IteratorProperty) {
+  // The test needs to be changed if kPersistedTier is supported in iterator.
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(1, "1", "2"));
+  ASSERT_OK(Delete(1, "2"));
+  ReadOptions ropt;
+  ropt.pin_data = false;
+  {
+    std::unique_ptr<Iterator> iter(NewIterator(ropt, handles_[1]));
+    iter->SeekToFirst();
+    std::string prop_value;
+    ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value));
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("0", prop_value);
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    iter->Next();
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("Iterator is not valid.", prop_value);
+
+    // Get internal key at which the iteration stopped (tombstone in this case).
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+    ASSERT_EQ("2", prop_value);
+  }
+  Close();
+}
+
+TEST_P(DBIteratorTest, PersistedTierOnIterator) {
+  // The test needs to be changed if kPersistedTier is supported in iterator.
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ReadOptions ropt;
+  ropt.read_tier = kPersistedTier;
+
+  auto* iter = db_->NewIterator(ropt, handles_[1]);
+  ASSERT_TRUE(iter->status().IsNotSupported());
+  delete iter;
+
+  std::vector<Iterator*> iters;
+  ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
+  Close();
+}
+
+TEST_P(DBIteratorTest, NonBlockingIteration) {
+  do {
+    ReadOptions non_blocking_opts, regular_opts;
+    anon::OptionsOverride options_override;
+    options_override.full_block_cache = true;
+    Options options = CurrentOptions(options_override);
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    non_blocking_opts.read_tier = kBlockCacheTier;
+
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // write one kv to the database.
+    ASSERT_OK(Put(1, "a", "b"));
+
+    // scan using non-blocking iterator. We should find it because
+    // it is in memtable.
+    Iterator* iter = NewIterator(non_blocking_opts, handles_[1]);
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    delete iter;
+
+    // flush memtable to storage. Now, the key should not be in the
+    // memtable neither in the block cache.
+    ASSERT_OK(Flush(1));
+
+    // verify that a non-blocking iterator does not find any
+    // kvs. Neither does it do any IOs to storage.
+    uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      count++;
+    }
+    ASSERT_EQ(count, 0);
+    ASSERT_TRUE(iter->status().IsIncomplete());
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // read in the specified block via a regular get
+    ASSERT_EQ(Get(1, "a"), "b");
+
+    // verify that we can find it via a non-blocking scan
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
+
+    // This test verifies block cache behaviors, which is not used by plain
+    // table format.
+  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipMmapReads));
+}
+
+TEST_P(DBIteratorTest, IterSeekBeforePrev) {
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("0", "f"));
+  ASSERT_OK(Put("1", "h"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("2", "j"));
+  auto iter = NewIterator(ReadOptions());
+  iter->Seek(Slice("c"));
+  iter->Prev();
+  iter->Seek(Slice("a"));
+  iter->Prev();
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IterReseekNewUpperBound) {
+  Random rnd(301);
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  table_options.block_size_deviation = 50;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.compression = kNoCompression;
+  Reopen(options);
+
+  ASSERT_OK(Put("a", rnd.RandomString(400)));
+  ASSERT_OK(Put("aabb", rnd.RandomString(400)));
+  ASSERT_OK(Put("aaef", rnd.RandomString(400)));
+  ASSERT_OK(Put("b", rnd.RandomString(400)));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ReadOptions opts;
+  Slice ub = Slice("aa");
+  opts.iterate_upper_bound = &ub;
+  auto iter = NewIterator(opts);
+  iter->Seek(Slice("a"));
+  ub = Slice("b");
+  iter->Seek(Slice("aabc"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "aaef");
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) {
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("0", "f"));
+  ASSERT_OK(Put("1", "h"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("2", "j"));
+  auto iter = NewIterator(ReadOptions());
+  iter->SeekForPrev(Slice("0"));
+  iter->Next();
+  iter->SeekForPrev(Slice("1"));
+  iter->Next();
+  delete iter;
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+  return std::string(length, c);
+}
+}  // anonymous namespace
+
+TEST_P(DBIteratorTest, IterLongKeys) {
+  ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
+  ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
+  ASSERT_OK(Put("a", "b"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
+  ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
+  ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
+  auto iter = NewIterator(ReadOptions());
+
+  // Create a key that needs to be skipped for Seq too new
+  iter->Seek(MakeLongKey(20, 0));
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4");
+
+  iter->SeekForPrev(MakeLongKey(127, 3));
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+  delete iter;
+
+  iter = NewIterator(ReadOptions());
+  iter->Seek(MakeLongKey(50, 1));
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IterNextWithNewerSeq) {
+  ASSERT_OK(Put("0", "0"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Put("d", "e"));
+  auto iter = NewIterator(ReadOptions());
+
+  // Create a key that needs to be skipped for Seq too new
+  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+       i++) {
+    ASSERT_OK(Put("b", "f"));
+  }
+
+  iter->Seek(Slice("a"));
+  ASSERT_EQ(IterStatus(iter), "a->b");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "c->d");
+  iter->SeekForPrev(Slice("b"));
+  ASSERT_EQ(IterStatus(iter), "a->b");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "c->d");
+
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq) {
+  ASSERT_OK(Put("0", "0"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Put("d", "e"));
+  auto iter = NewIterator(ReadOptions());
+
+  // Create a key that needs to be skipped for Seq too new
+  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+       i++) {
+    ASSERT_OK(Put("b", "f"));
+  }
+
+  iter->Seek(Slice("d"));
+  ASSERT_EQ(IterStatus(iter), "d->e");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "c->d");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->b");
+  iter->Prev();
+  iter->SeekForPrev(Slice("d"));
+  ASSERT_EQ(IterStatus(iter), "d->e");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "c->d");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->b");
+  iter->Prev();
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) {
+  ASSERT_OK(Put("0", "0"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Put("e", "f"));
+  auto iter = NewIterator(ReadOptions());
+  auto iter2 = NewIterator(ReadOptions());
+  iter->Seek(Slice("c"));
+  iter2->SeekForPrev(Slice("d"));
+  ASSERT_EQ(IterStatus(iter), "c->d");
+  ASSERT_EQ(IterStatus(iter2), "c->d");
+
+  // Create a key that needs to be skipped for Seq too new
+  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+       i++) {
+    ASSERT_OK(Put("b", "f"));
+  }
+
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->b");
+  iter->Prev();
+  iter2->Prev();
+  ASSERT_EQ(IterStatus(iter2), "a->b");
+  iter2->Prev();
+  delete iter;
+  delete iter2;
+}
+
+TEST_P(DBIteratorTest, IterEmpty) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->Seek("foo");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekForPrev("foo");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    ASSERT_OK(iter->status());
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterSingle) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "a", "va"));
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->Seek("");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekForPrev("");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->Seek("a");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekForPrev("a");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->Seek("b");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekForPrev("b");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMulti) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "a", "va"));
+    ASSERT_OK(Put(1, "b", "vb"));
+    ASSERT_OK(Put(1, "c", "vc"));
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->Seek("");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Seek("a");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Seek("ax");
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->SeekForPrev("d");
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->SeekForPrev("c");
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->SeekForPrev("bx");
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+
+    iter->Seek("b");
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Seek("z");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekForPrev("b");
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->SeekForPrev("");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    // Switch from reverse to forward
+    iter->SeekToLast();
+    iter->Prev();
+    iter->Prev();
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+
+    // Switch from forward to reverse
+    iter->SeekToFirst();
+    iter->Next();
+    iter->Next();
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+
+    // Make sure iter stays at snapshot
+    ASSERT_OK(Put(1, "a", "va2"));
+    ASSERT_OK(Put(1, "a2", "va3"));
+    ASSERT_OK(Put(1, "b", "vb2"));
+    ASSERT_OK(Put(1, "c", "vc2"));
+    ASSERT_OK(Delete(1, "b"));
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan
+TEST_P(DBIteratorTest, IterReseek) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
+  options.max_sequential_skip_in_iterations = 3;
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // insert three keys with same userkey and verify that
+  // reseek is not invoked. For each of these test cases,
+  // verify that we can find the next key "b".
+  ASSERT_OK(Put(1, "a", "zero"));
+  ASSERT_OK(Put(1, "a", "one"));
+  ASSERT_OK(Put(1, "a", "two"));
+  ASSERT_OK(Put(1, "b", "bone"));
+  Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "a->two");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of three keys with same userkey and verify
+  // that reseek is still not invoked.
+  ASSERT_OK(Put(1, "a", "three"));
+  iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->three");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // insert a total of four keys with same userkey and verify
+  // that reseek is invoked.
+  ASSERT_OK(Put(1, "a", "four"));
+  iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // Testing reverse iterator
+  // At this point, we have three versions of "a" and one version of "b".
+  // The reseek statistics is already at 1.
+  int num_reseeks = static_cast<int>(
+      TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+
+  // Insert another version of b and assert that reseek is not invoked
+  ASSERT_OK(Put(1, "b", "btwo"));
+  iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->btwo");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 1);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+
+  // insert two more versions of b. This makes a total of 4 versions
+  // of b and 4 versions of a.
+  ASSERT_OK(Put(1, "b", "bthree"));
+  ASSERT_OK(Put(1, "b", "bfour"));
+  iter = NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->bfour");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 2);
+  iter->Prev();
+
+  // the previous Prev call should have invoked reseek
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 3);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
+}
+
+TEST_F(DBIteratorTest, ReseekUponDirectionChange) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator.reset(
+      new StringAppendTESTOperator(/*delim_char=*/' '));
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value"));
+  ASSERT_OK(Put("bar", "value"));
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToLast();
+    it->Prev();
+    it->Next();
+  }
+  ASSERT_EQ(1,
+            options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+  const std::string merge_key("good");
+  ASSERT_OK(Put(merge_key, "orig"));
+  ASSERT_OK(Merge(merge_key, "suffix"));
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->Seek(merge_key);
+    ASSERT_TRUE(it->Valid());
+    const uint64_t prev_reseek_count =
+        options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+    it->Prev();
+    ASSERT_EQ(prev_reseek_count + 1, options.statistics->getTickerCount(
+                                         NUMBER_OF_RESEEKS_IN_ITERATION));
+  }
+}
+
+TEST_P(DBIteratorTest, IterSmallAndLargeMix) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "a", "va"));
+    ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
+    ASSERT_OK(Put(1, "c", "vc"));
+    ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
+    ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
+
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMultiWithDelete) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "ka", "va"));
+    ASSERT_OK(Put(1, "kb", "vb"));
+    ASSERT_OK(Put(1, "kc", "vc"));
+    ASSERT_OK(Delete(1, "kb"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
+
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+    iter->Seek("kc");
+    ASSERT_EQ(IterStatus(iter), "kc->vc");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_ &&
+          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+          kHashLinkList != option_config_ &&
+          kHashSkipList != option_config_) {  // doesn't support SeekToLast
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "ka->va");
+      }
+    }
+    delete iter;
+  } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IterPrevMaxSkip) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    for (int i = 0; i < 2; i++) {
+      ASSERT_OK(Put(1, "key1", "v1"));
+      ASSERT_OK(Put(1, "key2", "v2"));
+      ASSERT_OK(Put(1, "key3", "v3"));
+      ASSERT_OK(Put(1, "key4", "v4"));
+      ASSERT_OK(Put(1, "key5", "v5"));
+    }
+
+    VerifyIterLast("key5->v5", 1);
+
+    ASSERT_OK(Delete(1, "key5"));
+    VerifyIterLast("key4->v4", 1);
+
+    ASSERT_OK(Delete(1, "key4"));
+    VerifyIterLast("key3->v3", 1);
+
+    ASSERT_OK(Delete(1, "key3"));
+    VerifyIterLast("key2->v2", 1);
+
+    ASSERT_OK(Delete(1, "key2"));
+    VerifyIterLast("key1->v1", 1);
+
+    ASSERT_OK(Delete(1, "key1"));
+    VerifyIterLast("(invalid)", 1);
+  } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
+}
+
+TEST_P(DBIteratorTest, IterWithSnapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    ASSERT_OK(Put(1, "key1", "val1"));
+    ASSERT_OK(Put(1, "key2", "val2"));
+    ASSERT_OK(Put(1, "key3", "val3"));
+    ASSERT_OK(Put(1, "key4", "val4"));
+    ASSERT_OK(Put(1, "key5", "val5"));
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ReadOptions options;
+    options.snapshot = snapshot;
+    Iterator* iter = NewIterator(options, handles_[1]);
+
+    ASSERT_OK(Put(1, "key0", "val0"));
+    // Put more values after the snapshot
+    ASSERT_OK(Put(1, "key100", "val100"));
+    ASSERT_OK(Put(1, "key101", "val101"));
+
+    iter->Seek("key5");
+    ASSERT_EQ(IterStatus(iter), "key5->val5");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_ &&
+          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+          kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key4->val4");
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key3->val3");
+
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key4->val4");
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key5->val5");
+      }
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
+    }
+
+    if (!CurrentOptions().merge_operator) {
+      // TODO(gzh): merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_ &&
+          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+          kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+        iter->SeekForPrev("key1");
+        ASSERT_EQ(IterStatus(iter), "key1->val1");
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key2->val2");
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key3->val3");
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key2->val2");
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key1->val1");
+        iter->Prev();
+        ASSERT_TRUE(!iter->Valid());
+      }
+    }
+    db_->ReleaseSnapshot(snapshot);
+    delete iter;
+  } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorPinsRef) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "hello"));
+
+    // Get iterator that will yield the current contents of the DB.
+    Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+    // Write to force compactions
+    ASSERT_OK(Put(1, "foo", "newvalue1"));
+    for (int i = 0; i < 100; i++) {
+      // 100K values
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+    }
+    ASSERT_OK(Put(1, "foo", "newvalue2"));
+
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ("hello", iter->value().ToString());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  ASSERT_OK(Put(1, "foo", "delete-cf-then-delete-iter"));
+  ASSERT_OK(Put(1, "hello", "value2"));
+
+  ColumnFamilyHandle* cf = handles_[1];
+  ReadOptions ro;
+
+  auto* iter = db_->NewIterator(ro, cf);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter");
+
+  // delete CF handle
+  EXPECT_OK(db_->DestroyColumnFamilyHandle(cf));
+  handles_.erase(std::begin(handles_) + 1);
+
+  // delete Iterator after CF handle is deleted
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "hello->value2");
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  ASSERT_OK(Put(1, "foo", "drop-cf-then-delete-iter"));
+
+  ReadOptions ro;
+  ColumnFamilyHandle* cf = handles_[1];
+
+  auto* iter = db_->NewIterator(ro, cf);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter");
+
+  // drop and delete CF
+  EXPECT_OK(db_->DropColumnFamily(cf));
+  EXPECT_OK(db_->DestroyColumnFamilyHandle(cf));
+  handles_.erase(std::begin(handles_) + 1);
+
+  // delete Iterator after CF handle is dropped
+  delete iter;
+}
+
+// SetOptions not defined in ROCKSDB LITE
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, DBIteratorBoundTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+
+  options.prefix_extractor = nullptr;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing basic case with no iterate_upper_bound and no prefix_extractor
+  {
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+    iter->SeekForPrev("g1");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+  }
+
+  // testing iterate_upper_bound and forward iterator
+  // to make sure it stops at bound
+  {
+    ReadOptions ro;
+    // iterate_upper_bound points beyond the last expected entry
+    Slice prefix("foo2");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("foo1")), 0);
+
+    iter->Next();
+    // should stop here...
+    ASSERT_TRUE(!iter->Valid());
+  }
+  // Testing SeekToLast with iterate_upper_bound set
+  {
+    ReadOptions ro;
+
+    Slice prefix("foo");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("a")), 0);
+  }
+
+  // prefix is the first letter of the key
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing with iterate_upper_bound and prefix_extractor
+  // Seek target and iterate_upper_bound are not is same prefix
+  // This should be an error
+  {
+    ReadOptions ro;
+    Slice upper_bound("g");
+    ro.iterate_upper_bound = &upper_bound;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo1", iter->key().ToString());
+
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  // testing that iterate_upper_bound prevents iterating over deleted items
+  // if the bound has already reached
+  {
+    options.prefix_extractor = nullptr;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("a", "0"));
+    ASSERT_OK(Put("b", "0"));
+    ASSERT_OK(Put("b1", "0"));
+    ASSERT_OK(Put("c", "0"));
+    ASSERT_OK(Put("d", "0"));
+    ASSERT_OK(Put("e", "0"));
+    ASSERT_OK(Delete("c"));
+    ASSERT_OK(Delete("d"));
+
+    // base case with no bound
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    get_perf_context()->Reset();
+    iter->Next();
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_delete_skipped_count), 2);
+
+    // now testing with iterate_bound
+    Slice prefix("c");
+    ro.iterate_upper_bound = &prefix;
+
+    iter.reset(NewIterator(ro));
+
+    get_perf_context()->Reset();
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    iter->Next();
+    // the iteration should stop as soon as the bound key is reached
+    // even though the key is deleted
+    // hence internal_delete_skipped_count should be 0
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_EQ(
+        static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+  }
+}
+
+TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.prefix_extractor = nullptr;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("z", "0"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Put("foo3", "bar3"));
+  ASSERT_OK(Put("foo4", "bar4"));
+
+  {
+    std::string up_str = "foo5";
+    Slice up(up_str);
+    ReadOptions ro;
+    ro.iterate_upper_bound = &up;
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    uint64_t prev_block_cache_hit =
+        TestGetTickerCount(options, BLOCK_CACHE_HIT);
+    uint64_t prev_block_cache_miss =
+        TestGetTickerCount(options, BLOCK_CACHE_MISS);
+
+    ASSERT_GT(prev_block_cache_hit + prev_block_cache_miss, 0);
+
+    iter->Seek("foo4");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo4")), 0);
+    ASSERT_EQ(prev_block_cache_hit,
+              TestGetTickerCount(options, BLOCK_CACHE_HIT));
+    ASSERT_EQ(prev_block_cache_miss,
+              TestGetTickerCount(options, BLOCK_CACHE_MISS));
+
+    iter->Seek("foo2");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo3")), 0);
+    ASSERT_EQ(prev_block_cache_hit,
+              TestGetTickerCount(options, BLOCK_CACHE_HIT));
+    ASSERT_EQ(prev_block_cache_miss,
+              TestGetTickerCount(options, BLOCK_CACHE_MISS));
+  }
+}
+#endif
+
+TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) {
+  for (auto format_version : {2, 3, 4}) {
+    int upper_bound_hits = 0;
+    Options options = CurrentOptions();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTableIterator:out_of_bound",
+        [&upper_bound_hits](void*) { upper_bound_hits++; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.prefix_extractor = nullptr;
+    BlockBasedTableOptions table_options;
+    table_options.format_version = format_version;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo1", "bar1"));
+    ASSERT_OK(Put("foo2", "bar2"));
+    ASSERT_OK(Put("foo4", "bar4"));
+    ASSERT_OK(Flush());
+
+    Slice ub("foo3");
+    ReadOptions ro;
+    ro.iterate_upper_bound = &ub;
+
+    std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+    iter->Seek("foo");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+    ASSERT_EQ(upper_bound_hits, 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+    ASSERT_EQ(upper_bound_hits, 0);
+
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_EQ(upper_bound_hits, 1);
+  }
+}
+
+// Enable kBinarySearchWithFirstKey, do some iterator operations and check that
+// they don't do unnecessary block reads.
+TEST_P(DBIteratorTest, IndexWithFirstKey) {
+  for (int tailing = 0; tailing < 2; ++tailing) {
+    SCOPED_TRACE("tailing = " + std::to_string(tailing));
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.prefix_extractor = nullptr;
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    Statistics* stats = options.statistics.get();
+    BlockBasedTableOptions table_options;
+    table_options.index_type =
+        BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+    table_options.index_shortening =
+        BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+    table_options.block_cache =
+        NewLRUCache(8000);  // fits all blocks and their cache metadata overhead
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    DestroyAndReopen(options);
+    ASSERT_OK(Merge("a1", "x1"));
+    ASSERT_OK(Merge("b1", "y1"));
+    ASSERT_OK(Merge("c0", "z1"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Merge("a2", "x2"));
+    ASSERT_OK(Merge("b2", "y2"));
+    ASSERT_OK(Merge("c0", "z2"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Merge("a3", "x3"));
+    ASSERT_OK(Merge("b3", "y3"));
+    ASSERT_OK(Merge("c3", "z3"));
+    ASSERT_OK(Flush());
+
+    // Block cache is not important for this test.
+    // We use BLOCK_CACHE_DATA_* counters just because they're the most readily
+    // available way of counting block accesses.
+
+    ReadOptions ropt;
+    ropt.tailing = tailing;
+    std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+    ropt.read_tier = ReadTier::kBlockCacheTier;
+    std::unique_ptr<Iterator> nonblocking_iter(NewIterator(ropt));
+
+    iter->Seek("b10");
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("b2", iter->key().ToString());
+    EXPECT_EQ("y2", iter->value().ToString());
+    EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    // The cache-only iterator should succeed too, using the blocks pulled into
+    // the cache by the previous iterator.
+    nonblocking_iter->Seek("b10");
+    ASSERT_TRUE(nonblocking_iter->Valid());
+    EXPECT_EQ("b2", nonblocking_iter->key().ToString());
+    EXPECT_EQ("y2", nonblocking_iter->value().ToString());
+    EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // ... but it shouldn't be able to step forward since the next block is
+    // not in cache yet.
+    nonblocking_iter->Next();
+    ASSERT_FALSE(nonblocking_iter->Valid());
+    ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
+
+    // ... nor should a seek to the next key succeed.
+    nonblocking_iter->Seek("b20");
+    ASSERT_FALSE(nonblocking_iter->Valid());
+    ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("b3", iter->key().ToString());
+    EXPECT_EQ("y3", iter->value().ToString());
+    EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // After the blocking iterator loaded the next block, the nonblocking
+    // iterator's seek should succeed.
+    nonblocking_iter->Seek("b20");
+    ASSERT_TRUE(nonblocking_iter->Valid());
+    EXPECT_EQ("b3", nonblocking_iter->key().ToString());
+    EXPECT_EQ("y3", nonblocking_iter->value().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    iter->Seek("c0");
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("c0", iter->key().ToString());
+    EXPECT_EQ("z1,z2", iter->value().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(6, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("c3", iter->key().ToString());
+    EXPECT_EQ("z3", iter->value().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter.reset();
+
+    // Enable iterate_upper_bound and check that iterator is not trying to read
+    // blocks that are fully above upper bound.
+    std::string ub = "b3";
+    Slice ub_slice(ub);
+    ropt.iterate_upper_bound = &ub_slice;
+    iter.reset(NewIterator(ropt));
+
+    iter->Seek("b2");
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ("b2", iter->key().ToString());
+    EXPECT_EQ("y2", iter->value().ToString());
+    EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+  }
+}
+
+TEST_P(DBIteratorTest, IndexWithFirstKeyGet) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.prefix_extractor = nullptr;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  Statistics* stats = options.statistics.get();
+  BlockBasedTableOptions table_options;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+  table_options.index_shortening =
+      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  table_options.block_cache = NewLRUCache(1000);  // fits all blocks
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Merge("a", "x1"));
+  ASSERT_OK(Merge("c", "y1"));
+  ASSERT_OK(Merge("e", "z1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("c", "y2"));
+  ASSERT_OK(Merge("e", "z2"));
+  ASSERT_OK(Flush());
+
+  // Get() between blocks shouldn't read any blocks.
+  ASSERT_EQ("NOT_FOUND", Get("b"));
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+  // Get() of an existing key shouldn't read any unnecessary blocks when there's
+  // only one key per block.
+
+  ASSERT_EQ("y1,y2", Get("c"));
+  EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+  ASSERT_EQ("x1", Get("a"));
+  EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+  EXPECT_EQ(std::vector<std::string>({"NOT_FOUND", "z1,z2"}),
+            MultiGet({"b", "e"}));
+}
+
+// TODO(3.13): fix the issue of Seek() + Prev() which might not necessary
+//             return the biggest key which is smaller than the seek key.
+TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) {
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.env = env_;
+  DestroyAndReopen(options);
+
+  // write three entries with different keys using Merge()
+  WriteOptions wopts;
+  ASSERT_OK(db_->Merge(wopts, "1", "data1"));
+  ASSERT_OK(db_->Merge(wopts, "2", "data2"));
+  ASSERT_OK(db_->Merge(wopts, "3", "data3"));
+
+  std::unique_ptr<Iterator> it(NewIterator(ReadOptions()));
+
+  it->Seek("2");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("2", it->key().ToString());
+
+  it->Prev();
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("1", it->key().ToString());
+
+  it->SeekForPrev("1");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("1", it->key().ToString());
+
+  it->Next();
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("2", it->key().ToString());
+}
+
+class DBIteratorTestForPinnedData : public DBIteratorTest {
+ public:
+  enum TestConfig {
+    NORMAL,
+    CLOSE_AND_OPEN,
+    COMPACT_BEFORE_READ,
+    FLUSH_EVERY_1000,
+    MAX
+  };
+  DBIteratorTestForPinnedData() : DBIteratorTest() {}
+  void PinnedDataIteratorRandomized(TestConfig run_config) {
+    // Generate Random data
+    Random rnd(301);
+
+    int puts = 100000;
+    int key_pool = static_cast<int>(puts * 0.7);
+    int key_size = 100;
+    int val_size = 1000;
+    int seeks_percentage = 20;   // 20% of keys will be used to test seek()
+    int delete_percentage = 20;  // 20% of keys will be deleted
+    int merge_percentage = 20;   // 20% of keys will be added using Merge()
+
+    Options options = CurrentOptions();
+    BlockBasedTableOptions table_options;
+    table_options.use_delta_encoding = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.merge_operator = MergeOperators::CreatePutOperator();
+    DestroyAndReopen(options);
+
+    std::vector<std::string> generated_keys(key_pool);
+    for (int i = 0; i < key_pool; i++) {
+      generated_keys[i] = rnd.RandomString(key_size);
+    }
+
+    std::map<std::string, std::string> true_data;
+    std::vector<std::string> random_keys;
+    std::vector<std::string> deleted_keys;
+    for (int i = 0; i < puts; i++) {
+      auto& k = generated_keys[rnd.Next() % key_pool];
+      auto v = rnd.RandomString(val_size);
+
+      // Insert data to true_data map and to DB
+      true_data[k] = v;
+      if (rnd.PercentTrue(merge_percentage)) {
+        ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+      } else {
+        ASSERT_OK(Put(k, v));
+      }
+
+      // Pick random keys to be used to test Seek()
+      if (rnd.PercentTrue(seeks_percentage)) {
+        random_keys.push_back(k);
+      }
+
+      // Delete some random keys
+      if (rnd.PercentTrue(delete_percentage)) {
+        deleted_keys.push_back(k);
+        true_data.erase(k);
+        ASSERT_OK(Delete(k));
+      }
+
+      if (run_config == TestConfig::FLUSH_EVERY_1000) {
+        if (i && i % 1000 == 0) {
+          ASSERT_OK(Flush());
+        }
+      }
+    }
+
+    if (run_config == TestConfig::CLOSE_AND_OPEN) {
+      Close();
+      Reopen(options);
+    } else if (run_config == TestConfig::COMPACT_BEFORE_READ) {
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    ReadOptions ro;
+    ro.pin_data = true;
+    auto iter = NewIterator(ro);
+
+    {
+      // Test Seek to random keys
+      std::vector<Slice> keys_slices;
+      std::vector<std::string> true_keys;
+      for (auto& k : random_keys) {
+        iter->Seek(k);
+        if (!iter->Valid()) {
+          ASSERT_EQ(true_data.lower_bound(k), true_data.end());
+          continue;
+        }
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        keys_slices.push_back(iter->key());
+        true_keys.push_back(true_data.lower_bound(k)->first);
+      }
+
+      for (size_t i = 0; i < keys_slices.size(); i++) {
+        ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+      }
+    }
+
+    {
+      // Test SeekForPrev to random keys
+      std::vector<Slice> keys_slices;
+      std::vector<std::string> true_keys;
+      for (auto& k : random_keys) {
+        iter->SeekForPrev(k);
+        if (!iter->Valid()) {
+          ASSERT_EQ(true_data.upper_bound(k), true_data.begin());
+          continue;
+        }
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        keys_slices.push_back(iter->key());
+        true_keys.push_back((--true_data.upper_bound(k))->first);
+      }
+
+      for (size_t i = 0; i < keys_slices.size(); i++) {
+        ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+      }
+    }
+
+    {
+      // Test iterating all data forward
+      std::vector<Slice> all_keys;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        all_keys.push_back(iter->key());
+      }
+      ASSERT_EQ(all_keys.size(), true_data.size());
+
+      // Verify that all keys slices are valid
+      auto data_iter = true_data.begin();
+      for (size_t i = 0; i < all_keys.size(); i++) {
+        ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+        data_iter++;
+      }
+    }
+
+    {
+      // Test iterating all data backward
+      std::vector<Slice> all_keys;
+      for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+        std::string prop_value;
+        ASSERT_OK(
+            iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+        ASSERT_EQ("1", prop_value);
+        all_keys.push_back(iter->key());
+      }
+      ASSERT_EQ(all_keys.size(), true_data.size());
+
+      // Verify that all keys slices are valid (backward)
+      auto data_iter = true_data.rbegin();
+      for (size_t i = 0; i < all_keys.size(); i++) {
+        ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+        data_iter++;
+      }
+    }
+
+    delete iter;
+  }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) {
+  PinnedDataIteratorRandomized(TestConfig::NORMAL);
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCLoseAndOpen) {
+  PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN);
+}
+
+TEST_P(DBIteratorTestForPinnedData,
+       PinnedDataIteratorRandomizedCompactBeforeRead) {
+  PinnedDataIteratorRandomized(TestConfig::COMPACT_BEFORE_READ);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) {
+  PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000);
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestForPinnedDataInstance,
+                        DBIteratorTestForPinnedData,
+                        testing::Values(true, false));
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.use_delta_encoding = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 1024 * 1024 * 10;  // 10 Mb
+  DestroyAndReopen(options);
+
+  std::map<std::string, std::string> true_data;
+
+  // Generate 4 sst files in L2
+  Random rnd(301);
+  for (int i = 1; i <= 1000; i++) {
+    std::string k = Key(i * 3);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+    if (i % 250 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+  ASSERT_EQ(FilesPerLevel(0), "4");
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(FilesPerLevel(0), "0,4");
+
+  // Generate 4 sst files in L0
+  for (int i = 1; i <= 1000; i++) {
+    std::string k = Key(i * 2);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+    if (i % 250 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+  ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+  // Add some keys/values in memtables
+  for (int i = 1; i <= 1000; i++) {
+    std::string k = Key(i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+  }
+  ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+  ReadOptions ro;
+  ro.pin_data = true;
+  auto iter = NewIterator(ro);
+
+  std::vector<std::pair<Slice, std::string>> results;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string prop_value;
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    results.emplace_back(iter->key(), iter->value().ToString());
+  }
+
+  ASSERT_EQ(results.size(), true_data.size());
+  auto data_iter = true_data.begin();
+  for (size_t i = 0; i < results.size(); i++, data_iter++) {
+    auto& kv = results[i];
+    ASSERT_EQ(kv.first, data_iter->first);
+    ASSERT_EQ(kv.second, data_iter->second);
+  }
+
+  delete iter;
+}
+#endif
+
+TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.use_delta_encoding = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  DestroyAndReopen(options);
+
+  std::string numbers[7];
+  for (int val = 0; val <= 6; val++) {
+    PutFixed64(numbers + val, val);
+  }
+
+  // +1 all keys in range [ 0 => 999]
+  for (int i = 0; i < 1000; i++) {
+    WriteOptions wo;
+    ASSERT_OK(db_->Merge(wo, Key(i), numbers[1]));
+  }
+
+  // +2 all keys divisible by 2 in range [ 0 => 999]
+  for (int i = 0; i < 1000; i += 2) {
+    WriteOptions wo;
+    ASSERT_OK(db_->Merge(wo, Key(i), numbers[2]));
+  }
+
+  // +3 all keys divisible by 5 in range [ 0 => 999]
+  for (int i = 0; i < 1000; i += 5) {
+    WriteOptions wo;
+    ASSERT_OK(db_->Merge(wo, Key(i), numbers[3]));
+  }
+
+  ReadOptions ro;
+  ro.pin_data = true;
+  auto iter = NewIterator(ro);
+
+  std::vector<std::pair<Slice, std::string>> results;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string prop_value;
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    results.emplace_back(iter->key(), iter->value().ToString());
+  }
+
+  ASSERT_EQ(results.size(), 1000);
+  for (size_t i = 0; i < results.size(); i++) {
+    auto& kv = results[i];
+    ASSERT_EQ(kv.first, Key(static_cast<int>(i)));
+    int expected_val = 1;
+    if (i % 2 == 0) {
+      expected_val += 2;
+    }
+    if (i % 5 == 0) {
+      expected_val += 3;
+    }
+    ASSERT_EQ(kv.second, numbers[expected_val]);
+  }
+
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.use_delta_encoding = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.write_buffer_size = 100000;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  std::map<std::string, std::string> true_data;
+  for (int i = 0; i < 1000; i++) {
+    std::string k = rnd.RandomString(10);
+    std::string v = rnd.RandomString(1000);
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+  }
+
+  ReadOptions ro;
+  ro.pin_data = true;
+  auto iter = NewIterator(ro);
+
+  // Delete 50% of the keys and update the other 50%
+  for (auto& kv : true_data) {
+    if (rnd.OneIn(2)) {
+      ASSERT_OK(Delete(kv.first));
+    } else {
+      std::string new_val = rnd.RandomString(1000);
+      ASSERT_OK(Put(kv.first, new_val));
+    }
+  }
+
+  std::vector<std::pair<Slice, std::string>> results;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string prop_value;
+    ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+    ASSERT_EQ("1", prop_value);
+    results.emplace_back(iter->key(), iter->value().ToString());
+  }
+
+  auto data_iter = true_data.begin();
+  for (size_t i = 0; i < results.size(); i++, data_iter++) {
+    auto& kv = results[i];
+    ASSERT_EQ(kv.first, data_iter->first);
+    ASSERT_EQ(kv.second, data_iter->second);
+  }
+
+  delete iter;
+}
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+  const char* Name() const override {
+    return "SliceTransformLimitedDomainGeneric";
+  }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 1);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    // prefix will be x????
+    return src.size() >= 1;
+  }
+
+  bool InRange(const Slice& dst) const override {
+    // prefix will be x????
+    return dst.size() == 1;
+  }
+};
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.disable_auto_compactions = true;
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a1", "va1"));
+  ASSERT_OK(Put("a2", "va2"));
+  ASSERT_OK(Put("a3", "va3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b1", "vb1"));
+  ASSERT_OK(Put("b2", "vb2"));
+  ASSERT_OK(Put("b3", "vb3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b4", "vb4"));
+  ASSERT_OK(Put("d1", "vd1"));
+  ASSERT_OK(Put("d2", "vd2"));
+  ASSERT_OK(Put("d4", "vd4"));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(1);
+  {
+    ReadOptions ro;
+    Iterator* iter = NewIterator(ro);
+
+    iter->SeekForPrev("a4");
+    ASSERT_EQ(iter->key().ToString(), "a3");
+    ASSERT_EQ(iter->value().ToString(), "va3");
+
+    iter->SeekForPrev("c2");
+    ASSERT_EQ(iter->key().ToString(), "b3");
+    iter->SeekForPrev("d3");
+    ASSERT_EQ(iter->key().ToString(), "d2");
+    iter->SeekForPrev("b5");
+    ASSERT_EQ(iter->key().ToString(), "b4");
+    delete iter;
+  }
+
+  {
+    ReadOptions ro;
+    ro.prefix_same_as_start = true;
+    Iterator* iter = NewIterator(ro);
+    iter->SeekForPrev("c2");
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) {
+  Options options = CurrentOptions();
+  options.prefix_extractor =
+      std::make_shared<SliceTransformLimitedDomainGeneric>();
+  options.disable_auto_compactions = true;
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a1", "va1"));
+  ASSERT_OK(Put("a2", "va2"));
+  ASSERT_OK(Put("a3", "va3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b1", "vb1"));
+  ASSERT_OK(Put("b2", "vb2"));
+  ASSERT_OK(Put("b3", "vb3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b4", "vb4"));
+  ASSERT_OK(Put("d1", "vd1"));
+  ASSERT_OK(Put("d2", "vd2"));
+  ASSERT_OK(Put("d4", "vd4"));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(1);
+  {
+    ReadOptions ro;
+    Iterator* iter = NewIterator(ro);
+
+    iter->SeekForPrev("a4");
+    ASSERT_EQ(iter->key().ToString(), "a3");
+    ASSERT_EQ(iter->value().ToString(), "va3");
+
+    iter->SeekForPrev("c2");
+    ASSERT_EQ(iter->key().ToString(), "b3");
+    iter->SeekForPrev("d3");
+    ASSERT_EQ(iter->key().ToString(), "d2");
+    iter->SeekForPrev("b5");
+    ASSERT_EQ(iter->key().ToString(), "b4");
+    delete iter;
+  }
+
+  {
+    ReadOptions ro;
+    ro.prefix_same_as_start = true;
+    Iterator* iter = NewIterator(ro);
+    iter->SeekForPrev("c2");
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocks) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1;  // every block will contain one entry
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+  options.disable_auto_compactions = true;
+  options.max_sequential_skip_in_iterations = 8;
+
+  DestroyAndReopen(options);
+
+  // Putting such deletes will force DBIter::Prev() to fallback to a Seek
+  for (int file_num = 0; file_num < 10; file_num++) {
+    ASSERT_OK(Delete("key4"));
+    ASSERT_OK(Flush());
+  }
+
+  // First File containing 5 blocks of puts
+  ASSERT_OK(Put("key1", "val1.0"));
+  ASSERT_OK(Put("key2", "val2.0"));
+  ASSERT_OK(Put("key3", "val3.0"));
+  ASSERT_OK(Put("key4", "val4.0"));
+  ASSERT_OK(Put("key5", "val5.0"));
+  ASSERT_OK(Flush());
+
+  // Second file containing 9 blocks of merge operands
+  ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2"));
+
+  ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3"));
+
+  ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.4"));
+  ASSERT_OK(Flush());
+
+  {
+    ReadOptions ro;
+    ro.fill_cache = false;
+    Iterator* iter = NewIterator(ro);
+
+    iter->SeekToLast();
+    ASSERT_EQ(iter->key().ToString(), "key5");
+    ASSERT_EQ(iter->value().ToString(), "val5.0");
+
+    iter->Prev();
+    ASSERT_EQ(iter->key().ToString(), "key4");
+    ASSERT_EQ(iter->value().ToString(), "val4.0");
+
+    iter->Prev();
+    ASSERT_EQ(iter->key().ToString(), "key3");
+    ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4");
+
+    iter->Prev();
+    ASSERT_EQ(iter->key().ToString(), "key2");
+    ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3");
+
+    iter->Prev();
+    ASSERT_EQ(iter->key().ToString(), "key1");
+    ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2");
+
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+  options.disable_auto_compactions = true;
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.max_sequential_skip_in_iterations = 8;
+  DestroyAndReopen(options);
+
+  const int kNumKeys = 500;
+  // Small number of merge operands to make sure that DBIter::Prev() don't
+  // fall back to Seek()
+  const int kNumMergeOperands = 3;
+  // Use value size that will make sure that every block contain 1 key
+  const int kValSize =
+      static_cast<int>(BlockBasedTableOptions().block_size) * 4;
+  // Percentage of keys that wont get merge operations
+  const int kNoMergeOpPercentage = 20;
+  // Percentage of keys that will be deleted
+  const int kDeletePercentage = 10;
+
+  // For half of the key range we will write multiple deletes first to
+  // force DBIter::Prev() to fall back to Seek()
+  for (int file_num = 0; file_num < 10; file_num++) {
+    for (int i = 0; i < kNumKeys; i += 2) {
+      ASSERT_OK(Delete(Key(i)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  Random rnd(301);
+  std::map<std::string, std::string> true_data;
+  std::string gen_key;
+  std::string gen_val;
+
+  for (int i = 0; i < kNumKeys; i++) {
+    gen_key = Key(i);
+    gen_val = rnd.RandomString(kValSize);
+
+    ASSERT_OK(Put(gen_key, gen_val));
+    true_data[gen_key] = gen_val;
+  }
+  ASSERT_OK(Flush());
+
+  // Separate values and merge operands in different file so that we
+  // make sure that we don't merge them while flushing but actually
+  // merge them in the read path
+  for (int i = 0; i < kNumKeys; i++) {
+    if (rnd.PercentTrue(kNoMergeOpPercentage)) {
+      // Dont give merge operations for some keys
+      continue;
+    }
+
+    for (int j = 0; j < kNumMergeOperands; j++) {
+      gen_key = Key(i);
+      gen_val = rnd.RandomString(kValSize);
+
+      ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val));
+      true_data[gen_key] += "," + gen_val;
+    }
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < kNumKeys; i++) {
+    if (rnd.PercentTrue(kDeletePercentage)) {
+      gen_key = Key(i);
+
+      ASSERT_OK(Delete(gen_key));
+      true_data.erase(gen_key);
+    }
+  }
+  ASSERT_OK(Flush());
+
+  {
+    ReadOptions ro;
+    ro.fill_cache = false;
+    Iterator* iter = NewIterator(ro);
+    auto data_iter = true_data.rbegin();
+
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      data_iter++;
+    }
+    ASSERT_EQ(data_iter, true_data.rend());
+
+    delete iter;
+  }
+
+  {
+    ReadOptions ro;
+    ro.fill_cache = false;
+    Iterator* iter = NewIterator(ro);
+    auto data_iter = true_data.rbegin();
+
+    int entries_right = 0;
+    std::string seek_key;
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      // Verify key/value of current position
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      ASSERT_EQ(iter->value().ToString(), data_iter->second);
+
+      bool restore_position_with_seek = rnd.Uniform(2);
+      if (restore_position_with_seek) {
+        seek_key = iter->key().ToString();
+      }
+
+      // Do some Next() operations the restore the iterator to orignal position
+      int next_count =
+          entries_right > 0 ? rnd.Uniform(std::min(entries_right, 10)) : 0;
+      for (int i = 0; i < next_count; i++) {
+        iter->Next();
+        data_iter--;
+
+        ASSERT_EQ(iter->key().ToString(), data_iter->first);
+        ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      }
+
+      if (restore_position_with_seek) {
+        // Restore orignal position using Seek()
+        iter->Seek(seek_key);
+        for (int i = 0; i < next_count; i++) {
+          data_iter++;
+        }
+
+        ASSERT_EQ(iter->key().ToString(), data_iter->first);
+        ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      } else {
+        // Restore original position using Prev()
+        for (int i = 0; i < next_count; i++) {
+          iter->Prev();
+          data_iter++;
+
+          ASSERT_EQ(iter->key().ToString(), data_iter->first);
+          ASSERT_EQ(iter->value().ToString(), data_iter->second);
+        }
+      }
+
+      entries_right++;
+      data_iter++;
+    }
+    ASSERT_EQ(data_iter, true_data.rend());
+
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, IteratorWithLocalStatistics) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 1000; i++) {
+    // Key 10 bytes / Value 10 bytes
+    ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+  }
+
+  std::atomic<uint64_t> total_next(0);
+  std::atomic<uint64_t> total_next_found(0);
+  std::atomic<uint64_t> total_prev(0);
+  std::atomic<uint64_t> total_prev_found(0);
+  std::atomic<uint64_t> total_bytes(0);
+
+  std::vector<port::Thread> threads;
+  std::function<void()> reader_func_next = [&]() {
+    SetPerfLevel(kEnableCount);
+    get_perf_context()->Reset();
+    Iterator* iter = NewIterator(ReadOptions());
+
+    iter->SeekToFirst();
+    // Seek will bump ITER_BYTES_READ
+    uint64_t bytes = 0;
+    bytes += iter->key().size();
+    bytes += iter->value().size();
+    while (true) {
+      iter->Next();
+      total_next++;
+
+      if (!iter->Valid()) {
+        break;
+      }
+      total_next_found++;
+      bytes += iter->key().size();
+      bytes += iter->value().size();
+    }
+
+    delete iter;
+    ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+    SetPerfLevel(kDisable);
+    total_bytes += bytes;
+  };
+
+  std::function<void()> reader_func_prev = [&]() {
+    SetPerfLevel(kEnableCount);
+    Iterator* iter = NewIterator(ReadOptions());
+
+    iter->SeekToLast();
+    // Seek will bump ITER_BYTES_READ
+    uint64_t bytes = 0;
+    bytes += iter->key().size();
+    bytes += iter->value().size();
+    while (true) {
+      iter->Prev();
+      total_prev++;
+
+      if (!iter->Valid()) {
+        break;
+      }
+      total_prev_found++;
+      bytes += iter->key().size();
+      bytes += iter->value().size();
+    }
+
+    delete iter;
+    ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+    SetPerfLevel(kDisable);
+    total_bytes += bytes;
+  };
+
+  for (int i = 0; i < 10; i++) {
+    threads.emplace_back(reader_func_next);
+  }
+  for (int i = 0; i < 15; i++) {
+    threads.emplace_back(reader_func_prev);
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), (uint64_t)total_next);
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND),
+            (uint64_t)total_next_found);
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), (uint64_t)total_prev);
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND),
+            (uint64_t)total_prev_found);
+  ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ),
+            (uint64_t)total_bytes);
+}
+
+TEST_P(DBIteratorTest, ReadAhead) {
+  Options options;
+  env_->count_random_reads_ = true;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 4 << 20;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  std::string value(1024, 'a');
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("1,1,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  env_->random_read_bytes_counter_ = 0;
+  options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+  ReadOptions read_options;
+  auto* iter = NewIterator(read_options);
+  iter->SeekToFirst();
+  int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS);
+  size_t bytes_read = env_->random_read_bytes_counter_;
+  delete iter;
+
+  int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES);
+  env_->random_read_bytes_counter_ = 0;
+  options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+  read_options.readahead_size = 1024 * 10;
+  iter = NewIterator(read_options);
+  iter->SeekToFirst();
+  int64_t num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS);
+  size_t bytes_read_readahead = env_->random_read_bytes_counter_;
+  delete iter;
+  int64_t num_file_closes_readahead =
+      TestGetTickerCount(options, NO_FILE_CLOSES);
+  ASSERT_EQ(num_file_opens, num_file_opens_readahead);
+  ASSERT_EQ(num_file_closes, num_file_closes_readahead);
+  ASSERT_GT(bytes_read_readahead, bytes_read);
+  ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3);
+
+  // Verify correctness.
+  iter = NewIterator(read_options);
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_EQ(value, iter->value());
+    count++;
+  }
+  ASSERT_EQ(100, count);
+  for (int i = 0; i < 100; i++) {
+    iter->Seek(Key(i));
+    ASSERT_EQ(value, iter->value());
+  }
+  delete iter;
+}
+
+// Insert a key, create a snapshot iterator, overwrite key lots of times,
+// seek to a smaller key. Expect DBIter to fall back to a seek instead of
+// going through all the overwrites linearly.
+TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_sequential_skip_in_iterations = 3;
+  options.prefix_extractor = nullptr;
+  options.write_buffer_size = 1 << 27;  // big enough to avoid flush
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  // Insert.
+  ASSERT_OK(Put("b", "0"));
+
+  // Create iterator.
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+  // Insert a lot.
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
+  }
+
+#ifndef ROCKSDB_LITE
+  // Check that memtable wasn't flushed.
+  std::string val;
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
+  EXPECT_EQ("0", val);
+#endif
+
+  // Seek iterator to a smaller key.
+  get_perf_context()->Reset();
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ("b", iter->key().ToString());
+  EXPECT_EQ("0", iter->value().ToString());
+
+  // Check that the seek didn't do too much work.
+  // Checks are not tight, just make sure that everything is well below 100.
+  EXPECT_LT(get_perf_context()->internal_key_skipped_count, 4);
+  EXPECT_LT(get_perf_context()->internal_recent_skipped_count, 8);
+  EXPECT_LT(get_perf_context()->seek_on_memtable_count, 10);
+  EXPECT_LT(get_perf_context()->next_on_memtable_count, 10);
+  EXPECT_LT(get_perf_context()->prev_on_memtable_count, 10);
+
+  // Check that iterator did something like what we expect.
+  EXPECT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+  EXPECT_EQ(get_perf_context()->internal_merge_count, 0);
+  EXPECT_GE(get_perf_context()->internal_recent_skipped_count, 2);
+  EXPECT_GE(get_perf_context()->seek_on_memtable_count, 2);
+  EXPECT_EQ(1,
+            options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+}
+
+TEST_P(DBIteratorTest, Refresh) {
+  ASSERT_OK(Put("x", "y"));
+
+  std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+  ASSERT_OK(iter->status());
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(Put("c", "d"));
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(iter->status());
+  ASSERT_OK(iter->Refresh());
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+
+  ASSERT_OK(Put("m", "n"));
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(iter->status());
+  ASSERT_OK(iter->Refresh());
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("m")), 0);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  iter.reset();
+}
+
+TEST_P(DBIteratorTest, RefreshWithSnapshot) {
+  ASSERT_OK(Put("x", "y"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ReadOptions options;
+  options.snapshot = snapshot;
+  Iterator* iter = NewIterator(options);
+  ASSERT_OK(iter->status());
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(Put("c", "d"));
+
+  iter->Seek(Slice("a"));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  ASSERT_OK(iter->status());
+  Status s = iter->Refresh();
+  ASSERT_TRUE(s.IsNotSupported());
+  db_->ReleaseSnapshot(snapshot);
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, CreationFailure) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) {
+        *(reinterpret_cast<Status*>(arg)) = Status::Corruption("test status");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Iterator* iter = NewIterator(ReadOptions());
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsCorruption());
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) {
+  Options options = CurrentOptions();
+  options.max_sequential_skip_in_iterations = 3;
+  DestroyAndReopen(options);
+
+  // write a bunch of kvs to the database.
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Put("y", "1"));
+  ASSERT_OK(Put("y1", "1"));
+  ASSERT_OK(Put("y2", "1"));
+  ASSERT_OK(Put("y3", "1"));
+  ASSERT_OK(Put("z", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Put("z", "1"));
+  ASSERT_OK(Put("bar", "1"));
+  ASSERT_OK(Put("foo", "1"));
+
+  std::string upper_bound = "x";
+  Slice ub_slice(upper_bound);
+  ReadOptions ro;
+  ro.iterate_upper_bound = &ub_slice;
+  ro.max_skippable_internal_keys = 1000;
+
+  Iterator* iter = NewIterator(ro);
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key().ToString());
+
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bar", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, TableFilter) {
+  ASSERT_OK(Put("a", "1"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("b", "2"));
+  ASSERT_OK(Put("c", "3"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("d", "4"));
+  ASSERT_OK(Put("e", "5"));
+  ASSERT_OK(Put("f", "6"));
+  EXPECT_OK(dbfull()->Flush(FlushOptions()));
+
+  // Ensure the table_filter callback is called once for each table.
+  {
+    std::set<uint64_t> unseen{1, 2, 3};
+    ReadOptions opts;
+    opts.table_filter = [&](const TableProperties& props) {
+      auto it = unseen.find(props.num_entries);
+      if (it == unseen.end()) {
+        ADD_FAILURE() << "saw table properties with an unexpected "
+                      << props.num_entries << " entries";
+      } else {
+        unseen.erase(it);
+      }
+      return true;
+    };
+    auto iter = NewIterator(opts);
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->1");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->2");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->3");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "d->4");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "e->5");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "f->6");
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(unseen.empty());
+    delete iter;
+  }
+
+  // Ensure returning false in the table_filter hides the keys from that table
+  // during iteration.
+  {
+    ReadOptions opts;
+    opts.table_filter = [](const TableProperties& props) {
+      return props.num_entries != 2;
+    };
+    auto iter = NewIterator(opts);
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->1");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "d->4");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "e->5");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "f->6");
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    delete iter;
+  }
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithPrevReseek) {
+  Options options = CurrentOptions();
+  options.max_sequential_skip_in_iterations = 3;
+  DestroyAndReopen(options);
+
+  // write a bunch of kvs to the database.
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Put("y", "1"));
+  ASSERT_OK(Put("z", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Put("z", "1"));
+  ASSERT_OK(Put("bar", "1"));
+  ASSERT_OK(Put("foo", "1"));
+  ASSERT_OK(Put("foo", "2"));
+
+  ASSERT_OK(Put("foo", "3"));
+  ASSERT_OK(Put("foo", "4"));
+  ASSERT_OK(Put("foo", "5"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Put("foo", "6"));
+
+  std::string upper_bound = "x";
+  Slice ub_slice(upper_bound);
+  ReadOptions ro;
+  ro.snapshot = snapshot;
+  ro.iterate_upper_bound = &ub_slice;
+
+  Iterator* iter = NewIterator(ro);
+  iter->SeekForPrev("goo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key().ToString());
+  iter->Prev();
+
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar", iter->key().ToString());
+
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBIteratorTest, SkipStatistics) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  int skip_count = 0;
+
+  // write a bunch of kvs to the database.
+  ASSERT_OK(Put("a", "1"));
+  ASSERT_OK(Put("b", "1"));
+  ASSERT_OK(Put("c", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("d", "1"));
+  ASSERT_OK(Put("e", "1"));
+  ASSERT_OK(Put("f", "1"));
+  ASSERT_OK(Put("a", "2"));
+  ASSERT_OK(Put("b", "2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Delete("d"));
+  ASSERT_OK(Delete("e"));
+  ASSERT_OK(Delete("f"));
+
+  Iterator* iter = NewIterator(ReadOptions());
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    count++;
+  }
+  ASSERT_EQ(count, 3);
+  delete iter;
+  skip_count += 8;  // 3 deletes + 3 original keys + 2 lower in sequence
+  ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+  iter = NewIterator(ReadOptions());
+  count = 0;
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+    ASSERT_OK(iter->status());
+    count++;
+  }
+  ASSERT_EQ(count, 3);
+  delete iter;
+  skip_count += 8;  // Same as above, but in reverse order
+  ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+  ASSERT_OK(Put("aa", "1"));
+  ASSERT_OK(Put("ab", "1"));
+  ASSERT_OK(Put("ac", "1"));
+  ASSERT_OK(Put("ad", "1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Delete("ab"));
+  ASSERT_OK(Delete("ac"));
+  ASSERT_OK(Delete("ad"));
+
+  ReadOptions ro;
+  Slice prefix("b");
+  ro.iterate_upper_bound = &prefix;
+
+  iter = NewIterator(ro);
+  count = 0;
+  for (iter->Seek("aa"); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    count++;
+  }
+  ASSERT_EQ(count, 1);
+  delete iter;
+  skip_count += 6;  // 3 deletes + 3 original keys
+  ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+  iter = NewIterator(ro);
+  count = 0;
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+    ASSERT_OK(iter->status());
+    count++;
+  }
+  ASSERT_EQ(count, 2);
+  delete iter;
+  // 3 deletes + 3 original keys + lower sequence of "a"
+  skip_count += 7;
+  ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+}
+
+TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ReadOptions ropts;
+  ropts.max_skippable_internal_keys = 2;
+
+  ASSERT_OK(Put("1", "val_1"));
+  // Add more tombstones than max_skippable_internal_keys so that Next() fails.
+  ASSERT_OK(Delete("2"));
+  ASSERT_OK(Delete("3"));
+  ASSERT_OK(Delete("4"));
+  ASSERT_OK(Delete("5"));
+  ASSERT_OK(Put("6", "val_6"));
+
+  std::unique_ptr<Iterator> iter(NewIterator(ropts));
+  iter->SeekToFirst();
+
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "1");
+  ASSERT_EQ(iter->value().ToString(), "val_1");
+
+  // This should fail as incomplete due to too many non-visible internal keys on
+  // the way to the next valid user key.
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_TRUE(iter->status().IsIncomplete());
+
+  // Get the internal key at which Next() failed.
+  std::string prop_value;
+  ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+  ASSERT_EQ("4", prop_value);
+
+  // Create a new iterator to seek to the internal key.
+  std::unique_ptr<Iterator> iter2(NewIterator(ropts));
+  iter2->Seek(prop_value);
+  ASSERT_TRUE(iter2->Valid());
+  ASSERT_OK(iter2->status());
+
+  ASSERT_EQ(iter2->key().ToString(), "6");
+  ASSERT_EQ(iter2->value().ToString(), "val_6");
+}
+
+// Reproduces a former bug where iterator would skip some records when DBIter
+// re-seeks subiterator with Incomplete status.
+TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  // Make sure the sst file has more than one block.
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Two records in sst file, each in its own block.
+  ASSERT_OK(Put("b", ""));
+  ASSERT_OK(Put("d", ""));
+  ASSERT_OK(Flush());
+
+  // Create a nonblocking iterator before writing to memtable.
+  ReadOptions ropt;
+  ropt.read_tier = kBlockCacheTier;
+  std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+  // Overwrite a key in memtable many times to hit
+  // max_sequential_skip_in_iterations (which is 8 by default).
+  for (int i = 0; i < 20; ++i) {
+    ASSERT_OK(Put("c", ""));
+  }
+
+  // Load the second block in sst file into the block cache.
+  {
+    std::unique_ptr<Iterator> iter2(NewIterator(ReadOptions()));
+    iter2->Seek("d");
+  }
+
+  // Finally seek the nonblocking iterator.
+  iter->Seek("a");
+  // With the bug, the status used to be OK, and the iterator used to point to
+  // "d".
+  EXPECT_TRUE(iter->status().IsIncomplete());
+}
+
+TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) {
+  ASSERT_OK(Put("a", ""));
+  ASSERT_OK(Put("b", ""));
+  ASSERT_OK(Flush());
+
+  ReadOptions ropt;
+  Slice ub = "b";
+  ropt.iterate_upper_bound = &ub;
+
+  std::unique_ptr<Iterator> it(dbfull()->NewIterator(ropt));
+  it->SeekForPrev("a");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_OK(it->status());
+  ASSERT_EQ("a", it->key().ToString());
+  it->Next();
+  ASSERT_FALSE(it->Valid());
+  ASSERT_OK(it->status());
+  it->SeekForPrev("a");
+  ASSERT_OK(it->status());
+
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("a", it->key().ToString());
+}
+
+TEST_P(DBIteratorTest, AvoidReseekLevelIterator) {
+  Options options = CurrentOptions();
+  options.compression = CompressionType::kNoCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 800;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  Random rnd(301);
+  std::string random_str = rnd.RandomString(180);
+
+  ASSERT_OK(Put("1", random_str));
+  ASSERT_OK(Put("2", random_str));
+  ASSERT_OK(Put("3", random_str));
+  ASSERT_OK(Put("4", random_str));
+  // A new block
+  ASSERT_OK(Put("5", random_str));
+  ASSERT_OK(Put("6", random_str));
+  ASSERT_OK(Put("7", random_str));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("8", random_str));
+  ASSERT_OK(Put("9", random_str));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  int num_find_file_in_level = 0;
+  int num_idx_blk_seek = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelIterator::Seek:BeforeFindFile",
+      [&](void* /*arg*/) { num_find_file_in_level++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  {
+    std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+    iter->Seek("1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Seek("2");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Seek("3");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(1, num_idx_blk_seek);
+
+    iter->Seek("5");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(2, num_idx_blk_seek);
+
+    iter->Seek("6");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(2, num_idx_blk_seek);
+
+    iter->Seek("7");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(1, num_find_file_in_level);
+    ASSERT_EQ(3, num_idx_blk_seek);
+
+    iter->Seek("8");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(2, num_find_file_in_level);
+    // Still re-seek because "8" is the boundary key, which has
+    // the same user key as the seek key.
+    ASSERT_EQ(4, num_idx_blk_seek);
+
+    iter->Seek("5");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(3, num_find_file_in_level);
+    ASSERT_EQ(5, num_idx_blk_seek);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(3, num_find_file_in_level);
+    ASSERT_EQ(5, num_idx_blk_seek);
+
+    // Seek backward never triggers the index block seek to be skipped
+    iter->Seek("5");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(3, num_find_file_in_level);
+    ASSERT_EQ(6, num_idx_blk_seek);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// MyRocks may change iterate bounds before seek. Simply test to make sure such
+// usage doesn't break iterator.
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
+  Options options = CurrentOptions();
+  options.compression = CompressionType::kNoCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 100;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  std::string value(50, 'v');
+  Reopen(options);
+  ASSERT_OK(Put("aaa", value));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("bbb", "v"));
+  ASSERT_OK(Put("ccc", "v"));
+  ASSERT_OK(Put("ddd", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("eee", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  std::string ub1 = "e";
+  std::string ub2 = "c";
+  Slice ub(ub1);
+  ReadOptions read_opts1;
+  read_opts1.iterate_upper_bound = &ub;
+  Iterator* iter = NewIterator(read_opts1);
+  // Seek and iterate accross block boundary.
+  iter->Seek("b");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bbb", iter->key());
+  ub = Slice(ub2);
+  iter->Seek("b");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bbb", iter->key());
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  delete iter;
+
+  std::string lb1 = "a";
+  std::string lb2 = "c";
+  Slice lb(lb1);
+  ReadOptions read_opts2;
+  read_opts2.iterate_lower_bound = &lb;
+  iter = NewIterator(read_opts2);
+  iter->SeekForPrev("d");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("ccc", iter->key());
+  lb = Slice(lb2);
+  iter->SeekForPrev("d");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("ccc", iter->key());
+  iter->Prev();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  delete iter;
+}
+
+TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) {
+  ASSERT_OK(Put("aaa", "v"));
+  ASSERT_OK(Put("bbb", "v"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ccc", "v"));
+  ASSERT_OK(Put("ddd", "v"));
+  ASSERT_OK(Flush());
+  // Move both files to bottom level.
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  Slice lower_bound("b");
+  ReadOptions read_opts;
+  read_opts.iterate_lower_bound = &lower_bound;
+  std::unique_ptr<Iterator> iter(NewIterator(read_opts));
+  iter->SeekForPrev("d");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("ccc", iter->key());
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bbb", iter->key());
+  iter->Prev();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
+TEST_P(DBIteratorTest, Blob) {
+  Options options = CurrentOptions();
+  options.enable_blob_files = true;
+  options.max_sequential_skip_in_iterations = 2;
+  options.statistics = CreateDBStatistics();
+
+  Reopen(options);
+
+  // Note: we have 4 KVs (3 of which are hidden) for key "b" and
+  // max_sequential_skip_in_iterations is set to 2. Thus, we need to do a reseek
+  // anytime we move from "b" to "c" or vice versa.
+  ASSERT_OK(Put("a", "va"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("b", "vb0"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("b", "vb1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("b", "vb2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("b", "vb3"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("c", "vc"));
+  ASSERT_OK(Flush());
+
+  std::unique_ptr<Iterator> iter_guard(NewIterator(ReadOptions()));
+  Iterator* const iter = iter_guard.get();
+
+  iter->SeekToFirst();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+  iter->SeekToFirst();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->SeekToLast();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+  iter->SeekToLast();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->Seek("");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Seek("a");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Seek("ax");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+  iter->SeekForPrev("d");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->SeekForPrev("c");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->SeekForPrev("bx");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+  iter->Seek("b");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+  iter->Seek("z");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3);
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+  iter->SeekForPrev("b");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+  iter->SeekForPrev("");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  // Switch from reverse to forward
+  iter->SeekToLast();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5);
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+  // Switch from forward to reverse
+  iter->SeekToFirst();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 7);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 8);
+  ASSERT_EQ(IterStatus(iter), "b->vb3");
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
+                        testing::Values(true, false));
+
+// Tests how DBIter work with ReadCallback
+class DBIteratorWithReadCallbackTest : public DBIteratorTest {};
+
+TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) {
+  class TestReadCallback : public ReadCallback {
+   public:
+    explicit TestReadCallback(SequenceNumber _max_visible_seq)
+        : ReadCallback(_max_visible_seq) {}
+
+    bool IsVisibleFullCheck(SequenceNumber seq) override {
+      return seq <= max_visible_seq_;
+    }
+  };
+
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("foo", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  ASSERT_OK(Put("a", "va"));
+  ASSERT_OK(Put("z", "vz"));
+  SequenceNumber seq1 = db_->GetLatestSequenceNumber();
+  TestReadCallback callback1(seq1);
+  ASSERT_OK(Put("foo", "v4"));
+  ASSERT_OK(Put("foo", "v5"));
+  ASSERT_OK(Put("bar", "v7"));
+
+  SequenceNumber seq2 = db_->GetLatestSequenceNumber();
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd();
+  // The iterator are suppose to see data before seq1.
+  Iterator* iter =
+      dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1);
+
+  // Seek
+  // The latest value of "foo" before seq1 is "v3"
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key
+  // "foo".
+  iter->Seek("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+
+  // Next
+  // Seek to "a"
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("va", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key
+  // "foo".
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+
+  // Prev
+  // Seek to "z"
+  iter->Seek("z");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("vz", iter->value());
+  // The previous key is "foo", which is visible to the iterator.
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key "a".
+  iter->Prev();  // skipping "bar"
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("a", iter->key());
+  ASSERT_EQ("va", iter->value());
+
+  // SeekForPrev
+  // The previous key is "foo", which is visible to the iterator.
+  iter->SeekForPrev("y");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v3", iter->value());
+  // "bar" is not visible to the iterator. It will move on to the next key "a".
+  iter->SeekForPrev("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("a", iter->key());
+  ASSERT_EQ("va", iter->value());
+
+  delete iter;
+
+  // Prev beyond max_sequential_skip_in_iterations
+  uint64_t num_versions =
+      CurrentOptions().max_sequential_skip_in_iterations + 10;
+  for (uint64_t i = 0; i < num_versions; i++) {
+    ASSERT_OK(Put("bar", std::to_string(i)));
+  }
+  SequenceNumber seq3 = db_->GetLatestSequenceNumber();
+  TestReadCallback callback2(seq3);
+  ASSERT_OK(Put("bar", "v8"));
+  SequenceNumber seq4 = db_->GetLatestSequenceNumber();
+
+  // The iterator is suppose to see data before seq3.
+  iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2);
+  // Seek to "z", which is visible.
+  iter->Seek("z");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("vz", iter->value());
+  // Previous key is "foo" and the last value "v5" is visible.
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("foo", iter->key());
+  ASSERT_EQ("v5", iter->value());
+  // Since the number of values of "bar" is more than
+  // max_sequential_skip_in_iterations, Prev() will ultimately fallback to
+  // seek in forward direction. Here we test the fallback seek is correct.
+  // The last visible value should be (num_versions - 1), as "v8" is not
+  // visible.
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("bar", iter->key());
+  ASSERT_EQ(std::to_string(num_versions - 1), iter->value());
+
+  delete iter;
+}
+
+TEST_F(DBIteratorTest, BackwardIterationOnInplaceUpdateMemtable) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.inplace_update_support = false;
+  options.env = env_;
+  DestroyAndReopen(options);
+  constexpr int kNumKeys = 10;
+
+  // Write kNumKeys to WAL.
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    int count = 0;
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      ++count;
+    }
+    ASSERT_EQ(kNumKeys, count);
+  }
+
+  // Reopen and rebuild the memtable from WAL.
+  options.create_if_missing = false;
+  options.avoid_flush_during_recovery = true;
+  options.inplace_update_support = true;
+  options.allow_concurrent_memtable_write = false;
+  Reopen(options);
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->SeekToLast();
+    // Backward iteration not supported due to inplace_update_support = true.
+    ASSERT_TRUE(iter->status().IsNotSupported());
+    ASSERT_FALSE(iter->Valid());
+  }
+}
+
+TEST_F(DBIteratorTest, IteratorRefreshReturnSV) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  std::unique_ptr<Iterator> iter{db_->NewIterator(ReadOptions())};
+  SyncPoint::GetInstance()->SetCallBack(
+      "ArenaWrappedDBIter::Refresh:SV", [&](void*) {
+        ASSERT_OK(db_->Put(WriteOptions(), "dummy", "new SV"));
+        // This makes the local SV obselete.
+        ASSERT_OK(Flush());
+        SyncPoint::GetInstance()->DisableProcessing();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(iter->Refresh());
+  iter.reset();
+  // iter used to not cleanup SV, so the Close() below would hit an assertion
+  // error.
+  Close();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_kv_checksum_test.cc b/src/rocksdb/db/db_kv_checksum_test.cc
new file mode 100644
index 000000000..614399243
--- /dev/null
+++ b/src/rocksdb/db/db_kv_checksum_test.cc
@@ -0,0 +1,885 @@
+//  Copyright (c) 2020-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class WriteBatchOpType {
+  kPut = 0,
+  kDelete,
+  kSingleDelete,
+  kMerge,
+  kPutEntity,
+  kDeleteRange,
+  kNum,
+};
+
+// Integer addition is needed for `::testing::Range()` to take the enum type.
+WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) {
+  using T = std::underlying_type<WriteBatchOpType>::type;
+  return static_cast<WriteBatchOpType>(static_cast<T>(lhs) + rhs);
+}
+
+enum class WriteMode {
+  // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key = 0`
+  // and `WriteOptions::protection_bytes_per_key = 0`
+  kWriteUnprotectedBatch = 0,
+  // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key > 0`.
+  kWriteProtectedBatch,
+  // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key == 0`.
+  // Protection is enabled via `WriteOptions::protection_bytes_per_key > 0`.
+  kWriteOptionProtectedBatch,
+  // TODO(ajkr): add a mode that uses `Write()` wrappers, e.g., `Put()`.
+  kNum,
+};
+
+// Integer addition is needed for `::testing::Range()` to take the enum type.
+WriteMode operator+(WriteMode lhs, const int rhs) {
+  using T = std::underlying_type<WriteMode>::type;
+  return static_cast<WriteMode>(static_cast<T>(lhs) + rhs);
+}
+
+std::pair<WriteBatch, Status> GetWriteBatch(ColumnFamilyHandle* cf_handle,
+                                            size_t protection_bytes_per_key,
+                                            WriteBatchOpType op_type) {
+  Status s;
+  WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */,
+                protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+  switch (op_type) {
+    case WriteBatchOpType::kPut:
+      s = wb.Put(cf_handle, "key", "val");
+      break;
+    case WriteBatchOpType::kDelete:
+      s = wb.Delete(cf_handle, "key");
+      break;
+    case WriteBatchOpType::kSingleDelete:
+      s = wb.SingleDelete(cf_handle, "key");
+      break;
+    case WriteBatchOpType::kDeleteRange:
+      s = wb.DeleteRange(cf_handle, "begin", "end");
+      break;
+    case WriteBatchOpType::kMerge:
+      s = wb.Merge(cf_handle, "key", "val");
+      break;
+    case WriteBatchOpType::kPutEntity:
+      s = wb.PutEntity(cf_handle, "key",
+                       {{"attr_name1", "foo"}, {"attr_name2", "bar"}});
+      break;
+    case WriteBatchOpType::kNum:
+      assert(false);
+  }
+  return {std::move(wb), std::move(s)};
+}
+
+class DbKvChecksumTestBase : public DBTestBase {
+ public:
+  DbKvChecksumTestBase(const std::string& path, bool env_do_fsync)
+      : DBTestBase(path, env_do_fsync) {}
+
+  ColumnFamilyHandle* GetCFHandleToUse(ColumnFamilyHandle* column_family,
+                                       WriteBatchOpType op_type) const {
+    // Note: PutEntity cannot be called without column family
+    if (op_type == WriteBatchOpType::kPutEntity && !column_family) {
+      return db_->DefaultColumnFamily();
+    }
+
+    return column_family;
+  }
+};
+
+class DbKvChecksumTest
+    : public DbKvChecksumTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<WriteBatchOpType, char, WriteMode,
+                     uint32_t /* memtable_protection_bytes_per_key */>> {
+ public:
+  DbKvChecksumTest()
+      : DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
+    op_type_ = std::get<0>(GetParam());
+    corrupt_byte_addend_ = std::get<1>(GetParam());
+    write_mode_ = std::get<2>(GetParam());
+    memtable_protection_bytes_per_key_ = std::get<3>(GetParam());
+  }
+
+  Status ExecuteWrite(ColumnFamilyHandle* cf_handle) {
+    switch (write_mode_) {
+      case WriteMode::kWriteUnprotectedBatch: {
+        auto batch_and_status =
+            GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+                          0 /* protection_bytes_per_key */, op_type_);
+        assert(batch_and_status.second.ok());
+        // Default write option has protection_bytes_per_key = 0
+        return db_->Write(WriteOptions(), &batch_and_status.first);
+      }
+      case WriteMode::kWriteProtectedBatch: {
+        auto batch_and_status =
+            GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+                          8 /* protection_bytes_per_key */, op_type_);
+        assert(batch_and_status.second.ok());
+        return db_->Write(WriteOptions(), &batch_and_status.first);
+      }
+      case WriteMode::kWriteOptionProtectedBatch: {
+        auto batch_and_status =
+            GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+                          0 /* protection_bytes_per_key */, op_type_);
+        assert(batch_and_status.second.ok());
+        WriteOptions write_opts;
+        write_opts.protection_bytes_per_key = 8;
+        return db_->Write(write_opts, &batch_and_status.first);
+      }
+      case WriteMode::kNum:
+        assert(false);
+    }
+    return Status::NotSupported("WriteMode " +
+                                std::to_string(static_cast<int>(write_mode_)));
+  }
+
+  void CorruptNextByteCallBack(void* arg) {
+    Slice encoded = *static_cast<Slice*>(arg);
+    if (entry_len_ == std::numeric_limits<size_t>::max()) {
+      // We learn the entry size on the first attempt
+      entry_len_ = encoded.size();
+    }
+    char* buf = const_cast<char*>(encoded.data());
+    buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+    ++corrupt_byte_offset_;
+  }
+
+  bool MoreBytesToCorrupt() { return corrupt_byte_offset_ < entry_len_; }
+
+ protected:
+  WriteBatchOpType op_type_;
+  char corrupt_byte_addend_;
+  WriteMode write_mode_;
+  uint32_t memtable_protection_bytes_per_key_;
+  size_t corrupt_byte_offset_ = 0;
+  size_t entry_len_ = std::numeric_limits<size_t>::max();
+};
+
+std::string GetOpTypeString(const WriteBatchOpType& op_type) {
+  switch (op_type) {
+    case WriteBatchOpType::kPut:
+      return "Put";
+    case WriteBatchOpType::kDelete:
+      return "Delete";
+    case WriteBatchOpType::kSingleDelete:
+      return "SingleDelete";
+    case WriteBatchOpType::kDeleteRange:
+      return "DeleteRange";
+    case WriteBatchOpType::kMerge:
+      return "Merge";
+    case WriteBatchOpType::kPutEntity:
+      return "PutEntity";
+    case WriteBatchOpType::kNum:
+      assert(false);
+  }
+  assert(false);
+  return "";
+}
+
+std::string GetWriteModeString(const WriteMode& mode) {
+  switch (mode) {
+    case WriteMode::kWriteUnprotectedBatch:
+      return "WriteUnprotectedBatch";
+    case WriteMode::kWriteProtectedBatch:
+      return "WriteProtectedBatch";
+    case WriteMode::kWriteOptionProtectedBatch:
+      return "kWriteOptionProtectedBatch";
+    case WriteMode::kNum:
+      assert(false);
+  }
+  return "";
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DbKvChecksumTest, DbKvChecksumTest,
+    ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+                                        WriteBatchOpType::kNum),
+                       ::testing::Values(2, 103, 251),
+                       ::testing::Range(WriteMode::kWriteProtectedBatch,
+                                        WriteMode::kNum),
+                       ::testing::Values(0)),
+    [](const testing::TestParamInfo<
+        std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
+      std::ostringstream oss;
+      oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
+          << static_cast<int>(
+                 static_cast<unsigned char>(std::get<1>(args.param)))
+          << GetWriteModeString(std::get<2>(args.param))
+          << static_cast<uint32_t>(std::get<3>(args.param));
+      return oss.str();
+    });
+
+// TODO(ajkr): add a test that corrupts the `WriteBatch` contents. Such
+// corruptions should only be detectable in `WriteMode::kWriteProtectedBatch`.
+
+TEST_P(DbKvChecksumTest, MemTableAddCorrupted) {
+  // This test repeatedly attempts to write `WriteBatch`es containing a single
+  // entry of type `op_type_`. Each attempt has one byte corrupted in its
+  // memtable entry by adding `corrupt_byte_addend_` to its original value. The
+  // test repeats until an attempt has been made on each byte in the encoded
+  // memtable entry. All attempts are expected to fail with `Status::Corruption`
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+
+  while (MoreBytesToCorrupt()) {
+    // Failed memtable insert always leads to read-only mode, so we have to
+    // reopen for every attempt.
+    Options options = CurrentOptions();
+    if (op_type_ == WriteBatchOpType::kMerge) {
+      options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    }
+    Reopen(options);
+
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+    SyncPoint::GetInstance()->DisableProcessing();
+
+    // In case the above callback is not invoked, this test will run
+    // numeric_limits<size_t>::max() times until it reports an error (or will
+    // exhaust disk space). Added this assert to report error early.
+    ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+  }
+}
+
+TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) {
+  // This test repeatedly attempts to write `WriteBatch`es containing a single
+  // entry of type `op_type_` to a non-default column family. Each attempt has
+  // one byte corrupted in its memtable entry by adding `corrupt_byte_addend_`
+  // to its original value. The test repeats until an attempt has been made on
+  // each byte in the encoded memtable entry. All attempts are expected to fail
+  // with `Status::Corruption`.
+  Options options = CurrentOptions();
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+  CreateAndReopenWithCF({"pikachu"}, options);
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+
+  while (MoreBytesToCorrupt()) {
+    // Failed memtable insert always leads to read-only mode, so we have to
+    // reopen for every attempt.
+    ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_TRUE(ExecuteWrite(handles_[1]).IsCorruption());
+    SyncPoint::GetInstance()->DisableProcessing();
+
+    // In case the above callback is not invoked, this test will run
+    // numeric_limits<size_t>::max() times until it reports an error (or will
+    // exhaust disk space). Added this assert to report error early.
+    ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+  }
+}
+
+TEST_P(DbKvChecksumTest, NoCorruptionCase) {
+  // If this test fails, we may have found a piece of malfunctioned hardware
+  auto batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(nullptr, op_type_),
+                    8 /* protection_bytes_per_key */, op_type_);
+  ASSERT_OK(batch_and_status.second);
+  ASSERT_OK(batch_and_status.first.VerifyChecksum());
+}
+
+TEST_P(DbKvChecksumTest, WriteToWALCorrupted) {
+  // This test repeatedly attempts to write `WriteBatch`es containing a single
+  // entry of type `op_type_`. Each attempt has one byte corrupted by adding
+  // `corrupt_byte_addend_` to its original value. The test repeats until an
+  // attempt has been made on each byte in the encoded write batch. All attempts
+  // are expected to fail with `Status::Corruption`
+  Options options = CurrentOptions();
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::WriteToWAL:log_entry",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  // First 8 bytes are for sequence number which is not protected in write batch
+  corrupt_byte_offset_ = 8;
+
+  while (MoreBytesToCorrupt()) {
+    // Corrupted write batch leads to read-only mode, so we have to
+    // reopen for every attempt.
+    Reopen(options);
+    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+    // Confirm that nothing was written to WAL
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+    SyncPoint::GetInstance()->DisableProcessing();
+
+    // In case the above callback is not invoked, this test will run
+    // numeric_limits<size_t>::max() times until it reports an error (or will
+    // exhaust disk space). Added this assert to report error early.
+    ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+  }
+}
+
+TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) {
+  // This test repeatedly attempts to write `WriteBatch`es containing a single
+  // entry of type `op_type_`. Each attempt has one byte corrupted by adding
+  // `corrupt_byte_addend_` to its original value. The test repeats until an
+  // attempt has been made on each byte in the encoded write batch. All attempts
+  // are expected to fail with `Status::Corruption`
+  Options options = CurrentOptions();
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+  CreateAndReopenWithCF({"pikachu"}, options);
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::WriteToWAL:log_entry",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  // First 8 bytes are for sequence number which is not protected in write batch
+  corrupt_byte_offset_ = 8;
+
+  while (MoreBytesToCorrupt()) {
+    // Corrupted write batch leads to read-only mode, so we have to
+    // reopen for every attempt.
+    ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+
+    SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+    // Confirm that nothing was written to WAL
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+    SyncPoint::GetInstance()->DisableProcessing();
+
+    // In case the above callback is not invoked, this test will run
+    // numeric_limits<size_t>::max() times until it reports an error (or will
+    // exhaust disk space). Added this assert to report error early.
+    ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+  }
+}
+
+class DbKvChecksumTestMergedBatch
+    : public DbKvChecksumTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<WriteBatchOpType, WriteBatchOpType, char>> {
+ public:
+  DbKvChecksumTestMergedBatch()
+      : DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
+    op_type1_ = std::get<0>(GetParam());
+    op_type2_ = std::get<1>(GetParam());
+    corrupt_byte_addend_ = std::get<2>(GetParam());
+  }
+
+ protected:
+  WriteBatchOpType op_type1_;
+  WriteBatchOpType op_type2_;
+  char corrupt_byte_addend_;
+};
+
+void CorruptWriteBatch(Slice* content, size_t offset,
+                       char corrupt_byte_addend) {
+  ASSERT_TRUE(offset < content->size());
+  char* buf = const_cast<char*>(content->data());
+  buf[offset] += corrupt_byte_addend;
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, NoCorruptionCase) {
+  // Veirfy write batch checksum after write batch append
+  auto batch1 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+                              8 /* protection_bytes_per_key */, op_type1_);
+  ASSERT_OK(batch1.second);
+  auto batch2 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+                              8 /* protection_bytes_per_key */, op_type2_);
+  ASSERT_OK(batch2.second);
+  ASSERT_OK(WriteBatchInternal::Append(&batch1.first, &batch2.first));
+  ASSERT_OK(batch1.first.VerifyChecksum());
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
+  // This test has two writers repeatedly attempt to write `WriteBatch`es
+  // containing a single entry of type op_type1_ and op_type2_ respectively. The
+  // leader of the write group writes the batch containinng the entry of type
+  // op_type1_. One byte of the pre-merged write batches is corrupted by adding
+  // `corrupt_byte_addend_` to the batch's original value during each attempt.
+  // The test repeats until an attempt has been made on each byte in both
+  // pre-merged write batches. All attempts are expected to fail with
+  // `Status::Corruption`.
+  Options options = CurrentOptions();
+  if (op_type1_ == WriteBatchOpType::kMerge ||
+      op_type2_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  auto leader_batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+                    8 /* protection_bytes_per_key */, op_type1_);
+  ASSERT_OK(leader_batch_and_status.second);
+  auto follower_batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+                    8 /* protection_bytes_per_key */, op_type2_);
+  size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
+  size_t total_bytes =
+      leader_batch_size + follower_batch_and_status.first.GetDataSize();
+  // First 8 bytes are for sequence number which is not protected in write batch
+  size_t corrupt_byte_offset = 8;
+
+  std::atomic<bool> follower_joined{false};
+  std::atomic<int> leader_count{0};
+  port::Thread follower_thread;
+  // This callback should only be called by the leader thread
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
+        auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
+        ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
+
+        // This callback should only be called by the follower thread
+        SyncPoint::GetInstance()->SetCallBack(
+            "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
+              auto* follower =
+                  reinterpret_cast<WriteThread::Writer*>(arg_follower);
+              // The leader thread will wait on this bool and hence wait until
+              // this writer joins the write group
+              ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
+              if (corrupt_byte_offset >= leader_batch_size) {
+                Slice batch_content = follower->batch->Data();
+                CorruptWriteBatch(&batch_content,
+                                  corrupt_byte_offset - leader_batch_size,
+                                  corrupt_byte_addend_);
+              }
+              // Leader busy waits on this flag
+              follower_joined = true;
+              // So the follower does not enter the outer callback at
+              // WriteThread::JoinBatchGroup:Wait2
+              SyncPoint::GetInstance()->DisableProcessing();
+            });
+
+        // Start the other writer thread which will join the write group as
+        // follower
+        follower_thread = port::Thread([&]() {
+          follower_batch_and_status =
+              GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+                            8 /* protection_bytes_per_key */, op_type2_);
+          ASSERT_OK(follower_batch_and_status.second);
+          ASSERT_TRUE(
+              db_->Write(WriteOptions(), &follower_batch_and_status.first)
+                  .IsCorruption());
+        });
+
+        ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
+        if (corrupt_byte_offset < leader_batch_size) {
+          Slice batch_content = leader->batch->Data();
+          CorruptWriteBatch(&batch_content, corrupt_byte_offset,
+                            corrupt_byte_addend_);
+        }
+        leader_count++;
+        while (!follower_joined) {
+          // busy waiting
+        }
+      });
+  while (corrupt_byte_offset < total_bytes) {
+    // Reopen DB since it failed WAL write which lead to read-only mode
+    Reopen(options);
+    SyncPoint::GetInstance()->EnableProcessing();
+    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    leader_batch_and_status =
+        GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+                      8 /* protection_bytes_per_key */, op_type1_);
+    ASSERT_OK(leader_batch_and_status.second);
+    ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
+                    .IsCorruption());
+    follower_thread.join();
+    // Prevent leader thread from entering this callback
+    SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
+    ASSERT_EQ(1, leader_count);
+    // Nothing should have been written to WAL
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+
+    corrupt_byte_offset++;
+    if (corrupt_byte_offset == leader_batch_size) {
+      // skip over the sequence number part of follower's write batch
+      corrupt_byte_offset += 8;
+    }
+    follower_joined = false;
+    leader_count = 0;
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
+  // This test has two writers repeatedly attempt to write `WriteBatch`es
+  // containing a single entry of type op_type1_ and op_type2_ respectively. The
+  // leader of the write group writes the batch containinng the entry of type
+  // op_type1_. One byte of the pre-merged write batches is corrupted by adding
+  // `corrupt_byte_addend_` to the batch's original value during each attempt.
+  // The test repeats until an attempt has been made on each byte in both
+  // pre-merged write batches. All attempts are expected to fail with
+  // `Status::Corruption`.
+  Options options = CurrentOptions();
+  if (op_type1_ == WriteBatchOpType::kMerge ||
+      op_type2_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+  CreateAndReopenWithCF({"ramen"}, options);
+
+  auto leader_batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
+                    8 /* protection_bytes_per_key */, op_type1_);
+  ASSERT_OK(leader_batch_and_status.second);
+  auto follower_batch_and_status =
+      GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
+                    8 /* protection_bytes_per_key */, op_type2_);
+  size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
+  size_t total_bytes =
+      leader_batch_size + follower_batch_and_status.first.GetDataSize();
+  // First 8 bytes are for sequence number which is not protected in write batch
+  size_t corrupt_byte_offset = 8;
+
+  std::atomic<bool> follower_joined{false};
+  std::atomic<int> leader_count{0};
+  port::Thread follower_thread;
+  // This callback should only be called by the leader thread
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
+        auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
+        ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
+
+        // This callback should only be called by the follower thread
+        SyncPoint::GetInstance()->SetCallBack(
+            "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
+              auto* follower =
+                  reinterpret_cast<WriteThread::Writer*>(arg_follower);
+              // The leader thread will wait on this bool and hence wait until
+              // this writer joins the write group
+              ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
+              if (corrupt_byte_offset >= leader_batch_size) {
+                Slice batch_content =
+                    WriteBatchInternal::Contents(follower->batch);
+                CorruptWriteBatch(&batch_content,
+                                  corrupt_byte_offset - leader_batch_size,
+                                  corrupt_byte_addend_);
+              }
+              follower_joined = true;
+              // So the follower does not enter the outer callback at
+              // WriteThread::JoinBatchGroup:Wait2
+              SyncPoint::GetInstance()->DisableProcessing();
+            });
+
+        // Start the other writer thread which will join the write group as
+        // follower
+        follower_thread = port::Thread([&]() {
+          follower_batch_and_status =
+              GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
+                            8 /* protection_bytes_per_key */, op_type2_);
+          ASSERT_OK(follower_batch_and_status.second);
+          ASSERT_TRUE(
+              db_->Write(WriteOptions(), &follower_batch_and_status.first)
+                  .IsCorruption());
+        });
+
+        ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
+        if (corrupt_byte_offset < leader_batch_size) {
+          Slice batch_content = WriteBatchInternal::Contents(leader->batch);
+          CorruptWriteBatch(&batch_content, corrupt_byte_offset,
+                            corrupt_byte_addend_);
+        }
+        leader_count++;
+        while (!follower_joined) {
+          // busy waiting
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  while (corrupt_byte_offset < total_bytes) {
+    // Reopen DB since it failed WAL write which lead to read-only mode
+    ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options);
+    SyncPoint::GetInstance()->EnableProcessing();
+    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    leader_batch_and_status =
+        GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
+                      8 /* protection_bytes_per_key */, op_type1_);
+    ASSERT_OK(leader_batch_and_status.second);
+    ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
+                    .IsCorruption());
+    follower_thread.join();
+    // Prevent leader thread from entering this callback
+    SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
+
+    ASSERT_EQ(1, leader_count);
+    // Nothing should have been written to WAL
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+
+    corrupt_byte_offset++;
+    if (corrupt_byte_offset == leader_batch_size) {
+      // skip over the sequence number part of follower's write batch
+      corrupt_byte_offset += 8;
+    }
+    follower_joined = false;
+    leader_count = 0;
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DbKvChecksumTestMergedBatch, DbKvChecksumTestMergedBatch,
+    ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+                                        WriteBatchOpType::kNum),
+                       ::testing::Range(static_cast<WriteBatchOpType>(0),
+                                        WriteBatchOpType::kNum),
+                       ::testing::Values(2, 103, 251)),
+    [](const testing::TestParamInfo<
+        std::tuple<WriteBatchOpType, WriteBatchOpType, char>>& args) {
+      std::ostringstream oss;
+      oss << GetOpTypeString(std::get<0>(args.param))
+          << GetOpTypeString(std::get<1>(args.param)) << "Add"
+          << static_cast<int>(
+                 static_cast<unsigned char>(std::get<2>(args.param)));
+      return oss.str();
+    });
+
+// TODO: add test for transactions
+// TODO: add test for corrupted write batch with WAL disabled
+
+class DbKVChecksumWALToWriteBatchTest : public DBTestBase {
+ public:
+  DbKVChecksumWALToWriteBatchTest()
+      : DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {}
+};
+
+TEST_F(DbKVChecksumWALToWriteBatchTest, WriteBatchChecksumHandoff) {
+  Options options = CurrentOptions();
+  Reopen(options);
+  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+  std::string content = "";
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch",
+      [&](void* batch_ptr) {
+        WriteBatch* batch = reinterpret_cast<WriteBatch*>(batch_ptr);
+        content.assign(batch->Data().data(), batch->GetDataSize());
+        Slice batch_content = batch->Data();
+        // Corrupt first bit
+        CorruptWriteBatch(&batch_content, 0, 1);
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
+      [&](void* checksum_ptr) {
+        // Verify that checksum is produced on the batch content
+        uint64_t checksum = *reinterpret_cast<uint64_t*>(checksum_ptr);
+        ASSERT_EQ(checksum, XXH3_64bits(content.data(), content.size()));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_TRUE(TryReopen(options).IsCorruption());
+  SyncPoint::GetInstance()->DisableProcessing();
+};
+
+// TODO (cbi): add DeleteRange coverage once it is implemented
+class DbMemtableKVChecksumTest : public DbKvChecksumTest {
+ public:
+  DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
+
+ protected:
+  // Indices in the memtable entry that we will not corrupt.
+  // For memtable entry format, see comments in MemTable::Add().
+  // We do not corrupt key length and value length fields in this test
+  // case since it causes segfault and ASAN will complain.
+  // For this test case, key and value are all of length 3, so
+  // key length field is at index 0 and value length field is at index 12.
+  const std::set<size_t> index_not_to_corrupt{0, 12};
+
+  void SkipNotToCorruptEntry() {
+    if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
+        index_not_to_corrupt.end()) {
+      corrupt_byte_offset_++;
+    }
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DbMemtableKVChecksumTest, DbMemtableKVChecksumTest,
+    ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+                                        WriteBatchOpType::kDeleteRange),
+                       ::testing::Values(2, 103, 251),
+                       ::testing::Range(static_cast<WriteMode>(0),
+                                        WriteMode::kWriteOptionProtectedBatch),
+                       // skip 1 byte checksum as it makes test flaky
+                       ::testing::Values(2, 4, 8)),
+    [](const testing::TestParamInfo<
+        std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
+      std::ostringstream oss;
+      oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
+          << static_cast<int>(
+                 static_cast<unsigned char>(std::get<1>(args.param)))
+          << GetWriteModeString(std::get<2>(args.param))
+          << static_cast<uint32_t>(std::get<3>(args.param));
+      return oss.str();
+    });
+
+TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
+  // Record memtable entry size.
+  // Not corrupting memtable entry here since it will segfault
+  // or fail some asserts inside memtablerep implementation
+  // e.g., when key_len is corrupted.
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
+        Slice encoded = *static_cast<Slice*>(arg);
+        entry_len_ = encoded.size();
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "Memtable::SaveValue:Begin:entry", [&](void* entry) {
+        char* buf = *static_cast<char**>(entry);
+        buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+        ++corrupt_byte_offset_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  while (MoreBytesToCorrupt()) {
+    Reopen(options);
+    ASSERT_OK(ExecuteWrite(nullptr));
+    std::string val;
+    ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption());
+    Destroy(options);
+    SkipNotToCorruptEntry();
+  }
+}
+
+TEST_P(DbMemtableKVChecksumTest,
+       GetWithColumnFamilyCorruptAfterMemtableInsert) {
+  // Record memtable entry size.
+  // Not corrupting memtable entry here since it will segfault
+  // or fail some asserts inside memtablerep implementation
+  // e.g., when key_len is corrupted.
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
+        Slice encoded = *static_cast<Slice*>(arg);
+        entry_len_ = encoded.size();
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "Memtable::SaveValue:Begin:entry", [&](void* entry) {
+        char* buf = *static_cast<char**>(entry);
+        buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+        ++corrupt_byte_offset_;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  while (MoreBytesToCorrupt()) {
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(ExecuteWrite(handles_[1]));
+    std::string val;
+    ASSERT_TRUE(
+        db_->Get(ReadOptions(), handles_[1], "key", &val).IsCorruption());
+    Destroy(options);
+    SkipNotToCorruptEntry();
+  }
+}
+
+TEST_P(DbMemtableKVChecksumTest, IteratorWithCorruptAfterMemtableInsert) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  while (MoreBytesToCorrupt()) {
+    Reopen(options);
+    ASSERT_OK(ExecuteWrite(nullptr));
+    Iterator* it = db_->NewIterator(ReadOptions());
+    it->SeekToFirst();
+    ASSERT_FALSE(it->Valid());
+    ASSERT_TRUE(it->status().IsCorruption());
+    delete it;
+    Destroy(options);
+    SkipNotToCorruptEntry();
+  }
+}
+
+TEST_P(DbMemtableKVChecksumTest,
+       IteratorWithColumnFamilyCorruptAfterMemtableInsert) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  while (MoreBytesToCorrupt()) {
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(ExecuteWrite(handles_[1]));
+    Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
+    it->SeekToFirst();
+    ASSERT_FALSE(it->Valid());
+    ASSERT_TRUE(it->status().IsCorruption());
+    delete it;
+    Destroy(options);
+    SkipNotToCorruptEntry();
+  }
+}
+
+TEST_P(DbMemtableKVChecksumTest, FlushWithCorruptAfterMemtableInsert) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTable::Add:BeforeReturn:Encoded",
+      std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+                std::placeholders::_1));
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options = CurrentOptions();
+  options.memtable_protection_bytes_per_key =
+      memtable_protection_bytes_per_key_;
+  if (op_type_ == WriteBatchOpType::kMerge) {
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  }
+
+  SkipNotToCorruptEntry();
+  // Not corruping each byte like other tests since Flush() is relatively slow.
+  Reopen(options);
+  ASSERT_OK(ExecuteWrite(nullptr));
+  ASSERT_TRUE(Flush().IsCorruption());
+  // DB enters read-only state when flush reads corrupted data
+  ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+  Destroy(options);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_log_iter_test.cc b/src/rocksdb/db/db_log_iter_test.cc
new file mode 100644
index 000000000..4e982858c
--- /dev/null
+++ b/src/rocksdb/db/db_log_iter_test.cc
@@ -0,0 +1,305 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this test
+// in Release build.
+// which is a pity, it is a good test
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestXactLogIterator : public DBTestBase {
+ public:
+  DBTestXactLogIterator()
+      : DBTestBase("db_log_iter_test", /*env_do_fsync=*/true) {}
+
+  std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+      const SequenceNumber seq) {
+    std::unique_ptr<TransactionLogIterator> iter;
+    Status status = dbfull()->GetUpdatesSince(seq, &iter);
+    EXPECT_OK(status);
+    EXPECT_TRUE(iter->Valid());
+    return iter;
+  }
+};
+
+namespace {
+SequenceNumber ReadRecords(std::unique_ptr<TransactionLogIterator>& iter,
+                           int& count, bool expect_ok = true) {
+  count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    EXPECT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    EXPECT_OK(iter->status());
+    iter->Next();
+  }
+  if (expect_ok) {
+    EXPECT_OK(iter->status());
+  } else {
+    EXPECT_NOK(iter->status());
+  }
+  return res.sequence;
+}
+
+void ExpectRecords(const int expected_no_records,
+                   std::unique_ptr<TransactionLogIterator>& iter) {
+  int num_records;
+  ReadRecords(iter, num_records);
+  ASSERT_EQ(num_records, expected_no_records);
+}
+}  // anonymous namespace
+
+TEST_F(DBTestXactLogIterator, TransactionLogIterator) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Put(0, "key1", DummyString(1024)));
+    ASSERT_OK(Put(1, "key2", DummyString(1024)));
+    ASSERT_OK(Put(1, "key2", DummyString(1024)));
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(3, iter);
+    }
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    {
+      ASSERT_OK(Put(0, "key4", DummyString(1024)));
+      ASSERT_OK(Put(1, "key5", DummyString(1024)));
+      ASSERT_OK(Put(0, "key6", DummyString(1024)));
+    }
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(6, iter);
+    }
+  } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG  // sync point is not included with DNDEBUG build
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) {
+  static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+  static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
+      {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
+      {"WalManager::GetSortedWalsOfType:1", "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2",
+       "WalManager::GetSortedWalsOfType:2"}};
+  for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Setup sync point dependency to reproduce the race condition of
+    // a log file moved to archived dir, in the middle of GetSortedWalFiles
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {sync_points[test][0], sync_points[test][1]},
+        {sync_points[test][2], sync_points[test][3]},
+    });
+
+    do {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      Options options = OptionsForLogIterTest();
+      DestroyAndReopen(options);
+      ASSERT_OK(Put("key1", DummyString(1024)));
+      ASSERT_OK(dbfull()->Flush(FlushOptions()));
+      ASSERT_OK(Put("key2", DummyString(1024)));
+      ASSERT_OK(dbfull()->Flush(FlushOptions()));
+      ASSERT_OK(Put("key3", DummyString(1024)));
+      ASSERT_OK(dbfull()->Flush(FlushOptions()));
+      ASSERT_OK(Put("key4", DummyString(1024)));
+      ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+      ASSERT_OK(dbfull()->FlushWAL(false));
+
+      {
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(4, iter);
+      }
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+      // trigger async flush, and log move. Well, log move will
+      // wait until the GetSortedWalFiles:1 to reproduce the race
+      // condition
+      FlushOptions flush_options;
+      flush_options.wait = false;
+      ASSERT_OK(dbfull()->Flush(flush_options));
+
+      // "key5" would be written in a new memtable and log
+      ASSERT_OK(Put("key5", DummyString(1024)));
+      ASSERT_OK(dbfull()->FlushWAL(false));
+      {
+        // this iter would miss "key4" if not fixed
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(5, iter);
+      }
+    } while (ChangeCompactOptions());
+  }
+}
+#endif
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("key1", DummyString(1024)));
+    auto iter = OpenTransactionLogIter(0);
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_OK(Put("key2", DummyString(1024)));
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("key1", DummyString(1024)));
+    ASSERT_OK(Put("key2", DummyString(1023)));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+    Reopen(options);
+    auto iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+
+    for (int i = 0; i < 1024; i++) {
+      ASSERT_OK(Put("key" + std::to_string(i), DummyString(10)));
+    }
+
+    ASSERT_OK(Flush());
+    ASSERT_OK(db_->FlushWAL(false));
+
+    // Corrupt this log to create a gap
+    ASSERT_OK(db_->DisableFileDeletions());
+
+    VectorLogPtr wal_files;
+    ASSERT_OK(db_->GetSortedWalFiles(wal_files));
+    ASSERT_FALSE(wal_files.empty());
+
+    const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
+    ASSERT_OK(test::TruncateFile(env_, logfile_path,
+                                 wal_files.front()->SizeFileBytes() / 2));
+
+    ASSERT_OK(db_->EnableFileDeletions());
+
+    // Insert a new entry to a new log file
+    ASSERT_OK(Put("key1025", DummyString(10)));
+    ASSERT_OK(db_->FlushWAL(false));
+
+    // Try to read from the beginning. Should stop before the gap and read less
+    // than 1025 entries
+    auto iter = OpenTransactionLogIter(0);
+    int count = 0;
+    SequenceNumber last_sequence_read = ReadRecords(iter, count, false);
+    ASSERT_LT(last_sequence_read, 1025U);
+
+    // Try to read past the gap, should be able to seek to key1025
+    auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
+    ExpectRecords(1, iter2);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    WriteBatch batch;
+    ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024)));
+    ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024)));
+    ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024)));
+    ASSERT_OK(batch.Delete(handles_[0], "key2"));
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Flush(0));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_OK(Put(1, "key4", DummyString(1024)));
+    auto iter = OpenTransactionLogIter(3);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) {
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  {
+    WriteBatch batch;
+    ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024)));
+    ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024)));
+    ASSERT_OK(batch.PutLogData(Slice("blob1")));
+    ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024)));
+    ASSERT_OK(batch.PutLogData(Slice("blob2")));
+    ASSERT_OK(batch.Delete(handles_[0], "key2"));
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  }
+
+  auto res = OpenTransactionLogIter(0)->GetBatch();
+  struct Handler : public WriteBatch::Handler {
+    std::string seen;
+    Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+      seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " +
+              std::to_string(value.size()) + ")";
+      return Status::OK();
+    }
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+      seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " +
+              std::to_string(value.size()) + ")";
+      return Status::OK();
+    }
+    void LogData(const Slice& blob) override {
+      seen += "LogData(" + blob.ToString() + ")";
+    }
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")";
+      return Status::OK();
+    }
+  } handler;
+  ASSERT_OK(res.writeBatchPtr->Iterate(&handler));
+  ASSERT_EQ(
+      "Put(1, key1, 1024)"
+      "Put(0, key2, 1024)"
+      "LogData(blob1)"
+      "Put(1, key3, 1024)"
+      "LogData(blob2)"
+      "Delete(0, key2)",
+      handler.seen);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_logical_block_size_cache_test.cc b/src/rocksdb/db/db_logical_block_size_cache_test.cc
new file mode 100644
index 000000000..13c16618e
--- /dev/null
+++ b/src/rocksdb/db/db_logical_block_size_cache_test.cc
@@ -0,0 +1,521 @@
+// Copyright (c) 2020-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+
+#ifdef OS_LINUX
+#include "env/io_posix.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+class EnvWithCustomLogicalBlockSizeCache : public EnvWrapper {
+ public:
+  EnvWithCustomLogicalBlockSizeCache(Env* env, LogicalBlockSizeCache* cache)
+      : EnvWrapper(env), cache_(cache) {}
+
+  Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+    return cache_->RefAndCacheLogicalBlockSize(paths);
+  }
+
+  Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+    cache_->UnrefAndTryRemoveCachedLogicalBlockSize(paths);
+    return Status::OK();
+  }
+
+ private:
+  LogicalBlockSizeCache* cache_;
+};
+
+class DBLogicalBlockSizeCacheTest : public testing::Test {
+ public:
+  DBLogicalBlockSizeCacheTest()
+      : dbname_(test::PerThreadDBPath("logical_block_size_cache_test")),
+        data_path_0_(dbname_ + "/data_path_0"),
+        data_path_1_(dbname_ + "/data_path_1"),
+        cf_path_0_(dbname_ + "/cf_path_0"),
+        cf_path_1_(dbname_ + "/cf_path_1") {
+    auto get_fd_block_size = [&](int fd) { return fd; };
+    auto get_dir_block_size = [&](const std::string& /*dir*/, size_t* size) {
+      *size = 1024;
+      return Status::OK();
+    };
+    cache_.reset(
+        new LogicalBlockSizeCache(get_fd_block_size, get_dir_block_size));
+    env_.reset(
+        new EnvWithCustomLogicalBlockSizeCache(Env::Default(), cache_.get()));
+  }
+
+ protected:
+  std::string dbname_;
+  std::string data_path_0_;
+  std::string data_path_1_;
+  std::string cf_path_0_;
+  std::string cf_path_1_;
+  std::unique_ptr<LogicalBlockSizeCache> cache_;
+  std::unique_ptr<Env> env_;
+};
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) {
+  // Tests that Open will cache the logical block size for data paths,
+  // and Close will remove the cached sizes.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+  options.db_paths = {{data_path_0_, 2048}, {data_path_1_, 2048}};
+
+  for (int i = 0; i < 2; i++) {
+    DB* db;
+    if (!i) {
+      printf("Open\n");
+      ASSERT_OK(DB::Open(options, dbname_, &db));
+    } else {
+#ifdef ROCKSDB_LITE
+      break;
+#else
+      printf("OpenForReadOnly\n");
+      ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db));
+#endif
+    }
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(data_path_0_));
+    ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+    ASSERT_TRUE(cache_->Contains(data_path_1_));
+    ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+    ASSERT_OK(db->Close());
+    ASSERT_EQ(0, cache_->Size());
+    delete db;
+  }
+  ASSERT_OK(DestroyDB(dbname_, options, {}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) {
+  // Tests that Open will cache the logical block size for data paths,
+  // and delete the db pointer will remove the cached sizes.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+
+  for (int i = 0; i < 2; i++) {
+    DB* db;
+    if (!i) {
+      printf("Open\n");
+      ASSERT_OK(DB::Open(options, dbname_, &db));
+    } else {
+#ifdef ROCKSDB_LITE
+      break;
+#else
+      printf("OpenForReadOnly\n");
+      ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db));
+#endif
+    }
+    ASSERT_EQ(1, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    delete db;
+    ASSERT_EQ(0, cache_->Size());
+  }
+  ASSERT_OK(DestroyDB(dbname_, options, {}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) {
+  // Tests that CreateColumnFamily will cache the cf_paths,
+  // drop the column family handle won't drop the cache,
+  // drop and then delete the column family handle will drop the cache.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+  ColumnFamilyOptions cf_options;
+  cf_options.cf_paths = {{cf_path_0_, 1024}, {cf_path_1_, 2048}};
+
+  DB* db;
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  ASSERT_EQ(1, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+  ColumnFamilyHandle* cf = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf));
+  ASSERT_EQ(3, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+  // Drop column family does not drop cache.
+  ASSERT_OK(db->DropColumnFamily(cf));
+  ASSERT_EQ(3, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+  // Delete handle will drop cache.
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cf));
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+  delete db;
+  ASSERT_EQ(0, cache_->Size());
+  ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
+  // To test:
+  // (1) CreateColumnFamilies will cache the cf_paths in
+  // DBLogicalBlockSizeCache
+  // (2) Dropping column family handles associated with
+  // that cf_paths won't drop the cached cf_paths
+  // (3) Deleting all the column family handles associated
+  //  with that cf_paths will drop the cached cf_paths
+
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+  ColumnFamilyOptions cf_options;
+  cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+  DB* db;
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  ASSERT_EQ(1, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+  std::vector<ColumnFamilyHandle*> cfs;
+  ASSERT_OK(db->CreateColumnFamilies(cf_options, {"cf1", "cf2"}, &cfs));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+  // Drop column family does not drop cf_path_0_'s entry from cache
+  for (ColumnFamilyHandle* cf : cfs) {
+    ASSERT_OK(db->DropColumnFamily(cf));
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+  }
+
+  // Delete one cf handle will not drop cf_path_0_'s entry from cache because
+  // another handle is still referencing cf_path_0_.
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+
+  // Delete all cf handles and ensure the ref count of cf_path_0_ in cache_
+  // can be properly decreased by releasing any background reference to the
+  // ColumnFamilyData during db deletion
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  delete db;
+
+  // Now cf_path_0_ in cache_ has been properly decreased and cf_path_0_'s entry
+  // is dropped from cache
+  ASSERT_EQ(0, cache_->Size());
+  ASSERT_OK(
+      DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) {
+  // Tests that Open two column families with the same cf_path will cache the
+  // cf_path and have 2 references to the cached size,
+  // drop the column family handle won't drop the cache,
+  // drop and then delete the column family handle will drop the cache.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+
+  ColumnFamilyOptions cf_options;
+  cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+  for (int i = 0; i < 2; i++) {
+    DB* db;
+    ColumnFamilyHandle* cf1 = nullptr;
+    ColumnFamilyHandle* cf2 = nullptr;
+    ASSERT_OK(DB::Open(options, dbname_, &db));
+    ASSERT_OK(db->CreateColumnFamily(cf_options, "cf1", &cf1));
+    ASSERT_OK(db->CreateColumnFamily(cf_options, "cf2", &cf2));
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cf1));
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cf2));
+    delete db;
+    ASSERT_EQ(0, cache_->Size());
+
+    std::vector<ColumnFamilyHandle*> cfs;
+    if (!i) {
+      printf("Open\n");
+      ASSERT_OK(DB::Open(options, dbname_,
+                         {{"cf1", cf_options},
+                          {"cf2", cf_options},
+                          {"default", ColumnFamilyOptions()}},
+                         &cfs, &db));
+    } else {
+#ifdef ROCKSDB_LITE
+      break;
+#else
+      printf("OpenForReadOnly\n");
+      ASSERT_OK(DB::OpenForReadOnly(options, dbname_,
+                                    {{"cf1", cf_options},
+                                     {"cf2", cf_options},
+                                     {"default", ColumnFamilyOptions()}},
+                                    &cfs, &db));
+#endif
+    }
+
+    // Logical block sizes of dbname_ and cf_path_0_ are cached during Open.
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+    // Drop handles won't drop the cache.
+    ASSERT_OK(db->DropColumnFamily(cfs[0]));
+    ASSERT_OK(db->DropColumnFamily(cfs[1]));
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+    // Delete 1st handle won't drop the cache for cf_path_0_.
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+    // Delete 2nd handle will drop the cache for cf_path_0_.
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+    ASSERT_EQ(1, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+    // Delete the default handle won't affect the cache because db still refers
+    // to the default CF.
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[2]));
+    ASSERT_EQ(1, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+    delete db;
+    ASSERT_EQ(0, cache_->Size());
+  }
+  ASSERT_OK(
+      DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
+  // Tests that destroy column family without dropping won't drop the cache,
+  // because compaction and flush might still need to get logical block size
+  // when opening new files.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+  ColumnFamilyOptions cf_options;
+  cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+  DB* db;
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  ASSERT_EQ(1, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ColumnFamilyHandle* cf = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+  // Delete handle won't drop cache.
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cf));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(dbname_));
+  ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+  delete db;
+  ASSERT_EQ(0, cache_->Size());
+
+  // Open with column families.
+  std::vector<ColumnFamilyHandle*> cfs;
+  for (int i = 0; i < 2; i++) {
+    if (!i) {
+      printf("Open\n");
+      ASSERT_OK(DB::Open(
+          options, dbname_,
+          {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db));
+    } else {
+#ifdef ROCKSDB_LITE
+      break;
+#else
+      printf("OpenForReadOnly\n");
+      ASSERT_OK(DB::OpenForReadOnly(
+          options, dbname_,
+          {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db));
+#endif
+    }
+    // cf_path_0_ and dbname_ are cached.
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+    // Deleting handle won't drop cache.
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+    ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+    ASSERT_EQ(2, cache_->Size());
+    ASSERT_TRUE(cache_->Contains(dbname_));
+    ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+    ASSERT_TRUE(cache_->Contains(cf_path_0_));
+    ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+    delete db;
+    ASSERT_EQ(0, cache_->Size());
+  }
+  ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
+  // Tests the cache behavior when there are multiple DBs sharing the same env
+  // with different db_paths and cf_paths.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+
+  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+
+  DB* db0;
+  ASSERT_OK(DB::Open(options, data_path_0_, &db0));
+  ASSERT_EQ(1, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+
+  ColumnFamilyOptions cf_options0;
+  cf_options0.cf_paths = {{cf_path_0_, 1024}};
+  ColumnFamilyHandle* cf0;
+  ASSERT_OK(db0->CreateColumnFamily(cf_options0, "cf", &cf0));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+  DB* db1;
+  ASSERT_OK(DB::Open(options, data_path_1_, &db1));
+  ASSERT_EQ(3, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+  ASSERT_TRUE(cache_->Contains(data_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+
+  ColumnFamilyOptions cf_options1;
+  cf_options1.cf_paths = {{cf_path_1_, 1024}};
+  ColumnFamilyHandle* cf1;
+  ASSERT_OK(db1->CreateColumnFamily(cf_options1, "cf", &cf1));
+  ASSERT_EQ(4, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+  ASSERT_TRUE(cache_->Contains(data_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+  ASSERT_TRUE(cache_->Contains(cf_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+  ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
+  delete db0;
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+  ASSERT_TRUE(cache_->Contains(cf_path_1_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+  ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}}));
+
+  ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
+  delete db1;
+  ASSERT_EQ(0, cache_->Size());
+  ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
+  // Tests the cache behavior when there are multiple DBs sharing the same env
+  // with the same db_paths and cf_paths.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+  options.db_paths = {{data_path_0_, 1024}};
+  ColumnFamilyOptions cf_options;
+  cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+
+  DB* db0;
+  ASSERT_OK(DB::Open(options, dbname_ + "/db0", &db0));
+  ASSERT_EQ(1, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+
+  ColumnFamilyHandle* cf0;
+  ASSERT_OK(db0->CreateColumnFamily(cf_options, "cf", &cf0));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+  DB* db1;
+  ASSERT_OK(DB::Open(options, dbname_ + "/db1", &db1));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(2, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+  ColumnFamilyHandle* cf1;
+  ASSERT_OK(db1->CreateColumnFamily(cf_options, "cf", &cf1));
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(2, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+  ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
+  delete db0;
+  ASSERT_EQ(2, cache_->Size());
+  ASSERT_TRUE(cache_->Contains(data_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+  ASSERT_TRUE(cache_->Contains(cf_path_0_));
+  ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+  ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}}));
+
+  ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
+  delete db1;
+  ASSERT_EQ(0, cache_->Size());
+  ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}}));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // OS_LINUX
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_memtable_test.cc b/src/rocksdb/db/db_memtable_test.cc
new file mode 100644
index 000000000..cae592db3
--- /dev/null
+++ b/src/rocksdb/db/db_memtable_test.cc
@@ -0,0 +1,344 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMemTableTest : public DBTestBase {
+ public:
+  DBMemTableTest() : DBTestBase("db_memtable_test", /*env_do_fsync=*/true) {}
+};
+
+class MockMemTableRep : public MemTableRep {
+ public:
+  explicit MockMemTableRep(Allocator* allocator, MemTableRep* rep)
+      : MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {}
+
+  KeyHandle Allocate(const size_t len, char** buf) override {
+    return rep_->Allocate(len, buf);
+  }
+
+  void Insert(KeyHandle handle) override { rep_->Insert(handle); }
+
+  void InsertWithHint(KeyHandle handle, void** hint) override {
+    num_insert_with_hint_++;
+    EXPECT_NE(nullptr, hint);
+    last_hint_in_ = *hint;
+    rep_->InsertWithHint(handle, hint);
+    last_hint_out_ = *hint;
+  }
+
+  bool Contains(const char* key) const override { return rep_->Contains(key); }
+
+  void Get(const LookupKey& k, void* callback_args,
+           bool (*callback_func)(void* arg, const char* entry)) override {
+    rep_->Get(k, callback_args, callback_func);
+  }
+
+  size_t ApproximateMemoryUsage() override {
+    return rep_->ApproximateMemoryUsage();
+  }
+
+  Iterator* GetIterator(Arena* arena) override {
+    return rep_->GetIterator(arena);
+  }
+
+  void* last_hint_in() { return last_hint_in_; }
+  void* last_hint_out() { return last_hint_out_; }
+  int num_insert_with_hint() { return num_insert_with_hint_; }
+
+ private:
+  std::unique_ptr<MemTableRep> rep_;
+  void* last_hint_in_;
+  void* last_hint_out_;
+  int num_insert_with_hint_;
+};
+
+class MockMemTableRepFactory : public MemTableRepFactory {
+ public:
+  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+                                 Allocator* allocator,
+                                 const SliceTransform* transform,
+                                 Logger* logger) override {
+    SkipListFactory factory;
+    MemTableRep* skiplist_rep =
+        factory.CreateMemTableRep(cmp, allocator, transform, logger);
+    mock_rep_ = new MockMemTableRep(allocator, skiplist_rep);
+    return mock_rep_;
+  }
+
+  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+                                 Allocator* allocator,
+                                 const SliceTransform* transform,
+                                 Logger* logger,
+                                 uint32_t column_family_id) override {
+    last_column_family_id_ = column_family_id;
+    return CreateMemTableRep(cmp, allocator, transform, logger);
+  }
+
+  const char* Name() const override { return "MockMemTableRepFactory"; }
+
+  MockMemTableRep* rep() { return mock_rep_; }
+
+  bool IsInsertConcurrentlySupported() const override { return false; }
+
+  uint32_t GetLastColumnFamilyId() { return last_column_family_id_; }
+
+ private:
+  MockMemTableRep* mock_rep_;
+  // workaround since there's no std::numeric_limits<uint32_t>::max() yet.
+  uint32_t last_column_family_id_ = static_cast<uint32_t>(-1);
+};
+
+class TestPrefixExtractor : public SliceTransform {
+ public:
+  const char* Name() const override { return "TestPrefixExtractor"; }
+
+  Slice Transform(const Slice& key) const override {
+    const char* p = separator(key);
+    if (p == nullptr) {
+      return Slice();
+    }
+    return Slice(key.data(), p - key.data() + 1);
+  }
+
+  bool InDomain(const Slice& key) const override {
+    return separator(key) != nullptr;
+  }
+
+  bool InRange(const Slice& /*key*/) const override { return false; }
+
+ private:
+  const char* separator(const Slice& key) const {
+    return reinterpret_cast<const char*>(memchr(key.data(), '_', key.size()));
+  }
+};
+
+// Test that ::Add properly returns false when inserting duplicate keys
+TEST_F(DBMemTableTest, DuplicateSeq) {
+  SequenceNumber seq = 123;
+  std::string value;
+  MergeContext merge_context;
+  Options options;
+  InternalKeyComparator ikey_cmp(options.comparator);
+  ReadRangeDelAggregator range_del_agg(&ikey_cmp,
+                                       kMaxSequenceNumber /* upper_bound */);
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableOptions ioptions(options);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+
+  // Write some keys and make sure it returns false on duplicates
+  ASSERT_OK(
+      mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */));
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+  // Changing the type should still cause the duplicatae key
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeMerge, "key", "value2", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+  // Changing the seq number will make the key fresh
+  ASSERT_OK(mem->Add(seq + 1, kTypeMerge, "key", "value2",
+                     nullptr /* kv_prot_info */));
+  // Test with different types for duplicate keys
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeDeletion, "key", "", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeSingleDeletion, "key", "", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+
+  // Test the duplicate keys under stress
+  for (int i = 0; i < 10000; i++) {
+    bool insert_dup = i % 10 == 1;
+    if (!insert_dup) {
+      seq++;
+    }
+    Status s = mem->Add(seq, kTypeValue, "foo", "value" + std::to_string(seq),
+                        nullptr /* kv_prot_info */);
+    if (insert_dup) {
+      ASSERT_TRUE(s.IsTryAgain());
+    } else {
+      ASSERT_OK(s);
+    }
+  }
+  delete mem;
+
+  // Test with InsertWithHint
+  options.memtable_insert_with_hint_prefix_extractor.reset(
+      new TestPrefixExtractor());  // which uses _ to extract the prefix
+  ioptions = ImmutableOptions(options);
+  mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                     kMaxSequenceNumber, 0 /* column_family_id */);
+  // Insert a duplicate key with _ in it
+  ASSERT_OK(
+      mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */));
+  ASSERT_TRUE(
+      mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */)
+          .IsTryAgain());
+  delete mem;
+
+  // Test when InsertConcurrently will be invoked
+  options.allow_concurrent_memtable_write = true;
+  ioptions = ImmutableOptions(options);
+  mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                     kMaxSequenceNumber, 0 /* column_family_id */);
+  MemTablePostProcessInfo post_process_info;
+  ASSERT_OK(mem->Add(seq, kTypeValue, "key", "value",
+                     nullptr /* kv_prot_info */, true, &post_process_info));
+  ASSERT_TRUE(mem->Add(seq, kTypeValue, "key", "value",
+                       nullptr /* kv_prot_info */, true, &post_process_info)
+                  .IsTryAgain());
+  delete mem;
+}
+
+// A simple test to verify that the concurrent merge writes is functional
+TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
+  int num_ops = 1000;
+  std::string value;
+  MergeContext merge_context;
+  Options options;
+  // A merge operator that is not sensitive to concurrent writes since in this
+  // test we don't order the writes.
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  options.allow_concurrent_memtable_write = true;
+  ImmutableOptions ioptions(options);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+
+  // Put 0 as the base
+  PutFixed64(&value, static_cast<uint64_t>(0));
+  ASSERT_OK(mem->Add(0, kTypeValue, "key", value, nullptr /* kv_prot_info */));
+  value.clear();
+
+  // Write Merge concurrently
+  ROCKSDB_NAMESPACE::port::Thread write_thread1([&]() {
+    MemTablePostProcessInfo post_process_info1;
+    std::string v1;
+    for (int seq = 1; seq < num_ops / 2; seq++) {
+      PutFixed64(&v1, seq);
+      ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v1, nullptr /* kv_prot_info */,
+                         true, &post_process_info1));
+      v1.clear();
+    }
+  });
+  ROCKSDB_NAMESPACE::port::Thread write_thread2([&]() {
+    MemTablePostProcessInfo post_process_info2;
+    std::string v2;
+    for (int seq = num_ops / 2; seq < num_ops; seq++) {
+      PutFixed64(&v2, seq);
+      ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v2, nullptr /* kv_prot_info */,
+                         true, &post_process_info2));
+      v2.clear();
+    }
+  });
+  write_thread1.join();
+  write_thread2.join();
+
+  Status status;
+  ReadOptions roptions;
+  SequenceNumber max_covering_tombstone_seq = 0;
+  LookupKey lkey("key", kMaxSequenceNumber);
+  bool res = mem->Get(lkey, &value, /*columns=*/nullptr, /*timestamp=*/nullptr,
+                      &status, &merge_context, &max_covering_tombstone_seq,
+                      roptions, false /* immutable_memtable */);
+  ASSERT_OK(status);
+  ASSERT_TRUE(res);
+  uint64_t ivalue = DecodeFixed64(Slice(value).data());
+  uint64_t sum = 0;
+  for (int seq = 0; seq < num_ops; seq++) {
+    sum += seq;
+  }
+  ASSERT_EQ(ivalue, sum);
+
+  delete mem;
+}
+
+TEST_F(DBMemTableTest, InsertWithHint) {
+  Options options;
+  options.allow_concurrent_memtable_write = false;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(new MockMemTableRepFactory());
+  options.memtable_insert_with_hint_prefix_extractor.reset(
+      new TestPrefixExtractor());
+  options.env = env_;
+  Reopen(options);
+  MockMemTableRep* rep =
+      reinterpret_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+          ->rep();
+  ASSERT_OK(Put("foo_k1", "foo_v1"));
+  ASSERT_EQ(nullptr, rep->last_hint_in());
+  void* hint_foo = rep->last_hint_out();
+  ASSERT_OK(Put("foo_k2", "foo_v2"));
+  ASSERT_EQ(hint_foo, rep->last_hint_in());
+  ASSERT_EQ(hint_foo, rep->last_hint_out());
+  ASSERT_OK(Put("foo_k3", "foo_v3"));
+  ASSERT_EQ(hint_foo, rep->last_hint_in());
+  ASSERT_EQ(hint_foo, rep->last_hint_out());
+  ASSERT_OK(Put("bar_k1", "bar_v1"));
+  ASSERT_EQ(nullptr, rep->last_hint_in());
+  void* hint_bar = rep->last_hint_out();
+  ASSERT_NE(hint_foo, hint_bar);
+  ASSERT_OK(Put("bar_k2", "bar_v2"));
+  ASSERT_EQ(hint_bar, rep->last_hint_in());
+  ASSERT_EQ(hint_bar, rep->last_hint_out());
+  ASSERT_EQ(5, rep->num_insert_with_hint());
+  ASSERT_OK(Put("NotInPrefixDomain", "vvv"));
+  ASSERT_EQ(5, rep->num_insert_with_hint());
+  ASSERT_EQ("foo_v1", Get("foo_k1"));
+  ASSERT_EQ("foo_v2", Get("foo_k2"));
+  ASSERT_EQ("foo_v3", Get("foo_k3"));
+  ASSERT_EQ("bar_v1", Get("bar_k1"));
+  ASSERT_EQ("bar_v2", Get("bar_k2"));
+  ASSERT_EQ("vvv", Get("NotInPrefixDomain"));
+}
+
+TEST_F(DBMemTableTest, ColumnFamilyId) {
+  // Verifies MemTableRepFactory is told the right column family id.
+  Options options;
+  options.env = CurrentOptions().env;
+  options.allow_concurrent_memtable_write = false;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(new MockMemTableRepFactory());
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (uint32_t cf = 0; cf < 2; ++cf) {
+    ASSERT_OK(Put(cf, "key", "val"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_EQ(
+        cf, static_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+                ->GetLastColumnFamilyId());
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operand_test.cc b/src/rocksdb/db/db_merge_operand_test.cc
new file mode 100644
index 000000000..cbec37138
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operand_test.cc
@@ -0,0 +1,448 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "rocksdb/merge_operator.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+  LimitedStringAppendMergeOp(int limit, char delim)
+      : StringAppendTESTOperator(delim), limit_(limit) {}
+
+  const char* Name() const override {
+    return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+  }
+
+  bool ShouldMerge(const std::vector<Slice>& operands) const override {
+    if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+      return true;
+    }
+    return false;
+  }
+
+ private:
+  size_t limit_ = 0;
+};
+}  // anonymous namespace
+
+class DBMergeOperandTest : public DBTestBase {
+ public:
+  DBMergeOperandTest()
+      : DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBMergeOperandTest, CacheEvictedMergeOperandReadAfterFreeBug) {
+  // There was a bug of reading merge operands after they are mistakely freed
+  // in DB::GetMergeOperands, which is surfaced by cache full.
+  // See PR#9507 for more.
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.env = env_;
+  BlockBasedTableOptions table_options;
+
+  // Small cache to simulate cache full
+  table_options.block_cache = NewLRUCache(1);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Reopen(options);
+  int num_records = 4;
+  int number_of_operands = 0;
+  std::vector<PinnableSlice> values(num_records);
+  GetMergeOperandsOptions merge_operands_info;
+  merge_operands_info.expected_max_number_of_operands = num_records;
+
+  ASSERT_OK(Merge("k1", "v1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k1", "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k1", "v3"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k1", "v4"));
+
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k1", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(number_of_operands, 4);
+  ASSERT_EQ(values[0].ToString(), "v1");
+  ASSERT_EQ(values[1].ToString(), "v2");
+  ASSERT_EQ(values[2].ToString(), "v3");
+  ASSERT_EQ(values[3].ToString(), "v4");
+}
+
+TEST_F(DBMergeOperandTest, FlushedMergeOperandReadAfterFreeBug) {
+  // Repro for a bug where a memtable containing a merge operand could be
+  // deleted before the merge operand was saved to the result.
+  auto options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen(options);
+
+  ASSERT_OK(Merge("key", "value"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::GetImpl:PostMemTableGet:0",
+        "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PreFlush"},
+       {"DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PostFlush",
+        "DBImpl::GetImpl:PostMemTableGet:1"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  auto flush_thread = port::Thread([&]() {
+    TEST_SYNC_POINT(
+        "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PreFlush");
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT(
+        "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PostFlush");
+  });
+
+  PinnableSlice value;
+  GetMergeOperandsOptions merge_operands_info;
+  merge_operands_info.expected_max_number_of_operands = 1;
+  int number_of_operands;
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "key", &value, &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(1, number_of_operands);
+
+  flush_thread.join();
+}
+
+TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) {
+  Options options;
+  options.create_if_missing = true;
+  // Use only the latest two merge operands.
+  options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+  options.env = env_;
+  Reopen(options);
+  int num_records = 4;
+  int number_of_operands = 0;
+  std::vector<PinnableSlice> values(num_records);
+  GetMergeOperandsOptions merge_operands_info;
+  merge_operands_info.expected_max_number_of_operands = num_records;
+
+  // k0 value in memtable
+  ASSERT_OK(Put("k0", "PutARock"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k0", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "PutARock");
+
+  // k0.1 value in SST
+  ASSERT_OK(Put("k0.1", "RockInSST"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k0.1", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "RockInSST");
+
+  // All k1 values are in memtable.
+  ASSERT_OK(Merge("k1", "a"));
+  ASSERT_OK(Put("k1", "x"));
+  ASSERT_OK(Merge("k1", "b"));
+  ASSERT_OK(Merge("k1", "c"));
+  ASSERT_OK(Merge("k1", "d"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k1", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "x");
+  ASSERT_EQ(values[1], "b");
+  ASSERT_EQ(values[2], "c");
+  ASSERT_EQ(values[3], "d");
+
+  // expected_max_number_of_operands is less than number of merge operands so
+  // status should be Incomplete.
+  merge_operands_info.expected_max_number_of_operands = num_records - 1;
+  Status status = db_->GetMergeOperands(
+      ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(),
+      &merge_operands_info, &number_of_operands);
+  ASSERT_EQ(status.IsIncomplete(), true);
+  merge_operands_info.expected_max_number_of_operands = num_records;
+
+  // All k1.1 values are in memtable.
+  ASSERT_OK(Merge("k1.1", "r"));
+  ASSERT_OK(Delete("k1.1"));
+  ASSERT_OK(Merge("k1.1", "c"));
+  ASSERT_OK(Merge("k1.1", "k"));
+  ASSERT_OK(Merge("k1.1", "s"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k1.1", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "c");
+  ASSERT_EQ(values[1], "k");
+  ASSERT_EQ(values[2], "s");
+
+  // All k2 values are flushed to L0 into a single file.
+  ASSERT_OK(Merge("k2", "q"));
+  ASSERT_OK(Merge("k2", "w"));
+  ASSERT_OK(Merge("k2", "e"));
+  ASSERT_OK(Merge("k2", "r"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k2", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "q");
+  ASSERT_EQ(values[1], "w");
+  ASSERT_EQ(values[2], "e");
+  ASSERT_EQ(values[3], "r");
+
+  // All k2.1 values are flushed to L0 into a single file.
+  ASSERT_OK(Merge("k2.1", "m"));
+  ASSERT_OK(Put("k2.1", "l"));
+  ASSERT_OK(Merge("k2.1", "n"));
+  ASSERT_OK(Merge("k2.1", "o"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k2.1", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "l,n,o");
+
+  // All k2.2 values are flushed to L0 into a single file.
+  ASSERT_OK(Merge("k2.2", "g"));
+  ASSERT_OK(Delete("k2.2"));
+  ASSERT_OK(Merge("k2.2", "o"));
+  ASSERT_OK(Merge("k2.2", "t"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k2.2", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "o,t");
+
+  // Do some compaction that will make the following tests more predictable
+  //  Slice start("PutARock");
+  //  Slice end("t");
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // All k3 values are flushed and are in different files.
+  ASSERT_OK(Merge("k3", "ab"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3", "bc"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3", "cd"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3", "de"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k3", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "ab");
+  ASSERT_EQ(values[1], "bc");
+  ASSERT_EQ(values[2], "cd");
+  ASSERT_EQ(values[3], "de");
+
+  // All k3.1 values are flushed and are in different files.
+  ASSERT_OK(Merge("k3.1", "ab"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("k3.1", "bc"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3.1", "cd"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3.1", "de"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k3.1", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "bc");
+  ASSERT_EQ(values[1], "cd");
+  ASSERT_EQ(values[2], "de");
+
+  // All k3.2 values are flushed and are in different files.
+  ASSERT_OK(Merge("k3.2", "ab"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Delete("k3.2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3.2", "cd"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3.2", "de"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k3.2", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "cd");
+  ASSERT_EQ(values[1], "de");
+
+  // All K4 values are in different levels
+  ASSERT_OK(Merge("k4", "ba"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(4);
+  ASSERT_OK(Merge("k4", "cb"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(3);
+  ASSERT_OK(Merge("k4", "dc"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_OK(Merge("k4", "ed"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k4", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "ba");
+  ASSERT_EQ(values[1], "cb");
+  ASSERT_EQ(values[2], "dc");
+  ASSERT_EQ(values[3], "ed");
+
+  // First 3 k5 values are in SST and next 4 k5 values are in Immutable
+  // Memtable
+  ASSERT_OK(Merge("k5", "who"));
+  ASSERT_OK(Merge("k5", "am"));
+  ASSERT_OK(Merge("k5", "i"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("k5", "remember"));
+  ASSERT_OK(Merge("k5", "i"));
+  ASSERT_OK(Merge("k5", "am"));
+  ASSERT_OK(Merge("k5", "rocks"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k5", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "remember");
+  ASSERT_EQ(values[1], "i");
+  ASSERT_EQ(values[2], "am");
+}
+
+TEST_F(DBMergeOperandTest, BlobDBGetMergeOperandsBasic) {
+  Options options;
+  options.create_if_missing = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  // Use only the latest two merge operands.
+  options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+  options.env = env_;
+  Reopen(options);
+  int num_records = 4;
+  int number_of_operands = 0;
+  std::vector<PinnableSlice> values(num_records);
+  GetMergeOperandsOptions merge_operands_info;
+  merge_operands_info.expected_max_number_of_operands = num_records;
+
+  // All k1 values are in memtable.
+  ASSERT_OK(Put("k1", "x"));
+  ASSERT_OK(Merge("k1", "b"));
+  ASSERT_OK(Merge("k1", "c"));
+  ASSERT_OK(Merge("k1", "d"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k1", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "x");
+  ASSERT_EQ(values[1], "b");
+  ASSERT_EQ(values[2], "c");
+  ASSERT_EQ(values[3], "d");
+
+  // expected_max_number_of_operands is less than number of merge operands so
+  // status should be Incomplete.
+  merge_operands_info.expected_max_number_of_operands = num_records - 1;
+  Status status = db_->GetMergeOperands(
+      ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(),
+      &merge_operands_info, &number_of_operands);
+  ASSERT_EQ(status.IsIncomplete(), true);
+  merge_operands_info.expected_max_number_of_operands = num_records;
+
+  // All k2 values are flushed to L0 into a single file.
+  ASSERT_OK(Put("k2", "q"));
+  ASSERT_OK(Merge("k2", "w"));
+  ASSERT_OK(Merge("k2", "e"));
+  ASSERT_OK(Merge("k2", "r"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k2", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "q,w,e,r");
+
+  // Do some compaction that will make the following tests more predictable
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // All k3 values are flushed and are in different files.
+  ASSERT_OK(Put("k3", "ab"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3", "bc"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3", "cd"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3", "de"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k3", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "ab");
+  ASSERT_EQ(values[1], "bc");
+  ASSERT_EQ(values[2], "cd");
+  ASSERT_EQ(values[3], "de");
+
+  // All K4 values are in different levels
+  ASSERT_OK(Put("k4", "ba"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(4);
+  ASSERT_OK(Merge("k4", "cb"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(3);
+  ASSERT_OK(Merge("k4", "dc"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_OK(Merge("k4", "ed"));
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k4", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(values[0], "ba");
+  ASSERT_EQ(values[1], "cb");
+  ASSERT_EQ(values[2], "dc");
+  ASSERT_EQ(values[3], "ed");
+}
+
+TEST_F(DBMergeOperandTest, GetMergeOperandsLargeResultOptimization) {
+  // These constants are chosen to trigger the large result optimization
+  // (pinning a bundle of `DBImpl` resources).
+  const int kNumOperands = 1024;
+  const int kOperandLen = 1024;
+
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  std::vector<std::string> expected_merge_operands;
+  expected_merge_operands.reserve(kNumOperands);
+  for (int i = 0; i < kNumOperands; ++i) {
+    expected_merge_operands.emplace_back(rnd.RandomString(kOperandLen));
+    ASSERT_OK(Merge("key", expected_merge_operands.back()));
+  }
+
+  std::vector<PinnableSlice> merge_operands(kNumOperands);
+  GetMergeOperandsOptions merge_operands_info;
+  merge_operands_info.expected_max_number_of_operands = kNumOperands;
+  int num_merge_operands = 0;
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "key", merge_operands.data(),
+                                  &merge_operands_info, &num_merge_operands));
+  ASSERT_EQ(num_merge_operands, kNumOperands);
+
+  // Ensures the large result optimization was used.
+  for (int i = 0; i < kNumOperands; ++i) {
+    ASSERT_TRUE(merge_operands[i].IsPinned());
+  }
+
+  // Add a Flush() to change the `SuperVersion` to challenge the resource
+  // pinning.
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < kNumOperands; ++i) {
+    ASSERT_EQ(expected_merge_operands[i], merge_operands[i]);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operator_test.cc b/src/rocksdb/db/db_merge_operator_test.cc
new file mode 100644
index 000000000..7c5505bd1
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operator_test.cc
@@ -0,0 +1,669 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/merge_operator.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestReadCallback : public ReadCallback {
+ public:
+  TestReadCallback(SnapshotChecker* snapshot_checker,
+                   SequenceNumber snapshot_seq)
+      : ReadCallback(snapshot_seq),
+        snapshot_checker_(snapshot_checker),
+        snapshot_seq_(snapshot_seq) {}
+
+  bool IsVisibleFullCheck(SequenceNumber seq) override {
+    return snapshot_checker_->CheckInSnapshot(seq, snapshot_seq_) ==
+           SnapshotCheckerResult::kInSnapshot;
+  }
+
+ private:
+  SnapshotChecker* snapshot_checker_;
+  SequenceNumber snapshot_seq_;
+};
+
+// Test merge operator functionality.
+class DBMergeOperatorTest : public DBTestBase {
+ public:
+  DBMergeOperatorTest()
+      : DBTestBase("db_merge_operator_test", /*env_do_fsync=*/false) {}
+
+  std::string GetWithReadCallback(SnapshotChecker* snapshot_checker,
+                                  const Slice& key,
+                                  const Snapshot* snapshot = nullptr) {
+    SequenceNumber seq = snapshot == nullptr ? db_->GetLatestSequenceNumber()
+                                             : snapshot->GetSequenceNumber();
+    TestReadCallback read_callback(snapshot_checker, seq);
+    ReadOptions read_opt;
+    read_opt.snapshot = snapshot;
+    PinnableSlice value;
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = db_->DefaultColumnFamily();
+    get_impl_options.value = &value;
+    get_impl_options.callback = &read_callback;
+    Status s = dbfull()->GetImpl(read_opt, key, get_impl_options);
+    if (!s.ok()) {
+      return s.ToString();
+    }
+    return value.ToString();
+  }
+};
+
+TEST_F(DBMergeOperatorTest, LimitMergeOperands) {
+  class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+   public:
+    LimitedStringAppendMergeOp(int limit, char delim)
+        : StringAppendTESTOperator(delim), limit_(limit) {}
+
+    const char* Name() const override {
+      return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+    }
+
+    bool ShouldMerge(const std::vector<Slice>& operands) const override {
+      if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+        return true;
+      }
+      return false;
+    }
+
+   private:
+    size_t limit_ = 0;
+  };
+
+  Options options;
+  options.create_if_missing = true;
+  // Use only the latest two merge operands.
+  options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+  options.env = env_;
+  Reopen(options);
+  // All K1 values are in memtable.
+  ASSERT_OK(Merge("k1", "a"));
+  ASSERT_OK(Merge("k1", "b"));
+  ASSERT_OK(Merge("k1", "c"));
+  ASSERT_OK(Merge("k1", "d"));
+  std::string value;
+  ASSERT_OK(db_->Get(ReadOptions(), "k1", &value));
+  // Make sure that only the latest two merge operands are used. If this was
+  // not the case the value would be "a,b,c,d".
+  ASSERT_EQ(value, "c,d");
+
+  // All K2 values are flushed to L0 into a single file.
+  ASSERT_OK(Merge("k2", "a"));
+  ASSERT_OK(Merge("k2", "b"));
+  ASSERT_OK(Merge("k2", "c"));
+  ASSERT_OK(Merge("k2", "d"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Get(ReadOptions(), "k2", &value));
+  ASSERT_EQ(value, "c,d");
+
+  // All K3 values are flushed and are in different files.
+  ASSERT_OK(Merge("k3", "ab"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3", "bc"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3", "cd"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k3", "de"));
+  ASSERT_OK(db_->Get(ReadOptions(), "k3", &value));
+  ASSERT_EQ(value, "cd,de");
+
+  // All K4 values are in different levels
+  ASSERT_OK(Merge("k4", "ab"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(4);
+  ASSERT_OK(Merge("k4", "bc"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(3);
+  ASSERT_OK(Merge("k4", "cd"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_OK(Merge("k4", "de"));
+  ASSERT_OK(db_->Get(ReadOptions(), "k4", &value));
+  ASSERT_EQ(value, "cd,de");
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnRead) {
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator.reset(new TestPutOperator());
+  options.env = env_;
+  Reopen(options);
+  ASSERT_OK(Merge("k1", "v1"));
+  ASSERT_OK(Merge("k1", "corrupted"));
+  std::string value;
+  ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).IsCorruption());
+  VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnWrite) {
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator.reset(new TestPutOperator());
+  options.max_successive_merges = 3;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_OK(Merge("k1", "v1"));
+  ASSERT_OK(Merge("k1", "v2"));
+  // Will trigger a merge when hitting max_successive_merges and the merge
+  // will fail. The delta will be inserted nevertheless.
+  ASSERT_OK(Merge("k1", "corrupted"));
+  // Data should stay unmerged after the error.
+  VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v2"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) {
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator.reset(new TestPutOperator());
+  options.env = env_;
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Merge("k1", "v1"));
+  ASSERT_OK(Merge("k1", "corrupted"));
+  ASSERT_OK(Put("k2", "v2"));
+  auto* iter = db_->NewIterator(ReadOptions());
+  iter->Seek("k1");
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsCorruption());
+  delete iter;
+  iter = db_->NewIterator(ReadOptions());
+  iter->Seek("k2");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  iter->Prev();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsCorruption());
+  delete iter;
+  VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}, {"k2", "v2"}});
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Merge("k1", "v1"));
+  ASSERT_OK(Put("k2", "v2"));
+  ASSERT_OK(Merge("k2", "corrupted"));
+  iter = db_->NewIterator(ReadOptions());
+  iter->Seek("k1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsCorruption());
+  delete iter;
+  VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}});
+}
+
+class MergeOperatorPinningTest : public DBMergeOperatorTest,
+                                 public testing::WithParamInterface<bool> {
+ public:
+  MergeOperatorPinningTest() { disable_block_cache_ = GetParam(); }
+
+  bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(MergeOperatorPinningTest, MergeOperatorPinningTest,
+                        ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1;  // every block will contain one entry
+  table_options.no_block_cache = disable_block_cache_;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  const int kKeysPerFile = 10;
+  const int kOperandsPerKeyPerFile = 7;
+  const int kOperandSize = 100;
+  // Filse to write in L0 before compacting to lower level
+  const int kFilesPerLevel = 3;
+
+  Random rnd(301);
+  std::map<std::string, std::string> true_data;
+  int batch_num = 1;
+  int lvl_to_fill = 4;
+  int key_id = 0;
+  while (true) {
+    for (int j = 0; j < kKeysPerFile; j++) {
+      std::string key = Key(key_id % 35);
+      key_id++;
+      for (int k = 0; k < kOperandsPerKeyPerFile; k++) {
+        std::string val = rnd.RandomString(kOperandSize);
+        ASSERT_OK(db_->Merge(WriteOptions(), key, val));
+        if (true_data[key].size() == 0) {
+          true_data[key] = val;
+        } else {
+          true_data[key] += "," + val;
+        }
+      }
+    }
+
+    if (lvl_to_fill == -1) {
+      // Keep last batch in memtable and stop
+      break;
+    }
+
+    ASSERT_OK(Flush());
+    if (batch_num % kFilesPerLevel == 0) {
+      if (lvl_to_fill != 0) {
+        MoveFilesToLevel(lvl_to_fill);
+      }
+      lvl_to_fill--;
+    }
+    batch_num++;
+  }
+
+  // 3 L0 files
+  // 1 L1 file
+  // 3 L2 files
+  // 1 L3 file
+  // 3 L4 Files
+  ASSERT_EQ(FilesPerLevel(), "3,1,3,1,3");
+
+  VerifyDBFromMap(true_data);
+}
+
+class MergeOperatorHook : public MergeOperator {
+ public:
+  explicit MergeOperatorHook(std::shared_ptr<MergeOperator> _merge_op)
+      : merge_op_(_merge_op) {}
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    before_merge_();
+    bool res = merge_op_->FullMergeV2(merge_in, merge_out);
+    after_merge_();
+    return res;
+  }
+
+  const char* Name() const override { return merge_op_->Name(); }
+
+  std::shared_ptr<MergeOperator> merge_op_;
+  std::function<void()> before_merge_ = []() {};
+  std::function<void()> after_merge_ = []() {};
+};
+
+TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) {
+  Options options = CurrentOptions();
+
+  auto merge_hook =
+      std::make_shared<MergeOperatorHook>(MergeOperators::CreateMaxOperator());
+  options.merge_operator = merge_hook;
+  options.disable_auto_compactions = true;
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.max_open_files = 20;
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = disable_block_cache_;
+  if (bbto.no_block_cache == false) {
+    bbto.block_cache = NewLRUCache(64 * 1024 * 1024);
+  } else {
+    bbto.block_cache = nullptr;
+  }
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  const int kNumOperands = 30;
+  const int kNumKeys = 1000;
+  const int kOperandSize = 100;
+  Random rnd(301);
+
+  // 1000 keys every key have 30 operands, every operand is in a different file
+  std::map<std::string, std::string> true_data;
+  for (int i = 0; i < kNumOperands; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      std::string k = Key(j);
+      std::string v = rnd.RandomString(kOperandSize);
+      ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+
+      true_data[k] = std::max(true_data[k], v);
+    }
+    ASSERT_OK(Flush());
+  }
+
+  std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+  ASSERT_EQ(file_numbers.size(), kNumOperands);
+  int merge_cnt = 0;
+
+  // Code executed before merge operation
+  merge_hook->before_merge_ = [&]() {
+    // Evict all tables from cache before every merge operation
+    auto* table_cache = dbfull()->TEST_table_cache();
+    for (uint64_t num : file_numbers) {
+      TableCache::Evict(table_cache, num);
+    }
+    // Decrease cache capacity to force all unrefed blocks to be evicted
+    if (bbto.block_cache) {
+      bbto.block_cache->SetCapacity(1);
+    }
+    merge_cnt++;
+  };
+
+  // Code executed after merge operation
+  merge_hook->after_merge_ = [&]() {
+    // Increase capacity again after doing the merge
+    if (bbto.block_cache) {
+      bbto.block_cache->SetCapacity(64 * 1024 * 1024);
+    }
+  };
+
+  size_t total_reads;
+  VerifyDBFromMap(true_data, &total_reads);
+  ASSERT_EQ(merge_cnt, total_reads);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  VerifyDBFromMap(true_data, &total_reads);
+}
+
+TEST_P(MergeOperatorPinningTest, TailingIterator) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateMaxOperator();
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = disable_block_cache_;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  const int kNumOperands = 100;
+  const int kNumWrites = 100000;
+
+  std::function<void()> writer_func = [&]() {
+    int k = 0;
+    for (int i = 0; i < kNumWrites; i++) {
+      ASSERT_OK(db_->Merge(WriteOptions(), Key(k), Key(k)));
+
+      if (i && i % kNumOperands == 0) {
+        k++;
+      }
+      if (i && i % 127 == 0) {
+        ASSERT_OK(Flush());
+      }
+      if (i && i % 317 == 0) {
+        ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      }
+    }
+  };
+
+  std::function<void()> reader_func = [&]() {
+    ReadOptions ro;
+    ro.tailing = true;
+    Iterator* iter = db_->NewIterator(ro);
+    ASSERT_OK(iter->status());
+    iter->SeekToFirst();
+    for (int i = 0; i < (kNumWrites / kNumOperands); i++) {
+      while (!iter->Valid()) {
+        // wait for the key to be written
+        env_->SleepForMicroseconds(100);
+        iter->Seek(Key(i));
+      }
+      ASSERT_EQ(iter->key(), Key(i));
+      ASSERT_EQ(iter->value(), Key(i));
+
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+
+    delete iter;
+  };
+
+  ROCKSDB_NAMESPACE::port::Thread writer_thread(writer_func);
+  ROCKSDB_NAMESPACE::port::Thread reader_thread(reader_func);
+
+  writer_thread.join();
+  reader_thread.join();
+}
+
+TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  DestroyAndReopen(options);
+
+  // Overview of the test:
+  //  * There are two merge operands for the same key: one in an sst file,
+  //    another in a memtable.
+  //  * Seek a tailing iterator to this key.
+  //  * As part of the seek, the iterator will:
+  //      (a) first visit the operand in the memtable and tell ForwardIterator
+  //          to pin this operand, then
+  //      (b) move on to the operand in the sst file, then pass both operands
+  //          to merge operator.
+  //  * The memtable may get flushed and unreferenced by another thread between
+  //    (a) and (b). The test simulates it by flushing the memtable inside a
+  //    SyncPoint callback located between (a) and (b).
+  //  * In this case it's ForwardIterator's responsibility to keep the memtable
+  //    pinned until (b) is complete. There used to be a bug causing
+  //    ForwardIterator to not pin it in some circumstances. This test
+  //    reproduces it.
+
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "sst"));
+  ASSERT_OK(db_->Flush(FlushOptions()));  // Switch to SuperVersion A
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "memtable"));
+
+  // Pin SuperVersion A
+  std::unique_ptr<Iterator> someone_else(db_->NewIterator(ReadOptions()));
+  ASSERT_OK(someone_else->status());
+
+  bool pushed_first_operand = false;
+  bool stepped_to_next_operand = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) {
+        EXPECT_FALSE(pushed_first_operand);
+        pushed_first_operand = true;
+        EXPECT_OK(db_->Flush(FlushOptions()));  // Switch to SuperVersion B
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) {
+        EXPECT_FALSE(stepped_to_next_operand);
+        stepped_to_next_operand = true;
+        someone_else.reset();  // Unpin SuperVersion A
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ReadOptions ro;
+  ro.tailing = true;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+  iter->Seek("key");
+
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString());
+  EXPECT_TRUE(pushed_first_operand);
+  EXPECT_TRUE(stepped_to_next_operand);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  DestroyAndReopen(options);
+
+  class TestSnapshotChecker : public SnapshotChecker {
+   public:
+    SnapshotCheckerResult CheckInSnapshot(
+        SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+      return IsInSnapshot(seq, snapshot_seq)
+                 ? SnapshotCheckerResult::kInSnapshot
+                 : SnapshotCheckerResult::kNotInSnapshot;
+    }
+
+    bool IsInSnapshot(SequenceNumber seq, SequenceNumber snapshot_seq) const {
+      switch (snapshot_seq) {
+        case 0:
+          return seq == 0;
+        case 1:
+          return seq <= 1;
+        case 2:
+          // seq = 2 not visible to snapshot with seq = 2
+          return seq <= 1;
+        case 3:
+          return seq <= 3;
+        case 4:
+          // seq = 4 not visible to snpahost with seq = 4
+          return seq <= 3;
+        default:
+          // seq >=4 is uncommitted
+          return seq <= 4;
+      };
+    }
+  };
+  TestSnapshotChecker* snapshot_checker = new TestSnapshotChecker();
+  dbfull()->SetSnapshotChecker(snapshot_checker);
+
+  std::string value;
+  ASSERT_OK(Merge("foo", "v1"));
+  ASSERT_EQ(1, db_->GetLatestSequenceNumber());
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+  ASSERT_OK(Merge("foo", "v2"));
+  ASSERT_EQ(2, db_->GetLatestSequenceNumber());
+  // v2 is not visible to latest snapshot, which has seq = 2.
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+  // Take a snapshot with seq = 2.
+  const Snapshot* snapshot1 = db_->GetSnapshot();
+  ASSERT_EQ(2, snapshot1->GetSequenceNumber());
+  // v2 is not visible to snapshot1, which has seq = 2
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+
+  // Verify flush doesn't alter the result.
+  ASSERT_OK(Flush());
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+
+  ASSERT_OK(Merge("foo", "v3"));
+  ASSERT_EQ(3, db_->GetLatestSequenceNumber());
+  ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+  ASSERT_OK(Merge("foo", "v4"));
+  ASSERT_EQ(4, db_->GetLatestSequenceNumber());
+  // v4 is not visible to latest snapshot, which has seq = 4.
+  ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+  const Snapshot* snapshot2 = db_->GetSnapshot();
+  ASSERT_EQ(4, snapshot2->GetSequenceNumber());
+  // v4 is not visible to snapshot2, which has seq = 4.
+  ASSERT_EQ("v1,v2,v3",
+            GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+
+  // Verify flush doesn't alter the result.
+  ASSERT_OK(Flush());
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+  ASSERT_EQ("v1,v2,v3",
+            GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+  ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+
+  ASSERT_OK(Merge("foo", "v5"));
+  ASSERT_EQ(5, db_->GetLatestSequenceNumber());
+  // v5 is uncommitted
+  ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+  // full manual compaction.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Verify compaction doesn't alter the result.
+  ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+  ASSERT_EQ("v1,v2,v3",
+            GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+  ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+  db_->ReleaseSnapshot(snapshot1);
+  db_->ReleaseSnapshot(snapshot2);
+}
+
+class PerConfigMergeOperatorPinningTest
+    : public DBMergeOperatorTest,
+      public testing::WithParamInterface<std::tuple<bool, int>> {
+ public:
+  PerConfigMergeOperatorPinningTest() {
+    std::tie(disable_block_cache_, option_config_) = GetParam();
+  }
+
+  bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest,
+    ::testing::Combine(::testing::Bool(),
+                       ::testing::Range(static_cast<int>(DBTestBase::kDefault),
+                                        static_cast<int>(DBTestBase::kEnd))));
+
+TEST_P(PerConfigMergeOperatorPinningTest, Randomized) {
+  if (ShouldSkipOptions(option_config_, kSkipMergePut)) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateMaxOperator();
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = disable_block_cache_;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  std::map<std::string, std::string> true_data;
+
+  const int kTotalMerges = 5000;
+  // Every key gets ~10 operands
+  const int kKeyRange = kTotalMerges / 10;
+  const int kOperandSize = 20;
+  const int kNumPutBefore = kKeyRange / 10;  // 10% value
+  const int kNumPutAfter = kKeyRange / 10;   // 10% overwrite
+  const int kNumDelete = kKeyRange / 10;     // 10% delete
+
+  // kNumPutBefore keys will have base values
+  for (int i = 0; i < kNumPutBefore; i++) {
+    std::string key = Key(rnd.Next() % kKeyRange);
+    std::string value = rnd.RandomString(kOperandSize);
+    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+    true_data[key] = value;
+  }
+
+  // Do kTotalMerges merges
+  for (int i = 0; i < kTotalMerges; i++) {
+    std::string key = Key(rnd.Next() % kKeyRange);
+    std::string value = rnd.RandomString(kOperandSize);
+    ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+
+    if (true_data[key] < value) {
+      true_data[key] = value;
+    }
+  }
+
+  // Overwrite random kNumPutAfter keys
+  for (int i = 0; i < kNumPutAfter; i++) {
+    std::string key = Key(rnd.Next() % kKeyRange);
+    std::string value = rnd.RandomString(kOperandSize);
+    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+    true_data[key] = value;
+  }
+
+  // Delete random kNumDelete keys
+  for (int i = 0; i < kNumDelete; i++) {
+    std::string key = Key(rnd.Next() % kKeyRange);
+    ASSERT_OK(db_->Delete(WriteOptions(), key));
+
+    true_data.erase(key);
+  }
+
+  VerifyDBFromMap(true_data);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_options_test.cc b/src/rocksdb/db/db_options_test.cc
new file mode 100644
index 000000000..691081db9
--- /dev/null
+++ b/src/rocksdb/db/db_options_test.cc
@@ -0,0 +1,1219 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <limits>
+#include <string>
+#include <unordered_map>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/stats_history.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBOptionsTest : public DBTestBase {
+ public:
+  DBOptionsTest() : DBTestBase("db_options_test", /*env_do_fsync=*/true) {}
+
+#ifndef ROCKSDB_LITE
+  std::unordered_map<std::string, std::string> GetMutableDBOptionsMap(
+      const DBOptions& options) {
+    std::string options_str;
+    std::unordered_map<std::string, std::string> mutable_map;
+    ConfigOptions config_options(options);
+    config_options.delimiter = "; ";
+
+    EXPECT_OK(GetStringFromMutableDBOptions(
+        config_options, MutableDBOptions(options), &options_str));
+    EXPECT_OK(StringToMap(options_str, &mutable_map));
+
+    return mutable_map;
+  }
+
+  std::unordered_map<std::string, std::string> GetMutableCFOptionsMap(
+      const ColumnFamilyOptions& options) {
+    std::string options_str;
+    ConfigOptions config_options;
+    config_options.delimiter = "; ";
+
+    std::unordered_map<std::string, std::string> mutable_map;
+    EXPECT_OK(GetStringFromMutableCFOptions(
+        config_options, MutableCFOptions(options), &options_str));
+    EXPECT_OK(StringToMap(options_str, &mutable_map));
+    return mutable_map;
+  }
+
+  std::unordered_map<std::string, std::string> GetRandomizedMutableCFOptionsMap(
+      Random* rnd) {
+    Options options = CurrentOptions();
+    options.env = env_;
+    ImmutableDBOptions db_options(options);
+    test::RandomInitCFOptions(&options, options, rnd);
+    auto sanitized_options = SanitizeOptions(db_options, options);
+    auto opt_map = GetMutableCFOptionsMap(sanitized_options);
+    delete options.compaction_filter;
+    return opt_map;
+  }
+
+  std::unordered_map<std::string, std::string> GetRandomizedMutableDBOptionsMap(
+      Random* rnd) {
+    DBOptions db_options;
+    test::RandomInitDBOptions(&db_options, rnd);
+    auto sanitized_options = SanitizeOptions(dbname_, db_options);
+    return GetMutableDBOptionsMap(sanitized_options);
+  }
+#endif  // ROCKSDB_LITE
+};
+
+TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) {
+  Options options;
+  options.env = env_;
+  options.track_and_verify_wals_in_manifest = true;
+
+  ImmutableDBOptions db_options(options);
+  ASSERT_TRUE(db_options.track_and_verify_wals_in_manifest);
+
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetDBOptions().track_and_verify_wals_in_manifest);
+
+  Status s =
+      dbfull()->SetDBOptions({{"track_and_verify_wals_in_manifest", "false"}});
+  ASSERT_FALSE(s.ok());
+}
+
+TEST_F(DBOptionsTest, ImmutableVerifySstUniqueIdInManifest) {
+  Options options;
+  options.env = env_;
+  options.verify_sst_unique_id_in_manifest = true;
+
+  ImmutableDBOptions db_options(options);
+  ASSERT_TRUE(db_options.verify_sst_unique_id_in_manifest);
+
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetDBOptions().verify_sst_unique_id_in_manifest);
+
+  Status s =
+      dbfull()->SetDBOptions({{"verify_sst_unique_id_in_manifest", "false"}});
+  ASSERT_FALSE(s.ok());
+}
+
+// RocksDB lite don't support dynamic options.
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBOptionsTest, AvoidUpdatingOptions) {
+  Options options;
+  options.env = env_;
+  options.max_background_jobs = 4;
+  options.delayed_write_rate = 1024;
+
+  Reopen(options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  bool is_changed_stats = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::WriteOptionsFile:PersistOptions", [&](void* /*arg*/) {
+        ASSERT_FALSE(is_changed_stats);  // should only save options file once
+        is_changed_stats = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // helper function to check the status and reset after each check
+  auto is_changed = [&] {
+    bool ret = is_changed_stats;
+    is_changed_stats = false;
+    return ret;
+  };
+
+  // without changing the value, but it's sanitized to a different value
+  ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "0"}}));
+  ASSERT_TRUE(is_changed());
+
+  // without changing the value
+  ASSERT_OK(dbfull()->SetDBOptions({{"max_background_jobs", "4"}}));
+  ASSERT_FALSE(is_changed());
+
+  // changing the value
+  ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}}));
+  ASSERT_TRUE(is_changed());
+
+  // update again
+  ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}}));
+  ASSERT_FALSE(is_changed());
+
+  // without changing a default value
+  ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "false"}}));
+  ASSERT_FALSE(is_changed());
+
+  // now change
+  ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "true"}}));
+  ASSERT_TRUE(is_changed());
+
+  // multiple values without change
+  ASSERT_OK(dbfull()->SetDBOptions(
+      {{"max_total_wal_size", "0"}, {"stats_dump_period_sec", "600"}}));
+  ASSERT_FALSE(is_changed());
+
+  // multiple values with change
+  ASSERT_OK(dbfull()->SetDBOptions(
+      {{"max_open_files", "100"}, {"stats_dump_period_sec", "600"}}));
+  ASSERT_TRUE(is_changed());
+}
+
+TEST_F(DBOptionsTest, GetLatestDBOptions) {
+  // GetOptions should be able to get latest option changed by SetOptions.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  Random rnd(228);
+  Reopen(options);
+  auto new_options = GetRandomizedMutableDBOptionsMap(&rnd);
+  ASSERT_OK(dbfull()->SetDBOptions(new_options));
+  ASSERT_EQ(new_options, GetMutableDBOptionsMap(dbfull()->GetDBOptions()));
+}
+
+TEST_F(DBOptionsTest, GetLatestCFOptions) {
+  // GetOptions should be able to get latest option changed by SetOptions.
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  Random rnd(228);
+  Reopen(options);
+  CreateColumnFamilies({"foo"}, options);
+  ReopenWithColumnFamilies({"default", "foo"}, options);
+  auto options_default = GetRandomizedMutableCFOptionsMap(&rnd);
+  auto options_foo = GetRandomizedMutableCFOptionsMap(&rnd);
+  ASSERT_OK(dbfull()->SetOptions(handles_[0], options_default));
+  ASSERT_OK(dbfull()->SetOptions(handles_[1], options_foo));
+  ASSERT_EQ(options_default,
+            GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[0])));
+  ASSERT_EQ(options_foo,
+            GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1])));
+}
+
+TEST_F(DBOptionsTest, SetMutableTableOptions) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  options.blob_file_size = 16384;
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = true;
+  bbto.block_size = 8192;
+  bbto.block_restart_interval = 7;
+
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  Options c_opts = dbfull()->GetOptions(cfh);
+
+  const auto* c_bbto =
+      c_opts.table_factory->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(c_bbto, nullptr);
+  ASSERT_EQ(c_opts.blob_file_size, 16384);
+  ASSERT_EQ(c_bbto->no_block_cache, true);
+  ASSERT_EQ(c_bbto->block_size, 8192);
+  ASSERT_EQ(c_bbto->block_restart_interval, 7);
+  ASSERT_OK(dbfull()->SetOptions(
+      cfh, {{"table_factory.block_size", "16384"},
+            {"table_factory.block_restart_interval", "11"}}));
+  ASSERT_EQ(c_bbto->block_size, 16384);
+  ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+  // Now set an option that is not mutable - options should not change
+  ASSERT_NOK(
+      dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}}));
+  ASSERT_EQ(c_bbto->no_block_cache, true);
+  ASSERT_EQ(c_bbto->block_size, 16384);
+  ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+  // Set some that are mutable and some that are not - options should not change
+  ASSERT_NOK(dbfull()->SetOptions(
+      cfh, {{"table_factory.no_block_cache", "false"},
+            {"table_factory.block_size", "8192"},
+            {"table_factory.block_restart_interval", "7"}}));
+  ASSERT_EQ(c_bbto->no_block_cache, true);
+  ASSERT_EQ(c_bbto->block_size, 16384);
+  ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+  // Set some that are mutable and some that do not exist - options should not
+  // change
+  ASSERT_NOK(dbfull()->SetOptions(
+      cfh, {{"table_factory.block_size", "8192"},
+            {"table_factory.does_not_exist", "true"},
+            {"table_factory.block_restart_interval", "7"}}));
+  ASSERT_EQ(c_bbto->no_block_cache, true);
+  ASSERT_EQ(c_bbto->block_size, 16384);
+  ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+  // Trying to change the table factory fails
+  ASSERT_NOK(dbfull()->SetOptions(
+      cfh, {{"table_factory", TableFactory::kPlainTableName()}}));
+
+  // Set some on the table and some on the Column Family
+  ASSERT_OK(dbfull()->SetOptions(
+      cfh, {{"table_factory.block_size", "16384"},
+            {"blob_file_size", "32768"},
+            {"table_factory.block_restart_interval", "13"}}));
+  c_opts = dbfull()->GetOptions(cfh);
+  ASSERT_EQ(c_opts.blob_file_size, 32768);
+  ASSERT_EQ(c_bbto->block_size, 16384);
+  ASSERT_EQ(c_bbto->block_restart_interval, 13);
+  // Set some on the table and a bad one on the ColumnFamily - options should
+  // not change
+  ASSERT_NOK(dbfull()->SetOptions(
+      cfh, {{"table_factory.block_size", "1024"},
+            {"no_such_option", "32768"},
+            {"table_factory.block_restart_interval", "7"}}));
+  ASSERT_EQ(c_bbto->block_size, 16384);
+  ASSERT_EQ(c_bbto->block_restart_interval, 13);
+}
+
+TEST_F(DBOptionsTest, SetWithCustomMemTableFactory) {
+  class DummySkipListFactory : public SkipListFactory {
+   public:
+    static const char* kClassName() { return "DummySkipListFactory"; }
+    const char* Name() const override { return kClassName(); }
+    explicit DummySkipListFactory() : SkipListFactory(2) {}
+  };
+  {
+    // Verify the DummySkipList cannot be created
+    ConfigOptions config_options;
+    config_options.ignore_unsupported_options = false;
+    std::unique_ptr<MemTableRepFactory> factory;
+    ASSERT_NOK(MemTableRepFactory::CreateFromString(
+        config_options, DummySkipListFactory::kClassName(), &factory));
+  }
+  Options options;
+  options.create_if_missing = true;
+  // Try with fail_if_options_file_error=false/true to update the options
+  for (bool on_error : {false, true}) {
+    options.fail_if_options_file_error = on_error;
+    options.env = env_;
+    options.disable_auto_compactions = false;
+
+    options.memtable_factory.reset(new DummySkipListFactory());
+    Reopen(options);
+
+    ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+    ASSERT_OK(
+        dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}}));
+    ColumnFamilyDescriptor cfd;
+    ASSERT_OK(cfh->GetDescriptor(&cfd));
+    ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+                 DummySkipListFactory::kClassName());
+    ColumnFamilyHandle* test = nullptr;
+    ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test));
+    ASSERT_OK(test->GetDescriptor(&cfd));
+    ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+                 DummySkipListFactory::kClassName());
+
+    ASSERT_OK(dbfull()->DropColumnFamily(test));
+    delete test;
+  }
+}
+
+TEST_F(DBOptionsTest, SetBytesPerSync) {
+  const size_t kValueSize = 1024 * 1024;  // 1MB
+  Options options;
+  options.create_if_missing = true;
+  options.bytes_per_sync = 1024 * 1024;
+  options.use_direct_reads = false;
+  options.write_buffer_size = 400 * kValueSize;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = env_;
+  Reopen(options);
+  int counter = 0;
+  int low_bytes_per_sync = 0;
+  int i = 0;
+  const std::string kValue(kValueSize, 'v');
+  ASSERT_EQ(options.bytes_per_sync, dbfull()->GetDBOptions().bytes_per_sync);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; });
+
+  WriteOptions write_opts;
+  // should sync approximately 40MB/1MB ~= 40 times.
+  for (i = 0; i < 40; i++) {
+    ASSERT_OK(Put(Key(i), kValue, write_opts));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  low_bytes_per_sync = counter;
+  ASSERT_GT(low_bytes_per_sync, 35);
+  ASSERT_LT(low_bytes_per_sync, 45);
+
+  counter = 0;
+  // 8388608 = 8 * 1024 * 1024
+  ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "8388608"}}));
+  ASSERT_EQ(8388608, dbfull()->GetDBOptions().bytes_per_sync);
+  // should sync approximately 40MB*2/8MB ~= 10 times.
+  // data will be 40*2MB because of previous Puts too.
+  for (i = 0; i < 40; i++) {
+    ASSERT_OK(Put(Key(i), kValue, write_opts));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_GT(counter, 5);
+  ASSERT_LT(counter, 15);
+
+  // Redundant assert. But leaving it here just to get the point across that
+  // low_bytes_per_sync > counter.
+  ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, SetWalBytesPerSync) {
+  const size_t kValueSize = 1024 * 1024 * 3;
+  Options options;
+  options.create_if_missing = true;
+  options.wal_bytes_per_sync = 512;
+  options.write_buffer_size = 100 * kValueSize;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync);
+  std::atomic_int counter{0};
+  int low_bytes_per_sync = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WritableFileWriter::RangeSync:0",
+      [&](void* /*arg*/) { counter.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  const std::string kValue(kValueSize, 'v');
+  int i = 0;
+  for (; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), kValue));
+  }
+  // Do not flush. If we flush here, SwitchWAL will reuse old WAL file since its
+  // empty and will not get the new wal_bytes_per_sync value.
+  low_bytes_per_sync = counter;
+  // 5242880 = 1024 * 1024 * 5
+  ASSERT_OK(dbfull()->SetDBOptions({{"wal_bytes_per_sync", "5242880"}}));
+  ASSERT_EQ(5242880, dbfull()->GetDBOptions().wal_bytes_per_sync);
+  counter = 0;
+  i = 0;
+  for (; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), kValue));
+  }
+  ASSERT_GT(counter, 0);
+  ASSERT_GT(low_bytes_per_sync, 0);
+  ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
+  Options options;
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 1024 * 1024;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_manifest_file_size = 1;
+  options.env = env_;
+  int buffer_size = 1024 * 1024;
+  Reopen(options);
+  ASSERT_EQ(buffer_size,
+            dbfull()->GetDBOptions().writable_file_max_buffer_size);
+
+  std::atomic<int> match_cnt(0);
+  std::atomic<int> unmatch_cnt(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WritableFileWriter::WritableFileWriter:0", [&](void* arg) {
+        int value = static_cast<int>(reinterpret_cast<uintptr_t>(arg));
+        if (value == buffer_size) {
+          match_cnt++;
+        } else {
+          unmatch_cnt++;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  int i = 0;
+  for (; i < 3; i++) {
+    ASSERT_OK(Put("foo", std::to_string(i)));
+    ASSERT_OK(Put("bar", std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(unmatch_cnt, 0);
+  ASSERT_GE(match_cnt, 11);
+
+  ASSERT_OK(
+      dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}}));
+  buffer_size = 512 * 1024;
+  match_cnt = 0;
+  unmatch_cnt = 0;  // SetDBOptions() will create a WritableFileWriter
+
+  ASSERT_EQ(buffer_size,
+            dbfull()->GetDBOptions().writable_file_max_buffer_size);
+  i = 0;
+  for (; i < 3; i++) {
+    ASSERT_OK(Put("foo", std::to_string(i)));
+    ASSERT_OK(Put("bar", std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(unmatch_cnt, 0);
+  ASSERT_GE(match_cnt, 11);
+}
+
+TEST_F(DBOptionsTest, SetOptionsAndReopen) {
+  Random rnd(1044);
+  auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd);
+  ASSERT_OK(dbfull()->SetOptions(rand_opts));
+  // Verify if DB can be reopen after setting options.
+  Options options;
+  options.env = env_;
+  ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) {
+  const std::string kValue(1024, 'v');
+  for (int method_type = 0; method_type < 2; method_type++) {
+    for (int option_type = 0; option_type < 4; option_type++) {
+      Options options;
+      options.create_if_missing = true;
+      options.disable_auto_compactions = true;
+      options.write_buffer_size = 1024 * 1024 * 10;
+      options.compression = CompressionType::kNoCompression;
+      options.level0_file_num_compaction_trigger = 1;
+      options.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+      options.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+      options.hard_pending_compaction_bytes_limit =
+          std::numeric_limits<uint64_t>::max();
+      options.soft_pending_compaction_bytes_limit =
+          std::numeric_limits<uint64_t>::max();
+      options.env = env_;
+
+      DestroyAndReopen(options);
+      int i = 0;
+      for (; i < 1024; i++) {
+        ASSERT_OK(Put(Key(i), kValue));
+      }
+      ASSERT_OK(Flush());
+      for (; i < 1024 * 2; i++) {
+        ASSERT_OK(Put(Key(i), kValue));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      ASSERT_EQ(2, NumTableFilesAtLevel(0));
+      uint64_t l0_size = SizeAtLevel(0);
+
+      switch (option_type) {
+        case 0:
+          // test with level0_stop_writes_trigger
+          options.level0_stop_writes_trigger = 2;
+          options.level0_slowdown_writes_trigger = 2;
+          break;
+        case 1:
+          options.level0_slowdown_writes_trigger = 2;
+          break;
+        case 2:
+          options.hard_pending_compaction_bytes_limit = l0_size;
+          options.soft_pending_compaction_bytes_limit = l0_size;
+          break;
+        case 3:
+          options.soft_pending_compaction_bytes_limit = l0_size;
+          break;
+      }
+      Reopen(options);
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+      ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+
+      SyncPoint::GetInstance()->LoadDependency(
+          {{"DBOptionsTest::EnableAutoCompactionAndTriggerStall:1",
+            "BackgroundCallCompaction:0"},
+           {"DBImpl::BackgroundCompaction():BeforePickCompaction",
+            "DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"},
+           {"DBOptionsTest::EnableAutoCompactionAndTriggerStall:3",
+            "DBImpl::BackgroundCompaction():AfterPickCompaction"}});
+      // Block background compaction.
+      SyncPoint::GetInstance()->EnableProcessing();
+
+      switch (method_type) {
+        case 0:
+          ASSERT_OK(
+              dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+          break;
+        case 1:
+          ASSERT_OK(dbfull()->EnableAutoCompaction(
+              {dbfull()->DefaultColumnFamily()}));
+          break;
+      }
+      TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:1");
+      // Wait for stall condition recalculate.
+      TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:2");
+
+      switch (option_type) {
+        case 0:
+          ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+          break;
+        case 1:
+          ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+          ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+          break;
+        case 2:
+          ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+          break;
+        case 3:
+          ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+          ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+          break;
+      }
+      TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3");
+
+      // Background compaction executed.
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+      ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+    }
+  }
+}
+
+TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) {
+  Options options;
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 1000;
+  options.env = env_;
+  Reopen(options);
+  for (int i = 0; i < 3; i++) {
+    // Need to insert two keys to avoid trivial move.
+    ASSERT_OK(Put("foo", std::to_string(i)));
+    ASSERT_OK(Put("bar", std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("3", FilesPerLevel());
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,1", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundCompactionThreads) {
+  Options options;
+  options.create_if_missing = true;
+  options.max_background_compactions = 1;  // default value
+  options.env = env_;
+  Reopen(options);
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+  ASSERT_OK(dbfull()->SetDBOptions({{"max_background_compactions", "3"}}));
+  ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+  auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+  ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundFlushThreads) {
+  Options options;
+  options.create_if_missing = true;
+  options.max_background_flushes = 1;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_EQ(1, dbfull()->TEST_BGFlushesAllowed());
+  ASSERT_EQ(1, env_->GetBackgroundThreads(Env::Priority::HIGH));
+  ASSERT_OK(dbfull()->SetDBOptions({{"max_background_flushes", "3"}}));
+  ASSERT_EQ(3, env_->GetBackgroundThreads(Env::Priority::HIGH));
+  ASSERT_EQ(3, dbfull()->TEST_BGFlushesAllowed());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundJobs) {
+  Options options;
+  options.create_if_missing = true;
+  options.max_background_jobs = 8;
+  options.env = env_;
+  Reopen(options);
+
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      options.max_background_jobs = 12;
+      ASSERT_OK(dbfull()->SetDBOptions(
+          {{"max_background_jobs",
+            std::to_string(options.max_background_jobs)}}));
+    }
+
+    const int expected_max_flushes = options.max_background_jobs / 4;
+
+    ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+    ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+    auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+
+    const int expected_max_compactions = 3 * expected_max_flushes;
+
+    ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+    ASSERT_EQ(expected_max_compactions, dbfull()->TEST_BGCompactionsAllowed());
+
+    ASSERT_EQ(expected_max_flushes,
+              env_->GetBackgroundThreads(Env::Priority::HIGH));
+    ASSERT_EQ(expected_max_compactions,
+              env_->GetBackgroundThreads(Env::Priority::LOW));
+  }
+}
+
+TEST_F(DBOptionsTest, AvoidFlushDuringShutdown) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  WriteOptions write_without_wal;
+  write_without_wal.disableWAL = true;
+
+  ASSERT_FALSE(options.avoid_flush_during_shutdown);
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v1", write_without_wal));
+  Reopen(options);
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_EQ("1", FilesPerLevel());
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v2", write_without_wal));
+  ASSERT_OK(dbfull()->SetDBOptions({{"avoid_flush_during_shutdown", "true"}}));
+  Reopen(options);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ("", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetDelayedWriteRateOption) {
+  Options options;
+  options.create_if_missing = true;
+  options.delayed_write_rate = 2 * 1024U * 1024U;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_EQ(2 * 1024U * 1024U,
+            dbfull()->TEST_write_controler().max_delayed_write_rate());
+
+  ASSERT_OK(dbfull()->SetDBOptions({{"delayed_write_rate", "20000"}}));
+  ASSERT_EQ(20000, dbfull()->TEST_write_controler().max_delayed_write_rate());
+}
+
+TEST_F(DBOptionsTest, MaxTotalWalSizeChange) {
+  Random rnd(1044);
+  const auto value_size = size_t(1024);
+  std::string value = rnd.RandomString(value_size);
+
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  CreateColumnFamilies({"1", "2", "3"}, options);
+  ReopenWithColumnFamilies({"default", "1", "2", "3"}, options);
+
+  WriteOptions write_options;
+
+  const int key_count = 100;
+  for (int i = 0; i < key_count; ++i) {
+    for (size_t cf = 0; cf < handles_.size(); ++cf) {
+      ASSERT_OK(Put(static_cast<int>(cf), Key(i), value));
+    }
+  }
+  ASSERT_OK(dbfull()->SetDBOptions({{"max_total_wal_size", "10"}}));
+
+  for (size_t cf = 0; cf < handles_.size(); ++cf) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+    ASSERT_EQ("1", FilesPerLevel(static_cast<int>(cf)));
+  }
+}
+
+TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) {
+  Options options;
+  options.create_if_missing = true;
+  options.stats_dump_period_sec = 5;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
+
+  for (int i = 0; i < 20; i++) {
+    unsigned int num = rand() % 5000 + 1;
+    ASSERT_OK(dbfull()->SetDBOptions(
+        {{"stats_dump_period_sec", std::to_string(num)}}));
+    ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec);
+  }
+  Close();
+}
+
+TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) {
+  Options options;
+  options.create_if_missing = true;
+  options.stats_persist_period_sec = 5;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+  ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}}));
+  ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+  ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}}));
+  ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+}
+
+static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) {
+  dbfull->TEST_LockMutex();
+  JobContext job_context(0);
+  dbfull->FindObsoleteFiles(&job_context, false);
+  ASSERT_EQ(empty, job_context.full_scan_candidate_files.empty());
+  dbfull->TEST_UnlockMutex();
+  if (job_context.HaveSomethingToDelete()) {
+    // fulfill the contract of FindObsoleteFiles by calling PurgeObsoleteFiles
+    // afterwards; otherwise the test may hang on shutdown
+    dbfull->PurgeObsoleteFiles(job_context);
+  }
+  job_context.Clean();
+}
+
+TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) {
+  Options options;
+  options.env = env_;
+  SetTimeElapseOnlySleepOnReopen(&options);
+  options.create_if_missing = true;
+  ASSERT_OK(TryReopen(options));
+
+  // Verify that candidate files set is empty when no full scan requested.
+  assert_candidate_files_empty(dbfull(), true);
+
+  ASSERT_OK(
+      dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "0"}}));
+
+  // After delete_obsolete_files_period_micros updated to 0, the next call
+  // to FindObsoleteFiles should make a full scan
+  assert_candidate_files_empty(dbfull(), false);
+
+  ASSERT_OK(
+      dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "20"}}));
+
+  assert_candidate_files_empty(dbfull(), true);
+
+  env_->MockSleepForMicroseconds(20);
+  assert_candidate_files_empty(dbfull(), true);
+
+  env_->MockSleepForMicroseconds(1);
+  assert_candidate_files_empty(dbfull(), false);
+
+  Close();
+}
+
+TEST_F(DBOptionsTest, MaxOpenFilesChange) {
+  SpecialEnv env(env_);
+  Options options;
+  options.env = CurrentOptions().env;
+  options.max_open_files = -1;
+
+  Reopen(options);
+
+  Cache* tc = dbfull()->TEST_table_cache();
+
+  ASSERT_EQ(-1, dbfull()->GetDBOptions().max_open_files);
+  ASSERT_LT(2000, tc->GetCapacity());
+  ASSERT_OK(dbfull()->SetDBOptions({{"max_open_files", "1024"}}));
+  ASSERT_EQ(1024, dbfull()->GetDBOptions().max_open_files);
+  // examine the table cache (actual size should be 1014)
+  ASSERT_GT(1500, tc->GetCapacity());
+  Close();
+}
+
+TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.delayed_write_rate = 0;
+  Reopen(options);
+  ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+
+  options.rate_limiter.reset(NewGenericRateLimiter(31 * 1024 * 1024));
+  Reopen(options);
+  ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+}
+
+TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.compaction_style = kCompactionStyleUniversal;
+
+  options.ttl = 0;
+  options.periodic_compaction_seconds = 0;
+  Reopen(options);
+  ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+  ASSERT_EQ(0, dbfull()->GetOptions().periodic_compaction_seconds);
+
+  options.ttl = 0;
+  options.periodic_compaction_seconds = 100;
+  Reopen(options);
+  ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+  ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+  options.ttl = 100;
+  options.periodic_compaction_seconds = 0;
+  Reopen(options);
+  ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+  ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+  options.ttl = 100;
+  options.periodic_compaction_seconds = 500;
+  Reopen(options);
+  ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+  ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBOptionsTest, SanitizeTtlDefault) {
+  Options options;
+  options.env = CurrentOptions().env;
+  Reopen(options);
+  ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+  options.compaction_style = kCompactionStyleLevel;
+  options.ttl = 0;
+  Reopen(options);
+  ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+
+  options.ttl = 100;
+  Reopen(options);
+  ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.env = CurrentOptions().env;
+  options.ttl = 0;
+  Reopen(options);
+  ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+  options.ttl = 100;
+  Reopen(options);
+  ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+
+  options.ttl = 100 * 24 * 60 * 60;
+  Reopen(options);
+  ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+  options.ttl = 200;
+  options.periodic_compaction_seconds = 300;
+  Reopen(options);
+  ASSERT_EQ(200, dbfull()->GetOptions().ttl);
+
+  options.ttl = 500;
+  options.periodic_compaction_seconds = 300;
+  Reopen(options);
+  ASSERT_EQ(300, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SetFIFOCompactionOptions) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 10 << 10;  // 10KB
+  options.arena_block_size = 4096;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.compaction_options_fifo.allow_compaction = false;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  // Test dynamically changing ttl.
+  options.ttl = 1 * 60 * 60;  // 1 hour
+  ASSERT_OK(TryReopen(options));
+
+  Random rnd(301);
+  for (int i = 0; i < 10; i++) {
+    // Generate and flush a file about 10KB.
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  env_->MockSleepForSeconds(61);
+
+  // No files should be compacted as ttl is set to 1 hour.
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 3600);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // Set ttl to 1 minute. So all files should get deleted.
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}}));
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  // Test dynamically changing compaction_options_fifo.max_table_files_size
+  options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 00KB
+  options.ttl = 0;
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 10; i++) {
+    // Generate and flush a file about 10KB.
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // No files should be compacted as max_table_files_size is set to 500 KB.
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            500 << 10);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // Set max_table_files_size to 12 KB. So only 1 file should remain now.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo", "{max_table_files_size=12288;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            12 << 10);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // Test dynamically changing compaction_options_fifo.allow_compaction
+  options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 500KB
+  options.ttl = 0;
+  options.compaction_options_fifo.allow_compaction = false;
+  options.level0_file_num_compaction_trigger = 6;
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 10; i++) {
+    // Generate and flush a file about 10KB.
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // No files should be compacted as max_table_files_size is set to 500 KB and
+  // allow_compaction is false
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            false);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // Set allow_compaction to true. So number of files should be between 1 and 5.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_GE(NumTableFilesAtLevel(0), 1);
+  ASSERT_LE(NumTableFilesAtLevel(0), 5);
+}
+
+TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) {
+  SpecialEnv env(env_);
+  Options options;
+  options.env = &env;
+
+  options.compaction_readahead_size = 0;
+  options.level0_file_num_compaction_trigger = 2;
+  const std::string kValue(1024, 'v');
+  Reopen(options);
+
+  ASSERT_EQ(0, dbfull()->GetDBOptions().compaction_readahead_size);
+  ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}}));
+  ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size);
+  for (int i = 0; i < 1024; i++) {
+    ASSERT_OK(Put(Key(i), kValue));
+  }
+  ASSERT_OK(Flush());
+  for (int i = 0; i < 1024 * 2; i++) {
+    ASSERT_OK(Put(Key(i), kValue));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(256, env_->compaction_readahead_size_);
+  Close();
+}
+
+TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 10 << 10;  // 10KB
+  options.create_if_missing = true;
+  options.env = CurrentOptions().env;
+
+  ASSERT_OK(TryReopen(options));
+
+  Random rnd(301);
+  for (int i = 0; i < 10; i++) {
+    // Generate and flush a file about 10KB.
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // In release 6.0, ttl was promoted from a secondary level option under
+  // compaction_options_fifo to a top level option under ColumnFamilyOptions.
+  // We still need to handle old SetOptions calls but should ignore
+  // ttl under compaction_options_fifo.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo",
+        "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"},
+       {"ttl", "60"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            1024);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+
+  // Put ttl as the first option inside compaction_options_fifo. That works as
+  // it doesn't overwrite any other option.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo",
+        "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"},
+       {"ttl", "191"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            1024);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 191);
+}
+
+TEST_F(DBOptionsTest, ChangeCompression) {
+  if (!Snappy_Supported() || !LZ4_Supported()) {
+    return;
+  }
+  Options options;
+  options.write_buffer_size = 10 << 10;  // 10KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.create_if_missing = true;
+  options.compression = CompressionType::kLZ4Compression;
+  options.bottommost_compression = CompressionType::kNoCompression;
+  options.bottommost_compression_opts.level = 2;
+  options.bottommost_compression_opts.parallel_threads = 1;
+  options.env = CurrentOptions().env;
+
+  ASSERT_OK(TryReopen(options));
+
+  CompressionType compression_used = CompressionType::kLZ4Compression;
+  CompressionOptions compression_opt_used;
+  bool compacted = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* c = reinterpret_cast<Compaction*>(arg);
+        compression_used = c->output_compression();
+        compression_opt_used = c->output_compression_opts();
+        compacted = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("foo", "foofoofoo"));
+  ASSERT_OK(Put("bar", "foofoofoo"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "foofoofoo"));
+  ASSERT_OK(Put("bar", "foofoofoo"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(compacted);
+  ASSERT_EQ(CompressionType::kNoCompression, compression_used);
+  ASSERT_EQ(options.compression_opts.level, compression_opt_used.level);
+  ASSERT_EQ(options.compression_opts.parallel_threads,
+            compression_opt_used.parallel_threads);
+
+  compression_used = CompressionType::kLZ4Compression;
+  compacted = false;
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"bottommost_compression", "kSnappyCompression"},
+       {"bottommost_compression_opts", "0:6:0:0:4:true"}}));
+  ASSERT_OK(Put("foo", "foofoofoo"));
+  ASSERT_OK(Put("bar", "foofoofoo"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "foofoofoo"));
+  ASSERT_OK(Put("bar", "foofoofoo"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(compacted);
+  ASSERT_EQ(CompressionType::kSnappyCompression, compression_used);
+  ASSERT_EQ(6, compression_opt_used.level);
+  // Right now parallel_level is not yet allowed to be changed.
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) {
+  // Verify the bottommost compression options still take effect even when the
+  // bottommost compression type is left at its default value. Verify for both
+  // automatic and manual compaction.
+  if (!Snappy_Supported() || !LZ4_Supported()) {
+    return;
+  }
+
+  constexpr int kUpperCompressionLevel = 1;
+  constexpr int kBottommostCompressionLevel = 2;
+  constexpr int kNumL0Files = 2;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.compression = CompressionType::kLZ4Compression;
+  options.compression_opts.level = kUpperCompressionLevel;
+  options.bottommost_compression_opts.level = kBottommostCompressionLevel;
+  options.bottommost_compression_opts.enabled = true;
+  Reopen(options);
+
+  CompressionType compression_used = CompressionType::kDisableCompressionOption;
+  CompressionOptions compression_opt_used;
+  bool compacted = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionPicker::RegisterCompaction:Registered", [&](void* arg) {
+        Compaction* c = static_cast<Compaction*>(arg);
+        compression_used = c->output_compression();
+        compression_opt_used = c->output_compression_opts();
+        compacted = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // First, verify for automatic compaction.
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put("foo", "foofoofoo"));
+    ASSERT_OK(Put("bar", "foofoofoo"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_TRUE(compacted);
+  ASSERT_EQ(CompressionType::kLZ4Compression, compression_used);
+  ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level);
+
+  // Second, verify for manual compaction.
+  compacted = false;
+  compression_used = CompressionType::kDisableCompressionOption;
+  compression_opt_used = CompressionOptions();
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_TRUE(compacted);
+  ASSERT_EQ(CompressionType::kLZ4Compression, compression_used);
+  ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_properties_test.cc b/src/rocksdb/db/db_properties_test.cc
new file mode 100644
index 000000000..85cd5c04e
--- /dev/null
+++ b/src/rocksdb/db/db_properties_test.cc
@@ -0,0 +1,2206 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "options/cf_options.h"
+#include "port/stack_trace.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/perf_level.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+#include "test_util/mock_time_env.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBPropertiesTest : public DBTestBase {
+ public:
+  DBPropertiesTest()
+      : DBTestBase("db_properties_test", /*env_do_fsync=*/false) {}
+
+  void AssertDbStats(const std::map<std::string, std::string>& db_stats,
+                     double expected_uptime, int expected_user_bytes_written,
+                     int expected_wal_bytes_written,
+                     int expected_user_writes_by_self,
+                     int expected_user_writes_with_wal) {
+    ASSERT_EQ(std::to_string(expected_uptime), db_stats.at("db.uptime"));
+    ASSERT_EQ(std::to_string(expected_wal_bytes_written),
+              db_stats.at("db.wal_bytes_written"));
+    ASSERT_EQ("0", db_stats.at("db.wal_syncs"));
+    ASSERT_EQ(std::to_string(expected_user_bytes_written),
+              db_stats.at("db.user_bytes_written"));
+    ASSERT_EQ("0", db_stats.at("db.user_writes_by_other"));
+    ASSERT_EQ(std::to_string(expected_user_writes_by_self),
+              db_stats.at("db.user_writes_by_self"));
+    ASSERT_EQ(std::to_string(expected_user_writes_with_wal),
+              db_stats.at("db.user_writes_with_wal"));
+    ASSERT_EQ("0", db_stats.at("db.user_write_stall_micros"));
+  }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, Empty) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    options.allow_concurrent_memtable_write = false;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    std::string num;
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
+
+    // Block sync calls
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
+    ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("2", num);
+
+    ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger compaction
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    // Release sync calls
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
+
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(db_->EnableFileDeletions(false));
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
+
+    ASSERT_OK(db_->EnableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("1", num);
+  } while (ChangeOptions());
+}
+
+TEST_F(DBPropertiesTest, CurrentVersionNumber) {
+  uint64_t v1, v2, v3;
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1));
+  ASSERT_OK(Put("12345678", ""));
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3));
+
+  ASSERT_EQ(v1, v2);
+  ASSERT_GT(v3, v2);
+}
+
+TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) {
+  const int kKeySize = 100;
+  const int kValueSize = 500;
+  const int kKeyNum = 100;
+
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10;
+  // Make them never flush
+  options.min_write_buffer_number_to_merge = 1000;
+  options.max_write_buffer_number = 1000;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"one", "two", "three", "four"}, options);
+
+  Random rnd(301);
+  for (auto* handle : handles_) {
+    for (int i = 0; i < kKeyNum; ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), handle, rnd.RandomString(kKeySize),
+                         rnd.RandomString(kValueSize)));
+    }
+  }
+
+  uint64_t manual_sum = 0;
+  uint64_t api_sum = 0;
+  uint64_t value = 0;
+  for (auto* handle : handles_) {
+    ASSERT_TRUE(
+        db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value));
+    manual_sum += value;
+  }
+  ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
+                                            &api_sum));
+  ASSERT_GT(manual_sum, 0);
+  ASSERT_EQ(manual_sum, api_sum);
+
+  ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value));
+
+  uint64_t before_flush_trm;
+  uint64_t after_flush_trm;
+  for (auto* handle : handles_) {
+    ASSERT_TRUE(db_->GetAggregatedIntProperty(
+        DB::Properties::kEstimateTableReadersMem, &before_flush_trm));
+
+    // Issue flush and expect larger memory usage of table readers.
+    ASSERT_OK(db_->Flush(FlushOptions(), handle));
+
+    ASSERT_TRUE(db_->GetAggregatedIntProperty(
+        DB::Properties::kEstimateTableReadersMem, &after_flush_trm));
+    ASSERT_GT(after_flush_trm, before_flush_trm);
+  }
+}
+
+namespace {
+void ResetTableProperties(TableProperties* tp) {
+  tp->data_size = 0;
+  tp->index_size = 0;
+  tp->filter_size = 0;
+  tp->raw_key_size = 0;
+  tp->raw_value_size = 0;
+  tp->num_data_blocks = 0;
+  tp->num_entries = 0;
+  tp->num_deletions = 0;
+  tp->num_merge_operands = 0;
+  tp->num_range_deletions = 0;
+}
+
+void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
+  double dummy_double;
+  std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
+  std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
+  ResetTableProperties(tp);
+  sscanf(tp_string.c_str(),
+         "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64
+         " # merge operands %" SCNu64 " # range deletions %" SCNu64
+         " raw key size %" SCNu64
+         " raw average key size %lf "
+         " raw value size %" SCNu64
+         " raw average value size %lf "
+         " data block size %" SCNu64 " index block size (user-key? %" SCNu64
+         ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
+         &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
+         &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
+         &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
+         &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded,
+         &tp->index_size, &tp->filter_size);
+}
+
+void VerifySimilar(uint64_t a, uint64_t b, double bias) {
+  ASSERT_EQ(a == 0U, b == 0U);
+  if (a == 0) {
+    return;
+  }
+  double dbl_a = static_cast<double>(a);
+  double dbl_b = static_cast<double>(b);
+  if (dbl_a > dbl_b) {
+    ASSERT_LT(static_cast<double>(dbl_a - dbl_b) / (dbl_a + dbl_b), bias);
+  } else {
+    ASSERT_LT(static_cast<double>(dbl_b - dbl_a) / (dbl_a + dbl_b), bias);
+  }
+}
+
+void VerifyTableProperties(
+    const TableProperties& base_tp, const TableProperties& new_tp,
+    double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.18 : 0.1,
+    double index_size_bias = 0.1, double data_size_bias = 0.1,
+    double num_data_blocks_bias = 0.05) {
+  VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias);
+  VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias);
+  VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias);
+  VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks,
+                num_data_blocks_bias);
+
+  ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size);
+  ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size);
+  ASSERT_EQ(base_tp.num_entries, new_tp.num_entries);
+  ASSERT_EQ(base_tp.num_deletions, new_tp.num_deletions);
+  ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions);
+
+  // Merge operands may become Puts, so we only have an upper bound the exact
+  // number of merge operands.
+  ASSERT_GE(base_tp.num_merge_operands, new_tp.num_merge_operands);
+}
+
+void GetExpectedTableProperties(
+    TableProperties* expected_tp, const int kKeySize, const int kValueSize,
+    const int kPutsPerTable, const int kDeletionsPerTable,
+    const int kMergeOperandsPerTable, const int kRangeDeletionsPerTable,
+    const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize,
+    const bool index_key_is_user_key, const bool value_delta_encoding) {
+  const int kKeysPerTable =
+      kPutsPerTable + kDeletionsPerTable + kMergeOperandsPerTable;
+  const int kPutCount = kTableCount * kPutsPerTable;
+  const int kDeletionCount = kTableCount * kDeletionsPerTable;
+  const int kMergeCount = kTableCount * kMergeOperandsPerTable;
+  const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
+  const int kKeyCount =
+      kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount;
+  const int kAvgSuccessorSize = kKeySize / 5;
+  const int kEncodingSavePerKey = kKeySize / 4;
+  expected_tp->raw_key_size = kKeyCount * (kKeySize + 8);
+  expected_tp->raw_value_size =
+      (kPutCount + kMergeCount + kRangeDeletionCount) * kValueSize;
+  expected_tp->num_entries = kKeyCount;
+  expected_tp->num_deletions = kDeletionCount + kRangeDeletionCount;
+  expected_tp->num_merge_operands = kMergeCount;
+  expected_tp->num_range_deletions = kRangeDeletionCount;
+  expected_tp->num_data_blocks =
+      kTableCount *
+      (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) /
+      kBlockSize;
+  expected_tp->data_size =
+      kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
+  expected_tp->index_size =
+      expected_tp->num_data_blocks *
+      (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) -
+       // discount 1 byte as value size is not encoded in value delta encoding
+       (value_delta_encoding ? 1 : 0));
+  expected_tp->filter_size =
+      kTableCount * ((kKeysPerTable * kBloomBitsPerKey + 7) / 8 +
+                     /*average-ish overhead*/ CACHE_LINE_SIZE / 2);
+}
+}  // anonymous namespace
+
+TEST_F(DBPropertiesTest, ValidatePropertyInfo) {
+  for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) {
+    // If C++ gets a std::string_literal, this would be better to check at
+    // compile-time using static_assert.
+    ASSERT_TRUE(ppt_name_and_info.first.empty() ||
+                !isdigit(ppt_name_and_info.first.back()));
+
+    int count = 0;
+    count += (ppt_name_and_info.second.handle_string == nullptr) ? 0 : 1;
+    count += (ppt_name_and_info.second.handle_int == nullptr) ? 0 : 1;
+    count += (ppt_name_and_info.second.handle_string_dbimpl == nullptr) ? 0 : 1;
+    ASSERT_TRUE(count == 1);
+  }
+}
+
+TEST_F(DBPropertiesTest, ValidateSampleNumber) {
+  // When "max_open_files" is -1, we read all the files for
+  // "rocksdb.estimate-num-keys" computation, which is the ground truth.
+  // Otherwise, we sample 20 newest files to make an estimation.
+  // Formula: lastest_20_files_active_key_ratio * total_files
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_stop_writes_trigger = 1000;
+  DestroyAndReopen(options);
+  int key = 0;
+  for (int files = 20; files >= 10; files -= 10) {
+    for (int i = 0; i < files; i++) {
+      int rows = files / 10;
+      for (int j = 0; j < rows; j++) {
+        ASSERT_OK(db_->Put(WriteOptions(), std::to_string(++key), "foo"));
+      }
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+  }
+  std::string num;
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ("45", num);
+  options.max_open_files = -1;
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ("50", num);
+}
+
+TEST_F(DBPropertiesTest, AggregatedTableProperties) {
+  for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
+    const int kDeletionsPerTable = 0;
+    const int kMergeOperandsPerTable = 15;
+    const int kRangeDeletionsPerTable = 5;
+    const int kPutsPerTable = 100;
+    const int kKeySize = 80;
+    const int kValueSize = 200;
+    const int kBloomBitsPerKey = 20;
+
+    Options options = CurrentOptions();
+    options.level0_file_num_compaction_trigger = 8;
+    options.compression = kNoCompression;
+    options.create_if_missing = true;
+    options.merge_operator.reset(new TestPutOperator());
+
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy.reset(
+        NewBloomFilterPolicy(kBloomBitsPerKey, false));
+    table_options.block_size = 1024;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    DestroyAndReopen(options);
+
+    // Hold open a snapshot to prevent range tombstones from being compacted
+    // away.
+    ManagedSnapshot snapshot(db_);
+
+    Random rnd(5632);
+    for (int table = 1; table <= kTableCount; ++table) {
+      for (int i = 0; i < kPutsPerTable; ++i) {
+        ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize),
+                           rnd.RandomString(kValueSize)));
+      }
+      for (int i = 0; i < kDeletionsPerTable; i++) {
+        ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize)));
+      }
+      for (int i = 0; i < kMergeOperandsPerTable; i++) {
+        ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize),
+                             rnd.RandomString(kValueSize)));
+      }
+      for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+        std::string start = rnd.RandomString(kKeySize);
+        std::string end = start;
+        end.resize(kValueSize);
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   start, end));
+      }
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+    std::string property;
+    db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+    TableProperties output_tp;
+    ParseTablePropertiesString(property, &output_tp);
+    bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
+    bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0;
+
+    TableProperties expected_tp;
+    GetExpectedTableProperties(
+        &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+        kMergeOperandsPerTable, kRangeDeletionsPerTable, kTableCount,
+        kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+        value_is_delta_encoded);
+
+    VerifyTableProperties(expected_tp, output_tp);
+  }
+}
+
+TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;
+  options.level0_file_num_compaction_trigger = 6;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 4500 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_write_buffer_number = 2;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.max_open_files = 11;  // Make sure no proloading of table readers
+
+  // RocksDB sanitize max open files to at least 20. Modify it back.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = static_cast<int*>(arg);
+        *max_open_files = 11;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  int key_index = 0;
+  Random rnd(301);
+  for (int num = 0; num < 8; num++) {
+    ASSERT_OK(Put("foo", "bar"));
+    GenerateNewFile(&rnd, &key_index);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  std::string prop;
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+
+  // Get() after flushes, See latency histogram tracked.
+  for (int key = 0; key < key_index; key++) {
+    Get(Key(key));
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and issue Get(). See thee latency tracked
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  for (int key = 0; key < key_index; key++) {
+    Get(Key(key));
+  }
+
+  // Test for getting immutable_db_options_.statistics
+  ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+                                    "rocksdb.options-statistics", &prop));
+  ASSERT_NE(std::string::npos, prop.find("rocksdb.block.cache.miss"));
+  ASSERT_EQ(std::string::npos, prop.find("rocksdb.db.f.micros"));
+
+  ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+                                    "rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and issue iterating. See thee latency tracked
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+    }
+    ASSERT_OK(iter->status());
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // CF 1 should show no histogram.
+  ASSERT_TRUE(
+      dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  // put something and read it back , CF 1 should show histogram.
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("bar", Get(1, "foo"));
+
+  ASSERT_TRUE(
+      dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // options.max_open_files preloads table readers.
+  options.max_open_files = -1;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+                                    "rocksdb.cf-file-histogram", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  for (int key = 0; key < key_index; key++) {
+    Get(Key(key));
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Clear internal stats
+  ASSERT_OK(dbfull()->ResetStats());
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+}
+
+TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
+  const int kTableCount = 100;
+  const int kDeletionsPerTable = 0;
+  const int kMergeOperandsPerTable = 2;
+  const int kRangeDeletionsPerTable = 2;
+  const int kPutsPerTable = 10;
+  const int kKeySize = 50;
+  const int kValueSize = 400;
+  const int kMaxLevel = 7;
+  const int kBloomBitsPerKey = 20;
+  Random rnd(301);
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 8;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 8192;
+  options.max_bytes_for_level_base = 10000;
+  options.max_bytes_for_level_multiplier = 2;
+  // This ensures there no compaction happening when we call GetProperty().
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new TestPutOperator());
+
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(
+      NewBloomFilterPolicy(kBloomBitsPerKey, false));
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  // Hold open a snapshot to prevent range tombstones from being compacted away.
+  ManagedSnapshot snapshot(db_);
+
+  std::string level_tp_strings[kMaxLevel];
+  std::string tp_string;
+  TableProperties level_tps[kMaxLevel];
+  TableProperties tp, sum_tp, expected_tp;
+  for (int table = 1; table <= kTableCount; ++table) {
+    for (int i = 0; i < kPutsPerTable; ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize),
+                         rnd.RandomString(kValueSize)));
+    }
+    for (int i = 0; i < kDeletionsPerTable; i++) {
+      ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize)));
+    }
+    for (int i = 0; i < kMergeOperandsPerTable; i++) {
+      ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize),
+                           rnd.RandomString(kValueSize)));
+    }
+    for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+      std::string start = rnd.RandomString(kKeySize);
+      std::string end = start;
+      end.resize(kValueSize);
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 start, end));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    ResetTableProperties(&sum_tp);
+    for (int level = 0; level < kMaxLevel; ++level) {
+      db_->GetProperty(DB::Properties::kAggregatedTablePropertiesAtLevel +
+                           std::to_string(level),
+                       &level_tp_strings[level]);
+      ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]);
+      sum_tp.data_size += level_tps[level].data_size;
+      sum_tp.index_size += level_tps[level].index_size;
+      sum_tp.filter_size += level_tps[level].filter_size;
+      sum_tp.raw_key_size += level_tps[level].raw_key_size;
+      sum_tp.raw_value_size += level_tps[level].raw_value_size;
+      sum_tp.num_data_blocks += level_tps[level].num_data_blocks;
+      sum_tp.num_entries += level_tps[level].num_entries;
+      sum_tp.num_deletions += level_tps[level].num_deletions;
+      sum_tp.num_merge_operands += level_tps[level].num_merge_operands;
+      sum_tp.num_range_deletions += level_tps[level].num_range_deletions;
+    }
+    db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
+    ParseTablePropertiesString(tp_string, &tp);
+    bool index_key_is_user_key = tp.index_key_is_user_key > 0;
+    bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0;
+    ASSERT_EQ(sum_tp.data_size, tp.data_size);
+    ASSERT_EQ(sum_tp.index_size, tp.index_size);
+    ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
+    ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size);
+    ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size);
+    ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
+    ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
+    ASSERT_EQ(sum_tp.num_deletions, tp.num_deletions);
+    ASSERT_EQ(sum_tp.num_merge_operands, tp.num_merge_operands);
+    ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
+    if (table > 3) {
+      GetExpectedTableProperties(
+          &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+          kMergeOperandsPerTable, kRangeDeletionsPerTable, table,
+          kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+          value_is_delta_encoded);
+      // Gives larger bias here as index block size, filter block size,
+      // and data block size become much harder to estimate in this test.
+      VerifyTableProperties(expected_tp, tp, CACHE_LINE_SIZE >= 256 ? 0.6 : 0.5,
+                            0.5, 0.5, 0.25);
+    }
+  }
+}
+
+TEST_F(DBPropertiesTest, NumImmutableMemTable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.write_buffer_size = 1000000;
+    options.max_write_buffer_size_to_maintain =
+        5 * static_cast<int64_t>(options.write_buffer_size);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    std::string big_value(1000000 * 2, 'x');
+    std::string num;
+    uint64_t value;
+    SetPerfLevel(kEnableTime);
+    ASSERT_TRUE(GetPerfLevel() == kEnableTime);
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    get_perf_context()->Reset();
+    Get(1, "k1");
+    ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "1");
+
+    get_perf_context()->Reset();
+    Get(1, "k1");
+    ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+    get_perf_context()->Reset();
+    Get(1, "k2");
+    ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "2");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "2");
+    get_perf_context()->Reset();
+    Get(1, "k2");
+    ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+    get_perf_context()->Reset();
+    Get(1, "k3");
+    ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+    get_perf_context()->Reset();
+    Get(1, "k1");
+    ASSERT_EQ(3, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+    ASSERT_OK(Flush(1));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+    ASSERT_EQ(num, "3");
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &value));
+    // "192" is the size of the metadata of two empty skiplists, this would
+    // break if we change the default skiplist implementation
+    ASSERT_GE(value, 192);
+
+    uint64_t int_num;
+    uint64_t base_total_size;
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
+
+    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
+    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3"));
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num));
+    ASSERT_EQ(int_num, 2U);
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &int_num));
+    ASSERT_EQ(int_num, 3U);
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num));
+    ASSERT_EQ(int_num, 4U);
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num));
+    ASSERT_EQ(int_num, 2U);
+
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.estimate-num-keys", &int_num));
+    ASSERT_EQ(int_num, base_total_size + 1);
+
+    SetPerfLevel(kDisable);
+    ASSERT_TRUE(GetPerfLevel() == kDisable);
+  } while (ChangeCompactOptions());
+}
+
+// TODO(techdept) : Disabled flaky test #12863555
+TEST_F(DBPropertiesTest, DISABLED_GetProperty) {
+  // Set sizes to both background thread pool to be 1 and block them.
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
+
+  Options options = CurrentOptions();
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = 1;
+  options.compaction_options_universal.size_ratio = 50;
+  options.max_background_compactions = 1;
+  options.max_background_flushes = 1;
+  options.max_write_buffer_number = 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain = 0;
+  options.write_buffer_size = 1000000;
+  Reopen(options);
+
+  std::string big_value(1000000 * 2, 'x');
+  std::string num;
+  uint64_t int_num;
+  SetPerfLevel(kEnableTime);
+
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "1");
+  get_perf_context()->Reset();
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing"));
+  ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "2");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "2");
+  // Verify the same set of properties through GetIntProperty
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num));
+  ASSERT_EQ(int_num, 2U);
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num));
+  ASSERT_EQ(int_num, 1U);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num));
+  ASSERT_EQ(int_num, 0U);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+  ASSERT_EQ(int_num, 2U);
+
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  dbfull()->TEST_WaitForFlushMemTable();
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value));
+  ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value));
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "4");
+
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  // Wait for compaction to be done. This is important because otherwise RocksDB
+  // might schedule a compaction when reopening the database, failing assertion
+  // (A) as a result.
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  options.max_open_files = 10;
+  Reopen(options);
+  // After reopening, no table reader is loaded, so no memory for table readers
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);  // (A)
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  // After reading a key, at least one table reader is loaded.
+  Get("k5");
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  // Test rocksdb.num-live-versions
+  {
+    options.level0_file_num_compaction_trigger = 20;
+    Reopen(options);
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 1U);
+
+    // Use an iterator to hold current version
+    std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value));
+    ASSERT_OK(Flush());
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 2U);
+
+    // Use an iterator to hold current version
+    std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+    ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value));
+    ASSERT_OK(Flush());
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 3U);
+
+    iter2.reset();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 2U);
+
+    iter1.reset();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 1U);
+  }
+}
+
+TEST_F(DBPropertiesTest, ApproximateMemoryUsage) {
+  const int kNumRounds = 10;
+  // TODO(noetzli) kFlushesPerRound does not really correlate with how many
+  // flushes happen.
+  const int kFlushesPerRound = 10;
+  const int kWritesPerFlush = 10;
+  const int kKeySize = 100;
+  const int kValueSize = 1000;
+  Options options;
+  options.write_buffer_size = 1000;  // small write buffer
+  options.min_write_buffer_number_to_merge = 4;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  std::vector<Iterator*> iters;
+
+  uint64_t active_mem;
+  uint64_t unflushed_mem;
+  uint64_t all_mem;
+  uint64_t prev_all_mem;
+
+  // Phase 0. The verify the initial value of all these properties are the same
+  // as we have no mem-tables.
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(all_mem, active_mem);
+  ASSERT_EQ(all_mem, unflushed_mem);
+
+  // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" equals to
+  // "size-all-mem-tables"
+  for (int r = 0; r < kNumRounds; ++r) {
+    for (int f = 0; f < kFlushesPerRound; ++f) {
+      for (int w = 0; w < kWritesPerFlush; ++w) {
+        ASSERT_OK(
+            Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize)));
+      }
+    }
+    // Make sure that there is no flush between getting the two properties.
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // in no iterator case, these two number should be the same.
+    ASSERT_EQ(unflushed_mem, all_mem);
+  }
+  prev_all_mem = all_mem;
+
+  // Phase 2. Keep issuing Put() but also create new iterators. This time we
+  // expect "size-all-mem-tables" > "cur-size-all-mem-tables".
+  for (int r = 0; r < kNumRounds; ++r) {
+    iters.push_back(db_->NewIterator(ReadOptions()));
+    for (int f = 0; f < kFlushesPerRound; ++f) {
+      for (int w = 0; w < kWritesPerFlush; ++w) {
+        ASSERT_OK(
+            Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize)));
+      }
+    }
+    // Force flush to prevent flush from happening between getting the
+    // properties or after getting the properties and before the new round.
+    ASSERT_OK(Flush());
+
+    // In the second round, add iterators.
+    dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+    dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    ASSERT_GT(all_mem, active_mem);
+    ASSERT_GT(all_mem, unflushed_mem);
+    ASSERT_GT(all_mem, prev_all_mem);
+    prev_all_mem = all_mem;
+  }
+
+  // Phase 3. Delete iterators and expect "size-all-mem-tables" shrinks
+  // whenever we release an iterator.
+  for (auto* iter : iters) {
+    ASSERT_OK(iter->status());
+    delete iter;
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // Expect the size shrinking
+    ASSERT_LT(all_mem, prev_all_mem);
+    prev_all_mem = all_mem;
+  }
+
+  // Expect all these three counters to be the same.
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(active_mem, unflushed_mem);
+  ASSERT_EQ(unflushed_mem, all_mem);
+
+  // Phase 5. Reopen, and expect all these three counters to be the same again.
+  Reopen(options);
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(active_mem, unflushed_mem);
+  ASSERT_EQ(unflushed_mem, all_mem);
+}
+
+TEST_F(DBPropertiesTest, EstimatePendingCompBytes) {
+  // Set sizes to both background thread pool to be 1 and block them.
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  Options options = CurrentOptions();
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_background_compactions = 1;
+  options.max_background_flushes = 1;
+  options.max_write_buffer_number = 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain = 0;
+  options.write_buffer_size = 1000000;
+  Reopen(options);
+
+  std::string big_value(1000000 * 2, 'x');
+  std::string num;
+  uint64_t int_num;
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_EQ(int_num, 0U);
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_GT(int_num, 0U);
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_EQ(int_num, 0U);
+}
+
+TEST_F(DBPropertiesTest, EstimateCompressionRatio) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  const int kNumL0Files = 3;
+  const int kNumEntriesPerFile = 1000;
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 3;
+  Reopen(options);
+
+  ASSERT_OK(db_->SetOptions(
+      {{"compression_per_level", "kNoCompression:kSnappyCompression"}}));
+  auto opts = db_->GetOptions();
+  ASSERT_EQ(opts.compression_per_level.size(), 2);
+  ASSERT_EQ(opts.compression_per_level[0], kNoCompression);
+  ASSERT_EQ(opts.compression_per_level[1], kSnappyCompression);
+
+  // compression ratio is -1.0 when no open files at level
+  ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+
+  const std::string kVal(100, 'a');
+  for (int i = 0; i < kNumL0Files; ++i) {
+    for (int j = 0; j < kNumEntriesPerFile; ++j) {
+      // Put common data ("key") at end to prevent delta encoding from
+      // compressing the key effectively
+      std::string key = std::to_string(i) + std::to_string(j) + "key";
+      ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // no compression at L0, so ratio is less than one
+  ASSERT_LT(CompressionRatioAtLevel(0), 1.0);
+  ASSERT_GT(CompressionRatioAtLevel(0), 0.0);
+  ASSERT_EQ(CompressionRatioAtLevel(1), -1.0);
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+
+  ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+  // Data at L1 should be highly compressed thanks to Snappy and redundant data
+  // in values (ratio is 12.846 as of 4/19/2016).
+  ASSERT_GT(CompressionRatioAtLevel(1), 10.0);
+}
+
+#endif  // ROCKSDB_LITE
+
+class CountingUserTblPropCollector : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "CountingUserTblPropCollector"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{
+        {"CountingUserTblPropCollector", message_},
+        {"Count", encoded},
+    };
+    return Status::OK();
+  }
+
+  Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+                    EntryType /*type*/, SequenceNumber /*seq*/,
+                    uint64_t /*file_size*/) override {
+    ++count_;
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  std::string message_ = "Rocksdb";
+  uint32_t count_ = 0;
+};
+
+class CountingUserTblPropCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  explicit CountingUserTblPropCollectorFactory(
+      uint32_t expected_column_family_id)
+      : expected_column_family_id_(expected_column_family_id),
+        num_created_(0) {}
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) override {
+    EXPECT_EQ(expected_column_family_id_, context.column_family_id);
+    num_created_++;
+    return new CountingUserTblPropCollector();
+  }
+  const char* Name() const override {
+    return "CountingUserTblPropCollectorFactory";
+  }
+  void set_expected_column_family_id(uint32_t v) {
+    expected_column_family_id_ = v;
+  }
+  uint32_t expected_column_family_id_;
+  uint32_t num_created_;
+};
+
+class CountingDeleteTabPropCollector : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "CountingDeleteTabPropCollector"; }
+
+  Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+                    EntryType type, SequenceNumber /*seq*/,
+                    uint64_t /*file_size*/) override {
+    if (type == kEntryDelete) {
+      num_deletes_++;
+    }
+    return Status::OK();
+  }
+
+  bool NeedCompact() const override { return num_deletes_ > 10; }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    *properties =
+        UserCollectedProperties{{"num_delete", std::to_string(num_deletes_)}};
+    return Status::OK();
+  }
+
+ private:
+  uint32_t num_deletes_ = 0;
+};
+
+class CountingDeleteTabPropCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    return new CountingDeleteTabPropCollector();
+  }
+  const char* Name() const override {
+    return "CountingDeleteTabPropCollectorFactory";
+  }
+};
+
+class BlockCountingTablePropertiesCollector : public TablePropertiesCollector {
+ public:
+  static const std::string kNumSampledBlocksPropertyName;
+
+  const char* Name() const override {
+    return "BlockCountingTablePropertiesCollector";
+  }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    (*properties)[kNumSampledBlocksPropertyName] =
+        std::to_string(num_sampled_blocks_);
+    return Status::OK();
+  }
+
+  Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+                    EntryType /*type*/, SequenceNumber /*seq*/,
+                    uint64_t /*file_size*/) override {
+    return Status::OK();
+  }
+
+  void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                uint64_t block_compressed_bytes_fast,
+                uint64_t block_compressed_bytes_slow) override {
+    if (block_compressed_bytes_fast > 0 || block_compressed_bytes_slow > 0) {
+      num_sampled_blocks_++;
+    }
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{
+        {kNumSampledBlocksPropertyName, std::to_string(num_sampled_blocks_)},
+    };
+  }
+
+ private:
+  uint32_t num_sampled_blocks_ = 0;
+};
+
+const std::string
+    BlockCountingTablePropertiesCollector::kNumSampledBlocksPropertyName =
+        "NumSampledBlocks";
+
+class BlockCountingTablePropertiesCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  const char* Name() const override {
+    return "BlockCountingTablePropertiesCollectorFactory";
+  }
+
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /* context */) override {
+    return new BlockCountingTablePropertiesCollector();
+  }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = (1 << 30);
+  options.table_properties_collector_factories.resize(1);
+  std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+      std::make_shared<CountingUserTblPropCollectorFactory>(0);
+  options.table_properties_collector_factories[0] = collector_factory;
+  Reopen(options);
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(
+          db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+
+  TablePropertiesCollection props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+  ASSERT_EQ(4U, props.size());
+  uint32_t sum = 0;
+  for (const auto& item : props) {
+    auto& user_collected = item.second->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") !=
+                user_collected.end());
+    ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb");
+    ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+    Slice key(user_collected.at("Count"));
+    uint32_t count;
+    ASSERT_TRUE(GetVarint32(&key, &count));
+    sum += count;
+  }
+  ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+
+  ASSERT_GT(collector_factory->num_created_, 0U);
+  collector_factory->num_created_ = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_GT(collector_factory->num_created_, 0U);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 3;
+  options.table_properties_collector_factories.resize(1);
+  std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+      std::make_shared<CountingUserTblPropCollectorFactory>(1);
+  options.table_properties_collector_factories[0] = collector_factory,
+  CreateAndReopenWithCF({"pikachu"}, options);
+  // Create 2 files
+  for (int table = 0; table < 2; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(1, std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush(1));
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  // Trigger automatic compactions.
+  for (int table = 0; table < 3; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(1, std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  // Come back to write to default column family
+  collector_factory->num_created_ = 0;
+  collector_factory->set_expected_column_family_id(0);  // default CF
+  // Create 4 tables in default column family
+  for (int table = 0; table < 2; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  // Trigger automatic compactions.
+  for (int table = 0; table < 3; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_GT(collector_factory->num_created_, 0U);
+
+  collector_factory->num_created_ = 0;
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_GT(collector_factory->num_created_, 0U);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) {
+  Random rnd(301);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 4;
+  options.target_file_size_base = 2048;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+  options.num_levels = 8;
+  options.env = env_;
+
+  std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+      std::make_shared<CountingDeleteTabPropCollectorFactory>();
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+
+  DestroyAndReopen(options);
+
+  const int kMaxKey = 1000;
+  for (int i = 0; i < kMaxKey; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  if (NumTableFilesAtLevel(0) == 1) {
+    // Clear Level 0 so that when later flush a file with deletions,
+    // we don't trigger an organic compaction.
+    ASSERT_OK(Put(Key(0), ""));
+    ASSERT_OK(Put(Key(kMaxKey * 2), ""));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  {
+    int c = 0;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(kMaxKey - 100));
+    while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+      iter->Next();
+      ++c;
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(c, 200);
+  }
+
+  ASSERT_OK(Delete(Key(0)));
+  for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Delete(Key(kMaxKey * 2)));
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  {
+    SetPerfLevel(kEnableCount);
+    get_perf_context()->Reset();
+    int c = 0;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(kMaxKey - 100));
+    while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(c, 0);
+    ASSERT_LT(get_perf_context()->internal_delete_skipped_count, 30u);
+    ASSERT_LT(get_perf_context()->internal_key_skipped_count, 30u);
+    SetPerfLevel(kDisable);
+  }
+}
+
+TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) {
+  Random rnd(301);
+
+  Options options;
+  options.create_if_missing = true;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 10;
+  options.level0_slowdown_writes_trigger = 10;
+  options.level0_stop_writes_trigger = 10;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+      std::make_shared<CountingDeleteTabPropCollectorFactory>();
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+
+  DestroyAndReopen(options);
+
+  const int kMaxKey = 100;
+  for (int i = 0; i < kMaxKey; i++) {
+    ASSERT_OK(Put(Key(i), ""));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  for (int i = 1; i < kMaxKey - 1; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+
+  // Restart the DB. Although number of files didn't reach
+  // options.level0_file_num_compaction_trigger, compaction should
+  // still be triggered because of the need-compaction hint.
+  options.disable_auto_compactions = false;
+  Reopen(options);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  {
+    SetPerfLevel(kEnableCount);
+    get_perf_context()->Reset();
+    int c = 0;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+      c++;
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(c, 2);
+    ASSERT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+    // We iterate every key twice. Is it a bug?
+    ASSERT_LE(get_perf_context()->internal_key_skipped_count, 2);
+    SetPerfLevel(kDisable);
+  }
+}
+
+// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
+TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
+  // Sampled compression requires at least one of the following four types.
+  if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() &&
+      !ZSTD_Supported()) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.table_properties_collector_factories.emplace_back(
+      std::make_shared<BlockCountingTablePropertiesCollectorFactory>());
+
+  for (bool sample_for_compression : {false, true}) {
+    // For simplicity/determinism, sample 100% when enabled, or 0% when disabled
+    options.sample_for_compression = sample_for_compression ? 1 : 0;
+
+    DestroyAndReopen(options);
+
+    // Setup the following LSM:
+    //
+    // L0_0 ["a", "b"]
+    // L1_0 ["a", "b"]
+    //
+    // L0_0 was created by flush. L1_0 was created by compaction. Each file
+    // contains one data block.
+    for (int i = 0; i < 3; ++i) {
+      ASSERT_OK(Put("a", "val"));
+      ASSERT_OK(Put("b", "val"));
+      ASSERT_OK(Flush());
+      if (i == 1) {
+        ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      }
+    }
+
+    // A `BlockAdd()` should have been seen for files generated by flush or
+    // compaction when `sample_for_compression` is enabled.
+    TablePropertiesCollection file_to_props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
+    ASSERT_EQ(2, file_to_props.size());
+    for (const auto& file_and_props : file_to_props) {
+      auto& user_props = file_and_props.second->user_collected_properties;
+      ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector::
+                                      kNumSampledBlocksPropertyName) !=
+                  user_props.end());
+      ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector::
+                                  kNumSampledBlocksPropertyName),
+                std::to_string(sample_for_compression ? 1 : 0));
+    }
+  }
+}
+
+class CompressionSamplingDBPropertiesTest
+    : public DBPropertiesTest,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  CompressionSamplingDBPropertiesTest() : fast_(GetParam()) {}
+
+ protected:
+  const bool fast_;
+};
+
+INSTANTIATE_TEST_CASE_P(CompressionSamplingDBPropertiesTest,
+                        CompressionSamplingDBPropertiesTest, ::testing::Bool());
+
+// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
+TEST_P(CompressionSamplingDBPropertiesTest,
+       EstimateDataSizeWithCompressionSampling) {
+  Options options = CurrentOptions();
+  if (fast_) {
+    // One of the following light compression libraries must be present.
+    if (LZ4_Supported()) {
+      options.compression = kLZ4Compression;
+    } else if (Snappy_Supported()) {
+      options.compression = kSnappyCompression;
+    } else {
+      return;
+    }
+  } else {
+    // One of the following heavy compression libraries must be present.
+    if (ZSTD_Supported()) {
+      options.compression = kZSTD;
+    } else if (Zlib_Supported()) {
+      options.compression = kZlibCompression;
+    } else {
+      return;
+    }
+  }
+  options.disable_auto_compactions = true;
+  // For simplicity/determinism, sample 100%.
+  options.sample_for_compression = 1;
+  Reopen(options);
+
+  // Setup the following LSM:
+  //
+  // L0_0 ["a", "b"]
+  // L1_0 ["a", "b"]
+  //
+  // L0_0 was created by flush. L1_0 was created by compaction. Each file
+  // contains one data block. The value consists of compressible data so the
+  // data block should be stored compressed.
+  std::string val(1024, 'a');
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("a", val));
+    ASSERT_OK(Put("b", val));
+    ASSERT_OK(Flush());
+    if (i == 1) {
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+  }
+
+  TablePropertiesCollection file_to_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
+  ASSERT_EQ(2, file_to_props.size());
+  for (const auto& file_and_props : file_to_props) {
+    ASSERT_GT(file_and_props.second->data_size, 0);
+    if (fast_) {
+      ASSERT_EQ(file_and_props.second->data_size,
+                file_and_props.second->fast_compression_estimated_data_size);
+    } else {
+      ASSERT_EQ(file_and_props.second->data_size,
+                file_and_props.second->slow_compression_estimated_data_size);
+    }
+  }
+}
+
+TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) {
+  Options options = CurrentOptions();
+  Reopen(options);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Delete("foo"));
+  ASSERT_OK(Delete("foo"));
+  uint64_t num_keys = 0;
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &num_keys));
+  ASSERT_EQ(0, num_keys);
+}
+
+TEST_F(DBPropertiesTest, EstimateOldestKeyTime) {
+  uint64_t oldest_key_time = 0;
+  Options options = CurrentOptions();
+  SetTimeElapseOnlySleepOnReopen(&options);
+
+  // "rocksdb.estimate-oldest-key-time" only available to fifo compaction.
+  for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                          kCompactionStyleNone}) {
+    options.compaction_style = compaction;
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_FALSE(dbfull()->GetIntProperty(
+        DB::Properties::kEstimateOldestKeyTime, &oldest_key_time));
+  }
+
+  int64_t mock_start_time;
+  ASSERT_OK(env_->GetCurrentTime(&mock_start_time));
+
+  options.compaction_style = kCompactionStyleFIFO;
+  options.ttl = 300;
+  options.max_open_files = -1;
+  options.compaction_options_fifo.allow_compaction = false;
+  DestroyAndReopen(options);
+
+  env_->MockSleepForSeconds(100);
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time - mock_start_time);
+  ASSERT_OK(Flush());
+  ASSERT_EQ("1", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(100);  // -> 200
+  ASSERT_OK(Put("k2", "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("2", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(100);  // -> 300
+  ASSERT_OK(Put("k3", "v3"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("3", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(150);  // -> 450
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("2", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(200, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(100);  // -> 550
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("1", FilesPerLevel());
+  ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                       &oldest_key_time));
+  ASSERT_EQ(300, oldest_key_time - mock_start_time);
+
+  env_->MockSleepForSeconds(100);  // -> 650
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel());
+  ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+                                        &oldest_key_time));
+}
+
+TEST_F(DBPropertiesTest, SstFilesSize) {
+  struct TestListener : public EventListener {
+    void OnCompactionCompleted(DB* db,
+                               const CompactionJobInfo& /*info*/) override {
+      assert(callback_triggered == false);
+      assert(size_before_compaction > 0);
+      callback_triggered = true;
+      uint64_t total_sst_size = 0;
+      uint64_t live_sst_size = 0;
+      bool ok = db->GetIntProperty(DB::Properties::kTotalSstFilesSize,
+                                   &total_sst_size);
+      ASSERT_TRUE(ok);
+      // total_sst_size include files before and after compaction.
+      ASSERT_GT(total_sst_size, size_before_compaction);
+      ok =
+          db->GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
+      ASSERT_TRUE(ok);
+      // live_sst_size only include files after compaction.
+      ASSERT_GT(live_sst_size, 0);
+      ASSERT_LT(live_sst_size, size_before_compaction);
+    }
+
+    uint64_t size_before_compaction = 0;
+    bool callback_triggered = false;
+  };
+  std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+  Options options;
+  options.env = CurrentOptions().env;
+  options.disable_auto_compactions = true;
+  options.listeners.push_back(listener);
+  Reopen(options);
+
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), std::string(1000, 'v')));
+  }
+  ASSERT_OK(Flush());
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Delete("key" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t sst_size;
+  bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size);
+  ASSERT_TRUE(ok);
+  ASSERT_GT(sst_size, 0);
+  listener->size_before_compaction = sst_size;
+  // Compact to clean all keys and trigger listener.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_TRUE(listener->callback_triggered);
+}
+
+TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) {
+  class TestListener : public EventListener {
+   public:
+    void OnTableFileCreated(const TableFileCreationInfo& info) override {
+      if (info.reason == TableFileCreationReason::kCompaction) {
+        // Verify the property indicates that SSTs created by a running
+        // compaction cannot be deleted.
+        uint64_t created_file_num;
+        FileType created_file_type;
+        std::string filename =
+            info.file_path.substr(info.file_path.rfind('/') + 1);
+        ASSERT_TRUE(
+            ParseFileName(filename, &created_file_num, &created_file_type));
+        ASSERT_EQ(kTableFile, created_file_type);
+
+        uint64_t keep_sst_lower_bound;
+        ASSERT_TRUE(
+            db_->GetIntProperty(DB::Properties::kMinObsoleteSstNumberToKeep,
+                                &keep_sst_lower_bound));
+
+        ASSERT_LE(keep_sst_lower_bound, created_file_num);
+        validated_ = true;
+      }
+    }
+
+    void SetDB(DB* db) { db_ = db; }
+
+    int GetNumCompactions() { return num_compactions_; }
+
+    // True if we've verified the property for at least one output file
+    bool Validated() { return validated_; }
+
+   private:
+    int num_compactions_ = 0;
+    bool validated_ = false;
+    DB* db_ = nullptr;
+  };
+
+  const int kNumL0Files = 4;
+
+  std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+  Options options = CurrentOptions();
+  options.listeners.push_back(listener);
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  DestroyAndReopen(options);
+  listener->SetDB(db_);
+
+  for (int i = 0; i < kNumL0Files; ++i) {
+    // Make sure they overlap in keyspace to prevent trivial move
+    ASSERT_OK(Put("key1", "val"));
+    ASSERT_OK(Put("key2", "val"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(listener->Validated());
+}
+
+TEST_F(DBPropertiesTest, BlobCacheProperties) {
+  Options options;
+  uint64_t value;
+
+  options.env = CurrentOptions().env;
+
+  // Test with empty blob cache.
+  constexpr size_t kCapacity = 100;
+  LRUCacheOptions co;
+  co.capacity = kCapacity;
+  co.num_shard_bits = 0;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto blob_cache = NewLRUCache(co);
+  options.blob_cache = blob_cache;
+
+  Reopen(options);
+
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  ASSERT_EQ(0, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert unpinned blob to the cache and check size.
+  constexpr size_t kSize1 = 70;
+  ASSERT_OK(blob_cache->Insert("blob1", nullptr /*value*/, kSize1,
+                               nullptr /*deleter*/));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  ASSERT_EQ(kSize1, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert pinned blob to the cache and check size.
+  constexpr size_t kSize2 = 60;
+  Cache::Handle* blob2 = nullptr;
+  ASSERT_OK(blob_cache->Insert("blob2", nullptr /*value*/, kSize2,
+                               nullptr /*deleter*/, &blob2));
+  ASSERT_NE(nullptr, blob2);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  // blob1 is evicted.
+  ASSERT_EQ(kSize2, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2, value);
+
+  // Insert another pinned blob to make the cache over-sized.
+  constexpr size_t kSize3 = 80;
+  Cache::Handle* blob3 = nullptr;
+  ASSERT_OK(blob_cache->Insert("blob3", nullptr /*value*/, kSize3,
+                               nullptr /*deleter*/, &blob3));
+  ASSERT_NE(nullptr, blob3);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  ASSERT_EQ(kSize2 + kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2 + kSize3, value);
+
+  // Check size after release.
+  blob_cache->Release(blob2);
+  blob_cache->Release(blob3);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+  // blob2 will be evicted, while blob3 remain in cache after release.
+  ASSERT_EQ(kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+}
+
+TEST_F(DBPropertiesTest, BlockCacheProperties) {
+  Options options;
+  uint64_t value;
+
+  options.env = CurrentOptions().env;
+
+  // Block cache properties are not available for tables other than
+  // block-based table.
+  options.table_factory.reset(NewPlainTableFactory());
+  Reopen(options);
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+  options.table_factory.reset(NewCuckooTableFactory());
+  Reopen(options);
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+  // Block cache properties are not available if block cache is not used.
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_FALSE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+  // Test with empty block cache.
+  constexpr size_t kCapacity = 100;
+  LRUCacheOptions co;
+  co.capacity = kCapacity;
+  co.num_shard_bits = 0;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  auto block_cache = NewLRUCache(co);
+  table_options.block_cache = block_cache;
+  table_options.no_block_cache = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_EQ(0, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert unpinned item to the cache and check size.
+  constexpr size_t kSize1 = 50;
+  ASSERT_OK(block_cache->Insert("item1", nullptr /*value*/, kSize1,
+                                nullptr /*deleter*/));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_EQ(kSize1, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+
+  // Insert pinned item to the cache and check size.
+  constexpr size_t kSize2 = 30;
+  Cache::Handle* item2 = nullptr;
+  ASSERT_OK(block_cache->Insert("item2", nullptr /*value*/, kSize2,
+                                nullptr /*deleter*/, &item2));
+  ASSERT_NE(nullptr, item2);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  ASSERT_EQ(kSize1 + kSize2, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2, value);
+
+  // Insert another pinned item to make the cache over-sized.
+  constexpr size_t kSize3 = 80;
+  Cache::Handle* item3 = nullptr;
+  ASSERT_OK(block_cache->Insert("item3", nullptr /*value*/, kSize3,
+                                nullptr /*deleter*/, &item3));
+  ASSERT_NE(nullptr, item2);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  // Item 1 is evicted.
+  ASSERT_EQ(kSize2 + kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(kSize2 + kSize3, value);
+
+  // Check size after release.
+  block_cache->Release(item2);
+  block_cache->Release(item3);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+  ASSERT_EQ(kCapacity, value);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+  // item2 will be evicted, while item3 remain in cache after release.
+  ASSERT_EQ(kSize3, value);
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+  ASSERT_EQ(0, value);
+}
+
+TEST_F(DBPropertiesTest, GetMapPropertyDbStats) {
+  auto mock_clock = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+  CompositeEnvWrapper env(env_, mock_clock);
+
+  Options opts = CurrentOptions();
+  opts.env = &env;
+  Reopen(opts);
+
+  {
+    std::map<std::string, std::string> db_stats;
+    ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+    AssertDbStats(db_stats, 0.0 /* expected_uptime */,
+                  0 /* expected_user_bytes_written */,
+                  0 /* expected_wal_bytes_written */,
+                  0 /* expected_user_writes_by_self */,
+                  0 /* expected_user_writes_with_wal */);
+  }
+
+  {
+    mock_clock->SleepForMicroseconds(1500000);
+
+    std::map<std::string, std::string> db_stats;
+    ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+    AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+                  0 /* expected_user_bytes_written */,
+                  0 /* expected_wal_bytes_written */,
+                  0 /* expected_user_writes_by_self */,
+                  0 /* expected_user_writes_with_wal */);
+  }
+
+  int expected_user_bytes_written = 0;
+  {
+    // Write with WAL disabled.
+    WriteOptions write_opts;
+    write_opts.disableWAL = true;
+
+    WriteBatch batch;
+    ASSERT_OK(batch.Put("key", "val"));
+    expected_user_bytes_written += static_cast<int>(batch.GetDataSize());
+
+    ASSERT_OK(db_->Write(write_opts, &batch));
+
+    std::map<std::string, std::string> db_stats;
+    ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+    AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+                  expected_user_bytes_written,
+                  0 /* expected_wal_bytes_written */,
+                  1 /* expected_user_writes_by_self */,
+                  0 /* expected_user_writes_with_wal */);
+  }
+
+  int expected_wal_bytes_written = 0;
+  {
+    // Write with WAL enabled.
+    WriteBatch batch;
+    ASSERT_OK(batch.Delete("key"));
+    expected_user_bytes_written += static_cast<int>(batch.GetDataSize());
+    expected_wal_bytes_written += static_cast<int>(batch.GetDataSize());
+
+    ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+    std::map<std::string, std::string> db_stats;
+    ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+    AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+                  expected_user_bytes_written, expected_wal_bytes_written,
+                  2 /* expected_user_writes_by_self */,
+                  1 /* expected_user_writes_with_wal */);
+  }
+
+  Close();
+}
+
+TEST_F(DBPropertiesTest, GetMapPropertyBlockCacheEntryStats) {
+  // Currently only verifies the expected properties are present
+  std::map<std::string, std::string> values;
+  ASSERT_TRUE(
+      db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+
+  ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::CacheId()) !=
+              values.end());
+  ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::CacheCapacityBytes()) !=
+              values.end());
+  ASSERT_TRUE(
+      values.find(
+          BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds()) !=
+      values.end());
+  ASSERT_TRUE(
+      values.find(BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds()) !=
+      values.end());
+  for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+    CacheEntryRole role = static_cast<CacheEntryRole>(i);
+    ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::EntryCount(role)) !=
+                values.end());
+    ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::UsedBytes(role)) !=
+                values.end());
+    ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::UsedPercent(role)) !=
+                values.end());
+  }
+
+  // There should be no extra values in the map.
+  ASSERT_EQ(3 * kNumCacheEntryRoles + 4, values.size());
+}
+
+namespace {
+std::string PopMetaIndexKey(InternalIterator* meta_iter) {
+  Status s = meta_iter->status();
+  if (!s.ok()) {
+    return s.ToString();
+  } else if (meta_iter->Valid()) {
+    std::string rv = meta_iter->key().ToString();
+    meta_iter->Next();
+    return rv;
+  } else {
+    return "NOT_FOUND";
+  }
+}
+
+}  // anonymous namespace
+
+TEST_F(DBPropertiesTest, TableMetaIndexKeys) {
+  // This is to detect unexpected churn in metaindex block keys. This is more
+  // of a "table test" but table_test.cc doesn't depend on db_test_util.h and
+  // we need ChangeOptions() for broad coverage.
+  constexpr int kKeyCount = 100;
+  do {
+    Options options;
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    // Create an SST file
+    for (int key = 0; key < kKeyCount; key++) {
+      ASSERT_OK(Put(Key(key), "val"));
+    }
+    ASSERT_OK(Flush());
+
+    // Find its file number
+    std::vector<LiveFileMetaData> files;
+    db_->GetLiveFilesMetaData(&files);
+    // 1 SST file
+    ASSERT_EQ(1, files.size());
+
+    // Open it for inspection
+    std::string sst_file =
+        files[0].directory + "/" + files[0].relative_filename;
+    std::unique_ptr<FSRandomAccessFile> f;
+    ASSERT_OK(env_->GetFileSystem()->NewRandomAccessFile(
+        sst_file, FileOptions(), &f, nullptr));
+    std::unique_ptr<RandomAccessFileReader> r;
+    r.reset(new RandomAccessFileReader(std::move(f), sst_file));
+    uint64_t file_size = 0;
+    ASSERT_OK(env_->GetFileSize(sst_file, &file_size));
+
+    // Read metaindex
+    BlockContents bc;
+    ASSERT_OK(ReadMetaIndexBlockInFile(r.get(), file_size, 0U,
+                                       ImmutableOptions(options), &bc));
+    Block metaindex_block(std::move(bc));
+    std::unique_ptr<InternalIterator> meta_iter;
+    meta_iter.reset(metaindex_block.NewMetaIterator());
+    meta_iter->SeekToFirst();
+
+    if (strcmp(options.table_factory->Name(),
+               TableFactory::kBlockBasedTableName()) == 0) {
+      auto bbto = options.table_factory->GetOptions<BlockBasedTableOptions>();
+      if (bbto->filter_policy) {
+        if (bbto->partition_filters) {
+          // The key names are intentionally hard-coded here to detect
+          // accidental regression on compatibility.
+          EXPECT_EQ("partitionedfilter.rocksdb.BuiltinBloomFilter",
+                    PopMetaIndexKey(meta_iter.get()));
+        } else {
+          EXPECT_EQ("fullfilter.rocksdb.BuiltinBloomFilter",
+                    PopMetaIndexKey(meta_iter.get()));
+        }
+      }
+      if (bbto->index_type == BlockBasedTableOptions::kHashSearch) {
+        EXPECT_EQ("rocksdb.hashindex.metadata",
+                  PopMetaIndexKey(meta_iter.get()));
+        EXPECT_EQ("rocksdb.hashindex.prefixes",
+                  PopMetaIndexKey(meta_iter.get()));
+      }
+    }
+    EXPECT_EQ("rocksdb.properties", PopMetaIndexKey(meta_iter.get()));
+    EXPECT_EQ("NOT_FOUND", PopMetaIndexKey(meta_iter.get()));
+  } while (ChangeOptions());
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_range_del_test.cc b/src/rocksdb/db/db_range_del_test.cc
new file mode 100644
index 000000000..d576f2217
--- /dev/null
+++ b/src/rocksdb/db/db_range_del_test.cc
@@ -0,0 +1,2807 @@
+//  Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(cbi): parameterize the test to cover user-defined timestamp cases
+class DBRangeDelTest : public DBTestBase {
+ public:
+  DBRangeDelTest() : DBTestBase("db_range_del_test", /*env_do_fsync=*/false) {}
+
+  std::string GetNumericStr(int key) {
+    uint64_t uint64_key = static_cast<uint64_t>(key);
+    std::string str;
+    str.resize(8);
+    memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
+    return str;
+  }
+};
+
+// PlainTableFactory, WriteBatchWithIndex, and NumTableFilesAtLevel() are not
+// supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
+  // TODO: figure out why MmapReads trips the iterator pinning assertion in
+  // RangeDelAggregator. Ideally it would be supported; otherwise it should at
+  // least be explicitly unsupported.
+  for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) {
+    option_config_ = config;
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "dr1", "dr1")
+                    .IsNotSupported());
+  }
+}
+
+TEST_F(DBRangeDelTest, WriteBatchWithIndexNotSupported) {
+  WriteBatchWithIndex indexedBatch{};
+  ASSERT_TRUE(indexedBatch.DeleteRange(db_->DefaultColumnFamily(), "dr1", "dr1")
+                  .IsNotSupported());
+  ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported());
+}
+
+TEST_F(DBRangeDelTest, EndSameAsStartCoversNothing) {
+  ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "b"));
+  ASSERT_EQ("val", Get("b"));
+}
+
+TEST_F(DBRangeDelTest, EndComesBeforeStartInvalidArgument) {
+  ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+  ASSERT_TRUE(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "a")
+          .IsInvalidArgument());
+  ASSERT_EQ("val", Get("b"));
+}
+
+TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
+  do {
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               "dr1", "dr2"));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) {
+  Options opts = CurrentOptions();
+  opts.compression_opts.max_dict_bytes = 16384;
+  Reopen(opts);
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+                             "dr2"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
+  do {
+    Options opts = CurrentOptions();
+    opts.disable_auto_compactions = true;
+    opts.statistics = CreateDBStatistics();
+    DestroyAndReopen(opts);
+
+    // snapshot protects range tombstone from dropping due to becoming obsolete.
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ASSERT_OK(
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                          true /* disallow_trivial_move */));
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+    db_->ReleaseSnapshot(snapshot);
+    // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
+    // compactions as the above assertions about the number of files in a level
+    // do not hold true.
+  } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
+                         kSkipFIFOCompaction));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
+  // regression test for exactly filled compaction output files. Previously
+  // another file would be generated containing all range deletions, which
+  // could invalidate the non-overlapping file boundary invariant.
+  const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  options.num_levels = 2;
+  options.target_file_size_base = kFileBytes;
+  BlockBasedTableOptions table_options;
+  table_options.block_size_deviation = 50;  // each block holds two keys
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(1)));
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(rnd.RandomString(3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+      if (j == 0 && i > 0) {
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+    }
+  }
+  // put extra key to trigger final flush
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
+  // Ensures range deletion spanning multiple compaction output files that are
+  // cut by max_compaction_bytes will have non-overlapping key-ranges.
+  // https://github.com/facebook/rocksdb/issues/1778
+  const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.disable_auto_compactions = true;
+  opts.level0_file_num_compaction_trigger = kNumFiles;
+  opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  // Want max_compaction_bytes to trigger the end of compaction output file, not
+  // target_file_size_base, so make the latter much bigger
+  //  opts.target_file_size_base = 100 * opts.max_compaction_bytes;
+  opts.target_file_size_base = 1;
+  DestroyAndReopen(opts);
+
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  Random rnd(301);
+
+  ASSERT_OK(Put(GetNumericStr(0), rnd.RandomString(kBytesPerVal)));
+  ASSERT_OK(
+      Put(GetNumericStr(kNumPerFile - 1), rnd.RandomString(kBytesPerVal)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(GetNumericStr(kNumPerFile), rnd.RandomString(kBytesPerVal)));
+  ASSERT_OK(
+      Put(GetNumericStr(kNumPerFile * 2 - 1), rnd.RandomString(kBytesPerVal)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(NumTableFilesAtLevel(2), 2);
+
+  ASSERT_OK(
+      db_->SetOptions(db_->DefaultColumnFamily(),
+                      {{"target_file_size_base",
+                        std::to_string(100 * opts.max_compaction_bytes)}}));
+
+  // It spans the whole key-range, thus will be included in all output files
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             GetNumericStr(0),
+                             GetNumericStr(kNumFiles * kNumPerFile - 1)));
+
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 1MB (256 values, each 4K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(rnd.RandomString(kBytesPerVal));
+      ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
+    }
+    // extra entry to trigger SpecialSkipListFactory's flush
+    ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+  }
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr,
+                                        /*column_family=*/nullptr,
+                                        /*disallow_trivial_move=*/true));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GE(NumTableFilesAtLevel(1), 2);
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+  for (size_t i = 0; i + 1 < files[1].size(); ++i) {
+    ASSERT_TRUE(InternalKeyComparator(opts.comparator)
+                    .Compare(files[1][i].largest, files[1][i + 1].smallest) <
+                0);
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
+  // Regression test for bug where sentinel range deletions (i.e., ones with
+  // sequence number of zero) were included in output files.
+  // snapshot protects range tombstone from dropping due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  // gaps between ranges creates sentinels in our internal representation
+  std::vector<std::pair<std::string, std::string>> range_dels = {
+      {"a", "b"}, {"c", "d"}, {"e", "f"}};
+  for (const auto& range_del : range_dels) {
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               range_del.first, range_del.second));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+  ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
+  ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  ASSERT_OK(db_->Put(WriteOptions(), "b2", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+  // first iteration verifies query correctness in memtable, second verifies
+  // query correctness for a single SST file
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      ASSERT_EQ(1, NumTableFilesAtLevel(0));
+    }
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+    ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
+  }
+}
+
+TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
+  ASSERT_OK(db_->Put(WriteOptions(), "unused",
+                     "val"));  // prevents empty after compaction
+  ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                            true /* disallow_trivial_move */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(0));
+      ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    }
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+  }
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
+  const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  DestroyAndReopen(opts);
+
+  // Write a third before snapshot, a third between snapshot and tombstone, and
+  // a third after the tombstone. Keys older than snapshot or newer than the
+  // tombstone should be preserved.
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNum; ++i) {
+    if (i == kNum / 3) {
+      snapshot = db_->GetSnapshot();
+    } else if (i == 2 * kNum / 3) {
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 GetNumericStr(kRangeBegin),
+                                 GetNumericStr(kRangeEnd)));
+    }
+    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  for (int i = 0; i < kNum; ++i) {
+    ReadOptions read_opts;
+    read_opts.ignore_range_deletions = true;
+    std::string value;
+    if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) {
+      ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value));
+    } else {
+      ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound());
+    }
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
+  const int kNumPerFile = 100, kNumFiles = 4;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.disable_auto_compactions = true;
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  opts.num_levels = 2;
+  opts.statistics = CreateDBStatistics();
+  DestroyAndReopen(opts);
+
+  for (int i = 0; i < kNumFiles; ++i) {
+    if (i > 0) {
+      // range tombstone covers first half of the previous file
+      ASSERT_OK(db_->DeleteRange(
+          WriteOptions(), db_->DefaultColumnFamily(),
+          GetNumericStr((i - 1) * kNumPerFile),
+          GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2)));
+    }
+    // Make sure a given key appears in each file so compaction won't be able to
+    // use trivial move, which would happen if the ranges were non-overlapping.
+    // Also, we need an extra element since flush is only triggered when the
+    // number of keys is one greater than SpecialSkipListFactory's limit.
+    // We choose a key outside the key-range used by the test to avoid conflict.
+    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles),
+                       "val"));
+
+    for (int j = 0; j < kNumPerFile; ++j) {
+      ASSERT_OK(
+          db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val"));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
+            TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));
+
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumPerFile; ++j) {
+      ReadOptions read_opts;
+      read_opts.ignore_range_deletions = true;
+      std::string value;
+      if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
+        ASSERT_OK(
+            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
+      } else {
+        ASSERT_TRUE(
+            db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
+                .IsNotFound());
+      }
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
+  const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.max_bytes_for_level_base = 2 * kFileBytes;
+  options.max_subcompactions = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  options.num_levels = 3;
+  options.target_file_size_base = kFileBytes;
+  options.target_file_size_multiplier = 1;
+  options.max_compaction_bytes = 1500;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < kNumFiles; ++j) {
+      if (i > 0) {
+        // delete [95,105) in two files, [295,305) in next two
+        int mid = (j + (1 - j % 2)) * kNumPerFile;
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   Key(mid - 5), Key(mid + 5)));
+      }
+      std::vector<std::string> values;
+      // Write 100KB (100 values, each 1K)
+      for (int k = 0; k < kNumPerFile; k++) {
+        values.push_back(rnd.RandomString(990));
+        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put("", ""));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      if (j < kNumFiles - 1) {
+        // background compaction may happen early for kNumFiles'th file
+        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+      }
+      if (j == options.level0_file_num_compaction_trigger - 1) {
+        // When i == 1, compaction will output some files to L1, at which point
+        // L1 is not bottommost so range deletions cannot be compacted away. The
+        // new L1 files must be generated with non-overlapping key ranges even
+        // though multiple subcompactions see the same ranges deleted, else an
+        // assertion will fail.
+        //
+        // Only enable auto-compactions when we're ready; otherwise, the
+        // oversized L0 (relative to base_level) causes the compaction to run
+        // earlier.
+        ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+        ASSERT_OK(dbfull()->TEST_WaitForCompact());
+        ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+                                  {{"disable_auto_compactions", "true"}}));
+        ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+        ASSERT_GT(NumTableFilesAtLevel(1), 0);
+        ASSERT_GT(NumTableFilesAtLevel(2), 0);
+      }
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
+  const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
+  Options options = CurrentOptions();
+  options.compaction_options_universal.min_merge_width = kFilesPerLevel;
+  options.compaction_options_universal.max_merge_width = kFilesPerLevel;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kFilesPerLevel;
+  options.max_subcompactions = 4;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  options.num_levels = kNumLevels;
+  options.target_file_size_base = kNumPerFile << 10;
+  options.target_file_size_multiplier = 1;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevels - 1; ++i) {
+    for (int j = 0; j < kFilesPerLevel; ++j) {
+      if (i == kNumLevels - 2) {
+        // insert range deletions [95,105) in two files, [295,305) in next two
+        // to prepare L1 for later manual compaction.
+        int mid = (j + (1 - j % 2)) * kNumPerFile;
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   Key(mid - 5), Key(mid + 5)));
+      }
+      std::vector<std::string> values;
+      // Write 100KB (100 values, each 1K)
+      for (int k = 0; k < kNumPerFile; k++) {
+        values.push_back(rnd.RandomString(990));
+        ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+      }
+      // put extra key to trigger flush
+      ASSERT_OK(Put("", ""));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      if (j < kFilesPerLevel - 1) {
+        // background compaction may happen early for kFilesPerLevel'th file
+        ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+      }
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+  }
+  // Now L1-L3 are full, when we compact L1->L2 we should see (1) subcompactions
+  // happen since input level > 0; (2) range deletions are not dropped since
+  // output level is not bottommost. If no file boundary assertion fails, that
+  // probably means universal compaction + subcompaction + range deletion are
+  // compatible.
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd(),
+      1 /* input_level */, 2 /* output_level */, CompactRangeOptions(),
+      nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+      true /* disallow_trivial_move */,
+      std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+      "" /*trim_ts*/));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
+  const int kNumPerFile = 3, kNumFiles = 3;
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(2 * kNumPerFile));
+  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  opts.num_levels = 2;
+  Reopen(opts);
+
+  // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
+  // requires an extra entry.
+  for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) {
+    if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) {
+      // Delete merge operands from all but the last file
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "key", "key_"));
+    }
+    std::string val;
+    PutFixed64(&val, i);
+    ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+    // we need to prevent trivial move using Puts so compaction will actually
+    // process the merge operands.
+    ASSERT_OK(db_->Put(WriteOptions(), "prevent_trivial_move", ""));
+    if (i > 0 && i % kNumPerFile == 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+  }
+
+  ReadOptions read_opts;
+  read_opts.ignore_range_deletions = true;
+  std::string expected, actual;
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  PutFixed64(&expected, 45);  // 1+2+...+9
+  ASSERT_EQ(expected, actual);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  expected.clear();
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  uint64_t tmp;
+  Slice tmp2(actual);
+  GetFixed64(&tmp2, &tmp);
+  PutFixed64(&expected, 30);  // 6+7+8+9 (earlier operands covered by tombstone)
+  ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) {
+  // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4)
+  // Flush. The `CompactionIterator` previously had a bug where we forgot to
+  // check for covering range tombstones when processing the (1) Put, causing
+  // it to reappear after the flush.
+  Options opts = CurrentOptions();
+  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  Reopen(opts);
+
+  std::string val;
+  PutFixed64(&val, 1);
+  ASSERT_OK(db_->Put(WriteOptions(), "key", val));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
+                             "key_"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ReadOptions read_opts;
+  std::string expected, actual;
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  PutFixed64(&expected, 1);
+  ASSERT_EQ(expected, actual);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) {
+  // During compaction to bottommost level, verify range tombstones older than
+  // the oldest snapshot are removed, while others are preserved.
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.num_levels = 2;
+  opts.statistics = CreateDBStatistics();
+  Reopen(opts);
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+                             "dr10"));  // obsolete after compaction
+  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2",
+                             "dr20"));  // protected by snapshot
+  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
+  // The RangeDelAggregator holds pointers into range deletion blocks created by
+  // table readers. This test ensures the aggregator can still access those
+  // blocks even if it outlives the table readers that created them.
+  //
+  // DBIter always keeps readers open for L0 files. So, in order to test
+  // aggregator outliving reader, we need to have deletions in L1 files, which
+  // are opened/closed on-demand during the scan. This is accomplished by
+  // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
+  // from all lingering in L0 (there is at most one range deletion per L0 file).
+  //
+  // The first L1 file will contain a range deletion since its begin key is 0.
+  // SeekToFirst() references that table's reader and adds its range tombstone
+  // to the aggregator. Upon advancing beyond that table's key-range via Next(),
+  // the table reader will be unreferenced by the iterator. Since we manually
+  // call Evict() on all readers before the full scan, this unreference causes
+  // the reader's refcount to drop to zero and thus be destroyed.
+  //
+  // When it is destroyed, we do not remove its range deletions from the
+  // aggregator. So, subsequent calls to Next() must be able to use these
+  // deletions to decide whether a key is covered. This will work as long as
+  // the aggregator properly references the range deletion block.
+  const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.level0_file_num_compaction_trigger = 4;
+  opts.level0_stop_writes_trigger = 4;
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+  opts.num_levels = 2;
+  BlockBasedTableOptions bbto;
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.block_cache = NewLRUCache(8 << 20);
+  opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(opts);
+
+  // Hold a snapshot so range deletions can't become obsolete during compaction
+  // to bottommost level (i.e., L1).
+  const Snapshot* snapshot = db_->GetSnapshot();
+  for (int i = 0; i < kNum; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+    if (i > 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 GetNumericStr(kRangeBegin),
+                                 GetNumericStr(kRangeEnd)));
+    }
+  }
+  // Must be > 1 so the first L1 file can be closed before scan finishes
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+
+  ReadOptions read_opts;
+  auto* iter = db_->NewIterator(read_opts);
+  ASSERT_OK(iter->status());
+  int expected = kRangeEnd;
+  iter->SeekToFirst();
+  for (auto file_number : file_numbers) {
+    // This puts table caches in the state of being externally referenced only
+    // so they are destroyed immediately upon iterator unreferencing.
+    TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
+  }
+  for (; iter->Valid(); iter->Next()) {
+    ASSERT_EQ(GetNumericStr(expected), iter->key());
+    ++expected;
+    // Keep clearing block cache's LRU so range deletion block can be freed as
+    // soon as its refcount drops to zero.
+    bbto.block_cache->EraseUnRefEntries();
+  }
+  ASSERT_EQ(kNum, expected);
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+
+  // Also test proper cache handling in GetRangeTombstoneIterator,
+  // via TablesRangeTombstoneSummary. (This once triggered memory leak
+  // report with ASAN.)
+  opts.max_open_files = 1;
+  Reopen(opts);
+
+  std::string str;
+  ASSERT_OK(dbfull()->TablesRangeTombstoneSummary(db_->DefaultColumnFamily(),
+                                                  100, &str));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) {
+  do {
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+    ASSERT_OK(
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+    ReadOptions read_opts;
+    std::string value;
+    ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) {
+  do {
+    Options opts = CurrentOptions();
+    opts.max_write_buffer_number = 3;
+    opts.min_write_buffer_number_to_merge = 2;
+    // SpecialSkipListFactory lets us specify maximum number of elements the
+    // memtable can hold. It switches the active memtable to immutable (flush is
+    // prevented by the above options) upon inserting an element that would
+    // overflow the memtable.
+    opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+    DestroyAndReopen(opts);
+
+    ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+    ASSERT_OK(
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+    ASSERT_OK(db_->Put(WriteOptions(), "blah", "val"));
+
+    ReadOptions read_opts;
+    std::string value;
+    ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) {
+  do {
+    DestroyAndReopen(CurrentOptions());
+    ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+    // snapshot prevents key from being deleted during flush
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ASSERT_OK(
+        db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+
+    ReadOptions read_opts;
+    std::string value;
+    ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+    db_->ReleaseSnapshot(snapshot);
+  } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) {
+  const int kNumMergeOps = 10;
+  Options opts = CurrentOptions();
+  opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  Reopen(opts);
+
+  for (int i = 0; i < kNumMergeOps; ++i) {
+    std::string val;
+    PutFixed64(&val, i);
+    ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+    if (i == kNumMergeOps / 2) {
+      // deletes [0, 5]
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "key", "key_"));
+    }
+  }
+
+  ReadOptions read_opts;
+  std::string expected, actual;
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  PutFixed64(&expected, 30);  // 6+7+8+9
+  ASSERT_EQ(expected, actual);
+
+  expected.clear();
+  read_opts.ignore_range_deletions = true;
+  ASSERT_OK(db_->Get(read_opts, "key", &actual));
+  PutFixed64(&expected, 45);  // 0+1+2+...+9
+  ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) {
+  Options opts = CurrentOptions();
+  opts.max_write_buffer_number = 4;
+  opts.min_write_buffer_number_to_merge = 3;
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+  Reopen(opts);
+
+  ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val"));
+  // snapshot prevents key from being deleted during flush
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+  ReadOptions read_opts;
+  read_opts.ignore_range_deletions = true;
+  for (std::string key : {"sst_key", "imm_key", "mem_key"}) {
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, key, &value));
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
+  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  DestroyAndReopen(opts);
+
+  // Write half of the keys before the tombstone and half after the tombstone.
+  // Only covered keys (i.e., within the range and older than the tombstone)
+  // should be deleted.
+  for (int i = 0; i < kNum; ++i) {
+    if (i == kNum / 2) {
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 GetNumericStr(kRangeBegin),
+                                 GetNumericStr(kRangeEnd)));
+    }
+    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+  }
+  ReadOptions read_opts;
+  auto* iter = db_->NewIterator(read_opts);
+  ASSERT_OK(iter->status());
+
+  int expected = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_EQ(GetNumericStr(expected), iter->key());
+    if (expected == kRangeBegin - 1) {
+      expected = kNum / 2;
+    } else {
+      ++expected;
+    }
+  }
+  ASSERT_EQ(kNum, expected);
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
+  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+  Options opts = CurrentOptions();
+  opts.comparator = test::Uint64Comparator();
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  DestroyAndReopen(opts);
+
+  const Snapshot* snapshot = nullptr;
+  // Put a snapshot before the range tombstone, verify an iterator using that
+  // snapshot sees all inserted keys.
+  for (int i = 0; i < kNum; ++i) {
+    if (i == kNum / 2) {
+      snapshot = db_->GetSnapshot();
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 GetNumericStr(kRangeBegin),
+                                 GetNumericStr(kRangeEnd)));
+    }
+    ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+  }
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+  auto* iter = db_->NewIterator(read_opts);
+  ASSERT_OK(iter->status());
+
+  int expected = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_EQ(GetNumericStr(expected), iter->key());
+    ++expected;
+  }
+  ASSERT_EQ(kNum / 2, expected);
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
+  Options opts = CurrentOptions();
+  opts.max_write_buffer_number = 4;
+  opts.min_write_buffer_number_to_merge = 3;
+  opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+  Reopen(opts);
+
+  ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val"));
+  // snapshot prevents key from being deleted during flush
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+  ReadOptions read_opts;
+  read_opts.ignore_range_deletions = true;
+  auto* iter = db_->NewIterator(read_opts);
+  ASSERT_OK(iter->status());
+  int i = 0;
+  std::string expected[] = {"imm_key", "mem_key", "sst_key"};
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) {
+    std::string key;
+    ASSERT_EQ(expected[i], iter->key());
+  }
+  ASSERT_EQ(3, i);
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+#ifndef ROCKSDB_UBSAN_RUN
+TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
+  ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+  // snapshot prevents key from being deleted during flush
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+  // iterations check unsupported in memtable, l0, and then l1
+  for (int i = 0; i < 3; ++i) {
+    ReadOptions read_opts;
+    read_opts.tailing = true;
+    auto* iter = db_->NewIterator(read_opts);
+    if (i == 2) {
+      // For L1+, iterators over files are created on-demand, so need seek
+      iter->SeekToFirst();
+    }
+    ASSERT_TRUE(iter->status().IsNotSupported());
+
+    delete iter;
+    if (i == 0) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    } else if (i == 1) {
+      MoveFilesToLevel(1);
+    }
+  }
+  db_->ReleaseSnapshot(snapshot);
+}
+#endif  // !ROCKSDB_UBSAN_RUN
+
+TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) {
+  const int kNumFiles = 2, kNumKeysPerFile = 4;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+  options.max_subcompactions = 2;
+  options.num_levels = 2;
+  options.target_file_size_base = 4096;
+  Reopen(options);
+
+  // need a L1 file for subcompaction to be triggered
+  ASSERT_OK(
+      db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(0), "val"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+
+  // put enough keys to fill up the first subcompaction, and later range-delete
+  // them so that the first subcompaction outputs no key-values. In that case
+  // it'll consider making an SST file dedicated to range deletions.
+  for (int i = 0; i < kNumKeysPerFile; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+                       std::string(1024, 'a')));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(kNumKeysPerFile)));
+
+  // the above range tombstone can be dropped, so that one alone won't cause a
+  // dedicated file to be opened. We can make one protected by snapshot that
+  // must be considered. Make its range outside the first subcompaction's range
+  // to exercise the tricky part of the code.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             Key(kNumKeysPerFile + 1),
+                             Key(kNumKeysPerFile + 2)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MemtableBloomFilter) {
+  // regression test for #2743. the range delete tombstones in memtable should
+  // be added even when Get() skips searching due to its prefix bloom filter
+  const int kMemtableSize = 1 << 20;              // 1MB
+  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
+  const int kNumKeys = 1000;
+  const int kPrefixLen = 8;
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio =
+      static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+  options.prefix_extractor.reset(
+      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+  options.write_buffer_size = kMemtableSize;
+  Reopen(options);
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(kNumKeys)));
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+  }
+}
+
+TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
+  // This test originally verified that compaction treated files containing a
+  // split range deletion in the input level as an atomic unit. I.e.,
+  // compacting any input-level file(s) containing a portion of the range
+  // deletion causes all other input-level files containing portions of that
+  // same range deletion to be included in the compaction. Range deletion
+  // tombstones are now truncated to sstable boundaries which removed the need
+  // for that behavior (which could lead to excessively large
+  // compactions).
+  const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(2 /* num_entries_flush */));
+  // max file size could be 2x of target file size, so set it to half of that
+  options.target_file_size_base = kValueBytes / 2;
+  // disable dynamic_file_size, as it will cut L1 files into more files (than
+  // kNumFilesPerLevel).
+  options.level_compaction_dynamic_file_size = false;
+  options.max_compaction_bytes = 1500;
+  // i == 0: CompactFiles
+  // i == 1: CompactRange
+  // i == 2: automatic compaction
+  for (int i = 0; i < 3; ++i) {
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put(Key(0), ""));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    MoveFilesToLevel(2);
+    ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+    // snapshot protects range tombstone from dropping due to becoming obsolete.
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(0), Key(2 * kNumFilesPerLevel)));
+
+    Random rnd(301);
+    std::string value = rnd.RandomString(kValueBytes);
+    for (int j = 0; j < kNumFilesPerLevel; ++j) {
+      // give files overlapping key-ranges to prevent trivial move
+      ASSERT_OK(Put(Key(j), value));
+      ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+      if (j > 0) {
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+        ASSERT_EQ(j, NumTableFilesAtLevel(0));
+      }
+    }
+    // put extra key to trigger final flush
+    ASSERT_OK(Put("", ""));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1));
+
+    ColumnFamilyMetaData meta;
+    db_->GetColumnFamilyMetaData(&meta);
+    if (i == 0) {
+      ASSERT_OK(db_->CompactFiles(
+          CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    } else if (i == 1) {
+      auto begin_str = Key(0), end_str = Key(1);
+      Slice begin = begin_str, end = end_str;
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
+      ASSERT_EQ(3, NumTableFilesAtLevel(1));
+    } else if (i == 2) {
+      ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+                                {{"max_bytes_for_level_base", "10000"}}));
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    }
+    ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+    db_->ReleaseSnapshot(snapshot);
+  }
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
+  // Test the handling of the range-tombstone end-key as the
+  // upper-bound for an sstable.
+
+  const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(2 /* num_entries_flush */));
+  options.target_file_size_base = kValueBytes;
+  options.disable_auto_compactions = true;
+  // disable it for now, otherwise the L1 files are going be cut before data 1:
+  // L1: [0]   [1,4]
+  // L2: [0,0]
+  // because the grandparent file is between [0]->[1] and it's size is more than
+  // 1/8 of target size (4k).
+  options.level_compaction_dynamic_file_size = false;
+
+  DestroyAndReopen(options);
+
+  // Create an initial sstable at L2:
+  //   [key000000#1,1, key000000#1,1]
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // A snapshot protects the range tombstone from dropping due to
+  // becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(2 * kNumFilesPerLevel)));
+
+  // Create 2 additional sstables in L0. Note that the first sstable
+  // contains the range tombstone.
+  //   [key000000#3,1, key000004#72057594037927935,15]
+  //   [key000001#5,1, key000002#6,1]
+  Random rnd(301);
+  std::string value = rnd.RandomString(kValueBytes);
+  for (int j = 0; j < kNumFilesPerLevel; ++j) {
+    // Give files overlapping key-ranges to prevent a trivial move when we
+    // compact from L0 to L1.
+    ASSERT_OK(Put(Key(j), value));
+    ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
+  }
+  // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
+  // are 2 sstables generated in L1 due to the target_file_size_base setting.
+  //   L1:
+  //     [key000000#3,1, key000002#72057594037927935,15]
+  //     [key000002#6,1, key000004#72057594037927935,15]
+  //   L2:
+  //     [key000000#1,1, key000000#1,1]
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  {
+    // Compact the second sstable in L1:
+    //   L1:
+    //     [key000000#3,1, key000002#72057594037927935,15]
+    //   L2:
+    //     [key000000#1,1, key000000#1,1]
+    //     [key000002#6,1, key000004#72057594037927935,15]
+    //
+    // At the same time, verify the compaction does not cause the key at the
+    // endpoint (key000002#6,1) to disappear.
+    ASSERT_EQ(value, Get(Key(2)));
+    auto begin_str = Key(3);
+    const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(2, NumTableFilesAtLevel(2));
+    ASSERT_EQ(value, Get(Key(2)));
+  }
+
+  {
+    // Compact the first sstable in L1. This should be copacetic, but
+    // was previously resulting in overlapping sstables in L2 due to
+    // mishandling of the range tombstone end-key when used as the
+    // largest key for an sstable. The resulting LSM structure should
+    // be:
+    //
+    //   L2:
+    //     [key000000#1,1, key000001#72057594037927935,15]
+    //     [key000001#5,1, key000002#72057594037927935,15]
+    //     [key000002#6,1, key000004#72057594037927935,15]
+    auto begin_str = Key(0);
+    const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin));
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    ASSERT_EQ(3, NumTableFilesAtLevel(2));
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UnorderedTombstones) {
+  // Regression test for #2752. Range delete tombstones between
+  // different snapshot stripes are not stored in order, so the first
+  // tombstone of each snapshot stripe should be checked as a smallest
+  // candidate.
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  auto cf = db_->DefaultColumnFamily();
+
+  ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cf));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c"));
+  // Hold a snapshot to separate these two delete ranges.
+  auto snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b"));
+  ASSERT_OK(db_->Flush(FlushOptions(), cf));
+  db_->ReleaseSnapshot(snapshot);
+
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(cf, &files);
+  ASSERT_EQ(1, files[0].size());
+  ASSERT_EQ("a", files[0][0].smallest.user_key());
+  ASSERT_EQ("c", files[0][0].largest.user_key());
+
+  std::string v;
+  auto s = db_->Get(ReadOptions(), "a", &v);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+class MockMergeOperator : public MergeOperator {
+  // Mock non-associative operator. Non-associativity is expressed by lack of
+  // implementation for any `PartialMerge*` functions.
+ public:
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    assert(merge_out != nullptr);
+    merge_out->new_value = merge_in.operand_list.back().ToString();
+    return true;
+  }
+
+  const char* Name() const override { return "MockMergeOperator"; }
+};
+
+TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) {
+  // This test uses a non-associative merge operator since that is a convenient
+  // way to get compaction to write out files with overlapping user-keys at the
+  // endpoints. Note, however, overlapping endpoints can also occur with other
+  // value types (Put, etc.), assuming the right snapshots are present.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 1 << 10;
+  const int kNumFiles = 4;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new MockMergeOperator());
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  // Push dummy data to L3 so that our actual test files on L0-L2
+  // will not be considered "bottommost" level, otherwise compaction
+  // may prevent us from creating overlapping user keys
+  // as on the bottommost layer MergeHelper
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(3);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = rnd.RandomString(kValueBytes);
+      ASSERT_OK(db_->Merge(WriteOptions(), "key", value));
+    }
+    if (i == kNumFiles - 1) {
+      // Take snapshot to prevent covered merge operands from being dropped by
+      // compaction.
+      snapshot = db_->GetSnapshot();
+      // The DeleteRange is the last write so all merge operands are covered.
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 "key", "key_"));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+  std::string value;
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(
+      0 /* level */, nullptr /* begin */, nullptr /* end */,
+      nullptr /* column_family */, true /* disallow_trivial_move */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  // Now we have multiple files at L1 all containing a single user key, thus
+  // guaranteeing overlap in the file endpoints.
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  // Verify no merge operands reappeared after the compaction.
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  // Compact and verify again. It's worthwhile because now the files have
+  // tighter endpoints, so we can verify that doesn't mess anything up.
+  ASSERT_OK(dbfull()->TEST_CompactRange(
+      1 /* level */, nullptr /* begin */, nullptr /* end */,
+      nullptr /* column_family */, true /* disallow_trivial_move */));
+  ASSERT_GT(NumTableFilesAtLevel(2), 1);
+  ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) {
+  // Verify a key newer than a range tombstone cannot be deleted by being
+  // compacted to the bottom level (and thus having its seqnum zeroed) before
+  // the range tombstone. This used to happen when range tombstones were
+  // untruncated on reads such that they extended past their file boundaries.
+  //
+  // Test summary:
+  //
+  // - L1 is bottommost.
+  // - A couple snapshots are strategically taken to prevent seqnums from being
+  //   zeroed, range tombstone from being dropped, merge operands from being
+  //   dropped, and merge operands from being combined.
+  // - Left half of files in L1 all have same user key, ensuring their file
+  //   boundaries overlap. In the past this would cause range tombstones to be
+  //   untruncated.
+  // - Right half of L1 files all have different keys, ensuring no overlap.
+  // - A range tombstone spans all L1 keys, so it is stored in every L1 file.
+  // - Keys in the right side of the key-range are overwritten. These are
+  //   compacted down to L1 after releasing snapshots such that their seqnums
+  //   will be zeroed.
+  // - A full range scan is performed. If the tombstone in the left L1 files
+  //   were untruncated, it would now cover keys newer than it (but with zeroed
+  //   seqnums) in the right L1 files.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 1 << 10;
+  const int kNumFiles = 4;
+  const int kMaxKey = kNumFiles * kFileBytes / kValueBytes;
+  const int kKeysOverwritten = 10;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new MockMergeOperator());
+  options.num_levels = 2;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  // - snapshots[0] prevents merge operands from being combined during
+  //   compaction.
+  // - snapshots[1] prevents merge operands from being dropped due to the
+  //   covering range tombstone.
+  const Snapshot* snapshots[] = {nullptr, nullptr};
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = rnd.RandomString(kValueBytes);
+      std::string key;
+      if (i < kNumFiles / 2) {
+        key = Key(0);
+      } else {
+        key = Key(1 + i * kFileBytes / kValueBytes + j);
+      }
+      ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+    }
+    if (i == 0) {
+      snapshots[0] = db_->GetSnapshot();
+    }
+    if (i == kNumFiles - 1) {
+      snapshots[1] = db_->GetSnapshot();
+      // The DeleteRange is the last write so all merge operands are covered.
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 Key(0), Key(kMaxKey + 1)));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+  auto get_key_count = [this]() -> int {
+    auto* iter = db_->NewIterator(ReadOptions());
+    assert(iter->status().ok());
+    iter->SeekToFirst();
+    int keys_found = 0;
+    for (; iter->Valid(); iter->Next()) {
+      ++keys_found;
+    }
+    delete iter;
+    return keys_found;
+  };
+
+  // All keys should be covered
+  ASSERT_EQ(0, get_key_count());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+                              nullptr /* end_key */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  // Roughly the left half of L1 files should have overlapping boundary keys,
+  // while the right half should not.
+  ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+  // Now overwrite a few keys that are in L1 files that definitely don't have
+  // overlapping boundary keys.
+  for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) {
+    auto value = rnd.RandomString(kValueBytes);
+    ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // The overwritten keys are in L0 now, so clearly aren't covered by the range
+  // tombstone in L1.
+  ASSERT_EQ(kKeysOverwritten, get_key_count());
+
+  // Release snapshots so seqnums can be zeroed when L0->L1 happens.
+  db_->ReleaseSnapshot(snapshots[0]);
+  db_->ReleaseSnapshot(snapshots[1]);
+
+  auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1);
+  auto end_key_storage = Key(kMaxKey);
+  Slice begin_key(begin_key_storage);
+  Slice end_key(end_key_storage);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+  ASSERT_EQ(kKeysOverwritten, get_key_count());
+}
+
+TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) {
+  // Exposes a bug where we were using
+  // `RangeDelPositioningMode::kBackwardTraversal` while scanning merge operands
+  // in the forward direction. Confusingly, this case happened during
+  // `DBIter::Prev`. It could cause assertion failure, or reappearing keys.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 1 << 10;
+  // Need multiple keys so we can get results when calling `Prev()` after
+  // `SeekToLast()`.
+  const int kNumKeys = 3;
+  const int kNumFiles = 4;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.merge_operator.reset(new MockMergeOperator());
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  const Snapshot* snapshot = nullptr;
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = rnd.RandomString(kValueBytes);
+      ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value));
+      if (i == 0 && j == kNumKeys) {
+        // Take snapshot to prevent covered merge operands from being dropped or
+        // merged by compaction.
+        snapshot = db_->GetSnapshot();
+        // Do a DeleteRange near the beginning so only the oldest merge operand
+        // for each key is covered. This ensures the sequence of events:
+        //
+        // - `DBIter::Prev()` is called
+        // - After several same versions of the same user key are encountered,
+        //   it decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`.
+        // - Binary searches to the newest version of the key, which is in the
+        //   leftmost file containing the user key.
+        // - Scans forwards to collect all merge operands. Eventually reaches
+        //   the rightmost file containing the oldest merge operand, which
+        //   should be covered by the `DeleteRange`. If `RangeDelAggregator`
+        //   were not properly using `kForwardTraversal` here, that operand
+        //   would reappear.
+        ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                   Key(0), Key(kNumKeys + 1)));
+      }
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+                              nullptr /* end_key */));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  auto* iter = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+  iter->SeekToLast();
+  int keys_found = 0;
+  for (; iter->Valid(); iter->Prev()) {
+    ++keys_found;
+  }
+  delete iter;
+  ASSERT_EQ(kNumKeys, keys_found);
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
+  const int kFileBytes = 1 << 20;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(10)));
+
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+  auto* iter = db_->NewIterator(read_opts);
+  ASSERT_OK(iter->status());
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(Key(0), iter->key());
+
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) {
+  const int kFileBytes = 1 << 20;
+
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = kFileBytes;
+  Reopen(options);
+
+  // block flush thread -> pin immtables in memory
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator",
+       "DBImpl::BGWorkFlush"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(0), "a"));
+  std::unique_ptr<const Snapshot, std::function<void(const Snapshot*)>>
+      snapshot(db_->GetSnapshot(),
+               [this](const Snapshot* s) { db_->ReleaseSnapshot(s); });
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(10)));
+
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot.get();
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+  ASSERT_OK(iter->status());
+
+  TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator");
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(Key(0), iter->key());
+
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+  // Adapted from
+  // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+  // Regression test for issue where range tombstone was written to more files
+  // than necessary when it began exactly at the begin key in the next
+  // compaction output file.
+  const int kFileBytes = 1 << 20;
+  const int kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  // Have a bit of slack in the size limits but we enforce them more strictly
+  // when manually flushing/compacting.
+  options.max_compaction_bytes = 2 * kFileBytes;
+  options.target_file_size_base = 2 * kFileBytes;
+  options.write_buffer_size = 2 * kFileBytes;
+  Reopen(options);
+
+  Random rnd(301);
+  for (char first_char : {'a', 'b', 'c'}) {
+    for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+      std::string key(1, first_char);
+      key.append(Key(i));
+      std::string value = rnd.RandomString(kValueBytes);
+      ASSERT_OK(Put(key, value));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    MoveFilesToLevel(2);
+  }
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+  // Populate the memtable lightly while spanning the whole key-space. The
+  // setting of `max_compaction_bytes` will cause the L0->L1 to output multiple
+  // files to prevent a large L1->L2 compaction later.
+  ASSERT_OK(Put("a", "val"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             "c" + Key(1), "d"));
+  // Our compaction output file cutting logic currently only considers point
+  // keys. So, in order for the range tombstone to have a chance at landing at
+  // the start of a new file, we need a point key at the range tombstone's
+  // start.
+  // TODO(ajkr): remove this `Put` after file cutting accounts for range
+  // tombstones (#3977).
+  ASSERT_OK(Put("c" + Key(1), "value"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
+  // and the range tombstone is only placed in the second SST.
+  std::string begin_key_storage("c" + Key(1));
+  Slice begin_key(begin_key_storage);
+  std::string end_key_storage("d");
+  Slice end_key(end_key_storage);
+  ASSERT_OK(dbfull()->TEST_CompactRange(
+      0 /* level */, &begin_key /* begin */, &end_key /* end */,
+      nullptr /* column_family */, true /* disallow_trivial_move */));
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  std::vector<LiveFileMetaData> all_metadata;
+  std::vector<LiveFileMetaData> l1_metadata;
+  db_->GetLiveFilesMetaData(&all_metadata);
+  for (const auto& metadata : all_metadata) {
+    if (metadata.level == 1) {
+      l1_metadata.push_back(metadata);
+    }
+  }
+  std::sort(l1_metadata.begin(), l1_metadata.end(),
+            [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
+              return options.comparator->Compare(a.smallestkey, b.smallestkey) <
+                     0;
+            });
+  ASSERT_EQ("a", l1_metadata[0].smallestkey);
+  ASSERT_EQ("a", l1_metadata[0].largestkey);
+  ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
+  ASSERT_EQ("d", l1_metadata[1].largestkey);
+
+  TablePropertiesCollection all_table_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
+  int64_t num_range_deletions = 0;
+  for (const auto& name_and_table_props : all_table_props) {
+    const auto& name = name_and_table_props.first;
+    const auto& table_props = name_and_table_props.second;
+    // The range tombstone should only be output to the second L1 SST.
+    if (name.size() >= l1_metadata[1].name.size() &&
+        name.substr(name.size() - l1_metadata[1].name.size())
+                .compare(l1_metadata[1].name) == 0) {
+      ASSERT_EQ(1, table_props->num_range_deletions);
+      ++num_range_deletions;
+    } else {
+      ASSERT_EQ(0, table_props->num_range_deletions);
+    }
+  }
+  ASSERT_EQ(1, num_range_deletions);
+}
+
+TEST_F(DBRangeDelTest, OverlappedTombstones) {
+  const int kNumPerFile = 4, kNumFiles = 2;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 9 * 1024;
+  options.max_compaction_bytes = 9 * 1024;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(rnd.RandomString(3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+                             Key((kNumFiles)*kNumPerFile + 1)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+
+  // The tombstone range is not broken up into multiple SSTs which may incur a
+  // large compaction with L2.
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  std::vector<std::vector<FileMetaData>> files;
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBRangeDelTest, OverlappedKeys) {
+  const int kNumPerFile = 4, kNumFiles = 2;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 9 * 1024;
+  options.max_compaction_bytes = 9 * 1024;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    std::vector<std::string> values;
+    // Write 12K (4 values, each 3K)
+    for (int j = 0; j < kNumPerFile; j++) {
+      values.push_back(rnd.RandomString(3 << 10));
+      ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+  for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) {
+    ASSERT_OK(Put(Key(i), "0x123"));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // The key range is broken up into three SSTs to avoid a future big compaction
+  // with the grandparent
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+                                        true /* disallow_trivial_move */));
+  // L1->L2 compaction size is limited to max_compaction_bytes
+  ASSERT_EQ(3, NumTableFilesAtLevel(2));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBRangeDelTest, IteratorRefresh) {
+  // Refreshing an iterator after a range tombstone is added should cause the
+  // deleted range of keys to disappear.
+  for (bool sv_changed : {false, true}) {
+    ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
+    ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));
+
+    auto* iter = db_->NewIterator(ReadOptions());
+    ASSERT_OK(iter->status());
+
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               "key2", "key3"));
+
+    if (sv_changed) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+
+    ASSERT_OK(iter->Refresh());
+    ASSERT_OK(iter->status());
+    iter->SeekToFirst();
+    ASSERT_EQ("key1", iter->key());
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+
+    delete iter;
+  }
+}
+
+void VerifyIteratorReachesEnd(InternalIterator* iter) {
+  ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+}
+
+void VerifyIteratorReachesEnd(Iterator* iter) {
+  ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+}
+
+TEST_F(DBRangeDelTest, IteratorReseek) {
+  // Range tombstone triggers reseek (seeking to a range tombstone end key) in
+  // merging iterator. Test set up:
+  //    one memtable: range tombstone [0, 1)
+  //    one immutable memtable: range tombstone [1, 2)
+  //    one L0 file with range tombstone [2, 3)
+  //    one L1 file with range tombstone [3, 4)
+  // Seek(0) should trigger cascading reseeks at all levels below memtable.
+  // Seek(1) should trigger cascading reseeks at all levels below immutable
+  // memtable. SeekToFirst and SeekToLast trigger no reseek.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  // L1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(4)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  // L0
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(3)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  // Immutable memtable
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+                             Key(2)));
+  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  std::string value;
+  ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
+                                    "rocksdb.num-immutable-mem-table", &value));
+  ASSERT_EQ(1, std::stoi(value));
+  // live memtable
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(1)));
+  // this memtable is still active
+  ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
+                                    "rocksdb.num-immutable-mem-table", &value));
+  ASSERT_EQ(1, std::stoi(value));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  get_perf_context()->Reset();
+  iter->Seek(Key(0));
+  // Reseeked immutable memtable, L0 and L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 3);
+  VerifyIteratorReachesEnd(iter);
+  get_perf_context()->Reset();
+  iter->SeekForPrev(Key(1));
+  // Reseeked L0 and L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+  VerifyIteratorReachesEnd(iter);
+  get_perf_context()->Reset();
+  iter->SeekToFirst();
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  VerifyIteratorReachesEnd(iter);
+  iter->SeekToLast();
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  VerifyIteratorReachesEnd(iter);
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, ReseekDuringNextAndPrev) {
+  // Range tombstone triggers reseek during Next()/Prev() in merging iterator.
+  // Test set up:
+  //    memtable has: [0, 1) [2, 3)
+  //    L0 has: 2
+  //    L1 has: 1, 2, 3
+  // Seek(0) will reseek to 1 for L0 and L1. Seek(1) will not trigger any
+  // reseek. Then Next() determines 2 is covered by [2, 3), it will try to
+  // reseek to 3 for L0 and L1. Similar story for Prev() and SeekForPrev() is
+  // tested.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  // L1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // Memtable
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(1)));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(3)));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  auto iter_test_forward = [&] {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(1));
+
+    get_perf_context()->Reset();
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(3));
+    // Reseeked L0 and L1
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+    // Next to Prev
+    get_perf_context()->Reset();
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(1));
+    // Reseeked L0 and L1
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+    // Prev to Next
+    get_perf_context()->Reset();
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(3));
+    // Reseeked L0 and L1
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+    iter->Next();
+    VerifyIteratorReachesEnd(iter);
+  };
+
+  get_perf_context()->Reset();
+  iter->Seek(Key(0));
+  // Reseeked L0 and L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+  iter_test_forward();
+  get_perf_context()->Reset();
+  iter->Seek(Key(1));
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  iter_test_forward();
+
+  get_perf_context()->Reset();
+  iter->SeekForPrev(Key(2));
+  // Reseeked L0 and L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+  iter_test_forward();
+  get_perf_context()->Reset();
+  iter->SeekForPrev(Key(1));
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  iter_test_forward();
+
+  get_perf_context()->Reset();
+  iter->SeekToFirst();
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+  iter_test_forward();
+
+  iter->SeekToLast();
+  iter->Prev();
+  iter_test_forward();
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneFromCurrentLevel) {
+  // Range tombstone triggers reseek when covering key from the same level.
+  // in merging iterator. Test set up:
+  //    memtable has: [0, 1)
+  //    L0 has: [2, 3), 2
+  //    L1 has: 1, 2, 3
+  // Seek(0) will reseek to 1 for L0 and L1.
+  // Then Next() will reseek to 3 for L1 since 2 in L0 is covered by [2, 3) in
+  // L0.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  // L1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(3)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // Memtable
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(1)));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  get_perf_context()->Reset();
+  iter->Seek(Key(0));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(1));
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+  get_perf_context()->Reset();
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(3));
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+
+  delete iter;
+}
+
+class TombstoneTestSstPartitioner : public SstPartitioner {
+ public:
+  const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+  PartitionerResult ShouldPartition(
+      const PartitionerRequest& request) override {
+    if (cmp->Compare(*request.current_user_key, DBTestBase::Key(5)) == 0) {
+      return kRequired;
+    } else {
+      return kNotRequired;
+    }
+  }
+
+  bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+                        const Slice& /*largest_user_key*/) override {
+    return false;
+  }
+
+  const Comparator* cmp = BytewiseComparator();
+};
+
+class TombstoneTestSstPartitionerFactory : public SstPartitionerFactory {
+ public:
+  static const char* kClassName() {
+    return "TombstoneTestSstPartitionerFactory";
+  }
+  const char* Name() const override { return kClassName(); }
+
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /* context */) const override {
+    return std::unique_ptr<SstPartitioner>(new TombstoneTestSstPartitioner());
+  }
+};
+
+TEST_F(DBRangeDelTest, TombstoneAcrossFileBoundary) {
+  // Verify that a range tombstone across file boundary covers keys from older
+  // levels. Test set up:
+  //    L1_0: 1, 3, [2, 6)   L1_1: 5, 7, [2, 6) ([2, 6) is from compaction with
+  //    L1_0) L2 has: 5
+  // Seek(1) and then Next() should move the L1 level iterator to
+  // L1_1. Check if 5 is returned after Next().
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 2 * 1024;
+  options.max_compaction_bytes = 2 * 1024;
+
+  // Make sure L1 files are split before "5"
+  auto factory = std::make_shared<TombstoneTestSstPartitionerFactory>();
+  options.sst_partitioner_factory = factory;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // L2
+  // the file should be smaller than max_compaction_bytes, otherwise the file
+  // will be cut before 7.
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 9)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(7), rnd.RandomString(1 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(1 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(1 << 10)));
+  // Prevent keys being compacted away
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(6)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  get_perf_context()->Reset();
+  iter->Seek(Key(1));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(1));
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(7));
+  // 1 reseek into L2 when key 5 in L2 is covered by [2, 6) from L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) {
+  // Verify that a range tombstone across file boundary covers keys from older
+  // levels.
+  // Test set up:
+  //    L1_0: 1, 3, [4, 7)         L1_1: 6, 8, [4, 7)
+  //    L2: 5
+  // Note that [4, 7) is at end of L1_0 and not overlapping with any point key
+  // in L1_0. [4, 7) from L1_0 should cover 5 is sentinel works
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 2 * 1024;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(8), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
+  // Prevent keys being compacted away
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+                             Key(7)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(3));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(3));
+  get_perf_context()->Reset();
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(8));
+  // 1 reseek into L1 since 5 from L2 is covered by [4, 7) from L1
+  ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+  for (auto& k : {4, 5, 6}) {
+    get_perf_context()->Reset();
+    iter->Seek(Key(k));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), Key(8));
+    // 1 reseek into L1
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+  }
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, OlderLevelHasNewerData) {
+  // L1_0: 1, 3, [2, 7)   L1_1: 5, 6 at a newer sequence number than [2, 7)
+  // Compact L1_1 to L2. Seek(3) should not skip 5 or 6.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(7)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L1_1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  auto key = Key(6);
+  Slice begin(key);
+  EXPECT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(3));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(5));
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), Key(6));
+  delete iter;
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, LevelBoundaryDefinedByTombstone) {
+  // L1 has: 1, 2, [4, 5)
+  // L2 has: 4
+  // Seek(3), which is over all points keys in L1, check whether
+  // sentinel key from L1 works in this case.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(4), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+                             Key(5)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(3));
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_OK(iter->status());
+
+  get_perf_context()->Reset();
+  iter->SeekForPrev(Key(5));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(2));
+  db_->ReleaseSnapshot(snapshot);
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyFile) {
+  // L1_0: 1, 2, L1_1: [3, 5)
+  // L2: 3
+  // Seek(2) then Next() should advance L1 iterator into L1_1.
+  // If sentinel works with tombstone only file, it should cover the key in L2.
+  // Similar story for SeekForPrev(4).
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(5)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(2));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(2));
+  iter->Next();
+  VerifyIteratorReachesEnd(iter);
+  iter->SeekForPrev(Key(4));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(2));
+  iter->Next();
+  VerifyIteratorReachesEnd(iter);
+  delete iter;
+}
+
+void VerifyIteratorKey(InternalIterator* iter,
+                       const std::vector<std::string>& expected_keys,
+                       bool forward = true) {
+  for (auto& key : expected_keys) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->user_key(), key);
+    if (forward) {
+      iter->Next();
+    } else {
+      iter->Prev();
+    }
+  }
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyLevel) {
+  // L1 [3, 5)
+  // L2 has: 3, 4
+  // Any kind of iterator seek should skip 3 and 4 in L2.
+  // L1 level iterator should produce sentinel key.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+
+  DestroyAndReopen(options);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(5)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  get_perf_context()->Reset();
+  uint64_t expected_reseek = 0;
+  for (auto i = 0; i < 7; ++i) {
+    iter->Seek(Key(i));
+    VerifyIteratorReachesEnd(iter);
+    if (i < 5) {
+      ++expected_reseek;
+    }
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+              expected_reseek);
+    iter->SeekForPrev(Key(i));
+    VerifyIteratorReachesEnd(iter);
+    if (i > 2) {
+      ++expected_reseek;
+    }
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+              expected_reseek);
+    iter->SeekToFirst();
+    VerifyIteratorReachesEnd(iter);
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+              ++expected_reseek);
+    iter->SeekToLast();
+    VerifyIteratorReachesEnd(iter);
+    ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+              ++expected_reseek);
+  }
+  delete iter;
+
+  // Check L1 LevelIterator behavior
+  ColumnFamilyData* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd();
+  SuperVersion* sv = cfd->GetSuperVersion();
+  Arena arena;
+  ReadOptions read_options;
+  MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
+                                          false /* prefix seek */);
+  InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
+      read_options, &merge_iter_builder, 1 /* level */, true);
+  // This is needed to make LevelIterator range tombstone aware
+  auto miter = merge_iter_builder.Finish();
+  auto k = Key(3);
+  IterKey target;
+  target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+  level_iter->Seek(target.GetInternalKey());
+  // sentinel key (file boundary as a fake key)
+  VerifyIteratorKey(level_iter, {Key(5)});
+  VerifyIteratorReachesEnd(level_iter);
+
+  k = Key(5);
+  target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+  level_iter->SeekForPrev(target.GetInternalKey());
+  VerifyIteratorKey(level_iter, {Key(3)}, false);
+  VerifyIteratorReachesEnd(level_iter);
+
+  level_iter->SeekToFirst();
+  VerifyIteratorKey(level_iter, {Key(5)});
+  VerifyIteratorReachesEnd(level_iter);
+
+  level_iter->SeekToLast();
+  VerifyIteratorKey(level_iter, {Key(3)}, false);
+  VerifyIteratorReachesEnd(level_iter);
+
+  miter->~InternalIterator();
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyWithOlderVisibleKey) {
+  // L1: [3, 5)
+  // L2: 2, 4, 5
+  // 2 and 5 should be visible
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+
+  DestroyAndReopen(options);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // l1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(5)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  auto iter_test_backward = [&] {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(5));
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(2));
+    iter->Prev();
+    VerifyIteratorReachesEnd(iter);
+  };
+  auto iter_test_forward = [&] {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(2));
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(5));
+    iter->Next();
+    VerifyIteratorReachesEnd(iter);
+  };
+  iter->Seek(Key(4));
+  iter_test_backward();
+  iter->SeekForPrev(Key(4));
+  iter->Next();
+  iter_test_backward();
+
+  iter->Seek(Key(4));
+  iter->Prev();
+  iter_test_forward();
+  iter->SeekForPrev(Key(4));
+  iter_test_forward();
+
+  iter->SeekToFirst();
+  iter_test_forward();
+  iter->SeekToLast();
+  iter_test_backward();
+
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneSentinelDirectionChange) {
+  // L1: 7
+  // L2: [4, 6)
+  // L3: 4
+  // Seek(5) will have sentinel key 6 at the top of minHeap in merging iterator.
+  //  then do a prev, how would sentinel work?
+  // Redo the test after Put(5) into L1 so that there is a visible key in range
+  // [4, 6).
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+
+  DestroyAndReopen(options);
+  // L3
+  ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(3);
+  ASSERT_EQ(1, NumTableFilesAtLevel(3));
+  // L2
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+                             Key(6)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(7), "foobar"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(5));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(7));
+  iter->Prev();
+  ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+  delete iter;
+
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Key(5));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(5));
+  iter->Prev();
+  ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+  delete iter;
+}
+
+// Right sentinel tested in many test cases above
+TEST_F(DBRangeDelTest, LeftSentinelKeyTest) {
+  // L1_0: 0, 1    L1_1: [2, 3), 5
+  // L2: 2
+  // SeekForPrev(4) should give 1 due to sentinel key keeping [2, 3) alive.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+  options.max_compaction_bytes = 1024;
+
+  DestroyAndReopen(options);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_0
+  Random rnd(301);
+  ASSERT_OK(db_->Put(WriteOptions(), Key(0), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L1_1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(3)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->SeekForPrev(Key(4));
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(1));
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), Key(0));
+  iter->Prev();
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_OK(iter->status());
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, LeftSentinelKeyTestWithNewerKey) {
+  // L1_0: 1, 2 newer than L1_1,    L1_1: [2, 4), 5
+  // L2: 3
+  // SeekForPrev(4) then Prev() should give 2 and then 1.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+  options.max_compaction_bytes = 1024;
+
+  DestroyAndReopen(options);
+  // L2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1_1
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(4)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L1_0
+  Random rnd(301);
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+  // Used to verify sequence number of iterator key later.
+  auto seq = dbfull()->TEST_GetLastVisibleSequence();
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  Arena arena;
+  InternalKeyComparator icmp(options.comparator);
+  ReadOptions read_options;
+  ScopedArenaIterator iter;
+  iter.set(
+      dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber));
+
+  auto k = Key(4);
+  IterKey target;
+  target.SetInternalKey(k, 0 /* sequence_number */, kValueTypeForSeekForPrev);
+  iter->SeekForPrev(target.GetInternalKey());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->user_key(), Key(2));
+  SequenceNumber actual_seq;
+  ValueType type;
+  UnPackSequenceAndType(ExtractInternalKeyFooter(iter->key()), &actual_seq,
+                        &type);
+  ASSERT_EQ(seq, actual_seq);
+  // might as well check type
+  ASSERT_EQ(type, kTypeValue);
+
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->user_key(), Key(1));
+  iter->Prev();
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
+TEST_F(DBRangeDelTest, SentinelKeyCommonCaseTest) {
+  // L1 has 3 files
+  // L1_0: 1, 2     L1_1: [3, 4) 5, 6, [7, 8)     L1_2: 9
+  // Check iterator operations on LevelIterator.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 3 * 1024;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // L1_0
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  // L1_1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+                             Key(4)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(7),
+                             Key(8)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  // L1_2
+  ASSERT_OK(db_->Put(WriteOptions(), Key(9), rnd.RandomString(4 << 10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+  ColumnFamilyData* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd();
+  SuperVersion* sv = cfd->GetSuperVersion();
+  Arena arena;
+  ReadOptions read_options;
+  MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
+                                          false /* prefix seek */);
+  InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
+      read_options, &merge_iter_builder, 1 /* level */, true);
+  // This is needed to make LevelIterator range tombstone aware
+  auto miter = merge_iter_builder.Finish();
+  auto k = Key(7);
+  IterKey target;
+  target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+  level_iter->Seek(target.GetInternalKey());
+  // The last Key(9) is a sentinel key.
+  VerifyIteratorKey(level_iter, {Key(8), Key(9), Key(9)});
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  k = Key(6);
+  target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+  level_iter->Seek(target.GetInternalKey());
+  VerifyIteratorKey(level_iter, {Key(6), Key(8), Key(9), Key(9)});
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  k = Key(4);
+  target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+  level_iter->SeekForPrev(target.GetInternalKey());
+  VerifyIteratorKey(level_iter, {Key(3), Key(2), Key(1), Key(1)}, false);
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  k = Key(5);
+  target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+  level_iter->SeekForPrev(target.GetInternalKey());
+  VerifyIteratorKey(level_iter, {Key(5), Key(3), Key(2), Key(1), Key(1)},
+                    false);
+
+  level_iter->SeekToFirst();
+  VerifyIteratorKey(level_iter, {Key(1), Key(2), Key(2), Key(5), Key(6), Key(8),
+                                 Key(9), Key(9)});
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  level_iter->SeekToLast();
+  VerifyIteratorKey(
+      level_iter,
+      {Key(9), Key(9), Key(6), Key(5), Key(3), Key(2), Key(1), Key(1)}, false);
+  ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+  miter->~InternalIterator();
+}
+
+TEST_F(DBRangeDelTest, PrefixSentinelKey) {
+  // L1: ['aaaa', 'aaad'), 'bbbb'
+  // L2: 'aaac', 'aaae'
+  // Prefix extracts first 3 chars
+  // Seek('aaab') should give 'aaae' as first key.
+  // This is to test a previous bug where prefix seek sees there is no prefix in
+  // the SST file, and will just set file iter to null in LevelIterator and may
+  // just skip to the next SST file. But in this case, we should keep the file's
+  // tombstone alive.
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  table_options.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // L2:
+  ASSERT_OK(db_->Put(WriteOptions(), "aaac", rnd.RandomString(10)));
+  ASSERT_OK(db_->Put(WriteOptions(), "aaae", rnd.RandomString(10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // L1
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "aaaa",
+                             "aaad"));
+  ASSERT_OK(db_->Put(WriteOptions(), "bbbb", rnd.RandomString(10)));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(1);
+  ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek("aaab");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), "aaae");
+  delete iter;
+}
+
+TEST_F(DBRangeDelTest, RefreshMemtableIter) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ReadOptions ro;
+  ro.read_tier = kMemtableTier;
+  std::unique_ptr<Iterator> iter{db_->NewIterator(ro)};
+  ASSERT_OK(Flush());
+  // First refresh reinits iter, which had a bug where
+  // iter.memtable_range_tombstone_iter_ was not set to nullptr, and caused
+  // subsequent refresh to double free.
+  ASSERT_OK(iter->Refresh());
+  ASSERT_OK(iter->Refresh());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
+  // Memtable: a, [b, bz)
+  // Do a Seek on `a` with iterate_upper_bound being az
+  // range tombstone [b, bz) should not be processed (added to and
+  // popped from the min_heap in MergingIterator).
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a", "bar"));
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "bz"));
+
+  // I could not find a cleaner way to test this without relying on
+  // implementation detail. Tried to test the value of
+  // `internal_range_del_reseek_count` but that did not work
+  // since BlockBasedTable iterator becomes !Valid() when point key
+  // is out of bound and that reseek only happens when a point key
+  // is covered by some range tombstone.
+  SyncPoint::GetInstance()->SetCallBack("MergeIterator::PopDeleteRangeStart",
+                                        [](void*) {
+                                          // there should not be any range
+                                          // tombstone in the heap.
+                                          FAIL();
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ReadOptions read_opts;
+  std::string upper_bound = "az";
+  Slice upper_bound_slice = upper_bound;
+  read_opts.iterate_upper_bound = &upper_bound_slice;
+  std::unique_ptr<Iterator> iter{db_->NewIterator(read_opts)};
+  iter->Seek("a");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key(), "a");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_rate_limiter_test.cc b/src/rocksdb/db/db_rate_limiter_test.cc
new file mode 100644
index 000000000..e44cc047d
--- /dev/null
+++ b/src/rocksdb/db/db_rate_limiter_test.cc
@@ -0,0 +1,451 @@
+//  Copyright (c) 2022-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "util/file_checksum_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBRateLimiterOnReadTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+ public:
+  explicit DBRateLimiterOnReadTest()
+      : DBTestBase("db_rate_limiter_on_read_test", /*env_do_fsync=*/false),
+        use_direct_io_(std::get<0>(GetParam())),
+        use_block_cache_(std::get<1>(GetParam())),
+        use_readahead_(std::get<2>(GetParam())) {}
+
+  void Init() {
+    options_ = GetOptions();
+    Reopen(options_);
+    for (int i = 0; i < kNumFiles; ++i) {
+      for (int j = 0; j < kNumKeysPerFile; ++j) {
+        ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), "val"));
+      }
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(1);
+  }
+
+  BlockBasedTableOptions GetTableOptions() {
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = !use_block_cache_;
+    return table_options;
+  }
+
+  ReadOptions GetReadOptions() {
+    ReadOptions read_options;
+    read_options.rate_limiter_priority = Env::IO_USER;
+    read_options.readahead_size = use_readahead_ ? kReadaheadBytes : 0;
+    return read_options;
+  }
+
+  Options GetOptions() {
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    options.file_checksum_gen_factory.reset(new FileChecksumGenCrc32cFactory());
+    options.rate_limiter.reset(NewGenericRateLimiter(
+        1 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
+        10 /* fairness */, RateLimiter::Mode::kAllIo));
+    options.table_factory.reset(NewBlockBasedTableFactory(GetTableOptions()));
+    options.use_direct_reads = use_direct_io_;
+    return options;
+  }
+
+ protected:
+  const static int kNumKeysPerFile = 1;
+  const static int kNumFiles = 3;
+  const static int kReadaheadBytes = 32 << 10;  // 32KB
+
+  Options options_;
+  const bool use_direct_io_;
+  const bool use_block_cache_;
+  const bool use_readahead_;
+};
+
+std::string GetTestNameSuffix(
+    ::testing::TestParamInfo<std::tuple<bool, bool, bool>> info) {
+  std::ostringstream oss;
+  if (std::get<0>(info.param)) {
+    oss << "DirectIO";
+  } else {
+    oss << "BufferedIO";
+  }
+  if (std::get<1>(info.param)) {
+    oss << "_BlockCache";
+  } else {
+    oss << "_NoBlockCache";
+  }
+  if (std::get<2>(info.param)) {
+    oss << "_Readahead";
+  } else {
+    oss << "_NoReadahead";
+  }
+  return oss.str();
+}
+
+#ifndef ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()),
+                        GetTestNameSuffix);
+#else   // ROCKSDB_LITE
+// Cannot use direct I/O in lite mode.
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest,
+                        ::testing::Combine(::testing::Values(false),
+                                           ::testing::Bool(),
+                                           ::testing::Bool()),
+                        GetTestNameSuffix);
+#endif  // ROCKSDB_LITE
+
+TEST_P(DBRateLimiterOnReadTest, Get) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  int expected = 0;
+  for (int i = 0; i < kNumFiles; ++i) {
+    {
+      std::string value;
+      ASSERT_OK(db_->Get(GetReadOptions(), Key(i * kNumKeysPerFile), &value));
+      ++expected;
+    }
+    ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+    {
+      std::string value;
+      ASSERT_OK(db_->Get(GetReadOptions(), Key(i * kNumKeysPerFile), &value));
+      if (!use_block_cache_) {
+        ++expected;
+      }
+    }
+    ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+  }
+}
+
+TEST_P(DBRateLimiterOnReadTest, NewMultiGet) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  const int kNumKeys = kNumFiles * kNumKeysPerFile;
+  int64_t expected = 0;
+  {
+    std::vector<std::string> key_bufs;
+    key_bufs.reserve(kNumKeys);
+    std::vector<Slice> keys;
+    keys.reserve(kNumKeys);
+    for (int i = 0; i < kNumKeys; ++i) {
+      key_bufs.emplace_back(Key(i));
+      keys.emplace_back(key_bufs[i]);
+    }
+    std::vector<Status> statuses(kNumKeys);
+    std::vector<PinnableSlice> values(kNumKeys);
+    const int64_t prev_total_rl_req = options_.rate_limiter->GetTotalRequests();
+    db_->MultiGet(GetReadOptions(), dbfull()->DefaultColumnFamily(), kNumKeys,
+                  keys.data(), values.data(), statuses.data());
+    const int64_t cur_total_rl_req = options_.rate_limiter->GetTotalRequests();
+    for (int i = 0; i < kNumKeys; ++i) {
+      ASSERT_TRUE(statuses[i].ok());
+    }
+    ASSERT_GT(cur_total_rl_req, prev_total_rl_req);
+    ASSERT_EQ(cur_total_rl_req - prev_total_rl_req,
+              options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+  }
+  expected += kNumKeys;
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, OldMultiGet) {
+  // The old `vector<Status>`-returning `MultiGet()` APIs use `Read()`, which
+  // supports rate limiting.
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  const int kNumKeys = kNumFiles * kNumKeysPerFile;
+  int expected = 0;
+  {
+    std::vector<std::string> key_bufs;
+    key_bufs.reserve(kNumKeys);
+    std::vector<Slice> keys;
+    keys.reserve(kNumKeys);
+    for (int i = 0; i < kNumKeys; ++i) {
+      key_bufs.emplace_back(Key(i));
+      keys.emplace_back(key_bufs[i]);
+    }
+    std::vector<std::string> values;
+    std::vector<Status> statuses =
+        db_->MultiGet(GetReadOptions(), keys, &values);
+    for (int i = 0; i < kNumKeys; ++i) {
+      ASSERT_OK(statuses[i]);
+    }
+  }
+  expected += kNumKeys;
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, Iterator) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(GetReadOptions()));
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  int expected = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ++expected;
+    ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+  }
+
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+    // When `use_block_cache_ == true`, the reverse scan will access the blocks
+    // loaded to cache during the above forward scan, in which case no further
+    // file reads are expected.
+    if (!use_block_cache_) {
+      ++expected;
+    }
+  }
+  // Reverse scan does not read evenly (one block per iteration) due to
+  // descending seqno ordering, so wait until after the loop to check total.
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+#if !defined(ROCKSDB_LITE)
+
+TEST_P(DBRateLimiterOnReadTest, VerifyChecksum) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  ASSERT_OK(db_->VerifyChecksum(GetReadOptions()));
+  // The files are tiny so there should have just been one read per file.
+  int expected = kNumFiles;
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, VerifyFileChecksums) {
+  if (use_direct_io_ && !IsDirectIOSupported()) {
+    return;
+  }
+  Init();
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  ASSERT_OK(db_->VerifyFileChecksums(GetReadOptions()));
+  // The files are tiny so there should have just been one read per file.
+  int expected = kNumFiles;
+  ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+#endif  // !defined(ROCKSDB_LITE)
+
+class DBRateLimiterOnWriteTest : public DBTestBase {
+ public:
+  explicit DBRateLimiterOnWriteTest()
+      : DBTestBase("db_rate_limiter_on_write_test", /*env_do_fsync=*/false) {}
+
+  void Init() {
+    options_ = GetOptions();
+    ASSERT_OK(TryReopenWithColumnFamilies({"default"}, options_));
+    Random rnd(301);
+    for (int i = 0; i < kNumFiles; i++) {
+      ASSERT_OK(Put(0, kStartKey, rnd.RandomString(2)));
+      ASSERT_OK(Put(0, kEndKey, rnd.RandomString(2)));
+      ASSERT_OK(Flush(0));
+    }
+  }
+
+  Options GetOptions() {
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    options.rate_limiter.reset(NewGenericRateLimiter(
+        1 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
+        10 /* fairness */, RateLimiter::Mode::kWritesOnly));
+    options.table_factory.reset(
+        NewBlockBasedTableFactory(BlockBasedTableOptions()));
+    return options;
+  }
+
+ protected:
+  inline const static int64_t kNumFiles = 3;
+  inline const static std::string kStartKey = "a";
+  inline const static std::string kEndKey = "b";
+  Options options_;
+};
+
+TEST_F(DBRateLimiterOnWriteTest, Flush) {
+  std::int64_t prev_total_request = 0;
+
+  Init();
+
+  std::int64_t actual_flush_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+      prev_total_request;
+  std::int64_t exepcted_flush_request = kNumFiles;
+  EXPECT_EQ(actual_flush_request, exepcted_flush_request);
+  EXPECT_EQ(actual_flush_request,
+            options_.rate_limiter->GetTotalRequests(Env::IO_HIGH));
+}
+
+TEST_F(DBRateLimiterOnWriteTest, Compact) {
+  Init();
+
+  // Pre-comaction:
+  // level-0 : `kNumFiles` SST files overlapping on [kStartKey, kEndKey]
+#ifndef ROCKSDB_LITE
+  std::string files_per_level_pre_compaction = std::to_string(kNumFiles);
+  ASSERT_EQ(files_per_level_pre_compaction, FilesPerLevel(0 /* cf */));
+#endif  // !ROCKSDB_LITE
+
+  std::int64_t prev_total_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_LOW));
+
+  Compact(kStartKey, kEndKey);
+
+  std::int64_t actual_compaction_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+      prev_total_request;
+
+  // Post-comaction:
+  // level-0 : 0 SST file
+  // level-1 : 1 SST file
+#ifndef ROCKSDB_LITE
+  std::string files_per_level_post_compaction = "0,1";
+  ASSERT_EQ(files_per_level_post_compaction, FilesPerLevel(0 /* cf */));
+#endif  // !ROCKSDB_LITE
+
+  std::int64_t exepcted_compaction_request = 1;
+  EXPECT_EQ(actual_compaction_request, exepcted_compaction_request);
+  EXPECT_EQ(actual_compaction_request,
+            options_.rate_limiter->GetTotalRequests(Env::IO_LOW));
+}
+
+class DBRateLimiterOnWriteWALTest
+    : public DBRateLimiterOnWriteTest,
+      public ::testing::WithParamInterface<std::tuple<
+          bool /* WriteOptions::disableWal */,
+          bool /* Options::manual_wal_flush */,
+          Env::IOPriority /* WriteOptions::rate_limiter_priority */>> {
+ public:
+  static std::string GetTestNameSuffix(
+      ::testing::TestParamInfo<std::tuple<bool, bool, Env::IOPriority>> info) {
+    std::ostringstream oss;
+    if (std::get<0>(info.param)) {
+      oss << "DisableWAL";
+    } else {
+      oss << "EnableWAL";
+    }
+    if (std::get<1>(info.param)) {
+      oss << "_ManualWALFlush";
+    } else {
+      oss << "_AutoWALFlush";
+    }
+    if (std::get<2>(info.param) == Env::IO_USER) {
+      oss << "_RateLimitAutoWALFlush";
+    } else if (std::get<2>(info.param) == Env::IO_TOTAL) {
+      oss << "_NoRateLimitAutoWALFlush";
+    } else {
+      oss << "_RateLimitAutoWALFlushWithIncorrectPriority";
+    }
+    return oss.str();
+  }
+
+  explicit DBRateLimiterOnWriteWALTest()
+      : disable_wal_(std::get<0>(GetParam())),
+        manual_wal_flush_(std::get<1>(GetParam())),
+        rate_limiter_priority_(std::get<2>(GetParam())) {}
+
+  void Init() {
+    options_ = GetOptions();
+    options_.manual_wal_flush = manual_wal_flush_;
+    Reopen(options_);
+  }
+
+  WriteOptions GetWriteOptions() {
+    WriteOptions write_options;
+    write_options.disableWAL = disable_wal_;
+    write_options.rate_limiter_priority = rate_limiter_priority_;
+    return write_options;
+  }
+
+ protected:
+  bool disable_wal_;
+  bool manual_wal_flush_;
+  Env::IOPriority rate_limiter_priority_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBRateLimiterOnWriteWALTest, DBRateLimiterOnWriteWALTest,
+    ::testing::Values(std::make_tuple(false, false, Env::IO_TOTAL),
+                      std::make_tuple(false, false, Env::IO_USER),
+                      std::make_tuple(false, false, Env::IO_HIGH),
+                      std::make_tuple(false, true, Env::IO_USER),
+                      std::make_tuple(true, false, Env::IO_USER)),
+    DBRateLimiterOnWriteWALTest::GetTestNameSuffix);
+
+TEST_P(DBRateLimiterOnWriteWALTest, AutoWalFlush) {
+  Init();
+
+  const bool no_rate_limit_auto_wal_flush =
+      (rate_limiter_priority_ == Env::IO_TOTAL);
+  const bool valid_arg = (rate_limiter_priority_ == Env::IO_USER &&
+                          !disable_wal_ && !manual_wal_flush_);
+
+  std::int64_t prev_total_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+  Status s = Put("foo", "v1", GetWriteOptions());
+
+  if (no_rate_limit_auto_wal_flush || valid_arg) {
+    EXPECT_TRUE(s.ok());
+  } else {
+    EXPECT_TRUE(s.IsInvalidArgument());
+    EXPECT_TRUE(s.ToString().find("WriteOptions::rate_limiter_priority") !=
+                std::string::npos);
+  }
+
+  std::int64_t actual_auto_wal_flush_request =
+      options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+      prev_total_request;
+  std::int64_t expected_auto_wal_flush_request = valid_arg ? 1 : 0;
+
+  EXPECT_EQ(actual_auto_wal_flush_request, expected_auto_wal_flush_request);
+  EXPECT_EQ(actual_auto_wal_flush_request,
+            options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_readonly_with_timestamp_test.cc b/src/rocksdb/db/db_readonly_with_timestamp_test.cc
new file mode 100644
index 000000000..3f53e7806
--- /dev/null
+++ b/src/rocksdb/db/db_readonly_with_timestamp_test.cc
@@ -0,0 +1,960 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_with_timestamp_test_util.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBReadOnlyTestWithTimestamp : public DBBasicTestWithTimestampBase {
+ public:
+  DBReadOnlyTestWithTimestamp()
+      : DBBasicTestWithTimestampBase("db_readonly_test_with_timestamp") {}
+
+ protected:
+#ifndef ROCKSDB_LITE
+  void CheckDBOpenedAsCompactedDBWithOneLevel0File() {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    ASSERT_NE(versions, nullptr);
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    ASSERT_NE(cfd, nullptr);
+
+    Version* const current = cfd->current();
+    ASSERT_NE(current, nullptr);
+
+    const VersionStorageInfo* const storage_info = current->storage_info();
+    ASSERT_NE(storage_info, nullptr);
+
+    // Only 1 L0 file.
+    ASSERT_EQ(1, NumTableFilesAtLevel(0));
+    // L0 is the max level.
+    ASSERT_EQ(storage_info->num_non_empty_levels(), 1);
+  }
+
+  void CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles() {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    ASSERT_NE(versions, nullptr);
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    ASSERT_NE(cfd, nullptr);
+
+    Version* const current = cfd->current();
+    ASSERT_NE(current, nullptr);
+
+    const VersionStorageInfo* const storage_info = current->storage_info();
+    ASSERT_NE(storage_info, nullptr);
+
+    // L0 has no files.
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+    // All other levels have no files except the highest level with files.
+    for (int i = 1; i < storage_info->num_non_empty_levels() - 1; ++i) {
+      ASSERT_FALSE(storage_info->LevelFilesBrief(i).num_files > 0);
+    }
+
+    // The highest level with files have some files.
+    int highest_non_empty_level = storage_info->num_non_empty_levels() - 1;
+    ASSERT_TRUE(
+        storage_info->LevelFilesBrief(highest_non_empty_level).num_files > 0);
+  }
+#endif  // !ROCKSDB_LITE
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       IteratorAndGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       IteratorAndGetWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    ASSERT_TRUE(
+        db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+
+  auto get_value_and_check = [](DB* db, ReadOptions read_opts, Slice key,
+                                Slice expected_value, std::string expected_ts) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_OK(db->Get(read_opts, key.ToString(), &value_from_get, &timestamp));
+    ASSERT_EQ(expected_value, value_from_get);
+    ASSERT_EQ(expected_ts, timestamp);
+  };
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = 0;
+    // Forward iterate.
+    for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+         it->Next(), ++count, ++key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+      get_value_and_check(db_, read_opts, it->key(), it->value(),
+                          write_timestamps[i]);
+    }
+    size_t expected_count = kMaxKey - start_keys[i] + 1;
+    ASSERT_EQ(expected_count, count);
+
+    // Backward iterate.
+    count = 0;
+    for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+         it->Prev(), ++count, --key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+      get_value_and_check(db_, read_opts, it->key(), it->value(),
+                          write_timestamps[i]);
+    }
+    ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+    // SeekToFirst()/SeekToLast() with lower/upper bounds.
+    // Then iter with lower and upper bounds.
+    uint64_t l = 0;
+    uint64_t r = kMaxKey + 1;
+    while (l < r) {
+      std::string lb_str = Key1(l);
+      Slice lb = lb_str;
+      std::string ub_str = Key1(r);
+      Slice ub = ub_str;
+      read_opts.iterate_lower_bound = &lb;
+      read_opts.iterate_upper_bound = &ub;
+      it.reset(db_->NewIterator(read_opts));
+      for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+           it->Valid(); it->Next(), ++key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+        get_value_and_check(db_, read_opts, it->key(), it->value(),
+                            write_timestamps[i]);
+      }
+      ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+      for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+           it->Valid(); it->Prev(), --key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+        get_value_and_check(db_, read_opts, it->key(), it->value(),
+                            write_timestamps[i]);
+      }
+      l += (kMaxKey / 100);
+      r -= (kMaxKey / 100);
+    }
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, Iterators) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  const std::string read_timestamp = Timestamp(2, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  std::vector<Iterator*> iters;
+  ASSERT_OK(db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters));
+  ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+
+  int count = 0;
+  uint64_t key = 0;
+  // Forward iterate.
+  for (iters[0]->Seek(Key1(0)), key = 0; iters[0]->Valid();
+       iters[0]->Next(), ++count, ++key) {
+    CheckIterUserEntry(iters[0], Key1(key), kTypeValue,
+                       "value" + std::to_string(key), write_timestamp);
+  }
+
+  size_t expected_count = kMaxKey - 0 + 1;
+  ASSERT_EQ(expected_count, count);
+  delete iters[0];
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorsReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       IteratorsWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database in read only mode to test its timestamp support.
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ReadOptions read_opts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBGetReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBGetWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    ASSERT_TRUE(
+        db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBGetWithOnlyOneL0File) {
+  const int kNumKeysPerFile = 1026 * 2;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    int count = 0;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key, ++count) {
+      std::string value_from_get;
+      std::string timestamp;
+      ASSERT_OK(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp));
+      ASSERT_EQ("value" + std::to_string(i), value_from_get);
+      ASSERT_EQ(write_timestamps[i], timestamp);
+    }
+    size_t expected_count = kMaxKey - start_keys[i] + 1;
+    ASSERT_EQ(expected_count, count);
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBGetWithOnlyHighestNonEmptyLevelFiles) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles();
+
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    int count = 0;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key, ++count) {
+      std::string value_from_get;
+      std::string timestamp;
+      ASSERT_OK(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp));
+      ASSERT_EQ("value" + std::to_string(i), value_from_get);
+      ASSERT_EQ(write_timestamps[i], timestamp);
+    }
+    size_t expected_count = kMaxKey - start_keys[i] + 1;
+    ASSERT_EQ(expected_count, count);
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBMultiGetReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    key_strs.push_back(Key1(key));
+  }
+  for (const auto& key_str : key_strs) {
+    keys.emplace_back(key_str);
+  }
+  std::vector<std::string> values;
+  std::vector<std::string> timestamps;
+  std::vector<Status> status_list =
+      db_->MultiGet(read_opts, keys, &values, &timestamps);
+  for (const auto& status : status_list) {
+    ASSERT_TRUE(status.IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBMultiGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    key_strs.push_back(Key1(key));
+  }
+  for (const auto& key_str : key_strs) {
+    keys.emplace_back(key_str);
+  }
+  std::vector<std::string> values;
+  std::vector<std::string> timestamps;
+  std::vector<Status> status_list =
+      db_->MultiGet(read_opts, keys, &values, &timestamps);
+  for (const auto& status : status_list) {
+    ASSERT_TRUE(status.IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBMultiGetWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 1026;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(0));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  ReadOptions read_opts;
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    key_strs.push_back(Key1(key));
+  }
+  for (const auto& key_str : key_strs) {
+    keys.emplace_back(key_str);
+  }
+  std::vector<std::string> values;
+  std::vector<Status> status_list = db_->MultiGet(read_opts, keys, &values);
+  for (const auto& status : status_list) {
+    ASSERT_TRUE(status.IsInvalidArgument());
+  }
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBMultiGetWithOnlyOneL0File) {
+  const int kNumKeysPerFile = 1026 * 2;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::vector<std::string> key_strs;
+    std::vector<Slice> keys;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      key_strs.push_back(Key1(key));
+    }
+    for (const auto& key_str : key_strs) {
+      keys.emplace_back(key_str);
+    }
+    size_t batch_size = kMaxKey - start_keys[i] + 1;
+    std::vector<std::string> values;
+    std::vector<std::string> timestamps;
+    std::vector<Status> status_list =
+        db_->MultiGet(read_opts, keys, &values, &timestamps);
+    ASSERT_EQ(batch_size, values.size());
+    ASSERT_EQ(batch_size, timestamps.size());
+    for (uint64_t idx = 0; idx < values.size(); ++idx) {
+      ASSERT_EQ("value" + std::to_string(i), values[idx]);
+      ASSERT_EQ(write_timestamps[i], timestamps[idx]);
+      ASSERT_OK(status_list[idx]);
+    }
+  }
+
+  Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+       CompactedDBMultiGetWithOnlyHighestNonEmptyLevelFiles) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  Close();
+
+  // Reopen the database in read only mode as a Compacted DB to test its
+  // timestamp support.
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles();
+
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::vector<std::string> key_strs;
+    std::vector<Slice> keys;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      key_strs.push_back(Key1(key));
+    }
+    for (const auto& key_str : key_strs) {
+      keys.emplace_back(key_str);
+    }
+    size_t batch_size = kMaxKey - start_keys[i] + 1;
+    std::vector<std::string> values;
+    std::vector<std::string> timestamps;
+    std::vector<Status> status_list =
+        db_->MultiGet(read_opts, keys, &values, &timestamps);
+    ASSERT_EQ(batch_size, values.size());
+    ASSERT_EQ(batch_size, timestamps.size());
+    for (uint64_t idx = 0; idx < values.size(); ++idx) {
+      ASSERT_EQ("value" + std::to_string(i), values[idx]);
+      ASSERT_EQ(write_timestamps[i], timestamps[idx]);
+      ASSERT_OK(status_list[idx]);
+    }
+  }
+
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_secondary_test.cc b/src/rocksdb/db/db_secondary_test.cc
new file mode 100644
index 000000000..20d7534e0
--- /dev/null
+++ b/src/rocksdb/db/db_secondary_test.cc
@@ -0,0 +1,1693 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl_secondary.h"
+#include "db/db_test_util.h"
+#include "db/db_with_timestamp_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class DBSecondaryTestBase : public DBBasicTestWithTimestampBase {
+ public:
+  explicit DBSecondaryTestBase(const std::string& dbname)
+      : DBBasicTestWithTimestampBase(dbname),
+        secondary_path_(),
+        handles_secondary_(),
+        db_secondary_(nullptr) {
+    secondary_path_ =
+        test::PerThreadDBPath(env_, "/db_secondary_test_secondary");
+  }
+
+  ~DBSecondaryTestBase() override {
+    CloseSecondary();
+    if (getenv("KEEP_DB") != nullptr) {
+      fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str());
+    } else {
+      Options options;
+      options.env = env_;
+      EXPECT_OK(DestroyDB(secondary_path_, options));
+    }
+  }
+
+ protected:
+  Status ReopenAsSecondary(const Options& options) {
+    return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_);
+  }
+
+  void OpenSecondary(const Options& options);
+
+  Status TryOpenSecondary(const Options& options);
+
+  void OpenSecondaryWithColumnFamilies(
+      const std::vector<std::string>& column_families, const Options& options);
+
+  void CloseSecondary() {
+    for (auto h : handles_secondary_) {
+      ASSERT_OK(db_secondary_->DestroyColumnFamilyHandle(h));
+    }
+    handles_secondary_.clear();
+    delete db_secondary_;
+    db_secondary_ = nullptr;
+  }
+
+  DBImplSecondary* db_secondary_full() {
+    return static_cast<DBImplSecondary*>(db_secondary_);
+  }
+
+  void CheckFileTypeCounts(const std::string& dir, int expected_log,
+                           int expected_sst, int expected_manifest) const;
+
+  std::string secondary_path_;
+  std::vector<ColumnFamilyHandle*> handles_secondary_;
+  DB* db_secondary_;
+};
+
+void DBSecondaryTestBase::OpenSecondary(const Options& options) {
+  ASSERT_OK(TryOpenSecondary(options));
+}
+
+Status DBSecondaryTestBase::TryOpenSecondary(const Options& options) {
+  Status s =
+      DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_);
+  return s;
+}
+
+void DBSecondaryTestBase::OpenSecondaryWithColumnFamilies(
+    const std::vector<std::string>& column_families, const Options& options) {
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  for (const auto& cf_name : column_families) {
+    cf_descs.emplace_back(cf_name, options);
+  }
+  Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs,
+                                 &handles_secondary_, &db_secondary_);
+  ASSERT_OK(s);
+}
+
+void DBSecondaryTestBase::CheckFileTypeCounts(const std::string& dir,
+                                              int expected_log,
+                                              int expected_sst,
+                                              int expected_manifest) const {
+  std::vector<std::string> filenames;
+  ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+  int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+  for (auto file : filenames) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(file, &number, &type)) {
+      log_cnt += (type == kWalFile);
+      sst_cnt += (type == kTableFile);
+      manifest_cnt += (type == kDescriptorFile);
+    }
+  }
+  ASSERT_EQ(expected_log, log_cnt);
+  ASSERT_EQ(expected_sst, sst_cnt);
+  ASSERT_EQ(expected_manifest, manifest_cnt);
+}
+
+class DBSecondaryTest : public DBSecondaryTestBase {
+ public:
+  explicit DBSecondaryTest() : DBSecondaryTestBase("db_secondary_test") {}
+};
+
+TEST_F(DBSecondaryTest, FailOpenIfLoggerCreationFail) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  Reopen(options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) {
+        auto* s = reinterpret_cast<Status*>(arg);
+        assert(s);
+        *s = Status::IOError("Injected");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.max_open_files = -1;
+  Status s = TryOpenSecondary(options);
+  ASSERT_EQ(nullptr, options.info_log);
+  ASSERT_TRUE(s.IsIOError());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBSecondaryTest, NonExistingDb) {
+  Destroy(last_options_);
+
+  Options options = GetDefaultOptions();
+  options.env = env_;
+  options.max_open_files = -1;
+  const std::string dbname = "/doesnt/exist";
+  Status s =
+      DB::OpenAsSecondary(options, dbname, secondary_path_, &db_secondary_);
+  ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBSecondaryTest, ReopenAsSecondary) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+  ASSERT_OK(Put("foo", "foo_value"));
+  ASSERT_OK(Put("bar", "bar_value"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  Close();
+
+  ASSERT_OK(ReopenAsSecondary(options));
+  ASSERT_EQ("foo_value", Get("foo"));
+  ASSERT_EQ("bar_value", Get("bar"));
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  auto db1 = static_cast<DBImplSecondary*>(db_);
+  ASSERT_NE(nullptr, db1);
+  Iterator* iter = db1->NewIterator(ropts);
+  ASSERT_NE(nullptr, iter);
+  size_t count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    if (0 == count) {
+      ASSERT_EQ("bar", iter->key().ToString());
+      ASSERT_EQ("bar_value", iter->value().ToString());
+    } else if (1 == count) {
+      ASSERT_EQ("foo", iter->key().ToString());
+      ASSERT_EQ("foo_value", iter->value().ToString());
+    }
+    ++count;
+  }
+  delete iter;
+  ASSERT_EQ(2, count);
+}
+
+TEST_F(DBSecondaryTest, SimpleInternalCompaction) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  CompactionServiceInput input;
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    input.input_files.push_back(file.name);
+  }
+  ASSERT_EQ(input.input_files.size(), 3);
+
+  input.output_level = 1;
+  ASSERT_OK(db_->GetDbIdentity(input.db_id));
+  Close();
+
+  options.max_open_files = -1;
+  OpenSecondary(options);
+  auto cfh = db_secondary_->DefaultColumnFamily();
+
+  CompactionServiceResult result;
+  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input, &result));
+
+  ASSERT_EQ(result.output_files.size(), 1);
+  InternalKey smallest, largest;
+  smallest.DecodeFrom(result.output_files[0].smallest_internal_key);
+  largest.DecodeFrom(result.output_files[0].largest_internal_key);
+  ASSERT_EQ(smallest.user_key().ToString(), "bar");
+  ASSERT_EQ(largest.user_key().ToString(), "foo");
+  ASSERT_EQ(result.output_level, 1);
+  ASSERT_EQ(result.output_path, this->secondary_path_);
+  ASSERT_EQ(result.num_output_records, 2);
+  ASSERT_GT(result.bytes_written, 0);
+  ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  const int kRangeL2 = 10;
+  const int kRangeL1 = 30;
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i * kRangeL2), "value" + std::to_string(i)));
+    ASSERT_OK(Put(Key((i + 1) * kRangeL2 - 1), "value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(2);
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Put(Key(i * kRangeL1), "value" + std::to_string(i)));
+    ASSERT_OK(Put(Key((i + 1) * kRangeL1 - 1), "value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(1);
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put(Key(i * 30), "value" + std::to_string(i)));
+    ASSERT_OK(Put(Key(i * 30 + 50), "value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+
+  // pick 2 files on level 0 for compaction, which has 3 overlap files on L1
+  CompactionServiceInput input1;
+  input1.input_files.push_back(meta.levels[0].files[2].name);
+  input1.input_files.push_back(meta.levels[0].files[3].name);
+  input1.input_files.push_back(meta.levels[1].files[0].name);
+  input1.input_files.push_back(meta.levels[1].files[1].name);
+  input1.input_files.push_back(meta.levels[1].files[2].name);
+
+  input1.output_level = 1;
+  ASSERT_OK(db_->GetDbIdentity(input1.db_id));
+
+  options.max_open_files = -1;
+  Close();
+
+  OpenSecondary(options);
+  auto cfh = db_secondary_->DefaultColumnFamily();
+  CompactionServiceResult result;
+  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input1, &result));
+  ASSERT_OK(result.status);
+
+  // pick 2 files on level 1 for compaction, which has 6 overlap files on L2
+  CompactionServiceInput input2;
+  input2.input_files.push_back(meta.levels[1].files[1].name);
+  input2.input_files.push_back(meta.levels[1].files[2].name);
+  for (int i = 3; i < 9; i++) {
+    input2.input_files.push_back(meta.levels[2].files[i].name);
+  }
+
+  input2.output_level = 2;
+  input2.db_id = input1.db_id;
+  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input2, &result));
+  ASSERT_OK(result.status);
+
+  CloseSecondary();
+
+  // delete all l2 files, without update manifest
+  for (auto& file : meta.levels[2].files) {
+    ASSERT_OK(env_->DeleteFile(dbname_ + file.name));
+  }
+  OpenSecondary(options);
+  cfh = db_secondary_->DefaultColumnFamily();
+  Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input2, &result);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(result.status);
+
+  // TODO: L0 -> L1 compaction should success, currently version is not built
+  // if files is missing.
+  //  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(OpenAndCompactOptions(),
+  //  cfh, input1, &result));
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  CompactionServiceInput input;
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    input.input_files.push_back(file.name);
+  }
+  ASSERT_EQ(input.input_files.size(), 3);
+
+  input.output_level = 1;
+  ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+  // trigger compaction to delete the files for secondary instance compaction
+  ASSERT_OK(Put("foo", "foo_value" + std::to_string(3)));
+  ASSERT_OK(Put("bar", "bar_value" + std::to_string(3)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  Close();
+
+  options.max_open_files = -1;
+  OpenSecondary(options);
+  auto cfh = db_secondary_->DefaultColumnFamily();
+
+  CompactionServiceResult result;
+  Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input, &result);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  CompactionServiceInput input;
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    input.input_files.push_back(file.name);
+  }
+  ASSERT_EQ(input.input_files.size(), 3);
+
+  input.output_level = 1;
+  ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+  Close();
+
+  ASSERT_OK(env_->DeleteFile(dbname_ + input.input_files[0]));
+
+  options.max_open_files = -1;
+  OpenSecondary(options);
+  auto cfh = db_secondary_->DefaultColumnFamily();
+
+  CompactionServiceResult result;
+  Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input, &result);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(result.status);
+
+  input.input_files.erase(input.input_files.begin());
+
+  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+      OpenAndCompactOptions(), cfh, input, &result));
+  ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondary) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  const auto verify_db_func = [&](const std::string& foo_val,
+                                  const std::string& bar_val) {
+    std::string value;
+    ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+    ASSERT_EQ(foo_val, value);
+    ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+    ASSERT_EQ(bar_val, value);
+    Iterator* iter = db_secondary_->NewIterator(ropts);
+    ASSERT_NE(nullptr, iter);
+    iter->Seek("foo");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ(foo_val, iter->value().ToString());
+    iter->Seek("bar");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bar", iter->key().ToString());
+    ASSERT_EQ(bar_val, iter->value().ToString());
+    size_t count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ++count;
+    }
+    ASSERT_EQ(2, count);
+    delete iter;
+  };
+
+  verify_db_func("foo_value2", "bar_value2");
+
+  ASSERT_OK(Put("foo", "new_foo_value"));
+  ASSERT_OK(Put("bar", "new_bar_value"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  verify_db_func("new_foo_value", "new_bar_value");
+}
+
+namespace {
+class TraceFileEnv : public EnvWrapper {
+ public:
+  explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {}
+  static const char* kClassName() { return "TraceFileEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& env_options) override {
+    class TracedRandomAccessFile : public RandomAccessFile {
+     public:
+      TracedRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+                             std::atomic<int>& counter)
+          : target_(std::move(target)), files_closed_(counter) {}
+      ~TracedRandomAccessFile() override {
+        files_closed_.fetch_add(1, std::memory_order_relaxed);
+      }
+      Status Read(uint64_t offset, size_t n, Slice* result,
+                  char* scratch) const override {
+        return target_->Read(offset, n, result, scratch);
+      }
+
+     private:
+      std::unique_ptr<RandomAccessFile> target_;
+      std::atomic<int>& files_closed_;
+    };
+    Status s = target()->NewRandomAccessFile(f, r, env_options);
+    if (s.ok()) {
+      r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_));
+    }
+    return s;
+  }
+
+  int files_closed() const {
+    return files_closed_.load(std::memory_order_relaxed);
+  }
+
+ private:
+  std::atomic<int> files_closed_{0};
+};
+}  // anonymous namespace
+
+TEST_F(DBSecondaryTest, SecondaryCloseFiles) {
+  Options options;
+  options.env = env_;
+  options.max_open_files = 1;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  Options options1;
+  std::unique_ptr<Env> traced_env(new TraceFileEnv(env_));
+  options1.env = traced_env.get();
+  OpenSecondary(options1);
+
+  static const auto verify_db = [&]() {
+    std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+    std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(ReadOptions()));
+    for (iter1->SeekToFirst(), iter2->SeekToFirst();
+         iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) {
+      ASSERT_EQ(iter1->key(), iter2->key());
+      ASSERT_EQ(iter1->value(), iter2->value());
+    }
+    ASSERT_FALSE(iter1->Valid());
+    ASSERT_FALSE(iter2->Valid());
+  };
+
+  ASSERT_OK(Put("a", "value"));
+  ASSERT_OK(Put("c", "value"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  verify_db();
+
+  ASSERT_OK(Put("b", "value"));
+  ASSERT_OK(Put("d", "value"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  verify_db();
+
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  ASSERT_EQ(2, static_cast<TraceFileEnv*>(traced_env.get())->files_closed());
+
+  Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}});
+  ASSERT_TRUE(s.IsNotSupported());
+  CloseSecondary();
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+  }
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  const auto verify_db_func = [&](const std::string& foo_val,
+                                  const std::string& bar_val) {
+    std::string value;
+    ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+    ASSERT_EQ(foo_val, value);
+    ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+    ASSERT_EQ(bar_val, value);
+    Iterator* iter = db_secondary_->NewIterator(ropts);
+    ASSERT_NE(nullptr, iter);
+    iter->Seek("foo");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ(foo_val, iter->value().ToString());
+    iter->Seek("bar");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bar", iter->key().ToString());
+    ASSERT_EQ(bar_val, iter->value().ToString());
+    size_t count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ++count;
+    }
+    ASSERT_EQ(2, count);
+    delete iter;
+  };
+
+  verify_db_func("foo_value2", "bar_value2");
+
+  ASSERT_OK(Put("foo", "new_foo_value"));
+  ASSERT_OK(Put("bar", "new_bar_value"));
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  verify_db_func("new_foo_value", "new_bar_value");
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "new_foo_value_1"));
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  verify_db_func("new_foo_value_1", "new_bar_value");
+}
+
+TEST_F(DBSecondaryTest, SecondaryTailingBug_ISSUE_8467) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+  }
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  const auto verify_db = [&](const std::string& foo_val,
+                             const std::string& bar_val) {
+    std::string value;
+    ReadOptions ropts;
+    Status s = db_secondary_->Get(ropts, "foo", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ(foo_val, value);
+
+    s = db_secondary_->Get(ropts, "bar", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ(bar_val, value);
+  };
+
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+    verify_db("foo_value2", "bar_value2");
+  }
+}
+
+TEST_F(DBSecondaryTest, RefreshIterator) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  std::unique_ptr<Iterator> it(db_secondary_->NewIterator(ReadOptions()));
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+
+    ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+    if (0 == i) {
+      it->Seek("foo");
+      ASSERT_FALSE(it->Valid());
+      ASSERT_OK(it->status());
+
+      ASSERT_OK(it->Refresh());
+
+      it->Seek("foo");
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ("foo", it->key());
+      ASSERT_EQ("foo_value0", it->value());
+    } else {
+      it->Seek("foo");
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ("foo", it->key());
+      ASSERT_EQ("foo_value" + std::to_string(i - 1), it->value());
+      ASSERT_OK(it->status());
+
+      ASSERT_OK(it->Refresh());
+
+      it->Seek("foo");
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ("foo", it->key());
+      ASSERT_EQ("foo_value" + std::to_string(i), it->value());
+    }
+  }
+}
+
+TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) {
+  Options options;
+  options.env = env_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options1);
+  cf_descs.emplace_back("pikachu", options1);
+  cf_descs.emplace_back("eevee", options1);
+  Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs,
+                                 &handles_secondary_, &db_secondary_);
+  ASSERT_NOK(s);
+}
+
+TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
+  Options options;
+  options.env = env_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+  ASSERT_EQ(0, handles_secondary_.size());
+  ASSERT_NE(nullptr, db_secondary_);
+
+  ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value"));
+  ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value"));
+  ASSERT_OK(Flush(0 /*cf*/));
+  ASSERT_OK(Flush(1 /*cf*/));
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  std::string value;
+  ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+  ASSERT_EQ("foo_value", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) {
+  Options options;
+  options.env = env_;
+  Reopen(options);
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0",
+        "VersionSet::ProcessManifestWrites:BeforeNewManifest"},
+       {"DBImpl::Open:AfterDeleteFiles",
+        "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:"
+        "1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread ro_db_thread([&]() {
+    Options options1;
+    options1.env = env_;
+    options1.max_open_files = -1;
+    Status s = TryOpenSecondary(options1);
+    ASSERT_TRUE(s.IsTryAgain());
+
+    // Try again
+    OpenSecondary(options1);
+    CloseSecondary();
+  });
+  Reopen(options);
+  ro_db_thread.join();
+}
+
+TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+  for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  std::string value;
+  ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+  ASSERT_EQ("foo_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            value);
+  ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+  ASSERT_EQ("bar_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            value);
+  Iterator* iter = db_secondary_->NewIterator(ropts);
+  ASSERT_NE(nullptr, iter);
+  iter->Seek("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar", iter->key().ToString());
+  ASSERT_EQ("bar_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            iter->value().ToString());
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key().ToString());
+  ASSERT_EQ("foo_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            iter->value().ToString());
+  size_t count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ++count;
+  }
+  ASSERT_EQ(2, count);
+  delete iter;
+}
+
+TEST_F(DBSecondaryTest, MissingTableFile) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  Reopen(options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_NE(nullptr, db_secondary_full());
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  std::string value;
+  ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value));
+  ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value));
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+  ASSERT_EQ("foo_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            value);
+  ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+  ASSERT_EQ("bar_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            value);
+  Iterator* iter = db_secondary_->NewIterator(ropts);
+  ASSERT_NE(nullptr, iter);
+  iter->Seek("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar", iter->key().ToString());
+  ASSERT_EQ("bar_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            iter->value().ToString());
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key().ToString());
+  ASSERT_EQ("foo_value" +
+                std::to_string(options.level0_file_num_compaction_trigger - 1),
+            iter->value().ToString());
+  size_t count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ++count;
+  }
+  ASSERT_EQ(2, count);
+  delete iter;
+}
+
+TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) {
+  Options options;
+  options.env = env_;
+  const std::string kCfName1 = "pikachu";
+  CreateAndReopenWithCF({kCfName1}, options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondaryWithColumnFamilies({kCfName1}, options1);
+  ASSERT_EQ(2, handles_secondary_.size());
+
+  ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1"));
+  ASSERT_OK(Flush(1 /*cf*/));
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  std::string value;
+  ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+  ASSERT_EQ("foo_val_1", value);
+
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  Close();
+  CheckFileTypeCounts(dbname_, 1, 0, 1);
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  value.clear();
+  ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+  ASSERT_EQ("foo_val_1", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchManifest) {
+  Options options;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 4;
+  const std::string cf1_name("test_cf");
+  CreateAndReopenWithCF({cf1_name}, options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name},
+                                  options1);
+
+  const int kNumFiles = options.level0_file_num_compaction_trigger - 1;
+  // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1,
+  // ..., 9.
+  const int kNumKeys = 10;
+  // Create two sst
+  for (int i = 0; i != kNumFiles; ++i) {
+    for (int j = 0; j != kNumKeys; ++j) {
+      ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  const auto& range_scan_db = [&]() {
+    ReadOptions tmp_ropts;
+    tmp_ropts.total_order_seek = true;
+    tmp_ropts.verify_checksums = true;
+    std::unique_ptr<Iterator> iter(db_secondary_->NewIterator(tmp_ropts));
+    int cnt = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) {
+      ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString());
+      ASSERT_EQ("value_" + std::to_string(kNumFiles - 1),
+                iter->value().ToString());
+    }
+  };
+
+  range_scan_db();
+
+  // While secondary instance still keeps old MANIFEST open, we close primary,
+  // restart primary, performs full compaction, close again, restart again so
+  // that next time secondary tries to catch up with primary, the secondary
+  // will skip the MANIFEST in middle.
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  range_scan_db();
+}
+
+TEST_F(DBSecondaryTest, SwitchManifestTwice) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  const std::string cf1_name("test_cf");
+  CreateAndReopenWithCF({cf1_name}, options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name},
+                                  options1);
+
+  ASSERT_OK(Put("0", "value0"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  std::string value;
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+  ASSERT_EQ("value0", value);
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+  ASSERT_OK(Put("0", "value1"));
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+
+  ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+  ASSERT_EQ("value1", value);
+}
+
+TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) {
+  const int kNumKeysPerMemtable = 1;
+  Options options;
+  options.env = env_;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 2;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+  Reopen(options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  const auto& verify_db = [](DB* db1, DB* db2) {
+    ASSERT_NE(nullptr, db1);
+    ASSERT_NE(nullptr, db2);
+    ReadOptions read_opts;
+    read_opts.verify_checksums = true;
+    std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts));
+    std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts));
+    it1->SeekToFirst();
+    it2->SeekToFirst();
+    for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+      ASSERT_EQ(it1->key(), it2->key());
+      ASSERT_EQ(it1->value(), it2->value());
+    }
+    ASSERT_FALSE(it1->Valid());
+    ASSERT_FALSE(it2->Valid());
+
+    for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+      std::string value;
+      ASSERT_OK(db2->Get(read_opts, it1->key(), &value));
+      ASSERT_EQ(it1->value(), value);
+    }
+    for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+      std::string value;
+      ASSERT_OK(db1->Get(read_opts, it2->key(), &value));
+      ASSERT_EQ(it2->value(), value);
+    }
+  };
+  for (int k = 0; k != 16; ++k) {
+    ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k)));
+    ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+    verify_db(dbfull(), db_secondary_);
+  }
+}
+
+TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) {
+  const int kNumKeysPerMemtable = 1;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCallFlush:ContextCleanedUp",
+        "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  const std::string kCFName1 = "pikachu";
+  Options options;
+  options.env = env_;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 2;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+  CreateAndReopenWithCF({kCFName1}, options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondaryWithColumnFamilies({kCFName1}, options1);
+  ASSERT_EQ(2, handles_secondary_.size());
+
+  const auto& verify_db = [](DB* db1,
+                             const std::vector<ColumnFamilyHandle*>& handles1,
+                             DB* db2,
+                             const std::vector<ColumnFamilyHandle*>& handles2) {
+    ASSERT_NE(nullptr, db1);
+    ASSERT_NE(nullptr, db2);
+    ReadOptions read_opts;
+    read_opts.verify_checksums = true;
+    ASSERT_EQ(handles1.size(), handles2.size());
+    for (size_t i = 0; i != handles1.size(); ++i) {
+      std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts, handles1[i]));
+      std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts, handles2[i]));
+      it1->SeekToFirst();
+      it2->SeekToFirst();
+      for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+        ASSERT_EQ(it1->key(), it2->key());
+        ASSERT_EQ(it1->value(), it2->value());
+      }
+      ASSERT_FALSE(it1->Valid());
+      ASSERT_FALSE(it2->Valid());
+
+      for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+        std::string value;
+        ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value));
+        ASSERT_EQ(it1->value(), value);
+      }
+      for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+        std::string value;
+        ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value));
+        ASSERT_EQ(it2->value(), value);
+      }
+    }
+  };
+  for (int k = 0; k != 8; ++k) {
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(0 /*cf*/, "key" + std::to_string(k),
+                    "value" + std::to_string(k)));
+      ASSERT_OK(Put(1 /*cf*/, "key" + std::to_string(k),
+                    "value" + std::to_string(k)));
+    }
+    TEST_SYNC_POINT(
+        "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp");
+    ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+    verify_db(dbfull(), handles_, db_secondary_, handles_secondary_);
+    SyncPoint::GetInstance()->ClearTrace();
+  }
+}
+
+TEST_F(DBSecondaryTest, CatchUpAfterFlush) {
+  const int kNumKeysPerMemtable = 16;
+  Options options;
+  options.env = env_;
+  options.max_write_buffer_number = 4;
+  options.min_write_buffer_number_to_merge = 2;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+  Reopen(options);
+
+  Options options1;
+  options1.env = env_;
+  options1.max_open_files = -1;
+  OpenSecondary(options1);
+
+  WriteOptions write_opts;
+  WriteBatch wb;
+  ASSERT_OK(wb.Put("key0", "value0"));
+  ASSERT_OK(wb.Put("key1", "value1"));
+  ASSERT_OK(dbfull()->Write(write_opts, &wb));
+  ReadOptions read_opts;
+  std::unique_ptr<Iterator> iter1(db_secondary_->NewIterator(read_opts));
+  iter1->Seek("key0");
+  ASSERT_FALSE(iter1->Valid());
+  iter1->Seek("key1");
+  ASSERT_FALSE(iter1->Valid());
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  iter1->Seek("key0");
+  ASSERT_FALSE(iter1->Valid());
+  iter1->Seek("key1");
+  ASSERT_FALSE(iter1->Valid());
+  ASSERT_OK(iter1->status());
+  std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(read_opts));
+  iter2->Seek("key0");
+  ASSERT_TRUE(iter2->Valid());
+  ASSERT_EQ("value0", iter2->value());
+  iter2->Seek("key1");
+  ASSERT_TRUE(iter2->Valid());
+  ASSERT_OK(iter2->status());
+  ASSERT_EQ("value1", iter2->value());
+
+  {
+    WriteBatch wb1;
+    ASSERT_OK(wb1.Put("key0", "value01"));
+    ASSERT_OK(wb1.Put("key1", "value11"));
+    ASSERT_OK(dbfull()->Write(write_opts, &wb1));
+  }
+
+  {
+    WriteBatch wb2;
+    ASSERT_OK(wb2.Put("key0", "new_value0"));
+    ASSERT_OK(wb2.Delete("key1"));
+    ASSERT_OK(dbfull()->Write(write_opts, &wb2));
+  }
+
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+  std::unique_ptr<Iterator> iter3(db_secondary_->NewIterator(read_opts));
+  // iter3 should not see value01 and value11 at all.
+  iter3->Seek("key0");
+  ASSERT_TRUE(iter3->Valid());
+  ASSERT_EQ("new_value0", iter3->value());
+  iter3->Seek("key1");
+  ASSERT_FALSE(iter3->Valid());
+  ASSERT_OK(iter3->status());
+}
+
+TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) {
+  bool called = false;
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        called = true;
+        auto* s = reinterpret_cast<Status*>(arg);
+        ASSERT_NOK(*s);
+      });
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData",
+        "BackgroundCallCompaction:0"},
+       {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+        "DBImpl::CheckConsistency:BeforeGetFileSize"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("a", "value0"));
+  ASSERT_OK(Put("c", "value0"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("b", "value1"));
+  ASSERT_OK(Put("d", "value1"));
+  ASSERT_OK(Flush());
+  port::Thread thread([this]() {
+    Options opts;
+    opts.env = env_;
+    opts.max_open_files = -1;
+    OpenSecondary(opts);
+  });
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  thread.join();
+  ASSERT_TRUE(called);
+}
+
+TEST_F(DBSecondaryTest, StartFromInconsistent) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value"));
+  ASSERT_OK(Flush());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        *(reinterpret_cast<Status*>(arg)) =
+            Status::Corruption("Inject corruption");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Options options1;
+  options1.env = env_;
+  Status s = TryOpenSecondary(options1);
+  ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBSecondaryTest, InconsistencyDuringCatchUp) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value"));
+  ASSERT_OK(Flush());
+
+  Options options1;
+  options1.env = env_;
+  OpenSecondary(options1);
+
+  {
+    std::string value;
+    ASSERT_OK(db_secondary_->Get(ReadOptions(), "foo", &value));
+    ASSERT_EQ("value", value);
+  }
+
+  ASSERT_OK(Put("bar", "value1"));
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        *(reinterpret_cast<Status*>(arg)) =
+            Status::Corruption("Inject corruption");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = db_secondary_->TryCatchUpWithPrimary();
+  ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBSecondaryTest, OpenWithTransactionDB) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+
+  // Destroy the DB to recreate as a TransactionDB.
+  Close();
+  Destroy(options, true);
+
+  // Create a TransactionDB.
+  TransactionDB* txn_db = nullptr;
+  TransactionDBOptions txn_db_opts;
+  ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
+  ASSERT_NE(txn_db, nullptr);
+  db_ = txn_db;
+
+  std::vector<std::string> cfs = {"new_CF"};
+  CreateColumnFamilies(cfs, options);
+  ASSERT_EQ(handles_.size(), 1);
+
+  WriteOptions wopts;
+  TransactionOptions txn_opts;
+  Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_NE(txn1, nullptr);
+  ASSERT_OK(txn1->Put(handles_[0], "k1", "v1"));
+  ASSERT_OK(txn1->Commit());
+  delete txn1;
+
+  options = CurrentOptions();
+  options.max_open_files = -1;
+  ASSERT_OK(TryOpenSecondary(options));
+}
+
+class DBSecondaryTestWithTimestamp : public DBSecondaryTestBase {
+ public:
+  explicit DBSecondaryTestWithTimestamp()
+      : DBSecondaryTestBase("db_secondary_test_with_timestamp") {}
+};
+TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+       IteratorAndGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+                    .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+       IteratorAndGetWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsInvalidArgument());
+  }
+
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    std::string value_from_get;
+    ASSERT_TRUE(
+        db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  auto get_value_and_check = [](DB* db, ReadOptions read_opts, Slice key,
+                                Slice expected_value, std::string expected_ts) {
+    std::string value_from_get;
+    std::string timestamp;
+    ASSERT_OK(db->Get(read_opts, key.ToString(), &value_from_get, &timestamp));
+    ASSERT_EQ(expected_value, value_from_get);
+    ASSERT_EQ(expected_ts, timestamp);
+  };
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = 0;
+    // Forward iterate.
+    for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+         it->Next(), ++count, ++key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+      get_value_and_check(db_, read_opts, it->key(), it->value(),
+                          write_timestamps[i]);
+    }
+    size_t expected_count = kMaxKey - start_keys[i] + 1;
+    ASSERT_EQ(expected_count, count);
+
+    // Backward iterate.
+    count = 0;
+    for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+         it->Prev(), ++count, --key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+      get_value_and_check(db_, read_opts, it->key(), it->value(),
+                          write_timestamps[i]);
+    }
+    ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+    // SeekToFirst()/SeekToLast() with lower/upper bounds.
+    // Then iter with lower and upper bounds.
+    uint64_t l = 0;
+    uint64_t r = kMaxKey + 1;
+    while (l < r) {
+      std::string lb_str = Key1(l);
+      Slice lb = lb_str;
+      std::string ub_str = Key1(r);
+      Slice ub = ub_str;
+      read_opts.iterate_lower_bound = &lb;
+      read_opts.iterate_upper_bound = &ub;
+      it.reset(db_->NewIterator(read_opts));
+      for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+           it->Valid(); it->Next(), ++key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+        get_value_and_check(db_, read_opts, it->key(), it->value(),
+                            write_timestamps[i]);
+      }
+      ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+      for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+           it->Valid(); it->Prev(), --key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+        get_value_and_check(db_, read_opts, it->key(), it->value(),
+                            write_timestamps[i]);
+      }
+      l += (kMaxKey / 100);
+      r -= (kMaxKey / 100);
+    }
+  }
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, IteratorsReadTimestampSizeMismatch) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  std::string different_size_read_timestamp;
+  PutFixed32(&different_size_read_timestamp, 2);
+  Slice different_size_read_ts = different_size_read_timestamp;
+  read_opts.timestamp = &different_size_read_ts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+       IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  const std::string read_timestamp = Timestamp(2, 0);
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+       IteratorsWriteWithTimestampReadWithoutTimestamp) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  {
+    std::vector<Iterator*> iters;
+    ASSERT_TRUE(
+        db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+            .IsInvalidArgument());
+  }
+
+  Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, Iterators) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::string write_timestamp = Timestamp(1, 0);
+  const std::string read_timestamp = Timestamp(2, 0);
+  WriteOptions write_opts;
+  for (uint64_t key = 0; key <= kMaxKey; ++key) {
+    Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+  }
+
+  // Reopen the database as secondary instance to test its timestamp support.
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReopenAsSecondary(options));
+
+  ReadOptions read_opts;
+  Slice read_ts = read_timestamp;
+  read_opts.timestamp = &read_ts;
+  std::vector<Iterator*> iters;
+  ASSERT_OK(db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters));
+  ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+
+  int count = 0;
+  uint64_t key = 0;
+  // Forward iterate.
+  for (iters[0]->Seek(Key1(0)), key = 0; iters[0]->Valid();
+       iters[0]->Next(), ++count, ++key) {
+    CheckIterUserEntry(iters[0], Key1(key), kTypeValue,
+                       "value" + std::to_string(key), write_timestamp);
+  }
+
+  size_t expected_count = kMaxKey - 0 + 1;
+  ASSERT_EQ(expected_count, count);
+  delete iters[0];
+
+  Close();
+}
+#endif  //! ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_sst_test.cc b/src/rocksdb/db/db_sst_test.cc
new file mode 100644
index 000000000..7f031444a
--- /dev/null
+++ b/src/rocksdb/db/db_sst_test.cc
@@ -0,0 +1,1868 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/table.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBSSTTest : public DBTestBase {
+ public:
+  DBSSTTest() : DBTestBase("db_sst_test", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (auto fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+  void ClearFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    flushed_files_.clear();
+  }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBSSTTest, DontDeletePendingOutputs) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  // Every time we write to a table file, call FOF/POF with full DB scan. This
+  // will make sure our pending_outputs_ protection work correctly
+  std::function<void()> purge_obsolete_files_function = [&]() {
+    JobContext job_context(0);
+    dbfull()->TEST_LockMutex();
+    dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
+    dbfull()->TEST_UnlockMutex();
+    dbfull()->PurgeObsoleteFiles(job_context);
+    job_context.Clean();
+  };
+
+  env_->table_write_callback_ = &purge_obsolete_files_function;
+
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_OK(Put("a", "begin"));
+    ASSERT_OK(Put("z", "end"));
+    ASSERT_OK(Flush());
+  }
+
+  // If pending output guard does not work correctly, PurgeObsoleteFiles() will
+  // delete the file that Compaction is trying to create, causing this: error
+  // db/db_test.cc:975: IO error:
+  // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory
+  Compact("a", "b");
+}
+
+// 1 Create some SST files by inserting K-V pairs into DB
+// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file
+// 3 Open DB and check if all key can be read
+TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.num_levels = 4;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_id = 0;
+  for (int i = 0; i < 10; ++i) {
+    GenerateNewFile(&rnd, &key_id, false);
+  }
+  ASSERT_OK(Flush());
+  Close();
+  int const num_files = GetSstFileCount(dbname_);
+  ASSERT_GT(num_files, 0);
+
+  Reopen(options);
+  std::vector<std::string> values;
+  values.reserve(key_id);
+  for (int k = 0; k < key_id; ++k) {
+    values.push_back(Get(Key(k)));
+  }
+  Close();
+
+  std::vector<std::string> filenames;
+  GetSstFiles(env_, dbname_, &filenames);
+  int num_ldb_files = 0;
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    if (i & 1) {
+      continue;
+    }
+    std::string const rdb_name = dbname_ + "/" + filenames[i];
+    std::string const ldb_name = Rocks2LevelTableFileName(rdb_name);
+    ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok());
+    ++num_ldb_files;
+  }
+  ASSERT_GT(num_ldb_files, 0);
+  ASSERT_EQ(num_files, GetSstFileCount(dbname_));
+
+  Reopen(options);
+  for (int k = 0; k < key_id; ++k) {
+    ASSERT_EQ(values[k], Get(Key(k)));
+  }
+  Destroy(options);
+}
+
+// Check that we don't crash when opening DB with
+// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
+TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
+  ASSERT_OK(Put("pika", "choo"));
+  ASSERT_OK(Flush());
+
+  // Just open the DB with the option set to true and check that we don't crash.
+  Options options;
+  options.env = env_;
+  options.skip_checking_sst_file_sizes_on_db_open = true;
+  Reopen(options);
+
+  ASSERT_EQ("choo", Get("pika"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBSSTTest, DontDeleteMovedFile) {
+  // This test triggers move compaction and verifies that the file is not
+  // deleted when it's part of move compaction
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+  options.level0_file_num_compaction_trigger =
+      2;  // trigger compaction when we have 2 files
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // Create two 1MB sst files
+  for (int i = 0; i < 2; ++i) {
+    // Create 1MB sst file
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  // this should execute both L0->L1 and L1->(move)->L2 compactions
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // If the moved file is actually deleted (the move-safeguard in
+  // ~Version::Version() is not there), we get this failure:
+  // Corruption: Can't access /000009.sst
+  Reopen(options);
+}
+
+// This reproduces a bug where we don't delete a file because when it was
+// supposed to be deleted, it was blocked by pending_outputs
+// Consider:
+// 1. current file_number is 13
+// 2. compaction (1) starts, blocks deletion of all files starting with 13
+// (pending outputs)
+// 3. file 13 is created by compaction (2)
+// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file
+// 13 has no references, it is put into VersionSet::obsolete_files_
+// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13
+// is deleted from obsolete_files_ set.
+// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by
+// pending outputs since compaction (1) is still running. It is not deleted and
+// it is not present in obsolete_files_ anymore. Therefore, we never delete it.
+TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 2 * 1024 * 1024;     // 2 MB
+  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+  options.level0_file_num_compaction_trigger =
+      2;  // trigger compaction when we have 2 files
+  options.max_background_flushes = 2;
+  options.max_background_compactions = 2;
+
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
+
+  Reopen(options);
+
+  Random rnd(301);
+  // Create two 1MB sst files
+  for (int i = 0; i < 2; ++i) {
+    // Create 1MB sst file
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  // this should execute both L0->L1 and L1->(move)->L2 compactions
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  test::SleepingBackgroundTask blocking_thread;
+  port::Mutex mutex_;
+  bool already_blocked(false);
+
+  // block the flush
+  std::function<void()> block_first_time = [&]() {
+    bool blocking = false;
+    {
+      MutexLock l(&mutex_);
+      if (!already_blocked) {
+        blocking = true;
+        already_blocked = true;
+      }
+    }
+    if (blocking) {
+      blocking_thread.DoSleep();
+    }
+  };
+  env_->table_write_callback_ = &block_first_time;
+  // Insert 2.5MB data, which should trigger a flush because we exceed
+  // write_buffer_size. The flush will be blocked with block_first_time
+  // pending_file is protecting all the files created after
+  for (int j = 0; j < 256; ++j) {
+    ASSERT_OK(Put(Key(j), rnd.RandomString(10 * 1024)));
+  }
+  blocking_thread.WaitUntilSleeping();
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr));
+
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+  auto file_on_L2 = metadata[0].name;
+  listener->SetExpectedFileName(dbname_ + file_on_L2);
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr,
+                                        true /* disallow trivial move */));
+  ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+
+  // finish the flush!
+  blocking_thread.WakeUp();
+  blocking_thread.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  // File just flushed is too big for L0 and L1 so gets moved to L2.
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0));
+
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 2U);
+
+  // This file should have been deleted during last compaction
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2));
+  listener->VerifyMatchedCount(1);
+}
+
+// Test that producing an empty .sst file does not write it out to
+// disk, and that the DeleteFile() env method is not called for
+// removing the non-existing file later.
+TEST_F(DBSSTTest, DeleteFileNotCalledForNotCreatedSSTFile) {
+  Options options = CurrentOptions();
+  options.env = env_;
+
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
+
+  Reopen(options);
+
+  // Flush the empty database.
+  ASSERT_OK(Flush());
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  // We expect no .sst files.
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 0U);
+
+  // We expect no file deletions.
+  listener->VerifyMatchedCount(0);
+}
+
+// Test that producing a non-empty .sst file does write it out to
+// disk, and that the DeleteFile() env method is not called for removing
+// the file later.
+TEST_F(DBSSTTest, DeleteFileNotCalledForCreatedSSTFile) {
+  Options options = CurrentOptions();
+  options.env = env_;
+
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
+
+  Reopen(options);
+
+  ASSERT_OK(Put("pika", "choo"));
+
+  // Flush the non-empty database.
+  ASSERT_OK(Flush());
+  ASSERT_EQ("1", FilesPerLevel(0));
+
+  // We expect 1 .sst files.
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+
+  // We expect no file deletions.
+  listener->VerifyMatchedCount(0);
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManager) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  int files_added = 0;
+  int files_deleted = 0;
+  int files_moved = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnAddFile", [&](void* /*arg*/) { files_added++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnDeleteFile",
+      [&](void* /*arg*/) { files_deleted++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 25; i++) {
+    GenerateNewRandomFile(&rnd);
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Verify that we are tracking all sst files in dbname_
+    std::unordered_map<std::string, uint64_t> files_in_db;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+    ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  std::unordered_map<std::string, uint64_t> files_in_db;
+  ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+  // Verify that we are tracking all sst files in dbname_
+  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  // Verify the total files size
+  uint64_t total_files_size = 0;
+  for (auto& file_to_size : files_in_db) {
+    total_files_size += file_to_size.second;
+  }
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+  // We flushed at least 25 files
+  ASSERT_GE(files_added, 25);
+  // Compaction must have deleted some files
+  ASSERT_GT(files_deleted, 0);
+  // No files were moved
+  ASSERT_EQ(files_moved, 0);
+
+  Close();
+  Reopen(options);
+  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+  // Verify that we track all the files again after the DB is closed and opened
+  Close();
+  sst_file_manager.reset(NewSstFileManager(env_));
+  options.sst_file_manager = sst_file_manager;
+  sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Reopen(options);
+  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  int files_added = 0;
+  int files_deleted = 0;
+  int files_moved = 0;
+  int files_scheduled_to_delete = 0;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (file_path->find(".blob") != std::string::npos) {
+          files_added++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (file_path->find(".blob") != std::string::npos) {
+          files_deleted++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+        assert(arg);
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (file_path->find(".blob") != std::string::npos) {
+          ++files_scheduled_to_delete;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.enable_blob_files = true;
+  options.blob_file_size = 32;  // create one blob per file
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("Key_" + std::to_string(i), "Value_" + std::to_string(i)));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    // Verify that we are tracking all sst and blob files in dbname_
+    std::unordered_map<std::string, uint64_t> files_in_db;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+    ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+    ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  }
+
+  std::vector<uint64_t> blob_files = GetBlobFileNumbers();
+  ASSERT_EQ(files_added, blob_files.size());
+  // No blob file is obsoleted.
+  ASSERT_EQ(files_deleted, 0);
+  ASSERT_EQ(files_scheduled_to_delete, 0);
+  // No files were moved.
+  ASSERT_EQ(files_moved, 0);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  std::unordered_map<std::string, uint64_t> files_in_db;
+  ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+  ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+
+  // Verify that we are tracking all sst and blob files in dbname_
+  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  // Verify the total files size
+  uint64_t total_files_size = 0;
+  for (auto& file_to_size : files_in_db) {
+    total_files_size += file_to_size.second;
+  }
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+  Close();
+
+  Reopen(options);
+  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+  // Verify that we track all the files again after the DB is closed and opened.
+  Close();
+
+  sst_file_manager.reset(NewSstFileManager(env_));
+  options.sst_file_manager = sst_file_manager;
+  sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Reopen(options);
+
+  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+  // Destroy DB and it will remove all the blob files from sst file manager and
+  // blob files deletion will go through ScheduleFileDeletion.
+  ASSERT_EQ(files_deleted, 0);
+  ASSERT_EQ(files_scheduled_to_delete, 0);
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options));
+  ASSERT_EQ(files_deleted, blob_files.size());
+  ASSERT_EQ(files_scheduled_to_delete, blob_files.size());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.enable_blob_files = true;
+  options.blob_file_size = 32;  // create one blob per file
+  options.disable_auto_compactions = true;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+
+  int files_added = 0;
+  int files_deleted = 0;
+  int files_moved = 0;
+  int files_scheduled_to_delete = 0;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (file_path->find(".blob") != std::string::npos) {
+          files_added++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (file_path->find(".blob") != std::string::npos) {
+          files_deleted++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+        assert(arg);
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (file_path->find(".blob") != std::string::npos) {
+          ++files_scheduled_to_delete;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "first_value";
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "second_value";
+
+  ASSERT_OK(Put(first_key, first_value));
+  ASSERT_OK(Put(second_key, second_value));
+  ASSERT_OK(Flush());
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "third_value";
+  constexpr char fourth_key[] = "fourth_key";
+  constexpr char fourth_value[] = "fourth_value";
+  constexpr char fifth_key[] = "fifth_key";
+  constexpr char fifth_value[] = "fifth_value";
+
+  ASSERT_OK(Put(third_key, third_value));
+  ASSERT_OK(Put(fourth_key, fourth_value));
+  ASSERT_OK(Put(fifth_key, fifth_value));
+  ASSERT_OK(Flush());
+
+  const std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+
+  ASSERT_EQ(original_blob_files.size(), 5);
+  ASSERT_EQ(files_added, 5);
+  ASSERT_EQ(files_deleted, 0);
+  ASSERT_EQ(files_scheduled_to_delete, 0);
+  ASSERT_EQ(files_moved, 0);
+  {
+    // Verify that we are tracking all sst and blob files in dbname_
+    std::unordered_map<std::string, uint64_t> files_in_db;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+    ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+    ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  }
+
+  const size_t cutoff_index = static_cast<size_t>(
+      options.blob_garbage_collection_age_cutoff * original_blob_files.size());
+
+  size_t expected_number_of_files = original_blob_files.size();
+  // Note: turning off enable_blob_files before the compaction results in
+  // garbage collected values getting inlined.
+  ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+  expected_number_of_files -= cutoff_index;
+  files_added = 0;
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  sfm->WaitForEmptyTrash();
+
+  ASSERT_EQ(Get(first_key), first_value);
+  ASSERT_EQ(Get(second_key), second_value);
+  ASSERT_EQ(Get(third_key), third_value);
+  ASSERT_EQ(Get(fourth_key), fourth_value);
+  ASSERT_EQ(Get(fifth_key), fifth_value);
+
+  const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+  ASSERT_EQ(new_blob_files.size(), expected_number_of_files);
+  // No new file is added.
+  ASSERT_EQ(files_added, 0);
+  ASSERT_EQ(files_deleted, cutoff_index);
+  ASSERT_EQ(files_scheduled_to_delete, cutoff_index);
+  ASSERT_EQ(files_moved, 0);
+
+  // Original blob files below the cutoff should be gone, original blob files at
+  // or above the cutoff should be still there
+  for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+    ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+  }
+
+  {
+    // Verify that we are tracking all sst and blob files in dbname_
+    std::unordered_map<std::string, uint64_t> files_in_db;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+    ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+    ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+  }
+
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options));
+  sfm->WaitForEmptyTrash();
+  ASSERT_EQ(files_deleted, 5);
+  ASSERT_EQ(files_scheduled_to_delete, 5);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class DBSSTTestRateLimit : public DBSSTTest,
+                           public ::testing::WithParamInterface<bool> {
+ public:
+  DBSSTTestRateLimit() : DBSSTTest() {}
+  ~DBSSTTestRateLimit() override {}
+};
+
+TEST_P(DBSSTTestRateLimit, RateLimitedDelete) {
+  Destroy(last_options_);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBSSTTest::RateLimitedDelete:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+
+  std::vector<uint64_t> penalties;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+        // Turn timed wait into a simulated sleep
+        uint64_t* abs_time_us = static_cast<uint64_t*>(arg);
+        uint64_t cur_time = env_->NowMicros();
+        if (*abs_time_us > cur_time) {
+          env_->MockSleepForMicroseconds(*abs_time_us - cur_time);
+        }
+
+        // Plus an additional short, random amount
+        env_->MockSleepForMicroseconds(Random::GetTLSInstance()->Uniform(10));
+
+        // Set wait until time to before (actual) current time to force not
+        // to sleep
+        *abs_time_us = Env::Default()->NowMicros();
+      });
+
+  // Disable PeriodicTaskScheduler as it also has TimedWait, which could update
+  // the simulated sleep time
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::StartPeriodicTaskScheduler:DisableScheduler", [&](void* arg) {
+        bool* disable_scheduler = static_cast<bool*>(arg);
+        *disable_scheduler = true;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  bool different_wal_dir = GetParam();
+  Options options = CurrentOptions();
+  SetTimeElapseOnlySleepOnReopen(&options);
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  options.statistics = CreateDBStatistics();
+  if (different_wal_dir) {
+    options.wal_dir = alternative_wal_dir_;
+  }
+
+  int64_t rate_bytes_per_sec = 1024 * 10;  // 10 Kbs / Sec
+  Status s;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+  ASSERT_OK(s);
+  options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
+
+  WriteOptions wo;
+  if (!different_wal_dir) {
+    wo.disableWAL = true;
+  }
+  Reopen(options);
+  // Create 4 files in L0
+  for (char v = 'a'; v <= 'd'; v++) {
+    ASSERT_OK(Put("Key2", DummyString(1024, v), wo));
+    ASSERT_OK(Put("Key3", DummyString(1024, v), wo));
+    ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+    ASSERT_OK(Put("Key1", DummyString(1024, v), wo));
+    ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  uint64_t delete_start_time = env_->NowMicros();
+  // Hold BackgroundEmptyTrash
+  TEST_SYNC_POINT("DBSSTTest::RateLimitedDelete:1");
+  sfm->WaitForEmptyTrash();
+  uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+  uint64_t total_files_size = 0;
+  uint64_t expected_penlty = 0;
+  ASSERT_EQ(penalties.size(), metadata.size());
+  for (size_t i = 0; i < metadata.size(); i++) {
+    total_files_size += metadata[i].size;
+    expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec);
+    ASSERT_EQ(expected_penlty, penalties[i]);
+  }
+  ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+  ASSERT_LT(time_spent_deleting, expected_penlty * 1.1);
+  ASSERT_EQ(4, options.statistics->getAndResetTickerCount(FILES_MARKED_TRASH));
+  ASSERT_EQ(
+      0, options.statistics->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(RateLimitedDelete, DBSSTTestRateLimit,
+                        ::testing::Bool());
+
+TEST_F(DBSSTTest, RateLimitedWALDelete) {
+  Destroy(last_options_);
+
+  std::vector<uint64_t> penalties;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = env_;
+
+  int64_t rate_bytes_per_sec = 1024 * 10;  // 10 Kbs / Sec
+  Status s;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+  ASSERT_OK(s);
+  options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+  SetTimeElapseOnlySleepOnReopen(&options);
+
+  ASSERT_OK(TryReopen(options));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Create 4 files in L0
+  for (char v = 'a'; v <= 'd'; v++) {
+    ASSERT_OK(Put("Key2", DummyString(1024, v)));
+    ASSERT_OK(Put("Key3", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Put("Key1", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+
+  // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  sfm->WaitForEmptyTrash();
+  ASSERT_EQ(penalties.size(), 8);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class DBWALTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+  explicit DBWALTestWithParam()
+      : DBTestBase("db_wal_test_with_params", /*env_do_fsync=*/true) {
+    wal_dir_ = std::get<0>(GetParam());
+    wal_dir_same_as_dbname_ = std::get<1>(GetParam());
+  }
+
+  std::string wal_dir_;
+  bool wal_dir_same_as_dbname_;
+};
+
+TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
+  class MyEnv : public EnvWrapper {
+   public:
+    MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {}
+    const char* Name() const override { return "MyEnv"; }
+    Status DeleteFile(const std::string& fname) override {
+      if (fname.find(".log.trash") != std::string::npos && fake_log_delete) {
+        return Status::OK();
+      }
+
+      return target()->DeleteFile(fname);
+    }
+
+    void set_fake_log_delete(bool fake) { fake_log_delete = fake; }
+
+   private:
+    bool fake_log_delete;
+  };
+
+  std::unique_ptr<MyEnv> env(new MyEnv(env_));
+  Destroy(last_options_);
+
+  env->set_fake_log_delete(true);
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  options.wal_dir = dbname_ + wal_dir_;
+
+  int64_t rate_bytes_per_sec = 1024 * 10;  // 10 Kbs / Sec
+  Status s;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+  ASSERT_OK(s);
+  options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+
+  Reopen(options);
+
+  // Create 4 files in L0
+  for (char v = 'a'; v <= 'd'; v++) {
+    if (v == 'c') {
+      // Maximize the change that the last log file will be preserved in trash
+      // before restarting the DB.
+      // We have to set this on the 2nd to last file for it to delay deletion
+      // on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash())
+      options.sst_file_manager->SetDeleteRateBytesPerSecond(1);
+    }
+    ASSERT_OK(Put("Key2", DummyString(1024, v)));
+    ASSERT_OK(Put("Key3", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Put("Key1", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+
+  Close();
+
+  options.sst_file_manager.reset();
+  std::vector<std::string> filenames;
+  int trash_log_count = 0;
+  if (!wal_dir_same_as_dbname_) {
+    // Forcibly create some trash log files
+    std::unique_ptr<WritableFile> result;
+    ASSERT_OK(env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result,
+                                   EnvOptions()));
+    result.reset();
+  }
+  ASSERT_OK(env->GetChildren(options.wal_dir, &filenames));
+  for (const std::string& fname : filenames) {
+    if (fname.find(".log.trash") != std::string::npos) {
+      trash_log_count++;
+    }
+  }
+  ASSERT_GE(trash_log_count, 1);
+
+  env->set_fake_log_delete(false);
+  Reopen(options);
+
+  filenames.clear();
+  trash_log_count = 0;
+  ASSERT_OK(env->GetChildren(options.wal_dir, &filenames));
+  for (const std::string& fname : filenames) {
+    if (fname.find(".log.trash") != std::string::npos) {
+      trash_log_count++;
+    }
+  }
+  ASSERT_EQ(trash_log_count, 0);
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam,
+                        ::testing::Values(std::make_tuple("", true),
+                                          std::make_tuple("_wal_dir", false)));
+
+TEST_F(DBSSTTest, OpenDBWithExistingTrash) {
+  Options options = CurrentOptions();
+
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */));
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+  Destroy(last_options_);
+
+  // Add some trash files to the db directory so the DB can clean them up
+  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+  ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash"));
+  ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash"));
+  ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash"));
+
+  // Reopen the DB and verify that it deletes existing trash files
+  Reopen(options);
+  sfm->WaitForEmptyTrash();
+  ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash"));
+  ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash"));
+  ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash"));
+}
+
+// Create a DB with 2 db_paths, and generate multiple files in the 2
+// db_paths using CompactRangeOptions, make sure that files that were
+// deleted from first db_path were deleted using DeleteScheduler and
+// files in the second path were not.
+TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
+  std::atomic<int> bg_delete_file(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  // The deletion scheduler sometimes skips marking file as trash according to
+  // a heuristic. In that case the deletion will go through the below SyncPoint.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { bg_delete_file++; });
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.db_paths.emplace_back(dbname_, 1024 * 100);
+  options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100);
+  options.env = env_;
+
+  int64_t rate_bytes_per_sec = 1024 * 1024;  // 1 Mb / Sec
+  Status s;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", rate_bytes_per_sec, false, &s,
+                        /* max_trash_db_ratio= */ 1.1));
+
+  ASSERT_OK(s);
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+  DestroyAndReopen(options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  // Create 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'A'), wo));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+  // Compaction will delete files from L0 in first db path and generate a new
+  // file in L1 in second db path
+  CompactRangeOptions compact_options;
+  compact_options.target_path_id = 1;
+  Slice begin("Key0");
+  Slice end("Key3");
+  ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  // Create 4 files in L0
+  for (int i = 4; i < 8; i++) {
+    ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'B'), wo));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("4,1", FilesPerLevel(0));
+
+  // Compaction will delete files from L0 in first db path and generate a new
+  // file in L1 in second db path
+  begin = "Key4";
+  end = "Key7";
+  ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  ASSERT_EQ("0,2", FilesPerLevel(0));
+
+  sfm->WaitForEmptyTrash();
+  ASSERT_EQ(bg_delete_file, 8);
+
+  // Compaction will delete both files and regenerate a file in L1 in second
+  // db path. The deleted files should still be cleaned up via delete scheduler.
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  sfm->WaitForEmptyTrash();
+  ASSERT_EQ(bg_delete_file, 10);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) {
+  int bg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+  ASSERT_OK(s);
+  DestroyAndReopen(options);
+
+  // Create 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'A')));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+
+  // Close DB and destroy it using DeleteScheduler
+  Close();
+
+  int num_sst_files = 0;
+  int num_wal_files = 0;
+  std::vector<std::string> db_files;
+  ASSERT_OK(env_->GetChildren(dbname_, &db_files));
+  for (std::string f : db_files) {
+    if (f.substr(f.find_last_of(".") + 1) == "sst") {
+      num_sst_files++;
+    } else if (f.substr(f.find_last_of(".") + 1) == "log") {
+      num_wal_files++;
+    }
+  }
+  ASSERT_GT(num_sst_files, 0);
+  ASSERT_GT(num_wal_files, 0);
+
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+  sfm->SetDeleteRateBytesPerSecond(1024 * 1024);
+  // Set an extra high trash ratio to prevent immediate/non-rate limited
+  // deletions
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(1000.0);
+  ASSERT_OK(DestroyDB(dbname_, options));
+  sfm->WaitForEmptyTrash();
+  ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files);
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  // Generate a file containing 100 keys.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+
+  uint64_t first_file_size = 0;
+  std::unordered_map<std::string, uint64_t> files_in_db;
+  ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &first_file_size));
+  ASSERT_EQ(sfm->GetTotalSize(), first_file_size);
+
+  // Set the maximum allowed space usage to the current total size
+  sfm->SetMaxAllowedSpaceUsage(first_file_size + 1);
+
+  ASSERT_OK(Put("key1", "val1"));
+  // This flush will cause bg_error_ and will fail
+  ASSERT_NOK(Flush());
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  // Generate a file containing keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+
+  uint64_t files_size = 0;
+  uint64_t total_files_size = 0;
+  std::unordered_map<std::string, uint64_t> files_in_db;
+
+  ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db, &files_size));
+  // Make sure blob files are considered by SSTFileManage in size limits.
+  ASSERT_GT(files_size, 0);
+  total_files_size = files_size;
+  ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &files_size));
+  total_files_size += files_size;
+  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+  // Set the maximum allowed space usage to the current total size.
+  sfm->SetMaxAllowedSpaceUsage(total_files_size + 1);
+
+  bool max_allowed_space_reached = false;
+  bool delete_blob_file = false;
+  // Sync point called after blob file is closed and max allowed space is
+  // checked.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached",
+      [&](void* /*arg*/) { max_allowed_space_reached = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable::AfterDeleteFile",
+      [&](void* /*arg*/) { delete_blob_file = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {
+          "BuildTable::AfterDeleteFile",
+          "DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1",
+      },
+  });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("key1", "val1"));
+  // This flush will fail
+  ASSERT_NOK(Flush());
+  ASSERT_TRUE(max_allowed_space_reached);
+
+  TEST_SYNC_POINT("DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1");
+  ASSERT_TRUE(delete_blob_file);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingCompactionsWorks) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.level0_file_num_compaction_trigger = 2;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  int completed_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* /*arg*/) {
+        sfm->SetMaxAllowedSpaceUsage(0);
+        ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun",
+      [&](void* /*arg*/) { completed_compactions++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+
+  // Generate a file containing 10 keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t total_file_size = 0;
+  std::unordered_map<std::string, uint64_t> files_in_db;
+  ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size));
+  // Set the maximum allowed space usage to the current total size
+  sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+  // Generate another file to trigger compaction.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // Because we set a callback in CancelledCompaction, we actually
+  // let the compaction run
+  ASSERT_GT(completed_compactions, 0);
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+  // Make sure the stat is bumped
+  ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            0);
+  ASSERT_EQ(0,
+            dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                FILES_MARKED_TRASH));
+  ASSERT_EQ(4,
+            dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                FILES_DELETED_IMMEDIATELY));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingManualCompactionsWorks) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.statistics = CreateDBStatistics();
+
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  // Generate a file containing 10 keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t total_file_size = 0;
+  std::unordered_map<std::string, uint64_t> files_in_db;
+  ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size));
+  // Set the maximum allowed space usage to the current total size
+  sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+  // Generate another file to trigger compaction.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+
+  // OK, now trigger a manual compaction
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+                  .IsCompactionTooLarge());
+
+  // Wait for manual compaction to get scheduled and finish
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+  // Make sure the stat is bumped
+  ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            1);
+
+  // Now make sure CompactFiles also gets cancelled
+  auto l0_files = collector->GetFlushedFiles();
+  ASSERT_TRUE(
+      dbfull()
+          ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0)
+          .IsCompactionTooLarge());
+
+  // Wait for manual compaction to get scheduled and finish
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            2);
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+
+  // Now let the flush through and make sure GetCompactionsReservedSize
+  // returns to normal
+  sfm->SetMaxAllowedSpaceUsage(0);
+  int completed_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+                                   l0_files, 0));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+  ASSERT_GT(completed_compactions, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) {
+  // This test will set a maximum allowed space for the DB, then it will
+  // keep filling the DB until the limit is reached and bg_error_ is set.
+  // When bg_error_ is set we will verify that the DB size is greater
+  // than the limit.
+
+  std::vector<int> max_space_limits_mbs = {1, 10};
+  std::atomic<bool> bg_error_set(false);
+
+  std::atomic<int> reached_max_space_on_flush(0);
+  std::atomic<int> reached_max_space_on_compaction(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+      [&](void* arg) {
+        Status* bg_error = static_cast<Status*>(arg);
+        bg_error_set = true;
+        reached_max_space_on_flush++;
+        // clear error to ensure compaction callback is called
+        *bg_error = Status::OK();
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) {
+        bool* enough_room = static_cast<bool*>(arg);
+        *enough_room = true;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached",
+      [&](void* /*arg*/) {
+        bg_error_set = true;
+        reached_max_space_on_compaction++;
+      });
+
+  for (auto limit_mb : max_space_limits_mbs) {
+    bg_error_set = false;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+    auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+    Options options = CurrentOptions();
+    options.sst_file_manager = sst_file_manager;
+    options.write_buffer_size = 1024 * 512;  // 512 Kb
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024);
+
+    // It is easy to detect if the test is stuck in a loop. No need for
+    // complex termination logic.
+    while (true) {
+      auto s = Put(rnd.RandomString(10), rnd.RandomString(50));
+      if (!s.ok()) {
+        break;
+      }
+    }
+    ASSERT_TRUE(bg_error_set);
+    uint64_t total_sst_files_size = 0;
+    std::unordered_map<std::string, uint64_t> files_in_db;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_sst_files_size));
+    ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+
+  ASSERT_GT(reached_max_space_on_flush, 0);
+  ASSERT_GT(reached_max_space_on_compaction, 0);
+}
+
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) {
+  // Open DB with infinite max open files
+  //  - First iteration use 1 thread to open files
+  //  - Second iteration use 5 threads to open files
+  for (int iter = 0; iter < 2; iter++) {
+    Options options;
+    options.create_if_missing = true;
+    options.write_buffer_size = 100000;
+    options.disable_auto_compactions = true;
+    options.max_open_files = -1;
+    if (iter == 0) {
+      options.max_file_opening_threads = 1;
+    } else {
+      options.max_file_opening_threads = 5;
+    }
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    // Create 12 Files in L0 (then move then to L2)
+    for (int i = 0; i < 12; i++) {
+      std::string k = "L2_" + Key(i);
+      ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+      ASSERT_OK(Flush());
+    }
+    CompactRangeOptions compact_options;
+    compact_options.change_level = true;
+    compact_options.target_level = 2;
+    ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+    // Create 12 Files in L0
+    for (int i = 0; i < 12; i++) {
+      std::string k = "L0_" + Key(i);
+      ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+      ASSERT_OK(Flush());
+    }
+    Close();
+
+    // Reopening the DB will load all existing files
+    Reopen(options);
+    ASSERT_EQ("12,0,12", FilesPerLevel(0));
+    std::vector<std::vector<FileMetaData>> files;
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+    for (const auto& level : files) {
+      for (const auto& file : level) {
+        ASSERT_TRUE(file.table_reader_handle != nullptr);
+      }
+    }
+
+    for (int i = 0; i < 12; i++) {
+      ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a'));
+      ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a'));
+    }
+  }
+}
+
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) {
+  for (CacheEntryRoleOptions::Decision charge_table_reader :
+       {CacheEntryRoleOptions::Decision::kEnabled,
+        CacheEntryRoleOptions::Decision::kDisabled}) {
+    // Open DB with infinite max open files
+    //  - First iteration use 1 thread to open files
+    //  - Second iteration use 5 threads to open files
+    for (int iter = 0; iter < 2; iter++) {
+      Options options;
+      options.create_if_missing = true;
+      options.write_buffer_size = 100000;
+      options.disable_auto_compactions = true;
+      options.max_open_files = -1;
+
+      BlockBasedTableOptions table_options;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      if (iter == 0) {
+        options.max_file_opening_threads = 1;
+      } else {
+        options.max_file_opening_threads = 5;
+      }
+
+      DestroyAndReopen(options);
+
+      // Create 5 Files in L0 (then move then to L2)
+      for (int i = 0; i < 5; i++) {
+        std::string k = "L2_" + Key(i);
+        ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+        ASSERT_OK(Flush()) << i;
+      }
+      CompactRangeOptions compact_options;
+      compact_options.change_level = true;
+      compact_options.target_level = 2;
+      ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+      // Create 5 Files in L0
+      for (int i = 0; i < 5; i++) {
+        std::string k = "L0_" + Key(i);
+        ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+        ASSERT_OK(Flush());
+      }
+      Close();
+
+      table_options.cache_usage_options.options_overrides.insert(
+          {CacheEntryRole::kBlockBasedTableReader,
+           {/*.charged = */ charge_table_reader}});
+      table_options.block_cache =
+          NewLRUCache(1024 /* capacity */, 0 /* num_shard_bits */,
+                      true /* strict_capacity_limit */);
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+      // Reopening the DB will try to load all existing files, conditionally
+      // subject to memory limit
+      Status s = TryReopen(options);
+
+      if (charge_table_reader == CacheEntryRoleOptions::Decision::kEnabled) {
+        EXPECT_TRUE(s.IsMemoryLimit());
+        EXPECT_TRUE(s.ToString().find(
+                        kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+                            CacheEntryRole::kBlockBasedTableReader)]) !=
+                    std::string::npos);
+        EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
+                    std::string::npos);
+
+      } else {
+        EXPECT_TRUE(s.ok());
+        ASSERT_EQ("5,0,5", FilesPerLevel(0));
+      }
+    }
+  }
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSize) {
+  // We don't propagate oldest-key-time table property on compaction and
+  // just write 0 as default value. This affect the exact table size, since
+  // we encode table properties as varint64. Force time to be 0 to work around
+  // it. Should remove the workaround after we propagate the property on
+  // compaction.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:oldest_ancester_time", [&](void* arg) {
+        uint64_t* current_time = static_cast<uint64_t*>(arg);
+        *current_time = 0;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 10; j++) {
+      std::string val = "val_file_" + std::to_string(i);
+      ASSERT_OK(Put(Key(j), val));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("5", FilesPerLevel(0));
+
+  std::vector<LiveFileMetaData> live_files_meta;
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+  uint64_t single_file_size = live_files_meta[0].size;
+
+  uint64_t live_sst_files_size = 0;
+  uint64_t total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 5
+  // Total SST files = 5
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  // hold current version
+  std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+  ASSERT_OK(iter1->status());
+
+  // Compact 5 files into 1 file in L0
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 1);
+
+  live_sst_files_size = 0;
+  total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 1 (compacted file)
+  // Total SST files = 6 (5 original files + compacted file)
+  ASSERT_EQ(live_sst_files_size, 1 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+  // hold current version
+  std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+  ASSERT_OK(iter2->status());
+
+  // Delete all keys and compact, this will delete all live files
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 0);
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 6 (5 original files + compacted file)
+  ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+  ASSERT_OK(iter1->status());
+  iter1.reset();
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 1 (compacted file)
+  ASSERT_EQ(total_sst_files_size, 1 * single_file_size);
+
+  ASSERT_OK(iter2->status());
+  iter2.reset();
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 0
+  ASSERT_EQ(total_sst_files_size, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, OpenDBWithoutGetFileSizeInvocations) {
+  Options options = CurrentOptions();
+  std::unique_ptr<MockEnv> env{MockEnv::Create(Env::Default())};
+  options.env = env.get();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.enable_blob_files = true;
+  options.blob_file_size = 32;  // create one blob per file
+  options.skip_checking_sst_file_sizes_on_db_open = true;
+
+  DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 10; j++) {
+      std::string val = "val_file_" + std::to_string(i);
+      ASSERT_OK(Put(Key(j), val));
+    }
+    ASSERT_OK(Flush());
+  }
+  Close();
+
+  bool is_get_file_size_called = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "MockFileSystem::GetFileSize:CheckFileType", [&](void* arg) {
+        std::string* filename = reinterpret_cast<std::string*>(arg);
+        if (filename->find(".blob") != std::string::npos) {
+          is_get_file_size_called = true;
+        }
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+  ASSERT_FALSE(is_get_file_size_called);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Destroy(options);
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Put(Key(i), "val"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("5", FilesPerLevel(0));
+
+  std::vector<LiveFileMetaData> live_files_meta;
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+  uint64_t single_file_size = live_files_meta[0].size;
+
+  uint64_t live_sst_files_size = 0;
+  uint64_t total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+
+  // Live SST files = 5
+  // Total SST files = 5
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  // hold current version
+  std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+  ASSERT_OK(iter1->status());
+
+  // Compaction will do trivial move from L0 to L1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,5", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+
+  live_sst_files_size = 0;
+  total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 5
+  // Total SST files = 5 (used in 2 version)
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  // hold current version
+  std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+  ASSERT_OK(iter2->status());
+
+  // Delete all keys and compact, this will delete all live files
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 0);
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 5 (used in 2 version)
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  ASSERT_OK(iter1->status());
+  iter1.reset();
+  ASSERT_OK(iter2->status());
+  iter2.reset();
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 0
+  ASSERT_EQ(total_sst_files_size, 0);
+}
+
+// This test if blob files are recorded by SST File Manager when Compaction job
+// creates/delete them and in case of AtomicFlush.
+TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.disable_auto_compactions = true;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+  options.atomic_flush = true;
+
+  int files_added = 0;
+  int files_deleted = 0;
+  int files_scheduled_to_delete = 0;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (EndsWith(*file_path, ".blob")) {
+          files_added++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (EndsWith(*file_path, ".blob")) {
+          files_deleted++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+        assert(arg);
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
+        if (EndsWith(*file_path, ".blob")) {
+          ++files_scheduled_to_delete;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  ASSERT_OK(Put("key_1", "value_1"));
+  ASSERT_OK(Put("key_2", "value_2"));
+  ASSERT_OK(Put("key_3", "value_3"));
+  ASSERT_OK(Put("key_4", "value_4"));
+  ASSERT_OK(Flush());
+
+  // Overwrite will create the garbage data.
+  ASSERT_OK(Put("key_3", "new_value_3"));
+  ASSERT_OK(Put("key_4", "new_value_4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(files_added, 3);
+  ASSERT_EQ(files_deleted, 0);
+  ASSERT_EQ(files_scheduled_to_delete, 0);
+  files_added = 0;
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+  // Compaction job will create a new file and delete the older files.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(files_added, 1);
+  ASSERT_EQ(files_scheduled_to_delete, 1);
+
+  sfm->WaitForEmptyTrash();
+
+  ASSERT_EQ(files_deleted, 1);
+
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options));
+
+  ASSERT_EQ(files_scheduled_to_delete, 4);
+
+  sfm->WaitForEmptyTrash();
+
+  ASSERT_EQ(files_deleted, 4);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_statistics_test.cc b/src/rocksdb/db/db_statistics_test.cc
new file mode 100644
index 000000000..4d4655361
--- /dev/null
+++ b/src/rocksdb/db/db_statistics_test.cc
@@ -0,0 +1,215 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+
+#include "db/db_test_util.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/statistics.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBStatisticsTest : public DBTestBase {
+ public:
+  DBStatisticsTest()
+      : DBTestBase("db_statistics_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBStatisticsTest, CompressionStatsTest) {
+  CompressionType type;
+
+  if (Snappy_Supported()) {
+    type = kSnappyCompression;
+    fprintf(stderr, "using snappy\n");
+  } else if (Zlib_Supported()) {
+    type = kZlibCompression;
+    fprintf(stderr, "using zlib\n");
+  } else if (BZip2_Supported()) {
+    type = kBZip2Compression;
+    fprintf(stderr, "using bzip2\n");
+  } else if (LZ4_Supported()) {
+    type = kLZ4Compression;
+    fprintf(stderr, "using lz4\n");
+  } else if (XPRESS_Supported()) {
+    type = kXpressCompression;
+    fprintf(stderr, "using xpress\n");
+  } else if (ZSTD_Supported()) {
+    type = kZSTD;
+    fprintf(stderr, "using ZSTD\n");
+  } else {
+    fprintf(stderr, "skipping test, compression disabled\n");
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.compression = type;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+  DestroyAndReopen(options);
+
+  int kNumKeysWritten = 100000;
+
+  // Check that compressions occur and are counted when compression is turned on
+  Random rnd(301);
+  for (int i = 0; i < kNumKeysWritten; ++i) {
+    // compressible string
+    ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0);
+
+  for (int i = 0; i < kNumKeysWritten; ++i) {
+    auto r = Get(Key(i));
+  }
+  ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED), 0);
+
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+  uint64_t currentCompressions =
+      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+  uint64_t currentDecompressions =
+      options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED);
+
+  // Check that compressions do not occur when turned off
+  for (int i = 0; i < kNumKeysWritten; ++i) {
+    // compressible string
+    ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) -
+                currentCompressions,
+            0);
+
+  for (int i = 0; i < kNumKeysWritten; ++i) {
+    auto r = Get(Key(i));
+  }
+  ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED) -
+                currentDecompressions,
+            0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  const uint64_t kMutexWaitDelay = 100;
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+                                       kMutexWaitDelay);
+  ASSERT_OK(Put("hello", "rocksdb"));
+  ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0);
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStats) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  const uint64_t kMutexWaitDelay = 100;
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+                                       kMutexWaitDelay);
+  ASSERT_OK(Put("hello", "rocksdb"));
+  ASSERT_GE(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay);
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, ResetStats) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  for (int i = 0; i < 2; ++i) {
+    // pick arbitrary ticker and histogram. On first iteration they're zero
+    // because db is unused. On second iteration they're zero due to Reset().
+    ASSERT_EQ(0, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+    HistogramData histogram_data;
+    options.statistics->histogramData(DB_WRITE, &histogram_data);
+    ASSERT_EQ(0.0, histogram_data.max);
+
+    if (i == 0) {
+      // The Put() makes some of the ticker/histogram stats nonzero until we
+      // Reset().
+      ASSERT_OK(Put("hello", "rocksdb"));
+      ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+      options.statistics->histogramData(DB_WRITE, &histogram_data);
+      ASSERT_GT(histogram_data.max, 0.0);
+      ASSERT_OK(options.statistics->Reset());
+    }
+  }
+}
+
+TEST_F(DBStatisticsTest, ExcludeTickers) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  options.statistics->set_stats_level(StatsLevel::kExceptTickers);
+  ASSERT_OK(Put("foo", "value"));
+  ASSERT_EQ(0, options.statistics->getTickerCount(BYTES_WRITTEN));
+  options.statistics->set_stats_level(StatsLevel::kExceptHistogramOrTimers);
+  Reopen(options);
+  ASSERT_EQ("value", Get("foo"));
+  ASSERT_GT(options.statistics->getTickerCount(BYTES_READ), 0);
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBStatisticsTest, VerifyChecksumReadStat) {
+  Options options = CurrentOptions();
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  Reopen(options);
+
+  // Expected to be populated regardless of `PerfLevel` in user thread
+  SetPerfLevel(kDisable);
+
+  {
+    // Scenario 0: only WAL data. Not verified so require ticker to be zero.
+    ASSERT_OK(Put("foo", "value"));
+    ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+    ASSERT_OK(db_->VerifyChecksum());
+    ASSERT_EQ(0,
+              options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES));
+  }
+
+  // Create one SST.
+  ASSERT_OK(Flush());
+  std::unordered_map<std::string, uint64_t> table_files;
+  uint64_t table_files_size = 0;
+  GetAllDataFiles(kTableFile, &table_files, &table_files_size);
+
+  {
+    // Scenario 1: Table verified in `VerifyFileChecksums()`. This should read
+    // the whole file so we require the ticker stat exactly matches the file
+    // size.
+    ASSERT_OK(options.statistics->Reset());
+    ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+    ASSERT_EQ(table_files_size,
+              options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES));
+  }
+
+  {
+    // Scenario 2: Table verified in `VerifyChecksum()`. This opens a
+    // `TableReader` to verify each block. It can involve duplicate reads of the
+    // same data so we set a lower-bound only.
+    ASSERT_OK(options.statistics->Reset());
+    ASSERT_OK(db_->VerifyChecksum());
+    ASSERT_GE(options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES),
+              table_files_size);
+  }
+}
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_table_properties_test.cc b/src/rocksdb/db/db_table_properties_test.cc
new file mode 100644
index 000000000..981a514ad
--- /dev/null
+++ b/src/rocksdb/db/db_table_properties_test.cc
@@ -0,0 +1,625 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_properties_internal.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper function that ensures the table properties returned in
+// `GetPropertiesOfAllTablesTest` is correct.
+// This test assumes entries size is different for each of the tables.
+namespace {
+
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+  TablePropertiesCollection props;
+  ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+
+  ASSERT_EQ(4U, props.size());
+  std::unordered_set<uint64_t> unique_entries;
+
+  // Indirect test
+  uint64_t sum = 0;
+  for (const auto& item : props) {
+    unique_entries.insert(item.second->num_entries);
+    sum += item.second->num_entries;
+  }
+
+  ASSERT_EQ(props.size(), unique_entries.size());
+  ASSERT_EQ(expected_entries_size, sum);
+
+  VerifySstUniqueIds(props);
+}
+}  // anonymous namespace
+
+class DBTablePropertiesTest : public DBTestBase,
+                              public testing::WithParamInterface<std::string> {
+ public:
+  DBTablePropertiesTest()
+      : DBTestBase("db_table_properties_test", /*env_do_fsync=*/false) {}
+  TablePropertiesCollection TestGetPropertiesOfTablesInRange(
+      std::vector<Range> ranges, std::size_t* num_properties = nullptr,
+      std::size_t* num_files = nullptr);
+};
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 8;
+  // Part of strategy to prevent pinning table files
+  options.max_open_files = 42;
+  Reopen(options);
+
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    // Use old meta name for table properties for one file
+    if (table == 3) {
+      SyncPoint::GetInstance()->SetCallBack(
+          "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) {
+            *reinterpret_cast<const std::string**>(meta) =
+                &kPropertiesBlockOldName;
+          });
+      SyncPoint::GetInstance()->EnableProcessing();
+    }
+    // Build file
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(
+          db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  std::string original_session_id;
+  ASSERT_OK(db_->GetDbSessionId(original_session_id));
+
+  // Part of strategy to prevent pinning table files
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionEditHandler::LoadTables:skip_load_table_files",
+      [&](void* skip_load) { *reinterpret_cast<bool*>(skip_load) = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // 1. Read table properties directly from file
+  Reopen(options);
+  // Clear out auto-opened files
+  dbfull()->TEST_table_cache()->EraseUnRefEntries();
+  ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 2. Put two tables to table cache and
+  Reopen(options);
+  // Clear out auto-opened files
+  dbfull()->TEST_table_cache()->EraseUnRefEntries();
+  ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+  // fetch key from 1st and 2nd table, which will internally place that table to
+  // the table cache.
+  for (int i = 0; i < 2; ++i) {
+    Get(std::to_string(i * 100 + 0));
+  }
+
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 3. Put all tables to table cache
+  Reopen(options);
+  // fetch key from all tables, which will place them in table cache.
+  for (int i = 0; i < 4; ++i) {
+    Get(std::to_string(i * 100 + 0));
+  }
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+  // 4. Try to read CORRUPT properties (a) directly from file, and (b)
+  // through reader on Get
+
+  // It's not practical to prevent table file read on Open, so we
+  // corrupt after open and after purging table cache.
+  for (bool direct : {true, false}) {
+    Reopen(options);
+    // Clear out auto-opened files
+    dbfull()->TEST_table_cache()->EraseUnRefEntries();
+    ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+
+    TablePropertiesCollection props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    std::string sst_file = props.begin()->first;
+
+    // Corrupt the file's TableProperties using session id
+    std::string contents;
+    ASSERT_OK(
+        ReadFileToString(env_->GetFileSystem().get(), sst_file, &contents));
+    size_t pos = contents.find(original_session_id);
+    ASSERT_NE(pos, std::string::npos);
+    ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast<int>(pos), 1,
+                                /*verify checksum fails*/ false));
+
+    // Try to read CORRUPT properties
+    if (direct) {
+      ASSERT_TRUE(db_->GetPropertiesOfAllTables(&props).IsCorruption());
+    } else {
+      bool found_corruption = false;
+      for (int i = 0; i < 4; ++i) {
+        std::string result = Get(std::to_string(i * 100 + 0));
+        if (result.find_first_of("Corruption: block checksum mismatch") !=
+            std::string::npos) {
+          found_corruption = true;
+        }
+      }
+      ASSERT_TRUE(found_corruption);
+    }
+
+    // UN-corrupt file for next iteration
+    ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast<int>(pos), 1,
+                                /*verify checksum fails*/ false));
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTablePropertiesTest, InvalidIgnored) {
+  // RocksDB versions 2.5 - 2.7 generate some properties that Block considers
+  // invalid in some way. This approximates that.
+
+  // Inject properties block data that Block considers invalid
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WritePropertiesBlock:BlockData",
+      [&](void* block_data) {
+        *reinterpret_cast<Slice*>(block_data) = Slice("X");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Corrupting the table properties corrupts the unique id.
+  // Ignore the unique id recorded in the manifest.
+  auto options = CurrentOptions();
+  options.verify_sst_unique_id_in_manifest = false;
+  Reopen(options);
+
+  // Build file
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), std::to_string(i), "val"));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Not crashing is good enough
+  TablePropertiesCollection props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+}
+
+TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) {
+  ConfigOptions options;
+  options.ignore_unsupported_options = false;
+
+  std::shared_ptr<TablePropertiesCollectorFactory> factory;
+  std::string id = CompactOnDeletionCollectorFactory::kClassName();
+  ASSERT_OK(
+      TablePropertiesCollectorFactory::CreateFromString(options, id, &factory));
+  auto del_factory = factory->CheckedCast<CompactOnDeletionCollectorFactory>();
+  ASSERT_NE(del_factory, nullptr);
+  ASSERT_EQ(0U, del_factory->GetWindowSize());
+  ASSERT_EQ(0U, del_factory->GetDeletionTrigger());
+  ASSERT_EQ(0.0, del_factory->GetDeletionRatio());
+  ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString(
+      options, "window_size=100; deletion_trigger=90; id=" + id, &factory));
+  del_factory = factory->CheckedCast<CompactOnDeletionCollectorFactory>();
+  ASSERT_NE(del_factory, nullptr);
+  ASSERT_EQ(100U, del_factory->GetWindowSize());
+  ASSERT_EQ(90U, del_factory->GetDeletionTrigger());
+  ASSERT_EQ(0.0, del_factory->GetDeletionRatio());
+  ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString(
+      options,
+      "window_size=100; deletion_trigger=90; deletion_ratio=0.5; id=" + id,
+      &factory));
+  del_factory = factory->CheckedCast<CompactOnDeletionCollectorFactory>();
+  ASSERT_NE(del_factory, nullptr);
+  ASSERT_EQ(100U, del_factory->GetWindowSize());
+  ASSERT_EQ(90U, del_factory->GetDeletionTrigger());
+  ASSERT_EQ(0.5, del_factory->GetDeletionRatio());
+}
+
+TablePropertiesCollection
+DBTablePropertiesTest::TestGetPropertiesOfTablesInRange(
+    std::vector<Range> ranges, std::size_t* num_properties,
+    std::size_t* num_files) {
+  // Since we deref zero element in the vector it can not be empty
+  // otherwise we pass an address to some random memory
+  EXPECT_GT(ranges.size(), 0U);
+  // run the query
+  TablePropertiesCollection props;
+  EXPECT_OK(db_->GetPropertiesOfTablesInRange(
+      db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props));
+
+  // Make sure that we've received properties for those and for those files
+  // only which fall within requested ranges
+  std::vector<LiveFileMetaData> vmd;
+  db_->GetLiveFilesMetaData(&vmd);
+  for (auto& md : vmd) {
+    std::string fn = md.db_path + md.name;
+    bool in_range = false;
+    for (auto& r : ranges) {
+      // smallestkey < limit && largestkey >= start
+      if (r.limit.compare(md.smallestkey) >= 0 &&
+          r.start.compare(md.largestkey) <= 0) {
+        in_range = true;
+        EXPECT_GT(props.count(fn), 0);
+      }
+    }
+    if (!in_range) {
+      EXPECT_EQ(props.count(fn), 0);
+    }
+  }
+
+  if (num_properties) {
+    *num_properties = props.size();
+  }
+
+  if (num_files) {
+    *num_files = vmd.size();
+  }
+  return props;
+}
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) {
+  // Fixed random sead
+  Random rnd(301);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.max_bytes_for_level_base = 40960;
+  options.max_bytes_for_level_multiplier = 4;
+  options.hard_pending_compaction_bytes_limit = 16 * 1024;
+  options.num_levels = 8;
+  options.env = env_;
+
+  DestroyAndReopen(options);
+
+  // build a decent LSM
+  for (int i = 0; i < 10000; i++) {
+    ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  if (NumTableFilesAtLevel(0) == 0) {
+    ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(db_->PauseBackgroundWork());
+
+  // Ensure that we have at least L0, L1 and L2
+  ASSERT_GT(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+  ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+  // Query the largest range
+  std::size_t num_properties, num_files;
+  TestGetPropertiesOfTablesInRange(
+      {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST),
+             test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+      &num_properties, &num_files);
+  ASSERT_EQ(num_properties, num_files);
+
+  // Query the empty range
+  TestGetPropertiesOfTablesInRange(
+      {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST),
+             test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST))},
+      &num_properties, &num_files);
+  ASSERT_GT(num_files, 0);
+  ASSERT_EQ(num_properties, 0);
+
+  // Query the middle rangee
+  TestGetPropertiesOfTablesInRange(
+      {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::MIDDLE),
+             test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+      &num_properties, &num_files);
+  ASSERT_GT(num_files, 0);
+  ASSERT_GT(num_files, num_properties);
+  ASSERT_GT(num_properties, 0);
+
+  // Query a bunch of random ranges
+  for (int j = 0; j < 100; j++) {
+    // create a bunch of ranges
+    std::vector<std::string> random_keys;
+    // Random returns numbers with zero included
+    // when we pass empty ranges TestGetPropertiesOfTablesInRange()
+    // derefs random memory in the empty ranges[0]
+    // so want to be greater than zero and even since
+    // the below loop requires that random_keys.size() to be even.
+    auto n = 2 * (rnd.Uniform(50) + 1);
+
+    for (uint32_t i = 0; i < n; ++i) {
+      random_keys.push_back(test::RandomKey(&rnd, 5));
+    }
+
+    ASSERT_GT(random_keys.size(), 0U);
+    ASSERT_EQ((random_keys.size() % 2), 0U);
+
+    std::vector<Range> ranges;
+    auto it = random_keys.begin();
+    while (it != random_keys.end()) {
+      ranges.push_back(Range(*it, *(it + 1)));
+      it += 2;
+    }
+
+    TestGetPropertiesOfTablesInRange(std::move(ranges));
+  }
+}
+
+TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) {
+  std::string kExtraCfName = "pikachu";
+  CreateAndReopenWithCF({kExtraCfName}, CurrentOptions());
+
+  // Create one table per CF, then verify it was created with the column family
+  // name property.
+  for (uint32_t cf = 0; cf < 2; ++cf) {
+    ASSERT_OK(Put(cf, "key", "val"));
+    ASSERT_OK(Flush(cf));
+
+    TablePropertiesCollection fname_to_props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+    ASSERT_EQ(1U, fname_to_props.size());
+
+    std::string expected_cf_name;
+    if (cf > 0) {
+      expected_cf_name = kExtraCfName;
+    } else {
+      expected_cf_name = kDefaultColumnFamilyName;
+    }
+    ASSERT_EQ(expected_cf_name,
+              fname_to_props.begin()->second->column_family_name);
+    ASSERT_EQ(cf, static_cast<uint32_t>(
+                      fname_to_props.begin()->second->column_family_id));
+  }
+}
+
+TEST_F(DBTablePropertiesTest, GetDbIdentifiersProperty) {
+  CreateAndReopenWithCF({"goku"}, CurrentOptions());
+
+  for (uint32_t cf = 0; cf < 2; ++cf) {
+    ASSERT_OK(Put(cf, "key", "val"));
+    ASSERT_OK(Put(cf, "foo", "bar"));
+    ASSERT_OK(Flush(cf));
+
+    TablePropertiesCollection fname_to_props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+    ASSERT_EQ(1U, fname_to_props.size());
+
+    std::string id, sid;
+    ASSERT_OK(db_->GetDbIdentity(id));
+    ASSERT_OK(db_->GetDbSessionId(sid));
+    ASSERT_EQ(id, fname_to_props.begin()->second->db_id);
+    ASSERT_EQ(sid, fname_to_props.begin()->second->db_session_id);
+  }
+}
+
+class DBTableHostnamePropertyTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<int, std::string>> {
+ public:
+  DBTableHostnamePropertyTest()
+      : DBTestBase("db_table_hostname_property_test",
+                   /*env_do_fsync=*/false) {}
+};
+
+TEST_P(DBTableHostnamePropertyTest, DbHostLocationProperty) {
+  option_config_ = std::get<0>(GetParam());
+  Options opts = CurrentOptions();
+  std::string expected_host_id = std::get<1>(GetParam());
+  ;
+  if (expected_host_id == kHostnameForDbHostId) {
+    ASSERT_OK(env_->GetHostNameString(&expected_host_id));
+  } else {
+    opts.db_host_id = expected_host_id;
+  }
+  CreateAndReopenWithCF({"goku"}, opts);
+
+  for (uint32_t cf = 0; cf < 2; ++cf) {
+    ASSERT_OK(Put(cf, "key", "val"));
+    ASSERT_OK(Put(cf, "foo", "bar"));
+    ASSERT_OK(Flush(cf));
+
+    TablePropertiesCollection fname_to_props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+    ASSERT_EQ(1U, fname_to_props.size());
+
+    ASSERT_EQ(fname_to_props.begin()->second->db_host_id, expected_host_id);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DBTableHostnamePropertyTest, DBTableHostnamePropertyTest,
+    ::testing::Values(
+        // OptionConfig, override db_host_location
+        std::make_tuple(DBTestBase::OptionConfig::kDefault,
+                        kHostnameForDbHostId),
+        std::make_tuple(DBTestBase::OptionConfig::kDefault, "foobar"),
+        std::make_tuple(DBTestBase::OptionConfig::kDefault, ""),
+        std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix,
+                        kHostnameForDbHostId),
+        std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix,
+                        "foobar"),
+        std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix,
+                        "")));
+
+class DeletionTriggeredCompactionTestListener : public EventListener {
+ public:
+  void OnCompactionBegin(DB*, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.compaction_reason,
+              CompactionReason::kFilesMarkedForCompaction);
+  }
+
+  void OnCompactionCompleted(DB*, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.compaction_reason,
+              CompactionReason::kFilesMarkedForCompaction);
+  }
+};
+
+TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) {
+  int kNumKeys = 1000;
+  int kWindowSize = 100;
+  int kNumDelsTrigger = 90;
+  std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger);
+
+  Options opts = CurrentOptions();
+  opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  opts.table_properties_collector_factories.emplace_back(compact_on_del);
+
+  if (GetParam() == "kCompactionStyleUniversal") {
+    opts.compaction_style = kCompactionStyleUniversal;
+  }
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  DeletionTriggeredCompactionTestListener* listener =
+      new DeletionTriggeredCompactionTestListener();
+  opts.listeners.emplace_back(listener);
+  Reopen(opts);
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      ASSERT_OK(Delete(Key(i)));
+    } else {
+      ASSERT_OK(Put(Key(i), "val"));
+    }
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  // Change the window size and deletion trigger and ensure new values take
+  // effect
+  kWindowSize = 50;
+  kNumDelsTrigger = 40;
+  static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+      ->SetWindowSize(kWindowSize);
+  static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+      ->SetDeletionTrigger(kNumDelsTrigger);
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      ASSERT_OK(Delete(Key(i)));
+    } else {
+      ASSERT_OK(Put(Key(i), "val"));
+    }
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  // Change the window size to disable delete triggered compaction
+  kWindowSize = 0;
+  static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+      ->SetWindowSize(kWindowSize);
+  static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+      ->SetDeletionTrigger(kNumDelsTrigger);
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      ASSERT_OK(Delete(Key(i)));
+    } else {
+      ASSERT_OK(Put(Key(i), "val"));
+    }
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_WRITE_BYTES_MARKED));
+  ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_READ_BYTES_MARKED));
+}
+
+TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) {
+  constexpr int kNumKeys = 1000;
+  constexpr int kWindowSize = 0;
+  constexpr int kNumDelsTrigger = 0;
+  constexpr double kDeletionRatio = 0.1;
+  std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger,
+                                           kDeletionRatio);
+
+  Options opts = CurrentOptions();
+  opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  opts.table_properties_collector_factories.emplace_back(compact_on_del);
+
+  Reopen(opts);
+
+  // Add an L2 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  auto* listener = new DeletionTriggeredCompactionTestListener();
+  opts.listeners.emplace_back(listener);
+  Reopen(opts);
+
+  // Generate one L0 with kNumKeys Put.
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), "not important"));
+  }
+  ASSERT_OK(Flush());
+
+  // Generate another L0 with kNumKeys Delete.
+  // This file, due to deletion ratio, will trigger compaction: 2@0 files to L1.
+  // The resulting L1 file has only one tombstone for user key 'Key(0)'.
+  // Again, due to deletion ratio, a compaction will be triggered: 1@1 + 1@2
+  // files to L2. However, the resulting file is empty because the tombstone
+  // and value are both dropped.
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_EQ(0, NumTableFilesAtLevel(i));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBTablePropertiesTest, DBTablePropertiesTest,
+                        ::testing::Values("kCompactionStyleLevel",
+                                          "kCompactionStyleUniversal"));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_tailing_iter_test.cc b/src/rocksdb/db/db_tailing_iter_test.cc
new file mode 100644
index 000000000..af3194ac4
--- /dev/null
+++ b/src/rocksdb/db/db_tailing_iter_test.cc
@@ -0,0 +1,604 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this test
+// in Release build.
+// which is a pity, it is a good test
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestTailingIterator : public DBTestBase,
+                              public ::testing::WithParamInterface<bool> {
+ public:
+  DBTestTailingIterator()
+      : DBTestBase("db_tailing_iterator_test", /*env_do_fsync=*/true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBTestTailingIterator, DBTestTailingIterator,
+                        ::testing::Bool());
+
+TEST_P(DBTestTailingIterator, TailingIteratorSingle) {
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  iter->SeekToFirst();
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_OK(iter->status());
+
+  // add a record and check that iter can see it
+  ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "mirko");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorKeepAdding) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(iter->status());
+  std::string value(1024, 'a');
+
+  const int num_records = 10000;
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%016d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorSeekToNext) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(iter->status());
+  std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(itern->status());
+  std::string value(1024, 'a');
+
+  const int num_records = 1000;
+  for (int i = 1; i < num_records; ++i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+    if (i == 1) {
+      itern->SeekToFirst();
+    } else {
+      itern->Next();
+    }
+    ASSERT_TRUE(itern->Valid());
+    ASSERT_EQ(itern->key().compare(key), 0);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  for (int i = 2 * num_records; i > 0; --i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorTrimSeekToNext) {
+  const uint64_t k150KB = 150 * 1024;
+  Options options;
+  options.write_buffer_size = k150KB;
+  options.max_write_buffer_number = 3;
+  options.min_write_buffer_number_to_merge = 2;
+  options.env = env_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  int num_iters, deleted_iters;
+
+  char bufe[32];
+  snprintf(bufe, sizeof(bufe), "00b0%016d", 0);
+  Slice keyu(bufe, 20);
+  read_options.iterate_upper_bound = &keyu;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(iter->status());
+  std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(itern->status());
+  std::unique_ptr<Iterator> iterh(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(iterh->status());
+  std::string value(1024, 'a');
+  bool file_iters_deleted = false;
+  bool file_iters_renewed_null = false;
+  bool file_iters_renewed_copy = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ForwardIterator::SeekInternal:Return", [&](void* arg) {
+        ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+        ASSERT_TRUE(!file_iters_deleted ||
+                    fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ForwardIterator::Next:Return", [&](void* arg) {
+        ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+        ASSERT_TRUE(!file_iters_deleted ||
+                    fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ForwardIterator::RenewIterators:Null",
+      [&](void* /*arg*/) { file_iters_renewed_null = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ForwardIterator::RenewIterators:Copy",
+      [&](void* /*arg*/) { file_iters_renewed_copy = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  const int num_records = 1000;
+  for (int i = 1; i < num_records; ++i) {
+    char buf1[32];
+    char buf2[32];
+    char buf3[32];
+    char buf4[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+    snprintf(buf3, sizeof(buf3), "00b0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+    Slice keyn(buf3, 20);
+    ASSERT_OK(Put(1, keyn, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      if (i == 299) {
+        file_iters_deleted = true;
+      }
+      snprintf(buf4, sizeof(buf4), "00a0%016d", i * 5 / 2);
+      Slice target(buf4, 20);
+      iterh->Seek(target);
+      ASSERT_TRUE(iter->Valid());
+      for (int j = (i + 1) * 5 / 2; j < i * 5; j += 5) {
+        iterh->Next();
+        ASSERT_TRUE(iterh->Valid());
+      }
+      if (i == 299) {
+        file_iters_deleted = false;
+      }
+    }
+
+    file_iters_deleted = true;
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+    ASSERT_LE(num_iters, 1);
+    if (i == 1) {
+      itern->SeekToFirst();
+    } else {
+      itern->Next();
+    }
+    ASSERT_TRUE(itern->Valid());
+    ASSERT_EQ(itern->key().compare(key), 0);
+    ASSERT_LE(num_iters, 1);
+    file_iters_deleted = false;
+  }
+  ASSERT_TRUE(file_iters_renewed_null);
+  ASSERT_TRUE(file_iters_renewed_copy);
+  iter = nullptr;
+  itern = nullptr;
+  iterh = nullptr;
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.block_cache_compressed = nullptr;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  read_options.read_tier = kBlockCacheTier;
+  std::unique_ptr<Iterator> iteri(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(iteri->status());
+  char buf5[32];
+  snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2);
+  Slice target1(buf5, 20);
+  iteri->Seek(target1);
+  ASSERT_TRUE(iteri->status().IsIncomplete());
+  iteri = nullptr;
+
+  read_options.read_tier = kReadAllTier;
+  options.table_factory.reset(NewBlockBasedTableFactory());
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  iter.reset(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(iter->status());
+  for (int i = 2 * num_records; i > 0; --i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorDeletes) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(iter->status());
+
+  // write a single record, read it using the iterator, then delete it
+  ASSERT_OK(Put(1, "0test", "test"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0test");
+  ASSERT_OK(Delete(1, "0test"));
+
+  // write many more records
+  const int num_records = 10000;
+  std::string value(1024, 'A');
+
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "1%015d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+  }
+
+  // force a flush to make sure that no records are read from memtable
+  ASSERT_OK(Flush(1));
+
+  // skip "0test"
+  iter->Next();
+
+  // make sure we can read all new records using the existing iterator
+  int count = 0;
+  for (; iter->Valid(); iter->Next(), ++count)
+    ;
+
+  ASSERT_EQ(count, num_records);
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorPrefixSeek) {
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+  options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+  options.allow_concurrent_memtable_write = false;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(iter->status());
+  ASSERT_OK(Put(1, "0101", "test"));
+
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(Put(1, "0202", "test"));
+
+  // Seek(0102) shouldn't find any records since 0202 has a different prefix
+  iter->Seek("0102");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("0202");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0202");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorIncomplete) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  read_options.read_tier = kBlockCacheTier;
+
+  std::string key("key");
+  std::string value("value");
+
+  ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  ASSERT_OK(iter->status());
+  iter->SeekToFirst();
+  // we either see the entry or it's not in cache
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  iter->SeekToFirst();
+  // should still be true after compaction
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorSeekToSame) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 1000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  const int NROWS = 10000;
+  // Write rows with keys 00000, 00002, 00004 etc.
+  for (int i = 0; i < NROWS; ++i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%05d", 2 * i);
+    std::string key(buf);
+    std::string value("value");
+    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  ASSERT_OK(iter->status());
+  // Seek to 00001.  We expect to find 00002.
+  std::string start_key = "00001";
+  iter->Seek(start_key);
+  ASSERT_TRUE(iter->Valid());
+
+  std::string found = iter->key().ToString();
+  ASSERT_EQ("00002", found);
+
+  // Now seek to the same key.  The iterator should remain in the same
+  // position.
+  iter->Seek(found);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(found, iter->key().ToString());
+}
+
+// Sets iterate_upper_bound and verifies that ForwardIterator doesn't call
+// Seek() on immutable iterators when target key is >= prev_key and all
+// iterators, including the memtable iterator, are over the upper bound.
+TEST_P(DBTestTailingIterator, TailingIteratorUpperBound) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  const Slice upper_bound("20", 3);
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.iterate_upper_bound = &upper_bound;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  ASSERT_OK(Put(1, "11", "11"));
+  ASSERT_OK(Put(1, "12", "12"));
+  ASSERT_OK(Put(1, "22", "22"));
+  ASSERT_OK(Flush(1));  // flush all those keys to an immutable SST file
+
+  // Add another key to the memtable.
+  ASSERT_OK(Put(1, "21", "21"));
+
+  std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(it->status());
+  it->Seek("12");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("12", it->key().ToString());
+
+  it->Next();
+  // Not valid since "21" is over the upper bound.
+  ASSERT_FALSE(it->Valid());
+  ASSERT_OK(it->status());
+  // This keeps track of the number of times NeedToSeekImmutable() was true.
+  int immutable_seeks = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ForwardIterator::SeekInternal:Immutable",
+      [&](void* /*arg*/) { ++immutable_seeks; });
+
+  // Seek to 13. This should not require any immutable seeks.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  it->Seek("13");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_FALSE(it->Valid());
+  ASSERT_OK(it->status());
+  if (GetParam()) {
+    ASSERT_EQ(1, immutable_seeks);
+  } else {
+    ASSERT_EQ(0, immutable_seeks);
+  }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorGap) {
+  // level 1:            [20, 25]  [35, 40]
+  // level 2:  [10 - 15]                    [45 - 50]
+  // level 3:            [20,    30,    40]
+  // Previously there is a bug in tailing_iterator that if there is a gap in
+  // lower level, the key will be skipped if it is within the range between
+  // the largest key of index n file and the smallest key of index n+1 file
+  // if both file fit in that gap. In this example, 25 < key < 35
+  // https://github.com/facebook/rocksdb/issues/1372
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  ASSERT_OK(Put(1, "20", "20"));
+  ASSERT_OK(Put(1, "30", "30"));
+  ASSERT_OK(Put(1, "40", "40"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(3, 1);
+
+  ASSERT_OK(Put(1, "10", "10"));
+  ASSERT_OK(Put(1, "15", "15"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "45", "45"));
+  ASSERT_OK(Put(1, "50", "50"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(2, 1);
+
+  ASSERT_OK(Put(1, "20", "20"));
+  ASSERT_OK(Put(1, "25", "25"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "35", "35"));
+  ASSERT_OK(Put(1, "40", "40"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(1, 1);
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(handles_[1], &meta);
+
+  std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+  it->Seek("30");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("30", it->key().ToString());
+
+  it->Next();
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("35", it->key().ToString());
+
+  it->Next();
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("40", it->key().ToString());
+
+  ASSERT_OK(it->status());
+}
+
+TEST_P(DBTestTailingIterator, SeekWithUpperBoundBug) {
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  const Slice upper_bound("cc", 3);
+  read_options.iterate_upper_bound = &upper_bound;
+
+  // 1st L0 file
+  ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+  ASSERT_OK(Flush());
+
+  // 2nd L0 file
+  ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+  ASSERT_OK(Flush());
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  ASSERT_OK(iter->status());
+
+  iter->Seek("aa");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+TEST_P(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) {
+  ReadOptions read_options;
+  read_options.tailing = true;
+  if (GetParam()) {
+    read_options.async_io = true;
+  }
+  const Slice upper_bound("cc", 3);
+  read_options.iterate_upper_bound = &upper_bound;
+
+  // 1st L0 file
+  ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+  ASSERT_OK(Flush());
+
+  // 2nd L0 file
+  ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+  ASSERT_OK(Flush());
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  ASSERT_OK(iter->status());
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "aa");
+
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_test.cc b/src/rocksdb/db/db_test.cc
new file mode 100644
index 000000000..9575248b4
--- /dev/null
+++ b/src/rocksdb/db/db_test.cc
@@ -0,0 +1,7397 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this test
+// in Release build.
+// which is a pity, it is a good test
+#include <fcntl.h>
+
+#include <algorithm>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include "cache/lru_cache.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/mock_table.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Note that whole DBTest and its child classes disable fsync on files
+// and directories for speed.
+// If fsync needs to be covered in a test, put it in other places.
+class DBTest : public DBTestBase {
+ public:
+  DBTest() : DBTestBase("db_test", /*env_do_fsync=*/false) {}
+};
+
+class DBTestWithParam
+    : public DBTest,
+      public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+  DBTestWithParam() {
+    max_subcompactions_ = std::get<0>(GetParam());
+    exclusive_manual_compaction_ = std::get<1>(GetParam());
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  uint32_t max_subcompactions_;
+  bool exclusive_manual_compaction_;
+};
+
+TEST_F(DBTest, MockEnvTest) {
+  std::unique_ptr<MockEnv> env{MockEnv::Create(Env::Default())};
+  Options options;
+  options.create_if_missing = true;
+  options.env = env.get();
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+// TEST_FlushMemTable() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+#endif  // ROCKSDB_LITE
+
+  delete db;
+}
+
+// NewMemEnv returns nullptr in ROCKSDB_LITE since class InMemoryEnv isn't
+// defined.
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, MemEnvTest) {
+  std::unique_ptr<Env> env{NewMemEnv(Env::Default())};
+  Options options;
+  options.create_if_missing = true;
+  options.env = env.get();
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  delete db;
+
+  options.create_if_missing = false;
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+  delete db;
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, WriteEmptyBatch) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  WriteOptions wo;
+  wo.sync = true;
+  wo.disableWAL = false;
+  WriteBatch empty_batch;
+  ASSERT_OK(dbfull()->Write(wo, &empty_batch));
+
+  // make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+  ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBTest, SkipDelay) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (bool sync : {true, false}) {
+    for (bool disableWAL : {true, false}) {
+      if (sync && disableWAL) {
+        // sync and disableWAL is incompatible.
+        continue;
+      }
+      // Use a small number to ensure a large delay that is still effective
+      // when we do Put
+      // TODO(myabandeh): this is time dependent and could potentially make
+      // the test flaky
+      auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+      std::atomic<int> sleep_count(0);
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "DBImpl::DelayWrite:Sleep",
+          [&](void* /*arg*/) { sleep_count.fetch_add(1); });
+      std::atomic<int> wait_count(0);
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "DBImpl::DelayWrite:Wait",
+          [&](void* /*arg*/) { wait_count.fetch_add(1); });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      WriteOptions wo;
+      wo.sync = sync;
+      wo.disableWAL = disableWAL;
+      wo.no_slowdown = true;
+      // Large enough to exceed allowance for one time interval
+      std::string large_value(1024, 'x');
+      // Perhaps ideally this first write would fail because of delay, but
+      // the current implementation does not guarantee that.
+      dbfull()->Put(wo, "foo", large_value).PermitUncheckedError();
+      // We need the 2nd write to trigger delay. This is because delay is
+      // estimated based on the last write size which is 0 for the first write.
+      ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value));
+      ASSERT_GE(sleep_count.load(), 0);
+      ASSERT_GE(wait_count.load(), 0);
+      token.reset();
+
+      token = dbfull()->TEST_write_controler().GetDelayToken(1000000);
+      wo.no_slowdown = false;
+      ASSERT_OK(dbfull()->Put(wo, "foo3", large_value));
+      ASSERT_GE(sleep_count.load(), 1);
+      token.reset();
+    }
+  }
+}
+
+TEST_F(DBTest, MixedSlowdownOptions) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+
+  std::function<void()> write_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = false;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+  };
+  // Use a small number to ensure a large delay that is still effective
+  // when we do Put
+  // TODO(myabandeh): this is time dependent and could potentially make
+  // the test flaky
+  auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+  std::atomic<int> sleep_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:BeginWriteStallDone", [&](void* /*arg*/) {
+        sleep_count.fetch_add(1);
+        if (threads.empty()) {
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_slowdown_func);
+          }
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_no_slowdown_func);
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = false;
+  wo.disableWAL = false;
+  wo.no_slowdown = false;
+  ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
+  // We need the 2nd write to trigger delay. This is because delay is
+  // estimated based on the last write size which is 0 for the first write.
+  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+  token.reset();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  ASSERT_GE(sleep_count.load(), 1);
+
+  wo.no_slowdown = true;
+  ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsInQueue) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+  };
+  // Use a small number to ensure a large delay that is still effective
+  // when we do Put
+  // TODO(myabandeh): this is time dependent and could potentially make
+  // the test flaky
+  auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+  std::atomic<int> sleep_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Sleep", [&](void* /*arg*/) {
+        sleep_count.fetch_add(1);
+        if (threads.empty()) {
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_no_slowdown_func);
+          }
+          // Sleep for 2s to allow the threads to insert themselves into the
+          // write queue
+          env_->SleepForMicroseconds(3000000ULL);
+        }
+      });
+  std::atomic<int> wait_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait",
+      [&](void* /*arg*/) { wait_count.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = false;
+  wo.disableWAL = false;
+  wo.no_slowdown = false;
+  ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
+  // We need the 2nd write to trigger delay. This is because delay is
+  // estimated based on the last write size which is 0 for the first write.
+  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+  token.reset();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  ASSERT_EQ(sleep_count.load(), 1);
+  ASSERT_GE(wait_count.load(), 0);
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsStop) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+
+  std::function<void()> write_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = false;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> wakeup_writer = [&]() {
+    dbfull()->mutex_.Lock();
+    dbfull()->bg_cv_.SignalAll();
+    dbfull()->mutex_.Unlock();
+  };
+  // Use a small number to ensure a large delay that is still effective
+  // when we do Put
+  // TODO(myabandeh): this is time dependent and could potentially make
+  // the test flaky
+  auto token = dbfull()->TEST_write_controler().GetStopToken();
+  std::atomic<int> wait_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+        wait_count.fetch_add(1);
+        if (threads.empty()) {
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_slowdown_func);
+          }
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_no_slowdown_func);
+          }
+          // Sleep for 2s to allow the threads to insert themselves into the
+          // write queue
+          env_->SleepForMicroseconds(3000000ULL);
+        }
+        token.reset();
+        threads.emplace_back(wakeup_writer);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = false;
+  wo.disableWAL = false;
+  wo.no_slowdown = false;
+  ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
+  // We need the 2nd write to trigger delay. This is because delay is
+  // estimated based on the last write size which is 0 for the first write.
+  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+  token.reset();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  ASSERT_GE(wait_count.load(), 1);
+
+  wo.no_slowdown = true;
+  ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, LevelLimitReopen) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  const std::string value(1024 * 1024, ' ');
+  int i = 0;
+  while (NumTableFilesAtLevel(2, 1) == 0) {
+    ASSERT_OK(Put(1, Key(i++), value));
+  }
+
+  options.num_levels = 1;
+  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ(s.IsInvalidArgument(), true);
+  ASSERT_EQ(s.ToString(),
+            "Invalid argument: db has more levels than options.num_levels");
+
+  options.num_levels = 10;
+  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, LevelReopenWithFIFO) {
+  const int kLevelCount = 4;
+  const int kKeyCount = 5;
+  const int kTotalSstFileCount = kLevelCount * kKeyCount;
+  const int kCF = 1;
+
+  Options options = CurrentOptions();
+  // Config level0_file_num_compaction_trigger to prevent L0 files being
+  // automatically compacted while we are constructing a LSM tree structure
+  // to test multi-level FIFO compaction.
+  options.level0_file_num_compaction_trigger = kKeyCount + 1;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // The expected number of files per level after each file creation.
+  const std::string expected_files_per_level[kLevelCount][kKeyCount] = {
+      {"0,0,0,1", "0,0,0,2", "0,0,0,3", "0,0,0,4", "0,0,0,5"},
+      {"0,0,1,5", "0,0,2,5", "0,0,3,5", "0,0,4,5", "0,0,5,5"},
+      {"0,1,5,5", "0,2,5,5", "0,3,5,5", "0,4,5,5", "0,5,5,5"},
+      {"1,5,5,5", "2,5,5,5", "3,5,5,5", "4,5,5,5", "5,5,5,5"},
+  };
+
+  const std::string expected_entries[kKeyCount][kLevelCount + 1] = {
+      {"[ ]", "[ a3 ]", "[ a2, a3 ]", "[ a1, a2, a3 ]", "[ a0, a1, a2, a3 ]"},
+      {"[ ]", "[ b3 ]", "[ b2, b3 ]", "[ b1, b2, b3 ]", "[ b0, b1, b2, b3 ]"},
+      {"[ ]", "[ c3 ]", "[ c2, c3 ]", "[ c1, c2, c3 ]", "[ c0, c1, c2, c3 ]"},
+      {"[ ]", "[ d3 ]", "[ d2, d3 ]", "[ d1, d2, d3 ]", "[ d0, d1, d2, d3 ]"},
+      {"[ ]", "[ e3 ]", "[ e2, e3 ]", "[ e1, e2, e3 ]", "[ e0, e1, e2, e3 ]"},
+  };
+
+  // The loop below creates the following LSM tree where each (k, v) pair
+  // represents a file that contains that entry.  When a file is created,
+  // the db is reopend with FIFO compaction and verified the LSM tree
+  // structure is still the same.
+  //
+  // The resulting LSM tree will contain 5 different keys.  Each key as
+  // 4 different versions, located in different level.
+  //
+  // L0:  (e, e0) (d, d0) (c, c0) (b, b0) (a, a0)
+  // L1:  (a, a1) (b, b1) (c, c1) (d, d1) (e, e1)
+  // L2:  (a, a2) (b, b2) (c, c2) (d, d2) (e, e2)
+  // L3:  (a, a3) (b, b3) (c, c3) (d, d3) (e, e3)
+  for (int l = 0; l < kLevelCount; ++l) {
+    int level = kLevelCount - 1 - l;
+    for (int p = 0; p < kKeyCount; ++p) {
+      std::string put_key = std::string(1, char('a' + p));
+      ASSERT_OK(Put(kCF, put_key, put_key + std::to_string(level)));
+      ASSERT_OK(Flush(kCF));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      for (int g = 0; g < kKeyCount; ++g) {
+        int entry_count = (p >= g) ? l + 1 : l;
+        std::string get_key = std::string(1, char('a' + g));
+        CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count], get_key,
+                                      kCF, {"pikachu"}, options);
+      }
+      if (level != 0) {
+        MoveFilesToLevel(level, kCF);
+        for (int g = 0; g < kKeyCount; ++g) {
+          int entry_count = (p >= g) ? l + 1 : l;
+          std::string get_key = std::string(1, char('a' + g));
+          CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count],
+                                        get_key, kCF, {"pikachu"}, options);
+        }
+      }
+      ASSERT_EQ(expected_files_per_level[l][p], FilesPerLevel(kCF));
+    }
+  }
+
+  // The expected number of sst files in each level after each FIFO compaction
+  // that deletes the oldest sst file.
+  const std::string expected_files_per_level_after_fifo[] = {
+      "5,5,5,4", "5,5,5,3", "5,5,5,2", "5,5,5,1", "5,5,5", "5,5,4", "5,5,3",
+      "5,5,2",   "5,5,1",   "5,5",     "5,4",     "5,3",   "5,2",   "5,1",
+      "5",       "4",       "3",       "2",       "1",     "",
+  };
+
+  // The expected value entries of each key after each FIFO compaction.
+  // This verifies whether FIFO removes the file with the smallest key in non-L0
+  // files first then the oldest files in L0.
+  const std::string expected_entries_after_fifo[kKeyCount][kLevelCount + 1] = {
+      {"[ a0, a1, a2, a3 ]", "[ a0, a1, a2 ]", "[ a0, a1 ]", "[ a0 ]", "[ ]"},
+      {"[ b0, b1, b2, b3 ]", "[ b0, b1, b2 ]", "[ b0, b1 ]", "[ b0 ]", "[ ]"},
+      {"[ c0, c1, c2, c3 ]", "[ c0, c1, c2 ]", "[ c0, c1 ]", "[ c0 ]", "[ ]"},
+      {"[ d0, d1, d2, d3 ]", "[ d0, d1, d2 ]", "[ d0, d1 ]", "[ d0 ]", "[ ]"},
+      {"[ e0, e1, e2, e3 ]", "[ e0, e1, e2 ]", "[ e0, e1 ]", "[ e0 ]", "[ ]"},
+  };
+
+  // In the 2nd phase, we reopen the DB with FIFO compaction.  In each reopen,
+  // we config max_table_files_size so that FIFO will remove exactly one file
+  // at a time upon compaction, and we will use it to verify whether the sst
+  // files are deleted in the correct order.
+  for (int i = 0; i < kTotalSstFileCount; ++i) {
+    uint64_t total_sst_files_size = 0;
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.total-sst-files-size", &total_sst_files_size));
+    ASSERT_TRUE(total_sst_files_size > 0);
+
+    Options fifo_options(options);
+    fifo_options.compaction_style = kCompactionStyleFIFO;
+    options.create_if_missing = false;
+    fifo_options.max_open_files = -1;
+    fifo_options.disable_auto_compactions = false;
+    // Config max_table_files_size to be total_sst_files_size - 1 so that
+    // FIFO will delete one file.
+    fifo_options.compaction_options_fifo.max_table_files_size =
+        total_sst_files_size - 1;
+    ASSERT_OK(
+        TryReopenWithColumnFamilies({"default", "pikachu"}, fifo_options));
+    // For FIFO to pick a compaction
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact(false));
+    for (int g = 0; g < kKeyCount; ++g) {
+      std::string get_key = std::string(1, char('a' + g));
+      int status_index = i / kKeyCount;
+      if ((i % kKeyCount) >= g) {
+        // If true, then it means the sst file containing the get_key in the
+        // current level has already been deleted, so we need to move the
+        // status_index for checking the expected value.
+        status_index++;
+      }
+      CheckAllEntriesWithFifoReopen(
+          expected_entries_after_fifo[g][status_index], get_key, kCF,
+          {"pikachu"}, options);
+    }
+    ASSERT_EQ(expected_files_per_level_after_fifo[i], FilesPerLevel(kCF));
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(DBTest, PutSingleDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo2", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo2"));
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip FIFO and universal compaction because they do not apply to the test
+    // case. Skip MergePut because single delete does not get removed when it
+    // encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBTest, ReadFromPersistedTier) {
+  do {
+    Random rnd(301);
+    Options options = CurrentOptions();
+    for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) {
+      CreateAndReopenWithCF({"pikachu"}, options);
+      WriteOptions wopt;
+      wopt.disableWAL = (disableWAL == 1);
+      // 1st round: put but not flush
+      ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first"));
+      ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one"));
+      ASSERT_EQ("first", Get(1, "foo"));
+      ASSERT_EQ("one", Get(1, "bar"));
+
+      // Read directly from persited data.
+      ReadOptions ropt;
+      ropt.read_tier = kPersistedTier;
+      std::string value;
+      if (wopt.disableWAL) {
+        // as data has not yet being flushed, we expect not found.
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+      } else {
+        ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+        ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+      }
+
+      // Multiget
+      std::vector<ColumnFamilyHandle*> multiget_cfs;
+      multiget_cfs.push_back(handles_[1]);
+      multiget_cfs.push_back(handles_[1]);
+      std::vector<Slice> multiget_keys;
+      multiget_keys.push_back("foo");
+      multiget_keys.push_back("bar");
+      std::vector<std::string> multiget_values;
+      auto statuses =
+          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(statuses[0].IsNotFound());
+        ASSERT_TRUE(statuses[1].IsNotFound());
+      } else {
+        ASSERT_OK(statuses[0]);
+        ASSERT_OK(statuses[1]);
+      }
+
+      // 2nd round: flush and put a new value in memtable.
+      ASSERT_OK(Flush(1));
+      ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello"));
+
+      // once the data has been flushed, we are able to get the
+      // data when kPersistedTier is used.
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
+      ASSERT_EQ(value, "first");
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+      ASSERT_EQ(value, "one");
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(
+            db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
+      } else {
+        ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
+        ASSERT_EQ(value, "hello");
+      }
+
+      // Expect same result in multiget
+      multiget_cfs.push_back(handles_[1]);
+      multiget_keys.push_back("rocksdb");
+      statuses =
+          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+      ASSERT_TRUE(statuses[0].ok());
+      ASSERT_EQ("first", multiget_values[0]);
+      ASSERT_TRUE(statuses[1].ok());
+      ASSERT_EQ("one", multiget_values[1]);
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(statuses[2].IsNotFound());
+      } else {
+        ASSERT_OK(statuses[2]);
+      }
+
+      // 3rd round: delete and flush
+      ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
+      Flush(1);
+      ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));
+
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+      if (wopt.disableWAL) {
+        // Still expect finding the value as its delete has not yet being
+        // flushed.
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+        ASSERT_EQ(value, "one");
+      } else {
+        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+      }
+      ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
+      ASSERT_EQ(value, "hello");
+
+      statuses =
+          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+      ASSERT_TRUE(statuses[0].IsNotFound());
+      if (wopt.disableWAL) {
+        ASSERT_TRUE(statuses[1].ok());
+        ASSERT_EQ("one", multiget_values[1]);
+      } else {
+        ASSERT_TRUE(statuses[1].IsNotFound());
+      }
+      ASSERT_TRUE(statuses[2].ok());
+      ASSERT_EQ("hello", multiget_values[2]);
+      if (wopt.disableWAL == 0) {
+        DestroyAndReopen(options);
+      }
+    }
+  } while (ChangeOptions());
+}
+
+TEST_F(DBTest, SingleDeleteFlush) {
+  // Test to check whether flushing preserves a single delete hidden
+  // behind a put.
+  do {
+    Random rnd(301);
+
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Put values on second level (so that they will not be in the same
+    // compaction as the other operations.
+    ASSERT_OK(Put(1, "foo", "first"));
+    ASSERT_OK(Put(1, "bar", "one"));
+    ASSERT_OK(Flush(1));
+    MoveFilesToLevel(2, 1);
+
+    // (Single) delete hidden by a put
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "second"));
+    ASSERT_OK(Delete(1, "bar"));
+    ASSERT_OK(Put(1, "bar", "two"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_OK(Delete(1, "bar"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+
+    ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip FIFO and universal compaction beccaus they do not apply to the test
+    // case. Skip MergePut because single delete does not get removed when it
+    // encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+TEST_F(DBTest, SingleDeletePutFlush) {
+  // Single deletes that encounter the matching put in a flush should get
+  // removed.
+  do {
+    Random rnd(301);
+
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_OK(Put(1, "foo", Slice()));
+    ASSERT_OK(Put(1, "a", Slice()));
+    ASSERT_OK(SingleDelete(1, "a"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+    // Skip FIFO and universal compaction because they do not apply to the test
+    // case. Skip MergePut because single delete does not get removed when it
+    // encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+// Disable because not all platform can run it.
+// It requires more than 9GB memory to run it, With single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) {
+  const size_t kValueSize = 4 * size_t{1024 * 1024 * 1024};  // 4GB value
+  std::string raw(kValueSize, 'v');
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("boo", "v1"));
+  ASSERT_TRUE(Put("foo", raw).IsInvalidArgument());
+  ASSERT_TRUE(Merge("foo", raw).IsInvalidArgument());
+
+  WriteBatch wb;
+  ASSERT_TRUE(wb.Put("foo", raw).IsInvalidArgument());
+  ASSERT_TRUE(wb.Merge("foo", raw).IsInvalidArgument());
+
+  Slice value_slice = raw;
+  Slice key_slice = "foo";
+  SliceParts sp_key(&key_slice, 1);
+  SliceParts sp_value(&value_slice, 1);
+
+  ASSERT_TRUE(wb.Put(sp_key, sp_value).IsInvalidArgument());
+  ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument());
+}
+
+// Disable because not all platform can run it.
+// It requires more than 9GB memory to run it, With single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_VeryLargeValue) {
+  const size_t kValueSize = 3221225472u;  // 3GB value
+  const size_t kKeySize = 8388608u;       // 8MB key
+  std::string raw(kValueSize, 'v');
+  std::string key1(kKeySize, 'c');
+  std::string key2(kKeySize, 'd');
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("boo", "v1"));
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put(key1, raw));
+  raw[0] = 'w';
+  ASSERT_OK(Put(key2, raw));
+  dbfull()->TEST_WaitForFlushMemTable();
+
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+#endif  // !ROCKSDB_LITE
+
+  std::string value;
+  Status s = db_->Get(ReadOptions(), key1, &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(kValueSize, value.size());
+  ASSERT_EQ('v', value[0]);
+
+  s = db_->Get(ReadOptions(), key2, &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(kValueSize, value.size());
+  ASSERT_EQ('w', value[0]);
+
+  // Compact all files.
+  Flush();
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  // Check DB is not in read-only state.
+  ASSERT_OK(Put("boo", "v1"));
+
+  s = db_->Get(ReadOptions(), key1, &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(kValueSize, value.size());
+  ASSERT_EQ('v', value[0]);
+
+  s = db_->Get(ReadOptions(), key2, &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(kValueSize, value.size());
+  ASSERT_EQ('w', value[0]);
+}
+
+TEST_F(DBTest, GetFromImmutableLayer) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+
+    // Block sync calls
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
+    ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
+    ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger flush
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+    // Release sync calls
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
+  } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetLevel0Ordering) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    // Check that we process level-0 files in correct order.  The code
+    // below generates two level-0 files where the earlier one comes
+    // before the later one in the level-0 file list since the earlier
+    // one has a smaller "smallest" key.
+    ASSERT_OK(Put(1, "bar", "b"));
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("v2", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+TEST_F(DBTest, WrongLevel0Config) {
+  Options options = CurrentOptions();
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options));
+  options.level0_stop_writes_trigger = 1;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_file_num_compaction_trigger = 3;
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, GetOrderedByLevels) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    Compact(1, "a", "z");
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("v2", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetPicksCorrectFile) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    // Arrange to have multiple files in a non-level-0 level.
+    ASSERT_OK(Put(1, "a", "va"));
+    Compact(1, "a", "b");
+    ASSERT_OK(Put(1, "x", "vx"));
+    Compact(1, "x", "y");
+    ASSERT_OK(Put(1, "f", "vf"));
+    Compact(1, "f", "g");
+    ASSERT_EQ("va", Get(1, "a"));
+    ASSERT_EQ("vf", Get(1, "f"));
+    ASSERT_EQ("vx", Get(1, "x"));
+  } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetEncountersEmptyLevel) {
+  do {
+    Options options = CurrentOptions();
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // Arrange for the following to happen:
+    //   * sstable A in level 0
+    //   * nothing in level 1
+    //   * sstable B in level 2
+    // Then do enough Get() calls to arrange for an automatic compaction
+    // of sstable A.  A bug would cause the compaction to be marked as
+    // occurring at level 1 (instead of the correct level 0).
+
+    // Step 1: First place sstables in levels 0 and 2
+    ASSERT_OK(Put(1, "a", "begin"));
+    ASSERT_OK(Put(1, "z", "end"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+    ASSERT_OK(Put(1, "a", "begin"));
+    ASSERT_OK(Put(1, "z", "end"));
+    ASSERT_OK(Flush(1));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+    // Step 2: clear level 1 if necessary.
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);
+
+    // Step 3: read a bunch of times
+    for (int i = 0; i < 1000; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
+    }
+
+    // Step 4: Wait for compaction to finish
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);  // XXX
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, FlushMultipleMemtable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.max_write_buffer_size_to_maintain = -1;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+    ASSERT_OK(Flush(1));
+  } while (ChangeCompactOptions());
+}
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushSchedule) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_size_to_maintain =
+      static_cast<int64_t>(options.write_buffer_size);
+  options.max_write_buffer_number = 2;
+  options.write_buffer_size = 120 * 1024;
+  auto flush_listener = std::make_shared<FlushCounterListener>();
+  flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull;
+  options.listeners.push_back(flush_listener);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<port::Thread> threads;
+
+  std::atomic<int> thread_num(0);
+  // each column family will have 5 thread, each thread generating 2 memtables.
+  // each column family should end up with 10 table files
+  std::function<void()> fill_memtable_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    Random rnd(a);
+    WriteOptions wo;
+    // this should fill up 2 memtables
+    for (int k = 0; k < 5000; ++k) {
+      ASSERT_OK(db_->Put(wo, handles_[a & 1], rnd.RandomString(13), ""));
+    }
+  };
+
+  for (int i = 0; i < 10; ++i) {
+    threads.emplace_back(fill_memtable_func);
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
+  auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
+  ASSERT_LE(default_tables, static_cast<uint64_t>(10));
+  ASSERT_GT(default_tables, static_cast<uint64_t>(0));
+  ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
+  ASSERT_GT(pikachu_tables, static_cast<uint64_t>(0));
+}
+#endif  // ROCKSDB_LITE
+
+namespace {
+class KeepFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return false;
+  }
+
+  const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false)
+      : check_context_(check_context) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
+
+  const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+  explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    db_test->env_->MockSleepForMicroseconds(1000);
+    return true;
+  }
+
+  const char* Name() const override { return "DelayFilter"; }
+
+ private:
+  DBTestBase* db_test;
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+  }
+
+  const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+  DBTestBase* db_test;
+};
+}  // anonymous namespace
+
+#ifndef ROCKSDB_LITE
+
+static std::string CompressibleString(Random* rnd, int len) {
+  std::string r;
+  test::CompressibleString(rnd, 0.8, len, &r);
+  return r;
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, FailMoreDbPaths) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 10000000);
+  options.db_paths.emplace_back(dbname_ + "_2", 1000000);
+  options.db_paths.emplace_back(dbname_ + "_3", 1000000);
+  options.db_paths.emplace_back(dbname_ + "_4", 1000000);
+  options.db_paths.emplace_back(dbname_ + "_5", 1000000);
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+void CheckColumnFamilyMeta(
+    const ColumnFamilyMetaData& cf_meta, const std::string& cf_name,
+    const std::vector<std::vector<FileMetaData>>& files_by_level,
+    uint64_t start_time, uint64_t end_time) {
+  ASSERT_EQ(cf_meta.name, cf_name);
+  ASSERT_EQ(cf_meta.levels.size(), files_by_level.size());
+
+  uint64_t cf_size = 0;
+  size_t file_count = 0;
+
+  for (size_t i = 0; i < cf_meta.levels.size(); ++i) {
+    const auto& level_meta_from_cf = cf_meta.levels[i];
+    const auto& level_meta_from_files = files_by_level[i];
+
+    ASSERT_EQ(level_meta_from_cf.level, i);
+    ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size());
+
+    file_count += level_meta_from_cf.files.size();
+
+    uint64_t level_size = 0;
+    for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) {
+      const auto& file_meta_from_cf = level_meta_from_cf.files[j];
+      const auto& file_meta_from_files = level_meta_from_files[j];
+
+      level_size += file_meta_from_cf.size;
+
+      ASSERT_EQ(file_meta_from_cf.file_number,
+                file_meta_from_files.fd.GetNumber());
+      ASSERT_EQ(file_meta_from_cf.file_number,
+                TableFileNameToNumber(file_meta_from_cf.name));
+      ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size);
+      ASSERT_EQ(file_meta_from_cf.smallest_seqno,
+                file_meta_from_files.fd.smallest_seqno);
+      ASSERT_EQ(file_meta_from_cf.largest_seqno,
+                file_meta_from_files.fd.largest_seqno);
+      ASSERT_EQ(file_meta_from_cf.smallestkey,
+                file_meta_from_files.smallest.user_key().ToString());
+      ASSERT_EQ(file_meta_from_cf.largestkey,
+                file_meta_from_files.largest.user_key().ToString());
+      ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number,
+                file_meta_from_files.oldest_blob_file_number);
+      ASSERT_EQ(file_meta_from_cf.oldest_ancester_time,
+                file_meta_from_files.oldest_ancester_time);
+      ASSERT_EQ(file_meta_from_cf.file_creation_time,
+                file_meta_from_files.file_creation_time);
+      ASSERT_GE(file_meta_from_cf.file_creation_time, start_time);
+      ASSERT_LE(file_meta_from_cf.file_creation_time, end_time);
+      ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time);
+      ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time);
+      // More from FileStorageInfo
+      ASSERT_EQ(file_meta_from_cf.file_type, kTableFile);
+      ASSERT_EQ(file_meta_from_cf.name,
+                "/" + file_meta_from_cf.relative_filename);
+      ASSERT_EQ(file_meta_from_cf.directory, file_meta_from_cf.db_path);
+    }
+
+    ASSERT_EQ(level_meta_from_cf.size, level_size);
+    cf_size += level_size;
+  }
+
+  ASSERT_EQ(cf_meta.file_count, file_count);
+  ASSERT_EQ(cf_meta.size, cf_size);
+}
+
+void CheckLiveFilesMeta(
+    const std::vector<LiveFileMetaData>& live_file_meta,
+    const std::vector<std::vector<FileMetaData>>& files_by_level) {
+  size_t total_file_count = 0;
+  for (const auto& f : files_by_level) {
+    total_file_count += f.size();
+  }
+
+  ASSERT_EQ(live_file_meta.size(), total_file_count);
+
+  int level = 0;
+  int i = 0;
+
+  for (const auto& meta : live_file_meta) {
+    if (level != meta.level) {
+      level = meta.level;
+      i = 0;
+    }
+
+    ASSERT_LT(i, files_by_level[level].size());
+
+    const auto& expected_meta = files_by_level[level][i];
+
+    ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName);
+    ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber());
+    ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name));
+    ASSERT_EQ(meta.size, expected_meta.fd.file_size);
+    ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno);
+    ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno);
+    ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString());
+    ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString());
+    ASSERT_EQ(meta.oldest_blob_file_number,
+              expected_meta.oldest_blob_file_number);
+
+    // More from FileStorageInfo
+    ASSERT_EQ(meta.file_type, kTableFile);
+    ASSERT_EQ(meta.name, "/" + meta.relative_filename);
+    ASSERT_EQ(meta.directory, meta.db_path);
+
+    ++i;
+  }
+}
+
+#ifndef ROCKSDB_LITE
+void AddBlobFile(const ColumnFamilyHandle* cfh, uint64_t blob_file_number,
+                 uint64_t total_blob_count, uint64_t total_blob_bytes,
+                 const std::string& checksum_method,
+                 const std::string& checksum_value,
+                 uint64_t garbage_blob_count = 0,
+                 uint64_t garbage_blob_bytes = 0) {
+  ColumnFamilyData* cfd =
+      (static_cast<const ColumnFamilyHandleImpl*>(cfh))->cfd();
+  assert(cfd);
+
+  Version* const version = cfd->current();
+  assert(version);
+
+  VersionStorageInfo* const storage_info = version->storage_info();
+  assert(storage_info);
+
+  // Add a live blob file.
+
+  auto shared_meta = SharedBlobFileMetaData::Create(
+      blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+      checksum_value);
+
+  auto meta = BlobFileMetaData::Create(std::move(shared_meta),
+                                       BlobFileMetaData::LinkedSsts(),
+                                       garbage_blob_count, garbage_blob_bytes);
+
+  storage_info->AddBlobFile(std::move(meta));
+}
+
+static void CheckBlobMetaData(
+    const BlobMetaData& bmd, uint64_t blob_file_number,
+    uint64_t total_blob_count, uint64_t total_blob_bytes,
+    const std::string& checksum_method, const std::string& checksum_value,
+    uint64_t garbage_blob_count = 0, uint64_t garbage_blob_bytes = 0) {
+  ASSERT_EQ(bmd.blob_file_number, blob_file_number);
+  ASSERT_EQ(bmd.blob_file_name, BlobFileName("", blob_file_number));
+  ASSERT_EQ(bmd.blob_file_size,
+            total_blob_bytes + BlobLogHeader::kSize + BlobLogFooter::kSize);
+
+  ASSERT_EQ(bmd.total_blob_count, total_blob_count);
+  ASSERT_EQ(bmd.total_blob_bytes, total_blob_bytes);
+  ASSERT_EQ(bmd.garbage_blob_count, garbage_blob_count);
+  ASSERT_EQ(bmd.garbage_blob_bytes, garbage_blob_bytes);
+  ASSERT_EQ(bmd.checksum_method, checksum_method);
+  ASSERT_EQ(bmd.checksum_value, checksum_value);
+}
+
+TEST_F(DBTest, MetaDataTest) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+
+  int64_t temp_time = 0;
+  options.env->GetCurrentTime(&temp_time);
+  uint64_t start_time = static_cast<uint64_t>(temp_time);
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_index = 0;
+  for (int i = 0; i < 100; ++i) {
+    // Add a single blob reference to each file
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000,
+                          /* offset */ 1234, /* size */ 5678, kNoCompression);
+
+    WriteBatch batch;
+    ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index),
+                                               blob_index));
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+    ++key_index;
+
+    // Fill up the rest of the file with random values.
+    GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+
+    ASSERT_OK(Flush());
+  }
+
+  std::vector<std::vector<FileMetaData>> files_by_level;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level);
+
+  options.env->GetCurrentTime(&temp_time);
+  uint64_t end_time = static_cast<uint64_t>(temp_time);
+
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+  CheckColumnFamilyMeta(cf_meta, kDefaultColumnFamilyName, files_by_level,
+                        start_time, end_time);
+  std::vector<LiveFileMetaData> live_file_meta;
+  db_->GetLiveFilesMetaData(&live_file_meta);
+  CheckLiveFilesMeta(live_file_meta, files_by_level);
+}
+
+TEST_F(DBTest, AllMetaDataTest) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  constexpr uint64_t blob_file_number = 234;
+  constexpr uint64_t total_blob_count = 555;
+  constexpr uint64_t total_blob_bytes = 66666;
+  constexpr char checksum_method[] = "CRC32";
+  constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+  int64_t temp_time = 0;
+  options.env->GetCurrentTime(&temp_time).PermitUncheckedError();
+  uint64_t start_time = static_cast<uint64_t>(temp_time);
+
+  Random rnd(301);
+  dbfull()->TEST_LockMutex();
+  for (int cf = 0; cf < 2; cf++) {
+    AddBlobFile(handles_[cf], blob_file_number * (cf + 1),
+                total_blob_count * (cf + 1), total_blob_bytes * (cf + 1),
+                checksum_method, checksum_value);
+  }
+  dbfull()->TEST_UnlockMutex();
+
+  std::vector<ColumnFamilyMetaData> all_meta;
+  db_->GetAllColumnFamilyMetaData(&all_meta);
+
+  std::vector<std::vector<FileMetaData>> default_files_by_level;
+  std::vector<std::vector<FileMetaData>> pikachu_files_by_level;
+  dbfull()->TEST_GetFilesMetaData(handles_[0], &default_files_by_level);
+  dbfull()->TEST_GetFilesMetaData(handles_[1], &pikachu_files_by_level);
+
+  options.env->GetCurrentTime(&temp_time).PermitUncheckedError();
+  uint64_t end_time = static_cast<uint64_t>(temp_time);
+
+  ASSERT_EQ(all_meta.size(), 2);
+  for (int cf = 0; cf < 2; cf++) {
+    const auto& cfmd = all_meta[cf];
+    if (cf == 0) {
+      CheckColumnFamilyMeta(cfmd, "default", default_files_by_level, start_time,
+                            end_time);
+    } else {
+      CheckColumnFamilyMeta(cfmd, "pikachu", pikachu_files_by_level, start_time,
+                            end_time);
+    }
+    ASSERT_EQ(cfmd.blob_files.size(), 1U);
+    const auto& bmd = cfmd.blob_files[0];
+    ASSERT_EQ(cfmd.blob_file_count, 1U);
+    ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size);
+    ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_));
+    CheckBlobMetaData(bmd, blob_file_number * (cf + 1),
+                      total_blob_count * (cf + 1), total_blob_bytes * (cf + 1),
+                      checksum_method, checksum_value);
+  }
+}
+
+namespace {
+void MinLevelHelper(DBTest* self, Options& options) {
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(rnd.RandomString(10000));
+      ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+    }
+    ASSERT_OK(self->dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
+  }
+
+  // generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(rnd.RandomString(10000));
+    ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+  }
+  ASSERT_OK(self->dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+}
+
+// returns false if the calling-Test should be skipped
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
+                        int lev, int strategy) {
+  fprintf(stderr,
+          "Test with compression options : window_bits = %d, level =  %d, "
+          "strategy = %d}\n",
+          wbits, lev, strategy);
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.arena_block_size = 4096;
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 3;
+  options.create_if_missing = true;
+
+  if (Snappy_Supported()) {
+    type = kSnappyCompression;
+    fprintf(stderr, "using snappy\n");
+  } else if (Zlib_Supported()) {
+    type = kZlibCompression;
+    fprintf(stderr, "using zlib\n");
+  } else if (BZip2_Supported()) {
+    type = kBZip2Compression;
+    fprintf(stderr, "using bzip2\n");
+  } else if (LZ4_Supported()) {
+    type = kLZ4Compression;
+    fprintf(stderr, "using lz4\n");
+  } else if (XPRESS_Supported()) {
+    type = kXpressCompression;
+    fprintf(stderr, "using xpress\n");
+  } else if (ZSTD_Supported()) {
+    type = kZSTD;
+    fprintf(stderr, "using ZSTD\n");
+  } else {
+    fprintf(stderr, "skipping test, compression disabled\n");
+    return false;
+  }
+  options.compression_per_level.resize(options.num_levels);
+
+  // do not compress L0
+  for (int i = 0; i < 1; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 1; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  return true;
+}
+}  // anonymous namespace
+
+TEST_F(DBTest, MinLevelToCompress1) {
+  Options options = CurrentOptions();
+  CompressionType type = kSnappyCompression;
+  if (!MinLevelToCompress(type, options, -14, -1, 0)) {
+    return;
+  }
+  Reopen(options);
+  MinLevelHelper(this, options);
+
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(options);
+  MinLevelHelper(this, options);
+}
+
+TEST_F(DBTest, MinLevelToCompress2) {
+  Options options = CurrentOptions();
+  CompressionType type = kSnappyCompression;
+  if (!MinLevelToCompress(type, options, 15, -1, 0)) {
+    return;
+  }
+  Reopen(options);
+  MinLevelHelper(this, options);
+
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(options);
+  MinLevelHelper(this, options);
+}
+
+// This test may fail because of a legit case that multiple L0 files
+// are trivial moved to L1.
+TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // We must have at most one file per level except for level-0,
+    // which may have up to kL0_StopWritesTrigger files.
+    const int kMaxFiles =
+        options.num_levels + options.level0_stop_writes_trigger;
+
+    Random rnd(301);
+    std::string value =
+        rnd.RandomString(static_cast<int>(2 * options.write_buffer_size));
+    for (int i = 0; i < 5 * kMaxFiles; i++) {
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_LE(TotalTableFiles(1), kMaxFiles);
+    }
+  } while (ChangeCompactOptions());
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  bool result = (val >= low) && (val <= high);
+  if (!result) {
+    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+            (unsigned long long)(val), (unsigned long long)(low),
+            (unsigned long long)(high));
+  }
+  return result;
+}
+
+TEST_F(DBTest, ApproximateSizesMemTable) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;  // Large write buffer
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  auto default_cf = db_->DefaultColumnFamily();
+
+  const int N = 128;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+  }
+
+  uint64_t size;
+  std::string start = Key(50);
+  std::string end = Key(60);
+  Range r(start, end);
+  SizeApproximationOptions size_approx_options;
+  size_approx_options.include_memtables = true;
+  size_approx_options.include_files = true;
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_GT(size, 6000);
+  ASSERT_LT(size, 204800);
+  // Zero if not including mem table
+  ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  start = Key(500);
+  end = Key(600);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
+  }
+
+  start = Key(500);
+  end = Key(600);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  start = Key(100);
+  end = Key(1020);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_GT(size, 6000);
+
+  options.max_write_buffer_number = 8;
+  options.min_write_buffer_number_to_merge = 5;
+  options.write_buffer_size = 1024 * N;  // Not very large
+  DestroyAndReopen(options);
+  default_cf = db_->DefaultColumnFamily();
+
+  int keys[N * 3];
+  for (int i = 0; i < N; i++) {
+    keys[i * 3] = i * 5;
+    keys[i * 3 + 1] = i * 5 + 1;
+    keys[i * 3 + 2] = i * 5 + 2;
+  }
+  // MemTable entry counting is estimated and can vary greatly depending on
+  // layout. Thus, using deterministic seed for test stability.
+  RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+
+  for (int i = 0; i < N * 3; i++) {
+    ASSERT_OK(Put(Key(keys[i] + 1000), rnd.RandomString(1024)));
+  }
+
+  start = Key(100);
+  end = Key(300);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  start = Key(1050);
+  end = Key(1080);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_GT(size, 6000);
+
+  start = Key(2100);
+  end = Key(2300);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  start = Key(1050);
+  end = Key(1080);
+  r = Range(start, end);
+  uint64_t size_with_mt, size_without_mt;
+  ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+                                     &size_with_mt));
+  ASSERT_GT(size_with_mt, 6000);
+  ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt));
+  ASSERT_EQ(size_without_mt, 0);
+
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i + 1000), rnd.RandomString(1024)));
+  }
+
+  start = Key(1050);
+  end = Key(1080);
+  r = Range(start, end);
+  ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+                                     &size_with_mt));
+  ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt));
+  ASSERT_GT(size_with_mt, size_without_mt);
+  ASSERT_GT(size_without_mt, 6000);
+
+  // Check that include_memtables flag works as expected
+  size_approx_options.include_memtables = false;
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_EQ(size, size_without_mt);
+
+  // Check that files_size_error_margin works as expected, when the heuristic
+  // conditions are not met
+  start = Key(1);
+  end = Key(1000 + N - 2);
+  r = Range(start, end);
+  size_approx_options.files_size_error_margin = -1.0;  // disabled
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  uint64_t size2;
+  size_approx_options.files_size_error_margin = 0.5;  // enabled, but not used
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2));
+  ASSERT_EQ(size, size2);
+}
+
+TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
+  // Roughly 4 keys per data block, 1000 keys per file,
+  // with filter substantially larger than a data block
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(16));
+  table_options.block_size = 100;
+  Options options = CurrentOptions();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.write_buffer_size = 24 * 1024;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.target_file_size_base = 24 * 1024;
+  DestroyAndReopen(options);
+  const auto default_cf = db_->DefaultColumnFamily();
+
+  const int N = 64000;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(24)));
+  }
+  // Flush everything to files
+  ASSERT_OK(Flush());
+  // Compact the entire key space into the next level
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr));
+
+  // Write more keys
+  for (int i = N; i < (N + N / 4); i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(24)));
+  }
+  // Flush everything to files again
+  ASSERT_OK(Flush());
+
+  // Wait for compaction to finish
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  {
+    const std::string start = Key(0);
+    const std::string end = Key(2 * N);
+    const Range r(start, end);
+
+    SizeApproximationOptions size_approx_options;
+    size_approx_options.include_memtables = false;
+    size_approx_options.include_files = true;
+    size_approx_options.files_size_error_margin = -1.0;  // disabled
+
+    // Get the precise size without any approximation heuristic
+    uint64_t size;
+    ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+                                       &size));
+    ASSERT_NE(size, 0);
+
+    // Get the size with an approximation heuristic
+    uint64_t size2;
+    const double error_margin = 0.2;
+    size_approx_options.files_size_error_margin = error_margin;
+    ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+                                       &size2));
+    ASSERT_LT(size2, size * (1 + error_margin));
+    ASSERT_GT(size2, size * (1 - error_margin));
+  }
+
+  {
+    // Ensure that metadata is not falsely attributed only to the last data in
+    // the file. (In some applications, filters can be large portion of data
+    // size.)
+    // Perform many queries over small range, enough to ensure crossing file
+    // boundary, and make sure we never see a spike for large filter.
+    for (int i = 0; i < 3000; i += 10) {
+      const std::string start = Key(i);
+      const std::string end = Key(i + 11);  // overlap by 1 key
+      const Range r(start, end);
+      uint64_t size;
+      ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+      ASSERT_LE(size, 11 * 100);
+    }
+  }
+}
+
+TEST_F(DBTest, GetApproximateMemTableStats) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+
+  const int N = 128;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+  }
+
+  uint64_t count;
+  uint64_t size;
+
+  std::string start = Key(50);
+  std::string end = Key(60);
+  Range r(start, end);
+  db_->GetApproximateMemTableStats(r, &count, &size);
+  ASSERT_GT(count, 0);
+  ASSERT_LE(count, N);
+  ASSERT_GT(size, 6000);
+  ASSERT_LT(size, 204800);
+
+  start = Key(500);
+  end = Key(600);
+  r = Range(start, end);
+  db_->GetApproximateMemTableStats(r, &count, &size);
+  ASSERT_EQ(count, 0);
+  ASSERT_EQ(size, 0);
+
+  ASSERT_OK(Flush());
+
+  start = Key(50);
+  end = Key(60);
+  r = Range(start, end);
+  db_->GetApproximateMemTableStats(r, &count, &size);
+  ASSERT_EQ(count, 0);
+  ASSERT_EQ(size, 0);
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
+  }
+
+  start = Key(100);
+  end = Key(1020);
+  r = Range(start, end);
+  db_->GetApproximateMemTableStats(r, &count, &size);
+  ASSERT_GT(count, 20);
+  ASSERT_GT(size, 6000);
+}
+
+TEST_F(DBTest, ApproximateSizes) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 100000000;  // Large write buffer
+    options.compression = kNoCompression;
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    uint64_t size;
+    ASSERT_OK(Size("", "xyz", 1, &size));
+    ASSERT_TRUE(Between(size, 0, 0));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_OK(Size("", "xyz", 1, &size));
+    ASSERT_TRUE(Between(size, 0, 0));
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    const int N = 80;
+    static const int S1 = 100000;
+    static const int S2 = 105000;  // Allow some expansion from metadata
+    Random rnd(301);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), rnd.RandomString(S1)));
+    }
+
+    // 0 because GetApproximateSizes() does not account for memtable space
+    ASSERT_OK(Size("", Key(50), 1, &size));
+    ASSERT_TRUE(Between(size, 0, 0));
+
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+      for (int compact_start = 0; compact_start < N; compact_start += 10) {
+        for (int i = 0; i < N; i += 10) {
+          ASSERT_OK(Size("", Key(i), 1, &size));
+          ASSERT_TRUE(Between(size, S1 * i, S2 * i));
+          ASSERT_OK(Size("", Key(i) + ".suffix", 1, &size));
+          ASSERT_TRUE(Between(size, S1 * (i + 1), S2 * (i + 1)));
+          ASSERT_OK(Size(Key(i), Key(i + 10), 1, &size));
+          ASSERT_TRUE(Between(size, S1 * 10, S2 * 10));
+        }
+        ASSERT_OK(Size("", Key(50), 1, &size));
+        ASSERT_TRUE(Between(size, S1 * 50, S2 * 50));
+        ASSERT_OK(Size("", Key(50) + ".suffix", 1, &size));
+        ASSERT_TRUE(Between(size, S1 * 50, S2 * 50));
+
+        std::string cstart_str = Key(compact_start);
+        std::string cend_str = Key(compact_start + 9);
+        Slice cstart = cstart_str;
+        Slice cend = cend_str;
+        ASSERT_OK(dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]));
+      }
+
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+    }
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+                         kSkipPlainTable | kSkipHashIndex));
+}
+
+TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+  do {
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    Random rnd(301);
+    std::string big1 = rnd.RandomString(100000);
+    ASSERT_OK(Put(1, Key(0), rnd.RandomString(10000)));
+    ASSERT_OK(Put(1, Key(1), rnd.RandomString(10000)));
+    ASSERT_OK(Put(1, Key(2), big1));
+    ASSERT_OK(Put(1, Key(3), rnd.RandomString(10000)));
+    ASSERT_OK(Put(1, Key(4), big1));
+    ASSERT_OK(Put(1, Key(5), rnd.RandomString(10000)));
+    ASSERT_OK(Put(1, Key(6), rnd.RandomString(300000)));
+    ASSERT_OK(Put(1, Key(7), rnd.RandomString(10000)));
+
+    // Check sizes across recovery by reopening a few times
+    uint64_t size;
+    for (int run = 0; run < 3; run++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+      ASSERT_OK(Size("", Key(0), 1, &size));
+      ASSERT_TRUE(Between(size, 0, 0));
+      ASSERT_OK(Size("", Key(1), 1, &size));
+      ASSERT_TRUE(Between(size, 10000, 11000));
+      ASSERT_OK(Size("", Key(2), 1, &size));
+      ASSERT_TRUE(Between(size, 20000, 21000));
+      ASSERT_OK(Size("", Key(3), 1, &size));
+      ASSERT_TRUE(Between(size, 120000, 121000));
+      ASSERT_OK(Size("", Key(4), 1, &size));
+      ASSERT_TRUE(Between(size, 130000, 131000));
+      ASSERT_OK(Size("", Key(5), 1, &size));
+      ASSERT_TRUE(Between(size, 230000, 232000));
+      ASSERT_OK(Size("", Key(6), 1, &size));
+      ASSERT_TRUE(Between(size, 240000, 242000));
+      // Ensure some overhead is accounted for, even without including all
+      ASSERT_OK(Size("", Key(7), 1, &size));
+      ASSERT_TRUE(Between(size, 540500, 545000));
+      ASSERT_OK(Size("", Key(8), 1, &size));
+      ASSERT_TRUE(Between(size, 550500, 555000));
+
+      ASSERT_OK(Size(Key(3), Key(5), 1, &size));
+      ASSERT_TRUE(Between(size, 110100, 111000));
+
+      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+    }
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
+  } while (ChangeOptions(kSkipPlainTable));
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, Snapshot) {
+  env_->SetMockSleep();
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    ASSERT_OK(Put(0, "foo", "0v1"));
+    ASSERT_OK(Put(1, "foo", "1v1"));
+
+    const Snapshot* s1 = db_->GetSnapshot();
+    ASSERT_EQ(1U, GetNumSnapshots());
+    uint64_t time_snap1 = GetTimeOldestSnapshots();
+    ASSERT_GT(time_snap1, 0U);
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_EQ(GetTimeOldestSnapshots(),
+              static_cast<uint64_t>(s1->GetUnixTime()));
+    ASSERT_OK(Put(0, "foo", "0v2"));
+    ASSERT_OK(Put(1, "foo", "1v2"));
+
+    env_->MockSleepForSeconds(1);
+
+    const Snapshot* s2 = db_->GetSnapshot();
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_EQ(GetTimeOldestSnapshots(),
+              static_cast<uint64_t>(s1->GetUnixTime()));
+    ASSERT_OK(Put(0, "foo", "0v3"));
+    ASSERT_OK(Put(1, "foo", "1v3"));
+
+    {
+      ManagedSnapshot s3(db_);
+      ASSERT_EQ(3U, GetNumSnapshots());
+      ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+      ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+      ASSERT_EQ(GetTimeOldestSnapshots(),
+                static_cast<uint64_t>(s1->GetUnixTime()));
+
+      ASSERT_OK(Put(0, "foo", "0v4"));
+      ASSERT_OK(Put(1, "foo", "1v4"));
+      ASSERT_EQ("0v1", Get(0, "foo", s1));
+      ASSERT_EQ("1v1", Get(1, "foo", s1));
+      ASSERT_EQ("0v2", Get(0, "foo", s2));
+      ASSERT_EQ("1v2", Get(1, "foo", s2));
+      ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+      ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+      ASSERT_EQ("0v4", Get(0, "foo"));
+      ASSERT_EQ("1v4", Get(1, "foo"));
+    }
+
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+    ASSERT_EQ(GetTimeOldestSnapshots(),
+              static_cast<uint64_t>(s1->GetUnixTime()));
+    ASSERT_EQ("0v1", Get(0, "foo", s1));
+    ASSERT_EQ("1v1", Get(1, "foo", s1));
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+
+    db_->ReleaseSnapshot(s1);
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+    ASSERT_EQ(1U, GetNumSnapshots());
+    ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+    ASSERT_EQ(GetTimeOldestSnapshots(),
+              static_cast<uint64_t>(s2->GetUnixTime()));
+
+    db_->ReleaseSnapshot(s2);
+    ASSERT_EQ(0U, GetNumSnapshots());
+    ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+  } while (ChangeOptions());
+}
+
+TEST_F(DBTest, HiddenValuesAreRemoved) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  uint64_t size;
+  do {
+    Options options = CurrentOptions(options_override);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    Random rnd(301);
+    FillLevels("a", "z", 1);
+
+    std::string big = rnd.RandomString(50000);
+    ASSERT_OK(Put(1, "foo", big));
+    ASSERT_OK(Put(1, "pastfoo", "v"));
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ASSERT_OK(Put(1, "foo", "tiny"));
+    ASSERT_OK(Put(1, "pastfoo2", "v2"));  // Advance sequence number one more
+
+    ASSERT_OK(Flush(1));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+
+    ASSERT_EQ(big, Get(1, "foo", snapshot));
+    ASSERT_OK(Size("", "pastfoo", 1, &size));
+    ASSERT_TRUE(Between(size, 50000, 60000));
+    db_->ReleaseSnapshot(snapshot);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
+    Slice x("x");
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+
+    ASSERT_OK(Size("", "pastfoo", 1, &size));
+    ASSERT_TRUE(Between(size, 0, 1000));
+    // ApproximateOffsetOf() is not yet implemented in plain table format,
+    // which is used by Size().
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+                         kSkipPlainTable));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, UnremovableSingleDelete) {
+  // If we compact:
+  //
+  // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2)
+  //
+  // We do not want to end up with:
+  //
+  // Put(A, v1) Snapshot Put(A, v2)
+  //
+  // Because a subsequent SingleDelete(A) would delete the Put(A, v2)
+  // but not Put(A, v1), so Get(A) would return v1.
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    Options options = CurrentOptions(options_override);
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_OK(Put(1, "foo", "first"));
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "second"));
+    ASSERT_OK(Flush(1));
+
+    ASSERT_EQ("first", Get(1, "foo", snapshot));
+    ASSERT_EQ("second", Get(1, "foo"));
+
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+    ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1));
+
+    ASSERT_OK(SingleDelete(1, "foo"));
+
+    ASSERT_EQ("first", Get(1, "foo", snapshot));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+                                     nullptr, nullptr));
+
+    ASSERT_EQ("first", Get(1, "foo", snapshot));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    db_->ReleaseSnapshot(snapshot);
+    // Skip FIFO and universal compaction because they do not apply to the test
+    // case. Skip MergePut because single delete does not get removed when it
+    // encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DeletionMarkers1) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(1, "foo", "v1"));
+  ASSERT_OK(Flush(1));
+  const int last = 2;
+  MoveFilesToLevel(last, 1);
+  // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  ASSERT_OK(Put(1, "a", "begin"));
+  ASSERT_OK(Put(1, "z", "end"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(last - 1, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+  ASSERT_OK(Delete(1, "foo"));
+  ASSERT_OK(Put(1, "foo", "v2"));
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+  ASSERT_OK(Flush(1));  // Moves to level last-2
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+  Slice z("z");
+  ASSERT_OK(dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]));
+  // DEL eliminated, but v1 remains because we aren't compacting that level
+  // (DEL can be eliminated because v2 hides v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+  ASSERT_OK(
+      dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]));
+  // Merging last-1 w/ last, so we are the base level for "foo", so
+  // DEL is removed.  (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+}
+
+TEST_F(DBTest, DeletionMarkers2) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(1, "foo", "v1"));
+  ASSERT_OK(Flush(1));
+  const int last = 2;
+  MoveFilesToLevel(last, 1);
+  // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  ASSERT_OK(Put(1, "a", "begin"));
+  ASSERT_OK(Put(1, "z", "end"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(last - 1, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+  ASSERT_OK(Delete(1, "foo"));
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  ASSERT_OK(Flush(1));  // Moves to level last-2
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  ASSERT_OK(
+      dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]));
+  // DEL kept: "last" file overlaps
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  ASSERT_OK(
+      dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]));
+  // Merging last-1 w/ last, so we are the base level for "foo", so
+  // DEL is removed.  (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+}
+
+TEST_F(DBTest, OverlapInLevel0) {
+  do {
+    Options options = CurrentOptions();
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Fill levels 1 and 2 to disable the pushing of new memtables to levels >
+    // 0.
+    ASSERT_OK(Put(1, "100", "v100"));
+    ASSERT_OK(Put(1, "999", "v999"));
+    ASSERT_OK(Flush(1));
+    MoveFilesToLevel(2, 1);
+    ASSERT_OK(Delete(1, "100"));
+    ASSERT_OK(Delete(1, "999"));
+    ASSERT_OK(Flush(1));
+    MoveFilesToLevel(1, 1);
+    ASSERT_EQ("0,1,1", FilesPerLevel(1));
+
+    // Make files spanning the following ranges in level-0:
+    //  files[0]  200 .. 900
+    //  files[1]  300 .. 500
+    // Note that files are sorted by smallest key.
+    ASSERT_OK(Put(1, "300", "v300"));
+    ASSERT_OK(Put(1, "500", "v500"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "200", "v200"));
+    ASSERT_OK(Put(1, "600", "v600"));
+    ASSERT_OK(Put(1, "900", "v900"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("2,1,1", FilesPerLevel(1));
+
+    // BEGIN addition to existing test
+    // Take this opportunity to verify SST unique ids (including Plain table)
+    TablePropertiesCollection tbc;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &tbc));
+    VerifySstUniqueIds(tbc);
+    // END addition to existing test
+
+    // Compact away the placeholder files we created initially
+    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+    ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]));
+    ASSERT_EQ("2", FilesPerLevel(1));
+
+    // Do a memtable compaction.  Before bug-fix, the compaction would
+    // not detect the overlap with level-0 files and would incorrectly place
+    // the deletion in a deeper level.
+    ASSERT_OK(Delete(1, "600"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("3", FilesPerLevel(1));
+    ASSERT_EQ("NOT_FOUND", Get(1, "600"));
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, ComparatorCheck) {
+  class NewComparator : public Comparator {
+   public:
+    const char* Name() const override { return "rocksdb.NewComparator"; }
+    int Compare(const Slice& a, const Slice& b) const override {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    void FindShortestSeparator(std::string* s, const Slice& l) const override {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    void FindShortSuccessor(std::string* key) const override {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  Options new_options, options;
+  NewComparator cmp;
+  do {
+    options = CurrentOptions();
+    CreateAndReopenWithCF({"pikachu"}, options);
+    new_options = CurrentOptions();
+    new_options.comparator = &cmp;
+    // only the non-default column family has non-matching comparator
+    Status s = TryReopenWithColumnFamilies(
+        {"default", "pikachu"}, std::vector<Options>({options, new_options}));
+    ASSERT_TRUE(!s.ok());
+    ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+        << s.ToString();
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, CustomComparator) {
+  class NumberComparator : public Comparator {
+   public:
+    const char* Name() const override { return "test.NumberComparator"; }
+    int Compare(const Slice& a, const Slice& b) const override {
+      return ToNumber(a) - ToNumber(b);
+    }
+    void FindShortestSeparator(std::string* s, const Slice& l) const override {
+      ToNumber(*s);  // Check format
+      ToNumber(l);   // Check format
+    }
+    void FindShortSuccessor(std::string* key) const override {
+      ToNumber(*key);  // Check format
+    }
+
+   private:
+    static int ToNumber(const Slice& x) {
+      // Check that there are no extra characters.
+      EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
+          << EscapeString(x);
+      int val;
+      char ignored;
+      EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+          << EscapeString(x);
+      return val;
+    }
+  };
+  Options new_options;
+  NumberComparator cmp;
+  do {
+    new_options = CurrentOptions();
+    new_options.create_if_missing = true;
+    new_options.comparator = &cmp;
+    new_options.write_buffer_size = 4096;  // Compact more often
+    new_options.arena_block_size = 4096;
+    new_options = CurrentOptions(new_options);
+    DestroyAndReopen(new_options);
+    CreateAndReopenWithCF({"pikachu"}, new_options);
+    ASSERT_OK(Put(1, "[10]", "ten"));
+    ASSERT_OK(Put(1, "[0x14]", "twenty"));
+    for (int i = 0; i < 2; i++) {
+      ASSERT_EQ("ten", Get(1, "[10]"));
+      ASSERT_EQ("ten", Get(1, "[0xa]"));
+      ASSERT_EQ("twenty", Get(1, "[20]"));
+      ASSERT_EQ("twenty", Get(1, "[0x14]"));
+      ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
+      ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
+      Compact(1, "[0]", "[9999]");
+    }
+
+    for (int run = 0; run < 2; run++) {
+      for (int i = 0; i < 1000; i++) {
+        char buf[100];
+        snprintf(buf, sizeof(buf), "[%d]", i * 10);
+        ASSERT_OK(Put(1, buf, buf));
+      }
+      Compact(1, "[0]", "[1000000]");
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, DBOpen_Options) {
+  Options options = CurrentOptions();
+  std::string dbname = test::PerThreadDBPath("db_options_test");
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  // Does not exist, and create_if_missing == false: error
+  DB* db = nullptr;
+  options.create_if_missing = false;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does not exist, and create_if_missing == true: OK
+  options.create_if_missing = true;
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+
+  // Does exist, and error_if_exists == true: error
+  options.create_if_missing = false;
+  options.error_if_exists = true;
+  s = DB::Open(options, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does exist, and error_if_exists == false: OK
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+}
+
+TEST_F(DBTest, DBOpen_Change_NumLevels) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_TRUE(db_ != nullptr);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "a", "123"));
+  ASSERT_OK(Put(1, "b", "234"));
+  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(3, 1);
+  Close();
+
+  options.create_if_missing = false;
+  options.num_levels = 2;
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
+  ASSERT_TRUE(db_ == nullptr);
+}
+
+TEST_F(DBTest, DestroyDBMetaDatabase) {
+  std::string dbname = test::PerThreadDBPath("db_meta");
+  ASSERT_OK(env_->CreateDirIfMissing(dbname));
+  std::string metadbname = MetaDatabaseName(dbname, 0);
+  ASSERT_OK(env_->CreateDirIfMissing(metadbname));
+  std::string metametadbname = MetaDatabaseName(metadbname, 0);
+  ASSERT_OK(env_->CreateDirIfMissing(metametadbname));
+
+  // Destroy previous versions if they exist. Using the long way.
+  Options options = CurrentOptions();
+  ASSERT_OK(DestroyDB(metametadbname, options));
+  ASSERT_OK(DestroyDB(metadbname, options));
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  // Setup databases
+  DB* db = nullptr;
+  ASSERT_OK(DB::Open(options, dbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(options, metadbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(options, metametadbname, &db));
+  delete db;
+  db = nullptr;
+
+  // Delete databases
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  // Check if deletion worked.
+  options.create_if_missing = false;
+  ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SnapshotFiles) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 100000000;  // Large write buffer
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    Random rnd(301);
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    for (int i = 0; i < 80; i++) {
+      values.push_back(rnd.RandomString(100000));
+      ASSERT_OK(Put((i < 40), Key(i), values[i]));
+    }
+
+    // assert that nothing makes it to disk yet.
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+
+    // get a file snapshot
+    uint64_t manifest_number = 0;
+    uint64_t manifest_size = 0;
+    std::vector<std::string> files;
+    ASSERT_OK(dbfull()->DisableFileDeletions());
+    ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+    // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF)
+    ASSERT_EQ(files.size(), 5U);
+
+    uint64_t number = 0;
+    FileType type;
+
+    // copy these files to a new snapshot directory
+    std::string snapdir = dbname_ + ".snapdir/";
+    if (env_->FileExists(snapdir).ok()) {
+      ASSERT_OK(DestroyDir(env_, snapdir));
+    }
+    ASSERT_OK(env_->CreateDir(snapdir));
+
+    for (size_t i = 0; i < files.size(); i++) {
+      // our clients require that GetLiveFiles returns
+      // files with "/" as first character!
+      ASSERT_EQ(files[i][0], '/');
+      std::string src = dbname_ + files[i];
+      std::string dest = snapdir + files[i];
+
+      uint64_t size;
+      ASSERT_OK(env_->GetFileSize(src, &size));
+
+      // record the number and the size of the
+      // latest manifest file
+      if (ParseFileName(files[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          ASSERT_EQ(manifest_number, 0);
+          manifest_number = number;
+          ASSERT_GE(size, manifest_size);
+          size = manifest_size;  // copy only valid MANIFEST data
+        }
+      }
+      CopyFile(src, dest, size);
+    }
+
+    // release file snapshot
+    ASSERT_OK(dbfull()->EnableFileDeletions(/*force*/ false));
+    // overwrite one key, this key should not appear in the snapshot
+    std::vector<std::string> extras;
+    for (unsigned int i = 0; i < 1; i++) {
+      extras.push_back(rnd.RandomString(100000));
+      ASSERT_OK(Put(0, Key(i), extras[i]));
+    }
+
+    // verify that data in the snapshot are correct
+    std::vector<ColumnFamilyDescriptor> column_families;
+    column_families.emplace_back("default", ColumnFamilyOptions());
+    column_families.emplace_back("pikachu", ColumnFamilyOptions());
+    std::vector<ColumnFamilyHandle*> cf_handles;
+    DB* snapdb;
+    DBOptions opts;
+    opts.env = env_;
+    opts.create_if_missing = false;
+    Status stat =
+        DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
+    ASSERT_OK(stat);
+
+    ReadOptions roptions;
+    std::string val;
+    for (unsigned int i = 0; i < 80; i++) {
+      ASSERT_OK(snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val));
+      ASSERT_EQ(values[i].compare(val), 0);
+    }
+    for (auto cfh : cf_handles) {
+      delete cfh;
+    }
+    delete snapdb;
+
+    // look at the new live files after we added an 'extra' key
+    // and after we took the first snapshot.
+    uint64_t new_manifest_number = 0;
+    uint64_t new_manifest_size = 0;
+    std::vector<std::string> newfiles;
+    ASSERT_OK(dbfull()->DisableFileDeletions());
+    ASSERT_OK(dbfull()->GetLiveFiles(newfiles, &new_manifest_size));
+
+    // find the new manifest file. assert that this manifest file is
+    // the same one as in the previous snapshot. But its size should be
+    // larger because we added an extra key after taking the
+    // previous shapshot.
+    for (size_t i = 0; i < newfiles.size(); i++) {
+      std::string src = dbname_ + "/" + newfiles[i];
+      // record the lognumber and the size of the
+      // latest manifest file
+      if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          ASSERT_EQ(new_manifest_number, 0);
+          uint64_t size;
+          new_manifest_number = number;
+          ASSERT_OK(env_->GetFileSize(src, &size));
+          ASSERT_GE(size, new_manifest_size);
+        }
+      }
+    }
+    ASSERT_EQ(manifest_number, new_manifest_number);
+    ASSERT_GT(new_manifest_size, manifest_size);
+
+    // Also test GetLiveFilesStorageInfo
+    std::vector<LiveFileStorageInfo> new_infos;
+    ASSERT_OK(db_->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(),
+                                           &new_infos));
+
+    // Close DB (while deletions disabled)
+    Close();
+
+    // Validate
+    for (auto& info : new_infos) {
+      std::string path = info.directory + "/" + info.relative_filename;
+      uint64_t size;
+      ASSERT_OK(env_->GetFileSize(path, &size));
+      if (info.trim_to_size) {
+        ASSERT_LE(info.size, size);
+      } else if (!info.replacement_contents.empty()) {
+        ASSERT_EQ(info.size, info.replacement_contents.size());
+      } else {
+        ASSERT_EQ(info.size, size);
+      }
+      if (info.file_type == kDescriptorFile) {
+        ASSERT_EQ(info.file_number, manifest_number);
+      }
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) {
+  do {
+    Options options = CurrentOptions();
+    options.level0_file_num_compaction_trigger = 2;
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    Close();
+    ASSERT_OK(ReadOnlyReopen(options));
+
+    uint64_t manifest_size = 0;
+    std::vector<std::string> files;
+    ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+    for (const std::string& f : files) {
+      uint64_t number = 0;
+      FileType type;
+      if (ParseFileName(f.substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          uint64_t size_on_disk;
+          ASSERT_OK(env_->GetFileSize(dbname_ + "/" + f, &size_on_disk));
+          ASSERT_EQ(manifest_size, size_on_disk);
+          break;
+        }
+      }
+    }
+    Close();
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, GetLiveBlobFiles) {
+  // Note: the following prevents an otherwise harmless data race between the
+  // test setup code (AddBlobFile) below and the periodic stat dumping thread.
+  Options options = CurrentOptions();
+  options.stats_dump_period_sec = 0;
+
+  constexpr uint64_t blob_file_number = 234;
+  constexpr uint64_t total_blob_count = 555;
+  constexpr uint64_t total_blob_bytes = 66666;
+  constexpr char checksum_method[] = "CRC32";
+  constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+  constexpr uint64_t garbage_blob_count = 0;
+  constexpr uint64_t garbage_blob_bytes = 0;
+
+  Reopen(options);
+
+  AddBlobFile(db_->DefaultColumnFamily(), blob_file_number, total_blob_count,
+              total_blob_bytes, checksum_method, checksum_value,
+              garbage_blob_count, garbage_blob_bytes);
+  // Make sure it appears in the results returned by GetLiveFiles.
+  uint64_t manifest_size = 0;
+  std::vector<std::string> files;
+  ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+  ASSERT_FALSE(files.empty());
+  ASSERT_EQ(files[0], BlobFileName("", blob_file_number));
+
+  ColumnFamilyMetaData cfmd;
+
+  db_->GetColumnFamilyMetaData(&cfmd);
+  ASSERT_EQ(cfmd.blob_files.size(), 1);
+  const BlobMetaData& bmd = cfmd.blob_files[0];
+
+  CheckBlobMetaData(bmd, blob_file_number, total_blob_count, total_blob_bytes,
+                    checksum_method, checksum_value, garbage_blob_count,
+                    garbage_blob_bytes);
+  ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_));
+  ASSERT_EQ(cfmd.blob_file_count, 1U);
+  ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size);
+}
+#endif
+
+TEST_F(DBTest, PurgeInfoLogs) {
+  Options options = CurrentOptions();
+  options.keep_log_file_num = 5;
+  options.create_if_missing = true;
+  options.env = env_;
+  for (int mode = 0; mode <= 1; mode++) {
+    if (mode == 1) {
+      options.db_log_dir = dbname_ + "_logs";
+      ASSERT_OK(env_->CreateDirIfMissing(options.db_log_dir));
+    } else {
+      options.db_log_dir = "";
+    }
+    for (int i = 0; i < 8; i++) {
+      Reopen(options);
+    }
+
+    std::vector<std::string> files;
+    ASSERT_OK(env_->GetChildren(
+        options.db_log_dir.empty() ? dbname_ : options.db_log_dir, &files));
+    int info_log_count = 0;
+    for (std::string file : files) {
+      if (file.find("LOG") != std::string::npos) {
+        info_log_count++;
+      }
+    }
+    ASSERT_EQ(5, info_log_count);
+
+    Destroy(options);
+    // For mode (1), test DestroyDB() to delete all the logs under DB dir.
+    // For mode (2), no info log file should have been put under DB dir.
+    // Since dbname_ has no children, there is no need to loop db_files
+    std::vector<std::string> db_files;
+    ASSERT_TRUE(env_->GetChildren(dbname_, &db_files).IsNotFound());
+    ASSERT_TRUE(db_files.empty());
+
+    if (mode == 1) {
+      // Cleaning up
+      ASSERT_OK(env_->GetChildren(options.db_log_dir, &files));
+      for (std::string file : files) {
+        ASSERT_OK(env_->DeleteFile(options.db_log_dir + "/" + file));
+      }
+      ASSERT_OK(env_->DeleteDir(options.db_log_dir));
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE
+// Multi-threaded test:
+namespace {
+
+static const int kColumnFamilies = 10;
+static const int kNumThreads = 10;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {
+  DBTest* test;
+  std::atomic<int> counter[kNumThreads];
+};
+
+struct MTThread {
+  MTState* state;
+  int id;
+  bool multiget_batched;
+};
+
+static void MTThreadBody(void* arg) {
+  MTThread* t = reinterpret_cast<MTThread*>(arg);
+  int id = t->id;
+  DB* db = t->state->test->db_;
+  int counter = 0;
+  std::shared_ptr<SystemClock> clock = SystemClock::Default();
+  auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U;
+
+  fprintf(stderr, "... starting thread %d\n", id);
+  Random rnd(1000 + id);
+  char valbuf[1500];
+  while (clock->NowMicros() < end_micros) {
+    t->state->counter[id].store(counter, std::memory_order_release);
+
+    int key = rnd.Uniform(kNumKeys);
+    char keybuf[20];
+    snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+    if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter, cf, unique_id>.
+      // into each of the CFs
+      // We add some padding for force compactions.
+      int unique_id = rnd.Uniform(1000000);
+
+      // Half of the time directly use WriteBatch. Half of the time use
+      // WriteBatchWithIndex.
+      if (rnd.OneIn(2)) {
+        WriteBatch batch;
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                   static_cast<int>(counter), cf, unique_id);
+          ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
+                              Slice(valbuf)));
+        }
+        ASSERT_OK(db->Write(WriteOptions(), &batch));
+      } else {
+        WriteBatchWithIndex batch(db->GetOptions().comparator);
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                   static_cast<int>(counter), cf, unique_id);
+          ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
+                              Slice(valbuf)));
+        }
+        ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
+      }
+    } else {
+      // Read a value and verify that it matches the pattern written above
+      // and that writes to all column families were atomic (unique_id is the
+      // same)
+      std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
+      std::vector<std::string> values;
+      std::vector<Status> statuses;
+      if (!t->multiget_batched) {
+        statuses = db->MultiGet(ReadOptions(), t->state->test->handles_, keys,
+                                &values);
+      } else {
+        std::vector<PinnableSlice> pin_values(keys.size());
+        statuses.resize(keys.size());
+        const Snapshot* snapshot = db->GetSnapshot();
+        ReadOptions ro;
+        ro.snapshot = snapshot;
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          db->MultiGet(ro, t->state->test->handles_[cf], 1, &keys[cf],
+                       &pin_values[cf], &statuses[cf]);
+        }
+        db->ReleaseSnapshot(snapshot);
+        values.resize(keys.size());
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          if (statuses[cf].ok()) {
+            values[cf].assign(pin_values[cf].data(), pin_values[cf].size());
+          }
+        }
+      }
+      Status s = statuses[0];
+      // all statuses have to be the same
+      for (size_t i = 1; i < statuses.size(); ++i) {
+        // they are either both ok or both not-found
+        ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
+                    (s.IsNotFound() && statuses[i].IsNotFound()));
+      }
+      if (s.IsNotFound()) {
+        // Key has not yet been written
+      } else {
+        // Check that the writer thread counter is >= the counter in the value
+        ASSERT_OK(s);
+        int unique_id = -1;
+        for (int i = 0; i < kColumnFamilies; ++i) {
+          int k, w, c, cf, u;
+          ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c,
+                              &cf, &u))
+              << values[i];
+          ASSERT_EQ(k, key);
+          ASSERT_GE(w, 0);
+          ASSERT_LT(w, kNumThreads);
+          ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
+          ASSERT_EQ(cf, i);
+          if (i == 0) {
+            unique_id = u;
+          } else {
+            // this checks that updates across column families happened
+            // atomically -- all unique ids are the same
+            ASSERT_EQ(u, unique_id);
+          }
+        }
+      }
+    }
+    counter++;
+  }
+  fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+}  // anonymous namespace
+
+class MultiThreadedDBTest
+    : public DBTest,
+      public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+  void SetUp() override {
+    std::tie(option_config_, multiget_batched_) = GetParam();
+  }
+
+  static std::vector<int> GenerateOptionConfigs() {
+    std::vector<int> optionConfigs;
+    for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
+      optionConfigs.push_back(optionConfig);
+    }
+    return optionConfigs;
+  }
+
+  bool multiget_batched_;
+};
+
+TEST_P(MultiThreadedDBTest, MultiThreaded) {
+  if (option_config_ == kPipelinedWrite) return;
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
+  std::vector<std::string> cfs;
+  for (int i = 1; i < kColumnFamilies; ++i) {
+    cfs.push_back(std::to_string(i));
+  }
+  Reopen(options);
+  CreateAndReopenWithCF(cfs, options);
+  // Initialize state
+  MTState mt;
+  mt.test = this;
+  for (int id = 0; id < kNumThreads; id++) {
+    mt.counter[id].store(0, std::memory_order_release);
+  }
+
+  // Start threads
+  MTThread thread[kNumThreads];
+  for (int id = 0; id < kNumThreads; id++) {
+    thread[id].state = &mt;
+    thread[id].id = id;
+    thread[id].multiget_batched = multiget_batched_;
+    env_->StartThread(MTThreadBody, &thread[id]);
+  }
+
+  env_->WaitForJoin();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    MultiThreaded, MultiThreadedDBTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()),
+        ::testing::Bool()));
+#endif  // ROCKSDB_LITE
+
+// Group commit test:
+#if !defined(OS_WIN)
+// Disable this test temporarily on Travis and appveyor as it fails
+// intermittently. Github issue: #4151
+namespace {
+
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
+
+struct GCThread {
+  DB* db;
+  int id;
+  std::atomic<bool> done;
+};
+
+static void GCThreadBody(void* arg) {
+  GCThread* t = reinterpret_cast<GCThread*>(arg);
+  int id = t->id;
+  DB* db = t->db;
+  WriteOptions wo;
+
+  for (int i = 0; i < kGCNumKeys; ++i) {
+    std::string kv(std::to_string(i + id * kGCNumKeys));
+    ASSERT_OK(db->Put(wo, kv, kv));
+  }
+  t->done = true;
+}
+
+}  // anonymous namespace
+
+TEST_F(DBTest, GroupCommitTest) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    Reopen(options);
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+        {{"WriteThread::JoinBatchGroup:BeganWaiting",
+          "DBImpl::WriteImpl:BeforeLeaderEnters"},
+         {"WriteThread::AwaitState:BlockingWaiting",
+          "WriteThread::EnterAsBatchGroupLeader:End"}});
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    // Start threads
+    GCThread thread[kGCNumThreads];
+    for (int id = 0; id < kGCNumThreads; id++) {
+      thread[id].id = id;
+      thread[id].db = db_;
+      thread[id].done = false;
+      env_->StartThread(GCThreadBody, &thread[id]);
+    }
+    env_->WaitForJoin();
+
+    ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
+
+    std::vector<std::string> expected_db;
+    for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+      expected_db.push_back(std::to_string(i));
+    }
+    std::sort(expected_db.begin(), expected_db.end());
+
+    Iterator* itr = db_->NewIterator(ReadOptions());
+    itr->SeekToFirst();
+    for (auto x : expected_db) {
+      ASSERT_TRUE(itr->Valid());
+      ASSERT_EQ(itr->key().ToString(), x);
+      ASSERT_EQ(itr->value().ToString(), x);
+      itr->Next();
+    }
+    ASSERT_TRUE(!itr->Valid());
+    delete itr;
+
+    HistogramData hist_data;
+    options.statistics->histogramData(DB_WRITE, &hist_data);
+    ASSERT_GT(hist_data.average, 0.0);
+  } while (ChangeOptions(kSkipNoSeekToLast));
+}
+#endif  // OS_WIN
+
+namespace {
+using KVMap = std::map<std::string, std::string>;
+}
+
+class ModelDB : public DB {
+ public:
+  class ModelSnapshot : public Snapshot {
+   public:
+    KVMap map_;
+
+    SequenceNumber GetSequenceNumber() const override {
+      // no need to call this
+      assert(false);
+      return 0;
+    }
+
+    int64_t GetUnixTime() const override {
+      // no need to call this
+      assert(false);
+      return 0;
+    }
+
+    uint64_t GetTimestamp() const override {
+      // no need to call this
+      assert(false);
+      return 0;
+    }
+  };
+
+  explicit ModelDB(const Options& options) : options_(options) {}
+  using DB::Put;
+  Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+             const Slice& v) override {
+    WriteBatch batch;
+    Status s = batch.Put(cf, k, v);
+    if (!s.ok()) {
+      return s;
+    }
+    return Write(o, &batch);
+  }
+  Status Put(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+             const Slice& /*k*/, const Slice& /*ts*/,
+             const Slice& /*v*/) override {
+    return Status::NotSupported();
+  }
+
+  using DB::PutEntity;
+  Status PutEntity(const WriteOptions& /* options */,
+                   ColumnFamilyHandle* /* column_family */,
+                   const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    return Status::NotSupported();
+  }
+
+  using DB::Close;
+  Status Close() override { return Status::OK(); }
+  using DB::Delete;
+  Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
+                const Slice& key) override {
+    WriteBatch batch;
+    Status s = batch.Delete(cf, key);
+    if (!s.ok()) {
+      return s;
+    }
+    return Write(o, &batch);
+  }
+  Status Delete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+                const Slice& /*key*/, const Slice& /*ts*/) override {
+    return Status::NotSupported();
+  }
+  using DB::SingleDelete;
+  Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf,
+                      const Slice& key) override {
+    WriteBatch batch;
+    Status s = batch.SingleDelete(cf, key);
+    if (!s.ok()) {
+      return s;
+    }
+    return Write(o, &batch);
+  }
+  Status SingleDelete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+                      const Slice& /*key*/, const Slice& /*ts*/) override {
+    return Status::NotSupported();
+  }
+  using DB::Merge;
+  Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+               const Slice& v) override {
+    WriteBatch batch;
+    Status s = batch.Merge(cf, k, v);
+    if (!s.ok()) {
+      return s;
+    }
+    return Write(o, &batch);
+  }
+  Status Merge(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+               const Slice& /*k*/, const Slice& /*ts*/,
+               const Slice& /*value*/) override {
+    return Status::NotSupported();
+  }
+  using DB::Get;
+  Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/,
+             const Slice& key, PinnableSlice* /*value*/) override {
+    return Status::NotSupported(key);
+  }
+
+  using DB::GetMergeOperands;
+  virtual Status GetMergeOperands(
+      const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
+      const Slice& key, PinnableSlice* /*slice*/,
+      GetMergeOperandsOptions* /*merge_operands_options*/,
+      int* /*number_of_operands*/) override {
+    return Status::NotSupported(key);
+  }
+
+  using DB::MultiGet;
+  std::vector<Status> MultiGet(
+      const ReadOptions& /*options*/,
+      const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* /*values*/) override {
+    std::vector<Status> s(keys.size(),
+                          Status::NotSupported("Not implemented."));
+    return s;
+  }
+
+#ifndef ROCKSDB_LITE
+  using DB::IngestExternalFile;
+  Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*options*/) override {
+    return Status::NotSupported("Not implemented.");
+  }
+
+  using DB::IngestExternalFiles;
+  Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& /*args*/) override {
+    return Status::NotSupported("Not implemented");
+  }
+
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& /*options*/,
+      const std::string& /*column_family_name*/,
+      const ImportColumnFamilyOptions& /*import_options*/,
+      const ExportImportFilesMetaData& /*metadata*/,
+      ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not implemented.");
+  }
+
+  using DB::VerifyChecksum;
+  Status VerifyChecksum(const ReadOptions&) override {
+    return Status::NotSupported("Not implemented.");
+  }
+
+  using DB::GetPropertiesOfAllTables;
+  Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* /*column_family*/,
+      TablePropertiesCollection* /*props*/) override {
+    return Status();
+  }
+
+  Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* /*column_family*/, const Range* /*range*/,
+      std::size_t /*n*/, TablePropertiesCollection* /*props*/) override {
+    return Status();
+  }
+#endif  // ROCKSDB_LITE
+
+  using DB::KeyMayExist;
+  bool KeyMayExist(const ReadOptions& /*options*/,
+                   ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+                   std::string* /*value*/,
+                   bool* value_found = nullptr) override {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;  // Not Supported directly
+  }
+  using DB::NewIterator;
+  Iterator* NewIterator(const ReadOptions& options,
+                        ColumnFamilyHandle* /*column_family*/) override {
+    if (options.snapshot == nullptr) {
+      KVMap* saved = new KVMap;
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+      return new ModelIter(snapshot_state, false);
+    }
+  }
+  Status NewIterators(const ReadOptions& /*options*/,
+                      const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+                      std::vector<Iterator*>* /*iterators*/) override {
+    return Status::NotSupported("Not supported yet");
+  }
+  const Snapshot* GetSnapshot() override {
+    ModelSnapshot* snapshot = new ModelSnapshot;
+    snapshot->map_ = map_;
+    return snapshot;
+  }
+
+  void ReleaseSnapshot(const Snapshot* snapshot) override {
+    delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+  }
+
+  Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override {
+    class Handler : public WriteBatch::Handler {
+     public:
+      KVMap* map_;
+      void Put(const Slice& key, const Slice& value) override {
+        (*map_)[key.ToString()] = value.ToString();
+      }
+      void Merge(const Slice& /*key*/, const Slice& /*value*/) override {
+        // ignore merge for now
+        // (*map_)[key.ToString()] = value.ToString();
+      }
+      void Delete(const Slice& key) override { map_->erase(key.ToString()); }
+    };
+    Handler handler;
+    handler.map_ = &map_;
+    return batch->Iterate(&handler);
+  }
+
+  using DB::GetProperty;
+  bool GetProperty(ColumnFamilyHandle* /*column_family*/,
+                   const Slice& /*property*/, std::string* /*value*/) override {
+    return false;
+  }
+  using DB::GetIntProperty;
+  bool GetIntProperty(ColumnFamilyHandle* /*column_family*/,
+                      const Slice& /*property*/, uint64_t* /*value*/) override {
+    return false;
+  }
+  using DB::GetMapProperty;
+  bool GetMapProperty(ColumnFamilyHandle* /*column_family*/,
+                      const Slice& /*property*/,
+                      std::map<std::string, std::string>* /*value*/) override {
+    return false;
+  }
+  using DB::GetAggregatedIntProperty;
+  bool GetAggregatedIntProperty(const Slice& /*property*/,
+                                uint64_t* /*value*/) override {
+    return false;
+  }
+  using DB::GetApproximateSizes;
+  Status GetApproximateSizes(const SizeApproximationOptions& /*options*/,
+                             ColumnFamilyHandle* /*column_family*/,
+                             const Range* /*range*/, int n,
+                             uint64_t* sizes) override {
+    for (int i = 0; i < n; i++) {
+      sizes[i] = 0;
+    }
+    return Status::OK();
+  }
+  using DB::GetApproximateMemTableStats;
+  void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/,
+                                   const Range& /*range*/,
+                                   uint64_t* const count,
+                                   uint64_t* const size) override {
+    *count = 0;
+    *size = 0;
+  }
+  using DB::CompactRange;
+  Status CompactRange(const CompactRangeOptions& /*options*/,
+                      ColumnFamilyHandle* /*column_family*/,
+                      const Slice* /*start*/, const Slice* /*end*/) override {
+    return Status::NotSupported("Not supported operation.");
+  }
+
+  Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& /*new_options*/)
+      override {
+    return Status::NotSupported("Not supported operation.");
+  }
+
+  using DB::CompactFiles;
+  Status CompactFiles(
+      const CompactionOptions& /*compact_options*/,
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*input_file_names*/,
+      const int /*output_level*/, const int /*output_path_id*/ = -1,
+      std::vector<std::string>* const /*output_file_names*/ = nullptr,
+      CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+    return Status::NotSupported("Not supported operation.");
+  }
+
+  Status PauseBackgroundWork() override {
+    return Status::NotSupported("Not supported operation.");
+  }
+
+  Status ContinueBackgroundWork() override {
+    return Status::NotSupported("Not supported operation.");
+  }
+
+  Status EnableAutoCompaction(
+      const std::vector<ColumnFamilyHandle*>& /*column_family_handles*/)
+      override {
+    return Status::NotSupported("Not supported operation.");
+  }
+
+  void EnableManualCompaction() override { return; }
+
+  void DisableManualCompaction() override { return; }
+
+  using DB::NumberLevels;
+  int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; }
+
+  using DB::MaxMemCompactionLevel;
+  int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override {
+    return 1;
+  }
+
+  using DB::Level0StopWriteTrigger;
+  int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override {
+    return -1;
+  }
+
+  const std::string& GetName() const override { return name_; }
+
+  Env* GetEnv() const override { return nullptr; }
+
+  using DB::GetOptions;
+  Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override {
+    return options_;
+  }
+
+  using DB::GetDBOptions;
+  DBOptions GetDBOptions() const override { return options_; }
+
+  using DB::Flush;
+  Status Flush(const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+               ColumnFamilyHandle* /*column_family*/) override {
+    Status ret;
+    return ret;
+  }
+  Status Flush(
+      const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+      const std::vector<ColumnFamilyHandle*>& /*column_families*/) override {
+    return Status::OK();
+  }
+
+  Status SyncWAL() override { return Status::OK(); }
+
+  Status DisableFileDeletions() override { return Status::OK(); }
+
+  Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); }
+#ifndef ROCKSDB_LITE
+
+  Status GetLiveFiles(std::vector<std::string>&, uint64_t* /*size*/,
+                      bool /*flush_memtable*/ = true) override {
+    return Status::OK();
+  }
+
+  Status GetLiveFilesChecksumInfo(
+      FileChecksumList* /*checksum_list*/) override {
+    return Status::OK();
+  }
+
+  Status GetLiveFilesStorageInfo(
+      const LiveFilesStorageInfoOptions& /*opts*/,
+      std::vector<LiveFileStorageInfo>* /*files*/) override {
+    return Status::OK();
+  }
+
+  Status GetSortedWalFiles(VectorLogPtr& /*files*/) override {
+    return Status::OK();
+  }
+
+  Status GetCurrentWalFile(
+      std::unique_ptr<LogFile>* /*current_log_file*/) override {
+    return Status::OK();
+  }
+
+  virtual Status GetCreationTimeOfOldestFile(
+      uint64_t* /*creation_time*/) override {
+    return Status::NotSupported();
+  }
+
+  Status DeleteFile(std::string /*name*/) override { return Status::OK(); }
+
+  Status GetUpdatesSince(
+      ROCKSDB_NAMESPACE::SequenceNumber,
+      std::unique_ptr<ROCKSDB_NAMESPACE::TransactionLogIterator>*,
+      const TransactionLogIterator::ReadOptions& /*read_options*/ =
+          TransactionLogIterator::ReadOptions()) override {
+    return Status::NotSupported("Not supported in Model DB");
+  }
+
+  void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+                               ColumnFamilyMetaData* /*metadata*/) override {}
+#endif  // ROCKSDB_LITE
+
+  Status GetDbIdentity(std::string& /*identity*/) const override {
+    return Status::OK();
+  }
+
+  Status GetDbSessionId(std::string& /*session_id*/) const override {
+    return Status::OK();
+  }
+
+  SequenceNumber GetLatestSequenceNumber() const override { return 0; }
+
+  Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
+                                  std::string /*ts_low*/) override {
+    return Status::OK();
+  }
+
+  Status GetFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
+                             std::string* /*ts_low*/) override {
+    return Status::OK();
+  }
+
+  ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
+
+ private:
+  class ModelIter : public Iterator {
+   public:
+    ModelIter(const KVMap* map, bool owned)
+        : map_(map), owned_(owned), iter_(map_->end()) {}
+    ~ModelIter() override {
+      if (owned_) delete map_;
+    }
+    bool Valid() const override { return iter_ != map_->end(); }
+    void SeekToFirst() override { iter_ = map_->begin(); }
+    void SeekToLast() override {
+      if (map_->empty()) {
+        iter_ = map_->end();
+      } else {
+        iter_ = map_->find(map_->rbegin()->first);
+      }
+    }
+    void Seek(const Slice& k) override {
+      iter_ = map_->lower_bound(k.ToString());
+    }
+    void SeekForPrev(const Slice& k) override {
+      iter_ = map_->upper_bound(k.ToString());
+      Prev();
+    }
+    void Next() override { ++iter_; }
+    void Prev() override {
+      if (iter_ == map_->begin()) {
+        iter_ = map_->end();
+        return;
+      }
+      --iter_;
+    }
+
+    Slice key() const override { return iter_->first; }
+    Slice value() const override { return iter_->second; }
+    Status status() const override { return Status::OK(); }
+
+   private:
+    const KVMap* const map_;
+    const bool owned_;  // Do we own map_
+    KVMap::const_iterator iter_;
+  };
+  const Options options_;
+  KVMap map_;
+  std::string name_ = "";
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+static std::string RandomKey(Random* rnd, int minimum = 0) {
+  int len;
+  do {
+    len = (rnd->OneIn(3)
+               ? 1  // Short sometimes to encourage collisions
+               : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+  } while (len < minimum);
+  return test::RandomKey(rnd, len);
+}
+
+static bool CompareIterators(int step, DB* model, DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
+    }
+
+    if (miter->value().compare(dbiter->value()) != 0) {
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step, EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(dbiter->value()).c_str());
+      ok = false;
+    }
+  }
+
+  if (ok) {
+    if (miter->Valid() != dbiter->Valid()) {
+      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+              step, miter->Valid(), dbiter->Valid());
+      ok = false;
+    }
+  }
+  delete miter;
+  delete dbiter;
+  return ok;
+}
+
+class DBTestRandomized : public DBTest,
+                         public ::testing::WithParamInterface<int> {
+ public:
+  void SetUp() override { option_config_ = GetParam(); }
+
+  static std::vector<int> GenerateOptionConfigs() {
+    std::vector<int> option_configs;
+    // skip cuckoo hash as it does not support snapshot.
+    for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+      if (!ShouldSkipOptions(option_config,
+                             kSkipDeletesFilterFirst | kSkipNoSeekToLast)) {
+        option_configs.push_back(option_config);
+      }
+    }
+    option_configs.push_back(kBlockBasedTableWithIndexRestartInterval);
+    return option_configs;
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBTestRandomized, DBTestRandomized,
+    ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs()));
+
+TEST_P(DBTestRandomized, Randomized) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
+  DestroyAndReopen(options);
+
+  Random rnd(test::RandomSeed() + GetParam());
+  ModelDB model(options);
+  const int N = 10000;
+  const Snapshot* model_snap = nullptr;
+  const Snapshot* db_snap = nullptr;
+  std::string k, v;
+  for (int step = 0; step < N; step++) {
+    // TODO(sanjay): Test Get() works
+    int p = rnd.Uniform(100);
+    int minimum = 0;
+    if (option_config_ == kHashSkipList || option_config_ == kHashLinkList ||
+        option_config_ == kPlainTableFirstBytePrefix ||
+        option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+        option_config_ == kBlockBasedTableWithPrefixHashIndex) {
+      minimum = 1;
+    }
+    if (p < 45) {  // Put
+      k = RandomKey(&rnd, minimum);
+      v = rnd.RandomString(rnd.OneIn(20) ? 100 + rnd.Uniform(100)
+                                         : rnd.Uniform(8));
+      ASSERT_OK(model.Put(WriteOptions(), k, v));
+      ASSERT_OK(db_->Put(WriteOptions(), k, v));
+    } else if (p < 90) {  // Delete
+      k = RandomKey(&rnd, minimum);
+      ASSERT_OK(model.Delete(WriteOptions(), k));
+      ASSERT_OK(db_->Delete(WriteOptions(), k));
+    } else {  // Multi-element batch
+      WriteBatch b;
+      const int num = rnd.Uniform(8);
+      for (int i = 0; i < num; i++) {
+        if (i == 0 || !rnd.OneIn(10)) {
+          k = RandomKey(&rnd, minimum);
+        } else {
+          // Periodically re-use the same key from the previous iter, so
+          // we have multiple entries in the write batch for the same key
+        }
+        if (rnd.OneIn(2)) {
+          v = rnd.RandomString(rnd.Uniform(10));
+          ASSERT_OK(b.Put(k, v));
+        } else {
+          ASSERT_OK(b.Delete(k));
+        }
+      }
+      ASSERT_OK(model.Write(WriteOptions(), &b));
+      ASSERT_OK(db_->Write(WriteOptions(), &b));
+    }
+
+    if ((step % 100) == 0) {
+      // For DB instances that use the hash index + block-based table, the
+      // iterator will be invalid right when seeking a non-existent key, right
+      // than return a key that is close to it.
+      if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+          option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+      }
+
+      // Save a snapshot from each DB this time that we'll use next
+      // time we compare things, to make sure the current state is
+      // preserved with the snapshot
+      if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+      if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+      Reopen(options);
+      ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+      model_snap = model.GetSnapshot();
+      db_snap = db_->GetSnapshot();
+    }
+  }
+  if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+  if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  Reopen(options);
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("k2", "v2"));
+
+  // Reopen with different prefix extractor, make sure everything still works.
+  // RocksDB should just fall back to the binary index.
+  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+
+  Reopen(options);
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+
+#ifndef ROCKSDB_LITE
+  // Back to original
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+#endif  // !ROCKSDB_LITE
+
+  // Same if there's a problem initally loading prefix transform
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+      [&](void* arg) { *static_cast<bool*>(arg) = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+
+#ifndef ROCKSDB_LITE
+  // Change again
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+#endif  // !ROCKSDB_LITE
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Reopen with no prefix extractor, make sure everything still works.
+  // RocksDB should just fall back to the binary index.
+  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset();
+
+  Reopen(options);
+  ASSERT_EQ("v1", Get("k1"));
+  ASSERT_EQ("v2", Get("k2"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewCappedPrefixTransform(2));
+
+  Reopen(options);
+  ASSERT_OK(Put("kk1", "v1"));
+  ASSERT_OK(Put("kk2", "v2"));
+  ASSERT_OK(Put("kk", "v3"));
+  ASSERT_OK(Put("k", "v4"));
+  Flush();
+
+  ASSERT_EQ("v1", Get("kk1"));
+  ASSERT_EQ("v2", Get("kk2"));
+
+  ASSERT_EQ("v3", Get("kk"));
+  ASSERT_EQ("v4", Get("k"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  options.max_open_files = 10;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  // RocksDB sanitize max open files to at least 20. Modify it back.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = static_cast<int*>(arg);
+        *max_open_files = 11;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 1;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Force evict tables
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  // Make table cache to keep one entry.
+  dbfull()->TEST_table_cache()->SetCapacity(1);
+
+  ReadOptions read_options;
+  read_options.total_order_seek = true;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->Seek("k1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("k1", iter->key().ToString());
+  }
+
+  // After total order seek, prefix index should still be used.
+  read_options.total_order_seek = false;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    iter->Seek("k1");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("k1", iter->key().ToString());
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, ChecksumTest) {
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Flush());  // table with crc checksum
+
+  table_options.checksum = kxxHash;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_OK(Put("e", "f"));
+  ASSERT_OK(Put("g", "h"));
+  ASSERT_OK(Flush());  // table with xxhash checksum
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_EQ("b", Get("a"));
+  ASSERT_EQ("d", Get("c"));
+  ASSERT_EQ("f", Get("e"));
+  ASSERT_EQ("h", Get("g"));
+
+  table_options.checksum = kCRC32c;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_EQ("b", Get("a"));
+  ASSERT_EQ("d", Get("c"));
+  ASSERT_EQ("f", Get("e"));
+  ASSERT_EQ("h", Get("g"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, FIFOCompactionTest) {
+  for (int iter = 0; iter < 2; ++iter) {
+    // first iteration -- auto compaction
+    // second iteration -- manual compaction
+    Options options;
+    options.compaction_style = kCompactionStyleFIFO;
+    options.write_buffer_size = 100 << 10;  // 100KB
+    options.arena_block_size = 4096;
+    options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 500KB
+    options.compression = kNoCompression;
+    options.create_if_missing = true;
+    options.max_subcompactions = max_subcompactions_;
+    if (iter == 1) {
+      options.disable_auto_compactions = true;
+    }
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 6; ++i) {
+      for (int j = 0; j < 110; ++j) {
+        ASSERT_OK(Put(std::to_string(i * 100 + j), rnd.RandomString(980)));
+      }
+      // flush should happen here
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    if (iter == 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    } else {
+      CompactRangeOptions cro;
+      cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+      ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    }
+    // only 5 files should survive
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+    for (int i = 0; i < 50; ++i) {
+      // these keys should be deleted in previous compaction
+      ASSERT_EQ("NOT_FOUND", Get(std::to_string(i)));
+    }
+  }
+}
+
+TEST_F(DBTest, FIFOCompactionTestWithCompaction) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 20 << 10;  // 20K
+  options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10;  // 1MB
+  options.compaction_options_fifo.allow_compaction = true;
+  options.level0_file_num_compaction_trigger = 6;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 60; i++) {
+    // Generate and flush a file about 20KB.
+    for (int j = 0; j < 20; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  // It should be compacted to 10 files.
+  ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  for (int i = 0; i < 60; i++) {
+    // Generate and flush a file about 20KB.
+    for (int j = 0; j < 20; j++) {
+      ASSERT_OK(Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  // It should be compacted to no more than 20 files.
+  ASSERT_GT(NumTableFilesAtLevel(0), 10);
+  ASSERT_LT(NumTableFilesAtLevel(0), 18);
+  // Size limit is still guaranteed.
+  ASSERT_LE(SizeAtLevel(0),
+            options.compaction_options_fifo.max_table_files_size);
+}
+
+TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 20 << 10;  // 20K
+  options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10;  // 1MB
+  options.compaction_options_fifo.allow_compaction = true;
+  options.level0_file_num_compaction_trigger = 3;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 3; i++) {
+    // Each file contains a different key which will be dropped later.
+    ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
+    ASSERT_OK(Put("key" + std::to_string(i), ""));
+    ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  for (int i = 0; i < 3; i++) {
+    ASSERT_EQ("", Get("key" + std::to_string(i)));
+  }
+  for (int i = 0; i < 3; i++) {
+    // Each file contains a different key which will be dropped later.
+    ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
+    ASSERT_OK(Delete("key" + std::to_string(i)));
+    ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  for (int i = 0; i < 3; i++) {
+    ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
+  }
+}
+
+// Check that FIFO-with-TTL is not supported with max_open_files != -1.
+// Github issue #8014
+TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleFIFO;
+  options.create_if_missing = true;
+  options.ttl = 600;  // seconds
+
+  // TTL is not supported with max_open_files != -1.
+  options.max_open_files = 0;
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+  options.max_open_files = 100;
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+  // TTL is supported with unlimited max_open_files
+  options.max_open_files = -1;
+  ASSERT_OK(TryReopen(options));
+}
+
+// Check that FIFO-with-TTL is supported only with BlockBasedTableFactory.
+TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.create_if_missing = true;
+  options.ttl = 600;  // seconds
+
+  options = CurrentOptions(options);
+  options.table_factory.reset(NewBlockBasedTableFactory());
+  ASSERT_OK(TryReopen(options));
+
+  Destroy(options);
+  options.table_factory.reset(NewPlainTableFactory());
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+  Destroy(options);
+  options.table_factory.reset(NewAdaptiveTableFactory());
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTest, FIFOCompactionWithTTLTest) {
+  Options options;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.write_buffer_size = 10 << 10;  // 10KB
+  options.arena_block_size = 4096;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // Test to make sure that all files with expired ttl are deleted on next
+  // manual compaction.
+  {
+    // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+    options.compaction_options_fifo.max_table_files_size = 150 << 10;  // 150KB
+    options.compaction_options_fifo.allow_compaction = false;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 10; i++) {
+      // Generate and flush a file about 10KB.
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    // Sleep for 2 hours -- which is much greater than TTL.
+    env_->MockSleepForSeconds(2 * 60 * 60);
+
+    // Since no flushes and compactions have run, the db should still be in
+    // the same state even after considerable time has passed.
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  }
+
+  // Test to make sure that all files with expired ttl are deleted on next
+  // automatic compaction.
+  {
+    options.compaction_options_fifo.max_table_files_size = 150 << 10;  // 150KB
+    options.compaction_options_fifo.allow_compaction = false;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 10; i++) {
+      // Generate and flush a file about 10KB.
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    // Sleep for 2 hours -- which is much greater than TTL.
+    env_->MockSleepForSeconds(2 * 60 * 60);
+    // Just to make sure that we are in the same state even after sleeping.
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    // Create 1 more file to trigger TTL compaction. The old files are dropped.
+    for (int i = 0; i < 1; i++) {
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+    }
+
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Only the new 10 files remain.
+    ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+    ASSERT_LE(SizeAtLevel(0),
+              options.compaction_options_fifo.max_table_files_size);
+  }
+
+  // Test that shows the fall back to size-based FIFO compaction if TTL-based
+  // deletion doesn't move the total size to be less than max_table_files_size.
+  {
+    options.write_buffer_size = 10 << 10;                              // 10KB
+    options.compaction_options_fifo.max_table_files_size = 150 << 10;  // 150KB
+    options.compaction_options_fifo.allow_compaction = false;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 3; i++) {
+      // Generate and flush a file about 10KB.
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+    // Sleep for 2 hours -- which is much greater than TTL.
+    env_->MockSleepForSeconds(2 * 60 * 60);
+    // Just to make sure that we are in the same state even after sleeping.
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+    for (int i = 0; i < 5; i++) {
+      for (int j = 0; j < 140; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    // Size limit is still guaranteed.
+    ASSERT_LE(SizeAtLevel(0),
+              options.compaction_options_fifo.max_table_files_size);
+  }
+
+  // Test with TTL + Intra-L0 compactions.
+  {
+    options.compaction_options_fifo.max_table_files_size = 150 << 10;  // 150KB
+    options.compaction_options_fifo.allow_compaction = true;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options.level0_file_num_compaction_trigger = 6;
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 10; i++) {
+      // Generate and flush a file about 10KB.
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1
+    // (due to level0_file_num_compaction_trigger = 6).
+    // So total files = 1 + remaining 4 = 5.
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+    // Sleep for 2 hours -- which is much greater than TTL.
+    env_->MockSleepForSeconds(2 * 60 * 60);
+    // Just to make sure that we are in the same state even after sleeping.
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+    // Create 10 more files. The old 5 files are dropped as their ttl expired.
+    for (int i = 0; i < 10; i++) {
+      for (int j = 0; j < 10; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+    ASSERT_LE(SizeAtLevel(0),
+              options.compaction_options_fifo.max_table_files_size);
+  }
+
+  // Test with large TTL + Intra-L0 compactions.
+  // Files dropped based on size, as ttl doesn't kick in.
+  {
+    options.write_buffer_size = 20 << 10;                               // 20K
+    options.compaction_options_fifo.max_table_files_size = 1500 << 10;  // 1.5MB
+    options.compaction_options_fifo.allow_compaction = true;
+    options.ttl = 1 * 60 * 60;  // 1 hour
+    options.level0_file_num_compaction_trigger = 6;
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int i = 0; i < 60; i++) {
+      // Generate and flush a file about 20KB.
+      for (int j = 0; j < 20; j++) {
+        ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+    // It should be compacted to 10 files.
+    ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+    for (int i = 0; i < 60; i++) {
+      // Generate and flush a file about 20KB.
+      for (int j = 0; j < 20; j++) {
+        ASSERT_OK(
+            Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+
+    // It should be compacted to no more than 20 files.
+    ASSERT_GT(NumTableFilesAtLevel(0), 10);
+    ASSERT_LT(NumTableFilesAtLevel(0), 18);
+    // Size limit is still guaranteed.
+    ASSERT_LE(SizeAtLevel(0),
+              options.compaction_options_fifo.max_table_files_size);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+/*
+ * This test is not reliable enough as it heavily depends on disk behavior.
+ * Disable as it is flaky.
+ */
+TEST_F(DBTest, DISABLED_RateLimitingTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1 << 20;  // 1MB
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 1 << 20;     // 1MB
+  options.max_bytes_for_level_base = 4 << 20;  // 4MB
+  options.max_bytes_for_level_multiplier = 4;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.env = env_;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.IncreaseParallelism(4);
+  DestroyAndReopen(options);
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  // # no rate limiting
+  Random rnd(301);
+  uint64_t start = env_->NowMicros();
+  // Write ~96M data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+  }
+  uint64_t elapsed = env_->NowMicros() - start;
+  double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed;
+  uint64_t rate_limiter_drains =
+      TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS);
+  ASSERT_EQ(0, rate_limiter_drains);
+  Close();
+
+  // # rate limiting with 0.7 x threshold
+  options.rate_limiter.reset(
+      NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
+  env_->bytes_written_ = 0;
+  DestroyAndReopen(options);
+
+  start = env_->NowMicros();
+  // Write ~96M data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+  }
+  rate_limiter_drains =
+      TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+      rate_limiter_drains;
+  elapsed = env_->NowMicros() - start;
+  Close();
+  ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+  // Most intervals should've been drained (interval time is 100ms, elapsed is
+  // micros)
+  ASSERT_GT(rate_limiter_drains, 0);
+  ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+  double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+  fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
+  ASSERT_TRUE(ratio < 0.8);
+
+  // # rate limiting with half of the raw_rate
+  options.rate_limiter.reset(
+      NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
+  env_->bytes_written_ = 0;
+  DestroyAndReopen(options);
+
+  start = env_->NowMicros();
+  // Write ~96M data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+  }
+  elapsed = env_->NowMicros() - start;
+  rate_limiter_drains =
+      TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+      rate_limiter_drains;
+  Close();
+  ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+  // Most intervals should've been drained (interval time is 100ms, elapsed is
+  // micros)
+  ASSERT_GT(rate_limiter_drains, elapsed / 100000 / 2);
+  ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+  ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+  fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
+  ASSERT_LT(ratio, 0.6);
+}
+
+// This is a mocked customed rate limiter without implementing optional APIs
+// (e.g, RateLimiter::GetTotalPendingRequests())
+class MockedRateLimiterWithNoOptionalAPIImpl : public RateLimiter {
+ public:
+  MockedRateLimiterWithNoOptionalAPIImpl() {}
+
+  ~MockedRateLimiterWithNoOptionalAPIImpl() override {}
+
+  void SetBytesPerSecond(int64_t bytes_per_second) override {
+    (void)bytes_per_second;
+  }
+
+  using RateLimiter::Request;
+  void Request(const int64_t bytes, const Env::IOPriority pri,
+               Statistics* stats) override {
+    (void)bytes;
+    (void)pri;
+    (void)stats;
+  }
+
+  int64_t GetSingleBurstBytes() const override { return 200; }
+
+  int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    (void)pri;
+    return 0;
+  }
+
+  int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    (void)pri;
+    return 0;
+  }
+
+  int64_t GetBytesPerSecond() const override { return 0; }
+};
+
+// To test that customed rate limiter not implementing optional APIs (e.g,
+// RateLimiter::GetTotalPendingRequests()) works fine with RocksDB basic
+// operations (e.g, Put, Get, Flush)
+TEST_F(DBTest, CustomedRateLimiterWithNoOptionalAPIImplTest) {
+  Options options = CurrentOptions();
+  options.rate_limiter.reset(new MockedRateLimiterWithNoOptionalAPIImpl());
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("abc", "def"));
+  ASSERT_EQ(Get("abc"), "def");
+  ASSERT_OK(Flush());
+  ASSERT_EQ(Get("abc"), "def");
+}
+
+TEST_F(DBTest, TableOptionsSanitizeTest) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false);
+
+  options.table_factory.reset(NewPlainTableFactory());
+  options.prefix_extractor.reset(NewNoopTransform());
+  Destroy(options);
+  ASSERT_TRUE(!TryReopen(options).IsNotSupported());
+
+  // Test for check of prefix_extractor when hash index is used for
+  // block-based table
+  BlockBasedTableOptions to;
+  to.index_type = BlockBasedTableOptions::kHashSearch;
+  options = CurrentOptions();
+  options.create_if_missing = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(to));
+  ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBTest, ConcurrentMemtableNotSupported) {
+  Options options = CurrentOptions();
+  options.allow_concurrent_memtable_write = true;
+  options.soft_pending_compaction_bytes_limit = 0;
+  options.hard_pending_compaction_bytes_limit = 100;
+  options.create_if_missing = true;
+
+  DestroyDB(dbname_, options);
+  options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4));
+  ASSERT_NOK(TryReopen(options));
+
+  options.memtable_factory.reset(new SkipListFactory);
+  ASSERT_OK(TryReopen(options));
+
+  ColumnFamilyOptions cf_options(options);
+  cf_options.memtable_factory.reset(
+      NewHashLinkListRepFactory(4, 0, 3, true, 4));
+  ColumnFamilyHandle* handle;
+  ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle));
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, SanitizeNumThreads) {
+  for (int attempt = 0; attempt < 2; attempt++) {
+    const size_t kTotalTasks = 8;
+    test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+
+    Options options = CurrentOptions();
+    if (attempt == 0) {
+      options.max_background_compactions = 3;
+      options.max_background_flushes = 2;
+    }
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+
+    for (size_t i = 0; i < kTotalTasks; i++) {
+      // Insert 5 tasks to low priority queue and 5 tasks to high priority queue
+      env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                     &sleeping_tasks[i],
+                     (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
+    }
+
+    // Wait until 10s for they are scheduled.
+    for (int i = 0; i < 10000; i++) {
+      if (options.env->GetThreadPoolQueueLen(Env::Priority::LOW) <= 1 &&
+          options.env->GetThreadPoolQueueLen(Env::Priority::HIGH) <= 2) {
+        break;
+      }
+      env_->SleepForMicroseconds(1000);
+    }
+
+    // pool size 3, total task 4. Queue size should be 1.
+    ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
+    // pool size 2, total task 4. Queue size should be 2.
+    ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+    for (size_t i = 0; i < kTotalTasks; i++) {
+      sleeping_tasks[i].WakeUp();
+      sleeping_tasks[i].WaitUntilDone();
+    }
+
+    ASSERT_OK(Put("abc", "def"));
+    ASSERT_EQ("def", Get("abc"));
+    ASSERT_OK(Flush());
+    ASSERT_EQ("def", Get("abc"));
+  }
+}
+
+TEST_F(DBTest, WriteSingleThreadEntry) {
+  std::vector<port::Thread> threads;
+  dbfull()->TEST_LockMutex();
+  auto w = dbfull()->TEST_BeginWrite();
+  threads.emplace_back([&] { ASSERT_OK(Put("a", "b")); });
+  env_->SleepForMicroseconds(10000);
+  threads.emplace_back([&] { ASSERT_OK(Flush()); });
+  env_->SleepForMicroseconds(10000);
+  dbfull()->TEST_UnlockMutex();
+  dbfull()->TEST_LockMutex();
+  dbfull()->TEST_EndWrite(w);
+  dbfull()->TEST_UnlockMutex();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
+TEST_F(DBTest, ConcurrentFlushWAL) {
+  const size_t cnt = 100;
+  Options options;
+  options.env = env_;
+  WriteOptions wopt;
+  ReadOptions ropt;
+  for (bool two_write_queues : {false, true}) {
+    for (bool manual_wal_flush : {false, true}) {
+      options.two_write_queues = two_write_queues;
+      options.manual_wal_flush = manual_wal_flush;
+      options.create_if_missing = true;
+      DestroyAndReopen(options);
+      std::vector<port::Thread> threads;
+      threads.emplace_back([&] {
+        for (size_t i = 0; i < cnt; i++) {
+          auto istr = std::to_string(i);
+          ASSERT_OK(db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr,
+                             "b" + istr));
+        }
+      });
+      if (two_write_queues) {
+        threads.emplace_back([&] {
+          for (size_t i = cnt; i < 2 * cnt; i++) {
+            auto istr = std::to_string(i);
+            WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                             wopt.protection_bytes_per_key,
+                             0 /* default_cf_ts_sz */);
+            ASSERT_OK(batch.Put("a" + istr, "b" + istr));
+            ASSERT_OK(
+                dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true));
+          }
+        });
+      }
+      threads.emplace_back([&] {
+        for (size_t i = 0; i < cnt * 100; i++) {  // FlushWAL is faster than Put
+          ASSERT_OK(db_->FlushWAL(false));
+        }
+      });
+      for (auto& t : threads) {
+        t.join();
+      }
+      options.create_if_missing = false;
+      // Recover from the wal and make sure that it is not corrupted
+      Reopen(options);
+      for (size_t i = 0; i < cnt; i++) {
+        PinnableSlice pval;
+        auto istr = std::to_string(i);
+        ASSERT_OK(
+            db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval));
+        ASSERT_TRUE(pval == ("b" + istr));
+      }
+    }
+  }
+}
+
+// This test failure will be caught with a probability
+TEST_F(DBTest, ManualFlushWalAndWriteRace) {
+  Options options;
+  options.env = env_;
+  options.manual_wal_flush = true;
+  options.create_if_missing = true;
+
+  DestroyAndReopen(options);
+
+  WriteOptions wopts;
+  wopts.sync = true;
+
+  port::Thread writeThread([&]() {
+    for (int i = 0; i < 100; i++) {
+      auto istr = std::to_string(i);
+      ASSERT_OK(dbfull()->Put(wopts, "key_" + istr, "value_" + istr));
+    }
+  });
+  port::Thread flushThread([&]() {
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(dbfull()->FlushWAL(false));
+    }
+  });
+
+  writeThread.join();
+  flushThread.join();
+  ASSERT_OK(dbfull()->Put(wopts, "foo1", "value1"));
+  ASSERT_OK(dbfull()->Put(wopts, "foo2", "value2"));
+  Reopen(options);
+  ASSERT_EQ("value1", Get("foo1"));
+  ASSERT_EQ("value2", Get("foo2"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DynamicMemtableOptions) {
+  const uint64_t k64KB = 1 << 16;
+  const uint64_t k128KB = 1 << 17;
+  const uint64_t k5KB = 5 * 1024;
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.max_background_compactions = 1;
+  options.write_buffer_size = k64KB;
+  options.arena_block_size = 16 * 1024;
+  options.max_write_buffer_number = 2;
+  // Don't trigger compact/slowdown/stop
+  options.level0_file_num_compaction_trigger = 1024;
+  options.level0_slowdown_writes_trigger = 1024;
+  options.level0_stop_writes_trigger = 1024;
+  DestroyAndReopen(options);
+
+  auto gen_l0_kb = [this](int size) {
+    const int kNumPutsBeforeWaitForFlush = 64;
+    Random rnd(301);
+    for (int i = 0; i < size; i++) {
+      ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+
+      // The following condition prevents a race condition between flush jobs
+      // acquiring work and this thread filling up multiple memtables. Without
+      // this, the flush might produce less files than expected because
+      // multiple memtables are flushed into a single L0 file. This race
+      // condition affects assertion (A).
+      if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) {
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  };
+
+  // Test write_buffer_size
+  gen_l0_kb(64);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
+  ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2);
+
+  // Clean up L0
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  // Increase buffer size
+  ASSERT_OK(dbfull()->SetOptions({
+      {"write_buffer_size", "131072"},
+  }));
+
+  // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
+  // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
+  gen_l0_kb(192);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);  // (A)
+  ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
+  ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
+
+  // Decrease buffer size below current usage
+  ASSERT_OK(dbfull()->SetOptions({
+      {"write_buffer_size", "65536"},
+  }));
+  // The existing memtable became eligible for flush when we reduced its
+  // capacity to 64KB. Two keys need to be added to trigger flush: first causes
+  // memtable to be marked full, second schedules the flush. Then we should have
+  // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
+  gen_l0_kb(2);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
+  ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
+
+  // Test max_write_buffer_number
+  // Block compaction thread, which will also block the flushes because
+  // max_background_flushes == 0, so flushes are getting executed by the
+  // compaction thread
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  // Start from scratch and disable compaction/flush. Flush can only happen
+  // during compaction but trigger is pretty high
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  env_->SetBackgroundThreads(0, Env::HIGH);
+
+  // Put until writes are stopped, bounded by 256 puts. We should see stop at
+  // ~128KB
+  int count = 0;
+  Random rnd(301);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait",
+      [&](void* /*arg*/) { sleeping_task_low.WakeUp(); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  while (!sleeping_task_low.WokenUp() && count < 256) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+    count++;
+  }
+  ASSERT_GT(static_cast<double>(count), 128 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 128 * 1.2);
+
+  sleeping_task_low.WaitUntilDone();
+
+  // Increase
+  ASSERT_OK(dbfull()->SetOptions({
+      {"max_write_buffer_number", "8"},
+  }));
+  // Clean up memtable and L0
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  sleeping_task_low.Reset();
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  count = 0;
+  while (!sleeping_task_low.WokenUp() && count < 1024) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+    count++;
+  }
+// Windows fails this test. Will tune in the future and figure out
+// approp number
+#ifndef OS_WIN
+  ASSERT_GT(static_cast<double>(count), 512 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 512 * 1.2);
+#endif
+  sleeping_task_low.WaitUntilDone();
+
+  // Decrease
+  ASSERT_OK(dbfull()->SetOptions({
+      {"max_write_buffer_number", "4"},
+  }));
+  // Clean up memtable and L0
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  sleeping_task_low.Reset();
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  count = 0;
+  while (!sleeping_task_low.WokenUp() && count < 1024) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+    count++;
+  }
+// Windows fails this test. Will tune in the future and figure out
+// approp number
+#ifndef OS_WIN
+  ASSERT_GT(static_cast<double>(count), 256 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 266 * 1.2);
+#endif
+  sleeping_task_low.WaitUntilDone();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif  // ROCKSDB_LITE
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+namespace {
+void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
+                          int expected_count) {
+  int op_count = 0;
+  std::vector<ThreadStatus> thread_list;
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  for (auto thread : thread_list) {
+    if (thread.operation_type == op_type) {
+      op_count++;
+    }
+  }
+  ASSERT_EQ(op_count, expected_count);
+}
+}  // anonymous namespace
+
+TEST_F(DBTest, GetThreadStatus) {
+  Options options;
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  TryReopen(options);
+
+  std::vector<ThreadStatus> thread_list;
+  Status s = env_->GetThreadList(&thread_list);
+
+  for (int i = 0; i < 2; ++i) {
+    // repeat the test with differet number of high / low priority threads
+    const int kTestCount = 3;
+    const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
+    const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
+    const unsigned int kBottomPriCounts[kTestCount] = {2, 1, 4};
+    for (int test = 0; test < kTestCount; ++test) {
+      // Change the number of threads in high / low priority pool.
+      env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
+      env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
+      env_->SetBackgroundThreads(kBottomPriCounts[test], Env::BOTTOM);
+      // Wait to ensure the all threads has been registered
+      unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
+      // TODO(ajkr): it'd be better if SetBackgroundThreads returned only after
+      // all threads have been registered.
+      // Try up to 60 seconds.
+      for (int num_try = 0; num_try < 60000; num_try++) {
+        env_->SleepForMicroseconds(1000);
+        thread_list.clear();
+        s = env_->GetThreadList(&thread_list);
+        ASSERT_OK(s);
+        memset(thread_type_counts, 0, sizeof(thread_type_counts));
+        for (auto thread : thread_list) {
+          ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+          thread_type_counts[thread.thread_type]++;
+        }
+        if (thread_type_counts[ThreadStatus::HIGH_PRIORITY] ==
+                kHighPriCounts[test] &&
+            thread_type_counts[ThreadStatus::LOW_PRIORITY] ==
+                kLowPriCounts[test] &&
+            thread_type_counts[ThreadStatus::BOTTOM_PRIORITY] ==
+                kBottomPriCounts[test]) {
+          break;
+        }
+      }
+      // Verify the number of high-priority threads
+      ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY],
+                kHighPriCounts[test]);
+      // Verify the number of low-priority threads
+      ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY],
+                kLowPriCounts[test]);
+      // Verify the number of bottom-priority threads
+      ASSERT_EQ(thread_type_counts[ThreadStatus::BOTTOM_PRIORITY],
+                kBottomPriCounts[test]);
+    }
+    if (i == 0) {
+      // repeat the test with multiple column families
+      CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+      env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+                                                                     true);
+    }
+  }
+  ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+  delete handles_[2];
+  handles_.erase(handles_.begin() + 2);
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+                                                                 true);
+  Close();
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+                                                                 true);
+}
+
+TEST_F(DBTest, DisableThreadStatus) {
+  Options options;
+  options.env = env_;
+  options.enable_thread_tracking = false;
+  TryReopen(options);
+  CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+  // Verify non of the column family info exists
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+                                                                 false);
+}
+
+TEST_F(DBTest, ThreadStatusFlush) {
+  Options options;
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.enable_thread_tracking = true;
+  options = CurrentOptions(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
+      {"DBTest::ThreadStatusFlush:2", "FlushJob::WriteLevel0Table"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+  ASSERT_OK(Put(1, "foo", "v1"));
+  ASSERT_EQ("v1", Get(1, "foo"));
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+  uint64_t num_running_flushes = 0;
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
+                                  &num_running_flushes));
+  ASSERT_EQ(num_running_flushes, 0);
+
+  ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
+  ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger flush
+
+  // The first sync point is to make sure there's one flush job
+  // running when we perform VerifyOperationCount().
+  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
+                                  &num_running_flushes));
+  ASSERT_EQ(num_running_flushes, 1);
+  // This second sync point is to ensure the flush job will not
+  // be completed until we already perform VerifyOperationCount().
+  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  const int kNumL0Files = 4;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_subcompactions = max_subcompactions_;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
+      {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
+      {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
+  });
+  for (int tests = 0; tests < 2; ++tests) {
+    DestroyAndReopen(options);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    Random rnd(301);
+    // The Put Phase.
+    for (int file = 0; file < kNumL0Files; ++file) {
+      for (int key = 0; key < kEntriesPerBuffer; ++key) {
+        ASSERT_OK(Put(std::to_string(key + file * kEntriesPerBuffer),
+                      rnd.RandomString(kTestValueSize)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // This makes sure a compaction won't be scheduled until
+    // we have done with the above Put Phase.
+    uint64_t num_running_compactions = 0;
+    ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+                                    &num_running_compactions));
+    ASSERT_EQ(num_running_compactions, 0);
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
+    ASSERT_GE(NumTableFilesAtLevel(0),
+              options.level0_file_num_compaction_trigger);
+
+    // This makes sure at least one compaction is running.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
+
+    if (options.enable_thread_tracking) {
+      // expecting one single L0 to L1 compaction
+      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
+    } else {
+      // If thread tracking is not enabled, compaction count should be 0.
+      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
+    }
+    ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+                                    &num_running_compactions));
+    ASSERT_EQ(num_running_compactions, 1);
+    // TODO(yhchiang): adding assert to verify each compaction stage.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
+
+    // repeat the test with disabling thread tracking.
+    options.enable_thread_tracking = false;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q", 1);
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p", "q");
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    // Populate a different range
+    MakeTables(3, "c", "e", 1);
+    ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f");
+    ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+    // Compact all
+    MakeTables(1, "a", "z", 1);
+    ASSERT_EQ("1,0,2", FilesPerLevel(1));
+    CancelAllBackgroundWork(db_);
+    ASSERT_TRUE(
+        db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)
+            .IsShutdownInProgress());
+    ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+    if (iter == 0) {
+      options = CurrentOptions();
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"pikachu"}, options);
+    }
+  }
+}
+
+TEST_F(DBTest, PreShutdownFlush) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(1, "key", "value"));
+  CancelAllBackgroundWork(db_);
+  Status s =
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+  ASSERT_TRUE(s.IsShutdownInProgress());
+}
+
+TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 40;
+  const int kNumL0Files = 4;
+
+  const int kHighPriCount = 3;
+  const int kLowPriCount = 5;
+  env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+  env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * kNumL0Files;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_bytes_for_level_multiplier = 2;
+  options.max_background_compactions = kLowPriCount;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+  options.max_subcompactions = max_subcompactions_;
+
+  TryReopen(options);
+  Random rnd(301);
+
+  std::vector<ThreadStatus> thread_list;
+  // Delay both flush and compaction
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
+       {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
+       {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
+       {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
+        "CompactionJob::Run():End"},
+       {"CompactionJob::Run():End",
+        "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Make rocksdb busy
+  int key = 0;
+  // check how many threads are doing compaction using GetThreadList
+  int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+  for (int file = 0; file < 16 * kNumL0Files; ++file) {
+    for (int k = 0; k < kEntriesPerBuffer; ++k) {
+      ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize)));
+    }
+
+    ASSERT_OK(env_->GetThreadList(&thread_list));
+    for (auto thread : thread_list) {
+      operation_count[thread.operation_type]++;
+    }
+
+    // Speed up the test
+    if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+        operation_count[ThreadStatus::OP_COMPACTION] >
+            0.6 * options.max_background_compactions) {
+      break;
+    }
+    if (file == 15 * kNumL0Files) {
+      TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+    }
+  }
+
+  TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+  ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+  CancelAllBackgroundWork(db_);
+  TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Record the number of compactions at a time.
+  for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+    operation_count[i] = 0;
+  }
+  ASSERT_OK(env_->GetThreadList(&thread_list));
+  for (auto thread : thread_list) {
+    operation_count[thread.operation_type]++;
+  }
+  ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 40;
+  const int kNumL0Files = 4;
+
+  const int kHighPriCount = 3;
+  const int kLowPriCount = 5;
+  env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+  env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * kNumL0Files;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_bytes_for_level_multiplier = 2;
+  options.max_background_compactions = kLowPriCount;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+  options.max_subcompactions = max_subcompactions_;
+
+  TryReopen(options);
+  Random rnd(301);
+
+  std::vector<ThreadStatus> thread_list;
+  // Delay both flush and compaction
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBTest::PreShutdownCompactionMiddle:Preshutdown",
+        "CompactionJob::Run():Inprogress"},
+       {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"},
+       {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"},
+       {"CompactionJob::Run():End",
+        "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Make rocksdb busy
+  int key = 0;
+  // check how many threads are doing compaction using GetThreadList
+  int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+  for (int file = 0; file < 16 * kNumL0Files; ++file) {
+    for (int k = 0; k < kEntriesPerBuffer; ++k) {
+      ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize)));
+    }
+
+    ASSERT_OK(env_->GetThreadList(&thread_list));
+    for (auto thread : thread_list) {
+      operation_count[thread.operation_type]++;
+    }
+
+    // Speed up the test
+    if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+        operation_count[ThreadStatus::OP_COMPACTION] >
+            0.6 * options.max_background_compactions) {
+      break;
+    }
+    if (file == 15 * kNumL0Files) {
+      TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
+    }
+  }
+
+  ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+  CancelAllBackgroundWork(db_);
+  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
+  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Record the number of compactions at a time.
+  for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+    operation_count[i] = 0;
+  }
+  ASSERT_OK(env_->GetThreadList(&thread_list));
+  for (auto thread : thread_list) {
+    operation_count[thread.operation_type]++;
+  }
+  ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushOnDestroy) {
+  WriteOptions wo;
+  wo.disableWAL = true;
+  ASSERT_OK(Put("foo", "v1", wo));
+  CancelAllBackgroundWork(db_);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  const int kNKeys = 120;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  RandomShuffle(std::begin(keys), std::end(keys));
+
+  Random rnd(301);
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 20480;
+  options.write_buffer_size = 20480;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 20480;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 102400;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kNoCompression;
+  options.compression_per_level[2] = kSnappyCompression;
+
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
+  // be compressed, so total data size should be more than 80K.
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  // Assuming each files' metadata is at least 50 bytes/
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
+
+  // Insert 400KB. Some data will be compressed
+  for (int i = 21; i < 120; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+
+  ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
+            120U * 4000U + 50U * 24);
+  // Make sure data in files in L3 is not compacted by removing all files
+  // in L4 and calculate number of rows
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+  for (auto file : cf_meta.levels[4].files) {
+    listener->SetExpectedFileName(dbname_ + file.name);
+    ASSERT_OK(dbfull()->DeleteFile(file.name));
+  }
+  listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
+
+  int num_keys = 0;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    num_keys++;
+  }
+  ASSERT_OK(iter->status());
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
+  if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
+    return;
+  }
+  const int kNKeys = 500;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  RandomShuffle(std::begin(keys), std::end(keys));
+
+  Random rnd(301);
+  Options options;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 6000000;
+  options.write_buffer_size = 600000;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+  options.target_file_size_base = 20;
+  options.env = env_;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 200;
+  options.max_bytes_for_level_multiplier = 8;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+  std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
+  options.table_factory = mtf;
+
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kLZ4Compression;
+  options.compression_per_level[2] = kZlibCompression;
+
+  DestroyAndReopen(options);
+  // When base level is L4, L4 is LZ4.
+  std::atomic<int> num_zlib(0);
+  std::atomic<int> num_lz4(0);
+  std::atomic<int> num_no(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4) {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = reinterpret_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < 100; i++) {
+    std::string value = rnd.RandomString(200);
+    ASSERT_OK(Put(Key(keys[i]), value));
+    if (i % 25 == 24) {
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), 0);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  int prev_num_files_l4 = NumTableFilesAtLevel(4);
+
+  // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib
+  num_lz4.store(0);
+  num_no.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+          ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
+          num_zlib.fetch_add(1);
+        } else {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = reinterpret_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 101; i < 500; i++) {
+    std::string value = rnd.RandomString(200);
+    ASSERT_OK(Put(Key(keys[i]), value));
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GT(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  ASSERT_GT(num_zlib.load(), 0);
+}
+
+TEST_F(DBTest, DynamicCompactionOptions) {
+  // minimum write buffer size is enforced at 64KB
+  const uint64_t k32KB = 1 << 15;
+  const uint64_t k64KB = 1 << 16;
+  const uint64_t k128KB = 1 << 17;
+  const uint64_t k1MB = 1 << 20;
+  const uint64_t k4KB = 1 << 12;
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+  options.write_buffer_size = k64KB;
+  options.arena_block_size = 4 * k4KB;
+  options.max_write_buffer_number = 2;
+  // Compaction related options
+  options.level0_file_num_compaction_trigger = 3;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 8;
+  options.target_file_size_base = k64KB;
+  options.max_compaction_bytes = options.target_file_size_base * 10;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = k128KB;
+  options.max_bytes_for_level_multiplier = 4;
+
+  // Block flush thread and disable compaction thread
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  DestroyAndReopen(options);
+
+  auto gen_l0_kb = [this](int start, int size, int stride) {
+    Random rnd(301);
+    for (int i = 0; i < size; i++) {
+      ASSERT_OK(Put(Key(start + stride * i), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  };
+
+  // Write 3 files that have the same key range.
+  // Since level0_file_num_compaction_trigger is 3, compaction should be
+  // triggered. The compaction should result in one L1 file
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  gen_l0_kb(0, 64, 1);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,1", FilesPerLevel());
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(1U, metadata.size());
+  ASSERT_LE(metadata[0].size, k64KB + k4KB);
+  ASSERT_GE(metadata[0].size, k64KB - k4KB);
+
+  // Test compaction trigger and target_file_size_base
+  // Reduce compaction trigger to 2, and reduce L1 file size to 32KB.
+  // Writing to 64KB L0 files should trigger a compaction. Since these
+  // 2 L0 files have the same key range, compaction merge them and should
+  // result in 2 32KB L1 files.
+  ASSERT_OK(
+      dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+                            {"target_file_size_base", std::to_string(k32KB)}}));
+
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ("1,1", FilesPerLevel());
+  gen_l0_kb(0, 64, 1);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("0,2", FilesPerLevel());
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(2U, metadata.size());
+  ASSERT_LE(metadata[0].size, k32KB + k4KB);
+  ASSERT_GE(metadata[0].size, k32KB - k4KB);
+  ASSERT_LE(metadata[1].size, k32KB + k4KB);
+  ASSERT_GE(metadata[1].size, k32KB - k4KB);
+
+  // Test max_bytes_for_level_base
+  // Increase level base size to 256KB and write enough data that will
+  // fill L1 and L2. L1 size should be around 256KB while L2 size should be
+  // around 256KB x 4.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"max_bytes_for_level_base", std::to_string(k1MB)}}));
+
+  // writing 96 x 64KB => 6 * 1024KB
+  // (L1 + L2) = (1 + 4) * 1024KB
+  for (int i = 0; i < 96; ++i) {
+    gen_l0_kb(i, 64, 96);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_GT(SizeAtLevel(1), k1MB / 2);
+  ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
+
+  // Within (0.5, 1.5) of 4MB.
+  ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
+  ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
+
+  // Test max_bytes_for_level_multiplier and
+  // max_bytes_for_level_base. Now, reduce both mulitplier and level base,
+  // After filling enough data that can fit in L1 - L3, we should see L1 size
+  // reduces to 128KB from 256KB which was asserted previously. Same for L2.
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"max_bytes_for_level_multiplier", "2"},
+       {"max_bytes_for_level_base", std::to_string(k128KB)}}));
+
+  // writing 20 x 64KB = 10 x 128KB
+  // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
+  for (int i = 0; i < 20; ++i) {
+    gen_l0_kb(i, 64, 32);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
+  ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
+
+  // Test level0_stop_writes_trigger.
+  // Clean up memtable and L0. Block compaction threads. If continue to write
+  // and flush memtables. We should see put stop after 8 memtable flushes
+  // since level0_stop_writes_trigger = 8
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Block compaction
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  int count = 0;
+  Random rnd(301);
+  WriteOptions wo;
+  while (count < 64) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    count++;
+    if (dbfull()->TEST_write_controler().IsStopped()) {
+      sleeping_task_low.WakeUp();
+      break;
+    }
+  }
+  // Stop trigger = 8
+  ASSERT_EQ(count, 8);
+  // Unblock
+  sleeping_task_low.WaitUntilDone();
+
+  // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0.
+  // Block compaction thread again. Perform the put and memtable flushes
+  // until we see the stop after 6 memtable flushes.
+  ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}}));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  // Block compaction again
+  sleeping_task_low.Reset();
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+  count = 0;
+  while (count < 64) {
+    ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    count++;
+    if (dbfull()->TEST_write_controler().IsStopped()) {
+      sleeping_task_low.WakeUp();
+      break;
+    }
+  }
+  ASSERT_EQ(count, 6);
+  // Unblock
+  sleeping_task_low.WaitUntilDone();
+
+  // Test disable_auto_compactions
+  // Compaction thread is unblocked but auto compaction is disabled. Write
+  // 4 L0 files and compaction should be triggered. If auto compaction is
+  // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of
+  // L0 files do not change after the call.
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}}));
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+    // Wait for compaction so that put won't stop
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+
+  // Enable auto compaction and perform the same test, # of L0 files should be
+  // reduced after compaction.
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+    // Wait for compaction so that put won't stop
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_LT(NumTableFilesAtLevel(0), 4);
+}
+
+// Test dynamic FIFO compaction options.
+// This test covers just option parsing and makes sure that the options are
+// correctly assigned. Also look at DBOptionsTest.SetFIFOCompactionOptions
+// test which makes sure that the FIFO compaction funcionality is working
+// as expected on dynamically changing the options.
+// Even more FIFOCompactionTests are at DBTest.FIFOCompaction* .
+TEST_F(DBTest, DynamicFIFOCompactionOptions) {
+  Options options;
+  options.ttl = 0;
+  options.create_if_missing = true;
+  options.env = env_;
+  DestroyAndReopen(options);
+
+  // Initial defaults
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            1024 * 1024 * 1024);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo", "{max_table_files_size=23;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            23);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            false);
+
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "97"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            23);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 97);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            false);
+
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "203"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            23);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            23);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo", "{max_table_files_size=31;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            31);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_fifo",
+        "{max_table_files_size=51;allow_compaction=true;}"}}));
+  ASSERT_OK(dbfull()->SetOptions({{"ttl", "49"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+            51);
+  ASSERT_EQ(dbfull()->GetOptions().ttl, 49);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+            true);
+}
+
+TEST_F(DBTest, DynamicUniversalCompactionOptions) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  DestroyAndReopen(options);
+
+  // Initial defaults
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1U);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+            UINT_MAX);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.max_size_amplification_percent,
+            200u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.compression_size_percent,
+            -1);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+            kCompactionStopStyleTotalSize);
+  ASSERT_EQ(
+      dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+      false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_universal", "{size_ratio=7;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+            UINT_MAX);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.max_size_amplification_percent,
+            200u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.compression_size_percent,
+            -1);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+            kCompactionStopStyleTotalSize);
+  ASSERT_EQ(
+      dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+      false);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      {{"compaction_options_universal", "{min_merge_width=11;}"}}));
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+            11u);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+            UINT_MAX);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.max_size_amplification_percent,
+            200u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions()
+                .compaction_options_universal.compression_size_percent,
+            -1);
+  ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+            kCompactionStopStyleTotalSize);
+  ASSERT_EQ(
+      dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+      false);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, FileCreationRandomFailure) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.target_file_size_base = 200000;
+  options.max_bytes_for_level_base = 1000000;
+  options.max_bytes_for_level_multiplier = 2;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  constexpr int kCDTKeysPerBuffer = 4;
+  constexpr int kTestSize = kCDTKeysPerBuffer * 4096;
+  constexpr int kTotalIteration = 20;
+  // the second half of the test involves in random failure
+  // of file creation.
+  constexpr int kRandomFailureTest = kTotalIteration / 2;
+
+  std::vector<std::string> values;
+  for (int i = 0; i < kTestSize; ++i) {
+    values.push_back("NOT_FOUND");
+  }
+  for (int j = 0; j < kTotalIteration; ++j) {
+    if (j == kRandomFailureTest) {
+      env_->non_writeable_rate_.store(90);
+    }
+    for (int k = 0; k < kTestSize; ++k) {
+      // here we expect some of the Put fails.
+      std::string value = rnd.RandomString(100);
+      Status s = Put(Key(k), Slice(value));
+      if (s.ok()) {
+        // update the latest successful put
+        values[k] = value;
+      }
+      // But everything before we simulate the failure-test should succeed.
+      if (j < kRandomFailureTest) {
+        ASSERT_OK(s);
+      }
+    }
+  }
+
+  // If rocksdb does not do the correct job, internal assert will fail here.
+  ASSERT_TRUE(dbfull()->TEST_WaitForFlushMemTable().IsIOError());
+  ASSERT_TRUE(dbfull()->TEST_WaitForCompact().IsIOError());
+
+  // verify we have the latest successful update
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
+  }
+
+  // reopen and reverify we have the latest successful update
+  env_->non_writeable_rate_.store(0);
+  Reopen(options);
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, DynamicMiscOptions) {
+  // Test max_sequential_skip_in_iterations
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_sequential_skip_in_iterations = 16;
+  options.compression = kNoCompression;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
+    int key0 = key_start;
+    int key1 = key_start + 1;
+    int key2 = key_start + 2;
+    Random rnd(301);
+    ASSERT_OK(Put(Key(key0), rnd.RandomString(8)));
+    for (int i = 0; i < 10; ++i) {
+      ASSERT_OK(Put(Key(key1), rnd.RandomString(8)));
+    }
+    ASSERT_OK(Put(Key(key2), rnd.RandomString(8)));
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(key1));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key1)), 0);
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key2)), 0);
+    ASSERT_EQ(num_reseek,
+              TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+  };
+  // No reseek
+  assert_reseek_count(100, 0);
+
+  ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}}));
+  // Clear memtable and make new option effective
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  // Trigger reseek
+  assert_reseek_count(200, 1);
+
+  ASSERT_OK(
+      dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}}));
+  // Clear memtable and make new option effective
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  // No reseek
+  assert_reseek_count(300, 1);
+
+  MutableCFOptions mutable_cf_options;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  // Test soft_pending_compaction_bytes_limit,
+  // hard_pending_compaction_bytes_limit
+  ASSERT_OK(dbfull()->SetOptions(
+      handles_[1], {{"soft_pending_compaction_bytes_limit", "200"},
+                    {"hard_pending_compaction_bytes_limit", "300"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(200, mutable_cf_options.soft_pending_compaction_bytes_limit);
+  ASSERT_EQ(300, mutable_cf_options.hard_pending_compaction_bytes_limit);
+  // Test report_bg_io_stats
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"report_bg_io_stats", "true"}}));
+  // sanity check
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+  // Test compression
+  // sanity check
+  ASSERT_OK(dbfull()->SetOptions({{"compression", "kNoCompression"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression);
+
+  if (Snappy_Supported()) {
+    ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
+    ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+                                                       &mutable_cf_options));
+    ASSERT_EQ(CompressionType::kSnappyCompression,
+              mutable_cf_options.compression);
+  }
+
+  // Test paranoid_file_checks already done in db_block_cache_test
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+  ASSERT_TRUE(mutable_cf_options.check_flush_compaction_key_order);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      handles_[1], {{"check_flush_compaction_key_order", "false"}}));
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_FALSE(mutable_cf_options.check_flush_compaction_key_order);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, L0L1L2AndUpHitCounter) {
+  const int kNumLevels = 3;
+  const int kNumKeysPerLevel = 10000;
+  const int kNumKeysPerDb = kNumLevels * kNumKeysPerLevel;
+
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  Reopen(options);
+
+  // After the below loop there will be one file on each of L0, L1, and L2.
+  int key = 0;
+  for (int output_level = kNumLevels - 1; output_level >= 0; --output_level) {
+    for (int i = 0; i < kNumKeysPerLevel; ++i) {
+      ASSERT_OK(Put(Key(key), "val"));
+      key++;
+    }
+    ASSERT_OK(Flush());
+    for (int input_level = 0; input_level < output_level; ++input_level) {
+      // `TEST_CompactRange(input_level, ...)` compacts from `input_level` to
+      // `input_level + 1`.
+      ASSERT_OK(dbfull()->TEST_CompactRange(input_level, nullptr, nullptr));
+    }
+  }
+  assert(key == kNumKeysPerDb);
+
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  for (int i = 0; i < kNumKeysPerDb; i++) {
+    ASSERT_EQ(Get(Key(i)), "val");
+  }
+
+  ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  ASSERT_EQ(kNumKeysPerDb, TestGetTickerCount(options, GET_HIT_L0) +
+                               TestGetTickerCount(options, GET_HIT_L1) +
+                               TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+}
+
+TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+  // iter 0 -- zlib
+  // iter 1 -- bzip2
+  // iter 2 -- lz4
+  // iter 3 -- lz4HC
+  // iter 4 -- xpress
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression, kLZ4HCCompression,
+                                    kXpressCompression};
+  for (auto comp : compressions) {
+    if (!CompressionTypeSupported(comp)) {
+      continue;
+    }
+    // first_table_version 1 -- generate with table_version == 1, read with
+    // table_version == 2
+    // first_table_version 2 -- generate with table_version == 2, read with
+    // table_version == 1
+    for (int first_table_version = 1; first_table_version <= 2;
+         ++first_table_version) {
+      BlockBasedTableOptions table_options;
+      table_options.format_version = first_table_version;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+      Options options = CurrentOptions();
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      options.create_if_missing = true;
+      options.compression = comp;
+      DestroyAndReopen(options);
+
+      int kNumKeysWritten = 1000;
+
+      Random rnd(301);
+      for (int i = 0; i < kNumKeysWritten; ++i) {
+        // compressible string
+        ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+      }
+
+      table_options.format_version = first_table_version == 1 ? 2 : 1;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      Reopen(options);
+      for (int i = 0; i < kNumKeysWritten; ++i) {
+        auto r = Get(Key(i));
+        ASSERT_EQ(r.substr(128), std::string(128, 'a'));
+      }
+    }
+  }
+}
+
+TEST_F(DBTest, CloseSpeedup) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_write_buffer_number = 16;
+
+  // Block background threads
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
+
+  std::vector<std::string> filenames;
+  ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+  // In Windows, LOCK file cannot be deleted because it is locked by db_test
+  // After closing db_test, the LOCK file is unlocked and can be deleted
+  // Delete archival files.
+  bool deleteDir = true;
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    Status s = env_->DeleteFile(dbname_ + "/" + filenames[i]);
+    if (!s.ok()) {
+      deleteDir = false;
+    }
+  }
+  if (deleteDir) {
+    ASSERT_OK(env_->DeleteDir(dbname_));
+  }
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to level 2
+  // After that, (100K, 200K)
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+  }
+
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  Close();
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // Unblock background threads
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  Destroy(options);
+}
+
+class DelayedMergeOperator : public MergeOperator {
+ private:
+  DBTest* db_test_;
+
+ public:
+  explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    db_test_->env_->MockSleepForMicroseconds(1000 *
+                                             merge_in.operand_list.size());
+    merge_out->new_value = "";
+    return true;
+  }
+
+  const char* Name() const override { return "DelayedMergeOperator"; }
+};
+
+TEST_F(DBTest, MergeTestTime) {
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+
+  // Enable time profiling
+  SetPerfLevel(kEnableTime);
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  SetTimeElapseOnlySleepOnReopen(&options);
+  DestroyAndReopen(options);
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", one));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
+  ASSERT_OK(Flush());
+
+  ReadOptions opt;
+  opt.verify_checksums = true;
+  opt.snapshot = nullptr;
+  std::string result;
+  ASSERT_OK(db_->Get(opt, "foo", &result));
+
+  ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+
+  ReadOptions read_options;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
+
+  ASSERT_EQ(1, count);
+  ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
+#endif  // ROCKSDB_USING_THREAD_STATUS
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
+  SetPerfLevel(kEnableTime);
+  Options options = CurrentOptions();
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  options.disable_auto_compactions = true;
+  options.max_subcompactions = max_subcompactions_;
+  SetTimeElapseOnlySleepOnReopen(&options);
+  DestroyAndReopen(options);
+
+  constexpr unsigned n = 1000;
+  for (unsigned i = 0; i < n; i++) {
+    ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ(uint64_t{n} * 1000000U,
+            TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+}
+
+TEST_P(DBTestWithParam, FilterCompactionTimeTest) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<DelayFilterFactory>(this);
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(kExceptTimeForMutex);
+  options.max_subcompactions = max_subcompactions_;
+  SetTimeElapseOnlySleepOnReopen(&options);
+  DestroyAndReopen(options);
+
+  unsigned n = 0;
+  // put some data
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+      ++n;
+    }
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
+
+  Reopen(options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  ASSERT_OK(itr->status());
+  ASSERT_EQ(uint64_t{n} * 1000000U,
+            TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME));
+  delete itr;
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, TestLogCleanup) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 64 * 1024;  // very small
+  // only two memtables allowed ==> only two log files
+  options.max_write_buffer_number = 2;
+  Reopen(options);
+
+  for (int i = 0; i < 100000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+    // only 2 memtables will be alive, so logs_to_free needs to always be below
+    // 2
+    ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, EmptyCompactedDB) {
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Status s = Put("new", "value");
+  ASSERT_TRUE(s.IsNotSupported());
+  Close();
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SuggestCompactRangeTest) {
+  class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
+   public:
+    std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context& context) override {
+      saved_context = context;
+      std::unique_ptr<CompactionFilter> empty_filter;
+      return empty_filter;
+    }
+    const char* Name() const override {
+      return "CompactionFilterFactoryGetContext";
+    }
+    static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
+      return reinterpret_cast<CompactionFilterFactoryGetContext*>(
+                 compaction_filter_factory)
+          ->saved_context.is_manual_compaction;
+    }
+    CompactionFilter::Context saved_context;
+  };
+
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+  options.compaction_style = kCompactionStyleLevel;
+  options.compaction_filter_factory.reset(
+      new CompactionFilterFactoryGetContext());
+  options.write_buffer_size = 200 << 10;
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_compaction_bytes = static_cast<uint64_t>(1) << 60;  // inf
+
+  Reopen(options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < 10; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+
+  ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+      options.compaction_filter_factory.get()));
+
+  // make sure either L0 or L1 has file
+  while (NumTableFilesAtLevel(0) == 0 && NumTableFilesAtLevel(1) == 0) {
+    GenerateNewRandomFile(&rnd);
+  }
+
+  // compact it three times
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  // All files are compacted
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // nonoverlapping with the file on level 0
+  Slice start("a"), end("b");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // should not compact the level 0 file
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  start = Slice("j");
+  end = Slice("m");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // SuggestCompactRange() is not going to be reported as manual compaction
+  ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+      options.compaction_filter_factory.get()));
+
+  // now it should compact the level 0 file
+  // as it's a trivial move to L1, it triggers another one to compact to L2
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, SuggestCompactRangeUniversal) {
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 200 << 10;
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_compaction_bytes = static_cast<uint64_t>(1) << 60;  // inf
+
+  Reopen(options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < 10; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+
+  ASSERT_EQ("1,2,3,4", FilesPerLevel());
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(
+        db_->SuggestCompactRange(db_->DefaultColumnFamily(), nullptr, nullptr));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  // All files are compacted
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+  ASSERT_EQ(0, NumTableFilesAtLevel(2));
+
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // nonoverlapping with the file on level 0
+  Slice start("a"), end("b");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // should not compact the level 0 file
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  start = Slice("j");
+  end = Slice("m");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // now it should compact the level 0 file to the last level
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, PromoteL0) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  DestroyAndReopen(options);
+
+  // non overlapping ranges
+  std::vector<std::pair<int32_t, int32_t>> ranges = {
+      {81, 160}, {0, 80}, {161, 240}, {241, 320}};
+
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+  for (const auto& range : ranges) {
+    for (int32_t j = range.first; j < range.second; j++) {
+      values[j] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  int32_t level0_files = NumTableFilesAtLevel(0, 0);
+  ASSERT_EQ(level0_files, ranges.size());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
+
+  // Promote L0 level to L2.
+  ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+  // We expect that all the files were trivially moved from L0 to L2
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
+
+  for (const auto& kv : values) {
+    ASSERT_EQ(Get(Key(kv.first)), kv.second);
+  }
+}
+
+TEST_F(DBTest, PromoteL0Failure) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  DestroyAndReopen(options);
+
+  // Produce two L0 files with overlapping ranges.
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_OK(Put(Key(3), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), ""));
+  ASSERT_OK(Flush());
+
+  Status status;
+  // Fails because L0 has overlapping files.
+  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Now there is a file in L1.
+  ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
+
+  ASSERT_OK(Put(Key(5), ""));
+  ASSERT_OK(Flush());
+  // Fails because L1 is non-empty.
+  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  ASSERT_TRUE(status.IsInvalidArgument());
+}
+
+// Github issue #596
+TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) {
+  const int kNumLevels = 2;
+  const int kNumL0Files = 2;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumL0Files; ++i) {
+    ASSERT_OK(Put(Key(0), rnd.RandomString(1024)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files);
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, AutomaticConflictsWithManualCompaction) {
+  const int kNumL0Files = 50;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  // never slowdown / stop
+  options.level0_slowdown_writes_trigger = 999999;
+  options.level0_stop_writes_trigger = 999999;
+  options.max_background_compactions = 10;
+  DestroyAndReopen(options);
+
+  // schedule automatic compactions after the manual one starts, but before it
+  // finishes to ensure conflict.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction:Start",
+        "DBTest::AutomaticConflictsWithManualCompaction:PrePuts"},
+       {"DBTest::AutomaticConflictsWithManualCompaction:PostPuts",
+        "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+  std::atomic<int> callback_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::MaybeScheduleFlushOrCompaction:Conflict",
+      [&](void* /*arg*/) { callback_count.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    // put two keys to ensure no trivial move
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  port::Thread manual_compaction_thread([this]() {
+    CompactRangeOptions croptions;
+    croptions.exclusive_manual_compaction = true;
+    ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PrePuts");
+  for (int i = 0; i < kNumL0Files; ++i) {
+    // put two keys to ensure no trivial move
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PostPuts");
+
+  ASSERT_GE(callback_count.load(), 1);
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_NE("NOT_FOUND", Get(Key(i)));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  manual_compaction_thread.join();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
+  Options options = CurrentOptions();
+  options.max_background_compactions = 1;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 36;
+  options.level0_stop_writes_trigger = 36;
+  DestroyAndReopen(options);
+
+  // generate files for manual compaction
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    // put two keys to ensure no trivial move
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+
+  std::vector<std::string> input_files;
+  input_files.push_back(cf_meta_data.levels[0].files[0].name);
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"CompactFilesImpl:0",
+       "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"},
+      {"DBTest::CompactFilesShouldTriggerAutoCompaction:End",
+       "CompactFilesImpl:1"},
+  });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread manual_compaction_thread([&]() {
+    auto s = db_->CompactFiles(CompactionOptions(), db_->DefaultColumnFamily(),
+                               input_files, 0);
+    ASSERT_OK(s);
+  });
+
+  TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
+  // generate enough files to trigger compaction
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+  ASSERT_GT(cf_meta_data.levels[0].files.size(),
+            options.level0_file_num_compaction_trigger);
+  TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:End");
+
+  manual_compaction_thread.join();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+  ASSERT_LE(cf_meta_data.levels[0].files.size(),
+            options.level0_file_num_compaction_trigger);
+}
+#endif  // ROCKSDB_LITE
+
+// Github issue #595
+// Large write batch with column families
+TEST_F(DBTest, LargeBatchWithColumnFamilies) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  CreateAndReopenWithCF({"pikachu"}, options);
+  int64_t j = 0;
+  for (int i = 0; i < 5; i++) {
+    for (int pass = 1; pass <= 3; pass++) {
+      WriteBatch batch;
+      size_t write_size = 1024 * 1024 * (5 + i);
+      fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n",
+              (write_size / 1024 / 1024), pass);
+      for (;;) {
+        std::string data(3000, j++ % 127 + 20);
+        data += std::to_string(j);
+        ASSERT_OK(batch.Put(handles_[0], Slice(data), Slice(data)));
+        if (batch.GetDataSize() > write_size) {
+          break;
+        }
+      }
+      fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n",
+              (batch.GetDataSize() / 1024 / 1024));
+      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+      fprintf(stderr, "done\n");
+    }
+  }
+  // make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+// Make sure that Flushes can proceed in parallel with CompactRange()
+TEST_F(DBTest, FlushesInParallelWithCompactRange) {
+  // iter == 0 -- leveled
+  // iter == 1 -- leveled, but throw in a flush between two levels compacting
+  // iter == 2 -- universal
+  for (int iter = 0; iter < 3; ++iter) {
+    Options options = CurrentOptions();
+    if (iter < 2) {
+      options.compaction_style = kCompactionStyleLevel;
+    } else {
+      options.compaction_style = kCompactionStyleUniversal;
+    }
+    options.write_buffer_size = 110 << 10;
+    options.level0_file_num_compaction_trigger = 4;
+    options.num_levels = 4;
+    options.compression = kNoCompression;
+    options.max_bytes_for_level_base = 450 << 10;
+    options.target_file_size_base = 98 << 10;
+    options.max_write_buffer_number = 2;
+
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int num = 0; num < 14; num++) {
+      GenerateNewRandomFile(&rnd);
+    }
+
+    if (iter == 1) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::RunManualCompaction()::1",
+            "DBTest::FlushesInParallelWithCompactRange:1"},
+           {"DBTest::FlushesInParallelWithCompactRange:2",
+            "DBImpl::RunManualCompaction()::2"}});
+    } else {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"CompactionJob::Run():Start",
+            "DBTest::FlushesInParallelWithCompactRange:1"},
+           {"DBTest::FlushesInParallelWithCompactRange:2",
+            "CompactionJob::Run():End"}});
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    std::vector<port::Thread> threads;
+    threads.emplace_back([&]() { Compact("a", "z"); });
+
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
+
+    // this has to start a flush. if flushes are blocked, this will try to
+    // create
+    // 3 memtables, and that will fail because max_write_buffer_number is 2
+    for (int num = 0; num < 3; num++) {
+      GenerateNewRandomFile(&rnd, /* nowait */ true);
+    }
+
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
+
+    for (auto& t : threads) {
+      t.join();
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBTest, DelayedWriteRate) {
+  const int kEntriesPerMemTable = 100;
+  const int kTotalFlushes = 12;
+
+  Options options = CurrentOptions();
+  env_->SetBackgroundThreads(1, Env::LOW);
+  options.env = env_;
+  options.write_buffer_size = 100000000;
+  options.max_write_buffer_number = 256;
+  options.max_background_compactions = 1;
+  options.level0_file_num_compaction_trigger = 3;
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000000;  // Start with 200MB/s
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kEntriesPerMemTable));
+
+  SetTimeElapseOnlySleepOnReopen(&options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Block compactions
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put(Key(i), std::string(10000, 'x')));
+    ASSERT_OK(Flush());
+  }
+
+  // These writes will be slowed down to 1KB/s
+  uint64_t estimated_sleep_time = 0;
+  Random rnd(301);
+  ASSERT_OK(Put("", ""));
+  uint64_t cur_rate = options.delayed_write_rate;
+  for (int i = 0; i < kTotalFlushes; i++) {
+    uint64_t size_memtable = 0;
+    for (int j = 0; j < kEntriesPerMemTable; j++) {
+      auto rand_num = rnd.Uniform(20);
+      // Spread the size range to more.
+      size_t entry_size = rand_num * rand_num * rand_num;
+      WriteOptions wo;
+      ASSERT_OK(Put(Key(i), std::string(entry_size, 'x'), wo));
+      size_memtable += entry_size + 18;
+      // Occasionally sleep a while
+      if (rnd.Uniform(20) == 6) {
+        env_->SleepForMicroseconds(2666);
+      }
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    estimated_sleep_time += size_memtable * 1000000u / cur_rate;
+    // Slow down twice. One for memtable switch and one for flush finishes.
+    cur_rate = static_cast<uint64_t>(static_cast<double>(cur_rate) *
+                                     kIncSlowdownRatio * kIncSlowdownRatio);
+  }
+  // Estimate the total sleep time fall into the rough range.
+  ASSERT_GT(env_->NowMicros(), estimated_sleep_time / 2);
+  ASSERT_LT(env_->NowMicros(), estimated_sleep_time * 2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, HardLimit) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  env_->SetBackgroundThreads(1, Env::LOW);
+  options.max_write_buffer_number = 256;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 * 1024;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 999999;
+  options.level0_stop_writes_trigger = 999999;
+  options.hard_pending_compaction_bytes_limit = 800 << 10;
+  options.max_bytes_for_level_base = 10000000000u;
+  options.max_background_compactions = 1;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  std::atomic<int> callback_count(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+        callback_count.fetch_add(1);
+        sleeping_task_low.WakeUp();
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  int key_idx = 0;
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+
+  ASSERT_EQ(0, callback_count.load());
+
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  }
+  ASSERT_GE(callback_count.load(), 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  sleeping_task_low.WaitUntilDone();
+}
+
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+class WriteStallListener : public EventListener {
+ public:
+  WriteStallListener() : condition_(WriteStallCondition::kNormal) {}
+  void OnStallConditionsChanged(const WriteStallInfo& info) override {
+    MutexLock l(&mutex_);
+    condition_ = info.condition.cur;
+  }
+  bool CheckCondition(WriteStallCondition expected) {
+    MutexLock l(&mutex_);
+    return expected == condition_;
+  }
+
+ private:
+  port::Mutex mutex_;
+  WriteStallCondition condition_;
+};
+
+TEST_F(DBTest, SoftLimit) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.max_write_buffer_number = 256;
+  options.level0_file_num_compaction_trigger = 1;
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000;  // About 200KB/s limited rate
+  options.soft_pending_compaction_bytes_limit = 160000;
+  options.target_file_size_base = 99999999;  // All into one file
+  options.max_bytes_for_level_base = 50000;
+  options.max_bytes_for_level_multiplier = 10;
+  options.max_background_compactions = 1;
+  options.compression = kNoCompression;
+  WriteStallListener* listener = new WriteStallListener();
+  options.listeners.emplace_back(listener);
+
+  // FlushMemtable with opt.wait=true does not wait for
+  // `OnStallConditionsChanged` being called. The event listener is triggered
+  // on `JobContext::Clean`, which happens after flush result is installed.
+  // We use sync point to create a custom WaitForFlush that waits for
+  // context cleanup.
+  port::Mutex flush_mutex;
+  port::CondVar flush_cv(&flush_mutex);
+  bool flush_finished = false;
+  auto InstallFlushCallback = [&]() {
+    {
+      MutexLock l(&flush_mutex);
+      flush_finished = false;
+    }
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::BackgroundCallFlush:ContextCleanedUp", [&](void*) {
+          {
+            MutexLock l(&flush_mutex);
+            flush_finished = true;
+          }
+          flush_cv.SignalAll();
+        });
+  };
+  auto WaitForFlush = [&]() {
+    {
+      MutexLock l(&flush_mutex);
+      while (!flush_finished) {
+        flush_cv.Wait();
+      }
+    }
+    SyncPoint::GetInstance()->ClearCallBack(
+        "DBImpl::BackgroundCallFlush:ContextCleanedUp");
+  };
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+
+  // Generating 360KB in Level 3
+  for (int i = 0; i < 72; i++) {
+    ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+    if (i % 10 == 0) {
+      ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  MoveFilesToLevel(3);
+
+  // Generating 360KB in Level 2
+  for (int i = 0; i < 72; i++) {
+    ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+    if (i % 10 == 0) {
+      ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  MoveFilesToLevel(2);
+
+  ASSERT_OK(Put(Key(0), ""));
+
+  test::SleepingBackgroundTask sleeping_task_low;
+  // Block compactions
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Create 3 L0 files, making score of L0 to be 3.
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+    ASSERT_OK(Put(Key(100 - i), std::string(5000, 'x')));
+    // Flush the file. File size is around 30KB.
+    InstallFlushCallback();
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    WaitForFlush();
+  }
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  sleeping_task_low.Reset();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Now there is one L1 file but doesn't trigger soft_rate_limit
+  //
+  // TODO: soft_rate_limit is depreciated. If this test
+  // relies on soft_rate_limit, then we need to change the test.
+  //
+  // The L1 file size is around 30KB.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Only allow one compactin going through.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+        // Schedule a sleeping task.
+        sleeping_task_low.Reset();
+        env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                       &sleeping_task_low, Env::Priority::LOW);
+      });
+
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  sleeping_task_low.WaitUntilSleeping();
+  // Create 3 L0 files, making score of L0 to be 3
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put(Key(10 + i), std::string(5000, 'x')));
+    ASSERT_OK(Put(Key(90 - i), std::string(5000, 'x')));
+    // Flush the file. File size is around 30KB.
+    InstallFlushCallback();
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    WaitForFlush();
+  }
+
+  // Wake up sleep task to enable compaction to run and waits
+  // for it to go to sleep state again to make sure one compaction
+  // goes through.
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 60KB) which exceeds 50KB base by 10KB
+  // Given level multiplier 10, estimated pending compaction is around 100KB
+  // doesn't trigger soft_pending_compaction_bytes_limit
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // Create 3 L0 files, making score of L0 to be 3, higher than L0.
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put(Key(20 + i), std::string(5000, 'x')));
+    ASSERT_OK(Put(Key(80 - i), std::string(5000, 'x')));
+    // Flush the file. File size is around 30KB.
+    InstallFlushCallback();
+    ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+    WaitForFlush();
+  }
+  // Wake up sleep task to enable compaction to run and waits
+  // for it to go to sleep state again to make sure one compaction
+  // goes through.
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilSleeping();
+
+  // Now there is one L1 file (around 90KB) which exceeds 50KB base by 40KB
+  // L2 size is 360KB, so the estimated level fanout 4, estimated pending
+  // compaction is around 200KB
+  // triggerring soft_pending_compaction_bytes_limit
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilSleeping();
+
+  ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+  // shrink level base so L2 will hit soft limit easier.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"max_bytes_for_level_base", "5000"},
+  }));
+
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+  ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+  sleeping_task_low.WaitUntilSleeping();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, LastWriteBufferDelay) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  options.max_write_buffer_number = 4;
+  options.delayed_write_rate = 20000;
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  int kNumKeysPerMemtable = 3;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+
+  Reopen(options);
+  test::SleepingBackgroundTask sleeping_task;
+  // Block flushes
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+  sleeping_task.WaitUntilSleeping();
+
+  // Create 3 L0 files, making score of L0 to be 3.
+  for (int i = 0; i < 3; i++) {
+    // Fill one mem table
+    for (int j = 0; j < kNumKeysPerMemtable; j++) {
+      ASSERT_OK(Put(Key(j), ""));
+    }
+    ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+  }
+  // Inserting a new entry would create a new mem table, triggering slow down.
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+}
+#endif  // !defined(ROCKSDB_LITE) &&
+        // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+
+TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression, kLZ4HCCompression,
+                                    kXpressCompression};
+  for (auto comp : compressions) {
+    if (!CompressionTypeSupported(comp)) {
+      // not supported, we should fail the Open()
+      Options options = CurrentOptions();
+      options.compression = comp;
+      ASSERT_TRUE(!TryReopen(options).ok());
+      // Try if CreateColumnFamily also fails
+      options.compression = kNoCompression;
+      ASSERT_OK(TryReopen(options));
+      ColumnFamilyOptions cf_options(options);
+      cf_options.compression = comp;
+      ColumnFamilyHandle* handle;
+      ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+    }
+  }
+}
+
+TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
+  Options options = CurrentOptions();
+  options.max_open_files = 100;
+  Reopen(options);
+
+  ColumnFamilyOptions cf_options(options);
+  // ttl is now supported when max_open_files is -1.
+  cf_options.ttl = 3600;
+  ColumnFamilyHandle* handle;
+  ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle));
+  delete handle;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, RowCache) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+}
+
+TEST_F(DBTest, PinnableSliceAndRowCache) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+
+  {
+    PinnableSlice pin_slice;
+    ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+    ASSERT_EQ(pin_slice.ToString(), "bar");
+    // Entry is already in cache, lookup will remove the element from lru
+    ASSERT_EQ(
+        reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+        0);
+  }
+  // After PinnableSlice destruction element is added back in LRU
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+}
+
+TEST_F(DBTest, ReusePinnableSlice) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+
+  {
+    PinnableSlice pin_slice;
+    ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+    ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+    ASSERT_EQ(pin_slice.ToString(), "bar");
+
+    // Entry is already in cache, lookup will remove the element from lru
+    ASSERT_EQ(
+        reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+        0);
+  }
+  // After PinnableSlice destruction element is added back in LRU
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+
+  {
+    std::vector<Slice> multiget_keys;
+    multiget_keys.push_back("foo");
+    std::vector<PinnableSlice> multiget_values(1);
+    std::vector<Status> statuses({Status::NotFound()});
+    ReadOptions ropt;
+    dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
+                       multiget_keys.size(), multiget_keys.data(),
+                       multiget_values.data(), statuses.data());
+    ASSERT_EQ(Status::OK(), statuses[0]);
+    dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
+                       multiget_keys.size(), multiget_keys.data(),
+                       multiget_values.data(), statuses.data());
+    ASSERT_EQ(Status::OK(), statuses[0]);
+
+    // Entry is already in cache, lookup will remove the element from lru
+    ASSERT_EQ(
+        reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+        0);
+  }
+  // After PinnableSlice destruction element is added back in LRU
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+
+  {
+    std::vector<ColumnFamilyHandle*> multiget_cfs;
+    multiget_cfs.push_back(dbfull()->DefaultColumnFamily());
+    std::vector<Slice> multiget_keys;
+    multiget_keys.push_back("foo");
+    std::vector<PinnableSlice> multiget_values(1);
+    std::vector<Status> statuses({Status::NotFound()});
+    ReadOptions ropt;
+    dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
+                       multiget_keys.data(), multiget_values.data(),
+                       statuses.data());
+    ASSERT_EQ(Status::OK(), statuses[0]);
+    dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
+                       multiget_keys.data(), multiget_values.data(),
+                       statuses.data());
+    ASSERT_EQ(Status::OK(), statuses[0]);
+
+    // Entry is already in cache, lookup will remove the element from lru
+    ASSERT_EQ(
+        reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+        0);
+  }
+  // After PinnableSlice destruction element is added back in LRU
+  ASSERT_EQ(
+      reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+      1);
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest, DeletingOldWalAfterDrop) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"},
+       {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  Options options = CurrentOptions();
+  options.max_total_wal_size = 8192;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 1 << 20;
+  options.level0_file_num_compaction_trigger = (1 << 30);
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"cf1", "cf2"}, options);
+  ASSERT_OK(Put(0, "key1", DummyString(8192)));
+  ASSERT_OK(Put(0, "key2", DummyString(8192)));
+  // the oldest wal should now be getting_flushed
+  ASSERT_OK(db_->DropColumnFamily(handles_[0]));
+  // all flushes should now do nothing because their CF is dropped
+  TEST_SYNC_POINT("Test:AllowFlushes");
+  TEST_SYNC_POINT("Test:WaitForFlush");
+  uint64_t lognum1 = dbfull()->TEST_LogfileNumber();
+  ASSERT_OK(Put(1, "key3", DummyString(8192)));
+  ASSERT_OK(Put(1, "key4", DummyString(8192)));
+  // new wal should have been created
+  uint64_t lognum2 = dbfull()->TEST_LogfileNumber();
+  EXPECT_GT(lognum2, lognum1);
+}
+
+TEST_F(DBTest, UnsupportedManualSync) {
+  DestroyAndReopen(CurrentOptions());
+  env_->is_wal_sync_thread_safe_.store(false);
+  Status s = db_->SyncWAL();
+  ASSERT_TRUE(s.IsNotSupported());
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
+                        ::testing::Combine(::testing::Values(1, 4),
+                                           ::testing::Bool()));
+
+TEST_F(DBTest, PauseBackgroundWorkTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000;  // Small write buffer
+  Reopen(options);
+
+  std::vector<port::Thread> threads;
+  std::atomic<bool> done(false);
+  ASSERT_OK(db_->PauseBackgroundWork());
+  threads.emplace_back([&]() {
+    Random rnd(301);
+    for (int i = 0; i < 10000; ++i) {
+      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+    }
+    done.store(true);
+  });
+  env_->SleepForMicroseconds(200000);
+  // make sure the thread is not done
+  ASSERT_FALSE(done.load());
+  ASSERT_OK(db_->ContinueBackgroundWork());
+  for (auto& t : threads) {
+    t.join();
+  }
+  // now it's done
+  ASSERT_TRUE(done.load());
+}
+
+// Keep spawning short-living threads that create an iterator and quit.
+// Meanwhile in another thread keep flushing memtables.
+// This used to cause a deadlock.
+TEST_F(DBTest, ThreadLocalPtrDeadlock) {
+  std::atomic<int> flushes_done{0};
+  std::atomic<int> threads_destroyed{0};
+  auto done = [&] { return flushes_done.load() > 10; };
+
+  port::Thread flushing_thread([&] {
+    for (int i = 0; !done(); ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"),
+                         Slice(std::to_string(i).c_str())));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      int cnt = ++flushes_done;
+      fprintf(stderr, "Flushed %d times\n", cnt);
+    }
+  });
+
+  std::vector<port::Thread> thread_spawning_threads(10);
+  for (auto& t : thread_spawning_threads) {
+    t = port::Thread([&] {
+      while (!done()) {
+        {
+          port::Thread tmp_thread([&] {
+            auto it = db_->NewIterator(ReadOptions());
+            ASSERT_OK(it->status());
+            delete it;
+          });
+          tmp_thread.join();
+        }
+        ++threads_destroyed;
+      }
+    });
+  }
+
+  for (auto& t : thread_spawning_threads) {
+    t.join();
+  }
+  flushing_thread.join();
+  fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n",
+          flushes_done.load(), threads_destroyed.load());
+}
+
+TEST_F(DBTest, LargeBlockSizeTest) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(0, "foo", "bar"));
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 8LL * 1024 * 1024 * 1024LL;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, CreationTimeOfOldestFile) {
+  const int kNumKeysPerFile = 32;
+  const int kNumLevelFiles = 2;
+  const int kValueSize = 100;
+
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  env_->SetMockSleep();
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  bool set_file_creation_time_to_zero = true;
+  int idx = 0;
+
+  int64_t time_1 = 0;
+  env_->GetCurrentTime(&time_1);
+  const uint64_t uint_time_1 = static_cast<uint64_t>(time_1);
+
+  // Add 50 hours
+  env_->MockSleepForSeconds(50 * 60 * 60);
+
+  int64_t time_2 = 0;
+  env_->GetCurrentTime(&time_2);
+  const uint64_t uint_time_2 = static_cast<uint64_t>(time_2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+        TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+        if (set_file_creation_time_to_zero) {
+          if (idx == 0) {
+            props->file_creation_time = 0;
+            idx++;
+          } else if (idx == 1) {
+            props->file_creation_time = uint_time_1;
+            idx = 0;
+          }
+        } else {
+          if (idx == 0) {
+            props->file_creation_time = uint_time_1;
+            idx++;
+          } else if (idx == 1) {
+            props->file_creation_time = uint_time_2;
+          }
+        }
+      });
+  // Set file creation time in manifest all to 0.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FileMetaData::FileMetaData", [&](void* arg) {
+        FileMetaData* meta = static_cast<FileMetaData*>(arg);
+        meta->file_creation_time = 0;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // At this point there should be 2 files, one with file_creation_time = 0 and
+  // the other non-zero. GetCreationTimeOfOldestFile API should return 0.
+  uint64_t creation_time;
+  Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time);
+  ASSERT_EQ(0, creation_time);
+  ASSERT_EQ(s1, Status::OK());
+
+  // Testing with non-zero file creation time.
+  set_file_creation_time_to_zero = false;
+  options = CurrentOptions();
+  options.max_open_files = -1;
+  options.env = env_;
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < kNumLevelFiles; ++i) {
+    for (int j = 0; j < kNumKeysPerFile; ++j) {
+      ASSERT_OK(
+          Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // At this point there should be 2 files with non-zero file creation time.
+  // GetCreationTimeOfOldestFile API should return non-zero value.
+  uint64_t ctime;
+  Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+  ASSERT_EQ(uint_time_1, ctime);
+  ASSERT_EQ(s2, Status::OK());
+
+  // Testing with max_open_files != -1
+  options = CurrentOptions();
+  options.max_open_files = 10;
+  DestroyAndReopen(options);
+  Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+  ASSERT_EQ(s3, Status::NotSupported());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) {
+  Options options = CurrentOptions();
+  options.max_write_buffer_size_to_maintain = 10000;
+  options.write_buffer_size = 160000;
+  Reopen(options);
+  Random rnd(301);
+  bool memory_limit_exceeded = false;
+
+  ColumnFamilyData* cfd =
+      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+  for (int i = 0; i < 1000; i++) {
+    std::string value = rnd.RandomString(1000);
+    ASSERT_OK(Put("keykey_" + std::to_string(i), value));
+
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+    const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage();
+    const uint64_t size_all_mem_table =
+        cur_active_mem + cfd->imm()->ApproximateMemoryUsage();
+
+    // Errors out if memory usage keeps on increasing beyond the limit.
+    // Once memory limit exceeds,  memory_limit_exceeded  is set and if
+    // size_all_mem_table doesn't drop out in the next write then it errors out
+    // (not expected behaviour). If memory usage drops then
+    // memory_limit_exceeded is set to false.
+    if ((size_all_mem_table > cur_active_mem) &&
+        (cur_active_mem >=
+         static_cast<uint64_t>(options.max_write_buffer_size_to_maintain)) &&
+        (size_all_mem_table >
+         static_cast<uint64_t>(options.max_write_buffer_size_to_maintain) +
+             options.write_buffer_size)) {
+      ASSERT_FALSE(memory_limit_exceeded);
+      memory_limit_exceeded = true;
+    } else {
+      memory_limit_exceeded = false;
+    }
+  }
+}
+
+TEST_F(DBTest, ShuttingDownNotBlockStalledWrites) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  Random rnd(403);
+
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put("key_" + std::to_string(i), rnd.RandomString(10)));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(GetSstFileCount(dbname_), 20);
+
+  // We need !disable_auto_compactions for writes to stall but also want to
+  // delay compaction so stalled writes unblocked due to kShutdownInProgress. BG
+  // compaction will first wait for the sync point
+  // DBTest::ShuttingDownNotBlockStalledWrites. Then it waits extra 2 sec to
+  // allow CancelAllBackgroundWork() to set shutting_down_.
+  SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0",
+      [&](void* /* arg */) { env_->SleepForMicroseconds(2 * 1000 * 1000); });
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::DelayWrite:Wait", "DBTest::ShuttingDownNotBlockStalledWrites"},
+       {"DBTest::ShuttingDownNotBlockStalledWrites",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.level0_stop_writes_trigger = 20;
+  options.disable_auto_compactions = false;
+  Reopen(options);
+
+  std::thread thd([&]() {
+    Status s = Put("key_" + std::to_string(101), "101");
+    ASSERT_EQ(s.code(), Status::kShutdownInProgress);
+  });
+
+  TEST_SYNC_POINT("DBTest::ShuttingDownNotBlockStalledWrites");
+  CancelAllBackgroundWork(db_, true);
+
+  thd.join();
+}
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test2.cc b/src/rocksdb/db/db_test2.cc
new file mode 100644
index 000000000..8adde3680
--- /dev/null
+++ b/src/rocksdb/db/db_test2.cc
@@ -0,0 +1,7652 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "db/read_callback.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/trace_record_result.h"
+#include "rocksdb/utilities/replayer.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTest2 : public DBTestBase {
+ public:
+  DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, OpenForReadOnly) {
+  DB* db_ptr = nullptr;
+  std::string dbname = test::PerThreadDBPath("db_readonly");
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // OpenForReadOnly should fail but will create <dbname> in the file system
+  ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
+  // Since <dbname> is created, we should be able to delete the dir
+  // We first get the list files under <dbname>
+  // There should not be any subdirectories -- this is not checked here
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname, &files));
+  for (auto& f : files) {
+    ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
+  }
+  // <dbname> should be empty now and we should be able to delete it
+  ASSERT_OK(env_->DeleteDir(dbname));
+  options.create_if_missing = false;
+  // OpenForReadOnly should fail since <dbname> was successfully deleted
+  ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
+  // With create_if_missing false, there should not be a dir in the file system
+  ASSERT_NOK(env_->FileExists(dbname));
+}
+
+TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) {
+  DB* db_ptr = nullptr;
+  std::string dbname = test::PerThreadDBPath("db_readonly");
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  column_families.push_back(ColumnFamilyDescriptor("goku", cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  // OpenForReadOnly should fail but will create <dbname> in the file system
+  ASSERT_NOK(
+      DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
+  // Since <dbname> is created, we should be able to delete the dir
+  // We first get the list files under <dbname>
+  // There should not be any subdirectories -- this is not checked here
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname, &files));
+  for (auto& f : files) {
+    ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
+  }
+  // <dbname> should be empty now and we should be able to delete it
+  ASSERT_OK(env_->DeleteDir(dbname));
+  options.create_if_missing = false;
+  // OpenForReadOnly should fail since <dbname> was successfully deleted
+  ASSERT_NOK(
+      DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
+  // With create_if_missing false, there should not be a dir in the file system
+  ASSERT_NOK(env_->FileExists(dbname));
+}
+
+class TestReadOnlyWithCompressedCache
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+  TestReadOnlyWithCompressedCache()
+      : DBTestBase("test_readonly_with_compressed_cache",
+                   /*env_do_fsync=*/true) {
+    max_open_files_ = std::get<0>(GetParam());
+    use_mmap_ = std::get<1>(GetParam());
+  }
+  int max_open_files_;
+  bool use_mmap_;
+};
+
+TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) {
+  if (use_mmap_ && !IsMemoryMappedAccessSupported()) {
+    ROCKSDB_GTEST_SKIP("Test requires MMAP support");
+    return;
+  }
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo2", "barbarbarbarbarbarbarbar"));
+  ASSERT_OK(Flush());
+
+  DB* db_ptr = nullptr;
+  Options options = CurrentOptions();
+  options.allow_mmap_reads = use_mmap_;
+  options.max_open_files = max_open_files_;
+  options.compression = kSnappyCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.statistics = CreateDBStatistics();
+
+  ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db_ptr));
+
+  std::string v;
+  ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("bar", v);
+  ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+  ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("bar", v);
+  if (Snappy_Supported()) {
+    if (use_mmap_) {
+      ASSERT_EQ(0,
+                options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+    } else {
+      ASSERT_EQ(1,
+                options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+    }
+  }
+
+  delete db_ptr;
+}
+
+INSTANTIATE_TEST_CASE_P(TestReadOnlyWithCompressedCache,
+                        TestReadOnlyWithCompressedCache,
+                        ::testing::Combine(::testing::Values(-1, 100),
+                                           ::testing::Bool()));
+
+class PartitionedIndexTestListener : public EventListener {
+ public:
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    ASSERT_GT(info.table_properties.index_partitions, 1);
+    ASSERT_EQ(info.table_properties.index_key_is_user_key, 0);
+  }
+};
+
+TEST_F(DBTest2, PartitionedIndexUserToInternalKey) {
+  const int kValueSize = 10500;
+  const int kNumEntriesPerFile = 1000;
+  const int kNumFiles = 3;
+  const int kNumDistinctKeys = 30;
+
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+  PartitionedIndexTestListener* listener = new PartitionedIndexTestListener();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.listeners.emplace_back(listener);
+  std::vector<const Snapshot*> snapshots;
+  Reopen(options);
+  Random rnd(301);
+
+  for (int i = 0; i < kNumFiles; i++) {
+    for (int j = 0; j < kNumEntriesPerFile; j++) {
+      int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys;
+      std::string value = rnd.RandomString(kValueSize);
+      ASSERT_OK(Put("keykey_" + std::to_string(key_id), value));
+      snapshots.push_back(db_->GetSnapshot());
+    }
+    ASSERT_OK(Flush());
+  }
+
+  for (auto s : snapshots) {
+    db_->ReleaseSnapshot(s);
+  }
+}
+
+#endif  // ROCKSDB_LITE
+
+class PrefixFullBloomWithReverseComparator
+    : public DBTestBase,
+      public ::testing::WithParamInterface<bool> {
+ public:
+  PrefixFullBloomWithReverseComparator()
+      : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {}
+  void SetUp() override { if_cache_filter_ = GetParam(); }
+  bool if_cache_filter_;
+};
+
+TEST_P(PrefixFullBloomWithReverseComparator,
+       PrefixFullBloomWithReverseComparator) {
+  Options options = last_options_;
+  options.comparator = ReverseBytewiseComparator();
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  if (if_cache_filter_) {
+    bbto.no_block_cache = false;
+    bbto.cache_index_and_filter_blocks = true;
+    bbto.block_cache = NewLRUCache(1);
+  }
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));
+
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+
+  if (bbto.block_cache) {
+    bbto.block_cache->EraseUnRefEntries();
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  iter->Seek("bar345");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar234", iter->key().ToString());
+  ASSERT_EQ("foo2", iter->value().ToString());
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar123", iter->key().ToString());
+  ASSERT_EQ("foo", iter->value().ToString());
+
+  iter->Seek("foo234");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo123", iter->key().ToString());
+  ASSERT_EQ("foo3", iter->value().ToString());
+
+  iter->Seek("bar");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+}
+
+INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
+                        PrefixFullBloomWithReverseComparator, testing::Bool());
+
+TEST_F(DBTest2, IteratorPropertyVersionNumber) {
+  ASSERT_OK(Put("", ""));
+  Iterator* iter1 = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter1->status());
+  std::string prop_value;
+  ASSERT_OK(
+      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number1 =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+
+  Iterator* iter2 = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter2->status());
+  ASSERT_OK(
+      iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number2 =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+  ASSERT_GT(version_number2, version_number1);
+
+  ASSERT_OK(Put("", ""));
+
+  Iterator* iter3 = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter3->status());
+  ASSERT_OK(
+      iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number3 =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+  ASSERT_EQ(version_number2, version_number3);
+
+  iter1->SeekToFirst();
+  ASSERT_OK(
+      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+  uint64_t version_number1_new =
+      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+  ASSERT_EQ(version_number1, version_number1_new);
+
+  delete iter1;
+  delete iter2;
+  delete iter3;
+}
+
+TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "a", "begin"));
+  ASSERT_OK(Put(1, "z", "end"));
+  ASSERT_OK(Flush(1));
+  TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  std::string value;
+  value = Get(1, "a");
+}
+
+TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.max_successive_merges = 3;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("poi", "Finch"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root"));
+  options.max_successive_merges = 2;
+  Reopen(options);
+}
+
+#ifndef ROCKSDB_LITE
+class DBTestSharedWriteBufferAcrossCFs
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  DBTestSharedWriteBufferAcrossCFs()
+      : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {}
+  void SetUp() override {
+    use_old_interface_ = std::get<0>(GetParam());
+    cost_cache_ = std::get<1>(GetParam());
+  }
+  bool use_old_interface_;
+  bool cost_cache_;
+};
+
+TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  auto flush_listener = std::make_shared<FlushCounterListener>();
+  options.listeners.push_back(flush_listener);
+  // Don't trip the listener at shutdown.
+  options.avoid_flush_during_shutdown = true;
+
+  // Avoid undeterministic value by malloc_usable_size();
+  // Force arena block size to 1
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::Arena:0", [&](void* arg) {
+        size_t* block_size = static_cast<size_t*>(arg);
+        *block_size = 1;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::AllocateNewBlock:0", [&](void* arg) {
+        std::pair<size_t*, size_t*>* pair =
+            static_cast<std::pair<size_t*, size_t*>*>(arg);
+        *std::get<0>(*pair) = *std::get<1>(*pair);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // The total soft write buffer size is about 105000
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+
+  if (use_old_interface_) {
+    options.db_write_buffer_size = 120000;  // this is the real limit
+  } else if (!cost_cache_) {
+    options.write_buffer_manager.reset(new WriteBufferManager(114285));
+  } else {
+    options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
+  }
+  options.write_buffer_size = 500000;  // this is never hit
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  std::function<void()> wait_flush = [&]() {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  };
+
+  // Create some data and flush "default" and "nikitich" so that they
+  // are newer CFs created.
+  flush_listener->expected_flush_reason = FlushReason::kManualFlush;
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  Flush(3);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  Flush(0);
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+            static_cast<uint64_t>(1));
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+            static_cast<uint64_t>(1));
+
+  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
+  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+  if (cost_cache_) {
+    ASSERT_GE(cache->GetUsage(), 256 * 1024);
+    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+  }
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
+  if (cost_cache_) {
+    ASSERT_GE(cache->GetUsage(), 256 * 1024);
+    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+  }
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  // No flush should trigger
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+
+  // Trigger a flush. Flushing "nikitich".
+  ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Without hitting the threshold, no flush should trigger.
+  ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Hit the write buffer limit again. "default"
+  // will have been flushed.
+  ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
+  wait_flush();
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  // Trigger another flush. This time "dobrynia". "pikachu" should not
+  // be flushed, althrough it was never flushed.
+  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
+  wait_flush();
+  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+  if (cost_cache_) {
+    ASSERT_GE(cache->GetUsage(), 256 * 1024);
+    Close();
+    options.write_buffer_manager.reset();
+    last_options_.write_buffer_manager.reset();
+    ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
+                        DBTestSharedWriteBufferAcrossCFs,
+                        ::testing::Values(std::make_tuple(true, false),
+                                          std::make_tuple(false, false),
+                                          std::make_tuple(false, true)));
+
+TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
+  std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  auto flush_listener = std::make_shared<FlushCounterListener>();
+  options.listeners.push_back(flush_listener);
+  // Don't trip the listener at shutdown.
+  options.avoid_flush_during_shutdown = true;
+  // Avoid undeterministic value by malloc_usable_size();
+  // Force arena block size to 1
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::Arena:0", [&](void* arg) {
+        size_t* block_size = static_cast<size_t*>(arg);
+        *block_size = 1;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Arena::AllocateNewBlock:0", [&](void* arg) {
+        std::pair<size_t*, size_t*>* pair =
+            static_cast<std::pair<size_t*, size_t*>*>(arg);
+        *std::get<0>(*pair) = *std::get<1>(*pair);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  options.write_buffer_size = 500000;  // this is never hit
+  // Use a write buffer total size so that the soft limit is about
+  // 105000.
+  options.write_buffer_manager.reset(new WriteBufferManager(120000));
+  CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+  ASSERT_OK(DestroyDB(dbname2, options));
+  DB* db2 = nullptr;
+  ASSERT_OK(DB::Open(options, dbname2, &db2));
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  std::function<void()> wait_flush = [&]() {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+    ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db2)->TEST_WaitForBackgroundWork());
+  };
+
+  // Trigger a flush on cf2
+  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
+  ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
+  wait_flush();
+  ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
+  wait_flush();
+
+  // Insert to DB2
+  ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
+  wait_flush();
+
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
+                  GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
+                  GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+              static_cast<uint64_t>(0));
+  }
+
+  // Triggering to flush another CF in DB1
+  ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
+  wait_flush();
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  wait_flush();
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+              static_cast<uint64_t>(0));
+  }
+
+  // Triggering flush in DB2.
+  ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
+  wait_flush();
+  ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
+  wait_flush();
+  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+              static_cast<uint64_t>(0));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+              static_cast<uint64_t>(1));
+  }
+
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  std::shared_ptr<Cache> cache = NewLRUCache(LRUCacheOptions(
+      10000000 /* capacity */, 1 /* num_shard_bits */,
+      false /* strict_capacity_limit */, 0.0 /* high_pri_pool_ratio */,
+      nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+      kDontChargeCacheMetadata));
+
+  options.write_buffer_size = 50000;  // this is never hit
+  // Use a write buffer total size so that the soft limit is about
+  // 105000.
+  options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  // One dummy entry is 256KB.
+  ASSERT_GT(cache->GetUsage(), 128000);
+}
+
+namespace {
+void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
+                          const std::vector<Slice>& keys_must_not_exist) {
+  // Ensure that expected keys exist
+  std::vector<std::string> values;
+  if (keys_must_exist.size() > 0) {
+    std::vector<Status> status_list =
+        db->MultiGet(ReadOptions(), keys_must_exist, &values);
+    for (size_t i = 0; i < keys_must_exist.size(); i++) {
+      ASSERT_OK(status_list[i]);
+    }
+  }
+
+  // Ensure that given keys don't exist
+  if (keys_must_not_exist.size() > 0) {
+    std::vector<Status> status_list =
+        db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
+    for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
+      ASSERT_TRUE(status_list[i].IsNotFound());
+    }
+  }
+}
+
+}  // anonymous namespace
+
+TEST_F(DBTest2, WalFilterTest) {
+  class TestWalFilter : public WalFilter {
+   private:
+    // Processing option that is requested to be applied at the given index
+    WalFilter::WalProcessingOption wal_processing_option_;
+    // Index at which to apply wal_processing_option_
+    // At other indexes default wal_processing_option::kContinueProcessing is
+    // returned.
+    size_t apply_option_at_record_index_;
+    // Current record index, incremented with each record encountered.
+    size_t current_record_index_;
+
+   public:
+    TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
+                  size_t apply_option_for_record_index)
+        : wal_processing_option_(wal_processing_option),
+          apply_option_at_record_index_(apply_option_for_record_index),
+          current_record_index_(0) {}
+
+    WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+                                  WriteBatch* /*new_batch*/,
+                                  bool* /*batch_changed*/) const override {
+      WalFilter::WalProcessingOption option_to_return;
+
+      if (current_record_index_ == apply_option_at_record_index_) {
+        option_to_return = wal_processing_option_;
+      } else {
+        option_to_return = WalProcessingOption::kContinueProcessing;
+      }
+
+      // Filter is passed as a const object for RocksDB to not modify the
+      // object, however we modify it for our own purpose here and hence
+      // cast the constness away.
+      (const_cast<TestWalFilter*>(this)->current_record_index_)++;
+
+      return option_to_return;
+    }
+
+    const char* Name() const override { return "TestWalFilter"; }
+  };
+
+  // Create 3 batches with two keys each
+  std::vector<std::vector<std::string>> batch_keys(3);
+
+  batch_keys[0].push_back("key1");
+  batch_keys[0].push_back("key2");
+  batch_keys[1].push_back("key3");
+  batch_keys[1].push_back("key4");
+  batch_keys[2].push_back("key5");
+  batch_keys[2].push_back("key6");
+
+  // Test with all WAL processing options
+  for (int option = 0;
+       option < static_cast<int>(
+                    WalFilter::WalProcessingOption::kWalProcessingOptionMax);
+       option++) {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Write given keys in given batches
+    for (size_t i = 0; i < batch_keys.size(); i++) {
+      WriteBatch batch;
+      for (size_t j = 0; j < batch_keys[i].size(); j++) {
+        ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+      }
+      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+    }
+
+    WalFilter::WalProcessingOption wal_processing_option =
+        static_cast<WalFilter::WalProcessingOption>(option);
+
+    // Create a test filter that would apply wal_processing_option at the first
+    // record
+    size_t apply_option_for_record_index = 1;
+    TestWalFilter test_wal_filter(wal_processing_option,
+                                  apply_option_for_record_index);
+
+    // Reopen database with option to use WAL filter
+    options = OptionsForLogIterTest();
+    options.wal_filter = &test_wal_filter;
+    Status status =
+        TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+    if (wal_processing_option ==
+        WalFilter::WalProcessingOption::kCorruptedRecord) {
+      ASSERT_NOK(status);
+      // In case of corruption we can turn off paranoid_checks to reopen
+      // databse
+      options.paranoid_checks = false;
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    } else {
+      ASSERT_OK(status);
+    }
+
+    // Compute which keys we expect to be found
+    // and which we expect not to be found after recovery.
+    std::vector<Slice> keys_must_exist;
+    std::vector<Slice> keys_must_not_exist;
+    switch (wal_processing_option) {
+      case WalFilter::WalProcessingOption::kCorruptedRecord:
+      case WalFilter::WalProcessingOption::kContinueProcessing: {
+        fprintf(stderr, "Testing with complete WAL processing\n");
+        // we expect all records to be processed
+        for (size_t i = 0; i < batch_keys.size(); i++) {
+          for (size_t j = 0; j < batch_keys[i].size(); j++) {
+            keys_must_exist.push_back(Slice(batch_keys[i][j]));
+          }
+        }
+        break;
+      }
+      case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
+        fprintf(stderr,
+                "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
+                apply_option_for_record_index);
+        // We expect the record with apply_option_for_record_index to be not
+        // found.
+        for (size_t i = 0; i < batch_keys.size(); i++) {
+          for (size_t j = 0; j < batch_keys[i].size(); j++) {
+            if (i == apply_option_for_record_index) {
+              keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+            } else {
+              keys_must_exist.push_back(Slice(batch_keys[i][j]));
+            }
+          }
+        }
+        break;
+      }
+      case WalFilter::WalProcessingOption::kStopReplay: {
+        fprintf(stderr,
+                "Testing with stopping replay from record %" ROCKSDB_PRIszt
+                "\n",
+                apply_option_for_record_index);
+        // We expect records beyond apply_option_for_record_index to be not
+        // found.
+        for (size_t i = 0; i < batch_keys.size(); i++) {
+          for (size_t j = 0; j < batch_keys[i].size(); j++) {
+            if (i >= apply_option_for_record_index) {
+              keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+            } else {
+              keys_must_exist.push_back(Slice(batch_keys[i][j]));
+            }
+          }
+        }
+        break;
+      }
+      default:
+        FAIL();  // unhandled case
+    }
+
+    bool checked_after_reopen = false;
+
+    while (true) {
+      // Ensure that expected keys exists
+      // and not expected keys don't exist after recovery
+      ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+      if (checked_after_reopen) {
+        break;
+      }
+
+      // reopen database again to make sure previous log(s) are not used
+      //(even if they were skipped)
+      // reopn database with option to use WAL filter
+      options = OptionsForLogIterTest();
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+      checked_after_reopen = true;
+    }
+  }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
+  class ChangeBatchHandler : public WriteBatch::Handler {
+   private:
+    // Batch to insert keys in
+    WriteBatch* new_write_batch_;
+    // Number of keys to add in the new batch
+    size_t num_keys_to_add_in_new_batch_;
+    // Number of keys added to new batch
+    size_t num_keys_added_;
+
+   public:
+    ChangeBatchHandler(WriteBatch* new_write_batch,
+                       size_t num_keys_to_add_in_new_batch)
+        : new_write_batch_(new_write_batch),
+          num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+          num_keys_added_(0) {}
+    void Put(const Slice& key, const Slice& value) override {
+      if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
+        ASSERT_OK(new_write_batch_->Put(key, value));
+        ++num_keys_added_;
+      }
+    }
+  };
+
+  class TestWalFilterWithChangeBatch : public WalFilter {
+   private:
+    // Index at which to start changing records
+    size_t change_records_from_index_;
+    // Number of keys to add in the new batch
+    size_t num_keys_to_add_in_new_batch_;
+    // Current record index, incremented with each record encountered.
+    size_t current_record_index_;
+
+   public:
+    TestWalFilterWithChangeBatch(size_t change_records_from_index,
+                                 size_t num_keys_to_add_in_new_batch)
+        : change_records_from_index_(change_records_from_index),
+          num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+          current_record_index_(0) {}
+
+    WalProcessingOption LogRecord(const WriteBatch& batch,
+                                  WriteBatch* new_batch,
+                                  bool* batch_changed) const override {
+      if (current_record_index_ >= change_records_from_index_) {
+        ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
+        Status s = batch.Iterate(&handler);
+        if (s.ok()) {
+          *batch_changed = true;
+        } else {
+          assert(false);
+        }
+      }
+
+      // Filter is passed as a const object for RocksDB to not modify the
+      // object, however we modify it for our own purpose here and hence
+      // cast the constness away.
+      (const_cast<TestWalFilterWithChangeBatch*>(this)
+           ->current_record_index_)++;
+
+      return WalProcessingOption::kContinueProcessing;
+    }
+
+    const char* Name() const override { return "TestWalFilterWithChangeBatch"; }
+  };
+
+  std::vector<std::vector<std::string>> batch_keys(3);
+
+  batch_keys[0].push_back("key1");
+  batch_keys[0].push_back("key2");
+  batch_keys[1].push_back("key3");
+  batch_keys[1].push_back("key4");
+  batch_keys[2].push_back("key5");
+  batch_keys[2].push_back("key6");
+
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+    }
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+  }
+
+  // Create a test filter that would apply wal_processing_option at the first
+  // record
+  size_t change_records_from_index = 1;
+  size_t num_keys_to_add_in_new_batch = 1;
+  TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
+      change_records_from_index, num_keys_to_add_in_new_batch);
+
+  // Reopen database with option to use WAL filter
+  options = OptionsForLogIterTest();
+  options.wal_filter = &test_wal_filter_with_change_batch;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Ensure that all keys exist before change_records_from_index_
+  // And after that index only single key exists
+  // as our filter adds only single key for each batch
+  std::vector<Slice> keys_must_exist;
+  std::vector<Slice> keys_must_not_exist;
+
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
+        keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+      } else {
+        keys_must_exist.push_back(Slice(batch_keys[i][j]));
+      }
+    }
+  }
+
+  bool checked_after_reopen = false;
+
+  while (true) {
+    // Ensure that expected keys exists
+    // and not expected keys don't exist after recovery
+    ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+    if (checked_after_reopen) {
+      break;
+    }
+
+    // reopen database again to make sure previous log(s) are not used
+    //(even if they were skipped)
+    // reopn database with option to use WAL filter
+    options = OptionsForLogIterTest();
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+    checked_after_reopen = true;
+  }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
+  class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
+   public:
+    WalProcessingOption LogRecord(const WriteBatch& batch,
+                                  WriteBatch* new_batch,
+                                  bool* batch_changed) const override {
+      *new_batch = batch;
+      Status s = new_batch->Put("key_extra", "value_extra");
+      if (s.ok()) {
+        *batch_changed = true;
+      } else {
+        assert(false);
+      }
+      return WalProcessingOption::kContinueProcessing;
+    }
+
+    const char* Name() const override {
+      return "WalFilterTestWithChangeBatchExtraKeys";
+    }
+  };
+
+  std::vector<std::vector<std::string>> batch_keys(3);
+
+  batch_keys[0].push_back("key1");
+  batch_keys[0].push_back("key2");
+  batch_keys[1].push_back("key3");
+  batch_keys[1].push_back("key4");
+  batch_keys[2].push_back("key5");
+  batch_keys[2].push_back("key6");
+
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+    }
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+  }
+
+  // Create a test filter that would add extra keys
+  TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;
+
+  // Reopen database with option to use WAL filter
+  options = OptionsForLogIterTest();
+  options.wal_filter = &test_wal_filter_extra_keys;
+  Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(status.IsNotSupported());
+
+  // Reopen without filter, now reopen should succeed - previous
+  // attempt to open must not have altered the db.
+  options = OptionsForLogIterTest();
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  std::vector<Slice> keys_must_exist;
+  std::vector<Slice> keys_must_not_exist;  // empty vector
+
+  for (size_t i = 0; i < batch_keys.size(); i++) {
+    for (size_t j = 0; j < batch_keys[i].size(); j++) {
+      keys_must_exist.push_back(Slice(batch_keys[i][j]));
+    }
+  }
+
+  ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+}
+
+TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
+  class TestWalFilterWithColumnFamilies : public WalFilter {
+   private:
+    // column_family_id -> log_number map (provided to WALFilter)
+    std::map<uint32_t, uint64_t> cf_log_number_map_;
+    // column_family_name -> column_family_id map (provided to WALFilter)
+    std::map<std::string, uint32_t> cf_name_id_map_;
+    // column_family_name -> keys_found_in_wal map
+    // We store keys that are applicable to the column_family
+    // during recovery (i.e. aren't already flushed to SST file(s))
+    // for verification against the keys we expect.
+    std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;
+
+   public:
+    void ColumnFamilyLogNumberMap(
+        const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+        const std::map<std::string, uint32_t>& cf_name_id_map) override {
+      cf_log_number_map_ = cf_lognumber_map;
+      cf_name_id_map_ = cf_name_id_map;
+    }
+
+    WalProcessingOption LogRecordFound(unsigned long long log_number,
+                                       const std::string& /*log_file_name*/,
+                                       const WriteBatch& batch,
+                                       WriteBatch* /*new_batch*/,
+                                       bool* /*batch_changed*/) override {
+      class LogRecordBatchHandler : public WriteBatch::Handler {
+       private:
+        const std::map<uint32_t, uint64_t>& cf_log_number_map_;
+        std::map<uint32_t, std::vector<std::string>>& cf_wal_keys_;
+        unsigned long long log_number_;
+
+       public:
+        LogRecordBatchHandler(
+            unsigned long long current_log_number,
+            const std::map<uint32_t, uint64_t>& cf_log_number_map,
+            std::map<uint32_t, std::vector<std::string>>& cf_wal_keys)
+            : cf_log_number_map_(cf_log_number_map),
+              cf_wal_keys_(cf_wal_keys),
+              log_number_(current_log_number) {}
+
+        Status PutCF(uint32_t column_family_id, const Slice& key,
+                     const Slice& /*value*/) override {
+          auto it = cf_log_number_map_.find(column_family_id);
+          assert(it != cf_log_number_map_.end());
+          unsigned long long log_number_for_cf = it->second;
+          // If the current record is applicable for column_family_id
+          // (i.e. isn't flushed to SST file(s) for column_family_id)
+          // add it to the cf_wal_keys_ map for verification.
+          if (log_number_ >= log_number_for_cf) {
+            cf_wal_keys_[column_family_id].push_back(
+                std::string(key.data(), key.size()));
+          }
+          return Status::OK();
+        }
+      } handler(log_number, cf_log_number_map_, cf_wal_keys_);
+
+      Status s = batch.Iterate(&handler);
+      if (!s.ok()) {
+        // TODO(AR) is this ok?
+        return WalProcessingOption::kCorruptedRecord;
+      }
+
+      return WalProcessingOption::kContinueProcessing;
+    }
+
+    const char* Name() const override {
+      return "WalFilterTestWithColumnFamilies";
+    }
+
+    const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
+      return cf_wal_keys_;
+    }
+
+    const std::map<std::string, uint32_t>& GetColumnFamilyNameIdMap() {
+      return cf_name_id_map_;
+    }
+  };
+
+  std::vector<std::vector<std::string>> batch_keys_pre_flush(3);
+
+  batch_keys_pre_flush[0].push_back("key1");
+  batch_keys_pre_flush[0].push_back("key2");
+  batch_keys_pre_flush[1].push_back("key3");
+  batch_keys_pre_flush[1].push_back("key4");
+  batch_keys_pre_flush[2].push_back("key5");
+  batch_keys_pre_flush[2].push_back("key6");
+
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+      ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j],
+                          DummyString(1024)));
+      ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j],
+                          DummyString(1024)));
+    }
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+  }
+
+  // Flush default column-family
+  ASSERT_OK(db_->Flush(FlushOptions(), handles_[0]));
+
+  // Do some more writes
+  std::vector<std::vector<std::string>> batch_keys_post_flush(3);
+
+  batch_keys_post_flush[0].push_back("key7");
+  batch_keys_post_flush[0].push_back("key8");
+  batch_keys_post_flush[1].push_back("key9");
+  batch_keys_post_flush[1].push_back("key10");
+  batch_keys_post_flush[2].push_back("key11");
+  batch_keys_post_flush[2].push_back("key12");
+
+  // Write given keys in given batches
+  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+      ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j],
+                          DummyString(1024)));
+      ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j],
+                          DummyString(1024)));
+    }
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+  }
+
+  // On Recovery we should only find the second batch applicable to default CF
+  // But both batches applicable to pikachu CF
+
+  // Create a test filter that would add extra keys
+  TestWalFilterWithColumnFamilies test_wal_filter_column_families;
+
+  // Reopen database with option to use WAL filter
+  options = OptionsForLogIterTest();
+  options.wal_filter = &test_wal_filter_column_families;
+  Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(status.ok());
+
+  // verify that handles_[0] only has post_flush keys
+  // while handles_[1] has pre and post flush keys
+  auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
+  auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
+  size_t index = 0;
+  auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
+  // default column-family, only post_flush keys are expected
+  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+      Slice key_from_the_log(keys_cf[index++]);
+      Slice batch_key(batch_keys_post_flush[i][j]);
+      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+    }
+  }
+  ASSERT_EQ(index, keys_cf.size());
+
+  index = 0;
+  keys_cf = cf_wal_keys[name_id_map["pikachu"]];
+  // pikachu column-family, all keys are expected
+  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+      Slice key_from_the_log(keys_cf[index++]);
+      Slice batch_key(batch_keys_pre_flush[i][j]);
+      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+    }
+  }
+
+  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+      Slice key_from_the_log(keys_cf[index++]);
+      Slice batch_key(batch_keys_post_flush[i][j]);
+      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+    }
+  }
+  ASSERT_EQ(index, keys_cf.size());
+}
+
+TEST_F(DBTest2, PresetCompressionDict) {
+  // Verifies that compression ratio improves when dictionary is enabled, and
+  // improves even further when the dictionary is trained by ZSTD.
+  const size_t kBlockSizeBytes = 4 << 10;
+  const size_t kL0FileBytes = 128 << 10;
+  const size_t kApproxPerBlockOverheadBytes = 50;
+  const int kNumL0Files = 5;
+
+  Options options;
+  // Make sure to use any custom env that the test is configured with.
+  options.env = CurrentOptions().env;
+  options.allow_concurrent_memtable_write = false;
+  options.arena_block_size = kBlockSizeBytes;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
+  options.num_levels = 2;
+  options.target_file_size_base = kL0FileBytes;
+  options.target_file_size_multiplier = 2;
+  options.write_buffer_size = kL0FileBytes;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = kBlockSizeBytes;
+  std::vector<CompressionType> compression_types;
+  if (Zlib_Supported()) {
+    compression_types.push_back(kZlibCompression);
+  }
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  compression_types.push_back(kLZ4Compression);
+  compression_types.push_back(kLZ4HCCompression);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (ZSTD_Supported()) {
+    compression_types.push_back(kZSTD);
+  }
+
+  enum DictionaryTypes : int {
+    kWithoutDict,
+    kWithDict,
+    kWithZSTDfinalizeDict,
+    kWithZSTDTrainedDict,
+    kDictEnd,
+  };
+
+  for (auto compression_type : compression_types) {
+    options.compression = compression_type;
+    size_t bytes_without_dict = 0;
+    size_t bytes_with_dict = 0;
+    size_t bytes_with_zstd_finalize_dict = 0;
+    size_t bytes_with_zstd_trained_dict = 0;
+    for (int i = kWithoutDict; i < kDictEnd; i++) {
+      // First iteration: compress without preset dictionary
+      // Second iteration: compress with preset dictionary
+      // Third iteration (zstd only): compress with zstd-trained dictionary
+      //
+      // To make sure the compression dictionary has the intended effect, we
+      // verify the compressed size is smaller in successive iterations. Also in
+      // the non-first iterations, verify the data we get out is the same data
+      // we put in.
+      switch (i) {
+        case kWithoutDict:
+          options.compression_opts.max_dict_bytes = 0;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithDict:
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithZSTDfinalizeDict:
+          if (compression_type != kZSTD ||
+              !ZSTD_FinalizeDictionarySupported()) {
+            continue;
+          }
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+          options.compression_opts.use_zstd_dict_trainer = false;
+          break;
+        case kWithZSTDTrainedDict:
+          if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
+            continue;
+          }
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+          options.compression_opts.use_zstd_dict_trainer = true;
+          break;
+        default:
+          assert(false);
+      }
+
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      CreateAndReopenWithCF({"pikachu"}, options);
+      Random rnd(301);
+      std::string seq_datas[10];
+      for (int j = 0; j < 10; ++j) {
+        seq_datas[j] =
+            rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
+      }
+
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      for (int j = 0; j < kNumL0Files; ++j) {
+        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
+          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
+          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
+                        seq_datas[(key_num / 10) % 10]));
+        }
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
+      }
+      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                            true /* disallow_trivial_move */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+      // Get the live sst files size
+      size_t total_sst_bytes = TotalSize(1);
+      if (i == kWithoutDict) {
+        bytes_without_dict = total_sst_bytes;
+      } else if (i == kWithDict) {
+        bytes_with_dict = total_sst_bytes;
+      } else if (i == kWithZSTDfinalizeDict) {
+        bytes_with_zstd_finalize_dict = total_sst_bytes;
+      } else if (i == kWithZSTDTrainedDict) {
+        bytes_with_zstd_trained_dict = total_sst_bytes;
+      }
+
+      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
+           j++) {
+        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
+      }
+      if (i == kWithDict) {
+        ASSERT_GT(bytes_without_dict, bytes_with_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a finalized
+        // dictionary does not get as good a compression ratio as raw content
+        // dictionary. But using a dictionary should always get better
+        // compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
+                    bytes_without_dict > bytes_with_zstd_finalize_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a trained
+        // dictionary does not get as good a compression ratio as without
+        // training.
+        // But using a dictionary (with or without training) should always get
+        // better compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+                    bytes_without_dict > bytes_with_zstd_trained_dict);
+      }
+
+      DestroyAndReopen(options);
+    }
+  }
+}
+
+TEST_F(DBTest2, PresetCompressionDictLocality) {
+  if (!ZSTD_Supported()) {
+    return;
+  }
+  // Verifies that compression dictionary is generated from local data. The
+  // verification simply checks all output SSTs have different compression
+  // dictionaries. We do not verify effectiveness as that'd likely be flaky in
+  // the future.
+  const int kNumEntriesPerFile = 1 << 10;  // 1KB
+  const int kNumBytesPerEntry = 1 << 10;   // 1KB
+  const int kNumFiles = 4;
+  Options options = CurrentOptions();
+  options.compression = kZSTD;
+  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
+  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumEntriesPerFile; ++j) {
+      ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
+                    rnd.RandomString(kNumBytesPerEntry)));
+    }
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(1);
+    ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
+  }
+
+  // Store all the dictionaries generated during a full compaction.
+  std::vector<std::string> compression_dicts;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+      [&](void* arg) {
+        compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  CompactRangeOptions compact_range_opts;
+  compact_range_opts.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+  // Dictionary compression should not be so good as to compress four totally
+  // random files into one. If it does then there's probably something wrong
+  // with the test.
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  // Furthermore, there should be one compression dictionary generated per file.
+  // And they should all be different from each other.
+  ASSERT_EQ(NumTableFilesAtLevel(1),
+            static_cast<int>(compression_dicts.size()));
+  for (size_t i = 1; i < compression_dicts.size(); ++i) {
+    std::string& a = compression_dicts[i - 1];
+    std::string& b = compression_dicts[i];
+    size_t alen = a.size();
+    size_t blen = b.size();
+    ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
+  }
+}
+
+class PresetCompressionDictTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
+ public:
+  PresetCompressionDictTest()
+      : DBTestBase("db_test2", false /* env_do_fsync */),
+        compression_type_(std::get<0>(GetParam())),
+        bottommost_(std::get<1>(GetParam())) {}
+
+ protected:
+  const CompressionType compression_type_;
+  const bool bottommost_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBTest2, PresetCompressionDictTest,
+    ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
+                       ::testing::Bool()));
+
+TEST_P(PresetCompressionDictTest, Flush) {
+  // Verifies that dictionary is generated and written during flush only when
+  // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
+  // size of the dictionary is within expectations according to the limit on
+  // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (size_t i = 0; i <= kKeysPerFile; ++i) {
+    ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+  // compression dictionary exists since dictionaries would be preloaded when
+  // the flush finishes.
+  if (bottommost_) {
+    // Flush is never considered bottommost. This should change in the future
+    // since flushed files may have nothing underneath them, like the one in
+    // this test case.
+    ASSERT_EQ(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+  } else {
+    ASSERT_GT(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+    // number of bytes needs to be adjusted in case the cached block is in
+    // ZSTD's digested dictionary format.
+    if (compression_type_ != kZSTD &&
+        compression_type_ != kZSTDNotFinalCompression) {
+      // Although we limited buffering to `kBlockLen`, there may be up to two
+      // blocks of data included in the dictionary since we only check limit
+      // after each block is built.
+      ASSERT_LE(TestGetTickerCount(options,
+                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+                2 * kBlockLen);
+    }
+  }
+}
+
+TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
+  // Verifies that dictionary is generated and written during compaction to
+  // non-bottommost level only when `ColumnFamilyOptions::compression` enables
+  // dictionary. Also verifies the size of the dictionary is within expectations
+  // according to the limit on buffering set by
+  // `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (size_t j = 0; j <= kKeysPerFile; ++j) {
+    ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (size_t j = 0; j <= kKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+    }
+    ASSERT_OK(Flush());
+  }
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,0,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+
+  uint64_t prev_compression_dict_bytes_inserted =
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  // This L0->L1 compaction merges the two L0 files into L1. The produced L1
+  // file is not bottommost due to the existing L2 file covering the same key-
+  // range.
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+  // compression dictionary exists since dictionaries would be preloaded when
+  // the compaction finishes.
+  if (bottommost_) {
+    ASSERT_EQ(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        prev_compression_dict_bytes_inserted);
+  } else {
+    ASSERT_GT(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        prev_compression_dict_bytes_inserted);
+    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+    // number of bytes needs to be adjusted in case the cached block is in
+    // ZSTD's digested dictionary format.
+    if (compression_type_ != kZSTD &&
+        compression_type_ != kZSTDNotFinalCompression) {
+      // Although we limited buffering to `kBlockLen`, there may be up to two
+      // blocks of data included in the dictionary since we only check limit
+      // after each block is built.
+      ASSERT_LE(TestGetTickerCount(options,
+                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+                prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+    }
+  }
+}
+
+TEST_P(PresetCompressionDictTest, CompactBottommost) {
+  // Verifies that dictionary is generated and written during compaction to
+  // non-bottommost level only when either `ColumnFamilyOptions::compression` or
+  // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
+  // verifies the size of the dictionary is within expectations according to the
+  // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (size_t j = 0; j <= kKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+    }
+    ASSERT_OK(Flush());
+  }
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+
+  uint64_t prev_compression_dict_bytes_inserted =
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  CompactRangeOptions cro;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+  ASSERT_GT(
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+      prev_compression_dict_bytes_inserted);
+  // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+  // number of bytes needs to be adjusted in case the cached block is in ZSTD's
+  // digested dictionary format.
+  if (compression_type_ != kZSTD &&
+      compression_type_ != kZSTDNotFinalCompression) {
+    // Although we limited buffering to `kBlockLen`, there may be up to two
+    // blocks of data included in the dictionary since we only check limit after
+    // each block is built.
+    ASSERT_LE(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+  }
+}
+
+class CompactionCompressionListener : public EventListener {
+ public:
+  explicit CompactionCompressionListener(Options* db_options)
+      : db_options_(db_options) {}
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+    // Figure out last level with files
+    int bottommost_level = 0;
+    for (int level = 0; level < db->NumberLevels(); level++) {
+      std::string files_at_level;
+      ASSERT_TRUE(
+          db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
+                          &files_at_level));
+      if (files_at_level != "0") {
+        bottommost_level = level;
+      }
+    }
+
+    if (db_options_->bottommost_compression != kDisableCompressionOption &&
+        ci.output_level == bottommost_level) {
+      ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
+    } else if (db_options_->compression_per_level.size() != 0) {
+      ASSERT_EQ(ci.compression,
+                db_options_->compression_per_level[ci.output_level]);
+    } else {
+      ASSERT_EQ(ci.compression, db_options_->compression);
+    }
+    max_level_checked = std::max(max_level_checked, ci.output_level);
+  }
+
+  int max_level_checked = 0;
+  const Options* db_options_;
+};
+
+enum CompressionFailureType {
+  kTestCompressionFail,
+  kTestDecompressionFail,
+  kTestDecompressionCorruption
+};
+
+class CompressionFailuresTest
+    : public DBTest2,
+      public testing::WithParamInterface<std::tuple<
+          CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
+ public:
+  CompressionFailuresTest() {
+    std::tie(compression_failure_type_, compression_type_,
+             compression_max_dict_bytes_, compression_parallel_threads_) =
+        GetParam();
+  }
+
+  CompressionFailureType compression_failure_type_ = kTestCompressionFail;
+  CompressionType compression_type_ = kNoCompression;
+  uint32_t compression_max_dict_bytes_ = 0;
+  uint32_t compression_parallel_threads_ = 0;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBTest2, CompressionFailuresTest,
+    ::testing::Combine(::testing::Values(kTestCompressionFail,
+                                         kTestDecompressionFail,
+                                         kTestDecompressionCorruption),
+                       ::testing::ValuesIn(GetSupportedCompressions()),
+                       ::testing::Values(0, 10), ::testing::Values(1, 4)));
+
+TEST_P(CompressionFailuresTest, CompressionFailures) {
+  if (compression_type_ == kNoCompression) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 7;
+  options.max_background_compactions = 1;
+  options.target_file_size_base = 512;
+
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 512;
+  table_options.verify_compression = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  options.compression = compression_type_;
+  options.compression_opts.parallel_threads = compression_parallel_threads_;
+  options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
+  options.bottommost_compression_opts.parallel_threads =
+      compression_parallel_threads_;
+  options.bottommost_compression_opts.max_dict_bytes =
+      compression_max_dict_bytes_;
+
+  if (compression_failure_type_ == kTestCompressionFail) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "CompressData:TamperWithReturnValue", [](void* arg) {
+          bool* ret = static_cast<bool*>(arg);
+          *ret = false;
+        });
+  } else if (compression_failure_type_ == kTestDecompressionFail) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "UncompressBlockData:TamperWithReturnValue", [](void* arg) {
+          Status* ret = static_cast<Status*>(arg);
+          ASSERT_OK(*ret);
+          *ret = Status::Corruption("kTestDecompressionFail");
+        });
+  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "UncompressBlockData:"
+        "TamperWithDecompressionOutput",
+        [](void* arg) {
+          BlockContents* contents = static_cast<BlockContents*>(arg);
+          // Ensure uncompressed data != original data
+          const size_t len = contents->data.size() + 1;
+          std::unique_ptr<char[]> fake_data(new char[len]());
+          *contents = BlockContents(std::move(fake_data), len);
+        });
+  }
+
+  std::map<std::string, std::string> key_value_written;
+
+  const int kKeySize = 5;
+  const int kValUnitSize = 16;
+  const int kValSize = 256;
+  Random rnd(405);
+
+  Status s = Status::OK();
+
+  DestroyAndReopen(options);
+  // Write 10 random files
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 5; j++) {
+      std::string key = rnd.RandomString(kKeySize);
+      // Ensure good compression ratio
+      std::string valueUnit = rnd.RandomString(kValUnitSize);
+      std::string value;
+      for (int k = 0; k < kValSize; k += kValUnitSize) {
+        value += valueUnit;
+      }
+      s = Put(key, value);
+      if (compression_failure_type_ == kTestCompressionFail) {
+        key_value_written[key] = value;
+        ASSERT_OK(s);
+      }
+    }
+    s = Flush();
+    if (compression_failure_type_ == kTestCompressionFail) {
+      ASSERT_OK(s);
+    }
+    s = dbfull()->TEST_WaitForCompact();
+    if (compression_failure_type_ == kTestCompressionFail) {
+      ASSERT_OK(s);
+    }
+    if (i == 4) {
+      // Make compression fail at the mid of table building
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    }
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  if (compression_failure_type_ == kTestCompressionFail) {
+    // Should be kNoCompression, check content consistency
+    std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+      std::string key = db_iter->key().ToString();
+      std::string value = db_iter->value().ToString();
+      ASSERT_NE(key_value_written.find(key), key_value_written.end());
+      ASSERT_EQ(key_value_written[key], value);
+      key_value_written.erase(key);
+    }
+    ASSERT_EQ(0, key_value_written.size());
+  } else if (compression_failure_type_ == kTestDecompressionFail) {
+    ASSERT_EQ(std::string(s.getState()),
+              "Could not decompress: kTestDecompressionFail");
+  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+    ASSERT_EQ(std::string(s.getState()),
+              "Decompressed block did not match pre-compression block");
+  }
+}
+
+TEST_F(DBTest2, CompressionOptions) {
+  if (!Zlib_Supported() || !Snappy_Supported()) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 100;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 7;
+  options.max_background_compactions = 1;
+
+  CompactionCompressionListener* listener =
+      new CompactionCompressionListener(&options);
+  options.listeners.emplace_back(listener);
+
+  const int kKeySize = 5;
+  const int kValSize = 20;
+  Random rnd(301);
+
+  std::vector<uint32_t> compression_parallel_threads = {1, 4};
+
+  std::map<std::string, std::string> key_value_written;
+
+  for (int iter = 0; iter <= 2; iter++) {
+    listener->max_level_checked = 0;
+
+    if (iter == 0) {
+      // Use different compression algorithms for different levels but
+      // always use Zlib for bottommost level
+      options.compression_per_level = {kNoCompression,     kNoCompression,
+                                       kNoCompression,     kSnappyCompression,
+                                       kSnappyCompression, kSnappyCompression,
+                                       kZlibCompression};
+      options.compression = kNoCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 1) {
+      // Use Snappy except for bottommost level use ZLib
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 2) {
+      // Use Snappy everywhere
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kDisableCompressionOption;
+    }
+
+    for (auto num_threads : compression_parallel_threads) {
+      options.compression_opts.parallel_threads = num_threads;
+      options.bottommost_compression_opts.parallel_threads = num_threads;
+
+      DestroyAndReopen(options);
+      // Write 10 random files
+      for (int i = 0; i < 10; i++) {
+        for (int j = 0; j < 5; j++) {
+          std::string key = rnd.RandomString(kKeySize);
+          std::string value = rnd.RandomString(kValSize);
+          key_value_written[key] = value;
+          ASSERT_OK(Put(key, value));
+        }
+        ASSERT_OK(Flush());
+        ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      }
+
+      // Make sure that we wrote enough to check all 7 levels
+      ASSERT_EQ(listener->max_level_checked, 6);
+
+      // Make sure database content is the same as key_value_written
+      std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+      for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+        std::string key = db_iter->key().ToString();
+        std::string value = db_iter->value().ToString();
+        ASSERT_NE(key_value_written.find(key), key_value_written.end());
+        ASSERT_EQ(key_value_written[key], value);
+        key_value_written.erase(key);
+      }
+      ASSERT_OK(db_iter->status());
+      ASSERT_EQ(0, key_value_written.size());
+    }
+  }
+}
+
+class CompactionStallTestListener : public EventListener {
+ public:
+  CompactionStallTestListener()
+      : compacting_files_cnt_(0), compacted_files_cnt_(0) {}
+
+  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+    compacting_files_cnt_ += ci.input_files.size();
+  }
+
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+    compacted_files_cnt_ += ci.input_files.size();
+  }
+
+  std::atomic<size_t> compacting_files_cnt_;
+  std::atomic<size_t> compacted_files_cnt_;
+};
+
+TEST_F(DBTest2, CompactionStall) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
+       {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
+       {"DBTest2::CompactionStall:2",
+        "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
+       {"DBTest2::CompactionStall:3",
+        "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_background_compactions = 40;
+  CompactionStallTestListener* listener = new CompactionStallTestListener();
+  options.listeners.emplace_back(listener);
+  DestroyAndReopen(options);
+  // make sure all background compaction jobs can be scheduled
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  Random rnd(301);
+
+  // 4 Files in L0
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for compaction to be triggered
+  TEST_SYNC_POINT("DBTest2::CompactionStall:0");
+
+  // Clear "DBImpl::BGWorkCompaction" SYNC_POINT since we want to hold it again
+  // at DBTest2::CompactionStall::1
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  // Another 6 L0 files to trigger compaction again
+  for (int i = 0; i < 6; i++) {
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for another compaction to be triggered
+  TEST_SYNC_POINT("DBTest2::CompactionStall:1");
+
+  // Hold NotifyOnCompactionBegin in the unlock mutex section
+  TEST_SYNC_POINT("DBTest2::CompactionStall:2");
+
+  // Hold NotifyOnCompactionCompleted in the unlock mutex section
+  TEST_SYNC_POINT("DBTest2::CompactionStall:3");
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_LT(NumTableFilesAtLevel(0),
+            options.level0_file_num_compaction_trigger);
+  ASSERT_GT(listener->compacted_files_cnt_.load(),
+            10 - options.level0_file_num_compaction_trigger);
+  ASSERT_EQ(listener->compacting_files_cnt_.load(),
+            listener->compacted_files_cnt_.load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, FirstSnapshotTest) {
+  Options options;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // This snapshot will have sequence number 0 what is expected behaviour.
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
+  ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger flush
+
+  db_->ReleaseSnapshot(s1);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, DuplicateSnapshot) {
+  Options options;
+  options = CurrentOptions(options);
+  std::vector<const Snapshot*> snapshots;
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  SequenceNumber oldest_ww_snap, first_ww_snap;
+
+  ASSERT_OK(Put("k", "v"));  // inc seq
+  snapshots.push_back(db_->GetSnapshot());
+  snapshots.push_back(db_->GetSnapshot());
+  ASSERT_OK(Put("k", "v"));  // inc seq
+  snapshots.push_back(db_->GetSnapshot());
+  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+  first_ww_snap = snapshots.back()->GetSequenceNumber();
+  ASSERT_OK(Put("k", "v"));  // inc seq
+  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+  snapshots.push_back(db_->GetSnapshot());
+  ASSERT_OK(Put("k", "v"));  // inc seq
+  snapshots.push_back(db_->GetSnapshot());
+
+  {
+    InstrumentedMutexLock l(dbi->mutex());
+    auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
+    ASSERT_EQ(seqs.size(), 4);  // duplicates are not counted
+    ASSERT_EQ(oldest_ww_snap, first_ww_snap);
+  }
+
+  for (auto s : snapshots) {
+    db_->ReleaseSnapshot(s);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+class PinL0IndexAndFilterBlocksTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  PinL0IndexAndFilterBlocksTest()
+      : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {}
+  void SetUp() override {
+    infinite_max_files_ = std::get<0>(GetParam());
+    disallow_preload_ = std::get<1>(GetParam());
+  }
+
+  void CreateTwoLevels(Options* options, bool close_afterwards) {
+    if (infinite_max_files_) {
+      options->max_open_files = -1;
+    }
+    options->create_if_missing = true;
+    options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+    table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+    options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+    CreateAndReopenWithCF({"pikachu"}, *options);
+
+    ASSERT_OK(Put(1, "a", "begin"));
+    ASSERT_OK(Put(1, "z", "end"));
+    ASSERT_OK(Flush(1));
+    // move this table to L1
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+
+    // reset block cache
+    table_options.block_cache = NewLRUCache(64 * 1024);
+    options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+    TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
+    // create new table at L0
+    ASSERT_OK(Put(1, "a2", "begin2"));
+    ASSERT_OK(Put(1, "z2", "end2"));
+    ASSERT_OK(Flush(1));
+
+    if (close_afterwards) {
+      Close();  // This ensures that there is no ref to block cache entries
+    }
+    table_options.block_cache->EraseUnRefEntries();
+  }
+
+  bool infinite_max_files_;
+  bool disallow_preload_;
+};
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+       IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
+  Options options = CurrentOptions();
+  if (infinite_max_files_) {
+    options.max_open_files = -1;
+  }
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+
+  // index/filter blocks added to block cache right after table creation.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // only index/filter were added
+  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+  std::string value;
+  // Miss and hit count should remain the same, they're all pinned.
+  ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // Miss and hit count should remain the same, they're all pinned.
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+       MultiLevelIndexAndFilterBlocksCachedWithPinning) {
+  Options options = CurrentOptions();
+  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
+  // get base cache values
+  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+  std::string value;
+  // this should be read from L0
+  // so cache values don't change
+  value = Get(1, "a2");
+  ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // this should be read from L1
+  // the file is opened, prefetching results in a cache filter miss
+  // the block is loaded and added to the cache,
+  // then the get results in a cache hit for L1
+  // When we have inifinite max_files, there is still cache miss because we have
+  // reset the block cache
+  value = Get(1, "a");
+  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
+  Options options = CurrentOptions();
+  // This ensures that db does not ref anything in the block cache, so
+  // EraseUnRefEntries could clear them up.
+  bool close_afterwards = true;
+  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);
+
+  // Get base cache values
+  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+  if (disallow_preload_) {
+    // Now we have two files. We narrow the max open files to allow 3 entries
+    // so that preloading SST files won't happen.
+    options.max_open_files = 13;
+    // RocksDB sanitize max open files to at least 20. Modify it back.
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+          int* max_open_files = static_cast<int*>(arg);
+          *max_open_files = 13;
+        });
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Reopen database. If max_open_files is set as -1, table readers will be
+  // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
+  // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
+  TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  if (!disallow_preload_) {
+    // After reopen, cache miss are increased by one because we read (and only
+    // read) filter and index on L0
+    ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  } else {
+    // If max_open_files is not -1, we do not preload table readers, so there is
+    // no change.
+    ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  }
+  std::string value;
+  // this should be read from L0
+  value = Get(1, "a2");
+  // If max_open_files is -1, we have pinned index and filter in Rep, so there
+  // will not be changes in index and filter misses or hits. If max_open_files
+  // is not -1, Get() will open a TableReader and prefetch index and filter.
+  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // this should be read from L1
+  value = Get(1, "a");
+  if (!disallow_preload_) {
+    // In inifinite max files case, there's a cache miss in executing Get()
+    // because index and filter are not prefetched before.
+    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  } else {
+    // In this case, cache miss will be increased by one in
+    // BlockBasedTable::Open() because this is not in DB::Open() code path so we
+    // will prefetch L1's index and filter. Cache hit will also be increased by
+    // one because Get() will read index and filter from the block cache
+    // prefetched in previous Open() call.
+    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  }
+
+  // Force a full compaction to one single file. There will be a block
+  // cache read for both of index and filter. If prefetch doesn't explicitly
+  // happen, it will happen when verifying the file.
+  Compact(1, "a", "zzzzz");
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  if (!disallow_preload_) {
+    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  } else {
+    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  }
+
+  // Bloom and index hit will happen when a Get() happens.
+  value = Get(1, "a");
+  if (!disallow_preload_) {
+    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  } else {
+    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+    ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+    ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest,
+                        PinL0IndexAndFilterBlocksTest,
+                        ::testing::Values(std::make_tuple(true, false),
+                                          std::make_tuple(false, false),
+                                          std::make_tuple(false, true)));
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, MaxCompactionBytesTest) {
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 200 << 10;
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 100 << 10;
+  // Infinite for full compaction.
+  options.max_compaction_bytes = options.target_file_size_base * 100;
+
+  Reopen(options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < 8; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,0,8", FilesPerLevel(0));
+
+  // When compact from Ln -> Ln+1, cut a file if the file overlaps with
+  // more than three files in Ln+1.
+  options.max_compaction_bytes = options.target_file_size_base * 3;
+  Reopen(options);
+
+  GenerateNewRandomFile(&rnd);
+  // Add three more small files that overlap with the previous file
+  for (int i = 0; i < 3; i++) {
+    ASSERT_OK(Put("a", "z"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Output files to L1 are cut to 4 pieces, according to
+  // options.max_compaction_bytes (300K)
+  // There are 8 files on L2 (grandparents level), each one is 100K. The first
+  // file overlaps with a, b which max_compaction_bytes is less than 300K, the
+  // second one overlaps with d, e, which is also less than 300K. Including any
+  // extra grandparent file will make the future compaction larger than 300K.
+  // L1: [  1  ] [  2 ]  [  3  ] [ 4 ]
+  // L2: [a] [b] [c] [d] [e] [f] [g] [h]
+  ASSERT_EQ("0,4,8", FilesPerLevel(0));
+}
+
+static void UniqueIdCallback(void* arg) {
+  int* result = reinterpret_cast<int*>(arg);
+  if (*result == -1) {
+    *result = 0;
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+}
+
+class MockPersistentCache : public PersistentCache {
+ public:
+  explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
+      : is_compressed_(is_compressed), max_size_(max_size) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+  }
+
+  ~MockPersistentCache() override {}
+
+  PersistentCache::StatsType Stats() override {
+    return PersistentCache::StatsType();
+  }
+
+  uint64_t NewId() override {
+    return last_id_.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  Status Insert(const Slice& page_key, const char* data,
+                const size_t size) override {
+    MutexLock _(&lock_);
+
+    if (size_ > max_size_) {
+      size_ -= data_.begin()->second.size();
+      data_.erase(data_.begin());
+    }
+
+    data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
+    size_ += size;
+    return Status::OK();
+  }
+
+  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+                size_t* size) override {
+    MutexLock _(&lock_);
+    auto it = data_.find(page_key.ToString());
+    if (it == data_.end()) {
+      return Status::NotFound();
+    }
+
+    assert(page_key.ToString() == it->first);
+    data->reset(new char[it->second.size()]);
+    memcpy(data->get(), it->second.c_str(), it->second.size());
+    *size = it->second.size();
+    return Status::OK();
+  }
+
+  bool IsCompressed() override { return is_compressed_; }
+
+  std::string GetPrintableOptions() const override {
+    return "MockPersistentCache";
+  }
+
+  port::Mutex lock_;
+  std::map<std::string, std::string> data_;
+  const bool is_compressed_ = true;
+  size_t size_ = 0;
+  const size_t max_size_ = 10 * 1024;  // 10KiB
+  std::atomic<uint64_t> last_id_{1};
+};
+
+#ifdef OS_LINUX
+// Make sure that in CPU time perf context counters, Env::NowCPUNanos()
+// is used, rather than Env::CPUNanos();
+TEST_F(DBTest2, TestPerfContextGetCpuTime) {
+  // force resizing table cache so table handle is not preloaded so that
+  // we can measure find_table_nanos during Get().
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  env_->now_cpu_count_.store(0);
+  env_->SetMockSleep();
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  // CPU timing is not enabled with kEnableTimeExceptForMutex
+  SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
+  ASSERT_EQ(0, env_->now_cpu_count_.load());
+
+  constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
+  constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;
+
+  // Add time to NowNanos() reading.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::FindTable:0",
+      [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_GT(env_->now_cpu_count_.load(), 2);
+  ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos);
+  ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);
+
+  SetPerfLevel(PerfLevel::kDisable);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestPerfContextIterCpuTime) {
+  DestroyAndReopen(CurrentOptions());
+  // force resizing table cache so table handle is not preloaded so that
+  // we can measure find_table_nanos during iteration
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+
+  const size_t kNumEntries = 10;
+  for (size_t i = 0; i < kNumEntries; ++i) {
+    ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+  for (size_t i = 0; i < kNumEntries; ++i) {
+    ASSERT_EQ("v" + std::to_string(i), Get("k" + std::to_string(i)));
+  }
+  std::string last_key = "k" + std::to_string(kNumEntries - 1);
+  std::string last_value = "v" + std::to_string(kNumEntries - 1);
+  env_->now_cpu_count_.store(0);
+  env_->SetMockSleep();
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  // CPU timing is not enabled with kEnableTimeExceptForMutex
+  SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  iter->Seek("k0");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("v0", iter->value().ToString());
+  iter->SeekForPrev(last_key);
+  ASSERT_TRUE(iter->Valid());
+  iter->SeekToLast();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(last_value, iter->value().ToString());
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("v0", iter->value().ToString());
+  ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("v1", iter->value().ToString());
+  ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos);
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("v0", iter->value().ToString());
+  ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos);
+  ASSERT_EQ(0, env_->now_cpu_count_.load());
+  delete iter;
+
+  constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
+  constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;
+
+  // Add time to NowNanos() reading.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::FindTable:0",
+      [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+  iter = db_->NewIterator(ReadOptions());
+  iter->Seek("k0");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("v0", iter->value().ToString());
+  iter->SeekForPrev(last_key);
+  ASSERT_TRUE(iter->Valid());
+  iter->SeekToLast();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(last_value, iter->value().ToString());
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("v0", iter->value().ToString());
+  ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0);
+  ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos);
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("v1", iter->value().ToString());
+  ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0);
+  ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos);
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ("v0", iter->value().ToString());
+  ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0);
+  ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos);
+  ASSERT_GE(env_->now_cpu_count_.load(), 12);
+  ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);
+
+  SetPerfLevel(PerfLevel::kDisable);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  delete iter;
+}
+#endif  // OS_LINUX
+
+#if !defined OS_SOLARIS
+TEST_F(DBTest2, PersistentCache) {
+  int num_iter = 80;
+
+  Options options;
+  options.write_buffer_size = 64 * 1024;  // small write buffer
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options = CurrentOptions(options);
+
+  auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024};
+  auto types = {/*compressed*/ 1, /*uncompressed*/ 0};
+  for (auto bsize : bsizes) {
+    for (auto type : types) {
+      BlockBasedTableOptions table_options;
+      table_options.persistent_cache.reset(
+          new MockPersistentCache(type, 10 * 1024));
+      table_options.no_block_cache = true;
+      table_options.block_cache = bsize ? NewLRUCache(bsize) : nullptr;
+      table_options.block_cache_compressed = nullptr;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"pikachu"}, options);
+      // default column family doesn't have block cache
+      Options no_block_cache_opts;
+      no_block_cache_opts.statistics = options.statistics;
+      no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+      BlockBasedTableOptions table_options_no_bc;
+      table_options_no_bc.no_block_cache = true;
+      no_block_cache_opts.table_factory.reset(
+          NewBlockBasedTableFactory(table_options_no_bc));
+      ReopenWithColumnFamilies(
+          {"default", "pikachu"},
+          std::vector<Options>({no_block_cache_opts, options}));
+
+      Random rnd(301);
+
+      // Write 8MB (80 values, each 100K)
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+      std::vector<std::string> values;
+      std::string str;
+      for (int i = 0; i < num_iter; i++) {
+        if (i % 4 == 0) {  // high compression ratio
+          str = rnd.RandomString(1000);
+        }
+        values.push_back(str);
+        ASSERT_OK(Put(1, Key(i), values[i]));
+      }
+
+      // flush all data from memtable so that reads are from block cache
+      ASSERT_OK(Flush(1));
+
+      for (int i = 0; i < num_iter; i++) {
+        ASSERT_EQ(Get(1, Key(i)), values[i]);
+      }
+
+      auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT);
+      auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS);
+
+      ASSERT_GT(hit, 0);
+      ASSERT_GT(miss, 0);
+    }
+  }
+}
+#endif  // !defined OS_SOLARIS
+
+namespace {
+void CountSyncPoint() {
+  TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */);
+}
+}  // anonymous namespace
+
+TEST_F(DBTest2, SyncPointMarker) {
+  std::atomic<int> sync_point_called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTest2::MarkedPoint",
+      [&](void* /*arg*/) { sync_point_called.fetch_add(1); });
+
+  // The first dependency enforces Marker can be loaded before MarkedPoint.
+  // The second checks that thread 1's MarkedPoint should be disabled here.
+  // Execution order:
+  // |   Thread 1    |  Thread 2   |
+  // |               |   Marker    |
+  // |  MarkedPoint  |             |
+  // | Thread1First  |             |
+  // |               | MarkedPoint |
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
+      {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
+      {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::function<void()> func1 = [&]() {
+    CountSyncPoint();
+    TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
+  };
+
+  std::function<void()> func2 = [&]() {
+    TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
+    CountSyncPoint();
+  };
+
+  auto thread1 = port::Thread(func1);
+  auto thread2 = port::Thread(func2);
+  thread1.join();
+  thread2.join();
+
+  // Callback is only executed once
+  ASSERT_EQ(sync_point_called.load(), 1);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
+size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
+  std::string buffer;
+
+  PutVarint32(&buffer, static_cast<uint32_t>(0));
+  PutVarint32(&buffer, static_cast<uint32_t>(key_size));
+  PutVarint32(&buffer, static_cast<uint32_t>(value_size));
+
+  return buffer.size() + key_size + value_size;
+}
+
+TEST_F(DBTest2, ReadAmpBitmap) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions bbto;
+  uint32_t bytes_per_bit[2] = {1, 16};
+  for (size_t k = 0; k < 2; k++) {
+    // Disable delta encoding to make it easier to calculate read amplification
+    bbto.use_delta_encoding = false;
+    // Huge block cache to make it easier to calculate read amplification
+    bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
+    bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    DestroyAndReopen(options);
+
+    const size_t kNumEntries = 10000;
+
+    Random rnd(301);
+    for (size_t i = 0; i < kNumEntries; i++) {
+      ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(100)));
+    }
+    ASSERT_OK(Flush());
+
+    Close();
+    Reopen(options);
+
+    // Read keys/values randomly and verify that reported read amp error
+    // is less than 2%
+    uint64_t total_useful_bytes = 0;
+    std::set<int> read_keys;
+    std::string value;
+    for (size_t i = 0; i < kNumEntries * 5; i++) {
+      int key_idx = rnd.Next() % kNumEntries;
+      std::string key = Key(key_idx);
+      ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+      if (read_keys.find(key_idx) == read_keys.end()) {
+        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+        total_useful_bytes +=
+            GetEncodedEntrySize(internal_key.size(), value.size());
+        read_keys.insert(key_idx);
+      }
+
+      double expected_read_amp =
+          static_cast<double>(total_useful_bytes) /
+          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+      double read_amp =
+          static_cast<double>(options.statistics->getTickerCount(
+              READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+      double error_pct = fabs(expected_read_amp - read_amp) * 100;
+      // Error between reported read amp and real read amp should be less than
+      // 2%
+      EXPECT_LE(error_pct, 2);
+    }
+
+    // Make sure we read every thing in the DB (which is smaller than our cache)
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
+    }
+    ASSERT_OK(iter->status());
+    delete iter;
+
+    // Read amp is on average 100% since we read all what we loaded in memory
+    if (k == 0) {
+      ASSERT_EQ(
+          options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
+          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
+    } else {
+      ASSERT_NEAR(
+          options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
+              1.0f /
+              options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
+          1, .01);
+    }
+  }
+}
+
+#ifndef OS_SOLARIS  // GetUniqueIdFromFile is not implemented
+TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
+  {
+    const int kIdBufLen = 100;
+    char id_buf[kIdBufLen];
+    Status s = Status::NotSupported();
+#ifndef OS_WIN
+    // You can't open a directory on windows using random access file
+    std::unique_ptr<RandomAccessFile> file;
+    s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions());
+    if (s.ok()) {
+      if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
+        // fs holding db directory doesn't support getting a unique file id,
+        // this means that running this test will fail because lru_cache will
+        // load the blocks again regardless of them being already in the cache
+        return;
+      }
+    }
+#endif
+    if (!s.ok()) {
+      std::unique_ptr<Directory> dir;
+      ASSERT_OK(env_->NewDirectory(dbname_, &dir));
+      if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
+        // fs holding db directory doesn't support getting a unique file id,
+        // this means that running this test will fail because lru_cache will
+        // load the blocks again regardless of them being already in the cache
+        return;
+      }
+    }
+  }
+  uint32_t bytes_per_bit[2] = {1, 16};
+  for (size_t k = 0; k < 2; k++) {
+    std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
+    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    Options options = CurrentOptions();
+    BlockBasedTableOptions bbto;
+    // Disable delta encoding to make it easier to calculate read amplification
+    bbto.use_delta_encoding = false;
+    // Huge block cache to make it easier to calculate read amplification
+    bbto.block_cache = lru_cache;
+    bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    options.statistics = stats;
+    DestroyAndReopen(options);
+
+    const int kNumEntries = 10000;
+
+    Random rnd(301);
+    for (int i = 0; i < kNumEntries; i++) {
+      ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+    }
+    ASSERT_OK(Flush());
+
+    Close();
+    Reopen(options);
+
+    std::set<int> read_keys;
+    std::string value;
+    // Iter1: Read half the DB, Read even keys
+    // Key(0), Key(2), Key(4), Key(6), Key(8), ...
+    for (int i = 0; i < kNumEntries; i += 2) {
+      std::string key = Key(i);
+      ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+      if (read_keys.find(i) == read_keys.end()) {
+        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+        read_keys.insert(i);
+      }
+    }
+
+    size_t total_useful_bytes_iter1 =
+        options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+    size_t total_loaded_bytes_iter1 =
+        options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+    Close();
+    std::shared_ptr<Statistics> new_statistics =
+        ROCKSDB_NAMESPACE::CreateDBStatistics();
+    // Destroy old statistics obj that the blocks in lru_cache are pointing to
+    options.statistics.reset();
+    // Use the statistics object that we just created
+    options.statistics = new_statistics;
+    Reopen(options);
+
+    // Iter2: Read half the DB, Read odd keys
+    // Key(1), Key(3), Key(5), Key(7), Key(9), ...
+    for (int i = 1; i < kNumEntries; i += 2) {
+      std::string key = Key(i);
+      ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+      if (read_keys.find(i) == read_keys.end()) {
+        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+        read_keys.insert(i);
+      }
+    }
+
+    size_t total_useful_bytes_iter2 =
+        options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+    size_t total_loaded_bytes_iter2 =
+        options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+    // Read amp is on average 100% since we read all what we loaded in memory
+    if (k == 0) {
+      ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
+                total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
+    } else {
+      ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) * 1.0f /
+                      (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
+                  1, .01);
+    }
+  }
+}
+#endif  // !OS_SOLARIS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.IncreaseParallelism(20);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  ASSERT_OK(Put(Key(5), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(10), "a"));
+  ASSERT_OK(Put(Key(15), "a"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  auto get_stat = [](std::string level_str, LevelStatType type,
+                     std::map<std::string, std::string> props) {
+    auto prop_str =
+        "compaction." + level_str + "." +
+        InternalStats::compaction_level_stats.at(type).property_name.c_str();
+    auto prop_item = props.find(prop_str);
+    return prop_item == props.end() ? 0 : std::stod(prop_item->second);
+  };
+
+  // Trivial move 2 files to L2
+  ASSERT_EQ("0,0,2", FilesPerLevel());
+  // Also test that the stats GetMapProperty API reporting the same result
+  {
+    std::map<std::string, std::string> prop;
+    ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+    ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
+    ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
+    ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
+    ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
+  }
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L2, these 2 files will be moved to L2 and overlap with
+  // the running compaction and break the LSM consistency.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        ASSERT_OK(
+            dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+                                  {"max_bytes_for_level_base", "1"}}));
+        ASSERT_OK(Put(Key(6), "a"));
+        ASSERT_OK(Put(Key(7), "a"));
+        ASSERT_OK(Flush());
+
+        ASSERT_OK(Put(Key(8), "a"));
+        ASSERT_OK(Put(Key(9), "a"));
+        ASSERT_OK(Flush());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Run a manual compaction that will compact the 2 files in L2
+  // into 1 file in L2
+  cro.exclusive_manual_compaction = false;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Test that the stats GetMapProperty API reporting 1 file in L2
+  {
+    std::map<std::string, std::string> prop;
+    ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+    ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
+  }
+}
+
+TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;
+  options.IncreaseParallelism(20);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  ASSERT_OK(Put(Key(5), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(10), "a"));
+  ASSERT_OK(Put(Key(15), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Trivial move 2 files to L1
+  ASSERT_EQ("0,2", FilesPerLevel());
+
+  std::function<void()> bg_manual_compact = [&]() {
+    std::string k1 = Key(6);
+    std::string k2 = Key(9);
+    Slice k1s(k1);
+    Slice k2s(k2);
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = false;
+    ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
+  };
+  ROCKSDB_NAMESPACE::port::Thread bg_thread;
+
+  // While the compaction is running, we will create 2 new files that
+  // can fit in L1, these 2 files will be moved to L1 and overlap with
+  // the running compaction and break the LSM consistency.
+  std::atomic<bool> flag(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        if (flag.exchange(true)) {
+          // We want to make sure to call this callback only once
+          return;
+        }
+        ASSERT_OK(Put(Key(6), "a"));
+        ASSERT_OK(Put(Key(7), "a"));
+        ASSERT_OK(Flush());
+
+        ASSERT_OK(Put(Key(8), "a"));
+        ASSERT_OK(Put(Key(9), "a"));
+        ASSERT_OK(Flush());
+
+        // Start a non-exclusive manual compaction in a bg thread
+        bg_thread = port::Thread(bg_manual_compact);
+        // This manual compaction conflict with the other manual compaction
+        // so it should wait until the first compaction finish
+        env_->SleepForMicroseconds(1000000);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Run a manual compaction that will compact the 2 files in L1
+  // into 1 file in L1
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = false;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  bg_thread.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction1) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  // Generate a file containing 10 keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+
+  // Generate another file containing same keys
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+
+  int manual_compactions_paused = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
+        auto canceled = static_cast<std::atomic<bool>*>(arg);
+        // CompactRange triggers manual compaction and cancel the compaction
+        // by set *canceled as true
+        if (canceled != nullptr) {
+          canceled->store(true, std::memory_order_release);
+        }
+        manual_compactions_paused += 1;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
+        auto paused = static_cast<std::atomic<int>*>(arg);
+        // CompactFiles() relies on manual_compactions_paused to
+        // determine if thie compaction should be paused or not
+        ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+        paused->fetch_add(1, std::memory_order_release);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<std::string> files_before_compact, files_after_compact;
+  // Remember file name before compaction is triggered
+  std::vector<LiveFileMetaData> files_meta;
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  for (auto file : files_meta) {
+    files_before_compact.push_back(file.name);
+  }
+
+  // OK, now trigger a manual compaction
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+                  .IsManualCompactionPaused());
+
+  // Wait for compactions to get scheduled and stopped
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // Get file names after compaction is stopped
+  files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  for (auto file : files_meta) {
+    files_after_compact.push_back(file.name);
+  }
+
+  // Like nothing happened
+  ASSERT_EQ(files_before_compact, files_after_compact);
+  ASSERT_EQ(manual_compactions_paused, 1);
+
+  manual_compactions_paused = 0;
+  // Now make sure CompactFiles also not run
+  ASSERT_TRUE(dbfull()
+                  ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+                                 files_before_compact, 0)
+                  .IsManualCompactionPaused());
+  // Wait for manual compaction to get scheduled and finish
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  files_meta.clear();
+  files_after_compact.clear();
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  for (auto file : files_meta) {
+    files_after_compact.push_back(file.name);
+  }
+
+  ASSERT_EQ(files_before_compact, files_after_compact);
+  // CompactFiles returns at entry point
+  ASSERT_EQ(manual_compactions_paused, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// PausingManualCompaction does not affect auto compaction
+TEST_F(DBTest2, PausingManualCompaction2) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.disable_auto_compactions = false;
+
+  DestroyAndReopen(options);
+  dbfull()->DisableManualCompaction();
+
+  Random rnd(301);
+  for (int i = 0; i < 2; i++) {
+    // Generate a file containing 100 keys.
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(50)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  std::vector<LiveFileMetaData> files_meta;
+  dbfull()->GetLiveFilesMetaData(&files_meta);
+  ASSERT_EQ(files_meta.size(), 1);
+}
+
+TEST_F(DBTest2, PausingManualCompaction3) {
+  CompactRangeOptions compact_options;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+        }
+        ASSERT_OK(Flush());
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  int run_manual_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1",
+      [&](void* /*arg*/) { run_manual_compactions++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  dbfull()->DisableManualCompaction();
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  // As manual compaction disabled, not even reach sync point
+  ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1");
+  dbfull()->EnableManualCompaction();
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction4) {
+  CompactRangeOptions compact_options;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+        }
+        ASSERT_OK(Flush());
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  int run_manual_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
+        auto canceled = static_cast<std::atomic<bool>*>(arg);
+        // CompactRange triggers manual compaction and cancel the compaction
+        // by set *canceled as true
+        if (canceled != nullptr) {
+          canceled->store(true, std::memory_order_release);
+        }
+        run_manual_compactions++;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
+        auto paused = static_cast<std::atomic<int>*>(arg);
+        // CompactFiles() relies on manual_compactions_paused to
+        // determine if thie compaction should be paused or not
+        ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+        paused->fetch_add(1, std::memory_order_release);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_EQ(run_manual_compactions, 1);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:2");
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CancelManualCompaction1) {
+  CompactRangeOptions compact_options;
+  auto canceledPtr =
+      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+  compact_options.canceled = canceledPtr.get();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+        }
+        ASSERT_OK(Flush());
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  int run_manual_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1",
+      [&](void* /*arg*/) { run_manual_compactions++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Setup a callback to disable compactions after a couple of levels are
+  // compacted
+  int compactions_run = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RunManualCompaction()::1",
+      [&](void* /*arg*/) { ++compactions_run; });
+
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // Since compactions are disabled, we shouldn't start compacting.
+  // E.g. we should call the compaction function exactly one time.
+  ASSERT_EQ(compactions_run, 0);
+  ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  compactions_run = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "DBImpl::RunManualCompaction()::1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
+        ++compactions_run;
+        // After 3 compactions disable
+        if (compactions_run == 3) {
+          compact_options.canceled->store(true, std::memory_order_release);
+        }
+      });
+
+  compact_options.canceled->store(false, std::memory_order_release);
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_EQ(compactions_run, 3);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "DBImpl::RunManualCompaction()::1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1");
+
+  // Compactions should work again if we re-enable them..
+  compact_options.canceled->store(false, std::memory_order_relaxed);
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CancelManualCompaction2) {
+  CompactRangeOptions compact_options;
+  auto canceledPtr =
+      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+  compact_options.canceled = canceledPtr.get();
+  compact_options.max_subcompactions = 1;
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  Random rnd(301);
+  auto generate_files = [&]() {
+    for (int i = 0; i < options.num_levels; i++) {
+      for (int j = 0; j < options.num_levels - i + 1; j++) {
+        for (int k = 0; k < 1000; k++) {
+          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+        }
+        ASSERT_OK(Flush());
+      }
+
+      for (int l = 1; l < options.num_levels - i; l++) {
+        MoveFilesToLevel(l);
+      }
+    }
+  };
+
+  DestroyAndReopen(options);
+  generate_files();
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  int compactions_run = 0;
+  std::atomic<int> kv_compactions{0};
+  int compactions_stopped_at = 0;
+  int kv_compactions_stopped_at = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
+        ++compactions_run;
+        // After 3 compactions disable
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
+        int kv_compactions_run =
+            kv_compactions.fetch_add(1, std::memory_order_release);
+        if (kv_compactions_run == 5) {
+          compact_options.canceled->store(true, std::memory_order_release);
+          kv_compactions_stopped_at = kv_compactions_run;
+          compactions_stopped_at = compactions_run;
+        }
+      });
+
+  compact_options.canceled->store(false, std::memory_order_release);
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // NOTE: as we set compact_options.max_subcompacitons = 1, and store true to
+  // the canceled variable from the single compacting thread (via callback),
+  // this value is deterministically kv_compactions_stopped_at + 1.
+  ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1);
+  ASSERT_EQ(compactions_run, compactions_stopped_at);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionIterator::ProcessKV");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "DBImpl::RunManualCompaction()::1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionJob::Run():PausingManualCompaction:1");
+
+  // Compactions should work again if we re-enable them..
+  compact_options.canceled->store(false, std::memory_order_relaxed);
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class CancelCompactionListener : public EventListener {
+ public:
+  CancelCompactionListener()
+      : num_compaction_started_(0), num_compaction_ended_(0) {}
+
+  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    num_compaction_started_++;
+  }
+
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.cf_name, "default");
+    ASSERT_EQ(ci.base_input_level, 0);
+    ASSERT_EQ(ci.status.code(), code_);
+    ASSERT_EQ(ci.status.subcode(), subcode_);
+    num_compaction_ended_++;
+  }
+
+  std::atomic<size_t> num_compaction_started_;
+  std::atomic<size_t> num_compaction_ended_;
+  Status::Code code_;
+  Status::SubCode subcode_;
+};
+
+TEST_F(DBTest2, CancelManualCompactionWithListener) {
+  CompactRangeOptions compact_options;
+  auto canceledPtr =
+      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+  compact_options.canceled = canceledPtr.get();
+  compact_options.max_subcompactions = 1;
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  CancelCompactionListener* listener = new CancelCompactionListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
+        compact_options.canceled->store(true, std::memory_order_release);
+      });
+
+  int running_compaction = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::FinishCompactionOutputFile1",
+      [&](void* /*arg*/) { running_compaction++; });
+
+  // Case I: 1 Notify begin compaction, 2 Set *canceled as true to disable
+  // manual compaction in the callback function, 3 Compaction not run,
+  // 4 Notify compaction end.
+  listener->code_ = Status::kIncomplete;
+  listener->subcode_ = Status::SubCode::kManualCompactionPaused;
+
+  compact_options.canceled->store(false, std::memory_order_release);
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_GT(listener->num_compaction_started_, 0);
+  ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+  ASSERT_EQ(running_compaction, 0);
+
+  listener->num_compaction_started_ = 0;
+  listener->num_compaction_ended_ = 0;
+
+  // Case II: 1 Set *canceled as true in the callback function to disable manual
+  // compaction, 2 Notify begin compaction (return without notifying), 3 Notify
+  // compaction end (return without notifying).
+  ASSERT_TRUE(dbfull()
+                  ->CompactRange(compact_options, nullptr, nullptr)
+                  .IsManualCompactionPaused());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_EQ(listener->num_compaction_started_, 0);
+  ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+  ASSERT_EQ(running_compaction, 0);
+
+  // Case III: 1 Notify begin compaction, 2 Compaction in between
+  // 3. Set *canceled as true in the callback function to disable manual
+  // compaction, 4 Notify compaction end.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "CompactionIterator:ProcessKV");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) {
+        compact_options.canceled->store(true, std::memory_order_release);
+      });
+
+  listener->code_ = Status::kOk;
+  listener->subcode_ = Status::SubCode::kNone;
+
+  compact_options.canceled->store(false, std::memory_order_release);
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  ASSERT_GT(listener->num_compaction_started_, 0);
+  ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+
+  // Compaction job will succeed.
+  ASSERT_GT(running_compaction, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) {
+  int num_levels = 3;
+  const int kNumFilesTrigger = 4;
+
+  Options options = CurrentOptions();
+  env_->SetBackgroundThreads(0, Env::Priority::HIGH);
+  env_->SetBackgroundThreads(0, Env::Priority::LOW);
+  env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+  options.env = env_;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+
+  CancelCompactionListener* listener = new CancelCompactionListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  int num_bottom_thread_compaction_scheduled = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+      [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; });
+
+  int num_compaction_jobs = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():End",
+      [&](void* /*arg*/) { num_compaction_jobs++; });
+
+  listener->code_ = Status::kOk;
+  listener->subcode_ = Status::SubCode::kNone;
+
+  Random rnd(301);
+  for (int i = 0; i < 1; ++i) {
+    for (int num = 0; num < kNumFilesTrigger; num++) {
+      int key_idx = 0;
+      GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+      // use no_wait above because that one waits for flush and compaction. We
+      // don't want to wait for compaction because the full compaction is
+      // intentionally blocked while more files are flushed.
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_GT(num_bottom_thread_compaction_scheduled, 0);
+  ASSERT_EQ(num_compaction_jobs, 1);
+  ASSERT_GT(listener->num_compaction_started_, 0);
+  ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, OptimizeForPointLookup) {
+  Options options = CurrentOptions();
+  Close();
+  options.OptimizeForPointLookup(2);
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("v1", Get("foo"));
+}
+
+TEST_F(DBTest2, OptimizeForSmallDB) {
+  Options options = CurrentOptions();
+  Close();
+  options.OptimizeForSmallDb();
+
+  // Find the cache object
+  ASSERT_TRUE(options.table_factory->IsInstanceOf(
+      TableFactory::kBlockBasedTableName()));
+  auto table_options =
+      options.table_factory->GetOptions<BlockBasedTableOptions>();
+
+  ASSERT_TRUE(table_options != nullptr);
+  std::shared_ptr<Cache> cache = table_options->block_cache;
+
+  ASSERT_EQ(0, cache->GetUsage());
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+  ASSERT_OK(Put("foo", "v1"));
+
+  // memtable size is costed to the block cache
+  ASSERT_NE(0, cache->GetUsage());
+
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_OK(Flush());
+
+  size_t prev_size = cache->GetUsage();
+  // Remember block cache size, so that we can find that
+  // it is filled after Get().
+  // Use pinnable slice so that it can ping the block so that
+  // when we check the size it is not evicted.
+  PinnableSlice value;
+  ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
+  ASSERT_GT(cache->GetUsage(), prev_size);
+  value.Reset();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, IterRaceFlush1) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"},
+       {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::IterRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::IterRaceFlush:2");
+  });
+
+  // iterator is created after the first Put(), and its snapshot sequence is
+  // assigned after second Put(), so it must see v2.
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->Seek("foo");
+    ASSERT_TRUE(it->Valid());
+    ASSERT_OK(it->status());
+    ASSERT_EQ("foo", it->key().ToString());
+    ASSERT_EQ("v2", it->value().ToString());
+  }
+
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, IterRaceFlush2) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"},
+       {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2");
+  });
+
+  // iterator is created after the first Put(), and its snapshot sequence is
+  // assigned before second Put(), thus it must see v1.
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->Seek("foo");
+    ASSERT_TRUE(it->Valid());
+    ASSERT_OK(it->status());
+    ASSERT_EQ("foo", it->key().ToString());
+    ASSERT_EQ("v1", it->value().ToString());
+  }
+
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, IterRefreshRaceFlush) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"},
+       {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2");
+  });
+
+  // iterator is refreshed after the first Put(), and its sequence number is
+  // assigned after second Put(), thus it must see v2.
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    ASSERT_OK(it->status());
+    ASSERT_OK(it->Refresh());
+    it->Seek("foo");
+    ASSERT_TRUE(it->Valid());
+    ASSERT_OK(it->status());
+    ASSERT_EQ("foo", it->key().ToString());
+    ASSERT_EQ("v2", it->value().ToString());
+  }
+
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush1) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
+       {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+  });
+
+  // Get() is issued after the first Put(), so it should see either
+  // "v1" or "v2".
+  ASSERT_NE("NOT_FOUND", Get("foo"));
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush2) {
+  ASSERT_OK(Put("foo", "v1"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
+       {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread t1([&] {
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_OK(Flush());
+    TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+  });
+
+  // Get() is issued after the first Put(), so it should see either
+  // "v1" or "v2".
+  ASSERT_NE("NOT_FOUND", Get("foo"));
+  t1.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, DirectIO) {
+  if (!IsDirectIOSupported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+      true;
+  options.allow_mmap_reads = options.allow_mmap_writes = false;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "a"));
+  ASSERT_OK(Put(Key(5), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(10), "a"));
+  ASSERT_OK(Put(Key(15), "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  Reopen(options);
+}
+
+TEST_F(DBTest2, MemtableOnlyIterator) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "foo", "first"));
+  ASSERT_OK(Put(1, "bar", "second"));
+
+  ReadOptions ropt;
+  ropt.read_tier = kMemtableTier;
+  std::string value;
+  Iterator* it = nullptr;
+
+  // Before flushing
+  // point lookups
+  ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+  ASSERT_EQ("first", value);
+  ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+  ASSERT_EQ("second", value);
+
+  // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet.
+  it = db_->NewIterator(ropt, handles_[1]);
+  int count = 0;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ASSERT_TRUE(it->Valid());
+    count++;
+  }
+  ASSERT_TRUE(!it->Valid());
+  ASSERT_EQ(2, count);
+  delete it;
+
+  Flush(1);
+
+  // After flushing
+  // point lookups
+  ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+  ASSERT_EQ("first", value);
+  ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+  ASSERT_EQ("second", value);
+  // nothing should be returned using memtable-only iterator after flushing.
+  it = db_->NewIterator(ropt, handles_[1]);
+  ASSERT_OK(it->status());
+  count = 0;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ASSERT_TRUE(it->Valid());
+    count++;
+  }
+  ASSERT_TRUE(!it->Valid());
+  ASSERT_EQ(0, count);
+  ASSERT_OK(it->status());
+  delete it;
+
+  // Add a key to memtable
+  ASSERT_OK(Put(1, "foobar", "third"));
+  it = db_->NewIterator(ropt, handles_[1]);
+  ASSERT_OK(it->status());
+  count = 0;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ASSERT_TRUE(it->Valid());
+    ASSERT_EQ("foobar", it->key().ToString());
+    ASSERT_EQ("third", it->value().ToString());
+    count++;
+  }
+  ASSERT_TRUE(!it->Valid());
+  ASSERT_EQ(1, count);
+  ASSERT_OK(it->status());
+  delete it;
+}
+
+TEST_F(DBTest2, LowPriWrite) {
+  Options options = CurrentOptions();
+  // Compaction pressure should trigger since 6 files
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 12;
+  options.level0_stop_writes_trigger = 30;
+  options.delayed_write_rate = 8 * 1024 * 1024;
+  Reopen(options);
+
+  std::atomic<int> rate_limit_count(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Request:1", [&](void* arg) {
+        rate_limit_count.fetch_add(1);
+        int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
+        ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
+      });
+  // Block compaction
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  WriteOptions wo;
+  for (int i = 0; i < 6; i++) {
+    wo.low_pri = false;
+    ASSERT_OK(Put("", "", wo));
+    wo.low_pri = true;
+    ASSERT_OK(Put("", "", wo));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(0, rate_limit_count.load());
+  wo.low_pri = true;
+  ASSERT_OK(Put("", "", wo));
+  ASSERT_EQ(1, rate_limit_count.load());
+  wo.low_pri = false;
+  ASSERT_OK(Put("", "", wo));
+  ASSERT_EQ(1, rate_limit_count.load());
+
+  TEST_SYNC_POINT("DBTest.LowPriWrite:0");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  wo.low_pri = true;
+  ASSERT_OK(Put("", "", wo));
+  ASSERT_EQ(1, rate_limit_count.load());
+  wo.low_pri = false;
+  ASSERT_OK(Put("", "", wo));
+  ASSERT_EQ(1, rate_limit_count.load());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RateLimitedCompactionReads) {
+  // compaction input has 512KB data
+  const int kNumKeysPerFile = 128;
+  const int kBytesPerKey = 1024;
+  const int kNumL0Files = 4;
+
+  for (int compaction_readahead_size : {0, 32 << 10}) {
+    for (auto use_direct_io : {false, true}) {
+      if (use_direct_io && !IsDirectIOSupported()) {
+        continue;
+      }
+      Options options = CurrentOptions();
+      options.compaction_readahead_size = compaction_readahead_size;
+      options.compression = kNoCompression;
+      options.level0_file_num_compaction_trigger = kNumL0Files;
+      options.memtable_factory.reset(
+          test::NewSpecialSkipListFactory(kNumKeysPerFile));
+      // takes roughly one second, split into 100 x 10ms intervals. Each
+      // interval permits 5.12KB, which is smaller than the block size, so this
+      // test exercises the code for chunking reads.
+      options.rate_limiter.reset(NewGenericRateLimiter(
+          static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
+                               kBytesPerKey) /* rate_bytes_per_sec */,
+          10 * 1000 /* refill_period_us */, 10 /* fairness */,
+          RateLimiter::Mode::kReadsOnly));
+      options.use_direct_reads =
+          options.use_direct_io_for_flush_and_compaction = use_direct_io;
+      BlockBasedTableOptions bbto;
+      bbto.block_size = 16384;
+      bbto.no_block_cache = true;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      DestroyAndReopen(options);
+
+      for (int i = 0; i < kNumL0Files; ++i) {
+        for (int j = 0; j <= kNumKeysPerFile; ++j) {
+          ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
+        }
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+        if (i + 1 < kNumL0Files) {
+          ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+        }
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+      // should be slightly above 512KB due to non-data blocks read. Arbitrarily
+      // chose 1MB as the upper bound on the total bytes read.
+      size_t rate_limited_bytes = static_cast<size_t>(
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL));
+      // The charges can exist for `IO_LOW` and `IO_USER` priorities.
+      size_t rate_limited_bytes_by_pri =
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
+      ASSERT_EQ(rate_limited_bytes,
+                static_cast<size_t>(rate_limited_bytes_by_pri));
+      // Include the explicit prefetch of the footer in direct I/O case.
+      size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
+      ASSERT_GE(
+          rate_limited_bytes,
+          static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
+      ASSERT_LT(
+          rate_limited_bytes,
+          static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
+                              direct_io_extra));
+
+      Iterator* iter = db_->NewIterator(ReadOptions());
+      ASSERT_OK(iter->status());
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
+      }
+      delete iter;
+      // bytes read for user iterator shouldn't count against the rate limit.
+      rate_limited_bytes_by_pri =
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
+          options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
+      ASSERT_EQ(rate_limited_bytes,
+                static_cast<size_t>(rate_limited_bytes_by_pri));
+    }
+  }
+}
+#endif  // ROCKSDB_LITE
+
+// Make sure DB can be reopen with reduced number of levels, given no file
+// is on levels higher than the new num_levels.
+TEST_F(DBTest2, ReduceLevel) {
+  Options options;
+  options.env = env_;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  Reopen(options);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 1;
+  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  options.num_levels = 3;
+  Reopen(options);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+}
+
+// Test that ReadCallback is actually used in both memtbale and sst tables
+TEST_F(DBTest2, ReadCallbackTest) {
+  Options options;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  options.env = env_;
+  Reopen(options);
+  std::vector<const Snapshot*> snapshots;
+  // Try to create a db with multiple layers and a memtable
+  const std::string key = "foo";
+  const std::string value = "bar";
+  // This test assumes that the seq start with 1 and increased by 1 after each
+  // write batch of size 1. If that behavior changes, the test needs to be
+  // updated as well.
+  // TODO(myabandeh): update this test to use the seq number that is returned by
+  // the DB instead of assuming what seq the DB used.
+  int i = 1;
+  for (; i < 10; i++) {
+    ASSERT_OK(Put(key, value + std::to_string(i)));
+    // Take a snapshot to avoid the value being removed during compaction
+    auto snapshot = dbfull()->GetSnapshot();
+    snapshots.push_back(snapshot);
+  }
+  ASSERT_OK(Flush());
+  for (; i < 20; i++) {
+    ASSERT_OK(Put(key, value + std::to_string(i)));
+    // Take a snapshot to avoid the value being removed during compaction
+    auto snapshot = dbfull()->GetSnapshot();
+    snapshots.push_back(snapshot);
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  for (; i < 30; i++) {
+    ASSERT_OK(Put(key, value + std::to_string(i)));
+    auto snapshot = dbfull()->GetSnapshot();
+    snapshots.push_back(snapshot);
+  }
+  ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel());
+#endif  // !ROCKSDB_LITE
+  // And also add some values to the memtable
+  for (; i < 40; i++) {
+    ASSERT_OK(Put(key, value + std::to_string(i)));
+    auto snapshot = dbfull()->GetSnapshot();
+    snapshots.push_back(snapshot);
+  }
+
+  class TestReadCallback : public ReadCallback {
+   public:
+    explicit TestReadCallback(SequenceNumber snapshot)
+        : ReadCallback(snapshot), snapshot_(snapshot) {}
+    bool IsVisibleFullCheck(SequenceNumber seq) override {
+      return seq <= snapshot_;
+    }
+
+   private:
+    SequenceNumber snapshot_;
+  };
+
+  for (int seq = 1; seq < i; seq++) {
+    PinnableSlice pinnable_val;
+    ReadOptions roptions;
+    TestReadCallback callback(seq);
+    bool dont_care = true;
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = dbfull()->DefaultColumnFamily();
+    get_impl_options.value = &pinnable_val;
+    get_impl_options.value_found = &dont_care;
+    get_impl_options.callback = &callback;
+    Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
+    ASSERT_TRUE(s.ok());
+    // Assuming that after each Put the DB increased seq by one, the value and
+    // seq number must be equal since we also inc value by 1 after each Put.
+    ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString());
+  }
+
+  for (auto snapshot : snapshots) {
+    dbfull()->ReleaseSnapshot(snapshot);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
+  // Regression test for race condition where an obsolete file is returned to
+  // user as a "live file" but then deleted, all while file deletions are
+  // disabled.
+  //
+  // It happened like this:
+  //
+  // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles
+  // 2. [user thread] DisableFileDeletions, GetSortedWalFiles are called and the
+  //    latter returned "x.log"
+  // 3. [flush thread] PurgeObsoleteFiles deleted "x.log"
+  // 4. [user thread] Reading "x.log" failed
+  //
+  // Unfortunately the only regression test I can come up with involves sleep.
+  // We cannot set SyncPoints to repro since, once the fix is applied, the
+  // SyncPoints would cause a deadlock as the repro's sequence of events is now
+  // prohibited.
+  //
+  // Instead, if we sleep for a second between Find and Purge, and ensure the
+  // read attempt happens after purge, then the sequence of events will almost
+  // certainly happen on the old code.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::BackgroundCallFlush:FilesFound",
+       "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
+      {"DBImpl::PurgeObsoleteFiles:End",
+       "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::PurgeObsoleteFiles:Begin",
+      [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("key", "val"));
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  db_->Flush(flush_opts);
+  TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");
+
+  ASSERT_OK(db_->DisableFileDeletions());
+  VectorLogPtr log_files;
+  ASSERT_OK(db_->GetSortedWalFiles(log_files));
+  TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
+  for (const auto& log_file : log_files) {
+    ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
+  }
+
+  ASSERT_OK(db_->EnableFileDeletions());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestNumPread) {
+  Options options = CurrentOptions();
+  bool prefetch_supported =
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+  // disable block cache
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  env_->count_random_reads_ = true;
+  env_->random_file_open_counter_.store(0);
+  ASSERT_OK(Put("bar", "foo"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  if (prefetch_supported) {
+    // After flush, we'll open the file and read footer, meta block,
+    // property block and index block.
+    ASSERT_EQ(4, env_->random_read_counter_.Read());
+  } else {
+    // With prefetch not supported, we will do a single read into a buffer
+    ASSERT_EQ(1, env_->random_read_counter_.Read());
+  }
+  ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per a normal data block read
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_EQ(1, env_->random_read_counter_.Read());
+  // All files are already opened.
+  ASSERT_EQ(0, env_->random_file_open_counter_.load());
+
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_OK(Put("bar2", "foo2"));
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Flush());
+  if (prefetch_supported) {
+    // After flush, we'll open the file and read footer, meta block,
+    // property block and index block.
+    ASSERT_EQ(4, env_->random_read_counter_.Read());
+  } else {
+    // With prefetch not supported, we will do a single read into a buffer
+    ASSERT_EQ(1, env_->random_read_counter_.Read());
+  }
+  ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  if (prefetch_supported) {
+    // Compaction needs two input blocks, which requires 2 preads, and
+    // generate a new SST file which needs 4 preads (footer, meta block,
+    // property block and index block). In total 6.
+    ASSERT_EQ(6, env_->random_read_counter_.Read());
+  } else {
+    // With prefetch off, compaction needs two input blocks,
+    // followed by a single buffered read.  In total 3.
+    ASSERT_EQ(3, env_->random_read_counter_.Read());
+  }
+  // All compaction input files should have already been opened.
+  ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per a normal data block read
+  env_->random_file_open_counter_.store(0);
+  env_->random_read_counter_.Reset();
+  ASSERT_EQ("foo2", Get("bar2"));
+  ASSERT_EQ(1, env_->random_read_counter_.Read());
+  // SST files are already opened.
+  ASSERT_EQ(0, env_->random_file_open_counter_.load());
+}
+
+class TraceExecutionResultHandler : public TraceRecordResult::Handler {
+ public:
+  TraceExecutionResultHandler() {}
+  ~TraceExecutionResultHandler() override {}
+
+  virtual Status Handle(const StatusOnlyTraceExecutionResult& result) override {
+    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+      return Status::InvalidArgument("Invalid timestamps.");
+    }
+    result.GetStatus().PermitUncheckedError();
+    switch (result.GetTraceType()) {
+      case kTraceWrite: {
+        total_latency_ += result.GetLatency();
+        cnt_++;
+        writes_++;
+        break;
+      }
+      default:
+        return Status::Corruption("Type mismatch.");
+    }
+    return Status::OK();
+  }
+
+  virtual Status Handle(
+      const SingleValueTraceExecutionResult& result) override {
+    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+      return Status::InvalidArgument("Invalid timestamps.");
+    }
+    result.GetStatus().PermitUncheckedError();
+    switch (result.GetTraceType()) {
+      case kTraceGet: {
+        total_latency_ += result.GetLatency();
+        cnt_++;
+        gets_++;
+        break;
+      }
+      default:
+        return Status::Corruption("Type mismatch.");
+    }
+    return Status::OK();
+  }
+
+  virtual Status Handle(
+      const MultiValuesTraceExecutionResult& result) override {
+    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+      return Status::InvalidArgument("Invalid timestamps.");
+    }
+    for (const Status& s : result.GetMultiStatus()) {
+      s.PermitUncheckedError();
+    }
+    switch (result.GetTraceType()) {
+      case kTraceMultiGet: {
+        total_latency_ += result.GetLatency();
+        cnt_++;
+        multigets_++;
+        break;
+      }
+      default:
+        return Status::Corruption("Type mismatch.");
+    }
+    return Status::OK();
+  }
+
+  virtual Status Handle(const IteratorTraceExecutionResult& result) override {
+    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+      return Status::InvalidArgument("Invalid timestamps.");
+    }
+    result.GetStatus().PermitUncheckedError();
+    switch (result.GetTraceType()) {
+      case kTraceIteratorSeek:
+      case kTraceIteratorSeekForPrev: {
+        total_latency_ += result.GetLatency();
+        cnt_++;
+        seeks_++;
+        break;
+      }
+      default:
+        return Status::Corruption("Type mismatch.");
+    }
+    return Status::OK();
+  }
+
+  void Reset() {
+    total_latency_ = 0;
+    cnt_ = 0;
+    writes_ = 0;
+    gets_ = 0;
+    seeks_ = 0;
+    multigets_ = 0;
+  }
+
+  double GetAvgLatency() const {
+    return cnt_ == 0 ? 0.0 : 1.0 * total_latency_ / cnt_;
+  }
+
+  int GetNumWrites() const { return writes_; }
+
+  int GetNumGets() const { return gets_; }
+
+  int GetNumIterSeeks() const { return seeks_; }
+
+  int GetNumMultiGets() const { return multigets_; }
+
+ private:
+  std::atomic<uint64_t> total_latency_{0};
+  std::atomic<uint32_t> cnt_{0};
+  std::atomic<int> writes_{0};
+  std::atomic<int> gets_{0};
+  std::atomic<int> seeks_{0};
+  std::atomic<int> multigets_{0};
+};
+
+TEST_F(DBTest2, TraceAndReplay) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  Iterator* single_iter = nullptr;
+
+  ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+  std::string trace_filename = dbname_ + "/rocksdb.trace";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+  // 5 Writes
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Merge(0, "b", "2"));
+  ASSERT_OK(Delete(0, "c"));
+  ASSERT_OK(SingleDelete(0, "d"));
+  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+  // 6th Write
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("f", "11"));
+  ASSERT_OK(batch.Merge("g", "12"));
+  ASSERT_OK(batch.Delete("h"));
+  ASSERT_OK(batch.SingleDelete("i"));
+  ASSERT_OK(batch.DeleteRange("j", "k"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  // 2 Seek(ForPrev)s
+  single_iter = db_->NewIterator(ro);
+  single_iter->Seek("f");  // Seek 1
+  single_iter->SeekForPrev("g");
+  ASSERT_OK(single_iter->status());
+  delete single_iter;
+
+  // 2 Gets
+  ASSERT_EQ("1", Get(0, "a"));
+  ASSERT_EQ("12", Get(0, "g"));
+
+  // 7th and 8th Write, 3rd Get
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "rocksdb", "rocks"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+  // Total Write x 8, Get x 3, Seek x 2.
+  ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file as it is after EndTrace.
+  ASSERT_OK(Put("hello", "world"));
+  ASSERT_OK(Merge("foo", "bar"));
+
+  // Open another db, replay, and verify the data
+  std::string value;
+  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  DBOptions db_opts;
+  db_opts.env = env_;
+  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  std::unique_ptr<Replayer> replayer;
+  ASSERT_OK(
+      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+
+  TraceExecutionResultHandler res_handler;
+  std::function<void(Status, std::unique_ptr<TraceRecordResult> &&)> res_cb =
+      [&res_handler](Status exec_s, std::unique_ptr<TraceRecordResult>&& res) {
+        ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported());
+        if (res != nullptr) {
+          ASSERT_OK(res->Accept(&res_handler));
+          res.reset();
+        }
+      };
+
+  // Unprepared replay should fail with Status::Incomplete()
+  ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
+  ASSERT_OK(replayer->Prepare());
+  // Ok to repeatedly Prepare().
+  ASSERT_OK(replayer->Prepare());
+  // Replay using 1 thread, 1x speed.
+  ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb));
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 8);
+  ASSERT_EQ(res_handler.GetNumGets(), 3);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+  ASSERT_EQ("1", value);
+  ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+  ASSERT_EQ("12", value);
+  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+  ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+  ASSERT_EQ("bar", value);
+  ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+  ASSERT_EQ("rocks", value);
+
+  // Re-replay should fail with Status::Incomplete() if Prepare() was not
+  // called. Currently we don't distinguish between unprepared and trace end.
+  ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
+
+  // Re-replay using 2 threads, 2x speed.
+  ASSERT_OK(replayer->Prepare());
+  ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb));
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 8);
+  ASSERT_EQ(res_handler.GetNumGets(), 3);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  // Re-replay using 2 threads, 1/2 speed.
+  ASSERT_OK(replayer->Prepare());
+  ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb));
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 8);
+  ASSERT_EQ(res_handler.GetNumGets(), 3);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  replayer.reset();
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceAndManualReplay) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  Iterator* single_iter = nullptr;
+
+  ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+  std::string trace_filename = dbname_ + "/rocksdb.trace";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Merge(0, "b", "2"));
+  ASSERT_OK(Delete(0, "c"));
+  ASSERT_OK(SingleDelete(0, "d"));
+  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("f", "11"));
+  ASSERT_OK(batch.Merge("g", "12"));
+  ASSERT_OK(batch.Delete("h"));
+  ASSERT_OK(batch.SingleDelete("i"));
+  ASSERT_OK(batch.DeleteRange("j", "k"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  single_iter = db_->NewIterator(ro);
+  single_iter->Seek("f");
+  single_iter->SeekForPrev("g");
+  ASSERT_OK(single_iter->status());
+  delete single_iter;
+
+  // Write some sequenced keys for testing lower/upper bounds of iterator.
+  batch.Clear();
+  ASSERT_OK(batch.Put("iter-0", "iter-0"));
+  ASSERT_OK(batch.Put("iter-1", "iter-1"));
+  ASSERT_OK(batch.Put("iter-2", "iter-2"));
+  ASSERT_OK(batch.Put("iter-3", "iter-3"));
+  ASSERT_OK(batch.Put("iter-4", "iter-4"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  ReadOptions bounded_ro = ro;
+  Slice lower_bound("iter-1");
+  Slice upper_bound("iter-3");
+  bounded_ro.iterate_lower_bound = &lower_bound;
+  bounded_ro.iterate_upper_bound = &upper_bound;
+  single_iter = db_->NewIterator(bounded_ro);
+  single_iter->Seek("iter-0");
+  ASSERT_EQ(single_iter->key().ToString(), "iter-1");
+  single_iter->Seek("iter-2");
+  ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+  single_iter->Seek("iter-4");
+  ASSERT_FALSE(single_iter->Valid());
+  single_iter->SeekForPrev("iter-0");
+  ASSERT_FALSE(single_iter->Valid());
+  single_iter->SeekForPrev("iter-2");
+  ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+  single_iter->SeekForPrev("iter-4");
+  ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+  ASSERT_OK(single_iter->status());
+  delete single_iter;
+
+  ASSERT_EQ("1", Get(0, "a"));
+  ASSERT_EQ("12", Get(0, "g"));
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "rocksdb", "rocks"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+  // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2.
+  // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6
+  // Seek(ForPrev)s.
+  // Total Write x 9, Get x 3, Seek x 8
+  ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file as it is after EndTrace.
+  ASSERT_OK(Put("hello", "world"));
+  ASSERT_OK(Merge("foo", "bar"));
+
+  // Open another db, replay, and verify the data
+  std::string value;
+  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  DBOptions db_opts;
+  db_opts.env = env_;
+  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  std::unique_ptr<Replayer> replayer;
+  ASSERT_OK(
+      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+
+  TraceExecutionResultHandler res_handler;
+
+  // Manual replay for 2 times. The 2nd checks if the replay can restart.
+  std::unique_ptr<TraceRecord> record;
+  std::unique_ptr<TraceRecordResult> result;
+  for (int i = 0; i < 2; i++) {
+    // Next should fail if unprepared.
+    ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
+    ASSERT_OK(replayer->Prepare());
+    Status s = Status::OK();
+    // Looping until trace end.
+    while (s.ok()) {
+      s = replayer->Next(&record);
+      // Skip unsupported operations.
+      if (s.IsNotSupported()) {
+        continue;
+      }
+      if (s.ok()) {
+        ASSERT_OK(replayer->Execute(record, &result));
+        if (result != nullptr) {
+          ASSERT_OK(result->Accept(&res_handler));
+          if (record->GetTraceType() == kTraceIteratorSeek ||
+              record->GetTraceType() == kTraceIteratorSeekForPrev) {
+            IteratorSeekQueryTraceRecord* iter_rec =
+                dynamic_cast<IteratorSeekQueryTraceRecord*>(record.get());
+            IteratorTraceExecutionResult* iter_res =
+                dynamic_cast<IteratorTraceExecutionResult*>(result.get());
+            // Check if lower/upper bounds are correctly saved and decoded.
+            std::string lower_str = iter_rec->GetLowerBound().ToString();
+            std::string upper_str = iter_rec->GetUpperBound().ToString();
+            std::string iter_key = iter_res->GetKey().ToString();
+            std::string iter_value = iter_res->GetValue().ToString();
+            if (!lower_str.empty() && !upper_str.empty()) {
+              ASSERT_EQ(lower_str, "iter-1");
+              ASSERT_EQ(upper_str, "iter-3");
+              if (iter_res->GetValid()) {
+                // If iterator is valid, then lower_bound <= key < upper_bound.
+                ASSERT_GE(iter_key, lower_str);
+                ASSERT_LT(iter_key, upper_str);
+              } else {
+                // If iterator is invalid, then
+                //   key < lower_bound or key >= upper_bound.
+                ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str);
+              }
+            }
+            // If iterator is invalid, the key and value should be empty.
+            if (!iter_res->GetValid()) {
+              ASSERT_TRUE(iter_key.empty());
+              ASSERT_TRUE(iter_value.empty());
+            }
+          }
+          result.reset();
+        }
+      }
+    }
+    // Status::Incomplete() will be returned when manually reading the trace
+    // end, or Prepare() was not called.
+    ASSERT_TRUE(s.IsIncomplete());
+    ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
+    ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+    ASSERT_EQ(res_handler.GetNumWrites(), 9);
+    ASSERT_EQ(res_handler.GetNumGets(), 3);
+    ASSERT_EQ(res_handler.GetNumIterSeeks(), 8);
+    ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+    res_handler.Reset();
+  }
+
+  ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+  ASSERT_EQ("1", value);
+  ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+  ASSERT_EQ("12", value);
+  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+  ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+  ASSERT_EQ("bar", value);
+  ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+  ASSERT_EQ("rocks", value);
+
+  // Test execution of artificially created TraceRecords.
+  uint64_t fake_ts = 1U;
+  // Write
+  batch.Clear();
+  ASSERT_OK(batch.Put("trace-record-write1", "write1"));
+  ASSERT_OK(batch.Put("trace-record-write2", "write2"));
+  record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // Write x 1
+  ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value));
+  ASSERT_EQ("write1", value);
+  ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value));
+  ASSERT_EQ("write2", value);
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 1);
+  ASSERT_EQ(res_handler.GetNumGets(), 0);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  // Get related
+  // Get an existing key.
+  record.reset(new GetQueryTraceRecord(handles[0]->GetID(),
+                                       "trace-record-write1", fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // Get x 1
+  // Get an non-existing key, should still return Status::OK().
+  record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get",
+                                       fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // Get x 2
+  // Get from an invalid (non-existing) cf_id.
+  uint32_t invalid_cf_id = handles[1]->GetID() + 1;
+  record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++));
+  ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+  ASSERT_TRUE(result == nullptr);
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 0);
+  ASSERT_EQ(res_handler.GetNumGets(), 2);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  // Iteration related
+  for (IteratorSeekQueryTraceRecord::SeekType seekType :
+       {IteratorSeekQueryTraceRecord::kSeek,
+        IteratorSeekQueryTraceRecord::kSeekForPrev}) {
+    // Seek to an existing key.
+    record.reset(new IteratorSeekQueryTraceRecord(
+        seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++));
+    ASSERT_OK(replayer->Execute(record, &result));
+    ASSERT_TRUE(result != nullptr);
+    ASSERT_OK(result->Accept(&res_handler));  // Seek x 1 in one iteration
+    // Seek to an non-existing key, should still return Status::OK().
+    record.reset(new IteratorSeekQueryTraceRecord(
+        seekType, handles[0]->GetID(), "trace-record-get", fake_ts++));
+    ASSERT_OK(replayer->Execute(record, &result));
+    ASSERT_TRUE(result != nullptr);
+    ASSERT_OK(result->Accept(&res_handler));  // Seek x 2 in one iteration
+    // Seek from an invalid cf_id.
+    record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id,
+                                                  "whatever", fake_ts++));
+    ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+    ASSERT_TRUE(result == nullptr);
+  }
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 0);
+  ASSERT_EQ(res_handler.GetNumGets(), 0);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 4);  // Seek x 2 in two iterations
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+  res_handler.Reset();
+
+  // MultiGet related
+  // Get existing keys.
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+      std::vector<std::string>({"a", "foo"}), fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 1
+  // Get all non-existing keys, should still return Status::OK().
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+      std::vector<std::string>({"no1", "no2"}), fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 2
+  // Get mixed of existing and non-existing keys, should still return
+  // Status::OK().
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+      std::vector<std::string>({"a", "no2"}), fake_ts++));
+  ASSERT_OK(replayer->Execute(record, &result));
+  ASSERT_TRUE(result != nullptr);
+  MultiValuesTraceExecutionResult* mvr =
+      dynamic_cast<MultiValuesTraceExecutionResult*>(result.get());
+  ASSERT_TRUE(mvr != nullptr);
+  ASSERT_OK(mvr->GetMultiStatus()[0]);
+  ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound());
+  ASSERT_EQ(mvr->GetValues()[0], "1");
+  ASSERT_EQ(mvr->GetValues()[1], "");
+  ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 3
+  // Get from an invalid (non-existing) cf_id.
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>(
+          {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}),
+      std::vector<std::string>({"a", "foo", "whatever"}), fake_ts++));
+  ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+  ASSERT_TRUE(result == nullptr);
+  // Empty MultiGet
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>(), std::vector<std::string>(), fake_ts++));
+  ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
+  ASSERT_TRUE(result == nullptr);
+  // MultiGet size mismatch
+  record.reset(new MultiGetQueryTraceRecord(
+      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+      std::vector<std::string>({"a"}), fake_ts++));
+  ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
+  ASSERT_TRUE(result == nullptr);
+  ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+  ASSERT_EQ(res_handler.GetNumWrites(), 0);
+  ASSERT_EQ(res_handler.GetNumGets(), 0);
+  ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+  ASSERT_EQ(res_handler.GetNumMultiGets(), 3);
+  res_handler.Reset();
+
+  replayer.reset();
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithLimit) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+
+  // test the max trace file size options
+  trace_opts.max_trace_file_size = 5;
+  std::string trace_filename = dbname_ + "/rocksdb.trace1";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Put(0, "b", "1"));
+  ASSERT_OK(Put(0, "c", "1"));
+  ASSERT_OK(db_->EndTrace());
+
+  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2");
+  std::string value;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  DBOptions db_opts;
+  db_opts.env = env_;
+  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  std::unique_ptr<Replayer> replayer;
+  ASSERT_OK(
+      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+  ASSERT_OK(replayer->Prepare());
+  ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+  replayer.reset();
+
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithSampling) {
+  Options options = CurrentOptions();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+
+  // test the trace file sampling options
+  trace_opts.sampling_frequency = 2;
+  std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Put(0, "b", "2"));
+  ASSERT_OK(Put(0, "c", "3"));
+  ASSERT_OK(Put(0, "d", "4"));
+  ASSERT_OK(Put(0, "e", "5"));
+  ASSERT_OK(db_->EndTrace());
+
+  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling");
+  std::string value;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  DBOptions db_opts;
+  db_opts.env = env_;
+  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  std::unique_ptr<Replayer> replayer;
+  ASSERT_OK(
+      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+  ASSERT_OK(replayer->Prepare());
+  ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+  replayer.reset();
+
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+  ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithFilter) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  ReadOptions ro;
+  WriteOptions wo;
+  TraceOptions trace_opts;
+  EnvOptions env_opts;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Random rnd(301);
+  Iterator* single_iter = nullptr;
+
+  trace_opts.filter = TraceFilterType::kTraceFilterWrite;
+
+  std::string trace_filename = dbname_ + "/rocksdb.trace";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+  ASSERT_OK(Put(0, "a", "1"));
+  ASSERT_OK(Merge(0, "b", "2"));
+  ASSERT_OK(Delete(0, "c"));
+  ASSERT_OK(SingleDelete(0, "d"));
+  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("f", "11"));
+  ASSERT_OK(batch.Merge("g", "12"));
+  ASSERT_OK(batch.Delete("h"));
+  ASSERT_OK(batch.SingleDelete("i"));
+  ASSERT_OK(batch.DeleteRange("j", "k"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  single_iter = db_->NewIterator(ro);
+  single_iter->Seek("f");
+  single_iter->SeekForPrev("g");
+  delete single_iter;
+
+  ASSERT_EQ("1", Get(0, "a"));
+  ASSERT_EQ("12", Get(0, "g"));
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "rocksdb", "rocks"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+  ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file as it is after EndTrace.
+  ASSERT_OK(Put("hello", "world"));
+  ASSERT_OK(Merge("foo", "bar"));
+
+  // Open another db, replay, and verify the data
+  std::string value;
+  std::string dbname2 = test::PerThreadDBPath(env_, "db_replay");
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Using a different name than db2, to pacify infer's use-after-lifetime
+  // warnings (http://fbinfer.com).
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  DBOptions db_opts;
+  db_opts.env = env_;
+  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  std::unique_ptr<Replayer> replayer;
+  ASSERT_OK(
+      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+  ASSERT_OK(replayer->Prepare());
+  ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+  replayer.reset();
+
+  // All the key-values should not present since we filter out the WRITE ops.
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Set up a new db.
+  std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read");
+  ASSERT_OK(DestroyDB(dbname3, options));
+
+  DB* db3_init = nullptr;
+  options.create_if_missing = true;
+  ColumnFamilyHandle* cf3;
+  ASSERT_OK(DB::Open(options, dbname3, &db3_init));
+  ASSERT_OK(
+      db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
+  delete cf3;
+  delete db3_init;
+
+  column_families.clear();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  handles.clear();
+
+  DB* db3 = nullptr;
+  ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist
+  ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  // The tracer will not record the READ ops.
+  trace_opts.filter = TraceFilterType::kTraceFilterGet;
+  std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
+  std::unique_ptr<TraceWriter> trace_writer3;
+  ASSERT_OK(
+      NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
+  ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));
+
+  ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
+  ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
+  ASSERT_OK(db3->Delete(wo, handles[0], "c"));
+  ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));
+
+  ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
+  ASSERT_EQ(value, "1");
+  ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());
+
+  ASSERT_OK(db3->EndTrace());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db3;
+  ASSERT_OK(DestroyDB(dbname3, options));
+
+  std::unique_ptr<TraceReader> trace_reader3;
+  ASSERT_OK(
+      NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));
+
+  // Count the number of records in the trace file;
+  int count = 0;
+  std::string data;
+  Status s;
+  while (true) {
+    s = trace_reader3->Read(&data);
+    if (!s.ok()) {
+      break;
+    }
+    count += 1;
+  }
+  // We also need to count the header and footer
+  // 4 WRITE + HEADER + FOOTER = 6
+  ASSERT_EQ(count, 6);
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, PinnableSliceAndMmapReads) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  if (!IsMemoryMappedAccessSupported()) {
+    ROCKSDB_GTEST_SKIP("Test requires default environment");
+    return;
+  }
+  options.allow_mmap_reads = true;
+  options.max_open_files = 100;
+  options.compression = kNoCompression;
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  PinnableSlice pinned_value;
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  // It is not safe to pin mmap files as they might disappear by compaction
+  ASSERT_FALSE(pinned_value.IsPinned());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+  ASSERT_OK(dbfull()->TEST_CompactRange(
+      0 /* level */, nullptr /* begin */, nullptr /* end */,
+      nullptr /* column_family */, true /* disallow_trivial_move */));
+
+  // Ensure pinned_value doesn't rely on memory munmap'd by the above
+  // compaction. It crashes if it does.
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+#ifndef ROCKSDB_LITE
+  pinned_value.Reset();
+  // Unsafe to pin mmap files when they could be kicked out of table cache
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  ASSERT_FALSE(pinned_value.IsPinned());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+
+  pinned_value.Reset();
+  // In read-only mode with infinite capacity on table cache it should pin the
+  // value and avoid the memcpy
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+  ASSERT_TRUE(pinned_value.IsPinned());
+  ASSERT_EQ(pinned_value.ToString(), "bar");
+#endif
+}
+
+TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = false;
+  bbto.cache_index_and_filter_blocks = false;
+  bbto.block_cache = NewLRUCache(100000);
+  bbto.block_size = 400;  // small block size
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  std::string v = rnd.RandomString(400);
+
+  // Since v is the size of a block, each key should take a block
+  // of 400+ bytes.
+  ASSERT_OK(Put("1", v));
+  ASSERT_OK(Put("3", v));
+  ASSERT_OK(Put("5", v));
+  ASSERT_OK(Put("7", v));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+  // Verify that iterators don't pin more than one data block in block cache
+  // at each time.
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->SeekToFirst();
+
+    for (int i = 0; i < 4; i++) {
+      ASSERT_TRUE(iter->Valid());
+      // Block cache should contain exactly one block.
+      ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+      ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+      iter->Next();
+    }
+    ASSERT_FALSE(iter->Valid());
+
+    iter->Seek("4");
+    ASSERT_TRUE(iter->Valid());
+
+    ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+    ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+
+    iter->Seek("3");
+    ASSERT_TRUE(iter->Valid());
+
+    ASSERT_OK(iter->status());
+
+    ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+    ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+  }
+  ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+  // Test compaction case
+  ASSERT_OK(Put("2", v));
+  ASSERT_OK(Put("5", v));
+  ASSERT_OK(Put("6", v));
+  ASSERT_OK(Put("8", v));
+  ASSERT_OK(Flush());
+
+  // Clear existing data in block cache
+  bbto.block_cache->SetCapacity(0);
+  bbto.block_cache->SetCapacity(100000);
+
+  // Verify compaction input iterators don't hold more than one data blocks at
+  // one time.
+  std::atomic<bool> finished(false);
+  std::atomic<int> block_newed(0);
+  std::atomic<int> block_destroyed(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Block::Block:0", [&](void* /*arg*/) {
+        if (finished) {
+          return;
+        }
+        // Two iterators. At most 2 outstanding blocks.
+        EXPECT_GE(block_newed.load(), block_destroyed.load());
+        EXPECT_LE(block_newed.load(), block_destroyed.load() + 1);
+        block_newed.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "Block::~Block", [&](void* /*arg*/) {
+        if (finished) {
+          return;
+        }
+        // Two iterators. At most 2 outstanding blocks.
+        EXPECT_GE(block_newed.load(), block_destroyed.load() + 1);
+        EXPECT_LE(block_newed.load(), block_destroyed.load() + 2);
+        block_destroyed.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run:BeforeVerify",
+      [&](void* /*arg*/) { finished = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Two input files. Each of them has 4 data blocks.
+  ASSERT_EQ(8, block_newed.load());
+  ASSERT_EQ(8, block_destroyed.load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestBBTTailPrefetch) {
+  std::atomic<bool> called(false);
+  size_t expected_lower_bound = 512 * 1024;
+  size_t expected_higher_bound = 512 * 1024;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+        size_t* prefetch_size = static_cast<size_t*>(arg);
+        EXPECT_LE(expected_lower_bound, *prefetch_size);
+        EXPECT_GE(expected_higher_bound, *prefetch_size);
+        called = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  expected_lower_bound = 0;
+  expected_higher_bound = 8 * 1024;
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  // Full compaction to make sure there is no L0 file after the open.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_TRUE(called.load());
+  called = false;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  std::atomic<bool> first_call(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+        size_t* prefetch_size = static_cast<size_t*>(arg);
+        if (first_call) {
+          EXPECT_EQ(4 * 1024, *prefetch_size);
+          first_call = false;
+        } else {
+          EXPECT_GE(4 * 1024, *prefetch_size);
+        }
+        called = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.max_file_opening_threads = 1;  // one thread
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.max_open_files = -1;
+  Reopen(options);
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("1", "1"));
+  ASSERT_OK(Put("9", "1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_TRUE(called.load());
+  called = false;
+
+  // Parallel loading SST files
+  options.max_file_opening_threads = 16;
+  Reopen(options);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_TRUE(called.load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
+  // Setup sync point dependency to reproduce the race condition of
+  // DBImpl::GetColumnFamilyHandleUnlocked
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
+       "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
+      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
+       "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"test1", "test2"}, Options());
+  ASSERT_EQ(handles_.size(), 2);
+
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  port::Thread user_thread1([&]() {
+    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
+    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+    TEST_SYNC_POINT(
+        "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
+    TEST_SYNC_POINT(
+        "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
+    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+  });
+
+  port::Thread user_thread2([&]() {
+    TEST_SYNC_POINT(
+        "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
+    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
+    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+    TEST_SYNC_POINT(
+        "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
+    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+  });
+
+  user_thread1.join();
+  user_thread2.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, TestCompactFiles) {
+  // Setup sync point dependency to reproduce the race condition of
+  // DBImpl::GetColumnFamilyHandleUnlocked
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"TestCompactFiles::IngestExternalFile1",
+       "TestCompactFiles::IngestExternalFile2"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.env = env_;
+  options.num_levels = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  auto* handle = db_->DefaultColumnFamily();
+  ASSERT_EQ(db_->NumberLevels(handle), 2);
+
+  ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
+      ROCKSDB_NAMESPACE::EnvOptions(), options};
+  std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
+  std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
+  std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";
+
+  ASSERT_OK(sst_file_writer.Open(external_file1));
+  ASSERT_OK(sst_file_writer.Put("1", "1"));
+  ASSERT_OK(sst_file_writer.Put("2", "2"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  ASSERT_OK(sst_file_writer.Open(external_file2));
+  ASSERT_OK(sst_file_writer.Put("3", "3"));
+  ASSERT_OK(sst_file_writer.Put("4", "4"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  ASSERT_OK(sst_file_writer.Open(external_file3));
+  ASSERT_OK(sst_file_writer.Put("5", "5"));
+  ASSERT_OK(sst_file_writer.Put("6", "6"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
+                                    IngestExternalFileOptions()));
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+  std::vector<std::string> files;
+  GetSstFiles(env_, dbname_, &files);
+  ASSERT_EQ(files.size(), 2);
+
+  Status user_thread1_status;
+  port::Thread user_thread1([&]() {
+    user_thread1_status =
+        db_->CompactFiles(CompactionOptions(), handle, files, 1);
+  });
+
+  Status user_thread2_status;
+  port::Thread user_thread2([&]() {
+    user_thread2_status = db_->IngestExternalFile(handle, {external_file2},
+                                                  IngestExternalFileOptions());
+    TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
+  });
+
+  user_thread1.join();
+  user_thread2.join();
+
+  ASSERT_OK(user_thread1_status);
+  ASSERT_OK(user_thread2_status);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBTest2, MultiDBParallelOpenTest) {
+  const int kNumDbs = 2;
+  Options options = CurrentOptions();
+  std::vector<std::string> dbnames;
+  for (int i = 0; i < kNumDbs; ++i) {
+    dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + std::to_string(i)));
+    ASSERT_OK(DestroyDB(dbnames.back(), options));
+  }
+
+  // Verify empty DBs can be created in parallel
+  std::vector<std::thread> open_threads;
+  std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
+  options.create_if_missing = true;
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads.emplace_back(
+        [&](int dbnum) {
+          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+        },
+        i);
+  }
+
+  // Now add some data and close, so next we can verify non-empty DBs can be
+  // recovered in parallel
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads[i].join();
+    ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
+    delete dbs[i];
+  }
+
+  // Verify non-empty DBs can be recovered in parallel
+  open_threads.clear();
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads.emplace_back(
+        [&](int dbnum) {
+          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+        },
+        i);
+  }
+
+  // Wait and cleanup
+  for (int i = 0; i < kNumDbs; ++i) {
+    open_threads[i].join();
+    delete dbs[i];
+    ASSERT_OK(DestroyDB(dbnames[i], options));
+  }
+}
+
+namespace {
+class DummyOldStats : public Statistics {
+ public:
+  const char* Name() const override { return "DummyOldStats"; }
+  uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; }
+  void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
+    num_rt++;
+  }
+  void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
+  uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
+    return 0;
+  }
+  void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
+    num_mt++;
+  }
+  void histogramData(
+      uint32_t /*histogram_type*/,
+      ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
+  std::string getHistogramString(uint32_t /*type*/) const override {
+    return "";
+  }
+  bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
+  std::string ToString() const override { return ""; }
+  std::atomic<int> num_rt{0};
+  std::atomic<int> num_mt{0};
+};
+}  // anonymous namespace
+
+TEST_F(DBTest2, OldStatsInterface) {
+  DummyOldStats* dos = new DummyOldStats();
+  std::shared_ptr<Statistics> stats(dos);
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = stats;
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_EQ("bar", Get("foo"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("bar", Get("foo"));
+
+  ASSERT_GT(dos->num_rt, 0);
+  ASSERT_GT(dos->num_mt, 0);
+}
+
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+  const Snapshot* ss = db_->GetSnapshot();
+
+  for (auto h : handles_) {
+    db_->DestroyColumnFamilyHandle(h);
+  }
+  handles_.clear();
+
+  ASSERT_NOK(db_->Close());
+  db_->ReleaseSnapshot(ss);
+  ASSERT_OK(db_->Close());
+  delete db_;
+  db_ = nullptr;
+}
+
+TEST_F(DBTest2, PrefixBloomReseek) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1] f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_OK(Put("bbb1", ""));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+
+  // Seeking into f1, the iterator will check bloom filter which returns the
+  // file iterator ot be invalidate, and the cursor will put into f2, with
+  // the next key to be "ddd0".
+  iter->Seek("bbb1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bbb1", iter->key().ToString());
+
+  // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
+  iter->Seek("ccc1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("ccc1", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1] f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+
+  // Bloom filter is filterd out by f1.
+  // This is just one of several valid position following the contract.
+  // Postioning to ccc1 or ddd0 is also valid. This is just to validate
+  // the behavior of the current implementation. If underlying implementation
+  // changes, the test might fail here.
+  iter->Seek("bbb1");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8 * 8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif  // ROCKSDB_LITE
+
+// When DB is reopened with multiple column families, the manifest file
+// is written after the first CF is flushed, and it is written again
+// after each flush. If DB crashes between the flushes, the flushed CF
+// flushed will pass the latest log file, and now we require it not
+// to be corrupted, and triggering a corruption report.
+// We need to fix the bug and enable the test.
+TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
+  const std::vector<std::string> sync_points = {
+      "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
+      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
+  for (const auto& test_sync_point : sync_points) {
+    Options options = CurrentOptions();
+    // First destroy original db to ensure a clean start.
+    DestroyAndReopen(options);
+    options.create_if_missing = true;
+    options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put(1, "foo", "bar"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Put(1, "foo", "bar"));
+    // The value is large enough to be divided to two blocks.
+    std::string large_value(400, ' ');
+    ASSERT_OK(Put("foo1", large_value));
+    ASSERT_OK(Put("foo2", large_value));
+    Close();
+
+    // Corrupt the log file in the middle, so that it is not corrupted
+    // in the tail.
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    for (const auto& f : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) {
+        std::string fname = dbname_ + "/" + f;
+        std::string file_content;
+        ASSERT_OK(ReadFileToString(env_, fname, &file_content));
+        file_content[400] = 'h';
+        file_content[401] = 'a';
+        ASSERT_OK(WriteStringToFile(env_, file_content, fname));
+        break;
+      }
+    }
+
+    // Reopen and freeze the file system after the first manifest write.
+    FaultInjectionTestEnv fit_env(options.env);
+    options.env = &fit_env;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        test_sync_point,
+        [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_NOK(TryReopenWithColumnFamilies(
+        {kDefaultColumnFamilyName, "pikachu"}, options));
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+    fit_env.SetFilesystemActive(true);
+    // If we continue using failure ingestion Env, it will conplain something
+    // when renaming current file, which is not expected. Need to investigate
+    // why.
+    options.env = env_;
+    ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+                                          options));
+  }
+}
+
+TEST_F(DBTest2, SeekFileRangeDeleteTail) {
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewCappedPrefixTransform(1));
+  options.num_levels = 3;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("a", "a"));
+  const Snapshot* s1 = db_->GetSnapshot();
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
+  ASSERT_OK(Put("b", "a"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("x", "a"));
+  ASSERT_OK(Put("z", "a"));
+  ASSERT_OK(Flush());
+
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  {
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+    ASSERT_OK(iter->status());
+    iter->Seek("e");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("x", iter->key().ToString());
+  }
+  db_->ReleaseSnapshot(s1);
+}
+
+TEST_F(DBTest2, BackgroundPurgeTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_manager =
+      std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(1 << 20);
+  options.avoid_unnecessary_blocking_io = true;
+  DestroyAndReopen(options);
+  size_t base_value = options.write_buffer_manager->memory_usage();
+
+  ASSERT_OK(Put("a", "a"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+  ASSERT_OK(Flush());
+  size_t value = options.write_buffer_manager->memory_usage();
+  ASSERT_GT(value, base_value);
+
+  db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
+  test::SleepingBackgroundTask sleeping_task_after;
+  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                          &sleeping_task_after, Env::Priority::HIGH);
+  delete iter;
+
+  Env::Default()->SleepForMicroseconds(100000);
+  value = options.write_buffer_manager->memory_usage();
+  ASSERT_GT(value, base_value);
+
+  sleeping_task_after.WakeUp();
+  sleeping_task_after.WaitUntilDone();
+
+  test::SleepingBackgroundTask sleeping_task_after2;
+  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                          &sleeping_task_after2, Env::Priority::HIGH);
+  sleeping_task_after2.WakeUp();
+  sleeping_task_after2.WaitUntilDone();
+
+  value = options.write_buffer_manager->memory_usage();
+  ASSERT_EQ(base_value, value);
+}
+
+TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  options.max_manifest_file_size = 10;
+  options.create_if_missing = true;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(2, handles_.size());
+
+  ASSERT_OK(Put("foo", "value"));
+  const int kL0Files = options.level0_file_num_compaction_trigger;
+  for (int i = 0; i < kL0Files; ++i) {
+    ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
+    ASSERT_OK(Flush(/*cf=*/1));
+  }
+
+  port::Thread thread([&]() { ASSERT_OK(Flush()); });
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  thread.join();
+}
+
+TEST_F(DBTest2, SameSmallestInSameLevel) {
+  // This test validates fractional casacading logic when several files at one
+  // one level only contains the same user key.
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("key", "1"));
+  ASSERT_OK(Put("key", "2"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
+                                   nullptr));
+
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,4,1", FilesPerLevel());
+#endif  // ROCKSDB_LITE
+
+  ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
+}
+
+TEST_F(DBTest2, FileConsistencyCheckInOpen) {
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+        Status* ret_s = static_cast<Status*>(arg);
+        *ret_s = Status::Corruption("fcc");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.force_consistency_checks = true;
+  ASSERT_NOK(TryReopen(options));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.block_size = 300;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  table_options.index_shortening =
+      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  Reopen(options);
+
+  Random rnd(301);
+  std::string large_value = rnd.RandomString(500);
+
+  ASSERT_OK(Put("a1", large_value));
+  ASSERT_OK(Put("x1", large_value));
+  ASSERT_OK(Put("y1", large_value));
+  ASSERT_OK(Flush());
+
+  {
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+    ASSERT_OK(iterator->status());
+    iterator->SeekForPrev("x3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("x1", iterator->key().ToString());
+
+    iterator->SeekForPrev("a3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("a1", iterator->key().ToString());
+
+    iterator->SeekForPrev("y3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("y1", iterator->key().ToString());
+
+    // Query more than one non-existing prefix to cover the case both
+    // of empty hash bucket and hash bucket conflict.
+    iterator->SeekForPrev("b1");
+    // Result should be not valid or "a1".
+    if (iterator->Valid()) {
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    iterator->SeekForPrev("c1");
+    // Result should be not valid or "a1".
+    if (iterator->Valid()) {
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    iterator->SeekForPrev("d1");
+    // Result should be not valid or "a1".
+    if (iterator->Valid()) {
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    iterator->SeekForPrev("y3");
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_EQ("y1", iterator->key().ToString());
+  }
+}
+
+TEST_F(DBTest2, PartitionedIndexPrefetchFailure) {
+  Options options = last_options_;
+  options.env = env_;
+  options.max_open_files = 20;
+  BlockBasedTableOptions bbto;
+  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  bbto.metadata_block_size = 128;
+  bbto.block_size = 128;
+  bbto.block_cache = NewLRUCache(16777216);
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Force no table cache so every read will preload the SST file.
+  dbfull()->TEST_table_cache()->SetCapacity(0);
+  bbto.block_cache->SetCapacity(0);
+
+  Random rnd(301);
+  for (int i = 0; i < 4096; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(32)));
+  }
+  ASSERT_OK(Flush());
+
+  // Try different random failures in table open for 300 times.
+  for (int i = 0; i < 300; i++) {
+    env_->num_reads_fails_ = 0;
+    env_->rand_reads_fail_odd_ = 8;
+
+    std::string value;
+    Status s = dbfull()->Get(ReadOptions(), Key(1), &value);
+    if (env_->num_reads_fails_ > 0) {
+      ASSERT_NOK(s);
+    } else {
+      ASSERT_OK(s);
+    }
+  }
+
+  env_->rand_reads_fail_odd_ = 0;
+}
+
+TEST_F(DBTest2, ChangePrefixExtractor) {
+  for (bool use_partitioned_filter : {true, false}) {
+    // create a DB with block prefix index
+    BlockBasedTableOptions table_options;
+    Options options = CurrentOptions();
+
+    // Sometimes filter is checked based on upper bound. Assert counters
+    // for that case. Otherwise, only check data correctness.
+#ifndef ROCKSDB_LITE
+    bool expect_filter_check = !use_partitioned_filter;
+#else
+    bool expect_filter_check = false;
+#endif
+    table_options.partition_filters = use_partitioned_filter;
+    if (use_partitioned_filter) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    }
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.statistics = CreateDBStatistics();
+
+    options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+
+    ASSERT_OK(Put("aa", ""));
+    ASSERT_OK(Put("xb", ""));
+    ASSERT_OK(Put("xx1", ""));
+    ASSERT_OK(Put("xz1", ""));
+    ASSERT_OK(Put("zz", ""));
+    ASSERT_OK(Flush());
+
+    // After reopening DB with prefix size 2 => 1, prefix extractor
+    // won't take effective unless it won't change results based
+    // on upper bound and seek key.
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+    Reopen(options);
+
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+      ASSERT_OK(iterator->status());
+      iterator->Seek("xa");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not
+      // correct in this case. So don't check counters in this case.
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xz");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xz1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+    }
+
+    std::string ub_str = "xg9";
+    Slice ub(ub_str);
+    ReadOptions ro;
+    ro.iterate_upper_bound = &ub;
+
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      ASSERT_OK(iterator->status());
+
+      // SeekForPrev() never uses prefix bloom if it is changed.
+      iterator->SeekForPrev("xg0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+    }
+
+    ub_str = "xx9";
+    ub = Slice(ub_str);
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      ASSERT_OK(iterator->status());
+
+      iterator->Seek("x");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xx0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xx1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+    }
+
+    CompactRangeOptions compact_range_opts;
+    compact_range_opts.bottommost_level_compaction =
+        BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+    ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+    // Re-execute similar queries after a full compaction
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+
+      iterator->Seek("x");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xg");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xx1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xz");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xz1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      ASSERT_OK(iterator->status());
+    }
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      iterator->SeekForPrev("xx0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      iterator->Seek("xx0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xx1", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+
+      ASSERT_OK(iterator->status());
+    }
+
+    ub_str = "xg9";
+    ub = Slice(ub_str);
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->SeekForPrev("xg0");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("xb", iterator->key().ToString());
+      if (expect_filter_check) {
+        ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+      }
+      ASSERT_OK(iterator->status());
+    }
+  }
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
+  // create a DB with block prefix index
+  BlockBasedTableOptions table_options;
+  Options options = CurrentOptions();
+  table_options.block_size = 300;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  table_options.index_shortening =
+      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.level0_file_num_compaction_trigger = 8;
+
+  Reopen(options);
+
+  ASSERT_OK(Put("b1", "ok"));
+  ASSERT_OK(Flush());
+
+  // Flushing several files so that the chance that hash bucket
+  // is empty fo "b" in at least one of the files is high.
+  ASSERT_OK(Put("a1", ""));
+  ASSERT_OK(Put("c1", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("a2", ""));
+  ASSERT_OK(Put("c2", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("a3", ""));
+  ASSERT_OK(Put("c3", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("a4", ""));
+  ASSERT_OK(Put("c4", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("a5", ""));
+  ASSERT_OK(Put("c5", ""));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("ok", Get("b1"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutoPrefixMode1) {
+  do {
+    // create a DB with block prefix index
+    Options options = CurrentOptions();
+    BlockBasedTableOptions table_options =
+        *options.table_factory->GetOptions<BlockBasedTableOptions>();
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+    options.statistics = CreateDBStatistics();
+
+    Reopen(options);
+
+    Random rnd(301);
+    std::string large_value = rnd.RandomString(500);
+
+    ASSERT_OK(Put("a1", large_value));
+    ASSERT_OK(Put("x1", large_value));
+    ASSERT_OK(Put("y1", large_value));
+    ASSERT_OK(Flush());
+
+    ReadOptions ro;
+    ro.total_order_seek = false;
+    ro.auto_prefix_mode = true;
+
+    const auto stat = BLOOM_FILTER_PREFIX_CHECKED;
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("x1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    Slice ub;
+    ro.iterate_upper_bound = &ub;
+
+    ub = "b9";
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    ub = "z";
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("x1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    ub = "c";
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    ub = "c1";
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+
+    // The same queries without recreating iterator
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      ub = "b9";
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      ub = "z";
+      iterator->Seek("b1");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("x1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "c";
+      iterator->Seek("b1");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b9";
+      iterator->SeekForPrev("b1");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "zz";
+      iterator->SeekToLast();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("y1", iterator->key().ToString());
+
+      iterator->SeekToFirst();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+    }
+
+    // Similar, now with reverse comparator
+    // Technically, we are violating axiom 2 of prefix_extractors, but
+    // it should be revised because of major use-cases using
+    // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME)
+    options.comparator = ReverseBytewiseComparator();
+    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+    DestroyAndReopen(options);
+
+    ASSERT_OK(Put("a1", large_value));
+    ASSERT_OK(Put("x1", large_value));
+    ASSERT_OK(Put("y1", large_value));
+    ASSERT_OK(Flush());
+
+    {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+      ub = "b1";
+      iterator->Seek("b9");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      ub = "b1";
+      iterator->Seek("z");
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("y1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b1";
+      iterator->Seek("c");
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b";
+      iterator->Seek("c9");
+      ASSERT_FALSE(iterator->Valid());
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // is "correctly" implemented.
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "a";
+      iterator->Seek("b9");
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // is "correctly" implemented.
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b";
+      iterator->Seek("a");
+      ASSERT_FALSE(iterator->Valid());
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // matches BytewiseComparator::IsSameLengthImmediateSuccessor. Upper
+      // comparing before seek key prevents a real bug from surfacing.
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "b1";
+      iterator->SeekForPrev("b9");
+      ASSERT_TRUE(iterator->Valid());
+      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+      // is "correctly" implemented.
+      ASSERT_EQ("x1", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+      ub = "a";
+      iterator->SeekToLast();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("a1", iterator->key().ToString());
+
+      iterator->SeekToFirst();
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("y1", iterator->key().ToString());
+    }
+
+    // Now something a bit different, related to "short" keys that
+    // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode.
+    options.comparator = BytewiseComparator();
+    for (const auto config : {"fixed:2", "capped:2"}) {
+      ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config,
+                                                 &options.prefix_extractor));
+
+      // FIXME: kHashSearch, etc. requires all keys be InDomain
+      if (StartsWith(config, "fixed") &&
+          (table_options.index_type == BlockBasedTableOptions::kHashSearch ||
+           StartsWith(options.memtable_factory->Name(), "Hash"))) {
+        continue;
+      }
+      DestroyAndReopen(options);
+
+      const char* a_end_stuff = "a\xffXYZ";
+      const char* b_begin_stuff = "b\x00XYZ";
+      ASSERT_OK(Put("a", large_value));
+      ASSERT_OK(Put("b", large_value));
+      ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value));
+      ASSERT_OK(Put("c", large_value));
+      ASSERT_OK(Flush());
+
+      // control showing valid optimization with auto_prefix mode
+      ub = Slice(a_end_stuff, 4);
+      ro.iterate_upper_bound = &ub;
+
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+      iterator->Seek(Slice(a_end_stuff, 2));
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      // test, cannot be validly optimized with auto_prefix_mode
+      ub = Slice(b_begin_stuff, 2);
+      ro.iterate_upper_bound = &ub;
+
+      iterator->Seek(Slice(a_end_stuff, 2));
+      // !!! BUG !!! See "BUG" section of auto_prefix_mode.
+      ASSERT_FALSE(iterator->Valid());
+      EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+
+      // To prove that is the wrong result, now use total order seek
+      ReadOptions tos_ro = ro;
+      tos_ro.total_order_seek = true;
+      tos_ro.auto_prefix_mode = false;
+      iterator.reset(db_->NewIterator(tos_ro));
+      iterator->Seek(Slice(a_end_stuff, 2));
+      ASSERT_TRUE(iterator->Valid());
+      ASSERT_EQ("b", iterator->key().ToString());
+      EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+      ASSERT_OK(iterator->status());
+    }
+  } while (ChangeOptions(kSkipPlainTable));
+}
+
+class RenameCurrentTest : public DBTestBase,
+                          public testing::WithParamInterface<std::string> {
+ public:
+  RenameCurrentTest()
+      : DBTestBase("rename_current_test", /*env_do_fsync=*/true),
+        sync_point_(GetParam()) {}
+
+  ~RenameCurrentTest() override {}
+
+  void SetUp() override {
+    env_->no_file_overwrite_.store(true, std::memory_order_release);
+  }
+
+  void TearDown() override {
+    env_->no_file_overwrite_.store(false, std::memory_order_release);
+  }
+
+  void SetupSyncPoints() {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) {
+      Status* s = reinterpret_cast<Status*>(arg);
+      assert(s);
+      *s = Status::IOError("Injected IO error.");
+    });
+  }
+
+  const std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest,
+                        ::testing::Values("SetCurrentFile:BeforeRename",
+                                          "SetCurrentFile:AfterRename"));
+
+TEST_P(RenameCurrentTest, Open) {
+  Destroy(last_options_);
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  SetupSyncPoints();
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = TryReopen(options);
+  ASSERT_NOK(s);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  Reopen(options);
+}
+
+TEST_P(RenameCurrentTest, Flush) {
+  Destroy(last_options_);
+  Options options = GetDefaultOptions();
+  options.max_manifest_file_size = 1;
+  options.create_if_missing = true;
+  Reopen(options);
+  ASSERT_OK(Put("key", "value"));
+  SetupSyncPoints();
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_NOK(Flush());
+
+  ASSERT_NOK(Put("foo", "value"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  Reopen(options);
+  ASSERT_EQ("value", Get("key"));
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST_P(RenameCurrentTest, Compaction) {
+  Destroy(last_options_);
+  Options options = GetDefaultOptions();
+  options.max_manifest_file_size = 1;
+  options.create_if_missing = true;
+  Reopen(options);
+  ASSERT_OK(Put("a", "a_value"));
+  ASSERT_OK(Put("c", "c_value"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("b", "b_value"));
+  ASSERT_OK(Put("d", "d_value"));
+  ASSERT_OK(Flush());
+
+  SetupSyncPoints();
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                               /*end=*/nullptr));
+
+  ASSERT_NOK(Put("foo", "value"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  Reopen(options);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ("d_value", Get("d"));
+}
+
+TEST_F(DBTest2, LastLevelTemperature) {
+  class TestListener : public EventListener {
+   public:
+    void OnFileReadFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    void OnFileWriteFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    void OnFileFlushFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    void OnFileSyncFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    void OnFileCloseFinish(const FileOperationInfo& info) override {
+      UpdateFileTemperature(info);
+    }
+
+    bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+    std::unordered_map<uint64_t, Temperature> file_temperatures;
+
+   private:
+    void UpdateFileTemperature(const FileOperationInfo& info) {
+      auto filename = GetFileName(info.path);
+      uint64_t number;
+      FileType type;
+      ASSERT_TRUE(ParseFileName(filename, &number, &type));
+      if (type == kTableFile) {
+        MutexLock l(&mutex_);
+        auto ret = file_temperatures.insert({number, info.temperature});
+        if (!ret.second) {
+          // the same file temperature should always be the same for all events
+          ASSERT_TRUE(ret.first->second == info.temperature);
+        }
+      }
+    }
+
+    std::string GetFileName(const std::string& fname) {
+      auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
+      // workaround only for Windows that the file path could contain both
+      // Windows FilePathSeparator and '/'
+      filename = filename.substr(filename.find_last_of('/') + 1);
+      return filename;
+    }
+
+    port::Mutex mutex_;
+  };
+
+  const int kNumLevels = 7;
+  const int kLastLevel = kNumLevels - 1;
+
+  auto* listener = new TestListener();
+
+  Options options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kWarm;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  options.listeners.emplace_back(listener);
+  Reopen(options);
+
+  auto size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  get_iostats_context()->Reset();
+  IOStatsContext* iostats = get_iostats_context();
+
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  SstFileMetaData meta = metadata.levels[kLastLevel].files[0];
+  ASSERT_EQ(Temperature::kWarm, meta.temperature);
+  uint64_t number;
+  FileType type;
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+
+  ASSERT_EQ("bar", Get("foo"));
+
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+  ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+  // non-bottommost file still has unknown temperature
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("bar", Get("bar"));
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+  ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  meta = metadata.levels[0].files[0];
+  ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  meta = metadata.levels[kLastLevel].files[0];
+  ASSERT_EQ(Temperature::kWarm, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  // reopen and check the information is persisted
+  Reopen(options);
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  meta = metadata.levels[0].files[0];
+  ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  meta = metadata.levels[kLastLevel].files[0];
+  ASSERT_EQ(Temperature::kWarm, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  // check other non-exist temperatures
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+  std::string prop;
+  ASSERT_TRUE(dbfull()->GetProperty(
+      DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+      &prop));
+  ASSERT_EQ(std::atoi(prop.c_str()), 0);
+
+  Reopen(options);
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  meta = metadata.levels[0].files[0];
+  ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+  meta = metadata.levels[kLastLevel].files[0];
+  ASSERT_EQ(Temperature::kWarm, meta.temperature);
+  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+}
+
+TEST_F(DBTest2, LastLevelTemperatureUniversal) {
+  const int kTriggerNum = 3;
+  const int kNumLevels = 5;
+  const int kBottommostLevel = kNumLevels - 1;
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kTriggerNum;
+  options.num_levels = kNumLevels;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  auto size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  get_iostats_context()->Reset();
+  IOStatsContext* iostats = get_iostats_context();
+
+  for (int i = 0; i < kTriggerNum; i++) {
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Put("bar", "bar"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+  ASSERT_EQ("bar", Get("foo"));
+
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+
+  // Update bottommost temperature
+  options.bottommost_temperature = Temperature::kWarm;
+  Reopen(options);
+  db_->GetColumnFamilyMetaData(&metadata);
+  // Should not impact existing ones
+  ASSERT_EQ(Temperature::kUnknown,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+
+  // new generated file should have the new settings
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  ASSERT_EQ(Temperature::kWarm,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+  // non-bottommost file still has unknown temperature
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  // check other non-exist temperatures
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+  std::string prop;
+  ASSERT_TRUE(dbfull()->GetProperty(
+      DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+      &prop));
+  ASSERT_EQ(std::atoi(prop.c_str()), 0);
+
+  // Update bottommost temperature dynamically with SetOptions
+  auto s = db_->SetOptions({{"last_level_temperature", "kCold"}});
+  ASSERT_OK(s);
+  ASSERT_EQ(db_->GetOptions().bottommost_temperature, Temperature::kCold);
+  db_->GetColumnFamilyMetaData(&metadata);
+  // Should not impact the existing files
+  ASSERT_EQ(Temperature::kWarm,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+
+  // new generated files should have the new settings
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  ASSERT_EQ(Temperature::kCold,
+            metadata.levels[kBottommostLevel].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_GT(size, 0);
+
+  // kLastTemperature is an invalid temperature
+  options.bottommost_temperature = Temperature::kLastTemperature;
+  s = TryReopen(options);
+  ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBTest2, LastLevelStatistics) {
+  Options options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kWarm;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.statistics = CreateDBStatistics();
+  Reopen(options);
+
+  // generate 1 sst on level 0
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("bar", Get("bar"));
+
+  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0);
+  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0);
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0);
+
+  // 2nd flush to trigger compaction
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ("bar", Get("bar"));
+
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
+            options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
+            options.statistics->getTickerCount(WARM_FILE_READ_COUNT));
+
+  auto pre_bytes =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES);
+  auto pre_count =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+
+  // 3rd flush to generate 1 sst on level 0
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("bar", Get("bar"));
+
+  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES),
+            pre_bytes);
+  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT),
+            pre_count);
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
+            options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
+  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
+            options.statistics->getTickerCount(WARM_FILE_READ_COUNT));
+}
+
+TEST_F(DBTest2, CheckpointFileTemperature) {
+  class NoLinkTestFS : public FileTemperatureTestFS {
+    using FileTemperatureTestFS::FileTemperatureTestFS;
+
+    IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
+                      IODebugContext*) override {
+      // return not supported to force checkpoint copy the file instead of just
+      // link
+      return IOStatus::NotSupported();
+    }
+  };
+  auto test_fs = std::make_shared<NoLinkTestFS>(env_->GetFileSystem());
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
+  Options options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kWarm;
+  // set dynamic_level to true so the compaction would compact the data to the
+  // last level directly which will have the last_level_temperature
+  options.level_compaction_dynamic_level_bytes = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.env = env.get();
+  Reopen(options);
+
+  // generate a bottommost file and a non-bottommost file
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  auto size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  std::map<uint64_t, Temperature> temperatures;
+  std::vector<LiveFileStorageInfo> infos;
+  ASSERT_OK(
+      dbfull()->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos));
+  for (auto info : infos) {
+    temperatures.emplace(info.file_number, info.temperature);
+  }
+
+  test_fs->PopRequestedSstFileTemperatures();
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(
+      checkpoint->CreateCheckpoint(dbname_ + kFilePathSeparator + "tempcp"));
+
+  // checking src file src_temperature hints: 2 sst files: 1 sst is kWarm,
+  // another is kUnknown
+  std::vector<std::pair<uint64_t, Temperature>> requested_temps;
+  test_fs->PopRequestedSstFileTemperatures(&requested_temps);
+  // Two requests
+  ASSERT_EQ(requested_temps.size(), 2);
+  std::set<uint64_t> distinct_requests;
+  for (const auto& requested_temp : requested_temps) {
+    // Matching manifest temperatures
+    ASSERT_EQ(temperatures.at(requested_temp.first), requested_temp.second);
+    distinct_requests.insert(requested_temp.first);
+  }
+  // Each request to distinct file
+  ASSERT_EQ(distinct_requests.size(), requested_temps.size());
+
+  delete checkpoint;
+  Close();
+}
+
+TEST_F(DBTest2, FileTemperatureManifestFixup) {
+  auto test_fs = std::make_shared<FileTemperatureTestFS>(env_->GetFileSystem());
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
+  Options options = CurrentOptions();
+  options.bottommost_temperature = Temperature::kWarm;
+  // set dynamic_level to true so the compaction would compact the data to the
+  // last level directly which will have the last_level_temperature
+  options.level_compaction_dynamic_level_bytes = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.env = env.get();
+  std::vector<std::string> cfs = {/*"default",*/ "test1", "test2"};
+  CreateAndReopenWithCF(cfs, options);
+  // Needed for later re-opens (weird)
+  cfs.insert(cfs.begin(), kDefaultColumnFamilyName);
+
+  // Generate a bottommost file in all CFs
+  for (int cf = 0; cf < 3; ++cf) {
+    ASSERT_OK(Put(cf, "a", "val"));
+    ASSERT_OK(Put(cf, "c", "val"));
+    ASSERT_OK(Flush(cf));
+    ASSERT_OK(Put(cf, "b", "val"));
+    ASSERT_OK(Put(cf, "d", "val"));
+    ASSERT_OK(Flush(cf));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // verify
+  ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+  // Generate a non-bottommost file in all CFs
+  for (int cf = 0; cf < 3; ++cf) {
+    ASSERT_OK(Put(cf, "e", "val"));
+    ASSERT_OK(Flush(cf));
+  }
+
+  // re-verify
+  ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
+  // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+  // Now change FS temperature on bottommost file(s) to kCold
+  std::map<uint64_t, Temperature> current_temps;
+  test_fs->CopyCurrentSstFileTemperatures(&current_temps);
+  for (auto e : current_temps) {
+    if (e.second == Temperature::kWarm) {
+      test_fs->OverrideSstFileTemperature(e.first, Temperature::kCold);
+    }
+  }
+  // Metadata not yet updated
+  ASSERT_EQ(Get("a"), "val");
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // Update with Close and UpdateManifestForFilesState, but first save cf
+  // descriptors
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (size_t i = 0; i < handles_.size(); ++i) {
+    ColumnFamilyDescriptor cfdescriptor;
+    // GetDescriptor is not implemented for ROCKSDB_LITE
+    handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
+    column_families.push_back(cfdescriptor);
+  }
+  Close();
+  experimental::UpdateManifestForFilesStateOptions update_opts;
+  update_opts.update_temperatures = true;
+
+  ASSERT_OK(experimental::UpdateManifestForFilesState(
+      options, dbname_, column_families, update_opts));
+
+  // Re-open and re-verify after update
+  ReopenWithColumnFamilies(cfs, options);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+  // Change kUnknown to kHot
+  test_fs->CopyCurrentSstFileTemperatures(&current_temps);
+  for (auto e : current_temps) {
+    if (e.second == Temperature::kUnknown) {
+      test_fs->OverrideSstFileTemperature(e.first, Temperature::kHot);
+    }
+  }
+
+  // Update with Close and UpdateManifestForFilesState
+  Close();
+  ASSERT_OK(experimental::UpdateManifestForFilesState(
+      options, dbname_, column_families, update_opts));
+
+  // Re-open and re-verify after update
+  ReopenWithColumnFamilies(cfs, options);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kHot), 0);
+
+  Close();
+}
+#endif  // ROCKSDB_LITE
+
+// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
+TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value0"));
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  bool should_inject_error = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::RecoverLogFiles:BeforeReadWal",
+      [&](void* /*arg*/) { should_inject_error = true; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "LogReader::ReadMore:AfterReadFile", [&](void* arg) {
+        if (should_inject_error) {
+          ASSERT_NE(nullptr, arg);
+          *reinterpret_cast<Status*>(arg) = Status::IOError("Injected IOError");
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  options.avoid_flush_during_recovery = true;
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  Status s = TryReopen(options);
+  ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCallFlush:Start:1",
+        "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
+       {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
+        "DBImpl::BackgroundCallFlush:Start:2"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"test1"}, Options());
+  ASSERT_OK(Put("foo", "bar"));
+
+  // Creating a CF when a flush is going on, log is synced but the
+  // closed log file is not synced and corrupted.
+  port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
+  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
+  CreateColumnFamilies({"test2"}, Options());
+  env_->corrupt_in_sync_ = true;
+  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
+  flush_thread.join();
+  env_->corrupt_in_sync_ = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Reopening the DB should not corrupt anything
+  Options options = CurrentOptions();
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
+}
+
+TEST_F(DBTest2, RenameDirectory) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "value0"));
+  Close();
+  auto old_dbname = dbname_;
+  auto new_dbname = dbname_ + "_2";
+  EXPECT_OK(env_->RenameFile(dbname_, new_dbname));
+  options.create_if_missing = false;
+  dbname_ = new_dbname;
+  ASSERT_OK(TryReopen(options));
+  ASSERT_EQ("value0", Get("foo"));
+  Destroy(options);
+  dbname_ = old_dbname;
+}
+
+TEST_F(DBTest2, SstUniqueIdVerifyBackwardCompatible) {
+  const int kNumSst = 3;
+  const int kLevel0Trigger = 4;
+  auto options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  options.statistics = CreateDBStatistics();
+  // Skip for now
+  options.verify_sst_unique_id_in_manifest = false;
+  Reopen(options);
+
+  std::atomic_int skipped = 0;
+  std::atomic_int passed = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::SkippedVerifyUniqueId",
+      [&](void* /*arg*/) { skipped++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::PassedVerifyUniqueId",
+      [&](void* /*arg*/) { passed++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // generate a few SSTs
+  for (int i = 0; i < kNumSst; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Verification has been skipped on files so far
+  EXPECT_EQ(skipped, kNumSst);
+  EXPECT_EQ(passed, 0);
+
+  // Reopen with verification
+  options.verify_sst_unique_id_in_manifest = true;
+  skipped = 0;
+  passed = 0;
+  Reopen(options);
+  EXPECT_EQ(skipped, 0);
+  EXPECT_EQ(passed, kNumSst);
+
+  // Now simulate no unique id in manifest for next file
+  // NOTE: this only works for loading manifest from disk,
+  // not in-memory manifest, so we need to re-open below.
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionEdit::EncodeTo:UniqueId", [&](void* arg) {
+        auto unique_id = static_cast<UniqueId64x2*>(arg);
+        // remove id before writing it to manifest
+        (*unique_id)[0] = 0;
+        (*unique_id)[1] = 0;
+      });
+
+  // test compaction generated Sst
+  for (int i = kNumSst; i < kLevel0Trigger; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+
+  // Reopen (with verification)
+  ASSERT_TRUE(options.verify_sst_unique_id_in_manifest);
+  skipped = 0;
+  passed = 0;
+  Reopen(options);
+  EXPECT_EQ(skipped, 1);
+  EXPECT_EQ(passed, 0);
+}
+
+TEST_F(DBTest2, SstUniqueIdVerify) {
+  const int kNumSst = 3;
+  const int kLevel0Trigger = 4;
+  auto options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  // Allow mismatch for now
+  options.verify_sst_unique_id_in_manifest = false;
+  Reopen(options);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+        auto props = static_cast<TableProperties*>(props_vs);
+        // update table property session_id to a different one, which
+        // changes unique ID
+        props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // generate a few SSTs
+  for (int i = 0; i < kNumSst; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Reopen with verification should report corruption
+  options.verify_sst_unique_id_in_manifest = true;
+  auto s = TryReopen(options);
+  ASSERT_TRUE(s.IsCorruption());
+
+  // Reopen without verification should be fine
+  options.verify_sst_unique_id_in_manifest = false;
+  Reopen(options);
+
+  // test compaction generated Sst
+  for (int i = kNumSst; i < kLevel0Trigger; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif  // ROCKSDB_LITE
+
+  // Reopen with verification should fail
+  options.verify_sst_unique_id_in_manifest = true;
+  s = TryReopen(options);
+  ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) {
+  const int kNumSst = 3;
+  const int kLevel0Trigger = 4;
+  auto options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kLevel0Trigger;
+  // Allow mismatch for now
+  options.verify_sst_unique_id_in_manifest = false;
+
+  CreateAndReopenWithCF({"one", "two"}, options);
+
+  // generate good SSTs
+  for (int cf_num : {0, 2}) {
+    for (int i = 0; i < kNumSst; i++) {
+      for (int j = 0; j < 100; j++) {
+        ASSERT_OK(Put(cf_num, Key(i * 10 + j), "value"));
+      }
+      ASSERT_OK(Flush(cf_num));
+    }
+  }
+
+  // generate SSTs with bad unique id
+  SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+        auto props = static_cast<TableProperties*>(props_vs);
+        // update table property session_id to a different one
+        props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  for (int i = 0; i < kNumSst; i++) {
+    for (int j = 0; j < 100; j++) {
+      ASSERT_OK(Put(1, Key(i * 10 + j), "value"));
+    }
+    ASSERT_OK(Flush(1));
+  }
+
+  // Reopen with verification should report corruption
+  options.verify_sst_unique_id_in_manifest = true;
+  auto s = TryReopenWithColumnFamilies({"default", "one", "two"}, options);
+  ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) {
+  const auto tamper_with_uniq_id = [&](void* arg) {
+    auto props = static_cast<TableProperties*>(arg);
+    assert(props);
+    // update table property session_id to a different one
+    props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+  };
+
+  const auto assert_db = [&](size_t expected_count,
+                             const std::string& expected_v) {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    size_t cnt = 0;
+    for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) {
+      ASSERT_EQ(std::to_string(cnt), it->key());
+      ASSERT_EQ(expected_v, it->value());
+    }
+    ASSERT_EQ(expected_count, cnt);
+  };
+
+  const int num_l0_compaction_trigger = 8;
+  const int num_l0 = num_l0_compaction_trigger - 1;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = num_l0_compaction_trigger;
+
+  for (int k = 0; k < num_l0; ++k) {
+    // Allow mismatch for now
+    options.verify_sst_unique_id_in_manifest = false;
+
+    DestroyAndReopen(options);
+
+    constexpr size_t num_keys_per_file = 10;
+    for (int i = 0; i < num_l0; ++i) {
+      for (size_t j = 0; j < num_keys_per_file; ++j) {
+        ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i)));
+      }
+      if (i == k) {
+        SyncPoint::GetInstance()->DisableProcessing();
+        SyncPoint::GetInstance()->SetCallBack(
+            "PropertyBlockBuilder::AddTableProperty:Start",
+            tamper_with_uniq_id);
+        SyncPoint::GetInstance()->EnableProcessing();
+      }
+      ASSERT_OK(Flush());
+    }
+
+    options.verify_sst_unique_id_in_manifest = true;
+    Status s = TryReopen(options);
+    ASSERT_TRUE(s.IsCorruption());
+
+    options.best_efforts_recovery = true;
+    Reopen(options);
+    assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
+
+    // Reopen with regular recovery
+    options.best_efforts_recovery = false;
+    Reopen(options);
+    assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    for (size_t i = 0; i < num_keys_per_file; ++i) {
+      ASSERT_OK(Put(std::to_string(i), "v"));
+    }
+    ASSERT_OK(Flush());
+    Reopen(options);
+    {
+      for (size_t i = 0; i < num_keys_per_file; ++i) {
+        ASSERT_EQ("v", Get(std::to_string(i)));
+      }
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, GetLatestSeqAndTsForKey) {
+  Destroy(last_options_);
+
+  Options options = CurrentOptions();
+  options.max_write_buffer_size_to_maintain = 64 << 10;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options.statistics = CreateDBStatistics();
+
+  Reopen(options);
+
+  constexpr uint64_t kTsU64Value = 12;
+
+  for (uint64_t key = 0; key < 100; ++key) {
+    std::string ts;
+    PutFixed64(&ts, kTsU64Value);
+
+    std::string key_str;
+    PutFixed64(&key_str, key);
+    std::reverse(key_str.begin(), key_str.end());
+    ASSERT_OK(db_->Put(WriteOptions(), key_str, ts, "value"));
+  }
+
+  ASSERT_OK(Flush());
+
+  constexpr bool cache_only = true;
+  constexpr SequenceNumber lower_bound_seq = 0;
+  auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(
+      dbfull()->DefaultColumnFamily());
+  assert(cfhi);
+  assert(cfhi->cfd());
+  SuperVersion* sv = cfhi->cfd()->GetSuperVersion();
+  for (uint64_t key = 0; key < 100; ++key) {
+    std::string key_str;
+    PutFixed64(&key_str, key);
+    std::reverse(key_str.begin(), key_str.end());
+    std::string ts;
+    SequenceNumber seq = kMaxSequenceNumber;
+    bool found_record_for_key = false;
+    bool is_blob_index = false;
+
+    const Status s = dbfull()->GetLatestSequenceForKey(
+        sv, key_str, cache_only, lower_bound_seq, &seq, &ts,
+        &found_record_for_key, &is_blob_index);
+    ASSERT_OK(s);
+    std::string expected_ts;
+    PutFixed64(&expected_ts, kTsU64Value);
+    ASSERT_EQ(expected_ts, ts);
+    ASSERT_TRUE(found_record_for_key);
+    ASSERT_FALSE(is_blob_index);
+  }
+
+  // Verify that no read to SST files.
+  ASSERT_EQ(0, options.statistics->getTickerCount(GET_HIT_L0));
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test_util.cc b/src/rocksdb/db/db_test_util.cc
new file mode 100644
index 000000000..d53bca51a
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.cc
@@ -0,0 +1,1773 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+
+#include "cache/cache_reservation_manager.h"
+#include "db/forward_iterator.h"
+#include "env/mock_env.h"
+#include "port/lang.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/unique_id.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "table/format.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+int64_t MaybeCurrentTime(Env* env) {
+  int64_t time = 1337346000;  // arbitrary fallback default
+  env->GetCurrentTime(&time).PermitUncheckedError();
+  return time;
+}
+}  // anonymous namespace
+
+// Special Env used to delay background operations
+
+SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep)
+    : EnvWrapper(base),
+      maybe_starting_time_(MaybeCurrentTime(base)),
+      rnd_(301),
+      sleep_counter_(this),
+      time_elapse_only_sleep_(time_elapse_only_sleep),
+      no_slowdown_(time_elapse_only_sleep) {
+  delay_sstable_sync_.store(false, std::memory_order_release);
+  drop_writes_.store(false, std::memory_order_release);
+  no_space_.store(false, std::memory_order_release);
+  non_writable_.store(false, std::memory_order_release);
+  count_random_reads_ = false;
+  count_sequential_reads_ = false;
+  manifest_sync_error_.store(false, std::memory_order_release);
+  manifest_write_error_.store(false, std::memory_order_release);
+  log_write_error_.store(false, std::memory_order_release);
+  no_file_overwrite_.store(false, std::memory_order_release);
+  random_file_open_counter_.store(0, std::memory_order_relaxed);
+  delete_count_.store(0, std::memory_order_relaxed);
+  num_open_wal_file_.store(0);
+  log_write_slowdown_ = 0;
+  bytes_written_ = 0;
+  sync_counter_ = 0;
+  non_writeable_rate_ = 0;
+  new_writable_count_ = 0;
+  non_writable_count_ = 0;
+  table_write_callback_ = nullptr;
+}
+DBTestBase::DBTestBase(const std::string path, bool env_do_fsync)
+    : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) {
+  Env* base_env = Env::Default();
+  ConfigOptions config_options;
+  EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_));
+  EXPECT_NE(nullptr, base_env);
+  if (getenv("MEM_ENV")) {
+    mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock());
+  }
+#ifndef ROCKSDB_LITE
+  if (getenv("ENCRYPTED_ENV")) {
+    std::shared_ptr<EncryptionProvider> provider;
+    std::string provider_id = getenv("ENCRYPTED_ENV");
+    if (provider_id.find("=") == std::string::npos &&
+        !EndsWith(provider_id, "://test")) {
+      provider_id = provider_id + "://test";
+    }
+    EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(), provider_id,
+                                                   &provider));
+    encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, provider);
+  }
+#endif  // !ROCKSDB_LITE
+  env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_
+                                       : (mem_env_ ? mem_env_ : base_env));
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->skip_fsync_ = !env_do_fsync;
+  dbname_ = test::PerThreadDBPath(env_, path);
+  alternative_wal_dir_ = dbname_ + "/wal";
+  alternative_db_log_dir_ = dbname_ + "/db_log_dir";
+  auto options = CurrentOptions();
+  options.env = env_;
+  auto delete_options = options;
+  delete_options.wal_dir = alternative_wal_dir_;
+  EXPECT_OK(DestroyDB(dbname_, delete_options));
+  // Destroy it for not alternative WAL dir is used.
+  EXPECT_OK(DestroyDB(dbname_, options));
+  db_ = nullptr;
+  Reopen(options);
+  Random::GetTLSInstance()->Reset(0xdeadbeef);
+}
+
+DBTestBase::~DBTestBase() {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+  Options options;
+  options.db_paths.emplace_back(dbname_, 0);
+  options.db_paths.emplace_back(dbname_ + "_2", 0);
+  options.db_paths.emplace_back(dbname_ + "_3", 0);
+  options.db_paths.emplace_back(dbname_ + "_4", 0);
+  options.env = env_;
+
+  if (getenv("KEEP_DB")) {
+    printf("DB is still at %s\n", dbname_.c_str());
+  } else {
+    EXPECT_OK(DestroyDB(dbname_, options));
+  }
+  delete env_;
+}
+
+bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) {
+#ifdef ROCKSDB_LITE
+  // These options are not supported in ROCKSDB_LITE
+  if (option_config == kHashSkipList ||
+      option_config == kPlainTableFirstBytePrefix ||
+      option_config == kPlainTableCappedPrefix ||
+      option_config == kPlainTableCappedPrefixNonMmap ||
+      option_config == kPlainTableAllBytesPrefix ||
+      option_config == kVectorRep || option_config == kHashLinkList ||
+      option_config == kUniversalCompaction ||
+      option_config == kUniversalCompactionMultiLevel ||
+      option_config == kUniversalSubcompactions ||
+      option_config == kFIFOCompaction ||
+      option_config == kConcurrentSkipList) {
+    return true;
+  }
+#endif
+
+  if ((skip_mask & kSkipUniversalCompaction) &&
+      (option_config == kUniversalCompaction ||
+       option_config == kUniversalCompactionMultiLevel ||
+       option_config == kUniversalSubcompactions)) {
+    return true;
+  }
+  if ((skip_mask & kSkipMergePut) && option_config == kMergePut) {
+    return true;
+  }
+  if ((skip_mask & kSkipNoSeekToLast) &&
+      (option_config == kHashLinkList || option_config == kHashSkipList)) {
+    return true;
+  }
+  if ((skip_mask & kSkipPlainTable) &&
+      (option_config == kPlainTableAllBytesPrefix ||
+       option_config == kPlainTableFirstBytePrefix ||
+       option_config == kPlainTableCappedPrefix ||
+       option_config == kPlainTableCappedPrefixNonMmap)) {
+    return true;
+  }
+  if ((skip_mask & kSkipHashIndex) &&
+      (option_config == kBlockBasedTableWithPrefixHashIndex ||
+       option_config == kBlockBasedTableWithWholeKeyHashIndex)) {
+    return true;
+  }
+  if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) {
+    return true;
+  }
+  if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) {
+    return true;
+  }
+  return false;
+}
+
+// Switch to a fresh database with the next option configuration to
+// test.  Return false if there are no more configurations to test.
+bool DBTestBase::ChangeOptions(int skip_mask) {
+  for (option_config_++; option_config_ < kEnd; option_config_++) {
+    if (ShouldSkipOptions(option_config_, skip_mask)) {
+      continue;
+    }
+    break;
+  }
+
+  if (option_config_ >= kEnd) {
+    Destroy(last_options_);
+    return false;
+  } else {
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    return true;
+  }
+}
+
+// Switch between different compaction styles.
+bool DBTestBase::ChangeCompactOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kUniversalCompaction;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompaction) {
+    option_config_ = kUniversalCompactionMultiLevel;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompactionMultiLevel) {
+    option_config_ = kLevelSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kLevelSubcompactions) {
+    option_config_ = kUniversalSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    Reopen(options);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Switch between different WAL settings
+bool DBTestBase::ChangeWalOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kDBLogDir;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    Destroy(options);
+    options.create_if_missing = true;
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kDBLogDir) {
+    option_config_ = kWalDirAndMmapReads;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    Destroy(options);
+    options.create_if_missing = true;
+    Reopen(options);
+    return true;
+  } else if (option_config_ == kWalDirAndMmapReads) {
+    option_config_ = kRecycleLogFiles;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    Destroy(options);
+    Reopen(options);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Switch between different filter policy
+// Jump from kDefault to kFilter to kFullFilter
+bool DBTestBase::ChangeFilterOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kFilter;
+  } else if (option_config_ == kFilter) {
+    option_config_ = kFullFilterWithNewTableReaderForCompactions;
+  } else if (option_config_ == kFullFilterWithNewTableReaderForCompactions) {
+    option_config_ = kPartitionedFilterWithNewTableReaderForCompactions;
+  } else {
+    return false;
+  }
+  Destroy(last_options_);
+
+  auto options = CurrentOptions();
+  options.create_if_missing = true;
+  TryReopen(options);
+  return true;
+}
+
+// Switch between different DB options for file ingestion tests.
+bool DBTestBase::ChangeOptionsForFileIngestionTest() {
+  if (option_config_ == kDefault) {
+    option_config_ = kUniversalCompaction;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompaction) {
+    option_config_ = kUniversalCompactionMultiLevel;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompactionMultiLevel) {
+    option_config_ = kLevelSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kLevelSubcompactions) {
+    option_config_ = kUniversalSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalSubcompactions) {
+    option_config_ = kDirectIO;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    TryReopen(options);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Return the current option configuration.
+Options DBTestBase::CurrentOptions(
+    const anon::OptionsOverride& options_override) const {
+  return GetOptions(option_config_, GetDefaultOptions(), options_override);
+}
+
+Options DBTestBase::CurrentOptions(
+    const Options& default_options,
+    const anon::OptionsOverride& options_override) const {
+  return GetOptions(option_config_, default_options, options_override);
+}
+
+Options DBTestBase::GetDefaultOptions() const {
+  Options options;
+  options.write_buffer_size = 4090 * 4096;
+  options.target_file_size_base = 2 * 1024 * 1024;
+  options.max_bytes_for_level_base = 10 * 1024 * 1024;
+  options.max_open_files = 5000;
+  options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+  options.compaction_pri = CompactionPri::kByCompensatedSize;
+  options.env = env_;
+  if (!env_->skip_fsync_) {
+    options.track_and_verify_wals_in_manifest = true;
+  }
+  return options;
+}
+
+Options DBTestBase::GetOptions(
+    int option_config, const Options& default_options,
+    const anon::OptionsOverride& options_override) const {
+  // this redundant copy is to minimize code change w/o having lint error.
+  Options options = default_options;
+  BlockBasedTableOptions table_options;
+  bool set_block_based_table_factory = true;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX)
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "NewRandomAccessFile:O_DIRECT");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "NewWritableFile:O_DIRECT");
+#endif
+  // kMustFreeHeapAllocations -> indicates ASAN build
+  if (kMustFreeHeapAllocations && !options_override.full_block_cache) {
+    // Detecting block cache use-after-free is normally difficult in unit
+    // tests, because as a cache, it tends to keep unreferenced entries in
+    // memory, and we normally want unit tests to take advantage of block
+    // cache for speed. However, we also want a strong chance of detecting
+    // block cache use-after-free in unit tests in ASAN builds, so for ASAN
+    // builds we use a trivially small block cache to which entries can be
+    // added but are immediately freed on no more references.
+    table_options.block_cache = NewLRUCache(/* too small */ 1);
+  }
+
+  bool can_allow_mmap = IsMemoryMappedAccessSupported();
+  switch (option_config) {
+#ifndef ROCKSDB_LITE
+    case kHashSkipList:
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+      options.allow_concurrent_memtable_write = false;
+      options.unordered_write = false;
+      break;
+    case kPlainTableFirstBytePrefix:
+      options.table_factory.reset(NewPlainTableFactory());
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      options.allow_mmap_reads = can_allow_mmap;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kPlainTableCappedPrefix:
+      options.table_factory.reset(NewPlainTableFactory());
+      options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+      options.allow_mmap_reads = can_allow_mmap;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kPlainTableCappedPrefixNonMmap:
+      options.table_factory.reset(NewPlainTableFactory());
+      options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+      options.allow_mmap_reads = false;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kPlainTableAllBytesPrefix:
+      options.table_factory.reset(NewPlainTableFactory());
+      options.prefix_extractor.reset(NewNoopTransform());
+      options.allow_mmap_reads = can_allow_mmap;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kVectorRep:
+      options.memtable_factory.reset(new VectorRepFactory(100));
+      options.allow_concurrent_memtable_write = false;
+      options.unordered_write = false;
+      break;
+    case kHashLinkList:
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      options.memtable_factory.reset(
+          NewHashLinkListRepFactory(4, 0, 3, true, 4));
+      options.allow_concurrent_memtable_write = false;
+      options.unordered_write = false;
+      break;
+    case kDirectIO: {
+      options.use_direct_reads = true;
+      options.use_direct_io_for_flush_and_compaction = true;
+      options.compaction_readahead_size = 2 * 1024 * 1024;
+      SetupSyncPointsToMockDirectIO();
+      break;
+    }
+#endif  // ROCKSDB_LITE
+    case kMergePut:
+      options.merge_operator = MergeOperators::CreatePutOperator();
+      break;
+    case kFilter:
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+      break;
+    case kFullFilterWithNewTableReaderForCompactions:
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+      options.compaction_readahead_size = 10 * 1024 * 1024;
+      break;
+    case kPartitionedFilterWithNewTableReaderForCompactions:
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+      table_options.partition_filters = true;
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      options.compaction_readahead_size = 10 * 1024 * 1024;
+      break;
+    case kUncompressed:
+      options.compression = kNoCompression;
+      break;
+    case kNumLevel_3:
+      options.num_levels = 3;
+      break;
+    case kDBLogDir:
+      options.db_log_dir = alternative_db_log_dir_;
+      break;
+    case kWalDirAndMmapReads:
+      options.wal_dir = alternative_wal_dir_;
+      // mmap reads should be orthogonal to WalDir setting, so we piggyback to
+      // this option config to test mmap reads as well
+      options.allow_mmap_reads = can_allow_mmap;
+      break;
+    case kManifestFileSize:
+      options.max_manifest_file_size = 50;  // 50 bytes
+      break;
+    case kPerfOptions:
+      options.delayed_write_rate = 8 * 1024 * 1024;
+      options.report_bg_io_stats = true;
+      // TODO(3.13) -- test more options
+      break;
+    case kUniversalCompaction:
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+      break;
+    case kUniversalCompactionMultiLevel:
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 8;
+      break;
+    case kCompressedBlockCache:
+      options.allow_mmap_writes = can_allow_mmap;
+      table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+      break;
+    case kInfiniteMaxOpenFiles:
+      options.max_open_files = -1;
+      break;
+    case kCRC32cChecksum: {
+      // Old default was CRC32c, but XXH3 (new default) is faster on common
+      // hardware
+      table_options.checksum = kCRC32c;
+      // Thrown in here for basic coverage:
+      options.DisableExtraChecks();
+      break;
+    }
+    case kFIFOCompaction: {
+      options.compaction_style = kCompactionStyleFIFO;
+      options.max_open_files = -1;
+      break;
+    }
+    case kBlockBasedTableWithPrefixHashIndex: {
+      table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      break;
+    }
+    case kBlockBasedTableWithWholeKeyHashIndex: {
+      table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      options.prefix_extractor.reset(NewNoopTransform());
+      break;
+    }
+    case kBlockBasedTableWithPartitionedIndex: {
+      table_options.format_version = 3;
+      table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      options.prefix_extractor.reset(NewNoopTransform());
+      break;
+    }
+    case kBlockBasedTableWithPartitionedIndexFormat4: {
+      table_options.format_version = 4;
+      // Format 4 changes the binary index format. Since partitioned index is a
+      // super-set of simple indexes, we are also using kTwoLevelIndexSearch to
+      // test this format.
+      table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      // The top-level index in partition filters are also affected by format 4.
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+      table_options.partition_filters = true;
+      table_options.index_block_restart_interval = 8;
+      break;
+    }
+    case kBlockBasedTableWithIndexRestartInterval: {
+      table_options.index_block_restart_interval = 8;
+      break;
+    }
+    case kBlockBasedTableWithLatestFormat: {
+      // In case different from default
+      table_options.format_version = kLatestFormatVersion;
+      break;
+    }
+    case kOptimizeFiltersForHits: {
+      options.optimize_filters_for_hits = true;
+      set_block_based_table_factory = true;
+      break;
+    }
+    case kRowCache: {
+      options.row_cache = NewLRUCache(1024 * 1024);
+      break;
+    }
+    case kRecycleLogFiles: {
+      options.recycle_log_file_num = 2;
+      break;
+    }
+    case kLevelSubcompactions: {
+      options.max_subcompactions = 4;
+      break;
+    }
+    case kUniversalSubcompactions: {
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 8;
+      options.max_subcompactions = 4;
+      break;
+    }
+    case kConcurrentSkipList: {
+      options.allow_concurrent_memtable_write = true;
+      options.enable_write_thread_adaptive_yield = true;
+      break;
+    }
+    case kPipelinedWrite: {
+      options.enable_pipelined_write = true;
+      break;
+    }
+    case kConcurrentWALWrites: {
+      // This options optimize 2PC commit path
+      options.two_write_queues = true;
+      options.manual_wal_flush = true;
+      break;
+    }
+    case kUnorderedWrite: {
+      options.allow_concurrent_memtable_write = false;
+      options.unordered_write = false;
+      break;
+    }
+
+    default:
+      break;
+  }
+
+  if (options_override.filter_policy) {
+    table_options.filter_policy = options_override.filter_policy;
+    table_options.partition_filters = options_override.partition_filters;
+    table_options.metadata_block_size = options_override.metadata_block_size;
+  }
+  if (set_block_based_table_factory) {
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+  options.env = env_;
+  options.create_if_missing = true;
+  options.fail_if_options_file_error = true;
+  return options;
+}
+
+void DBTestBase::CreateColumnFamilies(const std::vector<std::string>& cfs,
+                                      const Options& options) {
+  ColumnFamilyOptions cf_opts(options);
+  size_t cfi = handles_.size();
+  handles_.resize(cfi + cfs.size());
+  for (auto cf : cfs) {
+    Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]);
+    ASSERT_OK(s);
+  }
+}
+
+void DBTestBase::CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                                       const Options& options) {
+  CreateColumnFamilies(cfs, options);
+  std::vector<std::string> cfs_plus_default = cfs;
+  cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+  ReopenWithColumnFamilies(cfs_plus_default, options);
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                          const std::vector<Options>& options) {
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                          const Options& options) {
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::SetTimeElapseOnlySleepOnReopen(DBOptions* options) {
+  time_elapse_only_sleep_on_reopen_ = true;
+
+  // Need to disable stats dumping and persisting which also use
+  // RepeatableThread, which uses InstrumentedCondVar::TimedWaitInternal.
+  // With time_elapse_only_sleep_, this can hang on some platforms (MacOS)
+  // because (a) on some platforms, pthread_cond_timedwait does not appear
+  // to release the lock for other threads to operate if the deadline time
+  // is already passed, and (b) TimedWait calls are currently a bad abstraction
+  // because the deadline parameter is usually computed from Env time,
+  // but is interpreted in real clock time.
+  options->stats_dump_period_sec = 0;
+  options->stats_persist_period_sec = 0;
+}
+
+void DBTestBase::MaybeInstallTimeElapseOnlySleep(const DBOptions& options) {
+  if (time_elapse_only_sleep_on_reopen_) {
+    assert(options.env == env_ ||
+           static_cast_with_check<CompositeEnvWrapper>(options.env)
+                   ->env_target() == env_);
+    assert(options.stats_dump_period_sec == 0);
+    assert(options.stats_persist_period_sec == 0);
+    // We cannot set these before destroying the last DB because they might
+    // cause a deadlock or similar without the appropriate options set in
+    // the DB.
+    env_->time_elapse_only_sleep_ = true;
+    env_->no_slowdown_ = true;
+  } else {
+    // Going back in same test run is not yet supported, so no
+    // reset in this case.
+  }
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+    const std::vector<std::string>& cfs, const std::vector<Options>& options) {
+  Close();
+  EXPECT_EQ(cfs.size(), options.size());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (size_t i = 0; i < cfs.size(); ++i) {
+    column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+  }
+  DBOptions db_opts = DBOptions(options[0]);
+  last_options_ = options[0];
+  MaybeInstallTimeElapseOnlySleep(db_opts);
+  return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+    const std::vector<std::string>& cfs, const Options& options) {
+  Close();
+  std::vector<Options> v_opts(cfs.size(), options);
+  return TryReopenWithColumnFamilies(cfs, v_opts);
+}
+
+void DBTestBase::Reopen(const Options& options) {
+  ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Close() {
+  for (auto h : handles_) {
+    EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
+  }
+  handles_.clear();
+  delete db_;
+  db_ = nullptr;
+}
+
+void DBTestBase::DestroyAndReopen(const Options& options) {
+  // Destroy using last options
+  Destroy(last_options_);
+  Reopen(options);
+}
+
+void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  if (delete_cf_paths) {
+    for (size_t i = 0; i < handles_.size(); ++i) {
+      ColumnFamilyDescriptor cfdescriptor;
+      // GetDescriptor is not implemented for ROCKSDB_LITE
+      handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
+      column_families.push_back(cfdescriptor);
+    }
+  }
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options, column_families));
+}
+
+Status DBTestBase::ReadOnlyReopen(const Options& options) {
+  MaybeInstallTimeElapseOnlySleep(options);
+  return DB::OpenForReadOnly(options, dbname_, &db_);
+}
+
+Status DBTestBase::TryReopen(const Options& options) {
+  Close();
+  last_options_.table_factory.reset();
+  // Note: operator= is an unsafe approach here since it destructs
+  // std::shared_ptr in the same order of their creation, in contrast to
+  // destructors which destructs them in the opposite order of creation. One
+  // particular problem is that the cache destructor might invoke callback
+  // functions that use Option members such as statistics. To work around this
+  // problem, we manually call destructor of table_factory which eventually
+  // clears the block cache.
+  last_options_ = options;
+  MaybeInstallTimeElapseOnlySleep(options);
+  return DB::Open(options, dbname_, &db_);
+}
+
+bool DBTestBase::IsDirectIOSupported() {
+  return test::IsDirectIOSupported(env_, dbname_);
+}
+
+bool DBTestBase::IsMemoryMappedAccessSupported() const {
+  return (!encrypted_env_);
+}
+
+Status DBTestBase::Flush(int cf) {
+  if (cf == 0) {
+    return db_->Flush(FlushOptions());
+  } else {
+    return db_->Flush(FlushOptions(), handles_[cf]);
+  }
+}
+
+Status DBTestBase::Flush(const std::vector<int>& cf_ids) {
+  std::vector<ColumnFamilyHandle*> cfhs;
+  std::for_each(cf_ids.begin(), cf_ids.end(),
+                [&cfhs, this](int id) { cfhs.emplace_back(handles_[id]); });
+  return db_->Flush(FlushOptions(), cfhs);
+}
+
+Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
+  if (kMergePut == option_config_) {
+    return db_->Merge(wo, k, v);
+  } else {
+    return db_->Put(wo, k, v);
+  }
+}
+
+Status DBTestBase::Put(int cf, const Slice& k, const Slice& v,
+                       WriteOptions wo) {
+  if (kMergePut == option_config_) {
+    return db_->Merge(wo, handles_[cf], k, v);
+  } else {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+}
+
+Status DBTestBase::Merge(const Slice& k, const Slice& v, WriteOptions wo) {
+  return db_->Merge(wo, k, v);
+}
+
+Status DBTestBase::Merge(int cf, const Slice& k, const Slice& v,
+                         WriteOptions wo) {
+  return db_->Merge(wo, handles_[cf], k, v);
+}
+
+Status DBTestBase::Delete(const std::string& k) {
+  return db_->Delete(WriteOptions(), k);
+}
+
+Status DBTestBase::Delete(int cf, const std::string& k) {
+  return db_->Delete(WriteOptions(), handles_[cf], k);
+}
+
+Status DBTestBase::SingleDelete(const std::string& k) {
+  return db_->SingleDelete(WriteOptions(), k);
+}
+
+Status DBTestBase::SingleDelete(int cf, const std::string& k) {
+  return db_->SingleDelete(WriteOptions(), handles_[cf], k);
+}
+
+std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  std::string result;
+  Status s = db_->Get(options, k, &result);
+  if (s.IsNotFound()) {
+    result = "NOT_FOUND";
+  } else if (!s.ok()) {
+    result = s.ToString();
+  }
+  return result;
+}
+
+std::string DBTestBase::Get(int cf, const std::string& k,
+                            const Snapshot* snapshot) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  std::string result;
+  Status s = db_->Get(options, handles_[cf], k, &result);
+  if (s.IsNotFound()) {
+    result = "NOT_FOUND";
+  } else if (!s.ok()) {
+    result = s.ToString();
+  }
+  return result;
+}
+
+std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
+                                              const std::vector<std::string>& k,
+                                              const Snapshot* snapshot,
+                                              const bool batched,
+                                              const bool async) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  options.async_io = async;
+  std::vector<ColumnFamilyHandle*> handles;
+  std::vector<Slice> keys;
+  std::vector<std::string> result;
+
+  for (unsigned int i = 0; i < cfs.size(); ++i) {
+    handles.push_back(handles_[cfs[i]]);
+    keys.push_back(k[i]);
+  }
+  std::vector<Status> s;
+  if (!batched) {
+    s = db_->MultiGet(options, handles, keys, &result);
+    for (size_t i = 0; i < s.size(); ++i) {
+      if (s[i].IsNotFound()) {
+        result[i] = "NOT_FOUND";
+      } else if (!s[i].ok()) {
+        result[i] = s[i].ToString();
+      }
+    }
+  } else {
+    std::vector<PinnableSlice> pin_values(cfs.size());
+    result.resize(cfs.size());
+    s.resize(cfs.size());
+    db_->MultiGet(options, cfs.size(), handles.data(), keys.data(),
+                  pin_values.data(), s.data());
+    for (size_t i = 0; i < s.size(); ++i) {
+      if (s[i].IsNotFound()) {
+        result[i] = "NOT_FOUND";
+      } else if (!s[i].ok()) {
+        result[i] = s[i].ToString();
+      } else {
+        result[i].assign(pin_values[i].data(), pin_values[i].size());
+        // Increase likelihood of detecting potential use-after-free bugs with
+        // PinnableSlices tracking the same resource
+        pin_values[i].Reset();
+      }
+    }
+  }
+  return result;
+}
+
+std::vector<std::string> DBTestBase::MultiGet(const std::vector<std::string>& k,
+                                              const Snapshot* snapshot,
+                                              const bool async) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  options.async_io = async;
+  std::vector<Slice> keys;
+  std::vector<std::string> result(k.size());
+  std::vector<Status> statuses(k.size());
+  std::vector<PinnableSlice> pin_values(k.size());
+
+  for (size_t i = 0; i < k.size(); ++i) {
+    keys.push_back(k[i]);
+  }
+  db_->MultiGet(options, dbfull()->DefaultColumnFamily(), keys.size(),
+                keys.data(), pin_values.data(), statuses.data());
+  for (size_t i = 0; i < statuses.size(); ++i) {
+    if (statuses[i].IsNotFound()) {
+      result[i] = "NOT_FOUND";
+    } else if (!statuses[i].ok()) {
+      result[i] = statuses[i].ToString();
+    } else {
+      result[i].assign(pin_values[i].data(), pin_values[i].size());
+      // Increase likelihood of detecting potential use-after-free bugs with
+      // PinnableSlices tracking the same resource
+      pin_values[i].Reset();
+    }
+  }
+  return result;
+}
+
+Status DBTestBase::Get(const std::string& k, PinnableSlice* v) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  Status s = dbfull()->Get(options, dbfull()->DefaultColumnFamily(), k, v);
+  return s;
+}
+
+uint64_t DBTestBase::GetNumSnapshots() {
+  uint64_t int_num;
+  EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num));
+  return int_num;
+}
+
+uint64_t DBTestBase::GetTimeOldestSnapshots() {
+  uint64_t int_num;
+  EXPECT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num));
+  return int_num;
+}
+
+uint64_t DBTestBase::GetSequenceOldestSnapshots() {
+  uint64_t int_num;
+  EXPECT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.oldest-snapshot-sequence", &int_num));
+  return int_num;
+}
+
+// Return a string that contains all key,value pairs in order,
+// formatted like "(k1->v1)(k2->v2)".
+std::string DBTestBase::Contents(int cf) {
+  std::vector<std::string> forward;
+  std::string result;
+  Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
+                             : db_->NewIterator(ReadOptions(), handles_[cf]);
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string s = IterStatus(iter);
+    result.push_back('(');
+    result.append(s);
+    result.push_back(')');
+    forward.push_back(s);
+  }
+
+  // Check reverse iteration results are the reverse of forward results
+  unsigned int matched = 0;
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+    EXPECT_LT(matched, forward.size());
+    EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+    matched++;
+  }
+  EXPECT_EQ(matched, forward.size());
+
+  delete iter;
+  return result;
+}
+
+void DBTestBase::CheckAllEntriesWithFifoReopen(
+    const std::string& expected_value, const Slice& user_key, int cf,
+    const std::vector<std::string>& cfs, const Options& options) {
+  ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+
+  std::vector<std::string> cfs_plus_default = cfs;
+  cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+
+  Options fifo_options(options);
+  fifo_options.compaction_style = kCompactionStyleFIFO;
+  fifo_options.max_open_files = -1;
+  fifo_options.disable_auto_compactions = true;
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs_plus_default, fifo_options));
+  ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs_plus_default, options));
+  ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+}
+
+std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
+  Arena arena;
+  auto options = CurrentOptions();
+  InternalKeyComparator icmp(options.comparator);
+  ReadOptions read_options;
+  ScopedArenaIterator iter;
+  if (cf == 0) {
+    iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+                                           kMaxSequenceNumber));
+  } else {
+    iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+                                           kMaxSequenceNumber, handles_[cf]));
+  }
+  InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+  iter->Seek(target.Encode());
+  std::string result;
+  if (!iter->status().ok()) {
+    result = iter->status().ToString();
+  } else {
+    result = "[ ";
+    bool first = true;
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) !=
+          Status::OK()) {
+        result += "CORRUPTED";
+      } else {
+        if (!last_options_.comparator->Equal(ikey.user_key, user_key)) {
+          break;
+        }
+        if (!first) {
+          result += ", ";
+        }
+        first = false;
+        switch (ikey.type) {
+          case kTypeValue:
+            result += iter->value().ToString();
+            break;
+          case kTypeMerge:
+            // keep it the same as kTypeValue for testing kMergePut
+            result += iter->value().ToString();
+            break;
+          case kTypeDeletion:
+            result += "DEL";
+            break;
+          case kTypeSingleDeletion:
+            result += "SDEL";
+            break;
+          default:
+            assert(false);
+            break;
+        }
+      }
+      iter->Next();
+    }
+    if (!first) {
+      result += " ";
+    }
+    result += "]";
+  }
+  return result;
+}
+
+#ifndef ROCKSDB_LITE
+int DBTestBase::NumSortedRuns(int cf) {
+  ColumnFamilyMetaData cf_meta;
+  if (cf == 0) {
+    db_->GetColumnFamilyMetaData(&cf_meta);
+  } else {
+    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  }
+  int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
+  for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
+    if (cf_meta.levels[i].files.size() > 0) {
+      num_sr++;
+    }
+  }
+  return num_sr;
+}
+
+uint64_t DBTestBase::TotalSize(int cf) {
+  ColumnFamilyMetaData cf_meta;
+  if (cf == 0) {
+    db_->GetColumnFamilyMetaData(&cf_meta);
+  } else {
+    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  }
+  return cf_meta.size;
+}
+
+uint64_t DBTestBase::SizeAtLevel(int level) {
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  uint64_t sum = 0;
+  for (const auto& m : metadata) {
+    if (m.level == level) {
+      sum += m.size;
+    }
+  }
+  return sum;
+}
+
+size_t DBTestBase::TotalLiveFiles(int cf) {
+  ColumnFamilyMetaData cf_meta;
+  if (cf == 0) {
+    db_->GetColumnFamilyMetaData(&cf_meta);
+  } else {
+    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  }
+  size_t num_files = 0;
+  for (auto& level : cf_meta.levels) {
+    num_files += level.files.size();
+  }
+  return num_files;
+}
+
+size_t DBTestBase::CountLiveFiles() {
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  return metadata.size();
+}
+
+int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
+  std::string property;
+  if (cf == 0) {
+    // default cfd
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(level), &property));
+  } else {
+    EXPECT_TRUE(db_->GetProperty(
+        handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
+        &property));
+  }
+  return atoi(property.c_str());
+}
+
+double DBTestBase::CompressionRatioAtLevel(int level, int cf) {
+  std::string property;
+  if (cf == 0) {
+    // default cfd
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.compression-ratio-at-level" + std::to_string(level),
+        &property));
+  } else {
+    EXPECT_TRUE(db_->GetProperty(
+        handles_[cf],
+        "rocksdb.compression-ratio-at-level" + std::to_string(level),
+        &property));
+  }
+  return std::stod(property);
+}
+
+int DBTestBase::TotalTableFiles(int cf, int levels) {
+  if (levels == -1) {
+    levels = (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+  }
+  int result = 0;
+  for (int level = 0; level < levels; level++) {
+    result += NumTableFilesAtLevel(level, cf);
+  }
+  return result;
+}
+
+// Return spread of files per level
+std::string DBTestBase::FilesPerLevel(int cf) {
+  int num_levels =
+      (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+  std::string result;
+  size_t last_non_zero_offset = 0;
+  for (int level = 0; level < num_levels; level++) {
+    int f = NumTableFilesAtLevel(level, cf);
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+    result += buf;
+    if (f > 0) {
+      last_non_zero_offset = result.size();
+    }
+  }
+  result.resize(last_non_zero_offset);
+  return result;
+}
+
+#endif  // !ROCKSDB_LITE
+
+std::vector<uint64_t> DBTestBase::GetBlobFileNumbers() {
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  Version* const current = cfd->current();
+  assert(current);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  assert(storage_info);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+
+  std::vector<uint64_t> result;
+  result.reserve(blob_files.size());
+
+  for (const auto& blob_file : blob_files) {
+    assert(blob_file);
+    result.emplace_back(blob_file->GetBlobFileNumber());
+  }
+
+  return result;
+}
+
+size_t DBTestBase::CountFiles() {
+  size_t count = 0;
+  std::vector<std::string> files;
+  if (env_->GetChildren(dbname_, &files).ok()) {
+    count += files.size();
+  }
+
+  if (dbname_ != last_options_.wal_dir) {
+    if (env_->GetChildren(last_options_.wal_dir, &files).ok()) {
+      count += files.size();
+    }
+  }
+
+  return count;
+};
+
+Status DBTestBase::CountFiles(size_t* count) {
+  std::vector<std::string> files;
+  Status s = env_->GetChildren(dbname_, &files);
+  if (!s.ok()) {
+    return s;
+  }
+  size_t files_count = files.size();
+
+  if (dbname_ != last_options_.wal_dir) {
+    s = env_->GetChildren(last_options_.wal_dir, &files);
+    if (!s.ok()) {
+      return s;
+    }
+    *count = files_count + files.size();
+  }
+
+  return Status::OK();
+}
+
+Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf,
+                        uint64_t* size) {
+  Range r(start, limit);
+  if (cf == 0) {
+    return db_->GetApproximateSizes(&r, 1, size);
+  } else {
+    return db_->GetApproximateSizes(handles_[1], &r, 1, size);
+  }
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit,
+                         uint32_t target_path_id) {
+  CompactRangeOptions compact_options;
+  compact_options.target_path_id = target_path_id;
+  ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit) {
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(const Slice& start, const Slice& limit) {
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+}
+
+// Do n memtable compactions, each of which produces an sstable
+// covering the range [small,large].
+void DBTestBase::MakeTables(int n, const std::string& small,
+                            const std::string& large, int cf) {
+  for (int i = 0; i < n; i++) {
+    ASSERT_OK(Put(cf, small, "begin"));
+    ASSERT_OK(Put(cf, large, "end"));
+    ASSERT_OK(Flush(cf));
+    MoveFilesToLevel(n - i - 1, cf);
+  }
+}
+
+// Prevent pushing of new sstables into deeper levels by adding
+// tables that cover a specified range to all levels.
+void DBTestBase::FillLevels(const std::string& smallest,
+                            const std::string& largest, int cf) {
+  MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
+}
+
+void DBTestBase::MoveFilesToLevel(int level, int cf) {
+  for (int l = 0; l < level; ++l) {
+    if (cf > 0) {
+      EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]));
+    } else {
+      EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr));
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE
+void DBTestBase::DumpFileCounts(const char* label) {
+  fprintf(stderr, "---\n%s:\n", label);
+  fprintf(stderr, "maxoverlap: %" PRIu64 "\n",
+          dbfull()->TEST_MaxNextLevelOverlappingBytes());
+  for (int level = 0; level < db_->NumberLevels(); level++) {
+    int num = NumTableFilesAtLevel(level);
+    if (num > 0) {
+      fprintf(stderr, "  level %3d : %d files\n", level, num);
+    }
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+std::string DBTestBase::DumpSSTableList() {
+  std::string property;
+  db_->GetProperty("rocksdb.sstables", &property);
+  return property;
+}
+
+void DBTestBase::GetSstFiles(Env* env, std::string path,
+                             std::vector<std::string>* files) {
+  EXPECT_OK(env->GetChildren(path, files));
+
+  files->erase(std::remove_if(files->begin(), files->end(),
+                              [](std::string name) {
+                                uint64_t number;
+                                FileType type;
+                                return !(ParseFileName(name, &number, &type) &&
+                                         type == kTableFile);
+                              }),
+               files->end());
+}
+
+int DBTestBase::GetSstFileCount(std::string path) {
+  std::vector<std::string> files;
+  DBTestBase::GetSstFiles(env_, path, &files);
+  return static_cast<int>(files.size());
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx,
+                                 bool nowait) {
+  for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+    ASSERT_OK(Put(cf, Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990)));
+    (*key_idx)++;
+  }
+  if (!nowait) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) {
+  for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+    ASSERT_OK(Put(Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990)));
+    (*key_idx)++;
+  }
+  if (!nowait) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+}
+
+const int DBTestBase::kNumKeysByGenerateNewRandomFile = 51;
+
+void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) {
+  for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) {
+    ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(2000)));
+  }
+  ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(200)));
+  if (!nowait) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+}
+
+std::string DBTestBase::IterStatus(Iterator* iter) {
+  std::string result;
+  if (iter->Valid()) {
+    result = iter->key().ToString() + "->" + iter->value().ToString();
+  } else {
+    result = "(invalid)";
+  }
+  return result;
+}
+
+Options DBTestBase::OptionsForLogIterTest() {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.WAL_ttl_seconds = 1000;
+  return options;
+}
+
+std::string DBTestBase::DummyString(size_t len, char c) {
+  return std::string(len, c);
+}
+
+void DBTestBase::VerifyIterLast(std::string expected_key, int cf) {
+  Iterator* iter;
+  ReadOptions ro;
+  if (cf == 0) {
+    iter = db_->NewIterator(ro);
+  } else {
+    iter = db_->NewIterator(ro, handles_[cf]);
+  }
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), expected_key);
+  delete iter;
+}
+
+// Used to test InplaceUpdate
+
+// If previous value is nullptr or delta is > than previous value,
+//   sets newValue with delta
+// If previous value is not empty,
+//   updates previous value with 'b' string of previous value size - 1.
+UpdateStatus DBTestBase::updateInPlaceSmallerSize(char* prevValue,
+                                                  uint32_t* prevSize,
+                                                  Slice delta,
+                                                  std::string* newValue) {
+  if (prevValue == nullptr) {
+    *newValue = std::string(delta.size(), 'c');
+    return UpdateStatus::UPDATED;
+  } else {
+    *prevSize = *prevSize - 1;
+    std::string str_b = std::string(*prevSize, 'b');
+    memcpy(prevValue, str_b.c_str(), str_b.size());
+    return UpdateStatus::UPDATED_INPLACE;
+  }
+}
+
+UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue,
+                                                        uint32_t* prevSize,
+                                                        Slice delta,
+                                                        std::string* newValue) {
+  if (prevValue == nullptr) {
+    *newValue = std::string(delta.size(), 'c');
+    return UpdateStatus::UPDATED;
+  } else {
+    *prevSize = 1;
+    std::string str_b = std::string(*prevSize, 'b');
+    memcpy(prevValue, str_b.c_str(), str_b.size());
+    return UpdateStatus::UPDATED_INPLACE;
+  }
+}
+
+UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/,
+                                                 uint32_t* /*prevSize*/,
+                                                 Slice delta,
+                                                 std::string* newValue) {
+  *newValue = std::string(delta.size(), 'c');
+  return UpdateStatus::UPDATED;
+}
+
+UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/,
+                                               uint32_t* /*prevSize*/,
+                                               Slice /*delta*/,
+                                               std::string* /*newValue*/) {
+  return UpdateStatus::UPDATE_FAILED;
+}
+
+// Utility method to test InplaceUpdate
+void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
+  Arena arena;
+  auto options = CurrentOptions();
+  InternalKeyComparator icmp(options.comparator);
+  ReadOptions read_options;
+  ScopedArenaIterator iter;
+  if (cf != 0) {
+    iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+                                           kMaxSequenceNumber, handles_[cf]));
+  } else {
+    iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+                                           kMaxSequenceNumber));
+  }
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  int seq = numValues;
+  while (iter->Valid()) {
+    ParsedInternalKey ikey;
+    ikey.clear();
+    ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+
+    // checks sequence number for updates
+    ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+    iter->Next();
+  }
+  ASSERT_EQ(0, seq);
+}
+
+void DBTestBase::CopyFile(const std::string& source,
+                          const std::string& destination, uint64_t size) {
+  const EnvOptions soptions;
+  std::unique_ptr<SequentialFile> srcfile;
+  ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+  std::unique_ptr<WritableFile> destfile;
+  ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+  if (size == 0) {
+    // default argument means copy everything
+    ASSERT_OK(env_->GetFileSize(source, &size));
+  }
+
+  char buffer[4096];
+  Slice slice;
+  while (size > 0) {
+    uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+    ASSERT_OK(srcfile->Read(one, &slice, buffer));
+    ASSERT_OK(destfile->Append(slice));
+    size -= slice.size();
+  }
+  ASSERT_OK(destfile->Close());
+}
+
+Status DBTestBase::GetAllDataFiles(
+    const FileType file_type, std::unordered_map<std::string, uint64_t>* files,
+    uint64_t* total_size /* = nullptr */) {
+  if (total_size) {
+    *total_size = 0;
+  }
+  std::vector<std::string> children;
+  Status s = env_->GetChildren(dbname_, &children);
+  if (s.ok()) {
+    for (auto& file_name : children) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file_name, &number, &type) && type == file_type) {
+        std::string file_path = dbname_ + "/" + file_name;
+        uint64_t file_size = 0;
+        s = env_->GetFileSize(file_path, &file_size);
+        if (!s.ok()) {
+          break;
+        }
+        (*files)[file_path] = file_size;
+        if (total_size) {
+          *total_size += file_size;
+        }
+      }
+    }
+  }
+  return s;
+}
+
+std::vector<std::uint64_t> DBTestBase::ListTableFiles(Env* env,
+                                                      const std::string& path) {
+  std::vector<std::string> files;
+  std::vector<uint64_t> file_numbers;
+  EXPECT_OK(env->GetChildren(path, &files));
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < files.size(); ++i) {
+    if (ParseFileName(files[i], &number, &type)) {
+      if (type == kTableFile) {
+        file_numbers.push_back(number);
+      }
+    }
+  }
+  return file_numbers;
+}
+
+void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
+                                 size_t* total_reads_res, bool tailing_iter,
+                                 std::map<std::string, Status> status) {
+  size_t total_reads = 0;
+
+  for (auto& kv : true_data) {
+    Status s = status[kv.first];
+    if (s.ok()) {
+      ASSERT_EQ(Get(kv.first), kv.second);
+    } else {
+      std::string value;
+      ASSERT_EQ(s, db_->Get(ReadOptions(), kv.first, &value));
+    }
+    total_reads++;
+  }
+
+  // Normal Iterator
+  {
+    int iter_cnt = 0;
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    Iterator* iter = db_->NewIterator(ro);
+    // Verify Iterator::Next()
+    iter_cnt = 0;
+    auto data_iter = true_data.begin();
+    Status s;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      Status current_status = status[data_iter->first];
+      if (!current_status.ok()) {
+        s = current_status;
+      }
+      ASSERT_EQ(iter->status(), s);
+      if (current_status.ok()) {
+        ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      }
+      iter_cnt++;
+      total_reads++;
+    }
+    ASSERT_EQ(data_iter, true_data.end())
+        << iter_cnt << " / " << true_data.size();
+    delete iter;
+
+    // Verify Iterator::Prev()
+    // Use a new iterator to make sure its status is clean.
+    iter = db_->NewIterator(ro);
+    iter_cnt = 0;
+    s = Status::OK();
+    auto data_rev = true_data.rbegin();
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) {
+      ASSERT_EQ(iter->key().ToString(), data_rev->first);
+      Status current_status = status[data_rev->first];
+      if (!current_status.ok()) {
+        s = current_status;
+      }
+      ASSERT_EQ(iter->status(), s);
+      if (current_status.ok()) {
+        ASSERT_EQ(iter->value().ToString(), data_rev->second);
+      }
+      iter_cnt++;
+      total_reads++;
+    }
+    ASSERT_EQ(data_rev, true_data.rend())
+        << iter_cnt << " / " << true_data.size();
+
+    // Verify Iterator::Seek()
+    for (auto kv : true_data) {
+      iter->Seek(kv.first);
+      ASSERT_EQ(kv.first, iter->key().ToString());
+      ASSERT_EQ(kv.second, iter->value().ToString());
+      total_reads++;
+    }
+    delete iter;
+  }
+
+  if (tailing_iter) {
+#ifndef ROCKSDB_LITE
+    // Tailing iterator
+    int iter_cnt = 0;
+    ReadOptions ro;
+    ro.tailing = true;
+    ro.total_order_seek = true;
+    Iterator* iter = db_->NewIterator(ro);
+
+    // Verify ForwardIterator::Next()
+    iter_cnt = 0;
+    auto data_iter = true_data.begin();
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+      ASSERT_EQ(iter->key().ToString(), data_iter->first);
+      ASSERT_EQ(iter->value().ToString(), data_iter->second);
+      iter_cnt++;
+      total_reads++;
+    }
+    ASSERT_EQ(data_iter, true_data.end())
+        << iter_cnt << " / " << true_data.size();
+
+    // Verify ForwardIterator::Seek()
+    for (auto kv : true_data) {
+      iter->Seek(kv.first);
+      ASSERT_EQ(kv.first, iter->key().ToString());
+      ASSERT_EQ(kv.second, iter->value().ToString());
+      total_reads++;
+    }
+
+    delete iter;
+#endif  // ROCKSDB_LITE
+  }
+
+  if (total_reads_res) {
+    *total_reads_res = total_reads;
+  }
+}
+
+void DBTestBase::VerifyDBInternal(
+    std::vector<std::pair<std::string, std::string>> true_data) {
+  Arena arena;
+  InternalKeyComparator icmp(last_options_.comparator);
+  ReadOptions read_options;
+  auto iter =
+      dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber);
+  iter->SeekToFirst();
+  for (auto p : true_data) {
+    ASSERT_TRUE(iter->Valid());
+    ParsedInternalKey ikey;
+    ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+    ASSERT_EQ(p.first, ikey.user_key);
+    ASSERT_EQ(p.second, iter->value());
+    iter->Next();
+  };
+  ASSERT_FALSE(iter->Valid());
+  iter->~InternalIterator();
+}
+
+#ifndef ROCKSDB_LITE
+
+uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
+    DB* db, std::string column_family_name) {
+  std::vector<LiveFileMetaData> metadata;
+  db->GetLiveFilesMetaData(&metadata);
+  uint64_t result = 0;
+  for (auto& fileMetadata : metadata) {
+    result += (fileMetadata.column_family_name == column_family_name);
+  }
+  return result;
+}
+
+uint64_t DBTestBase::GetSstSizeHelper(Temperature temperature) {
+  std::string prop;
+  EXPECT_TRUE(dbfull()->GetProperty(
+      DB::Properties::kLiveSstFilesSizeAtTemperature +
+          std::to_string(static_cast<uint8_t>(temperature)),
+      &prop));
+  return static_cast<uint64_t>(std::atoi(prop.c_str()));
+}
+#endif  // ROCKSDB_LITE
+
+void VerifySstUniqueIds(const TablePropertiesCollection& props) {
+  ASSERT_FALSE(props.empty());  // suspicious test if empty
+  std::unordered_set<std::string> seen;
+  for (auto& pair : props) {
+    std::string id;
+    ASSERT_OK(GetUniqueIdFromTableProperties(*pair.second, &id));
+    ASSERT_TRUE(seen.insert(id).second);
+  }
+}
+
+template <CacheEntryRole R>
+TargetCacheChargeTrackingCache<R>::TargetCacheChargeTrackingCache(
+    std::shared_ptr<Cache> target)
+    : CacheWrapper(std::move(target)),
+      cur_cache_charge_(0),
+      cache_charge_peak_(0),
+      cache_charge_increment_(0),
+      last_peak_tracked_(false),
+      cache_charge_increments_sum_(0) {}
+
+template <CacheEntryRole R>
+Status TargetCacheChargeTrackingCache<R>::Insert(
+    const Slice& key, void* value, size_t charge,
+    void (*deleter)(const Slice& key, void* value), Handle** handle,
+    Priority priority) {
+  Status s = target_->Insert(key, value, charge, deleter, handle, priority);
+  if (deleter == kNoopDeleter) {
+    if (last_peak_tracked_) {
+      cache_charge_peak_ = 0;
+      cache_charge_increment_ = 0;
+      last_peak_tracked_ = false;
+    }
+    if (s.ok()) {
+      cur_cache_charge_ += charge;
+    }
+    cache_charge_peak_ = std::max(cache_charge_peak_, cur_cache_charge_);
+    cache_charge_increment_ += charge;
+  }
+
+  return s;
+}
+
+template <CacheEntryRole R>
+bool TargetCacheChargeTrackingCache<R>::Release(Handle* handle,
+                                                bool erase_if_last_ref) {
+  auto deleter = GetDeleter(handle);
+  if (deleter == kNoopDeleter) {
+    if (!last_peak_tracked_) {
+      cache_charge_peaks_.push_back(cache_charge_peak_);
+      cache_charge_increments_sum_ += cache_charge_increment_;
+      last_peak_tracked_ = true;
+    }
+    cur_cache_charge_ -= GetCharge(handle);
+  }
+  bool is_successful = target_->Release(handle, erase_if_last_ref);
+  return is_successful;
+}
+
+template <CacheEntryRole R>
+const Cache::DeleterFn TargetCacheChargeTrackingCache<R>::kNoopDeleter =
+    CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole();
+
+template class TargetCacheChargeTrackingCache<
+    CacheEntryRole::kFilterConstruction>;
+template class TargetCacheChargeTrackingCache<
+    CacheEntryRole::kBlockBasedTableReader>;
+template class TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_test_util.h b/src/rocksdb/db/db_test_util.h
new file mode 100644
index 000000000..29d5cd9d7
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.h
@@ -0,0 +1,1402 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <fcntl.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "file/filename.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "table/mock_table.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+class MockEnv;
+
+namespace anon {
+class AtomicCounter {
+ public:
+  explicit AtomicCounter(Env* env = NULL)
+      : env_(env), cond_count_(&mu_), count_(0) {}
+
+  void Increment() {
+    MutexLock l(&mu_);
+    count_++;
+    cond_count_.SignalAll();
+  }
+
+  int Read() {
+    MutexLock l(&mu_);
+    return count_;
+  }
+
+  bool WaitFor(int count) {
+    MutexLock l(&mu_);
+
+    uint64_t start = env_->NowMicros();
+    while (count_ < count) {
+      uint64_t now = env_->NowMicros();
+      cond_count_.TimedWait(now + /*1s*/ 1 * 1000 * 1000);
+      if (env_->NowMicros() - start > /*10s*/ 10 * 1000 * 1000) {
+        return false;
+      }
+      if (count_ < count) {
+        GTEST_LOG_(WARNING) << "WaitFor is taking more time than usual";
+      }
+    }
+
+    return true;
+  }
+
+  void Reset() {
+    MutexLock l(&mu_);
+    count_ = 0;
+    cond_count_.SignalAll();
+  }
+
+ private:
+  Env* env_;
+  port::Mutex mu_;
+  port::CondVar cond_count_;
+  int count_;
+};
+
+struct OptionsOverride {
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+  // These will be used only if filter_policy is set
+  bool partition_filters = false;
+  // Force using a default block cache. (Setting to false allows ASAN build
+  // use a trivially small block cache for better UAF error detection.)
+  bool full_block_cache = false;
+  uint64_t metadata_block_size = 1024;
+
+  // Used as a bit mask of individual enums in which to skip an XF test point
+  int skip_policy = 0;
+};
+
+}  // namespace anon
+
+enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 };
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+  explicit SpecialEnv(Env* base, bool time_elapse_only_sleep = false);
+
+  static const char* kClassName() { return "SpecialEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& soptions) override {
+    class SSTableFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      std::unique_ptr<WritableFile> base_;
+
+     public:
+      SSTableFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& base)
+          : env_(env), base_(std::move(base)) {}
+      Status Append(const Slice& data) override {
+        if (env_->table_write_callback_) {
+          (*env_->table_write_callback_)();
+        }
+        if (env_->drop_writes_.load(std::memory_order_acquire)) {
+          // Drop writes on the floor
+          return Status::OK();
+        } else if (env_->no_space_.load(std::memory_order_acquire)) {
+          return Status::NoSpace("No space left on device");
+        } else {
+          env_->bytes_written_ += data.size();
+          return base_->Append(data);
+        }
+      }
+      Status Append(
+          const Slice& data,
+          const DataVerificationInfo& /* verification_info */) override {
+        return Append(data);
+      }
+      Status PositionedAppend(const Slice& data, uint64_t offset) override {
+        if (env_->table_write_callback_) {
+          (*env_->table_write_callback_)();
+        }
+        if (env_->drop_writes_.load(std::memory_order_acquire)) {
+          // Drop writes on the floor
+          return Status::OK();
+        } else if (env_->no_space_.load(std::memory_order_acquire)) {
+          return Status::NoSpace("No space left on device");
+        } else {
+          env_->bytes_written_ += data.size();
+          return base_->PositionedAppend(data, offset);
+        }
+      }
+      Status PositionedAppend(
+          const Slice& data, uint64_t offset,
+          const DataVerificationInfo& /* verification_info */) override {
+        return PositionedAppend(data, offset);
+      }
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+        Status s = base_->RangeSync(offset, nbytes);
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::RangeSync", &s);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+        return s;
+      }
+      Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        // Check preallocation size
+        // preallocation size is never passed to base file.
+        size_t preallocation_size = preallocation_block_size();
+        TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus",
+                                 &preallocation_size);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+        Status s = base_->Close();
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Close", &s);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+        return s;
+      }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) {
+          env_->SleepForMicroseconds(100000);
+        }
+        Status s;
+        if (!env_->skip_fsync_) {
+          s = base_->Sync();
+        }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+        return s;
+      }
+      void SetIOPriority(Env::IOPriority pri) override {
+        base_->SetIOPriority(pri);
+      }
+      Env::IOPriority GetIOPriority() override {
+        return base_->GetIOPriority();
+      }
+      bool use_direct_io() const override { return base_->use_direct_io(); }
+      Status Allocate(uint64_t offset, uint64_t len) override {
+        return base_->Allocate(offset, len);
+      }
+      size_t GetUniqueId(char* id, size_t max_size) const override {
+        return base_->GetUniqueId(id, max_size);
+      }
+    };
+    class ManifestFile : public WritableFile {
+     public:
+      ManifestFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) {}
+      Status Append(const Slice& data) override {
+        if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
+          return Status::IOError("simulated writer error");
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Append(
+          const Slice& data,
+          const DataVerificationInfo& /*verification_info*/) override {
+        return Append(data);
+      }
+
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      Status Close() override { return base_->Close(); }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
+          return Status::IOError("simulated sync error");
+        } else {
+          if (env_->skip_fsync_) {
+            return Status::OK();
+          } else {
+            return base_->Sync();
+          }
+        }
+      }
+      uint64_t GetFileSize() override { return base_->GetFileSize(); }
+      Status Allocate(uint64_t offset, uint64_t len) override {
+        return base_->Allocate(offset, len);
+      }
+
+     private:
+      SpecialEnv* env_;
+      std::unique_ptr<WritableFile> base_;
+    };
+    class WalFile : public WritableFile {
+     public:
+      WalFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) {
+        env_->num_open_wal_file_.fetch_add(1);
+      }
+      virtual ~WalFile() { env_->num_open_wal_file_.fetch_add(-1); }
+      Status Append(const Slice& data) override {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1");
+#endif
+        Status s;
+        if (env_->log_write_error_.load(std::memory_order_acquire)) {
+          s = Status::IOError("simulated writer error");
+        } else {
+          int slowdown =
+              env_->log_write_slowdown_.load(std::memory_order_acquire);
+          if (slowdown > 0) {
+            env_->SleepForMicroseconds(slowdown);
+          }
+          s = base_->Append(data);
+        }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT("SpecialEnv::WalFile::Append:2");
+#endif
+        return s;
+      }
+      Status Append(
+          const Slice& data,
+          const DataVerificationInfo& /* verification_info */) override {
+        return Append(data);
+      }
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      void PrepareWrite(size_t offset, size_t len) override {
+        base_->PrepareWrite(offset, len);
+      }
+      void SetPreallocationBlockSize(size_t size) override {
+        base_->SetPreallocationBlockSize(size);
+      }
+      Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        // Check preallocation size
+        size_t block_size, last_allocated_block;
+        base_->GetPreallocationStatus(&block_size, &last_allocated_block);
+        TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus",
+                                 &block_size);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+
+        return base_->Close();
+      }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        if (env_->corrupt_in_sync_) {
+          EXPECT_OK(Append(std::string(33000, ' ')));
+          return Status::IOError("Ingested Sync Failure");
+        }
+        if (env_->skip_fsync_) {
+          return Status::OK();
+        } else {
+          return base_->Sync();
+        }
+      }
+      bool IsSyncThreadSafe() const override {
+        return env_->is_wal_sync_thread_safe_.load();
+      }
+      Status Allocate(uint64_t offset, uint64_t len) override {
+        return base_->Allocate(offset, len);
+      }
+
+     private:
+      SpecialEnv* env_;
+      std::unique_ptr<WritableFile> base_;
+    };
+    class OtherFile : public WritableFile {
+     public:
+      OtherFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) {}
+      Status Append(const Slice& data) override { return base_->Append(data); }
+      Status Append(
+          const Slice& data,
+          const DataVerificationInfo& /*verification_info*/) override {
+        return Append(data);
+      }
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      Status Close() override { return base_->Close(); }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        if (env_->skip_fsync_) {
+          return Status::OK();
+        } else {
+          return base_->Sync();
+        }
+      }
+      uint64_t GetFileSize() override { return base_->GetFileSize(); }
+      Status Allocate(uint64_t offset, uint64_t len) override {
+        return base_->Allocate(offset, len);
+      }
+
+     private:
+      SpecialEnv* env_;
+      std::unique_ptr<WritableFile> base_;
+    };
+
+    if (no_file_overwrite_.load(std::memory_order_acquire) &&
+        target()->FileExists(f).ok()) {
+      return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true.");
+    }
+
+    if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
+      uint32_t random_number;
+      {
+        MutexLock l(&rnd_mutex_);
+        random_number = rnd_.Uniform(100);
+      }
+      if (random_number < non_writeable_rate_.load()) {
+        return Status::IOError("simulated random write error");
+      }
+    }
+
+    new_writable_count_++;
+
+    if (non_writable_count_.load() > 0) {
+      non_writable_count_--;
+      return Status::IOError("simulated write error");
+    }
+
+    EnvOptions optimized = soptions;
+    if (strstr(f.c_str(), "MANIFEST") != nullptr ||
+        strstr(f.c_str(), "log") != nullptr) {
+      optimized.use_mmap_writes = false;
+      optimized.use_direct_writes = false;
+    }
+
+    Status s = target()->NewWritableFile(f, r, optimized);
+    if (s.ok()) {
+      if (strstr(f.c_str(), ".sst") != nullptr) {
+        r->reset(new SSTableFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+        r->reset(new ManifestFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "log") != nullptr) {
+        r->reset(new WalFile(this, std::move(*r)));
+      } else {
+        r->reset(new OtherFile(this, std::move(*r)));
+      }
+    }
+    return s;
+  }
+
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& soptions) override {
+    class CountingFile : public RandomAccessFile {
+     public:
+      CountingFile(std::unique_ptr<RandomAccessFile>&& target,
+                   anon::AtomicCounter* counter,
+                   std::atomic<size_t>* bytes_read)
+          : target_(std::move(target)),
+            counter_(counter),
+            bytes_read_(bytes_read) {}
+      virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                          char* scratch) const override {
+        counter_->Increment();
+        Status s = target_->Read(offset, n, result, scratch);
+        *bytes_read_ += result->size();
+        return s;
+      }
+
+      virtual Status Prefetch(uint64_t offset, size_t n) override {
+        Status s = target_->Prefetch(offset, n);
+        *bytes_read_ += n;
+        return s;
+      }
+
+     private:
+      std::unique_ptr<RandomAccessFile> target_;
+      anon::AtomicCounter* counter_;
+      std::atomic<size_t>* bytes_read_;
+    };
+
+    class RandomFailureFile : public RandomAccessFile {
+     public:
+      RandomFailureFile(std::unique_ptr<RandomAccessFile>&& target,
+                        std::atomic<uint64_t>* failure_cnt, uint32_t fail_odd)
+          : target_(std::move(target)),
+            fail_cnt_(failure_cnt),
+            fail_odd_(fail_odd) {}
+      virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                          char* scratch) const override {
+        if (Random::GetTLSInstance()->OneIn(fail_odd_)) {
+          fail_cnt_->fetch_add(1);
+          return Status::IOError("random error");
+        }
+        return target_->Read(offset, n, result, scratch);
+      }
+
+      virtual Status Prefetch(uint64_t offset, size_t n) override {
+        return target_->Prefetch(offset, n);
+      }
+
+     private:
+      std::unique_ptr<RandomAccessFile> target_;
+      std::atomic<uint64_t>* fail_cnt_;
+      uint32_t fail_odd_;
+    };
+
+    Status s = target()->NewRandomAccessFile(f, r, soptions);
+    random_file_open_counter_++;
+    if (s.ok()) {
+      if (count_random_reads_) {
+        r->reset(new CountingFile(std::move(*r), &random_read_counter_,
+                                  &random_read_bytes_counter_));
+      } else if (rand_reads_fail_odd_ > 0) {
+        r->reset(new RandomFailureFile(std::move(*r), &num_reads_fails_,
+                                       rand_reads_fail_odd_));
+      }
+    }
+
+    if (s.ok() && soptions.compaction_readahead_size > 0) {
+      compaction_readahead_size_ = soptions.compaction_readahead_size;
+    }
+    return s;
+  }
+
+  virtual Status NewSequentialFile(const std::string& f,
+                                   std::unique_ptr<SequentialFile>* r,
+                                   const EnvOptions& soptions) override {
+    class CountingFile : public SequentialFile {
+     public:
+      CountingFile(std::unique_ptr<SequentialFile>&& target,
+                   anon::AtomicCounter* counter)
+          : target_(std::move(target)), counter_(counter) {}
+      virtual Status Read(size_t n, Slice* result, char* scratch) override {
+        counter_->Increment();
+        return target_->Read(n, result, scratch);
+      }
+      virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
+
+     private:
+      std::unique_ptr<SequentialFile> target_;
+      anon::AtomicCounter* counter_;
+    };
+
+    Status s = target()->NewSequentialFile(f, r, soptions);
+    if (s.ok() && count_sequential_reads_) {
+      r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
+    }
+    return s;
+  }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    sleep_counter_.Increment();
+    if (no_slowdown_ || time_elapse_only_sleep_) {
+      addon_microseconds_.fetch_add(micros);
+    }
+    if (!no_slowdown_) {
+      target()->SleepForMicroseconds(micros);
+    }
+  }
+
+  void MockSleepForMicroseconds(int64_t micros) {
+    sleep_counter_.Increment();
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(micros);
+  }
+
+  void MockSleepForSeconds(int64_t seconds) {
+    sleep_counter_.Increment();
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(seconds * 1000000);
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) override {
+    Status s;
+    if (time_elapse_only_sleep_) {
+      *unix_time = maybe_starting_time_;
+    } else {
+      s = target()->GetCurrentTime(unix_time);
+    }
+    if (s.ok()) {
+      // mock microseconds elapsed to seconds of time
+      *unix_time += addon_microseconds_.load() / 1000000;
+    }
+    return s;
+  }
+
+  virtual uint64_t NowCPUNanos() override {
+    now_cpu_count_.fetch_add(1);
+    return target()->NowCPUNanos();
+  }
+
+  virtual uint64_t NowNanos() override {
+    return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) +
+           addon_microseconds_.load() * 1000;
+  }
+
+  virtual uint64_t NowMicros() override {
+    return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) +
+           addon_microseconds_.load();
+  }
+
+  virtual Status DeleteFile(const std::string& fname) override {
+    delete_count_.fetch_add(1);
+    return target()->DeleteFile(fname);
+  }
+
+  void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; }
+
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override {
+    if (!skip_fsync_) {
+      return target()->NewDirectory(name, result);
+    } else {
+      class NoopDirectory : public Directory {
+       public:
+        NoopDirectory() {}
+        ~NoopDirectory() {}
+
+        Status Fsync() override { return Status::OK(); }
+        Status Close() override { return Status::OK(); }
+      };
+
+      result->reset(new NoopDirectory());
+      return Status::OK();
+    }
+  }
+
+  Status RenameFile(const std::string& src, const std::string& dest) override {
+    rename_count_.fetch_add(1);
+    if (rename_error_.load(std::memory_order_acquire)) {
+      return Status::NotSupported("Simulated `RenameFile()` error.");
+    }
+    return target()->RenameFile(src, dest);
+  }
+
+  // Something to return when mocking current time
+  const int64_t maybe_starting_time_;
+
+  Random rnd_;
+  port::Mutex rnd_mutex_;  // Lock to pretect rnd_
+
+  // sstable Sync() calls are blocked while this pointer is non-nullptr.
+  std::atomic<bool> delay_sstable_sync_;
+
+  // Drop writes on the floor while this pointer is non-nullptr.
+  std::atomic<bool> drop_writes_;
+
+  // Simulate no-space errors while this pointer is non-nullptr.
+  std::atomic<bool> no_space_;
+
+  // Simulate non-writable file system while this pointer is non-nullptr
+  std::atomic<bool> non_writable_;
+
+  // Force sync of manifest files to fail while this pointer is non-nullptr
+  std::atomic<bool> manifest_sync_error_;
+
+  // Force write to manifest files to fail while this pointer is non-nullptr
+  std::atomic<bool> manifest_write_error_;
+
+  // Force write to log files to fail while this pointer is non-nullptr
+  std::atomic<bool> log_write_error_;
+
+  // Force `RenameFile()` to fail while this pointer is non-nullptr
+  std::atomic<bool> rename_error_{false};
+
+  // Slow down every log write, in micro-seconds.
+  std::atomic<int> log_write_slowdown_;
+
+  // If true, returns Status::NotSupported for file overwrite.
+  std::atomic<bool> no_file_overwrite_;
+
+  // Number of WAL files that are still open for write.
+  std::atomic<int> num_open_wal_file_;
+
+  bool count_random_reads_;
+  uint32_t rand_reads_fail_odd_ = 0;
+  std::atomic<uint64_t> num_reads_fails_;
+  anon::AtomicCounter random_read_counter_;
+  std::atomic<size_t> random_read_bytes_counter_;
+  std::atomic<int> random_file_open_counter_;
+
+  bool count_sequential_reads_;
+  anon::AtomicCounter sequential_read_counter_;
+
+  anon::AtomicCounter sleep_counter_;
+
+  std::atomic<int64_t> bytes_written_;
+
+  std::atomic<int> sync_counter_;
+
+  // If true, all fsync to files and directories are skipped.
+  bool skip_fsync_ = false;
+
+  // If true, ingest the corruption to file during sync.
+  bool corrupt_in_sync_ = false;
+
+  std::atomic<uint32_t> non_writeable_rate_;
+
+  std::atomic<uint32_t> new_writable_count_;
+
+  std::atomic<uint32_t> non_writable_count_;
+
+  std::function<void()>* table_write_callback_;
+
+  std::atomic<int> now_cpu_count_;
+
+  std::atomic<int> delete_count_;
+
+  std::atomic<int> rename_count_{0};
+
+  std::atomic<bool> is_wal_sync_thread_safe_{true};
+
+  std::atomic<size_t> compaction_readahead_size_{};
+
+ private:  // accessing these directly is prone to error
+  friend class DBTestBase;
+
+  std::atomic<int64_t> addon_microseconds_{0};
+
+  // Do not modify in the env of a running DB (could cause deadlock)
+  std::atomic<bool> time_elapse_only_sleep_;
+
+  bool no_slowdown_;
+};
+
+#ifndef ROCKSDB_LITE
+class FileTemperatureTestFS : public FileSystemWrapper {
+ public:
+  explicit FileTemperatureTestFS(const std::shared_ptr<FileSystem>& fs)
+      : FileSystemWrapper(fs) {}
+
+  static const char* kClassName() { return "FileTemperatureTestFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewSequentialFile(const std::string& fname, const FileOptions& opts,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override {
+    IOStatus s = target()->NewSequentialFile(fname, opts, result, dbg);
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(GetFileName(fname), &number, &type) &&
+        type == kTableFile) {
+      MutexLock lock(&mu_);
+      requested_sst_file_temperatures_.emplace_back(number, opts.temperature);
+      if (s.ok()) {
+        if (opts.temperature != Temperature::kUnknown) {
+          // Be extra picky and don't open if a wrong non-unknown temperature is
+          // provided
+          auto e = current_sst_file_temperatures_.find(number);
+          if (e != current_sst_file_temperatures_.end() &&
+              e->second != opts.temperature) {
+            result->reset();
+            return IOStatus::PathNotFound("Temperature mismatch on " + fname);
+          }
+        }
+        *result = WrapWithTemperature<FSSequentialFileOwnerWrapper>(
+            number, std::move(*result));
+      }
+    }
+    return s;
+  }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    IOStatus s = target()->NewRandomAccessFile(fname, opts, result, dbg);
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(GetFileName(fname), &number, &type) &&
+        type == kTableFile) {
+      MutexLock lock(&mu_);
+      requested_sst_file_temperatures_.emplace_back(number, opts.temperature);
+      if (s.ok()) {
+        if (opts.temperature != Temperature::kUnknown) {
+          // Be extra picky and don't open if a wrong non-unknown temperature is
+          // provided
+          auto e = current_sst_file_temperatures_.find(number);
+          if (e != current_sst_file_temperatures_.end() &&
+              e->second != opts.temperature) {
+            result->reset();
+            return IOStatus::PathNotFound("Temperature mismatch on " + fname);
+          }
+        }
+        *result = WrapWithTemperature<FSRandomAccessFileOwnerWrapper>(
+            number, std::move(*result));
+      }
+    }
+    return s;
+  }
+
+  void PopRequestedSstFileTemperatures(
+      std::vector<std::pair<uint64_t, Temperature>>* out = nullptr) {
+    MutexLock lock(&mu_);
+    if (out) {
+      *out = std::move(requested_sst_file_temperatures_);
+      assert(requested_sst_file_temperatures_.empty());
+    } else {
+      requested_sst_file_temperatures_.clear();
+    }
+  }
+
+  IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(GetFileName(fname), &number, &type) &&
+        type == kTableFile) {
+      MutexLock lock(&mu_);
+      current_sst_file_temperatures_[number] = opts.temperature;
+    }
+    return target()->NewWritableFile(fname, opts, result, dbg);
+  }
+
+  void CopyCurrentSstFileTemperatures(std::map<uint64_t, Temperature>* out) {
+    MutexLock lock(&mu_);
+    *out = current_sst_file_temperatures_;
+  }
+
+  void OverrideSstFileTemperature(uint64_t number, Temperature temp) {
+    MutexLock lock(&mu_);
+    current_sst_file_temperatures_[number] = temp;
+  }
+
+ protected:
+  port::Mutex mu_;
+  std::vector<std::pair<uint64_t, Temperature>>
+      requested_sst_file_temperatures_;
+  std::map<uint64_t, Temperature> current_sst_file_temperatures_;
+
+  std::string GetFileName(const std::string& fname) {
+    auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
+    // workaround only for Windows that the file path could contain both Windows
+    // FilePathSeparator and '/'
+    filename = filename.substr(filename.find_last_of('/') + 1);
+    return filename;
+  }
+
+  template <class FileOwnerWrapperT, /*inferred*/ class FileT>
+  std::unique_ptr<FileT> WrapWithTemperature(uint64_t number,
+                                             std::unique_ptr<FileT>&& t) {
+    class FileWithTemp : public FileOwnerWrapperT {
+     public:
+      FileWithTemp(FileTemperatureTestFS* fs, uint64_t number,
+                   std::unique_ptr<FileT>&& t)
+          : FileOwnerWrapperT(std::move(t)), fs_(fs), number_(number) {}
+
+      Temperature GetTemperature() const override {
+        MutexLock lock(&fs_->mu_);
+        return fs_->current_sst_file_temperatures_[number_];
+      }
+
+     private:
+      FileTemperatureTestFS* fs_;
+      uint64_t number_;
+    };
+    return std::make_unique<FileWithTemp>(this, number, std::move(t));
+  }
+};
+
+class OnFileDeletionListener : public EventListener {
+ public:
+  OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {}
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "OnFileDeletionListener"; }
+
+  void SetExpectedFileName(const std::string file_name) {
+    expected_file_name_ = file_name;
+  }
+
+  void VerifyMatchedCount(size_t expected_value) {
+    ASSERT_EQ(matched_count_, expected_value);
+  }
+
+  void OnTableFileDeleted(const TableFileDeletionInfo& info) override {
+    if (expected_file_name_ != "") {
+      ASSERT_EQ(expected_file_name_, info.file_path);
+      expected_file_name_ = "";
+      matched_count_++;
+    }
+  }
+
+ private:
+  size_t matched_count_;
+  std::string expected_file_name_;
+};
+
+class FlushCounterListener : public EventListener {
+ public:
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "FlushCounterListener"; }
+  std::atomic<int> count{0};
+  std::atomic<FlushReason> expected_flush_reason{FlushReason::kOthers};
+
+  void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override {
+    count++;
+    ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason);
+  }
+};
+#endif
+
+// A test merge operator mimics put but also fails if one of merge operands is
+// "corrupted".
+class TestPutOperator : public MergeOperator {
+ public:
+  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const override {
+    if (merge_in.existing_value != nullptr &&
+        *(merge_in.existing_value) == "corrupted") {
+      return false;
+    }
+    for (auto value : merge_in.operand_list) {
+      if (value == "corrupted") {
+        return false;
+      }
+    }
+    merge_out->existing_operand = merge_in.operand_list.back();
+    return true;
+  }
+
+  virtual const char* Name() const override { return "TestPutOperator"; }
+};
+
+// A wrapper around Cache that can easily be extended with instrumentation,
+// etc.
+class CacheWrapper : public Cache {
+ public:
+  explicit CacheWrapper(std::shared_ptr<Cache> target)
+      : target_(std::move(target)) {}
+
+  const char* Name() const override { return target_->Name(); }
+
+  using Cache::Insert;
+  Status Insert(const Slice& key, void* value, size_t charge,
+                void (*deleter)(const Slice& key, void* value),
+                Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override {
+    return target_->Insert(key, value, charge, deleter, handle, priority);
+  }
+
+  using Cache::Lookup;
+  Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
+    return target_->Lookup(key, stats);
+  }
+
+  bool Ref(Handle* handle) override { return target_->Ref(handle); }
+
+  using Cache::Release;
+  bool Release(Handle* handle, bool erase_if_last_ref = false) override {
+    return target_->Release(handle, erase_if_last_ref);
+  }
+
+  void* Value(Handle* handle) override { return target_->Value(handle); }
+
+  void Erase(const Slice& key) override { target_->Erase(key); }
+  uint64_t NewId() override { return target_->NewId(); }
+
+  void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); }
+
+  void SetStrictCapacityLimit(bool strict_capacity_limit) override {
+    target_->SetStrictCapacityLimit(strict_capacity_limit);
+  }
+
+  bool HasStrictCapacityLimit() const override {
+    return target_->HasStrictCapacityLimit();
+  }
+
+  size_t GetCapacity() const override { return target_->GetCapacity(); }
+
+  size_t GetUsage() const override { return target_->GetUsage(); }
+
+  size_t GetUsage(Handle* handle) const override {
+    return target_->GetUsage(handle);
+  }
+
+  size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); }
+
+  size_t GetCharge(Handle* handle) const override {
+    return target_->GetCharge(handle);
+  }
+
+  DeleterFn GetDeleter(Handle* handle) const override {
+    return target_->GetDeleter(handle);
+  }
+
+  void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                              bool thread_safe) override {
+    target_->ApplyToAllCacheEntries(callback, thread_safe);
+  }
+
+  void ApplyToAllEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               DeleterFn deleter)>& callback,
+      const ApplyToAllEntriesOptions& opts) override {
+    target_->ApplyToAllEntries(callback, opts);
+  }
+
+  void EraseUnRefEntries() override { target_->EraseUnRefEntries(); }
+
+ protected:
+  std::shared_ptr<Cache> target_;
+};
+
+/*
+ * A cache wrapper that tracks certain CacheEntryRole's cache charge, its
+ * peaks and increments
+ *
+ *        p0
+ *       / \   p1
+ *      /   \  /\
+ *     /     \/  \
+ *  a /       b   \
+ * peaks = {p0, p1}
+ * increments = {p1-a, p2-b}
+ */
+template <CacheEntryRole R>
+class TargetCacheChargeTrackingCache : public CacheWrapper {
+ public:
+  explicit TargetCacheChargeTrackingCache(std::shared_ptr<Cache> target);
+
+  using Cache::Insert;
+  Status Insert(const Slice& key, void* value, size_t charge,
+                void (*deleter)(const Slice& key, void* value),
+                Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override;
+
+  using Cache::Release;
+  bool Release(Handle* handle, bool erase_if_last_ref = false) override;
+
+  std::size_t GetCacheCharge() { return cur_cache_charge_; }
+
+  std::deque<std::size_t> GetChargedCachePeaks() { return cache_charge_peaks_; }
+
+  std::size_t GetChargedCacheIncrementSum() {
+    return cache_charge_increments_sum_;
+  }
+
+ private:
+  static const Cache::DeleterFn kNoopDeleter;
+
+  std::size_t cur_cache_charge_;
+  std::size_t cache_charge_peak_;
+  std::size_t cache_charge_increment_;
+  bool last_peak_tracked_;
+  std::deque<std::size_t> cache_charge_peaks_;
+  std::size_t cache_charge_increments_sum_;
+};
+
+class DBTestBase : public testing::Test {
+ public:
+  // Sequence of option configurations to try
+  enum OptionConfig : int {
+    kDefault = 0,
+    kBlockBasedTableWithPrefixHashIndex = 1,
+    kBlockBasedTableWithWholeKeyHashIndex = 2,
+    kPlainTableFirstBytePrefix = 3,
+    kPlainTableCappedPrefix = 4,
+    kPlainTableCappedPrefixNonMmap = 5,
+    kPlainTableAllBytesPrefix = 6,
+    kVectorRep = 7,
+    kHashLinkList = 8,
+    kMergePut = 9,
+    kFilter = 10,
+    kFullFilterWithNewTableReaderForCompactions = 11,
+    kUncompressed = 12,
+    kNumLevel_3 = 13,
+    kDBLogDir = 14,
+    kWalDirAndMmapReads = 15,
+    kManifestFileSize = 16,
+    kPerfOptions = 17,
+    kHashSkipList = 18,
+    kUniversalCompaction = 19,
+    kUniversalCompactionMultiLevel = 20,
+    kCompressedBlockCache = 21,
+    kInfiniteMaxOpenFiles = 22,
+    kCRC32cChecksum = 23,
+    kFIFOCompaction = 24,
+    kOptimizeFiltersForHits = 25,
+    kRowCache = 26,
+    kRecycleLogFiles = 27,
+    kConcurrentSkipList = 28,
+    kPipelinedWrite = 29,
+    kConcurrentWALWrites = 30,
+    kDirectIO,
+    kLevelSubcompactions,
+    kBlockBasedTableWithIndexRestartInterval,
+    kBlockBasedTableWithPartitionedIndex,
+    kBlockBasedTableWithPartitionedIndexFormat4,
+    kBlockBasedTableWithLatestFormat,
+    kPartitionedFilterWithNewTableReaderForCompactions,
+    kUniversalSubcompactions,
+    kUnorderedWrite,
+    // This must be the last line
+    kEnd,
+  };
+
+ public:
+  std::string dbname_;
+  std::string alternative_wal_dir_;
+  std::string alternative_db_log_dir_;
+  MockEnv* mem_env_;
+  Env* encrypted_env_;
+  SpecialEnv* env_;
+  std::shared_ptr<Env> env_guard_;
+  DB* db_;
+  std::vector<ColumnFamilyHandle*> handles_;
+
+  int option_config_;
+  Options last_options_;
+
+  // Skip some options, as they may not be applicable to a specific test.
+  // To add more skip constants, use values 4, 8, 16, etc.
+  enum OptionSkip {
+    kNoSkip = 0,
+    kSkipDeletesFilterFirst = 1,
+    kSkipUniversalCompaction = 2,
+    kSkipMergePut = 4,
+    kSkipPlainTable = 8,
+    kSkipHashIndex = 16,
+    kSkipNoSeekToLast = 32,
+    kSkipFIFOCompaction = 128,
+    kSkipMmapReads = 256,
+  };
+
+  const int kRangeDelSkipConfigs =
+      // Plain tables do not support range deletions.
+      kSkipPlainTable |
+      // MmapReads disables the iterator pinning that RangeDelAggregator
+      // requires.
+      kSkipMmapReads;
+
+  // `env_do_fsync` decides whether the special Env would do real
+  // fsync for files and directories. Skipping fsync can speed up
+  // tests, but won't cover the exact fsync logic.
+  DBTestBase(const std::string path, bool env_do_fsync);
+
+  ~DBTestBase();
+
+  static std::string Key(int i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "key%06d", i);
+    return std::string(buf);
+  }
+
+  static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip);
+
+  // Switch to a fresh database with the next option configuration to
+  // test.  Return false if there are no more configurations to test.
+  bool ChangeOptions(int skip_mask = kNoSkip);
+
+  // Switch between different compaction styles.
+  bool ChangeCompactOptions();
+
+  // Switch between different WAL-realted options.
+  bool ChangeWalOptions();
+
+  // Switch between different filter policy
+  // Jump from kDefault to kFilter to kFullFilter
+  bool ChangeFilterOptions();
+
+  // Switch between different DB options for file ingestion tests.
+  bool ChangeOptionsForFileIngestionTest();
+
+  // Return the current option configuration.
+  Options CurrentOptions(const anon::OptionsOverride& options_override =
+                             anon::OptionsOverride()) const;
+
+  Options CurrentOptions(const Options& default_options,
+                         const anon::OptionsOverride& options_override =
+                             anon::OptionsOverride()) const;
+
+  Options GetDefaultOptions() const;
+
+  Options GetOptions(int option_config) const {
+    return GetOptions(option_config, GetDefaultOptions());
+  }
+
+  Options GetOptions(int option_config, const Options& default_options,
+                     const anon::OptionsOverride& options_override =
+                         anon::OptionsOverride()) const;
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const Options& options);
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options& options);
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const std::vector<Options>& options);
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options& options);
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const std::vector<Options>& options);
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options& options);
+
+  void Reopen(const Options& options);
+
+  void Close();
+
+  void DestroyAndReopen(const Options& options);
+
+  void Destroy(const Options& options, bool delete_cf_paths = false);
+
+  Status ReadOnlyReopen(const Options& options);
+
+  Status TryReopen(const Options& options);
+
+  bool IsDirectIOSupported();
+
+  bool IsMemoryMappedAccessSupported() const;
+
+  Status Flush(int cf = 0);
+
+  Status Flush(const std::vector<int>& cf_ids);
+
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions());
+
+  Status Merge(const Slice& k, const Slice& v,
+               WriteOptions wo = WriteOptions());
+
+  Status Merge(int cf, const Slice& k, const Slice& v,
+               WriteOptions wo = WriteOptions());
+
+  Status Delete(const std::string& k);
+
+  Status Delete(int cf, const std::string& k);
+
+  Status SingleDelete(const std::string& k);
+
+  Status SingleDelete(int cf, const std::string& k);
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr);
+
+  std::string Get(int cf, const std::string& k,
+                  const Snapshot* snapshot = nullptr);
+
+  Status Get(const std::string& k, PinnableSlice* v);
+
+  std::vector<std::string> MultiGet(std::vector<int> cfs,
+                                    const std::vector<std::string>& k,
+                                    const Snapshot* snapshot,
+                                    const bool batched,
+                                    const bool async = false);
+
+  std::vector<std::string> MultiGet(const std::vector<std::string>& k,
+                                    const Snapshot* snapshot = nullptr,
+                                    const bool async = false);
+
+  uint64_t GetNumSnapshots();
+
+  uint64_t GetTimeOldestSnapshots();
+
+  uint64_t GetSequenceOldestSnapshots();
+
+  // Return a string that contains all key,value pairs in order,
+  // formatted like "(k1->v1)(k2->v2)".
+  std::string Contents(int cf = 0);
+
+  std::string AllEntriesFor(const Slice& user_key, int cf = 0);
+
+  // Similar to AllEntriesFor but this function also covers reopen with fifo.
+  // Note that test cases with snapshots or entries in memtable should simply
+  // use AllEntriesFor instead as snapshots and entries in memtable will
+  // survive after db reopen.
+  void CheckAllEntriesWithFifoReopen(const std::string& expected_value,
+                                     const Slice& user_key, int cf,
+                                     const std::vector<std::string>& cfs,
+                                     const Options& options);
+
+#ifndef ROCKSDB_LITE
+  int NumSortedRuns(int cf = 0);
+
+  uint64_t TotalSize(int cf = 0);
+
+  uint64_t SizeAtLevel(int level);
+
+  size_t TotalLiveFiles(int cf = 0);
+
+  size_t CountLiveFiles();
+
+  int NumTableFilesAtLevel(int level, int cf = 0);
+
+  double CompressionRatioAtLevel(int level, int cf = 0);
+
+  int TotalTableFiles(int cf = 0, int levels = -1);
+#endif  // ROCKSDB_LITE
+
+  std::vector<uint64_t> GetBlobFileNumbers();
+
+  // Return spread of files per level
+  std::string FilesPerLevel(int cf = 0);
+
+  size_t CountFiles();
+
+  Status CountFiles(size_t* count);
+
+  Status Size(const Slice& start, const Slice& limit, uint64_t* size) {
+    return Size(start, limit, 0, size);
+  }
+
+  Status Size(const Slice& start, const Slice& limit, int cf, uint64_t* size);
+
+  void Compact(int cf, const Slice& start, const Slice& limit,
+               uint32_t target_path_id);
+
+  void Compact(int cf, const Slice& start, const Slice& limit);
+
+  void Compact(const Slice& start, const Slice& limit);
+
+  // Do n memtable compactions, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int n, const std::string& small, const std::string& large,
+                  int cf = 0);
+
+  // Prevent pushing of new sstables into deeper levels by adding
+  // tables that cover a specified range to all levels.
+  void FillLevels(const std::string& smallest, const std::string& largest,
+                  int cf);
+
+  void MoveFilesToLevel(int level, int cf = 0);
+
+#ifndef ROCKSDB_LITE
+  void DumpFileCounts(const char* label);
+#endif  // ROCKSDB_LITE
+
+  std::string DumpSSTableList();
+
+  static void GetSstFiles(Env* env, std::string path,
+                          std::vector<std::string>* files);
+
+  int GetSstFileCount(std::string path);
+
+  // this will generate non-overlapping files since it keeps increasing key_idx
+  void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false);
+
+  void GenerateNewFile(int fd, Random* rnd, int* key_idx, bool nowait = false);
+
+  static const int kNumKeysByGenerateNewRandomFile;
+  static const int KNumKeysByGenerateNewFile = 100;
+
+  void GenerateNewRandomFile(Random* rnd, bool nowait = false);
+
+  std::string IterStatus(Iterator* iter);
+
+  Options OptionsForLogIterTest();
+
+  std::string DummyString(size_t len, char c = 'a');
+
+  void VerifyIterLast(std::string expected_key, int cf = 0);
+
+  // Used to test InplaceUpdate
+
+  // If previous value is nullptr or delta is > than previous value,
+  //   sets newValue with delta
+  // If previous value is not empty,
+  //   updates previous value with 'b' string of previous value size - 1.
+  static UpdateStatus updateInPlaceSmallerSize(char* prevValue,
+                                               uint32_t* prevSize, Slice delta,
+                                               std::string* newValue);
+
+  static UpdateStatus updateInPlaceSmallerVarintSize(char* prevValue,
+                                                     uint32_t* prevSize,
+                                                     Slice delta,
+                                                     std::string* newValue);
+
+  static UpdateStatus updateInPlaceLargerSize(char* prevValue,
+                                              uint32_t* prevSize, Slice delta,
+                                              std::string* newValue);
+
+  static UpdateStatus updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
+                                            Slice delta, std::string* newValue);
+
+  // Utility method to test InplaceUpdate
+  void validateNumberOfEntries(int numValues, int cf = 0);
+
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0);
+
+  Status GetAllDataFiles(const FileType file_type,
+                         std::unordered_map<std::string, uint64_t>* sst_files,
+                         uint64_t* total_size = nullptr);
+
+  std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path);
+
+  void VerifyDBFromMap(
+      std::map<std::string, std::string> true_data,
+      size_t* total_reads_res = nullptr, bool tailing_iter = false,
+      std::map<std::string, Status> status = std::map<std::string, Status>());
+
+  void VerifyDBInternal(
+      std::vector<std::pair<std::string, std::string>> true_data);
+
+#ifndef ROCKSDB_LITE
+  uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
+                                              std::string column_family_name);
+
+  uint64_t GetSstSizeHelper(Temperature temperature);
+#endif  // ROCKSDB_LITE
+
+  uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
+    return options.statistics->getTickerCount(ticker_type);
+  }
+
+  uint64_t TestGetAndResetTickerCount(const Options& options,
+                                      Tickers ticker_type) {
+    return options.statistics->getAndResetTickerCount(ticker_type);
+  }
+
+  // Note: reverting this setting within the same test run is not yet
+  // supported
+  void SetTimeElapseOnlySleepOnReopen(DBOptions* options);
+
+ private:  // Prone to error on direct use
+  void MaybeInstallTimeElapseOnlySleep(const DBOptions& options);
+
+  bool time_elapse_only_sleep_on_reopen_ = false;
+};
+
+// For verifying that all files generated by current version have SST
+// unique ids.
+void VerifySstUniqueIds(const TablePropertiesCollection& props);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_universal_compaction_test.cc b/src/rocksdb/db/db_universal_compaction_test.cc
new file mode 100644
index 000000000..f53c36f22
--- /dev/null
+++ b/src/rocksdb/db/db_universal_compaction_test.cc
@@ -0,0 +1,2235 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#if !defined(ROCKSDB_LITE)
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string CompressibleString(Random* rnd, int len) {
+  std::string r;
+  test::CompressibleString(rnd, 0.8, len, &r);
+  return r;
+}
+
+class DBTestUniversalCompactionBase
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+  explicit DBTestUniversalCompactionBase(const std::string& path)
+      : DBTestBase(path, /*env_do_fsync=*/false) {}
+  void SetUp() override {
+    num_levels_ = std::get<0>(GetParam());
+    exclusive_manual_compaction_ = std::get<1>(GetParam());
+  }
+  int num_levels_;
+  bool exclusive_manual_compaction_;
+};
+
+class DBTestUniversalCompaction : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalCompaction()
+      : DBTestUniversalCompactionBase("/db_universal_compaction_test") {}
+};
+
+class DBTestUniversalCompaction2 : public DBTestBase {
+ public:
+  DBTestUniversalCompaction2()
+      : DBTestBase("db_universal_compaction_test2", /*env_do_fsync=*/false) {}
+};
+
+namespace {
+void VerifyCompactionResult(
+    const ColumnFamilyMetaData& cf_meta,
+    const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+  for (auto& level : cf_meta.levels) {
+    for (auto& file : level.files) {
+      assert(overlapping_file_numbers.find(file.name) ==
+             overlapping_file_numbers.end());
+    }
+  }
+#endif
+}
+
+class KeepFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return false;
+  }
+
+  const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false)
+      : check_context_(check_context) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
+
+  const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+};
+}  // anonymous namespace
+
+// Make sure we don't trigger a problem if the trigger condtion is given
+// to be 0, which is invalid.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSingleSortedRun) {
+  Options options = CurrentOptions();
+
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  // Config universal compaction to always compact to one single sorted run.
+  options.level0_file_num_compaction_trigger = 0;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_options_universal.min_merge_width = 2;
+  options.compaction_options_universal.max_size_amplification_percent = 0;
+
+  options.write_buffer_size = 105 << 10;  // 105KB
+  options.arena_block_size = 4 << 10;
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  KeepFilterFactory* filter = new KeepFilterFactory(true);
+  filter->expect_manual_compaction_.store(false);
+  options.compaction_filter_factory.reset(filter);
+
+  DestroyAndReopen(options);
+  ASSERT_EQ(1, db_->GetOptions().level0_file_num_compaction_trigger);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  filter->expect_full_compaction_.store(true);
+
+  for (int num = 0; num < 16; num++) {
+    // Write 100KB file. And immediately it should be compacted to one file.
+    GenerateNewFile(&rnd, &key_idx);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(NumSortedRuns(0), 1);
+  }
+  ASSERT_OK(Put(Key(key_idx), ""));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumSortedRuns(0), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, OptimizeFiltersForHits) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 5;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 105 << 10;  // 105KB
+  options.arena_block_size = 4 << 10;
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  BlockBasedTableOptions bbto;
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.optimize_filters_for_hits = true;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(3));
+
+  DestroyAndReopen(options);
+
+  // block compaction from happening
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    ASSERT_OK(Put(Key(num * 10), "val"));
+    if (num) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    ASSERT_OK(Put(Key(30 + num * 10), "val"));
+    ASSERT_OK(Put(Key(60 + num * 10), "val"));
+  }
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // Query set of non existing keys
+  for (int i = 5; i < 90; i += 10) {
+    ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+  }
+
+  // Make sure bloom filter is used at least once.
+  ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  auto prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+  // Make sure bloom filter is used for all but the last L0 file when looking
+  // up a non-existent key that's in the range of all L0 files.
+  ASSERT_EQ(Get(Key(35)), "NOT_FOUND");
+  ASSERT_EQ(prev_counter + NumTableFilesAtLevel(0) - 1,
+            TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+  prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+  // Unblock compaction and wait it for happening.
+  sleeping_task_low.WakeUp();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // The same queries will not trigger bloom filter
+  for (int i = 5; i < 90; i += 10) {
+    ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+  }
+  ASSERT_EQ(prev_counter, TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+}
+
+// TODO(kailiu) The tests on UniversalCompaction has some issues:
+//  1. A lot of magic numbers ("11" or "12").
+//  2. Made assumption on the memtable flush conditions, which may change from
+//     time to time.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 5;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 105 << 10;  // 105KB
+  options.arena_block_size = 4 << 10;
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  KeepFilterFactory* filter = new KeepFilterFactory(true);
+  filter->expect_manual_compaction_.store(false);
+  options.compaction_filter_factory.reset(filter);
+
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        if (num_levels_ > 3) {
+          ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  filter->expect_full_compaction_.store(true);
+  // Stage 1:
+  //   Generate a set of files at level 0, but don't trigger level-0
+  //   compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 100KB
+    GenerateNewFile(1, &rnd, &key_idx);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // (level0_file_num_compaction_trigger+1)=4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumSortedRuns(1), 1);
+
+  // Stage 2:
+  //   Now we have one file at level 0, with size 4. We also have some data in
+  //   mem table. Let's continue generating new files at level 0, but don't
+  //   trigger level-0 compaction.
+  //   First, clean up memtable before inserting new data. This will generate
+  //   a level-0 file, with size around 0.4 (according to previously written
+  //   data amount).
+  filter->expect_full_compaction_.store(false);
+  ASSERT_OK(Flush(1));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    GenerateNewFile(1, &rnd, &key_idx);
+    ASSERT_EQ(NumSortedRuns(1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 2 files, with size 4, 2.4.
+  ASSERT_EQ(NumSortedRuns(1), 2);
+
+  // Stage 3:
+  //   Now we have 2 files at level 0, with size 4 and 2.4. Continue
+  //   generating new files at level 0.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    GenerateNewFile(1, &rnd, &key_idx);
+    ASSERT_EQ(NumSortedRuns(1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
+  // After compaction, we should have 3 files, with size 4, 2.4, 2.
+  ASSERT_EQ(NumSortedRuns(1), 3);
+
+  // Stage 4:
+  //   Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
+  //   new file of size 1.
+  GenerateNewFile(1, &rnd, &key_idx);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Level-0 compaction is triggered, but no file will be picked up.
+  ASSERT_EQ(NumSortedRuns(1), 4);
+
+  // Stage 5:
+  //   Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
+  //   a new file of size 1.
+  filter->expect_full_compaction_.store(true);
+  GenerateNewFile(1, &rnd, &key_idx);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // All files at level 0 will be compacted into a single one.
+  ASSERT_EQ(NumSortedRuns(1), 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 3;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  //   Generate two files in Level 0. Both files are approx the same size.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
+  }
+  ASSERT_EQ(NumSortedRuns(1), 2);
+
+  // Flush whatever is remaining in memtable. This is typically
+  // small, which should not trigger size ratio based compaction
+  // but will instead trigger size amplification.
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify that size amplification did occur
+  ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 3;
+  // Initial setup of compaction_options_universal will prevent universal
+  // compaction from happening
+  options.compaction_options_universal.size_ratio = 100;
+  options.compaction_options_universal.min_merge_width = 100;
+  DestroyAndReopen(options);
+
+  int total_picked_compactions = 0;
+  int total_size_amp_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        if (arg) {
+          total_picked_compactions++;
+          Compaction* c = static_cast<Compaction*>(arg);
+          if (c->compaction_reason() ==
+              CompactionReason::kUniversalSizeAmplification) {
+            total_size_amp_compactions++;
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  MutableCFOptions mutable_cf_options;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  //   Generate two files in Level 0. Both files are approx the same size.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
+  }
+  ASSERT_EQ(NumSortedRuns(1), 2);
+
+  // Flush whatever is remaining in memtable. This is typically
+  // small, which should not trigger size ratio based compaction
+  // but could instead trigger size amplification if it's set
+  // to 110.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Verify compaction did not happen
+  ASSERT_EQ(NumSortedRuns(1), 3);
+
+  // Trigger compaction if size amplification exceeds 110% without reopening DB
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.max_size_amplification_percent,
+            200U);
+  ASSERT_OK(dbfull()->SetOptions(handles_[1],
+                                 {{"compaction_options_universal",
+                                   "{max_size_amplification_percent=110;}"}}));
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.max_size_amplification_percent,
+            110u);
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal
+                      .max_size_amplification_percent);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Verify that size amplification did happen
+  ASSERT_EQ(NumSortedRuns(1), 1);
+  ASSERT_EQ(total_picked_compactions, 1);
+  ASSERT_EQ(total_size_amp_compactions, 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 3;
+  // Initial setup of compaction_options_universal will prevent universal
+  // compaction from happening
+  options.compaction_options_universal.max_size_amplification_percent = 2000;
+  options.compaction_options_universal.size_ratio = 0;
+  options.compaction_options_universal.min_merge_width = 100;
+  DestroyAndReopen(options);
+
+  int total_picked_compactions = 0;
+  int total_size_ratio_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        if (arg) {
+          total_picked_compactions++;
+          Compaction* c = static_cast<Compaction*>(arg);
+          if (c->compaction_reason() == CompactionReason::kUniversalSizeRatio) {
+            total_size_ratio_compactions++;
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  MutableCFOptions mutable_cf_options;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Generate three files in Level 0. All files are approx the same size.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
+  }
+  ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger);
+
+  // Flush whatever is remaining in memtable. This is typically small, about
+  // 30KB.
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Verify compaction did not happen
+  ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1);
+  ASSERT_EQ(total_picked_compactions, 0);
+
+  ASSERT_OK(dbfull()->SetOptions(
+      handles_[1],
+      {{"compaction_options_universal",
+        "{min_merge_width=2;max_merge_width=2;size_ratio=100;}"}}));
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(dbfull()
+                ->GetOptions(handles_[1])
+                .compaction_options_universal.max_merge_width,
+            2u);
+  ASSERT_EQ(
+      dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio,
+      100u);
+
+  ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+                                                     &mutable_cf_options));
+  ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100u);
+  ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width,
+            2u);
+  ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width,
+            2u);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Files in L0 are approx: 0.3 (30KB), 1, 1, 1.
+  // On compaction: the files are below the size amp threshold, so we
+  // fallthrough to checking read amp conditions. The configured size ratio is
+  // not big enough to take 0.3 into consideration. So the next files 1 and 1
+  // are compacted together first as they satisfy size ratio condition and
+  // (min_merge_width, max_merge_width) condition, to give out a file size of 2.
+  // Next, the newly generated 2 and the last file 1 are compacted together. So
+  // at the end: #sortedRuns = 2, #picked_compactions = 2, and all the picked
+  // ones are size ratio based compactions.
+  ASSERT_EQ(NumSortedRuns(1), 2);
+  // If max_merge_width had not been changed dynamically above, and if it
+  // continued to be the default value of UINIT_MAX, total_picked_compactions
+  // would have been 1.
+  ASSERT_EQ(total_picked_compactions, 2);
+  ASSERT_EQ(total_size_ratio_compactions, 2);
+}
+
+TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 10;
+
+  ChangeCompactOptions();
+  Options options;
+  options.create_if_missing = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.num_levels = 1;
+  options.target_file_size_base = options.write_buffer_size;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+  Random rnd(301);
+  for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize)));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  std::vector<std::string> compaction_input_file_names;
+  for (auto file : cf_meta.levels[0].files) {
+    if (rnd.OneIn(2)) {
+      compaction_input_file_names.push_back(file.name);
+    }
+  }
+
+  if (compaction_input_file_names.size() == 0) {
+    compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name);
+  }
+
+  // expect fail since universal compaction only allow L0 output
+  ASSERT_FALSE(dbfull()
+                   ->CompactFiles(CompactionOptions(), handles_[1],
+                                  compaction_input_file_names, 1)
+                   .ok());
+
+  // expect ok and verify the compacted files no longer exist.
+  ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+                                   compaction_input_file_names, 0));
+
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  VerifyCompactionResult(
+      cf_meta, std::set<std::string>(compaction_input_file_names.begin(),
+                                     compaction_input_file_names.end()));
+
+  compaction_input_file_names.clear();
+
+  // Pick the first and the last file, expect everything is
+  // compacted into one single file.
+  compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name);
+  compaction_input_file_names.push_back(
+      cf_meta.levels[0].files[cf_meta.levels[0].files.size() - 1].name);
+  ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+                                   compaction_input_file_names, 0));
+
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 1U);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.num_levels = 7;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Generate 3 overlapping files
+  Random rnd(301);
+  for (int i = 0; i < 210; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 200; i < 300; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 250; i < 260; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("3", FilesPerLevel(0));
+  // Compact all files into 1 file and put it in L4
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 4;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBTestUniversalCompactionMultiLevels
+    : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalCompactionMultiLevels()
+      : DBTestUniversalCompactionBase(
+            "/db_universal_compaction_multi_levels_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 8;
+  options.max_background_compactions = 3;
+  options.target_file_size_base = 32 * 1024;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 100000;
+  for (int i = 0; i < num_keys * 2; i++) {
+    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+}
+
+// Tests universal compaction with trivial move enabled
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+        non_trivial_move++;
+        ASSERT_TRUE(arg != nullptr);
+        int output_level = *(static_cast<int*>(arg));
+        ASSERT_EQ(output_level, 0);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.allow_trivial_move = true;
+  options.num_levels = 3;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 2;
+  options.target_file_size_base = 32 * 1024;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 150000;
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  std::vector<std::string> values;
+
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(trivial_move, 0);
+  ASSERT_GT(non_trivial_move, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(MultiLevels, DBTestUniversalCompactionMultiLevels,
+                        ::testing::Combine(::testing::Values(3, 20),
+                                           ::testing::Bool()));
+
+class DBTestUniversalCompactionParallel : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalCompactionParallel()
+      : DBTestUniversalCompactionBase("/db_universal_compaction_prallel_test") {
+  }
+};
+
+TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.env = env_;
+  options.write_buffer_size = 1 << 10;  // 1KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+  options.max_background_flushes = 3;
+  options.target_file_size_base = 1 * 1024;
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Delay every compaction so multiple compactions will happen.
+  std::atomic<int> num_compactions_running(0);
+  std::atomic<bool> has_parallel(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        if (num_compactions_running.fetch_add(1) > 0) {
+          has_parallel.store(true);
+          return;
+        }
+        for (int nwait = 0; nwait < 20000; nwait++) {
+          if (has_parallel.load() || num_compactions_running.load() > 1) {
+            has_parallel.store(true);
+            break;
+          }
+          env_->SleepForMicroseconds(1000);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():End",
+      [&](void* /*arg*/) { num_compactions_running.fetch_add(-1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 30000;
+  for (int i = 0; i < num_keys * 2; i++) {
+    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(num_compactions_running.load(), 0);
+  ASSERT_TRUE(has_parallel.load());
+
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+
+  // Reopen and check.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+}
+
+TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 1 * 1024;  // 1KB
+  options.level0_file_num_compaction_trigger = 7;
+  options.max_background_compactions = 2;
+  options.target_file_size_base = 1024 * 1024;  // 1MB
+
+  // Disable size amplifiction compaction
+  options.compaction_options_universal.max_size_amplification_percent =
+      UINT_MAX;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBTestUniversalCompactionParallel::PickByFileNumberBug:0",
+        "BackgroundCallCompaction:0"},
+       {"UniversalCompactionBuilder::PickCompaction:Return",
+        "DBTestUniversalCompactionParallel::PickByFileNumberBug:1"},
+       {"DBTestUniversalCompactionParallel::PickByFileNumberBug:2",
+        "CompactionJob::Run():Start"}});
+
+  int total_picked_compactions = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        if (arg) {
+          total_picked_compactions++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Write 7 files to trigger compaction
+  int key_idx = 1;
+  for (int i = 1; i <= 70; i++) {
+    std::string k = Key(key_idx++);
+    ASSERT_OK(Put(k, k));
+    if (i % 10 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+
+  // Wait for the 1st background compaction process to start
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  // Write 3 files while 1st compaction is held
+  // These 3 files have different sizes to avoid compacting based on size_ratio
+  int num_keys = 1000;
+  for (int i = 0; i < 3; i++) {
+    for (int j = 1; j <= num_keys; j++) {
+      std::string k = Key(key_idx++);
+      ASSERT_OK(Put(k, k));
+    }
+    ASSERT_OK(Flush());
+    num_keys -= 100;
+  }
+
+  // Hold the 1st compaction from finishing
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // There should only be one picked compaction as the score drops below one
+  // after the first one is picked.
+  EXPECT_EQ(total_picked_compactions, 1);
+  EXPECT_EQ(TotalTableFiles(), 4);
+
+  // Stop SyncPoint and destroy the DB and reopen it again
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  key_idx = 1;
+  total_picked_compactions = 0;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Write 7 files to trigger compaction
+  for (int i = 1; i <= 70; i++) {
+    std::string k = Key(key_idx++);
+    ASSERT_OK(Put(k, k));
+    if (i % 10 == 0) {
+      ASSERT_OK(Flush());
+    }
+  }
+
+  // Wait for the 1st background compaction process to start
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+  // Write 8 files while 1st compaction is held
+  // These 8 files have different sizes to avoid compacting based on size_ratio
+  num_keys = 1000;
+  for (int i = 0; i < 8; i++) {
+    for (int j = 1; j <= num_keys; j++) {
+      std::string k = Key(key_idx++);
+      ASSERT_OK(Put(k, k));
+    }
+    ASSERT_OK(Flush());
+    num_keys -= 100;
+  }
+
+  // Wait for the 2nd background compaction process to start
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+
+  // Hold the 1st and 2nd compaction from finishing
+  TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // This time we will trigger a compaction because of size ratio and
+  // another compaction because of number of files that are not compacted
+  // greater than 7
+  EXPECT_GE(total_picked_compactions, 2);
+}
+
+INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel,
+                        ::testing::Combine(::testing::Values(1, 10),
+                                           ::testing::Values(false)));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 105 << 10;     // 105KB
+  options.arena_block_size = 4 << 10;        // 4KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = num_levels_;
+  options.compaction_options_universal.compression_size_percent = -1;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(990)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+    if (num < options.level0_file_num_compaction_trigger - 1) {
+      ASSERT_EQ(NumSortedRuns(1), num + 1);
+    }
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 105 << 10;     // 105KB
+  options.arena_block_size = 4 << 10;        // 4KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_options_universal.stop_style =
+      kCompactionStopStyleSimilarSize;
+  options.num_levels = num_levels_;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Stage 1:
+  //   Generate a set of files at level 0, but don't trigger level-0
+  //   compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_EQ(NumSortedRuns(), num + 1);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+    key_idx++;
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Suppose each file flushed from mem table has size 1. Now we compact
+  // (level0_file_num_compaction_trigger+1)=4 files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumSortedRuns(), 1);
+
+  // Stage 2:
+  //   Now we have one file at level 0, with size 4. We also have some data in
+  //   mem table. Let's continue generating new files at level 0, but don't
+  //   trigger level-0 compaction.
+  //   First, clean up memtable before inserting new data. This will generate
+  //   a level-0 file, with size around 0.4 (according to previously written
+  //   data amount).
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_EQ(NumSortedRuns(), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+    key_idx++;
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 3 files, with size 4, 0.4, 2.
+  ASSERT_EQ(NumSortedRuns(), 3);
+  // Stage 3:
+  //   Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
+  //   more file at level-0, which should trigger level-0 compaction.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+    key_idx++;
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Level-0 compaction is triggered, but no file will be picked up.
+  ASSERT_EQ(NumSortedRuns(), 4);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = num_levels_;
+  options.compaction_options_universal.compression_size_percent = 70;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // The first compaction (2) is compressed.
+  for (int num = 0; num < 2; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_LT(TotalSize(), 110000U * 2 * 0.9);
+
+  // The second compaction (4) is compressed
+  for (int num = 0; num < 2; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_LT(TotalSize(), 110000 * 4 * 0.9);
+
+  // The third compaction (2 4) is compressed since this time it is
+  // (1 1 3.2) and 3.2/5.2 doesn't reach ratio.
+  for (int num = 0; num < 2; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_LT(TotalSize(), 110000 * 6 * 0.9);
+
+  // When we start for the compaction up to (2 4 8), the latest
+  // compressed is not compressed.
+  for (int num = 0; num < 8; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = num_levels_;
+  options.compaction_options_universal.compression_size_percent = 95;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // When we start for the compaction up to (2 4 8), the latest
+  // compressed is compressed given the size ratio to compress.
+  for (int num = 0; num < 14; num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+        non_trivial_move++;
+        ASSERT_TRUE(arg != nullptr);
+        int output_level = *(static_cast<int*>(arg));
+        ASSERT_EQ(output_level, 0);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.allow_trivial_move = true;
+  options.num_levels = 2;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 1;
+  options.target_file_size_base = 32 * 1024;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 250000;
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  std::vector<std::string> values;
+
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(trivial_move, 0);
+  ASSERT_GT(non_trivial_move, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) {
+  int32_t trivial_move = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* /*arg*/) { trivial_move++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        int output_level = *(static_cast<int*>(arg));
+        ASSERT_EQ(output_level, 0);
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.allow_trivial_move = true;
+  options.num_levels = 15;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 8;
+  options.max_background_compactions = 2;
+  options.target_file_size_base = 64 * 1024;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 500000;
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  std::vector<std::string> values;
+
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(trivial_move, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 5;
+  options.write_buffer_size = 111 << 10;  // 114KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+
+  std::vector<std::string> filenames;
+  if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) {
+    // Delete archival files.
+    for (size_t i = 0; i < filenames.size(); ++i) {
+      ASSERT_OK(
+          env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]));
+    }
+    ASSERT_OK(env_->DeleteDir(options.db_paths[1].path));
+  }
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to 400K file to second path
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1,1,4) -> (2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 2, 4) -> (3, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 3, 4) -> (8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+  // (1, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 8) -> (2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  // (1, 2, 8) -> (3, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 3, 8) -> (4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+  // (1, 4, 8) -> (5, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 10;
+  options.write_buffer_size = 111 << 10;  // 114KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+
+  std::vector<Options> option_vector;
+  option_vector.emplace_back(options);
+  ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+  // Configure CF1 specific paths.
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 300 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 300 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 500 * 1024);
+  cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_4", 1024 * 1024 * 1024);
+  option_vector.emplace_back(DBOptions(options), cf_opt1);
+  CreateColumnFamilies({"one"}, option_vector[1]);
+
+  // Configura CF2 specific paths.
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 300 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 300 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 500 * 1024);
+  cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_4", 1024 * 1024 * 1024);
+  option_vector.emplace_back(DBOptions(options), cf_opt2);
+  CreateColumnFamilies({"two"}, option_vector[2]);
+
+  ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+  Random rnd(301);
+  int key_idx = 0;
+  int key_idx1 = 0;
+  int key_idx2 = 0;
+
+  auto generate_file = [&]() {
+    GenerateNewFile(0, &rnd, &key_idx);
+    GenerateNewFile(1, &rnd, &key_idx1);
+    GenerateNewFile(2, &rnd, &key_idx2);
+  };
+
+  auto check_sstfilecount = [&](int path_id, int expected) {
+    ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+    ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+    ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+  };
+
+  auto check_getvalues = [&]() {
+    for (int i = 0; i < key_idx; i++) {
+      auto v = Get(0, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+
+    for (int i = 0; i < key_idx1; i++) {
+      auto v = Get(1, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+
+    for (int i = 0; i < key_idx2; i++) {
+      auto v = Get(2, Key(i));
+      ASSERT_NE(v, "NOT_FOUND");
+      ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+    }
+  };
+
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    generate_file();
+  }
+
+  // Another 110KB triggers a compaction to 400K file to second path
+  generate_file();
+  check_sstfilecount(2, 1);
+
+  // (1, 4)
+  generate_file();
+  check_sstfilecount(2, 1);
+  check_sstfilecount(0, 1);
+
+  // (1,1,4) -> (2, 4)
+  generate_file();
+  check_sstfilecount(2, 1);
+  check_sstfilecount(1, 1);
+  check_sstfilecount(0, 0);
+
+  // (1, 2, 4) -> (3, 4)
+  generate_file();
+  check_sstfilecount(2, 1);
+  check_sstfilecount(1, 1);
+  check_sstfilecount(0, 0);
+
+  // (1, 3, 4) -> (8)
+  generate_file();
+  check_sstfilecount(3, 1);
+
+  // (1, 8)
+  generate_file();
+  check_sstfilecount(3, 1);
+  check_sstfilecount(0, 1);
+
+  // (1, 1, 8) -> (2, 8)
+  generate_file();
+  check_sstfilecount(3, 1);
+  check_sstfilecount(1, 1);
+
+  // (1, 2, 8) -> (3, 8)
+  generate_file();
+  check_sstfilecount(3, 1);
+  check_sstfilecount(1, 1);
+  check_sstfilecount(0, 0);
+
+  // (1, 3, 8) -> (4, 8)
+  generate_file();
+  check_sstfilecount(2, 1);
+  check_sstfilecount(3, 1);
+
+  // (1, 4, 8) -> (5, 8)
+  generate_file();
+  check_sstfilecount(3, 1);
+  check_sstfilecount(2, 1);
+  check_sstfilecount(0, 0);
+
+  check_getvalues();
+
+  ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+  check_getvalues();
+
+  Destroy(options, true);
+}
+
+TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) {
+  std::function<void(int)> verify_func = [&](int num_keys_in_db) {
+    std::string keys_in_db;
+    Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      keys_in_db.append(iter->key().ToString());
+      keys_in_db.push_back(',');
+    }
+    delete iter;
+
+    std::string expected_keys;
+    for (int i = 0; i <= num_keys_in_db; i++) {
+      expected_keys.append(Key(i));
+      expected_keys.push_back(',');
+    }
+
+    ASSERT_EQ(keys_in_db, expected_keys);
+  };
+
+  Random rnd(301);
+  int max_key1 = 200;
+  int max_key2 = 600;
+  int max_key3 = 800;
+  const int KNumKeysPerFile = 10;
+
+  // Stage 1: open a DB with universal compaction, num_levels=1
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 200 << 10;  // 200KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysPerFile));
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i <= max_key1; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Stage 2: reopen with universal compaction, num_levels=4
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 4;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  verify_func(max_key1);
+
+  // Insert more keys
+  for (int i = max_key1 + 1; i <= max_key2; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  verify_func(max_key2);
+  // Compaction to non-L0 has happened.
+  ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0);
+
+  // Stage 3: Revert it back to one level and revert to num_levels=1.
+  options.num_levels = 4;
+  options.target_file_size_base = INT_MAX;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  // Compact all to level 0
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 0;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(
+      dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  // Need to restart it once to remove higher level records in manifest.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  // Final reopen
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Insert more keys
+  for (int i = max_key2 + 1; i <= max_key3; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  verify_func(max_key3);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 5;
+  options.write_buffer_size = 111 << 10;  // 114KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+  std::vector<std::string> filenames;
+  if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) {
+    // Delete archival files.
+    for (size_t i = 0; i < filenames.size(); ++i) {
+      ASSERT_OK(
+          env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]));
+    }
+    ASSERT_OK(env_->DeleteDir(options.db_paths[1].path));
+  }
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to 400K file to second path
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1,1,4) -> (2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 2, 4) -> (3, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 3, 4) -> (8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 8) -> (2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 2, 8) -> (3, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 3, 8) -> (4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 4, 8) -> (5, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) {
+  if (num_levels_ == 1) {
+    // for single-level universal, everything's bottom level so nothing should
+    // be executed in bottom-pri thread pool.
+    return;
+  }
+  const int kNumFilesTrigger = 3;
+  Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.max_background_compactions = 2;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  DestroyAndReopen(options);
+
+  // Need to get a token to enable compaction parallelism up to
+  // `max_background_compactions` jobs.
+  auto pressure_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {// wait for the full compaction to be picked before adding files intended
+       // for the second one.
+       {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+        "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
+       // the full (bottom-pri) compaction waits until a partial (low-pri)
+       // compaction has started to verify they can run in parallel.
+       {"DBImpl::BackgroundCompaction:NonTrivial",
+        "DBImpl::BGWorkBottomCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (int num = 0; num < kNumFilesTrigger; num++) {
+      int key_idx = 0;
+      GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+      // use no_wait above because that one waits for flush and compaction. We
+      // don't want to wait for compaction because the full compaction is
+      // intentionally blocked while more files are flushed.
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    if (i == 0) {
+      TEST_SYNC_POINT(
+          "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0");
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // First compaction should output to bottom level. Second should output to L0
+  // since older L0 files pending compaction prevent it from being placed lower.
+  ASSERT_EQ(NumSortedRuns(), 2);
+  ASSERT_GT(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) {
+  // Regression test for extra compactions scheduled. Once enough compactions
+  // have been scheduled to bring the score below one, we should stop
+  // scheduling more; otherwise, other CFs/DBs may be delayed unnecessarily.
+  const int kNumFilesTrigger = 8;
+  Options options = CurrentOptions();
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2;
+  options.compaction_options_universal.max_size_amplification_percent =
+      static_cast<unsigned int>(-1);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+  options.num_levels = num_levels_;
+  Reopen(options);
+
+  std::atomic<int> num_compactions_attempted(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Start",
+      [&](void* /*arg*/) { ++num_compactions_attempted; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  for (int num = 0; num < kNumFilesTrigger; num++) {
+    ASSERT_EQ(NumSortedRuns(), num);
+    int key_idx = 0;
+    GenerateNewFile(&rnd, &key_idx);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Compacting the first four files was enough to bring the score below one so
+  // there's no need to schedule any more compactions.
+  ASSERT_EQ(1, num_compactions_attempted);
+  ASSERT_EQ(NumSortedRuns(), 5);
+}
+
+TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) {
+  // Regression test for conflict between:
+  // (1) Running CompactFiles including file in the final sorted run; and
+  // (2) Picking universal size-amp-triggered compaction, which always includes
+  //     the final sorted run.
+  if (exclusive_manual_compaction_) {
+    return;
+  }
+
+  Options opts = CurrentOptions();
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.compaction_options_universal.max_size_amplification_percent = 50;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compression = kNoCompression;
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.max_background_compactions = 2;
+  opts.num_levels = num_levels_;
+  Reopen(opts);
+
+  // make sure compaction jobs can be parallelized
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1);
+  ColumnFamilyMetaData cf_meta;
+  ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily();
+  dbfull()->GetColumnFamilyMetaData(default_cfh, &cf_meta);
+  ASSERT_EQ(1, cf_meta.levels[num_levels_ - 1].files.size());
+  std::string first_sst_filename =
+      cf_meta.levels[num_levels_ - 1].files[0].name;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"CompactFilesImpl:0",
+        "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"},
+       {"DBImpl::BackgroundCompaction():AfterPickCompaction",
+        "CompactFilesImpl:1"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread compact_files_thread([&]() {
+    ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh,
+                                     {first_sst_filename}, num_levels_ - 1));
+  });
+
+  TEST_SYNC_POINT(
+      "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0");
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_OK(Put("key", "val"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  compact_files_thread.join();
+}
+
+INSTANTIATE_TEST_CASE_P(NumLevels, DBTestUniversalCompaction,
+                        ::testing::Combine(::testing::Values(1, 3, 5),
+                                           ::testing::Bool()));
+
+class DBTestUniversalManualCompactionOutputPathId
+    : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalManualCompactionOutputPathId()
+      : DBTestUniversalCompactionBase(
+            "/db_universal_compaction_manual_pid_test") {}
+};
+
+TEST_P(DBTestUniversalManualCompactionOutputPathId,
+       ManualCompactionOutputPathId) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_paths.emplace_back(dbname_, 1000000000);
+  options.db_paths.emplace_back(dbname_ + "_2", 1000000000);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.target_file_size_base = 1 << 30;  // Big size
+  options.level0_file_num_compaction_trigger = 10;
+  Destroy(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  MakeTables(3, "p", "q", 1);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 0
+  CompactRangeOptions compact_options;
+  compact_options.target_path_id = 1;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  MakeTables(1, "p", "q", 1);
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 0
+  compact_options.target_path_id = 0;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Fail when compacting to an invalid path ID
+  compact_options.target_path_id = 2;
+  compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+  ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+                  .IsInvalidArgument());
+}
+
+INSTANTIATE_TEST_CASE_P(OutputPathId,
+                        DBTestUniversalManualCompactionOutputPathId,
+                        ::testing::Combine(::testing::Values(1, 8),
+                                           ::testing::Bool()));
+
+TEST_F(DBTestUniversalCompaction2, BasicL0toL1) {
+  const int kNumKeys = 3000;
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.compression = kNoCompression;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 2000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  //  MoveFilesToLevel(6);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  for (i = 1999; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      ASSERT_OK(Delete(Key(i)));
+    } else {
+      ASSERT_OK(Put(Key(i), "val"));
+    }
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+#if defined(ENABLE_SINGLE_LEVEL_DTC)
+TEST_F(DBTestUniversalCompaction2, SingleLevel) {
+  const int kNumKeys = 3000;
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.compression = kNoCompression;
+  opts.num_levels = 1;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 2000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+
+  for (i = 1999; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      ASSERT_OK(Delete(Key(i)));
+    } else {
+      ASSERT_OK(Put(Key(i), "val"));
+    }
+  }
+  ASSERT_OK(Flush()(;
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+#endif  // ENABLE_SINGLE_LEVEL_DTC
+
+TEST_F(DBTestUniversalCompaction2, MultipleLevels) {
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 4;
+  opts.compression = kNoCompression;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 500; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  for (i = 500; i < 1000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  for (i = 1000; i < 1500; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  for (i = 1500; i < 2000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+  for (i = 1999; i < 2333; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  for (i = 2333; i < 2666; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  for (i = 2666; i < 2999; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+  ASSERT_GT(NumTableFilesAtLevel(5), 0);
+
+  for (i = 1900; i < 2100; ++i) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(1));
+  ASSERT_EQ(0, NumTableFilesAtLevel(2));
+  ASSERT_EQ(0, NumTableFilesAtLevel(3));
+  ASSERT_EQ(0, NumTableFilesAtLevel(4));
+  ASSERT_EQ(0, NumTableFilesAtLevel(5));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, OverlappingL0) {
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 5;
+  opts.compression = kNoCompression;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 2000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  for (i = 2000; i < 3000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  for (i = 3500; i < 4000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  for (i = 2900; i < 3100; ++i) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, IngestBehind) {
+  const int kNumKeys = 3000;
+  const int kWindowSize = 100;
+  const int kNumDelsTrigger = 90;
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 2;
+  opts.compression = kNoCompression;
+  opts.allow_ingest_behind = true;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  int i;
+  for (i = 0; i < 2000; ++i) {
+    ASSERT_OK(Put(Key(i), "val"));
+  }
+  ASSERT_OK(Flush());
+  //  MoveFilesToLevel(6);
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  for (i = 1999; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      ASSERT_OK(Delete(Key(i)));
+    } else {
+      ASSERT_OK(Put(Key(i), "val"));
+    }
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(6));
+  ASSERT_GT(NumTableFilesAtLevel(5), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.env = env_;
+  KeepFilterFactory* filter = new KeepFilterFactory(true);
+  options.compaction_filter_factory.reset(filter);
+  Reopen(options);
+  ASSERT_EQ(30 * 24 * 60 * 60,
+            dbfull()->GetOptions().periodic_compaction_seconds);
+
+  KeepFilter df;
+  options.compaction_filter_factory.reset();
+  options.compaction_filter = &df;
+  Reopen(options);
+  ASSERT_EQ(30 * 24 * 60 * 60,
+            dbfull()->GetOptions().periodic_compaction_seconds);
+
+  options.ttl = 60 * 24 * 60 * 60;
+  options.compaction_filter = nullptr;
+  Reopen(options);
+  ASSERT_EQ(60 * 24 * 60 * 60,
+            dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) {
+  Options opts = CurrentOptions();
+  opts.env = env_;
+  opts.compaction_style = kCompactionStyleUniversal;
+  opts.level0_file_num_compaction_trigger = 10;
+  opts.max_open_files = -1;
+  opts.compaction_options_universal.size_ratio = 10;
+  opts.compaction_options_universal.min_merge_width = 2;
+  opts.compaction_options_universal.max_size_amplification_percent = 200;
+  opts.periodic_compaction_seconds = 48 * 60 * 60;  // 2 days
+  opts.num_levels = 5;
+  env_->SetMockSleep();
+  Reopen(opts);
+
+  // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+  int periodic_compactions = 0;
+  int start_level = -1;
+  int output_level = -1;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionPicker::PickPeriodicCompaction:Return",
+      [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_TRUE(arg != nullptr);
+        ASSERT_TRUE(compaction->compaction_reason() ==
+                    CompactionReason::kPeriodicCompaction);
+        start_level = compaction->start_level();
+        output_level = compaction->output_level();
+        periodic_compactions++;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Case 1: Oldest flushed file excceeds periodic compaction threshold.
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(0, periodic_compactions);
+  // Move clock forward so that the flushed file would qualify periodic
+  // compaction.
+  env_->MockSleepForSeconds(48 * 60 * 60 + 100);
+
+  // Another flush would trigger compaction the oldest file.
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(1, periodic_compactions);
+  ASSERT_EQ(0, start_level);
+  ASSERT_EQ(4, output_level);
+
+  // Case 2: Oldest compacted file excceeds periodic compaction threshold
+  periodic_compactions = 0;
+  // A flush doesn't trigger a periodic compaction when threshold not hit
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(0, periodic_compactions);
+
+  // After periodic compaction threshold hits, a flush will trigger
+  // a compaction
+  ASSERT_OK(Put("foo", "bar2"));
+  env_->MockSleepForSeconds(48 * 60 * 60 + 100);
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(1, periodic_compactions);
+  ASSERT_EQ(0, start_level);
+  ASSERT_EQ(4, output_level);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  (void)argc;
+  (void)argv;
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_wal_test.cc b/src/rocksdb/db/db_wal_test.cc
new file mode 100644
index 000000000..5b5ec76af
--- /dev/null
+++ b/src/rocksdb/db/db_wal_test.cc
@@ -0,0 +1,2314 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/file_system.h"
+#include "test_util/sync_point.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBWALTestBase : public DBTestBase {
+ protected:
+  explicit DBWALTestBase(const std::string& dir_name)
+      : DBTestBase(dir_name, /*env_do_fsync=*/true) {}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+ public:
+#if defined(ROCKSDB_FALLOCATE_PRESENT)
+  bool IsFallocateSupported() {
+    // Test fallocate support of running file system.
+    // Skip this test if fallocate is not supported.
+    std::string fname_test_fallocate = dbname_ + "/preallocate_testfile";
+    int fd = -1;
+    do {
+      fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+    } while (fd < 0 && errno == EINTR);
+    assert(fd > 0);
+    int alloc_status = fallocate(fd, 0, 0, 1);
+    int err_number = errno;
+    close(fd);
+    assert(env_->DeleteFile(fname_test_fallocate) == Status::OK());
+    if (err_number == ENOSYS || err_number == EOPNOTSUPP) {
+      fprintf(stderr, "Skipped preallocated space check: %s\n",
+              errnoStr(err_number).c_str());
+      return false;
+    }
+    assert(alloc_status == 0);
+    return true;
+  }
+#endif  // ROCKSDB_FALLOCATE_PRESENT
+
+  uint64_t GetAllocatedFileSize(std::string file_name) {
+    struct stat sbuf;
+    int err = stat(file_name.c_str(), &sbuf);
+    assert(err == 0);
+    return sbuf.st_blocks * 512;
+  }
+#endif  // ROCKSDB_PLATFORM_POSIX
+};
+
+class DBWALTest : public DBWALTestBase {
+ public:
+  DBWALTest() : DBWALTestBase("/db_wal_test") {}
+};
+
+// A SpecialEnv enriched to give more insight about deleted files
+class EnrichedSpecialEnv : public SpecialEnv {
+ public:
+  explicit EnrichedSpecialEnv(Env* base) : SpecialEnv(base) {}
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& soptions) override {
+    InstrumentedMutexLock l(&env_mutex_);
+    if (f == skipped_wal) {
+      deleted_wal_reopened = true;
+      if (IsWAL(f) && largest_deleted_wal.size() != 0 &&
+          f.compare(largest_deleted_wal) <= 0) {
+        gap_in_wals = true;
+      }
+    }
+    return SpecialEnv::NewSequentialFile(f, r, soptions);
+  }
+  Status DeleteFile(const std::string& fname) override {
+    if (IsWAL(fname)) {
+      deleted_wal_cnt++;
+      InstrumentedMutexLock l(&env_mutex_);
+      // If this is the first WAL, remember its name and skip deleting it. We
+      // remember its name partly because the application might attempt to
+      // delete the file again.
+      if (skipped_wal.size() != 0 && skipped_wal != fname) {
+        if (largest_deleted_wal.size() == 0 ||
+            largest_deleted_wal.compare(fname) < 0) {
+          largest_deleted_wal = fname;
+        }
+      } else {
+        skipped_wal = fname;
+        return Status::OK();
+      }
+    }
+    return SpecialEnv::DeleteFile(fname);
+  }
+  bool IsWAL(const std::string& fname) {
+    // printf("iswal %s\n", fname.c_str());
+    return fname.compare(fname.size() - 3, 3, "log") == 0;
+  }
+
+  InstrumentedMutex env_mutex_;
+  // the wal whose actual delete was skipped by the env
+  std::string skipped_wal = "";
+  // the largest WAL that was requested to be deleted
+  std::string largest_deleted_wal = "";
+  // number of WALs that were successfully deleted
+  std::atomic<size_t> deleted_wal_cnt = {0};
+  // the WAL whose delete from fs was skipped is reopened during recovery
+  std::atomic<bool> deleted_wal_reopened = {false};
+  // whether a gap in the WALs was detected during recovery
+  std::atomic<bool> gap_in_wals = {false};
+};
+
+class DBWALTestWithEnrichedEnv : public DBTestBase {
+ public:
+  DBWALTestWithEnrichedEnv()
+      : DBTestBase("db_wal_test", /*env_do_fsync=*/true) {
+    enriched_env_ = new EnrichedSpecialEnv(env_->target());
+    auto options = CurrentOptions();
+    options.env = enriched_env_;
+    options.allow_2pc = true;
+    Reopen(options);
+    delete env_;
+    // to be deleted by the parent class
+    env_ = enriched_env_;
+  }
+
+ protected:
+  EnrichedSpecialEnv* enriched_env_;
+};
+
+// Test that the recovery would successfully avoid the gaps between the logs.
+// One known scenario that could cause this is that the application issue the
+// WAL deletion out of order. For the sake of simplicity in the test, here we
+// create the gap by manipulating the env to skip deletion of the first WAL but
+// not the ones after it.
+TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) {
+  auto options = last_options_;
+  // To cause frequent WAL deletion
+  options.write_buffer_size = 128;
+  Reopen(options);
+
+  WriteOptions writeOpt = WriteOptions();
+  for (int i = 0; i < 128 * 5; i++) {
+    ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
+  }
+  FlushOptions fo;
+  fo.wait = true;
+  ASSERT_OK(db_->Flush(fo));
+
+  // some wals are deleted
+  ASSERT_NE(0, enriched_env_->deleted_wal_cnt);
+  // but not the first one
+  ASSERT_NE(0, enriched_env_->skipped_wal.size());
+
+  // Test that the WAL that was not deleted will be skipped during recovery
+  options = last_options_;
+  Reopen(options);
+  ASSERT_FALSE(enriched_env_->deleted_wal_reopened);
+  ASSERT_FALSE(enriched_env_->gap_in_wals);
+}
+
+TEST_F(DBWALTest, WAL) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Both value's should be present.
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // again both values should be present.
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_EQ("v3", Get(1, "bar"));
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RollLog) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "baz", "v5"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    }
+    ASSERT_OK(Put(1, "foo", "v4"));
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    }
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, SyncWALNotBlockWrite) {
+  Options options = CurrentOptions();
+  options.max_write_buffer_number = 4;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("foo5", "bar5"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"WritableFileWriter::SyncWithoutFlush:1",
+       "DBWALTest::SyncWALNotBlockWrite:1"},
+      {"DBWALTest::SyncWALNotBlockWrite:2",
+       "WritableFileWriter::SyncWithoutFlush:2"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread thread([&]() { ASSERT_OK(db_->SyncWAL()); });
+
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:1");
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Put("foo3", "bar3"));
+  FlushOptions fo;
+  fo.wait = false;
+  ASSERT_OK(db_->Flush(fo));
+  ASSERT_OK(Put("foo4", "bar4"));
+
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:2");
+
+  thread.join();
+
+  ASSERT_EQ(Get("foo1"), "bar1");
+  ASSERT_EQ(Get("foo2"), "bar2");
+  ASSERT_EQ(Get("foo3"), "bar3");
+  ASSERT_EQ(Get("foo4"), "bar4");
+  ASSERT_EQ(Get("foo5"), "bar5");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, SyncWALNotWaitWrite) {
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("foo3", "bar3"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"SpecialEnv::WalFile::Append:1", "DBWALTest::SyncWALNotWaitWrite:1"},
+      {"DBWALTest::SyncWALNotWaitWrite:2", "SpecialEnv::WalFile::Append:2"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread thread(
+      [&]() { ASSERT_OK(Put("foo2", "bar2")); });
+  // Moving this to SyncWAL before the actual fsync
+  // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+  ASSERT_OK(db_->SyncWAL());
+  // Moving this to SyncWAL after actual fsync
+  // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+  thread.join();
+
+  ASSERT_EQ(Get("foo1"), "bar1");
+  ASSERT_EQ(Get("foo2"), "bar2");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, Recover) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "baz", "v5"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+    ASSERT_OK(Put(1, "bar", "v2"));
+    ASSERT_OK(Put(1, "foo", "v3"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v4"));
+    ASSERT_EQ("v4", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithTableHandle) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.disable_auto_compactions = true;
+    options.avoid_flush_during_recovery = false;
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "bar", "v2"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "foo", "v3"));
+    ASSERT_OK(Put(1, "bar", "v4"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "big", std::string(100, 'a')));
+
+    options = CurrentOptions();
+    const int kSmallMaxOpenFiles = 13;
+    if (option_config_ == kDBLogDir) {
+      // Use this option to check not preloading files
+      // Set the max open files to be small enough so no preload will
+      // happen.
+      options.max_open_files = kSmallMaxOpenFiles;
+      // RocksDB sanitize max open files to at least 20. Modify it back.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+            int* max_open_files = static_cast<int*>(arg);
+            *max_open_files = kSmallMaxOpenFiles;
+          });
+
+    } else if (option_config_ == kWalDirAndMmapReads) {
+      // Use this option to check always loading all files.
+      options.max_open_files = 100;
+    } else {
+      options.max_open_files = -1;
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    std::vector<std::vector<FileMetaData>> files;
+    dbfull()->TEST_GetFilesMetaData(handles_[1], &files);
+    size_t total_files = 0;
+    for (const auto& level : files) {
+      total_files += level.size();
+    }
+    ASSERT_EQ(total_files, 3);
+    for (const auto& level : files) {
+      for (const auto& file : level) {
+        if (options.max_open_files == kSmallMaxOpenFiles) {
+          ASSERT_TRUE(file.table_reader_handle == nullptr);
+        } else {
+          ASSERT_TRUE(file.table_reader_handle != nullptr);
+        }
+      }
+    }
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithBlob) {
+  // Write a value that's below the prospective size limit for blobs and another
+  // one that's above. Note that blob files are not actually enabled at this
+  // point.
+  constexpr uint64_t min_blob_size = 10;
+
+  constexpr char short_value[] = "short";
+  static_assert(sizeof(short_value) - 1 < min_blob_size,
+                "short_value too long");
+
+  constexpr char long_value[] = "long_value";
+  static_assert(sizeof(long_value) - 1 >= min_blob_size,
+                "long_value too short");
+
+  ASSERT_OK(Put("key1", short_value));
+  ASSERT_OK(Put("key2", long_value));
+
+  // There should be no files just yet since we haven't flushed.
+  {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    ASSERT_NE(versions, nullptr);
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    ASSERT_NE(cfd, nullptr);
+
+    Version* const current = cfd->current();
+    ASSERT_NE(current, nullptr);
+
+    const VersionStorageInfo* const storage_info = current->storage_info();
+    ASSERT_NE(storage_info, nullptr);
+
+    ASSERT_EQ(storage_info->num_non_empty_levels(), 0);
+    ASSERT_TRUE(storage_info->GetBlobFiles().empty());
+  }
+
+  // Reopen the database with blob files enabled. A new table file/blob file
+  // pair should be written during recovery.
+  Options options;
+  options.enable_blob_files = true;
+  options.min_blob_size = min_blob_size;
+  options.avoid_flush_during_recovery = false;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  Reopen(options);
+
+  ASSERT_EQ(Get("key1"), short_value);
+  ASSERT_EQ(Get("key2"), long_value);
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  ASSERT_NE(versions, nullptr);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+
+  const auto& l0_files = storage_info->LevelFiles(0);
+  ASSERT_EQ(l0_files.size(), 1);
+
+  const FileMetaData* const table_file = l0_files[0];
+  ASSERT_NE(table_file, nullptr);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 1);
+
+  const auto& blob_file = blob_files.front();
+  ASSERT_NE(blob_file, nullptr);
+
+  ASSERT_EQ(table_file->smallest.user_key(), "key1");
+  ASSERT_EQ(table_file->largest.user_key(), "key2");
+  ASSERT_EQ(table_file->fd.smallest_seqno, 1);
+  ASSERT_EQ(table_file->fd.largest_seqno, 2);
+  ASSERT_EQ(table_file->oldest_blob_file_number,
+            blob_file->GetBlobFileNumber());
+
+  ASSERT_EQ(blob_file->GetTotalBlobCount(), 1);
+
+#ifndef ROCKSDB_LITE
+  const InternalStats* const internal_stats = cfd->internal_stats();
+  ASSERT_NE(internal_stats, nullptr);
+
+  const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+  ASSERT_FALSE(compaction_stats.empty());
+  ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize());
+  ASSERT_EQ(compaction_stats[0].bytes_written_blob,
+            blob_file->GetTotalBlobBytes());
+  ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+  ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1);
+
+  const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue();
+  ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+            compaction_stats[0].bytes_written +
+                compaction_stats[0].bytes_written_blob);
+#endif  // ROCKSDB_LITE
+}
+
+TEST_F(DBWALTest, RecoverWithBlobMultiSST) {
+  // Write several large (4 KB) values without flushing. Note that blob files
+  // are not actually enabled at this point.
+  std::string large_value(1 << 12, 'a');
+
+  constexpr int num_keys = 64;
+
+  for (int i = 0; i < num_keys; ++i) {
+    ASSERT_OK(Put(Key(i), large_value));
+  }
+
+  // There should be no files just yet since we haven't flushed.
+  {
+    VersionSet* const versions = dbfull()->GetVersionSet();
+    ASSERT_NE(versions, nullptr);
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    ASSERT_NE(cfd, nullptr);
+
+    Version* const current = cfd->current();
+    ASSERT_NE(current, nullptr);
+
+    const VersionStorageInfo* const storage_info = current->storage_info();
+    ASSERT_NE(storage_info, nullptr);
+
+    ASSERT_EQ(storage_info->num_non_empty_levels(), 0);
+    ASSERT_TRUE(storage_info->GetBlobFiles().empty());
+  }
+
+  // Reopen the database with blob files enabled and write buffer size set to a
+  // smaller value. Multiple table files+blob files should be written and added
+  // to the Version during recovery.
+  Options options;
+  options.write_buffer_size = 1 << 16;  // 64 KB
+  options.enable_blob_files = true;
+  options.avoid_flush_during_recovery = false;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  Reopen(options);
+
+  for (int i = 0; i < num_keys; ++i) {
+    ASSERT_EQ(Get(Key(i)), large_value);
+  }
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  ASSERT_NE(versions, nullptr);
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  ASSERT_NE(cfd, nullptr);
+
+  Version* const current = cfd->current();
+  ASSERT_NE(current, nullptr);
+
+  const VersionStorageInfo* const storage_info = current->storage_info();
+  ASSERT_NE(storage_info, nullptr);
+
+  const auto& l0_files = storage_info->LevelFiles(0);
+  ASSERT_GT(l0_files.size(), 1);
+
+  const auto& blob_files = storage_info->GetBlobFiles();
+  ASSERT_GT(blob_files.size(), 1);
+
+  ASSERT_EQ(l0_files.size(), blob_files.size());
+}
+
+TEST_F(DBWALTest, WALWithChecksumHandoff) {
+#ifndef ROCKSDB_ASSERT_STATUS_CHECKED
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  do {
+    Options options = CurrentOptions();
+
+    options.checksum_handoff_file_types.Add(FileType::kWalFile);
+    options.env = fault_fs_env.get();
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+
+    CreateAndReopenWithCF({"pikachu"}, options);
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Both value's should be present.
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+
+    writeOpt.disableWAL = true;
+    // This put, data is persisted by Flush
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    writeOpt.disableWAL = false;
+    // Data is persisted in the WAL
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "zoo", "v3"));
+    // The hash does not match, write fails
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+    writeOpt.disableWAL = false;
+    ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Due to the write failure, Get should not find
+    ASSERT_NE("v3", Get(1, "foo"));
+    ASSERT_EQ("v3", Get(1, "zoo"));
+    ASSERT_EQ("v3", Get(1, "bar"));
+
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+    // Each write will be similated as corrupted.
+    fault_fs->IngestDataCorruptionBeforeWrite();
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v4"));
+    writeOpt.disableWAL = false;
+    ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v4"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_NE("v4", Get(1, "foo"));
+    ASSERT_NE("v4", Get(1, "bar"));
+    fault_fs->NoDataCorruptionBeforeWrite();
+
+    fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+    // The file system does not provide checksum method and verification.
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v5"));
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v5"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v5", Get(1, "foo"));
+    ASSERT_EQ("v5", Get(1, "bar"));
+
+    Destroy(options);
+  } while (ChangeWalOptions());
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+}
+
+class DBRecoveryTestBlobError
+    : public DBWALTest,
+      public testing::WithParamInterface<std::string> {
+ public:
+  DBRecoveryTestBlobError() : sync_point_(GetParam()) {}
+
+  std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBRecoveryTestBlobError, DBRecoveryTestBlobError,
+                        ::testing::ValuesIn(std::vector<std::string>{
+                            "BlobFileBuilder::WriteBlobToFile:AddRecord",
+                            "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBRecoveryTestBlobError, RecoverWithBlobError) {
+  // Write a value. Note that blob files are not actually enabled at this point.
+  ASSERT_OK(Put("key", "blob"));
+
+  // Reopen with blob files enabled but make blob file writing fail during
+  // recovery.
+  SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+    Status* const s = static_cast<Status*>(arg);
+    assert(s);
+
+    (*s) = Status::IOError(sync_point_);
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.enable_blob_files = true;
+  options.avoid_flush_during_recovery = false;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+
+  ASSERT_NOK(TryReopen(options));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Make sure the files generated by the failed recovery have been deleted.
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  for (const auto& file : files) {
+    uint64_t number = 0;
+    FileType type = kTableFile;
+
+    if (!ParseFileName(file, &number, &type)) {
+      continue;
+    }
+
+    ASSERT_NE(type, kTableFile);
+    ASSERT_NE(type, kBlobFile);
+  }
+}
+
+TEST_F(DBWALTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
+
+  do {
+    // delete old files in backup_logs directory
+    ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+    std::vector<std::string> old_files;
+    ASSERT_OK(env_->GetChildren(backup_logs, &old_files));
+    for (auto& file : old_files) {
+      ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file));
+    }
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+    options.wal_dir = dbname_ + "/logs";
+    DestroyAndReopen(options);
+
+    // fill up the DB
+    std::string one, two;
+    PutFixed64(&one, 1);
+    PutFixed64(&two, 2);
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
+
+    // copy the logs to backup
+    std::vector<std::string> logs;
+    ASSERT_OK(env_->GetChildren(options.wal_dir, &logs));
+    for (auto& log : logs) {
+      CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
+    }
+
+    // recover the DB
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
+
+    // copy the logs from backup back to wal dir
+    for (auto& log : logs) {
+      CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+    }
+    // this should ignore the log files, recovery should not happen again
+    // if the recovery happens, the same merge operator would be called twice,
+    // leading to incorrect results
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
+    Destroy(options);
+    Reopen(options);
+    Close();
+
+    // copy the logs from backup back to wal dir
+    ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir));
+    for (auto& log : logs) {
+      CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+    }
+    // assert that we successfully recovered only from logs, even though we
+    // destroyed the DB
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+
+    // Recovery will fail if DB directory doesn't exist.
+    Destroy(options);
+    // copy the logs from backup back to wal dir
+    ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir));
+    for (auto& log : logs) {
+      CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      // we won't be needing this file no more
+      ASSERT_OK(env_->DeleteFile(backup_logs + "/" + log));
+    }
+    Status s = TryReopen(options);
+    ASSERT_NOK(s);
+    Destroy(options);
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithEmptyLog) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v3"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v3", Get(1, "foo"));
+  } while (ChangeWalOptions());
+}
+
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBWALTest, PreallocateBlock) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1000 * 1000;
+  options.max_total_wal_size = 0;
+
+  size_t expected_preallocation_size = static_cast<size_t>(
+      options.write_buffer_size + options.write_buffer_size / 10);
+
+  DestroyAndReopen(options);
+
+  std::atomic<int> called(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("", ""));
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+
+  options.max_total_wal_size = 1000 * 1000;
+  expected_preallocation_size = static_cast<size_t>(options.max_total_wal_size);
+  Reopen(options);
+  called.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("", ""));
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+
+  options.db_write_buffer_size = 800 * 1000;
+  expected_preallocation_size =
+      static_cast<size_t>(options.db_write_buffer_size);
+  Reopen(options);
+  called.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("", ""));
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+
+  expected_preallocation_size = 700 * 1000;
+  std::shared_ptr<WriteBufferManager> write_buffer_manager =
+      std::make_shared<WriteBufferManager>(static_cast<uint64_t>(700 * 1000));
+  options.write_buffer_manager = write_buffer_manager;
+  Reopen(options);
+  called.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        ASSERT_EQ(expected_preallocation_size, preallocation_size);
+        called.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(Put("", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("", ""));
+  Close();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(2, called.load());
+}
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, DISABLED_FullPurgePreservesRecycledLog) {
+  // TODO(ajkr): Disabled until WAL recycling is fixed for
+  // `kPointInTimeRecovery`.
+
+  // For github issue #1303
+  for (int i = 0; i < 2; ++i) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.recycle_log_file_num = 2;
+    if (i != 0) {
+      options.wal_dir = alternative_wal_dir_;
+    }
+
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo", "v1"));
+    VectorLogPtr log_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+    ASSERT_GT(log_files.size(), 0);
+    ASSERT_OK(Flush());
+
+    // Now the original WAL is in log_files[0] and should be marked for
+    // recycling.
+    // Verify full purge cannot remove this file.
+    JobContext job_context(0);
+    dbfull()->TEST_LockMutex();
+    dbfull()->FindObsoleteFiles(&job_context, true /* force */);
+    dbfull()->TEST_UnlockMutex();
+    dbfull()->PurgeObsoleteFiles(job_context);
+
+    if (i == 0) {
+      ASSERT_OK(
+          env_->FileExists(LogFileName(dbname_, log_files[0]->LogNumber())));
+    } else {
+      ASSERT_OK(env_->FileExists(
+          LogFileName(alternative_wal_dir_, log_files[0]->LogNumber())));
+    }
+  }
+}
+
+TEST_F(DBWALTest, DISABLED_FullPurgePreservesLogPendingReuse) {
+  // TODO(ajkr): Disabled until WAL recycling is fixed for
+  // `kPointInTimeRecovery`.
+
+  // Ensures full purge cannot delete a WAL while it's in the process of being
+  // recycled. In particular, we force the full purge after a file has been
+  // chosen for reuse, but before it has been renamed.
+  for (int i = 0; i < 2; ++i) {
+    Options options = CurrentOptions();
+    options.recycle_log_file_num = 1;
+    if (i != 0) {
+      options.wal_dir = alternative_wal_dir_;
+    }
+    DestroyAndReopen(options);
+
+    // The first flush creates a second log so writes can continue before the
+    // flush finishes.
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+
+    // The second flush can recycle the first log. Sync points enforce the
+    // full purge happens after choosing the log to recycle and before it is
+    // renamed.
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {"DBImpl::CreateWAL:BeforeReuseWritableFile1",
+         "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge"},
+        {"DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge",
+         "DBImpl::CreateWAL:BeforeReuseWritableFile2"},
+    });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ROCKSDB_NAMESPACE::port::Thread thread([&]() {
+      TEST_SYNC_POINT(
+          "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge");
+      ASSERT_OK(db_->EnableFileDeletions(true));
+      TEST_SYNC_POINT(
+          "DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge");
+    });
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    thread.join();
+  }
+}
+
+TEST_F(DBWALTest, GetSortedWalFiles) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    VectorLogPtr log_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+    ASSERT_EQ(0, log_files.size());
+
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+    ASSERT_EQ(1, log_files.size());
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, GetCurrentWalFile) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+    std::unique_ptr<LogFile>* bad_log_file = nullptr;
+    ASSERT_NOK(dbfull()->GetCurrentWalFile(bad_log_file));
+
+    std::unique_ptr<LogFile> log_file;
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    // nothing has been written to the log yet
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_EQ(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // add some data and verify that the file size actually moves foward
+    ASSERT_OK(Put(0, "foo", "v1"));
+    ASSERT_OK(Put(0, "foo2", "v2"));
+    ASSERT_OK(Put(0, "foo3", "v3"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // force log files to cycle and add some more data, then check if
+    // log number moves forward
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    }
+
+    ASSERT_OK(Put(0, "foo4", "v4"));
+    ASSERT_OK(Put(0, "foo5", "v5"));
+    ASSERT_OK(Put(0, "foo6", "v6"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) {
+  // Test for regression of WAL cleanup missing files that don't contain data
+  // for every column family.
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    uint64_t earliest_log_nums[2];
+    for (int i = 0; i < 2; ++i) {
+      if (i > 0) {
+        ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+      }
+      VectorLogPtr log_files;
+      ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+      if (log_files.size() > 0) {
+        earliest_log_nums[i] = log_files[0]->LogNumber();
+      } else {
+        earliest_log_nums[i] = std::numeric_limits<uint64_t>::max();
+      }
+    }
+    // Check at least the first WAL was cleaned up during the recovery.
+    ASSERT_LT(earliest_log_nums[0], earliest_log_nums[1]);
+  } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithLargeLog) {
+  do {
+    {
+      Options options = CurrentOptions();
+      CreateAndReopenWithCF({"pikachu"}, options);
+      ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+      ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+      ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+      ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    }
+
+    // Make sure that if we re-open with a small write buffer size that
+    // we flush table files in the middle of a large log file.
+    Options options;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+    ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+    ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+    ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+    ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+  } while (ChangeWalOptions());
+}
+
+// In https://reviews.facebook.net/D20661 we change
+// recovery behavior: previously for each log file each column family
+// memtable was flushed, even it was empty. Now it's changed:
+// we try to create the smallest number of table files by merging
+// updates from multiple logs
+TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 5000000;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  // Since we will reopen DB with smaller write_buffer_size,
+  // each key will go to new SST file
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // Make 'dobrynia' to be flushed and new WAL file to be created
+  ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    // Make sure 'dobrynia' was flushed: check sst files amount
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+  }
+  // New WAL file
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+
+  options.write_buffer_size = 4096;
+  options.arena_block_size = 4096;
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    // No inserts => default is empty
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(0));
+    // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(5));
+    // 1 SST for big key + 1 SST for small one
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(2));
+    // 1 SST for all keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+}
+
+// In https://reviews.facebook.net/D20661 we change
+// recovery behavior: previously for each log file each column family
+// memtable was flushed, even it wasn't empty. Now it's changed:
+// we try to create the smallest number of table files by merging
+// updates from multiple logs
+TEST_F(DBWALTest, RecoverCheckFileAmount) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000;
+  options.arena_block_size = 4 * 1024;
+  options.avoid_flush_during_recovery = false;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Make 'nikitich' memtable to be flushed
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+  // 4 memtable are not flushed, 1 sst file
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
+  // Memtable for 'nikitich' has flushed, new WAL file has opened
+  // 4 memtable still not flushed
+
+  // Write to new WAL file
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Fill up 'nikitich' one more time
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  // make it flush
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+  // There are still 4 memtable not flushed, and 2 sst tables
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
+  }
+
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
+    // Check, that records for 'default', 'dobrynia' and 'pikachu' from
+    // first, second and third WALs  went to the same SST.
+    // So, there is 6 SSTs: three  for 'nikitich', one for 'default', one for
+    // 'dobrynia', one for 'pikachu'
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(3));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(1));
+  }
+}
+
+TEST_F(DBWALTest, SyncMultipleLogs) {
+  const uint64_t kNumBatches = 2;
+  const int kBatchSize = 1000;
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  Reopen(options);
+
+  WriteBatch batch;
+  WriteOptions wo;
+  wo.sync = true;
+
+  for (uint64_t b = 0; b < kNumBatches; b++) {
+    batch.Clear();
+    for (int i = 0; i < kBatchSize; i++) {
+      ASSERT_OK(batch.Put(Key(i), DummyString(128)));
+    }
+
+    ASSERT_OK(dbfull()->Write(wo, &batch));
+  }
+
+  ASSERT_OK(dbfull()->SyncWAL());
+}
+
+// Github issue 1339. Prior the fix we read sequence id from the first log to
+// a local variable, then keep increase the variable as we replay logs,
+// ignoring actual sequence id of the records. This is incorrect if some writes
+// come with WAL disabled.
+TEST_F(DBWALTest, PartOfWritesWithWALDisabled) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = CurrentOptions();
+  options.env = fault_env.get();
+  options.disable_auto_compactions = true;
+  WriteOptions wal_on, wal_off;
+  wal_on.sync = true;
+  wal_on.disableWAL = false;
+  wal_off.disableWAL = true;
+  CreateAndReopenWithCF({"dummy"}, options);
+  ASSERT_OK(Put(1, "dummy", "d1", wal_on));  // seq id 1
+  ASSERT_OK(Put(1, "dummy", "d2", wal_off));
+  ASSERT_OK(Put(1, "dummy", "d3", wal_off));
+  ASSERT_OK(Put(0, "key", "v4", wal_on));  // seq id 4
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Put(0, "key", "v5", wal_on));  // seq id 5
+  ASSERT_EQ("v5", Get(0, "key"));
+  ASSERT_OK(dbfull()->FlushWAL(false));
+  // Simulate a crash.
+  fault_env->SetFilesystemActive(false);
+  Close();
+  fault_env->ResetState();
+  ReopenWithColumnFamilies({"default", "dummy"}, options);
+  // Prior to the fix, we may incorrectly recover "v5" with sequence id = 3.
+  ASSERT_EQ("v5", Get(0, "key"));
+  // Destroy DB before destruct fault_env.
+  Destroy(options);
+}
+
+//
+// Test WAL recovery for the various modes available
+//
+class RecoveryTestHelper {
+ public:
+  // Number of WAL files to generate
+  static constexpr int kWALFilesCount = 10;
+  // Starting number for the WAL file name like 00010.log
+  static constexpr int kWALFileOffset = 10;
+  // Keys to be written per WAL file
+  static constexpr int kKeysPerWALFile = 133;
+  // Size of the value
+  static constexpr int kValueSize = 96;
+
+  // Create WAL files with values filled in
+  static void FillData(DBWALTestBase* test, const Options& options,
+                       const size_t wal_count, size_t* count) {
+    // Calling internal functions requires sanitized options.
+    Options sanitized_options = SanitizeOptions(test->dbname_, options);
+    const ImmutableDBOptions db_options(sanitized_options);
+
+    *count = 0;
+
+    std::shared_ptr<Cache> table_cache = NewLRUCache(50, 0);
+    FileOptions file_options;
+    WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+
+    std::unique_ptr<VersionSet> versions;
+    std::unique_ptr<WalManager> wal_manager;
+    WriteController write_controller;
+
+    versions.reset(new VersionSet(
+        test->dbname_, &db_options, file_options, table_cache.get(),
+        &write_buffer_manager, &write_controller,
+        /*block_cache_tracer=*/nullptr,
+        /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ ""));
+
+    wal_manager.reset(
+        new WalManager(db_options, file_options, /*io_tracer=*/nullptr));
+
+    std::unique_ptr<log::Writer> current_log_writer;
+
+    for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) {
+      uint64_t current_log_number = j;
+      std::string fname = LogFileName(test->dbname_, current_log_number);
+      std::unique_ptr<WritableFileWriter> file_writer;
+      ASSERT_OK(WritableFileWriter::Create(db_options.env->GetFileSystem(),
+                                           fname, file_options, &file_writer,
+                                           nullptr));
+      log::Writer* log_writer =
+          new log::Writer(std::move(file_writer), current_log_number,
+                          db_options.recycle_log_file_num > 0, false,
+                          db_options.wal_compression);
+      ASSERT_OK(log_writer->AddCompressionTypeRecord());
+      current_log_writer.reset(log_writer);
+
+      WriteBatch batch;
+      for (int i = 0; i < kKeysPerWALFile; i++) {
+        std::string key = "key" + std::to_string((*count)++);
+        std::string value = test->DummyString(kValueSize);
+        ASSERT_NE(current_log_writer.get(), nullptr);
+        uint64_t seq = versions->LastSequence() + 1;
+        batch.Clear();
+        ASSERT_OK(batch.Put(key, value));
+        WriteBatchInternal::SetSequence(&batch, seq);
+        ASSERT_OK(current_log_writer->AddRecord(
+            WriteBatchInternal::Contents(&batch)));
+        versions->SetLastAllocatedSequence(seq);
+        versions->SetLastPublishedSequence(seq);
+        versions->SetLastSequence(seq);
+      }
+    }
+  }
+
+  // Recreate and fill the store with some data
+  static size_t FillData(DBWALTestBase* test, Options* options) {
+    options->create_if_missing = true;
+    test->DestroyAndReopen(*options);
+    test->Close();
+
+    size_t count = 0;
+    FillData(test, *options, kWALFilesCount, &count);
+    return count;
+  }
+
+  // Read back all the keys we wrote and return the number of keys found
+  static size_t GetData(DBWALTestBase* test) {
+    size_t count = 0;
+    for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) {
+      if (test->Get("key" + std::to_string(i)) != "NOT_FOUND") {
+        ++count;
+      }
+    }
+    return count;
+  }
+
+  // Manuall corrupt the specified WAL
+  static void CorruptWAL(DBWALTestBase* test, const Options& options,
+                         const double off, const double len,
+                         const int wal_file_id, const bool trunc = false) {
+    Env* env = options.env;
+    std::string fname = LogFileName(test->dbname_, wal_file_id);
+    uint64_t size;
+    ASSERT_OK(env->GetFileSize(fname, &size));
+    ASSERT_GT(size, 0);
+#ifdef OS_WIN
+    // Windows disk cache behaves differently. When we truncate
+    // the original content is still in the cache due to the original
+    // handle is still open. Generally, in Windows, one prohibits
+    // shared access to files and it is not needed for WAL but we allow
+    // it to induce corruption at various tests.
+    test->Close();
+#endif
+    if (trunc) {
+      ASSERT_OK(
+          test::TruncateFile(env, fname, static_cast<uint64_t>(size * off)));
+    } else {
+      ASSERT_OK(test::CorruptFile(env, fname, static_cast<int>(size * off + 8),
+                                  static_cast<int>(size * len), false));
+    }
+  }
+};
+
+class DBWALTestWithParams : public DBWALTestBase,
+                            public ::testing::WithParamInterface<
+                                std::tuple<bool, int, int, CompressionType>> {
+ public:
+  DBWALTestWithParams() : DBWALTestBase("/db_wal_test_with_params") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+    Wal, DBWALTestWithParams,
+    ::testing::Combine(::testing::Bool(), ::testing::Range(0, 4, 1),
+                       ::testing::Range(RecoveryTestHelper::kWALFileOffset,
+                                        RecoveryTestHelper::kWALFileOffset +
+                                            RecoveryTestHelper::kWALFilesCount,
+                                        1),
+                       ::testing::Values(CompressionType::kNoCompression,
+                                         CompressionType::kZSTD)));
+
+class DBWALTestWithParamsVaryingRecoveryMode
+    : public DBWALTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<bool, int, int, WALRecoveryMode, CompressionType>> {
+ public:
+  DBWALTestWithParamsVaryingRecoveryMode()
+      : DBWALTestBase("/db_wal_test_with_params_mode") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+    Wal, DBWALTestWithParamsVaryingRecoveryMode,
+    ::testing::Combine(
+        ::testing::Bool(), ::testing::Range(0, 4, 1),
+        ::testing::Range(RecoveryTestHelper::kWALFileOffset,
+                         RecoveryTestHelper::kWALFileOffset +
+                             RecoveryTestHelper::kWALFilesCount,
+                         1),
+        ::testing::Values(WALRecoveryMode::kTolerateCorruptedTailRecords,
+                          WALRecoveryMode::kAbsoluteConsistency,
+                          WALRecoveryMode::kPointInTimeRecovery,
+                          WALRecoveryMode::kSkipAnyCorruptedRecords),
+        ::testing::Values(CompressionType::kNoCompression,
+                          CompressionType::kZSTD)));
+
+// Test scope:
+// - We expect to open the data store when there is incomplete trailing writes
+// at the end of any of the logs
+// - We do not expect to open the data store for corruption
+TEST_P(DBWALTestWithParams, kTolerateCorruptedTailRecords) {
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+
+  // Fill data for testing
+  Options options = CurrentOptions();
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+  // test checksum failure or parsing
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+                                 /*len%=*/.1, wal_file_id, trunc);
+
+  options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+  if (trunc) {
+    options.create_if_missing = false;
+    ASSERT_OK(TryReopen(options));
+    const size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+    ASSERT_TRUE(corrupt_offset == 0 || recovered_row_count > 0);
+    ASSERT_LT(recovered_row_count, row_count);
+  } else {
+    ASSERT_NOK(TryReopen(options));
+  }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any corruption
+// (leading, middle or trailing -- incomplete writes or corruption)
+TEST_P(DBWALTestWithParams, kAbsoluteConsistency) {
+  // Verify clean slate behavior
+  Options options = CurrentOptions();
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+  ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count);
+
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+  // WAL compression type
+  CompressionType compression_type = std::get<3>(GetParam());
+  options.wal_compression = compression_type;
+
+  if (trunc && corrupt_offset == 0) {
+    return;
+  }
+
+  // fill with new date
+  RecoveryTestHelper::FillData(this, &options);
+  // corrupt the wal
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .33,
+                                 /*len%=*/.1, wal_file_id, trunc);
+  // verify
+  options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+  options.create_if_missing = false;
+  ASSERT_NOK(TryReopen(options));
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any inconsistency
+// between WAL and SST files
+TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+
+  // Create DB with multiple column families.
+  CreateAndReopenWithCF({"one", "two"}, options);
+  ASSERT_OK(Put(1, "key1", "val1"));
+  ASSERT_OK(Put(2, "key2", "val2"));
+
+  // Record the offset at this point
+  Env* env = options.env;
+  uint64_t wal_file_id = dbfull()->TEST_LogfileNumber();
+  std::string fname = LogFileName(dbname_, wal_file_id);
+  uint64_t offset_to_corrupt;
+  ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt));
+  ASSERT_GT(offset_to_corrupt, 0);
+
+  ASSERT_OK(Put(1, "key3", "val3"));
+  // Corrupt WAL at location of key3
+  ASSERT_OK(test::CorruptFile(env, fname, static_cast<int>(offset_to_corrupt),
+                              4, false));
+  ASSERT_OK(Put(2, "key4", "val4"));
+  ASSERT_OK(Put(1, "key5", "val5"));
+  ASSERT_OK(Flush(2));
+
+  // PIT recovery & verify
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
+}
+
+TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.track_and_verify_wals_in_manifest = true;
+  // The following make sure there are two bg flush threads.
+  options.max_background_jobs = 8;
+
+  DestroyAndReopen(options);
+
+  const std::string cf1_name("cf1");
+  CreateAndReopenWithCF({cf1_name}, options);
+  assert(handles_.size() == 2);
+
+  {
+    dbfull()->TEST_LockMutex();
+    ASSERT_LE(2, dbfull()->GetBGJobLimits().max_flushes);
+    dbfull()->TEST_UnlockMutex();
+  }
+
+  ASSERT_OK(dbfull()->PauseBackgroundWork());
+
+  ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(
+      /*wait=*/false, /*allow_write_stall=*/true, handles_[1]));
+
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(
+      /*wait=*/false, /*allow_write_stall=*/true, handles_[0]));
+
+  bool called = false;
+  std::atomic<int> bg_flush_threads{0};
+  std::atomic<bool> wal_synced{false};
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCallFlush:start", [&](void* /*arg*/) {
+        int cur = bg_flush_threads.load();
+        int desired = cur + 1;
+        if (cur > 0 ||
+            !bg_flush_threads.compare_exchange_strong(cur, desired)) {
+          while (!wal_synced.load()) {
+            // Wait until the other bg flush thread finishes committing WAL sync
+            // operation to the MANIFEST.
+          }
+        }
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushMemTableToOutputFile:CommitWal:1",
+      [&](void* /*arg*/) { wal_synced.store(true); });
+  // This callback will be called when the first bg flush thread reaches the
+  // point before entering the MANIFEST write queue after flushing the SST
+  // file.
+  // The purpose of the sync points here is to ensure both bg flush threads
+  // finish computing `min_wal_number_to_keep` before any of them updates the
+  // `log_number` for the column family that's being flushed.
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTableList::TryInstallMemtableFlushResults:AfterComputeMinWalToKeep",
+      [&](void* /*arg*/) {
+        dbfull()->mutex()->AssertHeld();
+        if (!called) {
+          // We are the first bg flush thread in the MANIFEST write queue.
+          // We set up the dependency between sync points for two threads that
+          // will be executing the same code.
+          // For the interleaving of events, see
+          // https://github.com/facebook/rocksdb/pull/9715.
+          // bg flush thread1 will release the db mutex while in the MANIFEST
+          // write queue. In the meantime, bg flush thread2 locks db mutex and
+          // computes the min_wal_number_to_keep (before thread1 writes to
+          // MANIFEST thus before cf1->log_number is updated). Bg thread2 joins
+          // the MANIFEST write queue afterwards and bg flush thread1 proceeds
+          // with writing to MANIFEST.
+          called = true;
+          SyncPoint::GetInstance()->LoadDependency({
+              {"VersionSet::LogAndApply:WriteManifestStart",
+               "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"},
+              {"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2",
+               "VersionSet::LogAndApply:WriteManifest"},
+          });
+        } else {
+          // The other bg flush thread has already been in the MANIFEST write
+          // queue, and we are after.
+          TEST_SYNC_POINT(
+              "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2");
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->ContinueBackgroundWork());
+
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+  ASSERT_TRUE(called);
+
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  DB* db1 = nullptr;
+  Status s = DB::OpenForReadOnly(options, dbname_, &db1);
+  ASSERT_OK(s);
+  assert(db1);
+  delete db1;
+}
+
+// Test scope:
+// - We expect to open data store under all circumstances
+// - We expect only data upto the point where the first error was encountered
+TEST_P(DBWALTestWithParams, kPointInTimeRecovery) {
+  const int maxkeys =
+      RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile;
+
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+  // WAL compression type
+  CompressionType compression_type = std::get<3>(GetParam());
+
+  // Fill data for testing
+  Options options = CurrentOptions();
+  options.wal_compression = compression_type;
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+  // Corrupt the wal
+  // The offset here was 0.3 which cuts off right at the end of a
+  // valid fragment after wal zstd compression checksum is enabled,
+  // so changed the value to 0.33.
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .33,
+                                 /*len%=*/.1, wal_file_id, trunc);
+
+  // Verify
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+
+  // Probe data for invariants
+  size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+  ASSERT_LT(recovered_row_count, row_count);
+
+  // Verify a prefix of keys were recovered. But not in the case of full WAL
+  // truncation, because we have no way to know there was a corruption when
+  // truncation happened on record boundaries (preventing recovery holes in
+  // that case requires using `track_and_verify_wals_in_manifest`).
+  if (!trunc || corrupt_offset != 0) {
+    bool expect_data = true;
+    for (size_t k = 0; k < maxkeys; ++k) {
+      bool found = Get("key" + std::to_string(k)) != "NOT_FOUND";
+      if (expect_data && !found) {
+        expect_data = false;
+      }
+      ASSERT_EQ(found, expect_data);
+    }
+  }
+
+  const size_t min = RecoveryTestHelper::kKeysPerWALFile *
+                     (wal_file_id - RecoveryTestHelper::kWALFileOffset);
+  ASSERT_GE(recovered_row_count, min);
+  if (!trunc && corrupt_offset != 0) {
+    const size_t max = RecoveryTestHelper::kKeysPerWALFile *
+                       (wal_file_id - RecoveryTestHelper::kWALFileOffset + 1);
+    ASSERT_LE(recovered_row_count, max);
+  }
+}
+
+// Test scope:
+// - We expect to open the data store under all scenarios
+// - We expect to have recovered records past the corruption zone
+TEST_P(DBWALTestWithParams, kSkipAnyCorruptedRecords) {
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+  // WAL compression type
+  CompressionType compression_type = std::get<3>(GetParam());
+
+  // Fill data for testing
+  Options options = CurrentOptions();
+  options.wal_compression = compression_type;
+  const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+  // Corrupt the WAL
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+                                 /*len%=*/.1, wal_file_id, trunc);
+
+  // Verify behavior
+  options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords;
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+
+  // Probe data for invariants
+  size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+  ASSERT_LT(recovered_row_count, row_count);
+
+  if (!trunc) {
+    ASSERT_TRUE(corrupt_offset != 0 || recovered_row_count > 0);
+  }
+}
+
+TEST_F(DBWALTest, AvoidFlushDuringRecovery) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.avoid_flush_during_recovery = false;
+
+  // Test with flush after recovery.
+  Reopen(options);
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "v3"));
+  ASSERT_OK(Put("bar", "v4"));
+  ASSERT_EQ(1, TotalTableFiles());
+  // Reopen DB. Check if WAL logs flushed.
+  Reopen(options);
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v4", Get("bar"));
+  ASSERT_EQ(2, TotalTableFiles());
+
+  // Test without flush after recovery.
+  options.avoid_flush_during_recovery = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v5"));
+  ASSERT_OK(Put("bar", "v6"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "v7"));
+  ASSERT_OK(Put("bar", "v8"));
+  ASSERT_EQ(1, TotalTableFiles());
+  // Reopen DB. WAL logs should not be flushed this time.
+  Reopen(options);
+  ASSERT_EQ("v7", Get("foo"));
+  ASSERT_EQ("v8", Get("bar"));
+  ASSERT_EQ(1, TotalTableFiles());
+
+  // Force flush with allow_2pc.
+  options.avoid_flush_during_recovery = true;
+  options.allow_2pc = true;
+  ASSERT_OK(Put("foo", "v9"));
+  ASSERT_OK(Put("bar", "v10"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo", "v11"));
+  ASSERT_OK(Put("bar", "v12"));
+  Reopen(options);
+  ASSERT_EQ("v11", Get("foo"));
+  ASSERT_EQ("v12", Get("bar"));
+  ASSERT_EQ(3, TotalTableFiles());
+}
+
+TEST_F(DBWALTest, WalCleanupAfterAvoidFlushDuringRecovery) {
+  // Verifies WAL files that were present during recovery, but not flushed due
+  // to avoid_flush_during_recovery, will be considered for deletion at a later
+  // stage. We check at least one such file is deleted during Flush().
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.avoid_flush_during_recovery = true;
+  Reopen(options);
+
+  ASSERT_OK(Put("foo", "v1"));
+  Reopen(options);
+  for (int i = 0; i < 2; ++i) {
+    if (i > 0) {
+      // Flush() triggers deletion of obsolete tracked files
+      ASSERT_OK(Flush());
+    }
+    VectorLogPtr log_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+    if (i == 0) {
+      ASSERT_GT(log_files.size(), 0);
+    } else {
+      ASSERT_EQ(0, log_files.size());
+    }
+  }
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlush) {
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+  options.create_if_missing = false;
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 64 * 1024 * 1024;
+
+  size_t count = RecoveryTestHelper::FillData(this, &options);
+  auto validateData = [this, count]() {
+    for (size_t i = 0; i < count; i++) {
+      ASSERT_NE(Get("key" + std::to_string(i)), "NOT_FOUND");
+    }
+  };
+  Reopen(options);
+  validateData();
+  // Insert some data without flush
+  ASSERT_OK(Put("foo", "foo_v1"));
+  ASSERT_OK(Put("bar", "bar_v1"));
+  Reopen(options);
+  validateData();
+  ASSERT_EQ(Get("foo"), "foo_v1");
+  ASSERT_EQ(Get("bar"), "bar_v1");
+  // Insert again and reopen
+  ASSERT_OK(Put("foo", "foo_v2"));
+  ASSERT_OK(Put("bar", "bar_v2"));
+  Reopen(options);
+  validateData();
+  ASSERT_EQ(Get("foo"), "foo_v2");
+  ASSERT_EQ(Get("bar"), "bar_v2");
+  // manual flush and insert again
+  ASSERT_OK(Flush());
+  ASSERT_EQ(Get("foo"), "foo_v2");
+  ASSERT_EQ(Get("bar"), "bar_v2");
+  ASSERT_OK(Put("foo", "foo_v3"));
+  ASSERT_OK(Put("bar", "bar_v3"));
+  Reopen(options);
+  validateData();
+  ASSERT_EQ(Get("foo"), "foo_v3");
+  ASSERT_EQ(Get("bar"), "bar_v3");
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) {
+  const std::string kSmallValue = "v";
+  const std::string kLargeValue = DummyString(1024);
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+  options.create_if_missing = false;
+  options.disable_auto_compactions = true;
+
+  auto countWalFiles = [this]() {
+    VectorLogPtr log_files;
+    if (!dbfull()->GetSortedWalFiles(log_files).ok()) {
+      return size_t{0};
+    }
+    return log_files.size();
+  };
+
+  // Create DB with multiple column families and multiple log files.
+  CreateAndReopenWithCF({"one", "two"}, options);
+  ASSERT_OK(Put(0, "key1", kSmallValue));
+  ASSERT_OK(Put(1, "key2", kLargeValue));
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(1, countWalFiles());
+  ASSERT_OK(Put(0, "key3", kSmallValue));
+  ASSERT_OK(Put(2, "key4", kLargeValue));
+  ASSERT_OK(Flush(2));
+  ASSERT_EQ(2, countWalFiles());
+
+  // Reopen, insert and flush.
+  options.db_write_buffer_size = 64 * 1024 * 1024;
+  ReopenWithColumnFamilies({"default", "one", "two"}, options);
+  ASSERT_EQ(Get(0, "key1"), kSmallValue);
+  ASSERT_EQ(Get(1, "key2"), kLargeValue);
+  ASSERT_EQ(Get(0, "key3"), kSmallValue);
+  ASSERT_EQ(Get(2, "key4"), kLargeValue);
+  // Insert more data.
+  ASSERT_OK(Put(0, "key5", kLargeValue));
+  ASSERT_OK(Put(1, "key6", kLargeValue));
+  ASSERT_EQ(3, countWalFiles());
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(2, "key7", kLargeValue));
+  ASSERT_OK(dbfull()->FlushWAL(false));
+  ASSERT_EQ(4, countWalFiles());
+
+  // Reopen twice and validate.
+  for (int i = 0; i < 2; i++) {
+    ReopenWithColumnFamilies({"default", "one", "two"}, options);
+    ASSERT_EQ(Get(0, "key1"), kSmallValue);
+    ASSERT_EQ(Get(1, "key2"), kLargeValue);
+    ASSERT_EQ(Get(0, "key3"), kSmallValue);
+    ASSERT_EQ(Get(2, "key4"), kLargeValue);
+    ASSERT_EQ(Get(0, "key5"), kLargeValue);
+    ASSERT_EQ(Get(1, "key6"), kLargeValue);
+    ASSERT_EQ(Get(2, "key7"), kLargeValue);
+    ASSERT_EQ(4, countWalFiles());
+  }
+}
+
+// In this test we are trying to do the following:
+//   1. Create a DB with corrupted WAL log;
+//   2. Open with avoid_flush_during_recovery = true;
+//   3. Append more data without flushing, which creates new WAL log.
+//   4. Open again. See if it can correctly handle previous corruption.
+TEST_P(DBWALTestWithParamsVaryingRecoveryMode,
+       RecoverFromCorruptedWALWithoutFlush) {
+  const int kAppendKeys = 100;
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+  options.create_if_missing = false;
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 64 * 1024 * 1024;
+
+  auto getAll = [this]() {
+    std::vector<std::pair<std::string, std::string>> data;
+    ReadOptions ropt;
+    Iterator* iter = dbfull()->NewIterator(ropt);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      data.push_back(
+          std::make_pair(iter->key().ToString(), iter->value().ToString()));
+    }
+    delete iter;
+    return data;
+  };
+
+  bool trunc = std::get<0>(GetParam());  // Corruption style
+  // Corruption offset position
+  int corrupt_offset = std::get<1>(GetParam());
+  int wal_file_id = std::get<2>(GetParam());  // WAL file
+  WALRecoveryMode recovery_mode = std::get<3>(GetParam());
+  // WAL compression type
+  CompressionType compression_type = std::get<4>(GetParam());
+
+  options.wal_recovery_mode = recovery_mode;
+  options.wal_compression = compression_type;
+  // Create corrupted WAL
+  RecoveryTestHelper::FillData(this, &options);
+  RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+                                 /*len%=*/.1, wal_file_id, trunc);
+  // Skip the test if DB won't open.
+  if (!TryReopen(options).ok()) {
+    ASSERT_TRUE(options.wal_recovery_mode ==
+                    WALRecoveryMode::kAbsoluteConsistency ||
+                (!trunc && options.wal_recovery_mode ==
+                               WALRecoveryMode::kTolerateCorruptedTailRecords));
+    return;
+  }
+  ASSERT_OK(TryReopen(options));
+  // Append some more data.
+  for (int k = 0; k < kAppendKeys; k++) {
+    std::string key = "extra_key" + std::to_string(k);
+    std::string value = DummyString(RecoveryTestHelper::kValueSize);
+    ASSERT_OK(Put(key, value));
+  }
+  // Save data for comparison.
+  auto data = getAll();
+  // Reopen. Verify data.
+  ASSERT_OK(TryReopen(options));
+  auto actual_data = getAll();
+  ASSERT_EQ(data, actual_data);
+}
+
+// Tests that total log size is recovered if we set
+// avoid_flush_during_recovery=true.
+// Flush should trigger if max_total_wal_size is reached.
+TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) {
+  auto test_listener = std::make_shared<FlushCounterListener>();
+  test_listener->expected_flush_reason = FlushReason::kWalFull;
+
+  constexpr size_t kKB = 1024;
+  constexpr size_t kMB = 1024 * 1024;
+  Options options = CurrentOptions();
+  options.avoid_flush_during_recovery = true;
+  options.max_total_wal_size = 1 * kMB;
+  options.listeners.push_back(test_listener);
+  // Have to open DB in multi-CF mode to trigger flush when
+  // max_total_wal_size is reached.
+  CreateAndReopenWithCF({"one"}, options);
+  // Write some keys and we will end up with one log file which is slightly
+  // smaller than 1MB.
+  std::string value_100k(100 * kKB, 'v');
+  std::string value_300k(300 * kKB, 'v');
+  ASSERT_OK(Put(0, "foo", "v1"));
+  for (int i = 0; i < 9; i++) {
+    ASSERT_OK(Put(1, "key" + std::to_string(i), value_100k));
+  }
+  // Get log files before reopen.
+  VectorLogPtr log_files_before;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+  ASSERT_EQ(1, log_files_before.size());
+  uint64_t log_size_before = log_files_before[0]->SizeFileBytes();
+  ASSERT_GT(log_size_before, 900 * kKB);
+  ASSERT_LT(log_size_before, 1 * kMB);
+  ReopenWithColumnFamilies({"default", "one"}, options);
+  // Write one more value to make log larger than 1MB.
+  ASSERT_OK(Put(1, "bar", value_300k));
+  // Get log files again. A new log file will be opened.
+  VectorLogPtr log_files_after_reopen;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen));
+  ASSERT_EQ(2, log_files_after_reopen.size());
+  ASSERT_EQ(log_files_before[0]->LogNumber(),
+            log_files_after_reopen[0]->LogNumber());
+  ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() +
+                log_files_after_reopen[1]->SizeFileBytes(),
+            1 * kMB);
+  // Write one more key to trigger flush.
+  ASSERT_OK(Put(0, "foo", "v2"));
+  for (auto* h : handles_) {
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(h));
+  }
+  // Flushed two column families.
+  ASSERT_EQ(2, test_listener->count.load());
+}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#if defined(ROCKSDB_FALLOCATE_PRESENT)
+// Tests that we will truncate the preallocated space of the last log from
+// previous.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) {
+  constexpr size_t kKB = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.avoid_flush_during_recovery = true;
+  if (mem_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+    return;
+  }
+  if (!IsFallocateSupported()) {
+    return;
+  }
+
+  DestroyAndReopen(options);
+  size_t preallocated_size =
+      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+  ASSERT_OK(Put("foo", "v1"));
+  VectorLogPtr log_files_before;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+  ASSERT_EQ(1, log_files_before.size());
+  auto& file_before = log_files_before[0];
+  ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+  // The log file has preallocated space.
+  ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+            preallocated_size);
+  Reopen(options);
+  VectorLogPtr log_files_after;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+  ASSERT_EQ(1, log_files_after.size());
+  ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+  // The preallocated space should be truncated.
+  ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+            preallocated_size);
+}
+// Tests that we will truncate the preallocated space of the last log from
+// previous.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) {
+  constexpr size_t kKB = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.avoid_flush_during_recovery = false;
+  options.avoid_flush_during_shutdown = true;
+  if (mem_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+    return;
+  }
+  if (!IsFallocateSupported()) {
+    return;
+  }
+
+  DestroyAndReopen(options);
+  size_t preallocated_size =
+      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+  ASSERT_OK(Put("foo", "v1"));
+  VectorLogPtr log_files_before;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+  ASSERT_EQ(1, log_files_before.size());
+  auto& file_before = log_files_before[0];
+  ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+  ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+            preallocated_size);
+  // The log file has preallocated space.
+  Close();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::PurgeObsoleteFiles:Begin",
+        "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
+       {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
+        "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread reopen_thread([&]() { Reopen(options); });
+
+  TEST_SYNC_POINT(
+      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
+  // After the flush during Open, the log file should get deleted.  However,
+  // if  the process is in a crash loop, the log file may not get
+  // deleted and thte preallocated space will keep accumulating. So we need
+  // to ensure it gets trtuncated.
+  EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+            preallocated_size);
+  TEST_SYNC_POINT(
+      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
+  reopen_thread.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.avoid_flush_during_recovery = false;
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted  environment");
+    return;
+  }
+  if (!IsFallocateSupported()) {
+    return;
+  }
+
+  DestroyAndReopen(options);
+  size_t preallocated_size =
+      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+  Close();
+  std::vector<std::string> filenames;
+  std::string last_log;
+  uint64_t last_log_num = 0;
+  ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+  for (auto fname : filenames) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(fname, &number, &type, nullptr)) {
+      if (type == kWalFile && number > last_log_num) {
+        last_log = fname;
+      }
+    }
+  }
+  ASSERT_NE(last_log, "");
+  last_log = dbname_ + '/' + last_log;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::PurgeObsoleteFiles:Begin",
+        "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
+       {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
+        "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PosixWritableFile::Close",
+      [](void* arg) { *(reinterpret_cast<size_t*>(arg)) = 0; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Preallocate space for the empty log file. This could happen if WAL data
+  // was buffered in memory and the process crashed.
+  std::unique_ptr<WritableFile> log_file;
+  ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions()));
+  log_file->SetPreallocationBlockSize(preallocated_size);
+  log_file->PrepareWrite(0, 4096);
+  log_file.reset();
+
+  ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size);
+
+  port::Thread reopen_thread([&]() { Reopen(options); });
+
+  TEST_SYNC_POINT(
+      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
+  // The preallocated space should be truncated.
+  EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size);
+  TEST_SYNC_POINT(
+      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
+  reopen_thread.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) {
+  constexpr size_t kKB = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.avoid_flush_during_recovery = true;
+  if (mem_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+    return;
+  }
+  if (!IsFallocateSupported()) {
+    return;
+  }
+
+  // create DB and close with file truncate disabled
+  std::atomic_bool enable_truncate{false};
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "PosixWritableFile::Close", [&](void* arg) {
+        if (!enable_truncate) {
+          *(reinterpret_cast<size_t*>(arg)) = 0;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+  size_t preallocated_size =
+      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+  ASSERT_OK(Put("foo", "v1"));
+  VectorLogPtr log_files_before;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+  ASSERT_EQ(1, log_files_before.size());
+  auto& file_before = log_files_before[0];
+  ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+  // The log file has preallocated space.
+  auto db_size = GetAllocatedFileSize(dbname_ + file_before->PathName());
+  ASSERT_GE(db_size, preallocated_size);
+  Close();
+
+  // enable truncate and open DB as readonly, the file should not be truncated
+  // and DB size is not changed.
+  enable_truncate = true;
+  ASSERT_OK(ReadOnlyReopen(options));
+  VectorLogPtr log_files_after;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+  ASSERT_EQ(1, log_files_after.size());
+  ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+  ASSERT_EQ(log_files_after[0]->PathName(), file_before->PathName());
+  // The preallocated space should NOT be truncated.
+  // the DB size is almost the same.
+  ASSERT_NEAR(GetAllocatedFileSize(dbname_ + file_before->PathName()), db_size,
+              db_size / 100);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // ROCKSDB_FALLOCATE_PRESENT
+#endif  // ROCKSDB_PLATFORM_POSIX
+
+TEST_F(DBWALTest, WalInManifestButNotInSortedWals) {
+  Options options = CurrentOptions();
+  options.track_and_verify_wals_in_manifest = true;
+  options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+
+  // Build a way to make wal files selectively go missing
+  bool wals_go_missing = false;
+  struct MissingWalFs : public FileSystemWrapper {
+    MissingWalFs(const std::shared_ptr<FileSystem>& t,
+                 bool* _wals_go_missing_flag)
+        : FileSystemWrapper(t), wals_go_missing_flag(_wals_go_missing_flag) {}
+    bool* wals_go_missing_flag;
+    IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+                         std::vector<std::string>* r,
+                         IODebugContext* dbg) override {
+      IOStatus s = target_->GetChildren(dir, io_opts, r, dbg);
+      if (s.ok() && *wals_go_missing_flag) {
+        for (size_t i = 0; i < r->size();) {
+          if (EndsWith(r->at(i), ".log")) {
+            r->erase(r->begin() + i);
+          } else {
+            ++i;
+          }
+        }
+      }
+      return s;
+    }
+    const char* Name() const override { return "MissingWalFs"; }
+  };
+  auto my_fs =
+      std::make_shared<MissingWalFs>(env_->GetFileSystem(), &wals_go_missing);
+  std::unique_ptr<Env> my_env(NewCompositeEnv(my_fs));
+  options.env = my_env.get();
+
+  CreateAndReopenWithCF({"blah"}, options);
+
+  // Currently necessary to get a WAL tracked in manifest; see
+  // https://github.com/facebook/rocksdb/issues/10080
+  ASSERT_OK(Put(0, "x", "y"));
+  ASSERT_OK(db_->SyncWAL());
+  ASSERT_OK(Put(1, "x", "y"));
+  ASSERT_OK(db_->SyncWAL());
+  ASSERT_OK(Flush(1));
+
+  ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+  std::vector<std::unique_ptr<LogFile>> wals;
+  ASSERT_OK(db_->GetSortedWalFiles(wals));
+  wals_go_missing = true;
+  ASSERT_NOK(db_->GetSortedWalFiles(wals));
+  wals_go_missing = false;
+  Close();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(DBWALTest, WalTermTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "foo", "bar"));
+
+  WriteOptions wo;
+  wo.sync = true;
+  wo.disableWAL = false;
+
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("foo", "bar"));
+  batch.MarkWalTerminationPoint();
+  ASSERT_OK(batch.Put("foo2", "bar2"));
+
+  ASSERT_OK(dbfull()->Write(wo, &batch));
+
+  // make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+  ASSERT_EQ("bar", Get(1, "foo"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "foo2"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, GetCompressedWalsAfterSync) {
+  if (db_->GetOptions().wal_compression == kNoCompression) {
+    ROCKSDB_GTEST_BYPASS("stream compression not present");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  options.create_if_missing = true;
+  options.env = env_;
+  options.avoid_flush_during_recovery = true;
+  options.track_and_verify_wals_in_manifest = true;
+  // Enable WAL compression so that the newly-created WAL will be non-empty
+  // after DB open, even if point-in-time WAL recovery encounters no
+  // corruption.
+  options.wal_compression = kZSTD;
+  DestroyAndReopen(options);
+
+  // Write something to memtable and WAL so that log_empty_ will be false after
+  // next DB::Open().
+  ASSERT_OK(Put("a", "v"));
+
+  Reopen(options);
+
+  // New WAL is created, thanks to !log_empty_.
+  ASSERT_OK(dbfull()->TEST_SwitchWAL());
+
+  ASSERT_OK(Put("b", "v"));
+
+  ASSERT_OK(db_->SyncWAL());
+
+  VectorLogPtr wals;
+  Status s = dbfull()->GetSortedWalFiles(wals);
+  ASSERT_OK(s);
+}
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_with_timestamp_basic_test.cc b/src/rocksdb/db/db_with_timestamp_basic_test.cc
new file mode 100644
index 000000000..6ea1aaf46
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_basic_test.cc
@@ -0,0 +1,3880 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_with_timestamp_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase {
+ public:
+  DBBasicTestWithTimestamp()
+      : DBBasicTestWithTimestampBase("db_basic_test_with_timestamp") {}
+};
+
+TEST_F(DBBasicTestWithTimestamp, SanityChecks) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.avoid_flush_during_shutdown = true;
+  options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+  DestroyAndReopen(options);
+
+  Options options1 = CurrentOptions();
+  options1.env = env_;
+  options1.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options1.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+  assert(options1.comparator &&
+         options1.comparator->timestamp_size() == sizeof(uint64_t));
+  ColumnFamilyHandle* handle = nullptr;
+  Status s = db_->CreateColumnFamily(options1, "data", &handle);
+  ASSERT_OK(s);
+
+  std::string dummy_ts(sizeof(uint64_t), '\0');
+  // Perform timestamp operations on default cf.
+  ASSERT_TRUE(
+      db_->Put(WriteOptions(), "key", dummy_ts, "value").IsInvalidArgument());
+  ASSERT_TRUE(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), "key",
+                         dummy_ts, "value")
+                  .IsInvalidArgument());
+  ASSERT_TRUE(db_->Delete(WriteOptions(), "key", dummy_ts).IsInvalidArgument());
+  ASSERT_TRUE(
+      db_->SingleDelete(WriteOptions(), "key", dummy_ts).IsInvalidArgument());
+  ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               "begin_key", "end_key", dummy_ts)
+                  .IsInvalidArgument());
+
+  // Perform non-timestamp operations on "data" cf.
+  ASSERT_TRUE(
+      db_->Put(WriteOptions(), handle, "key", "value").IsInvalidArgument());
+  ASSERT_TRUE(db_->Delete(WriteOptions(), handle, "key").IsInvalidArgument());
+  ASSERT_TRUE(
+      db_->SingleDelete(WriteOptions(), handle, "key").IsInvalidArgument());
+
+  ASSERT_TRUE(
+      db_->Merge(WriteOptions(), handle, "key", "value").IsInvalidArgument());
+  ASSERT_TRUE(db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key")
+                  .IsInvalidArgument());
+
+  {
+    WriteBatch wb;
+    ASSERT_OK(wb.Put(handle, "key", "value"));
+    ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+  }
+  {
+    WriteBatch wb;
+    ASSERT_OK(wb.Delete(handle, "key"));
+    ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+  }
+  {
+    WriteBatch wb;
+    ASSERT_OK(wb.SingleDelete(handle, "key"));
+    ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+  }
+  {
+    WriteBatch wb;
+    ASSERT_OK(wb.DeleteRange(handle, "begin_key", "end_key"));
+    ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+  }
+
+  // Perform timestamp operations with timestamps of incorrect size.
+  const std::string wrong_ts(sizeof(uint32_t), '\0');
+  ASSERT_TRUE(db_->Put(WriteOptions(), handle, "key", wrong_ts, "value")
+                  .IsInvalidArgument());
+  ASSERT_TRUE(db_->Merge(WriteOptions(), handle, "key", wrong_ts, "value")
+                  .IsInvalidArgument());
+  ASSERT_TRUE(
+      db_->Delete(WriteOptions(), handle, "key", wrong_ts).IsInvalidArgument());
+  ASSERT_TRUE(db_->SingleDelete(WriteOptions(), handle, "key", wrong_ts)
+                  .IsInvalidArgument());
+  ASSERT_TRUE(
+      db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key", wrong_ts)
+          .IsInvalidArgument());
+
+  delete handle;
+}
+
+TEST_F(DBBasicTestWithTimestamp, MixedCfs) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.avoid_flush_during_shutdown = true;
+  DestroyAndReopen(options);
+
+  Options options1 = CurrentOptions();
+  options1.env = env_;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options1.comparator = &test_cmp;
+  ColumnFamilyHandle* handle = nullptr;
+  Status s = db_->CreateColumnFamily(options1, "data", &handle);
+  ASSERT_OK(s);
+
+  WriteBatch wb;
+  ASSERT_OK(wb.Put("a", "value"));
+  ASSERT_OK(wb.Put(handle, "a", "value"));
+  {
+    std::string ts = Timestamp(1, 0);
+    const auto ts_sz_func = [kTimestampSize, handle](uint32_t cf_id) {
+      assert(handle);
+      if (cf_id == 0) {
+        return static_cast<size_t>(0);
+      } else if (cf_id == handle->GetID()) {
+        return kTimestampSize;
+      } else {
+        assert(false);
+        return std::numeric_limits<size_t>::max();
+      }
+    };
+    ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func));
+    ASSERT_OK(db_->Write(WriteOptions(), &wb));
+  }
+
+  const auto verify_db = [this](ColumnFamilyHandle* h, const std::string& key,
+                                const std::string& ts,
+                                const std::string& expected_value) {
+    ASSERT_EQ(expected_value, Get(key));
+    Slice read_ts_slice(ts);
+    ReadOptions read_opts;
+    read_opts.timestamp = &read_ts_slice;
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, h, key, &value));
+    ASSERT_EQ(expected_value, value);
+  };
+
+  verify_db(handle, "a", Timestamp(1, 0), "value");
+
+  delete handle;
+  Close();
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back("data", options1);
+  options.create_if_missing = false;
+  s = DB::Open(options, dbname_, cf_descs, &handles_, &db_);
+  ASSERT_OK(s);
+
+  verify_db(handles_[1], "a", Timestamp(1, 0), "value");
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
+  ASSERT_OK(Flush());
+
+  std::string start_str = "foo";
+  std::string end_str = "foo2";
+  Slice start(start_str), end(end_str);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, GcPreserveLatestVersionBelowFullHistoryLow) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  std::string ts_str = Timestamp(1, 0);
+  WriteOptions wopts;
+  ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1"));
+  ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2"));
+  ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3"));
+
+  ts_str = Timestamp(2, 0);
+  ASSERT_OK(db_->Delete(wopts, "k3", ts_str));
+
+  ts_str = Timestamp(4, 0);
+  ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v5"));
+
+  ts_str = Timestamp(5, 0);
+  ASSERT_OK(
+      db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k0", "k9", ts_str));
+
+  ts_str = Timestamp(3, 0);
+  Slice ts = ts_str;
+  CompactRangeOptions cro;
+  cro.full_history_ts_low = &ts;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_OK(Flush());
+
+  ReadOptions ropts;
+  ropts.timestamp = &ts;
+  std::string value;
+  Status s = db_->Get(ropts, "k1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("v1", value);
+
+  std::string key_ts;
+  ASSERT_TRUE(db_->Get(ropts, "k3", &value, &key_ts).IsNotFound());
+  ASSERT_EQ(Timestamp(2, 0), key_ts);
+
+  ts_str = Timestamp(5, 0);
+  ts = ts_str;
+  ropts.timestamp = &ts;
+  ASSERT_TRUE(db_->Get(ropts, "k2", &value, &key_ts).IsNotFound());
+  ASSERT_EQ(Timestamp(5, 0), key_ts);
+  ASSERT_TRUE(db_->Get(ropts, "k2", &value).IsNotFound());
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLow) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  const std::string kKey = "test kKey";
+
+  // Test set ts_low first and flush()
+  int current_ts_low = 5;
+  std::string ts_low_str = Timestamp(current_ts_low, 0);
+  Slice ts_low = ts_low_str;
+  CompactRangeOptions comp_opts;
+  comp_opts.full_history_ts_low = &ts_low;
+  comp_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+  ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr));
+
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd();
+  auto result_ts_low = cfd->GetFullHistoryTsLow();
+
+  ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0);
+
+  for (int i = 0; i < 10; i++) {
+    WriteOptions write_opts;
+    std::string ts = Timestamp(i, 0);
+    ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i)));
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < 10; i++) {
+    ReadOptions read_opts;
+    std::string ts_str = Timestamp(i, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::string value;
+    Status status = db_->Get(read_opts, kKey, &value);
+    if (i < current_ts_low) {
+      ASSERT_TRUE(status.IsInvalidArgument());
+    } else {
+      ASSERT_OK(status);
+      ASSERT_TRUE(value.compare(Key(i)) == 0);
+    }
+  }
+
+  // Test set ts_low and then trigger compaction
+  for (int i = 10; i < 20; i++) {
+    WriteOptions write_opts;
+    std::string ts = Timestamp(i, 0);
+    ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i)));
+  }
+
+  ASSERT_OK(Flush());
+
+  current_ts_low = 15;
+  ts_low_str = Timestamp(current_ts_low, 0);
+  ts_low = ts_low_str;
+  comp_opts.full_history_ts_low = &ts_low;
+  ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr));
+  result_ts_low = cfd->GetFullHistoryTsLow();
+  ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0);
+
+  for (int i = current_ts_low; i < 20; i++) {
+    ReadOptions read_opts;
+    std::string ts_str = Timestamp(i, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::string value;
+    Status status = db_->Get(read_opts, kKey, &value);
+    ASSERT_OK(status);
+    ASSERT_TRUE(value.compare(Key(i)) == 0);
+  }
+
+  // Test invalid compaction with range
+  Slice start(kKey), end(kKey);
+  Status s = db_->CompactRange(comp_opts, &start, &end);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  s = db_->CompactRange(comp_opts, &start, nullptr);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  s = db_->CompactRange(comp_opts, nullptr, &end);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Test invalid compaction with the decreasing ts_low
+  ts_low_str = Timestamp(current_ts_low - 1, 0);
+  ts_low = ts_low_str;
+  comp_opts.full_history_ts_low = &ts_low;
+  s = db_->CompactRange(comp_opts, nullptr, nullptr);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLowWithPublicAPI) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  std::string ts_low_str = Timestamp(9, 0);
+  ASSERT_OK(
+      db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str));
+  std::string result_ts_low;
+  ASSERT_OK(db_->GetFullHistoryTsLow(nullptr, &result_ts_low));
+  ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low_str, result_ts_low) == 0);
+  // test increase full_history_low backward
+  std::string ts_low_str_back = Timestamp(8, 0);
+  auto s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+                                         ts_low_str_back);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  // test IncreaseFullHistoryTsLow with a timestamp whose length is longger
+  // than the cf's timestamp size
+  std::string ts_low_str_long(Timestamp(0, 0).size() + 1, 'a');
+  s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+                                    ts_low_str_long);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  // test IncreaseFullHistoryTsLow with a timestamp which is null
+  std::string ts_low_str_null = "";
+  s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+                                    ts_low_str_null);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  // test IncreaseFullHistoryTsLow for a column family that does not enable
+  // timestamp
+  options.comparator = BytewiseComparator();
+  DestroyAndReopen(options);
+  ts_low_str = Timestamp(10, 0);
+  s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  // test GetFullHistoryTsLow for a column family that does not enable
+  // timestamp
+  std::string current_ts_low;
+  s = db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(), &current_ts_low);
+  ASSERT_EQ(s, Status::InvalidArgument());
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;  // Large write buffer
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  auto default_cf = db_->DefaultColumnFamily();
+
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  const int N = 128;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(db_->Put(write_opts, Key(i), ts, rnd.RandomString(1024)));
+  }
+
+  uint64_t size;
+  std::string start = Key(50);
+  std::string end = Key(60);
+  Range r(start, end);
+  SizeApproximationOptions size_approx_options;
+  size_approx_options.include_memtables = true;
+  size_approx_options.include_files = true;
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_GT(size, 6000);
+  ASSERT_LT(size, 204800);
+
+  // test multiple ranges
+  std::vector<Range> ranges;
+  std::string start_tmp = Key(10);
+  std::string end_tmp = Key(20);
+  ranges.emplace_back(Range(start_tmp, end_tmp));
+  ranges.emplace_back(Range(start, end));
+  uint64_t range_sizes[2];
+  ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf,
+                                     ranges.data(), 2, range_sizes));
+
+  ASSERT_EQ(range_sizes[1], size);
+
+  // Zero if not including mem table
+  ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  start = Key(500);
+  end = Key(600);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  // Test range boundaries
+  ASSERT_OK(db_->Put(write_opts, Key(1000), ts, rnd.RandomString(1024)));
+  // Should include start key
+  start = Key(1000);
+  end = Key(1100);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_GT(size, 0);
+
+  // Should exclude end key
+  start = Key(900);
+  end = Key(1000);
+  r = Range(start, end);
+  ASSERT_OK(
+      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+  ASSERT_EQ(size, 0);
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleIterate) {
+  const int kNumKeysPerFile = 128;
+  const uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<uint64_t> start_keys = {1, 0};
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    read_opts.timestamp = &read_ts;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = 0;
+    // Forward iterate.
+    for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+         it->Next(), ++count, ++key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+    }
+    size_t expected_count = kMaxKey - start_keys[i] + 1;
+    ASSERT_EQ(expected_count, count);
+
+    // Backward iterate.
+    count = 0;
+    for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+         it->Prev(), ++count, --key) {
+      CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                         "value" + std::to_string(i), write_timestamps[i]);
+    }
+    ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+    // SeekToFirst()/SeekToLast() with lower/upper bounds.
+    // Then iter with lower and upper bounds.
+    uint64_t l = 0;
+    uint64_t r = kMaxKey + 1;
+    while (l < r) {
+      std::string lb_str = Key1(l);
+      Slice lb = lb_str;
+      std::string ub_str = Key1(r);
+      Slice ub = ub_str;
+      read_opts.iterate_lower_bound = &lb;
+      read_opts.iterate_upper_bound = &ub;
+      it.reset(db_->NewIterator(read_opts));
+      for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+           it->Valid(); it->Next(), ++key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+      }
+      ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+      for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+           it->Valid(); it->Prev(), --key, ++count) {
+        CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+                           "value" + std::to_string(i), write_timestamps[i]);
+      }
+      l += (kMaxKey / 100);
+      r -= (kMaxKey / 100);
+    }
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  auto check_value_by_ts = [](DB* db, Slice key, std::string readTs,
+                              Status status, std::string checkValue,
+                              std::string expected_ts) {
+    ReadOptions ropts;
+    Slice ts = readTs;
+    ropts.timestamp = &ts;
+    std::string value;
+    std::string key_ts;
+    Status s = db->Get(ropts, key, &value, &key_ts);
+    ASSERT_TRUE(s == status);
+    if (s.ok()) {
+      ASSERT_EQ(checkValue, value);
+    }
+    if (s.ok() || s.IsNotFound()) {
+      ASSERT_EQ(expected_ts, key_ts);
+    }
+  };
+  // Construct data of different versions with different ts
+  ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(2, 0), "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(4, 0), "v2"));
+  ASSERT_OK(db_->Delete(WriteOptions(), "k1", Timestamp(5, 0)));
+  ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(6, 0), "v3"));
+  check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v3",
+                    Timestamp(6, 0));
+  ASSERT_OK(Flush());
+  Close();
+
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  DBOptions db_options(options);
+
+  // Trim data whose version > Timestamp(5, 0), read(k1, ts(7)) <- NOT_FOUND.
+  ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+                                   &handles_, &db_, Timestamp(5, 0)));
+  check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::NotFound(), "",
+                    Timestamp(5, 0));
+  Close();
+
+  // Trim data whose timestamp > Timestamp(4, 0), read(k1, ts(7)) <- v2
+  ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+                                   &handles_, &db_, Timestamp(4, 0)));
+  check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v2",
+                    Timestamp(4, 0));
+  Close();
+
+  Reopen(options);
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "k1",
+                             "k3", Timestamp(7, 0)));
+  check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::NotFound(), "",
+                    Timestamp(7, 0));
+  Close();
+  // Trim data whose timestamp > Timestamp(6, 0), read(k1, ts(8)) <- v2
+  ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+                                   &handles_, &db_, Timestamp(6, 0)));
+  check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::OK(), "v2",
+                    Timestamp(4, 0));
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, OpenAndTrimHistoryInvalidOptionTest) {
+  Destroy(last_options_);
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  DBOptions db_options(options);
+
+  // OpenAndTrimHistory should not work with avoid_flush_during_recovery
+  db_options.avoid_flush_during_recovery = true;
+  ASSERT_TRUE(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+                                     &handles_, &db_, Timestamp(0, 0))
+                  .IsInvalidArgument());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTestWithTimestamp, GetTimestampTableProperties) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  // Create 2 tables
+  for (int table = 0; table < 2; ++table) {
+    for (int i = 0; i < 10; i++) {
+      std::string ts = Timestamp(i, 0);
+      ASSERT_OK(db_->Put(WriteOptions(), "key", ts, Key(i)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  TablePropertiesCollection props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+  ASSERT_EQ(2U, props.size());
+  for (const auto& item : props) {
+    auto& user_collected = item.second->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") !=
+                user_collected.end());
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") !=
+                user_collected.end());
+    ASSERT_EQ(user_collected.at("rocksdb.timestamp_min"), Timestamp(0, 0));
+    ASSERT_EQ(user_collected.at("rocksdb.timestamp_max"), Timestamp(9, 0));
+  }
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+class DBBasicTestWithTimestampTableOptions
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<BlockBasedTableOptions::IndexType> {
+ public:
+  explicit DBBasicTestWithTimestampTableOptions()
+      : DBBasicTestWithTimestampBase(
+            "db_basic_test_with_timestamp_table_options") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+    Timestamp, DBBasicTestWithTimestampTableOptions,
+    testing::Values(
+        BlockBasedTableOptions::IndexType::kBinarySearch,
+        BlockBasedTableOptions::IndexType::kHashSearch,
+        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+        BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey));
+
+TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  options.compression = kNoCompression;
+  BlockBasedTableOptions bbto;
+  bbto.index_type = GetParam();
+  bbto.block_size = 100;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator cmp(kTimestampSize);
+  options.comparator = &cmp;
+  DestroyAndReopen(options);
+  constexpr uint64_t kNumKeys = 1024;
+  for (uint64_t k = 0; k < kNumKeys; ++k) {
+    WriteOptions write_opts;
+    ASSERT_OK(db_->Put(write_opts, Key1(k), Timestamp(1, 0),
+                       "value" + std::to_string(k)));
+  }
+  ASSERT_OK(Flush());
+  {
+    ReadOptions read_opts;
+    read_opts.total_order_seek = true;
+    std::string ts_str = Timestamp(2, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    // verify Get()
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      std::string value_from_get;
+      std::string key_str(it->key().data(), it->key().size());
+      std::string timestamp;
+      ASSERT_OK(db_->Get(read_opts, key_str, &value_from_get, &timestamp));
+      ASSERT_EQ(it->value(), value_from_get);
+      ASSERT_EQ(Timestamp(1, 0), timestamp);
+    }
+
+    // verify MultiGet()
+    constexpr uint64_t step = 2;
+    static_assert(0 == (kNumKeys % step),
+                  "kNumKeys must be a multiple of step");
+    for (uint64_t k = 0; k < kNumKeys; k += 2) {
+      std::vector<std::string> key_strs;
+      std::vector<Slice> keys;
+      for (size_t i = 0; i < step; ++i) {
+        key_strs.push_back(Key1(k + i));
+      }
+      for (size_t i = 0; i < step; ++i) {
+        keys.emplace_back(key_strs[i]);
+      }
+      std::vector<std::string> values;
+      std::vector<std::string> timestamps;
+      std::vector<Status> statuses =
+          db_->MultiGet(read_opts, keys, &values, &timestamps);
+      ASSERT_EQ(step, statuses.size());
+      ASSERT_EQ(step, values.size());
+      ASSERT_EQ(step, timestamps.size());
+      for (uint64_t i = 0; i < step; ++i) {
+        ASSERT_OK(statuses[i]);
+        ASSERT_EQ("value" + std::to_string(k + i), values[i]);
+        ASSERT_EQ(Timestamp(1, 0), timestamps[i]);
+      }
+    }
+  }
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  options.memtable_whole_key_filtering = true;
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = true;
+  bbto.index_type = GetParam();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
+  ASSERT_OK(Flush());
+
+  // Move sst file to next level
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar"));
+  ASSERT_OK(Flush());
+
+  ReadOptions read_opts;
+  Slice read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->Seek("foo");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+
+    iter->Seek("bbb");
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithCappedPrefix) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  // All of the keys or this test must be longer than 3 characters
+  constexpr int kMinKeyLen = 3;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(kMinKeyLen));
+  options.memtable_whole_key_filtering = true;
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = true;
+  bbto.index_type = GetParam();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
+  ASSERT_OK(Flush());
+
+  // Move sst file to next level
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar"));
+  ASSERT_OK(Flush());
+
+  ReadOptions read_opts;
+  ts = Timestamp(2, 0);
+  Slice read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    // Make sure the prefix extractor doesn't include timestamp, otherwise it
+    // may return invalid result.
+    iter->Seek("foo");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithBound) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = true;
+  bbto.index_type = GetParam();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar1"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar2"));
+  ASSERT_OK(Flush());
+
+  // Move sst file to next level
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  for (int i = 3; i < 9; ++i) {
+    ASSERT_OK(db_->Put(write_opts, "foo" + std::to_string(i), ts,
+                       "bar" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  ReadOptions read_opts;
+  ts = Timestamp(2, 0);
+  Slice read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  std::string up_bound = "foo5";  // exclusive
+  Slice up_bound_slice = up_bound;
+  std::string lo_bound = "foo2";  // inclusive
+  Slice lo_bound_slice = lo_bound;
+  read_opts.iterate_upper_bound = &up_bound_slice;
+  read_opts.iterate_lower_bound = &lo_bound_slice;
+  read_opts.auto_prefix_mode = true;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    // Make sure the prefix extractor doesn't include timestamp, otherwise it
+    // may return invalid result.
+    iter->Seek("foo");
+    CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2",
+                       Timestamp(1, 0));
+    iter->SeekToFirst();
+    CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2",
+                       Timestamp(1, 0));
+    iter->SeekForPrev("g");
+    CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0));
+    iter->SeekToLast();
+    CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0));
+  }
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),
+                                               Timestamp(4, 3)};
+  const std::vector<std::tuple<std::string, std::string>> kvs = {
+      std::make_tuple("aa", "value1"), std::make_tuple("ab", "value2")};
+  for (const auto& ts : timestamps) {
+    WriteBatch wb(0, 0, 0, kTimestampSize);
+    for (const auto& kv : kvs) {
+      const std::string& key = std::get<0>(kv);
+      const std::string& value = std::get<1>(kv);
+      ASSERT_OK(wb.Put(key, value));
+    }
+
+    ASSERT_OK(wb.UpdateTimestamps(
+        ts, [kTimestampSize](uint32_t) { return kTimestampSize; }));
+    ASSERT_OK(db_->Write(WriteOptions(), &wb));
+  }
+  std::string read_ts_str = Timestamp(5, 3);
+  Slice read_ts = read_ts_str;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+
+  it->SeekToFirst();
+  ASSERT_TRUE(it->Valid());
+  it->Prev();
+  ASSERT_FALSE(it->Valid());
+
+  it->SeekToLast();
+  ASSERT_TRUE(it->Valid());
+  uint64_t prev_reseek_count =
+      options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+  ASSERT_EQ(0, prev_reseek_count);
+  it->Next();
+  ASSERT_FALSE(it->Valid());
+  ASSERT_EQ(1 + prev_reseek_count,
+            options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+  it->Seek(std::get<0>(kvs[0]));
+  CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
+                     std::get<1>(kvs[0]), Timestamp(4, 3));
+  it->Next();
+  CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+                     std::get<1>(kvs[1]), Timestamp(4, 3));
+  it->Prev();
+  CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
+                     std::get<1>(kvs[0]), Timestamp(4, 3));
+
+  prev_reseek_count =
+      options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+  ASSERT_EQ(1, prev_reseek_count);
+  it->Next();
+  CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+                     std::get<1>(kvs[1]), Timestamp(4, 3));
+  ASSERT_EQ(1 + prev_reseek_count,
+            options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+  it->SeekForPrev(std::get<0>(kvs[1]));
+  CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+                     std::get<1>(kvs[1]), Timestamp(4, 3));
+  it->Prev();
+  CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
+                     std::get<1>(kvs[0]), Timestamp(4, 3));
+
+  prev_reseek_count =
+      options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+  it->Next();
+  CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+                     std::get<1>(kvs[1]), Timestamp(4, 3));
+  ASSERT_EQ(1 + prev_reseek_count,
+            options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+  it.reset();
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) {
+  constexpr int kNumKeysPerFile = 128;
+  constexpr uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  const std::vector<std::string> read_timestamps_lb = {Timestamp(1, 0),
+                                                       Timestamp(1, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = 0; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    Slice read_ts_lb = read_timestamps_lb[i];
+    read_opts.timestamp = &read_ts;
+    read_opts.iter_start_ts = &read_ts_lb;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = 0;
+    for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
+      CheckIterEntry(it.get(), Key1(key), kTypeValue,
+                     "value" + std::to_string(i), write_timestamps[i]);
+      if (i > 0) {
+        it->Next();
+        CheckIterEntry(it.get(), Key1(key), kTypeValue,
+                       "value" + std::to_string(i - 1),
+                       write_timestamps[i - 1]);
+      }
+    }
+    size_t expected_count = kMaxKey + 1;
+    ASSERT_EQ(expected_count, count);
+  }
+  // Delete all keys@ts=5 and check iteration result with start ts set
+  {
+    std::string write_timestamp = Timestamp(5, 0);
+    WriteOptions write_opts;
+    for (uint64_t key = 0; key < kMaxKey + 1; ++key) {
+      Status s = db_->Delete(write_opts, Key1(key), write_timestamp);
+      ASSERT_OK(s);
+    }
+
+    std::string read_timestamp = Timestamp(6, 0);
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamp;
+    read_opts.timestamp = &read_ts;
+    std::string read_timestamp_lb = Timestamp(2, 0);
+    Slice read_ts_lb = read_timestamp_lb;
+    read_opts.iter_start_ts = &read_ts_lb;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = 0;
+    for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
+      CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(),
+                     write_timestamp);
+      // Skip key@ts=3 and land on tombstone key@ts=5
+      it->Next();
+    }
+    ASSERT_EQ(kMaxKey + 1, count);
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound) {
+  constexpr int kNumKeysPerFile = 128;
+  constexpr uint64_t kMaxKey = 1024;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                                     Timestamp(3, 0)};
+  const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+                                                    Timestamp(4, 0)};
+  const std::vector<std::string> read_timestamps_lb = {Timestamp(1, 0),
+                                                       Timestamp(1, 0)};
+  for (size_t i = 0; i < write_timestamps.size(); ++i) {
+    WriteOptions write_opts;
+    for (uint64_t key = 0; key <= kMaxKey; ++key) {
+      Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  }
+  for (size_t i = 0; i < read_timestamps.size(); ++i) {
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamps[i];
+    Slice read_ts_lb = read_timestamps_lb[i];
+    read_opts.timestamp = &read_ts;
+    read_opts.iter_start_ts = &read_ts_lb;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = 0;
+    for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+         it->Prev(), ++count, --key) {
+      CheckIterEntry(it.get(), Key1(key), kTypeValue, "value0",
+                     write_timestamps[0]);
+      if (i > 0) {
+        it->Prev();
+        CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1",
+                       write_timestamps[1]);
+      }
+    }
+    size_t expected_count = kMaxKey + 1;
+    ASSERT_EQ(expected_count, count);
+  }
+  // Delete all keys@ts=5 and check iteration result with start ts set
+  {
+    std::string write_timestamp = Timestamp(5, 0);
+    WriteOptions write_opts;
+    for (uint64_t key = 0; key < kMaxKey + 1; ++key) {
+      Status s = db_->Delete(write_opts, Key1(key), write_timestamp);
+      ASSERT_OK(s);
+    }
+
+    std::string read_timestamp = Timestamp(6, 0);
+    ReadOptions read_opts;
+    Slice read_ts = read_timestamp;
+    read_opts.timestamp = &read_ts;
+    std::string read_timestamp_lb = Timestamp(2, 0);
+    Slice read_ts_lb = read_timestamp_lb;
+    read_opts.iter_start_ts = &read_ts_lb;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    int count = 0;
+    uint64_t key = kMaxKey;
+    for (it->SeekForPrev(Key1(key)), key = kMaxKey; it->Valid();
+         it->Prev(), ++count, --key) {
+      CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1",
+                     Timestamp(3, 0));
+      it->Prev();
+      CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(),
+                     write_timestamp);
+    }
+    ASSERT_EQ(kMaxKey + 1, count);
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleBackwardIterateLowerTsBound) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  std::string ts_ub_buf = Timestamp(5, 0);
+  Slice ts_ub = ts_ub_buf;
+  std::string ts_lb_buf = Timestamp(1, 0);
+  Slice ts_lb = ts_lb_buf;
+
+  {
+    ReadOptions read_opts;
+    read_opts.timestamp = &ts_ub;
+    read_opts.iter_start_ts = &ts_lb;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    it->SeekToLast();
+    ASSERT_FALSE(it->Valid());
+    ASSERT_OK(it->status());
+
+    it->SeekForPrev("foo");
+    ASSERT_FALSE(it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Test iterate_upper_bound
+  ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(0, 0), "v0"));
+  ASSERT_OK(db_->SingleDelete(WriteOptions(), "a", Timestamp(1, 0)));
+
+  for (int i = 0; i < 5; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0),
+                       "v" + std::to_string(i)));
+  }
+
+  {
+    ReadOptions read_opts;
+    read_opts.timestamp = &ts_ub;
+    read_opts.iter_start_ts = &ts_lb;
+    std::string key_ub_str = "b";  // exclusive
+    Slice key_ub = key_ub_str;
+    read_opts.iterate_upper_bound = &key_ub;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    it->SeekToLast();
+    CheckIterEntry(it.get(), "a", kTypeSingleDeletion, Slice(),
+                   Timestamp(1, 0));
+
+    key_ub_str = "a";  // exclusive
+    key_ub = key_ub_str;
+    read_opts.iterate_upper_bound = &key_ub;
+    it.reset(db_->NewIterator(read_opts));
+    it->SeekToLast();
+    ASSERT_FALSE(it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound_Reseek) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_sequential_skip_in_iterations = 2;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(i, 0),
+                       "v" + std::to_string(i)));
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0),
+                       "v" + std::to_string(i)));
+  }
+
+  {
+    std::string ts_ub_buf = Timestamp(6, 0);
+    Slice ts_ub = ts_ub_buf;
+    std::string ts_lb_buf = Timestamp(4, 0);
+    Slice ts_lb = ts_lb_buf;
+
+    ReadOptions read_opts;
+    read_opts.timestamp = &ts_ub;
+    read_opts.iter_start_ts = &ts_lb;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    it->SeekToLast();
+    for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) {
+      CheckIterEntry(it.get(), "b", kTypeValue, "v" + std::to_string(4 + i),
+                     Timestamp(4 + i, 0));
+    }
+    for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) {
+      CheckIterEntry(it.get(), "a", kTypeValue, "v" + std::to_string(4 + i),
+                     Timestamp(4 + i, 0));
+    }
+  }
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  constexpr size_t kNumKeys = 16;
+  options.max_sequential_skip_in_iterations = kNumKeys / 2;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  // Insert kNumKeys
+  WriteOptions write_opts;
+  Status s;
+  for (size_t i = 0; i != kNumKeys; ++i) {
+    std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+    s = db_->Put(write_opts, "foo", ts, "value" + std::to_string(i));
+    ASSERT_OK(s);
+  }
+  {
+    ReadOptions read_opts;
+    std::string ts_str = Timestamp(1, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->SeekToFirst();
+    CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str);
+    ASSERT_EQ(
+        1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+    ts_str = Timestamp(kNumKeys, 0);
+    ts = ts_str;
+    read_opts.timestamp = &ts;
+    iter.reset(db_->NewIterator(read_opts));
+    iter->SeekToLast();
+    CheckIterUserEntry(iter.get(), "foo", kTypeValue,
+                       "value" + std::to_string(kNumKeys - 1), ts_str);
+    ASSERT_EQ(
+        2, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  constexpr size_t kNumKeys = 16;
+  options.max_sequential_skip_in_iterations = kNumKeys / 2;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  // Write kNumKeys + 1 keys
+  WriteOptions write_opts;
+  Status s;
+  for (size_t i = 0; i != kNumKeys; ++i) {
+    std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+    s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i));
+    ASSERT_OK(s);
+  }
+  {
+    std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
+    WriteBatch batch(0, 0, 0, kTimestampSize);
+    { ASSERT_OK(batch.Put("a", "new_value")); }
+    { ASSERT_OK(batch.Put("b", "new_value")); }
+    s = batch.UpdateTimestamps(
+        ts_str, [kTimestampSize](uint32_t) { return kTimestampSize; });
+    ASSERT_OK(s);
+    s = db_->Write(write_opts, &batch);
+    ASSERT_OK(s);
+  }
+  {
+    ReadOptions read_opts;
+    std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->Seek("a");
+    iter->Next();
+    CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str);
+    ASSERT_EQ(
+        1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) {
+  Options options = GetDefaultOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  constexpr size_t kNumKeys = 16;
+  options.max_sequential_skip_in_iterations = kNumKeys / 2;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  for (size_t i = 0; i < kNumKeys; ++i) {
+    std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+    WriteOptions write_opts;
+    Status s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i));
+    ASSERT_OK(s);
+  }
+  {
+    std::string ts = Timestamp(1, 0);
+    WriteOptions write_opts;
+    ASSERT_OK(db_->Put(write_opts, "a", ts, "value"));
+  }
+  {
+    ReadOptions read_opts;
+    std::string ts_str = Timestamp(1, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->SeekToLast();
+    iter->Prev();
+    CheckIterUserEntry(iter.get(), "a", kTypeValue, "value", ts_str);
+    ASSERT_EQ(
+        1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetWithFastLocalBloom) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  // Write any value
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+  ASSERT_OK(Flush());
+
+  // Read with MultiGet
+  ReadOptions read_opts;
+  Slice read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  size_t batch_size = 1;
+  std::vector<Slice> keys(batch_size);
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  std::vector<std::string> timestamps(batch_size);
+  keys[0] = "foo";
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+                timestamps.data(), statuses.data(), true);
+
+  ASSERT_OK(statuses[0]);
+  ASSERT_EQ(Timestamp(1, 0), timestamps[0]);
+  for (auto& elem : values) {
+    elem.Reset();
+  }
+
+  ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0)));
+  ts = Timestamp(3, 0);
+  read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+                timestamps.data(), statuses.data(), true);
+  ASSERT_TRUE(statuses[0].IsNotFound());
+  ASSERT_EQ(Timestamp(2, 0), timestamps[0]);
+
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithPrefix) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(5));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = false;
+  bbto.index_type = GetParam();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  // Write any value
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+  ASSERT_OK(Flush());
+
+  // Read with MultiGet
+  ReadOptions read_opts;
+  Slice read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  size_t batch_size = 1;
+  std::vector<Slice> keys(batch_size);
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  std::vector<std::string> timestamps(batch_size);
+  keys[0] = "foo";
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+                timestamps.data(), statuses.data(), true);
+
+  ASSERT_OK(statuses[0]);
+  ASSERT_EQ(Timestamp(1, 0), timestamps[0]);
+  for (auto& elem : values) {
+    elem.Reset();
+  }
+
+  ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0)));
+  // TODO re-enable after fixing a bug of kHashSearch
+  if (GetParam() != BlockBasedTableOptions::IndexType::kHashSearch) {
+    ASSERT_OK(Flush());
+  }
+
+  ts = Timestamp(3, 0);
+  read_ts = ts;
+  db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+                timestamps.data(), statuses.data(), true);
+  ASSERT_TRUE(statuses[0].IsNotFound());
+  ASSERT_EQ(Timestamp(2, 0), timestamps[0]);
+
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithMemBloomFilter) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(5));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = false;
+  bbto.index_type = GetParam();
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  // Write any value
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+  // Read with MultiGet
+  ts = Timestamp(2, 0);
+  Slice read_ts = ts;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  size_t batch_size = 1;
+  std::vector<Slice> keys(batch_size);
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  keys[0] = "foo";
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+                statuses.data());
+
+  ASSERT_OK(statuses[0]);
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = false;
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  // Write any value
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  // random data
+  for (int i = 0; i < 3; i++) {
+    auto key = std::to_string(i * 10);
+    auto value = std::to_string(i * 10);
+    Slice key_slice = key;
+    Slice value_slice = value;
+    ASSERT_OK(db_->Put(write_opts, key_slice, ts, value_slice));
+    ASSERT_OK(Flush());
+  }
+
+  // Make num_levels to 2 to do key range filtering of sst files
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+  ASSERT_OK(Flush());
+
+  // Read with MultiGet
+  ts = Timestamp(2, 0);
+  Slice read_ts = ts;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  size_t batch_size = 1;
+  std::vector<Slice> keys(batch_size);
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  keys[0] = "foo";
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+                statuses.data());
+
+  ASSERT_OK(statuses[0]);
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = false;
+  bbto.index_type = GetParam();
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+  ASSERT_OK(Flush());
+  // Read with MultiGet
+  ts = Timestamp(2, 0);
+  Slice read_ts = ts;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  size_t batch_size = 1;
+  std::vector<Slice> keys(batch_size);
+  std::vector<std::string> values(batch_size);
+  std::vector<std::string> timestamps(batch_size);
+  keys[0] = "foo";
+  ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+  std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
+  std::vector<Status> statuses =
+      db_->MultiGet(read_opts, cfhs, keys, &values, &timestamps);
+
+  ASSERT_OK(statuses[0]);
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringNext) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  constexpr size_t max_skippable_internal_keys = 2;
+  const size_t kNumKeys = max_skippable_internal_keys + 2;
+  WriteOptions write_opts;
+  Status s;
+  {
+    std::string ts = Timestamp(1, 0);
+    ASSERT_OK(db_->Put(write_opts, "a", ts, "value"));
+  }
+  for (size_t i = 0; i < kNumKeys; ++i) {
+    std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+    s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i));
+    ASSERT_OK(s);
+  }
+  {
+    ReadOptions read_opts;
+    read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
+    std::string ts_str = Timestamp(1, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->SeekToFirst();
+    iter->Next();
+    ASSERT_TRUE(iter->status().IsIncomplete());
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringPrev) {
+  Options options = GetDefaultOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  constexpr size_t max_skippable_internal_keys = 2;
+  const size_t kNumKeys = max_skippable_internal_keys + 2;
+  WriteOptions write_opts;
+  Status s;
+  {
+    std::string ts = Timestamp(1, 0);
+    ASSERT_OK(db_->Put(write_opts, "b", ts, "value"));
+  }
+  for (size_t i = 0; i < kNumKeys; ++i) {
+    std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+    s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i));
+    ASSERT_OK(s);
+  }
+  {
+    ReadOptions read_opts;
+    read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
+    std::string ts_str = Timestamp(1, 0);
+    Slice ts = ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->SeekToLast();
+    iter->Prev();
+    ASSERT_TRUE(iter->status().IsIncomplete());
+  }
+  Close();
+}
+
+// Create two L0, and compact them to a new L1. In this test, L1 is L_bottom.
+// Two L0s:
+//       f1                                  f2
+// <a, 1, kTypeValue>    <a, 3, kTypeDeletionWithTimestamp>...<b, 2, kTypeValue>
+// Since f2.smallest < f1.largest < f2.largest
+// f1 and f2 will be the inputs of a real compaction instead of trivial move.
+TEST_F(DBBasicTestWithTimestamp, CompactDeletionWithTimestampMarkerToBottom) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.num_levels = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+  ASSERT_OK(db_->Put(write_opts, "a", ts, "value0"));
+  ASSERT_OK(Flush());
+
+  ts = Timestamp(2, 0);
+  ASSERT_OK(db_->Put(write_opts, "b", ts, "value0"));
+  ts = Timestamp(3, 0);
+  ASSERT_OK(db_->Delete(write_opts, "a", ts));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ReadOptions read_opts;
+  ts = Timestamp(1, 0);
+  Slice read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value0", value);
+
+  ts = Timestamp(3, 0);
+  read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  std::string key_ts;
+  s = db_->Get(read_opts, "a", &value, &key_ts);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ(Timestamp(3, 0), key_ts);
+
+  // Time-travel to the past before deletion
+  ts = Timestamp(2, 0);
+  read_ts = ts;
+  read_opts.timestamp = &read_ts;
+  s = db_->Get(read_opts, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value0", value);
+  Close();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBBasicTestWithTimestampFilterPrefixSettings
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<
+          std::tuple<std::shared_ptr<const FilterPolicy>, bool, bool,
+                     std::shared_ptr<const SliceTransform>, bool, double,
+                     BlockBasedTableOptions::IndexType>> {
+ public:
+  DBBasicTestWithTimestampFilterPrefixSettings()
+      : DBBasicTestWithTimestampBase(
+            "db_basic_test_with_timestamp_filter_prefix") {}
+};
+
+TEST_P(DBBasicTestWithTimestampFilterPrefixSettings, GetAndMultiGet) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<0>(GetParam());
+  bbto.whole_key_filtering = std::get<1>(GetParam());
+  bbto.cache_index_and_filter_blocks = std::get<2>(GetParam());
+  bbto.index_type = std::get<6>(GetParam());
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.prefix_extractor = std::get<3>(GetParam());
+  options.memtable_whole_key_filtering = std::get<4>(GetParam());
+  options.memtable_prefix_bloom_size_ratio = std::get<5>(GetParam());
+
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  const int kMaxKey = 1000;
+
+  // Write any value
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+
+  int idx = 0;
+  for (; idx < kMaxKey / 4; idx++) {
+    ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+    ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+  }
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  for (; idx < kMaxKey / 2; idx++) {
+    ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+    ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+  }
+
+  ASSERT_OK(Flush());
+
+  for (; idx < kMaxKey; idx++) {
+    ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+    ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+  }
+
+  // Read with MultiGet
+  ReadOptions read_opts;
+  Slice read_ts = ts;
+  read_opts.timestamp = &read_ts;
+
+  for (idx = 0; idx < kMaxKey; idx++) {
+    size_t batch_size = 4;
+    std::vector<std::string> keys_str(batch_size);
+    std::vector<PinnableSlice> values(batch_size);
+    std::vector<Status> statuses(batch_size);
+    ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+
+    keys_str[0] = Key1(idx);
+    keys_str[1] = KeyWithPrefix("foo", idx);
+    keys_str[2] = Key1(kMaxKey + idx);
+    keys_str[3] = KeyWithPrefix("foo", kMaxKey + idx);
+
+    auto keys = ConvertStrToSlice(keys_str);
+
+    db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+                  statuses.data());
+
+    for (int i = 0; i < 2; i++) {
+      ASSERT_OK(statuses[i]);
+    }
+    for (int i = 2; i < 4; i++) {
+      ASSERT_TRUE(statuses[i].IsNotFound());
+    }
+
+    for (int i = 0; i < 2; i++) {
+      std::string value;
+      ASSERT_OK(db_->Get(read_opts, keys[i], &value));
+      std::unique_ptr<Iterator> it1(db_->NewIterator(read_opts));
+      ASSERT_NE(nullptr, it1);
+      ASSERT_OK(it1->status());
+      it1->Seek(keys[i]);
+      ASSERT_TRUE(it1->Valid());
+    }
+
+    for (int i = 2; i < 4; i++) {
+      std::string value;
+      Status s = db_->Get(read_opts, keys[i], &value);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+  }
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    Timestamp, DBBasicTestWithTimestampFilterPrefixSettings,
+    ::testing::Combine(
+        ::testing::Values(
+            std::shared_ptr<const FilterPolicy>(nullptr),
+            std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10, true)),
+            std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10,
+                                                                     false))),
+        ::testing::Bool(), ::testing::Bool(),
+        ::testing::Values(
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+        ::testing::Bool(), ::testing::Values(0, 0.1),
+        ::testing::Values(
+            BlockBasedTableOptions::IndexType::kBinarySearch,
+            BlockBasedTableOptions::IndexType::kHashSearch,
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class DataVisibilityTest : public DBBasicTestWithTimestampBase {
+ public:
+  DataVisibilityTest() : DBBasicTestWithTimestampBase("data_visibility_test") {
+    // Initialize test data
+    for (int i = 0; i < kTestDataSize; i++) {
+      test_data_[i].key = "key" + std::to_string(i);
+      test_data_[i].value = "value" + std::to_string(i);
+      test_data_[i].timestamp = Timestamp(i, 0);
+      test_data_[i].ts = i;
+      test_data_[i].seq_num = kMaxSequenceNumber;
+    }
+  }
+
+ protected:
+  struct TestData {
+    std::string key;
+    std::string value;
+    int ts;
+    std::string timestamp;
+    SequenceNumber seq_num;
+  };
+
+  constexpr static int kTestDataSize = 3;
+  TestData test_data_[kTestDataSize];
+
+  void PutTestData(int index, ColumnFamilyHandle* cfh = nullptr) {
+    ASSERT_LE(index, kTestDataSize);
+    WriteOptions write_opts;
+
+    if (cfh == nullptr) {
+      ASSERT_OK(db_->Put(write_opts, test_data_[index].key,
+                         test_data_[index].timestamp, test_data_[index].value));
+      const Snapshot* snap = db_->GetSnapshot();
+      test_data_[index].seq_num = snap->GetSequenceNumber();
+      if (index > 0) {
+        ASSERT_GT(test_data_[index].seq_num, test_data_[index - 1].seq_num);
+      }
+      db_->ReleaseSnapshot(snap);
+    } else {
+      ASSERT_OK(db_->Put(write_opts, cfh, test_data_[index].key,
+                         test_data_[index].timestamp, test_data_[index].value));
+    }
+  }
+
+  void AssertVisibility(int ts, SequenceNumber seq,
+                        std::vector<Status> statuses) {
+    ASSERT_EQ(kTestDataSize, statuses.size());
+    for (int i = 0; i < kTestDataSize; i++) {
+      if (test_data_[i].seq_num <= seq && test_data_[i].ts <= ts) {
+        ASSERT_OK(statuses[i]);
+      } else {
+        ASSERT_TRUE(statuses[i].IsNotFound());
+      }
+    }
+  }
+
+  std::vector<Slice> GetKeys() {
+    std::vector<Slice> ret(kTestDataSize);
+    for (int i = 0; i < kTestDataSize; i++) {
+      ret[i] = test_data_[i].key;
+    }
+    return ret;
+  }
+
+  void VerifyDefaultCF(int ts, const Snapshot* snap = nullptr) {
+    ReadOptions read_opts;
+    std::string read_ts = Timestamp(ts, 0);
+    Slice read_ts_slice = read_ts;
+    read_opts.timestamp = &read_ts_slice;
+    read_opts.snapshot = snap;
+
+    ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+    std::vector<ColumnFamilyHandle*> cfs(kTestDataSize, cfh);
+    SequenceNumber seq =
+        snap ? snap->GetSequenceNumber() : kMaxSequenceNumber - 1;
+
+    // There're several MultiGet interfaces with not exactly the same
+    // implementations, query data with all of them.
+    auto keys = GetKeys();
+    std::vector<std::string> values;
+    auto s1 = db_->MultiGet(read_opts, cfs, keys, &values);
+    AssertVisibility(ts, seq, s1);
+
+    auto s2 = db_->MultiGet(read_opts, keys, &values);
+    AssertVisibility(ts, seq, s2);
+
+    std::vector<std::string> timestamps;
+    auto s3 = db_->MultiGet(read_opts, cfs, keys, &values, &timestamps);
+    AssertVisibility(ts, seq, s3);
+
+    auto s4 = db_->MultiGet(read_opts, keys, &values, &timestamps);
+    AssertVisibility(ts, seq, s4);
+
+    std::vector<PinnableSlice> values_ps5(kTestDataSize);
+    std::vector<Status> s5(kTestDataSize);
+    db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps5.data(),
+                  s5.data());
+    AssertVisibility(ts, seq, s5);
+
+    std::vector<PinnableSlice> values_ps6(kTestDataSize);
+    std::vector<Status> s6(kTestDataSize);
+    std::vector<std::string> timestamps_array(kTestDataSize);
+    db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps6.data(),
+                  timestamps_array.data(), s6.data());
+    AssertVisibility(ts, seq, s6);
+
+    std::vector<PinnableSlice> values_ps7(kTestDataSize);
+    std::vector<Status> s7(kTestDataSize);
+    db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
+                  values_ps7.data(), s7.data());
+    AssertVisibility(ts, seq, s7);
+
+    std::vector<PinnableSlice> values_ps8(kTestDataSize);
+    std::vector<Status> s8(kTestDataSize);
+    db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
+                  values_ps8.data(), timestamps_array.data(), s8.data());
+    AssertVisibility(ts, seq, s8);
+  }
+
+  void VerifyDefaultCF(const Snapshot* snap = nullptr) {
+    for (int i = 0; i <= kTestDataSize; i++) {
+      VerifyDefaultCF(i, snap);
+    }
+  }
+};
+constexpr int DataVisibilityTest::kTestDataSize;
+
+// Application specifies timestamp but not snapshot.
+//           reader              writer
+//                               ts'=90
+//           ts=100
+//           seq=10
+//                               seq'=11
+//                               write finishes
+//         GetImpl(ts,seq)
+// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
+// the key should not be returned.
+TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot1) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::GetImpl:3",
+       "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut"},
+      {"DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut",
+       "DBImpl::GetImpl:4"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    std::string write_ts = Timestamp(1, 0);
+    WriteOptions write_opts;
+    TEST_SYNC_POINT(
+        "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut");
+    Status s = db_->Put(write_opts, "foo", write_ts, "value");
+    ASSERT_OK(s);
+    TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut");
+  });
+  ReadOptions read_opts;
+  std::string read_ts_str = Timestamp(3, 0);
+  Slice read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "foo", &value);
+
+  writer_thread.join();
+  ASSERT_TRUE(s.IsNotFound());
+  Close();
+}
+
+// Application specifies timestamp but not snapshot.
+//           reader              writer
+//                               ts'=90
+//           ts=100
+//           seq=10
+//                               seq'=11
+//                               write finishes
+//                               Flush
+//         GetImpl(ts,seq)
+// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
+// the key should not be returned.
+TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot2) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::GetImpl:3",
+       "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut"},
+      {"DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut",
+       "DBImpl::GetImpl:4"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    std::string write_ts = Timestamp(1, 0);
+    WriteOptions write_opts;
+    TEST_SYNC_POINT(
+        "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut");
+    Status s = db_->Put(write_opts, "foo", write_ts, "value");
+    ASSERT_OK(s);
+    ASSERT_OK(Flush());
+
+    write_ts = Timestamp(2, 0);
+    s = db_->Put(write_opts, "bar", write_ts, "value");
+    ASSERT_OK(s);
+    TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut");
+  });
+  ReadOptions read_opts;
+  std::string read_ts_str = Timestamp(3, 0);
+  Slice read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "foo", &value);
+  writer_thread.join();
+  ASSERT_TRUE(s.IsNotFound());
+  Close();
+}
+
+// Application specifies both timestamp and snapshot.
+//       reader               writer
+//       seq=10
+//                            ts'=90
+//       ts=100
+//                            seq'=11
+//                            write finishes
+//       GetImpl(ts,seq)
+// Since application specifies both timestamp and snapshot, application expects
+// to see data that visible in BOTH timestamp and sequence number. Therefore,
+// <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot1) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap",
+       "DataVisibilityTest::PointLookupWithSnapshot1:BeforePut"},
+      {"DataVisibilityTest::PointLookupWithSnapshot1:AfterPut",
+       "DBImpl::GetImpl:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    std::string write_ts = Timestamp(1, 0);
+    WriteOptions write_opts;
+    TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:BeforePut");
+    Status s = db_->Put(write_opts, "foo", write_ts, "value");
+    TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:AfterPut");
+    ASSERT_OK(s);
+  });
+  ReadOptions read_opts;
+  const Snapshot* snap = db_->GetSnapshot();
+  TEST_SYNC_POINT(
+      "DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap");
+  read_opts.snapshot = snap;
+  std::string read_ts_str = Timestamp(3, 0);
+  Slice read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "foo", &value);
+  writer_thread.join();
+
+  ASSERT_TRUE(s.IsNotFound());
+
+  db_->ReleaseSnapshot(snap);
+  Close();
+}
+
+// Application specifies both timestamp and snapshot.
+//       reader               writer
+//       seq=10
+//                            ts'=90
+//       ts=100
+//                            seq'=11
+//                            write finishes
+//                            Flush
+//       GetImpl(ts,seq)
+// Since application specifies both timestamp and snapshot, application expects
+// to see data that visible in BOTH timestamp and sequence number. Therefore,
+// <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot2) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap",
+       "DataVisibilityTest::PointLookupWithSnapshot2:BeforePut"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    std::string write_ts = Timestamp(1, 0);
+    WriteOptions write_opts;
+    TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot2:BeforePut");
+    Status s = db_->Put(write_opts, "foo", write_ts, "value1");
+    ASSERT_OK(s);
+    ASSERT_OK(Flush());
+
+    write_ts = Timestamp(2, 0);
+    s = db_->Put(write_opts, "bar", write_ts, "value2");
+    ASSERT_OK(s);
+  });
+  const Snapshot* snap = db_->GetSnapshot();
+  TEST_SYNC_POINT(
+      "DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap");
+  writer_thread.join();
+  std::string read_ts_str = Timestamp(3, 0);
+  Slice read_ts = read_ts_str;
+  ReadOptions read_opts;
+  read_opts.snapshot = snap;
+  read_opts.timestamp = &read_ts;
+  std::string value;
+  Status s = db_->Get(read_opts, "foo", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  db_->ReleaseSnapshot(snap);
+  Close();
+}
+
+// Application specifies timestamp but not snapshot.
+//      reader                writer
+//                            ts'=90
+//      ts=100
+//      seq=10
+//                            seq'=11
+//                            write finishes
+//      scan(ts,seq)
+// <k, t1, s1> can be seen in scan as long as ts>=t1 AND seq>=s1. If ts>=t1 but
+// seq<s1, then the key should not be returned.
+TEST_F(DataVisibilityTest, RangeScanWithoutSnapshot) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::NewIterator:3",
+       "DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    WriteOptions write_opts;
+    TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut");
+    for (int i = 0; i < 3; ++i) {
+      std::string write_ts = Timestamp(i + 1, 0);
+      Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  });
+  std::string read_ts_str = Timestamp(10, 0);
+  Slice read_ts = read_ts_str;
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  read_opts.timestamp = &read_ts;
+  Iterator* it = db_->NewIterator(read_opts);
+  ASSERT_NE(nullptr, it);
+  writer_thread.join();
+  it->SeekToFirst();
+  ASSERT_FALSE(it->Valid());
+  delete it;
+  Close();
+}
+
+// Application specifies both timestamp and snapshot.
+//       reader         writer
+//       seq=10
+//                      ts'=90
+//       ts=100         seq'=11
+//                      write finishes
+//       scan(ts,seq)
+// <k, t1, s1> can be seen by the scan only if t1<=ts AND s1<=seq. If t1<=ts
+// but s1>seq, then the key should not be returned.
+TEST_F(DataVisibilityTest, RangeScanWithSnapshot) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot",
+       "DataVisibilityTest::RangeScanWithSnapshot:BeforePut"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    WriteOptions write_opts;
+    TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithSnapshot:BeforePut");
+    for (int i = 0; i < 3; ++i) {
+      std::string write_ts = Timestamp(i + 1, 0);
+      Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
+                          "value" + std::to_string(i));
+      ASSERT_OK(s);
+    }
+  });
+  const Snapshot* snap = db_->GetSnapshot();
+  TEST_SYNC_POINT(
+      "DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot");
+
+  writer_thread.join();
+
+  std::string read_ts_str = Timestamp(10, 0);
+  Slice read_ts = read_ts_str;
+  ReadOptions read_opts;
+  read_opts.snapshot = snap;
+  read_opts.total_order_seek = true;
+  read_opts.timestamp = &read_ts;
+  Iterator* it = db_->NewIterator(read_opts);
+  ASSERT_NE(nullptr, it);
+  it->Seek("key0");
+  ASSERT_FALSE(it->Valid());
+
+  delete it;
+  db_->ReleaseSnapshot(snap);
+  Close();
+}
+
+// Application specifies both timestamp and snapshot.
+// Query each combination and make sure for MultiGet key <k, t1, s1>, only
+// return keys that ts>=t1 AND seq>=s1.
+TEST_F(DataVisibilityTest, MultiGetWithTimestamp) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  const Snapshot* snap0 = db_->GetSnapshot();
+  PutTestData(0);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+
+  const Snapshot* snap1 = db_->GetSnapshot();
+  PutTestData(1);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+  VerifyDefaultCF(snap1);
+
+  ASSERT_OK(Flush());
+
+  const Snapshot* snap2 = db_->GetSnapshot();
+  PutTestData(2);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+  VerifyDefaultCF(snap1);
+  VerifyDefaultCF(snap2);
+
+  db_->ReleaseSnapshot(snap0);
+  db_->ReleaseSnapshot(snap1);
+  db_->ReleaseSnapshot(snap2);
+
+  Close();
+}
+
+// Application specifies timestamp but not snapshot.
+//           reader              writer
+//                               ts'=0, 1
+//           ts=3
+//           seq=10
+//                               seq'=11, 12
+//                               write finishes
+//         MultiGet(ts,seq)
+// For MultiGet <k, t1, s1>, only return keys that ts>=t1 AND seq>=s1.
+TEST_F(DataVisibilityTest, MultiGetWithoutSnapshot) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::MultiGet:AfterGetSeqNum1",
+       "DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"},
+      {"DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut",
+       "DBImpl::MultiGet:AfterGetSeqNum2"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer_thread([this]() {
+    TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut");
+    PutTestData(0);
+    PutTestData(1);
+    TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut");
+  });
+
+  ReadOptions read_opts;
+  std::string read_ts = Timestamp(kTestDataSize, 0);
+  Slice read_ts_slice = read_ts;
+  read_opts.timestamp = &read_ts_slice;
+  auto keys = GetKeys();
+  std::vector<std::string> values;
+  auto ss = db_->MultiGet(read_opts, keys, &values);
+
+  writer_thread.join();
+  for (auto s : ss) {
+    ASSERT_TRUE(s.IsNotFound());
+  }
+  VerifyDefaultCF();
+  Close();
+}
+
+TEST_F(DataVisibilityTest, MultiGetCrossCF) {
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  CreateAndReopenWithCF({"second"}, options);
+  ColumnFamilyHandle* second_cf = handles_[1];
+
+  const Snapshot* snap0 = db_->GetSnapshot();
+  PutTestData(0);
+  PutTestData(0, second_cf);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+
+  const Snapshot* snap1 = db_->GetSnapshot();
+  PutTestData(1);
+  PutTestData(1, second_cf);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+  VerifyDefaultCF(snap1);
+
+  ASSERT_OK(Flush());
+
+  const Snapshot* snap2 = db_->GetSnapshot();
+  PutTestData(2);
+  PutTestData(2, second_cf);
+  VerifyDefaultCF();
+  VerifyDefaultCF(snap0);
+  VerifyDefaultCF(snap1);
+  VerifyDefaultCF(snap2);
+
+  ReadOptions read_opts;
+  std::string read_ts = Timestamp(kTestDataSize, 0);
+  Slice read_ts_slice = read_ts;
+  read_opts.timestamp = &read_ts_slice;
+  read_opts.snapshot = snap1;
+  auto keys = GetKeys();
+  auto keys2 = GetKeys();
+  keys.insert(keys.end(), keys2.begin(), keys2.end());
+  std::vector<ColumnFamilyHandle*> cfs(kTestDataSize,
+                                       db_->DefaultColumnFamily());
+  std::vector<ColumnFamilyHandle*> cfs2(kTestDataSize, second_cf);
+  cfs.insert(cfs.end(), cfs2.begin(), cfs2.end());
+
+  std::vector<std::string> values;
+  auto ss = db_->MultiGet(read_opts, cfs, keys, &values);
+  for (int i = 0; i < 2 * kTestDataSize; i++) {
+    if (i % 3 == 0) {
+      // only the first key for each column family should be returned
+      ASSERT_OK(ss[i]);
+    } else {
+      ASSERT_TRUE(ss[i].IsNotFound());
+    }
+  }
+
+  db_->ReleaseSnapshot(snap0);
+  db_->ReleaseSnapshot(snap1);
+  db_->ReleaseSnapshot(snap2);
+  Close();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBBasicTestWithTimestampCompressionSettings
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<
+          std::tuple<std::shared_ptr<const FilterPolicy>, CompressionType,
+                     uint32_t, uint32_t>> {
+ public:
+  DBBasicTestWithTimestampCompressionSettings()
+      : DBBasicTestWithTimestampBase(
+            "db_basic_test_with_timestamp_compression") {}
+};
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGet) {
+  const int kNumKeysPerFile = 1024;
+  const size_t kNumTimestamps = 4;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  size_t ts_sz = Timestamp(0, 0).size();
+  TestComparator test_cmp(ts_sz);
+  options.comparator = &test_cmp;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<0>(GetParam());
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400  // r124+
+  if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+    return;
+  }
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (!ZSTD_Supported() && comp_type == kZSTD) {
+    return;
+  }
+  if (!Zlib_Supported() && comp_type == kZlibCompression) {
+    return;
+  }
+
+  options.compression = comp_type;
+  options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+  if (comp_type == kZSTD) {
+    options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+  }
+  options.compression_opts.parallel_threads = std::get<3>(GetParam());
+  options.target_file_size_base = 1 << 26;  // 64MB
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  std::vector<std::string> write_ts_list;
+  std::vector<std::string> read_ts_list;
+
+  for (size_t i = 0; i != kNumTimestamps; ++i) {
+    write_ts_list.push_back(Timestamp(i * 2, 0));
+    read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+    const Slice write_ts = write_ts_list.back();
+    WriteOptions wopts;
+    for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+      for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+        ASSERT_OK(
+            db_->Put(wopts, handles_[cf], Key1(j), write_ts,
+                     "value_" + std::to_string(j) + "_" + std::to_string(i)));
+      }
+    }
+  }
+  const auto& verify_db_func = [&]() {
+    for (size_t i = 0; i != kNumTimestamps; ++i) {
+      ReadOptions ropts;
+      const Slice read_ts = read_ts_list[i];
+      ropts.timestamp = &read_ts;
+      for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+        ColumnFamilyHandle* cfh = handles_[cf];
+        for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+          std::string value;
+          ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value));
+          ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+                    value);
+        }
+      }
+    }
+  };
+  verify_db_func();
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutDeleteGet) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  const int kNumKeysPerFile = 1024;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<0>(GetParam());
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400  // r124+
+  if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+    return;
+  }
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (!ZSTD_Supported() && comp_type == kZSTD) {
+    return;
+  }
+  if (!Zlib_Supported() && comp_type == kZlibCompression) {
+    return;
+  }
+
+  options.compression = comp_type;
+  options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+  if (comp_type == kZSTD) {
+    options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+  }
+  options.compression_opts.parallel_threads = std::get<3>(GetParam());
+  options.target_file_size_base = 1 << 26;  // 64MB
+
+  DestroyAndReopen(options);
+
+  const size_t kNumL0Files =
+      static_cast<size_t>(Options().level0_file_num_compaction_trigger);
+  {
+    // Half of the keys will go through Deletion and remaining half with
+    // SingleDeletion. Generate enough L0 files with ts=1 to trigger compaction
+    // to L1
+    std::string ts = Timestamp(1, 0);
+    WriteOptions wopts;
+    for (size_t i = 0; i < kNumL0Files; ++i) {
+      for (int j = 0; j < kNumKeysPerFile; ++j) {
+        ASSERT_OK(db_->Put(wopts, Key1(j), ts, "value" + std::to_string(i)));
+      }
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Generate another L0 at ts=3
+    ts = Timestamp(3, 0);
+    for (int i = 0; i < kNumKeysPerFile; ++i) {
+      std::string key_str = Key1(i);
+      Slice key(key_str);
+      if ((i % 3) == 0) {
+        if (i < kNumKeysPerFile / 2) {
+          ASSERT_OK(db_->Delete(wopts, key, ts));
+        } else {
+          ASSERT_OK(db_->SingleDelete(wopts, key, ts));
+        }
+      } else {
+        ASSERT_OK(db_->Put(wopts, key, ts, "new_value"));
+      }
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    // Populate memtable at ts=5
+    ts = Timestamp(5, 0);
+    for (int i = 0; i != kNumKeysPerFile; ++i) {
+      std::string key_str = Key1(i);
+      Slice key(key_str);
+      if ((i % 3) == 1) {
+        if (i < kNumKeysPerFile / 2) {
+          ASSERT_OK(db_->Delete(wopts, key, ts));
+        } else {
+          ASSERT_OK(db_->SingleDelete(wopts, key, ts));
+        }
+      } else if ((i % 3) == 2) {
+        ASSERT_OK(db_->Put(wopts, key, ts, "new_value_2"));
+      }
+    }
+  }
+  {
+    std::string ts_str = Timestamp(6, 0);
+    Slice ts = ts_str;
+    ReadOptions ropts;
+    ropts.timestamp = &ts;
+    for (uint64_t i = 0; i != static_cast<uint64_t>(kNumKeysPerFile); ++i) {
+      std::string value;
+      std::string key_ts;
+      Status s = db_->Get(ropts, Key1(i), &value, &key_ts);
+      if ((i % 3) == 2) {
+        ASSERT_OK(s);
+        ASSERT_EQ("new_value_2", value);
+        ASSERT_EQ(Timestamp(5, 0), key_ts);
+      } else if ((i % 3) == 1) {
+        ASSERT_TRUE(s.IsNotFound());
+        ASSERT_EQ(Timestamp(5, 0), key_ts);
+      } else {
+        ASSERT_TRUE(s.IsNotFound());
+        ASSERT_EQ(Timestamp(3, 0), key_ts);
+      }
+    }
+  }
+}
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+  FlushedFileCollector() {}
+  ~FlushedFileCollector() override {}
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    InstrumentedMutexLock lock(&mutex_);
+    flushed_files_.push_back(info.file_path);
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::vector<std::string> result;
+    {
+      InstrumentedMutexLock lock(&mutex_);
+      result = flushed_files_;
+    }
+    return result;
+  }
+
+  void ClearFlushedFiles() {
+    InstrumentedMutexLock lock(&mutex_);
+    flushed_files_.clear();
+  }
+
+ private:
+  std::vector<std::string> flushed_files_;
+  InstrumentedMutex mutex_;
+};
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGetWithCompaction) {
+  const int kNumKeysPerFile = 1024;
+  const size_t kNumTimestamps = 2;
+  const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+  const size_t kSplitPosBase = kNumKeysPerTimestamp / 2;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  size_t ts_sz = Timestamp(0, 0).size();
+  TestComparator test_cmp(ts_sz);
+  options.comparator = &test_cmp;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<0>(GetParam());
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400  // r124+
+  if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+    return;
+  }
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (!ZSTD_Supported() && comp_type == kZSTD) {
+    return;
+  }
+  if (!Zlib_Supported() && comp_type == kZlibCompression) {
+    return;
+  }
+
+  options.compression = comp_type;
+  options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+  if (comp_type == kZSTD) {
+    options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+  }
+  options.compression_opts.parallel_threads = std::get<3>(GetParam());
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  std::vector<std::string> write_ts_list;
+  std::vector<std::string> read_ts_list;
+
+  const auto& verify_records_func = [&](size_t i, size_t begin, size_t end,
+                                        ColumnFamilyHandle* cfh) {
+    std::string value;
+    std::string timestamp;
+
+    ReadOptions ropts;
+    const Slice read_ts = read_ts_list[i];
+    ropts.timestamp = &read_ts;
+    std::string expected_timestamp =
+        std::string(write_ts_list[i].data(), write_ts_list[i].size());
+
+    for (size_t j = begin; j <= end; ++j) {
+      ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value, &timestamp));
+      ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), value);
+      ASSERT_EQ(expected_timestamp, timestamp);
+    }
+  };
+
+  for (size_t i = 0; i != kNumTimestamps; ++i) {
+    write_ts_list.push_back(Timestamp(i * 2, 0));
+    read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+    const Slice write_ts = write_ts_list.back();
+    WriteOptions wopts;
+    for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+      size_t memtable_get_start = 0;
+      for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+        ASSERT_OK(
+            db_->Put(wopts, handles_[cf], Key1(j), write_ts,
+                     "value_" + std::to_string(j) + "_" + std::to_string(i)));
+        if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) {
+          verify_records_func(i, memtable_get_start, j, handles_[cf]);
+          memtable_get_start = j + 1;
+
+          // flush all keys with the same timestamp to two sst files, split at
+          // incremental positions such that lowerlevel[1].smallest.userkey ==
+          // higherlevel[0].largest.userkey
+          ASSERT_OK(Flush(cf));
+          ASSERT_OK(dbfull()->TEST_WaitForCompact());  // wait for flush (which
+                                                       // is also a compaction)
+
+          // compact files (2 at each level) to a lower level such that all
+          // keys with the same timestamp is at one level, with newer versions
+          // at higher levels.
+          CompactionOptions compact_opt;
+          compact_opt.compression = kNoCompression;
+          ASSERT_OK(db_->CompactFiles(compact_opt, handles_[cf],
+                                      collector->GetFlushedFiles(),
+                                      static_cast<int>(kNumTimestamps - i)));
+          collector->ClearFlushedFiles();
+        }
+      }
+    }
+  }
+  const auto& verify_db_func = [&]() {
+    for (size_t i = 0; i != kNumTimestamps; ++i) {
+      ReadOptions ropts;
+      const Slice read_ts = read_ts_list[i];
+      ropts.timestamp = &read_ts;
+      std::string expected_timestamp(write_ts_list[i].data(),
+                                     write_ts_list[i].size());
+      for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+        ColumnFamilyHandle* cfh = handles_[cf];
+        verify_records_func(i, 0, kNumKeysPerTimestamp - 1, cfh);
+      }
+    }
+  };
+  verify_db_func();
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) {
+  const int kNumKeysPerFile = 8192;
+  const size_t kNumTimestamps = 2;
+  const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  options.memtable_prefix_bloom_size_ratio = 0.1;
+  options.memtable_whole_key_filtering = true;
+
+  size_t ts_sz = Timestamp(0, 0).size();
+  TestComparator test_cmp(ts_sz);
+  options.comparator = &test_cmp;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(
+      10 /*bits_per_key*/, false /*use_block_based_builder*/));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  size_t num_cfs = handles_.size();
+  ASSERT_EQ(2, num_cfs);
+  std::vector<std::string> write_ts_list;
+  std::vector<std::string> read_ts_list;
+
+  const auto& verify_records_func = [&](size_t i, ColumnFamilyHandle* cfh) {
+    std::vector<Slice> keys;
+    std::vector<std::string> key_vals;
+    std::vector<std::string> values;
+    std::vector<std::string> timestamps;
+
+    for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+      key_vals.push_back(Key1(j));
+    }
+    for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+      keys.push_back(key_vals[j]);
+    }
+
+    ReadOptions ropts;
+    const Slice read_ts = read_ts_list[i];
+    ropts.timestamp = &read_ts;
+    std::string expected_timestamp(write_ts_list[i].data(),
+                                   write_ts_list[i].size());
+
+    std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
+    std::vector<Status> statuses =
+        db_->MultiGet(ropts, cfhs, keys, &values, &timestamps);
+    for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+      ASSERT_OK(statuses[j]);
+      ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+                values[j]);
+      ASSERT_EQ(expected_timestamp, timestamps[j]);
+    }
+  };
+
+  const std::string dummy_ts(ts_sz, '\0');
+  for (size_t i = 0; i != kNumTimestamps; ++i) {
+    write_ts_list.push_back(Timestamp(i * 2, 0));
+    read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+    const Slice& write_ts = write_ts_list.back();
+    for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+      WriteOptions wopts;
+      WriteBatch batch(0, 0, 0, ts_sz);
+      for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+        const std::string key = Key1(j);
+        const std::string value =
+            "value_" + std::to_string(j) + "_" + std::to_string(i);
+        ASSERT_OK(batch.Put(handles_[cf], key, value));
+      }
+      ASSERT_OK(batch.UpdateTimestamps(write_ts,
+                                       [ts_sz](uint32_t) { return ts_sz; }));
+      ASSERT_OK(db_->Write(wopts, &batch));
+
+      verify_records_func(i, handles_[cf]);
+
+      ASSERT_OK(Flush(cf));
+    }
+  }
+
+  const auto& verify_db_func = [&]() {
+    for (size_t i = 0; i != kNumTimestamps; ++i) {
+      ReadOptions ropts;
+      const Slice read_ts = read_ts_list[i];
+      ropts.timestamp = &read_ts;
+      for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+        ColumnFamilyHandle* cfh = handles_[cf];
+        verify_records_func(i, cfh);
+      }
+    }
+  };
+  verify_db_func();
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetNoReturnTs) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  std::string ts = Timestamp(1, 0);
+  ASSERT_OK(db_->Put(write_opts, "foo", ts, "value"));
+  ASSERT_OK(db_->Put(write_opts, "bar", ts, "value"));
+  ASSERT_OK(db_->Put(write_opts, "fooxxxxxxxxxxxxxxxx", ts, "value"));
+  ASSERT_OK(db_->Put(write_opts, "barxxxxxxxxxxxxxxxx", ts, "value"));
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  ts = Timestamp(2, 0);
+  Slice read_ts = ts;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  {
+    ColumnFamilyHandle* column_families[] = {cfh, cfh};
+    Slice keys[] = {"foo", "bar"};
+    PinnableSlice values[] = {PinnableSlice(), PinnableSlice()};
+    Status statuses[] = {Status::OK(), Status::OK()};
+    dbfull()->MultiGet(read_opts, /*num_keys=*/2, &column_families[0], &keys[0],
+                       &values[0], &statuses[0], /*sorted_input=*/false);
+    for (const auto& s : statuses) {
+      ASSERT_OK(s);
+    }
+  }
+  {
+    ColumnFamilyHandle* column_families[] = {cfh, cfh, cfh, cfh};
+    // Make user keys longer than configured timestamp size (16 bytes) to
+    // verify RocksDB does not use the trailing bytes 'x' as timestamp.
+    Slice keys[] = {"fooxxxxxxxxxxxxxxxx", "barxxxxxxxxxxxxxxxx", "foo", "bar"};
+    PinnableSlice values[] = {PinnableSlice(), PinnableSlice(), PinnableSlice(),
+                              PinnableSlice()};
+    Status statuses[] = {Status::OK(), Status::OK(), Status::OK(),
+                         Status::OK()};
+    dbfull()->MultiGet(read_opts, /*num_keys=*/4, &column_families[0], &keys[0],
+                       &values[0], &statuses[0], /*sorted_input=*/false);
+    for (const auto& s : statuses) {
+      ASSERT_OK(s);
+    }
+  }
+  Close();
+}
+
+#endif  // !ROCKSDB_LITE
+
+INSTANTIATE_TEST_CASE_P(
+    Timestamp, DBBasicTestWithTimestampCompressionSettings,
+    ::testing::Combine(
+        ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+                          std::shared_ptr<const FilterPolicy>(
+                              NewBloomFilterPolicy(10, false))),
+        ::testing::Values(kNoCompression, kZlibCompression, kLZ4Compression,
+                          kLZ4HCCompression, kZSTD),
+        ::testing::Values(0, 1 << 14), ::testing::Values(1, 4)));
+
+class DBBasicTestWithTimestampPrefixSeek
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<
+          std::tuple<std::shared_ptr<const SliceTransform>,
+                     std::shared_ptr<const FilterPolicy>, bool,
+                     BlockBasedTableOptions::IndexType>> {
+ public:
+  DBBasicTestWithTimestampPrefixSeek()
+      : DBBasicTestWithTimestampBase(
+            "/db_basic_test_with_timestamp_prefix_seek") {}
+};
+
+TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) {
+  const size_t kNumKeysPerFile = 128;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.prefix_extractor = std::get<0>(GetParam());
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<1>(GetParam());
+  bbto.index_type = std::get<3>(GetParam());
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  const uint64_t kMaxKey = 0xffffffffffffffff;
+  const uint64_t kMinKey = 0xfffffffffffff000;
+  const std::vector<std::string> write_ts_list = {Timestamp(3, 0xffffffff),
+                                                  Timestamp(6, 0xffffffff)};
+  WriteOptions write_opts;
+  {
+    for (size_t i = 0; i != write_ts_list.size(); ++i) {
+      for (uint64_t key = kMaxKey; key >= kMinKey; --key) {
+        Status s = db_->Put(write_opts, Key1(key), write_ts_list[i],
+                            "value" + std::to_string(i));
+        ASSERT_OK(s);
+      }
+    }
+  }
+  const std::vector<std::string> read_ts_list = {Timestamp(5, 0xffffffff),
+                                                 Timestamp(9, 0xffffffff)};
+  {
+    ReadOptions read_opts;
+    read_opts.total_order_seek = false;
+    read_opts.prefix_same_as_start = std::get<2>(GetParam());
+    fprintf(stdout, "%s %s %d\n", options.prefix_extractor->Name(),
+            bbto.filter_policy ? bbto.filter_policy->Name() : "null",
+            static_cast<int>(read_opts.prefix_same_as_start));
+    for (size_t i = 0; i != read_ts_list.size(); ++i) {
+      Slice read_ts = read_ts_list[i];
+      read_opts.timestamp = &read_ts;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+
+      // Seek to kMaxKey
+      iter->Seek(Key1(kMaxKey));
+      CheckIterUserEntry(iter.get(), Key1(kMaxKey), kTypeValue,
+                         "value" + std::to_string(i), write_ts_list[i]);
+      iter->Next();
+      ASSERT_FALSE(iter->Valid());
+
+      // Seek to kMinKey
+      iter->Seek(Key1(kMinKey));
+      CheckIterUserEntry(iter.get(), Key1(kMinKey), kTypeValue,
+                         "value" + std::to_string(i), write_ts_list[i]);
+      iter->Prev();
+      ASSERT_FALSE(iter->Valid());
+    }
+    const std::vector<uint64_t> targets = {kMinKey, kMinKey + 0x10,
+                                           kMinKey + 0x100, kMaxKey};
+    const SliceTransform* const pe = options.prefix_extractor.get();
+    ASSERT_NE(nullptr, pe);
+    const size_t kPrefixShift =
+        8 * (Key1(0).size() - pe->Transform(Key1(0)).size());
+    const uint64_t kPrefixMask =
+        ~((static_cast<uint64_t>(1) << kPrefixShift) - 1);
+    const uint64_t kNumKeysWithinPrefix =
+        (static_cast<uint64_t>(1) << kPrefixShift);
+    for (size_t i = 0; i != read_ts_list.size(); ++i) {
+      Slice read_ts = read_ts_list[i];
+      read_opts.timestamp = &read_ts;
+      std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+      // Forward and backward iterate.
+      for (size_t j = 0; j != targets.size(); ++j) {
+        std::string start_key = Key1(targets[j]);
+        uint64_t expected_ub =
+            (targets[j] & kPrefixMask) - 1 + kNumKeysWithinPrefix;
+        uint64_t expected_key = targets[j];
+        size_t count = 0;
+        it->Seek(Key1(targets[j]));
+        while (it->Valid()) {
+          std::string saved_prev_key;
+          saved_prev_key.assign(it->key().data(), it->key().size());
+
+          // Out of prefix
+          if (!read_opts.prefix_same_as_start &&
+              pe->Transform(saved_prev_key) != pe->Transform(start_key)) {
+            break;
+          }
+          CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue,
+                             "value" + std::to_string(i), write_ts_list[i]);
+          ++count;
+          ++expected_key;
+          it->Next();
+        }
+        ASSERT_EQ(expected_ub - targets[j] + 1, count);
+
+        count = 0;
+        expected_key = targets[j];
+        it->SeekForPrev(start_key);
+        uint64_t expected_lb = (targets[j] & kPrefixMask);
+        while (it->Valid()) {
+          // Out of prefix
+          if (!read_opts.prefix_same_as_start &&
+              pe->Transform(it->key()) != pe->Transform(start_key)) {
+            break;
+          }
+          CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue,
+                             "value" + std::to_string(i), write_ts_list[i]);
+          ++count;
+          --expected_key;
+          it->Prev();
+        }
+        ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count);
+      }
+    }
+  }
+  Close();
+}
+
+// TODO(yanqin): consider handling non-fixed-length prefix extractors, e.g.
+// NoopTransform.
+INSTANTIATE_TEST_CASE_P(
+    Timestamp, DBBasicTestWithTimestampPrefixSeek,
+    ::testing::Combine(
+        ::testing::Values(
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+        ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+                          std::shared_ptr<const FilterPolicy>(
+                              NewBloomFilterPolicy(10 /*bits_per_key*/, false)),
+                          std::shared_ptr<const FilterPolicy>(
+                              NewBloomFilterPolicy(20 /*bits_per_key*/,
+                                                   false))),
+        ::testing::Bool(),
+        ::testing::Values(
+            BlockBasedTableOptions::IndexType::kBinarySearch,
+            BlockBasedTableOptions::IndexType::kHashSearch,
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+
+class DBBasicTestWithTsIterTombstones
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<
+          std::tuple<std::shared_ptr<const SliceTransform>,
+                     std::shared_ptr<const FilterPolicy>, int,
+                     BlockBasedTableOptions::IndexType>> {
+ public:
+  DBBasicTestWithTsIterTombstones()
+      : DBBasicTestWithTimestampBase("/db_basic_ts_iter_tombstones") {}
+};
+
+TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) {
+  constexpr size_t kNumKeysPerFile = 128;
+  Options options = CurrentOptions();
+  options.env = env_;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.prefix_extractor = std::get<0>(GetParam());
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy = std::get<1>(GetParam());
+  bbto.index_type = std::get<3>(GetParam());
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.num_levels = std::get<2>(GetParam());
+  DestroyAndReopen(options);
+  std::vector<std::string> write_ts_strs = {Timestamp(2, 0), Timestamp(4, 0)};
+  constexpr uint64_t kMaxKey = 0xffffffffffffffff;
+  constexpr uint64_t kMinKey = 0xfffffffffffff000;
+  // Insert kMinKey...kMaxKey
+  uint64_t key = kMinKey;
+  WriteOptions write_opts;
+  Slice ts = write_ts_strs[0];
+  do {
+    Status s = db_->Put(write_opts, Key1(key), write_ts_strs[0],
+                        "value" + std::to_string(key));
+    ASSERT_OK(s);
+    if (kMaxKey == key) {
+      break;
+    }
+    ++key;
+  } while (true);
+
+  for (key = kMaxKey; key >= kMinKey; --key) {
+    Status s;
+    if (0 != (key % 2)) {
+      s = db_->Put(write_opts, Key1(key), write_ts_strs[1],
+                   "value1" + std::to_string(key));
+    } else {
+      s = db_->Delete(write_opts, Key1(key), write_ts_strs[1]);
+    }
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  {
+    std::string read_ts = Timestamp(4, 0);
+    ts = read_ts;
+    ReadOptions read_opts;
+    read_opts.total_order_seek = true;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    size_t count = 0;
+    key = kMinKey + 1;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++count, key += 2) {
+      ASSERT_EQ(Key1(key), iter->key());
+      ASSERT_EQ("value1" + std::to_string(key), iter->value());
+    }
+    ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
+
+    for (iter->SeekToLast(), count = 0, key = kMaxKey; iter->Valid();
+         key -= 2, ++count, iter->Prev()) {
+      ASSERT_EQ(Key1(key), iter->key());
+      ASSERT_EQ("value1" + std::to_string(key), iter->value());
+    }
+    ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
+  }
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    Timestamp, DBBasicTestWithTsIterTombstones,
+    ::testing::Combine(
+        ::testing::Values(
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+            std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+        ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+                          std::shared_ptr<const FilterPolicy>(
+                              NewBloomFilterPolicy(10, false)),
+                          std::shared_ptr<const FilterPolicy>(
+                              NewBloomFilterPolicy(20, false))),
+        ::testing::Values(2, 6),
+        ::testing::Values(
+            BlockBasedTableOptions::IndexType::kBinarySearch,
+            BlockBasedTableOptions::IndexType::kHashSearch,
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class UpdateFullHistoryTsLowTest : public DBBasicTestWithTimestampBase {
+ public:
+  UpdateFullHistoryTsLowTest()
+      : DBBasicTestWithTimestampBase("/update_full_history_ts_low_test") {}
+};
+
+TEST_F(UpdateFullHistoryTsLowTest, ConcurrentUpdate) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  std::string lower_ts_low = Timestamp(10, 0);
+  std::string higher_ts_low = Timestamp(25, 0);
+  const size_t kTimestampSize = lower_ts_low.size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  // This workaround swaps `lower_ts_low` originally used for update by the
+  // caller to `higher_ts_low` after its writer is queued to make sure
+  // the caller will always get a TryAgain error.
+  // It mimics cases where two threads update full_history_ts_low concurrently
+  // with one thread writing a higher ts_low and one thread writing a lower
+  // ts_low.
+  VersionEdit* version_edit;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
+      [&](void* arg) { version_edit = reinterpret_cast<VersionEdit*>(arg); });
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:BeforeWriterWaiting",
+      [&](void* /*arg*/) { version_edit->SetFullHistoryTsLow(higher_ts_low); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_TRUE(
+      db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), lower_ts_low)
+          .IsTryAgain());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp,
+       GCPreserveRangeTombstoneWhenNoOrSmallFullHistoryLow) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+
+  std::string ts_str = Timestamp(1, 0);
+  WriteOptions wopts;
+  ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1"));
+  ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2"));
+  ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3"));
+  ts_str = Timestamp(2, 0);
+  ASSERT_OK(
+      db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3", ts_str));
+
+  ts_str = Timestamp(3, 0);
+  Slice ts = ts_str;
+  ReadOptions ropts;
+  ropts.timestamp = &ts;
+  CompactRangeOptions cro;
+  cro.full_history_ts_low = nullptr;
+  std::string value, key_ts;
+  Status s;
+  auto verify = [&] {
+    s = db_->Get(ropts, "k1", &value);
+    ASSERT_TRUE(s.IsNotFound());
+
+    s = db_->Get(ropts, "k2", &value, &key_ts);
+    ASSERT_TRUE(s.IsNotFound());
+    ASSERT_EQ(key_ts, Timestamp(2, 0));
+
+    ASSERT_OK(db_->Get(ropts, "k3", &value, &key_ts));
+    ASSERT_EQ(value, "v3");
+    ASSERT_EQ(Timestamp(1, 0), key_ts);
+
+    size_t batch_size = 3;
+    std::vector<std::string> key_strs = {"k1", "k2", "k3"};
+    std::vector<Slice> keys{key_strs.begin(), key_strs.end()};
+    std::vector<PinnableSlice> values(batch_size);
+    std::vector<Status> statuses(batch_size);
+    db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+                  values.data(), statuses.data(), true /* sorted_input */);
+    ASSERT_TRUE(statuses[0].IsNotFound());
+    ASSERT_TRUE(statuses[1].IsNotFound());
+    ASSERT_OK(statuses[2]);
+    ;
+    ASSERT_EQ(values[2], "v3");
+  };
+  verify();
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  verify();
+  std::string lb = Timestamp(0, 0);
+  Slice lb_slice = lb;
+  cro.full_history_ts_low = &lb_slice;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  verify();
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp,
+       GCRangeTombstonesAndCoveredKeysRespectingTslow) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.cache_index_and_filter_blocks = true;
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+
+  WriteOptions wopts;
+  ASSERT_OK(db_->Put(wopts, "k1", Timestamp(1, 0), "v1"));
+  ASSERT_OK(db_->Delete(wopts, "k2", Timestamp(2, 0)));
+  ASSERT_OK(db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3",
+                             Timestamp(3, 0)));
+  ASSERT_OK(db_->Put(wopts, "k3", Timestamp(4, 0), "v3"));
+
+  ReadOptions ropts;
+  std::string read_ts = Timestamp(5, 0);
+  Slice read_ts_slice = read_ts;
+  ropts.timestamp = &read_ts_slice;
+  size_t batch_size = 3;
+  std::vector<std::string> key_strs = {"k1", "k2", "k3"};
+  std::vector<Slice> keys = {key_strs.begin(), key_strs.end()};
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  std::vector<std::string> timestamps(batch_size);
+  db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+                values.data(), timestamps.data(), statuses.data(),
+                true /* sorted_input */);
+  ASSERT_TRUE(statuses[0].IsNotFound());
+  ASSERT_EQ(timestamps[0], Timestamp(3, 0));
+  ASSERT_TRUE(statuses[1].IsNotFound());
+  // DeleteRange has a higher timestamp than Delete for "k2"
+  ASSERT_EQ(timestamps[1], Timestamp(3, 0));
+  ASSERT_OK(statuses[2]);
+  ASSERT_EQ(values[2], "v3");
+  ASSERT_EQ(timestamps[2], Timestamp(4, 0));
+
+  CompactRangeOptions cro;
+  // Range tombstone has timestamp >= full_history_ts_low, covered keys
+  // are not dropped.
+  std::string compaction_ts_str = Timestamp(2, 0);
+  Slice compaction_ts = compaction_ts_str;
+  cro.full_history_ts_low = &compaction_ts;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ropts.timestamp = &compaction_ts;
+  std::string value, ts;
+  ASSERT_OK(db_->Get(ropts, "k1", &value, &ts));
+  ASSERT_EQ(value, "v1");
+  // timestamp is below full_history_ts_low, zeroed out as the key goes into
+  // bottommost level
+  ASSERT_EQ(ts, Timestamp(0, 0));
+  ASSERT_TRUE(db_->Get(ropts, "k2", &value, &ts).IsNotFound());
+  ASSERT_EQ(ts, Timestamp(2, 0));
+
+  compaction_ts_str = Timestamp(4, 0);
+  compaction_ts = compaction_ts_str;
+  cro.full_history_ts_low = &compaction_ts;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ropts.timestamp = &read_ts_slice;
+  // k1, k2 and the range tombstone should be dropped
+  // k3 should still exist
+  db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+                values.data(), timestamps.data(), statuses.data(),
+                true /* sorted_input */);
+  ASSERT_TRUE(statuses[0].IsNotFound());
+  ASSERT_TRUE(timestamps[0].empty());
+  ASSERT_TRUE(statuses[1].IsNotFound());
+  ASSERT_TRUE(timestamps[1].empty());
+  ASSERT_OK(statuses[2]);
+  ASSERT_EQ(values[2], "v3");
+  ASSERT_EQ(timestamps[2], Timestamp(4, 0));
+
+  Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBaiscReadAndIterate) {
+  const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  options.compression = kNoCompression;
+  BlockBasedTableOptions bbto;
+  bbto.index_type = GetParam();
+  bbto.block_size = 100;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.env = env_;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  DestroyAndReopen(options);
+
+  // Write half of the keys before the tombstone and half after the tombstone.
+  // Only covered keys (i.e., within the range and older than the tombstone)
+  // should be deleted.
+  for (int i = 0; i < kNum; ++i) {
+    if (i == kNum / 2) {
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 Key1(kRangeBegin), Key1(kRangeEnd),
+                                 Timestamp(i, 0)));
+    }
+    ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0),
+                       "val" + std::to_string(i)));
+    if (i == kNum - kNumPerFile) {
+      ASSERT_OK(Flush());
+    }
+  }
+
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  std::string read_ts = Timestamp(kNum, 0);
+  Slice read_ts_slice = read_ts;
+  read_opts.timestamp = &read_ts_slice;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    ASSERT_OK(iter->status());
+
+    int expected = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_EQ(Key1(expected), iter->key());
+      if (expected == kRangeBegin - 1) {
+        expected = kNum / 2;
+      } else {
+        ++expected;
+      }
+    }
+    ASSERT_EQ(kNum, expected);
+
+    expected = kNum / 2;
+    for (iter->Seek(Key1(kNum / 2)); iter->Valid(); iter->Next()) {
+      ASSERT_EQ(Key1(expected), iter->key());
+      ++expected;
+    }
+    ASSERT_EQ(kNum, expected);
+
+    expected = kRangeBegin - 1;
+    for (iter->SeekForPrev(Key1(kNum / 2 - 1)); iter->Valid(); iter->Prev()) {
+      ASSERT_EQ(Key1(expected), iter->key());
+      --expected;
+    }
+    ASSERT_EQ(-1, expected);
+
+    read_ts = Timestamp(0, 0);
+    read_ts_slice = read_ts;
+    read_opts.timestamp = &read_ts_slice;
+    iter.reset(db_->NewIterator(read_opts));
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key1(0));
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+
+  read_ts = Timestamp(kNum, 0);
+  read_ts_slice = read_ts;
+  read_opts.timestamp = &read_ts_slice;
+  std::string value, timestamp;
+  Status s;
+  for (int i = 0; i < kNum; ++i) {
+    s = db_->Get(read_opts, Key1(i), &value, &timestamp);
+    if (i >= kRangeBegin && i < kNum / 2) {
+      ASSERT_TRUE(s.IsNotFound());
+      ASSERT_EQ(timestamp, Timestamp(kNum / 2, 0));
+    } else {
+      ASSERT_OK(s);
+      ASSERT_EQ(value, "val" + std::to_string(i));
+      ASSERT_EQ(timestamp, Timestamp(i, 0));
+    }
+  }
+
+  size_t batch_size = kNum;
+  std::vector<std::string> key_strs(batch_size);
+  std::vector<Slice> keys(batch_size);
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  std::vector<std::string> timestamps(batch_size);
+  for (int i = 0; i < kNum; ++i) {
+    key_strs[i] = Key1(i);
+    keys[i] = key_strs[i];
+  }
+  db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+                values.data(), timestamps.data(), statuses.data(),
+                true /* sorted_input */);
+  for (int i = 0; i < kNum; ++i) {
+    if (i >= kRangeBegin && i < kNum / 2) {
+      ASSERT_TRUE(statuses[i].IsNotFound());
+      ASSERT_EQ(timestamps[i], Timestamp(kNum / 2, 0));
+    } else {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ(values[i], "val" + std::to_string(i));
+      ASSERT_EQ(timestamps[i], Timestamp(i, 0));
+    }
+  }
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, DeleteRangeGetIteratorWithSnapshot) {
+  // 4 keys 0, 1, 2, 3 at timestamps 0, 1, 2, 3 respectively.
+  // A range tombstone [1, 3) at timestamp 1 and has a sequence number between
+  // key 1 and 2.
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  std::string put_ts = Timestamp(0, 0);
+  const int kNum = 4, kNumPerFile = 1, kRangeBegin = 1, kRangeEnd = 3;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+  const Snapshot* before_tombstone = nullptr;
+  const Snapshot* after_tombstone = nullptr;
+  for (int i = 0; i < kNum; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0),
+                       "val" + std::to_string(i)));
+    if (i == kRangeBegin) {
+      before_tombstone = db_->GetSnapshot();
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 Key1(kRangeBegin), Key1(kRangeEnd),
+                                 Timestamp(kRangeBegin, 0)));
+    }
+    if (i == kNum / 2) {
+      ASSERT_OK(Flush());
+    }
+  }
+  assert(before_tombstone);
+  after_tombstone = db_->GetSnapshot();
+  // snapshot and ts before tombstone
+  std::string read_ts_str = Timestamp(kRangeBegin - 1, 0);  // (0, 0)
+  Slice read_ts = read_ts_str;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  read_opts.snapshot = before_tombstone;
+  std::vector<Status> expected_status = {
+      Status::OK(), Status::NotFound(), Status::NotFound(), Status::NotFound()};
+  std::vector<std::string> expected_values(kNum);
+  expected_values[0] = "val" + std::to_string(0);
+  std::vector<std::string> expected_timestamps(kNum);
+  expected_timestamps[0] = Timestamp(0, 0);
+
+  size_t batch_size = kNum;
+  std::vector<std::string> key_strs(batch_size);
+  std::vector<Slice> keys(batch_size);
+  std::vector<PinnableSlice> values(batch_size);
+  std::vector<Status> statuses(batch_size);
+  std::vector<std::string> timestamps(batch_size);
+  for (int i = 0; i < kNum; ++i) {
+    key_strs[i] = Key1(i);
+    keys[i] = key_strs[i];
+  }
+
+  auto verify = [&] {
+    db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size,
+                  keys.data(), values.data(), timestamps.data(),
+                  statuses.data(), true /* sorted_input */);
+    std::string value, timestamp;
+    Status s;
+    for (int i = 0; i < kNum; ++i) {
+      s = db_->Get(read_opts, Key1(i), &value, &timestamp);
+      ASSERT_EQ(s, expected_status[i]);
+      ASSERT_EQ(statuses[i], expected_status[i]);
+      if (s.ok()) {
+        ASSERT_EQ(value, expected_values[i]);
+        ASSERT_EQ(values[i], expected_values[i]);
+      }
+      if (!timestamp.empty()) {
+        ASSERT_EQ(timestamp, expected_timestamps[i]);
+        ASSERT_EQ(timestamps[i], expected_timestamps[i]);
+      } else {
+        ASSERT_TRUE(timestamps[i].empty());
+      }
+    }
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    std::unique_ptr<Iterator> iter_for_seek(db_->NewIterator(read_opts));
+    iter->SeekToFirst();
+    for (int i = 0; i < kNum; ++i) {
+      if (expected_status[i].ok()) {
+        auto verify_iter = [&](Iterator* iter_ptr) {
+          ASSERT_TRUE(iter_ptr->Valid());
+          ASSERT_EQ(iter_ptr->key(), keys[i]);
+          ASSERT_EQ(iter_ptr->value(), expected_values[i]);
+          ASSERT_EQ(iter_ptr->timestamp(), expected_timestamps[i]);
+        };
+        verify_iter(iter.get());
+        iter->Next();
+
+        iter_for_seek->Seek(keys[i]);
+        verify_iter(iter_for_seek.get());
+
+        iter_for_seek->SeekForPrev(keys[i]);
+        verify_iter(iter_for_seek.get());
+      }
+    }
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  };
+
+  verify();
+
+  // snapshot before tombstone and ts after tombstone
+  read_ts_str = Timestamp(kNum, 0);  // (4, 0)
+  read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  read_opts.snapshot = before_tombstone;
+  expected_status[1] = Status::OK();
+  expected_timestamps[1] = Timestamp(1, 0);
+  expected_values[1] = "val" + std::to_string(1);
+  verify();
+
+  // snapshot after tombstone and ts before tombstone
+  read_ts_str = Timestamp(kRangeBegin - 1, 0);  // (0, 0)
+  read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  read_opts.snapshot = after_tombstone;
+  expected_status[1] = Status::NotFound();
+  expected_timestamps[1].clear();
+  expected_values[1].clear();
+  verify();
+
+  // snapshot and ts after tombstone
+  read_ts_str = Timestamp(kNum, 0);  // (4, 0)
+  read_ts = read_ts_str;
+  read_opts.timestamp = &read_ts;
+  read_opts.snapshot = after_tombstone;
+  for (int i = 0; i < kNum; ++i) {
+    if (i == kRangeBegin) {
+      expected_status[i] = Status::NotFound();
+      expected_values[i].clear();
+    } else {
+      expected_status[i] = Status::OK();
+      expected_values[i] = "val" + std::to_string(i);
+    }
+    expected_timestamps[i] = Timestamp(i, 0);
+  }
+  verify();
+
+  db_->ReleaseSnapshot(before_tombstone);
+  db_->ReleaseSnapshot(after_tombstone);
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MergeBasic) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.merge_operator = std::make_shared<StringAppendTESTOperator>('.');
+  DestroyAndReopen(options);
+
+  const std::array<std::string, 3> write_ts_strs = {
+      Timestamp(100, 0), Timestamp(200, 0), Timestamp(300, 0)};
+  constexpr size_t kNumOfUniqKeys = 100;
+  ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+
+  for (size_t i = 0; i < write_ts_strs.size(); ++i) {
+    for (size_t j = 0; j < kNumOfUniqKeys; ++j) {
+      Status s;
+      if (i == 0) {
+        const std::string val = "v" + std::to_string(j) + "_0";
+        s = db_->Put(WriteOptions(), Key1(j), write_ts_strs[i], val);
+      } else {
+        const std::string merge_op = std::to_string(i);
+        s = db_->Merge(WriteOptions(), default_cf, Key1(j), write_ts_strs[i],
+                       merge_op);
+      }
+      ASSERT_OK(s);
+    }
+  }
+
+  std::array<std::string, 3> read_ts_strs = {
+      Timestamp(150, 0), Timestamp(250, 0), Timestamp(350, 0)};
+
+  const auto verify_db_with_get = [&]() {
+    for (size_t i = 0; i < kNumOfUniqKeys; ++i) {
+      const std::string base_val = "v" + std::to_string(i) + "_0";
+      const std::array<std::string, 3> expected_values = {
+          base_val, base_val + ".1", base_val + ".1.2"};
+      const std::array<std::string, 3>& expected_ts = write_ts_strs;
+      ReadOptions read_opts;
+      for (size_t j = 0; j < read_ts_strs.size(); ++j) {
+        Slice read_ts = read_ts_strs[j];
+        read_opts.timestamp = &read_ts;
+        std::string value;
+        std::string ts;
+        const Status s = db_->Get(read_opts, Key1(i), &value, &ts);
+        ASSERT_OK(s);
+        ASSERT_EQ(expected_values[j], value);
+        ASSERT_EQ(expected_ts[j], ts);
+
+        // Do Seek/SeekForPrev
+        std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+        it->Seek(Key1(i));
+        ASSERT_TRUE(it->Valid());
+        ASSERT_EQ(expected_values[j], it->value());
+        ASSERT_EQ(expected_ts[j], it->timestamp());
+
+        it->SeekForPrev(Key1(i));
+        ASSERT_TRUE(it->Valid());
+        ASSERT_EQ(expected_values[j], it->value());
+        ASSERT_EQ(expected_ts[j], it->timestamp());
+      }
+    }
+  };
+
+  const auto verify_db_with_iterator = [&]() {
+    std::string value_suffix;
+    for (size_t i = 0; i < read_ts_strs.size(); ++i) {
+      ReadOptions read_opts;
+      Slice read_ts = read_ts_strs[i];
+      read_opts.timestamp = &read_ts;
+      std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+      size_t key_int_val = 0;
+      for (it->SeekToFirst(); it->Valid(); it->Next(), ++key_int_val) {
+        const std::string key = Key1(key_int_val);
+        const std::string value =
+            "v" + std::to_string(key_int_val) + "_0" + value_suffix;
+        ASSERT_EQ(key, it->key());
+        ASSERT_EQ(value, it->value());
+        ASSERT_EQ(write_ts_strs[i], it->timestamp());
+      }
+      ASSERT_EQ(kNumOfUniqKeys, key_int_val);
+
+      key_int_val = kNumOfUniqKeys - 1;
+      for (it->SeekToLast(); it->Valid(); it->Prev(), --key_int_val) {
+        const std::string key = Key1(key_int_val);
+        const std::string value =
+            "v" + std::to_string(key_int_val) + "_0" + value_suffix;
+        ASSERT_EQ(key, it->key());
+        ASSERT_EQ(value, it->value());
+        ASSERT_EQ(write_ts_strs[i], it->timestamp());
+      }
+      ASSERT_EQ(std::numeric_limits<size_t>::max(), key_int_val);
+
+      value_suffix = value_suffix + "." + std::to_string(i + 1);
+    }
+  };
+
+  verify_db_with_get();
+  verify_db_with_iterator();
+
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  verify_db_with_get();
+  verify_db_with_iterator();
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MergeAfterDeletion) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.merge_operator = std::make_shared<StringAppendTESTOperator>('.');
+  DestroyAndReopen(options);
+
+  ColumnFamilyHandle* const column_family = db_->DefaultColumnFamily();
+
+  const size_t num_keys_per_file = 10;
+  const size_t num_merges_per_key = 2;
+  for (size_t i = 0; i < num_keys_per_file; ++i) {
+    std::string ts = Timestamp(i + 10000, 0);
+    Status s = db_->Delete(WriteOptions(), Key1(i), ts);
+    ASSERT_OK(s);
+    for (size_t j = 1; j <= num_merges_per_key; ++j) {
+      ts = Timestamp(i + 10000 + j, 0);
+      s = db_->Merge(WriteOptions(), column_family, Key1(i), ts,
+                     std::to_string(j));
+      ASSERT_OK(s);
+    }
+  }
+
+  const auto verify_db = [&]() {
+    ReadOptions read_opts;
+    std::string read_ts_str = Timestamp(20000, 0);
+    Slice ts = read_ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    size_t count = 0;
+    for (it->SeekToFirst(); it->Valid(); it->Next(), ++count) {
+      std::string key = Key1(count);
+      ASSERT_EQ(key, it->key());
+      std::string value;
+      for (size_t j = 1; j <= num_merges_per_key; ++j) {
+        value.append(std::to_string(j));
+        if (j < num_merges_per_key) {
+          value.push_back('.');
+        }
+      }
+      ASSERT_EQ(value, it->value());
+      std::string ts1 = Timestamp(count + 10000 + num_merges_per_key, 0);
+      ASSERT_EQ(ts1, it->timestamp());
+    }
+    ASSERT_OK(it->status());
+    ASSERT_EQ(num_keys_per_file, count);
+    for (it->SeekToLast(); it->Valid(); it->Prev(), --count) {
+      std::string key = Key1(count - 1);
+      ASSERT_EQ(key, it->key());
+      std::string value;
+      for (size_t j = 1; j <= num_merges_per_key; ++j) {
+        value.append(std::to_string(j));
+        if (j < num_merges_per_key) {
+          value.push_back('.');
+        }
+      }
+      ASSERT_EQ(value, it->value());
+      std::string ts1 = Timestamp(count - 1 + 10000 + num_merges_per_key, 0);
+      ASSERT_EQ(ts1, it->timestamp());
+    }
+    ASSERT_OK(it->status());
+    ASSERT_EQ(0, count);
+  };
+
+  verify_db();
+
+  Close();
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_with_timestamp_compaction_test.cc b/src/rocksdb/db/db_with_timestamp_compaction_test.cc
new file mode 100644
index 000000000..d28f67e05
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_compaction_test.cc
@@ -0,0 +1,334 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+std::string Key1(uint64_t key) {
+  std::string ret;
+  PutFixed64(&ret, key);
+  std::reverse(ret.begin(), ret.end());
+  return ret;
+}
+
+std::string Timestamp(uint64_t ts) {
+  std::string ret;
+  PutFixed64(&ret, ts);
+  return ret;
+}
+}  // anonymous namespace
+
+class TimestampCompatibleCompactionTest : public DBTestBase {
+ public:
+  TimestampCompatibleCompactionTest()
+      : DBTestBase("ts_compatible_compaction_test", /*env_do_fsync=*/true) {}
+
+  std::string Get(const std::string& key, uint64_t ts) {
+    ReadOptions read_opts;
+    std::string ts_str = Timestamp(ts);
+    Slice ts_slice = ts_str;
+    read_opts.timestamp = &ts_slice;
+    std::string value;
+    Status s = db_->Get(read_opts, key, &value);
+    if (s.IsNotFound()) {
+      value.assign("NOT_FOUND");
+    } else if (!s.ok()) {
+      value.assign(s.ToString());
+    }
+    return value;
+  }
+};
+
+TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.compaction_style = kCompactionStyleLevel;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options.level0_file_num_compaction_trigger = 3;
+  constexpr size_t kNumKeysPerFile = 101;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kNumKeysPerFile));
+  DestroyAndReopen(options);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        const auto* compaction = reinterpret_cast<Compaction*>(arg);
+        ASSERT_NE(nullptr, compaction);
+        ASSERT_EQ(0, compaction->start_level());
+        ASSERT_EQ(1, compaction->num_input_levels());
+        // Check that all 3 L0 ssts are picked for level compaction.
+        ASSERT_EQ(3, compaction->num_input_files(0));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  // Write a L0 with keys 0, 1, ..., 99 with ts from 100 to 199.
+  uint64_t ts = 100;
+  uint64_t key = 0;
+  WriteOptions write_opts;
+  for (; key < kNumKeysPerFile - 1; ++key, ++ts) {
+    std::string ts_str = Timestamp(ts);
+    ASSERT_OK(
+        db_->Put(write_opts, Key1(key), ts_str, "foo_" + std::to_string(key)));
+  }
+  // Write another L0 with keys 99 with newer ts.
+  ASSERT_OK(Flush());
+  uint64_t saved_read_ts1 = ts++;
+  key = 99;
+  for (int i = 0; i < 4; ++i, ++ts) {
+    std::string ts_str = Timestamp(ts);
+    ASSERT_OK(
+        db_->Put(write_opts, Key1(key), ts_str, "bar_" + std::to_string(key)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t saved_read_ts2 = ts++;
+  // Write another L0 with keys 99, 100, 101, ..., 150
+  for (; key <= 150; ++key, ++ts) {
+    std::string ts_str = Timestamp(ts);
+    ASSERT_OK(
+        db_->Put(write_opts, Key1(key), ts_str, "foo1_" + std::to_string(key)));
+  }
+  ASSERT_OK(Flush());
+  // Wait for compaction to finish
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  uint64_t read_ts = ts;
+  ASSERT_EQ("foo_99", Get(Key1(99), saved_read_ts1));
+  ASSERT_EQ("bar_99", Get(Key1(99), saved_read_ts2));
+  ASSERT_EQ("foo1_99", Get(Key1(99), read_ts));
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(TimestampCompatibleCompactionTest, MultipleSubCompactions) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_subcompactions = 3;
+  options.target_file_size_base = 1024;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  uint64_t ts = 100;
+  uint64_t key = 0;
+  WriteOptions write_opts;
+
+  // Write keys 0, 1, ..., 499 with ts from 100 to 599.
+  {
+    for (; key <= 499; ++key, ++ts) {
+      std::string ts_str = Timestamp(ts);
+      ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str,
+                         "foo_" + std::to_string(key)));
+    }
+  }
+
+  // Write keys 500, ..., 999 with ts from 600 to 1099.
+  {
+    for (; key <= 999; ++key, ++ts) {
+      std::string ts_str = Timestamp(ts);
+      ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str,
+                         "foo_" + std::to_string(key)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for compaction to finish
+  {
+    ASSERT_OK(dbfull()->RunManualCompaction(
+        static_cast_with_check<ColumnFamilyHandleImpl>(
+            db_->DefaultColumnFamily())
+            ->cfd(),
+        0 /* input_level */, 1 /* output_level */, CompactRangeOptions(),
+        nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+        true /* disallow_trivial_move */,
+        std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+        "" /*trim_ts*/));
+  }
+
+  // Check stats to make sure multiple subcompactions were scheduled for
+  // boundaries not to be nullptr.
+  {
+    HistogramData num_sub_compactions;
+    options.statistics->histogramData(NUM_SUBCOMPACTIONS_SCHEDULED,
+                                      &num_sub_compactions);
+    ASSERT_GT(num_sub_compactions.sum, 1);
+  }
+
+  for (key = 0; key <= 999; ++key) {
+    ASSERT_EQ("foo_" + std::to_string(key), Get(Key1(key), ts));
+  }
+}
+
+class TestFilePartitioner : public SstPartitioner {
+ public:
+  explicit TestFilePartitioner() {}
+  ~TestFilePartitioner() override {}
+
+  const char* Name() const override { return "TestFilePartitioner"; }
+  PartitionerResult ShouldPartition(
+      const PartitionerRequest& /*request*/) override {
+    return PartitionerResult::kRequired;
+  }
+  bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+                        const Slice& /*largest_user_key*/) override {
+    return false;
+  }
+};
+
+class TestFilePartitionerFactory : public SstPartitionerFactory {
+ public:
+  explicit TestFilePartitionerFactory() {}
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /*context*/) const override {
+    std::unique_ptr<SstPartitioner> ret =
+        std::make_unique<TestFilePartitioner>();
+    return ret;
+  }
+  const char* Name() const override { return "TestFilePartitionerFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL0) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.sst_partitioner_factory =
+      std::make_shared<TestFilePartitionerFactory>();
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  constexpr int kNumFiles = 10;
+  constexpr int kKeysPerFile = 2;
+  const std::string user_key = "foo";
+  constexpr uint64_t start_ts = 10000;
+
+  uint64_t cur_ts = start_ts;
+  for (int k = 0; k < kNumFiles; ++k) {
+    for (int i = 0; i < kKeysPerFile; ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts),
+                         "v" + std::to_string(i)));
+      ++cur_ts;
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+
+  std::vector<std::string> input_files{};
+  {
+    std::vector<std::string> files;
+    ASSERT_OK(env_->GetChildren(dbname_, &files));
+    for (const auto& f : files) {
+      uint64_t file_num = 0;
+      FileType file_type = FileType::kWalFile;
+      if (!ParseFileName(f, &file_num, &file_type) ||
+          file_type != FileType::kTableFile) {
+        continue;
+      }
+      input_files.emplace_back(f);
+    }
+    // sorting here by name, which also happens to sort by generation date.
+    std::sort(input_files.begin(), input_files.end());
+    assert(kNumFiles == input_files.size());
+    std::vector<std::string> tmp;
+    tmp.emplace_back(input_files[input_files.size() / 2]);
+    input_files.swap(tmp);
+  }
+
+  {
+    std::vector<std::string> output_file_names;
+    CompactionJobInfo compaction_job_info;
+    ASSERT_OK(db_->CompactFiles(CompactionOptions(), input_files,
+                                /*output_level=*/1, /*output_path_id=*/-1,
+                                &output_file_names, &compaction_job_info));
+    // We expect the L0 files older than the original provided input were all
+    // included in the compaction.
+    ASSERT_EQ(static_cast<size_t>(kNumFiles / 2 + 1),
+              compaction_job_info.input_files.size());
+  }
+}
+
+TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL1) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.sst_partitioner_factory =
+      std::make_shared<TestFilePartitionerFactory>();
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+  constexpr int kNumFiles = 4;
+  options.level0_file_num_compaction_trigger = kNumFiles;
+
+  DestroyAndReopen(options);
+
+  constexpr int kKeysPerFile = 2;
+  const std::string user_key = "foo";
+  constexpr uint64_t start_ts = 10000;
+
+  uint64_t cur_ts = start_ts;
+  // Generate some initial files in both L0 and L1.
+  for (int k = 0; k < kNumFiles; ++k) {
+    for (int i = 0; i < kKeysPerFile; ++i) {
+      ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts),
+                         "v" + std::to_string(i)));
+      ++cur_ts;
+    }
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0, /*cf=*/0));
+  ASSERT_EQ(kNumFiles * kKeysPerFile,
+            NumTableFilesAtLevel(/*level=*/1, /*cf=*/0));
+
+  constexpr int additional_l0s = 2;
+  for (int i = 0; i < additional_l0s; ++i, ++cur_ts) {
+    ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts), "v"));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+  }
+  ASSERT_EQ(additional_l0s, NumTableFilesAtLevel(/*level=*/0, /*cf=*/0));
+
+  std::vector<std::string> inputs;
+  {
+    std::vector<LiveFileMetaData> fmetas;
+    db_->GetLiveFilesMetaData(&fmetas);
+    bool included_one_l1 = false;
+    for (const auto& meta : fmetas) {
+      if (meta.level == 0) {
+        inputs.emplace_back(meta.relative_filename);
+      } else if (!included_one_l1) {
+        inputs.emplace_back(meta.relative_filename);
+        included_one_l1 = true;
+      }
+    }
+  }
+  ASSERT_EQ(static_cast<size_t>(3), inputs.size());
+  {
+    std::vector<std::string> output_file_names;
+    CompactionJobInfo compaction_job_info;
+
+    ASSERT_OK(db_->CompactFiles(CompactionOptions(), inputs, /*output_level=*/1,
+                                /*output_path_id=*/-1, &output_file_names,
+                                &compaction_job_info));
+    ASSERT_EQ(kNumFiles * kKeysPerFile + 2, output_file_names.size());
+    ASSERT_EQ(kNumFiles * kKeysPerFile + 2,
+              static_cast<int>(compaction_job_info.input_files.size()));
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_with_timestamp_test_util.cc b/src/rocksdb/db/db_with_timestamp_test_util.cc
new file mode 100644
index 000000000..f562bcb48
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_test_util.cc
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_with_timestamp_test_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+std::string DBBasicTestWithTimestampBase::Key1(uint64_t k) {
+  std::string ret;
+  PutFixed64(&ret, k);
+  std::reverse(ret.begin(), ret.end());
+  return ret;
+}
+
+std::string DBBasicTestWithTimestampBase::KeyWithPrefix(std::string prefix,
+                                                        uint64_t k) {
+  std::string ret;
+  PutFixed64(&ret, k);
+  std::reverse(ret.begin(), ret.end());
+  return prefix + ret;
+}
+
+std::vector<Slice> DBBasicTestWithTimestampBase::ConvertStrToSlice(
+    std::vector<std::string>& strings) {
+  std::vector<Slice> ret;
+  for (const auto& s : strings) {
+    ret.emplace_back(s);
+  }
+  return ret;
+}
+
+std::string DBBasicTestWithTimestampBase::Timestamp(uint64_t low,
+                                                    uint64_t high) {
+  std::string ts;
+  PutFixed64(&ts, low);
+  PutFixed64(&ts, high);
+  return ts;
+}
+
+void DBBasicTestWithTimestampBase::CheckIterUserEntry(
+    const Iterator* it, const Slice& expected_key,
+    ValueType expected_value_type, const Slice& expected_value,
+    const Slice& expected_ts) const {
+  ASSERT_TRUE(it->Valid());
+  ASSERT_OK(it->status());
+  ASSERT_EQ(expected_key, it->key());
+  if (kTypeValue == expected_value_type) {
+    ASSERT_EQ(expected_value, it->value());
+  }
+  ASSERT_EQ(expected_ts, it->timestamp());
+}
+
+void DBBasicTestWithTimestampBase::CheckIterEntry(
+    const Iterator* it, const Slice& expected_ukey, SequenceNumber expected_seq,
+    ValueType expected_val_type, const Slice& expected_value,
+    const Slice& expected_ts) const {
+  ASSERT_TRUE(it->Valid());
+  ASSERT_OK(it->status());
+  std::string ukey_and_ts;
+  ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
+  ukey_and_ts.append(expected_ts.data(), expected_ts.size());
+  ParsedInternalKey parsed_ikey;
+  ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
+  ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key);
+  ASSERT_EQ(expected_val_type, parsed_ikey.type);
+  ASSERT_EQ(expected_seq, parsed_ikey.sequence);
+  if (expected_val_type == kTypeValue) {
+    ASSERT_EQ(expected_value, it->value());
+  }
+  ASSERT_EQ(expected_ts, it->timestamp());
+}
+
+void DBBasicTestWithTimestampBase::CheckIterEntry(
+    const Iterator* it, const Slice& expected_ukey, ValueType expected_val_type,
+    const Slice& expected_value, const Slice& expected_ts) const {
+  ASSERT_TRUE(it->Valid());
+  ASSERT_OK(it->status());
+  std::string ukey_and_ts;
+  ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
+  ukey_and_ts.append(expected_ts.data(), expected_ts.size());
+
+  ParsedInternalKey parsed_ikey;
+  ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
+  ASSERT_EQ(expected_val_type, parsed_ikey.type);
+  ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key);
+  if (expected_val_type == kTypeValue) {
+    ASSERT_EQ(expected_value, it->value());
+  }
+  ASSERT_EQ(expected_ts, it->timestamp());
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_with_timestamp_test_util.h b/src/rocksdb/db/db_with_timestamp_test_util.h
new file mode 100644
index 000000000..8a0d8e4e3
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_test_util.h
@@ -0,0 +1,126 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBBasicTestWithTimestampBase : public DBTestBase {
+ public:
+  explicit DBBasicTestWithTimestampBase(const std::string& dbname)
+      : DBTestBase(dbname, /*env_do_fsync=*/true) {}
+
+ protected:
+  static std::string Key1(uint64_t k);
+
+  static std::string KeyWithPrefix(std::string prefix, uint64_t k);
+
+  static std::vector<Slice> ConvertStrToSlice(
+      std::vector<std::string>& strings);
+
+  class TestComparator : public Comparator {
+   private:
+    const Comparator* cmp_without_ts_;
+
+   public:
+    explicit TestComparator(size_t ts_sz)
+        : Comparator(ts_sz), cmp_without_ts_(nullptr) {
+      cmp_without_ts_ = BytewiseComparator();
+    }
+
+    const char* Name() const override { return "TestComparator"; }
+
+    void FindShortSuccessor(std::string*) const override {}
+
+    void FindShortestSeparator(std::string*, const Slice&) const override {}
+
+    int Compare(const Slice& a, const Slice& b) const override {
+      int r = CompareWithoutTimestamp(a, b);
+      if (r != 0 || 0 == timestamp_size()) {
+        return r;
+      }
+      return -CompareTimestamp(
+          Slice(a.data() + a.size() - timestamp_size(), timestamp_size()),
+          Slice(b.data() + b.size() - timestamp_size(), timestamp_size()));
+    }
+
+    using Comparator::CompareWithoutTimestamp;
+    int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+                                bool b_has_ts) const override {
+      if (a_has_ts) {
+        assert(a.size() >= timestamp_size());
+      }
+      if (b_has_ts) {
+        assert(b.size() >= timestamp_size());
+      }
+      Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, timestamp_size()) : a;
+      Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, timestamp_size()) : b;
+      return cmp_without_ts_->Compare(lhs, rhs);
+    }
+
+    int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
+      if (!ts1.data() && !ts2.data()) {
+        return 0;
+      } else if (ts1.data() && !ts2.data()) {
+        return 1;
+      } else if (!ts1.data() && ts2.data()) {
+        return -1;
+      }
+      assert(ts1.size() == ts2.size());
+      uint64_t low1 = 0;
+      uint64_t low2 = 0;
+      uint64_t high1 = 0;
+      uint64_t high2 = 0;
+      const size_t kSize = ts1.size();
+      std::unique_ptr<char[]> ts1_buf(new char[kSize]);
+      memcpy(ts1_buf.get(), ts1.data(), ts1.size());
+      std::unique_ptr<char[]> ts2_buf(new char[kSize]);
+      memcpy(ts2_buf.get(), ts2.data(), ts2.size());
+      Slice ts1_copy = Slice(ts1_buf.get(), kSize);
+      Slice ts2_copy = Slice(ts2_buf.get(), kSize);
+      auto* ptr1 = const_cast<Slice*>(&ts1_copy);
+      auto* ptr2 = const_cast<Slice*>(&ts2_copy);
+      if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) ||
+          !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) {
+        assert(false);
+      }
+      if (high1 < high2) {
+        return -1;
+      } else if (high1 > high2) {
+        return 1;
+      }
+      if (low1 < low2) {
+        return -1;
+      } else if (low1 > low2) {
+        return 1;
+      }
+      return 0;
+    }
+  };
+
+  std::string Timestamp(uint64_t low, uint64_t high);
+
+  void CheckIterUserEntry(const Iterator* it, const Slice& expected_key,
+                          ValueType expected_value_type,
+                          const Slice& expected_value,
+                          const Slice& expected_ts) const;
+
+  void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
+                      SequenceNumber expected_seq, ValueType expected_val_type,
+                      const Slice& expected_value,
+                      const Slice& expected_ts) const;
+
+  void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
+                      ValueType expected_val_type, const Slice& expected_value,
+                      const Slice& expected_ts) const;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_write_buffer_manager_test.cc b/src/rocksdb/db/db_write_buffer_manager_test.cc
new file mode 100644
index 000000000..4c31a7824
--- /dev/null
+++ b/src/rocksdb/db/db_write_buffer_manager_test.cc
@@ -0,0 +1,862 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "db/write_thread.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBWriteBufferManagerTest : public DBTestBase,
+                                 public testing::WithParamInterface<bool> {
+ public:
+  DBWriteBufferManagerTest()
+      : DBTestBase("db_write_buffer_manager_test", /*env_do_fsync=*/false) {}
+  bool cost_cache_;
+};
+
+TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  options.write_buffer_size = 500000;  // this is never hit
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  cost_cache_ = GetParam();
+
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, cache, true));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, nullptr, true));
+  }
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  Flush(3);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  Flush(0);
+
+  // Write to "Default", "cf2" and "cf3".
+  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+
+  ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+  // WriteBufferManager::buffer_size_ has exceeded after the previous write is
+  // completed.
+
+  // This make sures write will go through and if stall was in effect, it will
+  // end.
+  ASSERT_OK(Put(0, Key(2), DummyString(1), wo));
+}
+
+// Test Single DB with multiple writer threads get blocked when
+// WriteBufferManager execeeds buffer_size_ and flush is waiting to be
+// finished.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  options.write_buffer_size = 500000;  // this is never hit
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  cost_cache_ = GetParam();
+
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, cache, true));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, nullptr, true));
+  }
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  Flush(3);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  Flush(0);
+
+  // Write to "Default", "cf2" and "cf3". No flush will be triggered.
+  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+
+  ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+  // WriteBufferManager::buffer_size_ has exceeded after the previous write is
+  // completed.
+
+  std::unordered_set<WriteThread::Writer*> w_set;
+  std::vector<port::Thread> threads;
+  int wait_count_db = 0;
+  int num_writers = 4;
+  InstrumentedMutex mutex;
+  InstrumentedCondVar cv(&mutex);
+  std::atomic<int> thread_num(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+        "DBImpl::BackgroundCallFlush:start"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WBMStallInterface::BlockDB", [&](void*) {
+        InstrumentedMutexLock lock(&mutex);
+        wait_count_db++;
+        cv.SignalAll();
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::WriteStall::Wait", [&](void* arg) {
+        InstrumentedMutexLock lock(&mutex);
+        WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+        w_set.insert(w);
+        // Allow the flush to continue if all writer threads are blocked.
+        if (w_set.size() == (unsigned long)num_writers) {
+          TEST_SYNC_POINT(
+              "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  bool s = true;
+
+  std::function<void(int)> writer = [&](int cf) {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    Status tmp = Put(cf, Slice(key), DummyString(1), wo);
+    InstrumentedMutexLock lock(&mutex);
+    s = s && tmp.ok();
+  };
+
+  // Flow:
+  // main_writer thread will write but will be blocked (as Flush will on hold,
+  // buffer_size_ has exceeded, thus will create stall in effect).
+  //  |
+  //  |
+  //  multiple writer threads will be created to write across multiple columns
+  //  and they will be blocked.
+  //  |
+  //  |
+  //  Last writer thread will write and when its blocked it will signal Flush to
+  //  continue to clear the stall.
+
+  threads.emplace_back(writer, 1);
+  // Wait untill first thread (main_writer) writing to DB is blocked and then
+  // create the multiple writers which will be blocked from getting added to the
+  // queue because stall is in effect.
+  {
+    InstrumentedMutexLock lock(&mutex);
+    while (wait_count_db != 1) {
+      cv.Wait();
+    }
+  }
+  for (int i = 0; i < num_writers; i++) {
+    threads.emplace_back(writer, i % 4);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  ASSERT_TRUE(s);
+
+  // Number of DBs blocked.
+  ASSERT_EQ(wait_count_db, 1);
+  // Number of Writer threads blocked.
+  ASSERT_EQ(w_set.size(), num_writers);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test multiple DBs get blocked when WriteBufferManager limit exceeds and flush
+// is waiting to be finished but DBs tries to write meanwhile.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
+  std::vector<std::string> dbnames;
+  std::vector<DB*> dbs;
+  int num_dbs = 3;
+
+  for (int i = 0; i < num_dbs; i++) {
+    dbs.push_back(nullptr);
+    dbnames.push_back(
+        test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
+  }
+
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  options.write_buffer_size = 500000;  // this is never hit
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  cost_cache_ = GetParam();
+
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, cache, true));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, nullptr, true));
+  }
+  CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+  for (int i = 0; i < num_dbs; i++) {
+    ASSERT_OK(DestroyDB(dbnames[i], options));
+    ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
+  }
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  for (int i = 0; i < num_dbs; i++) {
+    ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
+  }
+  // Insert to db_.
+  ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
+
+  // WriteBufferManager Limit exceeded.
+  std::vector<port::Thread> threads;
+  int wait_count_db = 0;
+  InstrumentedMutex mutex;
+  InstrumentedCondVar cv(&mutex);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+        "DBImpl::BackgroundCallFlush:start"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WBMStallInterface::BlockDB", [&](void*) {
+        {
+          InstrumentedMutexLock lock(&mutex);
+          wait_count_db++;
+          cv.Signal();
+          // Since this is the last DB, signal Flush to continue.
+          if (wait_count_db == num_dbs + 1) {
+            TEST_SYNC_POINT(
+                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  bool s = true;
+
+  // Write to DB.
+  std::function<void(DB*)> write_db = [&](DB* db) {
+    Status tmp = db->Put(wo, Key(3), DummyString(1));
+    InstrumentedMutexLock lock(&mutex);
+    s = s && tmp.ok();
+  };
+
+  // Flow:
+  // db_ will write and will be blocked (as Flush will on hold and will create
+  // stall in effect).
+  //  |
+  //  multiple dbs writers will be created to write to that db and they will be
+  //  blocked.
+  //  |
+  //  |
+  //  Last writer will write and when its blocked it will signal Flush to
+  //  continue to clear the stall.
+
+  threads.emplace_back(write_db, db_);
+  // Wait untill first DB is blocked and then create the multiple writers for
+  // different DBs which will be blocked from getting added to the queue because
+  // stall is in effect.
+  {
+    InstrumentedMutexLock lock(&mutex);
+    while (wait_count_db != 1) {
+      cv.Wait();
+    }
+  }
+  for (int i = 0; i < num_dbs; i++) {
+    threads.emplace_back(write_db, dbs[i]);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  ASSERT_TRUE(s);
+  ASSERT_EQ(num_dbs + 1, wait_count_db);
+  // Clean up DBs.
+  for (int i = 0; i < num_dbs; i++) {
+    ASSERT_OK(dbs[i]->Close());
+    ASSERT_OK(DestroyDB(dbnames[i], options));
+    delete dbs[i];
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test multiple threads writing across multiple DBs and multiple columns get
+// blocked when stall by WriteBufferManager is in effect.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
+  std::vector<std::string> dbnames;
+  std::vector<DB*> dbs;
+  int num_dbs = 3;
+
+  for (int i = 0; i < num_dbs; i++) {
+    dbs.push_back(nullptr);
+    dbnames.push_back(
+        test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
+  }
+
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  options.write_buffer_size = 500000;  // this is never hit
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  cost_cache_ = GetParam();
+
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, cache, true));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, nullptr, true));
+  }
+  CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+  for (int i = 0; i < num_dbs; i++) {
+    ASSERT_OK(DestroyDB(dbnames[i], options));
+    ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
+  }
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  for (int i = 0; i < num_dbs; i++) {
+    ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
+  }
+  // Insert to db_.
+  ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
+
+  // WriteBufferManager::buffer_size_ has exceeded after the previous write to
+  // dbs[0] is completed.
+  std::vector<port::Thread> threads;
+  int wait_count_db = 0;
+  InstrumentedMutex mutex;
+  InstrumentedCondVar cv(&mutex);
+  std::unordered_set<WriteThread::Writer*> w_set;
+  std::vector<port::Thread> writer_threads;
+  std::atomic<int> thread_num(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+        "DBImpl::BackgroundCallFlush:start"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WBMStallInterface::BlockDB", [&](void*) {
+        {
+          InstrumentedMutexLock lock(&mutex);
+          wait_count_db++;
+          thread_num.fetch_add(1);
+          cv.Signal();
+          // Allow the flush to continue if all writer threads are blocked.
+          if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
+            TEST_SYNC_POINT(
+                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::WriteStall::Wait", [&](void* arg) {
+        WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+        {
+          InstrumentedMutexLock lock(&mutex);
+          w_set.insert(w);
+          thread_num.fetch_add(1);
+          // Allow the flush continue if all writer threads are blocked.
+          if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
+            TEST_SYNC_POINT(
+                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  bool s1 = true, s2 = true;
+  // Write to multiple columns of db_.
+  std::function<void(int)> write_cf = [&](int cf) {
+    Status tmp = Put(cf, Key(3), DummyString(1), wo);
+    InstrumentedMutexLock lock(&mutex);
+    s1 = s1 && tmp.ok();
+  };
+  // Write to multiple DBs.
+  std::function<void(DB*)> write_db = [&](DB* db) {
+    Status tmp = db->Put(wo, Key(3), DummyString(1));
+    InstrumentedMutexLock lock(&mutex);
+    s2 = s2 && tmp.ok();
+  };
+
+  // Flow:
+  // thread will write to db_ will be blocked (as Flush will on hold,
+  // buffer_size_ has exceeded and will create stall in effect).
+  //  |
+  //  |
+  //  multiple writers threads writing to different DBs and to db_ across
+  //  multiple columns will be created and they will be blocked due to stall.
+  //  |
+  //  |
+  //  Last writer thread will write and when its blocked it will signal Flush to
+  //  continue to clear the stall.
+  threads.emplace_back(write_db, db_);
+  // Wait untill first thread is blocked and then create the multiple writer
+  // threads.
+  {
+    InstrumentedMutexLock lock(&mutex);
+    while (wait_count_db != 1) {
+      cv.Wait();
+    }
+  }
+
+  for (int i = 0; i < num_dbs; i++) {
+    // Write to multiple columns of db_.
+    writer_threads.emplace_back(write_cf, i % 3);
+    // Write to different dbs.
+    threads.emplace_back(write_db, dbs[i]);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  for (auto& t : writer_threads) {
+    t.join();
+  }
+
+  ASSERT_TRUE(s1);
+  ASSERT_TRUE(s2);
+
+  // Number of DBs blocked.
+  ASSERT_EQ(num_dbs + 1, wait_count_db);
+  // Number of Writer threads blocked.
+  ASSERT_EQ(w_set.size(), num_dbs);
+  // Clean up DBs.
+  for (int i = 0; i < num_dbs; i++) {
+    ASSERT_OK(dbs[i]->Close());
+    ASSERT_OK(DestroyDB(dbnames[i], options));
+    delete dbs[i];
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test multiple threads writing across multiple columns of db_ by passing
+// different values to WriteOption.no_slown_down.
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  options.write_buffer_size = 500000;  // this is never hit
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  cost_cache_ = GetParam();
+
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, cache, true));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, nullptr, true));
+  }
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  Flush(3);
+  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+  Flush(0);
+
+  // Write to "Default", "cf2" and "cf3". No flush will be triggered.
+  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+  ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+  ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+
+  // WriteBufferManager::buffer_size_ has exceeded after the previous write to
+  // db_ is completed.
+
+  std::unordered_set<WriteThread::Writer*> w_slowdown_set;
+  std::vector<port::Thread> threads;
+  int wait_count_db = 0;
+  int num_writers = 4;
+  InstrumentedMutex mutex;
+  InstrumentedCondVar cv(&mutex);
+  std::atomic<int> thread_num(0);
+  std::atomic<int> w_no_slowdown(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+        "DBImpl::BackgroundCallFlush:start"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WBMStallInterface::BlockDB", [&](void*) {
+        {
+          InstrumentedMutexLock lock(&mutex);
+          wait_count_db++;
+          cv.SignalAll();
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::WriteStall::Wait", [&](void* arg) {
+        {
+          InstrumentedMutexLock lock(&mutex);
+          WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+          w_slowdown_set.insert(w);
+          // Allow the flush continue if all writer threads are blocked.
+          if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load(
+                                          std::memory_order_relaxed) ==
+              (unsigned long)num_writers) {
+            TEST_SYNC_POINT(
+                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+          }
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  bool s1 = true, s2 = true;
+
+  std::function<void(int)> write_slow_down = [&](int cf) {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions write_op;
+    write_op.no_slowdown = false;
+    Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
+    InstrumentedMutexLock lock(&mutex);
+    s1 = s1 && tmp.ok();
+  };
+
+  std::function<void(int)> write_no_slow_down = [&](int cf) {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions write_op;
+    write_op.no_slowdown = true;
+    Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
+    {
+      InstrumentedMutexLock lock(&mutex);
+      s2 = s2 && !tmp.ok();
+      w_no_slowdown.fetch_add(1);
+      // Allow the flush continue if all writer threads are blocked.
+      if (w_slowdown_set.size() +
+              (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) ==
+          (unsigned long)num_writers) {
+        TEST_SYNC_POINT(
+            "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+      }
+    }
+  };
+
+  // Flow:
+  // main_writer thread will write but will be blocked (as Flush will on hold,
+  // buffer_size_ has exceeded, thus will create stall in effect).
+  //  |
+  //  |
+  //  multiple writer threads will be created to write across multiple columns
+  //  with different values of WriteOptions.no_slowdown. Some of them will
+  //  be blocked and some of them will return with Incomplete status.
+  //  |
+  //  |
+  //  Last writer thread will write and when its blocked/return it will signal
+  //  Flush to continue to clear the stall.
+  threads.emplace_back(write_slow_down, 1);
+  // Wait untill first thread (main_writer) writing to DB is blocked and then
+  // create the multiple writers which will be blocked from getting added to the
+  // queue because stall is in effect.
+  {
+    InstrumentedMutexLock lock(&mutex);
+    while (wait_count_db != 1) {
+      cv.Wait();
+    }
+  }
+
+  for (int i = 0; i < num_writers; i += 2) {
+    threads.emplace_back(write_no_slow_down, (i) % 4);
+    threads.emplace_back(write_slow_down, (i + 1) % 4);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  ASSERT_TRUE(s1);
+  ASSERT_TRUE(s2);
+  // Number of DBs blocked.
+  ASSERT_EQ(wait_count_db, 1);
+  // Number of Writer threads blocked.
+  ASSERT_EQ(w_slowdown_set.size(), num_writers / 2);
+  // Number of Writer threads with WriteOptions.no_slowdown = true.
+  ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test multiple threads writing across multiple columns of db_ and different
+// dbs by passing different values to WriteOption.no_slown_down.
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
+  std::vector<std::string> dbnames;
+  std::vector<DB*> dbs;
+  int num_dbs = 4;
+
+  for (int i = 0; i < num_dbs; i++) {
+    dbs.push_back(nullptr);
+    dbnames.push_back(
+        test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
+  }
+
+  Options options = CurrentOptions();
+  options.arena_block_size = 4096;
+  options.write_buffer_size = 500000;  // this is never hit
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+  ASSERT_LT(cache->GetUsage(), 256 * 1024);
+  cost_cache_ = GetParam();
+
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, cache, true));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(100000, nullptr, true));
+  }
+  CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+  for (int i = 0; i < num_dbs; i++) {
+    ASSERT_OK(DestroyDB(dbnames[i], options));
+    ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
+  }
+  WriteOptions wo;
+  wo.disableWAL = true;
+
+  for (int i = 0; i < num_dbs; i++) {
+    ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
+  }
+  // Insert to db_.
+  ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
+
+  // WriteBufferManager::buffer_size_ has exceeded after the previous write to
+  // dbs[0] is completed.
+  std::vector<port::Thread> threads;
+  int wait_count_db = 0;
+  InstrumentedMutex mutex;
+  InstrumentedCondVar cv(&mutex);
+  std::unordered_set<WriteThread::Writer*> w_slowdown_set;
+  std::vector<port::Thread> writer_threads;
+  std::atomic<int> thread_num(0);
+  std::atomic<int> w_no_slowdown(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+        "DBImpl::BackgroundCallFlush:start"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WBMStallInterface::BlockDB", [&](void*) {
+        InstrumentedMutexLock lock(&mutex);
+        wait_count_db++;
+        cv.Signal();
+        // Allow the flush continue if all writer threads are blocked.
+        if (w_slowdown_set.size() +
+                (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
+                                wait_count_db) ==
+            (unsigned long)(2 * num_dbs + 1)) {
+          TEST_SYNC_POINT(
+              "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::WriteStall::Wait", [&](void* arg) {
+        WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+        InstrumentedMutexLock lock(&mutex);
+        w_slowdown_set.insert(w);
+        // Allow the flush continue if all writer threads are blocked.
+        if (w_slowdown_set.size() +
+                (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
+                                wait_count_db) ==
+            (unsigned long)(2 * num_dbs + 1)) {
+          TEST_SYNC_POINT(
+              "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  bool s1 = true, s2 = true;
+  std::function<void(DB*)> write_slow_down = [&](DB* db) {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions write_op;
+    write_op.no_slowdown = false;
+    Status tmp = db->Put(write_op, Slice(key), DummyString(1));
+    InstrumentedMutexLock lock(&mutex);
+    s1 = s1 && tmp.ok();
+  };
+
+  std::function<void(DB*)> write_no_slow_down = [&](DB* db) {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions write_op;
+    write_op.no_slowdown = true;
+    Status tmp = db->Put(write_op, Slice(key), DummyString(1));
+    {
+      InstrumentedMutexLock lock(&mutex);
+      s2 = s2 && !tmp.ok();
+      w_no_slowdown.fetch_add(1);
+      if (w_slowdown_set.size() +
+              (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
+                              wait_count_db) ==
+          (unsigned long)(2 * num_dbs + 1)) {
+        TEST_SYNC_POINT(
+            "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+      }
+    }
+  };
+
+  // Flow:
+  // first thread will write but will be blocked (as Flush will on hold,
+  // buffer_size_ has exceeded, thus will create stall in effect).
+  //  |
+  //  |
+  //  multiple writer threads will be created to write across multiple columns
+  //  of db_ and different DBs with different values of
+  //  WriteOptions.no_slowdown. Some of them will be blocked and some of them
+  //  will return with Incomplete status.
+  //  |
+  //  |
+  //  Last writer thread will write and when its blocked/return it will signal
+  //  Flush to continue to clear the stall.
+  threads.emplace_back(write_slow_down, db_);
+  // Wait untill first thread writing to DB is blocked and then
+  // create the multiple writers.
+  {
+    InstrumentedMutexLock lock(&mutex);
+    while (wait_count_db != 1) {
+      cv.Wait();
+    }
+  }
+
+  for (int i = 0; i < num_dbs; i += 2) {
+    // Write to multiple columns of db_.
+    writer_threads.emplace_back(write_slow_down, db_);
+    writer_threads.emplace_back(write_no_slow_down, db_);
+    // Write to different DBs.
+    threads.emplace_back(write_slow_down, dbs[i]);
+    threads.emplace_back(write_no_slow_down, dbs[i + 1]);
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  for (auto& t : writer_threads) {
+    t.join();
+  }
+
+  ASSERT_TRUE(s1);
+  ASSERT_TRUE(s2);
+  // Number of DBs blocked.
+  ASSERT_EQ((num_dbs / 2) + 1, wait_count_db);
+  // Number of writer threads writing to db_ blocked from getting added to the
+  // queue.
+  ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2);
+  // Number of threads with WriteOptions.no_slowdown = true.
+  ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs);
+
+  // Clean up DBs.
+  for (int i = 0; i < num_dbs; i++) {
+    ASSERT_OK(dbs[i]->Close());
+    ASSERT_OK(DestroyDB(dbnames[i], options));
+    delete dbs[i];
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifndef ROCKSDB_LITE
+
+// Tests a `WriteBufferManager` constructed with `allow_stall == false` does not
+// thrash memtable switching when full and a CF receives multiple writes.
+// Instead, we expect to switch a CF's memtable for flush only when that CF does
+// not have any pending or running flush.
+//
+// This test uses multiple DBs each with a single CF instead of a single DB
+// with multiple CFs. That way we can control which CF is considered for switch
+// by writing to that CF's DB.
+//
+// Not supported in LITE mode due to `GetProperty()` unavailable.
+TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) {
+  Options options = CurrentOptions();
+  options.arena_block_size = 4 << 10;   // 4KB
+  options.write_buffer_size = 1 << 20;  // 1MB
+  std::shared_ptr<Cache> cache =
+      NewLRUCache(4 << 20 /* capacity (4MB) */, 2 /* num_shard_bits */);
+  ASSERT_LT(cache->GetUsage(), 256 << 10 /* 256KB */);
+  cost_cache_ = GetParam();
+  if (cost_cache_) {
+    options.write_buffer_manager.reset(new WriteBufferManager(
+        512 << 10 /* buffer_size (512KB) */, cache, false /* allow_stall */));
+  } else {
+    options.write_buffer_manager.reset(
+        new WriteBufferManager(512 << 10 /* buffer_size (512KB) */,
+                               nullptr /* cache */, false /* allow_stall */));
+  }
+
+  Reopen(options);
+  std::string dbname = test::PerThreadDBPath("db_shared_wbm_db");
+  DB* shared_wbm_db = nullptr;
+
+  ASSERT_OK(DestroyDB(dbname, options));
+  ASSERT_OK(DB::Open(options, dbname, &shared_wbm_db));
+
+  // The last write will make WBM need flush, but it won't flush yet.
+  ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
+  ASSERT_FALSE(options.write_buffer_manager->ShouldFlush());
+  ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
+  ASSERT_TRUE(options.write_buffer_manager->ShouldFlush());
+
+  // Flushes will be pending, not running because flush threads are blocked.
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
+
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(
+        shared_wbm_db->Put(WriteOptions(), Key(1), DummyString(1 /* len */)));
+    std::string prop;
+    ASSERT_TRUE(
+        shared_wbm_db->GetProperty("rocksdb.num-immutable-mem-table", &prop));
+    ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
+    ASSERT_TRUE(
+        shared_wbm_db->GetProperty("rocksdb.mem-table-flush-pending", &prop));
+    ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
+  }
+
+  // Clean up DBs.
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  ASSERT_OK(shared_wbm_db->Close());
+  ASSERT_OK(DestroyDB(dbname, options));
+  delete shared_wbm_db;
+}
+
+#endif  // ROCKSDB_LITE
+
+INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest,
+                        testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_write_test.cc b/src/rocksdb/db/db_write_test.cc
new file mode 100644
index 000000000..1011d5c9e
--- /dev/null
+++ b/src/rocksdb/db/db_write_test.cc
@@ -0,0 +1,679 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <fstream>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/write_batch_internal.h"
+#include "db/write_thread.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test variations of WriteImpl.
+class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
+ public:
+  DBWriteTest() : DBTestBase("db_write_test", /*env_do_fsync=*/true) {}
+
+  Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
+
+  void Open() { DBTestBase::Reopen(GetOptions()); }
+};
+
+class DBWriteTestUnparameterized : public DBTestBase {
+ public:
+  explicit DBWriteTestUnparameterized()
+      : DBTestBase("pipelined_write_test", /*env_do_fsync=*/false) {}
+};
+
+// It is invalid to do sync write while disabling WAL.
+TEST_P(DBWriteTest, SyncAndDisableWAL) {
+  WriteOptions write_options;
+  write_options.sync = true;
+  write_options.disableWAL = true;
+  ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument());
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("foo", "bar"));
+  ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument());
+}
+
+TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) {
+  Options options = GetOptions();
+  options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
+      4;
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+  port::Mutex mutex;
+  port::CondVar cv(&mutex);
+  // Guarded by mutex
+  int writers = 0;
+
+  Reopen(options);
+
+  std::function<void()> write_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = false;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    Status s = dbfull()->Put(wo, key, "bar");
+    ASSERT_TRUE(s.ok() || s.IsIncomplete());
+  };
+  std::function<void(void*)> unblock_main_thread_func = [&](void*) {
+    mutex.Lock();
+    ++writers;
+    cv.SignalAll();
+    mutex.Unlock();
+  };
+
+  // Create 3 L0 files and schedule 4th without waiting
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:1",
+        "DBImpl::BackgroundCallFlush:start"},
+       {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:2",
+        "DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"},
+       // Make compaction start wait for the write stall to be detected and
+       // implemented by a write group leader
+       {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:3",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Schedule creation of 4th L0 file without waiting. This will seal the
+  // memtable and then wait for a sync point before writing the file. We need
+  // to do it this way because SwitchMemtable() needs to enter the
+  // write_thread
+  FlushOptions fopt;
+  fopt.wait = false;
+  ASSERT_OK(dbfull()->Flush(fopt));
+
+  // Create a mix of slowdown/no_slowdown write threads
+  mutex.Lock();
+  // First leader
+  threads.emplace_back(write_slowdown_func);
+  while (writers != 1) {
+    cv.Wait();
+  }
+
+  // Second leader. Will stall writes
+  // Build a writers list with no slowdown in the middle:
+  //  +-------------+
+  //  | slowdown    +<----+ newest
+  //  +--+----------+
+  //     |
+  //     v
+  //  +--+----------+
+  //  | no slowdown |
+  //  +--+----------+
+  //     |
+  //     v
+  //  +--+----------+
+  //  | slowdown    +
+  //  +-------------+
+  threads.emplace_back(write_slowdown_func);
+  while (writers != 2) {
+    cv.Wait();
+  }
+  threads.emplace_back(write_no_slowdown_func);
+  while (writers != 3) {
+    cv.Wait();
+  }
+  threads.emplace_back(write_slowdown_func);
+  while (writers != 4) {
+    cv.Wait();
+  }
+
+  mutex.Unlock();
+
+  TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1");
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr));
+  // This would have triggered a write stall. Unblock the write group leader
+  TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2");
+  // The leader is going to create missing newer links. When the leader
+  // finishes, the next leader is going to delay writes and fail writers with
+  // no_slowdown
+
+  TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:3");
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
+  Options options = GetOptions();
+  options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
+      4;
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+  port::Mutex mutex;
+  port::CondVar cv(&mutex);
+  // Guarded by mutex
+  int writers = 0;
+
+  Reopen(options);
+
+  std::function<void()> write_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = false;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    Status s = dbfull()->Put(wo, key, "bar");
+    ASSERT_TRUE(s.ok() || s.IsIncomplete());
+  };
+  std::function<void(void*)> unblock_main_thread_func = [&](void*) {
+    mutex.Lock();
+    ++writers;
+    cv.SignalAll();
+    mutex.Unlock();
+  };
+
+  // Create 3 L0 files and schedule 4th without waiting
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBWriteTest::WriteThreadHangOnWriteStall:1",
+        "DBImpl::BackgroundCallFlush:start"},
+       {"DBWriteTest::WriteThreadHangOnWriteStall:2",
+        "DBImpl::WriteImpl:BeforeLeaderEnters"},
+       // Make compaction start wait for the write stall to be detected and
+       // implemented by a write group leader
+       {"DBWriteTest::WriteThreadHangOnWriteStall:3",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Schedule creation of 4th L0 file without waiting. This will seal the
+  // memtable and then wait for a sync point before writing the file. We need
+  // to do it this way because SwitchMemtable() needs to enter the
+  // write_thread
+  FlushOptions fopt;
+  fopt.wait = false;
+  ASSERT_OK(dbfull()->Flush(fopt));
+
+  // Create a mix of slowdown/no_slowdown write threads
+  mutex.Lock();
+  // First leader
+  threads.emplace_back(write_slowdown_func);
+  while (writers != 1) {
+    cv.Wait();
+  }
+  // Second leader. Will stall writes
+  threads.emplace_back(write_slowdown_func);
+  threads.emplace_back(write_no_slowdown_func);
+  threads.emplace_back(write_slowdown_func);
+  threads.emplace_back(write_no_slowdown_func);
+  threads.emplace_back(write_slowdown_func);
+  while (writers != 6) {
+    cv.Wait();
+  }
+  mutex.Unlock();
+
+  TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1");
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr));
+  // This would have triggered a write stall. Unblock the write group leader
+  TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2");
+  // The leader is going to create missing newer links. When the leader
+  // finishes, the next leader is going to delay writes and fail writers with
+  // no_slowdown
+
+  TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3");
+  for (auto& t : threads) {
+    t.join();
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
+  constexpr int kNumThreads = 5;
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = GetOptions();
+  options.env = mock_env.get();
+  Reopen(options);
+  std::atomic<int> ready_count{0};
+  std::atomic<int> leader_count{0};
+  std::vector<port::Thread> threads;
+  mock_env->SetFilesystemActive(false);
+
+  // Wait until all threads linked to write threads, to make sure
+  // all threads join the same batch group.
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+        ready_count++;
+        auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+        if (w->state == WriteThread::STATE_GROUP_LEADER) {
+          leader_count++;
+          while (ready_count < kNumThreads) {
+            // busy waiting
+          }
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  for (int i = 0; i < kNumThreads; i++) {
+    threads.push_back(port::Thread(
+        [&](int index) {
+          // All threads should fail.
+          auto res = Put("key" + std::to_string(index), "value");
+          if (options.manual_wal_flush) {
+            ASSERT_TRUE(res.ok());
+            // we should see fs error when we do the flush
+
+            // TSAN reports a false alarm for lock-order-inversion but Open and
+            // FlushWAL are not run concurrently. Disabling this until TSAN is
+            // fixed.
+            // res = dbfull()->FlushWAL(false);
+            // ASSERT_FALSE(res.ok());
+          } else {
+            ASSERT_FALSE(res.ok());
+          }
+        },
+        i));
+  }
+  for (int i = 0; i < kNumThreads; i++) {
+    threads[i].join();
+  }
+  ASSERT_EQ(1, leader_count);
+
+  // The Failed PUT operations can cause a BG error to be set.
+  // Mark it as Checked for the ASSERT_STATUS_CHECKED
+  dbfull()->Resume().PermitUncheckedError();
+
+  // Close before mock_env destruct.
+  Close();
+}
+
+TEST_F(DBWriteTestUnparameterized, PipelinedWriteRace) {
+  // This test was written to trigger a race in ExitAsBatchGroupLeader in case
+  // enable_pipelined_write_ was true.
+  // Writers for which ShouldWriteToMemtable() evaluates to false are removed
+  // from the write_group via CompleteFollower/ CompleteLeader. Writers in the
+  // middle of the group are fully unlinked, but if that writers is the
+  // last_writer, then we did not update the predecessor's link_older, i.e.,
+  // this writer was still reachable via newest_writer_.
+  //
+  // But the problem was, that CompleteFollower already wakes up the thread
+  // owning that writer before the writer has been removed. This resulted in a
+  // race - if the leader thread was fast enough, then everything was fine.
+  // However, if the woken up thread finished the current write operation and
+  // then performed yet another write, then a new writer instance was added
+  // to newest_writer_. It is possible that the new writer is located on the
+  // same address on stack, and if this happened, then we had a problem,
+  // because the old code tried to find the last_writer in the list to unlink
+  // it, which in this case produced a cycle in the list.
+  // Whether two invocations of PipelinedWriteImpl() by the same thread actually
+  // allocate the writer on the same address depends on the OS and/or compiler,
+  // so it is rather hard to create a deterministic test for this.
+
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.enable_pipelined_write = true;
+  std::vector<port::Thread> threads;
+
+  std::atomic<int> write_counter{0};
+  std::atomic<int> active_writers{0};
+  std::atomic<bool> second_write_starting{false};
+  std::atomic<bool> second_write_in_progress{false};
+  std::atomic<WriteThread::Writer*> leader{nullptr};
+  std::atomic<bool> finished_WAL_write{false};
+
+  DestroyAndReopen(options);
+
+  auto write_one_doc = [&]() {
+    int a = write_counter.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+    --active_writers;
+  };
+
+  auto write_two_docs = [&]() {
+    write_one_doc();
+    second_write_starting = true;
+    write_one_doc();
+  };
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+        if (second_write_starting.load()) {
+          second_write_in_progress = true;
+          return;
+        }
+        auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+        if (w->state == WriteThread::STATE_GROUP_LEADER) {
+          active_writers++;
+          if (leader.load() == nullptr) {
+            leader.store(w);
+            while (active_writers.load() < 2) {
+              // wait for another thread to join the write_group
+            }
+          }
+        } else {
+          // we disable the memtable for all followers so that they they are
+          // removed from the write_group before enqueuing it for the memtable
+          // write
+          w->disable_memtable = true;
+          active_writers++;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::ExitAsBatchGroupLeader:Start", [&](void* arg) {
+        auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
+        if (wg->leader == leader && !finished_WAL_write) {
+          finished_WAL_write = true;
+          while (active_writers.load() < 3) {
+            // wait for the new writer to be enqueued
+          }
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters",
+      [&](void* arg) {
+        auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
+        if (wg->leader == leader) {
+          while (!second_write_in_progress.load()) {
+            // wait for the old follower thread to start the next write
+          }
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // start leader + one follower
+  threads.emplace_back(write_one_doc);
+  while (leader.load() == nullptr) {
+    // wait for leader
+  }
+
+  // we perform two writes in the follower, so that for the second write
+  // the thread reinserts a Writer with the same address
+  threads.emplace_back(write_two_docs);
+
+  // wait for the leader to enter ExitAsBatchGroupLeader
+  while (!finished_WAL_write.load()) {
+    // wait for write_group to have finished the WAL writes
+  }
+
+  // start another writer thread to be enqueued before the leader can
+  // complete the writers from its write_group
+  threads.emplace_back(write_one_doc);
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, ManualWalFlushInEffect) {
+  Options options = GetOptions();
+  Reopen(options);
+  // try the 1st WAL created during open
+  ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok());
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+  ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+  ASSERT_TRUE(dbfull()->WALBufferIsEmpty());
+  // try the 2nd wal created during SwitchWAL
+  ASSERT_OK(dbfull()->TEST_SwitchWAL());
+  ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok());
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+  ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+  ASSERT_TRUE(dbfull()->WALBufferIsEmpty());
+}
+
+TEST_P(DBWriteTest, UnflushedPutRaceWithTrackedWalSync) {
+  // Repro race condition bug where unflushed WAL data extended the synced size
+  // recorded to MANIFEST despite being unrecoverable.
+  Options options = GetOptions();
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_env.get();
+  options.manual_wal_flush = true;
+  options.track_and_verify_wals_in_manifest = true;
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "val1"));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncWAL:Begin",
+      [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+  // Ensure callback ran.
+  ASSERT_EQ("val2", Get("key2"));
+
+  Close();
+
+  // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the
+  // DB WAL.
+  fault_env->DropUnsyncedFileData();
+
+  Reopen(options);
+
+  // Need to close before `fault_env` goes out of scope.
+  Close();
+}
+
+TEST_P(DBWriteTest, InactiveWalFullySyncedBeforeUntracked) {
+  // Repro bug where a WAL is appended and switched after
+  // `FlushWAL(true /* sync */)`'s sync finishes and before it untracks fully
+  // synced inactive logs. Previously such a WAL would be wrongly untracked
+  // so the final append would never be synced.
+  Options options = GetOptions();
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_env.get();
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "val1"));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncWAL:BeforeMarkLogsSynced:1", [this](void* /* arg */) {
+        ASSERT_OK(Put("key2", "val2"));
+        ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_OK(Put("key3", "val3"));
+
+  ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+  Close();
+
+  // Simulate full loss of unsynced data. This should drop nothing since we did
+  // `FlushWAL(true /* sync */)` before `Close()`.
+  fault_env->DropUnsyncedFileData();
+
+  Reopen(options);
+
+  ASSERT_EQ("val1", Get("key1"));
+  ASSERT_EQ("val2", Get("key2"));
+  ASSERT_EQ("val3", Get("key3"));
+
+  // Need to close before `fault_env` goes out of scope.
+  Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = GetOptions();
+  options.env = mock_env.get();
+  Reopen(options);
+  for (int i = 0; i < 2; i++) {
+    // Forcibly fail WAL write for the first Put only. Subsequent Puts should
+    // fail due to read-only mode
+    mock_env->SetFilesystemActive(i != 0);
+    auto res = Put("key" + std::to_string(i), "value");
+    // TSAN reports a false alarm for lock-order-inversion but Open and
+    // FlushWAL are not run concurrently. Disabling this until TSAN is
+    // fixed.
+    /*
+    if (options.manual_wal_flush && i == 0) {
+      // even with manual_wal_flush the 2nd Put should return error because of
+      // the read-only mode
+      ASSERT_TRUE(res.ok());
+      // we should see fs error when we do the flush
+      res = dbfull()->FlushWAL(false);
+    }
+    */
+    if (!options.manual_wal_flush) {
+      ASSERT_NOK(res);
+    } else {
+      ASSERT_OK(res);
+    }
+  }
+  // Close before mock_env destruct.
+  Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) {
+  Random rnd(301);
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = GetOptions();
+  options.env = mock_env.get();
+  options.writable_file_max_buffer_size = 4 * 1024 * 1024;
+  options.write_buffer_size = 3 * 512 * 1024;
+  options.wal_bytes_per_sync = 256 * 1024;
+  options.manual_wal_flush = true;
+  Reopen(options);
+  mock_env->SetFilesystemActive(false, Status::IOError("Not active"));
+  Status s;
+  for (int i = 0; i < 4 * 512; ++i) {
+    s = Put(Key(i), rnd.RandomString(1024));
+    if (!s.ok()) {
+      break;
+    }
+  }
+  ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
+
+  mock_env->SetFilesystemActive(true);
+  // Close before mock_env destruct.
+  Close();
+}
+
+// Test that db->LockWAL() flushes the WAL after locking.
+TEST_P(DBWriteTest, LockWalInEffect) {
+  Options options = GetOptions();
+  Reopen(options);
+  // try the 1st WAL created during open
+  ASSERT_OK(Put("key" + std::to_string(0), "value"));
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+  ASSERT_OK(dbfull()->LockWAL());
+  ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false));
+  ASSERT_OK(dbfull()->UnlockWAL());
+  // try the 2nd wal created during SwitchWAL
+  ASSERT_OK(dbfull()->TEST_SwitchWAL());
+  ASSERT_OK(Put("key" + std::to_string(0), "value"));
+  ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+  ASSERT_OK(dbfull()->LockWAL());
+  ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false));
+  ASSERT_OK(dbfull()->UnlockWAL());
+}
+
+TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) {
+  Options options = GetOptions();
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  Reopen(options);
+  std::string wal_key_prefix = "WAL_KEY_";
+  std::string no_wal_key_prefix = "K_";
+  // 100 KB value each for NO-WAL operation
+  std::string no_wal_value(1024 * 100, 'X');
+  // 1B value each for WAL operation
+  std::string wal_value = "0";
+  std::thread threads[10];
+  for (int t = 0; t < 10; t++) {
+    threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix,
+                              no_wal_value, this] {
+      for (int i = 0; i < 10; i++) {
+        ROCKSDB_NAMESPACE::WriteOptions write_option_disable;
+        write_option_disable.disableWAL = true;
+        ROCKSDB_NAMESPACE::WriteOptions write_option_default;
+        std::string no_wal_key =
+            no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i);
+        ASSERT_OK(this->Put(no_wal_key, no_wal_value, write_option_disable));
+        std::string wal_key =
+            wal_key_prefix + std::to_string(i) + "_" + std::to_string(i);
+        ASSERT_OK(this->Put(wal_key, wal_value, write_option_default));
+        ASSERT_OK(dbfull()->SyncWAL());
+      }
+      return;
+    });
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  uint64_t bytes_num = options.statistics->getTickerCount(
+      ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES);
+  // written WAL size should less than 100KB (even included HEADER & FOOTER
+  // overhead)
+  ASSERT_LE(bytes_num, 1024 * 100);
+}
+
+INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
+                        testing::Values(DBTestBase::kDefault,
+                                        DBTestBase::kConcurrentWALWrites,
+                                        DBTestBase::kPipelinedWrite));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/dbformat.cc b/src/rocksdb/db/dbformat.cc
new file mode 100644
index 000000000..b0ac6c339
--- /dev/null
+++ b/src/rocksdb/db/dbformat.cc
@@ -0,0 +1,188 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h"
+
+#include <stdio.h>
+
+#include <cinttypes>
+
+#include "db/lookup_key.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+const ValueType kValueTypeForSeek = kTypeWideColumnEntity;
+const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
+const std::string kDisableUserTimestamp("");
+
+EntryType GetEntryType(ValueType value_type) {
+  switch (value_type) {
+    case kTypeValue:
+      return kEntryPut;
+    case kTypeDeletion:
+      return kEntryDelete;
+    case kTypeDeletionWithTimestamp:
+      return kEntryDeleteWithTimestamp;
+    case kTypeSingleDeletion:
+      return kEntrySingleDelete;
+    case kTypeMerge:
+      return kEntryMerge;
+    case kTypeRangeDeletion:
+      return kEntryRangeDeletion;
+    case kTypeBlobIndex:
+      return kEntryBlobIndex;
+    case kTypeWideColumnEntity:
+      return kEntryWideColumnEntity;
+    default:
+      return kEntryOther;
+  }
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+  result->append(key.user_key.data(), key.user_key.size());
+  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+void AppendInternalKeyWithDifferentTimestamp(std::string* result,
+                                             const ParsedInternalKey& key,
+                                             const Slice& ts) {
+  assert(key.user_key.size() >= ts.size());
+  result->append(key.user_key.data(), key.user_key.size() - ts.size());
+  result->append(ts.data(), ts.size());
+  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+                             ValueType t) {
+  PutFixed64(result, PackSequenceAndType(s, t));
+}
+
+void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
+                               size_t ts_sz) {
+  assert(ts_sz > 0);
+  const std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
+  result->append(key.data(), key.size());
+  result->append(kTsMin.data(), ts_sz);
+}
+
+void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
+                               size_t ts_sz) {
+  assert(ts_sz > 0);
+  const std::string kTsMax(ts_sz, static_cast<unsigned char>(0xff));
+  result->append(key.data(), key.size());
+  result->append(kTsMax.data(), ts_sz);
+}
+
+void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
+                                   size_t ts_sz) {
+  assert(ts_sz > 0);
+  result->append(key.data(), key.size() - ts_sz);
+
+  static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+  if (ts_sz < strlen(kTsMax)) {
+    result->append(kTsMax, ts_sz);
+  } else {
+    result->append(std::string(ts_sz, '\xff'));
+  }
+}
+
+std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const {
+  std::string result = "'";
+  if (log_err_key) {
+    result += user_key.ToString(hex);
+  } else {
+    result += "<redacted>";
+  }
+
+  char buf[50];
+  snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence,
+           static_cast<int>(type));
+
+  result += buf;
+  return result;
+}
+
+std::string InternalKey::DebugString(bool hex) const {
+  std::string result;
+  ParsedInternalKey parsed;
+  if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) {
+    result = parsed.DebugString(true /* log_err_key */, hex);  // TODO
+  } else {
+    result = "(bad)";
+    result.append(EscapeString(rep_));
+  }
+  return result;
+}
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+                                   const ParsedInternalKey& b) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_.Compare(a.user_key, b.user_key);
+  if (r == 0) {
+    if (a.sequence > b.sequence) {
+      r = -1;
+    } else if (a.sequence < b.sequence) {
+      r = +1;
+    } else if (a.type > b.type) {
+      r = -1;
+    } else if (a.type < b.type) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
+                     const Slice* ts) {
+  size_t usize = _user_key.size();
+  size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
+  size_t needed = usize + ts_sz + 13;  // A conservative estimate
+  char* dst;
+  if (needed <= sizeof(space_)) {
+    dst = space_;
+  } else {
+    dst = new char[needed];
+  }
+  start_ = dst;
+  // NOTE: We don't support users keys of more than 2GB :)
+  dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
+  kstart_ = dst;
+  memcpy(dst, _user_key.data(), usize);
+  dst += usize;
+  if (nullptr != ts) {
+    memcpy(dst, ts->data(), ts_sz);
+    dst += ts_sz;
+  }
+  EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+  dst += 8;
+  end_ = dst;
+}
+
+void IterKey::EnlargeBuffer(size_t key_size) {
+  // If size is smaller than buffer size, continue using current buffer,
+  // or the static allocated one, as default
+  assert(key_size > buf_size_);
+  // Need to enlarge the buffer.
+  ResetBuffer();
+  buf_ = new char[key_size];
+  buf_size_ = key_size;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat.h b/src/rocksdb/db/dbformat.h
new file mode 100644
index 000000000..8c1fc7055
--- /dev/null
+++ b/src/rocksdb/db/dbformat.h
@@ -0,0 +1,865 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file declares data structures and functions that deal with internal
+// keys.
+// Each internal key contains a user key, a sequence number (SequenceNumber)
+// and a type (ValueType), and they are usually encoded together.
+// There are some related helper classes here.
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+// The highest bit of the value type needs to be reserved to SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1,
+  kTypeMerge = 0x2,
+  kTypeLogData = 0x3,               // WAL only.
+  kTypeColumnFamilyDeletion = 0x4,  // WAL only.
+  kTypeColumnFamilyValue = 0x5,     // WAL only.
+  kTypeColumnFamilyMerge = 0x6,     // WAL only.
+  kTypeSingleDeletion = 0x7,
+  kTypeColumnFamilySingleDeletion = 0x8,  // WAL only.
+  kTypeBeginPrepareXID = 0x9,             // WAL only.
+  kTypeEndPrepareXID = 0xA,               // WAL only.
+  kTypeCommitXID = 0xB,                   // WAL only.
+  kTypeRollbackXID = 0xC,                 // WAL only.
+  kTypeNoop = 0xD,                        // WAL only.
+  kTypeColumnFamilyRangeDeletion = 0xE,   // WAL only.
+  kTypeRangeDeletion = 0xF,               // meta block
+  kTypeColumnFamilyBlobIndex = 0x10,      // Blob DB only
+  kTypeBlobIndex = 0x11,                  // Blob DB only
+  // When the prepared record is also persisted in db, we use a different
+  // record. This is to ensure that the WAL that is generated by a WritePolicy
+  // is not mistakenly read by another, which would result into data
+  // inconsistency.
+  kTypeBeginPersistedPrepareXID = 0x12,  // WAL only.
+  // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL
+  // generated by WriteUnprepared write policy is not mistakenly read by
+  // another.
+  kTypeBeginUnprepareXID = 0x13,  // WAL only.
+  kTypeDeletionWithTimestamp = 0x14,
+  kTypeCommitXIDAndTimestamp = 0x15,  // WAL only
+  kTypeWideColumnEntity = 0x16,
+  kTypeColumnFamilyWideColumnEntity = 0x17,  // WAL only
+  kTypeMaxValid,    // Should be after the last valid type, only used for
+                    // validation
+  kMaxValue = 0x7F  // Not used for storing records.
+};
+
+// Defined in dbformat.cc
+extern const ValueType kValueTypeForSeek;
+extern const ValueType kValueTypeForSeekForPrev;
+
+// Checks whether a type is an inline value type
+// (i.e. a type used in memtable skiplist and sst file datablock).
+inline bool IsValueType(ValueType t) {
+  return t <= kTypeMerge || kTypeSingleDeletion == t || kTypeBlobIndex == t ||
+         kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t;
+}
+
+// Checks whether a type is from user operation
+// kTypeRangeDeletion is in meta block so this API is separated from above
+inline bool IsExtendedValueType(ValueType t) {
+  return IsValueType(t) || t == kTypeRangeDeletion;
+}
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
+
+static const SequenceNumber kDisableGlobalSequenceNumber =
+    std::numeric_limits<uint64_t>::max();
+
+constexpr uint64_t kNumInternalBytes = 8;
+
+// Defined in dbformat.cc
+extern const std::string kDisableUserTimestamp;
+
+// The data structure that represents an internal key in the way that user_key,
+// sequence number and type are stored in separated forms.
+struct ParsedInternalKey {
+  Slice user_key;
+  SequenceNumber sequence;
+  ValueType type;
+
+  ParsedInternalKey()
+      : sequence(kMaxSequenceNumber),
+        type(kTypeDeletion)  // Make code analyzer happy
+  {}                         // Intentionally left uninitialized (for speed)
+  // u contains timestamp if user timestamp feature is enabled.
+  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+      : user_key(u), sequence(seq), type(t) {}
+  std::string DebugString(bool log_err_key, bool hex) const;
+
+  void clear() {
+    user_key.clear();
+    sequence = 0;
+    type = kTypeDeletion;
+  }
+
+  void SetTimestamp(const Slice& ts) {
+    assert(ts.size() <= user_key.size());
+    const char* addr = user_key.data() + user_key.size() - ts.size();
+    memcpy(const_cast<char*>(addr), ts.data(), ts.size());
+  }
+
+  Slice GetTimestamp(size_t ts_sz) {
+    assert(ts_sz <= user_key.size());
+    const char* addr = user_key.data() + user_key.size() - ts_sz;
+    return Slice(const_cast<char*>(addr), ts_sz);
+  }
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + kNumInternalBytes;
+}
+
+// Pack a sequence number and a ValueType into a uint64_t
+inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+  assert(seq <= kMaxSequenceNumber);
+  // kTypeMaxValid is used in TruncatedRangeDelIterator, see its constructor.
+  assert(IsExtendedValueType(t) || t == kTypeMaxValid);
+  return (seq << 8) | t;
+}
+
+// Given the result of PackSequenceAndType, store the sequence number in *seq
+// and the ValueType in *t.
+inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq,
+                                  ValueType* t) {
+  *seq = packed >> 8;
+  *t = static_cast<ValueType>(packed & 0xff);
+
+  // Commented the following two assertions in order to test key-value checksum
+  // on corrupted keys without crashing ("DbKvChecksumTest").
+  // assert(*seq <= kMaxSequenceNumber);
+  // assert(IsExtendedValueType(*t));
+}
+
+EntryType GetEntryType(ValueType value_type);
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+                              const ParsedInternalKey& key);
+
+// Append the serialization of "key" to *result, replacing the original
+// timestamp with argument ts.
+extern void AppendInternalKeyWithDifferentTimestamp(
+    std::string* result, const ParsedInternalKey& key, const Slice& ts);
+
+// Serialized internal key consists of user key followed by footer.
+// This function appends the footer to *result, assuming that *result already
+// contains the user key at the end.
+extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+                                    ValueType t);
+
+// Append the key and a minimal timestamp to *result
+extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
+                                      size_t ts_sz);
+
+// Append the key and a maximal timestamp to *result
+extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
+                                      size_t ts_sz);
+
+// `key` is a user key with timestamp. Append the user key without timestamp
+// and the maximal timestamp to *result.
+extern void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
+                                          size_t ts_sz);
+
+// Attempt to parse an internal key from "internal_key".  On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern Status ParseInternalKey(const Slice& internal_key,
+                               ParsedInternalKey* result, bool log_err_key);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= kNumInternalBytes);
+  return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes);
+}
+
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+                                             size_t ts_sz) {
+  Slice ret = internal_key;
+  ret.remove_suffix(kNumInternalBytes + ts_sz);
+  return ret;
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+  Slice ret = user_key;
+  ret.remove_suffix(ts_sz);
+  return ret;
+}
+
+inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+  assert(user_key.size() >= ts_sz);
+  return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz);
+}
+
+inline Slice ExtractTimestampFromKey(const Slice& internal_key, size_t ts_sz) {
+  const size_t key_size = internal_key.size();
+  assert(key_size >= kNumInternalBytes + ts_sz);
+  return Slice(internal_key.data() + key_size - ts_sz - kNumInternalBytes,
+               ts_sz);
+}
+
+inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
+  assert(internal_key.size() >= kNumInternalBytes);
+  const size_t n = internal_key.size();
+  return DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  uint64_t num = ExtractInternalKeyFooter(internal_key);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator
+#ifdef NDEBUG
+    final
+#endif
+    : public CompareInterface {
+ private:
+  UserComparatorWrapper user_comparator_;
+
+ public:
+  // `InternalKeyComparator`s constructed with the default constructor are not
+  // usable and will segfault on any attempt to use them for comparisons.
+  InternalKeyComparator() = default;
+
+  // @param named If true, assign a name to this comparator based on the
+  //    underlying comparator's name. This involves an allocation and copy in
+  //    this constructor to precompute the result of `Name()`. To avoid this
+  //    overhead, set `named` to false. In that case, `Name()` will return a
+  //    generic name that is non-specific to the underlying comparator.
+  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) {}
+  virtual ~InternalKeyComparator() {}
+
+  int Compare(const Slice& a, const Slice& b) const override;
+
+  bool Equal(const Slice& a, const Slice& b) const {
+    // TODO Use user_comparator_.Equal(). Perhaps compare seqno before
+    // comparing the user key too.
+    return Compare(a, b) == 0;
+  }
+
+  // Same as Compare except that it excludes the value type from comparison
+  int CompareKeySeq(const Slice& a, const Slice& b) const;
+
+  const Comparator* user_comparator() const {
+    return user_comparator_.user_comparator();
+  }
+
+  int Compare(const InternalKey& a, const InternalKey& b) const;
+  int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+  // In this `Compare()` overload, the sequence numbers provided in
+  // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a`
+  // and `b`, respectively. To disable sequence number override(s), provide the
+  // value `kDisableGlobalSequenceNumber`.
+  int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b,
+              SequenceNumber b_global_seqno) const;
+};
+
+// The class represent the internal key in encoded form.
+class InternalKey {
+ private:
+  std::string rep_;
+
+ public:
+  InternalKey() {}  // Leave rep_ as empty to indicate it is invalid
+  InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
+  }
+  InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t, Slice ts) {
+    AppendInternalKeyWithDifferentTimestamp(
+        &rep_, ParsedInternalKey(_user_key, s, t), ts);
+  }
+
+  // sets the internal key to be bigger or equal to all internal keys with this
+  // user key
+  void SetMaxPossibleForUserKey(const Slice& _user_key) {
+    AppendInternalKey(
+        &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
+  }
+
+  // sets the internal key to be smaller or equal to all internal keys with this
+  // user key
+  void SetMinPossibleForUserKey(const Slice& _user_key) {
+    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
+                                               kValueTypeForSeek));
+  }
+
+  bool Valid() const {
+    ParsedInternalKey parsed;
+    return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */)
+                .ok());  // TODO
+  }
+
+  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const { return ExtractUserKey(rep_); }
+  size_t size() const { return rep_.size(); }
+
+  void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
+    SetFrom(ParsedInternalKey(_user_key, s, t));
+  }
+
+  void Set(const Slice& _user_key_with_ts, SequenceNumber s, ValueType t,
+           const Slice& ts) {
+    ParsedInternalKey pik = ParsedInternalKey(_user_key_with_ts, s, t);
+    // Should not call pik.SetTimestamp() directly as it overwrites the buffer
+    // containing _user_key.
+    SetFrom(pik, ts);
+  }
+
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  void SetFrom(const ParsedInternalKey& p, const Slice& ts) {
+    rep_.clear();
+    AppendInternalKeyWithDifferentTimestamp(&rep_, p, ts);
+  }
+
+  void Clear() { rep_.clear(); }
+
+  // The underlying representation.
+  // Intended only to be used together with ConvertFromUserKey().
+  std::string* rep() { return &rep_; }
+
+  // Assuming that *rep() contains a user key, this method makes internal key
+  // out of it in-place. This saves a memcpy compared to Set()/SetFrom().
+  void ConvertFromUserKey(SequenceNumber s, ValueType t) {
+    AppendInternalKeyFooter(&rep_, s, t);
+  }
+
+  std::string DebugString(bool hex) const;
+};
+
+inline int InternalKeyComparator::Compare(const InternalKey& a,
+                                          const InternalKey& b) const {
+  return Compare(a.Encode(), b.Encode());
+}
+
+inline Status ParseInternalKey(const Slice& internal_key,
+                               ParsedInternalKey* result, bool log_err_key) {
+  const size_t n = internal_key.size();
+
+  if (n < kNumInternalBytes) {
+    return Status::Corruption("Corrupted Key: Internal Key too small. Size=" +
+                              std::to_string(n) + ". ");
+  }
+
+  uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  assert(result->type <= ValueType::kMaxValue);
+  result->user_key = Slice(internal_key.data(), n - kNumInternalBytes);
+
+  if (IsExtendedValueType(result->type)) {
+    return Status::OK();
+  } else {
+    return Status::Corruption("Corrupted Key",
+                              result->DebugString(log_err_key, true));
+  }
+}
+
+// Update the sequence number in the internal key.
+// Guarantees not to invalidate ikey.data().
+inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
+  size_t ikey_sz = ikey->size();
+  assert(ikey_sz >= kNumInternalBytes);
+  uint64_t newval = (seq << 8) | t;
+
+  // Note: Since C++11, strings are guaranteed to be stored contiguously and
+  // string::operator[]() is guaranteed not to change ikey.data().
+  EncodeFixed64(&(*ikey)[ikey_sz - kNumInternalBytes], newval);
+}
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+  const size_t n = internal_key.size();
+  assert(n >= kNumInternalBytes);
+  uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+  return num >> 8;
+}
+
+// The class to store keys in an efficient way. It allows:
+// 1. Users can either copy the key into it, or have it point to an unowned
+//    address.
+// 2. For copied key, a short inline buffer is kept to reduce memory
+//    allocation for smaller keys.
+// 3. It tracks user key or internal key, and allow conversion between them.
+class IterKey {
+ public:
+  IterKey()
+      : buf_(space_),
+        key_(buf_),
+        key_size_(0),
+        buf_size_(sizeof(space_)),
+        is_user_key_(true) {}
+  // No copying allowed
+  IterKey(const IterKey&) = delete;
+  void operator=(const IterKey&) = delete;
+
+  ~IterKey() { ResetBuffer(); }
+
+  // The bool will be picked up by the next calls to SetKey
+  void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
+
+  // Returns the key in whichever format that was provided to KeyIter
+  // If user-defined timestamp is enabled, then timestamp is included in the
+  // return result.
+  Slice GetKey() const { return Slice(key_, key_size_); }
+
+  Slice GetInternalKey() const {
+    assert(!IsUserKey());
+    return Slice(key_, key_size_);
+  }
+
+  // If user-defined timestamp is enabled, then timestamp is included in the
+  // return result of GetUserKey();
+  Slice GetUserKey() const {
+    if (IsUserKey()) {
+      return Slice(key_, key_size_);
+    } else {
+      assert(key_size_ >= kNumInternalBytes);
+      return Slice(key_, key_size_ - kNumInternalBytes);
+    }
+  }
+
+  size_t Size() const { return key_size_; }
+
+  void Clear() { key_size_ = 0; }
+
+  // Append "non_shared_data" to its back, from "shared_len"
+  // This function is used in Block::Iter::ParseNextKey
+  // shared_len: bytes in [0, shard_len-1] would be remained
+  // non_shared_data: data to be append, its length must be >= non_shared_len
+  void TrimAppend(const size_t shared_len, const char* non_shared_data,
+                  const size_t non_shared_len) {
+    assert(shared_len <= key_size_);
+    size_t total_size = shared_len + non_shared_len;
+
+    if (IsKeyPinned() /* key is not in buf_ */) {
+      // Copy the key from external memory to buf_ (copy shared_len bytes)
+      EnlargeBufferIfNeeded(total_size);
+      memcpy(buf_, key_, shared_len);
+    } else if (total_size > buf_size_) {
+      // Need to allocate space, delete previous space
+      char* p = new char[total_size];
+      memcpy(p, key_, shared_len);
+
+      if (buf_ != space_) {
+        delete[] buf_;
+      }
+
+      buf_ = p;
+      buf_size_ = total_size;
+    }
+
+    memcpy(buf_ + shared_len, non_shared_data, non_shared_len);
+    key_ = buf_;
+    key_size_ = total_size;
+  }
+
+  Slice SetKey(const Slice& key, bool copy = true) {
+    // is_user_key_ expected to be set already via SetIsUserKey
+    return SetKeyImpl(key, copy);
+  }
+
+  // If user-defined timestamp is enabled, then `key` includes timestamp.
+  // TODO(yanqin) this is also used to set prefix, which do not include
+  // timestamp. Should be handled.
+  Slice SetUserKey(const Slice& key, bool copy = true) {
+    is_user_key_ = true;
+    return SetKeyImpl(key, copy);
+  }
+
+  Slice SetInternalKey(const Slice& key, bool copy = true) {
+    is_user_key_ = false;
+    return SetKeyImpl(key, copy);
+  }
+
+  // Copies the content of key, updates the reference to the user key in ikey
+  // and returns a Slice referencing the new copy.
+  Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {
+    size_t key_n = key.size();
+    assert(key_n >= kNumInternalBytes);
+    SetInternalKey(key);
+    ikey->user_key = Slice(key_, key_n - kNumInternalBytes);
+    return Slice(key_, key_n);
+  }
+
+  // Copy the key into IterKey own buf_
+  void OwnKey() {
+    assert(IsKeyPinned() == true);
+
+    Reserve(key_size_);
+    memcpy(buf_, key_, key_size_);
+    key_ = buf_;
+  }
+
+  // Update the sequence number in the internal key.  Guarantees not to
+  // invalidate slices to the key (and the user key).
+  void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
+    assert(!IsKeyPinned());
+    assert(key_size_ >= kNumInternalBytes);
+    if (ts) {
+      assert(key_size_ >= kNumInternalBytes + ts->size());
+      memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(),
+             ts->size());
+    }
+    uint64_t newval = (seq << 8) | t;
+    EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
+  }
+
+  bool IsKeyPinned() const { return (key_ != buf_); }
+
+  // If `ts` is provided, user_key should not contain timestamp,
+  // and `ts` is appended after user_key.
+  // TODO: more efficient storage for timestamp.
+  void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
+                      SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek,
+                      const Slice* ts = nullptr) {
+    size_t psize = key_prefix.size();
+    size_t usize = user_key.size();
+    size_t ts_sz = (ts != nullptr ? ts->size() : 0);
+    EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t) + ts_sz);
+    if (psize > 0) {
+      memcpy(buf_, key_prefix.data(), psize);
+    }
+    memcpy(buf_ + psize, user_key.data(), usize);
+    if (ts) {
+      memcpy(buf_ + psize + usize, ts->data(), ts_sz);
+    }
+    EncodeFixed64(buf_ + usize + psize + ts_sz,
+                  PackSequenceAndType(s, value_type));
+
+    key_ = buf_;
+    key_size_ = psize + usize + sizeof(uint64_t) + ts_sz;
+    is_user_key_ = false;
+  }
+
+  void SetInternalKey(const Slice& user_key, SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek,
+                      const Slice* ts = nullptr) {
+    SetInternalKey(Slice(), user_key, s, value_type, ts);
+  }
+
+  void Reserve(size_t size) {
+    EnlargeBufferIfNeeded(size);
+    key_size_ = size;
+  }
+
+  void SetInternalKey(const ParsedInternalKey& parsed_key) {
+    SetInternalKey(Slice(), parsed_key);
+  }
+
+  void SetInternalKey(const Slice& key_prefix,
+                      const ParsedInternalKey& parsed_key_suffix) {
+    SetInternalKey(key_prefix, parsed_key_suffix.user_key,
+                   parsed_key_suffix.sequence, parsed_key_suffix.type);
+  }
+
+  void EncodeLengthPrefixedKey(const Slice& key) {
+    auto size = key.size();
+    EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
+    char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));
+    memcpy(ptr, key.data(), size);
+    key_ = buf_;
+    is_user_key_ = true;
+  }
+
+  bool IsUserKey() const { return is_user_key_; }
+
+ private:
+  char* buf_;
+  const char* key_;
+  size_t key_size_;
+  size_t buf_size_;
+  char space_[32];  // Avoid allocation for short keys
+  bool is_user_key_;
+
+  Slice SetKeyImpl(const Slice& key, bool copy) {
+    size_t size = key.size();
+    if (copy) {
+      // Copy key to buf_
+      EnlargeBufferIfNeeded(size);
+      memcpy(buf_, key.data(), size);
+      key_ = buf_;
+    } else {
+      // Update key_ to point to external memory
+      key_ = key.data();
+    }
+    key_size_ = size;
+    return Slice(key_, key_size_);
+  }
+
+  void ResetBuffer() {
+    if (buf_ != space_) {
+      delete[] buf_;
+      buf_ = space_;
+    }
+    buf_size_ = sizeof(space_);
+    key_size_ = 0;
+  }
+
+  // Enlarge the buffer size if needed based on key_size.
+  // By default, static allocated buffer is used. Once there is a key
+  // larger than the static allocated buffer, another buffer is dynamically
+  // allocated, until a larger key buffer is requested. In that case, we
+  // reallocate buffer and delete the old one.
+  void EnlargeBufferIfNeeded(size_t key_size) {
+    // If size is smaller than buffer size, continue using current buffer,
+    // or the static allocated one, as default
+    if (key_size > buf_size_) {
+      EnlargeBuffer(key_size);
+    }
+  }
+
+  void EnlargeBuffer(size_t key_size);
+};
+
+// Convert from a SliceTransform of user keys, to a SliceTransform of
+// internal keys.
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+  explicit InternalKeySliceTransform(const SliceTransform* transform)
+      : transform_(transform) {}
+
+  virtual const char* Name() const override { return transform_->Name(); }
+
+  virtual Slice Transform(const Slice& src) const override {
+    auto user_key = ExtractUserKey(src);
+    return transform_->Transform(user_key);
+  }
+
+  virtual bool InDomain(const Slice& src) const override {
+    auto user_key = ExtractUserKey(src);
+    return transform_->InDomain(user_key);
+  }
+
+  virtual bool InRange(const Slice& dst) const override {
+    auto user_key = ExtractUserKey(dst);
+    return transform_->InRange(user_key);
+  }
+
+  const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like comparator, InternalKeySliceTransform will not take care of the
+  // deletion of transform_
+  const SliceTransform* const transform_;
+};
+
+// Read the key of a record from a write batch.
+// if this record represent the default column family then cf_record
+// must be passed as false, otherwise it must be passed as true.
+extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,
+                                       bool cf_record);
+
+// Read record from a write batch piece from input.
+// tag, column_family, key, value and blob are return values. Callers own the
+// slice they point to.
+// Tag is defined as ValueType.
+// input will be advanced to after the record.
+// If user-defined timestamp is enabled for a column family, then the `key`
+// resulting from this call will include timestamp.
+extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+                                       uint32_t* column_family, Slice* key,
+                                       Slice* value, Slice* blob, Slice* xid);
+
+// When user call DeleteRange() to delete a range of keys,
+// we will store a serialized RangeTombstone in MemTable and SST.
+// the struct here is an easy-understood form
+// start/end_key_ is the start/end user key of the range to be deleted
+struct RangeTombstone {
+  Slice start_key_;
+  Slice end_key_;
+  SequenceNumber seq_;
+  // TODO: we should optimize the storage here when user-defined timestamp
+  //  is NOT enabled: they currently take up (16 + 32 + 32) bytes per tombstone.
+  Slice ts_;
+  std::string pinned_start_key_;
+  std::string pinned_end_key_;
+
+  RangeTombstone() = default;
+  RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)
+      : start_key_(sk), end_key_(ek), seq_(sn) {}
+
+  // User-defined timestamp is enabled, `sk` and `ek` should be user key
+  // with timestamp, `ts` will replace the timestamps in `sk` and
+  // `ek`.
+  RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts)
+      : seq_(sn), ts_(ts) {
+    assert(!ts.empty());
+    pinned_start_key_.reserve(sk.size());
+    pinned_start_key_.append(sk.data(), sk.size() - ts.size());
+    pinned_start_key_.append(ts.data(), ts.size());
+    pinned_end_key_.reserve(ek.size());
+    pinned_end_key_.append(ek.data(), ek.size() - ts.size());
+    pinned_end_key_.append(ts.data(), ts.size());
+    start_key_ = pinned_start_key_;
+    end_key_ = pinned_end_key_;
+  }
+
+  RangeTombstone(ParsedInternalKey parsed_key, Slice value) {
+    start_key_ = parsed_key.user_key;
+    seq_ = parsed_key.sequence;
+    end_key_ = value;
+  }
+
+  // be careful to use Serialize(), allocates new memory
+  std::pair<InternalKey, Slice> Serialize() const {
+    auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);
+    return std::make_pair(std::move(key), end_key_);
+  }
+
+  // be careful to use SerializeKey(), allocates new memory
+  InternalKey SerializeKey() const {
+    return InternalKey(start_key_, seq_, kTypeRangeDeletion);
+  }
+
+  // The tombstone end-key is exclusive, so we generate an internal-key here
+  // which has a similar property. Using kMaxSequenceNumber guarantees that
+  // the returned internal-key will compare less than any other internal-key
+  // with the same user-key. This in turn guarantees that the serialized
+  // end-key for a tombstone such as [a-b] will compare less than the key "b".
+  //
+  // be careful to use SerializeEndKey(), allocates new memory
+  InternalKey SerializeEndKey() const {
+    if (!ts_.empty()) {
+      static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+      if (ts_.size() <= strlen(kTsMax)) {
+        return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
+                           Slice(kTsMax, ts_.size()));
+      } else {
+        return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
+                           std::string(ts_.size(), '\xff'));
+      }
+    }
+    return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
+  }
+};
+
+inline int InternalKeyComparator::Compare(const Slice& akey,
+                                          const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  if (r == 0) {
+    const uint64_t anum =
+        DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes);
+    const uint64_t bnum =
+        DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes);
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
+                                                const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  if (r == 0) {
+    // Shift the number to exclude the last byte which contains the value type
+    const uint64_t anum =
+        DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes) >> 8;
+    const uint64_t bnum =
+        DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes) >> 8;
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+inline int InternalKeyComparator::Compare(const Slice& a,
+                                          SequenceNumber a_global_seqno,
+                                          const Slice& b,
+                                          SequenceNumber b_global_seqno) const {
+  int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b));
+  if (r == 0) {
+    uint64_t a_footer, b_footer;
+    if (a_global_seqno == kDisableGlobalSequenceNumber) {
+      a_footer = ExtractInternalKeyFooter(a);
+    } else {
+      a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a));
+    }
+    if (b_global_seqno == kDisableGlobalSequenceNumber) {
+      b_footer = ExtractInternalKeyFooter(b);
+    } else {
+      b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b));
+    }
+    if (a_footer > b_footer) {
+      r = -1;
+    } else if (a_footer < b_footer) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey.
+struct ParsedInternalKeyComparator {
+  explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
+      : cmp(c) {}
+
+  bool operator()(const ParsedInternalKey& a,
+                  const ParsedInternalKey& b) const {
+    return cmp->Compare(a, b) < 0;
+  }
+
+  const InternalKeyComparator* cmp;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat_test.cc b/src/rocksdb/db/dbformat_test.cc
new file mode 100644
index 000000000..8dc3387df
--- /dev/null
+++ b/src/rocksdb/db/dbformat_test.cc
@@ -0,0 +1,214 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+
+#include "table/block_based/index_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string IKey(const std::string& user_key, uint64_t seq,
+                        ValueType vt) {
+  std::string encoded;
+  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+  return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+  std::string result = s;
+  ShortenedIndexBuilder::FindShortestInternalKeySeparator(*BytewiseComparator(),
+                                                          &result, l);
+  return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+  std::string result = s;
+  ShortenedIndexBuilder::FindShortInternalKeySuccessor(*BytewiseComparator(),
+                                                       &result);
+  return result;
+}
+
+static void TestKey(const std::string& key, uint64_t seq, ValueType vt) {
+  std::string encoded = IKey(key, seq, vt);
+
+  Slice in(encoded);
+  ParsedInternalKey decoded("", 0, kTypeValue);
+
+  ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */));
+  ASSERT_EQ(key, decoded.user_key.ToString());
+  ASSERT_EQ(seq, decoded.sequence);
+  ASSERT_EQ(vt, decoded.type);
+
+  ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded, true /* log_err_key */));
+}
+
+class FormatTest : public testing::Test {};
+
+TEST_F(FormatTest, InternalKey_EncodeDecode) {
+  const char* keys[] = {"", "k", "hello", "longggggggggggggggggggggg"};
+  const uint64_t seq[] = {1,
+                          2,
+                          3,
+                          (1ull << 8) - 1,
+                          1ull << 8,
+                          (1ull << 8) + 1,
+                          (1ull << 16) - 1,
+                          1ull << 16,
+                          (1ull << 16) + 1,
+                          (1ull << 32) - 1,
+                          1ull << 32,
+                          (1ull << 32) + 1};
+  for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+    for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+      TestKey(keys[k], seq[s], kTypeValue);
+      TestKey("hello", 1, kTypeDeletion);
+    }
+  }
+}
+
+TEST_F(FormatTest, InternalKeyShortSeparator) {
+  // When user keys are same
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 99, kTypeValue)));
+  ASSERT_EQ(
+      IKey("foo", 100, kTypeValue),
+      Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 101, kTypeValue)));
+  ASSERT_EQ(
+      IKey("foo", 100, kTypeValue),
+      Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(
+      IKey("foo", 100, kTypeValue),
+      Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeDeletion)));
+
+  // When user keys are misordered
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue), IKey("bar", 99, kTypeValue)));
+
+  // When user keys are different, but correctly ordered
+  ASSERT_EQ(
+      IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+      Shorten(IKey("foo", 100, kTypeValue), IKey("hello", 200, kTypeValue)));
+
+  ASSERT_EQ(IKey("ABC2", kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("ABC1AAAAA", 100, kTypeValue),
+                    IKey("ABC2ABB", 200, kTypeValue)));
+
+  ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("AAA1AAA", 100, kTypeValue),
+                    IKey("AAA2AA", 200, kTypeValue)));
+
+  ASSERT_EQ(
+      IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+      Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA4", 200, kTypeValue)));
+
+  ASSERT_EQ(
+      IKey("AAA1B", kMaxSequenceNumber, kValueTypeForSeek),
+      Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+  ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("AAA1AAA", 100, kTypeValue),
+                    IKey("AAA2A", 200, kTypeValue)));
+
+  ASSERT_EQ(
+      IKey("AAA1", 100, kTypeValue),
+      Shorten(IKey("AAA1", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+  // When start user key is prefix of limit user key
+  ASSERT_EQ(
+      IKey("foo", 100, kTypeValue),
+      Shorten(IKey("foo", 100, kTypeValue), IKey("foobar", 200, kTypeValue)));
+
+  // When limit user key is prefix of start user key
+  ASSERT_EQ(
+      IKey("foobar", 100, kTypeValue),
+      Shorten(IKey("foobar", 100, kTypeValue), IKey("foo", 200, kTypeValue)));
+}
+
+TEST_F(FormatTest, InternalKeyShortestSuccessor) {
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            ShortSuccessor(IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+TEST_F(FormatTest, IterKeyOperation) {
+  IterKey k;
+  const char p[] = "abcdefghijklmnopqrstuvwxyz";
+  const char q[] = "0123456789";
+
+  ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+            std::string(""));
+
+  k.TrimAppend(0, p, 3);
+  ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+            std::string("abc"));
+
+  k.TrimAppend(1, p, 3);
+  ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+            std::string("aabc"));
+
+  k.TrimAppend(0, p, 26);
+  ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz"));
+
+  k.TrimAppend(26, q, 10);
+  ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz0123456789"));
+
+  k.TrimAppend(36, q, 1);
+  ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz01234567890"));
+
+  k.TrimAppend(26, q, 1);
+  ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz0"));
+
+  // Size going up, memory allocation is triggered
+  k.TrimAppend(27, p, 26);
+  ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+            std::string("abcdefghijklmnopqrstuvwxyz0"
+                        "abcdefghijklmnopqrstuvwxyz"));
+}
+
+TEST_F(FormatTest, UpdateInternalKey) {
+  std::string user_key("abcdefghijklmnopqrstuvwxyz");
+  uint64_t new_seq = 0x123456;
+  ValueType new_val_type = kTypeDeletion;
+
+  std::string ikey;
+  AppendInternalKey(&ikey, ParsedInternalKey(user_key, 100U, kTypeValue));
+  size_t ikey_size = ikey.size();
+  UpdateInternalKey(&ikey, new_seq, new_val_type);
+  ASSERT_EQ(ikey_size, ikey.size());
+
+  Slice in(ikey);
+  ParsedInternalKey decoded;
+  ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */));
+  ASSERT_EQ(user_key, decoded.user_key.ToString());
+  ASSERT_EQ(new_seq, decoded.sequence);
+  ASSERT_EQ(new_val_type, decoded.type);
+}
+
+TEST_F(FormatTest, RangeTombstoneSerializeEndKey) {
+  RangeTombstone t("a", "b", 2);
+  InternalKey k("b", 3, kTypeValue);
+  const InternalKeyComparator cmp(BytewiseComparator());
+  ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/deletefile_test.cc b/src/rocksdb/db/deletefile_test.cc
new file mode 100644
index 000000000..34925e828
--- /dev/null
+++ b/src/rocksdb/db/deletefile_test.cc
@@ -0,0 +1,614 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteFileTest : public DBTestBase {
+ public:
+  const int numlevels_;
+  const std::string wal_dir_;
+
+  DeleteFileTest()
+      : DBTestBase("deletefile_test", /*env_do_fsync=*/true),
+        numlevels_(7),
+        wal_dir_(dbname_ + "/wal_files") {}
+
+  void SetOptions(Options* options) {
+    ASSERT_NE(options, nullptr);
+    options->delete_obsolete_files_period_micros = 0;  // always do full purge
+    options->enable_thread_tracking = true;
+    options->write_buffer_size = 1024 * 1024 * 1000;
+    options->target_file_size_base = 1024 * 1024 * 1000;
+    options->max_bytes_for_level_base = 1024 * 1024 * 1000;
+    options->WAL_ttl_seconds = 300;     // Used to test log files
+    options->WAL_size_limit_MB = 1024;  // Used to test log files
+    options->wal_dir = wal_dir_;
+  }
+
+  void AddKeys(int numkeys, int startkey = 0) {
+    WriteOptions options;
+    options.sync = false;
+    ReadOptions roptions;
+    for (int i = startkey; i < (numkeys + startkey); i++) {
+      std::string temp = std::to_string(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  int numKeysInLevels(std::vector<LiveFileMetaData>& metadata,
+                      std::vector<int>* keysperlevel = nullptr) {
+    if (keysperlevel != nullptr) {
+      keysperlevel->resize(numlevels_);
+    }
+
+    int numKeys = 0;
+    for (size_t i = 0; i < metadata.size(); i++) {
+      int startkey = atoi(metadata[i].smallestkey.c_str());
+      int endkey = atoi(metadata[i].largestkey.c_str());
+      int numkeysinfile = (endkey - startkey + 1);
+      numKeys += numkeysinfile;
+      if (keysperlevel != nullptr) {
+        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+      }
+      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+              metadata[i].level, metadata[i].name.c_str(),
+              metadata[i].smallestkey.c_str(), metadata[i].largestkey.c_str());
+    }
+    return numKeys;
+  }
+
+  void CreateTwoLevels() {
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    for (int i = 0; i < 2; ++i) {
+      ASSERT_OK(dbfull()->TEST_CompactRange(i, nullptr, nullptr));
+    }
+
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  }
+
+  void CheckFileTypeCounts(const std::string& dir, int required_log,
+                           int required_sst, int required_manifest) {
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kWalFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    if (required_log >= 0) {
+      ASSERT_EQ(required_log, log_cnt);
+    }
+    if (required_sst >= 0) {
+      ASSERT_EQ(required_sst, sst_cnt);
+    }
+    if (required_manifest >= 0) {
+      ASSERT_EQ(required_manifest, manifest_cnt);
+    }
+  }
+
+  static void DoSleep(void* arg) {
+    auto test = reinterpret_cast<DeleteFileTest*>(arg);
+    test->env_->SleepForMicroseconds(2 * 1000 * 1000);
+  }
+
+  // An empty job to guard all jobs are processed
+  static void GuardFinish(void* /*arg*/) {
+    TEST_SYNC_POINT("DeleteFileTest::GuardFinish");
+  }
+};
+
+TEST_F(DeleteFileTest, AddKeysAndQueryLevels) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  CreateTwoLevels();
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level1file = "";
+  int level1keycount = 0;
+  std::string level2file = "";
+  int level2keycount = 0;
+  int level1index = 0;
+  int level2index = 1;
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 2) {
+    level1index = 1;
+    level2index = 0;
+  }
+
+  level1file = metadata[level1index].name;
+  int startkey = atoi(metadata[level1index].smallestkey.c_str());
+  int endkey = atoi(metadata[level1index].largestkey.c_str());
+  level1keycount = (endkey - startkey + 1);
+  level2file = metadata[level2index].name;
+  startkey = atoi(metadata[level2index].smallestkey.c_str());
+  endkey = atoi(metadata[level2index].largestkey.c_str());
+  level2keycount = (endkey - startkey + 1);
+
+  // COntrolled setup. Levels 1 and 2 should both have 50K files.
+  // This is a little fragile as it depends on the current
+  // compaction heuristics.
+  ASSERT_EQ(level1keycount, 50000);
+  ASSERT_EQ(level2keycount, 50000);
+
+  Status status = db_->DeleteFile("0.sst");
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // intermediate level files cannot be deleted.
+  status = db_->DeleteFile(level1file);
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // Lowest level file deletion should succeed.
+  status = db_->DeleteFile(level2file);
+  ASSERT_OK(status);
+}
+
+TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  CreateTwoLevels();
+  // there should be only one (empty) log file because CreateTwoLevels()
+  // flushes the memtables to disk
+  CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+  // 2 ssts, 1 manifest
+  CheckFileTypeCounts(dbname_, 0, 2, 1);
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  // 1 sst after compaction
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  // this time, we keep an iterator alive
+  Reopen(options);
+  Iterator* itr = nullptr;
+  CreateTwoLevels();
+  itr = db_->NewIterator(ReadOptions());
+  ASSERT_OK(itr->status());
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  ASSERT_OK(itr->status());
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+
+  // We keep an iterator alive
+  Iterator* itr = nullptr;
+  CreateTwoLevels();
+  ReadOptions read_options;
+  read_options.background_purge_on_iterator_cleanup = true;
+  itr = db_->NewIterator(read_options);
+  ASSERT_OK(itr->status());
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  test::SleepingBackgroundTask sleeping_task_before;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_before, Env::Priority::HIGH);
+  delete itr;
+  test::SleepingBackgroundTask sleeping_task_after;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure no purges are executed foreground
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  sleeping_task_before.WakeUp();
+  sleeping_task_before.WaitUntilDone();
+
+  // Make sure all background purges are executed
+  sleeping_task_after.WakeUp();
+  sleeping_task_after.WaitUntilDone();
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, PurgeDuringOpen) {
+  Options options = CurrentOptions();
+  CheckFileTypeCounts(dbname_, -1, 0, -1);
+  Close();
+  std::unique_ptr<WritableFile> file;
+  ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
+                                         EnvOptions()));
+  ASSERT_OK(file->Close());
+  CheckFileTypeCounts(dbname_, -1, 1, -1);
+  options.avoid_unnecessary_blocking_io = false;
+  options.create_if_missing = false;
+  Reopen(options);
+  CheckFileTypeCounts(dbname_, -1, 0, -1);
+  Close();
+
+  // test background purge
+  options.avoid_unnecessary_blocking_io = true;
+  options.create_if_missing = false;
+  ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
+                                         EnvOptions()));
+  ASSERT_OK(file->Close());
+  CheckFileTypeCounts(dbname_, -1, 1, -1);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DeleteFileTest::PurgeDuringOpen:1", "DBImpl::BGWorkPurge:start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  Reopen(options);
+  // the obsolete file is not deleted until the background purge job is ran
+  CheckFileTypeCounts(dbname_, -1, 1, -1);
+  TEST_SYNC_POINT("DeleteFileTest::PurgeDuringOpen:1");
+  ASSERT_OK(dbfull()->TEST_WaitForPurge());
+  CheckFileTypeCounts(dbname_, -1, 0, -1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  auto do_test = [&](bool bg_purge) {
+    ColumnFamilyOptions co;
+    co.max_write_buffer_size_to_maintain =
+        static_cast<int64_t>(co.write_buffer_size);
+    WriteOptions wo;
+    FlushOptions fo;
+    ColumnFamilyHandle* cfh = nullptr;
+
+    ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh));
+
+    ASSERT_OK(db_->Put(wo, cfh, "pika", "chu"));
+    ASSERT_OK(db_->Flush(fo, cfh));
+    // Expect 1 sst file.
+    CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+    ASSERT_OK(db_->DropColumnFamily(cfh));
+    // Still 1 file, it won't be deleted while ColumnFamilyHandle is alive.
+    CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+    delete cfh;
+    test::SleepingBackgroundTask sleeping_task_after;
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_task_after, Env::Priority::HIGH);
+    // If background purge is enabled, the file should still be there.
+    CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1);
+    TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1");
+
+    // Execute background purges.
+    sleeping_task_after.WakeUp();
+    sleeping_task_after.WaitUntilDone();
+    // The file should have been deleted.
+    CheckFileTypeCounts(dbname_, 0, 0, 1);
+  };
+
+  {
+    SCOPED_TRACE("avoid_unnecessary_blocking_io = false");
+    do_test(false);
+  }
+
+  options.avoid_unnecessary_blocking_io = true;
+  options.create_if_missing = false;
+  Reopen(options);
+  ASSERT_OK(dbfull()->TEST_WaitForPurge());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DeleteFileTest::BackgroundPurgeCFDropTest:1",
+        "DBImpl::BGWorkPurge:start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  {
+    SCOPED_TRACE("avoid_unnecessary_blocking_io = true");
+    do_test(true);
+  }
+}
+
+// This test is to reproduce a bug that read invalid ReadOption in iterator
+// cleanup function
+TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+
+  // We keep an iterator alive
+  Iterator* itr = nullptr;
+  CreateTwoLevels();
+  {
+    ReadOptions read_options;
+    read_options.background_purge_on_iterator_cleanup = true;
+    itr = db_->NewIterator(read_options);
+    ASSERT_OK(itr->status());
+    // ReadOptions is deleted, but iterator cleanup function should not be
+    // affected
+  }
+
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+
+  test::SleepingBackgroundTask sleeping_task_after;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure all background purges are executed
+  sleeping_task_after.WakeUp();
+  sleeping_task_after.WaitUntilDone();
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  Slice first_slice(first), last_slice(last);
+
+  // We keep an iterator alive
+  CreateTwoLevels();
+  ReadOptions read_options;
+  read_options.background_purge_on_iterator_cleanup = true;
+  Iterator* itr1 = db_->NewIterator(read_options);
+  ASSERT_OK(itr1->status());
+  CreateTwoLevels();
+  Iterator* itr2 = db_->NewIterator(read_options);
+  ASSERT_OK(itr2->status());
+  ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+  // 5 sst files after 2 compactions with 2 live iterators
+  CheckFileTypeCounts(dbname_, 0, 5, 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  // ~DBImpl should wait until all BGWorkPurge are finished
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::~DBImpl:WaitJob", "DBImpl::BGWorkPurge"},
+       {"DeleteFileTest::GuardFinish",
+        "DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  delete itr1;
+  env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH);
+  delete itr2;
+  env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH);
+  Close();
+
+  TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose");
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, DeleteFileWithIterator) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  CreateTwoLevels();
+  ReadOptions read_options;
+  Iterator* it = db_->NewIterator(read_options);
+  ASSERT_OK(it->status());
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level2file;
+
+  ASSERT_EQ(metadata.size(), static_cast<size_t>(2));
+  if (metadata[0].level == 1) {
+    level2file = metadata[1].name;
+  } else {
+    level2file = metadata[0].name;
+  }
+
+  Status status = db_->DeleteFile(level2file);
+  fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(),
+          status.ToString().c_str());
+  ASSERT_OK(status);
+  it->SeekToFirst();
+  int numKeysIterated = 0;
+  while (it->Valid()) {
+    numKeysIterated++;
+    it->Next();
+  }
+  ASSERT_EQ(numKeysIterated, 50000);
+  delete it;
+}
+
+TEST_F(DeleteFileTest, DeleteLogFiles) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+
+  AddKeys(10, 0);
+  VectorLogPtr logfiles;
+  ASSERT_OK(db_->GetSortedWalFiles(logfiles));
+  ASSERT_GT(logfiles.size(), 0UL);
+  // Take the last log file which is expected to be alive and try to delete it
+  // Should not succeed because live logs are not allowed to be deleted
+  std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+  ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+  ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+  fprintf(stdout, "Deleting alive log file %s\n",
+          alive_log->PathName().c_str());
+  ASSERT_NOK(db_->DeleteFile(alive_log->PathName()));
+  ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+  logfiles.clear();
+
+  // Call Flush to bring about a new working log file and add more keys
+  // Call Flush again to flush out memtable and move alive log to archived log
+  // and try to delete the archived log file
+  FlushOptions fopts;
+  ASSERT_OK(db_->Flush(fopts));
+  AddKeys(10, 0);
+  ASSERT_OK(db_->Flush(fopts));
+  ASSERT_OK(db_->GetSortedWalFiles(logfiles));
+  ASSERT_GT(logfiles.size(), 0UL);
+  std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
+  ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
+  ASSERT_OK(env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+  fprintf(stdout, "Deleting archived log file %s\n",
+          archived_log->PathName().c_str());
+  ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
+  ASSERT_TRUE(
+      env_->FileExists(wal_dir_ + "/" + archived_log->PathName()).IsNotFound());
+}
+
+TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) {
+  Options options = CurrentOptions();
+  SetOptions(&options);
+  Destroy(options);
+  options.create_if_missing = true;
+  Reopen(options);
+  CreateAndReopenWithCF({"new_cf"}, options);
+
+  Random rnd(5);
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+                       test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+                       test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(2U, metadata.size());
+  ASSERT_EQ("new_cf", metadata[0].column_family_name);
+  ASSERT_EQ("new_cf", metadata[1].column_family_name);
+  auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno
+                      ? metadata[0].name
+                      : metadata[1].name;
+  auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno
+                      ? metadata[0].name
+                      : metadata[1].name;
+  ASSERT_TRUE(db_->DeleteFile(new_file).IsInvalidArgument());
+  ASSERT_OK(db_->DeleteFile(old_file));
+
+  {
+    std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+    ASSERT_OK(itr->status());
+    int count = 0;
+    for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+      ASSERT_OK(itr->status());
+      ++count;
+    }
+    ASSERT_EQ(count, 1000);
+  }
+
+  Close();
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "new_cf"}, options);
+
+  {
+    std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+    int count = 0;
+    for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+      ASSERT_OK(itr->status());
+      ++count;
+    }
+    ASSERT_EQ(count, 1000);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/error_handler.cc b/src/rocksdb/db/error_handler.cc
new file mode 100644
index 000000000..7f68bb026
--- /dev/null
+++ b/src/rocksdb/db/error_handler.cc
@@ -0,0 +1,819 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/error_handler.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/lang.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maps to help decide the severity of an error based on the
+// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
+// is set or not. There are 3 maps, going from most specific to least specific
+// (i.e from all 4 fields in a tuple to only the BackgroundErrorReason and
+// paranoid_checks). The less specific map serves as a catch all in case we miss
+// a specific error code or subcode.
+std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>,
+         Status::Severity>
+    ErrorSeverityMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kSoftError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kNoSpace, true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kNoSpace, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kSpaceLimit, true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kIOFenced, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         Status::SubCode::kIOFenced, false),
+         Status::Severity::kFatalError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+        // Errors during MANIFEST write
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+        // Errors during BG flush with WAL disabled
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+        // Errors during MANIFEST write when WAL is disabled
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         true),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kNoSpace,
+                         false),
+         Status::Severity::kHardError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, Status::SubCode::kIOFenced,
+                         false),
+         Status::Severity::kFatalError},
+
+};
+
+std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
+         Status::Severity>
+    DefaultErrorSeverityMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction,
+                         Status::Code::kIOError, false),
+         Status::Severity::kNoError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kFlush,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+                         false),
+         Status::Severity::kNoError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+                         Status::Code::kIOError, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+                         Status::Code::kIOError, false),
+         Status::Severity::kFatalError},
+        // Errors during BG flush with WAL disabled
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kCorruption, true),
+         Status::Severity::kUnrecoverableError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kCorruption, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+                         Status::Code::kIOError, false),
+         Status::Severity::kNoError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+                         Status::Code::kIOError, false),
+         Status::Severity::kFatalError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
+    DefaultReasonMap = {
+        // Errors during BG compaction
+        {std::make_tuple(BackgroundErrorReason::kCompaction, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kCompaction, false),
+         Status::Severity::kNoError},
+        // Errors during BG flush
+        {std::make_tuple(BackgroundErrorReason::kFlush, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kFlush, false),
+         Status::Severity::kNoError},
+        // Errors during Write
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
+         Status::Severity::kFatalError},
+        // Errors during Memtable update
+        {std::make_tuple(BackgroundErrorReason::kMemTable, true),
+         Status::Severity::kFatalError},
+        {std::make_tuple(BackgroundErrorReason::kMemTable, false),
+         Status::Severity::kFatalError},
+};
+
+void ErrorHandler::CancelErrorRecovery() {
+#ifndef ROCKSDB_LITE
+  db_mutex_->AssertHeld();
+
+  // We'll release the lock before calling sfm, so make sure no new
+  // recovery gets scheduled at that point
+  auto_recovery_ = false;
+  SstFileManagerImpl* sfm =
+      reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+  if (sfm) {
+    // This may or may not cancel a pending recovery
+    db_mutex_->Unlock();
+    bool cancelled = sfm->CancelErrorRecovery(this);
+    db_mutex_->Lock();
+    if (cancelled) {
+      recovery_in_prog_ = false;
+    }
+  }
+
+  // If auto recovery is also runing to resume from the retryable error,
+  // we should wait and end the auto recovery.
+  EndAutoRecovery();
+#endif
+}
+
+STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()};
+
+// This is the main function for looking at an error during a background
+// operation and deciding the severity, and error recovery strategy. The high
+// level algorithm is as follows -
+// 1. Classify the severity of the error based on the ErrorSeverityMap,
+//    DefaultErrorSeverityMap and DefaultReasonMap defined earlier
+// 2. Call a Status code specific override function to adjust the severity
+//    if needed. The reason for this is our ability to recover may depend on
+//    the exact options enabled in DBOptions
+// 3. Determine if auto recovery is possible. A listener notification callback
+//    is called, which can disable the auto recovery even if we decide its
+//    feasible
+// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
+//    the actual recovery. If no sst file manager is specified in DBOptions,
+//    a default one is allocated during DB::Open(), so there will always be
+//    one.
+// This can also get called as part of a recovery operation. In that case, we
+// also track the error separately in recovery_error_ so we can tell in the
+// end whether recovery succeeded or not
+const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
+                                              BackgroundErrorReason reason) {
+  db_mutex_->AssertHeld();
+  if (bg_err.ok()) {
+    return kOkStatus;
+  }
+
+  if (bg_error_stats_ != nullptr) {
+    RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+  }
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "ErrorHandler: Set regular background error\n");
+
+  bool paranoid = db_options_.paranoid_checks;
+  Status::Severity sev = Status::Severity::kFatalError;
+  Status new_bg_err;
+  DBRecoverContext context;
+  bool found = false;
+
+  {
+    auto entry = ErrorSeverityMap.find(
+        std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid));
+    if (entry != ErrorSeverityMap.end()) {
+      sev = entry->second;
+      found = true;
+    }
+  }
+
+  if (!found) {
+    auto entry = DefaultErrorSeverityMap.find(
+        std::make_tuple(reason, bg_err.code(), paranoid));
+    if (entry != DefaultErrorSeverityMap.end()) {
+      sev = entry->second;
+      found = true;
+    }
+  }
+
+  if (!found) {
+    auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
+    if (entry != DefaultReasonMap.end()) {
+      sev = entry->second;
+    }
+  }
+
+  new_bg_err = Status(bg_err, sev);
+
+  // Check if recovery is currently in progress. If it is, we will save this
+  // error so we can check it at the end to see if recovery succeeded or not
+  if (recovery_in_prog_ && recovery_error_.ok()) {
+    recovery_error_ = new_bg_err;
+  }
+
+  bool auto_recovery = auto_recovery_;
+  if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
+    auto_recovery = false;
+  }
+
+  // Allow some error specific overrides
+  if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
+      new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
+    new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
+  }
+
+  if (!new_bg_err.ok()) {
+    Status s = new_bg_err;
+    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
+                                          db_mutex_, &auto_recovery);
+    if (!s.ok() && (s.severity() > bg_error_.severity())) {
+      bg_error_ = s;
+    } else {
+      // This error is less severe than previously encountered error. Don't
+      // take any further action
+      return bg_error_;
+    }
+  }
+
+  recover_context_ = context;
+  if (auto_recovery) {
+    recovery_in_prog_ = true;
+
+    // Kick-off error specific recovery
+    if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
+        new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
+      RecoverFromNoSpace();
+    }
+  }
+  if (bg_error_.severity() >= Status::Severity::kHardError) {
+    is_db_stopped_.store(true, std::memory_order_release);
+  }
+  return bg_error_;
+}
+
+// This is the main function for looking at IO related error during the
+// background operations. The main logic is:
+// 1) File scope IO error is treated as retryable IO error in the write
+//    path. In RocksDB, If a file has write IO error and it is at file scope,
+//    RocksDB never write to the same file again. RocksDB will create a new
+//    file and rewrite the whole content. Thus, it is retryable.
+// 1) if the error is caused by data loss, the error is mapped to
+//    unrecoverable error. Application/user must take action to handle
+//    this situation (File scope case is excluded).
+// 2) if the error is a Retryable IO error (i.e., it is a file scope IO error,
+//     or its retryable flag is set and not a data loss error), auto resume
+//     will be called and the auto resume can be controlled by resume count
+//     and resume interval options. There are three sub-cases:
+//    a) if the error happens during compaction, it is mapped to a soft error.
+//       the compaction thread will reschedule a new compaction.
+//    b) if the error happens during flush and also WAL is empty, it is mapped
+//       to a soft error. Note that, it includes the case that IO error happens
+//       in SST or manifest write during flush.
+//    c) all other errors are mapped to hard error.
+// 3) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason
+//    reason) will be called to handle other error cases.
+const Status& ErrorHandler::SetBGError(const Status& bg_status,
+                                       BackgroundErrorReason reason) {
+  db_mutex_->AssertHeld();
+  Status tmp_status = bg_status;
+  IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
+
+  if (bg_io_err.ok()) {
+    return kOkStatus;
+  }
+  ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
+                 bg_io_err.ToString().c_str());
+
+  if (recovery_in_prog_ && recovery_io_error_.ok()) {
+    recovery_io_error_ = bg_io_err;
+  }
+  if (BackgroundErrorReason::kManifestWrite == reason ||
+      BackgroundErrorReason::kManifestWriteNoWAL == reason) {
+    // Always returns ok
+    ROCKS_LOG_INFO(db_options_.info_log, "Disabling File Deletions");
+    db_->DisableFileDeletionsWithLock().PermitUncheckedError();
+  }
+
+  Status new_bg_io_err = bg_io_err;
+  DBRecoverContext context;
+  if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile &&
+      bg_io_err.GetDataLoss()) {
+    // First, data loss (non file scope) is treated as unrecoverable error. So
+    // it can directly overwrite any existing bg_error_.
+    bool auto_recovery = false;
+    Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
+    CheckAndSetRecoveryAndBGError(bg_err);
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+    }
+    ROCKS_LOG_INFO(
+        db_options_.info_log,
+        "ErrorHandler: Set background IO error as unrecoverable error\n");
+    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
+                                          &bg_err, db_mutex_, &auto_recovery);
+    recover_context_ = context;
+    return bg_error_;
+  } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
+             (bg_io_err.GetScope() ==
+                  IOStatus::IOErrorScope::kIOErrorScopeFile ||
+              bg_io_err.GetRetryable())) {
+    // Second, check if the error is a retryable IO error (file scope IO error
+    // is also treated as retryable IO error in RocksDB write path). if it is
+    // retryable error and its severity is higher than bg_error_, overwrite the
+    // bg_error_ with new error. In current stage, for retryable IO error of
+    // compaction, treat it as soft error. In other cases, treat the retryable
+    // IO error as hard error. Note that, all the NoSpace error should be
+    // handled by the SstFileManager::StartErrorRecovery(). Therefore, no matter
+    // it is retryable or file scope, this logic will be bypassed.
+    bool auto_recovery = false;
+    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
+                                          &new_bg_io_err, db_mutex_,
+                                          &auto_recovery);
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+      RecordTick(bg_error_stats_.get(),
+                 ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT);
+    }
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "ErrorHandler: Set background retryable IO error\n");
+    if (BackgroundErrorReason::kCompaction == reason) {
+      // We map the retryable IO error during compaction to soft error. Since
+      // compaction can reschedule by itself. We will not set the BG error in
+      // this case
+      // TODO:  a better way to set or clean the retryable IO error which
+      // happens during compaction SST file write.
+      if (bg_error_stats_ != nullptr) {
+        RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+      }
+      ROCKS_LOG_INFO(
+          db_options_.info_log,
+          "ErrorHandler: Compaction will schedule by itself to resume\n");
+      return bg_error_;
+    } else if (BackgroundErrorReason::kFlushNoWAL == reason ||
+               BackgroundErrorReason::kManifestWriteNoWAL == reason) {
+      // When the BG Retryable IO error reason is flush without WAL,
+      // We map it to a soft error. At the same time, all the background work
+      // should be stopped except the BG work from recovery. Therefore, we
+      // set the soft_error_no_bg_work_ to true. At the same time, since DB
+      // continues to receive writes when BG error is soft error, to avoid
+      // to many small memtable being generated during auto resume, the flush
+      // reason is set to kErrorRecoveryRetryFlush.
+      Status bg_err(new_bg_io_err, Status::Severity::kSoftError);
+      CheckAndSetRecoveryAndBGError(bg_err);
+      soft_error_no_bg_work_ = true;
+      context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
+      recover_context_ = context;
+      return StartRecoverFromRetryableBGIOError(bg_io_err);
+    } else {
+      Status bg_err(new_bg_io_err, Status::Severity::kHardError);
+      CheckAndSetRecoveryAndBGError(bg_err);
+      recover_context_ = context;
+      return StartRecoverFromRetryableBGIOError(bg_io_err);
+    }
+  } else {
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+    }
+    // HandleKnownErrors() will use recovery_error_, so ignore
+    // recovery_io_error_.
+    // TODO: Do some refactoring and use only one recovery_error_
+    recovery_io_error_.PermitUncheckedError();
+    return HandleKnownErrors(new_bg_io_err, reason);
+  }
+}
+
+Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error,
+                                          bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+  if (bg_error.severity() >= Status::Severity::kFatalError) {
+    return bg_error;
+  }
+
+  if (db_options_.sst_file_manager.get() == nullptr) {
+    // We rely on SFM to poll for enough disk space and recover
+    *auto_recovery = false;
+    return bg_error;
+  }
+
+  if (db_options_.allow_2pc &&
+      (bg_error.severity() <= Status::Severity::kSoftError)) {
+    // Don't know how to recover, as the contents of the current WAL file may
+    // be inconsistent, and it may be needed for 2PC. If 2PC is not enabled,
+    // we can just flush the memtable and discard the log
+    *auto_recovery = false;
+    return Status(bg_error, Status::Severity::kFatalError);
+  }
+
+  {
+    uint64_t free_space;
+    if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
+                                      &free_space) == Status::NotSupported()) {
+      *auto_recovery = false;
+    }
+  }
+
+  return bg_error;
+#else
+  (void)auto_recovery;
+  return Status(bg_error, Status::Severity::kFatalError);
+#endif
+}
+
+void ErrorHandler::RecoverFromNoSpace() {
+#ifndef ROCKSDB_LITE
+  SstFileManagerImpl* sfm =
+      reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+
+  // Inform SFM of the error, so it can kick-off the recovery
+  if (sfm) {
+    sfm->StartErrorRecovery(this, bg_error_);
+  }
+#endif
+}
+
+Status ErrorHandler::ClearBGError() {
+#ifndef ROCKSDB_LITE
+  db_mutex_->AssertHeld();
+
+  // Signal that recovery succeeded
+  if (recovery_error_.ok()) {
+    Status old_bg_error = bg_error_;
+    // old_bg_error is only for notifying listeners, so may not be checked
+    old_bg_error.PermitUncheckedError();
+    // Clear and check the recovery IO and BG error
+    bg_error_ = Status::OK();
+    recovery_io_error_ = IOStatus::OK();
+    bg_error_.PermitUncheckedError();
+    recovery_io_error_.PermitUncheckedError();
+    recovery_in_prog_ = false;
+    soft_error_no_bg_work_ = false;
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
+                                           bg_error_, db_mutex_);
+  }
+  return recovery_error_;
+#else
+  return bg_error_;
+#endif
+}
+
+Status ErrorHandler::RecoverFromBGError(bool is_manual) {
+#ifndef ROCKSDB_LITE
+  InstrumentedMutexLock l(db_mutex_);
+  bool no_bg_work_original_flag = soft_error_no_bg_work_;
+  if (is_manual) {
+    // If its a manual recovery and there's a background recovery in progress
+    // return busy status
+    if (recovery_in_prog_) {
+      return Status::Busy();
+    }
+    recovery_in_prog_ = true;
+
+    // In manual resume, we allow the bg work to run. If it is a auto resume,
+    // the bg work should follow this tag.
+    soft_error_no_bg_work_ = false;
+
+    // In manual resume, if the bg error is a soft error and also requires
+    // no bg work, the error must be recovered by call the flush with
+    // flush reason: kErrorRecoveryRetryFlush. In other case, the flush
+    // reason is set to kErrorRecovery.
+    if (no_bg_work_original_flag) {
+      recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
+    } else {
+      recover_context_.flush_reason = FlushReason::kErrorRecovery;
+    }
+  }
+
+  if (bg_error_.severity() == Status::Severity::kSoftError &&
+      recover_context_.flush_reason == FlushReason::kErrorRecovery) {
+    // Simply clear the background error and return
+    recovery_error_ = Status::OK();
+    return ClearBGError();
+  }
+
+  // Reset recovery_error_. We will use this to record any errors that happen
+  // during the recovery process. While recovering, the only operations that
+  // can generate background errors should be the flush operations
+  recovery_error_ = Status::OK();
+  recovery_error_.PermitUncheckedError();
+  Status s = db_->ResumeImpl(recover_context_);
+  if (s.ok()) {
+    soft_error_no_bg_work_ = false;
+  } else {
+    soft_error_no_bg_work_ = no_bg_work_original_flag;
+  }
+
+  // For manual recover, shutdown, and fatal error  cases, set
+  // recovery_in_prog_ to false. For automatic background recovery, leave it
+  // as is regardless of success or failure as it will be retried
+  if (is_manual || s.IsShutdownInProgress() ||
+      bg_error_.severity() >= Status::Severity::kFatalError) {
+    recovery_in_prog_ = false;
+  }
+  return s;
+#else
+  (void)is_manual;
+  return bg_error_;
+#endif
+}
+
+const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
+    const IOStatus& io_error) {
+#ifndef ROCKSDB_LITE
+  db_mutex_->AssertHeld();
+  if (bg_error_.ok()) {
+    return bg_error_;
+  } else if (io_error.ok()) {
+    return kOkStatus;
+  } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
+    // Auto resume BG error is not enabled, directly return bg_error_.
+    return bg_error_;
+  }
+  if (bg_error_stats_ != nullptr) {
+    RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+  }
+  ROCKS_LOG_INFO(
+      db_options_.info_log,
+      "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
+  if (recovery_thread_) {
+    // In this case, if recovery_in_prog_ is false, current thread should
+    // wait the previous recover thread to finish and create a new thread
+    // to recover from the bg error.
+    db_mutex_->Unlock();
+    recovery_thread_->join();
+    db_mutex_->Lock();
+  }
+
+  recovery_in_prog_ = true;
+  recovery_thread_.reset(
+      new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
+
+  if (recovery_io_error_.ok() && recovery_error_.ok()) {
+    return recovery_error_;
+  } else {
+    return bg_error_;
+  }
+#else
+  (void)io_error;
+  return bg_error_;
+#endif
+}
+
+// Automatic recover from Retryable BG IO error. Must be called after db
+// mutex is released.
+void ErrorHandler::RecoverFromRetryableBGIOError() {
+#ifndef ROCKSDB_LITE
+  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
+  InstrumentedMutexLock l(db_mutex_);
+  if (end_recovery_) {
+    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                           Status::ShutdownInProgress(),
+                                           db_mutex_);
+    return;
+  }
+  DBRecoverContext context = recover_context_;
+  int resume_count = db_options_.max_bgerror_resume_count;
+  uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
+  uint64_t retry_count = 0;
+  // Recover from the retryable error. Create a separate thread to do it.
+  while (resume_count > 0) {
+    if (end_recovery_) {
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             Status::ShutdownInProgress(),
+                                             db_mutex_);
+      return;
+    }
+    TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
+    TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
+    recovery_io_error_ = IOStatus::OK();
+    recovery_error_ = Status::OK();
+    retry_count++;
+    Status s = db_->ResumeImpl(context);
+    if (bg_error_stats_ != nullptr) {
+      RecordTick(bg_error_stats_.get(),
+                 ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
+    }
+    if (s.IsShutdownInProgress() ||
+        bg_error_.severity() >= Status::Severity::kFatalError) {
+      // If DB shutdown in progress or the error severity is higher than
+      // Hard Error, stop auto resume and returns.
+      recovery_in_prog_ = false;
+      if (bg_error_stats_ != nullptr) {
+        RecordInHistogram(bg_error_stats_.get(),
+                          ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+      }
+      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+                                             bg_error_, db_mutex_);
+      return;
+    }
+    if (!recovery_io_error_.ok() &&
+        recovery_error_.severity() <= Status::Severity::kHardError &&
+        recovery_io_error_.GetRetryable()) {
+      // If new BG IO error happens during auto recovery and it is retryable
+      // and its severity is Hard Error or lower, the auto resmue sleep for
+      // a period of time and redo auto resume if it is allowed.
+      TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0");
+      TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
+      int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
+      cv_.TimedWait(wait_until);
+    } else {
+      // There are three possibility: 1) recover_io_error is set during resume
+      // and the error is not retryable, 2) recover is successful, 3) other
+      // error happens during resume and cannot be resumed here.
+      if (recovery_io_error_.ok() && recovery_error_.ok() && s.ok()) {
+        // recover from the retryable IO error and no other BG errors. Clean
+        // the bg_error and notify user.
+        TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
+        Status old_bg_error = bg_error_;
+        is_db_stopped_.store(false, std::memory_order_release);
+        bg_error_ = Status::OK();
+        bg_error_.PermitUncheckedError();
+        EventHelpers::NotifyOnErrorRecoveryEnd(
+            db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
+        if (bg_error_stats_ != nullptr) {
+          RecordTick(bg_error_stats_.get(),
+                     ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
+          RecordInHistogram(bg_error_stats_.get(),
+                            ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+        }
+        recovery_in_prog_ = false;
+        if (soft_error_no_bg_work_) {
+          soft_error_no_bg_work_ = false;
+        }
+        return;
+      } else {
+        // In this case: 1) recovery_io_error is more serious or not retryable
+        // 2) other Non IO recovery_error happens. The auto recovery stops.
+        recovery_in_prog_ = false;
+        if (bg_error_stats_ != nullptr) {
+          RecordInHistogram(bg_error_stats_.get(),
+                            ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+        }
+        EventHelpers::NotifyOnErrorRecoveryEnd(
+            db_options_.listeners, bg_error_,
+            !recovery_io_error_.ok()
+                ? recovery_io_error_
+                : (!recovery_error_.ok() ? recovery_error_ : s),
+            db_mutex_);
+        return;
+      }
+    }
+    resume_count--;
+  }
+  recovery_in_prog_ = false;
+  EventHelpers::NotifyOnErrorRecoveryEnd(
+      db_options_.listeners, bg_error_,
+      Status::Aborted("Exceeded resume retry count"), db_mutex_);
+  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
+  if (bg_error_stats_ != nullptr) {
+    RecordInHistogram(bg_error_stats_.get(),
+                      ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+  }
+  return;
+#else
+  return;
+#endif
+}
+
+void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
+  if (recovery_in_prog_ && recovery_error_.ok()) {
+    recovery_error_ = bg_err;
+  }
+  if (bg_err.severity() > bg_error_.severity()) {
+    bg_error_ = bg_err;
+  }
+  if (bg_error_.severity() >= Status::Severity::kHardError) {
+    is_db_stopped_.store(true, std::memory_order_release);
+  }
+  return;
+}
+
+void ErrorHandler::EndAutoRecovery() {
+  db_mutex_->AssertHeld();
+  if (!end_recovery_) {
+    end_recovery_ = true;
+  }
+  cv_.SignalAll();
+  db_mutex_->Unlock();
+  if (recovery_thread_) {
+    recovery_thread_->join();
+  }
+  db_mutex_->Lock();
+  return;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler.h b/src/rocksdb/db/error_handler.h
new file mode 100644
index 000000000..34e08a525
--- /dev/null
+++ b/src/rocksdb/db/error_handler.h
@@ -0,0 +1,124 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// This structure is used to store the DB recovery context. The context is
+// the information that related to the recover actions. For example, it contains
+// FlushReason, which tells the flush job why this flush is called.
+struct DBRecoverContext {
+  FlushReason flush_reason;
+
+  DBRecoverContext() : flush_reason(FlushReason::kErrorRecovery) {}
+
+  DBRecoverContext(FlushReason reason) : flush_reason(reason) {}
+};
+
+class ErrorHandler {
+ public:
+  ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
+               InstrumentedMutex* db_mutex)
+      : db_(db),
+        db_options_(db_options),
+        cv_(db_mutex),
+        end_recovery_(false),
+        recovery_thread_(nullptr),
+        db_mutex_(db_mutex),
+        auto_recovery_(false),
+        recovery_in_prog_(false),
+        soft_error_no_bg_work_(false),
+        is_db_stopped_(false),
+        bg_error_stats_(db_options.statistics) {
+    // Clear the checked flag for uninitialized errors
+    bg_error_.PermitUncheckedError();
+    recovery_error_.PermitUncheckedError();
+    recovery_io_error_.PermitUncheckedError();
+  }
+
+  void EnableAutoRecovery() { auto_recovery_ = true; }
+
+  Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
+                                    Status::Code code, Status::SubCode subcode);
+
+  const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason);
+
+  Status GetBGError() const { return bg_error_; }
+
+  Status GetRecoveryError() const { return recovery_error_; }
+
+  Status ClearBGError();
+
+  bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); }
+
+  bool IsBGWorkStopped() {
+    assert(db_mutex_);
+    db_mutex_->AssertHeld();
+    return !bg_error_.ok() &&
+           (bg_error_.severity() >= Status::Severity::kHardError ||
+            !auto_recovery_ || soft_error_no_bg_work_);
+  }
+
+  bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; }
+
+  bool IsRecoveryInProgress() { return recovery_in_prog_; }
+
+  Status RecoverFromBGError(bool is_manual = false);
+  void CancelErrorRecovery();
+
+  void EndAutoRecovery();
+
+ private:
+  DBImpl* db_;
+  const ImmutableDBOptions& db_options_;
+  Status bg_error_;
+  // A separate Status variable used to record any errors during the
+  // recovery process from hard errors
+  Status recovery_error_;
+  // A separate IO Status variable used to record any IO errors during
+  // the recovery process. At the same time, recovery_error_ is also set.
+  IOStatus recovery_io_error_;
+  // The condition variable used with db_mutex during auto resume for time
+  // wait.
+  InstrumentedCondVar cv_;
+  bool end_recovery_;
+  std::unique_ptr<port::Thread> recovery_thread_;
+
+  InstrumentedMutex* db_mutex_;
+  // A flag indicating whether automatic recovery from errors is enabled
+  bool auto_recovery_;
+  bool recovery_in_prog_;
+  // A flag to indicate that for the soft error, we should not allow any
+  // background work except the work is from recovery.
+  bool soft_error_no_bg_work_;
+
+  // Used to store the context for recover, such as flush reason.
+  DBRecoverContext recover_context_;
+  std::atomic<bool> is_db_stopped_;
+
+  // The pointer of DB statistics.
+  std::shared_ptr<Statistics> bg_error_stats_;
+
+  const Status& HandleKnownErrors(const Status& bg_err,
+                                  BackgroundErrorReason reason);
+  Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery);
+  void RecoverFromNoSpace();
+  const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
+  void RecoverFromRetryableBGIOError();
+  // First, if it is in recovery and the recovery_error is ok. Set the
+  // recovery_error_ to bg_err. Second, if the severity is higher than the
+  // current bg_error_, overwrite it.
+  void CheckAndSetRecoveryAndBGError(const Status& bg_err);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler_fs_test.cc b/src/rocksdb/db/error_handler_fs_test.cc
new file mode 100644
index 000000000..153f3b79e
--- /dev/null
+++ b/src/rocksdb/db/error_handler_fs_test.cc
@@ -0,0 +1,2875 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/stack_trace.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/sst_file_manager.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBErrorHandlingFSTest : public DBTestBase {
+ public:
+  DBErrorHandlingFSTest()
+      : DBTestBase("db_error_handling_fs_test", /*env_do_fsync=*/true) {
+    fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem()));
+    fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_));
+  }
+
+  std::string GetManifestNameFromLiveFiles() {
+    std::vector<std::string> live_files;
+    uint64_t manifest_size;
+
+    Status s = dbfull()->GetLiveFiles(live_files, &manifest_size, false);
+    if (!s.ok()) {
+      return "";
+    }
+    for (auto& file : live_files) {
+      uint64_t num = 0;
+      FileType type;
+      if (ParseFileName(file, &num, &type) && type == kDescriptorFile) {
+        return file;
+      }
+    }
+    return "";
+  }
+
+  std::shared_ptr<FaultInjectionTestFS> fault_fs_;
+  std::unique_ptr<Env> fault_env_;
+};
+
+class ErrorHandlerFSListener : public EventListener {
+ public:
+  ErrorHandlerFSListener()
+      : mutex_(),
+        cv_(&mutex_),
+        no_auto_recovery_(false),
+        recovery_complete_(false),
+        file_creation_started_(false),
+        override_bg_error_(false),
+        file_count_(0),
+        fault_fs_(nullptr) {}
+  ~ErrorHandlerFSListener() {
+    file_creation_error_.PermitUncheckedError();
+    bg_error_.PermitUncheckedError();
+    new_bg_error_.PermitUncheckedError();
+  }
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /*ti*/) override {
+    InstrumentedMutexLock l(&mutex_);
+    file_creation_started_ = true;
+    if (file_count_ > 0) {
+      if (--file_count_ == 0) {
+        fault_fs_->SetFilesystemActive(false, file_creation_error_);
+        file_creation_error_ = IOStatus::OK();
+      }
+    }
+    cv_.SignalAll();
+  }
+
+  void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, Status bg_error,
+                            bool* auto_recovery) override {
+    bg_error.PermitUncheckedError();
+    if (*auto_recovery && no_auto_recovery_) {
+      *auto_recovery = false;
+    }
+  }
+
+  void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override {
+    InstrumentedMutexLock l(&mutex_);
+    recovery_complete_ = true;
+    cv_.SignalAll();
+    new_bg_error_ = info.new_bg_error;
+  }
+
+  bool WaitForRecovery(uint64_t /*abs_time_us*/) {
+    InstrumentedMutexLock l(&mutex_);
+    while (!recovery_complete_) {
+      cv_.Wait(/*abs_time_us*/);
+    }
+    if (recovery_complete_) {
+      recovery_complete_ = false;
+      return true;
+    }
+    return false;
+  }
+
+  void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) {
+    InstrumentedMutexLock l(&mutex_);
+    while (!file_creation_started_) {
+      cv_.Wait(/*abs_time_us*/);
+    }
+    file_creation_started_ = false;
+  }
+
+  void OnBackgroundError(BackgroundErrorReason /*reason*/,
+                         Status* bg_error) override {
+    if (override_bg_error_) {
+      *bg_error = bg_error_;
+      override_bg_error_ = false;
+    }
+  }
+
+  void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
+
+  void OverrideBGError(Status bg_err) {
+    bg_error_ = bg_err;
+    override_bg_error_ = true;
+  }
+
+  void InjectFileCreationError(FaultInjectionTestFS* fs, int file_count,
+                               IOStatus io_s) {
+    fault_fs_ = fs;
+    file_count_ = file_count;
+    file_creation_error_ = io_s;
+  }
+
+  Status new_bg_error() { return new_bg_error_; }
+
+ private:
+  InstrumentedMutex mutex_;
+  InstrumentedCondVar cv_;
+  bool no_auto_recovery_;
+  bool recovery_complete_;
+  bool file_creation_started_;
+  bool override_bg_error_;
+  int file_count_;
+  IOStatus file_creation_error_;
+  Status bg_error_;
+  Status new_bg_error_;
+  FaultInjectionTestFS* fault_fs_;
+};
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space"));
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  Destroy(options);
+}
+
+// All the NoSpace IOError will be handled as the regular BG Error no matter the
+// retryable flag is set of not. So the auto resume for retryable IO Error will
+// not be triggered. Also, it is mapped as hard error.
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::NoSpace("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+  Reopen(options);
+  ASSERT_EQ("val1", Get(Key(1)));
+
+  ASSERT_OK(Put(Key(2), "val2"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeSyncTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  ASSERT_OK(Put(Key(3), "val3"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeCloseTableFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val3", Get(Key(3)));
+
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+  error_msg.SetDataLoss(true);
+  error_msg.SetScope(
+      ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+  error_msg.SetRetryable(false);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val1", Get(Key(1)));
+
+  ASSERT_OK(Put(Key(2), "val2"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeSyncTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  ASSERT_OK(Put(Key(3), "val3"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeCloseTableFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val3", Get(Key(3)));
+
+  // not file scope, but retyrable set
+  error_msg.SetDataLoss(false);
+  error_msg.SetScope(
+      ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem);
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(3), "val3"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeCloseTableFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Reopen(options);
+  ASSERT_EQ("val3", Get(Key(3)));
+
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  listener->EnableAutoRecovery(false);
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncClosedLogs:Start",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = false;
+  ASSERT_OK(Put(Key(1), "val1", wo));
+
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  auto cfh = dbfull()->GetColumnFamilyHandle(1);
+  s = dbfull()->DropColumnFamily(cfh);
+
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_OK(Put(Key(3), "val3", wo));
+  ASSERT_EQ("val3", Get(Key(3)));
+  s = Flush();
+  ASSERT_OK(s);
+  ASSERT_EQ("val3", Get(Key(3)));
+
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  options.atomic_flush = true;
+  Status s;
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  listener->EnableAutoRecovery(false);
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncClosedLogs:Start",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = false;
+  ASSERT_OK(Put(Key(1), "val1", wo));
+
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  auto cfh = dbfull()->GetColumnFamilyHandle(1);
+  s = dbfull()->DropColumnFamily(cfh);
+
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_OK(Put(Key(3), "val3", wo));
+  ASSERT_EQ("val3", Get(Key(3)));
+  s = Flush();
+  ASSERT_OK(s);
+  ASSERT_EQ("val3", Get(Key(3)));
+
+  Destroy(options);
+}
+
+// The flush error is injected before we finish the table build
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = true;
+  ASSERT_OK(Put(Key(1), "val1", wo));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_OK(Put(Key(2), "val2", wo));
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  ASSERT_EQ("val2", Get(Key(2)));
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ("val2", Get(Key(2)));
+  ASSERT_OK(Put(Key(3), "val3", wo));
+  ASSERT_EQ("val3", Get(Key(3)));
+  s = Flush();
+  ASSERT_OK(s);
+  ASSERT_EQ("val3", Get(Key(3)));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+
+  Destroy(options);
+}
+
+// The retryable IO error is injected before we sync table
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = true;
+
+  ASSERT_OK(Put(Key(1), "val1", wo));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeSyncTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_OK(Put(Key(2), "val2", wo));
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  ASSERT_EQ("val2", Get(Key(2)));
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ("val2", Get(Key(2)));
+  ASSERT_OK(Put(Key(3), "val3", wo));
+  ASSERT_EQ("val3", Get(Key(3)));
+  s = Flush();
+  ASSERT_OK(s);
+  ASSERT_EQ("val3", Get(Key(3)));
+
+  Destroy(options);
+}
+
+// The retryable IO error is injected before we close the table file
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = true;
+
+  ASSERT_OK(Put(Key(1), "val1", wo));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeCloseTableFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_OK(Put(Key(2), "val2", wo));
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  ASSERT_EQ("val2", Get(Key(2)));
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ("val2", Get(Key(2)));
+  ASSERT_OK(Put(Key(3), "val3", wo));
+  ASSERT_EQ("val3", Get(Key(3)));
+  s = Flush();
+  ASSERT_OK(s);
+  ASSERT_EQ("val3", Get(Key(3)));
+
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), "val"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        fault_fs_->SetFilesystemActive(false,
+                                       IOStatus::NoSpace("Out of space"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), "val"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+  error_msg.SetDataLoss(true);
+  error_msg.SetScope(
+      ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+  error_msg.SetRetryable(false);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), "val"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = true;
+  ASSERT_OK(Put(Key(0), "val", wo));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), "val", wo));
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), "val"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        fault_fs_->SetFilesystemActive(false,
+                                       IOStatus::NoSpace("Out of space"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  fault_fs_->SetFilesystemActive(true);
+
+  // This Resume() will attempt to create a new manifest file and fail again
+  s = dbfull()->Resume();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // A successful Resume() will create a new manifest file
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) {
+  if (mem_env_ != nullptr) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+    return;
+  }
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+  std::atomic<bool> fail_manifest(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Put(Key(2), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      // Wait for flush of 2nd L0 file before starting compaction
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"},
+       // Wait for compaction to detect manifest write error
+       {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+       // Make compaction thread wait for error to be cleared
+       {"CompactionManifestWriteError:1",
+        "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+       // Wait for DB instance to clear bg_error before calling
+       // TEST_WaitForCompact
+       {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}});
+  // trigger manifest write failure in compaction thread
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        if (fail_manifest.load()) {
+          fault_fs_->SetFilesystemActive(false,
+                                         IOStatus::NoSpace("Out of space"));
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  // This Flush will trigger a compaction, which will fail when appending to
+  // the manifest
+  s = Flush();
+  ASSERT_OK(s);
+
+  TEST_SYNC_POINT("CompactionManifestWriteError:0");
+  // Clear all errors so when the compaction is retried, it will succeed
+  fault_fs_->SetFilesystemActive(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  TEST_SYNC_POINT("CompactionManifestWriteError:1");
+  TEST_SYNC_POINT("CompactionManifestWriteError:2");
+
+  s = dbfull()->TEST_WaitForCompact();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_OK(s);
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  ASSERT_EQ("val", Get(Key(2)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+  std::atomic<bool> fail_manifest(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Put(Key(2), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+  listener->EnableAutoRecovery(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      // Wait for flush of 2nd L0 file before starting compaction
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"},
+       // Wait for compaction to detect manifest write error
+       {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+       // Make compaction thread wait for error to be cleared
+       {"CompactionManifestWriteError:1",
+        "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}});
+  // trigger manifest write failure in compaction thread
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        if (fail_manifest.load()) {
+          fault_fs_->SetFilesystemActive(false, error_msg);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  TEST_SYNC_POINT("CompactionManifestWriteError:0");
+  TEST_SYNC_POINT("CompactionManifestWriteError:1");
+
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  ASSERT_EQ("val", Get(Key(2)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionWriteError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  Status s;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "va;"));
+  ASSERT_OK(Put(Key(2), "va;"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  listener->OverrideBGError(
+      Status(Status::NoSpace(), Status::Severity::kHardError));
+  listener->EnableAutoRecovery(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) {
+        fault_fs_->SetFilesystemActive(false,
+                                       IOStatus::NoSpace("Out of space"));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(0), "va;"));
+  ASSERT_OK(Put(Key(2), "va;"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+  listener->EnableAutoRecovery(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::OpenCompactionOutputFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Finish",
+      [&](void*) { CancelAllBackgroundWork(dbfull()); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  s = dbfull()->TEST_GetBGError();
+  ASSERT_OK(s);
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteFileScopeError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 0;
+  Status s;
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+  error_msg.SetDataLoss(true);
+  error_msg.SetScope(
+      ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+  error_msg.SetRetryable(false);
+
+  ASSERT_OK(Put(Key(0), "va;"));
+  ASSERT_OK(Put(Key(2), "va;"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+  listener->EnableAutoRecovery(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::OpenCompactionOutputFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Finish",
+      [&](void*) { CancelAllBackgroundWork(dbfull()); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  s = dbfull()->TEST_GetBGError();
+  ASSERT_OK(s);
+
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, CorruptionError) {
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  Status s;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "va;"));
+  ASSERT_OK(Put(Key(2), "va;"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) {
+        fault_fs_->SetFilesystemActive(false,
+                                       IOStatus::Corruption("Corruption"));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(),
+            ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_NOK(s);
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) {
+  if (mem_env_ != nullptr) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+    return;
+  }
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery();
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space"));
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+  s = Put(Key(1), "val");
+  ASSERT_OK(s);
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  Status s;
+
+  listener->EnableAutoRecovery();
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space"));
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  // We should be able to shutdown the database while auto recovery is going
+  // on in the background
+  Close();
+  DestroyDB(dbname_, options).PermitUncheckedError();
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteError) {
+  if (mem_env_ != nullptr) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+    return;
+  }
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 32768;
+  options.listeners.emplace_back(listener);
+  Status s;
+  Random rnd(301);
+
+  listener->EnableAutoRecovery();
+  DestroyAndReopen(options);
+
+  {
+    WriteBatch batch;
+
+    for (auto i = 0; i < 100; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  {
+    WriteBatch batch;
+    int write_error = 0;
+
+    for (auto i = 100; i < 199; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+          write_error++;
+          if (write_error > 2) {
+            fault_fs_->SetFilesystemActive(false,
+                                           IOStatus::NoSpace("Out of space"));
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    WriteOptions wopts;
+    wopts.sync = true;
+    s = dbfull()->Write(wopts, &batch);
+    ASSERT_EQ(s, s.NoSpace());
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  // `ClearAllCallBacks()` is needed in addition to `DisableProcessing()` to
+  // drain all callbacks. Otherwise, a pending callback in the background
+  // could re-disable `fault_fs_` after we enable it below.
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  for (auto i = 0; i < 199; ++i) {
+    if (i < 100) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+  Reopen(options);
+  for (auto i = 0; i < 199; ++i) {
+    if (i < 100) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 32768;
+  options.listeners.emplace_back(listener);
+  options.paranoid_checks = true;
+  options.max_bgerror_resume_count = 0;
+  Random rnd(301);
+
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  // For the first batch, write is successful, require sync
+  {
+    WriteBatch batch;
+
+    for (auto i = 0; i < 100; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  // For the second batch, the first 2 file Append are successful, then the
+  // following Append fails due to file system retryable IOError.
+  {
+    WriteBatch batch;
+    int write_error = 0;
+
+    for (auto i = 100; i < 200; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+          write_error++;
+          if (write_error > 2) {
+            fault_fs_->SetFilesystemActive(false, error_msg);
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    WriteOptions wopts;
+    wopts.sync = true;
+    Status s = dbfull()->Write(wopts, &batch);
+    ASSERT_TRUE(s.IsIOError());
+  }
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Data in corrupted WAL are not stored
+  for (auto i = 0; i < 199; ++i) {
+    if (i < 100) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+
+  // Resume and write a new batch, should be in the WAL
+  ASSERT_OK(dbfull()->Resume());
+  {
+    WriteBatch batch;
+
+    for (auto i = 200; i < 300; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  Reopen(options);
+  for (auto i = 0; i < 300; ++i) {
+    if (i < 100 || i >= 200) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) {
+  if (mem_env_ != nullptr) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+    return;
+  }
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 32768;
+  options.listeners.emplace_back(listener);
+  Random rnd(301);
+
+  listener->EnableAutoRecovery();
+  CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+  {
+    WriteBatch batch;
+
+    for (auto i = 1; i < 4; ++i) {
+      for (auto j = 0; j < 100; ++j) {
+        ASSERT_OK(batch.Put(handles_[i], Key(j), rnd.RandomString(1024)));
+      }
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  {
+    WriteBatch batch;
+    int write_error = 0;
+
+    // Write to one CF
+    for (auto i = 100; i < 199; ++i) {
+      ASSERT_OK(batch.Put(handles_[2], Key(i), rnd.RandomString(1024)));
+    }
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+          write_error++;
+          if (write_error > 2) {
+            fault_fs_->SetFilesystemActive(false,
+                                           IOStatus::NoSpace("Out of space"));
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    WriteOptions wopts;
+    wopts.sync = true;
+    Status s = dbfull()->Write(wopts, &batch);
+    ASSERT_TRUE(s.IsNoSpace());
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  // `ClearAllCallBacks()` is needed in addition to `DisableProcessing()` to
+  // drain all callbacks. Otherwise, a pending callback in the background
+  // could re-disable `fault_fs_` after we enable it below.
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+  for (auto i = 1; i < 4; ++i) {
+    // Every CF should have been flushed
+    ASSERT_EQ(NumTableFilesAtLevel(0, i), 1);
+  }
+
+  for (auto i = 1; i < 4; ++i) {
+    for (auto j = 0; j < 199; ++j) {
+      if (j < 100) {
+        ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+      } else {
+        ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+      }
+    }
+  }
+  ReopenWithColumnFamilies({"default", "one", "two", "three"}, options);
+  for (auto i = 1; i < 4; ++i) {
+    for (auto j = 0; j < 199; ++j) {
+      if (j < 100) {
+        ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+      } else {
+        ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+      }
+    }
+  }
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
+  if (mem_env_ != nullptr) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+    return;
+  }
+  FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_);
+  std::vector<std::unique_ptr<Env>> fault_envs;
+  std::vector<FaultInjectionTestFS*> fault_fs;
+  std::vector<Options> options;
+  std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
+  std::vector<DB*> db;
+  std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+  int kNumDbInstances = 3;
+  Random rnd(301);
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    listener.emplace_back(new ErrorHandlerFSListener());
+    options.emplace_back(GetDefaultOptions());
+    fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem()));
+    std::shared_ptr<FileSystem> fs(fault_fs.back());
+    fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs));
+    options[i].env = fault_envs.back().get();
+    options[i].create_if_missing = true;
+    options[i].level0_file_num_compaction_trigger = 2;
+    options[i].writable_file_max_buffer_size = 32768;
+    options[i].listeners.emplace_back(listener[i]);
+    options[i].sst_file_manager = sfm;
+    DB* dbptr;
+    char buf[16];
+
+    listener[i]->EnableAutoRecovery();
+    // Setup for returning error for the 3rd SST, which would be level 1
+    listener[i]->InjectFileCreationError(fault_fs[i], 3,
+                                         IOStatus::NoSpace("Out of space"));
+    snprintf(buf, sizeof(buf), "_%d", i);
+    ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+    ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
+    db.emplace_back(dbptr);
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    WriteBatch batch;
+
+    for (auto j = 0; j <= 100; ++j) {
+      ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(db[i]->Write(wopts, &batch));
+    ASSERT_OK(db[i]->Flush(FlushOptions()));
+  }
+
+  def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    WriteBatch batch;
+
+    // Write to one CF
+    for (auto j = 100; j < 199; ++j) {
+      ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(db[i]->Write(wopts, &batch));
+    ASSERT_OK(db[i]->Flush(FlushOptions()));
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+    ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+    fault_fs[i]->SetFilesystemActive(true);
+  }
+
+  def_env->SetFilesystemActive(true);
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    std::string prop;
+    ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+    ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true));
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(0), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 0);
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(1), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 1);
+  }
+
+  SstFileManagerImpl* sfmImpl =
+      static_cast_with_check<SstFileManagerImpl>(sfm.get());
+  sfmImpl->Close();
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "_%d", i);
+    delete db[i];
+    fault_fs[i]->SetFilesystemActive(true);
+    if (getenv("KEEP_DB")) {
+      printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+    } else {
+      ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+    }
+  }
+  options.clear();
+  sfm.reset();
+  delete def_env;
+}
+
+TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
+  if (mem_env_ != nullptr) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+    return;
+  }
+  FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_);
+  std::vector<std::unique_ptr<Env>> fault_envs;
+  std::vector<FaultInjectionTestFS*> fault_fs;
+  std::vector<Options> options;
+  std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
+  std::vector<DB*> db;
+  std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+  int kNumDbInstances = 3;
+  Random rnd(301);
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    listener.emplace_back(new ErrorHandlerFSListener());
+    options.emplace_back(GetDefaultOptions());
+    fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem()));
+    std::shared_ptr<FileSystem> fs(fault_fs.back());
+    fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs));
+    options[i].env = fault_envs.back().get();
+    options[i].create_if_missing = true;
+    options[i].level0_file_num_compaction_trigger = 2;
+    options[i].writable_file_max_buffer_size = 32768;
+    options[i].listeners.emplace_back(listener[i]);
+    options[i].sst_file_manager = sfm;
+    DB* dbptr;
+    char buf[16];
+
+    listener[i]->EnableAutoRecovery();
+    switch (i) {
+      case 0:
+        // Setup for returning error for the 3rd SST, which would be level 1
+        listener[i]->InjectFileCreationError(fault_fs[i], 3,
+                                             IOStatus::NoSpace("Out of space"));
+        break;
+      case 1:
+        // Setup for returning error after the 1st SST, which would result
+        // in a hard error
+        listener[i]->InjectFileCreationError(fault_fs[i], 2,
+                                             IOStatus::NoSpace("Out of space"));
+        break;
+      default:
+        break;
+    }
+    snprintf(buf, sizeof(buf), "_%d", i);
+    ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+    ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
+    db.emplace_back(dbptr);
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    WriteBatch batch;
+
+    for (auto j = 0; j <= 100; ++j) {
+      ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(db[i]->Write(wopts, &batch));
+    ASSERT_OK(db[i]->Flush(FlushOptions()));
+  }
+
+  def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    WriteBatch batch;
+
+    // Write to one CF
+    for (auto j = 100; j < 199; ++j) {
+      ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(db[i]->Write(wopts, &batch));
+    if (i != 1) {
+      ASSERT_OK(db[i]->Flush(FlushOptions()));
+    } else {
+      ASSERT_TRUE(db[i]->Flush(FlushOptions()).IsNoSpace());
+    }
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+    switch (i) {
+      case 0:
+        ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+        break;
+      case 1:
+        ASSERT_EQ(s.severity(), Status::Severity::kHardError);
+        break;
+      case 2:
+        ASSERT_OK(s);
+        break;
+    }
+    fault_fs[i]->SetFilesystemActive(true);
+  }
+
+  def_env->SetFilesystemActive(true);
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    std::string prop;
+    if (i < 2) {
+      ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+    }
+    if (i == 1) {
+      ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true));
+    }
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(0), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 0);
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(1), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 1);
+  }
+
+  SstFileManagerImpl* sfmImpl =
+      static_cast_with_check<SstFileManagerImpl>(sfm.get());
+  sfmImpl->Close();
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "_%d", i);
+    fault_fs[i]->SetFilesystemActive(true);
+    delete db[i];
+    if (getenv("KEEP_DB")) {
+      printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+    } else {
+      EXPECT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+    }
+  }
+  options.clear();
+  delete def_env;
+}
+
+// When Put the KV-pair, the write option is set to disable WAL.
+// If retryable error happens in this condition, map the bg error
+// to soft error and trigger auto resume. During auto resume, SwitchMemtable
+// is disabled to avoid small SST tables. Write can still be applied before
+// the bg error is cleaned unless the memtable is full.
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) {
+  // Activate the FS before the first resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = true;
+  ASSERT_OK(Put(Key(1), "val1", wo));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:LoopOut",
+        "FLushWritNoWALRetryableeErrorAutoRecover1:1"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  TEST_SYNC_POINT("FLushWritNoWALRetryableeErrorAutoRecover1:1");
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ("val1", Get(Key(1)));
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+  HistogramData autoresume_retry;
+  options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+                                    &autoresume_retry);
+  ASSERT_GE(autoresume_retry.max, 0);
+  ASSERT_OK(Put(Key(2), "val2", wo));
+  s = Flush();
+  // Since auto resume fails, the bg error is not cleand, flush will
+  // return the bg_error set before.
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  // call auto resume
+  ASSERT_OK(dbfull()->Resume());
+  ASSERT_OK(Put(Key(3), "val3", wo));
+  // After resume is successful, the flush should be ok.
+  ASSERT_OK(Flush());
+  ASSERT_EQ("val3", Get(Key(3)));
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) {
+  // Activate the FS before the first resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = true;
+  ASSERT_OK(Put(Key(1), "val1", wo));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+  HistogramData autoresume_retry;
+  options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+                                    &autoresume_retry);
+  ASSERT_GE(autoresume_retry.max, 0);
+  ASSERT_OK(Put(Key(2), "val2", wo));
+  s = Flush();
+  // Since auto resume is successful, the bg error is cleaned, flush will
+  // be successful.
+  ASSERT_OK(s);
+  ASSERT_EQ("val2", Get(Key(2)));
+  Destroy(options);
+}
+
+// Auto resume fromt the flush retryable IO error. Activate the FS before the
+// first resume. Resume is successful
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
+  // Activate the FS before the first resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+  ASSERT_EQ("val1", Get(Key(1)));
+  Reopen(options);
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_OK(Put(Key(2), "val2"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  Destroy(options);
+}
+
+// Auto resume fromt the flush retryable IO error and set the retry limit count.
+// Never activate the FS and auto resume should fail at the end
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
+  // Fail all the resume and let user to resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"FLushWritRetryableeErrorAutoRecover2:0",
+        "RecoverFromRetryableBGIOError:BeforeStart"},
+       {"RecoverFromRetryableBGIOError:LoopOut",
+        "FLushWritRetryableeErrorAutoRecover2:1"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0");
+  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1");
+  fault_fs_->SetFilesystemActive(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_EQ("val1", Get(Key(1)));
+  // Auto resume fails due to FS does not recover during resume. User call
+  // resume manually here.
+  s = dbfull()->Resume();
+  ASSERT_EQ("val1", Get(Key(1)));
+  ASSERT_OK(s);
+  ASSERT_OK(Put(Key(2), "val2"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ("val2", Get(Key(2)));
+
+  Destroy(options);
+}
+
+// Auto resume fromt the flush retryable IO error and set the retry limit count.
+// Fail the first resume and let the second resume be successful.
+TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) {
+  // Fail the first resume and let the second resume be successful
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), "val"));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:BeforeStart",
+        "ManifestWriteRetryableErrorAutoRecover:0"},
+       {"ManifestWriteRetryableErrorAutoRecover:1",
+        "RecoverFromRetryableBGIOError:BeforeWait1"},
+       {"RecoverFromRetryableBGIOError:RecoverSuccess",
+        "ManifestWriteRetryableErrorAutoRecover:2"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0");
+  fault_fs_->SetFilesystemActive(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1");
+  TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2");
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) {
+  // Fail the first resume and let the second resume be successful
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  WriteOptions wo = WriteOptions();
+  wo.disableWAL = true;
+  ASSERT_OK(Put(Key(0), "val", wo));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), "val", wo));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:BeforeStart",
+        "ManifestWriteNoWALRetryableErrorAutoRecover:0"},
+       {"ManifestWriteNoWALRetryableErrorAutoRecover:1",
+        "RecoverFromRetryableBGIOError:BeforeWait1"},
+       {"RecoverFromRetryableBGIOError:RecoverSuccess",
+        "ManifestWriteNoWALRetryableErrorAutoRecover:2"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:0");
+  fault_fs_->SetFilesystemActive(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:1");
+  TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:2");
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest,
+       CompactionManifestWriteRetryableErrorAutoRecover) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+  std::atomic<bool> fail_manifest(false);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Put(Key(2), "val"));
+  ASSERT_OK(Flush());
+
+  listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+  listener->EnableAutoRecovery(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      // Wait for flush of 2nd L0 file before starting compaction
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"},
+       // Wait for compaction to detect manifest write error
+       {"BackgroundCallCompaction:1", "CompactionManifestWriteErrorAR:0"},
+       // Make compaction thread wait for error to be cleared
+       {"CompactionManifestWriteErrorAR:1",
+        "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+       {"CompactionManifestWriteErrorAR:2",
+        "RecoverFromRetryableBGIOError:BeforeStart"},
+       // Fail the first resume, before the wait in resume
+       {"RecoverFromRetryableBGIOError:BeforeResume0",
+        "CompactionManifestWriteErrorAR:3"},
+       // Activate the FS before the second resume
+       {"CompactionManifestWriteErrorAR:4",
+        "RecoverFromRetryableBGIOError:BeforeResume1"},
+       // Wait the auto resume be sucessful
+       {"RecoverFromRetryableBGIOError:RecoverSuccess",
+        "CompactionManifestWriteErrorAR:5"}});
+  // trigger manifest write failure in compaction thread
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        if (fail_manifest.load()) {
+          fault_fs_->SetFilesystemActive(false, error_msg);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  TEST_SYNC_POINT("CompactionManifestWriteErrorAR:0");
+  TEST_SYNC_POINT("CompactionManifestWriteErrorAR:1");
+
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  TEST_SYNC_POINT("CompactionManifestWriteErrorAR:2");
+  TEST_SYNC_POINT("CompactionManifestWriteErrorAR:3");
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  TEST_SYNC_POINT("CompactionManifestWriteErrorAR:4");
+  TEST_SYNC_POINT("CompactionManifestWriteErrorAR:5");
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  new_manifest = GetManifestNameFromLiveFiles();
+  ASSERT_NE(new_manifest, old_manifest);
+
+  Reopen(options);
+  ASSERT_EQ("val", Get(Key(0)));
+  ASSERT_EQ("val", Get(Key(1)));
+  ASSERT_EQ("val", Get(Key(2)));
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) {
+  // In this test, in the first round of compaction, the FS is set to error.
+  // So the first compaction fails due to retryable IO error and it is mapped
+  // to soft error. Then, compaction is rescheduled, in the second round of
+  // compaction, the FS is set to active and compaction is successful, so
+  // the test will hit the CompactionJob::FinishCompactionOutputFile1 sync
+  // point.
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  Status s;
+  std::atomic<bool> fail_first(false);
+  std::atomic<bool> fail_second(true);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(0), "va;"));
+  ASSERT_OK(Put(Key(2), "va;"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+  listener->EnableAutoRecovery(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"},
+       {"CompactionJob::FinishCompactionOutputFile1",
+        "CompactionWriteRetryableErrorAutoRecover0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Start",
+      [&](void*) { fault_fs_->SetFilesystemActive(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::OpenCompactionOutputFile", [&](void*) {
+        if (fail_first.load() && fail_second.load()) {
+          fault_fs_->SetFilesystemActive(false, error_msg);
+          fail_second.store(false);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_OK(s);
+  TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0");
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 32768;
+  options.listeners.emplace_back(listener);
+  options.paranoid_checks = true;
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+  Random rnd(301);
+
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  // For the first batch, write is successful, require sync
+  {
+    WriteBatch batch;
+
+    for (auto i = 0; i < 100; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  // For the second batch, the first 2 file Append are successful, then the
+  // following Append fails due to file system retryable IOError.
+  {
+    WriteBatch batch;
+    int write_error = 0;
+
+    for (auto i = 100; i < 200; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+        {{"WALWriteErrorDone", "RecoverFromRetryableBGIOError:BeforeStart"},
+         {"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"},
+         {"WALWriteError1:1", "RecoverFromRetryableBGIOError:BeforeResume1"},
+         {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError1:2"}});
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+          write_error++;
+          if (write_error > 2) {
+            fault_fs_->SetFilesystemActive(false, error_msg);
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    WriteOptions wopts;
+    wopts.sync = true;
+    s = dbfull()->Write(wopts, &batch);
+    ASSERT_EQ(true, s.IsIOError());
+    TEST_SYNC_POINT("WALWriteErrorDone");
+
+    TEST_SYNC_POINT("WALWriteError1:0");
+    fault_fs_->SetFilesystemActive(true);
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    TEST_SYNC_POINT("WALWriteError1:1");
+    TEST_SYNC_POINT("WALWriteError1:2");
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Data in corrupted WAL are not stored
+  for (auto i = 0; i < 199; ++i) {
+    if (i < 100) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+
+  // Resume and write a new batch, should be in the WAL
+  {
+    WriteBatch batch;
+
+    for (auto i = 200; i < 300; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  Reopen(options);
+  for (auto i = 0; i < 300; ++i) {
+    if (i < 100 || i >= 200) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+  Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) {
+  // Fail the first recover and try second time.
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 32768;
+  options.listeners.emplace_back(listener);
+  options.paranoid_checks = true;
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+  Random rnd(301);
+
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  // For the first batch, write is successful, require sync
+  {
+    WriteBatch batch;
+
+    for (auto i = 0; i < 100; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  // For the second batch, the first 2 file Append are successful, then the
+  // following Append fails due to file system retryable IOError.
+  {
+    WriteBatch batch;
+    int write_error = 0;
+
+    for (auto i = 100; i < 200; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+        {{"RecoverFromRetryableBGIOError:BeforeWait0", "WALWriteError2:0"},
+         {"WALWriteError2:1", "RecoverFromRetryableBGIOError:BeforeWait1"},
+         {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError2:2"}});
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+          write_error++;
+          if (write_error > 2) {
+            fault_fs_->SetFilesystemActive(false, error_msg);
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    WriteOptions wopts;
+    wopts.sync = true;
+    s = dbfull()->Write(wopts, &batch);
+    ASSERT_EQ(true, s.IsIOError());
+
+    TEST_SYNC_POINT("WALWriteError2:0");
+    fault_fs_->SetFilesystemActive(true);
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    TEST_SYNC_POINT("WALWriteError2:1");
+    TEST_SYNC_POINT("WALWriteError2:2");
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Data in corrupted WAL are not stored
+  for (auto i = 0; i < 199; ++i) {
+    if (i < 100) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+
+  // Resume and write a new batch, should be in the WAL
+  {
+    WriteBatch batch;
+
+    for (auto i = 200; i < 300; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  Reopen(options);
+  for (auto i = 0; i < 300; ++i) {
+    if (i < 100 || i >= 200) {
+      ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+  Close();
+}
+
+// Fail auto resume from a flush retryable error and verify that
+// OnErrorRecoveryEnd listener callback is called
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) {
+  // Activate the FS before the first resume
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.max_bgerror_resume_count = 2;
+  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  ASSERT_OK(Put(Key(1), "val1"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeFinishBuildTable",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  ASSERT_EQ(listener->new_bg_error(), Status::Aborted());
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+
+  Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FlushReadError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener =
+      std::make_shared<ErrorHandlerFSListener>();
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeOutputValidation", [&](void*) {
+        IOStatus st = IOStatus::IOError();
+        st.SetRetryable(true);
+        st.SetScope(IOStatus::IOErrorScope::kIOErrorScopeFile);
+        fault_fs_->SetFilesystemActive(false, st);
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeDeleteFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  s = dbfull()->TEST_GetBGError();
+  ASSERT_OK(s);
+
+  Reopen(GetDefaultOptions());
+  ASSERT_EQ("val", Get(Key(0)));
+}
+
+TEST_F(DBErrorHandlingFSTest, AtomicFlushReadError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener =
+      std::make_shared<ErrorHandlerFSListener>();
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(false);
+  options.atomic_flush = true;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(0, Key(0), "val"));
+  ASSERT_OK(Put(1, Key(0), "val"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeOutputValidation", [&](void*) {
+        IOStatus st = IOStatus::IOError();
+        st.SetRetryable(true);
+        st.SetScope(IOStatus::IOErrorScope::kIOErrorScopeFile);
+        fault_fs_->SetFilesystemActive(false, st);
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeDeleteFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush({0, 1});
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+  ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_COUNT));
+  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+  s = dbfull()->TEST_GetBGError();
+  ASSERT_OK(s);
+
+  TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+                              GetDefaultOptions());
+  ASSERT_EQ("val", Get(Key(0)));
+}
+
+TEST_F(DBErrorHandlingFSTest, AtomicFlushNoSpaceError) {
+  std::shared_ptr<ErrorHandlerFSListener> listener =
+      std::make_shared<ErrorHandlerFSListener>();
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.statistics = CreateDBStatistics();
+  Status s;
+
+  listener->EnableAutoRecovery(true);
+  options.atomic_flush = true;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(0, Key(0), "val"));
+  ASSERT_OK(Put(1, Key(0), "val"));
+  SyncPoint::GetInstance()->SetCallBack("BuildTable:create_file", [&](void*) {
+    IOStatus st = IOStatus::NoSpace();
+    fault_fs_->SetFilesystemActive(false, st);
+  });
+  SyncPoint::GetInstance()->SetCallBack(
+      "BuildTable:BeforeDeleteFile",
+      [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush({0, 1});
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+  ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_ERROR_COUNT));
+  ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+                   ERROR_HANDLER_BG_IO_ERROR_COUNT));
+  s = dbfull()->TEST_GetBGError();
+  ASSERT_OK(s);
+
+  TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+                              GetDefaultOptions());
+  ASSERT_EQ("val", Get(Key(0)));
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionReadRetryableErrorAutoRecover) {
+  // In this test, in the first round of compaction, the FS is set to error.
+  // So the first compaction fails due to retryable IO error and it is mapped
+  // to soft error. Then, compaction is rescheduled, in the second round of
+  // compaction, the FS is set to active and compaction is successful, so
+  // the test will hit the CompactionJob::FinishCompactionOutputFile1 sync
+  // point.
+  std::shared_ptr<ErrorHandlerFSListener> listener =
+      std::make_shared<ErrorHandlerFSListener>();
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Status s;
+  std::atomic<bool> fail_first(false);
+  std::atomic<bool> fail_second(true);
+  Random rnd(301);
+  DestroyAndReopen(options);
+
+  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+  error_msg.SetRetryable(true);
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+  }
+  s = Flush();
+  ASSERT_OK(s);
+
+  listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+  listener->EnableAutoRecovery(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"},
+       {"CompactionJob::FinishCompactionOutputFile1",
+        "CompactionWriteRetryableErrorAutoRecover0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Start",
+      [&](void*) { fault_fs_->SetFilesystemActive(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():PausingManualCompaction:2", [&](void*) {
+        if (fail_first.load() && fail_second.load()) {
+          fault_fs_->SetFilesystemActive(false, error_msg);
+          fail_second.store(false);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_OK(s);
+  TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0");
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  Reopen(GetDefaultOptions());
+}
+
+class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest,
+                                   public testing::WithParamInterface<bool> {};
+
+TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.paranoid_checks = GetParam();
+  Status s;
+
+  listener->EnableAutoRecovery(true);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "val"));
+  SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+    fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced"));
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+  ASSERT_TRUE(s.IsIOFenced());
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_TRUE(s.IsIOFenced());
+  Destroy(options);
+}
+
+TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.listeners.emplace_back(listener);
+  options.paranoid_checks = GetParam();
+  Status s;
+  std::string old_manifest;
+  std::string new_manifest;
+
+  listener->EnableAutoRecovery(true);
+  DestroyAndReopen(options);
+  old_manifest = GetManifestNameFromLiveFiles();
+
+  ASSERT_OK(Put(Key(0), "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), "val"));
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+        fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  s = Flush();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+  ASSERT_TRUE(s.IsIOFenced());
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_TRUE(s.IsIOFenced());
+  Close();
+}
+
+TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.emplace_back(listener);
+  options.paranoid_checks = GetParam();
+  Status s;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put(Key(0), "va;"));
+  ASSERT_OK(Put(Key(2), "va;"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  listener->EnableAutoRecovery(true);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+        "BackgroundCallCompaction:0"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackgroundCallCompaction:0", [&](void*) {
+        fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced"));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+  ASSERT_TRUE(s.IsIOFenced());
+
+  fault_fs_->SetFilesystemActive(true);
+  s = dbfull()->Resume();
+  ASSERT_TRUE(s.IsIOFenced());
+  Destroy(options);
+}
+
+TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) {
+  std::shared_ptr<ErrorHandlerFSListener> listener(
+      new ErrorHandlerFSListener());
+  Options options = GetDefaultOptions();
+  options.env = fault_env_.get();
+  options.create_if_missing = true;
+  options.writable_file_max_buffer_size = 32768;
+  options.listeners.emplace_back(listener);
+  options.paranoid_checks = GetParam();
+  Status s;
+  Random rnd(301);
+
+  listener->EnableAutoRecovery(true);
+  DestroyAndReopen(options);
+
+  {
+    WriteBatch batch;
+
+    for (auto i = 0; i < 100; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    ASSERT_OK(dbfull()->Write(wopts, &batch));
+  };
+
+  {
+    WriteBatch batch;
+    int write_error = 0;
+
+    for (auto i = 100; i < 199; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+          write_error++;
+          if (write_error > 2) {
+            fault_fs_->SetFilesystemActive(false,
+                                           IOStatus::IOFenced("IO fenced"));
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    WriteOptions wopts;
+    wopts.sync = true;
+    s = dbfull()->Write(wopts, &batch);
+    ASSERT_TRUE(s.IsIOFenced());
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  fault_fs_->SetFilesystemActive(true);
+  {
+    WriteBatch batch;
+
+    for (auto i = 0; i < 100; ++i) {
+      ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+    }
+
+    WriteOptions wopts;
+    wopts.sync = true;
+    s = dbfull()->Write(wopts, &batch);
+    ASSERT_TRUE(s.IsIOFenced());
+  }
+  Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBErrorHandlingFSTest, DBErrorHandlingFencingTest,
+                        ::testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/event_helpers.cc b/src/rocksdb/db/event_helpers.cc
new file mode 100644
index 000000000..7987b8ec6
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.cc
@@ -0,0 +1,371 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/event_helpers.h"
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/utilities/customizable_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+Status EventListener::CreateFromString(const ConfigOptions& config_options,
+                                       const std::string& id,
+                                       std::shared_ptr<EventListener>* result) {
+  return LoadSharedObject<EventListener>(config_options, id, nullptr, result);
+}
+#endif  // ROCKSDB_LITE
+
+namespace {
+template <class T>
+inline T SafeDivide(T a, T b) {
+  return b == 0 ? 0 : a / b;
+}
+}  // anonymous namespace
+
+void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) {
+  *jwriter << "time_micros"
+           << std::chrono::duration_cast<std::chrono::microseconds>(
+                  std::chrono::system_clock::now().time_since_epoch())
+                  .count();
+}
+
+#ifndef ROCKSDB_LITE
+void EventHelpers::NotifyTableFileCreationStarted(
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const std::string& db_name, const std::string& cf_name,
+    const std::string& file_path, int job_id, TableFileCreationReason reason) {
+  if (listeners.empty()) {
+    return;
+  }
+  TableFileCreationBriefInfo info;
+  info.db_name = db_name;
+  info.cf_name = cf_name;
+  info.file_path = file_path;
+  info.job_id = job_id;
+  info.reason = reason;
+  for (auto& listener : listeners) {
+    listener->OnTableFileCreationStarted(info);
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+void EventHelpers::NotifyOnBackgroundError(
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex,
+    bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+  if (listeners.empty()) {
+    return;
+  }
+  db_mutex->AssertHeld();
+  // release lock while notifying events
+  db_mutex->Unlock();
+  for (auto& listener : listeners) {
+    listener->OnBackgroundError(reason, bg_error);
+    bg_error->PermitUncheckedError();
+    if (*auto_recovery) {
+      listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery);
+    }
+  }
+  db_mutex->Lock();
+#else
+  (void)listeners;
+  (void)reason;
+  (void)bg_error;
+  (void)db_mutex;
+  (void)auto_recovery;
+#endif  // ROCKSDB_LITE
+}
+
+void EventHelpers::LogAndNotifyTableFileCreationFinished(
+    EventLogger* event_logger,
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const std::string& db_name, const std::string& cf_name,
+    const std::string& file_path, int job_id, const FileDescriptor& fd,
+    uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+    TableFileCreationReason reason, const Status& s,
+    const std::string& file_checksum,
+    const std::string& file_checksum_func_name) {
+  if (s.ok() && event_logger) {
+    JSONWriter jwriter;
+    AppendCurrentTime(&jwriter);
+    jwriter << "cf_name" << cf_name << "job" << job_id << "event"
+            << "table_file_creation"
+            << "file_number" << fd.GetNumber() << "file_size"
+            << fd.GetFileSize() << "file_checksum"
+            << Slice(file_checksum).ToString(true) << "file_checksum_func_name"
+            << file_checksum_func_name << "smallest_seqno" << fd.smallest_seqno
+            << "largest_seqno" << fd.largest_seqno;
+
+    // table_properties
+    {
+      jwriter << "table_properties";
+      jwriter.StartObject();
+
+      // basic properties:
+      jwriter << "data_size" << table_properties.data_size << "index_size"
+              << table_properties.index_size << "index_partitions"
+              << table_properties.index_partitions << "top_level_index_size"
+              << table_properties.top_level_index_size
+              << "index_key_is_user_key"
+              << table_properties.index_key_is_user_key
+              << "index_value_is_delta_encoded"
+              << table_properties.index_value_is_delta_encoded << "filter_size"
+              << table_properties.filter_size << "raw_key_size"
+              << table_properties.raw_key_size << "raw_average_key_size"
+              << SafeDivide(table_properties.raw_key_size,
+                            table_properties.num_entries)
+              << "raw_value_size" << table_properties.raw_value_size
+              << "raw_average_value_size"
+              << SafeDivide(table_properties.raw_value_size,
+                            table_properties.num_entries)
+              << "num_data_blocks" << table_properties.num_data_blocks
+              << "num_entries" << table_properties.num_entries
+              << "num_filter_entries" << table_properties.num_filter_entries
+              << "num_deletions" << table_properties.num_deletions
+              << "num_merge_operands" << table_properties.num_merge_operands
+              << "num_range_deletions" << table_properties.num_range_deletions
+              << "format_version" << table_properties.format_version
+              << "fixed_key_len" << table_properties.fixed_key_len
+              << "filter_policy" << table_properties.filter_policy_name
+              << "column_family_name" << table_properties.column_family_name
+              << "column_family_id" << table_properties.column_family_id
+              << "comparator" << table_properties.comparator_name
+              << "merge_operator" << table_properties.merge_operator_name
+              << "prefix_extractor_name"
+              << table_properties.prefix_extractor_name << "property_collectors"
+              << table_properties.property_collectors_names << "compression"
+              << table_properties.compression_name << "compression_options"
+              << table_properties.compression_options << "creation_time"
+              << table_properties.creation_time << "oldest_key_time"
+              << table_properties.oldest_key_time << "file_creation_time"
+              << table_properties.file_creation_time
+              << "slow_compression_estimated_data_size"
+              << table_properties.slow_compression_estimated_data_size
+              << "fast_compression_estimated_data_size"
+              << table_properties.fast_compression_estimated_data_size
+              << "db_id" << table_properties.db_id << "db_session_id"
+              << table_properties.db_session_id << "orig_file_number"
+              << table_properties.orig_file_number << "seqno_to_time_mapping";
+
+      if (table_properties.seqno_to_time_mapping.empty()) {
+        jwriter << "N/A";
+      } else {
+        SeqnoToTimeMapping tmp;
+        Status status = tmp.Add(table_properties.seqno_to_time_mapping);
+        if (status.ok()) {
+          jwriter << tmp.ToHumanString();
+        } else {
+          jwriter << "Invalid";
+        }
+      }
+
+      // user collected properties
+      for (const auto& prop : table_properties.readable_properties) {
+        jwriter << prop.first << prop.second;
+      }
+      jwriter.EndObject();
+    }
+
+    if (oldest_blob_file_number != kInvalidBlobFileNumber) {
+      jwriter << "oldest_blob_file_number" << oldest_blob_file_number;
+    }
+
+    jwriter.EndObject();
+
+    event_logger->Log(jwriter);
+  }
+
+#ifndef ROCKSDB_LITE
+  if (listeners.empty()) {
+    return;
+  }
+  TableFileCreationInfo info;
+  info.db_name = db_name;
+  info.cf_name = cf_name;
+  info.file_path = file_path;
+  info.file_size = fd.file_size;
+  info.job_id = job_id;
+  info.table_properties = table_properties;
+  info.reason = reason;
+  info.status = s;
+  info.file_checksum = file_checksum;
+  info.file_checksum_func_name = file_checksum_func_name;
+  for (auto& listener : listeners) {
+    listener->OnTableFileCreated(info);
+  }
+  info.status.PermitUncheckedError();
+#else
+  (void)listeners;
+  (void)db_name;
+  (void)cf_name;
+  (void)file_path;
+  (void)reason;
+#endif  // !ROCKSDB_LITE
+}
+
+void EventHelpers::LogAndNotifyTableFileDeletion(
+    EventLogger* event_logger, int job_id, uint64_t file_number,
+    const std::string& file_path, const Status& status,
+    const std::string& dbname,
+    const std::vector<std::shared_ptr<EventListener>>& listeners) {
+  JSONWriter jwriter;
+  AppendCurrentTime(&jwriter);
+
+  jwriter << "job" << job_id << "event"
+          << "table_file_deletion"
+          << "file_number" << file_number;
+  if (!status.ok()) {
+    jwriter << "status" << status.ToString();
+  }
+
+  jwriter.EndObject();
+
+  event_logger->Log(jwriter);
+
+#ifndef ROCKSDB_LITE
+  if (listeners.empty()) {
+    return;
+  }
+  TableFileDeletionInfo info;
+  info.db_name = dbname;
+  info.job_id = job_id;
+  info.file_path = file_path;
+  info.status = status;
+  for (auto& listener : listeners) {
+    listener->OnTableFileDeleted(info);
+  }
+  info.status.PermitUncheckedError();
+#else
+  (void)file_path;
+  (void)dbname;
+  (void)listeners;
+#endif  // !ROCKSDB_LITE
+}
+
+void EventHelpers::NotifyOnErrorRecoveryEnd(
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const Status& old_bg_error, const Status& new_bg_error,
+    InstrumentedMutex* db_mutex) {
+#ifndef ROCKSDB_LITE
+  if (!listeners.empty()) {
+    db_mutex->AssertHeld();
+    // release lock while notifying events
+    db_mutex->Unlock();
+    for (auto& listener : listeners) {
+      BackgroundErrorRecoveryInfo info;
+      info.old_bg_error = old_bg_error;
+      info.new_bg_error = new_bg_error;
+      listener->OnErrorRecoveryCompleted(old_bg_error);
+      listener->OnErrorRecoveryEnd(info);
+      info.old_bg_error.PermitUncheckedError();
+      info.new_bg_error.PermitUncheckedError();
+    }
+    db_mutex->Lock();
+  }
+#else
+  (void)listeners;
+  (void)old_bg_error;
+  (void)new_bg_error;
+  (void)db_mutex;
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+void EventHelpers::NotifyBlobFileCreationStarted(
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const std::string& db_name, const std::string& cf_name,
+    const std::string& file_path, int job_id,
+    BlobFileCreationReason creation_reason) {
+  if (listeners.empty()) {
+    return;
+  }
+  BlobFileCreationBriefInfo info(db_name, cf_name, file_path, job_id,
+                                 creation_reason);
+  for (const auto& listener : listeners) {
+    listener->OnBlobFileCreationStarted(info);
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+void EventHelpers::LogAndNotifyBlobFileCreationFinished(
+    EventLogger* event_logger,
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const std::string& db_name, const std::string& cf_name,
+    const std::string& file_path, int job_id, uint64_t file_number,
+    BlobFileCreationReason creation_reason, const Status& s,
+    const std::string& file_checksum,
+    const std::string& file_checksum_func_name, uint64_t total_blob_count,
+    uint64_t total_blob_bytes) {
+  if (s.ok() && event_logger) {
+    JSONWriter jwriter;
+    AppendCurrentTime(&jwriter);
+    jwriter << "cf_name" << cf_name << "job" << job_id << "event"
+            << "blob_file_creation"
+            << "file_number" << file_number << "total_blob_count"
+            << total_blob_count << "total_blob_bytes" << total_blob_bytes
+            << "file_checksum" << file_checksum << "file_checksum_func_name"
+            << file_checksum_func_name << "status" << s.ToString();
+
+    jwriter.EndObject();
+    event_logger->Log(jwriter);
+  }
+
+#ifndef ROCKSDB_LITE
+  if (listeners.empty()) {
+    return;
+  }
+  BlobFileCreationInfo info(db_name, cf_name, file_path, job_id,
+                            creation_reason, total_blob_count, total_blob_bytes,
+                            s, file_checksum, file_checksum_func_name);
+  for (const auto& listener : listeners) {
+    listener->OnBlobFileCreated(info);
+  }
+  info.status.PermitUncheckedError();
+#else
+  (void)listeners;
+  (void)db_name;
+  (void)file_path;
+  (void)creation_reason;
+#endif
+}
+
+void EventHelpers::LogAndNotifyBlobFileDeletion(
+    EventLogger* event_logger,
+    const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
+    uint64_t file_number, const std::string& file_path, const Status& status,
+    const std::string& dbname) {
+  if (event_logger) {
+    JSONWriter jwriter;
+    AppendCurrentTime(&jwriter);
+
+    jwriter << "job" << job_id << "event"
+            << "blob_file_deletion"
+            << "file_number" << file_number;
+    if (!status.ok()) {
+      jwriter << "status" << status.ToString();
+    }
+
+    jwriter.EndObject();
+    event_logger->Log(jwriter);
+  }
+#ifndef ROCKSDB_LITE
+  if (listeners.empty()) {
+    return;
+  }
+  BlobFileDeletionInfo info(dbname, file_path, job_id, status);
+  for (const auto& listener : listeners) {
+    listener->OnBlobFileDeleted(info);
+  }
+  info.status.PermitUncheckedError();
+#else
+  (void)listeners;
+  (void)dbname;
+  (void)file_path;
+#endif  // !ROCKSDB_LITE
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/event_helpers.h b/src/rocksdb/db/event_helpers.h
new file mode 100644
index 000000000..68d819fe6
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.h
@@ -0,0 +1,82 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "logging/event_logger.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventHelpers {
+ public:
+  static void AppendCurrentTime(JSONWriter* json_writer);
+#ifndef ROCKSDB_LITE
+  static void NotifyTableFileCreationStarted(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id, TableFileCreationReason reason);
+#endif  // !ROCKSDB_LITE
+  static void NotifyOnBackgroundError(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      BackgroundErrorReason reason, Status* bg_error,
+      InstrumentedMutex* db_mutex, bool* auto_recovery);
+  static void LogAndNotifyTableFileCreationFinished(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id, const FileDescriptor& fd,
+      uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+      TableFileCreationReason reason, const Status& s,
+      const std::string& file_checksum,
+      const std::string& file_checksum_func_name);
+  static void LogAndNotifyTableFileDeletion(
+      EventLogger* event_logger, int job_id, uint64_t file_number,
+      const std::string& file_path, const Status& status,
+      const std::string& db_name,
+      const std::vector<std::shared_ptr<EventListener>>& listeners);
+  static void NotifyOnErrorRecoveryEnd(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const Status& old_bg_error, const Status& new_bg_error,
+      InstrumentedMutex* db_mutex);
+
+#ifndef ROCKSDB_LITE
+  static void NotifyBlobFileCreationStarted(
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id,
+      BlobFileCreationReason creation_reason);
+#endif  // !ROCKSDB_LITE
+
+  static void LogAndNotifyBlobFileCreationFinished(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const std::string& db_name, const std::string& cf_name,
+      const std::string& file_path, int job_id, uint64_t file_number,
+      BlobFileCreationReason creation_reason, const Status& s,
+      const std::string& file_checksum,
+      const std::string& file_checksum_func_name, uint64_t total_blob_count,
+      uint64_t total_blob_bytes);
+
+  static void LogAndNotifyBlobFileDeletion(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
+      uint64_t file_number, const std::string& file_path, const Status& status,
+      const std::string& db_name);
+
+ private:
+  static void LogAndNotifyTableFileCreation(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const FileDescriptor& fd, const TableFileCreationInfo& info);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/experimental.cc b/src/rocksdb/db/experimental.cc
new file mode 100644
index 000000000..d838ebde5
--- /dev/null
+++ b/src/rocksdb/db/experimental.cc
@@ -0,0 +1,155 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/experimental.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_util.h"
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+#ifndef ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+                           const Slice* begin, const Slice* end) {
+  if (db == nullptr) {
+    return Status::InvalidArgument("DB is empty");
+  }
+
+  return db->SuggestCompactRange(column_family, begin, end);
+}
+
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
+  if (db == nullptr) {
+    return Status::InvalidArgument("Didn't recognize DB object");
+  }
+  return db->PromoteL0(column_family, target_level);
+}
+
+#else  // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+                           const Slice* /*begin*/, const Slice* /*end*/) {
+  return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+                 int /*target_level*/) {
+  return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+#endif  // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
+  return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
+}
+
+Status UpdateManifestForFilesState(
+    const DBOptions& db_opts, const std::string& db_name,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    const UpdateManifestForFilesStateOptions& opts) {
+  OfflineManifestWriter w(db_opts, db_name);
+  Status s = w.Recover(column_families);
+
+  size_t files_updated = 0;
+  size_t cfs_updated = 0;
+  auto fs = db_opts.env->GetFileSystem();
+
+  for (auto cfd : *w.Versions().GetColumnFamilySet()) {
+    if (!s.ok()) {
+      break;
+    }
+    assert(cfd);
+
+    if (cfd->IsDropped() || !cfd->initialized()) {
+      continue;
+    }
+
+    const auto* current = cfd->current();
+    assert(current);
+
+    const auto* vstorage = current->storage_info();
+    assert(vstorage);
+
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+
+    /* SST files */
+    for (int level = 0; level < cfd->NumberLevels(); level++) {
+      if (!s.ok()) {
+        break;
+      }
+      const auto& level_files = vstorage->LevelFiles(level);
+
+      for (const auto& lf : level_files) {
+        assert(lf);
+
+        uint64_t number = lf->fd.GetNumber();
+        std::string fname =
+            TableFileName(w.IOptions().db_paths, number, lf->fd.GetPathId());
+
+        std::unique_ptr<FSSequentialFile> f;
+        FileOptions fopts;
+        // Use kUnknown to signal the FileSystem to search all tiers for the
+        // file.
+        fopts.temperature = Temperature::kUnknown;
+
+        IOStatus file_ios =
+            fs->NewSequentialFile(fname, fopts, &f, /*dbg*/ nullptr);
+        if (file_ios.ok()) {
+          if (opts.update_temperatures) {
+            Temperature temp = f->GetTemperature();
+            if (temp != Temperature::kUnknown && temp != lf->temperature) {
+              // Current state inconsistent with manifest
+              ++files_updated;
+              edit.DeleteFile(level, number);
+              edit.AddFile(
+                  level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(),
+                  lf->smallest, lf->largest, lf->fd.smallest_seqno,
+                  lf->fd.largest_seqno, lf->marked_for_compaction, temp,
+                  lf->oldest_blob_file_number, lf->oldest_ancester_time,
+                  lf->file_creation_time, lf->file_checksum,
+                  lf->file_checksum_func_name, lf->unique_id);
+            }
+          }
+        } else {
+          s = file_ios;
+          break;
+        }
+      }
+    }
+
+    if (s.ok() && edit.NumEntries() > 0) {
+      std::unique_ptr<FSDirectory> db_dir;
+      s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr);
+      if (s.ok()) {
+        s = w.LogAndApply(cfd, &edit, db_dir.get());
+      }
+      if (s.ok()) {
+        ++cfs_updated;
+      }
+    }
+  }
+
+  if (cfs_updated > 0) {
+    ROCKS_LOG_INFO(db_opts.info_log,
+                   "UpdateManifestForFilesState: updated %zu files in %zu CFs",
+                   files_updated, cfs_updated);
+  } else if (s.ok()) {
+    ROCKS_LOG_INFO(db_opts.info_log,
+                   "UpdateManifestForFilesState: no updates needed");
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_opts.info_log, "UpdateManifestForFilesState failed: %s",
+                    s.ToString().c_str());
+  }
+
+  return s;
+}
+
+}  // namespace experimental
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_basic_test.cc b/src/rocksdb/db/external_sst_file_basic_test.cc
new file mode 100644
index 000000000..665c89869
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_basic_test.cc
@@ -0,0 +1,1997 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class ExternalSSTFileBasicTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternalSSTFileBasicTest()
+      : DBTestBase("external_sst_file_basic_test", /*env_do_fsync=*/true) {
+    sst_files_dir_ = dbname_ + "_sst_files/";
+    fault_injection_test_env_.reset(new FaultInjectionTestEnv(env_));
+    DestroyAndRecreateExternalSSTFilesDir();
+
+    // Check if the Env supports RandomRWFile
+    std::string file_path = sst_files_dir_ + "test_random_rw_file";
+    std::unique_ptr<WritableFile> wfile;
+    assert(env_->NewWritableFile(file_path, &wfile, EnvOptions()).ok());
+    wfile.reset();
+    std::unique_ptr<RandomRWFile> rwfile;
+    Status s = env_->NewRandomRWFile(file_path, &rwfile, EnvOptions());
+    if (s.IsNotSupported()) {
+      random_rwfile_supported_ = false;
+    } else {
+      EXPECT_OK(s);
+      random_rwfile_supported_ = true;
+    }
+    rwfile.reset();
+    EXPECT_OK(env_->DeleteFile(file_path));
+  }
+
+  void DestroyAndRecreateExternalSSTFilesDir() {
+    ASSERT_OK(DestroyDir(env_, sst_files_dir_));
+    ASSERT_OK(env_->CreateDir(sst_files_dir_));
+  }
+
+  Status DeprecatedAddFile(const std::vector<std::string>& files,
+                           bool move_files = false,
+                           bool skip_snapshot_check = false) {
+    IngestExternalFileOptions opts;
+    opts.move_files = move_files;
+    opts.snapshot_consistency = !skip_snapshot_check;
+    opts.allow_global_seqno = false;
+    opts.allow_blocking_flush = false;
+    return db_->IngestExternalFile(files, opts);
+  }
+
+  Status AddFileWithFileChecksum(
+      const std::vector<std::string>& files,
+      const std::vector<std::string>& files_checksums,
+      const std::vector<std::string>& files_checksum_func_names,
+      bool verify_file_checksum = true, bool move_files = false,
+      bool skip_snapshot_check = false, bool write_global_seqno = true) {
+    IngestExternalFileOptions opts;
+    opts.move_files = move_files;
+    opts.snapshot_consistency = !skip_snapshot_check;
+    opts.allow_global_seqno = false;
+    opts.allow_blocking_flush = false;
+    opts.write_global_seqno = write_global_seqno;
+    opts.verify_file_checksum = verify_file_checksum;
+
+    IngestExternalFileArg arg;
+    arg.column_family = db_->DefaultColumnFamily();
+    arg.external_files = files;
+    arg.options = opts;
+    arg.files_checksums = files_checksums;
+    arg.files_checksum_func_names = files_checksum_func_names;
+    return db_->IngestExternalFiles({arg});
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys,
+      const std::vector<ValueType>& value_types,
+      std::vector<std::pair<int, int>> range_deletions, int file_id,
+      bool write_global_seqno, bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    assert(value_types.size() == 1 || keys.size() == value_types.size());
+    std::string file_path = sst_files_dir_ + std::to_string(file_id);
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    Status s = sst_file_writer.Open(file_path);
+    if (!s.ok()) {
+      return s;
+    }
+    for (size_t i = 0; i < range_deletions.size(); i++) {
+      // Account for the effect of range deletions on true_data before
+      // all point operators, even though sst_file_writer.DeleteRange
+      // must be called before other sst_file_writer methods. This is
+      // because point writes take precedence over range deletions
+      // in the same ingested sst.
+      std::string start_key = Key(range_deletions[i].first);
+      std::string end_key = Key(range_deletions[i].second);
+      s = sst_file_writer.DeleteRange(start_key, end_key);
+      if (!s.ok()) {
+        sst_file_writer.Finish();
+        return s;
+      }
+      auto start_key_it = true_data->find(start_key);
+      if (start_key_it == true_data->end()) {
+        start_key_it = true_data->upper_bound(start_key);
+      }
+      auto end_key_it = true_data->find(end_key);
+      if (end_key_it == true_data->end()) {
+        end_key_it = true_data->upper_bound(end_key);
+      }
+      true_data->erase(start_key_it, end_key_it);
+    }
+    for (size_t i = 0; i < keys.size(); i++) {
+      std::string key = Key(keys[i]);
+      std::string value = Key(keys[i]) + std::to_string(file_id);
+      ValueType value_type =
+          (value_types.size() == 1 ? value_types[0] : value_types[i]);
+      switch (value_type) {
+        case ValueType::kTypeValue:
+          s = sst_file_writer.Put(key, value);
+          (*true_data)[key] = value;
+          break;
+        case ValueType::kTypeMerge:
+          s = sst_file_writer.Merge(key, value);
+          // we only use TestPutOperator in this test
+          (*true_data)[key] = value;
+          break;
+        case ValueType::kTypeDeletion:
+          s = sst_file_writer.Delete(key);
+          true_data->erase(key);
+          break;
+        default:
+          return Status::InvalidArgument("Value type is not supported");
+      }
+      if (!s.ok()) {
+        sst_file_writer.Finish();
+        return s;
+      }
+    }
+    s = sst_file_writer.Finish();
+
+    if (s.ok()) {
+      IngestExternalFileOptions ifo;
+      ifo.allow_global_seqno = true;
+      ifo.write_global_seqno = write_global_seqno;
+      ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+      s = db_->IngestExternalFile({file_path}, ifo);
+    }
+    return s;
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys,
+      const std::vector<ValueType>& value_types, int file_id,
+      bool write_global_seqno, bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    return GenerateAndAddExternalFile(
+        options, keys, value_types, {}, file_id, write_global_seqno,
+        verify_checksums_before_ingest, true_data);
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys, const ValueType value_type,
+      int file_id, bool write_global_seqno, bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    return GenerateAndAddExternalFile(
+        options, keys, std::vector<ValueType>(1, value_type), file_id,
+        write_global_seqno, verify_checksums_before_ingest, true_data);
+  }
+
+  ~ExternalSSTFileBasicTest() override {
+    DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
+  }
+
+ protected:
+  std::string sst_files_dir_;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_;
+  bool random_rwfile_supported_;
+};
+
+TEST_F(ExternalSSTFileBasicTest, Basic) {
+  Options options = CurrentOptions();
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Current file size should be 0 after sst_file_writer init and before open a
+  // file.
+  ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_OK(s) << s.ToString();
+
+  // Current file size should be non-zero after success write.
+  ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+  ASSERT_EQ(file1_info.num_range_del_entries, 0);
+  ASSERT_EQ(file1_info.smallest_range_del_key, "");
+  ASSERT_EQ(file1_info.largest_range_del_key, "");
+  ASSERT_EQ(file1_info.file_checksum, kUnknownFileChecksum);
+  ASSERT_EQ(file1_info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+  // sst_file_writer already finished, cannot add this value
+  s = sst_file_writer.Put(Key(100), "bad_val");
+  ASSERT_NOK(s) << s.ToString();
+  s = sst_file_writer.DeleteRange(Key(100), Key(200));
+  ASSERT_NOK(s) << s.ToString();
+
+  DestroyAndReopen(options);
+  // Add file using file path
+  s = DeprecatedAddFile({file1});
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+  for (int k = 0; k < 100; k++) {
+    ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+  }
+
+  DestroyAndRecreateExternalSSTFilesDir();
+}
+
+class ChecksumVerifyHelper {
+ private:
+  Options options_;
+
+ public:
+  ChecksumVerifyHelper(Options& options) : options_(options) {}
+  ~ChecksumVerifyHelper() {}
+
+  Status GetSingleFileChecksumAndFuncName(
+      const std::string& file_path, std::string* file_checksum,
+      std::string* file_checksum_func_name) {
+    Status s;
+    EnvOptions soptions;
+    std::unique_ptr<SequentialFile> file_reader;
+    s = options_.env->NewSequentialFile(file_path, &file_reader, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+    std::unique_ptr<char[]> scratch(new char[2048]);
+    Slice result;
+    FileChecksumGenFactory* file_checksum_gen_factory =
+        options_.file_checksum_gen_factory.get();
+    if (file_checksum_gen_factory == nullptr) {
+      *file_checksum = kUnknownFileChecksum;
+      *file_checksum_func_name = kUnknownFileChecksumFuncName;
+      return Status::OK();
+    } else {
+      FileChecksumGenContext gen_context;
+      std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+          file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context);
+      *file_checksum_func_name = file_checksum_gen->Name();
+      s = file_reader->Read(2048, &result, scratch.get());
+      if (!s.ok()) {
+        return s;
+      }
+      while (result.size() != 0) {
+        file_checksum_gen->Update(scratch.get(), result.size());
+        s = file_reader->Read(2048, &result, scratch.get());
+        if (!s.ok()) {
+          return s;
+        }
+      }
+      file_checksum_gen->Finalize();
+      *file_checksum = file_checksum_gen->GetChecksum();
+    }
+    return Status::OK();
+  }
+};
+
+TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) {
+  Options options = CurrentOptions();
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  ChecksumVerifyHelper checksum_helper(options);
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Current file size should be 0 after sst_file_writer init and before open a
+  // file.
+  ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_OK(s) << s.ToString();
+  std::string file_checksum, file_checksum_func_name;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file1, &file_checksum, &file_checksum_func_name));
+
+  // Current file size should be non-zero after success write.
+  ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+  ASSERT_EQ(file1_info.num_range_del_entries, 0);
+  ASSERT_EQ(file1_info.smallest_range_del_key, "");
+  ASSERT_EQ(file1_info.largest_range_del_key, "");
+  ASSERT_EQ(file1_info.file_checksum, file_checksum);
+  ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name);
+  // sst_file_writer already finished, cannot add this value
+  s = sst_file_writer.Put(Key(100), "bad_val");
+  ASSERT_NOK(s) << s.ToString();
+  s = sst_file_writer.DeleteRange(Key(100), Key(200));
+  ASSERT_NOK(s) << s.ToString();
+
+  DestroyAndReopen(options);
+  // Add file using file path
+  s = DeprecatedAddFile({file1});
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+  for (int k = 0; k < 100; k++) {
+    ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+  }
+
+  DestroyAndRecreateExternalSSTFilesDir();
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
+  Options old_options = CurrentOptions();
+  Options options = CurrentOptions();
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  const ImmutableCFOptions ioptions(options);
+  ChecksumVerifyHelper checksum_helper(options);
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file01.sst (1000 => 1099)
+  std::string file1 = sst_files_dir_ + "file01.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 1000; k < 1100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(1000));
+  ASSERT_EQ(file1_info.largest_key, Key(1099));
+  std::string file_checksum1, file_checksum_func_name1;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file1, &file_checksum1, &file_checksum_func_name1));
+  ASSERT_EQ(file1_info.file_checksum, file_checksum1);
+  ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name1);
+
+  // file02.sst (1100 => 1299)
+  std::string file2 = sst_files_dir_ + "file02.sst";
+  ASSERT_OK(sst_file_writer.Open(file2));
+  for (int k = 1100; k < 1300; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file2_info;
+  s = sst_file_writer.Finish(&file2_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file2_info.file_path, file2);
+  ASSERT_EQ(file2_info.num_entries, 200);
+  ASSERT_EQ(file2_info.smallest_key, Key(1100));
+  ASSERT_EQ(file2_info.largest_key, Key(1299));
+  std::string file_checksum2, file_checksum_func_name2;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file2, &file_checksum2, &file_checksum_func_name2));
+  ASSERT_EQ(file2_info.file_checksum, file_checksum2);
+  ASSERT_EQ(file2_info.file_checksum_func_name, file_checksum_func_name2);
+
+  // file03.sst (1300 => 1499)
+  std::string file3 = sst_files_dir_ + "file03.sst";
+  ASSERT_OK(sst_file_writer.Open(file3));
+  for (int k = 1300; k < 1500; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file3_info;
+  s = sst_file_writer.Finish(&file3_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file3_info.file_path, file3);
+  ASSERT_EQ(file3_info.num_entries, 200);
+  ASSERT_EQ(file3_info.smallest_key, Key(1300));
+  ASSERT_EQ(file3_info.largest_key, Key(1499));
+  std::string file_checksum3, file_checksum_func_name3;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file3, &file_checksum3, &file_checksum_func_name3));
+  ASSERT_EQ(file3_info.file_checksum, file_checksum3);
+  ASSERT_EQ(file3_info.file_checksum_func_name, file_checksum_func_name3);
+
+  // file04.sst (1500 => 1799)
+  std::string file4 = sst_files_dir_ + "file04.sst";
+  ASSERT_OK(sst_file_writer.Open(file4));
+  for (int k = 1500; k < 1800; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file4_info;
+  s = sst_file_writer.Finish(&file4_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file4_info.file_path, file4);
+  ASSERT_EQ(file4_info.num_entries, 300);
+  ASSERT_EQ(file4_info.smallest_key, Key(1500));
+  ASSERT_EQ(file4_info.largest_key, Key(1799));
+  std::string file_checksum4, file_checksum_func_name4;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file4, &file_checksum4, &file_checksum_func_name4));
+  ASSERT_EQ(file4_info.file_checksum, file_checksum4);
+  ASSERT_EQ(file4_info.file_checksum_func_name, file_checksum_func_name4);
+
+  // file05.sst (1800 => 1899)
+  std::string file5 = sst_files_dir_ + "file05.sst";
+  ASSERT_OK(sst_file_writer.Open(file5));
+  for (int k = 1800; k < 2000; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file5_info;
+  s = sst_file_writer.Finish(&file5_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file5_info.file_path, file5);
+  ASSERT_EQ(file5_info.num_entries, 200);
+  ASSERT_EQ(file5_info.smallest_key, Key(1800));
+  ASSERT_EQ(file5_info.largest_key, Key(1999));
+  std::string file_checksum5, file_checksum_func_name5;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file5, &file_checksum5, &file_checksum_func_name5));
+  ASSERT_EQ(file5_info.file_checksum, file_checksum5);
+  ASSERT_EQ(file5_info.file_checksum_func_name, file_checksum_func_name5);
+
+  // file06.sst (2000 => 2199)
+  std::string file6 = sst_files_dir_ + "file06.sst";
+  ASSERT_OK(sst_file_writer.Open(file6));
+  for (int k = 2000; k < 2200; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file6_info;
+  s = sst_file_writer.Finish(&file6_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file6_info.file_path, file6);
+  ASSERT_EQ(file6_info.num_entries, 200);
+  ASSERT_EQ(file6_info.smallest_key, Key(2000));
+  ASSERT_EQ(file6_info.largest_key, Key(2199));
+  std::string file_checksum6, file_checksum_func_name6;
+  ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+      file6, &file_checksum6, &file_checksum_func_name6));
+  ASSERT_EQ(file6_info.file_checksum, file_checksum6);
+  ASSERT_EQ(file6_info.file_checksum_func_name, file_checksum_func_name6);
+
+  s = AddFileWithFileChecksum({file1}, {file_checksum1, "xyz"},
+                              {file_checksum1}, true, false, false, false);
+  // does not care the checksum input since db does not enable file checksum
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file1));
+  std::vector<LiveFileMetaData> live_files;
+  dbfull()->GetLiveFilesMetaData(&live_files);
+  std::set<std::string> set1;
+  for (auto f : live_files) {
+    set1.insert(f.name);
+    ASSERT_EQ(f.file_checksum, kUnknownFileChecksum);
+    ASSERT_EQ(f.file_checksum_func_name, kUnknownFileChecksumFuncName);
+  }
+
+  // check the temperature of the file being ingested
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[6].files[0].temperature);
+  auto size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+
+  // Reopen Db with checksum enabled
+  Reopen(options);
+  // Enable verify_file_checksum option
+  // The checksum vector does not match, fail the ingestion
+  s = AddFileWithFileChecksum({file2}, {file_checksum2, "xyz"},
+                              {file_checksum_func_name2}, true, false, false,
+                              false);
+  ASSERT_NOK(s) << s.ToString();
+
+  // Enable verify_file_checksum option
+  // The checksum name does not match, fail the ingestion
+  s = AddFileWithFileChecksum({file2}, {file_checksum2}, {"xyz"}, true, false,
+                              false, false);
+  ASSERT_NOK(s) << s.ToString();
+
+  // Enable verify_file_checksum option
+  // The checksum itself does not match, fail the ingestion
+  s = AddFileWithFileChecksum({file2}, {"xyz"}, {file_checksum_func_name2},
+                              true, false, false, false);
+  ASSERT_NOK(s) << s.ToString();
+
+  // Enable verify_file_checksum option
+  // All matches, ingestion is successful
+  s = AddFileWithFileChecksum({file2}, {file_checksum2},
+                              {file_checksum_func_name2}, true, false, false,
+                              false);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files1;
+  dbfull()->GetLiveFilesMetaData(&live_files1);
+  for (auto f : live_files1) {
+    if (set1.find(f.name) == set1.end()) {
+      ASSERT_EQ(f.file_checksum, file_checksum2);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name2);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(env_->FileExists(file2));
+
+  // Enable verify_file_checksum option
+  // No checksum information is provided, generate it when ingesting
+  std::vector<std::string> checksum, checksum_func;
+  s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false,
+                              false, false);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files2;
+  dbfull()->GetLiveFilesMetaData(&live_files2);
+  for (auto f : live_files2) {
+    if (set1.find(f.name) == set1.end()) {
+      ASSERT_EQ(f.file_checksum, file_checksum3);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file3));
+
+  // Does not enable verify_file_checksum options
+  // The checksum name does not match, fail the ingestion
+  s = AddFileWithFileChecksum({file4}, {file_checksum4}, {"xyz"}, false, false,
+                              false, false);
+  ASSERT_NOK(s) << s.ToString();
+
+  // Does not enable verify_file_checksum options
+  // Checksum function name matches, store the checksum being ingested.
+  s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4},
+                              false, false, false, false);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files3;
+  dbfull()->GetLiveFilesMetaData(&live_files3);
+  for (auto f : live_files3) {
+    if (set1.find(f.name) == set1.end()) {
+      ASSERT_FALSE(f.file_checksum == file_checksum4);
+      ASSERT_EQ(f.file_checksum, "asd");
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file4));
+
+  // enable verify_file_checksum options, DB enable checksum, and enable
+  // write_global_seq. So the checksum stored is different from the one
+  // ingested due to the sequence number changes.
+  s = AddFileWithFileChecksum({file5}, {file_checksum5},
+                              {file_checksum_func_name5}, true, false, false,
+                              true);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files4;
+  dbfull()->GetLiveFilesMetaData(&live_files4);
+  for (auto f : live_files4) {
+    if (set1.find(f.name) == set1.end()) {
+      std::string cur_checksum5, cur_checksum_func_name5;
+      ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+          dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5));
+      ASSERT_EQ(f.file_checksum, cur_checksum5);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name5);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file5));
+
+  // Does not enable verify_file_checksum options and also the ingested file
+  // checksum information is empty. DB will generate and store the checksum
+  // in Manifest.
+  std::vector<std::string> files_c6, files_name6;
+  s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false,
+                              false, false);
+  ASSERT_OK(s) << s.ToString();
+  std::vector<LiveFileMetaData> live_files6;
+  dbfull()->GetLiveFilesMetaData(&live_files6);
+  for (auto f : live_files6) {
+    if (set1.find(f.name) == set1.end()) {
+      ASSERT_EQ(f.file_checksum, file_checksum6);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6);
+      set1.insert(f.name);
+    }
+  }
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file6));
+  db_->GetColumnFamilyMetaData(&metadata);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+}
+
+TEST_F(ExternalSSTFileBasicTest, NoCopy) {
+  Options options = CurrentOptions();
+  const ImmutableCFOptions ioptions(options);
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+
+  // file2.sst (100 => 299)
+  std::string file2 = sst_files_dir_ + "file2.sst";
+  ASSERT_OK(sst_file_writer.Open(file2));
+  for (int k = 100; k < 300; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file2_info;
+  s = sst_file_writer.Finish(&file2_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file2_info.file_path, file2);
+  ASSERT_EQ(file2_info.num_entries, 200);
+  ASSERT_EQ(file2_info.smallest_key, Key(100));
+  ASSERT_EQ(file2_info.largest_key, Key(299));
+
+  // file3.sst (110 => 124) .. overlap with file2.sst
+  std::string file3 = sst_files_dir_ + "file3.sst";
+  ASSERT_OK(sst_file_writer.Open(file3));
+  for (int k = 110; k < 125; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file3_info;
+  s = sst_file_writer.Finish(&file3_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file3_info.file_path, file3);
+  ASSERT_EQ(file3_info.num_entries, 15);
+  ASSERT_EQ(file3_info.smallest_key, Key(110));
+  ASSERT_EQ(file3_info.largest_key, Key(124));
+
+  s = DeprecatedAddFile({file1}, true /* move file */);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(file1));
+
+  s = DeprecatedAddFile({file2}, false /* copy file */);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file2));
+
+  // This file has overlapping values with the existing data
+  s = DeprecatedAddFile({file3}, true /* move file */);
+  ASSERT_NOK(s) << s.ToString();
+  ASSERT_OK(env_->FileExists(file3));
+
+  for (int k = 0; k < 300; k++) {
+    ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+  }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) {
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  do {
+    Options options = CurrentOptions();
+    DestroyAndReopen(options);
+    std::map<std::string, std::string> true_data;
+
+    int file_id = 1;
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 4, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {11, 15, 19}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {120, 130}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 130}, ValueType::kTypeValue, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+    // Write some keys through normal write path
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+      true_data[Key(i)] = "memtable";
+    }
+    SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {40, 41, 42}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {20, 30, 40}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless if the file overwrite
+    // keys in the DB or not because we have a snapshot
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1000, 1002}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {2000, 3002}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 20, 40, 100, 150}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    db_->ReleaseSnapshot(snapshot);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // No snapshot anymore, no need to assign a seqno
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  do {
+    Options options = CurrentOptions();
+    options.merge_operator.reset(new TestPutOperator());
+    DestroyAndReopen(options);
+    std::map<std::string, std::string> true_data;
+
+    int file_id = 1;
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 4, 6}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {11, 15, 19}, ValueType::kTypeDeletion, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {120, 130}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 130}, ValueType::kTypeDeletion, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {120}, {ValueType::kTypeValue}, {{120, 135}}, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {}, {}, {{110, 120}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // The range deletion ends on a key, but it doesn't actually delete
+    // this key because the largest key in the range is exclusive. Still,
+    // it counts as an overlap so a new seqno will be assigned.
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {}, {}, {{100, 109}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+    // Write some keys through normal write path
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+      true_data[Key(i)] = "memtable";
+    }
+    SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {40, 41, 42}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {20, 30, 40}, ValueType::kTypeDeletion, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless if the file overwrite
+    // keys in the DB or not because we have a snapshot
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1000, 1002}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {2000, 3002}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 20, 40, 100, 150}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    db_->ReleaseSnapshot(snapshot);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // No snapshot anymore, no need to assign a seqno
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  do {
+    Options options = CurrentOptions();
+    options.merge_operator.reset(new TestPutOperator());
+    DestroyAndReopen(options);
+    std::map<std::string, std::string> true_data;
+
+    int file_id = 1;
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6},
+        {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+         ValueType::kTypeMerge, ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {10, 11, 12, 13},
+        {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+         ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 4, 6},
+        {ValueType::kTypeDeletion, ValueType::kTypeValue,
+         ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {11, 15, 19},
+        {ValueType::kTypeDeletion, ValueType::kTypeMerge,
+         ValueType::kTypeValue},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {120, 130}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 130}, {ValueType::kTypeMerge, ValueType::kTypeDeletion},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {150, 151, 152},
+        {ValueType::kTypeValue, ValueType::kTypeMerge,
+         ValueType::kTypeDeletion},
+        {{150, 160}, {180, 190}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {150, 151, 152},
+        {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+        {{200, 250}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {300, 301, 302},
+        {ValueType::kTypeValue, ValueType::kTypeMerge,
+         ValueType::kTypeDeletion},
+        {{1, 2}, {152, 154}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+    // Write some keys through normal write path
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+      true_data[Key(i)] = "memtable";
+    }
+    SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {60, 61, 62},
+        {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {40, 41, 42},
+        {ValueType::kTypeValue, ValueType::kTypeDeletion,
+         ValueType::kTypeDeletion},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {20, 30, 40},
+        {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+         ValueType::kTypeDeletion},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+
+    // We will need a seqno for the file regardless if the file overwrite
+    // keys in the DB or not because we have a snapshot
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1000, 1002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {2000, 3002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 20, 40, 100, 150},
+        {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+         ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // A global seqno will be assigned anyway because of the snapshot
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    db_->ReleaseSnapshot(snapshot);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {5000, 5001}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // No snapshot anymore, no need to assign a seqno
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
+  Options options = CurrentOptions();
+  const int kNumKeys = 10000;
+
+  size_t total_fadvised_bytes = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileWriter::Rep::InvalidatePageCache", [&](void* arg) {
+        size_t fadvise_size = *(reinterpret_cast<size_t*>(arg));
+        total_fadvised_bytes += fadvise_size;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::unique_ptr<SstFileWriter> sst_file_writer;
+
+  std::string sst_file_path = sst_files_dir_ + "file_fadvise_disable.sst";
+  sst_file_writer.reset(
+      new SstFileWriter(EnvOptions(), options, nullptr, false));
+  ASSERT_OK(sst_file_writer->Open(sst_file_path));
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+  }
+  ASSERT_OK(sst_file_writer->Finish());
+  // fadvise disabled
+  ASSERT_EQ(total_fadvised_bytes, 0);
+
+  sst_file_path = sst_files_dir_ + "file_fadvise_enable.sst";
+  sst_file_writer.reset(
+      new SstFileWriter(EnvOptions(), options, nullptr, true));
+  ASSERT_OK(sst_file_writer->Open(sst_file_path));
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+  }
+  ASSERT_OK(sst_file_writer->Finish());
+  // fadvise enabled
+  ASSERT_EQ(total_fadvised_bytes, sst_file_writer->FileSize());
+  ASSERT_GT(total_fadvised_bytes, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileBasicTest, SyncFailure) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = fault_injection_test_env_.get();
+
+  std::vector<std::pair<std::string, std::string>> test_cases = {
+      {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile",
+       "ExternalSstFileIngestionJob::AfterSyncIngestedFile"},
+      {"ExternalSstFileIngestionJob::BeforeSyncDir",
+       "ExternalSstFileIngestionJob::AfterSyncDir"},
+      {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno",
+       "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}};
+
+  for (size_t i = 0; i < test_cases.size(); i++) {
+    bool no_sync = false;
+    SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) {
+      fault_injection_test_env_->SetFilesystemActive(false);
+    });
+    SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) {
+      fault_injection_test_env_->SetFilesystemActive(true);
+    });
+    if (i == 0) {
+      SyncPoint::GetInstance()->SetCallBack(
+          "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* s) {
+            Status* status = static_cast<Status*>(s);
+            if (status->IsNotSupported()) {
+              no_sync = true;
+            }
+          });
+    }
+    if (i == 2) {
+      SyncPoint::GetInstance()->SetCallBack(
+          "ExternalSstFileIngestionJob::NewRandomRWFile", [&](void* s) {
+            Status* status = static_cast<Status*>(s);
+            if (status->IsNotSupported()) {
+              no_sync = true;
+            }
+          });
+    }
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndReopen(options);
+    if (i == 2) {
+      ASSERT_OK(Put("foo", "v1"));
+    }
+
+    Options sst_file_writer_options;
+    sst_file_writer_options.env = fault_injection_test_env_.get();
+    std::unique_ptr<SstFileWriter> sst_file_writer(
+        new SstFileWriter(EnvOptions(), sst_file_writer_options));
+    std::string file_name =
+        sst_files_dir_ + "sync_failure_test_" + std::to_string(i) + ".sst";
+    ASSERT_OK(sst_file_writer->Open(file_name));
+    ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+    ASSERT_OK(sst_file_writer->Finish());
+
+    IngestExternalFileOptions ingest_opt;
+    if (i == 0) {
+      ingest_opt.move_files = true;
+    }
+    const Snapshot* snapshot = db_->GetSnapshot();
+    if (i == 2) {
+      ingest_opt.write_global_seqno = true;
+    }
+    Status s = db_->IngestExternalFile({file_name}, ingest_opt);
+    if (no_sync) {
+      ASSERT_OK(s);
+    } else {
+      ASSERT_NOK(s);
+    }
+    db_->ReleaseSnapshot(snapshot);
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    Destroy(options);
+  }
+}
+
+TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) {
+        Status* s = static_cast<Status*>(arg);
+        *s = Status::NotSupported();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyAndReopen(options);
+
+  Options sst_file_writer_options;
+  sst_file_writer_options.env = env_;
+  std::unique_ptr<SstFileWriter> sst_file_writer(
+      new SstFileWriter(EnvOptions(), sst_file_writer_options));
+  std::string file_name =
+      sst_files_dir_ + "reopen_not_supported_test_" + ".sst";
+  ASSERT_OK(sst_file_writer->Open(file_name));
+  ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+  ASSERT_OK(sst_file_writer->Finish());
+
+  IngestExternalFileOptions ingest_opt;
+  ingest_opt.move_files = true;
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+  db_->ReleaseSnapshot(snapshot);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Destroy(options);
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) {
+  Options options;
+  options.create_if_missing = true;
+  SpecialEnv senv(env_);
+  options.env = &senv;
+  DestroyAndReopen(options);
+
+  Options sst_file_writer_options;
+  sst_file_writer_options.env = env_;
+  std::unique_ptr<SstFileWriter> sst_file_writer(
+      new SstFileWriter(EnvOptions(), sst_file_writer_options));
+  std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst";
+  ASSERT_OK(sst_file_writer->Open(file_name));
+  Random rnd(301);
+  std::string value = rnd.RandomString(4000);
+  for (int i = 0; i < 5000; i++) {
+    ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value));
+  }
+  ASSERT_OK(sst_file_writer->Finish());
+
+  // Ingest it once without verifying checksums to see the baseline
+  // preads.
+  IngestExternalFileOptions ingest_opt;
+  ingest_opt.move_files = false;
+  senv.count_random_reads_ = true;
+  senv.random_read_bytes_counter_ = 0;
+  ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+  auto base_num_reads = senv.random_read_counter_.Read();
+  // Make sure the counter is enabled.
+  ASSERT_GT(base_num_reads, 0);
+
+  // Ingest again and observe the reads made for for readahead.
+  ingest_opt.move_files = false;
+  ingest_opt.verify_checksums_before_ingest = true;
+  ingest_opt.verify_checksums_readahead_size = size_t{2 * 1024 * 1024};
+
+  senv.count_random_reads_ = true;
+  senv.random_read_bytes_counter_ = 0;
+  ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+  // Make sure the counter is enabled.
+  ASSERT_GT(senv.random_read_counter_.Read() - base_num_reads, 0);
+
+  // The SST file is about 20MB. Readahead size is 2MB.
+  // Give a conservative 15 reads for metadata blocks, the number
+  // of random reads should be within 20 MB / 2MB + 15 = 25.
+  ASSERT_LE(senv.random_read_counter_.Read() - base_num_reads, 40);
+
+  Destroy(options);
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestRangeDeletionTombstoneWithGlobalSeqno) {
+  for (int i = 5; i < 25; i++) {
+    ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+                       Key(i) + "_val"));
+  }
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file.sst (delete 0 => 30)
+  std::string file = sst_files_dir_ + "file.sst";
+  ASSERT_OK(sst_file_writer.Open(file));
+  ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(30)));
+  ExternalSstFileInfo file_info;
+  ASSERT_OK(sst_file_writer.Finish(&file_info));
+  ASSERT_EQ(file_info.file_path, file);
+  ASSERT_EQ(file_info.num_entries, 0);
+  ASSERT_EQ(file_info.smallest_key, "");
+  ASSERT_EQ(file_info.largest_key, "");
+  ASSERT_EQ(file_info.num_range_del_entries, 1);
+  ASSERT_EQ(file_info.smallest_range_del_key, Key(0));
+  ASSERT_EQ(file_info.largest_range_del_key, Key(30));
+
+  IngestExternalFileOptions ifo;
+  ifo.move_files = true;
+  ifo.snapshot_consistency = true;
+  ifo.allow_global_seqno = true;
+  ifo.write_global_seqno = true;
+  ifo.verify_checksums_before_ingest = false;
+  ASSERT_OK(db_->IngestExternalFile({file}, ifo));
+
+  for (int i = 5; i < 25; i++) {
+    std::string res;
+    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &res).IsNotFound());
+  }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
+  int kNumLevels = 7;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = kNumLevels;
+  Reopen(options);
+
+  std::map<std::string, std::string> true_data;
+  int file_id = 1;
+  // prevent range deletions from being dropped due to becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+
+  // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable
+  for (int i = 0; i < 3; i++) {
+    if (i != 0) {
+      db_->Flush(FlushOptions());
+      if (i == 1) {
+        MoveFilesToLevel(kNumLevels - 1);
+      }
+    }
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               Key(50 * i), Key(50 * (i + 1))));
+  }
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1));
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // overlaps with L0 file but not memtable, so flush is skipped and file is
+  // ingested into L0
+  SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
+      {{65, 70}, {70, 85}}, file_id++, write_global_seqno,
+      verify_checksums_before_ingest, &true_data));
+  ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+  // overlaps with L6 file but not memtable or L0 file, so flush is skipped and
+  // file is ingested into L5
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue},
+      file_id++, write_global_seqno, verify_checksums_before_ingest,
+      &true_data));
+  ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+  // overlaps with L5 file but not memtable or L0 file, so flush is skipped and
+  // file is ingested into L4
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {}, {}, {{5, 15}}, file_id++, write_global_seqno,
+      verify_checksums_before_ingest, &true_data));
+  ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+  // ingested file overlaps with memtable, so flush is triggered before the file
+  // is ingested such that the ingested data is considered newest. So L0 file
+  // count increases by two.
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue},
+      file_id++, write_global_seqno, verify_checksums_before_ingest,
+      &true_data));
+  ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+  ASSERT_EQ(4, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+  // snapshot unneeded now that all range deletions are persisted
+  db_->ReleaseSnapshot(snapshot);
+
+  // overlaps with nothing, so places at bottom level and skips incrementing
+  // seqnum.
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue},
+      {{160, 200}}, file_id++, write_global_seqno,
+      verify_checksums_before_ingest, &true_data));
+  ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+  ASSERT_EQ(4, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1));
+}
+
+TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) {
+  Options options = CurrentOptions();
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file8.sst (delete 300 => 400)
+  std::string file8 = sst_files_dir_ + "file8.sst";
+  ASSERT_OK(sst_file_writer.Open(file8));
+  ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400)));
+  ExternalSstFileInfo file8_info;
+  Status s = sst_file_writer.Finish(&file8_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file8_info.file_path, file8);
+  ASSERT_EQ(file8_info.num_entries, 0);
+  ASSERT_EQ(file8_info.smallest_key, "");
+  ASSERT_EQ(file8_info.largest_key, "");
+  ASSERT_EQ(file8_info.num_range_del_entries, 1);
+  ASSERT_EQ(file8_info.smallest_range_del_key, Key(300));
+  ASSERT_EQ(file8_info.largest_range_del_key, Key(400));
+
+  // file9.sst (delete 400 => 500)
+  std::string file9 = sst_files_dir_ + "file9.sst";
+  ASSERT_OK(sst_file_writer.Open(file9));
+  ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500)));
+  ExternalSstFileInfo file9_info;
+  s = sst_file_writer.Finish(&file9_info);
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(file9_info.file_path, file9);
+  ASSERT_EQ(file9_info.num_entries, 0);
+  ASSERT_EQ(file9_info.smallest_key, "");
+  ASSERT_EQ(file9_info.largest_key, "");
+  ASSERT_EQ(file9_info.num_range_del_entries, 1);
+  ASSERT_EQ(file9_info.smallest_range_del_key, Key(400));
+  ASSERT_EQ(file9_info.largest_range_del_key, Key(500));
+
+  // Range deletion tombstones are exclusive on their end key, so these SSTs
+  // should not be considered as overlapping.
+  s = DeprecatedAddFile({file8, file9});
+  ASSERT_OK(s) << s.ToString();
+  ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+  DestroyAndRecreateExternalSSTFilesDir();
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) {
+  bool change_checksum_called = false;
+  const auto& change_checksum = [&](void* arg) {
+    if (!change_checksum_called) {
+      char* buf = reinterpret_cast<char*>(arg);
+      assert(nullptr != buf);
+      buf[0] ^= 0x1;
+      change_checksum_called = true;
+    }
+  };
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteMaybeCompressedBlock:TamperWithChecksum",
+      change_checksum);
+  SyncPoint::GetInstance()->EnableProcessing();
+  int file_id = 0;
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  do {
+    Options options = CurrentOptions();
+    DestroyAndReopen(options);
+    std::map<std::string, std::string> true_data;
+    Status s = GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data);
+    if (verify_checksums_before_ingest) {
+      ASSERT_NOK(s);
+    } else {
+      ASSERT_OK(s);
+    }
+    change_checksum_called = false;
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) {
+  if (!random_rwfile_supported_) {
+    ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support");
+    return;
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  int file_id = 0;
+  EnvOptions env_options;
+  do {
+    Options options = CurrentOptions();
+    std::string file_path = sst_files_dir_ + std::to_string(file_id++);
+    SstFileWriter sst_file_writer(env_options, options);
+    Status s = sst_file_writer.Open(file_path);
+    ASSERT_OK(s);
+    for (int i = 0; i != 100; ++i) {
+      std::string key = Key(i);
+      std::string value = Key(i) + std::to_string(0);
+      ASSERT_OK(sst_file_writer.Put(key, value));
+    }
+    ASSERT_OK(sst_file_writer.Finish());
+    {
+      // Get file size
+      uint64_t file_size = 0;
+      ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+      ASSERT_GT(file_size, 8);
+      std::unique_ptr<RandomRWFile> rwfile;
+      ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+      // Manually corrupt the file
+      // We deterministically corrupt the first byte because we currently
+      // cannot choose a random offset. The reason for this limitation is that
+      // we do not checksum property block at present.
+      const uint64_t offset = 0;
+      char scratch[8] = {0};
+      Slice buf;
+      ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+      scratch[0] ^= 0xff;  // flip one bit
+      ASSERT_OK(rwfile->Write(offset, buf));
+    }
+    // Ingest file.
+    IngestExternalFileOptions ifo;
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+    s = db_->IngestExternalFile({file_path}, ifo);
+    if (ifo.verify_checksums_before_ingest) {
+      ASSERT_NOK(s);
+    } else {
+      ASSERT_OK(s);
+    }
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) {
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  if (!verify_checksums_before_ingest) {
+    ROCKSDB_GTEST_BYPASS("Bypassing test when !verify_checksums_before_ingest");
+    return;
+  }
+  if (!random_rwfile_supported_) {
+    ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support");
+    return;
+  }
+  uint64_t props_block_offset = 0;
+  size_t props_block_size = 0;
+  const auto& get_props_block_offset = [&](void* arg) {
+    props_block_offset = *reinterpret_cast<uint64_t*>(arg);
+  };
+  const auto& get_props_block_size = [&](void* arg) {
+    props_block_size = *reinterpret_cast<uint64_t*>(arg);
+  };
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+      get_props_block_offset);
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+      get_props_block_size);
+  SyncPoint::GetInstance()->EnableProcessing();
+  int file_id = 0;
+  Random64 rand(time(nullptr));
+  do {
+    std::string file_path = sst_files_dir_ + std::to_string(file_id++);
+    Options options = CurrentOptions();
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    Status s = sst_file_writer.Open(file_path);
+    ASSERT_OK(s);
+    for (int i = 0; i != 100; ++i) {
+      std::string key = Key(i);
+      std::string value = Key(i) + std::to_string(0);
+      ASSERT_OK(sst_file_writer.Put(key, value));
+    }
+    ASSERT_OK(sst_file_writer.Finish());
+
+    {
+      std::unique_ptr<RandomRWFile> rwfile;
+      ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+      // Manually corrupt the file
+      ASSERT_GT(props_block_size, 8);
+      uint64_t offset =
+          props_block_offset + rand.Next() % (props_block_size - 8);
+      char scratch[8] = {0};
+      Slice buf;
+      ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+      scratch[0] ^= 0xff;  // flip one bit
+      ASSERT_OK(rwfile->Write(offset, buf));
+    }
+
+    // Ingest file.
+    IngestExternalFileOptions ifo;
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    ifo.verify_checksums_before_ingest = true;
+    s = db_->IngestExternalFile({file_path}, ifo);
+    ASSERT_NOK(s);
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
+  Options options = CurrentOptions();
+
+  std::vector<std::string> files;
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file1 = sst_files_dir_ + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    ASSERT_OK(sst_file_writer.Put("a", "z"));
+    ASSERT_OK(sst_file_writer.Put("i", "m"));
+    ExternalSstFileInfo file1_info;
+    ASSERT_OK(sst_file_writer.Finish(&file1_info));
+    files.push_back(std::move(file1));
+  }
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file2 = sst_files_dir_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    ASSERT_OK(sst_file_writer.Put("i", "k"));
+    ExternalSstFileInfo file2_info;
+    ASSERT_OK(sst_file_writer.Finish(&file2_info));
+    files.push_back(std::move(file2));
+  }
+
+  IngestExternalFileOptions ifo;
+  ASSERT_OK(db_->IngestExternalFile(files, ifo));
+  ASSERT_EQ(Get("a"), "z");
+  ASSERT_EQ(Get("i"), "k");
+
+  int total_keys = 0;
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    total_keys++;
+  }
+  delete iter;
+  ASSERT_EQ(total_keys, 2);
+
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) {
+  // Repro https://github.com/facebook/rocksdb/issues/6245.
+  // Flush three files to L0. Ingest one more file to trigger L0->L1 compaction
+  // via trivial move. The bug happened when L1 files were incorrectly sorted
+  // resulting in an old value for "k" returned by `Get()`.
+  Options options = CurrentOptions();
+
+  ASSERT_OK(Put("k", "a"));
+  Flush();
+  ASSERT_OK(Put("k", "a"));
+  Flush();
+  ASSERT_OK(Put("k", "a"));
+  Flush();
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Current file size should be 0 after sst_file_writer init and before open a
+  // file.
+  ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  ASSERT_OK(sst_file_writer.Put("k", "b"));
+
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_OK(s) << s.ToString();
+
+  // Current file size should be non-zero after success write.
+  ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+  IngestExternalFileOptions ifo;
+  s = db_->IngestExternalFile({file1}, ifo);
+  ASSERT_OK(s);
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(Get("k"), "b");
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) {
+  Options options = CurrentOptions();
+  const ImmutableCFOptions ioptions(options);
+  options.bottommost_temperature = Temperature::kWarm;
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  options.level0_file_num_compaction_trigger = 2;
+  Reopen(options);
+
+  auto size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+
+  // create file01.sst (1000 => 1099) and ingest it
+  std::string file1 = sst_files_dir_ + "file01.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 1000; k < 1100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_OK(s);
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(1000));
+  ASSERT_EQ(file1_info.largest_key, Key(1099));
+
+  std::vector<std::string> files;
+  std::vector<std::string> files_checksums;
+  std::vector<std::string> files_checksum_func_names;
+  Temperature file_temperature = Temperature::kWarm;
+
+  files.push_back(file1);
+  IngestExternalFileOptions in_opts;
+  in_opts.move_files = false;
+  in_opts.snapshot_consistency = true;
+  in_opts.allow_global_seqno = false;
+  in_opts.allow_blocking_flush = false;
+  in_opts.write_global_seqno = true;
+  in_opts.verify_file_checksum = false;
+  IngestExternalFileArg arg;
+  arg.column_family = db_->DefaultColumnFamily();
+  arg.external_files = files;
+  arg.options = in_opts;
+  arg.files_checksums = files_checksums;
+  arg.files_checksum_func_names = files_checksum_func_names;
+  arg.file_temperature = file_temperature;
+  s = db_->IngestExternalFiles({arg});
+  ASSERT_OK(s);
+
+  // check the temperature of the file being ingested
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(1, metadata.file_count);
+  ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 1);
+
+  // non-bottommost file still has unknown temperature
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("bar", "bar"));
+  ASSERT_OK(Flush());
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  // reopen and check the information is persisted
+  Reopen(options);
+  db_->GetColumnFamilyMetaData(&metadata);
+  ASSERT_EQ(2, metadata.file_count);
+  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+  ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature);
+  size = GetSstSizeHelper(Temperature::kUnknown);
+  ASSERT_GT(size, 0);
+  size = GetSstSizeHelper(Temperature::kWarm);
+  ASSERT_GT(size, 0);
+
+  // check other non-exist temperatures
+  size = GetSstSizeHelper(Temperature::kHot);
+  ASSERT_EQ(size, 0);
+  size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_EQ(size, 0);
+  std::string prop;
+  ASSERT_TRUE(dbfull()->GetProperty(
+      DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+      &prop));
+  ASSERT_EQ(std::atoi(prop.c_str()), 0);
+}
+
+TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevel) {
+  Options options = GetDefaultOptions();
+
+  std::string file_path = sst_files_dir_ + std::to_string(1);
+  SstFileWriter sfw(EnvOptions(), options);
+
+  ASSERT_OK(sfw.Open(file_path));
+  ASSERT_OK(sfw.Put("b", "dontcare"));
+  ASSERT_OK(sfw.Finish());
+
+  // Test universal compaction + ingest with snapshot consistency
+  options.create_if_missing = true;
+  options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+  DestroyAndReopen(options);
+  {
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ManagedSnapshot snapshot_guard(db_, snapshot);
+    IngestExternalFileOptions ifo;
+    ifo.fail_if_not_bottommost_level = true;
+    ifo.snapshot_consistency = true;
+    const Status s = db_->IngestExternalFile({file_path}, ifo);
+    ASSERT_TRUE(s.IsTryAgain());
+  }
+
+  // Test level compaction
+  options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+  ASSERT_OK(db_->Put(WriteOptions(), "a", "dontcare"));
+  ASSERT_OK(db_->Put(WriteOptions(), "c", "dontcare"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  ASSERT_OK(db_->Put(WriteOptions(), "b", "dontcare"));
+  ASSERT_OK(db_->Put(WriteOptions(), "d", "dontcare"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+    IngestExternalFileOptions ifo;
+    ifo.fail_if_not_bottommost_level = true;
+    const Status s = db_->IngestExternalFile({file_path}, ifo);
+    ASSERT_TRUE(s.IsTryAgain());
+  }
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksum) {
+  const std::string kPutVal = "put_val";
+  const std::string kIngestedVal = "ingested_val";
+
+  ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+  ASSERT_OK(Flush());
+
+  std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
+  {
+    SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+
+    ASSERT_OK(sst_file_writer.Open(external_file));
+    ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+    ASSERT_OK(sst_file_writer.Finish());
+  }
+
+  ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+                                    IngestExternalFileOptions()));
+
+  ASSERT_OK(db_->VerifyChecksum());
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifySstUniqueId) {
+  const std::string kPutVal = "put_val";
+  const std::string kIngestedVal = "ingested_val";
+
+  ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+  ASSERT_OK(Flush());
+
+  std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
+  {
+    SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+
+    ASSERT_OK(sst_file_writer.Open(external_file));
+    ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+    ASSERT_OK(sst_file_writer.Finish());
+  }
+
+  ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+                                    IngestExternalFileOptions()));
+
+  // Test ingest file without session_id and db_id (for example generated by an
+  // older version of sst_writer)
+  SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+        auto props = static_cast<TableProperties*>(props_vs);
+        // update table property session_id to a different one
+        props->db_session_id = "";
+        props->db_id = "";
+      });
+  std::atomic_int skipped = 0, passed = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::SkippedVerifyUniqueId",
+      [&](void* /*arg*/) { skipped++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::Open::PassedVerifyUniqueId",
+      [&](void* /*arg*/) { passed++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto options = CurrentOptions();
+  ASSERT_TRUE(options.verify_sst_unique_id_in_manifest);
+  Reopen(options);
+  ASSERT_EQ(skipped, 0);
+  ASSERT_EQ(passed, 2);  // one flushed + one ingested
+
+  external_file = sst_files_dir_ + "/file_to_ingest2.sst";
+  {
+    SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+
+    ASSERT_OK(sst_file_writer.Open(external_file));
+    ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+    ASSERT_OK(sst_file_writer.Finish());
+  }
+
+  ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+                                    IngestExternalFileOptions()));
+
+  // Two table file opens skipping verification:
+  // * ExternalSstFileIngestionJob::GetIngestedFileInfo
+  // * TableCache::GetTableReader
+  ASSERT_EQ(skipped, 2);
+  ASSERT_EQ(passed, 2);
+
+  // Check same after re-open (except no GetIngestedFileInfo)
+  skipped = 0;
+  passed = 0;
+  Reopen(options);
+  ASSERT_EQ(skipped, 1);
+  ASSERT_EQ(passed, 2);
+}
+
+TEST_F(ExternalSSTFileBasicTest, StableSnapshotWhileLoggingToManifest) {
+  const std::string kPutVal = "put_val";
+  const std::string kIngestedVal = "ingested_val";
+
+  ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+  ASSERT_OK(Flush());
+
+  std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
+  {
+    SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+    ASSERT_OK(sst_file_writer.Open(external_file));
+    ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+    ASSERT_OK(sst_file_writer.Finish());
+  }
+
+  const Snapshot* snapshot = nullptr;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void* /* arg */) {
+        // prevent background compaction job to call this callback
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+        snapshot = db_->GetSnapshot();
+        ReadOptions read_opts;
+        read_opts.snapshot = snapshot;
+        std::string value;
+        ASSERT_OK(db_->Get(read_opts, "k", &value));
+        ASSERT_EQ(kPutVal, value);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+                                    IngestExternalFileOptions()));
+  auto ingested_file_seqno = db_->GetLatestSequenceNumber();
+  ASSERT_NE(nullptr, snapshot);
+  // snapshot is taken before SST ingestion is done
+  ASSERT_EQ(ingested_file_seqno, snapshot->GetSequenceNumber() + 1);
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+  std::string value;
+  ASSERT_OK(db_->Get(read_opts, "k", &value));
+  ASSERT_EQ(kPutVal, value);
+  db_->ReleaseSnapshot(snapshot);
+
+  // After reopen, sequence number should be up current such that
+  // ingested value is read
+  Reopen(CurrentOptions());
+  ASSERT_OK(db_->Get(ReadOptions(), "k", &value));
+  ASSERT_EQ(kIngestedVal, value);
+
+  // New write should get higher seqno compared to ingested file
+  ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+  ASSERT_EQ(db_->GetLatestSequenceNumber(), ingested_file_seqno + 1);
+}
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
+                        testing::Values(std::make_tuple(true, true),
+                                        std::make_tuple(true, false),
+                                        std::make_tuple(false, true),
+                                        std::make_tuple(false, false)));
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.cc b/src/rocksdb/db/external_sst_file_ingestion_job.cc
new file mode 100644
index 000000000..ba1277eab
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.cc
@@ -0,0 +1,1020 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/external_sst_file_ingestion_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ExternalSstFileIngestionJob::Prepare(
+    const std::vector<std::string>& external_files_paths,
+    const std::vector<std::string>& files_checksums,
+    const std::vector<std::string>& files_checksum_func_names,
+    const Temperature& file_temperature, uint64_t next_file_number,
+    SuperVersion* sv) {
+  Status status;
+
+  // Read the information of files we are ingesting
+  for (const std::string& file_path : external_files_paths) {
+    IngestedFileInfo file_to_ingest;
+    status =
+        GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
+    if (!status.ok()) {
+      return status;
+    }
+
+    if (file_to_ingest.cf_id !=
+            TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
+        file_to_ingest.cf_id != cfd_->GetID()) {
+      return Status::InvalidArgument(
+          "External file column family id don't match");
+    }
+
+    if (file_to_ingest.num_entries == 0 &&
+        file_to_ingest.num_range_deletions == 0) {
+      return Status::InvalidArgument("File contain no entries");
+    }
+
+    if (!file_to_ingest.smallest_internal_key.Valid() ||
+        !file_to_ingest.largest_internal_key.Valid()) {
+      return Status::Corruption("Generated table have corrupted keys");
+    }
+
+    files_to_ingest_.emplace_back(std::move(file_to_ingest));
+  }
+
+  const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+  auto num_files = files_to_ingest_.size();
+  if (num_files == 0) {
+    return Status::InvalidArgument("The list of files is empty");
+  } else if (num_files > 1) {
+    // Verify that passed files don't have overlapping ranges
+    autovector<const IngestedFileInfo*> sorted_files;
+    for (size_t i = 0; i < num_files; i++) {
+      sorted_files.push_back(&files_to_ingest_[i]);
+    }
+
+    std::sort(
+        sorted_files.begin(), sorted_files.end(),
+        [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+          return sstableKeyCompare(ucmp, info1->smallest_internal_key,
+                                   info2->smallest_internal_key) < 0;
+        });
+
+    for (size_t i = 0; i + 1 < num_files; i++) {
+      if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
+                            sorted_files[i + 1]->smallest_internal_key) >= 0) {
+        files_overlap_ = true;
+        break;
+      }
+    }
+  }
+
+  // Hanlde the file temperature
+  for (size_t i = 0; i < num_files; i++) {
+    files_to_ingest_[i].file_temperature = file_temperature;
+  }
+
+  if (ingestion_options_.ingest_behind && files_overlap_) {
+    return Status::NotSupported("Files have overlapping ranges");
+  }
+
+  // Copy/Move external files into DB
+  std::unordered_set<size_t> ingestion_path_ids;
+  for (IngestedFileInfo& f : files_to_ingest_) {
+    f.copy_file = false;
+    const std::string path_outside_db = f.external_file_path;
+    const std::string path_inside_db = TableFileName(
+        cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+    if (ingestion_options_.move_files) {
+      status =
+          fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+      if (status.ok()) {
+        // It is unsafe to assume application had sync the file and file
+        // directory before ingest the file. For integrity of RocksDB we need
+        // to sync the file.
+        std::unique_ptr<FSWritableFile> file_to_sync;
+        Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
+                                           &file_to_sync, nullptr);
+        TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
+                                 &s);
+        // Some file systems (especially remote/distributed) don't support
+        // reopening a file for writing and don't require reopening and
+        // syncing the file. Ignore the NotSupported error in that case.
+        if (!s.IsNotSupported()) {
+          status = s;
+          if (status.ok()) {
+            TEST_SYNC_POINT(
+                "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+            status = SyncIngestedFile(file_to_sync.get());
+            TEST_SYNC_POINT(
+                "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+            if (!status.ok()) {
+              ROCKS_LOG_WARN(db_options_.info_log,
+                             "Failed to sync ingested file %s: %s",
+                             path_inside_db.c_str(), status.ToString().c_str());
+            }
+          }
+        }
+      } else if (status.IsNotSupported() &&
+                 ingestion_options_.failed_move_fall_back_to_copy) {
+        // Original file is on a different FS, use copy instead of hard linking.
+        f.copy_file = true;
+        ROCKS_LOG_INFO(db_options_.info_log,
+                       "Triy to link file %s but it's not supported : %s",
+                       path_outside_db.c_str(), status.ToString().c_str());
+      }
+    } else {
+      f.copy_file = true;
+    }
+
+    if (f.copy_file) {
+      TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
+                               nullptr);
+      // CopyFile also sync the new file.
+      status =
+          CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+                   db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+    }
+    TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
+    if (!status.ok()) {
+      break;
+    }
+    f.internal_file_path = path_inside_db;
+    // Initialize the checksum information of ingested files.
+    f.file_checksum = kUnknownFileChecksum;
+    f.file_checksum_func_name = kUnknownFileChecksumFuncName;
+    ingestion_path_ids.insert(f.fd.GetPathId());
+  }
+
+  TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
+  if (status.ok()) {
+    for (auto path_id : ingestion_path_ids) {
+      status = directories_->GetDataDir(path_id)->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Failed to sync directory %" ROCKSDB_PRIszt
+                       " while ingest file: %s",
+                       path_id, status.ToString().c_str());
+        break;
+      }
+    }
+  }
+  TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");
+
+  // Generate and check the sst file checksum. Note that, if
+  // IngestExternalFileOptions::write_global_seqno is true, we will not update
+  // the checksum information in the files_to_ingests_ here, since the file is
+  // upadted with the new global_seqno. After global_seqno is updated, DB will
+  // generate the new checksum and store it in the Manifest. In all other cases
+  // if ingestion_options_.write_global_seqno == true and
+  // verify_file_checksum is false, we only check the checksum function name.
+  if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) {
+    if (ingestion_options_.verify_file_checksum == false &&
+        files_checksums.size() == files_to_ingest_.size() &&
+        files_checksum_func_names.size() == files_to_ingest_.size()) {
+      // Only when verify_file_checksum == false and the checksum for ingested
+      // files are provided, DB will use the provided checksum and does not
+      // generate the checksum for ingested files.
+      need_generate_file_checksum_ = false;
+    } else {
+      need_generate_file_checksum_ = true;
+    }
+    FileChecksumGenContext gen_context;
+    std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+        db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator(
+            gen_context);
+    std::vector<std::string> generated_checksums;
+    std::vector<std::string> generated_checksum_func_names;
+    // Step 1: generate the checksum for ingested sst file.
+    if (need_generate_file_checksum_) {
+      for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+        std::string generated_checksum;
+        std::string generated_checksum_func_name;
+        std::string requested_checksum_func_name;
+        // TODO: rate limit file reads for checksum calculation during file
+        // ingestion.
+        IOStatus io_s = GenerateOneFileChecksum(
+            fs_.get(), files_to_ingest_[i].internal_file_path,
+            db_options_.file_checksum_gen_factory.get(),
+            requested_checksum_func_name, &generated_checksum,
+            &generated_checksum_func_name,
+            ingestion_options_.verify_checksums_readahead_size,
+            db_options_.allow_mmap_reads, io_tracer_,
+            db_options_.rate_limiter.get(),
+            Env::IO_TOTAL /* rate_limiter_priority */);
+        if (!io_s.ok()) {
+          status = io_s;
+          ROCKS_LOG_WARN(db_options_.info_log,
+                         "Sst file checksum generation of file: %s failed: %s",
+                         files_to_ingest_[i].internal_file_path.c_str(),
+                         status.ToString().c_str());
+          break;
+        }
+        if (ingestion_options_.write_global_seqno == false) {
+          files_to_ingest_[i].file_checksum = generated_checksum;
+          files_to_ingest_[i].file_checksum_func_name =
+              generated_checksum_func_name;
+        }
+        generated_checksums.push_back(generated_checksum);
+        generated_checksum_func_names.push_back(generated_checksum_func_name);
+      }
+    }
+
+    // Step 2: based on the verify_file_checksum and ingested checksum
+    // information, do the verification.
+    if (status.ok()) {
+      if (files_checksums.size() == files_to_ingest_.size() &&
+          files_checksum_func_names.size() == files_to_ingest_.size()) {
+        // Verify the checksum and checksum function name.
+        if (ingestion_options_.verify_file_checksum) {
+          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+            if (files_checksum_func_names[i] !=
+                generated_checksum_func_names[i]) {
+              status = Status::InvalidArgument(
+                  "Checksum function name does not match with the checksum "
+                  "function name of this DB");
+              ROCKS_LOG_WARN(
+                  db_options_.info_log,
+                  "Sst file checksum verification of file: %s failed: %s",
+                  external_files_paths[i].c_str(), status.ToString().c_str());
+              break;
+            }
+            if (files_checksums[i] != generated_checksums[i]) {
+              status = Status::Corruption(
+                  "Ingested checksum does not match with the generated "
+                  "checksum");
+              ROCKS_LOG_WARN(
+                  db_options_.info_log,
+                  "Sst file checksum verification of file: %s failed: %s",
+                  files_to_ingest_[i].internal_file_path.c_str(),
+                  status.ToString().c_str());
+              break;
+            }
+          }
+        } else {
+          // If verify_file_checksum is not enabled, we only verify the
+          // checksum function name. If it does not match, fail the ingestion.
+          // If matches, we trust the ingested checksum information and store
+          // in the Manifest.
+          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+            if (files_checksum_func_names[i] != file_checksum_gen->Name()) {
+              status = Status::InvalidArgument(
+                  "Checksum function name does not match with the checksum "
+                  "function name of this DB");
+              ROCKS_LOG_WARN(
+                  db_options_.info_log,
+                  "Sst file checksum verification of file: %s failed: %s",
+                  external_files_paths[i].c_str(), status.ToString().c_str());
+              break;
+            }
+            files_to_ingest_[i].file_checksum = files_checksums[i];
+            files_to_ingest_[i].file_checksum_func_name =
+                files_checksum_func_names[i];
+          }
+        }
+      } else if (files_checksums.size() != files_checksum_func_names.size() ||
+                 (files_checksums.size() == files_checksum_func_names.size() &&
+                  files_checksums.size() != 0)) {
+        // The checksum or checksum function name vector are not both empty
+        // and they are incomplete.
+        status = Status::InvalidArgument(
+            "The checksum information of ingested sst files are nonempty and "
+            "the size of checksums or the size of the checksum function "
+            "names "
+            "does not match with the number of ingested sst files");
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "The ingested sst files checksum information is incomplete: %s",
+            status.ToString().c_str());
+      }
+    }
+  }
+
+  // TODO: The following is duplicated with Cleanup().
+  if (!status.ok()) {
+    IOOptions io_opts;
+    // We failed, remove all files that we copied into the db
+    for (IngestedFileInfo& f : files_to_ingest_) {
+      if (f.internal_file_path.empty()) {
+        continue;
+      }
+      Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+
+  return status;
+}
+
+Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
+                                               SuperVersion* super_version) {
+  autovector<Range> ranges;
+  autovector<std::string> keys;
+  size_t ts_sz = cfd_->user_comparator()->timestamp_size();
+  if (ts_sz) {
+    // Check all ranges [begin, end] inclusively. Add maximum
+    // timestamp to include all `begin` keys, and add minimal timestamp to
+    // include all `end` keys.
+    for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+      std::string begin_str;
+      std::string end_str;
+      AppendUserKeyWithMaxTimestamp(
+          &begin_str, file_to_ingest.smallest_internal_key.user_key(), ts_sz);
+      AppendKeyWithMinTimestamp(
+          &end_str, file_to_ingest.largest_internal_key.user_key(), ts_sz);
+      keys.emplace_back(std::move(begin_str));
+      keys.emplace_back(std::move(end_str));
+    }
+    for (size_t i = 0; i < files_to_ingest_.size(); ++i) {
+      ranges.emplace_back(keys[2 * i], keys[2 * i + 1]);
+    }
+  } else {
+    for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+      ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(),
+                          file_to_ingest.largest_internal_key.user_key());
+    }
+  }
+  Status status = cfd_->RangesOverlapWithMemtables(
+      ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
+  if (status.ok() && *flush_needed &&
+      !ingestion_options_.allow_blocking_flush) {
+    status = Status::InvalidArgument("External file requires flush");
+  }
+  return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ExternalSstFileIngestionJob::Run() {
+  Status status;
+  SuperVersion* super_version = cfd_->GetSuperVersion();
+#ifndef NDEBUG
+  // We should never run the job with a memtable that is overlapping
+  // with the files we are ingesting
+  bool need_flush = false;
+  status = NeedsFlush(&need_flush, super_version);
+  if (!status.ok()) {
+    return status;
+  }
+  if (need_flush) {
+    return Status::TryAgain();
+  }
+  assert(status.ok() && need_flush == false);
+#endif
+
+  bool force_global_seqno = false;
+
+  if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
+    // We need to assign a global sequence number to all the files even
+    // if the don't overlap with any ranges since we have snapshots
+    force_global_seqno = true;
+  }
+  // It is safe to use this instead of LastAllocatedSequence since we are
+  // the only active writer, and hence they are equal
+  SequenceNumber last_seqno = versions_->LastSequence();
+  edit_.SetColumnFamily(cfd_->GetID());
+  // The levels that the files will be ingested into
+
+  for (IngestedFileInfo& f : files_to_ingest_) {
+    SequenceNumber assigned_seqno = 0;
+    if (ingestion_options_.ingest_behind) {
+      status = CheckLevelForIngestedBehindFile(&f);
+    } else {
+      status = AssignLevelAndSeqnoForIngestedFile(
+          super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
+          last_seqno, &f, &assigned_seqno);
+    }
+
+    // Modify the smallest/largest internal key to include the sequence number
+    // that we just learned. Only overwrite sequence number zero. There could
+    // be a nonzero sequence number already to indicate a range tombstone's
+    // exclusive endpoint.
+    ParsedInternalKey smallest_parsed, largest_parsed;
+    if (status.ok()) {
+      status = ParseInternalKey(*f.smallest_internal_key.rep(),
+                                &smallest_parsed, false /* log_err_key */);
+    }
+    if (status.ok()) {
+      status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed,
+                                false /* log_err_key */);
+    }
+    if (!status.ok()) {
+      return status;
+    }
+    if (smallest_parsed.sequence == 0) {
+      UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno,
+                        smallest_parsed.type);
+    }
+    if (largest_parsed.sequence == 0) {
+      UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno,
+                        largest_parsed.type);
+    }
+
+    status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
+    TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
+                             &assigned_seqno);
+    if (assigned_seqno > last_seqno) {
+      assert(assigned_seqno == last_seqno + 1);
+      last_seqno = assigned_seqno;
+      ++consumed_seqno_count_;
+    }
+    if (!status.ok()) {
+      return status;
+    }
+
+    status = GenerateChecksumForIngestedFile(&f);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // We use the import time as the ancester time. This is the time the data
+    // is written to the database.
+    int64_t temp_current_time = 0;
+    uint64_t current_time = kUnknownFileCreationTime;
+    uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+    if (clock_->GetCurrentTime(&temp_current_time).ok()) {
+      current_time = oldest_ancester_time =
+          static_cast<uint64_t>(temp_current_time);
+    }
+    FileMetaData f_metadata(
+        f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
+        f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
+        f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber,
+        oldest_ancester_time, current_time, f.file_checksum,
+        f.file_checksum_func_name, f.unique_id);
+    f_metadata.temperature = f.file_temperature;
+    edit_.AddFile(f.picked_level, f_metadata);
+  }
+  return status;
+}
+
+void ExternalSstFileIngestionJob::UpdateStats() {
+  // Update internal stats for new ingested files
+  uint64_t total_keys = 0;
+  uint64_t total_l0_files = 0;
+  uint64_t total_time = clock_->NowMicros() - job_start_time_;
+
+  EventLoggerStream stream = event_logger_->Log();
+  stream << "event"
+         << "ingest_finished";
+  stream << "files_ingested";
+  stream.StartArray();
+
+  for (IngestedFileInfo& f : files_to_ingest_) {
+    InternalStats::CompactionStats stats(
+        CompactionReason::kExternalSstIngestion, 1);
+    stats.micros = total_time;
+    // If actual copy occurred for this file, then we need to count the file
+    // size as the actual bytes written. If the file was linked, then we ignore
+    // the bytes written for file metadata.
+    // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
+    if (f.copy_file) {
+      stats.bytes_written = f.fd.GetFileSize();
+    } else {
+      stats.bytes_moved = f.fd.GetFileSize();
+    }
+    stats.num_output_files = 1;
+    cfd_->internal_stats()->AddCompactionStats(f.picked_level,
+                                               Env::Priority::USER, stats);
+    cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
+                                       f.fd.GetFileSize());
+    total_keys += f.num_entries;
+    if (f.picked_level == 0) {
+      total_l0_files += 1;
+    }
+    ROCKS_LOG_INFO(
+        db_options_.info_log,
+        "[AddFile] External SST file %s was ingested in L%d with path %s "
+        "(global_seqno=%" PRIu64 ")\n",
+        f.external_file_path.c_str(), f.picked_level,
+        f.internal_file_path.c_str(), f.assigned_seqno);
+    stream << "file" << f.internal_file_path << "level" << f.picked_level;
+  }
+  stream.EndArray();
+
+  stream << "lsm_state";
+  stream.StartArray();
+  auto vstorage = cfd_->current()->storage_info();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+
+  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
+                                     total_keys);
+  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
+                                     files_to_ingest_.size());
+  cfd_->internal_stats()->AddCFStats(
+      InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
+}
+
+void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
+  IOOptions io_opts;
+  if (!status.ok()) {
+    // We failed to add the files to the database
+    // remove all the files we copied
+    for (IngestedFileInfo& f : files_to_ingest_) {
+      if (f.internal_file_path.empty()) {
+        continue;
+      }
+      Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+    consumed_seqno_count_ = 0;
+    files_overlap_ = false;
+  } else if (status.ok() && ingestion_options_.move_files) {
+    // The files were moved and added successfully, remove original file links
+    for (IngestedFileInfo& f : files_to_ingest_) {
+      Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "%s was added to DB successfully but failed to remove original "
+            "file link : %s",
+            f.external_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+}
+
+Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
+    const std::string& external_file, uint64_t new_file_number,
+    IngestedFileInfo* file_to_ingest, SuperVersion* sv) {
+  file_to_ingest->external_file_path = external_file;
+
+  // Get external file size
+  Status status = fs_->GetFileSize(external_file, IOOptions(),
+                                   &file_to_ingest->file_size, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Assign FD with number
+  file_to_ingest->fd =
+      FileDescriptor(new_file_number, 0, file_to_ingest->file_size);
+
+  // Create TableReader for external file
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<FSRandomAccessFile> sst_file;
+  std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+  status =
+      fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+  sst_file_reader.reset(new RandomAccessFileReader(
+      std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
+
+  status = cfd_->ioptions()->table_factory->NewTableReader(
+      TableReaderOptions(
+          *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
+          env_options_, cfd_->internal_comparator(),
+          /*skip_filters*/ false, /*immortal*/ false,
+          /*force_direct_prefetch*/ false, /*level*/ -1,
+          /*block_cache_tracer*/ nullptr,
+          /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
+          /*cur_file_num*/ new_file_number),
+      std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
+  if (!status.ok()) {
+    return status;
+  }
+
+  if (ingestion_options_.verify_checksums_before_ingest) {
+    // If customized readahead size is needed, we can pass a user option
+    // all the way to here. Right now we just rely on the default readahead
+    // to keep things simple.
+    ReadOptions ro;
+    ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
+    status = table_reader->VerifyChecksum(
+        ro, TableReaderCaller::kExternalSSTIngestion);
+  }
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Get the external file properties
+  auto props = table_reader->GetTableProperties();
+  const auto& uprops = props->user_collected_properties;
+
+  // Get table version
+  auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
+  if (version_iter == uprops.end()) {
+    return Status::Corruption("External file version not found");
+  }
+  file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
+
+  auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+  if (file_to_ingest->version == 2) {
+    // version 2 imply that we have global sequence number
+    if (seqno_iter == uprops.end()) {
+      return Status::Corruption(
+          "External file global sequence number not found");
+    }
+
+    // Set the global sequence number
+    file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
+    if (props->external_sst_file_global_seqno_offset == 0) {
+      file_to_ingest->global_seqno_offset = 0;
+      return Status::Corruption("Was not able to find file global seqno field");
+    }
+    file_to_ingest->global_seqno_offset =
+        static_cast<size_t>(props->external_sst_file_global_seqno_offset);
+  } else if (file_to_ingest->version == 1) {
+    // SST file V1 should not have global seqno field
+    assert(seqno_iter == uprops.end());
+    file_to_ingest->original_seqno = 0;
+    if (ingestion_options_.allow_blocking_flush ||
+        ingestion_options_.allow_global_seqno) {
+      return Status::InvalidArgument(
+          "External SST file V1 does not support global seqno");
+    }
+  } else {
+    return Status::InvalidArgument("External file version is not supported");
+  }
+  // Get number of entries in table
+  file_to_ingest->num_entries = props->num_entries;
+  file_to_ingest->num_range_deletions = props->num_range_deletions;
+
+  ParsedInternalKey key;
+  ReadOptions ro;
+  // During reading the external file we can cache blocks that we read into
+  // the block cache, if we later change the global seqno of this file, we will
+  // have block in cache that will include keys with wrong seqno.
+  // We need to disable fill_cache so that we read from the file without
+  // updating the block cache.
+  ro.fill_cache = false;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+  std::unique_ptr<InternalIterator> range_del_iter(
+      table_reader->NewRangeTombstoneIterator(ro));
+
+  // Get first (smallest) and last (largest) key from file.
+  file_to_ingest->smallest_internal_key =
+      InternalKey("", 0, ValueType::kTypeValue);
+  file_to_ingest->largest_internal_key =
+      InternalKey("", 0, ValueType::kTypeValue);
+  bool bounds_set = false;
+  bool allow_data_in_errors = db_options_.allow_data_in_errors;
+  iter->SeekToFirst();
+  if (iter->Valid()) {
+    Status pik_status =
+        ParseInternalKey(iter->key(), &key, allow_data_in_errors);
+    if (!pik_status.ok()) {
+      return Status::Corruption("Corrupted key in external file. ",
+                                pik_status.getState());
+    }
+    if (key.sequence != 0) {
+      return Status::Corruption("External file has non zero sequence number");
+    }
+    file_to_ingest->smallest_internal_key.SetFrom(key);
+
+    iter->SeekToLast();
+    pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors);
+    if (!pik_status.ok()) {
+      return Status::Corruption("Corrupted key in external file. ",
+                                pik_status.getState());
+    }
+    if (key.sequence != 0) {
+      return Status::Corruption("External file has non zero sequence number");
+    }
+    file_to_ingest->largest_internal_key.SetFrom(key);
+
+    bounds_set = true;
+  }
+
+  // We may need to adjust these key bounds, depending on whether any range
+  // deletion tombstones extend past them.
+  const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+  if (range_del_iter != nullptr) {
+    for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+         range_del_iter->Next()) {
+      Status pik_status =
+          ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
+      if (!pik_status.ok()) {
+        return Status::Corruption("Corrupted key in external file. ",
+                                  pik_status.getState());
+      }
+      RangeTombstone tombstone(key, range_del_iter->value());
+
+      InternalKey start_key = tombstone.SerializeKey();
+      if (!bounds_set ||
+          sstableKeyCompare(ucmp, start_key,
+                            file_to_ingest->smallest_internal_key) < 0) {
+        file_to_ingest->smallest_internal_key = start_key;
+      }
+      InternalKey end_key = tombstone.SerializeEndKey();
+      if (!bounds_set ||
+          sstableKeyCompare(ucmp, end_key,
+                            file_to_ingest->largest_internal_key) > 0) {
+        file_to_ingest->largest_internal_key = end_key;
+      }
+      bounds_set = true;
+    }
+  }
+
+  file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+  file_to_ingest->table_properties = *props;
+
+  auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
+                                  props->orig_file_number,
+                                  &(file_to_ingest->unique_id));
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to get SST unique id for file %s",
+                   file_to_ingest->internal_file_path.c_str());
+    file_to_ingest->unique_id = kNullUniqueId64x2;
+  }
+
+  return status;
+}
+
+Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
+    SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
+    SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
+    SequenceNumber* assigned_seqno) {
+  Status status;
+  *assigned_seqno = 0;
+  if (force_global_seqno) {
+    *assigned_seqno = last_seqno + 1;
+    if (compaction_style == kCompactionStyleUniversal || files_overlap_) {
+      if (ingestion_options_.fail_if_not_bottommost_level) {
+        status = Status::TryAgain(
+            "Files cannot be ingested to Lmax. Please make sure key range of "
+            "Lmax does not overlap with files to ingest.");
+        return status;
+      }
+      file_to_ingest->picked_level = 0;
+      return status;
+    }
+  }
+
+  bool overlap_with_db = false;
+  Arena arena;
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  int target_level = 0;
+  auto* vstorage = cfd_->current()->storage_info();
+
+  for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+    if (lvl > 0 && lvl < vstorage->base_level()) {
+      continue;
+    }
+
+    if (vstorage->NumLevelFiles(lvl) > 0) {
+      bool overlap_with_level = false;
+      status = sv->current->OverlapWithLevelIterator(
+          ro, env_options_, file_to_ingest->smallest_internal_key.user_key(),
+          file_to_ingest->largest_internal_key.user_key(), lvl,
+          &overlap_with_level);
+      if (!status.ok()) {
+        return status;
+      }
+      if (overlap_with_level) {
+        // We must use L0 or any level higher than `lvl` to be able to overwrite
+        // the keys that we overlap with in this level, We also need to assign
+        // this file a seqno to overwrite the existing keys in level `lvl`
+        overlap_with_db = true;
+        break;
+      }
+
+      if (compaction_style == kCompactionStyleUniversal && lvl != 0) {
+        const std::vector<FileMetaData*>& level_files =
+            vstorage->LevelFiles(lvl);
+        const SequenceNumber level_largest_seqno =
+            (*std::max_element(level_files.begin(), level_files.end(),
+                               [](FileMetaData* f1, FileMetaData* f2) {
+                                 return f1->fd.largest_seqno <
+                                        f2->fd.largest_seqno;
+                               }))
+                ->fd.largest_seqno;
+        // should only assign seqno to current level's largest seqno when
+        // the file fits
+        if (level_largest_seqno != 0 &&
+            IngestedFileFitInLevel(file_to_ingest, lvl)) {
+          *assigned_seqno = level_largest_seqno;
+        } else {
+          continue;
+        }
+      }
+    } else if (compaction_style == kCompactionStyleUniversal) {
+      continue;
+    }
+
+    // We don't overlap with any keys in this level, but we still need to check
+    // if our file can fit in it
+    if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
+      target_level = lvl;
+    }
+  }
+  // If files overlap, we have to ingest them at level 0 and assign the newest
+  // sequence number
+  if (files_overlap_) {
+    target_level = 0;
+    *assigned_seqno = last_seqno + 1;
+  }
+
+  if (ingestion_options_.fail_if_not_bottommost_level &&
+      target_level < cfd_->NumberLevels() - 1) {
+    status = Status::TryAgain(
+        "Files cannot be ingested to Lmax. Please make sure key range of Lmax "
+        "does not overlap with files to ingest.");
+    return status;
+  }
+
+  TEST_SYNC_POINT_CALLBACK(
+      "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+      &overlap_with_db);
+  file_to_ingest->picked_level = target_level;
+  if (overlap_with_db && *assigned_seqno == 0) {
+    *assigned_seqno = last_seqno + 1;
+  }
+  return status;
+}
+
+Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
+    IngestedFileInfo* file_to_ingest) {
+  auto* vstorage = cfd_->current()->storage_info();
+  // first check if new files fit in the bottommost level
+  int bottom_lvl = cfd_->NumberLevels() - 1;
+  if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
+    return Status::InvalidArgument(
+        "Can't ingest_behind file as it doesn't fit "
+        "at the bottommost level!");
+  }
+
+  // second check if despite allow_ingest_behind=true we still have 0 seqnums
+  // at some upper level
+  for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
+    for (auto file : vstorage->LevelFiles(lvl)) {
+      if (file->fd.smallest_seqno == 0) {
+        return Status::InvalidArgument(
+            "Can't ingest_behind file as despite allow_ingest_behind=true "
+            "there are files with 0 seqno in database at upper levels!");
+      }
+    }
+  }
+
+  file_to_ingest->picked_level = bottom_lvl;
+  return Status::OK();
+}
+
+Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
+    IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+  if (file_to_ingest->original_seqno == seqno) {
+    // This file already have the correct global seqno
+    return Status::OK();
+  } else if (!ingestion_options_.allow_global_seqno) {
+    return Status::InvalidArgument("Global seqno is required, but disabled");
+  } else if (file_to_ingest->global_seqno_offset == 0) {
+    return Status::InvalidArgument(
+        "Trying to set global seqno for a file that don't have a global seqno "
+        "field");
+  }
+
+  if (ingestion_options_.write_global_seqno) {
+    // Determine if we can write global_seqno to a given offset of file.
+    // If the file system does not support random write, then we should not.
+    // Otherwise we should.
+    std::unique_ptr<FSRandomRWFile> rwfile;
+    Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path,
+                                         env_options_, &rwfile, nullptr);
+    TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile",
+                             &status);
+    if (status.ok()) {
+      FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_,
+                              file_to_ingest->internal_file_path);
+      std::string seqno_val;
+      PutFixed64(&seqno_val, seqno);
+      status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val,
+                            IOOptions(), nullptr);
+      if (status.ok()) {
+        TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
+        status = SyncIngestedFile(fsptr.get());
+        TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
+        if (!status.ok()) {
+          ROCKS_LOG_WARN(db_options_.info_log,
+                         "Failed to sync ingested file %s after writing global "
+                         "sequence number: %s",
+                         file_to_ingest->internal_file_path.c_str(),
+                         status.ToString().c_str());
+        }
+      }
+      if (!status.ok()) {
+        return status;
+      }
+    } else if (!status.IsNotSupported()) {
+      return status;
+    }
+  }
+
+  file_to_ingest->assigned_seqno = seqno;
+  return Status::OK();
+}
+
+IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile(
+    IngestedFileInfo* file_to_ingest) {
+  if (db_options_.file_checksum_gen_factory == nullptr ||
+      need_generate_file_checksum_ == false ||
+      ingestion_options_.write_global_seqno == false) {
+    // If file_checksum_gen_factory is not set, we are not able to generate
+    // the checksum. if write_global_seqno is false, it means we will use
+    // file checksum generated during Prepare(). This step will be skipped.
+    return IOStatus::OK();
+  }
+  std::string file_checksum;
+  std::string file_checksum_func_name;
+  std::string requested_checksum_func_name;
+  // TODO: rate limit file reads for checksum calculation during file ingestion.
+  IOStatus io_s = GenerateOneFileChecksum(
+      fs_.get(), file_to_ingest->internal_file_path,
+      db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name,
+      &file_checksum, &file_checksum_func_name,
+      ingestion_options_.verify_checksums_readahead_size,
+      db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
+      Env::IO_TOTAL /* rate_limiter_priority */);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  file_to_ingest->file_checksum = file_checksum;
+  file_to_ingest->file_checksum_func_name = file_checksum_func_name;
+  return IOStatus::OK();
+}
+
+bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
+    const IngestedFileInfo* file_to_ingest, int level) {
+  if (level == 0) {
+    // Files can always fit in L0
+    return true;
+  }
+
+  auto* vstorage = cfd_->current()->storage_info();
+  Slice file_smallest_user_key(
+      file_to_ingest->smallest_internal_key.user_key());
+  Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
+
+  if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
+                               &file_largest_user_key)) {
+    // File overlap with another files in this level, we cannot
+    // add it to this level
+    return false;
+  }
+  if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key,
+                                       file_largest_user_key, level)) {
+    // File overlap with a running compaction output that will be stored
+    // in this level, we cannot add this file to this level
+    return false;
+  }
+
+  // File did not overlap with level files, our compaction output
+  return true;
+}
+
+template <typename TWritableFile>
+Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
+  assert(file != nullptr);
+  if (db_options_.use_fsync) {
+    return file->Fsync(IOOptions(), nullptr);
+  } else {
+    return file->Sync(IOOptions(), nullptr);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.h b/src/rocksdb/db/external_sst_file_ingestion_job.h
new file mode 100644
index 000000000..ce50ae86d
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.h
@@ -0,0 +1,201 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/internal_stats.h"
+#include "db/snapshot_impl.h"
+#include "env/file_system_tracer.h"
+#include "logging/event_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Directories;
+class SystemClock;
+
+struct IngestedFileInfo {
+  // External file path
+  std::string external_file_path;
+  // Smallest internal key in external file
+  InternalKey smallest_internal_key;
+  // Largest internal key in external file
+  InternalKey largest_internal_key;
+  // Sequence number for keys in external file
+  SequenceNumber original_seqno;
+  // Offset of the global sequence number field in the file, will
+  // be zero if version is 1 (global seqno is not supported)
+  size_t global_seqno_offset;
+  // External file size
+  uint64_t file_size;
+  // total number of keys in external file
+  uint64_t num_entries;
+  // total number of range deletions in external file
+  uint64_t num_range_deletions;
+  // Id of column family this file shoule be ingested into
+  uint32_t cf_id;
+  // TableProperties read from external file
+  TableProperties table_properties;
+  // Version of external file
+  int version;
+
+  // FileDescriptor for the file inside the DB
+  FileDescriptor fd;
+  // file path that we picked for file inside the DB
+  std::string internal_file_path;
+  // Global sequence number that we picked for the file inside the DB
+  SequenceNumber assigned_seqno = 0;
+  // Level inside the DB we picked for the external file.
+  int picked_level = 0;
+  // Whether to copy or link the external sst file. copy_file will be set to
+  // false if ingestion_options.move_files is true and underlying FS
+  // supports link operation. Need to provide a default value to make the
+  // undefined-behavior sanity check of llvm happy. Since
+  // ingestion_options.move_files is false by default, thus copy_file is true
+  // by default.
+  bool copy_file = true;
+  // The checksum of ingested file
+  std::string file_checksum;
+  // The name of checksum function that generate the checksum
+  std::string file_checksum_func_name;
+  // The temperature of the file to be ingested
+  Temperature file_temperature = Temperature::kUnknown;
+  // Unique id of the file to be ingested
+  UniqueId64x2 unique_id{};
+};
+
+class ExternalSstFileIngestionJob {
+ public:
+  ExternalSstFileIngestionJob(
+      VersionSet* versions, ColumnFamilyData* cfd,
+      const ImmutableDBOptions& db_options, const EnvOptions& env_options,
+      SnapshotList* db_snapshots,
+      const IngestExternalFileOptions& ingestion_options,
+      Directories* directories, EventLogger* event_logger,
+      const std::shared_ptr<IOTracer>& io_tracer)
+      : clock_(db_options.clock),
+        fs_(db_options.fs, io_tracer),
+        versions_(versions),
+        cfd_(cfd),
+        db_options_(db_options),
+        env_options_(env_options),
+        db_snapshots_(db_snapshots),
+        ingestion_options_(ingestion_options),
+        directories_(directories),
+        event_logger_(event_logger),
+        job_start_time_(clock_->NowMicros()),
+        consumed_seqno_count_(0),
+        io_tracer_(io_tracer) {
+    assert(directories != nullptr);
+  }
+
+  // Prepare the job by copying external files into the DB.
+  Status Prepare(const std::vector<std::string>& external_files_paths,
+                 const std::vector<std::string>& files_checksums,
+                 const std::vector<std::string>& files_checksum_func_names,
+                 const Temperature& file_temperature, uint64_t next_file_number,
+                 SuperVersion* sv);
+
+  // Check if we need to flush the memtable before running the ingestion job
+  // This will be true if the files we are ingesting are overlapping with any
+  // key range in the memtable.
+  //
+  // @param super_version A referenced SuperVersion that will be held for the
+  //    duration of this function.
+  //
+  // Thread-safe
+  Status NeedsFlush(bool* flush_needed, SuperVersion* super_version);
+
+  // Will execute the ingestion job and prepare edit() to be applied.
+  // REQUIRES: Mutex held
+  Status Run();
+
+  // Update column family stats.
+  // REQUIRES: Mutex held
+  void UpdateStats();
+
+  // Cleanup after successful/failed job
+  void Cleanup(const Status& status);
+
+  VersionEdit* edit() { return &edit_; }
+
+  const autovector<IngestedFileInfo>& files_to_ingest() const {
+    return files_to_ingest_;
+  }
+
+  // How many sequence numbers did we consume as part of the ingest job?
+  int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; }
+
+ private:
+  // Open the external file and populate `file_to_ingest` with all the
+  // external information we need to ingest this file.
+  Status GetIngestedFileInfo(const std::string& external_file,
+                             uint64_t new_file_number,
+                             IngestedFileInfo* file_to_ingest,
+                             SuperVersion* sv);
+
+  // Assign `file_to_ingest` the appropriate sequence number and the lowest
+  // possible level that it can be ingested to according to compaction_style.
+  // REQUIRES: Mutex held
+  Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv,
+                                            bool force_global_seqno,
+                                            CompactionStyle compaction_style,
+                                            SequenceNumber last_seqno,
+                                            IngestedFileInfo* file_to_ingest,
+                                            SequenceNumber* assigned_seqno);
+
+  // File that we want to ingest behind always goes to the lowest level;
+  // we just check that it fits in the level, that DB allows ingest_behind,
+  // and that we don't have 0 seqnums at the upper levels.
+  // REQUIRES: Mutex held
+  Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
+
+  // Set the file global sequence number to `seqno`
+  Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
+                                          SequenceNumber seqno);
+  // Generate the file checksum and store in the IngestedFileInfo
+  IOStatus GenerateChecksumForIngestedFile(IngestedFileInfo* file_to_ingest);
+
+  // Check if `file_to_ingest` can fit in level `level`
+  // REQUIRES: Mutex held
+  bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
+                              int level);
+
+  // Helper method to sync given file.
+  template <typename TWritableFile>
+  Status SyncIngestedFile(TWritableFile* file);
+
+  SystemClock* clock_;
+  FileSystemPtr fs_;
+  VersionSet* versions_;
+  ColumnFamilyData* cfd_;
+  const ImmutableDBOptions& db_options_;
+  const EnvOptions& env_options_;
+  SnapshotList* db_snapshots_;
+  autovector<IngestedFileInfo> files_to_ingest_;
+  const IngestExternalFileOptions& ingestion_options_;
+  Directories* directories_;
+  EventLogger* event_logger_;
+  VersionEdit edit_;
+  uint64_t job_start_time_;
+  int consumed_seqno_count_;
+  // Set in ExternalSstFileIngestionJob::Prepare(), if true all files are
+  // ingested in L0
+  bool files_overlap_{false};
+  // Set in ExternalSstFileIngestionJob::Prepare(), if true and DB
+  // file_checksum_gen_factory is set, DB will generate checksum each file.
+  bool need_generate_file_checksum_{true};
+  std::shared_ptr<IOTracer> io_tracer_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_test.cc b/src/rocksdb/db/external_sst_file_test.cc
new file mode 100644
index 000000000..d16f6a58c
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_test.cc
@@ -0,0 +1,2967 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "file/filename.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_reader.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/thread_guard.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A test environment that can be configured to fail the Link operation.
+class ExternalSSTTestEnv : public EnvWrapper {
+ public:
+  ExternalSSTTestEnv(Env* t, bool fail_link)
+      : EnvWrapper(t), fail_link_(fail_link) {}
+  static const char* kClassName() { return "ExternalSSTTestEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    if (fail_link_) {
+      return Status::NotSupported("Link failed");
+    }
+    return target()->LinkFile(s, t);
+  }
+
+  void set_fail_link(bool fail_link) { fail_link_ = fail_link; }
+
+ private:
+  bool fail_link_;
+};
+
+class ExternalSSTFileTestBase : public DBTestBase {
+ public:
+  ExternalSSTFileTestBase()
+      : DBTestBase("external_sst_file_test", /*env_do_fsync=*/true) {
+    sst_files_dir_ = dbname_ + "/sst_files/";
+    DestroyAndRecreateExternalSSTFilesDir();
+  }
+
+  void DestroyAndRecreateExternalSSTFilesDir() {
+    ASSERT_OK(DestroyDir(env_, sst_files_dir_));
+    ASSERT_OK(env_->CreateDir(sst_files_dir_));
+  }
+
+  ~ExternalSSTFileTestBase() override {
+    DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
+  }
+
+ protected:
+  std::string sst_files_dir_;
+};
+
+class ExternSSTFileLinkFailFallbackTest
+    : public ExternalSSTFileTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternSSTFileLinkFailFallbackTest()
+      : test_env_(new ExternalSSTTestEnv(env_, true)) {
+    options_ = CurrentOptions();
+    options_.disable_auto_compactions = true;
+    options_.env = test_env_;
+  }
+
+  void TearDown() override {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options_));
+    delete test_env_;
+    test_env_ = nullptr;
+  }
+
+ protected:
+  Options options_;
+  ExternalSSTTestEnv* test_env_;
+};
+
+class ExternalSSTFileTest
+    : public ExternalSSTFileTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  ExternalSSTFileTest() {}
+
+  Status GenerateOneExternalFile(
+      const Options& options, ColumnFamilyHandle* cfh,
+      std::vector<std::pair<std::string, std::string>>& data, int file_id,
+      bool sort_data, std::string* external_file_path,
+      std::map<std::string, std::string>* true_data) {
+    // Generate a file id if not provided
+    if (-1 == file_id) {
+      file_id = (++last_file_id_);
+    }
+    // Sort data if asked to do so
+    if (sort_data) {
+      std::sort(data.begin(), data.end(),
+                [&](const std::pair<std::string, std::string>& e1,
+                    const std::pair<std::string, std::string>& e2) {
+                  return options.comparator->Compare(e1.first, e2.first) < 0;
+                });
+      auto uniq_iter = std::unique(
+          data.begin(), data.end(),
+          [&](const std::pair<std::string, std::string>& e1,
+              const std::pair<std::string, std::string>& e2) {
+            return options.comparator->Compare(e1.first, e2.first) == 0;
+          });
+      data.resize(uniq_iter - data.begin());
+    }
+    std::string file_path = sst_files_dir_ + std::to_string(file_id);
+    SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+    Status s = sst_file_writer.Open(file_path);
+    if (!s.ok()) {
+      return s;
+    }
+    for (const auto& entry : data) {
+      s = sst_file_writer.Put(entry.first, entry.second);
+      if (!s.ok()) {
+        sst_file_writer.Finish().PermitUncheckedError();
+        return s;
+      }
+    }
+    s = sst_file_writer.Finish();
+    if (s.ok() && external_file_path != nullptr) {
+      *external_file_path = file_path;
+    }
+    if (s.ok() && nullptr != true_data) {
+      for (const auto& entry : data) {
+        true_data->insert({entry.first, entry.second});
+      }
+    }
+    return s;
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options,
+      std::vector<std::pair<std::string, std::string>> data, int file_id = -1,
+      bool allow_global_seqno = false, bool write_global_seqno = false,
+      bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+      bool sort_data = false,
+      std::map<std::string, std::string>* true_data = nullptr,
+      ColumnFamilyHandle* cfh = nullptr) {
+    // Generate a file id if not provided
+    if (file_id == -1) {
+      file_id = last_file_id_ + 1;
+      last_file_id_++;
+    }
+
+    // Sort data if asked to do so
+    if (sort_data) {
+      std::sort(data.begin(), data.end(),
+                [&](const std::pair<std::string, std::string>& e1,
+                    const std::pair<std::string, std::string>& e2) {
+                  return options.comparator->Compare(e1.first, e2.first) < 0;
+                });
+      auto uniq_iter = std::unique(
+          data.begin(), data.end(),
+          [&](const std::pair<std::string, std::string>& e1,
+              const std::pair<std::string, std::string>& e2) {
+            return options.comparator->Compare(e1.first, e2.first) == 0;
+          });
+      data.resize(uniq_iter - data.begin());
+    }
+    std::string file_path = sst_files_dir_ + std::to_string(file_id);
+    SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+
+    Status s = sst_file_writer.Open(file_path);
+    if (!s.ok()) {
+      return s;
+    }
+    for (auto& entry : data) {
+      s = sst_file_writer.Put(entry.first, entry.second);
+      if (!s.ok()) {
+        sst_file_writer.Finish().PermitUncheckedError();
+        return s;
+      }
+    }
+    s = sst_file_writer.Finish();
+
+    if (s.ok()) {
+      IngestExternalFileOptions ifo;
+      ifo.allow_global_seqno = allow_global_seqno;
+      ifo.write_global_seqno = allow_global_seqno ? write_global_seqno : false;
+      ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+      ifo.ingest_behind = ingest_behind;
+      if (cfh) {
+        s = db_->IngestExternalFile(cfh, {file_path}, ifo);
+      } else {
+        s = db_->IngestExternalFile({file_path}, ifo);
+      }
+    }
+
+    if (s.ok() && true_data) {
+      for (auto& entry : data) {
+        (*true_data)[entry.first] = entry.second;
+      }
+    }
+
+    return s;
+  }
+
+  Status GenerateAndAddExternalFiles(
+      const Options& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      const std::vector<IngestExternalFileOptions>& ifos,
+      std::vector<std::vector<std::pair<std::string, std::string>>>& data,
+      int file_id, bool sort_data,
+      std::vector<std::map<std::string, std::string>>& true_data) {
+    if (-1 == file_id) {
+      file_id = (++last_file_id_);
+    }
+    // Generate external SST files, one for each column family
+    size_t num_cfs = column_families.size();
+    assert(ifos.size() == num_cfs);
+    assert(data.size() == num_cfs);
+    std::vector<IngestExternalFileArg> args(num_cfs);
+    for (size_t i = 0; i != num_cfs; ++i) {
+      std::string external_file_path;
+      Status s = GenerateOneExternalFile(
+          options, column_families[i], data[i], file_id, sort_data,
+          &external_file_path,
+          true_data.size() == num_cfs ? &true_data[i] : nullptr);
+      if (!s.ok()) {
+        return s;
+      }
+      ++file_id;
+
+      args[i].column_family = column_families[i];
+      args[i].external_files.push_back(external_file_path);
+      args[i].options = ifos[i];
+    }
+    return db_->IngestExternalFiles(args);
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<std::pair<int, std::string>> data,
+      int file_id = -1, bool allow_global_seqno = false,
+      bool write_global_seqno = false,
+      bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+      bool sort_data = false,
+      std::map<std::string, std::string>* true_data = nullptr,
+      ColumnFamilyHandle* cfh = nullptr) {
+    std::vector<std::pair<std::string, std::string>> file_data;
+    for (auto& entry : data) {
+      file_data.emplace_back(Key(entry.first), entry.second);
+    }
+    return GenerateAndAddExternalFile(options, file_data, file_id,
+                                      allow_global_seqno, write_global_seqno,
+                                      verify_checksums_before_ingest,
+                                      ingest_behind, sort_data, true_data, cfh);
+  }
+
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys, int file_id = -1,
+      bool allow_global_seqno = false, bool write_global_seqno = false,
+      bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+      bool sort_data = false,
+      std::map<std::string, std::string>* true_data = nullptr,
+      ColumnFamilyHandle* cfh = nullptr) {
+    std::vector<std::pair<std::string, std::string>> file_data;
+    for (auto& k : keys) {
+      file_data.emplace_back(Key(k), Key(k) + std::to_string(file_id));
+    }
+    return GenerateAndAddExternalFile(options, file_data, file_id,
+                                      allow_global_seqno, write_global_seqno,
+                                      verify_checksums_before_ingest,
+                                      ingest_behind, sort_data, true_data, cfh);
+  }
+
+  Status DeprecatedAddFile(const std::vector<std::string>& files,
+                           bool move_files = false,
+                           bool skip_snapshot_check = false,
+                           bool skip_write_global_seqno = false) {
+    IngestExternalFileOptions opts;
+    opts.move_files = move_files;
+    opts.snapshot_consistency = !skip_snapshot_check;
+    opts.allow_global_seqno = false;
+    opts.allow_blocking_flush = false;
+    opts.write_global_seqno = !skip_write_global_seqno;
+    return db_->IngestExternalFile(files, opts);
+  }
+
+ protected:
+  int last_file_id_ = 0;
+};
+
+TEST_F(ExternalSSTFileTest, Basic) {
+  do {
+    Options options = CurrentOptions();
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // Current file size should be 0 after sst_file_writer init and before open
+    // a file.
+    ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+    // file1.sst (0 => 99)
+    std::string file1 = sst_files_dir_ + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    for (int k = 0; k < 100; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file1_info;
+    ASSERT_OK(sst_file_writer.Finish(&file1_info));
+
+    // Current file size should be non-zero after success write.
+    ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+    ASSERT_EQ(file1_info.file_path, file1);
+    ASSERT_EQ(file1_info.num_entries, 100);
+    ASSERT_EQ(file1_info.smallest_key, Key(0));
+    ASSERT_EQ(file1_info.largest_key, Key(99));
+    ASSERT_EQ(file1_info.num_range_del_entries, 0);
+    ASSERT_EQ(file1_info.smallest_range_del_key, "");
+    ASSERT_EQ(file1_info.largest_range_del_key, "");
+    // sst_file_writer already finished, cannot add this value
+    ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val"));
+
+    // file2.sst (100 => 199)
+    std::string file2 = sst_files_dir_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    for (int k = 100; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    // Cannot add this key because it's not after last added key
+    ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val"));
+    ExternalSstFileInfo file2_info;
+    ASSERT_OK(sst_file_writer.Finish(&file2_info));
+    ASSERT_EQ(file2_info.file_path, file2);
+    ASSERT_EQ(file2_info.num_entries, 100);
+    ASSERT_EQ(file2_info.smallest_key, Key(100));
+    ASSERT_EQ(file2_info.largest_key, Key(199));
+
+    // file3.sst (195 => 299)
+    // This file values overlap with file2 values
+    std::string file3 = sst_files_dir_ + "file3.sst";
+    ASSERT_OK(sst_file_writer.Open(file3));
+    for (int k = 195; k < 300; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file3_info;
+    ASSERT_OK(sst_file_writer.Finish(&file3_info));
+
+    // Current file size should be non-zero after success finish.
+    ASSERT_GT(sst_file_writer.FileSize(), 0);
+    ASSERT_EQ(file3_info.file_path, file3);
+    ASSERT_EQ(file3_info.num_entries, 105);
+    ASSERT_EQ(file3_info.smallest_key, Key(195));
+    ASSERT_EQ(file3_info.largest_key, Key(299));
+
+    // file4.sst (30 => 39)
+    // This file values overlap with file1 values
+    std::string file4 = sst_files_dir_ + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    for (int k = 30; k < 40; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file4_info;
+    ASSERT_OK(sst_file_writer.Finish(&file4_info));
+    ASSERT_EQ(file4_info.file_path, file4);
+    ASSERT_EQ(file4_info.num_entries, 10);
+    ASSERT_EQ(file4_info.smallest_key, Key(30));
+    ASSERT_EQ(file4_info.largest_key, Key(39));
+
+    // file5.sst (400 => 499)
+    std::string file5 = sst_files_dir_ + "file5.sst";
+    ASSERT_OK(sst_file_writer.Open(file5));
+    for (int k = 400; k < 500; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file5_info;
+    ASSERT_OK(sst_file_writer.Finish(&file5_info));
+    ASSERT_EQ(file5_info.file_path, file5);
+    ASSERT_EQ(file5_info.num_entries, 100);
+    ASSERT_EQ(file5_info.smallest_key, Key(400));
+    ASSERT_EQ(file5_info.largest_key, Key(499));
+
+    // file6.sst (delete 400 => 500)
+    std::string file6 = sst_files_dir_ + "file6.sst";
+    ASSERT_OK(sst_file_writer.Open(file6));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500)));
+    ExternalSstFileInfo file6_info;
+    ASSERT_OK(sst_file_writer.Finish(&file6_info));
+    ASSERT_EQ(file6_info.file_path, file6);
+    ASSERT_EQ(file6_info.num_entries, 0);
+    ASSERT_EQ(file6_info.smallest_key, "");
+    ASSERT_EQ(file6_info.largest_key, "");
+    ASSERT_EQ(file6_info.num_range_del_entries, 1);
+    ASSERT_EQ(file6_info.smallest_range_del_key, Key(400));
+    ASSERT_EQ(file6_info.largest_range_del_key, Key(500));
+
+    // file7.sst (delete 500 => 570, put 520 => 599 divisible by 2)
+    std::string file7 = sst_files_dir_ + "file7.sst";
+    ASSERT_OK(sst_file_writer.Open(file7));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(500), Key(550)));
+    for (int k = 520; k < 560; k += 2) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(525), Key(575)));
+    for (int k = 560; k < 600; k += 2) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file7_info;
+    ASSERT_OK(sst_file_writer.Finish(&file7_info));
+    ASSERT_EQ(file7_info.file_path, file7);
+    ASSERT_EQ(file7_info.num_entries, 40);
+    ASSERT_EQ(file7_info.smallest_key, Key(520));
+    ASSERT_EQ(file7_info.largest_key, Key(598));
+    ASSERT_EQ(file7_info.num_range_del_entries, 2);
+    ASSERT_EQ(file7_info.smallest_range_del_key, Key(500));
+    ASSERT_EQ(file7_info.largest_range_del_key, Key(575));
+
+    // file8.sst (delete 600 => 700)
+    std::string file8 = sst_files_dir_ + "file8.sst";
+    ASSERT_OK(sst_file_writer.Open(file8));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(600), Key(700)));
+    ExternalSstFileInfo file8_info;
+    ASSERT_OK(sst_file_writer.Finish(&file8_info));
+    ASSERT_EQ(file8_info.file_path, file8);
+    ASSERT_EQ(file8_info.num_entries, 0);
+    ASSERT_EQ(file8_info.smallest_key, "");
+    ASSERT_EQ(file8_info.largest_key, "");
+    ASSERT_EQ(file8_info.num_range_del_entries, 1);
+    ASSERT_EQ(file8_info.smallest_range_del_key, Key(600));
+    ASSERT_EQ(file8_info.largest_range_del_key, Key(700));
+
+    // Cannot create an empty sst file
+    std::string file_empty = sst_files_dir_ + "file_empty.sst";
+    ExternalSstFileInfo file_empty_info;
+    ASSERT_NOK(sst_file_writer.Finish(&file_empty_info));
+
+    DestroyAndReopen(options);
+    // Add file using file path
+    ASSERT_OK(DeprecatedAddFile({file1}));
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 100; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    // Add file while holding a snapshot will fail
+    const Snapshot* s1 = db_->GetSnapshot();
+    if (s1 != nullptr) {
+      ASSERT_NOK(DeprecatedAddFile({file2}));
+      db_->ReleaseSnapshot(s1);
+    }
+    // We can add the file after releaseing the snapshot
+    ASSERT_OK(DeprecatedAddFile({file2}));
+
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 200; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    // This file has overlapping values with the existing data
+    ASSERT_NOK(DeprecatedAddFile({file3}));
+
+    // This file has overlapping values with the existing data
+    ASSERT_NOK(DeprecatedAddFile({file4}));
+
+    // Overwrite values of keys divisible by 5
+    for (int k = 0; k < 200; k += 5) {
+      ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+    }
+    ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+    // Key range of file5 (400 => 499) don't overlap with any keys in DB
+    ASSERT_OK(DeprecatedAddFile({file5}));
+
+    // This file has overlapping values with the existing data
+    ASSERT_NOK(DeprecatedAddFile({file6}));
+
+    // Key range of file7 (500 => 598) don't overlap with any keys in DB
+    ASSERT_OK(DeprecatedAddFile({file7}));
+
+    // Key range of file7 (600 => 700) don't overlap with any keys in DB
+    ASSERT_OK(DeprecatedAddFile({file8}));
+
+    // Make sure values are correct before and after flush/compaction
+    for (int i = 0; i < 2; i++) {
+      for (int k = 0; k < 200; k++) {
+        std::string value = Key(k) + "_val";
+        if (k % 5 == 0) {
+          value += "_new";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      for (int k = 400; k < 500; k++) {
+        std::string value = Key(k) + "_val";
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      for (int k = 500; k < 600; k++) {
+        std::string value = Key(k) + "_val";
+        if (k < 520 || k % 2 == 1) {
+          value = "NOT_FOUND";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    Close();
+    options.disable_auto_compactions = true;
+    Reopen(options);
+
+    // Delete keys in range (400 => 499)
+    for (int k = 400; k < 500; k++) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    // We deleted range (400 => 499) but cannot add file5 because
+    // of the range tombstones
+    ASSERT_NOK(DeprecatedAddFile({file5}));
+
+    // Compacting the DB will remove the tombstones
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    // Now we can add the file
+    ASSERT_OK(DeprecatedAddFile({file5}));
+
+    // Verify values of file5 in DB
+    for (int k = 400; k < 500; k++) {
+      std::string value = Key(k) + "_val";
+      ASSERT_EQ(Get(Key(k)), value);
+    }
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+                         kRangeDelSkipConfigs));
+}
+
+class SstFileWriterCollector : public TablePropertiesCollector {
+ public:
+  explicit SstFileWriterCollector(const std::string prefix) : prefix_(prefix) {
+    name_ = prefix_ + "_SstFileWriterCollector";
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string count = std::to_string(count_);
+    *properties = UserCollectedProperties{
+        {prefix_ + "_SstFileWriterCollector", "YES"},
+        {prefix_ + "_Count", count},
+    };
+    return Status::OK();
+  }
+
+  Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+                    EntryType /*type*/, SequenceNumber /*seq*/,
+                    uint64_t /*file_size*/) override {
+    ++count_;
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+  std::string prefix_;
+  std::string name_;
+};
+
+class SstFileWriterCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+  explicit SstFileWriterCollectorFactory(std::string prefix)
+      : prefix_(prefix), num_created_(0) {}
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    num_created_++;
+    return new SstFileWriterCollector(prefix_);
+  }
+  const char* Name() const override { return "SstFileWriterCollectorFactory"; }
+
+  std::string prefix_;
+  uint32_t num_created_;
+};
+
+TEST_F(ExternalSSTFileTest, AddList) {
+  do {
+    Options options = CurrentOptions();
+
+    auto abc_collector = std::make_shared<SstFileWriterCollectorFactory>("abc");
+    auto xyz_collector = std::make_shared<SstFileWriterCollectorFactory>("xyz");
+
+    options.table_properties_collector_factories.emplace_back(abc_collector);
+    options.table_properties_collector_factories.emplace_back(xyz_collector);
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // file1.sst (0 => 99)
+    std::string file1 = sst_files_dir_ + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    for (int k = 0; k < 100; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file1_info;
+    ASSERT_OK(sst_file_writer.Finish(&file1_info));
+    ASSERT_EQ(file1_info.file_path, file1);
+    ASSERT_EQ(file1_info.num_entries, 100);
+    ASSERT_EQ(file1_info.smallest_key, Key(0));
+    ASSERT_EQ(file1_info.largest_key, Key(99));
+    // sst_file_writer already finished, cannot add this value
+    ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val"));
+
+    // file2.sst (100 => 199)
+    std::string file2 = sst_files_dir_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    for (int k = 100; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    // Cannot add this key because it's not after last added key
+    ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val"));
+    ExternalSstFileInfo file2_info;
+    ASSERT_OK(sst_file_writer.Finish(&file2_info));
+    ASSERT_EQ(file2_info.file_path, file2);
+    ASSERT_EQ(file2_info.num_entries, 100);
+    ASSERT_EQ(file2_info.smallest_key, Key(100));
+    ASSERT_EQ(file2_info.largest_key, Key(199));
+
+    // file3.sst (195 => 199)
+    // This file values overlap with file2 values
+    std::string file3 = sst_files_dir_ + "file3.sst";
+    ASSERT_OK(sst_file_writer.Open(file3));
+    for (int k = 195; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file3_info;
+    ASSERT_OK(sst_file_writer.Finish(&file3_info));
+    ASSERT_EQ(file3_info.file_path, file3);
+    ASSERT_EQ(file3_info.num_entries, 5);
+    ASSERT_EQ(file3_info.smallest_key, Key(195));
+    ASSERT_EQ(file3_info.largest_key, Key(199));
+
+    // file4.sst (30 => 39)
+    // This file values overlap with file1 values
+    std::string file4 = sst_files_dir_ + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    for (int k = 30; k < 40; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file4_info;
+    ASSERT_OK(sst_file_writer.Finish(&file4_info));
+    ASSERT_EQ(file4_info.file_path, file4);
+    ASSERT_EQ(file4_info.num_entries, 10);
+    ASSERT_EQ(file4_info.smallest_key, Key(30));
+    ASSERT_EQ(file4_info.largest_key, Key(39));
+
+    // file5.sst (200 => 299)
+    std::string file5 = sst_files_dir_ + "file5.sst";
+    ASSERT_OK(sst_file_writer.Open(file5));
+    for (int k = 200; k < 300; k++) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file5_info;
+    ASSERT_OK(sst_file_writer.Finish(&file5_info));
+    ASSERT_EQ(file5_info.file_path, file5);
+    ASSERT_EQ(file5_info.num_entries, 100);
+    ASSERT_EQ(file5_info.smallest_key, Key(200));
+    ASSERT_EQ(file5_info.largest_key, Key(299));
+
+    // file6.sst (delete 0 => 100)
+    std::string file6 = sst_files_dir_ + "file6.sst";
+    ASSERT_OK(sst_file_writer.Open(file6));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75)));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100)));
+    ExternalSstFileInfo file6_info;
+    ASSERT_OK(sst_file_writer.Finish(&file6_info));
+    ASSERT_EQ(file6_info.file_path, file6);
+    ASSERT_EQ(file6_info.num_entries, 0);
+    ASSERT_EQ(file6_info.smallest_key, "");
+    ASSERT_EQ(file6_info.largest_key, "");
+    ASSERT_EQ(file6_info.num_range_del_entries, 2);
+    ASSERT_EQ(file6_info.smallest_range_del_key, Key(0));
+    ASSERT_EQ(file6_info.largest_range_del_key, Key(100));
+
+    // file7.sst (delete 99 => 201)
+    std::string file7 = sst_files_dir_ + "file7.sst";
+    ASSERT_OK(sst_file_writer.Open(file7));
+    ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201)));
+    ExternalSstFileInfo file7_info;
+    ASSERT_OK(sst_file_writer.Finish(&file7_info));
+    ASSERT_EQ(file7_info.file_path, file7);
+    ASSERT_EQ(file7_info.num_entries, 0);
+    ASSERT_EQ(file7_info.smallest_key, "");
+    ASSERT_EQ(file7_info.largest_key, "");
+    ASSERT_EQ(file7_info.num_range_del_entries, 1);
+    ASSERT_EQ(file7_info.smallest_range_del_key, Key(99));
+    ASSERT_EQ(file7_info.largest_range_del_key, Key(201));
+
+    // list 1 has internal key range conflict
+    std::vector<std::string> file_list0({file1, file2});
+    std::vector<std::string> file_list1({file3, file2, file1});
+    std::vector<std::string> file_list2({file5});
+    std::vector<std::string> file_list3({file3, file4});
+    std::vector<std::string> file_list4({file5, file7});
+    std::vector<std::string> file_list5({file6, file7});
+
+    DestroyAndReopen(options);
+
+    // These lists of files have key ranges that overlap with each other
+    ASSERT_NOK(DeprecatedAddFile(file_list1));
+    // Both of the following overlap on the range deletion tombstone.
+    ASSERT_NOK(DeprecatedAddFile(file_list4));
+    ASSERT_NOK(DeprecatedAddFile(file_list5));
+
+    // Add files using file path list
+    ASSERT_OK(DeprecatedAddFile(file_list0));
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 200; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    TablePropertiesCollection props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(props.size(), 2);
+    for (auto file_props : props) {
+      auto user_props = file_props.second->user_collected_properties;
+      ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["abc_Count"], "100");
+      ASSERT_EQ(user_props["xyz_Count"], "100");
+    }
+
+    // Add file while holding a snapshot will fail
+    const Snapshot* s1 = db_->GetSnapshot();
+    if (s1 != nullptr) {
+      ASSERT_NOK(DeprecatedAddFile(file_list2));
+      db_->ReleaseSnapshot(s1);
+    }
+    // We can add the file after releaseing the snapshot
+    ASSERT_OK(DeprecatedAddFile(file_list2));
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 300; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
+
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(props.size(), 3);
+    for (auto file_props : props) {
+      auto user_props = file_props.second->user_collected_properties;
+      ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+      ASSERT_EQ(user_props["abc_Count"], "100");
+      ASSERT_EQ(user_props["xyz_Count"], "100");
+    }
+
+    // This file list has overlapping values with the existing data
+    ASSERT_NOK(DeprecatedAddFile(file_list3));
+
+    // Overwrite values of keys divisible by 5
+    for (int k = 0; k < 200; k += 5) {
+      ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+    }
+    ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+    // Make sure values are correct before and after flush/compaction
+    for (int i = 0; i < 2; i++) {
+      for (int k = 0; k < 200; k++) {
+        std::string value = Key(k) + "_val";
+        if (k % 5 == 0) {
+          value += "_new";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      for (int k = 200; k < 300; k++) {
+        std::string value = Key(k) + "_val";
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    // Delete keys in range (200 => 299)
+    for (int k = 200; k < 300; k++) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    // We deleted range (200 => 299) but cannot add file5 because
+    // of the range tombstones
+    ASSERT_NOK(DeprecatedAddFile(file_list2));
+
+    // Compacting the DB will remove the tombstones
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    // Now we can add the file
+    ASSERT_OK(DeprecatedAddFile(file_list2));
+
+    // Verify values of file5 in DB
+    for (int k = 200; k < 300; k++) {
+      std::string value = Key(k) + "_val";
+      ASSERT_EQ(Get(Key(k)), value);
+    }
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+                         kRangeDelSkipConfigs));
+}
+
+TEST_F(ExternalSSTFileTest, AddListAtomicity) {
+  do {
+    Options options = CurrentOptions();
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // files[0].sst (0 => 99)
+    // files[1].sst (100 => 199)
+    // ...
+    // file[8].sst (800 => 899)
+    int n = 9;
+    std::vector<std::string> files(n);
+    std::vector<ExternalSstFileInfo> files_info(n);
+    for (int i = 0; i < n; i++) {
+      files[i] = sst_files_dir_ + "file" + std::to_string(i) + ".sst";
+      ASSERT_OK(sst_file_writer.Open(files[i]));
+      for (int k = i * 100; k < (i + 1) * 100; k++) {
+        ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+      }
+      ASSERT_OK(sst_file_writer.Finish(&files_info[i]));
+      ASSERT_EQ(files_info[i].file_path, files[i]);
+      ASSERT_EQ(files_info[i].num_entries, 100);
+      ASSERT_EQ(files_info[i].smallest_key, Key(i * 100));
+      ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1));
+    }
+    files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst");
+    ASSERT_NOK(DeprecatedAddFile(files));
+    for (int k = 0; k < n * 100; k++) {
+      ASSERT_EQ("NOT_FOUND", Get(Key(k)));
+    }
+    files.pop_back();
+    ASSERT_OK(DeprecatedAddFile(files));
+    for (int k = 0; k < n * 100; k++) {
+      std::string value = Key(k) + "_val";
+      ASSERT_EQ(Get(Key(k)), value);
+    }
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+// This test reporduce a bug that can happen in some cases if the DB started
+// purging obsolete files when we are adding an external sst file.
+// This situation may result in deleting the file while it's being added.
+TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) {
+  Options options = CurrentOptions();
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file1.sst (0 => 500)
+  std::string sst_file_path = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(sst_file_path));
+  for (int i = 0; i < 500; i++) {
+    std::string k = Key(i);
+    ASSERT_OK(sst_file_writer.Put(k, k + "_val"));
+  }
+
+  ExternalSstFileInfo sst_file_info;
+  ASSERT_OK(sst_file_writer.Finish(&sst_file_info));
+
+  options.delete_obsolete_files_period_micros = 0;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Prepare:FileAdded", [&](void* /* arg */) {
+        ASSERT_OK(Put("aaa", "bbb"));
+        ASSERT_OK(Flush());
+        ASSERT_OK(Put("aaa", "xxx"));
+        ASSERT_OK(Flush());
+        ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(DeprecatedAddFile({sst_file_path}));
+
+  for (int i = 0; i < 500; i++) {
+    std::string k = Key(i);
+    std::string v = k + "_val";
+    ASSERT_EQ(Get(k), v);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, SkipSnapshot) {
+  Options options = CurrentOptions();
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_dir_ + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  ASSERT_OK(sst_file_writer.Finish(&file1_info));
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+
+  // file2.sst (100 => 299)
+  std::string file2 = sst_files_dir_ + "file2.sst";
+  ASSERT_OK(sst_file_writer.Open(file2));
+  for (int k = 100; k < 300; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file2_info;
+  ASSERT_OK(sst_file_writer.Finish(&file2_info));
+  ASSERT_EQ(file2_info.file_path, file2);
+  ASSERT_EQ(file2_info.num_entries, 200);
+  ASSERT_EQ(file2_info.smallest_key, Key(100));
+  ASSERT_EQ(file2_info.largest_key, Key(299));
+
+  ASSERT_OK(DeprecatedAddFile({file1}));
+
+  // Add file will fail when holding snapshot and use the default
+  // skip_snapshot_check to false
+  const Snapshot* s1 = db_->GetSnapshot();
+  if (s1 != nullptr) {
+    ASSERT_NOK(DeprecatedAddFile({file2}));
+  }
+
+  // Add file will success when set skip_snapshot_check to true even db holding
+  // snapshot
+  if (s1 != nullptr) {
+    ASSERT_OK(DeprecatedAddFile({file2}, false, true));
+    db_->ReleaseSnapshot(s1);
+  }
+
+  // file3.sst (300 => 399)
+  std::string file3 = sst_files_dir_ + "file3.sst";
+  ASSERT_OK(sst_file_writer.Open(file3));
+  for (int k = 300; k < 400; k++) {
+    ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file3_info;
+  ASSERT_OK(sst_file_writer.Finish(&file3_info));
+  ASSERT_EQ(file3_info.file_path, file3);
+  ASSERT_EQ(file3_info.num_entries, 100);
+  ASSERT_EQ(file3_info.smallest_key, Key(300));
+  ASSERT_EQ(file3_info.largest_key, Key(399));
+
+  // check that we have change the old key
+  ASSERT_EQ(Get(Key(300)), "NOT_FOUND");
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(DeprecatedAddFile({file3}, false, true));
+  ASSERT_EQ(Get(Key(300)), Key(300) + ("_val"));
+  ASSERT_EQ(Get(Key(300), s2), Key(300) + ("_val"));
+
+  db_->ReleaseSnapshot(s2);
+}
+
+TEST_F(ExternalSSTFileTest, MultiThreaded) {
+  env_->skip_fsync_ = true;
+  // Bulk load 10 files every file contain 1000 keys
+  int num_files = 10;
+  int keys_per_file = 1000;
+
+  // Generate file names
+  std::vector<std::string> file_names;
+  for (int i = 0; i < num_files; i++) {
+    std::string file_name = "file_" + std::to_string(i) + ".sst";
+    file_names.push_back(sst_files_dir_ + file_name);
+  }
+
+  do {
+    Options options = CurrentOptions();
+
+    std::atomic<int> thread_num(0);
+    std::function<void()> write_file_func = [&]() {
+      int file_idx = thread_num.fetch_add(1);
+      int range_start = file_idx * keys_per_file;
+      int range_end = range_start + keys_per_file;
+
+      SstFileWriter sst_file_writer(EnvOptions(), options);
+
+      ASSERT_OK(sst_file_writer.Open(file_names[file_idx]));
+
+      for (int k = range_start; k < range_end; k++) {
+        ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+      }
+
+      ASSERT_OK(sst_file_writer.Finish());
+    };
+    // Write num_files files in parallel
+    std::vector<port::Thread> sst_writer_threads;
+    for (int i = 0; i < num_files; ++i) {
+      sst_writer_threads.emplace_back(write_file_func);
+    }
+
+    for (auto& t : sst_writer_threads) {
+      t.join();
+    }
+
+    fprintf(stderr, "Wrote %d files (%d keys)\n", num_files,
+            num_files * keys_per_file);
+
+    thread_num.store(0);
+    std::atomic<int> files_added(0);
+    // Thread 0 -> Load {f0,f1}
+    // Thread 1 -> Load {f0,f1}
+    // Thread 2 -> Load {f2,f3}
+    // Thread 3 -> Load {f2,f3}
+    // Thread 4 -> Load {f4,f5}
+    // Thread 5 -> Load {f4,f5}
+    // ...
+    std::function<void()> load_file_func = [&]() {
+      // We intentionally add every file twice, and assert that it was added
+      // only once and the other add failed
+      int thread_id = thread_num.fetch_add(1);
+      int file_idx = (thread_id / 2) * 2;
+      // sometimes we use copy, sometimes link .. the result should be the same
+      bool move_file = (thread_id % 3 == 0);
+
+      std::vector<std::string> files_to_add;
+
+      files_to_add = {file_names[file_idx]};
+      if (static_cast<size_t>(file_idx + 1) < file_names.size()) {
+        files_to_add.push_back(file_names[file_idx + 1]);
+      }
+
+      Status s = DeprecatedAddFile(files_to_add, move_file);
+      if (s.ok()) {
+        files_added += static_cast<int>(files_to_add.size());
+      }
+    };
+
+    // Bulk load num_files files in parallel
+    std::vector<port::Thread> add_file_threads;
+    DestroyAndReopen(options);
+    for (int i = 0; i < num_files; ++i) {
+      add_file_threads.emplace_back(load_file_func);
+    }
+
+    for (auto& t : add_file_threads) {
+      t.join();
+    }
+    ASSERT_EQ(files_added.load(), num_files);
+    fprintf(stderr, "Loaded %d files (%d keys)\n", num_files,
+            num_files * keys_per_file);
+
+    // Overwrite values of keys divisible by 100
+    for (int k = 0; k < num_files * keys_per_file; k += 100) {
+      std::string key = Key(k);
+      ASSERT_OK(Put(key, key + "_new"));
+    }
+
+    for (int i = 0; i < 2; i++) {
+      // Make sure the values are correct before and after flush/compaction
+      for (int k = 0; k < num_files * keys_per_file; ++k) {
+        std::string key = Key(k);
+        std::string value = (k % 100 == 0) ? (key + "_new") : key;
+        ASSERT_EQ(Get(key), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    fprintf(stderr, "Verified %d values\n", num_files * keys_per_file);
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_F(ExternalSSTFileTest, OverlappingRanges) {
+  env_->skip_fsync_ = true;
+  Random rnd(301);
+  SequenceNumber assigned_seqno = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Run", [&assigned_seqno](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        assigned_seqno = *(static_cast<SequenceNumber*>(arg));
+      });
+  bool need_flush = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::IngestExternalFile:NeedFlush", [&need_flush](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        need_flush = *(static_cast<bool*>(arg));
+      });
+  bool overlap_with_db = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+      [&overlap_with_db](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        overlap_with_db = *(static_cast<bool*>(arg));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  do {
+    Options options = CurrentOptions();
+    env_->skip_fsync_ = true;
+    DestroyAndReopen(options);
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    printf("Option config = %d\n", option_config_);
+    std::vector<std::pair<int, int>> key_ranges;
+    for (int i = 0; i < 100; i++) {
+      int range_start = rnd.Uniform(20000);
+      int keys_per_range = 10 + rnd.Uniform(41);
+
+      key_ranges.emplace_back(range_start, range_start + keys_per_range);
+    }
+
+    int memtable_add = 0;
+    int success_add_file = 0;
+    int failed_add_file = 0;
+    std::map<std::string, std::string> true_data;
+    for (size_t i = 0; i < key_ranges.size(); i++) {
+      int range_start = key_ranges[i].first;
+      int range_end = key_ranges[i].second;
+
+      Status s;
+      std::string range_val = "range_" + std::to_string(i);
+
+      // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile
+      if (i && i % 5 == 0) {
+        // Use DB::Put to insert range (insert into memtable)
+        range_val += "_put";
+        for (int k = range_start; k <= range_end; k++) {
+          s = Put(Key(k), range_val);
+          ASSERT_OK(s);
+        }
+        memtable_add++;
+      } else {
+        // Use DB::AddFile to insert range
+        range_val += "_add_file";
+
+        // Generate the file containing the range
+        std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+        s = sst_file_writer.Open(file_name);
+        ASSERT_OK(s);
+        for (int k = range_start; k <= range_end; k++) {
+          s = sst_file_writer.Put(Key(k), range_val);
+          ASSERT_OK(s);
+        }
+        ExternalSstFileInfo file_info;
+        s = sst_file_writer.Finish(&file_info);
+        ASSERT_OK(s);
+
+        // Insert the generated file
+        s = DeprecatedAddFile({file_name});
+        auto it = true_data.lower_bound(Key(range_start));
+        if (option_config_ != kUniversalCompaction &&
+            option_config_ != kUniversalCompactionMultiLevel &&
+            option_config_ != kUniversalSubcompactions) {
+          if (it != true_data.end() && it->first <= Key(range_end)) {
+            // This range overlap with data already exist in DB
+            ASSERT_NOK(s);
+            failed_add_file++;
+          } else {
+            ASSERT_OK(s);
+            success_add_file++;
+          }
+        } else {
+          if ((it != true_data.end() && it->first <= Key(range_end)) ||
+              need_flush || assigned_seqno > 0 || overlap_with_db) {
+            // This range overlap with data already exist in DB
+            ASSERT_NOK(s);
+            failed_add_file++;
+          } else {
+            ASSERT_OK(s);
+            success_add_file++;
+          }
+        }
+      }
+
+      if (s.ok()) {
+        // Update true_data map to include the new inserted data
+        for (int k = range_start; k <= range_end; k++) {
+          true_data[Key(k)] = range_val;
+        }
+      }
+
+      // Flush / Compact the DB
+      if (i && i % 50 == 0) {
+        ASSERT_OK(Flush());
+      }
+      if (i && i % 75 == 0) {
+        ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      }
+    }
+
+    printf("Total: %" ROCKSDB_PRIszt
+           " ranges\n"
+           "AddFile()|Success: %d ranges\n"
+           "AddFile()|RangeConflict: %d ranges\n"
+           "Put(): %d ranges\n",
+           key_ranges.size(), success_add_file, failed_add_file, memtable_add);
+
+    // Verify the correctness of the data
+    for (const auto& kv : true_data) {
+      ASSERT_EQ(Get(kv.first), kv.second);
+    }
+    printf("keys/values verified\n");
+    DestroyAndRecreateExternalSSTFilesDir();
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_P(ExternalSSTFileTest, PickedLevel) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = false;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  DestroyAndReopen(options);
+
+  std::map<std::string, std::string> true_data;
+
+  // File 0 will go to last level (L3)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, -1, false, false, true,
+                                       false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "0,0,0,1");
+
+  // File 1 will go to level L2 (since it overlap with file 0 in L3)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, -1, false, false, true,
+                                       false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "0,0,1,1");
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"ExternalSSTFileTest::PickedLevel:0", "BackgroundCallCompaction:0"},
+      {"DBImpl::BackgroundCompaction:Start",
+       "ExternalSSTFileTest::PickedLevel:1"},
+      {"ExternalSSTFileTest::PickedLevel:2",
+       "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Flush 4 files containing the same keys
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put(Key(3), Key(3) + "put"));
+    ASSERT_OK(Put(Key(8), Key(8) + "put"));
+    true_data[Key(3)] = Key(3) + "put";
+    true_data[Key(8)] = Key(8) + "put";
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for BackgroundCompaction() to be called
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:0");
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:1");
+
+  EXPECT_EQ(FilesPerLevel(), "4,0,1,1");
+
+  // This file overlaps with file 0 (L3), file 1 (L2) and the
+  // output of compaction going to L1
+  ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, true,
+                                       false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "5,0,1,1");
+
+  // This file does not overlap with any file or with the running compaction
+  ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+                                       false, false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "5,0,1,2");
+
+  // Hold compaction from finishing
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2");
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  EXPECT_EQ(FilesPerLevel(), "1,1,1,2");
+
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, PickedLevelBug) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = false;
+  options.level0_file_num_compaction_trigger = 3;
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+
+  std::vector<int> file_keys;
+
+  // file #1 in L0
+  file_keys = {0, 5, 7};
+  for (int k : file_keys) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+  }
+  ASSERT_OK(Flush());
+
+  // file #2 in L0
+  file_keys = {4, 6, 8, 9};
+  for (int k : file_keys) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+  }
+  ASSERT_OK(Flush());
+
+  // We have 2 overlapping files in L0
+  EXPECT_EQ(FilesPerLevel(), "2");
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+        "ExternalSSTFileTest::PickedLevelBug:0"},
+       {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"},
+       {"ExternalSSTFileTest::PickedLevelBug:2",
+        "DBImpl::RunManualCompaction:0"},
+       {"ExternalSSTFileTest::PickedLevelBug:3",
+        "DBImpl::RunManualCompaction:1"}});
+
+  std::atomic<bool> bg_compact_started(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:Start",
+      [&](void* /*arg*/) { bg_compact_started.store(true); });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Status bg_compact_status;
+  Status bg_addfile_status;
+
+  {
+    // While writing the MANIFEST start a thread that will ask for compaction
+    ThreadGuard bg_compact(port::Thread([&]() {
+      bg_compact_status =
+          db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    }));
+    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2");
+
+    // Start a thread that will ingest a new file
+    ThreadGuard bg_addfile(port::Thread([&]() {
+      file_keys = {1, 2, 3};
+      bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1);
+    }));
+
+    // Wait for AddFile to start picking levels and writing MANIFEST
+    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0");
+
+    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3");
+
+    // We need to verify that no compactions can run while AddFile is
+    // ingesting the files into the levels it find suitable. So we will
+    // wait for 2 seconds to give a chance for compactions to run during
+    // this period, and then make sure that no compactions where able to run
+    env_->SleepForMicroseconds(1000000 * 2);
+    bool bg_compact_started_tmp = bg_compact_started.load();
+
+    // Hold AddFile from finishing writing the MANIFEST
+    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1");
+
+    // check the status at the end, so even if the ASSERT fails the threads
+    // could be joined and return.
+    ASSERT_FALSE(bg_compact_started_tmp);
+  }
+
+  ASSERT_OK(bg_addfile_status);
+  ASSERT_OK(bg_compact_status);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  int total_keys = 0;
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    total_keys++;
+  }
+  ASSERT_EQ(total_keys, 10);
+
+  delete iter;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, IngestNonExistingFile) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  Status s = db_->IngestExternalFile({"non_existing_file"},
+                                     IngestExternalFileOptions());
+  ASSERT_NOK(s);
+
+  // Verify file deletion is not impacted (verify a bug fix)
+  ASSERT_OK(Put(Key(1), Key(1)));
+  ASSERT_OK(Put(Key(9), Key(9)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(1), Key(1)));
+  ASSERT_OK(Put(Key(9), Key(9)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+  // After full compaction, there should be only 1 file.
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  int num_sst_files = 0;
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kTableFile) {
+      num_sst_files++;
+    }
+  }
+  ASSERT_EQ(1, num_sst_files);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = false;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+
+  std::function<void()> bg_compact = [&]() {
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  };
+
+  int range_id = 0;
+  std::vector<int> file_keys;
+  std::function<void()> bg_addfile = [&]() {
+    ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id));
+  };
+
+  const int num_of_ranges = 1000;
+  std::vector<port::Thread> threads;
+  while (range_id < num_of_ranges) {
+    int range_start = range_id * 10;
+    int range_end = range_start + 10;
+
+    file_keys.clear();
+    for (int k = range_start + 1; k < range_end; k++) {
+      file_keys.push_back(k);
+    }
+    ASSERT_OK(Put(Key(range_start), Key(range_start)));
+    ASSERT_OK(Put(Key(range_end), Key(range_end)));
+    ASSERT_OK(Flush());
+
+    if (range_id % 10 == 0) {
+      threads.emplace_back(bg_compact);
+    }
+    threads.emplace_back(bg_addfile);
+
+    for (auto& t : threads) {
+      t.join();
+    }
+    threads.clear();
+
+    range_id++;
+  }
+
+  for (int rid = 0; rid < num_of_ranges; rid++) {
+    int range_start = rid * 10;
+    int range_end = range_start + 10;
+
+    ASSERT_EQ(Get(Key(range_start)), Key(range_start)) << rid;
+    ASSERT_EQ(Get(Key(range_end)), Key(range_end)) << rid;
+    for (int k = range_start + 1; k < range_end; k++) {
+      std::string v = Key(k) + std::to_string(rid);
+      ASSERT_EQ(Get(Key(k)), v) << rid;
+    }
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(ExternalSSTFileTest, PickedLevelDynamic) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = false;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.num_levels = 4;
+  DestroyAndReopen(options);
+  std::map<std::string, std::string> true_data;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"ExternalSSTFileTest::PickedLevelDynamic:0",
+       "BackgroundCallCompaction:0"},
+      {"DBImpl::BackgroundCompaction:Start",
+       "ExternalSSTFileTest::PickedLevelDynamic:1"},
+      {"ExternalSSTFileTest::PickedLevelDynamic:2",
+       "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Flush 4 files containing the same keys
+  for (int i = 0; i < 4; i++) {
+    for (int k = 20; k <= 30; k++) {
+      ASSERT_OK(Put(Key(k), Key(k) + "put"));
+      true_data[Key(k)] = Key(k) + "put";
+    }
+    for (int k = 50; k <= 60; k++) {
+      ASSERT_OK(Put(Key(k), Key(k) + "put"));
+      true_data[Key(k)] = Key(k) + "put";
+    }
+    ASSERT_OK(Flush());
+  }
+
+  // Wait for BackgroundCompaction() to be called
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:0");
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:1");
+
+  // This file overlaps with the output of the compaction (going to L3)
+  // so the file will be added to L0 since L3 is the base level
+  ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, false,
+                                       false, true, false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "5");
+
+  // This file does not overlap with the current running compactiong
+  ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+                                       true, false, false, &true_data));
+  EXPECT_EQ(FilesPerLevel(), "5,0,0,1");
+
+  // Hold compaction from finishing
+  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2");
+
+  // Output of the compaction will go to L3
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  EXPECT_EQ(FilesPerLevel(), "1,0,0,2");
+
+  Close();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {1, 15, 19}, -1, false, false,
+                                       true, false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "1,0,0,3");
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {1000, 1001, 1002}, -1, false,
+                                       false, true, false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "1,0,0,4");
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {500, 600, 700}, -1, false,
+                                       false, true, false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "1,0,0,5");
+
+  // File 5 overlaps with file 2 (L3 / base level)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {2, 10}, -1, false, false, true,
+                                       false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "2,0,0,5");
+
+  // File 6 overlaps with file 2 (L3 / base level) and file 5 (L0)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {3, 9}, -1, false, false, true,
+                                       false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "3,0,0,5");
+
+  // Verify data in files
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+
+  // Write range [5 => 10] to L0
+  for (int i = 5; i <= 10; i++) {
+    std::string k = Key(i);
+    std::string v = k + "put";
+    ASSERT_OK(Put(k, v));
+    true_data[k] = v;
+  }
+  ASSERT_OK(Flush());
+  ASSERT_EQ(FilesPerLevel(), "4,0,0,5");
+
+  // File 7 overlaps with file 4 (L3)
+  ASSERT_OK(GenerateAndAddExternalFile(options, {650, 651, 652}, -1, false,
+                                       false, true, false, false, &true_data));
+  ASSERT_EQ(FilesPerLevel(), "5,0,0,5");
+
+  VerifyDBFromMap(true_data, &kcnt, false);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, AddExternalSstFileWithCustomCompartor) {
+  Options options = CurrentOptions();
+  options.comparator = ReverseBytewiseComparator();
+  DestroyAndReopen(options);
+
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  // Generate files with these key ranges
+  // {14  -> 0}
+  // {24 -> 10}
+  // {34 -> 20}
+  // {44 -> 30}
+  // ..
+  std::vector<std::string> generated_files;
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+    ASSERT_OK(sst_file_writer.Open(file_name));
+
+    int range_end = i * 10;
+    int range_start = range_end + 15;
+    for (int k = (range_start - 1); k >= range_end; k--) {
+      ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+    }
+    ExternalSstFileInfo file_info;
+    ASSERT_OK(sst_file_writer.Finish(&file_info));
+    generated_files.push_back(file_name);
+  }
+
+  std::vector<std::string> in_files;
+
+  // These 2nd and 3rd files overlap with each other
+  in_files = {generated_files[0], generated_files[4], generated_files[5],
+              generated_files[7]};
+  ASSERT_NOK(DeprecatedAddFile(in_files));
+
+  // These 2 files don't overlap with each other
+  in_files = {generated_files[0], generated_files[2]};
+  ASSERT_OK(DeprecatedAddFile(in_files));
+
+  // These 2 files don't overlap with each other but overlap with keys in DB
+  in_files = {generated_files[3], generated_files[7]};
+  ASSERT_NOK(DeprecatedAddFile(in_files));
+
+  // Files don't overlap and don't overlap with DB key range
+  in_files = {generated_files[4], generated_files[6], generated_files[8]};
+  ASSERT_OK(DeprecatedAddFile(in_files));
+
+  for (int i = 0; i < 100; i++) {
+    if (i % 20 <= 14) {
+      ASSERT_EQ(Get(Key(i)), Key(i));
+    } else {
+      ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+    }
+  }
+}
+
+TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  options.IncreaseParallelism(20);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {1, 4}, 1));  // L3
+  ASSERT_OK(GenerateAndAddExternalFile(options, {2, 3}, 2));  // L2
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {10, 14}, 3));  // L3
+  ASSERT_OK(GenerateAndAddExternalFile(options, {12, 13}, 4));  // L2
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {20, 24}, 5));  // L3
+  ASSERT_OK(GenerateAndAddExternalFile(options, {22, 23}, 6));  // L2
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        // fit in L3 but will overlap with compaction so will be added
+        // to L2 but a compaction will trivially move it to L3
+        // and break LSM consistency
+        static std::atomic<bool> called = {false};
+        if (!called) {
+          called = true;
+          ASSERT_OK(dbfull()->SetOptions({{"max_bytes_for_level_base", "1"}}));
+          ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7));
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = false;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, CompactAddedFiles) {
+  Options options = CurrentOptions();
+  options.num_levels = 3;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, 1));  // L3
+  ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, 2));   // L2
+  ASSERT_OK(GenerateAndAddExternalFile(options, {3, 8}, 3));   // L1
+  ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, 4));   // L0
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+}
+
+TEST_F(ExternalSSTFileTest, SstFileWriterNonSharedKeys) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  std::string file_path = sst_files_dir_ + "/not_shared";
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+
+  std::string suffix(100, 'X');
+  ASSERT_OK(sst_file_writer.Open(file_path));
+  ASSERT_OK(sst_file_writer.Put("A" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("BB" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("CC" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("CXD" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("CZZZ" + suffix, "VAL"));
+  ASSERT_OK(sst_file_writer.Put("ZAAAX" + suffix, "VAL"));
+
+  ASSERT_OK(sst_file_writer.Finish());
+  ASSERT_OK(DeprecatedAddFile({file_path}));
+}
+
+TEST_F(ExternalSSTFileTest, WithUnorderedWrite) {
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+        "ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"},
+       {"DBImpl::WaitForPendingWrites:BeforeBlock",
+        "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::IngestExternalFile:NeedFlush", [&](void* need_flush) {
+        ASSERT_TRUE(*reinterpret_cast<bool*>(need_flush));
+      });
+
+  Options options = CurrentOptions();
+  options.unordered_write = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v1"));
+  SyncPoint::GetInstance()->EnableProcessing();
+  port::Thread writer([&]() { ASSERT_OK(Put("bar", "v2")); });
+
+  TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL");
+  ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1,
+                                       true /* allow_global_seqno */));
+  ASSERT_EQ(Get("bar"), "v3");
+
+  writer.join();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) {
+  env_->skip_fsync_ = true;
+  Options options = CurrentOptions();
+  options.IncreaseParallelism(20);
+  options.level0_slowdown_writes_trigger = 256;
+  options.level0_stop_writes_trigger = 256;
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  for (int iter = 0; iter < 2; iter++) {
+    bool write_to_memtable = (iter == 0);
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    std::map<std::string, std::string> true_data;
+    for (int i = 0; i < 500; i++) {
+      std::vector<std::pair<std::string, std::string>> random_data;
+      for (int j = 0; j < 100; j++) {
+        std::string k = rnd.RandomString(rnd.Next() % 20);
+        std::string v = rnd.RandomString(rnd.Next() % 50);
+        random_data.emplace_back(k, v);
+      }
+
+      if (write_to_memtable && rnd.OneIn(4)) {
+        // 25% of writes go through memtable
+        for (auto& entry : random_data) {
+          ASSERT_OK(Put(entry.first, entry.second));
+          true_data[entry.first] = entry.second;
+        }
+      } else {
+        ASSERT_OK(GenerateAndAddExternalFile(
+            options, random_data, -1, true, write_global_seqno,
+            verify_checksums_before_ingest, false, true, &true_data));
+      }
+    }
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    VerifyDBFromMap(true_data, &kcnt, false);
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) {
+  Options options = CurrentOptions();
+  options.num_levels = 5;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  std::vector<std::pair<std::string, std::string>> file_data;
+  std::map<std::string, std::string> true_data;
+
+  // Insert 100 -> 200 into the memtable
+  for (int i = 100; i <= 200; i++) {
+    ASSERT_OK(Put(Key(i), "memtable"));
+    true_data[Key(i)] = "memtable";
+  }
+
+  // Insert 0 -> 20 using AddFile
+  file_data.clear();
+  for (int i = 0; i <= 20; i++) {
+    file_data.emplace_back(Key(i), "L4");
+  }
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file don't overlap with anything in the DB, will go to L4
+  ASSERT_EQ("0,0,0,0,1", FilesPerLevel());
+
+  // Insert 80 -> 130 using AddFile
+  file_data.clear();
+  for (int i = 80; i <= 130; i++) {
+    file_data.emplace_back(Key(i), "L0");
+  }
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file overlap with the memtable, so it will flush it and add
+  // it self to L0
+  ASSERT_EQ("2,0,0,0,1", FilesPerLevel());
+
+  // Insert 30 -> 50 using AddFile
+  file_data.clear();
+  for (int i = 30; i <= 50; i++) {
+    file_data.emplace_back(Key(i), "L4");
+  }
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file don't overlap with anything in the DB and fit in L4 as well
+  ASSERT_EQ("2,0,0,0,2", FilesPerLevel());
+
+  // Insert 10 -> 40 using AddFile
+  file_data.clear();
+  for (int i = 10; i <= 40; i++) {
+    file_data.emplace_back(Key(i), "L3");
+  }
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file overlap with files in L4, we will ingest it in L3
+  ASSERT_EQ("2,0,0,1,2", FilesPerLevel());
+
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  uint64_t entries_in_memtable;
+  std::map<std::string, std::string> true_data;
+
+  for (int k : {10, 20, 40, 80}) {
+    ASSERT_OK(Put(Key(k), "memtable"));
+    true_data[Key(k)] = "memtable";
+  }
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_GE(entries_in_memtable, 1);
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // No need for flush
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {90, 100, 110}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_GE(entries_in_memtable, 1);
+
+  // This file will flush the memtable
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {19, 20, 21}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_EQ(entries_in_memtable, 0);
+
+  for (int k : {200, 201, 205, 206}) {
+    ASSERT_OK(Put(Key(k), "memtable"));
+    true_data[Key(k)] = "memtable";
+  }
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_GE(entries_in_memtable, 1);
+
+  // No need for flush, this file keys fit between the memtable keys
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {202, 203, 204}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_GE(entries_in_memtable, 1);
+
+  // This file will flush the memtable
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {206, 207}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, false, &true_data));
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+                                  &entries_in_memtable));
+  ASSERT_EQ(entries_in_memtable, 0);
+
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, L0SortingIssue) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+  std::map<std::string, std::string> true_data;
+
+  ASSERT_OK(Put(Key(1), "memtable"));
+  ASSERT_OK(Put(Key(10), "memtable"));
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // No Flush needed, No global seqno needed, Ingest in L1
+  ASSERT_OK(
+      GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+                                 verify_checksums_before_ingest, false, false));
+  // No Flush needed, but need a global seqno, Ingest in L0
+  ASSERT_OK(
+      GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+                                 verify_checksums_before_ingest, false, false));
+  printf("%s\n", FilesPerLevel().c_str());
+
+  // Overwrite what we added using external files
+  ASSERT_OK(Put(Key(7), "memtable"));
+  ASSERT_OK(Put(Key(8), "memtable"));
+
+  // Read values from memtable
+  ASSERT_EQ(Get(Key(7)), "memtable");
+  ASSERT_EQ(Get(Key(8)), "memtable");
+
+  // Flush and read from L0
+  ASSERT_OK(Flush());
+  printf("%s\n", FilesPerLevel().c_str());
+  ASSERT_EQ(Get(Key(7)), "memtable");
+  ASSERT_EQ(Get(Key(8)), "memtable");
+}
+
+TEST_F(ExternalSSTFileTest, CompactionDeadlock) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 4;
+  DestroyAndReopen(options);
+
+  // atomic conter of currently running bg threads
+  std::atomic<int> running_threads(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::DelayWrite:Wait", "ExternalSSTFileTest::DeadLock:0"},
+      {"ExternalSSTFileTest::DeadLock:1", "DBImpl::AddFile:Start"},
+      {"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::DeadLock:2"},
+      {"ExternalSSTFileTest::DeadLock:3", "BackgroundCallCompaction:0"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Start ingesting and extrnal file in the background
+  ROCKSDB_NAMESPACE::port::Thread bg_ingest_file([&]() {
+    running_threads += 1;
+    ASSERT_OK(GenerateAndAddExternalFile(options, {5, 6}));
+    running_threads -= 1;
+  });
+
+  ASSERT_OK(Put(Key(1), "memtable"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(2), "memtable"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(3), "memtable"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(4), "memtable"));
+  ASSERT_OK(Flush());
+
+  // This thread will try to insert into the memtable but since we have 4 L0
+  // files this thread will be blocked and hold the writer thread
+  ROCKSDB_NAMESPACE::port::Thread bg_block_put([&]() {
+    running_threads += 1;
+    ASSERT_OK(Put(Key(10), "memtable"));
+    running_threads -= 1;
+  });
+
+  // Make sure DelayWrite is called first
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:0");
+
+  // `DBImpl::AddFile:Start` will wait until we be here
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:1");
+
+  // Wait for IngestExternalFile() to start and aquire mutex
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:2");
+
+  // Now let compaction start
+  TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:3");
+
+  // Wait for max 5 seconds, if we did not finish all bg threads
+  // then we hit the deadlock bug
+  for (int i = 0; i < 10; i++) {
+    if (running_threads.load() == 0) {
+      break;
+    }
+    // Make sure we do a "real sleep", not a mock one.
+    SystemClock::Default()->SleepForMicroseconds(500000);
+  }
+
+  ASSERT_EQ(running_threads.load(), 0);
+
+  bg_ingest_file.join();
+  bg_block_put.join();
+}
+
+TEST_F(ExternalSSTFileTest, DirtyExit) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  std::string file_path = sst_files_dir_ + "/dirty_exit";
+  std::unique_ptr<SstFileWriter> sst_file_writer;
+
+  // Destruct SstFileWriter without calling Finish()
+  sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(sst_file_writer->Open(file_path));
+  sst_file_writer.reset();
+
+  // Destruct SstFileWriter with a failing Finish
+  sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(sst_file_writer->Open(file_path));
+  ASSERT_NOK(sst_file_writer->Finish());
+}
+
+TEST_F(ExternalSSTFileTest, FileWithCFInfo) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko", "toto"}, options);
+
+  SstFileWriter sfw_default(EnvOptions(), options, handles_[0]);
+  SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+  SstFileWriter sfw_cf2(EnvOptions(), options, handles_[2]);
+  SstFileWriter sfw_unknown(EnvOptions(), options);
+
+  // default_cf.sst
+  const std::string cf_default_sst = sst_files_dir_ + "/default_cf.sst";
+  ASSERT_OK(sfw_default.Open(cf_default_sst));
+  ASSERT_OK(sfw_default.Put("K1", "V1"));
+  ASSERT_OK(sfw_default.Put("K2", "V2"));
+  ASSERT_OK(sfw_default.Finish());
+
+  // cf1.sst
+  const std::string cf1_sst = sst_files_dir_ + "/cf1.sst";
+  ASSERT_OK(sfw_cf1.Open(cf1_sst));
+  ASSERT_OK(sfw_cf1.Put("K3", "V1"));
+  ASSERT_OK(sfw_cf1.Put("K4", "V2"));
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // cf_unknown.sst
+  const std::string unknown_sst = sst_files_dir_ + "/cf_unknown.sst";
+  ASSERT_OK(sfw_unknown.Open(unknown_sst));
+  ASSERT_OK(sfw_unknown.Put("K5", "V1"));
+  ASSERT_OK(sfw_unknown.Put("K6", "V2"));
+  ASSERT_OK(sfw_unknown.Finish());
+
+  IngestExternalFileOptions ifo;
+
+  // SST CF don't match
+  ASSERT_NOK(db_->IngestExternalFile(handles_[0], {cf1_sst}, ifo));
+  // SST CF don't match
+  ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf1_sst}, ifo));
+  // SST CF match
+  ASSERT_OK(db_->IngestExternalFile(handles_[1], {cf1_sst}, ifo));
+
+  // SST CF don't match
+  ASSERT_NOK(db_->IngestExternalFile(handles_[1], {cf_default_sst}, ifo));
+  // SST CF don't match
+  ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf_default_sst}, ifo));
+  // SST CF match
+  ASSERT_OK(db_->IngestExternalFile(handles_[0], {cf_default_sst}, ifo));
+
+  // SST CF unknown
+  ASSERT_OK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+  // SST CF unknown
+  ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+  // SST CF unknown
+  ASSERT_OK(db_->IngestExternalFile(handles_[0], {unknown_sst}, ifo));
+
+  // Cannot ingest a file into a dropped CF
+  ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+  ASSERT_NOK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+
+  // CF was not dropped, ok to Ingest
+  ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+}
+
+/*
+ * Test and verify the functionality of ingestion_options.move_files and
+ * ingestion_options.failed_move_fall_back_to_copy
+ */
+TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) {
+  const bool fail_link = std::get<0>(GetParam());
+  const bool failed_move_fall_back_to_copy = std::get<1>(GetParam());
+  test_env_->set_fail_link(fail_link);
+  const EnvOptions env_options;
+  DestroyAndReopen(options_);
+  const int kNumKeys = 10000;
+  IngestExternalFileOptions ifo;
+  ifo.move_files = true;
+  ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;
+
+  std::string file_path = sst_files_dir_ + "file1.sst";
+  // Create SstFileWriter for default column family
+  SstFileWriter sst_file_writer(env_options, options_);
+  ASSERT_OK(sst_file_writer.Open(file_path));
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value"));
+  }
+  ASSERT_OK(sst_file_writer.Finish());
+  uint64_t file_size = 0;
+  ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+
+  bool copyfile = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Prepare:CopyFile",
+      [&](void* /* arg */) { copyfile = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  const Status s = db_->IngestExternalFile({file_path}, ifo);
+
+  ColumnFamilyHandleImpl* cfh =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  const InternalStats* internal_stats_ptr = cfd->internal_stats();
+  const std::vector<InternalStats::CompactionStats>& comp_stats =
+      internal_stats_ptr->TEST_GetCompactionStats();
+  uint64_t bytes_copied = 0;
+  uint64_t bytes_moved = 0;
+  for (const auto& stats : comp_stats) {
+    bytes_copied += stats.bytes_written;
+    bytes_moved += stats.bytes_moved;
+  }
+
+  if (!fail_link) {
+    // Link operation succeeds. External SST should be moved.
+    ASSERT_OK(s);
+    ASSERT_EQ(0, bytes_copied);
+    ASSERT_EQ(file_size, bytes_moved);
+    ASSERT_FALSE(copyfile);
+  } else {
+    // Link operation fails.
+    ASSERT_EQ(0, bytes_moved);
+    if (failed_move_fall_back_to_copy) {
+      ASSERT_OK(s);
+      // Copy file is true since a failed link falls back to copy file.
+      ASSERT_TRUE(copyfile);
+      ASSERT_EQ(file_size, bytes_copied);
+    } else {
+      ASSERT_TRUE(s.IsNotSupported());
+      // Copy file is false since a failed link does not fall back to copy file.
+      ASSERT_FALSE(copyfile);
+      ASSERT_EQ(0, bytes_copied);
+    }
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class TestIngestExternalFileListener : public EventListener {
+ public:
+  void OnExternalFileIngested(DB* /*db*/,
+                              const ExternalFileIngestionInfo& info) override {
+    ingested_files.push_back(info);
+  }
+
+  std::vector<ExternalFileIngestionInfo> ingested_files;
+};
+
+TEST_P(ExternalSSTFileTest, IngestionListener) {
+  Options options = CurrentOptions();
+  TestIngestExternalFileListener* listener =
+      new TestIngestExternalFileListener();
+  options.listeners.emplace_back(listener);
+  CreateAndReopenWithCF({"koko", "toto"}, options);
+
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // Ingest into default cf
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {1, 2}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, true, nullptr, handles_[0]));
+  ASSERT_EQ(listener->ingested_files.size(), 1);
+  ASSERT_EQ(listener->ingested_files.back().cf_name, "default");
+  ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+  ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+            0);
+  ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+            "default");
+
+  // Ingest into cf1
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {1, 2}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, true, nullptr, handles_[1]));
+  ASSERT_EQ(listener->ingested_files.size(), 2);
+  ASSERT_EQ(listener->ingested_files.back().cf_name, "koko");
+  ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+  ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+            1);
+  ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+            "koko");
+
+  // Ingest into cf2
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {1, 2}, -1, true, write_global_seqno,
+      verify_checksums_before_ingest, false, true, nullptr, handles_[2]));
+  ASSERT_EQ(listener->ingested_files.size(), 3);
+  ASSERT_EQ(listener->ingested_files.back().cf_name, "toto");
+  ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+  ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+            2);
+  ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+            "toto");
+}
+
+TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  const int kNumKeys = 10000;
+
+  // Insert keys using normal path and take a snapshot
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(i), Key(i) + "_V1"));
+  }
+  const Snapshot* snap = db_->GetSnapshot();
+
+  // Overwrite all keys using IngestExternalFile
+  std::string sst_file_path = sst_files_dir_ + "file1.sst";
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  ASSERT_OK(sst_file_writer.Open(sst_file_path));
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_V2"));
+  }
+  ASSERT_OK(sst_file_writer.Finish());
+
+  IngestExternalFileOptions ifo;
+  ifo.move_files = true;
+  ASSERT_OK(db_->IngestExternalFile({sst_file_path}, ifo));
+
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_EQ(Get(Key(i), snap), Key(i) + "_V1");
+    ASSERT_EQ(Get(Key(i)), Key(i) + "_V2");
+  }
+
+  db_->ReleaseSnapshot(snap);
+}
+
+TEST_P(ExternalSSTFileTest, IngestBehind) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 3;
+  options.disable_auto_compactions = false;
+  DestroyAndReopen(options);
+  std::vector<std::pair<std::string, std::string>> file_data;
+  std::map<std::string, std::string> true_data;
+
+  // Insert 100 -> 200 into the memtable
+  for (int i = 100; i <= 200; i++) {
+    ASSERT_OK(Put(Key(i), "memtable"));
+    true_data[Key(i)] = "memtable";
+  }
+
+  // Insert 100 -> 200 using IngestExternalFile
+  file_data.clear();
+  for (int i = 0; i <= 20; i++) {
+    file_data.emplace_back(Key(i), "ingest_behind");
+  }
+
+  bool allow_global_seqno = true;
+  bool ingest_behind = true;
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+
+  // Can't ingest behind since allow_ingest_behind isn't set to true
+  ASSERT_NOK(GenerateAndAddExternalFile(
+      options, file_data, -1, allow_global_seqno, write_global_seqno,
+      verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
+      &true_data));
+
+  options.allow_ingest_behind = true;
+  // check that we still can open the DB, as num_levels should be
+  // sanitized to 3
+  options.num_levels = 2;
+  DestroyAndReopen(options);
+
+  options.num_levels = 3;
+  DestroyAndReopen(options);
+  // Insert 100 -> 200 into the memtable
+  for (int i = 100; i <= 200; i++) {
+    ASSERT_OK(Put(Key(i), "memtable"));
+    true_data[Key(i)] = "memtable";
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Universal picker should go at second from the bottom level
+  ASSERT_EQ("0,1", FilesPerLevel());
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, file_data, -1, allow_global_seqno, write_global_seqno,
+      verify_checksums_before_ingest, true /*ingest_behind*/,
+      false /*sort_data*/, &true_data));
+  ASSERT_EQ("0,1,1", FilesPerLevel());
+  // this time ingest should fail as the file doesn't fit to the bottom level
+  ASSERT_NOK(GenerateAndAddExternalFile(
+      options, file_data, -1, allow_global_seqno, write_global_seqno,
+      verify_checksums_before_ingest, true /*ingest_behind*/,
+      false /*sort_data*/, &true_data));
+  ASSERT_EQ("0,1,1", FilesPerLevel());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // bottom level should be empty
+  ASSERT_EQ("0,1", FilesPerLevel());
+
+  size_t kcnt = 0;
+  VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
+  Options options = CurrentOptions();
+
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  // Create external SST file and include bloom filters
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  {
+    std::string file_path = sst_files_dir_ + "sst_with_bloom.sst";
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    ASSERT_OK(sst_file_writer.Open(file_path));
+    ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+    ASSERT_OK(sst_file_writer.Finish());
+
+    ASSERT_OK(
+        db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+    ASSERT_EQ(Get("Key1"), "Value1");
+    ASSERT_GE(
+        options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1);
+  }
+
+  // Create external SST file but skip bloom filters
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  DestroyAndReopen(options);
+  {
+    std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst";
+    SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true,
+                                  Env::IOPriority::IO_TOTAL,
+                                  true /* skip_filters */);
+    ASSERT_OK(sst_file_writer.Open(file_path));
+    ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+    ASSERT_OK(sst_file_writer.Finish());
+
+    ASSERT_OK(
+        db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+    ASSERT_EQ(Get("Key1"), "Value1");
+    ASSERT_EQ(
+        options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 0);
+  }
+}
+
+TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) {
+  if (!ZSTD_Supported()) {
+    return;
+  }
+  const int kNumEntries = 1 << 10;
+  const int kNumBytesPerEntry = 1 << 10;
+  Options options = CurrentOptions();
+  options.compression = kZSTD;
+  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
+  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
+  DestroyAndReopen(options);
+
+  std::atomic<int> num_compression_dicts(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+      [&](void* /* arg */) { ++num_compression_dicts; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  std::vector<std::pair<std::string, std::string>> random_data;
+  for (int i = 0; i < kNumEntries; i++) {
+    std::string val = rnd.RandomString(kNumBytesPerEntry);
+    random_data.emplace_back(Key(i), std::move(val));
+  }
+  ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data)));
+  ASSERT_EQ(1, num_compression_dicts);
+}
+
+class ExternalSSTBlockChecksumTest
+    : public ExternalSSTFileTestBase,
+      public testing::WithParamInterface<uint32_t> {};
+
+INSTANTIATE_TEST_CASE_P(FormatVersions, ExternalSSTBlockChecksumTest,
+                        testing::ValuesIn(test::kFooterFormatVersionsToTest));
+
+// Very slow, not worth the cost to run regularly
+TEST_P(ExternalSSTBlockChecksumTest, DISABLED_HugeBlockChecksum) {
+  BlockBasedTableOptions table_options;
+  table_options.format_version = GetParam();
+  for (auto t : GetSupportedChecksums()) {
+    table_options.checksum = t;
+    Options options = CurrentOptions();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // 2^32 - 1, will lead to data block with more than 2^32 bytes
+    size_t huge_size = std::numeric_limits<uint32_t>::max();
+
+    std::string f = sst_files_dir_ + "f.sst";
+    ASSERT_OK(sst_file_writer.Open(f));
+    {
+      Random64 r(123);
+      std::string huge(huge_size, 0);
+      for (size_t j = 0; j + 7 < huge_size; j += 8) {
+        EncodeFixed64(&huge[j], r.Next());
+      }
+      ASSERT_OK(sst_file_writer.Put("Huge", huge));
+    }
+
+    ExternalSstFileInfo f_info;
+    ASSERT_OK(sst_file_writer.Finish(&f_info));
+    ASSERT_GT(f_info.file_size, uint64_t{huge_size} + 10);
+
+    SstFileReader sst_file_reader(options);
+    ASSERT_OK(sst_file_reader.Open(f));
+    ASSERT_OK(sst_file_reader.VerifyChecksum());
+  }
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = CurrentOptions();
+  options.env = fault_injection_env.get();
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+  // Exercise different situations in different column families: two are empty
+  // (so no new sequence number is needed), but at least one overlaps with the
+  // DB and needs to bump the sequence number.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo1", "oldvalue"));
+
+  std::vector<ColumnFamilyHandle*> column_families;
+  column_families.push_back(handles_[0]);
+  column_families.push_back(handles_[1]);
+  column_families.push_back(handles_[2]);
+  std::vector<IngestExternalFileOptions> ifos(column_families.size());
+  for (auto& ifo : ifos) {
+    ifo.allow_global_seqno = true;  // Always allow global_seqno
+    // May or may not write global_seqno
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    // Whether to verify checksums before ingestion
+    ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+  }
+  std::vector<std::vector<std::pair<std::string, std::string>>> data;
+  data.push_back(
+      {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+  data.push_back(
+      {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+  data.push_back(
+      {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+  // Resize the true_data vector upon construction to avoid re-alloc
+  std::vector<std::map<std::string, std::string>> true_data(
+      column_families.size());
+  ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+                                        -1, true, true_data));
+  Close();
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+                           options);
+  ASSERT_EQ(3, handles_.size());
+  int cf = 0;
+  for (const auto& verify_map : true_data) {
+    for (const auto& elem : verify_map) {
+      const std::string& key = elem.first;
+      const std::string& value = elem.second;
+      ASSERT_EQ(value, Get(cf, key));
+    }
+    ++cf;
+  }
+  Close();
+  Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest,
+       IngestFilesIntoMultipleColumnFamilies_NoMixedStateWithSnapshot) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0",
+       "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+       "BeforeRead"},
+      {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+       "AfterRead",
+       "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.env = fault_injection_env.get();
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  const std::vector<std::map<std::string, std::string>> data_before_ingestion =
+      {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}},
+       {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}},
+       {{"bar4", "bv4_0"}, {"bar5", "bv5_0"}, {"bar6", "bv6_0"}}};
+  for (size_t i = 0; i != handles_.size(); ++i) {
+    int cf = static_cast<int>(i);
+    const auto& orig_data = data_before_ingestion[i];
+    for (const auto& kv : orig_data) {
+      ASSERT_OK(Put(cf, kv.first, kv.second));
+    }
+    ASSERT_OK(Flush(cf));
+  }
+
+  std::vector<ColumnFamilyHandle*> column_families;
+  column_families.push_back(handles_[0]);
+  column_families.push_back(handles_[1]);
+  column_families.push_back(handles_[2]);
+  std::vector<IngestExternalFileOptions> ifos(column_families.size());
+  for (auto& ifo : ifos) {
+    ifo.allow_global_seqno = true;  // Always allow global_seqno
+    // May or may not write global_seqno
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    // Whether to verify checksums before ingestion
+    ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+  }
+  std::vector<std::vector<std::pair<std::string, std::string>>> data;
+  data.push_back(
+      {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+  data.push_back(
+      {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+  data.push_back(
+      {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+  // Resize the true_data vector upon construction to avoid re-alloc
+  std::vector<std::map<std::string, std::string>> true_data(
+      column_families.size());
+  // Take snapshot before ingestion starts
+  ReadOptions read_opts;
+  read_opts.total_order_seek = true;
+  read_opts.snapshot = dbfull()->GetSnapshot();
+  std::vector<Iterator*> iters(handles_.size());
+
+  // Range scan checks first kv of each CF before ingestion starts.
+  for (size_t i = 0; i != handles_.size(); ++i) {
+    iters[i] = dbfull()->NewIterator(read_opts, handles_[i]);
+    iters[i]->SeekToFirst();
+    ASSERT_TRUE(iters[i]->Valid());
+    const std::string& key = iters[i]->key().ToString();
+    const std::string& value = iters[i]->value().ToString();
+    const std::map<std::string, std::string>& orig_data =
+        data_before_ingestion[i];
+    std::map<std::string, std::string>::const_iterator it = orig_data.find(key);
+    ASSERT_NE(orig_data.end(), it);
+    ASSERT_EQ(it->second, value);
+    iters[i]->Next();
+  }
+  port::Thread ingest_thread([&]() {
+    ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+                                          -1, true, true_data));
+  });
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+      "BeforeRead");
+  // Should see only data before ingestion
+  for (size_t i = 0; i != handles_.size(); ++i) {
+    const auto& orig_data = data_before_ingestion[i];
+    for (; iters[i]->Valid(); iters[i]->Next()) {
+      const std::string& key = iters[i]->key().ToString();
+      const std::string& value = iters[i]->value().ToString();
+      std::map<std::string, std::string>::const_iterator it =
+          orig_data.find(key);
+      ASSERT_NE(orig_data.end(), it);
+      ASSERT_EQ(it->second, value);
+    }
+  }
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+      "AfterRead");
+  ingest_thread.join();
+  for (auto* iter : iters) {
+    delete iter;
+  }
+  iters.clear();
+  dbfull()->ReleaseSnapshot(read_opts.snapshot);
+
+  Close();
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+                           options);
+  // Should see consistent state after ingestion for all column families even
+  // without snapshot.
+  ASSERT_EQ(3, handles_.size());
+  int cf = 0;
+  for (const auto& verify_map : true_data) {
+    for (const auto& elem : verify_map) {
+      const std::string& key = elem.first;
+      const std::string& value = elem.second;
+      ASSERT_EQ(value, Get(cf, key));
+    }
+    ++cf;
+  }
+  Close();
+  Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = CurrentOptions();
+  options.env = fault_injection_env.get();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0",
+       "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+       "0"},
+      {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+       "1",
+       "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  std::vector<ColumnFamilyHandle*> column_families;
+  column_families.push_back(handles_[0]);
+  column_families.push_back(handles_[1]);
+  column_families.push_back(handles_[2]);
+  std::vector<IngestExternalFileOptions> ifos(column_families.size());
+  for (auto& ifo : ifos) {
+    ifo.allow_global_seqno = true;  // Always allow global_seqno
+    // May or may not write global_seqno
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    // Whether to verify block checksums before ingest
+    ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+  }
+  std::vector<std::vector<std::pair<std::string, std::string>>> data;
+  data.push_back(
+      {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+  data.push_back(
+      {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+  data.push_back(
+      {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+  // Resize the true_data vector upon construction to avoid re-alloc
+  std::vector<std::map<std::string, std::string>> true_data(
+      column_families.size());
+  port::Thread ingest_thread([&]() {
+    ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+                                           -1, true, true_data));
+  });
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+      "0");
+  fault_injection_env->SetFilesystemActive(false);
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+      "1");
+  ingest_thread.join();
+
+  fault_injection_env->SetFilesystemActive(true);
+  Close();
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+                           options);
+  ASSERT_EQ(3, handles_.size());
+  int cf = 0;
+  for (const auto& verify_map : true_data) {
+    for (const auto& elem : verify_map) {
+      const std::string& key = elem.first;
+      ASSERT_EQ("NOT_FOUND", Get(cf, key));
+    }
+    ++cf;
+  }
+  Close();
+  Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = CurrentOptions();
+  options.env = fault_injection_env.get();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::IngestExternalFiles:BeforeJobsRun:0",
+       "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+       "0"},
+      {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+       "1",
+       "DBImpl::IngestExternalFiles:BeforeJobsRun:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  std::vector<ColumnFamilyHandle*> column_families;
+  column_families.push_back(handles_[0]);
+  column_families.push_back(handles_[1]);
+  column_families.push_back(handles_[2]);
+  std::vector<IngestExternalFileOptions> ifos(column_families.size());
+  for (auto& ifo : ifos) {
+    ifo.allow_global_seqno = true;  // Always allow global_seqno
+    // May or may not write global_seqno
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    // Whether to verify block checksums before ingestion
+    ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+  }
+  std::vector<std::vector<std::pair<std::string, std::string>>> data;
+  data.push_back(
+      {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+  data.push_back(
+      {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+  data.push_back(
+      {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+  // Resize the true_data vector upon construction to avoid re-alloc
+  std::vector<std::map<std::string, std::string>> true_data(
+      column_families.size());
+  port::Thread ingest_thread([&]() {
+    ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+                                           -1, true, true_data));
+  });
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+      "0");
+  fault_injection_env->SetFilesystemActive(false);
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+      "1");
+  ingest_thread.join();
+
+  fault_injection_env->SetFilesystemActive(true);
+  Close();
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+                           options);
+  ASSERT_EQ(3, handles_.size());
+  int cf = 0;
+  for (const auto& verify_map : true_data) {
+    for (const auto& elem : verify_map) {
+      const std::string& key = elem.first;
+      ASSERT_EQ("NOT_FOUND", Get(cf, key));
+    }
+    ++cf;
+  }
+  Close();
+  Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest,
+       IngestFilesIntoMultipleColumnFamilies_PartialManifestWriteFail) {
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  Options options = CurrentOptions();
+  options.env = fault_injection_env.get();
+
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+  SyncPoint::GetInstance()->ClearTrace();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+       "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+       "PartialManifestWriteFail:0"},
+      {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+       "PartialManifestWriteFail:1",
+       "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<ColumnFamilyHandle*> column_families;
+  column_families.push_back(handles_[0]);
+  column_families.push_back(handles_[1]);
+  column_families.push_back(handles_[2]);
+  std::vector<IngestExternalFileOptions> ifos(column_families.size());
+  for (auto& ifo : ifos) {
+    ifo.allow_global_seqno = true;  // Always allow global_seqno
+    // May or may not write global_seqno
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    // Whether to verify block checksums before ingestion
+    ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+  }
+  std::vector<std::vector<std::pair<std::string, std::string>>> data;
+  data.push_back(
+      {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+  data.push_back(
+      {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+  data.push_back(
+      {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+  // Resize the true_data vector upon construction to avoid re-alloc
+  std::vector<std::map<std::string, std::string>> true_data(
+      column_families.size());
+  port::Thread ingest_thread([&]() {
+    ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+                                           -1, true, true_data));
+  });
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+      "PartialManifestWriteFail:0");
+  fault_injection_env->SetFilesystemActive(false);
+  TEST_SYNC_POINT(
+      "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+      "PartialManifestWriteFail:1");
+  ingest_thread.join();
+
+  ASSERT_OK(fault_injection_env->DropUnsyncedFileData());
+  fault_injection_env->SetFilesystemActive(true);
+  Close();
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+                           options);
+  ASSERT_EQ(3, handles_.size());
+  int cf = 0;
+  for (const auto& verify_map : true_data) {
+    for (const auto& elem : verify_map) {
+      const std::string& key = elem.first;
+      ASSERT_EQ("NOT_FOUND", Get(cf, key));
+    }
+    ++cf;
+  }
+  Close();
+  Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) {
+  Options options = CurrentOptions();
+  // Use large buffer to avoid memtable flush
+  options.write_buffer_size = 1024 * 1024;
+  options.two_write_queues = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1"));
+  ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1"));
+
+  // Put one key which is overlap with keys in memtable.
+  // It will trigger flushing memtable and require this thread is
+  // currently at the front of the 2nd writer queue. We must make
+  // sure that it won't enter the 2nd writer queue for the second time.
+  std::vector<std::pair<std::string, std::string>> data;
+  data.push_back(std::make_pair("1001", "v2"));
+  ASSERT_OK(GenerateAndAddExternalFile(options, data, -1, true));
+}
+
+TEST_P(ExternalSSTFileTest, DeltaEncodingWhileGlobalSeqnoPresent) {
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+  constexpr size_t kValueSize = 8;
+  Random rnd(301);
+  std::string value = rnd.RandomString(kValueSize);
+
+  // Write some key to make global seqno larger than zero
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("ab" + Key(i), value));
+  }
+  // Get a Snapshot to make RocksDB assign global seqno to ingested sst files.
+  auto snap = dbfull()->GetSnapshot();
+
+  std::string fname = sst_files_dir_ + "test_file";
+  ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options);
+  ASSERT_OK(writer.Open(fname));
+  std::string key1 = "ab";
+  std::string key2 = "ab";
+
+  // Make the prefix of key2 is same with key1 add zero seqno. The tail of every
+  // key is composed as (seqno << 8 | value_type), and here `1` represents
+  // ValueType::kTypeValue
+
+  PutFixed64(&key2, PackSequenceAndType(0, kTypeValue));
+  key2 += "cdefghijkl";
+
+  ASSERT_OK(writer.Put(key1, value));
+  ASSERT_OK(writer.Put(key2, value));
+
+  ExternalSstFileInfo info;
+  ASSERT_OK(writer.Finish(&info));
+
+  ASSERT_OK(dbfull()->IngestExternalFile({info.file_path},
+                                         IngestExternalFileOptions()));
+  dbfull()->ReleaseSnapshot(snap);
+  ASSERT_EQ(value, Get(key1));
+  // You will get error here
+  ASSERT_EQ(value, Get(key2));
+}
+
+TEST_P(ExternalSSTFileTest,
+       DeltaEncodingWhileGlobalSeqnoPresentIteratorSwitch) {
+  // Regression test for bug where global seqno corrupted the shared bytes
+  // buffer when switching from reverse iteration to forward iteration.
+  constexpr size_t kValueSize = 8;
+  Options options = CurrentOptions();
+
+  Random rnd(301);
+  std::string value = rnd.RandomString(kValueSize);
+
+  std::string key0 = "aa";
+  std::string key1 = "ab";
+  // Make the prefix of key2 is same with key1 add zero seqno. The tail of every
+  // key is composed as (seqno << 8 | value_type), and here `1` represents
+  // ValueType::kTypeValue
+  std::string key2 = "ab";
+  PutFixed64(&key2, PackSequenceAndType(0, kTypeValue));
+  key2 += "cdefghijkl";
+  std::string key3 = key2 + "_";
+
+  // Write some key to make global seqno larger than zero
+  ASSERT_OK(Put(key0, value));
+
+  std::string fname = sst_files_dir_ + "test_file";
+  ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options);
+  ASSERT_OK(writer.Open(fname));
+
+  // key0 is a dummy to ensure the turnaround point (key1) comes from Prev
+  // cache rather than block (restart keys are pinned in block).
+  ASSERT_OK(writer.Put(key0, value));
+  ASSERT_OK(writer.Put(key1, value));
+  ASSERT_OK(writer.Put(key2, value));
+  ASSERT_OK(writer.Put(key3, value));
+
+  ExternalSstFileInfo info;
+  ASSERT_OK(writer.Finish(&info));
+
+  ASSERT_OK(dbfull()->IngestExternalFile({info.file_path},
+                                         IngestExternalFileOptions()));
+  ReadOptions read_opts;
+  // Prevents Seek() when switching directions, which circumvents the bug.
+  read_opts.total_order_seek = true;
+  Iterator* iter = db_->NewIterator(read_opts);
+  // Scan backwards to key2. File iterator will then be positioned at key1.
+  iter->Seek(key3);
+  ASSERT_EQ(key3, iter->key());
+  iter->Prev();
+  ASSERT_EQ(key2, iter->key());
+  // Scan forwards and make sure key3 is present. Previously key3 would be
+  // corrupted by the global seqno from key1.
+  iter->Next();
+  ASSERT_EQ(key3, iter->key());
+  delete iter;
+}
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
+                        testing::Values(std::make_tuple(false, false),
+                                        std::make_tuple(false, true),
+                                        std::make_tuple(true, false),
+                                        std::make_tuple(true, true)));
+
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+                        ExternSSTFileLinkFailFallbackTest,
+                        testing::Values(std::make_tuple(true, false),
+                                        std::make_tuple(true, true),
+                                        std::make_tuple(false, false)));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as External SST File Writer and Ingestion are not supported "
+          "in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/fault_injection_test.cc b/src/rocksdb/db/fault_injection_test.cc
new file mode 100644
index 000000000..ddd4b47cc
--- /dev/null
+++ b/src/rocksdb/db/fault_injection_test.cc
@@ -0,0 +1,637 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include "db/db_impl/db_impl.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#ifndef NDEBUG
+#include "utilities/fault_injection_fs.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+static const int kMaxNumValues = 2000;
+static const size_t kNumIterations = 3;
+
+enum FaultInjectionOptionConfig {
+  kDefault,
+  kDifferentDataDir,
+  kWalDir,
+  kSyncWal,
+  kWalDirSyncWal,
+  kMultiLevels,
+  kEnd,
+};
+class FaultInjectionTest
+    : public testing::Test,
+      public testing::WithParamInterface<std::tuple<
+          bool, FaultInjectionOptionConfig, FaultInjectionOptionConfig>> {
+ protected:
+  int option_config_;
+  int non_inclusive_end_range_;  // kEnd or equivalent to that
+  // When need to make sure data is persistent, sync WAL
+  bool sync_use_wal_;
+  // When need to make sure data is persistent, call DB::CompactRange()
+  bool sync_use_compact_;
+
+  bool sequential_order_;
+
+ public:
+  enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
+  enum ResetMethod {
+    kResetDropUnsyncedData,
+    kResetDropRandomUnsyncedData,
+    kResetDeleteUnsyncedFiles,
+    kResetDropAndDeleteUnsynced
+  };
+
+  std::unique_ptr<Env> base_env_;
+  FaultInjectionTestEnv* env_;
+  std::string dbname_;
+  std::shared_ptr<Cache> tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  FaultInjectionTest()
+      : option_config_(std::get<1>(GetParam())),
+        non_inclusive_end_range_(std::get<2>(GetParam())),
+        sync_use_wal_(false),
+        sync_use_compact_(true),
+        base_env_(nullptr),
+        env_(nullptr),
+        db_(nullptr) {
+    EXPECT_OK(
+        test::CreateEnvFromSystem(ConfigOptions(), &system_env_, &env_guard_));
+    EXPECT_NE(system_env_, nullptr);
+  }
+
+  ~FaultInjectionTest() override {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+
+  bool ChangeOptions() {
+    option_config_++;
+    if (option_config_ >= non_inclusive_end_range_) {
+      return false;
+    } else {
+      if (option_config_ == kMultiLevels) {
+        base_env_.reset(MockEnv::Create(system_env_));
+      }
+      return true;
+    }
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    sync_use_wal_ = false;
+    sync_use_compact_ = true;
+    Options options;
+    switch (option_config_) {
+      case kWalDir:
+        options.wal_dir = test::PerThreadDBPath(env_, "fault_test_wal");
+        break;
+      case kDifferentDataDir:
+        options.db_paths.emplace_back(
+            test::PerThreadDBPath(env_, "fault_test_data"), 1000000U);
+        break;
+      case kSyncWal:
+        sync_use_wal_ = true;
+        sync_use_compact_ = false;
+        break;
+      case kWalDirSyncWal:
+        options.wal_dir = test::PerThreadDBPath(env_, "/fault_test_wal");
+        sync_use_wal_ = true;
+        sync_use_compact_ = false;
+        break;
+      case kMultiLevels:
+        options.write_buffer_size = 64 * 1024;
+        options.target_file_size_base = 64 * 1024;
+        options.level0_file_num_compaction_trigger = 2;
+        options.level0_slowdown_writes_trigger = 2;
+        options.level0_stop_writes_trigger = 4;
+        options.max_bytes_for_level_base = 128 * 1024;
+        options.max_write_buffer_number = 2;
+        options.max_background_compactions = 8;
+        options.max_background_flushes = 8;
+        sync_use_wal_ = true;
+        sync_use_compact_ = false;
+        break;
+      default:
+        break;
+    }
+    return options;
+  }
+
+  Status NewDB() {
+    assert(db_ == nullptr);
+    assert(tiny_cache_ == nullptr);
+    assert(env_ == nullptr);
+
+    env_ = new FaultInjectionTestEnv(base_env_ ? base_env_.get() : system_env_);
+
+    options_ = CurrentOptions();
+    options_.env = env_;
+    options_.paranoid_checks = true;
+
+    BlockBasedTableOptions table_options;
+    tiny_cache_ = NewLRUCache(100);
+    table_options.block_cache = tiny_cache_;
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    dbname_ = test::PerThreadDBPath("fault_test");
+
+    EXPECT_OK(DestroyDB(dbname_, options_));
+
+    options_.create_if_missing = true;
+    Status s = OpenDB();
+    options_.create_if_missing = false;
+    return s;
+  }
+
+  void SetUp() override {
+    sequential_order_ = std::get<0>(GetParam());
+    ASSERT_OK(NewDB());
+  }
+
+  void TearDown() override {
+    CloseDB();
+
+    Status s = DestroyDB(dbname_, options_);
+
+    delete env_;
+    env_ = nullptr;
+
+    tiny_cache_.reset();
+
+    ASSERT_OK(s);
+  }
+
+  void Build(const WriteOptions& write_options, int start_idx, int num_vals) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = start_idx; i < start_idx + num_vals; i++) {
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      ASSERT_OK(batch.Put(key, Value(i, &value_space)));
+      ASSERT_OK(db_->Write(write_options, &batch));
+    }
+  }
+
+  Status ReadValue(int i, std::string* val) const {
+    std::string key_space, value_space;
+    Slice key = Key(i, &key_space);
+    Value(i, &value_space);
+    ReadOptions options;
+    return db_->Get(options, key, val);
+  }
+
+  Status Verify(int start_idx, int num_vals,
+                ExpectedVerifResult expected) const {
+    std::string val;
+    std::string value_space;
+    Status s;
+    for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
+      Value(i, &value_space);
+      s = ReadValue(i, &val);
+      if (s.ok()) {
+        EXPECT_EQ(value_space, val);
+      }
+      if (expected == kValExpectFound) {
+        if (!s.ok()) {
+          fprintf(stderr, "Error when read %dth record (expect found): %s\n", i,
+                  s.ToString().c_str());
+          return s;
+        }
+      } else if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "Error when read %dth record: %s\n", i,
+                s.ToString().c_str());
+        return s;
+      }
+    }
+    return Status::OK();
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) const {
+    unsigned long long num = i;
+    if (!sequential_order_) {
+      // random transfer
+      const int m = 0x5bd1e995;
+      num *= m;
+      num ^= num << 24;
+    }
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", static_cast<int>(num));
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) const {
+    Random r(k);
+    *storage = r.RandomString(kValueSize);
+    return Slice(*storage);
+  }
+
+  void CloseDB() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  Status OpenDB() {
+    CloseDB();
+    env_->ResetState();
+    Status s = DB::Open(options_, dbname_, &db_);
+    assert(db_ != nullptr);
+    return s;
+  }
+
+  void DeleteAllData() {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    WriteOptions options;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
+    }
+    ASSERT_OK(iter->status());
+    delete iter;
+
+    FlushOptions flush_options;
+    flush_options.wait = true;
+    ASSERT_OK(db_->Flush(flush_options));
+  }
+
+  // rnd cannot be null for kResetDropRandomUnsyncedData
+  void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) {
+    env_->AssertNoOpenFile();
+    switch (reset_method) {
+      case kResetDropUnsyncedData:
+        ASSERT_OK(env_->DropUnsyncedFileData());
+        break;
+      case kResetDropRandomUnsyncedData:
+        ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd));
+        break;
+      case kResetDeleteUnsyncedFiles:
+        ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+        break;
+      case kResetDropAndDeleteUnsynced:
+        ASSERT_OK(env_->DropUnsyncedFileData());
+        ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
+    DeleteAllData();
+
+    WriteOptions write_options;
+    write_options.sync = sync_use_wal_;
+
+    Build(write_options, 0, num_pre_sync);
+    if (sync_use_compact_) {
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+    write_options.sync = false;
+    Build(write_options, num_pre_sync, num_post_sync);
+  }
+
+  void PartialCompactTestReopenWithFault(ResetMethod reset_method,
+                                         int num_pre_sync, int num_post_sync,
+                                         Random* rnd = nullptr) {
+    env_->SetFilesystemActive(false);
+    CloseDB();
+    ResetDBState(reset_method, rnd);
+    ASSERT_OK(OpenDB());
+    ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+    ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+                     FaultInjectionTest::kValExpectNoError));
+    WaitCompactionFinish();
+    ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+    ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+                     FaultInjectionTest::kValExpectNoError));
+  }
+
+  void NoWriteTestPreFault() {}
+
+  void NoWriteTestReopenWithFault(ResetMethod reset_method) {
+    CloseDB();
+    ResetDBState(reset_method);
+    ASSERT_OK(OpenDB());
+  }
+
+  void WaitCompactionFinish() {
+    ASSERT_OK(static_cast<DBImpl*>(db_->GetRootDB())->TEST_WaitForCompact());
+    ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+  }
+
+ private:
+  Env* system_env_;
+  std::shared_ptr<Env> env_guard_;
+};
+
+class FaultInjectionTestSplitted : public FaultInjectionTest {};
+
+TEST_P(FaultInjectionTestSplitted, FaultTest) {
+  do {
+    Random rnd(301);
+
+    for (size_t idx = 0; idx < kNumIterations; idx++) {
+      int num_pre_sync = rnd.Uniform(kMaxNumValues);
+      int num_post_sync = rnd.Uniform(kMaxNumValues);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync,
+                                        num_post_sync);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData,
+                                        num_pre_sync, num_post_sync, &rnd);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      // Setting a separate data path won't pass the test as we don't sync
+      // it after creating new files,
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced,
+                                        num_pre_sync, num_post_sync);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      // No new files created so we expect all values since no files will be
+      // dropped.
+      PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync,
+                                        num_post_sync);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles);
+    }
+  } while (ChangeOptions());
+}
+
+// Previous log file is not fsynced if sync is forced after log rolling.
+TEST_P(FaultInjectionTest, WriteOptionSyncTest) {
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  // Block the job queue to prevent flush job from running.
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::HIGH);
+  sleeping_task_low.WaitUntilSleeping();
+
+  WriteOptions write_options;
+  write_options.sync = false;
+
+  std::string key_space, value_space;
+  ASSERT_OK(
+      db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+  FlushOptions flush_options;
+  flush_options.wait = false;
+  ASSERT_OK(db_->Flush(flush_options));
+  write_options.sync = true;
+  ASSERT_OK(
+      db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+  ASSERT_OK(db_->FlushWAL(false));
+
+  env_->SetFilesystemActive(false);
+  NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  ASSERT_OK(OpenDB());
+  std::string val;
+  Value(2, &value_space);
+  ASSERT_OK(ReadValue(2, &val));
+  ASSERT_EQ(value_space, val);
+
+  Value(1, &value_space);
+  ASSERT_OK(ReadValue(1, &val));
+  ASSERT_EQ(value_space, val);
+}
+
+TEST_P(FaultInjectionTest, UninstalledCompaction) {
+  options_.target_file_size_base = 32 * 1024;
+  options_.write_buffer_size = 100 << 10;  // 100KB
+  options_.level0_file_num_compaction_trigger = 6;
+  options_.level0_stop_writes_trigger = 1 << 10;
+  options_.level0_slowdown_writes_trigger = 1 << 10;
+  options_.max_background_compactions = 1;
+  OpenDB();
+
+  if (!sequential_order_) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {"FaultInjectionTest::FaultTest:0", "DBImpl::BGWorkCompaction"},
+        {"CompactionJob::Run():End", "FaultInjectionTest::FaultTest:1"},
+        {"FaultInjectionTest::FaultTest:2",
+         "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+    });
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  int kNumKeys = 1000;
+  Build(WriteOptions(), 0, kNumKeys);
+  FlushOptions flush_options;
+  flush_options.wait = true;
+  ASSERT_OK(db_->Flush(flush_options));
+  ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+  TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0");
+  TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1");
+  env_->SetFilesystemActive(false);
+  TEST_SYNC_POINT("FaultInjectionTest::FaultTest:2");
+  CloseDB();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ResetDBState(kResetDropUnsyncedData);
+
+  std::atomic<bool> opened(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::Open:Opened", [&](void* /*arg*/) { opened.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BGWorkCompaction",
+      [&](void* /*arg*/) { ASSERT_TRUE(opened.load()); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(OpenDB());
+  ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+  WaitCompactionFinish();
+  ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(FaultInjectionTest, ManualLogSyncTest) {
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  // Block the job queue to prevent flush job from running.
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::HIGH);
+  sleeping_task_low.WaitUntilSleeping();
+
+  WriteOptions write_options;
+  write_options.sync = false;
+
+  std::string key_space, value_space;
+  ASSERT_OK(
+      db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+  FlushOptions flush_options;
+  flush_options.wait = false;
+  ASSERT_OK(db_->Flush(flush_options));
+  ASSERT_OK(
+      db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+  ASSERT_OK(db_->FlushWAL(true));
+
+  env_->SetFilesystemActive(false);
+  NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  ASSERT_OK(OpenDB());
+  std::string val;
+  Value(2, &value_space);
+  ASSERT_OK(ReadValue(2, &val));
+  ASSERT_EQ(value_space, val);
+
+  Value(1, &value_space);
+  ASSERT_OK(ReadValue(1, &val));
+  ASSERT_EQ(value_space, val);
+}
+
+TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) {
+  ReadOptions ro;
+  Options options = CurrentOptions();
+  options.env = env_;
+
+  WriteOptions wo;
+  wo.sync = true;
+  wo.disableWAL = false;
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("cats", "dogs"));
+  batch.MarkWalTerminationPoint();
+  ASSERT_OK(batch.Put("boys", "girls"));
+  ASSERT_OK(db_->Write(wo, &batch));
+
+  env_->SetFilesystemActive(false);
+  NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+  ASSERT_OK(OpenDB());
+
+  std::string val;
+  ASSERT_OK(db_->Get(ro, "cats", &val));
+  ASSERT_EQ("dogs", val);
+  ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound());
+}
+
+TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) {
+  auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+  fault_fs->EnableWriteErrorInjection();
+  fault_fs->SetFilesystemDirectWritable(false);
+  const std::string file_name = NormalizePath(dbname_ + "/test_file");
+  std::unique_ptr<log::Writer> log_writer = nullptr;
+  constexpr uint64_t log_number = 0;
+  {
+    std::unique_ptr<FSWritableFile> file;
+    const Status s =
+        fault_fs->NewWritableFile(file_name, FileOptions(), &file, nullptr);
+    ASSERT_OK(s);
+    std::unique_ptr<WritableFileWriter> fwriter(
+        new WritableFileWriter(std::move(file), file_name, FileOptions()));
+    log_writer.reset(new log::Writer(std::move(fwriter), log_number,
+                                     /*recycle_log_files=*/false));
+  }
+
+  fault_fs->SetRandomWriteError(
+      0xdeadbeef, /*one_in=*/1, IOStatus::IOError("Injected IOError"),
+      /*inject_for_all_file_types=*/true, /*types=*/{});
+
+  {
+    VersionEdit edit;
+    edit.SetColumnFamily(0);
+    std::string buf;
+    assert(edit.EncodeTo(&buf));
+    const Status s = log_writer->AddRecord(buf);
+    ASSERT_NOK(s);
+  }
+
+  fault_fs->DisableWriteErrorInjection();
+
+  // Closing the log writer will cause WritableFileWriter::Close() and flush
+  // remaining data from its buffer to underlying file.
+  log_writer.reset();
+
+  {
+    std::unique_ptr<FSSequentialFile> file;
+    Status s =
+        fault_fs->NewSequentialFile(file_name, FileOptions(), &file, nullptr);
+    ASSERT_OK(s);
+    std::unique_ptr<SequentialFileReader> freader(
+        new SequentialFileReader(std::move(file), file_name));
+    Status log_read_s;
+    class LogReporter : public log::Reader::Reporter {
+     public:
+      Status* status_;
+      explicit LogReporter(Status* _s) : status_(_s) {}
+      void Corruption(size_t /*bytes*/, const Status& _s) override {
+        if (status_->ok()) {
+          *status_ = _s;
+        }
+      }
+    } reporter(&log_read_s);
+    std::unique_ptr<log::Reader> log_reader(new log::Reader(
+        nullptr, std::move(freader), &reporter, /*checksum=*/true, log_number));
+    Slice record;
+    std::string data;
+    size_t count = 0;
+    while (log_reader->ReadRecord(&record, &data) && log_read_s.ok()) {
+      VersionEdit edit;
+      ASSERT_OK(edit.DecodeFrom(data));
+      ++count;
+    }
+    // Verify that only one version edit exists in the file.
+    ASSERT_EQ(1, count);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    FaultTest, FaultInjectionTest,
+    ::testing::Values(std::make_tuple(false, kDefault, kEnd),
+                      std::make_tuple(true, kDefault, kEnd)));
+
+INSTANTIATE_TEST_CASE_P(
+    FaultTest, FaultInjectionTestSplitted,
+    ::testing::Values(std::make_tuple(false, kDefault, kSyncWal),
+                      std::make_tuple(true, kDefault, kSyncWal),
+                      std::make_tuple(false, kSyncWal, kEnd),
+                      std::make_tuple(true, kSyncWal, kEnd)));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/file_indexer.cc b/src/rocksdb/db/file_indexer.cc
new file mode 100644
index 000000000..608f1cb28
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.cc
@@ -0,0 +1,218 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+
+#include <algorithm>
+#include <functional>
+
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FileIndexer::FileIndexer(const Comparator* ucmp)
+    : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {}
+
+size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); }
+
+size_t FileIndexer::LevelIndexSize(size_t level) const {
+  if (level >= next_level_index_.size()) {
+    return 0;
+  }
+  return next_level_index_[level].num_index;
+}
+
+void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index,
+                                    const int cmp_smallest,
+                                    const int cmp_largest, int32_t* left_bound,
+                                    int32_t* right_bound) const {
+  assert(level > 0);
+
+  // Last level, no hint
+  if (level == num_levels_ - 1) {
+    *left_bound = 0;
+    *right_bound = -1;
+    return;
+  }
+
+  assert(level < num_levels_ - 1);
+  assert(static_cast<int32_t>(file_index) <= level_rb_[level]);
+
+  const IndexUnit* index_units = next_level_index_[level].index_units;
+  const auto& index = index_units[file_index];
+
+  if (cmp_smallest < 0) {
+    *left_bound = (level > 0 && file_index > 0)
+                      ? index_units[file_index - 1].largest_lb
+                      : 0;
+    *right_bound = index.smallest_rb;
+  } else if (cmp_smallest == 0) {
+    *left_bound = index.smallest_lb;
+    *right_bound = index.smallest_rb;
+  } else if (cmp_smallest > 0 && cmp_largest < 0) {
+    *left_bound = index.smallest_lb;
+    *right_bound = index.largest_rb;
+  } else if (cmp_largest == 0) {
+    *left_bound = index.largest_lb;
+    *right_bound = index.largest_rb;
+  } else if (cmp_largest > 0) {
+    *left_bound = index.largest_lb;
+    *right_bound = level_rb_[level + 1];
+  } else {
+    assert(false);
+  }
+
+  assert(*left_bound >= 0);
+  assert(*left_bound <= *right_bound + 1);
+  assert(*right_bound <= level_rb_[level + 1]);
+}
+
+void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels,
+                              std::vector<FileMetaData*>* const files) {
+  if (files == nullptr) {
+    return;
+  }
+  if (num_levels == 0) {  // uint_32 0-1 would cause bad behavior
+    num_levels_ = num_levels;
+    return;
+  }
+  assert(level_rb_ == nullptr);  // level_rb_ should be init here
+
+  num_levels_ = num_levels;
+  next_level_index_.resize(num_levels);
+
+  char* mem = arena->AllocateAligned(num_levels_ * sizeof(int32_t));
+  level_rb_ = new (mem) int32_t[num_levels_];
+  for (size_t i = 0; i < num_levels_; i++) {
+    level_rb_[i] = -1;
+  }
+
+  // L1 - Ln-1
+  for (size_t level = 1; level < num_levels_ - 1; ++level) {
+    const auto& upper_files = files[level];
+    const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+    const auto& lower_files = files[level + 1];
+    level_rb_[level] = static_cast<int32_t>(upper_files.size()) - 1;
+    if (upper_size == 0) {
+      continue;
+    }
+    IndexLevel& index_level = next_level_index_[level];
+    index_level.num_index = upper_size;
+    mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
+    index_level.index_units = new (mem) IndexUnit[upper_size];
+
+    CalculateLB(
+        upper_files, lower_files, &index_level,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+                                                b->largest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) { index->smallest_lb = f_idx; });
+    CalculateLB(
+        upper_files, lower_files, &index_level,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+                                                b->largest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) { index->largest_lb = f_idx; });
+    CalculateRB(
+        upper_files, lower_files, &index_level,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+                                                b->smallest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) { index->smallest_rb = f_idx; });
+    CalculateRB(
+        upper_files, lower_files, &index_level,
+        [this](const FileMetaData* a, const FileMetaData* b) -> int {
+          return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+                                                b->smallest.user_key());
+        },
+        [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; });
+  }
+
+  level_rb_[num_levels_ - 1] =
+      static_cast<int32_t>(files[num_levels_ - 1].size()) - 1;
+}
+
+void FileIndexer::CalculateLB(
+    const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index) {
+  const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+  const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+  int32_t upper_idx = 0;
+  int32_t lower_idx = 0;
+
+  IndexUnit* index = index_level->index_units;
+  while (upper_idx < upper_size && lower_idx < lower_size) {
+    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+    if (cmp == 0) {
+      set_index(&index[upper_idx], lower_idx);
+      ++upper_idx;
+    } else if (cmp > 0) {
+      // Lower level's file (largest) is smaller, a key won't hit in that
+      // file. Move to next lower file
+      ++lower_idx;
+    } else {
+      // Lower level's file becomes larger, update the index, and
+      // move to the next upper file
+      set_index(&index[upper_idx], lower_idx);
+      ++upper_idx;
+    }
+  }
+
+  while (upper_idx < upper_size) {
+    // Lower files are exhausted, that means the remaining upper files are
+    // greater than any lower files. Set the index to be the lower level size.
+    set_index(&index[upper_idx], lower_size);
+    ++upper_idx;
+  }
+}
+
+void FileIndexer::CalculateRB(
+    const std::vector<FileMetaData*>& upper_files,
+    const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+    std::function<void(IndexUnit*, int32_t)> set_index) {
+  const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+  const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+  int32_t upper_idx = upper_size - 1;
+  int32_t lower_idx = lower_size - 1;
+
+  IndexUnit* index = index_level->index_units;
+  while (upper_idx >= 0 && lower_idx >= 0) {
+    int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+    if (cmp == 0) {
+      set_index(&index[upper_idx], lower_idx);
+      --upper_idx;
+    } else if (cmp < 0) {
+      // Lower level's file (smallest) is larger, a key won't hit in that
+      // file. Move to next lower file.
+      --lower_idx;
+    } else {
+      // Lower level's file becomes smaller, update the index, and move to
+      // the next the upper file
+      set_index(&index[upper_idx], lower_idx);
+      --upper_idx;
+    }
+  }
+  while (upper_idx >= 0) {
+    // Lower files are exhausted, that means the remaining upper files are
+    // smaller than any lower files. Set it to -1.
+    set_index(&index[upper_idx], -1);
+    --upper_idx;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer.h b/src/rocksdb/db/file_indexer.h
new file mode 100644
index 000000000..45cb13615
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.h
@@ -0,0 +1,140 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <vector>
+
+#include "memory/arena.h"
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+struct FileMetaData;
+struct FdWithKeyRange;
+struct FileLevel;
+
+// The file tree structure in Version is prebuilt and the range of each file
+// is known. On Version::Get(), it uses binary search to find a potential file
+// and then check if a target key can be found in the file by comparing the key
+// to each file's smallest and largest key. The results of these comparisons
+// can be reused beyond checking if a key falls into a file's range.
+// With some pre-calculated knowledge, each key comparison that has been done
+// can serve as a hint to narrow down further searches: if a key compared to
+// be smaller than a file's smallest or largest, that comparison can be used
+// to find out the right bound of next binary search. Similarly, if a key
+// compared to be larger than a file's smallest or largest, it can be utilized
+// to find out the left bound of next binary search.
+// With these hints: it can greatly reduce the range of binary search,
+// especially for bottom levels, given that one file most likely overlaps with
+// only N files from level below (where N is max_bytes_for_level_multiplier).
+// So on level L, we will only look at ~N files instead of N^L files on the
+// naive approach.
+class FileIndexer {
+ public:
+  explicit FileIndexer(const Comparator* ucmp);
+
+  size_t NumLevelIndex() const;
+
+  size_t LevelIndexSize(size_t level) const;
+
+  // Return a file index range in the next level to search for a key based on
+  // smallest and largest key comparison for the current file specified by
+  // level and file_index. When *left_index < *right_index, both index should
+  // be valid and fit in the vector size.
+  void GetNextLevelIndex(const size_t level, const size_t file_index,
+                         const int cmp_smallest, const int cmp_largest,
+                         int32_t* left_bound, int32_t* right_bound) const;
+
+  void UpdateIndex(Arena* arena, const size_t num_levels,
+                   std::vector<FileMetaData*>* const files);
+
+  enum { kLevelMaxIndex = std::numeric_limits<int32_t>::max() };
+
+ private:
+  size_t num_levels_;
+  const Comparator* ucmp_;
+
+  struct IndexUnit {
+    IndexUnit()
+        : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
+    // During file search, a key is compared against smallest and largest
+    // from a FileMetaData. It can have 3 possible outcomes:
+    // (1) key is smaller than smallest, implying it is also smaller than
+    //     larger. Precalculated index based on "smallest < smallest" can
+    //     be used to provide right bound.
+    // (2) key is in between smallest and largest.
+    //     Precalculated index based on "smallest > greatest" can be used to
+    //     provide left bound.
+    //     Precalculated index based on "largest < smallest" can be used to
+    //     provide right bound.
+    // (3) key is larger than largest, implying it is also larger than smallest.
+    //     Precalculated index based on "largest > largest" can be used to
+    //     provide left bound.
+    //
+    // As a result, we will need to do:
+    // Compare smallest (<=) and largest keys from upper level file with
+    // smallest key from lower level to get a right bound.
+    // Compare smallest (>=) and largest keys from upper level file with
+    // largest key from lower level to get a left bound.
+    //
+    // Example:
+    //    level 1:              [50 - 60]
+    //    level 2:        [1 - 40], [45 - 55], [58 - 80]
+    // A key 35, compared to be less than 50, 3rd file on level 2 can be
+    // skipped according to rule (1). LB = 0, RB = 1.
+    // A key 53, sits in the middle 50 and 60. 1st file on level 2 can be
+    // skipped according to rule (2)-a, but the 3rd file cannot be skipped
+    // because 60 is greater than 58. LB = 1, RB = 2.
+    // A key 70, compared to be larger than 60. 1st and 2nd file can be skipped
+    // according to rule (3). LB = 2, RB = 2.
+    //
+    // Point to a left most file in a lower level that may contain a key,
+    // which compares greater than smallest of a FileMetaData (upper level)
+    int32_t smallest_lb;
+    // Point to a left most file in a lower level that may contain a key,
+    // which compares greater than largest of a FileMetaData (upper level)
+    int32_t largest_lb;
+    // Point to a right most file in a lower level that may contain a key,
+    // which compares smaller than smallest of a FileMetaData (upper level)
+    int32_t smallest_rb;
+    // Point to a right most file in a lower level that may contain a key,
+    // which compares smaller than largest of a FileMetaData (upper level)
+    int32_t largest_rb;
+  };
+
+  // Data structure to store IndexUnits in a whole level
+  struct IndexLevel {
+    size_t num_index;
+    IndexUnit* index_units;
+
+    IndexLevel() : num_index(0), index_units(nullptr) {}
+  };
+
+  void CalculateLB(
+      const std::vector<FileMetaData*>& upper_files,
+      const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+      std::function<void(IndexUnit*, int32_t)> set_index);
+
+  void CalculateRB(
+      const std::vector<FileMetaData*>& upper_files,
+      const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+      std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+      std::function<void(IndexUnit*, int32_t)> set_index);
+
+  autovector<IndexLevel> next_level_index_;
+  int32_t* level_rb_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer_test.cc b/src/rocksdb/db/file_indexer_test.cc
new file mode 100644
index 000000000..5c82189ef
--- /dev/null
+++ b/src/rocksdb/db/file_indexer_test.cc
@@ -0,0 +1,352 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IntComparator : public Comparator {
+ public:
+  int Compare(const Slice& a, const Slice& b) const override {
+    assert(a.size() == 8);
+    assert(b.size() == 8);
+    int64_t diff = *reinterpret_cast<const int64_t*>(a.data()) -
+                   *reinterpret_cast<const int64_t*>(b.data());
+    if (diff < 0) {
+      return -1;
+    } else if (diff == 0) {
+      return 0;
+    } else {
+      return 1;
+    }
+  }
+
+  const char* Name() const override { return "IntComparator"; }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class FileIndexerTest : public testing::Test {
+ public:
+  FileIndexerTest()
+      : kNumLevels(4), files(new std::vector<FileMetaData*>[kNumLevels]) {}
+
+  ~FileIndexerTest() override {
+    ClearFiles();
+    delete[] files;
+  }
+
+  void AddFile(int level, int64_t smallest, int64_t largest) {
+    auto* f = new FileMetaData();
+    f->smallest = IntKey(smallest);
+    f->largest = IntKey(largest);
+    files[level].push_back(f);
+  }
+
+  InternalKey IntKey(int64_t v) {
+    return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
+  }
+
+  void ClearFiles() {
+    for (uint32_t i = 0; i < kNumLevels; ++i) {
+      for (auto* f : files[i]) {
+        delete f;
+      }
+      files[i].clear();
+    }
+  }
+
+  void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
+                         const int cmp_smallest, const int cmp_largest,
+                         int32_t* left_index, int32_t* right_index) {
+    *left_index = 100;
+    *right_index = 100;
+    indexer->GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+                               left_index, right_index);
+  }
+
+  int32_t left = 100;
+  int32_t right = 100;
+  const uint32_t kNumLevels;
+  IntComparator ucmp;
+  FileIndexer* indexer;
+
+  std::vector<FileMetaData*>* files;
+};
+
+// Case 0: Empty
+TEST_F(FileIndexerTest, Empty) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
+  indexer->UpdateIndex(&arena, 0, files);
+  delete indexer;
+}
+
+// Case 1: no overlap, files are on the left of next level files
+TEST_F(FileIndexerTest, no_overlap_left) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
+  // level 1
+  AddFile(1, 100, 200);
+  AddFile(1, 300, 400);
+  AddFile(1, 500, 600);
+  // level 2
+  AddFile(2, 1500, 1600);
+  AddFile(2, 1601, 1699);
+  AddFile(2, 1700, 1800);
+  // level 3
+  AddFile(3, 2500, 2600);
+  AddFile(3, 2601, 2699);
+  AddFile(3, 2700, 2800);
+  indexer->UpdateIndex(&arena, kNumLevels, files);
+  for (uint32_t level = 1; level < 3; ++level) {
+    for (uint32_t f = 0; f < 3; ++f) {
+      GetNextLevelIndex(level, f, -1, -1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 0, -1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 1, -1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 1, 0, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(-1, right);
+      GetNextLevelIndex(level, f, 1, 1, &left, &right);
+      ASSERT_EQ(0, left);
+      ASSERT_EQ(2, right);
+    }
+  }
+  delete indexer;
+  ClearFiles();
+}
+
+// Case 2: no overlap, files are on the right of next level files
+TEST_F(FileIndexerTest, no_overlap_right) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
+  // level 1
+  AddFile(1, 2100, 2200);
+  AddFile(1, 2300, 2400);
+  AddFile(1, 2500, 2600);
+  // level 2
+  AddFile(2, 1500, 1600);
+  AddFile(2, 1501, 1699);
+  AddFile(2, 1700, 1800);
+  // level 3
+  AddFile(3, 500, 600);
+  AddFile(3, 501, 699);
+  AddFile(3, 700, 800);
+  indexer->UpdateIndex(&arena, kNumLevels, files);
+  for (uint32_t level = 1; level < 3; ++level) {
+    for (uint32_t f = 0; f < 3; ++f) {
+      GetNextLevelIndex(level, f, -1, -1, &left, &right);
+      ASSERT_EQ(f == 0 ? 0 : 3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 0, -1, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 1, -1, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 1, -1, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 1, 0, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+      GetNextLevelIndex(level, f, 1, 1, &left, &right);
+      ASSERT_EQ(3, left);
+      ASSERT_EQ(2, right);
+    }
+  }
+  delete indexer;
+}
+
+// Case 3: empty L2
+TEST_F(FileIndexerTest, empty_L2) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
+  for (uint32_t i = 1; i < kNumLevels; ++i) {
+    ASSERT_EQ(0U, indexer->LevelIndexSize(i));
+  }
+  // level 1
+  AddFile(1, 2100, 2200);
+  AddFile(1, 2300, 2400);
+  AddFile(1, 2500, 2600);
+  // level 3
+  AddFile(3, 500, 600);
+  AddFile(3, 501, 699);
+  AddFile(3, 700, 800);
+  indexer->UpdateIndex(&arena, kNumLevels, files);
+  for (uint32_t f = 0; f < 3; ++f) {
+    GetNextLevelIndex(1, f, -1, -1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 0, -1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 1, -1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 1, -1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 1, 0, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+    GetNextLevelIndex(1, f, 1, 1, &left, &right);
+    ASSERT_EQ(0, left);
+    ASSERT_EQ(-1, right);
+  }
+  delete indexer;
+  ClearFiles();
+}
+
+// Case 4: mixed
+TEST_F(FileIndexerTest, mixed) {
+  Arena arena;
+  indexer = new FileIndexer(&ucmp);
+  // level 1
+  AddFile(1, 100, 200);
+  AddFile(1, 250, 400);
+  AddFile(1, 450, 500);
+  // level 2
+  AddFile(2, 100, 150);  // 0
+  AddFile(2, 200, 250);  // 1
+  AddFile(2, 251, 300);  // 2
+  AddFile(2, 301, 350);  // 3
+  AddFile(2, 500, 600);  // 4
+  // level 3
+  AddFile(3, 0, 50);
+  AddFile(3, 100, 200);
+  AddFile(3, 201, 250);
+  indexer->UpdateIndex(&arena, kNumLevels, files);
+  // level 1, 0
+  GetNextLevelIndex(1, 0, -1, -1, &left, &right);
+  ASSERT_EQ(0, left);
+  ASSERT_EQ(0, right);
+  GetNextLevelIndex(1, 0, 0, -1, &left, &right);
+  ASSERT_EQ(0, left);
+  ASSERT_EQ(0, right);
+  GetNextLevelIndex(1, 0, 1, -1, &left, &right);
+  ASSERT_EQ(0, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(1, 0, 1, 0, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(1, 0, 1, 1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(4, right);
+  // level 1, 1
+  GetNextLevelIndex(1, 1, -1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(1, 1, 0, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(1, 1, 1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(3, right);
+  GetNextLevelIndex(1, 1, 1, 0, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(3, right);
+  GetNextLevelIndex(1, 1, 1, 1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  // level 1, 2
+  GetNextLevelIndex(1, 2, -1, -1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(3, right);
+  GetNextLevelIndex(1, 2, 0, -1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(3, right);
+  GetNextLevelIndex(1, 2, 1, -1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  GetNextLevelIndex(1, 2, 1, 0, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  GetNextLevelIndex(1, 2, 1, 1, &left, &right);
+  ASSERT_EQ(4, left);
+  ASSERT_EQ(4, right);
+  // level 2, 0
+  GetNextLevelIndex(2, 0, -1, -1, &left, &right);
+  ASSERT_EQ(0, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 0, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 1, 0, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 0, 1, 1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(2, right);
+  // level 2, 1
+  GetNextLevelIndex(2, 1, -1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 1, 0, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(1, right);
+  GetNextLevelIndex(2, 1, 1, -1, &left, &right);
+  ASSERT_EQ(1, left);
+  ASSERT_EQ(2, right);
+  GetNextLevelIndex(2, 1, 1, 0, &left, &right);
+  ASSERT_EQ(2, left);
+  ASSERT_EQ(2, right);
+  GetNextLevelIndex(2, 1, 1, 1, &left, &right);
+  ASSERT_EQ(2, left);
+  ASSERT_EQ(2, right);
+  // level 2, [2 - 4], no overlap
+  for (uint32_t f = 2; f <= 4; ++f) {
+    GetNextLevelIndex(2, f, -1, -1, &left, &right);
+    ASSERT_EQ(f == 2 ? 2 : 3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 0, -1, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 1, -1, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 1, 0, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+    GetNextLevelIndex(2, f, 1, 1, &left, &right);
+    ASSERT_EQ(3, left);
+    ASSERT_EQ(2, right);
+  }
+  delete indexer;
+  ClearFiles();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/filename_test.cc b/src/rocksdb/db/filename_test.cc
new file mode 100644
index 000000000..04c81b333
--- /dev/null
+++ b/src/rocksdb/db/filename_test.cc
@@ -0,0 +1,241 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileNameTest : public testing::Test {};
+
+TEST_F(FileNameTest, Parse) {
+  Slice db;
+  FileType type;
+  uint64_t number;
+
+  char kDefautInfoLogDir = 1;
+  char kDifferentInfoLogDir = 2;
+  char kNoCheckLogDir = 4;
+  char kAllMode = kDefautInfoLogDir | kDifferentInfoLogDir | kNoCheckLogDir;
+
+  // Successful parses
+  static struct {
+    const char* fname;
+    uint64_t number;
+    FileType type;
+    char mode;
+  } cases[] = {
+      {"100.log", 100, kWalFile, kAllMode},
+      {"0.log", 0, kWalFile, kAllMode},
+      {"0.sst", 0, kTableFile, kAllMode},
+      {"CURRENT", 0, kCurrentFile, kAllMode},
+      {"LOCK", 0, kDBLockFile, kAllMode},
+      {"MANIFEST-2", 2, kDescriptorFile, kAllMode},
+      {"MANIFEST-7", 7, kDescriptorFile, kAllMode},
+      {"METADB-2", 2, kMetaDatabase, kAllMode},
+      {"METADB-7", 7, kMetaDatabase, kAllMode},
+      {"LOG", 0, kInfoLogFile, kDefautInfoLogDir},
+      {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir},
+      {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir},
+      {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir},
+      {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir},
+      {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir},
+      {"18446744073709551615.log", 18446744073709551615ull, kWalFile, kAllMode},
+  };
+  for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) {
+    for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+      InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir");
+      if (cases[i].mode & mode) {
+        std::string f = cases[i].fname;
+        if (mode == kNoCheckLogDir) {
+          ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+        } else {
+          ASSERT_TRUE(ParseFileName(f, &number, info_log_prefix.prefix, &type))
+              << f;
+        }
+        ASSERT_EQ(cases[i].type, type) << f;
+        ASSERT_EQ(cases[i].number, number) << f;
+      }
+    }
+  }
+
+  // Errors
+  static const char* errors[] = {"",
+                                 "foo",
+                                 "foo-dx-100.log",
+                                 ".log",
+                                 "",
+                                 "manifest",
+                                 "CURREN",
+                                 "CURRENTX",
+                                 "MANIFES",
+                                 "MANIFEST",
+                                 "MANIFEST-",
+                                 "XMANIFEST-3",
+                                 "MANIFEST-3x",
+                                 "META",
+                                 "METADB",
+                                 "METADB-",
+                                 "XMETADB-3",
+                                 "METADB-3x",
+                                 "LOC",
+                                 "LOCKx",
+                                 "LO",
+                                 "LOGx",
+                                 "18446744073709551616.log",
+                                 "184467440737095516150.log",
+                                 "100",
+                                 "100.",
+                                 "100.lop"};
+  for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+    std::string f = errors[i];
+    ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+  };
+}
+
+TEST_F(FileNameTest, InfoLogFileName) {
+  std::string dbname = ("/data/rocksdb");
+  std::string db_absolute_path;
+  ASSERT_OK(Env::Default()->GetAbsolutePath(dbname, &db_absolute_path));
+
+  ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, ""));
+  ASSERT_EQ("/data/rocksdb/LOG.old.666",
+            OldInfoLogFileName(dbname, 666u, db_absolute_path, ""));
+
+  ASSERT_EQ("/data/rocksdb_log/data_rocksdb_LOG",
+            InfoLogFileName(dbname, db_absolute_path, "/data/rocksdb_log"));
+  ASSERT_EQ(
+      "/data/rocksdb_log/data_rocksdb_LOG.old.666",
+      OldInfoLogFileName(dbname, 666u, db_absolute_path, "/data/rocksdb_log"));
+}
+
+TEST_F(FileNameTest, Construction) {
+  uint64_t number;
+  FileType type;
+  std::string fname;
+
+  fname = CurrentFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kCurrentFile, type);
+
+  fname = LockFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kDBLockFile, type);
+
+  fname = LogFileName("foo", 192);
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(192U, number);
+  ASSERT_EQ(kWalFile, type);
+
+  fname = TableFileName({DbPath("bar", 0)}, 200, 0);
+  std::string fname1 =
+      TableFileName({DbPath("foo", 0), DbPath("bar", 0)}, 200, 1);
+  ASSERT_EQ(fname, fname1);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(200U, number);
+  ASSERT_EQ(kTableFile, type);
+
+  fname = DescriptorFileName("bar", 100);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kDescriptorFile, type);
+
+  fname = TempFileName("tmp", 999);
+  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(999U, number);
+  ASSERT_EQ(kTempFile, type);
+
+  fname = MetaDatabaseName("met", 100);
+  ASSERT_EQ("met/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kMetaDatabase, type);
+}
+
+TEST_F(FileNameTest, NormalizePath) {
+  // No leading slash
+  const std::string sep = std::string(1, kFilePathSeparator);
+
+  std::string expected = "FOLDER" + sep + "filename.ext";
+  std::string given = "FOLDER" + sep + "filename.ext";
+
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // Two chars /a
+
+  expected = sep + "a";
+  given = expected;
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // Two chars a/
+  expected = "a" + sep;
+  given = expected;
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // Server only
+  expected = sep + sep + "a";
+  given = expected;
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // Two slashes after character
+  expected = "a" + sep;
+  given = "a" + sep + sep;
+
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // slash only   /
+  expected = sep;
+  given = expected;
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // UNC only   //
+  expected = sep;
+  given = sep + sep;
+
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // 3 slashesy   //
+  expected = sep + sep;
+  given = sep + sep + sep;
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // 3 slashes   //
+  expected = sep + sep + "a" + sep;
+  given = sep + sep + sep + "a" + sep;
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // 2 separators in the middle
+  expected = "a" + sep + "b";
+  given = "a" + sep + sep + "b";
+  ASSERT_EQ(expected, NormalizePath(given));
+
+  // UNC with duplicate slashes
+  expected = sep + sep + "SERVER" + sep + "a" + sep + "b" + sep + "c";
+  given = sep + sep + "SERVER" + sep + "a" + sep + sep + "b" + sep + "c";
+  ASSERT_EQ(expected, NormalizePath(given));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_job.cc b/src/rocksdb/db/flush_job.cc
new file mode 100644
index 000000000..645e42f44
--- /dev/null
+++ b/src/rocksdb/db/flush_job.cc
@@ -0,0 +1,1094 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/flush_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/event_helpers.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/event_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetFlushReasonString(FlushReason flush_reason) {
+  switch (flush_reason) {
+    case FlushReason::kOthers:
+      return "Other Reasons";
+    case FlushReason::kGetLiveFiles:
+      return "Get Live Files";
+    case FlushReason::kShutDown:
+      return "Shut down";
+    case FlushReason::kExternalFileIngestion:
+      return "External File Ingestion";
+    case FlushReason::kManualCompaction:
+      return "Manual Compaction";
+    case FlushReason::kWriteBufferManager:
+      return "Write Buffer Manager";
+    case FlushReason::kWriteBufferFull:
+      return "Write Buffer Full";
+    case FlushReason::kTest:
+      return "Test";
+    case FlushReason::kDeleteFiles:
+      return "Delete Files";
+    case FlushReason::kAutoCompaction:
+      return "Auto Compaction";
+    case FlushReason::kManualFlush:
+      return "Manual Flush";
+    case FlushReason::kErrorRecovery:
+      return "Error Recovery";
+    case FlushReason::kWalFull:
+      return "WAL Full";
+    default:
+      return "Invalid";
+  }
+}
+
+FlushJob::FlushJob(
+    const std::string& dbname, ColumnFamilyData* cfd,
+    const ImmutableDBOptions& db_options,
+    const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
+    const FileOptions& file_options, VersionSet* versions,
+    InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
+    std::vector<SequenceNumber> existing_snapshots,
+    SequenceNumber earliest_write_conflict_snapshot,
+    SnapshotChecker* snapshot_checker, JobContext* job_context,
+    LogBuffer* log_buffer, FSDirectory* db_directory,
+    FSDirectory* output_file_directory, CompressionType output_compression,
+    Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
+    const bool sync_output_directory, const bool write_manifest,
+    Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+    const SeqnoToTimeMapping& seqno_time_mapping, const std::string& db_id,
+    const std::string& db_session_id, std::string full_history_ts_low,
+    BlobFileCompletionCallback* blob_callback)
+    : dbname_(dbname),
+      db_id_(db_id),
+      db_session_id_(db_session_id),
+      cfd_(cfd),
+      db_options_(db_options),
+      mutable_cf_options_(mutable_cf_options),
+      max_memtable_id_(max_memtable_id),
+      file_options_(file_options),
+      versions_(versions),
+      db_mutex_(db_mutex),
+      shutting_down_(shutting_down),
+      existing_snapshots_(std::move(existing_snapshots)),
+      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+      snapshot_checker_(snapshot_checker),
+      job_context_(job_context),
+      log_buffer_(log_buffer),
+      db_directory_(db_directory),
+      output_file_directory_(output_file_directory),
+      output_compression_(output_compression),
+      stats_(stats),
+      event_logger_(event_logger),
+      measure_io_stats_(measure_io_stats),
+      sync_output_directory_(sync_output_directory),
+      write_manifest_(write_manifest),
+      edit_(nullptr),
+      base_(nullptr),
+      pick_memtable_called(false),
+      thread_pri_(thread_pri),
+      io_tracer_(io_tracer),
+      clock_(db_options_.clock),
+      full_history_ts_low_(std::move(full_history_ts_low)),
+      blob_callback_(blob_callback),
+      db_impl_seqno_time_mapping_(seqno_time_mapping) {
+  // Update the thread status to indicate flush.
+  ReportStartedFlush();
+  TEST_SYNC_POINT("FlushJob::FlushJob()");
+}
+
+FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); }
+
+void FlushJob::ReportStartedFlush() {
+  ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env,
+                                    db_options_.enable_thread_tracking);
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
+  ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+                                               job_context_->job_id);
+  IOSTATS_RESET(bytes_written);
+}
+
+void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
+  uint64_t input_size = 0;
+  for (auto* mem : mems) {
+    input_size += mem->ApproximateMemoryUsage();
+  }
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::FLUSH_BYTES_MEMTABLES, input_size);
+}
+
+void FlushJob::RecordFlushIOStats() {
+  RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
+  ThreadStatusUtil::IncreaseThreadOperationProperty(
+      ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+  IOSTATS_RESET(bytes_written);
+}
+void FlushJob::PickMemTable() {
+  db_mutex_->AssertHeld();
+  assert(!pick_memtable_called);
+  pick_memtable_called = true;
+
+  // Maximum "NextLogNumber" of the memtables to flush.
+  // When mempurge feature is turned off, this variable is useless
+  // because the memtables are implicitly sorted by increasing order of creation
+  // time. Therefore mems_->back()->GetNextLogNumber() is already equal to
+  // max_next_log_number. However when Mempurge is on, the memtables are no
+  // longer sorted by increasing order of creation time. Therefore this variable
+  // becomes necessary because mems_->back()->GetNextLogNumber() is no longer
+  // necessarily equal to max_next_log_number.
+  uint64_t max_next_log_number = 0;
+
+  // Save the contents of the earliest memtable as a new Table
+  cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_,
+                                    &max_next_log_number);
+  if (mems_.empty()) {
+    return;
+  }
+
+  ReportFlushInputSize(mems_);
+
+  // entries mems are (implicitly) sorted in ascending order by their created
+  // time. We will use the first memtable's `edit` to keep the meta info for
+  // this flush.
+  MemTable* m = mems_[0];
+  edit_ = m->GetEdits();
+  edit_->SetPrevLogNumber(0);
+  // SetLogNumber(log_num) indicates logs with number smaller than log_num
+  // will no longer be picked up for recovery.
+  edit_->SetLogNumber(max_next_log_number);
+  edit_->SetColumnFamily(cfd_->GetID());
+
+  // path 0 for level 0 file.
+  meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+
+  base_ = cfd_->current();
+  base_->Ref();  // it is likely that we do not need this reference
+}
+
+Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
+                     bool* switched_to_mempurge) {
+  TEST_SYNC_POINT("FlushJob::Start");
+  db_mutex_->AssertHeld();
+  assert(pick_memtable_called);
+  // Mempurge threshold can be dynamically changed.
+  // For sake of consistency, mempurge_threshold is
+  // saved locally to maintain consistency in each
+  // FlushJob::Run call.
+  double mempurge_threshold =
+      mutable_cf_options_.experimental_mempurge_threshold;
+
+  AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN);
+  if (mems_.empty()) {
+    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
+                     cfd_->GetName().c_str());
+    return Status::OK();
+  }
+
+  // I/O measurement variables
+  PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+  uint64_t prev_write_nanos = 0;
+  uint64_t prev_fsync_nanos = 0;
+  uint64_t prev_range_sync_nanos = 0;
+  uint64_t prev_prepare_write_nanos = 0;
+  uint64_t prev_cpu_write_nanos = 0;
+  uint64_t prev_cpu_read_nanos = 0;
+  if (measure_io_stats_) {
+    prev_perf_level = GetPerfLevel();
+    SetPerfLevel(PerfLevel::kEnableTime);
+    prev_write_nanos = IOSTATS(write_nanos);
+    prev_fsync_nanos = IOSTATS(fsync_nanos);
+    prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+    prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+    prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+    prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+  }
+  Status mempurge_s = Status::NotFound("No MemPurge.");
+  if ((mempurge_threshold > 0.0) &&
+      (cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) &&
+      (!mems_.empty()) && MemPurgeDecider(mempurge_threshold) &&
+      !(db_options_.atomic_flush)) {
+    cfd_->SetMempurgeUsed();
+    mempurge_s = MemPurge();
+    if (!mempurge_s.ok()) {
+      // Mempurge is typically aborted when the output
+      // bytes cannot be contained onto a single output memtable.
+      if (mempurge_s.IsAborted()) {
+        ROCKS_LOG_INFO(db_options_.info_log, "Mempurge process aborted: %s\n",
+                       mempurge_s.ToString().c_str());
+      } else {
+        // However the mempurge process can also fail for
+        // other reasons (eg: new_mem->Add() fails).
+        ROCKS_LOG_WARN(db_options_.info_log, "Mempurge process failed: %s\n",
+                       mempurge_s.ToString().c_str());
+      }
+    } else {
+      if (switched_to_mempurge) {
+        *switched_to_mempurge = true;
+      } else {
+        // The mempurge process was successful, but no switch_to_mempurge
+        // pointer provided so no way to propagate the state of flush job.
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Mempurge process succeeded"
+                       "but no 'switched_to_mempurge' ptr provided.\n");
+      }
+    }
+  }
+  Status s;
+  if (mempurge_s.ok()) {
+    base_->Unref();
+    s = Status::OK();
+  } else {
+    // This will release and re-acquire the mutex.
+    s = WriteLevel0Table();
+  }
+
+  if (s.ok() && cfd_->IsDropped()) {
+    s = Status::ColumnFamilyDropped("Column family dropped during compaction");
+  }
+  if ((s.ok() || s.IsColumnFamilyDropped()) &&
+      shutting_down_->load(std::memory_order_acquire)) {
+    s = Status::ShutdownInProgress("Database shutdown");
+  }
+
+  if (!s.ok()) {
+    cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber());
+  } else if (write_manifest_) {
+    TEST_SYNC_POINT("FlushJob::InstallResults");
+    // Replace immutable memtable with the generated Table
+    s = cfd_->imm()->TryInstallMemtableFlushResults(
+        cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_,
+        meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
+        log_buffer_, &committed_flush_jobs_info_,
+        !(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted),
+                              but 'false' if mempurge successful: no new min log number
+                              or new level 0 file path to write to manifest. */);
+  }
+
+  if (s.ok() && file_meta != nullptr) {
+    *file_meta = meta_;
+  }
+  RecordFlushIOStats();
+
+  // When measure_io_stats_ is true, the default 512 bytes is not enough.
+  auto stream = event_logger_->LogToBuffer(log_buffer_, 1024);
+  stream << "job" << job_context_->job_id << "event"
+         << "flush_finished";
+  stream << "output_compression"
+         << CompressionTypeToString(output_compression_);
+  stream << "lsm_state";
+  stream.StartArray();
+  auto vstorage = cfd_->current()->storage_info();
+  for (int level = 0; level < vstorage->num_levels(); ++level) {
+    stream << vstorage->NumLevelFiles(level);
+  }
+  stream.EndArray();
+
+  const auto& blob_files = vstorage->GetBlobFiles();
+  if (!blob_files.empty()) {
+    assert(blob_files.front());
+    stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber();
+
+    assert(blob_files.back());
+    stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
+  }
+
+  stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed();
+
+  if (measure_io_stats_) {
+    if (prev_perf_level != PerfLevel::kEnableTime) {
+      SetPerfLevel(prev_perf_level);
+    }
+    stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos);
+    stream << "file_range_sync_nanos"
+           << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos);
+    stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos);
+    stream << "file_prepare_write_nanos"
+           << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos);
+    stream << "file_cpu_write_nanos"
+           << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos);
+    stream << "file_cpu_read_nanos"
+           << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
+  }
+
+  return s;
+}
+
+void FlushJob::Cancel() {
+  db_mutex_->AssertHeld();
+  assert(base_ != nullptr);
+  base_->Unref();
+}
+
+Status FlushJob::MemPurge() {
+  Status s;
+  db_mutex_->AssertHeld();
+  db_mutex_->Unlock();
+  assert(!mems_.empty());
+
+  // Measure purging time.
+  const uint64_t start_micros = clock_->NowMicros();
+  const uint64_t start_cpu_micros = clock_->CPUMicros();
+
+  MemTable* new_mem = nullptr;
+  // For performance/log investigation purposes:
+  // look at how much useful payload we harvest in the new_mem.
+  // This value is then printed to the DB log.
+  double new_mem_capacity = 0.0;
+
+  // Create two iterators, one for the memtable data (contains
+  // info from puts + deletes), and one for the memtable
+  // Range Tombstones (from DeleteRanges).
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  Arena arena;
+  std::vector<InternalIterator*> memtables;
+  std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+      range_del_iters;
+  for (MemTable* m : mems_) {
+    memtables.push_back(m->NewIterator(ro, &arena));
+    auto* range_del_iter = m->NewRangeTombstoneIterator(
+        ro, kMaxSequenceNumber, true /* immutable_memtable */);
+    if (range_del_iter != nullptr) {
+      range_del_iters.emplace_back(range_del_iter);
+    }
+  }
+
+  assert(!memtables.empty());
+  SequenceNumber first_seqno = kMaxSequenceNumber;
+  SequenceNumber earliest_seqno = kMaxSequenceNumber;
+  // Pick first and earliest seqno as min of all first_seqno
+  // and earliest_seqno of the mempurged memtables.
+  for (const auto& mem : mems_) {
+    first_seqno = mem->GetFirstSequenceNumber() < first_seqno
+                      ? mem->GetFirstSequenceNumber()
+                      : first_seqno;
+    earliest_seqno = mem->GetEarliestSequenceNumber() < earliest_seqno
+                         ? mem->GetEarliestSequenceNumber()
+                         : earliest_seqno;
+  }
+
+  ScopedArenaIterator iter(
+      NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(),
+                         static_cast<int>(memtables.size()), &arena));
+
+  auto* ioptions = cfd_->ioptions();
+
+  // Place iterator at the First (meaning most recent) key node.
+  iter->SeekToFirst();
+
+  const std::string* const full_history_ts_low = &(cfd_->GetFullHistoryTsLow());
+  std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
+      new CompactionRangeDelAggregator(&(cfd_->internal_comparator()),
+                                       existing_snapshots_,
+                                       full_history_ts_low));
+  for (auto& rd_iter : range_del_iters) {
+    range_del_agg->AddTombstones(std::move(rd_iter));
+  }
+
+  // If there is valid data in the memtable,
+  // or at least range tombstones, copy over the info
+  // to the new memtable.
+  if (iter->Valid() || !range_del_agg->IsEmpty()) {
+    // MaxSize is the size of a memtable.
+    size_t maxSize = mutable_cf_options_.write_buffer_size;
+    std::unique_ptr<CompactionFilter> compaction_filter;
+    if (ioptions->compaction_filter_factory != nullptr &&
+        ioptions->compaction_filter_factory->ShouldFilterTableFileCreation(
+            TableFileCreationReason::kFlush)) {
+      CompactionFilter::Context ctx;
+      ctx.is_full_compaction = false;
+      ctx.is_manual_compaction = false;
+      ctx.column_family_id = cfd_->GetID();
+      ctx.reason = TableFileCreationReason::kFlush;
+      compaction_filter =
+          ioptions->compaction_filter_factory->CreateCompactionFilter(ctx);
+      if (compaction_filter != nullptr &&
+          !compaction_filter->IgnoreSnapshots()) {
+        s = Status::NotSupported(
+            "CompactionFilter::IgnoreSnapshots() = false is not supported "
+            "anymore.");
+        return s;
+      }
+    }
+
+    new_mem = new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()),
+                           mutable_cf_options_, cfd_->write_buffer_mgr(),
+                           earliest_seqno, cfd_->GetID());
+    assert(new_mem != nullptr);
+
+    Env* env = db_options_.env;
+    assert(env);
+    MergeHelper merge(
+        env, (cfd_->internal_comparator()).user_comparator(),
+        (ioptions->merge_operator).get(), compaction_filter.get(),
+        ioptions->logger, true /* internal key corruption is not ok */,
+        existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+        snapshot_checker_);
+    assert(job_context_);
+    SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence();
+    const std::atomic<bool> kManualCompactionCanceledFalse{false};
+    CompactionIterator c_iter(
+        iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge,
+        kMaxSequenceNumber, &existing_snapshots_,
+        earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_,
+        env, ShouldReportDetailedTime(env, ioptions->stats),
+        true /* internal key corruption is not ok */, range_del_agg.get(),
+        nullptr, ioptions->allow_data_in_errors,
+        ioptions->enforce_single_del_contracts,
+        /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+        /*compaction=*/nullptr, compaction_filter.get(),
+        /*shutting_down=*/nullptr, ioptions->info_log, full_history_ts_low);
+
+    // Set earliest sequence number in the new memtable
+    // to be equal to the earliest sequence number of the
+    // memtable being flushed (See later if there is a need
+    // to update this number!).
+    new_mem->SetEarliestSequenceNumber(earliest_seqno);
+    // Likewise for first seq number.
+    new_mem->SetFirstSequenceNumber(first_seqno);
+    SequenceNumber new_first_seqno = kMaxSequenceNumber;
+
+    c_iter.SeekToFirst();
+
+    // Key transfer
+    for (; c_iter.Valid(); c_iter.Next()) {
+      const ParsedInternalKey ikey = c_iter.ikey();
+      const Slice value = c_iter.value();
+      new_first_seqno =
+          ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno;
+
+      // Should we update "OldestKeyTime" ???? -> timestamp appear
+      // to still be an "experimental" feature.
+      s = new_mem->Add(
+          ikey.sequence, ikey.type, ikey.user_key, value,
+          nullptr,   // KV protection info set as nullptr since it
+                     // should only be useful for the first add to
+                     // the original memtable.
+          false,     // : allow concurrent_memtable_writes_
+                     // Not seen as necessary for now.
+          nullptr,   // get_post_process_info(m) must be nullptr
+                     // when concurrent_memtable_writes is switched off.
+          nullptr);  // hint, only used when concurrent_memtable_writes_
+                     // is switched on.
+      if (!s.ok()) {
+        break;
+      }
+
+      // If new_mem has size greater than maxSize,
+      // then rollback to regular flush operation,
+      // and destroy new_mem.
+      if (new_mem->ApproximateMemoryUsage() > maxSize) {
+        s = Status::Aborted("Mempurge filled more than one memtable.");
+        new_mem_capacity = 1.0;
+        break;
+      }
+    }
+
+    // Check status and propagate
+    // potential error status from c_iter
+    if (!s.ok()) {
+      c_iter.status().PermitUncheckedError();
+    } else if (!c_iter.status().ok()) {
+      s = c_iter.status();
+    }
+
+    // Range tombstone transfer.
+    if (s.ok()) {
+      auto range_del_it = range_del_agg->NewIterator();
+      for (range_del_it->SeekToFirst(); range_del_it->Valid();
+           range_del_it->Next()) {
+        auto tombstone = range_del_it->Tombstone();
+        new_first_seqno =
+            tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno;
+        s = new_mem->Add(
+            tombstone.seq_,        // Sequence number
+            kTypeRangeDeletion,    // KV type
+            tombstone.start_key_,  // Key is start key.
+            tombstone.end_key_,    // Value is end key.
+            nullptr,               // KV protection info set as nullptr since it
+                                   // should only be useful for the first add to
+                                   // the original memtable.
+            false,                 // : allow concurrent_memtable_writes_
+                                   // Not seen as necessary for now.
+            nullptr,               // get_post_process_info(m) must be nullptr
+                      // when concurrent_memtable_writes is switched off.
+            nullptr);  // hint, only used when concurrent_memtable_writes_
+                       // is switched on.
+
+        if (!s.ok()) {
+          break;
+        }
+
+        // If new_mem has size greater than maxSize,
+        // then rollback to regular flush operation,
+        // and destroy new_mem.
+        if (new_mem->ApproximateMemoryUsage() > maxSize) {
+          s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
+          new_mem_capacity = 1.0;
+          break;
+        }
+      }
+    }
+
+    // If everything happened smoothly and new_mem contains valid data,
+    // decide if it is flushed to storage or kept in the imm()
+    // memtable list (memory).
+    if (s.ok() && (new_first_seqno != kMaxSequenceNumber)) {
+      // Rectify the first sequence number, which (unlike the earliest seq
+      // number) needs to be present in the new memtable.
+      new_mem->SetFirstSequenceNumber(new_first_seqno);
+
+      // The new_mem is added to the list of immutable memtables
+      // only if it filled at less than 100% capacity and isn't flagged
+      // as in need of being flushed.
+      if (new_mem->ApproximateMemoryUsage() < maxSize &&
+          !(new_mem->ShouldFlushNow())) {
+        // Construct fragmented memtable range tombstones without mutex
+        new_mem->ConstructFragmentedRangeTombstones();
+        db_mutex_->Lock();
+        uint64_t new_mem_id = mems_[0]->GetID();
+
+        new_mem->SetID(new_mem_id);
+        new_mem->SetNextLogNumber(mems_[0]->GetNextLogNumber());
+
+        // This addition will not trigger another flush, because
+        // we do not call SchedulePendingFlush().
+        cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free);
+        new_mem->Ref();
+#ifndef ROCKSDB_LITE
+        // Piggyback FlushJobInfo on the first flushed memtable.
+        db_mutex_->AssertHeld();
+        meta_.fd.file_size = 0;
+        mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif  // !ROCKSDB_LITE
+        db_mutex_->Unlock();
+      } else {
+        s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
+        new_mem_capacity = 1.0;
+        if (new_mem) {
+          job_context_->memtables_to_free.push_back(new_mem);
+        }
+      }
+    } else {
+      // In this case, the newly allocated new_mem is empty.
+      assert(new_mem != nullptr);
+      job_context_->memtables_to_free.push_back(new_mem);
+    }
+  }
+
+  // Reacquire the mutex for WriteLevel0 function.
+  db_mutex_->Lock();
+
+  // If mempurge successful, don't write input tables to level0,
+  // but write any full output table to level0.
+  if (s.ok()) {
+    TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeSuccessful");
+  } else {
+    TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful");
+  }
+  const uint64_t micros = clock_->NowMicros() - start_micros;
+  const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "[%s] [JOB %d] Mempurge lasted %" PRIu64
+                 " microseconds, and %" PRIu64
+                 " cpu "
+                 "microseconds. Status is %s ok. Perc capacity: %f\n",
+                 cfd_->GetName().c_str(), job_context_->job_id, micros,
+                 cpu_micros, s.ok() ? "" : "not", new_mem_capacity);
+
+  return s;
+}
+
+bool FlushJob::MemPurgeDecider(double threshold) {
+  // Never trigger mempurge if threshold is not a strictly positive value.
+  if (!(threshold > 0.0)) {
+    return false;
+  }
+  if (threshold > (1.0 * mems_.size())) {
+    return true;
+  }
+  // Payload and useful_payload (in bytes).
+  // The useful payload ratio of a given MemTable
+  // is estimated to be useful_payload/payload.
+  uint64_t payload = 0, useful_payload = 0, entry_size = 0;
+
+  // Local variables used repetitively inside the for-loop
+  // when iterating over the sampled entries.
+  Slice key_slice, value_slice;
+  ParsedInternalKey res;
+  SnapshotImpl min_snapshot;
+  std::string vget;
+  Status mget_s, parse_s;
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq = 0, sqno = 0,
+                 min_seqno_snapshot = 0;
+  bool get_res, can_be_useful_payload, not_in_next_mems;
+
+  // If estimated_useful_payload is > threshold,
+  // then flush to storage, else MemPurge.
+  double estimated_useful_payload = 0.0;
+  // Cochran formula for determining sample size.
+  // 95% confidence interval, 7% precision.
+  //    n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0
+  double n0 = 196.0;
+  ReadOptions ro;
+  ro.total_order_seek = true;
+
+  // Iterate over each memtable of the set.
+  for (auto mem_iter = std::begin(mems_); mem_iter != std::end(mems_);
+       mem_iter++) {
+    MemTable* mt = *mem_iter;
+
+    // Else sample from the table.
+    uint64_t nentries = mt->num_entries();
+    // Corrected Cochran formula for small populations
+    // (converges to n0 for large populations).
+    uint64_t target_sample_size =
+        static_cast<uint64_t>(ceil(n0 / (1.0 + (n0 / nentries))));
+    std::unordered_set<const char*> sentries = {};
+    // Populate sample entries set.
+    mt->UniqueRandomSample(target_sample_size, &sentries);
+
+    // Estimate the garbage ratio by comparing if
+    // each sample corresponds to a valid entry.
+    for (const char* ss : sentries) {
+      key_slice = GetLengthPrefixedSlice(ss);
+      parse_s = ParseInternalKey(key_slice, &res, true /*log_err_key*/);
+      if (!parse_s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Memtable Decider: ParseInternalKey did not parse "
+                       "key_slice %s successfully.",
+                       key_slice.data());
+      }
+
+      // Size of the entry is "key size (+ value size if KV entry)"
+      entry_size = key_slice.size();
+      if (res.type == kTypeValue) {
+        value_slice =
+            GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+        entry_size += value_slice.size();
+      }
+
+      // Count entry bytes as payload.
+      payload += entry_size;
+
+      LookupKey lkey(res.user_key, kMaxSequenceNumber);
+
+      // Paranoia: zero out these values just in case.
+      max_covering_tombstone_seq = 0;
+      sqno = 0;
+
+      // Pick the oldest existing snapshot that is more recent
+      // than the sequence number of the sampled entry.
+      min_seqno_snapshot = kMaxSequenceNumber;
+      for (SequenceNumber seq_num : existing_snapshots_) {
+        if (seq_num > res.sequence && seq_num < min_seqno_snapshot) {
+          min_seqno_snapshot = seq_num;
+        }
+      }
+      min_snapshot.number_ = min_seqno_snapshot;
+      ro.snapshot =
+          min_seqno_snapshot < kMaxSequenceNumber ? &min_snapshot : nullptr;
+
+      // Estimate if the sample entry is valid or not.
+      get_res = mt->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr,
+                        &mget_s, &merge_context, &max_covering_tombstone_seq,
+                        &sqno, ro, true /* immutable_memtable */);
+      if (!get_res) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "Memtable Get returned false when Get(sampled entry). "
+            "Yet each sample entry should exist somewhere in the memtable, "
+            "unrelated to whether it has been deleted or not.");
+      }
+
+      // TODO(bjlemaire): evaluate typeMerge.
+      // This is where the sampled entry is estimated to be
+      // garbage or not. Note that this is a garbage *estimation*
+      // because we do not include certain items such as
+      // CompactionFitlers triggered at flush, or if the same delete
+      // has been inserted twice or more in the memtable.
+
+      // Evaluate if the entry can be useful payload
+      // Situation #1: entry is a KV entry, was found in the memtable mt
+      //               and the sequence numbers match.
+      can_be_useful_payload = (res.type == kTypeValue) && get_res &&
+                              mget_s.ok() && (sqno == res.sequence);
+
+      // Situation #2: entry is a delete entry, was found in the memtable mt
+      //               (because gres==true) and no valid KV entry is found.
+      //               (note: duplicate delete entries are also taken into
+      //               account here, because the sequence number 'sqno'
+      //               in memtable->Get(&sqno) operation is set to be equal
+      //               to the most recent delete entry as well).
+      can_be_useful_payload |=
+          ((res.type == kTypeDeletion) || (res.type == kTypeSingleDeletion)) &&
+          mget_s.IsNotFound() && get_res && (sqno == res.sequence);
+
+      // If there is a chance that the entry is useful payload
+      // Verify that the entry does not appear in the following memtables
+      // (memtables with greater memtable ID/larger sequence numbers).
+      if (can_be_useful_payload) {
+        not_in_next_mems = true;
+        for (auto next_mem_iter = mem_iter + 1;
+             next_mem_iter != std::end(mems_); next_mem_iter++) {
+          if ((*next_mem_iter)
+                  ->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr,
+                        &mget_s, &merge_context, &max_covering_tombstone_seq,
+                        &sqno, ro, true /* immutable_memtable */)) {
+            not_in_next_mems = false;
+            break;
+          }
+        }
+        if (not_in_next_mems) {
+          useful_payload += entry_size;
+        }
+      }
+    }
+    if (payload > 0) {
+      // We use the estimated useful payload ratio to
+      // evaluate how many of the memtable bytes are useful bytes.
+      estimated_useful_payload +=
+          (mt->ApproximateMemoryUsage()) * (useful_payload * 1.0 / payload);
+
+      ROCKS_LOG_INFO(db_options_.info_log,
+                     "Mempurge sampling [CF %s] - found garbage ratio from "
+                     "sampling: %f. Threshold is %f\n",
+                     cfd_->GetName().c_str(),
+                     (payload - useful_payload) * 1.0 / payload, threshold);
+    } else {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Mempurge sampling: null payload measured, and collected "
+                     "sample size is %zu\n.",
+                     sentries.size());
+    }
+  }
+  // We convert the total number of useful payload bytes
+  // into the proportion of memtable necessary to store all these bytes.
+  // We compare this proportion with the threshold value.
+  return ((estimated_useful_payload / mutable_cf_options_.write_buffer_size) <
+          threshold);
+}
+
+Status FlushJob::WriteLevel0Table() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_FLUSH_WRITE_L0);
+  db_mutex_->AssertHeld();
+  const uint64_t start_micros = clock_->NowMicros();
+  const uint64_t start_cpu_micros = clock_->CPUMicros();
+  Status s;
+
+  SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber();
+  if (!db_impl_seqno_time_mapping_.Empty()) {
+    // make a local copy, as the seqno_time_mapping from db_impl is not thread
+    // safe, which will be used while not holding the db_mutex.
+    seqno_to_time_mapping_ = db_impl_seqno_time_mapping_.Copy(smallest_seqno);
+  }
+
+  std::vector<BlobFileAddition> blob_file_additions;
+
+  {
+    auto write_hint = cfd_->CalculateSSTWriteHint(0);
+    Env::IOPriority io_priority = GetRateLimiterPriorityForWrite();
+    db_mutex_->Unlock();
+    if (log_buffer_) {
+      log_buffer_->FlushBufferToLog();
+    }
+    // memtables and range_del_iters store internal iterators over each data
+    // memtable and its associated range deletion memtable, respectively, at
+    // corresponding indexes.
+    std::vector<InternalIterator*> memtables;
+    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+        range_del_iters;
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    Arena arena;
+    uint64_t total_num_entries = 0, total_num_deletes = 0;
+    uint64_t total_data_size = 0;
+    size_t total_memory_usage = 0;
+    // Used for testing:
+    uint64_t mems_size = mems_.size();
+    (void)mems_size;  // avoids unused variable error when
+                      // TEST_SYNC_POINT_CALLBACK not used.
+    TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:num_memtables",
+                             &mems_size);
+    assert(job_context_);
+    for (MemTable* m : mems_) {
+      ROCKS_LOG_INFO(
+          db_options_.info_log,
+          "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
+          cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
+      memtables.push_back(m->NewIterator(ro, &arena));
+      auto* range_del_iter = m->NewRangeTombstoneIterator(
+          ro, kMaxSequenceNumber, true /* immutable_memtable */);
+      if (range_del_iter != nullptr) {
+        range_del_iters.emplace_back(range_del_iter);
+      }
+      total_num_entries += m->num_entries();
+      total_num_deletes += m->num_deletes();
+      total_data_size += m->get_data_size();
+      total_memory_usage += m->ApproximateMemoryUsage();
+    }
+
+    event_logger_->Log() << "job" << job_context_->job_id << "event"
+                         << "flush_started"
+                         << "num_memtables" << mems_.size() << "num_entries"
+                         << total_num_entries << "num_deletes"
+                         << total_num_deletes << "total_data_size"
+                         << total_data_size << "memory_usage"
+                         << total_memory_usage << "flush_reason"
+                         << GetFlushReasonString(cfd_->GetFlushReason());
+
+    {
+      ScopedArenaIterator iter(
+          NewMergingIterator(&cfd_->internal_comparator(), memtables.data(),
+                             static_cast<int>(memtables.size()), &arena));
+      ROCKS_LOG_INFO(db_options_.info_log,
+                     "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
+                     cfd_->GetName().c_str(), job_context_->job_id,
+                     meta_.fd.GetNumber());
+
+      TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
+                               &output_compression_);
+      int64_t _current_time = 0;
+      auto status = clock_->GetCurrentTime(&_current_time);
+      // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "Failed to get current time to populate creation_time property. "
+            "Status: %s",
+            status.ToString().c_str());
+      }
+      const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+      uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime();
+
+      // It's not clear whether oldest_key_time is always available. In case
+      // it is not available, use current_time.
+      uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time);
+
+      TEST_SYNC_POINT_CALLBACK(
+          "FlushJob::WriteLevel0Table:oldest_ancester_time",
+          &oldest_ancester_time);
+      meta_.oldest_ancester_time = oldest_ancester_time;
+      meta_.file_creation_time = current_time;
+
+      uint64_t num_input_entries = 0;
+      uint64_t memtable_payload_bytes = 0;
+      uint64_t memtable_garbage_bytes = 0;
+      IOStatus io_s;
+
+      const std::string* const full_history_ts_low =
+          (full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_;
+      TableBuilderOptions tboptions(
+          *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(),
+          cfd_->int_tbl_prop_collector_factories(), output_compression_,
+          mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(),
+          0 /* level */, false /* is_bottommost */,
+          TableFileCreationReason::kFlush, oldest_key_time, current_time,
+          db_id_, db_session_id_, 0 /* target_file_size */,
+          meta_.fd.GetNumber());
+      const SequenceNumber job_snapshot_seq =
+          job_context_->GetJobSnapshotSequence();
+      s = BuildTable(
+          dbname_, versions_, db_options_, tboptions, file_options_,
+          cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_,
+          &blob_file_additions, existing_snapshots_,
+          earliest_write_conflict_snapshot_, job_snapshot_seq,
+          snapshot_checker_, mutable_cf_options_.paranoid_file_checks,
+          cfd_->internal_stats(), &io_s, io_tracer_,
+          BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_,
+          job_context_->job_id, io_priority, &table_properties_, write_hint,
+          full_history_ts_low, blob_callback_, &num_input_entries,
+          &memtable_payload_bytes, &memtable_garbage_bytes);
+      // TODO: Cleanup io_status in BuildTable and table builders
+      assert(!s.ok() || io_s.ok());
+      io_s.PermitUncheckedError();
+      if (num_input_entries != total_num_entries && s.ok()) {
+        std::string msg = "Expected " + std::to_string(total_num_entries) +
+                          " entries in memtables, but read " +
+                          std::to_string(num_input_entries);
+        ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
+                       cfd_->GetName().c_str(), job_context_->job_id,
+                       msg.c_str());
+        if (db_options_.flush_verify_memtable_count) {
+          s = Status::Corruption(msg);
+        }
+      }
+      if (tboptions.reason == TableFileCreationReason::kFlush) {
+        TEST_SYNC_POINT("DBImpl::FlushJob:Flush");
+        RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
+                   memtable_payload_bytes);
+        RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+                   memtable_garbage_bytes);
+      }
+      LogFlush(db_options_.info_log);
+    }
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
+                     " bytes %s"
+                     "%s",
+                     cfd_->GetName().c_str(), job_context_->job_id,
+                     meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
+                     s.ToString().c_str(),
+                     meta_.marked_for_compaction ? " (needs compaction)" : "");
+
+    if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
+      s = output_file_directory_->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+    }
+    TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
+    db_mutex_->Lock();
+  }
+  base_->Unref();
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  const bool has_output = meta_.fd.GetFileSize() > 0;
+
+  if (s.ok() && has_output) {
+    TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated");
+    // if we have more than 1 background thread, then we cannot
+    // insert files directly into higher levels because some other
+    // threads could be concurrently producing compacted files for
+    // that key range.
+    // Add file to L0
+    edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
+                   meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
+                   meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
+                   meta_.marked_for_compaction, meta_.temperature,
+                   meta_.oldest_blob_file_number, meta_.oldest_ancester_time,
+                   meta_.file_creation_time, meta_.file_checksum,
+                   meta_.file_checksum_func_name, meta_.unique_id);
+
+    edit_->SetBlobFileAdditions(std::move(blob_file_additions));
+  }
+#ifndef ROCKSDB_LITE
+  // Piggyback FlushJobInfo on the first first flushed memtable.
+  mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif  // !ROCKSDB_LITE
+
+  // Note that here we treat flush as level 0 compaction in internal stats
+  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+  const uint64_t micros = clock_->NowMicros() - start_micros;
+  const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
+  stats.micros = micros;
+  stats.cpu_micros = cpu_micros;
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "[%s] [JOB %d] Flush lasted %" PRIu64
+                 " microseconds, and %" PRIu64 " cpu microseconds.\n",
+                 cfd_->GetName().c_str(), job_context_->job_id, micros,
+                 cpu_micros);
+
+  if (has_output) {
+    stats.bytes_written = meta_.fd.GetFileSize();
+    stats.num_output_files = 1;
+  }
+
+  const auto& blobs = edit_->GetBlobFileAdditions();
+  for (const auto& blob : blobs) {
+    stats.bytes_written_blob += blob.GetTotalBlobBytes();
+  }
+
+  stats.num_output_files_blob = static_cast<int>(blobs.size());
+
+  RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
+  cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
+  cfd_->internal_stats()->AddCFStats(
+      InternalStats::BYTES_FLUSHED,
+      stats.bytes_written + stats.bytes_written_blob);
+  RecordFlushIOStats();
+
+  return s;
+}
+
+Env::IOPriority FlushJob::GetRateLimiterPriorityForWrite() {
+  if (versions_ && versions_->GetColumnFamilySet() &&
+      versions_->GetColumnFamilySet()->write_controller()) {
+    WriteController* write_controller =
+        versions_->GetColumnFamilySet()->write_controller();
+    if (write_controller->IsStopped() || write_controller->NeedsDelay()) {
+      return Env::IO_USER;
+    }
+  }
+
+  return Env::IO_HIGH;
+}
+
+#ifndef ROCKSDB_LITE
+std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
+  db_mutex_->AssertHeld();
+  std::unique_ptr<FlushJobInfo> info(new FlushJobInfo{});
+  info->cf_id = cfd_->GetID();
+  info->cf_name = cfd_->GetName();
+
+  const uint64_t file_number = meta_.fd.GetNumber();
+  info->file_path =
+      MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number);
+  info->file_number = file_number;
+  info->oldest_blob_file_number = meta_.oldest_blob_file_number;
+  info->thread_id = db_options_.env->GetThreadID();
+  info->job_id = job_context_->job_id;
+  info->smallest_seqno = meta_.fd.smallest_seqno;
+  info->largest_seqno = meta_.fd.largest_seqno;
+  info->table_properties = table_properties_;
+  info->flush_reason = cfd_->GetFlushReason();
+  info->blob_compression_type = mutable_cf_options_.blob_compression_type;
+
+  // Update BlobFilesInfo.
+  for (const auto& blob_file : edit_->GetBlobFileAdditions()) {
+    BlobFileAdditionInfo blob_file_addition_info(
+        BlobFileName(cfd_->ioptions()->cf_paths.front().path,
+                     blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+        blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(),
+        blob_file.GetTotalBlobBytes());
+    info->blob_file_addition_infos.emplace_back(
+        std::move(blob_file_addition_info));
+  }
+  return info;
+}
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job.h b/src/rocksdb/db/flush_job.h
new file mode 100644
index 000000000..60c272aec
--- /dev/null
+++ b/src/rocksdb/db/flush_job.h
@@ -0,0 +1,203 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/column_family.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class MemTable;
+class SnapshotChecker;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+class FlushJob {
+ public:
+  // TODO(icanadi) make effort to reduce number of parameters here
+  // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
+  FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+           const ImmutableDBOptions& db_options,
+           const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
+           const FileOptions& file_options, VersionSet* versions,
+           InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
+           std::vector<SequenceNumber> existing_snapshots,
+           SequenceNumber earliest_write_conflict_snapshot,
+           SnapshotChecker* snapshot_checker, JobContext* job_context,
+           LogBuffer* log_buffer, FSDirectory* db_directory,
+           FSDirectory* output_file_directory,
+           CompressionType output_compression, Statistics* stats,
+           EventLogger* event_logger, bool measure_io_stats,
+           const bool sync_output_directory, const bool write_manifest,
+           Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+           const SeqnoToTimeMapping& seq_time_mapping,
+           const std::string& db_id = "", const std::string& db_session_id = "",
+           std::string full_history_ts_low = "",
+           BlobFileCompletionCallback* blob_callback = nullptr);
+
+  ~FlushJob();
+
+  // Require db_mutex held.
+  // Once PickMemTable() is called, either Run() or Cancel() has to be called.
+  void PickMemTable();
+  Status Run(LogsWithPrepTracker* prep_tracker = nullptr,
+             FileMetaData* file_meta = nullptr,
+             bool* switched_to_mempurge = nullptr);
+  void Cancel();
+  const autovector<MemTable*>& GetMemTables() const { return mems_; }
+
+#ifndef ROCKSDB_LITE
+  std::list<std::unique_ptr<FlushJobInfo>>* GetCommittedFlushJobsInfo() {
+    return &committed_flush_jobs_info_;
+  }
+#endif  // !ROCKSDB_LITE
+
+ private:
+  friend class FlushJobTest_GetRateLimiterPriorityForWrite_Test;
+
+  void ReportStartedFlush();
+  void ReportFlushInputSize(const autovector<MemTable*>& mems);
+  void RecordFlushIOStats();
+  Status WriteLevel0Table();
+
+  // Memtable Garbage Collection algorithm: a MemPurge takes the list
+  // of immutable memtables and filters out (or "purge") the outdated bytes
+  // out of it. The output (the filtered bytes, or "useful payload") is
+  // then transfered into a new memtable. If this memtable is filled, then
+  // the mempurge is aborted and rerouted to a regular flush process. Else,
+  // depending on the heuristics, placed onto the immutable memtable list.
+  // The addition to the imm list will not trigger a flush operation. The
+  // flush of the imm list will instead be triggered once the mutable memtable
+  // is added to the imm list.
+  // This process is typically intended for workloads with heavy overwrites
+  // when we want to avoid SSD writes (and reads) as much as possible.
+  // "MemPurge" is an experimental feature still at a very early stage
+  // of development. At the moment it is only compatible with the Get, Put,
+  // Delete operations as well as Iterators and CompactionFilters.
+  // For this early version, "MemPurge" is called by setting the
+  // options.experimental_mempurge_threshold value as >0.0. When this is
+  // the case, ALL automatic flush operations (kWRiteBufferManagerFull) will
+  // first go through the MemPurge process. Therefore, we strongly
+  // recommend all users not to set this flag as true given that the MemPurge
+  // process has not matured yet.
+  Status MemPurge();
+  bool MemPurgeDecider(double threshold);
+  // The rate limiter priority (io_priority) is determined dynamically here.
+  Env::IOPriority GetRateLimiterPriorityForWrite();
+#ifndef ROCKSDB_LITE
+  std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
+#endif  // !ROCKSDB_LITE
+
+  const std::string& dbname_;
+  const std::string db_id_;
+  const std::string db_session_id_;
+  ColumnFamilyData* cfd_;
+  const ImmutableDBOptions& db_options_;
+  const MutableCFOptions& mutable_cf_options_;
+  // A variable storing the largest memtable id to flush in this
+  // flush job. RocksDB uses this variable to select the memtables to flush in
+  // this job. All memtables in this column family with an ID smaller than or
+  // equal to max_memtable_id_ will be selected for flush.
+  uint64_t max_memtable_id_;
+  const FileOptions file_options_;
+  VersionSet* versions_;
+  InstrumentedMutex* db_mutex_;
+  std::atomic<bool>* shutting_down_;
+  std::vector<SequenceNumber> existing_snapshots_;
+  SequenceNumber earliest_write_conflict_snapshot_;
+  SnapshotChecker* snapshot_checker_;
+  JobContext* job_context_;
+  LogBuffer* log_buffer_;
+  FSDirectory* db_directory_;
+  FSDirectory* output_file_directory_;
+  CompressionType output_compression_;
+  Statistics* stats_;
+  EventLogger* event_logger_;
+  TableProperties table_properties_;
+  bool measure_io_stats_;
+  // True if this flush job should call fsync on the output directory. False
+  // otherwise.
+  // Usually sync_output_directory_ is true. A flush job needs to call sync on
+  // the output directory before committing to the MANIFEST.
+  // However, an individual flush job does not have to call sync on the output
+  // directory if it is part of an atomic flush. After all flush jobs in the
+  // atomic flush succeed, call sync once on each distinct output directory.
+  const bool sync_output_directory_;
+  // True if this flush job should write to MANIFEST after successfully
+  // flushing memtables. False otherwise.
+  // Usually write_manifest_ is true. A flush job commits to the MANIFEST after
+  // flushing the memtables.
+  // However, an individual flush job cannot rashly write to the MANIFEST
+  // immediately after it finishes the flush if it is part of an atomic flush.
+  // In this case, only after all flush jobs succeed in flush can RocksDB
+  // commit to the MANIFEST.
+  const bool write_manifest_;
+  // The current flush job can commit flush result of a concurrent flush job.
+  // We collect FlushJobInfo of all jobs committed by current job and fire
+  // OnFlushCompleted for them.
+  std::list<std::unique_ptr<FlushJobInfo>> committed_flush_jobs_info_;
+
+  // Variables below are set by PickMemTable():
+  FileMetaData meta_;
+  autovector<MemTable*> mems_;
+  VersionEdit* edit_;
+  Version* base_;
+  bool pick_memtable_called;
+  Env::Priority thread_pri_;
+
+  const std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+
+  const std::string full_history_ts_low_;
+  BlobFileCompletionCallback* blob_callback_;
+
+  // reference to the seqno_time_mapping_ in db_impl.h, not safe to read without
+  // db mutex
+  const SeqnoToTimeMapping& db_impl_seqno_time_mapping_;
+  SeqnoToTimeMapping seqno_to_time_mapping_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job_test.cc b/src/rocksdb/db/flush_job_test.cc
new file mode 100644
index 000000000..f994b4e9b
--- /dev/null
+++ b/src/rocksdb/db/flush_job_test.cc
@@ -0,0 +1,745 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/flush_job.h"
+
+#include <algorithm>
+#include <array>
+#include <map>
+#include <string>
+
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) Mock out everything else:
+// 1. VersionSet
+// 2. Memtable
+class FlushJobTestBase : public testing::Test {
+ protected:
+  FlushJobTestBase(std::string dbname, const Comparator* ucmp)
+      : env_(Env::Default()),
+        fs_(env_->GetFileSystem()),
+        dbname_(std::move(dbname)),
+        ucmp_(ucmp),
+        options_(),
+        db_options_(options_),
+        column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}),
+        table_cache_(NewLRUCache(50000, 16)),
+        write_buffer_manager_(db_options_.db_write_buffer_size),
+        shutting_down_(false),
+        mock_table_factory_(new mock::MockTableFactory()) {}
+
+  virtual ~FlushJobTestBase() {
+    if (getenv("KEEP_DB")) {
+      fprintf(stdout, "db is still in %s\n", dbname_.c_str());
+    } else {
+      // destroy versions_ to release all file handles
+      versions_.reset();
+      EXPECT_OK(DestroyDir(env_, dbname_));
+    }
+  }
+
+  void NewDB() {
+    ASSERT_OK(SetIdentityFile(env_, dbname_));
+    VersionEdit new_db;
+
+    new_db.SetLogNumber(0);
+    new_db.SetNextFile(2);
+    new_db.SetLastSequence(0);
+
+    autovector<VersionEdit> new_cfs;
+    SequenceNumber last_seq = 1;
+    uint32_t cf_id = 1;
+    for (size_t i = 1; i != column_family_names_.size(); ++i) {
+      VersionEdit new_cf;
+      new_cf.AddColumnFamily(column_family_names_[i]);
+      new_cf.SetColumnFamily(cf_id++);
+      new_cf.SetComparatorName(ucmp_->Name());
+      new_cf.SetLogNumber(0);
+      new_cf.SetNextFile(2);
+      new_cf.SetLastSequence(last_seq++);
+      new_cfs.emplace_back(new_cf);
+    }
+
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    const auto& fs = env_->GetFileSystem();
+    std::unique_ptr<WritableFileWriter> file_writer;
+    Status s = WritableFileWriter::Create(
+        fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+        nullptr);
+    ASSERT_OK(s);
+
+    {
+      log::Writer log(std::move(file_writer), 0, false);
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = log.AddRecord(record);
+      ASSERT_OK(s);
+
+      for (const auto& e : new_cfs) {
+        record.clear();
+        e.EncodeTo(&record);
+        s = log.AddRecord(record);
+        ASSERT_OK(s);
+      }
+    }
+    ASSERT_OK(s);
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+    ASSERT_OK(s);
+  }
+
+  void SetUp() override {
+    EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+
+    // TODO(icanadi) Remove this once we mock out VersionSet
+    NewDB();
+
+    db_options_.env = env_;
+    db_options_.fs = fs_;
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    db_options_.statistics = CreateDBStatistics();
+
+    cf_options_.comparator = ucmp_;
+
+    std::vector<ColumnFamilyDescriptor> column_families;
+    cf_options_.table_factory = mock_table_factory_;
+    for (const auto& cf_name : column_family_names_) {
+      column_families.emplace_back(cf_name, cf_options_);
+    }
+
+    versions_.reset(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    EXPECT_OK(versions_->Recover(column_families, false));
+  }
+
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  std::string dbname_;
+  const Comparator* const ucmp_;
+  EnvOptions env_options_;
+  Options options_;
+  ImmutableDBOptions db_options_;
+  const std::vector<std::string> column_family_names_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  WriteBufferManager write_buffer_manager_;
+  ColumnFamilyOptions cf_options_;
+  std::unique_ptr<VersionSet> versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+
+  SeqnoToTimeMapping empty_seqno_to_time_mapping_;
+};
+
+class FlushJobTest : public FlushJobTestBase {
+ public:
+  FlushJobTest()
+      : FlushJobTestBase(test::PerThreadDBPath("flush_job_test"),
+                         BytewiseComparator()) {}
+};
+
+TEST_F(FlushJobTest, Empty) {
+  JobContext job_context(0);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     std::numeric_limits<uint64_t>::max() /* memtable_id */,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     {}, kMaxSequenceNumber, snapshot_checker, &job_context,
+                     nullptr, nullptr, nullptr, kNoCompression, nullptr,
+                     &event_logger, false, true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER,
+                     nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    flush_job.PickMemTable();
+    ASSERT_OK(flush_job.Run());
+  }
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, NonEmpty) {
+  JobContext job_context(0);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                           kMaxSequenceNumber);
+  new_mem->Ref();
+  auto inserted_keys = mock::MakeMockFile();
+  // Test data:
+  //   seqno [    1,    2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ]
+  //   key   [ 1001, 1002 ... 9998, 9999,    0,    1,    2 ...  999 ]
+  //   range-delete "9995" -> "9999" at seqno 10000
+  //   blob references with seqnos 10001..10006
+  for (int i = 1; i < 10000; ++i) {
+    std::string key(std::to_string((i + 1000) % 10000));
+    std::string value("value" + key);
+    ASSERT_OK(new_mem->Add(SequenceNumber(i), kTypeValue, key, value,
+                           nullptr /* kv_prot_info */));
+    if ((i + 1000) % 10000 < 9995) {
+      InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
+      inserted_keys.push_back({internal_key.Encode().ToString(), value});
+    }
+  }
+
+  {
+    ASSERT_OK(new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995",
+                           "9999a", nullptr /* kv_prot_info */));
+    InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion);
+    inserted_keys.push_back({internal_key.Encode().ToString(), "9999a"});
+  }
+
+  // Note: the first two blob references will not be considered when resolving
+  // the oldest blob file referenced (the first one is inlined TTL, while the
+  // second one is TTL and thus points to a TTL blob file).
+  constexpr std::array<uint64_t, 6> blob_file_numbers{
+      {kInvalidBlobFileNumber, 5, 103, 17, 102, 101}};
+  for (size_t i = 0; i < blob_file_numbers.size(); ++i) {
+    std::string key(std::to_string(i + 10001));
+    std::string blob_index;
+    if (i == 0) {
+      BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL,
+                                  "foo");
+    } else if (i == 1) {
+      BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL,
+                               blob_file_numbers[i], /* offset */ i << 10,
+                               /* size */ i << 20, kNoCompression);
+    } else {
+      BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i],
+                            /* offset */ i << 10, /* size */ i << 20,
+                            kNoCompression);
+    }
+
+    const SequenceNumber seq(i + 10001);
+    ASSERT_OK(new_mem->Add(seq, kTypeBlobIndex, key, blob_index,
+                           nullptr /* kv_prot_info */));
+
+    InternalKey internal_key(key, seq, kTypeBlobIndex);
+    inserted_keys.push_back({internal_key.Encode().ToString(), blob_index});
+  }
+  mock::SortKVVector(&inserted_keys);
+
+  autovector<MemTable*> to_delete;
+  new_mem->ConstructFragmentedRangeTombstones();
+  cfd->imm()->Add(new_mem, &to_delete);
+  for (auto& m : to_delete) {
+    delete m;
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
+  FlushJob flush_job(
+      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+      *cfd->GetLatestMutableCFOptions(),
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+      snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
+      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+
+  HistogramData hist;
+  FileMetaData file_meta;
+  mutex_.Lock();
+  flush_job.PickMemTable();
+  ASSERT_OK(flush_job.Run(nullptr, &file_meta));
+  mutex_.Unlock();
+  db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+  ASSERT_GT(hist.average, 0.0);
+
+  ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString());
+  ASSERT_EQ("9999a", file_meta.largest.user_key().ToString());
+  ASSERT_EQ(1, file_meta.fd.smallest_seqno);
+  ASSERT_EQ(10006, file_meta.fd.largest_seqno);
+  ASSERT_EQ(17, file_meta.oldest_blob_file_number);
+  mock_table_factory_->AssertSingleFile(inserted_keys);
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
+  const size_t num_mems = 2;
+  const size_t num_mems_to_flush = 1;
+  const size_t num_keys_per_table = 100;
+  JobContext job_context(0);
+  ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+  std::vector<uint64_t> memtable_ids;
+  std::vector<MemTable*> new_mems;
+  for (size_t i = 0; i != num_mems; ++i) {
+    MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                              kMaxSequenceNumber);
+    mem->SetID(i);
+    mem->Ref();
+    new_mems.emplace_back(mem);
+    memtable_ids.push_back(mem->GetID());
+
+    for (size_t j = 0; j < num_keys_per_table; ++j) {
+      std::string key(std::to_string(j + i * num_keys_per_table));
+      std::string value("value" + key);
+      ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue,
+                         key, value, nullptr /* kv_prot_info */));
+    }
+  }
+
+  autovector<MemTable*> to_delete;
+  for (auto mem : new_mems) {
+    mem->ConstructFragmentedRangeTombstones();
+    cfd->imm()->Add(mem, &to_delete);
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
+
+  assert(memtable_ids.size() == num_mems);
+  uint64_t smallest_memtable_id = memtable_ids.front();
+  uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+  FlushJob flush_job(
+      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+      *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
+      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+      snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
+      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+  HistogramData hist;
+  FileMetaData file_meta;
+  mutex_.Lock();
+  flush_job.PickMemTable();
+  ASSERT_OK(flush_job.Run(nullptr /* prep_tracker */, &file_meta));
+  mutex_.Unlock();
+  db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+  ASSERT_GT(hist.average, 0.0);
+
+  ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString());
+  ASSERT_EQ("99", file_meta.largest.user_key().ToString());
+  ASSERT_EQ(0, file_meta.fd.smallest_seqno);
+  ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1),
+            file_meta.fd.largest_seqno);
+  ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number);
+
+  for (auto m : to_delete) {
+    delete m;
+  }
+  to_delete.clear();
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
+  autovector<ColumnFamilyData*> all_cfds;
+  for (auto cfd : *versions_->GetColumnFamilySet()) {
+    all_cfds.push_back(cfd);
+  }
+  const std::vector<size_t> num_memtables = {2, 1, 3};
+  assert(num_memtables.size() == column_family_names_.size());
+  const size_t num_keys_per_memtable = 1000;
+  JobContext job_context(0);
+  std::vector<uint64_t> memtable_ids;
+  std::vector<SequenceNumber> smallest_seqs;
+  std::vector<SequenceNumber> largest_seqs;
+  autovector<MemTable*> to_delete;
+  SequenceNumber curr_seqno = 0;
+  size_t k = 0;
+  for (auto cfd : all_cfds) {
+    smallest_seqs.push_back(curr_seqno);
+    for (size_t i = 0; i != num_memtables[k]; ++i) {
+      MemTable* mem = cfd->ConstructNewMemtable(
+          *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+      mem->SetID(i);
+      mem->Ref();
+
+      for (size_t j = 0; j != num_keys_per_memtable; ++j) {
+        std::string key(std::to_string(j + i * num_keys_per_memtable));
+        std::string value("value" + key);
+        ASSERT_OK(mem->Add(curr_seqno++, kTypeValue, key, value,
+                           nullptr /* kv_prot_info */));
+      }
+      mem->ConstructFragmentedRangeTombstones();
+      cfd->imm()->Add(mem, &to_delete);
+    }
+    largest_seqs.push_back(curr_seqno - 1);
+    memtable_ids.push_back(num_memtables[k++] - 1);
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+  std::vector<std::unique_ptr<FlushJob>> flush_jobs;
+  k = 0;
+  for (auto cfd : all_cfds) {
+    std::vector<SequenceNumber> snapshot_seqs;
+    flush_jobs.emplace_back(new FlushJob(
+        dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+        memtable_ids[k], env_options_, versions_.get(), &mutex_,
+        &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker,
+        &job_context, nullptr, nullptr, nullptr, kNoCompression,
+        db_options_.statistics.get(), &event_logger, true,
+        false /* sync_output_directory */, false /* write_manifest */,
+        Env::Priority::USER, nullptr /*IOTracer*/,
+        empty_seqno_to_time_mapping_));
+    k++;
+  }
+  HistogramData hist;
+  std::vector<FileMetaData> file_metas;
+  // Call reserve to avoid auto-resizing
+  file_metas.reserve(flush_jobs.size());
+  mutex_.Lock();
+  for (auto& job : flush_jobs) {
+    job->PickMemTable();
+  }
+  for (auto& job : flush_jobs) {
+    FileMetaData meta;
+    // Run will release and re-acquire  mutex
+    ASSERT_OK(job->Run(nullptr /**/, &meta));
+    file_metas.emplace_back(meta);
+  }
+  autovector<FileMetaData*> file_meta_ptrs;
+  for (auto& meta : file_metas) {
+    file_meta_ptrs.push_back(&meta);
+  }
+  autovector<const autovector<MemTable*>*> mems_list;
+  for (size_t i = 0; i != all_cfds.size(); ++i) {
+    const auto& mems = flush_jobs[i]->GetMemTables();
+    mems_list.push_back(&mems);
+  }
+  autovector<const MutableCFOptions*> mutable_cf_options_list;
+  for (auto cfd : all_cfds) {
+    mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+  }
+  autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+      committed_flush_jobs_info;
+#ifndef ROCKSDB_LITE
+  for (auto& job : flush_jobs) {
+    committed_flush_jobs_info.push_back(job->GetCommittedFlushJobsInfo());
+  }
+#endif  //! ROCKSDB_LITE
+
+  Status s = InstallMemtableAtomicFlushResults(
+      nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list,
+      versions_.get(), nullptr /* prep_tracker */, &mutex_, file_meta_ptrs,
+      committed_flush_jobs_info, &job_context.memtables_to_free,
+      nullptr /* db_directory */, nullptr /* log_buffer */);
+  ASSERT_OK(s);
+
+  mutex_.Unlock();
+  db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+  ASSERT_GT(hist.average, 0.0);
+  k = 0;
+  for (const auto& file_meta : file_metas) {
+    ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString());
+    ASSERT_EQ("999", file_meta.largest.user_key()
+                         .ToString());  // max key by bytewise comparator
+    ASSERT_EQ(smallest_seqs[k], file_meta.fd.smallest_seqno);
+    ASSERT_EQ(largest_seqs[k], file_meta.fd.largest_seqno);
+    // Verify that imm is empty
+    ASSERT_EQ(std::numeric_limits<uint64_t>::max(),
+              all_cfds[k]->imm()->GetEarliestMemTableID());
+    ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID());
+    ++k;
+  }
+
+  for (auto m : to_delete) {
+    delete m;
+  }
+  to_delete.clear();
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, Snapshots) {
+  JobContext job_context(0);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                           kMaxSequenceNumber);
+
+  std::set<SequenceNumber> snapshots_set;
+  int keys = 10000;
+  int max_inserts_per_keys = 8;
+
+  Random rnd(301);
+  for (int i = 0; i < keys / 2; ++i) {
+    snapshots_set.insert(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1);
+  }
+  // set has already removed the duplicate snapshots
+  std::vector<SequenceNumber> snapshots(snapshots_set.begin(),
+                                        snapshots_set.end());
+
+  new_mem->Ref();
+  SequenceNumber current_seqno = 0;
+  auto inserted_keys = mock::MakeMockFile();
+  for (int i = 1; i < keys; ++i) {
+    std::string key(std::to_string(i));
+    int insertions = rnd.Uniform(max_inserts_per_keys);
+    for (int j = 0; j < insertions; ++j) {
+      std::string value(rnd.HumanReadableString(10));
+      auto seqno = ++current_seqno;
+      ASSERT_OK(new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value,
+                             nullptr /* kv_prot_info */));
+      // a key is visible only if:
+      // 1. it's the last one written (j == insertions - 1)
+      // 2. there's a snapshot pointing at it
+      bool visible = (j == insertions - 1) ||
+                     (snapshots_set.find(seqno) != snapshots_set.end());
+      if (visible) {
+        InternalKey internal_key(key, seqno, kTypeValue);
+        inserted_keys.push_back({internal_key.Encode().ToString(), value});
+      }
+    }
+  }
+  mock::SortKVVector(&inserted_keys);
+
+  autovector<MemTable*> to_delete;
+  new_mem->ConstructFragmentedRangeTombstones();
+  cfd->imm()->Add(new_mem, &to_delete);
+  for (auto& m : to_delete) {
+    delete m;
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
+  FlushJob flush_job(
+      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+      *cfd->GetLatestMutableCFOptions(),
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+      versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+      snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
+      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+  mutex_.Lock();
+  flush_job.PickMemTable();
+  ASSERT_OK(flush_job.Run());
+  mutex_.Unlock();
+  mock_table_factory_->AssertSingleFile(inserted_keys);
+  HistogramData hist;
+  db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+  ASSERT_GT(hist.average, 0.0);
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) {
+  // Prepare a FlushJob that flush MemTables of Single Column Family.
+  const size_t num_mems = 2;
+  const size_t num_mems_to_flush = 1;
+  const size_t num_keys_per_table = 100;
+  JobContext job_context(0);
+  ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+  std::vector<uint64_t> memtable_ids;
+  std::vector<MemTable*> new_mems;
+  for (size_t i = 0; i != num_mems; ++i) {
+    MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                              kMaxSequenceNumber);
+    mem->SetID(i);
+    mem->Ref();
+    new_mems.emplace_back(mem);
+    memtable_ids.push_back(mem->GetID());
+
+    for (size_t j = 0; j < num_keys_per_table; ++j) {
+      std::string key(std::to_string(j + i * num_keys_per_table));
+      std::string value("value" + key);
+      ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue,
+                         key, value, nullptr /* kv_prot_info */));
+    }
+  }
+
+  autovector<MemTable*> to_delete;
+  for (auto mem : new_mems) {
+    mem->ConstructFragmentedRangeTombstones();
+    cfd->imm()->Add(mem, &to_delete);
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
+
+  assert(memtable_ids.size() == num_mems);
+  uint64_t smallest_memtable_id = memtable_ids.front();
+  uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+  FlushJob flush_job(
+      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+      *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
+      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+      snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
+      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+
+  // When the state from WriteController is normal.
+  ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_HIGH);
+
+  WriteController* write_controller =
+      flush_job.versions_->GetColumnFamilySet()->write_controller();
+
+  {
+    // When the state from WriteController is Delayed.
+    std::unique_ptr<WriteControllerToken> delay_token =
+        write_controller->GetDelayToken(1000000);
+    ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER);
+  }
+
+  {
+    // When the state from WriteController is Stopped.
+    std::unique_ptr<WriteControllerToken> stop_token =
+        write_controller->GetStopToken();
+    ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER);
+  }
+}
+
+class FlushJobTimestampTest : public FlushJobTestBase {
+ public:
+  FlushJobTimestampTest()
+      : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"),
+                         test::BytewiseComparatorWithU64TsWrapper()) {}
+
+  void AddKeyValueToMemtable(MemTable* memtable, std::string key, uint64_t ts,
+                             SequenceNumber seq, ValueType value_type,
+                             Slice value) {
+    std::string key_str(std::move(key));
+    PutFixed64(&key_str, ts);
+    ASSERT_OK(memtable->Add(seq, value_type, key_str, value,
+                            nullptr /* kv_prot_info */));
+  }
+
+ protected:
+  static constexpr uint64_t kStartTs = 10;
+  static constexpr SequenceNumber kStartSeq = 0;
+  SequenceNumber curr_seq_{kStartSeq};
+  std::atomic<uint64_t> curr_ts_{kStartTs};
+};
+
+TEST_F(FlushJobTimestampTest, AllKeysExpired) {
+  ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+  autovector<MemTable*> to_delete;
+
+  {
+    MemTable* new_mem = cfd->ConstructNewMemtable(
+        *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+    new_mem->Ref();
+    for (int i = 0; i < 100; ++i) {
+      uint64_t ts = curr_ts_.fetch_add(1);
+      SequenceNumber seq = (curr_seq_++);
+      AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
+                            ValueType::kTypeValue, "0_value");
+    }
+    uint64_t ts = curr_ts_.fetch_add(1);
+    SequenceNumber seq = (curr_seq_++);
+    AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
+                          ValueType::kTypeDeletionWithTimestamp, "");
+    new_mem->ConstructFragmentedRangeTombstones();
+    cfd->imm()->Add(new_mem, &to_delete);
+  }
+
+  std::vector<SequenceNumber> snapshots;
+  constexpr SnapshotChecker* const snapshot_checker = nullptr;
+  JobContext job_context(0);
+  EventLogger event_logger(db_options_.info_log.get());
+  std::string full_history_ts_low;
+  PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+  FlushJob flush_job(
+      dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+      versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+      snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
+      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
+      /*db_id=*/"",
+      /*db_session_id=*/"", full_history_ts_low);
+
+  FileMetaData fmeta;
+  mutex_.Lock();
+  flush_job.PickMemTable();
+  ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta));
+  mutex_.Unlock();
+
+  {
+    std::string key = test::EncodeInt(0);
+    key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1));
+    InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp);
+    ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode());
+    ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode());
+  }
+
+  job_context.Clean();
+  ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(FlushJobTimestampTest, NoKeyExpired) {
+  ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+  autovector<MemTable*> to_delete;
+
+  {
+    MemTable* new_mem = cfd->ConstructNewMemtable(
+        *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+    new_mem->Ref();
+    for (int i = 0; i < 100; ++i) {
+      uint64_t ts = curr_ts_.fetch_add(1);
+      SequenceNumber seq = (curr_seq_++);
+      AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
+                            ValueType::kTypeValue, "0_value");
+    }
+    new_mem->ConstructFragmentedRangeTombstones();
+    cfd->imm()->Add(new_mem, &to_delete);
+  }
+
+  std::vector<SequenceNumber> snapshots;
+  SnapshotChecker* const snapshot_checker = nullptr;
+  JobContext job_context(0);
+  EventLogger event_logger(db_options_.info_log.get());
+  std::string full_history_ts_low;
+  PutFixed64(&full_history_ts_low, 0);
+  FlushJob flush_job(
+      dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+      versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+      snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
+      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
+      /*db_id=*/"",
+      /*db_session_id=*/"", full_history_ts_low);
+
+  FileMetaData fmeta;
+  mutex_.Lock();
+  flush_job.PickMemTable();
+  ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta));
+  mutex_.Unlock();
+
+  {
+    std::string ukey = test::EncodeInt(0);
+    std::string smallest_key =
+        ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1);
+    std::string largest_key = ukey + test::EncodeInt(kStartTs);
+    InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue);
+    InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue);
+    ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode());
+    ASSERT_EQ(largest.Encode(), fmeta.largest.Encode());
+  }
+  job_context.Clean();
+  ASSERT_TRUE(to_delete.empty());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_scheduler.cc b/src/rocksdb/db/flush_scheduler.cc
new file mode 100644
index 000000000..6f4d3e1a5
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.cc
@@ -0,0 +1,86 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/flush_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+#ifndef NDEBUG
+  {
+    std::lock_guard<std::mutex> lock(checking_mutex_);
+    assert(checking_set_.count(cfd) == 0);
+    checking_set_.insert(cfd);
+  }
+#endif  // NDEBUG
+  cfd->Ref();
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
+  Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)};
+  while (!head_.compare_exchange_strong(
+      node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) {
+    // failing CAS updates the first param, so we are already set for
+    // retry.  TakeNextColumnFamily won't happen until after another
+    // inter-thread synchronization, so we don't even need release
+    // semantics for this CAS
+  }
+#endif  // __clang_analyzer__
+}
+
+ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
+  while (true) {
+    if (head_.load(std::memory_order_relaxed) == nullptr) {
+      return nullptr;
+    }
+
+    // dequeue the head
+    Node* node = head_.load(std::memory_order_relaxed);
+    head_.store(node->next, std::memory_order_relaxed);
+    ColumnFamilyData* cfd = node->column_family;
+    delete node;
+
+#ifndef NDEBUG
+    {
+      std::lock_guard<std::mutex> lock(checking_mutex_);
+      auto iter = checking_set_.find(cfd);
+      assert(iter != checking_set_.end());
+      checking_set_.erase(iter);
+    }
+#endif  // NDEBUG
+
+    if (!cfd->IsDropped()) {
+      // success
+      return cfd;
+    }
+
+    // no longer relevant, retry
+    cfd->UnrefAndTryDelete();
+  }
+}
+
+bool FlushScheduler::Empty() {
+  auto rv = head_.load(std::memory_order_relaxed) == nullptr;
+#ifndef NDEBUG
+  std::lock_guard<std::mutex> lock(checking_mutex_);
+  // Empty is allowed to be called concurrnetly with ScheduleFlush. It would
+  // only miss the recent schedules.
+  assert((rv == checking_set_.empty()) || rv);
+#endif  // NDEBUG
+  return rv;
+}
+
+void FlushScheduler::Clear() {
+  ColumnFamilyData* cfd;
+  while ((cfd = TakeNextColumnFamily()) != nullptr) {
+    cfd->UnrefAndTryDelete();
+  }
+  assert(head_.load(std::memory_order_relaxed) == nullptr);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_scheduler.h b/src/rocksdb/db/flush_scheduler.h
new file mode 100644
index 000000000..eb03f3e11
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.h
@@ -0,0 +1,55 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <mutex>
+#include <set>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// FlushScheduler keeps track of all column families whose memtable may
+// be full and require flushing. Unless otherwise noted, all methods on
+// FlushScheduler should be called only with the DB mutex held or from
+// a single-threaded recovery context.
+class FlushScheduler {
+ public:
+  FlushScheduler() : head_(nullptr) {}
+
+  // May be called from multiple threads at once, but not concurrent with
+  // any other method calls on this instance
+  void ScheduleWork(ColumnFamilyData* cfd);
+
+  // Removes and returns Ref()-ed column family. Client needs to Unref().
+  // Filters column families that have been dropped.
+  ColumnFamilyData* TakeNextColumnFamily();
+
+  // This can be called concurrently with ScheduleWork but it would miss all
+  // the scheduled flushes after the last synchronization. This would result
+  // into less precise enforcement of memtable sizes but should not matter much.
+  bool Empty();
+
+  void Clear();
+
+ private:
+  struct Node {
+    ColumnFamilyData* column_family;
+    Node* next;
+  };
+
+  std::atomic<Node*> head_;
+#ifndef NDEBUG
+  std::mutex checking_mutex_;
+  std::set<ColumnFamilyData*> checking_set_;
+#endif  // NDEBUG
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/forward_iterator.cc b/src/rocksdb/db/forward_iterator.cc
new file mode 100644
index 000000000..3fbc2cf47
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.cc
@@ -0,0 +1,1062 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/forward_iterator.h"
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Usage:
+//     ForwardLevelIterator iter;
+//     iter.SetFileIndex(file_index);
+//     iter.Seek(target); // or iter.SeekToFirst();
+//     iter.Next()
+class ForwardLevelIterator : public InternalIterator {
+ public:
+  ForwardLevelIterator(
+      const ColumnFamilyData* const cfd, const ReadOptions& read_options,
+      const std::vector<FileMetaData*>& files,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      bool allow_unprepared_value)
+      : cfd_(cfd),
+        read_options_(read_options),
+        files_(files),
+        valid_(false),
+        file_index_(std::numeric_limits<uint32_t>::max()),
+        file_iter_(nullptr),
+        pinned_iters_mgr_(nullptr),
+        prefix_extractor_(prefix_extractor),
+        allow_unprepared_value_(allow_unprepared_value) {
+    status_.PermitUncheckedError();  // Allow uninitialized status through
+  }
+
+  ~ForwardLevelIterator() override {
+    // Reset current pointer
+    if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+      pinned_iters_mgr_->PinIterator(file_iter_);
+    } else {
+      delete file_iter_;
+    }
+  }
+
+  void SetFileIndex(uint32_t file_index) {
+    assert(file_index < files_.size());
+    status_ = Status::OK();
+    if (file_index != file_index_) {
+      file_index_ = file_index;
+      Reset();
+    }
+  }
+  void Reset() {
+    assert(file_index_ < files_.size());
+
+    // Reset current pointer
+    if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+      pinned_iters_mgr_->PinIterator(file_iter_);
+    } else {
+      delete file_iter_;
+    }
+
+    ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+                                         kMaxSequenceNumber /* upper_bound */);
+    file_iter_ = cfd_->table_cache()->NewIterator(
+        read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
+        *files_[file_index_],
+        read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+        prefix_extractor_, /*table_reader_ptr=*/nullptr,
+        /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator,
+        /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
+        /*max_file_size_for_l0_meta_pin=*/0,
+        /*smallest_compaction_key=*/nullptr,
+        /*largest_compaction_key=*/nullptr, allow_unprepared_value_);
+    file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+    valid_ = false;
+    if (!range_del_agg.IsEmpty()) {
+      status_ = Status::NotSupported(
+          "Range tombstones unsupported with ForwardIterator");
+    }
+  }
+  void SeekToLast() override {
+    status_ = Status::NotSupported("ForwardLevelIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() override {
+    status_ = Status::NotSupported("ForwardLevelIterator::Prev()");
+    valid_ = false;
+  }
+  bool Valid() const override { return valid_; }
+  void SeekToFirst() override {
+    assert(file_iter_ != nullptr);
+    if (!status_.ok()) {
+      assert(!valid_);
+      return;
+    }
+    file_iter_->SeekToFirst();
+    valid_ = file_iter_->Valid();
+  }
+  void Seek(const Slice& internal_key) override {
+    assert(file_iter_ != nullptr);
+
+    // This deviates from the usual convention for InternalIterator::Seek() in
+    // that it doesn't discard pre-existing error status. That's because this
+    // Seek() is only supposed to be called immediately after SetFileIndex()
+    // (which discards pre-existing error status), and SetFileIndex() may set
+    // an error status, which we shouldn't discard.
+    if (!status_.ok()) {
+      assert(!valid_);
+      return;
+    }
+
+    file_iter_->Seek(internal_key);
+    valid_ = file_iter_->Valid();
+  }
+  void SeekForPrev(const Slice& /*internal_key*/) override {
+    status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()");
+    valid_ = false;
+  }
+  void Next() override {
+    assert(valid_);
+    file_iter_->Next();
+    for (;;) {
+      valid_ = file_iter_->Valid();
+      if (!file_iter_->status().ok()) {
+        assert(!valid_);
+        return;
+      }
+      if (valid_) {
+        return;
+      }
+      if (file_index_ + 1 >= files_.size()) {
+        valid_ = false;
+        return;
+      }
+      SetFileIndex(file_index_ + 1);
+      if (!status_.ok()) {
+        assert(!valid_);
+        return;
+      }
+      file_iter_->SeekToFirst();
+    }
+  }
+  Slice key() const override {
+    assert(valid_);
+    return file_iter_->key();
+  }
+  Slice value() const override {
+    assert(valid_);
+    return file_iter_->value();
+  }
+  Status status() const override {
+    if (!status_.ok()) {
+      return status_;
+    } else if (file_iter_) {
+      return file_iter_->status();
+    }
+    return Status::OK();
+  }
+  bool PrepareValue() override {
+    assert(valid_);
+    if (file_iter_->PrepareValue()) {
+      return true;
+    }
+
+    assert(!file_iter_->Valid());
+    valid_ = false;
+    return false;
+  }
+  bool IsKeyPinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_->IsKeyPinned();
+  }
+  bool IsValuePinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_->IsValuePinned();
+  }
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+    if (file_iter_) {
+      file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+  }
+
+ private:
+  const ColumnFamilyData* const cfd_;
+  const ReadOptions& read_options_;
+  const std::vector<FileMetaData*>& files_;
+
+  bool valid_;
+  uint32_t file_index_;
+  Status status_;
+  InternalIterator* file_iter_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  // Kept alive by ForwardIterator::sv_->mutable_cf_options
+  const std::shared_ptr<const SliceTransform>& prefix_extractor_;
+  const bool allow_unprepared_value_;
+};
+
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                                 ColumnFamilyData* cfd,
+                                 SuperVersion* current_sv,
+                                 bool allow_unprepared_value)
+    : db_(db),
+      read_options_(read_options),
+      cfd_(cfd),
+      prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()),
+      user_comparator_(cfd->user_comparator()),
+      allow_unprepared_value_(allow_unprepared_value),
+      immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+      sv_(current_sv),
+      mutable_iter_(nullptr),
+      current_(nullptr),
+      valid_(false),
+      status_(Status::OK()),
+      immutable_status_(Status::OK()),
+      has_iter_trimmed_for_upper_bound_(false),
+      current_over_upper_bound_(false),
+      is_prev_set_(false),
+      is_prev_inclusive_(false),
+      pinned_iters_mgr_(nullptr) {
+  if (sv_) {
+    RebuildIterators(false);
+  }
+
+  // immutable_status_ is a local aggregation of the
+  // status of the immutable Iterators.
+  // We have to PermitUncheckedError in case it is never
+  // used, otherwise it will fail ASSERT_STATUS_CHECKED.
+  immutable_status_.PermitUncheckedError();
+}
+
+ForwardIterator::~ForwardIterator() { Cleanup(true); }
+
+void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv,
+                                bool background_purge_on_iterator_cleanup) {
+  if (sv->Unref()) {
+    // Job id == 0 means that this is not our background process, but rather
+    // user thread
+    JobContext job_context(0);
+    db->mutex_.Lock();
+    sv->Cleanup();
+    db->FindObsoleteFiles(&job_context, false, true);
+    if (background_purge_on_iterator_cleanup) {
+      db->ScheduleBgLogWriterClose(&job_context);
+      db->AddSuperVersionsToFreeQueue(sv);
+      db->SchedulePurge();
+    }
+    db->mutex_.Unlock();
+    if (!background_purge_on_iterator_cleanup) {
+      delete sv;
+    }
+    if (job_context.HaveSomethingToDelete()) {
+      db->PurgeObsoleteFiles(job_context, background_purge_on_iterator_cleanup);
+    }
+    job_context.Clean();
+  }
+}
+
+namespace {
+struct SVCleanupParams {
+  DBImpl* db;
+  SuperVersion* sv;
+  bool background_purge_on_iterator_cleanup;
+};
+}  // anonymous namespace
+
+// Used in PinnedIteratorsManager to release pinned SuperVersion
+void ForwardIterator::DeferredSVCleanup(void* arg) {
+  auto d = reinterpret_cast<SVCleanupParams*>(arg);
+  ForwardIterator::SVCleanup(d->db, d->sv,
+                             d->background_purge_on_iterator_cleanup);
+  delete d;
+}
+
+void ForwardIterator::SVCleanup() {
+  if (sv_ == nullptr) {
+    return;
+  }
+  bool background_purge =
+      read_options_.background_purge_on_iterator_cleanup ||
+      db_->immutable_db_options().avoid_unnecessary_blocking_io;
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    // pinned_iters_mgr_ tells us to make sure that all visited key-value slices
+    // are alive until pinned_iters_mgr_->ReleasePinnedData() is called.
+    // The slices may point into some memtables owned by sv_, so we need to keep
+    // sv_ referenced until pinned_iters_mgr_ unpins everything.
+    auto p = new SVCleanupParams{db_, sv_, background_purge};
+    pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup);
+  } else {
+    SVCleanup(db_, sv_, background_purge);
+  }
+}
+
+void ForwardIterator::Cleanup(bool release_sv) {
+  if (mutable_iter_ != nullptr) {
+    DeleteIterator(mutable_iter_, true /* is_arena */);
+  }
+
+  for (auto* m : imm_iters_) {
+    DeleteIterator(m, true /* is_arena */);
+  }
+  imm_iters_.clear();
+
+  for (auto* f : l0_iters_) {
+    DeleteIterator(f);
+  }
+  l0_iters_.clear();
+
+  for (auto* l : level_iters_) {
+    DeleteIterator(l);
+  }
+  level_iters_.clear();
+
+  if (release_sv) {
+    SVCleanup();
+  }
+}
+
+bool ForwardIterator::Valid() const {
+  // See UpdateCurrent().
+  return valid_ ? !current_over_upper_bound_ : false;
+}
+
+void ForwardIterator::SeekToFirst() {
+  if (sv_ == nullptr) {
+    RebuildIterators(true);
+  } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    RenewIterators();
+  } else if (immutable_status_.IsIncomplete()) {
+    ResetIncompleteIterators();
+  }
+  SeekInternal(Slice(), true, false);
+}
+
+bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const {
+  return !(read_options_.iterate_upper_bound == nullptr ||
+           cfd_->internal_comparator().user_comparator()->Compare(
+               ExtractUserKey(internal_key),
+               *read_options_.iterate_upper_bound) < 0);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+  if (sv_ == nullptr) {
+    RebuildIterators(true);
+  } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    RenewIterators();
+  } else if (immutable_status_.IsIncomplete()) {
+    ResetIncompleteIterators();
+  }
+
+  SeekInternal(internal_key, false, false);
+  if (read_options_.async_io) {
+    SeekInternal(internal_key, false, true);
+  }
+}
+
+// In case of async_io, SeekInternal is called twice with seek_after_async_io
+// enabled in second call which only does seeking part to retrieve the blocks.
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+                                   bool seek_to_first,
+                                   bool seek_after_async_io) {
+  assert(mutable_iter_);
+  // mutable
+  if (!seek_after_async_io) {
+    seek_to_first ? mutable_iter_->SeekToFirst()
+                  : mutable_iter_->Seek(internal_key);
+  }
+
+  // immutable
+  // TODO(ljin): NeedToSeekImmutable has negative impact on performance
+  // if it turns to need to seek immutable often. We probably want to have
+  // an option to turn it off.
+  if (seek_to_first || seek_after_async_io ||
+      NeedToSeekImmutable(internal_key)) {
+    if (!seek_after_async_io) {
+      immutable_status_ = Status::OK();
+      if (has_iter_trimmed_for_upper_bound_ &&
+          (
+              // prev_ is not set yet
+              is_prev_set_ == false ||
+              // We are doing SeekToFirst() and internal_key.size() = 0
+              seek_to_first ||
+              // prev_key_ > internal_key
+              cfd_->internal_comparator().InternalKeyComparator::Compare(
+                  prev_key_.GetInternalKey(), internal_key) > 0)) {
+        // Some iterators are trimmed. Need to rebuild.
+        RebuildIterators(true);
+        // Already seeked mutable iter, so seek again
+        seek_to_first ? mutable_iter_->SeekToFirst()
+                      : mutable_iter_->Seek(internal_key);
+      }
+      {
+        auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+        immutable_min_heap_.swap(tmp);
+      }
+      for (size_t i = 0; i < imm_iters_.size(); i++) {
+        auto* m = imm_iters_[i];
+        seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+        if (!m->status().ok()) {
+          immutable_status_ = m->status();
+        } else if (m->Valid()) {
+          immutable_min_heap_.push(m);
+        }
+      }
+    }
+
+    Slice target_user_key;
+    if (!seek_to_first) {
+      target_user_key = ExtractUserKey(internal_key);
+    }
+    const VersionStorageInfo* vstorage = sv_->current->storage_info();
+    const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+    for (size_t i = 0; i < l0.size(); ++i) {
+      if (!l0_iters_[i]) {
+        continue;
+      }
+      if (seek_after_async_io) {
+        if (!l0_iters_[i]->status().IsTryAgain()) {
+          continue;
+        }
+      }
+
+      if (seek_to_first) {
+        l0_iters_[i]->SeekToFirst();
+      } else {
+        // If the target key passes over the largest key, we are sure Next()
+        // won't go over this file.
+        if (seek_after_async_io == false &&
+            user_comparator_->Compare(target_user_key,
+                                      l0[i]->largest.user_key()) > 0) {
+          if (read_options_.iterate_upper_bound != nullptr) {
+            has_iter_trimmed_for_upper_bound_ = true;
+            DeleteIterator(l0_iters_[i]);
+            l0_iters_[i] = nullptr;
+          }
+          continue;
+        }
+        l0_iters_[i]->Seek(internal_key);
+      }
+
+      if (l0_iters_[i]->status().IsTryAgain()) {
+        assert(!seek_after_async_io);
+        continue;
+      } else if (!l0_iters_[i]->status().ok()) {
+        immutable_status_ = l0_iters_[i]->status();
+      } else if (l0_iters_[i]->Valid() &&
+                 !IsOverUpperBound(l0_iters_[i]->key())) {
+        immutable_min_heap_.push(l0_iters_[i]);
+      } else {
+        has_iter_trimmed_for_upper_bound_ = true;
+        DeleteIterator(l0_iters_[i]);
+        l0_iters_[i] = nullptr;
+      }
+    }
+
+    for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+      const std::vector<FileMetaData*>& level_files =
+          vstorage->LevelFiles(level);
+      if (level_files.empty()) {
+        continue;
+      }
+      if (level_iters_[level - 1] == nullptr) {
+        continue;
+      }
+
+      if (seek_after_async_io) {
+        if (!level_iters_[level - 1]->status().IsTryAgain()) {
+          continue;
+        }
+      }
+      uint32_t f_idx = 0;
+      if (!seek_to_first && !seek_after_async_io) {
+        f_idx = FindFileInRange(level_files, internal_key, 0,
+                                static_cast<uint32_t>(level_files.size()));
+      }
+
+      // Seek
+      if (seek_after_async_io || f_idx < level_files.size()) {
+        if (!seek_after_async_io) {
+          level_iters_[level - 1]->SetFileIndex(f_idx);
+        }
+        seek_to_first ? level_iters_[level - 1]->SeekToFirst()
+                      : level_iters_[level - 1]->Seek(internal_key);
+
+        if (level_iters_[level - 1]->status().IsTryAgain()) {
+          assert(!seek_after_async_io);
+          continue;
+        } else if (!level_iters_[level - 1]->status().ok()) {
+          immutable_status_ = level_iters_[level - 1]->status();
+        } else if (level_iters_[level - 1]->Valid() &&
+                   !IsOverUpperBound(level_iters_[level - 1]->key())) {
+          immutable_min_heap_.push(level_iters_[level - 1]);
+        } else {
+          // Nothing in this level is interesting. Remove.
+          has_iter_trimmed_for_upper_bound_ = true;
+          DeleteIterator(level_iters_[level - 1]);
+          level_iters_[level - 1] = nullptr;
+        }
+      }
+    }
+
+    if (seek_to_first) {
+      is_prev_set_ = false;
+    } else {
+      prev_key_.SetInternalKey(internal_key);
+      is_prev_set_ = true;
+      is_prev_inclusive_ = true;
+    }
+
+    TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Immutable", this);
+  } else if (current_ && current_ != mutable_iter_) {
+    // current_ is one of immutable iterators, push it back to the heap
+    immutable_min_heap_.push(current_);
+  }
+
+  // For async_io, it should be updated when seek_after_async_io is true (in
+  // second call).
+  if (seek_to_first || !read_options_.async_io || seek_after_async_io) {
+    UpdateCurrent();
+  }
+  TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Return", this);
+}
+
+void ForwardIterator::Next() {
+  assert(valid_);
+  bool update_prev_key = false;
+
+  if (sv_ == nullptr || sv_->version_number != cfd_->GetSuperVersionNumber()) {
+    std::string current_key = key().ToString();
+    Slice old_key(current_key.data(), current_key.size());
+
+    if (sv_ == nullptr) {
+      RebuildIterators(true);
+    } else {
+      RenewIterators();
+    }
+
+    SeekInternal(old_key, false, false);
+    if (read_options_.async_io) {
+      SeekInternal(old_key, false, true);
+    }
+
+    if (!valid_ || key().compare(old_key) != 0) {
+      return;
+    }
+  } else if (current_ != mutable_iter_) {
+    // It is going to advance immutable iterator
+
+    if (is_prev_set_ && prefix_extractor_) {
+      // advance prev_key_ to current_ only if they share the same prefix
+      update_prev_key =
+          prefix_extractor_->Transform(prev_key_.GetUserKey())
+              .compare(prefix_extractor_->Transform(current_->key())) == 0;
+    } else {
+      update_prev_key = true;
+    }
+
+    if (update_prev_key) {
+      prev_key_.SetInternalKey(current_->key());
+      is_prev_set_ = true;
+      is_prev_inclusive_ = false;
+    }
+  }
+
+  current_->Next();
+  if (current_ != mutable_iter_) {
+    if (!current_->status().ok()) {
+      immutable_status_ = current_->status();
+    } else if ((current_->Valid()) && (!IsOverUpperBound(current_->key()))) {
+      immutable_min_heap_.push(current_);
+    } else {
+      if ((current_->Valid()) && (IsOverUpperBound(current_->key()))) {
+        // remove the current iterator
+        DeleteCurrentIter();
+        current_ = nullptr;
+      }
+      if (update_prev_key) {
+        mutable_iter_->Seek(prev_key_.GetInternalKey());
+      }
+    }
+  }
+  UpdateCurrent();
+  TEST_SYNC_POINT_CALLBACK("ForwardIterator::Next:Return", this);
+}
+
+Slice ForwardIterator::key() const {
+  assert(valid_);
+  return current_->key();
+}
+
+Slice ForwardIterator::value() const {
+  assert(valid_);
+  return current_->value();
+}
+
+Status ForwardIterator::status() const {
+  if (!status_.ok()) {
+    return status_;
+  } else if (!mutable_iter_->status().ok()) {
+    return mutable_iter_->status();
+  }
+
+  return immutable_status_;
+}
+
+bool ForwardIterator::PrepareValue() {
+  assert(valid_);
+  if (current_->PrepareValue()) {
+    return true;
+  }
+
+  assert(!current_->Valid());
+  assert(!current_->status().ok());
+  assert(current_ != mutable_iter_);  // memtable iterator can't fail
+  assert(immutable_status_.ok());
+
+  valid_ = false;
+  immutable_status_ = current_->status();
+  return false;
+}
+
+Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) {
+  assert(prop != nullptr);
+  if (prop_name == "rocksdb.iterator.super-version-number") {
+    *prop = std::to_string(sv_->version_number);
+    return Status::OK();
+  }
+  return Status::InvalidArgument();
+}
+
+void ForwardIterator::SetPinnedItersMgr(
+    PinnedIteratorsManager* pinned_iters_mgr) {
+  pinned_iters_mgr_ = pinned_iters_mgr;
+  UpdateChildrenPinnedItersMgr();
+}
+
+void ForwardIterator::UpdateChildrenPinnedItersMgr() {
+  // Set PinnedIteratorsManager for mutable memtable iterator.
+  if (mutable_iter_) {
+    mutable_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+  }
+
+  // Set PinnedIteratorsManager for immutable memtable iterators.
+  for (InternalIterator* child_iter : imm_iters_) {
+    if (child_iter) {
+      child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+  }
+
+  // Set PinnedIteratorsManager for L0 files iterators.
+  for (InternalIterator* child_iter : l0_iters_) {
+    if (child_iter) {
+      child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+  }
+
+  // Set PinnedIteratorsManager for L1+ levels iterators.
+  for (ForwardLevelIterator* child_iter : level_iters_) {
+    if (child_iter) {
+      child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+  }
+}
+
+bool ForwardIterator::IsKeyPinned() const {
+  return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+         current_->IsKeyPinned();
+}
+
+bool ForwardIterator::IsValuePinned() const {
+  return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+         current_->IsValuePinned();
+}
+
+void ForwardIterator::RebuildIterators(bool refresh_sv) {
+  // Clean up
+  Cleanup(refresh_sv);
+  if (refresh_sv) {
+    // New
+    sv_ = cfd_->GetReferencedSuperVersion(db_);
+  }
+  ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+                                       kMaxSequenceNumber /* upper_bound */);
+  mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
+  sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+  if (!read_options_.ignore_range_deletions) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+        sv_->mem->NewRangeTombstoneIterator(
+            read_options_, sv_->current->version_set()->LastSequence(),
+            false /* immutable_memtable */));
+    range_del_agg.AddTombstones(std::move(range_del_iter));
+    // Always return Status::OK().
+    Status temp_s = sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_,
+                                                         &range_del_agg);
+    assert(temp_s.ok());
+  }
+  has_iter_trimmed_for_upper_bound_ = false;
+
+  const auto* vstorage = sv_->current->storage_info();
+  const auto& l0_files = vstorage->LevelFiles(0);
+  l0_iters_.reserve(l0_files.size());
+  for (const auto* l0 : l0_files) {
+    if ((read_options_.iterate_upper_bound != nullptr) &&
+        cfd_->internal_comparator().user_comparator()->Compare(
+            l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) {
+      // No need to set has_iter_trimmed_for_upper_bound_: this ForwardIterator
+      // will never be interested in files with smallest key above
+      // iterate_upper_bound, since iterate_upper_bound can't be changed.
+      l0_iters_.push_back(nullptr);
+      continue;
+    }
+    l0_iters_.push_back(cfd_->table_cache()->NewIterator(
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0,
+        read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+        sv_->mutable_cf_options.prefix_extractor,
+        /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+        TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+        /*skip_filters=*/false, /*level=*/-1,
+        MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
+        /*smallest_compaction_key=*/nullptr,
+        /*largest_compaction_key=*/nullptr, allow_unprepared_value_));
+  }
+  BuildLevelIterators(vstorage, sv_);
+  current_ = nullptr;
+  is_prev_set_ = false;
+
+  UpdateChildrenPinnedItersMgr();
+  if (!range_del_agg.IsEmpty()) {
+    status_ = Status::NotSupported(
+        "Range tombstones unsupported with ForwardIterator");
+    valid_ = false;
+  }
+}
+
+void ForwardIterator::RenewIterators() {
+  SuperVersion* svnew;
+  assert(sv_);
+  svnew = cfd_->GetReferencedSuperVersion(db_);
+
+  if (mutable_iter_ != nullptr) {
+    DeleteIterator(mutable_iter_, true /* is_arena */);
+  }
+  for (auto* m : imm_iters_) {
+    DeleteIterator(m, true /* is_arena */);
+  }
+  imm_iters_.clear();
+
+  mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_);
+  svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+  ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+                                       kMaxSequenceNumber /* upper_bound */);
+  if (!read_options_.ignore_range_deletions) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+        svnew->mem->NewRangeTombstoneIterator(
+            read_options_, sv_->current->version_set()->LastSequence(),
+            false /* immutable_memtable */));
+    range_del_agg.AddTombstones(std::move(range_del_iter));
+    // Always return Status::OK().
+    Status temp_s = svnew->imm->AddRangeTombstoneIterators(
+        read_options_, &arena_, &range_del_agg);
+    assert(temp_s.ok());
+  }
+
+  const auto* vstorage = sv_->current->storage_info();
+  const auto& l0_files = vstorage->LevelFiles(0);
+  const auto* vstorage_new = svnew->current->storage_info();
+  const auto& l0_files_new = vstorage_new->LevelFiles(0);
+  size_t iold, inew;
+  bool found;
+  std::vector<InternalIterator*> l0_iters_new;
+  l0_iters_new.reserve(l0_files_new.size());
+
+  for (inew = 0; inew < l0_files_new.size(); inew++) {
+    found = false;
+    for (iold = 0; iold < l0_files.size(); iold++) {
+      if (l0_files[iold] == l0_files_new[inew]) {
+        found = true;
+        break;
+      }
+    }
+    if (found) {
+      if (l0_iters_[iold] == nullptr) {
+        l0_iters_new.push_back(nullptr);
+        TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Null", this);
+      } else {
+        l0_iters_new.push_back(l0_iters_[iold]);
+        l0_iters_[iold] = nullptr;
+        TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Copy", this);
+      }
+      continue;
+    }
+    l0_iters_new.push_back(cfd_->table_cache()->NewIterator(
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+        *l0_files_new[inew],
+        read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+        svnew->mutable_cf_options.prefix_extractor,
+        /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+        TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+        /*skip_filters=*/false, /*level=*/-1,
+        MaxFileSizeForL0MetaPin(svnew->mutable_cf_options),
+        /*smallest_compaction_key=*/nullptr,
+        /*largest_compaction_key=*/nullptr, allow_unprepared_value_));
+  }
+
+  for (auto* f : l0_iters_) {
+    DeleteIterator(f);
+  }
+  l0_iters_.clear();
+  l0_iters_ = l0_iters_new;
+
+  for (auto* l : level_iters_) {
+    DeleteIterator(l);
+  }
+  level_iters_.clear();
+  BuildLevelIterators(vstorage_new, svnew);
+  current_ = nullptr;
+  is_prev_set_ = false;
+  SVCleanup();
+  sv_ = svnew;
+
+  UpdateChildrenPinnedItersMgr();
+  if (!range_del_agg.IsEmpty()) {
+    status_ = Status::NotSupported(
+        "Range tombstones unsupported with ForwardIterator");
+    valid_ = false;
+  }
+}
+
+void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage,
+                                          SuperVersion* sv) {
+  level_iters_.reserve(vstorage->num_levels() - 1);
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    const auto& level_files = vstorage->LevelFiles(level);
+    if ((level_files.empty()) ||
+        ((read_options_.iterate_upper_bound != nullptr) &&
+         (user_comparator_->Compare(*read_options_.iterate_upper_bound,
+                                    level_files[0]->smallest.user_key()) <
+          0))) {
+      level_iters_.push_back(nullptr);
+      if (!level_files.empty()) {
+        has_iter_trimmed_for_upper_bound_ = true;
+      }
+    } else {
+      level_iters_.push_back(new ForwardLevelIterator(
+          cfd_, read_options_, level_files,
+          sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_));
+    }
+  }
+}
+
+void ForwardIterator::ResetIncompleteIterators() {
+  const auto& l0_files = sv_->current->storage_info()->LevelFiles(0);
+  for (size_t i = 0; i < l0_iters_.size(); ++i) {
+    assert(i < l0_files.size());
+    if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) {
+      continue;
+    }
+    DeleteIterator(l0_iters_[i]);
+    l0_iters_[i] = cfd_->table_cache()->NewIterator(
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+        *l0_files[i], /*range_del_agg=*/nullptr,
+        sv_->mutable_cf_options.prefix_extractor,
+        /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+        TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+        /*skip_filters=*/false, /*level=*/-1,
+        MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
+        /*smallest_compaction_key=*/nullptr,
+        /*largest_compaction_key=*/nullptr, allow_unprepared_value_);
+    l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
+  }
+
+  for (auto* level_iter : level_iters_) {
+    if (level_iter && level_iter->status().IsIncomplete()) {
+      level_iter->Reset();
+    }
+  }
+
+  current_ = nullptr;
+  is_prev_set_ = false;
+}
+
+void ForwardIterator::UpdateCurrent() {
+  if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
+    current_ = nullptr;
+  } else if (immutable_min_heap_.empty()) {
+    current_ = mutable_iter_;
+  } else if (!mutable_iter_->Valid()) {
+    current_ = immutable_min_heap_.top();
+    immutable_min_heap_.pop();
+  } else {
+    current_ = immutable_min_heap_.top();
+    assert(current_ != nullptr);
+    assert(current_->Valid());
+    int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
+        mutable_iter_->key(), current_->key());
+    assert(cmp != 0);
+    if (cmp > 0) {
+      immutable_min_heap_.pop();
+    } else {
+      current_ = mutable_iter_;
+    }
+  }
+  valid_ = current_ != nullptr && immutable_status_.ok();
+  if (!status_.ok()) {
+    status_ = Status::OK();
+  }
+
+  // Upper bound doesn't apply to the memtable iterator. We want Valid() to
+  // return false when all iterators are over iterate_upper_bound, but can't
+  // just set valid_ to false, as that would effectively disable the tailing
+  // optimization (Seek() would be called on all immutable iterators regardless
+  // of whether the target key is greater than prev_key_).
+  current_over_upper_bound_ = valid_ && IsOverUpperBound(current_->key());
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+  // We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
+  // such that there are no records with keys within that range in
+  // immutable_min_heap_. Since immutable structures (SST files and immutable
+  // memtables) can't change in this version, we don't need to do a seek if
+  // 'target' belongs to that interval (immutable_min_heap_.top() is already
+  // at the correct position).
+
+  if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
+    return true;
+  }
+  Slice prev_key = prev_key_.GetInternalKey();
+  if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+                               prefix_extractor_->Transform(prev_key)) != 0) {
+    return true;
+  }
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+          prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
+    return true;
+  }
+
+  if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
+    // Nothing to seek on.
+    return false;
+  }
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+          target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+                                            : current_->key()) > 0) {
+    return true;
+  }
+  return false;
+}
+
+void ForwardIterator::DeleteCurrentIter() {
+  const VersionStorageInfo* vstorage = sv_->current->storage_info();
+  const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+  for (size_t i = 0; i < l0.size(); ++i) {
+    if (!l0_iters_[i]) {
+      continue;
+    }
+    if (l0_iters_[i] == current_) {
+      has_iter_trimmed_for_upper_bound_ = true;
+      DeleteIterator(l0_iters_[i]);
+      l0_iters_[i] = nullptr;
+      return;
+    }
+  }
+
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    if (level_iters_[level - 1] == nullptr) {
+      continue;
+    }
+    if (level_iters_[level - 1] == current_) {
+      has_iter_trimmed_for_upper_bound_ = true;
+      DeleteIterator(level_iters_[level - 1]);
+      level_iters_[level - 1] = nullptr;
+    }
+  }
+}
+
+bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters,
+                                             int* pnum_iters) {
+  bool retval = false;
+  int deleted_iters = 0;
+  int num_iters = 0;
+
+  const VersionStorageInfo* vstorage = sv_->current->storage_info();
+  const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+  for (size_t i = 0; i < l0.size(); ++i) {
+    if (!l0_iters_[i]) {
+      retval = true;
+      deleted_iters++;
+    } else {
+      num_iters++;
+    }
+  }
+
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    if ((level_iters_[level - 1] == nullptr) &&
+        (!vstorage->LevelFiles(level).empty())) {
+      retval = true;
+      deleted_iters++;
+    } else if (!vstorage->LevelFiles(level).empty()) {
+      num_iters++;
+    }
+  }
+  if ((!retval) && num_iters <= 1) {
+    retval = true;
+  }
+  if (pdeleted_iters) {
+    *pdeleted_iters = deleted_iters;
+  }
+  if (pnum_iters) {
+    *pnum_iters = num_iters;
+  }
+  return retval;
+}
+
+uint32_t ForwardIterator::FindFileInRange(
+    const std::vector<FileMetaData*>& files, const Slice& internal_key,
+    uint32_t left, uint32_t right) {
+  auto cmp = [&](const FileMetaData* f, const Slice& k) -> bool {
+    return cfd_->internal_comparator().InternalKeyComparator::Compare(
+               f->largest.Encode(), k) < 0;
+  };
+  const auto& b = files.begin();
+  return static_cast<uint32_t>(
+      std::lower_bound(b + left, b + right, internal_key, cmp) - b);
+}
+
+void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) {
+  if (iter == nullptr) {
+    return;
+  }
+
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    pinned_iters_mgr_->PinIterator(iter, is_arena);
+  } else {
+    if (is_arena) {
+      iter->~InternalIterator();
+    } else {
+      delete iter;
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator.h b/src/rocksdb/db/forward_iterator.h
new file mode 100644
index 000000000..5a5c6f0f3
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.h
@@ -0,0 +1,168 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/comparator.h"
+#ifndef ROCKSDB_LITE
+
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class ForwardLevelIterator;
+class VersionStorageInfo;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+  explicit MinIterComparator(const CompareInterface* comparator)
+      : comparator_(comparator) {}
+
+  bool operator()(InternalIterator* a, InternalIterator* b) {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+
+ private:
+  const CompareInterface* comparator_;
+};
+
+using MinIterHeap =
+    std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
+                        MinIterComparator>;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. At the current implementation, snapshot is taken at the
+ * time Seek() is called. The Next() followed do not see new values after.
+ */
+class ForwardIterator : public InternalIterator {
+ public:
+  ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+                  ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr,
+                  bool allow_unprepared_value = false);
+  virtual ~ForwardIterator();
+
+  void SeekForPrev(const Slice& /*target*/) override {
+    status_ = Status::NotSupported("ForwardIterator::SeekForPrev()");
+    valid_ = false;
+  }
+  void SeekToLast() override {
+    status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+    valid_ = false;
+  }
+  void Prev() override {
+    status_ = Status::NotSupported("ForwardIterator::Prev");
+    valid_ = false;
+  }
+
+  virtual bool Valid() const override;
+  void SeekToFirst() override;
+  virtual void Seek(const Slice& target) override;
+  virtual void Next() override;
+  virtual Slice key() const override;
+  virtual Slice value() const override;
+  virtual Status status() const override;
+  virtual bool PrepareValue() override;
+  virtual Status GetProperty(std::string prop_name, std::string* prop) override;
+  virtual void SetPinnedItersMgr(
+      PinnedIteratorsManager* pinned_iters_mgr) override;
+  virtual bool IsKeyPinned() const override;
+  virtual bool IsValuePinned() const override;
+
+  bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters);
+
+ private:
+  void Cleanup(bool release_sv);
+  // Unreference and, if needed, clean up the current SuperVersion. This is
+  // either done immediately or deferred until this iterator is unpinned by
+  // PinnedIteratorsManager.
+  void SVCleanup();
+  static void SVCleanup(DBImpl* db, SuperVersion* sv,
+                        bool background_purge_on_iterator_cleanup);
+  static void DeferredSVCleanup(void* arg);
+
+  void RebuildIterators(bool refresh_sv);
+  void RenewIterators();
+  void BuildLevelIterators(const VersionStorageInfo* vstorage,
+                           SuperVersion* sv);
+  void ResetIncompleteIterators();
+  void SeekInternal(const Slice& internal_key, bool seek_to_first,
+                    bool seek_after_async_io);
+
+  void UpdateCurrent();
+  bool NeedToSeekImmutable(const Slice& internal_key);
+  void DeleteCurrentIter();
+  uint32_t FindFileInRange(const std::vector<FileMetaData*>& files,
+                           const Slice& internal_key, uint32_t left,
+                           uint32_t right);
+
+  bool IsOverUpperBound(const Slice& internal_key) const;
+
+  // Set PinnedIteratorsManager for all children Iterators, this function should
+  // be called whenever we update children Iterators or pinned_iters_mgr_.
+  void UpdateChildrenPinnedItersMgr();
+
+  // A helper function that will release iter in the proper manner, or pass it
+  // to pinned_iters_mgr_ to release it later if pinning is enabled.
+  void DeleteIterator(InternalIterator* iter, bool is_arena = false);
+
+  DBImpl* const db_;
+  const ReadOptions read_options_;
+  ColumnFamilyData* const cfd_;
+  const SliceTransform* const prefix_extractor_;
+  const Comparator* user_comparator_;
+  const bool allow_unprepared_value_;
+  MinIterHeap immutable_min_heap_;
+
+  SuperVersion* sv_;
+  InternalIterator* mutable_iter_;
+  std::vector<InternalIterator*> imm_iters_;
+  std::vector<InternalIterator*> l0_iters_;
+  std::vector<ForwardLevelIterator*> level_iters_;
+  InternalIterator* current_;
+  bool valid_;
+
+  // Internal iterator status; set only by one of the unsupported methods.
+  Status status_;
+  // Status of immutable iterators, maintained here to avoid iterating over
+  // all of them in status().
+  Status immutable_status_;
+  // Indicates that at least one of the immutable iterators pointed to a key
+  // larger than iterate_upper_bound and was therefore destroyed. Seek() may
+  // need to rebuild such iterators.
+  bool has_iter_trimmed_for_upper_bound_;
+  // Is current key larger than iterate_upper_bound? If so, makes Valid()
+  // return false.
+  bool current_over_upper_bound_;
+
+  // Left endpoint of the range of keys that immutable iterators currently
+  // cover. When Seek() is called with a key that's within that range, immutable
+  // iterators don't need to be moved; see NeedToSeekImmutable(). This key is
+  // included in the range after a Seek(), but excluded when advancing the
+  // iterator using Next().
+  IterKey prev_key_;
+  bool is_prev_set_;
+  bool is_prev_inclusive_;
+
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  Arena arena_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator_bench.cc b/src/rocksdb/db/forward_iterator_bench.cc
new file mode 100644
index 000000000..325661cef
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator_bench.cc
@@ -0,0 +1,378 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#elif defined(OS_MACOSX) || defined(OS_WIN)
+// Block forward_iterator_bench under MAC and Windows
+int main() { return 0; }
+#else
+#include <semaphore.h>
+
+#include <atomic>
+#include <bitset>
+#include <chrono>
+#include <climits>
+#include <condition_variable>
+#include <limits>
+#include <mutex>
+#include <queue>
+#include <random>
+#include <thread>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/gflags_compat.h"
+
+const int MAX_SHARDS = 100000;
+
+DEFINE_int32(writers, 8, "");
+DEFINE_int32(readers, 8, "");
+DEFINE_int64(rate, 100000, "");
+DEFINE_int64(value_size, 300, "");
+DEFINE_int64(shards, 1000, "");
+DEFINE_int64(memtable_size, 500000000, "");
+DEFINE_int64(block_cache_size, 300000000, "");
+DEFINE_int64(block_size, 65536, "");
+DEFINE_double(runtime, 300.0, "");
+DEFINE_bool(cache_only_first, true, "");
+DEFINE_bool(iterate_upper_bound, true, "");
+
+struct Stats {
+  char pad1[128] __attribute__((__unused__));
+  std::atomic<uint64_t> written{0};
+  char pad2[128] __attribute__((__unused__));
+  std::atomic<uint64_t> read{0};
+  std::atomic<uint64_t> cache_misses{0};
+  char pad3[128] __attribute__((__unused__));
+} stats;
+
+struct Key {
+  Key() {}
+  Key(uint64_t shard_in, uint64_t seqno_in)
+      : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {}
+
+  uint64_t shard() const { return be64toh(shard_be); }
+  uint64_t seqno() const { return be64toh(seqno_be); }
+
+ private:
+  uint64_t shard_be;
+  uint64_t seqno_be;
+} __attribute__((__packed__));
+
+struct Reader;
+struct Writer;
+
+struct ShardState {
+  char pad1[128] __attribute__((__unused__));
+  std::atomic<uint64_t> last_written{0};
+  Writer* writer;
+  Reader* reader;
+  char pad2[128] __attribute__((__unused__));
+  std::atomic<uint64_t> last_read{0};
+  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it;
+  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it_cacheonly;
+  Key upper_bound;
+  ROCKSDB_NAMESPACE::Slice upper_bound_slice;
+  char pad3[128] __attribute__((__unused__));
+};
+
+struct Reader {
+ public:
+  explicit Reader(std::vector<ShardState>* shard_states,
+                  ROCKSDB_NAMESPACE::DB* db)
+      : shard_states_(shard_states), db_(db) {
+    sem_init(&sem_, 0, 0);
+    thread_ = port::Thread(&Reader::run, this);
+  }
+
+  void run() {
+    while (1) {
+      sem_wait(&sem_);
+      if (done_.load()) {
+        break;
+      }
+
+      uint64_t shard;
+      {
+        std::lock_guard<std::mutex> guard(queue_mutex_);
+        assert(!shards_pending_queue_.empty());
+        shard = shards_pending_queue_.front();
+        shards_pending_queue_.pop();
+        shards_pending_set_.reset(shard);
+      }
+      readOnceFromShard(shard);
+    }
+  }
+
+  void readOnceFromShard(uint64_t shard) {
+    ShardState& state = (*shard_states_)[shard];
+    if (!state.it) {
+      // Initialize iterators
+      ROCKSDB_NAMESPACE::ReadOptions options;
+      options.tailing = true;
+      if (FLAGS_iterate_upper_bound) {
+        state.upper_bound = Key(shard, std::numeric_limits<uint64_t>::max());
+        state.upper_bound_slice = ROCKSDB_NAMESPACE::Slice(
+            (const char*)&state.upper_bound, sizeof(state.upper_bound));
+        options.iterate_upper_bound = &state.upper_bound_slice;
+      }
+
+      state.it.reset(db_->NewIterator(options));
+
+      if (FLAGS_cache_only_first) {
+        options.read_tier = ROCKSDB_NAMESPACE::ReadTier::kBlockCacheTier;
+        state.it_cacheonly.reset(db_->NewIterator(options));
+      }
+    }
+
+    const uint64_t upto = state.last_written.load();
+    for (ROCKSDB_NAMESPACE::Iterator* it :
+         {state.it_cacheonly.get(), state.it.get()}) {
+      if (it == nullptr) {
+        continue;
+      }
+      if (state.last_read.load() >= upto) {
+        break;
+      }
+      bool need_seek = true;
+      for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) {
+        if (need_seek) {
+          Key from(shard, state.last_read.load() + 1);
+          it->Seek(ROCKSDB_NAMESPACE::Slice((const char*)&from, sizeof(from)));
+          need_seek = false;
+        } else {
+          it->Next();
+        }
+        if (it->status().IsIncomplete()) {
+          ++::stats.cache_misses;
+          break;
+        }
+        assert(it->Valid());
+        assert(it->key().size() == sizeof(Key));
+        Key key;
+        memcpy(&key, it->key().data(), it->key().size());
+        // fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n",
+        //         shard, seq, key.shard(), key.seqno());
+        assert(key.shard() == shard);
+        assert(key.seqno() == seq);
+        state.last_read.store(seq);
+        ++::stats.read;
+      }
+    }
+  }
+
+  void onWrite(uint64_t shard) {
+    {
+      std::lock_guard<std::mutex> guard(queue_mutex_);
+      if (!shards_pending_set_.test(shard)) {
+        shards_pending_queue_.push(shard);
+        shards_pending_set_.set(shard);
+        sem_post(&sem_);
+      }
+    }
+  }
+
+  ~Reader() {
+    done_.store(true);
+    sem_post(&sem_);
+    thread_.join();
+  }
+
+ private:
+  char pad1[128] __attribute__((__unused__));
+  std::vector<ShardState>* shard_states_;
+  ROCKSDB_NAMESPACE::DB* db_;
+  ROCKSDB_NAMESPACE::port::Thread thread_;
+  sem_t sem_;
+  std::mutex queue_mutex_;
+  std::bitset<MAX_SHARDS + 1> shards_pending_set_;
+  std::queue<uint64_t> shards_pending_queue_;
+  std::atomic<bool> done_{false};
+  char pad2[128] __attribute__((__unused__));
+};
+
+struct Writer {
+  explicit Writer(std::vector<ShardState>* shard_states,
+                  ROCKSDB_NAMESPACE::DB* db)
+      : shard_states_(shard_states), db_(db) {}
+
+  void start() { thread_ = port::Thread(&Writer::run, this); }
+
+  void run() {
+    std::queue<std::chrono::steady_clock::time_point> workq;
+    std::chrono::steady_clock::time_point deadline(
+        std::chrono::steady_clock::now() +
+        std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime)));
+    std::vector<uint64_t> my_shards;
+    for (int i = 1; i <= FLAGS_shards; ++i) {
+      if ((*shard_states_)[i].writer == this) {
+        my_shards.push_back(i);
+      }
+    }
+
+    std::mt19937 rng{std::random_device()()};
+    std::uniform_int_distribution<int> shard_dist(
+        0, static_cast<int>(my_shards.size()) - 1);
+    std::string value(FLAGS_value_size, '*');
+
+    while (1) {
+      auto now = std::chrono::steady_clock::now();
+      if (FLAGS_runtime >= 0 && now >= deadline) {
+        break;
+      }
+      if (workq.empty()) {
+        for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) {
+          std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate);
+          workq.push(now + offset);
+        }
+      }
+      while (!workq.empty() && workq.front() < now) {
+        workq.pop();
+        uint64_t shard = my_shards[shard_dist(rng)];
+        ShardState& state = (*shard_states_)[shard];
+        uint64_t seqno = state.last_written.load() + 1;
+        Key key(shard, seqno);
+        // fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno);
+        ROCKSDB_NAMESPACE::Status status =
+            db_->Put(ROCKSDB_NAMESPACE::WriteOptions(),
+                     ROCKSDB_NAMESPACE::Slice((const char*)&key, sizeof(key)),
+                     ROCKSDB_NAMESPACE::Slice(value));
+        assert(status.ok());
+        state.last_written.store(seqno);
+        state.reader->onWrite(shard);
+        ++::stats.written;
+      }
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+    // fprintf(stderr, "Writer done\n");
+  }
+
+  ~Writer() { thread_.join(); }
+
+ private:
+  char pad1[128] __attribute__((__unused__));
+  std::vector<ShardState>* shard_states_;
+  ROCKSDB_NAMESPACE::DB* db_;
+  ROCKSDB_NAMESPACE::port::Thread thread_;
+  char pad2[128] __attribute__((__unused__));
+};
+
+struct StatsThread {
+  explicit StatsThread(ROCKSDB_NAMESPACE::DB* db)
+      : db_(db), thread_(&StatsThread::run, this) {}
+
+  void run() {
+    auto tstart = std::chrono::steady_clock::now(), tlast = tstart;
+    uint64_t wlast = 0, rlast = 0;
+    while (!done_.load()) {
+      {
+        std::unique_lock<std::mutex> lock(cvm_);
+        cv_.wait_for(lock, std::chrono::seconds(1));
+      }
+      auto now = std::chrono::steady_clock::now();
+      double elapsed =
+          std::chrono::duration_cast<std::chrono::duration<double> >(now -
+                                                                     tlast)
+              .count();
+      uint64_t w = ::stats.written.load();
+      uint64_t r = ::stats.read.load();
+      fprintf(stderr,
+              "%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | "
+              "r/s %10.0f | cache misses %10ld\n",
+              db_->GetEnv()->TimeToString(time(nullptr)).c_str(),
+              std::chrono::duration_cast<std::chrono::seconds>(now - tstart)
+                  .count(),
+              w, (w - wlast) / elapsed, r, (r - rlast) / elapsed,
+              ::stats.cache_misses.load());
+      wlast = w;
+      rlast = r;
+      tlast = now;
+    }
+  }
+
+  ~StatsThread() {
+    {
+      std::lock_guard<std::mutex> guard(cvm_);
+      done_.store(true);
+    }
+    cv_.notify_all();
+    thread_.join();
+  }
+
+ private:
+  ROCKSDB_NAMESPACE::DB* db_;
+  std::mutex cvm_;
+  std::condition_variable cv_;
+  ROCKSDB_NAMESPACE::port::Thread thread_;
+  std::atomic<bool> done_{false};
+};
+
+int main(int argc, char** argv) {
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+  std::mt19937 rng{std::random_device()()};
+  ROCKSDB_NAMESPACE::Status status;
+  std::string path =
+      ROCKSDB_NAMESPACE::test::PerThreadDBPath("forward_iterator_test");
+  fprintf(stderr, "db path is %s\n", path.c_str());
+  ROCKSDB_NAMESPACE::Options options;
+  options.create_if_missing = true;
+  options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
+  options.compaction_style =
+      ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone;
+  options.level0_slowdown_writes_trigger = 99999;
+  options.level0_stop_writes_trigger = 99999;
+  options.use_direct_io_for_flush_and_compaction = true;
+  options.write_buffer_size = FLAGS_memtable_size;
+  ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options;
+  table_options.block_cache =
+      ROCKSDB_NAMESPACE::NewLRUCache(FLAGS_block_cache_size);
+  table_options.block_size = FLAGS_block_size;
+  options.table_factory.reset(
+      ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options));
+
+  status = ROCKSDB_NAMESPACE::DestroyDB(path, options);
+  assert(status.ok());
+  ROCKSDB_NAMESPACE::DB* db_raw;
+  status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw);
+  assert(status.ok());
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(db_raw);
+
+  std::vector<ShardState> shard_states(FLAGS_shards + 1);
+  std::deque<Reader> readers;
+  while (static_cast<int>(readers.size()) < FLAGS_readers) {
+    readers.emplace_back(&shard_states, db_raw);
+  }
+  std::deque<Writer> writers;
+  while (static_cast<int>(writers.size()) < FLAGS_writers) {
+    writers.emplace_back(&shard_states, db_raw);
+  }
+
+  // Each shard gets a random reader and random writer assigned to it
+  for (int i = 1; i <= FLAGS_shards; ++i) {
+    std::uniform_int_distribution<int> reader_dist(0, FLAGS_readers - 1);
+    std::uniform_int_distribution<int> writer_dist(0, FLAGS_writers - 1);
+    shard_states[i].reader = &readers[reader_dist(rng)];
+    shard_states[i].writer = &writers[writer_dist(rng)];
+  }
+
+  StatsThread stats_thread(db_raw);
+  for (Writer& w : writers) {
+    w.start();
+  }
+
+  writers.clear();
+  readers.clear();
+}
+#endif  // !defined(GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/db/history_trimming_iterator.h b/src/rocksdb/db/history_trimming_iterator.h
new file mode 100644
index 000000000..b445ced33
--- /dev/null
+++ b/src/rocksdb/db/history_trimming_iterator.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class HistoryTrimmingIterator : public InternalIterator {
+ public:
+  explicit HistoryTrimmingIterator(InternalIterator* input,
+                                   const Comparator* cmp, const std::string& ts)
+      : input_(input), filter_ts_(ts), cmp_(cmp) {
+    assert(cmp_->timestamp_size() > 0 && !ts.empty());
+  }
+
+  bool filter() const {
+    if (!input_->Valid()) {
+      return true;
+    }
+    Slice current_ts = ExtractTimestampFromKey(key(), cmp_->timestamp_size());
+    return cmp_->CompareTimestamp(current_ts, Slice(filter_ts_)) <= 0;
+  }
+
+  bool Valid() const override { return input_->Valid(); }
+
+  void SeekToFirst() override {
+    input_->SeekToFirst();
+    while (!filter()) {
+      input_->Next();
+    }
+  }
+
+  void SeekToLast() override {
+    input_->SeekToLast();
+    while (!filter()) {
+      input_->Prev();
+    }
+  }
+
+  void Seek(const Slice& target) override {
+    input_->Seek(target);
+    while (!filter()) {
+      input_->Next();
+    }
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    input_->SeekForPrev(target);
+    while (!filter()) {
+      input_->Prev();
+    }
+  }
+
+  void Next() override {
+    do {
+      input_->Next();
+    } while (!filter());
+  }
+
+  void Prev() override {
+    do {
+      input_->Prev();
+    } while (!filter());
+  }
+
+  Slice key() const override { return input_->key(); }
+
+  Slice value() const override { return input_->value(); }
+
+  Status status() const override { return input_->status(); }
+
+  bool IsKeyPinned() const override { return input_->IsKeyPinned(); }
+
+  bool IsValuePinned() const override { return input_->IsValuePinned(); }
+
+ private:
+  InternalIterator* input_;
+  const std::string filter_ts_;
+  const Comparator* const cmp_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/import_column_family_job.cc b/src/rocksdb/db/import_column_family_job.cc
new file mode 100644
index 000000000..34985666a
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.cc
@@ -0,0 +1,312 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/import_column_family_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
+                                      SuperVersion* sv) {
+  Status status;
+
+  // Read the information of files we are importing
+  for (const auto& file_metadata : metadata_) {
+    const auto file_path = file_metadata.db_path + "/" + file_metadata.name;
+    IngestedFileInfo file_to_import;
+    status =
+        GetIngestedFileInfo(file_path, next_file_number++, &file_to_import, sv);
+    if (!status.ok()) {
+      return status;
+    }
+    files_to_import_.push_back(file_to_import);
+  }
+
+  auto num_files = files_to_import_.size();
+  if (num_files == 0) {
+    return Status::InvalidArgument("The list of files is empty");
+  } else if (num_files > 1) {
+    // Verify that passed files don't have overlapping ranges in any particular
+    // level.
+    int min_level = 1;  // Check for overlaps in Level 1 and above.
+    int max_level = -1;
+    for (const auto& file_metadata : metadata_) {
+      if (file_metadata.level > max_level) {
+        max_level = file_metadata.level;
+      }
+    }
+    for (int level = min_level; level <= max_level; ++level) {
+      autovector<const IngestedFileInfo*> sorted_files;
+      for (size_t i = 0; i < num_files; i++) {
+        if (metadata_[i].level == level) {
+          sorted_files.push_back(&files_to_import_[i]);
+        }
+      }
+
+      std::sort(
+          sorted_files.begin(), sorted_files.end(),
+          [this](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+            return cfd_->internal_comparator().Compare(
+                       info1->smallest_internal_key,
+                       info2->smallest_internal_key) < 0;
+          });
+
+      for (size_t i = 0; i + 1 < sorted_files.size(); i++) {
+        if (cfd_->internal_comparator().Compare(
+                sorted_files[i]->largest_internal_key,
+                sorted_files[i + 1]->smallest_internal_key) >= 0) {
+          return Status::InvalidArgument("Files have overlapping ranges");
+        }
+      }
+    }
+  }
+
+  for (const auto& f : files_to_import_) {
+    if (f.num_entries == 0) {
+      return Status::InvalidArgument("File contain no entries");
+    }
+
+    if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
+      return Status::Corruption("File has corrupted keys");
+    }
+  }
+
+  // Copy/Move external files into DB
+  auto hardlink_files = import_options_.move_files;
+  for (auto& f : files_to_import_) {
+    const auto path_outside_db = f.external_file_path;
+    const auto path_inside_db = TableFileName(
+        cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+
+    if (hardlink_files) {
+      status =
+          fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+      if (status.IsNotSupported()) {
+        // Original file is on a different FS, use copy instead of hard linking
+        hardlink_files = false;
+        ROCKS_LOG_INFO(db_options_.info_log,
+                       "Try to link file %s but it's not supported : %s",
+                       f.internal_file_path.c_str(), status.ToString().c_str());
+      }
+    }
+    if (!hardlink_files) {
+      status =
+          CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+                   db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+    }
+    if (!status.ok()) {
+      break;
+    }
+    f.copy_file = !hardlink_files;
+    f.internal_file_path = path_inside_db;
+  }
+
+  if (!status.ok()) {
+    // We failed, remove all files that we copied into the db
+    for (const auto& f : files_to_import_) {
+      if (f.internal_file_path.empty()) {
+        break;
+      }
+      const auto s =
+          fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+
+  return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+  Status status;
+  edit_.SetColumnFamily(cfd_->GetID());
+
+  // We use the import time as the ancester time. This is the time the data
+  // is written to the database.
+  int64_t temp_current_time = 0;
+  uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+  uint64_t current_time = kUnknownOldestAncesterTime;
+  if (clock_->GetCurrentTime(&temp_current_time).ok()) {
+    current_time = oldest_ancester_time =
+        static_cast<uint64_t>(temp_current_time);
+  }
+
+  for (size_t i = 0; i < files_to_import_.size(); ++i) {
+    const auto& f = files_to_import_[i];
+    const auto& file_metadata = metadata_[i];
+
+    edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+                  f.fd.GetFileSize(), f.smallest_internal_key,
+                  f.largest_internal_key, file_metadata.smallest_seqno,
+                  file_metadata.largest_seqno, false, file_metadata.temperature,
+                  kInvalidBlobFileNumber, oldest_ancester_time, current_time,
+                  kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+                  f.unique_id);
+
+    // If incoming sequence number is higher, update local sequence number.
+    if (file_metadata.largest_seqno > versions_->LastSequence()) {
+      versions_->SetLastAllocatedSequence(file_metadata.largest_seqno);
+      versions_->SetLastPublishedSequence(file_metadata.largest_seqno);
+      versions_->SetLastSequence(file_metadata.largest_seqno);
+    }
+  }
+
+  return status;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+  if (!status.ok()) {
+    // We failed to add files to the database remove all the files we copied.
+    for (const auto& f : files_to_import_) {
+      const auto s =
+          fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "AddFile() clean up for file %s failed : %s",
+                       f.internal_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  } else if (status.ok() && import_options_.move_files) {
+    // The files were moved and added successfully, remove original file links
+    for (IngestedFileInfo& f : files_to_import_) {
+      const auto s =
+          fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "%s was added to DB successfully but failed to remove original "
+            "file link : %s",
+            f.external_file_path.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+}
+
+Status ImportColumnFamilyJob::GetIngestedFileInfo(
+    const std::string& external_file, uint64_t new_file_number,
+    IngestedFileInfo* file_to_import, SuperVersion* sv) {
+  file_to_import->external_file_path = external_file;
+
+  // Get external file size
+  Status status = fs_->GetFileSize(external_file, IOOptions(),
+                                   &file_to_import->file_size, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Assign FD with number
+  file_to_import->fd =
+      FileDescriptor(new_file_number, 0, file_to_import->file_size);
+
+  // Create TableReader for external file
+  std::unique_ptr<TableReader> table_reader;
+  std::unique_ptr<FSRandomAccessFile> sst_file;
+  std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+  status =
+      fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+  sst_file_reader.reset(new RandomAccessFileReader(
+      std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
+
+  status = cfd_->ioptions()->table_factory->NewTableReader(
+      TableReaderOptions(
+          *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
+          env_options_, cfd_->internal_comparator(),
+          /*skip_filters*/ false, /*immortal*/ false,
+          /*force_direct_prefetch*/ false, /*level*/ -1,
+          /*block_cache_tracer*/ nullptr,
+          /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
+          /*cur_file_num*/ new_file_number),
+      std::move(sst_file_reader), file_to_import->file_size, &table_reader);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Get the external file properties
+  auto props = table_reader->GetTableProperties();
+
+  // Set original_seqno to 0.
+  file_to_import->original_seqno = 0;
+
+  // Get number of entries in table
+  file_to_import->num_entries = props->num_entries;
+
+  ParsedInternalKey key;
+  ReadOptions ro;
+  // During reading the external file we can cache blocks that we read into
+  // the block cache, if we later change the global seqno of this file, we will
+  // have block in cache that will include keys with wrong seqno.
+  // We need to disable fill_cache so that we read from the file without
+  // updating the block cache.
+  ro.fill_cache = false;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+  // Get first (smallest) key from file
+  iter->SeekToFirst();
+  Status pik_status =
+      ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
+  if (!pik_status.ok()) {
+    return Status::Corruption("Corrupted Key in external file. ",
+                              pik_status.getState());
+  }
+  file_to_import->smallest_internal_key.SetFrom(key);
+
+  // Get last (largest) key from file
+  iter->SeekToLast();
+  pik_status =
+      ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
+  if (!pik_status.ok()) {
+    return Status::Corruption("Corrupted Key in external file. ",
+                              pik_status.getState());
+  }
+  file_to_import->largest_internal_key.SetFrom(key);
+
+  file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+  file_to_import->table_properties = *props;
+
+  auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
+                                  props->orig_file_number,
+                                  &(file_to_import->unique_id));
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to get SST unique id for file %s",
+                   file_to_import->internal_file_path.c_str());
+  }
+
+  return status;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/import_column_family_job.h b/src/rocksdb/db/import_column_family_job.h
new file mode 100644
index 000000000..57c49c67f
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.h
@@ -0,0 +1,82 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct EnvOptions;
+class SystemClock;
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
+class ImportColumnFamilyJob {
+ public:
+  ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd,
+                        const ImmutableDBOptions& db_options,
+                        const EnvOptions& env_options,
+                        const ImportColumnFamilyOptions& import_options,
+                        const std::vector<LiveFileMetaData>& metadata,
+                        const std::shared_ptr<IOTracer>& io_tracer)
+      : clock_(db_options.clock),
+        versions_(versions),
+        cfd_(cfd),
+        db_options_(db_options),
+        fs_(db_options_.fs, io_tracer),
+        env_options_(env_options),
+        import_options_(import_options),
+        metadata_(metadata),
+        io_tracer_(io_tracer) {}
+
+  // Prepare the job by copying external files into the DB.
+  Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+  // Will execute the import job and prepare edit() to be applied.
+  // REQUIRES: Mutex held
+  Status Run();
+
+  // Cleanup after successful/failed job
+  void Cleanup(const Status& status);
+
+  VersionEdit* edit() { return &edit_; }
+
+  const autovector<IngestedFileInfo>& files_to_import() const {
+    return files_to_import_;
+  }
+
+ private:
+  // Open the external file and populate `file_to_import` with all the
+  // external information we need to import this file.
+  Status GetIngestedFileInfo(const std::string& external_file,
+                             uint64_t new_file_number,
+                             IngestedFileInfo* file_to_import,
+                             SuperVersion* sv);
+
+  SystemClock* clock_;
+  VersionSet* versions_;
+  ColumnFamilyData* cfd_;
+  const ImmutableDBOptions& db_options_;
+  const FileSystemPtr fs_;
+  const EnvOptions& env_options_;
+  autovector<IngestedFileInfo> files_to_import_;
+  VersionEdit edit_;
+  const ImportColumnFamilyOptions& import_options_;
+  std::vector<LiveFileMetaData> metadata_;
+  const std::shared_ptr<IOTracer> io_tracer_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/import_column_family_test.cc b/src/rocksdb/db/import_column_family_test.cc
new file mode 100644
index 000000000..2847ea8da
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_test.cc
@@ -0,0 +1,644 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ImportColumnFamilyTest : public DBTestBase {
+ public:
+  ImportColumnFamilyTest()
+      : DBTestBase("import_column_family_test", /*env_do_fsync=*/true) {
+    sst_files_dir_ = dbname_ + "/sst_files/";
+    export_files_dir_ = test::PerThreadDBPath(env_, "export");
+    DestroyAndRecreateExternalSSTFilesDir();
+    import_cfh_ = nullptr;
+    import_cfh2_ = nullptr;
+    metadata_ptr_ = nullptr;
+  }
+
+  ~ImportColumnFamilyTest() {
+    if (import_cfh_) {
+      EXPECT_OK(db_->DropColumnFamily(import_cfh_));
+      EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+      import_cfh_ = nullptr;
+    }
+    if (import_cfh2_) {
+      EXPECT_OK(db_->DropColumnFamily(import_cfh2_));
+      EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh2_));
+      import_cfh2_ = nullptr;
+    }
+    if (metadata_ptr_) {
+      delete metadata_ptr_;
+      metadata_ptr_ = nullptr;
+    }
+    EXPECT_OK(DestroyDir(env_, sst_files_dir_));
+    EXPECT_OK(DestroyDir(env_, export_files_dir_));
+  }
+
+  void DestroyAndRecreateExternalSSTFilesDir() {
+    EXPECT_OK(DestroyDir(env_, sst_files_dir_));
+    EXPECT_OK(env_->CreateDir(sst_files_dir_));
+    EXPECT_OK(DestroyDir(env_, export_files_dir_));
+  }
+
+  LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path,
+                                        int level,
+                                        SequenceNumber smallest_seqno,
+                                        SequenceNumber largest_seqno) {
+    LiveFileMetaData metadata;
+    metadata.name = name;
+    metadata.db_path = path;
+    metadata.smallest_seqno = smallest_seqno;
+    metadata.largest_seqno = largest_seqno;
+    metadata.level = level;
+    return metadata;
+  }
+
+ protected:
+  std::string sst_files_dir_;
+  std::string export_files_dir_;
+  ColumnFamilyHandle* import_cfh_;
+  ColumnFamilyHandle* import_cfh2_;
+  ExportImportFilesMetaData* metadata_ptr_;
+};
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko"}, options);
+
+  SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+  SstFileWriter sfw_unknown(EnvOptions(), options);
+
+  // cf1.sst
+  const std::string cf1_sst_name = "cf1.sst";
+  const std::string cf1_sst = sst_files_dir_ + cf1_sst_name;
+  ASSERT_OK(sfw_cf1.Open(cf1_sst));
+  ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+  ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // cf_unknown.sst
+  const std::string unknown_sst_name = "cf_unknown.sst";
+  const std::string unknown_sst = sst_files_dir_ + unknown_sst_name;
+  ASSERT_OK(sfw_unknown.Open(unknown_sst));
+  ASSERT_OK(sfw_unknown.Put("K3", "V1"));
+  ASSERT_OK(sfw_unknown.Put("K4", "V2"));
+  ASSERT_OK(sfw_unknown.Finish());
+
+  {
+    // Import sst file corresponding to cf1 onto a new cf and verify
+    ExportImportFilesMetaData metadata;
+    metadata.files.push_back(
+        LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19));
+    metadata.db_comparator_name = options.comparator->Name();
+
+    ASSERT_OK(db_->CreateColumnFamilyWithImport(
+        options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+    ASSERT_NE(import_cfh_, nullptr);
+
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value));
+    ASSERT_EQ(value, "V1");
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value));
+    ASSERT_EQ(value, "V2");
+    ASSERT_OK(db_->DropColumnFamily(import_cfh_));
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+    import_cfh_ = nullptr;
+  }
+
+  {
+    // Import sst file corresponding to unknown cf onto a new cf and verify
+    ExportImportFilesMetaData metadata;
+    metadata.files.push_back(
+        LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29));
+    metadata.db_comparator_name = options.comparator->Name();
+
+    ASSERT_OK(db_->CreateColumnFamilyWithImport(
+        options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+    ASSERT_NE(import_cfh_, nullptr);
+
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K3", &value));
+    ASSERT_EQ(value, "V1");
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value));
+    ASSERT_EQ(value, "V2");
+  }
+  EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+  import_cfh_ = nullptr;
+
+  // verify sst unique id during reopen
+  options.verify_sst_unique_id_in_manifest = true;
+  ReopenWithColumnFamilies({"default", "koko", "yoyo"}, options);
+}
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko"}, options);
+
+  SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+
+  // file3.sst
+  const std::string file3_sst_name = "file3.sst";
+  const std::string file3_sst = sst_files_dir_ + file3_sst_name;
+  ASSERT_OK(sfw_cf1.Open(file3_sst));
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_val"));
+  }
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // file2.sst
+  const std::string file2_sst_name = "file2.sst";
+  const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+  ASSERT_OK(sfw_cf1.Open(file2_sst));
+  for (int i = 0; i < 100; i += 2) {
+    ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"));
+  }
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // file1a.sst
+  const std::string file1a_sst_name = "file1a.sst";
+  const std::string file1a_sst = sst_files_dir_ + file1a_sst_name;
+  ASSERT_OK(sfw_cf1.Open(file1a_sst));
+  for (int i = 0; i < 52; i += 4) {
+    ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"));
+  }
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // file1b.sst
+  const std::string file1b_sst_name = "file1b.sst";
+  const std::string file1b_sst = sst_files_dir_ + file1b_sst_name;
+  ASSERT_OK(sfw_cf1.Open(file1b_sst));
+  for (int i = 52; i < 100; i += 4) {
+    ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"));
+  }
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // file0a.sst
+  const std::string file0a_sst_name = "file0a.sst";
+  const std::string file0a_sst = sst_files_dir_ + file0a_sst_name;
+  ASSERT_OK(sfw_cf1.Open(file0a_sst));
+  for (int i = 0; i < 100; i += 16) {
+    ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"));
+  }
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // file0b.sst
+  const std::string file0b_sst_name = "file0b.sst";
+  const std::string file0b_sst = sst_files_dir_ + file0b_sst_name;
+  ASSERT_OK(sfw_cf1.Open(file0b_sst));
+  for (int i = 0; i < 100; i += 16) {
+    ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"));
+  }
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // Import sst files and verify
+  ExportImportFilesMetaData metadata;
+  metadata.files.push_back(
+      LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19));
+  metadata.files.push_back(
+      LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29));
+  metadata.files.push_back(
+      LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34));
+  metadata.files.push_back(
+      LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39));
+  metadata.files.push_back(
+      LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49));
+  metadata.files.push_back(
+      LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59));
+  metadata.db_comparator_name = options.comparator->Name();
+
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(
+      options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+  ASSERT_NE(import_cfh_, nullptr);
+
+  for (int i = 0; i < 100; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value));
+    if (i % 16 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite4");
+    } else if (i % 4 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite2");
+    } else if (i % 2 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite1");
+    } else {
+      ASSERT_EQ(value, Key(i) + "_val");
+    }
+  }
+
+  for (int i = 0; i < 100; i += 5) {
+    ASSERT_OK(
+        db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5"));
+  }
+
+  // Flush and check again
+  ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+  for (int i = 0; i < 100; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value));
+    if (i % 5 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite5");
+    } else if (i % 16 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite4");
+    } else if (i % 4 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite2");
+    } else if (i % 2 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite1");
+    } else {
+      ASSERT_EQ(value, Key(i) + "_val");
+    }
+  }
+
+  // Compact and check again.
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+  for (int i = 0; i < 100; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value));
+    if (i % 5 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite5");
+    } else if (i % 16 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite4");
+    } else if (i % 4 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite2");
+    } else if (i % 2 == 0) {
+      ASSERT_EQ(value, Key(i) + "_overwrite1");
+    } else {
+      ASSERT_EQ(value, Key(i) + "_val");
+    }
+  }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko"}, options);
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_val"));
+  }
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+  // Overwrite the value in the same set of keys.
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite"));
+  }
+
+  // Flush to create L0 file.
+  ASSERT_OK(Flush(1));
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2"));
+  }
+
+  // Flush again to create another L0 file. It should have higher sequencer.
+  ASSERT_OK(Flush(1));
+
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+                                           &metadata_ptr_));
+  ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;
+
+  ImportColumnFamilyOptions import_options;
+  import_options.move_files = false;
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options,
+                                              *metadata_ptr_, &import_cfh_));
+  ASSERT_NE(import_cfh_, nullptr);
+
+  import_options.move_files = true;
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options,
+                                              *metadata_ptr_, &import_cfh2_));
+  ASSERT_NE(import_cfh2_, nullptr);
+  delete metadata_ptr_;
+  metadata_ptr_ = NULL;
+
+  std::string value1, value2;
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Get(1, Key(i)), value1);
+  }
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+    ASSERT_EQ(Get(1, Key(i)), value2);
+  }
+
+  // Modify keys in cf1 and verify.
+  for (int i = 0; i < 25; i++) {
+    ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i)));
+  }
+  for (int i = 25; i < 50; i++) {
+    ASSERT_OK(
+        db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3"));
+  }
+  for (int i = 0; i < 25; ++i) {
+    ASSERT_TRUE(
+        db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+  }
+  for (int i = 25; i < 50; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Key(i) + "_overwrite3", value1);
+  }
+  for (int i = 50; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Key(i) + "_overwrite2", value1);
+  }
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+    ASSERT_EQ(Get(1, Key(i)), value2);
+  }
+
+  // Compact and check again.
+  ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+
+  for (int i = 0; i < 25; ++i) {
+    ASSERT_TRUE(
+        db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+  }
+  for (int i = 25; i < 50; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Key(i) + "_overwrite3", value1);
+  }
+  for (int i = 50; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Key(i) + "_overwrite2", value1);
+  }
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+    ASSERT_EQ(Get(1, Key(i)), value2);
+  }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko"}, options);
+
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_val"));
+  }
+  ASSERT_OK(Flush(1));
+
+  // Compact to create a L1 file.
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+  // Overwrite the value in the same set of keys.
+  for (int i = 0; i < 50; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite"));
+  }
+
+  // Flush to create L0 file.
+  ASSERT_OK(Flush(1));
+
+  for (int i = 0; i < 25; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2"));
+  }
+
+  // Flush again to create another L0 file. It should have higher sequencer.
+  ASSERT_OK(Flush(1));
+
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+                                           &metadata_ptr_));
+  ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;
+
+  // Create a new db and import the files.
+  DB* db_copy;
+  ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+  ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+                                                  ImportColumnFamilyOptions(),
+                                                  *metadata_ptr_, &cfh));
+  ASSERT_NE(cfh, nullptr);
+
+  for (int i = 0; i < 100; ++i) {
+    std::string value;
+    ASSERT_OK(db_copy->Get(ReadOptions(), cfh, Key(i), &value));
+    ASSERT_EQ(Get(1, Key(i)), value);
+  }
+  ASSERT_OK(db_copy->DropColumnFamily(cfh));
+  ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
+  delete db_copy;
+  ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+}
+
+TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) {
+  // Imports a column family containing a level where two files overlap at their
+  // endpoints. "Overlap" means the largest user key in one file is the same as
+  // the smallest user key in the second file.
+  const int kFileBytes = 128 << 10;  // 128KB
+  const int kValueBytes = 1 << 10;   // 1KB
+  const int kNumFiles = 4;
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.num_levels = 2;
+  CreateAndReopenWithCF({"koko"}, options);
+
+  Random rnd(301);
+  // Every key is snapshot protected to ensure older versions will not be
+  // dropped during compaction.
+  std::vector<const Snapshot*> snapshots;
+  snapshots.reserve(kFileBytes / kValueBytes * kNumFiles);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+      auto value = rnd.RandomString(kValueBytes);
+      ASSERT_OK(Put(1, "key", value));
+      snapshots.push_back(db_->GetSnapshot());
+    }
+    ASSERT_OK(Flush(1));
+  }
+
+  // Compact to create overlapping L1 files.
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+  ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+                                           &metadata_ptr_));
+  ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;
+
+  // Create a new db and import the files.
+  DB* db_copy;
+  ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+  ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+                                                  ImportColumnFamilyOptions(),
+                                                  *metadata_ptr_, &cfh));
+  ASSERT_NE(cfh, nullptr);
+
+  {
+    std::string value;
+    ASSERT_OK(db_copy->Get(ReadOptions(), cfh, "key", &value));
+  }
+  ASSERT_OK(db_copy->DropColumnFamily(cfh));
+  ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
+  delete db_copy;
+  ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+  for (const Snapshot* snapshot : snapshots) {
+    db_->ReleaseSnapshot(snapshot);
+  }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko"}, options);
+
+  {
+    // Create column family with existing cf name.
+    ExportImportFilesMetaData metadata;
+
+    ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko",
+                                                ImportColumnFamilyOptions(),
+                                                metadata, &import_cfh_),
+              Status::InvalidArgument("Column family already exists"));
+    ASSERT_EQ(import_cfh_, nullptr);
+  }
+
+  {
+    // Import with no files specified.
+    ExportImportFilesMetaData metadata;
+
+    ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+                                                ImportColumnFamilyOptions(),
+                                                metadata, &import_cfh_),
+              Status::InvalidArgument("The list of files is empty"));
+    ASSERT_EQ(import_cfh_, nullptr);
+  }
+
+  {
+    // Import with overlapping keys in sst files.
+    ExportImportFilesMetaData metadata;
+    SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+    const std::string file1_sst_name = "file1.sst";
+    const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+    ASSERT_OK(sfw_cf1.Open(file1_sst));
+    ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+    ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+    ASSERT_OK(sfw_cf1.Finish());
+    const std::string file2_sst_name = "file2.sst";
+    const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+    ASSERT_OK(sfw_cf1.Open(file2_sst));
+    ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+    ASSERT_OK(sfw_cf1.Put("K3", "V3"));
+    ASSERT_OK(sfw_cf1.Finish());
+
+    metadata.files.push_back(
+        LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+    metadata.files.push_back(
+        LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19));
+    metadata.db_comparator_name = options.comparator->Name();
+
+    ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+                                                ImportColumnFamilyOptions(),
+                                                metadata, &import_cfh_),
+              Status::InvalidArgument("Files have overlapping ranges"));
+    ASSERT_EQ(import_cfh_, nullptr);
+  }
+
+  {
+    // Import with a mismatching comparator, should fail with appropriate error.
+    ExportImportFilesMetaData metadata;
+    Options mismatch_options = CurrentOptions();
+    mismatch_options.comparator = ReverseBytewiseComparator();
+    SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]);
+    const std::string file1_sst_name = "file1.sst";
+    const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+    ASSERT_OK(sfw_cf1.Open(file1_sst));
+    ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+    ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+    ASSERT_OK(sfw_cf1.Finish());
+
+    metadata.files.push_back(
+        LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+    metadata.db_comparator_name = mismatch_options.comparator->Name();
+
+    ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco",
+                                                ImportColumnFamilyOptions(),
+                                                metadata, &import_cfh_),
+              Status::InvalidArgument("Comparator name mismatch"));
+    ASSERT_EQ(import_cfh_, nullptr);
+  }
+
+  {
+    // Import with non existent sst file should fail with appropriate error
+    ExportImportFilesMetaData metadata;
+    SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+    const std::string file1_sst_name = "file1.sst";
+    const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+    ASSERT_OK(sfw_cf1.Open(file1_sst));
+    ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+    ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+    ASSERT_OK(sfw_cf1.Finish());
+    const std::string file3_sst_name = "file3.sst";
+
+    metadata.files.push_back(
+        LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+    metadata.files.push_back(
+        LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19));
+    metadata.db_comparator_name = options.comparator->Name();
+
+    ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+                                                ImportColumnFamilyOptions(),
+                                                metadata, &import_cfh_),
+              Status::IOError("No such file or directory"));
+    ASSERT_EQ(import_cfh_, nullptr);
+
+    // Test successful import after a failure with the same CF name. Ensures
+    // there is no side effect with CF when there is a failed import
+    metadata.files.pop_back();
+    metadata.db_comparator_name = options.comparator->Name();
+
+    ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+                                                ImportColumnFamilyOptions(),
+                                                metadata, &import_cfh_));
+    ASSERT_NE(import_cfh_, nullptr);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as External SST File Writer and Import are not supported "
+          "in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/internal_stats.cc b/src/rocksdb/db/internal_stats.cc
new file mode 100644
index 000000000..ac5b81f3e
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.cc
@@ -0,0 +1,2002 @@
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/internal_stats.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstddef>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_entry_stats.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/hash_containers.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+const std::map<LevelStatType, LevelStat> InternalStats::compaction_level_stats =
+    {
+        {LevelStatType::NUM_FILES, LevelStat{"NumFiles", "Files"}},
+        {LevelStatType::COMPACTED_FILES,
+         LevelStat{"CompactedFiles", "CompactedFiles"}},
+        {LevelStatType::SIZE_BYTES, LevelStat{"SizeBytes", "Size"}},
+        {LevelStatType::SCORE, LevelStat{"Score", "Score"}},
+        {LevelStatType::READ_GB, LevelStat{"ReadGB", "Read(GB)"}},
+        {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}},
+        {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}},
+        {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}},
+        {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}},
+        {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}},
+        {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}},
+        {LevelStatType::READ_MBPS, LevelStat{"ReadMBps", "Rd(MB/s)"}},
+        {LevelStatType::WRITE_MBPS, LevelStat{"WriteMBps", "Wr(MB/s)"}},
+        {LevelStatType::COMP_SEC, LevelStat{"CompSec", "Comp(sec)"}},
+        {LevelStatType::COMP_CPU_SEC,
+         LevelStat{"CompMergeCPU", "CompMergeCPU(sec)"}},
+        {LevelStatType::COMP_COUNT, LevelStat{"CompCount", "Comp(cnt)"}},
+        {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}},
+        {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}},
+        {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}},
+        {LevelStatType::R_BLOB_GB, LevelStat{"RblobGB", "Rblob(GB)"}},
+        {LevelStatType::W_BLOB_GB, LevelStat{"WblobGB", "Wblob(GB)"}},
+};
+
+const std::map<InternalStats::InternalDBStatsType, DBStatInfo>
+    InternalStats::db_stats_type_to_info = {
+        {InternalStats::kIntStatsWalFileBytes,
+         DBStatInfo{"db.wal_bytes_written"}},
+        {InternalStats::kIntStatsWalFileSynced, DBStatInfo{"db.wal_syncs"}},
+        {InternalStats::kIntStatsBytesWritten,
+         DBStatInfo{"db.user_bytes_written"}},
+        {InternalStats::kIntStatsNumKeysWritten,
+         DBStatInfo{"db.user_keys_written"}},
+        {InternalStats::kIntStatsWriteDoneByOther,
+         DBStatInfo{"db.user_writes_by_other"}},
+        {InternalStats::kIntStatsWriteDoneBySelf,
+         DBStatInfo{"db.user_writes_by_self"}},
+        {InternalStats::kIntStatsWriteWithWal,
+         DBStatInfo{"db.user_writes_with_wal"}},
+        {InternalStats::kIntStatsWriteStallMicros,
+         DBStatInfo{"db.user_write_stall_micros"}},
+};
+
+namespace {
+const double kMB = 1048576.0;
+const double kGB = kMB * 1024;
+const double kMicrosInSec = 1000000.0;
+
+void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name,
+                           const std::string& group_by) {
+  int written_size =
+      snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str());
+  written_size = std::min(written_size, static_cast<int>(len));
+  auto hdr = [](LevelStatType t) {
+    return InternalStats::compaction_level_stats.at(t).header_name.c_str();
+  };
+  int line_size = snprintf(
+      buf + written_size, len - written_size,
+      "%s    %s   %s     %s %s  %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s "
+      "%s\n",
+      // Note that we skip COMPACTED_FILES and merge it with Files column
+      group_by.c_str(), hdr(LevelStatType::NUM_FILES),
+      hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE),
+      hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB),
+      hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB),
+      hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB),
+      hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS),
+      hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC),
+      hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT),
+      hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN),
+      hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB),
+      hdr(LevelStatType::W_BLOB_GB));
+
+  written_size += line_size;
+  written_size = std::min(written_size, static_cast<int>(len));
+  snprintf(buf + written_size, len - written_size, "%s\n",
+           std::string(line_size, '-').c_str());
+}
+
+void PrepareLevelStats(std::map<LevelStatType, double>* level_stats,
+                       int num_files, int being_compacted,
+                       double total_file_size, double score, double w_amp,
+                       const InternalStats::CompactionStats& stats) {
+  const uint64_t bytes_read = stats.bytes_read_non_output_levels +
+                              stats.bytes_read_output_level +
+                              stats.bytes_read_blob;
+  const uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+  const int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level;
+  const double elapsed = (stats.micros + 1) / kMicrosInSec;
+
+  (*level_stats)[LevelStatType::NUM_FILES] = num_files;
+  (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted;
+  (*level_stats)[LevelStatType::SIZE_BYTES] = total_file_size;
+  (*level_stats)[LevelStatType::SCORE] = score;
+  (*level_stats)[LevelStatType::READ_GB] = bytes_read / kGB;
+  (*level_stats)[LevelStatType::RN_GB] =
+      stats.bytes_read_non_output_levels / kGB;
+  (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB;
+  (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB;
+  (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB;
+  (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB;
+  (*level_stats)[LevelStatType::WRITE_AMP] = w_amp;
+  (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed;
+  (*level_stats)[LevelStatType::WRITE_MBPS] = bytes_written / kMB / elapsed;
+  (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec;
+  (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec;
+  (*level_stats)[LevelStatType::COMP_COUNT] = stats.count;
+  (*level_stats)[LevelStatType::AVG_SEC] =
+      stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count;
+  (*level_stats)[LevelStatType::KEY_IN] =
+      static_cast<double>(stats.num_input_records);
+  (*level_stats)[LevelStatType::KEY_DROP] =
+      static_cast<double>(stats.num_dropped_records);
+  (*level_stats)[LevelStatType::R_BLOB_GB] = stats.bytes_read_blob / kGB;
+  (*level_stats)[LevelStatType::W_BLOB_GB] = stats.bytes_written_blob / kGB;
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+                     const std::map<LevelStatType, double>& stat_value) {
+  snprintf(
+      buf, len,
+      "%4s "      /*  Level */
+      "%6d/%-3d " /*  Files */
+      "%8s "      /*  Size */
+      "%5.1f "    /*  Score */
+      "%8.1f "    /*  Read(GB) */
+      "%7.1f "    /*  Rn(GB) */
+      "%8.1f "    /*  Rnp1(GB) */
+      "%9.1f "    /*  Write(GB) */
+      "%8.1f "    /*  Wnew(GB) */
+      "%9.1f "    /*  Moved(GB) */
+      "%5.1f "    /*  W-Amp */
+      "%8.1f "    /*  Rd(MB/s) */
+      "%8.1f "    /*  Wr(MB/s) */
+      "%9.2f "    /*  Comp(sec) */
+      "%17.2f "   /*  CompMergeCPU(sec) */
+      "%9d "      /*  Comp(cnt) */
+      "%8.3f "    /*  Avg(sec) */
+      "%7s "      /*  KeyIn */
+      "%6s "      /*  KeyDrop */
+      "%9.1f "    /*  Rblob(GB) */
+      "%9.1f\n",  /*  Wblob(GB) */
+      name.c_str(), static_cast<int>(stat_value.at(LevelStatType::NUM_FILES)),
+      static_cast<int>(stat_value.at(LevelStatType::COMPACTED_FILES)),
+      BytesToHumanString(
+          static_cast<uint64_t>(stat_value.at(LevelStatType::SIZE_BYTES)))
+          .c_str(),
+      stat_value.at(LevelStatType::SCORE),
+      stat_value.at(LevelStatType::READ_GB),
+      stat_value.at(LevelStatType::RN_GB),
+      stat_value.at(LevelStatType::RNP1_GB),
+      stat_value.at(LevelStatType::WRITE_GB),
+      stat_value.at(LevelStatType::W_NEW_GB),
+      stat_value.at(LevelStatType::MOVED_GB),
+      stat_value.at(LevelStatType::WRITE_AMP),
+      stat_value.at(LevelStatType::READ_MBPS),
+      stat_value.at(LevelStatType::WRITE_MBPS),
+      stat_value.at(LevelStatType::COMP_SEC),
+      stat_value.at(LevelStatType::COMP_CPU_SEC),
+      static_cast<int>(stat_value.at(LevelStatType::COMP_COUNT)),
+      stat_value.at(LevelStatType::AVG_SEC),
+      NumberToHumanString(
+          static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_IN)))
+          .c_str(),
+      NumberToHumanString(
+          static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_DROP)))
+          .c_str(),
+      stat_value.at(LevelStatType::R_BLOB_GB),
+      stat_value.at(LevelStatType::W_BLOB_GB));
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+                     int num_files, int being_compacted, double total_file_size,
+                     double score, double w_amp,
+                     const InternalStats::CompactionStats& stats) {
+  std::map<LevelStatType, double> level_stats;
+  PrepareLevelStats(&level_stats, num_files, being_compacted, total_file_size,
+                    score, w_amp, stats);
+  PrintLevelStats(buf, len, name, level_stats);
+}
+
+// Assumes that trailing numbers represent an optional argument. This requires
+// property names to not end with numbers.
+std::pair<Slice, Slice> GetPropertyNameAndArg(const Slice& property) {
+  Slice name = property, arg = property;
+  size_t sfx_len = 0;
+  while (sfx_len < property.size() &&
+         isdigit(property[property.size() - sfx_len - 1])) {
+    ++sfx_len;
+  }
+  name.remove_suffix(sfx_len);
+  arg.remove_prefix(property.size() - sfx_len);
+  return {name, arg};
+}
+}  // anonymous namespace
+
+static const std::string rocksdb_prefix = "rocksdb.";
+
+static const std::string num_files_at_level_prefix = "num-files-at-level";
+static const std::string compression_ratio_at_level_prefix =
+    "compression-ratio-at-level";
+static const std::string allstats = "stats";
+static const std::string sstables = "sstables";
+static const std::string cfstats = "cfstats";
+static const std::string cfstats_no_file_histogram =
+    "cfstats-no-file-histogram";
+static const std::string cf_file_histogram = "cf-file-histogram";
+static const std::string dbstats = "dbstats";
+static const std::string levelstats = "levelstats";
+static const std::string block_cache_entry_stats = "block-cache-entry-stats";
+static const std::string fast_block_cache_entry_stats =
+    "fast-block-cache-entry-stats";
+static const std::string num_immutable_mem_table = "num-immutable-mem-table";
+static const std::string num_immutable_mem_table_flushed =
+    "num-immutable-mem-table-flushed";
+static const std::string mem_table_flush_pending = "mem-table-flush-pending";
+static const std::string compaction_pending = "compaction-pending";
+static const std::string background_errors = "background-errors";
+static const std::string cur_size_active_mem_table =
+    "cur-size-active-mem-table";
+static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables";
+static const std::string size_all_mem_tables = "size-all-mem-tables";
+static const std::string num_entries_active_mem_table =
+    "num-entries-active-mem-table";
+static const std::string num_entries_imm_mem_tables =
+    "num-entries-imm-mem-tables";
+static const std::string num_deletes_active_mem_table =
+    "num-deletes-active-mem-table";
+static const std::string num_deletes_imm_mem_tables =
+    "num-deletes-imm-mem-tables";
+static const std::string estimate_num_keys = "estimate-num-keys";
+static const std::string estimate_table_readers_mem =
+    "estimate-table-readers-mem";
+static const std::string is_file_deletions_enabled =
+    "is-file-deletions-enabled";
+static const std::string num_snapshots = "num-snapshots";
+static const std::string oldest_snapshot_time = "oldest-snapshot-time";
+static const std::string oldest_snapshot_sequence = "oldest-snapshot-sequence";
+static const std::string num_live_versions = "num-live-versions";
+static const std::string current_version_number =
+    "current-super-version-number";
+static const std::string estimate_live_data_size = "estimate-live-data-size";
+static const std::string min_log_number_to_keep_str = "min-log-number-to-keep";
+static const std::string min_obsolete_sst_number_to_keep_str =
+    "min-obsolete-sst-number-to-keep";
+static const std::string base_level_str = "base-level";
+static const std::string total_sst_files_size = "total-sst-files-size";
+static const std::string live_sst_files_size = "live-sst-files-size";
+static const std::string live_sst_files_size_at_temperature =
+    "live-sst-files-size-at-temperature";
+static const std::string estimate_pending_comp_bytes =
+    "estimate-pending-compaction-bytes";
+static const std::string aggregated_table_properties =
+    "aggregated-table-properties";
+static const std::string aggregated_table_properties_at_level =
+    aggregated_table_properties + "-at-level";
+static const std::string num_running_compactions = "num-running-compactions";
+static const std::string num_running_flushes = "num-running-flushes";
+static const std::string actual_delayed_write_rate =
+    "actual-delayed-write-rate";
+static const std::string is_write_stopped = "is-write-stopped";
+static const std::string estimate_oldest_key_time = "estimate-oldest-key-time";
+static const std::string block_cache_capacity = "block-cache-capacity";
+static const std::string block_cache_usage = "block-cache-usage";
+static const std::string block_cache_pinned_usage = "block-cache-pinned-usage";
+static const std::string options_statistics = "options-statistics";
+static const std::string num_blob_files = "num-blob-files";
+static const std::string blob_stats = "blob-stats";
+static const std::string total_blob_file_size = "total-blob-file-size";
+static const std::string live_blob_file_size = "live-blob-file-size";
+static const std::string live_blob_file_garbage_size =
+    "live-blob-file-garbage-size";
+static const std::string blob_cache_capacity = "blob-cache-capacity";
+static const std::string blob_cache_usage = "blob-cache-usage";
+static const std::string blob_cache_pinned_usage = "blob-cache-pinned-usage";
+
+const std::string DB::Properties::kNumFilesAtLevelPrefix =
+    rocksdb_prefix + num_files_at_level_prefix;
+const std::string DB::Properties::kCompressionRatioAtLevelPrefix =
+    rocksdb_prefix + compression_ratio_at_level_prefix;
+const std::string DB::Properties::kStats = rocksdb_prefix + allstats;
+const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables;
+const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats;
+const std::string DB::Properties::kCFStatsNoFileHistogram =
+    rocksdb_prefix + cfstats_no_file_histogram;
+const std::string DB::Properties::kCFFileHistogram =
+    rocksdb_prefix + cf_file_histogram;
+const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats;
+const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats;
+const std::string DB::Properties::kBlockCacheEntryStats =
+    rocksdb_prefix + block_cache_entry_stats;
+const std::string DB::Properties::kFastBlockCacheEntryStats =
+    rocksdb_prefix + fast_block_cache_entry_stats;
+const std::string DB::Properties::kNumImmutableMemTable =
+    rocksdb_prefix + num_immutable_mem_table;
+const std::string DB::Properties::kNumImmutableMemTableFlushed =
+    rocksdb_prefix + num_immutable_mem_table_flushed;
+const std::string DB::Properties::kMemTableFlushPending =
+    rocksdb_prefix + mem_table_flush_pending;
+const std::string DB::Properties::kCompactionPending =
+    rocksdb_prefix + compaction_pending;
+const std::string DB::Properties::kNumRunningCompactions =
+    rocksdb_prefix + num_running_compactions;
+const std::string DB::Properties::kNumRunningFlushes =
+    rocksdb_prefix + num_running_flushes;
+const std::string DB::Properties::kBackgroundErrors =
+    rocksdb_prefix + background_errors;
+const std::string DB::Properties::kCurSizeActiveMemTable =
+    rocksdb_prefix + cur_size_active_mem_table;
+const std::string DB::Properties::kCurSizeAllMemTables =
+    rocksdb_prefix + cur_size_all_mem_tables;
+const std::string DB::Properties::kSizeAllMemTables =
+    rocksdb_prefix + size_all_mem_tables;
+const std::string DB::Properties::kNumEntriesActiveMemTable =
+    rocksdb_prefix + num_entries_active_mem_table;
+const std::string DB::Properties::kNumEntriesImmMemTables =
+    rocksdb_prefix + num_entries_imm_mem_tables;
+const std::string DB::Properties::kNumDeletesActiveMemTable =
+    rocksdb_prefix + num_deletes_active_mem_table;
+const std::string DB::Properties::kNumDeletesImmMemTables =
+    rocksdb_prefix + num_deletes_imm_mem_tables;
+const std::string DB::Properties::kEstimateNumKeys =
+    rocksdb_prefix + estimate_num_keys;
+const std::string DB::Properties::kEstimateTableReadersMem =
+    rocksdb_prefix + estimate_table_readers_mem;
+const std::string DB::Properties::kIsFileDeletionsEnabled =
+    rocksdb_prefix + is_file_deletions_enabled;
+const std::string DB::Properties::kNumSnapshots =
+    rocksdb_prefix + num_snapshots;
+const std::string DB::Properties::kOldestSnapshotTime =
+    rocksdb_prefix + oldest_snapshot_time;
+const std::string DB::Properties::kOldestSnapshotSequence =
+    rocksdb_prefix + oldest_snapshot_sequence;
+const std::string DB::Properties::kNumLiveVersions =
+    rocksdb_prefix + num_live_versions;
+const std::string DB::Properties::kCurrentSuperVersionNumber =
+    rocksdb_prefix + current_version_number;
+const std::string DB::Properties::kEstimateLiveDataSize =
+    rocksdb_prefix + estimate_live_data_size;
+const std::string DB::Properties::kMinLogNumberToKeep =
+    rocksdb_prefix + min_log_number_to_keep_str;
+const std::string DB::Properties::kMinObsoleteSstNumberToKeep =
+    rocksdb_prefix + min_obsolete_sst_number_to_keep_str;
+const std::string DB::Properties::kTotalSstFilesSize =
+    rocksdb_prefix + total_sst_files_size;
+const std::string DB::Properties::kLiveSstFilesSize =
+    rocksdb_prefix + live_sst_files_size;
+const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str;
+const std::string DB::Properties::kEstimatePendingCompactionBytes =
+    rocksdb_prefix + estimate_pending_comp_bytes;
+const std::string DB::Properties::kAggregatedTableProperties =
+    rocksdb_prefix + aggregated_table_properties;
+const std::string DB::Properties::kAggregatedTablePropertiesAtLevel =
+    rocksdb_prefix + aggregated_table_properties_at_level;
+const std::string DB::Properties::kActualDelayedWriteRate =
+    rocksdb_prefix + actual_delayed_write_rate;
+const std::string DB::Properties::kIsWriteStopped =
+    rocksdb_prefix + is_write_stopped;
+const std::string DB::Properties::kEstimateOldestKeyTime =
+    rocksdb_prefix + estimate_oldest_key_time;
+const std::string DB::Properties::kBlockCacheCapacity =
+    rocksdb_prefix + block_cache_capacity;
+const std::string DB::Properties::kBlockCacheUsage =
+    rocksdb_prefix + block_cache_usage;
+const std::string DB::Properties::kBlockCachePinnedUsage =
+    rocksdb_prefix + block_cache_pinned_usage;
+const std::string DB::Properties::kOptionsStatistics =
+    rocksdb_prefix + options_statistics;
+const std::string DB::Properties::kLiveSstFilesSizeAtTemperature =
+    rocksdb_prefix + live_sst_files_size_at_temperature;
+const std::string DB::Properties::kNumBlobFiles =
+    rocksdb_prefix + num_blob_files;
+const std::string DB::Properties::kBlobStats = rocksdb_prefix + blob_stats;
+const std::string DB::Properties::kTotalBlobFileSize =
+    rocksdb_prefix + total_blob_file_size;
+const std::string DB::Properties::kLiveBlobFileSize =
+    rocksdb_prefix + live_blob_file_size;
+const std::string DB::Properties::kLiveBlobFileGarbageSize =
+    rocksdb_prefix + live_blob_file_garbage_size;
+const std::string DB::Properties::kBlobCacheCapacity =
+    rocksdb_prefix + blob_cache_capacity;
+const std::string DB::Properties::kBlobCacheUsage =
+    rocksdb_prefix + blob_cache_usage;
+const std::string DB::Properties::kBlobCachePinnedUsage =
+    rocksdb_prefix + blob_cache_pinned_usage;
+
+const std::string InternalStats::kPeriodicCFStats =
+    DB::Properties::kCFStats + ".periodic";
+const int InternalStats::kMaxNoChangePeriodSinceDump = 8;
+
+const UnorderedMap<std::string, DBPropertyInfo>
+    InternalStats::ppt_name_to_info = {
+        {DB::Properties::kNumFilesAtLevelPrefix,
+         {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr,
+          nullptr}},
+        {DB::Properties::kCompressionRatioAtLevelPrefix,
+         {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr,
+          nullptr, nullptr}},
+        {DB::Properties::kLevelStats,
+         {false, &InternalStats::HandleLevelStats, nullptr, nullptr, nullptr}},
+        {DB::Properties::kStats,
+         {false, &InternalStats::HandleStats, nullptr, nullptr, nullptr}},
+        {DB::Properties::kCFStats,
+         {false, &InternalStats::HandleCFStats, nullptr,
+          &InternalStats::HandleCFMapStats, nullptr}},
+        {InternalStats::kPeriodicCFStats,
+         {false, &InternalStats::HandleCFStatsPeriodic, nullptr, nullptr,
+          nullptr}},
+        {DB::Properties::kCFStatsNoFileHistogram,
+         {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr,
+          nullptr}},
+        {DB::Properties::kCFFileHistogram,
+         {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr,
+          nullptr}},
+        {DB::Properties::kDBStats,
+         {false, &InternalStats::HandleDBStats, nullptr,
+          &InternalStats::HandleDBMapStats, nullptr}},
+        {DB::Properties::kBlockCacheEntryStats,
+         {true, &InternalStats::HandleBlockCacheEntryStats, nullptr,
+          &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}},
+        {DB::Properties::kFastBlockCacheEntryStats,
+         {true, &InternalStats::HandleFastBlockCacheEntryStats, nullptr,
+          &InternalStats::HandleFastBlockCacheEntryStatsMap, nullptr}},
+        {DB::Properties::kSSTables,
+         {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}},
+        {DB::Properties::kAggregatedTableProperties,
+         {false, &InternalStats::HandleAggregatedTableProperties, nullptr,
+          &InternalStats::HandleAggregatedTablePropertiesMap, nullptr}},
+        {DB::Properties::kAggregatedTablePropertiesAtLevel,
+         {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel,
+          nullptr, &InternalStats::HandleAggregatedTablePropertiesAtLevelMap,
+          nullptr}},
+        {DB::Properties::kNumImmutableMemTable,
+         {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr,
+          nullptr}},
+        {DB::Properties::kNumImmutableMemTableFlushed,
+         {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed,
+          nullptr, nullptr}},
+        {DB::Properties::kMemTableFlushPending,
+         {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr,
+          nullptr}},
+        {DB::Properties::kCompactionPending,
+         {false, nullptr, &InternalStats::HandleCompactionPending, nullptr,
+          nullptr}},
+        {DB::Properties::kBackgroundErrors,
+         {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr,
+          nullptr}},
+        {DB::Properties::kCurSizeActiveMemTable,
+         {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, nullptr,
+          nullptr}},
+        {DB::Properties::kCurSizeAllMemTables,
+         {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr,
+          nullptr}},
+        {DB::Properties::kSizeAllMemTables,
+         {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr,
+          nullptr}},
+        {DB::Properties::kNumEntriesActiveMemTable,
+         {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable,
+          nullptr, nullptr}},
+        {DB::Properties::kNumEntriesImmMemTables,
+         {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, nullptr,
+          nullptr}},
+        {DB::Properties::kNumDeletesActiveMemTable,
+         {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable,
+          nullptr, nullptr}},
+        {DB::Properties::kNumDeletesImmMemTables,
+         {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, nullptr,
+          nullptr}},
+        {DB::Properties::kEstimateNumKeys,
+         {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr,
+          nullptr}},
+        {DB::Properties::kEstimateTableReadersMem,
+         {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, nullptr,
+          nullptr}},
+        {DB::Properties::kIsFileDeletionsEnabled,
+         {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, nullptr,
+          nullptr}},
+        {DB::Properties::kNumSnapshots,
+         {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr,
+          nullptr}},
+        {DB::Properties::kOldestSnapshotTime,
+         {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr,
+          nullptr}},
+        {DB::Properties::kOldestSnapshotSequence,
+         {false, nullptr, &InternalStats::HandleOldestSnapshotSequence, nullptr,
+          nullptr}},
+        {DB::Properties::kNumLiveVersions,
+         {false, nullptr, &InternalStats::HandleNumLiveVersions, nullptr,
+          nullptr}},
+        {DB::Properties::kCurrentSuperVersionNumber,
+         {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber,
+          nullptr, nullptr}},
+        {DB::Properties::kEstimateLiveDataSize,
+         {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr,
+          nullptr}},
+        {DB::Properties::kMinLogNumberToKeep,
+         {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr,
+          nullptr}},
+        {DB::Properties::kMinObsoleteSstNumberToKeep,
+         {false, nullptr, &InternalStats::HandleMinObsoleteSstNumberToKeep,
+          nullptr, nullptr}},
+        {DB::Properties::kBaseLevel,
+         {false, nullptr, &InternalStats::HandleBaseLevel, nullptr, nullptr}},
+        {DB::Properties::kTotalSstFilesSize,
+         {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr,
+          nullptr}},
+        {DB::Properties::kLiveSstFilesSize,
+         {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr,
+          nullptr}},
+        {DB::Properties::kLiveSstFilesSizeAtTemperature,
+         {false, &InternalStats::HandleLiveSstFilesSizeAtTemperature, nullptr,
+          nullptr, nullptr}},
+        {DB::Properties::kEstimatePendingCompactionBytes,
+         {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes,
+          nullptr, nullptr}},
+        {DB::Properties::kNumRunningFlushes,
+         {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr,
+          nullptr}},
+        {DB::Properties::kNumRunningCompactions,
+         {false, nullptr, &InternalStats::HandleNumRunningCompactions, nullptr,
+          nullptr}},
+        {DB::Properties::kActualDelayedWriteRate,
+         {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr,
+          nullptr}},
+        {DB::Properties::kIsWriteStopped,
+         {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr,
+          nullptr}},
+        {DB::Properties::kEstimateOldestKeyTime,
+         {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, nullptr,
+          nullptr}},
+        {DB::Properties::kBlockCacheCapacity,
+         {false, nullptr, &InternalStats::HandleBlockCacheCapacity, nullptr,
+          nullptr}},
+        {DB::Properties::kBlockCacheUsage,
+         {false, nullptr, &InternalStats::HandleBlockCacheUsage, nullptr,
+          nullptr}},
+        {DB::Properties::kBlockCachePinnedUsage,
+         {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr,
+          nullptr}},
+        {DB::Properties::kOptionsStatistics,
+         {true, nullptr, nullptr, nullptr,
+          &DBImpl::GetPropertyHandleOptionsStatistics}},
+        {DB::Properties::kNumBlobFiles,
+         {false, nullptr, &InternalStats::HandleNumBlobFiles, nullptr,
+          nullptr}},
+        {DB::Properties::kBlobStats,
+         {false, &InternalStats::HandleBlobStats, nullptr, nullptr, nullptr}},
+        {DB::Properties::kTotalBlobFileSize,
+         {false, nullptr, &InternalStats::HandleTotalBlobFileSize, nullptr,
+          nullptr}},
+        {DB::Properties::kLiveBlobFileSize,
+         {false, nullptr, &InternalStats::HandleLiveBlobFileSize, nullptr,
+          nullptr}},
+        {DB::Properties::kLiveBlobFileGarbageSize,
+         {false, nullptr, &InternalStats::HandleLiveBlobFileGarbageSize,
+          nullptr, nullptr}},
+        {DB::Properties::kBlobCacheCapacity,
+         {false, nullptr, &InternalStats::HandleBlobCacheCapacity, nullptr,
+          nullptr}},
+        {DB::Properties::kBlobCacheUsage,
+         {false, nullptr, &InternalStats::HandleBlobCacheUsage, nullptr,
+          nullptr}},
+        {DB::Properties::kBlobCachePinnedUsage,
+         {false, nullptr, &InternalStats::HandleBlobCachePinnedUsage, nullptr,
+          nullptr}},
+};
+
+InternalStats::InternalStats(int num_levels, SystemClock* clock,
+                             ColumnFamilyData* cfd)
+    : db_stats_{},
+      cf_stats_value_{},
+      cf_stats_count_{},
+      comp_stats_(num_levels),
+      comp_stats_by_pri_(Env::Priority::TOTAL),
+      file_read_latency_(num_levels),
+      has_cf_change_since_dump_(true),
+      bg_error_count_(0),
+      number_levels_(num_levels),
+      clock_(clock),
+      cfd_(cfd),
+      started_at_(clock->NowMicros()) {
+  Cache* block_cache = GetBlockCacheForStats();
+  if (block_cache) {
+    // Extract or create stats collector. Could fail in rare cases.
+    Status s = CacheEntryStatsCollector<CacheEntryRoleStats>::GetShared(
+        block_cache, clock_, &cache_entry_stats_collector_);
+    if (s.ok()) {
+      assert(cache_entry_stats_collector_);
+    } else {
+      assert(!cache_entry_stats_collector_);
+    }
+  }
+}
+
+void InternalStats::TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats,
+                                                bool foreground) {
+  CollectCacheEntryStats(foreground);
+  if (cache_entry_stats_collector_) {
+    cache_entry_stats_collector_->GetStats(stats);
+  }
+}
+
+void InternalStats::CollectCacheEntryStats(bool foreground) {
+  // This function is safe to call from any thread because
+  // cache_entry_stats_collector_ field is const after constructor
+  // and ->GetStats does its own synchronization, which also suffices for
+  // cache_entry_stats_.
+
+  if (!cache_entry_stats_collector_) {
+    return;  // nothing to do (e.g. no block cache)
+  }
+
+  // For "background" collections, strictly cap the collection time by
+  // expanding effective cache TTL. For foreground, be more aggressive about
+  // getting latest data.
+  int min_interval_seconds = foreground ? 10 : 180;
+  // 1/500 = max of 0.2% of one CPU thread
+  int min_interval_factor = foreground ? 10 : 500;
+  cache_entry_stats_collector_->CollectStats(min_interval_seconds,
+                                             min_interval_factor);
+}
+
+std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+InternalStats::CacheEntryRoleStats::GetEntryCallback() {
+  return [&](const Slice& /*key*/, void* /*value*/, size_t charge,
+             Cache::DeleterFn deleter) {
+    auto e = role_map_.find(deleter);
+    size_t role_idx;
+    if (e == role_map_.end()) {
+      role_idx = static_cast<size_t>(CacheEntryRole::kMisc);
+    } else {
+      role_idx = static_cast<size_t>(e->second);
+    }
+    entry_counts[role_idx]++;
+    total_charges[role_idx] += charge;
+  };
+}
+
+void InternalStats::CacheEntryRoleStats::BeginCollection(
+    Cache* cache, SystemClock*, uint64_t start_time_micros) {
+  Clear();
+  last_start_time_micros_ = start_time_micros;
+  ++collection_count;
+  role_map_ = CopyCacheDeleterRoleMap();
+  std::ostringstream str;
+  str << cache->Name() << "@" << static_cast<void*>(cache) << "#"
+      << port::GetProcessID();
+  cache_id = str.str();
+  cache_capacity = cache->GetCapacity();
+  cache_usage = cache->GetUsage();
+  table_size = cache->GetTableAddressCount();
+  occupancy = cache->GetOccupancyCount();
+}
+
+void InternalStats::CacheEntryRoleStats::EndCollection(
+    Cache*, SystemClock*, uint64_t end_time_micros) {
+  last_end_time_micros_ = end_time_micros;
+}
+
+void InternalStats::CacheEntryRoleStats::SkippedCollection() {
+  ++copies_of_last_collection;
+}
+
+uint64_t InternalStats::CacheEntryRoleStats::GetLastDurationMicros() const {
+  if (last_end_time_micros_ > last_start_time_micros_) {
+    return last_end_time_micros_ - last_start_time_micros_;
+  } else {
+    return 0U;
+  }
+}
+
+std::string InternalStats::CacheEntryRoleStats::ToString(
+    SystemClock* clock) const {
+  std::ostringstream str;
+  str << "Block cache " << cache_id
+      << " capacity: " << BytesToHumanString(cache_capacity)
+      << " usage: " << BytesToHumanString(cache_usage)
+      << " table_size: " << table_size << " occupancy: " << occupancy
+      << " collections: " << collection_count
+      << " last_copies: " << copies_of_last_collection
+      << " last_secs: " << (GetLastDurationMicros() / 1000000.0)
+      << " secs_since: "
+      << ((clock->NowMicros() - last_end_time_micros_) / 1000000U) << "\n";
+  str << "Block cache entry stats(count,size,portion):";
+  for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+    if (entry_counts[i] > 0) {
+      str << " " << kCacheEntryRoleToCamelString[i] << "(" << entry_counts[i]
+          << "," << BytesToHumanString(total_charges[i]) << ","
+          << (100.0 * total_charges[i] / cache_capacity) << "%)";
+    }
+  }
+  str << "\n";
+  return str.str();
+}
+
+void InternalStats::CacheEntryRoleStats::ToMap(
+    std::map<std::string, std::string>* values, SystemClock* clock) const {
+  values->clear();
+  auto& v = *values;
+  v[BlockCacheEntryStatsMapKeys::CacheId()] = cache_id;
+  v[BlockCacheEntryStatsMapKeys::CacheCapacityBytes()] =
+      std::to_string(cache_capacity);
+  v[BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds()] =
+      std::to_string(GetLastDurationMicros() / 1000000.0);
+  v[BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds()] =
+      std::to_string((clock->NowMicros() - last_end_time_micros_) / 1000000U);
+  for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+    auto role = static_cast<CacheEntryRole>(i);
+    v[BlockCacheEntryStatsMapKeys::EntryCount(role)] =
+        std::to_string(entry_counts[i]);
+    v[BlockCacheEntryStatsMapKeys::UsedBytes(role)] =
+        std::to_string(total_charges[i]);
+    v[BlockCacheEntryStatsMapKeys::UsedPercent(role)] =
+        std::to_string(100.0 * total_charges[i] / cache_capacity);
+  }
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsInternal(std::string* value,
+                                                       bool fast) {
+  if (!cache_entry_stats_collector_) {
+    return false;
+  }
+  CollectCacheEntryStats(!fast /* foreground */);
+  CacheEntryRoleStats stats;
+  cache_entry_stats_collector_->GetStats(&stats);
+  *value = stats.ToString(clock_);
+  return true;
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsMapInternal(
+    std::map<std::string, std::string>* values, bool fast) {
+  if (!cache_entry_stats_collector_) {
+    return false;
+  }
+  CollectCacheEntryStats(!fast /* foreground */);
+  CacheEntryRoleStats stats;
+  cache_entry_stats_collector_->GetStats(&stats);
+  stats.ToMap(values, clock_);
+  return true;
+}
+
+bool InternalStats::HandleBlockCacheEntryStats(std::string* value,
+                                               Slice /*suffix*/) {
+  return HandleBlockCacheEntryStatsInternal(value, false /* fast */);
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsMap(
+    std::map<std::string, std::string>* values, Slice /*suffix*/) {
+  return HandleBlockCacheEntryStatsMapInternal(values, false /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheEntryStats(std::string* value,
+                                                   Slice /*suffix*/) {
+  return HandleBlockCacheEntryStatsInternal(value, true /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheEntryStatsMap(
+    std::map<std::string, std::string>* values, Slice /*suffix*/) {
+  return HandleBlockCacheEntryStatsMapInternal(values, true /* fast */);
+}
+
+bool InternalStats::HandleLiveSstFilesSizeAtTemperature(std::string* value,
+                                                        Slice suffix) {
+  uint64_t temperature;
+  bool ok = ConsumeDecimalNumber(&suffix, &temperature) && suffix.empty();
+  if (!ok) {
+    return false;
+  }
+
+  uint64_t size = 0;
+  const auto* vstorage = cfd_->current()->storage_info();
+  for (int level = 0; level < vstorage->num_levels(); level++) {
+    for (const auto& file_meta : vstorage->LevelFiles(level)) {
+      if (static_cast<uint8_t>(file_meta->temperature) == temperature) {
+        size += file_meta->fd.GetFileSize();
+      }
+    }
+  }
+
+  *value = std::to_string(size);
+  return true;
+}
+
+bool InternalStats::HandleNumBlobFiles(uint64_t* value, DBImpl* /*db*/,
+                                       Version* /*version*/) {
+  assert(value);
+  assert(cfd_);
+
+  const auto* current = cfd_->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  const auto& blob_files = vstorage->GetBlobFiles();
+
+  *value = blob_files.size();
+
+  return true;
+}
+
+bool InternalStats::HandleBlobStats(std::string* value, Slice /*suffix*/) {
+  assert(value);
+  assert(cfd_);
+
+  const auto* current = cfd_->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  const auto blob_st = vstorage->GetBlobStats();
+
+  std::ostringstream oss;
+
+  oss << "Number of blob files: " << vstorage->GetBlobFiles().size()
+      << "\nTotal size of blob files: " << blob_st.total_file_size
+      << "\nTotal size of garbage in blob files: " << blob_st.total_garbage_size
+      << "\nBlob file space amplification: " << blob_st.space_amp << '\n';
+
+  value->append(oss.str());
+
+  return true;
+}
+
+bool InternalStats::HandleTotalBlobFileSize(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  assert(value);
+  assert(cfd_);
+
+  *value = cfd_->GetTotalBlobFileSize();
+
+  return true;
+}
+
+bool InternalStats::HandleLiveBlobFileSize(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  assert(value);
+  assert(cfd_);
+
+  const auto* current = cfd_->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  *value = vstorage->GetBlobStats().total_file_size;
+
+  return true;
+}
+
+bool InternalStats::HandleLiveBlobFileGarbageSize(uint64_t* value,
+                                                  DBImpl* /*db*/,
+                                                  Version* /*version*/) {
+  assert(value);
+  assert(cfd_);
+
+  const auto* current = cfd_->current();
+  assert(current);
+
+  const auto* vstorage = current->storage_info();
+  assert(vstorage);
+
+  *value = vstorage->GetBlobStats().total_garbage_size;
+
+  return true;
+}
+
+Cache* InternalStats::GetBlobCacheForStats() {
+  return cfd_->ioptions()->blob_cache.get();
+}
+
+bool InternalStats::HandleBlobCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  Cache* blob_cache = GetBlobCacheForStats();
+  if (blob_cache) {
+    *value = static_cast<uint64_t>(blob_cache->GetCapacity());
+    return true;
+  }
+  return false;
+}
+
+bool InternalStats::HandleBlobCacheUsage(uint64_t* value, DBImpl* /*db*/,
+                                         Version* /*version*/) {
+  Cache* blob_cache = GetBlobCacheForStats();
+  if (blob_cache) {
+    *value = static_cast<uint64_t>(blob_cache->GetUsage());
+    return true;
+  }
+  return false;
+}
+
+bool InternalStats::HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+                                               Version* /*version*/) {
+  Cache* blob_cache = GetBlobCacheForStats();
+  if (blob_cache) {
+    *value = static_cast<uint64_t>(blob_cache->GetPinnedUsage());
+    return true;
+  }
+  return false;
+}
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& property) {
+  std::string ppt_name = GetPropertyNameAndArg(property).first.ToString();
+  auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name);
+  if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) {
+    return nullptr;
+  }
+  return &ppt_info_iter->second;
+}
+
+bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info,
+                                      const Slice& property,
+                                      std::string* value) {
+  assert(value != nullptr);
+  assert(property_info.handle_string != nullptr);
+  Slice arg = GetPropertyNameAndArg(property).second;
+  return (this->*(property_info.handle_string))(value, arg);
+}
+
+bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info,
+                                   const Slice& property,
+                                   std::map<std::string, std::string>* value) {
+  assert(value != nullptr);
+  assert(property_info.handle_map != nullptr);
+  Slice arg = GetPropertyNameAndArg(property).second;
+  return (this->*(property_info.handle_map))(value, arg);
+}
+
+bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info,
+                                   uint64_t* value, DBImpl* db) {
+  assert(value != nullptr);
+  assert(property_info.handle_int != nullptr &&
+         !property_info.need_out_of_mutex);
+  db->mutex_.AssertHeld();
+  return (this->*(property_info.handle_int))(value, db, nullptr /* version */);
+}
+
+bool InternalStats::GetIntPropertyOutOfMutex(
+    const DBPropertyInfo& property_info, Version* version, uint64_t* value) {
+  assert(value != nullptr);
+  assert(property_info.handle_int != nullptr &&
+         property_info.need_out_of_mutex);
+  return (this->*(property_info.handle_int))(value, nullptr /* db */, version);
+}
+
+bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) {
+  uint64_t level;
+  const auto* vstorage = cfd_->current()->storage_info();
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || static_cast<int>(level) >= number_levels_) {
+    return false;
+  } else {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%d",
+             vstorage->NumLevelFiles(static_cast<int>(level)));
+    *value = buf;
+    return true;
+  }
+}
+
+bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value,
+                                                        Slice suffix) {
+  uint64_t level;
+  const auto* vstorage = cfd_->current()->storage_info();
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || level >= static_cast<uint64_t>(number_levels_)) {
+    return false;
+  }
+  *value = std::to_string(
+      vstorage->GetEstimatedCompressionRatioAtLevel(static_cast<int>(level)));
+  return true;
+}
+
+bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) {
+  char buf[1000];
+  const auto* vstorage = cfd_->current()->storage_info();
+  snprintf(buf, sizeof(buf),
+           "Level Files Size(MB)\n"
+           "--------------------\n");
+  value->append(buf);
+
+  for (int level = 0; level < number_levels_; level++) {
+    snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
+             vstorage->NumLevelFiles(level),
+             vstorage->NumLevelBytes(level) / kMB);
+    value->append(buf);
+  }
+  return true;
+}
+
+bool InternalStats::HandleStats(std::string* value, Slice suffix) {
+  if (!HandleCFStats(value, suffix)) {
+    return false;
+  }
+  if (!HandleDBStats(value, suffix)) {
+    return false;
+  }
+  return true;
+}
+
+bool InternalStats::HandleCFMapStats(
+    std::map<std::string, std::string>* cf_stats, Slice /*suffix*/) {
+  DumpCFMapStats(cf_stats);
+  return true;
+}
+
+bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) {
+  DumpCFStats(value);
+  return true;
+}
+
+bool InternalStats::HandleCFStatsPeriodic(std::string* value,
+                                          Slice /*suffix*/) {
+  bool has_change = has_cf_change_since_dump_;
+  if (!has_change) {
+    // If file histogram changes, there is activity in this period too.
+    uint64_t new_histogram_num = 0;
+    for (int level = 0; level < number_levels_; level++) {
+      new_histogram_num += file_read_latency_[level].num();
+    }
+    new_histogram_num += blob_file_read_latency_.num();
+    if (new_histogram_num != last_histogram_num) {
+      has_change = true;
+      last_histogram_num = new_histogram_num;
+    }
+  }
+  if (has_change) {
+    no_cf_change_period_since_dump_ = 0;
+    has_cf_change_since_dump_ = false;
+  } else if (no_cf_change_period_since_dump_++ > 0) {
+    // Not ready to sync
+    if (no_cf_change_period_since_dump_ == kMaxNoChangePeriodSinceDump) {
+      // Next periodic, we need to dump stats even if there is no change.
+      no_cf_change_period_since_dump_ = 0;
+    }
+    return true;
+  }
+
+  DumpCFStatsNoFileHistogram(/*is_periodic=*/true, value);
+  DumpCFFileHistogram(value);
+  return true;
+}
+
+bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value,
+                                                 Slice /*suffix*/) {
+  DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value);
+  return true;
+}
+
+bool InternalStats::HandleCFFileHistogram(std::string* value,
+                                          Slice /*suffix*/) {
+  DumpCFFileHistogram(value);
+  return true;
+}
+
+bool InternalStats::HandleDBMapStats(
+    std::map<std::string, std::string>* db_stats, Slice /*suffix*/) {
+  DumpDBMapStats(db_stats);
+  return true;
+}
+
+bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) {
+  DumpDBStats(value);
+  return true;
+}
+
+bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) {
+  auto* current = cfd_->current();
+  *value = current->DebugString(true, true);
+  return true;
+}
+
+bool InternalStats::HandleAggregatedTableProperties(std::string* value,
+                                                    Slice /*suffix*/) {
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+  if (!s.ok()) {
+    return false;
+  }
+  *value = tp->ToString();
+  return true;
+}
+
+static std::map<std::string, std::string> MapUint64ValuesToString(
+    const std::map<std::string, uint64_t>& from) {
+  std::map<std::string, std::string> to;
+  for (const auto& e : from) {
+    to[e.first] = std::to_string(e.second);
+  }
+  return to;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesMap(
+    std::map<std::string, std::string>* values, Slice /*suffix*/) {
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+  if (!s.ok()) {
+    return false;
+  }
+  *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap());
+  return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values,
+                                                           Slice suffix) {
+  uint64_t level;
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || static_cast<int>(level) >= number_levels_) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(
+      &tp, static_cast<int>(level));
+  if (!s.ok()) {
+    return false;
+  }
+  *values = tp->ToString();
+  return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap(
+    std::map<std::string, std::string>* values, Slice suffix) {
+  uint64_t level;
+  bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+  if (!ok || static_cast<int>(level) >= number_levels_) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  auto s = cfd_->current()->GetAggregatedTableProperties(
+      &tp, static_cast<int>(level));
+  if (!s.ok()) {
+    return false;
+  }
+  *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap());
+  return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/,
+                                               Version* /*version*/) {
+  *value = cfd_->imm()->NumNotFlushed();
+  return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value,
+                                                      DBImpl* /*db*/,
+                                                      Version* /*version*/) {
+  *value = cfd_->imm()->NumFlushed();
+  return true;
+}
+
+bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/,
+                                               Version* /*version*/) {
+  *value = (cfd_->imm()->IsFlushPending() ? 1 : 0);
+  return true;
+}
+
+bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db,
+                                            Version* /*version*/) {
+  *value = db->num_running_flushes();
+  return true;
+}
+
+bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  // 1 if the system already determines at least one compaction is needed.
+  // 0 otherwise,
+  const auto* vstorage = cfd_->current()->storage_info();
+  *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
+  return true;
+}
+
+bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+                                                Version* /*version*/) {
+  *value = db->num_running_compactions_;
+  return true;
+}
+
+bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  // Accumulated number of  errors in background flushes or compactions.
+  *value = GetBackgroundErrorCount();
+  return true;
+}
+
+bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/,
+                                                Version* /*version*/) {
+  // Current size of the active memtable
+  // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+  *value = cfd_->mem()->ApproximateMemoryUsageFast();
+  return true;
+}
+
+bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+                                              Version* /*version*/) {
+  // Current size of the active memtable + immutable memtables
+  // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+  *value = cfd_->mem()->ApproximateMemoryUsageFast() +
+           cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage();
+  return true;
+}
+
+bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+  *value = cfd_->mem()->ApproximateMemoryUsageFast() +
+           cfd_->imm()->ApproximateMemoryUsage();
+  return true;
+}
+
+bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value,
+                                                   DBImpl* /*db*/,
+                                                   Version* /*version*/) {
+  // Current number of entires in the active memtable
+  *value = cfd_->mem()->num_entries();
+  return true;
+}
+
+bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value,
+                                                 DBImpl* /*db*/,
+                                                 Version* /*version*/) {
+  // Current number of entries in the immutable memtables
+  *value = cfd_->imm()->current()->GetTotalNumEntries();
+  return true;
+}
+
+bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value,
+                                                   DBImpl* /*db*/,
+                                                   Version* /*version*/) {
+  // Current number of entires in the active memtable
+  *value = cfd_->mem()->num_deletes();
+  return true;
+}
+
+bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value,
+                                                 DBImpl* /*db*/,
+                                                 Version* /*version*/) {
+  // Current number of entries in the immutable memtables
+  *value = cfd_->imm()->current()->GetTotalNumDeletes();
+  return true;
+}
+
+bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/,
+                                          Version* /*version*/) {
+  // Estimate number of entries in the column family:
+  // Use estimated entries in tables + total entries in memtables.
+  const auto* vstorage = cfd_->current()->storage_info();
+  uint64_t estimate_keys = cfd_->mem()->num_entries() +
+                           cfd_->imm()->current()->GetTotalNumEntries() +
+                           vstorage->GetEstimatedActiveKeys();
+  uint64_t estimate_deletes =
+      cfd_->mem()->num_deletes() + cfd_->imm()->current()->GetTotalNumDeletes();
+  *value = estimate_keys > estimate_deletes * 2
+               ? estimate_keys - (estimate_deletes * 2)
+               : 0;
+  return true;
+}
+
+bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db,
+                                       Version* /*version*/) {
+  *value = db->snapshots().count();
+  return true;
+}
+
+bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db,
+                                             Version* /*version*/) {
+  *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
+  return true;
+}
+
+bool InternalStats::HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+                                                 Version* /*version*/) {
+  *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotSequence());
+  return true;
+}
+
+bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/,
+                                          Version* /*version*/) {
+  *value = cfd_->GetNumLiveVersions();
+  return true;
+}
+
+bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value,
+                                                    DBImpl* /*db*/,
+                                                    Version* /*version*/) {
+  *value = cfd_->GetSuperVersionNumber();
+  return true;
+}
+
+bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+                                                 Version* /*version*/) {
+  *value = db->IsFileDeletionsEnabled() ? 1 : 0;
+  return true;
+}
+
+bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/,
+                                    Version* /*version*/) {
+  const auto* vstorage = cfd_->current()->storage_info();
+  *value = vstorage->base_level();
+  return true;
+}
+
+bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+                                            Version* /*version*/) {
+  *value = cfd_->GetTotalSstFilesSize();
+  return true;
+}
+
+bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+                                           Version* /*version*/) {
+  *value = cfd_->GetLiveSstFilesSize();
+  return true;
+}
+
+bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value,
+                                                         DBImpl* /*db*/,
+                                                         Version* /*version*/) {
+  const auto* vstorage = cfd_->current()->storage_info();
+  *value = vstorage->estimated_compaction_needed_bytes();
+  return true;
+}
+
+bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value,
+                                                  DBImpl* /*db*/,
+                                                  Version* version) {
+  *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders();
+  return true;
+}
+
+bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/,
+                                               Version* version) {
+  const auto* vstorage = version->storage_info();
+  *value = vstorage->EstimateLiveDataSize();
+  return true;
+}
+
+bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db,
+                                             Version* /*version*/) {
+  *value = db->MinLogNumberToKeep();
+  return true;
+}
+
+bool InternalStats::HandleMinObsoleteSstNumberToKeep(uint64_t* value,
+                                                     DBImpl* db,
+                                                     Version* /*version*/) {
+  *value = db->MinObsoleteSstNumberToKeep();
+  return true;
+}
+
+bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+                                                 Version* /*version*/) {
+  const WriteController& wc = db->write_controller();
+  if (!wc.NeedsDelay()) {
+    *value = 0;
+  } else {
+    *value = wc.delayed_write_rate();
+  }
+  return true;
+}
+
+bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db,
+                                         Version* /*version*/) {
+  *value = db->write_controller().IsStopped() ? 1 : 0;
+  return true;
+}
+
+bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/,
+                                                Version* /*version*/) {
+  // TODO(yiwu): The property is currently available for fifo compaction
+  // with allow_compaction = false. This is because we don't propagate
+  // oldest_key_time on compaction.
+  if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO ||
+      cfd_->GetCurrentMutableCFOptions()
+          ->compaction_options_fifo.allow_compaction) {
+    return false;
+  }
+
+  TablePropertiesCollection collection;
+  auto s = cfd_->current()->GetPropertiesOfAllTables(&collection);
+  if (!s.ok()) {
+    return false;
+  }
+  *value = std::numeric_limits<uint64_t>::max();
+  for (auto& p : collection) {
+    *value = std::min(*value, p.second->oldest_key_time);
+    if (*value == 0) {
+      break;
+    }
+  }
+  if (*value > 0) {
+    *value = std::min({cfd_->mem()->ApproximateOldestKeyTime(),
+                       cfd_->imm()->ApproximateOldestKeyTime(), *value});
+  }
+  return *value > 0 && *value < std::numeric_limits<uint64_t>::max();
+}
+
+Cache* InternalStats::GetBlockCacheForStats() {
+  auto* table_factory = cfd_->ioptions()->table_factory.get();
+  assert(table_factory != nullptr);
+  return table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+}
+
+bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+                                             Version* /*version*/) {
+  Cache* block_cache = GetBlockCacheForStats();
+  if (block_cache) {
+    *value = static_cast<uint64_t>(block_cache->GetCapacity());
+    return true;
+  }
+  return false;
+}
+
+bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/,
+                                          Version* /*version*/) {
+  Cache* block_cache = GetBlockCacheForStats();
+  if (block_cache) {
+    *value = static_cast<uint64_t>(block_cache->GetUsage());
+    return true;
+  }
+  return false;
+}
+
+bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+                                                Version* /*version*/) {
+  Cache* block_cache = GetBlockCacheForStats();
+  if (block_cache) {
+    *value = static_cast<uint64_t>(block_cache->GetPinnedUsage());
+    return true;
+  }
+  return false;
+}
+
+void InternalStats::DumpDBMapStats(
+    std::map<std::string, std::string>* db_stats) {
+  for (int i = 0; i < static_cast<int>(kIntStatsNumMax); ++i) {
+    InternalDBStatsType type = static_cast<InternalDBStatsType>(i);
+    (*db_stats)[db_stats_type_to_info.at(type).property_name] =
+        std::to_string(GetDBStats(type));
+  }
+  double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec;
+  (*db_stats)["db.uptime"] = std::to_string(seconds_up);
+}
+
+void InternalStats::DumpDBStats(std::string* value) {
+  char buf[1000];
+  // DB-level stats, only available from default column family
+  double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec;
+  double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up;
+  snprintf(buf, sizeof(buf),
+           "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n",
+           seconds_up, interval_seconds_up);
+  value->append(buf);
+  // Cumulative
+  uint64_t user_bytes_written =
+      GetDBStats(InternalStats::kIntStatsBytesWritten);
+  uint64_t num_keys_written =
+      GetDBStats(InternalStats::kIntStatsNumKeysWritten);
+  uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther);
+  uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf);
+  uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes);
+  uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced);
+  uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal);
+  uint64_t write_stall_micros =
+      GetDBStats(InternalStats::kIntStatsWriteStallMicros);
+
+  const int kHumanMicrosLen = 32;
+  char human_micros[kHumanMicrosLen];
+
+  // Data
+  // writes: total number of write requests.
+  // keys: total number of key updates issued by all the write requests
+  // commit groups: number of group commits issued to the DB. Each group can
+  //                contain one or more writes.
+  // so writes/keys is the average number of put in multi-put or put
+  // writes/groups is the average group commit size.
+  //
+  // The format is the same for interval stats.
+  snprintf(buf, sizeof(buf),
+           "Cumulative writes: %s writes, %s keys, %s commit groups, "
+           "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_other + write_self).c_str(),
+           NumberToHumanString(num_keys_written).c_str(),
+           NumberToHumanString(write_self).c_str(),
+           (write_other + write_self) /
+               std::max(1.0, static_cast<double>(write_self)),
+           user_bytes_written / kGB,
+           user_bytes_written / kMB / std::max(seconds_up, 0.001));
+  value->append(buf);
+  // WAL
+  snprintf(buf, sizeof(buf),
+           "Cumulative WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(write_with_wal).c_str(),
+           NumberToHumanString(wal_synced).c_str(),
+           write_with_wal / std::max(1.0, static_cast<double>(wal_synced)),
+           wal_bytes / kGB, wal_bytes / kMB / std::max(seconds_up, 0.001));
+  value->append(buf);
+  // Stall
+  AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n",
+           human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
+  value->append(buf);
+
+  // Interval
+  uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
+  uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+  uint64_t interval_num_keys_written =
+      num_keys_written - db_stats_snapshot_.num_keys_written;
+  snprintf(
+      buf, sizeof(buf),
+      "Interval writes: %s writes, %s keys, %s commit groups, "
+      "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n",
+      NumberToHumanString(interval_write_other + interval_write_self).c_str(),
+      NumberToHumanString(interval_num_keys_written).c_str(),
+      NumberToHumanString(interval_write_self).c_str(),
+      static_cast<double>(interval_write_other + interval_write_self) /
+          std::max(1.0, static_cast<double>(interval_write_self)),
+      (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+      (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+          std::max(interval_seconds_up, 0.001)),
+      value->append(buf);
+
+  uint64_t interval_write_with_wal =
+      write_with_wal - db_stats_snapshot_.write_with_wal;
+  uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
+  uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
+
+  snprintf(buf, sizeof(buf),
+           "Interval WAL: %s writes, %s syncs, "
+           "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+           NumberToHumanString(interval_write_with_wal).c_str(),
+           NumberToHumanString(interval_wal_synced).c_str(),
+           interval_write_with_wal /
+               std::max(1.0, static_cast<double>(interval_wal_synced)),
+           interval_wal_bytes / kGB,
+           interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  // Stall
+  AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros,
+                    human_micros, kHumanMicrosLen, true);
+  snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros,
+           // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+           (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+               10000.0 / std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+  db_stats_snapshot_.seconds_up = seconds_up;
+  db_stats_snapshot_.ingest_bytes = user_bytes_written;
+  db_stats_snapshot_.write_other = write_other;
+  db_stats_snapshot_.write_self = write_self;
+  db_stats_snapshot_.num_keys_written = num_keys_written;
+  db_stats_snapshot_.wal_bytes = wal_bytes;
+  db_stats_snapshot_.wal_synced = wal_synced;
+  db_stats_snapshot_.write_with_wal = write_with_wal;
+  db_stats_snapshot_.write_stall_micros = write_stall_micros;
+}
+
+/**
+ * Dump Compaction Level stats to a map of stat name with "compaction." prefix
+ * to value in double as string. The level in stat name is represented with
+ * a prefix "Lx" where "x" is the level number. A special level "Sum"
+ * represents the sum of a stat for all levels.
+ * The result also contains IO stall counters which keys start with "io_stalls."
+ * and values represent uint64 encoded as strings.
+ */
+void InternalStats::DumpCFMapStats(
+    std::map<std::string, std::string>* cf_stats) {
+  const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+  CompactionStats compaction_stats_sum;
+  std::map<int, std::map<LevelStatType, double>> levels_stats;
+  DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum);
+  for (auto const& level_ent : levels_stats) {
+    auto level_str =
+        level_ent.first == -1 ? "Sum" : "L" + std::to_string(level_ent.first);
+    for (auto const& stat_ent : level_ent.second) {
+      auto stat_type = stat_ent.first;
+      auto key_str =
+          "compaction." + level_str + "." +
+          InternalStats::compaction_level_stats.at(stat_type).property_name;
+      (*cf_stats)[key_str] = std::to_string(stat_ent.second);
+    }
+  }
+
+  DumpCFMapStatsIOStalls(cf_stats);
+}
+
+void InternalStats::DumpCFMapStats(
+    const VersionStorageInfo* vstorage,
+    std::map<int, std::map<LevelStatType, double>>* levels_stats,
+    CompactionStats* compaction_stats_sum) {
+  assert(vstorage);
+
+  int num_levels_to_check =
+      (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+          ? vstorage->num_levels() - 1
+          : 1;
+
+  // Compaction scores are sorted based on its value. Restore them to the
+  // level order
+  std::vector<double> compaction_score(number_levels_, 0);
+  for (int i = 0; i < num_levels_to_check; ++i) {
+    compaction_score[vstorage->CompactionScoreLevel(i)] =
+        vstorage->CompactionScore(i);
+  }
+  // Count # of files being compacted for each level
+  std::vector<int> files_being_compacted(number_levels_, 0);
+  for (int level = 0; level < number_levels_; ++level) {
+    for (auto* f : vstorage->LevelFiles(level)) {
+      if (f->being_compacted) {
+        ++files_being_compacted[level];
+      }
+    }
+  }
+
+  int total_files = 0;
+  int total_files_being_compacted = 0;
+  double total_file_size = 0;
+  uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+  uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+  uint64_t curr_ingest = flush_ingest + add_file_ingest;
+  for (int level = 0; level < number_levels_; level++) {
+    int files = vstorage->NumLevelFiles(level);
+    total_files += files;
+    total_files_being_compacted += files_being_compacted[level];
+    if (comp_stats_[level].micros > 0 || comp_stats_[level].cpu_micros > 0 ||
+        files > 0) {
+      compaction_stats_sum->Add(comp_stats_[level]);
+      total_file_size += vstorage->NumLevelBytes(level);
+      uint64_t input_bytes;
+      if (level == 0) {
+        input_bytes = curr_ingest;
+      } else {
+        input_bytes = comp_stats_[level].bytes_read_non_output_levels +
+                      comp_stats_[level].bytes_read_blob;
+      }
+      double w_amp =
+          (input_bytes == 0)
+              ? 0.0
+              : static_cast<double>(comp_stats_[level].bytes_written +
+                                    comp_stats_[level].bytes_written_blob) /
+                    input_bytes;
+      std::map<LevelStatType, double> level_stats;
+      PrepareLevelStats(&level_stats, files, files_being_compacted[level],
+                        static_cast<double>(vstorage->NumLevelBytes(level)),
+                        compaction_score[level], w_amp, comp_stats_[level]);
+      (*levels_stats)[level] = level_stats;
+    }
+  }
+  // Cumulative summary
+  double w_amp = (0 == curr_ingest)
+                     ? 0.0
+                     : (compaction_stats_sum->bytes_written +
+                        compaction_stats_sum->bytes_written_blob) /
+                           static_cast<double>(curr_ingest);
+  // Stats summary across levels
+  std::map<LevelStatType, double> sum_stats;
+  PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted,
+                    total_file_size, 0, w_amp, *compaction_stats_sum);
+  (*levels_stats)[-1] = sum_stats;  //  -1 is for the Sum level
+}
+
+void InternalStats::DumpCFMapStatsByPriority(
+    std::map<int, std::map<LevelStatType, double>>* priorities_stats) {
+  for (size_t priority = 0; priority < comp_stats_by_pri_.size(); priority++) {
+    if (comp_stats_by_pri_[priority].micros > 0) {
+      std::map<LevelStatType, double> priority_stats;
+      PrepareLevelStats(&priority_stats, 0 /* num_files */,
+                        0 /* being_compacted */, 0 /* total_file_size */,
+                        0 /* compaction_score */, 0 /* w_amp */,
+                        comp_stats_by_pri_[priority]);
+      (*priorities_stats)[static_cast<int>(priority)] = priority_stats;
+    }
+  }
+}
+
+void InternalStats::DumpCFMapStatsIOStalls(
+    std::map<std::string, std::string>* cf_stats) {
+  (*cf_stats)["io_stalls.level0_slowdown"] =
+      std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+  (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] =
+      std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+  (*cf_stats)["io_stalls.level0_numfiles"] =
+      std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] =
+      std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] =
+      std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] =
+      std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]);
+  (*cf_stats)["io_stalls.memtable_compaction"] =
+      std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]);
+  (*cf_stats)["io_stalls.memtable_slowdown"] =
+      std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]);
+
+  uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+                        cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+                        cf_stats_count_[MEMTABLE_LIMIT_STOPS];
+
+  uint64_t total_slowdown =
+      cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+
+  (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop);
+  (*cf_stats)["io_stalls.total_slowdown"] = std::to_string(total_slowdown);
+}
+
+void InternalStats::DumpCFStats(std::string* value) {
+  DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value);
+  DumpCFFileHistogram(value);
+}
+
+void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic,
+                                               std::string* value) {
+  char buf[2000];
+  // Per-ColumnFamily stats
+  PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level");
+  value->append(buf);
+
+  // Print stats for each level
+  const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+  std::map<int, std::map<LevelStatType, double>> levels_stats;
+  CompactionStats compaction_stats_sum;
+  DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum);
+  for (int l = 0; l < number_levels_; ++l) {
+    if (levels_stats.find(l) != levels_stats.end()) {
+      PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(l),
+                      levels_stats[l]);
+      value->append(buf);
+    }
+  }
+
+  // Print sum of level stats
+  PrintLevelStats(buf, sizeof(buf), "Sum", levels_stats[-1]);
+  value->append(buf);
+
+  uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+  uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+  uint64_t ingest_files_addfile = cf_stats_value_[INGESTED_NUM_FILES_TOTAL];
+  uint64_t ingest_l0_files_addfile =
+      cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL];
+  uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL];
+  // Cumulative summary
+  uint64_t total_stall_count =
+      cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+      cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+      cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+      cf_stats_count_[MEMTABLE_LIMIT_STOPS] +
+      cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+  // Interval summary
+  uint64_t interval_flush_ingest =
+      flush_ingest - cf_stats_snapshot_.ingest_bytes_flush;
+  uint64_t interval_add_file_inget =
+      add_file_ingest - cf_stats_snapshot_.ingest_bytes_addfile;
+  uint64_t interval_ingest =
+      interval_flush_ingest + interval_add_file_inget + 1;
+  CompactionStats interval_stats(compaction_stats_sum);
+  interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
+  double w_amp =
+      (interval_stats.bytes_written + interval_stats.bytes_written_blob) /
+      static_cast<double>(interval_ingest);
+  PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats);
+  value->append(buf);
+
+  PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Priority");
+  value->append(buf);
+  std::map<int, std::map<LevelStatType, double>> priorities_stats;
+  DumpCFMapStatsByPriority(&priorities_stats);
+  for (size_t priority = 0; priority < comp_stats_by_pri_.size(); ++priority) {
+    if (priorities_stats.find(static_cast<int>(priority)) !=
+        priorities_stats.end()) {
+      PrintLevelStats(
+          buf, sizeof(buf),
+          Env::PriorityToString(static_cast<Env::Priority>(priority)),
+          priorities_stats[static_cast<int>(priority)]);
+      value->append(buf);
+    }
+  }
+
+  const auto blob_st = vstorage->GetBlobStats();
+
+  snprintf(buf, sizeof(buf),
+           "\nBlob file count: %" ROCKSDB_PRIszt
+           ", total size: %.1f GB, garbage size: %.1f GB, space amp: %.1f\n\n",
+           vstorage->GetBlobFiles().size(), blob_st.total_file_size / kGB,
+           blob_st.total_garbage_size / kGB, blob_st.space_amp);
+  value->append(buf);
+
+  uint64_t now_micros = clock_->NowMicros();
+  double seconds_up = (now_micros - started_at_) / kMicrosInSec;
+  double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up;
+  snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+           seconds_up, interval_seconds_up);
+  value->append(buf);
+  snprintf(buf, sizeof(buf), "Flush(GB): cumulative %.3f, interval %.3f\n",
+           flush_ingest / kGB, interval_flush_ingest / kGB);
+  value->append(buf);
+  snprintf(buf, sizeof(buf), "AddFile(GB): cumulative %.3f, interval %.3f\n",
+           add_file_ingest / kGB, interval_add_file_inget / kGB);
+  value->append(buf);
+
+  uint64_t interval_ingest_files_addfile =
+      ingest_files_addfile - cf_stats_snapshot_.ingest_files_addfile;
+  snprintf(buf, sizeof(buf),
+           "AddFile(Total Files): cumulative %" PRIu64 ", interval %" PRIu64
+           "\n",
+           ingest_files_addfile, interval_ingest_files_addfile);
+  value->append(buf);
+
+  uint64_t interval_ingest_l0_files_addfile =
+      ingest_l0_files_addfile - cf_stats_snapshot_.ingest_l0_files_addfile;
+  snprintf(buf, sizeof(buf),
+           "AddFile(L0 Files): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+           ingest_l0_files_addfile, interval_ingest_l0_files_addfile);
+  value->append(buf);
+
+  uint64_t interval_ingest_keys_addfile =
+      ingest_keys_addfile - cf_stats_snapshot_.ingest_keys_addfile;
+  snprintf(buf, sizeof(buf),
+           "AddFile(Keys): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+           ingest_keys_addfile, interval_ingest_keys_addfile);
+  value->append(buf);
+
+  // Compact
+  uint64_t compact_bytes_read = 0;
+  uint64_t compact_bytes_write = 0;
+  uint64_t compact_micros = 0;
+  for (int level = 0; level < number_levels_; level++) {
+    compact_bytes_read += comp_stats_[level].bytes_read_output_level +
+                          comp_stats_[level].bytes_read_non_output_levels +
+                          comp_stats_[level].bytes_read_blob;
+    compact_bytes_write += comp_stats_[level].bytes_written +
+                           comp_stats_[level].bytes_written_blob;
+    compact_micros += comp_stats_[level].micros;
+  }
+
+  snprintf(buf, sizeof(buf),
+           "Cumulative compaction: %.2f GB write, %.2f MB/s write, "
+           "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+           compact_bytes_write / kGB,
+           compact_bytes_write / kMB / std::max(seconds_up, 0.001),
+           compact_bytes_read / kGB,
+           compact_bytes_read / kMB / std::max(seconds_up, 0.001),
+           compact_micros / kMicrosInSec);
+  value->append(buf);
+
+  // Compaction interval
+  uint64_t interval_compact_bytes_write =
+      compact_bytes_write - cf_stats_snapshot_.compact_bytes_write;
+  uint64_t interval_compact_bytes_read =
+      compact_bytes_read - cf_stats_snapshot_.compact_bytes_read;
+  uint64_t interval_compact_micros =
+      compact_micros - cf_stats_snapshot_.compact_micros;
+
+  snprintf(
+      buf, sizeof(buf),
+      "Interval compaction: %.2f GB write, %.2f MB/s write, "
+      "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+      interval_compact_bytes_write / kGB,
+      interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001),
+      interval_compact_bytes_read / kGB,
+      interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001),
+      interval_compact_micros / kMicrosInSec);
+  value->append(buf);
+  if (is_periodic) {
+    cf_stats_snapshot_.compact_bytes_write = compact_bytes_write;
+    cf_stats_snapshot_.compact_bytes_read = compact_bytes_read;
+    cf_stats_snapshot_.compact_micros = compact_micros;
+  }
+
+  snprintf(buf, sizeof(buf),
+           "Stalls(count): %" PRIu64
+           " level0_slowdown, "
+           "%" PRIu64
+           " level0_slowdown_with_compaction, "
+           "%" PRIu64
+           " level0_numfiles, "
+           "%" PRIu64
+           " level0_numfiles_with_compaction, "
+           "%" PRIu64
+           " stop for pending_compaction_bytes, "
+           "%" PRIu64
+           " slowdown for pending_compaction_bytes, "
+           "%" PRIu64
+           " memtable_compaction, "
+           "%" PRIu64
+           " memtable_slowdown, "
+           "interval %" PRIu64 " total count\n",
+           cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+           cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+           cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS],
+           cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS],
+           cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS],
+           cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS],
+           cf_stats_count_[MEMTABLE_LIMIT_STOPS],
+           cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS],
+           total_stall_count - cf_stats_snapshot_.stall_count);
+  value->append(buf);
+
+  if (is_periodic) {
+    cf_stats_snapshot_.seconds_up = seconds_up;
+    cf_stats_snapshot_.ingest_bytes_flush = flush_ingest;
+    cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest;
+    cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile;
+    cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile;
+    cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile;
+    cf_stats_snapshot_.comp_stats = compaction_stats_sum;
+    cf_stats_snapshot_.stall_count = total_stall_count;
+  }
+
+  // Do not gather cache entry stats during CFStats because DB
+  // mutex is held. Only dump last cached collection (rely on DB
+  // periodic stats dump to update)
+  if (cache_entry_stats_collector_) {
+    CacheEntryRoleStats stats;
+    // thread safe
+    cache_entry_stats_collector_->GetStats(&stats);
+
+    constexpr uint64_t kDayInMicros = uint64_t{86400} * 1000000U;
+
+    // Skip if stats are extremely old (> 1 day, incl not yet populated)
+    if (now_micros - stats.last_end_time_micros_ < kDayInMicros) {
+      value->append(stats.ToString(clock_));
+    }
+  }
+}
+
+void InternalStats::DumpCFFileHistogram(std::string* value) {
+  assert(value);
+  assert(cfd_);
+
+  std::ostringstream oss;
+  oss << "\n** File Read Latency Histogram By Level [" << cfd_->GetName()
+      << "] **\n";
+
+  for (int level = 0; level < number_levels_; level++) {
+    if (!file_read_latency_[level].Empty()) {
+      oss << "** Level " << level << " read latency histogram (micros):\n"
+          << file_read_latency_[level].ToString() << '\n';
+    }
+  }
+
+  if (!blob_file_read_latency_.Empty()) {
+    oss << "** Blob file read latency histogram (micros):\n"
+        << blob_file_read_latency_.ToString() << '\n';
+  }
+
+  value->append(oss.str());
+}
+
+#else
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& /*property*/) {
+  return nullptr;
+}
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/internal_stats.h b/src/rocksdb/db/internal_stats.h
new file mode 100644
index 000000000..b0cd5899b
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.h
@@ -0,0 +1,996 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "db/version_set.h"
+#include "rocksdb/system_clock.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class Stats>
+class CacheEntryStatsCollector;
+class DBImpl;
+class MemTableList;
+
+// Config for retrieving a property's value.
+struct DBPropertyInfo {
+  bool need_out_of_mutex;
+
+  // gcc had an internal error for initializing union of pointer-to-member-
+  // functions. Workaround is to populate exactly one of the following function
+  // pointers with a non-nullptr value.
+
+  // @param value Value-result argument for storing the property's string value
+  // @param suffix Argument portion of the property. For example, suffix would
+  //      be "5" for the property "rocksdb.num-files-at-level5". So far, only
+  //      certain string properties take an argument.
+  bool (InternalStats::*handle_string)(std::string* value, Slice suffix);
+
+  // @param value Value-result argument for storing the property's uint64 value
+  // @param db Many of the int properties rely on DBImpl methods.
+  // @param version Version is needed in case the property is retrieved without
+  //      holding db mutex, which is only supported for int properties.
+  bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db,
+                                    Version* version);
+
+  // @param props Map of general properties to populate
+  // @param suffix Argument portion of the property. (see handle_string)
+  bool (InternalStats::*handle_map)(std::map<std::string, std::string>* props,
+                                    Slice suffix);
+
+  // handle the string type properties rely on DBImpl methods
+  // @param value Value-result argument for storing the property's string value
+  bool (DBImpl::*handle_string_dbimpl)(std::string* value);
+};
+
+extern const DBPropertyInfo* GetPropertyInfo(const Slice& property);
+
+#ifndef ROCKSDB_LITE
+#undef SCORE
+enum class LevelStatType {
+  INVALID = 0,
+  NUM_FILES,
+  COMPACTED_FILES,
+  SIZE_BYTES,
+  SCORE,
+  READ_GB,
+  RN_GB,
+  RNP1_GB,
+  WRITE_GB,
+  W_NEW_GB,
+  MOVED_GB,
+  WRITE_AMP,
+  READ_MBPS,
+  WRITE_MBPS,
+  COMP_SEC,
+  COMP_CPU_SEC,
+  COMP_COUNT,
+  AVG_SEC,
+  KEY_IN,
+  KEY_DROP,
+  R_BLOB_GB,
+  W_BLOB_GB,
+  TOTAL  // total number of types
+};
+
+struct LevelStat {
+  // This what will be L?.property_name in the flat map returned to the user
+  std::string property_name;
+  // This will be what we will print in the header in the cli
+  std::string header_name;
+};
+
+struct DBStatInfo {
+  // This what will be property_name in the flat map returned to the user
+  std::string property_name;
+};
+
+class InternalStats {
+ public:
+  static const std::map<LevelStatType, LevelStat> compaction_level_stats;
+
+  enum InternalCFStatsType {
+    L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    MEMTABLE_LIMIT_STOPS,
+    MEMTABLE_LIMIT_SLOWDOWNS,
+    L0_FILE_COUNT_LIMIT_STOPS,
+    LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+    PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+    PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+    WRITE_STALLS_ENUM_MAX,
+    BYTES_FLUSHED,
+    BYTES_INGESTED_ADD_FILE,
+    INGESTED_NUM_FILES_TOTAL,
+    INGESTED_LEVEL0_NUM_FILES_TOTAL,
+    INGESTED_NUM_KEYS_TOTAL,
+    INTERNAL_CF_STATS_ENUM_MAX,
+  };
+
+  enum InternalDBStatsType {
+    kIntStatsWalFileBytes,
+    kIntStatsWalFileSynced,
+    kIntStatsBytesWritten,
+    kIntStatsNumKeysWritten,
+    kIntStatsWriteDoneByOther,
+    kIntStatsWriteDoneBySelf,
+    kIntStatsWriteWithWal,
+    kIntStatsWriteStallMicros,
+    kIntStatsNumMax,
+  };
+
+  static const std::map<InternalDBStatsType, DBStatInfo> db_stats_type_to_info;
+
+  InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd);
+
+  // Per level compaction stats
+  struct CompactionOutputsStats {
+    uint64_t num_output_records = 0;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_written_blob = 0;
+    uint64_t num_output_files = 0;
+    uint64_t num_output_files_blob = 0;
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files += stats.num_output_files;
+      this->num_output_files_blob += stats.num_output_files_blob;
+    }
+  };
+
+  // Per level compaction stats.  comp_stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    uint64_t micros;
+    uint64_t cpu_micros;
+
+    // The number of bytes read from all non-output levels (table files)
+    uint64_t bytes_read_non_output_levels;
+
+    // The number of bytes read from the compaction output level (table files)
+    uint64_t bytes_read_output_level;
+
+    // The number of bytes read from blob files
+    uint64_t bytes_read_blob;
+
+    // Total number of bytes written to table files during compaction
+    uint64_t bytes_written;
+
+    // Total number of bytes written to blob files during compaction
+    uint64_t bytes_written_blob;
+
+    // Total number of bytes moved to the output level (table files)
+    uint64_t bytes_moved;
+
+    // The number of compaction input files in all non-output levels (table
+    // files)
+    int num_input_files_in_non_output_levels;
+
+    // The number of compaction input files in the output level (table files)
+    int num_input_files_in_output_level;
+
+    // The number of compaction output files (table files)
+    int num_output_files;
+
+    // The number of compaction output files (blob files)
+    int num_output_files_blob;
+
+    // Total incoming entries during compaction between levels N and N+1
+    uint64_t num_input_records;
+
+    // Accumulated diff number of entries
+    // (num input entries - num output entries) for compaction levels N and N+1
+    uint64_t num_dropped_records;
+
+    // Total output entries from compaction
+    uint64_t num_output_records;
+
+    // Number of compactions done
+    int count;
+
+    // Number of compactions done per CompactionReason
+    int counts[static_cast<int>(CompactionReason::kNumOfReasons)]{};
+
+    explicit CompactionStats()
+        : micros(0),
+          cpu_micros(0),
+          bytes_read_non_output_levels(0),
+          bytes_read_output_level(0),
+          bytes_read_blob(0),
+          bytes_written(0),
+          bytes_written_blob(0),
+          bytes_moved(0),
+          num_input_files_in_non_output_levels(0),
+          num_input_files_in_output_level(0),
+          num_output_files(0),
+          num_output_files_blob(0),
+          num_input_records(0),
+          num_dropped_records(0),
+          num_output_records(0),
+          count(0) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+    }
+
+    explicit CompactionStats(CompactionReason reason, int c)
+        : micros(0),
+          cpu_micros(0),
+          bytes_read_non_output_levels(0),
+          bytes_read_output_level(0),
+          bytes_read_blob(0),
+          bytes_written(0),
+          bytes_written_blob(0),
+          bytes_moved(0),
+          num_input_files_in_non_output_levels(0),
+          num_input_files_in_output_level(0),
+          num_output_files(0),
+          num_output_files_blob(0),
+          num_input_records(0),
+          num_dropped_records(0),
+          num_output_records(0),
+          count(c) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+      int r = static_cast<int>(reason);
+      if (r >= 0 && r < num_of_reasons) {
+        counts[r] = c;
+      } else {
+        count = 0;
+      }
+    }
+
+    CompactionStats(const CompactionStats& c)
+        : micros(c.micros),
+          cpu_micros(c.cpu_micros),
+          bytes_read_non_output_levels(c.bytes_read_non_output_levels),
+          bytes_read_output_level(c.bytes_read_output_level),
+          bytes_read_blob(c.bytes_read_blob),
+          bytes_written(c.bytes_written),
+          bytes_written_blob(c.bytes_written_blob),
+          bytes_moved(c.bytes_moved),
+          num_input_files_in_non_output_levels(
+              c.num_input_files_in_non_output_levels),
+          num_input_files_in_output_level(c.num_input_files_in_output_level),
+          num_output_files(c.num_output_files),
+          num_output_files_blob(c.num_output_files_blob),
+          num_input_records(c.num_input_records),
+          num_dropped_records(c.num_dropped_records),
+          num_output_records(c.num_output_records),
+          count(c.count) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = c.counts[i];
+      }
+    }
+
+    CompactionStats& operator=(const CompactionStats& c) {
+      micros = c.micros;
+      cpu_micros = c.cpu_micros;
+      bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+      bytes_read_output_level = c.bytes_read_output_level;
+      bytes_read_blob = c.bytes_read_blob;
+      bytes_written = c.bytes_written;
+      bytes_written_blob = c.bytes_written_blob;
+      bytes_moved = c.bytes_moved;
+      num_input_files_in_non_output_levels =
+          c.num_input_files_in_non_output_levels;
+      num_input_files_in_output_level = c.num_input_files_in_output_level;
+      num_output_files = c.num_output_files;
+      num_output_files_blob = c.num_output_files_blob;
+      num_input_records = c.num_input_records;
+      num_dropped_records = c.num_dropped_records;
+      num_output_records = c.num_output_records;
+      count = c.count;
+
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = c.counts[i];
+      }
+      return *this;
+    }
+
+    void Clear() {
+      this->micros = 0;
+      this->cpu_micros = 0;
+      this->bytes_read_non_output_levels = 0;
+      this->bytes_read_output_level = 0;
+      this->bytes_read_blob = 0;
+      this->bytes_written = 0;
+      this->bytes_written_blob = 0;
+      this->bytes_moved = 0;
+      this->num_input_files_in_non_output_levels = 0;
+      this->num_input_files_in_output_level = 0;
+      this->num_output_files = 0;
+      this->num_output_files_blob = 0;
+      this->num_input_records = 0;
+      this->num_dropped_records = 0;
+      this->num_output_records = 0;
+      this->count = 0;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+    }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->cpu_micros += c.cpu_micros;
+      this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
+      this->bytes_read_output_level += c.bytes_read_output_level;
+      this->bytes_read_blob += c.bytes_read_blob;
+      this->bytes_written += c.bytes_written;
+      this->bytes_written_blob += c.bytes_written_blob;
+      this->bytes_moved += c.bytes_moved;
+      this->num_input_files_in_non_output_levels +=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level +=
+          c.num_input_files_in_output_level;
+      this->num_output_files += c.num_output_files;
+      this->num_output_files_blob += c.num_output_files_blob;
+      this->num_input_records += c.num_input_records;
+      this->num_dropped_records += c.num_dropped_records;
+      this->num_output_records += c.num_output_records;
+      this->count += c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] += c.counts[i];
+      }
+    }
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_files += static_cast<int>(stats.num_output_files);
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files_blob +=
+          static_cast<int>(stats.num_output_files_blob);
+    }
+
+    void Subtract(const CompactionStats& c) {
+      this->micros -= c.micros;
+      this->cpu_micros -= c.cpu_micros;
+      this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
+      this->bytes_read_output_level -= c.bytes_read_output_level;
+      this->bytes_read_blob -= c.bytes_read_blob;
+      this->bytes_written -= c.bytes_written;
+      this->bytes_written_blob -= c.bytes_written_blob;
+      this->bytes_moved -= c.bytes_moved;
+      this->num_input_files_in_non_output_levels -=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level -=
+          c.num_input_files_in_output_level;
+      this->num_output_files -= c.num_output_files;
+      this->num_output_files_blob -= c.num_output_files_blob;
+      this->num_input_records -= c.num_input_records;
+      this->num_dropped_records -= c.num_dropped_records;
+      this->num_output_records -= c.num_output_records;
+      this->count -= c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] -= c.counts[i];
+      }
+    }
+
+    void ResetCompactionReason(CompactionReason reason) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      assert(count == 1);  // only support update one compaction reason
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
+      int r = static_cast<int>(reason);
+      assert(r >= 0 && r < num_of_reasons);
+      counts[r] = 1;
+    }
+  };
+
+  // Compaction stats, for per_key_placement compaction, it includes 2 levels
+  // stats: the last level and the penultimate level.
+  struct CompactionStatsFull {
+    // the stats for the target primary output level
+    CompactionStats stats;
+
+    // stats for penultimate level output if exist
+    bool has_penultimate_level_output = false;
+    CompactionStats penultimate_level_stats;
+
+    explicit CompactionStatsFull() : stats(), penultimate_level_stats() {}
+
+    explicit CompactionStatsFull(CompactionReason reason, int c)
+        : stats(reason, c), penultimate_level_stats(reason, c){};
+
+    uint64_t TotalBytesWritten() const {
+      uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+      if (has_penultimate_level_output) {
+        bytes_written += penultimate_level_stats.bytes_written +
+                         penultimate_level_stats.bytes_written_blob;
+      }
+      return bytes_written;
+    }
+
+    uint64_t DroppedRecords() {
+      uint64_t output_records = stats.num_output_records;
+      if (has_penultimate_level_output) {
+        output_records += penultimate_level_stats.num_output_records;
+      }
+      if (stats.num_input_records > output_records) {
+        return stats.num_input_records - output_records;
+      }
+      return 0;
+    }
+
+    void SetMicros(uint64_t val) {
+      stats.micros = val;
+      penultimate_level_stats.micros = val;
+    }
+
+    void AddCpuMicros(uint64_t val) {
+      stats.cpu_micros += val;
+      penultimate_level_stats.cpu_micros += val;
+    }
+  };
+
+  // For use with CacheEntryStatsCollector
+  struct CacheEntryRoleStats {
+    uint64_t cache_capacity = 0;
+    uint64_t cache_usage = 0;
+    size_t table_size = 0;
+    size_t occupancy = 0;
+    std::string cache_id;
+    std::array<uint64_t, kNumCacheEntryRoles> total_charges;
+    std::array<size_t, kNumCacheEntryRoles> entry_counts;
+    uint32_t collection_count = 0;
+    uint32_t copies_of_last_collection = 0;
+    uint64_t last_start_time_micros_ = 0;
+    uint64_t last_end_time_micros_ = 0;
+
+    void Clear() {
+      // Wipe everything except collection_count
+      uint32_t saved_collection_count = collection_count;
+      *this = CacheEntryRoleStats();
+      collection_count = saved_collection_count;
+    }
+
+    void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
+    std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+    GetEntryCallback();
+    void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
+    void SkippedCollection();
+
+    std::string ToString(SystemClock* clock) const;
+    void ToMap(std::map<std::string, std::string>* values,
+               SystemClock* clock) const;
+
+   private:
+    UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map_;
+    uint64_t GetLastDurationMicros() const;
+  };
+
+  void Clear() {
+    for (int i = 0; i < kIntStatsNumMax; i++) {
+      db_stats_[i].store(0);
+    }
+    for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) {
+      cf_stats_count_[i] = 0;
+      cf_stats_value_[i] = 0;
+    }
+    for (auto& comp_stat : comp_stats_) {
+      comp_stat.Clear();
+    }
+    per_key_placement_comp_stats_.Clear();
+    for (auto& h : file_read_latency_) {
+      h.Clear();
+    }
+    blob_file_read_latency_.Clear();
+    cf_stats_snapshot_.Clear();
+    db_stats_snapshot_.Clear();
+    bg_error_count_ = 0;
+    started_at_ = clock_->NowMicros();
+    has_cf_change_since_dump_ = true;
+  }
+
+  void AddCompactionStats(int level, Env::Priority thread_pri,
+                          const CompactionStats& stats) {
+    comp_stats_[level].Add(stats);
+    comp_stats_by_pri_[thread_pri].Add(stats);
+  }
+
+  void AddCompactionStats(int level, Env::Priority thread_pri,
+                          const CompactionStatsFull& comp_stats_full) {
+    AddCompactionStats(level, thread_pri, comp_stats_full.stats);
+    if (comp_stats_full.has_penultimate_level_output) {
+      per_key_placement_comp_stats_.Add(
+          comp_stats_full.penultimate_level_stats);
+    }
+  }
+
+  void IncBytesMoved(int level, uint64_t amount) {
+    comp_stats_[level].bytes_moved += amount;
+  }
+
+  void AddCFStats(InternalCFStatsType type, uint64_t value) {
+    has_cf_change_since_dump_ = true;
+    cf_stats_value_[type] += value;
+    ++cf_stats_count_[type];
+  }
+
+  void AddDBStats(InternalDBStatsType type, uint64_t value,
+                  bool concurrent = false) {
+    auto& v = db_stats_[type];
+    if (concurrent) {
+      v.fetch_add(value, std::memory_order_relaxed);
+    } else {
+      v.store(v.load(std::memory_order_relaxed) + value,
+              std::memory_order_relaxed);
+    }
+  }
+
+  uint64_t GetDBStats(InternalDBStatsType type) {
+    return db_stats_[type].load(std::memory_order_relaxed);
+  }
+
+  HistogramImpl* GetFileReadHist(int level) {
+    return &file_read_latency_[level];
+  }
+
+  HistogramImpl* GetBlobFileReadHist() { return &blob_file_read_latency_; }
+
+  uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+  bool GetStringProperty(const DBPropertyInfo& property_info,
+                         const Slice& property, std::string* value);
+
+  bool GetMapProperty(const DBPropertyInfo& property_info,
+                      const Slice& property,
+                      std::map<std::string, std::string>* value);
+
+  bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
+                      DBImpl* db);
+
+  bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
+                                Version* version, uint64_t* value);
+
+  // Unless there is a recent enough collection of the stats, collect and
+  // saved new cache entry stats. If `foreground`, require data to be more
+  // recent to skip re-collection.
+  //
+  // This should only be called while NOT holding the DB mutex.
+  void CollectCacheEntryStats(bool foreground);
+
+  const uint64_t* TEST_GetCFStatsValue() const { return cf_stats_value_; }
+
+  const std::vector<CompactionStats>& TEST_GetCompactionStats() const {
+    return comp_stats_;
+  }
+
+  const CompactionStats& TEST_GetPerKeyPlacementCompactionStats() const {
+    return per_key_placement_comp_stats_;
+  }
+
+  void TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, bool foreground);
+
+  // Store a mapping from the user-facing DB::Properties string to our
+  // DBPropertyInfo struct used internally for retrieving properties.
+  static const UnorderedMap<std::string, DBPropertyInfo> ppt_name_to_info;
+
+  static const std::string kPeriodicCFStats;
+
+ private:
+  void DumpDBMapStats(std::map<std::string, std::string>* db_stats);
+  void DumpDBStats(std::string* value);
+  void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
+  void DumpCFMapStats(
+      const VersionStorageInfo* vstorage,
+      std::map<int, std::map<LevelStatType, double>>* level_stats,
+      CompactionStats* compaction_stats_sum);
+  void DumpCFMapStatsByPriority(
+      std::map<int, std::map<LevelStatType, double>>* priorities_stats);
+  void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
+  void DumpCFStats(std::string* value);
+  // if is_periodic = true, it is an internal call by RocksDB periodically to
+  // dump the status.
+  void DumpCFStatsNoFileHistogram(bool is_periodic, std::string* value);
+  // if is_periodic = true, it is an internal call by RocksDB periodically to
+  // dump the status.
+  void DumpCFFileHistogram(std::string* value);
+
+  Cache* GetBlockCacheForStats();
+  Cache* GetBlobCacheForStats();
+
+  // Per-DB stats
+  std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
+  // Per-ColumnFamily stats
+  uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
+  uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
+  // Initialize/reference the collector in constructor so that we don't need
+  // additional synchronization in InternalStats, relying on synchronization
+  // in CacheEntryStatsCollector::GetStats. This collector is pinned in cache
+  // (through a shared_ptr) so that it does not get immediately ejected from
+  // a full cache, which would force a re-scan on the next GetStats.
+  std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
+      cache_entry_stats_collector_;
+  // Per-ColumnFamily/level compaction stats
+  std::vector<CompactionStats> comp_stats_;
+  std::vector<CompactionStats> comp_stats_by_pri_;
+  CompactionStats per_key_placement_comp_stats_;
+  std::vector<HistogramImpl> file_read_latency_;
+  HistogramImpl blob_file_read_latency_;
+  bool has_cf_change_since_dump_;
+  // How many periods of no change since the last time stats are dumped for
+  // a periodic dump.
+  int no_cf_change_period_since_dump_ = 0;
+  uint64_t last_histogram_num = std::numeric_limits<uint64_t>::max();
+  static const int kMaxNoChangePeriodSinceDump;
+
+  // Used to compute per-interval statistics
+  struct CFStatsSnapshot {
+    // ColumnFamily-level stats
+    CompactionStats comp_stats;
+    uint64_t ingest_bytes_flush;  // Bytes written to L0 (Flush)
+    uint64_t stall_count;         // Stall count
+    // Stats from compaction jobs - bytes written, bytes read, duration.
+    uint64_t compact_bytes_write;
+    uint64_t compact_bytes_read;
+    uint64_t compact_micros;
+    double seconds_up;
+
+    // AddFile specific stats
+    uint64_t ingest_bytes_addfile;     // Total Bytes ingested
+    uint64_t ingest_files_addfile;     // Total number of files ingested
+    uint64_t ingest_l0_files_addfile;  // Total number of files ingested to L0
+    uint64_t ingest_keys_addfile;      // Total number of keys ingested
+
+    CFStatsSnapshot()
+        : ingest_bytes_flush(0),
+          stall_count(0),
+          compact_bytes_write(0),
+          compact_bytes_read(0),
+          compact_micros(0),
+          seconds_up(0),
+          ingest_bytes_addfile(0),
+          ingest_files_addfile(0),
+          ingest_l0_files_addfile(0),
+          ingest_keys_addfile(0) {}
+
+    void Clear() {
+      comp_stats.Clear();
+      ingest_bytes_flush = 0;
+      stall_count = 0;
+      compact_bytes_write = 0;
+      compact_bytes_read = 0;
+      compact_micros = 0;
+      seconds_up = 0;
+      ingest_bytes_addfile = 0;
+      ingest_files_addfile = 0;
+      ingest_l0_files_addfile = 0;
+      ingest_keys_addfile = 0;
+    }
+  } cf_stats_snapshot_;
+
+  struct DBStatsSnapshot {
+    // DB-level stats
+    uint64_t ingest_bytes;    // Bytes written by user
+    uint64_t wal_bytes;       // Bytes written to WAL
+    uint64_t wal_synced;      // Number of times WAL is synced
+    uint64_t write_with_wal;  // Number of writes that request WAL
+    // These count the number of writes processed by the calling thread or
+    // another thread.
+    uint64_t write_other;
+    uint64_t write_self;
+    // Total number of keys written. write_self and write_other measure number
+    // of write requests written, Each of the write request can contain updates
+    // to multiple keys. num_keys_written is total number of keys updated by all
+    // those writes.
+    uint64_t num_keys_written;
+    // Total time writes delayed by stalls.
+    uint64_t write_stall_micros;
+    double seconds_up;
+
+    DBStatsSnapshot()
+        : ingest_bytes(0),
+          wal_bytes(0),
+          wal_synced(0),
+          write_with_wal(0),
+          write_other(0),
+          write_self(0),
+          num_keys_written(0),
+          write_stall_micros(0),
+          seconds_up(0) {}
+
+    void Clear() {
+      ingest_bytes = 0;
+      wal_bytes = 0;
+      wal_synced = 0;
+      write_with_wal = 0;
+      write_other = 0;
+      write_self = 0;
+      num_keys_written = 0;
+      write_stall_micros = 0;
+      seconds_up = 0;
+    }
+  } db_stats_snapshot_;
+
+  // Handler functions for getting property values. They use "value" as a value-
+  // result argument, and return true upon successfully setting "value".
+  bool HandleNumFilesAtLevel(std::string* value, Slice suffix);
+  bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix);
+  bool HandleLevelStats(std::string* value, Slice suffix);
+  bool HandleStats(std::string* value, Slice suffix);
+  bool HandleCFMapStats(std::map<std::string, std::string>* compaction_stats,
+                        Slice suffix);
+  bool HandleCFStats(std::string* value, Slice suffix);
+  bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
+  bool HandleCFFileHistogram(std::string* value, Slice suffix);
+  bool HandleCFStatsPeriodic(std::string* value, Slice suffix);
+  bool HandleDBMapStats(std::map<std::string, std::string>* compaction_stats,
+                        Slice suffix);
+  bool HandleDBStats(std::string* value, Slice suffix);
+  bool HandleSsTables(std::string* value, Slice suffix);
+  bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
+  bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix);
+  bool HandleAggregatedTablePropertiesMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleAggregatedTablePropertiesAtLevelMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db,
+                                         Version* version);
+  bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db,
+                                      Version* version);
+  bool HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db,
+                                      Version* version);
+  bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db,
+                                       Version* version);
+  bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db,
+                                            Version* version);
+  bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db,
+                                     Version* version);
+  bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
+                                  Version* version);
+  bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleMinObsoleteSstNumberToKeep(uint64_t* value, DBImpl* db,
+                                        Version* version);
+  bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+                                    Version* version);
+  bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBlockCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db,
+                                   Version* version);
+  bool HandleBlockCacheEntryStatsInternal(std::string* value, bool fast);
+  bool HandleBlockCacheEntryStatsMapInternal(
+      std::map<std::string, std::string>* values, bool fast);
+  bool HandleBlockCacheEntryStats(std::string* value, Slice suffix);
+  bool HandleBlockCacheEntryStatsMap(std::map<std::string, std::string>* values,
+                                     Slice suffix);
+  bool HandleFastBlockCacheEntryStats(std::string* value, Slice suffix);
+  bool HandleFastBlockCacheEntryStatsMap(
+      std::map<std::string, std::string>* values, Slice suffix);
+  bool HandleLiveSstFilesSizeAtTemperature(std::string* value, Slice suffix);
+  bool HandleNumBlobFiles(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlobStats(std::string* value, Slice suffix);
+  bool HandleTotalBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveBlobFileGarbageSize(uint64_t* value, DBImpl* db,
+                                     Version* version);
+  bool HandleBlobCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlobCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* db,
+                                  Version* version);
+
+  // Total number of background errors encountered. Every time a flush task
+  // or compaction task fails, this counter is incremented. The failure can
+  // be caused by any possible reason, including file system errors, out of
+  // resources, or input file corruption. Failing when retrying the same flush
+  // or compaction will cause the counter to increase too.
+  uint64_t bg_error_count_;
+
+  const int number_levels_;
+  SystemClock* clock_;
+  ColumnFamilyData* cfd_;
+  uint64_t started_at_;
+};
+
+#else
+
+class InternalStats {
+ public:
+  enum InternalCFStatsType {
+    L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    MEMTABLE_LIMIT_STOPS,
+    MEMTABLE_LIMIT_SLOWDOWNS,
+    L0_FILE_COUNT_LIMIT_STOPS,
+    LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+    PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+    PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+    WRITE_STALLS_ENUM_MAX,
+    BYTES_FLUSHED,
+    BYTES_INGESTED_ADD_FILE,
+    INGESTED_NUM_FILES_TOTAL,
+    INGESTED_LEVEL0_NUM_FILES_TOTAL,
+    INGESTED_NUM_KEYS_TOTAL,
+    INTERNAL_CF_STATS_ENUM_MAX,
+  };
+
+  enum InternalDBStatsType {
+    kIntStatsWalFileBytes,
+    kIntStatsWalFileSynced,
+    kIntStatsBytesWritten,
+    kIntStatsNumKeysWritten,
+    kIntStatsWriteDoneByOther,
+    kIntStatsWriteDoneBySelf,
+    kIntStatsWriteWithWal,
+    kIntStatsWriteStallMicros,
+    kIntStatsNumMax,
+  };
+
+  InternalStats(int /*num_levels*/, SystemClock* /*clock*/,
+                ColumnFamilyData* /*cfd*/) {}
+
+  // Per level compaction stats
+  struct CompactionOutputsStats {
+    uint64_t num_output_records = 0;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_written_blob = 0;
+    uint64_t num_output_files = 0;
+    uint64_t num_output_files_blob = 0;
+
+    void Add(const CompactionOutputsStats& stats) {
+      this->num_output_records += stats.num_output_records;
+      this->bytes_written += stats.bytes_written;
+      this->bytes_written_blob += stats.bytes_written_blob;
+      this->num_output_files += stats.num_output_files;
+      this->num_output_files_blob += stats.num_output_files_blob;
+    }
+  };
+
+  struct CompactionStats {
+    uint64_t micros;
+    uint64_t cpu_micros;
+    uint64_t bytes_read_non_output_levels;
+    uint64_t bytes_read_output_level;
+    uint64_t bytes_read_blob;
+    uint64_t bytes_written;
+    uint64_t bytes_written_blob;
+    uint64_t bytes_moved;
+    int num_input_files_in_non_output_levels;
+    int num_input_files_in_output_level;
+    int num_output_files;
+    int num_output_files_blob;
+    uint64_t num_input_records;
+    uint64_t num_dropped_records;
+    uint64_t num_output_records;
+    int count;
+
+    explicit CompactionStats() {}
+
+    explicit CompactionStats(CompactionReason /*reason*/, int /*c*/) {}
+
+    explicit CompactionStats(const CompactionStats& /*c*/) {}
+
+    void Add(const CompactionStats& /*c*/) {}
+
+    void Add(const CompactionOutputsStats& /*c*/) {}
+
+    void Subtract(const CompactionStats& /*c*/) {}
+  };
+
+  struct CompactionStatsFull {
+    // the stats for the target primary output level (per level stats)
+    CompactionStats stats;
+
+    // stats for output_to_penultimate_level level (per level stats)
+    bool has_penultimate_level_output = false;
+    CompactionStats penultimate_level_stats;
+
+    explicit CompactionStatsFull(){};
+
+    explicit CompactionStatsFull(CompactionReason /*reason*/, int /*c*/){};
+
+    uint64_t TotalBytesWritten() const { return 0; }
+
+    uint64_t DroppedRecords() { return 0; }
+
+    void SetMicros(uint64_t /*val*/){};
+
+    void AddCpuMicros(uint64_t /*val*/){};
+  };
+
+  void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+                          const CompactionStats& /*stats*/) {}
+
+  void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+                          const CompactionStatsFull& /*unmerged_stats*/) {}
+
+  void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {}
+
+  void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {}
+
+  void AddDBStats(InternalDBStatsType /*type*/, uint64_t /*value*/,
+                  bool /*concurrent */ = false) {}
+
+  HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; }
+
+  HistogramImpl* GetBlobFileReadHist() { return nullptr; }
+
+  uint64_t GetBackgroundErrorCount() const { return 0; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
+
+  bool GetStringProperty(const DBPropertyInfo& /*property_info*/,
+                         const Slice& /*property*/, std::string* /*value*/) {
+    return false;
+  }
+
+  bool GetMapProperty(const DBPropertyInfo& /*property_info*/,
+                      const Slice& /*property*/,
+                      std::map<std::string, std::string>* /*value*/) {
+    return false;
+  }
+
+  bool GetIntProperty(const DBPropertyInfo& /*property_info*/,
+                      uint64_t* /*value*/, DBImpl* /*db*/) const {
+    return false;
+  }
+
+  bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/,
+                                Version* /*version*/,
+                                uint64_t* /*value*/) const {
+    return false;
+  }
+};
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/job_context.h b/src/rocksdb/db/job_context.h
new file mode 100644
index 000000000..352c58e82
--- /dev/null
+++ b/src/rocksdb/db/job_context.h
@@ -0,0 +1,238 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+struct SuperVersion;
+
+struct SuperVersionContext {
+  struct WriteStallNotification {
+    WriteStallInfo write_stall_info;
+    const ImmutableOptions* immutable_options;
+  };
+
+  autovector<SuperVersion*> superversions_to_free;
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+  autovector<WriteStallNotification> write_stall_notifications;
+#endif
+  std::unique_ptr<SuperVersion>
+      new_superversion;  // if nullptr no new superversion
+
+  explicit SuperVersionContext(bool create_superversion = false)
+      : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
+
+  explicit SuperVersionContext(SuperVersionContext&& other) noexcept
+      : superversions_to_free(std::move(other.superversions_to_free)),
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+        write_stall_notifications(std::move(other.write_stall_notifications)),
+#endif
+        new_superversion(std::move(other.new_superversion)) {
+  }
+  // No copies
+  SuperVersionContext(const SuperVersionContext& other) = delete;
+  void operator=(const SuperVersionContext& other) = delete;
+
+  void NewSuperVersion() {
+    new_superversion = std::unique_ptr<SuperVersion>(new SuperVersion());
+  }
+
+  inline bool HaveSomethingToDelete() const {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    return !superversions_to_free.empty() || !write_stall_notifications.empty();
+#else
+    return !superversions_to_free.empty();
+#endif
+  }
+
+  void PushWriteStallNotification(WriteStallCondition old_cond,
+                                  WriteStallCondition new_cond,
+                                  const std::string& name,
+                                  const ImmutableOptions* ioptions) {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    WriteStallNotification notif;
+    notif.write_stall_info.cf_name = name;
+    notif.write_stall_info.condition.prev = old_cond;
+    notif.write_stall_info.condition.cur = new_cond;
+    notif.immutable_options = ioptions;
+    write_stall_notifications.push_back(notif);
+#else
+    (void)old_cond;
+    (void)new_cond;
+    (void)name;
+    (void)ioptions;
+#endif  // !defined(ROCKSDB_LITE) &&
+        // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+  }
+
+  void Clean() {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    // notify listeners on changed write stall conditions
+    for (auto& notif : write_stall_notifications) {
+      for (auto& listener : notif.immutable_options->listeners) {
+        listener->OnStallConditionsChanged(notif.write_stall_info);
+      }
+    }
+    write_stall_notifications.clear();
+#endif  // !ROCKSDB_LITE
+    // free superversions
+    for (auto s : superversions_to_free) {
+      delete s;
+    }
+    superversions_to_free.clear();
+  }
+
+  ~SuperVersionContext() {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    assert(write_stall_notifications.empty());
+#endif
+    assert(superversions_to_free.empty());
+  }
+};
+
+struct JobContext {
+  inline bool HaveSomethingToDelete() const {
+    return !(full_scan_candidate_files.empty() && sst_delete_files.empty() &&
+             blob_delete_files.empty() && log_delete_files.empty() &&
+             manifest_delete_files.empty());
+  }
+
+  inline bool HaveSomethingToClean() const {
+    bool sv_have_sth = false;
+    for (const auto& sv_ctx : superversion_contexts) {
+      if (sv_ctx.HaveSomethingToDelete()) {
+        sv_have_sth = true;
+        break;
+      }
+    }
+    return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+           job_snapshot != nullptr || sv_have_sth;
+  }
+
+  SequenceNumber GetJobSnapshotSequence() const {
+    if (job_snapshot) {
+      assert(job_snapshot->snapshot());
+      return job_snapshot->snapshot()->GetSequenceNumber();
+    }
+    return kMaxSequenceNumber;
+  }
+
+  // Structure to store information for candidate files to delete.
+  struct CandidateFileInfo {
+    std::string file_name;
+    std::string file_path;
+    CandidateFileInfo(std::string name, std::string path)
+        : file_name(std::move(name)), file_path(std::move(path)) {}
+    bool operator==(const CandidateFileInfo& other) const {
+      return file_name == other.file_name && file_path == other.file_path;
+    }
+  };
+
+  // Unique job id
+  int job_id;
+
+  // a list of all files that we'll consider deleting
+  // (every once in a while this is filled up with all files
+  // in the DB directory)
+  // (filled only if we're doing full scan)
+  std::vector<CandidateFileInfo> full_scan_candidate_files;
+
+  // the list of all live sst files that cannot be deleted
+  std::vector<uint64_t> sst_live;
+
+  // the list of sst files that we need to delete
+  std::vector<ObsoleteFileInfo> sst_delete_files;
+
+  // the list of all live blob files that cannot be deleted
+  std::vector<uint64_t> blob_live;
+
+  // the list of blob files that we need to delete
+  std::vector<ObsoleteBlobFileInfo> blob_delete_files;
+
+  // a list of log files that we need to delete
+  std::vector<uint64_t> log_delete_files;
+
+  // a list of log files that we need to preserve during full purge since they
+  // will be reused later
+  std::vector<uint64_t> log_recycle_files;
+
+  // a list of manifest files that we need to delete
+  std::vector<std::string> manifest_delete_files;
+
+  // a list of memtables to be free
+  autovector<MemTable*> memtables_to_free;
+
+  // contexts for installing superversions for multiple column families
+  std::vector<SuperVersionContext> superversion_contexts;
+
+  autovector<log::Writer*> logs_to_free;
+
+  // the current manifest_file_number, log_number and prev_log_number
+  // that corresponds to the set of files in 'live'.
+  uint64_t manifest_file_number;
+  uint64_t pending_manifest_file_number;
+  uint64_t log_number;
+  uint64_t prev_log_number;
+
+  uint64_t min_pending_output = 0;
+  uint64_t prev_total_log_size = 0;
+  size_t num_alive_log_files = 0;
+  uint64_t size_log_to_delete = 0;
+
+  // Snapshot taken before flush/compaction job.
+  std::unique_ptr<ManagedSnapshot> job_snapshot;
+
+  explicit JobContext(int _job_id, bool create_superversion = false) {
+    job_id = _job_id;
+    manifest_file_number = 0;
+    pending_manifest_file_number = 0;
+    log_number = 0;
+    prev_log_number = 0;
+    superversion_contexts.emplace_back(
+        SuperVersionContext(create_superversion));
+  }
+
+  // For non-empty JobContext Clean() has to be called at least once before
+  // before destruction (see asserts in ~JobContext()). Should be called with
+  // unlocked DB mutex. Destructor doesn't call Clean() to avoid accidentally
+  // doing potentially slow Clean() with locked DB mutex.
+  void Clean() {
+    // free superversions
+    for (auto& sv_context : superversion_contexts) {
+      sv_context.Clean();
+    }
+    // free pending memtables
+    for (auto m : memtables_to_free) {
+      delete m;
+    }
+    for (auto l : logs_to_free) {
+      delete l;
+    }
+
+    memtables_to_free.clear();
+    logs_to_free.clear();
+    job_snapshot.reset();
+  }
+
+  ~JobContext() {
+    assert(memtables_to_free.size() == 0);
+    assert(logs_to_free.size() == 0);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/kv_checksum.h b/src/rocksdb/db/kv_checksum.h
new file mode 100644
index 000000000..bce507fcf
--- /dev/null
+++ b/src/rocksdb/db/kv_checksum.h
@@ -0,0 +1,398 @@
+//  Copyright (c) 2020-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains classes containing fields to protect individual entries.
+// The classes are named "ProtectionInfo<suffix>", where <suffix> indicates the
+// combination of fields that are covered. Each field has a single letter
+// abbreviation as follows.
+//
+// K = key
+// V = value
+// O = optype aka value type
+// S = seqno
+// C = CF ID
+//
+// Then, for example, a class that protects an entry consisting of key, value,
+// optype, and CF ID (i.e., a `WriteBatch` entry) would be named
+// `ProtectionInfoKVOC`.
+//
+// The `ProtectionInfo.*` classes are templated on the integer type used to hold
+// the XOR of hashes for each field. Only unsigned integer types are supported,
+// and the maximum supported integer width is 64 bits. When the integer type is
+// narrower than the hash values, we lop off the most significant bits to make
+// them fit.
+//
+// The `ProtectionInfo.*` classes are all intended to be non-persistent. We do
+// not currently make the byte order consistent for integer fields before
+// hashing them, so the resulting values are endianness-dependent.
+
+#pragma once
+
+#include <type_traits>
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename T>
+class ProtectionInfo;
+template <typename T>
+class ProtectionInfoKVO;
+template <typename T>
+class ProtectionInfoKVOC;
+template <typename T>
+class ProtectionInfoKVOS;
+
+// Aliases for 64-bit protection infos.
+using ProtectionInfo64 = ProtectionInfo<uint64_t>;
+using ProtectionInfoKVO64 = ProtectionInfoKVO<uint64_t>;
+using ProtectionInfoKVOC64 = ProtectionInfoKVOC<uint64_t>;
+using ProtectionInfoKVOS64 = ProtectionInfoKVOS<uint64_t>;
+
+template <typename T>
+class ProtectionInfo {
+ public:
+  ProtectionInfo() = default;
+
+  Status GetStatus() const;
+  ProtectionInfoKVO<T> ProtectKVO(const Slice& key, const Slice& value,
+                                  ValueType op_type) const;
+  ProtectionInfoKVO<T> ProtectKVO(const SliceParts& key,
+                                  const SliceParts& value,
+                                  ValueType op_type) const;
+
+  T GetVal() const { return val_; }
+
+ private:
+  friend class ProtectionInfoKVO<T>;
+  friend class ProtectionInfoKVOS<T>;
+  friend class ProtectionInfoKVOC<T>;
+
+  // Each field is hashed with an independent value so we can catch fields being
+  // swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall,
+  // and we should instead vary our seeds by a large odd number. This value by
+  // which we increment (0xD28AAD72F49BD50B) was taken from
+  // `head -c8 /dev/urandom | hexdump`, run repeatedly until it yielded an odd
+  // number. The values are computed manually since the Windows C++ compiler
+  // complains about the overflow when adding constants.
+  static const uint64_t kSeedK = 0;
+  static const uint64_t kSeedV = 0xD28AAD72F49BD50B;
+  static const uint64_t kSeedO = 0xA5155AE5E937AA16;
+  static const uint64_t kSeedS = 0x77A00858DDD37F21;
+  static const uint64_t kSeedC = 0x4A2AB5CBD26F542C;
+
+  ProtectionInfo(T val) : val_(val) {
+    static_assert(sizeof(ProtectionInfo<T>) == sizeof(T), "");
+  }
+
+  void SetVal(T val) { val_ = val; }
+
+  T val_ = 0;
+};
+
+template <typename T>
+class ProtectionInfoKVO {
+ public:
+  ProtectionInfoKVO() = default;
+
+  ProtectionInfo<T> StripKVO(const Slice& key, const Slice& value,
+                             ValueType op_type) const;
+  ProtectionInfo<T> StripKVO(const SliceParts& key, const SliceParts& value,
+                             ValueType op_type) const;
+
+  ProtectionInfoKVOC<T> ProtectC(ColumnFamilyId column_family_id) const;
+  ProtectionInfoKVOS<T> ProtectS(SequenceNumber sequence_number) const;
+
+  void UpdateK(const Slice& old_key, const Slice& new_key);
+  void UpdateK(const SliceParts& old_key, const SliceParts& new_key);
+  void UpdateV(const Slice& old_value, const Slice& new_value);
+  void UpdateV(const SliceParts& old_value, const SliceParts& new_value);
+  void UpdateO(ValueType old_op_type, ValueType new_op_type);
+
+  T GetVal() const { return info_.GetVal(); }
+
+ private:
+  friend class ProtectionInfo<T>;
+  friend class ProtectionInfoKVOS<T>;
+  friend class ProtectionInfoKVOC<T>;
+
+  explicit ProtectionInfoKVO(T val) : info_(val) {
+    static_assert(sizeof(ProtectionInfoKVO<T>) == sizeof(T), "");
+  }
+
+  void SetVal(T val) { info_.SetVal(val); }
+
+  ProtectionInfo<T> info_;
+};
+
+template <typename T>
+class ProtectionInfoKVOC {
+ public:
+  ProtectionInfoKVOC() = default;
+
+  ProtectionInfoKVO<T> StripC(ColumnFamilyId column_family_id) const;
+
+  void UpdateK(const Slice& old_key, const Slice& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateV(const Slice& old_value, const Slice& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+    kvo_.UpdateO(old_op_type, new_op_type);
+  }
+  void UpdateC(ColumnFamilyId old_column_family_id,
+               ColumnFamilyId new_column_family_id);
+
+  T GetVal() const { return kvo_.GetVal(); }
+
+ private:
+  friend class ProtectionInfoKVO<T>;
+
+  explicit ProtectionInfoKVOC(T val) : kvo_(val) {
+    static_assert(sizeof(ProtectionInfoKVOC<T>) == sizeof(T), "");
+  }
+
+  void SetVal(T val) { kvo_.SetVal(val); }
+
+  ProtectionInfoKVO<T> kvo_;
+};
+
+template <typename T>
+class ProtectionInfoKVOS {
+ public:
+  ProtectionInfoKVOS() = default;
+
+  ProtectionInfoKVO<T> StripS(SequenceNumber sequence_number) const;
+
+  void UpdateK(const Slice& old_key, const Slice& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+    kvo_.UpdateK(old_key, new_key);
+  }
+  void UpdateV(const Slice& old_value, const Slice& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+    kvo_.UpdateV(old_value, new_value);
+  }
+  void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+    kvo_.UpdateO(old_op_type, new_op_type);
+  }
+  void UpdateS(SequenceNumber old_sequence_number,
+               SequenceNumber new_sequence_number);
+
+  T GetVal() const { return kvo_.GetVal(); }
+
+ private:
+  friend class ProtectionInfoKVO<T>;
+
+  explicit ProtectionInfoKVOS(T val) : kvo_(val) {
+    static_assert(sizeof(ProtectionInfoKVOS<T>) == sizeof(T), "");
+  }
+
+  void SetVal(T val) { kvo_.SetVal(val); }
+
+  ProtectionInfoKVO<T> kvo_;
+};
+
+template <typename T>
+Status ProtectionInfo<T>::GetStatus() const {
+  if (val_ != 0) {
+    return Status::Corruption("ProtectionInfo mismatch");
+  }
+  return Status::OK();
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const Slice& key,
+                                                   const Slice& value,
+                                                   ValueType op_type) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val =
+      val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const SliceParts& key,
+                                                   const SliceParts& value,
+                                                   ValueType op_type) const {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val = val ^
+        static_cast<T>(GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const Slice& old_key, const Slice& new_key) {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const SliceParts& old_key,
+                                   const SliceParts& new_key) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const Slice& old_value,
+                                   const Slice& new_value) {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(GetSliceNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const SliceParts& old_value,
+                                   const SliceParts& new_value) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+  val = val ^ static_cast<T>(
+                  GetSlicePartsNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+  SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateO(ValueType old_op_type,
+                                   ValueType new_op_type) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&old_op_type),
+                                      sizeof(old_op_type),
+                                      ProtectionInfo<T>::kSeedO));
+  val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&new_op_type),
+                                      sizeof(new_op_type),
+                                      ProtectionInfo<T>::kSeedO));
+  SetVal(val);
+}
+
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const Slice& key,
+                                                 const Slice& value,
+                                                 ValueType op_type) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val =
+      val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfo<T>(val);
+}
+
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const SliceParts& key,
+                                                 const SliceParts& value,
+                                                 ValueType op_type) const {
+  T val = GetVal();
+  val = val ^
+        static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+  val = val ^
+        static_cast<T>(GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+  val = val ^
+        static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+                                sizeof(op_type), ProtectionInfo<T>::kSeedO));
+  return ProtectionInfo<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVOC<T> ProtectionInfoKVO<T>::ProtectC(
+    ColumnFamilyId column_family_id) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<char*>(&column_family_id),
+                  sizeof(column_family_id), ProtectionInfo<T>::kSeedC));
+  return ProtectionInfoKVOC<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOC<T>::StripC(
+    ColumnFamilyId column_family_id) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<char*>(&column_family_id),
+                  sizeof(column_family_id), ProtectionInfo<T>::kSeedC));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOC<T>::UpdateC(ColumnFamilyId old_column_family_id,
+                                    ColumnFamilyId new_column_family_id) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<char*>(&old_column_family_id),
+                  sizeof(old_column_family_id), ProtectionInfo<T>::kSeedC));
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<char*>(&new_column_family_id),
+                  sizeof(new_column_family_id), ProtectionInfo<T>::kSeedC));
+  SetVal(val);
+}
+
+template <typename T>
+ProtectionInfoKVOS<T> ProtectionInfoKVO<T>::ProtectS(
+    SequenceNumber sequence_number) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&sequence_number),
+                                      sizeof(sequence_number),
+                                      ProtectionInfo<T>::kSeedS));
+  return ProtectionInfoKVOS<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOS<T>::StripS(
+    SequenceNumber sequence_number) const {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&sequence_number),
+                                      sizeof(sequence_number),
+                                      ProtectionInfo<T>::kSeedS));
+  return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOS<T>::UpdateS(SequenceNumber old_sequence_number,
+                                    SequenceNumber new_sequence_number) {
+  T val = GetVal();
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<char*>(&old_sequence_number),
+                  sizeof(old_sequence_number), ProtectionInfo<T>::kSeedS));
+  val = val ^ static_cast<T>(NPHash64(
+                  reinterpret_cast<char*>(&new_sequence_number),
+                  sizeof(new_sequence_number), ProtectionInfo<T>::kSeedS));
+  SetVal(val);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/listener_test.cc b/src/rocksdb/db/listener_test.cc
new file mode 100644
index 000000000..160866bb7
--- /dev/null
+++ b/src/rocksdb/db/listener_test.cc
@@ -0,0 +1,1595 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventListenerTest : public DBTestBase {
+ public:
+  EventListenerTest() : DBTestBase("listener_test", /*env_do_fsync=*/true) {}
+
+  static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+                             uint64_t size) {
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+                          kNoCompression);
+    return blob_index;
+  }
+
+  const size_t k110KB = 110 << 10;
+};
+
+struct TestPropertiesCollector
+    : public ROCKSDB_NAMESPACE::TablePropertiesCollector {
+  ROCKSDB_NAMESPACE::Status AddUserKey(
+      const ROCKSDB_NAMESPACE::Slice& /*key*/,
+      const ROCKSDB_NAMESPACE::Slice& /*value*/,
+      ROCKSDB_NAMESPACE::EntryType /*type*/,
+      ROCKSDB_NAMESPACE::SequenceNumber /*seq*/,
+      uint64_t /*file_size*/) override {
+    return Status::OK();
+  }
+  ROCKSDB_NAMESPACE::Status Finish(
+      ROCKSDB_NAMESPACE::UserCollectedProperties* properties) override {
+    properties->insert({"0", "1"});
+    return Status::OK();
+  }
+
+  const char* Name() const override { return "TestTablePropertiesCollector"; }
+
+  ROCKSDB_NAMESPACE::UserCollectedProperties GetReadableProperties()
+      const override {
+    ROCKSDB_NAMESPACE::UserCollectedProperties ret;
+    ret["2"] = "3";
+    return ret;
+  }
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    return new TestPropertiesCollector;
+  }
+  const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
+class TestCompactionListener : public EventListener {
+ public:
+  explicit TestCompactionListener(EventListenerTest* test) : test_(test) {}
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    compacted_dbs_.push_back(db);
+    ASSERT_GT(ci.input_files.size(), 0U);
+    ASSERT_EQ(ci.input_files.size(), ci.input_file_infos.size());
+
+    for (size_t i = 0; i < ci.input_file_infos.size(); ++i) {
+      ASSERT_EQ(ci.input_file_infos[i].level, ci.base_input_level);
+      ASSERT_EQ(ci.input_file_infos[i].file_number,
+                TableFileNameToNumber(ci.input_files[i]));
+    }
+
+    ASSERT_GT(ci.output_files.size(), 0U);
+    ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size());
+
+    ASSERT_TRUE(test_);
+    ASSERT_EQ(test_->db_, db);
+
+    std::vector<std::vector<FileMetaData>> files_by_level;
+    test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id],
+                                           &files_by_level);
+    ASSERT_GT(files_by_level.size(), ci.output_level);
+
+    for (size_t i = 0; i < ci.output_file_infos.size(); ++i) {
+      ASSERT_EQ(ci.output_file_infos[i].level, ci.output_level);
+      ASSERT_EQ(ci.output_file_infos[i].file_number,
+                TableFileNameToNumber(ci.output_files[i]));
+
+      auto it = std::find_if(
+          files_by_level[ci.output_level].begin(),
+          files_by_level[ci.output_level].end(), [&](const FileMetaData& meta) {
+            return meta.fd.GetNumber() == ci.output_file_infos[i].file_number;
+          });
+      ASSERT_NE(it, files_by_level[ci.output_level].end());
+
+      ASSERT_EQ(ci.output_file_infos[i].oldest_blob_file_number,
+                it->oldest_blob_file_number);
+    }
+
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
+    ASSERT_GT(ci.thread_id, 0U);
+
+    for (auto fl : {ci.input_files, ci.output_files}) {
+      for (auto fn : fl) {
+        auto it = ci.table_properties.find(fn);
+        ASSERT_NE(it, ci.table_properties.end());
+        auto tp = it->second;
+        ASSERT_TRUE(tp != nullptr);
+        ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1");
+      }
+    }
+  }
+
+  EventListenerTest* test_;
+  std::vector<DB*> compacted_dbs_;
+  std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  const int kNumL0Files = 4;
+
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+
+  TestCompactionListener* listener = new TestCompactionListener(this);
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+  CreateAndReopenWithCF(cf_names, options);
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+                                             BlobStr(123, 0, 1 << 10)));
+  ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (int i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(i));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i],
+                                     nullptr, nullptr));
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  }
+
+  ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->compacted_dbs_[i], db_);
+  }
+}
+
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+  TestFlushListener(Env* env, EventListenerTest* test)
+      : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+    db_closed = false;
+  }
+
+  virtual ~TestFlushListener() {
+    prev_fc_info_.status.PermitUncheckedError();  // Ignore the status
+  }
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    // remember the info for later checking the FlushJobInfo.
+    prev_fc_info_ = info;
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+    ASSERT_GT(info.table_properties.data_size, 0U);
+    ASSERT_GT(info.table_properties.raw_key_size, 0U);
+    ASSERT_GT(info.table_properties.raw_value_size, 0U);
+    ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+    ASSERT_GT(info.table_properties.num_entries, 0U);
+    ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+    ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+    // Verify the id of the current thread that created this table
+    // file matches the id of any active flush or compaction thread.
+    uint64_t thread_id = env_->GetThreadID();
+    std::vector<ThreadStatus> thread_list;
+    ASSERT_OK(env_->GetThreadList(&thread_list));
+    bool found_match = false;
+    for (auto thread_status : thread_list) {
+      if (thread_status.operation_type == ThreadStatus::OP_FLUSH ||
+          thread_status.operation_type == ThreadStatus::OP_COMPACTION) {
+        if (thread_id == thread_status.thread_id) {
+          found_match = true;
+          break;
+        }
+      }
+    }
+    ASSERT_TRUE(found_match);
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  }
+
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    flushed_dbs_.push_back(db);
+    flushed_column_family_names_.push_back(info.cf_name);
+    if (info.triggered_writes_slowdown) {
+      slowdown_count++;
+    }
+    if (info.triggered_writes_stop) {
+      stop_count++;
+    }
+    // verify whether the previously created file matches the flushed file.
+    ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+    ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+    ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+    ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+    ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+    // Note: the following chunk relies on the notification pertaining to the
+    // database pointed to by DBTestBase::db_, and is thus bypassed when
+    // that assumption does not hold (see the test case MultiDBMultiListeners
+    // below).
+    ASSERT_TRUE(test_);
+    if (db == test_->db_) {
+      std::vector<std::vector<FileMetaData>> files_by_level;
+      ASSERT_LT(info.cf_id, test_->handles_.size());
+      ASSERT_GE(info.cf_id, 0u);
+      ASSERT_NE(test_->handles_[info.cf_id], nullptr);
+      test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id],
+                                             &files_by_level);
+
+      ASSERT_FALSE(files_by_level.empty());
+      auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(),
+                             [&](const FileMetaData& meta) {
+                               return meta.fd.GetNumber() == info.file_number;
+                             });
+      ASSERT_NE(it, files_by_level[0].end());
+      ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+    }
+
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+    ASSERT_GT(info.thread_id, 0U);
+    ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second,
+              "1");
+  }
+
+  std::vector<std::string> flushed_column_family_names_;
+  std::vector<DB*> flushed_dbs_;
+  int slowdown_count;
+  int stop_count;
+  bool db_closing;
+  std::atomic_bool db_closed;
+  TableFileCreationInfo prev_fc_info_;
+
+ protected:
+  Env* env_;
+  EventListenerTest* test_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBFlushTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+  CreateAndReopenWithCF(cf_names, options);
+
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+                                             BlobStr(456, 0, 1 << 10)));
+  ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (int i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(i));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+    ASSERT_EQ(listener->flushed_dbs_.size(), i);
+    ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+  }
+
+  // make sure callback functions are called in the right order
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+  }
+}
+
+TEST_F(EventListenerTest, MultiCF) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  for (auto atomic_flush : {false, true}) {
+    options.atomic_flush = atomic_flush;
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    TestFlushListener* listener = new TestFlushListener(options.env, this);
+    options.listeners.emplace_back(listener);
+    options.table_properties_collector_factories.push_back(
+        std::make_shared<TestPropertiesCollectorFactory>());
+    std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
+                                         "dobrynia", "nikitich", "alyosha",
+                                         "popovich"};
+    CreateAndReopenWithCF(cf_names, options);
+
+    ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+    ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+    ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+    ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+    ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+    ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+    ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    for (int i = 1; i < 8; ++i) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
+            "EventListenerTest.MultiCF:PreVerifyListener"}});
+      ASSERT_OK(Flush(i));
+      TEST_SYNC_POINT("EventListenerTest.MultiCF:PreVerifyListener");
+      ASSERT_EQ(listener->flushed_dbs_.size(), i);
+      ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+      // make sure callback functions are called in the right order
+      if (i == 7) {
+        for (size_t j = 0; j < cf_names.size(); j++) {
+          ASSERT_EQ(listener->flushed_dbs_[j], db_);
+          ASSERT_EQ(listener->flushed_column_family_names_[j], cf_names[j]);
+        }
+      }
+    }
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    Close();
+  }
+}
+
+TEST_F(EventListenerTest, MultiDBMultiListeners) {
+  Options options;
+  options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+  std::vector<TestFlushListener*> listeners;
+  const int kNumDBs = 5;
+  const int kNumListeners = 10;
+  for (int i = 0; i < kNumListeners; ++i) {
+    listeners.emplace_back(new TestFlushListener(options.env, this));
+  }
+
+  std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
+                                       "dobrynia", "nikitich", "alyosha",
+                                       "popovich"};
+
+  options.create_if_missing = true;
+  for (int i = 0; i < kNumListeners; ++i) {
+    options.listeners.emplace_back(listeners[i]);
+  }
+  DBOptions db_opts(options);
+  ColumnFamilyOptions cf_opts(options);
+
+  std::vector<DB*> dbs;
+  std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options));
+    DB* db;
+    std::vector<ColumnFamilyHandle*> handles;
+    ASSERT_OK(DB::Open(options, dbname_ + std::to_string(d), &db));
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ColumnFamilyHandle* handle;
+      ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle));
+      handles.push_back(handle);
+    }
+
+    vec_handles.push_back(std::move(handles));
+    dbs.push_back(db);
+  }
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c], cf_names[c],
+                            cf_names[c]));
+    }
+  }
+
+  for (size_t c = 0; c < cf_names.size(); ++c) {
+    for (int d = 0; d < kNumDBs; ++d) {
+      ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
+      ASSERT_OK(
+          static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForFlushMemTable());
+    }
+  }
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    // Ensure background work is fully finished including listener callbacks
+    // before accessing listener state.
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForBackgroundWork());
+  }
+
+  for (auto* listener : listeners) {
+    int pos = 0;
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      for (int d = 0; d < kNumDBs; ++d) {
+        ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+        ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
+        pos++;
+      }
+    }
+  }
+
+  for (auto handles : vec_handles) {
+    for (auto h : handles) {
+      delete h;
+    }
+    handles.clear();
+  }
+  vec_handles.clear();
+
+  for (auto db : dbs) {
+    delete db;
+  }
+}
+
+TEST_F(EventListenerTest, DisableBGCompaction) {
+  Options options;
+  options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env, this);
+  const int kCompactionTrigger = 1;
+  const int kSlowdownTrigger = 5;
+  const int kStopTrigger = 100;
+  options.level0_file_num_compaction_trigger = kCompactionTrigger;
+  options.level0_slowdown_writes_trigger = kSlowdownTrigger;
+  options.level0_stop_writes_trigger = kStopTrigger;
+  options.max_write_buffer_number = 10;
+  options.listeners.emplace_back(listener);
+  // BG compaction is disabled.  Number of L0 files will simply keeps
+  // increasing in this test.
+  options.compaction_style = kCompactionStyleNone;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
+
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+
+  // keep writing until writes are forced to stop.
+  for (int i = 0; static_cast<int>(cf_meta.file_count) < kSlowdownTrigger * 10;
+       ++i) {
+    ASSERT_OK(
+        Put(1, std::to_string(i), std::string(10000, 'x'), WriteOptions()));
+    FlushOptions fo;
+    fo.allow_write_stall = true;
+    ASSERT_OK(db_->Flush(fo, handles_[1]));
+    db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  }
+  // Ensure background work is fully finished including listener callbacks
+  // before accessing listener state.
+  ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
+}
+
+class TestCompactionReasonListener : public EventListener {
+ public:
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    compaction_reasons_.push_back(ci.compaction_reason);
+  }
+
+  std::vector<CompactionReason> compaction_reasons_;
+  std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, CompactionReasonLevel) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+  options.listeners.emplace_back(listener);
+
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_style = kCompactionStyleLevel;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Write 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(listener->compaction_reasons_.size(), 1);
+  ASSERT_EQ(listener->compaction_reasons_[0],
+            CompactionReason::kLevelL0FilesNum);
+
+  DestroyAndReopen(options);
+
+  // Write 3 non-overlapping files in L0
+  for (int k = 1; k <= 30; k++) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+    if (k % 10 == 0) {
+      Flush();
+    }
+  }
+
+  // Do a trivial move from L0 -> L1
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  options.max_bytes_for_level_base = 1;
+  Close();
+  listener->compaction_reasons_.clear();
+  Reopen(options);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_GT(listener->compaction_reasons_.size(), 1);
+
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kLevelMaxLevelSize);
+  }
+
+  options.disable_auto_compactions = true;
+  Close();
+  listener->compaction_reasons_.clear();
+  Reopen(options);
+
+  ASSERT_OK(Put("key", "value"));
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+  }
+}
+
+TEST_F(EventListenerTest, CompactionReasonUniversal) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+  options.listeners.emplace_back(listener);
+
+  options.compaction_style = kCompactionStyleUniversal;
+
+  Random rnd(301);
+
+  options.level0_file_num_compaction_trigger = 8;
+  options.compaction_options_universal.max_size_amplification_percent = 100000;
+  options.compaction_options_universal.size_ratio = 100000;
+  DestroyAndReopen(options);
+  listener->compaction_reasons_.clear();
+
+  // Write 8 files in L0
+  for (int i = 0; i < 8; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeRatio);
+  }
+
+  options.level0_file_num_compaction_trigger = 8;
+  options.compaction_options_universal.max_size_amplification_percent = 1;
+  options.compaction_options_universal.size_ratio = 100000;
+
+  DestroyAndReopen(options);
+  listener->compaction_reasons_.clear();
+
+  // Write 8 files in L0
+  for (int i = 0; i < 8; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeAmplification);
+  }
+
+  options.disable_auto_compactions = true;
+  Close();
+  listener->compaction_reasons_.clear();
+  Reopen(options);
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+  }
+}
+
+TEST_F(EventListenerTest, CompactionReasonFIFO) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+  options.listeners.emplace_back(listener);
+
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_style = kCompactionStyleFIFO;
+  options.compaction_options_fifo.max_table_files_size = 1;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Write 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(listener->compaction_reasons_.size(), 0);
+  for (auto compaction_reason : listener->compaction_reasons_) {
+    ASSERT_EQ(compaction_reason, CompactionReason::kFIFOMaxSize);
+  }
+}
+
+class TableFileCreationListener : public EventListener {
+ public:
+  class TestEnv : public EnvWrapper {
+   public:
+    explicit TestEnv(Env* t) : EnvWrapper(t) {}
+    static const char* kClassName() { return "TestEnv"; }
+    const char* Name() const override { return kClassName(); }
+
+    void SetStatus(Status s) { status_ = s; }
+
+    Status NewWritableFile(const std::string& fname,
+                           std::unique_ptr<WritableFile>* result,
+                           const EnvOptions& options) override {
+      if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") {
+        if (!status_.ok()) {
+          return status_;
+        }
+      }
+      return target()->NewWritableFile(fname, result, options);
+    }
+
+   private:
+    Status status_;
+  };
+
+  TableFileCreationListener() {
+    for (int i = 0; i < 2; i++) {
+      started_[i] = finished_[i] = failure_[i] = 0;
+    }
+  }
+
+  int Index(TableFileCreationReason reason) {
+    int idx;
+    switch (reason) {
+      case TableFileCreationReason::kFlush:
+        idx = 0;
+        break;
+      case TableFileCreationReason::kCompaction:
+        idx = 1;
+        break;
+      default:
+        idx = -1;
+    }
+    return idx;
+  }
+
+  void CheckAndResetCounters(int flush_started, int flush_finished,
+                             int flush_failure, int compaction_started,
+                             int compaction_finished, int compaction_failure) {
+    ASSERT_EQ(started_[0], flush_started);
+    ASSERT_EQ(finished_[0], flush_finished);
+    ASSERT_EQ(failure_[0], flush_failure);
+    ASSERT_EQ(started_[1], compaction_started);
+    ASSERT_EQ(finished_[1], compaction_finished);
+    ASSERT_EQ(failure_[1], compaction_failure);
+    for (int i = 0; i < 2; i++) {
+      started_[i] = finished_[i] = failure_[i] = 0;
+    }
+  }
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& info) override {
+    int idx = Index(info.reason);
+    if (idx >= 0) {
+      started_[idx]++;
+    }
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+  }
+
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    int idx = Index(info.reason);
+    if (idx >= 0) {
+      finished_[idx]++;
+    }
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+    ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+    ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+    if (info.status.ok()) {
+      if (info.table_properties.num_range_deletions == 0U) {
+        ASSERT_GT(info.table_properties.data_size, 0U);
+        ASSERT_GT(info.table_properties.raw_key_size, 0U);
+        ASSERT_GT(info.table_properties.raw_value_size, 0U);
+        ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+        ASSERT_GT(info.table_properties.num_entries, 0U);
+      }
+    } else {
+      if (idx >= 0) {
+        failure_[idx]++;
+        last_failure_ = info.status;
+      }
+    }
+  }
+
+  int started_[2];
+  int finished_[2];
+  int failure_[2];
+  Status last_failure_;
+};
+
+TEST_F(EventListenerTest, TableFileCreationListenersTest) {
+  auto listener = std::make_shared<TableFileCreationListener>();
+  Options options;
+  std::unique_ptr<TableFileCreationListener::TestEnv> test_env(
+      new TableFileCreationListener::TestEnv(CurrentOptions().env));
+  options.create_if_missing = true;
+  options.listeners.push_back(listener);
+  options.env = test_env.get();
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "aaa"));
+  ASSERT_OK(Put("bar", "bbb"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+  ASSERT_OK(Put("foo", "aaa1"));
+  ASSERT_OK(Put("bar", "bbb1"));
+  test_env->SetStatus(Status::NotSupported("not supported"));
+  ASSERT_NOK(Flush());
+  listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+  ASSERT_TRUE(listener->last_failure_.IsNotSupported());
+  test_env->SetStatus(Status::OK());
+
+  Reopen(options);
+  ASSERT_OK(Put("foo", "aaa2"));
+  ASSERT_OK(Put("bar", "bbb2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+  const Slice kRangeStart = "a";
+  const Slice kRangeEnd = "z";
+  ASSERT_OK(
+      dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0);
+
+  ASSERT_OK(Put("foo", "aaa3"));
+  ASSERT_OK(Put("bar", "bbb3"));
+  ASSERT_OK(Flush());
+  test_env->SetStatus(Status::NotSupported("not supported"));
+  ASSERT_NOK(
+      dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd));
+  ASSERT_NOK(dbfull()->TEST_WaitForCompact());
+  listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1);
+  ASSERT_TRUE(listener->last_failure_.IsNotSupported());
+
+  // Reset
+  test_env->SetStatus(Status::OK());
+  DestroyAndReopen(options);
+
+  // Verify that an empty table file that is immediately deleted gives Aborted
+  // status to listener.
+  ASSERT_OK(Put("baz", "z"));
+  ASSERT_OK(SingleDelete("baz"));
+  ASSERT_OK(Flush());
+  listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+  ASSERT_TRUE(listener->last_failure_.IsAborted());
+
+  // Also in compaction
+  ASSERT_OK(Put("baz", "z"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                             kRangeStart, kRangeEnd));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  listener->CheckAndResetCounters(2, 2, 0, 1, 1, 1);
+  ASSERT_TRUE(listener->last_failure_.IsAborted());
+
+  Close();  // Avoid UAF on listener
+}
+
+class MemTableSealedListener : public EventListener {
+ private:
+  SequenceNumber latest_seq_number_;
+
+ public:
+  MemTableSealedListener() {}
+  void OnMemTableSealed(const MemTableInfo& info) override {
+    latest_seq_number_ = info.first_seqno;
+  }
+
+  void OnFlushCompleted(DB* /*db*/,
+                        const FlushJobInfo& flush_job_info) override {
+    ASSERT_LE(flush_job_info.smallest_seqno, latest_seq_number_);
+  }
+};
+
+TEST_F(EventListenerTest, MemTableSealedListenerTest) {
+  auto listener = std::make_shared<MemTableSealedListener>();
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.listeners.push_back(listener);
+  DestroyAndReopen(options);
+
+  for (unsigned int i = 0; i < 10; i++) {
+    std::string tag = std::to_string(i);
+    ASSERT_OK(Put("foo" + tag, "aaa"));
+    ASSERT_OK(Put("bar" + tag, "bbb"));
+
+    ASSERT_OK(Flush());
+  }
+}
+
+class ColumnFamilyHandleDeletionStartedListener : public EventListener {
+ private:
+  std::vector<std::string> cfs_;
+  int counter;
+
+ public:
+  explicit ColumnFamilyHandleDeletionStartedListener(
+      const std::vector<std::string>& cfs)
+      : cfs_(cfs), counter(0) {
+    cfs_.insert(cfs_.begin(), kDefaultColumnFamilyName);
+  }
+  void OnColumnFamilyHandleDeletionStarted(
+      ColumnFamilyHandle* handle) override {
+    ASSERT_EQ(cfs_[handle->GetID()], handle->GetName());
+    counter++;
+  }
+  int getCounter() { return counter; }
+};
+
+TEST_F(EventListenerTest, ColumnFamilyHandleDeletionStartedListenerTest) {
+  std::vector<std::string> cfs{"pikachu", "eevee", "Mewtwo"};
+  auto listener =
+      std::make_shared<ColumnFamilyHandleDeletionStartedListener>(cfs);
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.listeners.push_back(listener);
+  CreateAndReopenWithCF(cfs, options);
+  ASSERT_EQ(handles_.size(), 4);
+  delete handles_[3];
+  delete handles_[2];
+  delete handles_[1];
+  handles_.resize(1);
+  ASSERT_EQ(listener->getCounter(), 3);
+}
+
+class BackgroundErrorListener : public EventListener {
+ private:
+  SpecialEnv* env_;
+  int counter_;
+
+ public:
+  BackgroundErrorListener(SpecialEnv* env) : env_(env), counter_(0) {}
+
+  void OnBackgroundError(BackgroundErrorReason /*reason*/,
+                         Status* bg_error) override {
+    if (counter_ == 0) {
+      // suppress the first error and disable write-dropping such that a retry
+      // can succeed.
+      *bg_error = Status::OK();
+      env_->drop_writes_.store(false, std::memory_order_release);
+      env_->SetMockSleep(false);
+    }
+    ++counter_;
+  }
+
+  int counter() { return counter_; }
+};
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedFlushTest) {
+  auto listener = std::make_shared<BackgroundErrorListener>(env_);
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  options.listeners.push_back(listener);
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  // the usual TEST_WaitForFlushMemTable() doesn't work for failed flushes, so
+  // forge a custom one for the failed flush case.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkFlush:done",
+        "EventListenerTest:BackgroundErrorListenerFailedFlushTest:1"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  env_->drop_writes_.store(true, std::memory_order_release);
+  env_->SetMockSleep();
+
+  ASSERT_OK(Put("key0", "val"));
+  ASSERT_OK(Put("key1", "val"));
+  TEST_SYNC_POINT("EventListenerTest:BackgroundErrorListenerFailedFlushTest:1");
+  ASSERT_EQ(1, listener->counter());
+  ASSERT_OK(Put("key2", "val"));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedCompactionTest) {
+  auto listener = std::make_shared<BackgroundErrorListener>(env_);
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  options.level0_file_num_compaction_trigger = 2;
+  options.listeners.push_back(listener);
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  // third iteration triggers the second memtable's flush
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("key0", "val"));
+    if (i > 0) {
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    }
+    ASSERT_OK(Put("key1", "val"));
+  }
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+  env_->drop_writes_.store(true, std::memory_order_release);
+  env_->SetMockSleep();
+  ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(1, listener->counter());
+
+  // trigger flush so compaction is triggered again; this time it succeeds
+  // The previous failed compaction may get retried automatically, so we may
+  // be left with 0 or 1 files in level 1, depending on when the retry gets
+  // scheduled
+  ASSERT_OK(Put("key0", "val"));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_LE(1, NumTableFilesAtLevel(0));
+}
+
+class TestFileOperationListener : public EventListener {
+ public:
+  TestFileOperationListener() {
+    file_reads_.store(0);
+    file_reads_success_.store(0);
+    file_writes_.store(0);
+    file_writes_success_.store(0);
+    file_flushes_.store(0);
+    file_flushes_success_.store(0);
+    file_closes_.store(0);
+    file_closes_success_.store(0);
+    file_syncs_.store(0);
+    file_syncs_success_.store(0);
+    file_truncates_.store(0);
+    file_truncates_success_.store(0);
+    file_seq_reads_.store(0);
+    blob_file_reads_.store(0);
+    blob_file_writes_.store(0);
+    blob_file_flushes_.store(0);
+    blob_file_closes_.store(0);
+    blob_file_syncs_.store(0);
+    blob_file_truncates_.store(0);
+  }
+
+  void OnFileReadFinish(const FileOperationInfo& info) override {
+    ++file_reads_;
+    if (info.status.ok()) {
+      ++file_reads_success_;
+    }
+    if (info.path.find("MANIFEST") != std::string::npos) {
+      ++file_seq_reads_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_reads_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileWriteFinish(const FileOperationInfo& info) override {
+    ++file_writes_;
+    if (info.status.ok()) {
+      ++file_writes_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_writes_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileFlushFinish(const FileOperationInfo& info) override {
+    ++file_flushes_;
+    if (info.status.ok()) {
+      ++file_flushes_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_flushes_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileCloseFinish(const FileOperationInfo& info) override {
+    ++file_closes_;
+    if (info.status.ok()) {
+      ++file_closes_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_closes_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileSyncFinish(const FileOperationInfo& info) override {
+    ++file_syncs_;
+    if (info.status.ok()) {
+      ++file_syncs_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_syncs_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileTruncateFinish(const FileOperationInfo& info) override {
+    ++file_truncates_;
+    if (info.status.ok()) {
+      ++file_truncates_success_;
+    }
+    if (EndsWith(info.path, ".blob")) {
+      ++blob_file_truncates_;
+    }
+    ReportDuration(info);
+  }
+
+  bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+  std::atomic<size_t> file_reads_;
+  std::atomic<size_t> file_reads_success_;
+  std::atomic<size_t> file_writes_;
+  std::atomic<size_t> file_writes_success_;
+  std::atomic<size_t> file_flushes_;
+  std::atomic<size_t> file_flushes_success_;
+  std::atomic<size_t> file_closes_;
+  std::atomic<size_t> file_closes_success_;
+  std::atomic<size_t> file_syncs_;
+  std::atomic<size_t> file_syncs_success_;
+  std::atomic<size_t> file_truncates_;
+  std::atomic<size_t> file_truncates_success_;
+  std::atomic<size_t> file_seq_reads_;
+  std::atomic<size_t> blob_file_reads_;
+  std::atomic<size_t> blob_file_writes_;
+  std::atomic<size_t> blob_file_flushes_;
+  std::atomic<size_t> blob_file_closes_;
+  std::atomic<size_t> blob_file_syncs_;
+  std::atomic<size_t> blob_file_truncates_;
+
+ private:
+  void ReportDuration(const FileOperationInfo& info) const {
+    ASSERT_GT(info.duration.count(), 0);
+  }
+};
+
+TEST_F(EventListenerTest, OnFileOperationTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+
+  TestFileOperationListener* listener = new TestFileOperationListener();
+  options.listeners.emplace_back(listener);
+
+  options.use_direct_io_for_flush_and_compaction = false;
+  Status s = TryReopen(options);
+  if (s.IsInvalidArgument()) {
+    options.use_direct_io_for_flush_and_compaction = false;
+  } else {
+    ASSERT_OK(s);
+  }
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "aaa"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_GE(listener->file_writes_.load(),
+            listener->file_writes_success_.load());
+  ASSERT_GT(listener->file_writes_.load(), 0);
+  ASSERT_GE(listener->file_flushes_.load(),
+            listener->file_flushes_success_.load());
+  ASSERT_GT(listener->file_flushes_.load(), 0);
+  Close();
+
+  Reopen(options);
+  ASSERT_GE(listener->file_reads_.load(), listener->file_reads_success_.load());
+  ASSERT_GT(listener->file_reads_.load(), 0);
+  ASSERT_GE(listener->file_closes_.load(),
+            listener->file_closes_success_.load());
+  ASSERT_GT(listener->file_closes_.load(), 0);
+  ASSERT_GE(listener->file_syncs_.load(), listener->file_syncs_success_.load());
+  ASSERT_GT(listener->file_syncs_.load(), 0);
+  if (true == options.use_direct_io_for_flush_and_compaction) {
+    ASSERT_GE(listener->file_truncates_.load(),
+              listener->file_truncates_success_.load());
+    ASSERT_GT(listener->file_truncates_.load(), 0);
+  }
+}
+
+TEST_F(EventListenerTest, OnBlobFileOperationTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  TestFileOperationListener* listener = new TestFileOperationListener();
+  options.listeners.emplace_back(listener);
+  options.disable_auto_compactions = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Put("Key4", "blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "new_blob_value3"));
+  ASSERT_OK(Put("Key4", "new_blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  ASSERT_GT(listener->blob_file_writes_.load(), 0U);
+  ASSERT_GT(listener->blob_file_flushes_.load(), 0U);
+  Close();
+
+  Reopen(options);
+  ASSERT_GT(listener->blob_file_closes_.load(), 0U);
+  ASSERT_GT(listener->blob_file_syncs_.load(), 0U);
+  if (true == options.use_direct_io_for_flush_and_compaction) {
+    ASSERT_GT(listener->blob_file_truncates_.load(), 0U);
+  }
+}
+
+TEST_F(EventListenerTest, ReadManifestAndWALOnRecovery) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+
+  TestFileOperationListener* listener = new TestFileOperationListener();
+  options.listeners.emplace_back(listener);
+
+  options.use_direct_io_for_flush_and_compaction = false;
+  Status s = TryReopen(options);
+  if (s.IsInvalidArgument()) {
+    options.use_direct_io_for_flush_and_compaction = false;
+  } else {
+    ASSERT_OK(s);
+  }
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "aaa"));
+  Close();
+
+  size_t seq_reads = listener->file_seq_reads_.load();
+  Reopen(options);
+  ASSERT_GT(listener->file_seq_reads_.load(), seq_reads);
+}
+
+class BlobDBJobLevelEventListenerTest : public EventListener {
+ public:
+  explicit BlobDBJobLevelEventListenerTest(EventListenerTest* test)
+      : test_(test), call_count_(0) {}
+
+  const VersionStorageInfo* GetVersionStorageInfo() const {
+    VersionSet* const versions = test_->dbfull()->GetVersionSet();
+    assert(versions);
+
+    ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+    EXPECT_NE(cfd, nullptr);
+
+    Version* const current = cfd->current();
+    EXPECT_NE(current, nullptr);
+
+    const VersionStorageInfo* const storage_info = current->storage_info();
+    EXPECT_NE(storage_info, nullptr);
+
+    return storage_info;
+  }
+
+  void CheckBlobFileAdditions(
+      const std::vector<BlobFileAdditionInfo>& blob_file_addition_infos) const {
+    const auto* vstorage = GetVersionStorageInfo();
+
+    EXPECT_FALSE(blob_file_addition_infos.empty());
+
+    for (const auto& blob_file_addition_info : blob_file_addition_infos) {
+      const auto meta = vstorage->GetBlobFileMetaData(
+          blob_file_addition_info.blob_file_number);
+
+      EXPECT_NE(meta, nullptr);
+      EXPECT_EQ(meta->GetBlobFileNumber(),
+                blob_file_addition_info.blob_file_number);
+      EXPECT_EQ(meta->GetTotalBlobBytes(),
+                blob_file_addition_info.total_blob_bytes);
+      EXPECT_EQ(meta->GetTotalBlobCount(),
+                blob_file_addition_info.total_blob_count);
+      EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty());
+    }
+  }
+
+  std::vector<std::string> GetFlushedFiles() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    std::vector<std::string> result;
+    for (const auto& fname : flushed_files_) {
+      result.push_back(fname);
+    }
+    return result;
+  }
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    call_count_++;
+
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      flushed_files_.push_back(info.file_path);
+    }
+
+    EXPECT_EQ(info.blob_compression_type, kNoCompression);
+
+    CheckBlobFileAdditions(info.blob_file_addition_infos);
+  }
+
+  void OnCompactionCompleted(DB* /*db*/,
+                             const CompactionJobInfo& info) override {
+    call_count_++;
+
+    EXPECT_EQ(info.blob_compression_type, kNoCompression);
+
+    CheckBlobFileAdditions(info.blob_file_addition_infos);
+
+    EXPECT_FALSE(info.blob_file_garbage_infos.empty());
+
+    for (const auto& blob_file_garbage_info : info.blob_file_garbage_infos) {
+      EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U);
+      EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U);
+      EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U);
+      EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty());
+    }
+  }
+
+  EventListenerTest* test_;
+  uint32_t call_count_;
+
+ private:
+  std::vector<std::string> flushed_files_;
+  std::mutex mutex_;
+};
+
+// Test OnFlushCompleted EventListener called for blob files
+TEST_F(EventListenerTest, BlobDBOnFlushCompleted) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+
+  options.min_blob_size = 0;
+  BlobDBJobLevelEventListenerTest* blob_event_listener =
+      new BlobDBJobLevelEventListenerTest(this);
+  options.listeners.emplace_back(blob_event_listener);
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("Key1"), "blob_value1");
+  ASSERT_EQ(Get("Key2"), "blob_value2");
+  ASSERT_EQ(Get("Key3"), "blob_value3");
+
+  ASSERT_GT(blob_event_listener->call_count_, 0U);
+}
+
+// Test OnCompactionCompleted EventListener called for blob files
+TEST_F(EventListenerTest, BlobDBOnCompactionCompleted) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.min_blob_size = 0;
+  BlobDBJobLevelEventListenerTest* blob_event_listener =
+      new BlobDBJobLevelEventListenerTest(this);
+  options.listeners.emplace_back(blob_event_listener);
+
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Put("Key4", "blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "new_blob_value3"));
+  ASSERT_OK(Put("Key4", "new_blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  blob_event_listener->call_count_ = 0;
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  // On compaction, because of blob_garbage_collection_age_cutoff, it will
+  // delete the oldest blob file and create new blob file during compaction.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+  // Make sure, OnCompactionCompleted is called.
+  ASSERT_GT(blob_event_listener->call_count_, 0U);
+}
+
+// Test CompactFiles calls OnCompactionCompleted EventListener for blob files
+// and populate the blob files info.
+TEST_F(EventListenerTest, BlobDBCompactFiles) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.min_blob_size = 0;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+
+  BlobDBJobLevelEventListenerTest* blob_event_listener =
+      new BlobDBJobLevelEventListenerTest(this);
+  options.listeners.emplace_back(blob_event_listener);
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Put("Key4", "blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "new_blob_value3"));
+  ASSERT_OK(Put("Key4", "new_blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> output_file_names;
+  CompactionJobInfo compaction_job_info;
+
+  // On compaction, because of blob_garbage_collection_age_cutoff, it will
+  // delete the oldest blob file and create new blob file during compaction
+  // which will be populated in output_files_names.
+  ASSERT_OK(dbfull()->CompactFiles(
+      CompactionOptions(), blob_event_listener->GetFlushedFiles(), 1, -1,
+      &output_file_names, &compaction_job_info));
+
+  bool is_blob_in_output = false;
+  for (const auto& file : output_file_names) {
+    if (EndsWith(file, ".blob")) {
+      is_blob_in_output = true;
+    }
+  }
+  ASSERT_TRUE(is_blob_in_output);
+
+  for (const auto& blob_file_addition_info :
+       compaction_job_info.blob_file_addition_infos) {
+    EXPECT_GT(blob_file_addition_info.blob_file_number, 0U);
+    EXPECT_GT(blob_file_addition_info.total_blob_bytes, 0U);
+    EXPECT_GT(blob_file_addition_info.total_blob_count, 0U);
+    EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty());
+  }
+
+  for (const auto& blob_file_garbage_info :
+       compaction_job_info.blob_file_garbage_infos) {
+    EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U);
+    EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U);
+    EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U);
+    EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty());
+  }
+}
+
+class BlobDBFileLevelEventListener : public EventListener {
+ public:
+  void OnBlobFileCreationStarted(
+      const BlobFileCreationBriefInfo& info) override {
+    files_started_++;
+    EXPECT_FALSE(info.db_name.empty());
+    EXPECT_FALSE(info.cf_name.empty());
+    EXPECT_FALSE(info.file_path.empty());
+    EXPECT_GT(info.job_id, 0);
+  }
+
+  void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
+    files_created_++;
+    EXPECT_FALSE(info.db_name.empty());
+    EXPECT_FALSE(info.cf_name.empty());
+    EXPECT_FALSE(info.file_path.empty());
+    EXPECT_GT(info.job_id, 0);
+    EXPECT_GT(info.total_blob_count, 0U);
+    EXPECT_GT(info.total_blob_bytes, 0U);
+    EXPECT_EQ(info.file_checksum, kUnknownFileChecksum);
+    EXPECT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+    EXPECT_TRUE(info.status.ok());
+  }
+
+  void OnBlobFileDeleted(const BlobFileDeletionInfo& info) override {
+    files_deleted_++;
+    EXPECT_FALSE(info.db_name.empty());
+    EXPECT_FALSE(info.file_path.empty());
+    EXPECT_GT(info.job_id, 0);
+    EXPECT_TRUE(info.status.ok());
+  }
+
+  void CheckCounters() {
+    EXPECT_EQ(files_started_, files_created_);
+    EXPECT_GT(files_started_, 0U);
+    EXPECT_GT(files_deleted_, 0U);
+    EXPECT_LT(files_deleted_, files_created_);
+  }
+
+ private:
+  std::atomic<uint32_t> files_started_{};
+  std::atomic<uint32_t> files_created_{};
+  std::atomic<uint32_t> files_deleted_{};
+};
+
+TEST_F(EventListenerTest, BlobDBFileTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.min_blob_size = 0;
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 0.5;
+
+  BlobDBFileLevelEventListener* blob_event_listener =
+      new BlobDBFileLevelEventListener();
+  options.listeners.emplace_back(blob_event_listener);
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("Key1", "blob_value1"));
+  ASSERT_OK(Put("Key2", "blob_value2"));
+  ASSERT_OK(Put("Key3", "blob_value3"));
+  ASSERT_OK(Put("Key4", "blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key3", "new_blob_value3"));
+  ASSERT_OK(Put("Key4", "new_blob_value4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("Key5", "blob_value5"));
+  ASSERT_OK(Put("Key6", "blob_value6"));
+  ASSERT_OK(Flush());
+
+  constexpr Slice* begin = nullptr;
+  constexpr Slice* end = nullptr;
+
+  // On compaction, because of blob_garbage_collection_age_cutoff, it will
+  // delete the oldest blob file and create new blob file during compaction.
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  blob_event_listener->CheckCounters();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_format.h b/src/rocksdb/db/log_format.h
new file mode 100644
index 000000000..d397372f4
--- /dev/null
+++ b/src/rocksdb/db/log_format.h
@@ -0,0 +1,51 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4,
+
+  // For recycled log files
+  kRecyclableFullType = 5,
+  kRecyclableFirstType = 6,
+  kRecyclableMiddleType = 7,
+  kRecyclableLastType = 8,
+
+  // Compression Type
+  kSetCompressionType = 9,
+};
+static const int kMaxRecordType = kSetCompressionType;
+
+static const unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte)
+static const int kHeaderSize = 4 + 2 + 1;
+
+// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte),
+// log number (4 bytes).
+static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4;
+
+}  // namespace log
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.cc b/src/rocksdb/db/log_reader.cc
new file mode 100644
index 000000000..a21868776
--- /dev/null
+++ b/src/rocksdb/db/log_reader.cc
@@ -0,0 +1,854 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+
+#include "file/sequence_file_reader.h"
+#include "port/lang.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Reader::Reporter::~Reporter() {}
+
+Reader::Reader(std::shared_ptr<Logger> info_log,
+               std::unique_ptr<SequentialFileReader>&& _file,
+               Reporter* reporter, bool checksum, uint64_t log_num)
+    : info_log_(info_log),
+      file_(std::move(_file)),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false),
+      read_error_(false),
+      eof_offset_(0),
+      last_record_offset_(0),
+      end_of_buffer_offset_(0),
+      log_number_(log_num),
+      recycled_(false),
+      first_record_read_(false),
+      compression_type_(kNoCompression),
+      compression_type_record_read_(false),
+      uncompress_(nullptr),
+      hash_state_(nullptr),
+      uncompress_hash_state_(nullptr){};
+
+Reader::~Reader() {
+  delete[] backing_store_;
+  if (uncompress_) {
+    delete uncompress_;
+  }
+  if (hash_state_) {
+    XXH3_freeState(hash_state_);
+  }
+  if (uncompress_hash_state_) {
+    XXH3_freeState(uncompress_hash_state_);
+  }
+}
+
+// For kAbsoluteConsistency, on clean shutdown we don't expect any error
+// in the log files.  For other modes, we can ignore only incomplete records
+// in the last log file, which are presumably due to a write in progress
+// during restart (or from log recycling).
+//
+// TODO krad: Evaluate if we need to move to a more strict mode where we
+// restrict the inconsistency to only the last log
+bool Reader::ReadRecord(Slice* record, std::string* scratch,
+                        WALRecoveryMode wal_recovery_mode,
+                        uint64_t* record_checksum) {
+  scratch->clear();
+  record->clear();
+  if (record_checksum != nullptr) {
+    if (hash_state_ == nullptr) {
+      hash_state_ = XXH3_createState();
+    }
+    XXH3_64bits_reset(hash_state_);
+  }
+  if (uncompress_) {
+    uncompress_->Reset();
+  }
+  bool in_fragmented_record = false;
+  // Record offset of the logical record that we're reading
+  // 0 is a dummy value to make compilers happy
+  uint64_t prospective_record_offset = 0;
+
+  Slice fragment;
+  while (true) {
+    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+    size_t drop_size = 0;
+    const unsigned int record_type =
+        ReadPhysicalRecord(&fragment, &drop_size, record_checksum);
+    switch (record_type) {
+      case kFullType:
+      case kRecyclableFullType:
+        if (in_fragmented_record && !scratch->empty()) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          ReportCorruption(scratch->size(), "partial record without end(1)");
+        }
+        // No need to compute record_checksum since the record
+        // consists of a single fragment and the checksum is computed
+        // in ReadPhysicalRecord() if WAL compression is enabled
+        if (record_checksum != nullptr && uncompress_ == nullptr) {
+          // No need to stream since the record is a single fragment
+          *record_checksum = XXH3_64bits(fragment.data(), fragment.size());
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->clear();
+        *record = fragment;
+        last_record_offset_ = prospective_record_offset;
+        first_record_read_ = true;
+        return true;
+
+      case kFirstType:
+      case kRecyclableFirstType:
+        if (in_fragmented_record && !scratch->empty()) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          ReportCorruption(scratch->size(), "partial record without end(2)");
+          XXH3_64bits_reset(hash_state_);
+        }
+        if (record_checksum != nullptr) {
+          XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->assign(fragment.data(), fragment.size());
+        in_fragmented_record = true;
+        break;
+
+      case kMiddleType:
+      case kRecyclableMiddleType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(1)");
+        } else {
+          if (record_checksum != nullptr) {
+            XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+          }
+          scratch->append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+      case kRecyclableLastType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(2)");
+        } else {
+          if (record_checksum != nullptr) {
+            XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+            *record_checksum = XXH3_64bits_digest(hash_state_);
+          }
+          scratch->append(fragment.data(), fragment.size());
+          *record = Slice(*scratch);
+          last_record_offset_ = prospective_record_offset;
+          first_record_read_ = true;
+          return true;
+        }
+        break;
+
+      case kBadHeader:
+        if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+            wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+          // In clean shutdown we don't expect any error in the log files.
+          // In point-in-time recovery an incomplete record at the end could
+          // produce a hole in the recovered data. Report an error here, which
+          // higher layers can choose to ignore when it's provable there is no
+          // hole.
+          ReportCorruption(drop_size, "truncated header");
+        }
+        FALLTHROUGH_INTENDED;
+
+      case kEof:
+        if (in_fragmented_record) {
+          if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+              wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+            // In clean shutdown we don't expect any error in the log files.
+            // In point-in-time recovery an incomplete record at the end could
+            // produce a hole in the recovered data. Report an error here, which
+            // higher layers can choose to ignore when it's provable there is no
+            // hole.
+            ReportCorruption(scratch->size(), "error reading trailing data");
+          }
+          // This can be caused by the writer dying immediately after
+          //  writing a physical record but before completing the next; don't
+          //  treat it as a corruption, just ignore the entire logical record.
+          scratch->clear();
+        }
+        return false;
+
+      case kOldRecord:
+        if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
+          // Treat a record from a previous instance of the log as EOF.
+          if (in_fragmented_record) {
+            if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+                wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+              // In clean shutdown we don't expect any error in the log files.
+              // In point-in-time recovery an incomplete record at the end could
+              // produce a hole in the recovered data. Report an error here,
+              // which higher layers can choose to ignore when it's provable
+              // there is no hole.
+              ReportCorruption(scratch->size(), "error reading trailing data");
+            }
+            // This can be caused by the writer dying immediately after
+            //  writing a physical record but before completing the next; don't
+            //  treat it as a corruption, just ignore the entire logical record.
+            scratch->clear();
+          }
+          return false;
+        }
+        FALLTHROUGH_INTENDED;
+
+      case kBadRecord:
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      case kBadRecordLen:
+        if (eof_) {
+          if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+              wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+            // In clean shutdown we don't expect any error in the log files.
+            // In point-in-time recovery an incomplete record at the end could
+            // produce a hole in the recovered data. Report an error here, which
+            // higher layers can choose to ignore when it's provable there is no
+            // hole.
+            ReportCorruption(drop_size, "truncated record body");
+          }
+          return false;
+        }
+        FALLTHROUGH_INTENDED;
+
+      case kBadRecordChecksum:
+        if (recycled_ && wal_recovery_mode ==
+                             WALRecoveryMode::kTolerateCorruptedTailRecords) {
+          scratch->clear();
+          return false;
+        }
+        if (record_type == kBadRecordLen) {
+          ReportCorruption(drop_size, "bad record length");
+        } else {
+          ReportCorruption(drop_size, "checksum mismatch");
+        }
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      case kSetCompressionType: {
+        if (compression_type_record_read_) {
+          ReportCorruption(fragment.size(),
+                           "read multiple SetCompressionType records");
+        }
+        if (first_record_read_) {
+          ReportCorruption(fragment.size(),
+                           "SetCompressionType not the first record");
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->clear();
+        last_record_offset_ = prospective_record_offset;
+        CompressionTypeRecord compression_record(kNoCompression);
+        Status s = compression_record.DecodeFrom(&fragment);
+        if (!s.ok()) {
+          ReportCorruption(fragment.size(),
+                           "could not decode SetCompressionType record");
+        } else {
+          InitCompression(compression_record);
+        }
+        break;
+      }
+
+      default: {
+        char buf[40];
+        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+        ReportCorruption(
+            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+            buf);
+        in_fragmented_record = false;
+        scratch->clear();
+        break;
+      }
+    }
+  }
+  return false;
+}
+
+uint64_t Reader::LastRecordOffset() { return last_record_offset_; }
+
+uint64_t Reader::LastRecordEnd() {
+  return end_of_buffer_offset_ - buffer_.size();
+}
+
+void Reader::UnmarkEOF() {
+  if (read_error_) {
+    return;
+  }
+  eof_ = false;
+  if (eof_offset_ == 0) {
+    return;
+  }
+  UnmarkEOFInternal();
+}
+
+void Reader::UnmarkEOFInternal() {
+  // If the EOF was in the middle of a block (a partial block was read) we have
+  // to read the rest of the block as ReadPhysicalRecord can only read full
+  // blocks and expects the file position indicator to be aligned to the start
+  // of a block.
+  //
+  //      consumed_bytes + buffer_size() + remaining == kBlockSize
+
+  size_t consumed_bytes = eof_offset_ - buffer_.size();
+  size_t remaining = kBlockSize - eof_offset_;
+
+  // backing_store_ is used to concatenate what is left in buffer_ and
+  // the remainder of the block. If buffer_ already uses backing_store_,
+  // we just append the new data.
+  if (buffer_.data() != backing_store_ + consumed_bytes) {
+    // Buffer_ does not use backing_store_ for storage.
+    // Copy what is left in buffer_ to backing_store.
+    memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+  }
+
+  Slice read_buffer;
+  // TODO: rate limit log reader with approriate priority.
+  // TODO: avoid overcharging rate limiter:
+  // Note that the Read here might overcharge SequentialFileReader's internal
+  // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+  // content left until EOF to read.
+  Status status =
+      file_->Read(remaining, &read_buffer, backing_store_ + eof_offset_,
+                  Env::IO_TOTAL /* rate_limiter_priority */);
+
+  size_t added = read_buffer.size();
+  end_of_buffer_offset_ += added;
+
+  if (!status.ok()) {
+    if (added > 0) {
+      ReportDrop(added, status);
+    }
+
+    read_error_ = true;
+    return;
+  }
+
+  if (read_buffer.data() != backing_store_ + eof_offset_) {
+    // Read did not write to backing_store_
+    memmove(backing_store_ + eof_offset_, read_buffer.data(),
+            read_buffer.size());
+  }
+
+  buffer_ = Slice(backing_store_ + consumed_bytes,
+                  eof_offset_ + added - consumed_bytes);
+
+  if (added < remaining) {
+    eof_ = true;
+    eof_offset_ += added;
+  } else {
+    eof_offset_ = 0;
+  }
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+  if (reporter_ != nullptr) {
+    reporter_->Corruption(bytes, reason);
+  }
+}
+
+bool Reader::ReadMore(size_t* drop_size, int* error) {
+  if (!eof_ && !read_error_) {
+    // Last read was a full read, so this is a trailer to skip
+    buffer_.clear();
+    // TODO: rate limit log reader with approriate priority.
+    // TODO: avoid overcharging rate limiter:
+    // Note that the Read here might overcharge SequentialFileReader's internal
+    // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+    // content left until EOF to read.
+    Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
+                                Env::IO_TOTAL /* rate_limiter_priority */);
+    TEST_SYNC_POINT_CALLBACK("LogReader::ReadMore:AfterReadFile", &status);
+    end_of_buffer_offset_ += buffer_.size();
+    if (!status.ok()) {
+      buffer_.clear();
+      ReportDrop(kBlockSize, status);
+      read_error_ = true;
+      *error = kEof;
+      return false;
+    } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+      eof_ = true;
+      eof_offset_ = buffer_.size();
+    }
+    return true;
+  } else {
+    // Note that if buffer_ is non-empty, we have a truncated header at the
+    //  end of the file, which can be caused by the writer crashing in the
+    //  middle of writing the header. Unless explicitly requested we don't
+    //  considering this an error, just report EOF.
+    if (buffer_.size()) {
+      *drop_size = buffer_.size();
+      buffer_.clear();
+      *error = kBadHeader;
+      return false;
+    }
+    buffer_.clear();
+    *error = kEof;
+    return false;
+  }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size,
+                                        uint64_t* fragment_checksum) {
+  while (true) {
+    // We need at least the minimum header size
+    if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+      // the default value of r is meaningless because ReadMore will overwrite
+      // it if it returns false; in case it returns true, the return value will
+      // not be used anyway
+      int r = kEof;
+      if (!ReadMore(drop_size, &r)) {
+        return r;
+      }
+      continue;
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    int header_size = kHeaderSize;
+    if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+      if (end_of_buffer_offset_ - buffer_.size() == 0) {
+        recycled_ = true;
+      }
+      header_size = kRecyclableHeaderSize;
+      // We need enough for the larger header
+      if (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+        int r = kEof;
+        if (!ReadMore(drop_size, &r)) {
+          return r;
+        }
+        continue;
+      }
+      const uint32_t log_num = DecodeFixed32(header + 7);
+      if (log_num != log_number_) {
+        return kOldRecord;
+      }
+    }
+    if (header_size + length > buffer_.size()) {
+      assert(buffer_.size() >= static_cast<size_t>(header_size));
+      *drop_size = buffer_.size();
+      buffer_.clear();
+      // If the end of the read has been reached without seeing
+      // `header_size + length` bytes of payload, report a corruption. The
+      // higher layers can decide how to handle it based on the recovery mode,
+      // whether this occurred at EOF, whether this is the final WAL, etc.
+      return kBadRecordLen;
+    }
+
+    if (type == kZeroType && length == 0) {
+      // Skip zero length record without reporting any drops since
+      // such records are produced by the mmap based writing code in
+      // env_posix.cc that preallocates file regions.
+      // NOTE: this should never happen in DB written by new RocksDB versions,
+      // since we turn off mmap writes to manifest and log files
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc
+    if (checksum_) {
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+      if (actual_crc != expected_crc) {
+        // Drop the rest of the buffer since "length" itself may have
+        // been corrupted and if we trust it, we could find some
+        // fragment of a real log record that just happens to look
+        // like a valid log record.
+        *drop_size = buffer_.size();
+        buffer_.clear();
+        return kBadRecordChecksum;
+      }
+    }
+
+    buffer_.remove_prefix(header_size + length);
+
+    if (!uncompress_ || type == kSetCompressionType) {
+      *result = Slice(header + header_size, length);
+      return type;
+    } else {
+      // Uncompress compressed records
+      uncompressed_record_.clear();
+      if (fragment_checksum != nullptr) {
+        if (uncompress_hash_state_ == nullptr) {
+          uncompress_hash_state_ = XXH3_createState();
+        }
+        XXH3_64bits_reset(uncompress_hash_state_);
+      }
+
+      size_t uncompressed_size = 0;
+      int remaining = 0;
+      do {
+        remaining = uncompress_->Uncompress(header + header_size, length,
+                                            uncompressed_buffer_.get(),
+                                            &uncompressed_size);
+        if (remaining < 0) {
+          buffer_.clear();
+          return kBadRecord;
+        }
+        if (uncompressed_size > 0) {
+          if (fragment_checksum != nullptr) {
+            XXH3_64bits_update(uncompress_hash_state_,
+                               uncompressed_buffer_.get(), uncompressed_size);
+          }
+          uncompressed_record_.append(uncompressed_buffer_.get(),
+                                      uncompressed_size);
+        }
+      } while (remaining > 0 || uncompressed_size == kBlockSize);
+
+      if (fragment_checksum != nullptr) {
+        // We can remove this check by updating hash_state_ directly,
+        // but that requires resetting hash_state_ for full and first types
+        // for edge cases like consecutive fist type records.
+        // Leaving the check as is since it is cleaner and can revert to the
+        // above approach if it causes performance impact.
+        *fragment_checksum = XXH3_64bits_digest(uncompress_hash_state_);
+        uint64_t actual_checksum = XXH3_64bits(uncompressed_record_.data(),
+                                               uncompressed_record_.size());
+        if (*fragment_checksum != actual_checksum) {
+          // uncompressed_record_ contains bad content that does not match
+          // actual decompressed content
+          return kBadRecord;
+        }
+      }
+      *result = Slice(uncompressed_record_);
+      return type;
+    }
+  }
+}
+
+// Initialize uncompress related fields
+void Reader::InitCompression(const CompressionTypeRecord& compression_record) {
+  compression_type_ = compression_record.GetCompressionType();
+  compression_type_record_read_ = true;
+  constexpr uint32_t compression_format_version = 2;
+  uncompress_ = StreamingUncompress::Create(
+      compression_type_, compression_format_version, kBlockSize);
+  assert(uncompress_ != nullptr);
+  uncompressed_buffer_ = std::unique_ptr<char[]>(new char[kBlockSize]);
+  assert(uncompressed_buffer_);
+}
+
+bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch,
+                                        WALRecoveryMode /*unused*/,
+                                        uint64_t* /* checksum */) {
+  assert(record != nullptr);
+  assert(scratch != nullptr);
+  record->clear();
+  scratch->clear();
+  if (uncompress_) {
+    uncompress_->Reset();
+  }
+
+  uint64_t prospective_record_offset = 0;
+  uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+  size_t drop_size = 0;
+  unsigned int fragment_type_or_err = 0;  // Initialize to make compiler happy
+  Slice fragment;
+  while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) {
+    switch (fragment_type_or_err) {
+      case kFullType:
+      case kRecyclableFullType:
+        if (in_fragmented_record_ && !fragments_.empty()) {
+          ReportCorruption(fragments_.size(), "partial record without end(1)");
+        }
+        fragments_.clear();
+        *record = fragment;
+        prospective_record_offset = physical_record_offset;
+        last_record_offset_ = prospective_record_offset;
+        first_record_read_ = true;
+        in_fragmented_record_ = false;
+        return true;
+
+      case kFirstType:
+      case kRecyclableFirstType:
+        if (in_fragmented_record_ || !fragments_.empty()) {
+          ReportCorruption(fragments_.size(), "partial record without end(2)");
+        }
+        prospective_record_offset = physical_record_offset;
+        fragments_.assign(fragment.data(), fragment.size());
+        in_fragmented_record_ = true;
+        break;
+
+      case kMiddleType:
+      case kRecyclableMiddleType:
+        if (!in_fragmented_record_) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(1)");
+        } else {
+          fragments_.append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+      case kRecyclableLastType:
+        if (!in_fragmented_record_) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(2)");
+        } else {
+          fragments_.append(fragment.data(), fragment.size());
+          scratch->assign(fragments_.data(), fragments_.size());
+          fragments_.clear();
+          *record = Slice(*scratch);
+          last_record_offset_ = prospective_record_offset;
+          first_record_read_ = true;
+          in_fragmented_record_ = false;
+          return true;
+        }
+        break;
+
+      case kBadHeader:
+      case kBadRecord:
+      case kEof:
+      case kOldRecord:
+        if (in_fragmented_record_) {
+          ReportCorruption(fragments_.size(), "error in middle of record");
+          in_fragmented_record_ = false;
+          fragments_.clear();
+        }
+        break;
+
+      case kBadRecordChecksum:
+        if (recycled_) {
+          fragments_.clear();
+          return false;
+        }
+        ReportCorruption(drop_size, "checksum mismatch");
+        if (in_fragmented_record_) {
+          ReportCorruption(fragments_.size(), "error in middle of record");
+          in_fragmented_record_ = false;
+          fragments_.clear();
+        }
+        break;
+
+      case kSetCompressionType: {
+        if (compression_type_record_read_) {
+          ReportCorruption(fragment.size(),
+                           "read multiple SetCompressionType records");
+        }
+        if (first_record_read_) {
+          ReportCorruption(fragment.size(),
+                           "SetCompressionType not the first record");
+        }
+        fragments_.clear();
+        prospective_record_offset = physical_record_offset;
+        last_record_offset_ = prospective_record_offset;
+        in_fragmented_record_ = false;
+        CompressionTypeRecord compression_record(kNoCompression);
+        Status s = compression_record.DecodeFrom(&fragment);
+        if (!s.ok()) {
+          ReportCorruption(fragment.size(),
+                           "could not decode SetCompressionType record");
+        } else {
+          InitCompression(compression_record);
+        }
+        break;
+      }
+
+      default: {
+        char buf[40];
+        snprintf(buf, sizeof(buf), "unknown record type %u",
+                 fragment_type_or_err);
+        ReportCorruption(
+            fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0),
+            buf);
+        in_fragmented_record_ = false;
+        fragments_.clear();
+        break;
+      }
+    }
+  }
+  return false;
+}
+
+void FragmentBufferedReader::UnmarkEOF() {
+  if (read_error_) {
+    return;
+  }
+  eof_ = false;
+  UnmarkEOFInternal();
+}
+
+bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) {
+  if (!eof_ && !read_error_) {
+    // Last read was a full read, so this is a trailer to skip
+    buffer_.clear();
+    // TODO: rate limit log reader with approriate priority.
+    // TODO: avoid overcharging rate limiter:
+    // Note that the Read here might overcharge SequentialFileReader's internal
+    // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+    // content left until EOF to read.
+    Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
+                                Env::IO_TOTAL /* rate_limiter_priority */);
+    end_of_buffer_offset_ += buffer_.size();
+    if (!status.ok()) {
+      buffer_.clear();
+      ReportDrop(kBlockSize, status);
+      read_error_ = true;
+      *error = kEof;
+      return false;
+    } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+      eof_ = true;
+      eof_offset_ = buffer_.size();
+      TEST_SYNC_POINT_CALLBACK(
+          "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr);
+    }
+    return true;
+  } else if (!read_error_) {
+    UnmarkEOF();
+  }
+  if (!read_error_) {
+    return true;
+  }
+  *error = kEof;
+  *drop_size = buffer_.size();
+  if (buffer_.size() > 0) {
+    *error = kBadHeader;
+  }
+  buffer_.clear();
+  return false;
+}
+
+// return true if the caller should process the fragment_type_or_err.
+bool FragmentBufferedReader::TryReadFragment(
+    Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) {
+  assert(fragment != nullptr);
+  assert(drop_size != nullptr);
+  assert(fragment_type_or_err != nullptr);
+
+  while (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+    size_t old_size = buffer_.size();
+    int error = kEof;
+    if (!TryReadMore(drop_size, &error)) {
+      *fragment_type_or_err = error;
+      return false;
+    } else if (old_size == buffer_.size()) {
+      return false;
+    }
+  }
+  const char* header = buffer_.data();
+  const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+  const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+  const unsigned int type = header[6];
+  const uint32_t length = a | (b << 8);
+  int header_size = kHeaderSize;
+  if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+    if (end_of_buffer_offset_ - buffer_.size() == 0) {
+      recycled_ = true;
+    }
+    header_size = kRecyclableHeaderSize;
+    while (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+      size_t old_size = buffer_.size();
+      int error = kEof;
+      if (!TryReadMore(drop_size, &error)) {
+        *fragment_type_or_err = error;
+        return false;
+      } else if (old_size == buffer_.size()) {
+        return false;
+      }
+    }
+    const uint32_t log_num = DecodeFixed32(header + 7);
+    if (log_num != log_number_) {
+      *fragment_type_or_err = kOldRecord;
+      return true;
+    }
+  }
+
+  while (header_size + length > buffer_.size()) {
+    size_t old_size = buffer_.size();
+    int error = kEof;
+    if (!TryReadMore(drop_size, &error)) {
+      *fragment_type_or_err = error;
+      return false;
+    } else if (old_size == buffer_.size()) {
+      return false;
+    }
+  }
+
+  if (type == kZeroType && length == 0) {
+    buffer_.clear();
+    *fragment_type_or_err = kBadRecord;
+    return true;
+  }
+
+  if (checksum_) {
+    uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+    uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+    if (actual_crc != expected_crc) {
+      *drop_size = buffer_.size();
+      buffer_.clear();
+      *fragment_type_or_err = kBadRecordChecksum;
+      return true;
+    }
+  }
+
+  buffer_.remove_prefix(header_size + length);
+
+  if (!uncompress_ || type == kSetCompressionType) {
+    *fragment = Slice(header + header_size, length);
+    *fragment_type_or_err = type;
+    return true;
+  } else {
+    // Uncompress compressed records
+    uncompressed_record_.clear();
+    size_t uncompressed_size = 0;
+    int remaining = 0;
+    do {
+      remaining = uncompress_->Uncompress(header + header_size, length,
+                                          uncompressed_buffer_.get(),
+                                          &uncompressed_size);
+      if (remaining < 0) {
+        buffer_.clear();
+        *fragment_type_or_err = kBadRecord;
+        return true;
+      }
+      if (uncompressed_size > 0) {
+        uncompressed_record_.append(uncompressed_buffer_.get(),
+                                    uncompressed_size);
+      }
+    } while (remaining > 0 || uncompressed_size == kBlockSize);
+    *fragment = Slice(std::move(uncompressed_record_));
+    *fragment_type_or_err = type;
+    return true;
+  }
+}
+
+}  // namespace log
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.h b/src/rocksdb/db/log_reader.h
new file mode 100644
index 000000000..e3be1570e
--- /dev/null
+++ b/src/rocksdb/db/log_reader.h
@@ -0,0 +1,225 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <memory>
+
+#include "db/log_format.h"
+#include "file/sequence_file_reader.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/compression.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+
+namespace log {
+
+/**
+ * Reader is a general purpose log stream reader implementation. The actual job
+ * of reading from the device is implemented by the SequentialFile interface.
+ *
+ * Please see Writer for details on the file and record layout.
+ */
+class Reader {
+ public:
+  // Interface for reporting errors.
+  class Reporter {
+   public:
+    virtual ~Reporter();
+
+    // Some corruption was detected.  "size" is the approximate number
+    // of bytes dropped due to the corruption.
+    virtual void Corruption(size_t bytes, const Status& status) = 0;
+  };
+
+  // Create a reader that will return log records from "*file".
+  // "*file" must remain live while this Reader is in use.
+  //
+  // If "reporter" is non-nullptr, it is notified whenever some data is
+  // dropped due to a detected corruption.  "*reporter" must remain
+  // live while this Reader is in use.
+  //
+  // If "checksum" is true, verify checksums if available.
+  Reader(std::shared_ptr<Logger> info_log,
+         std::unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
+         bool checksum, uint64_t log_num);
+  // No copying allowed
+  Reader(const Reader&) = delete;
+  void operator=(const Reader&) = delete;
+
+  virtual ~Reader();
+
+  // Read the next record into *record.  Returns true if read
+  // successfully, false if we hit end of the input.  May use
+  // "*scratch" as temporary storage. The contents filled in *record
+  // will only be valid until the next mutating operation on this
+  // reader or the next mutation to *scratch.
+  // If record_checksum is not nullptr, then this function will calculate the
+  // checksum of the record read and set record_checksum to it. The checksum is
+  // calculated from the original buffers that contain the contents of the
+  // record.
+  virtual bool ReadRecord(Slice* record, std::string* scratch,
+                          WALRecoveryMode wal_recovery_mode =
+                              WALRecoveryMode::kTolerateCorruptedTailRecords,
+                          uint64_t* record_checksum = nullptr);
+
+  // Returns the physical offset of the last record returned by ReadRecord.
+  //
+  // Undefined before the first call to ReadRecord.
+  uint64_t LastRecordOffset();
+
+  // Returns the first physical offset after the last record returned by
+  // ReadRecord, or zero before first call to ReadRecord. This can also be
+  // thought of as the "current" position in processing the file bytes.
+  uint64_t LastRecordEnd();
+
+  // returns true if the reader has encountered an eof condition.
+  bool IsEOF() { return eof_; }
+
+  // returns true if the reader has encountered read error.
+  bool hasReadError() const { return read_error_; }
+
+  // when we know more data has been written to the file. we can use this
+  // function to force the reader to look again in the file.
+  // Also aligns the file position indicator to the start of the next block
+  // by reading the rest of the data from the EOF position to the end of the
+  // block that was partially read.
+  virtual void UnmarkEOF();
+
+  SequentialFileReader* file() { return file_.get(); }
+
+  Reporter* GetReporter() const { return reporter_; }
+
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  size_t GetReadOffset() const {
+    return static_cast<size_t>(end_of_buffer_offset_);
+  }
+
+  bool IsCompressedAndEmptyFile() {
+    return !first_record_read_ && compression_type_record_read_;
+  }
+
+ protected:
+  std::shared_ptr<Logger> info_log_;
+  const std::unique_ptr<SequentialFileReader> file_;
+  Reporter* const reporter_;
+  bool const checksum_;
+  char* const backing_store_;
+
+  // Internal state variables used for reading records
+  Slice buffer_;
+  bool eof_;         // Last Read() indicated EOF by returning < kBlockSize
+  bool read_error_;  // Error occurred while reading from file
+
+  // Offset of the file position indicator within the last block when an
+  // EOF was detected.
+  size_t eof_offset_;
+
+  // Offset of the last record returned by ReadRecord.
+  uint64_t last_record_offset_;
+  // Offset of the first location past the end of buffer_.
+  uint64_t end_of_buffer_offset_;
+
+  // which log number this is
+  uint64_t const log_number_;
+
+  // Whether this is a recycled log file
+  bool recycled_;
+
+  // Whether the first record has been read or not.
+  bool first_record_read_;
+  // Type of compression used
+  CompressionType compression_type_;
+  // Track whether the compression type record has been read or not.
+  bool compression_type_record_read_;
+  StreamingUncompress* uncompress_;
+  // Reusable uncompressed output buffer
+  std::unique_ptr<char[]> uncompressed_buffer_;
+  // Reusable uncompressed record
+  std::string uncompressed_record_;
+  // Used for stream hashing fragment content in ReadRecord()
+  XXH3_state_t* hash_state_;
+  // Used for stream hashing uncompressed buffer in ReadPhysicalRecord()
+  XXH3_state_t* uncompress_hash_state_;
+
+  // Extend record types with the following special values
+  enum {
+    kEof = kMaxRecordType + 1,
+    // Returned whenever we find an invalid physical record.
+    // Currently there are three situations in which this happens:
+    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+    // * The record is a 0-length record (No drop is reported)
+    kBadRecord = kMaxRecordType + 2,
+    // Returned when we fail to read a valid header.
+    kBadHeader = kMaxRecordType + 3,
+    // Returned when we read an old record from a previous user of the log.
+    kOldRecord = kMaxRecordType + 4,
+    // Returned when we get a bad record length
+    kBadRecordLen = kMaxRecordType + 5,
+    // Returned when we get a bad record checksum
+    kBadRecordChecksum = kMaxRecordType + 6,
+  };
+
+  // Return type, or one of the preceding special values
+  // If WAL compressioned is enabled, fragment_checksum is the checksum of the
+  // fragment computed from the orginal buffer containinng uncompressed
+  // fragment.
+  unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size,
+                                  uint64_t* fragment_checksum = nullptr);
+
+  // Read some more
+  bool ReadMore(size_t* drop_size, int* error);
+
+  void UnmarkEOFInternal();
+
+  // Reports dropped bytes to the reporter.
+  // buffer_ must be updated to remove the dropped bytes prior to invocation.
+  void ReportCorruption(size_t bytes, const char* reason);
+  void ReportDrop(size_t bytes, const Status& reason);
+
+  void InitCompression(const CompressionTypeRecord& compression_record);
+};
+
+class FragmentBufferedReader : public Reader {
+ public:
+  FragmentBufferedReader(std::shared_ptr<Logger> info_log,
+                         std::unique_ptr<SequentialFileReader>&& _file,
+                         Reporter* reporter, bool checksum, uint64_t log_num)
+      : Reader(info_log, std::move(_file), reporter, checksum, log_num),
+        fragments_(),
+        in_fragmented_record_(false) {}
+  ~FragmentBufferedReader() override {}
+  bool ReadRecord(Slice* record, std::string* scratch,
+                  WALRecoveryMode wal_recovery_mode =
+                      WALRecoveryMode::kTolerateCorruptedTailRecords,
+                  uint64_t* record_checksum = nullptr) override;
+  void UnmarkEOF() override;
+
+ private:
+  std::string fragments_;
+  bool in_fragmented_record_;
+
+  bool TryReadFragment(Slice* result, size_t* drop_size,
+                       unsigned int* fragment_type_or_err);
+
+  bool TryReadMore(size_t* drop_size, int* error);
+
+  // No copy allowed
+  FragmentBufferedReader(const FragmentBufferedReader&);
+  void operator=(const FragmentBufferedReader&);
+};
+
+}  // namespace log
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_test.cc b/src/rocksdb/db/log_test.cc
new file mode 100644
index 000000000..2a43dc152
--- /dev/null
+++ b/src/rocksdb/db/log_test.cc
@@ -0,0 +1,1062 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "utilities/memory_allocators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  while (result.size() < n) {
+    result.append(partial_string);
+  }
+  result.resize(n);
+  return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%d.", n);
+  return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+  return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+// Param type is tuple<int, bool>
+// get<0>(tuple): non-zero if recycling log, zero if regular log
+// get<1>(tuple): true if allow retry after read EOF, false otherwise
+class LogTest
+    : public ::testing::TestWithParam<std::tuple<int, bool, CompressionType>> {
+ private:
+  class StringSource : public FSSequentialFile {
+   public:
+    Slice& contents_;
+    bool force_error_;
+    size_t force_error_position_;
+    bool force_eof_;
+    size_t force_eof_position_;
+    bool returned_partial_;
+    bool fail_after_read_partial_;
+    explicit StringSource(Slice& contents, bool fail_after_read_partial)
+        : contents_(contents),
+          force_error_(false),
+          force_error_position_(0),
+          force_eof_(false),
+          force_eof_position_(0),
+          returned_partial_(false),
+          fail_after_read_partial_(fail_after_read_partial) {}
+
+    IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result,
+                  char* scratch, IODebugContext* /*dbg*/) override {
+      if (fail_after_read_partial_) {
+        EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+      }
+
+      if (force_error_) {
+        if (force_error_position_ >= n) {
+          force_error_position_ -= n;
+        } else {
+          *result = Slice(contents_.data(), force_error_position_);
+          contents_.remove_prefix(force_error_position_);
+          force_error_ = false;
+          returned_partial_ = true;
+          return IOStatus::Corruption("read error");
+        }
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+
+      if (force_eof_) {
+        if (force_eof_position_ >= n) {
+          force_eof_position_ -= n;
+        } else {
+          force_eof_ = false;
+          n = force_eof_position_;
+          returned_partial_ = true;
+        }
+      }
+
+      // By using scratch we ensure that caller has control over the
+      // lifetime of result.data()
+      memcpy(scratch, contents_.data(), n);
+      *result = Slice(scratch, n);
+
+      contents_.remove_prefix(n);
+      return IOStatus::OK();
+    }
+
+    IOStatus Skip(uint64_t n) override {
+      if (n > contents_.size()) {
+        contents_.clear();
+        return IOStatus::NotFound("in-memory file skipepd past end");
+      }
+
+      contents_.remove_prefix(n);
+
+      return IOStatus::OK();
+    }
+  };
+
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) {}
+    void Corruption(size_t bytes, const Status& status) override {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  std::string& dest_contents() { return sink_->contents_; }
+
+  const std::string& dest_contents() const { return sink_->contents_; }
+
+  void reset_source_contents() { source_->contents_ = dest_contents(); }
+
+  Slice reader_contents_;
+  test::StringSink* sink_;
+  StringSource* source_;
+  ReportCollector report_;
+
+ protected:
+  std::unique_ptr<Writer> writer_;
+  std::unique_ptr<Reader> reader_;
+  bool allow_retry_read_;
+  CompressionType compression_type_;
+
+ public:
+  LogTest()
+      : reader_contents_(),
+        sink_(new test::StringSink(&reader_contents_)),
+        source_(new StringSource(reader_contents_, !std::get<1>(GetParam()))),
+        allow_retry_read_(std::get<1>(GetParam())),
+        compression_type_(std::get<2>(GetParam())) {
+    std::unique_ptr<FSWritableFile> sink_holder(sink_);
+    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+        std::move(sink_holder), "" /* don't care */, FileOptions()));
+    Writer* writer =
+        new Writer(std::move(file_writer), 123, std::get<0>(GetParam()), false,
+                   compression_type_);
+    writer_.reset(writer);
+    std::unique_ptr<FSSequentialFile> source_holder(source_);
+    std::unique_ptr<SequentialFileReader> file_reader(
+        new SequentialFileReader(std::move(source_holder), "" /* file name */));
+    if (allow_retry_read_) {
+      reader_.reset(new FragmentBufferedReader(nullptr, std::move(file_reader),
+                                               &report_, true /* checksum */,
+                                               123 /* log_number */));
+    } else {
+      reader_.reset(new Reader(nullptr, std::move(file_reader), &report_,
+                               true /* checksum */, 123 /* log_number */));
+    }
+  }
+
+  Slice* get_reader_contents() { return &reader_contents_; }
+
+  void Write(const std::string& msg) {
+    ASSERT_OK(writer_->AddRecord(Slice(msg)));
+  }
+
+  size_t WrittenBytes() const { return dest_contents().size(); }
+
+  std::string Read(const WALRecoveryMode wal_recovery_mode =
+                       WALRecoveryMode::kTolerateCorruptedTailRecords) {
+    std::string scratch;
+    Slice record;
+    bool ret = false;
+    uint64_t record_checksum;
+    ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode,
+                              &record_checksum);
+    if (ret) {
+      if (!allow_retry_read_) {
+        // allow_retry_read_ means using FragmentBufferedReader which does not
+        // support record checksum yet.
+        uint64_t actual_record_checksum =
+            XXH3_64bits(record.data(), record.size());
+        assert(actual_record_checksum == record_checksum);
+      }
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  void IncrementByte(int offset, char delta) {
+    dest_contents()[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_contents()[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) { sink_->Drop(bytes); }
+
+  void FixChecksum(int header_offset, int len, bool recyclable) {
+    // Compute crc of type/len/data
+    int header_size = recyclable ? kRecyclableHeaderSize : kHeaderSize;
+    uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6],
+                                 header_size - 6 + len);
+    crc = crc32c::Mask(crc);
+    EncodeFixed32(&dest_contents()[header_offset], crc);
+  }
+
+  void ForceError(size_t position = 0) {
+    source_->force_error_ = true;
+    source_->force_error_position_ = position;
+  }
+
+  size_t DroppedBytes() const { return report_.dropped_bytes_; }
+
+  std::string ReportMessage() const { return report_.message_; }
+
+  void ForceEOF(size_t position = 0) {
+    source_->force_eof_ = true;
+    source_->force_eof_position_ = position;
+  }
+
+  void UnmarkEOF() {
+    source_->returned_partial_ = false;
+    reader_->UnmarkEOF();
+  }
+
+  bool IsEOF() { return reader_->IsEOF(); }
+
+  // Returns OK iff recorded error message contains "msg"
+  std::string MatchError(const std::string& msg) const {
+    if (report_.message_.find(msg) == std::string::npos) {
+      return report_.message_;
+    } else {
+      return "OK";
+    }
+  }
+};
+
+TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
+
+TEST_P(LogTest, ReadWrite) {
+  Write("foo");
+  Write("bar");
+  Write("");
+  Write("xxxx");
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("xxxx", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
+}
+
+TEST_P(LogTest, ManyBlocks) {
+  for (int i = 0; i < 100000; i++) {
+    Write(NumberString(i));
+  }
+  for (int i = 0; i < 100000; i++) {
+    ASSERT_EQ(NumberString(i), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, Fragmentation) {
+  Write("small");
+  Write(BigString("medium", 50000));
+  Write(BigString("large", 100000));
+  ASSERT_EQ("small", Read());
+  ASSERT_EQ(BigString("medium", 50000), Read());
+  ASSERT_EQ(BigString("large", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer) {
+  // Make a trailer that is exactly the same length as an empty record.
+  int header_size =
+      std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+  const int n = kBlockSize - 2 * header_size;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer2) {
+  // Make a trailer that is exactly the same length as an empty record.
+  int header_size =
+      std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+  const int n = kBlockSize - 2 * header_size;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, ShortTrailer) {
+  int header_size =
+      std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+  const int n = kBlockSize - 2 * header_size + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, AlignedEof) {
+  int header_size =
+      std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+  const int n = kBlockSize - 2 * header_size + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, RandomRead) {
+  const int N = 500;
+  Random write_rnd(301);
+  for (int i = 0; i < N; i++) {
+    Write(RandomSkewedString(i, &write_rnd));
+  }
+  Random read_rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST_P(LogTest, ReadError) {
+  Write("foo");
+  ForceError();
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, BadRecordType) {
+  Write("foo");
+  // Type is stored in header[6]
+  IncrementByte(6, 100);
+  FixChecksum(0, 3, false);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) {
+  Write("foo");
+  ShrinkSize(4);  // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read());
+  // Truncated last record is ignored, not treated as an error
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then truncated trailing record should not
+    // raise an error.
+    return;
+  }
+  Write("foo");
+  ShrinkSize(4);  // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+  // Truncated last record is ignored, not treated as an error
+  ASSERT_GT(DroppedBytes(), 0U);
+  ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+}
+
+TEST_P(LogTest, BadLength) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then we should not raise an error when the
+    // record length specified in header is longer than data currently
+    // available. It's possible that the body of the record is not written yet.
+    return;
+  }
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+  const int kPayloadSize = kBlockSize - header_size;
+  Write(BigString("bar", kPayloadSize));
+  Write("foo");
+  // Least significant size byte is stored in header[4].
+  IncrementByte(4, 1);
+  if (!recyclable_log) {
+    ASSERT_EQ("foo", Read());
+    ASSERT_EQ(kBlockSize, DroppedBytes());
+    ASSERT_EQ("OK", MatchError("bad record length"));
+  } else {
+    ASSERT_EQ("EOF", Read());
+  }
+}
+
+TEST_P(LogTest, BadLengthAtEndIsIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then we should not raise an error when the
+    // record length specified in header is longer than data currently
+    // available. It's possible that the body of the record is not written yet.
+    return;
+  }
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then we should not raise an error when the
+    // record length specified in header is longer than data currently
+    // available. It's possible that the body of the record is not written yet.
+    return;
+  }
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+  ASSERT_GT(DroppedBytes(), 0U);
+  ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
+}
+
+TEST_P(LogTest, ChecksumMismatch) {
+  Write("foooooo");
+  IncrementByte(0, 14);
+  ASSERT_EQ("EOF", Read());
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  if (!recyclable_log) {
+    ASSERT_EQ(14U, DroppedBytes());
+    ASSERT_EQ("OK", MatchError("checksum mismatch"));
+  } else {
+    ASSERT_EQ(0U, DroppedBytes());
+    ASSERT_EQ("", ReportMessage());
+  }
+}
+
+TEST_P(LogTest, UnexpectedMiddleType) {
+  Write("foo");
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  SetByte(6, static_cast<char>(recyclable_log ? kRecyclableMiddleType
+                                              : kMiddleType));
+  FixChecksum(0, 3, !!recyclable_log);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedLastType) {
+  Write("foo");
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  SetByte(6,
+          static_cast<char>(recyclable_log ? kRecyclableLastType : kLastType));
+  FixChecksum(0, 3, !!recyclable_log);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedFullType) {
+  Write("foo");
+  Write("bar");
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  SetByte(
+      6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+  FixChecksum(0, 3, !!recyclable_log);
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, UnexpectedFirstType) {
+  Write("foo");
+  Write(BigString("bar", 100000));
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  SetByte(
+      6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+  FixChecksum(0, 3, !!recyclable_log);
+  ASSERT_EQ(BigString("bar", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, MissingLastIsIgnored) {
+  Write(BigString("bar", kBlockSize));
+  // Remove the LAST block, including header.
+  ShrinkSize(14);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("", ReportMessage());
+  ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, MissingLastIsNotIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then truncated trailing record should not
+    // raise an error.
+    return;
+  }
+  Write(BigString("bar", kBlockSize));
+  // Remove the LAST block, including header.
+  ShrinkSize(14);
+  ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+  ASSERT_GT(DroppedBytes(), 0U);
+  ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data"));
+}
+
+TEST_P(LogTest, PartialLastIsIgnored) {
+  Write(BigString("bar", kBlockSize));
+  // Cause a bad record length in the LAST block.
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("", ReportMessage());
+  ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, PartialLastIsNotIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then truncated trailing record should not
+    // raise an error.
+    return;
+  }
+  Write(BigString("bar", kBlockSize));
+  // Cause a bad record length in the LAST block.
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+  ASSERT_GT(DroppedBytes(), 0U);
+  ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
+}
+
+TEST_P(LogTest, ErrorJoinsRecords) {
+  // Consider two fragmented records:
+  //    first(R1) last(R1) first(R2) last(R2)
+  // where the middle two fragments disappear.  We do not want
+  // first(R1),last(R2) to get joined and returned as a valid record.
+
+  // Write records that span two blocks
+  Write(BigString("foo", kBlockSize));
+  Write(BigString("bar", kBlockSize));
+  Write("correct");
+
+  // Wipe the middle block
+  for (unsigned int offset = kBlockSize; offset < 2 * kBlockSize; offset++) {
+    SetByte(offset, 'x');
+  }
+
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  if (!recyclable_log) {
+    ASSERT_EQ("correct", Read());
+    ASSERT_EQ("EOF", Read());
+    size_t dropped = DroppedBytes();
+    ASSERT_LE(dropped, 2 * kBlockSize + 100);
+    ASSERT_GE(dropped, 2 * kBlockSize);
+  } else {
+    ASSERT_EQ("EOF", Read());
+  }
+}
+
+TEST_P(LogTest, ClearEofSingleBlock) {
+  Write("foo");
+  Write("bar");
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+  ForceEOF(3 + header_size + 2);
+  ASSERT_EQ("foo", Read());
+  UnmarkEOF();
+  ASSERT_EQ("bar", Read());
+  ASSERT_TRUE(IsEOF());
+  ASSERT_EQ("EOF", Read());
+  Write("xxx");
+  UnmarkEOF();
+  ASSERT_EQ("xxx", Read());
+  ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofMultiBlock) {
+  size_t num_full_blocks = 5;
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+  size_t n = (kBlockSize - header_size) * num_full_blocks + 25;
+  Write(BigString("foo", n));
+  Write(BigString("bar", n));
+  ForceEOF(n + num_full_blocks * header_size + header_size + 3);
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_TRUE(IsEOF());
+  UnmarkEOF();
+  ASSERT_EQ(BigString("bar", n), Read());
+  ASSERT_TRUE(IsEOF());
+  Write(BigString("xxx", n));
+  UnmarkEOF();
+  ASSERT_EQ(BigString("xxx", n), Read());
+  ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofError) {
+  // If an error occurs during Read() in UnmarkEOF(), the records contained
+  // in the buffer should be returned on subsequent calls of ReadRecord()
+  // until no more full records are left, whereafter ReadRecord() should return
+  // false to indicate that it cannot read any further.
+
+  Write("foo");
+  Write("bar");
+  UnmarkEOF();
+  ASSERT_EQ("foo", Read());
+  ASSERT_TRUE(IsEOF());
+  Write("xxx");
+  ForceError(0);
+  UnmarkEOF();
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, ClearEofError2) {
+  Write("foo");
+  Write("bar");
+  UnmarkEOF();
+  ASSERT_EQ("foo", Read());
+  Write("xxx");
+  ForceError(3);
+  UnmarkEOF();
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, Recycle) {
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  if (!recyclable_log) {
+    return;  // test is only valid for recycled logs
+  }
+  Write("foo");
+  Write("bar");
+  Write("baz");
+  Write("bif");
+  Write("blitz");
+  while (get_reader_contents()->size() < log::kBlockSize * 2) {
+    Write("xxxxxxxxxxxxxxxx");
+  }
+  std::unique_ptr<FSWritableFile> sink(
+      new test::OverwritingStringSink(get_reader_contents()));
+  std::unique_ptr<WritableFileWriter> dest_holder(new WritableFileWriter(
+      std::move(sink), "" /* don't care */, FileOptions()));
+  Writer recycle_writer(std::move(dest_holder), 123, true);
+  ASSERT_OK(recycle_writer.AddRecord(Slice("foooo")));
+  ASSERT_OK(recycle_writer.AddRecord(Slice("bar")));
+  ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2);
+  ASSERT_EQ("foooo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+// Do NOT enable compression for this instantiation.
+INSTANTIATE_TEST_CASE_P(
+    Log, LogTest,
+    ::testing::Combine(::testing::Values(0, 1), ::testing::Bool(),
+                       ::testing::Values(CompressionType::kNoCompression)));
+
+class RetriableLogTest : public ::testing::TestWithParam<int> {
+ private:
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) {}
+    void Corruption(size_t bytes, const Status& status) override {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  Slice contents_;
+  test::StringSink* sink_;
+  std::unique_ptr<Writer> log_writer_;
+  Env* env_;
+  const std::string test_dir_;
+  const std::string log_file_;
+  std::unique_ptr<WritableFileWriter> writer_;
+  std::unique_ptr<SequentialFileReader> reader_;
+  ReportCollector report_;
+  std::unique_ptr<FragmentBufferedReader> log_reader_;
+
+ public:
+  RetriableLogTest()
+      : contents_(),
+        sink_(new test::StringSink(&contents_)),
+        log_writer_(nullptr),
+        env_(Env::Default()),
+        test_dir_(test::PerThreadDBPath("retriable_log_test")),
+        log_file_(test_dir_ + "/log"),
+        writer_(nullptr),
+        reader_(nullptr),
+        log_reader_(nullptr) {
+    std::unique_ptr<FSWritableFile> sink_holder(sink_);
+    std::unique_ptr<WritableFileWriter> wfw(new WritableFileWriter(
+        std::move(sink_holder), "" /* file name */, FileOptions()));
+    log_writer_.reset(new Writer(std::move(wfw), 123, GetParam()));
+  }
+
+  Status SetupTestEnv() {
+    Status s;
+    FileOptions fopts;
+    auto fs = env_->GetFileSystem();
+    s = fs->CreateDirIfMissing(test_dir_, IOOptions(), nullptr);
+    std::unique_ptr<FSWritableFile> writable_file;
+    if (s.ok()) {
+      s = fs->NewWritableFile(log_file_, fopts, &writable_file, nullptr);
+    }
+    if (s.ok()) {
+      writer_.reset(
+          new WritableFileWriter(std::move(writable_file), log_file_, fopts));
+      EXPECT_NE(writer_, nullptr);
+    }
+    std::unique_ptr<FSSequentialFile> seq_file;
+    if (s.ok()) {
+      s = fs->NewSequentialFile(log_file_, fopts, &seq_file, nullptr);
+    }
+    if (s.ok()) {
+      reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_));
+      EXPECT_NE(reader_, nullptr);
+      log_reader_.reset(new FragmentBufferedReader(
+          nullptr, std::move(reader_), &report_, true /* checksum */,
+          123 /* log_number */));
+      EXPECT_NE(log_reader_, nullptr);
+    }
+    return s;
+  }
+
+  std::string contents() { return sink_->contents_; }
+
+  void Encode(const std::string& msg) {
+    ASSERT_OK(log_writer_->AddRecord(Slice(msg)));
+  }
+
+  void Write(const Slice& data) {
+    ASSERT_OK(writer_->Append(data));
+    ASSERT_OK(writer_->Sync(true));
+  }
+
+  bool TryRead(std::string* result) {
+    assert(result != nullptr);
+    result->clear();
+    std::string scratch;
+    Slice record;
+    bool r = log_reader_->ReadRecord(&record, &scratch);
+    if (r) {
+      result->assign(record.data(), record.size());
+      return true;
+    } else {
+      return false;
+    }
+  }
+};
+
+TEST_P(RetriableLogTest, TailLog_PartialHeader) {
+  ASSERT_OK(SetupTestEnv());
+  std::vector<int> remaining_bytes_in_last_record;
+  size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+  bool eof = false;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"RetriableLogTest::TailLog:AfterPart1",
+        "RetriableLogTest::TailLog:BeforeReadRecord"},
+       {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+        "RetriableLogTest::TailLog:BeforePart2"}});
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+      [&](void* /*arg*/) { eof = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  size_t delta = header_size - 1;
+  port::Thread log_writer_thread([&]() {
+    size_t old_sz = contents().size();
+    Encode("foo");
+    size_t new_sz = contents().size();
+    std::string part1 = contents().substr(old_sz, delta);
+    std::string part2 =
+        contents().substr(old_sz + delta, new_sz - old_sz - delta);
+    Write(Slice(part1));
+    TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+    TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+    Write(Slice(part2));
+  });
+
+  std::string record;
+  port::Thread log_reader_thread([&]() {
+    TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+    while (!TryRead(&record)) {
+    }
+  });
+  log_reader_thread.join();
+  log_writer_thread.join();
+  ASSERT_EQ("foo", record);
+  ASSERT_TRUE(eof);
+}
+
+TEST_P(RetriableLogTest, TailLog_FullHeader) {
+  ASSERT_OK(SetupTestEnv());
+  std::vector<int> remaining_bytes_in_last_record;
+  size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+  bool eof = false;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"RetriableLogTest::TailLog:AfterPart1",
+        "RetriableLogTest::TailLog:BeforeReadRecord"},
+       {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+        "RetriableLogTest::TailLog:BeforePart2"}});
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+      [&](void* /*arg*/) { eof = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  size_t delta = header_size + 1;
+  port::Thread log_writer_thread([&]() {
+    size_t old_sz = contents().size();
+    Encode("foo");
+    size_t new_sz = contents().size();
+    std::string part1 = contents().substr(old_sz, delta);
+    std::string part2 =
+        contents().substr(old_sz + delta, new_sz - old_sz - delta);
+    Write(Slice(part1));
+    TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+    TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+    Write(Slice(part2));
+    ASSERT_TRUE(eof);
+  });
+
+  std::string record;
+  port::Thread log_reader_thread([&]() {
+    TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+    while (!TryRead(&record)) {
+    }
+  });
+  log_reader_thread.join();
+  log_writer_thread.join();
+  ASSERT_EQ("foo", record);
+}
+
+TEST_P(RetriableLogTest, NonBlockingReadFullRecord) {
+  // Clear all sync point callbacks even if this test does not use sync point.
+  // It is necessary, otherwise the execute of this test may hit a sync point
+  // with which a callback is registered. The registered callback may access
+  // some dead variable, causing segfault.
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  ASSERT_OK(SetupTestEnv());
+  size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+  size_t delta = header_size - 1;
+  size_t old_sz = contents().size();
+  Encode("foo-bar");
+  size_t new_sz = contents().size();
+  std::string part1 = contents().substr(old_sz, delta);
+  std::string part2 =
+      contents().substr(old_sz + delta, new_sz - old_sz - delta);
+  Write(Slice(part1));
+  std::string record;
+  ASSERT_FALSE(TryRead(&record));
+  ASSERT_TRUE(record.empty());
+  Write(Slice(part2));
+  ASSERT_TRUE(TryRead(&record));
+  ASSERT_EQ("foo-bar", record);
+}
+
+INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2));
+
+class CompressionLogTest : public LogTest {
+ public:
+  Status SetupTestEnv() { return writer_->AddCompressionTypeRecord(); }
+};
+
+TEST_P(CompressionLogTest, Empty) {
+  CompressionType compression_type = std::get<2>(GetParam());
+  if (!StreamingCompressionTypeSupported(compression_type)) {
+    ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+    return;
+  }
+  ASSERT_OK(SetupTestEnv());
+  const bool compression_enabled =
+      std::get<2>(GetParam()) == kNoCompression ? false : true;
+  // If WAL compression is enabled, a record is added for the compression type
+  const int compression_record_size = compression_enabled ? kHeaderSize + 4 : 0;
+  ASSERT_EQ(compression_record_size, WrittenBytes());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(CompressionLogTest, ReadWrite) {
+  CompressionType compression_type = std::get<2>(GetParam());
+  if (!StreamingCompressionTypeSupported(compression_type)) {
+    ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+    return;
+  }
+  ASSERT_OK(SetupTestEnv());
+  Write("foo");
+  Write("bar");
+  Write("");
+  Write("xxxx");
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("xxxx", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
+}
+
+TEST_P(CompressionLogTest, ManyBlocks) {
+  CompressionType compression_type = std::get<2>(GetParam());
+  if (!StreamingCompressionTypeSupported(compression_type)) {
+    ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+    return;
+  }
+  ASSERT_OK(SetupTestEnv());
+  for (int i = 0; i < 100000; i++) {
+    Write(NumberString(i));
+  }
+  for (int i = 0; i < 100000; i++) {
+    ASSERT_EQ(NumberString(i), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(CompressionLogTest, Fragmentation) {
+  CompressionType compression_type = std::get<2>(GetParam());
+  if (!StreamingCompressionTypeSupported(compression_type)) {
+    ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+    return;
+  }
+  ASSERT_OK(SetupTestEnv());
+  Random rnd(301);
+  const std::vector<std::string> wal_entries = {
+      "small",
+      rnd.RandomBinaryString(3 * kBlockSize / 2),  // Spans into block 2
+      rnd.RandomBinaryString(3 * kBlockSize),      // Spans into block 5
+  };
+  for (const std::string& wal_entry : wal_entries) {
+    Write(wal_entry);
+  }
+
+  for (const std::string& wal_entry : wal_entries) {
+    ASSERT_EQ(wal_entry, Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+INSTANTIATE_TEST_CASE_P(
+    Compression, CompressionLogTest,
+    ::testing::Combine(::testing::Values(0, 1), ::testing::Bool(),
+                       ::testing::Values(CompressionType::kNoCompression,
+                                         CompressionType::kZSTD)));
+
+class StreamingCompressionTest
+    : public ::testing::TestWithParam<std::tuple<int, CompressionType>> {};
+
+TEST_P(StreamingCompressionTest, Basic) {
+  size_t input_size = std::get<0>(GetParam());
+  CompressionType compression_type = std::get<1>(GetParam());
+  if (!StreamingCompressionTypeSupported(compression_type)) {
+    ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+    return;
+  }
+  CompressionOptions opts;
+  constexpr uint32_t compression_format_version = 2;
+  StreamingCompress* compress = StreamingCompress::Create(
+      compression_type, opts, compression_format_version, kBlockSize);
+  StreamingUncompress* uncompress = StreamingUncompress::Create(
+      compression_type, compression_format_version, kBlockSize);
+  MemoryAllocator* allocator = new DefaultMemoryAllocator();
+  std::string input_buffer = BigString("abc", input_size);
+  std::vector<std::string> compressed_buffers;
+  size_t remaining;
+  // Call compress till the entire input is consumed
+  do {
+    char* output_buffer = (char*)allocator->Allocate(kBlockSize);
+    size_t output_pos;
+    remaining = compress->Compress(input_buffer.c_str(), input_size,
+                                   output_buffer, &output_pos);
+    if (output_pos > 0) {
+      std::string compressed_buffer;
+      compressed_buffer.assign(output_buffer, output_pos);
+      compressed_buffers.emplace_back(std::move(compressed_buffer));
+    }
+    allocator->Deallocate((void*)output_buffer);
+  } while (remaining > 0);
+  std::string uncompressed_buffer = "";
+  int ret_val = 0;
+  size_t output_pos;
+  char* uncompressed_output_buffer = (char*)allocator->Allocate(kBlockSize);
+  // Uncompress the fragments and concatenate them.
+  for (int i = 0; i < (int)compressed_buffers.size(); i++) {
+    // Call uncompress till either the entire input is consumed or the output
+    // buffer size is equal to the allocated output buffer size.
+    do {
+      ret_val = uncompress->Uncompress(compressed_buffers[i].c_str(),
+                                       compressed_buffers[i].size(),
+                                       uncompressed_output_buffer, &output_pos);
+      if (output_pos > 0) {
+        std::string uncompressed_fragment;
+        uncompressed_fragment.assign(uncompressed_output_buffer, output_pos);
+        uncompressed_buffer += uncompressed_fragment;
+      }
+    } while (ret_val > 0 || output_pos == kBlockSize);
+  }
+  allocator->Deallocate((void*)uncompressed_output_buffer);
+  delete allocator;
+  delete compress;
+  delete uncompress;
+  // The final return value from uncompress() should be 0.
+  ASSERT_EQ(ret_val, 0);
+  ASSERT_EQ(input_buffer, uncompressed_buffer);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    StreamingCompression, StreamingCompressionTest,
+    ::testing::Combine(::testing::Values(10, 100, 1000, kBlockSize,
+                                         kBlockSize * 2),
+                       ::testing::Values(CompressionType::kZSTD)));
+
+}  // namespace log
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_writer.cc b/src/rocksdb/db/log_writer.cc
new file mode 100644
index 000000000..56f58543e
--- /dev/null
+++ b/src/rocksdb/db/log_writer.cc
@@ -0,0 +1,249 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Writer::Writer(std::unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
+               bool recycle_log_files, bool manual_flush,
+               CompressionType compression_type)
+    : dest_(std::move(dest)),
+      block_offset_(0),
+      log_number_(log_number),
+      recycle_log_files_(recycle_log_files),
+      manual_flush_(manual_flush),
+      compression_type_(compression_type),
+      compress_(nullptr) {
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+Writer::~Writer() {
+  if (dest_) {
+    WriteBuffer().PermitUncheckedError();
+  }
+  if (compress_) {
+    delete compress_;
+  }
+}
+
+IOStatus Writer::WriteBuffer() {
+  if (dest_->seen_error()) {
+    return IOStatus::IOError("Seen error. Skip writing buffer.");
+  }
+  return dest_->Flush();
+}
+
+IOStatus Writer::Close() {
+  IOStatus s;
+  if (dest_) {
+    s = dest_->Close();
+    dest_.reset();
+  }
+  return s;
+}
+
+IOStatus Writer::AddRecord(const Slice& slice,
+                           Env::IOPriority rate_limiter_priority) {
+  const char* ptr = slice.data();
+  size_t left = slice.size();
+
+  // Header size varies depending on whether we are recycling or not.
+  const int header_size =
+      recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize;
+
+  // Fragment the record if necessary and emit it.  Note that if slice
+  // is empty, we still want to iterate once to emit a single
+  // zero-length record
+  IOStatus s;
+  bool begin = true;
+  int compress_remaining = 0;
+  bool compress_start = false;
+  if (compress_) {
+    compress_->Reset();
+    compress_start = true;
+  }
+  do {
+    const int64_t leftover = kBlockSize - block_offset_;
+    assert(leftover >= 0);
+    if (leftover < header_size) {
+      // Switch to a new block
+      if (leftover > 0) {
+        // Fill the trailer (literal below relies on kHeaderSize and
+        // kRecyclableHeaderSize being <= 11)
+        assert(header_size <= 11);
+        s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+                                static_cast<size_t>(leftover)),
+                          0 /* crc32c_checksum */, rate_limiter_priority);
+        if (!s.ok()) {
+          break;
+        }
+      }
+      block_offset_ = 0;
+    }
+
+    // Invariant: we never leave < header_size bytes in a block.
+    assert(static_cast<int64_t>(kBlockSize - block_offset_) >= header_size);
+
+    const size_t avail = kBlockSize - block_offset_ - header_size;
+
+    // Compress the record if compression is enabled.
+    // Compress() is called at least once (compress_start=true) and after the
+    // previous generated compressed chunk is written out as one or more
+    // physical records (left=0).
+    if (compress_ && (compress_start || left == 0)) {
+      compress_remaining = compress_->Compress(slice.data(), slice.size(),
+                                               compressed_buffer_.get(), &left);
+
+      if (compress_remaining < 0) {
+        // Set failure status
+        s = IOStatus::IOError("Unexpected WAL compression error");
+        s.SetDataLoss(true);
+        break;
+      } else if (left == 0) {
+        // Nothing left to compress
+        if (!compress_start) {
+          break;
+        }
+      }
+      compress_start = false;
+      ptr = compressed_buffer_.get();
+    }
+
+    const size_t fragment_length = (left < avail) ? left : avail;
+
+    RecordType type;
+    const bool end = (left == fragment_length && compress_remaining == 0);
+    if (begin && end) {
+      type = recycle_log_files_ ? kRecyclableFullType : kFullType;
+    } else if (begin) {
+      type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
+    } else if (end) {
+      type = recycle_log_files_ ? kRecyclableLastType : kLastType;
+    } else {
+      type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority);
+    ptr += fragment_length;
+    left -= fragment_length;
+    begin = false;
+  } while (s.ok() && (left > 0 || compress_remaining > 0));
+
+  if (s.ok()) {
+    if (!manual_flush_) {
+      s = dest_->Flush(rate_limiter_priority);
+    }
+  }
+
+  return s;
+}
+
+IOStatus Writer::AddCompressionTypeRecord() {
+  // Should be the first record
+  assert(block_offset_ == 0);
+
+  if (compression_type_ == kNoCompression) {
+    // No need to add a record
+    return IOStatus::OK();
+  }
+
+  CompressionTypeRecord record(compression_type_);
+  std::string encode;
+  record.EncodeTo(&encode);
+  IOStatus s =
+      EmitPhysicalRecord(kSetCompressionType, encode.data(), encode.size());
+  if (s.ok()) {
+    if (!manual_flush_) {
+      s = dest_->Flush();
+    }
+    // Initialize fields required for compression
+    const size_t max_output_buffer_len =
+        kBlockSize - (recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize);
+    CompressionOptions opts;
+    constexpr uint32_t compression_format_version = 2;
+    compress_ = StreamingCompress::Create(compression_type_, opts,
+                                          compression_format_version,
+                                          max_output_buffer_len);
+    assert(compress_ != nullptr);
+    compressed_buffer_ =
+        std::unique_ptr<char[]>(new char[max_output_buffer_len]);
+    assert(compressed_buffer_);
+  } else {
+    // Disable compression if the record could not be added.
+    compression_type_ = kNoCompression;
+  }
+  return s;
+}
+
+bool Writer::BufferIsEmpty() { return dest_->BufferIsEmpty(); }
+
+IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n,
+                                    Env::IOPriority rate_limiter_priority) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+
+  size_t header_size;
+  char buf[kRecyclableHeaderSize];
+
+  // Format the header
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  uint32_t crc = type_crc_[t];
+  if (t < kRecyclableFullType || t == kSetCompressionType) {
+    // Legacy record format
+    assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+    header_size = kHeaderSize;
+  } else {
+    // Recyclable record format
+    assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize);
+    header_size = kRecyclableHeaderSize;
+
+    // Only encode low 32-bits of the 64-bit log number.  This means
+    // we will fail to detect an old record if we recycled a log from
+    // ~4 billion logs ago, but that is effectively impossible, and
+    // even if it were we'dbe far more likely to see a false positive
+    // on the 32-bit CRC.
+    EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
+    crc = crc32c::Extend(crc, buf + 7, 4);
+  }
+
+  // Compute the crc of the record type and the payload.
+  uint32_t payload_crc = crc32c::Value(ptr, n);
+  crc = crc32c::Crc32cCombine(crc, payload_crc, n);
+  crc = crc32c::Mask(crc);  // Adjust for storage
+  TEST_SYNC_POINT_CALLBACK("LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum",
+                           &crc);
+  EncodeFixed32(buf, crc);
+
+  // Write the header and the payload
+  IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */,
+                             rate_limiter_priority);
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority);
+  }
+  block_offset_ += header_size + n;
+  return s;
+}
+
+}  // namespace log
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_writer.h b/src/rocksdb/db/log_writer.h
new file mode 100644
index 000000000..5d266e434
--- /dev/null
+++ b/src/rocksdb/db/log_writer.h
@@ -0,0 +1,128 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "db/log_format.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+
+namespace log {
+
+/**
+ * Writer is a general purpose log stream writer. It provides an append-only
+ * abstraction for writing data. The details of the how the data is written is
+ * handled by the WritableFile sub-class implementation.
+ *
+ * File format:
+ *
+ * File is broken down into variable sized records. The format of each record
+ * is described below.
+ *       +-----+-------------+--+----+----------+------+-- ... ----+
+ * File  | r0  |        r1   |P | r2 |    r3    |  r4  |           |
+ *       +-----+-------------+--+----+----------+------+-- ... ----+
+ *       <--- kBlockSize ------>|<-- kBlockSize ------>|
+ *  rn = variable size records
+ *  P = Padding
+ *
+ * Data is written out in kBlockSize chunks. If next record does not fit
+ * into the space left, the leftover space will be padded with \0.
+ *
+ * Legacy record format:
+ *
+ * +---------+-----------+-----------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Payload   |
+ * +---------+-----------+-----------+--- ... ---+
+ *
+ * CRC = 32bit hash computed over the record type and payload using CRC
+ * Size = Length of the payload data
+ * Type = Type of record
+ *        (kZeroType, kFullType, kFirstType, kLastType, kMiddleType )
+ *        The type is used to group a bunch of records together to represent
+ *        blocks that are larger than kBlockSize
+ * Payload = Byte stream as long as specified by the payload size
+ *
+ * Recyclable record format:
+ *
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload   |
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ *
+ * Same as above, with the addition of
+ * Log number = 32bit log file number, so that we can distinguish between
+ * records written by the most recent log writer vs a previous one.
+ */
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
+                  uint64_t log_number, bool recycle_log_files,
+                  bool manual_flush = false,
+                  CompressionType compressionType = kNoCompression);
+  // No copying allowed
+  Writer(const Writer&) = delete;
+  void operator=(const Writer&) = delete;
+
+  ~Writer();
+
+  IOStatus AddRecord(const Slice& slice,
+                     Env::IOPriority rate_limiter_priority = Env::IO_TOTAL);
+  IOStatus AddCompressionTypeRecord();
+
+  WritableFileWriter* file() { return dest_.get(); }
+  const WritableFileWriter* file() const { return dest_.get(); }
+
+  uint64_t get_log_number() const { return log_number_; }
+
+  IOStatus WriteBuffer();
+
+  IOStatus Close();
+
+  bool BufferIsEmpty();
+
+ private:
+  std::unique_ptr<WritableFileWriter> dest_;
+  size_t block_offset_;  // Current offset in block
+  uint64_t log_number_;
+  bool recycle_log_files_;
+
+  // crc32c values for all supported record types.  These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  IOStatus EmitPhysicalRecord(
+      RecordType type, const char* ptr, size_t length,
+      Env::IOPriority rate_limiter_priority = Env::IO_TOTAL);
+
+  // If true, it does not flush after each write. Instead it relies on the upper
+  // layer to manually does the flush by calling ::WriteBuffer()
+  bool manual_flush_;
+
+  // Compression Type
+  CompressionType compression_type_;
+  StreamingCompress* compress_;
+  // Reusable compressed output buffer
+  std::unique_ptr<char[]> compressed_buffer_;
+};
+
+}  // namespace log
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/logs_with_prep_tracker.cc b/src/rocksdb/db/logs_with_prep_tracker.cc
new file mode 100644
index 000000000..ff98155c4
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.cc
@@ -0,0 +1,67 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/logs_with_prep_tracker.h"
+
+#include "port/likely.h"
+
+namespace ROCKSDB_NAMESPACE {
+void LogsWithPrepTracker::MarkLogAsHavingPrepSectionFlushed(uint64_t log) {
+  assert(log != 0);
+  std::lock_guard<std::mutex> lock(prepared_section_completed_mutex_);
+  auto it = prepared_section_completed_.find(log);
+  if (UNLIKELY(it == prepared_section_completed_.end())) {
+    prepared_section_completed_[log] = 1;
+  } else {
+    it->second += 1;
+  }
+}
+
+void LogsWithPrepTracker::MarkLogAsContainingPrepSection(uint64_t log) {
+  assert(log != 0);
+  std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+
+  auto rit = logs_with_prep_.rbegin();
+  bool updated = false;
+  // Most probably the last log is the one that is being marked for
+  // having a prepare section; so search from the end.
+  for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) {
+    if (rit->log == log) {
+      rit->cnt++;
+      updated = true;
+      break;
+    }
+  }
+  if (!updated) {
+    // We are either at the start, or at a position with rit->log < log
+    logs_with_prep_.insert(rit.base(), {log, 1});
+  }
+}
+
+uint64_t LogsWithPrepTracker::FindMinLogContainingOutstandingPrep() {
+  std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+  auto it = logs_with_prep_.begin();
+  // start with the smallest log
+  for (; it != logs_with_prep_.end();) {
+    auto min_log = it->log;
+    {
+      std::lock_guard<std::mutex> lock2(prepared_section_completed_mutex_);
+      auto completed_it = prepared_section_completed_.find(min_log);
+      if (completed_it == prepared_section_completed_.end() ||
+          completed_it->second < it->cnt) {
+        return min_log;
+      }
+      assert(completed_it != prepared_section_completed_.end() &&
+             completed_it->second == it->cnt);
+      prepared_section_completed_.erase(completed_it);
+    }
+    // erase from beginning in vector is not efficient but this function is not
+    // on the fast path.
+    it = logs_with_prep_.erase(it);
+  }
+  // no such log found
+  return 0;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/logs_with_prep_tracker.h b/src/rocksdb/db/logs_with_prep_tracker.h
new file mode 100644
index 000000000..f72f0ca07
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.h
@@ -0,0 +1,62 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class is used to track the log files with outstanding prepare entries.
+class LogsWithPrepTracker {
+ public:
+  // Called when a transaction prepared in `log` has been committed or aborted.
+  void MarkLogAsHavingPrepSectionFlushed(uint64_t log);
+  // Called when a transaction is prepared in `log`.
+  void MarkLogAsContainingPrepSection(uint64_t log);
+  // Return the earliest log file with outstanding prepare entries.
+  uint64_t FindMinLogContainingOutstandingPrep();
+  size_t TEST_PreparedSectionCompletedSize() {
+    return prepared_section_completed_.size();
+  }
+  size_t TEST_LogsWithPrepSize() { return logs_with_prep_.size(); }
+
+ private:
+  // REQUIRES: logs_with_prep_mutex_ held
+  //
+  // sorted list of log numbers still containing prepared data.
+  // this is used by FindObsoleteFiles to determine which
+  // flushed logs we must keep around because they still
+  // contain prepared data which has not been committed or rolled back
+  struct LogCnt {
+    uint64_t log;  // the log number
+    uint64_t cnt;  // number of prepared sections in the log
+  };
+  std::vector<LogCnt> logs_with_prep_;
+  std::mutex logs_with_prep_mutex_;
+
+  // REQUIRES: prepared_section_completed_mutex_ held
+  //
+  // to be used in conjunction with logs_with_prep_.
+  // once a transaction with data in log L is committed or rolled back
+  // rather than updating logs_with_prep_ directly we keep track of that
+  // in prepared_section_completed_ which maps LOG -> instance_count. This helps
+  // avoiding contention between a commit thread and the prepare threads.
+  //
+  // when trying to determine the minimum log still active we first
+  // consult logs_with_prep_. while that root value maps to
+  // an equal value in prepared_section_completed_ we erase the log from
+  // both logs_with_prep_ and prepared_section_completed_.
+  std::unordered_map<uint64_t, uint64_t> prepared_section_completed_;
+  std::mutex prepared_section_completed_mutex_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/lookup_key.h b/src/rocksdb/db/lookup_key.h
new file mode 100644
index 000000000..68851bddd
--- /dev/null
+++ b/src/rocksdb/db/lookup_key.h
@@ -0,0 +1,68 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <utility>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+  // Initialize *this for looking up user_key at a snapshot with
+  // the specified sequence number.
+  LookupKey(const Slice& _user_key, SequenceNumber sequence,
+            const Slice* ts = nullptr);
+
+  ~LookupKey();
+
+  // Return a key suitable for lookup in a MemTable.
+  Slice memtable_key() const {
+    return Slice(start_, static_cast<size_t>(end_ - start_));
+  }
+
+  // Return an internal key (suitable for passing to an internal iterator)
+  Slice internal_key() const {
+    return Slice(kstart_, static_cast<size_t>(end_ - kstart_));
+  }
+
+  // Return the user key.
+  // If user-defined timestamp is enabled, then timestamp is included in the
+  // result.
+  Slice user_key() const {
+    return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8));
+  }
+
+ private:
+  // We construct a char array of the form:
+  //    klength  varint32               <-- start_
+  //    userkey  char[klength]          <-- kstart_
+  //    tag      uint64
+  //                                    <-- end_
+  // The array is a suitable MemTable key.
+  // The suffix starting with "userkey" can be used as an InternalKey.
+  const char* start_;
+  const char* kstart_;
+  const char* end_;
+  char space_[200];  // Avoid allocation for short keys
+
+  // No copying allowed
+  LookupKey(const LookupKey&);
+  void operator=(const LookupKey&);
+};
+
+inline LookupKey::~LookupKey() {
+  if (start_ != space_) delete[] start_;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/malloc_stats.cc b/src/rocksdb/db/malloc_stats.cc
new file mode 100644
index 000000000..52f2e6e0f
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.cc
@@ -0,0 +1,55 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/malloc_stats.h"
+
+#ifndef ROCKSDB_LITE
+#include <string.h>
+
+#include <memory>
+
+#include "port/jemalloc_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_JEMALLOC
+
+struct MallocStatus {
+  char* cur;
+  char* end;
+};
+
+static void GetJemallocStatus(void* mstat_arg, const char* status) {
+  MallocStatus* mstat = reinterpret_cast<MallocStatus*>(mstat_arg);
+  size_t status_len = status ? strlen(status) : 0;
+  size_t buf_size = (size_t)(mstat->end - mstat->cur);
+  if (!status_len || status_len > buf_size) {
+    return;
+  }
+
+  snprintf(mstat->cur, buf_size, "%s", status);
+  mstat->cur += status_len;
+}
+void DumpMallocStats(std::string* stats) {
+  if (!HasJemalloc()) {
+    return;
+  }
+  MallocStatus mstat;
+  const unsigned int kMallocStatusLen = 1000000;
+  std::unique_ptr<char[]> buf{new char[kMallocStatusLen + 1]};
+  mstat.cur = buf.get();
+  mstat.end = buf.get() + kMallocStatusLen;
+  malloc_stats_print(GetJemallocStatus, &mstat, "");
+  stats->append(buf.get());
+}
+#else
+void DumpMallocStats(std::string*) {}
+#endif  // ROCKSDB_JEMALLOC
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/malloc_stats.h b/src/rocksdb/db/malloc_stats.h
new file mode 100644
index 000000000..18aff3ad0
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.h
@@ -0,0 +1,24 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpMallocStats(std::string*);
+
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/manual_compaction_test.cc b/src/rocksdb/db/manual_compaction_test.cc
new file mode 100644
index 000000000..b92cb794b
--- /dev/null
+++ b/src/rocksdb/db/manual_compaction_test.cc
@@ -0,0 +1,308 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Test for issue 178: a manual compaction causes deleted data to reappear.
+#include <cstdlib>
+
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/testharness.h"
+
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionStyle;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DestroyDB;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+namespace {
+
+// Reasoning: previously the number was 1100000. Since the keys are written to
+// the batch in one write each write will result into one SST file. each write
+// will result into one SST file. We reduced the write_buffer_size to 1K to
+// basically have the same effect with however less number of keys, which
+// results into less test runtime.
+const int kNumKeys = 1100;
+
+std::string Key1(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "my_key_%d", i);
+  return buf;
+}
+
+std::string Key2(int i) { return Key1(i) + "_xxx"; }
+
+class ManualCompactionTest : public testing::Test {
+ public:
+  ManualCompactionTest() {
+    // Get rid of any state from an old run.
+    dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath(
+        "rocksdb_manual_compaction_test");
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  std::string dbname_;
+};
+
+class DestroyAllCompactionFilter : public CompactionFilter {
+ public:
+  DestroyAllCompactionFilter() {}
+
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& existing_value,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return existing_value.ToString() == "destroy";
+  }
+
+  const char* Name() const override { return "DestroyAllCompactionFilter"; }
+};
+
+class LogCompactionFilter : public CompactionFilter {
+ public:
+  const char* Name() const override { return "LogCompactionFilter"; }
+
+  bool Filter(int level, const Slice& key, const Slice& /*existing_value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    key_level_[key.ToString()] = level;
+    return false;
+  }
+
+  void Reset() { key_level_.clear(); }
+
+  size_t NumKeys() const { return key_level_.size(); }
+
+  int KeyLevel(const Slice& key) {
+    auto it = key_level_.find(key.ToString());
+    if (it == key_level_.end()) {
+      return -1;
+    }
+    return it->second;
+  }
+
+ private:
+  mutable std::map<std::string, int> key_level_;
+};
+
+TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
+  for (int iter = 0; iter < 2; ++iter) {
+    DB* db;
+    Options options;
+    if (iter == 0) {  // level compaction
+      options.num_levels = 3;
+      options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+    } else {  // universal compaction
+      options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+    }
+    options.create_if_missing = true;
+    options.compression = CompressionType::kNoCompression;
+    options.compaction_filter = new DestroyAllCompactionFilter();
+    ASSERT_OK(DB::Open(options, dbname_, &db));
+
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("destroy")));
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key2"), Slice("destroy")));
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key4"), Slice("destroy")));
+
+    Slice key4("key4");
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &key4));
+    Iterator* itr = db->NewIterator(ReadOptions());
+    itr->SeekToFirst();
+    ASSERT_TRUE(itr->Valid());
+    ASSERT_EQ("key3", itr->key().ToString());
+    itr->Next();
+    ASSERT_TRUE(!itr->Valid());
+    delete itr;
+
+    delete options.compaction_filter;
+    delete db;
+    ASSERT_OK(DestroyDB(dbname_, options));
+  }
+}
+
+TEST_F(ManualCompactionTest, Test) {
+  // Open database.  Disable compression since it affects the creation
+  // of layers and the code below is trying to test against a very
+  // specific scenario.
+  DB* db;
+  Options db_options;
+  db_options.write_buffer_size = 1024;
+  db_options.create_if_missing = true;
+  db_options.compression = CompressionType::kNoCompression;
+  ASSERT_OK(DB::Open(db_options, dbname_, &db));
+
+  // create first key range
+  WriteBatch batch;
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(Key1(i), "value for range 1 key"));
+  }
+  ASSERT_OK(db->Write(WriteOptions(), &batch));
+
+  // create second key range
+  batch.Clear();
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(Key2(i), "value for range 2 key"));
+  }
+  ASSERT_OK(db->Write(WriteOptions(), &batch));
+
+  // delete second key range
+  batch.Clear();
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Delete(Key2(i)));
+  }
+  ASSERT_OK(db->Write(WriteOptions(), &batch));
+
+  // compact database
+  std::string start_key = Key1(0);
+  std::string end_key = Key1(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+
+  // commenting out the line below causes the example to work correctly
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  // count the keys
+  Iterator* iter = db->NewIterator(ReadOptions());
+  int num_keys = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    num_keys++;
+  }
+  delete iter;
+  ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
+
+  // close database
+  delete db;
+  ASSERT_OK(DestroyDB(dbname_, Options()));
+}
+
+TEST_F(ManualCompactionTest, SkipLevel) {
+  DB* db;
+  Options options;
+  options.num_levels = 3;
+  // Initially, flushed L0 files won't exceed 100.
+  options.level0_file_num_compaction_trigger = 100;
+  options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  options.create_if_missing = true;
+  options.compression = CompressionType::kNoCompression;
+  LogCompactionFilter* filter = new LogCompactionFilter();
+  options.compaction_filter = filter;
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+
+  WriteOptions wo;
+  FlushOptions fo;
+  ASSERT_OK(db->Put(wo, "1", ""));
+  ASSERT_OK(db->Flush(fo));
+  ASSERT_OK(db->Put(wo, "2", ""));
+  ASSERT_OK(db->Flush(fo));
+  ASSERT_OK(db->Put(wo, "4", ""));
+  ASSERT_OK(db->Put(wo, "8", ""));
+  ASSERT_OK(db->Flush(fo));
+
+  {
+    // L0: 1, 2, [4, 8]
+    // no file has keys in range [5, 7]
+    Slice start("5");
+    Slice end("7");
+    filter->Reset();
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+    ASSERT_EQ(0, filter->NumKeys());
+  }
+
+  {
+    // L0: 1, 2, [4, 8]
+    // [3, 7] overlaps with 4 in L0
+    Slice start("3");
+    Slice end("7");
+    filter->Reset();
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+    ASSERT_EQ(2, filter->NumKeys());
+    ASSERT_EQ(0, filter->KeyLevel("4"));
+    ASSERT_EQ(0, filter->KeyLevel("8"));
+  }
+
+  {
+    // L0: 1, 2
+    // L1: [4, 8]
+    // no file has keys in range (-inf, 0]
+    Slice end("0");
+    filter->Reset();
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &end));
+    ASSERT_EQ(0, filter->NumKeys());
+  }
+
+  {
+    // L0: 1, 2
+    // L1: [4, 8]
+    // no file has keys in range [9, inf)
+    Slice start("9");
+    filter->Reset();
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
+    ASSERT_EQ(0, filter->NumKeys());
+  }
+
+  {
+    // L0: 1, 2
+    // L1: [4, 8]
+    // [2, 2] overlaps with 2 in L0
+    Slice start("2");
+    Slice end("2");
+    filter->Reset();
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+    ASSERT_EQ(1, filter->NumKeys());
+    ASSERT_EQ(0, filter->KeyLevel("2"));
+  }
+
+  {
+    // L0: 1
+    // L1: 2, [4, 8]
+    // [2, 5] overlaps with 2 and [4, 8) in L1, skip L0
+    Slice start("2");
+    Slice end("5");
+    filter->Reset();
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+    ASSERT_EQ(3, filter->NumKeys());
+    ASSERT_EQ(1, filter->KeyLevel("2"));
+    ASSERT_EQ(1, filter->KeyLevel("4"));
+    ASSERT_EQ(1, filter->KeyLevel("8"));
+  }
+
+  {
+    // L0: 1
+    // L1: [2, 4, 8]
+    // [0, inf) overlaps all files
+    Slice start("0");
+    filter->Reset();
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
+    ASSERT_EQ(4, filter->NumKeys());
+    // 1 is first compacted to L1 and then further compacted into [2, 4, 8],
+    // so finally the logged level for 1 is L1.
+    ASSERT_EQ(1, filter->KeyLevel("1"));
+    ASSERT_EQ(1, filter->KeyLevel("2"));
+    ASSERT_EQ(1, filter->KeyLevel("4"));
+    ASSERT_EQ(1, filter->KeyLevel("8"));
+  }
+
+  delete filter;
+  delete db;
+  ASSERT_OK(DestroyDB(dbname_, options));
+}
+
+}  // anonymous namespace
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/memtable.cc b/src/rocksdb/db/memtable.cc
new file mode 100644
index 000000000..45b139e80
--- /dev/null
+++ b/src/rocksdb/db/memtable.cc
@@ -0,0 +1,1675 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <memory>
+
+#include "db/dbformat.h"
+#include "db/kv_checksum.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/wide/wide_column_serialization.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "memory/memory_usage.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ImmutableMemTableOptions::ImmutableMemTableOptions(
+    const ImmutableOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options)
+    : arena_block_size(mutable_cf_options.arena_block_size),
+      memtable_prefix_bloom_bits(
+          static_cast<uint32_t>(
+              static_cast<double>(mutable_cf_options.write_buffer_size) *
+              mutable_cf_options.memtable_prefix_bloom_size_ratio) *
+          8u),
+      memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
+      memtable_whole_key_filtering(
+          mutable_cf_options.memtable_whole_key_filtering),
+      inplace_update_support(ioptions.inplace_update_support),
+      inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
+      inplace_callback(ioptions.inplace_callback),
+      max_successive_merges(mutable_cf_options.max_successive_merges),
+      statistics(ioptions.stats),
+      merge_operator(ioptions.merge_operator.get()),
+      info_log(ioptions.logger),
+      allow_data_in_errors(ioptions.allow_data_in_errors),
+      protection_bytes_per_key(
+          mutable_cf_options.memtable_protection_bytes_per_key) {}
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+                   const ImmutableOptions& ioptions,
+                   const MutableCFOptions& mutable_cf_options,
+                   WriteBufferManager* write_buffer_manager,
+                   SequenceNumber latest_seq, uint32_t column_family_id)
+    : comparator_(cmp),
+      moptions_(ioptions, mutable_cf_options),
+      refs_(0),
+      kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
+      mem_tracker_(write_buffer_manager),
+      arena_(moptions_.arena_block_size,
+             (write_buffer_manager != nullptr &&
+              (write_buffer_manager->enabled() ||
+               write_buffer_manager->cost_to_cache()))
+                 ? &mem_tracker_
+                 : nullptr,
+             mutable_cf_options.memtable_huge_page_size),
+      table_(ioptions.memtable_factory->CreateMemTableRep(
+          comparator_, &arena_, mutable_cf_options.prefix_extractor.get(),
+          ioptions.logger, column_family_id)),
+      range_del_table_(SkipListFactory().CreateMemTableRep(
+          comparator_, &arena_, nullptr /* transform */, ioptions.logger,
+          column_family_id)),
+      is_range_del_table_empty_(true),
+      data_size_(0),
+      num_entries_(0),
+      num_deletes_(0),
+      write_buffer_size_(mutable_cf_options.write_buffer_size),
+      flush_in_progress_(false),
+      flush_completed_(false),
+      file_number_(0),
+      first_seqno_(0),
+      earliest_seqno_(latest_seq),
+      creation_seq_(latest_seq),
+      mem_next_logfile_number_(0),
+      min_prep_log_referenced_(0),
+      locks_(moptions_.inplace_update_support
+                 ? moptions_.inplace_update_num_locks
+                 : 0),
+      prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+      flush_state_(FLUSH_NOT_REQUESTED),
+      clock_(ioptions.clock),
+      insert_with_hint_prefix_extractor_(
+          ioptions.memtable_insert_with_hint_prefix_extractor.get()),
+      oldest_key_time_(std::numeric_limits<uint64_t>::max()),
+      atomic_flush_seqno_(kMaxSequenceNumber),
+      approximate_memory_usage_(0) {
+  UpdateFlushState();
+  // something went wrong if we need to flush before inserting anything
+  assert(!ShouldScheduleFlush());
+
+  // use bloom_filter_ for both whole key and prefix bloom filter
+  if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
+      moptions_.memtable_prefix_bloom_bits > 0) {
+    bloom_filter_.reset(
+        new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
+                         6 /* hard coded 6 probes */,
+                         moptions_.memtable_huge_page_size, ioptions.logger));
+  }
+  // Initialize cached_range_tombstone_ here since it could
+  // be read before it is constructed in MemTable::Add(), which could also lead
+  // to a data race on the global mutex table backing atomic shared_ptr.
+  auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+  size_t size = cached_range_tombstone_.Size();
+  for (size_t i = 0; i < size; ++i) {
+    std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+        cached_range_tombstone_.AccessAtCore(i);
+    auto new_local_cache_ref = std::make_shared<
+        const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+    std::atomic_store_explicit(
+        local_cache_ref_ptr,
+        std::shared_ptr<FragmentedRangeTombstoneListCache>(new_local_cache_ref,
+                                                           new_cache.get()),
+        std::memory_order_relaxed);
+  }
+}
+
+MemTable::~MemTable() {
+  mem_tracker_.FreeMem();
+  assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() {
+  autovector<size_t> usages = {
+      arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(),
+      range_del_table_->ApproximateMemoryUsage(),
+      ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)};
+  size_t total_usage = 0;
+  for (size_t usage : usages) {
+    // If usage + total_usage >= kMaxSizet, return kMaxSizet.
+    // the following variation is to avoid numeric overflow.
+    if (usage >= std::numeric_limits<size_t>::max() - total_usage) {
+      return std::numeric_limits<size_t>::max();
+    }
+    total_usage += usage;
+  }
+  approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
+  // otherwise, return the actual usage
+  return total_usage;
+}
+
+bool MemTable::ShouldFlushNow() {
+  size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
+  // In a lot of times, we cannot allocate arena blocks that exactly matches the
+  // buffer size. Thus we have to decide if we should over-allocate or
+  // under-allocate.
+  // This constant variable can be interpreted as: if we still have more than
+  // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
+  // allocate one more block.
+  const double kAllowOverAllocationRatio = 0.6;
+
+  // If arena still have room for new block allocation, we can safely say it
+  // shouldn't flush.
+  auto allocated_memory = table_->ApproximateMemoryUsage() +
+                          range_del_table_->ApproximateMemoryUsage() +
+                          arena_.MemoryAllocatedBytes();
+
+  approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
+
+  // if we can still allocate one more block without exceeding the
+  // over-allocation ratio, then we should not flush.
+  if (allocated_memory + kArenaBlockSize <
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return false;
+  }
+
+  // if user keeps adding entries that exceeds write_buffer_size, we need to
+  // flush earlier even though we still have much available memory left.
+  if (allocated_memory >
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+    return true;
+  }
+
+  // In this code path, Arena has already allocated its "last block", which
+  // means the total allocatedmemory size is either:
+  //  (1) "moderately" over allocated the memory (no more than `0.6 * arena
+  // block size`. Or,
+  //  (2) the allocated memory is less than write buffer size, but we'll stop
+  // here since if we allocate a new arena block, we'll over allocate too much
+  // more (half of the arena block size) memory.
+  //
+  // In either case, to avoid over-allocate, the last block will stop allocation
+  // when its usage reaches a certain ratio, which we carefully choose "0.75
+  // full" as the stop condition because it addresses the following issue with
+  // great simplicity: What if the next inserted entry's size is
+  // bigger than AllocatedAndUnused()?
+  //
+  // The answer is: if the entry size is also bigger than 0.25 *
+  // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
+  // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
+  // and regular block. In either case, we *overly* over-allocated.
+  //
+  // Therefore, setting the last block to be at most "0.75 full" avoids both
+  // cases.
+  //
+  // NOTE: the average percentage of waste space of this approach can be counted
+  // as: "arena block size * 0.25 / write buffer size". User who specify a small
+  // write buffer size and/or big arena block size may suffer.
+  return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
+}
+
+void MemTable::UpdateFlushState() {
+  auto state = flush_state_.load(std::memory_order_relaxed);
+  if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
+    // ignore CAS failure, because that means somebody else requested
+    // a flush
+    flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
+                                         std::memory_order_relaxed,
+                                         std::memory_order_relaxed);
+  }
+}
+
+void MemTable::UpdateOldestKeyTime() {
+  uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
+  if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
+    int64_t current_time = 0;
+    auto s = clock_->GetCurrentTime(&current_time);
+    if (s.ok()) {
+      assert(current_time >= 0);
+      // If fail, the timestamp is already set.
+      oldest_key_time_.compare_exchange_strong(
+          oldest_key_time, static_cast<uint64_t>(current_time),
+          std::memory_order_relaxed, std::memory_order_relaxed);
+    }
+  }
+}
+
+Status MemTable::VerifyEntryChecksum(const char* entry,
+                                     size_t protection_bytes_per_key,
+                                     bool allow_data_in_errors) {
+  if (protection_bytes_per_key == 0) {
+    return Status::OK();
+  }
+  uint32_t key_length;
+  const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+  if (key_ptr == nullptr) {
+    return Status::Corruption("Unable to parse internal key length");
+  }
+  if (key_length < 8) {
+    return Status::Corruption("Memtable entry internal key length too short.");
+  }
+  Slice user_key = Slice(key_ptr, key_length - 8);
+
+  const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+  ValueType type;
+  SequenceNumber seq;
+  UnPackSequenceAndType(tag, &seq, &type);
+
+  uint32_t value_length = 0;
+  const char* value_ptr = GetVarint32Ptr(
+      key_ptr + key_length, key_ptr + key_length + 5, &value_length);
+  if (value_ptr == nullptr) {
+    return Status::Corruption("Unable to parse internal key value");
+  }
+  Slice value = Slice(value_ptr, value_length);
+
+  const char* checksum_ptr = value_ptr + value_length;
+  uint64_t expected = ProtectionInfo64()
+                          .ProtectKVO(user_key, value, type)
+                          .ProtectS(seq)
+                          .GetVal();
+  bool match = true;
+  switch (protection_bytes_per_key) {
+    case 1:
+      match = static_cast<uint8_t>(checksum_ptr[0]) ==
+              static_cast<uint8_t>(expected);
+      break;
+    case 2:
+      match = DecodeFixed16(checksum_ptr) == static_cast<uint16_t>(expected);
+      break;
+    case 4:
+      match = DecodeFixed32(checksum_ptr) == static_cast<uint32_t>(expected);
+      break;
+    case 8:
+      match = DecodeFixed64(checksum_ptr) == expected;
+      break;
+    default:
+      assert(false);
+  }
+  if (!match) {
+    std::string msg(
+        "Corrupted memtable entry, per key-value checksum verification "
+        "failed.");
+    if (allow_data_in_errors) {
+      msg.append("Unrecognized value type: " +
+                 std::to_string(static_cast<int>(type)) + ". ");
+      msg.append("User key: " + user_key.ToString(/*hex=*/true) + ". ");
+      msg.append("seq: " + std::to_string(seq) + ".");
+    }
+    return Status::Corruption(msg.c_str());
+  }
+  return Status::OK();
+}
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
+                                        const char* prefix_len_key2) const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
+  Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
+  return comparator.CompareKeySeq(k1, k2);
+}
+
+int MemTable::KeyComparator::operator()(
+    const char* prefix_len_key, const KeyComparator::DecodedType& key) const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice a = GetLengthPrefixedSlice(prefix_len_key);
+  return comparator.CompareKeySeq(a, key);
+}
+
+void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) {
+#ifndef ROCKSDB_LITE
+  throw std::runtime_error("concurrent insert not supported");
+#else
+  abort();
+#endif
+}
+
+Slice MemTableRep::UserKey(const char* key) const {
+  Slice slice = GetLengthPrefixedSlice(key);
+  return Slice(slice.data(), slice.size() - 8);
+}
+
+KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
+  *buf = allocator_->Allocate(len);
+  return static_cast<KeyHandle>(*buf);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+const char* EncodeKey(std::string* scratch, const Slice& target) {
+  scratch->clear();
+  PutVarint32(scratch, static_cast<uint32_t>(target.size()));
+  scratch->append(target.data(), target.size());
+  return scratch->data();
+}
+
+class MemTableIterator : public InternalIterator {
+ public:
+  MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
+                   Arena* arena, bool use_range_del_table = false)
+      : bloom_(nullptr),
+        prefix_extractor_(mem.prefix_extractor_),
+        comparator_(mem.comparator_),
+        valid_(false),
+        arena_mode_(arena != nullptr),
+        value_pinned_(
+            !mem.GetImmutableMemTableOptions()->inplace_update_support),
+        protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
+        status_(Status::OK()),
+        logger_(mem.moptions_.info_log) {
+    if (use_range_del_table) {
+      iter_ = mem.range_del_table_->GetIterator(arena);
+    } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
+               !read_options.auto_prefix_mode) {
+      // Auto prefix mode is not implemented in memtable yet.
+      bloom_ = mem.bloom_filter_.get();
+      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
+    } else {
+      iter_ = mem.table_->GetIterator(arena);
+    }
+    status_.PermitUncheckedError();
+  }
+  // No copying allowed
+  MemTableIterator(const MemTableIterator&) = delete;
+  void operator=(const MemTableIterator&) = delete;
+
+  ~MemTableIterator() override {
+#ifndef NDEBUG
+    // Assert that the MemTableIterator is never deleted while
+    // Pinning is Enabled.
+    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
+#endif
+    if (arena_mode_) {
+      iter_->~Iterator();
+    } else {
+      delete iter_;
+    }
+  }
+
+#ifndef NDEBUG
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+  }
+  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
+#endif
+
+  bool Valid() const override { return valid_ && status_.ok(); }
+  void Seek(const Slice& k) override {
+    PERF_TIMER_GUARD(seek_on_memtable_time);
+    PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+    if (bloom_) {
+      // iterator should only use prefix bloom filter
+      auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size();
+      Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz));
+      if (prefix_extractor_->InDomain(user_k_without_ts)) {
+        if (!bloom_->MayContain(
+                prefix_extractor_->Transform(user_k_without_ts))) {
+          PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+          valid_ = false;
+          return;
+        } else {
+          PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+        }
+      }
+    }
+    iter_->Seek(k, nullptr);
+    valid_ = iter_->Valid();
+    VerifyEntryChecksum();
+  }
+  void SeekForPrev(const Slice& k) override {
+    PERF_TIMER_GUARD(seek_on_memtable_time);
+    PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+    if (bloom_) {
+      auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size();
+      Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz));
+      if (prefix_extractor_->InDomain(user_k_without_ts)) {
+        if (!bloom_->MayContain(
+                prefix_extractor_->Transform(user_k_without_ts))) {
+          PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+          valid_ = false;
+          return;
+        } else {
+          PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+        }
+      }
+    }
+    iter_->Seek(k, nullptr);
+    valid_ = iter_->Valid();
+    VerifyEntryChecksum();
+    if (!Valid() && status().ok()) {
+      SeekToLast();
+    }
+    while (Valid() && comparator_.comparator.Compare(k, key()) < 0) {
+      Prev();
+    }
+  }
+  void SeekToFirst() override {
+    iter_->SeekToFirst();
+    valid_ = iter_->Valid();
+    VerifyEntryChecksum();
+  }
+  void SeekToLast() override {
+    iter_->SeekToLast();
+    valid_ = iter_->Valid();
+    VerifyEntryChecksum();
+  }
+  void Next() override {
+    PERF_COUNTER_ADD(next_on_memtable_count, 1);
+    assert(Valid());
+    iter_->Next();
+    TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
+    valid_ = iter_->Valid();
+    VerifyEntryChecksum();
+  }
+  bool NextAndGetResult(IterateResult* result) override {
+    Next();
+    bool is_valid = Valid();
+    if (is_valid) {
+      result->key = key();
+      result->bound_check_result = IterBoundCheck::kUnknown;
+      result->value_prepared = true;
+    }
+    return is_valid;
+  }
+  void Prev() override {
+    PERF_COUNTER_ADD(prev_on_memtable_count, 1);
+    assert(Valid());
+    iter_->Prev();
+    valid_ = iter_->Valid();
+    VerifyEntryChecksum();
+  }
+  Slice key() const override {
+    assert(Valid());
+    return GetLengthPrefixedSlice(iter_->key());
+  }
+  Slice value() const override {
+    assert(Valid());
+    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+  }
+
+  Status status() const override { return status_; }
+
+  bool IsKeyPinned() const override {
+    // memtable data is always pinned
+    return true;
+  }
+
+  bool IsValuePinned() const override {
+    // memtable value is always pinned, except if we allow inplace update.
+    return value_pinned_;
+  }
+
+ private:
+  DynamicBloom* bloom_;
+  const SliceTransform* const prefix_extractor_;
+  const MemTable::KeyComparator comparator_;
+  MemTableRep::Iterator* iter_;
+  bool valid_;
+  bool arena_mode_;
+  bool value_pinned_;
+  size_t protection_bytes_per_key_;
+  Status status_;
+  Logger* logger_;
+
+  void VerifyEntryChecksum() {
+    if (protection_bytes_per_key_ > 0 && Valid()) {
+      status_ = MemTable::VerifyEntryChecksum(iter_->key(),
+                                              protection_bytes_per_key_);
+      if (!status_.ok()) {
+        ROCKS_LOG_ERROR(logger_, "In MemtableIterator: %s", status_.getState());
+      }
+    }
+  }
+};
+
+InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
+                                        Arena* arena) {
+  assert(arena != nullptr);
+  auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+  return new (mem) MemTableIterator(*this, read_options, arena);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
+    const ReadOptions& read_options, SequenceNumber read_seq,
+    bool immutable_memtable) {
+  if (read_options.ignore_range_deletions ||
+      is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+    return nullptr;
+  }
+  return NewRangeTombstoneIteratorInternal(read_options, read_seq,
+                                           immutable_memtable);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
+    const ReadOptions& read_options, SequenceNumber read_seq,
+    bool immutable_memtable) {
+  if (immutable_memtable) {
+    // Note that caller should already have verified that
+    // !is_range_del_table_empty_
+    assert(IsFragmentedRangeTombstonesConstructed());
+    return new FragmentedRangeTombstoneIterator(
+        fragmented_range_tombstone_list_.get(), comparator_.comparator,
+        read_seq, read_options.timestamp);
+  }
+
+  // takes current cache
+  std::shared_ptr<FragmentedRangeTombstoneListCache> cache =
+      std::atomic_load_explicit(cached_range_tombstone_.Access(),
+                                std::memory_order_relaxed);
+  // construct fragmented tombstone list if necessary
+  if (!cache->initialized.load(std::memory_order_acquire)) {
+    cache->reader_mutex.lock();
+    if (!cache->tombstones) {
+      auto* unfragmented_iter =
+          new MemTableIterator(*this, read_options, nullptr /* arena */,
+                               true /* use_range_del_table */);
+      cache->tombstones.reset(new FragmentedRangeTombstoneList(
+          std::unique_ptr<InternalIterator>(unfragmented_iter),
+          comparator_.comparator));
+      cache->initialized.store(true, std::memory_order_release);
+    }
+    cache->reader_mutex.unlock();
+  }
+
+  auto* fragmented_iter = new FragmentedRangeTombstoneIterator(
+      cache, comparator_.comparator, read_seq, read_options.timestamp);
+  return fragmented_iter;
+}
+
+void MemTable::ConstructFragmentedRangeTombstones() {
+  assert(!IsFragmentedRangeTombstonesConstructed(false));
+  // There should be no concurrent Construction
+  if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+    auto* unfragmented_iter =
+        new MemTableIterator(*this, ReadOptions(), nullptr /* arena */,
+                             true /* use_range_del_table */);
+
+    fragmented_range_tombstone_list_ =
+        std::make_unique<FragmentedRangeTombstoneList>(
+            std::unique_ptr<InternalIterator>(unfragmented_iter),
+            comparator_.comparator);
+  }
+}
+
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+  return &locks_[GetSliceRangedNPHash(key, locks_.size())];
+}
+
+MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey,
+                                                   const Slice& end_ikey) {
+  uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
+  entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey);
+  if (entry_count == 0) {
+    return {0, 0};
+  }
+  uint64_t n = num_entries_.load(std::memory_order_relaxed);
+  if (n == 0) {
+    return {0, 0};
+  }
+  if (entry_count > n) {
+    // (range_del_)table_->ApproximateNumEntries() is just an estimate so it can
+    // be larger than actual entries we have. Cap it to entries we have to limit
+    // the inaccuracy.
+    entry_count = n;
+  }
+  uint64_t data_size = data_size_.load(std::memory_order_relaxed);
+  return {entry_count * (data_size / n), entry_count};
+}
+
+Status MemTable::VerifyEncodedEntry(Slice encoded,
+                                    const ProtectionInfoKVOS64& kv_prot_info) {
+  uint32_t ikey_len = 0;
+  if (!GetVarint32(&encoded, &ikey_len)) {
+    return Status::Corruption("Unable to parse internal key length");
+  }
+  size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+  if (ikey_len < 8 + ts_sz) {
+    return Status::Corruption("Internal key length too short");
+  }
+  if (ikey_len > encoded.size()) {
+    return Status::Corruption("Internal key length too long");
+  }
+  uint32_t value_len = 0;
+  const size_t user_key_len = ikey_len - 8;
+  Slice key(encoded.data(), user_key_len);
+  encoded.remove_prefix(user_key_len);
+
+  uint64_t packed = DecodeFixed64(encoded.data());
+  ValueType value_type = kMaxValue;
+  SequenceNumber sequence_number = kMaxSequenceNumber;
+  UnPackSequenceAndType(packed, &sequence_number, &value_type);
+  encoded.remove_prefix(8);
+
+  if (!GetVarint32(&encoded, &value_len)) {
+    return Status::Corruption("Unable to parse value length");
+  }
+  if (value_len < encoded.size()) {
+    return Status::Corruption("Value length too short");
+  }
+  if (value_len > encoded.size()) {
+    return Status::Corruption("Value length too long");
+  }
+  Slice value(encoded.data(), value_len);
+
+  return kv_prot_info.StripS(sequence_number)
+      .StripKVO(key, value, value_type)
+      .GetStatus();
+}
+
+void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
+                                   const Slice& key, const Slice& value,
+                                   ValueType type, SequenceNumber s,
+                                   char* checksum_ptr) {
+  if (moptions_.protection_bytes_per_key == 0) {
+    return;
+  }
+
+  uint64_t checksum = 0;
+  if (kv_prot_info == nullptr) {
+    checksum =
+        ProtectionInfo64().ProtectKVO(key, value, type).ProtectS(s).GetVal();
+  } else {
+    checksum = kv_prot_info->GetVal();
+  }
+  switch (moptions_.protection_bytes_per_key) {
+    case 1:
+      checksum_ptr[0] = static_cast<uint8_t>(checksum);
+      break;
+    case 2:
+      EncodeFixed16(checksum_ptr, static_cast<uint16_t>(checksum));
+      break;
+    case 4:
+      EncodeFixed32(checksum_ptr, static_cast<uint32_t>(checksum));
+      break;
+    case 8:
+      EncodeFixed64(checksum_ptr, checksum);
+      break;
+    default:
+      assert(false);
+  }
+}
+
+Status MemTable::Add(SequenceNumber s, ValueType type,
+                     const Slice& key, /* user key */
+                     const Slice& value,
+                     const ProtectionInfoKVOS64* kv_prot_info,
+                     bool allow_concurrent,
+                     MemTablePostProcessInfo* post_process_info, void** hint) {
+  // Format of an entry is concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  //  checksum     : char[moptions_.protection_bytes_per_key]
+  uint32_t key_size = static_cast<uint32_t>(key.size());
+  uint32_t val_size = static_cast<uint32_t>(value.size());
+  uint32_t internal_key_size = key_size + 8;
+  const uint32_t encoded_len = VarintLength(internal_key_size) +
+                               internal_key_size + VarintLength(val_size) +
+                               val_size + moptions_.protection_bytes_per_key;
+  char* buf = nullptr;
+  std::unique_ptr<MemTableRep>& table =
+      type == kTypeRangeDeletion ? range_del_table_ : table_;
+  KeyHandle handle = table->Allocate(encoded_len, &buf);
+
+  char* p = EncodeVarint32(buf, internal_key_size);
+  memcpy(p, key.data(), key_size);
+  Slice key_slice(p, key_size);
+  p += key_size;
+  uint64_t packed = PackSequenceAndType(s, type);
+  EncodeFixed64(p, packed);
+  p += 8;
+  p = EncodeVarint32(p, val_size);
+  memcpy(p, value.data(), val_size);
+  assert((unsigned)(p + val_size - buf + moptions_.protection_bytes_per_key) ==
+         (unsigned)encoded_len);
+
+  UpdateEntryChecksum(kv_prot_info, key, value, type, s,
+                      buf + encoded_len - moptions_.protection_bytes_per_key);
+  Slice encoded(buf, encoded_len - moptions_.protection_bytes_per_key);
+  if (kv_prot_info != nullptr) {
+    TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded);
+    Status status = VerifyEncodedEntry(encoded, *kv_prot_info);
+    if (!status.ok()) {
+      return status;
+    }
+  }
+
+  size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+  Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz);
+
+  if (!allow_concurrent) {
+    // Extract prefix for insert with hint.
+    if (insert_with_hint_prefix_extractor_ != nullptr &&
+        insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
+      Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
+      bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
+      if (UNLIKELY(!res)) {
+        return Status::TryAgain("key+seq exists");
+      }
+    } else {
+      bool res = table->InsertKey(handle);
+      if (UNLIKELY(!res)) {
+        return Status::TryAgain("key+seq exists");
+      }
+    }
+
+    // this is a bit ugly, but is the way to avoid locked instructions
+    // when incrementing an atomic
+    num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
+                       std::memory_order_relaxed);
+    data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
+                     std::memory_order_relaxed);
+    if (type == kTypeDeletion || type == kTypeSingleDeletion ||
+        type == kTypeDeletionWithTimestamp) {
+      num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
+                         std::memory_order_relaxed);
+    }
+
+    if (bloom_filter_ && prefix_extractor_ &&
+        prefix_extractor_->InDomain(key_without_ts)) {
+      bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts));
+    }
+    if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+      bloom_filter_->Add(key_without_ts);
+    }
+
+    // The first sequence number inserted into the memtable
+    assert(first_seqno_ == 0 || s >= first_seqno_);
+    if (first_seqno_ == 0) {
+      first_seqno_.store(s, std::memory_order_relaxed);
+
+      if (earliest_seqno_ == kMaxSequenceNumber) {
+        earliest_seqno_.store(GetFirstSequenceNumber(),
+                              std::memory_order_relaxed);
+      }
+      assert(first_seqno_.load() >= earliest_seqno_.load());
+    }
+    assert(post_process_info == nullptr);
+    UpdateFlushState();
+  } else {
+    bool res = (hint == nullptr)
+                   ? table->InsertKeyConcurrently(handle)
+                   : table->InsertKeyWithHintConcurrently(handle, hint);
+    if (UNLIKELY(!res)) {
+      return Status::TryAgain("key+seq exists");
+    }
+
+    assert(post_process_info != nullptr);
+    post_process_info->num_entries++;
+    post_process_info->data_size += encoded_len;
+    if (type == kTypeDeletion) {
+      post_process_info->num_deletes++;
+    }
+
+    if (bloom_filter_ && prefix_extractor_ &&
+        prefix_extractor_->InDomain(key_without_ts)) {
+      bloom_filter_->AddConcurrently(
+          prefix_extractor_->Transform(key_without_ts));
+    }
+    if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+      bloom_filter_->AddConcurrently(key_without_ts);
+    }
+
+    // atomically update first_seqno_ and earliest_seqno_.
+    uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed);
+    while ((cur_seq_num == 0 || s < cur_seq_num) &&
+           !first_seqno_.compare_exchange_weak(cur_seq_num, s)) {
+    }
+    uint64_t cur_earliest_seqno =
+        earliest_seqno_.load(std::memory_order_relaxed);
+    while (
+        (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) &&
+        !first_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) {
+    }
+  }
+  if (type == kTypeRangeDeletion) {
+    auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+    size_t size = cached_range_tombstone_.Size();
+    if (allow_concurrent) {
+      range_del_mutex_.lock();
+    }
+    for (size_t i = 0; i < size; ++i) {
+      std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+          cached_range_tombstone_.AccessAtCore(i);
+      auto new_local_cache_ref = std::make_shared<
+          const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+      // It is okay for some reader to load old cache during invalidation as
+      // the new sequence number is not published yet.
+      // Each core will have a shared_ptr to a shared_ptr to the cached
+      // fragmented range tombstones, so that ref count is maintianed locally
+      // per-core using the per-core shared_ptr.
+      std::atomic_store_explicit(
+          local_cache_ref_ptr,
+          std::shared_ptr<FragmentedRangeTombstoneListCache>(
+              new_local_cache_ref, new_cache.get()),
+          std::memory_order_relaxed);
+    }
+    if (allow_concurrent) {
+      range_del_mutex_.unlock();
+    }
+    is_range_del_table_empty_.store(false, std::memory_order_relaxed);
+  }
+  UpdateOldestKeyTime();
+
+  TEST_SYNC_POINT_CALLBACK("MemTable::Add:BeforeReturn:Encoded", &encoded);
+  return Status::OK();
+}
+
+// Callback from MemTable::Get()
+namespace {
+
+struct Saver {
+  Status* status;
+  const LookupKey* key;
+  bool* found_final_value;  // Is value set correctly? Used by KeyMayExist
+  bool* merge_in_progress;
+  std::string* value;
+  PinnableWideColumns* columns;
+  SequenceNumber seq;
+  std::string* timestamp;
+  const MergeOperator* merge_operator;
+  // the merge operations encountered;
+  MergeContext* merge_context;
+  SequenceNumber max_covering_tombstone_seq;
+  MemTable* mem;
+  Logger* logger;
+  Statistics* statistics;
+  bool inplace_update_support;
+  bool do_merge;
+  SystemClock* clock;
+
+  ReadCallback* callback_;
+  bool* is_blob_index;
+  bool allow_data_in_errors;
+  size_t protection_bytes_per_key;
+  bool CheckCallback(SequenceNumber _seq) {
+    if (callback_) {
+      return callback_->IsVisible(_seq);
+    }
+    return true;
+  }
+};
+}  // anonymous namespace
+
+static bool SaveValue(void* arg, const char* entry) {
+  TEST_SYNC_POINT_CALLBACK("Memtable::SaveValue:Begin:entry", &entry);
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  assert(s != nullptr);
+  assert(!s->value || !s->columns);
+
+  if (s->protection_bytes_per_key > 0) {
+    *(s->status) = MemTable::VerifyEntryChecksum(
+        entry, s->protection_bytes_per_key, s->allow_data_in_errors);
+    if (!s->status->ok()) {
+      ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState());
+      // Memtable entry corrupted
+      return false;
+    }
+  }
+
+  MergeContext* merge_context = s->merge_context;
+  SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
+  const MergeOperator* merge_operator = s->merge_operator;
+
+  assert(merge_context != nullptr);
+
+  // Refer to comments under MemTable::Add() for entry format.
+  // Check that it belongs to same user key.
+  uint32_t key_length = 0;
+  const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+  assert(key_length >= 8);
+  Slice user_key_slice = Slice(key_ptr, key_length - 8);
+  const Comparator* user_comparator =
+      s->mem->GetInternalKeyComparator().user_comparator();
+  size_t ts_sz = user_comparator->timestamp_size();
+  if (ts_sz && s->timestamp && max_covering_tombstone_seq > 0) {
+    // timestamp should already be set to range tombstone timestamp
+    assert(s->timestamp->size() == ts_sz);
+  }
+  if (user_comparator->EqualWithoutTimestamp(user_key_slice,
+                                             s->key->user_key())) {
+    // Correct user key
+    const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+    ValueType type;
+    SequenceNumber seq;
+    UnPackSequenceAndType(tag, &seq, &type);
+    // If the value is not in the snapshot, skip it
+    if (!s->CheckCallback(seq)) {
+      return true;  // to continue to the next seq
+    }
+
+    if (s->seq == kMaxSequenceNumber) {
+      s->seq = seq;
+      if (s->seq > max_covering_tombstone_seq) {
+        if (ts_sz && s->timestamp != nullptr) {
+          // `timestamp` was set to range tombstone's timestamp before
+          // `SaveValue` is ever called. This key has a higher sequence number
+          // than range tombstone, and is the key with the highest seqno across
+          // all keys with this user_key, so we update timestamp here.
+          Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz);
+          s->timestamp->assign(ts.data(), ts_sz);
+        }
+      } else {
+        s->seq = max_covering_tombstone_seq;
+      }
+    }
+
+    if (ts_sz > 0 && s->timestamp != nullptr) {
+      if (!s->timestamp->empty()) {
+        assert(ts_sz == s->timestamp->size());
+      }
+      // TODO optimize for smaller size ts
+      const std::string kMaxTs(ts_sz, '\xff');
+      if (s->timestamp->empty() ||
+          user_comparator->CompareTimestamp(*(s->timestamp), kMaxTs) == 0) {
+        Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz);
+        s->timestamp->assign(ts.data(), ts_sz);
+      }
+    }
+
+    if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex ||
+         type == kTypeWideColumnEntity || type == kTypeDeletion ||
+         type == kTypeSingleDeletion || type == kTypeDeletionWithTimestamp) &&
+        max_covering_tombstone_seq > seq) {
+      type = kTypeRangeDeletion;
+    }
+    switch (type) {
+      case kTypeBlobIndex: {
+        if (!s->do_merge) {
+          *(s->status) = Status::NotSupported(
+              "GetMergeOperands not supported by stacked BlobDB");
+          *(s->found_final_value) = true;
+          return false;
+        }
+
+        if (*(s->merge_in_progress)) {
+          *(s->status) = Status::NotSupported(
+              "Merge operator not supported by stacked BlobDB");
+          *(s->found_final_value) = true;
+          return false;
+        }
+
+        if (s->is_blob_index == nullptr) {
+          ROCKS_LOG_ERROR(s->logger, "Encountered unexpected blob index.");
+          *(s->status) = Status::NotSupported(
+              "Encountered unexpected blob index. Please open DB with "
+              "ROCKSDB_NAMESPACE::blob_db::BlobDB.");
+          *(s->found_final_value) = true;
+          return false;
+        }
+
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->ReadLock();
+        }
+
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+
+        *(s->status) = Status::OK();
+
+        if (s->value) {
+          s->value->assign(v.data(), v.size());
+        } else if (s->columns) {
+          s->columns->SetPlainValue(v);
+        }
+
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->ReadUnlock();
+        }
+
+        *(s->found_final_value) = true;
+        *(s->is_blob_index) = true;
+
+        return false;
+      }
+      case kTypeValue: {
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->ReadLock();
+        }
+
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+
+        *(s->status) = Status::OK();
+
+        if (!s->do_merge) {
+          // Preserve the value with the goal of returning it as part of
+          // raw merge operands to the user
+          // TODO(yanqin) update MergeContext so that timestamps information
+          // can also be retained.
+
+          merge_context->PushOperand(
+              v, s->inplace_update_support == false /* operand_pinned */);
+        } else if (*(s->merge_in_progress)) {
+          assert(s->do_merge);
+
+          if (s->value || s->columns) {
+            std::string result;
+            *(s->status) = MergeHelper::TimedFullMerge(
+                merge_operator, s->key->user_key(), &v,
+                merge_context->GetOperands(), &result, s->logger, s->statistics,
+                s->clock, /* result_operand */ nullptr,
+                /* update_num_ops_stats */ true);
+
+            if (s->status->ok()) {
+              if (s->value) {
+                *(s->value) = std::move(result);
+              } else {
+                assert(s->columns);
+                s->columns->SetPlainValue(result);
+              }
+            }
+          }
+        } else if (s->value) {
+          s->value->assign(v.data(), v.size());
+        } else if (s->columns) {
+          s->columns->SetPlainValue(v);
+        }
+
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->ReadUnlock();
+        }
+
+        *(s->found_final_value) = true;
+
+        if (s->is_blob_index != nullptr) {
+          *(s->is_blob_index) = false;
+        }
+
+        return false;
+      }
+      case kTypeWideColumnEntity: {
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->ReadLock();
+        }
+
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+
+        *(s->status) = Status::OK();
+
+        if (!s->do_merge) {
+          // Preserve the value with the goal of returning it as part of
+          // raw merge operands to the user
+
+          Slice value_of_default;
+          *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
+              v, value_of_default);
+
+          if (s->status->ok()) {
+            merge_context->PushOperand(
+                value_of_default,
+                s->inplace_update_support == false /* operand_pinned */);
+          }
+        } else if (*(s->merge_in_progress)) {
+          assert(s->do_merge);
+
+          if (s->value) {
+            Slice value_of_default;
+            *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
+                v, value_of_default);
+            if (s->status->ok()) {
+              *(s->status) = MergeHelper::TimedFullMerge(
+                  merge_operator, s->key->user_key(), &value_of_default,
+                  merge_context->GetOperands(), s->value, s->logger,
+                  s->statistics, s->clock, /* result_operand */ nullptr,
+                  /* update_num_ops_stats */ true);
+            }
+          } else if (s->columns) {
+            std::string result;
+            *(s->status) = MergeHelper::TimedFullMergeWithEntity(
+                merge_operator, s->key->user_key(), v,
+                merge_context->GetOperands(), &result, s->logger, s->statistics,
+                s->clock, /* update_num_ops_stats */ true);
+
+            if (s->status->ok()) {
+              *(s->status) = s->columns->SetWideColumnValue(result);
+            }
+          }
+        } else if (s->value) {
+          Slice value_of_default;
+          *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
+              v, value_of_default);
+          if (s->status->ok()) {
+            s->value->assign(value_of_default.data(), value_of_default.size());
+          }
+        } else if (s->columns) {
+          *(s->status) = s->columns->SetWideColumnValue(v);
+        }
+
+        if (s->inplace_update_support) {
+          s->mem->GetLock(s->key->user_key())->ReadUnlock();
+        }
+
+        *(s->found_final_value) = true;
+
+        if (s->is_blob_index != nullptr) {
+          *(s->is_blob_index) = false;
+        }
+
+        return false;
+      }
+      case kTypeDeletion:
+      case kTypeDeletionWithTimestamp:
+      case kTypeSingleDeletion:
+      case kTypeRangeDeletion: {
+        if (*(s->merge_in_progress)) {
+          if (s->value || s->columns) {
+            std::string result;
+            *(s->status) = MergeHelper::TimedFullMerge(
+                merge_operator, s->key->user_key(), nullptr,
+                merge_context->GetOperands(), &result, s->logger, s->statistics,
+                s->clock, /* result_operand */ nullptr,
+                /* update_num_ops_stats */ true);
+
+            if (s->status->ok()) {
+              if (s->value) {
+                *(s->value) = std::move(result);
+              } else {
+                assert(s->columns);
+                s->columns->SetPlainValue(result);
+              }
+            }
+          }
+        } else {
+          *(s->status) = Status::NotFound();
+        }
+        *(s->found_final_value) = true;
+        return false;
+      }
+      case kTypeMerge: {
+        if (!merge_operator) {
+          *(s->status) = Status::InvalidArgument(
+              "merge_operator is not properly initialized.");
+          // Normally we continue the loop (return true) when we see a merge
+          // operand.  But in case of an error, we should stop the loop
+          // immediately and pretend we have found the value to stop further
+          // seek.  Otherwise, the later call will override this error status.
+          *(s->found_final_value) = true;
+          return false;
+        }
+        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+        *(s->merge_in_progress) = true;
+        merge_context->PushOperand(
+            v, s->inplace_update_support == false /* operand_pinned */);
+        if (s->do_merge && merge_operator->ShouldMerge(
+                               merge_context->GetOperandsDirectionBackward())) {
+          if (s->value || s->columns) {
+            std::string result;
+            *(s->status) = MergeHelper::TimedFullMerge(
+                merge_operator, s->key->user_key(), nullptr,
+                merge_context->GetOperands(), &result, s->logger, s->statistics,
+                s->clock, /* result_operand */ nullptr,
+                /* update_num_ops_stats */ true);
+
+            if (s->status->ok()) {
+              if (s->value) {
+                *(s->value) = std::move(result);
+              } else {
+                assert(s->columns);
+                s->columns->SetPlainValue(result);
+              }
+            }
+          }
+
+          *(s->found_final_value) = true;
+          return false;
+        }
+        return true;
+      }
+      default: {
+        std::string msg("Corrupted value not expected.");
+        if (s->allow_data_in_errors) {
+          msg.append("Unrecognized value type: " +
+                     std::to_string(static_cast<int>(type)) + ". ");
+          msg.append("User key: " + user_key_slice.ToString(/*hex=*/true) +
+                     ". ");
+          msg.append("seq: " + std::to_string(seq) + ".");
+        }
+        *(s->status) = Status::Corruption(msg.c_str());
+        return false;
+      }
+    }
+  }
+
+  // s->state could be Corrupt, merge or notfound
+  return false;
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value,
+                   PinnableWideColumns* columns, std::string* timestamp,
+                   Status* s, MergeContext* merge_context,
+                   SequenceNumber* max_covering_tombstone_seq,
+                   SequenceNumber* seq, const ReadOptions& read_opts,
+                   bool immutable_memtable, ReadCallback* callback,
+                   bool* is_blob_index, bool do_merge) {
+  // The sequence number is updated synchronously in version_set.h
+  if (IsEmpty()) {
+    // Avoiding recording stats for speed.
+    return false;
+  }
+  PERF_TIMER_GUARD(get_from_memtable_time);
+
+  std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+      NewRangeTombstoneIterator(read_opts,
+                                GetInternalKeySeqno(key.internal_key()),
+                                immutable_memtable));
+  if (range_del_iter != nullptr) {
+    SequenceNumber covering_seq =
+        range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key());
+    if (covering_seq > *max_covering_tombstone_seq) {
+      *max_covering_tombstone_seq = covering_seq;
+      if (timestamp) {
+        // Will be overwritten in SaveValue() if there is a point key with
+        // a higher seqno.
+        timestamp->assign(range_del_iter->timestamp().data(),
+                          range_del_iter->timestamp().size());
+      }
+    }
+  }
+
+  bool found_final_value = false;
+  bool merge_in_progress = s->IsMergeInProgress();
+  bool may_contain = true;
+  size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+  Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz);
+  bool bloom_checked = false;
+  if (bloom_filter_) {
+    // when both memtable_whole_key_filtering and prefix_extractor_ are set,
+    // only do whole key filtering for Get() to save CPU
+    if (moptions_.memtable_whole_key_filtering) {
+      may_contain = bloom_filter_->MayContain(user_key_without_ts);
+      bloom_checked = true;
+    } else {
+      assert(prefix_extractor_);
+      if (prefix_extractor_->InDomain(user_key_without_ts)) {
+        may_contain = bloom_filter_->MayContain(
+            prefix_extractor_->Transform(user_key_without_ts));
+        bloom_checked = true;
+      }
+    }
+  }
+
+  if (bloom_filter_ && !may_contain) {
+    // iter is null if prefix bloom says the key does not exist
+    PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+    *seq = kMaxSequenceNumber;
+  } else {
+    if (bloom_checked) {
+      PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+    }
+    GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
+                 is_blob_index, value, columns, timestamp, s, merge_context,
+                 seq, &found_final_value, &merge_in_progress);
+  }
+
+  // No change to value, since we have not yet found a Put/Delete
+  // Propagate corruption error
+  if (!found_final_value && merge_in_progress && !s->IsCorruption()) {
+    *s = Status::MergeInProgress();
+  }
+  PERF_COUNTER_ADD(get_from_memtable_count, 1);
+  return found_final_value;
+}
+
+void MemTable::GetFromTable(const LookupKey& key,
+                            SequenceNumber max_covering_tombstone_seq,
+                            bool do_merge, ReadCallback* callback,
+                            bool* is_blob_index, std::string* value,
+                            PinnableWideColumns* columns,
+                            std::string* timestamp, Status* s,
+                            MergeContext* merge_context, SequenceNumber* seq,
+                            bool* found_final_value, bool* merge_in_progress) {
+  Saver saver;
+  saver.status = s;
+  saver.found_final_value = found_final_value;
+  saver.merge_in_progress = merge_in_progress;
+  saver.key = &key;
+  saver.value = value;
+  saver.columns = columns;
+  saver.timestamp = timestamp;
+  saver.seq = kMaxSequenceNumber;
+  saver.mem = this;
+  saver.merge_context = merge_context;
+  saver.max_covering_tombstone_seq = max_covering_tombstone_seq;
+  saver.merge_operator = moptions_.merge_operator;
+  saver.logger = moptions_.info_log;
+  saver.inplace_update_support = moptions_.inplace_update_support;
+  saver.statistics = moptions_.statistics;
+  saver.clock = clock_;
+  saver.callback_ = callback;
+  saver.is_blob_index = is_blob_index;
+  saver.do_merge = do_merge;
+  saver.allow_data_in_errors = moptions_.allow_data_in_errors;
+  saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
+  table_->Get(key, &saver, SaveValue);
+  *seq = saver.seq;
+}
+
+void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                        ReadCallback* callback, bool immutable_memtable) {
+  // The sequence number is updated synchronously in version_set.h
+  if (IsEmpty()) {
+    // Avoiding recording stats for speed.
+    return;
+  }
+  PERF_TIMER_GUARD(get_from_memtable_time);
+
+  // For now, memtable Bloom filter is effectively disabled if there are any
+  // range tombstones. This is the simplest way to ensure range tombstones are
+  // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
+  bool no_range_del = read_options.ignore_range_deletions ||
+                      is_range_del_table_empty_.load(std::memory_order_relaxed);
+  MultiGetRange temp_range(*range, range->begin(), range->end());
+  if (bloom_filter_ && no_range_del) {
+    bool whole_key =
+        !prefix_extractor_ || moptions_.memtable_whole_key_filtering;
+    std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
+    std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
+    std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
+    int num_keys = 0;
+    for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+      if (whole_key) {
+        bloom_keys[num_keys] = iter->ukey_without_ts;
+        range_indexes[num_keys++] = iter.index();
+      } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
+        bloom_keys[num_keys] =
+            prefix_extractor_->Transform(iter->ukey_without_ts);
+        range_indexes[num_keys++] = iter.index();
+      }
+    }
+    bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
+    for (int i = 0; i < num_keys; ++i) {
+      if (!may_match[i]) {
+        temp_range.SkipIndex(range_indexes[i]);
+        PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+      } else {
+        PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+      }
+    }
+  }
+  for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+    bool found_final_value{false};
+    bool merge_in_progress = iter->s->IsMergeInProgress();
+    if (!no_range_del) {
+      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+          NewRangeTombstoneIteratorInternal(
+              read_options, GetInternalKeySeqno(iter->lkey->internal_key()),
+              immutable_memtable));
+      SequenceNumber covering_seq =
+          range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key());
+      if (covering_seq > iter->max_covering_tombstone_seq) {
+        iter->max_covering_tombstone_seq = covering_seq;
+        if (iter->timestamp) {
+          // Will be overwritten in SaveValue() if there is a point key with
+          // a higher seqno.
+          iter->timestamp->assign(range_del_iter->timestamp().data(),
+                                  range_del_iter->timestamp().size());
+        }
+      }
+    }
+    SequenceNumber dummy_seq;
+    GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
+                 callback, &iter->is_blob_index, iter->value->GetSelf(),
+                 /*columns=*/nullptr, iter->timestamp, iter->s,
+                 &(iter->merge_context), &dummy_seq, &found_final_value,
+                 &merge_in_progress);
+
+    if (!found_final_value && merge_in_progress) {
+      *(iter->s) = Status::MergeInProgress();
+    }
+
+    if (found_final_value) {
+      iter->value->PinSelf();
+      range->AddValueSize(iter->value->size());
+      range->MarkKeyDone(iter);
+      RecordTick(moptions_.statistics, MEMTABLE_HIT);
+      if (range->GetValueSize() > read_options.value_size_soft_limit) {
+        // Set all remaining keys in range to Abort
+        for (auto range_iter = range->begin(); range_iter != range->end();
+             ++range_iter) {
+          range->MarkKeyDone(range_iter);
+          *(range_iter->s) = Status::Aborted();
+        }
+        break;
+      }
+    }
+  }
+  PERF_COUNTER_ADD(get_from_memtable_count, 1);
+}
+
+Status MemTable::Update(SequenceNumber seq, ValueType value_type,
+                        const Slice& key, const Slice& value,
+                        const ProtectionInfoKVOS64* kv_prot_info) {
+  LookupKey lkey(key, seq);
+  Slice mem_key = lkey.memtable_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetDynamicPrefixIterator());
+  iter->Seek(lkey.internal_key(), mem_key.data());
+
+  if (iter->Valid()) {
+    // Refer to comments under MemTable::Add() for entry format.
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Equal(
+            Slice(key_ptr, key_length - 8), lkey.user_key())) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      ValueType type;
+      SequenceNumber existing_seq;
+      UnPackSequenceAndType(tag, &existing_seq, &type);
+      assert(existing_seq != seq);
+      if (type == value_type) {
+        Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+        uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+        uint32_t new_size = static_cast<uint32_t>(value.size());
+
+        // Update value, if new value size  <= previous value size
+        if (new_size <= prev_size) {
+          char* p =
+              EncodeVarint32(const_cast<char*>(key_ptr) + key_length, new_size);
+          WriteLock wl(GetLock(lkey.user_key()));
+          memcpy(p, value.data(), value.size());
+          assert((unsigned)((p + value.size()) - entry) ==
+                 (unsigned)(VarintLength(key_length) + key_length +
+                            VarintLength(value.size()) + value.size()));
+          RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+          if (kv_prot_info != nullptr) {
+            ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+            // `seq` is swallowed and `existing_seq` prevails.
+            updated_kv_prot_info.UpdateS(seq, existing_seq);
+            UpdateEntryChecksum(&updated_kv_prot_info, key, value, type,
+                                existing_seq, p + value.size());
+            Slice encoded(entry, p + value.size() - entry);
+            return VerifyEncodedEntry(encoded, updated_kv_prot_info);
+          } else {
+            UpdateEntryChecksum(nullptr, key, value, type, existing_seq,
+                                p + value.size());
+          }
+          return Status::OK();
+        }
+      }
+    }
+  }
+
+  // The latest value is not value_type or key doesn't exist
+  return Add(seq, value_type, key, value, kv_prot_info);
+}
+
+Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key,
+                                const Slice& delta,
+                                const ProtectionInfoKVOS64* kv_prot_info) {
+  LookupKey lkey(key, seq);
+  Slice memkey = lkey.memtable_key();
+
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetDynamicPrefixIterator());
+  iter->Seek(lkey.internal_key(), memkey.data());
+
+  if (iter->Valid()) {
+    // Refer to comments under MemTable::Add() for entry format.
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Equal(
+            Slice(key_ptr, key_length - 8), lkey.user_key())) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      ValueType type;
+      uint64_t existing_seq;
+      UnPackSequenceAndType(tag, &existing_seq, &type);
+      if (type == kTypeValue) {
+        Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+        uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+
+        char* prev_buffer = const_cast<char*>(prev_value.data());
+        uint32_t new_prev_size = prev_size;
+
+        std::string str_value;
+        WriteLock wl(GetLock(lkey.user_key()));
+        auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
+                                                 delta, &str_value);
+        if (status == UpdateStatus::UPDATED_INPLACE) {
+          // Value already updated by callback.
+          assert(new_prev_size <= prev_size);
+          if (new_prev_size < prev_size) {
+            // overwrite the new prev_size
+            char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                     new_prev_size);
+            if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+              // shift the value buffer as well.
+              memcpy(p, prev_buffer, new_prev_size);
+              prev_buffer = p;
+            }
+          }
+          RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+          UpdateFlushState();
+          Slice new_value(prev_buffer, new_prev_size);
+          if (kv_prot_info != nullptr) {
+            ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+            // `seq` is swallowed and `existing_seq` prevails.
+            updated_kv_prot_info.UpdateS(seq, existing_seq);
+            updated_kv_prot_info.UpdateV(delta, new_value);
+            Slice encoded(entry, prev_buffer + new_prev_size - entry);
+            UpdateEntryChecksum(&updated_kv_prot_info, key, new_value, type,
+                                existing_seq, prev_buffer + new_prev_size);
+            return VerifyEncodedEntry(encoded, updated_kv_prot_info);
+          } else {
+            UpdateEntryChecksum(nullptr, key, new_value, type, existing_seq,
+                                prev_buffer + new_prev_size);
+          }
+          return Status::OK();
+        } else if (status == UpdateStatus::UPDATED) {
+          Status s;
+          if (kv_prot_info != nullptr) {
+            ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+            updated_kv_prot_info.UpdateV(delta, str_value);
+            s = Add(seq, kTypeValue, key, Slice(str_value),
+                    &updated_kv_prot_info);
+          } else {
+            s = Add(seq, kTypeValue, key, Slice(str_value),
+                    nullptr /* kv_prot_info */);
+          }
+          RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
+          UpdateFlushState();
+          return s;
+        } else if (status == UpdateStatus::UPDATE_FAILED) {
+          // `UPDATE_FAILED` is named incorrectly. It indicates no update
+          // happened. It does not indicate a failure happened.
+          UpdateFlushState();
+          return Status::OK();
+        }
+      }
+    }
+  }
+  // The latest value is not `kTypeValue` or key doesn't exist
+  return Status::NotFound();
+}
+
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+  Slice memkey = key.memtable_key();
+
+  // A total ordered iterator is costly for some memtablerep (prefix aware
+  // reps). By passing in the user key, we allow efficient iterator creation.
+  // The iterator only needs to be ordered within the same user key.
+  std::unique_ptr<MemTableRep::Iterator> iter(
+      table_->GetDynamicPrefixIterator());
+  iter->Seek(key.internal_key(), memkey.data());
+
+  size_t num_successive_merges = 0;
+
+  for (; iter->Valid(); iter->Next()) {
+    const char* entry = iter->key();
+    uint32_t key_length = 0;
+    const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (!comparator_.comparator.user_comparator()->Equal(
+            Slice(iter_key_ptr, key_length - 8), key.user_key())) {
+      break;
+    }
+
+    const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+    ValueType type;
+    uint64_t unused;
+    UnPackSequenceAndType(tag, &unused, &type);
+    if (type != kTypeMerge) {
+      break;
+    }
+
+    ++num_successive_merges;
+  }
+
+  return num_successive_merges;
+}
+
+void MemTableRep::Get(const LookupKey& k, void* callback_args,
+                      bool (*callback_func)(void* arg, const char* entry)) {
+  auto iter = GetDynamicPrefixIterator();
+  for (iter->Seek(k.internal_key(), k.memtable_key().data());
+       iter->Valid() && callback_func(callback_args, iter->key());
+       iter->Next()) {
+  }
+}
+
+void MemTable::RefLogContainingPrepSection(uint64_t log) {
+  assert(log > 0);
+  auto cur = min_prep_log_referenced_.load();
+  while ((log < cur || cur == 0) &&
+         !min_prep_log_referenced_.compare_exchange_strong(cur, log)) {
+    cur = min_prep_log_referenced_.load();
+  }
+}
+
+uint64_t MemTable::GetMinLogContainingPrepSection() {
+  return min_prep_log_referenced_.load();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h
new file mode 100644
index 000000000..6db2721e4
--- /dev/null
+++ b/src/rocksdb/db/memtable.h
@@ -0,0 +1,664 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/kv_checksum.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/version_edit.h"
+#include "memory/allocator.h"
+#include "memory/concurrent_arena.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "table/multiget_context.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FlushJobInfo;
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+class SystemClock;
+
+struct ImmutableMemTableOptions {
+  explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions,
+                                    const MutableCFOptions& mutable_cf_options);
+  size_t arena_block_size;
+  uint32_t memtable_prefix_bloom_bits;
+  size_t memtable_huge_page_size;
+  bool memtable_whole_key_filtering;
+  bool inplace_update_support;
+  size_t inplace_update_num_locks;
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+  size_t max_successive_merges;
+  Statistics* statistics;
+  MergeOperator* merge_operator;
+  Logger* info_log;
+  bool allow_data_in_errors;
+  uint32_t protection_bytes_per_key;
+};
+
+// Batched counters to updated when inserting keys in one write batch.
+// In post process of the write batch, these can be updated together.
+// Only used in concurrent memtable insert case.
+struct MemTablePostProcessInfo {
+  uint64_t data_size = 0;
+  uint64_t num_entries = 0;
+  uint64_t num_deletes = 0;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// Note:  Many of the methods in this class have comments indicating that
+// external synchronization is required as these methods are not thread-safe.
+// It is up to higher layers of code to decide how to prevent concurrent
+// invocation of these methods.  This is usually done by acquiring either
+// the db mutex or the single writer thread.
+//
+// Some of these methods are documented to only require external
+// synchronization if this memtable is immutable.  Calling MarkImmutable() is
+// not sufficient to guarantee immutability.  It is up to higher layers of
+// code to determine if this MemTable can still be modified by other threads.
+// Eg: The Superversion stores a pointer to the current MemTable (that can
+// be modified) and a separate list of the MemTables that can no longer be
+// written to (aka the 'immutable memtables').
+class MemTable {
+ public:
+  struct KeyComparator : public MemTableRep::KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const override;
+    virtual int operator()(const char* prefix_len_key,
+                           const DecodedType& key) const override;
+  };
+
+  // MemTables are reference counted.  The initial reference count
+  // is zero and the caller must call Ref() at least once.
+  //
+  // earliest_seq should be the current SequenceNumber in the db such that any
+  // key inserted into this memtable will have an equal or larger seq number.
+  // (When a db is first created, the earliest sequence number will be 0).
+  // If the earliest sequence number is not known, kMaxSequenceNumber may be
+  // used, but this may prevent some transactions from succeeding until the
+  // first key is inserted into the memtable.
+  explicit MemTable(const InternalKeyComparator& comparator,
+                    const ImmutableOptions& ioptions,
+                    const MutableCFOptions& mutable_cf_options,
+                    WriteBufferManager* write_buffer_manager,
+                    SequenceNumber earliest_seq, uint32_t column_family_id);
+  // No copying allowed
+  MemTable(const MemTable&) = delete;
+  MemTable& operator=(const MemTable&) = delete;
+
+  // Do not delete this MemTable unless Unref() indicates it not in use.
+  ~MemTable();
+
+  // Increase reference count.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.
+  // If the refcount goes to zero return this memtable, otherwise return null.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  MemTable* Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      return this;
+    }
+    return nullptr;
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  size_t ApproximateMemoryUsage();
+
+  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
+  // require external synchronization. The value may be less accurate though
+  size_t ApproximateMemoryUsageFast() const {
+    return approximate_memory_usage_.load(std::memory_order_relaxed);
+  }
+
+  // used by MemTableListVersion::MemoryAllocatedBytesExcludingLast
+  size_t MemoryAllocatedBytes() const {
+    return table_->ApproximateMemoryUsage() +
+           range_del_table_->ApproximateMemoryUsage() +
+           arena_.MemoryAllocatedBytes();
+  }
+
+  // Returns a vector of unique random memtable entries of size 'sample_size'.
+  //
+  // Note: the entries are stored in the unordered_set as length-prefixed keys,
+  //       hence their representation in the set as "const char*".
+  // Note2: the size of the output set 'entries' is not enforced to be strictly
+  //        equal to 'target_sample_size'. Its final size might be slightly
+  //        greater or slightly less than 'target_sample_size'
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  // REQUIRES: SkipList memtable representation. This function is not
+  // implemented for any other type of memtable representation (vectorrep,
+  // hashskiplist,...).
+  void UniqueRandomSample(const uint64_t& target_sample_size,
+                          std::unordered_set<const char*>* entries) {
+    // TODO(bjlemaire): at the moment, only supported by skiplistrep.
+    // Extend it to all other memtable representations.
+    table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
+  }
+
+  // This method heuristically determines if the memtable should continue to
+  // host more data.
+  bool ShouldScheduleFlush() const {
+    return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
+  }
+
+  // Returns true if a flush should be scheduled and the caller should
+  // be the one to schedule it
+  bool MarkFlushScheduled() {
+    auto before = FLUSH_REQUESTED;
+    return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
+                                                std::memory_order_relaxed,
+                                                std::memory_order_relaxed);
+  }
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/dbformat.{h,cc} module.
+  //
+  // By default, it returns an iterator for prefix seek if prefix_extractor
+  // is configured in Options.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        Calling ~Iterator of the iterator will destroy all the states but
+  //        those allocated in arena.
+  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
+
+  // Returns an iterator that yields the range tombstones of the memtable.
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.
+  // @param immutable_memtable Whether this memtable is an immutable memtable.
+  // This information is not stored in memtable itself, so it needs to be
+  // specified by the caller. This flag is used internally to decide whether a
+  // cached fragmented range tombstone list can be returned. This cached version
+  // is constructed when a memtable becomes immutable. Setting the flag to false
+  // will always yield correct result, but may incur performance penalty as it
+  // always creates a new fragmented range tombstone list.
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+      const ReadOptions& read_options, SequenceNumber read_seq,
+      bool immutable_memtable);
+
+  Status VerifyEncodedEntry(Slice encoded,
+                            const ProtectionInfoKVOS64& kv_prot_info);
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  //
+  // REQUIRES: if allow_concurrent = false, external synchronization to prevent
+  // simultaneous operations on the same MemTable.
+  //
+  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+  // The next attempt should try a larger value for `seq`.
+  Status Add(SequenceNumber seq, ValueType type, const Slice& key,
+             const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
+             bool allow_concurrent = false,
+             MemTablePostProcessInfo* post_process_info = nullptr,
+             void** hint = nullptr);
+
+  // Used to Get value associated with key or Get Merge Operands associated
+  // with key.
+  // If do_merge = true the default behavior which is Get value for key is
+  // executed. Expected behavior is described right below.
+  // If memtable contains a value for key, store it in *value and return true.
+  // If memtable contains a deletion for key, store a NotFound() error
+  // in *status and return true.
+  // If memtable contains Merge operation as the most recent entry for a key,
+  //   and the merge process does not stop (not reaching a value or delete),
+  //   prepend the current merge operand to *operands.
+  //   store MergeInProgress in s, and return false.
+  // Else, return false.
+  // If any operation was found, its most recent sequence number
+  // will be stored in *seq on success (regardless of whether true/false is
+  // returned).  Otherwise, *seq will be set to kMaxSequenceNumber.
+  // On success, *s may be set to OK, NotFound, or MergeInProgress.  Any other
+  // status returned indicates a corruption or other unexpected error.
+  // If do_merge = false then any Merge Operands encountered for key are simply
+  // stored in merge_context.operands_list and never actually merged to get a
+  // final value. The raw Merge Operands are eventually returned to the user.
+  // @param immutable_memtable Whether this memtable is immutable. Used
+  // internally by NewRangeTombstoneIterator(). See comment above
+  // NewRangeTombstoneIterator() for more detail.
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+           const ReadOptions& read_opts, bool immutable_memtable,
+           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+           bool do_merge = true);
+
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq,
+           const ReadOptions& read_opts, bool immutable_memtable,
+           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+           bool do_merge = true) {
+    SequenceNumber seq;
+    return Get(key, value, columns, timestamp, s, merge_context,
+               max_covering_tombstone_seq, &seq, read_opts, immutable_memtable,
+               callback, is_blob_index, do_merge);
+  }
+
+  // @param immutable_memtable Whether this memtable is immutable. Used
+  // internally by NewRangeTombstoneIterator(). See comment above
+  // NewRangeTombstoneIterator() for more detail.
+  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                ReadCallback* callback, bool immutable_memtable);
+
+  // If `key` exists in current memtable with type value_type and the existing
+  // value is at least as large as the new value, updates it in-place. Otherwise
+  // adds the new value to the memtable out-of-place.
+  //
+  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+  // The next attempt should try a larger value for `seq`.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  Status Update(SequenceNumber seq, ValueType value_type, const Slice& key,
+                const Slice& value, const ProtectionInfoKVOS64* kv_prot_info);
+
+  // If `key` exists in current memtable with type `kTypeValue` and the existing
+  // value is at least as large as the new value, updates it in-place. Otherwise
+  // if `key` exists in current memtable with type `kTypeValue`, adds the new
+  // value to the memtable out-of-place.
+  //
+  // Returns `Status::NotFound` if `key` does not exist in current memtable or
+  // the latest version of `key` does not have `kTypeValue`.
+  //
+  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+  // The next attempt should try a larger value for `seq`.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  Status UpdateCallback(SequenceNumber seq, const Slice& key,
+                        const Slice& delta,
+                        const ProtectionInfoKVOS64* kv_prot_info);
+
+  // Returns the number of successive merge entries starting from the newest
+  // entry for the key up to the last non-merge entry or last entry for the
+  // key in the memtable.
+  size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+  // Update counters and flush status after inserting a whole write batch
+  // Used in concurrent memtable inserts.
+  void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
+    num_entries_.fetch_add(update_counters.num_entries,
+                           std::memory_order_relaxed);
+    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
+    if (update_counters.num_deletes != 0) {
+      num_deletes_.fetch_add(update_counters.num_deletes,
+                             std::memory_order_relaxed);
+    }
+    UpdateFlushState();
+  }
+
+  // Get total number of entries in the mem table.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  uint64_t num_entries() const {
+    return num_entries_.load(std::memory_order_relaxed);
+  }
+
+  // Get total number of deletes in the mem table.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  uint64_t num_deletes() const {
+    return num_deletes_.load(std::memory_order_relaxed);
+  }
+
+  uint64_t get_data_size() const {
+    return data_size_.load(std::memory_order_relaxed);
+  }
+
+  // Dynamically change the memtable's capacity. If set below the current usage,
+  // the next key added will trigger a flush. Can only increase size when
+  // memtable prefix bloom is disabled, since we can't easily allocate more
+  // space.
+  void UpdateWriteBufferSize(size_t new_write_buffer_size) {
+    if (bloom_filter_ == nullptr ||
+        new_write_buffer_size < write_buffer_size_) {
+      write_buffer_size_.store(new_write_buffer_size,
+                               std::memory_order_relaxed);
+    }
+  }
+
+  // Returns the edits area that is needed for flushing the memtable
+  VersionEdit* GetEdits() { return &edit_; }
+
+  // Returns if there is no entry inserted to the mem table.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  bool IsEmpty() const { return first_seqno_ == 0; }
+
+  // Returns the sequence number of the first element that was inserted
+  // into the memtable.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  SequenceNumber GetFirstSequenceNumber() {
+    return first_seqno_.load(std::memory_order_relaxed);
+  }
+
+  // Returns the sequence number of the first element that was inserted
+  // into the memtable.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable (unless this Memtable is immutable).
+  void SetFirstSequenceNumber(SequenceNumber first_seqno) {
+    return first_seqno_.store(first_seqno, std::memory_order_relaxed);
+  }
+
+  // Returns the sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into this
+  // memtable. It can then be assumed that any write with a larger(or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  //
+  // If the earliest sequence number could not be determined,
+  // kMaxSequenceNumber will be returned.
+  SequenceNumber GetEarliestSequenceNumber() {
+    return earliest_seqno_.load(std::memory_order_relaxed);
+  }
+
+  // Sets the sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into this
+  // memtable. It can then be assumed that any write with a larger(or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  // Used only for MemPurge operation
+  void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) {
+    return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed);
+  }
+
+  // DB's latest sequence ID when the memtable is created. This number
+  // may be updated to a more recent one before any key is inserted.
+  SequenceNumber GetCreationSeq() const { return creation_seq_; }
+
+  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }
+
+  // Returns the next active logfile number when this memtable is about to
+  // be flushed to storage
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+  // Sets the next active logfile number when this memtable is about to
+  // be flushed to storage
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+  // if this memtable contains data from a committed
+  // two phase transaction we must take note of the
+  // log which contains that data so we can know
+  // when to relese that log
+  void RefLogContainingPrepSection(uint64_t log);
+  uint64_t GetMinLogContainingPrepSection();
+
+  // Notify the underlying storage that no more items will be added.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  // After MarkImmutable() is called, you should not attempt to
+  // write anything to this MemTable().  (Ie. do not call Add() or Update()).
+  void MarkImmutable() {
+    table_->MarkReadOnly();
+    mem_tracker_.DoneAllocating();
+  }
+
+  // Notify the underlying storage that all data it contained has been
+  // persisted.
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  void MarkFlushed() { table_->MarkFlushed(); }
+
+  // return true if the current MemTableRep supports merge operator.
+  bool IsMergeOperatorSupported() const {
+    return table_->IsMergeOperatorSupported();
+  }
+
+  // return true if the current MemTableRep supports snapshots.
+  // inplace update prevents snapshots,
+  bool IsSnapshotSupported() const {
+    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
+  }
+
+  struct MemTableStats {
+    uint64_t size;
+    uint64_t count;
+  };
+
+  MemTableStats ApproximateStats(const Slice& start_ikey,
+                                 const Slice& end_ikey);
+
+  // Get the lock associated for the key
+  port::RWMutex* GetLock(const Slice& key);
+
+  const InternalKeyComparator& GetInternalKeyComparator() const {
+    return comparator_.comparator;
+  }
+
+  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
+    return &moptions_;
+  }
+
+  uint64_t ApproximateOldestKeyTime() const {
+    return oldest_key_time_.load(std::memory_order_relaxed);
+  }
+
+  // REQUIRES: db_mutex held.
+  void SetID(uint64_t id) { id_ = id; }
+
+  uint64_t GetID() const { return id_; }
+
+  void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
+
+  uint64_t GetFileNumber() const { return file_number_; }
+
+  void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
+
+  void SetFlushInProgress(bool in_progress) {
+    flush_in_progress_ = in_progress;
+  }
+
+#ifndef ROCKSDB_LITE
+  void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
+    flush_job_info_ = std::move(info);
+  }
+
+  std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
+    return std::move(flush_job_info_);
+  }
+#endif  // !ROCKSDB_LITE
+
+  // Returns a heuristic flush decision
+  bool ShouldFlushNow();
+
+  void ConstructFragmentedRangeTombstones();
+
+  // Returns whether a fragmented range tombstone list is already constructed
+  // for this memtable. It should be constructed right before a memtable is
+  // added to an immutable memtable list. Note that if a memtable does not have
+  // any range tombstone, then no range tombstone list will ever be constructed.
+  // @param allow_empty Specifies whether a memtable with no range tombstone is
+  // considered to have its fragmented range tombstone list constructed.
+  bool IsFragmentedRangeTombstonesConstructed(bool allow_empty = true) const {
+    if (allow_empty) {
+      return fragmented_range_tombstone_list_.get() != nullptr ||
+             is_range_del_table_empty_;
+    } else {
+      return fragmented_range_tombstone_list_.get() != nullptr;
+    }
+  }
+
+  // Returns Corruption status if verification fails.
+  static Status VerifyEntryChecksum(const char* entry,
+                                    size_t protection_bytes_per_key,
+                                    bool allow_data_in_errors = false);
+
+ private:
+  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
+
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+  friend class MemTableList;
+
+  KeyComparator comparator_;
+  const ImmutableMemTableOptions moptions_;
+  int refs_;
+  const size_t kArenaBlockSize;
+  AllocTracker mem_tracker_;
+  ConcurrentArena arena_;
+  std::unique_ptr<MemTableRep> table_;
+  std::unique_ptr<MemTableRep> range_del_table_;
+  std::atomic_bool is_range_del_table_empty_;
+
+  // Total data size of all data inserted
+  std::atomic<uint64_t> data_size_;
+  std::atomic<uint64_t> num_entries_;
+  std::atomic<uint64_t> num_deletes_;
+
+  // Dynamically changeable memtable option
+  std::atomic<size_t> write_buffer_size_;
+
+  // These are used to manage memtable flushes to storage
+  bool flush_in_progress_;  // started the flush
+  bool flush_completed_;    // finished the flush
+  uint64_t file_number_;    // filled up after flush is complete
+
+  // The updates to be applied to the transaction log when this
+  // memtable is flushed to storage.
+  VersionEdit edit_;
+
+  // The sequence number of the kv that was inserted first
+  std::atomic<SequenceNumber> first_seqno_;
+
+  // The db sequence number at the time of creation or kMaxSequenceNumber
+  // if not set.
+  std::atomic<SequenceNumber> earliest_seqno_;
+
+  SequenceNumber creation_seq_;
+
+  // The log files earlier than this number can be deleted.
+  uint64_t mem_next_logfile_number_;
+
+  // the earliest log containing a prepared section
+  // which has been inserted into this memtable.
+  std::atomic<uint64_t> min_prep_log_referenced_;
+
+  // rw locks for inplace updates
+  std::vector<port::RWMutex> locks_;
+
+  const SliceTransform* const prefix_extractor_;
+  std::unique_ptr<DynamicBloom> bloom_filter_;
+
+  std::atomic<FlushStateEnum> flush_state_;
+
+  SystemClock* clock_;
+
+  // Extract sequential insert prefixes.
+  const SliceTransform* insert_with_hint_prefix_extractor_;
+
+  // Insert hints for each prefix.
+  UnorderedMapH<Slice, void*, SliceHasher> insert_hints_;
+
+  // Timestamp of oldest key
+  std::atomic<uint64_t> oldest_key_time_;
+
+  // Memtable id to track flush.
+  uint64_t id_ = 0;
+
+  // Sequence number of the atomic flush that is responsible for this memtable.
+  // The sequence number of atomic flush is a seq, such that no writes with
+  // sequence numbers greater than or equal to seq are flushed, while all
+  // writes with sequence number smaller than seq are flushed.
+  SequenceNumber atomic_flush_seqno_;
+
+  // keep track of memory usage in table_, arena_, and range_del_table_.
+  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
+  std::atomic<uint64_t> approximate_memory_usage_;
+
+#ifndef ROCKSDB_LITE
+  // Flush job info of the current memtable.
+  std::unique_ptr<FlushJobInfo> flush_job_info_;
+#endif  // !ROCKSDB_LITE
+
+  // Updates flush_state_ using ShouldFlushNow()
+  void UpdateFlushState();
+
+  void UpdateOldestKeyTime();
+
+  void GetFromTable(const LookupKey& key,
+                    SequenceNumber max_covering_tombstone_seq, bool do_merge,
+                    ReadCallback* callback, bool* is_blob_index,
+                    std::string* value, PinnableWideColumns* columns,
+                    std::string* timestamp, Status* s,
+                    MergeContext* merge_context, SequenceNumber* seq,
+                    bool* found_final_value, bool* merge_in_progress);
+
+  // Always returns non-null and assumes certain pre-checks (e.g.,
+  // is_range_del_table_empty_) are done. This is only valid during the lifetime
+  // of the underlying memtable.
+  // read_seq and read_options.timestamp will be used as the upper bound
+  // for range tombstones.
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
+      const ReadOptions& read_options, SequenceNumber read_seq,
+      bool immutable_memtable);
+
+  // The fragmented range tombstones of this memtable.
+  // This is constructed when this memtable becomes immutable
+  // if !is_range_del_table_empty_.
+  std::unique_ptr<FragmentedRangeTombstoneList>
+      fragmented_range_tombstone_list_;
+
+  // makes sure there is a single range tombstone writer to invalidate cache
+  std::mutex range_del_mutex_;
+  CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
+      cached_range_tombstone_;
+
+  void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
+                           const Slice& key, const Slice& value, ValueType type,
+                           SequenceNumber s, char* checksum_ptr);
+};
+
+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc
new file mode 100644
index 000000000..1545003ad
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.cc
@@ -0,0 +1,991 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/memtable_list.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/memtable.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class InternalKeyComparator;
+class Mutex;
+class VersionSet;
+
+void MemTableListVersion::AddMemTable(MemTable* m) {
+  memlist_.push_front(m);
+  *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage();
+}
+
+void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
+                                        MemTable* m) {
+  if (m->Unref()) {
+    to_delete->push_back(m);
+    assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
+    *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
+  }
+}
+
+MemTableListVersion::MemTableListVersion(
+    size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old)
+    : max_write_buffer_number_to_maintain_(
+          old.max_write_buffer_number_to_maintain_),
+      max_write_buffer_size_to_maintain_(
+          old.max_write_buffer_size_to_maintain_),
+      parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
+  memlist_ = old.memlist_;
+  for (auto& m : memlist_) {
+    m->Ref();
+  }
+
+  memlist_history_ = old.memlist_history_;
+  for (auto& m : memlist_history_) {
+    m->Ref();
+  }
+}
+
+MemTableListVersion::MemTableListVersion(
+    size_t* parent_memtable_list_memory_usage,
+    int max_write_buffer_number_to_maintain,
+    int64_t max_write_buffer_size_to_maintain)
+    : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
+      max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
+      parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
+
+void MemTableListVersion::Ref() { ++refs_; }
+
+// called by superversion::clean()
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    // if to_delete is equal to nullptr it means we're confident
+    // that refs_ will not be zero
+    assert(to_delete != nullptr);
+    for (const auto& m : memlist_) {
+      UnrefMemTable(to_delete, m);
+    }
+    for (const auto& m : memlist_history_) {
+      UnrefMemTable(to_delete, m);
+    }
+    delete this;
+  }
+}
+
+int MemTableList::NumNotFlushed() const {
+  int size = static_cast<int>(current_->memlist_.size());
+  assert(num_flush_not_started_ <= size);
+  return size;
+}
+
+int MemTableList::NumFlushed() const {
+  return static_cast<int>(current_->memlist_history_.size());
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
+                              PinnableWideColumns* columns,
+                              std::string* timestamp, Status* s,
+                              MergeContext* merge_context,
+                              SequenceNumber* max_covering_tombstone_seq,
+                              SequenceNumber* seq, const ReadOptions& read_opts,
+                              ReadCallback* callback, bool* is_blob_index) {
+  return GetFromList(&memlist_, key, value, columns, timestamp, s,
+                     merge_context, max_covering_tombstone_seq, seq, read_opts,
+                     callback, is_blob_index);
+}
+
+void MemTableListVersion::MultiGet(const ReadOptions& read_options,
+                                   MultiGetRange* range,
+                                   ReadCallback* callback) {
+  for (auto memtable : memlist_) {
+    memtable->MultiGet(read_options, range, callback,
+                       true /* immutable_memtable */);
+    if (range->empty()) {
+      return;
+    }
+  }
+}
+
+bool MemTableListVersion::GetMergeOperands(
+    const LookupKey& key, Status* s, MergeContext* merge_context,
+    SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) {
+  for (MemTable* memtable : memlist_) {
+    bool done = memtable->Get(
+        key, /*value=*/nullptr, /*columns=*/nullptr, /*timestamp=*/nullptr, s,
+        merge_context, max_covering_tombstone_seq, read_opts,
+        true /* immutable_memtable */, nullptr, nullptr, false);
+    if (done) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool MemTableListVersion::GetFromHistory(
+    const LookupKey& key, std::string* value, PinnableWideColumns* columns,
+    std::string* timestamp, Status* s, MergeContext* merge_context,
+    SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+    const ReadOptions& read_opts, bool* is_blob_index) {
+  return GetFromList(&memlist_history_, key, value, columns, timestamp, s,
+                     merge_context, max_covering_tombstone_seq, seq, read_opts,
+                     nullptr /*read_callback*/, is_blob_index);
+}
+
+bool MemTableListVersion::GetFromList(
+    std::list<MemTable*>* list, const LookupKey& key, std::string* value,
+    PinnableWideColumns* columns, std::string* timestamp, Status* s,
+    MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq,
+    SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback,
+    bool* is_blob_index) {
+  *seq = kMaxSequenceNumber;
+
+  for (auto& memtable : *list) {
+    assert(memtable->IsFragmentedRangeTombstonesConstructed());
+    SequenceNumber current_seq = kMaxSequenceNumber;
+
+    bool done =
+        memtable->Get(key, value, columns, timestamp, s, merge_context,
+                      max_covering_tombstone_seq, &current_seq, read_opts,
+                      true /* immutable_memtable */, callback, is_blob_index);
+    if (*seq == kMaxSequenceNumber) {
+      // Store the most recent sequence number of any operation on this key.
+      // Since we only care about the most recent change, we only need to
+      // return the first operation found when searching memtables in
+      // reverse-chronological order.
+      // current_seq would be equal to kMaxSequenceNumber if the value was to be
+      // skipped. This allows seq to be assigned again when the next value is
+      // read.
+      *seq = current_seq;
+    }
+
+    if (done) {
+      assert(*seq != kMaxSequenceNumber || s->IsNotFound());
+      return true;
+    }
+    if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
+      return false;
+    }
+  }
+  return false;
+}
+
+Status MemTableListVersion::AddRangeTombstoneIterators(
+    const ReadOptions& read_opts, Arena* /*arena*/,
+    RangeDelAggregator* range_del_agg) {
+  assert(range_del_agg != nullptr);
+  // Except for snapshot read, using kMaxSequenceNumber is OK because these
+  // are immutable memtables.
+  SequenceNumber read_seq = read_opts.snapshot != nullptr
+                                ? read_opts.snapshot->GetSequenceNumber()
+                                : kMaxSequenceNumber;
+  for (auto& m : memlist_) {
+    assert(m->IsFragmentedRangeTombstonesConstructed());
+    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+        m->NewRangeTombstoneIterator(read_opts, read_seq,
+                                     true /* immutable_memtable */));
+    range_del_agg->AddTombstones(std::move(range_del_iter));
+  }
+  return Status::OK();
+}
+
+void MemTableListVersion::AddIterators(
+    const ReadOptions& options, std::vector<InternalIterator*>* iterator_list,
+    Arena* arena) {
+  for (auto& m : memlist_) {
+    iterator_list->push_back(m->NewIterator(options, arena));
+  }
+}
+
+void MemTableListVersion::AddIterators(const ReadOptions& options,
+                                       MergeIteratorBuilder* merge_iter_builder,
+                                       bool add_range_tombstone_iter) {
+  for (auto& m : memlist_) {
+    auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena());
+    if (!add_range_tombstone_iter || options.ignore_range_deletions) {
+      merge_iter_builder->AddIterator(mem_iter);
+    } else {
+      // Except for snapshot read, using kMaxSequenceNumber is OK because these
+      // are immutable memtables.
+      SequenceNumber read_seq = options.snapshot != nullptr
+                                    ? options.snapshot->GetSequenceNumber()
+                                    : kMaxSequenceNumber;
+      TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
+      auto range_del_iter = m->NewRangeTombstoneIterator(
+          options, read_seq, true /* immutale_memtable */);
+      if (range_del_iter == nullptr || range_del_iter->empty()) {
+        delete range_del_iter;
+      } else {
+        mem_tombstone_iter = new TruncatedRangeDelIterator(
+            std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
+            &m->GetInternalKeyComparator(), nullptr /* smallest */,
+            nullptr /* largest */);
+      }
+      merge_iter_builder->AddPointAndTombstoneIterator(mem_iter,
+                                                       mem_tombstone_iter);
+    }
+  }
+}
+
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+  uint64_t total_num = 0;
+  for (auto& m : memlist_) {
+    total_num += m->num_entries();
+  }
+  return total_num;
+}
+
+MemTable::MemTableStats MemTableListVersion::ApproximateStats(
+    const Slice& start_ikey, const Slice& end_ikey) {
+  MemTable::MemTableStats total_stats = {0, 0};
+  for (auto& m : memlist_) {
+    auto mStats = m->ApproximateStats(start_ikey, end_ikey);
+    total_stats.size += mStats.size;
+    total_stats.count += mStats.count;
+  }
+  return total_stats;
+}
+
+uint64_t MemTableListVersion::GetTotalNumDeletes() const {
+  uint64_t total_num = 0;
+  for (auto& m : memlist_) {
+    total_num += m->num_deletes();
+  }
+  return total_num;
+}
+
+SequenceNumber MemTableListVersion::GetEarliestSequenceNumber(
+    bool include_history) const {
+  if (include_history && !memlist_history_.empty()) {
+    return memlist_history_.back()->GetEarliestSequenceNumber();
+  } else if (!memlist_.empty()) {
+    return memlist_.back()->GetEarliestSequenceNumber();
+  } else {
+    return kMaxSequenceNumber;
+  }
+}
+
+SequenceNumber MemTableListVersion::GetFirstSequenceNumber() const {
+  SequenceNumber min_first_seqno = kMaxSequenceNumber;
+  // The first memtable in the list might not be the oldest one with mempurge
+  for (const auto& m : memlist_) {
+    min_first_seqno = std::min(m->GetFirstSequenceNumber(), min_first_seqno);
+  }
+  return min_first_seqno;
+}
+
+// caller is responsible for referencing m
+void MemTableListVersion::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  AddMemTable(m);
+  // m->MemoryAllocatedBytes() is added in MemoryAllocatedBytesExcludingLast
+  TrimHistory(to_delete, 0);
+}
+
+// Removes m from list of memtables not flushed.  Caller should NOT Unref m.
+void MemTableListVersion::Remove(MemTable* m,
+                                 autovector<MemTable*>* to_delete) {
+  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
+  memlist_.remove(m);
+
+  m->MarkFlushed();
+  if (max_write_buffer_size_to_maintain_ > 0 ||
+      max_write_buffer_number_to_maintain_ > 0) {
+    memlist_history_.push_front(m);
+    // Unable to get size of mutable memtable at this point, pass 0 to
+    // TrimHistory as a best effort.
+    TrimHistory(to_delete, 0);
+  } else {
+    UnrefMemTable(to_delete, m);
+  }
+}
+
+// return the total memory usage assuming the oldest flushed memtable is dropped
+size_t MemTableListVersion::MemoryAllocatedBytesExcludingLast() const {
+  size_t total_memtable_size = 0;
+  for (auto& memtable : memlist_) {
+    total_memtable_size += memtable->MemoryAllocatedBytes();
+  }
+  for (auto& memtable : memlist_history_) {
+    total_memtable_size += memtable->MemoryAllocatedBytes();
+  }
+  if (!memlist_history_.empty()) {
+    total_memtable_size -= memlist_history_.back()->MemoryAllocatedBytes();
+  }
+  return total_memtable_size;
+}
+
+bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
+  if (max_write_buffer_size_to_maintain_ > 0) {
+    // calculate the total memory usage after dropping the oldest flushed
+    // memtable, compare with max_write_buffer_size_to_maintain_ to decide
+    // whether to trim history
+    return MemoryAllocatedBytesExcludingLast() + usage >=
+           static_cast<size_t>(max_write_buffer_size_to_maintain_);
+  } else if (max_write_buffer_number_to_maintain_ > 0) {
+    return memlist_.size() + memlist_history_.size() >
+           static_cast<size_t>(max_write_buffer_number_to_maintain_);
+  } else {
+    return false;
+  }
+}
+
+// Make sure we don't use up too much space in history
+bool MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete,
+                                      size_t usage) {
+  bool ret = false;
+  while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) {
+    MemTable* x = memlist_history_.back();
+    memlist_history_.pop_back();
+
+    UnrefMemTable(to_delete, x);
+    ret = true;
+  }
+  return ret;
+}
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending() const {
+  if ((flush_requested_ && num_flush_not_started_ > 0) ||
+      (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
+    assert(imm_flush_needed.load(std::memory_order_relaxed));
+    return true;
+  }
+  return false;
+}
+
+bool MemTableList::IsFlushPendingOrRunning() const {
+  if (current_->memlist_.size() - num_flush_not_started_ > 0) {
+    // Flush is already running on at least one memtable
+    return true;
+  }
+  return IsFlushPending();
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id,
+                                        autovector<MemTable*>* ret,
+                                        uint64_t* max_next_log_number) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
+  const auto& memlist = current_->memlist_;
+  bool atomic_flush = false;
+
+  // Note: every time MemTableList::Add(mem) is called, it adds the new mem
+  // at the FRONT of the memlist (memlist.push_front(mem)). Therefore, by
+  // iterating through the memlist starting at the end, the vector<MemTable*>
+  // ret is filled with memtables already sorted in increasing MemTable ID.
+  // However, when the mempurge feature is activated, new memtables with older
+  // IDs will be added to the memlist.
+  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+    MemTable* m = *it;
+    if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) {
+      atomic_flush = true;
+    }
+    if (m->GetID() > max_memtable_id) {
+      break;
+    }
+    if (!m->flush_in_progress_) {
+      assert(!m->flush_completed_);
+      num_flush_not_started_--;
+      if (num_flush_not_started_ == 0) {
+        imm_flush_needed.store(false, std::memory_order_release);
+      }
+      m->flush_in_progress_ = true;  // flushing will start very soon
+      if (max_next_log_number) {
+        *max_next_log_number =
+            std::max(m->GetNextLogNumber(), *max_next_log_number);
+      }
+      ret->push_back(m);
+    } else if (!ret->empty()) {
+      // This `break` is necessary to prevent picking non-consecutive memtables
+      // in case `memlist` has one or more entries with
+      // `flush_in_progress_ == true` sandwiched between entries with
+      // `flush_in_progress_ == false`. This could happen after parallel flushes
+      // are picked and the one flushing older memtables is rolled back.
+      break;
+    }
+  }
+  if (!atomic_flush || num_flush_not_started_ == 0) {
+    flush_requested_ = false;  // start-flush request is complete
+  }
+}
+
+void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                                         uint64_t /*file_number*/) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_MEMTABLE_ROLLBACK);
+  assert(!mems.empty());
+
+  // If the flush was not successful, then just reset state.
+  // Maybe a succeeding attempt to flush will be successful.
+  for (MemTable* m : mems) {
+    assert(m->flush_in_progress_);
+    assert(m->file_number_ == 0);
+
+    m->flush_in_progress_ = false;
+    m->flush_completed_ = false;
+    m->edit_.Clear();
+    num_flush_not_started_++;
+  }
+  imm_flush_needed.store(true, std::memory_order_release);
+}
+
+// Try record a successful flush in the manifest file. It might just return
+// Status::OK letting a concurrent flush to do actual the recording..
+Status MemTableList::TryInstallMemtableFlushResults(
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    const autovector<MemTable*>& mems, LogsWithPrepTracker* prep_tracker,
+    VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+    LogBuffer* log_buffer,
+    std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
+    bool write_edits) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+  mu->AssertHeld();
+
+  // Flush was successful
+  // Record the status on the memtable object. Either this call or a call by a
+  // concurrent flush thread will read the status and write it to manifest.
+  for (size_t i = 0; i < mems.size(); ++i) {
+    // All the edits are associated with the first memtable of this batch.
+    assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+    mems[i]->flush_completed_ = true;
+    mems[i]->file_number_ = file_number;
+  }
+
+  // if some other thread is already committing, then return
+  Status s;
+  if (commit_in_progress_) {
+    TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress");
+    return s;
+  }
+
+  // Only a single thread can be executing this piece of code
+  commit_in_progress_ = true;
+
+  // Retry until all completed flushes are committed. New flushes can finish
+  // while the current thread is writing manifest where mutex is released.
+  while (s.ok()) {
+    auto& memlist = current_->memlist_;
+    // The back is the oldest; if flush_completed_ is not set to it, it means
+    // that we were assigned a more recent memtable. The memtables' flushes must
+    // be recorded in manifest in order. A concurrent flush thread, who is
+    // assigned to flush the oldest memtable, will later wake up and does all
+    // the pending writes to manifest, in order.
+    if (memlist.empty() || !memlist.back()->flush_completed_) {
+      break;
+    }
+    // scan all memtables from the earliest, and commit those
+    // (in that order) that have finished flushing. Memtables
+    // are always committed in the order that they were created.
+    uint64_t batch_file_number = 0;
+    size_t batch_count = 0;
+    autovector<VersionEdit*> edit_list;
+    autovector<MemTable*> memtables_to_flush;
+    // enumerate from the last (earliest) element to see how many batch finished
+    for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+      MemTable* m = *it;
+      if (!m->flush_completed_) {
+        break;
+      }
+      if (it == memlist.rbegin() || batch_file_number != m->file_number_) {
+        batch_file_number = m->file_number_;
+        if (m->edit_.GetBlobFileAdditions().empty()) {
+          ROCKS_LOG_BUFFER(log_buffer,
+                           "[%s] Level-0 commit table #%" PRIu64 " started",
+                           cfd->GetName().c_str(), m->file_number_);
+        } else {
+          ROCKS_LOG_BUFFER(log_buffer,
+                           "[%s] Level-0 commit table #%" PRIu64
+                           " (+%zu blob files) started",
+                           cfd->GetName().c_str(), m->file_number_,
+                           m->edit_.GetBlobFileAdditions().size());
+        }
+
+        edit_list.push_back(&m->edit_);
+        memtables_to_flush.push_back(m);
+#ifndef ROCKSDB_LITE
+        std::unique_ptr<FlushJobInfo> info = m->ReleaseFlushJobInfo();
+        if (info != nullptr) {
+          committed_flush_jobs_info->push_back(std::move(info));
+        }
+#else
+        (void)committed_flush_jobs_info;
+#endif  // !ROCKSDB_LITE
+      }
+      batch_count++;
+    }
+
+    // TODO(myabandeh): Not sure how batch_count could be 0 here.
+    if (batch_count > 0) {
+      uint64_t min_wal_number_to_keep = 0;
+      assert(edit_list.size() > 0);
+      if (vset->db_options()->allow_2pc) {
+        // Note that if mempurge is successful, the edit_list will
+        // not be applicable (contains info of new min_log number to keep,
+        // and level 0 file path of SST file created during normal flush,
+        // so both pieces of information are irrelevant after a successful
+        // mempurge operation).
+        min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
+            vset, *cfd, edit_list, memtables_to_flush, prep_tracker);
+
+        // We piggyback the information of earliest log file to keep in the
+        // manifest entry for the last file flushed.
+      } else {
+        min_wal_number_to_keep =
+            PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list);
+      }
+
+      VersionEdit wal_deletion;
+      wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
+      if (vset->db_options()->track_and_verify_wals_in_manifest) {
+        if (min_wal_number_to_keep >
+            vset->GetWalSet().GetMinWalNumberToKeep()) {
+          wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+        }
+        TEST_SYNC_POINT_CALLBACK(
+            "MemTableList::TryInstallMemtableFlushResults:"
+            "AfterComputeMinWalToKeep",
+            nullptr);
+      }
+      edit_list.push_back(&wal_deletion);
+
+      const auto manifest_write_cb = [this, cfd, batch_count, log_buffer,
+                                      to_delete, mu](const Status& status) {
+        RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer,
+                                      to_delete, mu);
+      };
+      if (write_edits) {
+        // this can release and reacquire the mutex.
+        s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu,
+                              db_directory, /*new_descriptor_log=*/false,
+                              /*column_family_options=*/nullptr,
+                              manifest_write_cb);
+      } else {
+        // If write_edit is false (e.g: successful mempurge),
+        // then remove old memtables, wake up manifest write queue threads,
+        // and don't commit anything to the manifest file.
+        RemoveMemTablesOrRestoreFlags(s, cfd, batch_count, log_buffer,
+                                      to_delete, mu);
+        // Note: cfd->SetLogNumber is only called when a VersionEdit
+        // is written to MANIFEST. When mempurge is succesful, we skip
+        // this step, therefore cfd->GetLogNumber is always is
+        // earliest log with data unflushed.
+        // Notify new head of manifest write queue.
+        // wake up all the waiting writers
+        // TODO(bjlemaire): explain full reason WakeUpWaitingManifestWriters
+        // needed or investigate more.
+        vset->WakeUpWaitingManifestWriters();
+      }
+    }
+  }
+  commit_in_progress_ = false;
+  return s;
+}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+  assert(static_cast<int>(current_->memlist_.size()) >= num_flush_not_started_);
+  InstallNewVersion();
+  // this method is used to move mutable memtable into an immutable list.
+  // since mutable memtable is already refcounted by the DBImpl,
+  // and when moving to the immutable list we don't unref it,
+  // we don't have to ref the memtable here. we just take over the
+  // reference from the DBImpl.
+  current_->Add(m, to_delete);
+  m->MarkImmutable();
+  num_flush_not_started_++;
+  if (num_flush_not_started_ == 1) {
+    imm_flush_needed.store(true, std::memory_order_release);
+  }
+  UpdateCachedValuesFromMemTableListVersion();
+  ResetTrimHistoryNeeded();
+}
+
+bool MemTableList::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) {
+  InstallNewVersion();
+  bool ret = current_->TrimHistory(to_delete, usage);
+  UpdateCachedValuesFromMemTableListVersion();
+  ResetTrimHistoryNeeded();
+  return ret;
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
+  size_t total_size = 0;
+  for (auto& memtable : current_->memlist_) {
+    total_size += memtable->ApproximateMemoryUsage();
+  }
+  return total_size;
+}
+
+size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
+
+size_t MemTableList::MemoryAllocatedBytesExcludingLast() const {
+  const size_t usage = current_memory_allocted_bytes_excluding_last_.load(
+      std::memory_order_relaxed);
+  return usage;
+}
+
+bool MemTableList::HasHistory() const {
+  const bool has_history = current_has_history_.load(std::memory_order_relaxed);
+  return has_history;
+}
+
+void MemTableList::UpdateCachedValuesFromMemTableListVersion() {
+  const size_t total_memtable_size =
+      current_->MemoryAllocatedBytesExcludingLast();
+  current_memory_allocted_bytes_excluding_last_.store(
+      total_memtable_size, std::memory_order_relaxed);
+
+  const bool has_history = current_->HasHistory();
+  current_has_history_.store(has_history, std::memory_order_relaxed);
+}
+
+uint64_t MemTableList::ApproximateOldestKeyTime() const {
+  if (!current_->memlist_.empty()) {
+    return current_->memlist_.back()->ApproximateOldestKeyTime();
+  }
+  return std::numeric_limits<uint64_t>::max();
+}
+
+void MemTableList::InstallNewVersion() {
+  if (current_->refs_ == 1) {
+    // we're the only one using the version, just keep using it
+  } else {
+    // somebody else holds the current version, we need to create new one
+    MemTableListVersion* version = current_;
+    current_ = new MemTableListVersion(&current_memory_usage_, *version);
+    current_->Ref();
+    version->Unref();
+  }
+}
+
+void MemTableList::RemoveMemTablesOrRestoreFlags(
+    const Status& s, ColumnFamilyData* cfd, size_t batch_count,
+    LogBuffer* log_buffer, autovector<MemTable*>* to_delete,
+    InstrumentedMutex* mu) {
+  assert(mu);
+  mu->AssertHeld();
+  assert(to_delete);
+  // we will be changing the version in the next code path,
+  // so we better create a new one, since versions are immutable
+  InstallNewVersion();
+
+  // All the later memtables that have the same filenum
+  // are part of the same batch. They can be committed now.
+  uint64_t mem_id = 1;  // how many memtables have been flushed.
+
+  // commit new state only if the column family is NOT dropped.
+  // The reason is as follows (refer to
+  // ColumnFamilyTest.FlushAndDropRaceCondition).
+  // If the column family is dropped, then according to LogAndApply, its
+  // corresponding flush operation is NOT written to the MANIFEST. This
+  // means the DB is not aware of the L0 files generated from the flush.
+  // By committing the new state, we remove the memtable from the memtable
+  // list. Creating an iterator on this column family will not be able to
+  // read full data since the memtable is removed, and the DB is not aware
+  // of the L0 files, causing MergingIterator unable to build child
+  // iterators. RocksDB contract requires that the iterator can be created
+  // on a dropped column family, and we must be able to
+  // read full data as long as column family handle is not deleted, even if
+  // the column family is dropped.
+  if (s.ok() && !cfd->IsDropped()) {  // commit new state
+    while (batch_count-- > 0) {
+      MemTable* m = current_->memlist_.back();
+      if (m->edit_.GetBlobFileAdditions().empty()) {
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "[%s] Level-0 commit table #%" PRIu64
+                         ": memtable #%" PRIu64 " done",
+                         cfd->GetName().c_str(), m->file_number_, mem_id);
+      } else {
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "[%s] Level-0 commit table #%" PRIu64
+                         " (+%zu blob files)"
+                         ": memtable #%" PRIu64 " done",
+                         cfd->GetName().c_str(), m->file_number_,
+                         m->edit_.GetBlobFileAdditions().size(), mem_id);
+      }
+
+      assert(m->file_number_ > 0);
+      current_->Remove(m, to_delete);
+      UpdateCachedValuesFromMemTableListVersion();
+      ResetTrimHistoryNeeded();
+      ++mem_id;
+    }
+  } else {
+    for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) {
+      MemTable* m = *it;
+      // commit failed. setup state so that we can flush again.
+      if (m->edit_.GetBlobFileAdditions().empty()) {
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64
+                         " failed",
+                         m->file_number_, mem_id);
+      } else {
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "Level-0 commit table #%" PRIu64
+                         " (+%zu blob files)"
+                         ": memtable #%" PRIu64 " failed",
+                         m->file_number_,
+                         m->edit_.GetBlobFileAdditions().size(), mem_id);
+      }
+
+      m->flush_completed_ = false;
+      m->flush_in_progress_ = false;
+      m->edit_.Clear();
+      num_flush_not_started_++;
+      m->file_number_ = 0;
+      imm_flush_needed.store(true, std::memory_order_release);
+      ++mem_id;
+    }
+  }
+}
+
+uint64_t MemTableList::PrecomputeMinLogContainingPrepSection(
+    const std::unordered_set<MemTable*>* memtables_to_flush) {
+  uint64_t min_log = 0;
+
+  for (auto& m : current_->memlist_) {
+    if (memtables_to_flush && memtables_to_flush->count(m)) {
+      continue;
+    }
+
+    auto log = m->GetMinLogContainingPrepSection();
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+  }
+
+  return min_log;
+}
+
+// Commit a successful atomic flush in the manifest file.
+Status InstallMemtableAtomicFlushResults(
+    const autovector<MemTableList*>* imm_lists,
+    const autovector<ColumnFamilyData*>& cfds,
+    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+    const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+    LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+    const autovector<FileMetaData*>& file_metas,
+    const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+        committed_flush_jobs_info,
+    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+    LogBuffer* log_buffer) {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+  mu->AssertHeld();
+
+  size_t num = mems_list.size();
+  assert(cfds.size() == num);
+  if (imm_lists != nullptr) {
+    assert(imm_lists->size() == num);
+  }
+  if (num == 0) {
+    return Status::OK();
+  }
+
+  for (size_t k = 0; k != num; ++k) {
+#ifndef NDEBUG
+    const auto* imm =
+        (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+    if (!mems_list[k]->empty()) {
+      assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID());
+    }
+#endif
+    assert(nullptr != file_metas[k]);
+    for (size_t i = 0; i != mems_list[k]->size(); ++i) {
+      assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0);
+      (*mems_list[k])[i]->SetFlushCompleted(true);
+      (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber());
+    }
+#ifndef ROCKSDB_LITE
+    if (committed_flush_jobs_info[k]) {
+      assert(!mems_list[k]->empty());
+      assert((*mems_list[k])[0]);
+      std::unique_ptr<FlushJobInfo> flush_job_info =
+          (*mems_list[k])[0]->ReleaseFlushJobInfo();
+      committed_flush_jobs_info[k]->push_back(std::move(flush_job_info));
+    }
+#else   //! ROCKSDB_LITE
+    (void)committed_flush_jobs_info;
+#endif  // ROCKSDB_LITE
+  }
+
+  Status s;
+
+  autovector<autovector<VersionEdit*>> edit_lists;
+  uint32_t num_entries = 0;
+  for (const auto mems : mems_list) {
+    assert(mems != nullptr);
+    autovector<VersionEdit*> edits;
+    assert(!mems->empty());
+    edits.emplace_back((*mems)[0]->GetEdits());
+    ++num_entries;
+    edit_lists.emplace_back(edits);
+  }
+
+  WalNumber min_wal_number_to_keep = 0;
+  if (vset->db_options()->allow_2pc) {
+    min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
+        vset, cfds, edit_lists, mems_list, prep_tracker);
+  } else {
+    min_wal_number_to_keep =
+        PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists);
+  }
+
+  VersionEdit wal_deletion;
+  wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
+  if (vset->db_options()->track_and_verify_wals_in_manifest &&
+      min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
+    wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+  }
+  edit_lists.back().push_back(&wal_deletion);
+  ++num_entries;
+
+  // Mark the version edits as an atomic group if the number of version edits
+  // exceeds 1.
+  if (cfds.size() > 1) {
+    for (size_t i = 0; i < edit_lists.size(); i++) {
+      assert((edit_lists[i].size() == 1) ||
+             ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1)));
+      for (auto& e : edit_lists[i]) {
+        e->MarkAtomicGroup(--num_entries);
+      }
+    }
+    assert(0 == num_entries);
+  }
+
+  // this can release and reacquire the mutex.
+  s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+                        db_directory);
+
+  for (size_t k = 0; k != cfds.size(); ++k) {
+    auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+    imm->InstallNewVersion();
+  }
+
+  if (s.ok() || s.IsColumnFamilyDropped()) {
+    for (size_t i = 0; i != cfds.size(); ++i) {
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+      for (auto m : *mems_list[i]) {
+        assert(m->GetFileNumber() > 0);
+        uint64_t mem_id = m->GetID();
+
+        const VersionEdit* const edit = m->GetEdits();
+        assert(edit);
+
+        if (edit->GetBlobFileAdditions().empty()) {
+          ROCKS_LOG_BUFFER(log_buffer,
+                           "[%s] Level-0 commit table #%" PRIu64
+                           ": memtable #%" PRIu64 " done",
+                           cfds[i]->GetName().c_str(), m->GetFileNumber(),
+                           mem_id);
+        } else {
+          ROCKS_LOG_BUFFER(log_buffer,
+                           "[%s] Level-0 commit table #%" PRIu64
+                           " (+%zu blob files)"
+                           ": memtable #%" PRIu64 " done",
+                           cfds[i]->GetName().c_str(), m->GetFileNumber(),
+                           edit->GetBlobFileAdditions().size(), mem_id);
+        }
+
+        imm->current_->Remove(m, to_delete);
+        imm->UpdateCachedValuesFromMemTableListVersion();
+        imm->ResetTrimHistoryNeeded();
+      }
+    }
+  } else {
+    for (size_t i = 0; i != cfds.size(); ++i) {
+      auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+      for (auto m : *mems_list[i]) {
+        uint64_t mem_id = m->GetID();
+
+        const VersionEdit* const edit = m->GetEdits();
+        assert(edit);
+
+        if (edit->GetBlobFileAdditions().empty()) {
+          ROCKS_LOG_BUFFER(log_buffer,
+                           "[%s] Level-0 commit table #%" PRIu64
+                           ": memtable #%" PRIu64 " failed",
+                           cfds[i]->GetName().c_str(), m->GetFileNumber(),
+                           mem_id);
+        } else {
+          ROCKS_LOG_BUFFER(log_buffer,
+                           "[%s] Level-0 commit table #%" PRIu64
+                           " (+%zu blob files)"
+                           ": memtable #%" PRIu64 " failed",
+                           cfds[i]->GetName().c_str(), m->GetFileNumber(),
+                           edit->GetBlobFileAdditions().size(), mem_id);
+        }
+
+        m->SetFlushCompleted(false);
+        m->SetFlushInProgress(false);
+        m->GetEdits()->Clear();
+        m->SetFileNumber(0);
+        imm->num_flush_not_started_++;
+      }
+      imm->imm_flush_needed.store(true, std::memory_order_release);
+    }
+  }
+
+  return s;
+}
+
+void MemTableList::RemoveOldMemTables(uint64_t log_number,
+                                      autovector<MemTable*>* to_delete) {
+  assert(to_delete != nullptr);
+  InstallNewVersion();
+  auto& memlist = current_->memlist_;
+  autovector<MemTable*> old_memtables;
+  for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+    MemTable* mem = *it;
+    if (mem->GetNextLogNumber() > log_number) {
+      break;
+    }
+    old_memtables.push_back(mem);
+  }
+
+  for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) {
+    MemTable* mem = *it;
+    current_->Remove(mem, to_delete);
+    --num_flush_not_started_;
+    if (0 == num_flush_not_started_) {
+      imm_flush_needed.store(false, std::memory_order_release);
+    }
+  }
+
+  UpdateCachedValuesFromMemTableListVersion();
+  ResetTrimHistoryNeeded();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list.h b/src/rocksdb/db/memtable_list.h
new file mode 100644
index 000000000..1ad28a59e
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.h
@@ -0,0 +1,471 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+class InternalKeyComparator;
+class InstrumentedMutex;
+class MergeIteratorBuilder;
+class MemTableList;
+
+struct FlushJobInfo;
+
+// keeps a list of immutable memtables in a vector. the list is immutable
+// if refcount is bigger than one. It is used as a state for Get() and
+// Iterator code paths
+//
+// This class is not thread-safe.  External synchronization is required
+// (such as holding the db mutex or being on the write thread).
+class MemTableListVersion {
+ public:
+  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+                               const MemTableListVersion& old);
+  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+                               int max_write_buffer_number_to_maintain,
+                               int64_t max_write_buffer_size_to_maintain);
+
+  void Ref();
+  void Unref(autovector<MemTable*>* to_delete = nullptr);
+
+  // Search all the memtables starting from the most recent one.
+  // Return the most recent value found, if any.
+  //
+  // If any operation was found for this key, its most recent sequence number
+  // will be stored in *seq on success (regardless of whether true/false is
+  // returned).  Otherwise, *seq will be set to kMaxSequenceNumber.
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+           bool* is_blob_index = nullptr);
+
+  bool Get(const LookupKey& key, std::string* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* s,
+           MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq,
+           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+           bool* is_blob_index = nullptr) {
+    SequenceNumber seq;
+    return Get(key, value, columns, timestamp, s, merge_context,
+               max_covering_tombstone_seq, &seq, read_opts, callback,
+               is_blob_index);
+  }
+
+  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                ReadCallback* callback);
+
+  // Returns all the merge operands corresponding to the key by searching all
+  // memtables starting from the most recent one.
+  bool GetMergeOperands(const LookupKey& key, Status* s,
+                        MergeContext* merge_context,
+                        SequenceNumber* max_covering_tombstone_seq,
+                        const ReadOptions& read_opts);
+
+  // Similar to Get(), but searches the Memtable history of memtables that
+  // have already been flushed.  Should only be used from in-memory only
+  // queries (such as Transaction validation) as the history may contain
+  // writes that are also present in the SST files.
+  bool GetFromHistory(const LookupKey& key, std::string* value,
+                      PinnableWideColumns* columns, std::string* timestamp,
+                      Status* s, MergeContext* merge_context,
+                      SequenceNumber* max_covering_tombstone_seq,
+                      SequenceNumber* seq, const ReadOptions& read_opts,
+                      bool* is_blob_index = nullptr);
+  bool GetFromHistory(const LookupKey& key, std::string* value,
+                      PinnableWideColumns* columns, std::string* timestamp,
+                      Status* s, MergeContext* merge_context,
+                      SequenceNumber* max_covering_tombstone_seq,
+                      const ReadOptions& read_opts,
+                      bool* is_blob_index = nullptr) {
+    SequenceNumber seq;
+    return GetFromHistory(key, value, columns, timestamp, s, merge_context,
+                          max_covering_tombstone_seq, &seq, read_opts,
+                          is_blob_index);
+  }
+
+  Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
+                                    RangeDelAggregator* range_del_agg);
+
+  void AddIterators(const ReadOptions& options,
+                    std::vector<InternalIterator*>* iterator_list,
+                    Arena* arena);
+
+  void AddIterators(const ReadOptions& options,
+                    MergeIteratorBuilder* merge_iter_builder,
+                    bool add_range_tombstone_iter);
+
+  uint64_t GetTotalNumEntries() const;
+
+  uint64_t GetTotalNumDeletes() const;
+
+  MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
+                                           const Slice& end_ikey);
+
+  // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
+  // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
+  // If include_history=true, will also search Memtables in MemTableList
+  // History.
+  SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
+
+  // Return the first sequence number from the memtable list, which is the
+  // smallest sequence number of all FirstSequenceNumber.
+  // Return kMaxSequenceNumber if the list is empty.
+  SequenceNumber GetFirstSequenceNumber() const;
+
+ private:
+  friend class MemTableList;
+
+  friend Status InstallMemtableAtomicFlushResults(
+      const autovector<MemTableList*>* imm_lists,
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<const autovector<MemTable*>*>& mems_list,
+      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+          committed_flush_jobs_info,
+      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+      LogBuffer* log_buffer);
+
+  // REQUIRE: m is an immutable memtable
+  void Add(MemTable* m, autovector<MemTable*>* to_delete);
+  // REQUIRE: m is an immutable memtable
+  void Remove(MemTable* m, autovector<MemTable*>* to_delete);
+
+  // Return true if memtable is trimmed
+  bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+  bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
+                   std::string* value, PinnableWideColumns* columns,
+                   std::string* timestamp, Status* s,
+                   MergeContext* merge_context,
+                   SequenceNumber* max_covering_tombstone_seq,
+                   SequenceNumber* seq, const ReadOptions& read_opts,
+                   ReadCallback* callback = nullptr,
+                   bool* is_blob_index = nullptr);
+
+  void AddMemTable(MemTable* m);
+
+  void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
+
+  // Calculate the total amount of memory used by memlist_ and memlist_history_
+  // excluding the last MemTable in memlist_history_. The reason for excluding
+  // the last MemTable is to see if dropping the last MemTable will keep total
+  // memory usage above or equal to max_write_buffer_size_to_maintain_
+  size_t MemoryAllocatedBytesExcludingLast() const;
+
+  // Whether this version contains flushed memtables that are only kept around
+  // for transaction conflict checking.
+  bool HasHistory() const { return !memlist_history_.empty(); }
+
+  bool MemtableLimitExceeded(size_t usage);
+
+  // Immutable MemTables that have not yet been flushed.
+  std::list<MemTable*> memlist_;
+
+  // MemTables that have already been flushed
+  // (used during Transaction validation)
+  std::list<MemTable*> memlist_history_;
+
+  // Maximum number of MemTables to keep in memory (including both flushed
+  const int max_write_buffer_number_to_maintain_;
+  // Maximum size of MemTables to keep in memory (including both flushed
+  // and not-yet-flushed tables).
+  const int64_t max_write_buffer_size_to_maintain_;
+
+  int refs_ = 0;
+
+  size_t* parent_memtable_list_memory_usage_;
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there are more than one immutable memtable, their
+// flushes can occur concurrently.  However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+//
+//
+// Other than imm_flush_needed and imm_trim_needed, this class is not
+// thread-safe and requires external synchronization (such as holding the db
+// mutex or being on the write thread.)
+class MemTableList {
+ public:
+  // A list of memtables.
+  explicit MemTableList(int min_write_buffer_number_to_merge,
+                        int max_write_buffer_number_to_maintain,
+                        int64_t max_write_buffer_size_to_maintain)
+      : imm_flush_needed(false),
+        imm_trim_needed(false),
+        min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+        current_(new MemTableListVersion(&current_memory_usage_,
+                                         max_write_buffer_number_to_maintain,
+                                         max_write_buffer_size_to_maintain)),
+        num_flush_not_started_(0),
+        commit_in_progress_(false),
+        flush_requested_(false),
+        current_memory_usage_(0),
+        current_memory_allocted_bytes_excluding_last_(0),
+        current_has_history_(false) {
+    current_->Ref();
+  }
+
+  // Should not delete MemTableList without making sure MemTableList::current()
+  // is Unref()'d.
+  ~MemTableList() {}
+
+  MemTableListVersion* current() const { return current_; }
+
+  // so that background threads can detect non-nullptr pointer to
+  // determine whether there is anything more to start flushing.
+  std::atomic<bool> imm_flush_needed;
+
+  std::atomic<bool> imm_trim_needed;
+
+  // Returns the total number of memtables in the list that haven't yet
+  // been flushed and logged.
+  int NumNotFlushed() const;
+
+  // Returns total number of memtables in the list that have been
+  // completely flushed and logged.
+  int NumFlushed() const;
+
+  // Returns true if there is at least one memtable on which flush has
+  // not yet started.
+  bool IsFlushPending() const;
+
+  // Returns true if there is at least one memtable that is pending flush or
+  // flushing.
+  bool IsFlushPendingOrRunning() const;
+
+  // Returns the earliest memtables that needs to be flushed. The returned
+  // memtables are guaranteed to be in the ascending order of created time.
+  void PickMemtablesToFlush(uint64_t max_memtable_id,
+                            autovector<MemTable*>* mems,
+                            uint64_t* max_next_log_number = nullptr);
+
+  // Reset status of the given memtable list back to pending state so that
+  // they can get picked up again on the next round of flush.
+  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+                             uint64_t file_number);
+
+  // Try commit a successful flush in the manifest file. It might just return
+  // Status::OK letting a concurrent flush to do the actual the recording.
+  Status TryInstallMemtableFlushResults(
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
+      VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+      LogBuffer* log_buffer,
+      std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
+      bool write_edits = true);
+
+  // New memtables are inserted at the front of the list.
+  // Takes ownership of the referenced held on *m by the caller of Add().
+  // By default, adding memtables will flag that the memtable list needs to be
+  // flushed, but in certain situations, like after a mempurge, we may want to
+  // avoid flushing the memtable list upon addition of a memtable.
+  void Add(MemTable* m, autovector<MemTable*>* to_delete);
+
+  // Returns an estimate of the number of bytes of data in use.
+  size_t ApproximateMemoryUsage();
+
+  // Returns the cached current_memory_allocted_bytes_excluding_last_ value.
+  size_t MemoryAllocatedBytesExcludingLast() const;
+
+  // Returns the cached current_has_history_ value.
+  bool HasHistory() const;
+
+  // Updates current_memory_allocted_bytes_excluding_last_ and
+  // current_has_history_ from MemTableListVersion. Must be called whenever
+  // InstallNewVersion is called.
+  void UpdateCachedValuesFromMemTableListVersion();
+
+  // `usage` is the current size of the mutable Memtable. When
+  // max_write_buffer_size_to_maintain is used, total size of mutable and
+  // immutable memtables is checked against it to decide whether to trim
+  // memtable list.
+  //
+  // Return true if memtable is trimmed
+  bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+  // Returns an estimate of the number of bytes of data used by
+  // the unflushed mem-tables.
+  size_t ApproximateUnflushedMemTablesMemoryUsage();
+
+  // Returns an estimate of the timestamp of the earliest key.
+  uint64_t ApproximateOldestKeyTime() const;
+
+  // Request a flush of all existing memtables to storage.  This will
+  // cause future calls to IsFlushPending() to return true if this list is
+  // non-empty (regardless of the min_write_buffer_number_to_merge
+  // parameter). This flush request will persist until the next time
+  // PickMemtablesToFlush() is called.
+  void FlushRequested() {
+    flush_requested_ = true;
+    // If there are some memtables stored in imm() that don't trigger
+    // flush (eg: mempurge output memtable), then update imm_flush_needed.
+    // Note: if race condition and imm_flush_needed is set to true
+    // when there is num_flush_not_started_==0, then there is no
+    // impact whatsoever. Imm_flush_needed is only used in an assert
+    // in IsFlushPending().
+    if (num_flush_not_started_ > 0) {
+      imm_flush_needed.store(true, std::memory_order_release);
+    }
+  }
+
+  bool HasFlushRequested() { return flush_requested_; }
+
+  // Returns true if a trim history should be scheduled and the caller should
+  // be the one to schedule it
+  bool MarkTrimHistoryNeeded() {
+    auto expected = false;
+    return imm_trim_needed.compare_exchange_strong(
+        expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
+  void ResetTrimHistoryNeeded() {
+    auto expected = true;
+    imm_trim_needed.compare_exchange_strong(
+        expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
+  }
+
+  // Copying allowed
+  // MemTableList(const MemTableList&);
+  // void operator=(const MemTableList&);
+
+  size_t* current_memory_usage() { return &current_memory_usage_; }
+
+  // Returns the min log containing the prep section after memtables listsed in
+  // `memtables_to_flush` are flushed and their status is persisted in manifest.
+  uint64_t PrecomputeMinLogContainingPrepSection(
+      const std::unordered_set<MemTable*>* memtables_to_flush = nullptr);
+
+  uint64_t GetEarliestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return std::numeric_limits<uint64_t>::max();
+    }
+    return memlist.back()->GetID();
+  }
+
+  uint64_t GetLatestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return 0;
+    }
+    return memlist.front()->GetID();
+  }
+
+  void AssignAtomicFlushSeq(const SequenceNumber& seq) {
+    const auto& memlist = current_->memlist_;
+    // Scan the memtable list from new to old
+    for (auto it = memlist.begin(); it != memlist.end(); ++it) {
+      MemTable* mem = *it;
+      if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
+        mem->atomic_flush_seqno_ = seq;
+      } else {
+        // Earlier memtables must have been assigned a atomic flush seq, no
+        // need to continue scan.
+        break;
+      }
+    }
+  }
+
+  // Used only by DBImplSecondary during log replay.
+  // Remove memtables whose data were written before the WAL with log_number
+  // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
+  // not freed, but put into a vector for future deref and reclamation.
+  void RemoveOldMemTables(uint64_t log_number,
+                          autovector<MemTable*>* to_delete);
+
+ private:
+  friend Status InstallMemtableAtomicFlushResults(
+      const autovector<MemTableList*>* imm_lists,
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<const autovector<MemTable*>*>& mems_list,
+      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+          committed_flush_jobs_info,
+      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+      LogBuffer* log_buffer);
+
+  // DB mutex held
+  void InstallNewVersion();
+
+  // DB mutex held
+  // Called after writing to MANIFEST
+  void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd,
+                                     size_t batch_count, LogBuffer* log_buffer,
+                                     autovector<MemTable*>* to_delete,
+                                     InstrumentedMutex* mu);
+
+  const int min_write_buffer_number_to_merge_;
+
+  MemTableListVersion* current_;
+
+  // the number of elements that still need flushing
+  int num_flush_not_started_;
+
+  // committing in progress
+  bool commit_in_progress_;
+
+  // Requested a flush of memtables to storage. It's possible to request that
+  // a subset of memtables be flushed.
+  bool flush_requested_;
+
+  // The current memory usage.
+  size_t current_memory_usage_;
+
+  // Cached value of current_->MemoryAllocatedBytesExcludingLast().
+  std::atomic<size_t> current_memory_allocted_bytes_excluding_last_;
+
+  // Cached value of current_->HasHistory().
+  std::atomic<bool> current_has_history_;
+};
+
+// Installs memtable atomic flush results.
+// In most cases, imm_lists is nullptr, and the function simply uses the
+// immutable memtable lists associated with the cfds. There are unit tests that
+// installs flush results for external immutable memtable lists other than the
+// cfds' own immutable memtable lists, e.g. MemTableLIstTest. In this case,
+// imm_lists parameter is not nullptr.
+extern Status InstallMemtableAtomicFlushResults(
+    const autovector<MemTableList*>* imm_lists,
+    const autovector<ColumnFamilyData*>& cfds,
+    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+    const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+    LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+    const autovector<FileMetaData*>& file_meta,
+    const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+        committed_flush_jobs_info,
+    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+    LogBuffer* log_buffer);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list_test.cc b/src/rocksdb/db/memtable_list_test.cc
new file mode 100644
index 000000000..8242061af
--- /dev/null
+++ b/src/rocksdb/db/memtable_list_test.cc
@@ -0,0 +1,1039 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/memtable_list.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTableListTest : public testing::Test {
+ public:
+  std::string dbname;
+  DB* db;
+  Options options;
+  std::vector<ColumnFamilyHandle*> handles;
+  std::atomic<uint64_t> file_number;
+
+  MemTableListTest() : db(nullptr), file_number(1) {
+    dbname = test::PerThreadDBPath("memtable_list_test");
+    options.create_if_missing = true;
+    EXPECT_OK(DestroyDB(dbname, options));
+  }
+
+  // Create a test db if not yet created
+  void CreateDB() {
+    if (db == nullptr) {
+      options.create_if_missing = true;
+      EXPECT_OK(DestroyDB(dbname, options));
+      // Open DB only with default column family
+      ColumnFamilyOptions cf_options;
+      std::vector<ColumnFamilyDescriptor> cf_descs;
+      cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options);
+      Status s = DB::Open(options, dbname, cf_descs, &handles, &db);
+      EXPECT_OK(s);
+
+      ColumnFamilyOptions cf_opt1, cf_opt2;
+      cf_opt1.cf_paths.emplace_back(dbname + "_one_1",
+                                    std::numeric_limits<uint64_t>::max());
+      cf_opt2.cf_paths.emplace_back(dbname + "_two_1",
+                                    std::numeric_limits<uint64_t>::max());
+      int sz = static_cast<int>(handles.size());
+      handles.resize(sz + 2);
+      s = db->CreateColumnFamily(cf_opt1, "one", &handles[1]);
+      EXPECT_OK(s);
+      s = db->CreateColumnFamily(cf_opt2, "two", &handles[2]);
+      EXPECT_OK(s);
+
+      cf_descs.emplace_back("one", cf_options);
+      cf_descs.emplace_back("two", cf_options);
+    }
+  }
+
+  ~MemTableListTest() override {
+    if (db) {
+      std::vector<ColumnFamilyDescriptor> cf_descs(handles.size());
+#ifndef ROCKSDB_LITE
+      for (int i = 0; i != static_cast<int>(handles.size()); ++i) {
+        EXPECT_OK(handles[i]->GetDescriptor(&cf_descs[i]));
+      }
+#endif  // !ROCKSDB_LITE
+      for (auto h : handles) {
+        if (h) {
+          EXPECT_OK(db->DestroyColumnFamilyHandle(h));
+        }
+      }
+      handles.clear();
+      delete db;
+      db = nullptr;
+      EXPECT_OK(DestroyDB(dbname, options, cf_descs));
+    }
+  }
+
+  // Calls MemTableList::TryInstallMemtableFlushResults() and sets up all
+  // structures needed to call this function.
+  Status Mock_InstallMemtableFlushResults(
+      MemTableList* list, const MutableCFOptions& mutable_cf_options,
+      const autovector<MemTable*>& m, autovector<MemTable*>* to_delete) {
+    // Create a mock Logger
+    test::NullLogger logger;
+    LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+    CreateDB();
+    // Create a mock VersionSet
+    DBOptions db_options;
+    ImmutableDBOptions immutable_db_options(db_options);
+    EnvOptions env_options;
+    std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+    WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+    WriteController write_controller(10000000u);
+
+    VersionSet versions(dbname, &immutable_db_options, env_options,
+                        table_cache.get(), &write_buffer_manager,
+                        &write_controller, /*block_cache_tracer=*/nullptr,
+                        /*io_tracer=*/nullptr, /*db_id*/ "",
+                        /*db_session_id*/ "");
+    std::vector<ColumnFamilyDescriptor> cf_descs;
+    cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+    cf_descs.emplace_back("one", ColumnFamilyOptions());
+    cf_descs.emplace_back("two", ColumnFamilyOptions());
+
+    EXPECT_OK(versions.Recover(cf_descs, false));
+
+    // Create mock default ColumnFamilyData
+    auto column_family_set = versions.GetColumnFamilySet();
+    LogsWithPrepTracker dummy_prep_tracker;
+    auto cfd = column_family_set->GetDefault();
+    EXPECT_TRUE(nullptr != cfd);
+    uint64_t file_num = file_number.fetch_add(1);
+    IOStatus io_s;
+    // Create dummy mutex.
+    InstrumentedMutex mutex;
+    InstrumentedMutexLock l(&mutex);
+    std::list<std::unique_ptr<FlushJobInfo>> flush_jobs_info;
+    Status s = list->TryInstallMemtableFlushResults(
+        cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex,
+        file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info);
+    EXPECT_OK(io_s);
+    return s;
+  }
+
+  // Calls MemTableList::InstallMemtableFlushResults() and sets up all
+  // structures needed to call this function.
+  Status Mock_InstallMemtableAtomicFlushResults(
+      autovector<MemTableList*>& lists, const autovector<uint32_t>& cf_ids,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<const autovector<MemTable*>*>& mems_list,
+      autovector<MemTable*>* to_delete) {
+    // Create a mock Logger
+    test::NullLogger logger;
+    LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+    CreateDB();
+    // Create a mock VersionSet
+    DBOptions db_options;
+
+    ImmutableDBOptions immutable_db_options(db_options);
+    EnvOptions env_options;
+    std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+    WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+    WriteController write_controller(10000000u);
+
+    VersionSet versions(dbname, &immutable_db_options, env_options,
+                        table_cache.get(), &write_buffer_manager,
+                        &write_controller, /*block_cache_tracer=*/nullptr,
+                        /*io_tracer=*/nullptr, /*db_id*/ "",
+                        /*db_session_id*/ "");
+    std::vector<ColumnFamilyDescriptor> cf_descs;
+    cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+    cf_descs.emplace_back("one", ColumnFamilyOptions());
+    cf_descs.emplace_back("two", ColumnFamilyOptions());
+    EXPECT_OK(versions.Recover(cf_descs, false));
+
+    // Create mock default ColumnFamilyData
+
+    auto column_family_set = versions.GetColumnFamilySet();
+
+    LogsWithPrepTracker dummy_prep_tracker;
+    autovector<ColumnFamilyData*> cfds;
+    for (int i = 0; i != static_cast<int>(cf_ids.size()); ++i) {
+      cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i]));
+      EXPECT_NE(nullptr, cfds[i]);
+    }
+    std::vector<FileMetaData> file_metas;
+    file_metas.reserve(cf_ids.size());
+    for (size_t i = 0; i != cf_ids.size(); ++i) {
+      FileMetaData meta;
+      uint64_t file_num = file_number.fetch_add(1);
+      meta.fd = FileDescriptor(file_num, 0, 0);
+      file_metas.emplace_back(meta);
+    }
+    autovector<FileMetaData*> file_meta_ptrs;
+    for (auto& meta : file_metas) {
+      file_meta_ptrs.push_back(&meta);
+    }
+    std::vector<std::list<std::unique_ptr<FlushJobInfo>>>
+        committed_flush_jobs_info_storage(cf_ids.size());
+    autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+        committed_flush_jobs_info;
+    for (int i = 0; i < static_cast<int>(cf_ids.size()); ++i) {
+      committed_flush_jobs_info.push_back(
+          &committed_flush_jobs_info_storage[i]);
+    }
+
+    InstrumentedMutex mutex;
+    InstrumentedMutexLock l(&mutex);
+    return InstallMemtableAtomicFlushResults(
+        &lists, cfds, mutable_cf_options_list, mems_list, &versions,
+        nullptr /* prep_tracker */, &mutex, file_meta_ptrs,
+        committed_flush_jobs_info, to_delete, nullptr, &log_buffer);
+  }
+};
+
+TEST_F(MemTableListTest, Empty) {
+  // Create an empty MemTableList and validate basic functions.
+  MemTableList list(1, 0, 0);
+
+  ASSERT_EQ(0, list.NumNotFlushed());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_FALSE(list.IsFlushPending());
+
+  autovector<MemTable*> mems;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &mems);
+  ASSERT_EQ(0, mems.size());
+
+  autovector<MemTable*> to_delete;
+  list.current()->Unref(&to_delete);
+  ASSERT_EQ(0, to_delete.size());
+}
+
+TEST_F(MemTableListTest, GetTest) {
+  // Create MemTableList
+  int min_write_buffer_number_to_merge = 2;
+  int max_write_buffer_number_to_maintain = 0;
+  int64_t max_write_buffer_size_to_maintain = 0;
+  MemTableList list(min_write_buffer_number_to_merge,
+                    max_write_buffer_number_to_maintain,
+                    max_write_buffer_size_to_maintain);
+
+  SequenceNumber seq = 1;
+  std::string value;
+  Status s;
+  MergeContext merge_context;
+  InternalKeyComparator ikey_cmp(options.comparator);
+  SequenceNumber max_covering_tombstone_seq = 0;
+  autovector<MemTable*> to_delete;
+
+  LookupKey lkey("key1", seq);
+  bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr,
+                                   /*timestamp=*/nullptr, &s, &merge_context,
+                                   &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_FALSE(found);
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableOptions ioptions(options);
+
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+  mem->Ref();
+
+  // Write some keys to this memtable.
+  ASSERT_OK(
+      mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+  ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2",
+                     nullptr /* kv_prot_info */));
+  ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", "value1",
+                     nullptr /* kv_prot_info */));
+  ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2",
+                     nullptr /* kv_prot_info */));
+
+  // Fetch the newly written keys
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ(value, "value1");
+
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr,
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
+  // MemTable found out that this key is *not* found (at this sequence#)
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ(value, "value2.2");
+
+  ASSERT_EQ(4, mem->num_entries());
+  ASSERT_EQ(1, mem->num_deletes());
+
+  // Add memtable to list
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem->ConstructFragmentedRangeTombstones();
+  list.Add(mem, &to_delete);
+
+  SequenceNumber saved_seq = seq;
+
+  // Create another memtable and write some keys to it
+  WriteBufferManager wb2(options.db_write_buffer_size);
+  MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+                                kMaxSequenceNumber, 0 /* column_family_id */);
+  mem2->Ref();
+
+  ASSERT_OK(
+      mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+  ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3",
+                      nullptr /* kv_prot_info */));
+
+  // Add second memtable to list
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem2->ConstructFragmentedRangeTombstones();
+  list.Add(mem2, &to_delete);
+
+  // Fetch keys via MemTableList
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = list.current()->Get(LookupKey("key1", saved_seq), &value,
+                              /*columns=*/nullptr, /*timestamp=*/nullptr, &s,
+                              &merge_context, &max_covering_tombstone_seq,
+                              ReadOptions());
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ("value1", value);
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ(value, "value2.3");
+
+  merge_context.Clear();
+  found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr,
+                              /*timestamp=*/nullptr, &s, &merge_context,
+                              &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_FALSE(found);
+
+  ASSERT_EQ(2, list.NumNotFlushed());
+
+  list.current()->Unref(&to_delete);
+  for (MemTable* m : to_delete) {
+    delete m;
+  }
+}
+
+TEST_F(MemTableListTest, GetFromHistoryTest) {
+  // Create MemTableList
+  int min_write_buffer_number_to_merge = 2;
+  int max_write_buffer_number_to_maintain = 2;
+  int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize;
+  MemTableList list(min_write_buffer_number_to_merge,
+                    max_write_buffer_number_to_maintain,
+                    max_write_buffer_size_to_maintain);
+
+  SequenceNumber seq = 1;
+  std::string value;
+  Status s;
+  MergeContext merge_context;
+  InternalKeyComparator ikey_cmp(options.comparator);
+  SequenceNumber max_covering_tombstone_seq = 0;
+  autovector<MemTable*> to_delete;
+
+  LookupKey lkey("key1", seq);
+  bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr,
+                                   /*timestamp=*/nullptr, &s, &merge_context,
+                                   &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_FALSE(found);
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableOptions ioptions(options);
+
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+  mem->Ref();
+
+  // Write some keys to this memtable.
+  ASSERT_OK(
+      mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+  ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2",
+                     nullptr /* kv_prot_info */));
+  ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2",
+                     nullptr /* kv_prot_info */));
+
+  // Fetch the newly written keys
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
+  // MemTable found out that this key is *not* found (at this sequence#)
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
+                   /*timestamp*/ nullptr, &s, &merge_context,
+                   &max_covering_tombstone_seq, ReadOptions(),
+                   false /* immutable_memtable */);
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ(value, "value2.2");
+
+  // Add memtable to list
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem->ConstructFragmentedRangeTombstones();
+  list.Add(mem, &to_delete);
+  ASSERT_EQ(0, to_delete.size());
+
+  // Fetch keys via MemTableList
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ("value2.2", value);
+
+  // Flush this memtable from the list.
+  // (It will then be a part of the memtable history).
+  autovector<MemTable*> to_flush;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(1, to_flush.size());
+
+  MutableCFOptions mutable_cf_options(options);
+  s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+                                       &to_delete);
+  ASSERT_OK(s);
+  ASSERT_EQ(0, list.NumNotFlushed());
+  ASSERT_EQ(1, list.NumFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Verify keys are no longer in MemTableList
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_FALSE(found);
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_FALSE(found);
+
+  // Verify keys are present in history
+  merge_context.Clear();
+  found = list.current()->GetFromHistory(
+      LookupKey("key1", seq), &value, /*columns=*/nullptr,
+      /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+      ReadOptions());
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = list.current()->GetFromHistory(
+      LookupKey("key2", seq), &value, /*columns=*/nullptr,
+      /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+      ReadOptions());
+  ASSERT_TRUE(found);
+  ASSERT_EQ("value2.2", value);
+
+  // Create another memtable and write some keys to it
+  WriteBufferManager wb2(options.db_write_buffer_size);
+  MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+                                kMaxSequenceNumber, 0 /* column_family_id */);
+  mem2->Ref();
+
+  ASSERT_OK(
+      mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+  ASSERT_OK(mem2->Add(++seq, kTypeValue, "key3", "value3",
+                      nullptr /* kv_prot_info */));
+
+  // Add second memtable to list
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem2->ConstructFragmentedRangeTombstones();
+  list.Add(mem2, &to_delete);
+  ASSERT_EQ(0, to_delete.size());
+
+  to_flush.clear();
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(1, to_flush.size());
+
+  // Flush second memtable
+  s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+                                       &to_delete);
+  ASSERT_OK(s);
+  ASSERT_EQ(0, list.NumNotFlushed());
+  ASSERT_EQ(2, list.NumFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Add a third memtable to push the first memtable out of the history
+  WriteBufferManager wb3(options.db_write_buffer_size);
+  MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb3,
+                                kMaxSequenceNumber, 0 /* column_family_id */);
+  mem3->Ref();
+  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+  // in MemTableListVersion::GetFromList work.
+  mem3->ConstructFragmentedRangeTombstones();
+  list.Add(mem3, &to_delete);
+  ASSERT_EQ(1, list.NumNotFlushed());
+  ASSERT_EQ(1, list.NumFlushed());
+  ASSERT_EQ(1, to_delete.size());
+
+  // Verify keys are no longer in MemTableList
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_FALSE(found);
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_FALSE(found);
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_FALSE(found);
+
+  // Verify that the second memtable's keys are in the history
+  merge_context.Clear();
+  found = list.current()->GetFromHistory(
+      LookupKey("key1", seq), &value, /*columns=*/nullptr,
+      /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+      ReadOptions());
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = list.current()->GetFromHistory(
+      LookupKey("key3", seq), &value, /*columns=*/nullptr,
+      /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+      ReadOptions());
+  ASSERT_TRUE(found);
+  ASSERT_EQ("value3", value);
+
+  // Verify that key2 from the first memtable is no longer in the history
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+                          /*timestamp=*/nullptr, &s, &merge_context,
+                          &max_covering_tombstone_seq, ReadOptions());
+  ASSERT_FALSE(found);
+
+  // Cleanup
+  list.current()->Unref(&to_delete);
+  ASSERT_EQ(3, to_delete.size());
+  for (MemTable* m : to_delete) {
+    delete m;
+  }
+}
+
+TEST_F(MemTableListTest, FlushPendingTest) {
+  const int num_tables = 6;
+  SequenceNumber seq = 1;
+  Status s;
+
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableOptions ioptions(options);
+  InternalKeyComparator cmp(BytewiseComparator());
+  WriteBufferManager wb(options.db_write_buffer_size);
+  autovector<MemTable*> to_delete;
+
+  // Create MemTableList
+  int min_write_buffer_number_to_merge = 3;
+  int max_write_buffer_number_to_maintain = 7;
+  int64_t max_write_buffer_size_to_maintain =
+      7 * static_cast<int>(options.write_buffer_size);
+  MemTableList list(min_write_buffer_number_to_merge,
+                    max_write_buffer_number_to_maintain,
+                    max_write_buffer_size_to_maintain);
+
+  // Create some MemTables
+  uint64_t memtable_id = 0;
+  std::vector<MemTable*> tables;
+  MutableCFOptions mutable_cf_options(options);
+  for (int i = 0; i < num_tables; i++) {
+    MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb,
+                                 kMaxSequenceNumber, 0 /* column_family_id */);
+    mem->SetID(memtable_id++);
+    mem->Ref();
+
+    std::string value;
+    MergeContext merge_context;
+
+    ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i),
+                       nullptr /* kv_prot_info */));
+    ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i), "valueN",
+                       nullptr /* kv_prot_info */));
+    ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value",
+                       nullptr /* kv_prot_info */));
+    ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i), "valueM",
+                       nullptr /* kv_prot_info */));
+    ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "",
+                       nullptr /* kv_prot_info */));
+
+    tables.push_back(mem);
+  }
+
+  // Nothing to flush
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  autovector<MemTable*> to_flush;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(0, to_flush.size());
+
+  // Request a flush even though there is nothing to flush
+  list.FlushRequested();
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Attempt to 'flush' to clear request for flush
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(0, to_flush.size());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Request a flush again
+  list.FlushRequested();
+  // No flush pending since the list is empty.
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Add 2 tables
+  list.Add(tables[0], &to_delete);
+  list.Add(tables[1], &to_delete);
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Even though we have less than the minimum to flush, a flush is
+  // pending since we had previously requested a flush and never called
+  // PickMemtablesToFlush() to clear the flush.
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(2, to_flush.size());
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Revert flush
+  list.RollbackMemtableFlush(to_flush, 0);
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  to_flush.clear();
+
+  // Add another table
+  list.Add(tables[2], &to_delete);
+  // We now have the minimum to flush regardles of whether FlushRequested()
+  // was called.
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  ASSERT_EQ(3, to_flush.size());
+  ASSERT_EQ(3, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  autovector<MemTable*> to_flush2;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
+  ASSERT_EQ(0, to_flush2.size());
+  ASSERT_EQ(3, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Add another table
+  list.Add(tables[3], &to_delete);
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
+
+  // Request a flush again
+  list.FlushRequested();
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
+  ASSERT_EQ(1, to_flush2.size());
+  ASSERT_EQ(4, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Rollback first pick of tables
+  list.RollbackMemtableFlush(to_flush, 0);
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  to_flush.clear();
+
+  // Add another tables
+  list.Add(tables[4], &to_delete);
+  ASSERT_EQ(5, list.NumNotFlushed());
+  // We now have the minimum to flush regardles of whether FlushRequested()
+  ASSERT_TRUE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
+
+  // Pick tables to flush
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+  // Picks three oldest memtables. The fourth oldest is picked in `to_flush2` so
+  // must be excluded. The newest (fifth oldest) is non-consecutive with the
+  // three oldest due to omitting the fourth oldest so must not be picked.
+  ASSERT_EQ(3, to_flush.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Pick tables to flush again
+  autovector<MemTable*> to_flush3;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3);
+  // Picks newest (fifth oldest)
+  ASSERT_EQ(1, to_flush3.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Nothing left to flush
+  autovector<MemTable*> to_flush4;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4);
+  ASSERT_EQ(0, to_flush4.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Flush the 3 memtables that were picked in to_flush
+  s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+                                       &to_delete);
+  ASSERT_OK(s);
+
+  // Note:  now to_flush contains tables[0,1,2].  to_flush2 contains
+  // tables[3]. to_flush3 contains tables[4].
+  // Current implementation will only commit memtables in the order they were
+  // created. So TryInstallMemtableFlushResults will install the first 3 tables
+  // in to_flush and stop when it encounters a table not yet flushed.
+  ASSERT_EQ(2, list.NumNotFlushed());
+  int num_in_history =
+      std::min(3, static_cast<int>(max_write_buffer_size_to_maintain) /
+                      static_cast<int>(options.write_buffer_size));
+  ASSERT_EQ(num_in_history, list.NumFlushed());
+  ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+  // Request a flush again. Should be nothing to flush
+  list.FlushRequested();
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Flush the 1 memtable (tables[4]) that was picked in to_flush3
+  s = MemTableListTest::Mock_InstallMemtableFlushResults(
+      &list, mutable_cf_options, to_flush3, &to_delete);
+  ASSERT_OK(s);
+
+  // This will install 0 tables since tables[4] flushed while tables[3] has not
+  // yet flushed.
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Flush the 1 memtable (tables[3]) that was picked in to_flush2
+  s = MemTableListTest::Mock_InstallMemtableFlushResults(
+      &list, mutable_cf_options, to_flush2, &to_delete);
+  ASSERT_OK(s);
+
+  // This will actually install 2 tables.  The 1 we told it to flush, and also
+  // tables[4] which has been waiting for tables[3] to commit.
+  ASSERT_EQ(0, list.NumNotFlushed());
+  num_in_history =
+      std::min(5, static_cast<int>(max_write_buffer_size_to_maintain) /
+                      static_cast<int>(options.write_buffer_size));
+  ASSERT_EQ(num_in_history, list.NumFlushed());
+  ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+    // Verify this, by Ref'ing then UnRef'ing:
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+  to_delete.clear();
+
+  // Add another table
+  list.Add(tables[5], &to_delete);
+  ASSERT_EQ(1, list.NumNotFlushed());
+  ASSERT_EQ(5, list.GetLatestMemTableID());
+  memtable_id = 4;
+  // Pick tables to flush. The tables to pick must have ID smaller than or
+  // equal to 4. Therefore, no table will be selected in this case.
+  autovector<MemTable*> to_flush5;
+  list.FlushRequested();
+  ASSERT_TRUE(list.HasFlushRequested());
+  list.PickMemtablesToFlush(memtable_id, &to_flush5);
+  ASSERT_TRUE(to_flush5.empty());
+  ASSERT_EQ(1, list.NumNotFlushed());
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.HasFlushRequested());
+
+  // Pick tables to flush. The tables to pick must have ID smaller than or
+  // equal to 5. Therefore, only tables[5] will be selected.
+  memtable_id = 5;
+  list.FlushRequested();
+  list.PickMemtablesToFlush(memtable_id, &to_flush5);
+  ASSERT_EQ(1, static_cast<int>(to_flush5.size()));
+  ASSERT_EQ(1, list.NumNotFlushed());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_FALSE(list.IsFlushPending());
+  to_delete.clear();
+
+  list.current()->Unref(&to_delete);
+  int to_delete_size =
+      std::min(num_tables, static_cast<int>(max_write_buffer_size_to_maintain) /
+                               static_cast<int>(options.write_buffer_size));
+  ASSERT_EQ(to_delete_size, to_delete.size());
+
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+    // Verify this, by Ref'ing then UnRef'ing:
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+  to_delete.clear();
+}
+
+TEST_F(MemTableListTest, EmptyAtomicFlusTest) {
+  autovector<MemTableList*> lists;
+  autovector<uint32_t> cf_ids;
+  autovector<const MutableCFOptions*> options_list;
+  autovector<const autovector<MemTable*>*> to_flush;
+  autovector<MemTable*> to_delete;
+  Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list,
+                                                    to_flush, &to_delete);
+  ASSERT_OK(s);
+  ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(MemTableListTest, AtomicFlusTest) {
+  const int num_cfs = 3;
+  const int num_tables_per_cf = 2;
+  SequenceNumber seq = 1;
+
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableOptions ioptions(options);
+  InternalKeyComparator cmp(BytewiseComparator());
+  WriteBufferManager wb(options.db_write_buffer_size);
+
+  // Create MemTableLists
+  int min_write_buffer_number_to_merge = 3;
+  int max_write_buffer_number_to_maintain = 7;
+  int64_t max_write_buffer_size_to_maintain =
+      7 * static_cast<int64_t>(options.write_buffer_size);
+  autovector<MemTableList*> lists;
+  for (int i = 0; i != num_cfs; ++i) {
+    lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
+                                        max_write_buffer_number_to_maintain,
+                                        max_write_buffer_size_to_maintain));
+  }
+
+  autovector<uint32_t> cf_ids;
+  std::vector<std::vector<MemTable*>> tables(num_cfs);
+  autovector<const MutableCFOptions*> mutable_cf_options_list;
+  uint32_t cf_id = 0;
+  for (auto& elem : tables) {
+    mutable_cf_options_list.emplace_back(new MutableCFOptions(options));
+    uint64_t memtable_id = 0;
+    for (int i = 0; i != num_tables_per_cf; ++i) {
+      MemTable* mem =
+          new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb,
+                       kMaxSequenceNumber, cf_id);
+      mem->SetID(memtable_id++);
+      mem->Ref();
+
+      std::string value;
+
+      ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i),
+                         nullptr /* kv_prot_info */));
+      ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i),
+                         "valueN", nullptr /* kv_prot_info */));
+      ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value",
+                         nullptr /* kv_prot_info */));
+      ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i),
+                         "valueM", nullptr /* kv_prot_info */));
+      ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "",
+                         nullptr /* kv_prot_info */));
+
+      elem.push_back(mem);
+    }
+    cf_ids.push_back(cf_id++);
+  }
+
+  std::vector<autovector<MemTable*>> flush_candidates(num_cfs);
+
+  // Nothing to flush
+  for (auto i = 0; i != num_cfs; ++i) {
+    auto* list = lists[i];
+    ASSERT_FALSE(list->IsFlushPending());
+    ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+    list->PickMemtablesToFlush(
+        std::numeric_limits<uint64_t>::max() /* memtable_id */,
+        &flush_candidates[i]);
+    ASSERT_EQ(0, flush_candidates[i].size());
+  }
+  // Request flush even though there is nothing to flush
+  for (auto i = 0; i != num_cfs; ++i) {
+    auto* list = lists[i];
+    list->FlushRequested();
+    ASSERT_FALSE(list->IsFlushPending());
+    ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+  }
+  autovector<MemTable*> to_delete;
+  // Add tables to the immutable memtalbe lists associated with column families
+  for (auto i = 0; i != num_cfs; ++i) {
+    for (auto j = 0; j != num_tables_per_cf; ++j) {
+      lists[i]->Add(tables[i][j], &to_delete);
+    }
+    ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
+    ASSERT_TRUE(lists[i]->IsFlushPending());
+    ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
+  }
+  std::vector<uint64_t> flush_memtable_ids = {1, 1, 0};
+  //          +----+
+  // list[0]: |0  1|
+  // list[1]: |0  1|
+  //          | +--+
+  // list[2]: |0| 1
+  //          +-+
+  // Pick memtables to flush
+  for (auto i = 0; i != num_cfs; ++i) {
+    flush_candidates[i].clear();
+    lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], &flush_candidates[i]);
+    ASSERT_EQ(flush_memtable_ids[i] - 0 + 1,
+              static_cast<uint64_t>(flush_candidates[i].size()));
+  }
+  autovector<MemTableList*> tmp_lists;
+  autovector<uint32_t> tmp_cf_ids;
+  autovector<const MutableCFOptions*> tmp_options_list;
+  autovector<const autovector<MemTable*>*> to_flush;
+  for (auto i = 0; i != num_cfs; ++i) {
+    if (!flush_candidates[i].empty()) {
+      to_flush.push_back(&flush_candidates[i]);
+      tmp_lists.push_back(lists[i]);
+      tmp_cf_ids.push_back(i);
+      tmp_options_list.push_back(mutable_cf_options_list[i]);
+    }
+  }
+  Status s = Mock_InstallMemtableAtomicFlushResults(
+      tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete);
+  ASSERT_OK(s);
+
+  for (auto i = 0; i != num_cfs; ++i) {
+    for (auto j = 0; j != num_tables_per_cf; ++j) {
+      if (static_cast<uint64_t>(j) <= flush_memtable_ids[i]) {
+        ASSERT_LT(0, tables[i][j]->GetFileNumber());
+      }
+    }
+    ASSERT_EQ(
+        static_cast<size_t>(num_tables_per_cf) - flush_candidates[i].size(),
+        lists[i]->NumNotFlushed());
+  }
+
+  to_delete.clear();
+  for (auto list : lists) {
+    list->current()->Unref(&to_delete);
+    delete list;
+  }
+  for (auto& mutable_cf_options : mutable_cf_options_list) {
+    if (mutable_cf_options != nullptr) {
+      delete mutable_cf_options;
+      mutable_cf_options = nullptr;
+    }
+  }
+  // All memtables in tables array must have been flushed, thus ready to be
+  // deleted.
+  ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size());
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling InstallMemtableFlushResults.
+    // Verify this by Ref'ing and then Unref'ing.
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_context.h b/src/rocksdb/db/merge_context.h
new file mode 100644
index 000000000..8a7b07290
--- /dev/null
+++ b/src/rocksdb/db/merge_context.h
@@ -0,0 +1,147 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::vector<Slice> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), DB will create such a class and pass it when
+// issuing Get() operation to memtables and version_set. The operands
+// will be fetched from the context when issuing partial of full merge.
+class MergeContext {
+ public:
+  // Clear all the operands
+  void Clear() {
+    if (operand_list_) {
+      operand_list_->clear();
+      copied_operands_->clear();
+    }
+  }
+
+  // Push a merge operand
+  void PushOperand(const Slice& operand_slice, bool operand_pinned = false) {
+    Initialize();
+    SetDirectionBackward();
+
+    if (operand_pinned) {
+      operand_list_->push_back(operand_slice);
+    } else {
+      // We need to have our own copy of the operand since it's not pinned
+      copied_operands_->emplace_back(
+          new std::string(operand_slice.data(), operand_slice.size()));
+      operand_list_->push_back(*copied_operands_->back());
+    }
+  }
+
+  // Push back a merge operand
+  void PushOperandBack(const Slice& operand_slice,
+                       bool operand_pinned = false) {
+    Initialize();
+    SetDirectionForward();
+
+    if (operand_pinned) {
+      operand_list_->push_back(operand_slice);
+    } else {
+      // We need to have our own copy of the operand since it's not pinned
+      copied_operands_->emplace_back(
+          new std::string(operand_slice.data(), operand_slice.size()));
+      operand_list_->push_back(*copied_operands_->back());
+    }
+  }
+
+  // return total number of operands in the list
+  size_t GetNumOperands() const {
+    if (!operand_list_) {
+      return 0;
+    }
+    return operand_list_->size();
+  }
+
+  // Get the operand at the index.
+  Slice GetOperand(int index) const {
+    assert(operand_list_);
+
+    SetDirectionForward();
+    return (*operand_list_)[index];
+  }
+
+  // Same as GetOperandsDirectionForward
+  //
+  // Note that the returned reference is only good until another call
+  // to this MergeContext.  If the returned value is needed for longer,
+  // a copy must be made.
+  const std::vector<Slice>& GetOperands() const {
+    return GetOperandsDirectionForward();
+  }
+
+  // Return all the operands in the order as they were merged (passed to
+  // FullMerge or FullMergeV2)
+  //
+  // Note that the returned reference is only good until another call
+  // to this MergeContext.  If the returned value is needed for longer,
+  // a copy must be made.
+  const std::vector<Slice>& GetOperandsDirectionForward() const {
+    if (!operand_list_) {
+      return empty_operand_list;
+    }
+
+    SetDirectionForward();
+    return *operand_list_;
+  }
+
+  // Return all the operands in the reversed order relative to how they were
+  // merged (passed to FullMerge or FullMergeV2)
+  //
+  // Note that the returned reference is only good until another call
+  // to this MergeContext.  If the returned value is needed for longer,
+  // a copy must be made.
+  const std::vector<Slice>& GetOperandsDirectionBackward() const {
+    if (!operand_list_) {
+      return empty_operand_list;
+    }
+
+    SetDirectionBackward();
+    return *operand_list_;
+  }
+
+ private:
+  void Initialize() {
+    if (!operand_list_) {
+      operand_list_.reset(new std::vector<Slice>());
+      copied_operands_.reset(new std::vector<std::unique_ptr<std::string>>());
+    }
+  }
+
+  void SetDirectionForward() const {
+    if (operands_reversed_ == true) {
+      std::reverse(operand_list_->begin(), operand_list_->end());
+      operands_reversed_ = false;
+    }
+  }
+
+  void SetDirectionBackward() const {
+    if (operands_reversed_ == false) {
+      std::reverse(operand_list_->begin(), operand_list_->end());
+      operands_reversed_ = true;
+    }
+  }
+
+  // List of operands
+  mutable std::unique_ptr<std::vector<Slice>> operand_list_;
+  // Copy of operands that are not pinned.
+  std::unique_ptr<std::vector<std::unique_ptr<std::string>>> copied_operands_;
+  mutable bool operands_reversed_ = true;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper.cc b/src/rocksdb/db/merge_helper.cc
new file mode 100644
index 000000000..6df841012
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.cc
@@ -0,0 +1,583 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/merge_helper.h"
+
+#include <string>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/wide/wide_column_serialization.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/likely.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator,
+                         const MergeOperator* user_merge_operator,
+                         const CompactionFilter* compaction_filter,
+                         Logger* logger, bool assert_valid_internal_key,
+                         SequenceNumber latest_snapshot,
+                         const SnapshotChecker* snapshot_checker, int level,
+                         Statistics* stats,
+                         const std::atomic<bool>* shutting_down)
+    : env_(env),
+      clock_(env->GetSystemClock().get()),
+      user_comparator_(user_comparator),
+      user_merge_operator_(user_merge_operator),
+      compaction_filter_(compaction_filter),
+      shutting_down_(shutting_down),
+      logger_(logger),
+      assert_valid_internal_key_(assert_valid_internal_key),
+      allow_single_operand_(false),
+      latest_snapshot_(latest_snapshot),
+      snapshot_checker_(snapshot_checker),
+      level_(level),
+      keys_(),
+      filter_timer_(clock_),
+      total_filter_time_(0U),
+      stats_(stats) {
+  assert(user_comparator_ != nullptr);
+  if (user_merge_operator_) {
+    allow_single_operand_ = user_merge_operator_->AllowSingleOperand();
+  }
+}
+
+Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator,
+                                   const Slice& key, const Slice* value,
+                                   const std::vector<Slice>& operands,
+                                   std::string* result, Logger* logger,
+                                   Statistics* statistics, SystemClock* clock,
+                                   Slice* result_operand,
+                                   bool update_num_ops_stats) {
+  assert(merge_operator != nullptr);
+
+  if (operands.empty()) {
+    assert(value != nullptr && result != nullptr);
+    result->assign(value->data(), value->size());
+    return Status::OK();
+  }
+
+  if (update_num_ops_stats) {
+    RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS,
+                      static_cast<uint64_t>(operands.size()));
+  }
+
+  bool success = false;
+  Slice tmp_result_operand(nullptr, 0);
+  const MergeOperator::MergeOperationInput merge_in(key, value, operands,
+                                                    logger);
+  MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand);
+  {
+    // Setup to time the merge
+    StopWatchNano timer(clock, statistics != nullptr);
+    PERF_TIMER_GUARD(merge_operator_time_nanos);
+
+    // Do the merge
+    success = merge_operator->FullMergeV2(merge_in, &merge_out);
+
+    if (tmp_result_operand.data()) {
+      // FullMergeV2 result is an existing operand
+      if (result_operand != nullptr) {
+        *result_operand = tmp_result_operand;
+      } else {
+        result->assign(tmp_result_operand.data(), tmp_result_operand.size());
+      }
+    } else if (result_operand) {
+      *result_operand = Slice(nullptr, 0);
+    }
+
+    RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
+               statistics ? timer.ElapsedNanos() : 0);
+  }
+
+  if (!success) {
+    RecordTick(statistics, NUMBER_MERGE_FAILURES);
+    return Status::Corruption("Error: Could not perform merge.");
+  }
+
+  return Status::OK();
+}
+
+Status MergeHelper::TimedFullMergeWithEntity(
+    const MergeOperator* merge_operator, const Slice& key, Slice base_entity,
+    const std::vector<Slice>& operands, std::string* result, Logger* logger,
+    Statistics* statistics, SystemClock* clock, bool update_num_ops_stats) {
+  WideColumns base_columns;
+
+  {
+    const Status s =
+        WideColumnSerialization::Deserialize(base_entity, base_columns);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  const bool has_default_column =
+      !base_columns.empty() && base_columns[0].name() == kDefaultWideColumnName;
+
+  Slice value_of_default;
+  if (has_default_column) {
+    value_of_default = base_columns[0].value();
+  }
+
+  std::string merge_result;
+
+  {
+    constexpr Slice* result_operand = nullptr;
+
+    const Status s = TimedFullMerge(
+        merge_operator, key, &value_of_default, operands, &merge_result, logger,
+        statistics, clock, result_operand, update_num_ops_stats);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (has_default_column) {
+    base_columns[0].value() = merge_result;
+
+    const Status s = WideColumnSerialization::Serialize(base_columns, *result);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    const Status s =
+        WideColumnSerialization::Serialize(merge_result, base_columns, *result);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+// PRE:  iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+//       keys_, operands_ are updated to reflect the merge result.
+//       keys_ stores the list of keys encountered while merging.
+//       operands_ stores the list of merge operands encountered while merging.
+//       keys_[i] corresponds to operands_[i] for each i.
+//
+// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator
+// and just pass the StripeRep corresponding to the stripe being merged.
+Status MergeHelper::MergeUntil(InternalIterator* iter,
+                               CompactionRangeDelAggregator* range_del_agg,
+                               const SequenceNumber stop_before,
+                               const bool at_bottom,
+                               const bool allow_data_in_errors,
+                               const BlobFetcher* blob_fetcher,
+                               const std::string* const full_history_ts_low,
+                               PrefetchBufferCollection* prefetch_buffers,
+                               CompactionIterationStats* c_iter_stats) {
+  // Get a copy of the internal key, before it's invalidated by iter->Next()
+  // Also maintain the list of merge operands seen.
+  assert(HasOperator());
+  keys_.clear();
+  merge_context_.Clear();
+  has_compaction_filter_skip_until_ = false;
+  assert(user_merge_operator_);
+  assert(user_comparator_);
+  const size_t ts_sz = user_comparator_->timestamp_size();
+  if (full_history_ts_low) {
+    assert(ts_sz > 0);
+    assert(ts_sz == full_history_ts_low->size());
+  }
+  bool first_key = true;
+
+  // We need to parse the internal key again as the parsed key is
+  // backed by the internal key!
+  // Assume no internal key corruption as it has been successfully parsed
+  // by the caller.
+  // original_key_is_iter variable is just caching the information:
+  // original_key_is_iter == (iter->key().ToString() == original_key)
+  bool original_key_is_iter = true;
+  std::string original_key = iter->key().ToString();
+  // Important:
+  // orig_ikey is backed by original_key if keys_.empty()
+  // orig_ikey is backed by keys_.back() if !keys_.empty()
+  ParsedInternalKey orig_ikey;
+
+  Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors);
+  assert(s.ok());
+  if (!s.ok()) return s;
+
+  assert(kTypeMerge == orig_ikey.type);
+
+  bool hit_the_next_user_key = false;
+  int cmp_with_full_history_ts_low = 0;
+  for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
+    if (IsShuttingDown()) {
+      s = Status::ShutdownInProgress();
+      return s;
+    }
+
+    ParsedInternalKey ikey;
+    assert(keys_.size() == merge_context_.GetNumOperands());
+
+    Status pik_status =
+        ParseInternalKey(iter->key(), &ikey, allow_data_in_errors);
+    Slice ts;
+    if (pik_status.ok()) {
+      ts = ExtractTimestampFromUserKey(ikey.user_key, ts_sz);
+      if (full_history_ts_low) {
+        cmp_with_full_history_ts_low =
+            user_comparator_->CompareTimestamp(ts, *full_history_ts_low);
+      }
+    }
+    if (!pik_status.ok()) {
+      // stop at corrupted key
+      if (assert_valid_internal_key_) {
+        return pik_status;
+      }
+      break;
+    } else if (first_key) {
+      // If user-defined timestamp is enabled, we expect both user key and
+      // timestamps are equal, as a sanity check.
+      assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
+      first_key = false;
+    } else if (!user_comparator_->EqualWithoutTimestamp(ikey.user_key,
+                                                        orig_ikey.user_key) ||
+               (ts_sz > 0 &&
+                !user_comparator_->Equal(ikey.user_key, orig_ikey.user_key) &&
+                cmp_with_full_history_ts_low >= 0)) {
+      // 1) hit a different user key, or
+      // 2) user-defined timestamp is enabled, and hit a version of user key NOT
+      // eligible for GC, then stop right here.
+      hit_the_next_user_key = true;
+      break;
+    } else if (stop_before > 0 && ikey.sequence <= stop_before &&
+               LIKELY(snapshot_checker_ == nullptr ||
+                      snapshot_checker_->CheckInSnapshot(ikey.sequence,
+                                                         stop_before) !=
+                          SnapshotCheckerResult::kNotInSnapshot)) {
+      // hit an entry that's possibly visible by the previous snapshot, can't
+      // touch that
+      break;
+    }
+
+    // At this point we are guaranteed that we need to process this key.
+
+    assert(IsValueType(ikey.type));
+    if (ikey.type != kTypeMerge) {
+      // hit a put/delete/single delete
+      //   => merge the put value or a nullptr with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Success!
+
+      // If there are no operands, just return the Status::OK(). That will cause
+      // the compaction iterator to write out the key we're currently at, which
+      // is the put/delete we just encountered.
+      if (keys_.empty()) {
+        return s;
+      }
+
+      // TODO(noetzli) If the merge operator returns false, we are currently
+      // (almost) silently dropping the put/delete. That's probably not what we
+      // want. Also if we're in compaction and it's a put, it would be nice to
+      // run compaction filter on it.
+      std::string merge_result;
+
+      if (range_del_agg &&
+          range_del_agg->ShouldDelete(
+              ikey, RangeDelPositioningMode::kForwardTraversal)) {
+        s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr,
+                           merge_context_.GetOperands(), &merge_result, logger_,
+                           stats_, clock_,
+                           /* result_operand */ nullptr,
+                           /* update_num_ops_stats */ false);
+      } else if (ikey.type == kTypeValue) {
+        const Slice val = iter->value();
+
+        s = TimedFullMerge(user_merge_operator_, ikey.user_key, &val,
+                           merge_context_.GetOperands(), &merge_result, logger_,
+                           stats_, clock_,
+                           /* result_operand */ nullptr,
+                           /* update_num_ops_stats */ false);
+      } else if (ikey.type == kTypeBlobIndex) {
+        BlobIndex blob_index;
+
+        s = blob_index.DecodeFrom(iter->value());
+        if (!s.ok()) {
+          return s;
+        }
+
+        FilePrefetchBuffer* prefetch_buffer =
+            prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer(
+                                   blob_index.file_number())
+                             : nullptr;
+
+        uint64_t bytes_read = 0;
+
+        assert(blob_fetcher);
+
+        PinnableSlice blob_value;
+        s = blob_fetcher->FetchBlob(ikey.user_key, blob_index, prefetch_buffer,
+                                    &blob_value, &bytes_read);
+        if (!s.ok()) {
+          return s;
+        }
+
+        if (c_iter_stats) {
+          ++c_iter_stats->num_blobs_read;
+          c_iter_stats->total_blob_bytes_read += bytes_read;
+        }
+
+        s = TimedFullMerge(user_merge_operator_, ikey.user_key, &blob_value,
+                           merge_context_.GetOperands(), &merge_result, logger_,
+                           stats_, clock_,
+                           /* result_operand */ nullptr,
+                           /* update_num_ops_stats */ false);
+      } else if (ikey.type == kTypeWideColumnEntity) {
+        s = TimedFullMergeWithEntity(
+            user_merge_operator_, ikey.user_key, iter->value(),
+            merge_context_.GetOperands(), &merge_result, logger_, stats_,
+            clock_, /* update_num_ops_stats */ false);
+      } else {
+        s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr,
+                           merge_context_.GetOperands(), &merge_result, logger_,
+                           stats_, clock_,
+                           /* result_operand */ nullptr,
+                           /* update_num_ops_stats */ false);
+      }
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (s.ok()) {
+        // The original key encountered
+        original_key = std::move(keys_.back());
+        orig_ikey.type = ikey.type == kTypeWideColumnEntity
+                             ? kTypeWideColumnEntity
+                             : kTypeValue;
+        UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+        keys_.clear();
+        merge_context_.Clear();
+        keys_.emplace_front(std::move(original_key));
+        merge_context_.PushOperand(merge_result);
+      }
+
+      // move iter to the next entry
+      iter->Next();
+      return s;
+    } else {
+      // hit a merge
+      //   => if there is a compaction filter, apply it.
+      //   => check for range tombstones covering the operand
+      //   => merge the operand into the front of the operands_ list
+      //      if not filtered
+      //   => then continue because we haven't yet seen a Put/Delete.
+      //
+      // Keep queuing keys and operands until we either meet a put / delete
+      // request or later did a partial merge.
+
+      Slice value_slice = iter->value();
+      // add an operand to the list if:
+      // 1) it's included in one of the snapshots. in that case we *must* write
+      // it out, no matter what compaction filter says
+      // 2) it's not filtered by a compaction filter
+      CompactionFilter::Decision filter =
+          ikey.sequence <= latest_snapshot_
+              ? CompactionFilter::Decision::kKeep
+              : FilterMerge(orig_ikey.user_key, value_slice);
+      if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil &&
+          range_del_agg != nullptr &&
+          range_del_agg->ShouldDelete(
+              iter->key(), RangeDelPositioningMode::kForwardTraversal)) {
+        filter = CompactionFilter::Decision::kRemove;
+      }
+      if (filter == CompactionFilter::Decision::kKeep ||
+          filter == CompactionFilter::Decision::kChangeValue) {
+        if (original_key_is_iter) {
+          // this is just an optimization that saves us one memcpy
+          keys_.emplace_front(original_key);
+        } else {
+          keys_.emplace_front(iter->key().ToString());
+        }
+        if (keys_.size() == 1) {
+          // we need to re-anchor the orig_ikey because it was anchored by
+          // original_key before
+          pik_status =
+              ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors);
+          pik_status.PermitUncheckedError();
+          assert(pik_status.ok());
+        }
+        if (filter == CompactionFilter::Decision::kKeep) {
+          merge_context_.PushOperand(
+              value_slice, iter->IsValuePinned() /* operand_pinned */);
+        } else {
+          assert(filter == CompactionFilter::Decision::kChangeValue);
+          // Compaction filter asked us to change the operand from value_slice
+          // to compaction_filter_value_.
+          merge_context_.PushOperand(compaction_filter_value_, false);
+        }
+      } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+        // Compaction filter asked us to remove this key altogether
+        // (not just this operand), along with some keys following it.
+        keys_.clear();
+        merge_context_.Clear();
+        has_compaction_filter_skip_until_ = true;
+        return s;
+      }
+    }
+  }
+
+  if (cmp_with_full_history_ts_low >= 0) {
+    size_t num_merge_operands = merge_context_.GetNumOperands();
+    if (ts_sz && num_merge_operands > 1) {
+      // We do not merge merge operands with different timestamps if they are
+      // not eligible for GC.
+      ROCKS_LOG_ERROR(logger_, "ts_sz=%d, %d merge oprands",
+                      static_cast<int>(ts_sz),
+                      static_cast<int>(num_merge_operands));
+      assert(false);
+    }
+  }
+
+  if (merge_context_.GetNumOperands() == 0) {
+    // we filtered out all the merge operands
+    return s;
+  }
+
+  // We are sure we have seen this key's entire history if:
+  // at_bottom == true (this does not necessarily mean it is the bottommost
+  // layer, but rather that we are confident the key does not appear on any of
+  // the lower layers, at_bottom == false doesn't mean it does appear, just
+  // that we can't be sure, see Compaction::IsBottommostLevel for details)
+  // AND
+  // we have either encountered another key or end of key history on this
+  // layer.
+  // Note that if user-defined timestamp is enabled, we need some extra caution
+  // here: if full_history_ts_low is nullptr, or it's not null but the key's
+  // timestamp is greater than or equal to full_history_ts_low, it means this
+  // key cannot be dropped. We may not have seen the beginning of the key.
+  //
+  // When these conditions are true we are able to merge all the keys
+  // using full merge.
+  //
+  // For these cases we are not sure about, we simply miss the opportunity
+  // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+  // sure that all merge-operands on the same level get compacted together,
+  // this will simply lead to these merge operands moving to the next level.
+  bool surely_seen_the_beginning =
+      (hit_the_next_user_key || !iter->Valid()) && at_bottom &&
+      (ts_sz == 0 || cmp_with_full_history_ts_low < 0);
+  if (surely_seen_the_beginning) {
+    // do a final merge with nullptr as the existing value and say
+    // bye to the merge type (it's now converted to a Put)
+    assert(kTypeMerge == orig_ikey.type);
+    assert(merge_context_.GetNumOperands() >= 1);
+    assert(merge_context_.GetNumOperands() == keys_.size());
+    std::string merge_result;
+    s = TimedFullMerge(
+        user_merge_operator_, orig_ikey.user_key, nullptr,
+        merge_context_.GetOperands(), &merge_result, logger_, stats_, clock_,
+        /* result_operand */ nullptr, /* update_num_ops_stats */ false);
+    if (s.ok()) {
+      // The original key encountered
+      // We are certain that keys_ is not empty here (see assertions couple of
+      // lines before).
+      original_key = std::move(keys_.back());
+      orig_ikey.type = kTypeValue;
+      UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+      keys_.clear();
+      merge_context_.Clear();
+      keys_.emplace_front(std::move(original_key));
+      merge_context_.PushOperand(merge_result);
+    }
+  } else {
+    // We haven't seen the beginning of the key nor a Put/Delete.
+    // Attempt to use the user's associative merge function to
+    // merge the stacked merge operands into a single operand.
+    s = Status::MergeInProgress();
+    if (merge_context_.GetNumOperands() >= 2 ||
+        (allow_single_operand_ && merge_context_.GetNumOperands() == 1)) {
+      bool merge_success = false;
+      std::string merge_result;
+      {
+        StopWatchNano timer(clock_, stats_ != nullptr);
+        PERF_TIMER_GUARD(merge_operator_time_nanos);
+        merge_success = user_merge_operator_->PartialMergeMulti(
+            orig_ikey.user_key,
+            std::deque<Slice>(merge_context_.GetOperands().begin(),
+                              merge_context_.GetOperands().end()),
+            &merge_result, logger_);
+        RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
+                   stats_ ? timer.ElapsedNanosSafe() : 0);
+      }
+      if (merge_success) {
+        // Merging of operands (associative merge) was successful.
+        // Replace operands with the merge result
+        merge_context_.Clear();
+        merge_context_.PushOperand(merge_result);
+        keys_.erase(keys_.begin(), keys_.end() - 1);
+      }
+    }
+  }
+
+  return s;
+}
+
+MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)
+    : merge_helper_(merge_helper) {
+  it_keys_ = merge_helper_->keys().rend();
+  it_values_ = merge_helper_->values().rend();
+}
+
+void MergeOutputIterator::SeekToFirst() {
+  const auto& keys = merge_helper_->keys();
+  const auto& values = merge_helper_->values();
+  assert(keys.size() == values.size());
+  it_keys_ = keys.rbegin();
+  it_values_ = values.rbegin();
+}
+
+void MergeOutputIterator::Next() {
+  ++it_keys_;
+  ++it_values_;
+}
+
+CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key,
+                                                    const Slice& value_slice) {
+  if (compaction_filter_ == nullptr) {
+    return CompactionFilter::Decision::kKeep;
+  }
+  if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+    filter_timer_.Start();
+  }
+  compaction_filter_value_.clear();
+  compaction_filter_skip_until_.Clear();
+  auto ret = compaction_filter_->FilterV2(
+      level_, user_key, CompactionFilter::ValueType::kMergeOperand, value_slice,
+      &compaction_filter_value_, compaction_filter_skip_until_.rep());
+  if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+    if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(),
+                                  user_key) <= 0) {
+      // Invalid skip_until returned from compaction filter.
+      // Keep the key as per FilterV2 documentation.
+      ret = CompactionFilter::Decision::kKeep;
+    } else {
+      compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+                                                       kValueTypeForSeek);
+    }
+  }
+  if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+    total_filter_time_ += filter_timer_.ElapsedNanosSafe();
+  }
+  return ret;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper.h b/src/rocksdb/db/merge_helper.h
new file mode 100644
index 000000000..790ec6239
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.h
@@ -0,0 +1,216 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/wide_columns.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+class SystemClock;
+class BlobFetcher;
+class PrefetchBufferCollection;
+struct CompactionIterationStats;
+
+class MergeHelper {
+ public:
+  MergeHelper(Env* env, const Comparator* user_comparator,
+              const MergeOperator* user_merge_operator,
+              const CompactionFilter* compaction_filter, Logger* logger,
+              bool assert_valid_internal_key, SequenceNumber latest_snapshot,
+              const SnapshotChecker* snapshot_checker = nullptr, int level = 0,
+              Statistics* stats = nullptr,
+              const std::atomic<bool>* shutting_down = nullptr);
+
+  // Wrapper around MergeOperator::FullMergeV2() that records perf statistics.
+  // Result of merge will be written to result if status returned is OK.
+  // If operands is empty, the value will simply be copied to result.
+  // Set `update_num_ops_stats` to true if it is from a user read, so that
+  // the latency is sensitive.
+  // Returns one of the following statuses:
+  // - OK: Entries were successfully merged.
+  // - Corruption: Merge operator reported unsuccessful merge.
+  static Status TimedFullMerge(const MergeOperator* merge_operator,
+                               const Slice& key, const Slice* value,
+                               const std::vector<Slice>& operands,
+                               std::string* result, Logger* logger,
+                               Statistics* statistics, SystemClock* clock,
+                               Slice* result_operand,
+                               bool update_num_ops_stats);
+
+  static Status TimedFullMergeWithEntity(
+      const MergeOperator* merge_operator, const Slice& key, Slice base_entity,
+      const std::vector<Slice>& operands, std::string* result, Logger* logger,
+      Statistics* statistics, SystemClock* clock, bool update_num_ops_stats);
+
+  // During compaction, merge entries until we hit
+  //     - a corrupted key
+  //     - a Put/Delete,
+  //     - a different user key,
+  //     - a specific sequence number (snapshot boundary),
+  //     - REMOVE_AND_SKIP_UNTIL returned from compaction filter,
+  //  or - the end of iteration
+  // iter: (IN)  points to the first merge type entry
+  //       (OUT) points to the first entry not included in the merge process
+  // range_del_agg: (IN) filters merge operands covered by range tombstones.
+  // stop_before: (IN) a sequence number that merge should not cross.
+  //                   0 means no restriction
+  // at_bottom:   (IN) true if the iterator covers the bottem level, which means
+  //                   we could reach the start of the history of this user key.
+  // allow_data_in_errors: (IN) if true, data details will be displayed in
+  //                   error/log messages.
+  // blob_fetcher: (IN) blob fetcher object for the compaction's input version.
+  // prefetch_buffers: (IN/OUT) a collection of blob file prefetch buffers
+  //                            used for compaction readahead.
+  // c_iter_stats: (OUT) compaction iteration statistics.
+  //
+  // Returns one of the following statuses:
+  // - OK: Entries were successfully merged.
+  // - MergeInProgress: Put/Delete not encountered, and didn't reach the start
+  //   of key's history. Output consists of merge operands only.
+  // - Corruption: Merge operator reported unsuccessful merge or a corrupted
+  //   key has been encountered and not expected (applies only when compiling
+  //   with asserts removed).
+  // - ShutdownInProgress: interrupted by shutdown (*shutting_down == true).
+  //
+  // REQUIRED: The first key in the input is not corrupted.
+  Status MergeUntil(InternalIterator* iter,
+                    CompactionRangeDelAggregator* range_del_agg,
+                    const SequenceNumber stop_before, const bool at_bottom,
+                    const bool allow_data_in_errors,
+                    const BlobFetcher* blob_fetcher,
+                    const std::string* const full_history_ts_low,
+                    PrefetchBufferCollection* prefetch_buffers,
+                    CompactionIterationStats* c_iter_stats);
+
+  // Filters a merge operand using the compaction filter specified
+  // in the constructor. Returns the decision that the filter made.
+  // Uses compaction_filter_value_ and compaction_filter_skip_until_ for the
+  // optional outputs of compaction filter.
+  // user_key includes timestamp if user-defined timestamp is enabled.
+  CompactionFilter::Decision FilterMerge(const Slice& user_key,
+                                         const Slice& value_slice);
+
+  // Query the merge result
+  // These are valid until the next MergeUntil call
+  // If the merging was successful:
+  //   - keys() contains a single element with the latest sequence number of
+  //     the merges. The type will be Put or Merge. See IMPORTANT 1 note, below.
+  //   - values() contains a single element with the result of merging all the
+  //     operands together
+  //
+  //   IMPORTANT 1: the key type could change after the MergeUntil call.
+  //        Put/Delete + Merge + ... + Merge => Put
+  //        Merge + ... + Merge => Merge
+  //
+  // If the merge operator is not associative, and if a Put/Delete is not found
+  // then the merging will be unsuccessful. In this case:
+  //   - keys() contains the list of internal keys seen in order of iteration.
+  //   - values() contains the list of values (merges) seen in the same order.
+  //              values() is parallel to keys() so that the first entry in
+  //              keys() is the key associated with the first entry in values()
+  //              and so on. These lists will be the same length.
+  //              All of these pairs will be merges over the same user key.
+  //              See IMPORTANT 2 note below.
+  //
+  //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+  //                So keys().back() was the first key seen by iterator.
+  // TODO: Re-style this comment to be like the first one
+  const std::deque<std::string>& keys() const { return keys_; }
+  const std::vector<Slice>& values() const {
+    return merge_context_.GetOperands();
+  }
+  uint64_t TotalFilterTime() const { return total_filter_time_; }
+  bool HasOperator() const { return user_merge_operator_ != nullptr; }
+
+  // If compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will
+  // return true and fill *until with the key to which we should skip.
+  // If true, keys() and values() are empty.
+  bool FilteredUntil(Slice* skip_until) const {
+    if (!has_compaction_filter_skip_until_) {
+      return false;
+    }
+    assert(compaction_filter_ != nullptr);
+    assert(skip_until != nullptr);
+    assert(compaction_filter_skip_until_.Valid());
+    *skip_until = compaction_filter_skip_until_.Encode();
+    return true;
+  }
+
+ private:
+  Env* env_;
+  SystemClock* clock_;
+  const Comparator* user_comparator_;
+  const MergeOperator* user_merge_operator_;
+  const CompactionFilter* compaction_filter_;
+  const std::atomic<bool>* shutting_down_;
+  Logger* logger_;
+  bool assert_valid_internal_key_;  // enforce no internal key corruption?
+  bool allow_single_operand_;
+  SequenceNumber latest_snapshot_;
+  const SnapshotChecker* const snapshot_checker_;
+  int level_;
+
+  // the scratch area that holds the result of MergeUntil
+  // valid up to the next MergeUntil call
+
+  // Keeps track of the sequence of keys seen
+  std::deque<std::string> keys_;
+  // Parallel with keys_; stores the operands
+  mutable MergeContext merge_context_;
+
+  StopWatchNano filter_timer_;
+  uint64_t total_filter_time_;
+  Statistics* stats_;
+
+  bool has_compaction_filter_skip_until_ = false;
+  std::string compaction_filter_value_;
+  InternalKey compaction_filter_skip_until_;
+
+  bool IsShuttingDown() {
+    // This is a best-effort facility, so memory_order_relaxed is sufficient.
+    return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+  }
+};
+
+// MergeOutputIterator can be used to iterate over the result of a merge.
+class MergeOutputIterator {
+ public:
+  // The MergeOutputIterator is bound to a MergeHelper instance.
+  explicit MergeOutputIterator(const MergeHelper* merge_helper);
+
+  // Seeks to the first record in the output.
+  void SeekToFirst();
+  // Advances to the next record in the output.
+  void Next();
+
+  Slice key() { return Slice(*it_keys_); }
+  Slice value() { return Slice(*it_values_); }
+  bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+
+ private:
+  const MergeHelper* merge_helper_;
+  std::deque<std::string>::const_reverse_iterator it_keys_;
+  std::vector<Slice>::const_reverse_iterator it_values_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper_test.cc b/src/rocksdb/db/merge_helper_test.cc
new file mode 100644
index 000000000..05408d5b9
--- /dev/null
+++ b/src/rocksdb/db/merge_helper_test.cc
@@ -0,0 +1,298 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/merge_helper.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/vector_iterator.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeHelperTest : public testing::Test {
+ public:
+  MergeHelperTest() : icmp_(BytewiseComparator()) { env_ = Env::Default(); }
+
+  ~MergeHelperTest() override = default;
+
+  Status Run(SequenceNumber stop_before, bool at_bottom,
+             SequenceNumber latest_snapshot = 0) {
+    iter_.reset(new VectorIterator(ks_, vs_, &icmp_));
+    iter_->SeekToFirst();
+    merge_helper_.reset(new MergeHelper(env_, icmp_.user_comparator(),
+                                        merge_op_.get(), filter_.get(), nullptr,
+                                        false, latest_snapshot));
+    return merge_helper_->MergeUntil(
+        iter_.get(), nullptr /* range_del_agg */, stop_before, at_bottom,
+        false /* allow_data_in_errors */, nullptr /* blob_fetcher */,
+        nullptr /* full_history_ts_low */, nullptr /* prefetch_buffers */,
+        nullptr /* c_iter_stats */);
+  }
+
+  void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
+                 const ValueType& t, const std::string& val,
+                 bool corrupt = false) {
+    InternalKey ikey(user_key, seq, t);
+    if (corrupt) {
+      test::CorruptKeyType(&ikey);
+    }
+    ks_.push_back(ikey.Encode().ToString());
+    vs_.push_back(val);
+  }
+
+  Env* env_;
+  InternalKeyComparator icmp_;
+  std::unique_ptr<VectorIterator> iter_;
+  std::shared_ptr<MergeOperator> merge_op_;
+  std::unique_ptr<MergeHelper> merge_helper_;
+  std::vector<std::string> ks_;
+  std::vector<std::string> vs_;
+  std::unique_ptr<test::FilterNumber> filter_;
+};
+
+// If MergeHelper encounters a new key on the last level, we know that
+// the key has no more history and it can merge keys.
+TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U));  // <- iter_ after merge
+
+  ASSERT_TRUE(Run(0, true).ok());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a value results in a successful merge.
+TEST_F(MergeHelperTest, MergeValue) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));  // <- iter_ after merge
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(0, false).ok());
+  ASSERT_EQ(ks_[3], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging stops before a snapshot.
+TEST_F(MergeHelperTest, SnapshotBeforeValue) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U));  // <- iter_ after merge
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// MergeHelper preserves the operand stack for merge operators that
+// cannot do a partial merge.
+TEST_F(MergeHelperTest, NoPartialMerge) {
+  merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+
+  AddKeyVal("a", 50, kTypeMerge, "v2");
+  AddKeyVal("a", 40, kTypeMerge, "v");  // <- iter_ after merge
+  AddKeyVal("a", 30, kTypeMerge, "v");
+
+  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ("v", merge_helper_->values()[0]);
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[1]);
+  ASSERT_EQ("v2", merge_helper_->values()[1]);
+  ASSERT_EQ(2U, merge_helper_->keys().size());
+  ASSERT_EQ(2U, merge_helper_->values().size());
+}
+
+// A single operand can not be merged.
+TEST_F(MergeHelperTest, SingleOperand) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(31, false).IsMergeInProgress());
+  ASSERT_FALSE(iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a deletion turns the deletion into a value
+TEST_F(MergeHelperTest, MergeDeletion) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 20, kTypeDeletion, "");
+
+  ASSERT_TRUE(Run(15, false).ok());
+  ASSERT_FALSE(iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The merge helper stops upon encountering a corrupt key
+TEST_F(MergeHelperTest, CorruptKey) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U));
+  // Corrupt key
+  AddKeyVal("a", 20, kTypeDeletion, "", true);  // <- iter_ after merge
+
+  ASSERT_TRUE(Run(15, false).IsMergeInProgress());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The compaction filter is called on every merge operand
+TEST_F(MergeHelperTest, FilterMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(15, false).ok());
+  ASSERT_FALSE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_EQ(test::KeyStr("a", 30, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+TEST_F(MergeHelperTest, FilterAllMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+
+  // filtered out all
+  ASSERT_TRUE(Run(15, false).ok());
+  ASSERT_FALSE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_FALSE(merge_output_iter.Valid());
+
+  // we have one operand that will survive because it's a delete
+  AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U));
+  AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U));
+  ASSERT_TRUE(Run(15, true).ok());
+  merge_output_iter = MergeOutputIterator(merge_helper_.get());
+  ASSERT_TRUE(iter_->Valid());
+  merge_output_iter.SeekToFirst();
+  ASSERT_FALSE(merge_output_iter.Valid());
+
+  // when all merge operands are filtered out, we leave the iterator pointing to
+  // the Put/Delete that survived
+  ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString());
+  ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString());
+}
+
+// Make sure that merge operands are filtered at the beginning
+TEST_F(MergeHelperTest, FilterFirstMergeOperand) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));  // next user key
+
+  ASSERT_OK(Run(15, true));
+  ASSERT_TRUE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  // sequence number is 29 here, because the first merge operand got filtered
+  // out
+  ASSERT_EQ(test::KeyStr("a", 29, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+
+  // make sure that we're passing user keys into the filter
+  ASSERT_EQ("a", filter_->last_merge_operand_key());
+}
+
+// Make sure that merge operands are not filtered out if there's a snapshot
+// pointing at them
+TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
+
+  ASSERT_OK(Run(15, true, 32));
+  ASSERT_TRUE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_operator.cc b/src/rocksdb/db/merge_operator.cc
new file mode 100644
index 000000000..d32585640
--- /dev/null
+++ b/src/rocksdb/db/merge_operator.cc
@@ -0,0 +1,85 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+                                MergeOperationOutput* merge_out) const {
+  // If FullMergeV2 is not implemented, we convert the operand_list to
+  // std::deque<std::string> and pass it to FullMerge
+  std::deque<std::string> operand_list_str;
+  for (auto& op : merge_in.operand_list) {
+    operand_list_str.emplace_back(op.data(), op.size());
+  }
+  return FullMerge(merge_in.key, merge_in.existing_value, operand_list_str,
+                   &merge_out->new_value, merge_in.logger);
+}
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time.
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+                                      const std::deque<Slice>& operand_list,
+                                      std::string* new_value,
+                                      Logger* logger) const {
+  assert(operand_list.size() >= 2);
+  // Simply loop through the operands
+  Slice temp_slice(operand_list[0]);
+
+  for (size_t i = 1; i < operand_list.size(); ++i) {
+    auto& operand = operand_list[i];
+    std::string temp_value;
+    if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+      return false;
+    }
+    swap(temp_value, *new_value);
+    temp_slice = Slice(*new_value);
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMergeV2(
+    const MergeOperationInput& merge_in,
+    MergeOperationOutput* merge_out) const {
+  // Simply loop through the operands
+  Slice temp_existing;
+  const Slice* existing_value = merge_in.existing_value;
+  for (const auto& operand : merge_in.operand_list) {
+    std::string temp_value;
+    if (!Merge(merge_in.key, existing_value, operand, &temp_value,
+               merge_in.logger)) {
+      return false;
+    }
+    swap(temp_value, merge_out->new_value);
+    temp_existing = Slice(merge_out->new_value);
+    existing_value = &temp_existing;
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Call the user defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(const Slice& key,
+                                            const Slice& left_operand,
+                                            const Slice& right_operand,
+                                            std::string* new_value,
+                                            Logger* logger) const {
+  return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_test.cc b/src/rocksdb/db/merge_test.cc
new file mode 100644
index 000000000..0d373d41e
--- /dev/null
+++ b/src/rocksdb/db/merge_test.cc
@@ -0,0 +1,629 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include <assert.h>
+
+#include <iostream>
+#include <memory>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool use_compression;
+
+class MergeTest : public testing::Test {};
+
+size_t num_merge_operator_calls;
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
+
+size_t num_partial_merge_calls;
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+  CountMergeOperator() {
+    mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+  }
+
+  bool Merge(const Slice& key, const Slice* existing_value, const Slice& value,
+             std::string* new_value, Logger* logger) const override {
+    assert(new_value->empty());
+    ++num_merge_operator_calls;
+    if (existing_value == nullptr) {
+      new_value->assign(value.data(), value.size());
+      return true;
+    }
+
+    return mergeOperator_->PartialMerge(key, *existing_value, value, new_value,
+                                        logger);
+  }
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override {
+    assert(new_value->empty());
+    ++num_partial_merge_calls;
+    return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+                                             logger);
+  }
+
+  const char* Name() const override { return "UInt64AddOperator"; }
+
+ private:
+  std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+class EnvMergeTest : public EnvWrapper {
+ public:
+  EnvMergeTest() : EnvWrapper(Env::Default()) {}
+  static const char* kClassName() { return "MergeEnv"; }
+  const char* Name() const override { return kClassName(); }
+  //  ~EnvMergeTest() override {}
+
+  uint64_t NowNanos() override {
+    ++now_nanos_count_;
+    return target()->NowNanos();
+  }
+
+  static uint64_t now_nanos_count_;
+
+  static std::unique_ptr<EnvMergeTest> singleton_;
+
+  static EnvMergeTest* GetInstance() {
+    if (nullptr == singleton_) singleton_.reset(new EnvMergeTest);
+    return singleton_.get();
+  }
+};
+
+uint64_t EnvMergeTest::now_nanos_count_{0};
+std::unique_ptr<EnvMergeTest> EnvMergeTest::singleton_;
+
+std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
+                           const size_t max_successive_merges = 0) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = std::make_shared<CountMergeOperator>();
+  options.max_successive_merges = max_successive_merges;
+  options.env = EnvMergeTest::GetInstance();
+  EXPECT_OK(DestroyDB(dbname, Options()));
+  Status s;
+// DBWithTTL is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+  if (ttl) {
+    DBWithTTL* db_with_ttl;
+    s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+    db = db_with_ttl;
+  } else {
+    s = DB::Open(options, dbname, &db);
+  }
+#else
+  assert(!ttl);
+  s = DB::Open(options, dbname, &db);
+#endif  // !ROCKSDB_LITE
+  EXPECT_OK(s);
+  assert(s.ok());
+  // Allowed to call NowNanos during DB creation (in GenerateRawUniqueId() for
+  // session ID)
+  EnvMergeTest::now_nanos_count_ = 0;
+  return std::shared_ptr<DB>(db);
+}
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high level operations:
+// set, add, get and remove
+// This is a quick implementation without a Merge operation.
+class Counters {
+ protected:
+  std::shared_ptr<DB> db_;
+
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+  WriteOptions delete_option_;
+
+  uint64_t default_;
+
+ public:
+  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : db_(db),
+        put_option_(),
+        get_option_(),
+        delete_option_(),
+        default_(defaultCount) {
+    assert(db_);
+  }
+
+  virtual ~Counters() {}
+
+  // public interface of Counters.
+  // All four functions return false
+  // if the underlying level db operation failed.
+
+  // mapped to a levedb Put
+  bool set(const std::string& key, uint64_t value) {
+    // just treat the internal rep of int64 as the string
+    char buf[sizeof(value)];
+    EncodeFixed64(buf, value);
+    Slice slice(buf, sizeof(value));
+    auto s = db_->Put(put_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Delete
+  bool remove(const std::string& key) {
+    auto s = db_->Delete(delete_option_, key);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Get
+  bool get(const std::string& key, uint64_t* value) {
+    std::string str;
+    auto s = db_->Get(get_option_, key, &str);
+
+    if (s.IsNotFound()) {
+      // return default value if not found;
+      *value = default_;
+      return true;
+    } else if (s.ok()) {
+      // deserialization
+      if (str.size() != sizeof(uint64_t)) {
+        std::cerr << "value corruption\n";
+        return false;
+      }
+      *value = DecodeFixed64(&str[0]);
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // 'add' is implemented as get -> modify -> set
+  // An alternative is a single merge operation, see MergeBasedCounters
+  virtual bool add(const std::string& key, uint64_t value) {
+    uint64_t base = default_;
+    return get(key, &base) && set(key, base + value);
+  }
+
+  // convenience functions for testing
+  void assert_set(const std::string& key, uint64_t value) {
+    assert(set(key, value));
+  }
+
+  void assert_remove(const std::string& key) { assert(remove(key)); }
+
+  uint64_t assert_get(const std::string& key) {
+    uint64_t value = default_;
+    int result = get(key, &value);
+    assert(result);
+    if (result == 0) exit(1);  // Disable unused variable warning.
+    return value;
+  }
+
+  void assert_add(const std::string& key, uint64_t value) {
+    int result = add(key, value);
+    assert(result);
+    if (result == 0) exit(1);  // Disable unused variable warning.
+  }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+  WriteOptions merge_option_;  // for merge
+
+ public:
+  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : Counters(db, defaultCount), merge_option_() {}
+
+  // mapped to a rocksdb Merge operation
+  bool add(const std::string& key, uint64_t value) override {
+    char encoded[sizeof(uint64_t)];
+    EncodeFixed64(encoded, value);
+    Slice slice(encoded, sizeof(uint64_t));
+    auto s = db_->Merge(merge_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+};
+
+void dumpDb(DB* db) {
+  auto it = std::unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    // uint64_t value = DecodeFixed64(it->value().data());
+    // std::cout << it->key().ToString() << ": " << value << std::endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+  FlushOptions o;
+  o.wait = true;
+
+  counters.assert_set("a", 1);
+
+  if (test_compaction) {
+    ASSERT_OK(db->Flush(o));
+  }
+
+  ASSERT_EQ(counters.assert_get("a"), 1);
+
+  counters.assert_remove("b");
+
+  // defaut value is 0 if non-existent
+  ASSERT_EQ(counters.assert_get("b"), 0);
+
+  counters.assert_add("a", 2);
+
+  if (test_compaction) {
+    ASSERT_OK(db->Flush(o));
+  }
+
+  // 1+2 = 3
+  ASSERT_EQ(counters.assert_get("a"), 3);
+
+  dumpDb(db);
+
+  // 1+...+49 = ?
+  uint64_t sum = 0;
+  for (int i = 1; i < 50; i++) {
+    counters.assert_add("b", i);
+    sum += i;
+  }
+  ASSERT_EQ(counters.assert_get("b"), sum);
+
+  dumpDb(db);
+
+  if (test_compaction) {
+    ASSERT_OK(db->Flush(o));
+
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    dumpDb(db);
+
+    ASSERT_EQ(counters.assert_get("a"), 3);
+    ASSERT_EQ(counters.assert_get("b"), sum);
+  }
+}
+
+void testCountersWithFlushAndCompaction(Counters& counters, DB* db) {
+  ASSERT_OK(db->Put({}, "1", "1"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  std::atomic<int> cnt{0};
+  const auto get_thread_id = [&cnt]() {
+    thread_local int thread_id{cnt++};
+    return thread_id;
+  };
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void* /*arg*/) {
+        int thread_id = get_thread_id();
+        if (1 == thread_id) {
+          TEST_SYNC_POINT(
+              "testCountersWithFlushAndCompaction::bg_compact_thread:0");
+        } else if (2 == thread_id) {
+          TEST_SYNC_POINT(
+              "testCountersWithFlushAndCompaction::bg_flush_thread:0");
+        }
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void* /*arg*/) {
+        int thread_id = get_thread_id();
+        if (0 == thread_id) {
+          TEST_SYNC_POINT(
+              "testCountersWithFlushAndCompaction::set_options_thread:0");
+          TEST_SYNC_POINT(
+              "testCountersWithFlushAndCompaction::set_options_thread:1");
+        }
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WakeUpAndDone", [&](void* arg) {
+        auto* mutex = reinterpret_cast<InstrumentedMutex*>(arg);
+        mutex->AssertHeld();
+        int thread_id = get_thread_id();
+        ASSERT_EQ(2, thread_id);
+        mutex->Unlock();
+        TEST_SYNC_POINT(
+            "testCountersWithFlushAndCompaction::bg_flush_thread:1");
+        TEST_SYNC_POINT(
+            "testCountersWithFlushAndCompaction::bg_flush_thread:2");
+        mutex->Lock();
+      });
+  SyncPoint::GetInstance()->LoadDependency({
+      {"testCountersWithFlushAndCompaction::set_options_thread:0",
+       "testCountersWithCompactionAndFlush:BeforeCompact"},
+      {"testCountersWithFlushAndCompaction::bg_compact_thread:0",
+       "testCountersWithFlushAndCompaction:BeforeIncCounters"},
+      {"testCountersWithFlushAndCompaction::bg_flush_thread:0",
+       "testCountersWithFlushAndCompaction::set_options_thread:1"},
+      {"testCountersWithFlushAndCompaction::bg_flush_thread:1",
+       "testCountersWithFlushAndCompaction:BeforeVerification"},
+      {"testCountersWithFlushAndCompaction:AfterGet",
+       "testCountersWithFlushAndCompaction::bg_flush_thread:2"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread set_options_thread([&]() {
+    ASSERT_OK(reinterpret_cast<DBImpl*>(db)->SetOptions(
+        {{"disable_auto_compactions", "false"}}));
+  });
+  TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact");
+  port::Thread compact_thread([&]() {
+    ASSERT_OK(reinterpret_cast<DBImpl*>(db)->CompactRange(
+        CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr));
+  });
+
+  TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeIncCounters");
+  counters.add("test-key", 1);
+
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(db->Flush(flush_opts));
+
+  TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeVerification");
+  std::string expected;
+  PutFixed64(&expected, 1);
+  std::string actual;
+  Status s = db->Get(ReadOptions(), "test-key", &actual);
+  TEST_SYNC_POINT("testCountersWithFlushAndCompaction:AfterGet");
+  set_options_thread.join();
+  compact_thread.join();
+  ASSERT_OK(s);
+  ASSERT_EQ(expected, actual);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
+                         size_t num_merges) {
+  counters.assert_remove("z");
+  uint64_t sum = 0;
+
+  for (size_t i = 1; i <= num_merges; ++i) {
+    resetNumMergeOperatorCalls();
+    counters.assert_add("z", i);
+    sum += i;
+
+    if (i % (max_num_merges + 1) == 0) {
+      ASSERT_EQ(num_merge_operator_calls, max_num_merges + 1);
+    } else {
+      ASSERT_EQ(num_merge_operator_calls, 0);
+    }
+
+    resetNumMergeOperatorCalls();
+    ASSERT_EQ(counters.assert_get("z"), sum);
+    ASSERT_EQ(num_merge_operator_calls, i % (max_num_merges + 1));
+  }
+}
+
+void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
+                      size_t min_merge, size_t count) {
+  FlushOptions o;
+  o.wait = true;
+
+  // Test case 1: partial merge should be called when the number of merge
+  //              operands exceeds the threshold.
+  uint64_t tmp_sum = 0;
+  resetNumPartialMergeCalls();
+  for (size_t i = 1; i <= count; i++) {
+    counters->assert_add("b", i);
+    tmp_sum += i;
+  }
+  ASSERT_OK(db->Flush(o));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(tmp_sum, counters->assert_get("b"));
+  if (count > max_merge) {
+    // in this case, FullMerge should be called instead.
+    ASSERT_EQ(num_partial_merge_calls, 0U);
+  } else {
+    // if count >= min_merge, then partial merge should be called once.
+    ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
+  }
+
+  // Test case 2: partial merge should not be called when a put is found.
+  resetNumPartialMergeCalls();
+  tmp_sum = 0;
+  ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10"));
+  for (size_t i = 1; i <= count; i++) {
+    counters->assert_add("c", i);
+    tmp_sum += i;
+  }
+  ASSERT_OK(db->Flush(o));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(tmp_sum, counters->assert_get("c"));
+  ASSERT_EQ(num_partial_merge_calls, 0U);
+  // NowNanos was previously called in MergeHelper::FilterMerge(), which
+  // harmed performance.
+  ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U);
+}
+
+void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
+                                    size_t num_merges) {
+  ASSERT_GT(num_merges, max_num_merges);
+
+  Slice key("BatchSuccessiveMerge");
+  uint64_t merge_value = 1;
+  char buf[sizeof(merge_value)];
+  EncodeFixed64(buf, merge_value);
+  Slice merge_value_slice(buf, sizeof(merge_value));
+
+  // Create the batch
+  WriteBatch batch;
+  for (size_t i = 0; i < num_merges; ++i) {
+    ASSERT_OK(batch.Merge(key, merge_value_slice));
+  }
+
+  // Apply to memtable and count the number of merges
+  resetNumMergeOperatorCalls();
+  ASSERT_OK(db->Write(WriteOptions(), &batch));
+  ASSERT_EQ(
+      num_merge_operator_calls,
+      static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));
+
+  // Get the value
+  resetNumMergeOperatorCalls();
+  std::string get_value_str;
+  ASSERT_OK(db->Get(ReadOptions(), key, &get_value_str));
+  assert(get_value_str.size() == sizeof(uint64_t));
+  uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+  ASSERT_EQ(get_value, num_merges * merge_value);
+  ASSERT_EQ(num_merge_operator_calls,
+            static_cast<size_t>((num_merges % (max_num_merges + 1))));
+}
+
+void runTest(const std::string& dbname, const bool use_ttl = false) {
+  {
+    auto db = OpenDb(dbname, use_ttl);
+
+    {
+      Counters counters(db, 0);
+      testCounters(counters, db.get(), true);
+    }
+
+    {
+      MergeBasedCounters counters(db, 0);
+      testCounters(counters, db.get(), use_compression);
+    }
+  }
+
+  ASSERT_OK(DestroyDB(dbname, Options()));
+
+  {
+    size_t max_merge = 5;
+    auto db = OpenDb(dbname, use_ttl, max_merge);
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), use_compression);
+    testSuccessiveMerge(counters, max_merge, max_merge * 2);
+    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+    ASSERT_OK(db->Close());
+    ASSERT_OK(DestroyDB(dbname, Options()));
+  }
+
+  {
+    size_t max_merge = 100;
+    // Min merge is hard-coded to 2.
+    uint32_t min_merge = 2;
+    for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
+      auto db = OpenDb(dbname, use_ttl, max_merge);
+      MergeBasedCounters counters(db, 0);
+      testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+      ASSERT_OK(db->Close());
+      ASSERT_OK(DestroyDB(dbname, Options()));
+    }
+    {
+      auto db = OpenDb(dbname, use_ttl, max_merge);
+      MergeBasedCounters counters(db, 0);
+      testPartialMerge(&counters, db.get(), max_merge, min_merge,
+                       min_merge * 10);
+      ASSERT_OK(db->Close());
+      ASSERT_OK(DestroyDB(dbname, Options()));
+    }
+  }
+
+  {
+    {
+      auto db = OpenDb(dbname);
+      MergeBasedCounters counters(db, 0);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+      ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    DB* reopen_db;
+    ASSERT_OK(DB::Open(Options(), dbname, &reopen_db));
+    std::string value;
+    ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value));
+    delete reopen_db;
+    ASSERT_OK(DestroyDB(dbname, Options()));
+  }
+
+  /* Temporary remove this test
+  {
+    std::cout << "Test merge-operator not set after reopen (recovery case)\n";
+    {
+      auto db = OpenDb(dbname);
+      MergeBasedCounters counters(db, 0);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+      counters.add("test-key", 1);
+    }
+
+    DB* reopen_db;
+    ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument());
+  }
+  */
+}
+
+TEST_F(MergeTest, MergeDbTest) {
+  runTest(test::PerThreadDBPath("merge_testdb"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(MergeTest, MergeDbTtlTest) {
+  runTest(test::PerThreadDBPath("merge_testdbttl"),
+          true);  // Run test on TTL database
+}
+
+TEST_F(MergeTest, MergeWithCompactionAndFlush) {
+  const std::string dbname =
+      test::PerThreadDBPath("merge_with_compaction_and_flush");
+  {
+    auto db = OpenDb(dbname);
+    {
+      MergeBasedCounters counters(db, 0);
+      testCountersWithFlushAndCompaction(counters, db.get());
+    }
+  }
+  ASSERT_OK(DestroyDB(dbname, Options()));
+}
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::use_compression = false;
+  if (argc > 1) {
+    ROCKSDB_NAMESPACE::use_compression = true;
+  }
+
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/obsolete_files_test.cc b/src/rocksdb/db/obsolete_files_test.cc
new file mode 100644
index 000000000..8e9f28f65
--- /dev/null
+++ b/src/rocksdb/db/obsolete_files_test.cc
@@ -0,0 +1,328 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ObsoleteFilesTest : public DBTestBase {
+ public:
+  ObsoleteFilesTest()
+      : DBTestBase("obsolete_files_test", /*env_do_fsync=*/true),
+        wal_dir_(dbname_ + "/wal_files") {}
+
+  void AddKeys(int numkeys, int startkey) {
+    WriteOptions options;
+    options.sync = false;
+    for (int i = startkey; i < (numkeys + startkey); i++) {
+      std::string temp = std::to_string(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  void createLevel0Files(int numFiles, int numKeysPerFile) {
+    int startKey = 0;
+    for (int i = 0; i < numFiles; i++) {
+      AddKeys(numKeysPerFile, startKey);
+      startKey += numKeysPerFile;
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
+      ASSERT_OK(
+          dbfull()->TEST_WaitForCompact());  // wait for background flush (flush
+                                             // is also a kind of compaction).
+    }
+  }
+
+  void CheckFileTypeCounts(const std::string& dir, int required_log,
+                           int required_sst, int required_manifest) {
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+    int log_cnt = 0;
+    int sst_cnt = 0;
+    int manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kWalFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    ASSERT_EQ(required_log, log_cnt);
+    ASSERT_EQ(required_sst, sst_cnt);
+    ASSERT_EQ(required_manifest, manifest_cnt);
+  }
+
+  void ReopenDB() {
+    Options options = CurrentOptions();
+    // Trigger compaction when the number of level 0 files reaches 2.
+    options.create_if_missing = true;
+    options.level0_file_num_compaction_trigger = 2;
+    options.disable_auto_compactions = false;
+    options.delete_obsolete_files_period_micros = 0;  // always do full purge
+    options.enable_thread_tracking = true;
+    options.write_buffer_size = 1024 * 1024 * 1000;
+    options.target_file_size_base = 1024 * 1024 * 1000;
+    options.max_bytes_for_level_base = 1024 * 1024 * 1000;
+    options.WAL_ttl_seconds = 300;     // Used to test log files
+    options.WAL_size_limit_MB = 1024;  // Used to test log files
+    options.wal_dir = wal_dir_;
+
+    // Note: the following prevents an otherwise harmless data race between the
+    // test setup code (AddBlobFile) in ObsoleteFilesTest.BlobFiles and the
+    // periodic stat dumping thread.
+    options.stats_dump_period_sec = 0;
+
+    Destroy(options);
+    Reopen(options);
+  }
+
+  const std::string wal_dir_;
+};
+
+TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) {
+  ReopenDB();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles",
+       "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"},
+      {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+       "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"},
+  });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) {
+        Status* p_status = reinterpret_cast<Status*>(arg);
+        ASSERT_OK(*p_status);
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) {
+        std::unordered_set<uint64_t>* files_grabbed_for_purge_ptr =
+            reinterpret_cast<std::unordered_set<uint64_t>*>(arg);
+        ASSERT_TRUE(files_grabbed_for_purge_ptr->empty());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  createLevel0Files(2, 50000);
+  CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+  port::Thread user_thread([this]() {
+    JobContext jobCxt(0);
+    TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1");
+    dbfull()->TEST_LockMutex();
+    dbfull()->FindObsoleteFiles(&jobCxt, true /* force=true */,
+                                false /* no_full_scan=false */);
+    dbfull()->TEST_UnlockMutex();
+    TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2");
+    dbfull()->PurgeObsoleteFiles(jobCxt);
+    jobCxt.Clean();
+  });
+
+  user_thread.join();
+}
+
+TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) {
+  ReopenDB();
+
+  createLevel0Files(2, 50000);
+  CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+  ASSERT_OK(dbfull()->DisableFileDeletions());
+  for (int i = 0; i != 4; ++i) {
+    if (i % 2) {
+      ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+                                     {{"paranoid_file_checks", "false"}}));
+    } else {
+      ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+                                     {{"paranoid_file_checks", "true"}}));
+    }
+  }
+  ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */));
+
+  Close();
+
+  std::vector<std::string> files;
+  int opts_file_count = 0;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  for (const auto& file : files) {
+    uint64_t file_num;
+    Slice dummy_info_log_name_prefix;
+    FileType type;
+    WalFileType log_type;
+    if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type,
+                      &log_type) &&
+        type == kOptionsFile) {
+      opts_file_count++;
+    }
+  }
+  ASSERT_EQ(2, opts_file_count);
+}
+
+TEST_F(ObsoleteFilesTest, BlobFiles) {
+  ReopenDB();
+
+  VersionSet* const versions = dbfull()->GetVersionSet();
+  assert(versions);
+  assert(versions->GetColumnFamilySet());
+
+  ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  const ImmutableCFOptions* const ioptions = cfd->ioptions();
+  assert(ioptions);
+  assert(!ioptions->cf_paths.empty());
+
+  const std::string& path = ioptions->cf_paths.front().path;
+
+  // Add an obsolete blob file.
+  constexpr uint64_t first_blob_file_number = 234;
+  versions->AddObsoleteBlobFile(first_blob_file_number, path);
+
+  // Add a live blob file.
+  Version* const version = cfd->current();
+  assert(version);
+
+  VersionStorageInfo* const storage_info = version->storage_info();
+  assert(storage_info);
+
+  constexpr uint64_t second_blob_file_number = 456;
+  constexpr uint64_t second_total_blob_count = 100;
+  constexpr uint64_t second_total_blob_bytes = 2000000;
+  constexpr char second_checksum_method[] = "CRC32B";
+  constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a";
+
+  auto shared_meta = SharedBlobFileMetaData::Create(
+      second_blob_file_number, second_total_blob_count, second_total_blob_bytes,
+      second_checksum_method, second_checksum_value);
+
+  constexpr uint64_t second_garbage_blob_count = 0;
+  constexpr uint64_t second_garbage_blob_bytes = 0;
+
+  auto meta = BlobFileMetaData::Create(
+      std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
+      second_garbage_blob_count, second_garbage_blob_bytes);
+
+  storage_info->AddBlobFile(std::move(meta));
+
+  // Check for obsolete files and make sure the first blob file is picked up
+  // and grabbed for purge. The second blob file should be on the live list.
+  constexpr int job_id = 0;
+  JobContext job_context{job_id};
+
+  dbfull()->TEST_LockMutex();
+  constexpr bool force_full_scan = false;
+  dbfull()->FindObsoleteFiles(&job_context, force_full_scan);
+  dbfull()->TEST_UnlockMutex();
+
+  ASSERT_TRUE(job_context.HaveSomethingToDelete());
+  ASSERT_EQ(job_context.blob_delete_files.size(), 1);
+  ASSERT_EQ(job_context.blob_delete_files[0].GetBlobFileNumber(),
+            first_blob_file_number);
+
+  const auto& files_grabbed_for_purge =
+      dbfull()->TEST_GetFilesGrabbedForPurge();
+  ASSERT_NE(files_grabbed_for_purge.find(first_blob_file_number),
+            files_grabbed_for_purge.end());
+
+  ASSERT_EQ(job_context.blob_live.size(), 1);
+  ASSERT_EQ(job_context.blob_live[0], second_blob_file_number);
+
+  // Hack the job context a bit by adding a few files to the full scan
+  // list and adjusting the pending file number. We add the two files
+  // above as well as two additional ones, where one is old
+  // and should be cleaned up, and the other is still pending.
+  constexpr uint64_t old_blob_file_number = 123;
+  constexpr uint64_t pending_blob_file_number = 567;
+
+  job_context.full_scan_candidate_files.emplace_back(
+      BlobFileName(old_blob_file_number), path);
+  job_context.full_scan_candidate_files.emplace_back(
+      BlobFileName(first_blob_file_number), path);
+  job_context.full_scan_candidate_files.emplace_back(
+      BlobFileName(second_blob_file_number), path);
+  job_context.full_scan_candidate_files.emplace_back(
+      BlobFileName(pending_blob_file_number), path);
+
+  job_context.min_pending_output = pending_blob_file_number;
+
+  // Purge obsolete files and make sure we purge the old file and the first file
+  // (and keep the second file and the pending file).
+  std::vector<std::string> deleted_files;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", [&](void* arg) {
+        const std::string* file = static_cast<std::string*>(arg);
+        assert(file);
+
+        constexpr char blob_extension[] = ".blob";
+
+        if (file->find(blob_extension) != std::string::npos) {
+          deleted_files.emplace_back(*file);
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  dbfull()->PurgeObsoleteFiles(job_context);
+  job_context.Clean();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(files_grabbed_for_purge.find(first_blob_file_number),
+            files_grabbed_for_purge.end());
+
+  std::sort(deleted_files.begin(), deleted_files.end());
+  const std::vector<std::string> expected_deleted_files{
+      BlobFileName(path, old_blob_file_number),
+      BlobFileName(path, first_blob_file_number)};
+
+  ASSERT_EQ(deleted_files, expected_deleted_files);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/options_file_test.cc b/src/rocksdb/db/options_file_test.cc
new file mode 100644
index 000000000..eb02e6ca4
--- /dev/null
+++ b/src/rocksdb/db/options_file_test.cc
@@ -0,0 +1,120 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionsFileTest : public testing::Test {
+ public:
+  OptionsFileTest() : dbname_(test::PerThreadDBPath("options_file_test")) {}
+
+  std::string dbname_;
+};
+
+namespace {
+void UpdateOptionsFiles(DB* db,
+                        std::unordered_set<std::string>* filename_history,
+                        int* options_files_count) {
+  std::vector<std::string> filenames;
+  EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames));
+  uint64_t number;
+  FileType type;
+  *options_files_count = 0;
+  for (auto filename : filenames) {
+    if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+      filename_history->insert(filename);
+      (*options_files_count)++;
+    }
+  }
+}
+
+// Verify whether the current Options Files are the latest ones.
+void VerifyOptionsFileName(
+    DB* db, const std::unordered_set<std::string>& past_filenames) {
+  std::vector<std::string> filenames;
+  std::unordered_set<std::string> current_filenames;
+  EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames));
+  uint64_t number;
+  FileType type;
+  for (auto filename : filenames) {
+    if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+      current_filenames.insert(filename);
+    }
+  }
+  for (auto past_filename : past_filenames) {
+    if (current_filenames.find(past_filename) != current_filenames.end()) {
+      continue;
+    }
+    for (auto filename : current_filenames) {
+      ASSERT_GT(filename, past_filename);
+    }
+  }
+}
+}  // anonymous namespace
+
+TEST_F(OptionsFileTest, NumberOfOptionsFiles) {
+  const int kReopenCount = 20;
+  Options opt;
+  opt.create_if_missing = true;
+  ASSERT_OK(DestroyDB(dbname_, opt));
+  std::unordered_set<std::string> filename_history;
+  DB* db;
+  for (int i = 0; i < kReopenCount; ++i) {
+    ASSERT_OK(DB::Open(opt, dbname_, &db));
+    int num_options_files = 0;
+    UpdateOptionsFiles(db, &filename_history, &num_options_files);
+    ASSERT_GT(num_options_files, 0);
+    ASSERT_LE(num_options_files, 2);
+    // Make sure we always keep the latest option files.
+    VerifyOptionsFileName(db, filename_history);
+    delete db;
+  }
+}
+
+TEST_F(OptionsFileTest, OptionsFileName) {
+  const uint64_t kOptionsFileNum = 12345;
+  uint64_t number;
+  FileType type;
+
+  auto options_file_name = OptionsFileName("", kOptionsFileNum);
+  ASSERT_TRUE(ParseFileName(options_file_name, &number, &type, nullptr));
+  ASSERT_EQ(type, kOptionsFile);
+  ASSERT_EQ(number, kOptionsFileNum);
+
+  const uint64_t kTempOptionsFileNum = 54352;
+  auto temp_options_file_name = TempOptionsFileName("", kTempOptionsFileNum);
+  ASSERT_TRUE(ParseFileName(temp_options_file_name, &number, &type, nullptr));
+  ASSERT_NE(temp_options_file_name.find(kTempFileNameSuffix),
+            std::string::npos);
+  ASSERT_EQ(type, kTempFile);
+  ASSERT_EQ(number, kTempOptionsFileNum);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+}
+#else
+
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+  printf("Skipped as Options file is not supported in RocksDBLite.\n");
+  return 0;
+}
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/output_validator.cc b/src/rocksdb/db/output_validator.cc
new file mode 100644
index 000000000..e93e2d68c
--- /dev/null
+++ b/src/rocksdb/db/output_validator.cc
@@ -0,0 +1,33 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/output_validator.h"
+
+#include "test_util/sync_point.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status OutputValidator::Add(const Slice& key, const Slice& value) {
+  if (enable_hash_) {
+    // Generate a rolling 64-bit hash of the key and values
+    paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_);
+    paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_);
+  }
+  if (enable_order_check_) {
+    TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check",
+                             /*arg=*/nullptr);
+    if (key.size() < kNumInternalBytes) {
+      return Status::Corruption(
+          "Compaction tries to write a key without internal bytes.");
+    }
+    // prev_key_ starts with empty.
+    if (!prev_key_.empty() && icmp_.Compare(key, prev_key_) < 0) {
+      return Status::Corruption("Compaction sees out-of-order keys.");
+    }
+    prev_key_.assign(key.data(), key.size());
+  }
+  return Status::OK();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/output_validator.h b/src/rocksdb/db/output_validator.h
new file mode 100644
index 000000000..40635f9c4
--- /dev/null
+++ b/src/rocksdb/db/output_validator.h
@@ -0,0 +1,48 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A class that validates key/value that is inserted to an SST file.
+// Pass every key/value of the file using OutputValidator::Add()
+// and the class validates key order and optionally calculate a hash
+// of all the key and value.
+class OutputValidator {
+ public:
+  explicit OutputValidator(const InternalKeyComparator& icmp,
+                           bool enable_order_check, bool enable_hash,
+                           uint64_t precalculated_hash = 0)
+      : icmp_(icmp),
+        paranoid_hash_(precalculated_hash),
+        enable_order_check_(enable_order_check),
+        enable_hash_(enable_hash) {}
+
+  // Add a key to the KV sequence, and return whether the key follows
+  // criteria, e.g. key is ordered.
+  Status Add(const Slice& key, const Slice& value);
+
+  // Compare result of two key orders are the same. It can be used
+  // to compare the keys inserted into a file, and what is read back.
+  // Return true if the validation passes.
+  bool CompareValidator(const OutputValidator& other_validator) {
+    return GetHash() == other_validator.GetHash();
+  }
+
+  // Not (yet) intended to be persisted, so subject to change
+  // without notice between releases.
+  uint64_t GetHash() const { return paranoid_hash_; }
+
+ private:
+  const InternalKeyComparator& icmp_;
+  std::string prev_key_;
+  uint64_t paranoid_hash_ = 0;
+  bool enable_order_check_;
+  bool enable_hash_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/perf_context_test.cc b/src/rocksdb/db/perf_context_test.cc
new file mode 100644
index 000000000..454d12dc5
--- /dev/null
+++ b/src/rocksdb/db/perf_context_test.cc
@@ -0,0 +1,1010 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "rocksdb/perf_context.h"
+
+#include <algorithm>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+bool FLAGS_verbose = false;
+
+// Path to the database on file system
+const std::string kDbName =
+    ROCKSDB_NAMESPACE::test::PerThreadDBPath("perf_context_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<DB> OpenDb(bool read_only = false) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.max_open_files = -1;
+  options.write_buffer_size = FLAGS_write_buffer_size;
+  options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+  options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+  if (FLAGS_use_set_based_memetable) {
+#ifndef ROCKSDB_LITE
+    options.prefix_extractor.reset(
+        ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0));
+    options.memtable_factory.reset(NewHashSkipListRepFactory());
+#endif  // ROCKSDB_LITE
+  }
+
+  Status s;
+  if (!read_only) {
+    s = DB::Open(options, kDbName, &db);
+  } else {
+    s = DB::OpenForReadOnly(options, kDbName, &db);
+  }
+  EXPECT_OK(s);
+  return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest : public testing::Test {};
+
+TEST_F(PerfContextTest, SeekIntoDeletion) {
+  ASSERT_OK(DestroyDB(kDbName, Options()));
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    ASSERT_OK(db->Put(write_options, key, value));
+  }
+
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + std::to_string(i);
+    ASSERT_OK(db->Delete(write_options, key));
+  }
+
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_time;
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value;
+
+    get_perf_context()->Reset();
+    StopWatchNano timer(SystemClock::Default().get());
+    timer.Start();
+    auto status = db->Get(read_options, key, &value);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    ASSERT_TRUE(status.IsNotFound());
+    hist_get.Add(get_perf_context()->user_key_comparison_count);
+    hist_get_time.Add(elapsed_nanos);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Get user key comparison: \n"
+              << hist_get.ToString() << "Get time: \n"
+              << hist_get_time.ToString();
+  }
+
+  {
+    HistogramImpl hist_seek_to_first;
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    get_perf_context()->Reset();
+    StopWatchNano timer(SystemClock::Default().get(), true);
+    iter->SeekToFirst();
+    hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count);
+    auto elapsed_nanos = timer.ElapsedNanos();
+
+    if (FLAGS_verbose) {
+      std::cout << "SeekToFirst user key comparison: \n"
+                << hist_seek_to_first.ToString() << "ikey skipped: "
+                << get_perf_context()->internal_key_skipped_count << "\n"
+                << "idelete skipped: "
+                << get_perf_context()->internal_delete_skipped_count << "\n"
+                << "elapsed: " << elapsed_nanos << "\n";
+    }
+  }
+
+  HistogramImpl hist_seek;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    std::string key = "k" + std::to_string(i);
+
+    get_perf_context()->Reset();
+    StopWatchNano timer(SystemClock::Default().get(), true);
+    iter->Seek(key);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    hist_seek.Add(get_perf_context()->user_key_comparison_count);
+    if (FLAGS_verbose) {
+      std::cout << "seek cmp: " << get_perf_context()->user_key_comparison_count
+                << " ikey skipped "
+                << get_perf_context()->internal_key_skipped_count
+                << " idelete skipped "
+                << get_perf_context()->internal_delete_skipped_count
+                << " elapsed: " << elapsed_nanos << "ns\n";
+    }
+
+    get_perf_context()->Reset();
+    ASSERT_TRUE(iter->Valid());
+    StopWatchNano timer2(SystemClock::Default().get(), true);
+    iter->Next();
+    auto elapsed_nanos2 = timer2.ElapsedNanos();
+    if (FLAGS_verbose) {
+      std::cout << "next cmp: " << get_perf_context()->user_key_comparison_count
+                << "elapsed: " << elapsed_nanos2 << "ns\n";
+    }
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Seek user key comparison: \n" << hist_seek.ToString();
+  }
+}
+
+TEST_F(PerfContextTest, StopWatchNanoOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatchNano timer(SystemClock::Default().get(), true);
+  for (auto& timing : timings) {
+    timing = timer.ElapsedNanos(true /* reset */);
+  }
+
+  HistogramImpl histogram;
+  for (const auto timing : timings) {
+    histogram.Add(timing);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << histogram.ToString();
+  }
+}
+
+TEST_F(PerfContextTest, StopWatchOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  uint64_t elapsed = 0;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed);
+  for (auto& timing : timings) {
+    timing = elapsed;
+  }
+
+  HistogramImpl histogram;
+  uint64_t prev_timing = 0;
+  for (const auto timing : timings) {
+    histogram.Add(timing - prev_timing);
+    prev_timing = timing;
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << histogram.ToString();
+  }
+}
+
+void ProfileQueries(bool enabled_time = false) {
+  ASSERT_OK(DestroyDB(kDbName, Options()));  // Start this test with a fresh DB
+
+  auto db = OpenDb();
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  HistogramImpl hist_put;
+
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_snapshot;
+  HistogramImpl hist_get_memtable;
+  HistogramImpl hist_get_files;
+  HistogramImpl hist_get_post_process;
+  HistogramImpl hist_num_memtable_checked;
+
+  HistogramImpl hist_mget;
+  HistogramImpl hist_mget_snapshot;
+  HistogramImpl hist_mget_memtable;
+  HistogramImpl hist_mget_files;
+  HistogramImpl hist_mget_post_process;
+  HistogramImpl hist_mget_num_memtable_checked;
+
+  HistogramImpl hist_write_pre_post;
+  HistogramImpl hist_write_wal_time;
+  HistogramImpl hist_write_memtable_time;
+  HistogramImpl hist_write_delay_time;
+  HistogramImpl hist_write_thread_wait_nanos;
+  HistogramImpl hist_write_scheduling_time;
+
+  uint64_t total_db_mutex_nanos = 0;
+
+  if (FLAGS_verbose) {
+    std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+  }
+
+  std::vector<int> keys;
+  const int kFlushFlag = -1;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+    if (i == FLAGS_total_keys / 2) {
+      // Issuing a flush in the middle.
+      keys.push_back(kFlushFlag);
+    }
+  }
+
+  if (FLAGS_random_key) {
+    RandomShuffle(std::begin(keys), std::end(keys));
+  }
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
+#endif
+  int num_mutex_waited = 0;
+  for (const int i : keys) {
+    if (i == kFlushFlag) {
+      FlushOptions fo;
+      db->Flush(fo);
+      continue;
+    }
+
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    std::vector<std::string> values;
+
+    get_perf_context()->Reset();
+    ASSERT_OK(db->Put(write_options, key, value));
+    if (++num_mutex_waited > 3) {
+#ifndef NDEBUG
+      ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+    }
+    hist_write_pre_post.Add(
+        get_perf_context()->write_pre_and_post_process_time);
+    hist_write_wal_time.Add(get_perf_context()->write_wal_time);
+    hist_write_memtable_time.Add(get_perf_context()->write_memtable_time);
+    hist_write_delay_time.Add(get_perf_context()->write_delay_time);
+    hist_write_thread_wait_nanos.Add(
+        get_perf_context()->write_thread_wait_nanos);
+    hist_write_scheduling_time.Add(
+        get_perf_context()->write_scheduling_flushes_compactions_time);
+    hist_put.Add(get_perf_context()->user_key_comparison_count);
+    total_db_mutex_nanos += get_perf_context()->db_mutex_lock_nanos;
+  }
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+
+  for (const int i : keys) {
+    if (i == kFlushFlag) {
+      continue;
+    }
+    std::string key = "k" + std::to_string(i);
+    std::string expected_value = "v" + std::to_string(i);
+    std::string value;
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;
+
+    get_perf_context()->Reset();
+    ASSERT_OK(db->Get(read_options, key, &value));
+    ASSERT_EQ(expected_value, value);
+    hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+    hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+    get_perf_context()->Reset();
+    auto statuses = db->MultiGet(read_options, multiget_keys, &values);
+    for (const auto& s : statuses) {
+      ASSERT_OK(s);
+    }
+    hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(
+        get_perf_context()->get_from_memtable_count);
+    hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_mget.Add(get_perf_context()->user_key_comparison_count);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Put user key comparison: \n"
+              << hist_put.ToString() << "Get user key comparison: \n"
+              << hist_get.ToString() << "MultiGet user key comparison: \n"
+              << hist_get.ToString();
+    std::cout << "Put(): Pre and Post Process Time: \n"
+              << hist_write_pre_post.ToString() << " Writing WAL time: \n"
+              << hist_write_wal_time.ToString() << "\n"
+              << " Writing Mem Table time: \n"
+              << hist_write_memtable_time.ToString() << "\n"
+              << " Write Delay: \n"
+              << hist_write_delay_time.ToString() << "\n"
+              << " Waiting for Batch time: \n"
+              << hist_write_thread_wait_nanos.ToString() << "\n"
+              << " Scheduling Flushes and Compactions Time: \n"
+              << hist_write_scheduling_time.ToString() << "\n"
+              << " Total DB mutex nanos: \n"
+              << total_db_mutex_nanos << "\n";
+
+    std::cout << "Get(): Time to get snapshot: \n"
+              << hist_get_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_get_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_get_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_get_post_process.ToString() << "\n";
+
+    std::cout << "MultiGet(): Time to get snapshot: \n"
+              << hist_mget_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_mget_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_mget_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_mget_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_mget_post_process.ToString() << "\n";
+  }
+
+  if (enabled_time) {
+    ASSERT_GT(hist_get.Average(), 0);
+    ASSERT_GT(hist_get_snapshot.Average(), 0);
+    ASSERT_GT(hist_get_memtable.Average(), 0);
+    ASSERT_GT(hist_get_files.Average(), 0);
+    ASSERT_GT(hist_get_post_process.Average(), 0);
+    ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+
+    ASSERT_GT(hist_mget.Average(), 0);
+    ASSERT_GT(hist_mget_snapshot.Average(), 0);
+    ASSERT_GT(hist_mget_memtable.Average(), 0);
+    ASSERT_GT(hist_mget_files.Average(), 0);
+    ASSERT_GT(hist_mget_post_process.Average(), 0);
+    ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+
+    EXPECT_GT(hist_write_pre_post.Average(), 0);
+    EXPECT_GT(hist_write_wal_time.Average(), 0);
+    EXPECT_GT(hist_write_memtable_time.Average(), 0);
+    EXPECT_EQ(hist_write_delay_time.Average(), 0);
+    EXPECT_EQ(hist_write_thread_wait_nanos.Average(), 0);
+    EXPECT_GT(hist_write_scheduling_time.Average(), 0);
+
+#ifndef NDEBUG
+    ASSERT_LT(total_db_mutex_nanos, 100U);
+#endif
+  }
+
+  db.reset();
+  db = OpenDb(true);
+
+  hist_get.Clear();
+  hist_get_snapshot.Clear();
+  hist_get_memtable.Clear();
+  hist_get_files.Clear();
+  hist_get_post_process.Clear();
+  hist_num_memtable_checked.Clear();
+
+  hist_mget.Clear();
+  hist_mget_snapshot.Clear();
+  hist_mget_memtable.Clear();
+  hist_mget_files.Clear();
+  hist_mget_post_process.Clear();
+  hist_mget_num_memtable_checked.Clear();
+
+  for (const int i : keys) {
+    if (i == kFlushFlag) {
+      continue;
+    }
+    std::string key = "k" + std::to_string(i);
+    std::string expected_value = "v" + std::to_string(i);
+    std::string value;
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;
+
+    get_perf_context()->Reset();
+    ASSERT_OK(db->Get(read_options, key, &value));
+    ASSERT_EQ(expected_value, value);
+    hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+    hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+    get_perf_context()->Reset();
+    auto statuses = db->MultiGet(read_options, multiget_keys, &values);
+    for (const auto& s : statuses) {
+      ASSERT_OK(s);
+    }
+    hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+    hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+    hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(
+        get_perf_context()->get_from_memtable_count);
+    hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+    hist_mget.Add(get_perf_context()->user_key_comparison_count);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "ReadOnly Get user key comparison: \n"
+              << hist_get.ToString()
+              << "ReadOnly MultiGet user key comparison: \n"
+              << hist_mget.ToString();
+
+    std::cout << "ReadOnly Get(): Time to get snapshot: \n"
+              << hist_get_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_get_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_get_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_get_post_process.ToString() << "\n";
+
+    std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n"
+              << hist_mget_snapshot.ToString()
+              << " Time to get value from memtables: \n"
+              << hist_mget_memtable.ToString() << "\n"
+              << " Time to get value from output files: \n"
+              << hist_mget_files.ToString() << "\n"
+              << " Number of memtables checked: \n"
+              << hist_mget_num_memtable_checked.ToString() << "\n"
+              << " Time to post process: \n"
+              << hist_mget_post_process.ToString() << "\n";
+  }
+
+  if (enabled_time) {
+    ASSERT_GT(hist_get.Average(), 0);
+    ASSERT_GT(hist_get_memtable.Average(), 0);
+    ASSERT_GT(hist_get_files.Average(), 0);
+    ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+    // In read-only mode Get(), no super version operation is needed
+    ASSERT_EQ(hist_get_post_process.Average(), 0);
+    ASSERT_GT(hist_get_snapshot.Average(), 0);
+
+    ASSERT_GT(hist_mget.Average(), 0);
+    ASSERT_GT(hist_mget_snapshot.Average(), 0);
+    ASSERT_GT(hist_mget_memtable.Average(), 0);
+    ASSERT_GT(hist_mget_files.Average(), 0);
+    ASSERT_GT(hist_mget_post_process.Average(), 0);
+    ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(PerfContextTest, KeyComparisonCount) {
+  SetPerfLevel(kEnableCount);
+  ProfileQueries();
+
+  SetPerfLevel(kDisable);
+  ProfileQueries();
+
+  SetPerfLevel(kEnableTime);
+  ProfileQueries(true);
+}
+#endif  // ROCKSDB_LITE
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, worst-case Seek Key comparison
+// is close to the total number of keys (linear), when there is only one
+// memtable. When there are two memtables, even the avg Seek Key comparison
+// starts to become linear to the input size.
+
+TEST_F(PerfContextTest, SeekKeyComparison) {
+  ASSERT_OK(DestroyDB(kDbName, Options()));
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  if (FLAGS_verbose) {
+    std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+  }
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    RandomShuffle(std::begin(keys), std::end(keys));
+  }
+
+  HistogramImpl hist_put_time;
+  HistogramImpl hist_wal_time;
+  HistogramImpl hist_time_diff;
+
+  SetPerfLevel(kEnableTime);
+  StopWatchNano timer(SystemClock::Default().get());
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    get_perf_context()->Reset();
+    timer.Start();
+    ASSERT_OK(db->Put(write_options, key, value));
+    auto put_time = timer.ElapsedNanos();
+    hist_put_time.Add(put_time);
+    hist_wal_time.Add(get_perf_context()->write_wal_time);
+    hist_time_diff.Add(put_time - get_perf_context()->write_wal_time);
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << "Put time:\n"
+              << hist_put_time.ToString() << "WAL time:\n"
+              << hist_wal_time.ToString() << "time diff:\n"
+              << hist_time_diff.ToString();
+  }
+
+  HistogramImpl hist_seek;
+  HistogramImpl hist_next;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    get_perf_context()->Reset();
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->value().ToString(), value);
+    hist_seek.Add(get_perf_context()->user_key_comparison_count);
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  for (iter->SeekToFirst(); iter->Valid();) {
+    get_perf_context()->Reset();
+    iter->Next();
+    hist_next.Add(get_perf_context()->user_key_comparison_count);
+  }
+  ASSERT_OK(iter->status());
+  if (FLAGS_verbose) {
+    std::cout << "Seek:\n"
+              << hist_seek.ToString() << "Next:\n"
+              << hist_next.ToString();
+  }
+}
+
+TEST_F(PerfContextTest, DBMutexLockCounter) {
+  int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+  for (PerfLevel perf_level_test :
+       {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) {
+    for (int c = 0; c < 2; ++c) {
+      InstrumentedMutex mutex(nullptr, SystemClock::Default().get(),
+                              stats_code[c]);
+      mutex.Lock();
+      ROCKSDB_NAMESPACE::port::Thread child_thread([&] {
+        SetPerfLevel(perf_level_test);
+        get_perf_context()->Reset();
+        ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+        mutex.Lock();
+        mutex.Unlock();
+        if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex ||
+            stats_code[c] != DB_MUTEX_WAIT_MICROS) {
+          ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+        } else {
+          // increment the counter only when it's a DB Mutex
+          ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0);
+        }
+      });
+      SystemClock::Default()->SleepForMicroseconds(100);
+      mutex.Unlock();
+      child_thread.join();
+    }
+  }
+}
+
+TEST_F(PerfContextTest, FalseDBMutexWait) {
+  SetPerfLevel(kEnableTime);
+  int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+  for (int c = 0; c < 2; ++c) {
+    InstrumentedMutex mutex(nullptr, SystemClock::Default().get(),
+                            stats_code[c]);
+    InstrumentedCondVar lock(&mutex);
+    get_perf_context()->Reset();
+    mutex.Lock();
+    lock.TimedWait(100);
+    mutex.Unlock();
+    if (stats_code[c] == static_cast<int>(DB_MUTEX_WAIT_MICROS)) {
+      // increment the counter only when it's a DB Mutex
+      ASSERT_GT(get_perf_context()->db_condition_wait_nanos, 0);
+    } else {
+      ASSERT_EQ(get_perf_context()->db_condition_wait_nanos, 0);
+    }
+  }
+}
+
+TEST_F(PerfContextTest, ToString) {
+  get_perf_context()->Reset();
+  get_perf_context()->block_read_count = 12345;
+
+  std::string zero_included = get_perf_context()->ToString();
+  ASSERT_NE(std::string::npos, zero_included.find("= 0"));
+  ASSERT_NE(std::string::npos, zero_included.find("= 12345"));
+
+  std::string zero_excluded = get_perf_context()->ToString(true);
+  ASSERT_EQ(std::string::npos, zero_excluded.find("= 0"));
+  ASSERT_NE(std::string::npos, zero_excluded.find("= 12345"));
+}
+
+TEST_F(PerfContextTest, MergeOperatorTime) {
+  ASSERT_OK(DestroyDB(kDbName, Options()));
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Status s = DB::Open(options, kDbName, &db);
+  EXPECT_OK(s);
+
+  std::string val;
+  ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1"));
+  ASSERT_OK(db->Merge(WriteOptions(), "k1", "val2"));
+  ASSERT_OK(db->Merge(WriteOptions(), "k1", "val3"));
+  ASSERT_OK(db->Merge(WriteOptions(), "k1", "val4"));
+
+  SetPerfLevel(kEnableTime);
+  get_perf_context()->Reset();
+  ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+  }
+#endif
+  EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  get_perf_context()->Reset();
+  ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+  }
+#endif
+  EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  get_perf_context()->Reset();
+  ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+  }
+#endif
+  EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+  delete db;
+}
+
+TEST_F(PerfContextTest, CopyAndMove) {
+  // Assignment operator
+  {
+    get_perf_context()->Reset();
+    get_perf_context()->EnablePerLevelPerfContext();
+    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+    ASSERT_EQ(
+        1,
+        (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+    PerfContext perf_context_assign;
+    perf_context_assign = *get_perf_context();
+    ASSERT_EQ(
+        1,
+        (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+    get_perf_context()->ClearPerLevelPerfContext();
+    get_perf_context()->Reset();
+    ASSERT_EQ(
+        1,
+        (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+    perf_context_assign.ClearPerLevelPerfContext();
+    perf_context_assign.Reset();
+  }
+  // Copy constructor
+  {
+    get_perf_context()->Reset();
+    get_perf_context()->EnablePerLevelPerfContext();
+    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+    ASSERT_EQ(
+        1,
+        (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+    PerfContext perf_context_copy(*get_perf_context());
+    ASSERT_EQ(
+        1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+    get_perf_context()->ClearPerLevelPerfContext();
+    get_perf_context()->Reset();
+    ASSERT_EQ(
+        1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+    perf_context_copy.ClearPerLevelPerfContext();
+    perf_context_copy.Reset();
+  }
+  // Move constructor
+  {
+    get_perf_context()->Reset();
+    get_perf_context()->EnablePerLevelPerfContext();
+    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+    ASSERT_EQ(
+        1,
+        (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+    PerfContext perf_context_move = std::move(*get_perf_context());
+    ASSERT_EQ(
+        1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+    get_perf_context()->ClearPerLevelPerfContext();
+    get_perf_context()->Reset();
+    ASSERT_EQ(
+        1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+    perf_context_move.ClearPerLevelPerfContext();
+    perf_context_move.Reset();
+  }
+}
+
+TEST_F(PerfContextTest, PerfContextDisableEnable) {
+  get_perf_context()->Reset();
+  get_perf_context()->EnablePerLevelPerfContext();
+  PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+  get_perf_context()->DisablePerLevelPerfContext();
+  PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+  get_perf_context()->EnablePerLevelPerfContext();
+  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+  get_perf_context()->DisablePerLevelPerfContext();
+  PerfContext perf_context_copy(*get_perf_context());
+  ASSERT_EQ(1, (*(perf_context_copy.level_to_perf_context))[0]
+                   .bloom_filter_full_positive);
+  // this was set when per level perf context is disabled, should not be copied
+  ASSERT_NE(
+      1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+  ASSERT_EQ(
+      1, (*(perf_context_copy.level_to_perf_context))[0].block_cache_hit_count);
+  perf_context_copy.ClearPerLevelPerfContext();
+  perf_context_copy.Reset();
+  get_perf_context()->ClearPerLevelPerfContext();
+  get_perf_context()->Reset();
+}
+
+TEST_F(PerfContextTest, PerfContextByLevelGetSet) {
+  get_perf_context()->Reset();
+  get_perf_context()->EnablePerLevelPerfContext();
+  PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+  PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+  PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+  PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+  PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, 2);
+  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 5, 2);
+  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 2, 3);
+  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 4, 1);
+  ASSERT_EQ(
+      0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+  ASSERT_EQ(
+      1, (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+  ASSERT_EQ(
+      2, (*(get_perf_context()->level_to_perf_context))[7].bloom_filter_useful);
+  ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0]
+                   .bloom_filter_full_positive);
+  ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[2]
+                   .bloom_filter_full_true_positive);
+  ASSERT_EQ(
+      1,
+      (*(get_perf_context()->level_to_perf_context))[0].block_cache_hit_count);
+  ASSERT_EQ(
+      5,
+      (*(get_perf_context()->level_to_perf_context))[2].block_cache_hit_count);
+  ASSERT_EQ(
+      2,
+      (*(get_perf_context()->level_to_perf_context))[3].block_cache_miss_count);
+  ASSERT_EQ(
+      4,
+      (*(get_perf_context()->level_to_perf_context))[1].block_cache_miss_count);
+  std::string zero_excluded = get_perf_context()->ToString(true);
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("bloom_filter_useful = 1@level5, 2@level7"));
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("bloom_filter_full_positive = 1@level0"));
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("bloom_filter_full_true_positive = 1@level2"));
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("block_cache_hit_count = 1@level0, 5@level2"));
+  ASSERT_NE(std::string::npos,
+            zero_excluded.find("block_cache_miss_count = 4@level1, 2@level3"));
+}
+
+TEST_F(PerfContextTest, CPUTimer) {
+  if (SystemClock::Default()->CPUNanos() == 0) {
+    ROCKSDB_GTEST_SKIP("Target without CPUNanos support");
+    return;
+  }
+
+  ASSERT_OK(DestroyDB(kDbName, Options()));
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+  SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+
+  std::string max_str = "0";
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string i_str = std::to_string(i);
+    std::string key = "k" + i_str;
+    std::string value = "v" + i_str;
+    max_str = max_str > i_str ? max_str : i_str;
+
+    ASSERT_OK(db->Put(write_options, key, value));
+  }
+  std::string last_key = "k" + max_str;
+  std::string last_value = "v" + max_str;
+
+  {
+    // Get
+    get_perf_context()->Reset();
+    std::string value;
+    ASSERT_OK(db->Get(read_options, "k0", &value));
+    ASSERT_EQ(value, "v0");
+
+    if (FLAGS_verbose) {
+      std::cout << "Get CPU time nanos: " << get_perf_context()->get_cpu_nanos
+                << "ns\n";
+    }
+
+    // Iter
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    // Seek
+    get_perf_context()->Reset();
+    iter->Seek(last_key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(last_value, iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter Seek CPU time nanos: "
+                << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+    }
+
+    // SeekForPrev
+    get_perf_context()->Reset();
+    iter->SeekForPrev(last_key);
+    ASSERT_TRUE(iter->Valid());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter SeekForPrev CPU time nanos: "
+                << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+    }
+
+    // SeekToLast
+    get_perf_context()->Reset();
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(last_value, iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter SeekToLast CPU time nanos: "
+                << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+    }
+
+    // SeekToFirst
+    get_perf_context()->Reset();
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("v0", iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter SeekToFirst CPU time nanos: "
+                << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+    }
+
+    // Next
+    get_perf_context()->Reset();
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("v1", iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter Next CPU time nanos: "
+                << get_perf_context()->iter_next_cpu_nanos << "ns\n";
+    }
+
+    // Prev
+    get_perf_context()->Reset();
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("v0", iter->value().ToString());
+
+    if (FLAGS_verbose) {
+      std::cout << "Iter Prev CPU time nanos: "
+                << get_perf_context()->iter_prev_cpu_nanos << "ns\n";
+    }
+
+    // monotonically increasing
+    get_perf_context()->Reset();
+    auto count = get_perf_context()->iter_seek_cpu_nanos;
+    for (int i = 0; i < FLAGS_total_keys; ++i) {
+      iter->Seek("k" + std::to_string(i));
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("v" + std::to_string(i), iter->value().ToString());
+      auto next_count = get_perf_context()->iter_seek_cpu_nanos;
+      ASSERT_GT(next_count, count);
+      count = next_count;
+    }
+
+    // iterator creation/destruction; multiple iterators
+    {
+      std::unique_ptr<Iterator> iter2(db->NewIterator(read_options));
+      ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+      iter2->Seek(last_key);
+      ASSERT_TRUE(iter2->Valid());
+      ASSERT_EQ(last_value, iter2->value().ToString());
+      ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, count);
+      count = get_perf_context()->iter_seek_cpu_nanos;
+    }
+    ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  for (int i = 1; i < argc; i++) {
+    int n;
+    char junk;
+
+    if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+      FLAGS_write_buffer_size = n;
+    }
+
+    if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+      FLAGS_total_keys = n;
+    }
+
+    if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_random_key = n;
+    }
+
+    if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_use_set_based_memetable = n;
+    }
+
+    if (sscanf(argv[i], "--verbose=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_verbose = n;
+    }
+  }
+
+  if (FLAGS_verbose) {
+    std::cout << kDbName << "\n";
+  }
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/periodic_task_scheduler.cc b/src/rocksdb/db/periodic_task_scheduler.cc
new file mode 100644
index 000000000..2024510dd
--- /dev/null
+++ b/src/rocksdb/db/periodic_task_scheduler.cc
@@ -0,0 +1,113 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/periodic_task_scheduler.h"
+
+#include "rocksdb/system_clock.h"
+
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+// `timer_mutex` is a global mutex serves 3 purposes currently:
+// (1) to ensure calls to `Start()` and `Shutdown()` are serialized, as
+//     they are currently not implemented in a thread-safe way; and
+// (2) to ensure the `Timer::Add()`s and `Timer::Start()` run atomically, and
+//     the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically.
+// (3) protect tasks_map_ in PeriodicTaskScheduler
+// Note: It's not efficient to have a static global mutex, for
+// PeriodicTaskScheduler it should be okay, as the operations are called
+// infrequently.
+static port::Mutex timer_mutex;
+
+static const std::map<PeriodicTaskType, uint64_t> kDefaultPeriodSeconds = {
+    {PeriodicTaskType::kDumpStats, kInvalidPeriodSec},
+    {PeriodicTaskType::kPersistStats, kInvalidPeriodSec},
+    {PeriodicTaskType::kFlushInfoLog, 10},
+    {PeriodicTaskType::kRecordSeqnoTime, kInvalidPeriodSec},
+};
+
+static const std::map<PeriodicTaskType, std::string> kPeriodicTaskTypeNames = {
+    {PeriodicTaskType::kDumpStats, "dump_st"},
+    {PeriodicTaskType::kPersistStats, "pst_st"},
+    {PeriodicTaskType::kFlushInfoLog, "flush_info_log"},
+    {PeriodicTaskType::kRecordSeqnoTime, "record_seq_time"},
+};
+
+Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
+                                       const PeriodicTaskFunc& fn) {
+  return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type));
+}
+
+Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
+                                       const PeriodicTaskFunc& fn,
+                                       uint64_t repeat_period_seconds) {
+  MutexLock l(&timer_mutex);
+  static std::atomic<uint64_t> initial_delay(0);
+
+  if (repeat_period_seconds == kInvalidPeriodSec) {
+    return Status::InvalidArgument("Invalid task repeat period");
+  }
+  auto it = tasks_map_.find(task_type);
+  if (it != tasks_map_.end()) {
+    // the task already exists and it's the same, no update needed
+    if (it->second.repeat_every_sec == repeat_period_seconds) {
+      return Status::OK();
+    }
+    // cancel the existing one before register new one
+    timer_->Cancel(it->second.name);
+    tasks_map_.erase(it);
+  }
+
+  timer_->Start();
+  // put task type name as prefix, for easy debug
+  std::string unique_id =
+      kPeriodicTaskTypeNames.at(task_type) + std::to_string(id_++);
+
+  bool succeeded = timer_->Add(
+      fn, unique_id,
+      (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond,
+      repeat_period_seconds * kMicrosInSecond);
+  if (!succeeded) {
+    return Status::Aborted("Failed to register periodic task");
+  }
+  auto result = tasks_map_.try_emplace(
+      task_type, TaskInfo{unique_id, repeat_period_seconds});
+  if (!result.second) {
+    return Status::Aborted("Failed to add periodic task");
+  };
+  return Status::OK();
+}
+
+Status PeriodicTaskScheduler::Unregister(PeriodicTaskType task_type) {
+  MutexLock l(&timer_mutex);
+  auto it = tasks_map_.find(task_type);
+  if (it != tasks_map_.end()) {
+    timer_->Cancel(it->second.name);
+    tasks_map_.erase(it);
+  }
+  if (!timer_->HasPendingTask()) {
+    timer_->Shutdown();
+  }
+  return Status::OK();
+}
+
+Timer* PeriodicTaskScheduler::Default() {
+  static Timer timer(SystemClock::Default().get());
+  return &timer;
+}
+
+#ifndef NDEBUG
+void PeriodicTaskScheduler::TEST_OverrideTimer(SystemClock* clock) {
+  static Timer test_timer(clock);
+  test_timer.TEST_OverrideTimer(clock);
+  MutexLock l(&timer_mutex);
+  timer_ = &test_timer;
+}
+#endif  // NDEBUG
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/periodic_task_scheduler.h b/src/rocksdb/db/periodic_task_scheduler.h
new file mode 100644
index 000000000..f45b80c4d
--- /dev/null
+++ b/src/rocksdb/db/periodic_task_scheduler.h
@@ -0,0 +1,110 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "util/timer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+
+using PeriodicTaskFunc = std::function<void()>;
+
+constexpr uint64_t kInvalidPeriodSec = 0;
+
+// List of task types
+enum class PeriodicTaskType : uint8_t {
+  kDumpStats = 0,
+  kPersistStats,
+  kFlushInfoLog,
+  kRecordSeqnoTime,
+  kMax,
+};
+
+// PeriodicTaskScheduler contains the periodic task scheduled from the DB
+// instance. It's used to schedule/unschedule DumpStats(), PersistStats(),
+// FlushInfoLog(), etc. Each type of the task can only have one instance,
+// re-register the same task type would only update the repeat period.
+//
+// Internally, it uses a global single threaded timer object to run the periodic
+// task functions. Timer thread will always be started since the info log
+// flushing cannot be disabled.
+class PeriodicTaskScheduler {
+ public:
+  explicit PeriodicTaskScheduler() = default;
+
+  PeriodicTaskScheduler(const PeriodicTaskScheduler&) = delete;
+  PeriodicTaskScheduler(PeriodicTaskScheduler&&) = delete;
+  PeriodicTaskScheduler& operator=(const PeriodicTaskScheduler&) = delete;
+  PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete;
+
+  // Register a task with its default repeat period
+  Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn);
+
+  // Register a task with specified repeat period. 0 is an invalid argument
+  // (kInvalidPeriodSec). To stop the task, please use Unregister() specifically
+  Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn,
+                  uint64_t repeat_period_seconds);
+
+  // Unregister the task
+  Status Unregister(PeriodicTaskType task_type);
+
+#ifndef NDEBUG
+  // Override the timer for the unittest
+  void TEST_OverrideTimer(SystemClock* clock);
+
+  // Call Timer TEST_WaitForRun() which wait until Timer starting waiting.
+  void TEST_WaitForRun(const std::function<void()>& callback) const {
+    if (timer_ != nullptr) {
+      timer_->TEST_WaitForRun(callback);
+    }
+  }
+
+  // Get global valid task number in the Timer
+  size_t TEST_GetValidTaskNum() const {
+    if (timer_ != nullptr) {
+      return timer_->TEST_GetPendingTaskNum();
+    }
+    return 0;
+  }
+
+  // If it has the specified task type registered
+  bool TEST_HasTask(PeriodicTaskType task_type) const {
+    auto it = tasks_map_.find(task_type);
+    return it != tasks_map_.end();
+  }
+#endif  // NDEBUG
+
+ private:
+  // default global Timer instance
+  static Timer* Default();
+
+  // Internal structure to store task information
+  struct TaskInfo {
+    TaskInfo(std::string _name, uint64_t _repeat_every_sec)
+        : name(std::move(_name)), repeat_every_sec(_repeat_every_sec) {}
+    std::string name;
+    uint64_t repeat_every_sec;
+  };
+
+  // Internal tasks map
+  std::map<PeriodicTaskType, TaskInfo> tasks_map_;
+
+  // Global timer pointer, which doesn't support synchronous add/cancel tasks
+  // so having a global `timer_mutex` for add/cancel task.
+  Timer* timer_ = Default();
+
+  // Global task id, protected by the global `timer_mutex`
+  inline static uint64_t id_;
+
+  static constexpr uint64_t kMicrosInSecond = 1000U * 1000U;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/periodic_task_scheduler_test.cc b/src/rocksdb/db/periodic_task_scheduler_test.cc
new file mode 100644
index 000000000..4abea4d5e
--- /dev/null
+++ b/src/rocksdb/db/periodic_task_scheduler_test.cc
@@ -0,0 +1,231 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/periodic_task_scheduler.h"
+
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "test_util/mock_time_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class PeriodicTaskSchedulerTest : public DBTestBase {
+ public:
+  PeriodicTaskSchedulerTest()
+      : DBTestBase("periodic_task_scheduler_test", /*env_do_fsync=*/true) {
+    mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+    mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_));
+  }
+
+ protected:
+  std::unique_ptr<Env> mock_env_;
+  std::shared_ptr<MockSystemClock> mock_clock_;
+
+  void SetUp() override {
+    mock_clock_->InstallTimedWaitFixCallback();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+          auto periodic_task_scheduler_ptr =
+              reinterpret_cast<PeriodicTaskScheduler*>(arg);
+          periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+        });
+  }
+};
+
+TEST_F(PeriodicTaskSchedulerTest, Basic) {
+  constexpr unsigned int kPeriodSec = 10;
+  Close();
+  Options options;
+  options.stats_dump_period_sec = kPeriodSec;
+  options.stats_persist_period_sec = kPeriodSec;
+  options.create_if_missing = true;
+  options.env = mock_env_.get();
+
+  int dump_st_counter = 0;
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:StartRunning",
+                                        [&](void*) { dump_st_counter++; });
+
+  int pst_st_counter = 0;
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning",
+                                        [&](void*) { pst_st_counter++; });
+
+  int flush_info_log_counter = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::FlushInfoLog:StartRunning",
+      [&](void*) { flush_info_log_counter++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+
+  ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_dump_period_sec);
+  ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+  ASSERT_GT(kPeriodSec, 1u);
+  dbfull()->TEST_WaitForPeridicTaskRun([&] {
+    mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec) - 1);
+  });
+
+  const PeriodicTaskScheduler& scheduler =
+      dbfull()->TEST_GetPeriodicTaskScheduler();
+  ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum());
+
+  ASSERT_EQ(1, dump_st_counter);
+  ASSERT_EQ(1, pst_st_counter);
+  ASSERT_EQ(1, flush_info_log_counter);
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+
+  ASSERT_EQ(2, dump_st_counter);
+  ASSERT_EQ(2, pst_st_counter);
+  ASSERT_EQ(2, flush_info_log_counter);
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+
+  ASSERT_EQ(3, dump_st_counter);
+  ASSERT_EQ(3, pst_st_counter);
+  ASSERT_EQ(3, flush_info_log_counter);
+
+  // Disable scheduler with SetOption
+  ASSERT_OK(dbfull()->SetDBOptions(
+      {{"stats_dump_period_sec", "0"}, {"stats_persist_period_sec", "0"}}));
+  ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_dump_period_sec);
+  ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+  // Info log flush should still run.
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+  ASSERT_EQ(3, dump_st_counter);
+  ASSERT_EQ(3, pst_st_counter);
+  ASSERT_EQ(4, flush_info_log_counter);
+
+  ASSERT_EQ(1u, scheduler.TEST_GetValidTaskNum());
+
+  // Re-enable one task
+  ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "5"}}));
+  ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
+  ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+  ASSERT_EQ(2, scheduler.TEST_GetValidTaskNum());
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+  ASSERT_EQ(4, dump_st_counter);
+  ASSERT_EQ(3, pst_st_counter);
+  ASSERT_EQ(5, flush_info_log_counter);
+
+  Close();
+}
+
+TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
+  constexpr int kPeriodSec = 5;
+  const int kInstanceNum = 10;
+
+  Close();
+  Options options;
+  options.stats_dump_period_sec = kPeriodSec;
+  options.stats_persist_period_sec = kPeriodSec;
+  options.create_if_missing = true;
+  options.env = mock_env_.get();
+
+  int dump_st_counter = 0;
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:2",
+                                        [&](void*) { dump_st_counter++; });
+
+  int pst_st_counter = 0;
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning",
+                                        [&](void*) { pst_st_counter++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto dbs = std::vector<DB*>(kInstanceNum);
+  for (int i = 0; i < kInstanceNum; i++) {
+    ASSERT_OK(
+        DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
+  }
+
+  auto dbi = static_cast_with_check<DBImpl>(dbs[kInstanceNum - 1]);
+
+  const PeriodicTaskScheduler& scheduler = dbi->TEST_GetPeriodicTaskScheduler();
+  ASSERT_EQ(kInstanceNum * 3, scheduler.TEST_GetValidTaskNum());
+
+  int expected_run = kInstanceNum;
+  dbi->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+  ASSERT_EQ(expected_run, dump_st_counter);
+  ASSERT_EQ(expected_run, pst_st_counter);
+
+  expected_run += kInstanceNum;
+  dbi->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  ASSERT_EQ(expected_run, dump_st_counter);
+  ASSERT_EQ(expected_run, pst_st_counter);
+
+  expected_run += kInstanceNum;
+  dbi->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  ASSERT_EQ(expected_run, dump_st_counter);
+  ASSERT_EQ(expected_run, pst_st_counter);
+
+  int half = kInstanceNum / 2;
+  for (int i = 0; i < half; i++) {
+    delete dbs[i];
+  }
+
+  expected_run += (kInstanceNum - half) * 2;
+
+  dbi->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  dbi->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  ASSERT_EQ(expected_run, dump_st_counter);
+  ASSERT_EQ(expected_run, pst_st_counter);
+
+  for (int i = half; i < kInstanceNum; i++) {
+    ASSERT_OK(dbs[i]->Close());
+    delete dbs[i];
+  }
+}
+
+TEST_F(PeriodicTaskSchedulerTest, MultiEnv) {
+  constexpr int kDumpPeriodSec = 5;
+  constexpr int kPersistPeriodSec = 10;
+  Close();
+  Options options1;
+  options1.stats_dump_period_sec = kDumpPeriodSec;
+  options1.stats_persist_period_sec = kPersistPeriodSec;
+  options1.create_if_missing = true;
+  options1.env = mock_env_.get();
+
+  Reopen(options1);
+
+  std::unique_ptr<Env> mock_env2(
+      new CompositeEnvWrapper(Env::Default(), mock_clock_));
+  Options options2;
+  options2.stats_dump_period_sec = kDumpPeriodSec;
+  options2.stats_persist_period_sec = kPersistPeriodSec;
+  options2.create_if_missing = true;
+  options1.env = mock_env2.get();
+
+  std::string dbname = test::PerThreadDBPath("multi_env_test");
+  DB* db;
+  ASSERT_OK(DB::Open(options2, dbname, &db));
+
+  ASSERT_OK(db->Close());
+  delete db;
+  Close();
+}
+
+#endif  // !ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/pinned_iterators_manager.h b/src/rocksdb/db/pinned_iterators_manager.h
new file mode 100644
index 000000000..0fcf231da
--- /dev/null
+++ b/src/rocksdb/db/pinned_iterators_manager.h
@@ -0,0 +1,92 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PinnedIteratorsManager will be notified whenever we need to pin an Iterator
+// and it will be responsible for deleting pinned Iterators when they are
+// not needed anymore.
+class PinnedIteratorsManager : public Cleanable {
+ public:
+  PinnedIteratorsManager() : pinning_enabled(false) {}
+  ~PinnedIteratorsManager() {
+    if (pinning_enabled) {
+      ReleasePinnedData();
+    }
+  }
+
+  // Move constructor and move assignment is allowed.
+  PinnedIteratorsManager(PinnedIteratorsManager&& other) noexcept = default;
+  PinnedIteratorsManager& operator=(PinnedIteratorsManager&& other) noexcept =
+      default;
+
+  // Enable Iterators pinning
+  void StartPinning() {
+    assert(pinning_enabled == false);
+    pinning_enabled = true;
+  }
+
+  // Is pinning enabled ?
+  bool PinningEnabled() { return pinning_enabled; }
+
+  // Take ownership of iter and delete it when ReleasePinnedData() is called
+  void PinIterator(InternalIterator* iter, bool arena = false) {
+    if (arena) {
+      PinPtr(iter, &PinnedIteratorsManager::ReleaseArenaInternalIterator);
+    } else {
+      PinPtr(iter, &PinnedIteratorsManager::ReleaseInternalIterator);
+    }
+  }
+
+  using ReleaseFunction = void (*)(void* arg1);
+  void PinPtr(void* ptr, ReleaseFunction release_func) {
+    assert(pinning_enabled);
+    if (ptr == nullptr) {
+      return;
+    }
+    pinned_ptrs_.emplace_back(ptr, release_func);
+  }
+
+  // Release pinned Iterators
+  inline void ReleasePinnedData() {
+    assert(pinning_enabled == true);
+    pinning_enabled = false;
+
+    // Remove duplicate pointers
+    std::sort(pinned_ptrs_.begin(), pinned_ptrs_.end());
+    auto unique_end = std::unique(pinned_ptrs_.begin(), pinned_ptrs_.end());
+
+    for (auto i = pinned_ptrs_.begin(); i != unique_end; ++i) {
+      void* ptr = i->first;
+      ReleaseFunction release_func = i->second;
+      release_func(ptr);
+    }
+    pinned_ptrs_.clear();
+    // Also do cleanups from the base Cleanable
+    Cleanable::Reset();
+  }
+
+ private:
+  static void ReleaseInternalIterator(void* ptr) {
+    delete reinterpret_cast<InternalIterator*>(ptr);
+  }
+
+  static void ReleaseArenaInternalIterator(void* ptr) {
+    reinterpret_cast<InternalIterator*>(ptr)->~InternalIterator();
+  }
+
+  bool pinning_enabled;
+  std::vector<std::pair<void*, ReleaseFunction>> pinned_ptrs_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/plain_table_db_test.cc b/src/rocksdb/db/plain_table_db_test.cc
new file mode 100644
index 000000000..755b639b0
--- /dev/null
+++ b/src/rocksdb/db/plain_table_db_test.cc
@@ -0,0 +1,1357 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/plain/plain_table_reader.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+class PlainTableKeyDecoderTest : public testing::Test {};
+
+TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) {
+  Random rnd(301);
+  const uint32_t kLength = 2222;
+  std::string tmp = rnd.RandomString(kLength);
+  Slice contents(tmp);
+  test::StringSource* string_source =
+      new test::StringSource(contents, 0, false);
+  std::unique_ptr<FSRandomAccessFile> holder(string_source);
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(std::move(holder), "test"));
+  std::unique_ptr<PlainTableReaderFileInfo> file_info(
+      new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(),
+                                   kLength));
+
+  {
+    PlainTableFileReader reader(file_info.get());
+
+    const uint32_t kReadSize = 77;
+    for (uint32_t pos = 0; pos < kLength; pos += kReadSize) {
+      uint32_t read_size = std::min(kLength - pos, kReadSize);
+      Slice out;
+      ASSERT_TRUE(reader.Read(pos, read_size, &out));
+      ASSERT_EQ(0, out.compare(tmp.substr(pos, read_size)));
+    }
+
+    ASSERT_LT(uint32_t(string_source->total_reads()), kLength / kReadSize / 2);
+  }
+
+  std::vector<std::vector<std::pair<uint32_t, uint32_t>>> reads = {
+      {{600, 30}, {590, 30}, {600, 20}, {600, 40}},
+      {{800, 20}, {100, 20}, {500, 20}, {1500, 20}, {100, 20}, {80, 20}},
+      {{1000, 20}, {500, 20}, {1000, 50}},
+      {{1000, 20}, {500, 20}, {500, 20}},
+      {{1000, 20}, {500, 20}, {200, 20}, {500, 20}},
+      {{1000, 20}, {500, 20}, {200, 20}, {1000, 50}},
+      {{600, 500}, {610, 20}, {100, 20}},
+      {{500, 100}, {490, 100}, {550, 50}},
+  };
+
+  std::vector<int> num_file_reads = {2, 6, 2, 2, 4, 3, 2, 2};
+
+  for (size_t i = 0; i < reads.size(); i++) {
+    string_source->set_total_reads(0);
+    PlainTableFileReader reader(file_info.get());
+    for (auto p : reads[i]) {
+      Slice out;
+      ASSERT_TRUE(reader.Read(p.first, p.second, &out));
+      ASSERT_EQ(0, out.compare(tmp.substr(p.first, p.second)));
+    }
+    ASSERT_EQ(num_file_reads[i], string_source->total_reads());
+  }
+}
+
+class PlainTableDBTest : public testing::Test,
+                         public testing::WithParamInterface<bool> {
+ protected:
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  bool mmap_mode_;
+  Options last_options_;
+
+ public:
+  PlainTableDBTest() : env_(Env::Default()) {}
+
+  ~PlainTableDBTest() override {
+    delete db_;
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  void SetUp() override {
+    mmap_mode_ = GetParam();
+    dbname_ = test::PerThreadDBPath("plain_table_db_test");
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 0;
+    plain_table_options.bloom_bits_per_key = 2;
+    plain_table_options.hash_table_ratio = 0.8;
+    plain_table_options.index_sparseness = 3;
+    plain_table_options.huge_page_tlb_size = 0;
+    plain_table_options.encoding_type = kPrefix;
+    plain_table_options.full_scan_mode = false;
+    plain_table_options.store_index_in_file = false;
+
+    options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.allow_mmap_reads = mmap_mode_;
+    options.allow_concurrent_memtable_write = false;
+    options.unordered_write = false;
+    return options;
+  }
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+  void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
+
+  void Close() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  bool mmap_mode() const { return mmap_mode_; }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    // Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  Status PureReopen(Options* options, DB** db) {
+    return DB::Open(*options, dbname_, db);
+  }
+
+  Status ReopenForReadOnly(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    return DB::OpenForReadOnly(*options, dbname_, &db_);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(level), &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    size_t last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+};
+
+TEST_P(PlainTableDBTest, Empty) {
+  ASSERT_TRUE(dbfull() != nullptr);
+  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+extern const uint64_t kPlainTableMagicNumber;
+
+class TestPlainTableReader : public PlainTableReader {
+ public:
+  TestPlainTableReader(
+      const EnvOptions& env_options, const InternalKeyComparator& icomparator,
+      EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key,
+      double hash_table_ratio, size_t index_sparseness,
+      std::unique_ptr<TableProperties>&& props,
+      std::unique_ptr<RandomAccessFileReader>&& file,
+      const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor,
+      bool* expect_bloom_not_match, bool store_index_in_file,
+      uint32_t column_family_id, const std::string& column_family_name)
+      : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
+                         encoding_type, file_size, props.get(),
+                         prefix_extractor),
+        expect_bloom_not_match_(expect_bloom_not_match) {
+    Status s = MmapDataIfNeeded();
+    EXPECT_TRUE(s.ok());
+
+    s = PopulateIndex(props.get(), bloom_bits_per_key, hash_table_ratio,
+                      index_sparseness, 2 * 1024 * 1024);
+    EXPECT_TRUE(s.ok());
+
+    EXPECT_EQ(column_family_id, static_cast<uint32_t>(props->column_family_id));
+    EXPECT_EQ(column_family_name, props->column_family_name);
+    if (store_index_in_file) {
+      auto bloom_version_ptr = props->user_collected_properties.find(
+          PlainTablePropertyNames::kBloomVersion);
+      EXPECT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
+      EXPECT_EQ(bloom_version_ptr->second, std::string("1"));
+      if (ioptions.bloom_locality > 0) {
+        auto num_blocks_ptr = props->user_collected_properties.find(
+            PlainTablePropertyNames::kNumBloomBlocks);
+        EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
+      }
+    }
+    table_properties_ = std::move(props);
+  }
+
+  ~TestPlainTableReader() override {}
+
+ private:
+  bool MatchBloom(uint32_t hash) const override {
+    bool ret = PlainTableReader::MatchBloom(hash);
+    if (*expect_bloom_not_match_) {
+      EXPECT_TRUE(!ret);
+    } else {
+      EXPECT_TRUE(ret);
+    }
+    return ret;
+  }
+  bool* expect_bloom_not_match_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+class TestPlainTableFactory : public PlainTableFactory {
+ public:
+  explicit TestPlainTableFactory(bool* expect_bloom_not_match,
+                                 const PlainTableOptions& options,
+                                 uint32_t column_family_id,
+                                 std::string column_family_name)
+      : PlainTableFactory(options),
+        bloom_bits_per_key_(options.bloom_bits_per_key),
+        hash_table_ratio_(options.hash_table_ratio),
+        index_sparseness_(options.index_sparseness),
+        store_index_in_file_(options.store_index_in_file),
+        expect_bloom_not_match_(expect_bloom_not_match),
+        column_family_id_(column_family_id),
+        column_family_name_(std::move(column_family_name)) {}
+
+  using PlainTableFactory::NewTableReader;
+  Status NewTableReader(
+      const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table,
+      bool /*prefetch_index_and_filter_in_cache*/) const override {
+    std::unique_ptr<TableProperties> props;
+    auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                                 table_reader_options.ioptions, &props);
+    EXPECT_TRUE(s.ok());
+
+    if (store_index_in_file_) {
+      BlockHandle bloom_block_handle;
+      s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber,
+                              table_reader_options.ioptions,
+                              BloomBlockBuilder::kBloomBlock,
+                              &bloom_block_handle);
+      EXPECT_TRUE(s.ok());
+
+      BlockHandle index_block_handle;
+      s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber,
+                              table_reader_options.ioptions,
+                              PlainTableIndexBuilder::kPlainTableIndexBlock,
+                              &index_block_handle);
+      EXPECT_TRUE(s.ok());
+    }
+
+    auto& user_props = props->user_collected_properties;
+    auto encoding_type_prop =
+        user_props.find(PlainTablePropertyNames::kEncodingType);
+    assert(encoding_type_prop != user_props.end());
+    EncodingType encoding_type = static_cast<EncodingType>(
+        DecodeFixed32(encoding_type_prop->second.c_str()));
+
+    std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
+        table_reader_options.env_options,
+        table_reader_options.internal_comparator, encoding_type, file_size,
+        bloom_bits_per_key_, hash_table_ratio_, index_sparseness_,
+        std::move(props), std::move(file), table_reader_options.ioptions,
+        table_reader_options.prefix_extractor.get(), expect_bloom_not_match_,
+        store_index_in_file_, column_family_id_, column_family_name_));
+
+    *table = std::move(new_reader);
+    return s;
+  }
+
+ private:
+  int bloom_bits_per_key_;
+  double hash_table_ratio_;
+  size_t index_sparseness_;
+  bool store_index_in_file_;
+  bool* expect_bloom_not_match_;
+  const uint32_t column_family_id_;
+  const std::string column_family_name_;
+};
+
+TEST_P(PlainTableDBTest, BadOptions1) {
+  // Build with a prefix extractor
+  ASSERT_OK(Put("1000000000000foo", "v1"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  // Bad attempt to re-open without a prefix extractor
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset();
+  ASSERT_EQ(
+      "Invalid argument: Prefix extractor is missing when opening a PlainTable "
+      "built using a prefix extractor",
+      TryReopen(&options).ToString());
+
+  // Bad attempt to re-open with different prefix extractor
+  options.prefix_extractor.reset(NewFixedPrefixTransform(6));
+  ASSERT_EQ(
+      "Invalid argument: Prefix extractor given doesn't match the one used to "
+      "build PlainTable",
+      TryReopen(&options).ToString());
+
+  // Correct prefix extractor
+  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  Reopen(&options);
+  ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, BadOptions2) {
+  Options options = CurrentOptions();
+  options.prefix_extractor.reset();
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+  // Build without a prefix extractor
+  // (apparently works even if hash_table_ratio > 0)
+  ASSERT_OK(Put("1000000000000foo", "v1"));
+  // Build without a prefix extractor, this call will fail and returns the
+  // status for this bad attempt.
+  ASSERT_NOK(dbfull()->TEST_FlushMemTable());
+
+  // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor
+  Status s = TryReopen(&options);
+  ASSERT_EQ(
+      "Not implemented: PlainTable requires a prefix extractor enable prefix "
+      "hash mode.",
+      s.ToString());
+
+  // OK to open with hash_table_ratio == 0 and no prefix extractor
+  PlainTableOptions plain_table_options;
+  plain_table_options.hash_table_ratio = 0;
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  Reopen(&options);
+  ASSERT_EQ("v1", Get("1000000000000foo"));
+
+  // OK to open newly with a prefix_extractor and hash table; builds index
+  // in memory.
+  options = CurrentOptions();
+  Reopen(&options);
+  ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, Flush) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (EncodingType encoding_type : {kPlain, kPrefix}) {
+      for (int bloom = -1; bloom <= 117; bloom += 117) {
+        const int bloom_bits = std::max(bloom, 0);
+        const bool full_scan_mode = bloom < 0;
+        for (int total_order = 0; total_order <= 1; total_order++) {
+          for (int store_index_in_file = 0; store_index_in_file <= 1;
+               ++store_index_in_file) {
+            Options options = CurrentOptions();
+            options.create_if_missing = true;
+            // Set only one bucket to force bucket conflict.
+            // Test index interval for the same prefix to be 1, 2 and 4
+            if (total_order) {
+              options.prefix_extractor.reset();
+
+              PlainTableOptions plain_table_options;
+              plain_table_options.user_key_len = 0;
+              plain_table_options.bloom_bits_per_key = bloom_bits;
+              plain_table_options.hash_table_ratio = 0;
+              plain_table_options.index_sparseness = 2;
+              plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+              plain_table_options.encoding_type = encoding_type;
+              plain_table_options.full_scan_mode = full_scan_mode;
+              plain_table_options.store_index_in_file = store_index_in_file;
+
+              options.table_factory.reset(
+                  NewPlainTableFactory(plain_table_options));
+            } else {
+              PlainTableOptions plain_table_options;
+              plain_table_options.user_key_len = 0;
+              plain_table_options.bloom_bits_per_key = bloom_bits;
+              plain_table_options.hash_table_ratio = 0.75;
+              plain_table_options.index_sparseness = 16;
+              plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+              plain_table_options.encoding_type = encoding_type;
+              plain_table_options.full_scan_mode = full_scan_mode;
+              plain_table_options.store_index_in_file = store_index_in_file;
+
+              options.table_factory.reset(
+                  NewPlainTableFactory(plain_table_options));
+            }
+            DestroyAndReopen(&options);
+            uint64_t int_num;
+            ASSERT_TRUE(dbfull()->GetIntProperty(
+                "rocksdb.estimate-table-readers-mem", &int_num));
+            ASSERT_EQ(int_num, 0U);
+
+            ASSERT_OK(Put("1000000000000foo", "v1"));
+            ASSERT_OK(Put("0000000000000bar", "v2"));
+            ASSERT_OK(Put("1000000000000foo", "v3"));
+            ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+            ASSERT_TRUE(dbfull()->GetIntProperty(
+                "rocksdb.estimate-table-readers-mem", &int_num));
+            ASSERT_GT(int_num, 0U);
+
+            TablePropertiesCollection ptc;
+            ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(
+                &ptc));
+            ASSERT_EQ(1U, ptc.size());
+            auto row = ptc.begin();
+            auto tp = row->second;
+
+            if (full_scan_mode) {
+              // Does not support Get/Seek
+              std::unique_ptr<Iterator> iter(
+                  dbfull()->NewIterator(ReadOptions()));
+              iter->SeekToFirst();
+              ASSERT_TRUE(iter->Valid());
+              ASSERT_EQ("0000000000000bar", iter->key().ToString());
+              ASSERT_EQ("v2", iter->value().ToString());
+              iter->Next();
+              ASSERT_TRUE(iter->Valid());
+              ASSERT_EQ("1000000000000foo", iter->key().ToString());
+              ASSERT_EQ("v3", iter->value().ToString());
+              iter->Next();
+              ASSERT_TRUE(!iter->Valid());
+              ASSERT_TRUE(iter->status().ok());
+            } else {
+              if (!store_index_in_file) {
+                ASSERT_EQ(total_order ? "4" : "12",
+                          (tp->user_collected_properties)
+                              .at("plain_table_hash_table_size"));
+                ASSERT_EQ("0", (tp->user_collected_properties)
+                                   .at("plain_table_sub_index_size"));
+              } else {
+                ASSERT_EQ("0", (tp->user_collected_properties)
+                                   .at("plain_table_hash_table_size"));
+                ASSERT_EQ("0", (tp->user_collected_properties)
+                                   .at("plain_table_sub_index_size"));
+              }
+              ASSERT_EQ("v3", Get("1000000000000foo"));
+              ASSERT_EQ("v2", Get("0000000000000bar"));
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(PlainTableDBTest, Flush2) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (EncodingType encoding_type : {kPlain, kPrefix}) {
+      for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+        for (int total_order = 0; total_order <= 1; total_order++) {
+          for (int store_index_in_file = 0; store_index_in_file <= 1;
+               ++store_index_in_file) {
+            if (encoding_type == kPrefix && total_order) {
+              continue;
+            }
+            if (!bloom_bits && store_index_in_file) {
+              continue;
+            }
+            if (total_order && store_index_in_file) {
+              continue;
+            }
+            bool expect_bloom_not_match = false;
+            Options options = CurrentOptions();
+            options.create_if_missing = true;
+            // Set only one bucket to force bucket conflict.
+            // Test index interval for the same prefix to be 1, 2 and 4
+            PlainTableOptions plain_table_options;
+            if (total_order) {
+              options.prefix_extractor = nullptr;
+              plain_table_options.hash_table_ratio = 0;
+              plain_table_options.index_sparseness = 2;
+            } else {
+              plain_table_options.hash_table_ratio = 0.75;
+              plain_table_options.index_sparseness = 16;
+            }
+            plain_table_options.user_key_len = kPlainTableVariableLength;
+            plain_table_options.bloom_bits_per_key = bloom_bits;
+            plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+            plain_table_options.encoding_type = encoding_type;
+            plain_table_options.store_index_in_file = store_index_in_file;
+            options.table_factory.reset(new TestPlainTableFactory(
+                &expect_bloom_not_match, plain_table_options,
+                0 /* column_family_id */, kDefaultColumnFamilyName));
+
+            DestroyAndReopen(&options);
+            ASSERT_OK(Put("0000000000000bar", "b"));
+            ASSERT_OK(Put("1000000000000foo", "v1"));
+            ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+            ASSERT_OK(Put("1000000000000foo", "v2"));
+            ASSERT_OK(dbfull()->TEST_FlushMemTable());
+            ASSERT_EQ("v2", Get("1000000000000foo"));
+
+            ASSERT_OK(Put("0000000000000eee", "v3"));
+            ASSERT_OK(dbfull()->TEST_FlushMemTable());
+            ASSERT_EQ("v3", Get("0000000000000eee"));
+
+            ASSERT_OK(Delete("0000000000000bar"));
+            ASSERT_OK(dbfull()->TEST_FlushMemTable());
+            ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+            ASSERT_OK(Put("0000000000000eee", "v5"));
+            ASSERT_OK(Put("9000000000000eee", "v5"));
+            ASSERT_OK(dbfull()->TEST_FlushMemTable());
+            ASSERT_EQ("v5", Get("0000000000000eee"));
+
+            // Test Bloom Filter
+            if (bloom_bits > 0) {
+              // Neither key nor value should exist.
+              expect_bloom_not_match = true;
+              ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
+              // Key doesn't exist any more but prefix exists.
+              if (total_order) {
+                ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
+                ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+              }
+              expect_bloom_not_match = false;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(PlainTableDBTest, Immortal) {
+  for (EncodingType encoding_type : {kPlain, kPrefix}) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.max_open_files = -1;
+    // Set only one bucket to force bucket conflict.
+    // Test index interval for the same prefix to be 1, 2 and 4
+    PlainTableOptions plain_table_options;
+    plain_table_options.hash_table_ratio = 0.75;
+    plain_table_options.index_sparseness = 16;
+    plain_table_options.user_key_len = kPlainTableVariableLength;
+    plain_table_options.bloom_bits_per_key = 10;
+    plain_table_options.encoding_type = encoding_type;
+    options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+    DestroyAndReopen(&options);
+    ASSERT_OK(Put("0000000000000bar", "b"));
+    ASSERT_OK(Put("1000000000000foo", "v1"));
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+    int copied = 0;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "GetContext::SaveValue::PinSelf", [&](void* /*arg*/) { copied++; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_EQ("b", Get("0000000000000bar"));
+    ASSERT_EQ("v1", Get("1000000000000foo"));
+    ASSERT_EQ(2, copied);
+    copied = 0;
+
+    Close();
+    ASSERT_OK(ReopenForReadOnly(&options));
+
+    ASSERT_EQ("b", Get("0000000000000bar"));
+    ASSERT_EQ("v1", Get("1000000000000foo"));
+    ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+    if (mmap_mode()) {
+      ASSERT_EQ(0, copied);
+    } else {
+      ASSERT_EQ(2, copied);
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_P(PlainTableDBTest, Iterator) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (EncodingType encoding_type : {kPlain, kPrefix}) {
+      for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+        for (int total_order = 0; total_order <= 1; total_order++) {
+          if (encoding_type == kPrefix && total_order == 1) {
+            continue;
+          }
+          bool expect_bloom_not_match = false;
+          Options options = CurrentOptions();
+          options.create_if_missing = true;
+          // Set only one bucket to force bucket conflict.
+          // Test index interval for the same prefix to be 1, 2 and 4
+          if (total_order) {
+            options.prefix_extractor = nullptr;
+
+            PlainTableOptions plain_table_options;
+            plain_table_options.user_key_len = 16;
+            plain_table_options.bloom_bits_per_key = bloom_bits;
+            plain_table_options.hash_table_ratio = 0;
+            plain_table_options.index_sparseness = 2;
+            plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+            plain_table_options.encoding_type = encoding_type;
+
+            options.table_factory.reset(new TestPlainTableFactory(
+                &expect_bloom_not_match, plain_table_options,
+                0 /* column_family_id */, kDefaultColumnFamilyName));
+          } else {
+            PlainTableOptions plain_table_options;
+            plain_table_options.user_key_len = 16;
+            plain_table_options.bloom_bits_per_key = bloom_bits;
+            plain_table_options.hash_table_ratio = 0.75;
+            plain_table_options.index_sparseness = 16;
+            plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+            plain_table_options.encoding_type = encoding_type;
+
+            options.table_factory.reset(new TestPlainTableFactory(
+                &expect_bloom_not_match, plain_table_options,
+                0 /* column_family_id */, kDefaultColumnFamilyName));
+          }
+          DestroyAndReopen(&options);
+
+          ASSERT_OK(Put("1000000000foo002", "v_2"));
+          ASSERT_OK(Put("0000000000000bar", "random"));
+          ASSERT_OK(Put("1000000000foo001", "v1"));
+          ASSERT_OK(Put("3000000000000bar", "bar_v"));
+          ASSERT_OK(Put("1000000000foo003", "v__3"));
+          ASSERT_OK(Put("1000000000foo004", "v__4"));
+          ASSERT_OK(Put("1000000000foo005", "v__5"));
+          ASSERT_OK(Put("1000000000foo007", "v__7"));
+          ASSERT_OK(Put("1000000000foo008", "v__8"));
+          ASSERT_OK(dbfull()->TEST_FlushMemTable());
+          ASSERT_EQ("v1", Get("1000000000foo001"));
+          ASSERT_EQ("v__3", Get("1000000000foo003"));
+          Iterator* iter = dbfull()->NewIterator(ReadOptions());
+          iter->Seek("1000000000foo000");
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("1000000000foo001", iter->key().ToString());
+          ASSERT_EQ("v1", iter->value().ToString());
+
+          iter->Next();
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("1000000000foo002", iter->key().ToString());
+          ASSERT_EQ("v_2", iter->value().ToString());
+
+          iter->Next();
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("1000000000foo003", iter->key().ToString());
+          ASSERT_EQ("v__3", iter->value().ToString());
+
+          iter->Next();
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("1000000000foo004", iter->key().ToString());
+          ASSERT_EQ("v__4", iter->value().ToString());
+
+          iter->Seek("3000000000000bar");
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("3000000000000bar", iter->key().ToString());
+          ASSERT_EQ("bar_v", iter->value().ToString());
+
+          iter->Seek("1000000000foo000");
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("1000000000foo001", iter->key().ToString());
+          ASSERT_EQ("v1", iter->value().ToString());
+
+          iter->Seek("1000000000foo005");
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("1000000000foo005", iter->key().ToString());
+          ASSERT_EQ("v__5", iter->value().ToString());
+
+          iter->Seek("1000000000foo006");
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("1000000000foo007", iter->key().ToString());
+          ASSERT_EQ("v__7", iter->value().ToString());
+
+          iter->Seek("1000000000foo008");
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ("1000000000foo008", iter->key().ToString());
+          ASSERT_EQ("v__8", iter->value().ToString());
+
+          if (total_order == 0) {
+            iter->Seek("1000000000foo009");
+            ASSERT_TRUE(iter->Valid());
+            ASSERT_EQ("3000000000000bar", iter->key().ToString());
+          }
+
+          // Test Bloom Filter
+          if (bloom_bits > 0) {
+            if (!total_order) {
+              // Neither key nor value should exist.
+              expect_bloom_not_match = true;
+              iter->Seek("2not000000000bar");
+              ASSERT_TRUE(!iter->Valid());
+              ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+              expect_bloom_not_match = false;
+            } else {
+              expect_bloom_not_match = true;
+              ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+              expect_bloom_not_match = false;
+            }
+          }
+          ASSERT_OK(iter->status());
+          delete iter;
+        }
+      }
+    }
+  }
+}
+
+namespace {
+std::string NthKey(size_t n, char filler) {
+  std::string rv(16, filler);
+  rv[0] = n % 10;
+  rv[1] = (n / 10) % 10;
+  rv[2] = (n / 100) % 10;
+  rv[3] = (n / 1000) % 10;
+  return rv;
+}
+}  // anonymous namespace
+
+TEST_P(PlainTableDBTest, BloomSchema) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) {
+    options.bloom_locality = bloom_locality;
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 16;
+    plain_table_options.bloom_bits_per_key = 3;  // high FP rate for test
+    plain_table_options.hash_table_ratio = 0.75;
+    plain_table_options.index_sparseness = 16;
+    plain_table_options.huge_page_tlb_size = 0;
+    plain_table_options.encoding_type = kPlain;
+
+    bool expect_bloom_not_match = false;
+    options.table_factory.reset(new TestPlainTableFactory(
+        &expect_bloom_not_match, plain_table_options, 0 /* column_family_id */,
+        kDefaultColumnFamilyName));
+    DestroyAndReopen(&options);
+
+    for (unsigned i = 0; i < 2345; ++i) {
+      ASSERT_OK(Put(NthKey(i, 'y'), "added"));
+    }
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+    ASSERT_EQ("added", Get(NthKey(42, 'y')));
+
+    for (unsigned i = 0; i < 32; ++i) {
+      // Known pattern of Bloom filter false positives can detect schema change
+      // with high probability. Known FPs stuffed into bits:
+      uint32_t pattern;
+      if (!bloom_locality) {
+        pattern = 1785868347UL;
+      } else if (CACHE_LINE_SIZE == 64U) {
+        pattern = 2421694657UL;
+      } else if (CACHE_LINE_SIZE == 128U) {
+        pattern = 788710956UL;
+      } else {
+        ASSERT_EQ(CACHE_LINE_SIZE, 256U);
+        pattern = 163905UL;
+      }
+      bool expect_fp = pattern & (1UL << i);
+      // fprintf(stderr, "expect_fp@%u: %d\n", i, (int)expect_fp);
+      expect_bloom_not_match = !expect_fp;
+      ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n')));
+    }
+  }
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+  return std::string(length, c);
+}
+}  // anonymous namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeys) {
+  Options options = CurrentOptions();
+
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 0;
+  plain_table_options.bloom_bits_per_key = 0;
+  plain_table_options.hash_table_ratio = 0;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  options.create_if_missing = true;
+  options.prefix_extractor.reset();
+  DestroyAndReopen(&options);
+
+  std::string key_list[] = {MakeLongKey(30, '0'), MakeLongKey(16, '1'),
+                            MakeLongKey(32, '2'), MakeLongKey(60, '3'),
+                            MakeLongKey(90, '4'), MakeLongKey(50, '5'),
+                            MakeLongKey(26, '6')};
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_OK(Put(key_list[i], std::to_string(i)));
+  }
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek(key_list[0]);
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(key_list[i], iter->key().ToString());
+    ASSERT_EQ(std::to_string(i), iter->value().ToString());
+    iter->Next();
+  }
+
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
+
+namespace {
+std::string MakeLongKeyWithPrefix(size_t length, char c) {
+  return "00000000" + std::string(length - 8, c);
+}
+}  // anonymous namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
+  Options options = CurrentOptions();
+
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 16;
+  plain_table_options.bloom_bits_per_key = 0;
+  plain_table_options.hash_table_ratio = 0.8;
+  plain_table_options.index_sparseness = 3;
+  plain_table_options.huge_page_tlb_size = 0;
+  plain_table_options.encoding_type = kPrefix;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  options.create_if_missing = true;
+  DestroyAndReopen(&options);
+
+  std::string key_list[] = {
+      MakeLongKeyWithPrefix(30, '0'), MakeLongKeyWithPrefix(16, '1'),
+      MakeLongKeyWithPrefix(32, '2'), MakeLongKeyWithPrefix(60, '3'),
+      MakeLongKeyWithPrefix(90, '4'), MakeLongKeyWithPrefix(50, '5'),
+      MakeLongKeyWithPrefix(26, '6')};
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_OK(Put(key_list[i], std::to_string(i)));
+  }
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek(key_list[0]);
+
+  for (size_t i = 0; i < 7; i++) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(key_list[i], iter->key().ToString());
+    ASSERT_EQ(std::to_string(i), iter->value().ToString());
+    iter->Next();
+  }
+
+  ASSERT_TRUE(!iter->Valid());
+
+  delete iter;
+}
+
+TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // Set only one bucket to force bucket conflict.
+  // Test index interval for the same prefix to be 1, 2 and 4
+  test::SimpleSuffixReverseComparator comp;
+  options.comparator = &comp;
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put("1000000000foo002", "v_2"));
+  ASSERT_OK(Put("0000000000000bar", "random"));
+  ASSERT_OK(Put("1000000000foo001", "v1"));
+  ASSERT_OK(Put("3000000000000bar", "bar_v"));
+  ASSERT_OK(Put("1000000000foo003", "v__3"));
+  ASSERT_OK(Put("1000000000foo004", "v__4"));
+  ASSERT_OK(Put("1000000000foo005", "v__5"));
+  ASSERT_OK(Put("1000000000foo007", "v__7"));
+  ASSERT_OK(Put("1000000000foo008", "v__8"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  ASSERT_EQ("v1", Get("1000000000foo001"));
+  ASSERT_EQ("v__3", Get("1000000000foo003"));
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+  iter->Seek("1000000000foo009");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo007", iter->key().ToString());
+  ASSERT_EQ("v__7", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo004", iter->key().ToString());
+  ASSERT_EQ("v__4", iter->value().ToString());
+
+  iter->Seek("3000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+  ASSERT_EQ("bar_v", iter->value().ToString());
+
+  iter->Seek("1000000000foo005");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Seek("1000000000foo006");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo005", iter->key().ToString());
+  ASSERT_EQ("v__5", iter->value().ToString());
+
+  iter->Seek("1000000000foo008");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("1000000000foo008", iter->key().ToString());
+  ASSERT_EQ("v__8", iter->value().ToString());
+
+  iter->Seek("1000000000foo000");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("3000000000000bar", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflict) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+
+      PlainTableOptions plain_table_options;
+      plain_table_options.user_key_len = 16;
+      plain_table_options.bloom_bits_per_key = 0;
+      plain_table_options.hash_table_ratio = 0;
+      plain_table_options.index_sparseness = 2 ^ i;
+      plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+      options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
+
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
+
+      iter->Seek("5000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo0");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000bar");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+      iter->Seek("2000000000000fo8");
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.comparator->Compare(iter->key(), "20000001") > 0);
+
+      iter->Seek("5000000000000fo8");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      ASSERT_OK(iter->status());
+      delete iter;
+    }
+  }
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+  for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+       huge_page_tlb_size += 2 * 1024 * 1024) {
+    for (unsigned char i = 1; i <= 3; i++) {
+      Options options = CurrentOptions();
+      options.create_if_missing = true;
+      test::SimpleSuffixReverseComparator comp;
+      options.comparator = &comp;
+      // Set only one bucket to force bucket conflict.
+      // Test index interval for the same prefix to be 1, 2 and 4
+
+      PlainTableOptions plain_table_options;
+      plain_table_options.user_key_len = 16;
+      plain_table_options.bloom_bits_per_key = 0;
+      plain_table_options.hash_table_ratio = 0;
+      plain_table_options.index_sparseness = 2 ^ i;
+      plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+      options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+      DestroyAndReopen(&options);
+      ASSERT_OK(Put("5000000000000fo0", "v1"));
+      ASSERT_OK(Put("5000000000000fo1", "v2"));
+      ASSERT_OK(Put("5000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo0", "v3"));
+      ASSERT_OK(Put("2000000000000fo1", "v4"));
+      ASSERT_OK(Put("2000000000000fo2", "v"));
+      ASSERT_OK(Put("2000000000000fo3", "v"));
+
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+      ASSERT_EQ("v1", Get("5000000000000fo0"));
+      ASSERT_EQ("v2", Get("5000000000000fo1"));
+      ASSERT_EQ("v3", Get("2000000000000fo0"));
+      ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+      ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+      ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+      ReadOptions ro;
+      Iterator* iter = dbfull()->NewIterator(ro);
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+      iter->Seek("5000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+      iter->Seek("2000000000000fo1");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+      iter->Seek("2000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+      iter->Seek("5000000000000var");
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+
+      std::string seek_key = "2000000000000bar";
+      iter->Seek(seek_key);
+      ASSERT_TRUE(!iter->Valid() ||
+                  options.prefix_extractor->Transform(iter->key()) !=
+                      options.prefix_extractor->Transform(seek_key));
+
+      iter->Seek("1000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("3000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      iter->Seek("8000000000000fo2");
+      ASSERT_TRUE(!iter->Valid());
+
+      ASSERT_OK(iter->status());
+      delete iter;
+    }
+  }
+}
+
+TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  // Set only one bucket to force bucket conflict.
+  // Test index interval for the same prefix to be 1, 2 and 4
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 16;
+  plain_table_options.bloom_bits_per_key = 0;
+  plain_table_options.hash_table_ratio = 0;
+  plain_table_options.index_sparseness = 5;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  DestroyAndReopen(&options);
+  ASSERT_OK(Put("5000000000000fo0", "v1"));
+  ASSERT_OK(Put("5000000000000fo1", "v2"));
+  ASSERT_OK(Put("5000000000000fo2", "v3"));
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  ASSERT_EQ("v1", Get("5000000000000fo0"));
+  ASSERT_EQ("v2", Get("5000000000000fo1"));
+  ASSERT_EQ("v3", Get("5000000000000fo2"));
+
+  ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
+  ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+
+  Iterator* iter = dbfull()->NewIterator(ReadOptions());
+
+  iter->Seek("5000000000000bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+  iter->Seek("5000000000000fo8");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("1000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("8000000000000fo2");
+  ASSERT_TRUE(!iter->Valid());
+
+  ASSERT_OK(iter->status());
+  delete iter;
+}
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key_______%06d", i);
+  return std::string(buf);
+}
+
+TEST_P(PlainTableDBTest, CompactionTrigger) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 120 << 10;  // 120KB
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 3;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 120KB (10 values, each 12K)
+    for (int i = 0; i < 10; i++) {
+      values.push_back(rnd.RandomString(12 << 10));
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+    ASSERT_OK(Put(Key(999), ""));
+    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  // generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(rnd.RandomString(10000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Put(Key(999), ""));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+TEST_P(PlainTableDBTest, AdaptiveTable) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+
+  options.table_factory.reset(NewPlainTableFactory());
+  DestroyAndReopen(&options);
+
+  ASSERT_OK(Put("1000000000000foo", "v1"));
+  ASSERT_OK(Put("0000000000000bar", "v2"));
+  ASSERT_OK(Put("1000000000000foo", "v3"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+  options.create_if_missing = false;
+  std::shared_ptr<TableFactory> block_based_factory(
+      NewBlockBasedTableFactory());
+  std::shared_ptr<TableFactory> plain_table_factory(NewPlainTableFactory());
+  std::shared_ptr<TableFactory> dummy_factory;
+  options.table_factory.reset(NewAdaptiveTableFactory(
+      block_based_factory, block_based_factory, plain_table_factory));
+  Reopen(&options);
+  ASSERT_EQ("v3", Get("1000000000000foo"));
+  ASSERT_EQ("v2", Get("0000000000000bar"));
+
+  ASSERT_OK(Put("2000000000000foo", "v4"));
+  ASSERT_OK(Put("3000000000000bar", "v5"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
+  ASSERT_EQ("v4", Get("2000000000000foo"));
+  ASSERT_EQ("v5", Get("3000000000000bar"));
+
+  Reopen(&options);
+  ASSERT_EQ("v3", Get("1000000000000foo"));
+  ASSERT_EQ("v2", Get("0000000000000bar"));
+  ASSERT_EQ("v4", Get("2000000000000foo"));
+  ASSERT_EQ("v5", Get("3000000000000bar"));
+
+  options.paranoid_checks = false;
+  options.table_factory.reset(NewBlockBasedTableFactory());
+  Reopen(&options);
+  ASSERT_NE("v3", Get("1000000000000foo"));
+
+  options.paranoid_checks = false;
+  options.table_factory.reset(NewPlainTableFactory());
+  Reopen(&options);
+  ASSERT_NE("v5", Get("3000000000000bar"));
+}
+
+INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/post_memtable_callback.h b/src/rocksdb/db/post_memtable_callback.h
new file mode 100644
index 000000000..fbf2fbe86
--- /dev/null
+++ b/src/rocksdb/db/post_memtable_callback.h
@@ -0,0 +1,25 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Callback invoked after finishing writing to the memtable but before
+// publishing the sequence number to readers.
+// Note that with write-prepared/write-unprepared transactions with
+// two-write-queues, PreReleaseCallback is called before publishing the
+// sequence numbers to readers.
+class PostMemTableCallback {
+ public:
+  virtual ~PostMemTableCallback() {}
+
+  virtual Status operator()(SequenceNumber seq, bool disable_memtable) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/pre_release_callback.h b/src/rocksdb/db/pre_release_callback.h
new file mode 100644
index 000000000..6b9039487
--- /dev/null
+++ b/src/rocksdb/db/pre_release_callback.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PreReleaseCallback {
+ public:
+  virtual ~PreReleaseCallback() {}
+
+  // Will be called while on the write thread after the write to the WAL and
+  // before the write to memtable. This is useful if any operation needs to be
+  // done before the write gets visible to the readers, or if we want to reduce
+  // the overhead of locking by updating something sequentially while we are on
+  // the write thread. If the callback fails, this function returns a non-OK
+  // status, the sequence number will not be released, and same status will be
+  // propagated to all the writers in the write group.
+  // seq is the sequence number that is used for this write and will be
+  // released.
+  // is_mem_disabled is currently used for debugging purposes to assert that
+  // the callback is done from the right write queue.
+  // If non-zero, log_number indicates the WAL log to which we wrote.
+  // index >= 0 specifies the order of callback in the same write thread.
+  // total > index specifies the total number of callbacks in the same write
+  // thread. Together with index, could be used to reduce the redundant
+  // operations among the callbacks.
+  virtual Status Callback(SequenceNumber seq, bool is_mem_disabled,
+                          uint64_t log_number, size_t index, size_t total) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/prefix_test.cc b/src/rocksdb/db/prefix_test.cc
new file mode 100644
index 000000000..8592b8f31
--- /dev/null
+++ b/src/rocksdb/db/prefix_test.cc
@@ -0,0 +1,906 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+  return 0;
+}
+#else
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(trigger_deadlock, false,
+            "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_int32(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 33554432, "");
+DEFINE_int32(max_write_buffer_number, 2, "");
+DEFINE_int32(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(skiplist_height, 4, "");
+DEFINE_double(memtable_prefix_bloom_size_ratio, 0.1, "");
+DEFINE_int32(memtable_huge_page_size, 2 * 1024 * 1024, "");
+DEFINE_int32(value_size, 40, "");
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+
+// Path to the database on file system
+const std::string kDbName =
+    ROCKSDB_NAMESPACE::test::PerThreadDBPath("prefix_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TestKey {
+  uint64_t prefix;
+  uint64_t sorted;
+
+  TestKey(uint64_t _prefix, uint64_t _sorted)
+      : prefix(_prefix), sorted(_sorted) {}
+};
+
+// return a slice backed by test_key
+inline Slice TestKeyToSlice(std::string& s, const TestKey& test_key) {
+  s.clear();
+  PutFixed64(&s, test_key.prefix);
+  PutFixed64(&s, test_key.sorted);
+  return Slice(s.c_str(), s.size());
+}
+
+inline const TestKey SliceToTestKey(const Slice& slice) {
+  return TestKey(DecodeFixed64(slice.data()), DecodeFixed64(slice.data() + 8));
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+  // Compare needs to be aware of the possibility of a and/or b is
+  // prefix only
+  int Compare(const Slice& a, const Slice& b) const override {
+    const TestKey kkey_a = SliceToTestKey(a);
+    const TestKey kkey_b = SliceToTestKey(b);
+    const TestKey* key_a = &kkey_a;
+    const TestKey* key_b = &kkey_b;
+    if (key_a->prefix != key_b->prefix) {
+      if (key_a->prefix < key_b->prefix) return -1;
+      if (key_a->prefix > key_b->prefix) return 1;
+    } else {
+      EXPECT_TRUE(key_a->prefix == key_b->prefix);
+      // note, both a and b could be prefix only
+      if (a.size() != b.size()) {
+        // one of them is prefix
+        EXPECT_TRUE(
+            (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+            (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+        if (a.size() < b.size()) return -1;
+        if (a.size() > b.size()) return 1;
+      } else {
+        // both a and b are prefix
+        if (a.size() == sizeof(uint64_t)) {
+          return 0;
+        }
+
+        // both a and b are whole key
+        EXPECT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+        if (key_a->sorted < key_b->sorted) return -1;
+        if (key_a->sorted > key_b->sorted) return 1;
+        if (key_a->sorted == key_b->sorted) return 0;
+      }
+    }
+    return 0;
+  }
+
+  bool operator()(const TestKey& a, const TestKey& b) const {
+    std::string sa, sb;
+    return Compare(TestKeyToSlice(sa, a), TestKeyToSlice(sb, b)) < 0;
+  }
+
+  const char* Name() const override { return "TestKeyComparator"; }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {}
+
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+namespace {
+void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
+            uint64_t suffix, const Slice& value) {
+  TestKey test_key(prefix, suffix);
+  std::string s;
+  Slice key = TestKeyToSlice(s, test_key);
+  ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void PutKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+            const Slice& value) {
+  std::string s;
+  Slice key = TestKeyToSlice(s, test_key);
+  ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void MergeKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+              const Slice& value) {
+  std::string s;
+  Slice key = TestKeyToSlice(s, test_key);
+  ASSERT_OK(db->Merge(write_options, key, value));
+}
+
+void DeleteKey(DB* db, WriteOptions write_options, const TestKey& test_key) {
+  std::string s;
+  Slice key = TestKeyToSlice(s, test_key);
+  ASSERT_OK(db->Delete(write_options, key));
+}
+
+void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) {
+  TestKey test_key(prefix, suffix);
+  std::string s;
+  Slice key = TestKeyToSlice(s, test_key);
+  iter->Seek(key);
+}
+
+const std::string kNotFoundResult = "NOT_FOUND";
+
+std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
+                uint64_t suffix) {
+  TestKey test_key(prefix, suffix);
+  std::string s2;
+  Slice key = TestKeyToSlice(s2, test_key);
+
+  std::string result;
+  Status s = db->Get(read_options, key, &result);
+  if (s.IsNotFound()) {
+    result = kNotFoundResult;
+  } else if (!s.ok()) {
+    result = s.ToString();
+  }
+  return result;
+}
+
+class SamePrefixTransform : public SliceTransform {
+ private:
+  const Slice prefix_;
+  std::string name_;
+
+ public:
+  explicit SamePrefixTransform(const Slice& prefix)
+      : prefix_(prefix), name_("rocksdb.SamePrefix." + prefix.ToString()) {}
+
+  const char* Name() const override { return name_.c_str(); }
+
+  Slice Transform(const Slice& src) const override {
+    assert(InDomain(src));
+    return prefix_;
+  }
+
+  bool InDomain(const Slice& src) const override {
+    if (src.size() >= prefix_.size()) {
+      return Slice(src.data(), prefix_.size()) == prefix_;
+    }
+    return false;
+  }
+
+  bool InRange(const Slice& dst) const override { return dst == prefix_; }
+
+  bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
+};
+
+}  // anonymous namespace
+
+class PrefixTest : public testing::Test {
+ public:
+  std::shared_ptr<DB> OpenDb() {
+    DB* db;
+
+    options.create_if_missing = true;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+        FLAGS_min_write_buffer_number_to_merge;
+
+    options.memtable_prefix_bloom_size_ratio =
+        FLAGS_memtable_prefix_bloom_size_ratio;
+    options.memtable_huge_page_size = FLAGS_memtable_huge_page_size;
+
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    bbto.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    options.allow_concurrent_memtable_write = false;
+
+    Status s = DB::Open(options, kDbName, &db);
+    EXPECT_OK(s);
+    return std::shared_ptr<DB>(db);
+  }
+
+  void FirstOption() { option_config_ = kBegin; }
+
+  bool NextOptions(int bucket_count) {
+    // skip some options
+    option_config_++;
+    if (option_config_ < kEnd) {
+      options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+      switch (option_config_) {
+        case kHashSkipList:
+          options.memtable_factory.reset(
+              NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height));
+          return true;
+        case kHashLinkList:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count));
+          return true;
+        case kHashLinkListHugePageTlb:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
+          return true;
+        case kHashLinkListTriggerSkipList:
+          options.memtable_factory.reset(
+              NewHashLinkListRepFactory(bucket_count, 0, 3));
+          return true;
+        default:
+          return false;
+      }
+    }
+    return false;
+  }
+
+  PrefixTest() : option_config_(kBegin) {
+    options.comparator = new TestKeyComparator();
+  }
+  ~PrefixTest() override { delete options.comparator; }
+
+ protected:
+  enum OptionConfig {
+    kBegin,
+    kHashSkipList,
+    kHashLinkList,
+    kHashLinkListHugePageTlb,
+    kHashLinkListTriggerSkipList,
+    kEnd
+  };
+  int option_config_;
+  Options options;
+};
+
+TEST(SamePrefixTest, InDomainTest) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(new SamePrefixTransform("HHKB"));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  WriteOptions write_options;
+  ReadOptions read_options;
+  {
+    ASSERT_OK(DestroyDB(kDbName, Options()));
+    ASSERT_OK(DB::Open(options, kDbName, &db));
+    ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006"));
+    ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011"));
+    ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk"));
+    ASSERT_OK(db->Flush(FlushOptions()));
+    std::string result;
+    auto db_iter = db->NewIterator(ReadOptions());
+
+    db_iter->Seek("Realforce 87u");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+    ASSERT_EQ(db_iter->key(), "Realforce 87u");
+    ASSERT_EQ(db_iter->value(), "idk");
+
+    delete db_iter;
+    delete db;
+    ASSERT_OK(DestroyDB(kDbName, Options()));
+  }
+
+  {
+    ASSERT_OK(DB::Open(options, kDbName, &db));
+    ASSERT_OK(db->Put(write_options, "pikachu", "1"));
+    ASSERT_OK(db->Put(write_options, "Meowth", "1"));
+    ASSERT_OK(db->Put(write_options, "Mewtwo", "idk"));
+    ASSERT_OK(db->Flush(FlushOptions()));
+    std::string result;
+    auto db_iter = db->NewIterator(ReadOptions());
+
+    db_iter->Seek("Mewtwo");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+    delete db_iter;
+    delete db;
+    ASSERT_OK(DestroyDB(kDbName, Options()));
+  }
+}
+
+TEST_F(PrefixTest, TestResult) {
+  for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+    FirstOption();
+    while (NextOptions(num_buckets)) {
+      std::cout << "*** Mem table: " << options.memtable_factory->Name()
+                << " number of buckets: " << num_buckets << std::endl;
+      ASSERT_OK(DestroyDB(kDbName, Options()));
+      auto db = OpenDb();
+      WriteOptions write_options;
+      ReadOptions read_options;
+
+      // 1. Insert one row.
+      Slice v16("v16");
+      PutKey(db.get(), write_options, 1, 6, v16);
+      std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 1, 6);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
+      ASSERT_OK(iter->status());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(!iter->Valid());
+      ASSERT_OK(iter->status());
+
+      ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6));
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6));
+
+      // 2. Insert an entry for the same prefix as the last entry in the bucket.
+      Slice v17("v17");
+      PutKey(db.get(), write_options, 1, 7, v17);
+      iter.reset(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 6);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
+      ASSERT_OK(iter->status());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(!iter->Valid());
+      ASSERT_OK(iter->status());
+
+      // 3. Insert an entry for the same prefix as the head of the bucket.
+      Slice v15("v15");
+      PutKey(db.get(), write_options, 1, 5, v15);
+      iter.reset(db->NewIterator(read_options));
+
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+
+      ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+      ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+      ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+
+      // 4. Insert an entry with a larger prefix
+      Slice v22("v22");
+      PutKey(db.get(), write_options, 2, 2, v22);
+      iter.reset(db->NewIterator(read_options));
+
+      SeekIterator(iter.get(), 2, 2);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      // 5. Insert an entry with a smaller prefix
+      Slice v02("v02");
+      PutKey(db.get(), write_options, 0, 2, v02);
+      iter.reset(db->NewIterator(read_options));
+
+      SeekIterator(iter.get(), 0, 2);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v02 == iter->value());
+      SeekIterator(iter.get(), 0, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v02 == iter->value());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+
+      SeekIterator(iter.get(), 1, 5);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      // 6. Insert to the beginning and the end of the first prefix
+      Slice v13("v13");
+      Slice v18("v18");
+      PutKey(db.get(), write_options, 1, 3, v13);
+      PutKey(db.get(), write_options, 1, 8, v18);
+      iter.reset(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 1, 7);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      SeekIterator(iter.get(), 1, 3);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v13 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v15 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v18 == iter->value());
+
+      SeekIterator(iter.get(), 0, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v02 == iter->value());
+
+      SeekIterator(iter.get(), 2, 0);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v22 == iter->value());
+
+      ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2));
+      ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2));
+      ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3));
+      ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+      ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+      ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+      ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8));
+    }
+  }
+}
+
+// Show results in prefix
+TEST_F(PrefixTest, PrefixValid) {
+  for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+    FirstOption();
+    while (NextOptions(num_buckets)) {
+      std::cout << "*** Mem table: " << options.memtable_factory->Name()
+                << " number of buckets: " << num_buckets << std::endl;
+      ASSERT_OK(DestroyDB(kDbName, Options()));
+      auto db = OpenDb();
+      WriteOptions write_options;
+      ReadOptions read_options;
+
+      // Insert keys with common prefix and one key with different
+      Slice v16("v16");
+      Slice v17("v17");
+      Slice v18("v18");
+      Slice v19("v19");
+      PutKey(db.get(), write_options, 12345, 6, v16);
+      PutKey(db.get(), write_options, 12345, 7, v17);
+      PutKey(db.get(), write_options, 12345, 8, v18);
+      PutKey(db.get(), write_options, 12345, 9, v19);
+      PutKey(db.get(), write_options, 12346, 8, v16);
+      ASSERT_OK(db->Flush(FlushOptions()));
+      TestKey test_key(12346, 8);
+      std::string s;
+      ASSERT_OK(db->Delete(write_options, TestKeyToSlice(s, test_key)));
+      ASSERT_OK(db->Flush(FlushOptions()));
+      read_options.prefix_same_as_start = true;
+      std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+      SeekIterator(iter.get(), 12345, 6);
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v16 == iter->value());
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v17 == iter->value());
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v18 == iter->value());
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_TRUE(v19 == iter->value());
+      iter->Next();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 12346, 8));
+
+      // Verify seeking past the prefix won't return a result.
+      SeekIterator(iter.get(), 12345, 10);
+      ASSERT_TRUE(!iter->Valid());
+      ASSERT_OK(iter->status());
+    }
+  }
+}
+
+TEST_F(PrefixTest, DynamicPrefixIterator) {
+  while (NextOptions(FLAGS_bucket_count)) {
+    std::cout << "*** Mem table: " << options.memtable_factory->Name()
+              << std::endl;
+    ASSERT_OK(DestroyDB(kDbName, Options()));
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+
+    std::vector<uint64_t> prefixes;
+    for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+      prefixes.push_back(i);
+    }
+
+    if (FLAGS_random_prefix) {
+      RandomShuffle(prefixes.begin(), prefixes.end());
+    }
+
+    HistogramImpl hist_put_time;
+    HistogramImpl hist_put_comparison;
+    // insert x random prefix, each with y continuous element.
+    for (auto prefix : prefixes) {
+      for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+        TestKey test_key(prefix, sorted);
+
+        std::string s;
+        Slice key = TestKeyToSlice(s, test_key);
+        std::string value(FLAGS_value_size, 0);
+
+        get_perf_context()->Reset();
+        StopWatchNano timer(SystemClock::Default().get(), true);
+        ASSERT_OK(db->Put(write_options, key, value));
+        hist_put_time.Add(timer.ElapsedNanos());
+        hist_put_comparison.Add(get_perf_context()->user_key_comparison_count);
+      }
+    }
+
+    std::cout << "Put key comparison: \n"
+              << hist_put_comparison.ToString() << "Put time: \n"
+              << hist_put_time.ToString();
+
+    // test seek existing keys
+    HistogramImpl hist_seek_time;
+    HistogramImpl hist_seek_comparison;
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    for (auto prefix : prefixes) {
+      TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+      std::string s;
+      Slice key = TestKeyToSlice(s, test_key);
+      std::string value = "v" + std::to_string(0);
+
+      get_perf_context()->Reset();
+      StopWatchNano timer(SystemClock::Default().get(), true);
+      auto key_prefix = options.prefix_extractor->Transform(key);
+      uint64_t total_keys = 0;
+      for (iter->Seek(key);
+           iter->Valid() && iter->key().starts_with(key_prefix); iter->Next()) {
+        if (FLAGS_trigger_deadlock) {
+          std::cout << "Behold the deadlock!\n";
+          db->Delete(write_options, iter->key());
+        }
+        total_keys++;
+      }
+      hist_seek_time.Add(timer.ElapsedNanos());
+      hist_seek_comparison.Add(get_perf_context()->user_key_comparison_count);
+      ASSERT_EQ(total_keys,
+                FLAGS_items_per_prefix - FLAGS_items_per_prefix / 2);
+    }
+
+    std::cout << "Seek key comparison: \n"
+              << hist_seek_comparison.ToString() << "Seek time: \n"
+              << hist_seek_time.ToString();
+
+    // test non-existing keys
+    HistogramImpl hist_no_seek_time;
+    HistogramImpl hist_no_seek_comparison;
+
+    for (auto prefix = FLAGS_total_prefixes;
+         prefix < FLAGS_total_prefixes + 10000; prefix++) {
+      TestKey test_key(prefix, 0);
+      std::string s;
+      Slice key = TestKeyToSlice(s, test_key);
+
+      get_perf_context()->Reset();
+      StopWatchNano timer(SystemClock::Default().get(), true);
+      iter->Seek(key);
+      hist_no_seek_time.Add(timer.ElapsedNanos());
+      hist_no_seek_comparison.Add(
+          get_perf_context()->user_key_comparison_count);
+      ASSERT_TRUE(!iter->Valid());
+      ASSERT_OK(iter->status());
+    }
+
+    std::cout << "non-existing Seek key comparison: \n"
+              << hist_no_seek_comparison.ToString()
+              << "non-existing Seek time: \n"
+              << hist_no_seek_time.ToString();
+  }
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev) {
+  // Only for SkipListFactory
+  options.memtable_factory.reset(new SkipListFactory);
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.write_buffer_size = 1024 * 1024;
+  Random rnd(1);
+  for (size_t m = 1; m < 100; m++) {
+    std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: "
+              << options.memtable_factory->Name() << std::endl;
+    ASSERT_OK(DestroyDB(kDbName, Options()));
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+    std::map<TestKey, std::string, TestKeyComparator> entry_maps[3], whole_map;
+    for (uint64_t i = 0; i < 10; i++) {
+      int div = i % 3 + 1;
+      for (uint64_t j = 0; j < 10; j++) {
+        whole_map[TestKey(i, j)] = entry_maps[rnd.Uniform(div)][TestKey(i, j)] =
+            'v' + std::to_string(i) + std::to_string(j);
+      }
+    }
+
+    std::map<TestKey, std::string, TestKeyComparator> type_map;
+    for (size_t i = 0; i < 3; i++) {
+      for (auto& kv : entry_maps[i]) {
+        if (rnd.OneIn(3)) {
+          PutKey(db.get(), write_options, kv.first, kv.second);
+          type_map[kv.first] = "value";
+        } else {
+          MergeKey(db.get(), write_options, kv.first, kv.second);
+          type_map[kv.first] = "merge";
+        }
+      }
+      if (i < 2) {
+        ASSERT_OK(db->Flush(FlushOptions()));
+      }
+    }
+
+    for (size_t i = 0; i < 2; i++) {
+      for (auto& kv : entry_maps[i]) {
+        if (rnd.OneIn(10)) {
+          whole_map.erase(kv.first);
+          DeleteKey(db.get(), write_options, kv.first);
+          entry_maps[2][kv.first] = "delete";
+        }
+      }
+    }
+
+    if (FLAGS_enable_print) {
+      for (size_t i = 0; i < 3; i++) {
+        for (auto& kv : entry_maps[i]) {
+          std::cout << "[" << i << "]" << kv.first.prefix << kv.first.sorted
+                    << " " << kv.second + " " + type_map[kv.first] << std::endl;
+        }
+      }
+    }
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    for (uint64_t prefix = 0; prefix < 10; prefix++) {
+      uint64_t start_suffix = rnd.Uniform(9);
+      SeekIterator(iter.get(), prefix, start_suffix);
+      auto it = whole_map.find(TestKey(prefix, start_suffix));
+      if (it == whole_map.end()) {
+        continue;
+      }
+      ASSERT_NE(it, whole_map.end());
+      ASSERT_TRUE(iter->Valid());
+      if (FLAGS_enable_print) {
+        std::cout << "round " << prefix
+                  << " iter: " << SliceToTestKey(iter->key()).prefix
+                  << SliceToTestKey(iter->key()).sorted
+                  << " | map: " << it->first.prefix << it->first.sorted << " | "
+                  << iter->value().ToString() << " " << it->second << std::endl;
+      }
+      ASSERT_EQ(iter->value(), it->second);
+      uint64_t stored_prefix = prefix;
+      for (size_t k = 0; k < 9; k++) {
+        if (rnd.OneIn(2) || it == whole_map.begin()) {
+          iter->Next();
+          ++it;
+          if (FLAGS_enable_print) {
+            std::cout << "Next >> ";
+          }
+        } else {
+          iter->Prev();
+          it--;
+          if (FLAGS_enable_print) {
+            std::cout << "Prev >> ";
+          }
+        }
+        if (!iter->Valid() ||
+            SliceToTestKey(iter->key()).prefix != stored_prefix) {
+          break;
+        }
+        ASSERT_OK(iter->status());
+        stored_prefix = SliceToTestKey(iter->key()).prefix;
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_NE(it, whole_map.end());
+        ASSERT_EQ(iter->value(), it->second);
+        if (FLAGS_enable_print) {
+          std::cout << "iter: " << SliceToTestKey(iter->key()).prefix
+                    << SliceToTestKey(iter->key()).sorted
+                    << " | map: " << it->first.prefix << it->first.sorted
+                    << " | " << iter->value().ToString() << " " << it->second
+                    << std::endl;
+        }
+      }
+    }
+  }
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev2) {
+  // Only for SkipListFactory
+  // test the case
+  //        iter1                iter2
+  // | prefix | suffix |  | prefix | suffix |
+  // |   1    |   1    |  |   1    |   2    |
+  // |   1    |   3    |  |   1    |   4    |
+  // |   2    |   1    |  |   3    |   3    |
+  // |   2    |   2    |  |   3    |   4    |
+  // after seek(15), iter1 will be at 21 and iter2 will be 33.
+  // Then if call Prev() in prefix mode where SeekForPrev(21) gets called,
+  // iter2 should turn to invalid state because of bloom filter.
+  options.memtable_factory.reset(new SkipListFactory);
+  options.write_buffer_size = 1024 * 1024;
+  std::string v13("v13");
+  ASSERT_OK(DestroyDB(kDbName, Options()));
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+  PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+  PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+  PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+  PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+  ASSERT_OK(db->Flush(FlushOptions()));
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+  PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+  PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+  PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+  PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+  ASSERT_OK(db->Flush(FlushOptions()));
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  SeekIterator(iter.get(), 1, 5);
+  iter->Prev();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->value(), v13);
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev3) {
+  // Only for SkipListFactory
+  // test SeekToLast() with iterate_upper_bound_ in prefix_seek_mode
+  options.memtable_factory.reset(new SkipListFactory);
+  options.write_buffer_size = 1024 * 1024;
+  std::string v14("v14");
+  TestKey upper_bound_key = TestKey(1, 5);
+  std::string s;
+  Slice upper_bound = TestKeyToSlice(s, upper_bound_key);
+
+  {
+    ASSERT_OK(DestroyDB(kDbName, Options()));
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+    read_options.iterate_upper_bound = &upper_bound;
+    PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+    PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+    ASSERT_OK(db->Flush(FlushOptions()));
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+    PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+    PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+    PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+    PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+    ASSERT_OK(db->Flush(FlushOptions()));
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    iter->SeekToLast();
+    ASSERT_EQ(iter->value(), v14);
+  }
+  {
+    ASSERT_OK(DestroyDB(kDbName, Options()));
+    auto db = OpenDb();
+    WriteOptions write_options;
+    ReadOptions read_options;
+    read_options.iterate_upper_bound = &upper_bound;
+    PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+    PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+    PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+    PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+    ASSERT_OK(db->Flush(FlushOptions()));
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+    PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+    PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+    ASSERT_OK(db->Flush(FlushOptions()));
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    iter->SeekToLast();
+    ASSERT_EQ(iter->value(), v14);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+  return RUN_ALL_TESTS();
+}
+
+#endif  // GFLAGS
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as HashSkipList and HashLinkList are not supported in "
+          "ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/range_del_aggregator.cc b/src/rocksdb/db/range_del_aggregator.cc
new file mode 100644
index 000000000..c03efa11f
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.cc
@@ -0,0 +1,524 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TruncatedRangeDelIterator::TruncatedRangeDelIterator(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+    const InternalKeyComparator* icmp, const InternalKey* smallest,
+    const InternalKey* largest)
+    : iter_(std::move(iter)),
+      icmp_(icmp),
+      smallest_ikey_(smallest),
+      largest_ikey_(largest) {
+  if (smallest != nullptr) {
+    pinned_bounds_.emplace_back();
+    auto& parsed_smallest = pinned_bounds_.back();
+    Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
+                                         false /* log_err_key */);  // TODO
+    pik_status.PermitUncheckedError();
+    assert(pik_status.ok());
+    smallest_ = &parsed_smallest;
+  }
+  if (largest != nullptr) {
+    pinned_bounds_.emplace_back();
+    auto& parsed_largest = pinned_bounds_.back();
+
+    Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest,
+                                         false /* log_err_key */);  // TODO
+    pik_status.PermitUncheckedError();
+    assert(pik_status.ok());
+
+    if (parsed_largest.type == kTypeRangeDeletion &&
+        parsed_largest.sequence == kMaxSequenceNumber) {
+      // The file boundary has been artificially extended by a range tombstone.
+      // We do not need to adjust largest to properly truncate range
+      // tombstones that extend past the boundary.
+    } else if (parsed_largest.sequence == 0) {
+      // The largest key in the sstable has a sequence number of 0. Since we
+      // guarantee that no internal keys with the same user key and sequence
+      // number can exist in a DB, we know that the largest key in this sstable
+      // cannot exist as the smallest key in the next sstable. This further
+      // implies that no range tombstone in this sstable covers largest;
+      // otherwise, the file boundary would have been artificially extended.
+      //
+      // Therefore, we will never truncate a range tombstone at largest, so we
+      // can leave it unchanged.
+    } else {
+      // The same user key may straddle two sstable boundaries. To ensure that
+      // the truncated end key can cover the largest key in this sstable, reduce
+      // its sequence number by 1.
+      parsed_largest.sequence -= 1;
+      // This line is not needed for correctness, but it ensures that the
+      // truncated end key is not covering keys from the next SST file.
+      parsed_largest.type = kValueTypeForSeek;
+    }
+    largest_ = &parsed_largest;
+  }
+}
+
+bool TruncatedRangeDelIterator::Valid() const {
+  assert(iter_ != nullptr);
+  return iter_->Valid() &&
+         (smallest_ == nullptr ||
+          icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
+         (largest_ == nullptr ||
+          icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
+}
+
+// NOTE: target is a user key, with timestamp if enabled.
+void TruncatedRangeDelIterator::Seek(const Slice& target) {
+  if (largest_ != nullptr &&
+      icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
+                                                  kTypeRangeDeletion)) <= 0) {
+    iter_->Invalidate();
+    return;
+  }
+  if (smallest_ != nullptr &&
+      icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) {
+    iter_->Seek(smallest_->user_key);
+    return;
+  }
+  iter_->Seek(target);
+}
+
+// NOTE: target is a user key, with timestamp if enabled.
+void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
+  if (smallest_ != nullptr &&
+      icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
+                     *smallest_) < 0) {
+    iter_->Invalidate();
+    return;
+  }
+  if (largest_ != nullptr &&
+      icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) {
+    iter_->SeekForPrev(largest_->user_key);
+    return;
+  }
+  iter_->SeekForPrev(target);
+}
+
+void TruncatedRangeDelIterator::SeekToFirst() {
+  if (smallest_ != nullptr) {
+    iter_->Seek(smallest_->user_key);
+    return;
+  }
+  iter_->SeekToTopFirst();
+}
+
+void TruncatedRangeDelIterator::SeekToLast() {
+  if (largest_ != nullptr) {
+    iter_->SeekForPrev(largest_->user_key);
+    return;
+  }
+  iter_->SeekToTopLast();
+}
+
+std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+TruncatedRangeDelIterator::SplitBySnapshot(
+    const std::vector<SequenceNumber>& snapshots) {
+  using FragmentedIterPair =
+      std::pair<const SequenceNumber,
+                std::unique_ptr<FragmentedRangeTombstoneIterator>>;
+
+  auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots);
+  std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+      split_truncated_iters;
+  std::for_each(
+      split_untruncated_iters.begin(), split_untruncated_iters.end(),
+      [&](FragmentedIterPair& iter_pair) {
+        auto truncated_iter = std::make_unique<TruncatedRangeDelIterator>(
+            std::move(iter_pair.second), icmp_, smallest_ikey_, largest_ikey_);
+        split_truncated_iters.emplace(iter_pair.first,
+                                      std::move(truncated_iter));
+      });
+  return split_truncated_iters;
+}
+
+ForwardRangeDelIterator::ForwardRangeDelIterator(
+    const InternalKeyComparator* icmp)
+    : icmp_(icmp),
+      unused_idx_(0),
+      active_seqnums_(SeqMaxComparator()),
+      active_iters_(EndKeyMinComparator(icmp)),
+      inactive_iters_(StartKeyMinComparator(icmp)) {}
+
+bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+  // Move active iterators that end before parsed.
+  while (!active_iters_.empty() &&
+         icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
+    TruncatedRangeDelIterator* iter = PopActiveIter();
+    do {
+      iter->Next();
+    } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  // Move inactive iterators that start before parsed.
+  while (!inactive_iters_.empty() &&
+         icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
+    TruncatedRangeDelIterator* iter = PopInactiveIter();
+    while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
+      iter->Next();
+    }
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  return active_seqnums_.empty()
+             ? false
+             : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ForwardRangeDelIterator::Invalidate() {
+  unused_idx_ = 0;
+  active_iters_.clear();
+  active_seqnums_.clear();
+  inactive_iters_.clear();
+}
+
+ReverseRangeDelIterator::ReverseRangeDelIterator(
+    const InternalKeyComparator* icmp)
+    : icmp_(icmp),
+      unused_idx_(0),
+      active_seqnums_(SeqMaxComparator()),
+      active_iters_(StartKeyMaxComparator(icmp)),
+      inactive_iters_(EndKeyMaxComparator(icmp)) {}
+
+bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+  // Move active iterators that start after parsed.
+  while (!active_iters_.empty() &&
+         icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
+    TruncatedRangeDelIterator* iter = PopActiveIter();
+    do {
+      iter->Prev();
+    } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  // Move inactive iterators that end after parsed.
+  while (!inactive_iters_.empty() &&
+         icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
+    TruncatedRangeDelIterator* iter = PopInactiveIter();
+    while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
+      iter->Prev();
+    }
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  return active_seqnums_.empty()
+             ? false
+             : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ReverseRangeDelIterator::Invalidate() {
+  unused_idx_ = 0;
+  active_iters_.clear();
+  active_seqnums_.clear();
+  inactive_iters_.clear();
+}
+
+bool RangeDelAggregator::StripeRep::ShouldDelete(
+    const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
+  if (!InStripe(parsed.sequence) || IsEmpty()) {
+    return false;
+  }
+  switch (mode) {
+    case RangeDelPositioningMode::kForwardTraversal:
+      InvalidateReverseIter();
+
+      // Pick up previously unseen iterators.
+      for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx());
+           it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) {
+        auto& iter = *it;
+        forward_iter_.AddNewIter(iter.get(), parsed);
+      }
+
+      return forward_iter_.ShouldDelete(parsed);
+    case RangeDelPositioningMode::kBackwardTraversal:
+      InvalidateForwardIter();
+
+      // Pick up previously unseen iterators.
+      for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx());
+           it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) {
+        auto& iter = *it;
+        reverse_iter_.AddNewIter(iter.get(), parsed);
+      }
+
+      return reverse_iter_.ShouldDelete(parsed);
+    default:
+      assert(false);
+      return false;
+  }
+}
+
+bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start,
+                                                      const Slice& end) {
+  Invalidate();
+
+  // Set the internal start/end keys so that:
+  // - if start_ikey has the same user key and sequence number as the
+  // current end key, start_ikey will be considered greater; and
+  // - if end_ikey has the same user key and sequence number as the current
+  // start key, end_ikey will be considered greater.
+  ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
+                               static_cast<ValueType>(0));
+  ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
+  for (auto& iter : iters_) {
+    bool checked_candidate_tombstones = false;
+    for (iter->SeekForPrev(start);
+         iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
+         iter->Next()) {
+      checked_candidate_tombstones = true;
+      if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+          icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+        return true;
+      }
+    }
+
+    if (!checked_candidate_tombstones) {
+      // Do an additional check for when the end of the range is the begin
+      // key of a tombstone, which we missed earlier since SeekForPrev'ing
+      // to the start was invalid.
+      iter->SeekForPrev(end);
+      if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+          icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+void ReadRangeDelAggregator::AddTombstones(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+    const InternalKey* smallest, const InternalKey* largest) {
+  if (input_iter == nullptr || input_iter->empty()) {
+    return;
+  }
+  rep_.AddTombstones(std::make_unique<TruncatedRangeDelIterator>(
+      std::move(input_iter), icmp_, smallest, largest));
+}
+
+bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
+                                              RangeDelPositioningMode mode) {
+  return rep_.ShouldDelete(parsed, mode);
+}
+
+bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
+                                               const Slice& end) {
+  InvalidateRangeDelMapPositions();
+  return rep_.IsRangeOverlapped(start, end);
+}
+
+void CompactionRangeDelAggregator::AddTombstones(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+    const InternalKey* smallest, const InternalKey* largest) {
+  if (input_iter == nullptr || input_iter->empty()) {
+    return;
+  }
+  // This bounds output of CompactionRangeDelAggregator::NewIterator.
+  if (!trim_ts_.empty()) {
+    assert(icmp_->user_comparator()->timestamp_size() > 0);
+    input_iter->SetTimestampUpperBound(&trim_ts_);
+  }
+
+  assert(input_iter->lower_bound() == 0);
+  assert(input_iter->upper_bound() == kMaxSequenceNumber);
+  parent_iters_.emplace_back(new TruncatedRangeDelIterator(
+      std::move(input_iter), icmp_, smallest, largest));
+
+  Slice* ts_upper_bound = nullptr;
+  if (!ts_upper_bound_.empty()) {
+    assert(icmp_->user_comparator()->timestamp_size() > 0);
+    ts_upper_bound = &ts_upper_bound_;
+  }
+  auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
+  for (auto& split_iter : split_iters) {
+    auto it = reps_.find(split_iter.first);
+    if (it == reps_.end()) {
+      bool inserted;
+      SequenceNumber upper_bound = split_iter.second->upper_bound();
+      SequenceNumber lower_bound = split_iter.second->lower_bound();
+      std::tie(it, inserted) = reps_.emplace(
+          split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
+      assert(inserted);
+    }
+    assert(it != reps_.end());
+    // ts_upper_bound is used to bound ShouldDelete() to only consider
+    // range tombstones under full_history_ts_low_ and trim_ts_. Keys covered by
+    // range tombstones that are above full_history_ts_low_ should not be
+    // dropped prematurely: user may read with a timestamp between the range
+    // tombstone and the covered key. Note that we cannot set timestamp
+    // upperbound on the original `input_iter` since `input_iter`s are later
+    // used in CompactionRangeDelAggregator::NewIterator to output range
+    // tombstones for persistence. We do not want to only persist range
+    // tombstones with timestamp lower than ts_upper_bound.
+    split_iter.second->SetTimestampUpperBound(ts_upper_bound);
+    it->second.AddTombstones(std::move(split_iter.second));
+  }
+}
+
+bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
+                                                RangeDelPositioningMode mode) {
+  auto it = reps_.lower_bound(parsed.sequence);
+  if (it == reps_.end()) {
+    return false;
+  }
+  return it->second.ShouldDelete(parsed, mode);
+}
+
+namespace {
+
+// Produce a sorted (by start internal key) stream of range tombstones from
+// `children`. lower_bound and upper_bound on user key can be
+// optionally specified. Range tombstones that ends before lower_bound or starts
+// after upper_bound are excluded.
+// If user-defined timestamp is enabled, lower_bound and upper_bound should
+// contain timestamp, but comparison is done ignoring timestamps.
+class TruncatedRangeDelMergingIter : public InternalIterator {
+ public:
+  TruncatedRangeDelMergingIter(
+      const InternalKeyComparator* icmp, const Slice* lower_bound,
+      const Slice* upper_bound, bool upper_bound_inclusive,
+      const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
+      : icmp_(icmp),
+        lower_bound_(lower_bound),
+        upper_bound_(upper_bound),
+        upper_bound_inclusive_(upper_bound_inclusive),
+        heap_(StartKeyMinComparator(icmp)),
+        ts_sz_(icmp_->user_comparator()->timestamp_size()) {
+    for (auto& child : children) {
+      if (child != nullptr) {
+        assert(child->lower_bound() == 0);
+        assert(child->upper_bound() == kMaxSequenceNumber);
+        children_.push_back(child.get());
+      }
+    }
+  }
+
+  bool Valid() const override {
+    return !heap_.empty() && BeforeEndKey(heap_.top());
+  }
+  Status status() const override { return Status::OK(); }
+
+  void SeekToFirst() override {
+    heap_.clear();
+    for (auto& child : children_) {
+      if (lower_bound_ != nullptr) {
+        child->Seek(*lower_bound_);
+      } else {
+        child->SeekToFirst();
+      }
+      if (child->Valid()) {
+        heap_.push(child);
+      }
+    }
+  }
+
+  void Next() override {
+    auto* top = heap_.top();
+    top->InternalNext();
+    if (top->Valid()) {
+      heap_.replace_top(top);
+    } else {
+      heap_.pop();
+    }
+  }
+
+  Slice key() const override {
+    auto* top = heap_.top();
+    if (ts_sz_) {
+      cur_start_key_.Set(top->start_key().user_key, top->seq(),
+                         kTypeRangeDeletion, top->timestamp());
+    } else {
+      cur_start_key_.Set(top->start_key().user_key, top->seq(),
+                         kTypeRangeDeletion);
+    }
+    assert(top->start_key().user_key.size() >= ts_sz_);
+    return cur_start_key_.Encode();
+  }
+
+  Slice value() const override {
+    auto* top = heap_.top();
+    if (!ts_sz_) {
+      return top->end_key().user_key;
+    }
+    assert(top->timestamp().size() == ts_sz_);
+    cur_end_key_.clear();
+    cur_end_key_.append(top->end_key().user_key.data(),
+                        top->end_key().user_key.size() - ts_sz_);
+    cur_end_key_.append(top->timestamp().data(), ts_sz_);
+    return cur_end_key_;
+  }
+
+  // Unused InternalIterator methods
+  void Prev() override { assert(false); }
+  void Seek(const Slice& /* target */) override { assert(false); }
+  void SeekForPrev(const Slice& /* target */) override { assert(false); }
+  void SeekToLast() override { assert(false); }
+
+ private:
+  bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
+    if (upper_bound_ == nullptr) {
+      return true;
+    }
+    int cmp = icmp_->user_comparator()->CompareWithoutTimestamp(
+        iter->start_key().user_key, *upper_bound_);
+    return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
+  }
+
+  const InternalKeyComparator* icmp_;
+  const Slice* lower_bound_;
+  const Slice* upper_bound_;
+  bool upper_bound_inclusive_;
+  BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
+  std::vector<TruncatedRangeDelIterator*> children_;
+
+  mutable InternalKey cur_start_key_;
+  mutable std::string cur_end_key_;
+  size_t ts_sz_;
+};
+
+}  // anonymous namespace
+
+std::unique_ptr<FragmentedRangeTombstoneIterator>
+CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
+                                          const Slice* upper_bound,
+                                          bool upper_bound_inclusive) {
+  InvalidateRangeDelMapPositions();
+  auto merging_iter = std::make_unique<TruncatedRangeDelMergingIter>(
+      icmp_, lower_bound, upper_bound, upper_bound_inclusive, parent_iters_);
+
+  auto fragmented_tombstone_list =
+      std::make_shared<FragmentedRangeTombstoneList>(
+          std::move(merging_iter), *icmp_, true /* for_compaction */,
+          *snapshots_);
+
+  return std::make_unique<FragmentedRangeTombstoneIterator>(
+      fragmented_tombstone_list, *icmp_, kMaxSequenceNumber /* upper_bound */);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator.h b/src/rocksdb/db/range_del_aggregator.h
new file mode 100644
index 000000000..9bd40967d
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.h
@@ -0,0 +1,476 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TruncatedRangeDelIterator {
+ public:
+  TruncatedRangeDelIterator(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+      const InternalKeyComparator* icmp, const InternalKey* smallest,
+      const InternalKey* largest);
+
+  bool Valid() const;
+
+  void Next() { iter_->TopNext(); }
+  void Prev() { iter_->TopPrev(); }
+
+  void InternalNext() { iter_->Next(); }
+
+  // Seeks to the tombstone with the highest visible sequence number that covers
+  // target (a user key). If no such tombstone exists, the position will be at
+  // the earliest tombstone that ends after target.
+  // REQUIRES: target is a user key.
+  void Seek(const Slice& target);
+
+  // Seeks to the tombstone with the highest visible sequence number that covers
+  // target (a user key). If no such tombstone exists, the position will be at
+  // the latest tombstone that starts before target.
+  void SeekForPrev(const Slice& target);
+
+  void SeekToFirst();
+  void SeekToLast();
+
+  ParsedInternalKey start_key() const {
+    return (smallest_ == nullptr ||
+            icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
+               ? iter_->parsed_start_key()
+               : *smallest_;
+  }
+
+  ParsedInternalKey end_key() const {
+    return (largest_ == nullptr ||
+            icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
+               ? iter_->parsed_end_key()
+               : *largest_;
+  }
+
+  SequenceNumber seq() const { return iter_->seq(); }
+  Slice timestamp() const {
+    assert(icmp_->user_comparator()->timestamp_size());
+    return iter_->timestamp();
+  }
+  void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+    iter_->SetTimestampUpperBound(ts_upper_bound);
+  }
+
+  std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+  SequenceNumber upper_bound() const { return iter_->upper_bound(); }
+
+  SequenceNumber lower_bound() const { return iter_->lower_bound(); }
+
+ private:
+  std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
+  const InternalKeyComparator* icmp_;
+  const ParsedInternalKey* smallest_ = nullptr;
+  const ParsedInternalKey* largest_ = nullptr;
+  std::list<ParsedInternalKey> pinned_bounds_;
+
+  const InternalKey* smallest_ikey_;
+  const InternalKey* largest_ikey_;
+};
+
+struct SeqMaxComparator {
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return a->seq() > b->seq();
+  }
+};
+
+struct StartKeyMinComparator {
+  explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return icmp->Compare(a->start_key(), b->start_key()) > 0;
+  }
+
+  const InternalKeyComparator* icmp;
+};
+
+class ForwardRangeDelIterator {
+ public:
+  explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp);
+
+  bool ShouldDelete(const ParsedInternalKey& parsed);
+  void Invalidate();
+
+  void AddNewIter(TruncatedRangeDelIterator* iter,
+                  const ParsedInternalKey& parsed) {
+    iter->Seek(parsed.user_key);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  size_t UnusedIdx() const { return unused_idx_; }
+  void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+  using ActiveSeqSet =
+      std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+  struct EndKeyMinComparator {
+    explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const ActiveSeqSet::const_iterator& a,
+                    const ActiveSeqSet::const_iterator& b) const {
+      return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0;
+    }
+
+    const InternalKeyComparator* icmp;
+  };
+
+  void PushIter(TruncatedRangeDelIterator* iter,
+                const ParsedInternalKey& parsed) {
+    if (!iter->Valid()) {
+      // The iterator has been fully consumed, so we don't need to add it to
+      // either of the heaps.
+      return;
+    }
+    int cmp = icmp_->Compare(parsed, iter->start_key());
+    if (cmp < 0) {
+      PushInactiveIter(iter);
+    } else {
+      PushActiveIter(iter);
+    }
+  }
+
+  void PushActiveIter(TruncatedRangeDelIterator* iter) {
+    auto seq_pos = active_seqnums_.insert(iter);
+    active_iters_.push(seq_pos);
+  }
+
+  TruncatedRangeDelIterator* PopActiveIter() {
+    auto active_top = active_iters_.top();
+    auto iter = *active_top;
+    active_iters_.pop();
+    active_seqnums_.erase(active_top);
+    return iter;
+  }
+
+  void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+    inactive_iters_.push(iter);
+  }
+
+  TruncatedRangeDelIterator* PopInactiveIter() {
+    auto* iter = inactive_iters_.top();
+    inactive_iters_.pop();
+    return iter;
+  }
+
+  const InternalKeyComparator* icmp_;
+  size_t unused_idx_;
+  ActiveSeqSet active_seqnums_;
+  BinaryHeap<ActiveSeqSet::const_iterator, EndKeyMinComparator> active_iters_;
+  BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> inactive_iters_;
+};
+
+class ReverseRangeDelIterator {
+ public:
+  explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp);
+
+  bool ShouldDelete(const ParsedInternalKey& parsed);
+  void Invalidate();
+
+  void AddNewIter(TruncatedRangeDelIterator* iter,
+                  const ParsedInternalKey& parsed) {
+    iter->SeekForPrev(parsed.user_key);
+    PushIter(iter, parsed);
+    assert(active_iters_.size() == active_seqnums_.size());
+  }
+
+  size_t UnusedIdx() const { return unused_idx_; }
+  void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+  using ActiveSeqSet =
+      std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+  struct EndKeyMaxComparator {
+    explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const TruncatedRangeDelIterator* a,
+                    const TruncatedRangeDelIterator* b) const {
+      return icmp->Compare(a->end_key(), b->end_key()) < 0;
+    }
+
+    const InternalKeyComparator* icmp;
+  };
+  struct StartKeyMaxComparator {
+    explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+    bool operator()(const ActiveSeqSet::const_iterator& a,
+                    const ActiveSeqSet::const_iterator& b) const {
+      return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0;
+    }
+
+    const InternalKeyComparator* icmp;
+  };
+
+  void PushIter(TruncatedRangeDelIterator* iter,
+                const ParsedInternalKey& parsed) {
+    if (!iter->Valid()) {
+      // The iterator has been fully consumed, so we don't need to add it to
+      // either of the heaps.
+    } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) {
+      PushInactiveIter(iter);
+    } else {
+      PushActiveIter(iter);
+    }
+  }
+
+  void PushActiveIter(TruncatedRangeDelIterator* iter) {
+    auto seq_pos = active_seqnums_.insert(iter);
+    active_iters_.push(seq_pos);
+  }
+
+  TruncatedRangeDelIterator* PopActiveIter() {
+    auto active_top = active_iters_.top();
+    auto iter = *active_top;
+    active_iters_.pop();
+    active_seqnums_.erase(active_top);
+    return iter;
+  }
+
+  void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+    inactive_iters_.push(iter);
+  }
+
+  TruncatedRangeDelIterator* PopInactiveIter() {
+    auto* iter = inactive_iters_.top();
+    inactive_iters_.pop();
+    return iter;
+  }
+
+  const InternalKeyComparator* icmp_;
+  size_t unused_idx_;
+  ActiveSeqSet active_seqnums_;
+  BinaryHeap<ActiveSeqSet::const_iterator, StartKeyMaxComparator> active_iters_;
+  BinaryHeap<TruncatedRangeDelIterator*, EndKeyMaxComparator> inactive_iters_;
+};
+
+enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal };
+class RangeDelAggregator {
+ public:
+  explicit RangeDelAggregator(const InternalKeyComparator* icmp)
+      : icmp_(icmp) {}
+  virtual ~RangeDelAggregator() {}
+
+  virtual void AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+      const InternalKey* smallest = nullptr,
+      const InternalKey* largest = nullptr) = 0;
+
+  bool ShouldDelete(const Slice& ikey, RangeDelPositioningMode mode) {
+    ParsedInternalKey parsed;
+
+    Status pik_status =
+        ParseInternalKey(ikey, &parsed, false /* log_err_key */);  // TODO
+    assert(pik_status.ok());
+    if (!pik_status.ok()) {
+      return false;
+    }
+
+    return ShouldDelete(parsed, mode);
+  }
+  virtual bool ShouldDelete(const ParsedInternalKey& parsed,
+                            RangeDelPositioningMode mode) = 0;
+
+  virtual void InvalidateRangeDelMapPositions() = 0;
+
+  virtual bool IsEmpty() const = 0;
+
+  bool AddFile(uint64_t file_number) {
+    return files_seen_.insert(file_number).second;
+  }
+
+ protected:
+  class StripeRep {
+   public:
+    StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound,
+              SequenceNumber lower_bound)
+        : icmp_(icmp),
+          forward_iter_(icmp),
+          reverse_iter_(icmp),
+          upper_bound_(upper_bound),
+          lower_bound_(lower_bound) {}
+
+    void AddTombstones(std::unique_ptr<TruncatedRangeDelIterator> input_iter) {
+      iters_.push_back(std::move(input_iter));
+    }
+
+    bool IsEmpty() const { return iters_.empty(); }
+
+    bool ShouldDelete(const ParsedInternalKey& parsed,
+                      RangeDelPositioningMode mode);
+
+    void Invalidate() {
+      if (!IsEmpty()) {
+        InvalidateForwardIter();
+        InvalidateReverseIter();
+      }
+    }
+
+    // If user-defined timestamp is enabled, `start` and `end` are user keys
+    // with timestamp.
+    bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+   private:
+    bool InStripe(SequenceNumber seq) const {
+      return lower_bound_ <= seq && seq <= upper_bound_;
+    }
+
+    void InvalidateForwardIter() { forward_iter_.Invalidate(); }
+
+    void InvalidateReverseIter() { reverse_iter_.Invalidate(); }
+
+    const InternalKeyComparator* icmp_;
+    std::vector<std::unique_ptr<TruncatedRangeDelIterator>> iters_;
+    ForwardRangeDelIterator forward_iter_;
+    ReverseRangeDelIterator reverse_iter_;
+    SequenceNumber upper_bound_;
+    SequenceNumber lower_bound_;
+  };
+
+  const InternalKeyComparator* icmp_;
+
+ private:
+  std::set<uint64_t> files_seen_;
+};
+
+class ReadRangeDelAggregator final : public RangeDelAggregator {
+ public:
+  ReadRangeDelAggregator(const InternalKeyComparator* icmp,
+                         SequenceNumber upper_bound)
+      : RangeDelAggregator(icmp),
+        rep_(icmp, upper_bound, 0 /* lower_bound */) {}
+  ~ReadRangeDelAggregator() override {}
+
+  using RangeDelAggregator::ShouldDelete;
+  void AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+      const InternalKey* smallest = nullptr,
+      const InternalKey* largest = nullptr) override;
+
+  bool ShouldDelete(const ParsedInternalKey& parsed,
+                    RangeDelPositioningMode mode) final override {
+    if (rep_.IsEmpty()) {
+      return false;
+    }
+    return ShouldDeleteImpl(parsed, mode);
+  }
+
+  bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+  void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); }
+
+  bool IsEmpty() const override { return rep_.IsEmpty(); }
+
+ private:
+  StripeRep rep_;
+
+  bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
+                        RangeDelPositioningMode mode);
+};
+
+class CompactionRangeDelAggregator : public RangeDelAggregator {
+ public:
+  CompactionRangeDelAggregator(const InternalKeyComparator* icmp,
+                               const std::vector<SequenceNumber>& snapshots,
+                               const std::string* full_history_ts_low = nullptr,
+                               const std::string* trim_ts = nullptr)
+      : RangeDelAggregator(icmp), snapshots_(&snapshots) {
+    if (full_history_ts_low) {
+      ts_upper_bound_ = *full_history_ts_low;
+    }
+    if (trim_ts) {
+      trim_ts_ = *trim_ts;
+      // Range tombstone newer than `trim_ts` or `full_history_ts_low` should
+      // not be considered in ShouldDelete().
+      if (ts_upper_bound_.empty()) {
+        ts_upper_bound_ = trim_ts_;
+      } else if (!trim_ts_.empty() && icmp->user_comparator()->CompareTimestamp(
+                                          trim_ts_, ts_upper_bound_) < 0) {
+        ts_upper_bound_ = trim_ts_;
+      }
+    }
+  }
+  ~CompactionRangeDelAggregator() override {}
+
+  void AddTombstones(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+      const InternalKey* smallest = nullptr,
+      const InternalKey* largest = nullptr) override;
+
+  using RangeDelAggregator::ShouldDelete;
+  bool ShouldDelete(const ParsedInternalKey& parsed,
+                    RangeDelPositioningMode mode) override;
+
+  bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+  void InvalidateRangeDelMapPositions() override {
+    for (auto& rep : reps_) {
+      rep.second.Invalidate();
+    }
+  }
+
+  bool IsEmpty() const override {
+    for (const auto& rep : reps_) {
+      if (!rep.second.IsEmpty()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Creates an iterator over all the range tombstones in the aggregator, for
+  // use in compaction. Nullptr arguments indicate that the iterator range is
+  // unbounded.
+  // NOTE: the boundaries are used for optimization purposes to reduce the
+  // number of tombstones that are passed to the fragmenter; they do not
+  // guarantee that the resulting iterator only contains range tombstones that
+  // cover keys in the provided range. If required, these bounds must be
+  // enforced during iteration.
+  std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
+      const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr,
+      bool upper_bound_inclusive = false);
+
+ private:
+  std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
+  std::map<SequenceNumber, StripeRep> reps_;
+
+  const std::vector<SequenceNumber>* snapshots_;
+  // min over full_history_ts_low and trim_ts_
+  Slice ts_upper_bound_{};
+  Slice trim_ts_{};
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator_bench.cc b/src/rocksdb/db/range_del_aggregator_bench.cc
new file mode 100644
index 000000000..9dca707e5
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_bench.cc
@@ -0,0 +1,280 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/system_clock.h"
+#include "util/coding.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/vector_iterator.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created");
+
+DEFINE_int32(num_runs, 1000, "number of test runs");
+
+DEFINE_int32(tombstone_start_upper_bound, 1000,
+             "exclusive upper bound on range tombstone start keys");
+
+DEFINE_int32(should_delete_upper_bound, 1000,
+             "exclusive upper bound on keys passed to ShouldDelete");
+
+DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width");
+
+DEFINE_double(tombstone_width_stddev, 0.0,
+              "standard deviation of range tombstone width");
+
+DEFINE_int32(seed, 0, "random number generator seed");
+
+DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run");
+
+DEFINE_int32(add_tombstones_per_run, 1,
+             "number of AddTombstones calls per run");
+
+DEFINE_bool(use_compaction_range_del_aggregator, false,
+            "Whether to use CompactionRangeDelAggregator. Default is to use "
+            "ReadRangeDelAggregator.");
+
+namespace {
+
+struct Stats {
+  uint64_t time_add_tombstones = 0;
+  uint64_t time_first_should_delete = 0;
+  uint64_t time_rest_should_delete = 0;
+  uint64_t time_fragment_tombstones = 0;
+};
+
+std::ostream& operator<<(std::ostream& os, const Stats& s) {
+  std::ios fmt_holder(nullptr);
+  fmt_holder.copyfmt(os);
+
+  os << std::left;
+  os << std::setw(25) << "Fragment Tombstones: "
+     << s.time_fragment_tombstones /
+            (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3)
+     << " us\n";
+  os << std::setw(25) << "AddTombstones: "
+     << s.time_add_tombstones /
+            (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3)
+     << " us\n";
+  os << std::setw(25) << "ShouldDelete (first): "
+     << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n";
+  if (FLAGS_should_deletes_per_run > 1) {
+    os << std::setw(25) << "ShouldDelete (rest): "
+       << s.time_rest_should_delete /
+              ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3)
+       << " us\n";
+  }
+
+  os.copyfmt(fmt_holder);
+  return os;
+}
+
+auto icmp = ROCKSDB_NAMESPACE::InternalKeyComparator(
+    ROCKSDB_NAMESPACE::BytewiseComparator());
+
+}  // anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// A wrapper around RangeTombstones and the underlying data of its start and end
+// keys.
+struct PersistentRangeTombstone {
+  std::string start_key;
+  std::string end_key;
+  RangeTombstone tombstone;
+
+  PersistentRangeTombstone(std::string start, std::string end,
+                           SequenceNumber seq)
+      : start_key(std::move(start)), end_key(std::move(end)) {
+    tombstone = RangeTombstone(start_key, end_key, seq);
+  }
+
+  PersistentRangeTombstone() = default;
+
+  PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; }
+
+  PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) {
+    start_key = t.start_key;
+    end_key = t.end_key;
+    tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+    return *this;
+  }
+
+  PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = t; }
+
+  PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) {
+    start_key = std::move(t.start_key);
+    end_key = std::move(t.end_key);
+    tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+    return *this;
+  }
+};
+
+struct TombstoneStartKeyComparator {
+  explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {}
+
+  bool operator()(const RangeTombstone& a, const RangeTombstone& b) const {
+    return cmp->Compare(a.start_key_, b.start_key_) < 0;
+  }
+
+  const Comparator* cmp;
+};
+
+std::unique_ptr<InternalIterator> MakeRangeDelIterator(
+    const std::vector<PersistentRangeTombstone>& range_dels) {
+  std::vector<std::string> keys, values;
+  for (const auto& range_del : range_dels) {
+    auto key_and_value = range_del.tombstone.Serialize();
+    keys.push_back(key_and_value.first.Encode().ToString());
+    values.push_back(key_and_value.second.ToString());
+  }
+  return std::unique_ptr<VectorIterator>(
+      new VectorIterator(keys, values, &icmp));
+}
+
+// convert long to a big-endian slice key
+static std::string Key(int64_t val) {
+  std::string little_endian_key;
+  std::string big_endian_key;
+  PutFixed64(&little_endian_key, val);
+  assert(little_endian_key.size() == sizeof(val));
+  big_endian_key.resize(sizeof(val));
+  for (size_t i = 0; i < sizeof(val); ++i) {
+    big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
+  }
+  return big_endian_key;
+}
+
+}  // anonymous namespace
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  Stats stats;
+  ROCKSDB_NAMESPACE::SystemClock* clock =
+      ROCKSDB_NAMESPACE::SystemClock::Default().get();
+  ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed);
+  std::default_random_engine random_gen(FLAGS_seed);
+  std::normal_distribution<double> normal_dist(FLAGS_tombstone_width_mean,
+                                               FLAGS_tombstone_width_stddev);
+  std::vector<std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone> >
+      all_persistent_range_tombstones(FLAGS_add_tombstones_per_run);
+  for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) {
+    all_persistent_range_tombstones[i] =
+        std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone>(
+            FLAGS_num_range_tombstones);
+  }
+  auto mode = ROCKSDB_NAMESPACE::RangeDelPositioningMode::kForwardTraversal;
+  std::vector<ROCKSDB_NAMESPACE::SequenceNumber> snapshots{0};
+  for (int i = 0; i < FLAGS_num_runs; i++) {
+    std::unique_ptr<ROCKSDB_NAMESPACE::RangeDelAggregator> range_del_agg =
+        nullptr;
+    if (FLAGS_use_compaction_range_del_aggregator) {
+      range_del_agg.reset(new ROCKSDB_NAMESPACE::CompactionRangeDelAggregator(
+          &icmp, snapshots));
+    } else {
+      range_del_agg.reset(new ROCKSDB_NAMESPACE::ReadRangeDelAggregator(
+          &icmp, ROCKSDB_NAMESPACE::kMaxSequenceNumber /* upper_bound */));
+    }
+
+    std::vector<
+        std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList> >
+        fragmented_range_tombstone_lists(FLAGS_add_tombstones_per_run);
+
+    for (auto& persistent_range_tombstones : all_persistent_range_tombstones) {
+      // TODO(abhimadan): consider whether creating the range tombstones right
+      // before AddTombstones is artificially warming the cache compared to
+      // real workloads.
+      for (int j = 0; j < FLAGS_num_range_tombstones; j++) {
+        uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound);
+        uint64_t end = static_cast<uint64_t>(
+            std::round(start + std::max(1.0, normal_dist(random_gen))));
+        persistent_range_tombstones[j] =
+            ROCKSDB_NAMESPACE::PersistentRangeTombstone(
+                ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j);
+      }
+      auto iter =
+          ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones);
+      ROCKSDB_NAMESPACE::StopWatchNano stop_watch_fragment_tombstones(
+          clock, true /* auto_start */);
+      fragmented_range_tombstone_lists.emplace_back(
+          new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList(
+              std::move(iter), icmp, FLAGS_use_compaction_range_del_aggregator,
+              snapshots));
+      stats.time_fragment_tombstones +=
+          stop_watch_fragment_tombstones.ElapsedNanos();
+      std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator>
+          fragmented_range_del_iter(
+              new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator(
+                  fragmented_range_tombstone_lists.back().get(), icmp,
+                  ROCKSDB_NAMESPACE::kMaxSequenceNumber));
+
+      ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones(
+          clock, true /* auto_start */);
+      range_del_agg->AddTombstones(std::move(fragmented_range_del_iter));
+      stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos();
+    }
+
+    ROCKSDB_NAMESPACE::ParsedInternalKey parsed_key;
+    parsed_key.sequence = FLAGS_num_range_tombstones / 2;
+    parsed_key.type = ROCKSDB_NAMESPACE::kTypeValue;
+
+    uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound -
+                                     FLAGS_should_deletes_per_run + 1);
+
+    for (int j = 0; j < FLAGS_should_deletes_per_run; j++) {
+      std::string key_string = ROCKSDB_NAMESPACE::Key(first_key + j);
+      parsed_key.user_key = key_string;
+
+      ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete(
+          clock, true /* auto_start */);
+      range_del_agg->ShouldDelete(parsed_key, mode);
+      uint64_t call_time = stop_watch_should_delete.ElapsedNanos();
+
+      if (j == 0) {
+        stats.time_first_should_delete += call_time;
+      } else {
+        stats.time_rest_should_delete += call_time;
+      }
+    }
+  }
+
+  std::cout << "=========================\n"
+            << "Results:\n"
+            << "=========================\n"
+            << stats;
+
+  return 0;
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db/range_del_aggregator_test.cc b/src/rocksdb/db/range_del_aggregator_test.cc
new file mode 100644
index 000000000..7fe35276a
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_test.cc
@@ -0,0 +1,715 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeDelAggregatorTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+    const std::vector<RangeTombstone>& range_dels) {
+  std::vector<std::string> keys, values;
+  for (const auto& range_del : range_dels) {
+    auto key_and_value = range_del.Serialize();
+    keys.push_back(key_and_value.first.Encode().ToString());
+    values.push_back(key_and_value.second.ToString());
+  }
+  return std::unique_ptr<VectorIterator>(
+      new VectorIterator(keys, values, &bytewise_icmp));
+}
+
+std::vector<std::unique_ptr<FragmentedRangeTombstoneList>>
+MakeFragmentedTombstoneLists(
+    const std::vector<std::vector<RangeTombstone>>& range_dels_list) {
+  std::vector<std::unique_ptr<FragmentedRangeTombstoneList>> fragment_lists;
+  for (const auto& range_dels : range_dels_list) {
+    auto range_del_iter = MakeRangeDelIter(range_dels);
+    fragment_lists.emplace_back(new FragmentedRangeTombstoneList(
+        std::move(range_del_iter), bytewise_icmp));
+  }
+  return fragment_lists;
+}
+
+struct TruncatedIterScanTestCase {
+  ParsedInternalKey start;
+  ParsedInternalKey end;
+  SequenceNumber seq;
+};
+
+struct TruncatedIterSeekTestCase {
+  Slice target;
+  ParsedInternalKey start;
+  ParsedInternalKey end;
+  SequenceNumber seq;
+  bool invalid;
+};
+
+struct ShouldDeleteTestCase {
+  ParsedInternalKey lookup_key;
+  bool result;
+};
+
+struct IsRangeOverlappedTestCase {
+  Slice start;
+  Slice end;
+  bool result;
+};
+
+ParsedInternalKey UncutEndpoint(const Slice& s) {
+  return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion);
+}
+
+ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq,
+                                ValueType type = kTypeValue) {
+  return ParsedInternalKey(key, seq, type);
+}
+
+void VerifyIterator(
+    TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+    const std::vector<TruncatedIterScanTestCase>& expected_range_dels) {
+  // Test forward iteration.
+  iter->SeekToFirst();
+  for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
+    EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
+    EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
+  }
+  EXPECT_FALSE(iter->Valid());
+
+  // Test reverse iteration.
+  iter->SeekToLast();
+  std::vector<TruncatedIterScanTestCase> reverse_expected_range_dels(
+      expected_range_dels.rbegin(), expected_range_dels.rend());
+  for (size_t i = 0; i < reverse_expected_range_dels.size();
+       i++, iter->Prev()) {
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(0, icmp.Compare(iter->start_key(),
+                              reverse_expected_range_dels[i].start));
+    EXPECT_EQ(
+        0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end));
+    EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq());
+  }
+  EXPECT_FALSE(iter->Valid());
+}
+
+void VerifySeek(TruncatedRangeDelIterator* iter,
+                const InternalKeyComparator& icmp,
+                const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+  for (const auto& test_case : test_cases) {
+    iter->Seek(test_case.target);
+    if (test_case.invalid) {
+      ASSERT_FALSE(iter->Valid());
+    } else {
+      ASSERT_TRUE(iter->Valid());
+      EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+      EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+      EXPECT_EQ(test_case.seq, iter->seq());
+    }
+  }
+}
+
+void VerifySeekForPrev(
+    TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+    const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+  for (const auto& test_case : test_cases) {
+    iter->SeekForPrev(test_case.target);
+    if (test_case.invalid) {
+      ASSERT_FALSE(iter->Valid());
+    } else {
+      ASSERT_TRUE(iter->Valid());
+      EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+      EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+      EXPECT_EQ(test_case.seq, iter->seq());
+    }
+  }
+}
+
+void VerifyShouldDelete(RangeDelAggregator* range_del_agg,
+                        const std::vector<ShouldDeleteTestCase>& test_cases) {
+  for (const auto& test_case : test_cases) {
+    EXPECT_EQ(
+        test_case.result,
+        range_del_agg->ShouldDelete(
+            test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal));
+  }
+  for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) {
+    const auto& test_case = *it;
+    EXPECT_EQ(
+        test_case.result,
+        range_del_agg->ShouldDelete(
+            test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal));
+  }
+}
+
+void VerifyIsRangeOverlapped(
+    ReadRangeDelAggregator* range_del_agg,
+    const std::vector<IsRangeOverlappedTestCase>& test_cases) {
+  for (const auto& test_case : test_cases) {
+    EXPECT_EQ(test_case.result,
+              range_del_agg->IsRangeOverlapped(test_case.start, test_case.end));
+  }
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+                       const FragmentedRangeTombstoneIterator* iter) {
+  // Test InternalIterator interface.
+  EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+  EXPECT_EQ(tombstone.end_key_, iter->value());
+  EXPECT_EQ(tombstone.seq_, iter->seq());
+
+  // Test FragmentedRangeTombstoneIterator interface.
+  EXPECT_EQ(tombstone.start_key_, iter->start_key());
+  EXPECT_EQ(tombstone.end_key_, iter->end_key());
+  EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+    FragmentedRangeTombstoneIterator* iter,
+    const std::vector<RangeTombstone>& expected_tombstones) {
+  iter->SeekToFirst();
+  for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+    ASSERT_TRUE(iter->Valid());
+    CheckIterPosition(expected_tombstones[i], iter);
+  }
+  EXPECT_FALSE(iter->Valid());
+}
+
+}  // anonymous namespace
+
+TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) {
+  auto range_del_iter = MakeRangeDelIter({});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+                                 nullptr);
+
+  iter.SeekToFirst();
+  ASSERT_FALSE(iter.Valid());
+
+  iter.SeekToLast();
+  ASSERT_FALSE(iter.Valid());
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIter) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+                                 nullptr);
+
+  VerifyIterator(&iter, bytewise_icmp,
+                 {{UncutEndpoint("a"), UncutEndpoint("e"), 10},
+                  {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+                  {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+  VerifySeek(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+       {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}});
+
+  VerifySeekForPrev(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+       {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           9 /* snapshot */));
+
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+                                 nullptr);
+
+  VerifyIterator(&iter, bytewise_icmp,
+                 {{UncutEndpoint("e"), UncutEndpoint("g"), 8},
+                  {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+  VerifySeek(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+       {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}});
+
+  VerifySeekForPrev(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+       {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  InternalKey smallest("d", 7, kTypeValue);
+  InternalKey largest("m", 9, kTypeValue);
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+                                 &smallest, &largest);
+
+  VerifyIterator(
+      &iter, bytewise_icmp,
+      {{InternalValue("d", 7), UncutEndpoint("e"), 10},
+       {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4}});
+
+  VerifySeek(
+      &iter, bytewise_icmp,
+      {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4,
+        false /* invalid */},
+       {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"", InternalValue("d", 7), UncutEndpoint("e"), 10}});
+
+  VerifySeekForPrev(
+      &iter, bytewise_icmp,
+      {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+       {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+       {"n", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4,
+        false /* invalid */},
+       {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  InternalKey smallest("f", 7, kTypeValue);
+  InternalKey largest("i", 9, kTypeValue);
+  TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+                                 &smallest, &largest);
+
+  VerifyIterator(&iter, bytewise_icmp,
+                 {{InternalValue("f", 7), UncutEndpoint("g"), 8}});
+
+  VerifySeek(
+      &iter, bytewise_icmp,
+      {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8},
+       {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+       {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+
+  VerifySeekForPrev(
+      &iter, bytewise_icmp,
+      {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+       {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+       {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}});
+}
+
+TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+      new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+                                           kMaxSequenceNumber));
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+  range_del_agg.AddTombstones(std::move(input_iter));
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+                                      {InternalValue("b", 9), true},
+                                      {InternalValue("d", 9), true},
+                                      {InternalValue("e", 7), true},
+                                      {InternalValue("g", 7), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "c", true},
+                                           {"d", "f", true},
+                                           {"g", "l", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregator) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+                                      {InternalValue("b", 19), false},
+                                      {InternalValue("b", 9), true},
+                                      {InternalValue("d", 9), true},
+                                      {InternalValue("e", 7), true},
+                                      {InternalValue("g", 7), false},
+                                      {InternalValue("h", 24), true},
+                                      {InternalValue("i", 24), false},
+                                      {InternalValue("ii", 14), true},
+                                      {InternalValue("j", 14), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "c", true},
+                                           {"d", "f", true},
+                                           {"g", "l", true},
+                                           {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregatorWithUpperBound) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             19 /* snapshot */));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+                                      {InternalValue("a", 9), true},
+                                      {InternalValue("b", 9), true},
+                                      {InternalValue("d", 9), true},
+                                      {InternalValue("e", 7), true},
+                                      {InternalValue("g", 7), false},
+                                      {InternalValue("h", 24), false},
+                                      {InternalValue("i", 24), false},
+                                      {InternalValue("ii", 14), true},
+                                      {InternalValue("j", 14), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "c", true},
+                                           {"d", "f", true},
+                                           {"g", "l", true},
+                                           {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregator) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+  std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+      {InternalKey("a", 4, kTypeValue),
+       InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("m", 20, kTypeValue),
+       InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+  for (size_t i = 0; i < fragment_lists.size(); i++) {
+    const auto& fragment_list = fragment_lists[i];
+    const auto& bounds = iter_bounds[i];
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             19 /* snapshot */));
+    range_del_agg.AddTombstones(std::move(input_iter), &bounds.first,
+                                &bounds.second);
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+                                      {InternalValue("a", 9), false},
+                                      {InternalValue("a", 4), true},
+                                      {InternalValue("m", 10), false},
+                                      {InternalValue("m", 9), true},
+                                      {InternalValue("x", 10), false},
+                                      {InternalValue("x", 9), false},
+                                      {InternalValue("x", 5), true},
+                                      {InternalValue("z", 9), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "n", true},
+                                           {"l", "x", true},
+                                           {"w", "z", true},
+                                           {"zzz", "zz", false},
+                                           {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregatorSameLevel) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+  std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+      {InternalKey("a", 4, kTypeValue),
+       InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("m", 20, kTypeValue),
+       InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+
+  auto add_iter_to_agg = [&](size_t i) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_lists[i].get(),
+                                             bytewise_icmp, 19 /* snapshot */));
+    range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first,
+                                &iter_bounds[i].second);
+  };
+
+  add_iter_to_agg(0);
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+                                      {InternalValue("a", 9), false},
+                                      {InternalValue("a", 4), true}});
+
+  add_iter_to_agg(1);
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false},
+                                      {InternalValue("m", 9), true}});
+
+  add_iter_to_agg(2);
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false},
+                                      {InternalValue("x", 9), false},
+                                      {InternalValue("x", 5), true},
+                                      {InternalValue("z", 9), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "n", true},
+                                           {"l", "x", true},
+                                           {"w", "z", true},
+                                           {"zzz", "zz", false},
+                                           {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorNoSnapshots) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots;
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+                                      {InternalValue("b", 19), false},
+                                      {InternalValue("b", 9), true},
+                                      {InternalValue("d", 9), true},
+                                      {InternalValue("e", 7), true},
+                                      {InternalValue("g", 7), false},
+                                      {InternalValue("h", 24), true},
+                                      {InternalValue("i", 24), false},
+                                      {InternalValue("ii", 14), true},
+                                      {InternalValue("j", 14), false}});
+
+  auto range_del_compaction_iter = range_del_agg.NewIterator();
+  VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+                                                              {"b", "c", 10},
+                                                              {"c", "e", 10},
+                                                              {"e", "g", 8},
+                                                              {"h", "i", 25},
+                                                              {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorWithSnapshots) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(
+      &range_del_agg,
+      {
+          {InternalValue("a", 19), false},  // [10, 19]
+          {InternalValue("a", 9), false},   // [0, 9]
+          {InternalValue("b", 9), false},   // [0, 9]
+          {InternalValue("d", 9), false},   // [0, 9]
+          {InternalValue("d", 7), true},    // [0, 9]
+          {InternalValue("e", 7), true},    // [0, 9]
+          {InternalValue("g", 7), false},   // [0, 9]
+          {InternalValue("h", 24), true},   // [20, kMaxSequenceNumber]
+          {InternalValue("i", 24), false},  // [20, kMaxSequenceNumber]
+          {InternalValue("ii", 14), true},  // [10, 19]
+          {InternalValue("j", 14), false}   // [10, 19]
+      });
+
+  auto range_del_compaction_iter = range_del_agg.NewIterator();
+  VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+                                                              {"a", "b", 10},
+                                                              {"b", "c", 10},
+                                                              {"c", "e", 10},
+                                                              {"c", "e", 8},
+                                                              {"e", "g", 8},
+                                                              {"h", "i", 25},
+                                                              {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorLeft) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("_");
+  Slice end("__");
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("p");
+  Slice end("q");
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("bb");
+  Slice end("e");
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(),
+                            {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(
+      range_del_compaction_iter2.get(),
+      {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}});
+}
+
+TEST_F(RangeDelAggregatorTest,
+       CompactionAggregatorBoundedIteratorExtraFragments) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "d", 10}, {"c", "g", 8}},
+       {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("bb");
+  Slice end("e");
+  auto range_del_compaction_iter1 =
+      range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10},
+                                                               {"b", "c", 20},
+                                                               {"b", "c", 10},
+                                                               {"c", "d", 10},
+                                                               {"c", "d", 8},
+                                                               {"d", "f", 30},
+                                                               {"d", "f", 8},
+                                                               {"f", "g", 8}});
+
+  auto range_del_compaction_iter2 =
+      range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+  VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10},
+                                                               {"b", "c", 20},
+                                                               {"b", "c", 10},
+                                                               {"c", "d", 10},
+                                                               {"c", "d", 8},
+                                                               {"d", "f", 30},
+                                                               {"d", "f", 8},
+                                                               {"f", "g", 8}});
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.cc b/src/rocksdb/db/range_tombstone_fragmenter.cc
new file mode 100644
index 000000000..7e7cedeca
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.cc
@@ -0,0 +1,502 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <functional>
+#include <set>
+
+#include "util/autovector.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
+    std::unique_ptr<InternalIterator> unfragmented_tombstones,
+    const InternalKeyComparator& icmp, bool for_compaction,
+    const std::vector<SequenceNumber>& snapshots) {
+  if (unfragmented_tombstones == nullptr) {
+    return;
+  }
+  bool is_sorted = true;
+  InternalKey pinned_last_start_key;
+  Slice last_start_key;
+  num_unfragmented_tombstones_ = 0;
+  total_tombstone_payload_bytes_ = 0;
+  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) {
+    total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
+                                      unfragmented_tombstones->value().size();
+    if (num_unfragmented_tombstones_ > 0 &&
+        icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
+      is_sorted = false;
+      break;
+    }
+    if (unfragmented_tombstones->IsKeyPinned()) {
+      last_start_key = unfragmented_tombstones->key();
+    } else {
+      pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
+      last_start_key = pinned_last_start_key.Encode();
+    }
+  }
+  if (is_sorted) {
+    FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction,
+                       snapshots);
+    return;
+  }
+
+  // Sort the tombstones before fragmenting them.
+  std::vector<std::string> keys, values;
+  keys.reserve(num_unfragmented_tombstones_);
+  values.reserve(num_unfragmented_tombstones_);
+  // Reset the counter to zero for the next iteration over keys.
+  total_tombstone_payload_bytes_ = 0;
+  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next()) {
+    total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
+                                      unfragmented_tombstones->value().size();
+    keys.emplace_back(unfragmented_tombstones->key().data(),
+                      unfragmented_tombstones->key().size());
+    values.emplace_back(unfragmented_tombstones->value().data(),
+                        unfragmented_tombstones->value().size());
+  }
+  // VectorIterator implicitly sorts by key during construction.
+  auto iter = std::make_unique<VectorIterator>(std::move(keys),
+                                               std::move(values), &icmp);
+  FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
+}
+
+void FragmentedRangeTombstoneList::FragmentTombstones(
+    std::unique_ptr<InternalIterator> unfragmented_tombstones,
+    const InternalKeyComparator& icmp, bool for_compaction,
+    const std::vector<SequenceNumber>& snapshots) {
+  Slice cur_start_key(nullptr, 0);
+  auto cmp = ParsedInternalKeyComparator(&icmp);
+
+  // Stores the end keys and sequence numbers of range tombstones with a start
+  // key less than or equal to cur_start_key. Provides an ordering by end key
+  // for use in flush_current_tombstones.
+  std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
+
+  size_t ts_sz = icmp.user_comparator()->timestamp_size();
+  // Given the next start key in unfragmented_tombstones,
+  // flush_current_tombstones writes every tombstone fragment that starts
+  // and ends with a key before next_start_key, and starts with a key greater
+  // than or equal to cur_start_key.
+  auto flush_current_tombstones = [&](const Slice& next_start_key) {
+    auto it = cur_end_keys.begin();
+    bool reached_next_start_key = false;
+    for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
+      Slice cur_end_key = it->user_key;
+      if (icmp.user_comparator()->CompareWithoutTimestamp(cur_start_key,
+                                                          cur_end_key) == 0) {
+        // Empty tombstone.
+        continue;
+      }
+      if (icmp.user_comparator()->CompareWithoutTimestamp(next_start_key,
+                                                          cur_end_key) <= 0) {
+        // All the end keys in [it, cur_end_keys.end()) are after
+        // next_start_key, so the tombstones they represent can be used in
+        // fragments that start with keys greater than or equal to
+        // next_start_key. However, the end keys we already passed will not be
+        // used in any more tombstone fragments.
+        //
+        // Remove the fully fragmented tombstones and stop iteration after a
+        // final round of flushing to preserve the tombstones we can create more
+        // fragments from.
+        reached_next_start_key = true;
+        cur_end_keys.erase(cur_end_keys.begin(), it);
+        cur_end_key = next_start_key;
+      }
+
+      // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
+      // should not overlap with the last-flushed tombstone fragment.
+      assert(tombstones_.empty() ||
+             icmp.user_comparator()->CompareWithoutTimestamp(
+                 tombstones_.back().end_key, cur_start_key) <= 0);
+
+      // Sort the sequence numbers of the tombstones being fragmented in
+      // descending order, and then flush them in that order.
+      autovector<SequenceNumber> seqnums_to_flush;
+      autovector<Slice> timestamps_to_flush;
+      for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+        seqnums_to_flush.push_back(flush_it->sequence);
+        if (ts_sz) {
+          timestamps_to_flush.push_back(
+              ExtractTimestampFromUserKey(flush_it->user_key, ts_sz));
+        }
+      }
+      // TODO: bind the two sorting together to be more efficient
+      std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
+                std::greater<SequenceNumber>());
+      if (ts_sz) {
+        std::sort(timestamps_to_flush.begin(), timestamps_to_flush.end(),
+                  [icmp](const Slice& ts1, const Slice& ts2) {
+                    return icmp.user_comparator()->CompareTimestamp(ts1, ts2) >
+                           0;
+                  });
+      }
+
+      size_t start_idx = tombstone_seqs_.size();
+      size_t end_idx = start_idx + seqnums_to_flush.size();
+
+      // If user-defined timestamp is enabled, we should not drop tombstones
+      // from any snapshot stripe. Garbage collection of range tombstones
+      // happens in CompactionOutputs::AddRangeDels().
+      if (for_compaction && ts_sz == 0) {
+        // Drop all tombstone seqnums that are not preserved by a snapshot.
+        SequenceNumber next_snapshot = kMaxSequenceNumber;
+        for (auto seq : seqnums_to_flush) {
+          if (seq <= next_snapshot) {
+            // This seqnum is visible by a lower snapshot.
+            tombstone_seqs_.push_back(seq);
+            auto upper_bound_it =
+                std::lower_bound(snapshots.begin(), snapshots.end(), seq);
+            if (upper_bound_it == snapshots.begin()) {
+              // This seqnum is the topmost one visible by the earliest
+              // snapshot. None of the seqnums below it will be visible, so we
+              // can skip them.
+              break;
+            }
+            next_snapshot = *std::prev(upper_bound_it);
+          }
+        }
+        end_idx = tombstone_seqs_.size();
+      } else {
+        // The fragmentation is being done for reads, so preserve all seqnums.
+        tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
+                               seqnums_to_flush.end());
+        if (ts_sz) {
+          tombstone_timestamps_.insert(tombstone_timestamps_.end(),
+                                       timestamps_to_flush.begin(),
+                                       timestamps_to_flush.end());
+        }
+      }
+
+      assert(start_idx < end_idx);
+      if (ts_sz) {
+        std::string start_key_with_max_ts;
+        AppendUserKeyWithMaxTimestamp(&start_key_with_max_ts, cur_start_key,
+                                      ts_sz);
+        pinned_slices_.emplace_back(std::move(start_key_with_max_ts));
+        Slice start_key = pinned_slices_.back();
+
+        std::string end_key_with_max_ts;
+        AppendUserKeyWithMaxTimestamp(&end_key_with_max_ts, cur_end_key, ts_sz);
+        pinned_slices_.emplace_back(std::move(end_key_with_max_ts));
+        Slice end_key = pinned_slices_.back();
+
+        // RangeTombstoneStack expects start_key and end_key to have max
+        // timestamp.
+        tombstones_.emplace_back(start_key, end_key, start_idx, end_idx);
+      } else {
+        tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx,
+                                 end_idx);
+      }
+
+      cur_start_key = cur_end_key;
+    }
+    if (!reached_next_start_key) {
+      // There is a gap between the last flushed tombstone fragment and
+      // the next tombstone's start key. Remove all the end keys in
+      // the working set, since we have fully fragmented their corresponding
+      // tombstones.
+      cur_end_keys.clear();
+    }
+    cur_start_key = next_start_key;
+  };
+
+  pinned_iters_mgr_.StartPinning();
+
+  bool no_tombstones = true;
+  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next()) {
+    const Slice& ikey = unfragmented_tombstones->key();
+    Slice tombstone_start_key = ExtractUserKey(ikey);
+    SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
+    if (!unfragmented_tombstones->IsKeyPinned()) {
+      pinned_slices_.emplace_back(tombstone_start_key.data(),
+                                  tombstone_start_key.size());
+      tombstone_start_key = pinned_slices_.back();
+    }
+    no_tombstones = false;
+
+    Slice tombstone_end_key = unfragmented_tombstones->value();
+    if (!unfragmented_tombstones->IsValuePinned()) {
+      pinned_slices_.emplace_back(tombstone_end_key.data(),
+                                  tombstone_end_key.size());
+      tombstone_end_key = pinned_slices_.back();
+    }
+    if (!cur_end_keys.empty() &&
+        icmp.user_comparator()->CompareWithoutTimestamp(
+            cur_start_key, tombstone_start_key) != 0) {
+      // The start key has changed. Flush all tombstones that start before
+      // this new start key.
+      flush_current_tombstones(tombstone_start_key);
+    }
+    cur_start_key = tombstone_start_key;
+
+    cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
+  }
+  if (!cur_end_keys.empty()) {
+    ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
+    flush_current_tombstones(last_end_key.user_key);
+  }
+
+  if (!no_tombstones) {
+    pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
+                                  false /* arena */);
+  }
+}
+
+bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
+                                                 SequenceNumber upper) {
+  std::call_once(seq_set_init_once_flag_, [this]() {
+    for (auto s : tombstone_seqs_) {
+      seq_set_.insert(s);
+    }
+  });
+  auto seq_it = seq_set_.lower_bound(lower);
+  return seq_it != seq_set_.end() && *seq_it <= upper;
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    FragmentedRangeTombstoneList* tombstones, const InternalKeyComparator& icmp,
+    SequenceNumber _upper_bound, const Slice* ts_upper_bound,
+    SequenceNumber _lower_bound)
+    : tombstone_start_cmp_(icmp.user_comparator()),
+      tombstone_end_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_(tombstones),
+      upper_bound_(_upper_bound),
+      lower_bound_(_lower_bound),
+      ts_upper_bound_(ts_upper_bound) {
+  assert(tombstones_ != nullptr);
+  Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
+    const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+    const Slice* ts_upper_bound, SequenceNumber _lower_bound)
+    : tombstone_start_cmp_(icmp.user_comparator()),
+      tombstone_end_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_ref_(tombstones),
+      tombstones_(tombstones_ref_.get()),
+      upper_bound_(_upper_bound),
+      lower_bound_(_lower_bound),
+      ts_upper_bound_(ts_upper_bound) {
+  assert(tombstones_ != nullptr);
+  Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones_cache,
+    const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+    const Slice* ts_upper_bound, SequenceNumber _lower_bound)
+    : tombstone_start_cmp_(icmp.user_comparator()),
+      tombstone_end_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_cache_ref_(tombstones_cache),
+      tombstones_(tombstones_cache_ref_->tombstones.get()),
+      upper_bound_(_upper_bound),
+      lower_bound_(_lower_bound) {
+  assert(tombstones_ != nullptr);
+  if (!ts_upper_bound || ts_upper_bound->empty()) {
+    ts_upper_bound_ = nullptr;
+  } else {
+    ts_upper_bound_ = ts_upper_bound;
+  }
+  Invalidate();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToFirst() {
+  pos_ = tombstones_->begin();
+  seq_pos_ = tombstones_->seq_begin();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  pos_ = tombstones_->begin();
+  SetMaxVisibleSeqAndTimestamp();
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToLast() {
+  pos_ = std::prev(tombstones_->end());
+  seq_pos_ = std::prev(tombstones_->seq_end());
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopLast() {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  pos_ = std::prev(tombstones_->end());
+  SetMaxVisibleSeqAndTimestamp();
+  ScanBackwardToVisibleTombstone();
+}
+
+// @param `target` is a user key, with timestamp if user-defined timestamp is
+// enabled.
+void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  SeekToCoveringTombstone(target);
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  SeekForPrevToCoveringTombstone(target);
+  ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
+    const Slice& target) {
+  pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+                          tombstone_end_cmp_);
+  if (pos_ == tombstones_->end()) {
+    // All tombstones end before target.
+    seq_pos_ = tombstones_->seq_end();
+    return;
+  }
+  SetMaxVisibleSeqAndTimestamp();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
+    const Slice& target) {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+                          tombstone_start_cmp_);
+  if (pos_ == tombstones_->begin()) {
+    // All tombstones start after target.
+    Invalidate();
+    return;
+  }
+  --pos_;
+  SetMaxVisibleSeqAndTimestamp();
+}
+
+void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
+  while (pos_ != tombstones_->end() &&
+         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+          *seq_pos_ < lower_bound_)) {
+    ++pos_;
+    if (pos_ == tombstones_->end()) {
+      Invalidate();
+      return;
+    }
+    SetMaxVisibleSeqAndTimestamp();
+  }
+}
+
+void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
+  while (pos_ != tombstones_->end() &&
+         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+          *seq_pos_ < lower_bound_)) {
+    if (pos_ == tombstones_->begin()) {
+      Invalidate();
+      return;
+    }
+    --pos_;
+    SetMaxVisibleSeqAndTimestamp();
+  }
+}
+
+void FragmentedRangeTombstoneIterator::Next() {
+  ++seq_pos_;
+  if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
+    ++pos_;
+  }
+}
+
+void FragmentedRangeTombstoneIterator::TopNext() {
+  ++pos_;
+  if (pos_ == tombstones_->end()) {
+    return;
+  }
+  SetMaxVisibleSeqAndTimestamp();
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Prev() {
+  if (seq_pos_ == tombstones_->seq_begin()) {
+    Invalidate();
+    return;
+  }
+  --seq_pos_;
+  if (pos_ == tombstones_->end() ||
+      seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
+    --pos_;
+  }
+}
+
+void FragmentedRangeTombstoneIterator::TopPrev() {
+  if (pos_ == tombstones_->begin()) {
+    Invalidate();
+    return;
+  }
+  --pos_;
+  SetMaxVisibleSeqAndTimestamp();
+  ScanBackwardToVisibleTombstone();
+}
+
+bool FragmentedRangeTombstoneIterator::Valid() const {
+  return tombstones_ != nullptr && pos_ != tombstones_->end();
+}
+
+SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
+    const Slice& target_user_key) {
+  SeekToCoveringTombstone(target_user_key);
+  return ValidPos() && ucmp_->CompareWithoutTimestamp(start_key(),
+                                                      target_user_key) <= 0
+             ? seq()
+             : 0;
+}
+
+std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+FragmentedRangeTombstoneIterator::SplitBySnapshot(
+    const std::vector<SequenceNumber>& snapshots) {
+  std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+      splits;
+  SequenceNumber lower = 0;
+  SequenceNumber upper;
+  for (size_t i = 0; i <= snapshots.size(); i++) {
+    if (i >= snapshots.size()) {
+      upper = kMaxSequenceNumber;
+    } else {
+      upper = snapshots[i];
+    }
+    if (tombstones_->ContainsRange(lower, upper)) {
+      splits.emplace(upper,
+                     std::make_unique<FragmentedRangeTombstoneIterator>(
+                         tombstones_, *icmp_, upper, ts_upper_bound_, lower));
+    }
+    lower = upper + 1;
+  }
+  return splits;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.h b/src/rocksdb/db/range_tombstone_fragmenter.h
new file mode 100644
index 000000000..df07fa894
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.h
@@ -0,0 +1,357 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct FragmentedRangeTombstoneList;
+
+struct FragmentedRangeTombstoneListCache {
+  // ensure only the first reader needs to initialize l
+  std::mutex reader_mutex;
+  std::unique_ptr<FragmentedRangeTombstoneList> tombstones = nullptr;
+  // readers will first check this bool to avoid
+  std::atomic<bool> initialized = false;
+};
+
+struct FragmentedRangeTombstoneList {
+ public:
+  // A compact representation of a "stack" of range tombstone fragments, which
+  // start and end at the same user keys but have different sequence numbers.
+  // The members seq_start_idx and seq_end_idx are intended to be parameters to
+  // seq_iter().
+  // If user-defined timestamp is enabled, `start` and `end` should be user keys
+  // with timestamp, and the timestamps are set to max timestamp to be returned
+  // by parsed_start_key()/parsed_end_key(). seq_start_idx and seq_end_idx will
+  // also be used as parameters to ts_iter().
+  struct RangeTombstoneStack {
+    RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
+                        size_t end_idx)
+        : start_key(start),
+          end_key(end),
+          seq_start_idx(start_idx),
+          seq_end_idx(end_idx) {}
+    Slice start_key;
+    Slice end_key;
+    size_t seq_start_idx;
+    size_t seq_end_idx;
+  };
+  // Assumes unfragmented_tombstones->key() and unfragmented_tombstones->value()
+  // both contain timestamp if enabled.
+  FragmentedRangeTombstoneList(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction = false,
+      const std::vector<SequenceNumber>& snapshots = {});
+
+  std::vector<RangeTombstoneStack>::const_iterator begin() const {
+    return tombstones_.begin();
+  }
+
+  std::vector<RangeTombstoneStack>::const_iterator end() const {
+    return tombstones_.end();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
+    return std::next(tombstone_seqs_.begin(), idx);
+  }
+
+  std::vector<Slice>::const_iterator ts_iter(size_t idx) const {
+    return std::next(tombstone_timestamps_.begin(), idx);
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_begin() const {
+    return tombstone_seqs_.begin();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_end() const {
+    return tombstone_seqs_.end();
+  }
+
+  bool empty() const { return tombstones_.empty(); }
+
+  // Returns true if the stored tombstones contain with one with a sequence
+  // number in [lower, upper].
+  // This method is not const as it internally lazy initialize a set of
+  // sequence numbers (`seq_set_`).
+  bool ContainsRange(SequenceNumber lower, SequenceNumber upper);
+
+  uint64_t num_unfragmented_tombstones() const {
+    return num_unfragmented_tombstones_;
+  }
+
+  uint64_t total_tombstone_payload_bytes() const {
+    return total_tombstone_payload_bytes_;
+  }
+
+ private:
+  // Given an ordered range tombstone iterator unfragmented_tombstones,
+  // "fragment" the tombstones into non-overlapping pieces. Each
+  // "non-overlapping piece" is a RangeTombstoneStack in tombstones_, which
+  // contains start_key, end_key, and indices that points to sequence numbers
+  // (in tombstone_seqs_) and timestamps (in tombstone_timestamps_). If
+  // for_compaction is true, then `snapshots` should be provided. Range
+  // tombstone fragments are dropped if they are not visible in any snapshot and
+  // user-defined timestamp is not enabled. That is, for each snapshot stripe
+  // [lower, upper], the range tombstone fragment with largest seqno in [lower,
+  // upper] is preserved, and all the other range tombstones are dropped.
+  void FragmentTombstones(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction,
+      const std::vector<SequenceNumber>& snapshots);
+
+  std::vector<RangeTombstoneStack> tombstones_;
+  std::vector<SequenceNumber> tombstone_seqs_;
+  std::vector<Slice> tombstone_timestamps_;
+  std::once_flag seq_set_init_once_flag_;
+  std::set<SequenceNumber> seq_set_;
+  std::list<std::string> pinned_slices_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+  uint64_t num_unfragmented_tombstones_;
+  uint64_t total_tombstone_payload_bytes_;
+};
+
+// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
+// meta block into an iterator over non-overlapping tombstone fragments. The
+// tombstone fragmentation process should be more efficient than the range
+// tombstone collapsing algorithm in RangeDelAggregator because this leverages
+// the internal key ordering already provided by the input iterator, if
+// applicable (when the iterator is unsorted, a new sorted iterator is created
+// before proceeding). If there are few overlaps, creating a
+// FragmentedRangeTombstoneIterator should be O(n), while the RangeDelAggregator
+// tombstone collapsing is always O(n log n).
+class FragmentedRangeTombstoneIterator : public InternalIterator {
+ public:
+  FragmentedRangeTombstoneIterator(FragmentedRangeTombstoneList* tombstones,
+                                   const InternalKeyComparator& icmp,
+                                   SequenceNumber upper_bound,
+                                   const Slice* ts_upper_bound = nullptr,
+                                   SequenceNumber lower_bound = 0);
+  FragmentedRangeTombstoneIterator(
+      const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
+      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+      const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
+  FragmentedRangeTombstoneIterator(
+      const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones,
+      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+      const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
+
+  void SeekToFirst() override;
+  void SeekToLast() override;
+
+  void SeekToTopFirst();
+  void SeekToTopLast();
+
+  // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
+  // seeking should behave. This is OK because they are not currently used, but
+  // eventually FragmentedRangeTombstoneIterator should no longer implement
+  // InternalIterator.
+  //
+  // Seeks to the range tombstone that covers target at a seqnum in the
+  // snapshot. If no such tombstone exists, seek to the earliest tombstone in
+  // the snapshot that ends after target.
+  void Seek(const Slice& target) override;
+  // Seeks to the range tombstone that covers target at a seqnum in the
+  // snapshot. If no such tombstone exists, seek to the latest tombstone in the
+  // snapshot that starts before target.
+  void SeekForPrev(const Slice& target) override;
+
+  void Next() override;
+  void Prev() override;
+
+  void TopNext();
+  void TopPrev();
+
+  bool Valid() const override;
+  // Note that key() and value() do not return correct timestamp.
+  // Caller should call timestamp() to get the current timestamp.
+  Slice key() const override {
+    MaybePinKey();
+    return current_start_key_.Encode();
+  }
+  Slice value() const override { return pos_->end_key; }
+  bool IsKeyPinned() const override { return false; }
+  bool IsValuePinned() const override { return true; }
+  Status status() const override { return Status::OK(); }
+
+  bool empty() const { return tombstones_->empty(); }
+  void Invalidate() {
+    pos_ = tombstones_->end();
+    seq_pos_ = tombstones_->seq_end();
+    pinned_pos_ = tombstones_->end();
+    pinned_seq_pos_ = tombstones_->seq_end();
+  }
+
+  RangeTombstone Tombstone() const {
+    assert(Valid());
+    if (icmp_->user_comparator()->timestamp_size()) {
+      return RangeTombstone(start_key(), end_key(), seq(), timestamp());
+    }
+    return RangeTombstone(start_key(), end_key(), seq());
+  }
+  // Note that start_key() and end_key() are not guaranteed to have the
+  // correct timestamp. User can call timestamp() to get the correct
+  // timestamp().
+  Slice start_key() const { return pos_->start_key; }
+  Slice end_key() const { return pos_->end_key; }
+  SequenceNumber seq() const { return *seq_pos_; }
+  Slice timestamp() const {
+    // seqno and timestamp are stored in the same order.
+    return *tombstones_->ts_iter(seq_pos_ - tombstones_->seq_begin());
+  }
+  // Current use case is by CompactionRangeDelAggregator to set
+  // full_history_ts_low_.
+  void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+    ts_upper_bound_ = ts_upper_bound;
+  }
+
+  ParsedInternalKey parsed_start_key() const {
+    return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
+                             kTypeRangeDeletion);
+  }
+  ParsedInternalKey parsed_end_key() const {
+    return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
+                             kTypeRangeDeletion);
+  }
+
+  // Return the max sequence number of a range tombstone that covers
+  // the given user key.
+  // If there is no covering tombstone, then 0 is returned.
+  SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
+
+  // Splits the iterator into n+1 iterators (where n is the number of
+  // snapshots), each providing a view over a "stripe" of sequence numbers. The
+  // iterators are keyed by the upper bound of their ranges (the provided
+  // snapshots + kMaxSequenceNumber).
+  //
+  // NOTE: the iterators in the returned map are no longer valid if their
+  // parent iterator is deleted, since they do not modify the refcount of the
+  // underlying tombstone list. Therefore, this map should be deleted before
+  // the parent iterator.
+  std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+  SequenceNumber upper_bound() const { return upper_bound_; }
+  SequenceNumber lower_bound() const { return lower_bound_; }
+
+  uint64_t num_unfragmented_tombstones() const {
+    return tombstones_->num_unfragmented_tombstones();
+  }
+  uint64_t total_tombstone_payload_bytes() const {
+    return tombstones_->total_tombstone_payload_bytes();
+  }
+
+ private:
+  using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
+
+  struct RangeTombstoneStackStartComparator {
+    explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}
+
+    bool operator()(const RangeTombstoneStack& a,
+                    const RangeTombstoneStack& b) const {
+      return cmp->CompareWithoutTimestamp(a.start_key, b.start_key) < 0;
+    }
+
+    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+      return cmp->CompareWithoutTimestamp(a.start_key, b) < 0;
+    }
+
+    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+      return cmp->CompareWithoutTimestamp(a, b.start_key) < 0;
+    }
+
+    const Comparator* cmp;
+  };
+
+  struct RangeTombstoneStackEndComparator {
+    explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}
+
+    bool operator()(const RangeTombstoneStack& a,
+                    const RangeTombstoneStack& b) const {
+      return cmp->CompareWithoutTimestamp(a.end_key, b.end_key) < 0;
+    }
+
+    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+      return cmp->CompareWithoutTimestamp(a.end_key, b) < 0;
+    }
+
+    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+      return cmp->CompareWithoutTimestamp(a, b.end_key) < 0;
+    }
+
+    const Comparator* cmp;
+  };
+
+  void MaybePinKey() const {
+    if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
+        (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
+      current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
+      pinned_pos_ = pos_;
+      pinned_seq_pos_ = seq_pos_;
+    }
+  }
+
+  void SeekToCoveringTombstone(const Slice& key);
+  void SeekForPrevToCoveringTombstone(const Slice& key);
+  void ScanForwardToVisibleTombstone();
+  void ScanBackwardToVisibleTombstone();
+  bool ValidPos() const {
+    return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
+  }
+
+  const RangeTombstoneStackStartComparator tombstone_start_cmp_;
+  const RangeTombstoneStackEndComparator tombstone_end_cmp_;
+  const InternalKeyComparator* icmp_;
+  const Comparator* ucmp_;
+  std::shared_ptr<FragmentedRangeTombstoneList> tombstones_ref_;
+  std::shared_ptr<FragmentedRangeTombstoneListCache> tombstones_cache_ref_;
+  FragmentedRangeTombstoneList* tombstones_;
+  SequenceNumber upper_bound_;
+  SequenceNumber lower_bound_;
+  // Only consider timestamps <= ts_upper_bound_.
+  const Slice* ts_upper_bound_;
+  std::vector<RangeTombstoneStack>::const_iterator pos_;
+  std::vector<SequenceNumber>::const_iterator seq_pos_;
+  mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
+  mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
+  mutable InternalKey current_start_key_;
+
+  // Check the current RangeTombstoneStack `pos_` against timestamp
+  // upper bound `ts_upper_bound_` and sequence number upper bound
+  // `upper_bound_`. Update the sequence number (and timestamp) pointer
+  // `seq_pos_` to the first valid position satisfying both bounds.
+  void SetMaxVisibleSeqAndTimestamp() {
+    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                                tombstones_->seq_iter(pos_->seq_end_idx),
+                                upper_bound_, std::greater<SequenceNumber>());
+    if (ts_upper_bound_ && !ts_upper_bound_->empty()) {
+      auto ts_pos = std::lower_bound(
+          tombstones_->ts_iter(pos_->seq_start_idx),
+          tombstones_->ts_iter(pos_->seq_end_idx), *ts_upper_bound_,
+          [this](const Slice& s1, const Slice& s2) {
+            return ucmp_->CompareTimestamp(s1, s2) > 0;
+          });
+      auto ts_idx = ts_pos - tombstones_->ts_iter(pos_->seq_start_idx);
+      auto seq_idx = seq_pos_ - tombstones_->seq_iter(pos_->seq_start_idx);
+      if (seq_idx < ts_idx) {
+        // seq and ts are ordered in non-increasing order. Only updates seq_pos_
+        // to a larger index for smaller sequence number and timestamp.
+        seq_pos_ = tombstones_->seq_iter(pos_->seq_start_idx + ts_idx);
+      }
+    }
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter_test.cc b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
new file mode 100644
index 000000000..46b3c99b5
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
@@ -0,0 +1,555 @@
+//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTombstoneFragmenterTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+    const std::vector<RangeTombstone>& range_dels) {
+  std::vector<std::string> keys, values;
+  for (const auto& range_del : range_dels) {
+    auto key_and_value = range_del.Serialize();
+    keys.push_back(key_and_value.first.Encode().ToString());
+    values.push_back(key_and_value.second.ToString());
+  }
+  return std::unique_ptr<VectorIterator>(
+      new VectorIterator(keys, values, &bytewise_icmp));
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+                       const FragmentedRangeTombstoneIterator* iter) {
+  // Test InternalIterator interface.
+  EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+  EXPECT_EQ(tombstone.end_key_, iter->value());
+  EXPECT_EQ(tombstone.seq_, iter->seq());
+
+  // Test FragmentedRangeTombstoneIterator interface.
+  EXPECT_EQ(tombstone.start_key_, iter->start_key());
+  EXPECT_EQ(tombstone.end_key_, iter->end_key());
+  EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+    FragmentedRangeTombstoneIterator* iter,
+    const std::vector<RangeTombstone>& expected_tombstones) {
+  iter->SeekToFirst();
+  for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+    ASSERT_TRUE(iter->Valid());
+    CheckIterPosition(expected_tombstones[i], iter);
+  }
+  EXPECT_FALSE(iter->Valid());
+}
+
+void VerifyVisibleTombstones(
+    FragmentedRangeTombstoneIterator* iter,
+    const std::vector<RangeTombstone>& expected_tombstones) {
+  iter->SeekToTopFirst();
+  for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) {
+    ASSERT_TRUE(iter->Valid());
+    CheckIterPosition(expected_tombstones[i], iter);
+  }
+  EXPECT_FALSE(iter->Valid());
+}
+
+struct SeekTestCase {
+  Slice seek_target;
+  RangeTombstone expected_position;
+  bool out_of_range;
+};
+
+void VerifySeek(FragmentedRangeTombstoneIterator* iter,
+                const std::vector<SeekTestCase>& cases) {
+  for (const auto& testcase : cases) {
+    iter->Seek(testcase.seek_target);
+    if (testcase.out_of_range) {
+      ASSERT_FALSE(iter->Valid());
+    } else {
+      ASSERT_TRUE(iter->Valid());
+      CheckIterPosition(testcase.expected_position, iter);
+    }
+  }
+}
+
+void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter,
+                       const std::vector<SeekTestCase>& cases) {
+  for (const auto& testcase : cases) {
+    iter->SeekForPrev(testcase.seek_target);
+    if (testcase.out_of_range) {
+      ASSERT_FALSE(iter->Valid());
+    } else {
+      ASSERT_TRUE(iter->Valid());
+      CheckIterPosition(testcase.expected_position, iter);
+    }
+  }
+}
+
+struct MaxCoveringTombstoneSeqnumTestCase {
+  Slice user_key;
+  SequenceNumber result;
+};
+
+void VerifyMaxCoveringTombstoneSeqnum(
+    FragmentedRangeTombstoneIterator* iter,
+    const std::vector<MaxCoveringTombstoneSeqnumTestCase>& cases) {
+  for (const auto& testcase : cases) {
+    EXPECT_EQ(testcase.result,
+              iter->MaxCoveringTombstoneSeqnum(testcase.user_key));
+  }
+}
+
+}  // anonymous namespace
+
+TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber);
+  ASSERT_EQ(0, iter.lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+  VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}});
+  VerifyMaxCoveringTombstoneSeqnum(&iter,
+                                   {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber);
+  ASSERT_EQ(0, iter.lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+  VerifyFragmentedRangeDels(
+      &iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}});
+  VerifyMaxCoveringTombstoneSeqnum(&iter,
+                                   {{"a", 10}, {"c", 15}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) {
+  auto range_del_iter = MakeRangeDelIter(
+      {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber);
+  ASSERT_EQ(0, iter.lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+  VerifyFragmentedRangeDels(
+      &iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+  VerifyMaxCoveringTombstoneSeqnum(&iter,
+                                   {{"a", 10}, {"c", 20}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber);
+  ASSERT_EQ(0, iter.lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+  VerifyFragmentedRangeDels(&iter,
+                            {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+  VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) {
+  auto range_del_iter =
+      MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber);
+  ASSERT_EQ(0, iter.lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+  VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+                                    {"a", "c", 7},
+                                    {"a", "c", 3},
+                                    {"c", "e", 10},
+                                    {"c", "e", 7},
+                                    {"e", "g", 7}});
+  VerifyMaxCoveringTombstoneSeqnum(&iter,
+                                   {{"a", 10}, {"c", 10}, {"e", 7}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "c", 30},
+                                          {"a", "g", 20},
+                                          {"a", "e", 10},
+                                          {"a", "g", 7},
+                                          {"a", "c", 3}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber);
+  ASSERT_EQ(0, iter.lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+  VerifyFragmentedRangeDels(&iter, {{"a", "c", 30},
+                                    {"a", "c", 20},
+                                    {"a", "c", 10},
+                                    {"a", "c", 7},
+                                    {"a", "c", 3},
+                                    {"c", "e", 20},
+                                    {"c", "e", 10},
+                                    {"c", "e", 7},
+                                    {"e", "g", 20},
+                                    {"e", "g", 7}});
+  VerifyMaxCoveringTombstoneSeqnum(&iter,
+                                   {{"a", 30}, {"c", 20}, {"e", 20}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"c", "g", 8},
+                                          {"c", "i", 6},
+                                          {"j", "n", 4},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+                                         kMaxSequenceNumber);
+  FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+                                         9 /* upper_bound */);
+  FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp,
+                                         7 /* upper_bound */);
+  FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp,
+                                         5 /* upper_bound */);
+  FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp,
+                                         3 /* upper_bound */);
+  for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) {
+    VerifyFragmentedRangeDels(iter, {{"a", "c", 10},
+                                     {"c", "e", 10},
+                                     {"c", "e", 8},
+                                     {"c", "e", 6},
+                                     {"e", "g", 8},
+                                     {"e", "g", 6},
+                                     {"g", "i", 6},
+                                     {"j", "l", 4},
+                                     {"j", "l", 2},
+                                     {"l", "n", 4}});
+  }
+
+  ASSERT_EQ(0, iter1.lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound());
+  VerifyVisibleTombstones(&iter1, {{"a", "c", 10},
+                                   {"c", "e", 10},
+                                   {"e", "g", 8},
+                                   {"g", "i", 6},
+                                   {"j", "l", 4},
+                                   {"l", "n", 4}});
+  VerifyMaxCoveringTombstoneSeqnum(
+      &iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+  ASSERT_EQ(0, iter2.lower_bound());
+  ASSERT_EQ(9, iter2.upper_bound());
+  VerifyVisibleTombstones(&iter2, {{"c", "e", 8},
+                                   {"e", "g", 8},
+                                   {"g", "i", 6},
+                                   {"j", "l", 4},
+                                   {"l", "n", 4}});
+  VerifyMaxCoveringTombstoneSeqnum(
+      &iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+  ASSERT_EQ(0, iter3.lower_bound());
+  ASSERT_EQ(7, iter3.upper_bound());
+  VerifyVisibleTombstones(&iter3, {{"c", "e", 6},
+                                   {"e", "g", 6},
+                                   {"g", "i", 6},
+                                   {"j", "l", 4},
+                                   {"l", "n", 4}});
+  VerifyMaxCoveringTombstoneSeqnum(
+      &iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+  ASSERT_EQ(0, iter4.lower_bound());
+  ASSERT_EQ(5, iter4.upper_bound());
+  VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}});
+  VerifyMaxCoveringTombstoneSeqnum(
+      &iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+  ASSERT_EQ(0, iter5.lower_bound());
+  ASSERT_EQ(3, iter5.upper_bound());
+  VerifyVisibleTombstones(&iter5, {{"j", "l", 2}});
+  VerifyMaxCoveringTombstoneSeqnum(
+      &iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"j", "n", 4},
+                                          {"c", "i", 6},
+                                          {"c", "g", 8},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        9 /* upper_bound */);
+  ASSERT_EQ(0, iter.lower_bound());
+  ASSERT_EQ(9, iter.upper_bound());
+  VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+                                    {"c", "e", 10},
+                                    {"c", "e", 8},
+                                    {"c", "e", 6},
+                                    {"e", "g", 8},
+                                    {"e", "g", 6},
+                                    {"g", "i", 6},
+                                    {"j", "l", 4},
+                                    {"j", "l", 2},
+                                    {"l", "n", 4}});
+  VerifyMaxCoveringTombstoneSeqnum(
+      &iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"j", "n", 4},
+                                          {"c", "i", 6},
+                                          {"c", "g", 8},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+      {} /* snapshots */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber /* upper_bound */);
+  VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+                                    {"c", "e", 10},
+                                    {"e", "g", 8},
+                                    {"g", "i", 6},
+                                    {"j", "l", 4},
+                                    {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest,
+       OverlapAndRepeatedStartKeyForCompactionWithSnapshot) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"j", "n", 4},
+                                          {"c", "i", 6},
+                                          {"c", "g", 8},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(
+      std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+      {20, 9} /* upper_bounds */);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber /* upper_bound */);
+  VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+                                    {"c", "e", 10},
+                                    {"c", "e", 8},
+                                    {"e", "g", 8},
+                                    {"g", "i", 6},
+                                    {"j", "l", 4},
+                                    {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"j", "n", 4},
+                                          {"c", "i", 6},
+                                          {"c", "g", 8},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber /* upper_bound */);
+
+  auto split_iters = iter.SplitBySnapshot({} /* snapshots */);
+  ASSERT_EQ(1, split_iters.size());
+
+  auto* split_iter = split_iters[kMaxSequenceNumber].get();
+  ASSERT_EQ(0, split_iter->lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound());
+  VerifyVisibleTombstones(split_iter, {{"a", "c", 10},
+                                       {"c", "e", 10},
+                                       {"e", "g", 8},
+                                       {"g", "i", 6},
+                                       {"j", "l", 4},
+                                       {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) {
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"j", "n", 4},
+                                          {"c", "i", 6},
+                                          {"c", "g", 8},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber /* upper_bound */);
+
+  auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */);
+  ASSERT_EQ(5, split_iters.size());
+
+  auto* split_iter1 = split_iters[3].get();
+  ASSERT_EQ(0, split_iter1->lower_bound());
+  ASSERT_EQ(3, split_iter1->upper_bound());
+  VerifyVisibleTombstones(split_iter1, {{"j", "l", 2}});
+
+  auto* split_iter2 = split_iters[5].get();
+  ASSERT_EQ(4, split_iter2->lower_bound());
+  ASSERT_EQ(5, split_iter2->upper_bound());
+  VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}});
+
+  auto* split_iter3 = split_iters[7].get();
+  ASSERT_EQ(6, split_iter3->lower_bound());
+  ASSERT_EQ(7, split_iter3->upper_bound());
+  VerifyVisibleTombstones(split_iter3,
+                          {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}});
+
+  auto* split_iter4 = split_iters[9].get();
+  ASSERT_EQ(8, split_iter4->lower_bound());
+  ASSERT_EQ(9, split_iter4->upper_bound());
+  VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}});
+
+  auto* split_iter5 = split_iters[kMaxSequenceNumber].get();
+  ASSERT_EQ(10, split_iter5->lower_bound());
+  ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound());
+  VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) {
+  // Same tombstones as OverlapAndRepeatedStartKey.
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"c", "g", 8},
+                                          {"c", "i", 6},
+                                          {"j", "n", 4},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+
+  FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+                                         kMaxSequenceNumber);
+  VerifySeek(
+      &iter1,
+      {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+  VerifySeekForPrev(
+      &iter1,
+      {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+
+  FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+                                         3 /* upper_bound */);
+  VerifySeek(&iter2, {{"a", {"j", "l", 2}},
+                      {"e", {"j", "l", 2}},
+                      {"l", {}, true /* out of range */}});
+  VerifySeekForPrev(&iter2, {{"a", {}, true /* out of range */},
+                             {"e", {}, true /* out of range */},
+                             {"l", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekCovered) {
+  // Same tombstones as OverlapAndRepeatedStartKey.
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"c", "g", 8},
+                                          {"c", "i", 6},
+                                          {"j", "n", 4},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+
+  FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+                                         kMaxSequenceNumber);
+  VerifySeek(
+      &iter1,
+      {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+  VerifySeekForPrev(
+      &iter1,
+      {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+
+  FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+                                         3 /* upper_bound */);
+  VerifySeek(&iter2, {{"b", {"j", "l", 2}},
+                      {"f", {"j", "l", 2}},
+                      {"m", {}, true /* out of range */}});
+  VerifySeekForPrev(&iter2, {{"b", {}, true /* out of range */},
+                             {"f", {}, true /* out of range */},
+                             {"m", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) {
+  // Same tombstones as OverlapAndRepeatedStartKey.
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"c", "g", 8},
+                                          {"c", "i", 6},
+                                          {"j", "n", 4},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+
+  FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+                                         kMaxSequenceNumber);
+  VerifySeek(&iter1, {{"c", {"c", "e", 10}},
+                      {"g", {"g", "i", 6}},
+                      {"i", {"j", "l", 4}},
+                      {"n", {}, true /* out of range */}});
+  VerifySeekForPrev(&iter1, {{"c", {"c", "e", 10}},
+                             {"g", {"g", "i", 6}},
+                             {"i", {"g", "i", 6}},
+                             {"n", {"l", "n", 4}}});
+
+  FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+                                         3 /* upper_bound */);
+  VerifySeek(&iter2, {{"c", {"j", "l", 2}},
+                      {"g", {"j", "l", 2}},
+                      {"i", {"j", "l", 2}},
+                      {"n", {}, true /* out of range */}});
+  VerifySeekForPrev(&iter2, {{"c", {}, true /* out of range */},
+                             {"g", {}, true /* out of range */},
+                             {"i", {}, true /* out of range */},
+                             {"n", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) {
+  // Same tombstones as OverlapAndRepeatedStartKey.
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"c", "g", 8},
+                                          {"c", "i", 6},
+                                          {"j", "n", 4},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber);
+  VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}});
+  VerifySeekForPrev(&iter,
+                    {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/read_callback.h b/src/rocksdb/db/read_callback.h
new file mode 100644
index 000000000..c042352db
--- /dev/null
+++ b/src/rocksdb/db/read_callback.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ReadCallback {
+ public:
+  explicit ReadCallback(SequenceNumber last_visible_seq)
+      : max_visible_seq_(last_visible_seq) {}
+  ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted)
+      : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {}
+
+  virtual ~ReadCallback() {}
+
+  // Will be called to see if the seq number visible; if not it moves on to
+  // the next seq number.
+  virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0;
+
+  inline bool IsVisible(SequenceNumber seq) {
+    assert(min_uncommitted_ > 0);
+    assert(min_uncommitted_ >= kMinUnCommittedSeq);
+    if (seq < min_uncommitted_) {  // handles seq == 0 as well
+      assert(seq <= max_visible_seq_);
+      return true;
+    } else if (max_visible_seq_ < seq) {
+      assert(seq != 0);
+      return false;
+    } else {
+      assert(seq != 0);  // already handled in the first if-then clause
+      return IsVisibleFullCheck(seq);
+    }
+  }
+
+  inline SequenceNumber max_visible_seq() { return max_visible_seq_; }
+
+  // Refresh to a more recent visible seq
+  virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }
+
+ protected:
+  // The max visible seq, it is usually the snapshot but could be larger if
+  // transaction has its own writes written to db.
+  SequenceNumber max_visible_seq_ = kMaxSequenceNumber;
+  // Any seq less than min_uncommitted_ is committed.
+  const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc
new file mode 100644
index 000000000..1829a79f2
--- /dev/null
+++ b/src/rocksdb/db/repair.cc
@@ -0,0 +1,771 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Repairer does best effort recovery to recover as much data as possible after
+// a disaster without compromising consistency. It does not guarantee bringing
+// the database to a time consistent state.
+//
+// Repair process is broken into 4 phases:
+// (a) Find files
+// (b) Convert logs to tables
+// (c) Extract metadata
+// (d) Write Descriptor
+//
+// (a) Find files
+//
+// The repairer goes through all the files in the directory, and classifies them
+// based on their file name. Any file that cannot be identified by name will be
+// ignored.
+//
+// (b) Convert logs to table
+//
+// Every log file that is active is replayed. All sections of the file where the
+// checksum does not match is skipped over. We intentionally give preference to
+// data consistency.
+//
+// (c) Extract metadata
+//
+// We scan every table to compute
+// (1) smallest/largest for the table
+// (2) largest sequence number in the table
+// (3) oldest blob file referred to by the table (if applicable)
+//
+// If we are unable to scan the file, then we ignore the table.
+//
+// (d) Write Descriptor
+//
+// We generate descriptor contents:
+//  - log number is set to zero
+//  - next-file-number is set to 1 + largest file number we found
+//  - last-sequence-number is set to largest sequence# found across
+//    all tables (see 2c)
+//  - compaction pointers are cleared
+//  - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+//   (d) We can provide options for time consistent recovery and unsafe recovery
+//       (ignore checksum failure when applicable)
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#, ...)
+//   in the table's meta section to speed up ScanTable.
+
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/unique_id_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const DBOptions& db_options,
+           const std::vector<ColumnFamilyDescriptor>& column_families,
+           const ColumnFamilyOptions& default_cf_opts,
+           const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs)
+      : dbname_(dbname),
+        db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)),
+        env_(db_options.env),
+        file_options_(),
+        db_options_(SanitizeOptions(dbname_, db_options)),
+        immutable_db_options_(ImmutableDBOptions(db_options_)),
+        icmp_(default_cf_opts.comparator),
+        default_cf_opts_(
+            SanitizeOptions(immutable_db_options_, default_cf_opts)),
+        default_iopts_(
+            ImmutableOptions(immutable_db_options_, default_cf_opts_)),
+        unknown_cf_opts_(
+            SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
+        create_unknown_cfs_(create_unknown_cfs),
+        raw_table_cache_(
+            // TableCache can be small since we expect each table to be opened
+            // once.
+            NewLRUCache(10, db_options_.table_cache_numshardbits)),
+        table_cache_(new TableCache(default_iopts_, &file_options_,
+                                    raw_table_cache_.get(),
+                                    /*block_cache_tracer=*/nullptr,
+                                    /*io_tracer=*/nullptr, db_session_id_)),
+        wb_(db_options_.db_write_buffer_size),
+        wc_(db_options_.delayed_write_rate),
+        vset_(dbname_, &immutable_db_options_, file_options_,
+              raw_table_cache_.get(), &wb_, &wc_,
+              /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+              /*db_id=*/"", db_session_id_),
+        next_file_number_(1),
+        db_lock_(nullptr),
+        closed_(false) {
+    for (const auto& cfd : column_families) {
+      cf_name_to_opts_[cfd.name] = cfd.options;
+    }
+  }
+
+  const ColumnFamilyOptions* GetColumnFamilyOptions(
+      const std::string& cf_name) {
+    if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) {
+      if (create_unknown_cfs_) {
+        return &unknown_cf_opts_;
+      }
+      return nullptr;
+    }
+    return &cf_name_to_opts_[cf_name];
+  }
+
+  // Adds a column family to the VersionSet with cf_options_ and updates
+  // manifest.
+  Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) {
+    const auto* cf_opts = GetColumnFamilyOptions(cf_name);
+    if (cf_opts == nullptr) {
+      return Status::Corruption("Encountered unknown column family with name=" +
+                                cf_name + ", id=" + std::to_string(cf_id));
+    }
+    Options opts(db_options_, *cf_opts);
+    MutableCFOptions mut_cf_opts(opts);
+
+    VersionEdit edit;
+    edit.SetComparatorName(opts.comparator->Name());
+    edit.SetLogNumber(0);
+    edit.SetColumnFamily(cf_id);
+    ColumnFamilyData* cfd;
+    cfd = nullptr;
+    edit.AddColumnFamily(cf_name);
+
+    mutex_.Lock();
+    std::unique_ptr<FSDirectory> db_dir;
+    Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
+                                                        &db_dir, nullptr);
+    if (status.ok()) {
+      status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_, db_dir.get(),
+                                 false /* new_descriptor_log */, cf_opts);
+    }
+    mutex_.Unlock();
+    return status;
+  }
+
+  Status Close() {
+    Status s = Status::OK();
+    if (!closed_) {
+      if (db_lock_ != nullptr) {
+        s = env_->UnlockFile(db_lock_);
+        db_lock_ = nullptr;
+      }
+      closed_ = true;
+    }
+    return s;
+  }
+
+  ~Repairer() { Close().PermitUncheckedError(); }
+
+  Status Run() {
+    Status status = env_->LockFile(LockFileName(dbname_), &db_lock_);
+    if (!status.ok()) {
+      return status;
+    }
+    status = FindFiles();
+    DBImpl* db_impl = nullptr;
+    if (status.ok()) {
+      // Discard older manifests and start a fresh one
+      for (size_t i = 0; i < manifests_.size(); i++) {
+        ArchiveFile(dbname_ + "/" + manifests_[i]);
+      }
+      // Just create a DBImpl temporarily so we can reuse NewDB()
+      db_impl = new DBImpl(db_options_, dbname_);
+      status = db_impl->NewDB(/*new_filenames=*/nullptr);
+    }
+    delete db_impl;
+
+    if (status.ok()) {
+      // Recover using the fresh manifest created by NewDB()
+      status =
+          vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false);
+    }
+    if (status.ok()) {
+      // Need to scan existing SST files first so the column families are
+      // created before we process WAL files
+      ExtractMetaData();
+
+      // ExtractMetaData() uses table_fds_ to know which SST files' metadata to
+      // extract -- we need to clear it here since metadata for existing SST
+      // files has been extracted already
+      table_fds_.clear();
+      ConvertLogFilesToTables();
+      ExtractMetaData();
+      status = AddTables();
+    }
+    if (status.ok()) {
+      uint64_t bytes = 0;
+      for (size_t i = 0; i < tables_.size(); i++) {
+        bytes += tables_[i].meta.fd.GetFileSize();
+      }
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "**** Repaired rocksdb %s; "
+                     "recovered %" ROCKSDB_PRIszt " files; %" PRIu64
+                     " bytes. "
+                     "Some data may have been lost. "
+                     "****",
+                     dbname_.c_str(), tables_.size(), bytes);
+    }
+    return status;
+  }
+
+ private:
+  struct TableInfo {
+    FileMetaData meta;
+    uint32_t column_family_id;
+    std::string column_family_name;
+  };
+
+  std::string const dbname_;
+  std::string db_session_id_;
+  Env* const env_;
+  const FileOptions file_options_;
+  const DBOptions db_options_;
+  const ImmutableDBOptions immutable_db_options_;
+  const InternalKeyComparator icmp_;
+  const ColumnFamilyOptions default_cf_opts_;
+  const ImmutableOptions default_iopts_;  // table_cache_ holds reference
+  const ColumnFamilyOptions unknown_cf_opts_;
+  const bool create_unknown_cfs_;
+  std::shared_ptr<Cache> raw_table_cache_;
+  std::unique_ptr<TableCache> table_cache_;
+  WriteBufferManager wb_;
+  WriteController wc_;
+  VersionSet vset_;
+  std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_opts_;
+  InstrumentedMutex mutex_;
+
+  std::vector<std::string> manifests_;
+  std::vector<FileDescriptor> table_fds_;
+  std::vector<uint64_t> logs_;
+  std::vector<TableInfo> tables_;
+  uint64_t next_file_number_;
+  // Lock over the persistent DB state. Non-nullptr iff successfully
+  // acquired.
+  FileLock* db_lock_;
+  bool closed_;
+
+  Status FindFiles() {
+    std::vector<std::string> filenames;
+    bool found_file = false;
+    std::vector<std::string> to_search_paths;
+
+    for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) {
+      to_search_paths.push_back(db_options_.db_paths[path_id].path);
+    }
+
+    // search wal_dir if user uses a customize wal_dir
+    bool same = immutable_db_options_.IsWalDirSameAsDBPath(dbname_);
+    if (!same) {
+      to_search_paths.push_back(immutable_db_options_.wal_dir);
+    }
+
+    for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) {
+      ROCKS_LOG_INFO(db_options_.info_log, "Searching path %s\n",
+                     to_search_paths[path_id].c_str());
+      Status status = env_->GetChildren(to_search_paths[path_id], &filenames);
+      if (!status.ok()) {
+        return status;
+      }
+      if (!filenames.empty()) {
+        found_file = true;
+      }
+
+      uint64_t number;
+      FileType type;
+      for (size_t i = 0; i < filenames.size(); i++) {
+        if (ParseFileName(filenames[i], &number, &type)) {
+          if (type == kDescriptorFile) {
+            manifests_.push_back(filenames[i]);
+          } else {
+            if (number + 1 > next_file_number_) {
+              next_file_number_ = number + 1;
+            }
+            if (type == kWalFile) {
+              logs_.push_back(number);
+            } else if (type == kTableFile) {
+              table_fds_.emplace_back(number, static_cast<uint32_t>(path_id),
+                                      0);
+            } else {
+              // Ignore other files
+            }
+          }
+        }
+      }
+    }
+    if (!found_file) {
+      return Status::Corruption(dbname_, "repair found no files");
+    }
+    return Status::OK();
+  }
+
+  void ConvertLogFilesToTables() {
+    const auto& wal_dir = immutable_db_options_.GetWalDir();
+    for (size_t i = 0; i < logs_.size(); i++) {
+      // we should use LogFileName(wal_dir, logs_[i]) here. user might uses
+      // wal_dir option.
+      std::string logname = LogFileName(wal_dir, logs_[i]);
+      Status status = ConvertLogToTable(wal_dir, logs_[i]);
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Log #%" PRIu64 ": ignoring conversion error: %s",
+                       logs_[i], status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  Status ConvertLogToTable(const std::string& wal_dir, uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      std::shared_ptr<Logger> info_log;
+      uint64_t lognum;
+      void Corruption(size_t bytes, const Status& s) override {
+        // We print error messages for corruption, but continue repairing.
+        ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s",
+                        lognum, static_cast<int>(bytes), s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(wal_dir, log);
+    const auto& fs = env_->GetFileSystem();
+    std::unique_ptr<SequentialFileReader> lfile_reader;
+    Status status = SequentialFileReader::Create(
+        fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader,
+        nullptr /* dbg */, nullptr /* rate limiter */);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = db_options_.info_log;
+    reporter.lognum = log;
+    // We intentionally make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter,
+                       true /*enable checksum*/, log);
+
+    // Initialize per-column family memtables
+    for (auto* cfd : *vset_.GetColumnFamilySet()) {
+      cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                             kMaxSequenceNumber);
+    }
+    auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reporter.Corruption(record.size(),
+                            Status::Corruption("log record too small"));
+        continue;
+      }
+      Status record_status = WriteBatchInternal::SetContents(&batch, record);
+      if (record_status.ok()) {
+        record_status =
+            WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr);
+      }
+      if (record_status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s",
+                       log, record_status.ToString().c_str());
+      }
+    }
+
+    // Dump a table for each column family with entries in this log file.
+    for (auto* cfd : *vset_.GetColumnFamilySet()) {
+      // Do not record a version edit for this conversion to a Table
+      // since ExtractMetaData() will also generate edits.
+      MemTable* mem = cfd->mem();
+      if (mem->IsEmpty()) {
+        continue;
+      }
+
+      FileMetaData meta;
+      meta.fd = FileDescriptor(next_file_number_++, 0, 0);
+      ReadOptions ro;
+      ro.total_order_seek = true;
+      Arena arena;
+      ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+      int64_t _current_time = 0;
+      immutable_db_options_.clock->GetCurrentTime(&_current_time)
+          .PermitUncheckedError();  // ignore error
+      const uint64_t current_time = static_cast<uint64_t>(_current_time);
+      meta.file_creation_time = current_time;
+      SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
+
+      auto write_hint = cfd->CalculateSSTWriteHint(0);
+      std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+          range_del_iters;
+      auto range_del_iter = mem->NewRangeTombstoneIterator(
+          ro, kMaxSequenceNumber, false /* immutable_memtable */);
+      if (range_del_iter != nullptr) {
+        range_del_iters.emplace_back(range_del_iter);
+      }
+
+      IOStatus io_s;
+      CompressionOptions default_compression;
+      TableBuilderOptions tboptions(
+          *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
+          cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+          kNoCompression, default_compression, cfd->GetID(), cfd->GetName(),
+          -1 /* level */, false /* is_bottommost */,
+          TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
+          0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_,
+          0 /*target_file_size*/, meta.fd.GetNumber());
+
+      SeqnoToTimeMapping empty_seqno_time_mapping;
+      status = BuildTable(
+          dbname_, /* versions */ nullptr, immutable_db_options_, tboptions,
+          file_options_, table_cache_.get(), iter.get(),
+          std::move(range_del_iters), &meta, nullptr /* blob_file_additions */,
+          {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker,
+          false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s,
+          nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery,
+          empty_seqno_time_mapping, nullptr /* event_logger */, 0 /* job_id */,
+          Env::IO_HIGH, nullptr /* table_properties */, write_hint);
+      ROCKS_LOG_INFO(db_options_.info_log,
+                     "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
+                     log, counter, meta.fd.GetNumber(),
+                     status.ToString().c_str());
+      if (status.ok()) {
+        if (meta.fd.GetFileSize() > 0) {
+          table_fds_.push_back(meta.fd);
+        }
+      } else {
+        break;
+      }
+    }
+    delete cf_mems;
+    return status;
+  }
+
+  void ExtractMetaData() {
+    for (size_t i = 0; i < table_fds_.size(); i++) {
+      TableInfo t;
+      t.meta.fd = table_fds_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(
+            db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
+        char file_num_buf[kFormatFileNumberBufSize];
+        FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
+                         file_num_buf, sizeof(file_num_buf));
+        ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s",
+                       file_num_buf, status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(
+        db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId());
+    int counter = 0;
+    uint64_t file_size;
+    Status status = env_->GetFileSize(fname, &file_size);
+    t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
+                                file_size);
+    std::shared_ptr<const TableProperties> props;
+    if (status.ok()) {
+      status = table_cache_->GetTableProperties(file_options_, icmp_, t->meta,
+                                                &props);
+    }
+    if (status.ok()) {
+      auto s =
+          GetSstInternalUniqueId(props->db_id, props->db_session_id,
+                                 props->orig_file_number, &t->meta.unique_id);
+      if (!s.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Table #%" PRIu64
+                       ": unable to get unique id, default to Unknown.",
+                       t->meta.fd.GetNumber());
+      }
+      t->column_family_id = static_cast<uint32_t>(props->column_family_id);
+      if (t->column_family_id ==
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "Table #%" PRIu64
+            ": column family unknown (probably due to legacy format); "
+            "adding to default column family id 0.",
+            t->meta.fd.GetNumber());
+        t->column_family_id = 0;
+      }
+
+      if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
+          nullptr) {
+        status =
+            AddColumnFamily(props->column_family_name, t->column_family_id);
+      }
+      t->meta.oldest_ancester_time = props->creation_time;
+    }
+    ColumnFamilyData* cfd = nullptr;
+    if (status.ok()) {
+      cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
+      if (cfd->GetName() != props->column_family_name) {
+        ROCKS_LOG_ERROR(
+            db_options_.info_log,
+            "Table #%" PRIu64
+            ": inconsistent column family name '%s'; expected '%s' for column "
+            "family id %" PRIu32 ".",
+            t->meta.fd.GetNumber(), props->column_family_name.c_str(),
+            cfd->GetName().c_str(), t->column_family_id);
+        status = Status::Corruption(dbname_, "inconsistent column family name");
+      }
+    }
+    if (status.ok()) {
+      ReadOptions ropts;
+      ropts.total_order_seek = true;
+      InternalIterator* iter = table_cache_->NewIterator(
+          ropts, file_options_, cfd->internal_comparator(), t->meta,
+          nullptr /* range_del_agg */,
+          cfd->GetLatestMutableCFOptions()->prefix_extractor,
+          /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+          TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
+          /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr,
+          /*allow_unprepared_value=*/false);
+      ParsedInternalKey parsed;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        Status pik_status =
+            ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors);
+        if (!pik_status.ok()) {
+          ROCKS_LOG_ERROR(db_options_.info_log,
+                          "Table #%" PRIu64 ": unparsable key - %s",
+                          t->meta.fd.GetNumber(), pik_status.getState());
+          continue;
+        }
+
+        counter++;
+
+        status = t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
+                                          parsed.type);
+        if (!status.ok()) {
+          break;
+        }
+      }
+      if (status.ok() && !iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+
+      ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s",
+                     t->meta.fd.GetNumber(), counter,
+                     status.ToString().c_str());
+    }
+    if (status.ok()) {
+      // XXX/FIXME: This is just basic, naive handling of range tombstones,
+      // like call to UpdateBoundariesForRange in builder.cc where we assume
+      // an SST file is a full sorted run. This probably needs the extra logic
+      // from compaction_job.cc around call to UpdateBoundariesForRange (to
+      // handle range tombstones extendingg beyond range of other entries).
+      ReadOptions ropts;
+      std::unique_ptr<FragmentedRangeTombstoneIterator> r_iter;
+      status = table_cache_->GetRangeTombstoneIterator(
+          ropts, cfd->internal_comparator(), t->meta, &r_iter);
+
+      if (r_iter) {
+        r_iter->SeekToFirst();
+
+        while (r_iter->Valid()) {
+          auto tombstone = r_iter->Tombstone();
+          auto kv = tombstone.Serialize();
+          t->meta.UpdateBoundariesForRange(
+              kv.first, tombstone.SerializeEndKey(), tombstone.seq_,
+              cfd->internal_comparator());
+          r_iter->Next();
+        }
+      }
+    }
+    return status;
+  }
+
+  Status AddTables() {
+    std::unordered_map<uint32_t, std::vector<const TableInfo*>> cf_id_to_tables;
+    SequenceNumber max_sequence = 0;
+    for (size_t i = 0; i < tables_.size(); i++) {
+      cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
+      if (max_sequence < tables_[i].meta.fd.largest_seqno) {
+        max_sequence = tables_[i].meta.fd.largest_seqno;
+      }
+    }
+    vset_.SetLastAllocatedSequence(max_sequence);
+    vset_.SetLastPublishedSequence(max_sequence);
+    vset_.SetLastSequence(max_sequence);
+
+    for (const auto& cf_id_and_tables : cf_id_to_tables) {
+      auto* cfd =
+          vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);
+      VersionEdit edit;
+      edit.SetComparatorName(cfd->user_comparator()->Name());
+      edit.SetLogNumber(0);
+      edit.SetNextFile(next_file_number_);
+      edit.SetColumnFamily(cfd->GetID());
+
+      // TODO(opt): separate out into multiple levels
+      for (const auto* table : cf_id_and_tables.second) {
+        edit.AddFile(
+            0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
+            table->meta.fd.GetFileSize(), table->meta.smallest,
+            table->meta.largest, table->meta.fd.smallest_seqno,
+            table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
+            table->meta.temperature, table->meta.oldest_blob_file_number,
+            table->meta.oldest_ancester_time, table->meta.file_creation_time,
+            table->meta.file_checksum, table->meta.file_checksum_func_name,
+            table->meta.unique_id);
+      }
+      assert(next_file_number_ > 0);
+      vset_.MarkFileNumberUsed(next_file_number_ - 1);
+      mutex_.Lock();
+      std::unique_ptr<FSDirectory> db_dir;
+      Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
+                                                          &db_dir, nullptr);
+      if (status.ok()) {
+        status = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                   &edit, &mutex_, db_dir.get(),
+                                   false /* new_descriptor_log */);
+      }
+      mutex_.Unlock();
+      if (!status.ok()) {
+        return status;
+      }
+    }
+    return Status::OK();
+  }
+
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory.  E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != nullptr) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir).PermitUncheckedError();  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(),
+                   s.ToString().c_str());
+  }
+};
+
+Status GetDefaultCFOptions(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    ColumnFamilyOptions* res) {
+  assert(res != nullptr);
+  auto iter = std::find_if(column_families.begin(), column_families.end(),
+                           [](const ColumnFamilyDescriptor& cfd) {
+                             return cfd.name == kDefaultColumnFamilyName;
+                           });
+  if (iter == column_families.end()) {
+    return Status::InvalidArgument(
+        "column_families", "Must contain entry for default column family");
+  }
+  *res = iter->options;
+  return Status::OK();
+}
+}  // anonymous namespace
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families) {
+  ColumnFamilyOptions default_cf_opts;
+  Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+  if (!status.ok()) {
+    return status;
+  }
+
+  Repairer repairer(dbname, db_options, column_families, default_cf_opts,
+                    ColumnFamilyOptions() /* unknown_cf_opts */,
+                    false /* create_unknown_cfs */);
+  status = repairer.Run();
+  if (status.ok()) {
+    status = repairer.Close();
+  }
+  return status;
+}
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                const ColumnFamilyOptions& unknown_cf_opts) {
+  ColumnFamilyOptions default_cf_opts;
+  Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+  if (!status.ok()) {
+    return status;
+  }
+
+  Repairer repairer(dbname, db_options, column_families, default_cf_opts,
+                    unknown_cf_opts, true /* create_unknown_cfs */);
+  status = repairer.Run();
+  if (status.ok()) {
+    status = repairer.Close();
+  }
+  return status;
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Options opts(options);
+  DBOptions db_options(opts);
+  ColumnFamilyOptions cf_options(opts);
+
+  Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */,
+                    cf_options /* unknown_cf_opts */,
+                    true /* create_unknown_cfs */);
+  Status status = repairer.Run();
+  if (status.ok()) {
+    status = repairer.Close();
+  }
+  return status;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/repair_test.cc b/src/rocksdb/db/repair_test.cc
new file mode 100644
index 000000000..644a9270d
--- /dev/null
+++ b/src/rocksdb/db/repair_test.cc
@@ -0,0 +1,442 @@
+//  Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/options.h"
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/transaction_log.h"
+#include "table/unique_id_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class RepairTest : public DBTestBase {
+ public:
+  RepairTest() : DBTestBase("repair_test", /*env_do_fsync=*/true) {}
+
+  Status GetFirstSstPath(std::string* first_sst_path) {
+    assert(first_sst_path != nullptr);
+    first_sst_path->clear();
+    uint64_t manifest_size;
+    std::vector<std::string> files;
+    Status s = db_->GetLiveFiles(files, &manifest_size);
+    if (s.ok()) {
+      auto sst_iter =
+          std::find_if(files.begin(), files.end(), [](const std::string& file) {
+            uint64_t number;
+            FileType type;
+            bool ok = ParseFileName(file, &number, &type);
+            return ok && type == kTableFile;
+          });
+      *first_sst_path = sst_iter == files.end() ? "" : dbname_ + *sst_iter;
+    }
+    return s;
+  }
+
+  void ReopenWithSstIdVerify() {
+    std::atomic_int verify_passed{0};
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) {
+          // override job status
+          auto id = static_cast<UniqueId64x2*>(arg);
+          assert(*id != kNullUniqueId64x2);
+          verify_passed++;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    auto options = CurrentOptions();
+    options.verify_sst_unique_id_in_manifest = true;
+    Reopen(options);
+
+    ASSERT_GT(verify_passed, 0);
+    SyncPoint::GetInstance()->DisableProcessing();
+  }
+};
+
+TEST_F(RepairTest, LostManifest) {
+  // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+  // the day.
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  // Need to get path before Close() deletes db_, but delete it after Close() to
+  // ensure Close() didn't change the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+  ASSERT_OK(env_->DeleteFile(manifest_path));
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  ReopenWithSstIdVerify();
+
+  ASSERT_EQ(Get("key"), "val");
+  ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, LostManifestMoreDbFeatures) {
+  // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+  // the day.
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Put("key3", "val3"));
+  ASSERT_OK(Put("key4", "val4"));
+  ASSERT_OK(Flush());
+  // Test an SST file containing only a range tombstone
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key2",
+                             "key3z"));
+  ASSERT_OK(Flush());
+  // Need to get path before Close() deletes db_, but delete it after Close() to
+  // ensure Close() didn't change the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+  ASSERT_OK(env_->DeleteFile(manifest_path));
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+  // repair from sst should work with unique_id verification
+  ReopenWithSstIdVerify();
+
+  ASSERT_EQ(Get("key"), "val");
+  ASSERT_EQ(Get("key2"), "NOT_FOUND");
+  ASSERT_EQ(Get("key3"), "NOT_FOUND");
+  ASSERT_EQ(Get("key4"), "val4");
+}
+
+TEST_F(RepairTest, CorruptManifest) {
+  // Manifest is in an invalid format. Expect a full recovery.
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  // Need to get path before Close() deletes db_, but overwrite it after Close()
+  // to ensure Close() didn't change the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+
+  ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah",
+                       false /* use_fsync */));
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+  ReopenWithSstIdVerify();
+
+  ASSERT_EQ(Get("key"), "val");
+  ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, IncompleteManifest) {
+  // In this case, the manifest is valid but does not reference all of the SST
+  // files. Expect a full recovery.
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  std::string orig_manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+  CopyFile(orig_manifest_path, orig_manifest_path + ".tmp");
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  // Need to get path before Close() deletes db_, but overwrite it after Close()
+  // to ensure Close() didn't change the manifest.
+  std::string new_manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(new_manifest_path));
+  // Replace the manifest with one that is only aware of the first SST file.
+  CopyFile(orig_manifest_path + ".tmp", new_manifest_path);
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+  ReopenWithSstIdVerify();
+
+  ASSERT_EQ(Get("key"), "val");
+  ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, PostRepairSstFileNumbering) {
+  // Verify after a DB is repaired, new files will be assigned higher numbers
+  // than old files.
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+  Close();
+
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+  ReopenWithSstIdVerify();
+
+  uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+  ASSERT_GE(post_repair_file_num, pre_repair_file_num);
+}
+
+TEST_F(RepairTest, LostSst) {
+  // Delete one of the SST files but preserve the manifest that refers to it,
+  // then verify the DB is still usable for the intact SST.
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  std::string sst_path;
+  ASSERT_OK(GetFirstSstPath(&sst_path));
+  ASSERT_FALSE(sst_path.empty());
+  ASSERT_OK(env_->DeleteFile(sst_path));
+
+  Close();
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  ReopenWithSstIdVerify();
+
+  // Exactly one of the key-value pairs should be in the DB now.
+  ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, CorruptSst) {
+  // Corrupt one of the SST files but preserve the manifest that refers to it,
+  // then verify the DB is still usable for the intact SST.
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("key2", "val2"));
+  ASSERT_OK(Flush());
+  std::string sst_path;
+  ASSERT_OK(GetFirstSstPath(&sst_path));
+  ASSERT_FALSE(sst_path.empty());
+
+  ASSERT_OK(CreateFile(env_->GetFileSystem(), sst_path, "blah",
+                       false /* use_fsync */));
+
+  Close();
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  ReopenWithSstIdVerify();
+
+  // Exactly one of the key-value pairs should be in the DB now.
+  ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, UnflushedSst) {
+  // This test case invokes repair while some data is unflushed, then verifies
+  // that data is in the db.
+  ASSERT_OK(Put("key", "val"));
+  VectorLogPtr wal_files;
+  ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+  ASSERT_EQ(wal_files.size(), 1);
+  {
+    uint64_t total_ssts_size;
+    std::unordered_map<std::string, uint64_t> sst_files;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+    ASSERT_EQ(total_ssts_size, 0);
+  }
+  // Need to get path before Close() deletes db_, but delete it after Close() to
+  // ensure Close() didn't change the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+  ASSERT_OK(env_->DeleteFile(manifest_path));
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+  ReopenWithSstIdVerify();
+
+  ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+  ASSERT_EQ(wal_files.size(), 0);
+  {
+    uint64_t total_ssts_size;
+    std::unordered_map<std::string, uint64_t> sst_files;
+    ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+    ASSERT_GT(total_ssts_size, 0);
+  }
+  ASSERT_EQ(Get("key"), "val");
+}
+
+TEST_F(RepairTest, SeparateWalDir) {
+  do {
+    Options options = CurrentOptions();
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("key", "val"));
+    ASSERT_OK(Put("foo", "bar"));
+    VectorLogPtr wal_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+    ASSERT_EQ(wal_files.size(), 1);
+    {
+      uint64_t total_ssts_size;
+      std::unordered_map<std::string, uint64_t> sst_files;
+      ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+      ASSERT_EQ(total_ssts_size, 0);
+    }
+    std::string manifest_path =
+        DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+    Close();
+    ASSERT_OK(env_->FileExists(manifest_path));
+    ASSERT_OK(env_->DeleteFile(manifest_path));
+    ASSERT_OK(RepairDB(dbname_, options));
+
+    // make sure that all WALs are converted to SSTables.
+    options.wal_dir = "";
+
+    ReopenWithSstIdVerify();
+    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+    ASSERT_EQ(wal_files.size(), 0);
+    {
+      uint64_t total_ssts_size;
+      std::unordered_map<std::string, uint64_t> sst_files;
+      ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+      ASSERT_GT(total_ssts_size, 0);
+    }
+    ASSERT_EQ(Get("key"), "val");
+    ASSERT_EQ(Get("foo"), "bar");
+
+  } while (ChangeWalOptions());
+}
+
+TEST_F(RepairTest, RepairMultipleColumnFamilies) {
+  // Verify repair logic associates SST files with their original column
+  // families.
+  const int kNumCfs = 3;
+  const int kEntriesPerCf = 2;
+  DestroyAndReopen(CurrentOptions());
+  CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions());
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_OK(Put(i, "key" + std::to_string(j), "val" + std::to_string(j)));
+      if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) {
+        // Leave one unflushed so we can verify WAL entries are properly
+        // associated with column families.
+        continue;
+      }
+      ASSERT_OK(Flush(i));
+    }
+  }
+
+  // Need to get path before Close() deletes db_, but delete it after Close() to
+  // ensure Close() doesn't re-create the manifest.
+  std::string manifest_path =
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+  Close();
+  ASSERT_OK(env_->FileExists(manifest_path));
+  ASSERT_OK(env_->DeleteFile(manifest_path));
+
+  ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+  ReopenWithColumnFamilies({"default", "pikachu1", "pikachu2"},
+                           CurrentOptions());
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+    }
+  }
+}
+
+TEST_F(RepairTest, RepairColumnFamilyOptions) {
+  // Verify repair logic uses correct ColumnFamilyOptions when repairing a
+  // database with different options for column families.
+  const int kNumCfs = 2;
+  const int kEntriesPerCf = 2;
+
+  Options opts(CurrentOptions()), rev_opts(CurrentOptions());
+  opts.comparator = BytewiseComparator();
+  rev_opts.comparator = ReverseBytewiseComparator();
+
+  DestroyAndReopen(opts);
+  CreateColumnFamilies({"reverse"}, rev_opts);
+  ReopenWithColumnFamilies({"default", "reverse"},
+                           std::vector<Options>{opts, rev_opts});
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_OK(Put(i, "key" + std::to_string(j), "val" + std::to_string(j)));
+      if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) {
+        // Leave one unflushed so we can verify RepairDB's flush logic
+        continue;
+      }
+      ASSERT_OK(Flush(i));
+    }
+  }
+  Close();
+
+  // RepairDB() records the comparator in the manifest, and DB::Open would fail
+  // if a different comparator were used.
+  ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}, {"reverse", rev_opts}},
+                     opts /* unknown_cf_opts */));
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+                                        std::vector<Options>{opts, rev_opts}));
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+    }
+  }
+
+  // Examine table properties to verify RepairDB() used the right options when
+  // converting WAL->SST
+  TablePropertiesCollection fname_to_props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props));
+  ASSERT_EQ(fname_to_props.size(), 2U);
+  for (const auto& fname_and_props : fname_to_props) {
+    std::string comparator_name(rev_opts.comparator->Name());
+    ASSERT_EQ(comparator_name, fname_and_props.second->comparator_name);
+  }
+  Close();
+
+  // Also check comparator when it's provided via "unknown" CF options
+  ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}},
+                     rev_opts /* unknown_cf_opts */));
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+                                        std::vector<Options>{opts, rev_opts}));
+  for (int i = 0; i < kNumCfs; ++i) {
+    for (int j = 0; j < kEntriesPerCf; ++j) {
+      ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+    }
+  }
+}
+
+TEST_F(RepairTest, DbNameContainsTrailingSlash) {
+  {
+    bool tmp;
+    if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) {
+      fprintf(stderr,
+              "skipping RepairTest.DbNameContainsTrailingSlash due to "
+              "unsupported Env::AreFilesSame\n");
+      return;
+    }
+  }
+
+  ASSERT_OK(Put("key", "val"));
+  ASSERT_OK(Flush());
+  Close();
+
+  ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions()));
+  ReopenWithSstIdVerify();
+  ASSERT_EQ(Get("key"), "val");
+}
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as RepairDB is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/seqno_time_test.cc b/src/rocksdb/db/seqno_time_test.cc
new file mode 100644
index 000000000..12394a368
--- /dev/null
+++ b/src/rocksdb/db/seqno_time_test.cc
@@ -0,0 +1,996 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/seqno_to_time_mapping.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "test_util/mock_time_env.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class SeqnoTimeTest : public DBTestBase {
+ public:
+  SeqnoTimeTest() : DBTestBase("seqno_time_test", /*env_do_fsync=*/false) {
+    mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+    mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
+  }
+
+ protected:
+  std::unique_ptr<Env> mock_env_;
+  std::shared_ptr<MockSystemClock> mock_clock_;
+
+  void SetUp() override {
+    mock_clock_->InstallTimedWaitFixCallback();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+          auto periodic_task_scheduler_ptr =
+              reinterpret_cast<PeriodicTaskScheduler*>(arg);
+          periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+        });
+  }
+
+  // make sure the file is not in cache, otherwise it won't have IO info
+  void AssertKeyTemperature(int key_id, Temperature expected_temperature) {
+    get_iostats_context()->Reset();
+    IOStatsContext* iostats = get_iostats_context();
+    std::string result = Get(Key(key_id));
+    ASSERT_FALSE(result.empty());
+    ASSERT_GT(iostats->bytes_read, 0);
+    switch (expected_temperature) {
+      case Temperature::kUnknown:
+        ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count,
+                  0);
+        ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
+                  0);
+        break;
+      case Temperature::kCold:
+        ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_read_count,
+                  0);
+        ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
+                  0);
+        break;
+      default:
+        // the test only support kCold now for the bottommost temperature
+        FAIL();
+    }
+  }
+};
+
+TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+  const int kKeyPerSec = 10;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.preclude_last_level_data_seconds = 10000;
+  options.env = mock_env_.get();
+  options.bottommost_temperature = Temperature::kCold;
+  options.num_levels = kNumLevels;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the first a few keys write time are going
+  // to be zero, and internally zero has special meaning: kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+  int sst_num = 0;
+  // Write files that are overlap and enough to trigger compaction
+  for (; sst_num < kNumTrigger; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // All data is hot, only output to penultimate level
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // read a random key, which should be hot (kUnknown)
+  AssertKeyTemperature(20, Temperature::kUnknown);
+
+  // Write more data, but still all hot until the 10th SST, as:
+  // write a key every 10 seconds, 100 keys per SST, each SST takes 1000 seconds
+  // The preclude_last_level_data_seconds is 10k
+  for (; sst_num < kNumTrigger * 2; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->WaitForCompact(true));
+    ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+    ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  }
+
+  // Now we have both hot data and cold data
+  for (; sst_num < kNumTrigger * 3; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun([&] {
+        mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+      });
+    }
+    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->WaitForCompact(true));
+  }
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+  uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_GT(hot_data_size, 0);
+  ASSERT_GT(cold_data_size, 0);
+  // the first a few key should be cold
+  AssertKeyTemperature(20, Temperature::kCold);
+
+  for (int i = 0; i < 30; i++) {
+    dbfull()->TEST_WaitForPeridicTaskRun([&] {
+      mock_clock_->MockSleepForSeconds(static_cast<int>(20 * kKeyPerSec));
+    });
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+    // the hot/cold data cut off range should be between i * 20 + 200 -> 250
+    AssertKeyTemperature(i * 20 + 250, Temperature::kUnknown);
+    AssertKeyTemperature(i * 20 + 200, Temperature::kCold);
+  }
+
+  ASSERT_LT(GetSstSizeHelper(Temperature::kUnknown), hot_data_size);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), cold_data_size);
+
+  // Wait again, the most of the data should be cold after that
+  // but it may not be all cold, because if there's no new data write to SST,
+  // the compaction will not get the new seqno->time sampling to decide the last
+  // a few data's time.
+  for (int i = 0; i < 5; i++) {
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+
+  // any random data close to the end should be cold
+  AssertKeyTemperature(1000, Temperature::kCold);
+
+  // close explicitly, because the env is local variable which will be released
+  // first.
+  Close();
+}
+
+TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+
+  Options options = CurrentOptions();
+  options.preclude_last_level_data_seconds = 10000;
+  options.env = mock_env_.get();
+  options.bottommost_temperature = Temperature::kCold;
+  options.num_levels = kNumLevels;
+  options.level_compaction_dynamic_level_bytes = true;
+  // TODO(zjay): for level compaction, auto-compaction may stuck in deadloop, if
+  //  the penultimate level score > 1, but the hot is not cold enough to compact
+  //  to last level, which will keep triggering compaction.
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // pass some time first, otherwise the first a few keys write time are going
+  // to be zero, and internally zero has special meaning: kUnknownSeqnoTime
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+  int sst_num = 0;
+  // Write files that are overlap
+  for (; sst_num < 4; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+    }
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // All data is hot, only output to penultimate level
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // read a random key, which should be hot (kUnknown)
+  AssertKeyTemperature(20, Temperature::kUnknown);
+
+  // Adding more data to have mixed hot and cold data
+  for (; sst_num < 14; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // Compact the files to the last level which should split the hot/cold data
+  MoveFilesToLevel(6);
+  uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+  uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
+  ASSERT_GT(hot_data_size, 0);
+  ASSERT_GT(cold_data_size, 0);
+  // the first a few key should be cold
+  AssertKeyTemperature(20, Temperature::kCold);
+
+  // Wait some time, with each wait, the cold data is increasing and hot data is
+  // decreasing
+  for (int i = 0; i < 30; i++) {
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    uint64_t pre_hot = hot_data_size;
+    uint64_t pre_cold = cold_data_size;
+    hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+    cold_data_size = GetSstSizeHelper(Temperature::kCold);
+    ASSERT_LT(hot_data_size, pre_hot);
+    ASSERT_GT(cold_data_size, pre_cold);
+
+    // the hot/cold cut_off key should be around i * 20 + 400 -> 450
+    AssertKeyTemperature(i * 20 + 450, Temperature::kUnknown);
+    AssertKeyTemperature(i * 20 + 400, Temperature::kCold);
+  }
+
+  // Wait again, the most of the data should be cold after that
+  // hot data might not be empty, because if we don't write new data, there's
+  // no seqno->time sampling available to the compaction
+  for (int i = 0; i < 5; i++) {
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+
+  // any random data close to the end should be cold
+  AssertKeyTemperature(1000, Temperature::kCold);
+
+  Close();
+}
+
+enum class SeqnoTimeTestType : char {
+  kTrackInternalTimeSeconds = 0,
+  kPrecludeLastLevel = 1,
+  kBothSetTrackSmaller = 2,
+};
+
+class SeqnoTimeTablePropTest
+    : public SeqnoTimeTest,
+      public ::testing::WithParamInterface<SeqnoTimeTestType> {
+ public:
+  SeqnoTimeTablePropTest() : SeqnoTimeTest() {}
+
+  void SetTrackTimeDurationOptions(uint64_t track_time_duration,
+                                   Options& options) const {
+    // either option set will enable the time tracking feature
+    switch (GetParam()) {
+      case SeqnoTimeTestType::kTrackInternalTimeSeconds:
+        options.preclude_last_level_data_seconds = 0;
+        options.preserve_internal_time_seconds = track_time_duration;
+        break;
+      case SeqnoTimeTestType::kPrecludeLastLevel:
+        options.preclude_last_level_data_seconds = track_time_duration;
+        options.preserve_internal_time_seconds = 0;
+        break;
+      case SeqnoTimeTestType::kBothSetTrackSmaller:
+        options.preclude_last_level_data_seconds = track_time_duration;
+        options.preserve_internal_time_seconds = track_time_duration / 10;
+        break;
+    }
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SeqnoTimeTablePropTest, SeqnoTimeTablePropTest,
+    ::testing::Values(SeqnoTimeTestType::kTrackInternalTimeSeconds,
+                      SeqnoTimeTestType::kPrecludeLastLevel,
+                      SeqnoTimeTestType::kBothSetTrackSmaller));
+
+TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) {
+  Options options = CurrentOptions();
+  SetTrackTimeDurationOptions(10000, options);
+
+  options.env = mock_env_.get();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  std::set<uint64_t> checked_file_nums;
+  SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber();
+  // Write a key every 10 seconds
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+  }
+  ASSERT_OK(Flush());
+  TablePropertiesCollection tables_props;
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  auto it = tables_props.begin();
+  SeqnoToTimeMapping tp_mapping;
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  ASSERT_FALSE(tp_mapping.Empty());
+  auto seqs = tp_mapping.TEST_GetInternalMapping();
+  // about ~20 seqs->time entries, because the sample rate is 10000/100, and it
+  // passes 2k time.
+  ASSERT_GE(seqs.size(), 19);
+  ASSERT_LE(seqs.size(), 21);
+  SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber();
+  for (auto i = start_seq; i < start_seq + 10; i++) {
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 1) * 10);
+  }
+  start_seq += 10;
+  for (auto i = start_seq; i < seq_end; i++) {
+    // The result is within the range
+    ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - 10) * 10);
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 10) * 10);
+  }
+  checked_file_nums.insert(it->second->orig_file_number);
+  start_seq = seq_end;
+
+  // Write a key every 1 seconds
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i + 190), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1)); });
+  }
+  seq_end = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Flush());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 2);
+  it = tables_props.begin();
+  while (it != tables_props.end()) {
+    if (!checked_file_nums.count(it->second->orig_file_number)) {
+      break;
+    }
+    it++;
+  }
+  ASSERT_TRUE(it != tables_props.end());
+
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  // There only a few time sample
+  ASSERT_GE(seqs.size(), 1);
+  ASSERT_LE(seqs.size(), 3);
+  for (auto i = start_seq; i < seq_end; i++) {
+    // The result is not very accurate, as there is more data write within small
+    // range of time
+    ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 1000);
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000);
+  }
+  checked_file_nums.insert(it->second->orig_file_number);
+  start_seq = seq_end;
+
+  // Write a key every 200 seconds
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i + 380), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
+  }
+  seq_end = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Flush());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 3);
+  it = tables_props.begin();
+  while (it != tables_props.end()) {
+    if (!checked_file_nums.count(it->second->orig_file_number)) {
+      break;
+    }
+    it++;
+  }
+  ASSERT_TRUE(it != tables_props.end());
+
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  // The sequence number -> time entries should be maxed
+  ASSERT_GE(seqs.size(), 99);
+  ASSERT_LE(seqs.size(), 101);
+  for (auto i = start_seq; i < seq_end - 99; i++) {
+    // likely the first 100 entries reports 0
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000);
+  }
+  start_seq += 101;
+
+  for (auto i = start_seq; i < seq_end; i++) {
+    ASSERT_GE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 200 + 22200);
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 200 + 22600);
+  }
+  checked_file_nums.insert(it->second->orig_file_number);
+  start_seq = seq_end;
+
+  // Write a key every 100 seconds
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i + 570), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  seq_end = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(Flush());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 4);
+  it = tables_props.begin();
+  while (it != tables_props.end()) {
+    if (!checked_file_nums.count(it->second->orig_file_number)) {
+      break;
+    }
+    it++;
+  }
+  ASSERT_TRUE(it != tables_props.end());
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 99);
+  ASSERT_LE(seqs.size(), 101);
+
+  checked_file_nums.insert(it->second->orig_file_number);
+
+  // re-enable compaction
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_GE(tables_props.size(), 1);
+  it = tables_props.begin();
+  while (it != tables_props.end()) {
+    if (!checked_file_nums.count(it->second->orig_file_number)) {
+      break;
+    }
+    it++;
+  }
+  ASSERT_TRUE(it != tables_props.end());
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 99);
+  ASSERT_LE(seqs.size(), 101);
+  for (auto i = start_seq; i < seq_end - 99; i++) {
+    // likely the first 100 entries reports 0
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 100 + 50000);
+  }
+  start_seq += 101;
+
+  for (auto i = start_seq; i < seq_end; i++) {
+    ASSERT_GE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 100 + 52200);
+    ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+              (i - start_seq) * 100 + 52400);
+  }
+  ASSERT_OK(db_->Close());
+}
+
+TEST_P(SeqnoTimeTablePropTest, MultiCFs) {
+  Options options = CurrentOptions();
+  options.preclude_last_level_data_seconds = 0;
+  options.preserve_internal_time_seconds = 0;
+  options.env = mock_env_.get();
+  options.stats_dump_period_sec = 0;
+  options.stats_persist_period_sec = 0;
+  ReopenWithColumnFamilies({"default"}, options);
+
+  const PeriodicTaskScheduler& scheduler =
+      dbfull()->TEST_GetPeriodicTaskScheduler();
+  ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+
+  // Write some data and increase the current time
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  ASSERT_OK(Flush());
+  TablePropertiesCollection tables_props;
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  auto it = tables_props.begin();
+  ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
+
+  ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty());
+
+  Options options_1 = options;
+  SetTrackTimeDurationOptions(10000, options_1);
+  CreateColumnFamilies({"one"}, options_1);
+  ASSERT_TRUE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+
+  // Write some data to the default CF (without preclude_last_level feature)
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  ASSERT_OK(Flush());
+
+  // Write some data to the CF one
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put(1, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+  }
+  ASSERT_OK(Flush(1));
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[1], &tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  it = tables_props.begin();
+  SeqnoToTimeMapping tp_mapping;
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  ASSERT_FALSE(tp_mapping.Empty());
+  auto seqs = tp_mapping.TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 1);
+  ASSERT_LE(seqs.size(), 4);
+
+  // Create one more CF with larger preclude_last_level time
+  Options options_2 = options;
+  SetTrackTimeDurationOptions(1000000, options_2);  // 1m
+  CreateColumnFamilies({"two"}, options_2);
+
+  // Add more data to CF "two" to fill the in memory mapping
+  for (int i = 0; i < 2000; i++) {
+    ASSERT_OK(Put(2, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 1000 - 1);
+  ASSERT_LE(seqs.size(), 1000 + 1);
+
+  ASSERT_OK(Flush(2));
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  it = tables_props.begin();
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  // the max encoded entries is 100
+  ASSERT_GE(seqs.size(), 100 - 1);
+  ASSERT_LE(seqs.size(), 100 + 1);
+
+  // Write some data to default CF, as all memtable with preclude_last_level
+  // enabled have flushed, the in-memory seqno->time mapping should be cleared
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(0, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
+  ASSERT_OK(Flush(0));
+
+  // trigger compaction for CF "two" and make sure the compaction output has
+  // seqno_to_time_mapping
+  for (int j = 0; j < 3; j++) {
+    for (int i = 0; i < 200; i++) {
+      ASSERT_OK(Put(2, Key(i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+    }
+    ASSERT_OK(Flush(2));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  it = tables_props.begin();
+  tp_mapping.Clear();
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+  ASSERT_OK(tp_mapping.Sort());
+  seqs = tp_mapping.TEST_GetInternalMapping();
+  ASSERT_GE(seqs.size(), 99);
+  ASSERT_LE(seqs.size(), 101);
+
+  for (int j = 0; j < 2; j++) {
+    for (int i = 0; i < 200; i++) {
+      ASSERT_OK(Put(0, Key(i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+    }
+    ASSERT_OK(Flush(0));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[0], &tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+  it = tables_props.begin();
+  ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
+
+  // Write some data to CF "two", but don't flush to accumulate
+  for (int i = 0; i < 1000; i++) {
+    ASSERT_OK(Put(2, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  ASSERT_GE(
+      dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+      500);
+  // After dropping CF "one", the in-memory mapping will be change to only
+  // follow CF "two" options.
+  ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+  ASSERT_LE(
+      dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+      100 + 5);
+
+  // After dropping CF "two", the in-memory mapping is also clear.
+  ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+  ASSERT_EQ(
+      dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+      0);
+
+  // And the timer worker is stopped
+  ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+  Close();
+}
+
+TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
+  const int kInstanceNum = 2;
+
+  Options options = CurrentOptions();
+  SetTrackTimeDurationOptions(10000, options);
+  options.env = mock_env_.get();
+  options.stats_dump_period_sec = 0;
+  options.stats_persist_period_sec = 0;
+
+  auto dbs = std::vector<DB*>(kInstanceNum);
+  for (int i = 0; i < kInstanceNum; i++) {
+    ASSERT_OK(
+        DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
+  }
+
+  // Make sure the second instance has the worker enabled
+  auto dbi = static_cast_with_check<DBImpl>(dbs[1]);
+  WriteOptions wo;
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(dbi->Put(wo, Key(i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+  }
+  SeqnoToTimeMapping seqno_to_time_mapping = dbi->TEST_GetSeqnoToTimeMapping();
+  ASSERT_GT(seqno_to_time_mapping.Size(), 10);
+
+  for (int i = 0; i < kInstanceNum; i++) {
+    ASSERT_OK(dbs[i]->Close());
+    delete dbs[i];
+  }
+}
+
+TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kNumKeys = 100;
+
+  Options options = CurrentOptions();
+  SetTrackTimeDurationOptions(10000, options);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = kNumLevels;
+  options.env = mock_env_.get();
+
+  DestroyAndReopen(options);
+
+  std::atomic_uint64_t num_seqno_zeroing{0};
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput:ZeroingSeq",
+      [&](void* /*arg*/) { num_seqno_zeroing++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  int sst_num = 0;
+  for (; sst_num < kNumTrigger - 1; sst_num++) {
+    for (int i = 0; i < kNumKeys; i++) {
+      ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+      dbfull()->TEST_WaitForPeridicTaskRun(
+          [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+    }
+    ASSERT_OK(Flush());
+  }
+  TablePropertiesCollection tables_props;
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 3);
+  for (const auto& props : tables_props) {
+    ASSERT_FALSE(props.second->seqno_to_time_mapping.empty());
+    SeqnoToTimeMapping tp_mapping;
+    ASSERT_OK(tp_mapping.Add(props.second->seqno_to_time_mapping));
+    ASSERT_OK(tp_mapping.Sort());
+    ASSERT_FALSE(tp_mapping.Empty());
+    auto seqs = tp_mapping.TEST_GetInternalMapping();
+    ASSERT_GE(seqs.size(), 10 - 1);
+    ASSERT_LE(seqs.size(), 10 + 1);
+  }
+
+  // Trigger a compaction
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+  }
+  sst_num++;
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  tables_props.clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+  ASSERT_EQ(tables_props.size(), 1);
+
+  auto it = tables_props.begin();
+  SeqnoToTimeMapping tp_mapping;
+  ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+
+  // compact to the last level
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // make sure the data is all compacted to penultimate level if the feature is
+  // on, otherwise, compacted to the last level.
+  if (options.preclude_last_level_data_seconds > 0) {
+    ASSERT_GT(NumTableFilesAtLevel(5), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(6), 0);
+  } else {
+    ASSERT_EQ(NumTableFilesAtLevel(5), 0);
+    ASSERT_GT(NumTableFilesAtLevel(6), 0);
+  }
+
+  // regardless the file is on the last level or not, it should keep the time
+  // information and sequence number are not set
+  tables_props.clear();
+  tp_mapping.Clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+
+  ASSERT_EQ(tables_props.size(), 1);
+  ASSERT_EQ(num_seqno_zeroing, 0);
+
+  it = tables_props.begin();
+  ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
+  ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+
+  // make half of the data expired
+  mock_clock_->MockSleepForSeconds(static_cast<int>(8000));
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  tables_props.clear();
+  tp_mapping.Clear();
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+
+  if (options.preclude_last_level_data_seconds > 0) {
+    ASSERT_EQ(tables_props.size(), 2);
+  } else {
+    ASSERT_EQ(tables_props.size(), 1);
+  }
+  ASSERT_GT(num_seqno_zeroing, 0);
+  std::vector<KeyVersion> key_versions;
+  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  // make sure there're more than 300 keys and first 100 keys are having seqno
+  // zeroed out, the last 100 key seqno not zeroed out
+  ASSERT_GT(key_versions.size(), 300);
+  for (int i = 0; i < 100; i++) {
+    ASSERT_EQ(key_versions[i].sequence, 0);
+  }
+  auto rit = key_versions.rbegin();
+  for (int i = 0; i < 100; i++) {
+    ASSERT_GT(rit->sequence, 0);
+    rit++;
+  }
+
+  // make all data expired and compact again to push it to the last level
+  // regardless if the tiering feature is enabled or not
+  mock_clock_->MockSleepForSeconds(static_cast<int>(20000));
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_GT(num_seqno_zeroing, 0);
+  ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+  Close();
+}
+
+TEST_F(SeqnoTimeTest, MappingAppend) {
+  SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
+
+  // ignore seqno == 0, as it may mean the seqno is zeroed out
+  ASSERT_FALSE(test.Append(0, 9));
+
+  ASSERT_TRUE(test.Append(3, 10));
+  auto size = test.Size();
+  // normal add
+  ASSERT_TRUE(test.Append(10, 11));
+  size++;
+  ASSERT_EQ(size, test.Size());
+
+  // Append unsorted
+  ASSERT_FALSE(test.Append(8, 12));
+  ASSERT_EQ(size, test.Size());
+
+  // Append with the same seqno, newer time will be accepted
+  ASSERT_TRUE(test.Append(10, 12));
+  ASSERT_EQ(size, test.Size());
+  // older time will be ignored
+  ASSERT_FALSE(test.Append(10, 9));
+  ASSERT_EQ(size, test.Size());
+
+  // new seqno with old time will be ignored
+  ASSERT_FALSE(test.Append(12, 8));
+  ASSERT_EQ(size, test.Size());
+}
+
+TEST_F(SeqnoTimeTest, GetOldestApproximateTime) {
+  SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
+
+  ASSERT_EQ(test.GetOldestApproximateTime(10), kUnknownSeqnoTime);
+
+  test.Append(3, 10);
+
+  ASSERT_EQ(test.GetOldestApproximateTime(2), kUnknownSeqnoTime);
+  ASSERT_EQ(test.GetOldestApproximateTime(3), 10);
+  ASSERT_EQ(test.GetOldestApproximateTime(10), 10);
+
+  test.Append(10, 100);
+
+  test.Append(100, 1000);
+  ASSERT_EQ(test.GetOldestApproximateTime(10), 100);
+  ASSERT_EQ(test.GetOldestApproximateTime(40), 100);
+  ASSERT_EQ(test.GetOldestApproximateTime(111), 1000);
+}
+
+TEST_F(SeqnoTimeTest, Sort) {
+  SeqnoToTimeMapping test;
+
+  // single entry
+  test.Add(10, 11);
+  ASSERT_OK(test.Sort());
+  ASSERT_EQ(test.Size(), 1);
+
+  // duplicate, should be removed by sort
+  test.Add(10, 11);
+  // same seqno, but older time, should be removed
+  test.Add(10, 9);
+
+  // unuseful ones, should be removed by sort
+  test.Add(11, 9);
+  test.Add(9, 8);
+
+  // Good ones
+  test.Add(1, 10);
+  test.Add(100, 100);
+
+  ASSERT_OK(test.Sort());
+
+  auto seqs = test.TEST_GetInternalMapping();
+
+  std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
+  expected.emplace_back(1, 10);
+  expected.emplace_back(10, 11);
+  expected.emplace_back(100, 100);
+
+  ASSERT_EQ(expected, seqs);
+}
+
+TEST_F(SeqnoTimeTest, EncodeDecodeBasic) {
+  SeqnoToTimeMapping test(0, 1000);
+
+  std::string output;
+  test.Encode(output, 0, 1000, 100);
+  ASSERT_TRUE(output.empty());
+
+  for (int i = 1; i <= 1000; i++) {
+    ASSERT_TRUE(test.Append(i, i * 10));
+  }
+  test.Encode(output, 0, 1000, 100);
+
+  ASSERT_FALSE(output.empty());
+
+  SeqnoToTimeMapping decoded;
+  ASSERT_OK(decoded.Add(output));
+  ASSERT_OK(decoded.Sort());
+  ASSERT_EQ(decoded.Size(), SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST);
+  ASSERT_EQ(test.Size(), 1000);
+
+  for (SequenceNumber seq = 0; seq <= 1000; seq++) {
+    // test has the more accurate time mapping, encode only pick
+    // kMaxSeqnoTimePairsPerSST number of entries, which is less accurate
+    uint64_t target_time = test.GetOldestApproximateTime(seq);
+    ASSERT_GE(decoded.GetOldestApproximateTime(seq),
+              target_time < 200 ? 0 : target_time - 200);
+    ASSERT_LE(decoded.GetOldestApproximateTime(seq), target_time);
+  }
+}
+
+TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) {
+  SeqnoToTimeMapping test(0, 10);
+
+  test.Append(1, 10);
+  test.Append(5, 17);
+  test.Append(6, 25);
+  test.Append(8, 30);
+
+  std::string output;
+  test.Encode(output, 1, 10, 0, 3);
+
+  SeqnoToTimeMapping decoded;
+  ASSERT_OK(decoded.Add(output));
+  ASSERT_OK(decoded.Sort());
+
+  ASSERT_EQ(decoded.Size(), 3);
+
+  auto seqs = decoded.TEST_GetInternalMapping();
+  std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
+  expected.emplace_back(1, 10);
+  expected.emplace_back(6, 25);
+  expected.emplace_back(8, 30);
+  ASSERT_EQ(expected, seqs);
+
+  // Add a few large time number
+  test.Append(10, 100);
+  test.Append(13, 200);
+  test.Append(16, 300);
+
+  output.clear();
+  test.Encode(output, 1, 20, 0, 4);
+  decoded.Clear();
+  ASSERT_OK(decoded.Add(output));
+  ASSERT_OK(decoded.Sort());
+  ASSERT_EQ(decoded.Size(), 4);
+
+  expected.clear();
+  expected.emplace_back(1, 10);
+  // entry #6, #8 are skipped as they are too close to #1.
+  // entry #100 is also within skip range, but if it's skipped, there not enough
+  // number to fill 4 entries, so select it.
+  expected.emplace_back(10, 100);
+  expected.emplace_back(13, 200);
+  expected.emplace_back(16, 300);
+  seqs = decoded.TEST_GetInternalMapping();
+  ASSERT_EQ(expected, seqs);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/seqno_to_time_mapping.cc b/src/rocksdb/db/seqno_to_time_mapping.cc
new file mode 100644
index 000000000..c69209929
--- /dev/null
+++ b/src/rocksdb/db/seqno_to_time_mapping.cc
@@ -0,0 +1,341 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/seqno_to_time_mapping.h"
+
+#include "db/version_edit.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t SeqnoToTimeMapping::GetOldestApproximateTime(
+    const SequenceNumber seqno) const {
+  assert(is_sorted_);
+  auto it = std::upper_bound(seqno_time_mapping_.begin(),
+                             seqno_time_mapping_.end(), seqno);
+  if (it == seqno_time_mapping_.begin()) {
+    return 0;
+  }
+  it--;
+  return it->time;
+}
+
+void SeqnoToTimeMapping::Add(SequenceNumber seqno, uint64_t time) {
+  if (seqno == 0) {
+    return;
+  }
+  is_sorted_ = false;
+  seqno_time_mapping_.emplace_back(seqno, time);
+}
+
+void SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) {
+  assert(is_sorted_);
+
+  if (max_time_duration_ == 0) {
+    return;
+  }
+
+  const uint64_t cut_off_time =
+      now > max_time_duration_ ? now - max_time_duration_ : 0;
+  assert(cut_off_time <= now);  // no overflow
+
+  auto it = std::upper_bound(
+      seqno_time_mapping_.begin(), seqno_time_mapping_.end(), cut_off_time,
+      [](uint64_t target, const SeqnoTimePair& other) -> bool {
+        return target < other.time;
+      });
+  if (it == seqno_time_mapping_.begin()) {
+    return;
+  }
+  it--;
+  seqno_time_mapping_.erase(seqno_time_mapping_.begin(), it);
+}
+
+SequenceNumber SeqnoToTimeMapping::GetOldestSequenceNum(uint64_t time) {
+  assert(is_sorted_);
+
+  auto it = std::upper_bound(
+      seqno_time_mapping_.begin(), seqno_time_mapping_.end(), time,
+      [](uint64_t target, const SeqnoTimePair& other) -> bool {
+        return target < other.time;
+      });
+  if (it == seqno_time_mapping_.begin()) {
+    return 0;
+  }
+  it--;
+  return it->seqno;
+}
+
+// The encoded format is:
+//  [num_of_entries][[seqno][time],[seqno][time],...]
+//      ^                                 ^
+//    var_int                      delta_encoded (var_int)
+void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start,
+                                const SequenceNumber end, const uint64_t now,
+                                const uint64_t output_size) const {
+  assert(is_sorted_);
+  if (start > end) {
+    // It could happen when the SST file is empty, the initial value of min
+    // sequence number is kMaxSequenceNumber and max is 0.
+    // The empty output file will be removed in the final step of compaction.
+    return;
+  }
+
+  auto start_it = std::upper_bound(seqno_time_mapping_.begin(),
+                                   seqno_time_mapping_.end(), start);
+  if (start_it != seqno_time_mapping_.begin()) {
+    start_it--;
+  }
+
+  auto end_it = std::upper_bound(seqno_time_mapping_.begin(),
+                                 seqno_time_mapping_.end(), end);
+  if (end_it == seqno_time_mapping_.begin()) {
+    return;
+  }
+  if (start_it >= end_it) {
+    return;
+  }
+
+  // truncate old entries that are not needed
+  if (max_time_duration_ > 0) {
+    const uint64_t cut_off_time =
+        now > max_time_duration_ ? now - max_time_duration_ : 0;
+    while (start_it < end_it && start_it->time < cut_off_time) {
+      start_it++;
+    }
+  }
+  // to include the first element
+  if (start_it != seqno_time_mapping_.begin()) {
+    start_it--;
+  }
+
+  // If there are more data than needed, pick the entries for encoding.
+  // It's not the most optimized algorithm for selecting the best representative
+  // entries over the time.
+  // It starts from the beginning and makes sure the distance is larger than
+  // `(end - start) / size` before selecting the number. For example, for the
+  // following list, pick 3 entries (it will pick seqno #1, #6, #8):
+  //    1 -> 10
+  //    5 -> 17
+  //    6 -> 25
+  //    8 -> 30
+  // first, it always picks the first one, then there are 2 num_entries_to_fill
+  // and the time difference between current one vs. the last one is
+  // (30 - 10) = 20. 20/2 = 10. So it will skip until 10+10 = 20. => it skips
+  // #5 and pick #6.
+  // But the most optimized solution is picking #1 #5 #8, as it will be more
+  // evenly distributed for time. Anyway the following algorithm is simple and
+  // may over-select new data, which is good. We do want more accurate time
+  // information for recent data.
+  std::deque<SeqnoTimePair> output_copy;
+  if (std::distance(start_it, end_it) > static_cast<int64_t>(output_size)) {
+    int64_t num_entries_to_fill = static_cast<int64_t>(output_size);
+    auto last_it = end_it;
+    last_it--;
+    uint64_t end_time = last_it->time;
+    uint64_t skip_until_time = 0;
+    for (auto it = start_it; it < end_it; it++) {
+      // skip if it's not reach the skip_until_time yet
+      if (std::distance(it, end_it) > num_entries_to_fill &&
+          it->time < skip_until_time) {
+        continue;
+      }
+      output_copy.push_back(*it);
+      num_entries_to_fill--;
+      if (std::distance(it, end_it) > num_entries_to_fill &&
+          num_entries_to_fill > 0) {
+        // If there are more entries than we need, re-calculate the
+        // skip_until_time, which means skip until that time
+        skip_until_time =
+            it->time + ((end_time - it->time) / num_entries_to_fill);
+      }
+    }
+
+    // Make sure all entries are filled
+    assert(num_entries_to_fill == 0);
+    start_it = output_copy.begin();
+    end_it = output_copy.end();
+  }
+
+  // Delta encode the data
+  uint64_t size = std::distance(start_it, end_it);
+  PutVarint64(&dest, size);
+  SeqnoTimePair base;
+  for (auto it = start_it; it < end_it; it++) {
+    assert(base < *it);
+    SeqnoTimePair val = *it - base;
+    base = *it;
+    val.Encode(dest);
+  }
+}
+
+Status SeqnoToTimeMapping::Add(const std::string& seqno_time_mapping_str) {
+  Slice input(seqno_time_mapping_str);
+  if (input.empty()) {
+    return Status::OK();
+  }
+  uint64_t size;
+  if (!GetVarint64(&input, &size)) {
+    return Status::Corruption("Invalid sequence number time size");
+  }
+  is_sorted_ = false;
+  SeqnoTimePair base;
+  for (uint64_t i = 0; i < size; i++) {
+    SeqnoTimePair val;
+    Status s = val.Decode(input);
+    if (!s.ok()) {
+      return s;
+    }
+    val.Add(base);
+    seqno_time_mapping_.emplace_back(val);
+    base = val;
+  }
+  return Status::OK();
+}
+
+void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const {
+  PutVarint64Varint64(&dest, seqno, time);
+}
+
+Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) {
+  if (!GetVarint64(&input, &seqno)) {
+    return Status::Corruption("Invalid sequence number");
+  }
+  if (!GetVarint64(&input, &time)) {
+    return Status::Corruption("Invalid time");
+  }
+  return Status::OK();
+}
+
+bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
+  assert(is_sorted_);
+
+  // skip seq number 0, which may have special meaning, like zeroed out data
+  if (seqno == 0) {
+    return false;
+  }
+  if (!Empty()) {
+    if (seqno < Last().seqno || time < Last().time) {
+      return false;
+    }
+    if (seqno == Last().seqno) {
+      Last().time = time;
+      return true;
+    }
+    if (time == Last().time) {
+      // new sequence has the same time as old one, no need to add new mapping
+      return false;
+    }
+  }
+
+  seqno_time_mapping_.emplace_back(seqno, time);
+
+  if (seqno_time_mapping_.size() > max_capacity_) {
+    seqno_time_mapping_.pop_front();
+  }
+  return true;
+}
+
+bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration,
+                                uint64_t max_time_duration) {
+  uint64_t new_max_capacity =
+      CalculateMaxCapacity(min_time_duration, max_time_duration);
+  if (new_max_capacity == max_capacity_) {
+    return false;
+  } else if (new_max_capacity < seqno_time_mapping_.size()) {
+    uint64_t delta = seqno_time_mapping_.size() - new_max_capacity;
+    seqno_time_mapping_.erase(seqno_time_mapping_.begin(),
+                              seqno_time_mapping_.begin() + delta);
+  }
+  max_capacity_ = new_max_capacity;
+  return true;
+}
+
+Status SeqnoToTimeMapping::Sort() {
+  if (is_sorted_) {
+    return Status::OK();
+  }
+  if (seqno_time_mapping_.empty()) {
+    is_sorted_ = true;
+    return Status::OK();
+  }
+
+  std::deque<SeqnoTimePair> copy = std::move(seqno_time_mapping_);
+
+  std::sort(copy.begin(), copy.end());
+
+  seqno_time_mapping_.clear();
+
+  // remove seqno = 0, which may have special meaning, like zeroed out data
+  while (copy.front().seqno == 0) {
+    copy.pop_front();
+  }
+
+  SeqnoTimePair prev = copy.front();
+  for (const auto& it : copy) {
+    // If sequence number is the same, pick the one with larger time, which is
+    // more accurate than the older time.
+    if (it.seqno == prev.seqno) {
+      assert(it.time >= prev.time);
+      prev.time = it.time;
+    } else {
+      assert(it.seqno > prev.seqno);
+      // If a larger sequence number has an older time which is not useful, skip
+      if (it.time > prev.time) {
+        seqno_time_mapping_.push_back(prev);
+        prev = it;
+      }
+    }
+  }
+  seqno_time_mapping_.emplace_back(prev);
+
+  is_sorted_ = true;
+  return Status::OK();
+}
+
+std::string SeqnoToTimeMapping::ToHumanString() const {
+  std::string ret;
+  for (const auto& seq_time : seqno_time_mapping_) {
+    AppendNumberTo(&ret, seq_time.seqno);
+    ret.append("->");
+    AppendNumberTo(&ret, seq_time.time);
+    ret.append(",");
+  }
+  return ret;
+}
+
+SeqnoToTimeMapping SeqnoToTimeMapping::Copy(
+    SequenceNumber smallest_seqno) const {
+  SeqnoToTimeMapping ret;
+  auto it = std::upper_bound(seqno_time_mapping_.begin(),
+                             seqno_time_mapping_.end(), smallest_seqno);
+  if (it != seqno_time_mapping_.begin()) {
+    it--;
+  }
+  std::copy(it, seqno_time_mapping_.end(),
+            std::back_inserter(ret.seqno_time_mapping_));
+  return ret;
+}
+
+uint64_t SeqnoToTimeMapping::CalculateMaxCapacity(uint64_t min_time_duration,
+                                                  uint64_t max_time_duration) {
+  if (min_time_duration == 0) {
+    return 0;
+  }
+  return std::min(
+      kMaxSeqnoToTimeEntries,
+      max_time_duration * kMaxSeqnoTimePairsPerCF / min_time_duration);
+}
+
+SeqnoToTimeMapping::SeqnoTimePair SeqnoToTimeMapping::SeqnoTimePair::operator-(
+    const SeqnoTimePair& other) const {
+  SeqnoTimePair res;
+  res.seqno = seqno - other.seqno;
+  res.time = time - other.time;
+  return res;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/seqno_to_time_mapping.h b/src/rocksdb/db/seqno_to_time_mapping.h
new file mode 100644
index 000000000..4ffc9c199
--- /dev/null
+++ b/src/rocksdb/db/seqno_to_time_mapping.h
@@ -0,0 +1,189 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <functional>
+#include <iterator>
+#include <string>
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint64_t kUnknownSeqnoTime = 0;
+
+// SeqnoToTimeMapping stores the sequence number to time mapping, so given a
+// sequence number it can estimate the oldest possible time for that sequence
+// number. For example:
+//   10 -> 100
+//   50 -> 300
+// then if a key has seqno 19, the OldestApproximateTime would be 100, for 51 it
+// would be 300.
+// As it's a sorted list, the new entry is inserted from the back. The old data
+// will be popped from the front if they're no longer used.
+//
+// Note: the data struct is not thread safe, both read and write need to be
+//  synchronized by caller.
+class SeqnoToTimeMapping {
+ public:
+  // Maximum number of entries can be encoded into SST. The data is delta encode
+  // so the maximum data usage for each SST is < 0.3K
+  static constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100;
+
+  // Maximum number of entries per CF. If there's only CF with this feature on,
+  // the max duration divided by this number, so for example, if
+  // preclude_last_level_data_seconds = 100000 (~1day), then it will sample the
+  // seqno -> time every 1000 seconds (~17minutes). Then the maximum entry it
+  // needs is 100.
+  // When there are multiple CFs having this feature on, the sampling cadence is
+  // determined by the smallest setting, the capacity is determined the largest
+  // setting, also it's caped by kMaxSeqnoTimePairsPerCF * 10.
+  static constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100;
+
+  // A simple struct for sequence number to time pair
+  struct SeqnoTimePair {
+    SequenceNumber seqno = 0;
+    uint64_t time = 0;
+
+    SeqnoTimePair() = default;
+    SeqnoTimePair(SequenceNumber _seqno, uint64_t _time)
+        : seqno(_seqno), time(_time) {}
+
+    // Encode to dest string
+    void Encode(std::string& dest) const;
+
+    // Decode the value from input Slice and remove it from the input
+    Status Decode(Slice& input);
+
+    // subtraction of 2 SeqnoTimePair
+    SeqnoTimePair operator-(const SeqnoTimePair& other) const;
+
+    // Add 2 values together
+    void Add(const SeqnoTimePair& obj) {
+      seqno += obj.seqno;
+      time += obj.time;
+    }
+
+    // Compare SeqnoTimePair with a sequence number, used for binary search a
+    // sequence number in a list of SeqnoTimePair
+    bool operator<(const SequenceNumber& other) const { return seqno < other; }
+
+    // Compare 2 SeqnoTimePair
+    bool operator<(const SeqnoTimePair& other) const {
+      return std::tie(seqno, time) < std::tie(other.seqno, other.time);
+    }
+
+    // Check if 2 SeqnoTimePair is the same
+    bool operator==(const SeqnoTimePair& other) const {
+      return std::tie(seqno, time) == std::tie(other.seqno, other.time);
+    }
+  };
+
+  // constractor of SeqnoToTimeMapping
+  // max_time_duration is the maximum time it should track. For example, if
+  // preclude_last_level_data_seconds is 1 day, then if an entry is older than 1
+  // day, then it can be removed.
+  // max_capacity is the maximum number of entry it can hold. For single CF,
+  // it's caped at 100 (kMaxSeqnoTimePairsPerCF), otherwise
+  // kMaxSeqnoTimePairsPerCF * 10.
+  // If it's set to 0, means it won't truncate any old data.
+  explicit SeqnoToTimeMapping(uint64_t max_time_duration = 0,
+                              uint64_t max_capacity = 0)
+      : max_time_duration_(max_time_duration), max_capacity_(max_capacity) {}
+
+  // Append a new entry to the list. The new entry should be newer than the
+  // existing ones. It maintains the internal sorted status.
+  bool Append(SequenceNumber seqno, uint64_t time);
+
+  // Given a sequence number, estimate it's oldest time
+  uint64_t GetOldestApproximateTime(SequenceNumber seqno) const;
+
+  // Truncate the old entries based on the current time and max_time_duration_
+  void TruncateOldEntries(uint64_t now);
+
+  // Given a time, return it's oldest possible sequence number
+  SequenceNumber GetOldestSequenceNum(uint64_t time);
+
+  // Encode to a binary string
+  void Encode(std::string& des, SequenceNumber start, SequenceNumber end,
+              uint64_t now,
+              uint64_t output_size = kMaxSeqnoTimePairsPerSST) const;
+
+  // Add a new random entry, unlike Append(), it can be any data, but also makes
+  // the list un-sorted.
+  void Add(SequenceNumber seqno, uint64_t time);
+
+  // Decode and add the entries to the current obj. The list will be unsorted
+  Status Add(const std::string& seqno_time_mapping_str);
+
+  // Return the number of entries
+  size_t Size() const { return seqno_time_mapping_.size(); }
+
+  // Reduce the size of internal list
+  bool Resize(uint64_t min_time_duration, uint64_t max_time_duration);
+
+  // Override the max_time_duration_
+  void SetMaxTimeDuration(uint64_t max_time_duration) {
+    max_time_duration_ = max_time_duration;
+  }
+
+  uint64_t GetCapacity() const { return max_capacity_; }
+
+  // Sort the list, which also remove the redundant entries, useless entries,
+  // which makes sure the seqno is sorted, but also the time
+  Status Sort();
+
+  // copy the current obj from the given smallest_seqno.
+  SeqnoToTimeMapping Copy(SequenceNumber smallest_seqno) const;
+
+  // If the internal list is empty
+  bool Empty() const { return seqno_time_mapping_.empty(); }
+
+  // clear all entries
+  void Clear() { seqno_time_mapping_.clear(); }
+
+  // return the string for user message
+  // Note: Not efficient, okay for print
+  std::string ToHumanString() const;
+
+#ifndef NDEBUG
+  const std::deque<SeqnoTimePair>& TEST_GetInternalMapping() const {
+    return seqno_time_mapping_;
+  }
+#endif
+
+ private:
+  static constexpr uint64_t kMaxSeqnoToTimeEntries =
+      kMaxSeqnoTimePairsPerCF * 10;
+
+  uint64_t max_time_duration_;
+  uint64_t max_capacity_;
+
+  std::deque<SeqnoTimePair> seqno_time_mapping_;
+
+  bool is_sorted_ = true;
+
+  static uint64_t CalculateMaxCapacity(uint64_t min_time_duration,
+                                       uint64_t max_time_duration);
+
+  SeqnoTimePair& Last() {
+    assert(!Empty());
+    return seqno_time_mapping_.back();
+  }
+};
+
+// for searching the sequence number from SeqnoToTimeMapping
+inline bool operator<(const SequenceNumber& seqno,
+                      const SeqnoToTimeMapping::SeqnoTimePair& other) {
+  return seqno < other.seqno;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_checker.h b/src/rocksdb/db/snapshot_checker.h
new file mode 100644
index 000000000..0bfb1aa07
--- /dev/null
+++ b/src/rocksdb/db/snapshot_checker.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class SnapshotCheckerResult : int {
+  kInSnapshot = 0,
+  kNotInSnapshot = 1,
+  // In case snapshot is released and the checker has no clue whether
+  // the given sequence is visible to the snapshot.
+  kSnapshotReleased = 2,
+};
+
+// Callback class that control GC of duplicate keys in flush/compaction.
+class SnapshotChecker {
+ public:
+  virtual ~SnapshotChecker() {}
+  virtual SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber sequence, SequenceNumber snapshot_sequence) const = 0;
+};
+
+class DisableGCSnapshotChecker : public SnapshotChecker {
+ public:
+  virtual ~DisableGCSnapshotChecker() {}
+  virtual SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber /*sequence*/,
+      SequenceNumber /*snapshot_sequence*/) const override {
+    // By returning kNotInSnapshot, we prevent all the values from being GCed
+    return SnapshotCheckerResult::kNotInSnapshot;
+  }
+  static DisableGCSnapshotChecker* Instance();
+
+ protected:
+  explicit DisableGCSnapshotChecker() {}
+};
+
+class WritePreparedTxnDB;
+
+// Callback class created by WritePreparedTxnDB to check if a key
+// is visible by a snapshot.
+class WritePreparedSnapshotChecker : public SnapshotChecker {
+ public:
+  explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db);
+  virtual ~WritePreparedSnapshotChecker() {}
+
+  virtual SnapshotCheckerResult CheckInSnapshot(
+      SequenceNumber sequence, SequenceNumber snapshot_sequence) const override;
+
+ private:
+#ifndef ROCKSDB_LITE
+  const WritePreparedTxnDB* const txn_db_;
+#endif  // !ROCKSDB_LITE
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.cc b/src/rocksdb/db/snapshot_impl.cc
new file mode 100644
index 000000000..98b475463
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.cc
@@ -0,0 +1,25 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/db.h"
+#include "rocksdb/snapshot.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ManagedSnapshot::ManagedSnapshot(DB* db)
+    : db_(db), snapshot_(db->GetSnapshot()) {}
+
+ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot)
+    : db_(db), snapshot_(_snapshot) {}
+
+ManagedSnapshot::~ManagedSnapshot() {
+  if (snapshot_) {
+    db_->ReleaseSnapshot(snapshot_);
+  }
+}
+
+const Snapshot* ManagedSnapshot::snapshot() { return snapshot_; }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.h b/src/rocksdb/db/snapshot_impl.h
new file mode 100644
index 000000000..23e5e98cd
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.h
@@ -0,0 +1,239 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/db.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+  // It indicates the smallest uncommitted data at the time the snapshot was
+  // taken. This is currently used by WritePrepared transactions to limit the
+  // scope of queries to IsInSnapshot.
+  SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+
+  SequenceNumber GetSequenceNumber() const override { return number_; }
+
+  int64_t GetUnixTime() const override { return unix_time_; }
+
+  uint64_t GetTimestamp() const override { return timestamp_; }
+
+ private:
+  friend class SnapshotList;
+
+  // SnapshotImpl is kept in a doubly-linked circular list
+  SnapshotImpl* prev_;
+  SnapshotImpl* next_;
+
+  SnapshotList* list_;  // just for sanity checks
+
+  int64_t unix_time_;
+
+  uint64_t timestamp_;
+
+  // Will this snapshot be used by a Transaction to do write-conflict checking?
+  bool is_write_conflict_boundary_;
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+    list_.number_ = 0xFFFFFFFFL;  // placeholder marker, for debugging
+    // Set all the variables to make UBSAN happy.
+    list_.list_ = nullptr;
+    list_.unix_time_ = 0;
+    list_.timestamp_ = 0;
+    list_.is_write_conflict_boundary_ = false;
+    count_ = 0;
+  }
+
+  // No copy-construct.
+  SnapshotList(const SnapshotList&) = delete;
+
+  bool empty() const {
+    assert(list_.next_ != &list_ || 0 == count_);
+    return list_.next_ == &list_;
+  }
+  SnapshotImpl* oldest() const {
+    assert(!empty());
+    return list_.next_;
+  }
+  SnapshotImpl* newest() const {
+    assert(!empty());
+    return list_.prev_;
+  }
+
+  SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time,
+                    bool is_write_conflict_boundary,
+                    uint64_t ts = std::numeric_limits<uint64_t>::max()) {
+    s->number_ = seq;
+    s->unix_time_ = unix_time;
+    s->timestamp_ = ts;
+    s->is_write_conflict_boundary_ = is_write_conflict_boundary;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    count_++;
+    return s;
+  }
+
+  // Do not responsible to free the object.
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    count_--;
+  }
+
+  // retrieve all snapshot numbers up until max_seq. They are sorted in
+  // ascending order (with no duplicates).
+  std::vector<SequenceNumber> GetAll(
+      SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+      const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+    std::vector<SequenceNumber> ret;
+    GetAll(&ret, oldest_write_conflict_snapshot, max_seq);
+    return ret;
+  }
+
+  void GetAll(std::vector<SequenceNumber>* snap_vector,
+              SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+              const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+    std::vector<SequenceNumber>& ret = *snap_vector;
+    // So far we have no use case that would pass a non-empty vector
+    assert(ret.size() == 0);
+
+    if (oldest_write_conflict_snapshot != nullptr) {
+      *oldest_write_conflict_snapshot = kMaxSequenceNumber;
+    }
+
+    if (empty()) {
+      return;
+    }
+    const SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      if (s->next_->number_ > max_seq) {
+        break;
+      }
+      // Avoid duplicates
+      if (ret.empty() || ret.back() != s->next_->number_) {
+        ret.push_back(s->next_->number_);
+      }
+
+      if (oldest_write_conflict_snapshot != nullptr &&
+          *oldest_write_conflict_snapshot == kMaxSequenceNumber &&
+          s->next_->is_write_conflict_boundary_) {
+        // If this is the first write-conflict boundary snapshot in the list,
+        // it is the oldest
+        *oldest_write_conflict_snapshot = s->next_->number_;
+      }
+
+      s = s->next_;
+    }
+    return;
+  }
+
+  // get the sequence number of the most recent snapshot
+  SequenceNumber GetNewest() {
+    if (empty()) {
+      return 0;
+    }
+    return newest()->number_;
+  }
+
+  int64_t GetOldestSnapshotTime() const {
+    if (empty()) {
+      return 0;
+    } else {
+      return oldest()->unix_time_;
+    }
+  }
+
+  int64_t GetOldestSnapshotSequence() const {
+    if (empty()) {
+      return 0;
+    } else {
+      return oldest()->GetSequenceNumber();
+    }
+  }
+
+  uint64_t count() const { return count_; }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+  uint64_t count_;
+};
+
+// All operations on TimestampedSnapshotList must be protected by db mutex.
+class TimestampedSnapshotList {
+ public:
+  explicit TimestampedSnapshotList() = default;
+
+  std::shared_ptr<const SnapshotImpl> GetSnapshot(uint64_t ts) const {
+    if (ts == std::numeric_limits<uint64_t>::max() && !snapshots_.empty()) {
+      auto it = snapshots_.rbegin();
+      assert(it != snapshots_.rend());
+      return it->second;
+    }
+    auto it = snapshots_.find(ts);
+    if (it == snapshots_.end()) {
+      return std::shared_ptr<const SnapshotImpl>();
+    }
+    return it->second;
+  }
+
+  void GetSnapshots(
+      uint64_t ts_lb, uint64_t ts_ub,
+      std::vector<std::shared_ptr<const Snapshot>>& snapshots) const {
+    assert(ts_lb < ts_ub);
+    auto it_low = snapshots_.lower_bound(ts_lb);
+    auto it_high = snapshots_.lower_bound(ts_ub);
+    for (auto it = it_low; it != it_high; ++it) {
+      snapshots.emplace_back(it->second);
+    }
+  }
+
+  void AddSnapshot(const std::shared_ptr<const SnapshotImpl>& snapshot) {
+    assert(snapshot);
+    snapshots_.try_emplace(snapshot->GetTimestamp(), snapshot);
+  }
+
+  // snapshots_to_release: the container to where the timestamped snapshots will
+  // be moved so that it retains the last reference to the snapshots and the
+  // snapshots won't be actually released which requires db mutex. The
+  // snapshots will be released by caller of ReleaseSnapshotsOlderThan().
+  void ReleaseSnapshotsOlderThan(
+      uint64_t ts,
+      autovector<std::shared_ptr<const SnapshotImpl>>& snapshots_to_release) {
+    auto ub = snapshots_.lower_bound(ts);
+    for (auto it = snapshots_.begin(); it != ub; ++it) {
+      snapshots_to_release.emplace_back(it->second);
+    }
+    snapshots_.erase(snapshots_.begin(), ub);
+  }
+
+ private:
+  std::map<uint64_t, std::shared_ptr<const SnapshotImpl>> snapshots_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.cc b/src/rocksdb/db/table_cache.cc
new file mode 100644
index 000000000..c44c4bb84
--- /dev/null
+++ b/src/rocksdb/db/table_cache.cc
@@ -0,0 +1,753 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+template <class T>
+static void DeleteEntry(const Slice& /*key*/, void* value) {
+  T* typed_value = reinterpret_cast<T*>(value);
+  delete typed_value;
+}
+}  // anonymous namespace
+}  // namespace ROCKSDB_NAMESPACE
+
+// Generate the regular and coroutine versions of some methods by
+// including table_cache_sync_and_async.h twice
+// Macros in the header will expand differently based on whether
+// WITH_COROUTINES or WITHOUT_COROUTINES is defined
+// clang-format off
+#define WITHOUT_COROUTINES
+#include "db/table_cache_sync_and_async.h"
+#undef WITHOUT_COROUTINES
+#define WITH_COROUTINES
+#include "db/table_cache_sync_and_async.h"
+#undef WITH_COROUTINES
+// clang-format on
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+static Slice GetSliceForFileNumber(const uint64_t* file_number) {
+  return Slice(reinterpret_cast<const char*>(file_number),
+               sizeof(*file_number));
+}
+
+#ifndef ROCKSDB_LITE
+
+void AppendVarint64(IterKey* key, uint64_t v) {
+  char buf[10];
+  auto ptr = EncodeVarint64(buf, v);
+  key->TrimAppend(key->Size(), buf, ptr - buf);
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // anonymous namespace
+
+const int kLoadConcurency = 128;
+
+TableCache::TableCache(const ImmutableOptions& ioptions,
+                       const FileOptions* file_options, Cache* const cache,
+                       BlockCacheTracer* const block_cache_tracer,
+                       const std::shared_ptr<IOTracer>& io_tracer,
+                       const std::string& db_session_id)
+    : ioptions_(ioptions),
+      file_options_(*file_options),
+      cache_(cache),
+      immortal_tables_(false),
+      block_cache_tracer_(block_cache_tracer),
+      loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr),
+      io_tracer_(io_tracer),
+      db_session_id_(db_session_id) {
+  if (ioptions_.row_cache) {
+    // If the same cache is shared by multiple instances, we need to
+    // disambiguate its entries.
+    PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
+  }
+}
+
+TableCache::~TableCache() {}
+
+TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
+  return reinterpret_cast<TableReader*>(cache_->Value(handle));
+}
+
+void TableCache::ReleaseHandle(Cache::Handle* handle) {
+  cache_->Release(handle);
+}
+
+Status TableCache::GetTableReader(
+    const ReadOptions& ro, const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta, bool sequential_mode, bool record_read_stats,
+    HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
+    size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
+  std::string fname = TableFileName(
+      ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId());
+  std::unique_ptr<FSRandomAccessFile> file;
+  FileOptions fopts = file_options;
+  fopts.temperature = file_temperature;
+  Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
+  TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile",
+                           const_cast<Status*>(&s));
+  if (s.ok()) {
+    s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
+  }
+  if (s.ok()) {
+    RecordTick(ioptions_.stats, NO_FILE_OPENS);
+  } else if (s.IsPathNotFound()) {
+    fname = Rocks2LevelTableFileName(fname);
+    s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
+    if (s.ok()) {
+      s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
+                                            nullptr);
+    }
+    if (s.ok()) {
+      RecordTick(ioptions_.stats, NO_FILE_OPENS);
+    }
+  }
+
+  if (s.ok()) {
+    if (!sequential_mode && ioptions_.advise_random_on_open) {
+      file->Hint(FSRandomAccessFile::kRandom);
+    }
+    StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS);
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(
+            std::move(file), fname, ioptions_.clock, io_tracer_,
+            record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS,
+            file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners,
+            file_temperature, level == ioptions_.num_levels - 1));
+    UniqueId64x2 expected_unique_id;
+    if (ioptions_.verify_sst_unique_id_in_manifest) {
+      expected_unique_id = file_meta.unique_id;
+    } else {
+      expected_unique_id = kNullUniqueId64x2;  // null ID == no verification
+    }
+    s = ioptions_.table_factory->NewTableReader(
+        ro,
+        TableReaderOptions(ioptions_, prefix_extractor, file_options,
+                           internal_comparator, skip_filters, immortal_tables_,
+                           false /* force_direct_prefetch */, level,
+                           block_cache_tracer_, max_file_size_for_l0_meta_pin,
+                           db_session_id_, file_meta.fd.GetNumber(),
+                           expected_unique_id, file_meta.fd.largest_seqno),
+        std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
+        prefetch_index_and_filter_in_cache);
+    TEST_SYNC_POINT("TableCache::GetTableReader:0");
+  }
+  return s;
+}
+
+void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) {
+  ReleaseHandle(handle);
+  uint64_t number = fd.GetNumber();
+  Slice key = GetSliceForFileNumber(&number);
+  cache_->Erase(key);
+}
+
+Status TableCache::FindTable(
+    const ReadOptions& ro, const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta, Cache::Handle** handle,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist,
+    bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
+    size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
+  PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
+  uint64_t number = file_meta.fd.GetNumber();
+  Slice key = GetSliceForFileNumber(&number);
+  *handle = cache_->Lookup(key);
+  TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
+                           const_cast<bool*>(&no_io));
+
+  if (*handle == nullptr) {
+    if (no_io) {
+      return Status::Incomplete("Table not found in table_cache, no_io is set");
+    }
+    MutexLock load_lock(loader_mutex_.get(key));
+    // We check the cache again under loading mutex
+    *handle = cache_->Lookup(key);
+    if (*handle != nullptr) {
+      return Status::OK();
+    }
+
+    std::unique_ptr<TableReader> table_reader;
+    Status s =
+        GetTableReader(ro, file_options, internal_comparator, file_meta,
+                       false /* sequential mode */, record_read_stats,
+                       file_read_hist, &table_reader, prefix_extractor,
+                       skip_filters, level, prefetch_index_and_filter_in_cache,
+                       max_file_size_for_l0_meta_pin, file_temperature);
+    if (!s.ok()) {
+      assert(table_reader == nullptr);
+      RecordTick(ioptions_.stats, NO_FILE_ERRORS);
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+    } else {
+      s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry<TableReader>,
+                         handle);
+      if (s.ok()) {
+        // Release ownership of table reader.
+        table_reader.release();
+      }
+    }
+    return s;
+  }
+  return Status::OK();
+}
+
+InternalIterator* TableCache::NewIterator(
+    const ReadOptions& options, const FileOptions& file_options,
+    const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
+    RangeDelAggregator* range_del_agg,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+    TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+    size_t max_file_size_for_l0_meta_pin,
+    const InternalKey* smallest_compaction_key,
+    const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+    TruncatedRangeDelIterator** range_del_iter) {
+  PERF_TIMER_GUARD(new_table_iterator_nanos);
+
+  Status s;
+  TableReader* table_reader = nullptr;
+  Cache::Handle* handle = nullptr;
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = nullptr;
+  }
+  bool for_compaction = caller == TableReaderCaller::kCompaction;
+  auto& fd = file_meta.fd;
+  table_reader = fd.table_reader;
+  if (table_reader == nullptr) {
+    s = FindTable(
+        options, file_options, icomparator, file_meta, &handle,
+        prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
+        !for_compaction /* record_read_stats */, file_read_hist, skip_filters,
+        level, true /* prefetch_index_and_filter_in_cache */,
+        max_file_size_for_l0_meta_pin, file_meta.temperature);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(handle);
+    }
+  }
+  InternalIterator* result = nullptr;
+  if (s.ok()) {
+    if (options.table_filter &&
+        !options.table_filter(*table_reader->GetTableProperties())) {
+      result = NewEmptyInternalIterator<Slice>(arena);
+    } else {
+      result = table_reader->NewIterator(
+          options, prefix_extractor.get(), arena, skip_filters, caller,
+          file_options.compaction_readahead_size, allow_unprepared_value);
+    }
+    if (handle != nullptr) {
+      result->RegisterCleanup(&UnrefEntry, cache_, handle);
+      handle = nullptr;  // prevent from releasing below
+    }
+
+    if (for_compaction) {
+      table_reader->SetupForCompaction();
+    }
+    if (table_reader_ptr != nullptr) {
+      *table_reader_ptr = table_reader;
+    }
+  }
+  if (s.ok() && !options.ignore_range_deletions) {
+    if (range_del_iter != nullptr) {
+      auto new_range_del_iter =
+          table_reader->NewRangeTombstoneIterator(options);
+      if (new_range_del_iter == nullptr || new_range_del_iter->empty()) {
+        delete new_range_del_iter;
+        *range_del_iter = nullptr;
+      } else {
+        *range_del_iter = new TruncatedRangeDelIterator(
+            std::unique_ptr<FragmentedRangeTombstoneIterator>(
+                new_range_del_iter),
+            &icomparator, &file_meta.smallest, &file_meta.largest);
+      }
+    }
+    if (range_del_agg != nullptr) {
+      if (range_del_agg->AddFile(fd.GetNumber())) {
+        std::unique_ptr<FragmentedRangeTombstoneIterator> new_range_del_iter(
+            static_cast<FragmentedRangeTombstoneIterator*>(
+                table_reader->NewRangeTombstoneIterator(options)));
+        if (new_range_del_iter != nullptr) {
+          s = new_range_del_iter->status();
+        }
+        if (s.ok()) {
+          const InternalKey* smallest = &file_meta.smallest;
+          const InternalKey* largest = &file_meta.largest;
+          if (smallest_compaction_key != nullptr) {
+            smallest = smallest_compaction_key;
+          }
+          if (largest_compaction_key != nullptr) {
+            largest = largest_compaction_key;
+          }
+          range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest,
+                                       largest);
+        }
+      }
+    }
+  }
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  if (!s.ok()) {
+    assert(result == nullptr);
+    result = NewErrorInternalIterator<Slice>(s, arena);
+  }
+  return result;
+}
+
+Status TableCache::GetRangeTombstoneIterator(
+    const ReadOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
+  assert(out_iter);
+  const FileDescriptor& fd = file_meta.fd;
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  if (t == nullptr) {
+    s = FindTable(options, file_options_, internal_comparator, file_meta,
+                  &handle);
+    if (s.ok()) {
+      t = GetTableReaderFromHandle(handle);
+    }
+  }
+  if (s.ok()) {
+    // Note: NewRangeTombstoneIterator could return nullptr
+    out_iter->reset(t->NewRangeTombstoneIterator(options));
+  }
+  if (handle) {
+    if (*out_iter) {
+      (*out_iter)->RegisterCleanup(&UnrefEntry, cache_, handle);
+    } else {
+      ReleaseHandle(handle);
+    }
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
+                                         const FileDescriptor& fd,
+                                         const Slice& internal_key,
+                                         GetContext* get_context,
+                                         IterKey& row_cache_key) {
+  uint64_t fd_number = fd.GetNumber();
+  // We use the user key as cache key instead of the internal key,
+  // otherwise the whole cache would be invalidated every time the
+  // sequence key increases. However, to support caching snapshot
+  // reads, we append the sequence number (incremented by 1 to
+  // distinguish from 0) only in this case.
+  // If the snapshot is larger than the largest seqno in the file,
+  // all data should be exposed to the snapshot, so we treat it
+  // the same as there is no snapshot. The exception is that if
+  // a seq-checking callback is registered, some internal keys
+  // may still be filtered out.
+  uint64_t seq_no = 0;
+  // Maybe we can include the whole file ifsnapshot == fd.largest_seqno.
+  if (options.snapshot != nullptr &&
+      (get_context->has_callback() ||
+       static_cast_with_check<const SnapshotImpl>(options.snapshot)
+               ->GetSequenceNumber() <= fd.largest_seqno)) {
+    // We should consider to use options.snapshot->GetSequenceNumber()
+    // instead of GetInternalKeySeqno(k), which will make the code
+    // easier to understand.
+    seq_no = 1 + GetInternalKeySeqno(internal_key);
+  }
+
+  // Compute row cache key.
+  row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+                           row_cache_id_.size());
+  AppendVarint64(&row_cache_key, fd_number);
+  AppendVarint64(&row_cache_key, seq_no);
+}
+
+bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+                                 size_t prefix_size, GetContext* get_context) {
+  bool found = false;
+
+  row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
+  if (auto row_handle =
+          ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) {
+    // Cleanable routine to release the cache entry
+    Cleanable value_pinner;
+    auto release_cache_entry_func = [](void* cache_to_clean,
+                                       void* cache_handle) {
+      ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle);
+    };
+    auto found_row_cache_entry =
+        static_cast<const std::string*>(ioptions_.row_cache->Value(row_handle));
+    // If it comes here value is located on the cache.
+    // found_row_cache_entry points to the value on cache,
+    // and value_pinner has cleanup procedure for the cached entry.
+    // After replayGetContextLog() returns, get_context.pinnable_slice_
+    // will point to cache entry buffer (or a copy based on that) and
+    // cleanup routine under value_pinner will be delegated to
+    // get_context.pinnable_slice_. Cache entry is released when
+    // get_context.pinnable_slice_ is reset.
+    value_pinner.RegisterCleanup(release_cache_entry_func,
+                                 ioptions_.row_cache.get(), row_handle);
+    replayGetContextLog(*found_row_cache_entry, user_key, get_context,
+                        &value_pinner);
+    RecordTick(ioptions_.stats, ROW_CACHE_HIT);
+    found = true;
+  } else {
+    RecordTick(ioptions_.stats, ROW_CACHE_MISS);
+  }
+  return found;
+}
+#endif  // ROCKSDB_LITE
+
+Status TableCache::Get(
+    const ReadOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    HistogramImpl* file_read_hist, bool skip_filters, int level,
+    size_t max_file_size_for_l0_meta_pin) {
+  auto& fd = file_meta.fd;
+  std::string* row_cache_entry = nullptr;
+  bool done = false;
+#ifndef ROCKSDB_LITE
+  IterKey row_cache_key;
+  std::string row_cache_entry_buffer;
+
+  // Check row cache if enabled. Since row cache does not currently store
+  // sequence numbers, we cannot use it if we need to fetch the sequence.
+  if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
+    auto user_key = ExtractUserKey(k);
+    CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
+    done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
+                           get_context);
+    if (!done) {
+      row_cache_entry = &row_cache_entry_buffer;
+    }
+  }
+#endif  // ROCKSDB_LITE
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  if (!done) {
+    assert(s.ok());
+    if (t == nullptr) {
+      s = FindTable(options, file_options_, internal_comparator, file_meta,
+                    &handle, prefix_extractor,
+                    options.read_tier == kBlockCacheTier /* no_io */,
+                    true /* record_read_stats */, file_read_hist, skip_filters,
+                    level, true /* prefetch_index_and_filter_in_cache */,
+                    max_file_size_for_l0_meta_pin, file_meta.temperature);
+      if (s.ok()) {
+        t = GetTableReaderFromHandle(handle);
+      }
+    }
+    SequenceNumber* max_covering_tombstone_seq =
+        get_context->max_covering_tombstone_seq();
+    if (s.ok() && max_covering_tombstone_seq != nullptr &&
+        !options.ignore_range_deletions) {
+      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+          t->NewRangeTombstoneIterator(options));
+      if (range_del_iter != nullptr) {
+        SequenceNumber seq =
+            range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k));
+        if (seq > *max_covering_tombstone_seq) {
+          *max_covering_tombstone_seq = seq;
+          if (get_context->NeedTimestamp()) {
+            get_context->SetTimestampFromRangeTombstone(
+                range_del_iter->timestamp());
+          }
+        }
+      }
+    }
+    if (s.ok()) {
+      get_context->SetReplayLog(row_cache_entry);  // nullptr if no cache.
+      s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters);
+      get_context->SetReplayLog(nullptr);
+    } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+      // Couldn't find Table in cache but treat as kFound if no_io set
+      get_context->MarkKeyMayExist();
+      s = Status::OK();
+      done = true;
+    }
+  }
+
+#ifndef ROCKSDB_LITE
+  // Put the replay log in row cache only if something was found.
+  if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
+    size_t charge = row_cache_entry->capacity() + sizeof(std::string);
+    void* row_ptr = new std::string(std::move(*row_cache_entry));
+    // If row cache is full, it's OK to continue.
+    ioptions_.row_cache
+        ->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+                 &DeleteEntry<std::string>)
+        .PermitUncheckedError();
+  }
+#endif  // ROCKSDB_LITE
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  return s;
+}
+
+void TableCache::UpdateRangeTombstoneSeqnums(
+    const ReadOptions& options, TableReader* t,
+    MultiGetContext::Range& table_range) {
+  std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+      t->NewRangeTombstoneIterator(options));
+  if (range_del_iter != nullptr) {
+    for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+      SequenceNumber* max_covering_tombstone_seq =
+          iter->get_context->max_covering_tombstone_seq();
+      SequenceNumber seq =
+          range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts);
+      if (seq > *max_covering_tombstone_seq) {
+        *max_covering_tombstone_seq = seq;
+        if (iter->get_context->NeedTimestamp()) {
+          iter->get_context->SetTimestampFromRangeTombstone(
+              range_del_iter->timestamp());
+        }
+      }
+    }
+  }
+}
+
+Status TableCache::MultiGetFilter(
+    const ReadOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    HistogramImpl* file_read_hist, int level,
+    MultiGetContext::Range* mget_range, Cache::Handle** table_handle) {
+  auto& fd = file_meta.fd;
+#ifndef ROCKSDB_LITE
+  IterKey row_cache_key;
+  std::string row_cache_entry_buffer;
+
+  // Check if we need to use the row cache. If yes, then we cannot do the
+  // filtering here, since the filtering needs to happen after the row cache
+  // lookup.
+  KeyContext& first_key = *mget_range->begin();
+  if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) {
+    return Status::NotSupported();
+  }
+#endif  // ROCKSDB_LITE
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
+                                         mget_range->end());
+  if (t == nullptr) {
+    s = FindTable(
+        options, file_options_, internal_comparator, file_meta, &handle,
+        prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
+        true /* record_read_stats */, file_read_hist, /*skip_filters=*/false,
+        level, true /* prefetch_index_and_filter_in_cache */,
+        /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
+    if (s.ok()) {
+      t = GetTableReaderFromHandle(handle);
+    }
+    *table_handle = handle;
+  }
+  if (s.ok()) {
+    s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
+  }
+  if (s.ok() && !options.ignore_range_deletions) {
+    // Update the range tombstone sequence numbers for the keys here
+    // as TableCache::MultiGet may or may not be called, and even if it
+    // is, it may be called with fewer keys in the rangedue to filtering.
+    UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
+  }
+  if (mget_range->empty() && handle) {
+    ReleaseHandle(handle);
+    *table_handle = nullptr;
+  }
+
+  return s;
+}
+
+Status TableCache::GetTableProperties(
+    const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    std::shared_ptr<const TableProperties>* properties,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor, bool no_io) {
+  auto table_reader = file_meta.fd.table_reader;
+  // table already been pre-loaded?
+  if (table_reader) {
+    *properties = table_reader->GetTableProperties();
+
+    return Status::OK();
+  }
+
+  Cache::Handle* table_handle = nullptr;
+  Status s = FindTable(ReadOptions(), file_options, internal_comparator,
+                       file_meta, &table_handle, prefix_extractor, no_io);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(table_handle);
+  auto table = GetTableReaderFromHandle(table_handle);
+  *properties = table->GetTableProperties();
+  ReleaseHandle(table_handle);
+  return s;
+}
+
+Status TableCache::ApproximateKeyAnchors(
+    const ReadOptions& ro, const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta, std::vector<TableReader::Anchor>& anchors) {
+  Status s;
+  TableReader* t = file_meta.fd.table_reader;
+  Cache::Handle* handle = nullptr;
+  if (t == nullptr) {
+    s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle);
+    if (s.ok()) {
+      t = GetTableReaderFromHandle(handle);
+    }
+  }
+  if (s.ok() && t != nullptr) {
+    s = t->ApproximateKeyAnchors(ro, anchors);
+  }
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  return s;
+}
+
+size_t TableCache::GetMemoryUsageByTableReader(
+    const FileOptions& file_options,
+    const InternalKeyComparator& internal_comparator,
+    const FileMetaData& file_meta,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+  auto table_reader = file_meta.fd.table_reader;
+  // table already been pre-loaded?
+  if (table_reader) {
+    return table_reader->ApproximateMemoryUsage();
+  }
+
+  Cache::Handle* table_handle = nullptr;
+  Status s = FindTable(ReadOptions(), file_options, internal_comparator,
+                       file_meta, &table_handle, prefix_extractor, true);
+  if (!s.ok()) {
+    return 0;
+  }
+  assert(table_handle);
+  auto table = GetTableReaderFromHandle(table_handle);
+  auto ret = table->ApproximateMemoryUsage();
+  ReleaseHandle(table_handle);
+  return ret;
+}
+
+bool TableCache::HasEntry(Cache* cache, uint64_t file_number) {
+  Cache::Handle* handle = cache->Lookup(GetSliceForFileNumber(&file_number));
+  if (handle) {
+    cache->Release(handle);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void TableCache::Evict(Cache* cache, uint64_t file_number) {
+  cache->Erase(GetSliceForFileNumber(&file_number));
+}
+
+uint64_t TableCache::ApproximateOffsetOf(
+    const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller,
+    const InternalKeyComparator& internal_comparator,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+  uint64_t result = 0;
+  TableReader* table_reader = file_meta.fd.table_reader;
+  Cache::Handle* table_handle = nullptr;
+  if (table_reader == nullptr) {
+    const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+    Status s =
+        FindTable(ReadOptions(), file_options_, internal_comparator, file_meta,
+                  &table_handle, prefix_extractor, false /* no_io */,
+                  !for_compaction /* record_read_stats */);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(table_handle);
+    }
+  }
+
+  if (table_reader != nullptr) {
+    result = table_reader->ApproximateOffsetOf(key, caller);
+  }
+  if (table_handle != nullptr) {
+    ReleaseHandle(table_handle);
+  }
+
+  return result;
+}
+
+uint64_t TableCache::ApproximateSize(
+    const Slice& start, const Slice& end, const FileMetaData& file_meta,
+    TableReaderCaller caller, const InternalKeyComparator& internal_comparator,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+  uint64_t result = 0;
+  TableReader* table_reader = file_meta.fd.table_reader;
+  Cache::Handle* table_handle = nullptr;
+  if (table_reader == nullptr) {
+    const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+    Status s =
+        FindTable(ReadOptions(), file_options_, internal_comparator, file_meta,
+                  &table_handle, prefix_extractor, false /* no_io */,
+                  !for_compaction /* record_read_stats */);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(table_handle);
+    }
+  }
+
+  if (table_reader != nullptr) {
+    result = table_reader->ApproximateSize(start, end, caller);
+  }
+  if (table_handle != nullptr) {
+    ReleaseHandle(table_handle);
+  }
+
+  return result;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
new file mode 100644
index 000000000..2e50f2c77
--- /dev/null
+++ b/src/rocksdb/db/table_cache.h
@@ -0,0 +1,275 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coro_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Arena;
+struct FileDescriptor;
+class GetContext;
+class HistogramImpl;
+
+// Manages caching for TableReader objects for a column family. The actual
+// cache is allocated separately and passed to the constructor. TableCache
+// wraps around the underlying SST file readers by providing Get(),
+// MultiGet() and NewIterator() methods that hide the instantiation,
+// caching and access to the TableReader. The main purpose of this is
+// performance - by caching the TableReader, it avoids unnecessary file opens
+// and object allocation and instantiation. One exception is compaction, where
+// a new TableReader may be instantiated - see NewIterator() comments
+//
+// Another service provided by TableCache is managing the row cache - if the
+// DB is configured with a row cache, and the lookup key is present in the row
+// cache, lookup is very fast. The row cache is obtained from
+// ioptions.row_cache
+class TableCache {
+ public:
+  TableCache(const ImmutableOptions& ioptions,
+             const FileOptions* storage_options, Cache* cache,
+             BlockCacheTracer* const block_cache_tracer,
+             const std::shared_ptr<IOTracer>& io_tracer,
+             const std::string& db_session_id);
+  ~TableCache();
+
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes).  If "table_reader_ptr"
+  // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
+  // underlying the returned iterator, or nullptr if no Table object underlies
+  // the returned iterator.  The returned "*table_reader_ptr" object is owned
+  // by the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
+  // If !options.ignore_range_deletions, and range_del_iter is non-nullptr,
+  // then range_del_iter is set to a TruncatedRangeDelIterator for range
+  // tombstones in the SST file corresponding to the specified file number. The
+  // upper/lower bounds for the TruncatedRangeDelIterator are set to the SST
+  // file's boundary.
+  // @param options Must outlive the returned iterator.
+  // @param range_del_agg If non-nullptr, adds range deletions to the
+  //    aggregator. If an error occurs, returns it in a NewErrorInternalIterator
+  // @param for_compaction If true, a new TableReader may be allocated (but
+  //                       not cached), depending on the CF options
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level The level this table is at, -1 for "not set / don't know"
+  InternalIterator* NewIterator(
+      const ReadOptions& options, const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+      TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+      size_t max_file_size_for_l0_meta_pin,
+      const InternalKey* smallest_compaction_key,
+      const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+      TruncatedRangeDelIterator** range_del_iter = nullptr);
+
+  // If a seek to internal key "k" in specified file finds an entry,
+  // call get_context->SaveValue() repeatedly until
+  // it returns false. As a side effect, it will insert the TableReader
+  // into the cache and potentially evict another entry
+  // @param get_context Context for get operation. The result of the lookup
+  //                    can be retrieved by calling get_context->State()
+  // @param file_read_hist If non-nullptr, the file reader statistics are
+  //                       recorded
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level The level this table is at, -1 for "not set / don't know"
+  Status Get(
+      const ReadOptions& options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+      int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
+
+  // Return the range delete tombstone iterator of the file specified by
+  // `file_meta`.
+  Status GetRangeTombstoneIterator(
+      const ReadOptions& options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta,
+      std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
+
+  // Call table reader's MultiGetFilter to use the bloom filter to filter out
+  // keys. Returns Status::NotSupported() if row cache needs to be checked.
+  // If the table cache is looked up to get the table reader, the cache handle
+  // is returned in table_handle. This handle should be passed back to
+  // MultiGet() so it can be released.
+  Status MultiGetFilter(
+      const ReadOptions& options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      HistogramImpl* file_read_hist, int level,
+      MultiGetContext::Range* mget_range, Cache::Handle** table_handle);
+
+  // If a seek to internal key "k" in specified file finds an entry,
+  // call get_context->SaveValue() repeatedly until
+  // it returns false. As a side effect, it will insert the TableReader
+  // into the cache and potentially evict another entry
+  // @param mget_range Pointer to the structure describing a batch of keys to
+  //                   be looked up in this table file. The result is stored
+  //                   in the embedded GetContext
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level The level this table is at, -1 for "not set / don't know"
+  DECLARE_SYNC_AND_ASYNC(
+      Status, MultiGet, const ReadOptions& options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+      bool skip_range_deletions = false, int level = -1,
+      Cache::Handle* table_handle = nullptr);
+
+  // Evict any entry for the specified file number
+  static void Evict(Cache* cache, uint64_t file_number);
+
+  // Query whether specified file number is currently in cache
+  static bool HasEntry(Cache* cache, uint64_t file_number);
+
+  // Clean table handle and erase it from the table cache
+  // Used in DB close, or the file is not live anymore.
+  void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
+
+  // Find table reader
+  // @param skip_filters Disables loading/accessing the filter block
+  // @param level == -1 means not specified
+  Status FindTable(
+      const ReadOptions& ro, const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, Cache::Handle**,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      const bool no_io = false, bool record_read_stats = true,
+      HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+      int level = -1, bool prefetch_index_and_filter_in_cache = true,
+      size_t max_file_size_for_l0_meta_pin = 0,
+      Temperature file_temperature = Temperature::kUnknown);
+
+  // Get TableReader from a cache handle.
+  TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+  // Get the table properties of a given table.
+  // @no_io: indicates if we should load table to the cache if it is not present
+  //         in table cache yet.
+  // @returns: `properties` will be reset on success. Please note that we will
+  //            return Status::Incomplete() if table is not present in cache and
+  //            we set `no_io` to be true.
+  Status GetTableProperties(
+      const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta,
+      std::shared_ptr<const TableProperties>* properties,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      bool no_io = false);
+
+  Status ApproximateKeyAnchors(const ReadOptions& ro,
+                               const InternalKeyComparator& internal_comparator,
+                               const FileMetaData& file_meta,
+                               std::vector<TableReader::Anchor>& anchors);
+
+  // Return total memory usage of the table reader of the file.
+  // 0 if table reader of the file is not loaded.
+  size_t GetMemoryUsageByTableReader(
+      const FileOptions& toptions,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+  // Returns approximated offset of a key in a file represented by fd.
+  uint64_t ApproximateOffsetOf(
+      const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller,
+      const InternalKeyComparator& internal_comparator,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+  // Returns approximated data size between start and end keys in a file
+  // represented by fd (the start key must not be greater than the end key).
+  uint64_t ApproximateSize(
+      const Slice& start, const Slice& end, const FileMetaData& file_meta,
+      TableReaderCaller caller,
+      const InternalKeyComparator& internal_comparator,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+  // Release the handle from a cache
+  void ReleaseHandle(Cache::Handle* handle);
+
+  Cache* get_cache() const { return cache_; }
+
+  // Capacity of the backing Cache that indicates infinite TableCache capacity.
+  // For example when max_open_files is -1 we set the backing Cache to this.
+  static const int kInfiniteCapacity = 0x400000;
+
+  // The tables opened with this TableCache will be immortal, i.e., their
+  // lifetime is as long as that of the DB.
+  void SetTablesAreImmortal() {
+    if (cache_->GetCapacity() >= kInfiniteCapacity) {
+      immortal_tables_ = true;
+    }
+  }
+
+ private:
+  // Build a table reader
+  Status GetTableReader(
+      const ReadOptions& ro, const FileOptions& file_options,
+      const InternalKeyComparator& internal_comparator,
+      const FileMetaData& file_meta, bool sequential_mode,
+      bool record_read_stats, HistogramImpl* file_read_hist,
+      std::unique_ptr<TableReader>* table_reader,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      bool skip_filters = false, int level = -1,
+      bool prefetch_index_and_filter_in_cache = true,
+      size_t max_file_size_for_l0_meta_pin = 0,
+      Temperature file_temperature = Temperature::kUnknown);
+
+  // Update the max_covering_tombstone_seq in the GetContext for each key based
+  // on the range deletions in the table
+  void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
+                                   MultiGetContext::Range& table_range);
+
+  // Create a key prefix for looking up the row cache. The prefix is of the
+  // format row_cache_id + fd_number + seq_no. Later, the user key can be
+  // appended to form the full key
+  void CreateRowCacheKeyPrefix(const ReadOptions& options,
+                               const FileDescriptor& fd,
+                               const Slice& internal_key,
+                               GetContext* get_context, IterKey& row_cache_key);
+
+  // Helper function to lookup the row cache for a key. It appends the
+  // user key to row_cache_key at offset prefix_size
+  bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+                       size_t prefix_size, GetContext* get_context);
+
+  const ImmutableOptions& ioptions_;
+  const FileOptions& file_options_;
+  Cache* const cache_;
+  std::string row_cache_id_;
+  bool immortal_tables_;
+  BlockCacheTracer* const block_cache_tracer_;
+  Striped<port::Mutex, Slice> loader_mutex_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  std::string db_session_id_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache_sync_and_async.h b/src/rocksdb/db/table_cache_sync_and_async.h
new file mode 100644
index 000000000..e72abdd45
--- /dev/null
+++ b/src/rocksdb/db/table_cache_sync_and_async.h
@@ -0,0 +1,135 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+    (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+namespace ROCKSDB_NAMESPACE {
+
+#if defined(WITHOUT_COROUTINES)
+#endif
+
+// Batched version of TableCache::MultiGet.
+DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
+(const ReadOptions& options, const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions,
+ int level, Cache::Handle* table_handle) {
+  auto& fd = file_meta.fd;
+  Status s;
+  TableReader* t = fd.table_reader;
+  Cache::Handle* handle = table_handle;
+  MultiGetRange table_range(*mget_range, mget_range->begin(),
+                            mget_range->end());
+  if (handle != nullptr && t == nullptr) {
+    t = GetTableReaderFromHandle(handle);
+  }
+#ifndef ROCKSDB_LITE
+  autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
+  IterKey row_cache_key;
+  size_t row_cache_key_prefix_size = 0;
+  KeyContext& first_key = *table_range.begin();
+  bool lookup_row_cache =
+      ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();
+
+  // Check row cache if enabled. Since row cache does not currently store
+  // sequence numbers, we cannot use it if we need to fetch the sequence.
+  if (lookup_row_cache) {
+    GetContext* first_context = first_key.get_context;
+    CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
+                            row_cache_key);
+    row_cache_key_prefix_size = row_cache_key.Size();
+
+    for (auto miter = table_range.begin(); miter != table_range.end();
+         ++miter) {
+      const Slice& user_key = miter->ukey_with_ts;
+
+      GetContext* get_context = miter->get_context;
+
+      if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
+                          get_context)) {
+        table_range.SkipKey(miter);
+      } else {
+        row_cache_entries.emplace_back();
+        get_context->SetReplayLog(&(row_cache_entries.back()));
+      }
+    }
+  }
+#endif  // ROCKSDB_LITE
+
+  // Check that table_range is not empty. Its possible all keys may have been
+  // found in the row cache and thus the range may now be empty
+  if (s.ok() && !table_range.empty()) {
+    if (t == nullptr) {
+      assert(handle == nullptr);
+      s = FindTable(options, file_options_, internal_comparator, file_meta,
+                    &handle, prefix_extractor,
+                    options.read_tier == kBlockCacheTier /* no_io */,
+                    true /* record_read_stats */, file_read_hist, skip_filters,
+                    level, true /* prefetch_index_and_filter_in_cache */,
+                    0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature);
+      TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
+      if (s.ok()) {
+        t = GetTableReaderFromHandle(handle);
+        assert(t);
+      }
+    }
+    if (s.ok() && !options.ignore_range_deletions && !skip_range_deletions) {
+      UpdateRangeTombstoneSeqnums(options, t, table_range);
+    }
+    if (s.ok()) {
+      CO_AWAIT(t->MultiGet)
+      (options, &table_range, prefix_extractor.get(), skip_filters);
+    } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+      for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+        Status* status = iter->s;
+        if (status->IsIncomplete()) {
+          // Couldn't find Table in cache but treat as kFound if no_io set
+          iter->get_context->MarkKeyMayExist();
+          s = Status::OK();
+        }
+      }
+    }
+  }
+
+#ifndef ROCKSDB_LITE
+  if (lookup_row_cache) {
+    size_t row_idx = 0;
+
+    for (auto miter = table_range.begin(); miter != table_range.end();
+         ++miter) {
+      std::string& row_cache_entry = row_cache_entries[row_idx++];
+      const Slice& user_key = miter->ukey_with_ts;
+      ;
+      GetContext* get_context = miter->get_context;
+
+      get_context->SetReplayLog(nullptr);
+      // Compute row cache key.
+      row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
+                               user_key.size());
+      // Put the replay log in row cache only if something was found.
+      if (s.ok() && !row_cache_entry.empty()) {
+        size_t charge = row_cache_entry.capacity() + sizeof(std::string);
+        void* row_ptr = new std::string(std::move(row_cache_entry));
+        // If row cache is full, it's OK.
+        ioptions_.row_cache
+            ->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+                     &DeleteEntry<std::string>)
+            .PermitUncheckedError();
+      }
+    }
+  }
+#endif  // ROCKSDB_LITE
+
+  if (handle != nullptr) {
+    ReleaseHandle(handle);
+  }
+  CO_RETURN s;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/db/table_properties_collector.cc b/src/rocksdb/db/table_properties_collector.cc
new file mode 100644
index 000000000..edb9a1b63
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.cc
@@ -0,0 +1,74 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/table_properties_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint64_t GetUint64Property(const UserCollectedProperties& props,
+                           const std::string& property_name,
+                           bool* property_present) {
+  auto pos = props.find(property_name);
+  if (pos == props.end()) {
+    *property_present = false;
+    return 0;
+  }
+  Slice raw = pos->second;
+  uint64_t val = 0;
+  *property_present = true;
+  return GetVarint64(&raw, &val) ? val : 0;
+}
+
+}  // anonymous namespace
+
+Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key,
+                                                    const Slice& value,
+                                                    uint64_t file_size) {
+  ParsedInternalKey ikey;
+  Status s = ParseInternalKey(key, &ikey, false /* log_err_key */);  // TODO
+  if (!s.ok()) {
+    return s;
+  }
+
+  return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type),
+                                ikey.sequence, file_size);
+}
+
+void UserKeyTablePropertiesCollector::BlockAdd(
+    uint64_t block_uncomp_bytes, uint64_t block_compressed_bytes_fast,
+    uint64_t block_compressed_bytes_slow) {
+  return collector_->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast,
+                              block_compressed_bytes_slow);
+}
+
+Status UserKeyTablePropertiesCollector::Finish(
+    UserCollectedProperties* properties) {
+  return collector_->Finish(properties);
+}
+
+UserCollectedProperties UserKeyTablePropertiesCollector::GetReadableProperties()
+    const {
+  return collector_->GetReadableProperties();
+}
+
+uint64_t GetDeletedKeys(const UserCollectedProperties& props) {
+  bool property_present_ignored;
+  return GetUint64Property(props, TablePropertiesNames::kDeletedKeys,
+                           &property_present_ignored);
+}
+
+uint64_t GetMergeOperands(const UserCollectedProperties& props,
+                          bool* property_present) {
+  return GetUint64Property(props, TablePropertiesNames::kMergeOperands,
+                           property_present);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_properties_collector.h b/src/rocksdb/db/table_properties_collector.h
new file mode 100644
index 000000000..9035ba793
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.h
@@ -0,0 +1,175 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Base class for internal table properties collector.
+class IntTblPropCollector {
+ public:
+  virtual ~IntTblPropCollector() {}
+  virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+  virtual const char* Name() const = 0;
+
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) = 0;
+
+  virtual void BlockAdd(uint64_t block_uncomp_bytes,
+                        uint64_t block_compressed_bytes_fast,
+                        uint64_t block_compressed_bytes_slow) = 0;
+
+  virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+  virtual bool NeedCompact() const { return false; }
+};
+
+// Factory for internal table properties collector.
+class IntTblPropCollectorFactory {
+ public:
+  virtual ~IntTblPropCollectorFactory() {}
+  // has to be thread-safe
+  virtual IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t column_family_id, int level_at_creation) = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
+};
+
+using IntTblPropCollectorFactories =
+    std::vector<std::unique_ptr<IntTblPropCollectorFactory>>;
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contains meta information of a given entry.
+//
+// This class extracts user key from the encoded internal key when Add() is
+// invoked.
+class UserKeyTablePropertiesCollector : public IntTblPropCollector {
+ public:
+  // transfer of ownership
+  explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
+      : collector_(collector) {}
+
+  virtual ~UserKeyTablePropertiesCollector() {}
+
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) override;
+
+  virtual void BlockAdd(uint64_t block_uncomp_bytes,
+                        uint64_t block_compressed_bytes_fast,
+                        uint64_t block_compressed_bytes_slow) override;
+
+  virtual Status Finish(UserCollectedProperties* properties) override;
+
+  virtual const char* Name() const override { return collector_->Name(); }
+
+  UserCollectedProperties GetReadableProperties() const override;
+
+  virtual bool NeedCompact() const override {
+    return collector_->NeedCompact();
+  }
+
+ protected:
+  std::unique_ptr<TablePropertiesCollector> collector_;
+};
+
+class UserKeyTablePropertiesCollectorFactory
+    : public IntTblPropCollectorFactory {
+ public:
+  explicit UserKeyTablePropertiesCollectorFactory(
+      std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
+      : user_collector_factory_(user_collector_factory) {}
+  virtual IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t column_family_id, int level_at_creation) override {
+    TablePropertiesCollectorFactory::Context context;
+    context.column_family_id = column_family_id;
+    context.level_at_creation = level_at_creation;
+    return new UserKeyTablePropertiesCollector(
+        user_collector_factory_->CreateTablePropertiesCollector(context));
+  }
+
+  virtual const char* Name() const override {
+    return user_collector_factory_->Name();
+  }
+
+ private:
+  std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
+};
+
+// When rocksdb creates a newtable, it will encode all "user keys" into
+// "internal keys". This class collects min/max timestamp from the encoded
+// internal key when Add() is invoked.
+//
+// @param cmp  the user comparator to compare the timestamps in internal key.
+class TimestampTablePropertiesCollector : public IntTblPropCollector {
+ public:
+  explicit TimestampTablePropertiesCollector(const Comparator* cmp)
+      : cmp_(cmp),
+        timestamp_min_(kDisableUserTimestamp),
+        timestamp_max_(kDisableUserTimestamp) {}
+
+  Status InternalAdd(const Slice& key, const Slice& /* value */,
+                     uint64_t /* file_size */) override {
+    auto user_key = ExtractUserKey(key);
+    assert(cmp_ && cmp_->timestamp_size() > 0);
+    if (user_key.size() < cmp_->timestamp_size()) {
+      return Status::Corruption(
+          "User key size mismatch when comparing to timestamp size.");
+    }
+    auto timestamp_in_key =
+        ExtractTimestampFromUserKey(user_key, cmp_->timestamp_size());
+    if (timestamp_max_ == kDisableUserTimestamp ||
+        cmp_->CompareTimestamp(timestamp_in_key, timestamp_max_) > 0) {
+      timestamp_max_.assign(timestamp_in_key.data(), timestamp_in_key.size());
+    }
+    if (timestamp_min_ == kDisableUserTimestamp ||
+        cmp_->CompareTimestamp(timestamp_min_, timestamp_in_key) > 0) {
+      timestamp_min_.assign(timestamp_in_key.data(), timestamp_in_key.size());
+    }
+    return Status::OK();
+  }
+
+  void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                uint64_t /* block_compressed_bytes_fast */,
+                uint64_t /* block_compressed_bytes_slow */) override {
+    return;
+  }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    assert(timestamp_min_.size() == timestamp_max_.size() &&
+           timestamp_max_.size() == cmp_->timestamp_size());
+    properties->insert({"rocksdb.timestamp_min", timestamp_min_});
+    properties->insert({"rocksdb.timestamp_max", timestamp_max_});
+    return Status::OK();
+  }
+
+  const char* Name() const override {
+    return "TimestampTablePropertiesCollector";
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return {{"rocksdb.timestamp_min", Slice(timestamp_min_).ToString(true)},
+            {"rocksdb.timestamp_max", Slice(timestamp_max_).ToString(true)}};
+  }
+
+ protected:
+  const Comparator* const cmp_;
+  std::string timestamp_min_;
+  std::string timestamp_max_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_properties_collector_test.cc b/src/rocksdb/db/table_properties_collector_test.cc
new file mode 100644
index 000000000..5f0f205da
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector_test.cc
@@ -0,0 +1,513 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/table_properties_collector.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TablePropertiesTest : public testing::Test,
+                            public testing::WithParamInterface<bool> {
+ public:
+  void SetUp() override { backward_mode_ = GetParam(); }
+
+  bool backward_mode_;
+};
+
+// Utilities test functions
+namespace {
+static const uint32_t kTestColumnFamilyId = 66;
+static const std::string kTestColumnFamilyName = "test_column_fam";
+static const int kTestLevel = 1;
+
+void MakeBuilder(
+    const Options& options, const ImmutableOptions& ioptions,
+    const MutableCFOptions& moptions,
+    const InternalKeyComparator& internal_comparator,
+    const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+    std::unique_ptr<WritableFileWriter>* writable,
+    std::unique_ptr<TableBuilder>* builder) {
+  std::unique_ptr<FSWritableFile> wf(new test::StringSink);
+  writable->reset(
+      new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions()));
+  TableBuilderOptions tboptions(
+      ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories,
+      options.compression, options.compression_opts, kTestColumnFamilyId,
+      kTestColumnFamilyName, kTestLevel);
+  builder->reset(NewTableBuilder(tboptions, writable->get()));
+}
+}  // namespace
+
+// Collects keys that starts with "A" in a table.
+class RegularKeysStartWithA : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    std::string encoded_num_puts;
+    std::string encoded_num_deletes;
+    std::string encoded_num_single_deletes;
+    std::string encoded_num_size_changes;
+    PutVarint32(&encoded, count_);
+    PutVarint32(&encoded_num_puts, num_puts_);
+    PutVarint32(&encoded_num_deletes, num_deletes_);
+    PutVarint32(&encoded_num_single_deletes, num_single_deletes_);
+    PutVarint32(&encoded_num_size_changes, num_size_changes_);
+    *properties = UserCollectedProperties{
+        {"TablePropertiesTest", message_},
+        {"Count", encoded},
+        {"NumPuts", encoded_num_puts},
+        {"NumDeletes", encoded_num_deletes},
+        {"NumSingleDeletes", encoded_num_single_deletes},
+        {"NumSizeChanges", encoded_num_size_changes},
+    };
+    return Status::OK();
+  }
+
+  Status AddUserKey(const Slice& user_key, const Slice& /*value*/,
+                    EntryType type, SequenceNumber /*seq*/,
+                    uint64_t file_size) override {
+    // simply asssume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    if (type == kEntryPut) {
+      num_puts_++;
+    } else if (type == kEntryDelete) {
+      num_deletes_++;
+    } else if (type == kEntrySingleDelete) {
+      num_single_deletes_++;
+    }
+    if (file_size < file_size_) {
+      message_ = "File size should not decrease.";
+    } else if (file_size != file_size_) {
+      num_size_changes_++;
+    }
+
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  std::string message_ = "Rocksdb";
+  uint32_t count_ = 0;
+  uint32_t num_puts_ = 0;
+  uint32_t num_deletes_ = 0;
+  uint32_t num_single_deletes_ = 0;
+  uint32_t num_size_changes_ = 0;
+  uint64_t file_size_ = 0;
+};
+
+// Collects keys that starts with "A" in a table. Backward compatible mode
+// It is also used to test internal key table property collector
+class RegularKeysStartWithABackwardCompatible
+    : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+                                          {"Count", encoded}};
+    return Status::OK();
+  }
+
+  Status Add(const Slice& user_key, const Slice& /*value*/) override {
+    // simply asssume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAInternal : public IntTblPropCollector {
+ public:
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+                                          {"Count", encoded}};
+    return Status::OK();
+  }
+
+  Status InternalAdd(const Slice& user_key, const Slice& /*value*/,
+                     uint64_t /*file_size*/) override {
+    // simply asssume all user keys are not empty.
+    if (user_key.data()[0] == 'A') {
+      ++count_;
+    }
+    return Status::OK();
+  }
+
+  void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                uint64_t /* block_compressed_bytes_fast */,
+                uint64_t /* block_compressed_bytes_slow */) override {
+    // Nothing to do.
+    return;
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+ private:
+  uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory,
+                                     public TablePropertiesCollectorFactory {
+ public:
+  explicit RegularKeysStartWithAFactory(bool backward_mode)
+      : backward_mode_(backward_mode) {}
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) override {
+    EXPECT_EQ(kTestColumnFamilyId, context.column_family_id);
+    EXPECT_EQ(kTestLevel, context.level_at_creation);
+    if (!backward_mode_) {
+      return new RegularKeysStartWithA();
+    } else {
+      return new RegularKeysStartWithABackwardCompatible();
+    }
+  }
+  IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t /*column_family_id*/, int /* level_at_creation */) override {
+    return new RegularKeysStartWithAInternal();
+  }
+  const char* Name() const override { return "RegularKeysStartWithA"; }
+
+  bool backward_mode_;
+};
+
+class FlushBlockEveryThreePolicy : public FlushBlockPolicy {
+ public:
+  bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+    return (++count_ % 3U == 0);
+  }
+
+ private:
+  uint64_t count_ = 0;
+};
+
+class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  explicit FlushBlockEveryThreePolicyFactory() {}
+
+  const char* Name() const override {
+    return "FlushBlockEveryThreePolicyFactory";
+  }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& /*table_options*/,
+      const BlockBuilder& /*data_block_builder*/) const override {
+    return new FlushBlockEveryThreePolicy;
+  }
+};
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+namespace {
+void TestCustomizedTablePropertiesCollector(
+    bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector,
+    const Options& options, const InternalKeyComparator& internal_comparator) {
+  // make sure the entries will be inserted with order.
+  std::map<std::pair<std::string, ValueType>, std::string> kvs = {
+      {{"About   ", kTypeValue}, "val5"},  // starts with 'A'
+      {{"Abstract", kTypeValue}, "val2"},  // starts with 'A'
+      {{"Around  ", kTypeValue}, "val7"},  // starts with 'A'
+      {{"Beyond  ", kTypeValue}, "val3"},
+      {{"Builder ", kTypeValue}, "val1"},
+      {{"Love    ", kTypeDeletion}, ""},
+      {{"Cancel  ", kTypeValue}, "val4"},
+      {{"Find    ", kTypeValue}, "val6"},
+      {{"Rocks   ", kTypeDeletion}, ""},
+      {{"Foo     ", kTypeSingleDeletion}, ""},
+  };
+
+  // -- Step 1: build table
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<WritableFileWriter> writer;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  if (test_int_tbl_prop_collector) {
+    int_tbl_prop_collector_factories.emplace_back(
+        new RegularKeysStartWithAFactory(backward_mode));
+  } else {
+    GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+  }
+  MakeBuilder(options, ioptions, moptions, internal_comparator,
+              &int_tbl_prop_collector_factories, &writer, &builder);
+
+  SequenceNumber seqNum = 0U;
+  for (const auto& kv : kvs) {
+    InternalKey ikey(kv.first.first, seqNum++, kv.first.second);
+    builder->Add(ikey.Encode(), kv.second);
+  }
+  ASSERT_OK(builder->Finish());
+  ASSERT_OK(writer->Flush());
+
+  // -- Step 2: Read properties
+  test::StringSink* fwf =
+      static_cast<test::StringSink*>(writer->writable_file());
+  std::unique_ptr<FSRandomAccessFile> source(
+      new test::StringSource(fwf->contents()));
+  std::unique_ptr<RandomAccessFileReader> fake_file_reader(
+      new RandomAccessFileReader(std::move(source), "test"));
+
+  std::unique_ptr<TableProperties> props;
+  Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(),
+                                 magic_number, ioptions, &props);
+  ASSERT_OK(s);
+
+  auto user_collected = props->user_collected_properties;
+
+  ASSERT_NE(user_collected.find("TablePropertiesTest"), user_collected.end());
+  ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
+
+  uint32_t starts_with_A = 0;
+  ASSERT_NE(user_collected.find("Count"), user_collected.end());
+  Slice key(user_collected.at("Count"));
+  ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+  ASSERT_EQ(3u, starts_with_A);
+
+  if (!backward_mode && !test_int_tbl_prop_collector) {
+    uint32_t num_puts;
+    ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+    Slice key_puts(user_collected.at("NumPuts"));
+    ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+    ASSERT_EQ(7u, num_puts);
+
+    uint32_t num_deletes;
+    ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+    Slice key_deletes(user_collected.at("NumDeletes"));
+    ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+    ASSERT_EQ(2u, num_deletes);
+
+    uint32_t num_single_deletes;
+    ASSERT_NE(user_collected.find("NumSingleDeletes"), user_collected.end());
+    Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+    ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+    ASSERT_EQ(1u, num_single_deletes);
+
+    uint32_t num_size_changes;
+    ASSERT_NE(user_collected.find("NumSizeChanges"), user_collected.end());
+    Slice key_size_changes(user_collected.at("NumSizeChanges"));
+    ASSERT_TRUE(GetVarint32(&key_size_changes, &num_size_changes));
+    ASSERT_GE(num_size_changes, 2u);
+  }
+}
+}  // namespace
+
+TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+  // Test properties collectors with internal keys or regular keys
+  // for block based table
+  for (bool encode_as_internal : {true, false}) {
+    Options options;
+    BlockBasedTableOptions table_options;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryThreePolicyFactory>();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    test::PlainInternalKeyComparator ikc(options.comparator);
+    std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
+        new RegularKeysStartWithAFactory(backward_mode_));
+    options.table_properties_collector_factories.resize(1);
+    options.table_properties_collector_factories[0] = collector_factory;
+
+    TestCustomizedTablePropertiesCollector(backward_mode_,
+                                           kBlockBasedTableMagicNumber,
+                                           encode_as_internal, options, ikc);
+
+#ifndef ROCKSDB_LITE  // PlainTable is not supported in Lite
+    // test plain table
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 8;
+    plain_table_options.bloom_bits_per_key = 8;
+    plain_table_options.hash_table_ratio = 0;
+
+    options.table_factory =
+        std::make_shared<PlainTableFactory>(plain_table_options);
+    TestCustomizedTablePropertiesCollector(backward_mode_,
+                                           kPlainTableMagicNumber,
+                                           encode_as_internal, options, ikc);
+#endif  // !ROCKSDB_LITE
+  }
+}
+
+namespace {
+void TestInternalKeyPropertiesCollector(
+    bool backward_mode, uint64_t magic_number, bool sanitized,
+    std::shared_ptr<TableFactory> table_factory) {
+  InternalKey keys[] = {
+      InternalKey("A       ", 0, ValueType::kTypeValue),
+      InternalKey("B       ", 1, ValueType::kTypeValue),
+      InternalKey("C       ", 2, ValueType::kTypeValue),
+      InternalKey("W       ", 3, ValueType::kTypeDeletion),
+      InternalKey("X       ", 4, ValueType::kTypeDeletion),
+      InternalKey("Y       ", 5, ValueType::kTypeDeletion),
+      InternalKey("Z       ", 6, ValueType::kTypeDeletion),
+      InternalKey("a       ", 7, ValueType::kTypeSingleDeletion),
+      InternalKey("b       ", 8, ValueType::kTypeMerge),
+      InternalKey("c       ", 9, ValueType::kTypeMerge),
+  };
+
+  std::unique_ptr<TableBuilder> builder;
+  std::unique_ptr<WritableFileWriter> writable;
+  Options options;
+  test::PlainInternalKeyComparator pikc(options.comparator);
+
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  options.table_factory = table_factory;
+  if (sanitized) {
+    options.table_properties_collector_factories.emplace_back(
+        new RegularKeysStartWithAFactory(backward_mode));
+    // with sanitization, even regular properties collector will be able to
+    // handle internal keys.
+    auto comparator = options.comparator;
+    // HACK: Set options.info_log to avoid writing log in
+    // SanitizeOptions().
+    options.info_log = std::make_shared<test::NullLogger>();
+    options = SanitizeOptions("db",  // just a place holder
+                              options);
+    ImmutableOptions ioptions(options);
+    GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+    options.comparator = comparator;
+  }
+  const ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeBuilder(options, ioptions, moptions, pikc,
+                &int_tbl_prop_collector_factories, &writable, &builder);
+    for (const auto& k : keys) {
+      builder->Add(k.Encode(), "val");
+    }
+
+    ASSERT_OK(builder->Finish());
+    ASSERT_OK(writable->Flush());
+
+    test::StringSink* fwf =
+        static_cast<test::StringSink*>(writable->writable_file());
+    std::unique_ptr<FSRandomAccessFile> source(
+        new test::StringSource(fwf->contents()));
+    std::unique_ptr<RandomAccessFileReader> reader(
+        new RandomAccessFileReader(std::move(source), "test"));
+
+    std::unique_ptr<TableProperties> props;
+    Status s = ReadTableProperties(reader.get(), fwf->contents().size(),
+                                   magic_number, ioptions, &props);
+    ASSERT_OK(s);
+
+    auto user_collected = props->user_collected_properties;
+    uint64_t deleted = GetDeletedKeys(user_collected);
+    ASSERT_EQ(5u, deleted);  // deletes + single-deletes
+
+    bool property_present;
+    uint64_t merges = GetMergeOperands(user_collected, &property_present);
+    ASSERT_TRUE(property_present);
+    ASSERT_EQ(2u, merges);
+
+    if (sanitized) {
+      uint32_t starts_with_A = 0;
+      ASSERT_NE(user_collected.find("Count"), user_collected.end());
+      Slice key(user_collected.at("Count"));
+      ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+      ASSERT_EQ(1u, starts_with_A);
+
+      if (!backward_mode) {
+        uint32_t num_puts;
+        ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+        Slice key_puts(user_collected.at("NumPuts"));
+        ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+        ASSERT_EQ(3u, num_puts);
+
+        uint32_t num_deletes;
+        ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+        Slice key_deletes(user_collected.at("NumDeletes"));
+        ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+        ASSERT_EQ(4u, num_deletes);
+
+        uint32_t num_single_deletes;
+        ASSERT_NE(user_collected.find("NumSingleDeletes"),
+                  user_collected.end());
+        Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+        ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+        ASSERT_EQ(1u, num_single_deletes);
+      }
+    }
+  }
+}
+}  // namespace
+
+TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) {
+  TestInternalKeyPropertiesCollector(
+      backward_mode_, kBlockBasedTableMagicNumber, true /* sanitize */,
+      std::make_shared<BlockBasedTableFactory>());
+  if (backward_mode_) {
+    TestInternalKeyPropertiesCollector(
+        backward_mode_, kBlockBasedTableMagicNumber, false /* not sanitize */,
+        std::make_shared<BlockBasedTableFactory>());
+  }
+
+#ifndef ROCKSDB_LITE  // PlainTable is not supported in Lite
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 8;
+  plain_table_options.bloom_bits_per_key = 8;
+  plain_table_options.hash_table_ratio = 0;
+
+  TestInternalKeyPropertiesCollector(
+      backward_mode_, kPlainTableMagicNumber, false /* not sanitize */,
+      std::make_shared<PlainTableFactory>(plain_table_options));
+#endif  // !ROCKSDB_LITE
+}
+
+INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest,
+                        ::testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(CustomizedTablePropertiesCollector, TablePropertiesTest,
+                        ::testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/transaction_log_impl.cc b/src/rocksdb/db/transaction_log_impl.cc
new file mode 100644
index 000000000..3878b428a
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.cc
@@ -0,0 +1,298 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/transaction_log_impl.h"
+
+#include <cinttypes>
+
+#include "db/write_batch_internal.h"
+#include "file/sequence_file_reader.h"
+#include "util/defer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+    const std::string& dir, const ImmutableDBOptions* options,
+    const TransactionLogIterator::ReadOptions& read_options,
+    const EnvOptions& soptions, const SequenceNumber seq,
+    std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+    const bool seq_per_batch, const std::shared_ptr<IOTracer>& io_tracer)
+    : dir_(dir),
+      options_(options),
+      read_options_(read_options),
+      soptions_(soptions),
+      starting_sequence_number_(seq),
+      files_(std::move(files)),
+      versions_(versions),
+      seq_per_batch_(seq_per_batch),
+      io_tracer_(io_tracer),
+      started_(false),
+      is_valid_(false),
+      current_file_index_(0),
+      current_batch_seq_(0),
+      current_last_seq_(0) {
+  assert(files_ != nullptr);
+  assert(versions_ != nullptr);
+  assert(!seq_per_batch_);
+  current_status_.PermitUncheckedError();  // Clear on start
+  reporter_.env = options_->env;
+  reporter_.info_log = options_->info_log.get();
+  SeekToStartSequence();  // Seek till starting sequence
+}
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+    const LogFile* log_file,
+    std::unique_ptr<SequentialFileReader>* file_reader) {
+  FileSystemPtr fs(options_->fs, io_tracer_);
+  std::unique_ptr<FSSequentialFile> file;
+  std::string fname;
+  Status s;
+  EnvOptions optimized_env_options = fs->OptimizeForLogRead(soptions_);
+  if (log_file->Type() == kArchivedLogFile) {
+    fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+    s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+  } else {
+    fname = LogFileName(dir_, log_file->LogNumber());
+    s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+    if (!s.ok()) {
+      //  If cannot open file in DB directory.
+      //  Try the archive dir, as it could have moved in the meanwhile.
+      fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+      s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+    }
+  }
+  if (s.ok()) {
+    file_reader->reset(new SequentialFileReader(std::move(file), fname,
+                                                io_tracer_, options_->listeners,
+                                                options_->rate_limiter.get()));
+  }
+  return s;
+}
+
+BatchResult TransactionLogIteratorImpl::GetBatch() {
+  assert(is_valid_);  //  cannot call in a non valid state.
+  BatchResult result;
+  result.sequence = current_batch_seq_;
+  result.writeBatchPtr = std::move(current_batch_);
+  return result;
+}
+
+Status TransactionLogIteratorImpl::status() { return current_status_; }
+
+bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; }
+
+bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) {
+  // Don't read if no more complete entries to read from logs
+  if (current_last_seq_ >= versions_->LastSequence()) {
+    return false;
+  }
+  return current_log_reader_->ReadRecord(record, &scratch_);
+}
+
+void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index,
+                                                     bool strict) {
+  Slice record;
+  started_ = false;
+  is_valid_ = false;
+  // Check invariant of TransactionLogIterator when SeekToStartSequence()
+  // succeeds.
+  const Defer defer([this]() {
+    if (is_valid_) {
+      assert(current_status_.ok());
+      if (starting_sequence_number_ > current_batch_seq_) {
+        assert(current_batch_seq_ < current_last_seq_);
+        assert(current_last_seq_ >= starting_sequence_number_);
+      }
+    }
+  });
+  if (files_->size() <= start_file_index) {
+    return;
+  } else if (!current_status_.ok()) {
+    return;
+  }
+  Status s =
+      OpenLogReader(files_->at(static_cast<size_t>(start_file_index)).get());
+  if (!s.ok()) {
+    current_status_ = s;
+    reporter_.Info(current_status_.ToString().c_str());
+    return;
+  }
+  while (RestrictedRead(&record)) {
+    if (record.size() < WriteBatchInternal::kHeader) {
+      reporter_.Corruption(record.size(),
+                           Status::Corruption("very small log record"));
+      continue;
+    }
+    UpdateCurrentWriteBatch(record);
+    if (current_last_seq_ >= starting_sequence_number_) {
+      if (strict && current_batch_seq_ != starting_sequence_number_) {
+        current_status_ = Status::Corruption(
+            "Gap in sequence number. Could not "
+            "seek to required sequence number");
+        reporter_.Info(current_status_.ToString().c_str());
+        return;
+      } else if (strict) {
+        reporter_.Info(
+            "Could seek required sequence number. Iterator will "
+            "continue.");
+      }
+      is_valid_ = true;
+      started_ = true;  // set started_ as we could seek till starting sequence
+      return;
+    } else {
+      is_valid_ = false;
+    }
+  }
+
+  // Could not find start sequence in first file. Normally this must be the
+  // only file. Otherwise log the error and let the iterator return next entry
+  // If strict is set, we want to seek exactly till the start sequence and it
+  // should have been present in the file we scanned above
+  if (strict) {
+    current_status_ = Status::Corruption(
+        "Gap in sequence number. Could not "
+        "seek to required sequence number");
+    reporter_.Info(current_status_.ToString().c_str());
+  } else if (files_->size() != 1) {
+    current_status_ = Status::Corruption(
+        "Start sequence was not found, "
+        "skipping to the next available");
+    reporter_.Info(current_status_.ToString().c_str());
+    // Let NextImpl find the next available entry. started_ remains false
+    // because we don't want to check for gaps while moving to start sequence
+    NextImpl(true);
+  }
+}
+
+void TransactionLogIteratorImpl::Next() {
+  if (!current_status_.ok()) {
+    return;
+  }
+  return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+  Slice record;
+  is_valid_ = false;
+  if (!internal && !started_) {
+    // Runs every time until we can seek to the start sequence
+    SeekToStartSequence();
+  }
+  while (true) {
+    assert(current_log_reader_);
+    if (current_log_reader_->IsEOF()) {
+      current_log_reader_->UnmarkEOF();
+    }
+    while (RestrictedRead(&record)) {
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reporter_.Corruption(record.size(),
+                             Status::Corruption("very small log record"));
+        continue;
+      } else {
+        // started_ should be true if called by application
+        assert(internal || started_);
+        // started_ should be false if called internally
+        assert(!internal || !started_);
+        UpdateCurrentWriteBatch(record);
+        if (internal && !started_) {
+          started_ = true;
+        }
+        return;
+      }
+    }
+
+    // Open the next file
+    if (current_file_index_ < files_->size() - 1) {
+      ++current_file_index_;
+      Status s = OpenLogReader(files_->at(current_file_index_).get());
+      if (!s.ok()) {
+        is_valid_ = false;
+        current_status_ = s;
+        return;
+      }
+    } else {
+      is_valid_ = false;
+      if (current_last_seq_ == versions_->LastSequence()) {
+        current_status_ = Status::OK();
+      } else {
+        const char* msg = "Create a new iterator to fetch the new tail.";
+        current_status_ = Status::TryAgain(msg);
+      }
+      return;
+    }
+  }
+}
+
+bool TransactionLogIteratorImpl::IsBatchExpected(
+    const WriteBatch* batch, const SequenceNumber expected_seq) {
+  assert(batch);
+  SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+  if (batchSeq != expected_seq) {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "Discontinuity in log records. Got seq=%" PRIu64
+             ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64
+             ".Log iterator will reseek the correct batch.",
+             batchSeq, expected_seq, versions_->LastSequence());
+    reporter_.Info(buf);
+    return false;
+  }
+  return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+  std::unique_ptr<WriteBatch> batch(new WriteBatch());
+  Status s = WriteBatchInternal::SetContents(batch.get(), record);
+  s.PermitUncheckedError();  // TODO: What should we do with this error?
+
+  SequenceNumber expected_seq = current_last_seq_ + 1;
+  // If the iterator has started, then confirm that we get continuous batches
+  if (started_ && !IsBatchExpected(batch.get(), expected_seq)) {
+    // Seek to the batch having expected sequence number
+    if (expected_seq < files_->at(current_file_index_)->StartSequence()) {
+      // Expected batch must lie in the previous log file
+      // Avoid underflow.
+      if (current_file_index_ != 0) {
+        current_file_index_--;
+      }
+    }
+    starting_sequence_number_ = expected_seq;
+    // currentStatus_ will be set to Ok if reseek succeeds
+    // Note: this is still ok in seq_pre_batch_ && two_write_queuesp_ mode
+    // that allows gaps in the WAL since it will still skip over the gap.
+    current_status_ = Status::NotFound("Gap in sequence numbers");
+    // In seq_per_batch_ mode, gaps in the seq are possible so the strict mode
+    // should be disabled
+    return SeekToStartSequence(current_file_index_, !seq_per_batch_);
+  }
+
+  current_batch_seq_ = WriteBatchInternal::Sequence(batch.get());
+  assert(!seq_per_batch_);
+  current_last_seq_ =
+      current_batch_seq_ + WriteBatchInternal::Count(batch.get()) - 1;
+  // currentBatchSeq_ can only change here
+  assert(current_last_seq_ <= versions_->LastSequence());
+
+  current_batch_ = std::move(batch);
+  is_valid_ = true;
+  current_status_ = Status::OK();
+}
+
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) {
+  std::unique_ptr<SequentialFileReader> file;
+  Status s = OpenLogFile(log_file, &file);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(file);
+  current_log_reader_.reset(
+      new log::Reader(options_->info_log, std::move(file), &reporter_,
+                      read_options_.verify_checksums_, log_file->LogNumber()));
+  return Status::OK();
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/transaction_log_impl.h b/src/rocksdb/db/transaction_log_impl.h
new file mode 100644
index 000000000..e8c6efc02
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.h
@@ -0,0 +1,130 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LogFileImpl : public LogFile {
+ public:
+  LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+              uint64_t sizeBytes)
+      : logNumber_(logNum),
+        type_(logType),
+        startSequence_(startSeq),
+        sizeFileBytes_(sizeBytes) {}
+
+  std::string PathName() const override {
+    if (type_ == kArchivedLogFile) {
+      return ArchivedLogFileName("", logNumber_);
+    }
+    return LogFileName("", logNumber_);
+  }
+
+  uint64_t LogNumber() const override { return logNumber_; }
+
+  WalFileType Type() const override { return type_; }
+
+  SequenceNumber StartSequence() const override { return startSequence_; }
+
+  uint64_t SizeFileBytes() const override { return sizeFileBytes_; }
+
+  bool operator<(const LogFile& that) const {
+    return LogNumber() < that.LogNumber();
+  }
+
+ private:
+  uint64_t logNumber_;
+  WalFileType type_;
+  SequenceNumber startSequence_;
+  uint64_t sizeFileBytes_;
+};
+
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+  TransactionLogIteratorImpl(
+      const std::string& dir, const ImmutableDBOptions* options,
+      const TransactionLogIterator::ReadOptions& read_options,
+      const EnvOptions& soptions, const SequenceNumber seqNum,
+      std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+      const bool seq_per_batch, const std::shared_ptr<IOTracer>& io_tracer);
+
+  virtual bool Valid() override;
+
+  virtual void Next() override;
+
+  virtual Status status() override;
+
+  virtual BatchResult GetBatch() override;
+
+ private:
+  const std::string& dir_;
+  const ImmutableDBOptions* options_;
+  const TransactionLogIterator::ReadOptions read_options_;
+  const EnvOptions& soptions_;
+  SequenceNumber starting_sequence_number_;
+  std::unique_ptr<VectorLogPtr> files_;
+  // Used only to get latest seq. num
+  // TODO(icanadi) can this be just a callback?
+  VersionSet const* const versions_;
+  const bool seq_per_batch_;
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  // State variables
+  bool started_;
+  bool is_valid_;  // not valid when it starts of.
+  Status current_status_;
+  size_t current_file_index_;
+  std::unique_ptr<WriteBatch> current_batch_;
+  std::unique_ptr<log::Reader> current_log_reader_;
+  std::string scratch_;
+  Status OpenLogFile(const LogFile* log_file,
+                     std::unique_ptr<SequentialFileReader>* file);
+
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    virtual void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_ERROR(info_log, "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes,
+                      s.ToString().c_str());
+    }
+    virtual void Info(const char* s) { ROCKS_LOG_INFO(info_log, "%s", s); }
+  } reporter_;
+
+  SequenceNumber
+      current_batch_seq_;  // sequence number at start of current batch
+  SequenceNumber current_last_seq_;  // last sequence in the current batch
+  // Reads from transaction log only if the writebatch record has been written
+  bool RestrictedRead(Slice* record);
+  // Seeks to starting_sequence_number_ reading from start_file_index in files_.
+  // If strict is set, then must get a batch starting with
+  // starting_sequence_number_.
+  void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false);
+  // Implementation of Next. SeekToStartSequence calls it internally with
+  // internal=true to let it find next entry even if it has to jump gaps because
+  // the iterator may start off from the first available entry but promises to
+  // be continuous after that
+  void NextImpl(bool internal = false);
+  // Check if batch is expected, else return false
+  bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expected_seq);
+  // Update current batch if a continuous batch is found.
+  void UpdateCurrentWriteBatch(const Slice& record);
+  Status OpenLogReader(const LogFile* file);
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/trim_history_scheduler.cc b/src/rocksdb/db/trim_history_scheduler.cc
new file mode 100644
index 000000000..d7ca0899f
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.cc
@@ -0,0 +1,54 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/trim_history_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+  std::lock_guard<std::mutex> lock(checking_mutex_);
+  cfd->Ref();
+  cfds_.push_back(cfd);
+  is_empty_.store(false, std::memory_order_relaxed);
+}
+
+ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() {
+  std::lock_guard<std::mutex> lock(checking_mutex_);
+  while (true) {
+    if (cfds_.empty()) {
+      return nullptr;
+    }
+    ColumnFamilyData* cfd = cfds_.back();
+    cfds_.pop_back();
+    if (cfds_.empty()) {
+      is_empty_.store(true, std::memory_order_relaxed);
+    }
+
+    if (!cfd->IsDropped()) {
+      // success
+      return cfd;
+    }
+    cfd->UnrefAndTryDelete();
+  }
+}
+
+bool TrimHistoryScheduler::Empty() {
+  bool is_empty = is_empty_.load(std::memory_order_relaxed);
+  return is_empty;
+}
+
+void TrimHistoryScheduler::Clear() {
+  ColumnFamilyData* cfd;
+  while ((cfd = TakeNextColumnFamily()) != nullptr) {
+    cfd->UnrefAndTryDelete();
+  }
+  assert(Empty());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/trim_history_scheduler.h b/src/rocksdb/db/trim_history_scheduler.h
new file mode 100644
index 000000000..252802a7a
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.h
@@ -0,0 +1,46 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <mutex>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps
+// track of column families whose flushed immutable memtables may need to be
+// removed (aka trimmed). The actual trimming may be slightly delayed. Due to
+// the use of the mutex and atomic variable, ScheduleWork,
+// TakeNextColumnFamily, and, Empty can be called concurrently.
+class TrimHistoryScheduler {
+ public:
+  TrimHistoryScheduler() : is_empty_(true) {}
+
+  // When a column family needs history trimming, add cfd to the FIFO queue
+  void ScheduleWork(ColumnFamilyData* cfd);
+
+  // Remove the column family from the queue, the caller is responsible for
+  // calling `MemtableList::TrimHistory`
+  ColumnFamilyData* TakeNextColumnFamily();
+
+  bool Empty();
+
+  void Clear();
+
+  // Not on critical path, use mutex to ensure thread safety
+ private:
+  std::atomic<bool> is_empty_;
+  autovector<ColumnFamilyData*> cfds_;
+  std::mutex checking_mutex_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder.cc b/src/rocksdb/db/version_builder.cc
new file mode 100644
index 000000000..2c65dcf71
--- /dev/null
+++ b/src/rocksdb/db/version_builder.cc
@@ -0,0 +1,1372 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_builder.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <map>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_reservation_manager.h"
+#include "db/blob/blob_file_meta.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "table/table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilder::Rep {
+  class NewestFirstBySeqNo {
+   public:
+    bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
+      assert(lhs);
+      assert(rhs);
+
+      if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
+        return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
+      }
+
+      if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
+        return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
+      }
+
+      // Break ties by file number
+      return lhs->fd.GetNumber() > rhs->fd.GetNumber();
+    }
+  };
+
+  class BySmallestKey {
+   public:
+    explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {}
+
+    bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
+      assert(lhs);
+      assert(rhs);
+      assert(cmp_);
+
+      const int r = cmp_->Compare(lhs->smallest, rhs->smallest);
+      if (r != 0) {
+        return (r < 0);
+      }
+
+      // Break ties by file number
+      return (lhs->fd.GetNumber() < rhs->fd.GetNumber());
+    }
+
+   private:
+    const InternalKeyComparator* cmp_;
+  };
+
+  struct LevelState {
+    std::unordered_set<uint64_t> deleted_files;
+    // Map from file number to file meta data.
+    std::unordered_map<uint64_t, FileMetaData*> added_files;
+  };
+
+  // A class that represents the accumulated changes (like additional garbage or
+  // newly linked/unlinked SST files) for a given blob file after applying a
+  // series of VersionEdits.
+  class BlobFileMetaDataDelta {
+   public:
+    bool IsEmpty() const {
+      return !additional_garbage_count_ && !additional_garbage_bytes_ &&
+             newly_linked_ssts_.empty() && newly_unlinked_ssts_.empty();
+    }
+
+    uint64_t GetAdditionalGarbageCount() const {
+      return additional_garbage_count_;
+    }
+
+    uint64_t GetAdditionalGarbageBytes() const {
+      return additional_garbage_bytes_;
+    }
+
+    const std::unordered_set<uint64_t>& GetNewlyLinkedSsts() const {
+      return newly_linked_ssts_;
+    }
+
+    const std::unordered_set<uint64_t>& GetNewlyUnlinkedSsts() const {
+      return newly_unlinked_ssts_;
+    }
+
+    void AddGarbage(uint64_t count, uint64_t bytes) {
+      additional_garbage_count_ += count;
+      additional_garbage_bytes_ += bytes;
+    }
+
+    void LinkSst(uint64_t sst_file_number) {
+      assert(newly_linked_ssts_.find(sst_file_number) ==
+             newly_linked_ssts_.end());
+
+      // Reconcile with newly unlinked SSTs on the fly. (Note: an SST can be
+      // linked to and unlinked from the same blob file in the case of a trivial
+      // move.)
+      auto it = newly_unlinked_ssts_.find(sst_file_number);
+
+      if (it != newly_unlinked_ssts_.end()) {
+        newly_unlinked_ssts_.erase(it);
+      } else {
+        newly_linked_ssts_.emplace(sst_file_number);
+      }
+    }
+
+    void UnlinkSst(uint64_t sst_file_number) {
+      assert(newly_unlinked_ssts_.find(sst_file_number) ==
+             newly_unlinked_ssts_.end());
+
+      // Reconcile with newly linked SSTs on the fly. (Note: an SST can be
+      // linked to and unlinked from the same blob file in the case of a trivial
+      // move.)
+      auto it = newly_linked_ssts_.find(sst_file_number);
+
+      if (it != newly_linked_ssts_.end()) {
+        newly_linked_ssts_.erase(it);
+      } else {
+        newly_unlinked_ssts_.emplace(sst_file_number);
+      }
+    }
+
+   private:
+    uint64_t additional_garbage_count_ = 0;
+    uint64_t additional_garbage_bytes_ = 0;
+    std::unordered_set<uint64_t> newly_linked_ssts_;
+    std::unordered_set<uint64_t> newly_unlinked_ssts_;
+  };
+
+  // A class that represents the state of a blob file after applying a series of
+  // VersionEdits. In addition to the resulting state, it also contains the
+  // delta (see BlobFileMetaDataDelta above). The resulting state can be used to
+  // identify obsolete blob files, while the delta makes it possible to
+  // efficiently detect trivial moves.
+  class MutableBlobFileMetaData {
+   public:
+    // To be used for brand new blob files
+    explicit MutableBlobFileMetaData(
+        std::shared_ptr<SharedBlobFileMetaData>&& shared_meta)
+        : shared_meta_(std::move(shared_meta)) {}
+
+    // To be used for pre-existing blob files
+    explicit MutableBlobFileMetaData(
+        const std::shared_ptr<BlobFileMetaData>& meta)
+        : shared_meta_(meta->GetSharedMeta()),
+          linked_ssts_(meta->GetLinkedSsts()),
+          garbage_blob_count_(meta->GetGarbageBlobCount()),
+          garbage_blob_bytes_(meta->GetGarbageBlobBytes()) {}
+
+    const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const {
+      return shared_meta_;
+    }
+
+    uint64_t GetBlobFileNumber() const {
+      assert(shared_meta_);
+      return shared_meta_->GetBlobFileNumber();
+    }
+
+    bool HasDelta() const { return !delta_.IsEmpty(); }
+
+    const std::unordered_set<uint64_t>& GetLinkedSsts() const {
+      return linked_ssts_;
+    }
+
+    uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+
+    uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+    bool AddGarbage(uint64_t count, uint64_t bytes) {
+      assert(shared_meta_);
+
+      if (garbage_blob_count_ + count > shared_meta_->GetTotalBlobCount() ||
+          garbage_blob_bytes_ + bytes > shared_meta_->GetTotalBlobBytes()) {
+        return false;
+      }
+
+      delta_.AddGarbage(count, bytes);
+
+      garbage_blob_count_ += count;
+      garbage_blob_bytes_ += bytes;
+
+      return true;
+    }
+
+    void LinkSst(uint64_t sst_file_number) {
+      delta_.LinkSst(sst_file_number);
+
+      assert(linked_ssts_.find(sst_file_number) == linked_ssts_.end());
+      linked_ssts_.emplace(sst_file_number);
+    }
+
+    void UnlinkSst(uint64_t sst_file_number) {
+      delta_.UnlinkSst(sst_file_number);
+
+      assert(linked_ssts_.find(sst_file_number) != linked_ssts_.end());
+      linked_ssts_.erase(sst_file_number);
+    }
+
+   private:
+    std::shared_ptr<SharedBlobFileMetaData> shared_meta_;
+    // Accumulated changes
+    BlobFileMetaDataDelta delta_;
+    // Resulting state after applying the changes
+    BlobFileMetaData::LinkedSsts linked_ssts_;
+    uint64_t garbage_blob_count_ = 0;
+    uint64_t garbage_blob_bytes_ = 0;
+  };
+
+  const FileOptions& file_options_;
+  const ImmutableCFOptions* const ioptions_;
+  TableCache* table_cache_;
+  VersionStorageInfo* base_vstorage_;
+  VersionSet* version_set_;
+  int num_levels_;
+  LevelState* levels_;
+  // Store sizes of levels larger than num_levels_. We do this instead of
+  // storing them in levels_ to avoid regression in case there are no files
+  // on invalid levels. The version is not consistent if in the end the files
+  // on invalid levels don't cancel out.
+  std::unordered_map<int, size_t> invalid_level_sizes_;
+  // Whether there are invalid new files or invalid deletion on levels larger
+  // than num_levels_.
+  bool has_invalid_levels_;
+  // Current levels of table files affected by additions/deletions.
+  std::unordered_map<uint64_t, int> table_file_levels_;
+  // Current compact cursors that should be changed after the last compaction
+  std::unordered_map<int, InternalKey> updated_compact_cursors_;
+  NewestFirstBySeqNo level_zero_cmp_;
+  BySmallestKey level_nonzero_cmp_;
+
+  // Mutable metadata objects for all blob files affected by the series of
+  // version edits.
+  std::map<uint64_t, MutableBlobFileMetaData> mutable_blob_file_metas_;
+
+  std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
+
+ public:
+  Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions,
+      TableCache* table_cache, VersionStorageInfo* base_vstorage,
+      VersionSet* version_set,
+      std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
+      : file_options_(file_options),
+        ioptions_(ioptions),
+        table_cache_(table_cache),
+        base_vstorage_(base_vstorage),
+        version_set_(version_set),
+        num_levels_(base_vstorage->num_levels()),
+        has_invalid_levels_(false),
+        level_nonzero_cmp_(base_vstorage_->InternalComparator()),
+        file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) {
+    assert(ioptions_);
+
+    levels_ = new LevelState[num_levels_];
+  }
+
+  ~Rep() {
+    for (int level = 0; level < num_levels_; level++) {
+      const auto& added = levels_[level].added_files;
+      for (auto& pair : added) {
+        UnrefFile(pair.second);
+      }
+    }
+
+    delete[] levels_;
+  }
+
+  void UnrefFile(FileMetaData* f) {
+    f->refs--;
+    if (f->refs <= 0) {
+      if (f->table_reader_handle) {
+        assert(table_cache_ != nullptr);
+        table_cache_->ReleaseHandle(f->table_reader_handle);
+        f->table_reader_handle = nullptr;
+      }
+
+      if (file_metadata_cache_res_mgr_) {
+        Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
+            f->ApproximateMemoryUsage(), false /* increase */);
+        s.PermitUncheckedError();
+      }
+      delete f;
+    }
+  }
+
+  // Mapping used for checking the consistency of links between SST files and
+  // blob files. It is built using the forward links (table file -> blob file),
+  // and is subsequently compared with the inverse mapping stored in the
+  // BlobFileMetaData objects.
+  using ExpectedLinkedSsts =
+      std::unordered_map<uint64_t, BlobFileMetaData::LinkedSsts>;
+
+  static void UpdateExpectedLinkedSsts(
+      uint64_t table_file_number, uint64_t blob_file_number,
+      ExpectedLinkedSsts* expected_linked_ssts) {
+    assert(expected_linked_ssts);
+
+    if (blob_file_number == kInvalidBlobFileNumber) {
+      return;
+    }
+
+    (*expected_linked_ssts)[blob_file_number].emplace(table_file_number);
+  }
+
+  template <typename Checker>
+  Status CheckConsistencyDetailsForLevel(
+      const VersionStorageInfo* vstorage, int level, Checker checker,
+      const std::string& sync_point,
+      ExpectedLinkedSsts* expected_linked_ssts) const {
+#ifdef NDEBUG
+    (void)sync_point;
+#endif
+
+    assert(vstorage);
+    assert(level >= 0 && level < num_levels_);
+    assert(expected_linked_ssts);
+
+    const auto& level_files = vstorage->LevelFiles(level);
+
+    if (level_files.empty()) {
+      return Status::OK();
+    }
+
+    assert(level_files[0]);
+    UpdateExpectedLinkedSsts(level_files[0]->fd.GetNumber(),
+                             level_files[0]->oldest_blob_file_number,
+                             expected_linked_ssts);
+
+    for (size_t i = 1; i < level_files.size(); ++i) {
+      assert(level_files[i]);
+      UpdateExpectedLinkedSsts(level_files[i]->fd.GetNumber(),
+                               level_files[i]->oldest_blob_file_number,
+                               expected_linked_ssts);
+
+      auto lhs = level_files[i - 1];
+      auto rhs = level_files[i];
+
+#ifndef NDEBUG
+      auto pair = std::make_pair(&lhs, &rhs);
+      TEST_SYNC_POINT_CALLBACK(sync_point, &pair);
+#endif
+
+      const Status s = checker(lhs, rhs);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // Make sure table files are sorted correctly and that the links between
+  // table files and blob files are consistent.
+  Status CheckConsistencyDetails(const VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    ExpectedLinkedSsts expected_linked_ssts;
+
+    if (num_levels_ > 0) {
+      // Check L0
+      {
+        auto l0_checker = [this](const FileMetaData* lhs,
+                                 const FileMetaData* rhs) {
+          assert(lhs);
+          assert(rhs);
+
+          if (!level_zero_cmp_(lhs, rhs)) {
+            std::ostringstream oss;
+            oss << "L0 files are not sorted properly: files #"
+                << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
+
+            return Status::Corruption("VersionBuilder", oss.str());
+          }
+
+          if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) {
+            // This is an external file that we ingested
+            const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno;
+
+            if (!(external_file_seqno < lhs->fd.largest_seqno ||
+                  external_file_seqno == 0)) {
+              std::ostringstream oss;
+              oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
+                  << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
+                  << " vs. file #" << rhs->fd.GetNumber()
+                  << " with global_seqno " << external_file_seqno;
+
+              return Status::Corruption("VersionBuilder", oss.str());
+            }
+          } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) {
+            std::ostringstream oss;
+            oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
+                << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
+                << " vs. file #" << rhs->fd.GetNumber() << " with seqno "
+                << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno;
+
+            return Status::Corruption("VersionBuilder", oss.str());
+          }
+
+          return Status::OK();
+        };
+
+        const Status s = CheckConsistencyDetailsForLevel(
+            vstorage, /* level */ 0, l0_checker,
+            "VersionBuilder::CheckConsistency0", &expected_linked_ssts);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+
+      // Check L1 and up
+      const InternalKeyComparator* const icmp = vstorage->InternalComparator();
+      assert(icmp);
+
+      for (int level = 1; level < num_levels_; ++level) {
+        auto checker = [this, level, icmp](const FileMetaData* lhs,
+                                           const FileMetaData* rhs) {
+          assert(lhs);
+          assert(rhs);
+
+          if (!level_nonzero_cmp_(lhs, rhs)) {
+            std::ostringstream oss;
+            oss << 'L' << level << " files are not sorted properly: files #"
+                << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
+
+            return Status::Corruption("VersionBuilder", oss.str());
+          }
+
+          // Make sure there is no overlap in level
+          if (icmp->Compare(lhs->largest, rhs->smallest) >= 0) {
+            std::ostringstream oss;
+            oss << 'L' << level << " has overlapping ranges: file #"
+                << lhs->fd.GetNumber()
+                << " largest key: " << lhs->largest.DebugString(true)
+                << " vs. file #" << rhs->fd.GetNumber()
+                << " smallest key: " << rhs->smallest.DebugString(true);
+
+            return Status::Corruption("VersionBuilder", oss.str());
+          }
+
+          return Status::OK();
+        };
+
+        const Status s = CheckConsistencyDetailsForLevel(
+            vstorage, level, checker, "VersionBuilder::CheckConsistency1",
+            &expected_linked_ssts);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+    }
+
+    // Make sure that all blob files in the version have non-garbage data and
+    // the links between them and the table files are consistent.
+    const auto& blob_files = vstorage->GetBlobFiles();
+    for (const auto& blob_file_meta : blob_files) {
+      assert(blob_file_meta);
+
+      const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber();
+
+      if (blob_file_meta->GetGarbageBlobCount() >=
+          blob_file_meta->GetTotalBlobCount()) {
+        std::ostringstream oss;
+        oss << "Blob file #" << blob_file_number
+            << " consists entirely of garbage";
+
+        return Status::Corruption("VersionBuilder", oss.str());
+      }
+
+      if (blob_file_meta->GetLinkedSsts() !=
+          expected_linked_ssts[blob_file_number]) {
+        std::ostringstream oss;
+        oss << "Links are inconsistent between table files and blob file #"
+            << blob_file_number;
+
+        return Status::Corruption("VersionBuilder", oss.str());
+      }
+    }
+
+    Status ret_s;
+    TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistencyBeforeReturn",
+                             &ret_s);
+    return ret_s;
+  }
+
+  Status CheckConsistency(const VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    // Always run consistency checks in debug build
+#ifdef NDEBUG
+    if (!vstorage->force_consistency_checks()) {
+      return Status::OK();
+    }
+#endif
+    Status s = CheckConsistencyDetails(vstorage);
+    if (s.IsCorruption() && s.getState()) {
+      // Make it clear the error is due to force_consistency_checks = 1 or
+      // debug build
+#ifdef NDEBUG
+      auto prefix = "force_consistency_checks";
+#else
+      auto prefix = "force_consistency_checks(DEBUG)";
+#endif
+      s = Status::Corruption(prefix, s.getState());
+    } else {
+      // was only expecting corruption with message, or OK
+      assert(s.ok());
+    }
+    return s;
+  }
+
+  bool CheckConsistencyForNumLevels() const {
+    // Make sure there are no files on or beyond num_levels().
+    if (has_invalid_levels_) {
+      return false;
+    }
+
+    for (const auto& pair : invalid_level_sizes_) {
+      const size_t level_size = pair.second;
+      if (level_size != 0) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  bool IsBlobFileInVersion(uint64_t blob_file_number) const {
+    auto mutable_it = mutable_blob_file_metas_.find(blob_file_number);
+    if (mutable_it != mutable_blob_file_metas_.end()) {
+      return true;
+    }
+
+    assert(base_vstorage_);
+    const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number);
+
+    return !!meta;
+  }
+
+  MutableBlobFileMetaData* GetOrCreateMutableBlobFileMetaData(
+      uint64_t blob_file_number) {
+    auto mutable_it = mutable_blob_file_metas_.find(blob_file_number);
+    if (mutable_it != mutable_blob_file_metas_.end()) {
+      return &mutable_it->second;
+    }
+
+    assert(base_vstorage_);
+    const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number);
+
+    if (meta) {
+      mutable_it = mutable_blob_file_metas_
+                       .emplace(blob_file_number, MutableBlobFileMetaData(meta))
+                       .first;
+      return &mutable_it->second;
+    }
+
+    return nullptr;
+  }
+
+  Status ApplyBlobFileAddition(const BlobFileAddition& blob_file_addition) {
+    const uint64_t blob_file_number = blob_file_addition.GetBlobFileNumber();
+
+    if (IsBlobFileInVersion(blob_file_number)) {
+      std::ostringstream oss;
+      oss << "Blob file #" << blob_file_number << " already added";
+
+      return Status::Corruption("VersionBuilder", oss.str());
+    }
+
+    // Note: we use C++11 for now but in C++14, this could be done in a more
+    // elegant way using generalized lambda capture.
+    VersionSet* const vs = version_set_;
+    const ImmutableCFOptions* const ioptions = ioptions_;
+
+    auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) {
+      if (vs) {
+        assert(ioptions);
+        assert(!ioptions->cf_paths.empty());
+        assert(shared_meta);
+
+        vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(),
+                                ioptions->cf_paths.front().path);
+      }
+
+      delete shared_meta;
+    };
+
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, blob_file_addition.GetTotalBlobCount(),
+        blob_file_addition.GetTotalBlobBytes(),
+        blob_file_addition.GetChecksumMethod(),
+        blob_file_addition.GetChecksumValue(), deleter);
+
+    mutable_blob_file_metas_.emplace(
+        blob_file_number, MutableBlobFileMetaData(std::move(shared_meta)));
+
+    return Status::OK();
+  }
+
+  Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) {
+    const uint64_t blob_file_number = blob_file_garbage.GetBlobFileNumber();
+
+    MutableBlobFileMetaData* const mutable_meta =
+        GetOrCreateMutableBlobFileMetaData(blob_file_number);
+
+    if (!mutable_meta) {
+      std::ostringstream oss;
+      oss << "Blob file #" << blob_file_number << " not found";
+
+      return Status::Corruption("VersionBuilder", oss.str());
+    }
+
+    if (!mutable_meta->AddGarbage(blob_file_garbage.GetGarbageBlobCount(),
+                                  blob_file_garbage.GetGarbageBlobBytes())) {
+      std::ostringstream oss;
+      oss << "Garbage overflow for blob file #" << blob_file_number;
+      return Status::Corruption("VersionBuilder", oss.str());
+    }
+
+    return Status::OK();
+  }
+
+  int GetCurrentLevelForTableFile(uint64_t file_number) const {
+    auto it = table_file_levels_.find(file_number);
+    if (it != table_file_levels_.end()) {
+      return it->second;
+    }
+
+    assert(base_vstorage_);
+    return base_vstorage_->GetFileLocation(file_number).GetLevel();
+  }
+
+  uint64_t GetOldestBlobFileNumberForTableFile(int level,
+                                               uint64_t file_number) const {
+    assert(level < num_levels_);
+
+    const auto& added_files = levels_[level].added_files;
+
+    auto it = added_files.find(file_number);
+    if (it != added_files.end()) {
+      const FileMetaData* const meta = it->second;
+      assert(meta);
+
+      return meta->oldest_blob_file_number;
+    }
+
+    assert(base_vstorage_);
+    const FileMetaData* const meta =
+        base_vstorage_->GetFileMetaDataByNumber(file_number);
+    assert(meta);
+
+    return meta->oldest_blob_file_number;
+  }
+
+  Status ApplyFileDeletion(int level, uint64_t file_number) {
+    assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
+
+    const int current_level = GetCurrentLevelForTableFile(file_number);
+
+    if (level != current_level) {
+      if (level >= num_levels_) {
+        has_invalid_levels_ = true;
+      }
+
+      std::ostringstream oss;
+      oss << "Cannot delete table file #" << file_number << " from level "
+          << level << " since it is ";
+      if (current_level ==
+          VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
+        oss << "not in the LSM tree";
+      } else {
+        oss << "on level " << current_level;
+      }
+
+      return Status::Corruption("VersionBuilder", oss.str());
+    }
+
+    if (level >= num_levels_) {
+      assert(invalid_level_sizes_[level] > 0);
+      --invalid_level_sizes_[level];
+
+      table_file_levels_[file_number] =
+          VersionStorageInfo::FileLocation::Invalid().GetLevel();
+
+      return Status::OK();
+    }
+
+    const uint64_t blob_file_number =
+        GetOldestBlobFileNumberForTableFile(level, file_number);
+
+    if (blob_file_number != kInvalidBlobFileNumber) {
+      MutableBlobFileMetaData* const mutable_meta =
+          GetOrCreateMutableBlobFileMetaData(blob_file_number);
+      if (mutable_meta) {
+        mutable_meta->UnlinkSst(file_number);
+      }
+    }
+
+    auto& level_state = levels_[level];
+
+    auto& add_files = level_state.added_files;
+    auto add_it = add_files.find(file_number);
+    if (add_it != add_files.end()) {
+      UnrefFile(add_it->second);
+      add_files.erase(add_it);
+    }
+
+    auto& del_files = level_state.deleted_files;
+    assert(del_files.find(file_number) == del_files.end());
+    del_files.emplace(file_number);
+
+    table_file_levels_[file_number] =
+        VersionStorageInfo::FileLocation::Invalid().GetLevel();
+
+    return Status::OK();
+  }
+
+  Status ApplyFileAddition(int level, const FileMetaData& meta) {
+    assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
+
+    const uint64_t file_number = meta.fd.GetNumber();
+
+    const int current_level = GetCurrentLevelForTableFile(file_number);
+
+    if (current_level !=
+        VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
+      if (level >= num_levels_) {
+        has_invalid_levels_ = true;
+      }
+
+      std::ostringstream oss;
+      oss << "Cannot add table file #" << file_number << " to level " << level
+          << " since it is already in the LSM tree on level " << current_level;
+      return Status::Corruption("VersionBuilder", oss.str());
+    }
+
+    if (level >= num_levels_) {
+      ++invalid_level_sizes_[level];
+      table_file_levels_[file_number] = level;
+
+      return Status::OK();
+    }
+
+    auto& level_state = levels_[level];
+
+    auto& del_files = level_state.deleted_files;
+    auto del_it = del_files.find(file_number);
+    if (del_it != del_files.end()) {
+      del_files.erase(del_it);
+    }
+
+    FileMetaData* const f = new FileMetaData(meta);
+    f->refs = 1;
+
+    if (file_metadata_cache_res_mgr_) {
+      Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
+          f->ApproximateMemoryUsage(), true /* increase */);
+      if (!s.ok()) {
+        delete f;
+        s = Status::MemoryLimit(
+            "Can't allocate " +
+            kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+                CacheEntryRole::kFileMetadata)] +
+            " due to exceeding the memory limit "
+            "based on "
+            "cache capacity");
+        return s;
+      }
+    }
+
+    auto& add_files = level_state.added_files;
+    assert(add_files.find(file_number) == add_files.end());
+    add_files.emplace(file_number, f);
+
+    const uint64_t blob_file_number = f->oldest_blob_file_number;
+
+    if (blob_file_number != kInvalidBlobFileNumber) {
+      MutableBlobFileMetaData* const mutable_meta =
+          GetOrCreateMutableBlobFileMetaData(blob_file_number);
+      if (mutable_meta) {
+        mutable_meta->LinkSst(file_number);
+      }
+    }
+
+    table_file_levels_[file_number] = level;
+
+    return Status::OK();
+  }
+
+  Status ApplyCompactCursors(int level,
+                             const InternalKey& smallest_uncompacted_key) {
+    if (level < 0) {
+      std::ostringstream oss;
+      oss << "Cannot add compact cursor (" << level << ","
+          << smallest_uncompacted_key.Encode().ToString()
+          << " due to invalid level (level = " << level << ")";
+      return Status::Corruption("VersionBuilder", oss.str());
+    }
+    if (level < num_levels_) {
+      // Omit levels (>= num_levels_) when re-open with shrinking num_levels_
+      updated_compact_cursors_[level] = smallest_uncompacted_key;
+    }
+    return Status::OK();
+  }
+
+  // Apply all of the edits in *edit to the current state.
+  Status Apply(const VersionEdit* edit) {
+    {
+      const Status s = CheckConsistency(base_vstorage_);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    // Note: we process the blob file related changes first because the
+    // table file addition/deletion logic depends on the blob files
+    // already being there.
+
+    // Add new blob files
+    for (const auto& blob_file_addition : edit->GetBlobFileAdditions()) {
+      const Status s = ApplyBlobFileAddition(blob_file_addition);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    // Increase the amount of garbage for blob files affected by GC
+    for (const auto& blob_file_garbage : edit->GetBlobFileGarbages()) {
+      const Status s = ApplyBlobFileGarbage(blob_file_garbage);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    // Delete table files
+    for (const auto& deleted_file : edit->GetDeletedFiles()) {
+      const int level = deleted_file.first;
+      const uint64_t file_number = deleted_file.second;
+
+      const Status s = ApplyFileDeletion(level, file_number);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    // Add new table files
+    for (const auto& new_file : edit->GetNewFiles()) {
+      const int level = new_file.first;
+      const FileMetaData& meta = new_file.second;
+
+      const Status s = ApplyFileAddition(level, meta);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    // Populate compact cursors for round-robin compaction, leave
+    // the cursor to be empty to indicate it is invalid
+    for (const auto& cursor : edit->GetCompactCursors()) {
+      const int level = cursor.first;
+      const InternalKey smallest_uncompacted_key = cursor.second;
+      const Status s = ApplyCompactCursors(level, smallest_uncompacted_key);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    return Status::OK();
+  }
+
+  // Helper function template for merging the blob file metadata from the base
+  // version with the mutable metadata representing the state after applying the
+  // edits. The function objects process_base and process_mutable are
+  // respectively called to handle a base version object when there is no
+  // matching mutable object, and a mutable object when there is no matching
+  // base version object. process_both is called to perform the merge when a
+  // given blob file appears both in the base version and the mutable list. The
+  // helper stops processing objects if a function object returns false. Blob
+  // files with a file number below first_blob_file are not processed.
+  template <typename ProcessBase, typename ProcessMutable, typename ProcessBoth>
+  void MergeBlobFileMetas(uint64_t first_blob_file, ProcessBase process_base,
+                          ProcessMutable process_mutable,
+                          ProcessBoth process_both) const {
+    assert(base_vstorage_);
+
+    auto base_it = base_vstorage_->GetBlobFileMetaDataLB(first_blob_file);
+    const auto base_it_end = base_vstorage_->GetBlobFiles().end();
+
+    auto mutable_it = mutable_blob_file_metas_.lower_bound(first_blob_file);
+    const auto mutable_it_end = mutable_blob_file_metas_.end();
+
+    while (base_it != base_it_end && mutable_it != mutable_it_end) {
+      const auto& base_meta = *base_it;
+      assert(base_meta);
+
+      const uint64_t base_blob_file_number = base_meta->GetBlobFileNumber();
+      const uint64_t mutable_blob_file_number = mutable_it->first;
+
+      if (base_blob_file_number < mutable_blob_file_number) {
+        if (!process_base(base_meta)) {
+          return;
+        }
+
+        ++base_it;
+      } else if (mutable_blob_file_number < base_blob_file_number) {
+        const auto& mutable_meta = mutable_it->second;
+
+        if (!process_mutable(mutable_meta)) {
+          return;
+        }
+
+        ++mutable_it;
+      } else {
+        assert(base_blob_file_number == mutable_blob_file_number);
+
+        const auto& mutable_meta = mutable_it->second;
+
+        if (!process_both(base_meta, mutable_meta)) {
+          return;
+        }
+
+        ++base_it;
+        ++mutable_it;
+      }
+    }
+
+    while (base_it != base_it_end) {
+      const auto& base_meta = *base_it;
+
+      if (!process_base(base_meta)) {
+        return;
+      }
+
+      ++base_it;
+    }
+
+    while (mutable_it != mutable_it_end) {
+      const auto& mutable_meta = mutable_it->second;
+
+      if (!process_mutable(mutable_meta)) {
+        return;
+      }
+
+      ++mutable_it;
+    }
+  }
+
+  // Helper function template for finding the first blob file that has linked
+  // SSTs.
+  template <typename Meta>
+  static bool CheckLinkedSsts(const Meta& meta,
+                              uint64_t* min_oldest_blob_file_num) {
+    assert(min_oldest_blob_file_num);
+
+    if (!meta.GetLinkedSsts().empty()) {
+      assert(*min_oldest_blob_file_num == kInvalidBlobFileNumber);
+
+      *min_oldest_blob_file_num = meta.GetBlobFileNumber();
+
+      return false;
+    }
+
+    return true;
+  }
+
+  // Find the oldest blob file that has linked SSTs.
+  uint64_t GetMinOldestBlobFileNumber() const {
+    uint64_t min_oldest_blob_file_num = kInvalidBlobFileNumber;
+
+    auto process_base =
+        [&min_oldest_blob_file_num](
+            const std::shared_ptr<BlobFileMetaData>& base_meta) {
+          assert(base_meta);
+
+          return CheckLinkedSsts(*base_meta, &min_oldest_blob_file_num);
+        };
+
+    auto process_mutable = [&min_oldest_blob_file_num](
+                               const MutableBlobFileMetaData& mutable_meta) {
+      return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+    };
+
+    auto process_both = [&min_oldest_blob_file_num](
+                            const std::shared_ptr<BlobFileMetaData>& base_meta,
+                            const MutableBlobFileMetaData& mutable_meta) {
+#ifndef NDEBUG
+      assert(base_meta);
+      assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+#else
+      (void)base_meta;
+#endif
+
+      // Look at mutable_meta since it supersedes *base_meta
+      return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+    };
+
+    MergeBlobFileMetas(kInvalidBlobFileNumber, process_base, process_mutable,
+                       process_both);
+
+    return min_oldest_blob_file_num;
+  }
+
+  static std::shared_ptr<BlobFileMetaData> CreateBlobFileMetaData(
+      const MutableBlobFileMetaData& mutable_meta) {
+    return BlobFileMetaData::Create(
+        mutable_meta.GetSharedMeta(), mutable_meta.GetLinkedSsts(),
+        mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes());
+  }
+
+  // Add the blob file specified by meta to *vstorage if it is determined to
+  // contain valid data (blobs).
+  template <typename Meta>
+  static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) {
+    assert(vstorage);
+    assert(meta);
+
+    if (meta->GetLinkedSsts().empty() &&
+        meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) {
+      return;
+    }
+
+    vstorage->AddBlobFile(std::forward<Meta>(meta));
+  }
+
+  // Merge the blob file metadata from the base version with the changes (edits)
+  // applied, and save the result into *vstorage.
+  void SaveBlobFilesTo(VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    assert(base_vstorage_);
+    vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() +
+                          mutable_blob_file_metas_.size());
+
+    const uint64_t oldest_blob_file_with_linked_ssts =
+        GetMinOldestBlobFileNumber();
+
+    auto process_base =
+        [vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
+          assert(base_meta);
+
+          AddBlobFileIfNeeded(vstorage, base_meta);
+
+          return true;
+        };
+
+    auto process_mutable =
+        [vstorage](const MutableBlobFileMetaData& mutable_meta) {
+          AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+          return true;
+        };
+
+    auto process_both = [vstorage](
+                            const std::shared_ptr<BlobFileMetaData>& base_meta,
+                            const MutableBlobFileMetaData& mutable_meta) {
+      assert(base_meta);
+      assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+
+      if (!mutable_meta.HasDelta()) {
+        assert(base_meta->GetGarbageBlobCount() ==
+               mutable_meta.GetGarbageBlobCount());
+        assert(base_meta->GetGarbageBlobBytes() ==
+               mutable_meta.GetGarbageBlobBytes());
+        assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts());
+
+        AddBlobFileIfNeeded(vstorage, base_meta);
+
+        return true;
+      }
+
+      AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+      return true;
+    };
+
+    MergeBlobFileMetas(oldest_blob_file_with_linked_ssts, process_base,
+                       process_mutable, process_both);
+  }
+
+  void MaybeAddFile(VersionStorageInfo* vstorage, int level,
+                    FileMetaData* f) const {
+    const uint64_t file_number = f->fd.GetNumber();
+
+    const auto& level_state = levels_[level];
+
+    const auto& del_files = level_state.deleted_files;
+    const auto del_it = del_files.find(file_number);
+
+    if (del_it != del_files.end()) {
+      // f is to-be-deleted table file
+      vstorage->RemoveCurrentStats(f);
+    } else {
+      const auto& add_files = level_state.added_files;
+      const auto add_it = add_files.find(file_number);
+
+      // Note: if the file appears both in the base version and in the added
+      // list, the added FileMetaData supersedes the one in the base version.
+      if (add_it != add_files.end() && add_it->second != f) {
+        vstorage->RemoveCurrentStats(f);
+      } else {
+        vstorage->AddFile(level, f);
+      }
+    }
+  }
+
+  template <typename Cmp>
+  void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const {
+    // Merge the set of added files with the set of pre-existing files.
+    // Drop any deleted files.  Store the result in *vstorage.
+    const auto& base_files = base_vstorage_->LevelFiles(level);
+    const auto& unordered_added_files = levels_[level].added_files;
+    vstorage->Reserve(level, base_files.size() + unordered_added_files.size());
+
+    // Sort added files for the level.
+    std::vector<FileMetaData*> added_files;
+    added_files.reserve(unordered_added_files.size());
+    for (const auto& pair : unordered_added_files) {
+      added_files.push_back(pair.second);
+    }
+    std::sort(added_files.begin(), added_files.end(), cmp);
+
+    auto base_iter = base_files.begin();
+    auto base_end = base_files.end();
+    auto added_iter = added_files.begin();
+    auto added_end = added_files.end();
+    while (added_iter != added_end || base_iter != base_end) {
+      if (base_iter == base_end ||
+          (added_iter != added_end && cmp(*added_iter, *base_iter))) {
+        MaybeAddFile(vstorage, level, *added_iter++);
+      } else {
+        MaybeAddFile(vstorage, level, *base_iter++);
+      }
+    }
+  }
+
+  void SaveSSTFilesTo(VersionStorageInfo* vstorage) const {
+    assert(vstorage);
+
+    if (!num_levels_) {
+      return;
+    }
+
+    SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_);
+
+    for (int level = 1; level < num_levels_; ++level) {
+      SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_);
+    }
+  }
+
+  void SaveCompactCursorsTo(VersionStorageInfo* vstorage) const {
+    for (auto iter = updated_compact_cursors_.begin();
+         iter != updated_compact_cursors_.end(); iter++) {
+      vstorage->AddCursorForOneLevel(iter->first, iter->second);
+    }
+  }
+
+  // Save the current state in *vstorage.
+  Status SaveTo(VersionStorageInfo* vstorage) const {
+    Status s;
+
+#ifndef NDEBUG
+    // The same check is done within Apply() so we skip it in release mode.
+    s = CheckConsistency(base_vstorage_);
+    if (!s.ok()) {
+      return s;
+    }
+#endif  // NDEBUG
+
+    s = CheckConsistency(vstorage);
+    if (!s.ok()) {
+      return s;
+    }
+
+    SaveSSTFilesTo(vstorage);
+
+    SaveBlobFilesTo(vstorage);
+
+    SaveCompactCursorsTo(vstorage);
+
+    s = CheckConsistency(vstorage);
+    return s;
+  }
+
+  Status LoadTableHandlers(
+      InternalStats* internal_stats, int max_threads,
+      bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      size_t max_file_size_for_l0_meta_pin) {
+    assert(table_cache_ != nullptr);
+
+    size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity();
+    bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity);
+    size_t max_load = std::numeric_limits<size_t>::max();
+
+    if (!always_load) {
+      // If it is initial loading and not set to always loading all the
+      // files, we only load up to kInitialLoadLimit files, to limit the
+      // time reopening the DB.
+      const size_t kInitialLoadLimit = 16;
+      size_t load_limit;
+      // If the table cache is not 1/4 full, we pin the table handle to
+      // file metadata to avoid the cache read costs when reading the file.
+      // The downside of pinning those files is that LRU won't be followed
+      // for those files. This doesn't matter much because if number of files
+      // of the DB excceeds table cache capacity, eventually no table reader
+      // will be pinned and LRU will be followed.
+      if (is_initial_load) {
+        load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4);
+      } else {
+        load_limit = table_cache_capacity / 4;
+      }
+
+      size_t table_cache_usage = table_cache_->get_cache()->GetUsage();
+      if (table_cache_usage >= load_limit) {
+        // TODO (yanqin) find a suitable status code.
+        return Status::OK();
+      } else {
+        max_load = load_limit - table_cache_usage;
+      }
+    }
+
+    // <file metadata, level>
+    std::vector<std::pair<FileMetaData*, int>> files_meta;
+    std::vector<Status> statuses;
+    for (int level = 0; level < num_levels_; level++) {
+      for (auto& file_meta_pair : levels_[level].added_files) {
+        auto* file_meta = file_meta_pair.second;
+        // If the file has been opened before, just skip it.
+        if (!file_meta->table_reader_handle) {
+          files_meta.emplace_back(file_meta, level);
+          statuses.emplace_back(Status::OK());
+        }
+        if (files_meta.size() >= max_load) {
+          break;
+        }
+      }
+      if (files_meta.size() >= max_load) {
+        break;
+      }
+    }
+
+    std::atomic<size_t> next_file_meta_idx(0);
+    std::function<void()> load_handlers_func([&]() {
+      while (true) {
+        size_t file_idx = next_file_meta_idx.fetch_add(1);
+        if (file_idx >= files_meta.size()) {
+          break;
+        }
+
+        auto* file_meta = files_meta[file_idx].first;
+        int level = files_meta[file_idx].second;
+        statuses[file_idx] = table_cache_->FindTable(
+            ReadOptions(), file_options_,
+            *(base_vstorage_->InternalComparator()), *file_meta,
+            &file_meta->table_reader_handle, prefix_extractor, false /*no_io */,
+            true /* record_read_stats */,
+            internal_stats->GetFileReadHist(level), false, level,
+            prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin,
+            file_meta->temperature);
+        if (file_meta->table_reader_handle != nullptr) {
+          // Load table_reader
+          file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
+              file_meta->table_reader_handle);
+        }
+      }
+    });
+
+    std::vector<port::Thread> threads;
+    for (int i = 1; i < max_threads; i++) {
+      threads.emplace_back(load_handlers_func);
+    }
+    load_handlers_func();
+    for (auto& t : threads) {
+      t.join();
+    }
+    Status ret;
+    for (const auto& s : statuses) {
+      if (!s.ok()) {
+        if (ret.ok()) {
+          ret = s;
+        }
+      }
+    }
+    return ret;
+  }
+};
+
+VersionBuilder::VersionBuilder(
+    const FileOptions& file_options, const ImmutableCFOptions* ioptions,
+    TableCache* table_cache, VersionStorageInfo* base_vstorage,
+    VersionSet* version_set,
+    std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
+    : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
+                   version_set, file_metadata_cache_res_mgr)) {}
+
+VersionBuilder::~VersionBuilder() = default;
+
+bool VersionBuilder::CheckConsistencyForNumLevels() {
+  return rep_->CheckConsistencyForNumLevels();
+}
+
+Status VersionBuilder::Apply(const VersionEdit* edit) {
+  return rep_->Apply(edit);
+}
+
+Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const {
+  return rep_->SaveTo(vstorage);
+}
+
+Status VersionBuilder::LoadTableHandlers(
+    InternalStats* internal_stats, int max_threads,
+    bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    size_t max_file_size_for_l0_meta_pin) {
+  return rep_->LoadTableHandlers(
+      internal_stats, max_threads, prefetch_index_and_filter_in_cache,
+      is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin);
+}
+
+uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
+  return rep_->GetMinOldestBlobFileNumber();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+    ColumnFamilyData* cfd)
+    : version_builder_(new VersionBuilder(
+          cfd->current()->version_set()->file_options(), cfd->ioptions(),
+          cfd->table_cache(), cfd->current()->storage_info(),
+          cfd->current()->version_set(),
+          cfd->GetFileMetadataCacheReservationManager())),
+      version_(cfd->current()) {
+  version_->Ref();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+    ColumnFamilyData* cfd, Version* v)
+    : version_builder_(new VersionBuilder(
+          cfd->current()->version_set()->file_options(), cfd->ioptions(),
+          cfd->table_cache(), v->storage_info(), v->version_set(),
+          cfd->GetFileMetadataCacheReservationManager())),
+      version_(v) {
+  assert(version_ != cfd->current());
+}
+
+BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() {
+  version_->Unref();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder.h b/src/rocksdb/db/version_builder.h
new file mode 100644
index 000000000..1c022832a
--- /dev/null
+++ b/src/rocksdb/db/version_builder.h
@@ -0,0 +1,72 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableCFOptions;
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+class InternalStats;
+class Version;
+class VersionSet;
+class ColumnFamilyData;
+class CacheReservationManager;
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionBuilder {
+ public:
+  VersionBuilder(const FileOptions& file_options,
+                 const ImmutableCFOptions* ioptions, TableCache* table_cache,
+                 VersionStorageInfo* base_vstorage, VersionSet* version_set,
+                 std::shared_ptr<CacheReservationManager>
+                     file_metadata_cache_res_mgr = nullptr);
+  ~VersionBuilder();
+
+  bool CheckConsistencyForNumLevels();
+  Status Apply(const VersionEdit* edit);
+  Status SaveTo(VersionStorageInfo* vstorage) const;
+  Status LoadTableHandlers(
+      InternalStats* internal_stats, int max_threads,
+      bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      size_t max_file_size_for_l0_meta_pin);
+  uint64_t GetMinOldestBlobFileNumber() const;
+
+ private:
+  class Rep;
+  std::unique_ptr<Rep> rep_;
+};
+
+// A wrapper of version builder which references the current version in
+// constructor and unref it in the destructor.
+// Both of the constructor and destructor need to be called inside DB Mutex.
+class BaseReferencedVersionBuilder {
+ public:
+  explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd);
+  BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v);
+  ~BaseReferencedVersionBuilder();
+  VersionBuilder* version_builder() const { return version_builder_.get(); }
+
+ private:
+  std::unique_ptr<VersionBuilder> version_builder_;
+  Version* version_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder_test.cc b/src/rocksdb/db/version_builder_test.cc
new file mode 100644
index 000000000..ee5c3f2e3
--- /dev/null
+++ b/src/rocksdb/db/version_builder_test.cc
@@ -0,0 +1,1695 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <cstring>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilderTest : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  VersionStorageInfo vstorage_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::vector<uint64_t> size_being_compacted_;
+
+  VersionBuilderTest()
+      : ucmp_(BytewiseComparator()),
+        icmp_(ucmp_),
+        ioptions_(options_),
+        mutable_cf_options_(options_),
+        vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+                  nullptr, false),
+        file_num_(1) {
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    size_being_compacted_.resize(options_.num_levels);
+  }
+
+  ~VersionBuilderTest() override {
+    for (int i = 0; i < vstorage_.num_levels(); i++) {
+      for (auto* f : vstorage_.LevelFiles(i)) {
+        if (--f->refs == 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  InternalKey GetInternalKey(const char* ukey,
+                             SequenceNumber smallest_seq = 100) {
+    return InternalKey(ukey, smallest_seq, kTypeValue);
+  }
+
+  void Add(int level, uint64_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+           uint64_t num_entries = 0, uint64_t num_deletions = 0,
+           bool sampled = false, SequenceNumber smallest_seqno = 0,
+           SequenceNumber largest_seqno = 0,
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+    assert(level < vstorage_.num_levels());
+    FileMetaData* f = new FileMetaData(
+        file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq),
+        GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
+        /* marked_for_compact */ false, Temperature::kUnknown,
+        oldest_blob_file_number, kUnknownOldestAncesterTime,
+        kUnknownFileCreationTime, kUnknownFileChecksum,
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    f->compensated_file_size = file_size;
+    f->num_entries = num_entries;
+    f->num_deletions = num_deletions;
+    vstorage_.AddFile(level, f);
+    if (sampled) {
+      f->init_stats_from_file = true;
+      vstorage_.UpdateAccumulatedStats(f);
+    }
+  }
+
+  void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
+               uint64_t total_blob_bytes, std::string checksum_method,
+               std::string checksum_value,
+               BlobFileMetaData::LinkedSsts linked_ssts,
+               uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, total_blob_count, total_blob_bytes,
+        std::move(checksum_method), std::move(checksum_value));
+    auto meta =
+        BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
+                                 garbage_blob_count, garbage_blob_bytes);
+
+    vstorage_.AddBlobFile(std::move(meta));
+  }
+
+  void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number) {
+    constexpr int level = 0;
+    constexpr char smallest[] = "bar";
+    constexpr char largest[] = "foo";
+    constexpr uint64_t file_size = 100;
+    constexpr uint32_t path_id = 0;
+    constexpr SequenceNumber smallest_seq = 0;
+    constexpr SequenceNumber largest_seq = 0;
+    constexpr uint64_t num_entries = 0;
+    constexpr uint64_t num_deletions = 0;
+    constexpr bool sampled = false;
+
+    Add(level, table_file_number, smallest, largest, file_size, path_id,
+        smallest_seq, largest_seq, num_entries, num_deletions, sampled,
+        smallest_seq, largest_seq, blob_file_number);
+  }
+
+  void AddDummyFileToEdit(VersionEdit* edit, uint64_t table_file_number,
+                          uint64_t blob_file_number) {
+    assert(edit);
+
+    constexpr int level = 0;
+    constexpr uint32_t path_id = 0;
+    constexpr uint64_t file_size = 100;
+    constexpr char smallest[] = "bar";
+    constexpr char largest[] = "foo";
+    constexpr SequenceNumber smallest_seqno = 100;
+    constexpr SequenceNumber largest_seqno = 300;
+    constexpr bool marked_for_compaction = false;
+
+    edit->AddFile(
+        level, table_file_number, path_id, file_size, GetInternalKey(smallest),
+        GetInternalKey(largest), smallest_seqno, largest_seqno,
+        marked_for_compaction, Temperature::kUnknown, blob_file_number,
+        kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  }
+
+  void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) {
+    assert(vstorage);
+
+    vstorage->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+    vstorage->SetFinalized();
+  }
+
+  void UpdateVersionStorageInfo() { UpdateVersionStorageInfo(&vstorage_); }
+};
+
+void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) {
+  for (int i = 0; i < new_vstorage->num_levels(); i++) {
+    for (auto* f : new_vstorage->LevelFiles(i)) {
+      if (--f->refs == 0) {
+        delete f;
+      }
+    }
+  }
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
+  Add(0, 1U, "150", "200", 100U);
+
+  Add(1, 66U, "150", "200", 100U);
+  Add(1, 88U, "201", "300", 100U);
+
+  Add(2, 6U, "150", "179", 100U);
+  Add(2, 7U, "180", "220", 100U);
+  Add(2, 8U, "221", "300", 100U);
+
+  Add(3, 26U, "150", "170", 100U);
+  Add(3, 27U, "171", "179", 100U);
+  Add(3, 28U, "191", "220", 100U);
+  Add(3, 29U, "221", "300", 100U);
+
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(
+      2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.DeleteFile(3, 27U);
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+                                 &vstorage_, version_set);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+  ASSERT_OK(version_builder.Apply(&version_edit));
+  ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2));
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+
+  Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+  Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+  Add(4, 6U, "150", "179", 100U);
+  Add(4, 7U, "180", "220", 100U);
+  Add(4, 8U, "221", "300", 100U);
+
+  Add(5, 26U, "150", "170", 100U);
+  Add(5, 27U, "171", "179", 100U);
+
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(
+      3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.DeleteFile(0, 1U);
+  version_edit.DeleteFile(0, 88U);
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+                                 &vstorage_, version_set);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+  ASSERT_OK(version_builder.Apply(&version_edit));
+  ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+  ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3));
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(4));
+  ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+
+  Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+  Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+  Add(4, 6U, "150", "179", 100U);
+  Add(4, 7U, "180", "220", 100U);
+  Add(4, 8U, "221", "300", 100U);
+
+  Add(5, 26U, "150", "170", 100U);
+  Add(5, 27U, "171", "179", 100U);
+
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(
+      4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.DeleteFile(0, 1U);
+  version_edit.DeleteFile(0, 88U);
+  version_edit.DeleteFile(4, 6U);
+  version_edit.DeleteFile(4, 7U);
+  version_edit.DeleteFile(4, 8U);
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+                                 &vstorage_, version_set);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+  ASSERT_OK(version_builder.Apply(&version_edit));
+  ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+  ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4));
+  ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(
+      2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.AddFile(
+      2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.AddFile(
+      2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.AddFile(
+      2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.AddFile(
+      2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+                                 &vstorage_, version_set);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+  ASSERT_OK(version_builder.Apply(&version_edit));
+  ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+                                 &vstorage_, version_set);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr, false);
+
+  VersionEdit version_edit;
+  version_edit.AddFile(
+      2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.AddFile(
+      2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.AddFile(
+      2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.AddFile(
+      2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit.AddFile(
+      2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  ASSERT_OK(version_builder.Apply(&version_edit));
+
+  VersionEdit version_edit2;
+  version_edit.AddFile(
+      2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  version_edit2.DeleteFile(2, 616);
+  version_edit2.DeleteFile(2, 636);
+  version_edit.AddFile(
+      2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200,
+      false, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  ASSERT_OK(version_builder.Apply(&version_edit2));
+  ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyFileDeletionIncorrectLevel) {
+  constexpr int level = 1;
+  constexpr uint64_t file_number = 2345;
+  constexpr char smallest[] = "bar";
+  constexpr char largest[] = "foo";
+  constexpr uint64_t file_size = 100;
+
+  Add(level, file_number, smallest, largest, file_size);
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  constexpr int incorrect_level = 3;
+
+  edit.DeleteFile(incorrect_level, file_number);
+
+  const Status s = builder.Apply(&edit);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(),
+                          "Cannot delete table file #2345 from level 3 since "
+                          "it is on level 1"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileDeletionNotInLSMTree) {
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  constexpr int level = 3;
+  constexpr uint64_t file_number = 1234;
+
+  edit.DeleteFile(level, file_number);
+
+  const Status s = builder.Apply(&edit);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(),
+                          "Cannot delete table file #1234 from level 3 since "
+                          "it is not in the LSM tree"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) {
+  constexpr int level = 1;
+  constexpr uint64_t file_number = 2345;
+  constexpr char smallest[] = "bar";
+  constexpr char largest[] = "foo";
+  constexpr uint64_t file_size = 10000;
+  constexpr uint32_t path_id = 0;
+  constexpr SequenceNumber smallest_seq = 100;
+  constexpr SequenceNumber largest_seq = 500;
+  constexpr uint64_t num_entries = 0;
+  constexpr uint64_t num_deletions = 0;
+  constexpr bool sampled = false;
+  constexpr SequenceNumber smallest_seqno = 1;
+  constexpr SequenceNumber largest_seqno = 1000;
+
+  Add(level, file_number, smallest, largest, file_size, path_id, smallest_seq,
+      largest_seq, num_entries, num_deletions, sampled, smallest_seqno,
+      largest_seqno);
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit deletion;
+
+  deletion.DeleteFile(level, file_number);
+
+  ASSERT_OK(builder.Apply(&deletion));
+
+  VersionEdit addition;
+
+  constexpr bool marked_for_compaction = false;
+
+  addition.AddFile(level, file_number, path_id, file_size,
+                   GetInternalKey(smallest, smallest_seq),
+                   GetInternalKey(largest, largest_seq), smallest_seqno,
+                   largest_seqno, marked_for_compaction, Temperature::kUnknown,
+                   kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+                   kUnknownFileCreationTime, kUnknownFileChecksum,
+                   kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  ASSERT_OK(builder.Apply(&addition));
+
+  constexpr bool force_consistency_checks = false;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level);
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) {
+  constexpr int level = 1;
+  constexpr uint64_t file_number = 2345;
+  constexpr char smallest[] = "bar";
+  constexpr char largest[] = "foo";
+  constexpr uint64_t file_size = 10000;
+
+  Add(level, file_number, smallest, largest, file_size);
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  constexpr int new_level = 2;
+  constexpr uint32_t path_id = 0;
+  constexpr SequenceNumber smallest_seqno = 100;
+  constexpr SequenceNumber largest_seqno = 1000;
+  constexpr bool marked_for_compaction = false;
+
+  edit.AddFile(
+      new_level, file_number, path_id, file_size, GetInternalKey(smallest),
+      GetInternalKey(largest), smallest_seqno, largest_seqno,
+      marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  const Status s = builder.Apply(&edit);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(),
+                          "Cannot add table file #2345 to level 2 since it is "
+                          "already in the LSM tree on level 1"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) {
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  constexpr int level = 3;
+  constexpr uint64_t file_number = 2345;
+  constexpr uint32_t path_id = 0;
+  constexpr uint64_t file_size = 10000;
+  constexpr char smallest[] = "bar";
+  constexpr char largest[] = "foo";
+  constexpr SequenceNumber smallest_seqno = 100;
+  constexpr SequenceNumber largest_seqno = 1000;
+  constexpr bool marked_for_compaction = false;
+
+  edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest),
+               GetInternalKey(largest), smallest_seqno, largest_seqno,
+               marked_for_compaction, Temperature::kUnknown,
+               kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+               kUnknownFileCreationTime, kUnknownFileChecksum,
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  ASSERT_OK(builder.Apply(&edit));
+
+  VersionEdit other_edit;
+
+  constexpr int new_level = 2;
+
+  other_edit.AddFile(
+      new_level, file_number, path_id, file_size, GetInternalKey(smallest),
+      GetInternalKey(largest), smallest_seqno, largest_seqno,
+      marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  const Status s = builder.Apply(&other_edit);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(),
+                          "Cannot add table file #2345 to level 2 since it is "
+                          "already in the LSM tree on level 3"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) {
+  UpdateVersionStorageInfo();
+
+  constexpr int level = 1;
+  constexpr uint64_t file_number = 2345;
+  constexpr uint32_t path_id = 0;
+  constexpr uint64_t file_size = 10000;
+  constexpr char smallest[] = "bar";
+  constexpr char largest[] = "foo";
+  constexpr SequenceNumber smallest_seqno = 100;
+  constexpr SequenceNumber largest_seqno = 1000;
+  constexpr bool marked_for_compaction = false;
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit addition;
+
+  addition.AddFile(
+      level, file_number, path_id, file_size, GetInternalKey(smallest),
+      GetInternalKey(largest), smallest_seqno, largest_seqno,
+      marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+      kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+      kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  ASSERT_OK(builder.Apply(&addition));
+
+  VersionEdit deletion;
+
+  deletion.DeleteFile(level, file_number);
+
+  ASSERT_OK(builder.Apply(&deletion));
+
+  constexpr bool force_consistency_checks = false;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  ASSERT_FALSE(new_vstorage.GetFileLocation(file_number).IsValid());
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileAddition) {
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  constexpr uint64_t blob_file_number = 1234;
+  constexpr uint64_t total_blob_count = 5678;
+  constexpr uint64_t total_blob_bytes = 999999;
+  constexpr char checksum_method[] = "SHA1";
+  constexpr char checksum_value[] =
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd";
+
+  edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+                   checksum_method, checksum_value);
+
+  // Add dummy table file to ensure the blob file is referenced.
+  constexpr uint64_t table_file_number = 1;
+  AddDummyFileToEdit(&edit, table_file_number, blob_file_number);
+
+  ASSERT_OK(builder.Apply(&edit));
+
+  constexpr bool force_consistency_checks = false;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  const auto& new_blob_files = new_vstorage.GetBlobFiles();
+  ASSERT_EQ(new_blob_files.size(), 1);
+
+  const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+  ASSERT_NE(new_meta, nullptr);
+  ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count);
+  ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes);
+  ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method);
+  ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value);
+  ASSERT_EQ(new_meta->GetLinkedSsts(),
+            BlobFileMetaData::LinkedSsts{table_file_number});
+  ASSERT_EQ(new_meta->GetGarbageBlobCount(), 0);
+  ASSERT_EQ(new_meta->GetGarbageBlobBytes(), 0);
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyInBase) {
+  // Attempt to add a blob file that is already present in the base version.
+
+  constexpr uint64_t blob_file_number = 1234;
+  constexpr uint64_t total_blob_count = 5678;
+  constexpr uint64_t total_blob_bytes = 999999;
+  constexpr char checksum_method[] = "SHA1";
+  constexpr char checksum_value[] =
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd";
+  constexpr uint64_t garbage_blob_count = 123;
+  constexpr uint64_t garbage_blob_bytes = 456789;
+
+  AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+          checksum_value, BlobFileMetaData::LinkedSsts(), garbage_blob_count,
+          garbage_blob_bytes);
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+                   checksum_method, checksum_value);
+
+  const Status s = builder.Apply(&edit);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added"));
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyApplied) {
+  // Attempt to add the same blob file twice using version edits.
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  constexpr uint64_t blob_file_number = 1234;
+  constexpr uint64_t total_blob_count = 5678;
+  constexpr uint64_t total_blob_bytes = 999999;
+  constexpr char checksum_method[] = "SHA1";
+  constexpr char checksum_value[] =
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd";
+
+  edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+                   checksum_method, checksum_value);
+
+  ASSERT_OK(builder.Apply(&edit));
+
+  const Status s = builder.Apply(&edit);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added"));
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileInBase) {
+  // Increase the amount of garbage for a blob file present in the base version.
+
+  constexpr uint64_t table_file_number = 1;
+  constexpr uint64_t blob_file_number = 1234;
+  constexpr uint64_t total_blob_count = 5678;
+  constexpr uint64_t total_blob_bytes = 999999;
+  constexpr char checksum_method[] = "SHA1";
+  constexpr char checksum_value[] =
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd";
+  constexpr uint64_t garbage_blob_count = 123;
+  constexpr uint64_t garbage_blob_bytes = 456789;
+
+  AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+          checksum_value, BlobFileMetaData::LinkedSsts{table_file_number},
+          garbage_blob_count, garbage_blob_bytes);
+
+  const auto meta = vstorage_.GetBlobFileMetaData(blob_file_number);
+  ASSERT_NE(meta, nullptr);
+
+  // Add dummy table file to ensure the blob file is referenced.
+  AddDummyFile(table_file_number, blob_file_number);
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  constexpr uint64_t new_garbage_blob_count = 456;
+  constexpr uint64_t new_garbage_blob_bytes = 111111;
+
+  edit.AddBlobFileGarbage(blob_file_number, new_garbage_blob_count,
+                          new_garbage_blob_bytes);
+
+  ASSERT_OK(builder.Apply(&edit));
+
+  constexpr bool force_consistency_checks = false;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  const auto& new_blob_files = new_vstorage.GetBlobFiles();
+  ASSERT_EQ(new_blob_files.size(), 1);
+
+  const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+  ASSERT_NE(new_meta, nullptr);
+  ASSERT_EQ(new_meta->GetSharedMeta(), meta->GetSharedMeta());
+  ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count);
+  ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes);
+  ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method);
+  ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value);
+  ASSERT_EQ(new_meta->GetLinkedSsts(),
+            BlobFileMetaData::LinkedSsts{table_file_number});
+  ASSERT_EQ(new_meta->GetGarbageBlobCount(),
+            garbage_blob_count + new_garbage_blob_count);
+  ASSERT_EQ(new_meta->GetGarbageBlobBytes(),
+            garbage_blob_bytes + new_garbage_blob_bytes);
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileAdditionApplied) {
+  // Increase the amount of garbage for a blob file added using a version edit.
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit addition;
+
+  constexpr uint64_t blob_file_number = 1234;
+  constexpr uint64_t total_blob_count = 5678;
+  constexpr uint64_t total_blob_bytes = 999999;
+  constexpr char checksum_method[] = "SHA1";
+  constexpr char checksum_value[] =
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd";
+
+  addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+                       checksum_method, checksum_value);
+
+  // Add dummy table file to ensure the blob file is referenced.
+  constexpr uint64_t table_file_number = 1;
+  AddDummyFileToEdit(&addition, table_file_number, blob_file_number);
+
+  ASSERT_OK(builder.Apply(&addition));
+
+  constexpr uint64_t garbage_blob_count = 123;
+  constexpr uint64_t garbage_blob_bytes = 456789;
+
+  VersionEdit garbage;
+
+  garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+                             garbage_blob_bytes);
+
+  ASSERT_OK(builder.Apply(&garbage));
+
+  constexpr bool force_consistency_checks = false;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  const auto& new_blob_files = new_vstorage.GetBlobFiles();
+  ASSERT_EQ(new_blob_files.size(), 1);
+
+  const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+  ASSERT_NE(new_meta, nullptr);
+  ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count);
+  ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes);
+  ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method);
+  ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value);
+  ASSERT_EQ(new_meta->GetLinkedSsts(),
+            BlobFileMetaData::LinkedSsts{table_file_number});
+  ASSERT_EQ(new_meta->GetGarbageBlobCount(), garbage_blob_count);
+  ASSERT_EQ(new_meta->GetGarbageBlobBytes(), garbage_blob_bytes);
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileNotFound) {
+  // Attempt to increase the amount of garbage for a blob file that is
+  // neither in the base version, nor was it added using a version edit.
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  constexpr uint64_t blob_file_number = 1234;
+  constexpr uint64_t garbage_blob_count = 5678;
+  constexpr uint64_t garbage_blob_bytes = 999999;
+
+  edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+                          garbage_blob_bytes);
+
+  const Status s = builder.Apply(&edit);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 not found"));
+}
+
+TEST_F(VersionBuilderTest, BlobFileGarbageOverflow) {
+  // Test that VersionEdits that would result in the count/total size of garbage
+  // exceeding the count/total size of all blobs are rejected.
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit addition;
+
+  constexpr uint64_t blob_file_number = 1234;
+  constexpr uint64_t total_blob_count = 5678;
+  constexpr uint64_t total_blob_bytes = 999999;
+  constexpr char checksum_method[] = "SHA1";
+  constexpr char checksum_value[] =
+      "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+      "\x5c\xbd";
+
+  addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+                       checksum_method, checksum_value);
+
+  // Add dummy table file to ensure the blob file is referenced.
+  constexpr uint64_t table_file_number = 1;
+  AddDummyFileToEdit(&addition, table_file_number, blob_file_number);
+
+  ASSERT_OK(builder.Apply(&addition));
+
+  {
+    // Garbage blob count overflow
+    constexpr uint64_t garbage_blob_count = 5679;
+    constexpr uint64_t garbage_blob_bytes = 999999;
+
+    VersionEdit garbage;
+
+    garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+                               garbage_blob_bytes);
+
+    const Status s = builder.Apply(&garbage);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(
+        std::strstr(s.getState(), "Garbage overflow for blob file #1234"));
+  }
+
+  {
+    // Garbage blob bytes overflow
+    constexpr uint64_t garbage_blob_count = 5678;
+    constexpr uint64_t garbage_blob_bytes = 1000000;
+
+    VersionEdit garbage;
+
+    garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+                               garbage_blob_bytes);
+
+    const Status s = builder.Apply(&garbage);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(
+        std::strstr(s.getState(), "Garbage overflow for blob file #1234"));
+  }
+}
+
+TEST_F(VersionBuilderTest, SaveBlobFilesTo) {
+  // Add three blob files to base version.
+  for (uint64_t i = 1; i <= 3; ++i) {
+    const uint64_t table_file_number = 2 * i;
+    const uint64_t blob_file_number = 2 * i + 1;
+    const uint64_t total_blob_count = i * 1000;
+    const uint64_t total_blob_bytes = i * 1000000;
+    const uint64_t garbage_blob_count = i * 100;
+    const uint64_t garbage_blob_bytes = i * 20000;
+
+    AddBlob(blob_file_number, total_blob_count, total_blob_bytes,
+            /* checksum_method */ std::string(),
+            /* checksum_value */ std::string(),
+            BlobFileMetaData::LinkedSsts{table_file_number}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  // Add dummy table files to ensure the blob files are referenced.
+  // Note: files are added to L0, so they have to be added in reverse order
+  // (newest first).
+  for (uint64_t i = 3; i >= 1; --i) {
+    const uint64_t table_file_number = 2 * i;
+    const uint64_t blob_file_number = 2 * i + 1;
+
+    AddDummyFile(table_file_number, blob_file_number);
+  }
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  // Add some garbage to the second and third blob files. The second blob file
+  // remains valid since it does not consist entirely of garbage yet. The third
+  // blob file is all garbage after the edit and will not be part of the new
+  // version. The corresponding dummy table file is also removed for
+  // consistency.
+  edit.AddBlobFileGarbage(/* blob_file_number */ 5,
+                          /* garbage_blob_count */ 200,
+                          /* garbage_blob_bytes */ 100000);
+  edit.AddBlobFileGarbage(/* blob_file_number */ 7,
+                          /* garbage_blob_count */ 2700,
+                          /* garbage_blob_bytes */ 2940000);
+  edit.DeleteFile(/* level */ 0, /* file_number */ 6);
+
+  // Add a fourth blob file.
+  edit.AddBlobFile(/* blob_file_number */ 9, /* total_blob_count */ 4000,
+                   /* total_blob_bytes */ 4000000,
+                   /* checksum_method */ std::string(),
+                   /* checksum_value */ std::string());
+
+  ASSERT_OK(builder.Apply(&edit));
+
+  constexpr bool force_consistency_checks = false;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  const auto& new_blob_files = new_vstorage.GetBlobFiles();
+  ASSERT_EQ(new_blob_files.size(), 3);
+
+  const auto meta3 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3);
+
+  ASSERT_NE(meta3, nullptr);
+  ASSERT_EQ(meta3->GetBlobFileNumber(), 3);
+  ASSERT_EQ(meta3->GetTotalBlobCount(), 1000);
+  ASSERT_EQ(meta3->GetTotalBlobBytes(), 1000000);
+  ASSERT_EQ(meta3->GetGarbageBlobCount(), 100);
+  ASSERT_EQ(meta3->GetGarbageBlobBytes(), 20000);
+
+  const auto meta5 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 5);
+
+  ASSERT_NE(meta5, nullptr);
+  ASSERT_EQ(meta5->GetBlobFileNumber(), 5);
+  ASSERT_EQ(meta5->GetTotalBlobCount(), 2000);
+  ASSERT_EQ(meta5->GetTotalBlobBytes(), 2000000);
+  ASSERT_EQ(meta5->GetGarbageBlobCount(), 400);
+  ASSERT_EQ(meta5->GetGarbageBlobBytes(), 140000);
+
+  const auto meta9 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 9);
+
+  ASSERT_NE(meta9, nullptr);
+  ASSERT_EQ(meta9->GetBlobFileNumber(), 9);
+  ASSERT_EQ(meta9->GetTotalBlobCount(), 4000);
+  ASSERT_EQ(meta9->GetTotalBlobBytes(), 4000000);
+  ASSERT_EQ(meta9->GetGarbageBlobCount(), 0);
+  ASSERT_EQ(meta9->GetGarbageBlobBytes(), 0);
+
+  // Delete the first table file, which makes the first blob file obsolete
+  // since it's at the head and unreferenced.
+  VersionBuilder second_builder(env_options, &ioptions_, table_cache,
+                                &new_vstorage, version_set);
+
+  VersionEdit second_edit;
+  second_edit.DeleteFile(/* level */ 0, /* file_number */ 2);
+
+  ASSERT_OK(second_builder.Apply(&second_edit));
+
+  VersionStorageInfo newer_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                    kCompactionStyleLevel, &new_vstorage,
+                                    force_consistency_checks);
+
+  ASSERT_OK(second_builder.SaveTo(&newer_vstorage));
+
+  UpdateVersionStorageInfo(&newer_vstorage);
+
+  const auto& newer_blob_files = newer_vstorage.GetBlobFiles();
+  ASSERT_EQ(newer_blob_files.size(), 2);
+
+  const auto newer_meta3 =
+      newer_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3);
+
+  ASSERT_EQ(newer_meta3, nullptr);
+
+  UnrefFilesInVersion(&newer_vstorage);
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) {
+  // When multiple background jobs (flushes/compactions) are executing in
+  // parallel, it is possible for the VersionEdit adding blob file K to be
+  // applied *after* the VersionEdit adding blob file N (for N > K). This test
+  // case makes sure this is handled correctly.
+
+  // Add blob file #4 (referenced by table file #3) to base version.
+  constexpr uint64_t base_table_file_number = 3;
+  constexpr uint64_t base_blob_file_number = 4;
+  constexpr uint64_t base_total_blob_count = 100;
+  constexpr uint64_t base_total_blob_bytes = 1 << 20;
+
+  constexpr char checksum_method[] = "SHA1";
+  constexpr char checksum_value[] = "\xfa\xce\xb0\x0c";
+  constexpr uint64_t garbage_blob_count = 0;
+  constexpr uint64_t garbage_blob_bytes = 0;
+
+  AddDummyFile(base_table_file_number, base_blob_file_number);
+  AddBlob(base_blob_file_number, base_total_blob_count, base_total_blob_bytes,
+          checksum_method, checksum_value,
+          BlobFileMetaData::LinkedSsts{base_table_file_number},
+          garbage_blob_count, garbage_blob_bytes);
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  // Add blob file #2 (referenced by table file #1).
+  constexpr int level = 0;
+  constexpr uint64_t table_file_number = 1;
+  constexpr uint32_t path_id = 0;
+  constexpr uint64_t file_size = 1 << 12;
+  constexpr char smallest[] = "key1";
+  constexpr char largest[] = "key987";
+  constexpr SequenceNumber smallest_seqno = 0;
+  constexpr SequenceNumber largest_seqno = 0;
+  constexpr bool marked_for_compaction = false;
+
+  constexpr uint64_t blob_file_number = 2;
+  static_assert(blob_file_number < base_blob_file_number,
+                "Added blob file should have a smaller file number");
+
+  constexpr uint64_t total_blob_count = 234;
+  constexpr uint64_t total_blob_bytes = 1 << 22;
+
+  edit.AddFile(level, table_file_number, path_id, file_size,
+               GetInternalKey(smallest), GetInternalKey(largest),
+               smallest_seqno, largest_seqno, marked_for_compaction,
+               Temperature::kUnknown, blob_file_number,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               checksum_value, checksum_method, kNullUniqueId64x2);
+  edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+                   checksum_method, checksum_value);
+
+  ASSERT_OK(builder.Apply(&edit));
+
+  constexpr bool force_consistency_checks = true;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  const auto& new_blob_files = new_vstorage.GetBlobFiles();
+  ASSERT_EQ(new_blob_files.size(), 2);
+
+  const auto base_meta =
+      new_vstorage.GetBlobFileMetaData(base_blob_file_number);
+
+  ASSERT_NE(base_meta, nullptr);
+  ASSERT_EQ(base_meta->GetBlobFileNumber(), base_blob_file_number);
+  ASSERT_EQ(base_meta->GetTotalBlobCount(), base_total_blob_count);
+  ASSERT_EQ(base_meta->GetTotalBlobBytes(), base_total_blob_bytes);
+  ASSERT_EQ(base_meta->GetGarbageBlobCount(), garbage_blob_count);
+  ASSERT_EQ(base_meta->GetGarbageBlobBytes(), garbage_blob_bytes);
+  ASSERT_EQ(base_meta->GetChecksumMethod(), checksum_method);
+  ASSERT_EQ(base_meta->GetChecksumValue(), checksum_value);
+
+  const auto added_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+  ASSERT_NE(added_meta, nullptr);
+  ASSERT_EQ(added_meta->GetBlobFileNumber(), blob_file_number);
+  ASSERT_EQ(added_meta->GetTotalBlobCount(), total_blob_count);
+  ASSERT_EQ(added_meta->GetTotalBlobBytes(), total_blob_bytes);
+  ASSERT_EQ(added_meta->GetGarbageBlobCount(), garbage_blob_count);
+  ASSERT_EQ(added_meta->GetGarbageBlobBytes(), garbage_blob_bytes);
+  ASSERT_EQ(added_meta->GetChecksumMethod(), checksum_method);
+  ASSERT_EQ(added_meta->GetChecksumValue(), checksum_value);
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) {
+  // Initialize base version. The first table file points to a valid blob file
+  // in this version; the second one does not refer to any blob files.
+
+  Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+      /* largest */ "200", /* file_size */ 100,
+      /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+      /* num_entries */ 0, /* num_deletions */ 0,
+      /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+      /* oldest_blob_file_number */ 16);
+  Add(/* level */ 1, /* file_number */ 23, /* smallest */ "201",
+      /* largest */ "300", /* file_size */ 100,
+      /* path_id */ 0, /* smallest_seq */ 200, /* largest_seq */ 200,
+      /* num_entries */ 0, /* num_deletions */ 0,
+      /* sampled */ false, /* smallest_seqno */ 200, /* largest_seqno */ 200,
+      kInvalidBlobFileNumber);
+
+  AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+          /* total_blob_bytes */ 1000000,
+          /* checksum_method */ std::string(),
+          /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+          /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000);
+
+  UpdateVersionStorageInfo();
+
+  // Add a new table file that points to the existing blob file, and add a
+  // new table file--blob file pair.
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  edit.AddFile(/* level */ 1, /* file_number */ 606, /* path_id */ 0,
+               /* file_size */ 100, /* smallest */ GetInternalKey("701"),
+               /* largest */ GetInternalKey("750"), /* smallest_seqno */ 200,
+               /* largest_seqno */ 200, /* marked_for_compaction */ false,
+               Temperature::kUnknown,
+               /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime,
+               kUnknownFileCreationTime, kUnknownFileChecksum,
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0,
+               /* file_size */ 100, /* smallest */ GetInternalKey("801"),
+               /* largest */ GetInternalKey("850"), /* smallest_seqno */ 200,
+               /* largest_seqno */ 200, /* marked_for_compaction */ false,
+               Temperature::kUnknown,
+               /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime,
+               kUnknownFileCreationTime, kUnknownFileChecksum,
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+  edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000,
+                   /* total_blob_bytes */ 200000,
+                   /* checksum_method */ std::string(),
+                   /* checksum_value */ std::string());
+
+  ASSERT_OK(builder.Apply(&edit));
+
+  // Save to a new version in order to trigger consistency checks.
+  constexpr bool force_consistency_checks = true;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesInconsistentLinks) {
+  // Initialize base version. Links between the table file and the blob file
+  // are inconsistent.
+
+  Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+      /* largest */ "200", /* file_size */ 100,
+      /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+      /* num_entries */ 0, /* num_deletions */ 0,
+      /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+      /* oldest_blob_file_number */ 256);
+
+  AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+          /* total_blob_bytes */ 1000000,
+          /* checksum_method */ std::string(),
+          /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+          /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000);
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  // Save to a new version in order to trigger consistency checks.
+  constexpr bool force_consistency_checks = true;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  const Status s = builder.SaveTo(&new_vstorage);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(
+      s.getState(),
+      "Links are inconsistent between table files and blob file #16"));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbage) {
+  // Initialize base version. The table file points to a blob file that is
+  // all garbage.
+
+  Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+      /* largest */ "200", /* file_size */ 100,
+      /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+      /* num_entries */ 0, /* num_deletions */ 0,
+      /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+      /* oldest_blob_file_number */ 16);
+
+  AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+          /* total_blob_bytes */ 1000000,
+          /* checksum_method */ std::string(),
+          /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+          /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000);
+
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  // Save to a new version in order to trigger consistency checks.
+  constexpr bool force_consistency_checks = true;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  const Status s = builder.SaveTo(&new_vstorage);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(
+      std::strstr(s.getState(), "Blob file #16 consists entirely of garbage"));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbageLinkedSsts) {
+  // Initialize base version, with a table file pointing to a blob file
+  // that has no garbage at this point.
+
+  Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+      /* largest */ "200", /* file_size */ 100,
+      /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+      /* num_entries */ 0, /* num_deletions */ 0,
+      /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+      /* oldest_blob_file_number */ 16);
+
+  AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+          /* total_blob_bytes */ 1000000,
+          /* checksum_method */ std::string(),
+          /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+          /* garbage_blob_count */ 0, /* garbage_blob_bytes */ 0);
+
+  UpdateVersionStorageInfo();
+
+  // Mark the entire blob file garbage but do not remove the linked SST.
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  VersionEdit edit;
+
+  edit.AddBlobFileGarbage(/* blob_file_number */ 16,
+                          /* garbage_blob_count */ 1000,
+                          /* garbage_blob_bytes */ 1000000);
+
+  ASSERT_OK(builder.Apply(&edit));
+
+  // Save to a new version in order to trigger consistency checks.
+  constexpr bool force_consistency_checks = true;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  const Status s = builder.SaveTo(&new_vstorage);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(
+      std::strstr(s.getState(), "Blob file #16 consists entirely of garbage"));
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
+  // Initialize base version. Table files 1..10 are linked to blob files 1..5,
+  // while table files 11..20 are not linked to any blob files.
+
+  for (uint64_t i = 1; i <= 10; ++i) {
+    std::ostringstream oss;
+    oss << std::setw(2) << std::setfill('0') << i;
+
+    const std::string key = oss.str();
+
+    Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(),
+        /* largest */ key.c_str(), /* file_size */ 100,
+        /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100,
+        /* num_entries */ 0, /* num_deletions */ 0,
+        /* sampled */ false, /* smallest_seqno */ i * 100,
+        /* largest_seqno */ i * 100,
+        /* oldest_blob_file_number */ ((i - 1) % 5) + 1);
+  }
+
+  for (uint64_t i = 1; i <= 5; ++i) {
+    AddBlob(/* blob_file_number */ i, /* total_blob_count */ 2000,
+            /* total_blob_bytes */ 2000000,
+            /* checksum_method */ std::string(),
+            /* checksum_value */ std::string(),
+            BlobFileMetaData::LinkedSsts{i, i + 5},
+            /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000);
+  }
+
+  for (uint64_t i = 11; i <= 20; ++i) {
+    std::ostringstream oss;
+    oss << std::setw(2) << std::setfill('0') << i;
+
+    const std::string key = oss.str();
+
+    Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(),
+        /* largest */ key.c_str(), /* file_size */ 100,
+        /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100,
+        /* num_entries */ 0, /* num_deletions */ 0,
+        /* sampled */ false, /* smallest_seqno */ i * 100,
+        /* largest_seqno */ i * 100, kInvalidBlobFileNumber);
+  }
+
+  UpdateVersionStorageInfo();
+
+  {
+    const auto& blob_files = vstorage_.GetBlobFiles();
+    ASSERT_EQ(blob_files.size(), 5);
+
+    const std::vector<BlobFileMetaData::LinkedSsts> expected_linked_ssts{
+        {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+
+    for (size_t i = 0; i < 5; ++i) {
+      const auto meta =
+          vstorage_.GetBlobFileMetaData(/* blob_file_number */ i + 1);
+      ASSERT_NE(meta, nullptr);
+      ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]);
+    }
+  }
+
+  VersionEdit edit;
+
+  // Add an SST that references a blob file.
+  edit.AddFile(
+      /* level */ 1, /* file_number */ 21, /* path_id */ 0,
+      /* file_size */ 100, /* smallest */ GetInternalKey("21", 2100),
+      /* largest */ GetInternalKey("21", 2100), /* smallest_seqno */ 2100,
+      /* largest_seqno */ 2100, /* marked_for_compaction */ false,
+      Temperature::kUnknown,
+      /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime,
+      kUnknownFileCreationTime, kUnknownFileChecksum,
+      kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  // Add an SST that does not reference any blob files.
+  edit.AddFile(
+      /* level */ 1, /* file_number */ 22, /* path_id */ 0,
+      /* file_size */ 100, /* smallest */ GetInternalKey("22", 2200),
+      /* largest */ GetInternalKey("22", 2200), /* smallest_seqno */ 2200,
+      /* largest_seqno */ 2200, /* marked_for_compaction */ false,
+      Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+      kUnknownFileCreationTime, kUnknownFileChecksum,
+      kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  // Delete a file that references a blob file.
+  edit.DeleteFile(/* level */ 1, /* file_number */ 6);
+
+  // Delete a file that does not reference any blob files.
+  edit.DeleteFile(/* level */ 1, /* file_number */ 16);
+
+  // Trivially move a file that references a blob file. Note that we save
+  // the original BlobFileMetaData object so we can check that no new object
+  // gets created.
+  auto meta3 = vstorage_.GetBlobFileMetaData(/* blob_file_number */ 3);
+
+  edit.DeleteFile(/* level */ 1, /* file_number */ 3);
+  edit.AddFile(/* level */ 2, /* file_number */ 3, /* path_id */ 0,
+               /* file_size */ 100, /* smallest */ GetInternalKey("03", 300),
+               /* largest */ GetInternalKey("03", 300),
+               /* smallest_seqno */ 300,
+               /* largest_seqno */ 300, /* marked_for_compaction */ false,
+               Temperature::kUnknown,
+               /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime,
+               kUnknownFileCreationTime, kUnknownFileChecksum,
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  // Trivially move a file that does not reference any blob files.
+  edit.DeleteFile(/* level */ 1, /* file_number */ 13);
+  edit.AddFile(/* level */ 2, /* file_number */ 13, /* path_id */ 0,
+               /* file_size */ 100, /* smallest */ GetInternalKey("13", 1300),
+               /* largest */ GetInternalKey("13", 1300),
+               /* smallest_seqno */ 1300,
+               /* largest_seqno */ 1300, /* marked_for_compaction */ false,
+               Temperature::kUnknown, kInvalidBlobFileNumber,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+
+  // Add one more SST file that references a blob file, then promptly
+  // delete it in a second version edit before the new version gets saved.
+  // This file should not show up as linked to the blob file in the new version.
+  edit.AddFile(/* level */ 1, /* file_number */ 23, /* path_id */ 0,
+               /* file_size */ 100, /* smallest */ GetInternalKey("23", 2300),
+               /* largest */ GetInternalKey("23", 2300),
+               /* smallest_seqno */ 2300,
+               /* largest_seqno */ 2300, /* marked_for_compaction */ false,
+               Temperature::kUnknown,
+               /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime,
+               kUnknownFileCreationTime, kUnknownFileChecksum,
+               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+  VersionEdit edit2;
+
+  edit2.DeleteFile(/* level */ 1, /* file_number */ 23);
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+                         version_set);
+
+  ASSERT_OK(builder.Apply(&edit));
+  ASSERT_OK(builder.Apply(&edit2));
+
+  constexpr bool force_consistency_checks = true;
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, &vstorage_,
+                                  force_consistency_checks);
+
+  ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  {
+    const auto& blob_files = new_vstorage.GetBlobFiles();
+    ASSERT_EQ(blob_files.size(), 5);
+
+    const std::vector<BlobFileMetaData::LinkedSsts> expected_linked_ssts{
+        {1, 21}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+
+    for (size_t i = 0; i < 5; ++i) {
+      const auto meta =
+          new_vstorage.GetBlobFileMetaData(/* blob_file_number */ i + 1);
+      ASSERT_NE(meta, nullptr);
+      ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]);
+    }
+
+    // Make sure that no new BlobFileMetaData got created for the blob file
+    // affected by the trivial move.
+    ASSERT_EQ(new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3),
+              meta3);
+  }
+
+  UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) {
+  Add(0, 1U, "150", "200", 100U);
+
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.DeleteFile(0, 1U);
+
+  EnvOptions env_options;
+  constexpr TableCache* table_cache = nullptr;
+  constexpr VersionSet* version_set = nullptr;
+
+  VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+                                 &vstorage_, version_set);
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr,
+                                  true /* force_consistency_checks */);
+  ASSERT_OK(version_builder.Apply(&version_edit));
+  ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+  UpdateVersionStorageInfo(&new_vstorage);
+
+  VersionBuilder version_builder2(env_options, &ioptions_, table_cache,
+                                  &new_vstorage, version_set);
+  VersionStorageInfo new_vstorage2(&icmp_, ucmp_, options_.num_levels,
+                                   kCompactionStyleLevel, nullptr,
+                                   true /* force_consistency_checks */);
+  ASSERT_NOK(version_builder2.Apply(&version_edit));
+
+  UnrefFilesInVersion(&new_vstorage);
+  UnrefFilesInVersion(&new_vstorage2);
+}
+
+TEST_F(VersionBuilderTest, EstimatedActiveKeys) {
+  const uint32_t kTotalSamples = 20;
+  const uint32_t kNumLevels = 5;
+  const uint32_t kFilesPerLevel = 8;
+  const uint32_t kNumFiles = kNumLevels * kFilesPerLevel;
+  const uint32_t kEntriesPerFile = 1000;
+  const uint32_t kDeletionsPerFile = 100;
+  for (uint32_t i = 0; i < kNumFiles; ++i) {
+    Add(static_cast<int>(i / kFilesPerLevel), i + 1,
+        std::to_string((i + 100) * 1000).c_str(),
+        std::to_string((i + 100) * 1000 + 999).c_str(), 100U, 0, 100, 100,
+        kEntriesPerFile, kDeletionsPerFile, (i < kTotalSamples));
+  }
+  // minus 2X for the number of deletion entries because:
+  // 1x for deletion entry does not count as a data entry.
+  // 1x for each deletion entry will actually remove one data entry.
+  ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(),
+            (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_edit.cc b/src/rocksdb/db/version_edit.cc
new file mode 100644
index 000000000..e4e02fe25
--- /dev/null
+++ b/src/rocksdb/db/version_edit.cc
@@ -0,0 +1,1043 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/blob/blob_index.h"
+#include "db/version_set.h"
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {}  // anonymous namespace
+
+uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
+  assert(number <= kFileNumberMask);
+  return number | (path_id * (kFileNumberMask + 1));
+}
+
+Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
+                                      SequenceNumber seqno,
+                                      ValueType value_type) {
+  if (value_type == kTypeBlobIndex) {
+    BlobIndex blob_index;
+    const Status s = blob_index.DecodeFrom(value);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (!blob_index.IsInlined() && !blob_index.HasTTL()) {
+      if (blob_index.file_number() == kInvalidBlobFileNumber) {
+        return Status::Corruption("Invalid blob file number");
+      }
+
+      if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+          oldest_blob_file_number > blob_index.file_number()) {
+        oldest_blob_file_number = blob_index.file_number();
+      }
+    }
+  }
+
+  if (smallest.size() == 0) {
+    smallest.DecodeFrom(key);
+  }
+  largest.DecodeFrom(key);
+  fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+  fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+
+  return Status::OK();
+}
+
+void VersionEdit::Clear() {
+  max_level_ = 0;
+  db_id_.clear();
+  comparator_.clear();
+  log_number_ = 0;
+  prev_log_number_ = 0;
+  next_file_number_ = 0;
+  max_column_family_ = 0;
+  min_log_number_to_keep_ = 0;
+  last_sequence_ = 0;
+  has_db_id_ = false;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_prev_log_number_ = false;
+  has_next_file_number_ = false;
+  has_max_column_family_ = false;
+  has_min_log_number_to_keep_ = false;
+  has_last_sequence_ = false;
+  compact_cursors_.clear();
+  deleted_files_.clear();
+  new_files_.clear();
+  blob_file_additions_.clear();
+  blob_file_garbages_.clear();
+  wal_additions_.clear();
+  wal_deletion_.Reset();
+  column_family_ = 0;
+  is_column_family_add_ = false;
+  is_column_family_drop_ = false;
+  column_family_name_.clear();
+  is_in_atomic_group_ = false;
+  remaining_entries_ = 0;
+  full_history_ts_low_.clear();
+}
+
+bool VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_db_id_) {
+    PutVarint32(dst, kDbId);
+    PutLengthPrefixedSlice(dst, db_id_);
+  }
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32Varint64(dst, kLogNumber, log_number_);
+  }
+  if (has_prev_log_number_) {
+    PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
+  }
+  if (has_max_column_family_) {
+    PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
+  }
+  if (has_min_log_number_to_keep_) {
+    PutVarint32Varint64(dst, kMinLogNumberToKeep, min_log_number_to_keep_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32Varint64(dst, kLastSequence, last_sequence_);
+  }
+  for (size_t i = 0; i < compact_cursors_.size(); i++) {
+    if (compact_cursors_[i].second.Valid()) {
+      PutVarint32(dst, kCompactCursor);
+      PutVarint32(dst, compact_cursors_[i].first);  // level
+      PutLengthPrefixedSlice(dst, compact_cursors_[i].second.Encode());
+    }
+  }
+  for (const auto& deleted : deleted_files_) {
+    PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
+                                deleted.second /* file number */);
+  }
+
+  bool min_log_num_written = false;
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    if (!f.smallest.Valid() || !f.largest.Valid()) {
+      return false;
+    }
+    PutVarint32(dst, kNewFile4);
+    PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
+    PutVarint64(dst, f.fd.GetFileSize());
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+    PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
+    // Customized fields' format:
+    // +-----------------------------+
+    // | 1st field's tag (varint32)  |
+    // +-----------------------------+
+    // | 1st field's size (varint32) |
+    // +-----------------------------+
+    // |    bytes for 1st field      |
+    // |  (based on size decoded)    |
+    // +-----------------------------+
+    // |                             |
+    // |          ......             |
+    // |                             |
+    // +-----------------------------+
+    // | last field's size (varint32)|
+    // +-----------------------------+
+    // |    bytes for last field     |
+    // |  (based on size decoded)    |
+    // +-----------------------------+
+    // | terminating tag (varint32)  |
+    // +-----------------------------+
+    //
+    // Customized encoding for fields:
+    //   tag kPathId: 1 byte as path_id
+    //   tag kNeedCompaction:
+    //        now only can take one char value 1 indicating need-compaction
+    //
+    PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
+    std::string varint_oldest_ancester_time;
+    PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
+    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
+                             &varint_oldest_ancester_time);
+    PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
+
+    PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
+    std::string varint_file_creation_time;
+    PutVarint64(&varint_file_creation_time, f.file_creation_time);
+    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
+                             &varint_file_creation_time);
+    PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+
+    PutVarint32(dst, NewFileCustomTag::kFileChecksum);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
+
+    PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
+
+    if (f.fd.GetPathId() != 0) {
+      PutVarint32(dst, NewFileCustomTag::kPathId);
+      char p = static_cast<char>(f.fd.GetPathId());
+      PutLengthPrefixedSlice(dst, Slice(&p, 1));
+    }
+    if (f.temperature != Temperature::kUnknown) {
+      PutVarint32(dst, NewFileCustomTag::kTemperature);
+      char p = static_cast<char>(f.temperature);
+      PutLengthPrefixedSlice(dst, Slice(&p, 1));
+    }
+    if (f.marked_for_compaction) {
+      PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
+      char p = static_cast<char>(1);
+      PutLengthPrefixedSlice(dst, Slice(&p, 1));
+    }
+    if (has_min_log_number_to_keep_ && !min_log_num_written) {
+      PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
+      std::string varint_log_number;
+      PutFixed64(&varint_log_number, min_log_number_to_keep_);
+      PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+      min_log_num_written = true;
+    }
+    if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+      PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
+      std::string oldest_blob_file_number;
+      PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
+      PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
+    }
+    UniqueId64x2 unique_id = f.unique_id;
+    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id);
+    if (unique_id != kNullUniqueId64x2) {
+      PutVarint32(dst, NewFileCustomTag::kUniqueId);
+      std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
+      PutLengthPrefixedSlice(dst, Slice(unique_id_str));
+    }
+
+    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
+                             dst);
+
+    PutVarint32(dst, NewFileCustomTag::kTerminate);
+  }
+
+  for (const auto& blob_file_addition : blob_file_additions_) {
+    PutVarint32(dst, kBlobFileAddition);
+    blob_file_addition.EncodeTo(dst);
+  }
+
+  for (const auto& blob_file_garbage : blob_file_garbages_) {
+    PutVarint32(dst, kBlobFileGarbage);
+    blob_file_garbage.EncodeTo(dst);
+  }
+
+  for (const auto& wal_addition : wal_additions_) {
+    PutVarint32(dst, kWalAddition2);
+    std::string encoded;
+    wal_addition.EncodeTo(&encoded);
+    PutLengthPrefixedSlice(dst, encoded);
+  }
+
+  if (!wal_deletion_.IsEmpty()) {
+    PutVarint32(dst, kWalDeletion2);
+    std::string encoded;
+    wal_deletion_.EncodeTo(&encoded);
+    PutLengthPrefixedSlice(dst, encoded);
+  }
+
+  // 0 is default and does not need to be explicitly written
+  if (column_family_ != 0) {
+    PutVarint32Varint32(dst, kColumnFamily, column_family_);
+  }
+
+  if (is_column_family_add_) {
+    PutVarint32(dst, kColumnFamilyAdd);
+    PutLengthPrefixedSlice(dst, Slice(column_family_name_));
+  }
+
+  if (is_column_family_drop_) {
+    PutVarint32(dst, kColumnFamilyDrop);
+  }
+
+  if (is_in_atomic_group_) {
+    PutVarint32(dst, kInAtomicGroup);
+    PutVarint32(dst, remaining_entries_);
+  }
+
+  if (HasFullHistoryTsLow()) {
+    PutVarint32(dst, kFullHistoryTsLow);
+    PutLengthPrefixedSlice(dst, full_history_ts_low_);
+  }
+  return true;
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice str;
+  if (GetLengthPrefixedSlice(input, &str)) {
+    dst->DecodeFrom(str);
+    return dst->Valid();
+  } else {
+    return false;
+  }
+}
+
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
+  uint32_t v = 0;
+  if (GetVarint32(input, &v)) {
+    *level = v;
+    if (max_level_ < *level) {
+      max_level_ = *level;
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+static bool is_pseudo_new_file_record_pr3488(
+  const int level,
+  const uint64_t number,
+  const uint64_t file_size,
+  InternalKey& smallest,
+  InternalKey& largest,
+  const bool has_min_log_number_to_keep_) {
+
+  if (level == 0 && number == 0 && file_size == 0 &&
+      has_min_log_number_to_keep_) {
+    InternalKey dummy_key(Slice("dummy_key"), 0ull, ValueType::kTypeValue);
+    return (*smallest.rep() == *dummy_key.rep() &&
+            *largest.rep() == *dummy_key.rep());
+  } else {
+    return false;
+  }
+}
+
+const char* VersionEdit::DecodeNewFile4From(Slice* input) {
+  const char* msg = nullptr;
+  int level = 0;
+  FileMetaData f;
+  uint64_t number = 0;
+  uint32_t path_id = 0;
+  uint64_t file_size = 0;
+  SequenceNumber smallest_seqno = 0;
+  SequenceNumber largest_seqno = kMaxSequenceNumber;
+  if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
+      GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
+      GetInternalKey(input, &f.largest) &&
+      GetVarint64(input, &smallest_seqno) &&
+      GetVarint64(input, &largest_seqno)) {
+    // See comments in VersionEdit::EncodeTo() for format of customized fields
+    while (true) {
+      uint32_t custom_tag = 0;
+      Slice field;
+      if (!GetVarint32(input, &custom_tag)) {
+        return "new-file4 custom field";
+      }
+      if (custom_tag == kTerminate) {
+        break;
+      }
+      if (!GetLengthPrefixedSlice(input, &field)) {
+        return "new-file4 custom field length prefixed slice error";
+      }
+      switch (custom_tag) {
+        case kPathId:
+          if (field.size() != 1) {
+            return "path_id field wrong size";
+          }
+          path_id = field[0];
+          if (path_id > 3) {
+            return "path_id wrong vaue";
+          }
+          break;
+        case kOldestAncesterTime:
+          if (!GetVarint64(&field, &f.oldest_ancester_time)) {
+            return "invalid oldest ancester time";
+          }
+          break;
+        case kFileCreationTime:
+          if (!GetVarint64(&field, &f.file_creation_time)) {
+            return "invalid file creation time";
+          }
+          break;
+        case kFileChecksum:
+          f.file_checksum = field.ToString();
+          break;
+        case kFileChecksumFuncName:
+          f.file_checksum_func_name = field.ToString();
+          break;
+        case kNeedCompaction:
+          if (field.size() != 1) {
+            return "need_compaction field wrong size";
+          }
+          f.marked_for_compaction = (field[0] == 1);
+          break;
+        case kMinLogNumberToKeepHack:
+          // This is a hack to encode kMinLogNumberToKeep in a
+          // forward-compatible fashion.
+          if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+            return "deleted log number malformatted";
+          }
+          has_min_log_number_to_keep_ = true;
+          break;
+        case kOldestBlobFileNumber:
+          if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
+            return "invalid oldest blob file number";
+          }
+          break;
+        case kTemperature:
+          if (field.size() != 1) {
+            return "temperature field wrong size";
+          } else {
+            Temperature casted_field = static_cast<Temperature>(field[0]);
+            if (casted_field <= Temperature::kCold) {
+              f.temperature = casted_field;
+            }
+          }
+          break;
+        case kUniqueId:
+          if (!DecodeUniqueIdBytes(field.ToString(), &f.unique_id).ok()) {
+            f.unique_id = kNullUniqueId64x2;
+            return "invalid unique id";
+          }
+          break;
+        default:
+          if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
+            // Should not proceed if cannot understand it
+            return "new-file4 custom field not supported";
+          }
+          break;
+      }
+    }
+  } else {
+    return "new-file4 entry";
+  }
+  if (is_pseudo_new_file_record_pr3488(level, number, file_size,
+                                       f.smallest, f.largest,
+                                       has_min_log_number_to_keep_)) {
+    // Since this has nothing to do with NewFile, return immediately.
+    return nullptr;
+  }
+  f.fd =
+      FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
+  new_files_.push_back(std::make_pair(level, f));
+  return nullptr;
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+#ifndef NDEBUG
+  bool ignore_ignorable_tags = false;
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags",
+                           &ignore_ignorable_tags);
+#endif
+  Slice input = src;
+  const char* msg = nullptr;
+  uint32_t tag = 0;
+
+  // Temporary storage for parsing
+  int level = 0;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+  while (msg == nullptr && GetVarint32(&input, &tag)) {
+#ifndef NDEBUG
+    if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) {
+      tag = kTagSafeIgnoreMask;
+    }
+#endif
+    switch (tag) {
+      case kDbId:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          db_id_ = str.ToString();
+          has_db_id_ = true;
+        } else {
+          msg = "db id";
+        }
+        break;
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kMaxColumnFamily:
+        if (GetVarint32(&input, &max_column_family_)) {
+          has_max_column_family_ = true;
+        } else {
+          msg = "max column family";
+        }
+        break;
+
+      case kMinLogNumberToKeep:
+        if (GetVarint64(&input, &min_log_number_to_keep_)) {
+          has_min_log_number_to_keep_ = true;
+        } else {
+          msg = "min log number to kee";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kCompactCursor:
+        if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) {
+          // Here we re-use the output format of compact pointer in LevelDB
+          // to persist compact_cursors_
+          compact_cursors_.push_back(std::make_pair(level, key));
+        } else {
+          if (!msg) {
+            msg = "compaction cursor";
+          }
+        }
+        break;
+
+      case kDeletedFile: {
+        uint64_t number = 0;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          if (!msg) {
+            msg = "deleted file";
+          }
+        }
+        break;
+      }
+
+      case kNewFile: {
+        uint64_t number = 0;
+        uint64_t file_size = 0;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint64(&input, &file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          f.fd = FileDescriptor(number, 0, file_size);
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file entry";
+          }
+        }
+        break;
+      }
+      case kNewFile2: {
+        uint64_t number = 0;
+        uint64_t file_size = 0;
+        SequenceNumber smallest_seqno = 0;
+        SequenceNumber largest_seqno = kMaxSequenceNumber;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint64(&input, &file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &smallest_seqno) &&
+            GetVarint64(&input, &largest_seqno)) {
+          f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
+                                largest_seqno);
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file2 entry";
+          }
+        }
+        break;
+      }
+
+      case kNewFile3: {
+        uint64_t number = 0;
+        uint32_t path_id = 0;
+        uint64_t file_size = 0;
+        SequenceNumber smallest_seqno = 0;
+        SequenceNumber largest_seqno = kMaxSequenceNumber;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+            GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &smallest_seqno) &&
+            GetVarint64(&input, &largest_seqno)) {
+          f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
+                                largest_seqno);
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file3 entry";
+          }
+        }
+        break;
+      }
+
+      case kNewFile4: {
+        msg = DecodeNewFile4From(&input);
+        break;
+      }
+
+      case kBlobFileAddition:
+      case kBlobFileAddition_DEPRECATED: {
+        BlobFileAddition blob_file_addition;
+        const Status s = blob_file_addition.DecodeFrom(&input);
+        if (!s.ok()) {
+          return s;
+        }
+
+        AddBlobFile(std::move(blob_file_addition));
+        break;
+      }
+
+      case kBlobFileGarbage:
+      case kBlobFileGarbage_DEPRECATED: {
+        BlobFileGarbage blob_file_garbage;
+        const Status s = blob_file_garbage.DecodeFrom(&input);
+        if (!s.ok()) {
+          return s;
+        }
+
+        AddBlobFileGarbage(std::move(blob_file_garbage));
+        break;
+      }
+
+      case kWalAddition: {
+        WalAddition wal_addition;
+        const Status s = wal_addition.DecodeFrom(&input);
+        if (!s.ok()) {
+          return s;
+        }
+
+        wal_additions_.emplace_back(std::move(wal_addition));
+        break;
+      }
+
+      case kWalAddition2: {
+        Slice encoded;
+        if (!GetLengthPrefixedSlice(&input, &encoded)) {
+          msg = "WalAddition not prefixed by length";
+          break;
+        }
+
+        WalAddition wal_addition;
+        const Status s = wal_addition.DecodeFrom(&encoded);
+        if (!s.ok()) {
+          return s;
+        }
+
+        wal_additions_.emplace_back(std::move(wal_addition));
+        break;
+      }
+
+      case kWalDeletion: {
+        WalDeletion wal_deletion;
+        const Status s = wal_deletion.DecodeFrom(&input);
+        if (!s.ok()) {
+          return s;
+        }
+
+        wal_deletion_ = std::move(wal_deletion);
+        break;
+      }
+
+      case kWalDeletion2: {
+        Slice encoded;
+        if (!GetLengthPrefixedSlice(&input, &encoded)) {
+          msg = "WalDeletion not prefixed by length";
+          break;
+        }
+
+        WalDeletion wal_deletion;
+        const Status s = wal_deletion.DecodeFrom(&encoded);
+        if (!s.ok()) {
+          return s;
+        }
+
+        wal_deletion_ = std::move(wal_deletion);
+        break;
+      }
+
+      case kColumnFamily:
+        if (!GetVarint32(&input, &column_family_)) {
+          if (!msg) {
+            msg = "set column family id";
+          }
+        }
+        break;
+
+      case kColumnFamilyAdd:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          is_column_family_add_ = true;
+          column_family_name_ = str.ToString();
+        } else {
+          if (!msg) {
+            msg = "column family add";
+          }
+        }
+        break;
+
+      case kColumnFamilyDrop:
+        is_column_family_drop_ = true;
+        break;
+
+      case kInAtomicGroup:
+        is_in_atomic_group_ = true;
+        if (!GetVarint32(&input, &remaining_entries_)) {
+          if (!msg) {
+            msg = "remaining entries";
+          }
+        }
+        break;
+
+      case kFullHistoryTsLow:
+        if (!GetLengthPrefixedSlice(&input, &str)) {
+          msg = "full_history_ts_low";
+        } else if (str.empty()) {
+          msg = "full_history_ts_low: empty";
+        } else {
+          full_history_ts_low_.assign(str.data(), str.size());
+        }
+        break;
+
+      default:
+        if (tag & kTagSafeIgnoreMask) {
+          // Tag from future which can be safely ignored.
+          // The next field must be the length of the entry.
+          uint32_t field_len;
+          if (!GetVarint32(&input, &field_len) ||
+              static_cast<size_t>(field_len) > input.size()) {
+            if (!msg) {
+              msg = "safely ignoreable tag length error";
+            }
+          } else {
+            input.remove_prefix(static_cast<size_t>(field_len));
+          }
+        } else {
+          msg = "unknown tag";
+        }
+        break;
+    }
+  }
+
+  if (msg == nullptr && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != nullptr) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_db_id_) {
+    r.append("\n  DB ID: ");
+    r.append(db_id_);
+  }
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_prev_log_number_) {
+    r.append("\n  PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFileNumber: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_max_column_family_) {
+    r.append("\n  MaxColumnFamily: ");
+    AppendNumberTo(&r, max_column_family_);
+  }
+  if (has_min_log_number_to_keep_) {
+    r.append("\n  MinLogNumberToKeep: ");
+    AppendNumberTo(&r, min_log_number_to_keep_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (const auto& level_and_compact_cursor : compact_cursors_) {
+    r.append("\n  CompactCursor: ");
+    AppendNumberTo(&r, level_and_compact_cursor.first);
+    r.append(" ");
+    r.append(level_and_compact_cursor.second.DebugString(hex_key));
+  }
+  for (const auto& deleted_file : deleted_files_) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, deleted_file.first);
+    r.append(" ");
+    AppendNumberTo(&r, deleted_file.second);
+  }
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.fd.GetNumber());
+    r.append(" ");
+    AppendNumberTo(&r, f.fd.GetFileSize());
+    r.append(" ");
+    r.append(f.smallest.DebugString(hex_key));
+    r.append(" .. ");
+    r.append(f.largest.DebugString(hex_key));
+    if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+      r.append(" blob_file:");
+      AppendNumberTo(&r, f.oldest_blob_file_number);
+    }
+    r.append(" oldest_ancester_time:");
+    AppendNumberTo(&r, f.oldest_ancester_time);
+    r.append(" file_creation_time:");
+    AppendNumberTo(&r, f.file_creation_time);
+    r.append(" file_checksum:");
+    r.append(Slice(f.file_checksum).ToString(true));
+    r.append(" file_checksum_func_name: ");
+    r.append(f.file_checksum_func_name);
+    if (f.temperature != Temperature::kUnknown) {
+      r.append(" temperature: ");
+      // Maybe change to human readable format whenthe feature becomes
+      // permanent
+      r.append(std::to_string(static_cast<int>(f.temperature)));
+    }
+    if (f.unique_id != kNullUniqueId64x2) {
+      r.append(" unique_id(internal): ");
+      UniqueId64x2 id = f.unique_id;
+      r.append(InternalUniqueIdToHumanString(&id));
+      r.append(" public_unique_id: ");
+      InternalUniqueIdToExternal(&id);
+      r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id)));
+    }
+  }
+
+  for (const auto& blob_file_addition : blob_file_additions_) {
+    r.append("\n  BlobFileAddition: ");
+    r.append(blob_file_addition.DebugString());
+  }
+
+  for (const auto& blob_file_garbage : blob_file_garbages_) {
+    r.append("\n  BlobFileGarbage: ");
+    r.append(blob_file_garbage.DebugString());
+  }
+
+  for (const auto& wal_addition : wal_additions_) {
+    r.append("\n  WalAddition: ");
+    r.append(wal_addition.DebugString());
+  }
+
+  if (!wal_deletion_.IsEmpty()) {
+    r.append("\n  WalDeletion: ");
+    r.append(wal_deletion_.DebugString());
+  }
+
+  r.append("\n  ColumnFamily: ");
+  AppendNumberTo(&r, column_family_);
+  if (is_column_family_add_) {
+    r.append("\n  ColumnFamilyAdd: ");
+    r.append(column_family_name_);
+  }
+  if (is_column_family_drop_) {
+    r.append("\n  ColumnFamilyDrop");
+  }
+  if (is_in_atomic_group_) {
+    r.append("\n  AtomicGroup: ");
+    AppendNumberTo(&r, remaining_entries_);
+    r.append(" entries remains");
+  }
+  if (HasFullHistoryTsLow()) {
+    r.append("\n FullHistoryTsLow: ");
+    r.append(Slice(full_history_ts_low_).ToString(hex_key));
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
+  JSONWriter jw;
+  jw << "EditNumber" << edit_num;
+
+  if (has_db_id_) {
+    jw << "DB ID" << db_id_;
+  }
+  if (has_comparator_) {
+    jw << "Comparator" << comparator_;
+  }
+  if (has_log_number_) {
+    jw << "LogNumber" << log_number_;
+  }
+  if (has_prev_log_number_) {
+    jw << "PrevLogNumber" << prev_log_number_;
+  }
+  if (has_next_file_number_) {
+    jw << "NextFileNumber" << next_file_number_;
+  }
+  if (has_max_column_family_) {
+    jw << "MaxColumnFamily" << max_column_family_;
+  }
+  if (has_min_log_number_to_keep_) {
+    jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
+  }
+  if (has_last_sequence_) {
+    jw << "LastSeq" << last_sequence_;
+  }
+
+  if (!deleted_files_.empty()) {
+    jw << "DeletedFiles";
+    jw.StartArray();
+
+    for (const auto& deleted_file : deleted_files_) {
+      jw.StartArrayedObject();
+      jw << "Level" << deleted_file.first;
+      jw << "FileNumber" << deleted_file.second;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!new_files_.empty()) {
+    jw << "AddedFiles";
+    jw.StartArray();
+
+    for (size_t i = 0; i < new_files_.size(); i++) {
+      jw.StartArrayedObject();
+      jw << "Level" << new_files_[i].first;
+      const FileMetaData& f = new_files_[i].second;
+      jw << "FileNumber" << f.fd.GetNumber();
+      jw << "FileSize" << f.fd.GetFileSize();
+      jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
+      jw << "LargestIKey" << f.largest.DebugString(hex_key);
+      jw << "OldestAncesterTime" << f.oldest_ancester_time;
+      jw << "FileCreationTime" << f.file_creation_time;
+      jw << "FileChecksum" << Slice(f.file_checksum).ToString(true);
+      jw << "FileChecksumFuncName" << f.file_checksum_func_name;
+      if (f.temperature != Temperature::kUnknown) {
+        jw << "temperature" << std::to_string(static_cast<int>(f.temperature));
+      }
+      if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+        jw << "OldestBlobFile" << f.oldest_blob_file_number;
+      }
+      if (f.temperature != Temperature::kUnknown) {
+        // Maybe change to human readable format whenthe feature becomes
+        // permanent
+        jw << "Temperature" << static_cast<int>(f.temperature);
+      }
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!blob_file_additions_.empty()) {
+    jw << "BlobFileAdditions";
+
+    jw.StartArray();
+
+    for (const auto& blob_file_addition : blob_file_additions_) {
+      jw.StartArrayedObject();
+      jw << blob_file_addition;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!blob_file_garbages_.empty()) {
+    jw << "BlobFileGarbages";
+
+    jw.StartArray();
+
+    for (const auto& blob_file_garbage : blob_file_garbages_) {
+      jw.StartArrayedObject();
+      jw << blob_file_garbage;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!wal_additions_.empty()) {
+    jw << "WalAdditions";
+
+    jw.StartArray();
+
+    for (const auto& wal_addition : wal_additions_) {
+      jw.StartArrayedObject();
+      jw << wal_addition;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!wal_deletion_.IsEmpty()) {
+    jw << "WalDeletion";
+    jw.StartObject();
+    jw << wal_deletion_;
+    jw.EndObject();
+  }
+
+  jw << "ColumnFamily" << column_family_;
+
+  if (is_column_family_add_) {
+    jw << "ColumnFamilyAdd" << column_family_name_;
+  }
+  if (is_column_family_drop_) {
+    jw << "ColumnFamilyDrop" << column_family_name_;
+  }
+  if (is_in_atomic_group_) {
+    jw << "AtomicGroup" << remaining_entries_;
+  }
+
+  if (HasFullHistoryTsLow()) {
+    jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key);
+  }
+
+  jw.EndObject();
+
+  return jw.Get();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h
new file mode 100644
index 000000000..c9800a3c0
--- /dev/null
+++ b/src/rocksdb/db/version_edit.h
@@ -0,0 +1,669 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_garbage.h"
+#include "db/dbformat.h"
+#include "db/wal_edit.h"
+#include "memory/arena.h"
+#include "port/malloc.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/cache.h"
+#include "table/table_reader.h"
+#include "table/unique_id_impl.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed. The number should be forward compatible so
+// users can down-grade RocksDB safely. A future Tag is ignored by doing '&'
+// between Tag and kTagSafeIgnoreMask field.
+enum Tag : uint32_t {
+  kComparator = 1,
+  kLogNumber = 2,
+  kNextFileNumber = 3,
+  kLastSequence = 4,
+  kCompactCursor = 5,
+  kDeletedFile = 6,
+  kNewFile = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber = 9,
+  kMinLogNumberToKeep = 10,
+
+  // these are new formats divergent from open source leveldb
+  kNewFile2 = 100,
+  kNewFile3 = 102,
+  kNewFile4 = 103,      // 4th (the latest) format version of adding files
+  kColumnFamily = 200,  // specify column family for version edit
+  kColumnFamilyAdd = 201,
+  kColumnFamilyDrop = 202,
+  kMaxColumnFamily = 203,
+
+  kInAtomicGroup = 300,
+
+  kBlobFileAddition = 400,
+  kBlobFileGarbage,
+
+  // Mask for an unidentified tag from the future which can be safely ignored.
+  kTagSafeIgnoreMask = 1 << 13,
+
+  // Forward compatible (aka ignorable) records
+  kDbId,
+  kBlobFileAddition_DEPRECATED,
+  kBlobFileGarbage_DEPRECATED,
+  kWalAddition,
+  kWalDeletion,
+  kFullHistoryTsLow,
+  kWalAddition2,
+  kWalDeletion2,
+};
+
+enum NewFileCustomTag : uint32_t {
+  kTerminate = 1,  // The end of customized fields
+  kNeedCompaction = 2,
+  // Since Manifest is not entirely forward-compatible, we currently encode
+  // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed
+  // when manifest becomes forward-compatible.
+  kMinLogNumberToKeepHack = 3,
+  kOldestBlobFileNumber = 4,
+  kOldestAncesterTime = 5,
+  kFileCreationTime = 6,
+  kFileChecksum = 7,
+  kFileChecksumFuncName = 8,
+  kTemperature = 9,
+  kMinTimestamp = 10,
+  kMaxTimestamp = 11,
+  kUniqueId = 12,
+
+  // If this bit for the custom tag is set, opening DB should fail if
+  // we don't know this field.
+  kCustomTagNonSafeIgnoreMask = 1 << 6,
+
+  // Forward incompatible (aka unignorable) fields
+  kPathId,
+};
+
+class VersionSet;
+
+constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
+constexpr uint64_t kUnknownOldestAncesterTime = 0;
+constexpr uint64_t kUnknownFileCreationTime = 0;
+
+extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
+
+// A copyable structure contains information needed to read data from an SST
+// file. It can contain a pointer to a table reader opened for the file, or
+// file number and size, which can be used to create a new table reader for it.
+// The behavior is undefined when a copied of the structure is used when the
+// file is not in any live version any more.
+struct FileDescriptor {
+  // Table reader in table_reader_handle
+  TableReader* table_reader;
+  uint64_t packed_number_and_path_id;
+  uint64_t file_size;             // File size in bytes
+  SequenceNumber smallest_seqno;  // The smallest seqno in this file
+  SequenceNumber largest_seqno;   // The largest seqno in this file
+
+  FileDescriptor() : FileDescriptor(0, 0, 0) {}
+
+  FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
+      : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
+
+  FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
+                 SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
+      : table_reader(nullptr),
+        packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
+        file_size(_file_size),
+        smallest_seqno(_smallest_seqno),
+        largest_seqno(_largest_seqno) {}
+
+  FileDescriptor(const FileDescriptor& fd) { *this = fd; }
+
+  FileDescriptor& operator=(const FileDescriptor& fd) {
+    table_reader = fd.table_reader;
+    packed_number_and_path_id = fd.packed_number_and_path_id;
+    file_size = fd.file_size;
+    smallest_seqno = fd.smallest_seqno;
+    largest_seqno = fd.largest_seqno;
+    return *this;
+  }
+
+  uint64_t GetNumber() const {
+    return packed_number_and_path_id & kFileNumberMask;
+  }
+  uint32_t GetPathId() const {
+    return static_cast<uint32_t>(packed_number_and_path_id /
+                                 (kFileNumberMask + 1));
+  }
+  uint64_t GetFileSize() const { return file_size; }
+};
+
+struct FileSampledStats {
+  FileSampledStats() : num_reads_sampled(0) {}
+  FileSampledStats(const FileSampledStats& other) { *this = other; }
+  FileSampledStats& operator=(const FileSampledStats& other) {
+    num_reads_sampled = other.num_reads_sampled.load();
+    return *this;
+  }
+
+  // number of user reads to this file.
+  mutable std::atomic<uint64_t> num_reads_sampled;
+};
+
+struct FileMetaData {
+  FileDescriptor fd;
+  InternalKey smallest;  // Smallest internal key served by table
+  InternalKey largest;   // Largest internal key served by table
+
+  // Needs to be disposed when refs becomes 0.
+  Cache::Handle* table_reader_handle = nullptr;
+
+  FileSampledStats stats;
+
+  // Stats for compensating deletion entries during compaction
+
+  // File size compensated by deletion entry.
+  // This is updated in Version::UpdateAccumulatedStats() first time when the
+  // file is created or loaded.  After it is updated (!= 0), it is immutable.
+  uint64_t compensated_file_size = 0;
+  // These values can mutate, but they can only be read or written from
+  // single-threaded LogAndApply thread
+  uint64_t num_entries = 0;     // the number of entries.
+  uint64_t num_deletions = 0;   // the number of deletion entries.
+  uint64_t raw_key_size = 0;    // total uncompressed key size.
+  uint64_t raw_value_size = 0;  // total uncompressed value size.
+
+  int refs = 0;  // Reference count
+
+  bool being_compacted = false;       // Is this file undergoing compaction?
+  bool init_stats_from_file = false;  // true if the data-entry stats of this
+                                      // file has initialized from file.
+
+  bool marked_for_compaction = false;  // True if client asked us nicely to
+                                       // compact this file.
+  Temperature temperature = Temperature::kUnknown;
+
+  // Used only in BlobDB. The file number of the oldest blob file this SST file
+  // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
+  uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+
+  // The file could be the compaction output from other SST files, which could
+  // in turn be outputs for compact older SST files. We track the memtable
+  // flush timestamp for the oldest SST file that eventually contribute data
+  // to this file. 0 means the information is not available.
+  uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+
+  // Unix time when the SST file is created.
+  uint64_t file_creation_time = kUnknownFileCreationTime;
+
+  // File checksum
+  std::string file_checksum = kUnknownFileChecksum;
+
+  // File checksum function name
+  std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+  // SST unique id
+  UniqueId64x2 unique_id{};
+
+  FileMetaData() = default;
+
+  FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
+               const InternalKey& smallest_key, const InternalKey& largest_key,
+               const SequenceNumber& smallest_seq,
+               const SequenceNumber& largest_seq, bool marked_for_compact,
+               Temperature _temperature, uint64_t oldest_blob_file,
+               uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+               const std::string& _file_checksum,
+               const std::string& _file_checksum_func_name,
+               UniqueId64x2 _unique_id)
+      : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
+        smallest(smallest_key),
+        largest(largest_key),
+        marked_for_compaction(marked_for_compact),
+        temperature(_temperature),
+        oldest_blob_file_number(oldest_blob_file),
+        oldest_ancester_time(_oldest_ancester_time),
+        file_creation_time(_file_creation_time),
+        file_checksum(_file_checksum),
+        file_checksum_func_name(_file_checksum_func_name),
+        unique_id(std::move(_unique_id)) {
+    TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
+  }
+
+  // REQUIRED: Keys must be given to the function in sorted order (it expects
+  // the last key to be the largest).
+  Status UpdateBoundaries(const Slice& key, const Slice& value,
+                          SequenceNumber seqno, ValueType value_type);
+
+  // Unlike UpdateBoundaries, ranges do not need to be presented in any
+  // particular order.
+  void UpdateBoundariesForRange(const InternalKey& start,
+                                const InternalKey& end, SequenceNumber seqno,
+                                const InternalKeyComparator& icmp) {
+    if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
+      smallest = start;
+    }
+    if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
+      largest = end;
+    }
+    fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+    fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+  }
+
+  // Try to get oldest ancester time from the class itself or table properties
+  // if table reader is already pinned.
+  // 0 means the information is not available.
+  uint64_t TryGetOldestAncesterTime() {
+    if (oldest_ancester_time != kUnknownOldestAncesterTime) {
+      return oldest_ancester_time;
+    } else if (fd.table_reader != nullptr &&
+               fd.table_reader->GetTableProperties() != nullptr) {
+      return fd.table_reader->GetTableProperties()->creation_time;
+    }
+    return kUnknownOldestAncesterTime;
+  }
+
+  uint64_t TryGetFileCreationTime() {
+    if (file_creation_time != kUnknownFileCreationTime) {
+      return file_creation_time;
+    } else if (fd.table_reader != nullptr &&
+               fd.table_reader->GetTableProperties() != nullptr) {
+      return fd.table_reader->GetTableProperties()->file_creation_time;
+    }
+    return kUnknownFileCreationTime;
+  }
+
+  // WARNING: manual update to this function is needed
+  // whenever a new string property is added to FileMetaData
+  // to reduce approximation error.
+  //
+  // TODO: eliminate the need of manually updating this function
+  // for new string properties
+  size_t ApproximateMemoryUsage() const {
+    size_t usage = 0;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size(const_cast<FileMetaData*>(this));
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    usage += smallest.size() + largest.size() + file_checksum.size() +
+             file_checksum_func_name.size();
+    return usage;
+  }
+};
+
+// A compressed copy of file meta data that just contain minimum data needed
+// to serve read operations, while still keeping the pointer to full metadata
+// of the file in case it is needed.
+struct FdWithKeyRange {
+  FileDescriptor fd;
+  FileMetaData* file_metadata;  // Point to all metadata
+  Slice smallest_key;           // slice that contain smallest key
+  Slice largest_key;            // slice that contain largest key
+
+  FdWithKeyRange()
+      : fd(), file_metadata(nullptr), smallest_key(), largest_key() {}
+
+  FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
+                 FileMetaData* _file_metadata)
+      : fd(_fd),
+        file_metadata(_file_metadata),
+        smallest_key(_smallest_key),
+        largest_key(_largest_key) {}
+};
+
+// Data structure to store an array of FdWithKeyRange in one level
+// Actual data is guaranteed to be stored closely
+struct LevelFilesBrief {
+  size_t num_files;
+  FdWithKeyRange* files;
+  LevelFilesBrief() {
+    num_files = 0;
+    files = nullptr;
+  }
+};
+
+// The state of a DB at any given time is referred to as a Version.
+// Any modification to the Version is considered a Version Edit. A Version is
+// constructed by joining a sequence of Version Edits. Version Edits are written
+// to the MANIFEST file.
+class VersionEdit {
+ public:
+  void Clear();
+
+  void SetDBId(const std::string& db_id) {
+    has_db_id_ = true;
+    db_id_ = db_id;
+  }
+  bool HasDbId() const { return has_db_id_; }
+  const std::string& GetDbId() const { return db_id_; }
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  bool HasComparatorName() const { return has_comparator_; }
+  const std::string& GetComparatorName() const { return comparator_; }
+
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  bool HasLogNumber() const { return has_log_number_; }
+  uint64_t GetLogNumber() const { return log_number_; }
+
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
+  bool HasPrevLogNumber() const { return has_prev_log_number_; }
+  uint64_t GetPrevLogNumber() const { return prev_log_number_; }
+
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  bool HasNextFile() const { return has_next_file_number_; }
+  uint64_t GetNextFile() const { return next_file_number_; }
+
+  void SetMaxColumnFamily(uint32_t max_column_family) {
+    has_max_column_family_ = true;
+    max_column_family_ = max_column_family;
+  }
+  bool HasMaxColumnFamily() const { return has_max_column_family_; }
+  uint32_t GetMaxColumnFamily() const { return max_column_family_; }
+
+  void SetMinLogNumberToKeep(uint64_t num) {
+    has_min_log_number_to_keep_ = true;
+    min_log_number_to_keep_ = num;
+  }
+  bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
+  uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
+
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  bool HasLastSequence() const { return has_last_sequence_; }
+  SequenceNumber GetLastSequence() const { return last_sequence_; }
+
+  // Delete the specified table file from the specified level.
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.emplace(level, file);
+  }
+
+  // Retrieve the table files deleted as well as their associated levels.
+  using DeletedFiles = std::set<std::pair<int, uint64_t>>;
+  const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
+
+  // Add the specified table file at the specified level.
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
+  // referred to by this file if any, kInvalidBlobFileNumber otherwise.
+  void AddFile(int level, uint64_t file, uint32_t file_path_id,
+               uint64_t file_size, const InternalKey& smallest,
+               const InternalKey& largest, const SequenceNumber& smallest_seqno,
+               const SequenceNumber& largest_seqno, bool marked_for_compaction,
+               Temperature temperature, uint64_t oldest_blob_file_number,
+               uint64_t oldest_ancester_time, uint64_t file_creation_time,
+               const std::string& file_checksum,
+               const std::string& file_checksum_func_name,
+               const UniqueId64x2& unique_id) {
+    assert(smallest_seqno <= largest_seqno);
+    new_files_.emplace_back(
+        level,
+        FileMetaData(file, file_path_id, file_size, smallest, largest,
+                     smallest_seqno, largest_seqno, marked_for_compaction,
+                     temperature, oldest_blob_file_number, oldest_ancester_time,
+                     file_creation_time, file_checksum, file_checksum_func_name,
+                     unique_id));
+    if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
+      SetLastSequence(largest_seqno);
+    }
+  }
+
+  void AddFile(int level, const FileMetaData& f) {
+    assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
+    new_files_.emplace_back(level, f);
+    if (!HasLastSequence() || f.fd.largest_seqno > GetLastSequence()) {
+      SetLastSequence(f.fd.largest_seqno);
+    }
+  }
+
+  // Retrieve the table files added as well as their associated levels.
+  using NewFiles = std::vector<std::pair<int, FileMetaData>>;
+  const NewFiles& GetNewFiles() const { return new_files_; }
+
+  // Retrieve all the compact cursors
+  using CompactCursors = std::vector<std::pair<int, InternalKey>>;
+  const CompactCursors& GetCompactCursors() const { return compact_cursors_; }
+  void AddCompactCursor(int level, const InternalKey& cursor) {
+    compact_cursors_.push_back(std::make_pair(level, cursor));
+  }
+  void SetCompactCursors(
+      const std::vector<InternalKey>& compact_cursors_by_level) {
+    compact_cursors_.clear();
+    compact_cursors_.reserve(compact_cursors_by_level.size());
+    for (int i = 0; i < (int)compact_cursors_by_level.size(); i++) {
+      if (compact_cursors_by_level[i].Valid()) {
+        compact_cursors_.push_back(
+            std::make_pair(i, compact_cursors_by_level[i]));
+      }
+    }
+  }
+
+  // Add a new blob file.
+  void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count,
+                   uint64_t total_blob_bytes, std::string checksum_method,
+                   std::string checksum_value) {
+    blob_file_additions_.emplace_back(
+        blob_file_number, total_blob_count, total_blob_bytes,
+        std::move(checksum_method), std::move(checksum_value));
+  }
+
+  void AddBlobFile(BlobFileAddition blob_file_addition) {
+    blob_file_additions_.emplace_back(std::move(blob_file_addition));
+  }
+
+  // Retrieve all the blob files added.
+  using BlobFileAdditions = std::vector<BlobFileAddition>;
+  const BlobFileAdditions& GetBlobFileAdditions() const {
+    return blob_file_additions_;
+  }
+
+  void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) {
+    assert(blob_file_additions_.empty());
+    blob_file_additions_ = std::move(blob_file_additions);
+  }
+
+  // Add garbage for an existing blob file.  Note: intentionally broken English
+  // follows.
+  void AddBlobFileGarbage(uint64_t blob_file_number,
+                          uint64_t garbage_blob_count,
+                          uint64_t garbage_blob_bytes) {
+    blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count,
+                                     garbage_blob_bytes);
+  }
+
+  void AddBlobFileGarbage(BlobFileGarbage blob_file_garbage) {
+    blob_file_garbages_.emplace_back(std::move(blob_file_garbage));
+  }
+
+  // Retrieve all the blob file garbage added.
+  using BlobFileGarbages = std::vector<BlobFileGarbage>;
+  const BlobFileGarbages& GetBlobFileGarbages() const {
+    return blob_file_garbages_;
+  }
+
+  void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) {
+    assert(blob_file_garbages_.empty());
+    blob_file_garbages_ = std::move(blob_file_garbages);
+  }
+
+  // Add a WAL (either just created or closed).
+  // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+  void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) {
+    assert(NumEntries() == wal_additions_.size());
+    wal_additions_.emplace_back(number, std::move(metadata));
+  }
+
+  // Retrieve all the added WALs.
+  const WalAdditions& GetWalAdditions() const { return wal_additions_; }
+
+  bool IsWalAddition() const { return !wal_additions_.empty(); }
+
+  // Delete a WAL (either directly deleted or archived).
+  // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+  void DeleteWalsBefore(WalNumber number) {
+    assert((NumEntries() == 1) == !wal_deletion_.IsEmpty());
+    wal_deletion_ = WalDeletion(number);
+  }
+
+  const WalDeletion& GetWalDeletion() const { return wal_deletion_; }
+
+  bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); }
+
+  bool IsWalManipulation() const {
+    size_t entries = NumEntries();
+    return (entries > 0) && ((entries == wal_additions_.size()) ||
+                             (entries == !wal_deletion_.IsEmpty()));
+  }
+
+  // Number of edits
+  size_t NumEntries() const {
+    return new_files_.size() + deleted_files_.size() +
+           blob_file_additions_.size() + blob_file_garbages_.size() +
+           wal_additions_.size() + !wal_deletion_.IsEmpty();
+  }
+
+  void SetColumnFamily(uint32_t column_family_id) {
+    column_family_ = column_family_id;
+  }
+  uint32_t GetColumnFamily() const { return column_family_; }
+
+  // set column family ID by calling SetColumnFamily()
+  void AddColumnFamily(const std::string& name) {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_add_ = true;
+    column_family_name_ = name;
+  }
+
+  // set column family ID by calling SetColumnFamily()
+  void DropColumnFamily() {
+    assert(!is_column_family_drop_);
+    assert(!is_column_family_add_);
+    assert(NumEntries() == 0);
+    is_column_family_drop_ = true;
+  }
+
+  bool IsColumnFamilyManipulation() const {
+    return is_column_family_add_ || is_column_family_drop_;
+  }
+
+  bool IsColumnFamilyAdd() const { return is_column_family_add_; }
+
+  bool IsColumnFamilyDrop() const { return is_column_family_drop_; }
+
+  void MarkAtomicGroup(uint32_t remaining_entries) {
+    is_in_atomic_group_ = true;
+    remaining_entries_ = remaining_entries;
+  }
+  bool IsInAtomicGroup() const { return is_in_atomic_group_; }
+  uint32_t GetRemainingEntries() const { return remaining_entries_; }
+
+  bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); }
+  const std::string& GetFullHistoryTsLow() const {
+    assert(HasFullHistoryTsLow());
+    return full_history_ts_low_;
+  }
+  void SetFullHistoryTsLow(std::string full_history_ts_low) {
+    assert(!full_history_ts_low.empty());
+    full_history_ts_low_ = std::move(full_history_ts_low);
+  }
+
+  // return true on success.
+  bool EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString(bool hex_key = false) const;
+  std::string DebugJSON(int edit_num, bool hex_key = false) const;
+
+ private:
+  friend class ReactiveVersionSet;
+  friend class VersionEditHandlerBase;
+  friend class ListColumnFamiliesHandler;
+  friend class VersionEditHandler;
+  friend class VersionEditHandlerPointInTime;
+  friend class DumpManifestHandler;
+  friend class VersionSet;
+  friend class Version;
+  friend class AtomicGroupReadBuffer;
+
+  bool GetLevel(Slice* input, int* level, const char** msg);
+
+  const char* DecodeNewFile4From(Slice* input);
+
+  int max_level_ = 0;
+  std::string db_id_;
+  std::string comparator_;
+  uint64_t log_number_ = 0;
+  uint64_t prev_log_number_ = 0;
+  uint64_t next_file_number_ = 0;
+  uint32_t max_column_family_ = 0;
+  // The most recent WAL log number that is deleted
+  uint64_t min_log_number_to_keep_ = 0;
+  SequenceNumber last_sequence_ = 0;
+  bool has_db_id_ = false;
+  bool has_comparator_ = false;
+  bool has_log_number_ = false;
+  bool has_prev_log_number_ = false;
+  bool has_next_file_number_ = false;
+  bool has_max_column_family_ = false;
+  bool has_min_log_number_to_keep_ = false;
+  bool has_last_sequence_ = false;
+
+  // Compaction cursors for round-robin compaction policy
+  CompactCursors compact_cursors_;
+
+  DeletedFiles deleted_files_;
+  NewFiles new_files_;
+
+  BlobFileAdditions blob_file_additions_;
+  BlobFileGarbages blob_file_garbages_;
+
+  WalAdditions wal_additions_;
+  WalDeletion wal_deletion_;
+
+  // Each version edit record should have column_family_ set
+  // If it's not set, it is default (0)
+  uint32_t column_family_ = 0;
+  // a version edit can be either column_family add or
+  // column_family drop. If it's column family add,
+  // it also includes column family name.
+  bool is_column_family_drop_ = false;
+  bool is_column_family_add_ = false;
+  std::string column_family_name_;
+
+  bool is_in_atomic_group_ = false;
+  uint32_t remaining_entries_ = 0;
+
+  std::string full_history_ts_low_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit_handler.cc b/src/rocksdb/db/version_edit_handler.cc
new file mode 100644
index 000000000..145e78789
--- /dev/null
+++ b/src/rocksdb/db/version_edit_handler.cc
@@ -0,0 +1,1002 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit_handler.h"
+
+#include <cinttypes>
+#include <sstream>
+
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_source.h"
+#include "logging/logging.h"
+#include "monitoring/persistent_stats_history.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void VersionEditHandlerBase::Iterate(log::Reader& reader,
+                                     Status* log_read_status) {
+  Slice record;
+  std::string scratch;
+  assert(log_read_status);
+  assert(log_read_status->ok());
+
+  size_t recovered_edits = 0;
+  Status s = Initialize();
+  while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() &&
+         reader.ReadRecord(&record, &scratch) && log_read_status->ok()) {
+    VersionEdit edit;
+    s = edit.DecodeFrom(record);
+    if (!s.ok()) {
+      break;
+    }
+
+    s = read_buffer_.AddEdit(&edit);
+    if (!s.ok()) {
+      break;
+    }
+    ColumnFamilyData* cfd = nullptr;
+    if (edit.is_in_atomic_group_) {
+      if (read_buffer_.IsFull()) {
+        for (auto& e : read_buffer_.replay_buffer()) {
+          s = ApplyVersionEdit(e, &cfd);
+          if (!s.ok()) {
+            break;
+          }
+          ++recovered_edits;
+        }
+        if (!s.ok()) {
+          break;
+        }
+        read_buffer_.Clear();
+      }
+    } else {
+      s = ApplyVersionEdit(edit, &cfd);
+      if (s.ok()) {
+        ++recovered_edits;
+      }
+    }
+  }
+  if (!log_read_status->ok()) {
+    s = *log_read_status;
+  }
+
+  CheckIterationResult(reader, &s);
+
+  if (!s.ok()) {
+    if (s.IsCorruption()) {
+      // when we find a Corruption error, something is
+      // wrong with the underlying file. in this case we
+      // want to report the filename, so in here we append
+      // the filename to the Corruption message
+      assert(reader.file());
+
+      // build a new error message
+      std::stringstream message;
+      // append previous dynamic state message
+      const char* state = s.getState();
+      if (state != nullptr) {
+        message << state;
+        message << ' ';
+      }
+      // append the filename to the corruption message
+      message << "in file " << reader.file()->file_name();
+      // overwrite the status with the extended status
+      s = Status(s.code(), s.subcode(), s.severity(), message.str());
+    }
+    status_ = s;
+  }
+  TEST_SYNC_POINT_CALLBACK("VersionEditHandlerBase::Iterate:Finish",
+                           &recovered_edits);
+}
+
+Status ListColumnFamiliesHandler::ApplyVersionEdit(
+    VersionEdit& edit, ColumnFamilyData** /*unused*/) {
+  Status s;
+  if (edit.is_column_family_add_) {
+    if (column_family_names_.find(edit.column_family_) !=
+        column_family_names_.end()) {
+      s = Status::Corruption("Manifest adding the same column family twice");
+    } else {
+      column_family_names_.insert(
+          {edit.column_family_, edit.column_family_name_});
+    }
+  } else if (edit.is_column_family_drop_) {
+    if (column_family_names_.find(edit.column_family_) ==
+        column_family_names_.end()) {
+      s = Status::Corruption("Manifest - dropping non-existing column family");
+    } else {
+      column_family_names_.erase(edit.column_family_);
+    }
+  }
+  return s;
+}
+
+Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,
+                                               ColumnFamilyData** /*unused*/) {
+  for (const auto& deleted_file : edit.GetDeletedFiles()) {
+    Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  for (const auto& new_file : edit.GetNewFiles()) {
+    Status s = file_checksum_list_.InsertOneFileChecksum(
+        new_file.second.fd.GetNumber(), new_file.second.file_checksum,
+        new_file.second.file_checksum_func_name);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  for (const auto& new_blob_file : edit.GetBlobFileAdditions()) {
+    std::string checksum_value = new_blob_file.GetChecksumValue();
+    std::string checksum_method = new_blob_file.GetChecksumMethod();
+    assert(checksum_value.empty() == checksum_method.empty());
+    if (checksum_method.empty()) {
+      checksum_value = kUnknownFileChecksum;
+      checksum_method = kUnknownFileChecksumFuncName;
+    }
+    Status s = file_checksum_list_.InsertOneFileChecksum(
+        new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+VersionEditHandler::VersionEditHandler(
+    bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+    VersionSet* version_set, bool track_missing_files,
+    bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
+    bool skip_load_table_files)
+    : VersionEditHandlerBase(),
+      read_only_(read_only),
+      column_families_(std::move(column_families)),
+      version_set_(version_set),
+      track_missing_files_(track_missing_files),
+      no_error_if_files_missing_(no_error_if_files_missing),
+      io_tracer_(io_tracer),
+      skip_load_table_files_(skip_load_table_files),
+      initialized_(false) {
+  assert(version_set_ != nullptr);
+}
+
+Status VersionEditHandler::Initialize() {
+  Status s;
+  if (!initialized_) {
+    for (const auto& cf_desc : column_families_) {
+      name_to_options_.emplace(cf_desc.name, cf_desc.options);
+    }
+    auto default_cf_iter = name_to_options_.find(kDefaultColumnFamilyName);
+    if (default_cf_iter == name_to_options_.end()) {
+      s = Status::InvalidArgument("Default column family not specified");
+    }
+    if (s.ok()) {
+      VersionEdit default_cf_edit;
+      default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+      default_cf_edit.SetColumnFamily(0);
+      ColumnFamilyData* cfd =
+          CreateCfAndInit(default_cf_iter->second, default_cf_edit);
+      assert(cfd != nullptr);
+#ifdef NDEBUG
+      (void)cfd;
+#endif
+      initialized_ = true;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit,
+                                            ColumnFamilyData** cfd) {
+  Status s;
+  if (edit.is_column_family_add_) {
+    s = OnColumnFamilyAdd(edit, cfd);
+  } else if (edit.is_column_family_drop_) {
+    s = OnColumnFamilyDrop(edit, cfd);
+  } else if (edit.IsWalAddition()) {
+    s = OnWalAddition(edit);
+  } else if (edit.IsWalDeletion()) {
+    s = OnWalDeletion(edit);
+  } else {
+    s = OnNonCfOperation(edit, cfd);
+  }
+  if (s.ok()) {
+    assert(cfd != nullptr);
+    s = ExtractInfoFromVersionEdit(*cfd, edit);
+  }
+  return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
+                                             ColumnFamilyData** cfd) {
+  bool cf_in_not_found = false;
+  bool cf_in_builders = false;
+  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+  assert(cfd != nullptr);
+  *cfd = nullptr;
+  Status s;
+  if (cf_in_builders || cf_in_not_found) {
+    s = Status::Corruption("MANIFEST adding the same column family twice: " +
+                           edit.column_family_name_);
+  }
+  if (s.ok()) {
+    auto cf_options = name_to_options_.find(edit.column_family_name_);
+    // implicitly add persistent_stats column family without requiring user
+    // to specify
+    ColumnFamilyData* tmp_cfd = nullptr;
+    bool is_persistent_stats_column_family =
+        edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0;
+    if (cf_options == name_to_options_.end() &&
+        !is_persistent_stats_column_family) {
+      column_families_not_found_.emplace(edit.column_family_,
+                                         edit.column_family_name_);
+    } else {
+      if (is_persistent_stats_column_family) {
+        ColumnFamilyOptions cfo;
+        OptimizeForPersistentStats(&cfo);
+        tmp_cfd = CreateCfAndInit(cfo, edit);
+      } else {
+        tmp_cfd = CreateCfAndInit(cf_options->second, edit);
+      }
+      *cfd = tmp_cfd;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
+                                              ColumnFamilyData** cfd) {
+  bool cf_in_not_found = false;
+  bool cf_in_builders = false;
+  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+  assert(cfd != nullptr);
+  *cfd = nullptr;
+  ColumnFamilyData* tmp_cfd = nullptr;
+  Status s;
+  if (cf_in_builders) {
+    tmp_cfd = DestroyCfAndCleanup(edit);
+  } else if (cf_in_not_found) {
+    column_families_not_found_.erase(edit.column_family_);
+  } else {
+    s = Status::Corruption("MANIFEST - dropping non-existing column family");
+  }
+  *cfd = tmp_cfd;
+  return s;
+}
+
+Status VersionEditHandler::OnWalAddition(VersionEdit& edit) {
+  assert(edit.IsWalAddition());
+  return version_set_->wals_.AddWals(edit.GetWalAdditions());
+}
+
+Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) {
+  assert(edit.IsWalDeletion());
+  return version_set_->wals_.DeleteWalsBefore(
+      edit.GetWalDeletion().GetLogNumber());
+}
+
+Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
+                                            ColumnFamilyData** cfd) {
+  bool cf_in_not_found = false;
+  bool cf_in_builders = false;
+  CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+  assert(cfd != nullptr);
+  *cfd = nullptr;
+  Status s;
+  if (!cf_in_not_found) {
+    if (!cf_in_builders) {
+      s = Status::Corruption(
+          "MANIFEST record referencing unknown column family");
+    }
+    ColumnFamilyData* tmp_cfd = nullptr;
+    if (s.ok()) {
+      auto builder_iter = builders_.find(edit.column_family_);
+      assert(builder_iter != builders_.end());
+      tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily(
+          edit.column_family_);
+      assert(tmp_cfd != nullptr);
+      s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false);
+      if (s.ok()) {
+        s = builder_iter->second->version_builder()->Apply(&edit);
+      }
+    }
+    *cfd = tmp_cfd;
+  }
+  return s;
+}
+
+// TODO maybe cache the computation result
+bool VersionEditHandler::HasMissingFiles() const {
+  bool ret = false;
+  for (const auto& elem : cf_to_missing_files_) {
+    const auto& missing_files = elem.second;
+    if (!missing_files.empty()) {
+      ret = true;
+      break;
+    }
+  }
+  if (!ret) {
+    for (const auto& elem : cf_to_missing_blob_files_high_) {
+      if (elem.second != kInvalidBlobFileNumber) {
+        ret = true;
+        break;
+      }
+    }
+  }
+  return ret;
+}
+
+void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit,
+                                             bool* cf_in_not_found,
+                                             bool* cf_in_builders) const {
+  assert(cf_in_not_found != nullptr);
+  assert(cf_in_builders != nullptr);
+  // Not found means that user didn't supply that column
+  // family option AND we encountered column family add
+  // record. Once we encounter column family drop record,
+  // we will delete the column family from
+  // column_families_not_found.
+  bool in_not_found = column_families_not_found_.find(edit.column_family_) !=
+                      column_families_not_found_.end();
+  // in builders means that user supplied that column family
+  // option AND that we encountered column family add record
+  bool in_builders = builders_.find(edit.column_family_) != builders_.end();
+  // They cannot both be true
+  assert(!(in_not_found && in_builders));
+  *cf_in_not_found = in_not_found;
+  *cf_in_builders = in_builders;
+}
+
+void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
+                                              Status* s) {
+  assert(s != nullptr);
+  if (!s->ok()) {
+    // Do nothing here.
+  } else if (!version_edit_params_.has_log_number_ ||
+             !version_edit_params_.has_next_file_number_ ||
+             !version_edit_params_.has_last_sequence_) {
+    std::string msg("no ");
+    if (!version_edit_params_.has_log_number_) {
+      msg.append("log_file_number, ");
+    }
+    if (!version_edit_params_.has_next_file_number_) {
+      msg.append("next_file_number, ");
+    }
+    if (!version_edit_params_.has_last_sequence_) {
+      msg.append("last_sequence, ");
+    }
+    msg = msg.substr(0, msg.size() - 2);
+    msg.append(" entry in MANIFEST");
+    *s = Status::Corruption(msg);
+  }
+  // There were some column families in the MANIFEST that weren't specified
+  // in the argument. This is OK in read_only mode
+  if (s->ok() && MustOpenAllColumnFamilies() &&
+      !column_families_not_found_.empty()) {
+    std::string msg;
+    for (const auto& cf : column_families_not_found_) {
+      msg.append(", ");
+      msg.append(cf.second);
+    }
+    msg = msg.substr(2);
+    *s = Status::InvalidArgument("Column families not opened: " + msg);
+  }
+  if (s->ok()) {
+    version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily(
+        version_edit_params_.max_column_family_);
+    version_set_->MarkMinLogNumberToKeep(
+        version_edit_params_.min_log_number_to_keep_);
+    version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_);
+    version_set_->MarkFileNumberUsed(version_edit_params_.log_number_);
+    for (auto* cfd : *(version_set_->GetColumnFamilySet())) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      auto builder_iter = builders_.find(cfd->GetID());
+      assert(builder_iter != builders_.end());
+      auto* builder = builder_iter->second->version_builder();
+      if (!builder->CheckConsistencyForNumLevels()) {
+        *s = Status::InvalidArgument(
+            "db has more levels than options.num_levels");
+        break;
+      }
+    }
+  }
+  if (s->ok()) {
+    for (auto* cfd : *(version_set_->GetColumnFamilySet())) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (read_only_) {
+        cfd->table_cache()->SetTablesAreImmortal();
+      }
+      *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false,
+                      /*is_initial_load=*/true);
+      if (!s->ok()) {
+        // If s is IOError::PathNotFound, then we mark the db as corrupted.
+        if (s->IsPathNotFound()) {
+          *s = Status::Corruption("Corruption: " + s->ToString());
+        }
+        break;
+      }
+    }
+  }
+  if (s->ok()) {
+    for (auto* cfd : *(version_set_->column_family_set_)) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      assert(cfd->initialized());
+      VersionEdit edit;
+      *s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true);
+      if (!s->ok()) {
+        break;
+      }
+    }
+  }
+  if (s->ok()) {
+    version_set_->manifest_file_size_ = reader.GetReadOffset();
+    assert(version_set_->manifest_file_size_ > 0);
+    version_set_->next_file_number_.store(
+        version_edit_params_.next_file_number_ + 1);
+    SequenceNumber last_seq = version_edit_params_.last_sequence_;
+    assert(last_seq != kMaxSequenceNumber);
+    if (last_seq != kMaxSequenceNumber &&
+        last_seq > version_set_->last_allocated_sequence_.load()) {
+      version_set_->last_allocated_sequence_.store(last_seq);
+    }
+    if (last_seq != kMaxSequenceNumber &&
+        last_seq > version_set_->last_published_sequence_.load()) {
+      version_set_->last_published_sequence_.store(last_seq);
+    }
+    if (last_seq != kMaxSequenceNumber &&
+        last_seq > version_set_->last_sequence_.load()) {
+      version_set_->last_sequence_.store(last_seq);
+    }
+    if (last_seq != kMaxSequenceNumber &&
+        last_seq > version_set_->descriptor_last_sequence_) {
+      // This is the maximum last sequence of all `VersionEdit`s iterated. It
+      // may be greater than the maximum `largest_seqno` of all files in case
+      // the newest data referred to by the MANIFEST has been dropped or had its
+      // sequence number zeroed through compaction.
+      version_set_->descriptor_last_sequence_ = last_seq;
+    }
+    version_set_->prev_log_number_ = version_edit_params_.prev_log_number_;
+  }
+}
+
+ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
+    const ColumnFamilyOptions& cf_options, const VersionEdit& edit) {
+  ColumnFamilyData* cfd = version_set_->CreateColumnFamily(cf_options, &edit);
+  assert(cfd != nullptr);
+  cfd->set_initialized();
+  assert(builders_.find(edit.column_family_) == builders_.end());
+  builders_.emplace(edit.column_family_,
+                    VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd)));
+  if (track_missing_files_) {
+    cf_to_missing_files_.emplace(edit.column_family_,
+                                 std::unordered_set<uint64_t>());
+    cf_to_missing_blob_files_high_.emplace(edit.column_family_,
+                                           kInvalidBlobFileNumber);
+  }
+  return cfd;
+}
+
+ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
+    const VersionEdit& edit) {
+  auto builder_iter = builders_.find(edit.column_family_);
+  assert(builder_iter != builders_.end());
+  builders_.erase(builder_iter);
+  if (track_missing_files_) {
+    auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_);
+    assert(missing_files_iter != cf_to_missing_files_.end());
+    cf_to_missing_files_.erase(missing_files_iter);
+
+    auto missing_blob_files_high_iter =
+        cf_to_missing_blob_files_high_.find(edit.column_family_);
+    assert(missing_blob_files_high_iter !=
+           cf_to_missing_blob_files_high_.end());
+    cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter);
+  }
+  ColumnFamilyData* ret =
+      version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_);
+  assert(ret != nullptr);
+  ret->SetDropped();
+  ret->UnrefAndTryDelete();
+  ret = nullptr;
+  return ret;
+}
+
+Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
+                                              ColumnFamilyData* cfd,
+                                              bool force_create_version) {
+  assert(cfd->initialized());
+  Status s;
+  if (force_create_version) {
+    auto builder_iter = builders_.find(cfd->GetID());
+    assert(builder_iter != builders_.end());
+    auto* builder = builder_iter->second->version_builder();
+    auto* v = new Version(cfd, version_set_, version_set_->file_options_,
+                          *cfd->GetLatestMutableCFOptions(), io_tracer_,
+                          version_set_->current_version_number_++);
+    s = builder->SaveTo(v->storage_info());
+    if (s.ok()) {
+      // Install new version
+      v->PrepareAppend(
+          *cfd->GetLatestMutableCFOptions(),
+          !(version_set_->db_options_->skip_stats_update_on_db_open));
+      version_set_->AppendVersion(cfd, v);
+    } else {
+      delete v;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd,
+                                      bool prefetch_index_and_filter_in_cache,
+                                      bool is_initial_load) {
+  bool skip_load_table_files = skip_load_table_files_;
+  TEST_SYNC_POINT_CALLBACK(
+      "VersionEditHandler::LoadTables:skip_load_table_files",
+      &skip_load_table_files);
+  if (skip_load_table_files) {
+    return Status::OK();
+  }
+  assert(cfd != nullptr);
+  assert(!cfd->IsDropped());
+  auto builder_iter = builders_.find(cfd->GetID());
+  assert(builder_iter != builders_.end());
+  assert(builder_iter->second != nullptr);
+  VersionBuilder* builder = builder_iter->second->version_builder();
+  assert(builder);
+  Status s = builder->LoadTableHandlers(
+      cfd->internal_stats(),
+      version_set_->db_options_->max_file_opening_threads,
+      prefetch_index_and_filter_in_cache, is_initial_load,
+      cfd->GetLatestMutableCFOptions()->prefix_extractor,
+      MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
+  if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) {
+    s = Status::OK();
+  }
+  if (!s.ok() && !version_set_->db_options_->paranoid_checks) {
+    s = Status::OK();
+  }
+  return s;
+}
+
+Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+                                                      const VersionEdit& edit) {
+  Status s;
+  if (edit.has_db_id_) {
+    version_set_->db_id_ = edit.GetDbId();
+    version_edit_params_.SetDBId(edit.db_id_);
+  }
+  if (cfd != nullptr) {
+    if (edit.has_log_number_) {
+      if (cfd->GetLogNumber() > edit.log_number_) {
+        ROCKS_LOG_WARN(
+            version_set_->db_options()->info_log,
+            "MANIFEST corruption detected, but ignored - Log numbers in "
+            "records NOT monotonically increasing");
+      } else {
+        cfd->SetLogNumber(edit.log_number_);
+        version_edit_params_.SetLogNumber(edit.log_number_);
+      }
+    }
+    if (edit.has_comparator_ &&
+        edit.comparator_ != cfd->user_comparator()->Name()) {
+      if (!cf_to_cmp_names_) {
+        s = Status::InvalidArgument(
+            cfd->user_comparator()->Name(),
+            "does not match existing comparator " + edit.comparator_);
+      } else {
+        cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_);
+      }
+    }
+    if (edit.HasFullHistoryTsLow()) {
+      const std::string& new_ts = edit.GetFullHistoryTsLow();
+      cfd->SetFullHistoryTsLow(new_ts);
+    }
+  }
+
+  if (s.ok()) {
+    if (edit.has_prev_log_number_) {
+      version_edit_params_.SetPrevLogNumber(edit.prev_log_number_);
+    }
+    if (edit.has_next_file_number_) {
+      version_edit_params_.SetNextFile(edit.next_file_number_);
+    }
+    if (edit.has_max_column_family_) {
+      version_edit_params_.SetMaxColumnFamily(edit.max_column_family_);
+    }
+    if (edit.has_min_log_number_to_keep_) {
+      version_edit_params_.min_log_number_to_keep_ =
+          std::max(version_edit_params_.min_log_number_to_keep_,
+                   edit.min_log_number_to_keep_);
+    }
+    if (edit.has_last_sequence_) {
+      // `VersionEdit::last_sequence_`s are assumed to be non-decreasing. This
+      // is legacy behavior that cannot change without breaking downgrade
+      // compatibility.
+      assert(!version_edit_params_.has_last_sequence_ ||
+             version_edit_params_.last_sequence_ <= edit.last_sequence_);
+      version_edit_params_.SetLastSequence(edit.last_sequence_);
+    }
+    if (!version_edit_params_.has_prev_log_number_) {
+      version_edit_params_.SetPrevLogNumber(0);
+    }
+  }
+  return s;
+}
+
+VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
+    bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+    VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer)
+    : VersionEditHandler(read_only, column_families, version_set,
+                         /*track_missing_files=*/true,
+                         /*no_error_if_files_missing=*/true, io_tracer) {}
+
+VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
+  for (const auto& elem : versions_) {
+    delete elem.second;
+  }
+  versions_.clear();
+}
+
+void VersionEditHandlerPointInTime::CheckIterationResult(
+    const log::Reader& reader, Status* s) {
+  VersionEditHandler::CheckIterationResult(reader, s);
+  assert(s != nullptr);
+  if (s->ok()) {
+    for (auto* cfd : *(version_set_->column_family_set_)) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      assert(cfd->initialized());
+      auto v_iter = versions_.find(cfd->GetID());
+      if (v_iter != versions_.end()) {
+        assert(v_iter->second != nullptr);
+
+        version_set_->AppendVersion(cfd, v_iter->second);
+        versions_.erase(v_iter);
+      }
+    }
+  } else {
+    for (const auto& elem : versions_) {
+      delete elem.second;
+    }
+    versions_.clear();
+  }
+}
+
+ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup(
+    const VersionEdit& edit) {
+  ColumnFamilyData* cfd = VersionEditHandler::DestroyCfAndCleanup(edit);
+  auto v_iter = versions_.find(edit.column_family_);
+  if (v_iter != versions_.end()) {
+    delete v_iter->second;
+    versions_.erase(v_iter);
+  }
+  return cfd;
+}
+
+Status VersionEditHandlerPointInTime::MaybeCreateVersion(
+    const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
+  assert(cfd != nullptr);
+  if (!force_create_version) {
+    assert(edit.column_family_ == cfd->GetID());
+  }
+  auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
+  assert(missing_files_iter != cf_to_missing_files_.end());
+  std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
+
+  auto missing_blob_files_high_iter =
+      cf_to_missing_blob_files_high_.find(cfd->GetID());
+  assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end());
+  const uint64_t prev_missing_blob_file_high =
+      missing_blob_files_high_iter->second;
+
+  VersionBuilder* builder = nullptr;
+
+  if (prev_missing_blob_file_high != kInvalidBlobFileNumber) {
+    auto builder_iter = builders_.find(cfd->GetID());
+    assert(builder_iter != builders_.end());
+    builder = builder_iter->second->version_builder();
+    assert(builder != nullptr);
+  }
+
+  // At this point, we have not yet applied the new version edits read from the
+  // MANIFEST. We check whether we have any missing table and blob files.
+  const bool prev_has_missing_files =
+      !missing_files.empty() ||
+      (prev_missing_blob_file_high != kInvalidBlobFileNumber &&
+       prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber());
+
+  for (const auto& file : edit.GetDeletedFiles()) {
+    uint64_t file_num = file.second;
+    auto fiter = missing_files.find(file_num);
+    if (fiter != missing_files.end()) {
+      missing_files.erase(fiter);
+    }
+  }
+
+  assert(!cfd->ioptions()->cf_paths.empty());
+  Status s;
+  for (const auto& elem : edit.GetNewFiles()) {
+    int level = elem.first;
+    const FileMetaData& meta = elem.second;
+    const FileDescriptor& fd = meta.fd;
+    uint64_t file_num = fd.GetNumber();
+    const std::string fpath =
+        MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
+    s = VerifyFile(cfd, fpath, level, meta);
+    if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+      missing_files.insert(file_num);
+      s = Status::OK();
+    } else if (!s.ok()) {
+      break;
+    }
+  }
+
+  uint64_t missing_blob_file_num = prev_missing_blob_file_high;
+  for (const auto& elem : edit.GetBlobFileAdditions()) {
+    uint64_t file_num = elem.GetBlobFileNumber();
+    s = VerifyBlobFile(cfd, file_num, elem);
+    if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+      missing_blob_file_num = std::max(missing_blob_file_num, file_num);
+      s = Status::OK();
+    } else if (!s.ok()) {
+      break;
+    }
+  }
+
+  bool has_missing_blob_files = false;
+  if (missing_blob_file_num != kInvalidBlobFileNumber &&
+      missing_blob_file_num >= prev_missing_blob_file_high) {
+    missing_blob_files_high_iter->second = missing_blob_file_num;
+    has_missing_blob_files = true;
+  } else if (missing_blob_file_num < prev_missing_blob_file_high) {
+    assert(false);
+  }
+
+  // We still have not applied the new version edit, but have tried to add new
+  // table and blob files after verifying their presence and consistency.
+  // Therefore, we know whether we will see new missing table and blob files
+  // later after actually applying the version edit. We perform the check here
+  // and record the result.
+  const bool has_missing_files =
+      !missing_files.empty() || has_missing_blob_files;
+
+  bool missing_info = !version_edit_params_.has_log_number_ ||
+                      !version_edit_params_.has_next_file_number_ ||
+                      !version_edit_params_.has_last_sequence_;
+
+  // Create version before apply edit. The version will represent the state
+  // before applying the version edit.
+  // A new version will created if:
+  // 1) no error has occurred so far, and
+  // 2) log_number_, next_file_number_ and last_sequence_ are known, and
+  // 3) any of the following:
+  //   a) no missing file before, but will have missing file(s) after applying
+  //      this version edit.
+  //   b) no missing file after applying the version edit, and the caller
+  //      explicitly request that a new version be created.
+  if (s.ok() && !missing_info &&
+      ((has_missing_files && !prev_has_missing_files) ||
+       (!has_missing_files && force_create_version))) {
+    if (!builder) {
+      auto builder_iter = builders_.find(cfd->GetID());
+      assert(builder_iter != builders_.end());
+      builder = builder_iter->second->version_builder();
+      assert(builder);
+    }
+
+    auto* version = new Version(cfd, version_set_, version_set_->file_options_,
+                                *cfd->GetLatestMutableCFOptions(), io_tracer_,
+                                version_set_->current_version_number_++);
+    s = builder->LoadTableHandlers(
+        cfd->internal_stats(),
+        version_set_->db_options_->max_file_opening_threads, false, true,
+        cfd->GetLatestMutableCFOptions()->prefix_extractor,
+        MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
+    if (!s.ok()) {
+      delete version;
+      if (s.IsCorruption()) {
+        s = Status::OK();
+      }
+      return s;
+    }
+    s = builder->SaveTo(version->storage_info());
+    if (s.ok()) {
+      version->PrepareAppend(
+          *cfd->GetLatestMutableCFOptions(),
+          !version_set_->db_options_->skip_stats_update_on_db_open);
+      auto v_iter = versions_.find(cfd->GetID());
+      if (v_iter != versions_.end()) {
+        delete v_iter->second;
+        v_iter->second = version;
+      } else {
+        versions_.emplace(cfd->GetID(), version);
+      }
+    } else {
+      delete version;
+    }
+  }
+  return s;
+}
+
+Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd,
+                                                 const std::string& fpath,
+                                                 int level,
+                                                 const FileMetaData& fmeta) {
+  return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta);
+}
+
+Status VersionEditHandlerPointInTime::VerifyBlobFile(
+    ColumnFamilyData* cfd, uint64_t blob_file_num,
+    const BlobFileAddition& blob_addition) {
+  BlobSource* blob_source = cfd->blob_source();
+  assert(blob_source);
+  CacheHandleGuard<BlobFileReader> blob_file_reader;
+  Status s = blob_source->GetBlobFileReader(blob_file_num, &blob_file_reader);
+  if (!s.ok()) {
+    return s;
+  }
+  // TODO: verify checksum
+  (void)blob_addition;
+  return s;
+}
+
+Status VersionEditHandlerPointInTime::LoadTables(
+    ColumnFamilyData* /*cfd*/, bool /*prefetch_index_and_filter_in_cache*/,
+    bool /*is_initial_load*/) {
+  return Status::OK();
+}
+
+Status ManifestTailer::Initialize() {
+  if (Mode::kRecovery == mode_) {
+    return VersionEditHandler::Initialize();
+  }
+  assert(Mode::kCatchUp == mode_);
+  Status s;
+  if (!initialized_) {
+    ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet();
+    assert(cfd_set);
+    ColumnFamilyData* default_cfd = cfd_set->GetDefault();
+    assert(default_cfd);
+    auto builder_iter = builders_.find(default_cfd->GetID());
+    assert(builder_iter != builders_.end());
+
+    Version* dummy_version = default_cfd->dummy_versions();
+    assert(dummy_version);
+    Version* base_version = dummy_version->Next();
+    assert(base_version);
+    base_version->Ref();
+    VersionBuilderUPtr new_builder(
+        new BaseReferencedVersionBuilder(default_cfd, base_version));
+    builder_iter->second = std::move(new_builder);
+
+    initialized_ = true;
+  }
+  return s;
+}
+
+Status ManifestTailer::ApplyVersionEdit(VersionEdit& edit,
+                                        ColumnFamilyData** cfd) {
+  Status s = VersionEditHandler::ApplyVersionEdit(edit, cfd);
+  if (s.ok()) {
+    assert(cfd);
+    if (*cfd) {
+      cfds_changed_.insert(*cfd);
+    }
+  }
+  return s;
+}
+
+Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit,
+                                         ColumnFamilyData** cfd) {
+  if (Mode::kRecovery == mode_) {
+    return VersionEditHandler::OnColumnFamilyAdd(edit, cfd);
+  }
+  assert(Mode::kCatchUp == mode_);
+  ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet();
+  assert(cfd_set);
+  ColumnFamilyData* tmp_cfd = cfd_set->GetColumnFamily(edit.GetColumnFamily());
+  assert(cfd);
+  *cfd = tmp_cfd;
+  if (!tmp_cfd) {
+    // For now, ignore new column families created after Recover() succeeds.
+    return Status::OK();
+  }
+  auto builder_iter = builders_.find(edit.GetColumnFamily());
+  assert(builder_iter != builders_.end());
+
+  Version* dummy_version = tmp_cfd->dummy_versions();
+  assert(dummy_version);
+  Version* base_version = dummy_version->Next();
+  assert(base_version);
+  base_version->Ref();
+  VersionBuilderUPtr new_builder(
+      new BaseReferencedVersionBuilder(tmp_cfd, base_version));
+  builder_iter->second = std::move(new_builder);
+
+#ifndef NDEBUG
+  auto version_iter = versions_.find(edit.GetColumnFamily());
+  assert(version_iter == versions_.end());
+#endif  // !NDEBUG
+  return Status::OK();
+}
+
+void ManifestTailer::CheckIterationResult(const log::Reader& reader,
+                                          Status* s) {
+  VersionEditHandlerPointInTime::CheckIterationResult(reader, s);
+  assert(s);
+  if (s->ok()) {
+    if (Mode::kRecovery == mode_) {
+      mode_ = Mode::kCatchUp;
+    } else {
+      assert(Mode::kCatchUp == mode_);
+    }
+  }
+}
+
+Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
+                                  const std::string& fpath, int level,
+                                  const FileMetaData& fmeta) {
+  Status s =
+      VersionEditHandlerPointInTime::VerifyFile(cfd, fpath, level, fmeta);
+  // TODO: Open file or create hard link to prevent the file from being
+  // deleted.
+  return s;
+}
+
+void DumpManifestHandler::CheckIterationResult(const log::Reader& reader,
+                                               Status* s) {
+  VersionEditHandler::CheckIterationResult(reader, s);
+  if (!s->ok()) {
+    fprintf(stdout, "%s\n", s->ToString().c_str());
+    return;
+  }
+  assert(cf_to_cmp_names_);
+  for (auto* cfd : *(version_set_->column_family_set_)) {
+    fprintf(stdout,
+            "--------------- Column family \"%s\"  (ID %" PRIu32
+            ") --------------\n",
+            cfd->GetName().c_str(), cfd->GetID());
+    fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber());
+    auto it = cf_to_cmp_names_->find(cfd->GetID());
+    if (it != cf_to_cmp_names_->end()) {
+      fprintf(stdout,
+              "comparator: <%s>, but the comparator object is not available.\n",
+              it->second.c_str());
+    } else {
+      fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name());
+    }
+    assert(cfd->current());
+
+    // Print out DebugStrings. Can include non-terminating null characters.
+    fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char),
+           cfd->current()->DebugString(hex_).size(), stdout);
+  }
+  fprintf(stdout,
+          "next_file_number %" PRIu64 " last_sequence %" PRIu64
+          "  prev_log_number %" PRIu64 " max_column_family %" PRIu32
+          " min_log_number_to_keep %" PRIu64 "\n",
+          version_set_->current_next_file_number(),
+          version_set_->LastSequence(), version_set_->prev_log_number(),
+          version_set_->column_family_set_->GetMaxColumnFamily(),
+          version_set_->min_log_number_to_keep());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit_handler.h b/src/rocksdb/db/version_edit_handler.h
new file mode 100644
index 000000000..fd2379b07
--- /dev/null
+++ b/src/rocksdb/db/version_edit_handler.h
@@ -0,0 +1,313 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FileMetaData;
+
+class VersionEditHandlerBase {
+ public:
+  explicit VersionEditHandlerBase()
+      : max_manifest_read_size_(std::numeric_limits<uint64_t>::max()) {}
+
+  virtual ~VersionEditHandlerBase() {}
+
+  void Iterate(log::Reader& reader, Status* log_read_status);
+
+  const Status& status() const { return status_; }
+
+  AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; }
+
+ protected:
+  explicit VersionEditHandlerBase(uint64_t max_read_size)
+      : max_manifest_read_size_(max_read_size) {}
+  virtual Status Initialize() { return Status::OK(); }
+
+  virtual Status ApplyVersionEdit(VersionEdit& edit,
+                                  ColumnFamilyData** cfd) = 0;
+
+  virtual void CheckIterationResult(const log::Reader& /*reader*/,
+                                    Status* /*s*/) {}
+
+  void ClearReadBuffer() { read_buffer_.Clear(); }
+
+  Status status_;
+
+ private:
+  AtomicGroupReadBuffer read_buffer_;
+  const uint64_t max_manifest_read_size_;
+};
+
+class ListColumnFamiliesHandler : public VersionEditHandlerBase {
+ public:
+  ListColumnFamiliesHandler() : VersionEditHandlerBase() {}
+
+  ~ListColumnFamiliesHandler() override {}
+
+  const std::map<uint32_t, std::string> GetColumnFamilyNames() const {
+    return column_family_names_;
+  }
+
+ protected:
+  Status ApplyVersionEdit(VersionEdit& edit,
+                          ColumnFamilyData** /*unused*/) override;
+
+ private:
+  // default column family is always implicitly there
+  std::map<uint32_t, std::string> column_family_names_{
+      {0, kDefaultColumnFamilyName}};
+};
+
+class FileChecksumRetriever : public VersionEditHandlerBase {
+ public:
+  FileChecksumRetriever(uint64_t max_read_size,
+                        FileChecksumList& file_checksum_list)
+      : VersionEditHandlerBase(max_read_size),
+        file_checksum_list_(file_checksum_list) {}
+
+  ~FileChecksumRetriever() override {}
+
+ protected:
+  Status ApplyVersionEdit(VersionEdit& edit,
+                          ColumnFamilyData** /*unused*/) override;
+
+ private:
+  FileChecksumList& file_checksum_list_;
+};
+
+using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
+
+// A class used for scanning MANIFEST file.
+// VersionEditHandler reads a MANIFEST file, parses the version edits, and
+// builds the version set's in-memory state, e.g. the version storage info for
+// the versions of column families.
+// To use this class and its subclasses,
+// 1. Create an object of VersionEditHandler or its subclasses.
+//    VersionEditHandler handler(read_only, column_families, version_set,
+//                               track_missing_files,
+//                               no_error_if_files_missing);
+// 2. Status s = handler.Iterate(reader, &db_id);
+// 3. Check s and handle possible errors.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandler is shared by multiple threads.
+class VersionEditHandler : public VersionEditHandlerBase {
+ public:
+  explicit VersionEditHandler(
+      bool read_only,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      VersionSet* version_set, bool track_missing_files,
+      bool no_error_if_files_missing,
+      const std::shared_ptr<IOTracer>& io_tracer)
+      : VersionEditHandler(read_only, column_families, version_set,
+                           track_missing_files, no_error_if_files_missing,
+                           io_tracer, /*skip_load_table_files=*/false) {}
+
+  ~VersionEditHandler() override {}
+
+  const VersionEditParams& GetVersionEditParams() const {
+    return version_edit_params_;
+  }
+
+  bool HasMissingFiles() const;
+
+  void GetDbId(std::string* db_id) const {
+    if (db_id && version_edit_params_.has_db_id_) {
+      *db_id = version_edit_params_.db_id_;
+    }
+  }
+
+ protected:
+  explicit VersionEditHandler(
+      bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+      VersionSet* version_set, bool track_missing_files,
+      bool no_error_if_files_missing,
+      const std::shared_ptr<IOTracer>& io_tracer, bool skip_load_table_files);
+
+  Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+  virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd);
+
+  Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd);
+
+  Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd);
+
+  Status OnWalAddition(VersionEdit& edit);
+
+  Status OnWalDeletion(VersionEdit& edit);
+
+  Status Initialize() override;
+
+  void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found,
+                           bool* cf_in_builders) const;
+
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+  ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options,
+                                    const VersionEdit& edit);
+
+  virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
+
+  virtual Status MaybeCreateVersion(const VersionEdit& edit,
+                                    ColumnFamilyData* cfd,
+                                    bool force_create_version);
+
+  virtual Status LoadTables(ColumnFamilyData* cfd,
+                            bool prefetch_index_and_filter_in_cache,
+                            bool is_initial_load);
+
+  virtual bool MustOpenAllColumnFamilies() const { return !read_only_; }
+
+  const bool read_only_;
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  VersionSet* version_set_;
+  std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
+  std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
+  // Keeps track of column families in manifest that were not found in
+  // column families parameters. if those column families are not dropped
+  // by subsequent manifest records, Recover() will return failure status.
+  std::unordered_map<uint32_t, std::string> column_families_not_found_;
+  VersionEditParams version_edit_params_;
+  const bool track_missing_files_;
+  std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
+      cf_to_missing_files_;
+  std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
+  bool no_error_if_files_missing_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  bool skip_load_table_files_;
+  bool initialized_;
+  std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
+
+ private:
+  Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+                                    const VersionEdit& edit);
+};
+
+// A class similar to its base class, i.e. VersionEditHandler.
+// VersionEditHandlerPointInTime restores the versions to the most recent point
+// in time such that at this point, the version does not have missing files.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandlerPointInTime is shared by multiple threads.
+class VersionEditHandlerPointInTime : public VersionEditHandler {
+ public:
+  VersionEditHandlerPointInTime(
+      bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+      VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer);
+  ~VersionEditHandlerPointInTime() override;
+
+ protected:
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+  ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
+  Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
+                            bool force_create_version) override;
+  virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
+                            int level, const FileMetaData& fmeta);
+  virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
+                                const BlobFileAddition& blob_addition);
+
+  Status LoadTables(ColumnFamilyData* cfd,
+                    bool prefetch_index_and_filter_in_cache,
+                    bool is_initial_load) override;
+
+  std::unordered_map<uint32_t, Version*> versions_;
+};
+
+class ManifestTailer : public VersionEditHandlerPointInTime {
+ public:
+  explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
+                          VersionSet* version_set,
+                          const std::shared_ptr<IOTracer>& io_tracer)
+      : VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
+                                      version_set, io_tracer),
+        mode_(Mode::kRecovery) {}
+
+  void PrepareToReadNewManifest() {
+    initialized_ = false;
+    ClearReadBuffer();
+  }
+
+  std::unordered_set<ColumnFamilyData*>& GetUpdatedColumnFamilies() {
+    return cfds_changed_;
+  }
+
+ protected:
+  Status Initialize() override;
+
+  bool MustOpenAllColumnFamilies() const override { return false; }
+
+  Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+  Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+  Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
+                    const FileMetaData& fmeta) override;
+
+  enum Mode : uint8_t {
+    kRecovery = 0,
+    kCatchUp = 1,
+  };
+
+  Mode mode_;
+  std::unordered_set<ColumnFamilyData*> cfds_changed_;
+};
+
+class DumpManifestHandler : public VersionEditHandler {
+ public:
+  DumpManifestHandler(std::vector<ColumnFamilyDescriptor> column_families,
+                      VersionSet* version_set,
+                      const std::shared_ptr<IOTracer>& io_tracer, bool verbose,
+                      bool hex, bool json)
+      : VersionEditHandler(
+            /*read_only=*/true, column_families, version_set,
+            /*track_missing_files=*/false,
+            /*no_error_if_files_missing=*/false, io_tracer,
+            /*skip_load_table_files=*/true),
+        verbose_(verbose),
+        hex_(hex),
+        json_(json),
+        count_(0) {
+    cf_to_cmp_names_.reset(new std::unordered_map<uint32_t, std::string>());
+  }
+
+  ~DumpManifestHandler() override {}
+
+  Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override {
+    // Write out each individual edit
+    if (verbose_ && !json_) {
+      // Print out DebugStrings. Can include non-terminating null characters.
+      fwrite(edit.DebugString(hex_).data(), sizeof(char),
+             edit.DebugString(hex_).size(), stdout);
+    } else if (json_) {
+      // Print out DebugStrings. Can include non-terminating null characters.
+      fwrite(edit.DebugString(hex_).data(), sizeof(char),
+             edit.DebugString(hex_).size(), stdout);
+    }
+    ++count_;
+    return VersionEditHandler::ApplyVersionEdit(edit, cfd);
+  }
+
+  void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+ private:
+  const bool verbose_;
+  const bool hex_;
+  const bool json_;
+  int count_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit_test.cc b/src/rocksdb/db/version_edit_test.cc
new file mode 100644
index 000000000..c7f271d83
--- /dev/null
+++ b/src/rocksdb/db/version_edit_test.cc
@@ -0,0 +1,730 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/blob/blob_index.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string encoded, encoded2;
+  edit.EncodeTo(&encoded);
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  parsed.EncodeTo(&encoded2);
+  ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest : public testing::Test {};
+
+TEST_F(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+  static const uint32_t kBig32Bit = 1ull << 30;
+
+  VersionEdit edit;
+  for (int i = 0; i < 4; i++) {
+    TestEncodeDecode(edit);
+    edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0,
+                 InternalKey("foo", kBig + 500 + i, kTypeValue),
+                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
+                 kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown,
+                 kInvalidBlobFileNumber, 888, 678, "234", "crc32c",
+                 kNullUniqueId64x2);
+    edit.DeleteFile(4, kBig + 700 + i);
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit;
+  edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+               InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+               kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+  edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+               InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+               kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+  edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
+               InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
+               kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+               666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+  edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
+               InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
+               kBig + 603, true, Temperature::kUnknown, 1001,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+
+  edit.DeleteFile(4, 700);
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+
+  std::string encoded, encoded2;
+  edit.EncodeTo(&encoded);
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  auto& new_files = parsed.GetNewFiles();
+  ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+  ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+  ASSERT_TRUE(new_files[2].second.marked_for_compaction);
+  ASSERT_TRUE(new_files[3].second.marked_for_compaction);
+  ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+  ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+  ASSERT_EQ(0u, new_files[2].second.fd.GetPathId());
+  ASSERT_EQ(0u, new_files[3].second.fd.GetPathId());
+  ASSERT_EQ(kInvalidBlobFileNumber,
+            new_files[0].second.oldest_blob_file_number);
+  ASSERT_EQ(kInvalidBlobFileNumber,
+            new_files[1].second.oldest_blob_file_number);
+  ASSERT_EQ(kInvalidBlobFileNumber,
+            new_files[2].second.oldest_blob_file_number);
+  ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number);
+}
+
+TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
+  static const uint64_t kBig = 1ull << 50;
+  VersionEdit edit;
+  edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+               InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+               kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+  edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+               InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+               kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
+               686, 868, "234", "crc32c", kNullUniqueId64x2);
+  edit.DeleteFile(4, 700);
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+
+  std::string encoded;
+
+  // Call back function to add extra customized builds.
+  bool first = true;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+        std::string* str = reinterpret_cast<std::string*>(arg);
+        PutVarint32(str, 33);
+        const std::string str1 = "random_string";
+        PutLengthPrefixedSlice(str, str1);
+        if (first) {
+          first = false;
+          PutVarint32(str, 22);
+          const std::string str2 = "s";
+          PutLengthPrefixedSlice(str, str2);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  edit.EncodeTo(&encoded);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_TRUE(!first);
+  auto& new_files = parsed.GetNewFiles();
+  ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+  ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+  ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+  ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+  ASSERT_EQ(1u, parsed.GetDeletedFiles().size());
+}
+
+TEST_F(VersionEditTest, NewFile4NotSupportedField) {
+  static const uint64_t kBig = 1ull << 50;
+  VersionEdit edit;
+  edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+               InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+               kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+
+  std::string encoded;
+
+  // Call back function to add extra customized builds.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+        std::string* str = reinterpret_cast<std::string*>(arg);
+        const std::string str1 = "s";
+        PutLengthPrefixedSlice(str, str1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  edit.EncodeTo(&encoded);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_NOK(s);
+}
+
+TEST_F(VersionEditTest, EncodeEmptyFile) {
+  VersionEdit edit;
+  edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false,
+               Temperature::kUnknown, kInvalidBlobFileNumber,
+               kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+               kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+               kNullUniqueId64x2);
+  std::string buffer;
+  ASSERT_TRUE(!edit.EncodeTo(&buffer));
+}
+
+TEST_F(VersionEditTest, ColumnFamilyTest) {
+  VersionEdit edit;
+  edit.SetColumnFamily(2);
+  edit.AddColumnFamily("column_family");
+  edit.SetMaxColumnFamily(5);
+  TestEncodeDecode(edit);
+
+  edit.Clear();
+  edit.SetColumnFamily(3);
+  edit.DropColumnFamily();
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, MinLogNumberToKeep) {
+  VersionEdit edit;
+  edit.SetMinLogNumberToKeep(13);
+  TestEncodeDecode(edit);
+
+  edit.Clear();
+  edit.SetMinLogNumberToKeep(23);
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, AtomicGroupTest) {
+  VersionEdit edit;
+  edit.MarkAtomicGroup(1);
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, IgnorableField) {
+  VersionEdit ve;
+  std::string encoded;
+
+  // Size of ignorable field is too large
+  PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+  // This is a customized ignorable tag
+  PutVarint32Varint64(&encoded,
+                      0x2710 /* A field with kTagSafeIgnoreMask set */,
+                      5 /* fieldlength 5 */);
+  encoded += "abc";  // Only fills 3 bytes,
+  ASSERT_NOK(ve.DecodeFrom(encoded));
+
+  encoded.clear();
+  // Error when seeing unidentified tag that is not ignorable
+  PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+  // This is a customized ignorable tag
+  PutVarint32Varint64(&encoded, 666 /* A field with kTagSafeIgnoreMask unset */,
+                      3 /* fieldlength 3 */);
+  encoded += "abc";  //  Fill 3 bytes
+  PutVarint32Varint64(&encoded, 3 /* next file number */, 88);
+  ASSERT_NOK(ve.DecodeFrom(encoded));
+
+  // Safely ignore an identified but safely ignorable entry
+  encoded.clear();
+  PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+  // This is a customized ignorable tag
+  PutVarint32Varint64(&encoded,
+                      0x2710 /* A field with kTagSafeIgnoreMask set */,
+                      3 /* fieldlength 3 */);
+  encoded += "abc";  //  Fill 3 bytes
+  PutVarint32Varint64(&encoded, 3 /* kNextFileNumber */, 88);
+
+  ASSERT_OK(ve.DecodeFrom(encoded));
+
+  ASSERT_TRUE(ve.HasLogNumber());
+  ASSERT_TRUE(ve.HasNextFile());
+  ASSERT_EQ(66, ve.GetLogNumber());
+  ASSERT_EQ(88, ve.GetNextFile());
+}
+
+TEST_F(VersionEditTest, DbId) {
+  VersionEdit edit;
+  edit.SetDBId("ab34-cd12-435f-er00");
+  TestEncodeDecode(edit);
+
+  edit.Clear();
+  edit.SetDBId("34ba-cd12-435f-er01");
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, BlobFileAdditionAndGarbage) {
+  VersionEdit edit;
+
+  const std::string checksum_method_prefix = "Hash";
+  const std::string checksum_value_prefix = "Value";
+
+  for (uint64_t blob_file_number = 1; blob_file_number <= 10;
+       ++blob_file_number) {
+    const uint64_t total_blob_count = blob_file_number << 10;
+    const uint64_t total_blob_bytes = blob_file_number << 20;
+
+    std::string checksum_method(checksum_method_prefix);
+    AppendNumberTo(&checksum_method, blob_file_number);
+
+    std::string checksum_value(checksum_value_prefix);
+    AppendNumberTo(&checksum_value, blob_file_number);
+
+    edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+                     checksum_method, checksum_value);
+
+    const uint64_t garbage_blob_count = total_blob_count >> 2;
+    const uint64_t garbage_blob_bytes = total_blob_bytes >> 1;
+
+    edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+                            garbage_blob_bytes);
+  }
+
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, AddWalEncodeDecode) {
+  VersionEdit edit;
+  for (uint64_t log_number = 1; log_number <= 20; log_number++) {
+    WalMetadata meta;
+    bool has_size = rand() % 2 == 0;
+    if (has_size) {
+      meta.SetSyncedSizeInBytes(rand() % 1000);
+    }
+    edit.AddWal(log_number, meta);
+  }
+  TestEncodeDecode(edit);
+}
+
+static std::string PrefixEncodedWalAdditionWithLength(
+    const std::string& encoded) {
+  std::string ret;
+  PutVarint32(&ret, Tag::kWalAddition2);
+  PutLengthPrefixedSlice(&ret, encoded);
+  return ret;
+}
+
+TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) {
+  std::string encoded;
+
+  {
+    // No log number.
+    std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") !=
+                std::string::npos)
+        << s.ToString();
+  }
+
+  {
+    // log number should be varint64,
+    // but we only encode 128 which is not a valid representation of varint64.
+    char c = 0;
+    unsigned char* ptr = reinterpret_cast<unsigned char*>(&c);
+    *ptr = 128;
+    encoded.append(1, c);
+
+    std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") !=
+                std::string::npos)
+        << s.ToString();
+  }
+}
+
+TEST_F(VersionEditTest, AddWalDecodeBadTag) {
+  constexpr WalNumber kLogNumber = 100;
+  constexpr uint64_t kSizeInBytes = 100;
+
+  std::string encoded;
+  PutVarint64(&encoded, kLogNumber);
+
+  {
+    // No tag.
+    std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+        << s.ToString();
+  }
+
+  {
+    // Only has size tag, no terminate tag.
+    std::string encoded_with_size = encoded;
+    PutVarint32(&encoded_with_size,
+                static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+    PutVarint64(&encoded_with_size, kSizeInBytes);
+
+    std::string encoded_edit =
+        PrefixEncodedWalAdditionWithLength(encoded_with_size);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+        << s.ToString();
+  }
+
+  {
+    // Only has terminate tag.
+    std::string encoded_with_terminate = encoded;
+    PutVarint32(&encoded_with_terminate,
+                static_cast<uint32_t>(WalAdditionTag::kTerminate));
+
+    std::string encoded_edit =
+        PrefixEncodedWalAdditionWithLength(encoded_with_terminate);
+    VersionEdit edit;
+    ASSERT_OK(edit.DecodeFrom(encoded_edit));
+    auto& wal_addition = edit.GetWalAdditions()[0];
+    ASSERT_EQ(wal_addition.GetLogNumber(), kLogNumber);
+    ASSERT_FALSE(wal_addition.GetMetadata().HasSyncedSize());
+  }
+}
+
+TEST_F(VersionEditTest, AddWalDecodeNoSize) {
+  constexpr WalNumber kLogNumber = 100;
+
+  std::string encoded;
+  PutVarint64(&encoded, kLogNumber);
+  PutVarint32(&encoded, static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+  // No real size after the size tag.
+
+  {
+    // Without terminate tag.
+    std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("Error decoding WAL file size") !=
+                std::string::npos)
+        << s.ToString();
+  }
+
+  {
+    // With terminate tag.
+    PutVarint32(&encoded, static_cast<uint32_t>(WalAdditionTag::kTerminate));
+
+    std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+    VersionEdit edit;
+    Status s = edit.DecodeFrom(encoded_edit);
+    ASSERT_TRUE(s.IsCorruption());
+    // The terminate tag is misunderstood as the size.
+    ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+        << s.ToString();
+  }
+}
+
+TEST_F(VersionEditTest, AddWalDebug) {
+  constexpr int n = 2;
+  constexpr std::array<uint64_t, n> kLogNumbers{{10, 20}};
+  constexpr std::array<uint64_t, n> kSizeInBytes{{100, 200}};
+
+  VersionEdit edit;
+  for (int i = 0; i < n; i++) {
+    edit.AddWal(kLogNumbers[i], WalMetadata(kSizeInBytes[i]));
+  }
+
+  const WalAdditions& wals = edit.GetWalAdditions();
+
+  ASSERT_TRUE(edit.IsWalAddition());
+  ASSERT_EQ(wals.size(), n);
+  for (int i = 0; i < n; i++) {
+    const WalAddition& wal = wals[i];
+    ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[i]);
+    ASSERT_EQ(wal.GetMetadata().GetSyncedSizeInBytes(), kSizeInBytes[i]);
+  }
+
+  std::string expected_str = "VersionEdit {\n";
+  for (int i = 0; i < n; i++) {
+    std::stringstream ss;
+    ss << "  WalAddition: log_number: " << kLogNumbers[i]
+       << " synced_size_in_bytes: " << kSizeInBytes[i] << "\n";
+    expected_str += ss.str();
+  }
+  expected_str += "  ColumnFamily: 0\n}\n";
+  ASSERT_EQ(edit.DebugString(true), expected_str);
+
+  std::string expected_json = "{\"EditNumber\": 4, \"WalAdditions\": [";
+  for (int i = 0; i < n; i++) {
+    std::stringstream ss;
+    ss << "{\"LogNumber\": " << kLogNumbers[i] << ", "
+       << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << "}";
+    if (i < n - 1) ss << ", ";
+    expected_json += ss.str();
+  }
+  expected_json += "], \"ColumnFamily\": 0}";
+  ASSERT_EQ(edit.DebugJSON(4, true), expected_json);
+}
+
+TEST_F(VersionEditTest, DeleteWalEncodeDecode) {
+  VersionEdit edit;
+  edit.DeleteWalsBefore(rand() % 100);
+  TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, DeleteWalDebug) {
+  constexpr int n = 2;
+  constexpr std::array<uint64_t, n> kLogNumbers{{10, 20}};
+
+  VersionEdit edit;
+  edit.DeleteWalsBefore(kLogNumbers[n - 1]);
+
+  const WalDeletion& wal = edit.GetWalDeletion();
+
+  ASSERT_TRUE(edit.IsWalDeletion());
+  ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[n - 1]);
+
+  std::string expected_str = "VersionEdit {\n";
+  {
+    std::stringstream ss;
+    ss << "  WalDeletion: log_number: " << kLogNumbers[n - 1] << "\n";
+    expected_str += ss.str();
+  }
+  expected_str += "  ColumnFamily: 0\n}\n";
+  ASSERT_EQ(edit.DebugString(true), expected_str);
+
+  std::string expected_json = "{\"EditNumber\": 4, \"WalDeletion\": ";
+  {
+    std::stringstream ss;
+    ss << "{\"LogNumber\": " << kLogNumbers[n - 1] << "}";
+    expected_json += ss.str();
+  }
+  expected_json += ", \"ColumnFamily\": 0}";
+  ASSERT_EQ(edit.DebugJSON(4, true), expected_json);
+}
+
+TEST_F(VersionEditTest, FullHistoryTsLow) {
+  VersionEdit edit;
+  ASSERT_FALSE(edit.HasFullHistoryTsLow());
+  std::string ts = test::EncodeInt(0);
+  edit.SetFullHistoryTsLow(ts);
+  TestEncodeDecode(edit);
+}
+
+// Tests that if RocksDB is downgraded, the new types of VersionEdits
+// that have a tag larger than kTagSafeIgnoreMask can be safely ignored.
+TEST_F(VersionEditTest, IgnorableTags) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionEdit::EncodeTo:IgnoreIgnorableTags", [&](void* arg) {
+        bool* ignore = static_cast<bool*>(arg);
+        *ignore = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr uint64_t kPrevLogNumber = 100;
+  constexpr uint64_t kLogNumber = 200;
+  constexpr uint64_t kNextFileNumber = 300;
+  constexpr uint64_t kColumnFamilyId = 400;
+
+  VersionEdit edit;
+  // Add some ignorable entries.
+  for (int i = 0; i < 2; i++) {
+    edit.AddWal(i + 1, WalMetadata(i + 2));
+  }
+  edit.SetDBId("db_id");
+  // Add unignorable entries.
+  edit.SetPrevLogNumber(kPrevLogNumber);
+  edit.SetLogNumber(kLogNumber);
+  // Add more ignorable entries.
+  edit.DeleteWalsBefore(100);
+  // Add unignorable entry.
+  edit.SetNextFile(kNextFileNumber);
+  // Add more ignorable entries.
+  edit.SetFullHistoryTsLow("ts");
+  // Add unignorable entry.
+  edit.SetColumnFamily(kColumnFamilyId);
+
+  std::string encoded;
+  ASSERT_TRUE(edit.EncodeTo(&encoded));
+
+  VersionEdit decoded;
+  ASSERT_OK(decoded.DecodeFrom(encoded));
+
+  // Check that all ignorable entries are ignored.
+  ASSERT_FALSE(decoded.HasDbId());
+  ASSERT_FALSE(decoded.HasFullHistoryTsLow());
+  ASSERT_FALSE(decoded.IsWalAddition());
+  ASSERT_FALSE(decoded.IsWalDeletion());
+  ASSERT_TRUE(decoded.GetWalAdditions().empty());
+  ASSERT_TRUE(decoded.GetWalDeletion().IsEmpty());
+
+  // Check that unignorable entries are still present.
+  ASSERT_EQ(edit.GetPrevLogNumber(), kPrevLogNumber);
+  ASSERT_EQ(edit.GetLogNumber(), kLogNumber);
+  ASSERT_EQ(edit.GetNextFile(), kNextFileNumber);
+  ASSERT_EQ(edit.GetColumnFamily(), kColumnFamilyId);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST(FileMetaDataTest, UpdateBoundariesBlobIndex) {
+  FileMetaData meta;
+
+  {
+    constexpr uint64_t file_number = 10;
+    constexpr uint32_t path_id = 0;
+    constexpr uint64_t file_size = 0;
+
+    meta.fd = FileDescriptor(file_number, path_id, file_size);
+  }
+
+  constexpr char key[] = "foo";
+
+  constexpr uint64_t expected_oldest_blob_file_number = 20;
+
+  // Plain old value (does not affect oldest_blob_file_number)
+  {
+    constexpr char value[] = "value";
+    constexpr SequenceNumber seq = 200;
+
+    ASSERT_OK(meta.UpdateBoundaries(key, value, seq, kTypeValue));
+    ASSERT_EQ(meta.oldest_blob_file_number, kInvalidBlobFileNumber);
+  }
+
+  // Non-inlined, non-TTL blob index (sets oldest_blob_file_number)
+  {
+    constexpr uint64_t blob_file_number = 25;
+    static_assert(blob_file_number > expected_oldest_blob_file_number,
+                  "unexpected");
+
+    constexpr uint64_t offset = 1000;
+    constexpr uint64_t size = 100;
+
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+                          kNoCompression);
+
+    constexpr SequenceNumber seq = 201;
+
+    ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+    ASSERT_EQ(meta.oldest_blob_file_number, blob_file_number);
+  }
+
+  // Another one, with the oldest blob file number (updates
+  // oldest_blob_file_number)
+  {
+    constexpr uint64_t offset = 2000;
+    constexpr uint64_t size = 300;
+
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, expected_oldest_blob_file_number, offset,
+                          size, kNoCompression);
+
+    constexpr SequenceNumber seq = 202;
+
+    ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+    ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+  }
+
+  // Inlined TTL blob index (does not affect oldest_blob_file_number)
+  {
+    constexpr uint64_t expiration = 9876543210;
+    constexpr char value[] = "value";
+
+    std::string blob_index;
+    BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+
+    constexpr SequenceNumber seq = 203;
+
+    ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+    ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+  }
+
+  // Non-inlined TTL blob index (does not affect oldest_blob_file_number, even
+  // though file number is smaller)
+  {
+    constexpr uint64_t expiration = 9876543210;
+    constexpr uint64_t blob_file_number = 15;
+    static_assert(blob_file_number < expected_oldest_blob_file_number,
+                  "unexpected");
+
+    constexpr uint64_t offset = 2000;
+    constexpr uint64_t size = 500;
+
+    std::string blob_index;
+    BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+                             size, kNoCompression);
+
+    constexpr SequenceNumber seq = 204;
+
+    ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+    ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+  }
+
+  // Corrupt blob index
+  {
+    constexpr char corrupt_blob_index[] = "!corrupt!";
+    constexpr SequenceNumber seq = 205;
+
+    ASSERT_TRUE(
+        meta.UpdateBoundaries(key, corrupt_blob_index, seq, kTypeBlobIndex)
+            .IsCorruption());
+    ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+  }
+
+  // Invalid blob file number
+  {
+    constexpr uint64_t offset = 10000;
+    constexpr uint64_t size = 1000;
+
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, kInvalidBlobFileNumber, offset, size,
+                          kNoCompression);
+
+    constexpr SequenceNumber seq = 206;
+
+    ASSERT_TRUE(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex)
+                    .IsCorruption());
+    ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_set.cc b/src/rocksdb/db/version_set.cc
new file mode 100644
index 000000000..427af6e25
--- /dev/null
+++ b/src/rocksdb/db/version_set.cc
@@ -0,0 +1,6903 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <cstdio>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_source.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/file_pri.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit_handler.h"
+#if USE_COROUTINES
+#include "folly/experimental/coro/BlockingWait.h"
+#include "folly/experimental/coro/Collect.h"
+#endif
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/file_read_sample.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/merging_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/multiget_context.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/coro_utils.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+// Generate the regular and coroutine versions of some methods by
+// including version_set_sync_and_async.h twice
+// Macros in the header will expand differently based on whether
+// WITH_COROUTINES or WITHOUT_COROUTINES is defined
+// clang-format off
+#define WITHOUT_COROUTINES
+#include "db/version_set_sync_and_async.h"
+#undef WITHOUT_COROUTINES
+#define WITH_COROUTINES
+#include "db/version_set_sync_and_async.h"
+#undef WITH_COROUTINES
+// clang-format on
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Find File in LevelFilesBrief data structure
+// Within an index range defined by left and right
+int FindFileInRange(const InternalKeyComparator& icmp,
+                    const LevelFilesBrief& file_level, const Slice& key,
+                    uint32_t left, uint32_t right) {
+  auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+    return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0;
+  };
+  const auto& b = file_level.files;
+  return static_cast<int>(std::lower_bound(b + left, b + right, key, cmp) - b);
+}
+
+Status OverlapWithIterator(const Comparator* ucmp,
+                           const Slice& smallest_user_key,
+                           const Slice& largest_user_key,
+                           InternalIterator* iter, bool* overlap) {
+  InternalKey range_start(smallest_user_key, kMaxSequenceNumber,
+                          kValueTypeForSeek);
+  iter->Seek(range_start.Encode());
+  if (!iter->status().ok()) {
+    return iter->status();
+  }
+
+  *overlap = false;
+  if (iter->Valid()) {
+    ParsedInternalKey seek_result;
+    Status s = ParseInternalKey(iter->key(), &seek_result,
+                                false /* log_err_key */);  // TODO
+    if (!s.ok()) return s;
+
+    if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <=
+        0) {
+      *overlap = true;
+    }
+  }
+
+  return iter->status();
+}
+
+// Class to help choose the next file to search for the particular key.
+// Searches and returns files level by level.
+// We can search level-by-level since entries never hop across
+// levels. Therefore we are guaranteed that if we find data
+// in a smaller level, later levels are irrelevant (unless we
+// are MergeInProgress).
+class FilePicker {
+ public:
+  FilePicker(const Slice& user_key, const Slice& ikey,
+             autovector<LevelFilesBrief>* file_levels, unsigned int num_levels,
+             FileIndexer* file_indexer, const Comparator* user_comparator,
+             const InternalKeyComparator* internal_comparator)
+      : num_levels_(num_levels),
+        curr_level_(static_cast<unsigned int>(-1)),
+        returned_file_level_(static_cast<unsigned int>(-1)),
+        hit_file_level_(static_cast<unsigned int>(-1)),
+        search_left_bound_(0),
+        search_right_bound_(FileIndexer::kLevelMaxIndex),
+        level_files_brief_(file_levels),
+        is_hit_file_last_in_level_(false),
+        curr_file_level_(nullptr),
+        user_key_(user_key),
+        ikey_(ikey),
+        file_indexer_(file_indexer),
+        user_comparator_(user_comparator),
+        internal_comparator_(internal_comparator) {
+    // Setup member variables to search first level.
+    search_ended_ = !PrepareNextLevel();
+    if (!search_ended_) {
+      // Prefetch Level 0 table data to avoid cache miss if possible.
+      for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+        auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+        if (r) {
+          r->Prepare(ikey);
+        }
+      }
+    }
+  }
+
+  int GetCurrentLevel() const { return curr_level_; }
+
+  FdWithKeyRange* GetNextFile() {
+    while (!search_ended_) {  // Loops over different levels.
+      while (curr_index_in_curr_level_ < curr_file_level_->num_files) {
+        // Loops over all files in current level.
+        FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_];
+        hit_file_level_ = curr_level_;
+        is_hit_file_last_in_level_ =
+            curr_index_in_curr_level_ == curr_file_level_->num_files - 1;
+        int cmp_largest = -1;
+
+        // Do key range filtering of files or/and fractional cascading if:
+        // (1) not all the files are in level 0, or
+        // (2) there are more than 3 current level files
+        // If there are only 3 or less current level files in the system, we
+        // skip the key range filtering. In this case, more likely, the system
+        // is highly tuned to minimize number of tables queried by each query,
+        // so it is unlikely that key range filtering is more efficient than
+        // querying the files.
+        if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+          // Check if key is within a file's range. If search left bound and
+          // right bound point to the same find, we are sure key falls in
+          // range.
+          assert(curr_level_ == 0 ||
+                 curr_index_in_curr_level_ == start_index_in_curr_level_ ||
+                 user_comparator_->CompareWithoutTimestamp(
+                     user_key_, ExtractUserKey(f->smallest_key)) <= 0);
+
+          int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+              user_key_, ExtractUserKey(f->smallest_key));
+          if (cmp_smallest >= 0) {
+            cmp_largest = user_comparator_->CompareWithoutTimestamp(
+                user_key_, ExtractUserKey(f->largest_key));
+          }
+
+          // Setup file search bound for the next level based on the
+          // comparison results
+          if (curr_level_ > 0) {
+            file_indexer_->GetNextLevelIndex(
+                curr_level_, curr_index_in_curr_level_, cmp_smallest,
+                cmp_largest, &search_left_bound_, &search_right_bound_);
+          }
+          // Key falls out of current file's range
+          if (cmp_smallest < 0 || cmp_largest > 0) {
+            if (curr_level_ == 0) {
+              ++curr_index_in_curr_level_;
+              continue;
+            } else {
+              // Search next level.
+              break;
+            }
+          }
+        }
+
+        returned_file_level_ = curr_level_;
+        if (curr_level_ > 0 && cmp_largest < 0) {
+          // No more files to search in this level.
+          search_ended_ = !PrepareNextLevel();
+        } else {
+          ++curr_index_in_curr_level_;
+        }
+        return f;
+      }
+      // Start searching next level.
+      search_ended_ = !PrepareNextLevel();
+    }
+    // Search ended.
+    return nullptr;
+  }
+
+  // getter for current file level
+  // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+  unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+  // Returns true if the most recent "hit file" (i.e., one returned by
+  // GetNextFile()) is at the last index in its level.
+  bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+ private:
+  unsigned int num_levels_;
+  unsigned int curr_level_;
+  unsigned int returned_file_level_;
+  unsigned int hit_file_level_;
+  int32_t search_left_bound_;
+  int32_t search_right_bound_;
+  autovector<LevelFilesBrief>* level_files_brief_;
+  bool search_ended_;
+  bool is_hit_file_last_in_level_;
+  LevelFilesBrief* curr_file_level_;
+  unsigned int curr_index_in_curr_level_;
+  unsigned int start_index_in_curr_level_;
+  Slice user_key_;
+  Slice ikey_;
+  FileIndexer* file_indexer_;
+  const Comparator* user_comparator_;
+  const InternalKeyComparator* internal_comparator_;
+
+  // Setup local variables to search next level.
+  // Returns false if there are no more levels to search.
+  bool PrepareNextLevel() {
+    curr_level_++;
+    while (curr_level_ < num_levels_) {
+      curr_file_level_ = &(*level_files_brief_)[curr_level_];
+      if (curr_file_level_->num_files == 0) {
+        // When current level is empty, the search bound generated from upper
+        // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+        // also empty.
+        assert(search_left_bound_ == 0);
+        assert(search_right_bound_ == -1 ||
+               search_right_bound_ == FileIndexer::kLevelMaxIndex);
+        // Since current level is empty, it will need to search all files in
+        // the next level
+        search_left_bound_ = 0;
+        search_right_bound_ = FileIndexer::kLevelMaxIndex;
+        curr_level_++;
+        continue;
+      }
+
+      // Some files may overlap each other. We find
+      // all files that overlap user_key and process them in order from
+      // newest to oldest. In the context of merge-operator, this can occur at
+      // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+      // are always compacted into a single entry).
+      int32_t start_index;
+      if (curr_level_ == 0) {
+        // On Level-0, we read through all files to check for overlap.
+        start_index = 0;
+      } else {
+        // On Level-n (n>=1), files are sorted. Binary search to find the
+        // earliest file whose largest key >= ikey. Search left bound and
+        // right bound are used to narrow the range.
+        if (search_left_bound_ <= search_right_bound_) {
+          if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
+            search_right_bound_ =
+                static_cast<int32_t>(curr_file_level_->num_files) - 1;
+          }
+          // `search_right_bound_` is an inclusive upper-bound, but since it was
+          // determined based on user key, it is still possible the lookup key
+          // falls to the right of `search_right_bound_`'s corresponding file.
+          // So, pass a limit one higher, which allows us to detect this case.
+          start_index =
+              FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
+                              static_cast<uint32_t>(search_left_bound_),
+                              static_cast<uint32_t>(search_right_bound_) + 1);
+          if (start_index == search_right_bound_ + 1) {
+            // `ikey_` comes after `search_right_bound_`. The lookup key does
+            // not exist on this level, so let's skip this level and do a full
+            // binary search on the next level.
+            search_left_bound_ = 0;
+            search_right_bound_ = FileIndexer::kLevelMaxIndex;
+            curr_level_++;
+            continue;
+          }
+        } else {
+          // search_left_bound > search_right_bound, key does not exist in
+          // this level. Since no comparison is done in this level, it will
+          // need to search all files in the next level.
+          search_left_bound_ = 0;
+          search_right_bound_ = FileIndexer::kLevelMaxIndex;
+          curr_level_++;
+          continue;
+        }
+      }
+      start_index_in_curr_level_ = start_index;
+      curr_index_in_curr_level_ = start_index;
+
+      return true;
+    }
+    // curr_level_ = num_levels_. So, no more levels to search.
+    return false;
+  }
+};
+}  // anonymous namespace
+
+class FilePickerMultiGet {
+ private:
+  struct FilePickerContext;
+
+ public:
+  FilePickerMultiGet(MultiGetRange* range,
+                     autovector<LevelFilesBrief>* file_levels,
+                     unsigned int num_levels, FileIndexer* file_indexer,
+                     const Comparator* user_comparator,
+                     const InternalKeyComparator* internal_comparator)
+      : num_levels_(num_levels),
+        curr_level_(static_cast<unsigned int>(-1)),
+        returned_file_level_(static_cast<unsigned int>(-1)),
+        hit_file_level_(static_cast<unsigned int>(-1)),
+        range_(*range, range->begin(), range->end()),
+        maybe_repeat_key_(false),
+        current_level_range_(*range, range->begin(), range->end()),
+        current_file_range_(*range, range->begin(), range->end()),
+        batch_iter_(range->begin()),
+        batch_iter_prev_(range->begin()),
+        upper_key_(range->begin()),
+        level_files_brief_(file_levels),
+        is_hit_file_last_in_level_(false),
+        curr_file_level_(nullptr),
+        file_indexer_(file_indexer),
+        user_comparator_(user_comparator),
+        internal_comparator_(internal_comparator),
+        hit_file_(nullptr) {
+    for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
+      fp_ctx_array_[iter.index()] =
+          FilePickerContext(0, FileIndexer::kLevelMaxIndex);
+    }
+
+    // Setup member variables to search first level.
+    search_ended_ = !PrepareNextLevel();
+    if (!search_ended_) {
+      // REVISIT
+      // Prefetch Level 0 table data to avoid cache miss if possible.
+      // As of now, only PlainTableReader and CuckooTableReader do any
+      // prefetching. This may not be necessary anymore once we implement
+      // batching in those table readers
+      for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+        auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+        if (r) {
+          for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
+            r->Prepare(iter->ikey);
+          }
+        }
+      }
+    }
+  }
+
+  FilePickerMultiGet(MultiGetRange* range, const FilePickerMultiGet& other)
+      : num_levels_(other.num_levels_),
+        curr_level_(other.curr_level_),
+        returned_file_level_(other.returned_file_level_),
+        hit_file_level_(other.hit_file_level_),
+        fp_ctx_array_(other.fp_ctx_array_),
+        range_(*range, range->begin(), range->end()),
+        maybe_repeat_key_(false),
+        current_level_range_(*range, range->begin(), range->end()),
+        current_file_range_(*range, range->begin(), range->end()),
+        batch_iter_(range->begin()),
+        batch_iter_prev_(range->begin()),
+        upper_key_(range->begin()),
+        level_files_brief_(other.level_files_brief_),
+        is_hit_file_last_in_level_(false),
+        curr_file_level_(other.curr_file_level_),
+        file_indexer_(other.file_indexer_),
+        user_comparator_(other.user_comparator_),
+        internal_comparator_(other.internal_comparator_),
+        hit_file_(nullptr) {
+    PrepareNextLevelForSearch();
+  }
+
+  int GetCurrentLevel() const { return curr_level_; }
+
+  void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); }
+
+  FdWithKeyRange* GetNextFileInLevel() {
+    if (batch_iter_ == current_level_range_.end() || search_ended_) {
+      hit_file_ = nullptr;
+      return nullptr;
+    } else {
+      if (maybe_repeat_key_) {
+        maybe_repeat_key_ = false;
+        // Check if we found the final value for the last key in the
+        // previous lookup range. If we did, then there's no need to look
+        // any further for that key, so advance batch_iter_. Else, keep
+        // batch_iter_ positioned on that key so we look it up again in
+        // the next file
+        // For L0, always advance the key because we will look in the next
+        // file regardless for all keys not found yet
+        if (current_level_range_.CheckKeyDone(batch_iter_) ||
+            curr_level_ == 0) {
+          batch_iter_ = upper_key_;
+        }
+      }
+      // batch_iter_prev_ will become the start key for the next file
+      // lookup
+      batch_iter_prev_ = batch_iter_;
+    }
+
+    MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
+                                  current_level_range_.end());
+    size_t curr_file_index =
+        (batch_iter_ != current_level_range_.end())
+            ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+            : curr_file_level_->num_files;
+    FdWithKeyRange* f;
+    bool is_last_key_in_file;
+    if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
+                                    &is_last_key_in_file)) {
+      hit_file_ = nullptr;
+      return nullptr;
+    } else {
+      if (is_last_key_in_file) {
+        // Since cmp_largest is 0, batch_iter_ still points to the last key
+        // that falls in this file, instead of the next one. Increment
+        // the file index for all keys between batch_iter_ and upper_key_
+        auto tmp_iter = batch_iter_;
+        while (tmp_iter != upper_key_) {
+          ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
+          ++tmp_iter;
+        }
+        maybe_repeat_key_ = true;
+      }
+      // Set the range for this file
+      current_file_range_ =
+          MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
+      returned_file_level_ = curr_level_;
+      hit_file_level_ = curr_level_;
+      is_hit_file_last_in_level_ =
+          curr_file_index == curr_file_level_->num_files - 1;
+      hit_file_ = f;
+      return f;
+    }
+  }
+
+  // getter for current file level
+  // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+  unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+  FdWithKeyRange* GetHitFile() { return hit_file_; }
+
+  // Returns true if the most recent "hit file" (i.e., one returned by
+  // GetNextFile()) is at the last index in its level.
+  bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+  bool KeyMaySpanNextFile() { return maybe_repeat_key_; }
+
+  bool IsSearchEnded() { return search_ended_; }
+
+  const MultiGetRange& CurrentFileRange() { return current_file_range_; }
+
+  bool RemainingOverlapInLevel() {
+    return !current_level_range_.Suffix(current_file_range_).empty();
+  }
+
+  MultiGetRange& GetRange() { return range_; }
+
+  void ReplaceRange(const MultiGetRange& other) {
+    assert(hit_file_ == nullptr);
+    range_ = other;
+    current_level_range_ = other;
+  }
+
+  FilePickerMultiGet(FilePickerMultiGet&& other)
+      : num_levels_(other.num_levels_),
+        curr_level_(other.curr_level_),
+        returned_file_level_(other.returned_file_level_),
+        hit_file_level_(other.hit_file_level_),
+        fp_ctx_array_(std::move(other.fp_ctx_array_)),
+        range_(std::move(other.range_)),
+        maybe_repeat_key_(other.maybe_repeat_key_),
+        current_level_range_(std::move(other.current_level_range_)),
+        current_file_range_(std::move(other.current_file_range_)),
+        batch_iter_(other.batch_iter_, &current_level_range_),
+        batch_iter_prev_(other.batch_iter_prev_, &current_level_range_),
+        upper_key_(other.upper_key_, &current_level_range_),
+        level_files_brief_(other.level_files_brief_),
+        search_ended_(other.search_ended_),
+        is_hit_file_last_in_level_(other.is_hit_file_last_in_level_),
+        curr_file_level_(other.curr_file_level_),
+        file_indexer_(other.file_indexer_),
+        user_comparator_(other.user_comparator_),
+        internal_comparator_(other.internal_comparator_),
+        hit_file_(other.hit_file_) {}
+
+ private:
+  unsigned int num_levels_;
+  unsigned int curr_level_;
+  unsigned int returned_file_level_;
+  unsigned int hit_file_level_;
+
+  struct FilePickerContext {
+    int32_t search_left_bound;
+    int32_t search_right_bound;
+    unsigned int curr_index_in_curr_level;
+    unsigned int start_index_in_curr_level;
+
+    FilePickerContext(int32_t left, int32_t right)
+        : search_left_bound(left),
+          search_right_bound(right),
+          curr_index_in_curr_level(0),
+          start_index_in_curr_level(0) {}
+
+    FilePickerContext() = default;
+  };
+  std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
+  MultiGetRange range_;
+  bool maybe_repeat_key_;
+  MultiGetRange current_level_range_;
+  MultiGetRange current_file_range_;
+  // Iterator to iterate through the keys in a MultiGet batch, that gets reset
+  // at the beginning of each level. Each call to GetNextFile() will position
+  // batch_iter_ at or right after the last key that was found in the returned
+  // SST file
+  MultiGetRange::Iterator batch_iter_;
+  // An iterator that records the previous position of batch_iter_, i.e last
+  // key found in the previous SST file, in order to serve as the start of
+  // the batch key range for the next SST file
+  MultiGetRange::Iterator batch_iter_prev_;
+  MultiGetRange::Iterator upper_key_;
+  autovector<LevelFilesBrief>* level_files_brief_;
+  bool search_ended_;
+  bool is_hit_file_last_in_level_;
+  LevelFilesBrief* curr_file_level_;
+  FileIndexer* file_indexer_;
+  const Comparator* user_comparator_;
+  const InternalKeyComparator* internal_comparator_;
+  FdWithKeyRange* hit_file_;
+
+  // Iterates through files in the current level until it finds a file that
+  // contains at least one key from the MultiGet batch
+  bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
+                                  size_t* file_index, FdWithKeyRange** fd,
+                                  bool* is_last_key_in_file) {
+    size_t curr_file_index = *file_index;
+    FdWithKeyRange* f = nullptr;
+    bool file_hit = false;
+    int cmp_largest = -1;
+    if (curr_file_index >= curr_file_level_->num_files) {
+      // In the unlikely case the next key is a duplicate of the current key,
+      // and the current key is the last in the level and the internal key
+      // was not found, we need to skip lookup for the remaining keys and
+      // reset the search bounds
+      if (batch_iter_ != current_level_range_.end()) {
+        ++batch_iter_;
+        for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) {
+          struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+          fp_ctx.search_left_bound = 0;
+          fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+        }
+      }
+      return false;
+    }
+    // Loops over keys in the MultiGet batch until it finds a file with
+    // atleast one of the keys. Then it keeps moving forward until the
+    // last key in the batch that falls in that file
+    while (batch_iter_ != current_level_range_.end() &&
+           (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level ==
+                curr_file_index ||
+            !file_hit)) {
+      struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+      f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level];
+      Slice& user_key = batch_iter_->ukey_without_ts;
+
+      // Do key range filtering of files or/and fractional cascading if:
+      // (1) not all the files are in level 0, or
+      // (2) there are more than 3 current level files
+      // If there are only 3 or less current level files in the system, we
+      // skip the key range filtering. In this case, more likely, the system
+      // is highly tuned to minimize number of tables queried by each query,
+      // so it is unlikely that key range filtering is more efficient than
+      // querying the files.
+      if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+        // Check if key is within a file's range. If search left bound and
+        // right bound point to the same find, we are sure key falls in
+        // range.
+        int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+            user_key, false, ExtractUserKey(f->smallest_key), true);
+
+        assert(curr_level_ == 0 ||
+               fp_ctx.curr_index_in_curr_level ==
+                   fp_ctx.start_index_in_curr_level ||
+               cmp_smallest <= 0);
+
+        if (cmp_smallest >= 0) {
+          cmp_largest = user_comparator_->CompareWithoutTimestamp(
+              user_key, false, ExtractUserKey(f->largest_key), true);
+        } else {
+          cmp_largest = -1;
+        }
+
+        // Setup file search bound for the next level based on the
+        // comparison results
+        if (curr_level_ > 0) {
+          file_indexer_->GetNextLevelIndex(
+              curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest,
+              cmp_largest, &fp_ctx.search_left_bound,
+              &fp_ctx.search_right_bound);
+        }
+        // Key falls out of current file's range
+        if (cmp_smallest < 0 || cmp_largest > 0) {
+          next_file_range->SkipKey(batch_iter_);
+        } else {
+          file_hit = true;
+        }
+      } else {
+        file_hit = true;
+      }
+      if (cmp_largest == 0) {
+        // cmp_largest is 0, which means the next key will not be in this
+        // file, so stop looking further. However, its possible there are
+        // duplicates in the batch, so find the upper bound for the batch
+        // in this file (upper_key_) by skipping past the duplicates. We
+        // leave batch_iter_ as is since we may have to pick up from there
+        // for the next file, if this file has a merge value rather than
+        // final value
+        upper_key_ = batch_iter_;
+        ++upper_key_;
+        while (upper_key_ != current_level_range_.end() &&
+               user_comparator_->CompareWithoutTimestamp(
+                   batch_iter_->ukey_without_ts, false,
+                   upper_key_->ukey_without_ts, false) == 0) {
+          ++upper_key_;
+        }
+        break;
+      } else {
+        if (curr_level_ == 0) {
+          // We need to look through all files in level 0
+          ++fp_ctx.curr_index_in_curr_level;
+        }
+        ++batch_iter_;
+      }
+      if (!file_hit) {
+        curr_file_index =
+            (batch_iter_ != current_level_range_.end())
+                ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+                : curr_file_level_->num_files;
+      }
+    }
+
+    *fd = f;
+    *file_index = curr_file_index;
+    *is_last_key_in_file = cmp_largest == 0;
+    if (!*is_last_key_in_file) {
+      // If the largest key in the batch overlapping the file is not the
+      // largest key in the file, upper_ley_ would not have been updated so
+      // update it here
+      upper_key_ = batch_iter_;
+    }
+    return file_hit;
+  }
+
+  // Setup local variables to search next level.
+  // Returns false if there are no more levels to search.
+  bool PrepareNextLevel() {
+    if (curr_level_ == 0) {
+      MultiGetRange::Iterator mget_iter = current_level_range_.begin();
+      if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level <
+          curr_file_level_->num_files) {
+        batch_iter_prev_ = current_level_range_.begin();
+        upper_key_ = batch_iter_ = current_level_range_.begin();
+        return true;
+      }
+    }
+
+    curr_level_++;
+    // Reset key range to saved value
+    while (curr_level_ < num_levels_) {
+      bool level_contains_keys = false;
+      curr_file_level_ = &(*level_files_brief_)[curr_level_];
+      if (curr_file_level_->num_files == 0) {
+        // When current level is empty, the search bound generated from upper
+        // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+        // also empty.
+
+        for (auto mget_iter = current_level_range_.begin();
+             mget_iter != current_level_range_.end(); ++mget_iter) {
+          struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+
+          assert(fp_ctx.search_left_bound == 0);
+          assert(fp_ctx.search_right_bound == -1 ||
+                 fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex);
+          // Since current level is empty, it will need to search all files in
+          // the next level
+          fp_ctx.search_left_bound = 0;
+          fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+        }
+        // Skip all subsequent empty levels
+        do {
+          ++curr_level_;
+        } while ((curr_level_ < num_levels_) &&
+                 (*level_files_brief_)[curr_level_].num_files == 0);
+        continue;
+      }
+
+      // Some files may overlap each other. We find
+      // all files that overlap user_key and process them in order from
+      // newest to oldest. In the context of merge-operator, this can occur at
+      // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+      // are always compacted into a single entry).
+      int32_t start_index = -1;
+      current_level_range_ =
+          MultiGetRange(range_, range_.begin(), range_.end());
+      for (auto mget_iter = current_level_range_.begin();
+           mget_iter != current_level_range_.end(); ++mget_iter) {
+        struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+        if (curr_level_ == 0) {
+          // On Level-0, we read through all files to check for overlap.
+          start_index = 0;
+          level_contains_keys = true;
+        } else {
+          // On Level-n (n>=1), files are sorted. Binary search to find the
+          // earliest file whose largest key >= ikey. Search left bound and
+          // right bound are used to narrow the range.
+          if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) {
+            if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) {
+              fp_ctx.search_right_bound =
+                  static_cast<int32_t>(curr_file_level_->num_files) - 1;
+            }
+            // `search_right_bound_` is an inclusive upper-bound, but since it
+            // was determined based on user key, it is still possible the lookup
+            // key falls to the right of `search_right_bound_`'s corresponding
+            // file. So, pass a limit one higher, which allows us to detect this
+            // case.
+            Slice& ikey = mget_iter->ikey;
+            start_index = FindFileInRange(
+                *internal_comparator_, *curr_file_level_, ikey,
+                static_cast<uint32_t>(fp_ctx.search_left_bound),
+                static_cast<uint32_t>(fp_ctx.search_right_bound) + 1);
+            if (start_index == fp_ctx.search_right_bound + 1) {
+              // `ikey_` comes after `search_right_bound_`. The lookup key does
+              // not exist on this level, so let's skip this level and do a full
+              // binary search on the next level.
+              fp_ctx.search_left_bound = 0;
+              fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+              current_level_range_.SkipKey(mget_iter);
+              continue;
+            } else {
+              level_contains_keys = true;
+            }
+          } else {
+            // search_left_bound > search_right_bound, key does not exist in
+            // this level. Since no comparison is done in this level, it will
+            // need to search all files in the next level.
+            fp_ctx.search_left_bound = 0;
+            fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+            current_level_range_.SkipKey(mget_iter);
+            continue;
+          }
+        }
+        fp_ctx.start_index_in_curr_level = start_index;
+        fp_ctx.curr_index_in_curr_level = start_index;
+      }
+      if (level_contains_keys) {
+        batch_iter_prev_ = current_level_range_.begin();
+        upper_key_ = batch_iter_ = current_level_range_.begin();
+        return true;
+      }
+      curr_level_++;
+    }
+    // curr_level_ = num_levels_. So, no more levels to search.
+    return false;
+  }
+};
+
+VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
+
+Version::~Version() {
+  assert(refs_ == 0);
+
+  // Remove from linked list
+  prev_->next_ = next_;
+  next_->prev_ = prev_;
+
+  // Drop references to files
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (size_t i = 0; i < storage_info_.files_[level].size(); i++) {
+      FileMetaData* f = storage_info_.files_[level][i];
+      assert(f->refs > 0);
+      f->refs--;
+      if (f->refs <= 0) {
+        assert(cfd_ != nullptr);
+        uint32_t path_id = f->fd.GetPathId();
+        assert(path_id < cfd_->ioptions()->cf_paths.size());
+        vset_->obsolete_files_.push_back(
+            ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path,
+                             cfd_->GetFileMetadataCacheReservationManager()));
+      }
+    }
+  }
+}
+
+int FindFile(const InternalKeyComparator& icmp,
+             const LevelFilesBrief& file_level, const Slice& key) {
+  return FindFileInRange(icmp, file_level, key, 0,
+                         static_cast<uint32_t>(file_level.num_files));
+}
+
+void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+                               const std::vector<FileMetaData*>& files,
+                               Arena* arena) {
+  assert(file_level);
+  assert(arena);
+
+  size_t num = files.size();
+  file_level->num_files = num;
+  char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
+  file_level->files = new (mem) FdWithKeyRange[num];
+
+  for (size_t i = 0; i < num; i++) {
+    Slice smallest_key = files[i]->smallest.Encode();
+    Slice largest_key = files[i]->largest.Encode();
+
+    // Copy key slice to sequential memory
+    size_t smallest_size = smallest_key.size();
+    size_t largest_size = largest_key.size();
+    mem = arena->AllocateAligned(smallest_size + largest_size);
+    memcpy(mem, smallest_key.data(), smallest_size);
+    memcpy(mem + smallest_size, largest_key.data(), largest_size);
+
+    FdWithKeyRange& f = file_level->files[i];
+    f.fd = files[i]->fd;
+    f.file_metadata = files[i];
+    f.smallest_key = Slice(mem, smallest_size);
+    f.largest_key = Slice(mem + smallest_size, largest_size);
+  }
+}
+
+static bool AfterFile(const Comparator* ucmp, const Slice* user_key,
+                      const FdWithKeyRange* f) {
+  // nullptr user_key occurs before all keys and is therefore never after *f
+  return (user_key != nullptr &&
+          ucmp->CompareWithoutTimestamp(*user_key,
+                                        ExtractUserKey(f->largest_key)) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp, const Slice* user_key,
+                       const FdWithKeyRange* f) {
+  // nullptr user_key occurs after all keys and is therefore never before *f
+  return (user_key != nullptr &&
+          ucmp->CompareWithoutTimestamp(*user_key,
+                                        ExtractUserKey(f->smallest_key)) < 0);
+}
+
+bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+                           bool disjoint_sorted_files,
+                           const LevelFilesBrief& file_level,
+                           const Slice* smallest_user_key,
+                           const Slice* largest_user_key) {
+  const Comparator* ucmp = icmp.user_comparator();
+  if (!disjoint_sorted_files) {
+    // Need to check against all files
+    for (size_t i = 0; i < file_level.num_files; i++) {
+      const FdWithKeyRange* f = &(file_level.files[i]);
+      if (AfterFile(ucmp, smallest_user_key, f) ||
+          BeforeFile(ucmp, largest_user_key, f)) {
+        // No overlap
+      } else {
+        return true;  // Overlap
+      }
+    }
+    return false;
+  }
+
+  // Binary search over file list
+  uint32_t index = 0;
+  if (smallest_user_key != nullptr) {
+    // Find the leftmost possible internal key for smallest_user_key
+    InternalKey small;
+    small.SetMinPossibleForUserKey(*smallest_user_key);
+    index = FindFile(icmp, file_level, small.Encode());
+  }
+
+  if (index >= file_level.num_files) {
+    // beginning of range is after all files, so no overlap.
+    return false;
+  }
+
+  return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
+}
+
+namespace {
+
+class LevelIterator final : public InternalIterator {
+ public:
+  // @param read_options Must outlive this iterator.
+  LevelIterator(
+      TableCache* table_cache, const ReadOptions& read_options,
+      const FileOptions& file_options, const InternalKeyComparator& icomparator,
+      const LevelFilesBrief* flevel,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor,
+      bool should_sample, HistogramImpl* file_read_hist,
+      TableReaderCaller caller, bool skip_filters, int level,
+      RangeDelAggregator* range_del_agg,
+      const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
+          nullptr,
+      bool allow_unprepared_value = false,
+      TruncatedRangeDelIterator**** range_tombstone_iter_ptr_ = nullptr)
+      : table_cache_(table_cache),
+        read_options_(read_options),
+        file_options_(file_options),
+        icomparator_(icomparator),
+        user_comparator_(icomparator.user_comparator()),
+        flevel_(flevel),
+        prefix_extractor_(prefix_extractor),
+        file_read_hist_(file_read_hist),
+        should_sample_(should_sample),
+        caller_(caller),
+        skip_filters_(skip_filters),
+        allow_unprepared_value_(allow_unprepared_value),
+        file_index_(flevel_->num_files),
+        level_(level),
+        range_del_agg_(range_del_agg),
+        pinned_iters_mgr_(nullptr),
+        compaction_boundaries_(compaction_boundaries),
+        is_next_read_sequential_(false),
+        range_tombstone_iter_(nullptr),
+        to_return_sentinel_(false) {
+    // Empty level is not supported.
+    assert(flevel_ != nullptr && flevel_->num_files > 0);
+    if (range_tombstone_iter_ptr_) {
+      *range_tombstone_iter_ptr_ = &range_tombstone_iter_;
+    }
+  }
+
+  ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+
+  // Seek to the first file with a key >= target.
+  // If range_tombstone_iter_ is not nullptr, then we pretend that file
+  // boundaries are fake keys (sentinel keys). These keys are used to keep range
+  // tombstones alive even when all point keys in an SST file are exhausted.
+  // These sentinel keys will be skipped in merging iterator.
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice& target) override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() final override;
+  bool NextAndGetResult(IterateResult* result) override;
+  void Prev() override;
+
+  // In addition to valid and invalid state (!file_iter.Valid() and
+  // status.ok()), a third state of the iterator is when !file_iter_.Valid() and
+  // to_return_sentinel_. This means we are at the end of a file, and a sentinel
+  // key (the file boundary that we pretend as a key) is to be returned next.
+  // file_iter_.Valid() and to_return_sentinel_ should not both be true.
+  bool Valid() const override {
+    assert(!(file_iter_.Valid() && to_return_sentinel_));
+    return file_iter_.Valid() || to_return_sentinel_;
+  }
+  Slice key() const override {
+    assert(Valid());
+    if (to_return_sentinel_) {
+      // Sentinel should be returned after file_iter_ reaches the end of the
+      // file
+      assert(!file_iter_.Valid());
+      return sentinel_;
+    }
+    return file_iter_.key();
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    assert(!to_return_sentinel_);
+    return file_iter_.value();
+  }
+
+  Status status() const override {
+    return file_iter_.iter() ? file_iter_.status() : Status::OK();
+  }
+
+  bool PrepareValue() override { return file_iter_.PrepareValue(); }
+
+  inline bool MayBeOutOfLowerBound() override {
+    assert(Valid());
+    return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
+  }
+
+  inline IterBoundCheck UpperBoundCheckResult() override {
+    if (Valid()) {
+      return file_iter_.UpperBoundCheckResult();
+    } else {
+      return IterBoundCheck::kUnknown;
+    }
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+    if (file_iter_.iter()) {
+      file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
+    }
+  }
+
+  bool IsKeyPinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_.iter() && file_iter_.IsKeyPinned();
+  }
+
+  bool IsValuePinned() const override {
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           file_iter_.iter() && file_iter_.IsValuePinned();
+  }
+
+  bool IsDeleteRangeSentinelKey() const override { return to_return_sentinel_; }
+
+ private:
+  // Return true if at least one invalid file is seen and skipped.
+  bool SkipEmptyFileForward();
+  void SkipEmptyFileBackward();
+  void SetFileIterator(InternalIterator* iter);
+  void InitFileIterator(size_t new_file_index);
+
+  const Slice& file_smallest_key(size_t file_index) {
+    assert(file_index < flevel_->num_files);
+    return flevel_->files[file_index].smallest_key;
+  }
+
+  const Slice& file_largest_key(size_t file_index) {
+    assert(file_index < flevel_->num_files);
+    return flevel_->files[file_index].largest_key;
+  }
+
+  bool KeyReachedUpperBound(const Slice& internal_key) {
+    return read_options_.iterate_upper_bound != nullptr &&
+           user_comparator_.CompareWithoutTimestamp(
+               ExtractUserKey(internal_key), /*a_has_ts=*/true,
+               *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0;
+  }
+
+  void ClearRangeTombstoneIter() {
+    if (range_tombstone_iter_ && *range_tombstone_iter_) {
+      delete *range_tombstone_iter_;
+      *range_tombstone_iter_ = nullptr;
+    }
+  }
+
+  // Move file_iter_ to the file at file_index_.
+  // range_tombstone_iter_ is updated with a range tombstone iterator
+  // into the new file. Old range tombstone iterator is cleared.
+  InternalIterator* NewFileIterator() {
+    assert(file_index_ < flevel_->num_files);
+    auto file_meta = flevel_->files[file_index_];
+    if (should_sample_) {
+      sample_file_read_inc(file_meta.file_metadata);
+    }
+
+    const InternalKey* smallest_compaction_key = nullptr;
+    const InternalKey* largest_compaction_key = nullptr;
+    if (compaction_boundaries_ != nullptr) {
+      smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
+      largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
+    }
+    CheckMayBeOutOfLowerBound();
+    ClearRangeTombstoneIter();
+    return table_cache_->NewIterator(
+        read_options_, file_options_, icomparator_, *file_meta.file_metadata,
+        range_del_agg_, prefix_extractor_,
+        nullptr /* don't need reference to table */, file_read_hist_, caller_,
+        /*arena=*/nullptr, skip_filters_, level_,
+        /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key,
+        largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_);
+  }
+
+  // Check if current file being fully within iterate_lower_bound.
+  //
+  // Note MyRocks may update iterate bounds between seek. To workaround it,
+  // we need to check and update may_be_out_of_lower_bound_ accordingly.
+  void CheckMayBeOutOfLowerBound() {
+    if (read_options_.iterate_lower_bound != nullptr &&
+        file_index_ < flevel_->num_files) {
+      may_be_out_of_lower_bound_ =
+          user_comparator_.CompareWithoutTimestamp(
+              ExtractUserKey(file_smallest_key(file_index_)), /*a_has_ts=*/true,
+              *read_options_.iterate_lower_bound, /*b_has_ts=*/false) < 0;
+    }
+  }
+
+  TableCache* table_cache_;
+  const ReadOptions& read_options_;
+  const FileOptions& file_options_;
+  const InternalKeyComparator& icomparator_;
+  const UserComparatorWrapper user_comparator_;
+  const LevelFilesBrief* flevel_;
+  mutable FileDescriptor current_value_;
+  // `prefix_extractor_` may be non-null even for total order seek. Checking
+  // this variable is not the right way to identify whether prefix iterator
+  // is used.
+  const std::shared_ptr<const SliceTransform>& prefix_extractor_;
+
+  HistogramImpl* file_read_hist_;
+  bool should_sample_;
+  TableReaderCaller caller_;
+  bool skip_filters_;
+  bool allow_unprepared_value_;
+  bool may_be_out_of_lower_bound_ = true;
+  size_t file_index_;
+  int level_;
+  RangeDelAggregator* range_del_agg_;
+  IteratorWrapper file_iter_;  // May be nullptr
+  PinnedIteratorsManager* pinned_iters_mgr_;
+
+  // To be propagated to RangeDelAggregator in order to safely truncate range
+  // tombstones.
+  const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
+
+  bool is_next_read_sequential_;
+
+  // This is set when this level iterator is used under a merging iterator
+  // that processes range tombstones. range_tombstone_iter_ points to where the
+  // merging iterator stores the range tombstones iterator for this level. When
+  // this level iterator moves to a new SST file, it updates the range
+  // tombstones accordingly through this pointer. So the merging iterator always
+  // has access to the current SST file's range tombstones.
+  //
+  // The level iterator treats file boundary as fake keys (sentinel keys) to
+  // keep range tombstones alive if needed and make upper level, i.e. merging
+  // iterator, aware of file changes (when level iterator moves to a new SST
+  // file, there is some bookkeeping work that needs to be done at merging
+  // iterator end).
+  //
+  // *range_tombstone_iter_ points to range tombstones of the current SST file
+  TruncatedRangeDelIterator** range_tombstone_iter_;
+
+  // Whether next/prev key is a sentinel key.
+  bool to_return_sentinel_ = false;
+  // The sentinel key to be returned
+  Slice sentinel_;
+  // Sets flags for if we should return the sentinel key next.
+  // The condition for returning sentinel is reaching the end of current
+  // file_iter_: !Valid() && status.().ok().
+  void TrySetDeleteRangeSentinel(const Slice& boundary_key);
+  void ClearSentinel() { to_return_sentinel_ = false; }
+
+  // Set in Seek() when a prefix seek reaches end of the current file,
+  // and the next file has a different prefix. SkipEmptyFileForward()
+  // will not move to next file when this flag is set.
+  bool prefix_exhausted_ = false;
+};
+
+void LevelIterator::TrySetDeleteRangeSentinel(const Slice& boundary_key) {
+  assert(range_tombstone_iter_);
+  if (file_iter_.iter() != nullptr && !file_iter_.Valid() &&
+      file_iter_.status().ok()) {
+    to_return_sentinel_ = true;
+    sentinel_ = boundary_key;
+  }
+}
+
+void LevelIterator::Seek(const Slice& target) {
+  prefix_exhausted_ = false;
+  ClearSentinel();
+  // Check whether the seek key fall under the same file
+  bool need_to_reseek = true;
+  if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) {
+    const FdWithKeyRange& cur_file = flevel_->files[file_index_];
+    if (icomparator_.InternalKeyComparator::Compare(
+            target, cur_file.largest_key) <= 0 &&
+        icomparator_.InternalKeyComparator::Compare(
+            target, cur_file.smallest_key) >= 0) {
+      need_to_reseek = false;
+      assert(static_cast<size_t>(FindFile(icomparator_, *flevel_, target)) ==
+             file_index_);
+    }
+  }
+  if (need_to_reseek) {
+    TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile");
+    size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+    InitFileIterator(new_file_index);
+  }
+
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.Seek(target);
+    // Status::TryAgain indicates asynchronous request for retrieval of data
+    // blocks has been submitted. So it should return at this point and Seek
+    // should be called again to retrieve the requested block and execute the
+    // remaining code.
+    if (file_iter_.status() == Status::TryAgain()) {
+      return;
+    }
+    if (!file_iter_.Valid() && file_iter_.status().ok() &&
+        prefix_extractor_ != nullptr && !read_options_.total_order_seek &&
+        !read_options_.auto_prefix_mode &&
+        file_index_ < flevel_->num_files - 1) {
+      size_t ts_sz = user_comparator_.user_comparator()->timestamp_size();
+      Slice target_user_key_without_ts =
+          ExtractUserKeyAndStripTimestamp(target, ts_sz);
+      Slice next_file_first_user_key_without_ts =
+          ExtractUserKeyAndStripTimestamp(file_smallest_key(file_index_ + 1),
+                                          ts_sz);
+      if (prefix_extractor_->InDomain(target_user_key_without_ts) &&
+          (!prefix_extractor_->InDomain(next_file_first_user_key_without_ts) ||
+           user_comparator_.CompareWithoutTimestamp(
+               prefix_extractor_->Transform(target_user_key_without_ts), false,
+               prefix_extractor_->Transform(
+                   next_file_first_user_key_without_ts),
+               false) != 0)) {
+        // SkipEmptyFileForward() will not advance to next file when this flag
+        // is set for reason detailed below.
+        //
+        // The file we initially positioned to has no keys under the target
+        // prefix, and the next file's smallest key has a different prefix than
+        // target. When doing prefix iterator seek, when keys for one prefix
+        // have been exhausted, it can jump to any key that is larger. Here we
+        // are enforcing a stricter contract than that, in order to make it
+        // easier for higher layers (merging and DB iterator) to reason the
+        // correctness:
+        // 1. Within the prefix, the result should be accurate.
+        // 2. If keys for the prefix is exhausted, it is either positioned to
+        // the next key after the prefix, or make the iterator invalid.
+        // A side benefit will be that it invalidates the iterator earlier so
+        // that the upper level merging iterator can merge fewer child
+        // iterators.
+        //
+        // The flag is cleared in Seek*() calls. There is no need to clear the
+        // flag in Prev() since Prev() will not be called when the flag is set
+        // for reasons explained below. If range_tombstone_iter_ is nullptr,
+        // then there is no file boundary sentinel key. Since
+        // !file_iter_.Valid() from the if condition above, this level iterator
+        // is !Valid(), so Prev() will not be called. If range_tombstone_iter_
+        // is not nullptr, there are two cases depending on if this level
+        // iterator reaches top of the heap in merging iterator (the upper
+        // layer).
+        //  If so, merging iterator will see the sentinel key, call
+        //  NextAndGetResult() and the call to NextAndGetResult() will skip the
+        //  sentinel key and makes this level iterator invalid. If not, then it
+        //  could be because the upper layer is done before any method of this
+        //  level iterator is called or another Seek*() call is invoked. Either
+        //  way, Prev() is never called before Seek*().
+        // The flag should not be cleared at the beginning of
+        // Next/NextAndGetResult() since it is used in SkipEmptyFileForward()
+        // called in Next/NextAndGetResult().
+        prefix_exhausted_ = true;
+      }
+    }
+
+    if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+    }
+  }
+  SkipEmptyFileForward();
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekForPrev(const Slice& target) {
+  prefix_exhausted_ = false;
+  ClearSentinel();
+  size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+  // Seek beyond this level's smallest key
+  if (new_file_index == 0 &&
+      icomparator_.Compare(target, file_smallest_key(0)) < 0) {
+    SetFileIterator(nullptr);
+    ClearRangeTombstoneIter();
+    CheckMayBeOutOfLowerBound();
+    return;
+  }
+  if (new_file_index >= flevel_->num_files) {
+    new_file_index = flevel_->num_files - 1;
+  }
+
+  InitFileIterator(new_file_index);
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.SeekForPrev(target);
+    if (range_tombstone_iter_ &&
+        icomparator_.Compare(target, file_smallest_key(file_index_)) >= 0) {
+      // In SeekForPrev() case, it is possible that the target is less than
+      // file's lower boundary since largest key is used to determine file index
+      // (FindFile()). When target is less than file's lower boundary, sentinel
+      // key should not be set so that SeekForPrev() does not result in a key
+      // larger than target. This is correct in that there is no need to keep
+      // the range tombstones in this file alive as they only cover keys
+      // starting from the file's lower boundary, which is after `target`.
+      TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+    }
+    SkipEmptyFileBackward();
+  }
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToFirst() {
+  prefix_exhausted_ = false;
+  ClearSentinel();
+  InitFileIterator(0);
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.SeekToFirst();
+    if (range_tombstone_iter_) {
+      // We do this in SeekToFirst() and SeekToLast() since
+      // we could have an empty file with only range tombstones.
+      TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+    }
+  }
+  SkipEmptyFileForward();
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToLast() {
+  prefix_exhausted_ = false;
+  ClearSentinel();
+  InitFileIterator(flevel_->num_files - 1);
+  if (file_iter_.iter() != nullptr) {
+    file_iter_.SeekToLast();
+    if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+    }
+  }
+  SkipEmptyFileBackward();
+  CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::Next() {
+  assert(Valid());
+  if (to_return_sentinel_) {
+    // file_iter_ is at EOF already when to_return_sentinel_
+    ClearSentinel();
+  } else {
+    file_iter_.Next();
+    if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+    }
+  }
+  SkipEmptyFileForward();
+}
+
+bool LevelIterator::NextAndGetResult(IterateResult* result) {
+  assert(Valid());
+  // file_iter_ is at EOF already when to_return_sentinel_
+  bool is_valid = !to_return_sentinel_ && file_iter_.NextAndGetResult(result);
+  if (!is_valid) {
+    if (to_return_sentinel_) {
+      ClearSentinel();
+    } else if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+    }
+    is_next_read_sequential_ = true;
+    SkipEmptyFileForward();
+    is_next_read_sequential_ = false;
+    is_valid = Valid();
+    if (is_valid) {
+      // This could be set in TrySetDeleteRangeSentinel() or
+      // SkipEmptyFileForward() above.
+      if (to_return_sentinel_) {
+        result->key = sentinel_;
+        result->bound_check_result = IterBoundCheck::kUnknown;
+        result->value_prepared = true;
+      } else {
+        result->key = key();
+        result->bound_check_result = file_iter_.UpperBoundCheckResult();
+        // Ideally, we should return the real file_iter_.value_prepared but the
+        // information is not here. It would casue an extra PrepareValue()
+        // for the first key of a file.
+        result->value_prepared = !allow_unprepared_value_;
+      }
+    }
+  }
+  return is_valid;
+}
+
+void LevelIterator::Prev() {
+  assert(Valid());
+  if (to_return_sentinel_) {
+    ClearSentinel();
+  } else {
+    file_iter_.Prev();
+    if (range_tombstone_iter_) {
+      TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+    }
+  }
+  SkipEmptyFileBackward();
+}
+
+bool LevelIterator::SkipEmptyFileForward() {
+  bool seen_empty_file = false;
+  // Pause at sentinel key
+  while (!to_return_sentinel_ &&
+         (file_iter_.iter() == nullptr ||
+          (!file_iter_.Valid() && file_iter_.status().ok() &&
+           file_iter_.iter()->UpperBoundCheckResult() !=
+               IterBoundCheck::kOutOfBound))) {
+    seen_empty_file = true;
+    // Move to next file
+    if (file_index_ >= flevel_->num_files - 1 ||
+        KeyReachedUpperBound(file_smallest_key(file_index_ + 1)) ||
+        prefix_exhausted_) {
+      SetFileIterator(nullptr);
+      ClearRangeTombstoneIter();
+      break;
+    }
+    // may init a new *range_tombstone_iter
+    InitFileIterator(file_index_ + 1);
+    // We moved to a new SST file
+    // Seek range_tombstone_iter_ to reset its !Valid() default state.
+    // We do not need to call range_tombstone_iter_.Seek* in
+    // LevelIterator::Seek* since when the merging iterator calls
+    // LevelIterator::Seek*, it should also call Seek* into the corresponding
+    // range tombstone iterator.
+    if (file_iter_.iter() != nullptr) {
+      file_iter_.SeekToFirst();
+      if (range_tombstone_iter_) {
+        if (*range_tombstone_iter_) {
+          (*range_tombstone_iter_)->SeekToFirst();
+        }
+        TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+      }
+    }
+  }
+  return seen_empty_file;
+}
+
+void LevelIterator::SkipEmptyFileBackward() {
+  // Pause at sentinel key
+  while (!to_return_sentinel_ &&
+         (file_iter_.iter() == nullptr ||
+          (!file_iter_.Valid() && file_iter_.status().ok()))) {
+    // Move to previous file
+    if (file_index_ == 0) {
+      // Already the first file
+      SetFileIterator(nullptr);
+      ClearRangeTombstoneIter();
+      return;
+    }
+    InitFileIterator(file_index_ - 1);
+    // We moved to a new SST file
+    // Seek range_tombstone_iter_ to reset its !Valid() default state.
+    if (file_iter_.iter() != nullptr) {
+      file_iter_.SeekToLast();
+      if (range_tombstone_iter_) {
+        if (*range_tombstone_iter_) {
+          (*range_tombstone_iter_)->SeekToLast();
+        }
+        TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+        if (to_return_sentinel_) {
+          break;
+        }
+      }
+    }
+  }
+}
+
+void LevelIterator::SetFileIterator(InternalIterator* iter) {
+  if (pinned_iters_mgr_ && iter) {
+    iter->SetPinnedItersMgr(pinned_iters_mgr_);
+  }
+
+  InternalIterator* old_iter = file_iter_.Set(iter);
+
+  // Update the read pattern for PrefetchBuffer.
+  if (is_next_read_sequential_) {
+    file_iter_.UpdateReadaheadState(old_iter);
+  }
+
+  if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+    pinned_iters_mgr_->PinIterator(old_iter);
+  } else {
+    delete old_iter;
+  }
+}
+
+void LevelIterator::InitFileIterator(size_t new_file_index) {
+  if (new_file_index >= flevel_->num_files) {
+    file_index_ = new_file_index;
+    SetFileIterator(nullptr);
+    ClearRangeTombstoneIter();
+    return;
+  } else {
+    // If the file iterator shows incomplete, we try it again if users seek
+    // to the same file, as this time we may go to a different data block
+    // which is cached in block cache.
+    //
+    if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() &&
+        new_file_index == file_index_) {
+      // file_iter_ is already constructed with this iterator, so
+      // no need to change anything
+    } else {
+      file_index_ = new_file_index;
+      InternalIterator* iter = NewFileIterator();
+      SetFileIterator(iter);
+    }
+  }
+}
+}  // anonymous namespace
+
+Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+                                   const FileMetaData* file_meta,
+                                   const std::string* fname) const {
+  auto table_cache = cfd_->table_cache();
+  auto ioptions = cfd_->ioptions();
+  Status s = table_cache->GetTableProperties(
+      file_options_, cfd_->internal_comparator(), *file_meta, tp,
+      mutable_cf_options_.prefix_extractor, true /* no io */);
+  if (s.ok()) {
+    return s;
+  }
+
+  // We only ignore error type `Incomplete` since it's by design that we
+  // disallow table when it's not in table cache.
+  if (!s.IsIncomplete()) {
+    return s;
+  }
+
+  // 2. Table is not present in table cache, we'll read the table properties
+  // directly from the properties block in the file.
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::string file_name;
+  if (fname != nullptr) {
+    file_name = *fname;
+  } else {
+    file_name = TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(),
+                              file_meta->fd.GetPathId());
+  }
+  s = ioptions->fs->NewRandomAccessFile(file_name, file_options_, &file,
+                                        nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // By setting the magic number to kNullTableMagicNumber, we can bypass
+  // the magic number check in the footer.
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(
+          std::move(file), file_name, nullptr /* env */, io_tracer_,
+          nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
+          nullptr /* rate_limiter */, ioptions->listeners));
+  std::unique_ptr<TableProperties> props;
+  s = ReadTableProperties(
+      file_reader.get(), file_meta->fd.GetFileSize(),
+      Footer::kNullTableMagicNumber /* table's magic number */, *ioptions,
+      &props);
+  if (!s.ok()) {
+    return s;
+  }
+  *tp = std::move(props);
+  RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+  return s;
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+  Status s;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    s = GetPropertiesOfAllTables(props, level);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
+                                            std::string* out_str) {
+  if (max_entries_to_print <= 0) {
+    return Status::OK();
+  }
+  int num_entries_left = max_entries_to_print;
+
+  std::stringstream ss;
+
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (const auto& file_meta : storage_info_.files_[level]) {
+      auto fname =
+          TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+                        file_meta->fd.GetPathId());
+
+      ss << "=== file : " << fname << " ===\n";
+
+      TableCache* table_cache = cfd_->table_cache();
+      std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
+
+      Status s = table_cache->GetRangeTombstoneIterator(
+          ReadOptions(), cfd_->internal_comparator(), *file_meta,
+          &tombstone_iter);
+      if (!s.ok()) {
+        return s;
+      }
+      if (tombstone_iter) {
+        tombstone_iter->SeekToFirst();
+
+        // TODO: print timestamp
+        while (tombstone_iter->Valid() && num_entries_left > 0) {
+          ss << "start: " << tombstone_iter->start_key().ToString(true)
+             << " end: " << tombstone_iter->end_key().ToString(true)
+             << " seq: " << tombstone_iter->seq() << '\n';
+          tombstone_iter->Next();
+          num_entries_left--;
+        }
+        if (num_entries_left <= 0) {
+          break;
+        }
+      }
+    }
+    if (num_entries_left <= 0) {
+      break;
+    }
+  }
+  assert(num_entries_left >= 0);
+  if (num_entries_left <= 0) {
+    ss << "(results may not be complete)\n";
+  }
+
+  *out_str = ss.str();
+  return Status::OK();
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
+                                         int level) {
+  for (const auto& file_meta : storage_info_.files_[level]) {
+    auto fname =
+        TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+                      file_meta->fd.GetPathId());
+    // 1. If the table is already present in table cache, load table
+    // properties from there.
+    std::shared_ptr<const TableProperties> table_properties;
+    Status s = GetTableProperties(&table_properties, file_meta, &fname);
+    if (s.ok()) {
+      props->insert({fname, table_properties});
+    } else {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::GetPropertiesOfTablesInRange(
+    const Range* range, std::size_t n, TablePropertiesCollection* props) const {
+  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+    for (decltype(n) i = 0; i < n; i++) {
+      // Convert user_key into a corresponding internal key.
+      InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+      InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+      std::vector<FileMetaData*> files;
+      storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr,
+                                         false);
+      for (const auto& file_meta : files) {
+        auto fname =
+            TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+                          file_meta->fd.GetPathId());
+        if (props->count(fname) == 0) {
+          // 1. If the table is already present in table cache, load table
+          // properties from there.
+          std::shared_ptr<const TableProperties> table_properties;
+          Status s = GetTableProperties(&table_properties, file_meta, &fname);
+          if (s.ok()) {
+            props->insert({fname, table_properties});
+          } else {
+            return s;
+          }
+        }
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::GetAggregatedTableProperties(
+    std::shared_ptr<const TableProperties>* tp, int level) {
+  TablePropertiesCollection props;
+  Status s;
+  if (level < 0) {
+    s = GetPropertiesOfAllTables(&props);
+  } else {
+    s = GetPropertiesOfAllTables(&props, level);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto* new_tp = new TableProperties();
+  for (const auto& item : props) {
+    new_tp->Add(*item.second);
+  }
+  tp->reset(new_tp);
+  return Status::OK();
+}
+
+size_t Version::GetMemoryUsageByTableReaders() {
+  size_t total_usage = 0;
+  for (auto& file_level : storage_info_.level_files_brief_) {
+    for (size_t i = 0; i < file_level.num_files; i++) {
+      total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
+          file_options_, cfd_->internal_comparator(),
+          *file_level.files[i].file_metadata,
+          mutable_cf_options_.prefix_extractor);
+    }
+  }
+  return total_usage;
+}
+
+void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
+  assert(cf_meta);
+  assert(cfd_);
+
+  cf_meta->name = cfd_->GetName();
+  cf_meta->size = 0;
+  cf_meta->file_count = 0;
+  cf_meta->levels.clear();
+
+  cf_meta->blob_file_size = 0;
+  cf_meta->blob_file_count = 0;
+  cf_meta->blob_files.clear();
+
+  auto* ioptions = cfd_->ioptions();
+  auto* vstorage = storage_info();
+
+  for (int level = 0; level < cfd_->NumberLevels(); level++) {
+    uint64_t level_size = 0;
+    cf_meta->file_count += vstorage->LevelFiles(level).size();
+    std::vector<SstFileMetaData> files;
+    for (const auto& file : vstorage->LevelFiles(level)) {
+      uint32_t path_id = file->fd.GetPathId();
+      std::string file_path;
+      if (path_id < ioptions->cf_paths.size()) {
+        file_path = ioptions->cf_paths[path_id].path;
+      } else {
+        assert(!ioptions->cf_paths.empty());
+        file_path = ioptions->cf_paths.back().path;
+      }
+      const uint64_t file_number = file->fd.GetNumber();
+      files.emplace_back(
+          MakeTableFileName("", file_number), file_number, file_path,
+          file->fd.GetFileSize(), file->fd.smallest_seqno,
+          file->fd.largest_seqno, file->smallest.user_key().ToString(),
+          file->largest.user_key().ToString(),
+          file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+          file->being_compacted, file->temperature,
+          file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
+          file->TryGetFileCreationTime(), file->file_checksum,
+          file->file_checksum_func_name);
+      files.back().num_entries = file->num_entries;
+      files.back().num_deletions = file->num_deletions;
+      level_size += file->fd.GetFileSize();
+    }
+    cf_meta->levels.emplace_back(level, level_size, std::move(files));
+    cf_meta->size += level_size;
+  }
+  for (const auto& meta : vstorage->GetBlobFiles()) {
+    assert(meta);
+
+    cf_meta->blob_files.emplace_back(
+        meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()),
+        ioptions->cf_paths.front().path, meta->GetBlobFileSize(),
+        meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(),
+        meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(),
+        meta->GetChecksumMethod(), meta->GetChecksumValue());
+    ++cf_meta->blob_file_count;
+    cf_meta->blob_file_size += meta->GetBlobFileSize();
+  }
+}
+
+uint64_t Version::GetSstFilesSize() {
+  uint64_t sst_files_size = 0;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    for (const auto& file_meta : storage_info_.LevelFiles(level)) {
+      sst_files_size += file_meta->fd.GetFileSize();
+    }
+  }
+  return sst_files_size;
+}
+
+void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+  uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
+  for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
+    for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
+      assert(meta->fd.table_reader != nullptr);
+      uint64_t file_creation_time = meta->TryGetFileCreationTime();
+      if (file_creation_time == kUnknownFileCreationTime) {
+        *creation_time = 0;
+        return;
+      }
+      if (file_creation_time < oldest_time) {
+        oldest_time = file_creation_time;
+      }
+    }
+  }
+  *creation_time = oldest_time;
+}
+
+InternalIterator* Version::TEST_GetLevelIterator(
+    const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
+    int level, bool allow_unprepared_value) {
+  auto* arena = merge_iter_builder->GetArena();
+  auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+  TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+  auto level_iter = new (mem) LevelIterator(
+      cfd_->table_cache(), read_options, file_options_,
+      cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+      mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+      cfd_->internal_stats()->GetFileReadHist(level),
+      TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+      nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
+      allow_unprepared_value, &tombstone_iter_ptr);
+  if (read_options.ignore_range_deletions) {
+    merge_iter_builder->AddIterator(level_iter);
+  } else {
+    merge_iter_builder->AddPointAndTombstoneIterator(
+        level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+  }
+  return level_iter;
+}
+
+uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
+  // Estimation will be inaccurate when:
+  // (1) there exist merge keys
+  // (2) keys are directly overwritten
+  // (3) deletion on non-existing keys
+  // (4) low number of samples
+  if (current_num_samples_ == 0) {
+    return 0;
+  }
+
+  if (current_num_non_deletions_ <= current_num_deletions_) {
+    return 0;
+  }
+
+  uint64_t est = current_num_non_deletions_ - current_num_deletions_;
+
+  uint64_t file_count = 0;
+  for (int level = 0; level < num_levels_; ++level) {
+    file_count += files_[level].size();
+  }
+
+  if (current_num_samples_ < file_count) {
+    // casting to avoid overflowing
+    return static_cast<uint64_t>(
+        (est * static_cast<double>(file_count) / current_num_samples_));
+  } else {
+    return est;
+  }
+}
+
+double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
+    int level) const {
+  assert(level < num_levels_);
+  uint64_t sum_file_size_bytes = 0;
+  uint64_t sum_data_size_bytes = 0;
+  for (auto* file_meta : files_[level]) {
+    sum_file_size_bytes += file_meta->fd.GetFileSize();
+    sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size;
+  }
+  if (sum_file_size_bytes == 0) {
+    return -1.0;
+  }
+  return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes;
+}
+
+void Version::AddIterators(const ReadOptions& read_options,
+                           const FileOptions& soptions,
+                           MergeIteratorBuilder* merge_iter_builder,
+                           bool allow_unprepared_value) {
+  assert(storage_info_.finalized_);
+
+  for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+    AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
+                         allow_unprepared_value);
+  }
+}
+
+void Version::AddIteratorsForLevel(const ReadOptions& read_options,
+                                   const FileOptions& soptions,
+                                   MergeIteratorBuilder* merge_iter_builder,
+                                   int level, bool allow_unprepared_value) {
+  assert(storage_info_.finalized_);
+  if (level >= storage_info_.num_non_empty_levels()) {
+    // This is an empty level
+    return;
+  } else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
+    // No files in this level
+    return;
+  }
+
+  bool should_sample = should_sample_file_read();
+
+  auto* arena = merge_iter_builder->GetArena();
+  if (level == 0) {
+    // Merge all level zero files together since they may overlap
+    TruncatedRangeDelIterator* tombstone_iter = nullptr;
+    for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+      const auto& file = storage_info_.LevelFilesBrief(0).files[i];
+      auto table_iter = cfd_->table_cache()->NewIterator(
+          read_options, soptions, cfd_->internal_comparator(),
+          *file.file_metadata, /*range_del_agg=*/nullptr,
+          mutable_cf_options_.prefix_extractor, nullptr,
+          cfd_->internal_stats()->GetFileReadHist(0),
+          TableReaderCaller::kUserIterator, arena,
+          /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr, allow_unprepared_value,
+          &tombstone_iter);
+      if (read_options.ignore_range_deletions) {
+        merge_iter_builder->AddIterator(table_iter);
+      } else {
+        merge_iter_builder->AddPointAndTombstoneIterator(table_iter,
+                                                         tombstone_iter);
+      }
+    }
+    if (should_sample) {
+      // Count ones for every L0 files. This is done per iterator creation
+      // rather than Seek(), while files in other levels are recored per seek.
+      // If users execute one range query per iterator, there may be some
+      // discrepancy here.
+      for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
+        sample_file_read_inc(meta);
+      }
+    }
+  } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+    // For levels > 0, we can use a concatenating iterator that sequentially
+    // walks through the non-overlapping files in the level, opening them
+    // lazily.
+    auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+    TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+    auto level_iter = new (mem) LevelIterator(
+        cfd_->table_cache(), read_options, soptions,
+        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+        mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+        cfd_->internal_stats()->GetFileReadHist(level),
+        TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+        /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr,
+        allow_unprepared_value, &tombstone_iter_ptr);
+    if (read_options.ignore_range_deletions) {
+      merge_iter_builder->AddIterator(level_iter);
+    } else {
+      merge_iter_builder->AddPointAndTombstoneIterator(
+          level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+    }
+  }
+}
+
+Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
+                                         const FileOptions& file_options,
+                                         const Slice& smallest_user_key,
+                                         const Slice& largest_user_key,
+                                         int level, bool* overlap) {
+  assert(storage_info_.finalized_);
+
+  auto icmp = cfd_->internal_comparator();
+  auto ucmp = icmp.user_comparator();
+
+  Arena arena;
+  Status status;
+  ReadRangeDelAggregator range_del_agg(&icmp,
+                                       kMaxSequenceNumber /* upper_bound */);
+
+  *overlap = false;
+
+  if (level == 0) {
+    for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+      const auto file = &storage_info_.LevelFilesBrief(0).files[i];
+      if (AfterFile(ucmp, &smallest_user_key, file) ||
+          BeforeFile(ucmp, &largest_user_key, file)) {
+        continue;
+      }
+      ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
+          read_options, file_options, cfd_->internal_comparator(),
+          *file->file_metadata, &range_del_agg,
+          mutable_cf_options_.prefix_extractor, nullptr,
+          cfd_->internal_stats()->GetFileReadHist(0),
+          TableReaderCaller::kUserIterator, &arena,
+          /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr,
+          /*allow_unprepared_value=*/false));
+      status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+                                   iter.get(), overlap);
+      if (!status.ok() || *overlap) {
+        break;
+      }
+    }
+  } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+    auto mem = arena.AllocateAligned(sizeof(LevelIterator));
+    ScopedArenaIterator iter(new (mem) LevelIterator(
+        cfd_->table_cache(), read_options, file_options,
+        cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+        mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+        cfd_->internal_stats()->GetFileReadHist(level),
+        TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+        &range_del_agg));
+    status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+                                 iter.get(), overlap);
+  }
+
+  if (status.ok() && *overlap == false &&
+      range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) {
+    *overlap = true;
+  }
+  return status;
+}
+
+VersionStorageInfo::VersionStorageInfo(
+    const InternalKeyComparator* internal_comparator,
+    const Comparator* user_comparator, int levels,
+    CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
+    bool _force_consistency_checks)
+    : internal_comparator_(internal_comparator),
+      user_comparator_(user_comparator),
+      // cfd is nullptr if Version is dummy
+      num_levels_(levels),
+      num_non_empty_levels_(0),
+      file_indexer_(user_comparator),
+      compaction_style_(compaction_style),
+      files_(new std::vector<FileMetaData*>[num_levels_]),
+      base_level_(num_levels_ == 1 ? -1 : 1),
+      level_multiplier_(0.0),
+      files_by_compaction_pri_(num_levels_),
+      level0_non_overlapping_(false),
+      next_file_to_compact_by_size_(num_levels_),
+      compaction_score_(num_levels_),
+      compaction_level_(num_levels_),
+      l0_delay_trigger_count_(0),
+      compact_cursor_(num_levels_),
+      accumulated_file_size_(0),
+      accumulated_raw_key_size_(0),
+      accumulated_raw_value_size_(0),
+      accumulated_num_non_deletions_(0),
+      accumulated_num_deletions_(0),
+      current_num_non_deletions_(0),
+      current_num_deletions_(0),
+      current_num_samples_(0),
+      estimated_compaction_needed_bytes_(0),
+      finalized_(false),
+      force_consistency_checks_(_force_consistency_checks) {
+  if (ref_vstorage != nullptr) {
+    accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
+    accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
+    accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_;
+    accumulated_num_non_deletions_ =
+        ref_vstorage->accumulated_num_non_deletions_;
+    accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
+    current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_;
+    current_num_deletions_ = ref_vstorage->current_num_deletions_;
+    current_num_samples_ = ref_vstorage->current_num_samples_;
+    oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_;
+    compact_cursor_ = ref_vstorage->compact_cursor_;
+    compact_cursor_.resize(num_levels_);
+  }
+}
+
+Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
+                 const FileOptions& file_opt,
+                 const MutableCFOptions mutable_cf_options,
+                 const std::shared_ptr<IOTracer>& io_tracer,
+                 uint64_t version_number)
+    : env_(vset->env_),
+      clock_(vset->clock_),
+      cfd_(column_family_data),
+      info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger),
+      db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats),
+      table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
+      blob_source_(cfd_ ? cfd_->blob_source() : nullptr),
+      merge_operator_(
+          (cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()),
+      storage_info_(
+          (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
+          (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
+          cfd_ == nullptr ? 0 : cfd_->NumberLevels(),
+          cfd_ == nullptr ? kCompactionStyleLevel
+                          : cfd_->ioptions()->compaction_style,
+          (cfd_ == nullptr || cfd_->current() == nullptr)
+              ? nullptr
+              : cfd_->current()->storage_info(),
+          cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
+      vset_(vset),
+      next_(this),
+      prev_(this),
+      refs_(0),
+      file_options_(file_opt),
+      mutable_cf_options_(mutable_cf_options),
+      max_file_size_for_l0_meta_pin_(
+          MaxFileSizeForL0MetaPin(mutable_cf_options_)),
+      version_number_(version_number),
+      io_tracer_(io_tracer) {}
+
+Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                        const Slice& blob_index_slice,
+                        FilePrefetchBuffer* prefetch_buffer,
+                        PinnableSlice* value, uint64_t* bytes_read) const {
+  BlobIndex blob_index;
+
+  {
+    Status s = blob_index.DecodeFrom(blob_index_slice);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value,
+                 bytes_read);
+}
+
+Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                        const BlobIndex& blob_index,
+                        FilePrefetchBuffer* prefetch_buffer,
+                        PinnableSlice* value, uint64_t* bytes_read) const {
+  assert(value);
+
+  if (blob_index.HasTTL() || blob_index.IsInlined()) {
+    return Status::Corruption("Unexpected TTL/inlined blob index");
+  }
+
+  const uint64_t blob_file_number = blob_index.file_number();
+
+  auto blob_file_meta = storage_info_.GetBlobFileMetaData(blob_file_number);
+  if (!blob_file_meta) {
+    return Status::Corruption("Invalid blob file number");
+  }
+
+  assert(blob_source_);
+  value->Reset();
+  const Status s = blob_source_->GetBlob(
+      read_options, user_key, blob_file_number, blob_index.offset(),
+      blob_file_meta->GetBlobFileSize(), blob_index.size(),
+      blob_index.compression(), prefetch_buffer, value, bytes_read);
+
+  return s;
+}
+
+void Version::MultiGetBlob(
+    const ReadOptions& read_options, MultiGetRange& range,
+    std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs) {
+  assert(!blob_ctxs.empty());
+
+  autovector<BlobFileReadRequests> blob_reqs;
+
+  for (auto& ctx : blob_ctxs) {
+    const auto file_number = ctx.first;
+    const auto blob_file_meta = storage_info_.GetBlobFileMetaData(file_number);
+
+    autovector<BlobReadRequest> blob_reqs_in_file;
+    BlobReadContexts& blobs_in_file = ctx.second;
+    for (const auto& blob : blobs_in_file) {
+      const BlobIndex& blob_index = blob.first;
+      const KeyContext& key_context = blob.second;
+
+      if (!blob_file_meta) {
+        *key_context.s = Status::Corruption("Invalid blob file number");
+        continue;
+      }
+
+      if (blob_index.HasTTL() || blob_index.IsInlined()) {
+        *key_context.s =
+            Status::Corruption("Unexpected TTL/inlined blob index");
+        continue;
+      }
+
+      key_context.value->Reset();
+      blob_reqs_in_file.emplace_back(
+          key_context.ukey_with_ts, blob_index.offset(), blob_index.size(),
+          blob_index.compression(), key_context.value, key_context.s);
+    }
+    if (blob_reqs_in_file.size() > 0) {
+      const auto file_size = blob_file_meta->GetBlobFileSize();
+      blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file);
+    }
+  }
+
+  if (blob_reqs.size() > 0) {
+    blob_source_->MultiGetBlob(read_options, blob_reqs, /*bytes_read=*/nullptr);
+  }
+
+  for (auto& ctx : blob_ctxs) {
+    BlobReadContexts& blobs_in_file = ctx.second;
+    for (const auto& blob : blobs_in_file) {
+      const KeyContext& key_context = blob.second;
+      if (key_context.s->ok()) {
+        range.AddValueSize(key_context.value->size());
+        if (range.GetValueSize() > read_options.value_size_soft_limit) {
+          *key_context.s = Status::Aborted();
+        }
+      } else if (key_context.s->IsIncomplete()) {
+        // read_options.read_tier == kBlockCacheTier
+        // Cannot read blob(s): no disk I/O allowed
+        assert(key_context.get_context);
+        auto& get_context = *(key_context.get_context);
+        get_context.MarkKeyMayExist();
+      }
+    }
+  }
+}
+
+void Version::Get(const ReadOptions& read_options, const LookupKey& k,
+                  PinnableSlice* value, PinnableWideColumns* columns,
+                  std::string* timestamp, Status* status,
+                  MergeContext* merge_context,
+                  SequenceNumber* max_covering_tombstone_seq,
+                  PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
+                  bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
+                  bool* is_blob, bool do_merge) {
+  Slice ikey = k.internal_key();
+  Slice user_key = k.user_key();
+
+  assert(status->ok() || status->IsMergeInProgress());
+
+  if (key_exists != nullptr) {
+    // will falsify below if not found
+    *key_exists = true;
+  }
+
+  uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
+  if (vset_ && vset_->block_cache_tracer_ &&
+      vset_->block_cache_tracer_->is_tracing_enabled()) {
+    tracing_get_id = vset_->block_cache_tracer_->NextGetId();
+  }
+
+  // Note: the old StackableDB-based BlobDB passes in
+  // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we
+  // need to provide it here.
+  bool is_blob_index = false;
+  bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index;
+  BlobFetcher blob_fetcher(this, read_options);
+
+  assert(pinned_iters_mgr);
+  GetContext get_context(
+      user_comparator(), merge_operator_, info_log_, db_statistics_,
+      status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
+      do_merge ? value : nullptr, do_merge ? columns : nullptr,
+      do_merge ? timestamp : nullptr, value_found, merge_context, do_merge,
+      max_covering_tombstone_seq, clock_, seq,
+      merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
+      tracing_get_id, &blob_fetcher);
+
+  // Pin blocks that we read to hold merge operands
+  if (merge_operator_) {
+    pinned_iters_mgr->StartPinning();
+  }
+
+  FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_,
+                storage_info_.num_non_empty_levels_,
+                &storage_info_.file_indexer_, user_comparator(),
+                internal_comparator());
+  FdWithKeyRange* f = fp.GetNextFile();
+
+  while (f != nullptr) {
+    if (*max_covering_tombstone_seq > 0) {
+      // The remaining files we look at will only contain covered keys, so we
+      // stop here.
+      break;
+    }
+    if (get_context.sample()) {
+      sample_file_read_inc(f->file_metadata);
+    }
+
+    bool timer_enabled =
+        GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+        get_perf_context()->per_level_perf_context_enabled;
+    StopWatchNano timer(clock_, timer_enabled /* auto_start */);
+    *status = table_cache_->Get(
+        read_options, *internal_comparator(), *f->file_metadata, ikey,
+        &get_context, mutable_cf_options_.prefix_extractor,
+        cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+        IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                        fp.IsHitFileLastInLevel()),
+        fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_);
+    // TODO: examine the behavior for corrupted key
+    if (timer_enabled) {
+      PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+                                fp.GetHitFileLevel());
+    }
+    if (!status->ok()) {
+      if (db_statistics_ != nullptr) {
+        get_context.ReportCounters();
+      }
+      return;
+    }
+
+    // report the counters before returning
+    if (get_context.State() != GetContext::kNotFound &&
+        get_context.State() != GetContext::kMerge &&
+        db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    }
+    switch (get_context.State()) {
+      case GetContext::kNotFound:
+        // Keep searching in other files
+        break;
+      case GetContext::kMerge:
+        // TODO: update per-level perfcontext user_key_return_count for kMerge
+        break;
+      case GetContext::kFound:
+        if (fp.GetHitFileLevel() == 0) {
+          RecordTick(db_statistics_, GET_HIT_L0);
+        } else if (fp.GetHitFileLevel() == 1) {
+          RecordTick(db_statistics_, GET_HIT_L1);
+        } else if (fp.GetHitFileLevel() >= 2) {
+          RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+        }
+
+        PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+                                  fp.GetHitFileLevel());
+
+        if (is_blob_index) {
+          if (do_merge && value) {
+            TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex",
+                                     value);
+
+            constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+            constexpr uint64_t* bytes_read = nullptr;
+
+            *status = GetBlob(read_options, user_key, *value, prefetch_buffer,
+                              value, bytes_read);
+            if (!status->ok()) {
+              if (status->IsIncomplete()) {
+                get_context.MarkKeyMayExist();
+              }
+              return;
+            }
+          }
+        }
+
+        return;
+      case GetContext::kDeleted:
+        // Use empty error message for speed
+        *status = Status::NotFound();
+        return;
+      case GetContext::kCorrupt:
+        *status = Status::Corruption("corrupted key for ", user_key);
+        return;
+      case GetContext::kUnexpectedBlobIndex:
+        ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+        *status = Status::NotSupported(
+            "Encounter unexpected blob index. Please open DB with "
+            "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+        return;
+    }
+    f = fp.GetNextFile();
+  }
+  if (db_statistics_ != nullptr) {
+    get_context.ReportCounters();
+  }
+  if (GetContext::kMerge == get_context.State()) {
+    if (!do_merge) {
+      *status = Status::OK();
+      return;
+    }
+    if (!merge_operator_) {
+      *status = Status::InvalidArgument(
+          "merge_operator is not properly initialized.");
+      return;
+    }
+    // merge_operands are in saver and we hit the beginning of the key history
+    // do a final merge of nullptr and operands;
+    if (value || columns) {
+      std::string result;
+      *status = MergeHelper::TimedFullMerge(
+          merge_operator_, user_key, nullptr, merge_context->GetOperands(),
+          &result, info_log_, db_statistics_, clock_,
+          /* result_operand */ nullptr, /* update_num_ops_stats */ true);
+      if (status->ok()) {
+        if (LIKELY(value != nullptr)) {
+          *(value->GetSelf()) = std::move(result);
+          value->PinSelf();
+        } else {
+          assert(columns != nullptr);
+          columns->SetPlainValue(result);
+        }
+      }
+    }
+  } else {
+    if (key_exists != nullptr) {
+      *key_exists = false;
+    }
+    *status = Status::NotFound();  // Use an empty error message for speed
+  }
+}
+
+void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+                       ReadCallback* callback) {
+  PinnedIteratorsManager pinned_iters_mgr;
+
+  // Pin blocks that we read to hold merge operands
+  if (merge_operator_) {
+    pinned_iters_mgr.StartPinning();
+  }
+  uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+
+  if (vset_ && vset_->block_cache_tracer_ &&
+      vset_->block_cache_tracer_->is_tracing_enabled()) {
+    tracing_mget_id = vset_->block_cache_tracer_->NextGetId();
+  }
+  // Even though we know the batch size won't be > MAX_BATCH_SIZE,
+  // use autovector in order to avoid unnecessary construction of GetContext
+  // objects, which is expensive
+  autovector<GetContext, 16> get_ctx;
+  BlobFetcher blob_fetcher(this, read_options);
+  for (auto iter = range->begin(); iter != range->end(); ++iter) {
+    assert(iter->s->ok() || iter->s->IsMergeInProgress());
+    get_ctx.emplace_back(
+        user_comparator(), merge_operator_, info_log_, db_statistics_,
+        iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge,
+        iter->ukey_with_ts, iter->value, /*columns=*/nullptr, iter->timestamp,
+        nullptr, &(iter->merge_context), true,
+        &iter->max_covering_tombstone_seq, clock_, nullptr,
+        merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
+        &iter->is_blob_index, tracing_mget_id, &blob_fetcher);
+    // MergeInProgress status, if set, has been transferred to the get_context
+    // state, so we set status to ok here. From now on, the iter status will
+    // be used for IO errors, and get_context state will be used for any
+    // key level errors
+    *(iter->s) = Status::OK();
+  }
+  int get_ctx_index = 0;
+  for (auto iter = range->begin(); iter != range->end();
+       ++iter, get_ctx_index++) {
+    iter->get_context = &(get_ctx[get_ctx_index]);
+  }
+
+  Status s;
+  // blob_file => [[blob_idx, it], ...]
+  std::unordered_map<uint64_t, BlobReadContexts> blob_ctxs;
+  MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
+#if USE_COROUTINES
+  if (read_options.async_io && read_options.optimize_multiget_for_io &&
+      using_coroutines()) {
+    s = MultiGetAsync(read_options, range, &blob_ctxs);
+  } else
+#endif  // USE_COROUTINES
+  {
+    MultiGetRange file_picker_range(*range, range->begin(), range->end());
+    FilePickerMultiGet fp(&file_picker_range, &storage_info_.level_files_brief_,
+                          storage_info_.num_non_empty_levels_,
+                          &storage_info_.file_indexer_, user_comparator(),
+                          internal_comparator());
+    FdWithKeyRange* f = fp.GetNextFileInLevel();
+    uint64_t num_index_read = 0;
+    uint64_t num_filter_read = 0;
+    uint64_t num_sst_read = 0;
+    uint64_t num_level_read = 0;
+
+    int prev_level = -1;
+
+    while (!fp.IsSearchEnded()) {
+      // This will be set to true later if we actually look up in a file in L0.
+      // For per level stats purposes, an L0 file is treated as a level
+      bool dump_stats_for_l0_file = false;
+
+      // Avoid using the coroutine version if we're looking in a L0 file, since
+      // L0 files won't be parallelized anyway. The regular synchronous version
+      // is faster.
+      if (!read_options.async_io || !using_coroutines() ||
+          fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
+        if (f) {
+          bool skip_filters =
+              IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                              fp.IsHitFileLastInLevel());
+          // Call MultiGetFromSST for looking up a single file
+          s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
+                              fp.GetHitFileLevel(), skip_filters,
+                              /*skip_range_deletions=*/false, f, blob_ctxs,
+                              /*table_handle=*/nullptr, num_filter_read,
+                              num_index_read, num_sst_read);
+          if (fp.GetHitFileLevel() == 0) {
+            dump_stats_for_l0_file = true;
+          }
+        }
+        if (s.ok()) {
+          f = fp.GetNextFileInLevel();
+        }
+#if USE_COROUTINES
+      } else {
+        std::vector<folly::coro::Task<Status>> mget_tasks;
+        while (f != nullptr) {
+          MultiGetRange file_range = fp.CurrentFileRange();
+          Cache::Handle* table_handle = nullptr;
+          bool skip_filters =
+              IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                              fp.IsHitFileLastInLevel());
+          bool skip_range_deletions = false;
+          if (!skip_filters) {
+            Status status = table_cache_->MultiGetFilter(
+                read_options, *internal_comparator(), *f->file_metadata,
+                mutable_cf_options_.prefix_extractor,
+                cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+                fp.GetHitFileLevel(), &file_range, &table_handle);
+            skip_range_deletions = true;
+            if (status.ok()) {
+              skip_filters = true;
+            } else if (!status.IsNotSupported()) {
+              s = status;
+            }
+          }
+
+          if (!s.ok()) {
+            break;
+          }
+
+          if (!file_range.empty()) {
+            mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+                read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+                skip_range_deletions, f, blob_ctxs, table_handle,
+                num_filter_read, num_index_read, num_sst_read));
+          }
+          if (fp.KeyMaySpanNextFile()) {
+            break;
+          }
+          f = fp.GetNextFileInLevel();
+        }
+        if (mget_tasks.size() > 0) {
+          RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
+                     mget_tasks.size());
+          // Collect all results so far
+          std::vector<Status> statuses = folly::coro::blockingWait(
+              folly::coro::collectAllRange(std::move(mget_tasks))
+                  .scheduleOn(&range->context()->executor()));
+          if (s.ok()) {
+            for (Status stat : statuses) {
+              if (!stat.ok()) {
+                s = std::move(stat);
+                break;
+              }
+            }
+          }
+
+          if (s.ok() && fp.KeyMaySpanNextFile()) {
+            f = fp.GetNextFileInLevel();
+          }
+        }
+#endif  // USE_COROUTINES
+      }
+      // If bad status or we found final result for all the keys
+      if (!s.ok() || file_picker_range.empty()) {
+        break;
+      }
+      if (!f) {
+        // Reached the end of this level. Prepare the next level
+        fp.PrepareNextLevelForSearch();
+        if (!fp.IsSearchEnded()) {
+          // Its possible there is no overlap on this level and f is nullptr
+          f = fp.GetNextFileInLevel();
+        }
+        if (dump_stats_for_l0_file ||
+            (prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
+          // Dump the stats if the search has moved to the next level and
+          // reset for next level.
+          if (num_filter_read + num_index_read) {
+            RecordInHistogram(db_statistics_,
+                              NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                              num_index_read + num_filter_read);
+          }
+          if (num_sst_read) {
+            RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
+                              num_sst_read);
+            num_level_read++;
+          }
+          num_filter_read = 0;
+          num_index_read = 0;
+          num_sst_read = 0;
+        }
+        prev_level = fp.GetHitFileLevel();
+      }
+    }
+
+    // Dump stats for most recent level
+    if (num_filter_read + num_index_read) {
+      RecordInHistogram(db_statistics_,
+                        NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                        num_index_read + num_filter_read);
+    }
+    if (num_sst_read) {
+      RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
+      num_level_read++;
+    }
+    if (num_level_read) {
+      RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
+                        num_level_read);
+    }
+  }
+
+  if (s.ok() && !blob_ctxs.empty()) {
+    MultiGetBlob(read_options, keys_with_blobs_range, blob_ctxs);
+  }
+
+  // Process any left over keys
+  for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) {
+    GetContext& get_context = *iter->get_context;
+    Status* status = iter->s;
+    Slice user_key = iter->lkey->user_key();
+
+    if (db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    }
+    if (GetContext::kMerge == get_context.State()) {
+      if (!merge_operator_) {
+        *status = Status::InvalidArgument(
+            "merge_operator is not properly initialized.");
+        range->MarkKeyDone(iter);
+        continue;
+      }
+      // merge_operands are in saver and we hit the beginning of the key history
+      // do a final merge of nullptr and operands;
+      std::string* str_value =
+          iter->value != nullptr ? iter->value->GetSelf() : nullptr;
+      *status = MergeHelper::TimedFullMerge(
+          merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(),
+          str_value, info_log_, db_statistics_, clock_,
+          /* result_operand */ nullptr, /* update_num_ops_stats */ true);
+      if (LIKELY(iter->value != nullptr)) {
+        iter->value->PinSelf();
+        range->AddValueSize(iter->value->size());
+        range->MarkKeyDone(iter);
+        if (range->GetValueSize() > read_options.value_size_soft_limit) {
+          s = Status::Aborted();
+          break;
+        }
+      }
+    } else {
+      range->MarkKeyDone(iter);
+      *status = Status::NotFound();  // Use an empty error message for speed
+    }
+  }
+
+  for (auto iter = range->begin(); iter != range->end(); ++iter) {
+    range->MarkKeyDone(iter);
+    *(iter->s) = s;
+  }
+}
+
+#ifdef USE_COROUTINES
+Status Version::ProcessBatch(
+    const ReadOptions& read_options, FilePickerMultiGet* batch,
+    std::vector<folly::coro::Task<Status>>& mget_tasks,
+    std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
+    autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
+    std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
+    std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
+        mget_stats) {
+  FilePickerMultiGet& fp = *batch;
+  MultiGetRange range = fp.GetRange();
+  // Initialize a new empty range. Any keys that are not in this level will
+  // eventually become part of the new range.
+  MultiGetRange leftover(range, range.begin(), range.begin());
+  FdWithKeyRange* f = nullptr;
+  Status s;
+
+  f = fp.GetNextFileInLevel();
+  while (!f) {
+    fp.PrepareNextLevelForSearch();
+    if (!fp.IsSearchEnded()) {
+      f = fp.GetNextFileInLevel();
+    } else {
+      break;
+    }
+  }
+  while (f) {
+    MultiGetRange file_range = fp.CurrentFileRange();
+    Cache::Handle* table_handle = nullptr;
+    bool skip_filters = IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+                                        fp.IsHitFileLastInLevel());
+    bool skip_range_deletions = false;
+    if (!skip_filters) {
+      Status status = table_cache_->MultiGetFilter(
+          read_options, *internal_comparator(), *f->file_metadata,
+          mutable_cf_options_.prefix_extractor,
+          cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+          fp.GetHitFileLevel(), &file_range, &table_handle);
+      if (status.ok()) {
+        skip_filters = true;
+        skip_range_deletions = true;
+      } else if (!status.IsNotSupported()) {
+        s = status;
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+    // At this point, file_range contains any keys that are likely in this
+    // file. It may have false positives, but that's ok since higher level
+    // lookups for the key are dependent on this lookup anyway.
+    // Add the complement of file_range to leftover. That's the set of keys
+    // definitely not in this level.
+    // Subtract the complement of file_range from range, since they will be
+    // processed in a separate batch in parallel.
+    leftover += ~file_range;
+    range -= ~file_range;
+    if (!file_range.empty()) {
+      int level = fp.GetHitFileLevel();
+      auto stat = mget_stats.find(level);
+      if (stat == mget_stats.end()) {
+        auto entry = mget_stats.insert({level, {0, 0, 0}});
+        assert(entry.second);
+        stat = entry.first;
+      }
+
+      if (waiting.empty() && to_process.empty() &&
+          !fp.RemainingOverlapInLevel() && leftover.empty() &&
+          mget_tasks.empty()) {
+        // All keys are in one SST file, so take the fast path
+        s = MultiGetFromSST(read_options, file_range, fp.GetHitFileLevel(),
+                            skip_filters, skip_range_deletions, f, *blob_ctxs,
+                            table_handle, std::get<0>(stat->second),
+                            std::get<1>(stat->second),
+                            std::get<2>(stat->second));
+      } else {
+        mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+            read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+            skip_range_deletions, f, *blob_ctxs, table_handle,
+            std::get<0>(stat->second), std::get<1>(stat->second),
+            std::get<2>(stat->second)));
+        ++num_tasks_queued;
+      }
+    }
+    if (fp.KeyMaySpanNextFile() && !file_range.empty()) {
+      break;
+    }
+    f = fp.GetNextFileInLevel();
+  }
+  // Split the current batch only if some keys are likely in this level and
+  // some are not. Only split if we're done with this level, i.e f is null.
+  // Otherwise, it means there are more files in this level to look at.
+  if (s.ok() && !f && !leftover.empty() && !range.empty()) {
+    fp.ReplaceRange(range);
+    batches.emplace_back(&leftover, fp);
+    to_process.emplace_back(batches.size() - 1);
+  }
+  // 1. If f is non-null, that means we might not be done with this level.
+  //    This can happen if one of the keys is the last key in the file, i.e
+  //    fp.KeyMaySpanNextFile() is true.
+  // 2. If range is empty, then we're done with this range and no need to
+  //    prepare the next level
+  // 3. If some tasks were queued for this range, then the next level will be
+  //    prepared after executing those tasks
+  if (!f && !range.empty() && !num_tasks_queued) {
+    fp.PrepareNextLevelForSearch();
+  }
+  return s;
+}
+
+Status Version::MultiGetAsync(
+    const ReadOptions& options, MultiGetRange* range,
+    std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs) {
+  autovector<FilePickerMultiGet, 4> batches;
+  std::deque<size_t> waiting;
+  std::deque<size_t> to_process;
+  Status s;
+  std::vector<folly::coro::Task<Status>> mget_tasks;
+  std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>> mget_stats;
+
+  // Create the initial batch with the input range
+  batches.emplace_back(range, &storage_info_.level_files_brief_,
+                       storage_info_.num_non_empty_levels_,
+                       &storage_info_.file_indexer_, user_comparator(),
+                       internal_comparator());
+  to_process.emplace_back(0);
+
+  while (!to_process.empty()) {
+    // As we process a batch, it may get split into two. So reserve space for
+    // an additional batch in the autovector in order to prevent later moves
+    // of elements in ProcessBatch().
+    batches.reserve(batches.size() + 1);
+
+    size_t idx = to_process.front();
+    FilePickerMultiGet* batch = &batches.at(idx);
+    unsigned int num_tasks_queued = 0;
+    to_process.pop_front();
+    if (batch->IsSearchEnded() || batch->GetRange().empty()) {
+      // If to_process is empty, i.e no more batches to look at, then we need
+      // schedule the enqueued coroutines and wait for them. Otherwise, we
+      // skip this batch and move to the next one in to_process.
+      if (!to_process.empty()) {
+        continue;
+      }
+    } else {
+      // Look through one level. This may split the batch and enqueue it to
+      // to_process
+      s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
+                       to_process, num_tasks_queued, mget_stats);
+      // If ProcessBatch didn't enqueue any coroutine tasks, it means all
+      // keys were filtered out. So put the batch back in to_process to
+      // lookup in the next level
+      if (!num_tasks_queued && !batch->IsSearchEnded()) {
+        // Put this back in the processing queue
+        to_process.emplace_back(idx);
+      } else if (num_tasks_queued) {
+        waiting.emplace_back(idx);
+      }
+    }
+    // If ProcessBatch() returned an error, then schedule the enqueued
+    // coroutines and wait for them, then abort the MultiGet.
+    if (to_process.empty() || !s.ok()) {
+      if (mget_tasks.size() > 0) {
+        assert(waiting.size());
+        RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
+        // Collect all results so far
+        std::vector<Status> statuses = folly::coro::blockingWait(
+            folly::coro::collectAllRange(std::move(mget_tasks))
+                .scheduleOn(&range->context()->executor()));
+        mget_tasks.clear();
+        if (s.ok()) {
+          for (Status stat : statuses) {
+            if (!stat.ok()) {
+              s = std::move(stat);
+              break;
+            }
+          }
+        }
+
+        if (!s.ok()) {
+          break;
+        }
+
+        for (size_t wait_idx : waiting) {
+          FilePickerMultiGet& fp = batches.at(wait_idx);
+          // 1. If fp.GetHitFile() is non-null, then there could be more
+          // overlap in this level. So skip preparing next level.
+          // 2. If fp.GetRange() is empty, then this batch is completed
+          // and no need to prepare the next level.
+          if (!fp.GetHitFile() && !fp.GetRange().empty()) {
+            fp.PrepareNextLevelForSearch();
+          }
+        }
+        to_process.swap(waiting);
+      } else {
+        assert(!s.ok() || waiting.size() == 0);
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  uint64_t num_levels = 0;
+  for (auto& stat : mget_stats) {
+    if (stat.first == 0) {
+      num_levels += std::get<2>(stat.second);
+    } else {
+      num_levels++;
+    }
+
+    uint64_t num_meta_reads =
+        std::get<0>(stat.second) + std::get<1>(stat.second);
+    uint64_t num_sst_reads = std::get<2>(stat.second);
+    if (num_meta_reads > 0) {
+      RecordInHistogram(db_statistics_,
+                        NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                        num_meta_reads);
+    }
+    if (num_sst_reads > 0) {
+      RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_reads);
+    }
+  }
+  if (num_levels > 0) {
+    RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET, num_levels);
+  }
+
+  return s;
+}
+#endif
+
+bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
+  // Reaching the bottom level implies misses at all upper levels, so we'll
+  // skip checking the filters when we predict a hit.
+  return cfd_->ioptions()->optimize_filters_for_hits &&
+         (level > 0 || is_file_last_in_level) &&
+         level == storage_info_.num_non_empty_levels() - 1;
+}
+
+void VersionStorageInfo::GenerateLevelFilesBrief() {
+  level_files_brief_.resize(num_non_empty_levels_);
+  for (int level = 0; level < num_non_empty_levels_; level++) {
+    DoGenerateLevelFilesBrief(&level_files_brief_[level], files_[level],
+                              &arena_);
+  }
+}
+
+void VersionStorageInfo::PrepareForVersionAppend(
+    const ImmutableOptions& immutable_options,
+    const MutableCFOptions& mutable_cf_options) {
+  ComputeCompensatedSizes();
+  UpdateNumNonEmptyLevels();
+  CalculateBaseBytes(immutable_options, mutable_cf_options);
+  UpdateFilesByCompactionPri(immutable_options, mutable_cf_options);
+  GenerateFileIndexer();
+  GenerateLevelFilesBrief();
+  GenerateLevel0NonOverlapping();
+  if (!immutable_options.allow_ingest_behind) {
+    GenerateBottommostFiles();
+  }
+  GenerateFileLocationIndex();
+}
+
+void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options,
+                            bool update_stats) {
+  TEST_SYNC_POINT_CALLBACK(
+      "Version::PrepareAppend:forced_check",
+      reinterpret_cast<void*>(&storage_info_.force_consistency_checks_));
+
+  if (update_stats) {
+    UpdateAccumulatedStats();
+  }
+
+  storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options);
+}
+
+bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
+  if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) {
+    return false;
+  }
+  std::shared_ptr<const TableProperties> tp;
+  Status s = GetTableProperties(&tp, file_meta);
+  file_meta->init_stats_from_file = true;
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(vset_->db_options_->info_log,
+                    "Unable to load table properties for file %" PRIu64
+                    " --- %s\n",
+                    file_meta->fd.GetNumber(), s.ToString().c_str());
+    return false;
+  }
+  if (tp.get() == nullptr) return false;
+  file_meta->num_entries = tp->num_entries;
+  file_meta->num_deletions = tp->num_deletions;
+  file_meta->raw_value_size = tp->raw_value_size;
+  file_meta->raw_key_size = tp->raw_key_size;
+
+  return true;
+}
+
+void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
+  TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats",
+                           nullptr);
+
+  assert(file_meta->init_stats_from_file);
+  accumulated_file_size_ += file_meta->fd.GetFileSize();
+  accumulated_raw_key_size_ += file_meta->raw_key_size;
+  accumulated_raw_value_size_ += file_meta->raw_value_size;
+  accumulated_num_non_deletions_ +=
+      file_meta->num_entries - file_meta->num_deletions;
+  accumulated_num_deletions_ += file_meta->num_deletions;
+
+  current_num_non_deletions_ +=
+      file_meta->num_entries - file_meta->num_deletions;
+  current_num_deletions_ += file_meta->num_deletions;
+  current_num_samples_++;
+}
+
+void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
+  if (file_meta->init_stats_from_file) {
+    current_num_non_deletions_ -=
+        file_meta->num_entries - file_meta->num_deletions;
+    current_num_deletions_ -= file_meta->num_deletions;
+    current_num_samples_--;
+  }
+}
+
+void Version::UpdateAccumulatedStats() {
+  // maximum number of table properties loaded from files.
+  const int kMaxInitCount = 20;
+  int init_count = 0;
+  // here only the first kMaxInitCount files which haven't been
+  // initialized from file will be updated with num_deletions.
+  // The motivation here is to cap the maximum I/O per Version creation.
+  // The reason for choosing files from lower-level instead of higher-level
+  // is that such design is able to propagate the initialization from
+  // lower-level to higher-level:  When the num_deletions of lower-level
+  // files are updated, it will make the lower-level files have accurate
+  // compensated_file_size, making lower-level to higher-level compaction
+  // will be triggered, which creates higher-level files whose num_deletions
+  // will be updated here.
+  for (int level = 0;
+       level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+       ++level) {
+    for (auto* file_meta : storage_info_.files_[level]) {
+      if (MaybeInitializeFileMetaData(file_meta)) {
+        // each FileMeta will be initialized only once.
+        storage_info_.UpdateAccumulatedStats(file_meta);
+        // when option "max_open_files" is -1, all the file metadata has
+        // already been read, so MaybeInitializeFileMetaData() won't incur
+        // any I/O cost. "max_open_files=-1" means that the table cache passed
+        // to the VersionSet and then to the ColumnFamilySet has a size of
+        // TableCache::kInfiniteCapacity
+        if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
+            TableCache::kInfiniteCapacity) {
+          continue;
+        }
+        if (++init_count >= kMaxInitCount) {
+          break;
+        }
+      }
+    }
+  }
+  // In case all sampled-files contain only deletion entries, then we
+  // load the table-property of a file in higher-level to initialize
+  // that value.
+  for (int level = storage_info_.num_levels_ - 1;
+       storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
+    for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+         storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+      if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+        storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::ComputeCompensatedSizes() {
+  static const int kDeletionWeightOnCompaction = 2;
+  uint64_t average_value_size = GetAverageValueSize();
+
+  // compute the compensated size
+  for (int level = 0; level < num_levels_; level++) {
+    for (auto* file_meta : files_[level]) {
+      // Here we only compute compensated_file_size for those file_meta
+      // which compensated_file_size is uninitialized (== 0). This is true only
+      // for files that have been created right now and no other thread has
+      // access to them. That's why we can safely mutate compensated_file_size.
+      if (file_meta->compensated_file_size == 0) {
+        file_meta->compensated_file_size = file_meta->fd.GetFileSize();
+        // Here we only boost the size of deletion entries of a file only
+        // when the number of deletion entries is greater than the number of
+        // non-deletion entries in the file.  The motivation here is that in
+        // a stable workload, the number of deletion entries should be roughly
+        // equal to the number of non-deletion entries.  If we compensate the
+        // size of deletion entries in a stable workload, the deletion
+        // compensation logic might introduce unwanted effet which changes the
+        // shape of LSM tree.
+        if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
+          file_meta->compensated_file_size +=
+              (file_meta->num_deletions * 2 - file_meta->num_entries) *
+              average_value_size * kDeletionWeightOnCompaction;
+        }
+      }
+    }
+  }
+}
+
+int VersionStorageInfo::MaxInputLevel() const {
+  if (compaction_style_ == kCompactionStyleLevel) {
+    return num_levels() - 2;
+  }
+  return 0;
+}
+
+int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
+  if (allow_ingest_behind) {
+    assert(num_levels() > 1);
+    return num_levels() - 2;
+  }
+  return num_levels() - 1;
+}
+
+void VersionStorageInfo::EstimateCompactionBytesNeeded(
+    const MutableCFOptions& mutable_cf_options) {
+  // Only implemented for level-based compaction
+  if (compaction_style_ != kCompactionStyleLevel) {
+    estimated_compaction_needed_bytes_ = 0;
+    return;
+  }
+
+  // Start from Level 0, if level 0 qualifies compaction to level 1,
+  // we estimate the size of compaction.
+  // Then we move on to the next level and see whether it qualifies compaction
+  // to the next level. The size of the level is estimated as the actual size
+  // on the level plus the input bytes from the previous level if there is any.
+  // If it exceeds, take the exceeded bytes as compaction input and add the size
+  // of the compaction size to tatal size.
+  // We keep doing it to Level 2, 3, etc, until the last level and return the
+  // accumulated bytes.
+
+  uint64_t bytes_compact_to_next_level = 0;
+  uint64_t level_size = 0;
+  for (auto* f : files_[0]) {
+    level_size += f->fd.GetFileSize();
+  }
+  // Level 0
+  bool level0_compact_triggered = false;
+  if (static_cast<int>(files_[0].size()) >=
+          mutable_cf_options.level0_file_num_compaction_trigger ||
+      level_size >= mutable_cf_options.max_bytes_for_level_base) {
+    level0_compact_triggered = true;
+    estimated_compaction_needed_bytes_ = level_size;
+    bytes_compact_to_next_level = level_size;
+  } else {
+    estimated_compaction_needed_bytes_ = 0;
+  }
+
+  // Level 1 and up.
+  uint64_t bytes_next_level = 0;
+  for (int level = base_level(); level <= MaxInputLevel(); level++) {
+    level_size = 0;
+    if (bytes_next_level > 0) {
+#ifndef NDEBUG
+      uint64_t level_size2 = 0;
+      for (auto* f : files_[level]) {
+        level_size2 += f->fd.GetFileSize();
+      }
+      assert(level_size2 == bytes_next_level);
+#endif
+      level_size = bytes_next_level;
+      bytes_next_level = 0;
+    } else {
+      for (auto* f : files_[level]) {
+        level_size += f->fd.GetFileSize();
+      }
+    }
+    if (level == base_level() && level0_compact_triggered) {
+      // Add base level size to compaction if level0 compaction triggered.
+      estimated_compaction_needed_bytes_ += level_size;
+    }
+    // Add size added by previous compaction
+    level_size += bytes_compact_to_next_level;
+    bytes_compact_to_next_level = 0;
+    uint64_t level_target = MaxBytesForLevel(level);
+    if (level_size > level_target) {
+      bytes_compact_to_next_level = level_size - level_target;
+      // Estimate the actual compaction fan-out ratio as size ratio between
+      // the two levels.
+
+      assert(bytes_next_level == 0);
+      if (level + 1 < num_levels_) {
+        for (auto* f : files_[level + 1]) {
+          bytes_next_level += f->fd.GetFileSize();
+        }
+      }
+      if (bytes_next_level > 0) {
+        assert(level_size > 0);
+        estimated_compaction_needed_bytes_ += static_cast<uint64_t>(
+            static_cast<double>(bytes_compact_to_next_level) *
+            (static_cast<double>(bytes_next_level) /
+                 static_cast<double>(level_size) +
+             1));
+      }
+    }
+  }
+}
+
+namespace {
+uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
+                                 const MutableCFOptions& mutable_cf_options,
+                                 const std::vector<FileMetaData*>& files) {
+  uint32_t ttl_expired_files_count = 0;
+
+  int64_t _current_time;
+  auto status = ioptions.clock->GetCurrentTime(&_current_time);
+  if (status.ok()) {
+    const uint64_t current_time = static_cast<uint64_t>(_current_time);
+    for (FileMetaData* f : files) {
+      if (!f->being_compacted) {
+        uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+        if (oldest_ancester_time != 0 &&
+            oldest_ancester_time < (current_time - mutable_cf_options.ttl)) {
+          ttl_expired_files_count++;
+        }
+      }
+    }
+  }
+  return ttl_expired_files_count;
+}
+}  // anonymous namespace
+
+void VersionStorageInfo::ComputeCompactionScore(
+    const ImmutableOptions& immutable_options,
+    const MutableCFOptions& mutable_cf_options) {
+  double total_downcompact_bytes = 0.0;
+  // Historically, score is defined as actual bytes in a level divided by
+  // the level's target size, and 1.0 is the threshold for triggering
+  // compaction. Higher score means higher prioritization.
+  // Now we keep the compaction triggering condition, but consider more
+  // factors for priorization, while still keeping the 1.0 threshold.
+  // In order to provide flexibility for reducing score while still
+  // maintaining it to be over 1.0, we scale the original score by 10x
+  // if it is larger than 1.0.
+  const double kScoreScale = 10.0;
+  for (int level = 0; level <= MaxInputLevel(); level++) {
+    double score;
+    if (level == 0) {
+      // We treat level-0 specially by bounding the number of files
+      // instead of number of bytes for two reasons:
+      //
+      // (1) With larger write-buffer sizes, it is nice not to do too
+      // many level-0 compactions.
+      //
+      // (2) The files in level-0 are merged on every read and
+      // therefore we wish to avoid too many files when the individual
+      // file size is small (perhaps because of a small write-buffer
+      // setting, or very high compression ratios, or lots of
+      // overwrites/deletions).
+      int num_sorted_runs = 0;
+      uint64_t total_size = 0;
+      for (auto* f : files_[level]) {
+        total_downcompact_bytes += static_cast<double>(f->fd.GetFileSize());
+        if (!f->being_compacted) {
+          total_size += f->compensated_file_size;
+          num_sorted_runs++;
+        }
+      }
+      if (compaction_style_ == kCompactionStyleUniversal) {
+        // For universal compaction, we use level0 score to indicate
+        // compaction score for the whole DB. Adding other levels as if
+        // they are L0 files.
+        for (int i = 1; i < num_levels(); i++) {
+          // Its possible that a subset of the files in a level may be in a
+          // compaction, due to delete triggered compaction or trivial move.
+          // In that case, the below check may not catch a level being
+          // compacted as it only checks the first file. The worst that can
+          // happen is a scheduled compaction thread will find nothing to do.
+          if (!files_[i].empty() && !files_[i][0]->being_compacted) {
+            num_sorted_runs++;
+          }
+        }
+      }
+
+      if (compaction_style_ == kCompactionStyleFIFO) {
+        score = static_cast<double>(total_size) /
+                mutable_cf_options.compaction_options_fifo.max_table_files_size;
+        if (mutable_cf_options.compaction_options_fifo.allow_compaction ||
+            mutable_cf_options.compaction_options_fifo.age_for_warm > 0) {
+          // Warm tier move can happen at any time. It's too expensive to
+          // check very file's timestamp now. For now, just trigger it
+          // slightly more frequently than FIFO compaction so that this
+          // happens first.
+          score = std::max(
+              static_cast<double>(num_sorted_runs) /
+                  mutable_cf_options.level0_file_num_compaction_trigger,
+              score);
+        }
+        if (mutable_cf_options.ttl > 0) {
+          score = std::max(
+              static_cast<double>(GetExpiredTtlFilesCount(
+                  immutable_options, mutable_cf_options, files_[level])),
+              score);
+        }
+      } else {
+        score = static_cast<double>(num_sorted_runs) /
+                mutable_cf_options.level0_file_num_compaction_trigger;
+        if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+          // Level-based involves L0->L0 compactions that can lead to oversized
+          // L0 files. Take into account size as well to avoid later giant
+          // compactions to the base level.
+          // If score in L0 is always too high, L0->L1 will always be
+          // prioritized over L1->L2 compaction and L1 will accumulate to
+          // too large. But if L0 score isn't high enough, L0 will accumulate
+          // and data is not moved to L1 fast enough. With potential L0->L0
+          // compaction, number of L0 files aren't always an indication of
+          // L0 oversizing, and we also need to consider total size of L0.
+          if (immutable_options.level_compaction_dynamic_level_bytes) {
+            if (total_size >= mutable_cf_options.max_bytes_for_level_base) {
+              // When calculating estimated_compaction_needed_bytes, we assume
+              // L0 is qualified as pending compactions. We will need to make
+              // sure that it qualifies for compaction.
+              // It might be guafanteed by logic below anyway, but we are
+              // explicit here to make sure we don't stop writes with no
+              // compaction scheduled.
+              score = std::max(score, 1.01);
+            }
+            if (total_size > level_max_bytes_[base_level_]) {
+              // In this case, we compare L0 size with actual L1 size and make
+              // sure score is more than 1.0 (10.0 after scaled) if L0 is larger
+              // than L1. Since in this case L1 score is lower than 10.0, L0->L1
+              // is prioritized over L1->L2.
+              uint64_t base_level_size = 0;
+              for (auto f : files_[base_level_]) {
+                base_level_size += f->compensated_file_size;
+              }
+              score = std::max(score, static_cast<double>(total_size) /
+                                          static_cast<double>(std::max(
+                                              base_level_size,
+                                              level_max_bytes_[base_level_])));
+            }
+            if (score > 1.0) {
+              score *= kScoreScale;
+            }
+          } else {
+            score = std::max(score,
+                             static_cast<double>(total_size) /
+                                 mutable_cf_options.max_bytes_for_level_base);
+          }
+        }
+      }
+    } else {
+      // Compute the ratio of current size to size limit.
+      uint64_t level_bytes_no_compacting = 0;
+      uint64_t level_total_bytes = 0;
+      for (auto f : files_[level]) {
+        level_total_bytes += f->fd.GetFileSize();
+        if (!f->being_compacted) {
+          level_bytes_no_compacting += f->compensated_file_size;
+        }
+      }
+      if (!immutable_options.level_compaction_dynamic_level_bytes ||
+          level_bytes_no_compacting < MaxBytesForLevel(level)) {
+        score = static_cast<double>(level_bytes_no_compacting) /
+                MaxBytesForLevel(level);
+      } else {
+        // If there are a large mount of data being compacted down to the
+        // current level soon, we would de-prioritize compaction from
+        // a level where the incoming data would be a large ratio. We do
+        // it by dividing level size not by target level size, but
+        // the target size and the incoming compaction bytes.
+        score = static_cast<double>(level_bytes_no_compacting) /
+                (MaxBytesForLevel(level) + total_downcompact_bytes) *
+                kScoreScale;
+      }
+      if (level_total_bytes > MaxBytesForLevel(level)) {
+        total_downcompact_bytes +=
+            static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
+      }
+    }
+    compaction_level_[level] = level;
+    compaction_score_[level] = score;
+  }
+
+  // sort all the levels based on their score. Higher scores get listed
+  // first. Use bubble sort because the number of entries are small.
+  for (int i = 0; i < num_levels() - 2; i++) {
+    for (int j = i + 1; j < num_levels() - 1; j++) {
+      if (compaction_score_[i] < compaction_score_[j]) {
+        double score = compaction_score_[i];
+        int level = compaction_level_[i];
+        compaction_score_[i] = compaction_score_[j];
+        compaction_level_[i] = compaction_level_[j];
+        compaction_score_[j] = score;
+        compaction_level_[j] = level;
+      }
+    }
+  }
+  ComputeFilesMarkedForCompaction();
+  if (!immutable_options.allow_ingest_behind) {
+    ComputeBottommostFilesMarkedForCompaction();
+  }
+  if (mutable_cf_options.ttl > 0) {
+    ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
+  }
+  if (mutable_cf_options.periodic_compaction_seconds > 0) {
+    ComputeFilesMarkedForPeriodicCompaction(
+        immutable_options, mutable_cf_options.periodic_compaction_seconds);
+  }
+
+  if (mutable_cf_options.enable_blob_garbage_collection &&
+      mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 &&
+      mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) {
+    ComputeFilesMarkedForForcedBlobGC(
+        mutable_cf_options.blob_garbage_collection_age_cutoff,
+        mutable_cf_options.blob_garbage_collection_force_threshold);
+  }
+
+  EstimateCompactionBytesNeeded(mutable_cf_options);
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
+  files_marked_for_compaction_.clear();
+  int last_qualify_level = 0;
+
+  // Do not include files from the last level with data
+  // If table properties collector suggests a file on the last level,
+  // we should not move it to a new level.
+  for (int level = num_levels() - 1; level >= 1; level--) {
+    if (!files_[level].empty()) {
+      last_qualify_level = level - 1;
+      break;
+    }
+  }
+
+  for (int level = 0; level <= last_qualify_level; level++) {
+    for (auto* f : files_[level]) {
+      if (!f->being_compacted && f->marked_for_compaction) {
+        files_marked_for_compaction_.emplace_back(level, f);
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::ComputeExpiredTtlFiles(
+    const ImmutableOptions& ioptions, const uint64_t ttl) {
+  assert(ttl > 0);
+
+  expired_ttl_files_.clear();
+
+  int64_t _current_time;
+  auto status = ioptions.clock->GetCurrentTime(&_current_time);
+  if (!status.ok()) {
+    return;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+  for (int level = 0; level < num_levels() - 1; level++) {
+    for (FileMetaData* f : files_[level]) {
+      if (!f->being_compacted) {
+        uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+        if (oldest_ancester_time > 0 &&
+            oldest_ancester_time < (current_time - ttl)) {
+          expired_ttl_files_.emplace_back(level, f);
+        }
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
+    const ImmutableOptions& ioptions,
+    const uint64_t periodic_compaction_seconds) {
+  assert(periodic_compaction_seconds > 0);
+
+  files_marked_for_periodic_compaction_.clear();
+
+  int64_t temp_current_time;
+  auto status = ioptions.clock->GetCurrentTime(&temp_current_time);
+  if (!status.ok()) {
+    return;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+
+  // If periodic_compaction_seconds is larger than current time, periodic
+  // compaction can't possibly be triggered.
+  if (periodic_compaction_seconds > current_time) {
+    return;
+  }
+
+  const uint64_t allowed_time_limit =
+      current_time - periodic_compaction_seconds;
+
+  for (int level = 0; level < num_levels(); level++) {
+    for (auto f : files_[level]) {
+      if (!f->being_compacted) {
+        // Compute a file's modification time in the following order:
+        // 1. Use file_creation_time table property if it is > 0.
+        // 2. Use creation_time table property if it is > 0.
+        // 3. Use file's mtime metadata if the above two table properties are 0.
+        // Don't consider the file at all if the modification time cannot be
+        // correctly determined based on the above conditions.
+        uint64_t file_modification_time = f->TryGetFileCreationTime();
+        if (file_modification_time == kUnknownFileCreationTime) {
+          file_modification_time = f->TryGetOldestAncesterTime();
+        }
+        if (file_modification_time == kUnknownOldestAncesterTime) {
+          auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(),
+                                         f->fd.GetPathId());
+          status = ioptions.env->GetFileModificationTime(
+              file_path, &file_modification_time);
+          if (!status.ok()) {
+            ROCKS_LOG_WARN(ioptions.logger,
+                           "Can't get file modification time: %s: %s",
+                           file_path.c_str(), status.ToString().c_str());
+            continue;
+          }
+        }
+        if (file_modification_time > 0 &&
+            file_modification_time < allowed_time_limit) {
+          files_marked_for_periodic_compaction_.emplace_back(level, f);
+        }
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC(
+    double blob_garbage_collection_age_cutoff,
+    double blob_garbage_collection_force_threshold) {
+  files_marked_for_forced_blob_gc_.clear();
+
+  if (blob_files_.empty()) {
+    return;
+  }
+
+  // Number of blob files eligible for GC based on age
+  const size_t cutoff_count = static_cast<size_t>(
+      blob_garbage_collection_age_cutoff * blob_files_.size());
+  if (!cutoff_count) {
+    return;
+  }
+
+  // Compute the sum of total and garbage bytes over the oldest batch of blob
+  // files. The oldest batch is defined as the set of blob files which are
+  // kept alive by the same SSTs as the very oldest one. Here is a toy example.
+  // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11,
+  // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and
+  // potentially some higher-numbered ones, while SST 3 relies on blob file 12
+  // and potentially some higher-numbered ones. Then, the SST to oldest blob
+  // file mapping is as follows:
+  //
+  // SST file number               Oldest blob file number
+  // 1                             10
+  // 2                             10
+  // 3                             12
+  //
+  // This is what the same thing looks like from the blob files' POV. (Note that
+  // the linked SSTs simply denote the inverse mapping of the above.)
+  //
+  // Blob file number              Linked SST set
+  // 10                            {1, 2}
+  // 11                            {}
+  // 12                            {3}
+  // 13                            {}
+  //
+  // Then, the oldest batch of blob files consists of blob files 10 and 11,
+  // and we can get rid of them by forcing the compaction of SSTs 1 and 2.
+  //
+  // Note that the overall ratio of garbage computed for the batch has to exceed
+  // blob_garbage_collection_force_threshold and the entire batch has to be
+  // eligible for GC according to blob_garbage_collection_age_cutoff in order
+  // for us to schedule any compactions.
+  const auto& oldest_meta = blob_files_.front();
+  assert(oldest_meta);
+
+  const auto& linked_ssts = oldest_meta->GetLinkedSsts();
+  assert(!linked_ssts.empty());
+
+  size_t count = 1;
+  uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes();
+  uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes();
+
+  assert(cutoff_count <= blob_files_.size());
+
+  for (; count < cutoff_count; ++count) {
+    const auto& meta = blob_files_[count];
+    assert(meta);
+
+    if (!meta->GetLinkedSsts().empty()) {
+      // Found the beginning of the next batch of blob files
+      break;
+    }
+
+    sum_total_blob_bytes += meta->GetTotalBlobBytes();
+    sum_garbage_blob_bytes += meta->GetGarbageBlobBytes();
+  }
+
+  if (count < blob_files_.size()) {
+    const auto& meta = blob_files_[count];
+    assert(meta);
+
+    if (meta->GetLinkedSsts().empty()) {
+      // Some files in the oldest batch are not eligible for GC
+      return;
+    }
+  }
+
+  if (sum_garbage_blob_bytes <
+      blob_garbage_collection_force_threshold * sum_total_blob_bytes) {
+    return;
+  }
+
+  for (uint64_t sst_file_number : linked_ssts) {
+    const FileLocation location = GetFileLocation(sst_file_number);
+    assert(location.IsValid());
+
+    const int level = location.GetLevel();
+    assert(level >= 0);
+
+    const size_t pos = location.GetPosition();
+
+    FileMetaData* const sst_meta = files_[level][pos];
+    assert(sst_meta);
+
+    if (sst_meta->being_compacted) {
+      continue;
+    }
+
+    files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta);
+  }
+}
+
+namespace {
+
+// used to sort files by size
+struct Fsize {
+  size_t index;
+  FileMetaData* file;
+};
+
+// Comparator that is used to sort files based on their size
+// In normal mode: descending size
+bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
+  return (first.file->compensated_file_size >
+          second.file->compensated_file_size);
+}
+}  // anonymous namespace
+
+void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
+  auto& level_files = files_[level];
+  level_files.push_back(f);
+
+  f->refs++;
+}
+
+void VersionStorageInfo::AddBlobFile(
+    std::shared_ptr<BlobFileMetaData> blob_file_meta) {
+  assert(blob_file_meta);
+
+  assert(blob_files_.empty() ||
+         (blob_files_.back() && blob_files_.back()->GetBlobFileNumber() <
+                                    blob_file_meta->GetBlobFileNumber()));
+
+  blob_files_.emplace_back(std::move(blob_file_meta));
+}
+
+VersionStorageInfo::BlobFiles::const_iterator
+VersionStorageInfo::GetBlobFileMetaDataLB(uint64_t blob_file_number) const {
+  return std::lower_bound(
+      blob_files_.begin(), blob_files_.end(), blob_file_number,
+      [](const std::shared_ptr<BlobFileMetaData>& lhs, uint64_t rhs) {
+        assert(lhs);
+        return lhs->GetBlobFileNumber() < rhs;
+      });
+}
+
+void VersionStorageInfo::SetFinalized() {
+  finalized_ = true;
+
+#ifndef NDEBUG
+  if (compaction_style_ != kCompactionStyleLevel) {
+    // Not level based compaction.
+    return;
+  }
+  assert(base_level_ < 0 || num_levels() == 1 ||
+         (base_level_ >= 1 && base_level_ < num_levels()));
+  // Verify all levels newer than base_level are empty except L0
+  for (int level = 1; level < base_level(); level++) {
+    assert(NumLevelBytes(level) == 0);
+  }
+  uint64_t max_bytes_prev_level = 0;
+  for (int level = base_level(); level < num_levels() - 1; level++) {
+    if (LevelFiles(level).size() == 0) {
+      continue;
+    }
+    assert(MaxBytesForLevel(level) >= max_bytes_prev_level);
+    max_bytes_prev_level = MaxBytesForLevel(level);
+  }
+  for (int level = 0; level < num_levels(); level++) {
+    assert(LevelFiles(level).size() == 0 ||
+           LevelFiles(level).size() == LevelFilesBrief(level).num_files);
+    if (LevelFiles(level).size() > 0) {
+      assert(level < num_non_empty_levels());
+    }
+  }
+  assert(compaction_level_.size() > 0);
+  assert(compaction_level_.size() == compaction_score_.size());
+#endif
+}
+
+void VersionStorageInfo::UpdateNumNonEmptyLevels() {
+  num_non_empty_levels_ = num_levels_;
+  for (int i = num_levels_ - 1; i >= 0; i--) {
+    if (files_[i].size() != 0) {
+      return;
+    } else {
+      num_non_empty_levels_ = i;
+    }
+  }
+}
+
+namespace {
+// Sort `temp` based on ratio of overlapping size over file size
+void SortFileByOverlappingRatio(
+    const InternalKeyComparator& icmp, const std::vector<FileMetaData*>& files,
+    const std::vector<FileMetaData*>& next_level_files, SystemClock* clock,
+    int level, int num_non_empty_levels, uint64_t ttl,
+    std::vector<Fsize>* temp) {
+  std::unordered_map<uint64_t, uint64_t> file_to_order;
+  auto next_level_it = next_level_files.begin();
+
+  int64_t curr_time;
+  Status status = clock->GetCurrentTime(&curr_time);
+  if (!status.ok()) {
+    // If we can't get time, disable TTL.
+    ttl = 0;
+  }
+
+  FileTtlBooster ttl_booster(static_cast<uint64_t>(curr_time), ttl,
+                             num_non_empty_levels, level);
+
+  for (auto& file : files) {
+    uint64_t overlapping_bytes = 0;
+    // Skip files in next level that is smaller than current file
+    while (next_level_it != next_level_files.end() &&
+           icmp.Compare((*next_level_it)->largest, file->smallest) < 0) {
+      next_level_it++;
+    }
+
+    while (next_level_it != next_level_files.end() &&
+           icmp.Compare((*next_level_it)->smallest, file->largest) < 0) {
+      overlapping_bytes += (*next_level_it)->fd.file_size;
+
+      if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) {
+        // next level file cross large boundary of current file.
+        break;
+      }
+      next_level_it++;
+    }
+
+    uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1;
+    assert(ttl_boost_score > 0);
+    assert(file->compensated_file_size != 0);
+    file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U /
+                                          file->compensated_file_size /
+                                          ttl_boost_score;
+  }
+
+  size_t num_to_sort = temp->size() > VersionStorageInfo::kNumberFilesToSort
+                           ? VersionStorageInfo::kNumberFilesToSort
+                           : temp->size();
+
+  std::partial_sort(temp->begin(), temp->begin() + num_to_sort, temp->end(),
+                    [&](const Fsize& f1, const Fsize& f2) -> bool {
+                      // If score is the same, pick file with smaller keys.
+                      // This makes the algorithm more deterministic, and also
+                      // help the trivial move case to have more files to
+                      // extend.
+                      if (file_to_order[f1.file->fd.GetNumber()] ==
+                          file_to_order[f2.file->fd.GetNumber()]) {
+                        return icmp.Compare(f1.file->smallest,
+                                            f2.file->smallest) < 0;
+                      }
+                      return file_to_order[f1.file->fd.GetNumber()] <
+                             file_to_order[f2.file->fd.GetNumber()];
+                    });
+}
+
+void SortFileByRoundRobin(const InternalKeyComparator& icmp,
+                          std::vector<InternalKey>* compact_cursor,
+                          bool level0_non_overlapping, int level,
+                          std::vector<Fsize>* temp) {
+  if (level == 0 && !level0_non_overlapping) {
+    // Using kOldestSmallestSeqFirst when level === 0, since the
+    // files may overlap (not fully sorted)
+    std::sort(temp->begin(), temp->end(),
+              [](const Fsize& f1, const Fsize& f2) -> bool {
+                return f1.file->fd.smallest_seqno < f2.file->fd.smallest_seqno;
+              });
+    return;
+  }
+
+  bool should_move_files =
+      compact_cursor->at(level).size() > 0 && temp->size() > 1;
+
+  // The iterator points to the Fsize with smallest key larger than or equal to
+  // the given cursor
+  std::vector<Fsize>::iterator current_file_iter;
+  if (should_move_files) {
+    // Find the file of which the smallest key is larger than or equal to
+    // the cursor (the smallest key in the successor file of the last
+    // chosen file), skip this if the cursor is invalid or there is only
+    // one file in this level
+    current_file_iter = std::lower_bound(
+        temp->begin(), temp->end(), compact_cursor->at(level),
+        [&](const Fsize& f, const InternalKey& cursor) -> bool {
+          return icmp.Compare(cursor, f.file->smallest) > 0;
+        });
+
+    should_move_files =
+        current_file_iter != temp->end() && current_file_iter != temp->begin();
+  }
+  if (should_move_files) {
+    // Construct a local temporary vector
+    std::vector<Fsize> local_temp;
+    local_temp.reserve(temp->size());
+    // Move the selected File into the first position and its successors
+    // into the second, third, ..., positions
+    for (auto iter = current_file_iter; iter != temp->end(); iter++) {
+      local_temp.push_back(*iter);
+    }
+    // Move the origin predecessors of the selected file in a round-robin
+    // manner
+    for (auto iter = temp->begin(); iter != current_file_iter; iter++) {
+      local_temp.push_back(*iter);
+    }
+    // Replace all the items in temp
+    for (size_t i = 0; i < local_temp.size(); i++) {
+      temp->at(i) = local_temp[i];
+    }
+  }
+}
+}  // anonymous namespace
+
+void VersionStorageInfo::UpdateFilesByCompactionPri(
+    const ImmutableOptions& ioptions, const MutableCFOptions& options) {
+  if (compaction_style_ == kCompactionStyleNone ||
+      compaction_style_ == kCompactionStyleFIFO ||
+      compaction_style_ == kCompactionStyleUniversal) {
+    // don't need this
+    return;
+  }
+  // No need to sort the highest level because it is never compacted.
+  for (int level = 0; level < num_levels() - 1; level++) {
+    const std::vector<FileMetaData*>& files = files_[level];
+    auto& files_by_compaction_pri = files_by_compaction_pri_[level];
+    assert(files_by_compaction_pri.size() == 0);
+
+    // populate a temp vector for sorting based on size
+    std::vector<Fsize> temp(files.size());
+    for (size_t i = 0; i < files.size(); i++) {
+      temp[i].index = i;
+      temp[i].file = files[i];
+    }
+
+    // sort the top number_of_files_to_sort_ based on file size
+    size_t num = VersionStorageInfo::kNumberFilesToSort;
+    if (num > temp.size()) {
+      num = temp.size();
+    }
+    switch (ioptions.compaction_pri) {
+      case kByCompensatedSize:
+        std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+                          CompareCompensatedSizeDescending);
+        break;
+      case kOldestLargestSeqFirst:
+        std::sort(temp.begin(), temp.end(),
+                  [](const Fsize& f1, const Fsize& f2) -> bool {
+                    return f1.file->fd.largest_seqno <
+                           f2.file->fd.largest_seqno;
+                  });
+        break;
+      case kOldestSmallestSeqFirst:
+        std::sort(temp.begin(), temp.end(),
+                  [](const Fsize& f1, const Fsize& f2) -> bool {
+                    return f1.file->fd.smallest_seqno <
+                           f2.file->fd.smallest_seqno;
+                  });
+        break;
+      case kMinOverlappingRatio:
+        SortFileByOverlappingRatio(*internal_comparator_, files_[level],
+                                   files_[level + 1], ioptions.clock, level,
+                                   num_non_empty_levels_, options.ttl, &temp);
+        break;
+      case kRoundRobin:
+        SortFileByRoundRobin(*internal_comparator_, &compact_cursor_,
+                             level0_non_overlapping_, level, &temp);
+        break;
+      default:
+        assert(false);
+    }
+    assert(temp.size() == files.size());
+
+    // initialize files_by_compaction_pri_
+    for (size_t i = 0; i < temp.size(); i++) {
+      files_by_compaction_pri.push_back(static_cast<int>(temp[i].index));
+    }
+    next_file_to_compact_by_size_[level] = 0;
+    assert(files_[level].size() == files_by_compaction_pri_[level].size());
+  }
+}
+
+void VersionStorageInfo::GenerateLevel0NonOverlapping() {
+  assert(!finalized_);
+  level0_non_overlapping_ = true;
+  if (level_files_brief_.size() == 0) {
+    return;
+  }
+
+  // A copy of L0 files sorted by smallest key
+  std::vector<FdWithKeyRange> level0_sorted_file(
+      level_files_brief_[0].files,
+      level_files_brief_[0].files + level_files_brief_[0].num_files);
+  std::sort(level0_sorted_file.begin(), level0_sorted_file.end(),
+            [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool {
+              return (internal_comparator_->Compare(f1.smallest_key,
+                                                    f2.smallest_key) < 0);
+            });
+
+  for (size_t i = 1; i < level0_sorted_file.size(); ++i) {
+    FdWithKeyRange& f = level0_sorted_file[i];
+    FdWithKeyRange& prev = level0_sorted_file[i - 1];
+    if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) {
+      level0_non_overlapping_ = false;
+      break;
+    }
+  }
+}
+
+void VersionStorageInfo::GenerateBottommostFiles() {
+  assert(!finalized_);
+  assert(bottommost_files_.empty());
+  for (size_t level = 0; level < level_files_brief_.size(); ++level) {
+    for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files;
+         ++file_idx) {
+      const FdWithKeyRange& f = level_files_brief_[level].files[file_idx];
+      int l0_file_idx;
+      if (level == 0) {
+        l0_file_idx = static_cast<int>(file_idx);
+      } else {
+        l0_file_idx = -1;
+      }
+      Slice smallest_user_key = ExtractUserKey(f.smallest_key);
+      Slice largest_user_key = ExtractUserKey(f.largest_key);
+      if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key,
+                                         static_cast<int>(level),
+                                         l0_file_idx)) {
+        bottommost_files_.emplace_back(static_cast<int>(level),
+                                       f.file_metadata);
+      }
+    }
+  }
+}
+
+void VersionStorageInfo::GenerateFileLocationIndex() {
+  size_t num_files = 0;
+
+  for (int level = 0; level < num_levels_; ++level) {
+    num_files += files_[level].size();
+  }
+
+  file_locations_.reserve(num_files);
+
+  for (int level = 0; level < num_levels_; ++level) {
+    for (size_t pos = 0; pos < files_[level].size(); ++pos) {
+      const FileMetaData* const meta = files_[level][pos];
+      assert(meta);
+
+      const uint64_t file_number = meta->fd.GetNumber();
+
+      assert(file_locations_.find(file_number) == file_locations_.end());
+      file_locations_.emplace(file_number, FileLocation(level, pos));
+    }
+  }
+}
+
+void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) {
+  assert(seqnum >= oldest_snapshot_seqnum_);
+  oldest_snapshot_seqnum_ = seqnum;
+  if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) {
+    ComputeBottommostFilesMarkedForCompaction();
+  }
+}
+
+void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() {
+  bottommost_files_marked_for_compaction_.clear();
+  bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+  for (auto& level_and_file : bottommost_files_) {
+    if (!level_and_file.second->being_compacted &&
+        level_and_file.second->fd.largest_seqno != 0) {
+      // largest_seqno might be nonzero due to containing the final key in an
+      // earlier compaction, whose seqnum we didn't zero out. Multiple deletions
+      // ensures the file really contains deleted or overwritten keys.
+      if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
+        bottommost_files_marked_for_compaction_.push_back(level_and_file);
+      } else {
+        bottommost_files_mark_threshold_ =
+            std::min(bottommost_files_mark_threshold_,
+                     level_and_file.second->fd.largest_seqno);
+      }
+    }
+  }
+}
+
+void Version::Ref() { ++refs_; }
+
+bool Version::Unref() {
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    delete this;
+    return true;
+  }
+  return false;
+}
+
+bool VersionStorageInfo::OverlapInLevel(int level,
+                                        const Slice* smallest_user_key,
+                                        const Slice* largest_user_key) {
+  if (level >= num_non_empty_levels_) {
+    // empty level, no overlap
+    return false;
+  }
+  return SomeFileOverlapsRange(*internal_comparator_, (level > 0),
+                               level_files_brief_[level], smallest_user_key,
+                               largest_user_key);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// If hint_index is specified, then it points to a file in the
+// overlapping range.
+// The file_index returns a pointer to any file in an overlapping range.
+void VersionStorageInfo::GetOverlappingInputs(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+    bool expand_range, InternalKey** next_smallest) const {
+  if (level >= num_non_empty_levels_) {
+    // this level is empty, no overlapping inputs
+    return;
+  }
+
+  inputs->clear();
+  if (file_index) {
+    *file_index = -1;
+  }
+  const Comparator* user_cmp = user_comparator_;
+  if (level > 0) {
+    GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+                                          file_index, false, next_smallest);
+    return;
+  }
+
+  if (next_smallest) {
+    // next_smallest key only makes sense for non-level 0, where files are
+    // non-overlapping
+    *next_smallest = nullptr;
+  }
+
+  Slice user_begin, user_end;
+  if (begin != nullptr) {
+    user_begin = begin->user_key();
+  }
+  if (end != nullptr) {
+    user_end = end->user_key();
+  }
+
+  // index stores the file index need to check.
+  std::list<size_t> index;
+  for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+    index.emplace_back(i);
+  }
+
+  while (!index.empty()) {
+    bool found_overlapping_file = false;
+    auto iter = index.begin();
+    while (iter != index.end()) {
+      FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
+      const Slice file_start = ExtractUserKey(f->smallest_key);
+      const Slice file_limit = ExtractUserKey(f->largest_key);
+      if (begin != nullptr &&
+          user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) {
+        // "f" is completely before specified range; skip it
+        iter++;
+      } else if (end != nullptr &&
+                 user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) {
+        // "f" is completely after specified range; skip it
+        iter++;
+      } else {
+        // if overlap
+        inputs->emplace_back(files_[level][*iter]);
+        found_overlapping_file = true;
+        // record the first file index.
+        if (file_index && *file_index == -1) {
+          *file_index = static_cast<int>(*iter);
+        }
+        // the related file is overlap, erase to avoid checking again.
+        iter = index.erase(iter);
+        if (expand_range) {
+          if (begin != nullptr &&
+              user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
+            user_begin = file_start;
+          }
+          if (end != nullptr &&
+              user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
+            user_end = file_limit;
+          }
+        }
+      }
+    }
+    // if all the files left are not overlap, break
+    if (!found_overlapping_file) {
+      break;
+    }
+  }
+}
+
+// Store in "*inputs" files in "level" that within range [begin,end]
+// Guarantee a "clean cut" boundary between the files in inputs
+// and the surrounding files and the maxinum number of files.
+// This will ensure that no parts of a key are lost during compaction.
+// If hint_index is specified, then it points to a file in the range.
+// The file_index returns a pointer to any file in an overlapping range.
+void VersionStorageInfo::GetCleanInputsWithinInterval(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
+  inputs->clear();
+  if (file_index) {
+    *file_index = -1;
+  }
+  if (level >= num_non_empty_levels_ || level == 0 ||
+      level_files_brief_[level].num_files == 0) {
+    // this level is empty, no inputs within range
+    // also don't support clean input interval within L0
+    return;
+  }
+
+  GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+                                        file_index, true /* within_interval */);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+// if within_range is set, then only store the maximum clean inputs
+// within range [begin, end]. "clean" means there is a boundary
+// between the files in "*inputs" and the surrounding files
+void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
+    int level, const InternalKey* begin, const InternalKey* end,
+    std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+    bool within_interval, InternalKey** next_smallest) const {
+  assert(level > 0);
+
+  auto user_cmp = user_comparator_;
+  const FdWithKeyRange* files = level_files_brief_[level].files;
+  const int num_files = static_cast<int>(level_files_brief_[level].num_files);
+
+  // begin to use binary search to find lower bound
+  // and upper bound.
+  int start_index = 0;
+  int end_index = num_files;
+
+  if (begin != nullptr) {
+    // if within_interval is true, with file_key would find
+    // not overlapping ranges in std::lower_bound.
+    auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f,
+                                             const InternalKey* k) {
+      auto& file_key = within_interval ? f.file_metadata->smallest
+                                       : f.file_metadata->largest;
+      return sstableKeyCompare(user_cmp, file_key, *k) < 0;
+    };
+
+    start_index = static_cast<int>(
+        std::lower_bound(files,
+                         files + (hint_index == -1 ? num_files : hint_index),
+                         begin, cmp) -
+        files);
+
+    if (start_index > 0 && within_interval) {
+      bool is_overlapping = true;
+      while (is_overlapping && start_index < num_files) {
+        auto& pre_limit = files[start_index - 1].file_metadata->largest;
+        auto& cur_start = files[start_index].file_metadata->smallest;
+        is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0;
+        start_index += is_overlapping;
+      }
+    }
+  }
+
+  if (end != nullptr) {
+    // if within_interval is true, with file_key would find
+    // not overlapping ranges in std::upper_bound.
+    auto cmp = [&user_cmp, &within_interval](const InternalKey* k,
+                                             const FdWithKeyRange& f) {
+      auto& file_key = within_interval ? f.file_metadata->largest
+                                       : f.file_metadata->smallest;
+      return sstableKeyCompare(user_cmp, *k, file_key) < 0;
+    };
+
+    end_index = static_cast<int>(
+        std::upper_bound(files + start_index, files + num_files, end, cmp) -
+        files);
+
+    if (end_index < num_files && within_interval) {
+      bool is_overlapping = true;
+      while (is_overlapping && end_index > start_index) {
+        auto& next_start = files[end_index].file_metadata->smallest;
+        auto& cur_limit = files[end_index - 1].file_metadata->largest;
+        is_overlapping =
+            sstableKeyCompare(user_cmp, cur_limit, next_start) == 0;
+        end_index -= is_overlapping;
+      }
+    }
+  }
+
+  assert(start_index <= end_index);
+
+  // If there were no overlapping files, return immediately.
+  if (start_index == end_index) {
+    if (next_smallest) {
+      *next_smallest = nullptr;
+    }
+    return;
+  }
+
+  assert(start_index < end_index);
+
+  // returns the index where an overlap is found
+  if (file_index) {
+    *file_index = start_index;
+  }
+
+  // insert overlapping files into vector
+  for (int i = start_index; i < end_index; i++) {
+    inputs->push_back(files_[level][i]);
+  }
+
+  if (next_smallest != nullptr) {
+    // Provide the next key outside the range covered by inputs
+    if (end_index < static_cast<int>(files_[level].size())) {
+      **next_smallest = files_[level][end_index]->smallest;
+    } else {
+      *next_smallest = nullptr;
+    }
+  }
+}
+
+uint64_t VersionStorageInfo::NumLevelBytes(int level) const {
+  assert(level >= 0);
+  assert(level < num_levels());
+  return TotalFileSize(files_[level]);
+}
+
+const char* VersionStorageInfo::LevelSummary(
+    LevelSummaryStorage* scratch) const {
+  int len = 0;
+  if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+    assert(base_level_ < static_cast<int>(level_max_bytes_.size()));
+    if (level_multiplier_ != 0.0) {
+      len = snprintf(
+          scratch->buffer, sizeof(scratch->buffer),
+          "base level %d level multiplier %.2f max bytes base %" PRIu64 " ",
+          base_level_, level_multiplier_, level_max_bytes_[base_level_]);
+    }
+  }
+  len +=
+      snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files[");
+  for (int i = 0; i < num_levels(); i++) {
+    int sz = sizeof(scratch->buffer) - len;
+    int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
+    if (ret < 0 || ret >= sz) break;
+    len += ret;
+  }
+  if (len > 0) {
+    // overwrite the last space
+    --len;
+  }
+  len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+                  "] max score %.2f", compaction_score_[0]);
+
+  if (!files_marked_for_compaction_.empty()) {
+    snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+             " (%" ROCKSDB_PRIszt " files need compaction)",
+             files_marked_for_compaction_.size());
+  }
+
+  return scratch->buffer;
+}
+
+const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
+                                                 int level) const {
+  int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
+  for (const auto& f : files_[level]) {
+    int sz = sizeof(scratch->buffer) - len;
+    char sztxt[16];
+    AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt));
+    int ret = snprintf(scratch->buffer + len, sz,
+                       "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
+                       f->fd.GetNumber(), f->fd.smallest_seqno, sztxt,
+                       static_cast<int>(f->being_compacted));
+    if (ret < 0 || ret >= sz) break;
+    len += ret;
+  }
+  // overwrite the last space (only if files_[level].size() is non-zero)
+  if (files_[level].size() && len > 0) {
+    --len;
+  }
+  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+  return scratch->buffer;
+}
+
+uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
+  uint64_t result = 0;
+  std::vector<FileMetaData*> overlaps;
+  for (int level = 1; level < num_levels() - 1; level++) {
+    for (const auto& f : files_[level]) {
+      GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
+      const uint64_t sum = TotalFileSize(overlaps);
+      if (sum > result) {
+        result = sum;
+      }
+    }
+  }
+  return result;
+}
+
+uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  assert(level >= 0);
+  assert(level < static_cast<int>(level_max_bytes_.size()));
+  return level_max_bytes_[level];
+}
+
+void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
+                                            const MutableCFOptions& options) {
+  // Special logic to set number of sorted runs.
+  // It is to match the previous behavior when all files are in L0.
+  int num_l0_count = static_cast<int>(files_[0].size());
+  if (compaction_style_ == kCompactionStyleUniversal) {
+    // For universal compaction, we use level0 score to indicate
+    // compaction score for the whole DB. Adding other levels as if
+    // they are L0 files.
+    for (int i = 1; i < num_levels(); i++) {
+      if (!files_[i].empty()) {
+        num_l0_count++;
+      }
+    }
+  }
+  set_l0_delay_trigger_count(num_l0_count);
+
+  level_max_bytes_.resize(ioptions.num_levels);
+  if (!ioptions.level_compaction_dynamic_level_bytes) {
+    base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1;
+
+    // Calculate for static bytes base case
+    for (int i = 0; i < ioptions.num_levels; ++i) {
+      if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
+        level_max_bytes_[i] = options.max_bytes_for_level_base;
+      } else if (i > 1) {
+        level_max_bytes_[i] = MultiplyCheckOverflow(
+            MultiplyCheckOverflow(level_max_bytes_[i - 1],
+                                  options.max_bytes_for_level_multiplier),
+            options.MaxBytesMultiplerAdditional(i - 1));
+      } else {
+        level_max_bytes_[i] = options.max_bytes_for_level_base;
+      }
+    }
+  } else {
+    uint64_t max_level_size = 0;
+
+    int first_non_empty_level = -1;
+    // Find size of non-L0 level of most data.
+    // Cannot use the size of the last level because it can be empty or less
+    // than previous levels after compaction.
+    for (int i = 1; i < num_levels_; i++) {
+      uint64_t total_size = 0;
+      for (const auto& f : files_[i]) {
+        total_size += f->fd.GetFileSize();
+      }
+      if (total_size > 0 && first_non_empty_level == -1) {
+        first_non_empty_level = i;
+      }
+      if (total_size > max_level_size) {
+        max_level_size = total_size;
+      }
+    }
+
+    // Prefill every level's max bytes to disallow compaction from there.
+    for (int i = 0; i < num_levels_; i++) {
+      level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
+    }
+
+    if (max_level_size == 0) {
+      // No data for L1 and up. L0 compacts to last level directly.
+      // No compaction from L1+ needs to be scheduled.
+      base_level_ = num_levels_ - 1;
+    } else {
+      uint64_t base_bytes_max = options.max_bytes_for_level_base;
+      uint64_t base_bytes_min = static_cast<uint64_t>(
+          base_bytes_max / options.max_bytes_for_level_multiplier);
+
+      // Try whether we can make last level's target size to be max_level_size
+      uint64_t cur_level_size = max_level_size;
+      for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
+        // Round up after dividing
+        cur_level_size = static_cast<uint64_t>(
+            cur_level_size / options.max_bytes_for_level_multiplier);
+      }
+
+      // Calculate base level and its size.
+      uint64_t base_level_size;
+      if (cur_level_size <= base_bytes_min) {
+        // Case 1. If we make target size of last level to be max_level_size,
+        // target size of the first non-empty level would be smaller than
+        // base_bytes_min. We set it be base_bytes_min.
+        base_level_size = base_bytes_min + 1U;
+        base_level_ = first_non_empty_level;
+        ROCKS_LOG_INFO(ioptions.logger,
+                       "More existing levels in DB than needed. "
+                       "max_bytes_for_level_multiplier may not be guaranteed.");
+      } else {
+        // Find base level (where L0 data is compacted to).
+        base_level_ = first_non_empty_level;
+        while (base_level_ > 1 && cur_level_size > base_bytes_max) {
+          --base_level_;
+          cur_level_size = static_cast<uint64_t>(
+              cur_level_size / options.max_bytes_for_level_multiplier);
+        }
+        if (cur_level_size > base_bytes_max) {
+          // Even L1 will be too large
+          assert(base_level_ == 1);
+          base_level_size = base_bytes_max;
+        } else {
+          base_level_size = cur_level_size;
+        }
+      }
+
+      level_multiplier_ = options.max_bytes_for_level_multiplier;
+      assert(base_level_size > 0);
+
+      uint64_t level_size = base_level_size;
+      for (int i = base_level_; i < num_levels_; i++) {
+        if (i > base_level_) {
+          level_size = MultiplyCheckOverflow(level_size, level_multiplier_);
+        }
+        // Don't set any level below base_bytes_max. Otherwise, the LSM can
+        // assume an hourglass shape where L1+ sizes are smaller than L0. This
+        // causes compaction scoring, which depends on level sizes, to favor L1+
+        // at the expense of L0, which may fill up and stall.
+        level_max_bytes_[i] = std::max(level_size, base_bytes_max);
+      }
+    }
+  }
+}
+
+uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
+  // Estimate the live data size by adding up the size of a maximal set of
+  // sst files with no range overlap in same or higher level. The less
+  // compacted, the more optimistic (smaller) this estimate is. Also,
+  // for multiple sorted runs within a level, file order will matter.
+  uint64_t size = 0;
+
+  auto ikey_lt = [this](InternalKey* x, InternalKey* y) {
+    return internal_comparator_->Compare(*x, *y) < 0;
+  };
+  // (Ordered) map of largest keys in files being included in size estimate
+  std::map<InternalKey*, FileMetaData*, decltype(ikey_lt)> ranges(ikey_lt);
+
+  for (int l = num_levels_ - 1; l >= 0; l--) {
+    bool found_end = false;
+    for (auto file : files_[l]) {
+      // Find the first file already included with largest key is larger than
+      // the smallest key of `file`. If that file does not overlap with the
+      // current file, none of the files in the map does. If there is
+      // no potential overlap, we can safely insert the rest of this level
+      // (if the level is not 0) into the map without checking again because
+      // the elements in the level are sorted and non-overlapping.
+      auto lb = (found_end && l != 0) ? ranges.end()
+                                      : ranges.lower_bound(&file->smallest);
+      found_end = (lb == ranges.end());
+      if (found_end || internal_comparator_->Compare(
+                           file->largest, (*lb).second->smallest) < 0) {
+        ranges.emplace_hint(lb, &file->largest, file);
+        size += file->fd.file_size;
+      }
+    }
+  }
+
+  // For BlobDB, the result also includes the exact value of live bytes in the
+  // blob files of the version.
+  for (const auto& meta : blob_files_) {
+    assert(meta);
+
+    size += meta->GetTotalBlobBytes();
+    size -= meta->GetGarbageBlobBytes();
+  }
+
+  return size;
+}
+
+bool VersionStorageInfo::RangeMightExistAfterSortedRun(
+    const Slice& smallest_user_key, const Slice& largest_user_key,
+    int last_level, int last_l0_idx) {
+  assert((last_l0_idx != -1) == (last_level == 0));
+  // TODO(ajkr): this preserves earlier behavior where we considered an L0 file
+  // bottommost only if it's the oldest L0 file and there are no files on older
+  // levels. It'd be better to consider it bottommost if there's no overlap in
+  // older levels/files.
+  if (last_level == 0 &&
+      last_l0_idx != static_cast<int>(LevelFiles(0).size() - 1)) {
+    return true;
+  }
+
+  // Checks whether there are files living beyond the `last_level`. If lower
+  // levels have files, it checks for overlap between [`smallest_key`,
+  // `largest_key`] and those files. Bottomlevel optimizations can be made if
+  // there are no files in lower levels or if there is no overlap with the files
+  // in the lower levels.
+  for (int level = last_level + 1; level < num_levels(); level++) {
+    // The range is not in the bottommost level if there are files in lower
+    // levels when the `last_level` is 0 or if there are files in lower levels
+    // which overlap with [`smallest_key`, `largest_key`].
+    if (files_[level].size() > 0 &&
+        (last_level == 0 ||
+         OverlapInLevel(level, &smallest_user_key, &largest_user_key))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
+                           std::vector<uint64_t>* live_blob_files) const {
+  assert(live_table_files);
+  assert(live_blob_files);
+
+  for (int level = 0; level < storage_info_.num_levels(); ++level) {
+    const auto& level_files = storage_info_.LevelFiles(level);
+    for (const auto& meta : level_files) {
+      assert(meta);
+
+      live_table_files->emplace_back(meta->fd.GetNumber());
+    }
+  }
+
+  const auto& blob_files = storage_info_.GetBlobFiles();
+  for (const auto& meta : blob_files) {
+    assert(meta);
+
+    live_blob_files->emplace_back(meta->GetBlobFileNumber());
+  }
+}
+
+void Version::RemoveLiveFiles(
+    std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+    std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const {
+  for (ObsoleteFileInfo& fi : sst_delete_candidates) {
+    if (!fi.only_delete_metadata &&
+        storage_info()->GetFileLocation(fi.metadata->fd.GetNumber()) !=
+            VersionStorageInfo::FileLocation::Invalid()) {
+      fi.only_delete_metadata = true;
+    }
+  }
+
+  blob_delete_candidates.erase(
+      std::remove_if(
+          blob_delete_candidates.begin(), blob_delete_candidates.end(),
+          [this](ObsoleteBlobFileInfo& x) {
+            return storage_info()->GetBlobFileMetaData(x.GetBlobFileNumber());
+          }),
+      blob_delete_candidates.end());
+}
+
+std::string Version::DebugString(bool hex, bool print_stats) const {
+  std::string r;
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    // E.g.,
+    //   --- level 1 ---
+    //   17:123[1 .. 124]['a' .. 'd']
+    //   20:43[124 .. 128]['e' .. 'g']
+    //
+    // if print_stats=true:
+    //   17:123[1 .. 124]['a' .. 'd'](4096)
+    r.append("--- level ");
+    AppendNumberTo(&r, level);
+    r.append(" --- version# ");
+    AppendNumberTo(&r, version_number_);
+    if (storage_info_.compact_cursor_[level].Valid()) {
+      r.append(" --- compact_cursor: ");
+      r.append(storage_info_.compact_cursor_[level].DebugString(hex));
+    }
+    r.append(" ---\n");
+    const std::vector<FileMetaData*>& files = storage_info_.files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      r.push_back(' ');
+      AppendNumberTo(&r, files[i]->fd.GetNumber());
+      r.push_back(':');
+      AppendNumberTo(&r, files[i]->fd.GetFileSize());
+      r.append("[");
+      AppendNumberTo(&r, files[i]->fd.smallest_seqno);
+      r.append(" .. ");
+      AppendNumberTo(&r, files[i]->fd.largest_seqno);
+      r.append("]");
+      r.append("[");
+      r.append(files[i]->smallest.DebugString(hex));
+      r.append(" .. ");
+      r.append(files[i]->largest.DebugString(hex));
+      r.append("]");
+      if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) {
+        r.append(" blob_file:");
+        AppendNumberTo(&r, files[i]->oldest_blob_file_number);
+      }
+      if (print_stats) {
+        r.append("(");
+        r.append(std::to_string(
+            files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed)));
+        r.append(")");
+      }
+      r.append("\n");
+    }
+  }
+
+  const auto& blob_files = storage_info_.GetBlobFiles();
+  if (!blob_files.empty()) {
+    r.append("--- blob files --- version# ");
+    AppendNumberTo(&r, version_number_);
+    r.append(" ---\n");
+    for (const auto& blob_file_meta : blob_files) {
+      assert(blob_file_meta);
+
+      r.append(blob_file_meta->DebugString());
+      r.push_back('\n');
+    }
+  }
+
+  return r;
+}
+
+// this is used to batch writes to the manifest file
+struct VersionSet::ManifestWriter {
+  Status status;
+  bool done;
+  InstrumentedCondVar cv;
+  ColumnFamilyData* cfd;
+  const MutableCFOptions mutable_cf_options;
+  const autovector<VersionEdit*>& edit_list;
+  const std::function<void(const Status&)> manifest_write_callback;
+
+  explicit ManifestWriter(
+      InstrumentedMutex* mu, ColumnFamilyData* _cfd,
+      const MutableCFOptions& cf_options, const autovector<VersionEdit*>& e,
+      const std::function<void(const Status&)>& manifest_wcb)
+      : done(false),
+        cv(mu),
+        cfd(_cfd),
+        mutable_cf_options(cf_options),
+        edit_list(e),
+        manifest_write_callback(manifest_wcb) {}
+  ~ManifestWriter() { status.PermitUncheckedError(); }
+
+  bool IsAllWalEdits() const {
+    bool all_wal_edits = true;
+    for (const auto& e : edit_list) {
+      if (!e->IsWalManipulation()) {
+        all_wal_edits = false;
+        break;
+      }
+    }
+    return all_wal_edits;
+  }
+};
+
+Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) {
+  assert(edit);
+  if (edit->is_in_atomic_group_) {
+    TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup");
+    if (replay_buffer_.empty()) {
+      replay_buffer_.resize(edit->remaining_entries_ + 1);
+      TEST_SYNC_POINT_CALLBACK(
+          "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit);
+    }
+    read_edits_in_atomic_group_++;
+    if (read_edits_in_atomic_group_ + edit->remaining_entries_ !=
+        static_cast<uint32_t>(replay_buffer_.size())) {
+      TEST_SYNC_POINT_CALLBACK(
+          "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit);
+      return Status::Corruption("corrupted atomic group");
+    }
+    replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit;
+    if (read_edits_in_atomic_group_ == replay_buffer_.size()) {
+      TEST_SYNC_POINT_CALLBACK(
+          "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit);
+      return Status::OK();
+    }
+    return Status::OK();
+  }
+
+  // A normal edit.
+  if (!replay_buffer().empty()) {
+    TEST_SYNC_POINT_CALLBACK(
+        "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit);
+    return Status::Corruption("corrupted atomic group");
+  }
+  return Status::OK();
+}
+
+bool AtomicGroupReadBuffer::IsFull() const {
+  return read_edits_in_atomic_group_ == replay_buffer_.size();
+}
+
+bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); }
+
+void AtomicGroupReadBuffer::Clear() {
+  read_edits_in_atomic_group_ = 0;
+  replay_buffer_.clear();
+}
+
+VersionSet::VersionSet(const std::string& dbname,
+                       const ImmutableDBOptions* _db_options,
+                       const FileOptions& storage_options, Cache* table_cache,
+                       WriteBufferManager* write_buffer_manager,
+                       WriteController* write_controller,
+                       BlockCacheTracer* const block_cache_tracer,
+                       const std::shared_ptr<IOTracer>& io_tracer,
+                       const std::string& db_id,
+                       const std::string& db_session_id)
+    : column_family_set_(new ColumnFamilySet(
+          dbname, _db_options, storage_options, table_cache,
+          write_buffer_manager, write_controller, block_cache_tracer, io_tracer,
+          db_id, db_session_id)),
+      table_cache_(table_cache),
+      env_(_db_options->env),
+      fs_(_db_options->fs, io_tracer),
+      clock_(_db_options->clock),
+      dbname_(dbname),
+      db_options_(_db_options),
+      next_file_number_(2),
+      manifest_file_number_(0),  // Filled by Recover()
+      options_file_number_(0),
+      options_file_size_(0),
+      pending_manifest_file_number_(0),
+      last_sequence_(0),
+      last_allocated_sequence_(0),
+      last_published_sequence_(0),
+      prev_log_number_(0),
+      current_version_number_(0),
+      manifest_file_size_(0),
+      file_options_(storage_options),
+      block_cache_tracer_(block_cache_tracer),
+      io_tracer_(io_tracer),
+      db_session_id_(db_session_id) {}
+
+VersionSet::~VersionSet() {
+  // we need to delete column_family_set_ because its destructor depends on
+  // VersionSet
+  column_family_set_.reset();
+  for (auto& file : obsolete_files_) {
+    if (file.metadata->table_reader_handle) {
+      table_cache_->Release(file.metadata->table_reader_handle);
+      TableCache::Evict(table_cache_, file.metadata->fd.GetNumber());
+    }
+    file.DeleteMetadata();
+  }
+  obsolete_files_.clear();
+  io_status_.PermitUncheckedError();
+}
+
+void VersionSet::Reset() {
+  if (column_family_set_) {
+    WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
+    WriteController* wc = column_family_set_->write_controller();
+    // db_id becomes the source of truth after DBImpl::Recover():
+    // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527
+    // Note: we may not be able to recover db_id from MANIFEST if
+    // options.write_dbid_to_manifest is false (default).
+    column_family_set_.reset(new ColumnFamilySet(
+        dbname_, db_options_, file_options_, table_cache_, wbm, wc,
+        block_cache_tracer_, io_tracer_, db_id_, db_session_id_));
+  }
+  db_id_.clear();
+  next_file_number_.store(2);
+  min_log_number_to_keep_.store(0);
+  manifest_file_number_ = 0;
+  options_file_number_ = 0;
+  pending_manifest_file_number_ = 0;
+  last_sequence_.store(0);
+  last_allocated_sequence_.store(0);
+  last_published_sequence_.store(0);
+  prev_log_number_ = 0;
+  descriptor_log_.reset();
+  current_version_number_ = 0;
+  manifest_writers_.clear();
+  manifest_file_size_ = 0;
+  obsolete_files_.clear();
+  obsolete_manifests_.clear();
+  wals_.Reset();
+}
+
+void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
+                               Version* v) {
+  // compute new compaction score
+  v->storage_info()->ComputeCompactionScore(
+      *column_family_data->ioptions(),
+      *column_family_data->GetLatestMutableCFOptions());
+
+  // Mark v finalized
+  v->storage_info_.SetFinalized();
+
+  // Make "v" current
+  assert(v->refs_ == 0);
+  Version* current = column_family_data->current();
+  assert(v != current);
+  if (current != nullptr) {
+    assert(current->refs_ > 0);
+    current->Unref();
+  }
+  column_family_data->SetCurrent(v);
+  v->Ref();
+
+  // Append to linked list
+  v->prev_ = column_family_data->dummy_versions()->prev_;
+  v->next_ = column_family_data->dummy_versions();
+  v->prev_->next_ = v;
+  v->next_->prev_ = v;
+}
+
+Status VersionSet::ProcessManifestWrites(
+    std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
+    FSDirectory* dir_contains_current_file, bool new_descriptor_log,
+    const ColumnFamilyOptions* new_cf_options) {
+  mu->AssertHeld();
+  assert(!writers.empty());
+  ManifestWriter& first_writer = writers.front();
+  ManifestWriter* last_writer = &first_writer;
+
+  assert(!manifest_writers_.empty());
+  assert(manifest_writers_.front() == &first_writer);
+
+  autovector<VersionEdit*> batch_edits;
+  autovector<Version*> versions;
+  autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
+  std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
+
+  // Tracking `max_last_sequence` is needed to ensure we write
+  // `VersionEdit::last_sequence_`s in non-decreasing order according to the
+  // recovery code's requirement. It also allows us to defer updating
+  // `descriptor_last_sequence_` until the apply phase, after the log phase
+  // succeeds.
+  SequenceNumber max_last_sequence = descriptor_last_sequence_;
+
+  if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+    // No group commits for column family add or drop
+    LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence);
+    batch_edits.push_back(first_writer.edit_list.front());
+  } else {
+    auto it = manifest_writers_.cbegin();
+    size_t group_start = std::numeric_limits<size_t>::max();
+    while (it != manifest_writers_.cend()) {
+      if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) {
+        // no group commits for column family add or drop
+        break;
+      }
+      last_writer = *(it++);
+      assert(last_writer != nullptr);
+      assert(last_writer->cfd != nullptr);
+      if (last_writer->cfd->IsDropped()) {
+        // If we detect a dropped CF at this point, and the corresponding
+        // version edits belong to an atomic group, then we need to find out
+        // the preceding version edits in the same atomic group, and update
+        // their `remaining_entries_` member variable because we are NOT going
+        // to write the version edits' of dropped CF to the MANIFEST. If we
+        // don't update, then Recover can report corrupted atomic group because
+        // the `remaining_entries_` do not match.
+        if (!batch_edits.empty()) {
+          if (batch_edits.back()->is_in_atomic_group_ &&
+              batch_edits.back()->remaining_entries_ > 0) {
+            assert(group_start < batch_edits.size());
+            const auto& edit_list = last_writer->edit_list;
+            size_t k = 0;
+            while (k < edit_list.size()) {
+              if (!edit_list[k]->is_in_atomic_group_) {
+                break;
+              } else if (edit_list[k]->remaining_entries_ == 0) {
+                ++k;
+                break;
+              }
+              ++k;
+            }
+            for (auto i = group_start; i < batch_edits.size(); ++i) {
+              assert(static_cast<uint32_t>(k) <=
+                     batch_edits.back()->remaining_entries_);
+              batch_edits[i]->remaining_entries_ -= static_cast<uint32_t>(k);
+            }
+          }
+        }
+        continue;
+      }
+      // We do a linear search on versions because versions is small.
+      // TODO(yanqin) maybe consider unordered_map
+      Version* version = nullptr;
+      VersionBuilder* builder = nullptr;
+      for (int i = 0; i != static_cast<int>(versions.size()); ++i) {
+        uint32_t cf_id = last_writer->cfd->GetID();
+        if (versions[i]->cfd()->GetID() == cf_id) {
+          version = versions[i];
+          assert(!builder_guards.empty() &&
+                 builder_guards.size() == versions.size());
+          builder = builder_guards[i]->version_builder();
+          TEST_SYNC_POINT_CALLBACK(
+              "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id);
+          break;
+        }
+      }
+      if (version == nullptr) {
+        // WAL manipulations do not need to be applied to versions.
+        if (!last_writer->IsAllWalEdits()) {
+          version = new Version(last_writer->cfd, this, file_options_,
+                                last_writer->mutable_cf_options, io_tracer_,
+                                current_version_number_++);
+          versions.push_back(version);
+          mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options);
+          builder_guards.emplace_back(
+              new BaseReferencedVersionBuilder(last_writer->cfd));
+          builder = builder_guards.back()->version_builder();
+        }
+        assert(last_writer->IsAllWalEdits() || builder);
+        assert(last_writer->IsAllWalEdits() || version);
+        TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion",
+                                 version);
+      }
+      for (const auto& e : last_writer->edit_list) {
+        if (e->is_in_atomic_group_) {
+          if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ ||
+              (batch_edits.back()->is_in_atomic_group_ &&
+               batch_edits.back()->remaining_entries_ == 0)) {
+            group_start = batch_edits.size();
+          }
+        } else if (group_start != std::numeric_limits<size_t>::max()) {
+          group_start = std::numeric_limits<size_t>::max();
+        }
+        Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
+                                     &max_last_sequence, mu);
+        if (!s.ok()) {
+          // free up the allocated memory
+          for (auto v : versions) {
+            delete v;
+          }
+          return s;
+        }
+        batch_edits.push_back(e);
+      }
+    }
+    for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+      assert(!builder_guards.empty() &&
+             builder_guards.size() == versions.size());
+      auto* builder = builder_guards[i]->version_builder();
+      Status s = builder->SaveTo(versions[i]->storage_info());
+      if (!s.ok()) {
+        // free up the allocated memory
+        for (auto v : versions) {
+          delete v;
+        }
+        return s;
+      }
+    }
+  }
+
+#ifndef NDEBUG
+  // Verify that version edits of atomic groups have correct
+  // remaining_entries_.
+  size_t k = 0;
+  while (k < batch_edits.size()) {
+    while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) {
+      ++k;
+    }
+    if (k == batch_edits.size()) {
+      break;
+    }
+    size_t i = k;
+    while (i < batch_edits.size()) {
+      if (!batch_edits[i]->is_in_atomic_group_) {
+        break;
+      }
+      assert(i - k + batch_edits[i]->remaining_entries_ ==
+             batch_edits[k]->remaining_entries_);
+      if (batch_edits[i]->remaining_entries_ == 0) {
+        ++i;
+        break;
+      }
+      ++i;
+    }
+    assert(batch_edits[i - 1]->is_in_atomic_group_);
+    assert(0 == batch_edits[i - 1]->remaining_entries_);
+    std::vector<VersionEdit*> tmp;
+    for (size_t j = k; j != i; ++j) {
+      tmp.emplace_back(batch_edits[j]);
+    }
+    TEST_SYNC_POINT_CALLBACK(
+        "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp);
+    k = i;
+  }
+#endif  // NDEBUG
+
+  assert(pending_manifest_file_number_ == 0);
+  if (!descriptor_log_ ||
+      manifest_file_size_ > db_options_->max_manifest_file_size) {
+    TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
+    new_descriptor_log = true;
+  } else {
+    pending_manifest_file_number_ = manifest_file_number_;
+  }
+
+  // Local cached copy of state variable(s). WriteCurrentStateToManifest()
+  // reads its content after releasing db mutex to avoid race with
+  // SwitchMemtable().
+  std::unordered_map<uint32_t, MutableCFState> curr_state;
+  VersionEdit wal_additions;
+  if (new_descriptor_log) {
+    pending_manifest_file_number_ = NewFileNumber();
+    batch_edits.back()->SetNextFile(next_file_number_.load());
+
+    // if we are writing out new snapshot make sure to persist max column
+    // family.
+    if (column_family_set_->GetMaxColumnFamily() > 0) {
+      first_writer.edit_list.front()->SetMaxColumnFamily(
+          column_family_set_->GetMaxColumnFamily());
+    }
+    for (const auto* cfd : *column_family_set_) {
+      assert(curr_state.find(cfd->GetID()) == curr_state.end());
+      curr_state.emplace(std::make_pair(
+          cfd->GetID(),
+          MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow())));
+    }
+
+    for (const auto& wal : wals_.GetWals()) {
+      wal_additions.AddWal(wal.first, wal.second);
+    }
+  }
+
+  uint64_t new_manifest_file_size = 0;
+  Status s;
+  IOStatus io_s;
+  IOStatus manifest_io_status;
+  {
+    FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
+    mu->Unlock();
+    TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
+    TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
+    if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+      for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+        assert(!builder_guards.empty() &&
+               builder_guards.size() == versions.size());
+        assert(!mutable_cf_options_ptrs.empty() &&
+               builder_guards.size() == versions.size());
+        ColumnFamilyData* cfd = versions[i]->cfd_;
+        s = builder_guards[i]->version_builder()->LoadTableHandlers(
+            cfd->internal_stats(), 1 /* max_threads */,
+            true /* prefetch_index_and_filter_in_cache */,
+            false /* is_initial_load */,
+            mutable_cf_options_ptrs[i]->prefix_extractor,
+            MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]));
+        if (!s.ok()) {
+          if (db_options_->paranoid_checks) {
+            break;
+          }
+          s = Status::OK();
+        }
+      }
+    }
+
+    if (s.ok() && new_descriptor_log) {
+      // This is fine because everything inside of this block is serialized --
+      // only one thread can be here at the same time
+      // create new manifest file
+      ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
+                     pending_manifest_file_number_);
+      std::string descriptor_fname =
+          DescriptorFileName(dbname_, pending_manifest_file_number_);
+      std::unique_ptr<FSWritableFile> descriptor_file;
+      io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
+                             opt_file_opts);
+      if (io_s.ok()) {
+        descriptor_file->SetPreallocationBlockSize(
+            db_options_->manifest_preallocation_size);
+        FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
+        std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+            std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
+            io_tracer_, nullptr, db_options_->listeners, nullptr,
+            tmp_set.Contains(FileType::kDescriptorFile),
+            tmp_set.Contains(FileType::kDescriptorFile)));
+        descriptor_log_.reset(
+            new log::Writer(std::move(file_writer), 0, false));
+        s = WriteCurrentStateToManifest(curr_state, wal_additions,
+                                        descriptor_log_.get(), io_s);
+      } else {
+        manifest_io_status = io_s;
+        s = io_s;
+      }
+    }
+
+    if (s.ok()) {
+      if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+        constexpr bool update_stats = true;
+
+        for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+          versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats);
+        }
+      }
+
+      // Write new records to MANIFEST log
+#ifndef NDEBUG
+      size_t idx = 0;
+#endif
+      for (auto& e : batch_edits) {
+        std::string record;
+        if (!e->EncodeTo(&record)) {
+          s = Status::Corruption("Unable to encode VersionEdit:" +
+                                 e->DebugString(true));
+          break;
+        }
+        TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord",
+                                     REDUCE_ODDS2);
+#ifndef NDEBUG
+        if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
+          TEST_SYNC_POINT_CALLBACK(
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+              nullptr);
+          TEST_SYNC_POINT(
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
+        }
+        ++idx;
+#endif /* !NDEBUG */
+        io_s = descriptor_log_->AddRecord(record);
+        if (!io_s.ok()) {
+          s = io_s;
+          manifest_io_status = io_s;
+          break;
+        }
+      }
+      if (s.ok()) {
+        io_s = SyncManifest(db_options_, descriptor_log_->file());
+        manifest_io_status = io_s;
+        TEST_SYNC_POINT_CALLBACK(
+            "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s);
+      }
+      if (!io_s.ok()) {
+        s = io_s;
+        ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
+                        s.ToString().c_str());
+      }
+    }
+
+    // If we just created a new descriptor file, install it by writing a
+    // new CURRENT file that points to it.
+    if (s.ok()) {
+      assert(manifest_io_status.ok());
+    }
+    if (s.ok() && new_descriptor_log) {
+      io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
+                            dir_contains_current_file);
+      if (!io_s.ok()) {
+        s = io_s;
+      }
+    }
+
+    if (s.ok()) {
+      // find offset in manifest file where this version is stored.
+      new_manifest_file_size = descriptor_log_->file()->GetFileSize();
+    }
+
+    if (first_writer.edit_list.front()->is_column_family_drop_) {
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
+    }
+
+    LogFlush(db_options_->info_log);
+    TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone");
+    mu->Lock();
+  }
+
+  if (s.ok()) {
+    // Apply WAL edits, DB mutex must be held.
+    for (auto& e : batch_edits) {
+      if (e->IsWalAddition()) {
+        s = wals_.AddWals(e->GetWalAdditions());
+      } else if (e->IsWalDeletion()) {
+        s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber());
+      }
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+
+  if (!io_s.ok()) {
+    if (io_status_.ok()) {
+      io_status_ = io_s;
+    }
+  } else if (!io_status_.ok()) {
+    io_status_ = io_s;
+  }
+
+  // Append the old manifest file to the obsolete_manifest_ list to be deleted
+  // by PurgeObsoleteFiles later.
+  if (s.ok() && new_descriptor_log) {
+    obsolete_manifests_.emplace_back(
+        DescriptorFileName("", manifest_file_number_));
+  }
+
+  // Install the new versions
+  if (s.ok()) {
+    if (first_writer.edit_list.front()->is_column_family_add_) {
+      assert(batch_edits.size() == 1);
+      assert(new_cf_options != nullptr);
+      assert(max_last_sequence == descriptor_last_sequence_);
+      CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
+    } else if (first_writer.edit_list.front()->is_column_family_drop_) {
+      assert(batch_edits.size() == 1);
+      assert(max_last_sequence == descriptor_last_sequence_);
+      first_writer.cfd->SetDropped();
+      first_writer.cfd->UnrefAndTryDelete();
+    } else {
+      // Each version in versions corresponds to a column family.
+      // For each column family, update its log number indicating that logs
+      // with number smaller than this should be ignored.
+      uint64_t last_min_log_number_to_keep = 0;
+      for (const auto& e : batch_edits) {
+        ColumnFamilyData* cfd = nullptr;
+        if (!e->IsColumnFamilyManipulation()) {
+          cfd = column_family_set_->GetColumnFamily(e->column_family_);
+          // e would not have been added to batch_edits if its corresponding
+          // column family is dropped.
+          assert(cfd);
+        }
+        if (cfd) {
+          if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) {
+            cfd->SetLogNumber(e->log_number_);
+          }
+          if (e->HasFullHistoryTsLow()) {
+            cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow());
+          }
+        }
+        if (e->has_min_log_number_to_keep_) {
+          last_min_log_number_to_keep =
+              std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_);
+        }
+      }
+
+      if (last_min_log_number_to_keep != 0) {
+        MarkMinLogNumberToKeep(last_min_log_number_to_keep);
+      }
+
+      for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+        ColumnFamilyData* cfd = versions[i]->cfd_;
+        AppendVersion(cfd, versions[i]);
+      }
+    }
+    assert(max_last_sequence >= descriptor_last_sequence_);
+    descriptor_last_sequence_ = max_last_sequence;
+    manifest_file_number_ = pending_manifest_file_number_;
+    manifest_file_size_ = new_manifest_file_size;
+    prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
+  } else {
+    std::string version_edits;
+    for (auto& e : batch_edits) {
+      version_edits += ("\n" + e->DebugString(true));
+    }
+    ROCKS_LOG_ERROR(db_options_->info_log,
+                    "Error in committing version edit to MANIFEST: %s",
+                    version_edits.c_str());
+    for (auto v : versions) {
+      delete v;
+    }
+    if (manifest_io_status.ok()) {
+      manifest_file_number_ = pending_manifest_file_number_;
+      manifest_file_size_ = new_manifest_file_size;
+    }
+    // If manifest append failed for whatever reason, the file could be
+    // corrupted. So we need to force the next version update to start a
+    // new manifest file.
+    descriptor_log_.reset();
+    // If manifest operations failed, then we know the CURRENT file still
+    // points to the original MANIFEST. Therefore, we can safely delete the
+    // new MANIFEST.
+    // If manifest operations succeeded, and we are here, then it is possible
+    // that renaming tmp file to CURRENT failed.
+    //
+    // On local POSIX-compliant FS, the CURRENT must point to the original
+    // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
+    // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
+    // process not to crash and continue using the db. Any future LogAndApply()
+    // call will switch to a new MANIFEST and update CURRENT, still ignoring
+    // this one.
+    //
+    // On non-local FS, it is
+    // possible that the rename operation succeeded on the server (remote)
+    // side, but the client somehow returns a non-ok status to RocksDB. Note
+    // that this does not violate atomicity. Should we delete the new MANIFEST
+    // successfully, a subsequent recovery attempt will likely see the CURRENT
+    // pointing to the new MANIFEST, thus fail. We will not be able to open the
+    // DB again. Therefore, if manifest operations succeed, we should keep the
+    // the new MANIFEST. If the process proceeds, any future LogAndApply() call
+    // will switch to a new MANIFEST and update CURRENT. If user tries to
+    // re-open the DB,
+    // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
+    // b) CURRENT points to the original MANIFEST, and the original MANIFEST
+    //    also exists.
+    if (new_descriptor_log && !manifest_io_status.ok()) {
+      ROCKS_LOG_INFO(db_options_->info_log,
+                     "Deleting manifest %" PRIu64 " current manifest %" PRIu64
+                     "\n",
+                     pending_manifest_file_number_, manifest_file_number_);
+      Status manifest_del_status = env_->DeleteFile(
+          DescriptorFileName(dbname_, pending_manifest_file_number_));
+      if (!manifest_del_status.ok()) {
+        ROCKS_LOG_WARN(db_options_->info_log,
+                       "Failed to delete manifest %" PRIu64 ": %s",
+                       pending_manifest_file_number_,
+                       manifest_del_status.ToString().c_str());
+      }
+    }
+  }
+
+  pending_manifest_file_number_ = 0;
+
+#ifndef NDEBUG
+  // This is here kind of awkwardly because there's no other consistency
+  // checks on `VersionSet`'s updates for the new `Version`s. We might want
+  // to move it to a dedicated function, or remove it if we gain enough
+  // confidence in `descriptor_last_sequence_`.
+  if (s.ok()) {
+    for (const auto* v : versions) {
+      const auto* vstorage = v->storage_info();
+      for (int level = 0; level < vstorage->num_levels(); ++level) {
+        for (const auto& file : vstorage->LevelFiles(level)) {
+          assert(file->fd.largest_seqno <= descriptor_last_sequence_);
+        }
+      }
+    }
+  }
+#endif  // NDEBUG
+
+  // wake up all the waiting writers
+  while (true) {
+    ManifestWriter* ready = manifest_writers_.front();
+    manifest_writers_.pop_front();
+    bool need_signal = true;
+    for (const auto& w : writers) {
+      if (&w == ready) {
+        need_signal = false;
+        break;
+      }
+    }
+    ready->status = s;
+    ready->done = true;
+    if (ready->manifest_write_callback) {
+      (ready->manifest_write_callback)(s);
+    }
+    if (need_signal) {
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) {
+      break;
+    }
+  }
+  if (!manifest_writers_.empty()) {
+    manifest_writers_.front()->cv.Signal();
+  }
+  return s;
+}
+
+void VersionSet::WakeUpWaitingManifestWriters() {
+  // wake up all the waiting writers
+  // Notify new head of manifest write queue.
+  if (!manifest_writers_.empty()) {
+    manifest_writers_.front()->cv.Signal();
+  }
+}
+
+// 'datas' is grammatically incorrect. We still use this notation to indicate
+// that this variable represents a collection of column_family_data.
+Status VersionSet::LogAndApply(
+    const autovector<ColumnFamilyData*>& column_family_datas,
+    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+    bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options,
+    const std::vector<std::function<void(const Status&)>>& manifest_wcbs) {
+  mu->AssertHeld();
+  int num_edits = 0;
+  for (const auto& elist : edit_lists) {
+    num_edits += static_cast<int>(elist.size());
+  }
+  if (num_edits == 0) {
+    return Status::OK();
+  } else if (num_edits > 1) {
+#ifndef NDEBUG
+    for (const auto& edit_list : edit_lists) {
+      for (const auto& edit : edit_list) {
+        assert(!edit->IsColumnFamilyManipulation());
+      }
+    }
+#endif /* ! NDEBUG */
+  }
+
+  int num_cfds = static_cast<int>(column_family_datas.size());
+  if (num_cfds == 1 && column_family_datas[0] == nullptr) {
+    assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
+    assert(edit_lists[0][0]->is_column_family_add_);
+    assert(new_cf_options != nullptr);
+  }
+  std::deque<ManifestWriter> writers;
+  if (num_cfds > 0) {
+    assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
+    assert(static_cast<size_t>(num_cfds) == edit_lists.size());
+  }
+  for (int i = 0; i < num_cfds; ++i) {
+    const auto wcb =
+        manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i];
+    writers.emplace_back(mu, column_family_datas[i],
+                         *mutable_cf_options_list[i], edit_lists[i], wcb);
+    manifest_writers_.push_back(&writers[i]);
+  }
+  assert(!writers.empty());
+  ManifestWriter& first_writer = writers.front();
+  TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting",
+                           nullptr);
+  while (!first_writer.done && &first_writer != manifest_writers_.front()) {
+    first_writer.cv.Wait();
+  }
+  if (first_writer.done) {
+    // All non-CF-manipulation operations can be grouped together and committed
+    // to MANIFEST. They should all have finished. The status code is stored in
+    // the first manifest writer.
+#ifndef NDEBUG
+    for (const auto& writer : writers) {
+      assert(writer.done);
+    }
+    TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu);
+#endif /* !NDEBUG */
+    return first_writer.status;
+  }
+
+  int num_undropped_cfds = 0;
+  for (auto cfd : column_family_datas) {
+    // if cfd == nullptr, it is a column family add.
+    if (cfd == nullptr || !cfd->IsDropped()) {
+      ++num_undropped_cfds;
+    }
+  }
+  if (0 == num_undropped_cfds) {
+    for (int i = 0; i != num_cfds; ++i) {
+      manifest_writers_.pop_front();
+    }
+    // Notify new head of manifest write queue.
+    if (!manifest_writers_.empty()) {
+      manifest_writers_.front()->cv.Signal();
+    }
+    return Status::ColumnFamilyDropped();
+  }
+  return ProcessManifestWrites(writers, mu, dir_contains_current_file,
+                               new_descriptor_log, new_cf_options);
+}
+
+void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
+                                     SequenceNumber* max_last_sequence) {
+  assert(max_last_sequence != nullptr);
+  assert(edit->IsColumnFamilyManipulation());
+  edit->SetNextFile(next_file_number_.load());
+  assert(!edit->HasLastSequence());
+  edit->SetLastSequence(*max_last_sequence);
+  if (edit->is_column_family_drop_) {
+    // if we drop column family, we have to make sure to save max column family,
+    // so that we don't reuse existing ID
+    edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
+  }
+}
+
+Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
+                                     VersionBuilder* builder, VersionEdit* edit,
+                                     SequenceNumber* max_last_sequence,
+                                     InstrumentedMutex* mu) {
+#ifdef NDEBUG
+  (void)cfd;
+#endif
+  mu->AssertHeld();
+  assert(!edit->IsColumnFamilyManipulation());
+  assert(max_last_sequence != nullptr);
+
+  if (edit->has_log_number_) {
+    assert(edit->log_number_ >= cfd->GetLogNumber());
+    assert(edit->log_number_ < next_file_number_.load());
+  }
+
+  if (!edit->has_prev_log_number_) {
+    edit->SetPrevLogNumber(prev_log_number_);
+  }
+  edit->SetNextFile(next_file_number_.load());
+  if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) {
+    *max_last_sequence = edit->GetLastSequence();
+  } else {
+    edit->SetLastSequence(*max_last_sequence);
+  }
+
+  // The builder can be nullptr only if edit is WAL manipulation,
+  // because WAL edits do not need to be applied to versions,
+  // we return Status::OK() in this case.
+  assert(builder || edit->IsWalManipulation());
+  return builder ? builder->Apply(edit) : Status::OK();
+}
+
+Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
+                                          FileSystem* fs,
+                                          std::string* manifest_path,
+                                          uint64_t* manifest_file_number) {
+  assert(fs != nullptr);
+  assert(manifest_path != nullptr);
+  assert(manifest_file_number != nullptr);
+
+  std::string fname;
+  Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname);
+  if (!s.ok()) {
+    return s;
+  }
+  if (fname.empty() || fname.back() != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  // remove the trailing '\n'
+  fname.resize(fname.size() - 1);
+  FileType type;
+  bool parse_ok = ParseFileName(fname, manifest_file_number, &type);
+  if (!parse_ok || type != kDescriptorFile) {
+    return Status::Corruption("CURRENT file corrupted");
+  }
+  *manifest_path = dbname;
+  if (dbname.back() != '/') {
+    manifest_path->push_back('/');
+  }
+  manifest_path->append(fname);
+  return Status::OK();
+}
+
+Status VersionSet::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    std::string* db_id, bool no_error_if_files_missing) {
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string manifest_path;
+  Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
+                                    &manifest_file_number_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n",
+                 manifest_path.c_str());
+
+  std::unique_ptr<SequentialFileReader> manifest_file_reader;
+  {
+    std::unique_ptr<FSSequentialFile> manifest_file;
+    s = fs_->NewSequentialFile(manifest_path,
+                               fs_->OptimizeForManifestRead(file_options_),
+                               &manifest_file, nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+    manifest_file_reader.reset(new SequentialFileReader(
+        std::move(manifest_file), manifest_path,
+        db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+  }
+  uint64_t current_manifest_file_size = 0;
+  uint64_t log_number = 0;
+  {
+    VersionSet::LogReporter reporter;
+    Status log_read_status;
+    reporter.status = &log_read_status;
+    log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+                       true /* checksum */, 0 /* log_number */);
+    VersionEditHandler handler(
+        read_only, column_families, const_cast<VersionSet*>(this),
+        /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_);
+    handler.Iterate(reader, &log_read_status);
+    s = handler.status();
+    if (s.ok()) {
+      log_number = handler.GetVersionEditParams().log_number_;
+      current_manifest_file_size = reader.GetReadOffset();
+      assert(current_manifest_file_size != 0);
+      handler.GetDbId(db_id);
+    }
+  }
+
+  if (s.ok()) {
+    manifest_file_size_ = current_manifest_file_size;
+    ROCKS_LOG_INFO(
+        db_options_->info_log,
+        "Recovered from manifest file:%s succeeded,"
+        "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64
+        ", last_sequence is %" PRIu64 ", log_number is %" PRIu64
+        ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32
+        ",min_log_number_to_keep is %" PRIu64 "\n",
+        manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
+        last_sequence_.load(), log_number, prev_log_number_,
+        column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep());
+
+    for (auto cfd : *column_family_set_) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      ROCKS_LOG_INFO(db_options_->info_log,
+                     "Column family [%s] (ID %" PRIu32
+                     "), log number is %" PRIu64 "\n",
+                     cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+    }
+  }
+
+  return s;
+}
+
+namespace {
+class ManifestPicker {
+ public:
+  explicit ManifestPicker(const std::string& dbname,
+                          const std::vector<std::string>& files_in_dbname);
+  // REQUIRES Valid() == true
+  std::string GetNextManifest(uint64_t* file_number, std::string* file_name);
+  bool Valid() const { return manifest_file_iter_ != manifest_files_.end(); }
+
+ private:
+  const std::string& dbname_;
+  // MANIFEST file names(s)
+  std::vector<std::string> manifest_files_;
+  std::vector<std::string>::const_iterator manifest_file_iter_;
+};
+
+ManifestPicker::ManifestPicker(const std::string& dbname,
+                               const std::vector<std::string>& files_in_dbname)
+    : dbname_(dbname) {
+  // populate manifest files
+  assert(!files_in_dbname.empty());
+  for (const auto& fname : files_in_dbname) {
+    uint64_t file_num = 0;
+    FileType file_type;
+    bool parse_ok = ParseFileName(fname, &file_num, &file_type);
+    if (parse_ok && file_type == kDescriptorFile) {
+      manifest_files_.push_back(fname);
+    }
+  }
+  // seek to first manifest
+  std::sort(manifest_files_.begin(), manifest_files_.end(),
+            [](const std::string& lhs, const std::string& rhs) {
+              uint64_t num1 = 0;
+              uint64_t num2 = 0;
+              FileType type1;
+              FileType type2;
+              bool parse_ok1 = ParseFileName(lhs, &num1, &type1);
+              bool parse_ok2 = ParseFileName(rhs, &num2, &type2);
+#ifndef NDEBUG
+              assert(parse_ok1);
+              assert(parse_ok2);
+#else
+              (void)parse_ok1;
+              (void)parse_ok2;
+#endif
+              return num1 > num2;
+            });
+  manifest_file_iter_ = manifest_files_.begin();
+}
+
+std::string ManifestPicker::GetNextManifest(uint64_t* number,
+                                            std::string* file_name) {
+  assert(Valid());
+  std::string ret;
+  if (manifest_file_iter_ != manifest_files_.end()) {
+    ret.assign(dbname_);
+    if (ret.back() != kFilePathSeparator) {
+      ret.push_back(kFilePathSeparator);
+    }
+    ret.append(*manifest_file_iter_);
+    if (number) {
+      FileType type;
+      bool parse = ParseFileName(*manifest_file_iter_, number, &type);
+      assert(type == kDescriptorFile);
+#ifndef NDEBUG
+      assert(parse);
+#else
+      (void)parse;
+#endif
+    }
+    if (file_name) {
+      *file_name = *manifest_file_iter_;
+    }
+    ++manifest_file_iter_;
+  }
+  return ret;
+}
+}  // anonymous namespace
+
+Status VersionSet::TryRecover(
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    const std::vector<std::string>& files_in_dbname, std::string* db_id,
+    bool* has_missing_table_file) {
+  ManifestPicker manifest_picker(dbname_, files_in_dbname);
+  if (!manifest_picker.Valid()) {
+    return Status::Corruption("Cannot locate MANIFEST file in " + dbname_);
+  }
+  Status s;
+  std::string manifest_path =
+      manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+  while (!manifest_path.empty()) {
+    s = TryRecoverFromOneManifest(manifest_path, column_families, read_only,
+                                  db_id, has_missing_table_file);
+    if (s.ok() || !manifest_picker.Valid()) {
+      break;
+    }
+    Reset();
+    manifest_path =
+        manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+  }
+  return s;
+}
+
+Status VersionSet::TryRecoverFromOneManifest(
+    const std::string& manifest_path,
+    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+    std::string* db_id, bool* has_missing_table_file) {
+  ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n",
+                 manifest_path.c_str());
+  std::unique_ptr<SequentialFileReader> manifest_file_reader;
+  Status s;
+  {
+    std::unique_ptr<FSSequentialFile> manifest_file;
+    s = fs_->NewSequentialFile(manifest_path,
+                               fs_->OptimizeForManifestRead(file_options_),
+                               &manifest_file, nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+    manifest_file_reader.reset(new SequentialFileReader(
+        std::move(manifest_file), manifest_path,
+        db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+  }
+
+  assert(s.ok());
+  VersionSet::LogReporter reporter;
+  reporter.status = &s;
+  log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+                     /*checksum=*/true, /*log_num=*/0);
+  VersionEditHandlerPointInTime handler_pit(
+      read_only, column_families, const_cast<VersionSet*>(this), io_tracer_);
+
+  handler_pit.Iterate(reader, &s);
+
+  handler_pit.GetDbId(db_id);
+
+  assert(nullptr != has_missing_table_file);
+  *has_missing_table_file = handler_pit.HasMissingFiles();
+
+  return handler_pit.status();
+}
+
+Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
+                                      const std::string& dbname,
+                                      FileSystem* fs) {
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string manifest_path;
+  uint64_t manifest_file_number;
+  Status s =
+      GetCurrentManifestPath(dbname, fs, &manifest_path, &manifest_file_number);
+  if (!s.ok()) {
+    return s;
+  }
+  return ListColumnFamiliesFromManifest(manifest_path, fs, column_families);
+}
+
+Status VersionSet::ListColumnFamiliesFromManifest(
+    const std::string& manifest_path, FileSystem* fs,
+    std::vector<std::string>* column_families) {
+  std::unique_ptr<SequentialFileReader> file_reader;
+  Status s;
+  {
+    std::unique_ptr<FSSequentialFile> file;
+    // these are just for performance reasons, not correctness,
+    // so we're fine using the defaults
+    s = fs->NewSequentialFile(manifest_path, FileOptions(), &file, nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+    file_reader = std::make_unique<SequentialFileReader>(
+        std::move(file), manifest_path, /*io_tracer=*/nullptr);
+  }
+
+  VersionSet::LogReporter reporter;
+  reporter.status = &s;
+  log::Reader reader(nullptr, std::move(file_reader), &reporter,
+                     true /* checksum */, 0 /* log_number */);
+
+  ListColumnFamiliesHandler handler;
+  handler.Iterate(reader, &s);
+
+  assert(column_families);
+  column_families->clear();
+  if (handler.status().ok()) {
+    for (const auto& iter : handler.GetColumnFamilyNames()) {
+      column_families->push_back(iter.second);
+    }
+  }
+
+  return handler.status();
+}
+
+#ifndef ROCKSDB_LITE
+Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
+                                        const Options* options,
+                                        const FileOptions& file_options,
+                                        int new_levels) {
+  if (new_levels <= 1) {
+    return Status::InvalidArgument(
+        "Number of levels needs to be bigger than 1");
+  }
+
+  ImmutableDBOptions db_options(*options);
+  ColumnFamilyOptions cf_options(*options);
+  std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
+                                        options->table_cache_numshardbits));
+  WriteController wc(options->delayed_write_rate);
+  WriteBufferManager wb(options->db_write_buffer_size);
+  VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
+                      nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
+                      /*db_id*/ "",
+                      /*db_session_id*/ "");
+  Status status;
+
+  std::vector<ColumnFamilyDescriptor> dummy;
+  ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+                                          ColumnFamilyOptions(*options));
+  dummy.push_back(dummy_descriptor);
+  status = versions.Recover(dummy);
+  if (!status.ok()) {
+    return status;
+  }
+
+  Version* current_version =
+      versions.GetColumnFamilySet()->GetDefault()->current();
+  auto* vstorage = current_version->storage_info();
+  int current_levels = vstorage->num_levels();
+
+  if (current_levels <= new_levels) {
+    return Status::OK();
+  }
+
+  // Make sure there are file only on one level from
+  // (new_levels-1) to (current_levels-1)
+  int first_nonempty_level = -1;
+  int first_nonempty_level_filenum = 0;
+  for (int i = new_levels - 1; i < current_levels; i++) {
+    int file_num = vstorage->NumLevelFiles(i);
+    if (file_num != 0) {
+      if (first_nonempty_level < 0) {
+        first_nonempty_level = i;
+        first_nonempty_level_filenum = file_num;
+      } else {
+        char msg[255];
+        snprintf(msg, sizeof(msg),
+                 "Found at least two levels containing files: "
+                 "[%d:%d],[%d:%d].\n",
+                 first_nonempty_level, first_nonempty_level_filenum, i,
+                 file_num);
+        return Status::InvalidArgument(msg);
+      }
+    }
+  }
+
+  // we need to allocate an array with the old number of levels size to
+  // avoid SIGSEGV in WriteCurrentStatetoManifest()
+  // however, all levels bigger or equal to new_levels will be empty
+  std::vector<FileMetaData*>* new_files_list =
+      new std::vector<FileMetaData*>[current_levels];
+  for (int i = 0; i < new_levels - 1; i++) {
+    new_files_list[i] = vstorage->LevelFiles(i);
+  }
+
+  if (first_nonempty_level > 0) {
+    auto& new_last_level = new_files_list[new_levels - 1];
+
+    new_last_level = vstorage->LevelFiles(first_nonempty_level);
+
+    for (size_t i = 0; i < new_last_level.size(); ++i) {
+      const FileMetaData* const meta = new_last_level[i];
+      assert(meta);
+
+      const uint64_t file_number = meta->fd.GetNumber();
+
+      vstorage->file_locations_[file_number] =
+          VersionStorageInfo::FileLocation(new_levels - 1, i);
+    }
+  }
+
+  delete[] vstorage->files_;
+  vstorage->files_ = new_files_list;
+  vstorage->num_levels_ = new_levels;
+  vstorage->ResizeCompactCursors(new_levels);
+
+  MutableCFOptions mutable_cf_options(*options);
+  VersionEdit ve;
+  InstrumentedMutex dummy_mutex;
+  InstrumentedMutexLock l(&dummy_mutex);
+  return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(),
+                              mutable_cf_options, &ve, &dummy_mutex, nullptr,
+                              true);
+}
+
+// Get the checksum information including the checksum and checksum function
+// name of all SST and blob files in VersionSet. Store the information in
+// FileChecksumList which contains a map from file number to its checksum info.
+// If DB is not running, make sure call VersionSet::Recover() to load the file
+// metadata from Manifest to VersionSet before calling this function.
+Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+  // Clean the previously stored checksum information if any.
+  Status s;
+  if (checksum_list == nullptr) {
+    s = Status::InvalidArgument("checksum_list is nullptr");
+    return s;
+  }
+  checksum_list->reset();
+
+  for (auto cfd : *column_family_set_) {
+    assert(cfd);
+
+    if (cfd->IsDropped() || !cfd->initialized()) {
+      continue;
+    }
+
+    const auto* current = cfd->current();
+    assert(current);
+
+    const auto* vstorage = current->storage_info();
+    assert(vstorage);
+
+    /* SST files */
+    for (int level = 0; level < cfd->NumberLevels(); level++) {
+      const auto& level_files = vstorage->LevelFiles(level);
+
+      for (const auto& file : level_files) {
+        assert(file);
+
+        s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(),
+                                                 file->file_checksum,
+                                                 file->file_checksum_func_name);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+    }
+
+    /* Blob files */
+    const auto& blob_files = vstorage->GetBlobFiles();
+    for (const auto& meta : blob_files) {
+      assert(meta);
+
+      std::string checksum_value = meta->GetChecksumValue();
+      std::string checksum_method = meta->GetChecksumMethod();
+      assert(checksum_value.empty() == checksum_method.empty());
+      if (meta->GetChecksumMethod().empty()) {
+        checksum_value = kUnknownFileChecksum;
+        checksum_method = kUnknownFileChecksumFuncName;
+      }
+
+      s = checksum_list->InsertOneFileChecksum(meta->GetBlobFileNumber(),
+                                               checksum_value, checksum_method);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  return s;
+}
+
+Status VersionSet::DumpManifest(Options& options, std::string& dscname,
+                                bool verbose, bool hex, bool json) {
+  assert(options.env);
+  std::vector<std::string> column_families;
+  Status s = ListColumnFamiliesFromManifest(
+      dscname, options.env->GetFileSystem().get(), &column_families);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Open the specified manifest file.
+  std::unique_ptr<SequentialFileReader> file_reader;
+  {
+    std::unique_ptr<FSSequentialFile> file;
+    const std::shared_ptr<FileSystem>& fs = options.env->GetFileSystem();
+    s = fs->NewSequentialFile(
+        dscname, fs->OptimizeForManifestRead(file_options_), &file, nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+    file_reader = std::make_unique<SequentialFileReader>(
+        std::move(file), dscname, db_options_->log_readahead_size, io_tracer_);
+  }
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  for (const auto& cf : column_families) {
+    cf_descs.emplace_back(cf, options);
+  }
+
+  DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json);
+  {
+    VersionSet::LogReporter reporter;
+    reporter.status = &s;
+    log::Reader reader(nullptr, std::move(file_reader), &reporter,
+                       true /* checksum */, 0 /* log_number */);
+    handler.Iterate(reader, &s);
+  }
+
+  return handler.status();
+}
+#endif  // ROCKSDB_LITE
+
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+  // only called during recovery and repair which are single threaded, so this
+  // works because there can't be concurrent calls
+  if (next_file_number_.load(std::memory_order_relaxed) <= number) {
+    next_file_number_.store(number + 1, std::memory_order_relaxed);
+  }
+}
+// Called only either from ::LogAndApply which is protected by mutex or during
+// recovery which is single-threaded.
+void VersionSet::MarkMinLogNumberToKeep(uint64_t number) {
+  if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) {
+    min_log_number_to_keep_.store(number, std::memory_order_relaxed);
+  }
+}
+
+Status VersionSet::WriteCurrentStateToManifest(
+    const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+    const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) {
+  // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+  // WARNING: This method doesn't hold a mutex!!
+
+  // This is done without DB mutex lock held, but only within single-threaded
+  // LogAndApply. Column family manipulations can only happen within LogAndApply
+  // (the same single thread), so we're safe to iterate.
+
+  assert(io_s.ok());
+  if (db_options_->write_dbid_to_manifest) {
+    VersionEdit edit_for_db_id;
+    assert(!db_id_.empty());
+    edit_for_db_id.SetDBId(db_id_);
+    std::string db_id_record;
+    if (!edit_for_db_id.EncodeTo(&db_id_record)) {
+      return Status::Corruption("Unable to Encode VersionEdit:" +
+                                edit_for_db_id.DebugString(true));
+    }
+    io_s = log->AddRecord(db_id_record);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  // Save WALs.
+  if (!wal_additions.GetWalAdditions().empty()) {
+    TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal",
+                             const_cast<VersionEdit*>(&wal_additions));
+    std::string record;
+    if (!wal_additions.EncodeTo(&record)) {
+      return Status::Corruption("Unable to Encode VersionEdit: " +
+                                wal_additions.DebugString(true));
+    }
+    io_s = log->AddRecord(record);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  for (auto cfd : *column_family_set_) {
+    assert(cfd);
+
+    if (cfd->IsDropped()) {
+      continue;
+    }
+    assert(cfd->initialized());
+    {
+      // Store column family info
+      VersionEdit edit;
+      if (cfd->GetID() != 0) {
+        // default column family is always there,
+        // no need to explicitly write it
+        edit.AddColumnFamily(cfd->GetName());
+        edit.SetColumnFamily(cfd->GetID());
+      }
+      edit.SetComparatorName(
+          cfd->internal_comparator().user_comparator()->Name());
+      std::string record;
+      if (!edit.EncodeTo(&record)) {
+        return Status::Corruption("Unable to Encode VersionEdit:" +
+                                  edit.DebugString(true));
+      }
+      io_s = log->AddRecord(record);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+    }
+
+    {
+      // Save files
+      VersionEdit edit;
+      edit.SetColumnFamily(cfd->GetID());
+
+      const auto* current = cfd->current();
+      assert(current);
+
+      const auto* vstorage = current->storage_info();
+      assert(vstorage);
+
+      for (int level = 0; level < cfd->NumberLevels(); level++) {
+        const auto& level_files = vstorage->LevelFiles(level);
+
+        for (const auto& f : level_files) {
+          assert(f);
+
+          edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
+                       f->fd.GetFileSize(), f->smallest, f->largest,
+                       f->fd.smallest_seqno, f->fd.largest_seqno,
+                       f->marked_for_compaction, f->temperature,
+                       f->oldest_blob_file_number, f->oldest_ancester_time,
+                       f->file_creation_time, f->file_checksum,
+                       f->file_checksum_func_name, f->unique_id);
+        }
+      }
+
+      edit.SetCompactCursors(vstorage->GetCompactCursors());
+
+      const auto& blob_files = vstorage->GetBlobFiles();
+      for (const auto& meta : blob_files) {
+        assert(meta);
+
+        const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+        edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(),
+                         meta->GetTotalBlobBytes(), meta->GetChecksumMethod(),
+                         meta->GetChecksumValue());
+        if (meta->GetGarbageBlobCount() > 0) {
+          edit.AddBlobFileGarbage(blob_file_number, meta->GetGarbageBlobCount(),
+                                  meta->GetGarbageBlobBytes());
+        }
+      }
+
+      const auto iter = curr_state.find(cfd->GetID());
+      assert(iter != curr_state.end());
+      uint64_t log_number = iter->second.log_number;
+      edit.SetLogNumber(log_number);
+
+      if (cfd->GetID() == 0) {
+        // min_log_number_to_keep is for the whole db, not for specific column
+        // family. So it does not need to be set for every column family, just
+        // need to be set once. Since default CF can never be dropped, we set
+        // the min_log to the default CF here.
+        uint64_t min_log = min_log_number_to_keep();
+        if (min_log != 0) {
+          edit.SetMinLogNumberToKeep(min_log);
+        }
+      }
+
+      const std::string& full_history_ts_low = iter->second.full_history_ts_low;
+      if (!full_history_ts_low.empty()) {
+        edit.SetFullHistoryTsLow(full_history_ts_low);
+      }
+
+      edit.SetLastSequence(descriptor_last_sequence_);
+
+      std::string record;
+      if (!edit.EncodeTo(&record)) {
+        return Status::Corruption("Unable to Encode VersionEdit:" +
+                                  edit.DebugString(true));
+      }
+      io_s = log->AddRecord(record);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
+// function is called repeatedly with consecutive pairs of slices. For example
+// if the slice list is [a, b, c, d] this function is called with arguments
+// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
+// we avoid doing binary search for the keys b and c twice and instead somehow
+// maintain state of where they first appear in the files.
+uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
+                                     Version* v, const Slice& start,
+                                     const Slice& end, int start_level,
+                                     int end_level, TableReaderCaller caller) {
+  const auto& icmp = v->cfd_->internal_comparator();
+
+  // pre-condition
+  assert(icmp.Compare(start, end) <= 0);
+
+  uint64_t total_full_size = 0;
+  const auto* vstorage = v->storage_info();
+  const int num_non_empty_levels = vstorage->num_non_empty_levels();
+  end_level = (end_level == -1) ? num_non_empty_levels
+                                : std::min(end_level, num_non_empty_levels);
+
+  assert(start_level <= end_level);
+
+  // Outline of the optimization that uses options.files_size_error_margin.
+  // When approximating the files total size that is used to store a keys range,
+  // we first sum up the sizes of the files that fully fall into the range.
+  // Then we sum up the sizes of all the files that may intersect with the range
+  // (this includes all files in L0 as well). Then, if total_intersecting_size
+  // is smaller than total_full_size * options.files_size_error_margin - we can
+  // infer that the intersecting files have a sufficiently negligible
+  // contribution to the total size, and we can approximate the storage required
+  // for the keys in range as just half of the intersecting_files_size.
+  // E.g., if the value of files_size_error_margin is 0.1, then the error of the
+  // approximation is limited to only ~10% of the total size of files that fully
+  // fall into the keys range. In such case, this helps to avoid a costly
+  // process of binary searching the intersecting files that is required only
+  // for a more precise calculation of the total size.
+
+  autovector<FdWithKeyRange*, 32> first_files;
+  autovector<FdWithKeyRange*, 16> last_files;
+
+  // scan all the levels
+  for (int level = start_level; level < end_level; ++level) {
+    const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
+    if (files_brief.num_files == 0) {
+      // empty level, skip exploration
+      continue;
+    }
+
+    if (level == 0) {
+      // level 0 files are not in sorted order, we need to iterate through
+      // the list to compute the total bytes that require scanning,
+      // so handle the case explicitly (similarly to first_files case)
+      for (size_t i = 0; i < files_brief.num_files; i++) {
+        first_files.push_back(&files_brief.files[i]);
+      }
+      continue;
+    }
+
+    assert(level > 0);
+    assert(files_brief.num_files > 0);
+
+    // identify the file position for start key
+    const int idx_start =
+        FindFileInRange(icmp, files_brief, start, 0,
+                        static_cast<uint32_t>(files_brief.num_files - 1));
+    assert(static_cast<size_t>(idx_start) < files_brief.num_files);
+
+    // identify the file position for end key
+    int idx_end = idx_start;
+    if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
+      idx_end =
+          FindFileInRange(icmp, files_brief, end, idx_start,
+                          static_cast<uint32_t>(files_brief.num_files - 1));
+    }
+    assert(idx_end >= idx_start &&
+           static_cast<size_t>(idx_end) < files_brief.num_files);
+
+    // scan all files from the starting index to the ending index
+    // (inferred from the sorted order)
+
+    // first scan all the intermediate full files (excluding first and last)
+    for (int i = idx_start + 1; i < idx_end; ++i) {
+      uint64_t file_size = files_brief.files[i].fd.GetFileSize();
+      // The entire file falls into the range, so we can just take its size.
+      assert(file_size ==
+             ApproximateSize(v, files_brief.files[i], start, end, caller));
+      total_full_size += file_size;
+    }
+
+    // save the first and the last files (which may be the same file), so we
+    // can scan them later.
+    first_files.push_back(&files_brief.files[idx_start]);
+    if (idx_start != idx_end) {
+      // we need to estimate size for both files, only if they are different
+      last_files.push_back(&files_brief.files[idx_end]);
+    }
+  }
+
+  // The sum of all file sizes that intersect the [start, end] keys range.
+  uint64_t total_intersecting_size = 0;
+  for (const auto* file_ptr : first_files) {
+    total_intersecting_size += file_ptr->fd.GetFileSize();
+  }
+  for (const auto* file_ptr : last_files) {
+    total_intersecting_size += file_ptr->fd.GetFileSize();
+  }
+
+  // Now scan all the first & last files at each level, and estimate their size.
+  // If the total_intersecting_size is less than X% of the total_full_size - we
+  // want to approximate the result in order to avoid the costly binary search
+  // inside ApproximateSize. We use half of file size as an approximation below.
+
+  const double margin = options.files_size_error_margin;
+  if (margin > 0 && total_intersecting_size <
+                        static_cast<uint64_t>(total_full_size * margin)) {
+    total_full_size += total_intersecting_size / 2;
+  } else {
+    // Estimate for all the first files (might also be last files), at each
+    // level
+    for (const auto file_ptr : first_files) {
+      total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
+    }
+
+    // Estimate for all the last files, at each level
+    for (const auto file_ptr : last_files) {
+      // We could use ApproximateSize here, but calling ApproximateOffsetOf
+      // directly is just more efficient.
+      total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller);
+    }
+  }
+
+  return total_full_size;
+}
+
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+                                         const Slice& key,
+                                         TableReaderCaller caller) {
+  // pre-condition
+  assert(v);
+  const auto& icmp = v->cfd_->internal_comparator();
+
+  uint64_t result = 0;
+  if (icmp.Compare(f.largest_key, key) <= 0) {
+    // Entire file is before "key", so just add the file size
+    result = f.fd.GetFileSize();
+  } else if (icmp.Compare(f.smallest_key, key) > 0) {
+    // Entire file is after "key", so ignore
+    result = 0;
+  } else {
+    // "key" falls in the range for this table.  Add the
+    // approximate offset of "key" within the table.
+    TableCache* table_cache = v->cfd_->table_cache();
+    if (table_cache != nullptr) {
+      result = table_cache->ApproximateOffsetOf(
+          key, *f.file_metadata, caller, icmp,
+          v->GetMutableCFOptions().prefix_extractor);
+    }
+  }
+  return result;
+}
+
+uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
+                                     const Slice& start, const Slice& end,
+                                     TableReaderCaller caller) {
+  // pre-condition
+  assert(v);
+  const auto& icmp = v->cfd_->internal_comparator();
+  assert(icmp.Compare(start, end) <= 0);
+
+  if (icmp.Compare(f.largest_key, start) <= 0 ||
+      icmp.Compare(f.smallest_key, end) > 0) {
+    // Entire file is before or after the start/end keys range
+    return 0;
+  }
+
+  if (icmp.Compare(f.smallest_key, start) >= 0) {
+    // Start of the range is before the file start - approximate by end offset
+    return ApproximateOffsetOf(v, f, end, caller);
+  }
+
+  if (icmp.Compare(f.largest_key, end) < 0) {
+    // End of the range is after the file end - approximate by subtracting
+    // start offset from the file size
+    uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller);
+    assert(f.fd.GetFileSize() >= start_offset);
+    return f.fd.GetFileSize() - start_offset;
+  }
+
+  // The interval falls entirely in the range for this file.
+  TableCache* table_cache = v->cfd_->table_cache();
+  if (table_cache == nullptr) {
+    return 0;
+  }
+  return table_cache->ApproximateSize(
+      start, end, *f.file_metadata, caller, icmp,
+      v->GetMutableCFOptions().prefix_extractor);
+}
+
+void VersionSet::RemoveLiveFiles(
+    std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+    std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const {
+  assert(column_family_set_);
+  for (auto cfd : *column_family_set_) {
+    assert(cfd);
+    if (!cfd->initialized()) {
+      continue;
+    }
+
+    auto* current = cfd->current();
+    bool found_current = false;
+
+    Version* const dummy_versions = cfd->dummy_versions();
+    assert(dummy_versions);
+
+    for (Version* v = dummy_versions->next_; v != dummy_versions;
+         v = v->next_) {
+      v->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates);
+      if (v == current) {
+        found_current = true;
+      }
+    }
+
+    if (!found_current && current != nullptr) {
+      // Should never happen unless it is a bug.
+      assert(false);
+      current->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates);
+    }
+  }
+}
+
+void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_table_files,
+                              std::vector<uint64_t>* live_blob_files) const {
+  assert(live_table_files);
+  assert(live_blob_files);
+
+  // pre-calculate space requirement
+  size_t total_table_files = 0;
+  size_t total_blob_files = 0;
+
+  assert(column_family_set_);
+  for (auto cfd : *column_family_set_) {
+    assert(cfd);
+
+    if (!cfd->initialized()) {
+      continue;
+    }
+
+    Version* const dummy_versions = cfd->dummy_versions();
+    assert(dummy_versions);
+
+    for (Version* v = dummy_versions->next_; v != dummy_versions;
+         v = v->next_) {
+      assert(v);
+
+      const auto* vstorage = v->storage_info();
+      assert(vstorage);
+
+      for (int level = 0; level < vstorage->num_levels(); ++level) {
+        total_table_files += vstorage->LevelFiles(level).size();
+      }
+
+      total_blob_files += vstorage->GetBlobFiles().size();
+    }
+  }
+
+  // just one time extension to the right size
+  live_table_files->reserve(live_table_files->size() + total_table_files);
+  live_blob_files->reserve(live_blob_files->size() + total_blob_files);
+
+  assert(column_family_set_);
+  for (auto cfd : *column_family_set_) {
+    assert(cfd);
+    if (!cfd->initialized()) {
+      continue;
+    }
+
+    auto* current = cfd->current();
+    bool found_current = false;
+
+    Version* const dummy_versions = cfd->dummy_versions();
+    assert(dummy_versions);
+
+    for (Version* v = dummy_versions->next_; v != dummy_versions;
+         v = v->next_) {
+      v->AddLiveFiles(live_table_files, live_blob_files);
+      if (v == current) {
+        found_current = true;
+      }
+    }
+
+    if (!found_current && current != nullptr) {
+      // Should never happen unless it is a bug.
+      assert(false);
+      current->AddLiveFiles(live_table_files, live_blob_files);
+    }
+  }
+}
+
+InternalIterator* VersionSet::MakeInputIterator(
+    const ReadOptions& read_options, const Compaction* c,
+    RangeDelAggregator* range_del_agg,
+    const FileOptions& file_options_compactions,
+    const std::optional<const Slice>& start,
+    const std::optional<const Slice>& end) {
+  auto cfd = c->column_family_data();
+  // Level-0 files have to be merged together.  For other levels,
+  // we will make a concatenating iterator per level.
+  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+  const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files +
+                                              c->num_input_levels() - 1
+                                        : c->num_input_levels());
+  InternalIterator** list = new InternalIterator*[space];
+  size_t num = 0;
+  for (size_t which = 0; which < c->num_input_levels(); which++) {
+    if (c->input_levels(which)->num_files != 0) {
+      if (c->level(which) == 0) {
+        const LevelFilesBrief* flevel = c->input_levels(which);
+        for (size_t i = 0; i < flevel->num_files; i++) {
+          const FileMetaData& fmd = *flevel->files[i].file_metadata;
+          if (start.has_value() &&
+              cfd->user_comparator()->CompareWithoutTimestamp(
+                  start.value(), fmd.largest.user_key()) > 0) {
+            continue;
+          }
+          // We should be able to filter out the case where the end key
+          // equals to the end boundary, since the end key is exclusive.
+          // We try to be extra safe here.
+          if (end.has_value() &&
+              cfd->user_comparator()->CompareWithoutTimestamp(
+                  end.value(), fmd.smallest.user_key()) < 0) {
+            continue;
+          }
+
+          list[num++] = cfd->table_cache()->NewIterator(
+              read_options, file_options_compactions,
+              cfd->internal_comparator(), fmd, range_del_agg,
+              c->mutable_cf_options()->prefix_extractor,
+              /*table_reader_ptr=*/nullptr,
+              /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction,
+              /*arena=*/nullptr,
+              /*skip_filters=*/false,
+              /*level=*/static_cast<int>(c->level(which)),
+              MaxFileSizeForL0MetaPin(*c->mutable_cf_options()),
+              /*smallest_compaction_key=*/nullptr,
+              /*largest_compaction_key=*/nullptr,
+              /*allow_unprepared_value=*/false);
+        }
+      } else {
+        // Create concatenating iterator for the files from this level
+        list[num++] = new LevelIterator(
+            cfd->table_cache(), read_options, file_options_compactions,
+            cfd->internal_comparator(), c->input_levels(which),
+            c->mutable_cf_options()->prefix_extractor,
+            /*should_sample=*/false,
+            /*no per level latency histogram=*/nullptr,
+            TableReaderCaller::kCompaction, /*skip_filters=*/false,
+            /*level=*/static_cast<int>(c->level(which)), range_del_agg,
+            c->boundaries(which));
+      }
+    }
+  }
+  assert(num <= space);
+  InternalIterator* result =
+      NewMergingIterator(&c->column_family_data()->internal_comparator(), list,
+                         static_cast<int>(num));
+  delete[] list;
+  return result;
+}
+
+Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
+                                      FileMetaData** meta,
+                                      ColumnFamilyData** cfd) {
+  for (auto cfd_iter : *column_family_set_) {
+    if (!cfd_iter->initialized()) {
+      continue;
+    }
+    Version* version = cfd_iter->current();
+    const auto* vstorage = version->storage_info();
+    for (int level = 0; level < vstorage->num_levels(); level++) {
+      for (const auto& file : vstorage->LevelFiles(level)) {
+        if (file->fd.GetNumber() == number) {
+          *meta = file;
+          *filelevel = level;
+          *cfd = cfd_iter;
+          return Status::OK();
+        }
+      }
+    }
+  }
+  return Status::NotFound("File not present in any level");
+}
+
+void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+  for (auto cfd : *column_family_set_) {
+    if (cfd->IsDropped() || !cfd->initialized()) {
+      continue;
+    }
+    for (int level = 0; level < cfd->NumberLevels(); level++) {
+      for (const auto& file :
+           cfd->current()->storage_info()->LevelFiles(level)) {
+        LiveFileMetaData filemetadata;
+        filemetadata.column_family_name = cfd->GetName();
+        uint32_t path_id = file->fd.GetPathId();
+        if (path_id < cfd->ioptions()->cf_paths.size()) {
+          filemetadata.db_path = cfd->ioptions()->cf_paths[path_id].path;
+        } else {
+          assert(!cfd->ioptions()->cf_paths.empty());
+          filemetadata.db_path = cfd->ioptions()->cf_paths.back().path;
+        }
+        filemetadata.directory = filemetadata.db_path;
+        const uint64_t file_number = file->fd.GetNumber();
+        filemetadata.name = MakeTableFileName("", file_number);
+        filemetadata.relative_filename = filemetadata.name.substr(1);
+        filemetadata.file_number = file_number;
+        filemetadata.level = level;
+        filemetadata.size = file->fd.GetFileSize();
+        filemetadata.smallestkey = file->smallest.user_key().ToString();
+        filemetadata.largestkey = file->largest.user_key().ToString();
+        filemetadata.smallest_seqno = file->fd.smallest_seqno;
+        filemetadata.largest_seqno = file->fd.largest_seqno;
+        filemetadata.num_reads_sampled =
+            file->stats.num_reads_sampled.load(std::memory_order_relaxed);
+        filemetadata.being_compacted = file->being_compacted;
+        filemetadata.num_entries = file->num_entries;
+        filemetadata.num_deletions = file->num_deletions;
+        filemetadata.oldest_blob_file_number = file->oldest_blob_file_number;
+        filemetadata.file_checksum = file->file_checksum;
+        filemetadata.file_checksum_func_name = file->file_checksum_func_name;
+        filemetadata.temperature = file->temperature;
+        filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
+        filemetadata.file_creation_time = file->TryGetFileCreationTime();
+        metadata->push_back(filemetadata);
+      }
+    }
+  }
+}
+
+void VersionSet::GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+                                  std::vector<ObsoleteBlobFileInfo>* blob_files,
+                                  std::vector<std::string>* manifest_filenames,
+                                  uint64_t min_pending_output) {
+  assert(files);
+  assert(blob_files);
+  assert(manifest_filenames);
+  assert(files->empty());
+  assert(blob_files->empty());
+  assert(manifest_filenames->empty());
+
+  std::vector<ObsoleteFileInfo> pending_files;
+  for (auto& f : obsolete_files_) {
+    if (f.metadata->fd.GetNumber() < min_pending_output) {
+      files->emplace_back(std::move(f));
+    } else {
+      pending_files.emplace_back(std::move(f));
+    }
+  }
+  obsolete_files_.swap(pending_files);
+
+  std::vector<ObsoleteBlobFileInfo> pending_blob_files;
+  for (auto& blob_file : obsolete_blob_files_) {
+    if (blob_file.GetBlobFileNumber() < min_pending_output) {
+      blob_files->emplace_back(std::move(blob_file));
+    } else {
+      pending_blob_files.emplace_back(std::move(blob_file));
+    }
+  }
+  obsolete_blob_files_.swap(pending_blob_files);
+
+  obsolete_manifests_.swap(*manifest_filenames);
+}
+
+ColumnFamilyData* VersionSet::CreateColumnFamily(
+    const ColumnFamilyOptions& cf_options, const VersionEdit* edit) {
+  assert(edit->is_column_family_add_);
+
+  MutableCFOptions dummy_cf_options;
+  Version* dummy_versions =
+      new Version(nullptr, this, file_options_, dummy_cf_options, io_tracer_);
+  // Ref() dummy version once so that later we can call Unref() to delete it
+  // by avoiding calling "delete" explicitly (~Version is private)
+  dummy_versions->Ref();
+  auto new_cfd = column_family_set_->CreateColumnFamily(
+      edit->column_family_name_, edit->column_family_, dummy_versions,
+      cf_options);
+
+  Version* v = new Version(new_cfd, this, file_options_,
+                           *new_cfd->GetLatestMutableCFOptions(), io_tracer_,
+                           current_version_number_++);
+
+  constexpr bool update_stats = false;
+
+  v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats);
+
+  AppendVersion(new_cfd, v);
+  // GetLatestMutableCFOptions() is safe here without mutex since the
+  // cfd is not available to client
+  new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(),
+                             LastSequence());
+  new_cfd->SetLogNumber(edit->log_number_);
+  return new_cfd;
+}
+
+uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
+  uint64_t count = 0;
+  for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+    count++;
+  }
+  return count;
+}
+
+uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
+  std::unordered_set<uint64_t> unique_files;
+  uint64_t total_files_size = 0;
+  for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+    VersionStorageInfo* storage_info = v->storage_info();
+    for (int level = 0; level < storage_info->num_levels_; level++) {
+      for (const auto& file_meta : storage_info->LevelFiles(level)) {
+        if (unique_files.find(file_meta->fd.packed_number_and_path_id) ==
+            unique_files.end()) {
+          unique_files.insert(file_meta->fd.packed_number_and_path_id);
+          total_files_size += file_meta->fd.GetFileSize();
+        }
+      }
+    }
+  }
+  return total_files_size;
+}
+
+uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
+  std::unordered_set<uint64_t> unique_blob_files;
+
+  uint64_t all_versions_blob_file_size = 0;
+
+  for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+    // iterate all the versions
+    const auto* vstorage = v->storage_info();
+    assert(vstorage);
+
+    const auto& blob_files = vstorage->GetBlobFiles();
+
+    for (const auto& meta : blob_files) {
+      assert(meta);
+
+      const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+      if (unique_blob_files.find(blob_file_number) == unique_blob_files.end()) {
+        // find Blob file that has not been counted
+        unique_blob_files.insert(blob_file_number);
+        all_versions_blob_file_size += meta->GetBlobFileSize();
+      }
+    }
+  }
+
+  return all_versions_blob_file_size;
+}
+
+Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
+                                      const std::string& fpath, int level,
+                                      const FileMetaData& meta) {
+  uint64_t fsize = 0;
+  Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
+  if (status.ok()) {
+    if (fsize != meta.fd.GetFileSize()) {
+      status = Status::Corruption("File size mismatch: " + fpath);
+    }
+  }
+  if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
+    assert(cfd);
+    TableCache* table_cache = cfd->table_cache();
+    assert(table_cache);
+
+    const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions();
+    assert(cf_opts);
+    std::shared_ptr<const SliceTransform> pe = cf_opts->prefix_extractor;
+    size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts);
+
+    const FileOptions& file_opts = file_options();
+
+    Version* version = cfd->current();
+    assert(version);
+    VersionStorageInfo& storage_info = version->storage_info_;
+    const InternalKeyComparator* icmp = storage_info.InternalComparator();
+    assert(icmp);
+
+    InternalStats* internal_stats = cfd->internal_stats();
+
+    FileMetaData meta_copy = meta;
+    status = table_cache->FindTable(
+        ReadOptions(), file_opts, *icmp, meta_copy,
+        &(meta_copy.table_reader_handle), pe,
+        /*no_io=*/false, /*record_read_stats=*/true,
+        internal_stats->GetFileReadHist(level), false, level,
+        /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
+        meta_copy.temperature);
+    if (meta_copy.table_reader_handle) {
+      table_cache->ReleaseHandle(meta_copy.table_reader_handle);
+    }
+  }
+  return status;
+}
+
+ReactiveVersionSet::ReactiveVersionSet(
+    const std::string& dbname, const ImmutableDBOptions* _db_options,
+    const FileOptions& _file_options, Cache* table_cache,
+    WriteBufferManager* write_buffer_manager, WriteController* write_controller,
+    const std::shared_ptr<IOTracer>& io_tracer)
+    : VersionSet(dbname, _db_options, _file_options, table_cache,
+                 write_buffer_manager, write_controller,
+                 /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
+                 /*db_session_id*/ "") {}
+
+ReactiveVersionSet::~ReactiveVersionSet() {}
+
+Status ReactiveVersionSet::Recover(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+    std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+    std::unique_ptr<Status>* manifest_reader_status) {
+  assert(manifest_reader != nullptr);
+  assert(manifest_reporter != nullptr);
+  assert(manifest_reader_status != nullptr);
+
+  manifest_reader_status->reset(new Status());
+  manifest_reporter->reset(new LogReporter());
+  static_cast_with_check<LogReporter>(manifest_reporter->get())->status =
+      manifest_reader_status->get();
+  Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader);
+  if (!s.ok()) {
+    return s;
+  }
+  log::Reader* reader = manifest_reader->get();
+  assert(reader);
+
+  manifest_tailer_.reset(new ManifestTailer(
+      column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_));
+
+  manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
+
+  return manifest_tailer_->status();
+}
+
+Status ReactiveVersionSet::ReadAndApply(
+    InstrumentedMutex* mu,
+    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+    Status* manifest_read_status,
+    std::unordered_set<ColumnFamilyData*>* cfds_changed) {
+  assert(manifest_reader != nullptr);
+  assert(cfds_changed != nullptr);
+  mu->AssertHeld();
+
+  Status s;
+  log::Reader* reader = manifest_reader->get();
+  assert(reader);
+  s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
+  if (!s.ok()) {
+    return s;
+  }
+  manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status);
+  s = manifest_tailer_->status();
+  if (s.ok()) {
+    *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
+  }
+
+  return s;
+}
+
+Status ReactiveVersionSet::MaybeSwitchManifest(
+    log::Reader::Reporter* reporter,
+    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
+  assert(manifest_reader != nullptr);
+  Status s;
+  std::string manifest_path;
+  s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
+                             &manifest_file_number_);
+  if (!s.ok()) {
+    return s;
+  }
+  std::unique_ptr<FSSequentialFile> manifest_file;
+  if (manifest_reader->get() != nullptr &&
+      manifest_reader->get()->file()->file_name() == manifest_path) {
+    // CURRENT points to the same MANIFEST as before, no need to switch
+    // MANIFEST.
+    return s;
+  }
+  assert(nullptr == manifest_reader->get() ||
+         manifest_reader->get()->file()->file_name() != manifest_path);
+  s = fs_->FileExists(manifest_path, IOOptions(), nullptr);
+  if (s.IsNotFound()) {
+    return Status::TryAgain(
+        "The primary may have switched to a new MANIFEST and deleted the old "
+        "one.");
+  } else if (!s.ok()) {
+    return s;
+  }
+  TEST_SYNC_POINT(
+      "ReactiveVersionSet::MaybeSwitchManifest:"
+      "AfterGetCurrentManifestPath:0");
+  TEST_SYNC_POINT(
+      "ReactiveVersionSet::MaybeSwitchManifest:"
+      "AfterGetCurrentManifestPath:1");
+  // The primary can also delete the MANIFEST while the secondary is reading
+  // it. This is OK on POSIX. For other file systems, maybe create a hard link
+  // to MANIFEST. The hard link should be cleaned up later by the secondary.
+  s = fs_->NewSequentialFile(manifest_path,
+                             fs_->OptimizeForManifestRead(file_options_),
+                             &manifest_file, nullptr);
+  std::unique_ptr<SequentialFileReader> manifest_file_reader;
+  if (s.ok()) {
+    manifest_file_reader.reset(new SequentialFileReader(
+        std::move(manifest_file), manifest_path,
+        db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+    manifest_reader->reset(new log::FragmentBufferedReader(
+        nullptr, std::move(manifest_file_reader), reporter, true /* checksum */,
+        0 /* log_number */));
+    ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
+                   manifest_path.c_str());
+    if (manifest_tailer_) {
+      manifest_tailer_->PrepareToReadNewManifest();
+    }
+  } else if (s.IsPathNotFound()) {
+    // This can happen if the primary switches to a new MANIFEST after the
+    // secondary reads the CURRENT file but before the secondary actually tries
+    // to open the MANIFEST.
+    s = Status::TryAgain(
+        "The primary may have switched to a new MANIFEST and deleted the old "
+        "one.");
+  }
+  return s;
+}
+
+#ifndef NDEBUG
+uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const {
+  assert(manifest_tailer_);
+  return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group();
+}
+#endif  // !NDEBUG
+
+std::vector<VersionEdit>& ReactiveVersionSet::replay_buffer() {
+  assert(manifest_tailer_);
+  return manifest_tailer_->GetReadBuffer().replay_buffer();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set.h b/src/rocksdb/db/version_set.h
new file mode 100644
index 000000000..03176a8b5
--- /dev/null
+++ b/src/rocksdb/db/version_set.h
@@ -0,0 +1,1652 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions.  The
+// newest version is called "current".  Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of table files per level, as well as a
+// set of blob files. The entire set of versions is maintained in a
+// VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <map>
+#include <memory>
+#include <optional>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_helpers.h"
+#include "db/blob/blob_file_meta.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/dbformat.h"
+#include "db/file_indexer.h"
+#include "db/log_reader.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "env/file_system_tracer.h"
+#if USE_COROUTINES
+#include "folly/experimental/coro/BlockingWait.h"
+#include "folly/experimental/coro/Collect.h"
+#endif
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/autovector.h"
+#include "util/coro_utils.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace log {
+class Writer;
+}
+
+class BlobIndex;
+class Compaction;
+class LogBuffer;
+class LookupKey;
+class MemTable;
+class Version;
+class VersionSet;
+class WriteBufferManager;
+class MergeContext;
+class ColumnFamilySet;
+class MergeIteratorBuilder;
+class SystemClock;
+class ManifestTailer;
+class FilePickerMultiGet;
+
+// VersionEdit is always supposed to be valid and it is used to point at
+// entries in Manifest. Ideally it should not be used as a container to
+// carry around few of its fields as function params because it can cause
+// readers to think it's a valid entry from Manifest. To avoid that confusion
+// introducing VersionEditParams to simply carry around multiple VersionEdit
+// params. It need not point to a valid record in Manifest.
+using VersionEditParams = VersionEdit;
+
+// Return the smallest index i such that file_level.files[i]->largest >= key.
+// Return file_level.num_files if there is no such file.
+// REQUIRES: "file_level.files" contains a sorted list of
+// non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+                    const LevelFilesBrief& file_level, const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key largest than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, file_level.files[]
+// contains disjoint ranges in sorted order.
+extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+                                  bool disjoint_sorted_files,
+                                  const LevelFilesBrief& file_level,
+                                  const Slice* smallest_user_key,
+                                  const Slice* largest_user_key);
+
+// Generate LevelFilesBrief from vector<FdWithKeyRange*>
+// Would copy smallest_key and largest_key data to sequential memory
+// arena: Arena used to allocate the memory
+extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+                                      const std::vector<FileMetaData*>& files,
+                                      Arena* arena);
+
+// Information of the storage associated with each Version, including number of
+// levels of LSM tree, files information at each level, files marked for
+// compaction, blob files, etc.
+class VersionStorageInfo {
+ public:
+  VersionStorageInfo(const InternalKeyComparator* internal_comparator,
+                     const Comparator* user_comparator, int num_levels,
+                     CompactionStyle compaction_style,
+                     VersionStorageInfo* src_vstorage,
+                     bool _force_consistency_checks);
+  // No copying allowed
+  VersionStorageInfo(const VersionStorageInfo&) = delete;
+  void operator=(const VersionStorageInfo&) = delete;
+  ~VersionStorageInfo();
+
+  void Reserve(int level, size_t size) { files_[level].reserve(size); }
+
+  void AddFile(int level, FileMetaData* f);
+
+  // Resize/Initialize the space for compact_cursor_
+  void ResizeCompactCursors(int level) {
+    compact_cursor_.resize(level, InternalKey());
+  }
+
+  const std::vector<InternalKey>& GetCompactCursors() const {
+    return compact_cursor_;
+  }
+
+  // REQUIRES: ResizeCompactCursors has been called
+  void AddCursorForOneLevel(int level,
+                            const InternalKey& smallest_uncompacted_key) {
+    compact_cursor_[level] = smallest_uncompacted_key;
+  }
+
+  // REQUIRES: lock is held
+  // Update the compact cursor and advance the file index using increment
+  // so that it can point to the next cursor (increment means the number of
+  // input files in this level of the last compaction)
+  const InternalKey& GetNextCompactCursor(int level, size_t increment) {
+    int cmp_idx = next_file_to_compact_by_size_[level] + (int)increment;
+    assert(cmp_idx <= (int)files_by_compaction_pri_[level].size());
+    // TODO(zichen): may need to update next_file_to_compact_by_size_
+    // for parallel compaction.
+    InternalKey new_cursor;
+    if (cmp_idx >= (int)files_by_compaction_pri_[level].size()) {
+      cmp_idx = 0;
+    }
+    // TODO(zichen): rethink if this strategy gives us some good guarantee
+    return files_[level][files_by_compaction_pri_[level][cmp_idx]]->smallest;
+  }
+
+  void ReserveBlob(size_t size) { blob_files_.reserve(size); }
+
+  void AddBlobFile(std::shared_ptr<BlobFileMetaData> blob_file_meta);
+
+  void PrepareForVersionAppend(const ImmutableOptions& immutable_options,
+                               const MutableCFOptions& mutable_cf_options);
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  void SetFinalized();
+
+  // Update the accumulated stats from a file-meta.
+  void UpdateAccumulatedStats(FileMetaData* file_meta);
+
+  // Decrease the current stat from a to-be-deleted file-meta
+  void RemoveCurrentStats(FileMetaData* file_meta);
+
+  // Updates internal structures that keep track of compaction scores
+  // We use compaction scores to figure out which compaction to do next
+  // REQUIRES: db_mutex held!!
+  // TODO find a better way to pass compaction_options_fifo.
+  void ComputeCompactionScore(const ImmutableOptions& immutable_options,
+                              const MutableCFOptions& mutable_cf_options);
+
+  // Estimate est_comp_needed_bytes_
+  void EstimateCompactionBytesNeeded(
+      const MutableCFOptions& mutable_cf_options);
+
+  // This computes files_marked_for_compaction_ and is called by
+  // ComputeCompactionScore()
+  void ComputeFilesMarkedForCompaction();
+
+  // This computes ttl_expired_files_ and is called by
+  // ComputeCompactionScore()
+  void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions,
+                              const uint64_t ttl);
+
+  // This computes files_marked_for_periodic_compaction_ and is called by
+  // ComputeCompactionScore()
+  void ComputeFilesMarkedForPeriodicCompaction(
+      const ImmutableOptions& ioptions,
+      const uint64_t periodic_compaction_seconds);
+
+  // This computes bottommost_files_marked_for_compaction_ and is called by
+  // ComputeCompactionScore() or UpdateOldestSnapshot().
+  //
+  // Among bottommost files (assumes they've already been computed), marks the
+  // ones that have keys that would be eliminated if recompacted, according to
+  // the seqnum of the oldest existing snapshot. Must be called every time
+  // oldest snapshot changes as that is when bottom-level files can become
+  // eligible for compaction.
+  //
+  // REQUIRES: DB mutex held
+  void ComputeBottommostFilesMarkedForCompaction();
+
+  // This computes files_marked_for_forced_blob_gc_ and is called by
+  // ComputeCompactionScore()
+  //
+  // REQUIRES: DB mutex held
+  void ComputeFilesMarkedForForcedBlobGC(
+      double blob_garbage_collection_age_cutoff,
+      double blob_garbage_collection_force_threshold);
+
+  bool level0_non_overlapping() const { return level0_non_overlapping_; }
+
+  // Updates the oldest snapshot and related internal state, like the bottommost
+  // files marked for compaction.
+  // REQUIRES: DB mutex held
+  void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum);
+
+  int MaxInputLevel() const;
+  int MaxOutputLevel(bool allow_ingest_behind) const;
+
+  // Return level number that has idx'th highest score
+  int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
+
+  // Return idx'th highest score
+  double CompactionScore(int idx) const { return compaction_score_[idx]; }
+
+  void GetOverlappingInputs(
+      int level, const InternalKey* begin,  // nullptr means before all keys
+      const InternalKey* end,               // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index = -1,        // index of overlap file
+      int* file_index = nullptr,  // return index of overlap file
+      bool expand_range = true,   // if set, returns files which overlap the
+                                  // range and overlap each other. If false,
+                                  // then just files intersecting the range
+      InternalKey** next_smallest = nullptr)  // if non-null, returns the
+      const;  // smallest key of next file not included
+  void GetCleanInputsWithinInterval(
+      int level, const InternalKey* begin,  // nullptr means before all keys
+      const InternalKey* end,               // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index = -1,        // index of overlap file
+      int* file_index = nullptr)  // return index of overlap file
+      const;
+
+  void GetOverlappingInputsRangeBinarySearch(
+      int level,                 // level > 0
+      const InternalKey* begin,  // nullptr means before all keys
+      const InternalKey* end,    // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index,                // index of overlap file
+      int* file_index,               // return index of overlap file
+      bool within_interval = false,  // if set, force the inputs within interval
+      InternalKey** next_smallest = nullptr)  // if non-null, returns the
+      const;  // smallest key of next file not included
+
+  // Returns true iff some file in the specified level overlaps
+  // some part of [*smallest_user_key,*largest_user_key].
+  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key largest than all keys in the DB.
+  bool OverlapInLevel(int level, const Slice* smallest_user_key,
+                      const Slice* largest_user_key);
+
+  // Returns true iff the first or last file in inputs contains
+  // an overlapping user key to the file "just outside" of it (i.e.
+  // just after the last file, or just before the first file)
+  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+                             int level);
+
+  int num_levels() const { return num_levels_; }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  int num_non_empty_levels() const {
+    assert(finalized_);
+    return num_non_empty_levels_;
+  }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  // This may or may not return number of level files. It is to keep backward
+  // compatible behavior in universal compaction.
+  int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
+
+  void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  int NumLevelFiles(int level) const {
+    assert(finalized_);
+    return static_cast<int>(files_[level].size());
+  }
+
+  // Return the combined file size of all files at the specified level.
+  uint64_t NumLevelBytes(int level) const;
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  const std::vector<FileMetaData*>& LevelFiles(int level) const {
+    return files_[level];
+  }
+
+  class FileLocation {
+   public:
+    FileLocation() = default;
+    FileLocation(int level, size_t position)
+        : level_(level), position_(position) {}
+
+    int GetLevel() const { return level_; }
+    size_t GetPosition() const { return position_; }
+
+    bool IsValid() const { return level_ >= 0; }
+
+    bool operator==(const FileLocation& rhs) const {
+      return level_ == rhs.level_ && position_ == rhs.position_;
+    }
+
+    bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); }
+
+    static FileLocation Invalid() { return FileLocation(); }
+
+   private:
+    int level_ = -1;
+    size_t position_ = 0;
+  };
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  FileLocation GetFileLocation(uint64_t file_number) const {
+    const auto it = file_locations_.find(file_number);
+
+    if (it == file_locations_.end()) {
+      return FileLocation::Invalid();
+    }
+
+    assert(it->second.GetLevel() < num_levels_);
+    assert(it->second.GetPosition() < files_[it->second.GetLevel()].size());
+    assert(files_[it->second.GetLevel()][it->second.GetPosition()]);
+    assert(files_[it->second.GetLevel()][it->second.GetPosition()]
+               ->fd.GetNumber() == file_number);
+
+    return it->second;
+  }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const {
+    auto location = GetFileLocation(file_number);
+
+    if (!location.IsValid()) {
+      return nullptr;
+    }
+
+    return files_[location.GetLevel()][location.GetPosition()];
+  }
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  using BlobFiles = std::vector<std::shared_ptr<BlobFileMetaData>>;
+  const BlobFiles& GetBlobFiles() const { return blob_files_; }
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  BlobFiles::const_iterator GetBlobFileMetaDataLB(
+      uint64_t blob_file_number) const;
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  std::shared_ptr<BlobFileMetaData> GetBlobFileMetaData(
+      uint64_t blob_file_number) const {
+    const auto it = GetBlobFileMetaDataLB(blob_file_number);
+
+    assert(it == blob_files_.end() || *it);
+
+    if (it != blob_files_.end() &&
+        (*it)->GetBlobFileNumber() == blob_file_number) {
+      return *it;
+    }
+
+    return std::shared_ptr<BlobFileMetaData>();
+  }
+
+  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+  struct BlobStats {
+    uint64_t total_file_size = 0;
+    uint64_t total_garbage_size = 0;
+    double space_amp = 0.0;
+  };
+
+  BlobStats GetBlobStats() const {
+    uint64_t total_file_size = 0;
+    uint64_t total_garbage_size = 0;
+
+    for (const auto& meta : blob_files_) {
+      assert(meta);
+
+      total_file_size += meta->GetBlobFileSize();
+      total_garbage_size += meta->GetGarbageBlobBytes();
+    }
+
+    double space_amp = 0.0;
+    if (total_file_size > total_garbage_size) {
+      space_amp = static_cast<double>(total_file_size) /
+                  (total_file_size - total_garbage_size);
+    }
+
+    return BlobStats{total_file_size, total_garbage_size, space_amp};
+  }
+
+  const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const {
+    assert(level < static_cast<int>(level_files_brief_.size()));
+    return level_files_brief_[level];
+  }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  const std::vector<int>& FilesByCompactionPri(int level) const {
+    assert(finalized_);
+    return files_by_compaction_pri_[level];
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
+      const {
+    assert(finalized_);
+    return files_marked_for_compaction_;
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
+    assert(finalized_);
+    return expired_ttl_files_;
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>&
+  FilesMarkedForPeriodicCompaction() const {
+    assert(finalized_);
+    return files_marked_for_periodic_compaction_;
+  }
+
+  void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) {
+    files_marked_for_periodic_compaction_.emplace_back(level, f);
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>&
+  BottommostFilesMarkedForCompaction() const {
+    assert(finalized_);
+    return bottommost_files_marked_for_compaction_;
+  }
+
+  // REQUIRES: ComputeCompactionScore has been called
+  // REQUIRES: DB mutex held during access
+  const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForForcedBlobGC()
+      const {
+    assert(finalized_);
+    return files_marked_for_forced_blob_gc_;
+  }
+
+  int base_level() const { return base_level_; }
+  double level_multiplier() const { return level_multiplier_; }
+
+  // REQUIRES: lock is held
+  // Set the index that is used to offset into files_by_compaction_pri_ to find
+  // the next compaction candidate file.
+  void SetNextCompactionIndex(int level, int index) {
+    next_file_to_compact_by_size_[level] = index;
+  }
+
+  // REQUIRES: lock is held
+  int NextCompactionIndex(int level) const {
+    return next_file_to_compact_by_size_[level];
+  }
+
+  // REQUIRES: PrepareForVersionAppend has been called
+  const FileIndexer& file_indexer() const {
+    assert(finalized_);
+    return file_indexer_;
+  }
+
+  // Only the first few entries of files_by_compaction_pri_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const size_t kNumberFilesToSort = 50;
+
+  // Return a human-readable short (single-line) summary of the number
+  // of files per level.  Uses *scratch as backing store.
+  struct LevelSummaryStorage {
+    char buffer[1000];
+  };
+  struct FileSummaryStorage {
+    char buffer[3000];
+  };
+  const char* LevelSummary(LevelSummaryStorage* scratch) const;
+  // Return a human-readable short (single-line) summary of files
+  // in a specified level.  Uses *scratch as backing store.
+  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  uint64_t MaxNextLevelOverlappingBytes();
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false) const;
+
+  uint64_t GetAverageValueSize() const {
+    if (accumulated_num_non_deletions_ == 0) {
+      return 0;
+    }
+    assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
+    assert(accumulated_file_size_ > 0);
+    return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
+           accumulated_file_size_ /
+           (accumulated_raw_key_size_ + accumulated_raw_value_size_);
+  }
+
+  uint64_t GetEstimatedActiveKeys() const;
+
+  double GetEstimatedCompressionRatioAtLevel(int level) const;
+
+  // re-initializes the index that is used to offset into
+  // files_by_compaction_pri_
+  // to find the next compaction candidate file.
+  void ResetNextCompactionIndex(int level) {
+    next_file_to_compact_by_size_[level] = 0;
+  }
+
+  const InternalKeyComparator* InternalComparator() const {
+    return internal_comparator_;
+  }
+
+  // Returns maximum total bytes of data on a given level.
+  uint64_t MaxBytesForLevel(int level) const;
+
+  // Returns an estimate of the amount of live data in bytes.
+  uint64_t EstimateLiveDataSize() const;
+
+  uint64_t estimated_compaction_needed_bytes() const {
+    return estimated_compaction_needed_bytes_;
+  }
+
+  void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
+    estimated_compaction_needed_bytes_ = v;
+  }
+
+  bool force_consistency_checks() const { return force_consistency_checks_; }
+
+  SequenceNumber bottommost_files_mark_threshold() const {
+    return bottommost_files_mark_threshold_;
+  }
+
+  // Returns whether any key in [`smallest_key`, `largest_key`] could appear in
+  // an older L0 file than `last_l0_idx` or in a greater level than `last_level`
+  //
+  // @param last_level Level after which we check for overlap
+  // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
+  //    check for overlap; otherwise, must be -1
+  bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
+                                     const Slice& largest_user_key,
+                                     int last_level, int last_l0_idx);
+
+ private:
+  void ComputeCompensatedSizes();
+  void UpdateNumNonEmptyLevels();
+  void CalculateBaseBytes(const ImmutableOptions& ioptions,
+                          const MutableCFOptions& options);
+  void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options,
+                                  const MutableCFOptions& mutable_cf_options);
+
+  void GenerateFileIndexer() {
+    file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
+  }
+
+  void GenerateLevelFilesBrief();
+  void GenerateLevel0NonOverlapping();
+  void GenerateBottommostFiles();
+  void GenerateFileLocationIndex();
+
+  const InternalKeyComparator* internal_comparator_;
+  const Comparator* user_comparator_;
+  int num_levels_;            // Number of levels
+  int num_non_empty_levels_;  // Number of levels. Any level larger than it
+                              // is guaranteed to be empty.
+  // Per-level max bytes
+  std::vector<uint64_t> level_max_bytes_;
+
+  // A short brief metadata of files per level
+  autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
+  FileIndexer file_indexer_;
+  Arena arena_;  // Used to allocate space for file_levels_
+
+  CompactionStyle compaction_style_;
+
+  // List of files per level, files in each level are arranged
+  // in increasing order of keys
+  std::vector<FileMetaData*>* files_;
+
+  // Map of all table files in version. Maps file number to (level, position on
+  // level).
+  using FileLocations = UnorderedMap<uint64_t, FileLocation>;
+  FileLocations file_locations_;
+
+  // Vector of blob files in version sorted by blob file number.
+  BlobFiles blob_files_;
+
+  // Level that L0 data should be compacted to. All levels < base_level_ should
+  // be empty. -1 if it is not level-compaction so it's not applicable.
+  int base_level_;
+
+  double level_multiplier_;
+
+  // A list for the same set of files that are stored in files_,
+  // but files in each level are now sorted based on file
+  // size. The file with the largest size is at the front.
+  // This vector stores the index of the file from files_.
+  std::vector<std::vector<int>> files_by_compaction_pri_;
+
+  // If true, means that files in L0 have keys with non overlapping ranges
+  bool level0_non_overlapping_;
+
+  // An index into files_by_compaction_pri_ that specifies the first
+  // file that is not yet compacted
+  std::vector<int> next_file_to_compact_by_size_;
+
+  // Only the first few entries of files_by_compaction_pri_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const size_t number_of_files_to_sort_ = 50;
+
+  // This vector contains list of files marked for compaction and also not
+  // currently being compacted. It is protected by DB mutex. It is calculated in
+  // ComputeCompactionScore()
+  autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
+
+  autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
+
+  autovector<std::pair<int, FileMetaData*>>
+      files_marked_for_periodic_compaction_;
+
+  // These files are considered bottommost because none of their keys can exist
+  // at lower levels. They are not necessarily all in the same level. The marked
+  // ones are eligible for compaction because they contain duplicate key
+  // versions that are no longer protected by snapshot. These variables are
+  // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and
+  // `ComputeBottommostFilesMarkedForCompaction()`.
+  autovector<std::pair<int, FileMetaData*>> bottommost_files_;
+  autovector<std::pair<int, FileMetaData*>>
+      bottommost_files_marked_for_compaction_;
+
+  autovector<std::pair<int, FileMetaData*>> files_marked_for_forced_blob_gc_;
+
+  // Threshold for needing to mark another bottommost file. Maintain it so we
+  // can quickly check when releasing a snapshot whether more bottommost files
+  // became eligible for compaction. It's defined as the min of the max nonzero
+  // seqnums of unmarked bottommost files.
+  SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+  // Monotonically increases as we release old snapshots. Zero indicates no
+  // snapshots have been released yet. When no snapshots remain we set it to the
+  // current seqnum, which needs to be protected as a snapshot can still be
+  // created that references it.
+  SequenceNumber oldest_snapshot_seqnum_ = 0;
+
+  // Level that should be compacted next and its compaction score.
+  // Score < 1 means compaction is not strictly needed.  These fields
+  // are initialized by ComputeCompactionScore.
+  // The most critical level to be compacted is listed first
+  // These are used to pick the best compaction level
+  std::vector<double> compaction_score_;
+  std::vector<int> compaction_level_;
+  int l0_delay_trigger_count_ = 0;  // Count used to trigger slow down and stop
+                                    // for number of L0 files.
+
+  // Compact cursors for round-robin compactions in each level
+  std::vector<InternalKey> compact_cursor_;
+
+  // the following are the sampled temporary stats.
+  // the current accumulated size of sampled files.
+  uint64_t accumulated_file_size_;
+  // the current accumulated size of all raw keys based on the sampled files.
+  uint64_t accumulated_raw_key_size_;
+  // the current accumulated size of all raw keys based on the sampled files.
+  uint64_t accumulated_raw_value_size_;
+  // total number of non-deletion entries
+  uint64_t accumulated_num_non_deletions_;
+  // total number of deletion entries
+  uint64_t accumulated_num_deletions_;
+  // current number of non_deletion entries
+  uint64_t current_num_non_deletions_;
+  // current number of deletion entries
+  uint64_t current_num_deletions_;
+  // current number of file samples
+  uint64_t current_num_samples_;
+  // Estimated bytes needed to be compacted until all levels' size is down to
+  // target sizes.
+  uint64_t estimated_compaction_needed_bytes_;
+
+  bool finalized_;
+
+  // If set to true, we will run consistency checks even if RocksDB
+  // is compiled in release mode
+  bool force_consistency_checks_;
+
+  friend class Version;
+  friend class VersionSet;
+};
+
+struct ObsoleteFileInfo {
+  FileMetaData* metadata;
+  std::string path;
+  // If true, the FileMataData should be destroyed but the file should
+  // not be deleted. This is because another FileMetaData still references
+  // the file, usually because the file is trivial moved so two FileMetadata
+  // is managing the file.
+  bool only_delete_metadata = false;
+
+  ObsoleteFileInfo() noexcept
+      : metadata(nullptr), only_delete_metadata(false) {}
+  ObsoleteFileInfo(FileMetaData* f, const std::string& file_path,
+                   std::shared_ptr<CacheReservationManager>
+                       file_metadata_cache_res_mgr_arg = nullptr)
+      : metadata(f),
+        path(file_path),
+        only_delete_metadata(false),
+        file_metadata_cache_res_mgr(file_metadata_cache_res_mgr_arg) {}
+
+  ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
+  ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
+
+  ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : ObsoleteFileInfo() {
+    *this = std::move(rhs);
+  }
+
+  ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
+    path = std::move(rhs.path);
+    metadata = rhs.metadata;
+    rhs.metadata = nullptr;
+    file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr;
+    rhs.file_metadata_cache_res_mgr = nullptr;
+
+    return *this;
+  }
+  void DeleteMetadata() {
+    if (file_metadata_cache_res_mgr) {
+      Status s = file_metadata_cache_res_mgr->UpdateCacheReservation(
+          metadata->ApproximateMemoryUsage(), false /* increase */);
+      s.PermitUncheckedError();
+    }
+    delete metadata;
+    metadata = nullptr;
+  }
+
+ private:
+  std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr;
+};
+
+class ObsoleteBlobFileInfo {
+ public:
+  ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path)
+      : blob_file_number_(blob_file_number), path_(std::move(path)) {}
+
+  uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+  const std::string& GetPath() const { return path_; }
+
+ private:
+  uint64_t blob_file_number_;
+  std::string path_;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// A column family's version consists of the table and blob files owned by
+// the column family at a certain point in time.
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // @param read_options Must outlive any iterator built by
+  // `merger_iter_builder`.
+  void AddIterators(const ReadOptions& read_options,
+                    const FileOptions& soptions,
+                    MergeIteratorBuilder* merger_iter_builder,
+                    bool allow_unprepared_value);
+
+  // @param read_options Must outlive any iterator built by
+  // `merger_iter_builder`.
+  void AddIteratorsForLevel(const ReadOptions& read_options,
+                            const FileOptions& soptions,
+                            MergeIteratorBuilder* merger_iter_builder,
+                            int level, bool allow_unprepared_value);
+
+  Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&,
+                                  const Slice& smallest_user_key,
+                                  const Slice& largest_user_key, int level,
+                                  bool* overlap);
+
+  // Lookup the value for key or get all merge operands for key.
+  // If do_merge = true (default) then lookup value for key.
+  // Behavior if do_merge = true:
+  //    If found, store it in *value and
+  //    return OK.  Else return a non-OK status.
+  //    Uses *operands to store merge_operator operations to apply later.
+  //
+  //    If the ReadOptions.read_tier is set to do a read-only fetch, then
+  //    *value_found will be set to false if it cannot be determined whether
+  //    this value exists without doing IO.
+  //
+  //    If the key is Deleted, *status will be set to NotFound and
+  //                        *key_exists will be set to true.
+  //    If no key was found, *status will be set to NotFound and
+  //                      *key_exists will be set to false.
+  //    If seq is non-null, *seq will be set to the sequence number found
+  //    for the key if a key was found.
+  // Behavior if do_merge = false
+  //    If the key has any merge operands then store them in
+  //    merge_context.operands_list and don't merge the operands
+  // REQUIRES: lock is not held
+  // REQUIRES: pinned_iters_mgr != nullptr
+  void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
+           PinnableWideColumns* columns, std::string* timestamp, Status* status,
+           MergeContext* merge_context,
+           SequenceNumber* max_covering_tombstone_seq,
+           PinnedIteratorsManager* pinned_iters_mgr,
+           bool* value_found = nullptr, bool* key_exists = nullptr,
+           SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
+           bool* is_blob = nullptr, bool do_merge = true);
+
+  void MultiGet(const ReadOptions&, MultiGetRange* range,
+                ReadCallback* callback = nullptr);
+
+  // Interprets blob_index_slice as a blob reference, and (assuming the
+  // corresponding blob file is part of this Version) retrieves the blob and
+  // saves it in *value.
+  // REQUIRES: blob_index_slice stores an encoded blob reference
+  Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                 const Slice& blob_index_slice,
+                 FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+                 uint64_t* bytes_read) const;
+
+  // Retrieves a blob using a blob reference and saves it in *value,
+  // assuming the corresponding blob file is part of this Version.
+  Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+                 const BlobIndex& blob_index,
+                 FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+                 uint64_t* bytes_read) const;
+
+  using BlobReadContext =
+      std::pair<BlobIndex, std::reference_wrapper<const KeyContext>>;
+  using BlobReadContexts = std::vector<BlobReadContext>;
+  void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range,
+                    std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs);
+
+  // Loads some stats information from files (if update_stats is set) and
+  // populates derived data structures. Call without mutex held. It needs to be
+  // called before appending the version to the version set.
+  void PrepareAppend(const MutableCFOptions& mutable_cf_options,
+                     bool update_stats);
+
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  // Decrease reference count. Delete the object if no reference left
+  // and return true. Otherwise, return false.
+  bool Unref();
+
+  // Add all files listed in the current version to *live_table_files and
+  // *live_blob_files.
+  void AddLiveFiles(std::vector<uint64_t>* live_table_files,
+                    std::vector<uint64_t>* live_blob_files) const;
+
+  // Remove live files that are in the delete candidate lists.
+  void RemoveLiveFiles(
+      std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+      std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false, bool print_stats = false) const;
+
+  // Returns the version number of this version
+  uint64_t GetVersionNumber() const { return version_number_; }
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contains the table properties of the file
+  // specified in "file_meta".  If the file name of "file_meta" is
+  // known ahead, passing it by a non-null "fname" can save a
+  // file-name conversion.
+  Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+                            const FileMetaData* file_meta,
+                            const std::string* fname = nullptr) const;
+
+  // REQUIRES: lock is held
+  // On success, *props will be populated with all SSTables' table properties.
+  // The keys of `props` are the sst file name, the values of `props` are the
+  // tables' properties, represented as std::shared_ptr.
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
+  Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
+                                      TablePropertiesCollection* props) const;
+
+  // Print summary of range delete tombstones in SST files into out_str,
+  // with maximum max_entries_to_print entries printed out.
+  Status TablesRangeTombstoneSummary(int max_entries_to_print,
+                                     std::string* out_str);
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contains the aggregated table property among
+  // the table properties of all sst files in this version.
+  Status GetAggregatedTableProperties(
+      std::shared_ptr<const TableProperties>* tp, int level = -1);
+
+  uint64_t GetEstimatedActiveKeys() {
+    return storage_info_.GetEstimatedActiveKeys();
+  }
+
+  size_t GetMemoryUsageByTableReaders();
+
+  ColumnFamilyData* cfd() const { return cfd_; }
+
+  // Return the next Version in the linked list.
+  Version* Next() const { return next_; }
+
+  int TEST_refs() const { return refs_; }
+
+  VersionStorageInfo* storage_info() { return &storage_info_; }
+  const VersionStorageInfo* storage_info() const { return &storage_info_; }
+
+  VersionSet* version_set() { return vset_; }
+
+  void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
+
+  uint64_t GetSstFilesSize();
+
+  // Retrieves the file_creation_time of the oldest file in the DB.
+  // Prerequisite for this API is max_open_files = -1
+  void GetCreationTimeOfOldestFile(uint64_t* creation_time);
+
+  const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
+
+  InternalIterator* TEST_GetLevelIterator(
+      const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
+      int level, bool allow_unprepared_value);
+
+ private:
+  Env* env_;
+  SystemClock* clock_;
+
+  friend class ReactiveVersionSet;
+  friend class VersionSet;
+  friend class VersionEditHandler;
+  friend class VersionEditHandlerPointInTime;
+
+  const InternalKeyComparator* internal_comparator() const {
+    return storage_info_.internal_comparator_;
+  }
+  const Comparator* user_comparator() const {
+    return storage_info_.user_comparator_;
+  }
+
+  // Returns true if the filter blocks in the specified level will not be
+  // checked during read operations. In certain cases (trivial move or preload),
+  // the filter block may already be cached, but we still do not access it such
+  // that it eventually expires from the cache.
+  bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
+
+  // The helper function of UpdateAccumulatedStats, which may fill the missing
+  // fields of file_meta from its associated TableProperties.
+  // Returns true if it does initialize FileMetaData.
+  bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
+
+  // Update the accumulated stats associated with the current version.
+  // This accumulated stats will be used in compaction.
+  void UpdateAccumulatedStats();
+
+  DECLARE_SYNC_AND_ASYNC(
+      /* ret_type */ Status, /* func_name */ MultiGetFromSST,
+      const ReadOptions& read_options, MultiGetRange file_range,
+      int hit_file_level, bool skip_filters, bool skip_range_deletions,
+      FdWithKeyRange* f,
+      std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
+      Cache::Handle* table_handle, uint64_t& num_filter_read,
+      uint64_t& num_index_read, uint64_t& num_sst_read);
+
+#ifdef USE_COROUTINES
+  // MultiGet using async IO to read data blocks from SST files in parallel
+  // within and across levels
+  Status MultiGetAsync(
+      const ReadOptions& options, MultiGetRange* range,
+      std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs);
+
+  // A helper function to lookup a batch of keys in a single level. It will
+  // queue coroutine tasks to mget_tasks. It may also split the input batch
+  // by creating a new batch with keys definitely not in this level and
+  // enqueuing it to to_process.
+  Status ProcessBatch(
+      const ReadOptions& read_options, FilePickerMultiGet* batch,
+      std::vector<folly::coro::Task<Status>>& mget_tasks,
+      std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
+      autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
+      std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
+      std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
+          mget_stats);
+#endif
+
+  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
+  Logger* info_log_;
+  Statistics* db_statistics_;
+  TableCache* table_cache_;
+  BlobSource* blob_source_;
+  const MergeOperator* merge_operator_;
+
+  VersionStorageInfo storage_info_;
+  VersionSet* vset_;  // VersionSet to which this Version belongs
+  Version* next_;     // Next version in linked list
+  Version* prev_;     // Previous version in linked list
+  int refs_;          // Number of live refs to this version
+  const FileOptions file_options_;
+  const MutableCFOptions mutable_cf_options_;
+  // Cached value to avoid recomputing it on every read.
+  const size_t max_file_size_for_l0_meta_pin_;
+
+  // A version number that uniquely represents this version. This is
+  // used for debugging and logging purposes only.
+  uint64_t version_number_;
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
+          MutableCFOptions mutable_cf_options,
+          const std::shared_ptr<IOTracer>& io_tracer,
+          uint64_t version_number = 0);
+
+  ~Version();
+
+  // No copying allowed
+  Version(const Version&) = delete;
+  void operator=(const Version&) = delete;
+};
+
+class BaseReferencedVersionBuilder;
+
+class AtomicGroupReadBuffer {
+ public:
+  AtomicGroupReadBuffer() = default;
+  Status AddEdit(VersionEdit* edit);
+  void Clear();
+  bool IsFull() const;
+  bool IsEmpty() const;
+
+  uint64_t TEST_read_edits_in_atomic_group() const {
+    return read_edits_in_atomic_group_;
+  }
+  std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; }
+
+ private:
+  uint64_t read_edits_in_atomic_group_ = 0;
+  std::vector<VersionEdit> replay_buffer_;
+};
+
+// VersionSet is the collection of versions of all the column families of the
+// database. Each database owns one VersionSet. A VersionSet has access to all
+// column families via ColumnFamilySet, i.e. set of the column families.
+class VersionSet {
+ public:
+  VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
+             const FileOptions& file_options, Cache* table_cache,
+             WriteBufferManager* write_buffer_manager,
+             WriteController* write_controller,
+             BlockCacheTracer* const block_cache_tracer,
+             const std::shared_ptr<IOTracer>& io_tracer,
+             const std::string& db_id, const std::string& db_session_id);
+  // No copying allowed
+  VersionSet(const VersionSet&) = delete;
+  void operator=(const VersionSet&) = delete;
+
+  virtual ~VersionSet();
+
+  Status LogAndApplyToDefaultColumnFamily(
+      VersionEdit* edit, InstrumentedMutex* mu,
+      FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr) {
+    ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault();
+    const MutableCFOptions* cf_options =
+        default_cf->GetLatestMutableCFOptions();
+    return LogAndApply(default_cf, *cf_options, edit, mu,
+                       dir_contains_current_file, new_descriptor_log,
+                       column_family_options);
+  }
+
+  // Apply *edit to the current version to form a new descriptor that
+  // is both saved to persistent state and installed as the new
+  // current version.  Will release *mu while actually writing to the file.
+  // column_family_options has to be set if edit is column family add
+  // REQUIRES: *mu is held on entry.
+  // REQUIRES: no other thread concurrently calls LogAndApply()
+  Status LogAndApply(
+      ColumnFamilyData* column_family_data,
+      const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
+      InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+      bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr) {
+    autovector<ColumnFamilyData*> cfds;
+    cfds.emplace_back(column_family_data);
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    mutable_cf_options_list.emplace_back(&mutable_cf_options);
+    autovector<autovector<VersionEdit*>> edit_lists;
+    autovector<VersionEdit*> edit_list;
+    edit_list.emplace_back(edit);
+    edit_lists.emplace_back(edit_list);
+    return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+                       dir_contains_current_file, new_descriptor_log,
+                       column_family_options);
+  }
+  // The batch version. If edit_list.size() > 1, caller must ensure that
+  // no edit in the list column family add or drop
+  Status LogAndApply(
+      ColumnFamilyData* column_family_data,
+      const MutableCFOptions& mutable_cf_options,
+      const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
+      FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
+      const ColumnFamilyOptions* column_family_options = nullptr,
+      const std::function<void(const Status&)>& manifest_wcb = {}) {
+    autovector<ColumnFamilyData*> cfds;
+    cfds.emplace_back(column_family_data);
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    mutable_cf_options_list.emplace_back(&mutable_cf_options);
+    autovector<autovector<VersionEdit*>> edit_lists;
+    edit_lists.emplace_back(edit_list);
+    return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+                       dir_contains_current_file, new_descriptor_log,
+                       column_family_options, {manifest_wcb});
+  }
+
+  // The across-multi-cf batch version. If edit_lists contain more than
+  // 1 version edits, caller must ensure that no edit in the []list is column
+  // family manipulation.
+  virtual Status LogAndApply(
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<autovector<VersionEdit*>>& edit_lists,
+      InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+      bool new_descriptor_log = false,
+      const ColumnFamilyOptions* new_cf_options = nullptr,
+      const std::vector<std::function<void(const Status&)>>& manifest_wcbs =
+          {});
+
+  static Status GetCurrentManifestPath(const std::string& dbname,
+                                       FileSystem* fs,
+                                       std::string* manifest_filename,
+                                       uint64_t* manifest_file_number);
+  void WakeUpWaitingManifestWriters();
+
+  // Recover the last saved descriptor from persistent storage.
+  // If read_only == true, Recover() will not complain if some column families
+  // are not opened
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 bool read_only = false, std::string* db_id = nullptr,
+                 bool no_error_if_files_missing = false);
+
+  Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                    bool read_only,
+                    const std::vector<std::string>& files_in_dbname,
+                    std::string* db_id, bool* has_missing_table_file);
+
+  // Try to recover the version set to the most recent consistent state
+  // recorded in the specified manifest.
+  Status TryRecoverFromOneManifest(
+      const std::string& manifest_path,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      bool read_only, std::string* db_id, bool* has_missing_table_file);
+
+  // Reads a manifest file and returns a list of column families in
+  // column_families.
+  static Status ListColumnFamilies(std::vector<std::string>* column_families,
+                                   const std::string& dbname, FileSystem* fs);
+  static Status ListColumnFamiliesFromManifest(
+      const std::string& manifest_path, FileSystem* fs,
+      std::vector<std::string>* column_families);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid when
+  // only one level from the new max level to the old
+  // max level containing files.
+  // The call is static, since number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces number of levels
+  // in a DB by applying changes to manifest.
+  // For example, a db currently has 7 levels [0-6], and a call to
+  // to reduce to 5 [0-4] can only be executed when only one level
+  // among [4-6] contains files.
+  static Status ReduceNumberOfLevels(const std::string& dbname,
+                                     const Options* options,
+                                     const FileOptions& file_options,
+                                     int new_levels);
+
+  // Get the checksum information of all live files
+  Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list);
+
+  // printf contents (for debugging)
+  Status DumpManifest(Options& options, std::string& manifestFileName,
+                      bool verbose, bool hex = false, bool json = false);
+
+#endif  // ROCKSDB_LITE
+
+  const std::string& DbSessionId() const { return db_session_id_; }
+
+  // Return the current manifest file number
+  uint64_t manifest_file_number() const { return manifest_file_number_; }
+
+  uint64_t options_file_number() const { return options_file_number_; }
+
+  uint64_t pending_manifest_file_number() const {
+    return pending_manifest_file_number_;
+  }
+
+  uint64_t current_next_file_number() const { return next_file_number_.load(); }
+
+  uint64_t min_log_number_to_keep() const {
+    return min_log_number_to_keep_.load();
+  }
+
+  // Allocate and return a new file number
+  uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
+
+  // Fetch And Add n new file number
+  uint64_t FetchAddFileNumber(uint64_t n) {
+    return next_file_number_.fetch_add(n);
+  }
+
+  // Return the last sequence number.
+  uint64_t LastSequence() const {
+    return last_sequence_.load(std::memory_order_acquire);
+  }
+
+  // Note: memory_order_acquire must be sufficient.
+  uint64_t LastAllocatedSequence() const {
+    return last_allocated_sequence_.load(std::memory_order_seq_cst);
+  }
+
+  // Note: memory_order_acquire must be sufficient.
+  uint64_t LastPublishedSequence() const {
+    return last_published_sequence_.load(std::memory_order_seq_cst);
+  }
+
+  // Set the last sequence number to s.
+  void SetLastSequence(uint64_t s) {
+    assert(s >= last_sequence_);
+    // Last visible sequence must always be less than last written seq
+    assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
+    last_sequence_.store(s, std::memory_order_release);
+  }
+
+  // Note: memory_order_release must be sufficient
+  void SetLastPublishedSequence(uint64_t s) {
+    assert(s >= last_published_sequence_);
+    last_published_sequence_.store(s, std::memory_order_seq_cst);
+  }
+
+  // Note: memory_order_release must be sufficient
+  void SetLastAllocatedSequence(uint64_t s) {
+    assert(s >= last_allocated_sequence_);
+    last_allocated_sequence_.store(s, std::memory_order_seq_cst);
+  }
+
+  // Note: memory_order_release must be sufficient
+  uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
+    return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
+  }
+
+  // Mark the specified file number as used.
+  // REQUIRED: this is only called during single-threaded recovery or repair.
+  void MarkFileNumberUsed(uint64_t number);
+
+  // Mark the specified log number as deleted
+  // REQUIRED: this is only called during single-threaded recovery or repair, or
+  // from ::LogAndApply where the global mutex is held.
+  void MarkMinLogNumberToKeep(uint64_t number);
+
+  // Return the log file number for the log file that is currently
+  // being compacted, or zero if there is no such log file.
+  uint64_t prev_log_number() const { return prev_log_number_; }
+
+  // Returns the minimum log number which still has data not flushed to any SST
+  // file.
+  // In non-2PC mode, all the log numbers smaller than this number can be safely
+  // deleted, although we still use `min_log_number_to_keep_` to determine when
+  // to delete a WAL file.
+  uint64_t MinLogNumberWithUnflushedData() const {
+    return PreComputeMinLogNumberWithUnflushedData(nullptr);
+  }
+
+  // Returns the minimum log number which still has data not flushed to any SST
+  // file.
+  // Empty column families' log number is considered to be
+  // new_log_number_for_empty_cf.
+  uint64_t PreComputeMinLogNumberWithUnflushedData(
+      uint64_t new_log_number_for_empty_cf) const {
+    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *column_family_set_) {
+      // It's safe to ignore dropped column families here:
+      // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+      uint64_t num =
+          cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber();
+      if (min_log_num > num && !cfd->IsDropped()) {
+        min_log_num = num;
+      }
+    }
+    return min_log_num;
+  }
+  // Returns the minimum log number which still has data not flushed to any SST
+  // file, except data from `cfd_to_skip`.
+  uint64_t PreComputeMinLogNumberWithUnflushedData(
+      const ColumnFamilyData* cfd_to_skip) const {
+    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *column_family_set_) {
+      if (cfd == cfd_to_skip) {
+        continue;
+      }
+      // It's safe to ignore dropped column families here:
+      // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+      if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+        min_log_num = cfd->GetLogNumber();
+      }
+    }
+    return min_log_num;
+  }
+  // Returns the minimum log number which still has data not flushed to any SST
+  // file, except data from `cfds_to_skip`.
+  uint64_t PreComputeMinLogNumberWithUnflushedData(
+      const std::unordered_set<const ColumnFamilyData*>& cfds_to_skip) const {
+    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+    for (auto cfd : *column_family_set_) {
+      if (cfds_to_skip.count(cfd)) {
+        continue;
+      }
+      // It's safe to ignore dropped column families here:
+      // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+      if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+        min_log_num = cfd->GetLogNumber();
+      }
+    }
+    return min_log_num;
+  }
+
+  // Create an iterator that reads over the compaction inputs for "*c".
+  // The caller should delete the iterator when no longer needed.
+  // @param read_options Must outlive the returned iterator.
+  // @param start, end indicates compaction range
+  InternalIterator* MakeInputIterator(
+      const ReadOptions& read_options, const Compaction* c,
+      RangeDelAggregator* range_del_agg,
+      const FileOptions& file_options_compactions,
+      const std::optional<const Slice>& start,
+      const std::optional<const Slice>& end);
+
+  // Add all files listed in any live version to *live_table_files and
+  // *live_blob_files. Note that these lists may contain duplicates.
+  void AddLiveFiles(std::vector<uint64_t>* live_table_files,
+                    std::vector<uint64_t>* live_blob_files) const;
+
+  // Remove live files that are in the delete candidate lists.
+  void RemoveLiveFiles(
+      std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+      std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
+
+  // Return the approximate size of data to be scanned for range [start, end)
+  // in levels [start_level, end_level). If end_level == -1 it will search
+  // through all non-empty levels
+  uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
+                           const Slice& start, const Slice& end,
+                           int start_level, int end_level,
+                           TableReaderCaller caller);
+
+  // Return the size of the current manifest file
+  uint64_t manifest_file_size() const { return manifest_file_size_; }
+
+  Status GetMetadataForFile(uint64_t number, int* filelevel,
+                            FileMetaData** metadata, ColumnFamilyData** cfd);
+
+  // This function doesn't support leveldb SST filenames
+  void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
+
+  void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) {
+    assert(table_cache_);
+
+    table_cache_->Erase(GetSlice(&blob_file_number));
+
+    obsolete_blob_files_.emplace_back(blob_file_number, std::move(path));
+  }
+
+  void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+                        std::vector<ObsoleteBlobFileInfo>* blob_files,
+                        std::vector<std::string>* manifest_filenames,
+                        uint64_t min_pending_output);
+
+  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+  RefedColumnFamilySet GetRefedColumnFamilySet() {
+    return RefedColumnFamilySet(GetColumnFamilySet());
+  }
+
+  const FileOptions& file_options() { return file_options_; }
+  void ChangeFileOptions(const MutableDBOptions& new_options) {
+    file_options_.writable_file_max_buffer_size =
+        new_options.writable_file_max_buffer_size;
+  }
+
+  const ImmutableDBOptions* db_options() const { return db_options_; }
+
+  static uint64_t GetNumLiveVersions(Version* dummy_versions);
+
+  static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
+
+  static uint64_t GetTotalBlobFileSize(Version* dummy_versions);
+
+  // Get the IO Status returned by written Manifest.
+  const IOStatus& io_status() const { return io_status_; }
+
+  // The returned WalSet needs to be accessed with DB mutex held.
+  const WalSet& GetWalSet() const { return wals_; }
+
+  void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) {
+    assert(cfd);
+
+    const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+    Version* const version =
+        new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_);
+
+    constexpr bool update_stats = false;
+    version->PrepareAppend(mutable_cf_options, update_stats);
+    AppendVersion(cfd, version);
+  }
+
+ protected:
+  using VersionBuilderMap =
+      UnorderedMap<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>;
+
+  struct ManifestWriter;
+
+  friend class Version;
+  friend class VersionEditHandler;
+  friend class VersionEditHandlerPointInTime;
+  friend class DumpManifestHandler;
+  friend class DBImpl;
+  friend class DBImplReadOnly;
+
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    virtual void Corruption(size_t /*bytes*/, const Status& s) override {
+      if (status->ok()) {
+        *status = s;
+      }
+    }
+  };
+
+  void Reset();
+
+  // Returns approximated offset of a key in a file for a given version.
+  uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+                               const Slice& key, TableReaderCaller caller);
+
+  // Returns approximated data size between start and end keys in a file
+  // for a given version.
+  uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
+                           const Slice& start, const Slice& end,
+                           TableReaderCaller caller);
+
+  struct MutableCFState {
+    uint64_t log_number;
+    std::string full_history_ts_low;
+
+    explicit MutableCFState() = default;
+    explicit MutableCFState(uint64_t _log_number, std::string ts_low)
+        : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {}
+  };
+
+  // Save current contents to *log
+  Status WriteCurrentStateToManifest(
+      const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+      const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s);
+
+  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                       const VersionEdit* edit);
+
+  Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
+                            int level, const FileMetaData& meta);
+
+  // Protected by DB mutex.
+  WalSet wals_;
+
+  std::unique_ptr<ColumnFamilySet> column_family_set_;
+  Cache* table_cache_;
+  Env* const env_;
+  FileSystemPtr const fs_;
+  SystemClock* const clock_;
+  const std::string dbname_;
+  std::string db_id_;
+  const ImmutableDBOptions* const db_options_;
+  std::atomic<uint64_t> next_file_number_;
+  // Any WAL number smaller than this should be ignored during recovery,
+  // and is qualified for being deleted.
+  std::atomic<uint64_t> min_log_number_to_keep_ = {0};
+  uint64_t manifest_file_number_;
+  uint64_t options_file_number_;
+  uint64_t options_file_size_;
+  uint64_t pending_manifest_file_number_;
+  // The last seq visible to reads. It normally indicates the last sequence in
+  // the memtable but when using two write queues it could also indicate the
+  // last sequence in the WAL visible to reads.
+  std::atomic<uint64_t> last_sequence_;
+  // The last sequence number of data committed to the descriptor (manifest
+  // file).
+  SequenceNumber descriptor_last_sequence_ = 0;
+  // The last seq that is already allocated. It is applicable only when we have
+  // two write queues. In that case seq might or might not have appreated in
+  // memtable but it is expected to appear in the WAL.
+  // We have last_sequence <= last_allocated_sequence_
+  std::atomic<uint64_t> last_allocated_sequence_;
+  // The last allocated sequence that is also published to the readers. This is
+  // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
+  // last_sequence_ also indicates the last published seq.
+  // We have last_sequence <= last_published_sequence_ <=
+  // last_allocated_sequence_
+  std::atomic<uint64_t> last_published_sequence_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
+
+  // Opened lazily
+  std::unique_ptr<log::Writer> descriptor_log_;
+
+  // generates a increasing version number for every new version
+  uint64_t current_version_number_;
+
+  // Queue of writers to the manifest file
+  std::deque<ManifestWriter*> manifest_writers_;
+
+  // Current size of manifest file
+  uint64_t manifest_file_size_;
+
+  std::vector<ObsoleteFileInfo> obsolete_files_;
+  std::vector<ObsoleteBlobFileInfo> obsolete_blob_files_;
+  std::vector<std::string> obsolete_manifests_;
+
+  // env options for all reads and writes except compactions
+  FileOptions file_options_;
+
+  BlockCacheTracer* const block_cache_tracer_;
+
+  // Store the IO status when Manifest is written
+  IOStatus io_status_;
+
+  std::shared_ptr<IOTracer> io_tracer_;
+
+  std::string db_session_id_;
+
+ private:
+  // REQUIRES db mutex at beginning. may release and re-acquire db mutex
+  Status ProcessManifestWrites(std::deque<ManifestWriter>& writers,
+                               InstrumentedMutex* mu,
+                               FSDirectory* dir_contains_current_file,
+                               bool new_descriptor_log,
+                               const ColumnFamilyOptions* new_cf_options);
+
+  void LogAndApplyCFHelper(VersionEdit* edit,
+                           SequenceNumber* max_last_sequence);
+  Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
+                           VersionEdit* edit, SequenceNumber* max_last_sequence,
+                           InstrumentedMutex* mu);
+};
+
+// ReactiveVersionSet represents a collection of versions of the column
+// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
+// need to replay the MANIFEST (description log in older terms) in order to
+// reconstruct and install versions.
+class ReactiveVersionSet : public VersionSet {
+ public:
+  ReactiveVersionSet(const std::string& dbname,
+                     const ImmutableDBOptions* _db_options,
+                     const FileOptions& _file_options, Cache* table_cache,
+                     WriteBufferManager* write_buffer_manager,
+                     WriteController* write_controller,
+                     const std::shared_ptr<IOTracer>& io_tracer);
+
+  ~ReactiveVersionSet() override;
+
+  Status ReadAndApply(
+      InstrumentedMutex* mu,
+      std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+      Status* manifest_read_status,
+      std::unordered_set<ColumnFamilyData*>* cfds_changed);
+
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+                 std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+                 std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+                 std::unique_ptr<Status>* manifest_reader_status);
+#ifndef NDEBUG
+  uint64_t TEST_read_edits_in_atomic_group() const;
+#endif  //! NDEBUG
+
+  std::vector<VersionEdit>& replay_buffer();
+
+ protected:
+  // REQUIRES db mutex
+  Status ApplyOneVersionEditToBuilder(
+      VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
+      VersionEdit* version_edit);
+
+  Status MaybeSwitchManifest(
+      log::Reader::Reporter* reporter,
+      std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
+
+ private:
+  std::unique_ptr<ManifestTailer> manifest_tailer_;
+
+  using VersionSet::LogAndApply;
+  using VersionSet::Recover;
+
+  Status LogAndApply(
+      const autovector<ColumnFamilyData*>& /*cfds*/,
+      const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/,
+      const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
+      InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/,
+      bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/,
+      const std::vector<std::function<void(const Status&)>>& /*manifest_wcbs*/)
+      override {
+    return Status::NotSupported("not supported in reactive mode");
+  }
+
+  // No copy allowed
+  ReactiveVersionSet(const ReactiveVersionSet&);
+  ReactiveVersionSet& operator=(const ReactiveVersionSet&);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set_sync_and_async.h b/src/rocksdb/db/version_set_sync_and_async.h
new file mode 100644
index 000000000..755585990
--- /dev/null
+++ b/src/rocksdb/db/version_set_sync_and_async.h
@@ -0,0 +1,151 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+    (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+
+namespace ROCKSDB_NAMESPACE {
+
+// Lookup a batch of keys in a single SST file
+DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
+(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
+ bool skip_filters, bool skip_range_deletions, FdWithKeyRange* f,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
+ Cache::Handle* table_handle, uint64_t& num_filter_read,
+ uint64_t& num_index_read, uint64_t& num_sst_read) {
+  bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+                       get_perf_context()->per_level_perf_context_enabled;
+
+  Status s;
+  StopWatchNano timer(clock_, timer_enabled /* auto_start */);
+  s = CO_AWAIT(table_cache_->MultiGet)(
+      read_options, *internal_comparator(), *f->file_metadata, &file_range,
+      mutable_cf_options_.prefix_extractor,
+      cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters,
+      skip_range_deletions, hit_file_level, table_handle);
+  // TODO: examine the behavior for corrupted key
+  if (timer_enabled) {
+    PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+                              hit_file_level);
+  }
+  if (!s.ok()) {
+    // TODO: Set status for individual keys appropriately
+    for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+      *iter->s = s;
+      file_range.MarkKeyDone(iter);
+    }
+    CO_RETURN s;
+  }
+  uint64_t batch_size = 0;
+  for (auto iter = file_range.begin(); s.ok() && iter != file_range.end();
+       ++iter) {
+    GetContext& get_context = *iter->get_context;
+    Status* status = iter->s;
+    // The Status in the KeyContext takes precedence over GetContext state
+    // Status may be an error if there were any IO errors in the table
+    // reader. We never expect Status to be NotFound(), as that is
+    // determined by get_context
+    assert(!status->IsNotFound());
+    if (!status->ok()) {
+      file_range.MarkKeyDone(iter);
+      continue;
+    }
+
+    if (get_context.sample()) {
+      sample_file_read_inc(f->file_metadata);
+    }
+    batch_size++;
+    num_index_read += get_context.get_context_stats_.num_index_read;
+    num_filter_read += get_context.get_context_stats_.num_filter_read;
+    num_sst_read += get_context.get_context_stats_.num_sst_read;
+    // Reset these stats since they're specific to a level
+    get_context.get_context_stats_.num_index_read = 0;
+    get_context.get_context_stats_.num_filter_read = 0;
+    get_context.get_context_stats_.num_sst_read = 0;
+
+    // report the counters before returning
+    if (get_context.State() != GetContext::kNotFound &&
+        get_context.State() != GetContext::kMerge &&
+        db_statistics_ != nullptr) {
+      get_context.ReportCounters();
+    } else {
+      if (iter->max_covering_tombstone_seq > 0) {
+        // The remaining files we look at will only contain covered keys, so
+        // we stop here for this key
+        file_range.SkipKey(iter);
+      }
+    }
+    switch (get_context.State()) {
+      case GetContext::kNotFound:
+        // Keep searching in other files
+        break;
+      case GetContext::kMerge:
+        // TODO: update per-level perfcontext user_key_return_count for kMerge
+        break;
+      case GetContext::kFound:
+        if (hit_file_level == 0) {
+          RecordTick(db_statistics_, GET_HIT_L0);
+        } else if (hit_file_level == 1) {
+          RecordTick(db_statistics_, GET_HIT_L1);
+        } else if (hit_file_level >= 2) {
+          RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+        }
+
+        PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, hit_file_level);
+
+        file_range.MarkKeyDone(iter);
+
+        if (iter->is_blob_index) {
+          if (iter->value) {
+            TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex",
+                                     &(*iter));
+
+            const Slice& blob_index_slice = *(iter->value);
+            BlobIndex blob_index;
+            Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
+            if (tmp_s.ok()) {
+              const uint64_t blob_file_num = blob_index.file_number();
+              blob_ctxs[blob_file_num].emplace_back(
+                  std::make_pair(blob_index, std::cref(*iter)));
+            } else {
+              *(iter->s) = tmp_s;
+            }
+          }
+        } else {
+          file_range.AddValueSize(iter->value->size());
+          if (file_range.GetValueSize() > read_options.value_size_soft_limit) {
+            s = Status::Aborted();
+            break;
+          }
+        }
+        continue;
+      case GetContext::kDeleted:
+        // Use empty error message for speed
+        *status = Status::NotFound();
+        file_range.MarkKeyDone(iter);
+        continue;
+      case GetContext::kCorrupt:
+        *status =
+            Status::Corruption("corrupted key for ", iter->lkey->user_key());
+        file_range.MarkKeyDone(iter);
+        continue;
+      case GetContext::kUnexpectedBlobIndex:
+        ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+        *status = Status::NotSupported(
+            "Encounter unexpected blob index. Please open DB with "
+            "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+        file_range.MarkKeyDone(iter);
+        continue;
+    }
+  }
+
+  RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
+  CO_RETURN s;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/db/version_set_test.cc b/src/rocksdb/db/version_set_test.cc
new file mode 100644
index 000000000..7d17406c1
--- /dev/null
+++ b/src/rocksdb/db/version_set_test.cc
@@ -0,0 +1,3587 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/log_writer.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/file_system.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class GenerateLevelFilesBriefTest : public testing::Test {
+ public:
+  std::vector<FileMetaData*> files_;
+  LevelFilesBrief file_level_;
+  Arena arena_;
+
+  GenerateLevelFilesBriefTest() {}
+
+  ~GenerateLevelFilesBriefTest() override {
+    for (size_t i = 0; i < files_.size(); i++) {
+      delete files_[i];
+    }
+  }
+
+  void Add(const char* smallest, const char* largest,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    FileMetaData* f = new FileMetaData(
+        files_.size() + 1, 0, 0,
+        InternalKey(smallest, smallest_seq, kTypeValue),
+        InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+        largest_seq, /* marked_for_compact */ false, Temperature::kUnknown,
+        kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+        kUnknownFileCreationTime, kUnknownFileChecksum,
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    files_.push_back(f);
+  }
+
+  int Compare() {
+    int diff = 0;
+    for (size_t i = 0; i < files_.size(); i++) {
+      if (file_level_.files[i].fd.GetNumber() != files_[i]->fd.GetNumber()) {
+        diff++;
+      }
+    }
+    return diff;
+  }
+};
+
+TEST_F(GenerateLevelFilesBriefTest, Empty) {
+  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+  ASSERT_EQ(0u, file_level_.num_files);
+  ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Single) {
+  Add("p", "q");
+  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+  ASSERT_EQ(1u, file_level_.num_files);
+  ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Multiple) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+  ASSERT_EQ(4u, file_level_.num_files);
+  ASSERT_EQ(0, Compare());
+}
+
+class CountingLogger : public Logger {
+ public:
+  CountingLogger() : log_count(0) {}
+  using Logger::Logv;
+  void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+  int log_count;
+};
+
+Options GetOptionsWithNumLevels(int num_levels,
+                                std::shared_ptr<CountingLogger> logger) {
+  Options opt;
+  opt.num_levels = num_levels;
+  opt.info_log = logger;
+  return opt;
+}
+
+class VersionStorageInfoTestBase : public testing::Test {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  std::shared_ptr<CountingLogger> logger_;
+  Options options_;
+  ImmutableOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  VersionStorageInfo vstorage_;
+
+  InternalKey GetInternalKey(const char* ukey,
+                             SequenceNumber smallest_seq = 100) {
+    return InternalKey(ukey, smallest_seq, kTypeValue);
+  }
+
+  explicit VersionStorageInfoTestBase(const Comparator* ucmp)
+      : ucmp_(ucmp),
+        icmp_(ucmp_),
+        logger_(new CountingLogger()),
+        options_(GetOptionsWithNumLevels(6, logger_)),
+        ioptions_(options_),
+        mutable_cf_options_(options_),
+        vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel,
+                  /*src_vstorage=*/nullptr,
+                  /*_force_consistency_checks=*/false) {}
+
+  ~VersionStorageInfoTestBase() override {
+    for (int i = 0; i < vstorage_.num_levels(); ++i) {
+      for (auto* f : vstorage_.LevelFiles(i)) {
+        if (--f->refs == 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0,
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+    constexpr SequenceNumber dummy_seq = 0;
+
+    Add(level, file_number, GetInternalKey(smallest, dummy_seq),
+        GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number);
+  }
+
+  void Add(int level, uint32_t file_number, const InternalKey& smallest,
+           const InternalKey& largest, uint64_t file_size = 0,
+           uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+    assert(level < vstorage_.num_levels());
+    FileMetaData* f = new FileMetaData(
+        file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
+        /* largest_seq */ 0, /* marked_for_compact */ false,
+        Temperature::kUnknown, oldest_blob_file_number,
+        kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    f->compensated_file_size = file_size;
+    vstorage_.AddFile(level, f);
+  }
+
+  void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
+               uint64_t total_blob_bytes,
+               BlobFileMetaData::LinkedSsts linked_ssts,
+               uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, total_blob_count, total_blob_bytes,
+        /* checksum_method */ std::string(),
+        /* checksum_value */ std::string());
+    auto meta =
+        BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
+                                 garbage_blob_count, garbage_blob_bytes);
+
+    vstorage_.AddBlobFile(std::move(meta));
+  }
+
+  void UpdateVersionStorageInfo() {
+    vstorage_.PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+    vstorage_.SetFinalized();
+  }
+
+  std::string GetOverlappingFiles(int level, const InternalKey& begin,
+                                  const InternalKey& end) {
+    std::vector<FileMetaData*> inputs;
+    vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs);
+
+    std::string result;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      if (i > 0) {
+        result += ",";
+      }
+      AppendNumberTo(&result, inputs[i]->fd.GetNumber());
+    }
+    return result;
+  }
+};
+
+class VersionStorageInfoTest : public VersionStorageInfoTestBase {
+ public:
+  VersionStorageInfoTest() : VersionStorageInfoTestBase(BytewiseComparator()) {}
+
+  ~VersionStorageInfoTest() override {}
+};
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.max_bytes_for_level_base = 10;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+  Add(4, 100U, "1", "2", 100U);
+  Add(5, 101U, "1", "2", 100U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 10U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 50U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 250U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1250U);
+
+  ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_1) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+  Add(5, 1U, "1", "2", 500U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(vstorage_.base_level(), 5);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_2) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+  Add(5, 1U, "1", "2", 500U);
+  Add(5, 2U, "3", "4", 550U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+  ASSERT_EQ(vstorage_.base_level(), 4);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_3) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+  Add(5, 1U, "1", "2", 500U);
+  Add(5, 2U, "3", "4", 550U);
+  Add(4, 3U, "3", "4", 550U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+  ASSERT_EQ(vstorage_.base_level(), 4);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_4) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+  Add(5, 1U, "1", "2", 500U);
+  Add(5, 2U, "3", "4", 550U);
+  Add(4, 3U, "3", "4", 550U);
+  Add(3, 4U, "3", "4", 250U);
+  Add(3, 5U, "5", "7", 300U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(1, logger_->log_count);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 1000U);
+  ASSERT_EQ(vstorage_.base_level(), 3);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_5) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+  Add(5, 1U, "1", "2", 500U);
+  Add(5, 2U, "3", "4", 550U);
+  Add(4, 3U, "3", "4", 550U);
+  Add(3, 4U, "3", "4", 250U);
+  Add(3, 5U, "5", "7", 300U);
+  Add(1, 6U, "3", "4", 5U);
+  Add(1, 7U, "8", "9", 5U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(1, logger_->log_count);
+  ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U);
+  ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 1000U);
+  ASSERT_EQ(vstorage_.base_level(), 1);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLotsOfData) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 100;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 2;
+
+  Add(0, 1U, "1", "2", 50U);
+  Add(1, 2U, "1", "2", 50U);
+  Add(2, 3U, "1", "2", 500U);
+  Add(3, 4U, "1", "2", 500U);
+  Add(4, 5U, "1", "2", 1700U);
+  Add(5, 6U, "1", "2", 500U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 800U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 400U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 200U);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 100U);
+  ASSERT_EQ(vstorage_.base_level(), 1);
+  ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
+  uint64_t kOneGB = 1000U * 1000U * 1000U;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 10U * kOneGB;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+  Add(0, 1U, "1", "2", 50U);
+  Add(3, 4U, "1", "2", 32U * kOneGB);
+  Add(4, 5U, "1", "2", 500U * kOneGB);
+  Add(5, 6U, "1", "2", 3000U * kOneGB);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kOneGB);
+  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 10U * kOneGB);
+  ASSERT_EQ(vstorage_.base_level(), 2);
+  ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 40000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+  Add(0, 1U, "1", "2", 10000U);
+  Add(0, 2U, "1", "2", 10000U);
+  Add(0, 3U, "1", "2", 10000U);
+
+  Add(5, 4U, "1", "2", 1286250U);
+  Add(4, 5U, "1", "2", 200000U);
+  Add(3, 6U, "1", "2", 40000U);
+  Add(2, 7U, "1", "2", 8000U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(2, vstorage_.base_level());
+  // level multiplier should be 3.5
+  ASSERT_EQ(vstorage_.level_multiplier(), 5.0);
+  ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2));
+  ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+  ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Only L0 hits compaction.
+  ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 10000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+
+  Add(0, 11U, "1", "2", 10000U);
+  Add(0, 12U, "1", "2", 10000U);
+  Add(0, 13U, "1", "2", 10000U);
+
+  // Level size should be around 10,000, 10,290, 51,450, 257,250
+  Add(5, 4U, "1", "2", 1286250U);
+  Add(4, 5U, "1", "2", 258000U);  // unadjusted score 1.003
+  Add(3, 6U, "1", "2", 53000U);   // unadjusted score 1.03
+  Add(2, 7U, "1", "2", 20000U);   // unadjusted score 1.94
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(1, vstorage_.base_level());
+  ASSERT_EQ(10000U, vstorage_.MaxBytesForLevel(1));
+  ASSERT_EQ(10290U, vstorage_.MaxBytesForLevel(2));
+  ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+  ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Although L2 and l3 have higher unadjusted compaction score, considering
+  // a relatively large L0 being compacted down soon, L4 is picked up for
+  // compaction.
+  // L0 is still picked up for oversizing.
+  ASSERT_EQ(0, vstorage_.CompactionScoreLevel(0));
+  ASSERT_EQ(4, vstorage_.CompactionScoreLevel(1));
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.max_bytes_for_level_base = 20000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 5;
+
+  Add(0, 11U, "1", "2", 2500U);
+  Add(0, 12U, "1", "2", 2500U);
+  Add(0, 13U, "1", "2", 2500U);
+  Add(0, 14U, "1", "2", 2500U);
+
+  // Level size should be around 20,000, 53000, 258000
+  Add(5, 4U, "1", "2", 1286250U);
+  Add(4, 5U, "1", "2", 260000U);  // Unadjusted score 1.01, adjusted about 4.3
+  Add(3, 6U, "1", "2", 85000U);   // Unadjusted score 1.42, adjusted about 11.6
+  Add(2, 7U, "1", "2", 30000);    // Unadjusted score 1.5, adjusted about 10.0
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(0, logger_->log_count);
+  ASSERT_EQ(2, vstorage_.base_level());
+  ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2));
+
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Although L2 has higher unadjusted compaction score, considering
+  // a relatively large L0 being compacted down soon, L3 is picked up for
+  // compaction.
+
+  ASSERT_EQ(3, vstorage_.CompactionScoreLevel(0));
+  ASSERT_EQ(2, vstorage_.CompactionScoreLevel(1));
+  ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2));
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
+  // Test whether the overlaps are detected as expected
+  Add(1, 1U, "4", "7", 1U);  // Perfect overlap with last level
+  Add(2, 2U, "3", "5", 1U);  // Partial overlap with last level
+  Add(2, 3U, "6", "8", 1U);  // Partial overlap with last level
+  Add(3, 4U, "1", "9", 1U);  // Contains range of last level
+  Add(4, 5U, "4", "5", 1U);  // Inside range of last level
+  Add(4, 6U, "6", "7", 1U);  // Inside range of last level
+  Add(5, 7U, "4", "7", 10U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) {
+  Add(0, 1U, "9", "9", 1U);  // Level 0 is not ordered
+  Add(0, 2U, "5", "6", 1U);  // Ignored because of [5,6] in l1
+  Add(1, 3U, "1", "2", 1U);  // Ignored because of [2,3] in l2
+  Add(1, 4U, "3", "4", 1U);  // Ignored because of [2,3] in l2
+  Add(1, 5U, "5", "6", 1U);
+  Add(2, 6U, "2", "3", 1U);
+  Add(3, 7U, "7", "8", 1U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, GetOverlappingInputs) {
+  // Two files that overlap at the range deletion tombstone sentinel.
+  Add(1, 1U, {"a", 0, kTypeValue},
+      {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1);
+  Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1);
+  // Two files that overlap at the same user key.
+  Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1);
+  Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1);
+  // Two files that do not overlap.
+  Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1);
+  Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ("1,2",
+            GetOverlappingFiles(1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue}));
+  ASSERT_EQ("1",
+            GetOverlappingFiles(1, {"a", 0, kTypeValue},
+                                {"b", kMaxSequenceNumber, kTypeRangeDeletion}));
+  ASSERT_EQ("2", GetOverlappingFiles(1, {"b", kMaxSequenceNumber, kTypeValue},
+                                     {"c", 0, kTypeValue}));
+  ASSERT_EQ("3,4",
+            GetOverlappingFiles(1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue}));
+  ASSERT_EQ("3",
+            GetOverlappingFiles(1, {"d", 0, kTypeValue},
+                                {"e", kMaxSequenceNumber, kTypeRangeDeletion}));
+  ASSERT_EQ("3,4", GetOverlappingFiles(1, {"e", kMaxSequenceNumber, kTypeValue},
+                                       {"f", 0, kTypeValue}));
+  ASSERT_EQ("3,4",
+            GetOverlappingFiles(1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}));
+  ASSERT_EQ("5",
+            GetOverlappingFiles(1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}));
+  ASSERT_EQ("6",
+            GetOverlappingFiles(1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}));
+}
+
+TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) {
+  Add(0, 11U, "1", "2", 5000U);
+  Add(0, 12U, "1", "2", 5000U);
+
+  Add(2, 7U, "1", "2", 8000U);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(vstorage_.GetFileLocation(11U),
+            VersionStorageInfo::FileLocation(0, 0));
+  ASSERT_NE(vstorage_.GetFileMetaDataByNumber(11U), nullptr);
+
+  ASSERT_EQ(vstorage_.GetFileLocation(12U),
+            VersionStorageInfo::FileLocation(0, 1));
+  ASSERT_NE(vstorage_.GetFileMetaDataByNumber(12U), nullptr);
+
+  ASSERT_EQ(vstorage_.GetFileLocation(7U),
+            VersionStorageInfo::FileLocation(2, 0));
+  ASSERT_NE(vstorage_.GetFileMetaDataByNumber(7U), nullptr);
+
+  ASSERT_FALSE(vstorage_.GetFileLocation(999U).IsValid());
+  ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr);
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) {
+  // No SST or blob files in VersionStorageInfo
+  UpdateVersionStorageInfo();
+
+  constexpr double age_cutoff = 0.5;
+  constexpr double force_threshold = 0.75;
+  vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+  ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) {
+  // Test the edge case when all blob files are part of the oldest batch.
+  // We have one L0 SST file #1, and four blob files #10, #11, #12, and #13.
+  // The oldest blob file used by SST #1 is blob file #10.
+
+  constexpr int level = 0;
+
+  constexpr uint64_t sst = 1;
+
+  constexpr uint64_t first_blob = 10;
+  constexpr uint64_t second_blob = 11;
+  constexpr uint64_t third_blob = 12;
+  constexpr uint64_t fourth_blob = 13;
+
+  {
+    constexpr char smallest[] = "bar1";
+    constexpr char largest[] = "foo1";
+    constexpr uint64_t file_size = 1000;
+
+    Add(level, sst, smallest, largest, file_size, first_blob);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 10;
+    constexpr uint64_t total_blob_bytes = 100000;
+    constexpr uint64_t garbage_blob_count = 2;
+    constexpr uint64_t garbage_blob_bytes = 15000;
+
+    AddBlob(first_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{sst}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 4;
+    constexpr uint64_t total_blob_bytes = 400000;
+    constexpr uint64_t garbage_blob_count = 3;
+    constexpr uint64_t garbage_blob_bytes = 235000;
+
+    AddBlob(second_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 20;
+    constexpr uint64_t total_blob_bytes = 1000000;
+    constexpr uint64_t garbage_blob_count = 8;
+    constexpr uint64_t garbage_blob_bytes = 400000;
+
+    AddBlob(third_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 128;
+    constexpr uint64_t total_blob_bytes = 1000000;
+    constexpr uint64_t garbage_blob_count = 67;
+    constexpr uint64_t garbage_blob_bytes = 600000;
+
+    AddBlob(fourth_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  UpdateVersionStorageInfo();
+
+  assert(vstorage_.num_levels() > 0);
+  const auto& level_files = vstorage_.LevelFiles(level);
+
+  assert(level_files.size() == 1);
+  assert(level_files[0] && level_files[0]->fd.GetNumber() == sst);
+
+  // No blob files eligible for GC due to the age cutoff
+
+  {
+    constexpr double age_cutoff = 0.1;
+    constexpr double force_threshold = 0.0;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Part of the oldest batch of blob files (specifically, #12 and #13) is
+  // ineligible for GC due to the age cutoff
+
+  {
+    constexpr double age_cutoff = 0.5;
+    constexpr double force_threshold = 0.0;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+  // is below threshold
+
+  {
+    constexpr double age_cutoff = 1.0;
+    constexpr double force_threshold = 0.6;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+  // meets threshold
+
+  {
+    constexpr double age_cutoff = 1.0;
+    constexpr double force_threshold = 0.5;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+    ASSERT_EQ(ssts_to_be_compacted.size(), 1);
+
+    const autovector<std::pair<int, FileMetaData*>>
+        expected_ssts_to_be_compacted{{level, level_files[0]}};
+
+    ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+  }
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) {
+  // Add three L0 SSTs (1, 2, and 3) and four blob files (10, 11, 12, and 13).
+  // The first two SSTs have the same oldest blob file, namely, the very oldest
+  // one (10), while the third SST's oldest blob file reference points to the
+  // third blob file (12). Thus, the oldest batch of blob files contains the
+  // first two blob files 10 and 11, and assuming they are eligible for GC based
+  // on the age cutoff, compacting away the SSTs 1 and 2 will eliminate them.
+
+  constexpr int level = 0;
+
+  constexpr uint64_t first_sst = 1;
+  constexpr uint64_t second_sst = 2;
+  constexpr uint64_t third_sst = 3;
+
+  constexpr uint64_t first_blob = 10;
+  constexpr uint64_t second_blob = 11;
+  constexpr uint64_t third_blob = 12;
+  constexpr uint64_t fourth_blob = 13;
+
+  {
+    constexpr char smallest[] = "bar1";
+    constexpr char largest[] = "foo1";
+    constexpr uint64_t file_size = 1000;
+
+    Add(level, first_sst, smallest, largest, file_size, first_blob);
+  }
+
+  {
+    constexpr char smallest[] = "bar2";
+    constexpr char largest[] = "foo2";
+    constexpr uint64_t file_size = 2000;
+
+    Add(level, second_sst, smallest, largest, file_size, first_blob);
+  }
+
+  {
+    constexpr char smallest[] = "bar3";
+    constexpr char largest[] = "foo3";
+    constexpr uint64_t file_size = 3000;
+
+    Add(level, third_sst, smallest, largest, file_size, third_blob);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 10;
+    constexpr uint64_t total_blob_bytes = 100000;
+    constexpr uint64_t garbage_blob_count = 2;
+    constexpr uint64_t garbage_blob_bytes = 15000;
+
+    AddBlob(first_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{first_sst, second_sst},
+            garbage_blob_count, garbage_blob_bytes);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 4;
+    constexpr uint64_t total_blob_bytes = 400000;
+    constexpr uint64_t garbage_blob_count = 3;
+    constexpr uint64_t garbage_blob_bytes = 235000;
+
+    AddBlob(second_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 20;
+    constexpr uint64_t total_blob_bytes = 1000000;
+    constexpr uint64_t garbage_blob_count = 8;
+    constexpr uint64_t garbage_blob_bytes = 123456;
+
+    AddBlob(third_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  {
+    constexpr uint64_t total_blob_count = 128;
+    constexpr uint64_t total_blob_bytes = 789012345;
+    constexpr uint64_t garbage_blob_count = 67;
+    constexpr uint64_t garbage_blob_bytes = 88888888;
+
+    AddBlob(fourth_blob, total_blob_count, total_blob_bytes,
+            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+            garbage_blob_bytes);
+  }
+
+  UpdateVersionStorageInfo();
+
+  assert(vstorage_.num_levels() > 0);
+  const auto& level_files = vstorage_.LevelFiles(level);
+
+  assert(level_files.size() == 3);
+  assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst);
+  assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst);
+  assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst);
+
+  // No blob files eligible for GC due to the age cutoff
+
+  {
+    constexpr double age_cutoff = 0.1;
+    constexpr double force_threshold = 0.0;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Part of the oldest batch of blob files (specifically, the second file) is
+  // ineligible for GC due to the age cutoff
+
+  {
+    constexpr double age_cutoff = 0.25;
+    constexpr double force_threshold = 0.0;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+  // is below threshold
+
+  {
+    constexpr double age_cutoff = 0.5;
+    constexpr double force_threshold = 0.6;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+  // meets threshold
+
+  {
+    constexpr double age_cutoff = 0.5;
+    constexpr double force_threshold = 0.5;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+    ASSERT_EQ(ssts_to_be_compacted.size(), 2);
+
+    std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
+              [](const std::pair<int, FileMetaData*>& lhs,
+                 const std::pair<int, FileMetaData*>& rhs) {
+                assert(lhs.second);
+                assert(rhs.second);
+                return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
+              });
+
+    const autovector<std::pair<int, FileMetaData*>>
+        expected_ssts_to_be_compacted{{level, level_files[0]},
+                                      {level, level_files[1]}};
+
+    ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+    ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
+  }
+
+  // Now try the last two cases again with a greater than necessary age cutoff
+
+  // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+  // is below threshold
+
+  {
+    constexpr double age_cutoff = 0.75;
+    constexpr double force_threshold = 0.6;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+  }
+
+  // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+  // meets threshold
+
+  {
+    constexpr double age_cutoff = 0.75;
+    constexpr double force_threshold = 0.5;
+    vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+    auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+    ASSERT_EQ(ssts_to_be_compacted.size(), 2);
+
+    std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
+              [](const std::pair<int, FileMetaData*>& lhs,
+                 const std::pair<int, FileMetaData*>& rhs) {
+                assert(lhs.second);
+                assert(rhs.second);
+                return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
+              });
+
+    const autovector<std::pair<int, FileMetaData*>>
+        expected_ssts_to_be_compacted{{level, level_files[0]},
+                                      {level, level_files[1]}};
+
+    ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+    ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
+  }
+}
+
+class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase {
+ public:
+  VersionStorageInfoTimestampTest()
+      : VersionStorageInfoTestBase(test::BytewiseComparatorWithU64TsWrapper()) {
+  }
+  ~VersionStorageInfoTimestampTest() override {}
+  std::string Timestamp(uint64_t ts) const {
+    std::string ret;
+    PutFixed64(&ret, ts);
+    return ret;
+  }
+  std::string PackUserKeyAndTimestamp(const Slice& ukey, uint64_t ts) const {
+    std::string ret;
+    ret.assign(ukey.data(), ukey.size());
+    PutFixed64(&ret, ts);
+    return ret;
+  }
+};
+
+TEST_F(VersionStorageInfoTimestampTest, GetOverlappingInputs) {
+  Add(/*level=*/1, /*file_number=*/1, /*smallest=*/
+      {PackUserKeyAndTimestamp("a", /*ts=*/9), /*s=*/0, kTypeValue},
+      /*largest=*/
+      {PackUserKeyAndTimestamp("a", /*ts=*/8), /*s=*/0, kTypeValue},
+      /*file_size=*/100);
+  Add(/*level=*/1, /*file_number=*/2, /*smallest=*/
+      {PackUserKeyAndTimestamp("a", /*ts=*/5), /*s=*/0, kTypeValue},
+      /*largest=*/
+      {PackUserKeyAndTimestamp("b", /*ts=*/10), /*s=*/0, kTypeValue},
+      /*file_size=*/100);
+  Add(/*level=*/1, /*file_number=*/3, /*smallest=*/
+      {PackUserKeyAndTimestamp("c", /*ts=*/12), /*s=*/0, kTypeValue},
+      /*largest=*/
+      {PackUserKeyAndTimestamp("d", /*ts=*/1), /*s=*/0, kTypeValue},
+      /*file_size=*/100);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(
+      "1,2",
+      GetOverlappingFiles(
+          /*level=*/1,
+          {PackUserKeyAndTimestamp("a", /*ts=*/12), /*s=*/0, kTypeValue},
+          {PackUserKeyAndTimestamp("a", /*ts=*/11), /*s=*/0, kTypeValue}));
+  ASSERT_EQ("3",
+            GetOverlappingFiles(
+                /*level=*/1,
+                {PackUserKeyAndTimestamp("c", /*ts=*/15), /*s=*/0, kTypeValue},
+                {PackUserKeyAndTimestamp("c", /*ts=*/2), /*s=*/0, kTypeValue}));
+}
+
+class FindLevelFileTest : public testing::Test {
+ public:
+  LevelFilesBrief file_level_;
+  bool disjoint_sorted_files_;
+  Arena arena_;
+
+  FindLevelFileTest() : disjoint_sorted_files_(true) {}
+
+  ~FindLevelFileTest() override {}
+
+  void LevelFileInit(size_t num = 0) {
+    char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange));
+    file_level_.files = new (mem) FdWithKeyRange[num];
+    file_level_.num_files = 0;
+  }
+
+  void Add(const char* smallest, const char* largest,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    InternalKey smallest_key = InternalKey(smallest, smallest_seq, kTypeValue);
+    InternalKey largest_key = InternalKey(largest, largest_seq, kTypeValue);
+
+    Slice smallest_slice = smallest_key.Encode();
+    Slice largest_slice = largest_key.Encode();
+
+    char* mem =
+        arena_.AllocateAligned(smallest_slice.size() + largest_slice.size());
+    memcpy(mem, smallest_slice.data(), smallest_slice.size());
+    memcpy(mem + smallest_slice.size(), largest_slice.data(),
+           largest_slice.size());
+
+    // add to file_level_
+    size_t num = file_level_.num_files;
+    auto& file = file_level_.files[num];
+    file.fd = FileDescriptor(num + 1, 0, 0);
+    file.smallest_key = Slice(mem, smallest_slice.size());
+    file.largest_key = Slice(mem + smallest_slice.size(), largest_slice.size());
+    file_level_.num_files++;
+  }
+
+  int Find(const char* key) {
+    InternalKey target(key, 100, kTypeValue);
+    InternalKeyComparator cmp(BytewiseComparator());
+    return FindFile(cmp, file_level_, target.Encode());
+  }
+
+  bool Overlaps(const char* smallest, const char* largest) {
+    InternalKeyComparator cmp(BytewiseComparator());
+    Slice s(smallest != nullptr ? smallest : "");
+    Slice l(largest != nullptr ? largest : "");
+    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, file_level_,
+                                 (smallest != nullptr ? &s : nullptr),
+                                 (largest != nullptr ? &l : nullptr));
+  }
+};
+
+TEST_F(FindLevelFileTest, LevelEmpty) {
+  LevelFileInit(0);
+
+  ASSERT_EQ(0, Find("foo"));
+  ASSERT_TRUE(!Overlaps("a", "z"));
+  ASSERT_TRUE(!Overlaps(nullptr, "z"));
+  ASSERT_TRUE(!Overlaps("a", nullptr));
+  ASSERT_TRUE(!Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelSingle) {
+  LevelFileInit(1);
+
+  Add("p", "q");
+  ASSERT_EQ(0, Find("a"));
+  ASSERT_EQ(0, Find("p"));
+  ASSERT_EQ(0, Find("p1"));
+  ASSERT_EQ(0, Find("q"));
+  ASSERT_EQ(1, Find("q1"));
+  ASSERT_EQ(1, Find("z"));
+
+  ASSERT_TRUE(!Overlaps("a", "b"));
+  ASSERT_TRUE(!Overlaps("z1", "z2"));
+  ASSERT_TRUE(Overlaps("a", "p"));
+  ASSERT_TRUE(Overlaps("a", "q"));
+  ASSERT_TRUE(Overlaps("a", "z"));
+  ASSERT_TRUE(Overlaps("p", "p1"));
+  ASSERT_TRUE(Overlaps("p", "q"));
+  ASSERT_TRUE(Overlaps("p", "z"));
+  ASSERT_TRUE(Overlaps("p1", "p2"));
+  ASSERT_TRUE(Overlaps("p1", "z"));
+  ASSERT_TRUE(Overlaps("q", "q"));
+  ASSERT_TRUE(Overlaps("q", "q1"));
+
+  ASSERT_TRUE(!Overlaps(nullptr, "j"));
+  ASSERT_TRUE(!Overlaps("r", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, "p"));
+  ASSERT_TRUE(Overlaps(nullptr, "p1"));
+  ASSERT_TRUE(Overlaps("q", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelMultiple) {
+  LevelFileInit(4);
+
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_EQ(0, Find("100"));
+  ASSERT_EQ(0, Find("150"));
+  ASSERT_EQ(0, Find("151"));
+  ASSERT_EQ(0, Find("199"));
+  ASSERT_EQ(0, Find("200"));
+  ASSERT_EQ(1, Find("201"));
+  ASSERT_EQ(1, Find("249"));
+  ASSERT_EQ(1, Find("250"));
+  ASSERT_EQ(2, Find("251"));
+  ASSERT_EQ(2, Find("299"));
+  ASSERT_EQ(2, Find("300"));
+  ASSERT_EQ(2, Find("349"));
+  ASSERT_EQ(2, Find("350"));
+  ASSERT_EQ(3, Find("351"));
+  ASSERT_EQ(3, Find("400"));
+  ASSERT_EQ(3, Find("450"));
+  ASSERT_EQ(4, Find("451"));
+
+  ASSERT_TRUE(!Overlaps("100", "149"));
+  ASSERT_TRUE(!Overlaps("251", "299"));
+  ASSERT_TRUE(!Overlaps("451", "500"));
+  ASSERT_TRUE(!Overlaps("351", "399"));
+
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) {
+  LevelFileInit(4);
+
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_TRUE(!Overlaps(nullptr, "149"));
+  ASSERT_TRUE(!Overlaps("451", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, "150"));
+  ASSERT_TRUE(Overlaps(nullptr, "199"));
+  ASSERT_TRUE(Overlaps(nullptr, "200"));
+  ASSERT_TRUE(Overlaps(nullptr, "201"));
+  ASSERT_TRUE(Overlaps(nullptr, "400"));
+  ASSERT_TRUE(Overlaps(nullptr, "800"));
+  ASSERT_TRUE(Overlaps("100", nullptr));
+  ASSERT_TRUE(Overlaps("200", nullptr));
+  ASSERT_TRUE(Overlaps("449", nullptr));
+  ASSERT_TRUE(Overlaps("450", nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) {
+  LevelFileInit(1);
+
+  Add("200", "200", 5000, 3000);
+  ASSERT_TRUE(!Overlaps("199", "199"));
+  ASSERT_TRUE(!Overlaps("201", "300"));
+  ASSERT_TRUE(Overlaps("200", "200"));
+  ASSERT_TRUE(Overlaps("190", "200"));
+  ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlappingFiles) {
+  LevelFileInit(2);
+
+  Add("150", "600");
+  Add("400", "500");
+  disjoint_sorted_files_ = false;
+  ASSERT_TRUE(!Overlaps("100", "149"));
+  ASSERT_TRUE(!Overlaps("601", "700"));
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+  ASSERT_TRUE(Overlaps("450", "700"));
+  ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+class VersionSetTestBase {
+ public:
+  const static std::string kColumnFamilyName1;
+  const static std::string kColumnFamilyName2;
+  const static std::string kColumnFamilyName3;
+  int num_initial_edits_;
+
+  explicit VersionSetTestBase(const std::string& name)
+      : env_(nullptr),
+        dbname_(test::PerThreadDBPath(name)),
+        options_(),
+        db_options_(options_),
+        cf_options_(options_),
+        immutable_options_(db_options_, cf_options_),
+        mutable_cf_options_(cf_options_),
+        table_cache_(NewLRUCache(50000, 16)),
+        write_buffer_manager_(db_options_.db_write_buffer_size),
+        shutting_down_(false),
+        mock_table_factory_(std::make_shared<mock::MockTableFactory>()) {
+    EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_));
+    if (env_ == Env::Default() && getenv("MEM_ENV")) {
+      env_guard_.reset(NewMemEnv(Env::Default()));
+      env_ = env_guard_.get();
+    }
+    EXPECT_NE(nullptr, env_);
+
+    fs_ = env_->GetFileSystem();
+    EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr));
+
+    options_.env = env_;
+    db_options_.env = env_;
+    db_options_.fs = fs_;
+    immutable_options_.env = env_;
+    immutable_options_.fs = fs_;
+    immutable_options_.clock = env_->GetSystemClock().get();
+
+    versions_.reset(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    reactive_versions_ = std::make_shared<ReactiveVersionSet>(
+        dbname_, &db_options_, env_options_, table_cache_.get(),
+        &write_buffer_manager_, &write_controller_, nullptr);
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+  }
+
+  virtual ~VersionSetTestBase() {
+    if (getenv("KEEP_DB")) {
+      fprintf(stdout, "DB is still at %s\n", dbname_.c_str());
+    } else {
+      Options options;
+      options.env = env_;
+      EXPECT_OK(DestroyDB(dbname_, options));
+    }
+  }
+
+ protected:
+  virtual void PrepareManifest(
+      std::vector<ColumnFamilyDescriptor>* column_families,
+      SequenceNumber* last_seqno, std::unique_ptr<log::Writer>* log_writer) {
+    assert(column_families != nullptr);
+    assert(last_seqno != nullptr);
+    assert(log_writer != nullptr);
+    VersionEdit new_db;
+    if (db_options_.write_dbid_to_manifest) {
+      DBOptions tmp_db_options;
+      tmp_db_options.env = env_;
+      std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+      std::string db_id;
+      impl->GetDbIdentityFromIdentityFile(&db_id);
+      new_db.SetDBId(db_id);
+    }
+    new_db.SetLogNumber(0);
+    new_db.SetNextFile(2);
+    new_db.SetLastSequence(0);
+
+    const std::vector<std::string> cf_names = {
+        kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+        kColumnFamilyName3};
+    const int kInitialNumOfCfs = static_cast<int>(cf_names.size());
+    autovector<VersionEdit> new_cfs;
+    uint64_t last_seq = 1;
+    uint32_t cf_id = 1;
+    for (int i = 1; i != kInitialNumOfCfs; ++i) {
+      VersionEdit new_cf;
+      new_cf.AddColumnFamily(cf_names[i]);
+      new_cf.SetColumnFamily(cf_id++);
+      new_cf.SetLogNumber(0);
+      new_cf.SetNextFile(2);
+      new_cf.SetLastSequence(last_seq++);
+      new_cfs.emplace_back(new_cf);
+    }
+    *last_seqno = last_seq;
+    num_initial_edits_ = static_cast<int>(new_cfs.size() + 1);
+    std::unique_ptr<WritableFileWriter> file_writer;
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    const auto& fs = env_->GetFileSystem();
+    Status s = WritableFileWriter::Create(
+        fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+        nullptr);
+    ASSERT_OK(s);
+    {
+      log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = (*log_writer)->AddRecord(record);
+      for (const auto& e : new_cfs) {
+        record.clear();
+        e.EncodeTo(&record);
+        s = (*log_writer)->AddRecord(record);
+        ASSERT_OK(s);
+      }
+    }
+    ASSERT_OK(s);
+
+    cf_options_.table_factory = mock_table_factory_;
+    for (const auto& cf_name : cf_names) {
+      column_families->emplace_back(cf_name, cf_options_);
+    }
+  }
+
+  // Create DB with 3 column families.
+  void NewDB() {
+    SequenceNumber last_seqno;
+    std::unique_ptr<log::Writer> log_writer;
+    SetIdentityFile(env_, dbname_);
+    PrepareManifest(&column_families_, &last_seqno, &log_writer);
+    log_writer.reset();
+    // Make "CURRENT" file point to the new manifest file.
+    Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+    ASSERT_OK(s);
+
+    EXPECT_OK(versions_->Recover(column_families_, false));
+    EXPECT_EQ(column_families_.size(),
+              versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  }
+
+  void ReopenDB() {
+    versions_.reset(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    EXPECT_OK(versions_->Recover(column_families_, false));
+  }
+
+  void VerifyManifest(std::string* manifest_path) const {
+    assert(manifest_path != nullptr);
+    uint64_t manifest_file_number = 0;
+    Status s = versions_->GetCurrentManifestPath(
+        dbname_, fs_.get(), manifest_path, &manifest_file_number);
+    ASSERT_OK(s);
+    ASSERT_EQ(1, manifest_file_number);
+  }
+
+  Status LogAndApplyToDefaultCF(VersionEdit& edit) {
+    mutex_.Lock();
+    Status s =
+        versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                               mutable_cf_options_, &edit, &mutex_, nullptr);
+    mutex_.Unlock();
+    return s;
+  }
+
+  Status LogAndApplyToDefaultCF(
+      const autovector<std::unique_ptr<VersionEdit>>& edits) {
+    autovector<VersionEdit*> vedits;
+    for (auto& e : edits) {
+      vedits.push_back(e.get());
+    }
+    mutex_.Lock();
+    Status s =
+        versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                               mutable_cf_options_, vedits, &mutex_, nullptr);
+    mutex_.Unlock();
+    return s;
+  }
+
+  void CreateNewManifest() {
+    constexpr FSDirectory* db_directory = nullptr;
+    constexpr bool new_descriptor_log = true;
+    mutex_.Lock();
+    VersionEdit dummy;
+    ASSERT_OK(versions_->LogAndApply(
+        versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_,
+        &dummy, &mutex_, db_directory, new_descriptor_log));
+    mutex_.Unlock();
+  }
+
+  ColumnFamilyData* CreateColumnFamily(const std::string& cf_name,
+                                       const ColumnFamilyOptions& cf_options) {
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(cf_name);
+    uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+    new_cf.SetColumnFamily(new_id);
+    new_cf.SetLogNumber(0);
+    new_cf.SetComparatorName(cf_options.comparator->Name());
+    Status s;
+    mutex_.Lock();
+    s = versions_->LogAndApply(/*column_family_data=*/nullptr,
+                               MutableCFOptions(cf_options), &new_cf, &mutex_,
+                               /*db_directory=*/nullptr,
+                               /*new_descriptor_log=*/false, &cf_options);
+    mutex_.Unlock();
+    EXPECT_OK(s);
+    ColumnFamilyData* cfd =
+        versions_->GetColumnFamilySet()->GetColumnFamily(cf_name);
+    EXPECT_NE(nullptr, cfd);
+    return cfd;
+  }
+
+  Env* mem_env_;
+  Env* env_;
+  std::shared_ptr<Env> env_guard_;
+  std::shared_ptr<FileSystem> fs_;
+  const std::string dbname_;
+  EnvOptions env_options_;
+  Options options_;
+  ImmutableDBOptions db_options_;
+  ColumnFamilyOptions cf_options_;
+  ImmutableOptions immutable_options_;
+  MutableCFOptions mutable_cf_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  WriteBufferManager write_buffer_manager_;
+  std::shared_ptr<VersionSet> versions_;
+  std::shared_ptr<ReactiveVersionSet> reactive_versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+  std::vector<ColumnFamilyDescriptor> column_families_;
+};
+
+const std::string VersionSetTestBase::kColumnFamilyName1 = "alice";
+const std::string VersionSetTestBase::kColumnFamilyName2 = "bob";
+const std::string VersionSetTestBase::kColumnFamilyName3 = "charles";
+
+class VersionSetTest : public VersionSetTestBase, public testing::Test {
+ public:
+  VersionSetTest() : VersionSetTestBase("version_set_test") {}
+};
+
+TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) {
+  NewDB();
+  const int kGroupSize = 5;
+  autovector<VersionEdit> edits;
+  for (int i = 0; i != kGroupSize; ++i) {
+    edits.emplace_back(VersionEdit());
+  }
+  autovector<ColumnFamilyData*> cfds;
+  autovector<const MutableCFOptions*> all_mutable_cf_options;
+  autovector<autovector<VersionEdit*>> edit_lists;
+  for (int i = 0; i != kGroupSize; ++i) {
+    cfds.emplace_back(versions_->GetColumnFamilySet()->GetDefault());
+    all_mutable_cf_options.emplace_back(&mutable_cf_options_);
+    autovector<VersionEdit*> edit_list;
+    edit_list.emplace_back(&edits[i]);
+    edit_lists.emplace_back(edit_list);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  int count = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) {
+        uint32_t* cf_id = reinterpret_cast<uint32_t*>(arg);
+        EXPECT_EQ(0u, *cf_id);
+        ++count;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  mutex_.Lock();
+  Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists,
+                                    &mutex_, nullptr);
+  mutex_.Unlock();
+  EXPECT_OK(s);
+  EXPECT_EQ(kGroupSize - 1, count);
+}
+
+TEST_F(VersionSetTest, PersistBlobFileStateInNewManifest) {
+  // Initialize the database and add a couple of blob files, one with some
+  // garbage in it, and one without any garbage.
+  NewDB();
+
+  assert(versions_);
+  assert(versions_->GetColumnFamilySet());
+
+  ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  Version* const version = cfd->current();
+  assert(version);
+
+  VersionStorageInfo* const storage_info = version->storage_info();
+  assert(storage_info);
+
+  {
+    constexpr uint64_t blob_file_number = 123;
+    constexpr uint64_t total_blob_count = 456;
+    constexpr uint64_t total_blob_bytes = 77777777;
+    constexpr char checksum_method[] = "SHA1";
+    constexpr char checksum_value[] =
+        "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c"
+        "\x52\x5c\xbd";
+
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+        checksum_value);
+
+    constexpr uint64_t garbage_blob_count = 89;
+    constexpr uint64_t garbage_blob_bytes = 1000000;
+
+    auto meta = BlobFileMetaData::Create(
+        std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
+        garbage_blob_count, garbage_blob_bytes);
+
+    storage_info->AddBlobFile(std::move(meta));
+  }
+
+  {
+    constexpr uint64_t blob_file_number = 234;
+    constexpr uint64_t total_blob_count = 555;
+    constexpr uint64_t total_blob_bytes = 66666;
+    constexpr char checksum_method[] = "CRC32";
+    constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+        checksum_value);
+
+    constexpr uint64_t garbage_blob_count = 0;
+    constexpr uint64_t garbage_blob_bytes = 0;
+
+    auto meta = BlobFileMetaData::Create(
+        std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
+        garbage_blob_count, garbage_blob_bytes);
+
+    storage_info->AddBlobFile(std::move(meta));
+  }
+
+  // Force the creation of a new manifest file and make sure metadata for
+  // the blob files is re-persisted.
+  size_t addition_encoded = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileAddition::EncodeTo::CustomFields",
+      [&](void* /* arg */) { ++addition_encoded; });
+
+  size_t garbage_encoded = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileGarbage::EncodeTo::CustomFields",
+      [&](void* /* arg */) { ++garbage_encoded; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateNewManifest();
+
+  ASSERT_EQ(addition_encoded, 2);
+  ASSERT_EQ(garbage_encoded, 1);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(VersionSetTest, AddLiveBlobFiles) {
+  // Initialize the database and add a blob file.
+  NewDB();
+
+  assert(versions_);
+  assert(versions_->GetColumnFamilySet());
+
+  ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault();
+  assert(cfd);
+
+  Version* const first_version = cfd->current();
+  assert(first_version);
+
+  VersionStorageInfo* const first_storage_info = first_version->storage_info();
+  assert(first_storage_info);
+
+  constexpr uint64_t first_blob_file_number = 234;
+  constexpr uint64_t first_total_blob_count = 555;
+  constexpr uint64_t first_total_blob_bytes = 66666;
+  constexpr char first_checksum_method[] = "CRC32";
+  constexpr char first_checksum_value[] = "\x3d\x87\xff\x57";
+
+  auto first_shared_meta = SharedBlobFileMetaData::Create(
+      first_blob_file_number, first_total_blob_count, first_total_blob_bytes,
+      first_checksum_method, first_checksum_value);
+
+  constexpr uint64_t garbage_blob_count = 0;
+  constexpr uint64_t garbage_blob_bytes = 0;
+
+  auto first_meta = BlobFileMetaData::Create(
+      std::move(first_shared_meta), BlobFileMetaData::LinkedSsts(),
+      garbage_blob_count, garbage_blob_bytes);
+
+  first_storage_info->AddBlobFile(first_meta);
+
+  // Reference the version so it stays alive even after the following version
+  // edit.
+  first_version->Ref();
+
+  // Get live files directly from version.
+  std::vector<uint64_t> version_table_files;
+  std::vector<uint64_t> version_blob_files;
+
+  first_version->AddLiveFiles(&version_table_files, &version_blob_files);
+
+  ASSERT_EQ(version_blob_files.size(), 1);
+  ASSERT_EQ(version_blob_files[0], first_blob_file_number);
+
+  // Create a new version containing an additional blob file.
+  versions_->TEST_CreateAndAppendVersion(cfd);
+
+  Version* const second_version = cfd->current();
+  assert(second_version);
+  assert(second_version != first_version);
+
+  VersionStorageInfo* const second_storage_info =
+      second_version->storage_info();
+  assert(second_storage_info);
+
+  constexpr uint64_t second_blob_file_number = 456;
+  constexpr uint64_t second_total_blob_count = 100;
+  constexpr uint64_t second_total_blob_bytes = 2000000;
+  constexpr char second_checksum_method[] = "CRC32B";
+  constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a";
+
+  auto second_shared_meta = SharedBlobFileMetaData::Create(
+      second_blob_file_number, second_total_blob_count, second_total_blob_bytes,
+      second_checksum_method, second_checksum_value);
+
+  auto second_meta = BlobFileMetaData::Create(
+      std::move(second_shared_meta), BlobFileMetaData::LinkedSsts(),
+      garbage_blob_count, garbage_blob_bytes);
+
+  second_storage_info->AddBlobFile(std::move(first_meta));
+  second_storage_info->AddBlobFile(std::move(second_meta));
+
+  // Get all live files from version set. Note that the result contains
+  // duplicates.
+  std::vector<uint64_t> all_table_files;
+  std::vector<uint64_t> all_blob_files;
+
+  versions_->AddLiveFiles(&all_table_files, &all_blob_files);
+
+  ASSERT_EQ(all_blob_files.size(), 3);
+  ASSERT_EQ(all_blob_files[0], first_blob_file_number);
+  ASSERT_EQ(all_blob_files[1], first_blob_file_number);
+  ASSERT_EQ(all_blob_files[2], second_blob_file_number);
+
+  // Clean up previous version.
+  first_version->Unref();
+}
+
+TEST_F(VersionSetTest, ObsoleteBlobFile) {
+  // Initialize the database and add a blob file that is entirely garbage
+  // and thus can immediately be marked obsolete.
+  NewDB();
+
+  VersionEdit edit;
+
+  constexpr uint64_t blob_file_number = 234;
+  constexpr uint64_t total_blob_count = 555;
+  constexpr uint64_t total_blob_bytes = 66666;
+  constexpr char checksum_method[] = "CRC32";
+  constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+  edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+                   checksum_method, checksum_value);
+
+  edit.AddBlobFileGarbage(blob_file_number, total_blob_count, total_blob_bytes);
+
+  mutex_.Lock();
+  Status s =
+      versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                             mutable_cf_options_, &edit, &mutex_, nullptr);
+  mutex_.Unlock();
+
+  ASSERT_OK(s);
+
+  // Make sure blob files from the pending number range are not returned
+  // as obsolete.
+  {
+    std::vector<ObsoleteFileInfo> table_files;
+    std::vector<ObsoleteBlobFileInfo> blob_files;
+    std::vector<std::string> manifest_files;
+    constexpr uint64_t min_pending_output = blob_file_number;
+
+    versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+                                min_pending_output);
+
+    ASSERT_TRUE(blob_files.empty());
+  }
+
+  // Make sure the blob file is returned as obsolete if it's not in the pending
+  // range.
+  {
+    std::vector<ObsoleteFileInfo> table_files;
+    std::vector<ObsoleteBlobFileInfo> blob_files;
+    std::vector<std::string> manifest_files;
+    constexpr uint64_t min_pending_output = blob_file_number + 1;
+
+    versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+                                min_pending_output);
+
+    ASSERT_EQ(blob_files.size(), 1);
+    ASSERT_EQ(blob_files[0].GetBlobFileNumber(), blob_file_number);
+  }
+
+  // Make sure it's not returned a second time.
+  {
+    std::vector<ObsoleteFileInfo> table_files;
+    std::vector<ObsoleteBlobFileInfo> blob_files;
+    std::vector<std::string> manifest_files;
+    constexpr uint64_t min_pending_output = blob_file_number + 1;
+
+    versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+                                min_pending_output);
+
+    ASSERT_TRUE(blob_files.empty());
+  }
+}
+
+TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) {
+  NewDB();
+
+  constexpr uint64_t kNumWals = 5;
+
+  autovector<std::unique_ptr<VersionEdit>> edits;
+  // Add some WALs.
+  for (uint64_t i = 1; i <= kNumWals; i++) {
+    edits.emplace_back(new VersionEdit);
+    // WAL's size equals its log number.
+    edits.back()->AddWal(i, WalMetadata(i));
+  }
+  // Delete the first half of the WALs.
+  edits.emplace_back(new VersionEdit);
+  edits.back()->DeleteWalsBefore(kNumWals / 2 + 1);
+
+  autovector<Version*> versions;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:NewVersion",
+      [&](void* arg) { versions.push_back(reinterpret_cast<Version*>(arg)); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Since the edits are all WAL edits, no version should be created.
+  ASSERT_EQ(versions.size(), 1);
+  ASSERT_EQ(versions[0], nullptr);
+}
+
+// Similar to WalEditsNotAppliedToVersion, but contains a non-WAL edit.
+TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) {
+  NewDB();
+
+  const std::string kDBId = "db_db";
+  constexpr uint64_t kNumWals = 5;
+
+  autovector<std::unique_ptr<VersionEdit>> edits;
+  // Add some WALs.
+  for (uint64_t i = 1; i <= kNumWals; i++) {
+    edits.emplace_back(new VersionEdit);
+    // WAL's size equals its log number.
+    edits.back()->AddWal(i, WalMetadata(i));
+  }
+  // Delete the first half of the WALs.
+  edits.emplace_back(new VersionEdit);
+  edits.back()->DeleteWalsBefore(kNumWals / 2 + 1);
+  edits.emplace_back(new VersionEdit);
+  edits.back()->SetDBId(kDBId);
+
+  autovector<Version*> versions;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:NewVersion",
+      [&](void* arg) { versions.push_back(reinterpret_cast<Version*>(arg)); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Since the edits are all WAL edits, no version should be created.
+  ASSERT_EQ(versions.size(), 1);
+  ASSERT_NE(versions[0], nullptr);
+}
+
+TEST_F(VersionSetTest, WalAddition) {
+  NewDB();
+
+  constexpr WalNumber kLogNumber = 10;
+  constexpr uint64_t kSizeInBytes = 111;
+
+  // A WAL is just created.
+  {
+    VersionEdit edit;
+    edit.AddWal(kLogNumber);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+    const auto& wals = versions_->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+    ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize());
+  }
+
+  // The WAL is synced for several times before closing.
+  {
+    for (uint64_t size_delta = 100; size_delta > 0; size_delta /= 2) {
+      uint64_t size = kSizeInBytes - size_delta;
+      WalMetadata wal(size);
+      VersionEdit edit;
+      edit.AddWal(kLogNumber, wal);
+
+      ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+      const auto& wals = versions_->GetWalSet().GetWals();
+      ASSERT_EQ(wals.size(), 1);
+      ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+      ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+      ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), size);
+    }
+  }
+
+  // The WAL is closed.
+  {
+    WalMetadata wal(kSizeInBytes);
+    VersionEdit edit;
+    edit.AddWal(kLogNumber, wal);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+    const auto& wals = versions_->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+    ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+    ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
+  }
+
+  // Recover a new VersionSet.
+  {
+    std::unique_ptr<VersionSet> new_versions(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false));
+    const auto& wals = new_versions->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+    ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+    ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
+  }
+}
+
+TEST_F(VersionSetTest, WalCloseWithoutSync) {
+  NewDB();
+
+  constexpr WalNumber kLogNumber = 10;
+  constexpr uint64_t kSizeInBytes = 111;
+  constexpr uint64_t kSyncedSizeInBytes = kSizeInBytes / 2;
+
+  // A WAL is just created.
+  {
+    VersionEdit edit;
+    edit.AddWal(kLogNumber);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+    const auto& wals = versions_->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+    ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize());
+  }
+
+  // The WAL is synced before closing.
+  {
+    WalMetadata wal(kSyncedSizeInBytes);
+    VersionEdit edit;
+    edit.AddWal(kLogNumber, wal);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+    const auto& wals = versions_->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+    ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+    ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes);
+  }
+
+  // A new WAL with larger log number is created,
+  // implicitly marking the current WAL closed.
+  {
+    VersionEdit edit;
+    edit.AddWal(kLogNumber + 1);
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+    const auto& wals = versions_->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 2);
+    ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+    ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+    ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes);
+    ASSERT_TRUE(wals.find(kLogNumber + 1) != wals.end());
+    ASSERT_FALSE(wals.at(kLogNumber + 1).HasSyncedSize());
+  }
+
+  // Recover a new VersionSet.
+  {
+    std::unique_ptr<VersionSet> new_versions(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    ASSERT_OK(new_versions->Recover(column_families_, false));
+    const auto& wals = new_versions->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 2);
+    ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+    ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+    ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes);
+  }
+}
+
+TEST_F(VersionSetTest, WalDeletion) {
+  NewDB();
+
+  constexpr WalNumber kClosedLogNumber = 10;
+  constexpr WalNumber kNonClosedLogNumber = 20;
+  constexpr uint64_t kSizeInBytes = 111;
+
+  // Add a non-closed and a closed WAL.
+  {
+    VersionEdit edit;
+    edit.AddWal(kClosedLogNumber, WalMetadata(kSizeInBytes));
+    edit.AddWal(kNonClosedLogNumber);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+    const auto& wals = versions_->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 2);
+    ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+    ASSERT_TRUE(wals.find(kClosedLogNumber) != wals.end());
+    ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+    ASSERT_TRUE(wals.at(kClosedLogNumber).HasSyncedSize());
+    ASSERT_EQ(wals.at(kClosedLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
+  }
+
+  // Delete the closed WAL.
+  {
+    VersionEdit edit;
+    edit.DeleteWalsBefore(kNonClosedLogNumber);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+    const auto& wals = versions_->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+    ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+  }
+
+  // Recover a new VersionSet, only the non-closed WAL should show up.
+  {
+    std::unique_ptr<VersionSet> new_versions(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    ASSERT_OK(new_versions->Recover(column_families_, false));
+    const auto& wals = new_versions->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+    ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+  }
+
+  // Force the creation of a new MANIFEST file,
+  // only the non-closed WAL should be written to the new MANIFEST.
+  {
+    std::vector<WalAddition> wal_additions;
+    SyncPoint::GetInstance()->SetCallBack(
+        "VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) {
+          VersionEdit* edit = reinterpret_cast<VersionEdit*>(arg);
+          ASSERT_TRUE(edit->IsWalAddition());
+          for (auto& addition : edit->GetWalAdditions()) {
+            wal_additions.push_back(addition);
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    CreateNewManifest();
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    ASSERT_EQ(wal_additions.size(), 1);
+    ASSERT_EQ(wal_additions[0].GetLogNumber(), kNonClosedLogNumber);
+    ASSERT_FALSE(wal_additions[0].GetMetadata().HasSyncedSize());
+  }
+
+  // Recover from the new MANIFEST, only the non-closed WAL should show up.
+  {
+    std::unique_ptr<VersionSet> new_versions(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    ASSERT_OK(new_versions->Recover(column_families_, false));
+    const auto& wals = new_versions->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+    ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+  }
+}
+
+TEST_F(VersionSetTest, WalCreateTwice) {
+  NewDB();
+
+  constexpr WalNumber kLogNumber = 10;
+
+  VersionEdit edit;
+  edit.AddWal(kLogNumber);
+
+  ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+  Status s = LogAndApplyToDefaultCF(edit);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") !=
+              std::string::npos)
+      << s.ToString();
+}
+
+TEST_F(VersionSetTest, WalCreateAfterClose) {
+  NewDB();
+
+  constexpr WalNumber kLogNumber = 10;
+  constexpr uint64_t kSizeInBytes = 111;
+
+  {
+    // Add a closed WAL.
+    VersionEdit edit;
+    edit.AddWal(kLogNumber);
+    WalMetadata wal(kSizeInBytes);
+    edit.AddWal(kLogNumber, wal);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  }
+
+  {
+    // Create the same WAL again.
+    VersionEdit edit;
+    edit.AddWal(kLogNumber);
+
+    Status s = LogAndApplyToDefaultCF(edit);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") !=
+                std::string::npos)
+        << s.ToString();
+  }
+}
+
+TEST_F(VersionSetTest, AddWalWithSmallerSize) {
+  NewDB();
+  assert(versions_);
+
+  constexpr WalNumber kLogNumber = 10;
+  constexpr uint64_t kSizeInBytes = 111;
+
+  {
+    // Add a closed WAL.
+    VersionEdit edit;
+    WalMetadata wal(kSizeInBytes);
+    edit.AddWal(kLogNumber, wal);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  }
+  // Copy for future comparison.
+  const std::map<WalNumber, WalMetadata> wals1 =
+      versions_->GetWalSet().GetWals();
+
+  {
+    // Add the same WAL with smaller synced size.
+    VersionEdit edit;
+    WalMetadata wal(kSizeInBytes / 2);
+    edit.AddWal(kLogNumber, wal);
+
+    Status s = LogAndApplyToDefaultCF(edit);
+    ASSERT_OK(s);
+  }
+  const std::map<WalNumber, WalMetadata> wals2 =
+      versions_->GetWalSet().GetWals();
+  ASSERT_EQ(wals1, wals2);
+}
+
+TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) {
+  NewDB();
+
+  constexpr WalNumber kLogNumber0 = 10;
+  constexpr WalNumber kLogNumber1 = 20;
+  constexpr WalNumber kNonExistingNumber = 15;
+  constexpr uint64_t kSizeInBytes = 111;
+
+  {
+    // Add closed WALs.
+    VersionEdit edit;
+    WalMetadata wal(kSizeInBytes);
+    edit.AddWal(kLogNumber0, wal);
+    edit.AddWal(kLogNumber1, wal);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  }
+
+  {
+    // Delete WALs before a non-existing WAL.
+    VersionEdit edit;
+    edit.DeleteWalsBefore(kNonExistingNumber);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  }
+
+  // Recover a new VersionSet, WAL0 is deleted, WAL1 is not.
+  {
+    std::unique_ptr<VersionSet> new_versions(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    ASSERT_OK(new_versions->Recover(column_families_, false));
+    const auto& wals = new_versions->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kLogNumber1) != wals.end());
+  }
+}
+
+TEST_F(VersionSetTest, DeleteAllWals) {
+  NewDB();
+
+  constexpr WalNumber kMaxLogNumber = 10;
+  constexpr uint64_t kSizeInBytes = 111;
+
+  {
+    // Add a closed WAL.
+    VersionEdit edit;
+    WalMetadata wal(kSizeInBytes);
+    edit.AddWal(kMaxLogNumber, wal);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  }
+
+  {
+    VersionEdit edit;
+    edit.DeleteWalsBefore(kMaxLogNumber + 10);
+
+    ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  }
+
+  // Recover a new VersionSet, all WALs are deleted.
+  {
+    std::unique_ptr<VersionSet> new_versions(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    ASSERT_OK(new_versions->Recover(column_families_, false));
+    const auto& wals = new_versions->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 0);
+  }
+}
+
+TEST_F(VersionSetTest, AtomicGroupWithWalEdits) {
+  NewDB();
+
+  constexpr int kAtomicGroupSize = 7;
+  constexpr uint64_t kNumWals = 5;
+  const std::string kDBId = "db_db";
+
+  int remaining = kAtomicGroupSize;
+  autovector<std::unique_ptr<VersionEdit>> edits;
+  // Add 5 WALs.
+  for (uint64_t i = 1; i <= kNumWals; i++) {
+    edits.emplace_back(new VersionEdit);
+    // WAL's size equals its log number.
+    edits.back()->AddWal(i, WalMetadata(i));
+    edits.back()->MarkAtomicGroup(--remaining);
+  }
+  // One edit with the min log number set.
+  edits.emplace_back(new VersionEdit);
+  edits.back()->SetDBId(kDBId);
+  edits.back()->MarkAtomicGroup(--remaining);
+  // Delete the first added 4 WALs.
+  edits.emplace_back(new VersionEdit);
+  edits.back()->DeleteWalsBefore(kNumWals);
+  edits.back()->MarkAtomicGroup(--remaining);
+  ASSERT_EQ(remaining, 0);
+
+  ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+  // Recover a new VersionSet, the min log number and the last WAL should be
+  // kept.
+  {
+    std::unique_ptr<VersionSet> new_versions(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    std::string db_id;
+    ASSERT_OK(
+        new_versions->Recover(column_families_, /*read_only=*/false, &db_id));
+
+    ASSERT_EQ(db_id, kDBId);
+
+    const auto& wals = new_versions->GetWalSet().GetWals();
+    ASSERT_EQ(wals.size(), 1);
+    ASSERT_TRUE(wals.find(kNumWals) != wals.end());
+    ASSERT_TRUE(wals.at(kNumWals).HasSyncedSize());
+    ASSERT_EQ(wals.at(kNumWals).GetSyncedSizeInBytes(), kNumWals);
+  }
+}
+
+class VersionSetWithTimestampTest : public VersionSetTest {
+ public:
+  static const std::string kNewCfName;
+
+  explicit VersionSetWithTimestampTest() : VersionSetTest() {}
+
+  void SetUp() override {
+    NewDB();
+    Options options;
+    options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+    cfd_ = CreateColumnFamily(kNewCfName, options);
+    EXPECT_NE(nullptr, cfd_);
+    EXPECT_NE(nullptr, cfd_->GetLatestMutableCFOptions());
+    column_families_.emplace_back(kNewCfName, options);
+  }
+
+  void TearDown() override {
+    for (auto* e : edits_) {
+      delete e;
+    }
+    edits_.clear();
+  }
+
+  void GenVersionEditsToSetFullHistoryTsLow(
+      const std::vector<uint64_t>& ts_lbs) {
+    for (const auto ts_lb : ts_lbs) {
+      VersionEdit* edit = new VersionEdit;
+      edit->SetColumnFamily(cfd_->GetID());
+      std::string ts_str = test::EncodeInt(ts_lb);
+      edit->SetFullHistoryTsLow(ts_str);
+      edits_.emplace_back(edit);
+    }
+  }
+
+  void VerifyFullHistoryTsLow(uint64_t expected_ts_low) {
+    std::unique_ptr<VersionSet> vset(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+    ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false,
+                            /*db_id=*/nullptr));
+    for (auto* cfd : *(vset->GetColumnFamilySet())) {
+      ASSERT_NE(nullptr, cfd);
+      if (cfd->GetName() == kNewCfName) {
+        ASSERT_EQ(test::EncodeInt(expected_ts_low), cfd->GetFullHistoryTsLow());
+      } else {
+        ASSERT_TRUE(cfd->GetFullHistoryTsLow().empty());
+      }
+    }
+  }
+
+  void DoTest(const std::vector<uint64_t>& ts_lbs) {
+    if (ts_lbs.empty()) {
+      return;
+    }
+
+    GenVersionEditsToSetFullHistoryTsLow(ts_lbs);
+
+    Status s;
+    mutex_.Lock();
+    s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()),
+                               edits_, &mutex_, nullptr);
+    mutex_.Unlock();
+    ASSERT_OK(s);
+    VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end()));
+  }
+
+ protected:
+  ColumnFamilyData* cfd_{nullptr};
+  // edits_ must contain and own pointers to heap-alloc VersionEdit objects.
+  autovector<VersionEdit*> edits_;
+};
+
+const std::string VersionSetWithTimestampTest::kNewCfName("new_cf");
+
+TEST_F(VersionSetWithTimestampTest, SetFullHistoryTsLbOnce) {
+  constexpr uint64_t kTsLow = 100;
+  DoTest({kTsLow});
+}
+
+// Simulate the application increasing full_history_ts_low.
+TEST_F(VersionSetWithTimestampTest, IncreaseFullHistoryTsLb) {
+  const std::vector<uint64_t> ts_lbs = {100, 101, 102, 103};
+  DoTest(ts_lbs);
+}
+
+// Simulate the application trying to decrease full_history_ts_low
+// unsuccessfully. If the application calls public API sequentially to
+// decrease the lower bound ts, RocksDB will return an InvalidArgument
+// status before involving VersionSet. Only when multiple threads trying
+// to decrease the lower bound concurrently will this case ever happen. Even
+// so, the lower bound cannot be decreased. The application will be notified
+// via return value of the API.
+TEST_F(VersionSetWithTimestampTest, TryDecreaseFullHistoryTsLb) {
+  const std::vector<uint64_t> ts_lbs = {103, 102, 101, 100};
+  DoTest(ts_lbs);
+}
+
+class VersionSetAtomicGroupTest : public VersionSetTestBase,
+                                  public testing::Test {
+ public:
+  VersionSetAtomicGroupTest()
+      : VersionSetTestBase("version_set_atomic_group_test") {}
+
+  void SetUp() override {
+    PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+    SetupTestSyncPoints();
+  }
+
+  void SetupValidAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      edits_[i].MarkAtomicGroup(--remaining);
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+  }
+
+  void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      edits_[i].MarkAtomicGroup(--remaining);
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+  }
+
+  void SetupCorruptedAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      if (i != ((size_t)atomic_group_size / 2)) {
+        edits_[i].MarkAtomicGroup(--remaining);
+      }
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+  }
+
+  void SetupIncorrectAtomicGroup(int atomic_group_size) {
+    edits_.resize(atomic_group_size);
+    int remaining = atomic_group_size;
+    for (size_t i = 0; i != edits_.size(); ++i) {
+      edits_[i].SetLogNumber(0);
+      edits_[i].SetNextFile(2);
+      if (i != 1) {
+        edits_[i].MarkAtomicGroup(--remaining);
+      } else {
+        edits_[i].MarkAtomicGroup(remaining--);
+      }
+      edits_[i].SetLastSequence(last_seqno_++);
+    }
+    ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+  }
+
+  void SetupTestSyncPoints() {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) {
+          VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+          EXPECT_EQ(edits_.front().DebugString(),
+                    e->DebugString());  // compare based on value
+          first_in_atomic_group_ = true;
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) {
+          VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+          EXPECT_EQ(edits_.back().DebugString(),
+                    e->DebugString());  // compare based on value
+          EXPECT_TRUE(first_in_atomic_group_);
+          last_in_atomic_group_ = true;
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "VersionEditHandlerBase::Iterate:Finish", [&](void* arg) {
+          num_recovered_edits_ = *reinterpret_cast<size_t*>(arg);
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:AtomicGroup",
+        [&](void* /* arg */) { ++num_edits_in_atomic_group_; });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits",
+        [&](void* arg) {
+          corrupted_edit_ = *reinterpret_cast<VersionEdit*>(arg);
+        });
+    SyncPoint::GetInstance()->SetCallBack(
+        "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize",
+        [&](void* arg) {
+          edit_with_incorrect_group_size_ =
+              *reinterpret_cast<VersionEdit*>(arg);
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+  void AddNewEditsToLog(int num_edits) {
+    for (int i = 0; i < num_edits; i++) {
+      std::string record;
+      edits_[i].EncodeTo(&record);
+      ASSERT_OK(log_writer_->AddRecord(record));
+    }
+  }
+
+  void TearDown() override {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    log_writer_.reset();
+  }
+
+ protected:
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  SequenceNumber last_seqno_;
+  std::vector<VersionEdit> edits_;
+  bool first_in_atomic_group_ = false;
+  bool last_in_atomic_group_ = false;
+  int num_edits_in_atomic_group_ = 0;
+  size_t num_recovered_edits_ = 0;
+  VersionEdit corrupted_edit_;
+  VersionEdit edit_with_incorrect_group_size_;
+  std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) {
+  const int kAtomicGroupSize = 3;
+  SetupValidAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  EXPECT_OK(versions_->Recover(column_families_, false));
+  EXPECT_EQ(column_families_.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_TRUE(last_in_atomic_group_);
+  EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleValidAtomicGroupWithReactiveVersionSetRecover) {
+  const int kAtomicGroupSize = 3;
+  SetupValidAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_TRUE(last_in_atomic_group_);
+  // The recover should clean up the replay buffer.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+  EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) {
+  const int kAtomicGroupSize = 3;
+  SetupValidAtomicGroup(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+  AddNewEditsToLog(kAtomicGroupSize);
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  mu.Lock();
+  EXPECT_OK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_TRUE(last_in_atomic_group_);
+  // The recover should clean up the replay buffer.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+  EXPECT_EQ(kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+  SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+  EXPECT_OK(versions_->Recover(column_families_, false));
+  EXPECT_EQ(column_families_.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_FALSE(last_in_atomic_group_);
+  EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+  SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_FALSE(last_in_atomic_group_);
+  EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+  // Reactive version set should store the edits in the replay buffer.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+              kNumberOfPersistedVersionEdits);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+  // Write the last record. The reactive version set should now apply all
+  // edits.
+  std::string last_record;
+  edits_[kAtomicGroupSize - 1].EncodeTo(&last_record);
+  EXPECT_OK(log_writer_->AddRecord(last_record));
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  mu.Lock();
+  EXPECT_OK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  // Reactive version set should be empty now.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) {
+  const int kAtomicGroupSize = 4;
+  const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+  SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  // No edits in an atomic group.
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+  // Write a few edits in an atomic group.
+  AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  mu.Lock();
+  EXPECT_OK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  EXPECT_TRUE(first_in_atomic_group_);
+  EXPECT_FALSE(last_in_atomic_group_);
+  EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+  // Reactive version set should store the edits in the replay buffer.
+  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+              kNumberOfPersistedVersionEdits);
+  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleCorruptedAtomicGroupWithVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  SetupCorruptedAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  EXPECT_NOK(versions_->Recover(column_families_, false));
+  EXPECT_EQ(column_families_.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+            corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  SetupCorruptedAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                         &manifest_reporter,
+                                         &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+            corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) {
+  const int kAtomicGroupSize = 4;
+  SetupCorruptedAtomicGroup(kAtomicGroupSize);
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  // Write the corrupted edits.
+  AddNewEditsToLog(kAtomicGroupSize);
+  mu.Lock();
+  EXPECT_NOK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+            corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncorrectAtomicGroupSizeWithVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  SetupIncorrectAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  EXPECT_NOK(versions_->Recover(column_families_, false));
+  EXPECT_EQ(column_families_.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(edits_[1].DebugString(),
+            edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) {
+  const int kAtomicGroupSize = 4;
+  SetupIncorrectAtomicGroup(kAtomicGroupSize);
+  AddNewEditsToLog(kAtomicGroupSize);
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                         &manifest_reporter,
+                                         &manifest_reader_status));
+  EXPECT_EQ(column_families_.size(),
+            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+  EXPECT_EQ(edits_[1].DebugString(),
+            edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+       HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) {
+  const int kAtomicGroupSize = 4;
+  SetupIncorrectAtomicGroup(kAtomicGroupSize);
+  InstrumentedMutex mu;
+  std::unordered_set<ColumnFamilyData*> cfds_changed;
+  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+  std::unique_ptr<Status> manifest_reader_status;
+  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+                                        &manifest_reporter,
+                                        &manifest_reader_status));
+  AddNewEditsToLog(kAtomicGroupSize);
+  mu.Lock();
+  EXPECT_NOK(reactive_versions_->ReadAndApply(
+      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+  mu.Unlock();
+  EXPECT_EQ(edits_[1].DebugString(),
+            edit_with_incorrect_group_size_.DebugString());
+}
+
+class VersionSetTestDropOneCF : public VersionSetTestBase,
+                                public testing::TestWithParam<std::string> {
+ public:
+  VersionSetTestDropOneCF()
+      : VersionSetTestBase("version_set_test_drop_one_cf") {}
+};
+
+// This test simulates the following execution sequence
+// Time  thread1                  bg_flush_thr
+//  |                             Prepare version edits (e1,e2,e3) for atomic
+//  |                             flush cf1, cf2, cf3
+//  |    Enqueue e to drop cfi
+//  |    to manifest_writers_
+//  |                             Enqueue (e1,e2,e3) to manifest_writers_
+//  |
+//  |    Apply e,
+//  |    cfi.IsDropped() is true
+//  |                             Apply (e1,e2,e3),
+//  |                             since cfi.IsDropped() == true, we need to
+//  |                             drop ei and write the rest to MANIFEST.
+//  V
+//
+//  Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and
+//  last column family in an atomic group.
+TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
+  std::vector<ColumnFamilyDescriptor> column_families;
+  SequenceNumber last_seqno;
+  std::unique_ptr<log::Writer> log_writer;
+  PrepareManifest(&column_families, &last_seqno, &log_writer);
+  Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+  ASSERT_OK(s);
+
+  EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
+  EXPECT_EQ(column_families.size(),
+            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+
+  const int kAtomicGroupSize = 3;
+  const std::vector<std::string> non_default_cf_names = {
+      kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3};
+
+  // Drop one column family
+  VersionEdit drop_cf_edit;
+  drop_cf_edit.DropColumnFamily();
+  const std::string cf_to_drop_name(GetParam());
+  auto cfd_to_drop =
+      versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name);
+  ASSERT_NE(nullptr, cfd_to_drop);
+  // Increase its refcount because cfd_to_drop is used later, and we need to
+  // prevent it from being deleted.
+  cfd_to_drop->Ref();
+  drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
+  mutex_.Lock();
+  s = versions_->LogAndApply(cfd_to_drop,
+                             *cfd_to_drop->GetLatestMutableCFOptions(),
+                             &drop_cf_edit, &mutex_, nullptr);
+  mutex_.Unlock();
+  ASSERT_OK(s);
+
+  std::vector<VersionEdit> edits(kAtomicGroupSize);
+  uint32_t remaining = kAtomicGroupSize;
+  size_t i = 0;
+  autovector<ColumnFamilyData*> cfds;
+  autovector<const MutableCFOptions*> mutable_cf_options_list;
+  autovector<autovector<VersionEdit*>> edit_lists;
+  for (const auto& cf_name : non_default_cf_names) {
+    auto cfd = (cf_name != cf_to_drop_name)
+                   ? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name)
+                   : cfd_to_drop;
+    ASSERT_NE(nullptr, cfd);
+    cfds.push_back(cfd);
+    mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions());
+    edits[i].SetColumnFamily(cfd->GetID());
+    edits[i].SetLogNumber(0);
+    edits[i].SetNextFile(2);
+    edits[i].MarkAtomicGroup(--remaining);
+    edits[i].SetLastSequence(last_seqno++);
+    autovector<VersionEdit*> tmp_edits;
+    tmp_edits.push_back(&edits[i]);
+    edit_lists.emplace_back(tmp_edits);
+    ++i;
+  }
+  int called = 0;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) {
+        std::vector<VersionEdit*>* tmp_edits =
+            reinterpret_cast<std::vector<VersionEdit*>*>(arg);
+        EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size());
+        for (const auto e : *tmp_edits) {
+          bool found = false;
+          for (const auto& e2 : edits) {
+            if (&e2 == e) {
+              found = true;
+              break;
+            }
+          }
+          ASSERT_TRUE(found);
+        }
+        ++called;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  mutex_.Lock();
+  s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, &mutex_,
+                             nullptr);
+  mutex_.Unlock();
+  ASSERT_OK(s);
+  ASSERT_EQ(1, called);
+  cfd_to_drop->UnrefAndTryDelete();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    AtomicGroup, VersionSetTestDropOneCF,
+    testing::Values(VersionSetTestBase::kColumnFamilyName1,
+                    VersionSetTestBase::kColumnFamilyName2,
+                    VersionSetTestBase::kColumnFamilyName3));
+
+class EmptyDefaultCfNewManifest : public VersionSetTestBase,
+                                  public testing::Test {
+ public:
+  EmptyDefaultCfNewManifest() : VersionSetTestBase("version_set_new_db_test") {}
+  // Emulate DBImpl::NewDB()
+  void PrepareManifest(std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+                       SequenceNumber* /*last_seqno*/,
+                       std::unique_ptr<log::Writer>* log_writer) override {
+    assert(log_writer != nullptr);
+    VersionEdit new_db;
+    new_db.SetLogNumber(0);
+    const std::string manifest_path = DescriptorFileName(dbname_, 1);
+    const auto& fs = env_->GetFileSystem();
+    std::unique_ptr<WritableFileWriter> file_writer;
+    Status s = WritableFileWriter::Create(
+        fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
+        &file_writer, nullptr);
+    ASSERT_OK(s);
+    log_writer->reset(new log::Writer(std::move(file_writer), 0, true));
+    std::string record;
+    ASSERT_TRUE(new_db.EncodeTo(&record));
+    s = (*log_writer)->AddRecord(record);
+    ASSERT_OK(s);
+    // Create new column family
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
+    new_cf.SetColumnFamily(1);
+    new_cf.SetLastSequence(2);
+    new_cf.SetNextFile(2);
+    record.clear();
+    ASSERT_TRUE(new_cf.EncodeTo(&record));
+    s = (*log_writer)->AddRecord(record);
+    ASSERT_OK(s);
+  }
+
+ protected:
+  bool write_dbid_to_manifest_ = false;
+  std::unique_ptr<log::Writer> log_writer_;
+};
+
+// Create db, create column family. Cf creation will switch to a new MANIFEST.
+// Then reopen db, trying to recover.
+TEST_F(EmptyDefaultCfNewManifest, Recover) {
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  log_writer_.reset();
+  Status s =
+      SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+  column_families.emplace_back(VersionSetTestBase::kColumnFamilyName1,
+                               cf_options_);
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(
+      manifest_path, column_families, false, &db_id, &has_missing_table_file);
+  ASSERT_OK(s);
+  ASSERT_FALSE(has_missing_table_file);
+}
+
+class VersionSetTestEmptyDb
+    : public VersionSetTestBase,
+      public testing::TestWithParam<
+          std::tuple<bool, bool, std::vector<std::string>>> {
+ public:
+  static const std::string kUnknownColumnFamilyName;
+  VersionSetTestEmptyDb() : VersionSetTestBase("version_set_test_empty_db") {}
+
+ protected:
+  void PrepareManifest(std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+                       SequenceNumber* /*last_seqno*/,
+                       std::unique_ptr<log::Writer>* log_writer) override {
+    assert(nullptr != log_writer);
+    VersionEdit new_db;
+    if (db_options_.write_dbid_to_manifest) {
+      DBOptions tmp_db_options;
+      tmp_db_options.env = env_;
+      std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+      std::string db_id;
+      impl->GetDbIdentityFromIdentityFile(&db_id);
+      new_db.SetDBId(db_id);
+    }
+    const std::string manifest_path = DescriptorFileName(dbname_, 1);
+    const auto& fs = env_->GetFileSystem();
+    std::unique_ptr<WritableFileWriter> file_writer;
+    Status s = WritableFileWriter::Create(
+        fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
+        &file_writer, nullptr);
+    ASSERT_OK(s);
+    {
+      log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+    }
+  }
+
+  std::unique_ptr<log::Writer> log_writer_;
+};
+
+const std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown";
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  log_writer_.reset();
+  Status s =
+      SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string> cf_names = std::get<2>(GetParam());
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  // Only a subset of column families in the MANIFEST.
+  VersionEdit new_cf1;
+  new_cf1.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
+  new_cf1.SetColumnFamily(1);
+  Status s;
+  {
+    std::string record;
+    new_cf1.EncodeTo(&record);
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  log_writer_.reset();
+  s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  // Write all column families but no log_number, next_file_number and
+  // last_sequence.
+  const std::vector<std::string> all_cf_names = {
+      kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+      kColumnFamilyName3};
+  uint32_t cf_id = 1;
+  Status s;
+  for (size_t i = 1; i != all_cf_names.size(); ++i) {
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(all_cf_names[i]);
+    new_cf.SetColumnFamily(cf_id++);
+    std::string record;
+    ASSERT_TRUE(new_cf.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  log_writer_.reset();
+  s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  // Write all column families but no log_number, next_file_number and
+  // last_sequence.
+  const std::vector<std::string> all_cf_names = {
+      kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+      kColumnFamilyName3};
+  uint32_t cf_id = 1;
+  Status s;
+  for (size_t i = 1; i != all_cf_names.size(); ++i) {
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(all_cf_names[i]);
+    new_cf.SetColumnFamily(cf_id++);
+    std::string record;
+    ASSERT_TRUE(new_cf.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  {
+    VersionEdit tmp_edit;
+    tmp_edit.SetColumnFamily(4);
+    tmp_edit.SetLogNumber(0);
+    tmp_edit.SetNextFile(2);
+    tmp_edit.SetLastSequence(0);
+    std::string record;
+    ASSERT_TRUE(tmp_edit.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  log_writer_.reset();
+  s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
+  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  PrepareManifest(nullptr, nullptr, &log_writer_);
+  // Write all column families but no log_number, next_file_number and
+  // last_sequence.
+  const std::vector<std::string> all_cf_names = {
+      kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+      kColumnFamilyName3};
+  uint32_t cf_id = 1;
+  Status s;
+  for (size_t i = 1; i != all_cf_names.size(); ++i) {
+    VersionEdit new_cf;
+    new_cf.AddColumnFamily(all_cf_names[i]);
+    new_cf.SetColumnFamily(cf_id++);
+    std::string record;
+    ASSERT_TRUE(new_cf.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  {
+    VersionEdit tmp_edit;
+    tmp_edit.SetLogNumber(0);
+    tmp_edit.SetNextFile(2);
+    tmp_edit.SetLastSequence(0);
+    std::string record;
+    ASSERT_TRUE(tmp_edit.EncodeTo(&record));
+    s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+  log_writer_.reset();
+  s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+  ASSERT_OK(s);
+
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+
+  bool read_only = std::get<1>(GetParam());
+  const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : cf_names) {
+    column_families.emplace_back(cf_name, cf_options_);
+  }
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+                                           read_only, &db_id,
+                                           &has_missing_table_file);
+  auto iter =
+      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+  if (iter == cf_names.end()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else if (read_only) {
+    ASSERT_OK(s);
+    ASSERT_FALSE(has_missing_table_file);
+  } else if (cf_names.size() == all_cf_names.size()) {
+    ASSERT_OK(s);
+    ASSERT_FALSE(has_missing_table_file);
+  } else if (cf_names.size() < all_cf_names.size()) {
+    ASSERT_TRUE(s.IsInvalidArgument());
+  } else {
+    ASSERT_OK(s);
+    ASSERT_FALSE(has_missing_table_file);
+    ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(
+        kUnknownColumnFamilyName);
+    ASSERT_EQ(nullptr, cfd);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    BestEffortRecovery, VersionSetTestEmptyDb,
+    testing::Combine(
+        /*write_dbid_to_manifest=*/testing::Bool(),
+        /*read_only=*/testing::Bool(),
+        /*cf_names=*/
+        testing::Values(
+            std::vector<std::string>(),
+            std::vector<std::string>({kDefaultColumnFamilyName}),
+            std::vector<std::string>({VersionSetTestBase::kColumnFamilyName1,
+                                      VersionSetTestBase::kColumnFamilyName2,
+                                      VersionSetTestBase::kColumnFamilyName3}),
+            std::vector<std::string>({kDefaultColumnFamilyName,
+                                      VersionSetTestBase::kColumnFamilyName1}),
+            std::vector<std::string>({kDefaultColumnFamilyName,
+                                      VersionSetTestBase::kColumnFamilyName1,
+                                      VersionSetTestBase::kColumnFamilyName2,
+                                      VersionSetTestBase::kColumnFamilyName3}),
+            std::vector<std::string>(
+                {kDefaultColumnFamilyName,
+                 VersionSetTestBase::kColumnFamilyName1,
+                 VersionSetTestBase::kColumnFamilyName2,
+                 VersionSetTestBase::kColumnFamilyName3,
+                 VersionSetTestEmptyDb::kUnknownColumnFamilyName}))));
+
+class VersionSetTestMissingFiles : public VersionSetTestBase,
+                                   public testing::Test {
+ public:
+  VersionSetTestMissingFiles()
+      : VersionSetTestBase("version_set_test_missing_files"),
+        block_based_table_options_(),
+        table_factory_(std::make_shared<BlockBasedTableFactory>(
+            block_based_table_options_)),
+        internal_comparator_(
+            std::make_shared<InternalKeyComparator>(options_.comparator)) {}
+
+ protected:
+  void PrepareManifest(std::vector<ColumnFamilyDescriptor>* column_families,
+                       SequenceNumber* last_seqno,
+                       std::unique_ptr<log::Writer>* log_writer) override {
+    assert(column_families != nullptr);
+    assert(last_seqno != nullptr);
+    assert(log_writer != nullptr);
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    const auto& fs = env_->GetFileSystem();
+    std::unique_ptr<WritableFileWriter> file_writer;
+    Status s = WritableFileWriter::Create(
+        fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+        nullptr);
+    ASSERT_OK(s);
+    log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+    VersionEdit new_db;
+    if (db_options_.write_dbid_to_manifest) {
+      DBOptions tmp_db_options;
+      tmp_db_options.env = env_;
+      std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+      std::string db_id;
+      impl->GetDbIdentityFromIdentityFile(&db_id);
+      new_db.SetDBId(db_id);
+    }
+    {
+      std::string record;
+      ASSERT_TRUE(new_db.EncodeTo(&record));
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+    }
+    const std::vector<std::string> cf_names = {
+        kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+        kColumnFamilyName3};
+    uint32_t cf_id = 1;  // default cf id is 0
+    cf_options_.table_factory = table_factory_;
+    for (const auto& cf_name : cf_names) {
+      column_families->emplace_back(cf_name, cf_options_);
+      if (cf_name == kDefaultColumnFamilyName) {
+        continue;
+      }
+      VersionEdit new_cf;
+      new_cf.AddColumnFamily(cf_name);
+      new_cf.SetColumnFamily(cf_id);
+      std::string record;
+      ASSERT_TRUE(new_cf.EncodeTo(&record));
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+
+      VersionEdit cf_files;
+      cf_files.SetColumnFamily(cf_id);
+      cf_files.SetLogNumber(0);
+      record.clear();
+      ASSERT_TRUE(cf_files.EncodeTo(&record));
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+      ++cf_id;
+    }
+    SequenceNumber seq = 2;
+    {
+      VersionEdit edit;
+      edit.SetNextFile(7);
+      edit.SetLastSequence(seq);
+      std::string record;
+      ASSERT_TRUE(edit.EncodeTo(&record));
+      s = (*log_writer)->AddRecord(record);
+      ASSERT_OK(s);
+    }
+    *last_seqno = seq + 1;
+  }
+
+  struct SstInfo {
+    uint64_t file_number;
+    std::string column_family;
+    std::string key;  // the only key
+    int level = 0;
+    SstInfo(uint64_t file_num, const std::string& cf_name,
+            const std::string& _key)
+        : SstInfo(file_num, cf_name, _key, 0) {}
+    SstInfo(uint64_t file_num, const std::string& cf_name,
+            const std::string& _key, int lvl)
+        : file_number(file_num),
+          column_family(cf_name),
+          key(_key),
+          level(lvl) {}
+  };
+
+  // Create dummy sst, return their metadata. Note that only file name and size
+  // are used.
+  void CreateDummyTableFiles(const std::vector<SstInfo>& file_infos,
+                             std::vector<FileMetaData>* file_metas) {
+    assert(file_metas != nullptr);
+    for (const auto& info : file_infos) {
+      uint64_t file_num = info.file_number;
+      std::string fname = MakeTableFileName(dbname_, file_num);
+      std::unique_ptr<FSWritableFile> file;
+      Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr);
+      ASSERT_OK(s);
+      std::unique_ptr<WritableFileWriter> fwriter(new WritableFileWriter(
+          std::move(file), fname, FileOptions(), env_->GetSystemClock().get()));
+      IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+      std::unique_ptr<TableBuilder> builder(table_factory_->NewTableBuilder(
+          TableBuilderOptions(
+              immutable_options_, mutable_cf_options_, *internal_comparator_,
+              &int_tbl_prop_collector_factories, kNoCompression,
+              CompressionOptions(),
+              TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+              info.column_family, info.level),
+          fwriter.get()));
+      InternalKey ikey(info.key, 0, ValueType::kTypeValue);
+      builder->Add(ikey.Encode(), "value");
+      ASSERT_OK(builder->Finish());
+      ASSERT_OK(fwriter->Flush());
+      uint64_t file_size = 0;
+      s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr);
+      ASSERT_OK(s);
+      ASSERT_NE(0, file_size);
+      file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey,
+                               ikey, 0, 0, false, Temperature::kUnknown, 0, 0,
+                               0, kUnknownFileChecksum,
+                               kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    }
+  }
+
+  // This method updates last_sequence_.
+  void WriteFileAdditionAndDeletionToManifest(
+      uint32_t cf, const std::vector<std::pair<int, FileMetaData>>& added_files,
+      const std::vector<std::pair<int, uint64_t>>& deleted_files) {
+    VersionEdit edit;
+    edit.SetColumnFamily(cf);
+    for (const auto& elem : added_files) {
+      int level = elem.first;
+      edit.AddFile(level, elem.second);
+    }
+    for (const auto& elem : deleted_files) {
+      int level = elem.first;
+      edit.DeleteFile(level, elem.second);
+    }
+    edit.SetLastSequence(last_seqno_);
+    ++last_seqno_;
+    assert(log_writer_.get() != nullptr);
+    std::string record;
+    ASSERT_TRUE(edit.EncodeTo(&record));
+    Status s = log_writer_->AddRecord(record);
+    ASSERT_OK(s);
+  }
+
+  BlockBasedTableOptions block_based_table_options_;
+  std::shared_ptr<TableFactory> table_factory_;
+  std::shared_ptr<InternalKeyComparator> internal_comparator_;
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  SequenceNumber last_seqno_;
+  std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
+  std::vector<SstInfo> existing_files = {
+      SstInfo(100, kDefaultColumnFamilyName, "a"),
+      SstInfo(102, kDefaultColumnFamilyName, "b"),
+      SstInfo(103, kDefaultColumnFamilyName, "c"),
+      SstInfo(107, kDefaultColumnFamilyName, "d"),
+      SstInfo(110, kDefaultColumnFamilyName, "e")};
+  std::vector<FileMetaData> file_metas;
+  CreateDummyTableFiles(existing_files, &file_metas);
+
+  PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+  std::vector<std::pair<int, FileMetaData>> added_files;
+  for (uint64_t file_num = 10; file_num < 15; ++file_num) {
+    std::string smallest_ukey = "a";
+    std::string largest_ukey = "b";
+    InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
+    InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
+    FileMetaData meta = FileMetaData(
+        file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
+        largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    added_files.emplace_back(0, meta);
+  }
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+  std::vector<std::pair<int, uint64_t>> deleted_files;
+  deleted_files.emplace_back(0, 10);
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
+  log_writer_.reset();
+  Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+  ASSERT_OK(s);
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+                                           /*read_only=*/false, &db_id,
+                                           &has_missing_table_file);
+  ASSERT_OK(s);
+  ASSERT_TRUE(has_missing_table_file);
+  for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+    VersionStorageInfo* vstorage = cfd->current()->storage_info();
+    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+    ASSERT_TRUE(files.empty());
+  }
+}
+
+TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
+  std::vector<SstInfo> existing_files = {
+      SstInfo(100, kDefaultColumnFamilyName, "a"),
+      SstInfo(102, kDefaultColumnFamilyName, "b"),
+      SstInfo(103, kDefaultColumnFamilyName, "c"),
+      SstInfo(107, kDefaultColumnFamilyName, "d"),
+      SstInfo(110, kDefaultColumnFamilyName, "e")};
+  std::vector<FileMetaData> file_metas;
+  CreateDummyTableFiles(existing_files, &file_metas);
+
+  PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+  std::vector<std::pair<int, FileMetaData>> added_files;
+  for (size_t i = 3; i != 5; ++i) {
+    added_files.emplace_back(0, file_metas[i]);
+  }
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+
+  added_files.clear();
+  for (uint64_t file_num = 120; file_num < 130; ++file_num) {
+    std::string smallest_ukey = "a";
+    std::string largest_ukey = "b";
+    InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
+    InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
+    FileMetaData meta = FileMetaData(
+        file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
+        largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
+        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+    added_files.emplace_back(0, meta);
+  }
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+  log_writer_.reset();
+  Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+  ASSERT_OK(s);
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+                                           /*read_only=*/false, &db_id,
+                                           &has_missing_table_file);
+  ASSERT_OK(s);
+  ASSERT_TRUE(has_missing_table_file);
+  for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+    VersionStorageInfo* vstorage = cfd->current()->storage_info();
+    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+    if (cfd->GetName() == kDefaultColumnFamilyName) {
+      ASSERT_EQ(2, files.size());
+      for (const auto* fmeta : files) {
+        if (fmeta->fd.GetNumber() != 107 && fmeta->fd.GetNumber() != 110) {
+          ASSERT_FALSE(true);
+        }
+      }
+    } else {
+      ASSERT_TRUE(files.empty());
+    }
+  }
+}
+
+TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
+  std::vector<SstInfo> existing_files = {
+      SstInfo(100, kDefaultColumnFamilyName, "a"),
+      SstInfo(102, kDefaultColumnFamilyName, "b"),
+      SstInfo(103, kDefaultColumnFamilyName, "c"),
+      SstInfo(107, kDefaultColumnFamilyName, "d"),
+      SstInfo(110, kDefaultColumnFamilyName, "e")};
+  std::vector<FileMetaData> file_metas;
+  CreateDummyTableFiles(existing_files, &file_metas);
+
+  PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+  std::vector<std::pair<int, FileMetaData>> added_files;
+  for (const auto& meta : file_metas) {
+    added_files.emplace_back(0, meta);
+  }
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+  std::vector<std::pair<int, uint64_t>> deleted_files;
+  deleted_files.emplace_back(/*level=*/0, 100);
+  WriteFileAdditionAndDeletionToManifest(
+      /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
+  log_writer_.reset();
+  Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+  ASSERT_OK(s);
+  std::string manifest_path;
+  VerifyManifest(&manifest_path);
+  std::string db_id;
+  bool has_missing_table_file = false;
+  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+                                           /*read_only=*/false, &db_id,
+                                           &has_missing_table_file);
+  ASSERT_OK(s);
+  ASSERT_FALSE(has_missing_table_file);
+  for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+    VersionStorageInfo* vstorage = cfd->current()->storage_info();
+    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+    if (cfd->GetName() == kDefaultColumnFamilyName) {
+      ASSERT_EQ(existing_files.size() - deleted_files.size(), files.size());
+      bool has_deleted_file = false;
+      for (const auto* fmeta : files) {
+        if (fmeta->fd.GetNumber() == 100) {
+          has_deleted_file = true;
+          break;
+        }
+      }
+      ASSERT_FALSE(has_deleted_file);
+    } else {
+      ASSERT_TRUE(files.empty());
+    }
+  }
+}
+
+TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
+  db_options_.allow_2pc = true;
+  NewDB();
+
+  SstInfo sst(100, kDefaultColumnFamilyName, "a");
+  std::vector<FileMetaData> file_metas;
+  CreateDummyTableFiles({sst}, &file_metas);
+
+  constexpr WalNumber kMinWalNumberToKeep2PC = 10;
+  VersionEdit edit;
+  edit.AddFile(0, file_metas[0]);
+  edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC);
+  ASSERT_OK(LogAndApplyToDefaultCF(edit));
+  ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
+
+  for (int i = 0; i < 3; i++) {
+    CreateNewManifest();
+    ReopenDB();
+    ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
+  }
+}
+
+class ChargeFileMetadataTest : public DBTestBase {
+ public:
+  ChargeFileMetadataTest()
+      : DBTestBase("charge_file_metadata_test", /*env_do_fsync=*/true) {}
+};
+
+class ChargeFileMetadataTestWithParam
+    : public ChargeFileMetadataTest,
+      public testing::WithParamInterface<CacheEntryRoleOptions::Decision> {
+ public:
+  ChargeFileMetadataTestWithParam() {}
+};
+
+#ifndef ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(
+    ChargeFileMetadataTestWithParam, ChargeFileMetadataTestWithParam,
+    ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled,
+                      CacheEntryRoleOptions::Decision::kDisabled));
+
+TEST_P(ChargeFileMetadataTestWithParam, Basic) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  CacheEntryRoleOptions::Decision charge_file_metadata = GetParam();
+  table_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kFileMetadata, {/*.charged = */ charge_file_metadata}});
+  std::shared_ptr<TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>
+      file_metadata_charge_only_cache = std::make_shared<
+          TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
+          NewLRUCache(
+              4 * CacheReservationManagerImpl<
+                      CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
+              0 /* num_shard_bits */, true /* strict_capacity_limit */));
+  table_options.block_cache = file_metadata_charge_only_cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Create 128 file metadata, each of which is roughly 1024 bytes.
+  // This results in 1 *
+  // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
+  // cache reservation for file metadata.
+  for (int i = 1; i <= 128; ++i) {
+    ASSERT_OK(Put(std::string(1024, 'a'), "va"));
+    ASSERT_OK(Put("b", "vb"));
+    ASSERT_OK(Flush());
+  }
+  if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+              1 * CacheReservationManagerImpl<
+                      CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+
+  } else {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+  }
+
+  // Create another 128 file metadata.
+  // This increases the file metadata cache reservation to 2 *
+  // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize().
+  for (int i = 1; i <= 128; ++i) {
+    ASSERT_OK(Put(std::string(1024, 'a'), "vva"));
+    ASSERT_OK(Put("b", "vvb"));
+    ASSERT_OK(Flush());
+  }
+  if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+              2 * CacheReservationManagerImpl<
+                      CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+  } else {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+  }
+  // Compaction will create 1 new file metadata, obsolete and delete all 256
+  // file metadata above. This results in 1 *
+  // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
+  // cache reservation for file metadata.
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+        "ChargeFileMetadataTestWithParam::"
+        "PreVerifyingCacheReservationRelease"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  TEST_SYNC_POINT(
+      "ChargeFileMetadataTestWithParam::PreVerifyingCacheReservationRelease");
+  if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+              1 * CacheReservationManagerImpl<
+                      CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+  } else {
+    EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Destroying the db will delete the remaining 1 new file metadata
+  // This results in no cache reservation for file metadata.
+  Destroy(options);
+  EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+            0 * CacheReservationManagerImpl<
+                    CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+
+  // Reopen the db with a smaller cache in order to test failure in allocating
+  // file metadata due to memory limit based on cache capacity
+  file_metadata_charge_only_cache = std::make_shared<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
+      NewLRUCache(1 * CacheReservationManagerImpl<
+                          CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
+                  0 /* num_shard_bits */, true /* strict_capacity_limit */));
+  table_options.block_cache = file_metadata_charge_only_cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  ASSERT_OK(Put(std::string(1024, 'a'), "va"));
+  ASSERT_OK(Put("b", "vb"));
+  Status s = Flush();
+  if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+    EXPECT_TRUE(s.IsMemoryLimit());
+    EXPECT_TRUE(s.ToString().find(
+                    kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+                        CacheEntryRole::kFileMetadata)]) != std::string::npos);
+    EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
+                std::string::npos);
+  } else {
+    EXPECT_TRUE(s.ok());
+  }
+}
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_util.h b/src/rocksdb/db/version_util.h
new file mode 100644
index 000000000..5ec6fda11
--- /dev/null
+++ b/src/rocksdb/db/version_util.h
@@ -0,0 +1,71 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Instead of opening a `DB` to perform certain manifest updates, this
+// uses the underlying `VersionSet` API to read and modify the MANIFEST. This
+// allows us to use the user's real options, while not having to worry about
+// the DB persisting new SST files via flush/compaction or attempting to read/
+// compact files which may fail, particularly for the file we intend to remove
+// (the user may want to remove an already deleted file from MANIFEST).
+class OfflineManifestWriter {
+ public:
+  OfflineManifestWriter(const DBOptions& options, const std::string& db_path)
+      : wc_(options.delayed_write_rate),
+        wb_(options.db_write_buffer_size),
+        immutable_db_options_(WithDbPath(options, db_path)),
+        tc_(NewLRUCache(1 << 20 /* capacity */,
+                        options.table_cache_numshardbits)),
+        versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_, &wc_,
+                  /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                  /*db_id*/ "", /*db_session_id*/ "") {}
+
+  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families) {
+    return versions_.Recover(column_families, /*read_only*/ false,
+                             /*db_id*/ nullptr,
+                             /*no_error_if_files_missing*/ true);
+  }
+
+  Status LogAndApply(ColumnFamilyData* cfd, VersionEdit* edit,
+                     FSDirectory* dir_contains_current_file) {
+    // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`.
+    InstrumentedMutex mutex;
+    mutex.Lock();
+    Status s = versions_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+                                     edit, &mutex, dir_contains_current_file,
+                                     false /* new_descriptor_log */);
+    mutex.Unlock();
+    return s;
+  }
+
+  VersionSet& Versions() { return versions_; }
+  const ImmutableDBOptions& IOptions() { return immutable_db_options_; }
+
+ private:
+  WriteController wc_;
+  WriteBufferManager wb_;
+  ImmutableDBOptions immutable_db_options_;
+  std::shared_ptr<Cache> tc_;
+  EnvOptions sopt_;
+  VersionSet versions_;
+
+  static ImmutableDBOptions WithDbPath(const DBOptions& options,
+                                       const std::string& db_path) {
+    ImmutableDBOptions rv(options);
+    if (rv.db_paths.empty()) {
+      // `VersionSet` expects options that have been through
+      // `SanitizeOptions()`, which would sanitize an empty `db_paths`.
+      rv.db_paths.emplace_back(db_path, 0 /* target_size */);
+    }
+    return rv;
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_edit.cc b/src/rocksdb/db/wal_edit.cc
new file mode 100644
index 000000000..2525be610
--- /dev/null
+++ b/src/rocksdb/db/wal_edit.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wal_edit.h"
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void WalAddition::EncodeTo(std::string* dst) const {
+  PutVarint64(dst, number_);
+
+  if (metadata_.HasSyncedSize()) {
+    PutVarint32(dst, static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+    PutVarint64(dst, metadata_.GetSyncedSizeInBytes());
+  }
+
+  PutVarint32(dst, static_cast<uint32_t>(WalAdditionTag::kTerminate));
+}
+
+Status WalAddition::DecodeFrom(Slice* src) {
+  constexpr char class_name[] = "WalAddition";
+
+  if (!GetVarint64(src, &number_)) {
+    return Status::Corruption(class_name, "Error decoding WAL log number");
+  }
+
+  while (true) {
+    uint32_t tag_value = 0;
+    if (!GetVarint32(src, &tag_value)) {
+      return Status::Corruption(class_name, "Error decoding tag");
+    }
+    WalAdditionTag tag = static_cast<WalAdditionTag>(tag_value);
+    switch (tag) {
+      case WalAdditionTag::kSyncedSize: {
+        uint64_t size = 0;
+        if (!GetVarint64(src, &size)) {
+          return Status::Corruption(class_name, "Error decoding WAL file size");
+        }
+        metadata_.SetSyncedSizeInBytes(size);
+        break;
+      }
+      // TODO: process future tags such as checksum.
+      case WalAdditionTag::kTerminate:
+        return Status::OK();
+      default: {
+        std::stringstream ss;
+        ss << "Unknown tag " << tag_value;
+        return Status::Corruption(class_name, ss.str());
+      }
+    }
+  }
+}
+
+JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal) {
+  jw << "LogNumber" << wal.GetLogNumber() << "SyncedSizeInBytes"
+     << wal.GetMetadata().GetSyncedSizeInBytes();
+  return jw;
+}
+
+std::ostream& operator<<(std::ostream& os, const WalAddition& wal) {
+  os << "log_number: " << wal.GetLogNumber()
+     << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes();
+  return os;
+}
+
+std::string WalAddition::DebugString() const {
+  std::ostringstream oss;
+  oss << *this;
+  return oss.str();
+}
+
+void WalDeletion::EncodeTo(std::string* dst) const {
+  PutVarint64(dst, number_);
+}
+
+Status WalDeletion::DecodeFrom(Slice* src) {
+  constexpr char class_name[] = "WalDeletion";
+
+  if (!GetVarint64(src, &number_)) {
+    return Status::Corruption(class_name, "Error decoding WAL log number");
+  }
+
+  return Status::OK();
+}
+
+JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal) {
+  jw << "LogNumber" << wal.GetLogNumber();
+  return jw;
+}
+
+std::ostream& operator<<(std::ostream& os, const WalDeletion& wal) {
+  os << "log_number: " << wal.GetLogNumber();
+  return os;
+}
+
+std::string WalDeletion::DebugString() const {
+  std::ostringstream oss;
+  oss << *this;
+  return oss.str();
+}
+
+Status WalSet::AddWal(const WalAddition& wal) {
+  if (wal.GetLogNumber() < min_wal_number_to_keep_) {
+    // The WAL has been obsolete, ignore it.
+    return Status::OK();
+  }
+
+  auto it = wals_.lower_bound(wal.GetLogNumber());
+  bool existing = it != wals_.end() && it->first == wal.GetLogNumber();
+
+  if (!existing) {
+    wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()});
+    return Status::OK();
+  }
+
+  assert(existing);
+  if (!wal.GetMetadata().HasSyncedSize()) {
+    std::stringstream ss;
+    ss << "WAL " << wal.GetLogNumber() << " is created more than once";
+    return Status::Corruption("WalSet::AddWal", ss.str());
+  }
+
+  assert(wal.GetMetadata().HasSyncedSize());
+  if (it->second.HasSyncedSize() && wal.GetMetadata().GetSyncedSizeInBytes() <=
+                                        it->second.GetSyncedSizeInBytes()) {
+    // This is possible because version edits with different synced WAL sizes
+    // for the same WAL can be committed out-of-order. For example, thread
+    // 1 synces the first 10 bytes of 1.log, while thread 2 synces the first 20
+    // bytes of 1.log. It's possible that thread 1 calls LogAndApply() after
+    // thread 2.
+    // In this case, just return ok.
+    return Status::OK();
+  }
+
+  // Update synced size for the given WAL.
+  it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes());
+  return Status::OK();
+}
+
+Status WalSet::AddWals(const WalAdditions& wals) {
+  Status s;
+  for (const WalAddition& wal : wals) {
+    s = AddWal(wal);
+    if (!s.ok()) {
+      break;
+    }
+  }
+  return s;
+}
+
+Status WalSet::DeleteWalsBefore(WalNumber wal) {
+  if (wal > min_wal_number_to_keep_) {
+    min_wal_number_to_keep_ = wal;
+    wals_.erase(wals_.begin(), wals_.lower_bound(wal));
+  }
+  return Status::OK();
+}
+
+void WalSet::Reset() {
+  wals_.clear();
+  min_wal_number_to_keep_ = 0;
+}
+
+Status WalSet::CheckWals(
+    Env* env,
+    const std::unordered_map<WalNumber, std::string>& logs_on_disk) const {
+  assert(env != nullptr);
+
+  Status s;
+  for (const auto& wal : wals_) {
+    const uint64_t log_number = wal.first;
+    const WalMetadata& wal_meta = wal.second;
+
+    if (!wal_meta.HasSyncedSize()) {
+      // The WAL and WAL directory is not even synced,
+      // so the WAL's inode may not be persisted,
+      // then the WAL might not show up when listing WAL directory.
+      continue;
+    }
+
+    if (logs_on_disk.find(log_number) == logs_on_disk.end()) {
+      std::stringstream ss;
+      ss << "Missing WAL with log number: " << log_number << ".";
+      s = Status::Corruption(ss.str());
+      break;
+    }
+
+    uint64_t log_file_size = 0;
+    s = env->GetFileSize(logs_on_disk.at(log_number), &log_file_size);
+    if (!s.ok()) {
+      break;
+    }
+    if (log_file_size < wal_meta.GetSyncedSizeInBytes()) {
+      std::stringstream ss;
+      ss << "Size mismatch: WAL (log number: " << log_number
+         << ") in MANIFEST is " << wal_meta.GetSyncedSizeInBytes()
+         << " bytes , but actually is " << log_file_size << " bytes on disk.";
+      s = Status::Corruption(ss.str());
+      break;
+    }
+  }
+
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_edit.h b/src/rocksdb/db/wal_edit.h
new file mode 100644
index 000000000..bb5c5e292
--- /dev/null
+++ b/src/rocksdb/db/wal_edit.h
@@ -0,0 +1,177 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// WAL related classes used in VersionEdit and VersionSet.
+// Modifications to WalAddition and WalDeletion may need to update
+// VersionEdit and its related tests.
+
+#pragma once
+
+#include <map>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "logging/event_logger.h"
+#include "port/port.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+using WalNumber = uint64_t;
+
+// Metadata of a WAL.
+class WalMetadata {
+ public:
+  WalMetadata() = default;
+
+  explicit WalMetadata(uint64_t synced_size_bytes)
+      : synced_size_bytes_(synced_size_bytes) {}
+
+  bool HasSyncedSize() const { return synced_size_bytes_ != kUnknownWalSize; }
+
+  void SetSyncedSizeInBytes(uint64_t bytes) { synced_size_bytes_ = bytes; }
+
+  uint64_t GetSyncedSizeInBytes() const { return synced_size_bytes_; }
+
+ private:
+  friend bool operator==(const WalMetadata& lhs, const WalMetadata& rhs);
+  friend bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs);
+  // The size of WAL is unknown, used when the WAL is not synced yet or is
+  // empty.
+  constexpr static uint64_t kUnknownWalSize =
+      std::numeric_limits<uint64_t>::max();
+
+  // Size of the most recently synced WAL in bytes.
+  uint64_t synced_size_bytes_ = kUnknownWalSize;
+};
+
+inline bool operator==(const WalMetadata& lhs, const WalMetadata& rhs) {
+  return lhs.synced_size_bytes_ == rhs.synced_size_bytes_;
+}
+
+inline bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs) {
+  return !(lhs == rhs);
+}
+
+// These tags are persisted to MANIFEST, so it's part of the user API.
+enum class WalAdditionTag : uint32_t {
+  // Indicates that there are no more tags.
+  kTerminate = 1,
+  // Synced Size in bytes.
+  kSyncedSize = 2,
+  // Add tags in the future, such as checksum?
+};
+
+// Records the event of adding a WAL in VersionEdit.
+class WalAddition {
+ public:
+  WalAddition() : number_(0), metadata_() {}
+
+  explicit WalAddition(WalNumber number) : number_(number), metadata_() {}
+
+  WalAddition(WalNumber number, WalMetadata meta)
+      : number_(number), metadata_(std::move(meta)) {}
+
+  WalNumber GetLogNumber() const { return number_; }
+
+  const WalMetadata& GetMetadata() const { return metadata_; }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* src);
+
+  std::string DebugString() const;
+
+ private:
+  WalNumber number_;
+  WalMetadata metadata_;
+};
+
+std::ostream& operator<<(std::ostream& os, const WalAddition& wal);
+JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal);
+
+using WalAdditions = std::vector<WalAddition>;
+
+// Records the event of deleting WALs before the specified log number.
+class WalDeletion {
+ public:
+  WalDeletion() : number_(kEmpty) {}
+
+  explicit WalDeletion(WalNumber number) : number_(number) {}
+
+  WalNumber GetLogNumber() const { return number_; }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* src);
+
+  std::string DebugString() const;
+
+  bool IsEmpty() const { return number_ == kEmpty; }
+
+  void Reset() { number_ = kEmpty; }
+
+ private:
+  static constexpr WalNumber kEmpty = 0;
+
+  WalNumber number_;
+};
+
+std::ostream& operator<<(std::ostream& os, const WalDeletion& wal);
+JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal);
+
+// Used in VersionSet to keep the current set of WALs.
+//
+// When a WAL is synced or becomes obsoleted,
+// a VersionEdit is logged to MANIFEST and
+// the WAL is added to or deleted from WalSet.
+//
+// Not thread safe, needs external synchronization such as holding DB mutex.
+class WalSet {
+ public:
+  // Add WAL(s).
+  // If the WAL is closed,
+  // then there must be an existing unclosed WAL,
+  // otherwise, return Status::Corruption.
+  // Can happen when applying a VersionEdit or recovering from MANIFEST.
+  Status AddWal(const WalAddition& wal);
+  Status AddWals(const WalAdditions& wals);
+
+  // Delete WALs with log number smaller than the specified wal number.
+  // Can happen when applying a VersionEdit or recovering from MANIFEST.
+  Status DeleteWalsBefore(WalNumber wal);
+
+  // Resets the internal state.
+  void Reset();
+
+  // WALs with number less than MinWalNumberToKeep should not exist in WalSet.
+  WalNumber GetMinWalNumberToKeep() const { return min_wal_number_to_keep_; }
+
+  const std::map<WalNumber, WalMetadata>& GetWals() const { return wals_; }
+
+  // Checks whether there are missing or corrupted WALs.
+  // Returns Status::OK if there is no missing nor corrupted WAL,
+  // otherwise returns Status::Corruption.
+  // logs_on_disk is a map from log number to the log filename.
+  // Note that logs_on_disk may contain logs that is obsolete but
+  // haven't been deleted from disk.
+  Status CheckWals(
+      Env* env,
+      const std::unordered_map<WalNumber, std::string>& logs_on_disk) const;
+
+ private:
+  std::map<WalNumber, WalMetadata> wals_;
+  // WAL number < min_wal_number_to_keep_ should not exist in wals_.
+  // It's monotonically increasing, in-memory only, not written to MANIFEST.
+  WalNumber min_wal_number_to_keep_ = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_edit_test.cc b/src/rocksdb/db/wal_edit_test.cc
new file mode 100644
index 000000000..0c18fb125
--- /dev/null
+++ b/src/rocksdb/db/wal_edit_test.cc
@@ -0,0 +1,213 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wal_edit.h"
+
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(WalSet, AddDeleteReset) {
+  WalSet wals;
+  ASSERT_TRUE(wals.GetWals().empty());
+
+  // Create WAL 1 - 10.
+  for (WalNumber log_number = 1; log_number <= 10; log_number++) {
+    wals.AddWal(WalAddition(log_number));
+  }
+  ASSERT_EQ(wals.GetWals().size(), 10);
+
+  // Delete WAL 1 - 5.
+  wals.DeleteWalsBefore(6);
+  ASSERT_EQ(wals.GetWals().size(), 5);
+
+  WalNumber expected_log_number = 6;
+  for (auto it : wals.GetWals()) {
+    WalNumber log_number = it.first;
+    ASSERT_EQ(log_number, expected_log_number++);
+  }
+
+  wals.Reset();
+  ASSERT_TRUE(wals.GetWals().empty());
+}
+
+TEST(WalSet, Overwrite) {
+  constexpr WalNumber kNumber = 100;
+  constexpr uint64_t kBytes = 200;
+  WalSet wals;
+  wals.AddWal(WalAddition(kNumber));
+  ASSERT_FALSE(wals.GetWals().at(kNumber).HasSyncedSize());
+  wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes)));
+  ASSERT_TRUE(wals.GetWals().at(kNumber).HasSyncedSize());
+  ASSERT_EQ(wals.GetWals().at(kNumber).GetSyncedSizeInBytes(), kBytes);
+}
+
+TEST(WalSet, SmallerSyncedSize) {
+  constexpr WalNumber kNumber = 100;
+  constexpr uint64_t kBytes = 100;
+  WalSet wals;
+  ASSERT_OK(wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes))));
+  const auto wals1 = wals.GetWals();
+  Status s = wals.AddWal(WalAddition(kNumber, WalMetadata(0)));
+  const auto wals2 = wals.GetWals();
+  ASSERT_OK(s);
+  ASSERT_EQ(wals1, wals2);
+}
+
+TEST(WalSet, CreateTwice) {
+  constexpr WalNumber kNumber = 100;
+  WalSet wals;
+  ASSERT_OK(wals.AddWal(WalAddition(kNumber)));
+  Status s = wals.AddWal(WalAddition(kNumber));
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(s.ToString().find("WAL 100 is created more than once") !=
+              std::string::npos);
+}
+
+TEST(WalSet, DeleteAllWals) {
+  constexpr WalNumber kMaxWalNumber = 10;
+  WalSet wals;
+  for (WalNumber i = 1; i <= kMaxWalNumber; i++) {
+    wals.AddWal(WalAddition(i));
+  }
+  ASSERT_OK(wals.DeleteWalsBefore(kMaxWalNumber + 1));
+}
+
+TEST(WalSet, AddObsoleteWal) {
+  constexpr WalNumber kNumber = 100;
+  WalSet wals;
+  ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1));
+  ASSERT_OK(wals.AddWal(WalAddition(kNumber)));
+  ASSERT_TRUE(wals.GetWals().empty());
+}
+
+TEST(WalSet, MinWalNumberToKeep) {
+  constexpr WalNumber kNumber = 100;
+  WalSet wals;
+  ASSERT_EQ(wals.GetMinWalNumberToKeep(), 0);
+  ASSERT_OK(wals.DeleteWalsBefore(kNumber));
+  ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber);
+  ASSERT_OK(wals.DeleteWalsBefore(kNumber - 1));
+  ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber);
+  ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1));
+  ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber + 1);
+}
+
+class WalSetTest : public DBTestBase {
+ public:
+  WalSetTest() : DBTestBase("WalSetTest", /* env_do_fsync */ true) {}
+
+  void SetUp() override {
+    test_dir_ = test::PerThreadDBPath("wal_set_test");
+    ASSERT_OK(env_->CreateDir(test_dir_));
+  }
+
+  void TearDown() override {
+    EXPECT_OK(DestroyDir(env_, test_dir_));
+    logs_on_disk_.clear();
+    wals_.Reset();
+  }
+
+  void CreateWalOnDisk(WalNumber number, const std::string& fname,
+                       uint64_t size_bytes) {
+    std::unique_ptr<WritableFile> f;
+    std::string fpath = Path(fname);
+    ASSERT_OK(env_->NewWritableFile(fpath, &f, EnvOptions()));
+    std::string content(size_bytes, '0');
+    ASSERT_OK(f->Append(content));
+    ASSERT_OK(f->Close());
+
+    logs_on_disk_[number] = fpath;
+  }
+
+  void AddWalToWalSet(WalNumber number, uint64_t size_bytes) {
+    // Create WAL.
+    ASSERT_OK(wals_.AddWal(WalAddition(number)));
+    // Close WAL.
+    WalMetadata wal(size_bytes);
+    ASSERT_OK(wals_.AddWal(WalAddition(number, wal)));
+  }
+
+  Status CheckWals() const { return wals_.CheckWals(env_, logs_on_disk_); }
+
+ private:
+  std::string test_dir_;
+  std::unordered_map<WalNumber, std::string> logs_on_disk_;
+  WalSet wals_;
+
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+};
+
+TEST_F(WalSetTest, CheckEmptyWals) { ASSERT_OK(CheckWals()); }
+
+TEST_F(WalSetTest, CheckWals) {
+  for (int number = 1; number < 10; number++) {
+    uint64_t size = rand() % 100;
+    std::stringstream ss;
+    ss << "log" << number;
+    std::string fname = ss.str();
+    CreateWalOnDisk(number, fname, size);
+    // log 0 - 5 are obsolete.
+    if (number > 5) {
+      AddWalToWalSet(number, size);
+    }
+  }
+  ASSERT_OK(CheckWals());
+}
+
+TEST_F(WalSetTest, CheckMissingWals) {
+  for (int number = 1; number < 10; number++) {
+    uint64_t size = rand() % 100;
+    AddWalToWalSet(number, size);
+    // logs with even number are missing from disk.
+    if (number % 2) {
+      std::stringstream ss;
+      ss << "log" << number;
+      std::string fname = ss.str();
+      CreateWalOnDisk(number, fname, size);
+    }
+  }
+
+  Status s = CheckWals();
+  ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+  // The first log with even number is missing.
+  std::stringstream expected_err;
+  expected_err << "Missing WAL with log number: " << 2;
+  ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos)
+      << s.ToString();
+}
+
+TEST_F(WalSetTest, CheckWalsWithShrinkedSize) {
+  for (int number = 1; number < 10; number++) {
+    uint64_t size = rand() % 100 + 1;
+    AddWalToWalSet(number, size);
+    // logs with even number have shrinked size.
+    std::stringstream ss;
+    ss << "log" << number;
+    std::string fname = ss.str();
+    CreateWalOnDisk(number, fname, (number % 2) ? size : size - 1);
+  }
+
+  Status s = CheckWals();
+  ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+  // The first log with even number has wrong size.
+  std::stringstream expected_err;
+  expected_err << "Size mismatch: WAL (log number: " << 2 << ")";
+  ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos)
+      << s.ToString();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wal_manager.cc b/src/rocksdb/db/wal_manager.cc
new file mode 100644
index 000000000..a6060235f
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.cc
@@ -0,0 +1,529 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/wal_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sequence_file_reader.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+Status WalManager::DeleteFile(const std::string& fname, uint64_t number) {
+  auto s = env_->DeleteFile(wal_dir_ + "/" + fname);
+  if (s.ok()) {
+    MutexLock l(&read_first_record_cache_mutex_);
+    read_first_record_cache_.erase(number);
+  }
+  return s;
+}
+
+Status WalManager::GetSortedWalFiles(VectorLogPtr& files) {
+  // First get sorted files in db dir, then get sorted files from archived
+  // dir, to avoid a race condition where a log file is moved to archived
+  // dir in between.
+  Status s;
+  // list wal files in main db dir.
+  VectorLogPtr logs;
+  s = GetSortedWalsOfType(wal_dir_, logs, kAliveLogFile);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Reproduce the race condition where a log file is moved
+  // to archived dir, between these two sync points, used in
+  // (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1");
+  TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2");
+
+  files.clear();
+  // list wal files in archive dir.
+  std::string archivedir = ArchivalDirectory(wal_dir_);
+  Status exists = env_->FileExists(archivedir);
+  if (exists.ok()) {
+    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+    if (!s.ok()) {
+      return s;
+    }
+  } else if (!exists.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+
+  uint64_t latest_archived_log_number = 0;
+  if (!files.empty()) {
+    latest_archived_log_number = files.back()->LogNumber();
+    ROCKS_LOG_INFO(db_options_.info_log, "Latest Archived log: %" PRIu64,
+                   latest_archived_log_number);
+  }
+
+  files.reserve(files.size() + logs.size());
+  for (auto& log : logs) {
+    if (log->LogNumber() > latest_archived_log_number) {
+      files.push_back(std::move(log));
+    } else {
+      // When the race condition happens, we could see the
+      // same log in both db dir and archived dir. Simply
+      // ignore the one in db dir. Note that, if we read
+      // archived dir first, we would have missed the log file.
+      ROCKS_LOG_WARN(db_options_.info_log, "%s already moved to archive",
+                     log->PathName().c_str());
+    }
+  }
+
+  return s;
+}
+
+Status WalManager::GetUpdatesSince(
+    SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions& read_options,
+    VersionSet* version_set) {
+  if (seq_per_batch_) {
+    return Status::NotSupported();
+  }
+
+  assert(!seq_per_batch_);
+
+  //  Get all sorted Wal Files.
+  //  Do binary search and open files and find the seq number.
+
+  std::unique_ptr<VectorLogPtr> wal_files(new VectorLogPtr);
+  Status s = GetSortedWalFiles(*wal_files);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = RetainProbableWalFiles(*wal_files, seq);
+  if (!s.ok()) {
+    return s;
+  }
+  iter->reset(new TransactionLogIteratorImpl(
+      wal_dir_, &db_options_, read_options, file_options_, seq,
+      std::move(wal_files), version_set, seq_per_batch_, io_tracer_));
+  return (*iter)->status();
+}
+
+// 1. Go through all archived files and
+//    a. if ttl is enabled, delete outdated files
+//    b. if archive size limit is enabled, delete empty files,
+//        compute file number and size.
+// 2. If size limit is enabled:
+//    a. compute how many files should be deleted
+//    b. get sorted non-empty archived logs
+//    c. delete what should be deleted
+void WalManager::PurgeObsoleteWALFiles() {
+  bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0;
+  bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0;
+  if (!ttl_enabled && !size_limit_enabled) {
+    return;
+  }
+
+  int64_t current_time = 0;
+  Status s = db_options_.clock->GetCurrentTime(&current_time);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s",
+                    s.ToString().c_str());
+    assert(false);
+    return;
+  }
+  uint64_t const now_seconds = static_cast<uint64_t>(current_time);
+  uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled)
+                                     ? db_options_.WAL_ttl_seconds / 2
+                                     : kDefaultIntervalToDeleteObsoleteWAL;
+
+  if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
+    return;
+  }
+
+  purge_wal_files_last_run_ = now_seconds;
+
+  std::string archival_dir = ArchivalDirectory(wal_dir_);
+  std::vector<std::string> files;
+  s = env_->GetChildren(archival_dir, &files);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log, "Can't get archive files: %s",
+                    s.ToString().c_str());
+    assert(false);
+    return;
+  }
+
+  size_t log_files_num = 0;
+  uint64_t log_file_size = 0;
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kWalFile) {
+      std::string const file_path = archival_dir + "/" + f;
+      if (ttl_enabled) {
+        uint64_t file_m_time;
+        s = env_->GetFileModificationTime(file_path, &file_m_time);
+        if (!s.ok()) {
+          ROCKS_LOG_WARN(db_options_.info_log,
+                         "Can't get file mod time: %s: %s", file_path.c_str(),
+                         s.ToString().c_str());
+          continue;
+        }
+        if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) {
+          s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+                           /*force_fg=*/!wal_in_db_path_);
+          if (!s.ok()) {
+            ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s",
+                           file_path.c_str(), s.ToString().c_str());
+            continue;
+          } else {
+            MutexLock l(&read_first_record_cache_mutex_);
+            read_first_record_cache_.erase(number);
+          }
+          continue;
+        }
+      }
+
+      if (size_limit_enabled) {
+        uint64_t file_size;
+        s = env_->GetFileSize(file_path, &file_size);
+        if (!s.ok()) {
+          ROCKS_LOG_ERROR(db_options_.info_log,
+                          "Unable to get file size: %s: %s", file_path.c_str(),
+                          s.ToString().c_str());
+          return;
+        } else {
+          if (file_size > 0) {
+            log_file_size = std::max(log_file_size, file_size);
+            ++log_files_num;
+          } else {
+            s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+                             /*force_fg=*/!wal_in_db_path_);
+            if (!s.ok()) {
+              ROCKS_LOG_WARN(db_options_.info_log,
+                             "Unable to delete file: %s: %s", file_path.c_str(),
+                             s.ToString().c_str());
+              continue;
+            } else {
+              MutexLock l(&read_first_record_cache_mutex_);
+              read_first_record_cache_.erase(number);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (0 == log_files_num || !size_limit_enabled) {
+    return;
+  }
+
+  size_t const files_keep_num = static_cast<size_t>(
+      db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size);
+  if (log_files_num <= files_keep_num) {
+    return;
+  }
+
+  size_t files_del_num = log_files_num - files_keep_num;
+  VectorLogPtr archived_logs;
+  s = GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Unable to get archived WALs from: %s: %s",
+                   archival_dir.c_str(), s.ToString().c_str());
+    files_del_num = 0;
+  } else if (files_del_num > archived_logs.size()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Trying to delete more archived log files than "
+                   "exist. Deleting all");
+    files_del_num = archived_logs.size();
+  }
+
+  for (size_t i = 0; i < files_del_num; ++i) {
+    std::string const file_path = archived_logs[i]->PathName();
+    s = DeleteDBFile(&db_options_, wal_dir_ + "/" + file_path, wal_dir_, false,
+                     /*force_fg=*/!wal_in_db_path_);
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s",
+                     file_path.c_str(), s.ToString().c_str());
+      continue;
+    } else {
+      MutexLock l(&read_first_record_cache_mutex_);
+      read_first_record_cache_.erase(archived_logs[i]->LogNumber());
+    }
+  }
+}
+
+void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) {
+  auto archived_log_name = ArchivedLogFileName(wal_dir_, number);
+  // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1");
+  Status s = env_->RenameFile(fname, archived_log_name);
+  // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+  TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2");
+  ROCKS_LOG_INFO(db_options_.info_log, "Move log file %s to %s -- %s\n",
+                 fname.c_str(), archived_log_name.c_str(),
+                 s.ToString().c_str());
+}
+
+Status WalManager::GetSortedWalsOfType(const std::string& path,
+                                       VectorLogPtr& log_files,
+                                       WalFileType log_type) {
+  std::vector<std::string> all_files;
+  const Status status = env_->GetChildren(path, &all_files);
+  if (!status.ok()) {
+    return status;
+  }
+  log_files.reserve(all_files.size());
+  for (const auto& f : all_files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kWalFile) {
+      SequenceNumber sequence;
+      Status s = ReadFirstRecord(log_type, number, &sequence);
+      if (!s.ok()) {
+        return s;
+      }
+      if (sequence == 0) {
+        // empty file
+        continue;
+      }
+
+      // Reproduce the race condition where a log file is moved
+      // to archived dir, between these two sync points, used in
+      // (DBTest,TransactionLogIteratorRace)
+      TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1");
+      TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2");
+
+      uint64_t size_bytes;
+      s = env_->GetFileSize(LogFileName(path, number), &size_bytes);
+      // re-try in case the alive log file has been moved to archive.
+      if (!s.ok() && log_type == kAliveLogFile) {
+        std::string archived_file = ArchivedLogFileName(path, number);
+        if (env_->FileExists(archived_file).ok()) {
+          s = env_->GetFileSize(archived_file, &size_bytes);
+          if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+            // oops, the file just got deleted from archived dir! move on
+            s = Status::OK();
+            continue;
+          }
+        }
+      }
+      if (!s.ok()) {
+        return s;
+      }
+
+      log_files.push_back(std::unique_ptr<LogFile>(
+          new LogFileImpl(number, log_type, sequence, size_bytes)));
+    }
+  }
+  std::sort(
+      log_files.begin(), log_files.end(),
+      [](const std::unique_ptr<LogFile>& a, const std::unique_ptr<LogFile>& b) {
+        LogFileImpl* a_impl = static_cast_with_check<LogFileImpl>(a.get());
+        LogFileImpl* b_impl = static_cast_with_check<LogFileImpl>(b.get());
+        return *a_impl < *b_impl;
+      });
+  return status;
+}
+
+Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                          const SequenceNumber target) {
+  int64_t start = 0;  // signed to avoid overflow when target is < first file.
+  int64_t end = static_cast<int64_t>(all_logs.size()) - 1;
+  // Binary Search. avoid opening all files.
+  while (end >= start) {
+    int64_t mid = start + (end - start) / 2;  // Avoid overflow.
+    SequenceNumber current_seq_num =
+        all_logs.at(static_cast<size_t>(mid))->StartSequence();
+    if (current_seq_num == target) {
+      end = mid;
+      break;
+    } else if (current_seq_num < target) {
+      start = mid + 1;
+    } else {
+      end = mid - 1;
+    }
+  }
+  // end could be -ve.
+  size_t start_index =
+      static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
+  // The last wal file is always included
+  all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+  return Status::OK();
+}
+
+Status WalManager::ReadFirstRecord(const WalFileType type,
+                                   const uint64_t number,
+                                   SequenceNumber* sequence) {
+  *sequence = 0;
+  if (type != kAliveLogFile && type != kArchivedLogFile) {
+    ROCKS_LOG_ERROR(db_options_.info_log, "[WalManger] Unknown file type %s",
+                    std::to_string(type).c_str());
+    return Status::NotSupported("File Type Not Known " + std::to_string(type));
+  }
+  {
+    MutexLock l(&read_first_record_cache_mutex_);
+    auto itr = read_first_record_cache_.find(number);
+    if (itr != read_first_record_cache_.end()) {
+      *sequence = itr->second;
+      return Status::OK();
+    }
+  }
+  Status s;
+  if (type == kAliveLogFile) {
+    std::string fname = LogFileName(wal_dir_, number);
+    s = ReadFirstLine(fname, number, sequence);
+    if (!s.ok() && env_->FileExists(fname).ok()) {
+      // return any error that is not caused by non-existing file
+      return s;
+    }
+  }
+
+  if (type == kArchivedLogFile || !s.ok()) {
+    //  check if the file got moved to archive.
+    std::string archived_file = ArchivedLogFileName(wal_dir_, number);
+    s = ReadFirstLine(archived_file, number, sequence);
+    // maybe the file was deleted from archive dir. If that's the case, return
+    // Status::OK(). The caller with identify this as empty file because
+    // *sequence == 0
+    if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+      return Status::OK();
+    }
+  }
+
+  if (s.ok() && *sequence != 0) {
+    MutexLock l(&read_first_record_cache_mutex_);
+    read_first_record_cache_.insert({number, *sequence});
+  }
+  return s;
+}
+
+Status WalManager::GetLiveWalFile(uint64_t number,
+                                  std::unique_ptr<LogFile>* log_file) {
+  if (!log_file) {
+    return Status::InvalidArgument("log_file not preallocated.");
+  }
+
+  if (!number) {
+    return Status::PathNotFound("log file not available");
+  }
+
+  Status s;
+
+  uint64_t size_bytes;
+  s = env_->GetFileSize(LogFileName(wal_dir_, number), &size_bytes);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  log_file->reset(new LogFileImpl(number, kAliveLogFile,
+                                  0,  // SequenceNumber
+                                  size_bytes));
+
+  return Status::OK();
+}
+
+// the function returns status.ok() and sequence == 0 if the file exists, but is
+// empty
+Status WalManager::ReadFirstLine(const std::string& fname,
+                                 const uint64_t number,
+                                 SequenceNumber* sequence) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+
+    Status* status;
+    bool ignore_error;  // true if db_options_.paranoid_checks==false
+    void Corruption(size_t bytes, const Status& s) override {
+      ROCKS_LOG_WARN(info_log, "[WalManager] %s%s: dropping %d bytes; %s",
+                     (this->ignore_error ? "(ignoring error) " : ""), fname,
+                     static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status->ok()) {
+        // only keep the first error
+        *this->status = s;
+      }
+    }
+  };
+
+  std::unique_ptr<FSSequentialFile> file;
+  Status status = fs_->NewSequentialFile(
+      fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+  std::unique_ptr<SequentialFileReader> file_reader(
+      new SequentialFileReader(std::move(file), fname, io_tracer_));
+
+  if (!status.ok()) {
+    return status;
+  }
+
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = db_options_.info_log.get();
+  reporter.fname = fname.c_str();
+  reporter.status = &status;
+  reporter.ignore_error = !db_options_.paranoid_checks;
+  log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
+                     true /*checksum*/, number);
+  std::string scratch;
+  Slice record;
+
+  if (reader.ReadRecord(&record, &scratch) &&
+      (status.ok() || !db_options_.paranoid_checks)) {
+    if (record.size() < WriteBatchInternal::kHeader) {
+      reporter.Corruption(record.size(),
+                          Status::Corruption("log record too small"));
+      // TODO read record's till the first no corrupt entry?
+    } else {
+      WriteBatch batch;
+      // We can overwrite an existing non-OK Status since it'd only reach here
+      // with `paranoid_checks == false`.
+      status = WriteBatchInternal::SetContents(&batch, record);
+      if (status.ok()) {
+        *sequence = WriteBatchInternal::Sequence(&batch);
+        return status;
+      }
+    }
+  }
+
+  if (status.ok() && reader.IsCompressedAndEmptyFile()) {
+    // In case of wal_compression, it writes a `kSetCompressionType` record
+    // which is not associated with any sequence number. As result for an empty
+    // file, GetSortedWalsOfType() will skip these WALs causing the operations
+    // to fail.
+    // Therefore, in order to avoid that failure, it sets sequence_number to 1
+    // indicating those WALs should be included.
+    *sequence = 1;
+  } else {
+    // ReadRecord might have returned false on EOF, which means that the log
+    // file is empty. Or, a failure may have occurred while processing the first
+    // entry. In any case, return status and set sequence number to 0.
+    *sequence = 0;
+  }
+  return status;
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_manager.h b/src/rocksdb/db/wal_manager.h
new file mode 100644
index 000000000..8cc067935
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.h
@@ -0,0 +1,138 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+// WAL manager provides the abstraction for reading the WAL files as a single
+// unit. Internally, it opens and reads the files using Reader or Writer
+// abstraction.
+class WalManager {
+ public:
+  WalManager(const ImmutableDBOptions& db_options,
+             const FileOptions& file_options,
+             const std::shared_ptr<IOTracer>& io_tracer,
+             const bool seq_per_batch = false)
+      : db_options_(db_options),
+        file_options_(file_options),
+        env_(db_options.env),
+        fs_(db_options.fs, io_tracer),
+        purge_wal_files_last_run_(0),
+        seq_per_batch_(seq_per_batch),
+        wal_dir_(db_options_.GetWalDir()),
+        wal_in_db_path_(db_options_.IsWalDirSameAsDBPath()),
+        io_tracer_(io_tracer) {}
+
+  Status GetSortedWalFiles(VectorLogPtr& files);
+
+  // Allow user to tail transaction log to find all recent changes to the
+  // database that are newer than `seq_number`.
+  Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options,
+      VersionSet* version_set);
+
+  void PurgeObsoleteWALFiles();
+
+  void ArchiveWALFile(const std::string& fname, uint64_t number);
+
+  Status DeleteFile(const std::string& fname, uint64_t number);
+
+  Status GetLiveWalFile(uint64_t number, std::unique_ptr<LogFile>* log_file);
+
+  Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+                              SequenceNumber* sequence) {
+    return ReadFirstRecord(type, number, sequence);
+  }
+
+  Status TEST_ReadFirstLine(const std::string& fname, const uint64_t number,
+                            SequenceNumber* sequence) {
+    return ReadFirstLine(fname, number, sequence);
+  }
+
+ private:
+  Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files,
+                             WalFileType type);
+  // Requires: all_logs should be sorted with earliest log file first
+  // Retains all log files in all_logs which contain updates with seq no.
+  // Greater Than or Equal to the requested SequenceNumber.
+  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                const SequenceNumber target);
+
+  // ReadFirstRecord checks the read_first_record_cache_ to see if the entry
+  // exists or not. If not, it will read the WAL file.
+  // In case of wal_compression, WAL contains a `kSetCompressionType` record
+  // which is not associated with any sequence number. So the sequence_number is
+  // set to 1 if that WAL doesn't include any other record (basically empty) in
+  // order to include that WAL and is inserted in read_first_record_cache_.
+  // Therefore, sequence_number is used as boolean if WAL should be included or
+  // not and that sequence_number shouldn't be use for any other purpose.
+  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+                         SequenceNumber* sequence);
+
+  // In case of no wal_compression, ReadFirstLine returns status.ok() and
+  // sequence == 0 if the file exists, but is empty.
+  // In case of wal_compression, WAL contains
+  // `kSetCompressionType` record which is not associated with any sequence
+  // number if that WAL doesn't include any other record (basically empty). As
+  // result for an empty file, GetSortedWalsOfType() will skip these WALs
+  // causing the operations to fail. To avoid that, it sets sequence_number to
+  // 1 inorder to include that WAL.
+  Status ReadFirstLine(const std::string& fname, const uint64_t number,
+                       SequenceNumber* sequence);
+
+  // ------- state from DBImpl ------
+  const ImmutableDBOptions& db_options_;
+  const FileOptions file_options_;
+  Env* env_;
+  const FileSystemPtr fs_;
+
+  // ------- WalManager state -------
+  // cache for ReadFirstRecord() calls
+  std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+  port::Mutex read_first_record_cache_mutex_;
+
+  // last time when PurgeObsoleteWALFiles ran.
+  uint64_t purge_wal_files_last_run_;
+
+  bool seq_per_batch_;
+
+  const std::string& wal_dir_;
+
+  bool wal_in_db_path_;
+
+  // obsolete files will be deleted every this seconds if ttl deletion is
+  // enabled and archive size_limit is disabled.
+  static constexpr uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
+
+  std::shared_ptr<IOTracer> io_tracer_;
+};
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wal_manager_test.cc b/src/rocksdb/db/wal_manager_test.cc
new file mode 100644
index 000000000..4ad4e9749
--- /dev/null
+++ b/src/rocksdb/db/wal_manager_test.cc
@@ -0,0 +1,346 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/wal_manager.h"
+
+#include <map>
+#include <string>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) mock out VersionSet
+// TODO(icanadi) move other WalManager-specific tests from db_test here
+class WalManagerTest : public testing::Test {
+ public:
+  WalManagerTest()
+      : dbname_(test::PerThreadDBPath("wal_manager_test")),
+        db_options_(),
+        table_cache_(NewLRUCache(50000, 16)),
+        write_buffer_manager_(db_options_.db_write_buffer_size),
+        current_log_number_(0) {
+    env_.reset(MockEnv::Create(Env::Default()));
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  void Init() {
+    ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+    ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_)));
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    db_options_.wal_dir = dbname_;
+    db_options_.env = env_.get();
+    db_options_.fs = env_->GetFileSystem();
+    db_options_.clock = env_->GetSystemClock().get();
+
+    versions_.reset(
+        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+                       &write_buffer_manager_, &write_controller_,
+                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                       /*db_id*/ "", /*db_session_id*/ ""));
+
+    wal_manager_.reset(
+        new WalManager(db_options_, env_options_, nullptr /*IOTracer*/));
+  }
+
+  void Reopen() {
+    wal_manager_.reset(
+        new WalManager(db_options_, env_options_, nullptr /*IOTracer*/));
+  }
+
+  // NOT thread safe
+  void Put(const std::string& key, const std::string& value) {
+    assert(current_log_writer_.get() != nullptr);
+    uint64_t seq = versions_->LastSequence() + 1;
+    WriteBatch batch;
+    ASSERT_OK(batch.Put(key, value));
+    WriteBatchInternal::SetSequence(&batch, seq);
+    ASSERT_OK(
+        current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)));
+    versions_->SetLastAllocatedSequence(seq);
+    versions_->SetLastPublishedSequence(seq);
+    versions_->SetLastSequence(seq);
+  }
+
+  // NOT thread safe
+  void RollTheLog(bool /*archived*/) {
+    current_log_number_++;
+    std::string fname = ArchivedLogFileName(dbname_, current_log_number_);
+    const auto& fs = env_->GetFileSystem();
+    std::unique_ptr<WritableFileWriter> file_writer;
+    ASSERT_OK(WritableFileWriter::Create(fs, fname, env_options_, &file_writer,
+                                         nullptr));
+    current_log_writer_.reset(
+        new log::Writer(std::move(file_writer), 0, false));
+  }
+
+  void CreateArchiveLogs(int num_logs, int entries_per_log) {
+    for (int i = 1; i <= num_logs; ++i) {
+      RollTheLog(true);
+      for (int k = 0; k < entries_per_log; ++k) {
+        Put(std::to_string(k), std::string(1024, 'a'));
+      }
+    }
+  }
+
+  std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+      const SequenceNumber seq) {
+    std::unique_ptr<TransactionLogIterator> iter;
+    Status status = wal_manager_->GetUpdatesSince(
+        seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get());
+    EXPECT_OK(status);
+    return iter;
+  }
+
+  std::unique_ptr<MockEnv> env_;
+  std::string dbname_;
+  ImmutableDBOptions db_options_;
+  WriteController write_controller_;
+  EnvOptions env_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteBufferManager write_buffer_manager_;
+  std::unique_ptr<VersionSet> versions_;
+  std::unique_ptr<WalManager> wal_manager_;
+
+  std::unique_ptr<log::Writer> current_log_writer_;
+  uint64_t current_log_number_;
+};
+
+TEST_F(WalManagerTest, ReadFirstRecordCache) {
+  Init();
+  std::string path = dbname_ + "/000001.log";
+  std::unique_ptr<FSWritableFile> file;
+  ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file,
+                                                   nullptr));
+
+  SequenceNumber s;
+  ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s));
+  ASSERT_EQ(s, 0U);
+
+  ASSERT_OK(
+      wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s));
+  ASSERT_EQ(s, 0U);
+
+  std::unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(file), path, FileOptions()));
+  log::Writer writer(std::move(file_writer), 1,
+                     db_options_.recycle_log_file_num > 0);
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("foo", "bar"));
+  WriteBatchInternal::SetSequence(&batch, 10);
+  ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
+
+  // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here.
+  // Waiting for lei to finish with db_test
+  // env_->count_sequential_reads_ = true;
+  // sequential_read_counter_ sanity test
+  // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0);
+
+  ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 10U);
+  // did a read
+  // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+  // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+
+  ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+  ASSERT_EQ(s, 10U);
+  // no new reads since the value is cached
+  // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+  // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+}
+
+namespace {
+uint64_t GetLogDirSize(std::string dir_path, Env* env) {
+  uint64_t dir_size = 0;
+  std::vector<std::string> files;
+  EXPECT_OK(env->GetChildren(dir_path, &files));
+  for (auto& f : files) {
+    uint64_t number;
+    FileType type;
+    if (ParseFileName(f, &number, &type) && type == kWalFile) {
+      std::string const file_path = dir_path + "/" + f;
+      uint64_t file_size;
+      EXPECT_OK(env->GetFileSize(file_path, &file_size));
+      dir_size += file_size;
+    }
+  }
+  return dir_size;
+}
+std::vector<std::uint64_t> ListSpecificFiles(
+    Env* env, const std::string& path, const FileType expected_file_type) {
+  std::vector<std::string> files;
+  std::vector<uint64_t> file_numbers;
+  uint64_t number;
+  FileType type;
+  EXPECT_OK(env->GetChildren(path, &files));
+  for (size_t i = 0; i < files.size(); ++i) {
+    if (ParseFileName(files[i], &number, &type)) {
+      if (type == expected_file_type) {
+        file_numbers.push_back(number);
+      }
+    }
+  }
+  return file_numbers;
+}
+
+int CountRecords(TransactionLogIterator* iter) {
+  int count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    EXPECT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    EXPECT_OK(iter->status());
+    iter->Next();
+  }
+  EXPECT_OK(iter->status());
+  return count;
+}
+}  // anonymous namespace
+
+TEST_F(WalManagerTest, WALArchivalSizeLimit) {
+  db_options_.WAL_ttl_seconds = 0;
+  db_options_.WAL_size_limit_MB = 1000;
+  Init();
+
+  // TEST : Create WalManager with huge size limit and no ttl.
+  // Create some archived files and call PurgeObsoleteWALFiles().
+  // Count the archived log files that survived.
+  // Assert that all of them did.
+  // Change size limit. Re-open WalManager.
+  // Assert that archive is not greater than WAL_size_limit_MB after
+  // PurgeObsoleteWALFiles()
+  // Set ttl and time_to_check_ to small values. Re-open db.
+  // Assert that there are no archived logs left.
+
+  std::string archive_dir = ArchivalDirectory(dbname_);
+  CreateArchiveLogs(20, 5000);
+
+  std::vector<std::uint64_t> log_files =
+      ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+  ASSERT_EQ(log_files.size(), 20U);
+
+  db_options_.WAL_size_limit_MB = 8;
+  Reopen();
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  uint64_t archive_size = GetLogDirSize(archive_dir, env_.get());
+  ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024);
+
+  db_options_.WAL_ttl_seconds = 1;
+  env_->SleepForMicroseconds(2 * 1000 * 1000);
+  Reopen();
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+  ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, WALArchivalTtl) {
+  db_options_.WAL_ttl_seconds = 1000;
+  Init();
+
+  // TEST : Create WalManager with a ttl and no size limit.
+  // Create some archived log files and call PurgeObsoleteWALFiles().
+  // Assert that files are not deleted
+  // Reopen db with small ttl.
+  // Assert that all archived logs was removed.
+
+  std::string archive_dir = ArchivalDirectory(dbname_);
+  CreateArchiveLogs(20, 5000);
+
+  std::vector<uint64_t> log_files =
+      ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+  ASSERT_GT(log_files.size(), 0U);
+
+  db_options_.WAL_ttl_seconds = 1;
+  env_->SleepForMicroseconds(3 * 1000 * 1000);
+  Reopen();
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+  ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
+  Init();
+  RollTheLog(false);
+  Put("key1", std::string(1024, 'a'));
+  // Create a zero record WAL file.
+  RollTheLog(false);
+  RollTheLog(false);
+
+  Put("key2", std::string(1024, 'a'));
+
+  auto iter = OpenTransactionLogIter(0);
+  ASSERT_EQ(2, CountRecords(iter.get()));
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
+  Init();
+  RollTheLog(false);
+  auto iter = OpenTransactionLogIter(0);
+  // Check that an empty iterator is returned
+  ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
+  Init();
+  CreateArchiveLogs(2, 100);
+  auto iter = OpenTransactionLogIter(0);
+  CreateArchiveLogs(1, 100);
+  int i = 0;
+  for (; iter->Valid(); iter->Next()) {
+    i++;
+  }
+  ASSERT_EQ(i, 200);
+  // A new log file was added after the iterator was created.
+  // TryAgain indicates a new iterator is needed to fetch the new data
+  ASSERT_TRUE(iter->status().IsTryAgain());
+
+  iter = OpenTransactionLogIter(0);
+  i = 0;
+  for (; iter->Valid(); iter->Next()) {
+    i++;
+  }
+  ASSERT_EQ(i, 300);
+  ASSERT_TRUE(iter->status().ok());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/wide/db_wide_basic_test.cc b/src/rocksdb/db/wide/db_wide_basic_test.cc
new file mode 100644
index 000000000..1ffe314fe
--- /dev/null
+++ b/src/rocksdb/db/wide/db_wide_basic_test.cc
@@ -0,0 +1,654 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <array>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBWideBasicTest : public DBTestBase {
+ protected:
+  explicit DBWideBasicTest()
+      : DBTestBase("db_wide_basic_test", /* env_do_fsync */ false) {}
+};
+
+TEST_F(DBWideBasicTest, PutEntity) {
+  Options options = GetDefaultOptions();
+
+  // Write a couple of wide-column entities and a plain old key-value, then read
+  // them back.
+  constexpr char first_key[] = "first";
+  constexpr char first_value_of_default_column[] = "hello";
+  WideColumns first_columns{
+      {kDefaultWideColumnName, first_value_of_default_column},
+      {"attr_name1", "foo"},
+      {"attr_name2", "bar"}};
+
+  constexpr char second_key[] = "second";
+  WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+
+  constexpr char third_key[] = "third";
+  constexpr char third_value[] = "baz";
+
+  auto verify = [&]() {
+    const WideColumns expected_third_columns{
+        {kDefaultWideColumnName, third_value}};
+
+    {
+      PinnableSlice result;
+      ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key,
+                         &result));
+      ASSERT_EQ(result, first_value_of_default_column);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               first_key, &result));
+      ASSERT_EQ(result.columns(), first_columns);
+    }
+
+    {
+      PinnableSlice result;
+      ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key,
+                         &result));
+      ASSERT_TRUE(result.empty());
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               second_key, &result));
+      ASSERT_EQ(result.columns(), second_columns);
+    }
+
+    {
+      PinnableSlice result;
+      ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), third_key,
+                         &result));
+      ASSERT_EQ(result, third_value);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               third_key, &result));
+
+      ASSERT_EQ(result.columns(), expected_third_columns);
+    }
+
+    {
+      constexpr size_t num_keys = 3;
+
+      std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+      std::array<PinnableSlice, num_keys> values;
+      std::array<Status, num_keys> statuses;
+
+      db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+                    &keys[0], &values[0], &statuses[0]);
+
+      ASSERT_OK(statuses[0]);
+      ASSERT_EQ(values[0], first_value_of_default_column);
+
+      ASSERT_OK(statuses[1]);
+      ASSERT_TRUE(values[1].empty());
+
+      ASSERT_OK(statuses[2]);
+      ASSERT_EQ(values[2], third_value);
+    }
+
+    {
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+      iter->SeekToFirst();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), first_value_of_default_column);
+      ASSERT_EQ(iter->columns(), first_columns);
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_TRUE(iter->value().empty());
+      ASSERT_EQ(iter->columns(), second_columns);
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), third_key);
+      ASSERT_EQ(iter->value(), third_value);
+      ASSERT_EQ(iter->columns(), expected_third_columns);
+
+      iter->Next();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+
+      iter->SeekToLast();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), third_key);
+      ASSERT_EQ(iter->value(), third_value);
+      ASSERT_EQ(iter->columns(), expected_third_columns);
+
+      iter->Prev();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_TRUE(iter->value().empty());
+      ASSERT_EQ(iter->columns(), second_columns);
+
+      iter->Prev();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), first_value_of_default_column);
+      ASSERT_EQ(iter->columns(), first_columns);
+
+      iter->Prev();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+    }
+  };
+
+  // Use the DB::PutEntity API to write the first entity
+  ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+                           first_key, first_columns));
+
+  // Use WriteBatch to write the second entity
+  WriteBatch batch;
+  ASSERT_OK(
+      batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  // Use Put to write the plain key-value
+  ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), third_key,
+                     third_value));
+
+  // Try reading from memtable
+  verify();
+
+  // Try reading after recovery
+  Close();
+  options.avoid_flush_during_recovery = true;
+  Reopen(options);
+
+  verify();
+
+  // Try reading from storage
+  ASSERT_OK(Flush());
+
+  verify();
+}
+
+TEST_F(DBWideBasicTest, PutEntityColumnFamily) {
+  Options options = GetDefaultOptions();
+  CreateAndReopenWithCF({"corinthian"}, options);
+
+  // Use the DB::PutEntity API
+  constexpr char first_key[] = "first";
+  WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}};
+
+  ASSERT_OK(
+      db_->PutEntity(WriteOptions(), handles_[1], first_key, first_columns));
+
+  // Use WriteBatch
+  constexpr char second_key[] = "second";
+  WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+
+  WriteBatch batch;
+  ASSERT_OK(batch.PutEntity(handles_[1], second_key, second_columns));
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+}
+
+TEST_F(DBWideBasicTest, MergePlainKeyValue) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  Reopen(options);
+
+  // Put + Merge
+  constexpr char first_key[] = "first";
+  constexpr char first_base_value[] = "hello";
+  constexpr char first_merge_op[] = "world";
+
+  // Delete + Merge
+  constexpr char second_key[] = "second";
+  constexpr char second_merge_op[] = "foo";
+
+  // Merge without any preceding KV
+  constexpr char third_key[] = "third";
+  constexpr char third_merge_op[] = "bar";
+
+  auto write_base = [&]() {
+    // Write "base" KVs: a Put for the 1st key and a Delete for the 2nd one;
+    // note there is no "base" KV for the 3rd
+    ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+                       first_base_value));
+    ASSERT_OK(
+        db_->Delete(WriteOptions(), db_->DefaultColumnFamily(), second_key));
+  };
+
+  auto write_merge = [&]() {
+    // Write Merge operands
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+                         first_merge_op));
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
+                         second_merge_op));
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key,
+                         third_merge_op));
+  };
+
+  const std::string expected_first_column(std::string(first_base_value) + "," +
+                                          first_merge_op);
+  const WideColumns expected_first_columns{
+      {kDefaultWideColumnName, expected_first_column}};
+  const WideColumns expected_second_columns{
+      {kDefaultWideColumnName, second_merge_op}};
+  const WideColumns expected_third_columns{
+      {kDefaultWideColumnName, third_merge_op}};
+
+  auto verify = [&]() {
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               first_key, &result));
+      ASSERT_EQ(result.columns(), expected_first_columns);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               second_key, &result));
+      ASSERT_EQ(result.columns(), expected_second_columns);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               third_key, &result));
+
+      ASSERT_EQ(result.columns(), expected_third_columns);
+    }
+
+    {
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+      iter->SeekToFirst();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), expected_first_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_first_columns);
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_EQ(iter->value(), expected_second_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_second_columns);
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), third_key);
+      ASSERT_EQ(iter->value(), expected_third_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_third_columns);
+
+      iter->Next();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+
+      iter->SeekToLast();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), third_key);
+      ASSERT_EQ(iter->value(), expected_third_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_third_columns);
+
+      iter->Prev();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_EQ(iter->value(), expected_second_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_second_columns);
+
+      iter->Prev();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), expected_first_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_first_columns);
+
+      iter->Prev();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+    }
+  };
+
+  {
+    // Base KVs (if any) and Merge operands both in memtable (note: we take a
+    // snapshot in between to make sure they do not get reconciled during the
+    // subsequent flush)
+    write_base();
+    ManagedSnapshot snapshot(db_);
+    write_merge();
+    verify();
+
+    // Base KVs (if any) and Merge operands both in storage
+    ASSERT_OK(Flush());
+    verify();
+  }
+
+  // Base KVs (if any) in storage, Merge operands in memtable
+  DestroyAndReopen(options);
+  write_base();
+  ASSERT_OK(Flush());
+  write_merge();
+  verify();
+}
+
+TEST_F(DBWideBasicTest, MergeEntity) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+
+  const std::string delim("|");
+  options.merge_operator = MergeOperators::CreateStringAppendOperator(delim);
+
+  Reopen(options);
+
+  // Test Merge with two entities: one that has the default column and one that
+  // doesn't
+  constexpr char first_key[] = "first";
+  WideColumns first_columns{{kDefaultWideColumnName, "a"},
+                            {"attr_name1", "foo"},
+                            {"attr_name2", "bar"}};
+  constexpr char first_merge_operand[] = "bla1";
+
+  constexpr char second_key[] = "second";
+  WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+  constexpr char second_merge_operand[] = "bla2";
+
+  auto write_base = [&]() {
+    // Use the DB::PutEntity API
+    ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+                             first_key, first_columns));
+
+    // Use WriteBatch
+    WriteBatch batch;
+    ASSERT_OK(batch.PutEntity(db_->DefaultColumnFamily(), second_key,
+                              second_columns));
+    ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  };
+
+  auto write_merge = [&]() {
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+                         first_merge_operand));
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
+                         second_merge_operand));
+  };
+
+  const std::string first_expected_default(first_columns[0].value().ToString() +
+                                           delim + first_merge_operand);
+  const std::string second_expected_default(delim + second_merge_operand);
+
+  auto verify_basic = [&]() {
+    WideColumns first_expected_columns{
+        {kDefaultWideColumnName, first_expected_default},
+        first_columns[1],
+        first_columns[2]};
+
+    WideColumns second_expected_columns{
+        {kDefaultWideColumnName, second_expected_default},
+        second_columns[0],
+        second_columns[1]};
+
+    {
+      PinnableSlice result;
+      ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key,
+                         &result));
+      ASSERT_EQ(result, first_expected_default);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               first_key, &result));
+      ASSERT_EQ(result.columns(), first_expected_columns);
+    }
+
+    {
+      PinnableSlice result;
+      ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key,
+                         &result));
+      ASSERT_EQ(result, second_expected_default);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               second_key, &result));
+      ASSERT_EQ(result.columns(), second_expected_columns);
+    }
+
+    {
+      constexpr size_t num_keys = 2;
+
+      std::array<Slice, num_keys> keys{{first_key, second_key}};
+      std::array<PinnableSlice, num_keys> values;
+      std::array<Status, num_keys> statuses;
+
+      db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+                    &keys[0], &values[0], &statuses[0]);
+
+      ASSERT_EQ(values[0], first_expected_default);
+      ASSERT_OK(statuses[0]);
+
+      ASSERT_EQ(values[1], second_expected_default);
+      ASSERT_OK(statuses[1]);
+    }
+
+    {
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+      iter->SeekToFirst();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), first_expected_default);
+      ASSERT_EQ(iter->columns(), first_expected_columns);
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_EQ(iter->value(), second_expected_default);
+      ASSERT_EQ(iter->columns(), second_expected_columns);
+
+      iter->Next();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+
+      iter->SeekToLast();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_EQ(iter->value(), second_expected_default);
+      ASSERT_EQ(iter->columns(), second_expected_columns);
+
+      iter->Prev();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), first_expected_default);
+      ASSERT_EQ(iter->columns(), first_expected_columns);
+
+      iter->Prev();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+    }
+  };
+
+  auto verify_merge_ops_pre_compaction = [&]() {
+    constexpr size_t num_merge_operands = 2;
+
+    GetMergeOperandsOptions get_merge_opts;
+    get_merge_opts.expected_max_number_of_operands = num_merge_operands;
+
+    {
+      std::array<PinnableSlice, num_merge_operands> merge_operands;
+      int number_of_operands = 0;
+
+      ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                      first_key, &merge_operands[0],
+                                      &get_merge_opts, &number_of_operands));
+
+      ASSERT_EQ(number_of_operands, num_merge_operands);
+      ASSERT_EQ(merge_operands[0], first_columns[0].value());
+      ASSERT_EQ(merge_operands[1], first_merge_operand);
+    }
+
+    {
+      std::array<PinnableSlice, num_merge_operands> merge_operands;
+      int number_of_operands = 0;
+
+      ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                      second_key, &merge_operands[0],
+                                      &get_merge_opts, &number_of_operands));
+
+      ASSERT_EQ(number_of_operands, num_merge_operands);
+      ASSERT_TRUE(merge_operands[0].empty());
+      ASSERT_EQ(merge_operands[1], second_merge_operand);
+    }
+  };
+
+  auto verify_merge_ops_post_compaction = [&]() {
+    constexpr size_t num_merge_operands = 1;
+
+    GetMergeOperandsOptions get_merge_opts;
+    get_merge_opts.expected_max_number_of_operands = num_merge_operands;
+
+    {
+      std::array<PinnableSlice, num_merge_operands> merge_operands;
+      int number_of_operands = 0;
+
+      ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                      first_key, &merge_operands[0],
+                                      &get_merge_opts, &number_of_operands));
+
+      ASSERT_EQ(number_of_operands, num_merge_operands);
+      ASSERT_EQ(merge_operands[0], first_expected_default);
+    }
+
+    {
+      std::array<PinnableSlice, num_merge_operands> merge_operands;
+      int number_of_operands = 0;
+
+      ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                      second_key, &merge_operands[0],
+                                      &get_merge_opts, &number_of_operands));
+
+      ASSERT_EQ(number_of_operands, num_merge_operands);
+      ASSERT_EQ(merge_operands[0], second_expected_default);
+    }
+  };
+
+  {
+    // Base KVs and Merge operands both in memtable (note: we take a snapshot in
+    // between to make sure they do not get reconciled during the subsequent
+    // flush)
+    write_base();
+    ManagedSnapshot snapshot(db_);
+    write_merge();
+    verify_basic();
+    verify_merge_ops_pre_compaction();
+
+    // Base KVs and Merge operands both in storage
+    ASSERT_OK(Flush());
+    verify_basic();
+    verify_merge_ops_pre_compaction();
+  }
+
+  // Base KVs in storage, Merge operands in memtable
+  DestroyAndReopen(options);
+  write_base();
+  ASSERT_OK(Flush());
+  write_merge();
+  verify_basic();
+  verify_merge_ops_pre_compaction();
+
+  // Flush and compact
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr,
+                              /* end */ nullptr));
+  verify_basic();
+  verify_merge_ops_post_compaction();
+}
+
+TEST_F(DBWideBasicTest, PutEntityTimestampError) {
+  // Note: timestamps are currently not supported
+
+  Options options = GetDefaultOptions();
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+  ColumnFamilyHandle* handle = nullptr;
+  ASSERT_OK(db_->CreateColumnFamily(options, "corinthian", &handle));
+  std::unique_ptr<ColumnFamilyHandle> handle_guard(handle);
+
+  // Use the DB::PutEntity API
+  constexpr char first_key[] = "first";
+  WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}};
+
+  ASSERT_TRUE(db_->PutEntity(WriteOptions(), handle, first_key, first_columns)
+                  .IsInvalidArgument());
+
+  // Use WriteBatch
+  constexpr char second_key[] = "second";
+  WideColumns second_columns{{"doric", "column"}, {"ionic", "column"}};
+
+  WriteBatch batch;
+  ASSERT_TRUE(
+      batch.PutEntity(handle, second_key, second_columns).IsInvalidArgument());
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+}
+
+TEST_F(DBWideBasicTest, PutEntitySerializationError) {
+  // Make sure duplicate columns are caught
+
+  Options options = GetDefaultOptions();
+
+  // Use the DB::PutEntity API
+  constexpr char first_key[] = "first";
+  WideColumns first_columns{{"foo", "bar"}, {"foo", "baz"}};
+
+  ASSERT_TRUE(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+                             first_key, first_columns)
+                  .IsCorruption());
+
+  // Use WriteBatch
+  constexpr char second_key[] = "second";
+  WideColumns second_columns{{"column", "doric"}, {"column", "ionic"}};
+
+  WriteBatch batch;
+  ASSERT_TRUE(
+      batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns)
+          .IsCorruption());
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wide/wide_column_serialization.cc b/src/rocksdb/db/wide/wide_column_serialization.cc
new file mode 100644
index 000000000..f62143c40
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_column_serialization.cc
@@ -0,0 +1,182 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wide/wide_column_serialization.h"
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+
+#include "rocksdb/slice.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status WideColumnSerialization::SerializeImpl(const Slice* value_of_default,
+                                              const WideColumns& columns,
+                                              std::string& output) {
+  const size_t num_columns =
+      value_of_default ? columns.size() + 1 : columns.size();
+
+  if (num_columns > static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+    return Status::InvalidArgument("Too many wide columns");
+  }
+
+  PutVarint32(&output, kCurrentVersion);
+
+  PutVarint32(&output, static_cast<uint32_t>(num_columns));
+
+  const Slice* prev_name = nullptr;
+  if (value_of_default) {
+    if (value_of_default->size() >
+        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+      return Status::InvalidArgument("Wide column value too long");
+    }
+
+    PutLengthPrefixedSlice(&output, kDefaultWideColumnName);
+    PutVarint32(&output, static_cast<uint32_t>(value_of_default->size()));
+
+    prev_name = &kDefaultWideColumnName;
+  }
+
+  for (size_t i = 0; i < columns.size(); ++i) {
+    const WideColumn& column = columns[i];
+
+    const Slice& name = column.name();
+    if (name.size() >
+        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+      return Status::InvalidArgument("Wide column name too long");
+    }
+
+    if (prev_name && prev_name->compare(name) >= 0) {
+      return Status::Corruption("Wide columns out of order");
+    }
+
+    const Slice& value = column.value();
+    if (value.size() >
+        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+      return Status::InvalidArgument("Wide column value too long");
+    }
+
+    PutLengthPrefixedSlice(&output, name);
+    PutVarint32(&output, static_cast<uint32_t>(value.size()));
+
+    prev_name = &name;
+  }
+
+  if (value_of_default) {
+    output.append(value_of_default->data(), value_of_default->size());
+  }
+
+  for (const auto& column : columns) {
+    const Slice& value = column.value();
+
+    output.append(value.data(), value.size());
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::Deserialize(Slice& input,
+                                            WideColumns& columns) {
+  assert(columns.empty());
+
+  uint32_t version = 0;
+  if (!GetVarint32(&input, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  if (version > kCurrentVersion) {
+    return Status::NotSupported("Unsupported wide column version");
+  }
+
+  uint32_t num_columns = 0;
+  if (!GetVarint32(&input, &num_columns)) {
+    return Status::Corruption("Error decoding number of wide columns");
+  }
+
+  if (!num_columns) {
+    return Status::OK();
+  }
+
+  columns.reserve(num_columns);
+
+  autovector<uint32_t, 16> column_value_sizes;
+  column_value_sizes.reserve(num_columns);
+
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    Slice name;
+    if (!GetLengthPrefixedSlice(&input, &name)) {
+      return Status::Corruption("Error decoding wide column name");
+    }
+
+    if (!columns.empty() && columns.back().name().compare(name) >= 0) {
+      return Status::Corruption("Wide columns out of order");
+    }
+
+    columns.emplace_back(name, Slice());
+
+    uint32_t value_size = 0;
+    if (!GetVarint32(&input, &value_size)) {
+      return Status::Corruption("Error decoding wide column value size");
+    }
+
+    column_value_sizes.emplace_back(value_size);
+  }
+
+  const Slice data(input);
+  size_t pos = 0;
+
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    const uint32_t value_size = column_value_sizes[i];
+
+    if (pos + value_size > data.size()) {
+      return Status::Corruption("Error decoding wide column value payload");
+    }
+
+    columns[i].value() = Slice(data.data() + pos, value_size);
+
+    pos += value_size;
+  }
+
+  return Status::OK();
+}
+
+WideColumns::const_iterator WideColumnSerialization::Find(
+    const WideColumns& columns, const Slice& column_name) {
+  const auto it =
+      std::lower_bound(columns.cbegin(), columns.cend(), column_name,
+                       [](const WideColumn& lhs, const Slice& rhs) {
+                         return lhs.name().compare(rhs) < 0;
+                       });
+
+  if (it == columns.cend() || it->name() != column_name) {
+    return columns.cend();
+  }
+
+  return it;
+}
+
+Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input,
+                                                        Slice& value) {
+  WideColumns columns;
+
+  const Status s = Deserialize(input, columns);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (columns.empty() || columns[0].name() != kDefaultWideColumnName) {
+    value.clear();
+    return Status::OK();
+  }
+
+  value = columns[0].value();
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wide/wide_column_serialization.h b/src/rocksdb/db/wide/wide_column_serialization.h
new file mode 100644
index 000000000..f0ffbd392
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_column_serialization.h
@@ -0,0 +1,77 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// Wide-column serialization/deserialization primitives.
+//
+// The two main parts of the layout are 1) a sorted index containing the column
+// names and column value sizes and 2) the column values themselves. Keeping the
+// index and the values separate will enable selectively reading column values
+// down the line. Note that currently the index has to be fully parsed in order
+// to find out the offset of each column value.
+//
+// Legend: cn = column name, cv = column value, cns = column name size, cvs =
+// column value size.
+//
+//      +----------+--------------+----------+-------+----------+---...
+//      | version  | # of columns |  cns 1   | cn 1  |  cvs 1   |
+//      +----------+--------------+------------------+--------- +---...
+//      | varint32 |   varint32   | varint32 | bytes | varint32 |
+//      +----------+--------------+----------+-------+----------+---...
+//
+//      ... continued ...
+//
+//          ...---+----------+-------+----------+-------+---...---+-------+
+//                |  cns N   | cn N  |  cvs N   | cv 1  |         | cv N  |
+//          ...---+----------+-------+----------+-------+---...---+-------+
+//                | varint32 | bytes | varint32 | bytes |         | bytes |
+//          ...---+----------+-------+----------+-------+---...---+-------+
+
+class WideColumnSerialization {
+ public:
+  static Status Serialize(const WideColumns& columns, std::string& output);
+  static Status Serialize(const Slice& value_of_default,
+                          const WideColumns& other_columns,
+                          std::string& output);
+
+  static Status Deserialize(Slice& input, WideColumns& columns);
+
+  static WideColumns::const_iterator Find(const WideColumns& columns,
+                                          const Slice& column_name);
+  static Status GetValueOfDefaultColumn(Slice& input, Slice& value);
+
+  static constexpr uint32_t kCurrentVersion = 1;
+
+ private:
+  static Status SerializeImpl(const Slice* value_of_default,
+                              const WideColumns& columns, std::string& output);
+};
+
+inline Status WideColumnSerialization::Serialize(const WideColumns& columns,
+                                                 std::string& output) {
+  constexpr Slice* value_of_default = nullptr;
+
+  return SerializeImpl(value_of_default, columns, output);
+}
+
+inline Status WideColumnSerialization::Serialize(
+    const Slice& value_of_default, const WideColumns& other_columns,
+    std::string& output) {
+  return SerializeImpl(&value_of_default, other_columns, output);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wide/wide_column_serialization_test.cc b/src/rocksdb/db/wide/wide_column_serialization_test.cc
new file mode 100644
index 000000000..8060d2f24
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_column_serialization_test.cc
@@ -0,0 +1,338 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wide/wide_column_serialization.h"
+
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(WideColumnSerializationTest, Construct) {
+  constexpr char foo[] = "foo";
+  constexpr char bar[] = "bar";
+
+  const std::string foo_str(foo);
+  const std::string bar_str(bar);
+
+  const Slice foo_slice(foo_str);
+  const Slice bar_slice(bar_str);
+
+  {
+    WideColumn column(foo, bar);
+    ASSERT_EQ(column.name(), foo);
+    ASSERT_EQ(column.value(), bar);
+  }
+
+  {
+    WideColumn column(foo_str, bar);
+    ASSERT_EQ(column.name(), foo_str);
+    ASSERT_EQ(column.value(), bar);
+  }
+
+  {
+    WideColumn column(foo_slice, bar);
+    ASSERT_EQ(column.name(), foo_slice);
+    ASSERT_EQ(column.value(), bar);
+  }
+
+  {
+    WideColumn column(foo, bar_str);
+    ASSERT_EQ(column.name(), foo);
+    ASSERT_EQ(column.value(), bar_str);
+  }
+
+  {
+    WideColumn column(foo_str, bar_str);
+    ASSERT_EQ(column.name(), foo_str);
+    ASSERT_EQ(column.value(), bar_str);
+  }
+
+  {
+    WideColumn column(foo_slice, bar_str);
+    ASSERT_EQ(column.name(), foo_slice);
+    ASSERT_EQ(column.value(), bar_str);
+  }
+
+  {
+    WideColumn column(foo, bar_slice);
+    ASSERT_EQ(column.name(), foo);
+    ASSERT_EQ(column.value(), bar_slice);
+  }
+
+  {
+    WideColumn column(foo_str, bar_slice);
+    ASSERT_EQ(column.name(), foo_str);
+    ASSERT_EQ(column.value(), bar_slice);
+  }
+
+  {
+    WideColumn column(foo_slice, bar_slice);
+    ASSERT_EQ(column.name(), foo_slice);
+    ASSERT_EQ(column.value(), bar_slice);
+  }
+
+  {
+    constexpr char foo_name[] = "foo_name";
+    constexpr char bar_value[] = "bar_value";
+
+    WideColumn column(std::piecewise_construct,
+                      std::forward_as_tuple(foo_name, sizeof(foo) - 1),
+                      std::forward_as_tuple(bar_value, sizeof(bar) - 1));
+    ASSERT_EQ(column.name(), foo);
+    ASSERT_EQ(column.value(), bar);
+  }
+}
+
+TEST(WideColumnSerializationTest, SerializeDeserialize) {
+  WideColumns columns{{"foo", "bar"}, {"hello", "world"}};
+  std::string output;
+
+  ASSERT_OK(WideColumnSerialization::Serialize(columns, output));
+
+  Slice input(output);
+  WideColumns deserialized_columns;
+
+  ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized_columns));
+  ASSERT_EQ(columns, deserialized_columns);
+
+  {
+    const auto it = WideColumnSerialization::Find(deserialized_columns, "foo");
+    ASSERT_NE(it, deserialized_columns.cend());
+    ASSERT_EQ(*it, deserialized_columns.front());
+  }
+
+  {
+    const auto it =
+        WideColumnSerialization::Find(deserialized_columns, "hello");
+    ASSERT_NE(it, deserialized_columns.cend());
+    ASSERT_EQ(*it, deserialized_columns.back());
+  }
+
+  {
+    const auto it =
+        WideColumnSerialization::Find(deserialized_columns, "fubar");
+    ASSERT_EQ(it, deserialized_columns.cend());
+  }
+
+  {
+    const auto it =
+        WideColumnSerialization::Find(deserialized_columns, "snafu");
+    ASSERT_EQ(it, deserialized_columns.cend());
+  }
+}
+
+TEST(WideColumnSerializationTest, SerializeWithPrepend) {
+  Slice value_of_default("baz");
+  WideColumns other_columns{{"foo", "bar"}, {"hello", "world"}};
+
+  std::string output;
+  ASSERT_OK(WideColumnSerialization::Serialize(value_of_default, other_columns,
+                                               output));
+
+  Slice input(output);
+
+  WideColumns deserialized_columns;
+  ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized_columns));
+
+  WideColumns expected_columns{{kDefaultWideColumnName, value_of_default},
+                               other_columns[0],
+                               other_columns[1]};
+  ASSERT_EQ(deserialized_columns, expected_columns);
+}
+
+TEST(WideColumnSerializationTest, SerializeDuplicateError) {
+  WideColumns columns{{"foo", "bar"}, {"foo", "baz"}};
+  std::string output;
+
+  ASSERT_TRUE(
+      WideColumnSerialization::Serialize(columns, output).IsCorruption());
+}
+
+TEST(WideColumnSerializationTest, SerializeWithPrependDuplicateError) {
+  Slice value_of_default("baz");
+  WideColumns other_columns{{kDefaultWideColumnName, "dup"}, {"foo", "bar"}};
+
+  std::string output;
+  ASSERT_TRUE(WideColumnSerialization::Serialize(value_of_default,
+                                                 other_columns, output)
+                  .IsCorruption());
+}
+
+TEST(WideColumnSerializationTest, SerializeOutOfOrderError) {
+  WideColumns columns{{"hello", "world"}, {"foo", "bar"}};
+  std::string output;
+
+  ASSERT_TRUE(
+      WideColumnSerialization::Serialize(columns, output).IsCorruption());
+}
+
+TEST(WideColumnSerializationTest, DeserializeVersionError) {
+  // Can't decode version
+
+  std::string buf;
+
+  Slice input(buf);
+  WideColumns columns;
+
+  const Status s = WideColumnSerialization::Deserialize(input, columns);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "version"));
+}
+
+TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) {
+  // Unsupported version
+  constexpr uint32_t future_version = 1000;
+
+  std::string buf;
+  PutVarint32(&buf, future_version);
+
+  Slice input(buf);
+  WideColumns columns;
+
+  const Status s = WideColumnSerialization::Deserialize(input, columns);
+  ASSERT_TRUE(s.IsNotSupported());
+  ASSERT_TRUE(std::strstr(s.getState(), "version"));
+}
+
+TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) {
+  // Can't decode number of columns
+
+  std::string buf;
+  PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+
+  Slice input(buf);
+  WideColumns columns;
+
+  const Status s = WideColumnSerialization::Deserialize(input, columns);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "number"));
+}
+
+TEST(WideColumnSerializationTest, DeserializeColumnsError) {
+  std::string buf;
+
+  PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+
+  constexpr uint32_t num_columns = 2;
+  PutVarint32(&buf, num_columns);
+
+  // Can't decode the first column name
+  {
+    Slice input(buf);
+    WideColumns columns;
+
+    const Status s = WideColumnSerialization::Deserialize(input, columns);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "name"));
+  }
+
+  constexpr char first_column_name[] = "foo";
+  PutLengthPrefixedSlice(&buf, first_column_name);
+
+  // Can't decode the size of the first column value
+  {
+    Slice input(buf);
+    WideColumns columns;
+
+    const Status s = WideColumnSerialization::Deserialize(input, columns);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "value size"));
+  }
+
+  constexpr uint32_t first_value_size = 16;
+  PutVarint32(&buf, first_value_size);
+
+  // Can't decode the second column name
+  {
+    Slice input(buf);
+    WideColumns columns;
+
+    const Status s = WideColumnSerialization::Deserialize(input, columns);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "name"));
+  }
+
+  constexpr char second_column_name[] = "hello";
+  PutLengthPrefixedSlice(&buf, second_column_name);
+
+  // Can't decode the size of the second column value
+  {
+    Slice input(buf);
+    WideColumns columns;
+
+    const Status s = WideColumnSerialization::Deserialize(input, columns);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "value size"));
+  }
+
+  constexpr uint32_t second_value_size = 64;
+  PutVarint32(&buf, second_value_size);
+
+  // Can't decode the payload of the first column
+  {
+    Slice input(buf);
+    WideColumns columns;
+
+    const Status s = WideColumnSerialization::Deserialize(input, columns);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "payload"));
+  }
+
+  buf.append(first_value_size, '0');
+
+  // Can't decode the payload of the second column
+  {
+    Slice input(buf);
+    WideColumns columns;
+
+    const Status s = WideColumnSerialization::Deserialize(input, columns);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "payload"));
+  }
+
+  buf.append(second_value_size, 'x');
+
+  // Success
+  {
+    Slice input(buf);
+    WideColumns columns;
+
+    ASSERT_OK(WideColumnSerialization::Deserialize(input, columns));
+  }
+}
+
+TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) {
+  std::string buf;
+
+  PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+
+  constexpr uint32_t num_columns = 2;
+  PutVarint32(&buf, num_columns);
+
+  constexpr char first_column_name[] = "b";
+  PutLengthPrefixedSlice(&buf, first_column_name);
+
+  constexpr uint32_t first_value_size = 16;
+  PutVarint32(&buf, first_value_size);
+
+  constexpr char second_column_name[] = "a";
+  PutLengthPrefixedSlice(&buf, second_column_name);
+
+  Slice input(buf);
+  WideColumns columns;
+
+  const Status s = WideColumnSerialization::Deserialize(input, columns);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(std::strstr(s.getState(), "order"));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wide/wide_columns.cc b/src/rocksdb/db/wide/wide_columns.cc
new file mode 100644
index 000000000..186be7f85
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_columns.cc
@@ -0,0 +1,22 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/wide_columns.h"
+
+#include "db/wide/wide_column_serialization.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const Slice kDefaultWideColumnName;
+
+const WideColumns kNoWideColumns;
+
+Status PinnableWideColumns::CreateIndexForWideColumns() {
+  Slice value_copy = value_;
+
+  return WideColumnSerialization::Deserialize(value_copy, columns_);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch.cc b/src/rocksdb/db/write_batch.cc
new file mode 100644
index 000000000..796697cfc
--- /dev/null
+++ b/src/rocksdb/db/write_batch.cc
@@ -0,0 +1,3137 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+//    sequence: fixed64
+//    count: fixed32
+//    data: record[count]
+// record :=
+//    kTypeValue varstring varstring
+//    kTypeDeletion varstring
+//    kTypeSingleDeletion varstring
+//    kTypeRangeDeletion varstring varstring
+//    kTypeMerge varstring varstring
+//    kTypeColumnFamilyValue varint32 varstring varstring
+//    kTypeColumnFamilyDeletion varint32 varstring
+//    kTypeColumnFamilySingleDeletion varint32 varstring
+//    kTypeColumnFamilyRangeDeletion varint32 varstring varstring
+//    kTypeColumnFamilyMerge varint32 varstring varstring
+//    kTypeBeginPrepareXID
+//    kTypeEndPrepareXID varstring
+//    kTypeCommitXID varstring
+//    kTypeCommitXIDAndTimestamp varstring varstring
+//    kTypeRollbackXID varstring
+//    kTypeBeginPersistedPrepareXID
+//    kTypeBeginUnprepareXID
+//    kTypeWideColumnEntity varstring varstring
+//    kTypeColumnFamilyWideColumnEntity varint32 varstring varstring
+//    kTypeNoop
+// varstring :=
+//    len: varint32
+//    data: uint8[len]
+
+#include "rocksdb/write_batch.h"
+
+#include <algorithm>
+#include <limits>
+#include <map>
+#include <stack>
+#include <stdexcept>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/kv_checksum.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/wide/wide_column_serialization.h"
+#include "db/write_batch_internal.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/duplicate_detector.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// anon namespace for file-local types
+namespace {
+
+enum ContentFlags : uint32_t {
+  DEFERRED = 1 << 0,
+  HAS_PUT = 1 << 1,
+  HAS_DELETE = 1 << 2,
+  HAS_SINGLE_DELETE = 1 << 3,
+  HAS_MERGE = 1 << 4,
+  HAS_BEGIN_PREPARE = 1 << 5,
+  HAS_END_PREPARE = 1 << 6,
+  HAS_COMMIT = 1 << 7,
+  HAS_ROLLBACK = 1 << 8,
+  HAS_DELETE_RANGE = 1 << 9,
+  HAS_BLOB_INDEX = 1 << 10,
+  HAS_BEGIN_UNPREPARE = 1 << 11,
+  HAS_PUT_ENTITY = 1 << 12,
+};
+
+struct BatchContentClassifier : public WriteBatch::Handler {
+  uint32_t content_flags = 0;
+
+  Status PutCF(uint32_t, const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_PUT;
+    return Status::OK();
+  }
+
+  Status PutEntityCF(uint32_t /* column_family_id */, const Slice& /* key */,
+                     const Slice& /* entity */) override {
+    content_flags |= ContentFlags::HAS_PUT_ENTITY;
+    return Status::OK();
+  }
+
+  Status DeleteCF(uint32_t, const Slice&) override {
+    content_flags |= ContentFlags::HAS_DELETE;
+    return Status::OK();
+  }
+
+  Status SingleDeleteCF(uint32_t, const Slice&) override {
+    content_flags |= ContentFlags::HAS_SINGLE_DELETE;
+    return Status::OK();
+  }
+
+  Status DeleteRangeCF(uint32_t, const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_DELETE_RANGE;
+    return Status::OK();
+  }
+
+  Status MergeCF(uint32_t, const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_MERGE;
+    return Status::OK();
+  }
+
+  Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_BLOB_INDEX;
+    return Status::OK();
+  }
+
+  Status MarkBeginPrepare(bool unprepare) override {
+    content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
+    if (unprepare) {
+      content_flags |= ContentFlags::HAS_BEGIN_UNPREPARE;
+    }
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice&) override {
+    content_flags |= ContentFlags::HAS_END_PREPARE;
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice&) override {
+    content_flags |= ContentFlags::HAS_COMMIT;
+    return Status::OK();
+  }
+
+  Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+    content_flags |= ContentFlags::HAS_COMMIT;
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override {
+    content_flags |= ContentFlags::HAS_ROLLBACK;
+    return Status::OK();
+  }
+};
+
+}  // anonymous namespace
+
+struct SavePoints {
+  std::stack<SavePoint, autovector<SavePoint>> stack;
+};
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes,
+                       size_t protection_bytes_per_key, size_t default_cf_ts_sz)
+    : content_flags_(0),
+      max_bytes_(max_bytes),
+      default_cf_ts_sz_(default_cf_ts_sz),
+      rep_() {
+  // Currently `protection_bytes_per_key` can only be enabled at 8 bytes per
+  // entry.
+  assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8);
+  if (protection_bytes_per_key != 0) {
+    prot_info_.reset(new WriteBatch::ProtectionInfo());
+  }
+  rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+                   ? reserved_bytes
+                   : WriteBatchInternal::kHeader);
+  rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(const std::string& rep)
+    : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), rep_(rep) {}
+
+WriteBatch::WriteBatch(std::string&& rep)
+    : content_flags_(ContentFlags::DEFERRED),
+      max_bytes_(0),
+      rep_(std::move(rep)) {}
+
+WriteBatch::WriteBatch(const WriteBatch& src)
+    : wal_term_point_(src.wal_term_point_),
+      content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+      max_bytes_(src.max_bytes_),
+      default_cf_ts_sz_(src.default_cf_ts_sz_),
+      rep_(src.rep_) {
+  if (src.save_points_ != nullptr) {
+    save_points_.reset(new SavePoints());
+    save_points_->stack = src.save_points_->stack;
+  }
+  if (src.prot_info_ != nullptr) {
+    prot_info_.reset(new WriteBatch::ProtectionInfo());
+    prot_info_->entries_ = src.prot_info_->entries_;
+  }
+}
+
+WriteBatch::WriteBatch(WriteBatch&& src) noexcept
+    : save_points_(std::move(src.save_points_)),
+      wal_term_point_(std::move(src.wal_term_point_)),
+      content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+      max_bytes_(src.max_bytes_),
+      prot_info_(std::move(src.prot_info_)),
+      default_cf_ts_sz_(src.default_cf_ts_sz_),
+      rep_(std::move(src.rep_)) {}
+
+WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
+  if (&src != this) {
+    this->~WriteBatch();
+    new (this) WriteBatch(src);
+  }
+  return *this;
+}
+
+WriteBatch& WriteBatch::operator=(WriteBatch&& src) {
+  if (&src != this) {
+    this->~WriteBatch();
+    new (this) WriteBatch(std::move(src));
+  }
+  return *this;
+}
+
+WriteBatch::~WriteBatch() {}
+
+WriteBatch::Handler::~Handler() {}
+
+void WriteBatch::Handler::LogData(const Slice& /*blob*/) {
+  // If the user has not specified something to do with blobs, then we ignore
+  // them.
+}
+
+bool WriteBatch::Handler::Continue() { return true; }
+
+void WriteBatch::Clear() {
+  rep_.clear();
+  rep_.resize(WriteBatchInternal::kHeader);
+
+  content_flags_.store(0, std::memory_order_relaxed);
+
+  if (save_points_ != nullptr) {
+    while (!save_points_->stack.empty()) {
+      save_points_->stack.pop();
+    }
+  }
+
+  if (prot_info_ != nullptr) {
+    prot_info_->entries_.clear();
+  }
+  wal_term_point_.clear();
+  default_cf_ts_sz_ = 0;
+}
+
+uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); }
+
+uint32_t WriteBatch::ComputeContentFlags() const {
+  auto rv = content_flags_.load(std::memory_order_relaxed);
+  if ((rv & ContentFlags::DEFERRED) != 0) {
+    BatchContentClassifier classifier;
+    // Should we handle status here?
+    Iterate(&classifier).PermitUncheckedError();
+    rv = classifier.content_flags;
+
+    // this method is conceptually const, because it is performing a lazy
+    // computation that doesn't affect the abstract state of the batch.
+    // content_flags_ is marked mutable so that we can perform the
+    // following assignment
+    content_flags_.store(rv, std::memory_order_relaxed);
+  }
+  return rv;
+}
+
+void WriteBatch::MarkWalTerminationPoint() {
+  wal_term_point_.size = GetDataSize();
+  wal_term_point_.count = Count();
+  wal_term_point_.content_flags = content_flags_;
+}
+
+size_t WriteBatch::GetProtectionBytesPerKey() const {
+  if (prot_info_ != nullptr) {
+    return prot_info_->GetBytesPerKey();
+  }
+  return 0;
+}
+
+bool WriteBatch::HasPut() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0;
+}
+
+bool WriteBatch::HasPutEntity() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_PUT_ENTITY) != 0;
+}
+
+bool WriteBatch::HasDelete() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0;
+}
+
+bool WriteBatch::HasSingleDelete() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0;
+}
+
+bool WriteBatch::HasDeleteRange() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_DELETE_RANGE) != 0;
+}
+
+bool WriteBatch::HasMerge() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0;
+}
+
+bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) {
+  assert(input != nullptr && key != nullptr);
+  // Skip tag byte
+  input->remove_prefix(1);
+
+  if (cf_record) {
+    // Skip column_family bytes
+    uint32_t cf;
+    if (!GetVarint32(input, &cf)) {
+      return false;
+    }
+  }
+
+  // Extract key
+  return GetLengthPrefixedSlice(input, key);
+}
+
+bool WriteBatch::HasBeginPrepare() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0;
+}
+
+bool WriteBatch::HasEndPrepare() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_END_PREPARE) != 0;
+}
+
+bool WriteBatch::HasCommit() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0;
+}
+
+bool WriteBatch::HasRollback() const {
+  return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0;
+}
+
+Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+                                uint32_t* column_family, Slice* key,
+                                Slice* value, Slice* blob, Slice* xid) {
+  assert(key != nullptr && value != nullptr);
+  *tag = (*input)[0];
+  input->remove_prefix(1);
+  *column_family = 0;  // default
+  switch (*tag) {
+    case kTypeColumnFamilyValue:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch Put");
+      }
+      FALLTHROUGH_INTENDED;
+    case kTypeValue:
+      if (!GetLengthPrefixedSlice(input, key) ||
+          !GetLengthPrefixedSlice(input, value)) {
+        return Status::Corruption("bad WriteBatch Put");
+      }
+      break;
+    case kTypeColumnFamilyDeletion:
+    case kTypeColumnFamilySingleDeletion:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch Delete");
+      }
+      FALLTHROUGH_INTENDED;
+    case kTypeDeletion:
+    case kTypeSingleDeletion:
+      if (!GetLengthPrefixedSlice(input, key)) {
+        return Status::Corruption("bad WriteBatch Delete");
+      }
+      break;
+    case kTypeColumnFamilyRangeDeletion:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch DeleteRange");
+      }
+      FALLTHROUGH_INTENDED;
+    case kTypeRangeDeletion:
+      // for range delete, "key" is begin_key, "value" is end_key
+      if (!GetLengthPrefixedSlice(input, key) ||
+          !GetLengthPrefixedSlice(input, value)) {
+        return Status::Corruption("bad WriteBatch DeleteRange");
+      }
+      break;
+    case kTypeColumnFamilyMerge:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch Merge");
+      }
+      FALLTHROUGH_INTENDED;
+    case kTypeMerge:
+      if (!GetLengthPrefixedSlice(input, key) ||
+          !GetLengthPrefixedSlice(input, value)) {
+        return Status::Corruption("bad WriteBatch Merge");
+      }
+      break;
+    case kTypeColumnFamilyBlobIndex:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch BlobIndex");
+      }
+      FALLTHROUGH_INTENDED;
+    case kTypeBlobIndex:
+      if (!GetLengthPrefixedSlice(input, key) ||
+          !GetLengthPrefixedSlice(input, value)) {
+        return Status::Corruption("bad WriteBatch BlobIndex");
+      }
+      break;
+    case kTypeLogData:
+      assert(blob != nullptr);
+      if (!GetLengthPrefixedSlice(input, blob)) {
+        return Status::Corruption("bad WriteBatch Blob");
+      }
+      break;
+    case kTypeNoop:
+    case kTypeBeginPrepareXID:
+      // This indicates that the prepared batch is also persisted in the db.
+      // This is used in WritePreparedTxn
+    case kTypeBeginPersistedPrepareXID:
+      // This is used in WriteUnpreparedTxn
+    case kTypeBeginUnprepareXID:
+      break;
+    case kTypeEndPrepareXID:
+      if (!GetLengthPrefixedSlice(input, xid)) {
+        return Status::Corruption("bad EndPrepare XID");
+      }
+      break;
+    case kTypeCommitXIDAndTimestamp:
+      if (!GetLengthPrefixedSlice(input, key)) {
+        return Status::Corruption("bad commit timestamp");
+      }
+      FALLTHROUGH_INTENDED;
+    case kTypeCommitXID:
+      if (!GetLengthPrefixedSlice(input, xid)) {
+        return Status::Corruption("bad Commit XID");
+      }
+      break;
+    case kTypeRollbackXID:
+      if (!GetLengthPrefixedSlice(input, xid)) {
+        return Status::Corruption("bad Rollback XID");
+      }
+      break;
+    case kTypeColumnFamilyWideColumnEntity:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch PutEntity");
+      }
+      FALLTHROUGH_INTENDED;
+    case kTypeWideColumnEntity:
+      if (!GetLengthPrefixedSlice(input, key) ||
+          !GetLengthPrefixedSlice(input, value)) {
+        return Status::Corruption("bad WriteBatch PutEntity");
+      }
+      break;
+    default:
+      return Status::Corruption("unknown WriteBatch tag");
+  }
+  return Status::OK();
+}
+
+Status WriteBatch::Iterate(Handler* handler) const {
+  if (rep_.size() < WriteBatchInternal::kHeader) {
+    return Status::Corruption("malformed WriteBatch (too small)");
+  }
+
+  return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader,
+                                     rep_.size());
+}
+
+Status WriteBatchInternal::Iterate(const WriteBatch* wb,
+                                   WriteBatch::Handler* handler, size_t begin,
+                                   size_t end) {
+  if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) {
+    return Status::Corruption("Invalid start/end bounds for Iterate");
+  }
+  assert(begin <= end);
+  Slice input(wb->rep_.data() + begin, static_cast<size_t>(end - begin));
+  bool whole_batch =
+      (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size());
+
+  Slice key, value, blob, xid;
+
+  // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as
+  // the batch boundary symbols otherwise we would mis-count the number of
+  // batches. We do that by checking whether the accumulated batch is empty
+  // before seeing the next Noop.
+  bool empty_batch = true;
+  uint32_t found = 0;
+  Status s;
+  char tag = 0;
+  uint32_t column_family = 0;  // default
+  bool last_was_try_again = false;
+  bool handler_continue = true;
+  while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) {
+    handler_continue = handler->Continue();
+    if (!handler_continue) {
+      break;
+    }
+
+    if (LIKELY(!s.IsTryAgain())) {
+      last_was_try_again = false;
+      tag = 0;
+      column_family = 0;  // default
+
+      s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+                                   &blob, &xid);
+      if (!s.ok()) {
+        return s;
+      }
+    } else {
+      assert(s.IsTryAgain());
+      assert(!last_was_try_again);  // to detect infinite loop bugs
+      if (UNLIKELY(last_was_try_again)) {
+        return Status::Corruption(
+            "two consecutive TryAgain in WriteBatch handler; this is either a "
+            "software bug or data corruption.");
+      }
+      last_was_try_again = true;
+      s = Status::OK();
+    }
+
+    switch (tag) {
+      case kTypeColumnFamilyValue:
+      case kTypeValue:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_PUT));
+        s = handler->PutCF(column_family, key, value);
+        if (LIKELY(s.ok())) {
+          empty_batch = false;
+          found++;
+        }
+        break;
+      case kTypeColumnFamilyDeletion:
+      case kTypeDeletion:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE));
+        s = handler->DeleteCF(column_family, key);
+        if (LIKELY(s.ok())) {
+          empty_batch = false;
+          found++;
+        }
+        break;
+      case kTypeColumnFamilySingleDeletion:
+      case kTypeSingleDeletion:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE));
+        s = handler->SingleDeleteCF(column_family, key);
+        if (LIKELY(s.ok())) {
+          empty_batch = false;
+          found++;
+        }
+        break;
+      case kTypeColumnFamilyRangeDeletion:
+      case kTypeRangeDeletion:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE));
+        s = handler->DeleteRangeCF(column_family, key, value);
+        if (LIKELY(s.ok())) {
+          empty_batch = false;
+          found++;
+        }
+        break;
+      case kTypeColumnFamilyMerge:
+      case kTypeMerge:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE));
+        s = handler->MergeCF(column_family, key, value);
+        if (LIKELY(s.ok())) {
+          empty_batch = false;
+          found++;
+        }
+        break;
+      case kTypeColumnFamilyBlobIndex:
+      case kTypeBlobIndex:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX));
+        s = handler->PutBlobIndexCF(column_family, key, value);
+        if (LIKELY(s.ok())) {
+          found++;
+        }
+        break;
+      case kTypeLogData:
+        handler->LogData(blob);
+        // A batch might have nothing but LogData. It is still a batch.
+        empty_batch = false;
+        break;
+      case kTypeBeginPrepareXID:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+        s = handler->MarkBeginPrepare();
+        assert(s.ok());
+        empty_batch = false;
+        if (handler->WriteAfterCommit() ==
+            WriteBatch::Handler::OptionState::kDisabled) {
+          s = Status::NotSupported(
+              "WriteCommitted txn tag when write_after_commit_ is disabled (in "
+              "WritePrepared/WriteUnprepared mode). If it is not due to "
+              "corruption, the WAL must be emptied before changing the "
+              "WritePolicy.");
+        }
+        if (handler->WriteBeforePrepare() ==
+            WriteBatch::Handler::OptionState::kEnabled) {
+          s = Status::NotSupported(
+              "WriteCommitted txn tag when write_before_prepare_ is enabled "
+              "(in WriteUnprepared mode). If it is not due to corruption, the "
+              "WAL must be emptied before changing the WritePolicy.");
+        }
+        break;
+      case kTypeBeginPersistedPrepareXID:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+        s = handler->MarkBeginPrepare();
+        assert(s.ok());
+        empty_batch = false;
+        if (handler->WriteAfterCommit() ==
+            WriteBatch::Handler::OptionState::kEnabled) {
+          s = Status::NotSupported(
+              "WritePrepared/WriteUnprepared txn tag when write_after_commit_ "
+              "is enabled (in default WriteCommitted mode). If it is not due "
+              "to corruption, the WAL must be emptied before changing the "
+              "WritePolicy.");
+        }
+        break;
+      case kTypeBeginUnprepareXID:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE));
+        s = handler->MarkBeginPrepare(true /* unprepared */);
+        assert(s.ok());
+        empty_batch = false;
+        if (handler->WriteAfterCommit() ==
+            WriteBatch::Handler::OptionState::kEnabled) {
+          s = Status::NotSupported(
+              "WriteUnprepared txn tag when write_after_commit_ is enabled (in "
+              "default WriteCommitted mode). If it is not due to corruption, "
+              "the WAL must be emptied before changing the WritePolicy.");
+        }
+        if (handler->WriteBeforePrepare() ==
+            WriteBatch::Handler::OptionState::kDisabled) {
+          s = Status::NotSupported(
+              "WriteUnprepared txn tag when write_before_prepare_ is disabled "
+              "(in WriteCommitted/WritePrepared mode). If it is not due to "
+              "corruption, the WAL must be emptied before changing the "
+              "WritePolicy.");
+        }
+        break;
+      case kTypeEndPrepareXID:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE));
+        s = handler->MarkEndPrepare(xid);
+        assert(s.ok());
+        empty_batch = true;
+        break;
+      case kTypeCommitXID:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
+        s = handler->MarkCommit(xid);
+        assert(s.ok());
+        empty_batch = true;
+        break;
+      case kTypeCommitXIDAndTimestamp:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
+        // key stores the commit timestamp.
+        assert(!key.empty());
+        s = handler->MarkCommitWithTimestamp(xid, key);
+        if (LIKELY(s.ok())) {
+          empty_batch = true;
+        }
+        break;
+      case kTypeRollbackXID:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK));
+        s = handler->MarkRollback(xid);
+        assert(s.ok());
+        empty_batch = true;
+        break;
+      case kTypeNoop:
+        s = handler->MarkNoop(empty_batch);
+        assert(s.ok());
+        empty_batch = true;
+        break;
+      case kTypeWideColumnEntity:
+      case kTypeColumnFamilyWideColumnEntity:
+        assert(wb->content_flags_.load(std::memory_order_relaxed) &
+               (ContentFlags::DEFERRED | ContentFlags::HAS_PUT_ENTITY));
+        s = handler->PutEntityCF(column_family, key, value);
+        if (LIKELY(s.ok())) {
+          empty_batch = false;
+          ++found;
+        }
+        break;
+      default:
+        return Status::Corruption("unknown WriteBatch tag");
+    }
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  if (handler_continue && whole_batch &&
+      found != WriteBatchInternal::Count(wb)) {
+    return Status::Corruption("WriteBatch has wrong count");
+  } else {
+    return Status::OK();
+  }
+}
+
+bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) {
+  return b->is_latest_persistent_state_;
+}
+
+void WriteBatchInternal::SetAsLatestPersistentState(WriteBatch* b) {
+  b->is_latest_persistent_state_ = true;
+}
+
+uint32_t WriteBatchInternal::Count(const WriteBatch* b) {
+  return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) {
+  EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+  return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+  EncodeFixed64(&b->rep_[0], seq);
+}
+
+size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) {
+  return WriteBatchInternal::kHeader;
+}
+
+std::tuple<Status, uint32_t, size_t>
+WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(
+    WriteBatch* b, ColumnFamilyHandle* column_family) {
+  uint32_t cf_id = GetColumnFamilyID(column_family);
+  size_t ts_sz = 0;
+  Status s;
+  if (column_family) {
+    const Comparator* const ucmp = column_family->GetComparator();
+    if (ucmp) {
+      ts_sz = ucmp->timestamp_size();
+      if (0 == cf_id && b->default_cf_ts_sz_ != ts_sz) {
+        s = Status::InvalidArgument("Default cf timestamp size mismatch");
+      }
+    }
+  } else if (b->default_cf_ts_sz_ > 0) {
+    ts_sz = b->default_cf_ts_sz_;
+  }
+  return std::make_tuple(s, cf_id, ts_sz);
+}
+
+namespace {
+Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family,
+                                      const Slice& ts) {
+  if (!column_family) {
+    return Status::InvalidArgument("column family handle cannot be null");
+  }
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  size_t cf_ts_sz = ucmp->timestamp_size();
+  if (0 == cf_ts_sz) {
+    return Status::InvalidArgument("timestamp disabled");
+  }
+  if (cf_ts_sz != ts.size()) {
+    return Status::InvalidArgument("timestamp size mismatch");
+  }
+  return Status::OK();
+}
+}  // anonymous namespace
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+                               const Slice& key, const Slice& value) {
+  if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("key is too large");
+  }
+  if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("value is too large");
+  }
+
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeValue));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_, value);
+  b->content_flags_.store(
+      b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+      std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // Technically the optype could've been `kTypeColumnFamilyValue` with the
+    // CF ID encoded in the `WriteBatch`. That distinction is unimportant
+    // however since we verify CF ID is correct, as well as all other fields
+    // (a missing/extra encoded CF ID would corrupt another field). It is
+    // convenient to consolidate on `kTypeValue` here as that is what will be
+    // inserted into memtable.
+    b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+                                             .ProtectKVO(key, value, kTypeValue)
+                                             .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::Put(this, cf_id, key, value);
+  }
+
+  needs_in_place_update_ts_ = true;
+  has_key_with_ts_ = true;
+  std::string dummy_ts(ts_sz, '\0');
+  std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+  return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
+                                 SliceParts(&value, 1));
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& ts, const Slice& value) {
+  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  has_key_with_ts_ = true;
+  assert(column_family);
+  uint32_t cf_id = column_family->GetID();
+  std::array<Slice, 2> key_with_ts{{key, ts}};
+  return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
+                                 SliceParts(&value, 1));
+}
+
+Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
+                                                 const SliceParts& value) {
+  size_t total_key_bytes = 0;
+  for (int i = 0; i < key.num_parts; ++i) {
+    total_key_bytes += key.parts[i].size();
+  }
+  if (total_key_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("key is too large");
+  }
+
+  size_t total_value_bytes = 0;
+  for (int i = 0; i < value.num_parts; ++i) {
+    total_value_bytes += value.parts[i].size();
+  }
+  if (total_value_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("value is too large");
+  }
+  return Status::OK();
+}
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+                               const SliceParts& key, const SliceParts& value) {
+  Status s = CheckSlicePartsLength(key, value);
+  if (!s.ok()) {
+    return s;
+  }
+
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeValue));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+  PutLengthPrefixedSliceParts(&b->rep_, value);
+  b->content_flags_.store(
+      b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+      std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+                                             .ProtectKVO(key, value, kTypeValue)
+                                             .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+                       const SliceParts& value) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (ts_sz == 0) {
+    return WriteBatchInternal::Put(this, cf_id, key, value);
+  }
+
+  return Status::InvalidArgument(
+      "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::PutEntity(WriteBatch* b, uint32_t column_family_id,
+                                     const Slice& key,
+                                     const WideColumns& columns) {
+  assert(b);
+
+  if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("key is too large");
+  }
+
+  WideColumns sorted_columns(columns);
+  std::sort(sorted_columns.begin(), sorted_columns.end(),
+            [](const WideColumn& lhs, const WideColumn& rhs) {
+              return lhs.name().compare(rhs.name()) < 0;
+            });
+
+  std::string entity;
+  const Status s = WideColumnSerialization::Serialize(sorted_columns, entity);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (entity.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("wide column entity is too large");
+  }
+
+  LocalSavePoint save(b);
+
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeWideColumnEntity));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyWideColumnEntity));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_, entity);
+
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_PUT_ENTITY,
+                          std::memory_order_relaxed);
+
+  if (b->prot_info_ != nullptr) {
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(key, entity, kTypeWideColumnEntity)
+            .ProtectC(column_family_id));
+  }
+
+  return save.commit();
+}
+
+Status WriteBatch::PutEntity(ColumnFamilyHandle* column_family,
+                             const Slice& key, const WideColumns& columns) {
+  if (!column_family) {
+    return Status::InvalidArgument(
+        "Cannot call this method without a column family handle");
+  }
+
+  Status s;
+  uint32_t cf_id = 0;
+  size_t ts_sz = 0;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (ts_sz) {
+    return Status::InvalidArgument(
+        "Cannot call this method on column family enabling timestamp");
+  }
+
+  return WriteBatchInternal::PutEntity(this, cf_id, key, columns);
+}
+
+Status WriteBatchInternal::InsertNoop(WriteBatch* b) {
+  b->rep_.push_back(static_cast<char>(kTypeNoop));
+  return Status::OK();
+}
+
+Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid,
+                                          bool write_after_commit,
+                                          bool unprepared_batch) {
+  // a manually constructed batch can only contain one prepare section
+  assert(b->rep_[12] == static_cast<char>(kTypeNoop));
+
+  // all savepoints up to this point are cleared
+  if (b->save_points_ != nullptr) {
+    while (!b->save_points_->stack.empty()) {
+      b->save_points_->stack.pop();
+    }
+  }
+
+  // rewrite noop as begin marker
+  b->rep_[12] = static_cast<char>(
+      write_after_commit ? kTypeBeginPrepareXID
+                         : (unprepared_batch ? kTypeBeginUnprepareXID
+                                             : kTypeBeginPersistedPrepareXID));
+  b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID));
+  PutLengthPrefixedSlice(&b->rep_, xid);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_END_PREPARE |
+                              ContentFlags::HAS_BEGIN_PREPARE,
+                          std::memory_order_relaxed);
+  if (unprepared_batch) {
+    b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                                ContentFlags::HAS_BEGIN_UNPREPARE,
+                            std::memory_order_relaxed);
+  }
+  return Status::OK();
+}
+
+Status WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) {
+  b->rep_.push_back(static_cast<char>(kTypeCommitXID));
+  PutLengthPrefixedSlice(&b->rep_, xid);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_COMMIT,
+                          std::memory_order_relaxed);
+  return Status::OK();
+}
+
+Status WriteBatchInternal::MarkCommitWithTimestamp(WriteBatch* b,
+                                                   const Slice& xid,
+                                                   const Slice& commit_ts) {
+  assert(!commit_ts.empty());
+  b->rep_.push_back(static_cast<char>(kTypeCommitXIDAndTimestamp));
+  PutLengthPrefixedSlice(&b->rep_, commit_ts);
+  PutLengthPrefixedSlice(&b->rep_, xid);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_COMMIT,
+                          std::memory_order_relaxed);
+  return Status::OK();
+}
+
+Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) {
+  b->rep_.push_back(static_cast<char>(kTypeRollbackXID));
+  PutLengthPrefixedSlice(&b->rep_, xid);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_ROLLBACK,
+                          std::memory_order_relaxed);
+  return Status::OK();
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+                                  const Slice& key) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_DELETE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(key, "" /* value */, kTypeDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::Delete(this, cf_id, key);
+  }
+
+  needs_in_place_update_ts_ = true;
+  has_key_with_ts_ = true;
+  std::string dummy_ts(ts_sz, '\0');
+  std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+  return WriteBatchInternal::Delete(this, cf_id,
+                                    SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                          const Slice& ts) {
+  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(column_family);
+  has_key_with_ts_ = true;
+  uint32_t cf_id = column_family->GetID();
+  std::array<Slice, 2> key_with_ts{{key, ts}};
+  return WriteBatchInternal::Delete(this, cf_id,
+                                    SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+                                  const SliceParts& key) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_DELETE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(key,
+                        SliceParts(nullptr /* _parts */, 0 /* _num_parts */),
+                        kTypeDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
+                          const SliceParts& key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::Delete(this, cf_id, key);
+  }
+
+  return Status::InvalidArgument(
+      "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+                                        uint32_t column_family_id,
+                                        const Slice& key) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_SINGLE_DELETE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(key, "" /* value */, kTypeSingleDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+                                const Slice& key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::SingleDelete(this, cf_id, key);
+  }
+
+  needs_in_place_update_ts_ = true;
+  has_key_with_ts_ = true;
+  std::string dummy_ts(ts_sz, '\0');
+  std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+  return WriteBatchInternal::SingleDelete(this, cf_id,
+                                          SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& ts) {
+  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  has_key_with_ts_ = true;
+  assert(column_family);
+  uint32_t cf_id = column_family->GetID();
+  std::array<Slice, 2> key_with_ts{{key, ts}};
+  return WriteBatchInternal::SingleDelete(this, cf_id,
+                                          SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+                                        uint32_t column_family_id,
+                                        const SliceParts& key) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_SINGLE_DELETE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(key,
+                        SliceParts(nullptr /* _parts */,
+                                   0 /* _num_parts */) /* value */,
+                        kTypeSingleDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+                                const SliceParts& key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::SingleDelete(this, cf_id, key);
+  }
+
+  return Status::InvalidArgument(
+      "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+                                       const Slice& begin_key,
+                                       const Slice& end_key) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, begin_key);
+  PutLengthPrefixedSlice(&b->rep_, end_key);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_DELETE_RANGE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    // In `DeleteRange()`, the end key is treated as the value.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+                               const Slice& begin_key, const Slice& end_key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+  }
+
+  needs_in_place_update_ts_ = true;
+  has_key_with_ts_ = true;
+  std::string dummy_ts(ts_sz, '\0');
+  std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
+  std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
+  return WriteBatchInternal::DeleteRange(
+      this, cf_id, SliceParts(begin_key_with_ts.data(), 2),
+      SliceParts(end_key_with_ts.data(), 2));
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+                               const Slice& begin_key, const Slice& end_key,
+                               const Slice& ts) {
+  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(column_family);
+  has_key_with_ts_ = true;
+  uint32_t cf_id = column_family->GetID();
+  std::array<Slice, 2> key_with_ts{{begin_key, ts}};
+  std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
+  return WriteBatchInternal::DeleteRange(this, cf_id,
+                                         SliceParts(key_with_ts.data(), 2),
+                                         SliceParts(end_key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+                                       const SliceParts& begin_key,
+                                       const SliceParts& end_key) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, begin_key);
+  PutLengthPrefixedSliceParts(&b->rep_, end_key);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_DELETE_RANGE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    // In `DeleteRange()`, the end key is treated as the value.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+                               const SliceParts& begin_key,
+                               const SliceParts& end_key) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+  }
+
+  return Status::InvalidArgument(
+      "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+                                 const Slice& key, const Slice& value) {
+  if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("key is too large");
+  }
+  if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+    return Status::InvalidArgument("value is too large");
+  }
+
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeMerge));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_, value);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_MERGE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+                                             .ProtectKVO(key, value, kTypeMerge)
+                                             .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                         const Slice& value) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::Merge(this, cf_id, key, value);
+  }
+
+  needs_in_place_update_ts_ = true;
+  has_key_with_ts_ = true;
+  std::string dummy_ts(ts_sz, '\0');
+  std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+
+  return WriteBatchInternal::Merge(
+      this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                         const Slice& ts, const Slice& value) {
+  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  has_key_with_ts_ = true;
+  assert(column_family);
+  uint32_t cf_id = column_family->GetID();
+  std::array<Slice, 2> key_with_ts{{key, ts}};
+  return WriteBatchInternal::Merge(
+      this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+                                 const SliceParts& key,
+                                 const SliceParts& value) {
+  Status s = CheckSlicePartsLength(key, value);
+  if (!s.ok()) {
+    return s;
+  }
+
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeMerge));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+  PutLengthPrefixedSliceParts(&b->rep_, value);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_MERGE,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+                                             .ProtectKVO(key, value, kTypeMerge)
+                                             .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
+                         const SliceParts& key, const SliceParts& value) {
+  size_t ts_sz = 0;
+  uint32_t cf_id = 0;
+  Status s;
+
+  std::tie(s, cf_id, ts_sz) =
+      WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+                                                            column_family);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (0 == ts_sz) {
+    return WriteBatchInternal::Merge(this, cf_id, key, value);
+  }
+
+  return Status::InvalidArgument(
+      "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::PutBlobIndex(WriteBatch* b,
+                                        uint32_t column_family_id,
+                                        const Slice& key, const Slice& value) {
+  LocalSavePoint save(b);
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeBlobIndex));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyBlobIndex));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+  PutLengthPrefixedSlice(&b->rep_, value);
+  b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+                              ContentFlags::HAS_BLOB_INDEX,
+                          std::memory_order_relaxed);
+  if (b->prot_info_ != nullptr) {
+    // See comment in first `WriteBatchInternal::Put()` overload concerning the
+    // `ValueType` argument passed to `ProtectKVO()`.
+    b->prot_info_->entries_.emplace_back(
+        ProtectionInfo64()
+            .ProtectKVO(key, value, kTypeBlobIndex)
+            .ProtectC(column_family_id));
+  }
+  return save.commit();
+}
+
+Status WriteBatch::PutLogData(const Slice& blob) {
+  LocalSavePoint save(this);
+  rep_.push_back(static_cast<char>(kTypeLogData));
+  PutLengthPrefixedSlice(&rep_, blob);
+  return save.commit();
+}
+
+void WriteBatch::SetSavePoint() {
+  if (save_points_ == nullptr) {
+    save_points_.reset(new SavePoints());
+  }
+  // Record length and count of current batch of writes.
+  save_points_->stack.push(SavePoint(
+      GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed)));
+}
+
+Status WriteBatch::RollbackToSavePoint() {
+  if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+    return Status::NotFound();
+  }
+
+  // Pop the most recent savepoint off the stack
+  SavePoint savepoint = save_points_->stack.top();
+  save_points_->stack.pop();
+
+  assert(savepoint.size <= rep_.size());
+  assert(static_cast<uint32_t>(savepoint.count) <= Count());
+
+  if (savepoint.size == rep_.size()) {
+    // No changes to rollback
+  } else if (savepoint.size == 0) {
+    // Rollback everything
+    Clear();
+  } else {
+    rep_.resize(savepoint.size);
+    if (prot_info_ != nullptr) {
+      prot_info_->entries_.resize(savepoint.count);
+    }
+    WriteBatchInternal::SetCount(this, savepoint.count);
+    content_flags_.store(savepoint.content_flags, std::memory_order_relaxed);
+  }
+
+  return Status::OK();
+}
+
+Status WriteBatch::PopSavePoint() {
+  if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+    return Status::NotFound();
+  }
+
+  // Pop the most recent savepoint off the stack
+  save_points_->stack.pop();
+
+  return Status::OK();
+}
+
+Status WriteBatch::UpdateTimestamps(
+    const Slice& ts, std::function<size_t(uint32_t)> ts_sz_func) {
+  TimestampUpdater<decltype(ts_sz_func)> ts_updater(prot_info_.get(),
+                                                    std::move(ts_sz_func), ts);
+  const Status s = Iterate(&ts_updater);
+  if (s.ok()) {
+    needs_in_place_update_ts_ = false;
+  }
+  return s;
+}
+
+Status WriteBatch::VerifyChecksum() const {
+  if (prot_info_ == nullptr) {
+    return Status::OK();
+  }
+  Slice input(rep_.data() + WriteBatchInternal::kHeader,
+              rep_.size() - WriteBatchInternal::kHeader);
+  Slice key, value, blob, xid;
+  char tag = 0;
+  uint32_t column_family = 0;  // default
+  Status s;
+  size_t prot_info_idx = 0;
+  bool checksum_protected = true;
+  while (!input.empty() && prot_info_idx < prot_info_->entries_.size()) {
+    // In case key/value/column_family are not updated by
+    // ReadRecordFromWriteBatch
+    key.clear();
+    value.clear();
+    column_family = 0;
+    s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+                                 &blob, &xid);
+    if (!s.ok()) {
+      return s;
+    }
+    checksum_protected = true;
+    // Write batch checksum uses op_type without ColumnFamily (e.g., if op_type
+    // in the write batch is kTypeColumnFamilyValue, kTypeValue is used to
+    // compute the checksum), and encodes column family id separately. See
+    // comment in first `WriteBatchInternal::Put()` for more detail.
+    switch (tag) {
+      case kTypeColumnFamilyValue:
+      case kTypeValue:
+        tag = kTypeValue;
+        break;
+      case kTypeColumnFamilyDeletion:
+      case kTypeDeletion:
+        tag = kTypeDeletion;
+        break;
+      case kTypeColumnFamilySingleDeletion:
+      case kTypeSingleDeletion:
+        tag = kTypeSingleDeletion;
+        break;
+      case kTypeColumnFamilyRangeDeletion:
+      case kTypeRangeDeletion:
+        tag = kTypeRangeDeletion;
+        break;
+      case kTypeColumnFamilyMerge:
+      case kTypeMerge:
+        tag = kTypeMerge;
+        break;
+      case kTypeColumnFamilyBlobIndex:
+      case kTypeBlobIndex:
+        tag = kTypeBlobIndex;
+        break;
+      case kTypeLogData:
+      case kTypeBeginPrepareXID:
+      case kTypeEndPrepareXID:
+      case kTypeCommitXID:
+      case kTypeRollbackXID:
+      case kTypeNoop:
+      case kTypeBeginPersistedPrepareXID:
+      case kTypeBeginUnprepareXID:
+      case kTypeDeletionWithTimestamp:
+      case kTypeCommitXIDAndTimestamp:
+        checksum_protected = false;
+        break;
+      case kTypeColumnFamilyWideColumnEntity:
+      case kTypeWideColumnEntity:
+        tag = kTypeWideColumnEntity;
+        break;
+      default:
+        return Status::Corruption(
+            "unknown WriteBatch tag",
+            std::to_string(static_cast<unsigned int>(tag)));
+    }
+    if (checksum_protected) {
+      s = prot_info_->entries_[prot_info_idx++]
+              .StripC(column_family)
+              .StripKVO(key, value, static_cast<ValueType>(tag))
+              .GetStatus();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  if (prot_info_idx != WriteBatchInternal::Count(this)) {
+    return Status::Corruption("WriteBatch has wrong count");
+  }
+  assert(WriteBatchInternal::Count(this) == prot_info_->entries_.size());
+  return Status::OK();
+}
+
+namespace {
+
+class MemTableInserter : public WriteBatch::Handler {
+  SequenceNumber sequence_;
+  ColumnFamilyMemTables* const cf_mems_;
+  FlushScheduler* const flush_scheduler_;
+  TrimHistoryScheduler* const trim_history_scheduler_;
+  const bool ignore_missing_column_families_;
+  const uint64_t recovering_log_number_;
+  // log number that all Memtables inserted into should reference
+  uint64_t log_number_ref_;
+  DBImpl* db_;
+  const bool concurrent_memtable_writes_;
+  bool post_info_created_;
+  const WriteBatch::ProtectionInfo* prot_info_;
+  size_t prot_info_idx_;
+
+  bool* has_valid_writes_;
+  // On some (!) platforms just default creating
+  // a map is too expensive in the Write() path as they
+  // cause memory allocations though unused.
+  // Make creation optional but do not incur
+  // std::unique_ptr additional allocation
+  using MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>;
+  using PostMapType = std::aligned_storage<sizeof(MemPostInfoMap)>::type;
+  PostMapType mem_post_info_map_;
+  // current recovered transaction we are rebuilding (recovery)
+  WriteBatch* rebuilding_trx_;
+  SequenceNumber rebuilding_trx_seq_;
+  // Increase seq number once per each write batch. Otherwise increase it once
+  // per key.
+  bool seq_per_batch_;
+  // Whether the memtable write will be done only after the commit
+  bool write_after_commit_;
+  // Whether memtable write can be done before prepare
+  bool write_before_prepare_;
+  // Whether this batch was unprepared or not
+  bool unprepared_batch_;
+  using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type;
+  DupDetector duplicate_detector_;
+  bool dup_dectector_on_;
+
+  bool hint_per_batch_;
+  bool hint_created_;
+  // Hints for this batch
+  using HintMap = std::unordered_map<MemTable*, void*>;
+  using HintMapType = std::aligned_storage<sizeof(HintMap)>::type;
+  HintMapType hint_;
+
+  HintMap& GetHintMap() {
+    assert(hint_per_batch_);
+    if (!hint_created_) {
+      new (&hint_) HintMap();
+      hint_created_ = true;
+    }
+    return *reinterpret_cast<HintMap*>(&hint_);
+  }
+
+  MemPostInfoMap& GetPostMap() {
+    assert(concurrent_memtable_writes_);
+    if (!post_info_created_) {
+      new (&mem_post_info_map_) MemPostInfoMap();
+      post_info_created_ = true;
+    }
+    return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
+  }
+
+  bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) {
+    assert(!write_after_commit_);
+    assert(rebuilding_trx_ != nullptr);
+    if (!dup_dectector_on_) {
+      new (&duplicate_detector_) DuplicateDetector(db_);
+      dup_dectector_on_ = true;
+    }
+    return reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+        ->IsDuplicateKeySeq(column_family_id, key, sequence_);
+  }
+
+  const ProtectionInfoKVOC64* NextProtectionInfo() {
+    const ProtectionInfoKVOC64* res = nullptr;
+    if (prot_info_ != nullptr) {
+      assert(prot_info_idx_ < prot_info_->entries_.size());
+      res = &prot_info_->entries_[prot_info_idx_];
+      ++prot_info_idx_;
+    }
+    return res;
+  }
+
+  void DecrementProtectionInfoIdxForTryAgain() {
+    if (prot_info_ != nullptr) --prot_info_idx_;
+  }
+
+  void ResetProtectionInfo() {
+    prot_info_idx_ = 0;
+    prot_info_ = nullptr;
+  }
+
+ protected:
+  Handler::OptionState WriteBeforePrepare() const override {
+    return write_before_prepare_ ? Handler::OptionState::kEnabled
+                                 : Handler::OptionState::kDisabled;
+  }
+  Handler::OptionState WriteAfterCommit() const override {
+    return write_after_commit_ ? Handler::OptionState::kEnabled
+                               : Handler::OptionState::kDisabled;
+  }
+
+ public:
+  // cf_mems should not be shared with concurrent inserters
+  MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
+                   FlushScheduler* flush_scheduler,
+                   TrimHistoryScheduler* trim_history_scheduler,
+                   bool ignore_missing_column_families,
+                   uint64_t recovering_log_number, DB* db,
+                   bool concurrent_memtable_writes,
+                   const WriteBatch::ProtectionInfo* prot_info,
+                   bool* has_valid_writes = nullptr, bool seq_per_batch = false,
+                   bool batch_per_txn = true, bool hint_per_batch = false)
+      : sequence_(_sequence),
+        cf_mems_(cf_mems),
+        flush_scheduler_(flush_scheduler),
+        trim_history_scheduler_(trim_history_scheduler),
+        ignore_missing_column_families_(ignore_missing_column_families),
+        recovering_log_number_(recovering_log_number),
+        log_number_ref_(0),
+        db_(static_cast_with_check<DBImpl>(db)),
+        concurrent_memtable_writes_(concurrent_memtable_writes),
+        post_info_created_(false),
+        prot_info_(prot_info),
+        prot_info_idx_(0),
+        has_valid_writes_(has_valid_writes),
+        rebuilding_trx_(nullptr),
+        rebuilding_trx_seq_(0),
+        seq_per_batch_(seq_per_batch),
+        // Write after commit currently uses one seq per key (instead of per
+        // batch). So seq_per_batch being false indicates write_after_commit
+        // approach.
+        write_after_commit_(!seq_per_batch),
+        // WriteUnprepared can write WriteBatches per transaction, so
+        // batch_per_txn being false indicates write_before_prepare.
+        write_before_prepare_(!batch_per_txn),
+        unprepared_batch_(false),
+        duplicate_detector_(),
+        dup_dectector_on_(false),
+        hint_per_batch_(hint_per_batch),
+        hint_created_(false) {
+    assert(cf_mems_);
+  }
+
+  ~MemTableInserter() override {
+    if (dup_dectector_on_) {
+      reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+          ->~DuplicateDetector();
+    }
+    if (post_info_created_) {
+      reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_)->~MemPostInfoMap();
+    }
+    if (hint_created_) {
+      for (auto iter : GetHintMap()) {
+        delete[] reinterpret_cast<char*>(iter.second);
+      }
+      reinterpret_cast<HintMap*>(&hint_)->~HintMap();
+    }
+    delete rebuilding_trx_;
+  }
+
+  MemTableInserter(const MemTableInserter&) = delete;
+  MemTableInserter& operator=(const MemTableInserter&) = delete;
+
+  // The batch seq is regularly restarted; In normal mode it is set when
+  // MemTableInserter is constructed in the write thread and in recovery mode it
+  // is set when a batch, which is tagged with seq, is read from the WAL.
+  // Within a sequenced batch, which could be a merge of multiple batches, we
+  // have two policies to advance the seq: i) seq_per_key (default) and ii)
+  // seq_per_batch. To implement the latter we need to mark the boundary between
+  // the individual batches. The approach is this: 1) Use the terminating
+  // markers to indicate the boundary (kTypeEndPrepareXID, kTypeCommitXID,
+  // kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absence of a
+  // natural boundary marker.
+  void MaybeAdvanceSeq(bool batch_boundry = false) {
+    if (batch_boundry == seq_per_batch_) {
+      sequence_++;
+    }
+  }
+
+  void set_log_number_ref(uint64_t log) { log_number_ref_ = log; }
+  void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) {
+    prot_info_ = prot_info;
+    prot_info_idx_ = 0;
+  }
+
+  SequenceNumber sequence() const { return sequence_; }
+
+  void PostProcess() {
+    assert(concurrent_memtable_writes_);
+    // If post info was not created there is nothing
+    // to process and no need to create on demand
+    if (post_info_created_) {
+      for (auto& pair : GetPostMap()) {
+        pair.first->BatchPostProcess(pair.second);
+      }
+    }
+  }
+
+  bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
+    // If we are in a concurrent mode, it is the caller's responsibility
+    // to clone the original ColumnFamilyMemTables so that each thread
+    // has its own instance.  Otherwise, it must be guaranteed that there
+    // is no concurrent access
+    bool found = cf_mems_->Seek(column_family_id);
+    if (!found) {
+      if (ignore_missing_column_families_) {
+        *s = Status::OK();
+      } else {
+        *s = Status::InvalidArgument(
+            "Invalid column family specified in write batch");
+      }
+      return false;
+    }
+    if (recovering_log_number_ != 0 &&
+        recovering_log_number_ < cf_mems_->GetLogNumber()) {
+      // This is true only in recovery environment (recovering_log_number_ is
+      // always 0 in
+      // non-recovery, regular write code-path)
+      // * If recovering_log_number_ < cf_mems_->GetLogNumber(), this means that
+      // column family already contains updates from this log. We can't apply
+      // updates twice because of update-in-place or merge workloads -- ignore
+      // the update
+      *s = Status::OK();
+      return false;
+    }
+
+    if (has_valid_writes_ != nullptr) {
+      *has_valid_writes_ = true;
+    }
+
+    if (log_number_ref_ > 0) {
+      cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_);
+    }
+
+    return true;
+  }
+
+  Status PutCFImpl(uint32_t column_family_id, const Slice& key,
+                   const Slice& value, ValueType value_type,
+                   const ProtectionInfoKVOS64* kv_prot_info) {
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key,
+                                     value);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+        ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id,
+                                             key, value);
+        if (ret_status.ok()) {
+          MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+        }
+      } else if (ret_status.ok()) {
+        MaybeAdvanceSeq(false /* batch_boundary */);
+      }
+      return ret_status;
+    }
+    assert(ret_status.ok());
+
+    MemTable* mem = cf_mems_->GetMemTable();
+    auto* moptions = mem->GetImmutableMemTableOptions();
+    // inplace_update_support is inconsistent with snapshots, and therefore with
+    // any kind of transactions including the ones that use seq_per_batch
+    assert(!seq_per_batch_ || !moptions->inplace_update_support);
+    if (!moptions->inplace_update_support) {
+      ret_status =
+          mem->Add(sequence_, value_type, key, value, kv_prot_info,
+                   concurrent_memtable_writes_, get_post_process_info(mem),
+                   hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+    } else if (moptions->inplace_callback == nullptr ||
+               value_type != kTypeValue) {
+      assert(!concurrent_memtable_writes_);
+      ret_status = mem->Update(sequence_, value_type, key, value, kv_prot_info);
+    } else {
+      assert(!concurrent_memtable_writes_);
+      assert(value_type == kTypeValue);
+      ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info);
+      if (ret_status.IsNotFound()) {
+        // key not found in memtable. Do sst get, update, add
+        SnapshotImpl read_from_snapshot;
+        read_from_snapshot.number_ = sequence_;
+        ReadOptions ropts;
+        // it's going to be overwritten for sure, so no point caching data block
+        // containing the old version
+        ropts.fill_cache = false;
+        ropts.snapshot = &read_from_snapshot;
+
+        std::string prev_value;
+        std::string merged_value;
+
+        auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+        Status get_status = Status::NotSupported();
+        if (db_ != nullptr && recovering_log_number_ == 0) {
+          if (cf_handle == nullptr) {
+            cf_handle = db_->DefaultColumnFamily();
+          }
+          // TODO (yanqin): fix when user-defined timestamp is enabled.
+          get_status = db_->Get(ropts, cf_handle, key, &prev_value);
+        }
+        // Intentionally overwrites the `NotFound` in `ret_status`.
+        if (!get_status.ok() && !get_status.IsNotFound()) {
+          ret_status = get_status;
+        } else {
+          ret_status = Status::OK();
+        }
+        if (ret_status.ok()) {
+          UpdateStatus update_status;
+          char* prev_buffer = const_cast<char*>(prev_value.c_str());
+          uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+          if (get_status.ok()) {
+            update_status = moptions->inplace_callback(prev_buffer, &prev_size,
+                                                       value, &merged_value);
+          } else {
+            update_status = moptions->inplace_callback(
+                nullptr /* existing_value */, nullptr /* existing_value_size */,
+                value, &merged_value);
+          }
+          if (update_status == UpdateStatus::UPDATED_INPLACE) {
+            assert(get_status.ok());
+            if (kv_prot_info != nullptr) {
+              ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+              updated_kv_prot_info.UpdateV(value,
+                                           Slice(prev_buffer, prev_size));
+              // prev_value is updated in-place with final value.
+              ret_status = mem->Add(sequence_, value_type, key,
+                                    Slice(prev_buffer, prev_size),
+                                    &updated_kv_prot_info);
+            } else {
+              ret_status = mem->Add(sequence_, value_type, key,
+                                    Slice(prev_buffer, prev_size),
+                                    nullptr /* kv_prot_info */);
+            }
+            if (ret_status.ok()) {
+              RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+            }
+          } else if (update_status == UpdateStatus::UPDATED) {
+            if (kv_prot_info != nullptr) {
+              ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+              updated_kv_prot_info.UpdateV(value, merged_value);
+              // merged_value contains the final value.
+              ret_status = mem->Add(sequence_, value_type, key,
+                                    Slice(merged_value), &updated_kv_prot_info);
+            } else {
+              // merged_value contains the final value.
+              ret_status =
+                  mem->Add(sequence_, value_type, key, Slice(merged_value),
+                           nullptr /* kv_prot_info */);
+            }
+            if (ret_status.ok()) {
+              RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+            }
+          }
+        }
+      }
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      assert(seq_per_batch_);
+      const bool kBatchBoundary = true;
+      MaybeAdvanceSeq(kBatchBoundary);
+    } else if (ret_status.ok()) {
+      MaybeAdvanceSeq();
+      CheckMemtableFull();
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object. If `ret_status` is
+    // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+    // away. So we only need to add to it when `ret_status.ok()`.
+    if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id,
+                                           key, value);
+    }
+    return ret_status;
+  }
+
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+               const Slice& value) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    Status ret_status;
+    if (kv_prot_info != nullptr) {
+      // Memtable needs seqno, doesn't need CF ID
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+                             &mem_kv_prot_info);
+    } else {
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+                             nullptr /* kv_prot_info */);
+    }
+    // TODO: this assumes that if TryAgain status is returned to the caller,
+    // the operation is actually tried again. The proper way to do this is to
+    // pass a `try_again` parameter to the operation itself and decrement
+    // prot_info_idx_ based on that
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  Status PutEntityCF(uint32_t column_family_id, const Slice& key,
+                     const Slice& value) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+
+    Status s;
+    if (kv_prot_info) {
+      // Memtable needs seqno, doesn't need CF ID
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
+                    &mem_kv_prot_info);
+    } else {
+      s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
+                    /* kv_prot_info */ nullptr);
+    }
+
+    if (UNLIKELY(s.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+
+    return s;
+  }
+
+  Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key,
+                    const Slice& value, ValueType delete_type,
+                    const ProtectionInfoKVOS64* kv_prot_info) {
+    Status ret_status;
+    MemTable* mem = cf_mems_->GetMemTable();
+    ret_status =
+        mem->Add(sequence_, delete_type, key, value, kv_prot_info,
+                 concurrent_memtable_writes_, get_post_process_info(mem),
+                 hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      assert(seq_per_batch_);
+      const bool kBatchBoundary = true;
+      MaybeAdvanceSeq(kBatchBoundary);
+    } else if (ret_status.ok()) {
+      MaybeAdvanceSeq();
+      CheckMemtableFull();
+    }
+    return ret_status;
+  }
+
+  Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+        ret_status =
+            WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+        if (ret_status.ok()) {
+          MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+        }
+      } else if (ret_status.ok()) {
+        MaybeAdvanceSeq(false /* batch_boundary */);
+      }
+      if (UNLIKELY(ret_status.IsTryAgain())) {
+        DecrementProtectionInfoIdxForTryAgain();
+      }
+      return ret_status;
+    }
+
+    ColumnFamilyData* cfd = cf_mems_->current();
+    assert(!cfd || cfd->user_comparator());
+    const size_t ts_sz = (cfd && cfd->user_comparator())
+                             ? cfd->user_comparator()->timestamp_size()
+                             : 0;
+    const ValueType delete_type =
+        (0 == ts_sz) ? kTypeDeletion : kTypeDeletionWithTimestamp;
+    if (kv_prot_info != nullptr) {
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type);
+      ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
+                              &mem_kv_prot_info);
+    } else {
+      ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
+                              nullptr /* kv_prot_info */);
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object. If `ret_status` is
+    // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+    // away. So we only need to add to it when `ret_status.ok()`.
+    if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status =
+          WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id,
+                                              key);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+        ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
+                                                      column_family_id, key);
+        if (ret_status.ok()) {
+          MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+        }
+      } else if (ret_status.ok()) {
+        MaybeAdvanceSeq(false /* batch_boundary */);
+      }
+      if (UNLIKELY(ret_status.IsTryAgain())) {
+        DecrementProtectionInfoIdxForTryAgain();
+      }
+      return ret_status;
+    }
+    assert(ret_status.ok());
+
+    if (kv_prot_info != nullptr) {
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      ret_status = DeleteImpl(column_family_id, key, Slice(),
+                              kTypeSingleDeletion, &mem_kv_prot_info);
+    } else {
+      ret_status = DeleteImpl(column_family_id, key, Slice(),
+                              kTypeSingleDeletion, nullptr /* kv_prot_info */);
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object. If `ret_status` is
+    // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+    // away. So we only need to add to it when `ret_status.ok()`.
+    if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
+                                                    column_family_id, key);
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+                       const Slice& end_key) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+                                             begin_key, end_key);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+        ret_status = WriteBatchInternal::DeleteRange(
+            rebuilding_trx_, column_family_id, begin_key, end_key);
+        if (ret_status.ok()) {
+          MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key));
+        }
+      } else if (ret_status.ok()) {
+        MaybeAdvanceSeq(false /* batch_boundary */);
+      }
+      if (UNLIKELY(ret_status.IsTryAgain())) {
+        DecrementProtectionInfoIdxForTryAgain();
+      }
+      return ret_status;
+    }
+    assert(ret_status.ok());
+
+    if (db_ != nullptr) {
+      auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+      if (cf_handle == nullptr) {
+        cf_handle = db_->DefaultColumnFamily();
+      }
+      auto* cfd =
+          static_cast_with_check<ColumnFamilyHandleImpl>(cf_handle)->cfd();
+      if (!cfd->is_delete_range_supported()) {
+        // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+        ret_status.PermitUncheckedError();
+        return Status::NotSupported(
+            std::string("DeleteRange not supported for table type ") +
+            cfd->ioptions()->table_factory->Name() + " in CF " +
+            cfd->GetName());
+      }
+      int cmp =
+          cfd->user_comparator()->CompareWithoutTimestamp(begin_key, end_key);
+      if (cmp > 0) {
+        // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+        ret_status.PermitUncheckedError();
+        // It's an empty range where endpoints appear mistaken. Don't bother
+        // applying it to the DB, and return an error to the user.
+        return Status::InvalidArgument("end key comes before start key");
+      } else if (cmp == 0) {
+        // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+        ret_status.PermitUncheckedError();
+        // It's an empty range. Don't bother applying it to the DB.
+        return Status::OK();
+      }
+    }
+
+    if (kv_prot_info != nullptr) {
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      ret_status = DeleteImpl(column_family_id, begin_key, end_key,
+                              kTypeRangeDeletion, &mem_kv_prot_info);
+    } else {
+      ret_status = DeleteImpl(column_family_id, begin_key, end_key,
+                              kTypeRangeDeletion, nullptr /* kv_prot_info */);
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object. If `ret_status` is
+    // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+    // away. So we only need to add to it when `ret_status.ok()`.
+    if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status = WriteBatchInternal::DeleteRange(
+          rebuilding_trx_, column_family_id, begin_key, end_key);
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  Status MergeCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    // optimize for non-recovery mode
+    if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key,
+                                       value);
+      // else insert the values to the memtable right away
+    }
+
+    Status ret_status;
+    if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+      if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+        assert(!write_after_commit_);
+        // The CF is probably flushed and hence no need for insert but we still
+        // need to keep track of the keys for upcoming rollback/commit.
+        // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+        ret_status = WriteBatchInternal::Merge(rebuilding_trx_,
+                                               column_family_id, key, value);
+        if (ret_status.ok()) {
+          MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+        }
+      } else if (ret_status.ok()) {
+        MaybeAdvanceSeq(false /* batch_boundary */);
+      }
+      if (UNLIKELY(ret_status.IsTryAgain())) {
+        DecrementProtectionInfoIdxForTryAgain();
+      }
+      return ret_status;
+    }
+    assert(ret_status.ok());
+
+    MemTable* mem = cf_mems_->GetMemTable();
+    auto* moptions = mem->GetImmutableMemTableOptions();
+    if (moptions->merge_operator == nullptr) {
+      return Status::InvalidArgument(
+          "Merge requires `ColumnFamilyOptions::merge_operator != nullptr`");
+    }
+    bool perform_merge = false;
+    assert(!concurrent_memtable_writes_ ||
+           moptions->max_successive_merges == 0);
+
+    // If we pass DB through and options.max_successive_merges is hit
+    // during recovery, Get() will be issued which will try to acquire
+    // DB mutex and cause deadlock, as DB mutex is already held.
+    // So we disable merge in recovery
+    if (moptions->max_successive_merges > 0 && db_ != nullptr &&
+        recovering_log_number_ == 0) {
+      assert(!concurrent_memtable_writes_);
+      LookupKey lkey(key, sequence_);
+
+      // Count the number of successive merges at the head
+      // of the key in the memtable
+      size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
+
+      if (num_merges >= moptions->max_successive_merges) {
+        perform_merge = true;
+      }
+    }
+
+    if (perform_merge) {
+      // 1) Get the existing value
+      std::string get_value;
+
+      // Pass in the sequence number so that we also include previous merge
+      // operations in the same batch.
+      SnapshotImpl read_from_snapshot;
+      read_from_snapshot.number_ = sequence_;
+      ReadOptions read_options;
+      read_options.snapshot = &read_from_snapshot;
+
+      auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+      if (cf_handle == nullptr) {
+        cf_handle = db_->DefaultColumnFamily();
+      }
+      Status get_status = db_->Get(read_options, cf_handle, key, &get_value);
+      if (!get_status.ok()) {
+        // Failed to read a key we know exists. Store the delta in memtable.
+        perform_merge = false;
+      } else {
+        Slice get_value_slice = Slice(get_value);
+
+        // 2) Apply this merge
+        auto merge_operator = moptions->merge_operator;
+        assert(merge_operator);
+
+        std::string new_value;
+        Status merge_status = MergeHelper::TimedFullMerge(
+            merge_operator, key, &get_value_slice, {value}, &new_value,
+            moptions->info_log, moptions->statistics,
+            SystemClock::Default().get(), /* result_operand */ nullptr,
+            /* update_num_ops_stats */ false);
+
+        if (!merge_status.ok()) {
+          // Failed to merge!
+          // Store the delta in memtable
+          perform_merge = false;
+        } else {
+          // 3) Add value to memtable
+          assert(!concurrent_memtable_writes_);
+          if (kv_prot_info != nullptr) {
+            auto merged_kv_prot_info =
+                kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+            merged_kv_prot_info.UpdateV(value, new_value);
+            merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue);
+            ret_status = mem->Add(sequence_, kTypeValue, key, new_value,
+                                  &merged_kv_prot_info);
+          } else {
+            ret_status = mem->Add(sequence_, kTypeValue, key, new_value,
+                                  nullptr /* kv_prot_info */);
+          }
+        }
+      }
+    }
+
+    if (!perform_merge) {
+      assert(ret_status.ok());
+      // Add merge operand to memtable
+      if (kv_prot_info != nullptr) {
+        auto mem_kv_prot_info =
+            kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+        ret_status =
+            mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info,
+                     concurrent_memtable_writes_, get_post_process_info(mem));
+      } else {
+        ret_status = mem->Add(
+            sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */,
+            concurrent_memtable_writes_, get_post_process_info(mem));
+      }
+    }
+
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      assert(seq_per_batch_);
+      const bool kBatchBoundary = true;
+      MaybeAdvanceSeq(kBatchBoundary);
+    } else if (ret_status.ok()) {
+      MaybeAdvanceSeq();
+      CheckMemtableFull();
+    }
+    // optimize for non-recovery mode
+    // If `ret_status` is `TryAgain` then the next (successful) try will add
+    // the key to the rebuilding transaction object. If `ret_status` is
+    // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+    // away. So we only need to add to it when `ret_status.ok()`.
+    if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+      assert(!write_after_commit_);
+      // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+      ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id,
+                                             key, value);
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
+                        const Slice& value) override {
+    const auto* kv_prot_info = NextProtectionInfo();
+    Status ret_status;
+    if (kv_prot_info != nullptr) {
+      // Memtable needs seqno, doesn't need CF ID
+      auto mem_kv_prot_info =
+          kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+      // Same as PutCF except for value type.
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
+                             &mem_kv_prot_info);
+    } else {
+      ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
+                             nullptr /* kv_prot_info */);
+    }
+    if (UNLIKELY(ret_status.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+    return ret_status;
+  }
+
+  void CheckMemtableFull() {
+    if (flush_scheduler_ != nullptr) {
+      auto* cfd = cf_mems_->current();
+      assert(cfd != nullptr);
+      if (cfd->mem()->ShouldScheduleFlush() &&
+          cfd->mem()->MarkFlushScheduled()) {
+        // MarkFlushScheduled only returns true if we are the one that
+        // should take action, so no need to dedup further
+        flush_scheduler_->ScheduleWork(cfd);
+      }
+    }
+    // check if memtable_list size exceeds max_write_buffer_size_to_maintain
+    if (trim_history_scheduler_ != nullptr) {
+      auto* cfd = cf_mems_->current();
+
+      assert(cfd);
+      assert(cfd->ioptions());
+
+      const size_t size_to_maintain = static_cast<size_t>(
+          cfd->ioptions()->max_write_buffer_size_to_maintain);
+
+      if (size_to_maintain > 0) {
+        MemTableList* const imm = cfd->imm();
+        assert(imm);
+
+        if (imm->HasHistory()) {
+          const MemTable* const mem = cfd->mem();
+          assert(mem);
+
+          if (mem->MemoryAllocatedBytes() +
+                      imm->MemoryAllocatedBytesExcludingLast() >=
+                  size_to_maintain &&
+              imm->MarkTrimHistoryNeeded()) {
+            trim_history_scheduler_->ScheduleWork(cfd);
+          }
+        }
+      }
+    }
+  }
+
+  // The write batch handler calls MarkBeginPrepare with unprepare set to true
+  // if it encounters the kTypeBeginUnprepareXID marker.
+  Status MarkBeginPrepare(bool unprepare) override {
+    assert(rebuilding_trx_ == nullptr);
+    assert(db_);
+
+    if (recovering_log_number_ != 0) {
+      db_->mutex()->AssertHeld();
+      // during recovery we rebuild a hollow transaction
+      // from all encountered prepare sections of the wal
+      if (db_->allow_2pc() == false) {
+        return Status::NotSupported(
+            "WAL contains prepared transactions. Open with "
+            "TransactionDB::Open().");
+      }
+
+      // we are now iterating through a prepared section
+      rebuilding_trx_ = new WriteBatch();
+      rebuilding_trx_seq_ = sequence_;
+      // Verify that we have matching MarkBeginPrepare/MarkEndPrepare markers.
+      // unprepared_batch_ should be false because it is false by default, and
+      // gets reset to false in MarkEndPrepare.
+      assert(!unprepared_batch_);
+      unprepared_batch_ = unprepare;
+
+      if (has_valid_writes_ != nullptr) {
+        *has_valid_writes_ = true;
+      }
+    }
+
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice& name) override {
+    assert(db_);
+    assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0));
+
+    if (recovering_log_number_ != 0) {
+      db_->mutex()->AssertHeld();
+      assert(db_->allow_2pc());
+      size_t batch_cnt =
+          write_after_commit_
+              ? 0  // 0 will disable further checks
+              : static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
+      db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
+                                      rebuilding_trx_, rebuilding_trx_seq_,
+                                      batch_cnt, unprepared_batch_);
+      unprepared_batch_ = false;
+      rebuilding_trx_ = nullptr;
+    } else {
+      assert(rebuilding_trx_ == nullptr);
+    }
+    const bool batch_boundry = true;
+    MaybeAdvanceSeq(batch_boundry);
+
+    return Status::OK();
+  }
+
+  Status MarkNoop(bool empty_batch) override {
+    if (recovering_log_number_ != 0) {
+      db_->mutex()->AssertHeld();
+    }
+    // A hack in pessimistic transaction could result into a noop at the start
+    // of the write batch, that should be ignored.
+    if (!empty_batch) {
+      // In the absence of Prepare markers, a kTypeNoop tag indicates the end of
+      // a batch. This happens when write batch commits skipping the prepare
+      // phase.
+      const bool batch_boundry = true;
+      MaybeAdvanceSeq(batch_boundry);
+    }
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice& name) override {
+    assert(db_);
+
+    Status s;
+
+    if (recovering_log_number_ != 0) {
+      // We must hold db mutex in recovery.
+      db_->mutex()->AssertHeld();
+      // in recovery when we encounter a commit marker
+      // we lookup this transaction in our set of rebuilt transactions
+      // and commit.
+      auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+      // the log containing the prepared section may have
+      // been released in the last incarnation because the
+      // data was flushed to L0
+      if (trx != nullptr) {
+        // at this point individual CF lognumbers will prevent
+        // duplicate re-insertion of values.
+        assert(log_number_ref_ == 0);
+        if (write_after_commit_) {
+          // write_after_commit_ can only have one batch in trx.
+          assert(trx->batches_.size() == 1);
+          const auto& batch_info = trx->batches_.begin()->second;
+          // all inserts must reference this trx log number
+          log_number_ref_ = batch_info.log_number_;
+          ResetProtectionInfo();
+          s = batch_info.batch_->Iterate(this);
+          log_number_ref_ = 0;
+        }
+        // else the values are already inserted before the commit
+
+        if (s.ok()) {
+          db_->DeleteRecoveredTransaction(name.ToString());
+        }
+        if (has_valid_writes_ != nullptr) {
+          *has_valid_writes_ = true;
+        }
+      }
+    } else {
+      // When writes are not delayed until commit, there is no disconnect
+      // between a memtable write and the WAL that supports it. So the commit
+      // need not reference any log as the only log to which it depends.
+      assert(!write_after_commit_ || log_number_ref_ > 0);
+    }
+    const bool batch_boundry = true;
+    MaybeAdvanceSeq(batch_boundry);
+
+    if (UNLIKELY(s.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+
+    return s;
+  }
+
+  Status MarkCommitWithTimestamp(const Slice& name,
+                                 const Slice& commit_ts) override {
+    assert(db_);
+
+    Status s;
+
+    if (recovering_log_number_ != 0) {
+      // In recovery, db mutex must be held.
+      db_->mutex()->AssertHeld();
+      // in recovery when we encounter a commit marker
+      // we lookup this transaction in our set of rebuilt transactions
+      // and commit.
+      auto trx = db_->GetRecoveredTransaction(name.ToString());
+      // the log containing the prepared section may have
+      // been released in the last incarnation because the
+      // data was flushed to L0
+      if (trx) {
+        // at this point individual CF lognumbers will prevent
+        // duplicate re-insertion of values.
+        assert(0 == log_number_ref_);
+        if (write_after_commit_) {
+          // write_after_commit_ can only have one batch in trx.
+          assert(trx->batches_.size() == 1);
+          const auto& batch_info = trx->batches_.begin()->second;
+          // all inserts must reference this trx log number
+          log_number_ref_ = batch_info.log_number_;
+
+          s = batch_info.batch_->UpdateTimestamps(
+              commit_ts, [this](uint32_t cf) {
+                assert(db_);
+                VersionSet* const vset = db_->GetVersionSet();
+                assert(vset);
+                ColumnFamilySet* const cf_set = vset->GetColumnFamilySet();
+                assert(cf_set);
+                ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf);
+                assert(cfd);
+                const auto* const ucmp = cfd->user_comparator();
+                assert(ucmp);
+                return ucmp->timestamp_size();
+              });
+          if (s.ok()) {
+            ResetProtectionInfo();
+            s = batch_info.batch_->Iterate(this);
+            log_number_ref_ = 0;
+          }
+        }
+        // else the values are already inserted before the commit
+
+        if (s.ok()) {
+          db_->DeleteRecoveredTransaction(name.ToString());
+        }
+        if (has_valid_writes_) {
+          *has_valid_writes_ = true;
+        }
+      }
+    } else {
+      // When writes are not delayed until commit, there is no connection
+      // between a memtable write and the WAL that supports it. So the commit
+      // need not reference any log as the only log to which it depends.
+      assert(!write_after_commit_ || log_number_ref_ > 0);
+    }
+    constexpr bool batch_boundary = true;
+    MaybeAdvanceSeq(batch_boundary);
+
+    if (UNLIKELY(s.IsTryAgain())) {
+      DecrementProtectionInfoIdxForTryAgain();
+    }
+
+    return s;
+  }
+
+  Status MarkRollback(const Slice& name) override {
+    assert(db_);
+
+    if (recovering_log_number_ != 0) {
+      auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+      // the log containing the transactions prep section
+      // may have been released in the previous incarnation
+      // because we knew it had been rolled back
+      if (trx != nullptr) {
+        db_->DeleteRecoveredTransaction(name.ToString());
+      }
+    } else {
+      // in non recovery we simply ignore this tag
+    }
+
+    const bool batch_boundry = true;
+    MaybeAdvanceSeq(batch_boundry);
+
+    return Status::OK();
+  }
+
+ private:
+  MemTablePostProcessInfo* get_post_process_info(MemTable* mem) {
+    if (!concurrent_memtable_writes_) {
+      // No need to batch counters locally if we don't use concurrent mode.
+      return nullptr;
+    }
+    return &GetPostMap()[mem];
+  }
+};
+
+}  // anonymous namespace
+
+// This function can only be called in these conditions:
+// 1) During Recovery()
+// 2) During Write(), in a single-threaded write thread
+// 3) During Write(), in a concurrent context where memtables has been cloned
+// The reason is that it calls memtables->Seek(), which has a stateful cache
+Status WriteBatchInternal::InsertInto(
+    WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+    ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+    TrimHistoryScheduler* trim_history_scheduler,
+    bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db,
+    bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) {
+  MemTableInserter inserter(
+      sequence, memtables, flush_scheduler, trim_history_scheduler,
+      ignore_missing_column_families, recovery_log_number, db,
+      concurrent_memtable_writes, nullptr /* prot_info */,
+      nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn);
+  for (auto w : write_group) {
+    if (w->CallbackFailed()) {
+      continue;
+    }
+    w->sequence = inserter.sequence();
+    if (!w->ShouldWriteToMemtable()) {
+      // In seq_per_batch_ mode this advances the seq by one.
+      inserter.MaybeAdvanceSeq(true);
+      continue;
+    }
+    SetSequence(w->batch, inserter.sequence());
+    inserter.set_log_number_ref(w->log_ref);
+    inserter.set_prot_info(w->batch->prot_info_.get());
+    w->status = w->batch->Iterate(&inserter);
+    if (!w->status.ok()) {
+      return w->status;
+    }
+    assert(!seq_per_batch || w->batch_cnt != 0);
+    assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt);
+  }
+  return Status::OK();
+}
+
+Status WriteBatchInternal::InsertInto(
+    WriteThread::Writer* writer, SequenceNumber sequence,
+    ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+    TrimHistoryScheduler* trim_history_scheduler,
+    bool ignore_missing_column_families, uint64_t log_number, DB* db,
+    bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt,
+    bool batch_per_txn, bool hint_per_batch) {
+#ifdef NDEBUG
+  (void)batch_cnt;
+#endif
+  assert(writer->ShouldWriteToMemtable());
+  MemTableInserter inserter(sequence, memtables, flush_scheduler,
+                            trim_history_scheduler,
+                            ignore_missing_column_families, log_number, db,
+                            concurrent_memtable_writes, nullptr /* prot_info */,
+                            nullptr /*has_valid_writes*/, seq_per_batch,
+                            batch_per_txn, hint_per_batch);
+  SetSequence(writer->batch, sequence);
+  inserter.set_log_number_ref(writer->log_ref);
+  inserter.set_prot_info(writer->batch->prot_info_.get());
+  Status s = writer->batch->Iterate(&inserter);
+  assert(!seq_per_batch || batch_cnt != 0);
+  assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt);
+  if (concurrent_memtable_writes) {
+    inserter.PostProcess();
+  }
+  return s;
+}
+
+Status WriteBatchInternal::InsertInto(
+    const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+    FlushScheduler* flush_scheduler,
+    TrimHistoryScheduler* trim_history_scheduler,
+    bool ignore_missing_column_families, uint64_t log_number, DB* db,
+    bool concurrent_memtable_writes, SequenceNumber* next_seq,
+    bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) {
+  MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler,
+                            trim_history_scheduler,
+                            ignore_missing_column_families, log_number, db,
+                            concurrent_memtable_writes, batch->prot_info_.get(),
+                            has_valid_writes, seq_per_batch, batch_per_txn);
+  Status s = batch->Iterate(&inserter);
+  if (next_seq != nullptr) {
+    *next_seq = inserter.sequence();
+  }
+  if (concurrent_memtable_writes) {
+    inserter.PostProcess();
+  }
+  return s;
+}
+
+namespace {
+
+// This class updates protection info for a WriteBatch.
+class ProtectionInfoUpdater : public WriteBatch::Handler {
+ public:
+  explicit ProtectionInfoUpdater(WriteBatch::ProtectionInfo* prot_info)
+      : prot_info_(prot_info) {}
+
+  ~ProtectionInfoUpdater() override {}
+
+  Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override {
+    return UpdateProtInfo(cf, key, val, kTypeValue);
+  }
+
+  Status PutEntityCF(uint32_t cf, const Slice& key,
+                     const Slice& entity) override {
+    return UpdateProtInfo(cf, key, entity, kTypeWideColumnEntity);
+  }
+
+  Status DeleteCF(uint32_t cf, const Slice& key) override {
+    return UpdateProtInfo(cf, key, "", kTypeDeletion);
+  }
+
+  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+    return UpdateProtInfo(cf, key, "", kTypeSingleDeletion);
+  }
+
+  Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+                       const Slice& end_key) override {
+    return UpdateProtInfo(cf, begin_key, end_key, kTypeRangeDeletion);
+  }
+
+  Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override {
+    return UpdateProtInfo(cf, key, val, kTypeMerge);
+  }
+
+  Status PutBlobIndexCF(uint32_t cf, const Slice& key,
+                        const Slice& val) override {
+    return UpdateProtInfo(cf, key, val, kTypeBlobIndex);
+  }
+
+  Status MarkBeginPrepare(bool /* unprepare */) override {
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice& /* xid */) override {
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice& /* xid */) override { return Status::OK(); }
+
+  Status MarkCommitWithTimestamp(const Slice& /* xid */,
+                                 const Slice& /* ts */) override {
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice& /* xid */) override { return Status::OK(); }
+
+  Status MarkNoop(bool /* empty_batch */) override { return Status::OK(); }
+
+ private:
+  Status UpdateProtInfo(uint32_t cf, const Slice& key, const Slice& val,
+                        const ValueType op_type) {
+    if (prot_info_) {
+      prot_info_->entries_.emplace_back(
+          ProtectionInfo64().ProtectKVO(key, val, op_type).ProtectC(cf));
+    }
+    return Status::OK();
+  }
+
+  // No copy or move.
+  ProtectionInfoUpdater(const ProtectionInfoUpdater&) = delete;
+  ProtectionInfoUpdater(ProtectionInfoUpdater&&) = delete;
+  ProtectionInfoUpdater& operator=(const ProtectionInfoUpdater&) = delete;
+  ProtectionInfoUpdater& operator=(ProtectionInfoUpdater&&) = delete;
+
+  WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
+};
+
+}  // anonymous namespace
+
+Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  assert(contents.size() >= WriteBatchInternal::kHeader);
+  assert(b->prot_info_ == nullptr);
+
+  b->rep_.assign(contents.data(), contents.size());
+  b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed);
+  return Status::OK();
+}
+
+Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
+                                  const bool wal_only) {
+  assert(dst->Count() == 0 ||
+         (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr));
+  if ((src->prot_info_ != nullptr &&
+       src->prot_info_->entries_.size() != src->Count()) ||
+      (dst->prot_info_ != nullptr &&
+       dst->prot_info_->entries_.size() != dst->Count())) {
+    return Status::Corruption(
+        "Write batch has inconsistent count and number of checksums");
+  }
+
+  size_t src_len;
+  int src_count;
+  uint32_t src_flags;
+
+  const SavePoint& batch_end = src->GetWalTerminationPoint();
+
+  if (wal_only && !batch_end.is_cleared()) {
+    src_len = batch_end.size - WriteBatchInternal::kHeader;
+    src_count = batch_end.count;
+    src_flags = batch_end.content_flags;
+  } else {
+    src_len = src->rep_.size() - WriteBatchInternal::kHeader;
+    src_count = Count(src);
+    src_flags = src->content_flags_.load(std::memory_order_relaxed);
+  }
+
+  if (src->prot_info_ != nullptr) {
+    if (dst->prot_info_ == nullptr) {
+      dst->prot_info_.reset(new WriteBatch::ProtectionInfo());
+    }
+    std::copy(src->prot_info_->entries_.begin(),
+              src->prot_info_->entries_.begin() + src_count,
+              std::back_inserter(dst->prot_info_->entries_));
+  } else if (dst->prot_info_ != nullptr) {
+    // dst has empty prot_info->entries
+    // In this special case, we allow write batch without prot_info to
+    // be appende to write batch with empty prot_info
+    dst->prot_info_ = nullptr;
+  }
+  SetCount(dst, Count(dst) + src_count);
+  assert(src->rep_.size() >= WriteBatchInternal::kHeader);
+  dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
+  dst->content_flags_.store(
+      dst->content_flags_.load(std::memory_order_relaxed) | src_flags,
+      std::memory_order_relaxed);
+  return Status::OK();
+}
+
+size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
+                                            size_t rightByteSize) {
+  if (leftByteSize == 0 || rightByteSize == 0) {
+    return leftByteSize + rightByteSize;
+  } else {
+    return leftByteSize + rightByteSize - WriteBatchInternal::kHeader;
+  }
+}
+
+Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
+                                                size_t bytes_per_key,
+                                                uint64_t* checksum) {
+  if (bytes_per_key == 0) {
+    if (wb->prot_info_ != nullptr) {
+      wb->prot_info_.reset();
+      return Status::OK();
+    } else {
+      // Already not protected.
+      return Status::OK();
+    }
+  } else if (bytes_per_key == 8) {
+    if (wb->prot_info_ == nullptr) {
+      wb->prot_info_.reset(new WriteBatch::ProtectionInfo());
+      ProtectionInfoUpdater prot_info_updater(wb->prot_info_.get());
+      Status s = wb->Iterate(&prot_info_updater);
+      if (s.ok() && checksum != nullptr) {
+        uint64_t expected_hash = XXH3_64bits(wb->rep_.data(), wb->rep_.size());
+        if (expected_hash != *checksum) {
+          return Status::Corruption("Write batch content corrupted.");
+        }
+      }
+      return s;
+    } else {
+      // Already protected.
+      return Status::OK();
+    }
+  }
+  return Status::NotSupported(
+      "WriteBatch protection info must be zero or eight bytes/key");
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_base.cc b/src/rocksdb/db/write_batch_base.cc
new file mode 100644
index 000000000..e4c0e74bd
--- /dev/null
+++ b/src/rocksdb/db/write_batch_base.cc
@@ -0,0 +1,94 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/write_batch_base.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple implementation of SlicePart variants of Put().  Child classes
+// can override these method with more performant solutions if they choose.
+Status WriteBatchBase::Put(ColumnFamilyHandle* column_family,
+                           const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  return Put(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Put(const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  return Put(key_slice, value_slice);
+}
+
+Status WriteBatchBase::Delete(ColumnFamilyHandle* column_family,
+                              const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  return Delete(column_family, key_slice);
+}
+
+Status WriteBatchBase::Delete(const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  return Delete(key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(ColumnFamilyHandle* column_family,
+                                    const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  return SingleDelete(column_family, key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  return SingleDelete(key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(ColumnFamilyHandle* column_family,
+                                   const SliceParts& begin_key,
+                                   const SliceParts& end_key) {
+  std::string begin_key_buf, end_key_buf;
+  Slice begin_key_slice(begin_key, &begin_key_buf);
+  Slice end_key_slice(end_key, &end_key_buf);
+  return DeleteRange(column_family, begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(const SliceParts& begin_key,
+                                   const SliceParts& end_key) {
+  std::string begin_key_buf, end_key_buf;
+  Slice begin_key_slice(begin_key, &begin_key_buf);
+  Slice end_key_slice(end_key, &end_key_buf);
+  return DeleteRange(begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::Merge(ColumnFamilyHandle* column_family,
+                             const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  return Merge(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Merge(const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  return Merge(key_slice, value_slice);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_internal.h b/src/rocksdb/db/write_batch_internal.h
new file mode 100644
index 000000000..1be0bd140
--- /dev/null
+++ b/src/rocksdb/db/write_batch_internal.h
@@ -0,0 +1,401 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <array>
+#include <vector>
+
+#include "db/flush_scheduler.h"
+#include "db/kv_checksum.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_thread.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+class FlushScheduler;
+class ColumnFamilyData;
+
+class ColumnFamilyMemTables {
+ public:
+  virtual ~ColumnFamilyMemTables() {}
+  virtual bool Seek(uint32_t column_family_id) = 0;
+  // returns true if the update to memtable should be ignored
+  // (useful when recovering from log whose updates have already
+  // been processed)
+  virtual uint64_t GetLogNumber() const = 0;
+  virtual MemTable* GetMemTable() const = 0;
+  virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+  virtual ColumnFamilyData* current() { return nullptr; }
+};
+
+class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
+ public:
+  explicit ColumnFamilyMemTablesDefault(MemTable* mem)
+      : ok_(false), mem_(mem) {}
+
+  bool Seek(uint32_t column_family_id) override {
+    ok_ = (column_family_id == 0);
+    return ok_;
+  }
+
+  uint64_t GetLogNumber() const override { return 0; }
+
+  MemTable* GetMemTable() const override {
+    assert(ok_);
+    return mem_;
+  }
+
+  ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
+
+ private:
+  bool ok_;
+  MemTable* mem_;
+};
+
+struct WriteBatch::ProtectionInfo {
+  // `WriteBatch` usually doesn't contain a huge number of keys so protecting
+  // with a fixed, non-configurable eight bytes per key may work well enough.
+  autovector<ProtectionInfoKVOC64> entries_;
+
+  size_t GetBytesPerKey() const { return 8; }
+};
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+  // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+  static constexpr size_t kHeader = 12;
+
+  // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
+  static Status Put(WriteBatch* batch, uint32_t column_family_id,
+                    const Slice& key, const Slice& value);
+
+  static Status Put(WriteBatch* batch, uint32_t column_family_id,
+                    const SliceParts& key, const SliceParts& value);
+
+  static Status PutEntity(WriteBatch* batch, uint32_t column_family_id,
+                          const Slice& key, const WideColumns& columns);
+
+  static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+                       const SliceParts& key);
+
+  static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+                       const Slice& key);
+
+  static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+                             const SliceParts& key);
+
+  static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+                             const Slice& key);
+
+  static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+                            const Slice& begin_key, const Slice& end_key);
+
+  static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+                            const SliceParts& begin_key,
+                            const SliceParts& end_key);
+
+  static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+                      const Slice& key, const Slice& value);
+
+  static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+                      const SliceParts& key, const SliceParts& value);
+
+  static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
+                             const Slice& key, const Slice& value);
+
+  static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
+                               const bool write_after_commit = true,
+                               const bool unprepared_batch = false);
+
+  static Status MarkRollback(WriteBatch* batch, const Slice& xid);
+
+  static Status MarkCommit(WriteBatch* batch, const Slice& xid);
+
+  static Status MarkCommitWithTimestamp(WriteBatch* batch, const Slice& xid,
+                                        const Slice& commit_ts);
+
+  static Status InsertNoop(WriteBatch* batch);
+
+  // Return the number of entries in the batch.
+  static uint32_t Count(const WriteBatch* batch);
+
+  // Set the count for the number of entries in the batch.
+  static void SetCount(WriteBatch* batch, uint32_t n);
+
+  // Return the sequence number for the start of this batch.
+  static SequenceNumber Sequence(const WriteBatch* batch);
+
+  // Store the specified number as the sequence number for the start of
+  // this batch.
+  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+  // Returns the offset of the first entry in the batch.
+  // This offset is only valid if the batch is not empty.
+  static size_t GetFirstOffset(WriteBatch* batch);
+
+  static Slice Contents(const WriteBatch* batch) { return Slice(batch->rep_); }
+
+  static size_t ByteSize(const WriteBatch* batch) { return batch->rep_.size(); }
+
+  static Status SetContents(WriteBatch* batch, const Slice& contents);
+
+  static Status CheckSlicePartsLength(const SliceParts& key,
+                                      const SliceParts& value);
+
+  // Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive.
+  //
+  // If ignore_missing_column_families == true. WriteBatch
+  // referencing non-existing column family will be ignored.
+  // If ignore_missing_column_families == false, processing of the
+  // batches will be stopped if a reference is found to a non-existing
+  // column family and InvalidArgument() will be returned.  The writes
+  // in batches may be only partially applied at that point.
+  //
+  // If log_number is non-zero, the memtable will be updated only if
+  // memtables->GetLogNumber() >= log_number.
+  //
+  // If flush_scheduler is non-null, it will be invoked if the memtable
+  // should be flushed.
+  //
+  // Under concurrent use, the caller is responsible for making sure that
+  // the memtables object itself is thread-local.
+  static Status InsertInto(
+      WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+      ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+      TrimHistoryScheduler* trim_history_scheduler,
+      bool ignore_missing_column_families = false, uint64_t log_number = 0,
+      DB* db = nullptr, bool concurrent_memtable_writes = false,
+      bool seq_per_batch = false, bool batch_per_txn = true);
+
+  // Convenience form of InsertInto when you have only one batch
+  // next_seq returns the seq after last sequence number used in MemTable insert
+  static Status InsertInto(
+      const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+      FlushScheduler* flush_scheduler,
+      TrimHistoryScheduler* trim_history_scheduler,
+      bool ignore_missing_column_families = false, uint64_t log_number = 0,
+      DB* db = nullptr, bool concurrent_memtable_writes = false,
+      SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr,
+      bool seq_per_batch = false, bool batch_per_txn = true);
+
+  static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence,
+                           ColumnFamilyMemTables* memtables,
+                           FlushScheduler* flush_scheduler,
+                           TrimHistoryScheduler* trim_history_scheduler,
+                           bool ignore_missing_column_families = false,
+                           uint64_t log_number = 0, DB* db = nullptr,
+                           bool concurrent_memtable_writes = false,
+                           bool seq_per_batch = false, size_t batch_cnt = 0,
+                           bool batch_per_txn = true,
+                           bool hint_per_batch = false);
+
+  // Appends src write batch to dst write batch and updates count in dst
+  // write batch. Returns OK if the append is successful. Checks number of
+  // checksum against count in dst and src write batches, and returns Corruption
+  // if the count is inconsistent.
+  static Status Append(WriteBatch* dst, const WriteBatch* src,
+                       const bool WAL_only = false);
+
+  // Returns the byte size of appending a WriteBatch with ByteSize
+  // leftByteSize and a WriteBatch with ByteSize rightByteSize
+  static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize);
+
+  // Iterate over [begin, end) range of a write batch
+  static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler,
+                        size_t begin, size_t end);
+
+  // This write batch includes the latest state that should be persisted. Such
+  // state meant to be used only during recovery.
+  static void SetAsLatestPersistentState(WriteBatch* b);
+  static bool IsLatestPersistentState(const WriteBatch* b);
+
+  static std::tuple<Status, uint32_t, size_t> GetColumnFamilyIdAndTimestampSize(
+      WriteBatch* b, ColumnFamilyHandle* column_family);
+
+  static bool TimestampsUpdateNeeded(const WriteBatch& wb) {
+    return wb.needs_in_place_update_ts_;
+  }
+
+  static bool HasKeyWithTimestamp(const WriteBatch& wb) {
+    return wb.has_key_with_ts_;
+  }
+
+  // Update per-key value protection information on this write batch.
+  // If checksum is provided, the batch content is verfied against the checksum.
+  static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key,
+                                     uint64_t* checksum = nullptr);
+};
+
+// LocalSavePoint is similar to a scope guard
+class LocalSavePoint {
+ public:
+  explicit LocalSavePoint(WriteBatch* batch)
+      : batch_(batch),
+        savepoint_(batch->GetDataSize(), batch->Count(),
+                   batch->content_flags_.load(std::memory_order_relaxed))
+#ifndef NDEBUG
+        ,
+        committed_(false)
+#endif
+  {
+  }
+
+#ifndef NDEBUG
+  ~LocalSavePoint() { assert(committed_); }
+#endif
+  Status commit() {
+#ifndef NDEBUG
+    committed_ = true;
+#endif
+    if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
+      batch_->rep_.resize(savepoint_.size);
+      WriteBatchInternal::SetCount(batch_, savepoint_.count);
+      if (batch_->prot_info_ != nullptr) {
+        batch_->prot_info_->entries_.resize(savepoint_.count);
+      }
+      batch_->content_flags_.store(savepoint_.content_flags,
+                                   std::memory_order_relaxed);
+      return Status::MemoryLimit();
+    }
+    return Status::OK();
+  }
+
+ private:
+  WriteBatch* batch_;
+  SavePoint savepoint_;
+#ifndef NDEBUG
+  bool committed_;
+#endif
+};
+
+template <typename TimestampSizeFuncType>
+class TimestampUpdater : public WriteBatch::Handler {
+ public:
+  explicit TimestampUpdater(WriteBatch::ProtectionInfo* prot_info,
+                            TimestampSizeFuncType&& ts_sz_func, const Slice& ts)
+      : prot_info_(prot_info),
+        ts_sz_func_(std::move(ts_sz_func)),
+        timestamp_(ts) {
+    assert(!timestamp_.empty());
+  }
+
+  ~TimestampUpdater() override {}
+
+  Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status DeleteCF(uint32_t cf, const Slice& key) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+                       const Slice& end_key) override {
+    Status s = UpdateTimestamp(cf, begin_key, true /* is_key */);
+    if (s.ok()) {
+      s = UpdateTimestamp(cf, end_key, false /* is_key */);
+    }
+    return s;
+  }
+
+  Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override {
+    return UpdateTimestamp(cf, key);
+  }
+
+  Status MarkBeginPrepare(bool) override { return Status::OK(); }
+
+  Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+
+  Status MarkCommit(const Slice&) override { return Status::OK(); }
+
+  Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override { return Status::OK(); }
+
+  Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
+
+ private:
+  // @param is_key specifies whether the update is for key or value.
+  Status UpdateTimestamp(uint32_t cf, const Slice& buf, bool is_key = true) {
+    Status s = UpdateTimestampImpl(cf, buf, idx_, is_key);
+    ++idx_;
+    return s;
+  }
+
+  Status UpdateTimestampImpl(uint32_t cf, const Slice& buf, size_t /*idx*/,
+                             bool is_key) {
+    if (timestamp_.empty()) {
+      return Status::InvalidArgument("Timestamp is empty");
+    }
+    size_t cf_ts_sz = ts_sz_func_(cf);
+    if (0 == cf_ts_sz) {
+      // Skip this column family.
+      return Status::OK();
+    } else if (std::numeric_limits<size_t>::max() == cf_ts_sz) {
+      // Column family timestamp info not found.
+      return Status::NotFound();
+    } else if (cf_ts_sz != timestamp_.size()) {
+      return Status::InvalidArgument("timestamp size mismatch");
+    }
+    UpdateProtectionInformationIfNeeded(buf, timestamp_, is_key);
+
+    char* ptr = const_cast<char*>(buf.data() + buf.size() - cf_ts_sz);
+    assert(ptr);
+    memcpy(ptr, timestamp_.data(), timestamp_.size());
+    return Status::OK();
+  }
+
+  void UpdateProtectionInformationIfNeeded(const Slice& buf, const Slice& ts,
+                                           bool is_key) {
+    if (prot_info_ != nullptr) {
+      const size_t ts_sz = ts.size();
+      SliceParts old(&buf, 1);
+      Slice old_no_ts(buf.data(), buf.size() - ts_sz);
+      std::array<Slice, 2> new_key_cmpts{{old_no_ts, ts}};
+      SliceParts new_parts(new_key_cmpts.data(), 2);
+      if (is_key) {
+        prot_info_->entries_[idx_].UpdateK(old, new_parts);
+      } else {
+        prot_info_->entries_[idx_].UpdateV(old, new_parts);
+      }
+    }
+  }
+
+  // No copy or move.
+  TimestampUpdater(const TimestampUpdater&) = delete;
+  TimestampUpdater(TimestampUpdater&&) = delete;
+  TimestampUpdater& operator=(const TimestampUpdater&) = delete;
+  TimestampUpdater& operator=(TimestampUpdater&&) = delete;
+
+  WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
+  const TimestampSizeFuncType ts_sz_func_{};
+  const Slice timestamp_;
+  size_t idx_ = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_test.cc b/src/rocksdb/db/write_batch_test.cc
new file mode 100644
index 000000000..d233853e2
--- /dev/null
+++ b/src/rocksdb/db/write_batch_test.cc
@@ -0,0 +1,1114 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+
+#include "db/column_family.h"
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string PrintContents(WriteBatch* b,
+                                 bool merge_operator_supported = true) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  Options options;
+  options.memtable_factory = factory;
+  if (merge_operator_supported) {
+    options.merge_operator.reset(new TestPutOperator());
+  }
+  ImmutableOptions ioptions(options);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+                               kMaxSequenceNumber, 0 /* column_family_id */);
+  mem->Ref();
+  std::string state;
+  ColumnFamilyMemTablesDefault cf_mems_default(mem);
+  Status s =
+      WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr);
+  uint32_t count = 0;
+  int put_count = 0;
+  int delete_count = 0;
+  int single_delete_count = 0;
+  int delete_range_count = 0;
+  int merge_count = 0;
+  for (int i = 0; i < 2; ++i) {
+    Arena arena;
+    ScopedArenaIterator arena_iter_guard;
+    std::unique_ptr<InternalIterator> iter_guard;
+    InternalIterator* iter;
+    if (i == 0) {
+      iter = mem->NewIterator(ReadOptions(), &arena);
+      arena_iter_guard.set(iter);
+    } else {
+      iter = mem->NewRangeTombstoneIterator(ReadOptions(),
+                                            kMaxSequenceNumber /* read_seq */,
+                                            false /* immutable_memtable */);
+      iter_guard.reset(iter);
+    }
+    if (iter == nullptr) {
+      continue;
+    }
+    EXPECT_OK(iter->status());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ParsedInternalKey ikey;
+      ikey.clear();
+      EXPECT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+      switch (ikey.type) {
+        case kTypeValue:
+          state.append("Put(");
+          state.append(ikey.user_key.ToString());
+          state.append(", ");
+          state.append(iter->value().ToString());
+          state.append(")");
+          count++;
+          put_count++;
+          break;
+        case kTypeDeletion:
+          state.append("Delete(");
+          state.append(ikey.user_key.ToString());
+          state.append(")");
+          count++;
+          delete_count++;
+          break;
+        case kTypeSingleDeletion:
+          state.append("SingleDelete(");
+          state.append(ikey.user_key.ToString());
+          state.append(")");
+          count++;
+          single_delete_count++;
+          break;
+        case kTypeRangeDeletion:
+          state.append("DeleteRange(");
+          state.append(ikey.user_key.ToString());
+          state.append(", ");
+          state.append(iter->value().ToString());
+          state.append(")");
+          count++;
+          delete_range_count++;
+          break;
+        case kTypeMerge:
+          state.append("Merge(");
+          state.append(ikey.user_key.ToString());
+          state.append(", ");
+          state.append(iter->value().ToString());
+          state.append(")");
+          count++;
+          merge_count++;
+          break;
+        default:
+          assert(false);
+          break;
+      }
+      state.append("@");
+      state.append(std::to_string(ikey.sequence));
+    }
+    EXPECT_OK(iter->status());
+  }
+  if (s.ok()) {
+    EXPECT_EQ(b->HasPut(), put_count > 0);
+    EXPECT_EQ(b->HasDelete(), delete_count > 0);
+    EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0);
+    EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0);
+    EXPECT_EQ(b->HasMerge(), merge_count > 0);
+    if (count != WriteBatchInternal::Count(b)) {
+      state.append("CountMismatch()");
+    }
+  } else {
+    state.append(s.ToString());
+  }
+  delete mem->Unref();
+  return state;
+}
+
+class WriteBatchTest : public testing::Test {};
+
+TEST_F(WriteBatchTest, Empty) {
+  WriteBatch batch;
+  ASSERT_EQ("", PrintContents(&batch));
+  ASSERT_EQ(0u, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ(0u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Multiple) {
+  WriteBatch batch;
+  ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
+  ASSERT_OK(batch.Delete(Slice("box")));
+  ASSERT_OK(batch.DeleteRange(Slice("bar"), Slice("foo")));
+  ASSERT_OK(batch.Put(Slice("baz"), Slice("boo")));
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
+  ASSERT_EQ(4u, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ(
+      "Put(baz, boo)@103"
+      "Delete(box)@101"
+      "Put(foo, bar)@100"
+      "DeleteRange(bar, foo)@102",
+      PrintContents(&batch));
+  ASSERT_EQ(4u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Corruption) {
+  WriteBatch batch;
+  ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
+  ASSERT_OK(batch.Delete(Slice("box")));
+  WriteBatchInternal::SetSequence(&batch, 200);
+  Slice contents = WriteBatchInternal::Contents(&batch);
+  ASSERT_OK(WriteBatchInternal::SetContents(
+      &batch, Slice(contents.data(), contents.size() - 1)));
+  ASSERT_EQ(
+      "Put(foo, bar)@200"
+      "Corruption: bad WriteBatch Delete",
+      PrintContents(&batch));
+}
+
+TEST_F(WriteBatchTest, Append) {
+  WriteBatch b1, b2;
+  WriteBatchInternal::SetSequence(&b1, 200);
+  WriteBatchInternal::SetSequence(&b2, 300);
+  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+  ASSERT_EQ("", PrintContents(&b1));
+  ASSERT_EQ(0u, b1.Count());
+  ASSERT_OK(b2.Put("a", "va"));
+  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+  ASSERT_EQ("Put(a, va)@200", PrintContents(&b1));
+  ASSERT_EQ(1u, b1.Count());
+  b2.Clear();
+  ASSERT_OK(b2.Put("b", "vb"));
+  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+  ASSERT_EQ(
+      "Put(a, va)@200"
+      "Put(b, vb)@201",
+      PrintContents(&b1));
+  ASSERT_EQ(2u, b1.Count());
+  ASSERT_OK(b2.Delete("foo"));
+  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+  ASSERT_EQ(
+      "Put(a, va)@200"
+      "Put(b, vb)@202"
+      "Put(b, vb)@201"
+      "Delete(foo)@203",
+      PrintContents(&b1));
+  ASSERT_EQ(4u, b1.Count());
+  b2.Clear();
+  ASSERT_OK(b2.Put("c", "cc"));
+  ASSERT_OK(b2.Put("d", "dd"));
+  b2.MarkWalTerminationPoint();
+  ASSERT_OK(b2.Put("e", "ee"));
+  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true));
+  ASSERT_EQ(
+      "Put(a, va)@200"
+      "Put(b, vb)@202"
+      "Put(b, vb)@201"
+      "Put(c, cc)@204"
+      "Put(d, dd)@205"
+      "Delete(foo)@203",
+      PrintContents(&b1));
+  ASSERT_EQ(6u, b1.Count());
+  ASSERT_EQ(
+      "Put(c, cc)@0"
+      "Put(d, dd)@1"
+      "Put(e, ee)@2",
+      PrintContents(&b2));
+  ASSERT_EQ(3u, b2.Count());
+}
+
+TEST_F(WriteBatchTest, SingleDeletion) {
+  WriteBatch batch;
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ("", PrintContents(&batch));
+  ASSERT_EQ(0u, batch.Count());
+  ASSERT_OK(batch.Put("a", "va"));
+  ASSERT_EQ("Put(a, va)@100", PrintContents(&batch));
+  ASSERT_EQ(1u, batch.Count());
+  ASSERT_OK(batch.SingleDelete("a"));
+  ASSERT_EQ(
+      "SingleDelete(a)@101"
+      "Put(a, va)@100",
+      PrintContents(&batch));
+  ASSERT_EQ(2u, batch.Count());
+}
+
+namespace {
+struct TestHandler : public WriteBatch::Handler {
+  std::string seen;
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+               const Slice& value) override {
+    if (column_family_id == 0) {
+      seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
+    } else {
+      seen += "PutCF(" + std::to_string(column_family_id) + ", " +
+              key.ToString() + ", " + value.ToString() + ")";
+    }
+    return Status::OK();
+  }
+  Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+    if (column_family_id == 0) {
+      seen += "Delete(" + key.ToString() + ")";
+    } else {
+      seen += "DeleteCF(" + std::to_string(column_family_id) + ", " +
+              key.ToString() + ")";
+    }
+    return Status::OK();
+  }
+  Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
+    if (column_family_id == 0) {
+      seen += "SingleDelete(" + key.ToString() + ")";
+    } else {
+      seen += "SingleDeleteCF(" + std::to_string(column_family_id) + ", " +
+              key.ToString() + ")";
+    }
+    return Status::OK();
+  }
+  Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+                       const Slice& end_key) override {
+    if (column_family_id == 0) {
+      seen += "DeleteRange(" + begin_key.ToString() + ", " +
+              end_key.ToString() + ")";
+    } else {
+      seen += "DeleteRangeCF(" + std::to_string(column_family_id) + ", " +
+              begin_key.ToString() + ", " + end_key.ToString() + ")";
+    }
+    return Status::OK();
+  }
+  Status MergeCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override {
+    if (column_family_id == 0) {
+      seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
+    } else {
+      seen += "MergeCF(" + std::to_string(column_family_id) + ", " +
+              key.ToString() + ", " + value.ToString() + ")";
+    }
+    return Status::OK();
+  }
+  void LogData(const Slice& blob) override {
+    seen += "LogData(" + blob.ToString() + ")";
+  }
+  Status MarkBeginPrepare(bool unprepare) override {
+    seen +=
+        "MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")";
+    return Status::OK();
+  }
+  Status MarkEndPrepare(const Slice& xid) override {
+    seen += "MarkEndPrepare(" + xid.ToString() + ")";
+    return Status::OK();
+  }
+  Status MarkNoop(bool empty_batch) override {
+    seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")";
+    return Status::OK();
+  }
+  Status MarkCommit(const Slice& xid) override {
+    seen += "MarkCommit(" + xid.ToString() + ")";
+    return Status::OK();
+  }
+  Status MarkCommitWithTimestamp(const Slice& xid, const Slice& ts) override {
+    seen += "MarkCommitWithTimestamp(" + xid.ToString() + ", " +
+            ts.ToString(true) + ")";
+    return Status::OK();
+  }
+  Status MarkRollback(const Slice& xid) override {
+    seen += "MarkRollback(" + xid.ToString() + ")";
+    return Status::OK();
+  }
+};
+}  // anonymous namespace
+
+TEST_F(WriteBatchTest, PutNotImplemented) {
+  WriteBatch batch;
+  ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+  ASSERT_EQ(1u, batch.Count());
+  ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch));
+
+  WriteBatch::Handler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, DeleteNotImplemented) {
+  WriteBatch batch;
+  ASSERT_OK(batch.Delete(Slice("k2")));
+  ASSERT_EQ(1u, batch.Count());
+  ASSERT_EQ("Delete(k2)@0", PrintContents(&batch));
+
+  WriteBatch::Handler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, SingleDeleteNotImplemented) {
+  WriteBatch batch;
+  ASSERT_OK(batch.SingleDelete(Slice("k2")));
+  ASSERT_EQ(1u, batch.Count());
+  ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch));
+
+  WriteBatch::Handler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, MergeNotImplemented) {
+  WriteBatch batch;
+  ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+  ASSERT_EQ(1u, batch.Count());
+  ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch));
+
+  WriteBatch::Handler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, MergeWithoutOperatorInsertionFailure) {
+  WriteBatch batch;
+  ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+  ASSERT_EQ(1u, batch.Count());
+  ASSERT_EQ(
+      "Invalid argument: Merge requires `ColumnFamilyOptions::merge_operator "
+      "!= nullptr`",
+      PrintContents(&batch, false /* merge_operator_supported */));
+}
+
+TEST_F(WriteBatchTest, Blob) {
+  WriteBatch batch;
+  ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+  ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
+  ASSERT_OK(batch.Put(Slice("k3"), Slice("v3")));
+  ASSERT_OK(batch.PutLogData(Slice("blob1")));
+  ASSERT_OK(batch.Delete(Slice("k2")));
+  ASSERT_OK(batch.SingleDelete(Slice("k3")));
+  ASSERT_OK(batch.PutLogData(Slice("blob2")));
+  ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+  ASSERT_EQ(6u, batch.Count());
+  ASSERT_EQ(
+      "Merge(foo, bar)@5"
+      "Put(k1, v1)@0"
+      "Delete(k2)@3"
+      "Put(k2, v2)@1"
+      "SingleDelete(k3)@4"
+      "Put(k3, v3)@2",
+      PrintContents(&batch));
+
+  TestHandler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+  ASSERT_EQ(
+      "Put(k1, v1)"
+      "Put(k2, v2)"
+      "Put(k3, v3)"
+      "LogData(blob1)"
+      "Delete(k2)"
+      "SingleDelete(k3)"
+      "LogData(blob2)"
+      "Merge(foo, bar)",
+      handler.seen);
+}
+
+TEST_F(WriteBatchTest, PrepareCommit) {
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::InsertNoop(&batch));
+  ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+  ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
+  batch.SetSavePoint();
+  ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1")));
+  Status s = batch.RollbackToSavePoint();
+  ASSERT_EQ(s, Status::NotFound());
+  ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1")));
+  ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1")));
+  ASSERT_EQ(2u, batch.Count());
+
+  TestHandler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+  ASSERT_EQ(
+      "MarkBeginPrepare(false)"
+      "Put(k1, v1)"
+      "Put(k2, v2)"
+      "MarkEndPrepare(xid1)"
+      "MarkCommit(xid1)"
+      "MarkRollback(xid1)",
+      handler.seen);
+}
+
+// It requires more than 30GB of memory to run the test. With single memory
+// allocation of more than 30GB.
+// Not all platform can run it. Also it runs a long time. So disable it.
+TEST_F(WriteBatchTest, DISABLED_ManyUpdates) {
+  // Insert key and value of 3GB and push total batch size to 12GB.
+  static const size_t kKeyValueSize = 4u;
+  static const uint32_t kNumUpdates = uint32_t{3} << 30;
+  std::string raw(kKeyValueSize, 'A');
+  WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u);
+  char c = 'A';
+  for (uint32_t i = 0; i < kNumUpdates; i++) {
+    if (c > 'Z') {
+      c = 'A';
+    }
+    raw[0] = c;
+    raw[raw.length() - 1] = c;
+    c++;
+    ASSERT_OK(batch.Put(raw, raw));
+  }
+
+  ASSERT_EQ(kNumUpdates, batch.Count());
+
+  struct NoopHandler : public WriteBatch::Handler {
+    uint32_t num_seen = 0;
+    char expected_char = 'A';
+    Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+                 const Slice& value) override {
+      EXPECT_EQ(kKeyValueSize, key.size());
+      EXPECT_EQ(kKeyValueSize, value.size());
+      EXPECT_EQ(expected_char, key[0]);
+      EXPECT_EQ(expected_char, value[0]);
+      EXPECT_EQ(expected_char, key[kKeyValueSize - 1]);
+      EXPECT_EQ(expected_char, value[kKeyValueSize - 1]);
+      expected_char++;
+      if (expected_char > 'Z') {
+        expected_char = 'A';
+      }
+      ++num_seen;
+      return Status::OK();
+    }
+    Status DeleteCF(uint32_t /*column_family_id*/,
+                    const Slice& /*key*/) override {
+      ADD_FAILURE();
+      return Status::OK();
+    }
+    Status SingleDeleteCF(uint32_t /*column_family_id*/,
+                          const Slice& /*key*/) override {
+      ADD_FAILURE();
+      return Status::OK();
+    }
+    Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+                   const Slice& /*value*/) override {
+      ADD_FAILURE();
+      return Status::OK();
+    }
+    void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+    bool Continue() override { return num_seen < kNumUpdates; }
+  } handler;
+
+  ASSERT_OK(batch.Iterate(&handler));
+  ASSERT_EQ(kNumUpdates, handler.num_seen);
+}
+
+// The test requires more than 18GB memory to run it, with single memory
+// allocation of more than 12GB. Not all the platform can run it. So disable it.
+TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) {
+  // Insert key and value of 3GB and push total batch size to 12GB.
+  static const size_t kKeyValueSize = 3221225472u;
+  std::string raw(kKeyValueSize, 'A');
+  WriteBatch batch(size_t(12884901888ull + 1024u));
+  for (char i = 0; i < 2; i++) {
+    raw[0] = 'A' + i;
+    raw[raw.length() - 1] = 'A' - i;
+    ASSERT_OK(batch.Put(raw, raw));
+  }
+
+  ASSERT_EQ(2u, batch.Count());
+
+  struct NoopHandler : public WriteBatch::Handler {
+    int num_seen = 0;
+    Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+                 const Slice& value) override {
+      EXPECT_EQ(kKeyValueSize, key.size());
+      EXPECT_EQ(kKeyValueSize, value.size());
+      EXPECT_EQ('A' + num_seen, key[0]);
+      EXPECT_EQ('A' + num_seen, value[0]);
+      EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]);
+      EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]);
+      ++num_seen;
+      return Status::OK();
+    }
+    Status DeleteCF(uint32_t /*column_family_id*/,
+                    const Slice& /*key*/) override {
+      ADD_FAILURE();
+      return Status::OK();
+    }
+    Status SingleDeleteCF(uint32_t /*column_family_id*/,
+                          const Slice& /*key*/) override {
+      ADD_FAILURE();
+      return Status::OK();
+    }
+    Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+                   const Slice& /*value*/) override {
+      ADD_FAILURE();
+      return Status::OK();
+    }
+    void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+    bool Continue() override { return num_seen < 2; }
+  } handler;
+
+  ASSERT_OK(batch.Iterate(&handler));
+  ASSERT_EQ(2, handler.num_seen);
+}
+
+TEST_F(WriteBatchTest, Continue) {
+  WriteBatch batch;
+
+  struct Handler : public TestHandler {
+    int num_seen = 0;
+    Status PutCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override {
+      ++num_seen;
+      return TestHandler::PutCF(column_family_id, key, value);
+    }
+    Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+      ++num_seen;
+      return TestHandler::DeleteCF(column_family_id, key);
+    }
+    Status SingleDeleteCF(uint32_t column_family_id,
+                          const Slice& key) override {
+      ++num_seen;
+      return TestHandler::SingleDeleteCF(column_family_id, key);
+    }
+    Status MergeCF(uint32_t column_family_id, const Slice& key,
+                   const Slice& value) override {
+      ++num_seen;
+      return TestHandler::MergeCF(column_family_id, key, value);
+    }
+    void LogData(const Slice& blob) override {
+      ++num_seen;
+      TestHandler::LogData(blob);
+    }
+    bool Continue() override { return num_seen < 5; }
+  } handler;
+
+  ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+  ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
+  ASSERT_OK(batch.PutLogData(Slice("blob1")));
+  ASSERT_OK(batch.Delete(Slice("k1")));
+  ASSERT_OK(batch.SingleDelete(Slice("k2")));
+  ASSERT_OK(batch.PutLogData(Slice("blob2")));
+  ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+  ASSERT_OK(batch.Iterate(&handler));
+  ASSERT_EQ(
+      "Put(k1, v1)"
+      "Put(k2, v2)"
+      "LogData(blob1)"
+      "Delete(k1)"
+      "SingleDelete(k2)",
+      handler.seen);
+}
+
+TEST_F(WriteBatchTest, PutGatherSlices) {
+  WriteBatch batch;
+  ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
+
+  {
+    // Try a write where the key is one slice but the value is two
+    Slice key_slice("baz");
+    Slice value_slices[2] = {Slice("header"), Slice("payload")};
+    ASSERT_OK(
+        batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2)));
+  }
+
+  {
+    // One where the key is composite but the value is a single slice
+    Slice key_slices[3] = {Slice("key"), Slice("part2"), Slice("part3")};
+    Slice value_slice("value");
+    ASSERT_OK(
+        batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1)));
+  }
+
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ(
+      "Put(baz, headerpayload)@101"
+      "Put(foo, bar)@100"
+      "Put(keypart2part3, value)@102",
+      PrintContents(&batch));
+  ASSERT_EQ(3u, batch.Count());
+}
+
+namespace {
+class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+  explicit ColumnFamilyHandleImplDummy(int id)
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
+  explicit ColumnFamilyHandleImplDummy(int id, const Comparator* ucmp)
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+        id_(id),
+        ucmp_(ucmp) {}
+  uint32_t GetID() const override { return id_; }
+  const Comparator* GetComparator() const override { return ucmp_; }
+
+ private:
+  uint32_t id_;
+  const Comparator* const ucmp_ = BytewiseComparator();
+};
+}  // anonymous namespace
+
+TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
+  WriteBatch batch;
+  ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+  ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+  ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2")));
+  ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8")));
+  ASSERT_OK(batch.Delete(&eight, Slice("eightfoo")));
+  ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo")));
+  ASSERT_OK(batch.DeleteRange(&two, Slice("3foo"), Slice("4foo")));
+  ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three")));
+  ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+  ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom")));
+
+  TestHandler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+  ASSERT_EQ(
+      "Put(foo, bar)"
+      "PutCF(2, twofoo, bar2)"
+      "PutCF(8, eightfoo, bar8)"
+      "DeleteCF(8, eightfoo)"
+      "SingleDeleteCF(2, twofoo)"
+      "DeleteRangeCF(2, 3foo, 4foo)"
+      "MergeCF(3, threethree, 3three)"
+      "Put(foo, bar)"
+      "Merge(omom, nom)",
+      handler.seen);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
+  WriteBatchWithIndex batch;
+  ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+  ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+  ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2")));
+  ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8")));
+  ASSERT_OK(batch.Delete(&eight, Slice("eightfoo")));
+  ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo")));
+  ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three")));
+  ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+  ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom")));
+
+  std::unique_ptr<WBWIIterator> iter;
+
+  iter.reset(batch.NewIterator(&eight));
+  iter->Seek("eightfoo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar8", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
+  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  iter.reset(batch.NewIterator(&two));
+  iter->Seek("twofoo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar2", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kSingleDeleteRecord, iter->Entry().type);
+  ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  iter.reset(batch.NewIterator());
+  iter->Seek("gggg");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+  ASSERT_EQ("omom", iter->Entry().key.ToString());
+  ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  iter.reset(batch.NewIterator(&zero));
+  iter->Seek("foo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("foo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("foo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+  ASSERT_EQ("omom", iter->Entry().key.ToString());
+  ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  TestHandler handler;
+  ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler));
+  ASSERT_EQ(
+      "Put(foo, bar)"
+      "PutCF(2, twofoo, bar2)"
+      "PutCF(8, eightfoo, bar8)"
+      "DeleteCF(8, eightfoo)"
+      "SingleDeleteCF(2, twofoo)"
+      "MergeCF(3, threethree, 3three)"
+      "Put(foo, bar)"
+      "Merge(omom, nom)",
+      handler.seen);
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(WriteBatchTest, SavePointTest) {
+  Status s;
+  WriteBatch batch;
+  batch.SetSavePoint();
+
+  ASSERT_OK(batch.Put("A", "a"));
+  ASSERT_OK(batch.Put("B", "b"));
+  batch.SetSavePoint();
+
+  ASSERT_OK(batch.Put("C", "c"));
+  ASSERT_OK(batch.Delete("A"));
+  batch.SetSavePoint();
+  batch.SetSavePoint();
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ(
+      "Delete(A)@3"
+      "Put(A, a)@0"
+      "Put(B, b)@1"
+      "Put(C, c)@2",
+      PrintContents(&batch));
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ(
+      "Put(A, a)@0"
+      "Put(B, b)@1",
+      PrintContents(&batch));
+
+  ASSERT_OK(batch.Delete("A"));
+  ASSERT_OK(batch.Put("B", "bb"));
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ("", PrintContents(&batch));
+
+  s = batch.RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ("", PrintContents(&batch));
+
+  ASSERT_OK(batch.Put("D", "d"));
+  ASSERT_OK(batch.Delete("A"));
+
+  batch.SetSavePoint();
+
+  ASSERT_OK(batch.Put("A", "aaa"));
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ(
+      "Delete(A)@1"
+      "Put(D, d)@0",
+      PrintContents(&batch));
+
+  batch.SetSavePoint();
+
+  ASSERT_OK(batch.Put("D", "d"));
+  ASSERT_OK(batch.Delete("A"));
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ(
+      "Delete(A)@1"
+      "Put(D, d)@0",
+      PrintContents(&batch));
+
+  s = batch.RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ(
+      "Delete(A)@1"
+      "Put(D, d)@0",
+      PrintContents(&batch));
+
+  WriteBatch batch2;
+
+  s = batch2.RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ("", PrintContents(&batch2));
+
+  ASSERT_OK(batch2.Delete("A"));
+  batch2.SetSavePoint();
+
+  s = batch2.RollbackToSavePoint();
+  ASSERT_OK(s);
+  ASSERT_EQ("Delete(A)@0", PrintContents(&batch2));
+
+  batch2.Clear();
+  ASSERT_EQ("", PrintContents(&batch2));
+
+  batch2.SetSavePoint();
+
+  ASSERT_OK(batch2.Delete("B"));
+  ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+  batch2.SetSavePoint();
+  s = batch2.RollbackToSavePoint();
+  ASSERT_OK(s);
+  ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+  s = batch2.RollbackToSavePoint();
+  ASSERT_OK(s);
+  ASSERT_EQ("", PrintContents(&batch2));
+
+  s = batch2.RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ("", PrintContents(&batch2));
+
+  WriteBatch batch3;
+
+  s = batch3.PopSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ("", PrintContents(&batch3));
+
+  batch3.SetSavePoint();
+  ASSERT_OK(batch3.Delete("A"));
+
+  s = batch3.PopSavePoint();
+  ASSERT_OK(s);
+  ASSERT_EQ("Delete(A)@0", PrintContents(&batch3));
+}
+
+TEST_F(WriteBatchTest, MemoryLimitTest) {
+  Status s;
+  // The header size is 12 bytes. The two Puts take 8 bytes which gives total
+  // of 12 + 8 * 2 = 28 bytes.
+  WriteBatch batch(0, 28);
+
+  ASSERT_OK(batch.Put("a", "...."));
+  ASSERT_OK(batch.Put("b", "...."));
+  s = batch.Put("c", "....");
+  ASSERT_TRUE(s.IsMemoryLimit());
+}
+
+namespace {
+class TimestampChecker : public WriteBatch::Handler {
+ public:
+  explicit TimestampChecker(
+      std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps, Slice ts)
+      : cf_to_ucmps_(std::move(cf_to_ucmps)), timestamp_(std::move(ts)) {}
+  Status PutCF(uint32_t cf, const Slice& key, const Slice& /*value*/) override {
+    auto cf_iter = cf_to_ucmps_.find(cf);
+    if (cf_iter == cf_to_ucmps_.end()) {
+      return Status::Corruption();
+    }
+    const Comparator* const ucmp = cf_iter->second;
+    assert(ucmp);
+    size_t ts_sz = ucmp->timestamp_size();
+    if (ts_sz == 0) {
+      return Status::OK();
+    }
+    if (key.size() < ts_sz) {
+      return Status::Corruption();
+    }
+    Slice ts = ExtractTimestampFromUserKey(key, ts_sz);
+    if (ts.compare(timestamp_) != 0) {
+      return Status::Corruption();
+    }
+    return Status::OK();
+  }
+
+ private:
+  std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps_;
+  Slice timestamp_;
+};
+
+Status CheckTimestampsInWriteBatch(
+    WriteBatch& wb, Slice timestamp,
+    std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps) {
+  TimestampChecker ts_checker(cf_to_ucmps, timestamp);
+  return wb.Iterate(&ts_checker);
+}
+}  // anonymous namespace
+
+TEST_F(WriteBatchTest, SanityChecks) {
+  ColumnFamilyHandleImplDummy cf0(0,
+                                  test::BytewiseComparatorWithU64TsWrapper());
+  ColumnFamilyHandleImplDummy cf4(4);
+
+  WriteBatch wb(0, 0, 0, /*default_cf_ts_sz=*/sizeof(uint64_t));
+
+  // Sanity checks for the new WriteBatch APIs with extra 'ts' arg.
+  ASSERT_TRUE(wb.Put(nullptr, "key", "ts", "value").IsInvalidArgument());
+  ASSERT_TRUE(wb.Delete(nullptr, "key", "ts").IsInvalidArgument());
+  ASSERT_TRUE(wb.SingleDelete(nullptr, "key", "ts").IsInvalidArgument());
+  ASSERT_TRUE(wb.Merge(nullptr, "key", "ts", "value").IsInvalidArgument());
+  ASSERT_TRUE(wb.DeleteRange(nullptr, "begin_key", "end_key", "ts")
+                  .IsInvalidArgument());
+
+  ASSERT_TRUE(wb.Put(&cf4, "key", "ts", "value").IsInvalidArgument());
+  ASSERT_TRUE(wb.Delete(&cf4, "key", "ts").IsInvalidArgument());
+  ASSERT_TRUE(wb.SingleDelete(&cf4, "key", "ts").IsInvalidArgument());
+  ASSERT_TRUE(wb.Merge(&cf4, "key", "ts", "value").IsInvalidArgument());
+  ASSERT_TRUE(
+      wb.DeleteRange(&cf4, "begin_key", "end_key", "ts").IsInvalidArgument());
+
+  constexpr size_t wrong_ts_sz = 1 + sizeof(uint64_t);
+  std::string ts(wrong_ts_sz, '\0');
+
+  ASSERT_TRUE(wb.Put(&cf0, "key", ts, "value").IsInvalidArgument());
+  ASSERT_TRUE(wb.Delete(&cf0, "key", ts).IsInvalidArgument());
+  ASSERT_TRUE(wb.SingleDelete(&cf0, "key", ts).IsInvalidArgument());
+  ASSERT_TRUE(wb.Merge(&cf0, "key", ts, "value").IsInvalidArgument());
+  ASSERT_TRUE(
+      wb.DeleteRange(&cf0, "begin_key", "end_key", ts).IsInvalidArgument());
+
+  // Sanity checks for the new WriteBatch APIs without extra 'ts' arg.
+  WriteBatch wb1(0, 0, 0, wrong_ts_sz);
+  ASSERT_TRUE(wb1.Put(&cf0, "key", "value").IsInvalidArgument());
+  ASSERT_TRUE(wb1.Delete(&cf0, "key").IsInvalidArgument());
+  ASSERT_TRUE(wb1.SingleDelete(&cf0, "key").IsInvalidArgument());
+  ASSERT_TRUE(wb1.Merge(&cf0, "key", "value").IsInvalidArgument());
+  ASSERT_TRUE(
+      wb1.DeleteRange(&cf0, "begin_key", "end_key").IsInvalidArgument());
+}
+
+TEST_F(WriteBatchTest, UpdateTimestamps) {
+  // We assume the last eight bytes of each key is reserved for timestamps.
+  // Therefore, we must make sure each key is longer than eight bytes.
+  constexpr size_t key_size = 16;
+  constexpr size_t num_of_keys = 10;
+  std::vector<std::string> key_strs(num_of_keys, std::string(key_size, '\0'));
+
+  ColumnFamilyHandleImplDummy cf0(0);
+  ColumnFamilyHandleImplDummy cf4(4,
+                                  test::BytewiseComparatorWithU64TsWrapper());
+  ColumnFamilyHandleImplDummy cf5(5,
+                                  test::BytewiseComparatorWithU64TsWrapper());
+
+  const std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps = {
+      {0, cf0.GetComparator()},
+      {4, cf4.GetComparator()},
+      {5, cf5.GetComparator()}};
+
+  static constexpr size_t timestamp_size = sizeof(uint64_t);
+
+  {
+    WriteBatch wb1, wb2, wb3, wb4, wb5, wb6, wb7;
+    ASSERT_OK(wb1.Put(&cf0, "key", "value"));
+    ASSERT_FALSE(WriteBatchInternal::HasKeyWithTimestamp(wb1));
+    ASSERT_OK(wb2.Put(&cf4, "key", "value"));
+    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb2));
+    ASSERT_OK(wb3.Put(&cf4, "key", /*ts=*/std::string(timestamp_size, '\xfe'),
+                      "value"));
+    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb3));
+    ASSERT_OK(wb4.Delete(&cf4, "key",
+                         /*ts=*/std::string(timestamp_size, '\xfe')));
+    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb4));
+    ASSERT_OK(wb5.Delete(&cf4, "key"));
+    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb5));
+    ASSERT_OK(wb6.SingleDelete(&cf4, "key"));
+    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb6));
+    ASSERT_OK(wb7.SingleDelete(&cf4, "key",
+                               /*ts=*/std::string(timestamp_size, '\xfe')));
+    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb7));
+  }
+
+  WriteBatch batch;
+  // Write to the batch. We will assign timestamps later.
+  for (const auto& key_str : key_strs) {
+    ASSERT_OK(batch.Put(&cf0, key_str, "value"));
+    ASSERT_OK(batch.Put(&cf4, key_str, "value"));
+    ASSERT_OK(batch.Put(&cf5, key_str, "value"));
+  }
+
+  const auto checker1 = [](uint32_t cf) {
+    if (cf == 4 || cf == 5) {
+      return timestamp_size;
+    } else if (cf == 0) {
+      return static_cast<size_t>(0);
+    } else {
+      return std::numeric_limits<size_t>::max();
+    }
+  };
+  ASSERT_OK(
+      batch.UpdateTimestamps(std::string(timestamp_size, '\xfe'), checker1));
+  ASSERT_OK(CheckTimestampsInWriteBatch(
+      batch, std::string(timestamp_size, '\xfe'), cf_to_ucmps));
+
+  // We use indexed_cf_to_ucmps, non_indexed_cfs_with_ts and timestamp_size to
+  // simulate the case in which a transaction enables indexing for some writes
+  // while disables indexing for other writes. A transaction uses a
+  // WriteBatchWithIndex object to buffer writes (we consider Write-committed
+  // policy only). If indexing is enabled, then writes go through
+  // WriteBatchWithIndex API populating a WBWI internal data structure, i.e. a
+  // mapping from cf to user comparators. If indexing is disabled, a transaction
+  // writes directly to the underlying raw WriteBatch. We will need to track the
+  // comparator information for the column families to which un-indexed writes
+  // are performed. When calling UpdateTimestamp API of WriteBatch, we need
+  // indexed_cf_to_ucmps, non_indexed_cfs_with_ts, and timestamp_size to perform
+  // checking.
+  std::unordered_map<uint32_t, const Comparator*> indexed_cf_to_ucmps = {
+      {0, cf0.GetComparator()}, {4, cf4.GetComparator()}};
+  std::unordered_set<uint32_t> non_indexed_cfs_with_ts = {cf5.GetID()};
+  const auto checker2 = [&indexed_cf_to_ucmps,
+                         &non_indexed_cfs_with_ts](uint32_t cf) {
+    if (non_indexed_cfs_with_ts.count(cf) > 0) {
+      return timestamp_size;
+    }
+    auto cf_iter = indexed_cf_to_ucmps.find(cf);
+    if (cf_iter == indexed_cf_to_ucmps.end()) {
+      assert(false);
+      return std::numeric_limits<size_t>::max();
+    }
+    const Comparator* const ucmp = cf_iter->second;
+    assert(ucmp);
+    return ucmp->timestamp_size();
+  };
+  ASSERT_OK(
+      batch.UpdateTimestamps(std::string(timestamp_size, '\xef'), checker2));
+  ASSERT_OK(CheckTimestampsInWriteBatch(
+      batch, std::string(timestamp_size, '\xef'), cf_to_ucmps));
+}
+
+TEST_F(WriteBatchTest, CommitWithTimestamp) {
+  WriteBatch wb;
+  const std::string txn_name = "xid1";
+  std::string ts;
+  constexpr uint64_t commit_ts = 23;
+  PutFixed64(&ts, commit_ts);
+  ASSERT_OK(WriteBatchInternal::MarkCommitWithTimestamp(&wb, txn_name, ts));
+  TestHandler handler;
+  ASSERT_OK(wb.Iterate(&handler));
+  ASSERT_EQ("MarkCommitWithTimestamp(" + txn_name + ", " +
+                Slice(ts).ToString(true) + ")",
+            handler.seen);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_callback.h b/src/rocksdb/db/write_callback.h
new file mode 100644
index 000000000..106d02041
--- /dev/null
+++ b/src/rocksdb/db/write_callback.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+class WriteCallback {
+ public:
+  virtual ~WriteCallback() {}
+
+  // Will be called while on the write thread before the write executes.  If
+  // this function returns a non-OK status, the write will be aborted and this
+  // status will be returned to the caller of DB::Write().
+  virtual Status Callback(DB* db) = 0;
+
+  // return true if writes with this callback can be batched with other writes
+  virtual bool AllowWriteBatching() = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_callback_test.cc b/src/rocksdb/db/write_callback_test.cc
new file mode 100644
index 000000000..e6ebaae08
--- /dev/null
+++ b/src/rocksdb/db/write_callback_test.cc
@@ -0,0 +1,465 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/write_callback.h"
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteCallbackTest : public testing::Test {
+ public:
+  string dbname;
+
+  WriteCallbackTest() {
+    dbname = test::PerThreadDBPath("write_callback_testdb");
+  }
+};
+
+class WriteCallbackTestWriteCallback1 : public WriteCallback {
+ public:
+  bool was_called = false;
+
+  Status Callback(DB* db) override {
+    was_called = true;
+
+    // Make sure db is a DBImpl
+    DBImpl* db_impl = dynamic_cast<DBImpl*>(db);
+    if (db_impl == nullptr) {
+      return Status::InvalidArgument("");
+    }
+
+    return Status::OK();
+  }
+
+  bool AllowWriteBatching() override { return true; }
+};
+
+class WriteCallbackTestWriteCallback2 : public WriteCallback {
+ public:
+  Status Callback(DB* /*db*/) override { return Status::Busy(); }
+  bool AllowWriteBatching() override { return true; }
+};
+
+class MockWriteCallback : public WriteCallback {
+ public:
+  bool should_fail_ = false;
+  bool allow_batching_ = false;
+  std::atomic<bool> was_called_{false};
+
+  MockWriteCallback() {}
+
+  MockWriteCallback(const MockWriteCallback& other) {
+    should_fail_ = other.should_fail_;
+    allow_batching_ = other.allow_batching_;
+    was_called_.store(other.was_called_.load());
+  }
+
+  Status Callback(DB* /*db*/) override {
+    was_called_.store(true);
+    if (should_fail_) {
+      return Status::Busy();
+    } else {
+      return Status::OK();
+    }
+  }
+
+  bool AllowWriteBatching() override { return allow_batching_; }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class WriteCallbackPTest
+    : public WriteCallbackTest,
+      public ::testing::WithParamInterface<
+          std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
+ public:
+  WriteCallbackPTest() {
+    std::tie(unordered_write_, seq_per_batch_, two_queues_, allow_parallel_,
+             allow_batching_, enable_WAL_, enable_pipelined_write_) =
+        GetParam();
+  }
+
+ protected:
+  bool unordered_write_;
+  bool seq_per_batch_;
+  bool two_queues_;
+  bool allow_parallel_;
+  bool allow_batching_;
+  bool enable_WAL_;
+  bool enable_pipelined_write_;
+};
+
+TEST_P(WriteCallbackPTest, WriteWithCallbackTest) {
+  struct WriteOP {
+    WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; }
+
+    void Put(const string& key, const string& val) {
+      kvs_.push_back(std::make_pair(key, val));
+      ASSERT_OK(write_batch_.Put(key, val));
+    }
+
+    void Clear() {
+      kvs_.clear();
+      write_batch_.Clear();
+      callback_.was_called_.store(false);
+    }
+
+    MockWriteCallback callback_;
+    WriteBatch write_batch_;
+    std::vector<std::pair<string, string>> kvs_;
+  };
+
+  // In each scenario we'll launch multiple threads to write.
+  // The size of each array equals to number of threads, and
+  // each boolean in it denote whether callback of corresponding
+  // thread should succeed or fail.
+  std::vector<std::vector<WriteOP>> write_scenarios = {
+      {true},
+      {false},
+      {false, false},
+      {true, true},
+      {true, false},
+      {false, true},
+      {false, false, false},
+      {true, true, true},
+      {false, true, false},
+      {true, false, true},
+      {true, false, false, false, false},
+      {false, false, false, false, true},
+      {false, false, true, false, true},
+  };
+
+  for (auto& write_group : write_scenarios) {
+    Options options;
+    options.create_if_missing = true;
+    options.unordered_write = unordered_write_;
+    options.allow_concurrent_memtable_write = allow_parallel_;
+    options.enable_pipelined_write = enable_pipelined_write_;
+    options.two_write_queues = two_queues_;
+    // Skip unsupported combinations
+    if (options.enable_pipelined_write && seq_per_batch_) {
+      continue;
+    }
+    if (options.enable_pipelined_write && options.two_write_queues) {
+      continue;
+    }
+    if (options.unordered_write && !options.allow_concurrent_memtable_write) {
+      continue;
+    }
+    if (options.unordered_write && options.enable_pipelined_write) {
+      continue;
+    }
+
+    ReadOptions read_options;
+    DB* db;
+    DBImpl* db_impl;
+
+    ASSERT_OK(DestroyDB(dbname, options));
+
+    DBOptions db_options(options);
+    ColumnFamilyOptions cf_options(options);
+    std::vector<ColumnFamilyDescriptor> column_families;
+    column_families.push_back(
+        ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+    std::vector<ColumnFamilyHandle*> handles;
+    auto open_s = DBImpl::Open(db_options, dbname, column_families, &handles,
+                               &db, seq_per_batch_, true /* batch_per_txn */);
+    ASSERT_OK(open_s);
+    assert(handles.size() == 1);
+    delete handles[0];
+
+    db_impl = dynamic_cast<DBImpl*>(db);
+    ASSERT_TRUE(db_impl);
+
+    // Writers that have called JoinBatchGroup.
+    std::atomic<uint64_t> threads_joining(0);
+    // Writers that have linked to the queue
+    std::atomic<uint64_t> threads_linked(0);
+    // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point.
+    std::atomic<uint64_t> threads_verified(0);
+
+    std::atomic<uint64_t> seq(db_impl->GetLatestSequenceNumber());
+    ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0);
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::JoinBatchGroup:Start", [&](void*) {
+          uint64_t cur_threads_joining = threads_joining.fetch_add(1);
+          // Wait for the last joined writer to link to the queue.
+          // In this way the writers link to the queue one by one.
+          // This allows us to confidently detect the first writer
+          // who increases threads_linked as the leader.
+          while (threads_linked.load() < cur_threads_joining) {
+          }
+        });
+
+    // Verification once writers call JoinBatchGroup.
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+          uint64_t cur_threads_linked = threads_linked.fetch_add(1);
+          bool is_leader = false;
+          bool is_last = false;
+
+          // who am i
+          is_leader = (cur_threads_linked == 0);
+          is_last = (cur_threads_linked == write_group.size() - 1);
+
+          // check my state
+          auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+          if (is_leader) {
+            ASSERT_TRUE(writer->state ==
+                        WriteThread::State::STATE_GROUP_LEADER);
+          } else {
+            ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT);
+          }
+
+          // (meta test) the first WriteOP should indeed be the first
+          // and the last should be the last (all others can be out of
+          // order)
+          if (is_leader) {
+            ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+                        !write_group.front().callback_.should_fail_);
+          } else if (is_last) {
+            ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+                        !write_group.back().callback_.should_fail_);
+          }
+
+          threads_verified.fetch_add(1);
+          // Wait here until all verification in this sync-point
+          // callback finish for all writers.
+          while (threads_verified.load() < write_group.size()) {
+          }
+        });
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) {
+          // check my state
+          auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+          if (!allow_batching_) {
+            // no batching so everyone should be a leader
+            ASSERT_TRUE(writer->state ==
+                        WriteThread::State::STATE_GROUP_LEADER);
+          } else if (!allow_parallel_) {
+            ASSERT_TRUE(writer->state == WriteThread::State::STATE_COMPLETED ||
+                        (enable_pipelined_write_ &&
+                         writer->state ==
+                             WriteThread::State::STATE_MEMTABLE_WRITER_LEADER));
+          }
+        });
+
+    std::atomic<uint32_t> thread_num(0);
+    std::atomic<char> dummy_key(0);
+
+    // Each write thread create a random write batch and write to DB
+    // with a write callback.
+    std::function<void()> write_with_callback_func = [&]() {
+      uint32_t i = thread_num.fetch_add(1);
+      Random rnd(i);
+
+      // leaders gotta lead
+      while (i > 0 && threads_verified.load() < 1) {
+      }
+
+      // loser has to lose
+      while (i == write_group.size() - 1 &&
+             threads_verified.load() < write_group.size() - 1) {
+      }
+
+      auto& write_op = write_group.at(i);
+      write_op.Clear();
+      write_op.callback_.allow_batching_ = allow_batching_;
+
+      // insert some keys
+      for (uint32_t j = 0; j < rnd.Next() % 50; j++) {
+        // grab unique key
+        char my_key = dummy_key.fetch_add(1);
+
+        string skey(5, my_key);
+        string sval(10, my_key);
+        write_op.Put(skey, sval);
+
+        if (!write_op.callback_.should_fail_ && !seq_per_batch_) {
+          seq.fetch_add(1);
+        }
+      }
+      if (!write_op.callback_.should_fail_ && seq_per_batch_) {
+        seq.fetch_add(1);
+      }
+
+      WriteOptions woptions;
+      woptions.disableWAL = !enable_WAL_;
+      woptions.sync = enable_WAL_;
+      if (woptions.protection_bytes_per_key > 0) {
+        ASSERT_OK(WriteBatchInternal::UpdateProtectionInfo(
+            &write_op.write_batch_, woptions.protection_bytes_per_key));
+      }
+      Status s;
+      if (seq_per_batch_) {
+        class PublishSeqCallback : public PreReleaseCallback {
+         public:
+          PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {}
+          Status Callback(SequenceNumber last_seq, bool /*not used*/, uint64_t,
+                          size_t /*index*/, size_t /*total*/) override {
+            db_impl_->SetLastPublishedSequence(last_seq);
+            return Status::OK();
+          }
+          DBImpl* db_impl_;
+        } publish_seq_callback(db_impl);
+        // seq_per_batch_ requires a natural batch separator or Noop
+        ASSERT_OK(WriteBatchInternal::InsertNoop(&write_op.write_batch_));
+        const size_t ONE_BATCH = 1;
+        s = db_impl->WriteImpl(woptions, &write_op.write_batch_,
+                               &write_op.callback_, nullptr, 0, false, nullptr,
+                               ONE_BATCH,
+                               two_queues_ ? &publish_seq_callback : nullptr);
+      } else {
+        s = db_impl->WriteWithCallback(woptions, &write_op.write_batch_,
+                                       &write_op.callback_);
+      }
+
+      if (write_op.callback_.should_fail_) {
+        ASSERT_TRUE(s.IsBusy());
+      } else {
+        ASSERT_OK(s);
+      }
+    };
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    // do all the writes
+    std::vector<port::Thread> threads;
+    for (uint32_t i = 0; i < write_group.size(); i++) {
+      threads.emplace_back(write_with_callback_func);
+    }
+    for (auto& t : threads) {
+      t.join();
+    }
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+    // check for keys
+    string value;
+    for (auto& w : write_group) {
+      ASSERT_TRUE(w.callback_.was_called_.load());
+      for (auto& kvp : w.kvs_) {
+        if (w.callback_.should_fail_) {
+          ASSERT_TRUE(db->Get(read_options, kvp.first, &value).IsNotFound());
+        } else {
+          ASSERT_OK(db->Get(read_options, kvp.first, &value));
+          ASSERT_EQ(value, kvp.second);
+        }
+      }
+    }
+
+    ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence());
+
+    delete db;
+    ASSERT_OK(DestroyDB(dbname, options));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(WriteCallbackPTest, WriteCallbackPTest,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(WriteCallbackTest, WriteCallBackTest) {
+  Options options;
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  DB* db;
+  DBImpl* db_impl;
+
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+
+  db_impl = dynamic_cast<DBImpl*>(db);
+  ASSERT_TRUE(db_impl);
+
+  WriteBatch wb;
+
+  ASSERT_OK(wb.Put("a", "value.a"));
+  ASSERT_OK(wb.Delete("x"));
+
+  // Test a simple Write
+  s = db->Write(write_options, &wb);
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a", value);
+
+  // Test WriteWithCallback
+  WriteCallbackTestWriteCallback1 callback1;
+  WriteBatch wb2;
+
+  ASSERT_OK(wb2.Put("a", "value.a2"));
+
+  s = db_impl->WriteWithCallback(write_options, &wb2, &callback1);
+  ASSERT_OK(s);
+  ASSERT_TRUE(callback1.was_called);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a2", value);
+
+  // Test WriteWithCallback for a callback that fails
+  WriteCallbackTestWriteCallback2 callback2;
+  WriteBatch wb3;
+
+  ASSERT_OK(wb3.Put("a", "value.a3"));
+
+  s = db_impl->WriteWithCallback(write_options, &wb3, &callback2);
+  ASSERT_NOK(s);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a2", value);
+
+  delete db;
+  ASSERT_OK(DestroyDB(dbname, options));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_controller.cc b/src/rocksdb/db/write_controller.cc
new file mode 100644
index 000000000..c5f744375
--- /dev/null
+++ b/src/rocksdb/db/write_controller.cc
@@ -0,0 +1,121 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_controller.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <ratio>
+
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
+  ++total_stopped_;
+  return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
+    uint64_t write_rate) {
+  if (0 == total_delayed_++) {
+    // Starting delay, so reset counters.
+    next_refill_time_ = 0;
+    credit_in_bytes_ = 0;
+  }
+  // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in
+  // next_refill_time_ will be based on an old rate. This rate will apply
+  // for subsequent additional debts and for the next refill.
+  set_delayed_write_rate(write_rate);
+  return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken>
+WriteController::GetCompactionPressureToken() {
+  ++total_compaction_pressure_;
+  return std::unique_ptr<WriteControllerToken>(
+      new CompactionPressureToken(this));
+}
+
+bool WriteController::IsStopped() const {
+  return total_stopped_.load(std::memory_order_relaxed) > 0;
+}
+// This is inside DB mutex, so we can't sleep and need to minimize
+// frequency to get time.
+// If it turns out to be a performance issue, we can redesign the thread
+// synchronization model here.
+// The function trust caller will sleep micros returned.
+uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) {
+  if (total_stopped_.load(std::memory_order_relaxed) > 0) {
+    return 0;
+  }
+  if (total_delayed_.load(std::memory_order_relaxed) == 0) {
+    return 0;
+  }
+
+  if (credit_in_bytes_ >= num_bytes) {
+    credit_in_bytes_ -= num_bytes;
+    return 0;
+  }
+  // The frequency to get time inside DB mutex is less than one per refill
+  // interval.
+  auto time_now = NowMicrosMonotonic(clock);
+
+  const uint64_t kMicrosPerSecond = 1000000;
+  // Refill every 1 ms
+  const uint64_t kMicrosPerRefill = 1000;
+
+  if (next_refill_time_ == 0) {
+    // Start with an initial allotment of bytes for one interval
+    next_refill_time_ = time_now;
+  }
+  if (next_refill_time_ <= time_now) {
+    // Refill based on time interval plus any extra elapsed
+    uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill;
+    credit_in_bytes_ += static_cast<uint64_t>(
+        1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999);
+    next_refill_time_ = time_now + kMicrosPerRefill;
+
+    if (credit_in_bytes_ >= num_bytes) {
+      // Avoid delay if possible, to reduce DB mutex release & re-aquire.
+      credit_in_bytes_ -= num_bytes;
+      return 0;
+    }
+  }
+
+  // We need to delay to avoid exceeding write rate.
+  assert(num_bytes > credit_in_bytes_);
+  uint64_t bytes_over_budget = num_bytes - credit_in_bytes_;
+  uint64_t needed_delay = static_cast<uint64_t>(
+      1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond);
+
+  credit_in_bytes_ = 0;
+  next_refill_time_ += needed_delay;
+
+  // Minimum delay of refill interval, to reduce DB mutex contention.
+  return std::max(next_refill_time_ - time_now, kMicrosPerRefill);
+}
+
+uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) {
+  return clock->NowNanos() / std::milli::den;
+}
+
+StopWriteToken::~StopWriteToken() {
+  assert(controller_->total_stopped_ >= 1);
+  --controller_->total_stopped_;
+}
+
+DelayWriteToken::~DelayWriteToken() {
+  controller_->total_delayed_--;
+  assert(controller_->total_delayed_.load() >= 0);
+}
+
+CompactionPressureToken::~CompactionPressureToken() {
+  controller_->total_compaction_pressure_--;
+  assert(controller_->total_compaction_pressure_ >= 0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller.h b/src/rocksdb/db/write_controller.h
new file mode 100644
index 000000000..bcead165b
--- /dev/null
+++ b/src/rocksdb/db/write_controller.h
@@ -0,0 +1,148 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SystemClock;
+class WriteControllerToken;
+
+// WriteController is controlling write stalls in our write code-path. Write
+// stalls happen when compaction can't keep up with write rate.
+// All of the methods here (including WriteControllerToken's destructors) need
+// to be called while holding DB mutex
+class WriteController {
+ public:
+  explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u,
+                           int64_t low_pri_rate_bytes_per_sec = 1024 * 1024)
+      : total_stopped_(0),
+        total_delayed_(0),
+        total_compaction_pressure_(0),
+        credit_in_bytes_(0),
+        next_refill_time_(0),
+        low_pri_rate_limiter_(
+            NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) {
+    set_max_delayed_write_rate(_delayed_write_rate);
+  }
+  ~WriteController() = default;
+
+  // When an actor (column family) requests a stop token, all writes will be
+  // stopped until the stop token is released (deleted)
+  std::unique_ptr<WriteControllerToken> GetStopToken();
+  // When an actor (column family) requests a delay token, total delay for all
+  // writes to the DB will be controlled under the delayed write rate. Every
+  // write needs to call GetDelay() with number of bytes writing to the DB,
+  // which returns number of microseconds to sleep.
+  std::unique_ptr<WriteControllerToken> GetDelayToken(
+      uint64_t delayed_write_rate);
+  // When an actor (column family) requests a moderate token, compaction
+  // threads will be increased
+  std::unique_ptr<WriteControllerToken> GetCompactionPressureToken();
+
+  // these three metods are querying the state of the WriteController
+  bool IsStopped() const;
+  bool NeedsDelay() const { return total_delayed_.load() > 0; }
+  bool NeedSpeedupCompaction() const {
+    return IsStopped() || NeedsDelay() || total_compaction_pressure_.load() > 0;
+  }
+  // return how many microseconds the caller needs to sleep after the call
+  // num_bytes: how many number of bytes to put into the DB.
+  // Prerequisite: DB mutex held.
+  uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes);
+  void set_delayed_write_rate(uint64_t write_rate) {
+    // avoid divide 0
+    if (write_rate == 0) {
+      write_rate = 1u;
+    } else if (write_rate > max_delayed_write_rate()) {
+      write_rate = max_delayed_write_rate();
+    }
+    delayed_write_rate_ = write_rate;
+  }
+
+  void set_max_delayed_write_rate(uint64_t write_rate) {
+    // avoid divide 0
+    if (write_rate == 0) {
+      write_rate = 1u;
+    }
+    max_delayed_write_rate_ = write_rate;
+    // update delayed_write_rate_ as well
+    delayed_write_rate_ = write_rate;
+  }
+
+  uint64_t delayed_write_rate() const { return delayed_write_rate_; }
+
+  uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; }
+
+  RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); }
+
+ private:
+  uint64_t NowMicrosMonotonic(SystemClock* clock);
+
+  friend class WriteControllerToken;
+  friend class StopWriteToken;
+  friend class DelayWriteToken;
+  friend class CompactionPressureToken;
+
+  std::atomic<int> total_stopped_;
+  std::atomic<int> total_delayed_;
+  std::atomic<int> total_compaction_pressure_;
+
+  // Number of bytes allowed to write without delay
+  uint64_t credit_in_bytes_;
+  // Next time that we can add more credit of bytes
+  uint64_t next_refill_time_;
+  // Write rate set when initialization or by `DBImpl::SetDBOptions`
+  uint64_t max_delayed_write_rate_;
+  // Current write rate (bytes / second)
+  uint64_t delayed_write_rate_;
+
+  std::unique_ptr<RateLimiter> low_pri_rate_limiter_;
+};
+
+class WriteControllerToken {
+ public:
+  explicit WriteControllerToken(WriteController* controller)
+      : controller_(controller) {}
+  virtual ~WriteControllerToken() {}
+
+ protected:
+  WriteController* controller_;
+
+ private:
+  // no copying allowed
+  WriteControllerToken(const WriteControllerToken&) = delete;
+  void operator=(const WriteControllerToken&) = delete;
+};
+
+class StopWriteToken : public WriteControllerToken {
+ public:
+  explicit StopWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~StopWriteToken();
+};
+
+class DelayWriteToken : public WriteControllerToken {
+ public:
+  explicit DelayWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~DelayWriteToken();
+};
+
+class CompactionPressureToken : public WriteControllerToken {
+ public:
+  explicit CompactionPressureToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~CompactionPressureToken();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller_test.cc b/src/rocksdb/db/write_controller_test.cc
new file mode 100644
index 000000000..b6321a3bc
--- /dev/null
+++ b/src/rocksdb/db/write_controller_test.cc
@@ -0,0 +1,248 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/write_controller.h"
+
+#include <array>
+#include <ratio>
+
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class TimeSetClock : public SystemClockWrapper {
+ public:
+  explicit TimeSetClock() : SystemClockWrapper(nullptr) {}
+  const char* Name() const override { return "TimeSetClock"; }
+  uint64_t now_micros_ = 6666;
+  uint64_t NowNanos() override { return now_micros_ * std::milli::den; }
+};
+}  // anonymous namespace
+class WriteControllerTest : public testing::Test {
+ public:
+  WriteControllerTest() { clock_ = std::make_shared<TimeSetClock>(); }
+  std::shared_ptr<TimeSetClock> clock_;
+};
+
+// Make tests easier to read
+#define MILLION *1000000u
+#define MB MILLION
+#define MBPS MILLION
+#define SECS MILLION  // in microseconds
+
+TEST_F(WriteControllerTest, BasicAPI) {
+  WriteController controller(40 MBPS);  // also set max delayed rate
+  EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+  EXPECT_FALSE(controller.IsStopped());
+  EXPECT_FALSE(controller.NeedsDelay());
+  EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+
+  // set, get
+  controller.set_delayed_write_rate(20 MBPS);
+  EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS);
+  EXPECT_FALSE(controller.IsStopped());
+  EXPECT_FALSE(controller.NeedsDelay());
+  EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+
+  {
+    // set with token, get
+    auto delay_token_0 = controller.GetDelayToken(10 MBPS);
+    EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS);
+    EXPECT_FALSE(controller.IsStopped());
+    EXPECT_TRUE(controller.NeedsDelay());
+    // test with delay
+    EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB));
+    clock_->now_micros_ += 2 SECS;  // pay the "debt"
+
+    auto delay_token_1 = controller.GetDelayToken(2 MBPS);
+    EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB));
+    clock_->now_micros_ += 10 SECS;  // pay the "debt"
+
+    auto delay_token_2 = controller.GetDelayToken(1 MBPS);
+    EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB));
+    clock_->now_micros_ += 20 SECS;  // pay the "debt"
+
+    auto delay_token_3 = controller.GetDelayToken(20 MBPS);
+    EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB));
+    clock_->now_micros_ += 1 SECS;  // pay the "debt"
+
+    // 60M is more than the max rate of 40M. Max rate will be used.
+    EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS);
+    auto delay_token_4 =
+        controller.GetDelayToken(controller.delayed_write_rate() * 3);
+    EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+    EXPECT_EQ(static_cast<uint64_t>(0.5 SECS),
+              controller.GetDelay(clock_.get(), 20 MB));
+
+    EXPECT_FALSE(controller.IsStopped());
+    EXPECT_TRUE(controller.NeedsDelay());
+
+    // Test stop tokens
+    {
+      auto stop_token_1 = controller.GetStopToken();
+      EXPECT_TRUE(controller.IsStopped());
+      EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+      {
+        auto stop_token_2 = controller.GetStopToken();
+        EXPECT_TRUE(controller.IsStopped());
+        EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+      }
+      EXPECT_TRUE(controller.IsStopped());
+      EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+    }
+    // Stop tokens released
+    EXPECT_FALSE(controller.IsStopped());
+    EXPECT_TRUE(controller.NeedsDelay());
+    EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+    // pay the previous "debt"
+    clock_->now_micros_ += static_cast<uint64_t>(0.5 SECS);
+    EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 40 MB));
+  }
+
+  // Delay tokens released
+  EXPECT_FALSE(controller.NeedsDelay());
+}
+
+TEST_F(WriteControllerTest, StartFilled) {
+  WriteController controller(10 MBPS);
+
+  // Attempt to write two things that combined would be allowed within
+  // a single refill interval
+  auto delay_token_0 =
+      controller.GetDelayToken(controller.delayed_write_rate());
+
+  // Verify no delay because write rate has not been exceeded within
+  // refill interval.
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+
+  // Allow refill (kMicrosPerRefill)
+  clock_->now_micros_ += 1000;
+
+  // Again
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+
+  // Control: something bigger that would exceed write rate within interval
+  uint64_t delay = controller.GetDelay(clock_.get(), 10 MB);
+  EXPECT_GT(1.0 * delay, 0.999 SECS);
+  EXPECT_LT(1.0 * delay, 1.001 SECS);
+}
+
+TEST_F(WriteControllerTest, DebtAccumulation) {
+  WriteController controller(10 MBPS);
+
+  std::array<std::unique_ptr<WriteControllerToken>, 10> tokens;
+
+  // Accumulate a time delay debt with no passage of time, like many column
+  // families delaying writes simultaneously. (Old versions of WriteController
+  // would reset the debt on every GetDelayToken.)
+  uint64_t debt = 0;
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+    uint64_t delay = controller.GetDelay(clock_.get(), 63 MB);
+    ASSERT_GT(delay, debt);
+    uint64_t incremental = delay - debt;
+    ASSERT_EQ(incremental, (63 SECS) / (i + 1u));
+    debt += incremental;
+  }
+
+  // Pay down the debt
+  clock_->now_micros_ += debt;
+  debt = 0;
+
+  // Now accumulate debt with some passage of time.
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    // Debt is accumulated in time, not in bytes, so this new write
+    // limit is not applied to prior requested delays, even it they are
+    // in progress.
+    tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+    uint64_t delay = controller.GetDelay(clock_.get(), 63 MB);
+    ASSERT_GT(delay, debt);
+    uint64_t incremental = delay - debt;
+    ASSERT_EQ(incremental, (63 SECS) / (i + 1u));
+    debt += incremental;
+    uint64_t credit = debt / 2;
+    clock_->now_micros_ += credit;
+    debt -= credit;
+  }
+
+  // Pay down the debt
+  clock_->now_micros_ += debt;
+  debt = 0;    // consistent state
+  (void)debt;  // appease clang-analyze
+
+  // Verify paid down
+  EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+
+  // Accumulate another debt, without accounting, and releasing tokens
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    // Big and small are delayed
+    ASSERT_LT(0U, controller.GetDelay(clock_.get(), 63 MB));
+    ASSERT_LT(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+    tokens[i].reset();
+  }
+  // All tokens released.
+  // Verify that releasing all tokens pays down debt, even with no time passage.
+  tokens[0] = controller.GetDelayToken(1 MBPS);
+  ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+}
+
+// This may or may not be a "good" feature, but it's an old feature
+TEST_F(WriteControllerTest, CreditAccumulation) {
+  WriteController controller(10 MBPS);
+
+  std::array<std::unique_ptr<WriteControllerToken>, 10> tokens;
+
+  // Ensure started
+  tokens[0] = controller.GetDelayToken(1 MBPS);
+  ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+  clock_->now_micros_ += 10 SECS;
+
+  // Accumulate a credit
+  uint64_t credit = 1000 SECS /* see below: * 1 MB / 1 SEC */;
+  clock_->now_micros_ += credit;
+
+  // Spend some credit (burst of I/O)
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+    ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB));
+    // In WriteController, credit is accumulated in bytes, not in time.
+    // After an "unnecessary" delay, all of our time credit will be
+    // translated to bytes on the next operation, in this case with
+    // setting 1 MBPS. So regardless of the rate at delay time, we just
+    // account for the bytes.
+    credit -= 63 MB;
+  }
+  // Spend remaining credit
+  tokens[0] = controller.GetDelayToken(1 MBPS);
+  ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit));
+  // Verify
+  ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+  clock_->now_micros_ += 10 SECS;
+
+  // Accumulate a credit, no accounting
+  clock_->now_micros_ += 1000 SECS;
+
+  // Spend a small amount, releasing tokens
+  for (unsigned i = 0; i < tokens.size(); ++i) {
+    ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 3 MB));
+    tokens[i].reset();
+  }
+
+  // All tokens released.
+  // Verify credit is wiped away on new delay.
+  tokens[0] = controller.GetDelayToken(1 MBPS);
+  ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_thread.cc b/src/rocksdb/db/write_thread.cc
new file mode 100644
index 000000000..cc8645f37
--- /dev/null
+++ b/src/rocksdb/db/write_thread.cc
@@ -0,0 +1,815 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_thread.h"
+
+#include <chrono>
+#include <thread>
+
+#include "db/column_family.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+WriteThread::WriteThread(const ImmutableDBOptions& db_options)
+    : max_yield_usec_(db_options.enable_write_thread_adaptive_yield
+                          ? db_options.write_thread_max_yield_usec
+                          : 0),
+      slow_yield_usec_(db_options.write_thread_slow_yield_usec),
+      allow_concurrent_memtable_write_(
+          db_options.allow_concurrent_memtable_write),
+      enable_pipelined_write_(db_options.enable_pipelined_write),
+      max_write_batch_group_size_bytes(
+          db_options.max_write_batch_group_size_bytes),
+      newest_writer_(nullptr),
+      newest_memtable_writer_(nullptr),
+      last_sequence_(0),
+      write_stall_dummy_(),
+      stall_mu_(),
+      stall_cv_(&stall_mu_) {}
+
+uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) {
+  // We're going to block.  Lazily create the mutex.  We guarantee
+  // propagation of this construction to the waker via the
+  // STATE_LOCKED_WAITING state.  The waker won't try to touch the mutex
+  // or the condvar unless they CAS away the STATE_LOCKED_WAITING that
+  // we install below.
+  w->CreateMutex();
+
+  auto state = w->state.load(std::memory_order_acquire);
+  assert(state != STATE_LOCKED_WAITING);
+  if ((state & goal_mask) == 0 &&
+      w->state.compare_exchange_strong(state, STATE_LOCKED_WAITING)) {
+    // we have permission (and an obligation) to use StateMutex
+    std::unique_lock<std::mutex> guard(w->StateMutex());
+    w->StateCV().wait(guard, [w] {
+      return w->state.load(std::memory_order_relaxed) != STATE_LOCKED_WAITING;
+    });
+    state = w->state.load(std::memory_order_relaxed);
+  }
+  // else tricky.  Goal is met or CAS failed.  In the latter case the waker
+  // must have changed the state, and compare_exchange_strong has updated
+  // our local variable with the new one.  At the moment WriteThread never
+  // waits for a transition across intermediate states, so we know that
+  // since a state change has occurred the goal must have been met.
+  assert((state & goal_mask) != 0);
+  return state;
+}
+
+uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
+                                AdaptationContext* ctx) {
+  uint8_t state = 0;
+
+  // 1. Busy loop using "pause" for 1 micro sec
+  // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default)
+  // 3. Else blocking wait
+
+  // On a modern Xeon each loop takes about 7 nanoseconds (most of which
+  // is the effect of the pause instruction), so 200 iterations is a bit
+  // more than a microsecond.  This is long enough that waits longer than
+  // this can amortize the cost of accessing the clock and yielding.
+  for (uint32_t tries = 0; tries < 200; ++tries) {
+    state = w->state.load(std::memory_order_acquire);
+    if ((state & goal_mask) != 0) {
+      return state;
+    }
+    port::AsmVolatilePause();
+  }
+
+  // This is below the fast path, so that the stat is zero when all writes are
+  // from the same thread.
+  PERF_TIMER_GUARD(write_thread_wait_nanos);
+
+  // If we're only going to end up waiting a short period of time,
+  // it can be a lot more efficient to call std::this_thread::yield()
+  // in a loop than to block in StateMutex().  For reference, on my 4.0
+  // SELinux test server with support for syscall auditing enabled, the
+  // minimum latency between FUTEX_WAKE to returning from FUTEX_WAIT is
+  // 2.7 usec, and the average is more like 10 usec.  That can be a big
+  // drag on RockDB's single-writer design.  Of course, spinning is a
+  // bad idea if other threads are waiting to run or if we're going to
+  // wait for a long time.  How do we decide?
+  //
+  // We break waiting into 3 categories: short-uncontended,
+  // short-contended, and long.  If we had an oracle, then we would always
+  // spin for short-uncontended, always block for long, and our choice for
+  // short-contended might depend on whether we were trying to optimize
+  // RocksDB throughput or avoid being greedy with system resources.
+  //
+  // Bucketing into short or long is easy by measuring elapsed time.
+  // Differentiating short-uncontended from short-contended is a bit
+  // trickier, but not too bad.  We could look for involuntary context
+  // switches using getrusage(RUSAGE_THREAD, ..), but it's less work
+  // (portability code and CPU) to just look for yield calls that take
+  // longer than we expect.  sched_yield() doesn't actually result in any
+  // context switch overhead if there are no other runnable processes
+  // on the current core, in which case it usually takes less than
+  // a microsecond.
+  //
+  // There are two primary tunables here: the threshold between "short"
+  // and "long" waits, and the threshold at which we suspect that a yield
+  // is slow enough to indicate we should probably block.  If these
+  // thresholds are chosen well then CPU-bound workloads that don't
+  // have more threads than cores will experience few context switches
+  // (voluntary or involuntary), and the total number of context switches
+  // (voluntary and involuntary) will not be dramatically larger (maybe
+  // 2x) than the number of voluntary context switches that occur when
+  // --max_yield_wait_micros=0.
+  //
+  // There's another constant, which is the number of slow yields we will
+  // tolerate before reversing our previous decision.  Solitary slow
+  // yields are pretty common (low-priority small jobs ready to run),
+  // so this should be at least 2.  We set this conservatively to 3 so
+  // that we can also immediately schedule a ctx adaptation, rather than
+  // waiting for the next update_ctx.
+
+  const size_t kMaxSlowYieldsWhileSpinning = 3;
+
+  // Whether the yield approach has any credit in this context. The credit is
+  // added by yield being succesfull before timing out, and decreased otherwise.
+  auto& yield_credit = ctx->value;
+  // Update the yield_credit based on sample runs or right after a hard failure
+  bool update_ctx = false;
+  // Should we reinforce the yield credit
+  bool would_spin_again = false;
+  // The samling base for updating the yeild credit. The sampling rate would be
+  // 1/sampling_base.
+  const int sampling_base = 256;
+
+  if (max_yield_usec_ > 0) {
+    update_ctx = Random::GetTLSInstance()->OneIn(sampling_base);
+
+    if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) {
+      // we're updating the adaptation statistics, or spinning has >
+      // 50% chance of being shorter than max_yield_usec_ and causing no
+      // involuntary context switches
+      auto spin_begin = std::chrono::steady_clock::now();
+
+      // this variable doesn't include the final yield (if any) that
+      // causes the goal to be met
+      size_t slow_yield_count = 0;
+
+      auto iter_begin = spin_begin;
+      while ((iter_begin - spin_begin) <=
+             std::chrono::microseconds(max_yield_usec_)) {
+        std::this_thread::yield();
+
+        state = w->state.load(std::memory_order_acquire);
+        if ((state & goal_mask) != 0) {
+          // success
+          would_spin_again = true;
+          break;
+        }
+
+        auto now = std::chrono::steady_clock::now();
+        if (now == iter_begin ||
+            now - iter_begin >= std::chrono::microseconds(slow_yield_usec_)) {
+          // conservatively count it as a slow yield if our clock isn't
+          // accurate enough to measure the yield duration
+          ++slow_yield_count;
+          if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) {
+            // Not just one ivcsw, but several.  Immediately update yield_credit
+            // and fall back to blocking
+            update_ctx = true;
+            break;
+          }
+        }
+        iter_begin = now;
+      }
+    }
+  }
+
+  if ((state & goal_mask) == 0) {
+    TEST_SYNC_POINT_CALLBACK("WriteThread::AwaitState:BlockingWaiting", w);
+    state = BlockingAwaitState(w, goal_mask);
+  }
+
+  if (update_ctx) {
+    // Since our update is sample based, it is ok if a thread overwrites the
+    // updates by other threads. Thus the update does not have to be atomic.
+    auto v = yield_credit.load(std::memory_order_relaxed);
+    // fixed point exponential decay with decay constant 1/1024, with +1
+    // and -1 scaled to avoid overflow for int32_t
+    //
+    // On each update the positive credit is decayed by a facor of 1/1024 (i.e.,
+    // 0.1%). If the sampled yield was successful, the credit is also increased
+    // by X. Setting X=2^17 ensures that the credit never exceeds
+    // 2^17*2^10=2^27, which is lower than 2^31 the upperbound of int32_t. Same
+    // logic applies to negative credits.
+    v = v - (v / 1024) + (would_spin_again ? 1 : -1) * 131072;
+    yield_credit.store(v, std::memory_order_relaxed);
+  }
+
+  assert((state & goal_mask) != 0);
+  return state;
+}
+
+void WriteThread::SetState(Writer* w, uint8_t new_state) {
+  assert(w);
+  auto state = w->state.load(std::memory_order_acquire);
+  if (state == STATE_LOCKED_WAITING ||
+      !w->state.compare_exchange_strong(state, new_state)) {
+    assert(state == STATE_LOCKED_WAITING);
+
+    std::lock_guard<std::mutex> guard(w->StateMutex());
+    assert(w->state.load(std::memory_order_relaxed) != new_state);
+    w->state.store(new_state, std::memory_order_relaxed);
+    w->StateCV().notify_one();
+  }
+}
+
+bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
+  assert(newest_writer != nullptr);
+  assert(w->state == STATE_INIT);
+  Writer* writers = newest_writer->load(std::memory_order_relaxed);
+  while (true) {
+    // If write stall in effect, and w->no_slowdown is not true,
+    // block here until stall is cleared. If its true, then return
+    // immediately
+    if (writers == &write_stall_dummy_) {
+      if (w->no_slowdown) {
+        w->status = Status::Incomplete("Write stall");
+        SetState(w, STATE_COMPLETED);
+        return false;
+      }
+      // Since no_slowdown is false, wait here to be notified of the write
+      // stall clearing
+      {
+        MutexLock lock(&stall_mu_);
+        writers = newest_writer->load(std::memory_order_relaxed);
+        if (writers == &write_stall_dummy_) {
+          TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w);
+          stall_cv_.Wait();
+          // Load newest_writers_ again since it may have changed
+          writers = newest_writer->load(std::memory_order_relaxed);
+          continue;
+        }
+      }
+    }
+    w->link_older = writers;
+    if (newest_writer->compare_exchange_weak(writers, w)) {
+      return (writers == nullptr);
+    }
+  }
+}
+
+bool WriteThread::LinkGroup(WriteGroup& write_group,
+                            std::atomic<Writer*>* newest_writer) {
+  assert(newest_writer != nullptr);
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+  Writer* w = last_writer;
+  while (true) {
+    // Unset link_newer pointers to make sure when we call
+    // CreateMissingNewerLinks later it create all missing links.
+    w->link_newer = nullptr;
+    w->write_group = nullptr;
+    if (w == leader) {
+      break;
+    }
+    w = w->link_older;
+  }
+  Writer* newest = newest_writer->load(std::memory_order_relaxed);
+  while (true) {
+    leader->link_older = newest;
+    if (newest_writer->compare_exchange_weak(newest, last_writer)) {
+      return (newest == nullptr);
+    }
+  }
+}
+
+void WriteThread::CreateMissingNewerLinks(Writer* head) {
+  while (true) {
+    Writer* next = head->link_older;
+    if (next == nullptr || next->link_newer != nullptr) {
+      assert(next == nullptr || next->link_newer == head);
+      break;
+    }
+    next->link_newer = head;
+    head = next;
+  }
+}
+
+void WriteThread::CompleteLeader(WriteGroup& write_group) {
+  assert(write_group.size > 0);
+  Writer* leader = write_group.leader;
+  if (write_group.size == 1) {
+    write_group.leader = nullptr;
+    write_group.last_writer = nullptr;
+  } else {
+    assert(leader->link_newer != nullptr);
+    leader->link_newer->link_older = nullptr;
+    write_group.leader = leader->link_newer;
+  }
+  write_group.size -= 1;
+  SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) {
+  assert(write_group.size > 1);
+  assert(w != write_group.leader);
+  if (w == write_group.last_writer) {
+    w->link_older->link_newer = nullptr;
+    write_group.last_writer = w->link_older;
+  } else {
+    w->link_older->link_newer = w->link_newer;
+    w->link_newer->link_older = w->link_older;
+  }
+  write_group.size -= 1;
+  SetState(w, STATE_COMPLETED);
+}
+
+void WriteThread::BeginWriteStall() {
+  LinkOne(&write_stall_dummy_, &newest_writer_);
+
+  // Walk writer list until w->write_group != nullptr. The current write group
+  // will not have a mix of slowdown/no_slowdown, so its ok to stop at that
+  // point
+  Writer* w = write_stall_dummy_.link_older;
+  Writer* prev = &write_stall_dummy_;
+  while (w != nullptr && w->write_group == nullptr) {
+    if (w->no_slowdown) {
+      prev->link_older = w->link_older;
+      w->status = Status::Incomplete("Write stall");
+      SetState(w, STATE_COMPLETED);
+      // Only update `link_newer` if it's already set.
+      // `CreateMissingNewerLinks()` will update the nullptr `link_newer` later,
+      // which assumes the the first non-nullptr `link_newer` is the last
+      // nullptr link in the writer list.
+      // If `link_newer` is set here, `CreateMissingNewerLinks()` may stop
+      // updating the whole list when it sees the first non nullptr link.
+      if (prev->link_older && prev->link_older->link_newer) {
+        prev->link_older->link_newer = prev;
+      }
+      w = prev->link_older;
+    } else {
+      prev = w;
+      w = w->link_older;
+    }
+  }
+}
+
+void WriteThread::EndWriteStall() {
+  MutexLock lock(&stall_mu_);
+
+  // Unlink write_stall_dummy_ from the write queue. This will unblock
+  // pending write threads to enqueue themselves
+  assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_);
+  assert(write_stall_dummy_.link_older != nullptr);
+  write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer;
+  newest_writer_.exchange(write_stall_dummy_.link_older);
+
+  // Wake up writers
+  stall_cv_.SignalAll();
+}
+
+static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup");
+void WriteThread::JoinBatchGroup(Writer* w) {
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w);
+  assert(w->batch != nullptr);
+
+  bool linked_as_leader = LinkOne(w, &newest_writer_);
+
+  if (linked_as_leader) {
+    SetState(w, STATE_GROUP_LEADER);
+  }
+
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
+  TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait2", w);
+
+  if (!linked_as_leader) {
+    /**
+     * Wait util:
+     * 1) An existing leader pick us as the new leader when it finishes
+     * 2) An existing leader pick us as its follewer and
+     * 2.1) finishes the memtable writes on our behalf
+     * 2.2) Or tell us to finish the memtable writes in pralallel
+     * 3) (pipelined write) An existing leader pick us as its follower and
+     *    finish book-keeping and WAL write for us, enqueue us as pending
+     *    memtable writer, and
+     * 3.1) we become memtable writer group leader, or
+     * 3.2) an existing memtable writer group leader tell us to finish memtable
+     *      writes in parallel.
+     */
+    TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w);
+    AwaitState(w,
+               STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
+                   STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+               &jbg_ctx);
+    TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
+  }
+}
+
+size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
+                                            WriteGroup* write_group) {
+  assert(leader->link_older == nullptr);
+  assert(leader->batch != nullptr);
+  assert(write_group != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = max_write_batch_group_size_bytes;
+  const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+  if (size <= min_batch_size_bytes) {
+    max_size = size + min_batch_size_bytes;
+  }
+
+  leader->write_group = write_group;
+  write_group->leader = leader;
+  write_group->last_writer = leader;
+  write_group->size = 1;
+  Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
+
+  // This is safe regardless of any db mutex status of the caller. Previous
+  // calls to ExitAsGroupLeader either didn't call CreateMissingNewerLinks
+  // (they emptied the list and then we added ourself as leader) or had to
+  // explicitly wake us up (the list was non-empty when we added ourself,
+  // so we have already received our MarkJoined).
+  CreateMissingNewerLinks(newest_writer);
+
+  // Tricky. Iteration start (leader) is exclusive and finish
+  // (newest_writer) is inclusive. Iteration goes from old to new.
+  Writer* w = leader;
+  while (w != newest_writer) {
+    assert(w->link_newer);
+    w = w->link_newer;
+
+    if (w->sync && !leader->sync) {
+      // Do not include a sync write into a batch handled by a non-sync write.
+      break;
+    }
+
+    if (w->no_slowdown != leader->no_slowdown) {
+      // Do not mix writes that are ok with delays with the ones that
+      // request fail on delays.
+      break;
+    }
+
+    if (w->disable_wal != leader->disable_wal) {
+      // Do not mix writes that enable WAL with the ones whose
+      // WAL disabled.
+      break;
+    }
+
+    if (w->protection_bytes_per_key != leader->protection_bytes_per_key) {
+      // Do not mix writes with different levels of integrity protection.
+      break;
+    }
+
+    if (w->rate_limiter_priority != leader->rate_limiter_priority) {
+      // Do not mix writes with different rate limiter priorities.
+      break;
+    }
+
+    if (w->batch == nullptr) {
+      // Do not include those writes with nullptr batch. Those are not writes,
+      // those are something else. They want to be alone
+      break;
+    }
+
+    if (w->callback != nullptr && !w->callback->AllowWriteBatching()) {
+      // don't batch writes that don't want to be batched
+      break;
+    }
+
+    auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+    if (size + batch_size > max_size) {
+      // Do not make batch too big
+      break;
+    }
+
+    w->write_group = write_group;
+    size += batch_size;
+    write_group->last_writer = w;
+    write_group->size++;
+  }
+  TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w);
+  return size;
+}
+
+void WriteThread::EnterAsMemTableWriter(Writer* leader,
+                                        WriteGroup* write_group) {
+  assert(leader != nullptr);
+  assert(leader->link_older == nullptr);
+  assert(leader->batch != nullptr);
+  assert(write_group != nullptr);
+
+  size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = max_write_batch_group_size_bytes;
+  const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+  if (size <= min_batch_size_bytes) {
+    max_size = size + min_batch_size_bytes;
+  }
+
+  leader->write_group = write_group;
+  write_group->leader = leader;
+  write_group->size = 1;
+  Writer* last_writer = leader;
+
+  if (!allow_concurrent_memtable_write_ || !leader->batch->HasMerge()) {
+    Writer* newest_writer = newest_memtable_writer_.load();
+    CreateMissingNewerLinks(newest_writer);
+
+    Writer* w = leader;
+    while (w != newest_writer) {
+      assert(w->link_newer);
+      w = w->link_newer;
+
+      if (w->batch == nullptr) {
+        break;
+      }
+
+      if (w->batch->HasMerge()) {
+        break;
+      }
+
+      if (!allow_concurrent_memtable_write_) {
+        auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+        if (size + batch_size > max_size) {
+          // Do not make batch too big
+          break;
+        }
+        size += batch_size;
+      }
+
+      w->write_group = write_group;
+      last_writer = w;
+      write_group->size++;
+    }
+  }
+
+  write_group->last_writer = last_writer;
+  write_group->last_sequence =
+      last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) - 1;
+}
+
+void WriteThread::ExitAsMemTableWriter(Writer* /*self*/,
+                                       WriteGroup& write_group) {
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+
+  Writer* newest_writer = last_writer;
+  if (!newest_memtable_writer_.compare_exchange_strong(newest_writer,
+                                                       nullptr)) {
+    CreateMissingNewerLinks(newest_writer);
+    Writer* next_leader = last_writer->link_newer;
+    assert(next_leader != nullptr);
+    next_leader->link_older = nullptr;
+    SetState(next_leader, STATE_MEMTABLE_WRITER_LEADER);
+  }
+  Writer* w = leader;
+  while (true) {
+    if (!write_group.status.ok()) {
+      w->status = write_group.status;
+    }
+    Writer* next = w->link_newer;
+    if (w != leader) {
+      SetState(w, STATE_COMPLETED);
+    }
+    if (w == last_writer) {
+      break;
+    }
+    assert(next);
+    w = next;
+  }
+  // Note that leader has to exit last, since it owns the write group.
+  SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) {
+  assert(write_group != nullptr);
+  write_group->running.store(write_group->size);
+  for (auto w : *write_group) {
+    SetState(w, STATE_PARALLEL_MEMTABLE_WRITER);
+  }
+}
+
+static WriteThread::AdaptationContext cpmtw_ctx(
+    "CompleteParallelMemTableWriter");
+// This method is called by both the leader and parallel followers
+bool WriteThread::CompleteParallelMemTableWriter(Writer* w) {
+  auto* write_group = w->write_group;
+  if (!w->status.ok()) {
+    std::lock_guard<std::mutex> guard(write_group->leader->StateMutex());
+    write_group->status = w->status;
+  }
+
+  if (write_group->running-- > 1) {
+    // we're not the last one
+    AwaitState(w, STATE_COMPLETED, &cpmtw_ctx);
+    return false;
+  }
+  // else we're the last parallel worker and should perform exit duties.
+  w->status = write_group->status;
+  // Callers of this function must ensure w->status is checked.
+  write_group->status.PermitUncheckedError();
+  return true;
+}
+
+void WriteThread::ExitAsBatchGroupFollower(Writer* w) {
+  auto* write_group = w->write_group;
+
+  assert(w->state == STATE_PARALLEL_MEMTABLE_WRITER);
+  assert(write_group->status.ok());
+  ExitAsBatchGroupLeader(*write_group, write_group->status);
+  assert(w->status.ok());
+  assert(w->state == STATE_COMPLETED);
+  SetState(write_group->leader, STATE_COMPLETED);
+}
+
+static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader");
+void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
+                                         Status& status) {
+  TEST_SYNC_POINT_CALLBACK("WriteThread::ExitAsBatchGroupLeader:Start",
+                           &write_group);
+
+  Writer* leader = write_group.leader;
+  Writer* last_writer = write_group.last_writer;
+  assert(leader->link_older == nullptr);
+
+  // If status is non-ok already, then write_group.status won't have the chance
+  // of being propagated to caller.
+  if (!status.ok()) {
+    write_group.status.PermitUncheckedError();
+  }
+
+  // Propagate memtable write error to the whole group.
+  if (status.ok() && !write_group.status.ok()) {
+    status = write_group.status;
+  }
+
+  if (enable_pipelined_write_) {
+    // We insert a dummy Writer right before our current write_group. This
+    // allows us to unlink our write_group without the risk that a subsequent
+    // writer becomes a new leader and might overtake us and add itself to the
+    // memtable-writer-list before we can do so. This ensures that writers are
+    // added to the memtable-writer-list in the exact same order in which they
+    // were in the newest_writer list.
+    // This must happen before completing the writers from our group to prevent
+    // a race where the owning thread of one of these writers can start a new
+    // write operation.
+    Writer dummy;
+    Writer* head = newest_writer_.load(std::memory_order_acquire);
+    if (head != last_writer ||
+        !newest_writer_.compare_exchange_strong(head, &dummy)) {
+      // Either last_writer wasn't the head during the load(), or it was the
+      // head during the load() but somebody else pushed onto the list before
+      // we did the compare_exchange_strong (causing it to fail). In the latter
+      // case compare_exchange_strong has the effect of re-reading its first
+      // param (head). No need to retry a failing CAS, because only a departing
+      // leader (which we are at the moment) can remove nodes from the list.
+      assert(head != last_writer);
+
+      // After walking link_older starting from head (if not already done) we
+      // will be able to traverse w->link_newer below.
+      CreateMissingNewerLinks(head);
+      assert(last_writer->link_newer != nullptr);
+      last_writer->link_newer->link_older = &dummy;
+      dummy.link_newer = last_writer->link_newer;
+    }
+
+    // Complete writers that don't write to memtable
+    for (Writer* w = last_writer; w != leader;) {
+      Writer* next = w->link_older;
+      w->status = status;
+      if (!w->ShouldWriteToMemtable()) {
+        CompleteFollower(w, write_group);
+      }
+      w = next;
+    }
+    if (!leader->ShouldWriteToMemtable()) {
+      CompleteLeader(write_group);
+    }
+
+    TEST_SYNC_POINT_CALLBACK(
+        "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters",
+        &write_group);
+
+    // Link the remaining of the group to memtable writer list.
+    // We have to link our group to memtable writer queue before wake up the
+    // next leader or set newest_writer_ to null, otherwise the next leader
+    // can run ahead of us and link to memtable writer queue before we do.
+    if (write_group.size > 0) {
+      if (LinkGroup(write_group, &newest_memtable_writer_)) {
+        // The leader can now be different from current writer.
+        SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER);
+      }
+    }
+
+    // Unlink the dummy writer from the list and identify the new leader
+    head = newest_writer_.load(std::memory_order_acquire);
+    if (head != &dummy ||
+        !newest_writer_.compare_exchange_strong(head, nullptr)) {
+      CreateMissingNewerLinks(head);
+      Writer* new_leader = dummy.link_newer;
+      assert(new_leader != nullptr);
+      new_leader->link_older = nullptr;
+      SetState(new_leader, STATE_GROUP_LEADER);
+    }
+
+    AwaitState(leader,
+               STATE_MEMTABLE_WRITER_LEADER | STATE_PARALLEL_MEMTABLE_WRITER |
+                   STATE_COMPLETED,
+               &eabgl_ctx);
+  } else {
+    Writer* head = newest_writer_.load(std::memory_order_acquire);
+    if (head != last_writer ||
+        !newest_writer_.compare_exchange_strong(head, nullptr)) {
+      // Either last_writer wasn't the head during the load(), or it was the
+      // head during the load() but somebody else pushed onto the list before
+      // we did the compare_exchange_strong (causing it to fail).  In the
+      // latter case compare_exchange_strong has the effect of re-reading
+      // its first param (head).  No need to retry a failing CAS, because
+      // only a departing leader (which we are at the moment) can remove
+      // nodes from the list.
+      assert(head != last_writer);
+
+      // After walking link_older starting from head (if not already done)
+      // we will be able to traverse w->link_newer below. This function
+      // can only be called from an active leader, only a leader can
+      // clear newest_writer_, we didn't, and only a clear newest_writer_
+      // could cause the next leader to start their work without a call
+      // to MarkJoined, so we can definitely conclude that no other leader
+      // work is going on here (with or without db mutex).
+      CreateMissingNewerLinks(head);
+      assert(last_writer->link_newer != nullptr);
+      assert(last_writer->link_newer->link_older == last_writer);
+      last_writer->link_newer->link_older = nullptr;
+
+      // Next leader didn't self-identify, because newest_writer_ wasn't
+      // nullptr when they enqueued (we were definitely enqueued before them
+      // and are still in the list).  That means leader handoff occurs when
+      // we call MarkJoined
+      SetState(last_writer->link_newer, STATE_GROUP_LEADER);
+    }
+    // else nobody else was waiting, although there might already be a new
+    // leader now
+
+    while (last_writer != leader) {
+      assert(last_writer);
+      last_writer->status = status;
+      // we need to read link_older before calling SetState, because as soon
+      // as it is marked committed the other thread's Await may return and
+      // deallocate the Writer.
+      auto next = last_writer->link_older;
+      SetState(last_writer, STATE_COMPLETED);
+
+      last_writer = next;
+    }
+  }
+}
+
+static WriteThread::AdaptationContext eu_ctx("EnterUnbatched");
+void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
+  assert(w != nullptr && w->batch == nullptr);
+  mu->Unlock();
+  bool linked_as_leader = LinkOne(w, &newest_writer_);
+  if (!linked_as_leader) {
+    TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait");
+    // Last leader will not pick us as a follower since our batch is nullptr
+    AwaitState(w, STATE_GROUP_LEADER, &eu_ctx);
+  }
+  if (enable_pipelined_write_) {
+    WaitForMemTableWriters();
+  }
+  mu->Lock();
+}
+
+void WriteThread::ExitUnbatched(Writer* w) {
+  assert(w != nullptr);
+  Writer* newest_writer = w;
+  if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
+    CreateMissingNewerLinks(newest_writer);
+    Writer* next_leader = w->link_newer;
+    assert(next_leader != nullptr);
+    next_leader->link_older = nullptr;
+    SetState(next_leader, STATE_GROUP_LEADER);
+  }
+}
+
+static WriteThread::AdaptationContext wfmw_ctx("WaitForMemTableWriters");
+void WriteThread::WaitForMemTableWriters() {
+  assert(enable_pipelined_write_);
+  if (newest_memtable_writer_.load() == nullptr) {
+    return;
+  }
+  Writer w;
+  if (!LinkOne(&w, &newest_memtable_writer_)) {
+    AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &wfmw_ctx);
+  }
+  newest_memtable_writer_.store(nullptr);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
new file mode 100644
index 000000000..0ea51d922
--- /dev/null
+++ b/src/rocksdb/db/write_thread.h
@@ -0,0 +1,440 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <mutex>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/write_callback.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteThread {
+ public:
+  enum State : uint8_t {
+    // The initial state of a writer.  This is a Writer that is
+    // waiting in JoinBatchGroup.  This state can be left when another
+    // thread informs the waiter that it has become a group leader
+    // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
+    // non-parallel informs a follower that its writes have been committed
+    // (-> STATE_COMPLETED), or when a leader that has chosen to perform
+    // updates in parallel and needs this Writer to apply its batch (->
+    // STATE_PARALLEL_MEMTABLE_WRITER).
+    STATE_INIT = 1,
+
+    // The state used to inform a waiting Writer that it has become the
+    // leader, and it should now build a write batch group.  Tricky:
+    // this state is not used if newest_writer_ is empty when a writer
+    // enqueues itself, because there is no need to wait (or even to
+    // create the mutex and condvar used to wait) in that case.  This is
+    // a terminal state unless the leader chooses to make this a parallel
+    // batch, in which case the last parallel worker to finish will move
+    // the leader to STATE_COMPLETED.
+    STATE_GROUP_LEADER = 2,
+
+    // The state used to inform a waiting writer that it has become the
+    // leader of memtable writer group. The leader will either write
+    // memtable for the whole group, or launch a parallel group write
+    // to memtable by calling LaunchParallelMemTableWrite.
+    STATE_MEMTABLE_WRITER_LEADER = 4,
+
+    // The state used to inform a waiting writer that it has become a
+    // parallel memtable writer. It can be the group leader who launch the
+    // parallel writer group, or one of the followers. The writer should then
+    // apply its batch to the memtable concurrently and call
+    // CompleteParallelMemTableWriter.
+    STATE_PARALLEL_MEMTABLE_WRITER = 8,
+
+    // A follower whose writes have been applied, or a parallel leader
+    // whose followers have all finished their work.  This is a terminal
+    // state.
+    STATE_COMPLETED = 16,
+
+    // A state indicating that the thread may be waiting using StateMutex()
+    // and StateCondVar()
+    STATE_LOCKED_WAITING = 32,
+  };
+
+  struct Writer;
+
+  struct WriteGroup {
+    Writer* leader = nullptr;
+    Writer* last_writer = nullptr;
+    SequenceNumber last_sequence;
+    // before running goes to zero, status needs leader->StateMutex()
+    Status status;
+    std::atomic<size_t> running;
+    size_t size = 0;
+
+    struct Iterator {
+      Writer* writer;
+      Writer* last_writer;
+
+      explicit Iterator(Writer* w, Writer* last)
+          : writer(w), last_writer(last) {}
+
+      Writer* operator*() const { return writer; }
+
+      Iterator& operator++() {
+        assert(writer != nullptr);
+        if (writer == last_writer) {
+          writer = nullptr;
+        } else {
+          writer = writer->link_newer;
+        }
+        return *this;
+      }
+
+      bool operator!=(const Iterator& other) const {
+        return writer != other.writer;
+      }
+    };
+
+    Iterator begin() const { return Iterator(leader, last_writer); }
+    Iterator end() const { return Iterator(nullptr, nullptr); }
+  };
+
+  // Information kept for every waiting writer.
+  struct Writer {
+    WriteBatch* batch;
+    bool sync;
+    bool no_slowdown;
+    bool disable_wal;
+    Env::IOPriority rate_limiter_priority;
+    bool disable_memtable;
+    size_t batch_cnt;  // if non-zero, number of sub-batches in the write batch
+    size_t protection_bytes_per_key;
+    PreReleaseCallback* pre_release_callback;
+    PostMemTableCallback* post_memtable_callback;
+    uint64_t log_used;  // log number that this batch was inserted into
+    uint64_t log_ref;   // log number that memtable insert should reference
+    WriteCallback* callback;
+    bool made_waitable;          // records lazy construction of mutex and cv
+    std::atomic<uint8_t> state;  // write under StateMutex() or pre-link
+    WriteGroup* write_group;
+    SequenceNumber sequence;  // the sequence number to use for the first key
+    Status status;
+    Status callback_status;  // status returned by callback->Callback()
+
+    std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
+    std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
+    Writer* link_older;  // read/write only before linking, or as leader
+    Writer* link_newer;  // lazy, read/write only before linking, or as leader
+
+    Writer()
+        : batch(nullptr),
+          sync(false),
+          no_slowdown(false),
+          disable_wal(false),
+          rate_limiter_priority(Env::IOPriority::IO_TOTAL),
+          disable_memtable(false),
+          batch_cnt(0),
+          protection_bytes_per_key(0),
+          pre_release_callback(nullptr),
+          post_memtable_callback(nullptr),
+          log_used(0),
+          log_ref(0),
+          callback(nullptr),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    Writer(const WriteOptions& write_options, WriteBatch* _batch,
+           WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
+           size_t _batch_cnt = 0,
+           PreReleaseCallback* _pre_release_callback = nullptr,
+           PostMemTableCallback* _post_memtable_callback = nullptr)
+        : batch(_batch),
+          sync(write_options.sync),
+          no_slowdown(write_options.no_slowdown),
+          disable_wal(write_options.disableWAL),
+          rate_limiter_priority(write_options.rate_limiter_priority),
+          disable_memtable(_disable_memtable),
+          batch_cnt(_batch_cnt),
+          protection_bytes_per_key(_batch->GetProtectionBytesPerKey()),
+          pre_release_callback(_pre_release_callback),
+          post_memtable_callback(_post_memtable_callback),
+          log_used(0),
+          log_ref(_log_ref),
+          callback(_callback),
+          made_waitable(false),
+          state(STATE_INIT),
+          write_group(nullptr),
+          sequence(kMaxSequenceNumber),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    ~Writer() {
+      if (made_waitable) {
+        StateMutex().~mutex();
+        StateCV().~condition_variable();
+      }
+      status.PermitUncheckedError();
+      callback_status.PermitUncheckedError();
+    }
+
+    bool CheckCallback(DB* db) {
+      if (callback != nullptr) {
+        callback_status = callback->Callback(db);
+      }
+      return callback_status.ok();
+    }
+
+    void CreateMutex() {
+      if (!made_waitable) {
+        // Note that made_waitable is tracked separately from state
+        // transitions, because we can't atomically create the mutex and
+        // link into the list.
+        made_waitable = true;
+        new (&state_mutex_bytes) std::mutex;
+        new (&state_cv_bytes) std::condition_variable;
+      }
+    }
+
+    // returns the aggregate status of this Writer
+    Status FinalStatus() {
+      if (!status.ok()) {
+        // a non-ok memtable write status takes presidence
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      } else if (!callback_status.ok()) {
+        // if the callback failed then that is the status we want
+        // because a memtable insert should not have been attempted
+        assert(callback != nullptr);
+        assert(status.ok());
+        return callback_status;
+      } else {
+        // if there is no callback then we only care about
+        // the memtable insert status
+        assert(callback == nullptr || callback_status.ok());
+        return status;
+      }
+    }
+
+    bool CallbackFailed() {
+      return (callback != nullptr) && !callback_status.ok();
+    }
+
+    bool ShouldWriteToMemtable() {
+      return status.ok() && !CallbackFailed() && !disable_memtable;
+    }
+
+    bool ShouldWriteToWAL() {
+      return status.ok() && !CallbackFailed() && !disable_wal;
+    }
+
+    // No other mutexes may be acquired while holding StateMutex(), it is
+    // always last in the order
+    std::mutex& StateMutex() {
+      assert(made_waitable);
+      return *static_cast<std::mutex*>(static_cast<void*>(&state_mutex_bytes));
+    }
+
+    std::condition_variable& StateCV() {
+      assert(made_waitable);
+      return *static_cast<std::condition_variable*>(
+          static_cast<void*>(&state_cv_bytes));
+    }
+  };
+
+  struct AdaptationContext {
+    const char* name;
+    std::atomic<int32_t> value;
+
+    explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
+  };
+
+  explicit WriteThread(const ImmutableDBOptions& db_options);
+
+  virtual ~WriteThread() = default;
+
+  // IMPORTANT: None of the methods in this class rely on the db mutex
+  // for correctness. All of the methods except JoinBatchGroup and
+  // EnterUnbatched may be called either with or without the db mutex held.
+  // Correctness is maintained by ensuring that only a single thread is
+  // a leader at a time.
+
+  // Registers w as ready to become part of a batch group, waits until the
+  // caller should perform some work, and returns the current state of the
+  // writer.  If w has become the leader of a write batch group, returns
+  // STATE_GROUP_LEADER.  If w has been made part of a sequential batch
+  // group and the leader has performed the write, returns STATE_DONE.
+  // If w has been made part of a parallel batch group and is responsible
+  // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
+  //
+  // The db mutex SHOULD NOT be held when calling this function, because
+  // it will block.
+  //
+  // Writer* w:        Writer to be executed as part of a batch group
+  void JoinBatchGroup(Writer* w);
+
+  // Constructs a write batch group led by leader, which should be a
+  // Writer passed to JoinBatchGroup on the current thread.
+  //
+  // Writer* leader:          Writer that is STATE_GROUP_LEADER
+  // WriteGroup* write_group: Out-param of group members
+  // returns:                 Total batch group byte size
+  size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
+
+  // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
+  // and wakes up the next leader (if any).
+  //
+  // WriteGroup* write_group: the write group
+  // Status status:           Status of write operation
+  void ExitAsBatchGroupLeader(WriteGroup& write_group, Status& status);
+
+  // Exit batch group on behalf of batch group leader.
+  void ExitAsBatchGroupFollower(Writer* w);
+
+  // Constructs a write batch group led by leader from newest_memtable_writers_
+  // list. The leader should either write memtable for the whole group and
+  // call ExitAsMemTableWriter, or launch parallel memtable write through
+  // LaunchParallelMemTableWriters.
+  void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_grup);
+
+  // Memtable writer group leader, or the last finished writer in a parallel
+  // write group, exit from the newest_memtable_writers_ list, and wake up
+  // the next leader if needed.
+  void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
+
+  // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of
+  // the non-leader members of this write batch group.  Sets Writer::sequence
+  // before waking them up.
+  //
+  // WriteGroup* write_group: Extra state used to coordinate the parallel add
+  void LaunchParallelMemTableWriters(WriteGroup* write_group);
+
+  // Reports the completion of w's batch to the parallel group leader, and
+  // waits for the rest of the parallel batch to complete.  Returns true
+  // if this thread is the last to complete, and hence should advance
+  // the sequence number and then call EarlyExitParallelGroup, false if
+  // someone else has already taken responsibility for that.
+  bool CompleteParallelMemTableWriter(Writer* w);
+
+  // Waits for all preceding writers (unlocking mu while waiting), then
+  // registers w as the currently proceeding writer.
+  //
+  // Writer* w:              A Writer not eligible for batching
+  // InstrumentedMutex* mu:  The db mutex, to unlock while waiting
+  // REQUIRES: db mutex held
+  void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
+
+  // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+  // writers.
+  void ExitUnbatched(Writer* w);
+
+  // Wait for all parallel memtable writers to finish, in case pipelined
+  // write is enabled.
+  void WaitForMemTableWriters();
+
+  SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
+    if (sequence > last_sequence_) {
+      last_sequence_ = sequence;
+    }
+    return last_sequence_;
+  }
+
+  // Insert a dummy writer at the tail of the write queue to indicate a write
+  // stall, and fail any writers in the queue with no_slowdown set to true
+  void BeginWriteStall();
+
+  // Remove the dummy writer and wake up waiting writers
+  void EndWriteStall();
+
+ private:
+  // See AwaitState.
+  const uint64_t max_yield_usec_;
+  const uint64_t slow_yield_usec_;
+
+  // Allow multiple writers write to memtable concurrently.
+  const bool allow_concurrent_memtable_write_;
+
+  // Enable pipelined write to WAL and memtable.
+  const bool enable_pipelined_write_;
+
+  // The maximum limit of number of bytes that are written in a single batch
+  // of WAL or memtable write. It is followed when the leader write size
+  // is larger than 1/8 of this limit.
+  const uint64_t max_write_batch_group_size_bytes;
+
+  // Points to the newest pending writer. Only leader can remove
+  // elements, adding can be done lock-free by anybody.
+  std::atomic<Writer*> newest_writer_;
+
+  // Points to the newest pending memtable writer. Used only when pipelined
+  // write is enabled.
+  std::atomic<Writer*> newest_memtable_writer_;
+
+  // The last sequence that have been consumed by a writer. The sequence
+  // is not necessary visible to reads because the writer can be ongoing.
+  SequenceNumber last_sequence_;
+
+  // A dummy writer to indicate a write stall condition. This will be inserted
+  // at the tail of the writer queue by the leader, so newer writers can just
+  // check for this and bail
+  Writer write_stall_dummy_;
+
+  // Mutex and condvar for writers to block on a write stall. During a write
+  // stall, writers with no_slowdown set to false will wait on this rather
+  // on the writer queue
+  port::Mutex stall_mu_;
+  port::CondVar stall_cv_;
+
+  // Waits for w->state & goal_mask using w->StateMutex().  Returns
+  // the state that satisfies goal_mask.
+  uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
+
+  // Blocks until w->state & goal_mask, returning the state value
+  // that satisfied the predicate.  Uses ctx to adaptively use
+  // std::this_thread::yield() to avoid mutex overheads.  ctx should be
+  // a context-dependent static.
+  uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
+
+  // Set writer state and wake the writer up if it is waiting.
+  void SetState(Writer* w, uint8_t new_state);
+
+  // Links w into the newest_writer list. Return true if w was linked directly
+  // into the leader position.  Safe to call from multiple threads without
+  // external locking.
+  bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
+
+  // Link write group into the newest_writer list as a whole, while keeping the
+  // order of the writers unchanged. Return true if the group was linked
+  // directly into the leader position.
+  bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
+
+  // Computes any missing link_newer links.  Should not be called
+  // concurrently with itself.
+  void CreateMissingNewerLinks(Writer* head);
+
+  // Set the leader in write_group to completed state and remove it from the
+  // write group.
+  void CompleteLeader(WriteGroup& write_group);
+
+  // Set a follower in write_group to completed state and remove it from the
+  // write group.
+  void CompleteFollower(Writer* w, WriteGroup& write_group);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db_stress_tool/CMakeLists.txt b/src/rocksdb/db_stress_tool/CMakeLists.txt
new file mode 100644
index 000000000..96d70dd0e
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/CMakeLists.txt
@@ -0,0 +1,17 @@
+add_executable(db_stress${ARTIFACT_SUFFIX}
+  batched_ops_stress.cc
+  cf_consistency_stress.cc
+  db_stress.cc
+  db_stress_common.cc
+  db_stress_driver.cc
+  db_stress_gflags.cc
+  db_stress_listener.cc
+  db_stress_shared_state.cc
+  db_stress_stat.cc
+  db_stress_test_base.cc
+  db_stress_tool.cc
+  expected_state.cc
+  multi_ops_txns_stress.cc
+  no_batched_ops_stress.cc)
+target_link_libraries(db_stress${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS})
+list(APPEND tool_deps db_stress)
diff --git a/src/rocksdb/db_stress_tool/batched_ops_stress.cc b/src/rocksdb/db_stress_tool/batched_ops_stress.cc
new file mode 100644
index 000000000..3f3446076
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/batched_ops_stress.cc
@@ -0,0 +1,399 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+class BatchedOpsStressTest : public StressTest {
+ public:
+  BatchedOpsStressTest() {}
+  virtual ~BatchedOpsStressTest() {}
+
+  bool IsStateTracked() const override { return false; }
+
+  // Given a key K and value V, this puts ("0"+K, V+"0"), ("1"+K, V+"1"), ...,
+  // ("9"+K, V+"9") in DB atomically i.e in a single batch.
+  // Also refer BatchedOpsStressTest::TestGet
+  Status TestPut(ThreadState* thread, WriteOptions& write_opts,
+                 const ReadOptions& /* read_opts */,
+                 const std::vector<int>& rand_column_families,
+                 const std::vector<int64_t>& rand_keys,
+                 char (&value)[100]) override {
+    assert(!rand_column_families.empty());
+    assert(!rand_keys.empty());
+
+    const std::string key_body = Key(rand_keys[0]);
+
+    const uint32_t value_base =
+        thread->rand.Next() % thread->shared->UNKNOWN_SENTINEL;
+    const size_t sz = GenerateValue(value_base, value, sizeof(value));
+    const std::string value_body = Slice(value, sz).ToString();
+
+    WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                     FLAGS_batch_protection_bytes_per_key,
+                     FLAGS_user_timestamp_size);
+
+    ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]];
+    assert(cfh);
+
+    for (int i = 9; i >= 0; --i) {
+      const std::string num = std::to_string(i);
+
+      // Note: the digit in num is prepended to the key; however, it is appended
+      // to the value because we want the "value base" to be encoded uniformly
+      // at the beginning of the value for all types of stress tests (e.g.
+      // batched, non-batched, CF consistency).
+      const std::string k = num + key_body;
+      const std::string v = value_body + num;
+
+      if (FLAGS_use_merge) {
+        batch.Merge(cfh, k, v);
+      } else if (FLAGS_use_put_entity_one_in > 0 &&
+                 (value_base % FLAGS_use_put_entity_one_in) == 0) {
+        batch.PutEntity(cfh, k, GenerateWideColumns(value_base, v));
+      } else {
+        batch.Put(cfh, k, v);
+      }
+    }
+
+    const Status s = db_->Write(write_opts, &batch);
+
+    if (!s.ok()) {
+      fprintf(stderr, "multiput error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      // we did 10 writes each of size sz + 1
+      thread->stats.AddBytesForWrites(10, (sz + 1) * 10);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this deletes ("0"+K), ("1"+K), ..., ("9"+K)
+  // in DB atomically i.e in a single batch. Also refer MultiGet.
+  Status TestDelete(ThreadState* thread, WriteOptions& writeoptions,
+                    const std::vector<int>& rand_column_families,
+                    const std::vector<int64_t>& rand_keys) override {
+    std::string keys[10] = {"9", "7", "5", "3", "1", "8", "6", "4", "2", "0"};
+
+    WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                     FLAGS_batch_protection_bytes_per_key,
+                     FLAGS_user_timestamp_size);
+    Status s;
+    auto cfh = column_families_[rand_column_families[0]];
+    std::string key_str = Key(rand_keys[0]);
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key_str;
+      batch.Delete(cfh, keys[i]);
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multidelete error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      thread->stats.AddDeletes(10);
+    }
+
+    return s;
+  }
+
+  Status TestDeleteRange(ThreadState* /* thread */,
+                         WriteOptions& /* write_opts */,
+                         const std::vector<int>& /* rand_column_families */,
+                         const std::vector<int64_t>& /* rand_keys */) override {
+    assert(false);
+    return Status::NotSupported(
+        "BatchedOpsStressTest does not support "
+        "TestDeleteRange");
+  }
+
+  void TestIngestExternalFile(
+      ThreadState* /* thread */,
+      const std::vector<int>& /* rand_column_families */,
+      const std::vector<int64_t>& /* rand_keys */) override {
+    assert(false);
+    fprintf(stderr,
+            "BatchedOpsStressTest does not support "
+            "TestIngestExternalFile\n");
+    std::terminate();
+  }
+
+  // Given a key K, this gets values for "0"+K, "1"+K, ..., "9"+K
+  // in the same snapshot, and verifies that all the values are of the form
+  // V+"0", V+"1", ..., V+"9".
+  // ASSUMES that BatchedOpsStressTest::TestPut was used to put (K, V) into
+  // the DB.
+  Status TestGet(ThreadState* thread, const ReadOptions& readoptions,
+                 const std::vector<int>& rand_column_families,
+                 const std::vector<int64_t>& rand_keys) override {
+    std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
+    Slice key_slices[10];
+    std::string values[10];
+    ReadOptions readoptionscopy = readoptions;
+    readoptionscopy.snapshot = db_->GetSnapshot();
+    std::string key_str = Key(rand_keys[0]);
+    Slice key = key_str;
+    auto cfh = column_families_[rand_column_families[0]];
+    std::string from_db;
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      key_slices[i] = keys[i];
+      s = db_->Get(readoptionscopy, cfh, key_slices[i], &from_db);
+      if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+        values[i] = "";
+        thread->stats.AddErrors(1);
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      } else if (s.IsNotFound()) {
+        values[i] = "";
+        thread->stats.AddGets(1, 0);
+      } else {
+        values[i] = from_db;
+
+        assert(!keys[i].empty());
+        assert(!values[i].empty());
+
+        const char expected = keys[i].front();
+        const char actual = values[i].back();
+
+        if (expected != actual) {
+          fprintf(stderr, "get error expected = %c actual = %c\n", expected,
+                  actual);
+        }
+
+        values[i].pop_back();  // get rid of the differing character
+
+        thread->stats.AddGets(1, 1);
+      }
+    }
+    db_->ReleaseSnapshot(readoptionscopy.snapshot);
+
+    // Now that we retrieved all values, check that they all match
+    for (int i = 1; i < 10; i++) {
+      if (values[i] != values[0]) {
+        fprintf(stderr, "get error: inconsistent values for key %s: %s, %s\n",
+                key.ToString(true).c_str(), StringToHex(values[0]).c_str(),
+                StringToHex(values[i]).c_str());
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      }
+    }
+
+    return s;
+  }
+
+  std::vector<Status> TestMultiGet(
+      ThreadState* thread, const ReadOptions& readoptions,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) override {
+    size_t num_keys = rand_keys.size();
+    std::vector<Status> ret_status(num_keys);
+    std::array<std::string, 10> keys = {
+        {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}};
+    size_t num_prefixes = keys.size();
+    for (size_t rand_key = 0; rand_key < num_keys; ++rand_key) {
+      std::vector<Slice> key_slices;
+      std::vector<PinnableSlice> values(num_prefixes);
+      std::vector<Status> statuses(num_prefixes);
+      ReadOptions readoptionscopy = readoptions;
+      readoptionscopy.snapshot = db_->GetSnapshot();
+      readoptionscopy.rate_limiter_priority =
+          FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+      std::vector<std::string> key_str;
+      key_str.reserve(num_prefixes);
+      key_slices.reserve(num_prefixes);
+      std::string from_db;
+      ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
+
+      for (size_t key = 0; key < num_prefixes; ++key) {
+        key_str.emplace_back(keys[key] + Key(rand_keys[rand_key]));
+        key_slices.emplace_back(key_str.back());
+      }
+      db_->MultiGet(readoptionscopy, cfh, num_prefixes, key_slices.data(),
+                    values.data(), statuses.data());
+      for (size_t i = 0; i < num_prefixes; i++) {
+        Status s = statuses[i];
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "multiget error: %s\n", s.ToString().c_str());
+          thread->stats.AddErrors(1);
+          ret_status[rand_key] = s;
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (s.IsNotFound()) {
+          thread->stats.AddGets(1, 0);
+          ret_status[rand_key] = s;
+        } else {
+          assert(!keys[i].empty());
+          assert(!values[i].empty());
+
+          const char expected = keys[i][0];
+          const char actual = values[i][values[i].size() - 1];
+
+          if (expected != actual) {
+            fprintf(stderr, "multiget error expected = %c actual = %c\n",
+                    expected, actual);
+          }
+
+          values[i].remove_suffix(1);  // get rid of the differing character
+
+          thread->stats.AddGets(1, 1);
+        }
+      }
+      db_->ReleaseSnapshot(readoptionscopy.snapshot);
+
+      // Now that we retrieved all values, check that they all match
+      for (size_t i = 1; i < num_prefixes; i++) {
+        if (values[i] != values[0]) {
+          fprintf(stderr,
+                  "multiget error: inconsistent values for key %s: %s, %s\n",
+                  StringToHex(key_str[i]).c_str(),
+                  StringToHex(values[0].ToString()).c_str(),
+                  StringToHex(values[i].ToString()).c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        }
+      }
+    }
+
+    return ret_status;
+  }
+
+  // Given a key, this does prefix scans for "0"+P, "1"+P, ..., "9"+P
+  // in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes
+  // of the key. Each of these 10 scans returns a series of values;
+  // each series should be the same length, and it is verified for each
+  // index i that all the i'th values are of the form V+"0", V+"1", ..., V+"9".
+  // ASSUMES that MultiPut was used to put (K, V)
+  Status TestPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
+                        const std::vector<int>& rand_column_families,
+                        const std::vector<int64_t>& rand_keys) override {
+    assert(!rand_column_families.empty());
+    assert(!rand_keys.empty());
+
+    const std::string key = Key(rand_keys[0]);
+
+    assert(FLAGS_prefix_size > 0);
+    const size_t prefix_to_use = static_cast<size_t>(FLAGS_prefix_size);
+
+    constexpr size_t num_prefixes = 10;
+
+    std::array<std::string, num_prefixes> prefixes;
+    std::array<Slice, num_prefixes> prefix_slices;
+    std::array<ReadOptions, num_prefixes> ro_copies;
+    std::array<std::string, num_prefixes> upper_bounds;
+    std::array<Slice, num_prefixes> ub_slices;
+    std::array<std::unique_ptr<Iterator>, num_prefixes> iters;
+
+    const Snapshot* const snapshot = db_->GetSnapshot();
+
+    ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]];
+    assert(cfh);
+
+    for (size_t i = 0; i < num_prefixes; ++i) {
+      prefixes[i] = std::to_string(i) + key;
+      prefix_slices[i] = Slice(prefixes[i].data(), prefix_to_use);
+
+      ro_copies[i] = readoptions;
+      ro_copies[i].snapshot = snapshot;
+      if (thread->rand.OneIn(2) &&
+          GetNextPrefix(prefix_slices[i], &(upper_bounds[i]))) {
+        // For half of the time, set the upper bound to the next prefix
+        ub_slices[i] = upper_bounds[i];
+        ro_copies[i].iterate_upper_bound = &(ub_slices[i]);
+      }
+
+      iters[i].reset(db_->NewIterator(ro_copies[i], cfh));
+      iters[i]->Seek(prefix_slices[i]);
+    }
+
+    uint64_t count = 0;
+
+    while (iters[0]->Valid() && iters[0]->key().starts_with(prefix_slices[0])) {
+      ++count;
+
+      std::array<std::string, num_prefixes> values;
+
+      // get list of all values for this iteration
+      for (size_t i = 0; i < num_prefixes; ++i) {
+        // no iterator should finish before the first one
+        assert(iters[i]->Valid() &&
+               iters[i]->key().starts_with(prefix_slices[i]));
+        values[i] = iters[i]->value().ToString();
+
+        // make sure the last character of the value is the expected digit
+        assert(!prefixes[i].empty());
+        assert(!values[i].empty());
+
+        const char expected = prefixes[i].front();
+        const char actual = values[i].back();
+
+        if (expected != actual) {
+          fprintf(stderr, "prefix scan error expected = %c actual = %c\n",
+                  expected, actual);
+        }
+
+        values[i].pop_back();  // get rid of the differing character
+
+        // make sure all values are equivalent
+        if (values[i] != values[0]) {
+          fprintf(stderr,
+                  "prefix scan error : %" ROCKSDB_PRIszt
+                  ", inconsistent values for prefix %s: %s, %s\n",
+                  i, prefix_slices[i].ToString(/* hex */ true).c_str(),
+                  StringToHex(values[0]).c_str(),
+                  StringToHex(values[i]).c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        }
+
+        // make sure value() and columns() are consistent
+        const WideColumns expected_columns = GenerateExpectedWideColumns(
+            GetValueBase(iters[i]->value()), iters[i]->value());
+        if (iters[i]->columns() != expected_columns) {
+          fprintf(stderr,
+                  "prefix scan error : %" ROCKSDB_PRIszt
+                  ", value and columns inconsistent for prefix %s: %s\n",
+                  i, prefix_slices[i].ToString(/* hex */ true).c_str(),
+                  DebugString(iters[i]->value(), iters[i]->columns(),
+                              expected_columns)
+                      .c_str());
+        }
+
+        iters[i]->Next();
+      }
+    }
+
+    // cleanup iterators and snapshot
+    for (size_t i = 0; i < num_prefixes; ++i) {
+      // if the first iterator finished, they should have all finished
+      assert(!iters[i]->Valid() ||
+             !iters[i]->key().starts_with(prefix_slices[i]));
+      assert(iters[i]->status().ok());
+    }
+
+    db_->ReleaseSnapshot(snapshot);
+
+    thread->stats.AddPrefixes(1, count);
+
+    return Status::OK();
+  }
+
+  void VerifyDb(ThreadState* /* thread */) const override {}
+
+  void ContinuouslyVerifyDb(ThreadState* /* thread */) const override {}
+};
+
+StressTest* CreateBatchedOpsStressTest() { return new BatchedOpsStressTest(); }
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/cf_consistency_stress.cc b/src/rocksdb/db_stress_tool/cf_consistency_stress.cc
new file mode 100644
index 000000000..33f7b1f2e
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/cf_consistency_stress.cc
@@ -0,0 +1,640 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_common.h"
+#include "file/file_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+class CfConsistencyStressTest : public StressTest {
+ public:
+  CfConsistencyStressTest() : batch_id_(0) {}
+
+  ~CfConsistencyStressTest() override {}
+
+  bool IsStateTracked() const override { return false; }
+
+  Status TestPut(ThreadState* thread, WriteOptions& write_opts,
+                 const ReadOptions& /* read_opts */,
+                 const std::vector<int>& rand_column_families,
+                 const std::vector<int64_t>& rand_keys,
+                 char (&value)[100]) override {
+    assert(!rand_column_families.empty());
+    assert(!rand_keys.empty());
+
+    const std::string k = Key(rand_keys[0]);
+
+    const uint32_t value_base = batch_id_.fetch_add(1);
+    const size_t sz = GenerateValue(value_base, value, sizeof(value));
+    const Slice v(value, sz);
+
+    WriteBatch batch;
+
+    const bool use_put_entity = !FLAGS_use_merge &&
+                                FLAGS_use_put_entity_one_in > 0 &&
+                                (value_base % FLAGS_use_put_entity_one_in) == 0;
+
+    for (auto cf : rand_column_families) {
+      ColumnFamilyHandle* const cfh = column_families_[cf];
+      assert(cfh);
+
+      if (FLAGS_use_merge) {
+        batch.Merge(cfh, k, v);
+      } else if (use_put_entity) {
+        batch.PutEntity(cfh, k, GenerateWideColumns(value_base, v));
+      } else {
+        batch.Put(cfh, k, v);
+      }
+    }
+
+    Status s = db_->Write(write_opts, &batch);
+
+    if (!s.ok()) {
+      fprintf(stderr, "multi put or merge error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      auto num = static_cast<long>(rand_column_families.size());
+      thread->stats.AddBytesForWrites(num, (sz + 1) * num);
+    }
+
+    return s;
+  }
+
+  Status TestDelete(ThreadState* thread, WriteOptions& write_opts,
+                    const std::vector<int>& rand_column_families,
+                    const std::vector<int64_t>& rand_keys) override {
+    std::string key_str = Key(rand_keys[0]);
+    Slice key = key_str;
+    WriteBatch batch;
+    for (auto cf : rand_column_families) {
+      ColumnFamilyHandle* cfh = column_families_[cf];
+      batch.Delete(cfh, key);
+    }
+    Status s = db_->Write(write_opts, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multidel error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      thread->stats.AddDeletes(static_cast<long>(rand_column_families.size()));
+    }
+    return s;
+  }
+
+  Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts,
+                         const std::vector<int>& rand_column_families,
+                         const std::vector<int64_t>& rand_keys) override {
+    int64_t rand_key = rand_keys[0];
+    auto shared = thread->shared;
+    int64_t max_key = shared->GetMaxKey();
+    if (rand_key > max_key - FLAGS_range_deletion_width) {
+      rand_key =
+          thread->rand.Next() % (max_key - FLAGS_range_deletion_width + 1);
+    }
+    std::string key_str = Key(rand_key);
+    Slice key = key_str;
+    std::string end_key_str = Key(rand_key + FLAGS_range_deletion_width);
+    Slice end_key = end_key_str;
+    WriteBatch batch;
+    for (auto cf : rand_column_families) {
+      ColumnFamilyHandle* cfh = column_families_[rand_column_families[cf]];
+      batch.DeleteRange(cfh, key, end_key);
+    }
+    Status s = db_->Write(write_opts, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multi del range error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      thread->stats.AddRangeDeletions(
+          static_cast<long>(rand_column_families.size()));
+    }
+    return s;
+  }
+
+  void TestIngestExternalFile(
+      ThreadState* /* thread */,
+      const std::vector<int>& /* rand_column_families */,
+      const std::vector<int64_t>& /* rand_keys */) override {
+    assert(false);
+    fprintf(stderr,
+            "CfConsistencyStressTest does not support TestIngestExternalFile "
+            "because it's not possible to verify the result\n");
+    std::terminate();
+  }
+
+  Status TestGet(ThreadState* thread, const ReadOptions& readoptions,
+                 const std::vector<int>& rand_column_families,
+                 const std::vector<int64_t>& rand_keys) override {
+    std::string key_str = Key(rand_keys[0]);
+    Slice key = key_str;
+    Status s;
+    bool is_consistent = true;
+
+    if (thread->rand.OneIn(2)) {
+      // 1/2 chance, does a random read from random CF
+      auto cfh =
+          column_families_[rand_column_families[thread->rand.Next() %
+                                                rand_column_families.size()]];
+      std::string from_db;
+      s = db_->Get(readoptions, cfh, key, &from_db);
+    } else {
+      // 1/2 chance, comparing one key is the same across all CFs
+      const Snapshot* snapshot = db_->GetSnapshot();
+      ReadOptions readoptionscopy = readoptions;
+      readoptionscopy.snapshot = snapshot;
+
+      std::string value0;
+      s = db_->Get(readoptionscopy, column_families_[rand_column_families[0]],
+                   key, &value0);
+      if (s.ok() || s.IsNotFound()) {
+        bool found = s.ok();
+        for (size_t i = 1; i < rand_column_families.size(); i++) {
+          std::string value1;
+          s = db_->Get(readoptionscopy,
+                       column_families_[rand_column_families[i]], key, &value1);
+          if (!s.ok() && !s.IsNotFound()) {
+            break;
+          }
+          if (!found && s.ok()) {
+            fprintf(stderr, "Get() return different results with key %s\n",
+                    Slice(key_str).ToString(true).c_str());
+            fprintf(stderr, "CF %s is not found\n",
+                    column_family_names_[0].c_str());
+            fprintf(stderr, "CF %s returns value %s\n",
+                    column_family_names_[i].c_str(),
+                    Slice(value1).ToString(true).c_str());
+            is_consistent = false;
+          } else if (found && s.IsNotFound()) {
+            fprintf(stderr, "Get() return different results with key %s\n",
+                    Slice(key_str).ToString(true).c_str());
+            fprintf(stderr, "CF %s returns value %s\n",
+                    column_family_names_[0].c_str(),
+                    Slice(value0).ToString(true).c_str());
+            fprintf(stderr, "CF %s is not found\n",
+                    column_family_names_[i].c_str());
+            is_consistent = false;
+          } else if (s.ok() && value0 != value1) {
+            fprintf(stderr, "Get() return different results with key %s\n",
+                    Slice(key_str).ToString(true).c_str());
+            fprintf(stderr, "CF %s returns value %s\n",
+                    column_family_names_[0].c_str(),
+                    Slice(value0).ToString(true).c_str());
+            fprintf(stderr, "CF %s returns value %s\n",
+                    column_family_names_[i].c_str(),
+                    Slice(value1).ToString(true).c_str());
+            is_consistent = false;
+          }
+          if (!is_consistent) {
+            break;
+          }
+        }
+      }
+
+      db_->ReleaseSnapshot(snapshot);
+    }
+    if (!is_consistent) {
+      fprintf(stderr, "TestGet error: is_consistent is false\n");
+      thread->stats.AddErrors(1);
+      // Fail fast to preserve the DB state.
+      thread->shared->SetVerificationFailure();
+    } else if (s.ok()) {
+      thread->stats.AddGets(1, 1);
+    } else if (s.IsNotFound()) {
+      thread->stats.AddGets(1, 0);
+    } else {
+      fprintf(stderr, "TestGet error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    }
+    return s;
+  }
+
+  std::vector<Status> TestMultiGet(
+      ThreadState* thread, const ReadOptions& read_opts,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) override {
+    size_t num_keys = rand_keys.size();
+    std::vector<std::string> key_str;
+    std::vector<Slice> keys;
+    keys.reserve(num_keys);
+    key_str.reserve(num_keys);
+    std::vector<PinnableSlice> values(num_keys);
+    std::vector<Status> statuses(num_keys);
+    ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
+    ReadOptions readoptionscopy = read_opts;
+    readoptionscopy.rate_limiter_priority =
+        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      key_str.emplace_back(Key(rand_keys[i]));
+      keys.emplace_back(key_str.back());
+    }
+    db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
+                  statuses.data());
+    for (auto s : statuses) {
+      if (s.ok()) {
+        // found case
+        thread->stats.AddGets(1, 1);
+      } else if (s.IsNotFound()) {
+        // not found case
+        thread->stats.AddGets(1, 0);
+      } else {
+        // errors case
+        fprintf(stderr, "MultiGet error: %s\n", s.ToString().c_str());
+        thread->stats.AddErrors(1);
+      }
+    }
+    return statuses;
+  }
+
+  Status TestPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
+                        const std::vector<int>& rand_column_families,
+                        const std::vector<int64_t>& rand_keys) override {
+    assert(!rand_column_families.empty());
+    assert(!rand_keys.empty());
+
+    const std::string key = Key(rand_keys[0]);
+
+    const size_t prefix_to_use =
+        (FLAGS_prefix_size < 0) ? 7 : static_cast<size_t>(FLAGS_prefix_size);
+
+    const Slice prefix(key.data(), prefix_to_use);
+
+    std::string upper_bound;
+    Slice ub_slice;
+
+    ReadOptions ro_copy = readoptions;
+
+    // Get the next prefix first and then see if we want to set upper bound.
+    // We'll use the next prefix in an assertion later on
+    if (GetNextPrefix(prefix, &upper_bound) && thread->rand.OneIn(2)) {
+      ub_slice = Slice(upper_bound);
+      ro_copy.iterate_upper_bound = &ub_slice;
+    }
+
+    ColumnFamilyHandle* const cfh =
+        column_families_[rand_column_families[thread->rand.Uniform(
+            static_cast<int>(rand_column_families.size()))]];
+    assert(cfh);
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro_copy, cfh));
+
+    uint64_t count = 0;
+    Status s;
+
+    for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix);
+         iter->Next()) {
+      ++count;
+
+      const WideColumns expected_columns = GenerateExpectedWideColumns(
+          GetValueBase(iter->value()), iter->value());
+      if (iter->columns() != expected_columns) {
+        s = Status::Corruption(
+            "Value and columns inconsistent",
+            DebugString(iter->value(), iter->columns(), expected_columns));
+        break;
+      }
+    }
+
+    assert(prefix_to_use == 0 ||
+           count <= GetPrefixKeyCount(prefix.ToString(), upper_bound));
+
+    if (s.ok()) {
+      s = iter->status();
+    }
+
+    if (!s.ok()) {
+      fprintf(stderr, "TestPrefixScan error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+
+      return s;
+    }
+
+    thread->stats.AddPrefixes(1, count);
+
+    return Status::OK();
+  }
+
+  ColumnFamilyHandle* GetControlCfh(ThreadState* thread,
+                                    int /*column_family_id*/
+                                    ) override {
+    // All column families should contain the same data. Randomly pick one.
+    return column_families_[thread->rand.Next() % column_families_.size()];
+  }
+
+  void VerifyDb(ThreadState* thread) const override {
+    // This `ReadOptions` is for validation purposes. Ignore
+    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+    ReadOptions options(FLAGS_verify_checksum, true);
+
+    // We must set total_order_seek to true because we are doing a SeekToFirst
+    // on a column family whose memtables may support (by default) prefix-based
+    // iterator. In this case, NewIterator with options.total_order_seek being
+    // false returns a prefix-based iterator. Calling SeekToFirst using this
+    // iterator causes the iterator to become invalid. That means we cannot
+    // iterate the memtable using this iterator any more, although the memtable
+    // contains the most up-to-date key-values.
+    options.total_order_seek = true;
+
+    ManagedSnapshot snapshot_guard(db_);
+    options.snapshot = snapshot_guard.snapshot();
+
+    const size_t num = column_families_.size();
+
+    std::vector<std::unique_ptr<Iterator>> iters;
+    iters.reserve(num);
+
+    for (size_t i = 0; i < num; ++i) {
+      iters.emplace_back(db_->NewIterator(options, column_families_[i]));
+      iters.back()->SeekToFirst();
+    }
+
+    std::vector<Status> statuses(num, Status::OK());
+
+    assert(thread);
+
+    auto shared = thread->shared;
+    assert(shared);
+
+    do {
+      if (shared->HasVerificationFailedYet()) {
+        break;
+      }
+
+      size_t valid_cnt = 0;
+
+      for (size_t i = 0; i < num; ++i) {
+        const auto& iter = iters[i];
+        assert(iter);
+
+        if (iter->Valid()) {
+          const WideColumns expected_columns = GenerateExpectedWideColumns(
+              GetValueBase(iter->value()), iter->value());
+          if (iter->columns() != expected_columns) {
+            statuses[i] = Status::Corruption(
+                "Value and columns inconsistent",
+                DebugString(iter->value(), iter->columns(), expected_columns));
+          } else {
+            ++valid_cnt;
+          }
+        } else {
+          statuses[i] = iter->status();
+        }
+      }
+
+      if (valid_cnt == 0) {
+        for (size_t i = 0; i < num; ++i) {
+          const auto& s = statuses[i];
+          if (!s.ok()) {
+            fprintf(stderr, "Iterator on cf %s has error: %s\n",
+                    column_families_[i]->GetName().c_str(),
+                    s.ToString().c_str());
+            shared->SetVerificationFailure();
+          }
+        }
+
+        break;
+      }
+
+      if (valid_cnt < num) {
+        shared->SetVerificationFailure();
+
+        for (size_t i = 0; i < num; ++i) {
+          assert(iters[i]);
+
+          if (!iters[i]->Valid()) {
+            if (statuses[i].ok()) {
+              fprintf(stderr, "Finished scanning cf %s\n",
+                      column_families_[i]->GetName().c_str());
+            } else {
+              fprintf(stderr, "Iterator on cf %s has error: %s\n",
+                      column_families_[i]->GetName().c_str(),
+                      statuses[i].ToString().c_str());
+            }
+          } else {
+            fprintf(stderr, "cf %s has remaining data to scan\n",
+                    column_families_[i]->GetName().c_str());
+          }
+        }
+
+        break;
+      }
+
+      if (shared->HasVerificationFailedYet()) {
+        break;
+      }
+
+      // If the program reaches here, then all column families' iterators are
+      // still valid.
+      assert(valid_cnt == num);
+
+      if (shared->PrintingVerificationResults()) {
+        continue;
+      }
+
+      assert(iters[0]);
+
+      const Slice key = iters[0]->key();
+      const Slice value = iters[0]->value();
+
+      int num_mismatched_cfs = 0;
+
+      for (size_t i = 1; i < num; ++i) {
+        assert(iters[i]);
+
+        const int cmp = key.compare(iters[i]->key());
+
+        if (cmp != 0) {
+          ++num_mismatched_cfs;
+
+          if (1 == num_mismatched_cfs) {
+            fprintf(stderr, "Verification failed\n");
+            fprintf(stderr, "Latest Sequence Number: %" PRIu64 "\n",
+                    db_->GetLatestSequenceNumber());
+            fprintf(stderr, "[%s] %s => %s\n",
+                    column_families_[0]->GetName().c_str(),
+                    key.ToString(true /* hex */).c_str(),
+                    value.ToString(true /* hex */).c_str());
+          }
+
+          fprintf(stderr, "[%s] %s => %s\n",
+                  column_families_[i]->GetName().c_str(),
+                  iters[i]->key().ToString(true /* hex */).c_str(),
+                  iters[i]->value().ToString(true /* hex */).c_str());
+
+#ifndef ROCKSDB_LITE
+          Slice begin_key;
+          Slice end_key;
+          if (cmp < 0) {
+            begin_key = key;
+            end_key = iters[i]->key();
+          } else {
+            begin_key = iters[i]->key();
+            end_key = key;
+          }
+
+          const auto print_key_versions = [&](ColumnFamilyHandle* cfh) {
+            constexpr size_t kMaxNumIKeys = 8;
+
+            std::vector<KeyVersion> versions;
+            const Status s = GetAllKeyVersions(db_, cfh, begin_key, end_key,
+                                               kMaxNumIKeys, &versions);
+            if (!s.ok()) {
+              fprintf(stderr, "%s\n", s.ToString().c_str());
+              return;
+            }
+
+            assert(cfh);
+
+            fprintf(stderr,
+                    "Internal keys in CF '%s', [%s, %s] (max %" ROCKSDB_PRIszt
+                    ")\n",
+                    cfh->GetName().c_str(),
+                    begin_key.ToString(true /* hex */).c_str(),
+                    end_key.ToString(true /* hex */).c_str(), kMaxNumIKeys);
+
+            for (const KeyVersion& kv : versions) {
+              fprintf(stderr, "  key %s seq %" PRIu64 " type %d\n",
+                      Slice(kv.user_key).ToString(true).c_str(), kv.sequence,
+                      kv.type);
+            }
+          };
+
+          if (1 == num_mismatched_cfs) {
+            print_key_versions(column_families_[0]);
+          }
+
+          print_key_versions(column_families_[i]);
+#endif  // ROCKSDB_LITE
+
+          shared->SetVerificationFailure();
+        }
+      }
+
+      shared->FinishPrintingVerificationResults();
+
+      for (auto& iter : iters) {
+        assert(iter);
+        iter->Next();
+      }
+    } while (true);
+  }
+
+#ifndef ROCKSDB_LITE
+  void ContinuouslyVerifyDb(ThreadState* thread) const override {
+    assert(thread);
+    Status status;
+
+    DB* db_ptr = cmp_db_ ? cmp_db_ : db_;
+    const auto& cfhs = cmp_db_ ? cmp_cfhs_ : column_families_;
+
+    // Take a snapshot to preserve the state of primary db.
+    ManagedSnapshot snapshot_guard(db_);
+
+    SharedState* shared = thread->shared;
+    assert(shared);
+
+    if (cmp_db_) {
+      status = cmp_db_->TryCatchUpWithPrimary();
+      if (!status.ok()) {
+        fprintf(stderr, "TryCatchUpWithPrimary: %s\n",
+                status.ToString().c_str());
+        shared->SetShouldStopTest();
+        assert(false);
+        return;
+      }
+    }
+
+    const auto checksum_column_family = [](Iterator* iter,
+                                           uint32_t* checksum) -> Status {
+      assert(nullptr != checksum);
+
+      uint32_t ret = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        ret = crc32c::Extend(ret, iter->key().data(), iter->key().size());
+        ret = crc32c::Extend(ret, iter->value().data(), iter->value().size());
+
+        for (const auto& column : iter->columns()) {
+          ret = crc32c::Extend(ret, column.name().data(), column.name().size());
+          ret =
+              crc32c::Extend(ret, column.value().data(), column.value().size());
+        }
+      }
+
+      *checksum = ret;
+      return iter->status();
+    };
+    // This `ReadOptions` is for validation purposes. Ignore
+    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+    ReadOptions ropts(FLAGS_verify_checksum, true);
+    ropts.total_order_seek = true;
+    if (nullptr == cmp_db_) {
+      ropts.snapshot = snapshot_guard.snapshot();
+    }
+    uint32_t crc = 0;
+    {
+      // Compute crc for all key-values of default column family.
+      std::unique_ptr<Iterator> it(db_ptr->NewIterator(ropts));
+      status = checksum_column_family(it.get(), &crc);
+      if (!status.ok()) {
+        fprintf(stderr, "Computing checksum of default cf: %s\n",
+                status.ToString().c_str());
+        assert(false);
+      }
+    }
+    // Since we currently intentionally disallow reading from the secondary
+    // instance with snapshot, we cannot achieve cross-cf consistency if WAL is
+    // enabled because there is no guarantee that secondary instance replays
+    // the primary's WAL to a consistent point where all cfs have the same
+    // data.
+    if (status.ok() && FLAGS_disable_wal) {
+      uint32_t tmp_crc = 0;
+      for (ColumnFamilyHandle* cfh : cfhs) {
+        if (cfh == db_ptr->DefaultColumnFamily()) {
+          continue;
+        }
+        std::unique_ptr<Iterator> it(db_ptr->NewIterator(ropts, cfh));
+        status = checksum_column_family(it.get(), &tmp_crc);
+        if (!status.ok() || tmp_crc != crc) {
+          break;
+        }
+      }
+      if (!status.ok()) {
+        fprintf(stderr, "status: %s\n", status.ToString().c_str());
+        shared->SetShouldStopTest();
+        assert(false);
+      } else if (tmp_crc != crc) {
+        fprintf(stderr, "tmp_crc=%" PRIu32 " crc=%" PRIu32 "\n", tmp_crc, crc);
+        shared->SetShouldStopTest();
+        assert(false);
+      }
+    }
+  }
+#else   // ROCKSDB_LITE
+  void ContinuouslyVerifyDb(ThreadState* /*thread*/) const override {}
+#endif  // !ROCKSDB_LITE
+
+  std::vector<int> GenerateColumnFamilies(
+      const int /* num_column_families */,
+      int /* rand_column_family */) const override {
+    std::vector<int> ret;
+    int num = static_cast<int>(column_families_.size());
+    int k = 0;
+    std::generate_n(back_inserter(ret), num, [&k]() -> int { return k++; });
+    return ret;
+  }
+
+ private:
+  std::atomic<uint32_t> batch_id_;
+};
+
+StressTest* CreateCfConsistencyStressTest() {
+  return new CfConsistencyStressTest();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress.cc b/src/rocksdb/db_stress_tool/db_stress.cc
new file mode 100644
index 000000000..2d03f5d26
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress.cc
@@ -0,0 +1,25 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef GFLAGS
+#include <cstdio>
+
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+#include "port/stack_trace.h"
+#include "rocksdb/db_stress_tool.h"
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  return ROCKSDB_NAMESPACE::db_stress_tool(argc, argv);
+}
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_common.cc b/src/rocksdb/db_stress_tool/db_stress_common.cc
new file mode 100644
index 000000000..af8db9e2f
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_common.cc
@@ -0,0 +1,460 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_common.h"
+
+#include <cmath>
+
+#include "util/file_checksum_helper.h"
+#include "util/xxhash.h"
+
+ROCKSDB_NAMESPACE::Env* db_stress_listener_env = nullptr;
+ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr;
+// If non-null, injects read error at a rate specified by the
+// read_fault_one_in or write_fault_one_in flag
+std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs_guard;
+enum ROCKSDB_NAMESPACE::CompressionType compression_type_e =
+    ROCKSDB_NAMESPACE::kSnappyCompression;
+enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e =
+    ROCKSDB_NAMESPACE::kSnappyCompression;
+enum ROCKSDB_NAMESPACE::ChecksumType checksum_type_e =
+    ROCKSDB_NAMESPACE::kCRC32c;
+enum RepFactory FLAGS_rep_factory = kSkipList;
+std::vector<double> sum_probs(100001);
+constexpr int64_t zipf_sum_size = 100000;
+
+namespace ROCKSDB_NAMESPACE {
+
+// Zipfian distribution is generated based on a pre-calculated array.
+// It should be used before start the stress test.
+// First, the probability distribution function (PDF) of this Zipfian follows
+// power low. P(x) = 1/(x^alpha).
+// So we calculate the PDF when x is from 0 to zipf_sum_size in first for loop
+// and add the PDF value togetger as c. So we get the total probability in c.
+// Next, we calculate inverse CDF of Zipfian and store the value of each in
+// an array (sum_probs). The rank is from 0 to zipf_sum_size. For example, for
+// integer k, its Zipfian CDF value is sum_probs[k].
+// Third, when we need to get an integer whose probability follows Zipfian
+// distribution, we use a rand_seed [0,1] which follows uniform distribution
+// as a seed and search it in the sum_probs via binary search. When we find
+// the closest sum_probs[i] of rand_seed, i is the integer that in
+// [0, zipf_sum_size] following Zipfian distribution with parameter alpha.
+// Finally, we can scale i to [0, max_key] scale.
+// In order to avoid that hot keys are close to each other and skew towards 0,
+// we use Rando64 to shuffle it.
+void InitializeHotKeyGenerator(double alpha) {
+  double c = 0;
+  for (int64_t i = 1; i <= zipf_sum_size; i++) {
+    c = c + (1.0 / std::pow(static_cast<double>(i), alpha));
+  }
+  c = 1.0 / c;
+
+  sum_probs[0] = 0;
+  for (int64_t i = 1; i <= zipf_sum_size; i++) {
+    sum_probs[i] =
+        sum_probs[i - 1] + c / std::pow(static_cast<double>(i), alpha);
+  }
+}
+
+// Generate one key that follows the Zipfian distribution. The skewness
+// is decided by the parameter alpha. Input is the rand_seed [0,1] and
+// the max of the key to be generated. If we directly return tmp_zipf_seed,
+// the closer to 0, the higher probability will be. To randomly distribute
+// the hot keys in [0, max_key], we use Random64 to shuffle it.
+int64_t GetOneHotKeyID(double rand_seed, int64_t max_key) {
+  int64_t low = 1, mid, high = zipf_sum_size, zipf = 0;
+  while (low <= high) {
+    mid = (low + high) / 2;
+    if (sum_probs[mid] >= rand_seed && sum_probs[mid - 1] < rand_seed) {
+      zipf = mid;
+      break;
+    } else if (sum_probs[mid] >= rand_seed) {
+      high = mid - 1;
+    } else {
+      low = mid + 1;
+    }
+  }
+  int64_t tmp_zipf_seed = zipf * max_key / zipf_sum_size;
+  Random64 rand_local(tmp_zipf_seed);
+  return rand_local.Next() % max_key;
+}
+
+void PoolSizeChangeThread(void* v) {
+  assert(FLAGS_compaction_thread_pool_adjust_interval > 0);
+  ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+  SharedState* shared = thread->shared;
+
+  while (true) {
+    {
+      MutexLock l(shared->GetMutex());
+      if (shared->ShouldStopBgThread()) {
+        shared->IncBgThreadsFinished();
+        if (shared->BgThreadsFinished()) {
+          shared->GetCondVar()->SignalAll();
+        }
+        return;
+      }
+    }
+
+    auto thread_pool_size_base = FLAGS_max_background_compactions;
+    auto thread_pool_size_var = FLAGS_compaction_thread_pool_variations;
+    int new_thread_pool_size =
+        thread_pool_size_base - thread_pool_size_var +
+        thread->rand.Next() % (thread_pool_size_var * 2 + 1);
+    if (new_thread_pool_size < 1) {
+      new_thread_pool_size = 1;
+    }
+    db_stress_env->SetBackgroundThreads(new_thread_pool_size,
+                                        ROCKSDB_NAMESPACE::Env::Priority::LOW);
+    // Sleep up to 3 seconds
+    db_stress_env->SleepForMicroseconds(
+        thread->rand.Next() % FLAGS_compaction_thread_pool_adjust_interval *
+            1000 +
+        1);
+  }
+}
+
+void DbVerificationThread(void* v) {
+  assert(FLAGS_continuous_verification_interval > 0);
+  auto* thread = reinterpret_cast<ThreadState*>(v);
+  SharedState* shared = thread->shared;
+  StressTest* stress_test = shared->GetStressTest();
+  assert(stress_test != nullptr);
+  while (true) {
+    {
+      MutexLock l(shared->GetMutex());
+      if (shared->ShouldStopBgThread()) {
+        shared->IncBgThreadsFinished();
+        if (shared->BgThreadsFinished()) {
+          shared->GetCondVar()->SignalAll();
+        }
+        return;
+      }
+    }
+    if (!shared->HasVerificationFailedYet()) {
+      stress_test->ContinuouslyVerifyDb(thread);
+    }
+    db_stress_env->SleepForMicroseconds(
+        thread->rand.Next() % FLAGS_continuous_verification_interval * 1000 +
+        1);
+  }
+}
+
+void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz) {
+  if (!FLAGS_verbose) {
+    return;
+  }
+  std::string tmp;
+  tmp.reserve(sz * 2 + 16);
+  char buf[4];
+  for (size_t i = 0; i < sz; i++) {
+    snprintf(buf, 4, "%X", value[i]);
+    tmp.append(buf);
+  }
+  auto key_str = Key(key);
+  Slice key_slice = key_str;
+  fprintf(stdout, "[CF %d] %s (%" PRIi64 ") == > (%" ROCKSDB_PRIszt ") %s\n",
+          cf, key_slice.ToString(true).c_str(), key, sz, tmp.c_str());
+}
+
+// Note that if hot_key_alpha != 0, it generates the key based on Zipfian
+// distribution. Keys are randomly scattered to [0, FLAGS_max_key]. It does
+// not ensure the order of the keys being generated and the keys does not have
+// the active range which is related to FLAGS_active_width.
+int64_t GenerateOneKey(ThreadState* thread, uint64_t iteration) {
+  const double completed_ratio =
+      static_cast<double>(iteration) / FLAGS_ops_per_thread;
+  const int64_t base_key = static_cast<int64_t>(
+      completed_ratio * (FLAGS_max_key - FLAGS_active_width));
+  int64_t rand_seed = base_key + thread->rand.Next() % FLAGS_active_width;
+  int64_t cur_key = rand_seed;
+  if (FLAGS_hot_key_alpha != 0) {
+    // If set the Zipfian distribution Alpha to non 0, use Zipfian
+    double float_rand =
+        (static_cast<double>(thread->rand.Next() % FLAGS_max_key)) /
+        FLAGS_max_key;
+    cur_key = GetOneHotKeyID(float_rand, FLAGS_max_key);
+  }
+  return cur_key;
+}
+
+// Note that if hot_key_alpha != 0, it generates the key based on Zipfian
+// distribution. Keys being generated are in random order.
+// If user want to generate keys based on uniform distribution, user needs to
+// set hot_key_alpha == 0. It will generate the random keys in increasing
+// order in the key array (ensure key[i] >= key[i+1]) and constrained in a
+// range related to FLAGS_active_width.
+std::vector<int64_t> GenerateNKeys(ThreadState* thread, int num_keys,
+                                   uint64_t iteration) {
+  const double completed_ratio =
+      static_cast<double>(iteration) / FLAGS_ops_per_thread;
+  const int64_t base_key = static_cast<int64_t>(
+      completed_ratio * (FLAGS_max_key - FLAGS_active_width));
+  std::vector<int64_t> keys;
+  keys.reserve(num_keys);
+  int64_t next_key = base_key + thread->rand.Next() % FLAGS_active_width;
+  keys.push_back(next_key);
+  for (int i = 1; i < num_keys; ++i) {
+    // Generate the key follows zipfian distribution
+    if (FLAGS_hot_key_alpha != 0) {
+      double float_rand =
+          (static_cast<double>(thread->rand.Next() % FLAGS_max_key)) /
+          FLAGS_max_key;
+      next_key = GetOneHotKeyID(float_rand, FLAGS_max_key);
+    } else {
+      // This may result in some duplicate keys
+      next_key = next_key + thread->rand.Next() %
+                                (FLAGS_active_width - (next_key - base_key));
+    }
+    keys.push_back(next_key);
+  }
+  return keys;
+}
+
+size_t GenerateValue(uint32_t rand, char* v, size_t max_sz) {
+  size_t value_sz =
+      ((rand % kRandomValueMaxFactor) + 1) * FLAGS_value_size_mult;
+  assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t));
+  (void)max_sz;
+  PutUnaligned(reinterpret_cast<uint32_t*>(v), rand);
+  for (size_t i = sizeof(uint32_t); i < value_sz; i++) {
+    v[i] = (char)(rand ^ i);
+  }
+  v[value_sz] = '\0';
+  return value_sz;  // the size of the value set.
+}
+
+uint32_t GetValueBase(Slice s) {
+  assert(s.size() >= sizeof(uint32_t));
+  uint32_t res;
+  GetUnaligned(reinterpret_cast<const uint32_t*>(s.data()), &res);
+  return res;
+}
+
+WideColumns GenerateWideColumns(uint32_t value_base, const Slice& slice) {
+  WideColumns columns;
+
+  constexpr size_t max_columns = 4;
+  const size_t num_columns = (value_base % max_columns) + 1;
+
+  columns.reserve(num_columns);
+
+  assert(slice.size() >= num_columns);
+
+  columns.emplace_back(kDefaultWideColumnName, slice);
+
+  for (size_t i = 1; i < num_columns; ++i) {
+    const Slice name(slice.data(), i);
+    const Slice value(slice.data() + i, slice.size() - i);
+
+    columns.emplace_back(name, value);
+  }
+
+  return columns;
+}
+
+WideColumns GenerateExpectedWideColumns(uint32_t value_base,
+                                        const Slice& slice) {
+  if (FLAGS_use_put_entity_one_in == 0 ||
+      (value_base % FLAGS_use_put_entity_one_in) != 0) {
+    return WideColumns{{kDefaultWideColumnName, slice}};
+  }
+
+  WideColumns columns = GenerateWideColumns(value_base, slice);
+
+  std::sort(columns.begin(), columns.end(),
+            [](const WideColumn& lhs, const WideColumn& rhs) {
+              return lhs.name().compare(rhs.name()) < 0;
+            });
+
+  return columns;
+}
+
+std::string GetNowNanos() {
+  uint64_t t = db_stress_env->NowNanos();
+  std::string ret;
+  PutFixed64(&ret, t);
+  return ret;
+}
+
+namespace {
+
+class MyXXH64Checksum : public FileChecksumGenerator {
+ public:
+  explicit MyXXH64Checksum(bool big) : big_(big) {
+    state_ = XXH64_createState();
+    XXH64_reset(state_, 0);
+  }
+
+  virtual ~MyXXH64Checksum() override { XXH64_freeState(state_); }
+
+  void Update(const char* data, size_t n) override {
+    XXH64_update(state_, data, n);
+  }
+
+  void Finalize() override {
+    assert(str_.empty());
+    uint64_t digest = XXH64_digest(state_);
+    // Store as little endian raw bytes
+    PutFixed64(&str_, digest);
+    if (big_) {
+      // Throw in some more data for stress testing (448 bits total)
+      PutFixed64(&str_, GetSliceHash64(str_));
+      PutFixed64(&str_, GetSliceHash64(str_));
+      PutFixed64(&str_, GetSliceHash64(str_));
+      PutFixed64(&str_, GetSliceHash64(str_));
+      PutFixed64(&str_, GetSliceHash64(str_));
+      PutFixed64(&str_, GetSliceHash64(str_));
+    }
+  }
+
+  std::string GetChecksum() const override {
+    assert(!str_.empty());
+    return str_;
+  }
+
+  const char* Name() const override {
+    return big_ ? "MyBigChecksum" : "MyXXH64Checksum";
+  }
+
+ private:
+  bool big_;
+  XXH64_state_t* state_;
+  std::string str_;
+};
+
+class DbStressChecksumGenFactory : public FileChecksumGenFactory {
+  std::string default_func_name_;
+
+  std::unique_ptr<FileChecksumGenerator> CreateFromFuncName(
+      const std::string& func_name) {
+    std::unique_ptr<FileChecksumGenerator> rv;
+    if (func_name == "FileChecksumCrc32c") {
+      rv.reset(new FileChecksumGenCrc32c(FileChecksumGenContext()));
+    } else if (func_name == "MyXXH64Checksum") {
+      rv.reset(new MyXXH64Checksum(false /* big */));
+    } else if (func_name == "MyBigChecksum") {
+      rv.reset(new MyXXH64Checksum(true /* big */));
+    } else {
+      // Should be a recognized function when we get here
+      assert(false);
+    }
+    return rv;
+  }
+
+ public:
+  explicit DbStressChecksumGenFactory(const std::string& default_func_name)
+      : default_func_name_(default_func_name) {}
+
+  std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+      const FileChecksumGenContext& context) override {
+    if (context.requested_checksum_func_name.empty()) {
+      return CreateFromFuncName(default_func_name_);
+    } else {
+      return CreateFromFuncName(context.requested_checksum_func_name);
+    }
+  }
+
+  const char* Name() const override { return "FileChecksumGenCrc32cFactory"; }
+};
+
+}  // namespace
+
+std::shared_ptr<FileChecksumGenFactory> GetFileChecksumImpl(
+    const std::string& name) {
+  // Translate from friendly names to internal names
+  std::string internal_name;
+  if (name == "crc32c") {
+    internal_name = "FileChecksumCrc32c";
+  } else if (name == "xxh64") {
+    internal_name = "MyXXH64Checksum";
+  } else if (name == "big") {
+    internal_name = "MyBigChecksum";
+  } else {
+    assert(name.empty() || name == "none");
+    return nullptr;
+  }
+  return std::make_shared<DbStressChecksumGenFactory>(internal_name);
+}
+
+Status DeleteFilesInDirectory(const std::string& dirname) {
+  std::vector<std::string> filenames;
+  Status s = Env::Default()->GetChildren(dirname, &filenames);
+  for (size_t i = 0; s.ok() && i < filenames.size(); ++i) {
+    s = Env::Default()->DeleteFile(dirname + "/" + filenames[i]);
+  }
+  return s;
+}
+
+Status SaveFilesInDirectory(const std::string& src_dirname,
+                            const std::string& dst_dirname) {
+  std::vector<std::string> filenames;
+  Status s = Env::Default()->GetChildren(src_dirname, &filenames);
+  for (size_t i = 0; s.ok() && i < filenames.size(); ++i) {
+    bool is_dir = false;
+    s = Env::Default()->IsDirectory(src_dirname + "/" + filenames[i], &is_dir);
+    if (s.ok()) {
+      if (is_dir) {
+        continue;
+      }
+      s = Env::Default()->LinkFile(src_dirname + "/" + filenames[i],
+                                   dst_dirname + "/" + filenames[i]);
+    }
+  }
+  return s;
+}
+
+Status InitUnverifiedSubdir(const std::string& dirname) {
+  Status s = Env::Default()->FileExists(dirname);
+  if (s.IsNotFound()) {
+    return Status::OK();
+  }
+
+  const std::string kUnverifiedDirname = dirname + "/unverified";
+  if (s.ok()) {
+    s = Env::Default()->CreateDirIfMissing(kUnverifiedDirname);
+  }
+  if (s.ok()) {
+    // It might already exist with some stale contents. Delete any such
+    // contents.
+    s = DeleteFilesInDirectory(kUnverifiedDirname);
+  }
+  if (s.ok()) {
+    s = SaveFilesInDirectory(dirname, kUnverifiedDirname);
+  }
+  return s;
+}
+
+Status DestroyUnverifiedSubdir(const std::string& dirname) {
+  Status s = Env::Default()->FileExists(dirname);
+  if (s.IsNotFound()) {
+    return Status::OK();
+  }
+
+  const std::string kUnverifiedDirname = dirname + "/unverified";
+  if (s.ok()) {
+    s = Env::Default()->FileExists(kUnverifiedDirname);
+  }
+  if (s.IsNotFound()) {
+    return Status::OK();
+  }
+
+  if (s.ok()) {
+    s = DeleteFilesInDirectory(kUnverifiedDirname);
+  }
+  if (s.ok()) {
+    s = Env::Default()->DeleteDir(kUnverifiedDirname);
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_common.h b/src/rocksdb/db_stress_tool/db_stress_common.h
new file mode 100644
index 000000000..45f3e9c19
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_common.h
@@ -0,0 +1,650 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The test uses an array to compare against values written to the database.
+// Keys written to the array are in 1:1 correspondence to the actual values in
+// the database according to the formula in the function GenerateValue.
+
+// Space is reserved in the array from 0 to FLAGS_max_key and values are
+// randomly written/deleted/read from those positions. During verification we
+// compare all the positions in the array. To shorten/elongate the running
+// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread,
+// (sometimes also FLAGS_threads).
+//
+// NOTE that if FLAGS_test_batches_snapshots is set, the test will have
+// different behavior. See comment of the flag for details.
+
+#ifdef GFLAGS
+#pragma once
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cinttypes>
+#include <exception>
+#include <queue>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "db_stress_tool/db_stress_env_wrapper.h"
+#include "db_stress_tool/db_stress_listener.h"
+#include "db_stress_tool/db_stress_shared_state.h"
+#include "db_stress_tool/db_stress_test_base.h"
+#include "logging/logging.h"
+#include "monitoring/histogram.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/debug.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/gflags_compat.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/blob_db/blob_db.h"
+#include "utilities/fault_injection_fs.h"
+#include "utilities/merge_operators.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+DECLARE_uint64(seed);
+DECLARE_bool(read_only);
+DECLARE_int64(max_key);
+DECLARE_double(hot_key_alpha);
+DECLARE_int32(max_key_len);
+DECLARE_string(key_len_percent_dist);
+DECLARE_int32(key_window_scale_factor);
+DECLARE_int32(column_families);
+DECLARE_string(options_file);
+DECLARE_int64(active_width);
+DECLARE_bool(test_batches_snapshots);
+DECLARE_bool(atomic_flush);
+DECLARE_int32(manual_wal_flush_one_in);
+DECLARE_bool(test_cf_consistency);
+DECLARE_bool(test_multi_ops_txns);
+DECLARE_int32(threads);
+DECLARE_int32(ttl);
+DECLARE_int32(value_size_mult);
+DECLARE_int32(compaction_readahead_size);
+DECLARE_bool(enable_pipelined_write);
+DECLARE_bool(verify_before_write);
+DECLARE_bool(histogram);
+DECLARE_bool(destroy_db_initially);
+DECLARE_bool(verbose);
+DECLARE_bool(progress_reports);
+DECLARE_uint64(db_write_buffer_size);
+DECLARE_int32(write_buffer_size);
+DECLARE_int32(max_write_buffer_number);
+DECLARE_int32(min_write_buffer_number_to_merge);
+DECLARE_int32(max_write_buffer_number_to_maintain);
+DECLARE_int64(max_write_buffer_size_to_maintain);
+DECLARE_double(memtable_prefix_bloom_size_ratio);
+DECLARE_bool(memtable_whole_key_filtering);
+DECLARE_int32(open_files);
+DECLARE_int64(compressed_cache_size);
+DECLARE_int32(compressed_cache_numshardbits);
+DECLARE_int32(compaction_style);
+DECLARE_int32(compaction_pri);
+DECLARE_int32(num_levels);
+DECLARE_int32(level0_file_num_compaction_trigger);
+DECLARE_int32(level0_slowdown_writes_trigger);
+DECLARE_int32(level0_stop_writes_trigger);
+DECLARE_int32(block_size);
+DECLARE_int32(format_version);
+DECLARE_int32(index_block_restart_interval);
+DECLARE_bool(disable_auto_compactions);
+DECLARE_int32(max_background_compactions);
+DECLARE_int32(num_bottom_pri_threads);
+DECLARE_int32(compaction_thread_pool_adjust_interval);
+DECLARE_int32(compaction_thread_pool_variations);
+DECLARE_int32(max_background_flushes);
+DECLARE_int32(universal_size_ratio);
+DECLARE_int32(universal_min_merge_width);
+DECLARE_int32(universal_max_merge_width);
+DECLARE_int32(universal_max_size_amplification_percent);
+DECLARE_int32(clear_column_family_one_in);
+DECLARE_int32(get_live_files_one_in);
+DECLARE_int32(get_sorted_wal_files_one_in);
+DECLARE_int32(get_current_wal_file_one_in);
+DECLARE_int32(set_options_one_in);
+DECLARE_int32(set_in_place_one_in);
+DECLARE_int64(cache_size);
+DECLARE_int32(cache_numshardbits);
+DECLARE_bool(cache_index_and_filter_blocks);
+DECLARE_bool(charge_compression_dictionary_building_buffer);
+DECLARE_bool(charge_filter_construction);
+DECLARE_bool(charge_table_reader);
+DECLARE_bool(charge_file_metadata);
+DECLARE_bool(charge_blob_cache);
+DECLARE_int32(top_level_index_pinning);
+DECLARE_int32(partition_pinning);
+DECLARE_int32(unpartitioned_pinning);
+DECLARE_string(cache_type);
+DECLARE_uint64(subcompactions);
+DECLARE_uint64(periodic_compaction_seconds);
+DECLARE_uint64(compaction_ttl);
+DECLARE_bool(allow_concurrent_memtable_write);
+DECLARE_double(experimental_mempurge_threshold);
+DECLARE_bool(enable_write_thread_adaptive_yield);
+DECLARE_int32(reopen);
+DECLARE_double(bloom_bits);
+DECLARE_int32(ribbon_starting_level);
+DECLARE_bool(partition_filters);
+DECLARE_bool(optimize_filters_for_memory);
+DECLARE_bool(detect_filter_construct_corruption);
+DECLARE_int32(index_type);
+DECLARE_int32(data_block_index_type);
+DECLARE_string(db);
+DECLARE_string(secondaries_base);
+DECLARE_bool(test_secondary);
+DECLARE_string(expected_values_dir);
+DECLARE_bool(verify_checksum);
+DECLARE_bool(mmap_read);
+DECLARE_bool(mmap_write);
+DECLARE_bool(use_direct_reads);
+DECLARE_bool(use_direct_io_for_flush_and_compaction);
+DECLARE_bool(mock_direct_io);
+DECLARE_bool(statistics);
+DECLARE_bool(sync);
+DECLARE_bool(use_fsync);
+DECLARE_uint64(stats_dump_period_sec);
+DECLARE_uint64(bytes_per_sync);
+DECLARE_uint64(wal_bytes_per_sync);
+DECLARE_int32(kill_random_test);
+DECLARE_string(kill_exclude_prefixes);
+DECLARE_bool(disable_wal);
+DECLARE_uint64(recycle_log_file_num);
+DECLARE_int64(target_file_size_base);
+DECLARE_int32(target_file_size_multiplier);
+DECLARE_uint64(max_bytes_for_level_base);
+DECLARE_double(max_bytes_for_level_multiplier);
+DECLARE_int32(range_deletion_width);
+DECLARE_uint64(rate_limiter_bytes_per_sec);
+DECLARE_bool(rate_limit_bg_reads);
+DECLARE_bool(rate_limit_user_ops);
+DECLARE_bool(rate_limit_auto_wal_flush);
+DECLARE_uint64(sst_file_manager_bytes_per_sec);
+DECLARE_uint64(sst_file_manager_bytes_per_truncate);
+DECLARE_bool(use_txn);
+DECLARE_uint64(txn_write_policy);
+DECLARE_bool(unordered_write);
+DECLARE_int32(backup_one_in);
+DECLARE_uint64(backup_max_size);
+DECLARE_int32(checkpoint_one_in);
+DECLARE_int32(ingest_external_file_one_in);
+DECLARE_int32(ingest_external_file_width);
+DECLARE_int32(compact_files_one_in);
+DECLARE_int32(compact_range_one_in);
+DECLARE_int32(mark_for_compaction_one_file_in);
+DECLARE_int32(flush_one_in);
+DECLARE_int32(pause_background_one_in);
+DECLARE_int32(compact_range_width);
+DECLARE_int32(acquire_snapshot_one_in);
+DECLARE_bool(compare_full_db_state_snapshot);
+DECLARE_uint64(snapshot_hold_ops);
+DECLARE_bool(long_running_snapshots);
+DECLARE_bool(use_multiget);
+DECLARE_int32(readpercent);
+DECLARE_int32(prefixpercent);
+DECLARE_int32(writepercent);
+DECLARE_int32(delpercent);
+DECLARE_int32(delrangepercent);
+DECLARE_int32(nooverwritepercent);
+DECLARE_int32(iterpercent);
+DECLARE_uint64(num_iterations);
+DECLARE_int32(customopspercent);
+DECLARE_string(compression_type);
+DECLARE_string(bottommost_compression_type);
+DECLARE_int32(compression_max_dict_bytes);
+DECLARE_int32(compression_zstd_max_train_bytes);
+DECLARE_int32(compression_parallel_threads);
+DECLARE_uint64(compression_max_dict_buffer_bytes);
+DECLARE_bool(compression_use_zstd_dict_trainer);
+DECLARE_string(checksum_type);
+DECLARE_string(env_uri);
+DECLARE_string(fs_uri);
+DECLARE_uint64(ops_per_thread);
+DECLARE_uint64(log2_keys_per_lock);
+DECLARE_uint64(max_manifest_file_size);
+DECLARE_bool(in_place_update);
+DECLARE_string(memtablerep);
+DECLARE_int32(prefix_size);
+DECLARE_bool(use_merge);
+DECLARE_uint32(use_put_entity_one_in);
+DECLARE_bool(use_full_merge_v1);
+DECLARE_int32(sync_wal_one_in);
+DECLARE_bool(avoid_unnecessary_blocking_io);
+DECLARE_bool(write_dbid_to_manifest);
+DECLARE_bool(avoid_flush_during_recovery);
+DECLARE_uint64(max_write_batch_group_size_bytes);
+DECLARE_bool(level_compaction_dynamic_level_bytes);
+DECLARE_int32(verify_checksum_one_in);
+DECLARE_int32(verify_db_one_in);
+DECLARE_int32(continuous_verification_interval);
+DECLARE_int32(get_property_one_in);
+DECLARE_string(file_checksum_impl);
+
+#ifndef ROCKSDB_LITE
+// Options for StackableDB-based BlobDB
+DECLARE_bool(use_blob_db);
+DECLARE_uint64(blob_db_min_blob_size);
+DECLARE_uint64(blob_db_bytes_per_sync);
+DECLARE_uint64(blob_db_file_size);
+DECLARE_bool(blob_db_enable_gc);
+DECLARE_double(blob_db_gc_cutoff);
+#endif  // !ROCKSDB_LITE
+
+// Options for integrated BlobDB
+DECLARE_bool(allow_setting_blob_options_dynamically);
+DECLARE_bool(enable_blob_files);
+DECLARE_uint64(min_blob_size);
+DECLARE_uint64(blob_file_size);
+DECLARE_string(blob_compression_type);
+DECLARE_bool(enable_blob_garbage_collection);
+DECLARE_double(blob_garbage_collection_age_cutoff);
+DECLARE_double(blob_garbage_collection_force_threshold);
+DECLARE_uint64(blob_compaction_readahead_size);
+DECLARE_int32(blob_file_starting_level);
+DECLARE_bool(use_blob_cache);
+DECLARE_bool(use_shared_block_and_blob_cache);
+DECLARE_uint64(blob_cache_size);
+DECLARE_int32(blob_cache_numshardbits);
+DECLARE_int32(prepopulate_blob_cache);
+
+DECLARE_int32(approximate_size_one_in);
+DECLARE_bool(sync_fault_injection);
+
+DECLARE_bool(best_efforts_recovery);
+DECLARE_bool(skip_verifydb);
+DECLARE_bool(enable_compaction_filter);
+DECLARE_bool(paranoid_file_checks);
+DECLARE_bool(fail_if_options_file_error);
+DECLARE_uint64(batch_protection_bytes_per_key);
+DECLARE_uint32(memtable_protection_bytes_per_key);
+
+DECLARE_uint64(user_timestamp_size);
+DECLARE_string(secondary_cache_uri);
+DECLARE_int32(secondary_cache_fault_one_in);
+
+DECLARE_int32(prepopulate_block_cache);
+
+DECLARE_bool(two_write_queues);
+#ifndef ROCKSDB_LITE
+DECLARE_bool(use_only_the_last_commit_time_batch_for_recovery);
+DECLARE_uint64(wp_snapshot_cache_bits);
+DECLARE_uint64(wp_commit_cache_bits);
+#endif  // !ROCKSDB_LITE
+
+DECLARE_bool(adaptive_readahead);
+DECLARE_bool(async_io);
+DECLARE_string(wal_compression);
+DECLARE_bool(verify_sst_unique_id_in_manifest);
+
+DECLARE_int32(create_timestamped_snapshot_one_in);
+
+DECLARE_bool(allow_data_in_errors);
+
+// Tiered storage
+DECLARE_bool(enable_tiered_storage);  // set last_level_temperature
+DECLARE_int64(preclude_last_level_data_seconds);
+DECLARE_int64(preserve_internal_time_seconds);
+
+DECLARE_int32(verify_iterator_with_expected_state_one_in);
+DECLARE_bool(preserve_unverified_changes);
+
+DECLARE_uint64(readahead_size);
+DECLARE_uint64(initial_auto_readahead_size);
+DECLARE_uint64(max_auto_readahead_size);
+DECLARE_uint64(num_file_reads_for_auto_readahead);
+
+constexpr long KB = 1024;
+constexpr int kRandomValueMaxFactor = 3;
+constexpr int kValueMaxLen = 100;
+
+// wrapped posix environment
+extern ROCKSDB_NAMESPACE::Env* db_stress_env;
+extern ROCKSDB_NAMESPACE::Env* db_stress_listener_env;
+extern std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs_guard;
+
+extern enum ROCKSDB_NAMESPACE::CompressionType compression_type_e;
+extern enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e;
+extern enum ROCKSDB_NAMESPACE::ChecksumType checksum_type_e;
+
+enum RepFactory { kSkipList, kHashSkipList, kVectorRep };
+
+inline enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "skip_list"))
+    return kSkipList;
+  else if (!strcasecmp(ctype, "prefix_hash"))
+    return kHashSkipList;
+  else if (!strcasecmp(ctype, "vector"))
+    return kVectorRep;
+
+  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
+  return kSkipList;
+}
+
+extern enum RepFactory FLAGS_rep_factory;
+
+namespace ROCKSDB_NAMESPACE {
+inline enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
+    const char* ctype) {
+  assert(ctype);
+
+  ROCKSDB_NAMESPACE::CompressionType ret_compression_type;
+
+  if (!strcasecmp(ctype, "disable")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kDisableCompressionOption;
+  } else if (!strcasecmp(ctype, "none")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kNoCompression;
+  } else if (!strcasecmp(ctype, "snappy")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kSnappyCompression;
+  } else if (!strcasecmp(ctype, "zlib")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kZlibCompression;
+  } else if (!strcasecmp(ctype, "bzip2")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kBZip2Compression;
+  } else if (!strcasecmp(ctype, "lz4")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kLZ4Compression;
+  } else if (!strcasecmp(ctype, "lz4hc")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kLZ4HCCompression;
+  } else if (!strcasecmp(ctype, "xpress")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kXpressCompression;
+  } else if (!strcasecmp(ctype, "zstd")) {
+    ret_compression_type = ROCKSDB_NAMESPACE::kZSTD;
+  } else {
+    fprintf(stderr, "Cannot parse compression type '%s'\n", ctype);
+    ret_compression_type =
+        ROCKSDB_NAMESPACE::kSnappyCompression;  // default value
+  }
+  if (ret_compression_type != ROCKSDB_NAMESPACE::kDisableCompressionOption &&
+      !CompressionTypeSupported(ret_compression_type)) {
+    // Use no compression will be more portable but considering this is
+    // only a stress test and snappy is widely available. Use snappy here.
+    ret_compression_type = ROCKSDB_NAMESPACE::kSnappyCompression;
+  }
+  return ret_compression_type;
+}
+
+inline enum ROCKSDB_NAMESPACE::ChecksumType StringToChecksumType(
+    const char* ctype) {
+  assert(ctype);
+  auto iter = ROCKSDB_NAMESPACE::checksum_type_string_map.find(ctype);
+  if (iter != ROCKSDB_NAMESPACE::checksum_type_string_map.end()) {
+    return iter->second;
+  }
+  fprintf(stderr, "Cannot parse checksum type '%s'\n", ctype);
+  return ROCKSDB_NAMESPACE::kCRC32c;
+}
+
+inline std::string ChecksumTypeToString(ROCKSDB_NAMESPACE::ChecksumType ctype) {
+  auto iter = std::find_if(
+      ROCKSDB_NAMESPACE::checksum_type_string_map.begin(),
+      ROCKSDB_NAMESPACE::checksum_type_string_map.end(),
+      [&](const std::pair<std::string, ROCKSDB_NAMESPACE::ChecksumType>&
+              name_and_enum_val) { return name_and_enum_val.second == ctype; });
+  assert(iter != ROCKSDB_NAMESPACE::checksum_type_string_map.end());
+  return iter->first;
+}
+
+inline std::vector<std::string> SplitString(std::string src) {
+  std::vector<std::string> ret;
+  if (src.empty()) {
+    return ret;
+  }
+  size_t pos = 0;
+  size_t pos_comma;
+  while ((pos_comma = src.find(',', pos)) != std::string::npos) {
+    ret.push_back(src.substr(pos, pos_comma - pos));
+    pos = pos_comma + 1;
+  }
+  ret.push_back(src.substr(pos, src.length()));
+  return ret;
+}
+
+#ifdef _MSC_VER
+#pragma warning(push)
+// truncation of constant value on static_cast
+#pragma warning(disable : 4309)
+#endif
+inline bool GetNextPrefix(const ROCKSDB_NAMESPACE::Slice& src, std::string* v) {
+  std::string ret = src.ToString();
+  for (int i = static_cast<int>(ret.size()) - 1; i >= 0; i--) {
+    if (ret[i] != static_cast<char>(255)) {
+      ret[i] = ret[i] + 1;
+      break;
+    } else if (i != 0) {
+      ret[i] = 0;
+    } else {
+      // all FF. No next prefix
+      return false;
+    }
+  }
+  *v = ret;
+  return true;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Append `val` to `*key` in fixed-width big-endian format
+extern inline void AppendIntToString(uint64_t val, std::string* key) {
+  // PutFixed64 uses little endian
+  PutFixed64(key, val);
+  // Reverse to get big endian
+  char* int_data = &((*key)[key->size() - sizeof(uint64_t)]);
+  for (size_t i = 0; i < sizeof(uint64_t) / 2; ++i) {
+    std::swap(int_data[i], int_data[sizeof(uint64_t) - 1 - i]);
+  }
+}
+
+// A struct for maintaining the parameters for generating variable length keys
+struct KeyGenContext {
+  // Number of adjacent keys in one cycle of key lengths
+  uint64_t window;
+  // Number of keys of each possible length in a given window
+  std::vector<uint64_t> weights;
+};
+extern KeyGenContext key_gen_ctx;
+
+// Generate a variable length key string from the given int64 val. The
+// order of the keys is preserved. The key could be anywhere from 8 to
+// max_key_len * 8 bytes.
+// The algorithm picks the length based on the
+// offset of the val within a configured window and the distribution of the
+// number of keys of various lengths in that window. For example, if x, y, x are
+// the weights assigned to each possible key length, the keys generated would be
+// - {0}...{x-1}
+// {(x-1),0}..{(x-1),(y-1)},{(x-1),(y-1),0}..{(x-1),(y-1),(z-1)} and so on.
+// Additionally, a trailer of 0-7 bytes could be appended.
+extern inline std::string Key(int64_t val) {
+  uint64_t window = key_gen_ctx.window;
+  size_t levels = key_gen_ctx.weights.size();
+  std::string key;
+  // Over-reserve and for now do not bother `shrink_to_fit()` since the key
+  // strings are transient.
+  key.reserve(FLAGS_max_key_len * 8);
+
+  uint64_t window_idx = static_cast<uint64_t>(val) / window;
+  uint64_t offset = static_cast<uint64_t>(val) % window;
+  for (size_t level = 0; level < levels; ++level) {
+    uint64_t weight = key_gen_ctx.weights[level];
+    uint64_t pfx;
+    if (level == 0) {
+      pfx = window_idx * weight;
+    } else {
+      pfx = 0;
+    }
+    pfx += offset >= weight ? weight - 1 : offset;
+    AppendIntToString(pfx, &key);
+    if (offset < weight) {
+      // Use the bottom 3 bits of offset as the number of trailing 'x's in the
+      // key. If the next key is going to be of the next level, then skip the
+      // trailer as it would break ordering. If the key length is already at
+      // max, skip the trailer.
+      if (offset < weight - 1 && level < levels - 1) {
+        size_t trailer_len = offset & 0x7;
+        key.append(trailer_len, 'x');
+      }
+      break;
+    }
+    offset -= weight;
+  }
+
+  return key;
+}
+
+// Given a string key, map it to an index into the expected values buffer
+extern inline bool GetIntVal(std::string big_endian_key, uint64_t* key_p) {
+  size_t size_key = big_endian_key.size();
+  std::vector<uint64_t> prefixes;
+
+  assert(size_key <= key_gen_ctx.weights.size() * sizeof(uint64_t));
+
+  std::string little_endian_key;
+  little_endian_key.resize(size_key);
+  for (size_t start = 0; start + sizeof(uint64_t) <= size_key;
+       start += sizeof(uint64_t)) {
+    size_t end = start + sizeof(uint64_t);
+    for (size_t i = 0; i < sizeof(uint64_t); ++i) {
+      little_endian_key[start + i] = big_endian_key[end - 1 - i];
+    }
+    Slice little_endian_slice =
+        Slice(&little_endian_key[start], sizeof(uint64_t));
+    uint64_t pfx;
+    if (!GetFixed64(&little_endian_slice, &pfx)) {
+      return false;
+    }
+    prefixes.emplace_back(pfx);
+  }
+
+  uint64_t key = 0;
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    uint64_t pfx = prefixes[i];
+    key += (pfx / key_gen_ctx.weights[i]) * key_gen_ctx.window +
+           pfx % key_gen_ctx.weights[i];
+    if (i < prefixes.size() - 1) {
+      // The encoding writes a `key_gen_ctx.weights[i] - 1` that counts for
+      // `key_gen_ctx.weights[i]` when there are more prefixes to come. So we
+      // need to add back the one here as we're at a non-last prefix.
+      ++key;
+    }
+  }
+  *key_p = key;
+  return true;
+}
+
+// Given a string prefix, map it to the first corresponding index in the
+// expected values buffer.
+inline bool GetFirstIntValInPrefix(std::string big_endian_prefix,
+                                   uint64_t* key_p) {
+  size_t size_key = big_endian_prefix.size();
+  // Pad with zeros to make it a multiple of 8. This function may be called
+  // with a prefix, in which case we return the first index that falls
+  // inside or outside that prefix, dependeing on whether the prefix is
+  // the start of upper bound of a scan
+  unsigned int pad = sizeof(uint64_t) - (size_key % sizeof(uint64_t));
+  if (pad < sizeof(uint64_t)) {
+    big_endian_prefix.append(pad, '\0');
+  }
+  return GetIntVal(std::move(big_endian_prefix), key_p);
+}
+
+extern inline uint64_t GetPrefixKeyCount(const std::string& prefix,
+                                         const std::string& ub) {
+  uint64_t start = 0;
+  uint64_t end = 0;
+
+  if (!GetFirstIntValInPrefix(prefix, &start) ||
+      !GetFirstIntValInPrefix(ub, &end)) {
+    return 0;
+  }
+
+  return end - start;
+}
+
+extern inline std::string StringToHex(const std::string& str) {
+  std::string result = "0x";
+  result.append(Slice(str).ToString(true));
+  return result;
+}
+
+// Unified output format for double parameters
+extern inline std::string FormatDoubleParam(double param) {
+  return std::to_string(param);
+}
+
+// Make sure that double parameter is a value we can reproduce by
+// re-inputting the value printed.
+extern inline void SanitizeDoubleParam(double* param) {
+  *param = std::atof(FormatDoubleParam(*param).c_str());
+}
+
+extern void PoolSizeChangeThread(void* v);
+
+extern void DbVerificationThread(void* v);
+
+extern void TimestampedSnapshotsThread(void* v);
+
+extern void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz);
+
+extern int64_t GenerateOneKey(ThreadState* thread, uint64_t iteration);
+
+extern std::vector<int64_t> GenerateNKeys(ThreadState* thread, int num_keys,
+                                          uint64_t iteration);
+
+extern size_t GenerateValue(uint32_t rand, char* v, size_t max_sz);
+extern uint32_t GetValueBase(Slice s);
+
+extern WideColumns GenerateWideColumns(uint32_t value_base, const Slice& slice);
+extern WideColumns GenerateExpectedWideColumns(uint32_t value_base,
+                                               const Slice& slice);
+
+extern StressTest* CreateCfConsistencyStressTest();
+extern StressTest* CreateBatchedOpsStressTest();
+extern StressTest* CreateNonBatchedOpsStressTest();
+extern StressTest* CreateMultiOpsTxnsStressTest();
+extern void CheckAndSetOptionsForMultiOpsTxnStressTest();
+extern void InitializeHotKeyGenerator(double alpha);
+extern int64_t GetOneHotKeyID(double rand_seed, int64_t max_key);
+
+extern std::string GetNowNanos();
+
+std::shared_ptr<FileChecksumGenFactory> GetFileChecksumImpl(
+    const std::string& name);
+
+Status DeleteFilesInDirectory(const std::string& dirname);
+Status SaveFilesInDirectory(const std::string& src_dirname,
+                            const std::string& dst_dirname);
+Status DestroyUnverifiedSubdir(const std::string& dirname);
+Status InitUnverifiedSubdir(const std::string& dirname);
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_compaction_filter.h b/src/rocksdb/db_stress_tool/db_stress_compaction_filter.h
new file mode 100644
index 000000000..408bb48f3
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_compaction_filter.h
@@ -0,0 +1,96 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db_stress_tool/db_stress_common.h"
+#include "db_stress_tool/db_stress_shared_state.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// DbStressCompactionFilter is safe to use with db_stress as it does not perform
+// any mutation. It only makes `kRemove` decisions for keys that are already
+// non-existent according to the `SharedState`.
+class DbStressCompactionFilter : public CompactionFilter {
+ public:
+  DbStressCompactionFilter(SharedState* state, int cf_id)
+      : state_(state), cf_id_(cf_id) {}
+
+  Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+                    const Slice& /*existing_value*/, std::string* /*new_value*/,
+                    std::string* /*skip_until*/) const override {
+    if (state_ == nullptr) {
+      return Decision::kKeep;
+    }
+    if (key.empty() || ('0' <= key[0] && key[0] <= '9')) {
+      // It is likely leftover from a test_batches_snapshots run. Below this
+      // conditional, the test_batches_snapshots key format is not handled
+      // properly. Just keep it to be safe.
+      return Decision::kKeep;
+    }
+    uint64_t key_num = 0;
+    {
+      Slice ukey_without_ts = key;
+      assert(ukey_without_ts.size() >= FLAGS_user_timestamp_size);
+      ukey_without_ts.remove_suffix(FLAGS_user_timestamp_size);
+      [[maybe_unused]] bool ok =
+          GetIntVal(ukey_without_ts.ToString(), &key_num);
+      assert(ok);
+    }
+    port::Mutex* key_mutex = state_->GetMutexForKey(cf_id_, key_num);
+    if (!key_mutex->TryLock()) {
+      return Decision::kKeep;
+    }
+    // Reaching here means we acquired the lock.
+
+    bool key_exists = state_->Exists(cf_id_, key_num);
+    const bool allow_overwrite = state_->AllowsOverwrite(key_num);
+
+    key_mutex->Unlock();
+
+    if (!key_exists) {
+      return allow_overwrite ? Decision::kRemove : Decision::kPurge;
+    }
+    return Decision::kKeep;
+  }
+
+  const char* Name() const override { return "DbStressCompactionFilter"; }
+
+ private:
+  SharedState* const state_;
+  const int cf_id_;
+};
+
+class DbStressCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  DbStressCompactionFilterFactory() : state_(nullptr) {}
+
+  void SetSharedState(SharedState* state) {
+    MutexLock state_mutex_guard(&state_mutex_);
+    state_ = state;
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    MutexLock state_mutex_guard(&state_mutex_);
+    return std::unique_ptr<CompactionFilter>(
+        new DbStressCompactionFilter(state_, context.column_family_id));
+  }
+
+  const char* Name() const override {
+    return "DbStressCompactionFilterFactory";
+  }
+
+ private:
+  port::Mutex state_mutex_;
+  SharedState* state_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db_stress_tool/db_stress_driver.cc b/src/rocksdb/db_stress_tool/db_stress_driver.cc
new file mode 100644
index 000000000..ed1240e00
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_driver.cc
@@ -0,0 +1,212 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_common.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+void ThreadBody(void* v) {
+  ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+  SharedState* shared = thread->shared;
+
+  if (!FLAGS_skip_verifydb && shared->ShouldVerifyAtBeginning()) {
+    thread->shared->GetStressTest()->VerifyDb(thread);
+  }
+  {
+    MutexLock l(shared->GetMutex());
+    shared->IncInitialized();
+    if (shared->AllInitialized()) {
+      shared->GetCondVar()->SignalAll();
+    }
+    while (!shared->Started()) {
+      shared->GetCondVar()->Wait();
+    }
+  }
+  thread->shared->GetStressTest()->OperateDb(thread);
+
+  {
+    MutexLock l(shared->GetMutex());
+    shared->IncOperated();
+    if (shared->AllOperated()) {
+      shared->GetCondVar()->SignalAll();
+    }
+    while (!shared->VerifyStarted()) {
+      shared->GetCondVar()->Wait();
+    }
+  }
+
+  if (!FLAGS_skip_verifydb) {
+    thread->shared->GetStressTest()->VerifyDb(thread);
+  }
+
+  {
+    MutexLock l(shared->GetMutex());
+    shared->IncDone();
+    if (shared->AllDone()) {
+      shared->GetCondVar()->SignalAll();
+    }
+  }
+}
+
+bool RunStressTest(StressTest* stress) {
+  SystemClock* clock = db_stress_env->GetSystemClock().get();
+
+  SharedState shared(db_stress_env, stress);
+
+  if (shared.ShouldVerifyAtBeginning() && FLAGS_preserve_unverified_changes) {
+    Status s = InitUnverifiedSubdir(FLAGS_db);
+    if (s.ok() && !FLAGS_expected_values_dir.empty()) {
+      s = InitUnverifiedSubdir(FLAGS_expected_values_dir);
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to setup unverified state dir: %s\n",
+              s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  stress->InitDb(&shared);
+  stress->FinishInitDb(&shared);
+
+  if (FLAGS_sync_fault_injection) {
+    fault_fs_guard->SetFilesystemDirectWritable(false);
+  }
+  if (FLAGS_write_fault_one_in) {
+    fault_fs_guard->EnableWriteErrorInjection();
+  }
+
+  uint32_t n = FLAGS_threads;
+  uint64_t now = clock->NowMicros();
+  fprintf(stdout, "%s Initializing worker threads\n",
+          clock->TimeToString(now / 1000000).c_str());
+
+  shared.SetThreads(n);
+
+  if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
+    shared.IncBgThreads();
+  }
+
+  if (FLAGS_continuous_verification_interval > 0) {
+    shared.IncBgThreads();
+  }
+
+  std::vector<ThreadState*> threads(n);
+  for (uint32_t i = 0; i < n; i++) {
+    threads[i] = new ThreadState(i, &shared);
+    db_stress_env->StartThread(ThreadBody, threads[i]);
+  }
+
+  ThreadState bg_thread(0, &shared);
+  if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
+    db_stress_env->StartThread(PoolSizeChangeThread, &bg_thread);
+  }
+
+  ThreadState continuous_verification_thread(0, &shared);
+  if (FLAGS_continuous_verification_interval > 0) {
+    db_stress_env->StartThread(DbVerificationThread,
+                               &continuous_verification_thread);
+  }
+
+  // Each thread goes through the following states:
+  // initializing -> wait for others to init -> read/populate/depopulate
+  // wait for others to operate -> verify -> done
+
+  {
+    MutexLock l(shared.GetMutex());
+    while (!shared.AllInitialized()) {
+      shared.GetCondVar()->Wait();
+    }
+    if (shared.ShouldVerifyAtBeginning()) {
+      if (shared.HasVerificationFailedYet()) {
+        fprintf(stderr, "Crash-recovery verification failed :(\n");
+      } else {
+        fprintf(stdout, "Crash-recovery verification passed :)\n");
+        Status s = DestroyUnverifiedSubdir(FLAGS_db);
+        if (s.ok() && !FLAGS_expected_values_dir.empty()) {
+          s = DestroyUnverifiedSubdir(FLAGS_expected_values_dir);
+        }
+        if (!s.ok()) {
+          fprintf(stderr, "Failed to cleanup unverified state dir: %s\n",
+                  s.ToString().c_str());
+          exit(1);
+        }
+      }
+    }
+
+    // This is after the verification step to avoid making all those `Get()`s
+    // and `MultiGet()`s contend on the DB-wide trace mutex.
+    if (!FLAGS_expected_values_dir.empty()) {
+      stress->TrackExpectedState(&shared);
+    }
+
+    now = clock->NowMicros();
+    fprintf(stdout, "%s Starting database operations\n",
+            clock->TimeToString(now / 1000000).c_str());
+
+    shared.SetStart();
+    shared.GetCondVar()->SignalAll();
+    while (!shared.AllOperated()) {
+      shared.GetCondVar()->Wait();
+    }
+
+    now = clock->NowMicros();
+    if (FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "%s Limited verification already done during gets\n",
+              clock->TimeToString((uint64_t)now / 1000000).c_str());
+    } else if (FLAGS_skip_verifydb) {
+      fprintf(stdout, "%s Verification skipped\n",
+              clock->TimeToString((uint64_t)now / 1000000).c_str());
+    } else {
+      fprintf(stdout, "%s Starting verification\n",
+              clock->TimeToString((uint64_t)now / 1000000).c_str());
+    }
+
+    shared.SetStartVerify();
+    shared.GetCondVar()->SignalAll();
+    while (!shared.AllDone()) {
+      shared.GetCondVar()->Wait();
+    }
+  }
+
+  for (unsigned int i = 1; i < n; i++) {
+    threads[0]->stats.Merge(threads[i]->stats);
+  }
+  threads[0]->stats.Report("Stress Test");
+
+  for (unsigned int i = 0; i < n; i++) {
+    delete threads[i];
+    threads[i] = nullptr;
+  }
+  now = clock->NowMicros();
+  if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots &&
+      !shared.HasVerificationFailedYet()) {
+    fprintf(stdout, "%s Verification successful\n",
+            clock->TimeToString(now / 1000000).c_str());
+  }
+  stress->PrintStatistics();
+
+  if (FLAGS_compaction_thread_pool_adjust_interval > 0 ||
+      FLAGS_continuous_verification_interval > 0) {
+    MutexLock l(shared.GetMutex());
+    shared.SetShouldStopBgThread();
+    while (!shared.BgThreadsFinished()) {
+      shared.GetCondVar()->Wait();
+    }
+  }
+
+  if (shared.HasVerificationFailedYet()) {
+    fprintf(stderr, "Verification failed :(\n");
+    return false;
+  }
+  return true;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_driver.h b/src/rocksdb/db_stress_tool/db_stress_driver.h
new file mode 100644
index 000000000..ff701fcb2
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_driver.h
@@ -0,0 +1,17 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#pragma once
+#include "db_stress_tool/db_stress_test_base.h"
+namespace ROCKSDB_NAMESPACE {
+extern void ThreadBody(void* /*thread_state*/);
+extern bool RunStressTest(StressTest*);
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_env_wrapper.h b/src/rocksdb/db_stress_tool/db_stress_env_wrapper.h
new file mode 100644
index 000000000..21f6db2ab
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_env_wrapper.h
@@ -0,0 +1,42 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#pragma once
+#include "db_stress_tool/db_stress_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DbStressEnvWrapper : public EnvWrapper {
+ public:
+  explicit DbStressEnvWrapper(Env* t) : EnvWrapper(t) {}
+  static const char* kClassName() { return "DbStressEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  Status DeleteFile(const std::string& f) override {
+    // We determine whether it is a manifest file by searching a strong,
+    // so that there will be false positive if the directory path contains the
+    // keyword but it is unlikely.
+    // Checkpoint, backup, and restore directories needs to be exempted.
+    if (!if_preserve_all_manifests ||
+        f.find("MANIFEST-") == std::string::npos ||
+        f.find("checkpoint") != std::string::npos ||
+        f.find(".backup") != std::string::npos ||
+        f.find(".restore") != std::string::npos) {
+      return target()->DeleteFile(f);
+    }
+    // Rename the file instead of deletion to keep the history, and
+    // at the same time it is not visible to RocksDB.
+    return target()->RenameFile(f, f + "_renamed_");
+  }
+
+  // If true, all manifest files will not be delted in DeleteFile().
+  bool if_preserve_all_manifests = true;
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_gflags.cc b/src/rocksdb/db_stress_tool/db_stress_gflags.cc
new file mode 100644
index 000000000..7adc66509
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_gflags.cc
@@ -0,0 +1,1074 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_common.h"
+
+static bool ValidateUint32Range(const char* flagname, uint64_t value) {
+  if (value > std::numeric_limits<uint32_t>::max()) {
+    fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
+            (unsigned long)value);
+    return false;
+  }
+  return true;
+}
+
+DEFINE_uint64(seed, 2341234,
+              "Seed for PRNG. When --nooverwritepercent is "
+              "nonzero and --expected_values_dir is nonempty, this value "
+              "must be fixed across invocations.");
+static const bool FLAGS_seed_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
+
+DEFINE_bool(read_only, false, "True if open DB in read-only mode during tests");
+
+DEFINE_int64(max_key, 1 * KB * KB,
+             "Max number of key/values to place in database");
+
+DEFINE_int32(max_key_len, 3, "Maximum length of a key in 8-byte units");
+
+DEFINE_string(key_len_percent_dist, "",
+              "Percentages of keys of various lengths. For example, 1,30,69 "
+              "means 1% of keys are 8 bytes, 30% are 16 bytes, and 69% are "
+              "24 bytes. If not specified, it will be evenly distributed");
+
+DEFINE_int32(key_window_scale_factor, 10,
+             "This value will be multiplied by 100 to come up with a window "
+             "size for varying the key length");
+
+DEFINE_int32(column_families, 10, "Number of column families");
+
+DEFINE_double(
+    hot_key_alpha, 0,
+    "Use Zipfian distribution to generate the key "
+    "distribution. If it is not specified, write path will use random "
+    "distribution to generate the keys. The parameter is [0, double_max]). "
+    "However, the larger alpha is, the more shewed will be. If alpha is "
+    "larger than 2, it is likely that only 1 key will be accessed. The "
+    "Recommended value is [0.8-1.5]. The distribution is also related to "
+    "max_key and total iterations of generating the hot key. ");
+
+DEFINE_string(
+    options_file, "",
+    "The path to a RocksDB options file.  If specified, then db_stress will "
+    "run with the RocksDB options in the default column family of the "
+    "specified options file. Note that, when an options file is provided, "
+    "db_stress will ignore the flag values for all options that may be passed "
+    "via options file.");
+
+DEFINE_int64(
+    active_width, 0,
+    "Number of keys in active span of the key-range at any given time. The "
+    "span begins with its left endpoint at key 0, gradually moves rightwards, "
+    "and ends with its right endpoint at max_key. If set to 0, active_width "
+    "will be sanitized to be equal to max_key.");
+
+// TODO(noetzli) Add support for single deletes
+DEFINE_bool(test_batches_snapshots, false,
+            "If set, the test uses MultiGet(), MultiPut() and MultiDelete()"
+            " which read/write/delete multiple keys in a batch. In this mode,"
+            " we do not verify db content by comparing the content with the "
+            "pre-allocated array. Instead, we do partial verification inside"
+            " MultiGet() by checking various values in a batch. Benefit of"
+            " this mode:\n"
+            "\t(a) No need to acquire mutexes during writes (less cache "
+            "flushes in multi-core leading to speed up)\n"
+            "\t(b) No long validation at the end (more speed up)\n"
+            "\t(c) Test snapshot and atomicity of batch writes");
+
+DEFINE_bool(atomic_flush, false,
+            "If set, enables atomic flush in the options.\n");
+
+DEFINE_int32(
+    manual_wal_flush_one_in, 0,
+    "If non-zero, then `FlushWAL(bool sync)`, where `bool sync` is randomly "
+    "decided, will be explictly called in db stress once for every N ops "
+    "on average. Setting `manual_wal_flush_one_in` to be greater than 0 "
+    "implies `Options::manual_wal_flush = true` is set.");
+
+DEFINE_bool(test_cf_consistency, false,
+            "If set, runs the stress test dedicated to verifying writes to "
+            "multiple column families are consistent. Setting this implies "
+            "`atomic_flush=true` is set true if `disable_wal=false`.\n");
+
+DEFINE_bool(test_multi_ops_txns, false,
+            "If set, runs stress test dedicated to verifying multi-ops "
+            "transactions on a simple relational table with primary and "
+            "secondary index.");
+
+DEFINE_int32(threads, 32, "Number of concurrent threads to run.");
+
+DEFINE_int32(ttl, -1,
+             "Opens the db with this ttl value if this is not -1. "
+             "Carefully specify a large value such that verifications on "
+             "deleted values don't fail");
+
+DEFINE_int32(value_size_mult, 8,
+             "Size of value will be this number times rand_int(1,3) bytes");
+
+DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
+
+DEFINE_bool(enable_pipelined_write, false, "Pipeline WAL/memtable writes");
+
+DEFINE_bool(verify_before_write, false, "Verify before write");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_bool(destroy_db_initially, true,
+            "Destroys the database dir before start if this is true");
+
+DEFINE_bool(verbose, false, "Verbose");
+
+DEFINE_bool(progress_reports, true,
+            "If true, db_stress will report number of finished operations");
+
+DEFINE_uint64(db_write_buffer_size,
+              ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
+              "Number of bytes to buffer in all memtables before compacting");
+
+DEFINE_int32(
+    write_buffer_size,
+    static_cast<int32_t>(ROCKSDB_NAMESPACE::Options().write_buffer_size),
+    "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+             ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
+             "The number of in-memory memtables. "
+             "Each memtable is of size FLAGS_write_buffer_size.");
+
+DEFINE_int32(min_write_buffer_number_to_merge,
+             ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
+             "The minimum number of write buffers that will be merged together "
+             "before writing to storage. This is cheap because it is an "
+             "in-memory merge. If this feature is not enabled, then all these "
+             "write buffers are flushed to L0 as separate files and this "
+             "increases read amplification because a get request has to check "
+             "in all of these files. Also, an in-memory merge may result in "
+             "writing less data to storage if there are duplicate records in"
+             " each of these individual write buffers.");
+
+DEFINE_int32(max_write_buffer_number_to_maintain,
+             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
+             "The total maximum number of write buffers to maintain in memory "
+             "including copies of buffers that have already been flushed. "
+             "Unlike max_write_buffer_number, this parameter does not affect "
+             "flushing. This controls the minimum amount of write history "
+             "that will be available in memory for conflict checking when "
+             "Transactions are used. If this value is too low, some "
+             "transactions may fail at commit time due to not being able to "
+             "determine whether there were any write conflicts. Setting this "
+             "value to 0 will cause write buffers to be freed immediately "
+             "after they are flushed.  If this value is set to -1, "
+             "'max_write_buffer_number' will be used.");
+
+DEFINE_int64(max_write_buffer_size_to_maintain,
+             ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
+             "The total maximum size of write buffers to maintain in memory "
+             "including copies of buffers that have already been flushed. "
+             "Unlike max_write_buffer_number, this parameter does not affect "
+             "flushing. This controls the minimum amount of write history "
+             "that will be available in memory for conflict checking when "
+             "Transactions are used. If this value is too low, some "
+             "transactions may fail at commit time due to not being able to "
+             "determine whether there were any write conflicts. Setting this "
+             "value to 0 will cause write buffers to be freed immediately "
+             "after they are flushed.  If this value is set to -1, "
+             "'max_write_buffer_number' will be used.");
+
+DEFINE_double(memtable_prefix_bloom_size_ratio,
+              ROCKSDB_NAMESPACE::Options().memtable_prefix_bloom_size_ratio,
+              "creates prefix blooms for memtables, each with size "
+              "`write_buffer_size * memtable_prefix_bloom_size_ratio`.");
+
+DEFINE_bool(memtable_whole_key_filtering,
+            ROCKSDB_NAMESPACE::Options().memtable_whole_key_filtering,
+            "Enable whole key filtering in memtables.");
+
+DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
+             "Maximum number of files to keep open at the same time "
+             "(use default if == 0)");
+
+DEFINE_int64(compressed_cache_size, 0,
+             "Number of bytes to use as a cache of compressed data."
+             " 0 means use default settings.");
+
+DEFINE_int32(
+    compressed_cache_numshardbits, -1,
+    "Number of shards for the compressed block cache is 2 ** "
+    "compressed_cache_numshardbits. Negative value means default settings. "
+    "This is applied only if compressed_cache_size is greater than 0.");
+
+DEFINE_int32(compaction_style, ROCKSDB_NAMESPACE::Options().compaction_style,
+             "");
+
+DEFINE_int32(compaction_pri, ROCKSDB_NAMESPACE::Options().compaction_pri,
+             "Which file from a level should be picked to merge to the next "
+             "level in level-based compaction");
+
+DEFINE_int32(num_levels, ROCKSDB_NAMESPACE::Options().num_levels,
+             "Number of levels in the DB");
+
+DEFINE_int32(level0_file_num_compaction_trigger,
+             ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
+             "Level0 compaction start trigger");
+
+DEFINE_int32(level0_slowdown_writes_trigger,
+             ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
+             "Number of files in level-0 that will slow down writes");
+
+DEFINE_int32(level0_stop_writes_trigger,
+             ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
+             "Number of files in level-0 that will trigger put stop.");
+
+DEFINE_int32(block_size,
+             static_cast<int32_t>(
+                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
+             "Number of bytes in a block.");
+
+DEFINE_int32(format_version,
+             static_cast<int32_t>(
+                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
+             "Format version of SST files.");
+
+DEFINE_int32(
+    index_block_restart_interval,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
+    "Number of keys between restart points "
+    "for delta encoding of keys in index block.");
+
+DEFINE_bool(disable_auto_compactions,
+            ROCKSDB_NAMESPACE::Options().disable_auto_compactions,
+            "If true, RocksDB internally will not trigger compactions.");
+
+DEFINE_int32(max_background_compactions,
+             ROCKSDB_NAMESPACE::Options().max_background_compactions,
+             "The maximum number of concurrent background compactions "
+             "that can occur in parallel.");
+
+DEFINE_int32(num_bottom_pri_threads, 0,
+             "The number of threads in the bottom-priority thread pool (used "
+             "by universal compaction only).");
+
+DEFINE_int32(compaction_thread_pool_adjust_interval, 0,
+             "The interval (in milliseconds) to adjust compaction thread pool "
+             "size. Don't change it periodically if the value is 0.");
+
+DEFINE_int32(compaction_thread_pool_variations, 2,
+             "Range of background thread pool size variations when adjusted "
+             "periodically.");
+
+DEFINE_int32(max_background_flushes,
+             ROCKSDB_NAMESPACE::Options().max_background_flushes,
+             "The maximum number of concurrent background flushes "
+             "that can occur in parallel.");
+
+DEFINE_int32(universal_size_ratio, 0,
+             "The ratio of file sizes that trigger"
+             " compaction in universal style");
+
+DEFINE_int32(universal_min_merge_width, 0,
+             "The minimum number of files to "
+             "compact in universal style compaction");
+
+DEFINE_int32(universal_max_merge_width, 0,
+             "The max number of files to compact"
+             " in universal style compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+             "The max size amplification for universal style compaction");
+
+DEFINE_int32(clear_column_family_one_in, 1000000,
+             "With a chance of 1/N, delete a column family and then recreate "
+             "it again. If N == 0, never drop/create column families. "
+             "When test_batches_snapshots is true, this flag has no effect");
+
+DEFINE_int32(get_live_files_one_in, 1000000,
+             "With a chance of 1/N, call GetLiveFiles to verify if it returns "
+             "correctly. If N == 0, do not call the interface.");
+
+DEFINE_int32(
+    get_sorted_wal_files_one_in, 1000000,
+    "With a chance of 1/N, call GetSortedWalFiles to verify if it returns "
+    "correctly. (Note that this API may legitimately return an error.) If N == "
+    "0, do not call the interface.");
+
+DEFINE_int32(
+    get_current_wal_file_one_in, 1000000,
+    "With a chance of 1/N, call GetCurrentWalFile to verify if it returns "
+    "correctly. (Note that this API may legitimately return an error.) If N == "
+    "0, do not call the interface.");
+
+DEFINE_int32(set_options_one_in, 0,
+             "With a chance of 1/N, change some random options");
+
+DEFINE_int32(set_in_place_one_in, 0,
+             "With a chance of 1/N, toggle in place support option");
+
+DEFINE_int64(cache_size, 2LL * KB * KB * KB,
+             "Number of bytes to use as a cache of uncompressed data.");
+
+DEFINE_int32(cache_numshardbits, 6,
+             "Number of shards for the block cache"
+             " is 2 ** cache_numshardbits. Negative means use default settings."
+             " This is applied only if FLAGS_cache_size is greater than 0.");
+
+DEFINE_bool(cache_index_and_filter_blocks, false,
+            "True if indexes/filters should be cached in block cache.");
+
+DEFINE_bool(charge_compression_dictionary_building_buffer, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "CacheEntryRole::kCompressionDictionaryBuildingBuffer");
+
+DEFINE_bool(charge_filter_construction, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "CacheEntryRole::kFilterConstruction");
+
+DEFINE_bool(charge_table_reader, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "CacheEntryRole::kBlockBasedTableReader");
+
+DEFINE_bool(charge_file_metadata, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "kFileMetadata");
+
+DEFINE_bool(charge_blob_cache, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "kBlobCache");
+
+DEFINE_int32(
+    top_level_index_pinning,
+    static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
+    "Type of pinning for top-level indexes into metadata partitions (see "
+    "`enum PinningTier` in table.h)");
+
+DEFINE_int32(
+    partition_pinning,
+    static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
+    "Type of pinning for metadata partitions (see `enum PinningTier` in "
+    "table.h)");
+
+DEFINE_int32(
+    unpartitioned_pinning,
+    static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
+    "Type of pinning for unpartitioned metadata blocks (see `enum PinningTier` "
+    "in table.h)");
+
+DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
+
+DEFINE_uint64(subcompactions, 1,
+              "Maximum number of subcompactions to divide L0-L1 compactions "
+              "into.");
+
+DEFINE_uint64(periodic_compaction_seconds, 1000,
+              "Files older than this value will be picked up for compaction.");
+
+DEFINE_uint64(compaction_ttl, 1000,
+              "Files older than TTL will be compacted to the next level.");
+
+DEFINE_bool(allow_concurrent_memtable_write, false,
+            "Allow multi-writers to update mem tables in parallel.");
+
+DEFINE_double(experimental_mempurge_threshold, 0.0,
+              "Maximum estimated useful payload that triggers a "
+              "mempurge process to collect memtable garbage bytes.");
+
+DEFINE_bool(enable_write_thread_adaptive_yield, true,
+            "Use a yielding spin loop for brief writer thread waits.");
+
+#ifndef ROCKSDB_LITE
+// Options for StackableDB-based BlobDB
+DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB.");
+
+DEFINE_uint64(
+    blob_db_min_blob_size,
+    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
+    "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
+    "smaller than this will be inlined with the key in the LSM tree.");
+
+DEFINE_uint64(
+    blob_db_bytes_per_sync,
+    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
+    "[Stacked BlobDB] Sync blob files once per every N bytes written.");
+
+DEFINE_uint64(blob_db_file_size,
+              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
+              "[Stacked BlobDB] Target size of each blob file.");
+
+DEFINE_bool(
+    blob_db_enable_gc,
+    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
+    "[Stacked BlobDB] Enable BlobDB garbage collection.");
+
+DEFINE_double(
+    blob_db_gc_cutoff,
+    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
+    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
+#endif  // !ROCKSDB_LITE
+
+// Options for integrated BlobDB
+DEFINE_bool(allow_setting_blob_options_dynamically, false,
+            "[Integrated BlobDB] Allow setting blob options dynamically.");
+
+DEFINE_bool(
+    enable_blob_files,
+    ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files,
+    "[Integrated BlobDB] Enable writing large values to separate blob files.");
+
+DEFINE_uint64(min_blob_size,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size,
+              "[Integrated BlobDB] The size of the smallest value to be stored "
+              "separately in a blob file.");
+
+DEFINE_uint64(blob_file_size,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size,
+              "[Integrated BlobDB] The size limit for blob files.");
+
+DEFINE_string(blob_compression_type, "none",
+              "[Integrated BlobDB] The compression algorithm to use for large "
+              "values stored in blob files.");
+
+DEFINE_bool(enable_blob_garbage_collection,
+            ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                .enable_blob_garbage_collection,
+            "[Integrated BlobDB] Enable blob garbage collection.");
+
+DEFINE_double(blob_garbage_collection_age_cutoff,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .blob_garbage_collection_age_cutoff,
+              "[Integrated BlobDB] The cutoff in terms of blob file age for "
+              "garbage collection.");
+
+DEFINE_double(blob_garbage_collection_force_threshold,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .blob_garbage_collection_force_threshold,
+              "[Integrated BlobDB] The threshold for the ratio of garbage in "
+              "the oldest blob files for forcing garbage collection.");
+
+DEFINE_uint64(blob_compaction_readahead_size,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .blob_compaction_readahead_size,
+              "[Integrated BlobDB] Compaction readahead for blob files.");
+
+DEFINE_int32(
+    blob_file_starting_level,
+    ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_starting_level,
+    "[Integrated BlobDB] Enable writing blob files during flushes and "
+    "compactions starting from the specified level.");
+
+DEFINE_bool(use_blob_cache, false, "[Integrated BlobDB] Enable blob cache.");
+
+DEFINE_bool(
+    use_shared_block_and_blob_cache, true,
+    "[Integrated BlobDB] Use a shared backing cache for both block "
+    "cache and blob cache. It only takes effect if use_blob_cache is enabled.");
+
+DEFINE_uint64(
+    blob_cache_size, 2LL * KB * KB * KB,
+    "[Integrated BlobDB] Number of bytes to use as a cache of blobs. It only "
+    "takes effect if the block and blob caches are different "
+    "(use_shared_block_and_blob_cache = false).");
+
+DEFINE_int32(blob_cache_numshardbits, 6,
+             "[Integrated BlobDB] Number of shards for the blob cache is 2 ** "
+             "blob_cache_numshardbits. Negative means use default settings. "
+             "It only takes effect if blob_cache_size is greater than 0, and "
+             "the block and blob caches are different "
+             "(use_shared_block_and_blob_cache = false).");
+
+DEFINE_int32(prepopulate_blob_cache, 0,
+             "[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 0 "
+             "to disable and 1 to insert during flush.");
+
+DEFINE_bool(enable_tiered_storage, false, "Set last_level_temperature");
+
+DEFINE_int64(preclude_last_level_data_seconds, 0,
+             "Preclude data from the last level. Used with tiered storage "
+             "feature to preclude new data from comacting to the last level.");
+
+DEFINE_int64(
+    preserve_internal_time_seconds, 0,
+    "Preserve internal time information which is attached to each SST.");
+
+static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
+
+static bool ValidateInt32Positive(const char* flagname, int32_t value) {
+  if (value < 0) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be >=0\n", flagname,
+            value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(reopen, 10, "Number of times database reopens");
+static const bool FLAGS_reopen_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
+
+DEFINE_double(bloom_bits, 10,
+              "Bloom filter bits per key. "
+              "Negative means use default settings.");
+
+DEFINE_int32(
+    ribbon_starting_level, 999,
+    "Use Bloom filter on levels below specified and Ribbon beginning on level "
+    "specified. Flush is considered level -1. 999 or more -> always Bloom. 0 "
+    "-> Ribbon except Bloom for flush. -1 -> always Ribbon.");
+
+DEFINE_bool(partition_filters, false,
+            "use partitioned filters "
+            "for block-based table");
+
+DEFINE_bool(
+    optimize_filters_for_memory,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory,
+    "Minimize memory footprint of filters");
+
+DEFINE_bool(
+    detect_filter_construct_corruption,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+        .detect_filter_construct_corruption,
+    "Detect corruption during new Bloom Filter and Ribbon Filter construction");
+
+DEFINE_int32(
+    index_type,
+    static_cast<int32_t>(
+        ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_type),
+    "Type of block-based table index (see `enum IndexType` in table.h)");
+
+DEFINE_int32(
+    data_block_index_type,
+    static_cast<int32_t>(
+        ROCKSDB_NAMESPACE::BlockBasedTableOptions().data_block_index_type),
+    "Index type for data blocks (see `enum DataBlockIndexType` in table.h)");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
+DEFINE_string(secondaries_base, "",
+              "Use this path as the base path for secondary instances.");
+
+DEFINE_bool(test_secondary, false,
+            "If true, start an additional secondary instance which can be used "
+            "for verification.");
+
+DEFINE_string(
+    expected_values_dir, "",
+    "Dir where files containing info about the latest/historical values will "
+    "be stored. If provided and non-empty, the DB state will be verified "
+    "against values from these files after recovery. --max_key and "
+    "--column_family must be kept the same across invocations of this program "
+    "that use the same --expected_values_dir. Currently historical values are "
+    "only tracked when --sync_fault_injection is set. See --seed and "
+    "--nooverwritepercent for further requirements.");
+
+DEFINE_bool(verify_checksum, false,
+            "Verify checksum for every block read from storage");
+
+DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
+            "Allow reads to occur via mmap-ing files");
+
+DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
+            "Allow writes to occur via mmap-ing files");
+
+DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
+            "Use O_DIRECT for reading data");
+
+DEFINE_bool(use_direct_io_for_flush_and_compaction,
+            ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
+            "Use O_DIRECT for writing data");
+
+DEFINE_bool(mock_direct_io, false,
+            "Mock direct IO by not using O_DIRECT for direct IO read");
+
+DEFINE_bool(statistics, false, "Create database statistics");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
+              "If nonzero, sync SST file data incrementally after every "
+              "`bytes_per_sync` bytes are written");
+
+DEFINE_uint64(wal_bytes_per_sync,
+              ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
+              "If nonzero, sync WAL file data incrementally after every "
+              "`bytes_per_sync` bytes are written");
+
+DEFINE_int32(kill_random_test, 0,
+             "If non-zero, kill at various points in source code with "
+             "probability 1/this");
+static const bool FLAGS_kill_random_test_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_kill_random_test, &ValidateInt32Positive);
+
+DEFINE_string(kill_exclude_prefixes, "",
+              "If non-empty, kill points with prefix in the list given will be"
+              " skipped. Items are comma-separated.");
+extern std::vector<std::string> rocksdb_kill_exclude_prefixes;
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
+
+DEFINE_uint64(recycle_log_file_num,
+              ROCKSDB_NAMESPACE::Options().recycle_log_file_num,
+              "Number of old WAL files to keep around for later recycling");
+
+DEFINE_int64(target_file_size_base,
+             ROCKSDB_NAMESPACE::Options().target_file_size_base,
+             "Target level-1 file size for compaction");
+
+DEFINE_int32(target_file_size_multiplier, 1,
+             "A multiplier to compute target level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base,
+              ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
+              "Max bytes for level-1");
+
+DEFINE_double(max_bytes_for_level_multiplier, 2,
+              "A multiplier to compute max bytes for level-N (N >= 2)");
+
+DEFINE_int32(range_deletion_width, 10,
+             "The width of the range deletion intervals.");
+
+DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");
+
+DEFINE_bool(rate_limit_bg_reads, false,
+            "Use options.rate_limiter on compaction reads");
+
+DEFINE_bool(rate_limit_user_ops, false,
+            "When true use Env::IO_USER priority level to charge internal rate "
+            "limiter for reads associated with user operations.");
+
+DEFINE_bool(rate_limit_auto_wal_flush, false,
+            "When true use Env::IO_USER priority level to charge internal rate "
+            "limiter for automatic WAL flush (`Options::manual_wal_flush` == "
+            "false) after the user "
+            "write operation.");
+
+DEFINE_uint64(sst_file_manager_bytes_per_sec, 0,
+              "Set `Options::sst_file_manager` to delete at this rate. By "
+              "default the deletion rate is unbounded.");
+
+DEFINE_uint64(sst_file_manager_bytes_per_truncate, 0,
+              "Set `Options::sst_file_manager` to delete in chunks of this "
+              "many bytes. By default whole files will be deleted.");
+
+DEFINE_bool(use_txn, false,
+            "Use TransactionDB. Currently the default write policy is "
+            "TxnDBWritePolicy::WRITE_PREPARED");
+
+DEFINE_uint64(txn_write_policy, 0,
+              "The transaction write policy. Default is "
+              "TxnDBWritePolicy::WRITE_COMMITTED. Note that this should not be "
+              "changed accross crashes.");
+
+DEFINE_bool(unordered_write, false,
+            "Turn on the unordered_write feature. This options is currently "
+            "tested only in combination with use_txn=true and "
+            "txn_write_policy=TxnDBWritePolicy::WRITE_PREPARED.");
+
+DEFINE_int32(backup_one_in, 0,
+             "If non-zero, then CreateNewBackup() will be called once for "
+             "every N operations on average.  0 indicates CreateNewBackup() "
+             "is disabled.");
+
+DEFINE_uint64(backup_max_size, 100 * 1024 * 1024,
+              "If non-zero, skip checking backup/restore when DB size in "
+              "bytes exceeds this setting.");
+
+DEFINE_int32(checkpoint_one_in, 0,
+             "If non-zero, then CreateCheckpoint() will be called once for "
+             "every N operations on average.  0 indicates CreateCheckpoint() "
+             "is disabled.");
+
+DEFINE_int32(ingest_external_file_one_in, 0,
+             "If non-zero, then IngestExternalFile() will be called once for "
+             "every N operations on average.  0 indicates IngestExternalFile() "
+             "is disabled.");
+
+DEFINE_int32(ingest_external_file_width, 100,
+             "The width of the ingested external files.");
+
+DEFINE_int32(compact_files_one_in, 0,
+             "If non-zero, then CompactFiles() will be called once for every N "
+             "operations on average.  0 indicates CompactFiles() is disabled.");
+
+DEFINE_int32(compact_range_one_in, 0,
+             "If non-zero, then CompactRange() will be called once for every N "
+             "operations on average.  0 indicates CompactRange() is disabled.");
+
+DEFINE_int32(mark_for_compaction_one_file_in, 0,
+             "A `TablePropertiesCollectorFactory` will be registered, which "
+             "creates a `TablePropertiesCollector` with `NeedCompact()` "
+             "returning true once for every N files on average. 0 or negative "
+             "mean `NeedCompact()` always returns false.");
+
+DEFINE_int32(flush_one_in, 0,
+             "If non-zero, then Flush() will be called once for every N ops "
+             "on average.  0 indicates calls to Flush() are disabled.");
+
+DEFINE_int32(pause_background_one_in, 0,
+             "If non-zero, then PauseBackgroundWork()+Continue will be called "
+             "once for every N ops on average.  0 disables.");
+
+DEFINE_int32(compact_range_width, 10000,
+             "The width of the ranges passed to CompactRange().");
+
+DEFINE_int32(acquire_snapshot_one_in, 0,
+             "If non-zero, then acquires a snapshot once every N operations on "
+             "average.");
+
+DEFINE_bool(compare_full_db_state_snapshot, false,
+            "If set we compare state of entire db (in one of the threads) with"
+            "each snapshot.");
+
+DEFINE_uint64(snapshot_hold_ops, 0,
+              "If non-zero, then releases snapshots N operations after they're "
+              "acquired.");
+
+DEFINE_bool(long_running_snapshots, false,
+            "If set, hold on some some snapshots for much longer time.");
+
+DEFINE_bool(use_multiget, false,
+            "If set, use the batched MultiGet API for reads");
+
+static bool ValidateInt32Percent(const char* flagname, int32_t value) {
+  if (value < 0 || value > 100) {
+    fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n", flagname,
+            value);
+    return false;
+  }
+  return true;
+}
+
+DEFINE_int32(readpercent, 10,
+             "Ratio of reads to total workload (expressed as a percentage)");
+static const bool FLAGS_readpercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
+
+DEFINE_int32(prefixpercent, 20,
+             "Ratio of prefix iterators to total workload (expressed as a"
+             " percentage)");
+static const bool FLAGS_prefixpercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
+
+DEFINE_int32(writepercent, 45,
+             "Ratio of writes to total workload (expressed as a percentage)");
+static const bool FLAGS_writepercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
+
+DEFINE_int32(delpercent, 15,
+             "Ratio of deletes to total workload (expressed as a percentage)");
+static const bool FLAGS_delpercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
+
+DEFINE_int32(delrangepercent, 0,
+             "Ratio of range deletions to total workload (expressed as a "
+             "percentage). Cannot be used with test_batches_snapshots");
+static const bool FLAGS_delrangepercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_delrangepercent, &ValidateInt32Percent);
+
+DEFINE_int32(nooverwritepercent, 60,
+             "Ratio of keys without overwrite to total workload (expressed as "
+             "a percentage). When --expected_values_dir is nonempty, must "
+             "keep this value constant across invocations.");
+static const bool FLAGS_nooverwritepercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_nooverwritepercent, &ValidateInt32Percent);
+
+DEFINE_int32(iterpercent, 10,
+             "Ratio of iterations to total workload"
+             " (expressed as a percentage)");
+static const bool FLAGS_iterpercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
+
+DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
+static const bool FLAGS_num_iterations_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
+
+DEFINE_int32(
+    customopspercent, 0,
+    "Ratio of custom operations to total workload (expressed as a percentage)");
+
+DEFINE_string(compression_type, "snappy",
+              "Algorithm to use to compress the database");
+
+DEFINE_int32(compression_max_dict_bytes, 0,
+             "Maximum size of dictionary used to prime the compression "
+             "library.");
+
+DEFINE_int32(compression_zstd_max_train_bytes, 0,
+             "Maximum size of training data passed to zstd's dictionary "
+             "trainer.");
+
+DEFINE_int32(compression_parallel_threads, 1,
+             "Number of threads for parallel compression.");
+
+DEFINE_uint64(compression_max_dict_buffer_bytes, 0,
+              "Buffering limit for SST file data to sample for dictionary "
+              "compression.");
+
+DEFINE_bool(
+    compression_use_zstd_dict_trainer, true,
+    "Use zstd's trainer to generate dictionary. If the options is false, "
+    "zstd's finalizeDictionary() API is used to generate dictionary. "
+    "ZSTD 1.4.5+ is required. If ZSTD 1.4.5+ is not linked with the binary, "
+    "this flag will have the default value true.");
+
+DEFINE_string(bottommost_compression_type, "disable",
+              "Algorithm to use to compress bottommost level of the database. "
+              "\"disable\" means disabling the feature");
+
+DEFINE_string(checksum_type, "kCRC32c", "Algorithm to use to checksum blocks");
+
+DEFINE_string(env_uri, "",
+              "URI for env lookup. Mutually exclusive with --fs_uri");
+
+DEFINE_string(fs_uri, "",
+              "URI for registry Filesystem lookup. Mutually exclusive"
+              " with --env_uri."
+              " Creates a default environment with the specified filesystem.");
+
+DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
+static const bool FLAGS_ops_per_thread_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
+
+DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
+static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_log2_keys_per_lock, &ValidateUint32Range);
+
+DEFINE_uint64(max_manifest_file_size, 16384, "Maximum size of a MANIFEST file");
+
+DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable");
+
+DEFINE_string(memtablerep, "skip_list", "");
+
+inline static bool ValidatePrefixSize(const char* flagname, int32_t value) {
+  if (value < -1 || value > 8) {
+    fprintf(stderr, "Invalid value for --%s: %d. -1 <= PrefixSize <= 8\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(prefix_size, 7,
+             "Control the prefix size for HashSkipListRep. "
+             "-1 is disabled.");
+static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+DEFINE_bool(use_merge, false,
+            "On true, replaces all writes with a Merge "
+            "that behaves like a Put");
+
+DEFINE_uint32(use_put_entity_one_in, 0,
+              "If greater than zero, PutEntity will be used once per every N "
+              "write ops on average.");
+
+DEFINE_bool(use_full_merge_v1, false,
+            "On true, use a merge operator that implement the deprecated "
+            "version of FullMerge");
+
+DEFINE_int32(sync_wal_one_in, 0,
+             "If non-zero, then SyncWAL() will be called once for every N ops "
+             "on average. 0 indicates that calls to SyncWAL() are disabled.");
+
+DEFINE_bool(avoid_unnecessary_blocking_io,
+            ROCKSDB_NAMESPACE::Options().avoid_unnecessary_blocking_io,
+            "If true, some expensive cleaning up operations will be moved from "
+            "user reads to high-pri background threads.");
+
+DEFINE_bool(write_dbid_to_manifest,
+            ROCKSDB_NAMESPACE::Options().write_dbid_to_manifest,
+            "Write DB_ID to manifest");
+
+DEFINE_bool(avoid_flush_during_recovery,
+            ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery,
+            "Avoid flush during recovery");
+
+DEFINE_uint64(max_write_batch_group_size_bytes,
+              ROCKSDB_NAMESPACE::Options().max_write_batch_group_size_bytes,
+              "Max write batch group size");
+
+DEFINE_bool(level_compaction_dynamic_level_bytes,
+            ROCKSDB_NAMESPACE::Options().level_compaction_dynamic_level_bytes,
+            "Use dynamic level");
+
+DEFINE_int32(verify_checksum_one_in, 0,
+             "If non-zero, then DB::VerifyChecksum() will be called to do"
+             " checksum verification of all the files in the database once for"
+             " every N ops on average. 0 indicates that calls to"
+             " VerifyChecksum() are disabled.");
+DEFINE_int32(verify_db_one_in, 0,
+             "If non-zero, call VerifyDb() once for every N ops. 0 indicates "
+             "that VerifyDb() will not be called in OperateDb(). Note that "
+             "enabling this can slow down tests.");
+
+DEFINE_int32(continuous_verification_interval, 1000,
+             "While test is running, verify db every N milliseconds. 0 "
+             "disables continuous verification.");
+
+DEFINE_int32(approximate_size_one_in, 64,
+             "If non-zero, DB::GetApproximateSizes() will be called against"
+             " random key ranges.");
+
+DEFINE_int32(read_fault_one_in, 1000,
+             "On non-zero, enables fault injection on read");
+
+DEFINE_int32(get_property_one_in, 1000,
+             "If non-zero, then DB::GetProperty() will be called to get various"
+             " properties for every N ops on average. 0 indicates that"
+             " GetProperty() will be not be called.");
+
+DEFINE_bool(sync_fault_injection, false,
+            "If true, FaultInjectionTestFS will be used for write operations, "
+            "and unsynced data in DB will lost after crash. In such a case we "
+            "track DB changes in a trace file (\"*.trace\") in "
+            "--expected_values_dir for verifying there are no holes in the "
+            "recovered data.");
+
+DEFINE_bool(best_efforts_recovery, false,
+            "If true, use best efforts recovery.");
+DEFINE_bool(skip_verifydb, false,
+            "If true, skip VerifyDb() calls and Get()/Iterator verifications"
+            "against expected state.");
+
+DEFINE_bool(enable_compaction_filter, false,
+            "If true, configures a compaction filter that returns a kRemove "
+            "decision for deleted keys.");
+
+DEFINE_bool(paranoid_file_checks, true,
+            "After writing every SST file, reopen it and read all the keys "
+            "and validate checksums");
+
+DEFINE_bool(fail_if_options_file_error, false,
+            "Fail operations that fail to detect or properly persist options "
+            "file.");
+
+DEFINE_uint64(batch_protection_bytes_per_key, 0,
+              "If nonzero, enables integrity protection in `WriteBatch` at the "
+              "specified number of bytes per key. Currently the only supported "
+              "nonzero value is eight.");
+
+DEFINE_uint32(
+    memtable_protection_bytes_per_key, 0,
+    "If nonzero, enables integrity protection in memtable entries at the "
+    "specified number of bytes per key. Currently the supported "
+    "nonzero values are 1, 2, 4 and 8.");
+
+DEFINE_string(file_checksum_impl, "none",
+              "Name of an implementation for file_checksum_gen_factory, or "
+              "\"none\" for null.");
+
+DEFINE_int32(write_fault_one_in, 0,
+             "On non-zero, enables fault injection on write");
+
+DEFINE_uint64(user_timestamp_size, 0,
+              "Number of bytes for a user-defined timestamp. Currently, only "
+              "8-byte is supported");
+
+DEFINE_int32(open_metadata_write_fault_one_in, 0,
+             "On non-zero, enables fault injection on file metadata write "
+             "during DB reopen.");
+
+#ifndef ROCKSDB_LITE
+DEFINE_string(secondary_cache_uri, "",
+              "Full URI for creating a customized secondary cache object");
+DEFINE_int32(secondary_cache_fault_one_in, 0,
+             "On non-zero, enables fault injection in secondary cache inserts"
+             " and lookups");
+#endif  // ROCKSDB_LITE
+DEFINE_int32(open_write_fault_one_in, 0,
+             "On non-zero, enables fault injection on file writes "
+             "during DB reopen.");
+DEFINE_int32(open_read_fault_one_in, 0,
+             "On non-zero, enables fault injection on file reads "
+             "during DB reopen.");
+DEFINE_int32(injest_error_severity, 1,
+             "The severity of the injested IO Error. 1 is soft error (e.g. "
+             "retryable error), 2 is fatal error, and the default is "
+             "retryable error.");
+DEFINE_int32(prepopulate_block_cache,
+             static_cast<int32_t>(ROCKSDB_NAMESPACE::BlockBasedTableOptions::
+                                      PrepopulateBlockCache::kDisable),
+             "Options related to cache warming (see `enum "
+             "PrepopulateBlockCache` in table.h)");
+
+DEFINE_bool(two_write_queues, false,
+            "Set to true to enable two write queues. Default: false");
+#ifndef ROCKSDB_LITE
+
+DEFINE_bool(use_only_the_last_commit_time_batch_for_recovery, false,
+            "If true, the commit-time write batch will not be immediately "
+            "inserted into the memtables. Default: false");
+
+DEFINE_uint64(
+    wp_snapshot_cache_bits, 7ull,
+    "Number of bits to represent write-prepared transaction db's snapshot "
+    "cache. Default: 7 (128 entries)");
+
+DEFINE_uint64(wp_commit_cache_bits, 23ull,
+              "Number of bits to represent write-prepared transaction db's "
+              "commit cache. Default: 23 (8M entries)");
+#endif  // !ROCKSDB_LITE
+
+DEFINE_bool(adaptive_readahead, false,
+            "Carry forward internal auto readahead size from one file to next "
+            "file at each level during iteration");
+DEFINE_bool(
+    async_io, false,
+    "Does asynchronous prefetching when internal auto readahead is enabled");
+
+DEFINE_string(wal_compression, "none",
+              "Algorithm to use for WAL compression. none to disable.");
+
+DEFINE_bool(
+    verify_sst_unique_id_in_manifest, false,
+    "Enable DB options `verify_sst_unique_id_in_manifest`, if true, during "
+    "DB-open try verifying the SST unique id between MANIFEST and SST "
+    "properties.");
+
+DEFINE_int32(
+    create_timestamped_snapshot_one_in, 0,
+    "On non-zero, create timestamped snapshots upon transaction commits.");
+
+DEFINE_bool(allow_data_in_errors,
+            ROCKSDB_NAMESPACE::Options().allow_data_in_errors,
+            "If true, allow logging data, e.g. key, value in LOG files.");
+
+DEFINE_int32(verify_iterator_with_expected_state_one_in, 0,
+             "If non-zero, when TestIterate() is to be called, there is a "
+             "1/verify_iterator_with_expected_state_one_in "
+             "chance that the iterator is verified against the expected state "
+             "file, instead of comparing keys between two iterators.");
+
+DEFINE_uint64(readahead_size, 0, "Iterator readahead size");
+DEFINE_uint64(initial_auto_readahead_size, 0,
+              "Initial auto readahead size for prefetching during Iteration");
+DEFINE_uint64(max_auto_readahead_size, 0,
+              "Max auto readahead size for prefetching during Iteration");
+DEFINE_uint64(
+    num_file_reads_for_auto_readahead, 0,
+    "Num of sequential reads to enable auto prefetching during Iteration");
+
+DEFINE_bool(
+    preserve_unverified_changes, false,
+    "DB files of the current run will all be preserved in `FLAGS_db`. DB files "
+    "from the last run will be preserved in `FLAGS_db/unverified` until the "
+    "first verification succeeds. Expected state files from the last run will "
+    "be preserved similarly under `FLAGS_expected_values_dir/unverified` when "
+    "`--expected_values_dir` is nonempty.");
+
+DEFINE_uint64(stats_dump_period_sec,
+              ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
+              "Gap between printing stats to log in seconds");
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_listener.cc b/src/rocksdb/db_stress_tool/db_stress_listener.cc
new file mode 100644
index 000000000..578f21c41
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_listener.cc
@@ -0,0 +1,191 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db_stress_tool/db_stress_listener.h"
+
+#include <cstdint>
+
+#include "file/file_util.h"
+#include "rocksdb/file_system.h"
+#include "util/coding_lean.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef GFLAGS
+#ifndef ROCKSDB_LITE
+
+// TODO: consider using expected_values_dir instead, but this is more
+// convenient for now.
+UniqueIdVerifier::UniqueIdVerifier(const std::string& db_name, Env* env)
+    : path_(db_name + "/.unique_ids") {
+  // We expect such a small number of files generated during this test
+  // (thousands?), checking full 192-bit IDs for uniqueness is a very
+  // weak check. For a stronger check, we pick a specific 64-bit
+  // subsequence from the ID to check for uniqueness. All bits of the
+  // ID should be high quality, and 64 bits should be unique with
+  // very good probability for the quantities in this test.
+  offset_ = Random::GetTLSInstance()->Uniform(17);  // 0 to 16
+
+  const std::shared_ptr<FileSystem> fs = env->GetFileSystem();
+  IOOptions opts;
+
+  Status st = fs->CreateDirIfMissing(db_name, opts, nullptr);
+  if (!st.ok()) {
+    fprintf(stderr, "Failed to create directory %s: %s\n", db_name.c_str(),
+            st.ToString().c_str());
+    exit(1);
+  }
+
+  // Avoid relying on ReopenWritableFile which is not supported by all
+  // file systems. Create a new file and copy the old file contents to it.
+  std::string tmp_path = path_ + ".tmp";
+  st = fs->FileExists(tmp_path, opts, /*dbg*/ nullptr);
+  if (st.IsNotFound()) {
+    st = fs->RenameFile(path_, tmp_path, opts, /*dbg*/ nullptr);
+    // Either it should succeed or fail because src path doesn't exist
+    assert(st.ok() || st.IsPathNotFound());
+  } else {
+    // If path_ and tmp_path both exist, retain tmp_path as its
+    // guaranteed to be more complete. The order of operations are -
+    // 1. Rename path_ to tmp_path
+    // 2. Parse tmp_path contents
+    // 3. Create path_
+    // 4. Copy tmp_path contents to path_
+    // 5. Delete tmp_path
+    st = fs->DeleteFile(path_, opts, /*dbg*/ nullptr);
+    assert(st.ok() || st.IsPathNotFound());
+  }
+
+  uint64_t size = 0;
+  {
+    std::unique_ptr<FSSequentialFile> reader;
+    Status s = fs->NewSequentialFile(tmp_path, FileOptions(), &reader,
+                                     /*dbg*/ nullptr);
+    if (s.ok()) {
+      // Load from file
+      std::string id(24U, '\0');
+      Slice result;
+      for (;;) {
+        s = reader->Read(id.size(), opts, &result, &id[0], /*dbg*/ nullptr);
+        if (!s.ok()) {
+          fprintf(stderr, "Error reading unique id file: %s\n",
+                  s.ToString().c_str());
+          assert(false);
+        }
+        if (result.size() < id.size()) {
+          // EOF
+          if (result.size() != 0) {
+            // Corrupt file. Not a DB bug but could happen if OS doesn't provide
+            // good guarantees on process crash.
+            fprintf(stdout, "Warning: clearing corrupt unique id file\n");
+            id_set_.clear();
+            reader.reset();
+            s = fs->DeleteFile(tmp_path, opts, /*dbg*/ nullptr);
+            assert(s.ok());
+            size = 0;
+          }
+          break;
+        }
+        size += 24U;
+        VerifyNoWrite(id);
+      }
+    } else {
+      // Newly created is ok.
+      // But FileSystem doesn't tell us whether non-existence was the cause of
+      // the failure. (Issue #9021)
+      Status s2 = fs->FileExists(tmp_path, opts, /*dbg*/ nullptr);
+      if (!s2.IsNotFound()) {
+        fprintf(stderr, "Error opening unique id file: %s\n",
+                s.ToString().c_str());
+        assert(false);
+      }
+      size = 0;
+    }
+  }
+  fprintf(stdout, "(Re-)verified %zu unique IDs\n", id_set_.size());
+
+  std::unique_ptr<FSWritableFile> file_writer;
+  st = fs->NewWritableFile(path_, FileOptions(), &file_writer, /*dbg*/ nullptr);
+  if (!st.ok()) {
+    fprintf(stderr, "Error creating the unique ids file: %s\n",
+            st.ToString().c_str());
+    assert(false);
+  }
+  data_file_writer_.reset(
+      new WritableFileWriter(std::move(file_writer), path_, FileOptions()));
+
+  if (size > 0) {
+    st = CopyFile(fs.get(), tmp_path, data_file_writer_, size,
+                  /*use_fsync*/ true, /*io_tracer*/ nullptr,
+                  /*temparature*/ Temperature::kHot);
+    if (!st.ok()) {
+      fprintf(stderr, "Error copying contents of old unique id file: %s\n",
+              st.ToString().c_str());
+      assert(false);
+    }
+  }
+  st = fs->DeleteFile(tmp_path, opts, /*dbg*/ nullptr);
+  assert(st.ok() || st.IsPathNotFound());
+}
+
+UniqueIdVerifier::~UniqueIdVerifier() {
+  IOStatus s = data_file_writer_->Close();
+  assert(s.ok());
+}
+
+void UniqueIdVerifier::VerifyNoWrite(const std::string& id) {
+  assert(id.size() == 24);
+  bool is_new = id_set_.insert(DecodeFixed64(&id[offset_])).second;
+  if (!is_new) {
+    fprintf(stderr,
+            "Duplicate partial unique ID found (offset=%zu, count=%zu)\n",
+            offset_, id_set_.size());
+    assert(false);
+  }
+}
+
+void UniqueIdVerifier::Verify(const std::string& id) {
+  assert(id.size() == 24);
+  std::lock_guard<std::mutex> lock(mutex_);
+  // If we accumulate more than ~4 million IDs, there would be > 1 in 1M
+  // natural chance of collision. Thus, simply stop checking at that point.
+  if (id_set_.size() >= 4294967) {
+    return;
+  }
+  IOStatus s = data_file_writer_->Append(Slice(id));
+  if (!s.ok()) {
+    fprintf(stderr, "Error writing to unique id file: %s\n",
+            s.ToString().c_str());
+    assert(false);
+  }
+  s = data_file_writer_->Flush();
+  if (!s.ok()) {
+    fprintf(stderr, "Error flushing unique id file: %s\n",
+            s.ToString().c_str());
+    assert(false);
+  }
+  VerifyNoWrite(id);
+}
+
+void DbStressListener::VerifyTableFileUniqueId(
+    const TableProperties& new_file_properties, const std::string& file_path) {
+  // Verify unique ID
+  std::string id;
+  // Unit tests verify that GetUniqueIdFromTableProperties returns just a
+  // substring of this, and we're only going to pull out 64 bits, so using
+  // GetExtendedUniqueIdFromTableProperties is arguably stronger testing here.
+  Status s = GetExtendedUniqueIdFromTableProperties(new_file_properties, &id);
+  if (!s.ok()) {
+    fprintf(stderr, "Error getting SST unique id for %s: %s\n",
+            file_path.c_str(), s.ToString().c_str());
+    assert(false);
+  }
+  unique_ids_.Verify(id);
+}
+
+#endif  // !ROCKSDB_LITE
+#endif  // GFLAGS
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db_stress_tool/db_stress_listener.h b/src/rocksdb/db_stress_tool/db_stress_listener.h
new file mode 100644
index 000000000..faced3172
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_listener.h
@@ -0,0 +1,271 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef GFLAGS
+#pragma once
+
+#include <mutex>
+#include <unordered_set>
+
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/unique_id.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+
+DECLARE_int32(compact_files_one_in);
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+// Verify across process executions that all seen IDs are unique
+class UniqueIdVerifier {
+ public:
+  explicit UniqueIdVerifier(const std::string& db_name, Env* env);
+  ~UniqueIdVerifier();
+
+  void Verify(const std::string& id);
+
+ private:
+  void VerifyNoWrite(const std::string& id);
+
+ private:
+  std::mutex mutex_;
+  // IDs persisted to a hidden file inside DB dir
+  std::string path_;
+  std::unique_ptr<WritableFileWriter> data_file_writer_;
+  // Starting byte for which 8 bytes to check in memory within 24 byte ID
+  size_t offset_;
+  // Working copy of the set of 8 byte pieces
+  std::unordered_set<uint64_t> id_set_;
+};
+
+class DbStressListener : public EventListener {
+ public:
+  DbStressListener(const std::string& db_name,
+                   const std::vector<DbPath>& db_paths,
+                   const std::vector<ColumnFamilyDescriptor>& column_families,
+                   Env* env)
+      : db_name_(db_name),
+        db_paths_(db_paths),
+        column_families_(column_families),
+        num_pending_file_creations_(0),
+        unique_ids_(db_name, env) {}
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "DBStressListener"; }
+
+  ~DbStressListener() override { assert(num_pending_file_creations_ == 0); }
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+    assert(IsValidColumnFamilyName(info.cf_name));
+    VerifyFilePath(info.file_path);
+    // pretending doing some work here
+    RandomSleep();
+  }
+
+  void OnFlushBegin(DB* /*db*/,
+                    const FlushJobInfo& /*flush_job_info*/) override {
+    RandomSleep();
+  }
+
+  void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) override {
+    RandomSleep();
+  }
+
+  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) override {
+    RandomSleep();
+  }
+
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    assert(IsValidColumnFamilyName(ci.cf_name));
+    assert(ci.input_files.size() + ci.output_files.size() > 0U);
+    for (const auto& file_path : ci.input_files) {
+      VerifyFilePath(file_path);
+    }
+    for (const auto& file_path : ci.output_files) {
+      VerifyFilePath(file_path);
+    }
+    // pretending doing some work here
+    RandomSleep();
+  }
+
+  void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /*info*/) override {
+    ++num_pending_file_creations_;
+  }
+
+  void OnTableFileCreated(const TableFileCreationInfo& info) override {
+    assert(info.db_name == db_name_);
+    assert(IsValidColumnFamilyName(info.cf_name));
+    assert(info.job_id > 0 || FLAGS_compact_files_one_in > 0);
+    if (info.status.ok()) {
+      assert(info.file_size > 0);
+      VerifyFilePath(info.file_path);
+      assert(info.table_properties.data_size > 0 ||
+             info.table_properties.num_range_deletions > 0);
+      assert(info.table_properties.raw_key_size > 0);
+      assert(info.table_properties.num_entries > 0);
+      VerifyTableFileUniqueId(info.table_properties, info.file_path);
+    }
+    --num_pending_file_creations_;
+  }
+
+  void OnMemTableSealed(const MemTableInfo& /*info*/) override {
+    RandomSleep();
+  }
+
+  void OnColumnFamilyHandleDeletionStarted(
+      ColumnFamilyHandle* /*handle*/) override {
+    RandomSleep();
+  }
+
+  void OnExternalFileIngested(DB* /*db*/,
+                              const ExternalFileIngestionInfo& info) override {
+    RandomSleep();
+    // Here we assume that each generated external file is ingested
+    // exactly once (or thrown away in case of crash)
+    VerifyTableFileUniqueId(info.table_properties, info.internal_file_path);
+  }
+
+  void OnBackgroundError(BackgroundErrorReason /* reason */,
+                         Status* /* bg_error */) override {
+    RandomSleep();
+  }
+
+  void OnStallConditionsChanged(const WriteStallInfo& /*info*/) override {
+    RandomSleep();
+  }
+
+  void OnFileReadFinish(const FileOperationInfo& info) override {
+    // Even empty callback is valuable because sometimes some locks are
+    // released in order to make the callback.
+
+    // Sleep carefully here as it is a frequent operation and we don't want
+    // to slow down the tests. We always sleep when the read is large.
+    // When read is small, sleep in a small chance.
+    size_t length_read = info.length;
+    if (length_read >= 1000000 || Random::GetTLSInstance()->OneIn(1000)) {
+      RandomSleep();
+    }
+  }
+
+  void OnFileWriteFinish(const FileOperationInfo& info) override {
+    // Even empty callback is valuable because sometimes some locks are
+    // released in order to make the callback.
+
+    // Sleep carefully here as it is a frequent operation and we don't want
+    // to slow down the tests. When the write is large, always sleep.
+    // Otherwise, sleep in a relatively small chance.
+    size_t length_write = info.length;
+    if (length_write >= 1000000 || Random::GetTLSInstance()->OneIn(64)) {
+      RandomSleep();
+    }
+  }
+
+  bool ShouldBeNotifiedOnFileIO() override {
+    RandomSleep();
+    return static_cast<bool>(Random::GetTLSInstance()->OneIn(1));
+  }
+
+  void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
+                            Status /* bg_error */,
+                            bool* /* auto_recovery */) override {
+    RandomSleep();
+  }
+
+  void OnErrorRecoveryCompleted(Status /* old_bg_error */) override {
+    RandomSleep();
+  }
+
+ protected:
+  bool IsValidColumnFamilyName(const std::string& cf_name) const {
+    if (cf_name == kDefaultColumnFamilyName) {
+      return true;
+    }
+    // The column family names in the stress tests are numbers.
+    for (size_t i = 0; i < cf_name.size(); ++i) {
+      if (cf_name[i] < '0' || cf_name[i] > '9') {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  void VerifyFileDir(const std::string& file_dir) {
+#ifndef NDEBUG
+    if (db_name_ == file_dir) {
+      return;
+    }
+    for (const auto& db_path : db_paths_) {
+      if (db_path.path == file_dir) {
+        return;
+      }
+    }
+    for (auto& cf : column_families_) {
+      for (const auto& cf_path : cf.options.cf_paths) {
+        if (cf_path.path == file_dir) {
+          return;
+        }
+      }
+    }
+    assert(false);
+#else
+    (void)file_dir;
+#endif  // !NDEBUG
+  }
+
+  void VerifyFileName(const std::string& file_name) {
+#ifndef NDEBUG
+    uint64_t file_number;
+    FileType file_type;
+    bool result = ParseFileName(file_name, &file_number, &file_type);
+    assert(result);
+    assert(file_type == kTableFile);
+#else
+    (void)file_name;
+#endif  // !NDEBUG
+  }
+
+  void VerifyFilePath(const std::string& file_path) {
+#ifndef NDEBUG
+    size_t pos = file_path.find_last_of("/");
+    if (pos == std::string::npos) {
+      VerifyFileName(file_path);
+    } else {
+      if (pos > 0) {
+        VerifyFileDir(file_path.substr(0, pos));
+      }
+      VerifyFileName(file_path.substr(pos));
+    }
+#else
+    (void)file_path;
+#endif  // !NDEBUG
+  }
+
+  // Unique id is verified using the TableProperties. file_path is only used
+  // for reporting.
+  void VerifyTableFileUniqueId(const TableProperties& new_file_properties,
+                               const std::string& file_path);
+
+  void RandomSleep() {
+    std::this_thread::sleep_for(
+        std::chrono::microseconds(Random::GetTLSInstance()->Uniform(5000)));
+  }
+
+ private:
+  std::string db_name_;
+  std::vector<DbPath> db_paths_;
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  std::atomic<int> num_pending_file_creations_;
+  UniqueIdVerifier unique_ids_;
+};
+#endif  // !ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_shared_state.cc b/src/rocksdb/db_stress_tool/db_stress_shared_state.cc
new file mode 100644
index 000000000..a27f6ac73
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_shared_state.cc
@@ -0,0 +1,17 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_shared_state.h"
+
+namespace ROCKSDB_NAMESPACE {
+thread_local bool SharedState::ignore_read_error;
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_shared_state.h b/src/rocksdb/db_stress_tool/db_stress_shared_state.h
new file mode 100644
index 000000000..5565c6221
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_shared_state.h
@@ -0,0 +1,427 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors
+
+#ifdef GFLAGS
+#pragma once
+
+#include "db_stress_tool/db_stress_stat.h"
+#include "db_stress_tool/expected_state.h"
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+#include "test_util/sync_point.h"
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+#include "util/gflags_compat.h"
+
+DECLARE_uint64(seed);
+DECLARE_int64(max_key);
+DECLARE_uint64(log2_keys_per_lock);
+DECLARE_int32(threads);
+DECLARE_int32(column_families);
+DECLARE_int32(nooverwritepercent);
+DECLARE_string(expected_values_dir);
+DECLARE_int32(clear_column_family_one_in);
+DECLARE_bool(test_batches_snapshots);
+DECLARE_int32(compaction_thread_pool_adjust_interval);
+DECLARE_int32(continuous_verification_interval);
+DECLARE_int32(read_fault_one_in);
+DECLARE_int32(write_fault_one_in);
+DECLARE_int32(open_metadata_write_fault_one_in);
+DECLARE_int32(open_write_fault_one_in);
+DECLARE_int32(open_read_fault_one_in);
+
+DECLARE_int32(injest_error_severity);
+
+namespace ROCKSDB_NAMESPACE {
+class StressTest;
+
+// State shared by all concurrent executions of the same benchmark.
+class SharedState {
+ public:
+  // indicates a key may have any value (or not be present) as an operation on
+  // it is incomplete.
+  static constexpr uint32_t UNKNOWN_SENTINEL = 0xfffffffe;
+  // indicates a key should definitely be deleted
+  static constexpr uint32_t DELETION_SENTINEL = 0xffffffff;
+
+  // Errors when reading filter blocks are ignored, so we use a thread
+  // local variable updated via sync points to keep track of errors injected
+  // while reading filter blocks in order to ignore the Get/MultiGet result
+  // for those calls
+  static thread_local bool ignore_read_error;
+
+  SharedState(Env* /*env*/, StressTest* stress_test)
+      : cv_(&mu_),
+        seed_(static_cast<uint32_t>(FLAGS_seed)),
+        max_key_(FLAGS_max_key),
+        log2_keys_per_lock_(static_cast<uint32_t>(FLAGS_log2_keys_per_lock)),
+        num_threads_(0),
+        num_initialized_(0),
+        num_populated_(0),
+        vote_reopen_(0),
+        num_done_(0),
+        start_(false),
+        start_verify_(false),
+        num_bg_threads_(0),
+        should_stop_bg_thread_(false),
+        bg_thread_finished_(0),
+        stress_test_(stress_test),
+        verification_failure_(false),
+        should_stop_test_(false),
+        no_overwrite_ids_(GenerateNoOverwriteIds()),
+        expected_state_manager_(nullptr),
+        printing_verification_results_(false),
+        start_timestamp_(Env::Default()->NowNanos()) {
+    Status status;
+    // TODO: We should introduce a way to explicitly disable verification
+    // during shutdown. When that is disabled and FLAGS_expected_values_dir
+    // is empty (disabling verification at startup), we can skip tracking
+    // expected state. Only then should we permit bypassing the below feature
+    // compatibility checks.
+    if (!FLAGS_expected_values_dir.empty()) {
+      if (!std::atomic<uint32_t>{}.is_lock_free()) {
+        status = Status::InvalidArgument(
+            "Cannot use --expected_values_dir on platforms without lock-free "
+            "std::atomic<uint32_t>");
+      }
+      if (status.ok() && FLAGS_clear_column_family_one_in > 0) {
+        status = Status::InvalidArgument(
+            "Cannot use --expected_values_dir on when "
+            "--clear_column_family_one_in is greater than zero.");
+      }
+    }
+    if (status.ok()) {
+      if (FLAGS_expected_values_dir.empty()) {
+        expected_state_manager_.reset(
+            new AnonExpectedStateManager(FLAGS_max_key, FLAGS_column_families));
+      } else {
+        expected_state_manager_.reset(new FileExpectedStateManager(
+            FLAGS_max_key, FLAGS_column_families, FLAGS_expected_values_dir));
+      }
+      status = expected_state_manager_->Open();
+    }
+    if (!status.ok()) {
+      fprintf(stderr, "Failed setting up expected state with error: %s\n",
+              status.ToString().c_str());
+      exit(1);
+    }
+
+    if (FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
+      return;
+    }
+
+    long num_locks = static_cast<long>(max_key_ >> log2_keys_per_lock_);
+    if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
+      num_locks++;
+    }
+    fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families);
+    key_locks_.resize(FLAGS_column_families);
+
+    for (int i = 0; i < FLAGS_column_families; ++i) {
+      key_locks_[i].reset(new port::Mutex[num_locks]);
+    }
+    if (FLAGS_read_fault_one_in) {
+#ifdef NDEBUG
+      // Unsupported in release mode because it relies on
+      // `IGNORE_STATUS_IF_ERROR` to distinguish faults not expected to lead to
+      // failure.
+      fprintf(stderr,
+              "Cannot set nonzero value for --read_fault_one_in in "
+              "release mode.");
+      exit(1);
+#else   // NDEBUG
+      SyncPoint::GetInstance()->SetCallBack("FaultInjectionIgnoreError",
+                                            IgnoreReadErrorCallback);
+      SyncPoint::GetInstance()->EnableProcessing();
+#endif  // NDEBUG
+    }
+  }
+
+  ~SharedState() {
+#ifndef NDEBUG
+    if (FLAGS_read_fault_one_in) {
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+      SyncPoint::GetInstance()->DisableProcessing();
+    }
+#endif
+  }
+
+  port::Mutex* GetMutex() { return &mu_; }
+
+  port::CondVar* GetCondVar() { return &cv_; }
+
+  StressTest* GetStressTest() const { return stress_test_; }
+
+  int64_t GetMaxKey() const { return max_key_; }
+
+  uint32_t GetNumThreads() const { return num_threads_; }
+
+  void SetThreads(int num_threads) { num_threads_ = num_threads; }
+
+  void IncInitialized() { num_initialized_++; }
+
+  void IncOperated() { num_populated_++; }
+
+  void IncDone() { num_done_++; }
+
+  void IncVotedReopen() { vote_reopen_ = (vote_reopen_ + 1) % num_threads_; }
+
+  bool AllInitialized() const { return num_initialized_ >= num_threads_; }
+
+  bool AllOperated() const { return num_populated_ >= num_threads_; }
+
+  bool AllDone() const { return num_done_ >= num_threads_; }
+
+  bool AllVotedReopen() { return (vote_reopen_ == 0); }
+
+  void SetStart() { start_ = true; }
+
+  void SetStartVerify() { start_verify_ = true; }
+
+  bool Started() const { return start_; }
+
+  bool VerifyStarted() const { return start_verify_; }
+
+  void SetVerificationFailure() { verification_failure_.store(true); }
+
+  bool HasVerificationFailedYet() const { return verification_failure_.load(); }
+
+  void SetShouldStopTest() { should_stop_test_.store(true); }
+
+  bool ShouldStopTest() const { return should_stop_test_.load(); }
+
+  // Returns a lock covering `key` in `cf`.
+  port::Mutex* GetMutexForKey(int cf, int64_t key) {
+    return &key_locks_[cf][key >> log2_keys_per_lock_];
+  }
+
+  // Acquires locks for all keys in `cf`.
+  void LockColumnFamily(int cf) {
+    for (int i = 0; i < max_key_ >> log2_keys_per_lock_; ++i) {
+      key_locks_[cf][i].Lock();
+    }
+  }
+
+  // Releases locks for all keys in `cf`.
+  void UnlockColumnFamily(int cf) {
+    for (int i = 0; i < max_key_ >> log2_keys_per_lock_; ++i) {
+      key_locks_[cf][i].Unlock();
+    }
+  }
+
+  // Returns a collection of mutex locks covering the key range [start, end) in
+  // `cf`.
+  std::vector<std::unique_ptr<MutexLock>> GetLocksForKeyRange(int cf,
+                                                              int64_t start,
+                                                              int64_t end) {
+    std::vector<std::unique_ptr<MutexLock>> range_locks;
+
+    if (start >= end) {
+      return range_locks;
+    }
+
+    const int64_t start_idx = start >> log2_keys_per_lock_;
+
+    int64_t end_idx = end >> log2_keys_per_lock_;
+    if ((end & ((1 << log2_keys_per_lock_) - 1)) == 0) {
+      --end_idx;
+    }
+
+    for (int64_t idx = start_idx; idx <= end_idx; ++idx) {
+      range_locks.emplace_back(
+          std::make_unique<MutexLock>(&key_locks_[cf][idx]));
+    }
+
+    return range_locks;
+  }
+
+  Status SaveAtAndAfter(DB* db) {
+    return expected_state_manager_->SaveAtAndAfter(db);
+  }
+
+  bool HasHistory() { return expected_state_manager_->HasHistory(); }
+
+  Status Restore(DB* db) { return expected_state_manager_->Restore(db); }
+
+  // Requires external locking covering all keys in `cf`.
+  void ClearColumnFamily(int cf) {
+    return expected_state_manager_->ClearColumnFamily(cf);
+  }
+
+  // @param pending True if the update may have started but is not yet
+  //    guaranteed finished. This is useful for crash-recovery testing when the
+  //    process may crash before updating the expected values array.
+  //
+  // Requires external locking covering `key` in `cf`.
+  void Put(int cf, int64_t key, uint32_t value_base, bool pending) {
+    return expected_state_manager_->Put(cf, key, value_base, pending);
+  }
+
+  // Requires external locking covering `key` in `cf`.
+  uint32_t Get(int cf, int64_t key) const {
+    return expected_state_manager_->Get(cf, key);
+  }
+
+  // @param pending See comment above Put()
+  // Returns true if the key was not yet deleted.
+  //
+  // Requires external locking covering `key` in `cf`.
+  bool Delete(int cf, int64_t key, bool pending) {
+    return expected_state_manager_->Delete(cf, key, pending);
+  }
+
+  // @param pending See comment above Put()
+  // Returns true if the key was not yet deleted.
+  //
+  // Requires external locking covering `key` in `cf`.
+  bool SingleDelete(int cf, int64_t key, bool pending) {
+    return expected_state_manager_->Delete(cf, key, pending);
+  }
+
+  // @param pending See comment above Put()
+  // Returns number of keys deleted by the call.
+  //
+  // Requires external locking covering keys in `[begin_key, end_key)` in `cf`.
+  int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending) {
+    return expected_state_manager_->DeleteRange(cf, begin_key, end_key,
+                                                pending);
+  }
+
+  bool AllowsOverwrite(int64_t key) const {
+    return no_overwrite_ids_.find(key) == no_overwrite_ids_.end();
+  }
+
+  // Requires external locking covering `key` in `cf`.
+  bool Exists(int cf, int64_t key) {
+    return expected_state_manager_->Exists(cf, key);
+  }
+
+  uint32_t GetSeed() const { return seed_; }
+
+  void SetShouldStopBgThread() { should_stop_bg_thread_ = true; }
+
+  bool ShouldStopBgThread() { return should_stop_bg_thread_; }
+
+  void IncBgThreads() { ++num_bg_threads_; }
+
+  void IncBgThreadsFinished() { ++bg_thread_finished_; }
+
+  bool BgThreadsFinished() const {
+    return bg_thread_finished_ == num_bg_threads_;
+  }
+
+  bool ShouldVerifyAtBeginning() const {
+    return !FLAGS_expected_values_dir.empty();
+  }
+
+  bool PrintingVerificationResults() {
+    bool tmp = false;
+    return !printing_verification_results_.compare_exchange_strong(
+        tmp, true, std::memory_order_relaxed);
+  }
+
+  void FinishPrintingVerificationResults() {
+    printing_verification_results_.store(false, std::memory_order_relaxed);
+  }
+
+  uint64_t GetStartTimestamp() const { return start_timestamp_; }
+
+ private:
+  static void IgnoreReadErrorCallback(void*) { ignore_read_error = true; }
+
+  // Pick random keys in each column family that will not experience overwrite.
+  std::unordered_set<int64_t> GenerateNoOverwriteIds() const {
+    fprintf(stdout, "Choosing random keys with no overwrite\n");
+    // Start with the identity permutation. Subsequent iterations of
+    // for loop below will start with perm of previous for loop
+    std::vector<int64_t> permutation(max_key_);
+    for (int64_t i = 0; i < max_key_; ++i) {
+      permutation[i] = i;
+    }
+    // Now do the Knuth shuffle
+    const int64_t num_no_overwrite_keys =
+        (max_key_ * FLAGS_nooverwritepercent) / 100;
+    // Only need to figure out first num_no_overwrite_keys of permutation
+    std::unordered_set<int64_t> ret;
+    ret.reserve(num_no_overwrite_keys);
+    Random64 rnd(seed_);
+    for (int64_t i = 0; i < num_no_overwrite_keys; i++) {
+      assert(i < max_key_);
+      int64_t rand_index = i + rnd.Next() % (max_key_ - i);
+      // Swap i and rand_index;
+      int64_t temp = permutation[i];
+      permutation[i] = permutation[rand_index];
+      permutation[rand_index] = temp;
+      // Fill no_overwrite_ids_ with the first num_no_overwrite_keys of
+      // permutation
+      ret.insert(permutation[i]);
+    }
+    return ret;
+  }
+
+  port::Mutex mu_;
+  port::CondVar cv_;
+  const uint32_t seed_;
+  const int64_t max_key_;
+  const uint32_t log2_keys_per_lock_;
+  int num_threads_;
+  long num_initialized_;
+  long num_populated_;
+  long vote_reopen_;
+  long num_done_;
+  bool start_;
+  bool start_verify_;
+  int num_bg_threads_;
+  bool should_stop_bg_thread_;
+  int bg_thread_finished_;
+  StressTest* stress_test_;
+  std::atomic<bool> verification_failure_;
+  std::atomic<bool> should_stop_test_;
+
+  // Keys that should not be overwritten
+  const std::unordered_set<int64_t> no_overwrite_ids_;
+
+  std::unique_ptr<ExpectedStateManager> expected_state_manager_;
+  // Cannot store `port::Mutex` directly in vector since it is not copyable
+  // and storing it in the container may require copying depending on the impl.
+  std::vector<std::unique_ptr<port::Mutex[]>> key_locks_;
+  std::atomic<bool> printing_verification_results_;
+  const uint64_t start_timestamp_;
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  uint32_t tid;  // 0..n-1
+  Random rand;   // Has different seeds for different threads
+  SharedState* shared;
+  Stats stats;
+  struct SnapshotState {
+    const Snapshot* snapshot;
+    // The cf from which we did a Get at this snapshot
+    int cf_at;
+    // The name of the cf at the time that we did a read
+    std::string cf_at_name;
+    // The key with which we did a Get at this snapshot
+    std::string key;
+    // The status of the Get
+    Status status;
+    // The value of the Get
+    std::string value;
+    // optional state of all keys in the db
+    std::vector<bool>* key_vec;
+
+    std::string timestamp;
+  };
+  std::queue<std::pair<uint64_t, SnapshotState>> snapshot_queue;
+
+  ThreadState(uint32_t index, SharedState* _shared)
+      : tid(index), rand(1000 + index + _shared->GetSeed()), shared(_shared) {}
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_stat.cc b/src/rocksdb/db_stress_tool/db_stress_stat.cc
new file mode 100644
index 000000000..6a7883a52
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_stat.cc
@@ -0,0 +1,17 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef GFLAGS
+
+#include "db_stress_tool/db_stress_stat.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
+std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats_secondaries;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_stat.h b/src/rocksdb/db_stress_tool/db_stress_stat.h
new file mode 100644
index 000000000..5b38c6e2b
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_stat.h
@@ -0,0 +1,219 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <cinttypes>
+#include <memory>
+#include <queue>
+#include <unordered_set>
+
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/system_clock.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+
+DECLARE_bool(histogram);
+DECLARE_bool(progress_reports);
+
+namespace ROCKSDB_NAMESPACE {
+
+// Database statistics
+extern std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
+extern std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats_secondaries;
+
+class Stats {
+ private:
+  uint64_t start_;
+  uint64_t finish_;
+  double seconds_;
+  long done_;
+  long gets_;
+  long prefixes_;
+  long writes_;
+  long deletes_;
+  size_t single_deletes_;
+  long iterator_size_sums_;
+  long founds_;
+  long iterations_;
+  long range_deletions_;
+  long covered_by_range_deletions_;
+  long errors_;
+  long verified_errors_;
+  long num_compact_files_succeed_;
+  long num_compact_files_failed_;
+  int next_report_;
+  size_t bytes_;
+  uint64_t last_op_finish_;
+  HistogramImpl hist_;
+
+ public:
+  Stats() {}
+
+  void Start() {
+    next_report_ = 100;
+    hist_.Clear();
+    done_ = 0;
+    gets_ = 0;
+    prefixes_ = 0;
+    writes_ = 0;
+    deletes_ = 0;
+    single_deletes_ = 0;
+    iterator_size_sums_ = 0;
+    founds_ = 0;
+    iterations_ = 0;
+    range_deletions_ = 0;
+    covered_by_range_deletions_ = 0;
+    errors_ = 0;
+    verified_errors_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    num_compact_files_succeed_ = 0;
+    num_compact_files_failed_ = 0;
+    start_ = SystemClock::Default()->NowMicros();
+    last_op_finish_ = start_;
+    finish_ = start_;
+  }
+
+  void Merge(const Stats& other) {
+    hist_.Merge(other.hist_);
+    done_ += other.done_;
+    gets_ += other.gets_;
+    prefixes_ += other.prefixes_;
+    writes_ += other.writes_;
+    deletes_ += other.deletes_;
+    single_deletes_ += other.single_deletes_;
+    iterator_size_sums_ += other.iterator_size_sums_;
+    founds_ += other.founds_;
+    iterations_ += other.iterations_;
+    range_deletions_ += other.range_deletions_;
+    covered_by_range_deletions_ = other.covered_by_range_deletions_;
+    errors_ += other.errors_;
+    verified_errors_ += other.verified_errors_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    num_compact_files_succeed_ += other.num_compact_files_succeed_;
+    num_compact_files_failed_ += other.num_compact_files_failed_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+  }
+
+  void Stop() {
+    finish_ = SystemClock::Default()->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  void FinishedSingleOp() {
+    if (FLAGS_histogram) {
+      auto now = SystemClock::Default()->NowMicros();
+      auto micros = now - last_op_finish_;
+      hist_.Add(micros);
+      if (micros > 20000) {
+        fprintf(stdout, "long op: %" PRIu64 " micros%30s\r", micros, "");
+      }
+      last_op_finish_ = now;
+    }
+
+    done_++;
+    if (FLAGS_progress_reports) {
+      if (done_ >= next_report_) {
+        if (next_report_ < 1000)
+          next_report_ += 100;
+        else if (next_report_ < 5000)
+          next_report_ += 500;
+        else if (next_report_ < 10000)
+          next_report_ += 1000;
+        else if (next_report_ < 50000)
+          next_report_ += 5000;
+        else if (next_report_ < 100000)
+          next_report_ += 10000;
+        else if (next_report_ < 500000)
+          next_report_ += 50000;
+        else
+          next_report_ += 100000;
+        fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
+      }
+    }
+  }
+
+  void AddBytesForWrites(long nwrites, size_t nbytes) {
+    writes_ += nwrites;
+    bytes_ += nbytes;
+  }
+
+  void AddGets(long ngets, long nfounds) {
+    founds_ += nfounds;
+    gets_ += ngets;
+  }
+
+  void AddPrefixes(long nprefixes, long count) {
+    prefixes_ += nprefixes;
+    iterator_size_sums_ += count;
+  }
+
+  void AddIterations(long n) { iterations_ += n; }
+
+  void AddDeletes(long n) { deletes_ += n; }
+
+  void AddSingleDeletes(size_t n) { single_deletes_ += n; }
+
+  void AddRangeDeletions(long n) { range_deletions_ += n; }
+
+  void AddCoveredByRangeDeletions(long n) { covered_by_range_deletions_ += n; }
+
+  void AddErrors(long n) { errors_ += n; }
+
+  void AddVerifiedErrors(long n) { verified_errors_ += n; }
+
+  void AddNumCompactFilesSucceed(long n) { num_compact_files_succeed_ += n; }
+
+  void AddNumCompactFilesFailed(long n) { num_compact_files_failed_ += n; }
+
+  void Report(const char* name) {
+    std::string extra;
+    if (bytes_ < 1 || done_ < 1) {
+      fprintf(stderr, "No writes or ops?\n");
+      return;
+    }
+
+    double elapsed = (finish_ - start_) * 1e-6;
+    double bytes_mb = bytes_ / 1048576.0;
+    double rate = bytes_mb / elapsed;
+    double throughput = (double)done_ / elapsed;
+
+    fprintf(stdout, "%-12s: ", name);
+    fprintf(stdout, "%.3f micros/op %ld ops/sec\n", seconds_ * 1e6 / done_,
+            (long)throughput);
+    fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n",
+            "", bytes_mb, rate, (100 * writes_) / done_, done_);
+    fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_);
+    fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_);
+    fprintf(stdout, "%-12s: Single deleted %" ROCKSDB_PRIszt " times\n", "",
+            single_deletes_);
+    fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "", gets_,
+            founds_);
+    fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_);
+    fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "",
+            iterator_size_sums_);
+    fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_);
+    fprintf(stdout, "%-12s: Deleted %ld key-ranges\n", "", range_deletions_);
+    fprintf(stdout, "%-12s: Range deletions covered %ld keys\n", "",
+            covered_by_range_deletions_);
+
+    fprintf(stdout, "%-12s: Got errors %ld times\n", "", errors_);
+    fprintf(stdout, "%-12s: %ld CompactFiles() succeed\n", "",
+            num_compact_files_succeed_);
+    fprintf(stdout, "%-12s: %ld CompactFiles() did not succeed\n", "",
+            num_compact_files_failed_);
+
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db_stress_tool/db_stress_table_properties_collector.h b/src/rocksdb/db_stress_tool/db_stress_table_properties_collector.h
new file mode 100644
index 000000000..d1758cbb4
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_table_properties_collector.h
@@ -0,0 +1,65 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/table.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+
+DECLARE_int32(mark_for_compaction_one_file_in);
+
+namespace ROCKSDB_NAMESPACE {
+
+// A `DbStressTablePropertiesCollector` ignores what keys/values were added to
+// the table, adds no properties to the table, and decides at random whether the
+// table will be marked for compaction according to
+// `FLAGS_mark_for_compaction_one_file_in`.
+class DbStressTablePropertiesCollector : public TablePropertiesCollector {
+ public:
+  DbStressTablePropertiesCollector()
+      : need_compact_(Random::GetTLSInstance()->OneInOpt(
+            FLAGS_mark_for_compaction_one_file_in)) {}
+
+  virtual Status AddUserKey(const Slice& /* key */, const Slice& /* value */,
+                            EntryType /*type*/, SequenceNumber /*seq*/,
+                            uint64_t /*file_size*/) override {
+    return Status::OK();
+  }
+
+  virtual Status Finish(UserCollectedProperties* /* properties */) override {
+    return Status::OK();
+  }
+
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+
+  virtual const char* Name() const override {
+    return "DbStressTablePropertiesCollector";
+  }
+
+  virtual bool NeedCompact() const override { return need_compact_; }
+
+ private:
+  const bool need_compact_;
+};
+
+// A `DbStressTablePropertiesCollectorFactory` creates
+// `DbStressTablePropertiesCollectorFactory`s.
+class DbStressTablePropertiesCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /* context */) override {
+    return new DbStressTablePropertiesCollector();
+  }
+
+  virtual const char* Name() const override {
+    return "DbStressTablePropertiesCollectorFactory";
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db_stress_tool/db_stress_test_base.cc b/src/rocksdb/db_stress_tool/db_stress_test_base.cc
new file mode 100644
index 000000000..e51b43176
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_test_base.cc
@@ -0,0 +1,3383 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#include <ios>
+
+#include "util/compression.h"
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_common.h"
+#include "db_stress_tool/db_stress_compaction_filter.h"
+#include "db_stress_tool/db_stress_driver.h"
+#include "db_stress_tool/db_stress_table_properties_collector.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "utilities/backup/backup_engine_impl.h"
+#include "utilities/fault_injection_fs.h"
+#include "utilities/fault_injection_secondary_cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+std::shared_ptr<const FilterPolicy> CreateFilterPolicy() {
+  if (FLAGS_bloom_bits < 0) {
+    return BlockBasedTableOptions().filter_policy;
+  }
+  const FilterPolicy* new_policy;
+  if (FLAGS_ribbon_starting_level >= 999) {
+    // Use Bloom API
+    new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, false);
+  } else {
+    new_policy = NewRibbonFilterPolicy(
+        FLAGS_bloom_bits, /* bloom_before_level */ FLAGS_ribbon_starting_level);
+  }
+  return std::shared_ptr<const FilterPolicy>(new_policy);
+}
+
+}  // namespace
+
+StressTest::StressTest()
+    : cache_(NewCache(FLAGS_cache_size, FLAGS_cache_numshardbits)),
+      compressed_cache_(NewLRUCache(FLAGS_compressed_cache_size,
+                                    FLAGS_compressed_cache_numshardbits)),
+      filter_policy_(CreateFilterPolicy()),
+      db_(nullptr),
+#ifndef ROCKSDB_LITE
+      txn_db_(nullptr),
+#endif
+      db_aptr_(nullptr),
+      clock_(db_stress_env->GetSystemClock().get()),
+      new_column_family_name_(1),
+      num_times_reopened_(0),
+      db_preload_finished_(false),
+      cmp_db_(nullptr),
+      is_db_stopped_(false) {
+  if (FLAGS_destroy_db_initially) {
+    std::vector<std::string> files;
+    db_stress_env->GetChildren(FLAGS_db, &files);
+    for (unsigned int i = 0; i < files.size(); i++) {
+      if (Slice(files[i]).starts_with("heap-")) {
+        db_stress_env->DeleteFile(FLAGS_db + "/" + files[i]);
+      }
+    }
+
+    Options options;
+    options.env = db_stress_env;
+    // Remove files without preserving manfiest files
+#ifndef ROCKSDB_LITE
+    const Status s = !FLAGS_use_blob_db
+                         ? DestroyDB(FLAGS_db, options)
+                         : blob_db::DestroyBlobDB(FLAGS_db, options,
+                                                  blob_db::BlobDBOptions());
+#else
+    const Status s = DestroyDB(FLAGS_db, options);
+#endif  // !ROCKSDB_LITE
+
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot destroy original db: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+}
+
+StressTest::~StressTest() {
+  for (auto cf : column_families_) {
+    delete cf;
+  }
+  column_families_.clear();
+  delete db_;
+
+  for (auto* cf : cmp_cfhs_) {
+    delete cf;
+  }
+  cmp_cfhs_.clear();
+  delete cmp_db_;
+}
+
+std::shared_ptr<Cache> StressTest::NewCache(size_t capacity,
+                                            int32_t num_shard_bits) {
+  ConfigOptions config_options;
+  if (capacity <= 0) {
+    return nullptr;
+  }
+
+  if (FLAGS_cache_type == "clock_cache") {
+    fprintf(stderr, "Old clock cache implementation has been removed.\n");
+    exit(1);
+  } else if (FLAGS_cache_type == "hyper_clock_cache") {
+    return HyperClockCacheOptions(static_cast<size_t>(capacity),
+                                  FLAGS_block_size /*estimated_entry_charge*/,
+                                  num_shard_bits)
+        .MakeSharedCache();
+  } else if (FLAGS_cache_type == "lru_cache") {
+    LRUCacheOptions opts;
+    opts.capacity = capacity;
+    opts.num_shard_bits = num_shard_bits;
+#ifndef ROCKSDB_LITE
+    std::shared_ptr<SecondaryCache> secondary_cache;
+    if (!FLAGS_secondary_cache_uri.empty()) {
+      Status s = SecondaryCache::CreateFromString(
+          config_options, FLAGS_secondary_cache_uri, &secondary_cache);
+      if (secondary_cache == nullptr) {
+        fprintf(stderr,
+                "No secondary cache registered matching string: %s status=%s\n",
+                FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
+        exit(1);
+      }
+      if (FLAGS_secondary_cache_fault_one_in > 0) {
+        secondary_cache = std::make_shared<FaultInjectionSecondaryCache>(
+            secondary_cache, static_cast<uint32_t>(FLAGS_seed),
+            FLAGS_secondary_cache_fault_one_in);
+      }
+      opts.secondary_cache = secondary_cache;
+    }
+#endif
+    return NewLRUCache(opts);
+  } else {
+    fprintf(stderr, "Cache type not supported.");
+    exit(1);
+  }
+}
+
+std::vector<std::string> StressTest::GetBlobCompressionTags() {
+  std::vector<std::string> compression_tags{"kNoCompression"};
+
+  if (Snappy_Supported()) {
+    compression_tags.emplace_back("kSnappyCompression");
+  }
+  if (LZ4_Supported()) {
+    compression_tags.emplace_back("kLZ4Compression");
+  }
+  if (ZSTD_Supported()) {
+    compression_tags.emplace_back("kZSTD");
+  }
+
+  return compression_tags;
+}
+
+bool StressTest::BuildOptionsTable() {
+  if (FLAGS_set_options_one_in <= 0) {
+    return true;
+  }
+
+  std::unordered_map<std::string, std::vector<std::string>> options_tbl = {
+      {"write_buffer_size",
+       {std::to_string(options_.write_buffer_size),
+        std::to_string(options_.write_buffer_size * 2),
+        std::to_string(options_.write_buffer_size * 4)}},
+      {"max_write_buffer_number",
+       {std::to_string(options_.max_write_buffer_number),
+        std::to_string(options_.max_write_buffer_number * 2),
+        std::to_string(options_.max_write_buffer_number * 4)}},
+      {"arena_block_size",
+       {
+           std::to_string(options_.arena_block_size),
+           std::to_string(options_.write_buffer_size / 4),
+           std::to_string(options_.write_buffer_size / 8),
+       }},
+      {"memtable_huge_page_size", {"0", std::to_string(2 * 1024 * 1024)}},
+      {"max_successive_merges", {"0", "2", "4"}},
+      {"inplace_update_num_locks", {"100", "200", "300"}},
+      // TODO: re-enable once internal task T124324915 is fixed.
+      // {"experimental_mempurge_threshold", {"0.0", "1.0"}},
+      // TODO(ljin): enable test for this option
+      // {"disable_auto_compactions", {"100", "200", "300"}},
+      {"level0_file_num_compaction_trigger",
+       {
+           std::to_string(options_.level0_file_num_compaction_trigger),
+           std::to_string(options_.level0_file_num_compaction_trigger + 2),
+           std::to_string(options_.level0_file_num_compaction_trigger + 4),
+       }},
+      {"level0_slowdown_writes_trigger",
+       {
+           std::to_string(options_.level0_slowdown_writes_trigger),
+           std::to_string(options_.level0_slowdown_writes_trigger + 2),
+           std::to_string(options_.level0_slowdown_writes_trigger + 4),
+       }},
+      {"level0_stop_writes_trigger",
+       {
+           std::to_string(options_.level0_stop_writes_trigger),
+           std::to_string(options_.level0_stop_writes_trigger + 2),
+           std::to_string(options_.level0_stop_writes_trigger + 4),
+       }},
+      {"max_compaction_bytes",
+       {
+           std::to_string(options_.target_file_size_base * 5),
+           std::to_string(options_.target_file_size_base * 15),
+           std::to_string(options_.target_file_size_base * 100),
+       }},
+      {"target_file_size_base",
+       {
+           std::to_string(options_.target_file_size_base),
+           std::to_string(options_.target_file_size_base * 2),
+           std::to_string(options_.target_file_size_base * 4),
+       }},
+      {"target_file_size_multiplier",
+       {
+           std::to_string(options_.target_file_size_multiplier),
+           "1",
+           "2",
+       }},
+      {"max_bytes_for_level_base",
+       {
+           std::to_string(options_.max_bytes_for_level_base / 2),
+           std::to_string(options_.max_bytes_for_level_base),
+           std::to_string(options_.max_bytes_for_level_base * 2),
+       }},
+      {"max_bytes_for_level_multiplier",
+       {
+           std::to_string(options_.max_bytes_for_level_multiplier),
+           "1",
+           "2",
+       }},
+      {"max_sequential_skip_in_iterations", {"4", "8", "12"}},
+  };
+
+  if (FLAGS_allow_setting_blob_options_dynamically) {
+    options_tbl.emplace("enable_blob_files",
+                        std::vector<std::string>{"false", "true"});
+    options_tbl.emplace("min_blob_size",
+                        std::vector<std::string>{"0", "8", "16"});
+    options_tbl.emplace("blob_file_size",
+                        std::vector<std::string>{"1M", "16M", "256M", "1G"});
+    options_tbl.emplace("blob_compression_type", GetBlobCompressionTags());
+    options_tbl.emplace("enable_blob_garbage_collection",
+                        std::vector<std::string>{"false", "true"});
+    options_tbl.emplace(
+        "blob_garbage_collection_age_cutoff",
+        std::vector<std::string>{"0.0", "0.25", "0.5", "0.75", "1.0"});
+    options_tbl.emplace("blob_garbage_collection_force_threshold",
+                        std::vector<std::string>{"0.5", "0.75", "1.0"});
+    options_tbl.emplace("blob_compaction_readahead_size",
+                        std::vector<std::string>{"0", "1M", "4M"});
+    options_tbl.emplace("blob_file_starting_level",
+                        std::vector<std::string>{"0", "1", "2"});
+    options_tbl.emplace("prepopulate_blob_cache",
+                        std::vector<std::string>{"kDisable", "kFlushOnly"});
+  }
+
+  options_table_ = std::move(options_tbl);
+
+  for (const auto& iter : options_table_) {
+    options_index_.push_back(iter.first);
+  }
+  return true;
+}
+
+void StressTest::InitDb(SharedState* shared) {
+  uint64_t now = clock_->NowMicros();
+  fprintf(stdout, "%s Initializing db_stress\n",
+          clock_->TimeToString(now / 1000000).c_str());
+  PrintEnv();
+  Open(shared);
+  BuildOptionsTable();
+}
+
+void StressTest::FinishInitDb(SharedState* shared) {
+  if (FLAGS_read_only) {
+    uint64_t now = clock_->NowMicros();
+    fprintf(stdout, "%s Preloading db with %" PRIu64 " KVs\n",
+            clock_->TimeToString(now / 1000000).c_str(), FLAGS_max_key);
+    PreloadDbAndReopenAsReadOnly(FLAGS_max_key, shared);
+  }
+
+  if (shared->HasHistory()) {
+    // The way it works right now is, if there's any history, that means the
+    // previous run mutating the DB had all its operations traced, in which case
+    // we should always be able to `Restore()` the expected values to match the
+    // `db_`'s current seqno.
+    Status s = shared->Restore(db_);
+    if (!s.ok()) {
+      fprintf(stderr, "Error restoring historical expected values: %s\n",
+              s.ToString().c_str());
+      exit(1);
+    }
+  }
+#ifndef ROCKSDB_LITE
+  if (FLAGS_use_txn) {
+    // It's OK here without sync because unsynced data cannot be lost at this
+    // point
+    // - even with sync_fault_injection=1 as the
+    // file is still directly writable until after FinishInitDb()
+    ProcessRecoveredPreparedTxns(shared);
+  }
+#endif
+  if (FLAGS_enable_compaction_filter) {
+    auto* compaction_filter_factory =
+        reinterpret_cast<DbStressCompactionFilterFactory*>(
+            options_.compaction_filter_factory.get());
+    assert(compaction_filter_factory);
+    // This must be called only after any potential `SharedState::Restore()` has
+    // completed in order for the `compaction_filter_factory` to operate on the
+    // correct latest values file.
+    compaction_filter_factory->SetSharedState(shared);
+    fprintf(stdout, "Compaction filter factory: %s\n",
+            compaction_filter_factory->Name());
+  }
+}
+
+void StressTest::TrackExpectedState(SharedState* shared) {
+  // For `FLAGS_manual_wal_flush_one_inWAL`
+  // data can be lost when `manual_wal_flush_one_in > 0` and `FlushWAL()` is not
+  // explictly called by users of RocksDB (in our case, db stress).
+  // Therefore recovery from such potential WAL data loss is a prefix recovery
+  // that requires tracing
+  if ((FLAGS_sync_fault_injection || FLAGS_disable_wal ||
+       FLAGS_manual_wal_flush_one_in > 0) &&
+      IsStateTracked()) {
+    Status s = shared->SaveAtAndAfter(db_);
+    if (!s.ok()) {
+      fprintf(stderr, "Error enabling history tracing: %s\n",
+              s.ToString().c_str());
+      exit(1);
+    }
+  }
+}
+
+Status StressTest::AssertSame(DB* db, ColumnFamilyHandle* cf,
+                              ThreadState::SnapshotState& snap_state) {
+  Status s;
+  if (cf->GetName() != snap_state.cf_at_name) {
+    return s;
+  }
+  // This `ReadOptions` is for validation purposes. Ignore
+  // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+  ReadOptions ropt;
+  ropt.snapshot = snap_state.snapshot;
+  Slice ts;
+  if (!snap_state.timestamp.empty()) {
+    ts = snap_state.timestamp;
+    ropt.timestamp = &ts;
+  }
+  PinnableSlice exp_v(&snap_state.value);
+  exp_v.PinSelf();
+  PinnableSlice v;
+  s = db->Get(ropt, cf, snap_state.key, &v);
+  if (!s.ok() && !s.IsNotFound()) {
+    return s;
+  }
+  if (snap_state.status != s) {
+    return Status::Corruption(
+        "The snapshot gave inconsistent results for key " +
+        std::to_string(Hash(snap_state.key.c_str(), snap_state.key.size(), 0)) +
+        " in cf " + cf->GetName() + ": (" + snap_state.status.ToString() +
+        ") vs. (" + s.ToString() + ")");
+  }
+  if (s.ok()) {
+    if (exp_v != v) {
+      return Status::Corruption("The snapshot gave inconsistent values: (" +
+                                exp_v.ToString() + ") vs. (" + v.ToString() +
+                                ")");
+    }
+  }
+  if (snap_state.key_vec != nullptr) {
+    // When `prefix_extractor` is set, seeking to beginning and scanning
+    // across prefixes are only supported with `total_order_seek` set.
+    ropt.total_order_seek = true;
+    std::unique_ptr<Iterator> iterator(db->NewIterator(ropt));
+    std::unique_ptr<std::vector<bool>> tmp_bitvec(
+        new std::vector<bool>(FLAGS_max_key));
+    for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+      uint64_t key_val;
+      if (GetIntVal(iterator->key().ToString(), &key_val)) {
+        (*tmp_bitvec.get())[key_val] = true;
+      }
+    }
+    if (!std::equal(snap_state.key_vec->begin(), snap_state.key_vec->end(),
+                    tmp_bitvec.get()->begin())) {
+      return Status::Corruption("Found inconsistent keys at this snapshot");
+    }
+  }
+  return Status::OK();
+}
+
+void StressTest::VerificationAbort(SharedState* shared, std::string msg,
+                                   Status s) const {
+  fprintf(stderr, "Verification failed: %s. Status is %s\n", msg.c_str(),
+          s.ToString().c_str());
+  shared->SetVerificationFailure();
+}
+
+void StressTest::VerificationAbort(SharedState* shared, std::string msg, int cf,
+                                   int64_t key) const {
+  auto key_str = Key(key);
+  Slice key_slice = key_str;
+  fprintf(stderr,
+          "Verification failed for column family %d key %s (%" PRIi64 "): %s\n",
+          cf, key_slice.ToString(true).c_str(), key, msg.c_str());
+  shared->SetVerificationFailure();
+}
+
+void StressTest::VerificationAbort(SharedState* shared, std::string msg, int cf,
+                                   int64_t key, Slice value_from_db,
+                                   Slice value_from_expected) const {
+  auto key_str = Key(key);
+  fprintf(stderr,
+          "Verification failed for column family %d key %s (%" PRIi64
+          "): value_from_db: %s, value_from_expected: %s, msg: %s\n",
+          cf, Slice(key_str).ToString(true).c_str(), key,
+          value_from_db.ToString(true).c_str(),
+          value_from_expected.ToString(true).c_str(), msg.c_str());
+  shared->SetVerificationFailure();
+}
+
+void StressTest::VerificationAbort(SharedState* shared, int cf, int64_t key,
+                                   const Slice& value,
+                                   const WideColumns& columns,
+                                   const WideColumns& expected_columns) const {
+  assert(shared);
+
+  auto key_str = Key(key);
+
+  fprintf(stderr,
+          "Verification failed for column family %d key %s (%" PRIi64
+          "): Value and columns inconsistent: %s\n",
+          cf, Slice(key_str).ToString(/* hex */ true).c_str(), key,
+          DebugString(value, columns, expected_columns).c_str());
+
+  shared->SetVerificationFailure();
+}
+
+std::string StressTest::DebugString(const Slice& value,
+                                    const WideColumns& columns,
+                                    const WideColumns& expected_columns) {
+  std::ostringstream oss;
+
+  oss << "value: " << value.ToString(/* hex */ true);
+
+  auto dump = [](const WideColumns& cols, std::ostream& os) {
+    if (cols.empty()) {
+      return;
+    }
+
+    os << std::hex;
+
+    auto it = cols.begin();
+    os << *it;
+    for (++it; it != cols.end(); ++it) {
+      os << ' ' << *it;
+    }
+  };
+
+  oss << ", columns: ";
+  dump(columns, oss);
+
+  oss << ", expected_columns: ";
+  dump(expected_columns, oss);
+
+  return oss.str();
+}
+
+void StressTest::PrintStatistics() {
+  if (dbstats) {
+    fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+  }
+  if (dbstats_secondaries) {
+    fprintf(stdout, "Secondary instances STATISTICS:\n%s\n",
+            dbstats_secondaries->ToString().c_str());
+  }
+}
+
+// Currently PreloadDb has to be single-threaded.
+void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
+                                              SharedState* shared) {
+  WriteOptions write_opts;
+  write_opts.disableWAL = FLAGS_disable_wal;
+  if (FLAGS_sync) {
+    write_opts.sync = true;
+  }
+  if (FLAGS_rate_limit_auto_wal_flush) {
+    write_opts.rate_limiter_priority = Env::IO_USER;
+  }
+  char value[100];
+  int cf_idx = 0;
+  Status s;
+  for (auto cfh : column_families_) {
+    for (int64_t k = 0; k != number_of_keys; ++k) {
+      const std::string key = Key(k);
+
+      constexpr uint32_t value_base = 0;
+      const size_t sz = GenerateValue(value_base, value, sizeof(value));
+
+      const Slice v(value, sz);
+
+      shared->Put(cf_idx, k, value_base, true /* pending */);
+
+      std::string ts;
+      if (FLAGS_user_timestamp_size > 0) {
+        ts = GetNowNanos();
+      }
+
+      if (FLAGS_use_merge) {
+        if (!FLAGS_use_txn) {
+          if (FLAGS_user_timestamp_size > 0) {
+            s = db_->Merge(write_opts, cfh, key, ts, v);
+          } else {
+            s = db_->Merge(write_opts, cfh, key, v);
+          }
+        } else {
+#ifndef ROCKSDB_LITE
+          Transaction* txn;
+          s = NewTxn(write_opts, &txn);
+          if (s.ok()) {
+            s = txn->Merge(cfh, key, v);
+            if (s.ok()) {
+              s = CommitTxn(txn);
+            }
+          }
+#endif
+        }
+      } else if (FLAGS_use_put_entity_one_in > 0) {
+        s = db_->PutEntity(write_opts, cfh, key,
+                           GenerateWideColumns(value_base, v));
+      } else {
+        if (!FLAGS_use_txn) {
+          if (FLAGS_user_timestamp_size > 0) {
+            s = db_->Put(write_opts, cfh, key, ts, v);
+          } else {
+            s = db_->Put(write_opts, cfh, key, v);
+          }
+        } else {
+#ifndef ROCKSDB_LITE
+          Transaction* txn;
+          s = NewTxn(write_opts, &txn);
+          if (s.ok()) {
+            s = txn->Put(cfh, key, v);
+            if (s.ok()) {
+              s = CommitTxn(txn);
+            }
+          }
+#endif
+        }
+      }
+
+      shared->Put(cf_idx, k, value_base, false /* pending */);
+      if (!s.ok()) {
+        break;
+      }
+    }
+    if (!s.ok()) {
+      break;
+    }
+    ++cf_idx;
+  }
+  if (s.ok()) {
+    s = db_->Flush(FlushOptions(), column_families_);
+  }
+  if (s.ok()) {
+    for (auto cf : column_families_) {
+      delete cf;
+    }
+    column_families_.clear();
+    delete db_;
+    db_ = nullptr;
+#ifndef ROCKSDB_LITE
+    txn_db_ = nullptr;
+#endif
+
+    db_preload_finished_.store(true);
+    auto now = clock_->NowMicros();
+    fprintf(stdout, "%s Reopening database in read-only\n",
+            clock_->TimeToString(now / 1000000).c_str());
+    // Reopen as read-only, can ignore all options related to updates
+    Open(shared);
+  } else {
+    fprintf(stderr, "Failed to preload db");
+    exit(1);
+  }
+}
+
+Status StressTest::SetOptions(ThreadState* thread) {
+  assert(FLAGS_set_options_one_in > 0);
+  std::unordered_map<std::string, std::string> opts;
+  std::string name =
+      options_index_[thread->rand.Next() % options_index_.size()];
+  int value_idx = thread->rand.Next() % options_table_[name].size();
+  if (name == "level0_file_num_compaction_trigger" ||
+      name == "level0_slowdown_writes_trigger" ||
+      name == "level0_stop_writes_trigger") {
+    opts["level0_file_num_compaction_trigger"] =
+        options_table_["level0_file_num_compaction_trigger"][value_idx];
+    opts["level0_slowdown_writes_trigger"] =
+        options_table_["level0_slowdown_writes_trigger"][value_idx];
+    opts["level0_stop_writes_trigger"] =
+        options_table_["level0_stop_writes_trigger"][value_idx];
+  } else {
+    opts[name] = options_table_[name][value_idx];
+  }
+
+  int rand_cf_idx = thread->rand.Next() % FLAGS_column_families;
+  auto cfh = column_families_[rand_cf_idx];
+  return db_->SetOptions(cfh, opts);
+}
+
+#ifndef ROCKSDB_LITE
+void StressTest::ProcessRecoveredPreparedTxns(SharedState* shared) {
+  assert(txn_db_);
+  std::vector<Transaction*> recovered_prepared_trans;
+  txn_db_->GetAllPreparedTransactions(&recovered_prepared_trans);
+  for (Transaction* txn : recovered_prepared_trans) {
+    ProcessRecoveredPreparedTxnsHelper(txn, shared);
+    delete txn;
+  }
+  recovered_prepared_trans.clear();
+  txn_db_->GetAllPreparedTransactions(&recovered_prepared_trans);
+  assert(recovered_prepared_trans.size() == 0);
+}
+
+void StressTest::ProcessRecoveredPreparedTxnsHelper(Transaction* txn,
+                                                    SharedState* shared) {
+  thread_local Random rand(static_cast<uint32_t>(FLAGS_seed));
+  for (size_t i = 0; i < column_families_.size(); ++i) {
+    std::unique_ptr<WBWIIterator> wbwi_iter(
+        txn->GetWriteBatch()->NewIterator(column_families_[i]));
+    for (wbwi_iter->SeekToFirst(); wbwi_iter->Valid(); wbwi_iter->Next()) {
+      uint64_t key_val;
+      if (GetIntVal(wbwi_iter->Entry().key.ToString(), &key_val)) {
+        shared->Put(static_cast<int>(i) /* cf_idx */, key_val,
+                    0 /* value_base */, true /* pending */);
+      }
+    }
+  }
+  if (rand.OneIn(2)) {
+    Status s = txn->Commit();
+    assert(s.ok());
+  } else {
+    Status s = txn->Rollback();
+    assert(s.ok());
+  }
+}
+
+Status StressTest::NewTxn(WriteOptions& write_opts, Transaction** txn) {
+  if (!FLAGS_use_txn) {
+    return Status::InvalidArgument("NewTxn when FLAGS_use_txn is not set");
+  }
+  write_opts.disableWAL = FLAGS_disable_wal;
+  static std::atomic<uint64_t> txn_id = {0};
+  TransactionOptions txn_options;
+  txn_options.use_only_the_last_commit_time_batch_for_recovery =
+      FLAGS_use_only_the_last_commit_time_batch_for_recovery;
+  txn_options.lock_timeout = 600000;  // 10 min
+  txn_options.deadlock_detect = true;
+  *txn = txn_db_->BeginTransaction(write_opts, txn_options);
+  auto istr = std::to_string(txn_id.fetch_add(1));
+  Status s = (*txn)->SetName("xid" + istr);
+  return s;
+}
+
+Status StressTest::CommitTxn(Transaction* txn, ThreadState* thread) {
+  if (!FLAGS_use_txn) {
+    return Status::InvalidArgument("CommitTxn when FLAGS_use_txn is not set");
+  }
+  assert(txn_db_);
+  Status s = txn->Prepare();
+  std::shared_ptr<const Snapshot> timestamped_snapshot;
+  if (s.ok()) {
+    if (thread && FLAGS_create_timestamped_snapshot_one_in &&
+        thread->rand.OneIn(FLAGS_create_timestamped_snapshot_one_in)) {
+      uint64_t ts = db_stress_env->NowNanos();
+      s = txn->CommitAndTryCreateSnapshot(/*notifier=*/nullptr, ts,
+                                          &timestamped_snapshot);
+
+      std::pair<Status, std::shared_ptr<const Snapshot>> res;
+      if (thread->tid == 0) {
+        uint64_t now = db_stress_env->NowNanos();
+        res = txn_db_->CreateTimestampedSnapshot(now);
+        if (res.first.ok()) {
+          assert(res.second);
+          assert(res.second->GetTimestamp() == now);
+          if (timestamped_snapshot) {
+            assert(res.second->GetTimestamp() >
+                   timestamped_snapshot->GetTimestamp());
+          }
+        } else {
+          assert(!res.second);
+        }
+      }
+    } else {
+      s = txn->Commit();
+    }
+  }
+  if (thread && FLAGS_create_timestamped_snapshot_one_in > 0 &&
+      thread->rand.OneInOpt(50000)) {
+    uint64_t now = db_stress_env->NowNanos();
+    constexpr uint64_t time_diff = static_cast<uint64_t>(1000) * 1000 * 1000;
+    txn_db_->ReleaseTimestampedSnapshotsOlderThan(now - time_diff);
+  }
+  delete txn;
+  return s;
+}
+
+Status StressTest::RollbackTxn(Transaction* txn) {
+  if (!FLAGS_use_txn) {
+    return Status::InvalidArgument(
+        "RollbackTxn when FLAGS_use_txn is not"
+        " set");
+  }
+  Status s = txn->Rollback();
+  delete txn;
+  return s;
+}
+#endif
+
+void StressTest::OperateDb(ThreadState* thread) {
+  ReadOptions read_opts(FLAGS_verify_checksum, true);
+  read_opts.rate_limiter_priority =
+      FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+  read_opts.async_io = FLAGS_async_io;
+  read_opts.adaptive_readahead = FLAGS_adaptive_readahead;
+  read_opts.readahead_size = FLAGS_readahead_size;
+  WriteOptions write_opts;
+  if (FLAGS_rate_limit_auto_wal_flush) {
+    write_opts.rate_limiter_priority = Env::IO_USER;
+  }
+  auto shared = thread->shared;
+  char value[100];
+  std::string from_db;
+  if (FLAGS_sync) {
+    write_opts.sync = true;
+  }
+  write_opts.disableWAL = FLAGS_disable_wal;
+  write_opts.protection_bytes_per_key = FLAGS_batch_protection_bytes_per_key;
+  const int prefix_bound = static_cast<int>(FLAGS_readpercent) +
+                           static_cast<int>(FLAGS_prefixpercent);
+  const int write_bound = prefix_bound + static_cast<int>(FLAGS_writepercent);
+  const int del_bound = write_bound + static_cast<int>(FLAGS_delpercent);
+  const int delrange_bound =
+      del_bound + static_cast<int>(FLAGS_delrangepercent);
+  const int iterate_bound =
+      delrange_bound + static_cast<int>(FLAGS_iterpercent);
+
+  const uint64_t ops_per_open = FLAGS_ops_per_thread / (FLAGS_reopen + 1);
+
+#ifndef NDEBUG
+  if (FLAGS_read_fault_one_in) {
+    fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(),
+                                                   FLAGS_read_fault_one_in);
+  }
+#endif  // NDEBUG
+  if (FLAGS_write_fault_one_in) {
+    IOStatus error_msg;
+    if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) {
+      error_msg = IOStatus::IOError("Retryable IO Error");
+      error_msg.SetRetryable(true);
+    } else if (FLAGS_injest_error_severity == 2) {
+      // Ingest the fatal error
+      error_msg = IOStatus::IOError("Fatal IO Error");
+      error_msg.SetDataLoss(true);
+    }
+    std::vector<FileType> types = {FileType::kTableFile,
+                                   FileType::kDescriptorFile,
+                                   FileType::kCurrentFile};
+    fault_fs_guard->SetRandomWriteError(
+        thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg,
+        /*inject_for_all_file_types=*/false, types);
+  }
+  thread->stats.Start();
+  for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) {
+    if (thread->shared->HasVerificationFailedYet() ||
+        thread->shared->ShouldStopTest()) {
+      break;
+    }
+    if (open_cnt != 0) {
+      thread->stats.FinishedSingleOp();
+      MutexLock l(thread->shared->GetMutex());
+      while (!thread->snapshot_queue.empty()) {
+        db_->ReleaseSnapshot(thread->snapshot_queue.front().second.snapshot);
+        delete thread->snapshot_queue.front().second.key_vec;
+        thread->snapshot_queue.pop();
+      }
+      thread->shared->IncVotedReopen();
+      if (thread->shared->AllVotedReopen()) {
+        thread->shared->GetStressTest()->Reopen(thread);
+        thread->shared->GetCondVar()->SignalAll();
+      } else {
+        thread->shared->GetCondVar()->Wait();
+      }
+      // Commenting this out as we don't want to reset stats on each open.
+      // thread->stats.Start();
+    }
+
+    for (uint64_t i = 0; i < ops_per_open; i++) {
+      if (thread->shared->HasVerificationFailedYet()) {
+        break;
+      }
+
+      // Change Options
+      if (thread->rand.OneInOpt(FLAGS_set_options_one_in)) {
+        SetOptions(thread);
+      }
+
+      if (thread->rand.OneInOpt(FLAGS_set_in_place_one_in)) {
+        options_.inplace_update_support ^= options_.inplace_update_support;
+      }
+
+      if (thread->tid == 0 && FLAGS_verify_db_one_in > 0 &&
+          thread->rand.OneIn(FLAGS_verify_db_one_in)) {
+        ContinuouslyVerifyDb(thread);
+        if (thread->shared->ShouldStopTest()) {
+          break;
+        }
+      }
+
+      MaybeClearOneColumnFamily(thread);
+
+      if (thread->rand.OneInOpt(FLAGS_manual_wal_flush_one_in)) {
+        bool sync = thread->rand.OneIn(2) ? true : false;
+        Status s = db_->FlushWAL(sync);
+        if (!s.ok() && !(sync && s.IsNotSupported())) {
+          fprintf(stderr, "FlushWAL(sync=%s) failed: %s\n",
+                  (sync ? "true" : "false"), s.ToString().c_str());
+        }
+      }
+
+      if (thread->rand.OneInOpt(FLAGS_sync_wal_one_in)) {
+        Status s = db_->SyncWAL();
+        if (!s.ok() && !s.IsNotSupported()) {
+          fprintf(stderr, "SyncWAL() failed: %s\n", s.ToString().c_str());
+        }
+      }
+
+      int rand_column_family = thread->rand.Next() % FLAGS_column_families;
+      ColumnFamilyHandle* column_family = column_families_[rand_column_family];
+
+      if (thread->rand.OneInOpt(FLAGS_compact_files_one_in)) {
+        TestCompactFiles(thread, column_family);
+      }
+
+      int64_t rand_key = GenerateOneKey(thread, i);
+      std::string keystr = Key(rand_key);
+      Slice key = keystr;
+
+      if (thread->rand.OneInOpt(FLAGS_compact_range_one_in)) {
+        TestCompactRange(thread, rand_key, key, column_family);
+        if (thread->shared->HasVerificationFailedYet()) {
+          break;
+        }
+      }
+
+      std::vector<int> rand_column_families =
+          GenerateColumnFamilies(FLAGS_column_families, rand_column_family);
+
+      if (thread->rand.OneInOpt(FLAGS_flush_one_in)) {
+        Status status = TestFlush(rand_column_families);
+        if (!status.ok()) {
+          fprintf(stdout, "Unable to perform Flush(): %s\n",
+                  status.ToString().c_str());
+        }
+      }
+
+#ifndef ROCKSDB_LITE
+      // Verify GetLiveFiles with a 1 in N chance.
+      if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in) &&
+          !FLAGS_write_fault_one_in) {
+        Status status = VerifyGetLiveFiles();
+        if (!status.ok()) {
+          VerificationAbort(shared, "VerifyGetLiveFiles status not OK", status);
+        }
+      }
+
+      // Verify GetSortedWalFiles with a 1 in N chance.
+      if (thread->rand.OneInOpt(FLAGS_get_sorted_wal_files_one_in)) {
+        Status status = VerifyGetSortedWalFiles();
+        if (!status.ok()) {
+          VerificationAbort(shared, "VerifyGetSortedWalFiles status not OK",
+                            status);
+        }
+      }
+
+      // Verify GetCurrentWalFile with a 1 in N chance.
+      if (thread->rand.OneInOpt(FLAGS_get_current_wal_file_one_in)) {
+        Status status = VerifyGetCurrentWalFile();
+        if (!status.ok()) {
+          VerificationAbort(shared, "VerifyGetCurrentWalFile status not OK",
+                            status);
+        }
+      }
+#endif  // !ROCKSDB_LITE
+
+      if (thread->rand.OneInOpt(FLAGS_pause_background_one_in)) {
+        Status status = TestPauseBackground(thread);
+        if (!status.ok()) {
+          VerificationAbort(
+              shared, "Pause/ContinueBackgroundWork status not OK", status);
+        }
+      }
+
+#ifndef ROCKSDB_LITE
+      if (thread->rand.OneInOpt(FLAGS_verify_checksum_one_in)) {
+        Status status = db_->VerifyChecksum();
+        if (!status.ok()) {
+          VerificationAbort(shared, "VerifyChecksum status not OK", status);
+        }
+      }
+
+      if (thread->rand.OneInOpt(FLAGS_get_property_one_in)) {
+        TestGetProperty(thread);
+      }
+#endif
+
+      std::vector<int64_t> rand_keys = GenerateKeys(rand_key);
+
+      if (thread->rand.OneInOpt(FLAGS_ingest_external_file_one_in)) {
+        TestIngestExternalFile(thread, rand_column_families, rand_keys);
+      }
+
+      if (thread->rand.OneInOpt(FLAGS_backup_one_in)) {
+        // Beyond a certain DB size threshold, this test becomes heavier than
+        // it's worth.
+        uint64_t total_size = 0;
+        if (FLAGS_backup_max_size > 0) {
+          std::vector<FileAttributes> files;
+          db_stress_env->GetChildrenFileAttributes(FLAGS_db, &files);
+          for (auto& file : files) {
+            total_size += file.size_bytes;
+          }
+        }
+
+        if (total_size <= FLAGS_backup_max_size) {
+          Status s = TestBackupRestore(thread, rand_column_families, rand_keys);
+          if (!s.ok()) {
+            VerificationAbort(shared, "Backup/restore gave inconsistent state",
+                              s);
+          }
+        }
+      }
+
+      if (thread->rand.OneInOpt(FLAGS_checkpoint_one_in)) {
+        Status s = TestCheckpoint(thread, rand_column_families, rand_keys);
+        if (!s.ok()) {
+          VerificationAbort(shared, "Checkpoint gave inconsistent state", s);
+        }
+      }
+
+#ifndef ROCKSDB_LITE
+      if (thread->rand.OneInOpt(FLAGS_approximate_size_one_in)) {
+        Status s =
+            TestApproximateSize(thread, i, rand_column_families, rand_keys);
+        if (!s.ok()) {
+          VerificationAbort(shared, "ApproximateSize Failed", s);
+        }
+      }
+#endif  // !ROCKSDB_LITE
+      if (thread->rand.OneInOpt(FLAGS_acquire_snapshot_one_in)) {
+        TestAcquireSnapshot(thread, rand_column_family, keystr, i);
+      }
+
+      /*always*/ {
+        Status s = MaybeReleaseSnapshots(thread, i);
+        if (!s.ok()) {
+          VerificationAbort(shared, "Snapshot gave inconsistent state", s);
+        }
+      }
+
+      // Assign timestamps if necessary.
+      std::string read_ts_str;
+      Slice read_ts;
+      if (FLAGS_user_timestamp_size > 0) {
+        read_ts_str = GetNowNanos();
+        read_ts = read_ts_str;
+        read_opts.timestamp = &read_ts;
+      }
+
+      int prob_op = thread->rand.Uniform(100);
+      // Reset this in case we pick something other than a read op. We don't
+      // want to use a stale value when deciding at the beginning of the loop
+      // whether to vote to reopen
+      if (prob_op >= 0 && prob_op < static_cast<int>(FLAGS_readpercent)) {
+        assert(0 <= prob_op);
+        // OPERATION read
+        if (FLAGS_use_multiget) {
+          // Leave room for one more iteration of the loop with a single key
+          // batch. This is to ensure that each thread does exactly the same
+          // number of ops
+          int multiget_batch_size = static_cast<int>(
+              std::min(static_cast<uint64_t>(thread->rand.Uniform(64)),
+                       FLAGS_ops_per_thread - i - 1));
+          // If its the last iteration, ensure that multiget_batch_size is 1
+          multiget_batch_size = std::max(multiget_batch_size, 1);
+          rand_keys = GenerateNKeys(thread, multiget_batch_size, i);
+          TestMultiGet(thread, read_opts, rand_column_families, rand_keys);
+          i += multiget_batch_size - 1;
+        } else {
+          TestGet(thread, read_opts, rand_column_families, rand_keys);
+        }
+      } else if (prob_op < prefix_bound) {
+        assert(static_cast<int>(FLAGS_readpercent) <= prob_op);
+        // OPERATION prefix scan
+        // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are
+        // (8 - FLAGS_prefix_size) bytes besides the prefix. So there will
+        // be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same
+        // prefix
+        TestPrefixScan(thread, read_opts, rand_column_families, rand_keys);
+      } else if (prob_op < write_bound) {
+        assert(prefix_bound <= prob_op);
+        // OPERATION write
+        TestPut(thread, write_opts, read_opts, rand_column_families, rand_keys,
+                value);
+      } else if (prob_op < del_bound) {
+        assert(write_bound <= prob_op);
+        // OPERATION delete
+        TestDelete(thread, write_opts, rand_column_families, rand_keys);
+      } else if (prob_op < delrange_bound) {
+        assert(del_bound <= prob_op);
+        // OPERATION delete range
+        TestDeleteRange(thread, write_opts, rand_column_families, rand_keys);
+      } else if (prob_op < iterate_bound) {
+        assert(delrange_bound <= prob_op);
+        // OPERATION iterate
+        if (!FLAGS_skip_verifydb &&
+            thread->rand.OneInOpt(
+                FLAGS_verify_iterator_with_expected_state_one_in)) {
+          TestIterateAgainstExpected(thread, read_opts, rand_column_families,
+                                     rand_keys);
+        } else {
+          int num_seeks = static_cast<int>(
+              std::min(static_cast<uint64_t>(thread->rand.Uniform(4)),
+                       FLAGS_ops_per_thread - i - 1));
+          rand_keys = GenerateNKeys(thread, num_seeks, i);
+          i += num_seeks - 1;
+          TestIterate(thread, read_opts, rand_column_families, rand_keys);
+        }
+      } else {
+        assert(iterate_bound <= prob_op);
+        TestCustomOperations(thread, rand_column_families);
+      }
+      thread->stats.FinishedSingleOp();
+    }
+  }
+  while (!thread->snapshot_queue.empty()) {
+    db_->ReleaseSnapshot(thread->snapshot_queue.front().second.snapshot);
+    delete thread->snapshot_queue.front().second.key_vec;
+    thread->snapshot_queue.pop();
+  }
+
+  thread->stats.Stop();
+}
+
+#ifndef ROCKSDB_LITE
+// Generated a list of keys that close to boundaries of SST keys.
+// If there isn't any SST file in the DB, return empty list.
+std::vector<std::string> StressTest::GetWhiteBoxKeys(ThreadState* thread,
+                                                     DB* db,
+                                                     ColumnFamilyHandle* cfh,
+                                                     size_t num_keys) {
+  ColumnFamilyMetaData cfmd;
+  db->GetColumnFamilyMetaData(cfh, &cfmd);
+  std::vector<std::string> boundaries;
+  for (const LevelMetaData& lmd : cfmd.levels) {
+    for (const SstFileMetaData& sfmd : lmd.files) {
+      // If FLAGS_user_timestamp_size > 0, then both smallestkey and largestkey
+      // have timestamps.
+      const auto& skey = sfmd.smallestkey;
+      const auto& lkey = sfmd.largestkey;
+      assert(skey.size() >= FLAGS_user_timestamp_size);
+      assert(lkey.size() >= FLAGS_user_timestamp_size);
+      boundaries.push_back(
+          skey.substr(0, skey.size() - FLAGS_user_timestamp_size));
+      boundaries.push_back(
+          lkey.substr(0, lkey.size() - FLAGS_user_timestamp_size));
+    }
+  }
+  if (boundaries.empty()) {
+    return {};
+  }
+
+  std::vector<std::string> ret;
+  for (size_t j = 0; j < num_keys; j++) {
+    std::string k =
+        boundaries[thread->rand.Uniform(static_cast<int>(boundaries.size()))];
+    if (thread->rand.OneIn(3)) {
+      // Reduce one byte from the string
+      for (int i = static_cast<int>(k.length()) - 1; i >= 0; i--) {
+        uint8_t cur = k[i];
+        if (cur > 0) {
+          k[i] = static_cast<char>(cur - 1);
+          break;
+        } else if (i > 0) {
+          k[i] = 0xFFu;
+        }
+      }
+    } else if (thread->rand.OneIn(2)) {
+      // Add one byte to the string
+      for (int i = static_cast<int>(k.length()) - 1; i >= 0; i--) {
+        uint8_t cur = k[i];
+        if (cur < 255) {
+          k[i] = static_cast<char>(cur + 1);
+          break;
+        } else if (i > 0) {
+          k[i] = 0x00;
+        }
+      }
+    }
+    ret.push_back(k);
+  }
+  return ret;
+}
+#endif  // !ROCKSDB_LITE
+
+// Given a key K, this creates an iterator which scans to K and then
+// does a random sequence of Next/Prev operations.
+Status StressTest::TestIterate(ThreadState* thread,
+                               const ReadOptions& read_opts,
+                               const std::vector<int>& rand_column_families,
+                               const std::vector<int64_t>& rand_keys) {
+  assert(!rand_column_families.empty());
+  assert(!rand_keys.empty());
+
+  ManagedSnapshot snapshot_guard(db_);
+
+  ReadOptions ro = read_opts;
+  ro.snapshot = snapshot_guard.snapshot();
+
+  std::string read_ts_str;
+  Slice read_ts_slice;
+  MaybeUseOlderTimestampForRangeScan(thread, read_ts_str, read_ts_slice, ro);
+
+  bool expect_total_order = false;
+  if (thread->rand.OneIn(16)) {
+    // When prefix extractor is used, it's useful to cover total order seek.
+    ro.total_order_seek = true;
+    expect_total_order = true;
+  } else if (thread->rand.OneIn(4)) {
+    ro.total_order_seek = false;
+    ro.auto_prefix_mode = true;
+    expect_total_order = true;
+  } else if (options_.prefix_extractor.get() == nullptr) {
+    expect_total_order = true;
+  }
+
+  std::string upper_bound_str;
+  Slice upper_bound;
+  if (thread->rand.OneIn(16)) {
+    // With a 1/16 chance, set an iterator upper bound.
+    // Note: upper_bound can be smaller than the seek key.
+    const int64_t rand_upper_key = GenerateOneKey(thread, FLAGS_ops_per_thread);
+    upper_bound_str = Key(rand_upper_key);
+    upper_bound = Slice(upper_bound_str);
+    ro.iterate_upper_bound = &upper_bound;
+  }
+  std::string lower_bound_str;
+  Slice lower_bound;
+  if (thread->rand.OneIn(16)) {
+    // With a 1/16 chance, enable iterator lower bound.
+    // Note: lower_bound can be greater than the seek key.
+    const int64_t rand_lower_key = GenerateOneKey(thread, FLAGS_ops_per_thread);
+    lower_bound_str = Key(rand_lower_key);
+    lower_bound = Slice(lower_bound_str);
+    ro.iterate_lower_bound = &lower_bound;
+  }
+
+  ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]];
+  assert(cfh);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ro, cfh));
+
+  std::vector<std::string> key_strs;
+  if (thread->rand.OneIn(16)) {
+    // Generate keys close to lower or upper bound of SST files.
+    key_strs = GetWhiteBoxKeys(thread, db_, cfh, rand_keys.size());
+  }
+  if (key_strs.empty()) {
+    // Use the random keys passed in.
+    for (int64_t rkey : rand_keys) {
+      key_strs.push_back(Key(rkey));
+    }
+  }
+
+  std::string op_logs;
+  constexpr size_t kOpLogsLimit = 10000;
+
+  for (const std::string& key_str : key_strs) {
+    if (op_logs.size() > kOpLogsLimit) {
+      // Shouldn't take too much memory for the history log. Clear it.
+      op_logs = "(cleared...)\n";
+    }
+
+    if (ro.iterate_upper_bound != nullptr && thread->rand.OneIn(2)) {
+      // With a 1/2 chance, change the upper bound.
+      // It is possible that it is changed before first use, but there is no
+      // problem with that.
+      const int64_t rand_upper_key =
+          GenerateOneKey(thread, FLAGS_ops_per_thread);
+      upper_bound_str = Key(rand_upper_key);
+      upper_bound = Slice(upper_bound_str);
+    }
+    if (ro.iterate_lower_bound != nullptr && thread->rand.OneIn(4)) {
+      // With a 1/4 chance, change the lower bound.
+      // It is possible that it is changed before first use, but there is no
+      // problem with that.
+      const int64_t rand_lower_key =
+          GenerateOneKey(thread, FLAGS_ops_per_thread);
+      lower_bound_str = Key(rand_lower_key);
+      lower_bound = Slice(lower_bound_str);
+    }
+
+    // Record some options to op_logs
+    op_logs += "total_order_seek: ";
+    op_logs += (ro.total_order_seek ? "1 " : "0 ");
+    op_logs += "auto_prefix_mode: ";
+    op_logs += (ro.auto_prefix_mode ? "1 " : "0 ");
+    if (ro.iterate_upper_bound != nullptr) {
+      op_logs += "ub: " + upper_bound.ToString(true) + " ";
+    }
+    if (ro.iterate_lower_bound != nullptr) {
+      op_logs += "lb: " + lower_bound.ToString(true) + " ";
+    }
+
+    // Set up an iterator, perform the same operations without bounds and with
+    // total order seek, and compare the results. This is to identify bugs
+    // related to bounds, prefix extractor, or reseeking. Sometimes we are
+    // comparing iterators with the same set-up, and it doesn't hurt to check
+    // them to be equal.
+    //
+    // This `ReadOptions` is for validation purposes. Ignore
+    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+    ReadOptions cmp_ro;
+    cmp_ro.timestamp = ro.timestamp;
+    cmp_ro.iter_start_ts = ro.iter_start_ts;
+    cmp_ro.snapshot = snapshot_guard.snapshot();
+    cmp_ro.total_order_seek = true;
+
+    ColumnFamilyHandle* const cmp_cfh =
+        GetControlCfh(thread, rand_column_families[0]);
+    assert(cmp_cfh);
+
+    std::unique_ptr<Iterator> cmp_iter(db_->NewIterator(cmp_ro, cmp_cfh));
+
+    bool diverged = false;
+
+    Slice key(key_str);
+
+    const bool support_seek_first_or_last = expect_total_order;
+
+    LastIterateOp last_op;
+    if (support_seek_first_or_last && thread->rand.OneIn(100)) {
+      iter->SeekToFirst();
+      cmp_iter->SeekToFirst();
+      last_op = kLastOpSeekToFirst;
+      op_logs += "STF ";
+    } else if (support_seek_first_or_last && thread->rand.OneIn(100)) {
+      iter->SeekToLast();
+      cmp_iter->SeekToLast();
+      last_op = kLastOpSeekToLast;
+      op_logs += "STL ";
+    } else if (thread->rand.OneIn(8)) {
+      iter->SeekForPrev(key);
+      cmp_iter->SeekForPrev(key);
+      last_op = kLastOpSeekForPrev;
+      op_logs += "SFP " + key.ToString(true) + " ";
+    } else {
+      iter->Seek(key);
+      cmp_iter->Seek(key);
+      last_op = kLastOpSeek;
+      op_logs += "S " + key.ToString(true) + " ";
+    }
+
+    VerifyIterator(thread, cmp_cfh, ro, iter.get(), cmp_iter.get(), last_op,
+                   key, op_logs, &diverged);
+
+    const bool no_reverse =
+        (FLAGS_memtablerep == "prefix_hash" && !expect_total_order);
+    for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); ++i) {
+      if (no_reverse || thread->rand.OneIn(2)) {
+        iter->Next();
+        if (!diverged) {
+          assert(cmp_iter->Valid());
+          cmp_iter->Next();
+        }
+        op_logs += "N";
+      } else {
+        iter->Prev();
+        if (!diverged) {
+          assert(cmp_iter->Valid());
+          cmp_iter->Prev();
+        }
+        op_logs += "P";
+      }
+
+      last_op = kLastOpNextOrPrev;
+
+      VerifyIterator(thread, cmp_cfh, ro, iter.get(), cmp_iter.get(), last_op,
+                     key, op_logs, &diverged);
+    }
+
+    thread->stats.AddIterations(1);
+
+    op_logs += "; ";
+  }
+
+  return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+// Test the return status of GetLiveFiles.
+Status StressTest::VerifyGetLiveFiles() const {
+  std::vector<std::string> live_file;
+  uint64_t manifest_size = 0;
+  return db_->GetLiveFiles(live_file, &manifest_size);
+}
+
+// Test the return status of GetSortedWalFiles.
+Status StressTest::VerifyGetSortedWalFiles() const {
+  VectorLogPtr log_ptr;
+  return db_->GetSortedWalFiles(log_ptr);
+}
+
+// Test the return status of GetCurrentWalFile.
+Status StressTest::VerifyGetCurrentWalFile() const {
+  std::unique_ptr<LogFile> cur_wal_file;
+  return db_->GetCurrentWalFile(&cur_wal_file);
+}
+#endif  // !ROCKSDB_LITE
+
+// Compare the two iterator, iter and cmp_iter are in the same position,
+// unless iter might be made invalidate or undefined because of
+// upper or lower bounds, or prefix extractor.
+// Will flag failure if the verification fails.
+// diverged = true if the two iterator is already diverged.
+// True if verification passed, false if not.
+void StressTest::VerifyIterator(ThreadState* thread,
+                                ColumnFamilyHandle* cmp_cfh,
+                                const ReadOptions& ro, Iterator* iter,
+                                Iterator* cmp_iter, LastIterateOp op,
+                                const Slice& seek_key,
+                                const std::string& op_logs, bool* diverged) {
+  assert(diverged);
+
+  if (*diverged) {
+    return;
+  }
+
+  if (ro.iter_start_ts != nullptr) {
+    assert(FLAGS_user_timestamp_size > 0);
+    // We currently do not verify iterator when dumping history of internal
+    // keys.
+    *diverged = true;
+    return;
+  }
+
+  if (op == kLastOpSeekToFirst && ro.iterate_lower_bound != nullptr) {
+    // SeekToFirst() with lower bound is not well defined.
+    *diverged = true;
+    return;
+  } else if (op == kLastOpSeekToLast && ro.iterate_upper_bound != nullptr) {
+    // SeekToLast() with higher bound is not well defined.
+    *diverged = true;
+    return;
+  } else if (op == kLastOpSeek && ro.iterate_lower_bound != nullptr &&
+             (options_.comparator->CompareWithoutTimestamp(
+                  *ro.iterate_lower_bound, /*a_has_ts=*/false, seek_key,
+                  /*b_has_ts=*/false) >= 0 ||
+              (ro.iterate_upper_bound != nullptr &&
+               options_.comparator->CompareWithoutTimestamp(
+                   *ro.iterate_lower_bound, /*a_has_ts=*/false,
+                   *ro.iterate_upper_bound, /*b_has_ts*/ false) >= 0))) {
+    // Lower bound behavior is not well defined if it is larger than
+    // seek key or upper bound. Disable the check for now.
+    *diverged = true;
+    return;
+  } else if (op == kLastOpSeekForPrev && ro.iterate_upper_bound != nullptr &&
+             (options_.comparator->CompareWithoutTimestamp(
+                  *ro.iterate_upper_bound, /*a_has_ts=*/false, seek_key,
+                  /*b_has_ts=*/false) <= 0 ||
+              (ro.iterate_lower_bound != nullptr &&
+               options_.comparator->CompareWithoutTimestamp(
+                   *ro.iterate_lower_bound, /*a_has_ts=*/false,
+                   *ro.iterate_upper_bound, /*b_has_ts=*/false) >= 0))) {
+    // Uppder bound behavior is not well defined if it is smaller than
+    // seek key or lower bound. Disable the check for now.
+    *diverged = true;
+    return;
+  }
+
+  const SliceTransform* pe = (ro.total_order_seek || ro.auto_prefix_mode)
+                                 ? nullptr
+                                 : options_.prefix_extractor.get();
+  const Comparator* cmp = options_.comparator;
+
+  if (iter->Valid() && !cmp_iter->Valid()) {
+    if (pe != nullptr) {
+      if (!pe->InDomain(seek_key)) {
+        // Prefix seek a non-in-domain key is undefined. Skip checking for
+        // this scenario.
+        *diverged = true;
+        return;
+      } else if (!pe->InDomain(iter->key())) {
+        // out of range is iterator key is not in domain anymore.
+        *diverged = true;
+        return;
+      } else if (pe->Transform(iter->key()) != pe->Transform(seek_key)) {
+        *diverged = true;
+        return;
+      }
+    }
+    fprintf(stderr,
+            "Control interator is invalid but iterator has key %s "
+            "%s\n",
+            iter->key().ToString(true).c_str(), op_logs.c_str());
+
+    *diverged = true;
+  } else if (cmp_iter->Valid()) {
+    // Iterator is not valid. It can be legimate if it has already been
+    // out of upper or lower bound, or filtered out by prefix iterator.
+    const Slice& total_order_key = cmp_iter->key();
+
+    if (pe != nullptr) {
+      if (!pe->InDomain(seek_key)) {
+        // Prefix seek a non-in-domain key is undefined. Skip checking for
+        // this scenario.
+        *diverged = true;
+        return;
+      }
+
+      if (!pe->InDomain(total_order_key) ||
+          pe->Transform(total_order_key) != pe->Transform(seek_key)) {
+        // If the prefix is exhausted, the only thing needs to check
+        // is the iterator isn't return a position in prefix.
+        // Either way, checking can stop from here.
+        *diverged = true;
+        if (!iter->Valid() || !pe->InDomain(iter->key()) ||
+            pe->Transform(iter->key()) != pe->Transform(seek_key)) {
+          return;
+        }
+        fprintf(stderr,
+                "Iterator stays in prefix but contol doesn't"
+                " iterator key %s control iterator key %s %s\n",
+                iter->key().ToString(true).c_str(),
+                cmp_iter->key().ToString(true).c_str(), op_logs.c_str());
+      }
+    }
+    // Check upper or lower bounds.
+    if (!*diverged) {
+      if ((iter->Valid() && iter->key() != cmp_iter->key()) ||
+          (!iter->Valid() &&
+           (ro.iterate_upper_bound == nullptr ||
+            cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false,
+                                         *ro.iterate_upper_bound,
+                                         /*b_has_ts=*/false) < 0) &&
+           (ro.iterate_lower_bound == nullptr ||
+            cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false,
+                                         *ro.iterate_lower_bound,
+                                         /*b_has_ts=*/false) > 0))) {
+        fprintf(stderr,
+                "Iterator diverged from control iterator which"
+                " has value %s %s\n",
+                total_order_key.ToString(true).c_str(), op_logs.c_str());
+        if (iter->Valid()) {
+          fprintf(stderr, "iterator has value %s\n",
+                  iter->key().ToString(true).c_str());
+        } else {
+          fprintf(stderr, "iterator is not valid\n");
+        }
+        *diverged = true;
+      }
+    }
+  }
+
+  if (!*diverged && iter->Valid()) {
+    const WideColumns expected_columns =
+        GenerateExpectedWideColumns(GetValueBase(iter->value()), iter->value());
+    if (iter->columns() != expected_columns) {
+      fprintf(stderr, "Value and columns inconsistent for iterator: %s\n",
+              DebugString(iter->value(), iter->columns(), expected_columns)
+                  .c_str());
+
+      *diverged = true;
+    }
+  }
+
+  if (*diverged) {
+    fprintf(stderr, "Control CF %s\n", cmp_cfh->GetName().c_str());
+    thread->stats.AddErrors(1);
+    // Fail fast to preserve the DB state.
+    thread->shared->SetVerificationFailure();
+  }
+}
+
+#ifdef ROCKSDB_LITE
+Status StressTest::TestBackupRestore(
+    ThreadState* /* thread */,
+    const std::vector<int>& /* rand_column_families */,
+    const std::vector<int64_t>& /* rand_keys */) {
+  assert(false);
+  fprintf(stderr,
+          "RocksDB lite does not support "
+          "TestBackupRestore\n");
+  std::terminate();
+}
+
+Status StressTest::TestCheckpoint(
+    ThreadState* /* thread */,
+    const std::vector<int>& /* rand_column_families */,
+    const std::vector<int64_t>& /* rand_keys */) {
+  assert(false);
+  fprintf(stderr,
+          "RocksDB lite does not support "
+          "TestCheckpoint\n");
+  std::terminate();
+}
+
+void StressTest::TestCompactFiles(ThreadState* /* thread */,
+                                  ColumnFamilyHandle* /* column_family */) {
+  assert(false);
+  fprintf(stderr,
+          "RocksDB lite does not support "
+          "CompactFiles\n");
+  std::terminate();
+}
+#else   // ROCKSDB_LITE
+Status StressTest::TestBackupRestore(
+    ThreadState* thread, const std::vector<int>& rand_column_families,
+    const std::vector<int64_t>& rand_keys) {
+  std::vector<std::unique_ptr<MutexLock>> locks;
+  if (ShouldAcquireMutexOnKey()) {
+    for (int rand_column_family : rand_column_families) {
+      // `rand_keys[0]` on each chosen CF will be verified.
+      locks.emplace_back(new MutexLock(
+          thread->shared->GetMutexForKey(rand_column_family, rand_keys[0])));
+    }
+  }
+
+  const std::string backup_dir =
+      FLAGS_db + "/.backup" + std::to_string(thread->tid);
+  const std::string restore_dir =
+      FLAGS_db + "/.restore" + std::to_string(thread->tid);
+  BackupEngineOptions backup_opts(backup_dir);
+  // For debugging, get info_log from live options
+  backup_opts.info_log = db_->GetDBOptions().info_log.get();
+  if (thread->rand.OneIn(10)) {
+    backup_opts.share_table_files = false;
+  } else {
+    backup_opts.share_table_files = true;
+    if (thread->rand.OneIn(5)) {
+      backup_opts.share_files_with_checksum = false;
+    } else {
+      backup_opts.share_files_with_checksum = true;
+      if (thread->rand.OneIn(2)) {
+        // old
+        backup_opts.share_files_with_checksum_naming =
+            BackupEngineOptions::kLegacyCrc32cAndFileSize;
+      } else {
+        // new
+        backup_opts.share_files_with_checksum_naming =
+            BackupEngineOptions::kUseDbSessionId;
+      }
+      if (thread->rand.OneIn(2)) {
+        backup_opts.share_files_with_checksum_naming =
+            backup_opts.share_files_with_checksum_naming |
+            BackupEngineOptions::kFlagIncludeFileSize;
+      }
+    }
+  }
+  if (thread->rand.OneIn(2)) {
+    backup_opts.schema_version = 1;
+  } else {
+    backup_opts.schema_version = 2;
+  }
+  BackupEngine* backup_engine = nullptr;
+  std::string from = "a backup/restore operation";
+  Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
+  if (!s.ok()) {
+    from = "BackupEngine::Open";
+  }
+  if (s.ok()) {
+    if (backup_opts.schema_version >= 2 && thread->rand.OneIn(2)) {
+      TEST_BackupMetaSchemaOptions test_opts;
+      test_opts.crc32c_checksums = thread->rand.OneIn(2) == 0;
+      test_opts.file_sizes = thread->rand.OneIn(2) == 0;
+      TEST_SetBackupMetaSchemaOptions(backup_engine, test_opts);
+    }
+    CreateBackupOptions create_opts;
+    if (FLAGS_disable_wal) {
+      // The verification can only work when latest value of `key` is backed up,
+      // which requires flushing in case of WAL disabled.
+      //
+      // Note this triggers a flush with a key lock held. Meanwhile, operations
+      // like flush/compaction may attempt to grab key locks like in
+      // `DbStressCompactionFilter`. The philosophy around preventing deadlock
+      // is the background operation key lock acquisition only tries but does
+      // not wait for the lock. So here in the foreground it is OK to hold the
+      // lock and wait on a background operation (flush).
+      create_opts.flush_before_backup = true;
+    }
+    s = backup_engine->CreateNewBackup(create_opts, db_);
+    if (!s.ok()) {
+      from = "BackupEngine::CreateNewBackup";
+    }
+  }
+  if (s.ok()) {
+    delete backup_engine;
+    backup_engine = nullptr;
+    s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
+    if (!s.ok()) {
+      from = "BackupEngine::Open (again)";
+    }
+  }
+  std::vector<BackupInfo> backup_info;
+  // If inplace_not_restore, we verify the backup by opening it as a
+  // read-only DB. If !inplace_not_restore, we restore it to a temporary
+  // directory for verification.
+  bool inplace_not_restore = thread->rand.OneIn(3);
+  if (s.ok()) {
+    backup_engine->GetBackupInfo(&backup_info,
+                                 /*include_file_details*/ inplace_not_restore);
+    if (backup_info.empty()) {
+      s = Status::NotFound("no backups found");
+      from = "BackupEngine::GetBackupInfo";
+    }
+  }
+  if (s.ok() && thread->rand.OneIn(2)) {
+    s = backup_engine->VerifyBackup(
+        backup_info.front().backup_id,
+        thread->rand.OneIn(2) /* verify_with_checksum */);
+    if (!s.ok()) {
+      from = "BackupEngine::VerifyBackup";
+    }
+  }
+  const bool allow_persistent = thread->tid == 0;  // not too many
+  bool from_latest = false;
+  int count = static_cast<int>(backup_info.size());
+  if (s.ok() && !inplace_not_restore) {
+    if (count > 1) {
+      s = backup_engine->RestoreDBFromBackup(
+          RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
+          restore_dir /* db_dir */, restore_dir /* wal_dir */);
+      if (!s.ok()) {
+        from = "BackupEngine::RestoreDBFromBackup";
+      }
+    } else {
+      from_latest = true;
+      s = backup_engine->RestoreDBFromLatestBackup(RestoreOptions(),
+                                                   restore_dir /* db_dir */,
+                                                   restore_dir /* wal_dir */);
+      if (!s.ok()) {
+        from = "BackupEngine::RestoreDBFromLatestBackup";
+      }
+    }
+  }
+  if (s.ok() && !inplace_not_restore) {
+    // Purge early if restoring, to ensure the restored directory doesn't
+    // have some secret dependency on the backup directory.
+    uint32_t to_keep = 0;
+    if (allow_persistent) {
+      // allow one thread to keep up to 2 backups
+      to_keep = thread->rand.Uniform(3);
+    }
+    s = backup_engine->PurgeOldBackups(to_keep);
+    if (!s.ok()) {
+      from = "BackupEngine::PurgeOldBackups";
+    }
+  }
+  DB* restored_db = nullptr;
+  std::vector<ColumnFamilyHandle*> restored_cf_handles;
+  // Not yet implemented: opening restored BlobDB or TransactionDB
+  if (s.ok() && !FLAGS_use_txn && !FLAGS_use_blob_db) {
+    Options restore_options(options_);
+    restore_options.best_efforts_recovery = false;
+    restore_options.listeners.clear();
+    // Avoid dangling/shared file descriptors, for reliable destroy
+    restore_options.sst_file_manager = nullptr;
+    std::vector<ColumnFamilyDescriptor> cf_descriptors;
+    // TODO(ajkr): `column_family_names_` is not safe to access here when
+    // `clear_column_family_one_in != 0`. But we can't easily switch to
+    // `ListColumnFamilies` to get names because it won't necessarily give
+    // the same order as `column_family_names_`.
+    assert(FLAGS_clear_column_family_one_in == 0);
+    for (auto name : column_family_names_) {
+      cf_descriptors.emplace_back(name, ColumnFamilyOptions(restore_options));
+    }
+    if (inplace_not_restore) {
+      BackupInfo& info = backup_info[thread->rand.Uniform(count)];
+      restore_options.env = info.env_for_open.get();
+      s = DB::OpenForReadOnly(DBOptions(restore_options), info.name_for_open,
+                              cf_descriptors, &restored_cf_handles,
+                              &restored_db);
+      if (!s.ok()) {
+        from = "DB::OpenForReadOnly in backup/restore";
+      }
+    } else {
+      s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors,
+                   &restored_cf_handles, &restored_db);
+      if (!s.ok()) {
+        from = "DB::Open in backup/restore";
+      }
+    }
+  }
+  // Note the column families chosen by `rand_column_families` cannot be
+  // dropped while the locks for `rand_keys` are held. So we should not have
+  // to worry about accessing those column families throughout this function.
+  //
+  // For simplicity, currently only verifies existence/non-existence of a
+  // single key
+  for (size_t i = 0; restored_db && s.ok() && i < rand_column_families.size();
+       ++i) {
+    std::string key_str = Key(rand_keys[0]);
+    Slice key = key_str;
+    std::string restored_value;
+    // This `ReadOptions` is for validation purposes. Ignore
+    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+    ReadOptions read_opts;
+    std::string ts_str;
+    Slice ts;
+    if (FLAGS_user_timestamp_size > 0) {
+      ts_str = GetNowNanos();
+      ts = ts_str;
+      read_opts.timestamp = &ts;
+    }
+    Status get_status = restored_db->Get(
+        read_opts, restored_cf_handles[rand_column_families[i]], key,
+        &restored_value);
+    bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
+    if (get_status.ok()) {
+      if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
+        std::ostringstream oss;
+        oss << "0x" << key.ToString(true)
+            << " exists in restore but not in original db";
+        s = Status::Corruption(oss.str());
+      }
+    } else if (get_status.IsNotFound()) {
+      if (exists && from_latest && ShouldAcquireMutexOnKey()) {
+        std::ostringstream oss;
+        oss << "0x" << key.ToString(true)
+            << " exists in original db but not in restore";
+        s = Status::Corruption(oss.str());
+      }
+    } else {
+      s = get_status;
+      if (!s.ok()) {
+        from = "DB::Get in backup/restore";
+      }
+    }
+  }
+  if (restored_db != nullptr) {
+    for (auto* cf_handle : restored_cf_handles) {
+      restored_db->DestroyColumnFamilyHandle(cf_handle);
+    }
+    delete restored_db;
+    restored_db = nullptr;
+  }
+  if (s.ok() && inplace_not_restore) {
+    // Purge late if inplace open read-only
+    uint32_t to_keep = 0;
+    if (allow_persistent) {
+      // allow one thread to keep up to 2 backups
+      to_keep = thread->rand.Uniform(3);
+    }
+    s = backup_engine->PurgeOldBackups(to_keep);
+    if (!s.ok()) {
+      from = "BackupEngine::PurgeOldBackups";
+    }
+  }
+  if (backup_engine != nullptr) {
+    delete backup_engine;
+    backup_engine = nullptr;
+  }
+  if (s.ok()) {
+    // Preserve directories on failure, or allowed persistent backup
+    if (!allow_persistent) {
+      s = DestroyDir(db_stress_env, backup_dir);
+      if (!s.ok()) {
+        from = "Destroy backup dir";
+      }
+    }
+  }
+  if (s.ok()) {
+    s = DestroyDir(db_stress_env, restore_dir);
+    if (!s.ok()) {
+      from = "Destroy restore dir";
+    }
+  }
+  if (!s.ok()) {
+    fprintf(stderr, "Failure in %s with: %s\n", from.c_str(),
+            s.ToString().c_str());
+  }
+  return s;
+}
+
+Status StressTest::TestApproximateSize(
+    ThreadState* thread, uint64_t iteration,
+    const std::vector<int>& rand_column_families,
+    const std::vector<int64_t>& rand_keys) {
+  // rand_keys likely only has one key. Just use the first one.
+  assert(!rand_keys.empty());
+  assert(!rand_column_families.empty());
+  int64_t key1 = rand_keys[0];
+  int64_t key2;
+  if (thread->rand.OneIn(2)) {
+    // Two totally random keys. This tends to cover large ranges.
+    key2 = GenerateOneKey(thread, iteration);
+    if (key2 < key1) {
+      std::swap(key1, key2);
+    }
+  } else {
+    // Unless users pass a very large FLAGS_max_key, it we should not worry
+    // about overflow. It is for testing, so we skip the overflow checking
+    // for simplicity.
+    key2 = key1 + static_cast<int64_t>(thread->rand.Uniform(1000));
+  }
+  std::string key1_str = Key(key1);
+  std::string key2_str = Key(key2);
+  Range range{Slice(key1_str), Slice(key2_str)};
+  SizeApproximationOptions sao;
+  sao.include_memtables = thread->rand.OneIn(2);
+  if (sao.include_memtables) {
+    sao.include_files = thread->rand.OneIn(2);
+  }
+  if (thread->rand.OneIn(2)) {
+    if (thread->rand.OneIn(2)) {
+      sao.files_size_error_margin = 0.0;
+    } else {
+      sao.files_size_error_margin =
+          static_cast<double>(thread->rand.Uniform(3));
+    }
+  }
+  uint64_t result;
+  return db_->GetApproximateSizes(
+      sao, column_families_[rand_column_families[0]], &range, 1, &result);
+}
+
+Status StressTest::TestCheckpoint(ThreadState* thread,
+                                  const std::vector<int>& rand_column_families,
+                                  const std::vector<int64_t>& rand_keys) {
+  std::vector<std::unique_ptr<MutexLock>> locks;
+  if (ShouldAcquireMutexOnKey()) {
+    for (int rand_column_family : rand_column_families) {
+      // `rand_keys[0]` on each chosen CF will be verified.
+      locks.emplace_back(new MutexLock(
+          thread->shared->GetMutexForKey(rand_column_family, rand_keys[0])));
+    }
+  }
+
+  std::string checkpoint_dir =
+      FLAGS_db + "/.checkpoint" + std::to_string(thread->tid);
+  Options tmp_opts(options_);
+  tmp_opts.listeners.clear();
+  tmp_opts.env = db_stress_env;
+
+  DestroyDB(checkpoint_dir, tmp_opts);
+
+  if (db_stress_env->FileExists(checkpoint_dir).ok()) {
+    // If the directory might still exist, try to delete the files one by one.
+    // Likely a trash file is still there.
+    Status my_s = DestroyDir(db_stress_env, checkpoint_dir);
+    if (!my_s.ok()) {
+      fprintf(stderr, "Fail to destory directory before checkpoint: %s",
+              my_s.ToString().c_str());
+    }
+  }
+
+  Checkpoint* checkpoint = nullptr;
+  Status s = Checkpoint::Create(db_, &checkpoint);
+  if (s.ok()) {
+    s = checkpoint->CreateCheckpoint(checkpoint_dir);
+    if (!s.ok()) {
+      fprintf(stderr, "Fail to create checkpoint to %s\n",
+              checkpoint_dir.c_str());
+      std::vector<std::string> files;
+      Status my_s = db_stress_env->GetChildren(checkpoint_dir, &files);
+      if (my_s.ok()) {
+        for (const auto& f : files) {
+          fprintf(stderr, " %s\n", f.c_str());
+        }
+      } else {
+        fprintf(stderr, "Fail to get files under the directory to %s\n",
+                my_s.ToString().c_str());
+      }
+    }
+  }
+  delete checkpoint;
+  checkpoint = nullptr;
+  std::vector<ColumnFamilyHandle*> cf_handles;
+  DB* checkpoint_db = nullptr;
+  if (s.ok()) {
+    Options options(options_);
+    options.best_efforts_recovery = false;
+    options.listeners.clear();
+    // Avoid race condition in trash handling after delete checkpoint_db
+    options.sst_file_manager.reset();
+    std::vector<ColumnFamilyDescriptor> cf_descs;
+    // TODO(ajkr): `column_family_names_` is not safe to access here when
+    // `clear_column_family_one_in != 0`. But we can't easily switch to
+    // `ListColumnFamilies` to get names because it won't necessarily give
+    // the same order as `column_family_names_`.
+    assert(FLAGS_clear_column_family_one_in == 0);
+    if (FLAGS_clear_column_family_one_in == 0) {
+      for (const auto& name : column_family_names_) {
+        cf_descs.emplace_back(name, ColumnFamilyOptions(options));
+      }
+      s = DB::OpenForReadOnly(DBOptions(options), checkpoint_dir, cf_descs,
+                              &cf_handles, &checkpoint_db);
+    }
+  }
+  if (checkpoint_db != nullptr) {
+    // Note the column families chosen by `rand_column_families` cannot be
+    // dropped while the locks for `rand_keys` are held. So we should not have
+    // to worry about accessing those column families throughout this function.
+    for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) {
+      std::string key_str = Key(rand_keys[0]);
+      Slice key = key_str;
+      std::string ts_str;
+      Slice ts;
+      ReadOptions read_opts;
+      if (FLAGS_user_timestamp_size > 0) {
+        ts_str = GetNowNanos();
+        ts = ts_str;
+        read_opts.timestamp = &ts;
+      }
+      std::string value;
+      Status get_status = checkpoint_db->Get(
+          read_opts, cf_handles[rand_column_families[i]], key, &value);
+      bool exists =
+          thread->shared->Exists(rand_column_families[i], rand_keys[0]);
+      if (get_status.ok()) {
+        if (!exists && ShouldAcquireMutexOnKey()) {
+          std::ostringstream oss;
+          oss << "0x" << key.ToString(true) << " exists in checkpoint "
+              << checkpoint_dir << " but not in original db";
+          s = Status::Corruption(oss.str());
+        }
+      } else if (get_status.IsNotFound()) {
+        if (exists && ShouldAcquireMutexOnKey()) {
+          std::ostringstream oss;
+          oss << "0x" << key.ToString(true)
+              << " exists in original db but not in checkpoint "
+              << checkpoint_dir;
+          s = Status::Corruption(oss.str());
+        }
+      } else {
+        s = get_status;
+      }
+    }
+    for (auto cfh : cf_handles) {
+      delete cfh;
+    }
+    cf_handles.clear();
+    delete checkpoint_db;
+    checkpoint_db = nullptr;
+  }
+
+  if (!s.ok()) {
+    fprintf(stderr, "A checkpoint operation failed with: %s\n",
+            s.ToString().c_str());
+  } else {
+    DestroyDB(checkpoint_dir, tmp_opts);
+  }
+  return s;
+}
+
+void StressTest::TestGetProperty(ThreadState* thread) const {
+  std::unordered_set<std::string> levelPropertyNames = {
+      DB::Properties::kAggregatedTablePropertiesAtLevel,
+      DB::Properties::kCompressionRatioAtLevelPrefix,
+      DB::Properties::kNumFilesAtLevelPrefix,
+  };
+  std::unordered_set<std::string> unknownPropertyNames = {
+      DB::Properties::kEstimateOldestKeyTime,
+      DB::Properties::kOptionsStatistics,
+      DB::Properties::
+          kLiveSstFilesSizeAtTemperature,  // similar to levelPropertyNames, it
+                                           // requires a number suffix
+  };
+  unknownPropertyNames.insert(levelPropertyNames.begin(),
+                              levelPropertyNames.end());
+
+  std::unordered_set<std::string> blobCachePropertyNames = {
+      DB::Properties::kBlobCacheCapacity,
+      DB::Properties::kBlobCacheUsage,
+      DB::Properties::kBlobCachePinnedUsage,
+  };
+  if (db_->GetOptions().blob_cache == nullptr) {
+    unknownPropertyNames.insert(blobCachePropertyNames.begin(),
+                                blobCachePropertyNames.end());
+  }
+
+  std::string prop;
+  for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) {
+    bool res = db_->GetProperty(ppt_name_and_info.first, &prop);
+    if (unknownPropertyNames.find(ppt_name_and_info.first) ==
+        unknownPropertyNames.end()) {
+      if (!res) {
+        fprintf(stderr, "Failed to get DB property: %s\n",
+                ppt_name_and_info.first.c_str());
+        thread->shared->SetVerificationFailure();
+      }
+      if (ppt_name_and_info.second.handle_int != nullptr) {
+        uint64_t prop_int;
+        if (!db_->GetIntProperty(ppt_name_and_info.first, &prop_int)) {
+          fprintf(stderr, "Failed to get Int property: %s\n",
+                  ppt_name_and_info.first.c_str());
+          thread->shared->SetVerificationFailure();
+        }
+      }
+      if (ppt_name_and_info.second.handle_map != nullptr) {
+        std::map<std::string, std::string> prop_map;
+        if (!db_->GetMapProperty(ppt_name_and_info.first, &prop_map)) {
+          fprintf(stderr, "Failed to get Map property: %s\n",
+                  ppt_name_and_info.first.c_str());
+          thread->shared->SetVerificationFailure();
+        }
+      }
+    }
+  }
+
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
+  db_->GetColumnFamilyMetaData(&cf_meta_data);
+  int level_size = static_cast<int>(cf_meta_data.levels.size());
+  for (int level = 0; level < level_size; level++) {
+    for (const auto& ppt_name : levelPropertyNames) {
+      bool res = db_->GetProperty(ppt_name + std::to_string(level), &prop);
+      if (!res) {
+        fprintf(stderr, "Failed to get DB property: %s\n",
+                (ppt_name + std::to_string(level)).c_str());
+        thread->shared->SetVerificationFailure();
+      }
+    }
+  }
+
+  // Test for an invalid property name
+  if (thread->rand.OneIn(100)) {
+    if (db_->GetProperty("rocksdb.invalid_property_name", &prop)) {
+      fprintf(stderr, "Failed to return false for invalid property name\n");
+      thread->shared->SetVerificationFailure();
+    }
+  }
+}
+
+void StressTest::TestCompactFiles(ThreadState* thread,
+                                  ColumnFamilyHandle* column_family) {
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
+  db_->GetColumnFamilyMetaData(column_family, &cf_meta_data);
+
+  if (cf_meta_data.levels.empty()) {
+    return;
+  }
+
+  // Randomly compact up to three consecutive files from a level
+  const int kMaxRetry = 3;
+  for (int attempt = 0; attempt < kMaxRetry; ++attempt) {
+    size_t random_level =
+        thread->rand.Uniform(static_cast<int>(cf_meta_data.levels.size()));
+
+    const auto& files = cf_meta_data.levels[random_level].files;
+    if (files.size() > 0) {
+      size_t random_file_index =
+          thread->rand.Uniform(static_cast<int>(files.size()));
+      if (files[random_file_index].being_compacted) {
+        // Retry as the selected file is currently being compacted
+        continue;
+      }
+
+      std::vector<std::string> input_files;
+      input_files.push_back(files[random_file_index].name);
+      if (random_file_index > 0 &&
+          !files[random_file_index - 1].being_compacted) {
+        input_files.push_back(files[random_file_index - 1].name);
+      }
+      if (random_file_index + 1 < files.size() &&
+          !files[random_file_index + 1].being_compacted) {
+        input_files.push_back(files[random_file_index + 1].name);
+      }
+
+      size_t output_level =
+          std::min(random_level + 1, cf_meta_data.levels.size() - 1);
+      auto s = db_->CompactFiles(CompactionOptions(), column_family,
+                                 input_files, static_cast<int>(output_level));
+      if (!s.ok()) {
+        fprintf(stdout, "Unable to perform CompactFiles(): %s\n",
+                s.ToString().c_str());
+        thread->stats.AddNumCompactFilesFailed(1);
+      } else {
+        thread->stats.AddNumCompactFilesSucceed(1);
+      }
+      break;
+    }
+  }
+}
+#endif  // ROCKSDB_LITE
+
+Status StressTest::TestFlush(const std::vector<int>& rand_column_families) {
+  FlushOptions flush_opts;
+  if (FLAGS_atomic_flush) {
+    return db_->Flush(flush_opts, column_families_);
+  }
+  std::vector<ColumnFamilyHandle*> cfhs;
+  std::for_each(rand_column_families.begin(), rand_column_families.end(),
+                [this, &cfhs](int k) { cfhs.push_back(column_families_[k]); });
+  return db_->Flush(flush_opts, cfhs);
+}
+
+Status StressTest::TestPauseBackground(ThreadState* thread) {
+  Status status = db_->PauseBackgroundWork();
+  if (!status.ok()) {
+    return status;
+  }
+  // To avoid stalling/deadlocking ourself in this thread, just
+  // sleep here during pause and let other threads do db operations.
+  // Sleep up to ~16 seconds (2**24 microseconds), but very skewed
+  // toward short pause. (1 chance in 25 of pausing >= 1s;
+  // 1 chance in 625 of pausing full 16s.)
+  int pwr2_micros =
+      std::min(thread->rand.Uniform(25), thread->rand.Uniform(25));
+  clock_->SleepForMicroseconds(1 << pwr2_micros);
+  return db_->ContinueBackgroundWork();
+}
+
+void StressTest::TestAcquireSnapshot(ThreadState* thread,
+                                     int rand_column_family,
+                                     const std::string& keystr, uint64_t i) {
+  Slice key = keystr;
+  ColumnFamilyHandle* column_family = column_families_[rand_column_family];
+  // This `ReadOptions` is for validation purposes. Ignore
+  // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+  ReadOptions ropt;
+#ifndef ROCKSDB_LITE
+  auto db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+  const bool ww_snapshot = thread->rand.OneIn(10);
+  const Snapshot* snapshot =
+      ww_snapshot ? db_impl->GetSnapshotForWriteConflictBoundary()
+                  : db_->GetSnapshot();
+#else
+  const Snapshot* snapshot = db_->GetSnapshot();
+#endif  // !ROCKSDB_LITE
+  ropt.snapshot = snapshot;
+
+  // Ideally, we want snapshot taking and timestamp generation to be atomic
+  // here, so that the snapshot corresponds to the timestamp. However, it is
+  // not possible with current GetSnapshot() API.
+  std::string ts_str;
+  Slice ts;
+  if (FLAGS_user_timestamp_size > 0) {
+    ts_str = GetNowNanos();
+    ts = ts_str;
+    ropt.timestamp = &ts;
+  }
+
+  std::string value_at;
+  // When taking a snapshot, we also read a key from that snapshot. We
+  // will later read the same key before releasing the snapshot and
+  // verify that the results are the same.
+  auto status_at = db_->Get(ropt, column_family, key, &value_at);
+  std::vector<bool>* key_vec = nullptr;
+
+  if (FLAGS_compare_full_db_state_snapshot && (thread->tid == 0)) {
+    key_vec = new std::vector<bool>(FLAGS_max_key);
+    // When `prefix_extractor` is set, seeking to beginning and scanning
+    // across prefixes are only supported with `total_order_seek` set.
+    ropt.total_order_seek = true;
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ropt));
+    for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+      uint64_t key_val;
+      if (GetIntVal(iterator->key().ToString(), &key_val)) {
+        (*key_vec)[key_val] = true;
+      }
+    }
+  }
+
+  ThreadState::SnapshotState snap_state = {snapshot,
+                                           rand_column_family,
+                                           column_family->GetName(),
+                                           keystr,
+                                           status_at,
+                                           value_at,
+                                           key_vec,
+                                           ts_str};
+  uint64_t hold_for = FLAGS_snapshot_hold_ops;
+  if (FLAGS_long_running_snapshots) {
+    // Hold 10% of snapshots for 10x more
+    if (thread->rand.OneIn(10)) {
+      assert(hold_for < std::numeric_limits<uint64_t>::max() / 10);
+      hold_for *= 10;
+      // Hold 1% of snapshots for 100x more
+      if (thread->rand.OneIn(10)) {
+        assert(hold_for < std::numeric_limits<uint64_t>::max() / 10);
+        hold_for *= 10;
+      }
+    }
+  }
+  uint64_t release_at = std::min(FLAGS_ops_per_thread - 1, i + hold_for);
+  thread->snapshot_queue.emplace(release_at, snap_state);
+}
+
+Status StressTest::MaybeReleaseSnapshots(ThreadState* thread, uint64_t i) {
+  while (!thread->snapshot_queue.empty() &&
+         i >= thread->snapshot_queue.front().first) {
+    auto snap_state = thread->snapshot_queue.front().second;
+    assert(snap_state.snapshot);
+    // Note: this is unsafe as the cf might be dropped concurrently. But
+    // it is ok since unclean cf drop is cunnrently not supported by write
+    // prepared transactions.
+    Status s = AssertSame(db_, column_families_[snap_state.cf_at], snap_state);
+    db_->ReleaseSnapshot(snap_state.snapshot);
+    delete snap_state.key_vec;
+    thread->snapshot_queue.pop();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+void StressTest::TestCompactRange(ThreadState* thread, int64_t rand_key,
+                                  const Slice& start_key,
+                                  ColumnFamilyHandle* column_family) {
+  int64_t end_key_num;
+  if (std::numeric_limits<int64_t>::max() - rand_key <
+      FLAGS_compact_range_width) {
+    end_key_num = std::numeric_limits<int64_t>::max();
+  } else {
+    end_key_num = FLAGS_compact_range_width + rand_key;
+  }
+  std::string end_key_buf = Key(end_key_num);
+  Slice end_key(end_key_buf);
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = static_cast<bool>(thread->rand.Next() % 2);
+  cro.change_level = static_cast<bool>(thread->rand.Next() % 2);
+  std::vector<BottommostLevelCompaction> bottom_level_styles = {
+      BottommostLevelCompaction::kSkip,
+      BottommostLevelCompaction::kIfHaveCompactionFilter,
+      BottommostLevelCompaction::kForce,
+      BottommostLevelCompaction::kForceOptimized};
+  cro.bottommost_level_compaction =
+      bottom_level_styles[thread->rand.Next() %
+                          static_cast<uint32_t>(bottom_level_styles.size())];
+  cro.allow_write_stall = static_cast<bool>(thread->rand.Next() % 2);
+  cro.max_subcompactions = static_cast<uint32_t>(thread->rand.Next() % 4);
+  std::vector<BlobGarbageCollectionPolicy> blob_gc_policies = {
+      BlobGarbageCollectionPolicy::kForce,
+      BlobGarbageCollectionPolicy::kDisable,
+      BlobGarbageCollectionPolicy::kUseDefault};
+  cro.blob_garbage_collection_policy =
+      blob_gc_policies[thread->rand.Next() %
+                       static_cast<uint32_t>(blob_gc_policies.size())];
+  cro.blob_garbage_collection_age_cutoff =
+      static_cast<double>(thread->rand.Next() % 100) / 100.0;
+
+  const Snapshot* pre_snapshot = nullptr;
+  uint32_t pre_hash = 0;
+  if (thread->rand.OneIn(2)) {
+    // Do some validation by declaring a snapshot and compare the data before
+    // and after the compaction
+    pre_snapshot = db_->GetSnapshot();
+    pre_hash =
+        GetRangeHash(thread, pre_snapshot, column_family, start_key, end_key);
+  }
+
+  Status status = db_->CompactRange(cro, column_family, &start_key, &end_key);
+
+  if (!status.ok()) {
+    fprintf(stdout, "Unable to perform CompactRange(): %s\n",
+            status.ToString().c_str());
+  }
+
+  if (pre_snapshot != nullptr) {
+    uint32_t post_hash =
+        GetRangeHash(thread, pre_snapshot, column_family, start_key, end_key);
+    if (pre_hash != post_hash) {
+      fprintf(stderr,
+              "Data hash different before and after compact range "
+              "start_key %s end_key %s\n",
+              start_key.ToString(true).c_str(), end_key.ToString(true).c_str());
+      thread->stats.AddErrors(1);
+      // Fail fast to preserve the DB state.
+      thread->shared->SetVerificationFailure();
+    }
+    db_->ReleaseSnapshot(pre_snapshot);
+  }
+}
+
+uint32_t StressTest::GetRangeHash(ThreadState* thread, const Snapshot* snapshot,
+                                  ColumnFamilyHandle* column_family,
+                                  const Slice& start_key,
+                                  const Slice& end_key) {
+  // This `ReadOptions` is for validation purposes. Ignore
+  // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+  ReadOptions ro;
+  ro.snapshot = snapshot;
+  ro.total_order_seek = true;
+  std::string ts_str;
+  Slice ts;
+  if (FLAGS_user_timestamp_size > 0) {
+    ts_str = GetNowNanos();
+    ts = ts_str;
+    ro.timestamp = &ts;
+  }
+
+  std::unique_ptr<Iterator> it(db_->NewIterator(ro, column_family));
+
+  constexpr char kCrcCalculatorSepearator = ';';
+
+  uint32_t crc = 0;
+
+  for (it->Seek(start_key);
+       it->Valid() && options_.comparator->Compare(it->key(), end_key) <= 0;
+       it->Next()) {
+    crc = crc32c::Extend(crc, it->key().data(), it->key().size());
+    crc = crc32c::Extend(crc, &kCrcCalculatorSepearator, sizeof(char));
+    crc = crc32c::Extend(crc, it->value().data(), it->value().size());
+    crc = crc32c::Extend(crc, &kCrcCalculatorSepearator, sizeof(char));
+
+    for (const auto& column : it->columns()) {
+      crc = crc32c::Extend(crc, column.name().data(), column.name().size());
+      crc = crc32c::Extend(crc, &kCrcCalculatorSepearator, sizeof(char));
+      crc = crc32c::Extend(crc, column.value().data(), column.value().size());
+      crc = crc32c::Extend(crc, &kCrcCalculatorSepearator, sizeof(char));
+    }
+  }
+
+  if (!it->status().ok()) {
+    fprintf(stderr, "Iterator non-OK when calculating range CRC: %s\n",
+            it->status().ToString().c_str());
+    thread->stats.AddErrors(1);
+    // Fail fast to preserve the DB state.
+    thread->shared->SetVerificationFailure();
+  }
+
+  return crc;
+}
+
+void StressTest::PrintEnv() const {
+  fprintf(stdout, "RocksDB version           : %d.%d\n", kMajorVersion,
+          kMinorVersion);
+  fprintf(stdout, "Format version            : %d\n", FLAGS_format_version);
+  fprintf(stdout, "TransactionDB             : %s\n",
+          FLAGS_use_txn ? "true" : "false");
+
+  if (FLAGS_use_txn) {
+#ifndef ROCKSDB_LITE
+    fprintf(stdout, "Two write queues:         : %s\n",
+            FLAGS_two_write_queues ? "true" : "false");
+    fprintf(stdout, "Write policy              : %d\n",
+            static_cast<int>(FLAGS_txn_write_policy));
+    if (static_cast<uint64_t>(TxnDBWritePolicy::WRITE_PREPARED) ==
+            FLAGS_txn_write_policy ||
+        static_cast<uint64_t>(TxnDBWritePolicy::WRITE_UNPREPARED) ==
+            FLAGS_txn_write_policy) {
+      fprintf(stdout, "Snapshot cache bits       : %d\n",
+              static_cast<int>(FLAGS_wp_snapshot_cache_bits));
+      fprintf(stdout, "Commit cache bits         : %d\n",
+              static_cast<int>(FLAGS_wp_commit_cache_bits));
+    }
+    fprintf(stdout, "last cwb for recovery    : %s\n",
+            FLAGS_use_only_the_last_commit_time_batch_for_recovery ? "true"
+                                                                   : "false");
+#endif  // !ROCKSDB_LITE
+  }
+
+#ifndef ROCKSDB_LITE
+  fprintf(stdout, "Stacked BlobDB            : %s\n",
+          FLAGS_use_blob_db ? "true" : "false");
+#endif  // !ROCKSDB_LITE
+  fprintf(stdout, "Read only mode            : %s\n",
+          FLAGS_read_only ? "true" : "false");
+  fprintf(stdout, "Atomic flush              : %s\n",
+          FLAGS_atomic_flush ? "true" : "false");
+  fprintf(stdout, "Manual WAL flush          : %s\n",
+          FLAGS_manual_wal_flush_one_in > 0 ? "true" : "false");
+  fprintf(stdout, "Column families           : %d\n", FLAGS_column_families);
+  if (!FLAGS_test_batches_snapshots) {
+    fprintf(stdout, "Clear CFs one in          : %d\n",
+            FLAGS_clear_column_family_one_in);
+  }
+  fprintf(stdout, "Number of threads         : %d\n", FLAGS_threads);
+  fprintf(stdout, "Ops per thread            : %lu\n",
+          (unsigned long)FLAGS_ops_per_thread);
+  std::string ttl_state("unused");
+  if (FLAGS_ttl > 0) {
+    ttl_state = std::to_string(FLAGS_ttl);
+  }
+  fprintf(stdout, "Time to live(sec)         : %s\n", ttl_state.c_str());
+  fprintf(stdout, "Read percentage           : %d%%\n", FLAGS_readpercent);
+  fprintf(stdout, "Prefix percentage         : %d%%\n", FLAGS_prefixpercent);
+  fprintf(stdout, "Write percentage          : %d%%\n", FLAGS_writepercent);
+  fprintf(stdout, "Delete percentage         : %d%%\n", FLAGS_delpercent);
+  fprintf(stdout, "Delete range percentage   : %d%%\n", FLAGS_delrangepercent);
+  fprintf(stdout, "No overwrite percentage   : %d%%\n",
+          FLAGS_nooverwritepercent);
+  fprintf(stdout, "Iterate percentage        : %d%%\n", FLAGS_iterpercent);
+  fprintf(stdout, "Custom ops percentage     : %d%%\n", FLAGS_customopspercent);
+  fprintf(stdout, "DB-write-buffer-size      : %" PRIu64 "\n",
+          FLAGS_db_write_buffer_size);
+  fprintf(stdout, "Write-buffer-size         : %d\n", FLAGS_write_buffer_size);
+  fprintf(stdout, "Iterations                : %lu\n",
+          (unsigned long)FLAGS_num_iterations);
+  fprintf(stdout, "Max key                   : %lu\n",
+          (unsigned long)FLAGS_max_key);
+  fprintf(stdout, "Ratio #ops/#keys          : %f\n",
+          (1.0 * FLAGS_ops_per_thread * FLAGS_threads) / FLAGS_max_key);
+  fprintf(stdout, "Num times DB reopens      : %d\n", FLAGS_reopen);
+  fprintf(stdout, "Batches/snapshots         : %d\n",
+          FLAGS_test_batches_snapshots);
+  fprintf(stdout, "Do update in place        : %d\n", FLAGS_in_place_update);
+  fprintf(stdout, "Num keys per lock         : %d\n",
+          1 << FLAGS_log2_keys_per_lock);
+  std::string compression = CompressionTypeToString(compression_type_e);
+  fprintf(stdout, "Compression               : %s\n", compression.c_str());
+  std::string bottommost_compression =
+      CompressionTypeToString(bottommost_compression_type_e);
+  fprintf(stdout, "Bottommost Compression    : %s\n",
+          bottommost_compression.c_str());
+  std::string checksum = ChecksumTypeToString(checksum_type_e);
+  fprintf(stdout, "Checksum type             : %s\n", checksum.c_str());
+  fprintf(stdout, "File checksum impl        : %s\n",
+          FLAGS_file_checksum_impl.c_str());
+  fprintf(stdout, "Bloom bits / key          : %s\n",
+          FormatDoubleParam(FLAGS_bloom_bits).c_str());
+  fprintf(stdout, "Max subcompactions        : %" PRIu64 "\n",
+          FLAGS_subcompactions);
+  fprintf(stdout, "Use MultiGet              : %s\n",
+          FLAGS_use_multiget ? "true" : "false");
+
+  const char* memtablerep = "";
+  switch (FLAGS_rep_factory) {
+    case kSkipList:
+      memtablerep = "skip_list";
+      break;
+    case kHashSkipList:
+      memtablerep = "prefix_hash";
+      break;
+    case kVectorRep:
+      memtablerep = "vector";
+      break;
+  }
+
+  fprintf(stdout, "Memtablerep               : %s\n", memtablerep);
+
+#ifndef NDEBUG
+  KillPoint* kp = KillPoint::GetInstance();
+  fprintf(stdout, "Test kill odd             : %d\n", kp->rocksdb_kill_odds);
+  if (!kp->rocksdb_kill_exclude_prefixes.empty()) {
+    fprintf(stdout, "Skipping kill points prefixes:\n");
+    for (auto& p : kp->rocksdb_kill_exclude_prefixes) {
+      fprintf(stdout, "  %s\n", p.c_str());
+    }
+  }
+#endif
+  fprintf(stdout, "Periodic Compaction Secs  : %" PRIu64 "\n",
+          FLAGS_periodic_compaction_seconds);
+  fprintf(stdout, "Compaction TTL            : %" PRIu64 "\n",
+          FLAGS_compaction_ttl);
+  const char* compaction_pri = "";
+  switch (FLAGS_compaction_pri) {
+    case kByCompensatedSize:
+      compaction_pri = "kByCompensatedSize";
+      break;
+    case kOldestLargestSeqFirst:
+      compaction_pri = "kOldestLargestSeqFirst";
+      break;
+    case kOldestSmallestSeqFirst:
+      compaction_pri = "kOldestSmallestSeqFirst";
+      break;
+    case kMinOverlappingRatio:
+      compaction_pri = "kMinOverlappingRatio";
+      break;
+    case kRoundRobin:
+      compaction_pri = "kRoundRobin";
+      break;
+  }
+  fprintf(stdout, "Compaction Pri            : %s\n", compaction_pri);
+  fprintf(stdout, "Background Purge          : %d\n",
+          static_cast<int>(FLAGS_avoid_unnecessary_blocking_io));
+  fprintf(stdout, "Write DB ID to manifest   : %d\n",
+          static_cast<int>(FLAGS_write_dbid_to_manifest));
+  fprintf(stdout, "Max Write Batch Group Size: %" PRIu64 "\n",
+          FLAGS_max_write_batch_group_size_bytes);
+  fprintf(stdout, "Use dynamic level         : %d\n",
+          static_cast<int>(FLAGS_level_compaction_dynamic_level_bytes));
+  fprintf(stdout, "Read fault one in         : %d\n", FLAGS_read_fault_one_in);
+  fprintf(stdout, "Write fault one in        : %d\n", FLAGS_write_fault_one_in);
+  fprintf(stdout, "Open metadata write fault one in:\n");
+  fprintf(stdout, "                            %d\n",
+          FLAGS_open_metadata_write_fault_one_in);
+  fprintf(stdout, "Sync fault injection      : %d\n",
+          FLAGS_sync_fault_injection);
+  fprintf(stdout, "Best efforts recovery     : %d\n",
+          static_cast<int>(FLAGS_best_efforts_recovery));
+  fprintf(stdout, "Fail if OPTIONS file error: %d\n",
+          static_cast<int>(FLAGS_fail_if_options_file_error));
+  fprintf(stdout, "User timestamp size bytes : %d\n",
+          static_cast<int>(FLAGS_user_timestamp_size));
+  fprintf(stdout, "WAL compression           : %s\n",
+          FLAGS_wal_compression.c_str());
+  fprintf(stdout, "Try verify sst unique id  : %d\n",
+          static_cast<int>(FLAGS_verify_sst_unique_id_in_manifest));
+
+  fprintf(stdout, "------------------------------------------------\n");
+}
+
+void StressTest::Open(SharedState* shared) {
+  assert(db_ == nullptr);
+#ifndef ROCKSDB_LITE
+  assert(txn_db_ == nullptr);
+#else
+  (void)shared;
+#endif
+  if (!InitializeOptionsFromFile(options_)) {
+    InitializeOptionsFromFlags(cache_, compressed_cache_, filter_policy_,
+                               options_);
+  }
+  InitializeOptionsGeneral(cache_, compressed_cache_, filter_policy_, options_);
+
+  if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) {
+    fprintf(stderr,
+            "prefeix_size cannot be zero if memtablerep == prefix_hash\n");
+    exit(1);
+  }
+  if (FLAGS_prefix_size != 0 && FLAGS_rep_factory != kHashSkipList) {
+    fprintf(stderr,
+            "WARNING: prefix_size is non-zero but "
+            "memtablerep != prefix_hash\n");
+  }
+
+  if ((options_.enable_blob_files || options_.enable_blob_garbage_collection ||
+       FLAGS_allow_setting_blob_options_dynamically) &&
+      FLAGS_best_efforts_recovery) {
+    fprintf(stderr,
+            "Integrated BlobDB is currently incompatible with best-effort "
+            "recovery\n");
+    exit(1);
+  }
+
+  fprintf(stdout,
+          "Integrated BlobDB: blob files enabled %d, min blob size %" PRIu64
+          ", blob file size %" PRIu64
+          ", blob compression type %s, blob GC enabled %d, cutoff %f, force "
+          "threshold %f, blob compaction readahead size %" PRIu64
+          ", blob file starting level %d\n",
+          options_.enable_blob_files, options_.min_blob_size,
+          options_.blob_file_size,
+          CompressionTypeToString(options_.blob_compression_type).c_str(),
+          options_.enable_blob_garbage_collection,
+          options_.blob_garbage_collection_age_cutoff,
+          options_.blob_garbage_collection_force_threshold,
+          options_.blob_compaction_readahead_size,
+          options_.blob_file_starting_level);
+
+  if (FLAGS_use_blob_cache) {
+    fprintf(stdout,
+            "Integrated BlobDB: blob cache enabled"
+            ", block and blob caches shared: %d",
+            FLAGS_use_shared_block_and_blob_cache);
+    if (!FLAGS_use_shared_block_and_blob_cache) {
+      fprintf(stdout,
+              ", blob cache size %" PRIu64 ", blob cache num shard bits: %d",
+              FLAGS_blob_cache_size, FLAGS_blob_cache_numshardbits);
+    }
+    fprintf(stdout, ", blob cache prepopulated: %d\n",
+            FLAGS_prepopulate_blob_cache);
+  } else {
+    fprintf(stdout, "Integrated BlobDB: blob cache disabled\n");
+  }
+
+  fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
+
+  Status s;
+
+  if (FLAGS_ttl == -1) {
+    std::vector<std::string> existing_column_families;
+    s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
+                               &existing_column_families);  // ignore errors
+    if (!s.ok()) {
+      // DB doesn't exist
+      assert(existing_column_families.empty());
+      assert(column_family_names_.empty());
+      column_family_names_.push_back(kDefaultColumnFamilyName);
+    } else if (column_family_names_.empty()) {
+      // this is the first call to the function Open()
+      column_family_names_ = existing_column_families;
+    } else {
+      // this is a reopen. just assert that existing column_family_names are
+      // equivalent to what we remember
+      auto sorted_cfn = column_family_names_;
+      std::sort(sorted_cfn.begin(), sorted_cfn.end());
+      std::sort(existing_column_families.begin(),
+                existing_column_families.end());
+      if (sorted_cfn != existing_column_families) {
+        fprintf(stderr, "Expected column families differ from the existing:\n");
+        fprintf(stderr, "Expected: {");
+        for (auto cf : sorted_cfn) {
+          fprintf(stderr, "%s ", cf.c_str());
+        }
+        fprintf(stderr, "}\n");
+        fprintf(stderr, "Existing: {");
+        for (auto cf : existing_column_families) {
+          fprintf(stderr, "%s ", cf.c_str());
+        }
+        fprintf(stderr, "}\n");
+      }
+      assert(sorted_cfn == existing_column_families);
+    }
+    std::vector<ColumnFamilyDescriptor> cf_descriptors;
+    for (auto name : column_family_names_) {
+      if (name != kDefaultColumnFamilyName) {
+        new_column_family_name_ =
+            std::max(new_column_family_name_.load(), std::stoi(name) + 1);
+      }
+      cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
+    }
+    while (cf_descriptors.size() < (size_t)FLAGS_column_families) {
+      std::string name = std::to_string(new_column_family_name_.load());
+      new_column_family_name_++;
+      cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
+      column_family_names_.push_back(name);
+    }
+
+    options_.listeners.clear();
+#ifndef ROCKSDB_LITE
+    options_.listeners.emplace_back(new DbStressListener(
+        FLAGS_db, options_.db_paths, cf_descriptors, db_stress_listener_env));
+#endif  // !ROCKSDB_LITE
+    RegisterAdditionalListeners();
+
+    if (!FLAGS_use_txn) {
+      // Determine whether we need to ingest file metadata write failures
+      // during DB reopen. If it does, enable it.
+      // Only ingest metadata error if it is reopening, as initial open
+      // failure doesn't need to be handled.
+      // TODO cover transaction DB is not covered in this fault test too.
+      bool ingest_meta_error = false;
+      bool ingest_write_error = false;
+      bool ingest_read_error = false;
+      if ((FLAGS_open_metadata_write_fault_one_in ||
+           FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) &&
+          fault_fs_guard
+              ->FileExists(FLAGS_db + "/CURRENT", IOOptions(), nullptr)
+              .ok()) {
+        if (!FLAGS_sync) {
+          // When DB Stress is not sync mode, we expect all WAL writes to
+          // WAL is durable. Buffering unsynced writes will cause false
+          // positive in crash tests. Before we figure out a way to
+          // solve it, skip WAL from failure injection.
+          fault_fs_guard->SetSkipDirectWritableTypes({kWalFile});
+        }
+        ingest_meta_error = FLAGS_open_metadata_write_fault_one_in;
+        ingest_write_error = FLAGS_open_write_fault_one_in;
+        ingest_read_error = FLAGS_open_read_fault_one_in;
+        if (ingest_meta_error) {
+          fault_fs_guard->EnableMetadataWriteErrorInjection();
+          fault_fs_guard->SetRandomMetadataWriteError(
+              FLAGS_open_metadata_write_fault_one_in);
+        }
+        if (ingest_write_error) {
+          fault_fs_guard->SetFilesystemDirectWritable(false);
+          fault_fs_guard->EnableWriteErrorInjection();
+          fault_fs_guard->SetRandomWriteError(
+              static_cast<uint32_t>(FLAGS_seed), FLAGS_open_write_fault_one_in,
+              IOStatus::IOError("Injected Open Error"),
+              /*inject_for_all_file_types=*/true, /*types=*/{});
+        }
+        if (ingest_read_error) {
+          fault_fs_guard->SetRandomReadError(FLAGS_open_read_fault_one_in);
+        }
+      }
+      while (true) {
+#ifndef ROCKSDB_LITE
+        // StackableDB-based BlobDB
+        if (FLAGS_use_blob_db) {
+          blob_db::BlobDBOptions blob_db_options;
+          blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
+          blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
+          blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
+          blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
+          blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
+
+          blob_db::BlobDB* blob_db = nullptr;
+          s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db,
+                                    cf_descriptors, &column_families_,
+                                    &blob_db);
+          if (s.ok()) {
+            db_ = blob_db;
+          }
+        } else
+#endif  // !ROCKSDB_LITE
+        {
+          if (db_preload_finished_.load() && FLAGS_read_only) {
+            s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db,
+                                    cf_descriptors, &column_families_, &db_);
+          } else {
+            s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
+                         &column_families_, &db_);
+          }
+        }
+
+        if (ingest_meta_error || ingest_write_error || ingest_read_error) {
+          fault_fs_guard->SetFilesystemDirectWritable(true);
+          fault_fs_guard->DisableMetadataWriteErrorInjection();
+          fault_fs_guard->DisableWriteErrorInjection();
+          fault_fs_guard->SetSkipDirectWritableTypes({});
+          fault_fs_guard->SetRandomReadError(0);
+          if (s.ok()) {
+            // Ingested errors might happen in background compactions. We
+            // wait for all compactions to finish to make sure DB is in
+            // clean state before executing queries.
+            s = static_cast_with_check<DBImpl>(db_->GetRootDB())
+                    ->WaitForCompact(true /* wait_unscheduled */);
+            if (!s.ok()) {
+              for (auto cf : column_families_) {
+                delete cf;
+              }
+              column_families_.clear();
+              delete db_;
+              db_ = nullptr;
+            }
+          }
+          if (!s.ok()) {
+            // After failure to opening a DB due to IO error, retry should
+            // successfully open the DB with correct data if no IO error shows
+            // up.
+            ingest_meta_error = false;
+            ingest_write_error = false;
+            ingest_read_error = false;
+
+            Random rand(static_cast<uint32_t>(FLAGS_seed));
+            if (rand.OneIn(2)) {
+              fault_fs_guard->DeleteFilesCreatedAfterLastDirSync(IOOptions(),
+                                                                 nullptr);
+            }
+            if (rand.OneIn(3)) {
+              fault_fs_guard->DropUnsyncedFileData();
+            } else if (rand.OneIn(2)) {
+              fault_fs_guard->DropRandomUnsyncedFileData(&rand);
+            }
+            continue;
+          }
+        }
+        break;
+      }
+    } else {
+#ifndef ROCKSDB_LITE
+      TransactionDBOptions txn_db_options;
+      assert(FLAGS_txn_write_policy <= TxnDBWritePolicy::WRITE_UNPREPARED);
+      txn_db_options.write_policy =
+          static_cast<TxnDBWritePolicy>(FLAGS_txn_write_policy);
+      if (FLAGS_unordered_write) {
+        assert(txn_db_options.write_policy == TxnDBWritePolicy::WRITE_PREPARED);
+        options_.unordered_write = true;
+        options_.two_write_queues = true;
+        txn_db_options.skip_concurrency_control = true;
+      } else {
+        options_.two_write_queues = FLAGS_two_write_queues;
+      }
+      txn_db_options.wp_snapshot_cache_bits =
+          static_cast<size_t>(FLAGS_wp_snapshot_cache_bits);
+      txn_db_options.wp_commit_cache_bits =
+          static_cast<size_t>(FLAGS_wp_commit_cache_bits);
+      PrepareTxnDbOptions(shared, txn_db_options);
+      s = TransactionDB::Open(options_, txn_db_options, FLAGS_db,
+                              cf_descriptors, &column_families_, &txn_db_);
+      if (!s.ok()) {
+        fprintf(stderr, "Error in opening the TransactionDB [%s]\n",
+                s.ToString().c_str());
+        fflush(stderr);
+      }
+      assert(s.ok());
+
+      // Do not swap the order of the following.
+      {
+        db_ = txn_db_;
+        db_aptr_.store(txn_db_, std::memory_order_release);
+      }
+#endif
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "Error in opening the DB [%s]\n", s.ToString().c_str());
+      fflush(stderr);
+    }
+    assert(s.ok());
+    assert(column_families_.size() ==
+           static_cast<size_t>(FLAGS_column_families));
+
+    // Secondary instance does not support write-prepared/write-unprepared
+    // transactions, thus just disable secondary instance if we use
+    // transaction.
+    if (s.ok() && FLAGS_test_secondary && !FLAGS_use_txn) {
+#ifndef ROCKSDB_LITE
+      Options tmp_opts;
+      // TODO(yanqin) support max_open_files != -1 for secondary instance.
+      tmp_opts.max_open_files = -1;
+      tmp_opts.env = db_stress_env;
+      const std::string& secondary_path = FLAGS_secondaries_base;
+      s = DB::OpenAsSecondary(tmp_opts, FLAGS_db, secondary_path,
+                              cf_descriptors, &cmp_cfhs_, &cmp_db_);
+      assert(s.ok());
+      assert(cmp_cfhs_.size() == static_cast<size_t>(FLAGS_column_families));
+#else
+      fprintf(stderr, "Secondary is not supported in RocksDBLite\n");
+      exit(1);
+#endif  // !ROCKSDB_LITE
+    }
+  } else {
+#ifndef ROCKSDB_LITE
+    DBWithTTL* db_with_ttl;
+    s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl);
+    db_ = db_with_ttl;
+#else
+    fprintf(stderr, "TTL is not supported in RocksDBLite\n");
+    exit(1);
+#endif
+  }
+
+  if (FLAGS_preserve_unverified_changes) {
+    // Up until now, no live file should have become obsolete due to these
+    // options. After `DisableFileDeletions()` we can reenable auto compactions
+    // since, even if live files become obsolete, they won't be deleted.
+    assert(options_.avoid_flush_during_recovery);
+    assert(options_.disable_auto_compactions);
+    if (s.ok()) {
+      s = db_->DisableFileDeletions();
+    }
+    if (s.ok()) {
+      s = db_->EnableAutoCompaction(column_families_);
+    }
+  }
+
+  if (!s.ok()) {
+    fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+    exit(1);
+  }
+}
+
+void StressTest::Reopen(ThreadState* thread) {
+#ifndef ROCKSDB_LITE
+  // BG jobs in WritePrepared must be canceled first because i) they can access
+  // the db via a callbac ii) they hold on to a snapshot and the upcoming
+  // ::Close would complain about it.
+  const bool write_prepared = FLAGS_use_txn && FLAGS_txn_write_policy != 0;
+  bool bg_canceled __attribute__((unused)) = false;
+  if (write_prepared || thread->rand.OneIn(2)) {
+    const bool wait =
+        write_prepared || static_cast<bool>(thread->rand.OneIn(2));
+    CancelAllBackgroundWork(db_, wait);
+    bg_canceled = wait;
+  }
+  assert(!write_prepared || bg_canceled);
+#else
+  (void)thread;
+#endif
+
+  for (auto cf : column_families_) {
+    delete cf;
+  }
+  column_families_.clear();
+
+#ifndef ROCKSDB_LITE
+  if (thread->rand.OneIn(2)) {
+    Status s = db_->Close();
+    if (!s.ok()) {
+      fprintf(stderr, "Non-ok close status: %s\n", s.ToString().c_str());
+      fflush(stderr);
+    }
+    assert(s.ok());
+  }
+#endif
+  delete db_;
+  db_ = nullptr;
+#ifndef ROCKSDB_LITE
+  txn_db_ = nullptr;
+#endif
+
+  num_times_reopened_++;
+  auto now = clock_->NowMicros();
+  fprintf(stdout, "%s Reopening database for the %dth time\n",
+          clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_);
+  Open(thread->shared);
+
+  if ((FLAGS_sync_fault_injection || FLAGS_disable_wal ||
+       FLAGS_manual_wal_flush_one_in > 0) &&
+      IsStateTracked()) {
+    Status s = thread->shared->SaveAtAndAfter(db_);
+    if (!s.ok()) {
+      fprintf(stderr, "Error enabling history tracing: %s\n",
+              s.ToString().c_str());
+      exit(1);
+    }
+  }
+}
+
+bool StressTest::MaybeUseOlderTimestampForPointLookup(ThreadState* thread,
+                                                      std::string& ts_str,
+                                                      Slice& ts_slice,
+                                                      ReadOptions& read_opts) {
+  if (FLAGS_user_timestamp_size == 0) {
+    return false;
+  }
+
+  assert(thread);
+  if (!thread->rand.OneInOpt(3)) {
+    return false;
+  }
+
+  const SharedState* const shared = thread->shared;
+  assert(shared);
+  const uint64_t start_ts = shared->GetStartTimestamp();
+
+  uint64_t now = db_stress_env->NowNanos();
+
+  assert(now > start_ts);
+  uint64_t time_diff = now - start_ts;
+  uint64_t ts = start_ts + (thread->rand.Next64() % time_diff);
+  ts_str.clear();
+  PutFixed64(&ts_str, ts);
+  ts_slice = ts_str;
+  read_opts.timestamp = &ts_slice;
+  return true;
+}
+
+void StressTest::MaybeUseOlderTimestampForRangeScan(ThreadState* thread,
+                                                    std::string& ts_str,
+                                                    Slice& ts_slice,
+                                                    ReadOptions& read_opts) {
+  if (FLAGS_user_timestamp_size == 0) {
+    return;
+  }
+
+  assert(thread);
+  if (!thread->rand.OneInOpt(3)) {
+    return;
+  }
+
+  const Slice* const saved_ts = read_opts.timestamp;
+  assert(saved_ts != nullptr);
+
+  const SharedState* const shared = thread->shared;
+  assert(shared);
+  const uint64_t start_ts = shared->GetStartTimestamp();
+
+  uint64_t now = db_stress_env->NowNanos();
+
+  assert(now > start_ts);
+  uint64_t time_diff = now - start_ts;
+  uint64_t ts = start_ts + (thread->rand.Next64() % time_diff);
+  ts_str.clear();
+  PutFixed64(&ts_str, ts);
+  ts_slice = ts_str;
+  read_opts.timestamp = &ts_slice;
+
+  // TODO (yanqin): support Merge with iter_start_ts
+  if (!thread->rand.OneInOpt(3) || FLAGS_use_merge || FLAGS_use_full_merge_v1) {
+    return;
+  }
+
+  ts_str.clear();
+  PutFixed64(&ts_str, start_ts);
+  ts_slice = ts_str;
+  read_opts.iter_start_ts = &ts_slice;
+  read_opts.timestamp = saved_ts;
+}
+
+void CheckAndSetOptionsForUserTimestamp(Options& options) {
+  assert(FLAGS_user_timestamp_size > 0);
+  const Comparator* const cmp = test::BytewiseComparatorWithU64TsWrapper();
+  assert(cmp);
+  if (FLAGS_user_timestamp_size != cmp->timestamp_size()) {
+    fprintf(stderr,
+            "Only -user_timestamp_size=%d is supported in stress test.\n",
+            static_cast<int>(cmp->timestamp_size()));
+    exit(1);
+  }
+  if (FLAGS_use_txn) {
+    fprintf(stderr, "TransactionDB does not support timestamp yet.\n");
+    exit(1);
+  }
+#ifndef ROCKSDB_LITE
+  if (FLAGS_enable_blob_files || FLAGS_use_blob_db) {
+    fprintf(stderr, "BlobDB not supported with timestamp.\n");
+    exit(1);
+  }
+#endif  // !ROCKSDB_LITE
+  if (FLAGS_test_cf_consistency || FLAGS_test_batches_snapshots) {
+    fprintf(stderr,
+            "Due to per-key ts-seq ordering constraint, only the (default) "
+            "non-batched test is supported with timestamp.\n");
+    exit(1);
+  }
+  if (FLAGS_ingest_external_file_one_in > 0) {
+    fprintf(stderr, "Bulk loading may not support timestamp yet.\n");
+    exit(1);
+  }
+  options.comparator = cmp;
+}
+
+bool InitializeOptionsFromFile(Options& options) {
+#ifndef ROCKSDB_LITE
+  DBOptions db_options;
+  std::vector<ColumnFamilyDescriptor> cf_descriptors;
+  if (!FLAGS_options_file.empty()) {
+    Status s = LoadOptionsFromFile(FLAGS_options_file, db_stress_env,
+                                   &db_options, &cf_descriptors);
+    if (!s.ok()) {
+      fprintf(stderr, "Unable to load options file %s --- %s\n",
+              FLAGS_options_file.c_str(), s.ToString().c_str());
+      exit(1);
+    }
+    db_options.env = new DbStressEnvWrapper(db_stress_env);
+    options = Options(db_options, cf_descriptors[0].options);
+    return true;
+  }
+#else
+  (void)options;
+  fprintf(stderr, "--options_file not supported in lite mode\n");
+  exit(1);
+#endif  //! ROCKSDB_LITE
+  return false;
+}
+
+void InitializeOptionsFromFlags(
+    const std::shared_ptr<Cache>& cache,
+    const std::shared_ptr<Cache>& block_cache_compressed,
+    const std::shared_ptr<const FilterPolicy>& filter_policy,
+    Options& options) {
+  BlockBasedTableOptions block_based_options;
+  block_based_options.block_cache = cache;
+  block_based_options.cache_index_and_filter_blocks =
+      FLAGS_cache_index_and_filter_blocks;
+  block_based_options.metadata_cache_options.top_level_index_pinning =
+      static_cast<PinningTier>(FLAGS_top_level_index_pinning);
+  block_based_options.metadata_cache_options.partition_pinning =
+      static_cast<PinningTier>(FLAGS_partition_pinning);
+  block_based_options.metadata_cache_options.unpartitioned_pinning =
+      static_cast<PinningTier>(FLAGS_unpartitioned_pinning);
+  block_based_options.block_cache_compressed = block_cache_compressed;
+  block_based_options.checksum = checksum_type_e;
+  block_based_options.block_size = FLAGS_block_size;
+  block_based_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
+       {/*.charged = */ FLAGS_charge_compression_dictionary_building_buffer
+            ? CacheEntryRoleOptions::Decision::kEnabled
+            : CacheEntryRoleOptions::Decision::kDisabled}});
+  block_based_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kFilterConstruction,
+       {/*.charged = */ FLAGS_charge_filter_construction
+            ? CacheEntryRoleOptions::Decision::kEnabled
+            : CacheEntryRoleOptions::Decision::kDisabled}});
+  block_based_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kBlockBasedTableReader,
+       {/*.charged = */ FLAGS_charge_table_reader
+            ? CacheEntryRoleOptions::Decision::kEnabled
+            : CacheEntryRoleOptions::Decision::kDisabled}});
+  block_based_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kFileMetadata,
+       {/*.charged = */ FLAGS_charge_file_metadata
+            ? CacheEntryRoleOptions::Decision::kEnabled
+            : CacheEntryRoleOptions::Decision::kDisabled}});
+  block_based_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kBlobCache,
+       {/*.charged = */ FLAGS_charge_blob_cache
+            ? CacheEntryRoleOptions::Decision::kEnabled
+            : CacheEntryRoleOptions::Decision::kDisabled}});
+  block_based_options.format_version =
+      static_cast<uint32_t>(FLAGS_format_version);
+  block_based_options.index_block_restart_interval =
+      static_cast<int32_t>(FLAGS_index_block_restart_interval);
+  block_based_options.filter_policy = filter_policy;
+  block_based_options.partition_filters = FLAGS_partition_filters;
+  block_based_options.optimize_filters_for_memory =
+      FLAGS_optimize_filters_for_memory;
+  block_based_options.detect_filter_construct_corruption =
+      FLAGS_detect_filter_construct_corruption;
+  block_based_options.index_type =
+      static_cast<BlockBasedTableOptions::IndexType>(FLAGS_index_type);
+  block_based_options.data_block_index_type =
+      static_cast<BlockBasedTableOptions::DataBlockIndexType>(
+          FLAGS_data_block_index_type);
+  block_based_options.prepopulate_block_cache =
+      static_cast<BlockBasedTableOptions::PrepopulateBlockCache>(
+          FLAGS_prepopulate_block_cache);
+  block_based_options.initial_auto_readahead_size =
+      FLAGS_initial_auto_readahead_size;
+  block_based_options.max_auto_readahead_size = FLAGS_max_auto_readahead_size;
+  block_based_options.num_file_reads_for_auto_readahead =
+      FLAGS_num_file_reads_for_auto_readahead;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+  options.db_write_buffer_size = FLAGS_db_write_buffer_size;
+  options.write_buffer_size = FLAGS_write_buffer_size;
+  options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+  options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+  options.max_write_buffer_number_to_maintain =
+      FLAGS_max_write_buffer_number_to_maintain;
+  options.max_write_buffer_size_to_maintain =
+      FLAGS_max_write_buffer_size_to_maintain;
+  options.memtable_prefix_bloom_size_ratio =
+      FLAGS_memtable_prefix_bloom_size_ratio;
+  options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
+  options.disable_auto_compactions = FLAGS_disable_auto_compactions;
+  options.max_background_compactions = FLAGS_max_background_compactions;
+  options.max_background_flushes = FLAGS_max_background_flushes;
+  options.compaction_style =
+      static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(FLAGS_compaction_style);
+  options.compaction_pri =
+      static_cast<ROCKSDB_NAMESPACE::CompactionPri>(FLAGS_compaction_pri);
+  options.num_levels = FLAGS_num_levels;
+  if (FLAGS_prefix_size >= 0) {
+    options.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
+  }
+  options.max_open_files = FLAGS_open_files;
+  options.statistics = dbstats;
+  options.env = db_stress_env;
+  options.use_fsync = FLAGS_use_fsync;
+  options.compaction_readahead_size = FLAGS_compaction_readahead_size;
+  options.allow_mmap_reads = FLAGS_mmap_read;
+  options.allow_mmap_writes = FLAGS_mmap_write;
+  options.use_direct_reads = FLAGS_use_direct_reads;
+  options.use_direct_io_for_flush_and_compaction =
+      FLAGS_use_direct_io_for_flush_and_compaction;
+  options.recycle_log_file_num =
+      static_cast<size_t>(FLAGS_recycle_log_file_num);
+  options.target_file_size_base = FLAGS_target_file_size_base;
+  options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
+  options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+  options.max_bytes_for_level_multiplier = FLAGS_max_bytes_for_level_multiplier;
+  options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
+  options.level0_slowdown_writes_trigger = FLAGS_level0_slowdown_writes_trigger;
+  options.level0_file_num_compaction_trigger =
+      FLAGS_level0_file_num_compaction_trigger;
+  options.compression = compression_type_e;
+  options.bottommost_compression = bottommost_compression_type_e;
+  options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
+  options.compression_opts.zstd_max_train_bytes =
+      FLAGS_compression_zstd_max_train_bytes;
+  options.compression_opts.parallel_threads =
+      FLAGS_compression_parallel_threads;
+  options.compression_opts.max_dict_buffer_bytes =
+      FLAGS_compression_max_dict_buffer_bytes;
+  if (ZSTD_FinalizeDictionarySupported()) {
+    options.compression_opts.use_zstd_dict_trainer =
+        FLAGS_compression_use_zstd_dict_trainer;
+  } else if (!FLAGS_compression_use_zstd_dict_trainer) {
+    fprintf(
+        stderr,
+        "WARNING: use_zstd_dict_trainer is false but zstd finalizeDictionary "
+        "cannot be used because ZSTD 1.4.5+ is not linked with the binary."
+        " zstd dictionary trainer will be used.\n");
+  }
+  options.max_manifest_file_size = FLAGS_max_manifest_file_size;
+  options.inplace_update_support = FLAGS_in_place_update;
+  options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
+  options.allow_concurrent_memtable_write =
+      FLAGS_allow_concurrent_memtable_write;
+  options.experimental_mempurge_threshold =
+      FLAGS_experimental_mempurge_threshold;
+  options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
+  options.stats_dump_period_sec =
+      static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
+  options.ttl = FLAGS_compaction_ttl;
+  options.enable_pipelined_write = FLAGS_enable_pipelined_write;
+  options.enable_write_thread_adaptive_yield =
+      FLAGS_enable_write_thread_adaptive_yield;
+  options.compaction_options_universal.size_ratio = FLAGS_universal_size_ratio;
+  options.compaction_options_universal.min_merge_width =
+      FLAGS_universal_min_merge_width;
+  options.compaction_options_universal.max_merge_width =
+      FLAGS_universal_max_merge_width;
+  options.compaction_options_universal.max_size_amplification_percent =
+      FLAGS_universal_max_size_amplification_percent;
+  options.atomic_flush = FLAGS_atomic_flush;
+  options.manual_wal_flush = FLAGS_manual_wal_flush_one_in > 0 ? true : false;
+  options.avoid_unnecessary_blocking_io = FLAGS_avoid_unnecessary_blocking_io;
+  options.write_dbid_to_manifest = FLAGS_write_dbid_to_manifest;
+  options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;
+  options.max_write_batch_group_size_bytes =
+      FLAGS_max_write_batch_group_size_bytes;
+  options.level_compaction_dynamic_level_bytes =
+      FLAGS_level_compaction_dynamic_level_bytes;
+  options.track_and_verify_wals_in_manifest = true;
+  options.verify_sst_unique_id_in_manifest =
+      FLAGS_verify_sst_unique_id_in_manifest;
+  options.memtable_protection_bytes_per_key =
+      FLAGS_memtable_protection_bytes_per_key;
+
+  // Integrated BlobDB
+  options.enable_blob_files = FLAGS_enable_blob_files;
+  options.min_blob_size = FLAGS_min_blob_size;
+  options.blob_file_size = FLAGS_blob_file_size;
+  options.blob_compression_type =
+      StringToCompressionType(FLAGS_blob_compression_type.c_str());
+  options.enable_blob_garbage_collection = FLAGS_enable_blob_garbage_collection;
+  options.blob_garbage_collection_age_cutoff =
+      FLAGS_blob_garbage_collection_age_cutoff;
+  options.blob_garbage_collection_force_threshold =
+      FLAGS_blob_garbage_collection_force_threshold;
+  options.blob_compaction_readahead_size = FLAGS_blob_compaction_readahead_size;
+  options.blob_file_starting_level = FLAGS_blob_file_starting_level;
+
+  if (FLAGS_use_blob_cache) {
+    if (FLAGS_use_shared_block_and_blob_cache) {
+      options.blob_cache = cache;
+    } else {
+      if (FLAGS_blob_cache_size > 0) {
+        LRUCacheOptions co;
+        co.capacity = FLAGS_blob_cache_size;
+        co.num_shard_bits = FLAGS_blob_cache_numshardbits;
+        options.blob_cache = NewLRUCache(co);
+      } else {
+        fprintf(stderr,
+                "Unable to create a standalone blob cache if blob_cache_size "
+                "<= 0.\n");
+        exit(1);
+      }
+    }
+    switch (FLAGS_prepopulate_blob_cache) {
+      case 0:
+        options.prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
+        break;
+      case 1:
+        options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+        break;
+      default:
+        fprintf(stderr, "Unknown prepopulate blob cache mode\n");
+        exit(1);
+    }
+  }
+
+  options.wal_compression =
+      StringToCompressionType(FLAGS_wal_compression.c_str());
+
+  if (FLAGS_enable_tiered_storage) {
+    options.bottommost_temperature = Temperature::kCold;
+  }
+  options.preclude_last_level_data_seconds =
+      FLAGS_preclude_last_level_data_seconds;
+  options.preserve_internal_time_seconds = FLAGS_preserve_internal_time_seconds;
+
+  switch (FLAGS_rep_factory) {
+    case kSkipList:
+      // no need to do anything
+      break;
+#ifndef ROCKSDB_LITE
+    case kHashSkipList:
+      options.memtable_factory.reset(NewHashSkipListRepFactory(10000));
+      break;
+    case kVectorRep:
+      options.memtable_factory.reset(new VectorRepFactory());
+      break;
+#else
+    default:
+      fprintf(stderr,
+              "RocksdbLite only supports skip list mem table. Skip "
+              "--rep_factory\n");
+#endif  // ROCKSDB_LITE
+  }
+
+  if (FLAGS_use_full_merge_v1) {
+    options.merge_operator = MergeOperators::CreateDeprecatedPutOperator();
+  } else {
+    options.merge_operator = MergeOperators::CreatePutOperator();
+  }
+
+  if (FLAGS_enable_compaction_filter) {
+    options.compaction_filter_factory =
+        std::make_shared<DbStressCompactionFilterFactory>();
+  }
+
+  options.best_efforts_recovery = FLAGS_best_efforts_recovery;
+  options.paranoid_file_checks = FLAGS_paranoid_file_checks;
+  options.fail_if_options_file_error = FLAGS_fail_if_options_file_error;
+
+  if (FLAGS_user_timestamp_size > 0) {
+    CheckAndSetOptionsForUserTimestamp(options);
+  }
+
+  options.allow_data_in_errors = FLAGS_allow_data_in_errors;
+}
+
+void InitializeOptionsGeneral(
+    const std::shared_ptr<Cache>& cache,
+    const std::shared_ptr<Cache>& block_cache_compressed,
+    const std::shared_ptr<const FilterPolicy>& filter_policy,
+    Options& options) {
+  options.create_missing_column_families = true;
+  options.create_if_missing = true;
+
+  if (!options.statistics) {
+    options.statistics = dbstats;
+  }
+
+  if (options.env == Options().env) {
+    options.env = db_stress_env;
+  }
+
+  assert(options.table_factory);
+  auto table_options =
+      options.table_factory->GetOptions<BlockBasedTableOptions>();
+  if (table_options) {
+    if (FLAGS_cache_size > 0) {
+      table_options->block_cache = cache;
+    }
+    if (!table_options->block_cache_compressed &&
+        FLAGS_compressed_cache_size > 0) {
+      table_options->block_cache_compressed = block_cache_compressed;
+    }
+    if (!table_options->filter_policy) {
+      table_options->filter_policy = filter_policy;
+    }
+  }
+
+  // TODO: row_cache, thread-pool IO priority, CPU priority.
+
+  if (!options.rate_limiter) {
+    if (FLAGS_rate_limiter_bytes_per_sec > 0) {
+      options.rate_limiter.reset(NewGenericRateLimiter(
+          FLAGS_rate_limiter_bytes_per_sec, 1000 /* refill_period_us */,
+          10 /* fairness */,
+          FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
+                                    : RateLimiter::Mode::kWritesOnly));
+    }
+  }
+
+  if (!options.file_checksum_gen_factory) {
+    options.file_checksum_gen_factory =
+        GetFileChecksumImpl(FLAGS_file_checksum_impl);
+  }
+
+  if (FLAGS_sst_file_manager_bytes_per_sec > 0 ||
+      FLAGS_sst_file_manager_bytes_per_truncate > 0) {
+    Status status;
+    options.sst_file_manager.reset(NewSstFileManager(
+        db_stress_env, options.info_log, "" /* trash_dir */,
+        static_cast<int64_t>(FLAGS_sst_file_manager_bytes_per_sec),
+        true /* delete_existing_trash */, &status,
+        0.25 /* max_trash_db_ratio */,
+        FLAGS_sst_file_manager_bytes_per_truncate));
+    if (!status.ok()) {
+      fprintf(stderr, "SstFileManager creation failed: %s\n",
+              status.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  if (FLAGS_preserve_unverified_changes) {
+    if (!options.avoid_flush_during_recovery) {
+      fprintf(stderr,
+              "WARNING: flipping `avoid_flush_during_recovery` to true for "
+              "`preserve_unverified_changes` to keep all files\n");
+      options.avoid_flush_during_recovery = true;
+    }
+    // Together with `avoid_flush_during_recovery == true`, this will prevent
+    // live files from becoming obsolete and deleted between `DB::Open()` and
+    // `DisableFileDeletions()` due to flush or compaction. We do not need to
+    // warn the user since we will reenable compaction soon.
+    options.disable_auto_compactions = true;
+  }
+
+  options.table_properties_collector_factories.emplace_back(
+      std::make_shared<DbStressTablePropertiesCollectorFactory>());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_test_base.h b/src/rocksdb/db_stress_tool/db_stress_test_base.h
new file mode 100644
index 000000000..81fbbe24b
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_test_base.h
@@ -0,0 +1,337 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#pragma once
+
+#include "db_stress_tool/db_stress_common.h"
+#include "db_stress_tool/db_stress_shared_state.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+class Transaction;
+class TransactionDB;
+struct TransactionDBOptions;
+
+class StressTest {
+ public:
+  StressTest();
+
+  virtual ~StressTest();
+
+  std::shared_ptr<Cache> NewCache(size_t capacity, int32_t num_shard_bits);
+
+  static std::vector<std::string> GetBlobCompressionTags();
+
+  bool BuildOptionsTable();
+
+  void InitDb(SharedState*);
+  // The initialization work is split into two parts to avoid a circular
+  // dependency with `SharedState`.
+  virtual void FinishInitDb(SharedState*);
+  void TrackExpectedState(SharedState* shared);
+  void OperateDb(ThreadState* thread);
+  virtual void VerifyDb(ThreadState* thread) const = 0;
+  virtual void ContinuouslyVerifyDb(ThreadState* /*thread*/) const = 0;
+  void PrintStatistics();
+
+ protected:
+  Status AssertSame(DB* db, ColumnFamilyHandle* cf,
+                    ThreadState::SnapshotState& snap_state);
+
+  // Currently PreloadDb has to be single-threaded.
+  void PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
+                                    SharedState* shared);
+
+  Status SetOptions(ThreadState* thread);
+
+#ifndef ROCKSDB_LITE
+  // For transactionsDB, there can be txns prepared but not yet committeed
+  // right before previous stress run crash.
+  // They will be recovered and processed through
+  // ProcessRecoveredPreparedTxnsHelper on the start of current stress run.
+  void ProcessRecoveredPreparedTxns(SharedState* shared);
+
+  // Default implementation will first update ExpectedState to be
+  // `SharedState::UNKNOWN` for each keys in `txn` and then randomly
+  // commit or rollback `txn`.
+  virtual void ProcessRecoveredPreparedTxnsHelper(Transaction* txn,
+                                                  SharedState* shared);
+
+  Status NewTxn(WriteOptions& write_opts, Transaction** txn);
+
+  Status CommitTxn(Transaction* txn, ThreadState* thread = nullptr);
+
+  Status RollbackTxn(Transaction* txn);
+#endif
+
+  virtual void MaybeClearOneColumnFamily(ThreadState* /* thread */) {}
+
+  virtual bool ShouldAcquireMutexOnKey() const { return false; }
+
+  // Returns true if DB state is tracked by the stress test.
+  virtual bool IsStateTracked() const = 0;
+
+  virtual std::vector<int> GenerateColumnFamilies(
+      const int /* num_column_families */, int rand_column_family) const {
+    return {rand_column_family};
+  }
+
+  virtual std::vector<int64_t> GenerateKeys(int64_t rand_key) const {
+    return {rand_key};
+  }
+
+  virtual Status TestGet(ThreadState* thread, const ReadOptions& read_opts,
+                         const std::vector<int>& rand_column_families,
+                         const std::vector<int64_t>& rand_keys) = 0;
+
+  virtual std::vector<Status> TestMultiGet(
+      ThreadState* thread, const ReadOptions& read_opts,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) = 0;
+
+  virtual Status TestPrefixScan(ThreadState* thread,
+                                const ReadOptions& read_opts,
+                                const std::vector<int>& rand_column_families,
+                                const std::vector<int64_t>& rand_keys) = 0;
+
+  virtual Status TestPut(ThreadState* thread, WriteOptions& write_opts,
+                         const ReadOptions& read_opts,
+                         const std::vector<int>& cf_ids,
+                         const std::vector<int64_t>& keys,
+                         char (&value)[100]) = 0;
+
+  virtual Status TestDelete(ThreadState* thread, WriteOptions& write_opts,
+                            const std::vector<int>& rand_column_families,
+                            const std::vector<int64_t>& rand_keys) = 0;
+
+  virtual Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts,
+                                 const std::vector<int>& rand_column_families,
+                                 const std::vector<int64_t>& rand_keys) = 0;
+
+  virtual void TestIngestExternalFile(
+      ThreadState* thread, const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) = 0;
+
+  // Issue compact range, starting with start_key, whose integer value
+  // is rand_key.
+  virtual void TestCompactRange(ThreadState* thread, int64_t rand_key,
+                                const Slice& start_key,
+                                ColumnFamilyHandle* column_family);
+
+  // Calculate a hash value for all keys in range [start_key, end_key]
+  // at a certain snapshot.
+  uint32_t GetRangeHash(ThreadState* thread, const Snapshot* snapshot,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& start_key, const Slice& end_key);
+
+  // Return a column family handle that mirrors what is pointed by
+  // `column_family_id`, which will be used to validate data to be correct.
+  // By default, the column family itself will be returned.
+  virtual ColumnFamilyHandle* GetControlCfh(ThreadState* /* thread*/,
+                                            int column_family_id) {
+    return column_families_[column_family_id];
+  }
+
+#ifndef ROCKSDB_LITE
+  // Generated a list of keys that close to boundaries of SST keys.
+  // If there isn't any SST file in the DB, return empty list.
+  std::vector<std::string> GetWhiteBoxKeys(ThreadState* thread, DB* db,
+                                           ColumnFamilyHandle* cfh,
+                                           size_t num_keys);
+#else   // !ROCKSDB_LITE
+  std::vector<std::string> GetWhiteBoxKeys(ThreadState*, DB*,
+                                           ColumnFamilyHandle*, size_t) {
+    // Not supported in LITE mode.
+    return {};
+  }
+#endif  // !ROCKSDB_LITE
+
+  // Given a key K, this creates an iterator which scans to K and then
+  // does a random sequence of Next/Prev operations.
+  virtual Status TestIterate(ThreadState* thread, const ReadOptions& read_opts,
+                             const std::vector<int>& rand_column_families,
+                             const std::vector<int64_t>& rand_keys);
+
+  virtual Status TestIterateAgainstExpected(
+      ThreadState* /* thread */, const ReadOptions& /* read_opts */,
+      const std::vector<int>& /* rand_column_families */,
+      const std::vector<int64_t>& /* rand_keys */) {
+    return Status::NotSupported();
+  }
+
+  // Enum used by VerifyIterator() to identify the mode to validate.
+  enum LastIterateOp {
+    kLastOpSeek,
+    kLastOpSeekForPrev,
+    kLastOpNextOrPrev,
+    kLastOpSeekToFirst,
+    kLastOpSeekToLast
+  };
+
+  // Compare the two iterator, iter and cmp_iter are in the same position,
+  // unless iter might be made invalidate or undefined because of
+  // upper or lower bounds, or prefix extractor.
+  // Will flag failure if the verification fails.
+  // diverged = true if the two iterator is already diverged.
+  // True if verification passed, false if not.
+  // op_logs is the information to print when validation fails.
+  void VerifyIterator(ThreadState* thread, ColumnFamilyHandle* cmp_cfh,
+                      const ReadOptions& ro, Iterator* iter, Iterator* cmp_iter,
+                      LastIterateOp op, const Slice& seek_key,
+                      const std::string& op_logs, bool* diverged);
+
+  virtual Status TestBackupRestore(ThreadState* thread,
+                                   const std::vector<int>& rand_column_families,
+                                   const std::vector<int64_t>& rand_keys);
+
+  virtual Status TestCheckpoint(ThreadState* thread,
+                                const std::vector<int>& rand_column_families,
+                                const std::vector<int64_t>& rand_keys);
+
+  void TestCompactFiles(ThreadState* thread, ColumnFamilyHandle* column_family);
+
+  Status TestFlush(const std::vector<int>& rand_column_families);
+
+  Status TestPauseBackground(ThreadState* thread);
+
+  void TestAcquireSnapshot(ThreadState* thread, int rand_column_family,
+                           const std::string& keystr, uint64_t i);
+
+  Status MaybeReleaseSnapshots(ThreadState* thread, uint64_t i);
+#ifndef ROCKSDB_LITE
+  Status VerifyGetLiveFiles() const;
+  Status VerifyGetSortedWalFiles() const;
+  Status VerifyGetCurrentWalFile() const;
+  void TestGetProperty(ThreadState* thread) const;
+
+  virtual Status TestApproximateSize(
+      ThreadState* thread, uint64_t iteration,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys);
+#endif  // !ROCKSDB_LITE
+
+  virtual Status TestCustomOperations(
+      ThreadState* /*thread*/,
+      const std::vector<int>& /*rand_column_families*/) {
+    return Status::NotSupported("TestCustomOperations() must be overridden");
+  }
+
+  void VerificationAbort(SharedState* shared, std::string msg, Status s) const;
+
+  void VerificationAbort(SharedState* shared, std::string msg, int cf,
+                         int64_t key) const;
+
+  void VerificationAbort(SharedState* shared, std::string msg, int cf,
+                         int64_t key, Slice value_from_db,
+                         Slice value_from_expected) const;
+
+  void VerificationAbort(SharedState* shared, int cf, int64_t key,
+                         const Slice& value, const WideColumns& columns,
+                         const WideColumns& expected_columns) const;
+
+  static std::string DebugString(const Slice& value, const WideColumns& columns,
+                                 const WideColumns& expected_columns);
+
+  void PrintEnv() const;
+
+  void Open(SharedState* shared);
+
+  void Reopen(ThreadState* thread);
+
+  virtual void RegisterAdditionalListeners() {}
+
+#ifndef ROCKSDB_LITE
+  virtual void PrepareTxnDbOptions(SharedState* /*shared*/,
+                                   TransactionDBOptions& /*txn_db_opts*/) {}
+#endif
+
+  // Returns whether the timestamp of read_opts is updated.
+  bool MaybeUseOlderTimestampForPointLookup(ThreadState* thread,
+                                            std::string& ts_str,
+                                            Slice& ts_slice,
+                                            ReadOptions& read_opts);
+
+  void MaybeUseOlderTimestampForRangeScan(ThreadState* thread,
+                                          std::string& ts_str, Slice& ts_slice,
+                                          ReadOptions& read_opts);
+
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<Cache> compressed_cache_;
+  std::shared_ptr<const FilterPolicy> filter_policy_;
+  DB* db_;
+#ifndef ROCKSDB_LITE
+  TransactionDB* txn_db_;
+#endif
+
+  // Currently only used in MultiOpsTxnsStressTest
+  std::atomic<DB*> db_aptr_;
+
+  Options options_;
+  SystemClock* clock_;
+  std::vector<ColumnFamilyHandle*> column_families_;
+  std::vector<std::string> column_family_names_;
+  std::atomic<int> new_column_family_name_;
+  int num_times_reopened_;
+  std::unordered_map<std::string, std::vector<std::string>> options_table_;
+  std::vector<std::string> options_index_;
+  std::atomic<bool> db_preload_finished_;
+
+  // Fields used for continuous verification from another thread
+  DB* cmp_db_;
+  std::vector<ColumnFamilyHandle*> cmp_cfhs_;
+  bool is_db_stopped_;
+};
+
+// Load options from OPTIONS file and populate `options`.
+extern bool InitializeOptionsFromFile(Options& options);
+
+// Initialize `options` using command line arguments.
+// When this function is called, `cache`, `block_cache_compressed`,
+// `filter_policy` have all been initialized. Therefore, we just pass them as
+// input arguments.
+extern void InitializeOptionsFromFlags(
+    const std::shared_ptr<Cache>& cache,
+    const std::shared_ptr<Cache>& block_cache_compressed,
+    const std::shared_ptr<const FilterPolicy>& filter_policy, Options& options);
+
+// Initialize `options` on which `InitializeOptionsFromFile()` and
+// `InitializeOptionsFromFlags()` have both been called already.
+// There are two cases.
+// Case 1: OPTIONS file is not specified. Command line arguments have been used
+//         to initialize `options`. InitializeOptionsGeneral() will use
+//         `cache`, `block_cache_compressed` and `filter_policy` to initialize
+//         corresponding fields of `options`. InitializeOptionsGeneral() will
+//         also set up other fields of `options` so that stress test can run.
+//         Examples include `create_if_missing` and
+//         `create_missing_column_families`, etc.
+// Case 2: OPTIONS file is specified. It is possible that, after loading from
+//         the given OPTIONS files, some shared object fields are still not
+//         initialized because they are not set in the OPTIONS file. In this
+//         case, if command line arguments indicate that the user wants to set
+//         up such shared objects, e.g. block cache, compressed block cache,
+//         row cache, filter policy, then InitializeOptionsGeneral() will honor
+//         the user's choice, thus passing `cache`, `block_cache_compressed`,
+//         `filter_policy` as input arguments.
+//
+// InitializeOptionsGeneral() must not overwrite fields of `options` loaded
+// from OPTIONS file.
+extern void InitializeOptionsGeneral(
+    const std::shared_ptr<Cache>& cache,
+    const std::shared_ptr<Cache>& block_cache_compressed,
+    const std::shared_ptr<const FilterPolicy>& filter_policy, Options& options);
+
+// If no OPTIONS file is specified, set up `options` so that we can test
+// user-defined timestamp which requires `-user_timestamp_size=8`.
+// This function also checks for known (currently) incompatible features with
+// user-defined timestamp.
+extern void CheckAndSetOptionsForUserTimestamp(Options& options);
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/db_stress_tool.cc b/src/rocksdb/db_stress_tool/db_stress_tool.cc
new file mode 100644
index 000000000..6c5e952db
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/db_stress_tool.cc
@@ -0,0 +1,365 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The test uses an array to compare against values written to the database.
+// Keys written to the array are in 1:1 correspondence to the actual values in
+// the database according to the formula in the function GenerateValue.
+
+// Space is reserved in the array from 0 to FLAGS_max_key and values are
+// randomly written/deleted/read from those positions. During verification we
+// compare all the positions in the array. To shorten/elongate the running
+// time, you could change the settings: FLAGS_max_key, FLAGS_ops_per_thread,
+// (sometimes also FLAGS_threads).
+//
+// NOTE that if FLAGS_test_batches_snapshots is set, the test will have
+// different behavior. See comment of the flag for details.
+
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_common.h"
+#include "db_stress_tool/db_stress_driver.h"
+#include "rocksdb/convenience.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
+static std::shared_ptr<ROCKSDB_NAMESPACE::DbStressEnvWrapper> env_wrapper_guard;
+static std::shared_ptr<ROCKSDB_NAMESPACE::DbStressEnvWrapper>
+    dbsl_env_wrapper_guard;
+static std::shared_ptr<CompositeEnvWrapper> fault_env_guard;
+}  // namespace
+
+KeyGenContext key_gen_ctx;
+
+int db_stress_tool(int argc, char** argv) {
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  SanitizeDoubleParam(&FLAGS_bloom_bits);
+  SanitizeDoubleParam(&FLAGS_memtable_prefix_bloom_size_ratio);
+  SanitizeDoubleParam(&FLAGS_max_bytes_for_level_multiplier);
+
+#ifndef NDEBUG
+  if (FLAGS_mock_direct_io) {
+    SetupSyncPointsToMockDirectIO();
+  }
+#endif
+  if (FLAGS_statistics) {
+    dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    if (FLAGS_test_secondary) {
+      dbstats_secondaries = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    }
+  }
+  compression_type_e = StringToCompressionType(FLAGS_compression_type.c_str());
+  bottommost_compression_type_e =
+      StringToCompressionType(FLAGS_bottommost_compression_type.c_str());
+  checksum_type_e = StringToChecksumType(FLAGS_checksum_type.c_str());
+
+  Env* raw_env;
+
+  int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
+  if (env_opts > 1) {
+    fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
+    exit(1);
+  }
+
+  Status s = Env::CreateFromUri(ConfigOptions(), FLAGS_env_uri, FLAGS_fs_uri,
+                                &raw_env, &env_guard);
+  if (!s.ok()) {
+    fprintf(stderr, "Error Creating Env URI: %s: %s\n", FLAGS_env_uri.c_str(),
+            s.ToString().c_str());
+    exit(1);
+  }
+  dbsl_env_wrapper_guard = std::make_shared<DbStressEnvWrapper>(raw_env);
+  db_stress_listener_env = dbsl_env_wrapper_guard.get();
+
+  if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection ||
+      FLAGS_write_fault_one_in || FLAGS_open_metadata_write_fault_one_in ||
+      FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) {
+    FaultInjectionTestFS* fs =
+        new FaultInjectionTestFS(raw_env->GetFileSystem());
+    fault_fs_guard.reset(fs);
+    if (FLAGS_write_fault_one_in) {
+      fault_fs_guard->SetFilesystemDirectWritable(false);
+    } else {
+      fault_fs_guard->SetFilesystemDirectWritable(true);
+    }
+    fault_env_guard =
+        std::make_shared<CompositeEnvWrapper>(raw_env, fault_fs_guard);
+    raw_env = fault_env_guard.get();
+  }
+
+  env_wrapper_guard = std::make_shared<DbStressEnvWrapper>(raw_env);
+  db_stress_env = env_wrapper_guard.get();
+
+  if (FLAGS_write_fault_one_in) {
+    // In the write injection case, we need to use the FS interface and returns
+    // the IOStatus with different error and flags. Therefore,
+    // DbStressEnvWrapper cannot be used which will swallow the FS
+    // implementations. We should directly use the raw_env which is the
+    // CompositeEnvWrapper of env and fault_fs.
+    db_stress_env = raw_env;
+  }
+
+  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
+
+  // The number of background threads should be at least as much the
+  // max number of concurrent compactions.
+  db_stress_env->SetBackgroundThreads(FLAGS_max_background_compactions,
+                                      ROCKSDB_NAMESPACE::Env::Priority::LOW);
+  db_stress_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
+                                      ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
+  if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size < 0) {
+    fprintf(stderr,
+            "Error: prefixpercent is non-zero while prefix_size is "
+            "not positive!\n");
+    exit(1);
+  }
+  if (FLAGS_test_batches_snapshots && FLAGS_prefix_size <= 0) {
+    fprintf(stderr,
+            "Error: please specify prefix_size for "
+            "test_batches_snapshots test!\n");
+    exit(1);
+  }
+  if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0 &&
+      !FLAGS_memtable_whole_key_filtering) {
+    fprintf(stderr,
+            "Error: please specify positive prefix_size or enable whole key "
+            "filtering in order to use memtable_prefix_bloom_size_ratio\n");
+    exit(1);
+  }
+  if ((FLAGS_readpercent + FLAGS_prefixpercent + FLAGS_writepercent +
+       FLAGS_delpercent + FLAGS_delrangepercent + FLAGS_iterpercent +
+       FLAGS_customopspercent) != 100) {
+    fprintf(
+        stderr,
+        "Error: "
+        "Read(-readpercent=%d)+Prefix(-prefixpercent=%d)+Write(-writepercent=%"
+        "d)+Delete(-delpercent=%d)+DeleteRange(-delrangepercent=%d)"
+        "+Iterate(-iterpercent=%d)+CustomOps(-customopspercent=%d) percents != "
+        "100!\n",
+        FLAGS_readpercent, FLAGS_prefixpercent, FLAGS_writepercent,
+        FLAGS_delpercent, FLAGS_delrangepercent, FLAGS_iterpercent,
+        FLAGS_customopspercent);
+    exit(1);
+  }
+  if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) {
+    fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n");
+    exit(1);
+  }
+  if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) {
+    fprintf(stderr,
+            "Error: #DB-reopens should be < ops_per_thread\n"
+            "Provided reopens = %d and ops_per_thread = %lu\n",
+            FLAGS_reopen, (unsigned long)FLAGS_ops_per_thread);
+    exit(1);
+  }
+  if (FLAGS_test_batches_snapshots && FLAGS_delrangepercent > 0) {
+    fprintf(stderr,
+            "Error: nonzero delrangepercent unsupported in "
+            "test_batches_snapshots mode\n");
+    exit(1);
+  }
+  if (FLAGS_active_width > FLAGS_max_key) {
+    fprintf(stderr, "Error: active_width can be at most max_key\n");
+    exit(1);
+  } else if (FLAGS_active_width == 0) {
+    FLAGS_active_width = FLAGS_max_key;
+  }
+  if (FLAGS_value_size_mult * kRandomValueMaxFactor > kValueMaxLen) {
+    fprintf(stderr, "Error: value_size_mult can be at most %d\n",
+            kValueMaxLen / kRandomValueMaxFactor);
+    exit(1);
+  }
+  if (FLAGS_use_merge && FLAGS_nooverwritepercent == 100) {
+    fprintf(
+        stderr,
+        "Error: nooverwritepercent must not be 100 when using merge operands");
+    exit(1);
+  }
+  if (FLAGS_ingest_external_file_one_in > 0 &&
+      FLAGS_nooverwritepercent == 100) {
+    fprintf(
+        stderr,
+        "Error: nooverwritepercent must not be 100 when using file ingestion");
+    exit(1);
+  }
+  if (FLAGS_clear_column_family_one_in > 0 && FLAGS_backup_one_in > 0) {
+    fprintf(stderr,
+            "Error: clear_column_family_one_in must be 0 when using backup\n");
+    exit(1);
+  }
+  if (FLAGS_test_cf_consistency && FLAGS_disable_wal) {
+    FLAGS_atomic_flush = true;
+  }
+
+  if (FLAGS_read_only) {
+    if (FLAGS_writepercent != 0 || FLAGS_delpercent != 0 ||
+        FLAGS_delrangepercent != 0) {
+      fprintf(stderr, "Error: updates are not supported in read only mode\n");
+      exit(1);
+    } else if (FLAGS_checkpoint_one_in > 0 &&
+               FLAGS_clear_column_family_one_in > 0) {
+      fprintf(stdout,
+              "Warn: checkpoint won't be validated since column families may "
+              "be dropped.\n");
+    }
+  }
+
+  // Choose a location for the test database if none given with --db=<path>
+  if (FLAGS_db.empty()) {
+    std::string default_db_path;
+    db_stress_env->GetTestDirectory(&default_db_path);
+    default_db_path += "/dbstress";
+    FLAGS_db = default_db_path;
+  }
+
+  if ((FLAGS_test_secondary || FLAGS_continuous_verification_interval > 0) &&
+      FLAGS_secondaries_base.empty()) {
+    std::string default_secondaries_path;
+    db_stress_env->GetTestDirectory(&default_secondaries_path);
+    default_secondaries_path += "/dbstress_secondaries";
+    s = db_stress_env->CreateDirIfMissing(default_secondaries_path);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to create directory %s: %s\n",
+              default_secondaries_path.c_str(), s.ToString().c_str());
+      exit(1);
+    }
+    FLAGS_secondaries_base = default_secondaries_path;
+  }
+
+  if (FLAGS_best_efforts_recovery && !FLAGS_skip_verifydb &&
+      !FLAGS_disable_wal) {
+    fprintf(stderr,
+            "With best-efforts recovery, either skip_verifydb or disable_wal "
+            "should be set to true.\n");
+    exit(1);
+  }
+  if (FLAGS_skip_verifydb) {
+    if (FLAGS_verify_db_one_in > 0) {
+      fprintf(stderr,
+              "Must set -verify_db_one_in=0 if skip_verifydb is true.\n");
+      exit(1);
+    }
+    if (FLAGS_continuous_verification_interval > 0) {
+      fprintf(stderr,
+              "Must set -continuous_verification_interval=0 if skip_verifydb "
+              "is true.\n");
+      exit(1);
+    }
+  }
+  if (FLAGS_enable_compaction_filter &&
+      (FLAGS_acquire_snapshot_one_in > 0 || FLAGS_compact_range_one_in > 0 ||
+       FLAGS_iterpercent > 0 || FLAGS_test_batches_snapshots ||
+       FLAGS_test_cf_consistency)) {
+    fprintf(
+        stderr,
+        "Error: acquire_snapshot_one_in, compact_range_one_in, iterpercent, "
+        "test_batches_snapshots  must all be 0 when using compaction filter\n");
+    exit(1);
+  }
+  if (FLAGS_test_multi_ops_txns) {
+    CheckAndSetOptionsForMultiOpsTxnStressTest();
+  }
+
+  if (FLAGS_create_timestamped_snapshot_one_in > 0) {
+    if (!FLAGS_use_txn) {
+      fprintf(stderr, "timestamped snapshot supported only in TransactionDB\n");
+      exit(1);
+    } else if (FLAGS_txn_write_policy != 0) {
+      fprintf(stderr,
+              "timestamped snapshot supported only in write-committed\n");
+      exit(1);
+    }
+  }
+
+  if (FLAGS_preserve_unverified_changes && FLAGS_reopen != 0) {
+    fprintf(stderr,
+            "Reopen DB is incompatible with preserving unverified changes\n");
+    exit(1);
+  }
+
+  if (FLAGS_use_txn && FLAGS_sync_fault_injection &&
+      FLAGS_txn_write_policy != 0) {
+    fprintf(stderr,
+            "For TransactionDB, correctness testing with unsync data loss is "
+            "currently compatible with only write committed policy\n");
+    exit(1);
+  }
+
+  if (FLAGS_use_put_entity_one_in > 0 &&
+      (FLAGS_ingest_external_file_one_in > 0 || FLAGS_use_merge ||
+       FLAGS_use_full_merge_v1 || FLAGS_use_txn || FLAGS_test_multi_ops_txns ||
+       FLAGS_user_timestamp_size > 0)) {
+    fprintf(stderr,
+            "PutEntity is currently incompatible with SstFileWriter, Merge,"
+            " transactions, and user-defined timestamps\n");
+    exit(1);
+  }
+
+#ifndef NDEBUG
+  KillPoint* kp = KillPoint::GetInstance();
+  kp->rocksdb_kill_odds = FLAGS_kill_random_test;
+  kp->rocksdb_kill_exclude_prefixes = SplitString(FLAGS_kill_exclude_prefixes);
+#endif
+
+  unsigned int levels = FLAGS_max_key_len;
+  std::vector<std::string> weights;
+  uint64_t scale_factor = FLAGS_key_window_scale_factor;
+  key_gen_ctx.window = scale_factor * 100;
+  if (!FLAGS_key_len_percent_dist.empty()) {
+    weights = SplitString(FLAGS_key_len_percent_dist);
+    if (weights.size() != levels) {
+      fprintf(stderr,
+              "Number of weights in key_len_dist should be equal to"
+              " max_key_len");
+      exit(1);
+    }
+
+    uint64_t total_weight = 0;
+    for (std::string& weight : weights) {
+      uint64_t val = std::stoull(weight);
+      key_gen_ctx.weights.emplace_back(val * scale_factor);
+      total_weight += val;
+    }
+    if (total_weight != 100) {
+      fprintf(stderr, "Sum of all weights in key_len_dist should be 100");
+      exit(1);
+    }
+  } else {
+    uint64_t keys_per_level = key_gen_ctx.window / levels;
+    for (unsigned int level = 0; level + 1 < levels; ++level) {
+      key_gen_ctx.weights.emplace_back(keys_per_level);
+    }
+    key_gen_ctx.weights.emplace_back(key_gen_ctx.window -
+                                     keys_per_level * (levels - 1));
+  }
+
+  std::unique_ptr<ROCKSDB_NAMESPACE::StressTest> stress;
+  if (FLAGS_test_cf_consistency) {
+    stress.reset(CreateCfConsistencyStressTest());
+  } else if (FLAGS_test_batches_snapshots) {
+    stress.reset(CreateBatchedOpsStressTest());
+  } else if (FLAGS_test_multi_ops_txns) {
+    stress.reset(CreateMultiOpsTxnsStressTest());
+  } else {
+    stress.reset(CreateNonBatchedOpsStressTest());
+  }
+  // Initialize the Zipfian pre-calculated array
+  InitializeHotKeyGenerator(FLAGS_hot_key_alpha);
+  if (RunStressTest(stress.get())) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/expected_state.cc b/src/rocksdb/db_stress_tool/expected_state.cc
new file mode 100644
index 000000000..d08403b76
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/expected_state.cc
@@ -0,0 +1,761 @@
+//  Copyright (c) 2021-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef GFLAGS
+
+#include "db_stress_tool/expected_state.h"
+
+#include "db/wide/wide_column_serialization.h"
+#include "db_stress_tool/db_stress_common.h"
+#include "db_stress_tool/db_stress_shared_state.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record_result.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ExpectedState::ExpectedState(size_t max_key, size_t num_column_families)
+    : max_key_(max_key),
+      num_column_families_(num_column_families),
+      values_(nullptr) {}
+
+void ExpectedState::ClearColumnFamily(int cf) {
+  std::fill(&Value(cf, 0 /* key */), &Value(cf + 1, 0 /* key */),
+            SharedState::DELETION_SENTINEL);
+}
+
+void ExpectedState::Put(int cf, int64_t key, uint32_t value_base,
+                        bool pending) {
+  if (!pending) {
+    // prevent expected-value update from reordering before Write
+    std::atomic_thread_fence(std::memory_order_release);
+  }
+  Value(cf, key).store(pending ? SharedState::UNKNOWN_SENTINEL : value_base,
+                       std::memory_order_relaxed);
+  if (pending) {
+    // prevent Write from reordering before expected-value update
+    std::atomic_thread_fence(std::memory_order_release);
+  }
+}
+
+uint32_t ExpectedState::Get(int cf, int64_t key) const {
+  return Value(cf, key);
+}
+
+bool ExpectedState::Delete(int cf, int64_t key, bool pending) {
+  if (Value(cf, key) == SharedState::DELETION_SENTINEL) {
+    return false;
+  }
+  Put(cf, key, SharedState::DELETION_SENTINEL, pending);
+  return true;
+}
+
+bool ExpectedState::SingleDelete(int cf, int64_t key, bool pending) {
+  return Delete(cf, key, pending);
+}
+
+int ExpectedState::DeleteRange(int cf, int64_t begin_key, int64_t end_key,
+                               bool pending) {
+  int covered = 0;
+  for (int64_t key = begin_key; key < end_key; ++key) {
+    if (Delete(cf, key, pending)) {
+      ++covered;
+    }
+  }
+  return covered;
+}
+
+bool ExpectedState::Exists(int cf, int64_t key) {
+  // UNKNOWN_SENTINEL counts as exists. That assures a key for which overwrite
+  // is disallowed can't be accidentally added a second time, in which case
+  // SingleDelete wouldn't be able to properly delete the key. It does allow
+  // the case where a SingleDelete might be added which covers nothing, but
+  // that's not a correctness issue.
+  uint32_t expected_value = Value(cf, key).load();
+  return expected_value != SharedState::DELETION_SENTINEL;
+}
+
+void ExpectedState::Reset() {
+  for (size_t i = 0; i < num_column_families_; ++i) {
+    for (size_t j = 0; j < max_key_; ++j) {
+      Value(static_cast<int>(i), j)
+          .store(SharedState::DELETION_SENTINEL, std::memory_order_relaxed);
+    }
+  }
+}
+
+FileExpectedState::FileExpectedState(std::string expected_state_file_path,
+                                     size_t max_key, size_t num_column_families)
+    : ExpectedState(max_key, num_column_families),
+      expected_state_file_path_(expected_state_file_path) {}
+
+Status FileExpectedState::Open(bool create) {
+  size_t expected_values_size = GetValuesLen();
+
+  Env* default_env = Env::Default();
+
+  Status status;
+  if (create) {
+    std::unique_ptr<WritableFile> wfile;
+    const EnvOptions soptions;
+    status = default_env->NewWritableFile(expected_state_file_path_, &wfile,
+                                          soptions);
+    if (status.ok()) {
+      std::string buf(expected_values_size, '\0');
+      status = wfile->Append(buf);
+    }
+  }
+  if (status.ok()) {
+    status = default_env->NewMemoryMappedFileBuffer(
+        expected_state_file_path_, &expected_state_mmap_buffer_);
+  }
+  if (status.ok()) {
+    assert(expected_state_mmap_buffer_->GetLen() == expected_values_size);
+    values_ = static_cast<std::atomic<uint32_t>*>(
+        expected_state_mmap_buffer_->GetBase());
+    assert(values_ != nullptr);
+    if (create) {
+      Reset();
+    }
+  } else {
+    assert(values_ == nullptr);
+  }
+  return status;
+}
+
+AnonExpectedState::AnonExpectedState(size_t max_key, size_t num_column_families)
+    : ExpectedState(max_key, num_column_families) {}
+
+#ifndef NDEBUG
+Status AnonExpectedState::Open(bool create) {
+#else
+Status AnonExpectedState::Open(bool /* create */) {
+#endif
+  // AnonExpectedState only supports being freshly created.
+  assert(create);
+  values_allocation_.reset(
+      new std::atomic<uint32_t>[GetValuesLen() /
+                                sizeof(std::atomic<uint32_t>)]);
+  values_ = &values_allocation_[0];
+  Reset();
+  return Status::OK();
+}
+
+ExpectedStateManager::ExpectedStateManager(size_t max_key,
+                                           size_t num_column_families)
+    : max_key_(max_key),
+      num_column_families_(num_column_families),
+      latest_(nullptr) {}
+
+ExpectedStateManager::~ExpectedStateManager() {}
+
+const std::string FileExpectedStateManager::kLatestBasename = "LATEST";
+const std::string FileExpectedStateManager::kStateFilenameSuffix = ".state";
+const std::string FileExpectedStateManager::kTraceFilenameSuffix = ".trace";
+const std::string FileExpectedStateManager::kTempFilenamePrefix = ".";
+const std::string FileExpectedStateManager::kTempFilenameSuffix = ".tmp";
+
+FileExpectedStateManager::FileExpectedStateManager(
+    size_t max_key, size_t num_column_families,
+    std::string expected_state_dir_path)
+    : ExpectedStateManager(max_key, num_column_families),
+      expected_state_dir_path_(std::move(expected_state_dir_path)) {
+  assert(!expected_state_dir_path_.empty());
+}
+
+Status FileExpectedStateManager::Open() {
+  // Before doing anything, sync directory state with ours. That is, determine
+  // `saved_seqno_`, and create any necessary missing files.
+  std::vector<std::string> expected_state_dir_children;
+  Status s = Env::Default()->GetChildren(expected_state_dir_path_,
+                                         &expected_state_dir_children);
+  bool found_trace = false;
+  if (s.ok()) {
+    for (size_t i = 0; i < expected_state_dir_children.size(); ++i) {
+      const auto& filename = expected_state_dir_children[i];
+      if (filename.size() >= kStateFilenameSuffix.size() &&
+          filename.rfind(kStateFilenameSuffix) ==
+              filename.size() - kStateFilenameSuffix.size() &&
+          filename.rfind(kLatestBasename, 0) == std::string::npos) {
+        SequenceNumber found_seqno = ParseUint64(
+            filename.substr(0, filename.size() - kStateFilenameSuffix.size()));
+        if (saved_seqno_ == kMaxSequenceNumber || found_seqno > saved_seqno_) {
+          saved_seqno_ = found_seqno;
+        }
+      }
+    }
+    // Check if crash happened after creating state file but before creating
+    // trace file.
+    if (saved_seqno_ != kMaxSequenceNumber) {
+      std::string saved_seqno_trace_path = GetPathForFilename(
+          std::to_string(saved_seqno_) + kTraceFilenameSuffix);
+      Status exists_status = Env::Default()->FileExists(saved_seqno_trace_path);
+      if (exists_status.ok()) {
+        found_trace = true;
+      } else if (exists_status.IsNotFound()) {
+        found_trace = false;
+      } else {
+        s = exists_status;
+      }
+    }
+  }
+  if (s.ok() && saved_seqno_ != kMaxSequenceNumber && !found_trace) {
+    // Create an empty trace file so later logic does not need to distinguish
+    // missing vs. empty trace file.
+    std::unique_ptr<WritableFile> wfile;
+    const EnvOptions soptions;
+    std::string saved_seqno_trace_path =
+        GetPathForFilename(std::to_string(saved_seqno_) + kTraceFilenameSuffix);
+    s = Env::Default()->NewWritableFile(saved_seqno_trace_path, &wfile,
+                                        soptions);
+  }
+
+  if (s.ok()) {
+    s = Clean();
+  }
+
+  std::string expected_state_file_path =
+      GetPathForFilename(kLatestBasename + kStateFilenameSuffix);
+  bool found = false;
+  if (s.ok()) {
+    Status exists_status = Env::Default()->FileExists(expected_state_file_path);
+    if (exists_status.ok()) {
+      found = true;
+    } else if (exists_status.IsNotFound()) {
+      found = false;
+    } else {
+      s = exists_status;
+    }
+  }
+
+  if (!found) {
+    // Initialize the file in a temp path and then rename it. That way, in case
+    // this process is killed during setup, `Clean()` will take care of removing
+    // the incomplete expected values file.
+    std::string temp_expected_state_file_path =
+        GetTempPathForFilename(kLatestBasename + kStateFilenameSuffix);
+    FileExpectedState temp_expected_state(temp_expected_state_file_path,
+                                          max_key_, num_column_families_);
+    if (s.ok()) {
+      s = temp_expected_state.Open(true /* create */);
+    }
+    if (s.ok()) {
+      s = Env::Default()->RenameFile(temp_expected_state_file_path,
+                                     expected_state_file_path);
+    }
+  }
+
+  if (s.ok()) {
+    latest_.reset(new FileExpectedState(std::move(expected_state_file_path),
+                                        max_key_, num_column_families_));
+    s = latest_->Open(false /* create */);
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status FileExpectedStateManager::SaveAtAndAfter(DB* db) {
+  SequenceNumber seqno = db->GetLatestSequenceNumber();
+
+  std::string state_filename = std::to_string(seqno) + kStateFilenameSuffix;
+  std::string state_file_temp_path = GetTempPathForFilename(state_filename);
+  std::string state_file_path = GetPathForFilename(state_filename);
+
+  std::string latest_file_path =
+      GetPathForFilename(kLatestBasename + kStateFilenameSuffix);
+
+  std::string trace_filename = std::to_string(seqno) + kTraceFilenameSuffix;
+  std::string trace_file_path = GetPathForFilename(trace_filename);
+
+  // Populate a tempfile and then rename it to atomically create "<seqno>.state"
+  // with contents from "LATEST.state"
+  Status s = CopyFile(FileSystem::Default(), latest_file_path,
+                      state_file_temp_path, 0 /* size */, false /* use_fsync */,
+                      nullptr /* io_tracer */, Temperature::kUnknown);
+  if (s.ok()) {
+    s = FileSystem::Default()->RenameFile(state_file_temp_path, state_file_path,
+                                          IOOptions(), nullptr /* dbg */);
+  }
+  SequenceNumber old_saved_seqno = 0;
+  if (s.ok()) {
+    old_saved_seqno = saved_seqno_;
+    saved_seqno_ = seqno;
+  }
+
+  // If there is a crash now, i.e., after "<seqno>.state" was created but before
+  // "<seqno>.trace" is created, it will be treated as if "<seqno>.trace" were
+  // present but empty.
+
+  // Create "<seqno>.trace" directly. It is initially empty so no need for
+  // tempfile.
+  std::unique_ptr<TraceWriter> trace_writer;
+  if (s.ok()) {
+    EnvOptions soptions;
+    // Disable buffering so traces will not get stuck in application buffer.
+    soptions.writable_file_max_buffer_size = 0;
+    s = NewFileTraceWriter(Env::Default(), soptions, trace_file_path,
+                           &trace_writer);
+  }
+  if (s.ok()) {
+    TraceOptions trace_opts;
+    trace_opts.filter |= kTraceFilterGet;
+    trace_opts.filter |= kTraceFilterMultiGet;
+    trace_opts.filter |= kTraceFilterIteratorSeek;
+    trace_opts.filter |= kTraceFilterIteratorSeekForPrev;
+    trace_opts.preserve_write_order = true;
+    s = db->StartTrace(trace_opts, std::move(trace_writer));
+  }
+
+  // Delete old state/trace files. Deletion order does not matter since we only
+  // delete after successfully saving new files, so old files will never be used
+  // again, even if we crash.
+  if (s.ok() && old_saved_seqno != kMaxSequenceNumber &&
+      old_saved_seqno != saved_seqno_) {
+    s = Env::Default()->DeleteFile(GetPathForFilename(
+        std::to_string(old_saved_seqno) + kStateFilenameSuffix));
+  }
+  if (s.ok() && old_saved_seqno != kMaxSequenceNumber &&
+      old_saved_seqno != saved_seqno_) {
+    s = Env::Default()->DeleteFile(GetPathForFilename(
+        std::to_string(old_saved_seqno) + kTraceFilenameSuffix));
+  }
+  return s;
+}
+#else   // ROCKSDB_LITE
+Status FileExpectedStateManager::SaveAtAndAfter(DB* /* db */) {
+  return Status::NotSupported();
+}
+#endif  // ROCKSDB_LITE
+
+bool FileExpectedStateManager::HasHistory() {
+  return saved_seqno_ != kMaxSequenceNumber;
+}
+
+#ifndef ROCKSDB_LITE
+
+namespace {
+
+// An `ExpectedStateTraceRecordHandler` applies a configurable number of
+// write operation trace records to the configured expected state. It is used in
+// `FileExpectedStateManager::Restore()` to sync the expected state with the
+// DB's post-recovery state.
+class ExpectedStateTraceRecordHandler : public TraceRecord::Handler,
+                                        public WriteBatch::Handler {
+ public:
+  ExpectedStateTraceRecordHandler(uint64_t max_write_ops, ExpectedState* state)
+      : max_write_ops_(max_write_ops),
+        state_(state),
+        buffered_writes_(nullptr) {}
+
+  ~ExpectedStateTraceRecordHandler() { assert(IsDone()); }
+
+  // True if we have already reached the limit on write operations to apply.
+  bool IsDone() { return num_write_ops_ == max_write_ops_; }
+
+  Status Handle(const WriteQueryTraceRecord& record,
+                std::unique_ptr<TraceRecordResult>* /* result */) override {
+    if (IsDone()) {
+      return Status::OK();
+    }
+    WriteBatch batch(record.GetWriteBatchRep().ToString());
+    return batch.Iterate(this);
+  }
+
+  // Ignore reads.
+  Status Handle(const GetQueryTraceRecord& /* record */,
+                std::unique_ptr<TraceRecordResult>* /* result */) override {
+    return Status::OK();
+  }
+
+  // Ignore reads.
+  Status Handle(const IteratorSeekQueryTraceRecord& /* record */,
+                std::unique_ptr<TraceRecordResult>* /* result */) override {
+    return Status::OK();
+  }
+
+  // Ignore reads.
+  Status Handle(const MultiGetQueryTraceRecord& /* record */,
+                std::unique_ptr<TraceRecordResult>* /* result */) override {
+    return Status::OK();
+  }
+
+  // Below are the WriteBatch::Handler overrides. We could use a separate
+  // object, but it's convenient and works to share state with the
+  // `TraceRecord::Handler`.
+
+  Status PutCF(uint32_t column_family_id, const Slice& key_with_ts,
+               const Slice& value) override {
+    Slice key =
+        StripTimestampFromUserKey(key_with_ts, FLAGS_user_timestamp_size);
+    uint64_t key_id;
+    if (!GetIntVal(key.ToString(), &key_id)) {
+      return Status::Corruption("unable to parse key", key.ToString());
+    }
+    uint32_t value_id = GetValueBase(value);
+
+    bool should_buffer_write = !(buffered_writes_ == nullptr);
+    if (should_buffer_write) {
+      return WriteBatchInternal::Put(buffered_writes_.get(), column_family_id,
+                                     key, value);
+    }
+
+    state_->Put(column_family_id, static_cast<int64_t>(key_id), value_id,
+                false /* pending */);
+    ++num_write_ops_;
+    return Status::OK();
+  }
+
+  Status PutEntityCF(uint32_t column_family_id, const Slice& key_with_ts,
+                     const Slice& entity) override {
+    Slice key =
+        StripTimestampFromUserKey(key_with_ts, FLAGS_user_timestamp_size);
+
+    uint64_t key_id = 0;
+    if (!GetIntVal(key.ToString(), &key_id)) {
+      return Status::Corruption("Unable to parse key", key.ToString());
+    }
+
+    Slice entity_copy = entity;
+    WideColumns columns;
+    if (!WideColumnSerialization::Deserialize(entity_copy, columns).ok()) {
+      return Status::Corruption("Unable to deserialize entity",
+                                entity.ToString(/* hex */ true));
+    }
+
+    if (columns.empty() || columns[0].name() != kDefaultWideColumnName) {
+      return Status::Corruption("Cannot find default column in entity",
+                                entity.ToString(/* hex */ true));
+    }
+
+    const Slice& value_of_default = columns[0].value();
+
+    const uint32_t value_base = GetValueBase(value_of_default);
+
+    if (columns != GenerateExpectedWideColumns(value_base, value_of_default)) {
+      return Status::Corruption("Wide columns in entity inconsistent",
+                                entity.ToString(/* hex */ true));
+    }
+
+    if (buffered_writes_) {
+      return WriteBatchInternal::PutEntity(buffered_writes_.get(),
+                                           column_family_id, key, columns);
+    }
+
+    state_->Put(column_family_id, static_cast<int64_t>(key_id), value_base,
+                false /* pending */);
+
+    ++num_write_ops_;
+
+    return Status::OK();
+  }
+
+  Status DeleteCF(uint32_t column_family_id,
+                  const Slice& key_with_ts) override {
+    Slice key =
+        StripTimestampFromUserKey(key_with_ts, FLAGS_user_timestamp_size);
+    uint64_t key_id;
+    if (!GetIntVal(key.ToString(), &key_id)) {
+      return Status::Corruption("unable to parse key", key.ToString());
+    }
+
+    bool should_buffer_write = !(buffered_writes_ == nullptr);
+    if (should_buffer_write) {
+      return WriteBatchInternal::Delete(buffered_writes_.get(),
+                                        column_family_id, key);
+    }
+
+    state_->Delete(column_family_id, static_cast<int64_t>(key_id),
+                   false /* pending */);
+    ++num_write_ops_;
+    return Status::OK();
+  }
+
+  Status SingleDeleteCF(uint32_t column_family_id,
+                        const Slice& key_with_ts) override {
+    bool should_buffer_write = !(buffered_writes_ == nullptr);
+    if (should_buffer_write) {
+      Slice key =
+          StripTimestampFromUserKey(key_with_ts, FLAGS_user_timestamp_size);
+      Slice ts =
+          ExtractTimestampFromUserKey(key_with_ts, FLAGS_user_timestamp_size);
+      std::array<Slice, 2> key_with_ts_arr{{key, ts}};
+      return WriteBatchInternal::SingleDelete(
+          buffered_writes_.get(), column_family_id,
+          SliceParts(key_with_ts_arr.data(), 2));
+    }
+
+    return DeleteCF(column_family_id, key_with_ts);
+  }
+
+  Status DeleteRangeCF(uint32_t column_family_id,
+                       const Slice& begin_key_with_ts,
+                       const Slice& end_key_with_ts) override {
+    Slice begin_key =
+        StripTimestampFromUserKey(begin_key_with_ts, FLAGS_user_timestamp_size);
+    Slice end_key =
+        StripTimestampFromUserKey(end_key_with_ts, FLAGS_user_timestamp_size);
+    uint64_t begin_key_id, end_key_id;
+    if (!GetIntVal(begin_key.ToString(), &begin_key_id)) {
+      return Status::Corruption("unable to parse begin key",
+                                begin_key.ToString());
+    }
+    if (!GetIntVal(end_key.ToString(), &end_key_id)) {
+      return Status::Corruption("unable to parse end key", end_key.ToString());
+    }
+
+    bool should_buffer_write = !(buffered_writes_ == nullptr);
+    if (should_buffer_write) {
+      return WriteBatchInternal::DeleteRange(
+          buffered_writes_.get(), column_family_id, begin_key, end_key);
+    }
+
+    state_->DeleteRange(column_family_id, static_cast<int64_t>(begin_key_id),
+                        static_cast<int64_t>(end_key_id), false /* pending */);
+    ++num_write_ops_;
+    return Status::OK();
+  }
+
+  Status MergeCF(uint32_t column_family_id, const Slice& key_with_ts,
+                 const Slice& value) override {
+    Slice key =
+        StripTimestampFromUserKey(key_with_ts, FLAGS_user_timestamp_size);
+
+    bool should_buffer_write = !(buffered_writes_ == nullptr);
+    if (should_buffer_write) {
+      return WriteBatchInternal::Merge(buffered_writes_.get(), column_family_id,
+                                       key, value);
+    }
+
+    return PutCF(column_family_id, key, value);
+  }
+
+  Status MarkBeginPrepare(bool = false) override {
+    assert(!buffered_writes_);
+    buffered_writes_.reset(new WriteBatch());
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice& xid) override {
+    assert(buffered_writes_);
+    std::string xid_str = xid.ToString();
+    assert(xid_to_buffered_writes_.find(xid_str) ==
+           xid_to_buffered_writes_.end());
+
+    xid_to_buffered_writes_[xid_str].swap(buffered_writes_);
+
+    buffered_writes_.reset();
+
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice& xid) override {
+    std::string xid_str = xid.ToString();
+    assert(xid_to_buffered_writes_.find(xid_str) !=
+           xid_to_buffered_writes_.end());
+    assert(xid_to_buffered_writes_.at(xid_str));
+
+    Status s = xid_to_buffered_writes_.at(xid_str)->Iterate(this);
+    xid_to_buffered_writes_.erase(xid_str);
+
+    return s;
+  }
+
+  Status MarkRollback(const Slice& xid) override {
+    std::string xid_str = xid.ToString();
+    assert(xid_to_buffered_writes_.find(xid_str) !=
+           xid_to_buffered_writes_.end());
+    assert(xid_to_buffered_writes_.at(xid_str));
+    xid_to_buffered_writes_.erase(xid_str);
+
+    return Status::OK();
+  }
+
+ private:
+  uint64_t num_write_ops_ = 0;
+  uint64_t max_write_ops_;
+  ExpectedState* state_;
+  std::unordered_map<std::string, std::unique_ptr<WriteBatch>>
+      xid_to_buffered_writes_;
+  std::unique_ptr<WriteBatch> buffered_writes_;
+};
+
+}  // anonymous namespace
+
+Status FileExpectedStateManager::Restore(DB* db) {
+  assert(HasHistory());
+  SequenceNumber seqno = db->GetLatestSequenceNumber();
+  if (seqno < saved_seqno_) {
+    return Status::Corruption("DB is older than any restorable expected state");
+  }
+
+  std::string state_filename =
+      std::to_string(saved_seqno_) + kStateFilenameSuffix;
+  std::string state_file_path = GetPathForFilename(state_filename);
+
+  std::string latest_file_temp_path =
+      GetTempPathForFilename(kLatestBasename + kStateFilenameSuffix);
+  std::string latest_file_path =
+      GetPathForFilename(kLatestBasename + kStateFilenameSuffix);
+
+  std::string trace_filename =
+      std::to_string(saved_seqno_) + kTraceFilenameSuffix;
+  std::string trace_file_path = GetPathForFilename(trace_filename);
+
+  std::unique_ptr<TraceReader> trace_reader;
+  Status s = NewFileTraceReader(Env::Default(), EnvOptions(), trace_file_path,
+                                &trace_reader);
+
+  if (s.ok()) {
+    // We are going to replay on top of "`seqno`.state" to create a new
+    // "LATEST.state". Start off by creating a tempfile so we can later make the
+    // new "LATEST.state" appear atomically using `RenameFile()`.
+    s = CopyFile(FileSystem::Default(), state_file_path, latest_file_temp_path,
+                 0 /* size */, false /* use_fsync */, nullptr /* io_tracer */,
+                 Temperature::kUnknown);
+  }
+
+  {
+    std::unique_ptr<Replayer> replayer;
+    std::unique_ptr<ExpectedState> state;
+    std::unique_ptr<ExpectedStateTraceRecordHandler> handler;
+    if (s.ok()) {
+      state.reset(new FileExpectedState(latest_file_temp_path, max_key_,
+                                        num_column_families_));
+      s = state->Open(false /* create */);
+    }
+    if (s.ok()) {
+      handler.reset(new ExpectedStateTraceRecordHandler(seqno - saved_seqno_,
+                                                        state.get()));
+      // TODO(ajkr): An API limitation requires we provide `handles` although
+      // they will be unused since we only use the replayer for reading records.
+      // Just give a default CFH for now to satisfy the requirement.
+      s = db->NewDefaultReplayer({db->DefaultColumnFamily()} /* handles */,
+                                 std::move(trace_reader), &replayer);
+    }
+
+    if (s.ok()) {
+      s = replayer->Prepare();
+    }
+    for (;;) {
+      std::unique_ptr<TraceRecord> record;
+      s = replayer->Next(&record);
+      if (!s.ok()) {
+        break;
+      }
+      std::unique_ptr<TraceRecordResult> res;
+      record->Accept(handler.get(), &res);
+    }
+    if (s.IsCorruption() && handler->IsDone()) {
+      // There could be a corruption reading the tail record of the trace due to
+      // `db_stress` crashing while writing it. It shouldn't matter as long as
+      // we already found all the write ops we need to catch up the expected
+      // state.
+      s = Status::OK();
+    }
+    if (s.IsIncomplete()) {
+      // OK because `Status::Incomplete` is expected upon finishing all the
+      // trace records.
+      s = Status::OK();
+    }
+  }
+
+  if (s.ok()) {
+    s = FileSystem::Default()->RenameFile(latest_file_temp_path,
+                                          latest_file_path, IOOptions(),
+                                          nullptr /* dbg */);
+  }
+  if (s.ok()) {
+    latest_.reset(new FileExpectedState(latest_file_path, max_key_,
+                                        num_column_families_));
+    s = latest_->Open(false /* create */);
+  }
+
+  // Delete old state/trace files. We must delete the state file first.
+  // Otherwise, a crash-recovery immediately after deleting the trace file could
+  // lead to `Restore()` unable to replay to `seqno`.
+  if (s.ok()) {
+    s = Env::Default()->DeleteFile(state_file_path);
+  }
+  if (s.ok()) {
+    saved_seqno_ = kMaxSequenceNumber;
+    s = Env::Default()->DeleteFile(trace_file_path);
+  }
+  return s;
+}
+#else   // ROCKSDB_LITE
+Status FileExpectedStateManager::Restore(DB* /* db */) {
+  return Status::NotSupported();
+}
+#endif  // ROCKSDB_LITE
+
+Status FileExpectedStateManager::Clean() {
+  std::vector<std::string> expected_state_dir_children;
+  Status s = Env::Default()->GetChildren(expected_state_dir_path_,
+                                         &expected_state_dir_children);
+  // An incomplete `Open()` or incomplete `SaveAtAndAfter()` could have left
+  // behind invalid temporary files. An incomplete `SaveAtAndAfter()` could have
+  // also left behind stale state/trace files. An incomplete `Restore()` could
+  // have left behind stale trace files.
+  for (size_t i = 0; s.ok() && i < expected_state_dir_children.size(); ++i) {
+    const auto& filename = expected_state_dir_children[i];
+    if (filename.rfind(kTempFilenamePrefix, 0 /* pos */) == 0 &&
+        filename.size() >= kTempFilenameSuffix.size() &&
+        filename.rfind(kTempFilenameSuffix) ==
+            filename.size() - kTempFilenameSuffix.size()) {
+      // Delete all temp files.
+      s = Env::Default()->DeleteFile(GetPathForFilename(filename));
+    } else if (filename.size() >= kStateFilenameSuffix.size() &&
+               filename.rfind(kStateFilenameSuffix) ==
+                   filename.size() - kStateFilenameSuffix.size() &&
+               filename.rfind(kLatestBasename, 0) == std::string::npos &&
+               ParseUint64(filename.substr(
+                   0, filename.size() - kStateFilenameSuffix.size())) <
+                   saved_seqno_) {
+      assert(saved_seqno_ != kMaxSequenceNumber);
+      // Delete stale state files.
+      s = Env::Default()->DeleteFile(GetPathForFilename(filename));
+    } else if (filename.size() >= kTraceFilenameSuffix.size() &&
+               filename.rfind(kTraceFilenameSuffix) ==
+                   filename.size() - kTraceFilenameSuffix.size() &&
+               ParseUint64(filename.substr(
+                   0, filename.size() - kTraceFilenameSuffix.size())) <
+                   saved_seqno_) {
+      // Delete stale trace files.
+      s = Env::Default()->DeleteFile(GetPathForFilename(filename));
+    }
+  }
+  return s;
+}
+
+std::string FileExpectedStateManager::GetTempPathForFilename(
+    const std::string& filename) {
+  assert(!expected_state_dir_path_.empty());
+  std::string expected_state_dir_path_slash =
+      expected_state_dir_path_.back() == '/' ? expected_state_dir_path_
+                                             : expected_state_dir_path_ + "/";
+  return expected_state_dir_path_slash + kTempFilenamePrefix + filename +
+         kTempFilenameSuffix;
+}
+
+std::string FileExpectedStateManager::GetPathForFilename(
+    const std::string& filename) {
+  assert(!expected_state_dir_path_.empty());
+  std::string expected_state_dir_path_slash =
+      expected_state_dir_path_.back() == '/' ? expected_state_dir_path_
+                                             : expected_state_dir_path_ + "/";
+  return expected_state_dir_path_slash + filename;
+}
+
+AnonExpectedStateManager::AnonExpectedStateManager(size_t max_key,
+                                                   size_t num_column_families)
+    : ExpectedStateManager(max_key, num_column_families) {}
+
+Status AnonExpectedStateManager::Open() {
+  latest_.reset(new AnonExpectedState(max_key_, num_column_families_));
+  return latest_->Open(true /* create */);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/expected_state.h b/src/rocksdb/db_stress_tool/expected_state.h
new file mode 100644
index 000000000..41d747e76
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/expected_state.h
@@ -0,0 +1,287 @@
+//  Copyright (c) 2021-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef GFLAGS
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <memory>
+
+#include "db/dbformat.h"
+#include "file/file_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An `ExpectedState` provides read/write access to expected values for every
+// key.
+class ExpectedState {
+ public:
+  explicit ExpectedState(size_t max_key, size_t num_column_families);
+
+  virtual ~ExpectedState() {}
+
+  // Requires external locking preventing concurrent execution with any other
+  // member function.
+  virtual Status Open(bool create) = 0;
+
+  // Requires external locking covering all keys in `cf`.
+  void ClearColumnFamily(int cf);
+
+  // @param pending True if the update may have started but is not yet
+  //    guaranteed finished. This is useful for crash-recovery testing when the
+  //    process may crash before updating the expected values array.
+  //
+  // Requires external locking covering `key` in `cf`.
+  void Put(int cf, int64_t key, uint32_t value_base, bool pending);
+
+  // Requires external locking covering `key` in `cf`.
+  uint32_t Get(int cf, int64_t key) const;
+
+  // @param pending See comment above Put()
+  // Returns true if the key was not yet deleted.
+  //
+  // Requires external locking covering `key` in `cf`.
+  bool Delete(int cf, int64_t key, bool pending);
+
+  // @param pending See comment above Put()
+  // Returns true if the key was not yet deleted.
+  //
+  // Requires external locking covering `key` in `cf`.
+  bool SingleDelete(int cf, int64_t key, bool pending);
+
+  // @param pending See comment above Put()
+  // Returns number of keys deleted by the call.
+  //
+  // Requires external locking covering keys in `[begin_key, end_key)` in `cf`.
+  int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending);
+
+  // Requires external locking covering `key` in `cf`.
+  bool Exists(int cf, int64_t key);
+
+ private:
+  // Requires external locking covering `key` in `cf`.
+  std::atomic<uint32_t>& Value(int cf, int64_t key) const {
+    return values_[cf * max_key_ + key];
+  }
+
+  const size_t max_key_;
+  const size_t num_column_families_;
+
+ protected:
+  size_t GetValuesLen() const {
+    return sizeof(std::atomic<uint32_t>) * num_column_families_ * max_key_;
+  }
+
+  // Requires external locking preventing concurrent execution with any other
+  // member function.
+  void Reset();
+
+  std::atomic<uint32_t>* values_;
+};
+
+// A `FileExpectedState` implements `ExpectedState` backed by a file.
+class FileExpectedState : public ExpectedState {
+ public:
+  explicit FileExpectedState(std::string expected_state_file_path,
+                             size_t max_key, size_t num_column_families);
+
+  // Requires external locking preventing concurrent execution with any other
+  // member function.
+  Status Open(bool create) override;
+
+ private:
+  const std::string expected_state_file_path_;
+  std::unique_ptr<MemoryMappedFileBuffer> expected_state_mmap_buffer_;
+};
+
+// An `AnonExpectedState` implements `ExpectedState` backed by a memory
+// allocation.
+class AnonExpectedState : public ExpectedState {
+ public:
+  explicit AnonExpectedState(size_t max_key, size_t num_column_families);
+
+  // Requires external locking preventing concurrent execution with any other
+  // member function.
+  Status Open(bool create) override;
+
+ private:
+  std::unique_ptr<std::atomic<uint32_t>[]> values_allocation_;
+};
+
+// An `ExpectedStateManager` manages data about the expected state of the
+// database. It exposes operations for reading and modifying the latest
+// expected state.
+class ExpectedStateManager {
+ public:
+  explicit ExpectedStateManager(size_t max_key, size_t num_column_families);
+
+  virtual ~ExpectedStateManager();
+
+  // Requires external locking preventing concurrent execution with any other
+  // member function.
+  virtual Status Open() = 0;
+
+  // Saves expected values for the current state of `db` and begins tracking
+  // changes. Following a successful `SaveAtAndAfter()`, `Restore()` can be
+  // called on the same DB, as long as its state does not roll back to before
+  // its current state.
+  //
+  // Requires external locking preventing concurrent execution with any other
+  // member function. Furthermore, `db` must not be mutated while this function
+  // is executing.
+  virtual Status SaveAtAndAfter(DB* db) = 0;
+
+  // Returns true if at least one state of historical expected values can be
+  // restored.
+  //
+  // Requires external locking preventing concurrent execution with any other
+  // member function.
+  virtual bool HasHistory() = 0;
+
+  // Restores expected values according to the current state of `db`. See
+  // `SaveAtAndAfter()` for conditions where this can be called.
+  //
+  // Requires external locking preventing concurrent execution with any other
+  // member function. Furthermore, `db` must not be mutated while this function
+  // is executing.
+  virtual Status Restore(DB* db) = 0;
+
+  // Requires external locking covering all keys in `cf`.
+  void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); }
+
+  // @param pending True if the update may have started but is not yet
+  //    guaranteed finished. This is useful for crash-recovery testing when the
+  //    process may crash before updating the expected values array.
+  //
+  // Requires external locking covering `key` in `cf`.
+  void Put(int cf, int64_t key, uint32_t value_base, bool pending) {
+    return latest_->Put(cf, key, value_base, pending);
+  }
+
+  // Requires external locking covering `key` in `cf`.
+  uint32_t Get(int cf, int64_t key) const { return latest_->Get(cf, key); }
+
+  // @param pending See comment above Put()
+  // Returns true if the key was not yet deleted.
+  //
+  // Requires external locking covering `key` in `cf`.
+  bool Delete(int cf, int64_t key, bool pending) {
+    return latest_->Delete(cf, key, pending);
+  }
+
+  // @param pending See comment above Put()
+  // Returns true if the key was not yet deleted.
+  //
+  // Requires external locking covering `key` in `cf`.
+  bool SingleDelete(int cf, int64_t key, bool pending) {
+    return latest_->SingleDelete(cf, key, pending);
+  }
+
+  // @param pending See comment above Put()
+  // Returns number of keys deleted by the call.
+  //
+  // Requires external locking covering keys in `[begin_key, end_key)` in `cf`.
+  int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending) {
+    return latest_->DeleteRange(cf, begin_key, end_key, pending);
+  }
+
+  // Requires external locking covering `key` in `cf`.
+  bool Exists(int cf, int64_t key) { return latest_->Exists(cf, key); }
+
+ protected:
+  const size_t max_key_;
+  const size_t num_column_families_;
+  std::unique_ptr<ExpectedState> latest_;
+};
+
+// A `FileExpectedStateManager` implements an `ExpectedStateManager` backed by
+// a directory of files containing data about the expected state of the
+// database.
+class FileExpectedStateManager : public ExpectedStateManager {
+ public:
+  explicit FileExpectedStateManager(size_t max_key, size_t num_column_families,
+                                    std::string expected_state_dir_path);
+
+  // Requires external locking preventing concurrent execution with any other
+  // member function.
+  Status Open() override;
+
+  // See `ExpectedStateManager::SaveAtAndAfter()` API doc.
+  //
+  // This implementation makes a copy of "LATEST.state" into
+  // "<current seqno>.state", and starts a trace in "<current seqno>.trace".
+  // Due to using external files, a following `Restore()` can happen even
+  // from a different process.
+  Status SaveAtAndAfter(DB* db) override;
+
+  // See `ExpectedStateManager::HasHistory()` API doc.
+  bool HasHistory() override;
+
+  // See `ExpectedStateManager::Restore()` API doc.
+  //
+  // Say `db->GetLatestSequenceNumber()` was `a` last time `SaveAtAndAfter()`
+  // was called and now it is `b`. Then this function replays `b - a` write
+  // operations from "`a`.trace" onto "`a`.state", and then copies the resulting
+  // file into "LATEST.state".
+  Status Restore(DB* db) override;
+
+ private:
+  // Requires external locking preventing concurrent execution with any other
+  // member function.
+  Status Clean();
+
+  std::string GetTempPathForFilename(const std::string& filename);
+  std::string GetPathForFilename(const std::string& filename);
+
+  static const std::string kLatestBasename;
+  static const std::string kStateFilenameSuffix;
+  static const std::string kTraceFilenameSuffix;
+  static const std::string kTempFilenamePrefix;
+  static const std::string kTempFilenameSuffix;
+
+  const std::string expected_state_dir_path_;
+  SequenceNumber saved_seqno_ = kMaxSequenceNumber;
+};
+
+// An `AnonExpectedStateManager` implements an `ExpectedStateManager` backed by
+// a memory allocation containing data about the expected state of the database.
+class AnonExpectedStateManager : public ExpectedStateManager {
+ public:
+  explicit AnonExpectedStateManager(size_t max_key, size_t num_column_families);
+
+  // See `ExpectedStateManager::SaveAtAndAfter()` API doc.
+  //
+  // This implementation returns `Status::NotSupported` since we do not
+  // currently have a need to keep history of expected state within a process.
+  Status SaveAtAndAfter(DB* /* db */) override {
+    return Status::NotSupported();
+  }
+
+  // See `ExpectedStateManager::HasHistory()` API doc.
+  bool HasHistory() override { return false; }
+
+  // See `ExpectedStateManager::Restore()` API doc.
+  //
+  // This implementation returns `Status::NotSupported` since we do not
+  // currently have a need to keep history of expected state within a process.
+  Status Restore(DB* /* db */) override { return Status::NotSupported(); }
+
+  // Requires external locking preventing concurrent execution with any other
+  // member function.
+  Status Open() override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/multi_ops_txns_stress.cc b/src/rocksdb/db_stress_tool/multi_ops_txns_stress.cc
new file mode 100644
index 000000000..7db5e8942
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/multi_ops_txns_stress.cc
@@ -0,0 +1,1808 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#include "db_stress_tool/multi_ops_txns_stress.h"
+
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/defer.h"
+#include "utilities/fault_injection_fs.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The description of A and C can be found in multi_ops_txns_stress.h
+DEFINE_int32(lb_a, 0, "(Inclusive) lower bound of A");
+DEFINE_int32(ub_a, 1000, "(Exclusive) upper bound of A");
+DEFINE_int32(lb_c, 0, "(Inclusive) lower bound of C");
+DEFINE_int32(ub_c, 1000, "(Exclusive) upper bound of C");
+
+DEFINE_string(key_spaces_path, "",
+              "Path to file describing the lower and upper bounds of A and C");
+
+DEFINE_int32(delay_snapshot_read_one_in, 0,
+             "With a chance of 1/N, inject a random delay between taking "
+             "snapshot and read.");
+
+DEFINE_int32(rollback_one_in, 0,
+             "If non-zero, rollback non-read-only transactions with a "
+             "probability of 1/N.");
+
+DEFINE_int32(clear_wp_commit_cache_one_in, 0,
+             "If non-zero, evict all commit entries from commit cache with a "
+             "probability of 1/N. This options applies to write-prepared and "
+             "write-unprepared transactions.");
+
+extern "C" bool rocksdb_write_prepared_TEST_ShouldClearCommitCache(void) {
+  static Random rand(static_cast<uint32_t>(db_stress_env->NowMicros()));
+  return FLAGS_clear_wp_commit_cache_one_in > 0 &&
+         rand.OneIn(FLAGS_clear_wp_commit_cache_one_in);
+}
+
+// MultiOpsTxnsStressTest can either operate on a database with pre-populated
+// data (possibly from previous ones), or create a new db and preload it with
+// data specified via `-lb_a`, `-ub_a`, `-lb_c`, `-ub_c`, etc. Among these, we
+// define the test key spaces as two key ranges: [lb_a, ub_a) and [lb_c, ub_c).
+// The key spaces specification is persisted in a file whose absolute path can
+// be specified via `-key_spaces_path`.
+//
+// Whether an existing db is used or a new one is created, key_spaces_path will
+// be used. In the former case, the test reads the key spaces specification
+// from `-key_spaces_path` and decodes [lb_a, ub_a) and [lb_c, ub_c). In the
+// latter case, the test writes a key spaces specification to a file at the
+// location, and this file will be used by future runs until a new db is
+// created.
+//
+// Create a fresh new database (-destroy_db_initially=1 or there is no database
+// in the location specified by -db). See PreloadDb().
+//
+// Use an existing, non-empty database. See ScanExistingDb().
+//
+// This test is multi-threaded, and thread count can be specified via
+// `-threads`. For simplicity, we partition the key ranges and each thread
+// operates on a subrange independently.
+// Within each subrange, a KeyGenerator object is responsible for key
+// generation. A KeyGenerator maintains two sets: set of existing keys within
+// [low, high), set of non-existing keys within [low, high). [low, high) is the
+// subrange. The test initialization makes sure there is at least one
+// non-existing key, otherwise the test will return an error and exit before
+// any test thread is spawned.
+
+void MultiOpsTxnsStressTest::KeyGenerator::FinishInit() {
+  assert(existing_.empty());
+  assert(!existing_uniq_.empty());
+  assert(low_ < high_);
+  for (auto v : existing_uniq_) {
+    assert(low_ <= v);
+    assert(high_ > v);
+    existing_.push_back(v);
+  }
+  if (non_existing_uniq_.empty()) {
+    fprintf(
+        stderr,
+        "Cannot allocate key in [%u, %u)\nStart with a new DB or try change "
+        "the number of threads for testing via -threads=<#threads>\n",
+        static_cast<unsigned int>(low_), static_cast<unsigned int>(high_));
+    fflush(stdout);
+    fflush(stderr);
+    assert(false);
+  }
+  initialized_ = true;
+}
+
+std::pair<uint32_t, uint32_t>
+MultiOpsTxnsStressTest::KeyGenerator::ChooseExisting() {
+  assert(initialized_);
+  const size_t N = existing_.size();
+  assert(N > 0);
+  uint32_t rnd = rand_.Uniform(static_cast<int>(N));
+  assert(rnd < N);
+  return std::make_pair(existing_[rnd], rnd);
+}
+
+uint32_t MultiOpsTxnsStressTest::KeyGenerator::Allocate() {
+  assert(initialized_);
+  auto it = non_existing_uniq_.begin();
+  assert(non_existing_uniq_.end() != it);
+  uint32_t ret = *it;
+  // Remove this element from non_existing_.
+  // Need to call UndoAllocation() if the calling transaction does not commit.
+  non_existing_uniq_.erase(it);
+  return ret;
+}
+
+void MultiOpsTxnsStressTest::KeyGenerator::Replace(uint32_t old_val,
+                                                   uint32_t old_pos,
+                                                   uint32_t new_val) {
+  assert(initialized_);
+  {
+    auto it = existing_uniq_.find(old_val);
+    assert(it != existing_uniq_.end());
+    existing_uniq_.erase(it);
+  }
+
+  {
+    assert(0 == existing_uniq_.count(new_val));
+    existing_uniq_.insert(new_val);
+    existing_[old_pos] = new_val;
+  }
+
+  {
+    assert(0 == non_existing_uniq_.count(old_val));
+    non_existing_uniq_.insert(old_val);
+  }
+}
+
+void MultiOpsTxnsStressTest::KeyGenerator::UndoAllocation(uint32_t new_val) {
+  assert(initialized_);
+  assert(0 == non_existing_uniq_.count(new_val));
+  non_existing_uniq_.insert(new_val);
+}
+
+std::string MultiOpsTxnsStressTest::Record::EncodePrimaryKey(uint32_t a) {
+  std::string ret;
+  PutFixed32(&ret, kPrimaryIndexId);
+  PutFixed32(&ret, a);
+
+  char* const buf = &ret[0];
+  std::reverse(buf, buf + sizeof(kPrimaryIndexId));
+  std::reverse(buf + sizeof(kPrimaryIndexId),
+               buf + sizeof(kPrimaryIndexId) + sizeof(a));
+  return ret;
+}
+
+std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey(uint32_t c) {
+  std::string ret;
+  PutFixed32(&ret, kSecondaryIndexId);
+  PutFixed32(&ret, c);
+
+  char* const buf = &ret[0];
+  std::reverse(buf, buf + sizeof(kSecondaryIndexId));
+  std::reverse(buf + sizeof(kSecondaryIndexId),
+               buf + sizeof(kSecondaryIndexId) + sizeof(c));
+  return ret;
+}
+
+std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey(uint32_t c,
+                                                               uint32_t a) {
+  std::string ret;
+  PutFixed32(&ret, kSecondaryIndexId);
+  PutFixed32(&ret, c);
+  PutFixed32(&ret, a);
+
+  char* const buf = &ret[0];
+  std::reverse(buf, buf + sizeof(kSecondaryIndexId));
+  std::reverse(buf + sizeof(kSecondaryIndexId),
+               buf + sizeof(kSecondaryIndexId) + sizeof(c));
+  std::reverse(buf + sizeof(kSecondaryIndexId) + sizeof(c),
+               buf + sizeof(kSecondaryIndexId) + sizeof(c) + sizeof(a));
+  return ret;
+}
+
+std::tuple<Status, uint32_t, uint32_t>
+MultiOpsTxnsStressTest::Record::DecodePrimaryIndexValue(
+    Slice primary_index_value) {
+  if (primary_index_value.size() != 8) {
+    return std::tuple<Status, uint32_t, uint32_t>{Status::Corruption(""), 0, 0};
+  }
+  uint32_t b = 0;
+  uint32_t c = 0;
+  if (!GetFixed32(&primary_index_value, &b) ||
+      !GetFixed32(&primary_index_value, &c)) {
+    assert(false);
+    return std::tuple<Status, uint32_t, uint32_t>{Status::Corruption(""), 0, 0};
+  }
+  return std::tuple<Status, uint32_t, uint32_t>{Status::OK(), b, c};
+}
+
+std::pair<Status, uint32_t>
+MultiOpsTxnsStressTest::Record::DecodeSecondaryIndexValue(
+    Slice secondary_index_value) {
+  if (secondary_index_value.size() != 4) {
+    return std::make_pair(Status::Corruption(""), 0);
+  }
+  uint32_t crc = 0;
+  bool result __attribute__((unused)) =
+      GetFixed32(&secondary_index_value, &crc);
+  assert(result);
+  return std::make_pair(Status::OK(), crc);
+}
+
+std::pair<std::string, std::string>
+MultiOpsTxnsStressTest::Record::EncodePrimaryIndexEntry() const {
+  std::string primary_index_key = EncodePrimaryKey();
+  std::string primary_index_value = EncodePrimaryIndexValue();
+  return std::make_pair(primary_index_key, primary_index_value);
+}
+
+std::string MultiOpsTxnsStressTest::Record::EncodePrimaryKey() const {
+  return EncodePrimaryKey(a_);
+}
+
+std::string MultiOpsTxnsStressTest::Record::EncodePrimaryIndexValue() const {
+  std::string ret;
+  PutFixed32(&ret, b_);
+  PutFixed32(&ret, c_);
+  return ret;
+}
+
+std::pair<std::string, std::string>
+MultiOpsTxnsStressTest::Record::EncodeSecondaryIndexEntry() const {
+  std::string secondary_index_key = EncodeSecondaryKey(c_, a_);
+
+  // Secondary index value is always 4-byte crc32 of the secondary key
+  std::string secondary_index_value;
+  uint32_t crc =
+      crc32c::Value(secondary_index_key.data(), secondary_index_key.size());
+  PutFixed32(&secondary_index_value, crc);
+  return std::make_pair(std::move(secondary_index_key), secondary_index_value);
+}
+
+std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey() const {
+  return EncodeSecondaryKey(c_, a_);
+}
+
+Status MultiOpsTxnsStressTest::Record::DecodePrimaryIndexEntry(
+    Slice primary_index_key, Slice primary_index_value) {
+  if (primary_index_key.size() != 8) {
+    assert(false);
+    return Status::Corruption("Primary index key length is not 8");
+  }
+
+  uint32_t index_id = 0;
+
+  [[maybe_unused]] bool res = GetFixed32(&primary_index_key, &index_id);
+  assert(res);
+  index_id = EndianSwapValue(index_id);
+
+  if (index_id != kPrimaryIndexId) {
+    std::ostringstream oss;
+    oss << "Unexpected primary index id: " << index_id;
+    return Status::Corruption(oss.str());
+  }
+
+  res = GetFixed32(&primary_index_key, &a_);
+  assert(res);
+  a_ = EndianSwapValue(a_);
+  assert(primary_index_key.empty());
+
+  if (primary_index_value.size() != 8) {
+    return Status::Corruption("Primary index value length is not 8");
+  }
+  GetFixed32(&primary_index_value, &b_);
+  GetFixed32(&primary_index_value, &c_);
+  return Status::OK();
+}
+
+Status MultiOpsTxnsStressTest::Record::DecodeSecondaryIndexEntry(
+    Slice secondary_index_key, Slice secondary_index_value) {
+  if (secondary_index_key.size() != 12) {
+    return Status::Corruption("Secondary index key length is not 12");
+  }
+  uint32_t crc =
+      crc32c::Value(secondary_index_key.data(), secondary_index_key.size());
+
+  uint32_t index_id = 0;
+
+  [[maybe_unused]] bool res = GetFixed32(&secondary_index_key, &index_id);
+  assert(res);
+  index_id = EndianSwapValue(index_id);
+
+  if (index_id != kSecondaryIndexId) {
+    std::ostringstream oss;
+    oss << "Unexpected secondary index id: " << index_id;
+    return Status::Corruption(oss.str());
+  }
+
+  assert(secondary_index_key.size() == 8);
+  res = GetFixed32(&secondary_index_key, &c_);
+  assert(res);
+  c_ = EndianSwapValue(c_);
+
+  assert(secondary_index_key.size() == 4);
+  res = GetFixed32(&secondary_index_key, &a_);
+  assert(res);
+  a_ = EndianSwapValue(a_);
+  assert(secondary_index_key.empty());
+
+  if (secondary_index_value.size() != 4) {
+    return Status::Corruption("Secondary index value length is not 4");
+  }
+  uint32_t val = 0;
+  GetFixed32(&secondary_index_value, &val);
+  if (val != crc) {
+    std::ostringstream oss;
+    oss << "Secondary index key checksum mismatch, stored: " << val
+        << ", recomputed: " << crc;
+    return Status::Corruption(oss.str());
+  }
+  return Status::OK();
+}
+
+void MultiOpsTxnsStressTest::FinishInitDb(SharedState* shared) {
+  if (FLAGS_enable_compaction_filter) {
+    // TODO (yanqin) enable compaction filter
+  }
+#ifndef ROCKSDB_LITE
+  ProcessRecoveredPreparedTxns(shared);
+#endif
+
+  ReopenAndPreloadDbIfNeeded(shared);
+  // TODO (yanqin) parallelize if key space is large
+  for (auto& key_gen : key_gen_for_a_) {
+    assert(key_gen);
+    key_gen->FinishInit();
+  }
+  // TODO (yanqin) parallelize if key space is large
+  for (auto& key_gen : key_gen_for_c_) {
+    assert(key_gen);
+    key_gen->FinishInit();
+  }
+}
+
+void MultiOpsTxnsStressTest::ReopenAndPreloadDbIfNeeded(SharedState* shared) {
+  (void)shared;
+#ifndef ROCKSDB_LITE
+  bool db_empty = false;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->SeekToFirst();
+    if (!iter->Valid()) {
+      db_empty = true;
+    }
+  }
+
+  if (db_empty) {
+    PreloadDb(shared, FLAGS_threads, FLAGS_lb_a, FLAGS_ub_a, FLAGS_lb_c,
+              FLAGS_ub_c);
+  } else {
+    fprintf(stdout,
+            "Key ranges will be read from %s.\n-lb_a, -ub_a, -lb_c, -ub_c will "
+            "be ignored\n",
+            FLAGS_key_spaces_path.c_str());
+    fflush(stdout);
+    ScanExistingDb(shared, FLAGS_threads);
+  }
+#endif  // !ROCKSDB_LITE
+}
+
+// Used for point-lookup transaction
+Status MultiOpsTxnsStressTest::TestGet(
+    ThreadState* thread, const ReadOptions& read_opts,
+    const std::vector<int>& /*rand_column_families*/,
+    const std::vector<int64_t>& /*rand_keys*/) {
+  uint32_t a = 0;
+  uint32_t pos = 0;
+  std::tie(a, pos) = ChooseExistingA(thread);
+  return PointLookupTxn(thread, read_opts, a);
+}
+
+// Not used.
+std::vector<Status> MultiOpsTxnsStressTest::TestMultiGet(
+    ThreadState* /*thread*/, const ReadOptions& /*read_opts*/,
+    const std::vector<int>& /*rand_column_families*/,
+    const std::vector<int64_t>& /*rand_keys*/) {
+  return std::vector<Status>{Status::NotSupported()};
+}
+
+Status MultiOpsTxnsStressTest::TestPrefixScan(
+    ThreadState* thread, const ReadOptions& read_opts,
+    const std::vector<int>& rand_column_families,
+    const std::vector<int64_t>& rand_keys) {
+  (void)thread;
+  (void)read_opts;
+  (void)rand_column_families;
+  (void)rand_keys;
+  return Status::OK();
+}
+
+// Given a key K, this creates an iterator which scans to K and then
+// does a random sequence of Next/Prev operations.
+Status MultiOpsTxnsStressTest::TestIterate(
+    ThreadState* thread, const ReadOptions& read_opts,
+    const std::vector<int>& /*rand_column_families*/,
+    const std::vector<int64_t>& /*rand_keys*/) {
+  uint32_t c = 0;
+  uint32_t pos = 0;
+  std::tie(c, pos) = ChooseExistingC(thread);
+  return RangeScanTxn(thread, read_opts, c);
+}
+
+// Not intended for use.
+Status MultiOpsTxnsStressTest::TestPut(ThreadState* /*thread*/,
+                                       WriteOptions& /*write_opts*/,
+                                       const ReadOptions& /*read_opts*/,
+                                       const std::vector<int>& /*cf_ids*/,
+                                       const std::vector<int64_t>& /*keys*/,
+                                       char (&value)[100]) {
+  (void)value;
+  return Status::NotSupported();
+}
+
+// Not intended for use.
+Status MultiOpsTxnsStressTest::TestDelete(
+    ThreadState* /*thread*/, WriteOptions& /*write_opts*/,
+    const std::vector<int>& /*rand_column_families*/,
+    const std::vector<int64_t>& /*rand_keys*/) {
+  return Status::NotSupported();
+}
+
+// Not intended for use.
+Status MultiOpsTxnsStressTest::TestDeleteRange(
+    ThreadState* /*thread*/, WriteOptions& /*write_opts*/,
+    const std::vector<int>& /*rand_column_families*/,
+    const std::vector<int64_t>& /*rand_keys*/) {
+  return Status::NotSupported();
+}
+
+void MultiOpsTxnsStressTest::TestIngestExternalFile(
+    ThreadState* thread, const std::vector<int>& rand_column_families,
+    const std::vector<int64_t>& /*rand_keys*/) {
+  // TODO (yanqin)
+  (void)thread;
+  (void)rand_column_families;
+}
+
+void MultiOpsTxnsStressTest::TestCompactRange(
+    ThreadState* thread, int64_t /*rand_key*/, const Slice& /*start_key*/,
+    ColumnFamilyHandle* column_family) {
+  // TODO (yanqin).
+  // May use GetRangeHash() for validation before and after DB::CompactRange()
+  // completes.
+  (void)thread;
+  (void)column_family;
+}
+
+Status MultiOpsTxnsStressTest::TestBackupRestore(
+    ThreadState* thread, const std::vector<int>& rand_column_families,
+    const std::vector<int64_t>& /*rand_keys*/) {
+  // TODO (yanqin)
+  (void)thread;
+  (void)rand_column_families;
+  return Status::OK();
+}
+
+Status MultiOpsTxnsStressTest::TestCheckpoint(
+    ThreadState* thread, const std::vector<int>& rand_column_families,
+    const std::vector<int64_t>& /*rand_keys*/) {
+  // TODO (yanqin)
+  (void)thread;
+  (void)rand_column_families;
+  return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+Status MultiOpsTxnsStressTest::TestApproximateSize(
+    ThreadState* thread, uint64_t iteration,
+    const std::vector<int>& rand_column_families,
+    const std::vector<int64_t>& /*rand_keys*/) {
+  // TODO (yanqin)
+  (void)thread;
+  (void)iteration;
+  (void)rand_column_families;
+  return Status::OK();
+}
+#endif  // !ROCKSDB_LITE
+
+Status MultiOpsTxnsStressTest::TestCustomOperations(
+    ThreadState* thread, const std::vector<int>& rand_column_families) {
+  (void)rand_column_families;
+  // Randomly choose from 0, 1, and 2.
+  // TODO (yanqin) allow user to configure probability of each operation.
+  uint32_t rand = thread->rand.Uniform(3);
+  Status s;
+  if (0 == rand) {
+    // Update primary key.
+    uint32_t old_a = 0;
+    uint32_t pos = 0;
+    std::tie(old_a, pos) = ChooseExistingA(thread);
+    uint32_t new_a = GenerateNextA(thread);
+    s = PrimaryKeyUpdateTxn(thread, old_a, pos, new_a);
+  } else if (1 == rand) {
+    // Update secondary key.
+    uint32_t old_c = 0;
+    uint32_t pos = 0;
+    std::tie(old_c, pos) = ChooseExistingC(thread);
+    uint32_t new_c = GenerateNextC(thread);
+    s = SecondaryKeyUpdateTxn(thread, old_c, pos, new_c);
+  } else if (2 == rand) {
+    // Update primary index value.
+    uint32_t a = 0;
+    uint32_t pos = 0;
+    std::tie(a, pos) = ChooseExistingA(thread);
+    s = UpdatePrimaryIndexValueTxn(thread, a, /*b_delta=*/1);
+  } else {
+    // Should never reach here.
+    assert(false);
+  }
+
+  return s;
+}
+
+void MultiOpsTxnsStressTest::RegisterAdditionalListeners() {
+  options_.listeners.emplace_back(new MultiOpsTxnsStressListener(this));
+}
+
+#ifndef ROCKSDB_LITE
+void MultiOpsTxnsStressTest::PrepareTxnDbOptions(
+    SharedState* /*shared*/, TransactionDBOptions& txn_db_opts) {
+  // MultiOpsTxnStressTest uses SingleDelete to delete secondary keys, thus we
+  // register this callback to let TxnDb know that when rolling back
+  // a transaction, use only SingleDelete to cancel prior Put from the same
+  // transaction if applicable.
+  txn_db_opts.rollback_deletion_type_callback =
+      [](TransactionDB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+         const Slice& key) {
+        Slice ks = key;
+        uint32_t index_id = 0;
+        [[maybe_unused]] bool res = GetFixed32(&ks, &index_id);
+        assert(res);
+        index_id = EndianSwapValue(index_id);
+        assert(index_id <= Record::kSecondaryIndexId);
+        return index_id == Record::kSecondaryIndexId;
+      };
+}
+#endif  // !ROCKSDB_LITE
+
+Status MultiOpsTxnsStressTest::PrimaryKeyUpdateTxn(ThreadState* thread,
+                                                   uint32_t old_a,
+                                                   uint32_t old_a_pos,
+                                                   uint32_t new_a) {
+#ifdef ROCKSDB_LITE
+  (void)thread;
+  (void)old_a;
+  (void)old_a_pos;
+  (void)new_a;
+  return Status::NotSupported();
+#else
+  std::string old_pk = Record::EncodePrimaryKey(old_a);
+  std::string new_pk = Record::EncodePrimaryKey(new_a);
+  Transaction* txn = nullptr;
+  WriteOptions wopts;
+  Status s = NewTxn(wopts, &txn);
+  if (!s.ok()) {
+    assert(!txn);
+    thread->stats.AddErrors(1);
+    return s;
+  }
+
+  assert(txn);
+  txn->SetSnapshotOnNextOperation(/*notifier=*/nullptr);
+
+  const Defer cleanup([new_a, &s, thread, txn, this]() {
+    if (s.ok()) {
+      // Two gets, one for existing pk, one for locking potential new pk.
+      thread->stats.AddGets(/*ngets=*/2, /*nfounds=*/1);
+      thread->stats.AddDeletes(1);
+      thread->stats.AddBytesForWrites(
+          /*nwrites=*/2,
+          Record::kPrimaryIndexEntrySize + Record::kSecondaryIndexEntrySize);
+      thread->stats.AddSingleDeletes(1);
+      return;
+    }
+    if (s.IsNotFound()) {
+      thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/0);
+    } else if (s.IsBusy() || s.IsIncomplete()) {
+      // ignore.
+      // Incomplete also means rollback by application. See the transaction
+      // implementations.
+    } else {
+      thread->stats.AddErrors(1);
+    }
+    auto& key_gen = key_gen_for_a_[thread->tid];
+    key_gen->UndoAllocation(new_a);
+    RollbackTxn(txn).PermitUncheckedError();
+  });
+
+  ReadOptions ropts;
+  ropts.rate_limiter_priority =
+      FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+  std::string value;
+  s = txn->GetForUpdate(ropts, old_pk, &value);
+  if (!s.ok()) {
+    return s;
+  }
+  std::string empty_value;
+  s = txn->GetForUpdate(ropts, new_pk, &empty_value);
+  if (s.ok()) {
+    assert(!empty_value.empty());
+    s = Status::Busy();
+    return s;
+  } else if (!s.IsNotFound()) {
+    return s;
+  }
+
+  auto result = Record::DecodePrimaryIndexValue(value);
+  s = std::get<0>(result);
+  if (!s.ok()) {
+    return s;
+  }
+  uint32_t b = std::get<1>(result);
+  uint32_t c = std::get<2>(result);
+
+  ColumnFamilyHandle* cf = db_->DefaultColumnFamily();
+  s = txn->Delete(cf, old_pk, /*assume_tracked=*/true);
+  if (!s.ok()) {
+    return s;
+  }
+  s = txn->Put(cf, new_pk, value, /*assume_tracked=*/true);
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto* wb = txn->GetWriteBatch();
+  assert(wb);
+
+  std::string old_sk = Record::EncodeSecondaryKey(c, old_a);
+  s = wb->SingleDelete(old_sk);
+  if (!s.ok()) {
+    return s;
+  }
+
+  Record record(new_a, b, c);
+  std::string new_sk;
+  std::string new_crc;
+  std::tie(new_sk, new_crc) = record.EncodeSecondaryIndexEntry();
+  s = wb->Put(new_sk, new_crc);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = txn->Prepare();
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (FLAGS_rollback_one_in > 0 && thread->rand.OneIn(FLAGS_rollback_one_in)) {
+    s = Status::Incomplete();
+    return s;
+  }
+
+  s = WriteToCommitTimeWriteBatch(*txn);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = CommitAndCreateTimestampedSnapshotIfNeeded(thread, *txn);
+
+  auto& key_gen = key_gen_for_a_.at(thread->tid);
+  if (s.ok()) {
+    delete txn;
+    key_gen->Replace(old_a, old_a_pos, new_a);
+  }
+  return s;
+#endif  // !ROCKSDB_LITE
+}
+
+Status MultiOpsTxnsStressTest::SecondaryKeyUpdateTxn(ThreadState* thread,
+                                                     uint32_t old_c,
+                                                     uint32_t old_c_pos,
+                                                     uint32_t new_c) {
+#ifdef ROCKSDB_LITE
+  (void)thread;
+  (void)old_c;
+  (void)old_c_pos;
+  (void)new_c;
+  return Status::NotSupported();
+#else
+  Transaction* txn = nullptr;
+  WriteOptions wopts;
+  Status s = NewTxn(wopts, &txn);
+  if (!s.ok()) {
+    assert(!txn);
+    thread->stats.AddErrors(1);
+    return s;
+  }
+
+  assert(txn);
+
+  Iterator* it = nullptr;
+  long iterations = 0;
+  const Defer cleanup([new_c, &s, thread, &it, txn, this, &iterations]() {
+    delete it;
+    if (s.ok()) {
+      thread->stats.AddIterations(iterations);
+      thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1);
+      thread->stats.AddSingleDeletes(1);
+      thread->stats.AddBytesForWrites(
+          /*nwrites=*/2,
+          Record::kPrimaryIndexEntrySize + Record::kSecondaryIndexEntrySize);
+      return;
+    } else if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain() ||
+               s.IsMergeInProgress() || s.IsIncomplete()) {
+      // ww-conflict detected, or
+      // lock cannot be acquired, or
+      // memtable history is not large enough for conflict checking, or
+      // Merge operation cannot be resolved, or
+      // application rollback.
+      // TODO (yanqin) add stats for other cases?
+    } else if (s.IsNotFound()) {
+      // ignore.
+    } else {
+      thread->stats.AddErrors(1);
+    }
+    auto& key_gen = key_gen_for_c_[thread->tid];
+    key_gen->UndoAllocation(new_c);
+    RollbackTxn(txn).PermitUncheckedError();
+  });
+
+  // TODO (yanqin) try SetSnapshotOnNextOperation(). We currently need to take
+  // a snapshot here because we will later verify that point lookup in the
+  // primary index using GetForUpdate() returns the same value for 'c' as the
+  // iterator. The iterator does not need a snapshot though, because it will be
+  // assigned the current latest (published) sequence in the db, which will be
+  // no smaller than the snapshot created here. The GetForUpdate will perform
+  // ww conflict checking to ensure GetForUpdate() (using the snapshot) sees
+  // the same data as this iterator.
+  txn->SetSnapshot();
+  std::string old_sk_prefix = Record::EncodeSecondaryKey(old_c);
+  std::string iter_ub_str = Record::EncodeSecondaryKey(old_c + 1);
+  Slice iter_ub = iter_ub_str;
+  ReadOptions ropts;
+  ropts.snapshot = txn->GetSnapshot();
+  ropts.total_order_seek = true;
+  ropts.iterate_upper_bound = &iter_ub;
+  ropts.rate_limiter_priority =
+      FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+  it = txn->GetIterator(ropts);
+
+  assert(it);
+  it->Seek(old_sk_prefix);
+  if (!it->Valid()) {
+    s = Status::NotFound();
+    return s;
+  }
+  auto* wb = txn->GetWriteBatch();
+  assert(wb);
+
+  do {
+    ++iterations;
+    Record record;
+    s = record.DecodeSecondaryIndexEntry(it->key(), it->value());
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot decode secondary key (%s => %s): %s\n",
+              it->key().ToString(true).c_str(),
+              it->value().ToString(true).c_str(), s.ToString().c_str());
+      assert(false);
+      break;
+    }
+    // At this point, record.b is not known yet, thus we need to access
+    // primary index.
+    std::string pk = Record::EncodePrimaryKey(record.a_value());
+    std::string value;
+    ReadOptions read_opts;
+    read_opts.rate_limiter_priority =
+        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+    read_opts.snapshot = txn->GetSnapshot();
+    s = txn->GetForUpdate(read_opts, pk, &value);
+    if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain() ||
+        s.IsMergeInProgress()) {
+      // Write conflict, or cannot acquire lock, or memtable size is not large
+      // enough, or merge cannot be resolved.
+      break;
+    } else if (s.IsNotFound()) {
+      // We can also fail verification here.
+      std::ostringstream oss;
+      auto* dbimpl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+      assert(dbimpl);
+      oss << "snap " << read_opts.snapshot->GetSequenceNumber()
+          << " (published " << dbimpl->GetLastPublishedSequence()
+          << "), pk should exist: " << Slice(pk).ToString(true);
+      fprintf(stderr, "%s\n", oss.str().c_str());
+      assert(false);
+      break;
+    }
+    if (!s.ok()) {
+      std::ostringstream oss;
+      auto* dbimpl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+      assert(dbimpl);
+      oss << "snap " << read_opts.snapshot->GetSequenceNumber()
+          << " (published " << dbimpl->GetLastPublishedSequence() << "), "
+          << s.ToString();
+      fprintf(stderr, "%s\n", oss.str().c_str());
+      assert(false);
+      break;
+    }
+    auto result = Record::DecodePrimaryIndexValue(value);
+    s = std::get<0>(result);
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot decode primary index value %s: %s\n",
+              Slice(value).ToString(true).c_str(), s.ToString().c_str());
+      assert(false);
+      break;
+    }
+    uint32_t b = std::get<1>(result);
+    uint32_t c = std::get<2>(result);
+    if (c != old_c) {
+      std::ostringstream oss;
+      auto* dbimpl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+      assert(dbimpl);
+      oss << "snap " << read_opts.snapshot->GetSequenceNumber()
+          << " (published " << dbimpl->GetLastPublishedSequence()
+          << "), pk/sk mismatch. pk: (a=" << record.a_value() << ", "
+          << "c=" << c << "), sk: (c=" << old_c << ")";
+      s = Status::Corruption();
+      fprintf(stderr, "%s\n", oss.str().c_str());
+      assert(false);
+      break;
+    }
+    Record new_rec(record.a_value(), b, new_c);
+    std::string new_primary_index_value = new_rec.EncodePrimaryIndexValue();
+    ColumnFamilyHandle* cf = db_->DefaultColumnFamily();
+    s = txn->Put(cf, pk, new_primary_index_value, /*assume_tracked=*/true);
+    if (!s.ok()) {
+      break;
+    }
+    std::string old_sk = it->key().ToString(/*hex=*/false);
+    std::string new_sk;
+    std::string new_crc;
+    std::tie(new_sk, new_crc) = new_rec.EncodeSecondaryIndexEntry();
+    s = wb->SingleDelete(old_sk);
+    if (!s.ok()) {
+      break;
+    }
+    s = wb->Put(new_sk, new_crc);
+    if (!s.ok()) {
+      break;
+    }
+
+    it->Next();
+  } while (it->Valid());
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = txn->Prepare();
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (FLAGS_rollback_one_in > 0 && thread->rand.OneIn(FLAGS_rollback_one_in)) {
+    s = Status::Incomplete();
+    return s;
+  }
+
+  s = WriteToCommitTimeWriteBatch(*txn);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = CommitAndCreateTimestampedSnapshotIfNeeded(thread, *txn);
+
+  if (s.ok()) {
+    delete txn;
+    auto& key_gen = key_gen_for_c_.at(thread->tid);
+    key_gen->Replace(old_c, old_c_pos, new_c);
+  }
+
+  return s;
+#endif  // !ROCKSDB_LITE
+}
+
+Status MultiOpsTxnsStressTest::UpdatePrimaryIndexValueTxn(ThreadState* thread,
+                                                          uint32_t a,
+                                                          uint32_t b_delta) {
+#ifdef ROCKSDB_LITE
+  (void)thread;
+  (void)a;
+  (void)b_delta;
+  return Status::NotSupported();
+#else
+  std::string pk_str = Record::EncodePrimaryKey(a);
+  Transaction* txn = nullptr;
+  WriteOptions wopts;
+  Status s = NewTxn(wopts, &txn);
+  if (!s.ok()) {
+    assert(!txn);
+    thread->stats.AddErrors(1);
+    return s;
+  }
+
+  assert(txn);
+
+  const Defer cleanup([&s, thread, txn, this]() {
+    if (s.ok()) {
+      thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1);
+      thread->stats.AddBytesForWrites(
+          /*nwrites=*/1, /*nbytes=*/Record::kPrimaryIndexEntrySize);
+      return;
+    }
+    if (s.IsNotFound()) {
+      thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/0);
+    } else if (s.IsInvalidArgument()) {
+      // ignored.
+    } else if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain() ||
+               s.IsMergeInProgress() || s.IsIncomplete()) {
+      // ignored.
+    } else {
+      thread->stats.AddErrors(1);
+    }
+    RollbackTxn(txn).PermitUncheckedError();
+  });
+  ReadOptions ropts;
+  ropts.rate_limiter_priority =
+      FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+  std::string value;
+  s = txn->GetForUpdate(ropts, pk_str, &value);
+  if (!s.ok()) {
+    return s;
+  }
+  auto result = Record::DecodePrimaryIndexValue(value);
+  if (!std::get<0>(result).ok()) {
+    s = std::get<0>(result);
+    fprintf(stderr, "Cannot decode primary index value %s: %s\n",
+            Slice(value).ToString(true).c_str(), s.ToString().c_str());
+    assert(false);
+    return s;
+  }
+  uint32_t b = std::get<1>(result) + b_delta;
+  uint32_t c = std::get<2>(result);
+  Record record(a, b, c);
+  std::string primary_index_value = record.EncodePrimaryIndexValue();
+  ColumnFamilyHandle* cf = db_->DefaultColumnFamily();
+  s = txn->Put(cf, pk_str, primary_index_value, /*assume_tracked=*/true);
+  if (!s.ok()) {
+    return s;
+  }
+  s = txn->Prepare();
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (FLAGS_rollback_one_in > 0 && thread->rand.OneIn(FLAGS_rollback_one_in)) {
+    s = Status::Incomplete();
+    return s;
+  }
+
+  s = WriteToCommitTimeWriteBatch(*txn);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = CommitAndCreateTimestampedSnapshotIfNeeded(thread, *txn);
+
+  if (s.ok()) {
+    delete txn;
+  }
+  return s;
+#endif  // !ROCKSDB_LITE
+}
+
+Status MultiOpsTxnsStressTest::PointLookupTxn(ThreadState* thread,
+                                              ReadOptions ropts, uint32_t a) {
+#ifdef ROCKSDB_LITE
+  (void)thread;
+  (void)ropts;
+  (void)a;
+  return Status::NotSupported();
+#else
+  std::string pk_str = Record::EncodePrimaryKey(a);
+  // pk may or may not exist
+  PinnableSlice value;
+
+  Transaction* txn = nullptr;
+  WriteOptions wopts;
+  Status s = NewTxn(wopts, &txn);
+  if (!s.ok()) {
+    assert(!txn);
+    thread->stats.AddErrors(1);
+    return s;
+  }
+
+  assert(txn);
+
+  const Defer cleanup([&s, thread, txn, this]() {
+    if (s.ok()) {
+      thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1);
+      return;
+    } else if (s.IsNotFound()) {
+      thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/0);
+    } else {
+      thread->stats.AddErrors(1);
+    }
+    RollbackTxn(txn).PermitUncheckedError();
+  });
+
+  std::shared_ptr<const Snapshot> snapshot;
+  SetupSnapshot(thread, ropts, *txn, snapshot);
+
+  if (FLAGS_delay_snapshot_read_one_in > 0 &&
+      thread->rand.OneIn(FLAGS_delay_snapshot_read_one_in)) {
+    uint64_t delay_ms = thread->rand.Uniform(100) + 1;
+    db_->GetDBOptions().env->SleepForMicroseconds(
+        static_cast<int>(delay_ms * 1000));
+  }
+
+  s = txn->Get(ropts, db_->DefaultColumnFamily(), pk_str, &value);
+  if (s.ok()) {
+    s = txn->Commit();
+  }
+  if (s.ok()) {
+    delete txn;
+  }
+  return s;
+#endif  // !ROCKSDB_LITE
+}
+
+Status MultiOpsTxnsStressTest::RangeScanTxn(ThreadState* thread,
+                                            ReadOptions ropts, uint32_t c) {
+#ifdef ROCKSDB_LITE
+  (void)thread;
+  (void)ropts;
+  (void)c;
+  return Status::NotSupported();
+#else
+  std::string sk = Record::EncodeSecondaryKey(c);
+
+  Transaction* txn = nullptr;
+  WriteOptions wopts;
+  Status s = NewTxn(wopts, &txn);
+  if (!s.ok()) {
+    assert(!txn);
+    thread->stats.AddErrors(1);
+    return s;
+  }
+
+  assert(txn);
+
+  const Defer cleanup([&s, thread, txn, this]() {
+    if (s.ok()) {
+      thread->stats.AddIterations(1);
+      return;
+    }
+    thread->stats.AddErrors(1);
+    RollbackTxn(txn).PermitUncheckedError();
+  });
+
+  std::shared_ptr<const Snapshot> snapshot;
+  SetupSnapshot(thread, ropts, *txn, snapshot);
+
+  if (FLAGS_delay_snapshot_read_one_in > 0 &&
+      thread->rand.OneIn(FLAGS_delay_snapshot_read_one_in)) {
+    uint64_t delay_ms = thread->rand.Uniform(100) + 1;
+    db_->GetDBOptions().env->SleepForMicroseconds(
+        static_cast<int>(delay_ms * 1000));
+  }
+
+  std::unique_ptr<Iterator> iter(txn->GetIterator(ropts));
+
+  constexpr size_t total_nexts = 10;
+  size_t nexts = 0;
+  for (iter->Seek(sk);
+       iter->Valid() && nexts < total_nexts && iter->status().ok();
+       iter->Next(), ++nexts) {
+  }
+
+  if (iter->status().ok()) {
+    s = txn->Commit();
+  } else {
+    s = iter->status();
+  }
+
+  if (s.ok()) {
+    delete txn;
+  }
+
+  return s;
+#endif  // !ROCKSDB_LITE
+}
+
+void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const {
+  if (thread->shared->HasVerificationFailedYet()) {
+    return;
+  }
+  const Snapshot* const snapshot = db_->GetSnapshot();
+  assert(snapshot);
+  ManagedSnapshot snapshot_guard(db_, snapshot);
+
+  std::ostringstream oss;
+  oss << "[snap=" << snapshot->GetSequenceNumber() << ",";
+
+  auto* dbimpl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+  assert(dbimpl);
+
+  oss << " last_published=" << dbimpl->GetLastPublishedSequence() << "] ";
+
+  if (FLAGS_delay_snapshot_read_one_in > 0 &&
+      thread->rand.OneIn(FLAGS_delay_snapshot_read_one_in)) {
+    uint64_t delay_ms = thread->rand.Uniform(100) + 1;
+    db_->GetDBOptions().env->SleepForMicroseconds(
+        static_cast<int>(delay_ms * 1000));
+  }
+
+  // TODO (yanqin) with a probability, we can use either forward or backward
+  // iterator in subsequent checks. We can also use more advanced features in
+  // range scan. For now, let's just use simple forward iteration with
+  // total_order_seek = true.
+
+  // First, iterate primary index.
+  size_t primary_index_entries_count = 0;
+  {
+    std::string iter_ub_str;
+    PutFixed32(&iter_ub_str, Record::kPrimaryIndexId + 1);
+    std::reverse(iter_ub_str.begin(), iter_ub_str.end());
+    Slice iter_ub = iter_ub_str;
+
+    std::string start_key;
+    PutFixed32(&start_key, Record::kPrimaryIndexId);
+    std::reverse(start_key.begin(), start_key.end());
+
+    // This `ReadOptions` is for validation purposes. Ignore
+    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+    ReadOptions ropts;
+    ropts.snapshot = snapshot;
+    ropts.total_order_seek = true;
+    ropts.iterate_upper_bound = &iter_ub;
+
+    std::unique_ptr<Iterator> it(db_->NewIterator(ropts));
+    for (it->Seek(start_key); it->Valid(); it->Next()) {
+      Record record;
+      Status s = record.DecodePrimaryIndexEntry(it->key(), it->value());
+      if (!s.ok()) {
+        oss << "Cannot decode primary index entry " << it->key().ToString(true)
+            << "=>" << it->value().ToString(true);
+        VerificationAbort(thread->shared, oss.str(), s);
+        assert(false);
+        return;
+      }
+      ++primary_index_entries_count;
+
+      // Search secondary index.
+      uint32_t a = record.a_value();
+      uint32_t c = record.c_value();
+      char sk_buf[12];
+      EncodeFixed32(sk_buf, Record::kSecondaryIndexId);
+      std::reverse(sk_buf, sk_buf + sizeof(uint32_t));
+      EncodeFixed32(sk_buf + sizeof(uint32_t), c);
+      std::reverse(sk_buf + sizeof(uint32_t), sk_buf + 2 * sizeof(uint32_t));
+      EncodeFixed32(sk_buf + 2 * sizeof(uint32_t), a);
+      std::reverse(sk_buf + 2 * sizeof(uint32_t), sk_buf + sizeof(sk_buf));
+      Slice sk(sk_buf, sizeof(sk_buf));
+      std::string value;
+      s = db_->Get(ropts, sk, &value);
+      if (!s.ok()) {
+        oss << "Cannot find secondary index entry " << sk.ToString(true);
+        VerificationAbort(thread->shared, oss.str(), s);
+        assert(false);
+        return;
+      }
+    }
+  }
+
+  // Second, iterate secondary index.
+  size_t secondary_index_entries_count = 0;
+  {
+    std::string start_key;
+    PutFixed32(&start_key, Record::kSecondaryIndexId);
+    std::reverse(start_key.begin(), start_key.end());
+
+    // This `ReadOptions` is for validation purposes. Ignore
+    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+    ReadOptions ropts;
+    ropts.snapshot = snapshot;
+    ropts.total_order_seek = true;
+
+    std::unique_ptr<Iterator> it(db_->NewIterator(ropts));
+    for (it->Seek(start_key); it->Valid(); it->Next()) {
+      ++secondary_index_entries_count;
+      Record record;
+      Status s = record.DecodeSecondaryIndexEntry(it->key(), it->value());
+      if (!s.ok()) {
+        oss << "Cannot decode secondary index entry "
+            << it->key().ToString(true) << "=>" << it->value().ToString(true);
+        VerificationAbort(thread->shared, oss.str(), s);
+        assert(false);
+        return;
+      }
+      // After decoding secondary index entry, we know a and c. Crc is verified
+      // in decoding phase.
+      //
+      // Form a primary key and search in the primary index.
+      std::string pk = Record::EncodePrimaryKey(record.a_value());
+      std::string value;
+      s = db_->Get(ropts, pk, &value);
+      if (!s.ok()) {
+        oss << "Error searching pk " << Slice(pk).ToString(true) << ". "
+            << s.ToString() << ". sk " << it->key().ToString(true);
+        VerificationAbort(thread->shared, oss.str(), s);
+        assert(false);
+        return;
+      }
+      auto result = Record::DecodePrimaryIndexValue(value);
+      s = std::get<0>(result);
+      if (!s.ok()) {
+        oss << "Error decoding primary index value "
+            << Slice(value).ToString(true) << ". " << s.ToString();
+        VerificationAbort(thread->shared, oss.str(), s);
+        assert(false);
+        return;
+      }
+      uint32_t c_in_primary = std::get<2>(result);
+      if (c_in_primary != record.c_value()) {
+        oss << "Pk/sk mismatch. pk: " << Slice(pk).ToString(true) << "=>"
+            << Slice(value).ToString(true) << " (a=" << record.a_value()
+            << ", c=" << c_in_primary << "), sk: " << it->key().ToString(true)
+            << " (c=" << record.c_value() << ")";
+        VerificationAbort(thread->shared, oss.str(), s);
+        assert(false);
+        return;
+      }
+    }
+  }
+
+  if (secondary_index_entries_count != primary_index_entries_count) {
+    oss << "Pk/sk mismatch: primary index has " << primary_index_entries_count
+        << " entries. Secondary index has " << secondary_index_entries_count
+        << " entries.";
+    VerificationAbort(thread->shared, oss.str(), Status::OK());
+    assert(false);
+    return;
+  }
+}
+
+// VerifyPkSkFast() can be called by MultiOpsTxnsStressListener's callbacks
+// which can be called before TransactionDB::Open() returns to caller.
+// Therefore, at that time, db_ and txn_db_  may still be nullptr.
+// Caller has to make sure that the race condition does not happen.
+void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) {
+  DB* const db = db_aptr_.load(std::memory_order_acquire);
+  if (db == nullptr) {
+    return;
+  }
+
+  assert(db_ == db);
+  assert(db_ != nullptr);
+
+  const Snapshot* const snapshot = db_->GetSnapshot();
+  assert(snapshot);
+  ManagedSnapshot snapshot_guard(db_, snapshot);
+
+  std::ostringstream oss;
+  auto* dbimpl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+  assert(dbimpl);
+
+  oss << "Job " << job_id << ": [" << snapshot->GetSequenceNumber() << ","
+      << dbimpl->GetLastPublishedSequence() << "] ";
+
+  std::string start_key;
+  PutFixed32(&start_key, Record::kSecondaryIndexId);
+  std::reverse(start_key.begin(), start_key.end());
+
+  // This `ReadOptions` is for validation purposes. Ignore
+  // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+  ReadOptions ropts;
+  ropts.snapshot = snapshot;
+  ropts.total_order_seek = true;
+
+  std::unique_ptr<Iterator> it(db_->NewIterator(ropts));
+  for (it->Seek(start_key); it->Valid(); it->Next()) {
+    Record record;
+    Status s = record.DecodeSecondaryIndexEntry(it->key(), it->value());
+    if (!s.ok()) {
+      oss << "Cannot decode secondary index entry " << it->key().ToString(true)
+          << "=>" << it->value().ToString(true);
+      fprintf(stderr, "%s\n", oss.str().c_str());
+      fflush(stderr);
+      assert(false);
+    }
+    // After decoding secondary index entry, we know a and c. Crc is verified
+    // in decoding phase.
+    //
+    // Form a primary key and search in the primary index.
+    std::string pk = Record::EncodePrimaryKey(record.a_value());
+    std::string value;
+    s = db_->Get(ropts, pk, &value);
+    if (!s.ok()) {
+      oss << "Error searching pk " << Slice(pk).ToString(true) << ". "
+          << s.ToString() << ". sk " << it->key().ToString(true);
+      fprintf(stderr, "%s\n", oss.str().c_str());
+      fflush(stderr);
+      assert(false);
+    }
+    auto result = Record::DecodePrimaryIndexValue(value);
+    s = std::get<0>(result);
+    if (!s.ok()) {
+      oss << "Error decoding primary index value "
+          << Slice(value).ToString(true) << ". " << s.ToString();
+      fprintf(stderr, "%s\n", oss.str().c_str());
+      fflush(stderr);
+      assert(false);
+    }
+    uint32_t c_in_primary = std::get<2>(result);
+    if (c_in_primary != record.c_value()) {
+      oss << "Pk/sk mismatch. pk: " << Slice(pk).ToString(true) << "=>"
+          << Slice(value).ToString(true) << " (a=" << record.a_value()
+          << ", c=" << c_in_primary << "), sk: " << it->key().ToString(true)
+          << " (c=" << record.c_value() << ")";
+      fprintf(stderr, "%s\n", oss.str().c_str());
+      fflush(stderr);
+      assert(false);
+    }
+  }
+}
+
+std::pair<uint32_t, uint32_t> MultiOpsTxnsStressTest::ChooseExistingA(
+    ThreadState* thread) {
+  uint32_t tid = thread->tid;
+  auto& key_gen = key_gen_for_a_.at(tid);
+  return key_gen->ChooseExisting();
+}
+
+uint32_t MultiOpsTxnsStressTest::GenerateNextA(ThreadState* thread) {
+  uint32_t tid = thread->tid;
+  auto& key_gen = key_gen_for_a_.at(tid);
+  return key_gen->Allocate();
+}
+
+std::pair<uint32_t, uint32_t> MultiOpsTxnsStressTest::ChooseExistingC(
+    ThreadState* thread) {
+  uint32_t tid = thread->tid;
+  auto& key_gen = key_gen_for_c_.at(tid);
+  return key_gen->ChooseExisting();
+}
+
+uint32_t MultiOpsTxnsStressTest::GenerateNextC(ThreadState* thread) {
+  uint32_t tid = thread->tid;
+  auto& key_gen = key_gen_for_c_.at(tid);
+  return key_gen->Allocate();
+}
+
+#ifndef ROCKSDB_LITE
+void MultiOpsTxnsStressTest::ProcessRecoveredPreparedTxnsHelper(
+    Transaction* txn, SharedState*) {
+  thread_local Random rand(static_cast<uint32_t>(FLAGS_seed));
+  if (rand.OneIn(2)) {
+    Status s = txn->Commit();
+    assert(s.ok());
+  } else {
+    Status s = txn->Rollback();
+    assert(s.ok());
+  }
+}
+
+Status MultiOpsTxnsStressTest::WriteToCommitTimeWriteBatch(Transaction& txn) {
+  WriteBatch* ctwb = txn.GetCommitTimeWriteBatch();
+  assert(ctwb);
+  // Do not change the content in key_buf.
+  static constexpr char key_buf[sizeof(Record::kMetadataPrefix) + 4] = {
+      '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\xff'};
+
+  uint64_t counter_val = counter_.Next();
+  char val_buf[sizeof(counter_val)];
+  EncodeFixed64(val_buf, counter_val);
+  return ctwb->Put(Slice(key_buf, sizeof(key_buf)),
+                   Slice(val_buf, sizeof(val_buf)));
+}
+
+Status MultiOpsTxnsStressTest::CommitAndCreateTimestampedSnapshotIfNeeded(
+    ThreadState* thread, Transaction& txn) {
+  Status s;
+  if (FLAGS_create_timestamped_snapshot_one_in > 0 &&
+      thread->rand.OneInOpt(FLAGS_create_timestamped_snapshot_one_in)) {
+    uint64_t ts = db_stress_env->NowNanos();
+    std::shared_ptr<const Snapshot> snapshot;
+    s = txn.CommitAndTryCreateSnapshot(/*notifier=*/nullptr, ts, &snapshot);
+  } else {
+    s = txn.Commit();
+  }
+  assert(txn_db_);
+  if (FLAGS_create_timestamped_snapshot_one_in > 0 &&
+      thread->rand.OneInOpt(50000)) {
+    uint64_t now = db_stress_env->NowNanos();
+    constexpr uint64_t time_diff = static_cast<uint64_t>(1000) * 1000 * 1000;
+    txn_db_->ReleaseTimestampedSnapshotsOlderThan(now - time_diff);
+  }
+  return s;
+}
+
+void MultiOpsTxnsStressTest::SetupSnapshot(
+    ThreadState* thread, ReadOptions& read_opts, Transaction& txn,
+    std::shared_ptr<const Snapshot>& snapshot) {
+  if (thread->rand.OneInOpt(2)) {
+    snapshot = txn_db_->GetLatestTimestampedSnapshot();
+  }
+
+  if (snapshot) {
+    read_opts.snapshot = snapshot.get();
+  } else {
+    txn.SetSnapshot();
+    read_opts.snapshot = txn.GetSnapshot();
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+std::string MultiOpsTxnsStressTest::KeySpaces::EncodeTo() const {
+  std::string result;
+  PutFixed32(&result, lb_a);
+  PutFixed32(&result, ub_a);
+  PutFixed32(&result, lb_c);
+  PutFixed32(&result, ub_c);
+  return result;
+}
+
+bool MultiOpsTxnsStressTest::KeySpaces::DecodeFrom(Slice data) {
+  if (!GetFixed32(&data, &lb_a) || !GetFixed32(&data, &ub_a) ||
+      !GetFixed32(&data, &lb_c) || !GetFixed32(&data, &ub_c)) {
+    return false;
+  }
+  return true;
+}
+
+void MultiOpsTxnsStressTest::PersistKeySpacesDesc(
+    const std::string& key_spaces_path, uint32_t lb_a, uint32_t ub_a,
+    uint32_t lb_c, uint32_t ub_c) {
+  KeySpaces key_spaces(lb_a, ub_a, lb_c, ub_c);
+  std::string key_spaces_rep = key_spaces.EncodeTo();
+
+  std::unique_ptr<WritableFile> wfile;
+  Status s1 =
+      Env::Default()->NewWritableFile(key_spaces_path, &wfile, EnvOptions());
+  assert(s1.ok());
+  assert(wfile);
+  s1 = wfile->Append(key_spaces_rep);
+  assert(s1.ok());
+}
+
+MultiOpsTxnsStressTest::KeySpaces MultiOpsTxnsStressTest::ReadKeySpacesDesc(
+    const std::string& key_spaces_path) {
+  KeySpaces key_spaces;
+  std::unique_ptr<SequentialFile> sfile;
+  Status s1 =
+      Env::Default()->NewSequentialFile(key_spaces_path, &sfile, EnvOptions());
+  assert(s1.ok());
+  assert(sfile);
+  char buf[16];
+  Slice result;
+  s1 = sfile->Read(sizeof(buf), &result, buf);
+  assert(s1.ok());
+  if (!key_spaces.DecodeFrom(result)) {
+    assert(false);
+  }
+  return key_spaces;
+}
+
+// Create an empty database if necessary and preload it with initial test data.
+// Key range [lb_a, ub_a), [lb_c, ub_c). The key ranges will be shared by
+// 'threads' threads.
+// PreloadDb() also sets up KeyGenerator objects for each sub key range
+// operated on by each thread.
+// Both [lb_a, ub_a) and [lb_c, ub_c) are partitioned. Each thread operates on
+// one sub range, using KeyGenerators to generate keys.
+// For example, we choose a from [0, 10000) and c from [0, 100). Number of
+// threads is 32, their tids range from 0 to 31.
+// Thread k chooses a from [312*k,312*(k+1)) and c from [3*k,3*(k+1)) if k<31.
+// Thread 31 chooses a from [9672, 10000) and c from [93, 100).
+// Within each subrange: a from [low1, high1), c from [low2, high2).
+// high1 - low1 > high2 - low2
+// We reserve {high1 - 1} and {high2 - 1} as unallocated.
+// The records are <low1,low2>, <low1+1,low2+1>, ...,
+// <low1+k,low2+k%(high2-low2-1), <low1+k+1,low2+(k+1)%(high2-low2-1)>, ...
+void MultiOpsTxnsStressTest::PreloadDb(SharedState* shared, int threads,
+                                       uint32_t lb_a, uint32_t ub_a,
+                                       uint32_t lb_c, uint32_t ub_c) {
+#ifdef ROCKSDB_LITE
+  (void)shared;
+  (void)threads;
+  (void)lb_a;
+  (void)ub_a;
+  (void)lb_c;
+  (void)ub_c;
+#else
+  key_gen_for_a_.resize(threads);
+  key_gen_for_c_.resize(threads);
+
+  assert(ub_a > lb_a && ub_a > lb_a + threads);
+  assert(ub_c > lb_c && ub_c > lb_c + threads);
+
+  PersistKeySpacesDesc(FLAGS_key_spaces_path, lb_a, ub_a, lb_c, ub_c);
+
+  fprintf(stdout, "a from [%u, %u), c from [%u, %u)\n",
+          static_cast<unsigned int>(lb_a), static_cast<unsigned int>(ub_a),
+          static_cast<unsigned int>(lb_c), static_cast<unsigned int>(ub_c));
+
+  const uint32_t num_c = ub_c - lb_c;
+  const uint32_t num_c_per_thread = num_c / threads;
+  const uint32_t num_a = ub_a - lb_a;
+  const uint32_t num_a_per_thread = num_a / threads;
+
+  WriteOptions wopts;
+  wopts.disableWAL = FLAGS_disable_wal;
+  Random rnd(shared->GetSeed());
+  assert(txn_db_);
+
+  std::vector<KeySet> existing_a_uniqs(threads);
+  std::vector<KeySet> non_existing_a_uniqs(threads);
+  std::vector<KeySet> existing_c_uniqs(threads);
+  std::vector<KeySet> non_existing_c_uniqs(threads);
+
+  for (uint32_t a = lb_a; a < ub_a; ++a) {
+    uint32_t tid = (a - lb_a) / num_a_per_thread;
+    if (tid >= static_cast<uint32_t>(threads)) {
+      tid = threads - 1;
+    }
+
+    uint32_t a_base = lb_a + tid * num_a_per_thread;
+    uint32_t a_hi = (tid < static_cast<uint32_t>(threads - 1))
+                        ? (a_base + num_a_per_thread)
+                        : ub_a;
+    uint32_t a_delta = a - a_base;
+
+    if (a == a_hi - 1) {
+      non_existing_a_uniqs[tid].insert(a);
+      continue;
+    }
+
+    uint32_t c_base = lb_c + tid * num_c_per_thread;
+    uint32_t c_hi = (tid < static_cast<uint32_t>(threads - 1))
+                        ? (c_base + num_c_per_thread)
+                        : ub_c;
+    uint32_t c_delta = a_delta % (c_hi - c_base - 1);
+    uint32_t c = c_base + c_delta;
+
+    uint32_t b = rnd.Next();
+    Record record(a, b, c);
+    WriteBatch wb;
+    const auto primary_index_entry = record.EncodePrimaryIndexEntry();
+    Status s = wb.Put(primary_index_entry.first, primary_index_entry.second);
+    assert(s.ok());
+
+    const auto secondary_index_entry = record.EncodeSecondaryIndexEntry();
+    s = wb.Put(secondary_index_entry.first, secondary_index_entry.second);
+    assert(s.ok());
+
+    s = txn_db_->Write(wopts, &wb);
+    assert(s.ok());
+
+    // TODO (yanqin): make the following check optional, especially when data
+    // size is large.
+    Record tmp_rec;
+    tmp_rec.SetB(record.b_value());
+    s = tmp_rec.DecodeSecondaryIndexEntry(secondary_index_entry.first,
+                                          secondary_index_entry.second);
+    assert(s.ok());
+    assert(tmp_rec == record);
+
+    existing_a_uniqs[tid].insert(a);
+    existing_c_uniqs[tid].insert(c);
+  }
+
+  for (int i = 0; i < threads; ++i) {
+    uint32_t my_seed = i + shared->GetSeed();
+
+    auto& key_gen_for_a = key_gen_for_a_[i];
+    assert(!key_gen_for_a);
+    uint32_t low = lb_a + i * num_a_per_thread;
+    uint32_t high = (i < threads - 1) ? (low + num_a_per_thread) : ub_a;
+    assert(existing_a_uniqs[i].size() == high - low - 1);
+    assert(non_existing_a_uniqs[i].size() == 1);
+    key_gen_for_a = std::make_unique<KeyGenerator>(
+        my_seed, low, high, std::move(existing_a_uniqs[i]),
+        std::move(non_existing_a_uniqs[i]));
+
+    auto& key_gen_for_c = key_gen_for_c_[i];
+    assert(!key_gen_for_c);
+    low = lb_c + i * num_c_per_thread;
+    high = (i < threads - 1) ? (low + num_c_per_thread) : ub_c;
+    non_existing_c_uniqs[i].insert(high - 1);
+    assert(existing_c_uniqs[i].size() == high - low - 1);
+    assert(non_existing_c_uniqs[i].size() == 1);
+    key_gen_for_c = std::make_unique<KeyGenerator>(
+        my_seed, low, high, std::move(existing_c_uniqs[i]),
+        std::move(non_existing_c_uniqs[i]));
+  }
+#endif  // !ROCKSDB_LITE
+}
+
+// Scan an existing, non-empty database.
+// Set up [lb_a, ub_a) and [lb_c, ub_c) as test key ranges.
+// Set up KeyGenerator objects for each sub key range operated on by each
+// thread.
+// Scan the entire database and for each subrange, populate the existing keys
+// and non-existing keys. We currently require the non-existing keys be
+// non-empty after initialization.
+void MultiOpsTxnsStressTest::ScanExistingDb(SharedState* shared, int threads) {
+  key_gen_for_a_.resize(threads);
+  key_gen_for_c_.resize(threads);
+
+  KeySpaces key_spaces = ReadKeySpacesDesc(FLAGS_key_spaces_path);
+
+  const uint32_t lb_a = key_spaces.lb_a;
+  const uint32_t ub_a = key_spaces.ub_a;
+  const uint32_t lb_c = key_spaces.lb_c;
+  const uint32_t ub_c = key_spaces.ub_c;
+
+  assert(lb_a < ub_a && lb_c < ub_c);
+
+  fprintf(stdout, "a from [%u, %u), c from [%u, %u)\n",
+          static_cast<unsigned int>(lb_a), static_cast<unsigned int>(ub_a),
+          static_cast<unsigned int>(lb_c), static_cast<unsigned int>(ub_c));
+
+  assert(ub_a > lb_a && ub_a > lb_a + threads);
+  assert(ub_c > lb_c && ub_c > lb_c + threads);
+
+  const uint32_t num_c = ub_c - lb_c;
+  const uint32_t num_c_per_thread = num_c / threads;
+  const uint32_t num_a = ub_a - lb_a;
+  const uint32_t num_a_per_thread = num_a / threads;
+
+  assert(db_);
+  ReadOptions ropts;
+  std::vector<KeySet> existing_a_uniqs(threads);
+  std::vector<KeySet> non_existing_a_uniqs(threads);
+  std::vector<KeySet> existing_c_uniqs(threads);
+  std::vector<KeySet> non_existing_c_uniqs(threads);
+  {
+    std::string pk_lb_str = Record::EncodePrimaryKey(0);
+    std::string pk_ub_str =
+        Record::EncodePrimaryKey(std::numeric_limits<uint32_t>::max());
+    Slice pk_lb = pk_lb_str;
+    Slice pk_ub = pk_ub_str;
+    ropts.iterate_lower_bound = &pk_lb;
+    ropts.iterate_upper_bound = &pk_ub;
+    ropts.total_order_seek = true;
+    std::unique_ptr<Iterator> it(db_->NewIterator(ropts));
+
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      Record record;
+      Status s = record.DecodePrimaryIndexEntry(it->key(), it->value());
+      if (!s.ok()) {
+        fprintf(stderr, "Cannot decode primary index entry (%s => %s): %s\n",
+                it->key().ToString(true).c_str(),
+                it->value().ToString(true).c_str(), s.ToString().c_str());
+        assert(false);
+      }
+      uint32_t a = record.a_value();
+      assert(a >= lb_a);
+      assert(a < ub_a);
+      uint32_t tid = (a - lb_a) / num_a_per_thread;
+      if (tid >= static_cast<uint32_t>(threads)) {
+        tid = threads - 1;
+      }
+
+      existing_a_uniqs[tid].insert(a);
+
+      uint32_t c = record.c_value();
+      assert(c >= lb_c);
+      assert(c < ub_c);
+      tid = (c - lb_c) / num_c_per_thread;
+      if (tid >= static_cast<uint32_t>(threads)) {
+        tid = threads - 1;
+      }
+      auto& existing_c_uniq = existing_c_uniqs[tid];
+      existing_c_uniq.insert(c);
+    }
+
+    for (uint32_t a = lb_a; a < ub_a; ++a) {
+      uint32_t tid = (a - lb_a) / num_a_per_thread;
+      if (tid >= static_cast<uint32_t>(threads)) {
+        tid = threads - 1;
+      }
+      if (0 == existing_a_uniqs[tid].count(a)) {
+        non_existing_a_uniqs[tid].insert(a);
+      }
+    }
+
+    for (uint32_t c = lb_c; c < ub_c; ++c) {
+      uint32_t tid = (c - lb_c) / num_c_per_thread;
+      if (tid >= static_cast<uint32_t>(threads)) {
+        tid = threads - 1;
+      }
+      if (0 == existing_c_uniqs[tid].count(c)) {
+        non_existing_c_uniqs[tid].insert(c);
+      }
+    }
+
+    for (int i = 0; i < threads; ++i) {
+      uint32_t my_seed = i + shared->GetSeed();
+      auto& key_gen_for_a = key_gen_for_a_[i];
+      assert(!key_gen_for_a);
+      uint32_t low = lb_a + i * num_a_per_thread;
+      uint32_t high = (i < threads - 1) ? (low + num_a_per_thread) : ub_a;
+
+      // The following two assertions assume the test thread count and key
+      // space remain the same across different runs. Will need to relax.
+      assert(existing_a_uniqs[i].size() == high - low - 1);
+      assert(non_existing_a_uniqs[i].size() == 1);
+
+      key_gen_for_a = std::make_unique<KeyGenerator>(
+          my_seed, low, high, std::move(existing_a_uniqs[i]),
+          std::move(non_existing_a_uniqs[i]));
+
+      auto& key_gen_for_c = key_gen_for_c_[i];
+      assert(!key_gen_for_c);
+      low = lb_c + i * num_c_per_thread;
+      high = (i < threads - 1) ? (low + num_c_per_thread) : ub_c;
+
+      // The following two assertions assume the test thread count and key
+      // space remain the same across different runs. Will need to relax.
+      assert(existing_c_uniqs[i].size() == high - low - 1);
+      assert(non_existing_c_uniqs[i].size() == 1);
+
+      key_gen_for_c = std::make_unique<KeyGenerator>(
+          my_seed, low, high, std::move(existing_c_uniqs[i]),
+          std::move(non_existing_c_uniqs[i]));
+    }
+  }
+}
+
+StressTest* CreateMultiOpsTxnsStressTest() {
+  return new MultiOpsTxnsStressTest();
+}
+
+void CheckAndSetOptionsForMultiOpsTxnStressTest() {
+#ifndef ROCKSDB_LITE
+  if (FLAGS_test_batches_snapshots || FLAGS_test_cf_consistency) {
+    fprintf(stderr,
+            "-test_multi_ops_txns is not compatible with "
+            "-test_bathces_snapshots and -test_cf_consistency\n");
+    exit(1);
+  }
+  if (!FLAGS_use_txn) {
+    fprintf(stderr, "-use_txn must be true if -test_multi_ops_txns\n");
+    exit(1);
+  } else if (FLAGS_test_secondary > 0) {
+    fprintf(
+        stderr,
+        "secondary instance does not support replaying logs (MANIFEST + WAL) "
+        "of TransactionDB with write-prepared/write-unprepared policy\n");
+    exit(1);
+  }
+  if (FLAGS_clear_column_family_one_in > 0) {
+    fprintf(stderr,
+            "-test_multi_ops_txns is not compatible with clearing column "
+            "families\n");
+    exit(1);
+  }
+  if (FLAGS_column_families > 1) {
+    // TODO (yanqin) support separating primary index and secondary index in
+    // different column families.
+    fprintf(stderr,
+            "-test_multi_ops_txns currently does not use more than one column "
+            "family\n");
+    exit(1);
+  }
+  if (FLAGS_writepercent > 0 || FLAGS_delpercent > 0 ||
+      FLAGS_delrangepercent > 0) {
+    fprintf(stderr,
+            "-test_multi_ops_txns requires that -writepercent, -delpercent and "
+            "-delrangepercent be 0\n");
+    exit(1);
+  }
+  if (FLAGS_key_spaces_path.empty()) {
+    fprintf(stderr,
+            "Must specify a file to store ranges of A and C via "
+            "-key_spaces_path\n");
+    exit(1);
+  }
+  if (FLAGS_create_timestamped_snapshot_one_in > 0) {
+    if (FLAGS_txn_write_policy !=
+        static_cast<uint64_t>(TxnDBWritePolicy::WRITE_COMMITTED)) {
+      fprintf(stderr,
+              "Timestamped snapshot is not yet supported by "
+              "write-prepared/write-unprepared transactions\n");
+      exit(1);
+    }
+  }
+  if (FLAGS_sync_fault_injection == 1) {
+    fprintf(stderr,
+            "Sync fault injection is currently not supported in "
+            "-test_multi_ops_txns\n");
+    exit(1);
+  }
+#else
+  fprintf(stderr, "-test_multi_ops_txns not supported in ROCKSDB_LITE mode\n");
+  exit(1);
+#endif  // !ROCKSDB_LITE
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/multi_ops_txns_stress.h b/src/rocksdb/db_stress_tool/multi_ops_txns_stress.h
new file mode 100644
index 000000000..7463d05d7
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/multi_ops_txns_stress.h
@@ -0,0 +1,444 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file defines MultiOpsTxnsStress so that we can stress test RocksDB
+// transactions on a simple, emulated relational table.
+//
+// The record format is similar to the example found at
+// https://github.com/facebook/mysql-5.6/wiki/MyRocks-record-format.
+//
+// The table is created by
+// ```
+// create table t1 (
+//   a int primary key,
+//   b int,
+//   c int,
+//   key(c),
+//   )
+// ```
+//
+// (For simplicity, we use uint32_t for int here.)
+//
+// For this table, there is a primary index using `a`, as well as a secondary
+// index using `c` and `a`.
+//
+// Primary key format:
+// | index id | M(a) |
+// Primary index value:
+// | b | c |
+// M(a) represents the big-endian format of a.
+//
+// Secondary key format:
+// | index id | M(c) | M(a) |
+// Secondary index value:
+// | crc32 |
+// Similarly to M(a), M(c) is the big-endian format of c.
+//
+// The in-memory representation of a record is defined in class
+// MultiOpsTxnsStress:Record that includes a number of helper methods to
+// encode/decode primary index keys, primary index values, secondary index keys,
+// secondary index values, etc.
+//
+// Sometimes primary index and secondary index reside on different column
+// families, but sometimes they colocate in the same column family. Current
+// implementation puts them in the same (default) column family, and this is
+// subject to future change if we find it interesting to test the other case.
+//
+// Class MultiOpsTxnsStressTest has the following transactions for testing.
+//
+// 1. Primary key update
+// UPDATE t1 SET a = 3 WHERE a = 2;
+// ```
+// tx->GetForUpdate(primary key a=2)
+// tx->GetForUpdate(primary key a=3)
+// tx->Delete(primary key a=2)
+// tx->Put(primary key a=3, value)
+// tx->batch->SingleDelete(secondary key a=2)
+// tx->batch->Put(secondary key a=3, value)
+// tx->Prepare()
+// Tx->Commit()
+// ```
+//
+// 2. Secondary key update
+// UPDATE t1 SET c = 3 WHERE c = 2;
+// ```
+// iter->Seek(secondary key)
+// // Get corresponding primary key value(s) from iterator
+// tx->GetForUpdate(primary key)
+// tx->Put(primary key, value c=3)
+// tx->batch->SingleDelete(secondary key c=2)
+// tx->batch->Put(secondary key c=3)
+// tx->Prepare()
+// tx->Commit()
+// ```
+//
+// 3. Primary index value update
+// UPDATE t1 SET b = b + 1 WHERE a = 2;
+// ```
+// tx->GetForUpdate(primary key a=2)
+// tx->Put(primary key a=2, value b=b+1)
+// tx->Prepare()
+// tx->Commit()
+// ```
+//
+// 4. Point lookup
+// SELECT * FROM t1 WHERE a = 3;
+// ```
+// tx->Get(primary key a=3)
+// tx->Commit()
+// ```
+//
+// 5. Range scan
+// SELECT * FROM t1 WHERE c = 2;
+// ```
+// it = tx->GetIterator()
+// it->Seek(secondary key c=2)
+// tx->Commit()
+// ```
+
+class MultiOpsTxnsStressTest : public StressTest {
+ public:
+  class Record {
+   public:
+    static constexpr uint32_t kMetadataPrefix = 0;
+    static constexpr uint32_t kPrimaryIndexId = 1;
+    static constexpr uint32_t kSecondaryIndexId = 2;
+
+    static constexpr size_t kPrimaryIndexEntrySize = 8 + 8;
+    static constexpr size_t kSecondaryIndexEntrySize = 12 + 4;
+
+    static_assert(kPrimaryIndexId < kSecondaryIndexId,
+                  "kPrimaryIndexId must be smaller than kSecondaryIndexId");
+
+    static_assert(sizeof(kPrimaryIndexId) == sizeof(uint32_t),
+                  "kPrimaryIndexId must be 4 bytes");
+    static_assert(sizeof(kSecondaryIndexId) == sizeof(uint32_t),
+                  "kSecondaryIndexId must be 4 bytes");
+
+    // Used for generating search key to probe primary index.
+    static std::string EncodePrimaryKey(uint32_t a);
+    // Used for generating search prefix to probe secondary index.
+    static std::string EncodeSecondaryKey(uint32_t c);
+    // Used for generating search key to probe secondary index.
+    static std::string EncodeSecondaryKey(uint32_t c, uint32_t a);
+
+    static std::tuple<Status, uint32_t, uint32_t> DecodePrimaryIndexValue(
+        Slice primary_index_value);
+
+    static std::pair<Status, uint32_t> DecodeSecondaryIndexValue(
+        Slice secondary_index_value);
+
+    Record() = default;
+    Record(uint32_t _a, uint32_t _b, uint32_t _c) : a_(_a), b_(_b), c_(_c) {}
+
+    bool operator==(const Record& other) const {
+      return a_ == other.a_ && b_ == other.b_ && c_ == other.c_;
+    }
+
+    bool operator!=(const Record& other) const { return !(*this == other); }
+
+    std::pair<std::string, std::string> EncodePrimaryIndexEntry() const;
+
+    std::string EncodePrimaryKey() const;
+
+    std::string EncodePrimaryIndexValue() const;
+
+    std::pair<std::string, std::string> EncodeSecondaryIndexEntry() const;
+
+    std::string EncodeSecondaryKey() const;
+
+    Status DecodePrimaryIndexEntry(Slice primary_index_key,
+                                   Slice primary_index_value);
+
+    Status DecodeSecondaryIndexEntry(Slice secondary_index_key,
+                                     Slice secondary_index_value);
+
+    uint32_t a_value() const { return a_; }
+    uint32_t b_value() const { return b_; }
+    uint32_t c_value() const { return c_; }
+
+    void SetA(uint32_t _a) { a_ = _a; }
+    void SetB(uint32_t _b) { b_ = _b; }
+    void SetC(uint32_t _c) { c_ = _c; }
+
+    std::string ToString() const {
+      std::string ret("(");
+      ret.append(std::to_string(a_));
+      ret.append(",");
+      ret.append(std::to_string(b_));
+      ret.append(",");
+      ret.append(std::to_string(c_));
+      ret.append(")");
+      return ret;
+    }
+
+   private:
+    friend class InvariantChecker;
+
+    uint32_t a_{0};
+    uint32_t b_{0};
+    uint32_t c_{0};
+  };
+
+  MultiOpsTxnsStressTest() {}
+
+  ~MultiOpsTxnsStressTest() override {}
+
+  void FinishInitDb(SharedState*) override;
+
+  void ReopenAndPreloadDbIfNeeded(SharedState* shared);
+
+  bool IsStateTracked() const override { return false; }
+
+  Status TestGet(ThreadState* thread, const ReadOptions& read_opts,
+                 const std::vector<int>& rand_column_families,
+                 const std::vector<int64_t>& rand_keys) override;
+
+  std::vector<Status> TestMultiGet(
+      ThreadState* thread, const ReadOptions& read_opts,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) override;
+
+  Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts,
+                        const std::vector<int>& rand_column_families,
+                        const std::vector<int64_t>& rand_keys) override;
+
+  // Given a key K, this creates an iterator which scans to K and then
+  // does a random sequence of Next/Prev operations.
+  Status TestIterate(ThreadState* thread, const ReadOptions& read_opts,
+                     const std::vector<int>& rand_column_families,
+                     const std::vector<int64_t>& rand_keys) override;
+
+  Status TestPut(ThreadState* thread, WriteOptions& write_opts,
+                 const ReadOptions& read_opts, const std::vector<int>& cf_ids,
+                 const std::vector<int64_t>& keys, char (&value)[100]) override;
+
+  Status TestDelete(ThreadState* thread, WriteOptions& write_opts,
+                    const std::vector<int>& rand_column_families,
+                    const std::vector<int64_t>& rand_keys) override;
+
+  Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts,
+                         const std::vector<int>& rand_column_families,
+                         const std::vector<int64_t>& rand_keys) override;
+
+  void TestIngestExternalFile(ThreadState* thread,
+                              const std::vector<int>& rand_column_families,
+                              const std::vector<int64_t>& rand_keys) override;
+
+  void TestCompactRange(ThreadState* thread, int64_t rand_key,
+                        const Slice& start_key,
+                        ColumnFamilyHandle* column_family) override;
+
+  Status TestBackupRestore(ThreadState* thread,
+                           const std::vector<int>& rand_column_families,
+                           const std::vector<int64_t>& rand_keys) override;
+
+  Status TestCheckpoint(ThreadState* thread,
+                        const std::vector<int>& rand_column_families,
+                        const std::vector<int64_t>& rand_keys) override;
+
+#ifndef ROCKSDB_LITE
+  Status TestApproximateSize(ThreadState* thread, uint64_t iteration,
+                             const std::vector<int>& rand_column_families,
+                             const std::vector<int64_t>& rand_keys) override;
+#endif  // !ROCKSDB_LITE
+
+  Status TestCustomOperations(
+      ThreadState* thread,
+      const std::vector<int>& rand_column_families) override;
+
+  void RegisterAdditionalListeners() override;
+
+#ifndef ROCKSDB_LITE
+  void PrepareTxnDbOptions(SharedState* /*shared*/,
+                           TransactionDBOptions& txn_db_opts) override;
+#endif  // !ROCKSDB_LITE
+
+  Status PrimaryKeyUpdateTxn(ThreadState* thread, uint32_t old_a,
+                             uint32_t old_a_pos, uint32_t new_a);
+
+  Status SecondaryKeyUpdateTxn(ThreadState* thread, uint32_t old_c,
+                               uint32_t old_c_pos, uint32_t new_c);
+
+  Status UpdatePrimaryIndexValueTxn(ThreadState* thread, uint32_t a,
+                                    uint32_t b_delta);
+
+  Status PointLookupTxn(ThreadState* thread, ReadOptions ropts, uint32_t a);
+
+  Status RangeScanTxn(ThreadState* thread, ReadOptions ropts, uint32_t c);
+
+  void VerifyDb(ThreadState* thread) const override;
+
+  void ContinuouslyVerifyDb(ThreadState* thread) const override {
+    VerifyDb(thread);
+  }
+
+  void VerifyPkSkFast(int job_id);
+
+ protected:
+  class Counter {
+   public:
+    uint64_t Next() { return value_.fetch_add(1); }
+
+   private:
+    std::atomic<uint64_t> value_ = Env::Default()->NowNanos();
+  };
+
+  using KeySet = std::set<uint32_t>;
+  class KeyGenerator {
+   public:
+    explicit KeyGenerator(uint32_t s, uint32_t low, uint32_t high,
+                          KeySet&& existing_uniq, KeySet&& non_existing_uniq)
+        : rand_(s),
+          low_(low),
+          high_(high),
+          existing_uniq_(std::move(existing_uniq)),
+          non_existing_uniq_(std::move(non_existing_uniq)) {}
+    ~KeyGenerator() {
+      assert(!existing_uniq_.empty());
+      assert(!non_existing_uniq_.empty());
+    }
+    void FinishInit();
+
+    std::pair<uint32_t, uint32_t> ChooseExisting();
+    void Replace(uint32_t old_val, uint32_t old_pos, uint32_t new_val);
+    uint32_t Allocate();
+    void UndoAllocation(uint32_t new_val);
+
+    std::string ToString() const {
+      std::ostringstream oss;
+      oss << "[" << low_ << ", " << high_ << "): " << existing_.size()
+          << " elements, " << existing_uniq_.size() << " unique values, "
+          << non_existing_uniq_.size() << " unique non-existing values";
+      return oss.str();
+    }
+
+   private:
+    Random rand_;
+    uint32_t low_ = 0;
+    uint32_t high_ = 0;
+    std::vector<uint32_t> existing_{};
+    KeySet existing_uniq_{};
+    KeySet non_existing_uniq_{};
+    bool initialized_ = false;
+  };
+
+  // Return <a, pos>
+  std::pair<uint32_t, uint32_t> ChooseExistingA(ThreadState* thread);
+
+  uint32_t GenerateNextA(ThreadState* thread);
+
+  // Return <c, pos>
+  std::pair<uint32_t, uint32_t> ChooseExistingC(ThreadState* thread);
+
+  uint32_t GenerateNextC(ThreadState* thread);
+
+#ifndef ROCKSDB_LITE
+  // Randomly commit or rollback `txn`
+  void ProcessRecoveredPreparedTxnsHelper(Transaction* txn,
+                                          SharedState*) override;
+
+  // Some applications, e.g. MyRocks writes a KV pair to the database via
+  // commit-time-write-batch (ctwb) in additional to the transaction's regular
+  // write batch. The key is usually constant representing some system
+  // metadata, while the value is monoticailly increasing which represents the
+  // actual value of the metadata. Method WriteToCommitTimeWriteBatch()
+  // emulates this scenario.
+  Status WriteToCommitTimeWriteBatch(Transaction& txn);
+
+  Status CommitAndCreateTimestampedSnapshotIfNeeded(ThreadState* thread,
+                                                    Transaction& txn);
+
+  void SetupSnapshot(ThreadState* thread, ReadOptions& read_opts,
+                     Transaction& txn,
+                     std::shared_ptr<const Snapshot>& snapshot);
+#endif  //! ROCKSDB_LITE
+
+  std::vector<std::unique_ptr<KeyGenerator>> key_gen_for_a_;
+  std::vector<std::unique_ptr<KeyGenerator>> key_gen_for_c_;
+
+  Counter counter_{};
+
+ private:
+  struct KeySpaces {
+    uint32_t lb_a = 0;
+    uint32_t ub_a = 0;
+    uint32_t lb_c = 0;
+    uint32_t ub_c = 0;
+
+    explicit KeySpaces() = default;
+    explicit KeySpaces(uint32_t _lb_a, uint32_t _ub_a, uint32_t _lb_c,
+                       uint32_t _ub_c)
+        : lb_a(_lb_a), ub_a(_ub_a), lb_c(_lb_c), ub_c(_ub_c) {}
+
+    std::string EncodeTo() const;
+    bool DecodeFrom(Slice data);
+  };
+
+  void PersistKeySpacesDesc(const std::string& key_spaces_path, uint32_t lb_a,
+                            uint32_t ub_a, uint32_t lb_c, uint32_t ub_c);
+
+  KeySpaces ReadKeySpacesDesc(const std::string& key_spaces_path);
+
+  void PreloadDb(SharedState* shared, int threads, uint32_t lb_a, uint32_t ub_a,
+                 uint32_t lb_c, uint32_t ub_c);
+
+  void ScanExistingDb(SharedState* shared, int threads);
+};
+
+class InvariantChecker {
+ public:
+  static_assert(sizeof(MultiOpsTxnsStressTest::Record().a_) == sizeof(uint32_t),
+                "MultiOpsTxnsStressTest::Record::a_ must be 4 bytes");
+  static_assert(sizeof(MultiOpsTxnsStressTest::Record().b_) == sizeof(uint32_t),
+                "MultiOpsTxnsStressTest::Record::b_ must be 4 bytes");
+  static_assert(sizeof(MultiOpsTxnsStressTest::Record().c_) == sizeof(uint32_t),
+                "MultiOpsTxnsStressTest::Record::c_ must be 4 bytes");
+};
+
+class MultiOpsTxnsStressListener : public EventListener {
+ public:
+  explicit MultiOpsTxnsStressListener(MultiOpsTxnsStressTest* stress_test)
+      : stress_test_(stress_test) {
+    assert(stress_test_);
+  }
+
+#ifndef ROCKSDB_LITE
+  ~MultiOpsTxnsStressListener() override {}
+
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    assert(db);
+#ifdef NDEBUG
+    (void)db;
+#endif
+    assert(info.cf_id == 0);
+    stress_test_->VerifyPkSkFast(info.job_id);
+  }
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override {
+    assert(db);
+#ifdef NDEBUG
+    (void)db;
+#endif
+    assert(info.cf_id == 0);
+    stress_test_->VerifyPkSkFast(info.job_id);
+  }
+#endif  //! ROCKSDB_LITE
+
+ private:
+  MultiOpsTxnsStressTest* const stress_test_ = nullptr;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/db_stress_tool/no_batched_ops_stress.cc b/src/rocksdb/db_stress_tool/no_batched_ops_stress.cc
new file mode 100644
index 000000000..bf01b788f
--- /dev/null
+++ b/src/rocksdb/db_stress_tool/no_batched_ops_stress.cc
@@ -0,0 +1,1505 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#include "db_stress_tool/db_stress_common.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+class NonBatchedOpsStressTest : public StressTest {
+ public:
+  NonBatchedOpsStressTest() {}
+
+  virtual ~NonBatchedOpsStressTest() {}
+
+  void VerifyDb(ThreadState* thread) const override {
+    // This `ReadOptions` is for validation purposes. Ignore
+    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+    ReadOptions options(FLAGS_verify_checksum, true);
+    std::string ts_str;
+    Slice ts;
+    if (FLAGS_user_timestamp_size > 0) {
+      ts_str = GetNowNanos();
+      ts = ts_str;
+      options.timestamp = &ts;
+    }
+
+    auto shared = thread->shared;
+    const int64_t max_key = shared->GetMaxKey();
+    const int64_t keys_per_thread = max_key / shared->GetNumThreads();
+    int64_t start = keys_per_thread * thread->tid;
+    int64_t end = start + keys_per_thread;
+    uint64_t prefix_to_use =
+        (FLAGS_prefix_size < 0) ? 1 : static_cast<size_t>(FLAGS_prefix_size);
+
+    if (thread->tid == shared->GetNumThreads() - 1) {
+      end = max_key;
+    }
+
+    for (size_t cf = 0; cf < column_families_.size(); ++cf) {
+      if (thread->shared->HasVerificationFailedYet()) {
+        break;
+      }
+
+      enum class VerificationMethod {
+        kIterator,
+        kGet,
+        kMultiGet,
+        kGetMergeOperands,
+        // Add any new items above kNumberOfMethods
+        kNumberOfMethods
+      };
+
+      constexpr int num_methods =
+          static_cast<int>(VerificationMethod::kNumberOfMethods);
+
+      const VerificationMethod method =
+          static_cast<VerificationMethod>(thread->rand.Uniform(
+              (FLAGS_user_timestamp_size > 0) ? num_methods - 1 : num_methods));
+
+      if (method == VerificationMethod::kIterator) {
+        std::unique_ptr<Iterator> iter(
+            db_->NewIterator(options, column_families_[cf]));
+
+        std::string seek_key = Key(start);
+        iter->Seek(seek_key);
+
+        Slice prefix(seek_key.data(), prefix_to_use);
+
+        for (int64_t i = start; i < end; ++i) {
+          if (thread->shared->HasVerificationFailedYet()) {
+            break;
+          }
+
+          const std::string key = Key(i);
+          const Slice k(key);
+          const Slice pfx(key.data(), prefix_to_use);
+
+          // Reseek when the prefix changes
+          if (prefix_to_use > 0 && prefix.compare(pfx) != 0) {
+            iter->Seek(k);
+            seek_key = key;
+            prefix = Slice(seek_key.data(), prefix_to_use);
+          }
+
+          Status s = iter->status();
+
+          std::string from_db;
+
+          if (iter->Valid()) {
+            const int diff = iter->key().compare(k);
+
+            if (diff > 0) {
+              s = Status::NotFound();
+            } else if (diff == 0) {
+              const WideColumns expected_columns = GenerateExpectedWideColumns(
+                  GetValueBase(iter->value()), iter->value());
+              if (iter->columns() != expected_columns) {
+                VerificationAbort(shared, static_cast<int>(cf), i,
+                                  iter->value(), iter->columns(),
+                                  expected_columns);
+                break;
+              }
+
+              from_db = iter->value().ToString();
+              iter->Next();
+            } else {
+              assert(diff < 0);
+
+              VerificationAbort(shared, "An out of range key was found",
+                                static_cast<int>(cf), i);
+            }
+          } else {
+            // The iterator found no value for the key in question, so do not
+            // move to the next item in the iterator
+            s = Status::NotFound();
+          }
+
+          VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
+                            s, /* strict */ true);
+
+          if (!from_db.empty()) {
+            PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
+                          from_db.data(), from_db.size());
+          }
+        }
+      } else if (method == VerificationMethod::kGet) {
+        for (int64_t i = start; i < end; ++i) {
+          if (thread->shared->HasVerificationFailedYet()) {
+            break;
+          }
+
+          const std::string key = Key(i);
+          std::string from_db;
+
+          Status s = db_->Get(options, column_families_[cf], key, &from_db);
+
+          VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
+                            s, /* strict */ true);
+
+          if (!from_db.empty()) {
+            PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
+                          from_db.data(), from_db.size());
+          }
+        }
+      } else if (method == VerificationMethod::kMultiGet) {
+        for (int64_t i = start; i < end;) {
+          if (thread->shared->HasVerificationFailedYet()) {
+            break;
+          }
+
+          // Keep the batch size to some reasonable value
+          size_t batch_size = thread->rand.Uniform(128) + 1;
+          batch_size = std::min<size_t>(batch_size, end - i);
+
+          std::vector<std::string> keystrs(batch_size);
+          std::vector<Slice> keys(batch_size);
+          std::vector<PinnableSlice> values(batch_size);
+          std::vector<Status> statuses(batch_size);
+
+          for (size_t j = 0; j < batch_size; ++j) {
+            keystrs[j] = Key(i + j);
+            keys[j] = Slice(keystrs[j].data(), keystrs[j].size());
+          }
+
+          db_->MultiGet(options, column_families_[cf], batch_size, keys.data(),
+                        values.data(), statuses.data());
+
+          for (size_t j = 0; j < batch_size; ++j) {
+            const std::string from_db = values[j].ToString();
+
+            VerifyOrSyncValue(static_cast<int>(cf), i + j, options, shared,
+                              from_db, statuses[j], /* strict */ true);
+
+            if (!from_db.empty()) {
+              PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i + j),
+                            from_db.data(), from_db.size());
+            }
+          }
+
+          i += batch_size;
+        }
+      } else {
+        assert(method == VerificationMethod::kGetMergeOperands);
+
+        // Start off with small size that will be increased later if necessary
+        std::vector<PinnableSlice> values(4);
+
+        GetMergeOperandsOptions merge_operands_info;
+        merge_operands_info.expected_max_number_of_operands =
+            static_cast<int>(values.size());
+
+        for (int64_t i = start; i < end; ++i) {
+          if (thread->shared->HasVerificationFailedYet()) {
+            break;
+          }
+
+          const std::string key = Key(i);
+          const Slice k(key);
+          std::string from_db;
+          int number_of_operands = 0;
+
+          Status s = db_->GetMergeOperands(options, column_families_[cf], k,
+                                           values.data(), &merge_operands_info,
+                                           &number_of_operands);
+
+          if (s.IsIncomplete()) {
+            // Need to resize values as there are more than values.size() merge
+            // operands on this key. Should only happen a few times when we
+            // encounter a key that had more merge operands than any key seen so
+            // far
+            values.resize(number_of_operands);
+            merge_operands_info.expected_max_number_of_operands =
+                static_cast<int>(number_of_operands);
+            s = db_->GetMergeOperands(options, column_families_[cf], k,
+                                      values.data(), &merge_operands_info,
+                                      &number_of_operands);
+          }
+          // Assumed here that GetMergeOperands always sets number_of_operand
+          if (number_of_operands) {
+            from_db = values[number_of_operands - 1].ToString();
+          }
+
+          VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
+                            s, /* strict */ true);
+
+          if (!from_db.empty()) {
+            PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
+                          from_db.data(), from_db.size());
+          }
+        }
+      }
+    }
+  }
+
+#ifndef ROCKSDB_LITE
+  void ContinuouslyVerifyDb(ThreadState* thread) const override {
+    if (!cmp_db_) {
+      return;
+    }
+    assert(cmp_db_);
+    assert(!cmp_cfhs_.empty());
+    Status s = cmp_db_->TryCatchUpWithPrimary();
+    if (!s.ok()) {
+      assert(false);
+      exit(1);
+    }
+
+    const auto checksum_column_family = [](Iterator* iter,
+                                           uint32_t* checksum) -> Status {
+      assert(nullptr != checksum);
+      uint32_t ret = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        ret = crc32c::Extend(ret, iter->key().data(), iter->key().size());
+        ret = crc32c::Extend(ret, iter->value().data(), iter->value().size());
+      }
+      *checksum = ret;
+      return iter->status();
+    };
+
+    auto* shared = thread->shared;
+    assert(shared);
+    const int64_t max_key = shared->GetMaxKey();
+    ReadOptions read_opts(FLAGS_verify_checksum, true);
+    std::string ts_str;
+    Slice ts;
+    if (FLAGS_user_timestamp_size > 0) {
+      ts_str = GetNowNanos();
+      ts = ts_str;
+      read_opts.timestamp = &ts;
+    }
+
+    static Random64 rand64(shared->GetSeed());
+
+    {
+      uint32_t crc = 0;
+      std::unique_ptr<Iterator> it(cmp_db_->NewIterator(read_opts));
+      s = checksum_column_family(it.get(), &crc);
+      if (!s.ok()) {
+        fprintf(stderr, "Computing checksum of default cf: %s\n",
+                s.ToString().c_str());
+        assert(false);
+      }
+    }
+
+    for (auto* handle : cmp_cfhs_) {
+      if (thread->rand.OneInOpt(3)) {
+        // Use Get()
+        uint64_t key = rand64.Uniform(static_cast<uint64_t>(max_key));
+        std::string key_str = Key(key);
+        std::string value;
+        std::string key_ts;
+        s = cmp_db_->Get(read_opts, handle, key_str, &value,
+                         FLAGS_user_timestamp_size > 0 ? &key_ts : nullptr);
+        s.PermitUncheckedError();
+      } else {
+        // Use range scan
+        std::unique_ptr<Iterator> iter(cmp_db_->NewIterator(read_opts, handle));
+        uint32_t rnd = (thread->rand.Next()) % 4;
+        if (0 == rnd) {
+          // SeekToFirst() + Next()*5
+          read_opts.total_order_seek = true;
+          iter->SeekToFirst();
+          for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Next()) {
+          }
+        } else if (1 == rnd) {
+          // SeekToLast() + Prev()*5
+          read_opts.total_order_seek = true;
+          iter->SeekToLast();
+          for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Prev()) {
+          }
+        } else if (2 == rnd) {
+          // Seek() +Next()*5
+          uint64_t key = rand64.Uniform(static_cast<uint64_t>(max_key));
+          std::string key_str = Key(key);
+          iter->Seek(key_str);
+          for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Next()) {
+          }
+        } else {
+          // SeekForPrev() + Prev()*5
+          uint64_t key = rand64.Uniform(static_cast<uint64_t>(max_key));
+          std::string key_str = Key(key);
+          iter->SeekForPrev(key_str);
+          for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Prev()) {
+          }
+        }
+      }
+    }
+  }
+#else
+  void ContinuouslyVerifyDb(ThreadState* /*thread*/) const override {}
+#endif  // ROCKSDB_LITE
+
+  void MaybeClearOneColumnFamily(ThreadState* thread) override {
+    if (FLAGS_column_families > 1) {
+      if (thread->rand.OneInOpt(FLAGS_clear_column_family_one_in)) {
+        // drop column family and then create it again (can't drop default)
+        int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
+        std::string new_name =
+            std::to_string(new_column_family_name_.fetch_add(1));
+        {
+          MutexLock l(thread->shared->GetMutex());
+          fprintf(
+              stdout,
+              "[CF %d] Dropping and recreating column family. new name: %s\n",
+              cf, new_name.c_str());
+        }
+        thread->shared->LockColumnFamily(cf);
+        Status s = db_->DropColumnFamily(column_families_[cf]);
+        delete column_families_[cf];
+        if (!s.ok()) {
+          fprintf(stderr, "dropping column family error: %s\n",
+                  s.ToString().c_str());
+          std::terminate();
+        }
+        s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
+                                    &column_families_[cf]);
+        column_family_names_[cf] = new_name;
+        thread->shared->ClearColumnFamily(cf);
+        if (!s.ok()) {
+          fprintf(stderr, "creating column family error: %s\n",
+                  s.ToString().c_str());
+          std::terminate();
+        }
+        thread->shared->UnlockColumnFamily(cf);
+      }
+    }
+  }
+
+  bool ShouldAcquireMutexOnKey() const override { return true; }
+
+  bool IsStateTracked() const override { return true; }
+
+  Status TestGet(ThreadState* thread, const ReadOptions& read_opts,
+                 const std::vector<int>& rand_column_families,
+                 const std::vector<int64_t>& rand_keys) override {
+    auto cfh = column_families_[rand_column_families[0]];
+    std::string key_str = Key(rand_keys[0]);
+    Slice key = key_str;
+    std::string from_db;
+    int error_count = 0;
+
+    if (fault_fs_guard) {
+      fault_fs_guard->EnableErrorInjection();
+      SharedState::ignore_read_error = false;
+    }
+
+    std::unique_ptr<MutexLock> lock(new MutexLock(
+        thread->shared->GetMutexForKey(rand_column_families[0], rand_keys[0])));
+
+    ReadOptions read_opts_copy = read_opts;
+    std::string read_ts_str;
+    Slice read_ts_slice;
+    bool read_older_ts = MaybeUseOlderTimestampForPointLookup(
+        thread, read_ts_str, read_ts_slice, read_opts_copy);
+
+    Status s = db_->Get(read_opts_copy, cfh, key, &from_db);
+    if (fault_fs_guard) {
+      error_count = fault_fs_guard->GetAndResetErrorCount();
+    }
+    if (s.ok()) {
+      if (fault_fs_guard) {
+        if (error_count && !SharedState::ignore_read_error) {
+          // Grab mutex so multiple thread don't try to print the
+          // stack trace at the same time
+          MutexLock l(thread->shared->GetMutex());
+          fprintf(stderr, "Didn't get expected error from Get\n");
+          fprintf(stderr, "Callstack that injected the fault\n");
+          fault_fs_guard->PrintFaultBacktrace();
+          std::terminate();
+        }
+      }
+      // found case
+      thread->stats.AddGets(1, 1);
+      // we only have the latest expected state
+      if (!FLAGS_skip_verifydb && !read_opts_copy.timestamp &&
+          thread->shared->Get(rand_column_families[0], rand_keys[0]) ==
+              SharedState::DELETION_SENTINEL) {
+        thread->shared->SetVerificationFailure();
+        fprintf(stderr,
+                "error : inconsistent values for key %s: Get returns %s, "
+                "expected state does not have the key.\n",
+                key.ToString(true).c_str(), StringToHex(from_db).c_str());
+      }
+    } else if (s.IsNotFound()) {
+      // not found case
+      thread->stats.AddGets(1, 0);
+      if (!FLAGS_skip_verifydb && !read_older_ts) {
+        auto expected =
+            thread->shared->Get(rand_column_families[0], rand_keys[0]);
+        if (expected != SharedState::DELETION_SENTINEL &&
+            expected != SharedState::UNKNOWN_SENTINEL) {
+          thread->shared->SetVerificationFailure();
+          fprintf(stderr,
+                  "error : inconsistent values for key %s: expected state has "
+                  "the key, Get() returns NotFound.\n",
+                  key.ToString(true).c_str());
+        }
+      }
+    } else {
+      if (error_count == 0) {
+        // errors case
+        thread->stats.AddErrors(1);
+      } else {
+        thread->stats.AddVerifiedErrors(1);
+      }
+    }
+    if (fault_fs_guard) {
+      fault_fs_guard->DisableErrorInjection();
+    }
+    return s;
+  }
+
+  std::vector<Status> TestMultiGet(
+      ThreadState* thread, const ReadOptions& read_opts,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) override {
+    size_t num_keys = rand_keys.size();
+    std::vector<std::string> key_str;
+    std::vector<Slice> keys;
+    key_str.reserve(num_keys);
+    keys.reserve(num_keys);
+    std::vector<PinnableSlice> values(num_keys);
+    std::vector<Status> statuses(num_keys);
+    ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]];
+    int error_count = 0;
+    // Do a consistency check between Get and MultiGet. Don't do it too
+    // often as it will slow db_stress down
+    bool do_consistency_check = thread->rand.OneIn(4);
+
+    ReadOptions readoptionscopy = read_opts;
+    if (do_consistency_check) {
+      readoptionscopy.snapshot = db_->GetSnapshot();
+    }
+
+    std::string read_ts_str;
+    Slice read_ts_slice;
+    MaybeUseOlderTimestampForPointLookup(thread, read_ts_str, read_ts_slice,
+                                         readoptionscopy);
+
+    readoptionscopy.rate_limiter_priority =
+        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+
+    // To appease clang analyzer
+    const bool use_txn = FLAGS_use_txn;
+
+    // Create a transaction in order to write some data. The purpose is to
+    // exercise WriteBatchWithIndex::MultiGetFromBatchAndDB. The transaction
+    // will be rolled back once MultiGet returns.
+#ifndef ROCKSDB_LITE
+    Transaction* txn = nullptr;
+    if (use_txn) {
+      WriteOptions wo;
+      if (FLAGS_rate_limit_auto_wal_flush) {
+        wo.rate_limiter_priority = Env::IO_USER;
+      }
+      Status s = NewTxn(wo, &txn);
+      if (!s.ok()) {
+        fprintf(stderr, "NewTxn: %s\n", s.ToString().c_str());
+        std::terminate();
+      }
+    }
+#endif
+    for (size_t i = 0; i < num_keys; ++i) {
+      key_str.emplace_back(Key(rand_keys[i]));
+      keys.emplace_back(key_str.back());
+#ifndef ROCKSDB_LITE
+      if (use_txn) {
+        // With a 1 in 10 probability, insert the just added key in the batch
+        // into the transaction. This will create an overlap with the MultiGet
+        // keys and exercise some corner cases in the code
+        if (thread->rand.OneIn(10)) {
+          int op = thread->rand.Uniform(2);
+          Status s;
+          switch (op) {
+            case 0:
+            case 1: {
+              uint32_t value_base =
+                  thread->rand.Next() % thread->shared->UNKNOWN_SENTINEL;
+              char value[100];
+              size_t sz = GenerateValue(value_base, value, sizeof(value));
+              Slice v(value, sz);
+              if (op == 0) {
+                s = txn->Put(cfh, keys.back(), v);
+              } else {
+                s = txn->Merge(cfh, keys.back(), v);
+              }
+              break;
+            }
+            case 2:
+              s = txn->Delete(cfh, keys.back());
+              break;
+            default:
+              assert(false);
+          }
+          if (!s.ok()) {
+            fprintf(stderr, "Transaction put: %s\n", s.ToString().c_str());
+            std::terminate();
+          }
+        }
+      }
+#endif
+    }
+
+    if (!use_txn) {
+      if (fault_fs_guard) {
+        fault_fs_guard->EnableErrorInjection();
+        SharedState::ignore_read_error = false;
+      }
+      db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
+                    statuses.data());
+      if (fault_fs_guard) {
+        error_count = fault_fs_guard->GetAndResetErrorCount();
+      }
+    } else {
+#ifndef ROCKSDB_LITE
+      txn->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
+                    statuses.data());
+#endif
+    }
+
+    if (fault_fs_guard && error_count && !SharedState::ignore_read_error) {
+      int stat_nok = 0;
+      for (const auto& s : statuses) {
+        if (!s.ok() && !s.IsNotFound()) {
+          stat_nok++;
+        }
+      }
+
+      if (stat_nok < error_count) {
+        // Grab mutex so multiple thread don't try to print the
+        // stack trace at the same time
+        MutexLock l(thread->shared->GetMutex());
+        fprintf(stderr, "Didn't get expected error from MultiGet. \n");
+        fprintf(stderr, "num_keys %zu Expected %d errors, seen %d\n", num_keys,
+                error_count, stat_nok);
+        fprintf(stderr, "Callstack that injected the fault\n");
+        fault_fs_guard->PrintFaultBacktrace();
+        std::terminate();
+      }
+    }
+    if (fault_fs_guard) {
+      fault_fs_guard->DisableErrorInjection();
+    }
+
+    for (size_t i = 0; i < statuses.size(); ++i) {
+      Status s = statuses[i];
+      bool is_consistent = true;
+      // Only do the consistency check if no error was injected and MultiGet
+      // didn't return an unexpected error
+      if (do_consistency_check && !error_count && (s.ok() || s.IsNotFound())) {
+        Status tmp_s;
+        std::string value;
+
+        if (use_txn) {
+#ifndef ROCKSDB_LITE
+          tmp_s = txn->Get(readoptionscopy, cfh, keys[i], &value);
+#endif  // ROCKSDB_LITE
+        } else {
+          tmp_s = db_->Get(readoptionscopy, cfh, keys[i], &value);
+        }
+        if (!tmp_s.ok() && !tmp_s.IsNotFound()) {
+          fprintf(stderr, "Get error: %s\n", s.ToString().c_str());
+          is_consistent = false;
+        } else if (!s.ok() && tmp_s.ok()) {
+          fprintf(stderr, "MultiGet returned different results with key %s\n",
+                  keys[i].ToString(true).c_str());
+          fprintf(stderr, "Get returned ok, MultiGet returned not found\n");
+          is_consistent = false;
+        } else if (s.ok() && tmp_s.IsNotFound()) {
+          fprintf(stderr, "MultiGet returned different results with key %s\n",
+                  keys[i].ToString(true).c_str());
+          fprintf(stderr, "MultiGet returned ok, Get returned not found\n");
+          is_consistent = false;
+        } else if (s.ok() && value != values[i].ToString()) {
+          fprintf(stderr, "MultiGet returned different results with key %s\n",
+                  keys[i].ToString(true).c_str());
+          fprintf(stderr, "MultiGet returned value %s\n",
+                  values[i].ToString(true).c_str());
+          fprintf(stderr, "Get returned value %s\n",
+                  Slice(value).ToString(true /* hex */).c_str());
+          is_consistent = false;
+        }
+      }
+
+      if (!is_consistent) {
+        fprintf(stderr, "TestMultiGet error: is_consistent is false\n");
+        thread->stats.AddErrors(1);
+        // Fail fast to preserve the DB state
+        thread->shared->SetVerificationFailure();
+        break;
+      } else if (s.ok()) {
+        // found case
+        thread->stats.AddGets(1, 1);
+      } else if (s.IsNotFound()) {
+        // not found case
+        thread->stats.AddGets(1, 0);
+      } else if (s.IsMergeInProgress() && use_txn) {
+        // With txn this is sometimes expected.
+        thread->stats.AddGets(1, 1);
+      } else {
+        if (error_count == 0) {
+          // errors case
+          fprintf(stderr, "MultiGet error: %s\n", s.ToString().c_str());
+          thread->stats.AddErrors(1);
+        } else {
+          thread->stats.AddVerifiedErrors(1);
+        }
+      }
+    }
+
+    if (readoptionscopy.snapshot) {
+      db_->ReleaseSnapshot(readoptionscopy.snapshot);
+    }
+    if (use_txn) {
+#ifndef ROCKSDB_LITE
+      RollbackTxn(txn);
+#endif
+    }
+    return statuses;
+  }
+
+  Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts,
+                        const std::vector<int>& rand_column_families,
+                        const std::vector<int64_t>& rand_keys) override {
+    assert(!rand_column_families.empty());
+    assert(!rand_keys.empty());
+
+    ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]];
+    assert(cfh);
+
+    const std::string key = Key(rand_keys[0]);
+    const Slice prefix(key.data(), FLAGS_prefix_size);
+
+    std::string upper_bound;
+    Slice ub_slice;
+    ReadOptions ro_copy = read_opts;
+
+    // Get the next prefix first and then see if we want to set upper bound.
+    // We'll use the next prefix in an assertion later on
+    if (GetNextPrefix(prefix, &upper_bound) && thread->rand.OneIn(2)) {
+      // For half of the time, set the upper bound to the next prefix
+      ub_slice = Slice(upper_bound);
+      ro_copy.iterate_upper_bound = &ub_slice;
+    }
+
+    std::string read_ts_str;
+    Slice read_ts_slice;
+    MaybeUseOlderTimestampForRangeScan(thread, read_ts_str, read_ts_slice,
+                                       ro_copy);
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro_copy, cfh));
+
+    uint64_t count = 0;
+    Status s;
+
+    for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix);
+         iter->Next()) {
+      ++count;
+
+      // When iter_start_ts is set, iterator exposes internal keys, including
+      // tombstones; however, we want to perform column validation only for
+      // value-like types.
+      if (ro_copy.iter_start_ts) {
+        const ValueType value_type = ExtractValueType(iter->key());
+        if (value_type != kTypeValue && value_type != kTypeBlobIndex &&
+            value_type != kTypeWideColumnEntity) {
+          continue;
+        }
+      }
+
+      const WideColumns expected_columns = GenerateExpectedWideColumns(
+          GetValueBase(iter->value()), iter->value());
+      if (iter->columns() != expected_columns) {
+        s = Status::Corruption(
+            "Value and columns inconsistent",
+            DebugString(iter->value(), iter->columns(), expected_columns));
+        break;
+      }
+    }
+
+    if (ro_copy.iter_start_ts == nullptr) {
+      assert(count <= GetPrefixKeyCount(prefix.ToString(), upper_bound));
+    }
+
+    if (s.ok()) {
+      s = iter->status();
+    }
+
+    if (!s.ok()) {
+      fprintf(stderr, "TestPrefixScan error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+
+      return s;
+    }
+
+    thread->stats.AddPrefixes(1, count);
+
+    return Status::OK();
+  }
+
+  Status TestPut(ThreadState* thread, WriteOptions& write_opts,
+                 const ReadOptions& read_opts,
+                 const std::vector<int>& rand_column_families,
+                 const std::vector<int64_t>& rand_keys,
+                 char (&value)[100]) override {
+    assert(!rand_column_families.empty());
+    assert(!rand_keys.empty());
+
+    auto shared = thread->shared;
+    assert(shared);
+
+    const int64_t max_key = shared->GetMaxKey();
+
+    int64_t rand_key = rand_keys[0];
+    int rand_column_family = rand_column_families[0];
+    std::string write_ts;
+
+    std::unique_ptr<MutexLock> lock(
+        new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
+    while (!shared->AllowsOverwrite(rand_key) &&
+           (FLAGS_use_merge || shared->Exists(rand_column_family, rand_key))) {
+      lock.reset();
+
+      rand_key = thread->rand.Next() % max_key;
+      rand_column_family = thread->rand.Next() % FLAGS_column_families;
+
+      lock.reset(
+          new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
+      if (FLAGS_user_timestamp_size > 0) {
+        write_ts = GetNowNanos();
+      }
+    }
+
+    if (write_ts.empty() && FLAGS_user_timestamp_size) {
+      write_ts = GetNowNanos();
+    }
+
+    const std::string k = Key(rand_key);
+
+    ColumnFamilyHandle* const cfh = column_families_[rand_column_family];
+    assert(cfh);
+
+    if (FLAGS_verify_before_write) {
+      std::string from_db;
+      Status s = db_->Get(read_opts, cfh, k, &from_db);
+      if (!VerifyOrSyncValue(rand_column_family, rand_key, read_opts, shared,
+                             from_db, s, /* strict */ true)) {
+        return s;
+      }
+    }
+
+    const uint32_t value_base = thread->rand.Next() % shared->UNKNOWN_SENTINEL;
+    const size_t sz = GenerateValue(value_base, value, sizeof(value));
+    const Slice v(value, sz);
+
+    shared->Put(rand_column_family, rand_key, value_base, true /* pending */);
+
+    Status s;
+
+    if (FLAGS_use_merge) {
+      if (!FLAGS_use_txn) {
+        if (FLAGS_user_timestamp_size == 0) {
+          s = db_->Merge(write_opts, cfh, k, v);
+        } else {
+          s = db_->Merge(write_opts, cfh, k, write_ts, v);
+        }
+      } else {
+#ifndef ROCKSDB_LITE
+        Transaction* txn;
+        s = NewTxn(write_opts, &txn);
+        if (s.ok()) {
+          s = txn->Merge(cfh, k, v);
+          if (s.ok()) {
+            s = CommitTxn(txn, thread);
+          }
+        }
+#endif
+      }
+    } else if (FLAGS_use_put_entity_one_in > 0 &&
+               (value_base % FLAGS_use_put_entity_one_in) == 0) {
+      s = db_->PutEntity(write_opts, cfh, k,
+                         GenerateWideColumns(value_base, v));
+    } else {
+      if (!FLAGS_use_txn) {
+        if (FLAGS_user_timestamp_size == 0) {
+          s = db_->Put(write_opts, cfh, k, v);
+        } else {
+          s = db_->Put(write_opts, cfh, k, write_ts, v);
+        }
+      } else {
+#ifndef ROCKSDB_LITE
+        Transaction* txn;
+        s = NewTxn(write_opts, &txn);
+        if (s.ok()) {
+          s = txn->Put(cfh, k, v);
+          if (s.ok()) {
+            s = CommitTxn(txn, thread);
+          }
+        }
+#endif
+      }
+    }
+
+    shared->Put(rand_column_family, rand_key, value_base, false /* pending */);
+
+    if (!s.ok()) {
+      if (FLAGS_injest_error_severity >= 2) {
+        if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
+          is_db_stopped_ = true;
+        } else if (!is_db_stopped_ ||
+                   s.severity() < Status::Severity::kFatalError) {
+          fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
+          std::terminate();
+        }
+      } else {
+        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
+        std::terminate();
+      }
+    }
+
+    thread->stats.AddBytesForWrites(1, sz);
+    PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value,
+                  sz);
+    return s;
+  }
+
+  Status TestDelete(ThreadState* thread, WriteOptions& write_opts,
+                    const std::vector<int>& rand_column_families,
+                    const std::vector<int64_t>& rand_keys) override {
+    int64_t rand_key = rand_keys[0];
+    int rand_column_family = rand_column_families[0];
+    auto shared = thread->shared;
+
+    std::unique_ptr<MutexLock> lock(
+        new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
+
+    // OPERATION delete
+    std::string write_ts_str = GetNowNanos();
+    Slice write_ts = write_ts_str;
+
+    std::string key_str = Key(rand_key);
+    Slice key = key_str;
+    auto cfh = column_families_[rand_column_family];
+
+    // Use delete if the key may be overwritten and a single deletion
+    // otherwise.
+    Status s;
+    if (shared->AllowsOverwrite(rand_key)) {
+      shared->Delete(rand_column_family, rand_key, true /* pending */);
+      if (!FLAGS_use_txn) {
+        if (FLAGS_user_timestamp_size == 0) {
+          s = db_->Delete(write_opts, cfh, key);
+        } else {
+          s = db_->Delete(write_opts, cfh, key, write_ts);
+        }
+      } else {
+#ifndef ROCKSDB_LITE
+        Transaction* txn;
+        s = NewTxn(write_opts, &txn);
+        if (s.ok()) {
+          s = txn->Delete(cfh, key);
+          if (s.ok()) {
+            s = CommitTxn(txn, thread);
+          }
+        }
+#endif
+      }
+      shared->Delete(rand_column_family, rand_key, false /* pending */);
+      thread->stats.AddDeletes(1);
+      if (!s.ok()) {
+        if (FLAGS_injest_error_severity >= 2) {
+          if (!is_db_stopped_ &&
+              s.severity() >= Status::Severity::kFatalError) {
+            is_db_stopped_ = true;
+          } else if (!is_db_stopped_ ||
+                     s.severity() < Status::Severity::kFatalError) {
+            fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
+            std::terminate();
+          }
+        } else {
+          fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
+          std::terminate();
+        }
+      }
+    } else {
+      shared->SingleDelete(rand_column_family, rand_key, true /* pending */);
+      if (!FLAGS_use_txn) {
+        if (FLAGS_user_timestamp_size == 0) {
+          s = db_->SingleDelete(write_opts, cfh, key);
+        } else {
+          s = db_->SingleDelete(write_opts, cfh, key, write_ts);
+        }
+      } else {
+#ifndef ROCKSDB_LITE
+        Transaction* txn;
+        s = NewTxn(write_opts, &txn);
+        if (s.ok()) {
+          s = txn->SingleDelete(cfh, key);
+          if (s.ok()) {
+            s = CommitTxn(txn, thread);
+          }
+        }
+#endif
+      }
+      shared->SingleDelete(rand_column_family, rand_key, false /* pending */);
+      thread->stats.AddSingleDeletes(1);
+      if (!s.ok()) {
+        if (FLAGS_injest_error_severity >= 2) {
+          if (!is_db_stopped_ &&
+              s.severity() >= Status::Severity::kFatalError) {
+            is_db_stopped_ = true;
+          } else if (!is_db_stopped_ ||
+                     s.severity() < Status::Severity::kFatalError) {
+            fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
+            std::terminate();
+          }
+        } else {
+          fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
+          std::terminate();
+        }
+      }
+    }
+    return s;
+  }
+
+  Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts,
+                         const std::vector<int>& rand_column_families,
+                         const std::vector<int64_t>& rand_keys) override {
+    // OPERATION delete range
+    std::vector<std::unique_ptr<MutexLock>> range_locks;
+    // delete range does not respect disallowed overwrites. the keys for
+    // which overwrites are disallowed are randomly distributed so it
+    // could be expensive to find a range where each key allows
+    // overwrites.
+    int64_t rand_key = rand_keys[0];
+    int rand_column_family = rand_column_families[0];
+    auto shared = thread->shared;
+    int64_t max_key = shared->GetMaxKey();
+    if (rand_key > max_key - FLAGS_range_deletion_width) {
+      rand_key =
+          thread->rand.Next() % (max_key - FLAGS_range_deletion_width + 1);
+    }
+    for (int j = 0; j < FLAGS_range_deletion_width; ++j) {
+      if (j == 0 ||
+          ((rand_key + j) & ((1 << FLAGS_log2_keys_per_lock) - 1)) == 0) {
+        range_locks.emplace_back(new MutexLock(
+            shared->GetMutexForKey(rand_column_family, rand_key + j)));
+      }
+    }
+    shared->DeleteRange(rand_column_family, rand_key,
+                        rand_key + FLAGS_range_deletion_width,
+                        true /* pending */);
+
+    std::string keystr = Key(rand_key);
+    Slice key = keystr;
+    auto cfh = column_families_[rand_column_family];
+    std::string end_keystr = Key(rand_key + FLAGS_range_deletion_width);
+    Slice end_key = end_keystr;
+    std::string write_ts_str;
+    Slice write_ts;
+    Status s;
+    if (FLAGS_user_timestamp_size) {
+      write_ts_str = GetNowNanos();
+      write_ts = write_ts_str;
+      s = db_->DeleteRange(write_opts, cfh, key, end_key, write_ts);
+    } else {
+      s = db_->DeleteRange(write_opts, cfh, key, end_key);
+    }
+    if (!s.ok()) {
+      if (FLAGS_injest_error_severity >= 2) {
+        if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
+          is_db_stopped_ = true;
+        } else if (!is_db_stopped_ ||
+                   s.severity() < Status::Severity::kFatalError) {
+          fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
+          std::terminate();
+        }
+      } else {
+        fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
+        std::terminate();
+      }
+    }
+    int covered = shared->DeleteRange(rand_column_family, rand_key,
+                                      rand_key + FLAGS_range_deletion_width,
+                                      false /* pending */);
+    thread->stats.AddRangeDeletions(1);
+    thread->stats.AddCoveredByRangeDeletions(covered);
+    return s;
+  }
+
+#ifdef ROCKSDB_LITE
+  void TestIngestExternalFile(
+      ThreadState* /* thread */,
+      const std::vector<int>& /* rand_column_families */,
+      const std::vector<int64_t>& /* rand_keys */) override {
+    assert(false);
+    fprintf(stderr,
+            "RocksDB lite does not support "
+            "TestIngestExternalFile\n");
+    std::terminate();
+  }
+#else
+  void TestIngestExternalFile(ThreadState* thread,
+                              const std::vector<int>& rand_column_families,
+                              const std::vector<int64_t>& rand_keys) override {
+    const std::string sst_filename =
+        FLAGS_db + "/." + std::to_string(thread->tid) + ".sst";
+    Status s;
+    if (db_stress_env->FileExists(sst_filename).ok()) {
+      // Maybe we terminated abnormally before, so cleanup to give this file
+      // ingestion a clean slate
+      s = db_stress_env->DeleteFile(sst_filename);
+    }
+
+    SstFileWriter sst_file_writer(EnvOptions(options_), options_);
+    if (s.ok()) {
+      s = sst_file_writer.Open(sst_filename);
+    }
+    int64_t key_base = rand_keys[0];
+    int column_family = rand_column_families[0];
+    std::vector<std::unique_ptr<MutexLock>> range_locks;
+    range_locks.reserve(FLAGS_ingest_external_file_width);
+    std::vector<int64_t> keys;
+    keys.reserve(FLAGS_ingest_external_file_width);
+    std::vector<uint32_t> values;
+    values.reserve(FLAGS_ingest_external_file_width);
+    SharedState* shared = thread->shared;
+
+    assert(FLAGS_nooverwritepercent < 100);
+    // Grab locks, set pending state on expected values, and add keys
+    for (int64_t key = key_base;
+         s.ok() && key < shared->GetMaxKey() &&
+         static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
+         ++key) {
+      if (key == key_base ||
+          (key & ((1 << FLAGS_log2_keys_per_lock) - 1)) == 0) {
+        range_locks.emplace_back(
+            new MutexLock(shared->GetMutexForKey(column_family, key)));
+      }
+      if (!shared->AllowsOverwrite(key)) {
+        // We could alternatively include `key` on the condition its current
+        // value is `DELETION_SENTINEL`.
+        continue;
+      }
+      keys.push_back(key);
+
+      uint32_t value_base = thread->rand.Next() % shared->UNKNOWN_SENTINEL;
+      values.push_back(value_base);
+      shared->Put(column_family, key, value_base, true /* pending */);
+
+      char value[100];
+      size_t value_len = GenerateValue(value_base, value, sizeof(value));
+      auto key_str = Key(key);
+      s = sst_file_writer.Put(Slice(key_str), Slice(value, value_len));
+    }
+
+    if (s.ok() && keys.empty()) {
+      return;
+    }
+
+    if (s.ok()) {
+      s = sst_file_writer.Finish();
+    }
+    if (s.ok()) {
+      s = db_->IngestExternalFile(column_families_[column_family],
+                                  {sst_filename}, IngestExternalFileOptions());
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "file ingestion error: %s\n", s.ToString().c_str());
+      std::terminate();
+    }
+    for (size_t i = 0; i < keys.size(); ++i) {
+      shared->Put(column_family, keys[i], values[i], false /* pending */);
+    }
+  }
+#endif  // ROCKSDB_LITE
+
+  // Given a key K, this creates an iterator which scans the range
+  // [K, K + FLAGS_num_iterations) forward and backward.
+  // Then does a random sequence of Next/Prev operations.
+  Status TestIterateAgainstExpected(
+      ThreadState* thread, const ReadOptions& read_opts,
+      const std::vector<int>& rand_column_families,
+      const std::vector<int64_t>& rand_keys) override {
+    assert(thread);
+    assert(!rand_column_families.empty());
+    assert(!rand_keys.empty());
+
+    auto shared = thread->shared;
+    assert(shared);
+
+    int64_t max_key = shared->GetMaxKey();
+
+    const int64_t num_iter = static_cast<int64_t>(FLAGS_num_iterations);
+
+    int64_t lb = rand_keys[0];
+    if (lb > max_key - num_iter) {
+      lb = thread->rand.Next() % (max_key - num_iter + 1);
+    }
+
+    const int64_t ub = lb + num_iter;
+
+    // Lock the whole range over which we might iterate to ensure it doesn't
+    // change under us.
+    const int rand_column_family = rand_column_families[0];
+    std::vector<std::unique_ptr<MutexLock>> range_locks =
+        shared->GetLocksForKeyRange(rand_column_family, lb, ub);
+
+    ReadOptions ro(read_opts);
+    ro.total_order_seek = true;
+
+    std::string read_ts_str;
+    Slice read_ts;
+    if (FLAGS_user_timestamp_size > 0) {
+      read_ts_str = GetNowNanos();
+      read_ts = read_ts_str;
+      ro.timestamp = &read_ts;
+    }
+
+    std::string max_key_str;
+    Slice max_key_slice;
+    if (!FLAGS_destroy_db_initially) {
+      max_key_str = Key(max_key);
+      max_key_slice = max_key_str;
+      // to restrict iterator from reading keys written in batched_op_stress
+      // that do not have expected state updated and may not be parseable by
+      // GetIntVal().
+      ro.iterate_upper_bound = &max_key_slice;
+    }
+
+    ColumnFamilyHandle* const cfh = column_families_[rand_column_family];
+    assert(cfh);
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro, cfh));
+
+    std::string op_logs;
+
+    auto check_columns = [&]() {
+      assert(iter);
+      assert(iter->Valid());
+
+      const WideColumns expected_columns = GenerateExpectedWideColumns(
+          GetValueBase(iter->value()), iter->value());
+      if (iter->columns() != expected_columns) {
+        shared->SetVerificationFailure();
+
+        fprintf(stderr,
+                "Verification failed for key %s: "
+                "Value and columns inconsistent: %s\n",
+                Slice(iter->key()).ToString(/* hex */ true).c_str(),
+                DebugString(iter->value(), iter->columns(), expected_columns)
+                    .c_str());
+        fprintf(stderr, "Column family: %s, op_logs: %s\n",
+                cfh->GetName().c_str(), op_logs.c_str());
+
+        thread->stats.AddErrors(1);
+
+        return false;
+      }
+
+      return true;
+    };
+
+    auto check_no_key_in_range = [&](int64_t start, int64_t end) {
+      for (auto j = std::max(start, lb); j < std::min(end, ub); ++j) {
+        auto expected_value =
+            shared->Get(rand_column_family, static_cast<int64_t>(j));
+        if (expected_value != shared->DELETION_SENTINEL &&
+            expected_value != shared->UNKNOWN_SENTINEL) {
+          // Fail fast to preserve the DB state.
+          thread->shared->SetVerificationFailure();
+          if (iter->Valid()) {
+            fprintf(stderr,
+                    "Expected state has key %s, iterator is at key %s\n",
+                    Slice(Key(j)).ToString(true).c_str(),
+                    iter->key().ToString(true).c_str());
+          } else {
+            fprintf(stderr, "Expected state has key %s, iterator is invalid\n",
+                    Slice(Key(j)).ToString(true).c_str());
+          }
+          fprintf(stderr, "Column family: %s, op_logs: %s\n",
+                  cfh->GetName().c_str(), op_logs.c_str());
+          thread->stats.AddErrors(1);
+          return false;
+        }
+      }
+      return true;
+    };
+
+    // Forward and backward scan to ensure we cover the entire range [lb, ub).
+    // The random sequence Next and Prev test below tends to be very short
+    // ranged.
+    int64_t last_key = lb - 1;
+
+    std::string key_str = Key(lb);
+    iter->Seek(key_str);
+
+    op_logs += "S " + Slice(key_str).ToString(true) + " ";
+
+    uint64_t curr = 0;
+    while (true) {
+      if (!iter->Valid()) {
+        if (!iter->status().ok()) {
+          thread->shared->SetVerificationFailure();
+          fprintf(stderr, "TestIterate against expected state error: %s\n",
+                  iter->status().ToString().c_str());
+          fprintf(stderr, "Column family: %s, op_logs: %s\n",
+                  cfh->GetName().c_str(), op_logs.c_str());
+          thread->stats.AddErrors(1);
+          return iter->status();
+        }
+        if (!check_no_key_in_range(last_key + 1, ub)) {
+          return Status::OK();
+        }
+        break;
+      }
+
+      if (!check_columns()) {
+        return Status::OK();
+      }
+
+      // iter is valid, the range (last_key, current key) was skipped
+      GetIntVal(iter->key().ToString(), &curr);
+      if (!check_no_key_in_range(last_key + 1, static_cast<int64_t>(curr))) {
+        return Status::OK();
+      }
+
+      last_key = static_cast<int64_t>(curr);
+      if (last_key >= ub - 1) {
+        break;
+      }
+
+      iter->Next();
+
+      op_logs += "N";
+    }
+
+    // backward scan
+    key_str = Key(ub - 1);
+    iter->SeekForPrev(key_str);
+
+    op_logs += " SFP " + Slice(key_str).ToString(true) + " ";
+
+    last_key = ub;
+    while (true) {
+      if (!iter->Valid()) {
+        if (!iter->status().ok()) {
+          thread->shared->SetVerificationFailure();
+          fprintf(stderr, "TestIterate against expected state error: %s\n",
+                  iter->status().ToString().c_str());
+          fprintf(stderr, "Column family: %s, op_logs: %s\n",
+                  cfh->GetName().c_str(), op_logs.c_str());
+          thread->stats.AddErrors(1);
+          return iter->status();
+        }
+        if (!check_no_key_in_range(lb, last_key)) {
+          return Status::OK();
+        }
+        break;
+      }
+
+      if (!check_columns()) {
+        return Status::OK();
+      }
+
+      // the range (current key, last key) was skipped
+      GetIntVal(iter->key().ToString(), &curr);
+      if (!check_no_key_in_range(static_cast<int64_t>(curr + 1), last_key)) {
+        return Status::OK();
+      }
+
+      last_key = static_cast<int64_t>(curr);
+      if (last_key <= lb) {
+        break;
+      }
+
+      iter->Prev();
+
+      op_logs += "P";
+    }
+
+    if (thread->rand.OneIn(2)) {
+      // Refresh after forward/backward scan to allow higher chance of SV
+      // change. It is safe to refresh since the testing key range is locked.
+      iter->Refresh();
+    }
+
+    // start from middle of [lb, ub) otherwise it is easy to iterate out of
+    // locked range
+    const int64_t mid = lb + num_iter / 2;
+
+    key_str = Key(mid);
+    const Slice key(key_str);
+
+    if (thread->rand.OneIn(2)) {
+      iter->Seek(key);
+      op_logs += " S " + key.ToString(true) + " ";
+      if (!iter->Valid() && iter->status().ok()) {
+        if (!check_no_key_in_range(mid, ub)) {
+          return Status::OK();
+        }
+      }
+    } else {
+      iter->SeekForPrev(key);
+      op_logs += " SFP " + key.ToString(true) + " ";
+      if (!iter->Valid() && iter->status().ok()) {
+        // iterator says nothing <= mid
+        if (!check_no_key_in_range(lb, mid + 1)) {
+          return Status::OK();
+        }
+      }
+    }
+
+    for (int64_t i = 0; i < num_iter && iter->Valid(); ++i) {
+      if (!check_columns()) {
+        return Status::OK();
+      }
+
+      GetIntVal(iter->key().ToString(), &curr);
+      if (static_cast<int64_t>(curr) < lb) {
+        iter->Next();
+        op_logs += "N";
+      } else if (static_cast<int64_t>(curr) >= ub) {
+        iter->Prev();
+        op_logs += "P";
+      } else {
+        const uint32_t expected_value =
+            shared->Get(rand_column_family, static_cast<int64_t>(curr));
+        if (expected_value == shared->DELETION_SENTINEL) {
+          // Fail fast to preserve the DB state.
+          thread->shared->SetVerificationFailure();
+          fprintf(stderr, "Iterator has key %s, but expected state does not.\n",
+                  iter->key().ToString(true).c_str());
+          fprintf(stderr, "Column family: %s, op_logs: %s\n",
+                  cfh->GetName().c_str(), op_logs.c_str());
+          thread->stats.AddErrors(1);
+          break;
+        }
+
+        if (thread->rand.OneIn(2)) {
+          iter->Next();
+          op_logs += "N";
+          if (!iter->Valid()) {
+            break;
+          }
+          uint64_t next = 0;
+          GetIntVal(iter->key().ToString(), &next);
+          if (!check_no_key_in_range(static_cast<int64_t>(curr + 1),
+                                     static_cast<int64_t>(next))) {
+            return Status::OK();
+          }
+        } else {
+          iter->Prev();
+          op_logs += "P";
+          if (!iter->Valid()) {
+            break;
+          }
+          uint64_t prev = 0;
+          GetIntVal(iter->key().ToString(), &prev);
+          if (!check_no_key_in_range(static_cast<int64_t>(prev + 1),
+                                     static_cast<int64_t>(curr))) {
+            return Status::OK();
+          }
+        }
+      }
+    }
+
+    if (!iter->status().ok()) {
+      thread->shared->SetVerificationFailure();
+      fprintf(stderr, "TestIterate against expected state error: %s\n",
+              iter->status().ToString().c_str());
+      fprintf(stderr, "Column family: %s, op_logs: %s\n",
+              cfh->GetName().c_str(), op_logs.c_str());
+      thread->stats.AddErrors(1);
+      return iter->status();
+    }
+
+    thread->stats.AddIterations(1);
+
+    return Status::OK();
+  }
+
+  bool VerifyOrSyncValue(int cf, int64_t key, const ReadOptions& /*opts*/,
+                         SharedState* shared, const std::string& value_from_db,
+                         const Status& s, bool strict = false) const {
+    if (shared->HasVerificationFailedYet()) {
+      return false;
+    }
+    // compare value_from_db with the value in the shared state
+    uint32_t value_base = shared->Get(cf, key);
+    if (value_base == SharedState::UNKNOWN_SENTINEL) {
+      if (s.ok()) {
+        // Value exists in db, update state to reflect that
+        Slice slice(value_from_db);
+        value_base = GetValueBase(slice);
+        shared->Put(cf, key, value_base, false);
+      } else if (s.IsNotFound()) {
+        // Value doesn't exist in db, update state to reflect that
+        shared->SingleDelete(cf, key, false);
+      }
+      return true;
+    }
+    if (value_base == SharedState::DELETION_SENTINEL && !strict) {
+      return true;
+    }
+
+    if (s.ok()) {
+      char value[kValueMaxLen];
+      if (value_base == SharedState::DELETION_SENTINEL) {
+        VerificationAbort(shared, "Unexpected value found", cf, key,
+                          value_from_db, "");
+        return false;
+      }
+      size_t sz = GenerateValue(value_base, value, sizeof(value));
+      if (value_from_db.length() != sz) {
+        VerificationAbort(shared, "Length of value read is not equal", cf, key,
+                          value_from_db, Slice(value, sz));
+        return false;
+      }
+      if (memcmp(value_from_db.data(), value, sz) != 0) {
+        VerificationAbort(shared, "Contents of value read don't match", cf, key,
+                          value_from_db, Slice(value, sz));
+        return false;
+      }
+    } else {
+      if (value_base != SharedState::DELETION_SENTINEL) {
+        char value[kValueMaxLen];
+        size_t sz = GenerateValue(value_base, value, sizeof(value));
+        VerificationAbort(shared, "Value not found: " + s.ToString(), cf, key,
+                          "", Slice(value, sz));
+        return false;
+      }
+    }
+    return true;
+  }
+
+#ifndef ROCKSDB_LITE
+  void PrepareTxnDbOptions(SharedState* shared,
+                           TransactionDBOptions& txn_db_opts) override {
+    txn_db_opts.rollback_deletion_type_callback =
+        [shared](TransactionDB*, ColumnFamilyHandle*, const Slice& key) {
+          assert(shared);
+          uint64_t key_num = 0;
+          bool ok = GetIntVal(key.ToString(), &key_num);
+          assert(ok);
+          (void)ok;
+          return !shared->AllowsOverwrite(key_num);
+        };
+  }
+#endif  // ROCKSDB_LITE
+};
+
+StressTest* CreateNonBatchedOpsStressTest() {
+  return new NonBatchedOpsStressTest();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
diff --git a/src/rocksdb/docs/.gitignore b/src/rocksdb/docs/.gitignore
new file mode 100644
index 000000000..3938549cb
--- /dev/null
+++ b/src/rocksdb/docs/.gitignore
@@ -0,0 +1,8 @@
+.DS_STORE
+_site/
+*.swo
+*.swp
+_site
+.sass-cache
+*.psd
+*~
diff --git a/src/rocksdb/docs/CNAME b/src/rocksdb/docs/CNAME
new file mode 100644
index 000000000..827d1c0ed
--- /dev/null
+++ b/src/rocksdb/docs/CNAME
@@ -0,0 +1 @@
+rocksdb.org
+\ No newline at end of file
diff --git a/src/rocksdb/docs/CONTRIBUTING.md b/src/rocksdb/docs/CONTRIBUTING.md
new file mode 100644
index 000000000..2c5842fb4
--- /dev/null
+++ b/src/rocksdb/docs/CONTRIBUTING.md
@@ -0,0 +1,115 @@
+This provides guidance on how to contribute various content to `rocksdb.org`.
+
+## Getting started
+
+You should only have to do these one time.
+
+- Rename this file to `CONTRIBUTING.md`.
+- Rename `EXAMPLE-README-FOR-RUNNING-DOCS.md` to `README.md` (replacing the existing `README.md` that came with the template).
+- Rename `EXAMPLE-LICENSE` to `LICENSE`.
+- Review the [template information](./TEMPLATE-INFORMATION.md).
+- Review `./_config.yml`.
+- Make sure you update `title`, `description`, `tagline` and `gacode` (Google Analytics) in `./_config.yml`.
+
+## Basic Structure
+
+Most content is written in markdown. You name the file `something.md`, then have a header that looks like this:
+
+```
+---
+docid: getting-started
+title: Getting started with ProjectName
+layout: docs
+permalink: /docs/getting-started.html
+---
+```
+
+Customize these values for each document, blog post, etc.
+
+> The filename of the `.md` file doesn't actually matter; what is important is the `docid` being unique and the `permalink` correct and unique too).
+
+## Landing page
+
+Modify `index.md` with your new or updated content.
+
+If you want a `GridBlock` as part of your content, you can do so directly with HTML:
+
+```
+<div class="gridBlock">
+  <div class="blockElement twoByGridBlock alignLeft">
+    <div class="blockContent">
+      <h3>Your Features</h3>
+      <ul>
+        <li>The <a href="http://example.org/">Example</a></li>
+        <li><a href="http://example.com">Another Example</a></li>
+      </ul>
+    </div>
+  </div>
+
+  <div class="blockElement twoByGridBlock alignLeft">
+    <div class="blockContent">
+      <h3>More information</h3>
+      <p>
+         Stuff here
+      </p>
+    </div>
+  </div>
+</div>
+```
+
+or with a combination of changing `./_data/features.yml` and adding some Liquid to `index.md`, such as:
+
+```
+{% include content/gridblocks.html data_source=site.data.features imagealign="bottom"%}
+```
+
+## Blog
+
+To modify a blog post, edit the appopriate markdown file in `./_posts/`.
+
+Adding a new blog post is a four-step process.
+
+> Some posts have a `permalink` and `comments` in the blog post YAML header. You will not need these for new blog posts. These are an artifact of migrating the blog from Wordpress to gh-pages.
+
+1. Create your blog post in `./_posts/` in markdown (file extension `.md` or `.markdown`). See current posts in that folder or `./doc-type-examples/2016-04-07-blog-post-example.md` for an example of the YAML format. **If the `./_posts` directory does not exist, create it**.
+  - You can add a `<!--truncate-->` tag in the middle of your post such that you show only the excerpt above that tag in the main `/blog` index on your page.
+1. If you have not authored a blog post before, modify the `./_data/authors.yml` file with the `author` id you used in your blog post, along with your full name and Facebook ID to get your profile picture.
+1. [Run the site locally](./README.md) to test your changes. It will be at `http://127.0.0.1/blog/your-new-blog-post-title.html`
+1. Push your changes to GitHub.
+
+## Docs
+
+To modify docs, edit the appropriate markdown file in `./_docs/`.
+
+To add docs to the site....
+
+1. Add your markdown file to the `./_docs/` folder. See `./doc-type-examples/docs-hello-world.md` for an example of the YAML header format. **If the `./_docs/` directory does not exist, create it**.
+  - You can use folders in the `./_docs/` directory to organize your content if you want.
+1. Update `_data/nav_docs.yml` to add your new document to the navigation bar. Use the `docid` you put in your doc markdown in as the `id` in the `_data/nav_docs.yml` file.
+1. [Run the site locally](./README.md) to test your changes. It will be at `http://127.0.0.1/docs/your-new-doc-permalink.html`
+1. Push your changes to GitHub.
+
+## Header Bar
+
+To modify the header bar, change `./_data/nav.yml`.
+
+## Top Level Page
+
+To modify a top-level page, edit the appropriate markdown file in `./top-level/`
+
+If you want a top-level page (e.g., http://your-site.com/top-level.html) -- not in `/blog/` or `/docs/`....
+
+1. Create a markdown file in the root `./top-level/`. See `./doc-type-examples/top-level-example.md` for more information.
+1. If you want a visible link to that file, update `_data/nav.yml` to add a link to your new top-level document in the header bar.
+
+   > This is not necessary if you just want to have a page that is linked to from another page, but not exposed as direct link to the user.
+
+1. [Run the site locally](./README.md) to test your changes. It will be at `http://127.0.0.1/your-top-level-page-permalink.html`
+1. Push your changes to GitHub.
+
+## Other Changes
+
+- CSS: `./css/main.css` or `./_sass/*.scss`.
+- Images: `./static/images/[docs | posts]/....`
+- Main Blog post HTML: `./_includes/post.html`
+- Main Docs HTML: `./_includes/doc.html`
diff --git a/src/rocksdb/docs/Gemfile b/src/rocksdb/docs/Gemfile
new file mode 100644
index 000000000..d0602ba2b
--- /dev/null
+++ b/src/rocksdb/docs/Gemfile
@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+gem 'github-pages', '~> 225'
+
+gem "webrick", "~> 1.7"
diff --git a/src/rocksdb/docs/Gemfile.lock b/src/rocksdb/docs/Gemfile.lock
new file mode 100644
index 000000000..68cd94963
--- /dev/null
+++ b/src/rocksdb/docs/Gemfile.lock
@@ -0,0 +1,285 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    activesupport (6.0.4.6)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (>= 0.7, < 2)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+      zeitwerk (~> 2.2, >= 2.2.2)
+    addressable (2.8.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    coffee-script (2.4.1)
+      coffee-script-source
+      execjs
+    coffee-script-source (1.11.1)
+    colorator (1.1.0)
+    commonmarker (0.23.6)
+    concurrent-ruby (1.1.9)
+    dnsruby (1.61.9)
+      simpleidn (~> 0.1)
+    em-websocket (0.5.3)
+      eventmachine (>= 0.12.9)
+      http_parser.rb (~> 0)
+    ethon (0.15.0)
+      ffi (>= 1.15.0)
+    eventmachine (1.2.7)
+    execjs (2.8.1)
+    faraday (1.10.0)
+      faraday-em_http (~> 1.0)
+      faraday-em_synchrony (~> 1.0)
+      faraday-excon (~> 1.1)
+      faraday-httpclient (~> 1.0)
+      faraday-multipart (~> 1.0)
+      faraday-net_http (~> 1.0)
+      faraday-net_http_persistent (~> 1.0)
+      faraday-patron (~> 1.0)
+      faraday-rack (~> 1.0)
+      faraday-retry (~> 1.0)
+      ruby2_keywords (>= 0.0.4)
+    faraday-em_http (1.0.0)
+    faraday-em_synchrony (1.0.0)
+    faraday-excon (1.1.0)
+    faraday-httpclient (1.0.1)
+    faraday-multipart (1.0.3)
+      multipart-post (>= 1.2, < 3)
+    faraday-net_http (1.0.1)
+    faraday-net_http_persistent (1.2.0)
+    faraday-patron (1.0.0)
+    faraday-rack (1.0.0)
+    faraday-retry (1.0.3)
+    ffi (1.15.5)
+    forwardable-extended (2.6.0)
+    gemoji (3.0.1)
+    github-pages (225)
+      github-pages-health-check (= 1.17.9)
+      jekyll (= 3.9.0)
+      jekyll-avatar (= 0.7.0)
+      jekyll-coffeescript (= 1.1.1)
+      jekyll-commonmark-ghpages (= 0.2.0)
+      jekyll-default-layout (= 0.1.4)
+      jekyll-feed (= 0.15.1)
+      jekyll-gist (= 1.5.0)
+      jekyll-github-metadata (= 2.13.0)
+      jekyll-include-cache (= 0.2.1)
+      jekyll-mentions (= 1.6.0)
+      jekyll-optional-front-matter (= 0.3.2)
+      jekyll-paginate (= 1.1.0)
+      jekyll-readme-index (= 0.3.0)
+      jekyll-redirect-from (= 0.16.0)
+      jekyll-relative-links (= 0.6.1)
+      jekyll-remote-theme (= 0.4.3)
+      jekyll-sass-converter (= 1.5.2)
+      jekyll-seo-tag (= 2.8.0)
+      jekyll-sitemap (= 1.4.0)
+      jekyll-swiss (= 1.0.0)
+      jekyll-theme-architect (= 0.2.0)
+      jekyll-theme-cayman (= 0.2.0)
+      jekyll-theme-dinky (= 0.2.0)
+      jekyll-theme-hacker (= 0.2.0)
+      jekyll-theme-leap-day (= 0.2.0)
+      jekyll-theme-merlot (= 0.2.0)
+      jekyll-theme-midnight (= 0.2.0)
+      jekyll-theme-minimal (= 0.2.0)
+      jekyll-theme-modernist (= 0.2.0)
+      jekyll-theme-primer (= 0.6.0)
+      jekyll-theme-slate (= 0.2.0)
+      jekyll-theme-tactile (= 0.2.0)
+      jekyll-theme-time-machine (= 0.2.0)
+      jekyll-titles-from-headings (= 0.5.3)
+      jemoji (= 0.12.0)
+      kramdown (= 2.3.1)
+      kramdown-parser-gfm (= 1.1.0)
+      liquid (= 4.0.3)
+      mercenary (~> 0.3)
+      minima (= 2.5.1)
+      nokogiri (>= 1.12.5, < 2.0)
+      rouge (= 3.26.0)
+      terminal-table (~> 1.4)
+    github-pages-health-check (1.17.9)
+      addressable (~> 2.3)
+      dnsruby (~> 1.60)
+      octokit (~> 4.0)
+      public_suffix (>= 3.0, < 5.0)
+      typhoeus (~> 1.3)
+    html-pipeline (2.14.0)
+      activesupport (>= 2)
+      nokogiri (>= 1.4)
+    http_parser.rb (0.8.0)
+    i18n (0.9.5)
+      concurrent-ruby (~> 1.0)
+    jekyll (3.9.0)
+      addressable (~> 2.4)
+      colorator (~> 1.0)
+      em-websocket (~> 0.5)
+      i18n (~> 0.7)
+      jekyll-sass-converter (~> 1.0)
+      jekyll-watch (~> 2.0)
+      kramdown (>= 1.17, < 3)
+      liquid (~> 4.0)
+      mercenary (~> 0.3.3)
+      pathutil (~> 0.9)
+      rouge (>= 1.7, < 4)
+      safe_yaml (~> 1.0)
+    jekyll-avatar (0.7.0)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-coffeescript (1.1.1)
+      coffee-script (~> 2.2)
+      coffee-script-source (~> 1.11.1)
+    jekyll-commonmark (1.4.0)
+      commonmarker (~> 0.22)
+    jekyll-commonmark-ghpages (0.2.0)
+      commonmarker (~> 0.23.4)
+      jekyll (~> 3.9.0)
+      jekyll-commonmark (~> 1.4.0)
+      rouge (>= 2.0, < 4.0)
+    jekyll-default-layout (0.1.4)
+      jekyll (~> 3.0)
+    jekyll-feed (0.15.1)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-gist (1.5.0)
+      octokit (~> 4.2)
+    jekyll-github-metadata (2.13.0)
+      jekyll (>= 3.4, < 5.0)
+      octokit (~> 4.0, != 4.4.0)
+    jekyll-include-cache (0.2.1)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-mentions (1.6.0)
+      html-pipeline (~> 2.3)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-optional-front-matter (0.3.2)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-paginate (1.1.0)
+    jekyll-readme-index (0.3.0)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-redirect-from (0.16.0)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-relative-links (0.6.1)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-remote-theme (0.4.3)
+      addressable (~> 2.0)
+      jekyll (>= 3.5, < 5.0)
+      jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
+      rubyzip (>= 1.3.0, < 3.0)
+    jekyll-sass-converter (1.5.2)
+      sass (~> 3.4)
+    jekyll-seo-tag (2.8.0)
+      jekyll (>= 3.8, < 5.0)
+    jekyll-sitemap (1.4.0)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-swiss (1.0.0)
+    jekyll-theme-architect (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-cayman (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-dinky (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-hacker (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-leap-day (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-merlot (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-midnight (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-minimal (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-modernist (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-primer (0.6.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-github-metadata (~> 2.9)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-slate (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-tactile (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-time-machine (0.2.0)
+      jekyll (> 3.5, < 5.0)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-titles-from-headings (0.5.3)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-watch (2.2.1)
+      listen (~> 3.0)
+    jemoji (0.12.0)
+      gemoji (~> 3.0)
+      html-pipeline (~> 2.2)
+      jekyll (>= 3.0, < 5.0)
+    kramdown (2.3.1)
+      rexml
+    kramdown-parser-gfm (1.1.0)
+      kramdown (~> 2.0)
+    liquid (4.0.3)
+    listen (3.7.1)
+      rb-fsevent (~> 0.10, >= 0.10.3)
+      rb-inotify (~> 0.9, >= 0.9.10)
+    mercenary (0.3.6)
+    mini_portile2 (2.8.0)
+    minima (2.5.1)
+      jekyll (>= 3.5, < 5.0)
+      jekyll-feed (~> 0.9)
+      jekyll-seo-tag (~> 2.1)
+    minitest (5.15.0)
+    multipart-post (2.1.1)
+    nokogiri (1.13.9)
+      mini_portile2 (~> 2.8.0)
+      racc (~> 1.4)
+    octokit (4.22.0)
+      faraday (>= 0.9)
+      sawyer (~> 0.8.0, >= 0.5.3)
+    pathutil (0.16.2)
+      forwardable-extended (~> 2.6)
+    public_suffix (4.0.6)
+    racc (1.6.0)
+    rb-fsevent (0.11.1)
+    rb-inotify (0.10.1)
+      ffi (~> 1.0)
+    rexml (3.2.5)
+    rouge (3.26.0)
+    ruby2_keywords (0.0.5)
+    rubyzip (2.3.2)
+    safe_yaml (1.0.5)
+    sass (3.7.4)
+      sass-listen (~> 4.0.0)
+    sass-listen (4.0.0)
+      rb-fsevent (~> 0.9, >= 0.9.4)
+      rb-inotify (~> 0.9, >= 0.9.7)
+    sawyer (0.8.2)
+      addressable (>= 2.3.5)
+      faraday (> 0.8, < 2.0)
+    simpleidn (0.2.1)
+      unf (~> 0.1.4)
+    terminal-table (1.8.0)
+      unicode-display_width (~> 1.1, >= 1.1.1)
+    thread_safe (0.3.6)
+    typhoeus (1.4.0)
+      ethon (>= 0.9.0)
+    tzinfo (1.2.10)
+      thread_safe (~> 0.1)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.8)
+    unicode-display_width (1.8.0)
+    webrick (1.7.0)
+    zeitwerk (2.5.4)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  github-pages (~> 225)
+  webrick (~> 1.7)
+
+BUNDLED WITH
+   2.2.3
diff --git a/src/rocksdb/docs/LICENSE-DOCUMENTATION b/src/rocksdb/docs/LICENSE-DOCUMENTATION
new file mode 100644
index 000000000..1f255c9f3
--- /dev/null
+++ b/src/rocksdb/docs/LICENSE-DOCUMENTATION
@@ -0,0 +1,385 @@
+Attribution 4.0 International
+
+=======================================================================
+
+Creative Commons Corporation ("Creative Commons") is not a law firm and
+does not provide legal services or legal advice. Distribution of
+Creative Commons public licenses does not create a lawyer-client or
+other relationship. Creative Commons makes its licenses and related
+information available on an "as-is" basis. Creative Commons gives no
+warranties regarding its licenses, any material licensed under their
+terms and conditions, or any related information. Creative Commons
+disclaims all liability for damages resulting from their use to the
+fullest extent possible.
+
+Using Creative Commons Public Licenses
+
+Creative Commons public licenses provide a standard set of terms and
+conditions that creators and other rights holders may use to share
+original works of authorship and other material subject to copyright
+and certain other rights specified in the public license below. The
+following considerations are for informational purposes only, are not
+exhaustive, and do not form part of our licenses.
+
+     Considerations for licensors: Our public licenses are
+     intended for use by those authorized to give the public
+     permission to use material in ways otherwise restricted by
+     copyright and certain other rights. Our licenses are
+     irrevocable. Licensors should read and understand the terms
+     and conditions of the license they choose before applying it.
+     Licensors should also secure all rights necessary before
+     applying our licenses so that the public can reuse the
+     material as expected. Licensors should clearly mark any
+     material not subject to the license. This includes other CC-
+     licensed material, or material used under an exception or
+     limitation to copyright. More considerations for licensors:
+  wiki.creativecommons.org/Considerations_for_licensors
+
+     Considerations for the public: By using one of our public
+     licenses, a licensor grants the public permission to use the
+     licensed material under specified terms and conditions. If
+     the licensor's permission is not necessary for any reason--for
+     example, because of any applicable exception or limitation to
+     copyright--then that use is not regulated by the license. Our
+     licenses grant only permissions under copyright and certain
+     other rights that a licensor has authority to grant. Use of
+     the licensed material may still be restricted for other
+     reasons, including because others have copyright or other
+     rights in the material. A licensor may make special requests,
+     such as asking that all changes be marked or described.
+     Although not required by our licenses, you are encouraged to
+     respect those requests where reasonable. More_considerations
+     for the public:
+  wiki.creativecommons.org/Considerations_for_licensees
+
+=======================================================================
+
+Creative Commons Attribution 4.0 International Public License
+
+By exercising the Licensed Rights (defined below), You accept and agree
+to be bound by the terms and conditions of this Creative Commons
+Attribution 4.0 International Public License ("Public License"). To the
+extent this Public License may be interpreted as a contract, You are
+granted the Licensed Rights in consideration of Your acceptance of
+these terms and conditions, and the Licensor grants You such rights in
+consideration of benefits the Licensor receives from making the
+Licensed Material available under these terms and conditions.
+
+Section 1 -- Definitions.
+
+  a. Adapted Material means material subject to Copyright and Similar
+     Rights that is derived from or based upon the Licensed Material
+     and in which the Licensed Material is translated, altered,
+     arranged, transformed, or otherwise modified in a manner requiring
+     permission under the Copyright and Similar Rights held by the
+     Licensor. For purposes of this Public License, where the Licensed
+     Material is a musical work, performance, or sound recording,
+     Adapted Material is always produced where the Licensed Material is
+     synched in timed relation with a moving image.
+
+b. Adapter's License means the license You apply to Your Copyright
+   and Similar Rights in Your contributions to Adapted Material in
+   accordance with the terms and conditions of this Public License.
+
+c. Copyright and Similar Rights means copyright and/or similar rights
+   closely related to copyright including, without limitation,
+   performance, broadcast, sound recording, and Sui Generis Database
+   Rights, without regard to how the rights are labeled or
+   categorized. For purposes of this Public License, the rights
+   specified in Section 2(b)(1)-(2) are not Copyright and Similar
+   Rights.
+
+d. Effective Technological Measures means those measures that, in the
+   absence of proper authority, may not be circumvented under laws
+   fulfilling obligations under Article 11 of the WIPO Copyright
+   Treaty adopted on December 20, 1996, and/or similar international
+   agreements.
+
+e. Exceptions and Limitations means fair use, fair dealing, and/or
+   any other exception or limitation to Copyright and Similar Rights
+   that applies to Your use of the Licensed Material.
+
+f. Licensed Material means the artistic or literary work, database,
+   or other material to which the Licensor applied this Public
+   License.
+
+g. Licensed Rights means the rights granted to You subject to the
+   terms and conditions of this Public License, which are limited to
+   all Copyright and Similar Rights that apply to Your use of the
+   Licensed Material and that the Licensor has authority to license.
+
+h. Licensor means the individual(s) or entity(ies) granting rights
+   under this Public License.
+
+i. Share means to provide material to the public by any means or
+   process that requires permission under the Licensed Rights, such
+   as reproduction, public display, public performance, distribution,
+   dissemination, communication, or importation, and to make material
+   available to the public including in ways that members of the
+   public may access the material from a place and at a time
+   individually chosen by them.
+
+j. Sui Generis Database Rights means rights other than copyright
+   resulting from Directive 96/9/EC of the European Parliament and of
+   the Council of 11 March 1996 on the legal protection of databases,
+   as amended and/or succeeded, as well as other essentially
+   equivalent rights anywhere in the world.
+
+k. You means the individual or entity exercising the Licensed Rights
+   under this Public License. Your has a corresponding meaning.
+
+Section 2 -- Scope.
+
+a. License grant.
+
+     1. Subject to the terms and conditions of this Public License,
+        the Licensor hereby grants You a worldwide, royalty-free,
+        non-sublicensable, non-exclusive, irrevocable license to
+        exercise the Licensed Rights in the Licensed Material to:
+
+          a. reproduce and Share the Licensed Material, in whole or
+             in part; and
+
+          b. produce, reproduce, and Share Adapted Material.
+
+     2. Exceptions and Limitations. For the avoidance of doubt, where
+        Exceptions and Limitations apply to Your use, this Public
+        License does not apply, and You do not need to comply with
+        its terms and conditions.
+
+     3. Term. The term of this Public License is specified in Section
+        6(a).
+
+     4. Media and formats; technical modifications allowed. The
+        Licensor authorizes You to exercise the Licensed Rights in
+        all media and formats whether now known or hereafter created,
+        and to make technical modifications necessary to do so. The
+        Licensor waives and/or agrees not to assert any right or
+        authority to forbid You from making technical modifications
+        necessary to exercise the Licensed Rights, including
+        technical modifications necessary to circumvent Effective
+        Technological Measures. For purposes of this Public License,
+        simply making modifications authorized by this Section 2(a)
+        (4) never produces Adapted Material.
+
+     5. Downstream recipients.
+
+          a. Offer from the Licensor -- Licensed Material. Every
+             recipient of the Licensed Material automatically
+             receives an offer from the Licensor to exercise the
+             Licensed Rights under the terms and conditions of this
+             Public License.
+
+          b. No downstream restrictions. You may not offer or impose
+             any additional or different terms or conditions on, or
+             apply any Effective Technological Measures to, the
+             Licensed Material if doing so restricts exercise of the
+             Licensed Rights by any recipient of the Licensed
+             Material.
+
+     6. No endorsement. Nothing in this Public License constitutes or
+        may be construed as permission to assert or imply that You
+        are, or that Your use of the Licensed Material is, connected
+        with, or sponsored, endorsed, or granted official status by,
+        the Licensor or others designated to receive attribution as
+        provided in Section 3(a)(1)(A)(i).
+
+b. Other rights.
+
+     1. Moral rights, such as the right of integrity, are not
+        licensed under this Public License, nor are publicity,
+        privacy, and/or other similar personality rights; however, to
+        the extent possible, the Licensor waives and/or agrees not to
+        assert any such rights held by the Licensor to the limited
+        extent necessary to allow You to exercise the Licensed
+        Rights, but not otherwise.
+
+     2. Patent and trademark rights are not licensed under this
+        Public License.
+
+     3. To the extent possible, the Licensor waives any right to
+        collect royalties from You for the exercise of the Licensed
+        Rights, whether directly or through a collecting society
+        under any voluntary or waivable statutory or compulsory
+        licensing scheme. In all other cases the Licensor expressly
+        reserves any right to collect such royalties.
+
+Section 3 -- License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+
+a. Attribution.
+
+     1. If You Share the Licensed Material (including in modified
+        form), You must:
+
+          a. retain the following if it is supplied by the Licensor
+             with the Licensed Material:
+
+               i. identification of the creator(s) of the Licensed
+                  Material and any others designated to receive
+                  attribution, in any reasonable manner requested by
+                  the Licensor (including by pseudonym if
+                  designated);
+
+              ii. a copyright notice;
+
+             iii. a notice that refers to this Public License;
+
+              iv. a notice that refers to the disclaimer of
+                  warranties;
+
+               v. a URI or hyperlink to the Licensed Material to the
+                  extent reasonably practicable;
+
+          b. indicate if You modified the Licensed Material and
+             retain an indication of any previous modifications; and
+
+          c. indicate the Licensed Material is licensed under this
+             Public License, and include the text of, or the URI or
+             hyperlink to, this Public License.
+
+     2. You may satisfy the conditions in Section 3(a)(1) in any
+        reasonable manner based on the medium, means, and context in
+        which You Share the Licensed Material. For example, it may be
+        reasonable to satisfy the conditions by providing a URI or
+        hyperlink to a resource that includes the required
+        information.
+
+     3. If requested by the Licensor, You must remove any of the
+        information required by Section 3(a)(1)(A) to the extent
+        reasonably practicable.
+
+     4. If You Share Adapted Material You produce, the Adapter's
+        License You apply must not prevent recipients of the Adapted
+        Material from complying with this Public License.
+
+Section 4 -- Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that
+apply to Your use of the Licensed Material:
+
+a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+   to extract, reuse, reproduce, and Share all or a substantial
+   portion of the contents of the database;
+
+b. if You include all or a substantial portion of the database
+   contents in a database in which You have Sui Generis Database
+   Rights, then the database in which You have Sui Generis Database
+   Rights (but not its individual contents) is Adapted Material; and
+
+c. You must comply with the conditions in Section 3(a) if You Share
+   all or a substantial portion of the contents of the database.
+
+For the avoidance of doubt, this Section 4 supplements and does not
+replace Your obligations under this Public License where the Licensed
+Rights include other Copyright and Similar Rights.
+
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+   EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+   AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+   ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+   IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+   WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+   PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+   ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+   KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+   ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+   TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+   NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+   INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+   COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+   USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+   ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+   DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+   IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+c. The disclaimer of warranties and limitation of liability provided
+   above shall be interpreted in a manner that, to the extent
+   possible, most closely approximates an absolute disclaimer and
+   waiver of all liability.
+
+Section 6 -- Term and Termination.
+
+a. This Public License applies for the term of the Copyright and
+   Similar Rights licensed here. However, if You fail to comply with
+   this Public License, then Your rights under this Public License
+   terminate automatically.
+
+b. Where Your right to use the Licensed Material has terminated under
+   Section 6(a), it reinstates:
+
+     1. automatically as of the date the violation is cured, provided
+        it is cured within 30 days of Your discovery of the
+        violation; or
+
+     2. upon express reinstatement by the Licensor.
+
+   For the avoidance of doubt, this Section 6(b) does not affect any
+   right the Licensor may have to seek remedies for Your violations
+   of this Public License.
+
+c. For the avoidance of doubt, the Licensor may also offer the
+   Licensed Material under separate terms or conditions or stop
+   distributing the Licensed Material at any time; however, doing so
+   will not terminate this Public License.
+
+d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+   License.
+
+Section 7 -- Other Terms and Conditions.
+
+a. The Licensor shall not be bound by any additional or different
+   terms or conditions communicated by You unless expressly agreed.
+
+b. Any arrangements, understandings, or agreements regarding the
+   Licensed Material not stated herein are separate from and
+   independent of the terms and conditions of this Public License.
+
+Section 8 -- Interpretation.
+
+a. For the avoidance of doubt, this Public License does not, and
+   shall not be interpreted to, reduce, limit, restrict, or impose
+   conditions on any use of the Licensed Material that could lawfully
+   be made without permission under this Public License.
+
+b. To the extent possible, if any provision of this Public License is
+   deemed unenforceable, it shall be automatically reformed to the
+   minimum extent necessary to make it enforceable. If the provision
+   cannot be reformed, it shall be severed from this Public License
+   without affecting the enforceability of the remaining terms and
+   conditions.
+
+c. No term or condition of this Public License will be waived and no
+   failure to comply consented to unless expressly agreed to by the
+   Licensor.
+
+d. Nothing in this Public License constitutes or may be interpreted
+   as a limitation upon, or waiver of, any privileges and immunities
+   that apply to the Licensor or You, including from the legal
+   processes of any jurisdiction or authority.
+
+=======================================================================
+
+Creative Commons is not a party to its public licenses.
+Notwithstanding, Creative Commons may elect to apply one of its public
+licenses to material it publishes and in those instances will be
+considered the "Licensor." Except for the limited purpose of indicating
+that material is shared under a Creative Commons public license or as
+otherwise permitted by the Creative Commons policies published at
+creativecommons.org/policies, Creative Commons does not authorize the
+use of the trademark "Creative Commons" or any other trademark or logo
+of Creative Commons without its prior written consent including,
+without limitation, in connection with any unauthorized modifications
+to any of its public licenses or any other arrangements,
+understandings, or agreements concerning use of licensed material. For
+the avoidance of doubt, this paragraph does not form part of the public
+licenses.
+
+Creative Commons may be contacted at creativecommons.org.
+
diff --git a/src/rocksdb/docs/README.md b/src/rocksdb/docs/README.md
new file mode 100644
index 000000000..0ae8978bc
--- /dev/null
+++ b/src/rocksdb/docs/README.md
@@ -0,0 +1,80 @@
+## User Documentation for rocksdb.org
+
+This directory will contain the user and feature documentation for RocksDB. The documentation will be hosted on GitHub pages.
+
+### Contributing
+
+See [CONTRIBUTING.md](./CONTRIBUTING.md) for details on how to add or modify content.
+
+### Run the Site Locally
+
+The requirements for running a GitHub pages site locally is described in [GitHub help](https://help.github.com/articles/setting-up-your-github-pages-site-locally-with-jekyll/#requirements). The steps below summarize these steps.
+
+> If you have run the site before, you can start with step 1 and then move on to step 5.
+
+1. Ensure that you are in the `/docs` directory in your local RocksDB clone (i.e., the same directory where this `README.md` exists). The below RubyGems commands, etc. must be run from there.
+
+1. Make sure you have Ruby and [RubyGems](https://rubygems.org/) installed.
+
+   > Ruby >= 2.2 is required for the gems. On the latest versions of Mac OS X, Ruby 2.0 is the
+   > default. Use `brew install ruby` (or your preferred upgrade mechanism) to install a newer
+   > version of Ruby for your Mac OS X system.
+
+1. Make sure you have [Bundler](http://bundler.io/) installed.
+
+    ```
+    # may require sudo
+    gem install bundler
+    ```
+1. Install the project's dependencies
+
+    ```
+    # run this in the 'docs' directory
+    bundle install
+    ```
+
+    > If you get an error when installing `nokogiri`, you may be running into the problem described
+    > in [this nokogiri issue](https://github.com/sparklemotion/nokogiri/issues/1483). You can
+    > either `brew uninstall xz` (and then `brew install xz` after the bundle is installed) or
+    > `xcode-select --install` (although this may not work if you have already installed command
+    > line tools).
+
+1. Run Jekyll's server.
+
+    - On first runs or for structural changes to the documentation (e.g., new sidebar menu item), do a full build.
+
+    ```
+    bundle exec jekyll serve
+    ```
+
+    - For content changes only, you can use `--incremental` for faster builds.
+
+    ```
+    bundle exec jekyll serve --incremental
+    ```
+
+    > We use `bundle exec` instead of running straight `jekyll` because `bundle exec` will always use the version of Jekyll from our `Gemfile`. Just running `jekyll` will use the system version and may not necessarily be compatible.
+
+    - To run using an actual IP address, you can use `--host=0.0.0.0`
+
+    ```
+    bundle exec jekyll serve --host=0.0.0.0
+    ```
+
+    This will allow you to use the IP address associated with your machine in the URL. That way you could share it with other people.
+
+    e.g., on a Mac, you can your IP address with something like `ifconfig | grep "inet " | grep -v 127.0.0.1`.    
+
+1. Either of commands in the previous step will serve up the site on your local device at http://127.0.0.1:4000/ or http://localhost:4000.
+
+### Updating the Bundle
+
+The site depends on Github Pages and the installed bundle is based on the `github-pages` gem.
+Occasionally that gem might get updated with new or changed functionality. If that is the case,
+you can run:
+
+```
+bundle update
+```
+
+to get the latest packages for the installation.
diff --git a/src/rocksdb/docs/TEMPLATE-INFORMATION.md b/src/rocksdb/docs/TEMPLATE-INFORMATION.md
new file mode 100644
index 000000000..9175bc0c2
--- /dev/null
+++ b/src/rocksdb/docs/TEMPLATE-INFORMATION.md
@@ -0,0 +1,17 @@
+## Template Details
+
+First, go through `_config.yml` and adjust the available settings to your project's standard. When you make changes here, you'll have to kill the `jekyll serve` instance and restart it to see those changes, but that's only the case with the config file.
+
+Next, update some image assets - you'll want to update `favicon.png`, `logo.svg`, and `og_image.png` (used for Like button stories and Shares on Facbeook) in the `static` folder with your own logos.
+
+Next, if you're going to have docs on your site, keep the `_docs` and `docs` folders, if not, you can safely remove them (or you can safely leave them and not include them in your navigation - Jekyll renders all of this before a client views the site anyway, so there's no performance hit from just leaving it there for a future expansion).
+
+Same thing with a blog section, either keep or delete the `_posts` and `blog` folders.
+
+You can customize your homepage in three parts - the first in the homepage header, which is mostly automatically derived from the elements you insert into your config file. However, you can also specify a series of 'promotional' elements in `_data/promo.yml`. You can read that file for more information.
+
+The second place for your homepage is in `index.md` which contains the bulk of the main content below the header. This is all markdown if you want, but you can use HTML and Jekyll's template tags (called Liquid) in there too. Checkout this folder's index.md for an example of one common template tag that we use on our sites called gridblocks.
+
+The third and last place is in the `_data/powered_by.yml` and `_data/powered_by_highlight.yml` files. Both these files combine to create a section on the homepage that is intended to show a list of companies or apps that are using your project. The `powered_by_highlight` file is a list of curated companies/apps that you want to show as a highlight at the top of this section, including their logos in whatever format you want. The `powered_by` file is a more open list that is just text links to the companies/apps and can be updated via Pull Request by the community. If you don't want these sections on your homepage, just empty out both files and leave them blank.
+
+The last thing you'll want to do is setup your top level navigation bar. You can do this by editing `nav.yml` and keeping the existing title/href/category structure used there. Although the nav is responsive and fairly flexible design-wise, no more than 5 or 6 nav items is recommended.
diff --git a/src/rocksdb/docs/_config.yml b/src/rocksdb/docs/_config.yml
new file mode 100644
index 000000000..a4055fd1f
--- /dev/null
+++ b/src/rocksdb/docs/_config.yml
@@ -0,0 +1,85 @@
+# Site settings
+permalink: /blog/:year/:month/:day/:title.html
+title: RocksDB
+tagline: A persistent key-value store for fast storage environments
+description: >
+  RocksDB is an embeddable persistent key-value store for fast storage.
+fbappid: "1615782811974223"
+gacode: "UA-49459723-1"
+# baseurl determines the subpath of your site. For example if you're using an
+# organisation.github.io/reponame/ basic site URL, then baseurl would be set
+# as "/reponame" but leave blank if you have a top-level domain URL as it is
+# now set to "" by default as discussed in:
+# http://jekyllrb.com/news/2016/10/06/jekyll-3-3-is-here/
+baseurl: ""
+
+# the base hostname & protocol for your site
+# If baseurl is set, then the absolute url for your site would be url/baseurl
+# This was also be set to the right thing automatically for local development
+# https://github.com/blog/2277-what-s-new-in-github-pages-with-jekyll-3-3
+# http://jekyllrb.com/news/2016/10/06/jekyll-3-3-is-here/
+url: "http://rocksdb.org"
+
+# Note: There are new filters in Jekyll 3.3 to help with absolute and relative urls
+# absolute_url
+# relative_url
+# So you will see these used throughout the Jekyll code in this template.
+# no more need for | prepend: site.url | prepend: site.baseurl
+# http://jekyllrb.com/news/2016/10/06/jekyll-3-3-is-here/
+#https://github.com/blog/2277-what-s-new-in-github-pages-with-jekyll-3-3
+
+# The GitHub repo for your project
+ghrepo: "facebook/rocksdb"
+
+# Use these color settings to determine your colour scheme for the site.
+color:
+  # primary should be a vivid color that reflects the project's brand
+  primary: "#2a2a2a"
+  # secondary should be a subtle light or dark color used on page backgrounds
+  secondary: "#f9f9f9"
+  # Use the following to specify whether the previous two colours are 'light'
+  # or 'dark' and therefore what colors can be overlaid on them
+  primary-overlay: "dark"
+  secondary-overlay: "light"
+
+#Uncomment this if you want to enable Algolia doc search with your own values
+#searchconfig:
+#  apikey: ""
+#  indexname: ""
+
+# Blog posts are builtin to Jekyll by default, with the `_posts` directory.
+# Here you can specify other types of documentation. The names here are `docs`
+# and `top-level`. This means their content will be in `_docs` and `_top-level`.
+# The permalink format is also given.
+# http://ben.balter.com/2015/02/20/jekyll-collections/
+collections:
+  docs:
+    output: true
+    permalink: /docs/:name/
+  top-level:
+    output: true
+    permalink: :name.html
+
+# DO NOT ADJUST BELOW THIS LINE UNLESS YOU KNOW WHAT YOU ARE CHANGING
+
+markdown: kramdown
+kramdown:
+  input: GFM
+  syntax_highlighter: rouge
+
+  syntax_highlighter_opts:
+    css_class: 'rougeHighlight'
+    span:
+      line_numbers: false
+    block:
+      line_numbers: true
+      start_line: 1
+
+sass:
+  style: :compressed
+
+redcarpet:
+  extensions: [with_toc_data]
+
+plugins:
+  - jekyll-redirect-from
diff --git a/src/rocksdb/docs/_data/authors.yml b/src/rocksdb/docs/_data/authors.yml
new file mode 100644
index 000000000..210987c0b
--- /dev/null
+++ b/src/rocksdb/docs/_data/authors.yml
@@ -0,0 +1,81 @@
+icanadi:
+  full_name: Igor Canadi
+  fbid: 706165749
+
+xjin:
+  full_name: Xing Jin
+  fbid: 100000739847320
+
+leijin:
+  full_name: Lei Jin
+  fbid: 634570164
+
+yhciang:
+  full_name: Yueh-Hsuan Chiang
+  fbid: 1619020986
+
+radheshyam:
+  full_name: Radheshyam Balasundaram
+  fbid: 800837305
+
+zagfox:
+  full_name: Feng Zhu
+  fbid: 100006493823622
+
+lgalanis:
+  full_name: Leonidas Galanis
+  fbid: 8649950
+
+sdong:
+  full_name: Siying Dong
+  fbid: 9805119
+
+dmitrism:
+  full_name: Dmitri Smirnov
+
+rven2:
+  full_name: Venkatesh Radhakrishnan
+  fbid: 100008352697325
+
+yiwu:
+  full_name: Yi Wu
+  fbid: 100000476362039
+
+maysamyabandeh:
+  full_name: Maysam Yabandeh
+  fbid: 100003482360101
+
+IslamAbdelRahman:
+  full_name: Islam AbdelRahman
+  fbid: 642759407
+
+ajkr:
+  full_name: Andrew Kryczka
+  fbid: 568694102
+
+abhimadan:
+  full_name: Abhishek Madan
+  fbid: 1850247869
+
+sagar0:
+  full_name: Sagar Vemuri
+  fbid: 2419111
+
+lightmark:
+  full_name: Aaron Gao
+  fbid: 1351549072
+
+fgwu:
+  full_name: Fenggang Wu
+  fbid: 100002297362180
+
+ltamasi:
+  full_name: Levi Tamasi
+
+cbi42:
+  full_name: Changyu Bi
+  fbid: 100078474793041
+
+zjay:
+  full_name: Jay Zhuang
+  fbid: 100032386042884
diff --git a/src/rocksdb/docs/_data/features.yml b/src/rocksdb/docs/_data/features.yml
new file mode 100644
index 000000000..d692c1849
--- /dev/null
+++ b/src/rocksdb/docs/_data/features.yml
@@ -0,0 +1,19 @@
+- title: High Performance
+  text: |
+        RocksDB uses a log structured database engine, written entirely in C++, for maximum performance. Keys and values are just arbitrarily-sized byte streams.
+  image: images/promo-performance.svg
+
+- title: Optimized for Fast Storage
+  text: |
+        RocksDB is optimized for fast, low latency storage such as flash drives and high-speed disk drives. RocksDB exploits the full potential of high read/write rates offered by flash or RAM.
+  image: images/promo-flash.svg
+
+- title: Adaptable
+  text: |
+        RocksDB is adaptable to different workloads. From database storage engines such as [MyRocks](https://github.com/facebook/mysql-5.6) to [application data caching](http://techblog.netflix.com/2016/05/application-data-caching-using-ssds.html) to embedded workloads, RocksDB can be used for a variety of data needs.
+  image: images/promo-adapt.svg
+
+- title: Basic and Advanced Database Operations
+  text: |
+        RocksDB provides basic operations such as opening and closing a database, reading and writing to more advanced operations such as merging and compaction filters.
+  image: images/promo-operations.svg
diff --git a/src/rocksdb/docs/_data/nav.yml b/src/rocksdb/docs/_data/nav.yml
new file mode 100644
index 000000000..b70c65ff7
--- /dev/null
+++ b/src/rocksdb/docs/_data/nav.yml
@@ -0,0 +1,30 @@
+- title: Docs
+  href: /docs/
+  category: docs
+
+- title: GitHub
+  href: https://github.com/facebook/rocksdb/
+  category: external
+
+- title: API (C++)
+  href: https://github.com/facebook/rocksdb/tree/main/include/rocksdb
+  category: external
+
+- title: API (Java)
+  href: https://github.com/facebook/rocksdb/tree/main/java/src/main/java/org/rocksdb
+  category: external
+
+- title: Support
+  href: /support.html
+  category: support
+
+- title: Blog
+  href: /blog/
+  category: blog
+
+- title: Facebook
+  href: https://www.facebook.com/groups/rocksdb.dev/
+  category: external
+
+# Use external for external links not associated with the paths of the current site.
+# If a category is external, site urls, for example, are not prepended to the href, etc..
diff --git a/src/rocksdb/docs/_data/nav_docs.yml b/src/rocksdb/docs/_data/nav_docs.yml
new file mode 100644
index 000000000..8cdfd2d04
--- /dev/null
+++ b/src/rocksdb/docs/_data/nav_docs.yml
@@ -0,0 +1,3 @@
+- title: Quick Start
+  items:
+  - id: getting-started
diff --git a/src/rocksdb/docs/_data/powered_by.yml b/src/rocksdb/docs/_data/powered_by.yml
new file mode 100644
index 000000000..a780cfe40
--- /dev/null
+++ b/src/rocksdb/docs/_data/powered_by.yml
@@ -0,0 +1 @@
+# Fill in later if desired
diff --git a/src/rocksdb/docs/_data/powered_by_highlight.yml b/src/rocksdb/docs/_data/powered_by_highlight.yml
new file mode 100644
index 000000000..a780cfe40
--- /dev/null
+++ b/src/rocksdb/docs/_data/powered_by_highlight.yml
@@ -0,0 +1 @@
+# Fill in later if desired
diff --git a/src/rocksdb/docs/_data/promo.yml b/src/rocksdb/docs/_data/promo.yml
new file mode 100644
index 000000000..9a72aa844
--- /dev/null
+++ b/src/rocksdb/docs/_data/promo.yml
@@ -0,0 +1,6 @@
+# This file determines the list of promotional elements added to the header of \
+# your site's homepage. Full list of plugins are shown
+
+- type: button
+  href: docs/getting-started.html
+  text: Get Started
diff --git a/src/rocksdb/docs/_docs/faq.md b/src/rocksdb/docs/_docs/faq.md
new file mode 100644
index 000000000..0887a0987
--- /dev/null
+++ b/src/rocksdb/docs/_docs/faq.md
@@ -0,0 +1,48 @@
+---
+docid: support-faq
+title: FAQ
+layout: docs
+permalink: /docs/support/faq.html
+---
+
+Here is an ever-growing list of frequently asked questions around RocksDB
+
+## What is RocksDB?
+
+RocksDB is an embeddable persistent key-value store for fast storage. RocksDB can also be the foundation for a client-server database but our current focus is on embedded workloads.
+
+RocksDB builds on [LevelDB](https://code.google.com/p/leveldb/) to be scalable to run on servers with many CPU cores, to efficiently use fast storage, to support IO-bound, in-memory and write-once workloads, and to be flexible to allow for innovation.
+
+For the latest details, watch [Mark Callaghan’s and Igor Canadi’s talk at CMU on 10/2015](https://scs.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=f4e0eb37-ae18-468f-9248-cb73edad3e56). [Dhruba Borthakur’s introductory talk](https://github.com/facebook/rocksdb/blob/gh-pages-old/intro.pdf?raw=true) from the Data @ Scale 2013 conference provides some perspective about how RocksDB has evolved.
+
+## How does performance compare?
+
+We benchmarked LevelDB and found that it was unsuitable for our server workloads. The [benchmark results](http://leveldb.googlecode.com/svn/trunk/doc/benchmark.html) look awesome at first sight, but we quickly realized that those results were for a database whose size was smaller than the size of RAM on the test machine – where the entire database could fit in the OS page cache. When we performed the same benchmarks on a database that was at least 5 times larger than main memory, the performance results were dismal.
+
+By contrast, we’ve published the [RocksDB benchmark results](https://github.com/facebook/rocksdb/wiki/Performance-Benchmarks) for server side workloads on Flash. We also measured the performance of LevelDB on these server-workload benchmarks and found that RocksDB solidly outperforms LevelDB for these IO bound workloads. We found that LevelDB’s single-threaded compaction process was insufficient to drive server workloads. We saw frequent write-stalls with LevelDB that caused 99-percentile latency to be tremendously large. We found that mmap-ing a file into the OS cache introduced performance bottlenecks for reads. We could not make LevelDB consume all the IOs offered by the underlying Flash storage.
+
+## What is RocksDB suitable for?
+
+RocksDB can be used by applications that need low latency database accesses. Possibilities include:
+
+* A user-facing application that stores the viewing history and state of users of a website.
+* A spam detection application that needs fast access to big data sets.
+* A graph-search query that needs to scan a data set in realtime.
+* A cache data from Hadoop, thereby allowing applications to query Hadoop data in realtime.
+* A message-queue that supports a high number of inserts and deletes.
+
+## How big is RocksDB adoption?
+
+RocksDB is an embedded storage engine that is used in a number of backend systems at Facebook. In the Facebook newsfeed’s backend, it replaced another internal storage engine called Centrifuge and is one of the many components used. ZippyDB, a distributed key value store service used by Facebook products relies RocksDB. Details on ZippyDB are in [Muthu Annamalai’s talk at Data@Scale in Seattle](https://youtu.be/DfiN7pG0D0k). Dragon, a distributed graph query engine part of the social graph infrastructure, is using RocksDB to store data. Parse has been running [MongoDB on RocksDB in production](http://blog.parse.com/announcements/mongodb-rocksdb-parse/) since early 2015.
+
+RocksDB is proving to be a useful component for a lot of other groups in the industry. For a list of projects currently using RocksDB, take a look at our USERS.md list on github.
+
+## How good is RocksDB as a database storage engine?
+
+Our engineering team at Facebook firmly believes that RocksDB has great potential as storage engine for databases. It has been proven in production with MongoDB: [MongoRocks](https://github.com/mongodb-partners/mongo-rocks) is the RocksDB based storage engine for MongoDB.
+
+[MyRocks](https://code.facebook.com/posts/190251048047090/myrocks-a-space-and-write-optimized-mysql-database/) is the RocksDB based storage engine for MySQL. Using RocksDB we have managed to achieve 2x better compression and 10x less write amplification for our benchmarks compared to our existing MySQL setup. Given our current results, work is currently underway to develop MyRocks into a production ready solution for web-scale MySQL workloads. Follow along on [GitHub](https://github.com/facebook/mysql-5.6)!
+
+## Why is RocksDB open sourced?
+
+We are open sourcing this project on [GitHub](http://github.com/facebook/rocksdb) because we think it will be useful beyond Facebook. We are hoping that software programmers and database developers will use, enhance, and customize RocksDB for their use-cases. We would also like to engage with the academic community on topics related to efficiency for modern database algorithms.
diff --git a/src/rocksdb/docs/_docs/getting-started.md b/src/rocksdb/docs/_docs/getting-started.md
new file mode 100644
index 000000000..efd17c031
--- /dev/null
+++ b/src/rocksdb/docs/_docs/getting-started.md
@@ -0,0 +1,78 @@
+---
+docid: getting-started
+title: Getting started
+layout: docs
+permalink: /docs/getting-started.html
+---
+
+## Overview
+
+The RocksDB library provides a persistent key value store. Keys and values are arbitrary byte arrays. The keys are ordered within the key value store according to a user-specified comparator function.
+
+The library is maintained by the Facebook Database Engineering Team, and is based on [LevelDB](https://github.com/google/leveldb), by Sanjay Ghemawat and Jeff Dean at Google.
+
+This overview gives some simple examples of how RocksDB is used. For the story of why RocksDB was created in the first place, see [Dhruba Borthakur’s introductory talk](https://github.com/facebook/rocksdb/blob/gh-pages-old/intro.pdf?raw=true) from the Data @ Scale 2013 conference.
+
+## Opening A Database
+
+A rocksdb database has a name which corresponds to a file system directory. All of the contents of database are stored in this directory. The following example shows how to open a database, creating it if necessary:
+
+```c++
+#include <assert>
+#include "rocksdb/db.h"
+
+rocksdb::DB* db;
+rocksdb::Options options;
+options.create_if_missing = true;
+rocksdb::Status status =
+  rocksdb::DB::Open(options, "/tmp/testdb", &db);
+assert(status.ok());
+...
+```
+
+If you want to raise an error if the database already exists, add the following line before the rocksdb::DB::Open call:
+
+```c++
+options.error_if_exists = true;
+```
+
+## Status
+
+You may have noticed the `rocksdb::Status` type above. Values of this type are returned by most functions in RocksDB that may encounter
+an error. You can check if such a result is ok, and also print an associated error message:
+
+```c++
+rocksdb::Status s = ...;
+if (!s.ok()) cerr << s.ToString() << endl;
+```
+
+## Closing A Database
+
+When you are done with a database, just delete the database object. For example:
+
+```c++
+/* open the db as described above */
+/* do something with db */
+delete db;
+```
+
+## Reads And Writes
+
+The database provides Put, Delete, and Get methods to modify/query the database. For example, the following code moves the value stored under `key1` to `key2`.
+
+```c++
+std::string value;
+rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
+if (s.ok()) s = db->Put(rocksdb::WriteOptions(), key2, value);
+if (s.ok()) s = db->Delete(rocksdb::WriteOptions(), key1);
+```
+
+## Further documentation
+
+These are just simple examples of how RocksDB is used. The full documentation is currently on the [GitHub wiki](https://github.com/facebook/rocksdb/wiki).
+
+Here are some specific details about the RocksDB implementation:
+
+- [RocksDB Overview](https://github.com/facebook/rocksdb/wiki/RocksDB-Overview)
+- [Immutable BlockBased Table file format](https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format)
+- [Log file format](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format)
diff --git a/src/rocksdb/docs/_includes/blog_pagination.html b/src/rocksdb/docs/_includes/blog_pagination.html
new file mode 100644
index 000000000..6a1f33436
--- /dev/null
+++ b/src/rocksdb/docs/_includes/blog_pagination.html
@@ -0,0 +1,28 @@
+<!-- Pagination links - copied from http://jekyllrb.com/docs/pagination/ -->
+{% if paginator.total_pages > 1 %}
+<br />
+<div class="pagination">
+  {% if paginator.previous_page %}
+    <a href="{{ paginator.previous_page_path | replace: '//', '/' }}">&laquo; Prev</a>
+  {% else %}
+    <span>&laquo; Prev</span>
+  {% endif %}
+
+  {% for page in (1..paginator.total_pages) %}
+    {% if page == paginator.page %}
+      <em>{{ page }}</em>
+    {% elsif page == 1 %}
+      <a href="{{ '/blog' }}">{{ page }}</a>
+    {% else %}
+      <a href="{{ site.paginate_path | replace: '//', '/' | replace: ':num', page }}">{{ page }}</a>
+    {% endif %}
+  {% endfor %}
+
+  {% if paginator.next_page %}
+    <a href="{{ paginator.next_page_path | replace: '//', '/' }}">Next &raquo;</a>
+  {% else %}
+    <span>Next &raquo;</span>
+  {% endif %}
+</div>
+<br />
+{% endif %}
diff --git a/src/rocksdb/docs/_includes/content/gridblocks.html b/src/rocksdb/docs/_includes/content/gridblocks.html
new file mode 100644
index 000000000..49c5e5917
--- /dev/null
+++ b/src/rocksdb/docs/_includes/content/gridblocks.html
@@ -0,0 +1,5 @@
+<div class="gridBlock">
+{% for item in {{include.data_source}} %}
+  {% include content/items/gridblock.html item=item layout=include.layout imagealign=include.imagealign align=include.align %}
+{% endfor %}
+</div>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/content/items/gridblock.html b/src/rocksdb/docs/_includes/content/items/gridblock.html
new file mode 100644
index 000000000..58c9e7fda
--- /dev/null
+++ b/src/rocksdb/docs/_includes/content/items/gridblock.html
@@ -0,0 +1,37 @@
+{% if include.layout == "fourColumn" %}
+  {% assign layout = "fourByGridBlock" %}
+{% else %}
+  {% assign layout = "twoByGridBlock" %}
+{% endif %}
+
+{% if include.imagealign == "side" %}
+  {% assign imagealign = "imageAlignSide" %}
+{% else %}
+  {% if item.image %}
+    {% assign imagealign = "imageAlignTop" %}
+  {% else %}
+    {% assign imagealign = "" %}
+  {% endif %}
+{% endif %}
+
+{% if include.align == "right" %}
+  {% assign align = "alignRight" %}
+{% elsif include.align == "center" %}
+  {% assign align = "alignCenter" %}
+{% else %}
+  {% assign align = "alignLeft" %}
+{% endif %}
+
+<div class="blockElement {{ layout }} {{ imagealign }} {{ align }}">
+  {% if item.image %}
+  <div class="blockImage">
+    <img src="/static/{{ item.image }}" alt="{{ item.title }}" title="{{ item.title }}" />
+  </div>
+  {% endif %}
+  <div class="blockContent">
+    <h3>{{ item.title }}</h3>
+    {% if item.text %}
+    {{ item.text | markdownify }}
+    {% endif %}
+  </div>
+</div>
diff --git a/src/rocksdb/docs/_includes/doc.html b/src/rocksdb/docs/_includes/doc.html
new file mode 100644
index 000000000..31e365ffe
--- /dev/null
+++ b/src/rocksdb/docs/_includes/doc.html
@@ -0,0 +1,25 @@
+<div class="post">
+  <header class="post-header">
+    <h1 class="post-title">{% if include.truncate %}<a href="{{ page.url | absolute_url }}">{{ page.title }}</a>{% else %}{{ page.title }}{% endif %}</h1>
+  </header>
+
+  <article class="post-content">
+   {% if include.truncate %}
+      {% if page.content contains '<!--truncate-->' %}
+        {{ page.content | split:'<!--truncate-->' | first }}
+        <div class="read-more">
+          <a href="{{ page.url | absolute_url }}" >
+            ...Read More
+          </a>
+        </div>
+      {% else %}
+        {{ page.content }}
+      {% endif %}
+    {% else %}
+      {{ content }}
+
+      <p><a class="edit-page-link" href="https://github.com/{{ site.ghrepo }}/blob/main/docs/{{ page.path }}" target="_blank">Edit on GitHub</a></p>
+    {% endif %}
+  </article>
+  {% include doc_paging.html %}
+</div>
diff --git a/src/rocksdb/docs/_includes/doc_paging.html b/src/rocksdb/docs/_includes/doc_paging.html
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/rocksdb/docs/_includes/doc_paging.html
diff --git a/src/rocksdb/docs/_includes/footer.html b/src/rocksdb/docs/_includes/footer.html
new file mode 100644
index 000000000..f560172d1
--- /dev/null
+++ b/src/rocksdb/docs/_includes/footer.html
@@ -0,0 +1,34 @@
+<div class="footerContainer">
+  <div id="footer_wrap" class="wrapper footerWrapper">
+    <div class="footerBlocks">
+      <div id="fb_oss" class="footerSection fbOpenSourceFooter">
+          <svg class="facebookOSSLogoSvg" viewBox="0 0 1133.9 1133.9" x="0px" y="0px">
+            <g>
+              <path class="logoRing outerRing" d="M 498.3 3.7 c 153.6 88.9 307.3 177.7 461.1 266.2 c 7.6 4.4 10.3 9.1 10.3 17.8 c -0.3 179.1 -0.2 358.3 0 537.4 c 0 8.1 -2.4 12.8 -9.7 17.1 c -154.5 88.9 -308.8 178.1 -462.9 267.5 c -9 5.2 -15.5 5.3 -24.6 0.1 c -153.9 -89.2 -307.9 -178 -462.1 -266.8 C 3 838.8 0 833.9 0 825.1 c 0.3 -179.1 0.2 -358.3 0 -537.4 c 0 -8.6 2.6 -13.6 10.2 -18 C 164.4 180.9 318.4 92 472.4 3 C 477 -1.5 494.3 -0.7 498.3 3.7 Z M 48.8 555.3 c 0 79.9 0.2 159.9 -0.2 239.8 c -0.1 10 3 15.6 11.7 20.6 c 137.2 78.8 274.2 157.8 411 237.3 c 9.9 5.7 17 5.7 26.8 0.1 c 137.5 -79.8 275.2 -159.2 412.9 -238.5 c 7.4 -4.3 10.5 -8.9 10.5 -17.8 c -0.3 -160.2 -0.3 -320.5 0 -480.7 c 0 -8.8 -2.8 -13.6 -10.3 -18 C 772.1 218 633.1 137.8 494.2 57.4 c -6.5 -3.8 -11.5 -4.5 -18.5 -0.5 C 336.8 137.4 197.9 217.7 58.8 297.7 c -7.7 4.4 -10.2 9.2 -10.2 17.9 C 48.9 395.5 48.8 475.4 48.8 555.3 Z" />
+              <path class="logoRing middleRing" d="M 184.4 555.9 c 0 -33.3 -1 -66.7 0.3 -100 c 1.9 -48 24.1 -86 64.7 -110.9 c 54.8 -33.6 110.7 -65.5 167 -96.6 c 45.7 -25.2 92.9 -24.7 138.6 1 c 54.4 30.6 108.7 61.5 162.2 93.7 c 44 26.5 67.3 66.8 68 118.4 c 0.9 63.2 0.9 126.5 0 189.7 c -0.7 50.6 -23.4 90.7 -66.6 116.9 c -55 33.4 -110.8 65.4 -167.1 96.5 c -43.4 24 -89 24.2 -132.3 0.5 c -57.5 -31.3 -114.2 -64 -170 -98.3 c -41 -25.1 -62.9 -63.7 -64.5 -112.2 C 183.5 621.9 184.3 588.9 184.4 555.9 Z M 232.9 556.3 c 0 29.5 0.5 59.1 -0.1 88.6 c -0.8 39.2 16.9 67.1 50.2 86.2 c 51.2 29.4 102.2 59.2 153.4 88.4 c 31.4 17.9 63.6 18.3 95 0.6 c 53.7 -30.3 107.1 -61.2 160.3 -92.5 c 29.7 -17.5 45 -44.5 45.3 -78.8 c 0.6 -61.7 0.5 -123.5 0 -185.2 c -0.3 -34.4 -15.3 -61.5 -44.9 -79 C 637.7 352.6 583 320.8 527.9 290 c -27.5 -15.4 -57.2 -16.1 -84.7 -0.7 c -56.9 31.6 -113.4 64 -169.1 97.6 c -26.4 15.9 -40.7 41.3 -41.1 72.9 C 232.6 491.9 232.9 524.1 232.9 556.3 Z" />
+              <path class="logoRing innerRing" d="M 484.9 424.4 c 69.8 -2.8 133.2 57.8 132.6 132 C 617 630 558.5 688.7 484.9 689.1 c -75.1 0.4 -132.6 -63.6 -132.7 -132.7 C 352.1 485 413.4 421.5 484.9 424.4 Z M 401.3 556.7 c -3.4 37.2 30.5 83.6 83 84.1 c 46.6 0.4 84.8 -37.6 84.9 -84 c 0.1 -46.6 -37.2 -84.4 -84.2 -84.6 C 432.2 472.1 397.9 518.3 401.3 556.7 Z" />
+            </g>
+          </svg>
+        <h2><a href="https://opensource.fb.com/" target="_blank">Meta Open Source</a></h2>
+      </div>
+      <div class="footerSection">
+        <a class="footerLink" href="https://github.com/facebook/rocksdb" target="_blank">GitHub</a>
+        <a class="footerLink" href="hhttps://twitter.com/rocksdb" target="_blank">Twitter</a>
+        <a class="footerLink" href="https://opensource.fb.com/legal/terms" target="_blank">Terms of Use</a>
+        <a class="footerLink" href="https://opensource.fb.com/legal/privacy" target="_blank">Privacy Policy</a>
+      </div>
+      <div class="footerSection rightAlign">
+       Copyright © 2022 Meta Platforms, Inc.
+      </div>
+    </div>
+  </div>
+</div>
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+  ga('create', '{{ site.gacode }}', 'auto');
+  ga('send', 'pageview');
+</script>
diff --git a/src/rocksdb/docs/_includes/head.html b/src/rocksdb/docs/_includes/head.html
new file mode 100644
index 000000000..10845ec1d
--- /dev/null
+++ b/src/rocksdb/docs/_includes/head.html
@@ -0,0 +1,23 @@
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+
+  <meta property="og:url" content="{{ page.url | replace:'index.html','' | absolute_url }}" />
+  <meta property="og:site_name" content="{{ site.title }}"/>
+  <meta property="og:title" content="{% if page.title %}{{ page.title }}{% else %}{{ site.title }}{% endif %}" />
+  <meta property="og:image" content="{{ '/static/og_image.png' | absolute_url }}" />
+  <meta property="og:description" content="{% if page.excerpt %}{{ page.excerpt | strip_html | strip_newlines | truncate: 160 }}{% else %}{{ site.description }}{% endif %}" />
+
+  <link rel="stylesheet" href="{{ '/css/main.css' }}" media="screen">
+  <link rel="icon" href="{{ '/static/favicon.png' }}" type="image/x-icon">
+  {% if site.searchconfig %}
+  <link rel="stylesheet" href="https://cdn.jsdelivr.net/docsearch.js/1/docsearch.min.css" />
+  {% endif %}
+
+  <title>{% if page.title %}{{ page.title }} | {{ site.title }}{% else %}{{ site.title }}{% endif %}</title>
+  <meta name="description" content="{% if page.excerpt %}{{ page.excerpt | strip_html | strip_newlines | truncate: 160 }}{% else %}{{ site.description }}{% endif %}">
+
+  <link rel="canonical" href="{{ page.url | replace:'index.html','' | absolute_url }}">
+  <link rel="alternate" type="application/rss+xml" title="{{ site.title }}" href="{{ '/feed.xml' |  absolute_url }}" />
+</head>
diff --git a/src/rocksdb/docs/_includes/header.html b/src/rocksdb/docs/_includes/header.html
new file mode 100644
index 000000000..8108d222b
--- /dev/null
+++ b/src/rocksdb/docs/_includes/header.html
@@ -0,0 +1,19 @@
+<div class="headerContainer">
+  <div id="header_wrap" class="wrapper headerWrapper">
+    <div class="inner">
+      <img class="projectLogo" height="200px" src="{{ '/static/logo.svg' }}" alt="{{ site.title }}" title="{{ site.title }}" />
+      <h1 id="project_title">{{ site.title }}</h1>
+      <h2 id="project_tagline" class="fbossFontLight">{{ site.tagline }}</h2>
+
+      <section id="intro">
+        <p>{% if page.excerpt %}{{ page.excerpt | strip_html }}{% else %}{{ site.description }}{% endif %}</p>
+      </section>
+      <div id="promo" class="section promoSection">
+        {% for promo in site.data.promo %}
+          {% include plugins/{{promo.type}}.html button_href=promo.href button_text=promo.text %}
+          <div class="gridClear"></div>
+        {% endfor %}
+      </div>
+    </div>
+  </div>
+</div>
diff --git a/src/rocksdb/docs/_includes/hero.html b/src/rocksdb/docs/_includes/hero.html
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/rocksdb/docs/_includes/hero.html
diff --git a/src/rocksdb/docs/_includes/home_header.html b/src/rocksdb/docs/_includes/home_header.html
new file mode 100644
index 000000000..90880d17c
--- /dev/null
+++ b/src/rocksdb/docs/_includes/home_header.html
@@ -0,0 +1,22 @@
+<div class="homeContainer">
+  <div class="homeSplashFade">
+    <div id="home_wrap" class="wrapper homeWrapper">
+      <div id="inner">
+        <h2 id="project_tagline">{{ site.tagline }}</h2>
+        <section id="intro">
+          <p>{% if page.excerpt %}{{ page.excerpt | strip_html }}{% else %}{{ site.description }}{% endif %}</p>
+        </section>
+        <div id="promo" class="section promoSection">
+          {% for promo in site.data.promo %}
+            <div class="promoRow">
+            {% include plugins/{{promo.type}}.html href=promo.href text=promo.text children=promo.children %}
+            </div>
+          {% endfor %}
+        </div>
+      </div>
+      <div class="projectLogo">
+        <img src="{{ '/static/logo.svg' }}" alt="{{ site.title }}">
+      </div>
+    </div>
+  </div>
+</div>
diff --git a/src/rocksdb/docs/_includes/katex_import.html b/src/rocksdb/docs/_includes/katex_import.html
new file mode 100644
index 000000000..6d6b7cf44
--- /dev/null
+++ b/src/rocksdb/docs/_includes/katex_import.html
@@ -0,0 +1,3 @@
+<script src="//code.jquery.com/jquery-1.11.1.min.js"></script>
+<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/KaTeX/0.2.0/katex.min.css">
+<script src="//cdnjs.cloudflare.com/ajax/libs/KaTeX/0.2.0/katex.min.js"></script>
diff --git a/src/rocksdb/docs/_includes/katex_render.html b/src/rocksdb/docs/_includes/katex_render.html
new file mode 100644
index 000000000..56e2e8974
--- /dev/null
+++ b/src/rocksdb/docs/_includes/katex_render.html
@@ -0,0 +1,210 @@
+<script type="text/javascript">
+/* global katex */
+
+var findEndOfMath = function(delimiter, text, startIndex) {
+    // Adapted from
+    // https://github.com/Khan/perseus/blob/master/src/perseus-markdown.jsx
+    var index = startIndex;
+    var braceLevel = 0;
+
+    var delimLength = delimiter.length;
+
+    while (index < text.length) {
+        var character = text[index];
+
+        if (braceLevel <= 0 &&
+            text.slice(index, index + delimLength) === delimiter) {
+            return index;
+        } else if (character === "\\") {
+            index++;
+        } else if (character === "{") {
+            braceLevel++;
+        } else if (character === "}") {
+            braceLevel--;
+        }
+
+        index++;
+    }
+
+    return -1;
+};
+
+var splitAtDelimiters = function(startData, leftDelim, rightDelim, display) {
+    var finalData = [];
+
+    for (var i = 0; i < startData.length; i++) {
+        if (startData[i].type === "text") {
+            var text = startData[i].data;
+
+            var lookingForLeft = true;
+            var currIndex = 0;
+            var nextIndex;
+
+            nextIndex = text.indexOf(leftDelim);
+            if (nextIndex !== -1) {
+                currIndex = nextIndex;
+                finalData.push({
+                    type: "text",
+                    data: text.slice(0, currIndex)
+                });
+                lookingForLeft = false;
+            }
+
+            while (true) {
+                if (lookingForLeft) {
+                    nextIndex = text.indexOf(leftDelim, currIndex);
+                    if (nextIndex === -1) {
+                        break;
+                    }
+
+                    finalData.push({
+                        type: "text",
+                        data: text.slice(currIndex, nextIndex)
+                    });
+
+                    currIndex = nextIndex;
+                } else {
+                    nextIndex = findEndOfMath(
+                        rightDelim,
+                        text,
+                        currIndex + leftDelim.length);
+                    if (nextIndex === -1) {
+                        break;
+                    }
+
+                    finalData.push({
+                        type: "math",
+                        data: text.slice(
+                            currIndex + leftDelim.length,
+                            nextIndex),
+                        rawData: text.slice(
+                            currIndex,
+                            nextIndex + rightDelim.length),
+                        display: display
+                    });
+
+                    currIndex = nextIndex + rightDelim.length;
+                }
+
+                lookingForLeft = !lookingForLeft;
+            }
+
+            finalData.push({
+                type: "text",
+                data: text.slice(currIndex)
+            });
+        } else {
+            finalData.push(startData[i]);
+        }
+    }
+
+    return finalData;
+};
+
+var splitWithDelimiters = function(text, delimiters) {
+    var data = [{type: "text", data: text}];
+    for (var i = 0; i < delimiters.length; i++) {
+        var delimiter = delimiters[i];
+        data = splitAtDelimiters(
+            data, delimiter.left, delimiter.right,
+            delimiter.display || false);
+    }
+    return data;
+};
+
+var renderMathInText = function(text, delimiters) {
+    var data = splitWithDelimiters(text, delimiters);
+
+    var fragment = document.createDocumentFragment();
+
+    for (var i = 0; i < data.length; i++) {
+        if (data[i].type === "text") {
+            fragment.appendChild(document.createTextNode(data[i].data));
+        } else {
+            var span = document.createElement("span");
+            var math = data[i].data;
+            try {
+                katex.render(math, span, {
+                    displayMode: data[i].display
+                });
+            } catch (e) {
+                if (!(e instanceof katex.ParseError)) {
+                    throw e;
+                }
+                console.error(
+                    "KaTeX auto-render: Failed to parse `" + data[i].data +
+                    "` with ",
+                    e
+                );
+                fragment.appendChild(document.createTextNode(data[i].rawData));
+                continue;
+            }
+            fragment.appendChild(span);
+        }
+    }
+
+    return fragment;
+};
+
+var renderElem = function(elem, delimiters, ignoredTags) {
+    for (var i = 0; i < elem.childNodes.length; i++) {
+        var childNode = elem.childNodes[i];
+        if (childNode.nodeType === 3) {
+            // Text node
+            var frag = renderMathInText(childNode.textContent, delimiters);
+            i += frag.childNodes.length - 1;
+            elem.replaceChild(frag, childNode);
+        } else if (childNode.nodeType === 1) {
+            // Element node
+            var shouldRender = ignoredTags.indexOf(
+                childNode.nodeName.toLowerCase()) === -1;
+
+            if (shouldRender) {
+                renderElem(childNode, delimiters, ignoredTags);
+            }
+        }
+        // Otherwise, it's something else, and ignore it.
+    }
+};
+
+var defaultOptions = {
+    delimiters: [
+        {left: "$$", right: "$$", display: true},
+        {left: "\\[", right: "\\]", display: true},
+        {left: "\\(", right: "\\)", display: false}
+        // LaTeX uses this, but it ruins the display of normal `$` in text:
+        // {left: "$", right: "$", display: false}
+    ],
+
+    ignoredTags: [
+        "script", "noscript", "style", "textarea", "pre", "code"
+    ]
+};
+
+var extend = function(obj) {
+    // Adapted from underscore.js' `_.extend`. See LICENSE.txt for license.
+    var source, prop;
+    for (var i = 1, length = arguments.length; i < length; i++) {
+        source = arguments[i];
+        for (prop in source) {
+            if (Object.prototype.hasOwnProperty.call(source, prop)) {
+                obj[prop] = source[prop];
+            }
+        }
+    }
+    return obj;
+};
+
+var renderMathInElement = function(elem, options) {
+    if (!elem) {
+        throw new Error("No element provided to render");
+    }
+
+    options = extend({}, defaultOptions, options);
+
+    renderElem(elem, options.delimiters, options.ignoredTags);
+};
+
+renderMathInElement(document.body);
+
+</script>
diff --git a/src/rocksdb/docs/_includes/nav.html b/src/rocksdb/docs/_includes/nav.html
new file mode 100644
index 000000000..9c6fed06b
--- /dev/null
+++ b/src/rocksdb/docs/_includes/nav.html
@@ -0,0 +1,37 @@
+<div id="fixed_header" class="fixedHeaderContainer{% if include.alwayson %} visible{% endif %}">
+  <div class="headerWrapper wrapper">
+    <header>
+      <a href="{{ '/' | absolute_url }}">
+        <img src="{{ '/static/logo.svg' }}">
+        <h2>{{ site.title }}</h2>
+      </a>
+
+      <div class="navigationWrapper navigationFull" id="flat_nav">
+        <nav class="navigation">
+          <ul>
+            {% for item in site.data.nav %}
+            <li class="navItem{% if page.collection == item.category or page.category == item.category %} navItemActive{% endif %}">
+              {% if item.category == "external" %}
+                <a href="{{ item.href }}">{{ item.title }}</a>
+              {% else %}
+                {% comment %}
+                I removed `relative_url` from here for now until the problem we are having with
+                GitHub pages is resolved. Yes, I know this is exactly the same as the if above.
+                See: https://github.com/facebook/rocksdb/commit/800e51553ee029f29581f7f338cbc988c7f6da62
+                {% endcomment %}
+                <a href="{{ item.href }}">{{ item.title }}</a>
+              {% endif %}
+            </li>
+            {% endfor %}
+            {% if site.searchconfig %}
+            {% include nav_search.html inputselector="search_input" %}
+            {% endif %}
+          </ul>
+        </nav>
+      </div>
+      <div class="navigationWrapper navigationSlider" id="navigation_wrap">
+        {% include nav/header_nav.html %}
+      </div>
+    </header>
+  </div>
+</div>
diff --git a/src/rocksdb/docs/_includes/nav/collection_nav.html b/src/rocksdb/docs/_includes/nav/collection_nav.html
new file mode 100644
index 000000000..a3c7a2dd3
--- /dev/null
+++ b/src/rocksdb/docs/_includes/nav/collection_nav.html
@@ -0,0 +1,64 @@
+<div class="docsNavContainer">
+  <nav class="toc" id="doc_nav">
+    <div class="toggleNav" id="collection_nav">
+      <section class="navWrapper wrapper">
+        <div class="navBreadcrumb wrapper">
+          <div class="navToggle" id="collection_nav_toggler">
+            <i></i>
+          </div>
+          <h2>
+            <a href="{{ include.sectionpath }}">{{ include.sectiontitle }}</a>
+            {% if include.currentgroup %}
+            <i>›</i>
+            <span>{{ include.currentgroup }}</span>
+            {% endif %}
+          </h2>
+        </div>
+        <div class="navGroups">
+          {% if include.type == "blog" %}
+            {% assign grouptitle = "All Posts" %}
+            {% assign groupitems = include.navdata %}
+            {% include nav/collection_nav_group.html %}
+          {% else %}
+          {% for group in include.navdata %}
+            {% assign grouptitle = group.title %}
+            {% for item in group.items %}
+              {% if item.id == page.docid %}
+              {% assign currentgroup = group %}
+              {% endif %}
+            {% endfor %}
+            {% include nav/collection_nav_group.html %}
+          {% endfor %}
+          {% endif %}
+        </div>
+      </section>
+    </div>
+  </nav>
+</div>
+<script>
+  var docsevent = document.createEvent('Event');
+  docsevent.initEvent('docs_slide', true, true);
+  document.addEventListener('docs_slide', function (e) {
+    document.body.classList.toggle('docsSliderActive');
+  }, false);
+
+  var collectionNav = document.getElementById('collection_nav');
+  var collectionNavToggler =
+    document.getElementById('collection_nav_toggler');
+  collectionNavToggler.addEventListener('click', function(e) {
+    collectionNav.classList.toggle('toggleNavActive');
+    document.dispatchEvent(docsevent);
+  });
+
+  var groups = document.getElementsByClassName('navGroup');
+  for(var i = 0; i < groups.length; i++) {
+    var thisGroup = groups[i];
+    thisGroup.onclick = function() {
+      for(var j = 0; j < groups.length; j++) {
+        var group = groups[j];
+        group.classList.remove('navGroupActive');
+      }
+      this.classList.add('navGroupActive');
+    }
+  }
+</script>
diff --git a/src/rocksdb/docs/_includes/nav/collection_nav_group.html b/src/rocksdb/docs/_includes/nav/collection_nav_group.html
new file mode 100644
index 000000000..b236ac5e3
--- /dev/null
+++ b/src/rocksdb/docs/_includes/nav/collection_nav_group.html
@@ -0,0 +1,19 @@
+<div class="navGroup{% if currentgroup == group %} navGroupActive navGroupCurrent{% endif %}">
+  <h3><i>+</i><span>{{ grouptitle }}</span></h3>
+  <ul>
+    {% if include.data_collection %}
+      {% for item in group.items %}
+        {% for collectionitem in include.data_collection %}
+        {% if collectionitem.docid == item.id %}
+          {% assign groupitem = collectionitem %}
+          {% include nav/collection_nav_group_item.html %}
+        {% endif %}
+        {% endfor %}
+      {% endfor %}
+    {% else %}
+      {% for groupitem in groupitems %}
+      {% include nav/collection_nav_group_item.html %}
+      {% endfor %}
+    {% endif %}
+  </ul>
+</div>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/nav/collection_nav_group_item.html b/src/rocksdb/docs/_includes/nav/collection_nav_group_item.html
new file mode 100644
index 000000000..fbb063deb
--- /dev/null
+++ b/src/rocksdb/docs/_includes/nav/collection_nav_group_item.html
@@ -0,0 +1 @@
+<li class="navListItem"><a class="navItem" href="{{ groupitem.url | absolute_url }}">{{ groupitem.title }}</a></li>
diff --git a/src/rocksdb/docs/_includes/nav/header_nav.html b/src/rocksdb/docs/_includes/nav/header_nav.html
new file mode 100644
index 000000000..0fe945cdc
--- /dev/null
+++ b/src/rocksdb/docs/_includes/nav/header_nav.html
@@ -0,0 +1,30 @@
+<div id="header_nav">
+  <div class="navSlideout">
+    <i class="menuExpand" id="header_nav_expander"><span></span><span></span><span></span></i>
+  </div>
+  <nav class="slidingNav">
+    <ul>
+      {% for item in site.data.nav %}
+      <li class="navItem">
+        <a href="{{ item.href }}"{% if item.category == "external" %} target="_blank"{% endif %}>{{ item.title }}</a>
+      </li>
+      {% endfor %}
+      {% if site.searchconfig %}
+      {% include nav_search.html inputselector="search_input_react" %}
+      {% endif %}
+    </ul>
+  </nav>
+</div>
+<script>
+  var event = document.createEvent('Event');
+  event.initEvent('slide', true, true);
+  document.addEventListener('slide', function (e) {
+    document.body.classList.toggle('sliderActive');
+  }, false);
+  var headerNav = document.getElementById('header_nav');
+  var headerNavExpander = document.getElementById('header_nav_expander');
+  headerNavExpander.addEventListener('click', function(e) {
+    headerNav.classList.toggle('navSlideoutActive');
+    document.dispatchEvent(event);
+  }, false);
+</script>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/nav_search.html b/src/rocksdb/docs/_includes/nav_search.html
new file mode 100644
index 000000000..84956b9f7
--- /dev/null
+++ b/src/rocksdb/docs/_includes/nav_search.html
@@ -0,0 +1,15 @@
+<li class="navSearchWrapper">
+  <input id="{{ include.inputselector }}" type="search" />
+</li>
+<script type="text/javascript" src="https://cdn.jsdelivr.net/docsearch.js/1/docsearch.min.js"></script>
+<script>
+// For Algolia search
+(function() {
+  // Algolia
+  docsearch({
+    apiKey: '{{ site.searchconfig.apikey }}',
+    indexName: '{{ site.searchconfig.indexname }}',
+    inputSelector: '#{{ include.inputselector }}',
+  });
+}());
+</script>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/all_share.html b/src/rocksdb/docs/_includes/plugins/all_share.html
new file mode 100644
index 000000000..59b00d615
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/all_share.html
@@ -0,0 +1,3 @@
+<div class="pluginBlock allShareBlock">
+  {% include plugins/like_button.html %}{% include plugins/twitter_share.html %}{% include plugins/google_share.html %}
+</div>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/ascii_cinema.html b/src/rocksdb/docs/_includes/plugins/ascii_cinema.html
new file mode 100644
index 000000000..7d3f97148
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/ascii_cinema.html
@@ -0,0 +1,2 @@
+<div class="ascii-cinema pluginBlock"></div>
+<script type="text/javascript" src="https://asciinema.org/a/{{ include.href }}.js" id="asciicast-{{ include.href }}" async data-autoplay="true" data-loop="true" data-speed="2" data-t="23"></script>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/button.html b/src/rocksdb/docs/_includes/plugins/button.html
new file mode 100644
index 000000000..9e499fe3f
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/button.html
@@ -0,0 +1,6 @@
+<div class="pluginWrapper buttonWrapper">
+  <a
+    class="button"
+    href="{{ include.href }}"
+  >{{ include.text }}</a>
+</div>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/github_star.html b/src/rocksdb/docs/_includes/plugins/github_star.html
new file mode 100644
index 000000000..6aea70fc7
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/github_star.html
@@ -0,0 +1,4 @@
+<div class="pluginWrapper ghStarWrapper">
+  <a aria-label="Star {{ site.ghrepo }} on GitHub" data-count-aria-label="# stargazers on GitHub" data-count-api="/repos/{{ site.ghrepo }}#stargazers_count" data-count-href="/{{ site.ghrepo }}/stargazers" data-style="mega" data-icon="octicon-star" href="https://github.com/{{ site.ghrepo }}" class="github-button">Star</a>
+</div>
+<script async defer id="github-bjs" src="https://buttons.github.io/buttons.js"></script>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/github_watch.html b/src/rocksdb/docs/_includes/plugins/github_watch.html
new file mode 100644
index 000000000..64233b57b
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/github_watch.html
@@ -0,0 +1,4 @@
+<div class="pluginWrapper ghWatchWrapper">
+  <a aria-label="Watch {{ site.ghrepo }} on GitHub" data-count-aria-label="# watchers on GitHub" data-count-api="/repos/{{ site.ghrepo }}#subscribers_count" data-count-href="/{{ site.ghrepo }}/watchers" data-style="mega" data-icon="octicon-eye" href="https://github.com/{{ site.ghrepo }}" class="github-button">Watch</a>
+</div>
+<script async defer id="github-bjs" src="https://buttons.github.io/buttons.js"></script>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/google_share.html b/src/rocksdb/docs/_includes/plugins/google_share.html
new file mode 100644
index 000000000..1b557db86
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/google_share.html
@@ -0,0 +1,5 @@
+<div class="pluginBlock">
+  <div class="g-plusone" data-size="medium"></div>
+</div>
+
+<script src="https://apis.google.com/js/platform.js" async defer></script>
diff --git a/src/rocksdb/docs/_includes/plugins/iframe.html b/src/rocksdb/docs/_includes/plugins/iframe.html
new file mode 100644
index 000000000..525b59f22
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/iframe.html
@@ -0,0 +1,6 @@
+<div class="iframeContent">
+  <iframe class="pluginIframe" src="{{ include.href }}" seamless></iframe>
+</div>
+<div class="iframePreview">
+  {% include plugins/button.html href=include.href text=include.text %}
+</div>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/like_button.html b/src/rocksdb/docs/_includes/plugins/like_button.html
new file mode 100644
index 000000000..bcb8a7bee
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/like_button.html
@@ -0,0 +1,18 @@
+<div class="fb-like pluginWrapper likeButtonWrapper" data-layout="button_count" data-action="like" data-show-faces="true" data-share="true"></div>
+<script>
+  window.fbAsyncInit = function() {
+  FB.init({
+    appId      : '{{ site.fbappid }}',
+    xfbml      : true,
+    version    : 'v2.3'
+  });
+  };
+
+  (function(d, s, id){
+   var js, fjs = d.getElementsByTagName(s)[0];
+   if (d.getElementById(id)) {return;}
+   js = d.createElement(s); js.id = id;
+   js.src = "//connect.facebook.net/en_US/sdk.js";
+   fjs.parentNode.insertBefore(js, fjs);
+   }(document, 'script', 'facebook-jssdk'));
+</script>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/plugin_row.html b/src/rocksdb/docs/_includes/plugins/plugin_row.html
new file mode 100644
index 000000000..800f50b82
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/plugin_row.html
@@ -0,0 +1,5 @@
+<div class="pluginRowBlock">
+{% for child in include.children %}
+  {% include plugins/{{child.type}}.html href=child.href text=child.text %}
+{% endfor %}
+</div>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/post_social_plugins.html b/src/rocksdb/docs/_includes/plugins/post_social_plugins.html
new file mode 100644
index 000000000..a2ecb90ee
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/post_social_plugins.html
@@ -0,0 +1,41 @@
+<div class="postSocialPlugins">
+  <a
+    href="https://twitter.com/share"
+    class="twitter-share-button"
+    data-url="{{ page.url | replace:'index.html','' | absolute_url }}"
+    data-text="{% if page.title %}{{ page.title }}{% else %}{{ site.title }}{% endif %}"
+    data-hashtags="flowtype">Tweet</a>
+  <div
+    class="fb-like"
+    data-href="{{ page.url | replace:'index.html','' | absolute_url }}"
+    data-layout="button_count"
+    data-action="like"
+    data-show-faces="false"
+    data-share="true"></div>
+</div>
+<script>
+  window.fbAsyncInit = function() {
+  FB.init({
+    appId      : '{{ site.fbappid }}',
+    xfbml      : true,
+    version    : 'v2.2'
+  });
+  };
+
+  (function(d, s, id){
+   var js, fjs = d.getElementsByTagName(s)[0];
+   if (d.getElementById(id)) {return;}
+   js = d.createElement(s); js.id = id;
+   js.src = "//connect.facebook.net/en_US/sdk.js";
+   fjs.parentNode.insertBefore(js, fjs);
+   }(document, 'script', 'facebook-jssdk'));
+</script>
+
+<script>!function(d,s,id){
+  var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';
+  if(!d.getElementById(id)){
+  js=d.createElement(s);js.id=id;js.src=p+'://platform.twitter.com/widgets.js';
+  fjs.parentNode.insertBefore(js,fjs);
+  }
+  }(document, 'script', 'twitter-wjs');
+</script>
diff --git a/src/rocksdb/docs/_includes/plugins/slideshow.html b/src/rocksdb/docs/_includes/plugins/slideshow.html
new file mode 100644
index 000000000..69fa2b300
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/slideshow.html
@@ -0,0 +1,88 @@
+<div class="slideshowBlock pluginWrapper" id="slideshow"></div>
+<script>
+  var slideshowData = [
+    {% for image in site.data.slideshow %}
+    {
+      id         : "{{ image.id }}",
+      imagesrc   : "{{ image.src }}",
+      tooltip    : "{{ image.tooltip }}",
+      href       : "{{ image.link }}",
+    },
+    {% endfor %}
+  ];
+</script>
+<script src="http://fb.me/react-with-addons-0.13.1.min.js"></script>
+<script type="text/javascript">
+  var Slideshow = React.createClass({displayName: "Slideshow",
+    getInitialState: function() {
+      return {
+        currentSlide: 0,
+      };
+    },
+    getDefaultProps: function() {
+      return {
+        data: slideshowData,
+      };
+    },
+    handleSelect: function(id) {
+      var index = this.props.data.map(function (el, elIndex) {
+        return (
+          elIndex
+        );
+      });
+      var currentIndex = index.indexOf(id);
+      this.setState({
+        currentSlide: currentIndex,
+      });
+    },
+    render: function() {
+      return (
+        React.createElement("div", {className: "slideshow"},
+          React.createElement("div", {className: "slides"},
+            this.props.data.map(this.renderSlide)
+          ),
+          React.createElement("div", {className: "pagination"},
+            this.props.data.map(this.renderPager)
+          )
+        )
+      );
+    },
+    renderSlide: function(child, index) {
+      var classes = React.addons.classSet({
+        'slide': true,
+        'slideActive': this.state.currentSlide === index,
+      });
+      if (child.href) {
+        return (
+          React.createElement("div", {key: index, className: classes},
+            React.createElement("a", {href: child.href, alt: child.tooltip, title: child.tooltip},
+              React.createElement("img", {src: child.imagesrc, alt: child.tooltip, title: child.tooltip})
+            )
+          )
+        );
+      }
+      return (
+        React.createElement("div", {key: index, className: classes},
+          React.createElement("img", {src: child.imagesrc, alt: child.tooltip})
+        )
+      );
+    },
+    renderPager: function(child, index) {
+      var classes = React.addons.classSet({
+        'pager': true,
+        'pagerActive': this.state.currentSlide === index,
+      });
+      return (
+        React.createElement("span", {key: index, className: classes, onClick: this.handleSelect.bind(this, index)})
+      );
+    },
+  });
+
+  function render(slideshowData) {
+    React.render(
+      React.createElement(Slideshow, {data: slideshowData}),
+      document.getElementById('slideshow')
+    );
+  }
+  render(slideshowData);
+</script>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_includes/plugins/twitter_follow.html b/src/rocksdb/docs/_includes/plugins/twitter_follow.html
new file mode 100644
index 000000000..b0f25dc60
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/twitter_follow.html
@@ -0,0 +1,12 @@
+<div class="pluginBlock">
+  <a href="https://twitter.com/{{ include.href }}" class="twitter-follow-button pluginBlock" data-show-count="false">Follow @{{ include.href }}</a>
+</div>
+
+<script>!function(d,s,id){
+  var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';
+  if(!d.getElementById(id)){
+  js=d.createElement(s);js.id=id;js.src=p+'://platform.twitter.com/widgets.js';
+  fjs.parentNode.insertBefore(js,fjs);
+  }
+  }(document, 'script', 'twitter-wjs');
+</script>
diff --git a/src/rocksdb/docs/_includes/plugins/twitter_share.html b/src/rocksdb/docs/_includes/plugins/twitter_share.html
new file mode 100644
index 000000000..a60f2a8df
--- /dev/null
+++ b/src/rocksdb/docs/_includes/plugins/twitter_share.html
@@ -0,0 +1,11 @@
+<div class="pluginWrapper twitterSharePlugin">
+  <a href="https://twitter.com/share" class="twitter-share-button" data-hashtags="{{ site.title| replace: ' ', '' }}">Tweet</a>
+</div>
+<script>!function(d,s,id){
+  var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';
+  if(!d.getElementById(id)){js=d.createElement(s);
+  js.id=id;js.src=p+'://platform.twitter.com/widgets.js';
+  fjs.parentNode.insertBefore(js,fjs);
+}
+}(document, 'script', 'twitter-wjs');
+</script>
diff --git a/src/rocksdb/docs/_includes/post.html b/src/rocksdb/docs/_includes/post.html
new file mode 100644
index 000000000..3ae0a2a80
--- /dev/null
+++ b/src/rocksdb/docs/_includes/post.html
@@ -0,0 +1,40 @@
+<div class="post">
+  <header class="post-header">
+    <div style="display: flex; align-content: center; align-items: center; justify-content: center">
+    {% for author_idx in page.author %}
+      <div style="padding: 16px; display: inline-block; text-align: center">
+        {% assign author = site.data.authors[author_idx] %}
+        {% if author.fbid %}
+        <div class="authorPhoto">
+          <img src="http://graph.facebook.com/{{ author.fbid }}/picture/" alt="{{ author.fullname }}" title="{{ author.fullname }}" />
+        </div>
+        {% endif %}
+        {% if author.full_name %}
+        <p class="post-authorName">{{ author.full_name }}</p>
+        {% endif %}
+      </div>
+    {% endfor %}
+    </div>
+    <h1 class="post-title">{% if include.truncate %}<a href="{{ page.url | absolute_url }}">{{ page.title }}</a>{% else %}{{ page.title }}{% endif %}</h1>
+    <p class="post-meta">Posted {{ page.date | date: '%B %d, %Y' }}{% if page.meta %} • {{ page.meta }}{% endif %}</p>
+  </header>
+  <article class="post-content">
+  {% if include.truncate %}
+    {% if page.content contains '<!--truncate-->' %}
+      {{ page.content | split:'<!--truncate-->' | first | markdownify }}
+      <div class="read-more">
+        <a href="{{ page.url | absolute_url }}" >
+          Read More
+        </a>
+      </div>
+    {% else %}
+      {{ page.content | markdownify }}
+    {% endif %}
+  {% else %}
+    {{ content }}
+  {% endif %}
+  {% unless include.truncate %}
+    {% include plugins/like_button.html %}
+  {% endunless %}
+  </article>
+</div>
diff --git a/src/rocksdb/docs/_includes/powered_by.html b/src/rocksdb/docs/_includes/powered_by.html
new file mode 100644
index 000000000..c629429cd
--- /dev/null
+++ b/src/rocksdb/docs/_includes/powered_by.html
@@ -0,0 +1,28 @@
+{% if site.data.powered_by.first.items or site.data.powered_by_highlight.first.items %}
+<div class="poweredByContainer">
+  <div class="wrapper mainWrapper poweredByWrapper">
+    {% if site.data.powered_by_highlight.first.title %}
+    <h2>{{ site.data.powered_by_highlight.first.title }}</h2>
+    {% else %}
+    <h2>{{ site.data.powered_by.first.title }}</h2>
+    {% endif %}
+    {% if site.data.powered_by_highlight.first.items %}
+    <div class="poweredByItems">
+      {% for item in site.data.powered_by_highlight.first.items %}
+      <div class="poweredByItem itemLarge">
+        <a href="{{ item.url }}" target="_blank"><img src="{{ item.img }}" alt="{{ item.name }}" /></a>
+      </div>
+      {% endfor %}
+    </div>
+    {% endif %}
+    <div class="poweredByItems">
+      {% for item in site.data.powered_by.first.items %}
+      <div class="poweredByItem itemSmall">
+        <a href="{{ item.url }}" target="_blank">{{ item.name }}</a>
+      </div>
+      {% endfor %}
+    </div>
+    <div class="poweredByMessage">Does your app use {{ site.title }}? Add it to this list with <a href="https://github.com/{{ site.ghrepo }}/edit/gh-pages/_data/powered_by.yml" target="_blank">a pull request!</a></div>
+  </div>
+</div>
+{% endif %}
diff --git a/src/rocksdb/docs/_includes/social_plugins.html b/src/rocksdb/docs/_includes/social_plugins.html
new file mode 100644
index 000000000..9b36580dc
--- /dev/null
+++ b/src/rocksdb/docs/_includes/social_plugins.html
@@ -0,0 +1,31 @@
+<a
+  href="https://twitter.com/share"
+  class="twitter-share-button"
+  data-url="http://facebook.github.io/fresco{{ page.url }}"
+  data-text="Fresco | {{ page.title }}"
+  data-hashtags="fresco">Tweet</a>
+<div
+  class="fb-like"
+  data-href="http://facebook.github.io/fresco{{ page.url }}"
+  data-layout="standard"
+  data-action="like"
+  data-show-faces="true"
+  data-share="true"></div>
+
+<div id="fb-root"></div>
+<script>(function(d, s, id) {
+  var js, fjs = d.getElementsByTagName(s)[0];
+  if (d.getElementById(id)) return;
+  js = d.createElement(s); js.id = id;
+  js.src = "//connect.facebook.net/en_US/sdk.js#xfbml=1&version=v2.0";
+  fjs.parentNode.insertBefore(js, fjs);
+}(document, 'script', 'facebook-jssdk'));</script>
+
+<script>!function(d,s,id){
+  var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';
+  if(!d.getElementById(id)){
+  js=d.createElement(s);js.id=id;js.src=p+'://platform.twitter.com/widgets.js';
+  fjs.parentNode.insertBefore(js,fjs);
+  }
+  }(document, 'script', 'twitter-wjs');
+</script>
diff --git a/src/rocksdb/docs/_includes/ui/button.html b/src/rocksdb/docs/_includes/ui/button.html
new file mode 100644
index 000000000..729ccc33b
--- /dev/null
+++ b/src/rocksdb/docs/_includes/ui/button.html
@@ -0,0 +1 @@
+<span class="buttonWrap {{ include.align }}"><a class="button blockButton fbossFontLight pluginBlock margin{{ include.margin }}" target="{{ include.button_target }}" href="{{ include.button_href }}">{{ include.button_text }}</a></span>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_layouts/basic.html b/src/rocksdb/docs/_layouts/basic.html
new file mode 100644
index 000000000..65bd21060
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/basic.html
@@ -0,0 +1,12 @@
+---
+layout: doc_default
+---
+
+<div class="mainContainer blogContainer postContainer">
+  <div id="main_wrap" class="wrapper mainWrapper">
+    <div class="post basicPost">
+      {{ content }}
+    </div>
+  </div>
+</div>
+
diff --git a/src/rocksdb/docs/_layouts/blog.html b/src/rocksdb/docs/_layouts/blog.html
new file mode 100644
index 000000000..1b0da4135
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/blog.html
@@ -0,0 +1,11 @@
+---
+category: blog
+layout: blog_default
+---
+
+<div class="mainContainer blogContainer postContainer">
+  <div id="main_wrap" class="wrapper mainWrapper">
+    {{ content }}
+  </div>
+</div>
+
diff --git a/src/rocksdb/docs/_layouts/blog_default.html b/src/rocksdb/docs/_layouts/blog_default.html
new file mode 100644
index 000000000..a29d58d3d
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/blog_default.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+  {% include head.html %}
+  <body class="docsNavVisible">
+    {% include nav.html alwayson=true %}
+    <div class="navPusher">
+      <div class="docMainWrapper wrapper">
+      {% include nav/collection_nav.html navdata=site.posts type="blog" sectionpath="/blog/" sectiontitle="Blog" %}
+      {{ content }}
+      </div>
+      {% include footer.html %}
+    </div>
+  </body>
+</html>
diff --git a/src/rocksdb/docs/_layouts/default.html b/src/rocksdb/docs/_layouts/default.html
new file mode 100644
index 000000000..0167d9fd9
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/default.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+  {% include head.html %}
+  <body>
+    {% include nav.html alwayson=true %}
+    <div class="navPusher">
+      {{ content }}
+      {% include footer.html %}
+    </div>
+  </body>
+
+</html>
diff --git a/src/rocksdb/docs/_layouts/doc_default.html b/src/rocksdb/docs/_layouts/doc_default.html
new file mode 100644
index 000000000..4a4139247
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/doc_default.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+  {% include head.html %}
+  <body class="docsNavVisible">
+    {% include nav.html alwayson=true %}
+    <div class="navPusher">
+      <div class="docMainWrapper wrapper">
+      {% include nav/collection_nav.html navdata=site.data.nav_docs type="docs" sectionpath="/docs/" sectiontitle="Docs" data_collection=site.docs %}
+      {{ content }}
+      </div>
+      {% include footer.html %}
+    </div>
+  </body>
+</html>
diff --git a/src/rocksdb/docs/_layouts/doc_page.html b/src/rocksdb/docs/_layouts/doc_page.html
new file mode 100644
index 000000000..dba761e7d
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/doc_page.html
@@ -0,0 +1,10 @@
+---
+layout: doc_default
+---
+
+<div class="mainContainer documentContainer postContainer">
+  <div id="main_wrap" class="wrapper mainWrapper">
+    {{ content }}
+  </div>
+</div>
+
diff --git a/src/rocksdb/docs/_layouts/docs.html b/src/rocksdb/docs/_layouts/docs.html
new file mode 100644
index 000000000..749dafabb
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/docs.html
@@ -0,0 +1,5 @@
+---
+layout: doc_page
+---
+
+{% include doc.html %}
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_layouts/home.html b/src/rocksdb/docs/_layouts/home.html
new file mode 100644
index 000000000..b17732fa1
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/home.html
@@ -0,0 +1,26 @@
+<!DOCTYPE html>
+<html>
+  {% include head.html %}
+  <div class="socialBanner">
+    <div>
+      Support Ukraine 🇺🇦 
+      <a href="https://opensource.facebook.com/support-ukraine">
+        Help Provide Humanitarian Aid to Ukraine
+      </a>
+      .
+    </div>
+  </div>
+  <body>
+    {% include nav.html alwayson=true %}
+    <div class="navPusher">
+      {% include home_header.html %}
+      <div class="mainContainer">
+        <div id="main_wrap" class="wrapper mainWrapper">
+          {{ content }}
+        </div>
+        {% include powered_by.html %}
+      </div>
+      {% include footer.html %}
+    </div>
+  </body>
+</html>
diff --git a/src/rocksdb/docs/_layouts/page.html b/src/rocksdb/docs/_layouts/page.html
new file mode 100644
index 000000000..bec36805b
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/page.html
@@ -0,0 +1,3 @@
+---
+layout: blog
+---
diff --git a/src/rocksdb/docs/_layouts/plain.html b/src/rocksdb/docs/_layouts/plain.html
new file mode 100644
index 000000000..fccc02ce1
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/plain.html
@@ -0,0 +1,10 @@
+---
+layout: default
+---
+
+<div class="mainContainer blogContainer postContainer">
+  <div id="main_wrap" class="wrapper mainWrapper">
+    {{ content }}
+  </div>
+</div>
+
diff --git a/src/rocksdb/docs/_layouts/post.html b/src/rocksdb/docs/_layouts/post.html
new file mode 100644
index 000000000..4c92cf214
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/post.html
@@ -0,0 +1,8 @@
+---
+collection: blog
+layout: blog
+---
+
+<div class="lonePost">
+{% include post.html %}
+</div>
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_layouts/redirect.html b/src/rocksdb/docs/_layouts/redirect.html
new file mode 100644
index 000000000..c24f81748
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/redirect.html
@@ -0,0 +1,6 @@
+<html>
+<head>
+  <meta http-equiv="refresh" content="0; {{ page.destination }}">
+</head>
+<body></body>
+</html>
diff --git a/src/rocksdb/docs/_layouts/top-level.html b/src/rocksdb/docs/_layouts/top-level.html
new file mode 100644
index 000000000..fccc02ce1
--- /dev/null
+++ b/src/rocksdb/docs/_layouts/top-level.html
@@ -0,0 +1,10 @@
+---
+layout: default
+---
+
+<div class="mainContainer blogContainer postContainer">
+  <div id="main_wrap" class="wrapper mainWrapper">
+    {{ content }}
+  </div>
+</div>
+
diff --git a/src/rocksdb/docs/_posts/2014-03-27-how-to-backup-rocksdb.markdown b/src/rocksdb/docs/_posts/2014-03-27-how-to-backup-rocksdb.markdown
new file mode 100644
index 000000000..f9e4a5444
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-03-27-how-to-backup-rocksdb.markdown
@@ -0,0 +1,135 @@
+---
+title: How to backup RocksDB?
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/191/how-to-backup-rocksdb/
+---
+
+In RocksDB, we have implemented an easy way to backup your DB. Here is a simple example:
+
+
+
+    #include "rocksdb/db.h"
+    #include "utilities/backupable_db.h"
+    using namespace rocksdb;
+
+    DB* db;
+    DB::Open(Options(), "/tmp/rocksdb", &db);
+    BackupableDB* backupable_db = new BackupableDB(db, BackupableDBOptions("/tmp/rocksdb_backup"));
+    backupable_db->Put(...); // do your thing
+    backupable_db->CreateNewBackup();
+    delete backupable_db; // no need to also delete db
+
+<!--truncate-->
+
+
+This simple example will create a backup of your DB in "/tmp/rocksdb_backup". Creating new BackupableDB consumes DB* and you should be calling all the DB methods on object `backupable_db` going forward.
+
+Restoring is also easy:
+
+
+
+    RestoreBackupableDB* restore = new RestoreBackupableDB(Env::Default(), BackupableDBOptions("/tmp/rocksdb_backup"));
+    restore->RestoreDBFromLatestBackup("/tmp/rocksdb", "/tmp/rocksdb");
+    delete restore;
+
+
+
+
+This code will restore the backup back to "/tmp/rocksdb". The second parameter is the location of log files (In some DBs they are different from DB directory, but usually they are the same. See Options::wal_dir for more info).
+
+An alternative API for backups is to use BackupEngine directly:
+
+
+
+    #include "rocksdb/db.h"
+    #include "utilities/backupable_db.h"
+    using namespace rocksdb;
+
+    DB* db;
+    DB::Open(Options(), "/tmp/rocksdb", &db);
+    db->Put(...); // do your thing
+    BackupEngine* backup_engine = BackupEngine::NewBackupEngine(Env::Default(), BackupableDBOptions("/tmp/rocksdb_backup"));
+    backup_engine->CreateNewBackup(db);
+    delete db;
+    delete backup_engine;
+
+
+
+
+Restoring with BackupEngine is similar to RestoreBackupableDB:
+
+
+
+    BackupEngine* backup_engine = BackupEngine::NewBackupEngine(Env::Default(), BackupableDBOptions("/tmp/rocksdb_backup"));
+    backup_engine->RestoreDBFromLatestBackup("/tmp/rocksdb", "/tmp/rocksdb");
+    delete backup_engine;
+
+
+
+
+Backups are incremental. You can create a new backup with `CreateNewBackup()` and only the new data will be copied to backup directory (for more details on what gets copied, see "Under the hood"). Checksum is always calculated for any backuped file (including sst, log, and etc). It is used to make sure files are kept sound in the file system. Checksum is also verified for files from the previous backups even though they do not need to be copied. A checksum mismatch aborts the current backup (see "Under the hood" for more details). Once you have more backups saved, you can issue `GetBackupInfo()` call to get a list of all backups together with information on timestamp of the backup and the size (please note that sum of all backups' sizes is bigger than the actual size of the backup directory because some data is shared by multiple backups). Backups are identified by their always-increasing IDs. `GetBackupInfo()` is available both in `BackupableDB` and `RestoreBackupableDB`.
+
+You probably want to keep around only small number of backups. To delete old backups, just call `PurgeOldBackups(N)`, where N is how many backups you'd like to keep. All backups except the N newest ones will be deleted. You can also choose to delete arbitrary backup with call `DeleteBackup(id)`.
+
+`RestoreDBFromLatestBackup()` will restore the DB from the latest consistent backup. An alternative is `RestoreDBFromBackup()` which takes a backup ID and restores that particular backup. Checksum is calculated for any restored file and compared against the one stored during the backup time. If a checksum mismatch is detected, the restore process is aborted and `Status::Corruption` is returned. Very important thing to note here: Let's say you have backups 1, 2, 3, 4. If you restore from backup 2 and start writing more data to your database, newly created backup will delete old backups 3 and 4 and create new backup 3 on top of 2.
+
+
+
+## Advanced usage
+
+
+Let's say you want to backup your DB to HDFS. There is an option in `BackupableDBOptions` to set `backup_env`, which will be used for all file I/O related to backup dir (writes when backuping, reads when restoring). If you set it to HDFS Env, all the backups will be stored in HDFS.
+
+`BackupableDBOptions::info_log` is a Logger object that is used to print out LOG messages if not-nullptr.
+
+If `BackupableDBOptions::sync` is true, we will sync data to disk after every file write, guaranteeing that backups will be consistent after a reboot or if machine crashes. Setting it to false will speed things up a bit, but some (newer) backups might be inconsistent. In most cases, everything should be fine, though.
+
+If you set `BackupableDBOptions::destroy_old_data` to true, creating new `BackupableDB` will delete all the old backups in the backup directory.
+
+`BackupableDB::CreateNewBackup()` method takes a parameter `flush_before_backup`, which is false by default. When `flush_before_backup` is true, `BackupableDB` will first issue a memtable flush and only then copy the DB files to the backup directory. Doing so will prevent log files from being copied to the backup directory (since flush will delete them). If `flush_before_backup` is false, backup will not issue flush before starting the backup. In that case, the backup will also include log files corresponding to live memtables. Backup will be consistent with current state of the database regardless of `flush_before_backup` parameter.
+
+
+
+## Under the hood
+
+
+`BackupableDB` implements `DB` interface and adds four methods to it: `CreateNewBackup()`, `GetBackupInfo()`, `PurgeOldBackups()`, `DeleteBackup()`. Any `DB` interface calls will get forwarded to underlying `DB` object.
+
+When you call `BackupableDB::CreateNewBackup()`, it does the following:
+
+
+
+
+
+  1. Disable file deletions
+
+
+
+  2. Get live files (this includes table files, current and manifest file).
+
+
+
+  3. Copy live files to the backup directory. Since table files are immutable and filenames unique, we don't copy a table file that is already present in the backup directory. For example, if there is a file `00050.sst` already backed up and `GetLiveFiles()` returns `00050.sst`, we will not copy that file to the backup directory. However, checksum is calculated for all files regardless if a file needs to be copied or not. If a file is already present, the calculated checksum is compared against previously calculated checksum to make sure nothing crazy happened between backups. If a mismatch is detected, backup is aborted and the system is restored back to the state before `BackupableDB::CreateNewBackup()` is called. One thing to note is that a backup abortion could mean a corruption from a file in backup directory or the corresponding live file in current DB. Both manifest and current files are copied, since they are not immutable.
+
+
+
+  4. If `flush_before_backup` was set to false, we also need to copy log files to the backup directory. We call `GetSortedWalFiles()` and copy all live files to the backup directory.
+
+
+
+  5. Enable file deletions
+
+
+
+
+Backup IDs are always increasing and we have a file `LATEST_BACKUP` that contains the ID of the latest backup. If we crash in middle of backing up, on a restart we will detect that there are newer backup files than `LATEST_BACKUP` claims there are. In that case, we will delete any backup newer than `LATEST_BACKUP` and clean up all the files since some of the table files might be corrupted. Having corrupted table files in the backup directory is dangerous because of our deduplication strategy.
+
+
+
+## Further reading
+
+
+For the API details, see `include/utilities/backupable_db.h`. For the implementation, see `utilities/backupable/backupable_db.cc`.
diff --git a/src/rocksdb/docs/_posts/2014-03-27-how-to-persist-in-memory-rocksdb-database.markdown b/src/rocksdb/docs/_posts/2014-03-27-how-to-persist-in-memory-rocksdb-database.markdown
new file mode 100644
index 000000000..89ffb2d97
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-03-27-how-to-persist-in-memory-rocksdb-database.markdown
@@ -0,0 +1,54 @@
+---
+title: How to persist in-memory RocksDB database?
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/245/how-to-persist-in-memory-rocksdb-database/
+---
+
+In recent months, we have focused on optimizing RocksDB for in-memory workloads. With growing RAM sizes and strict low-latency requirements, lots of applications decide to keep their entire data in memory. Running in-memory database with RocksDB is easy -- just mount your RocksDB directory on tmpfs or ramfs [1]. Even if the process crashes, RocksDB can recover all of your data from in-memory filesystem. However, what happens if the machine reboots?
+
+<!--truncate-->
+
+In this article we will explain how you can recover your in-memory RocksDB database even after a machine reboot.
+
+Every update to RocksDB is written to two places - one is an in-memory data structure called memtable and second is write-ahead log. Write-ahead log can be used to completely recover the data in memtable. By default, when we flush the memtable to table file, we also delete the current log, since we don't need it anymore for recovery (the data from the log is "persisted" in the table file -- we say that the log file is obsolete). However, if your table file is stored in in-memory file system, you may need the obsolete write-ahead log to recover the data after the machine reboots. Here's how you can do that.
+
+Options::wal_dir is the directory where RocksDB stores write-ahead log files. If you configure this directory to be on flash or disk, you will not lose current log file on machine reboot.
+Options::WAL_ttl_seconds is the timeout when we delete the archived log files. If the timeout is non-zero, obsolete log files will be moved to `archive/` directory under Options::wal_dir. Those archived log files will only be deleted after the specified timeout.
+
+Let's assume Options::wal_dir is a directory on persistent storage and Options::WAL_ttl_seconds is set to one day. To fully recover the DB, we also need to backup the current snapshot of the database (containing table and metadata files) with a frequency of less than one day. RocksDB provides an utility that enables you to easily backup the snapshot of your database. You can learn more about it here: [How to backup RocksDB?](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
+
+You should configure the backup process to avoid backing up log files, since they are already stored in persistent storage. To do that, set BackupableDBOptions::backup_log_files to false.
+
+Restore process by default cleans up entire DB and WAL directory. Since we didn't include log files in the backup, we need to make sure that restoring the database doesn't delete log files in WAL directory. When restoring, configure RestoreOptions::keep_log_file to true. That option will also move any archived log files back to WAL directory, enabling RocksDB to replay all archived log files and rebuild the in-memory database state.
+
+To reiterate, here's what you have to do:
+
+
+
+
+  * Set DB directory to tmpfs or ramfs mounted drive
+
+
+
+  * Set Options::wal_log to a directory on persistent storage
+
+
+
+  * Set Options::WAL_ttl_seconds to T seconds
+
+
+
+  * Backup RocksDB every T/2 seconds, with BackupableDBOptions::backup_log_files = false
+
+
+
+  * When you lose data, restore from backup with RestoreOptions::keep_log_file = true
+
+
+
+
+
+[1] You might also want to consider using [PlainTable format](https://github.com/facebook/rocksdb/wiki/PlainTable-Format) for table files
diff --git a/src/rocksdb/docs/_posts/2014-04-02-the-1st-rocksdb-local-meetup-held-on-march-27-2014.markdown b/src/rocksdb/docs/_posts/2014-04-02-the-1st-rocksdb-local-meetup-held-on-march-27-2014.markdown
new file mode 100644
index 000000000..7ccbdbaad
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-04-02-the-1st-rocksdb-local-meetup-held-on-march-27-2014.markdown
@@ -0,0 +1,53 @@
+---
+title: The 1st RocksDB Local Meetup Held on March 27, 2014
+layout: post
+author: xjin
+category: blog
+redirect_from:
+  - /blog/323/the-1st-rocksdb-local-meetup-held-on-march-27-2014/
+---
+
+On Mar 27, 2014, RocksDB team @ Facebook held the 1st RocksDB local meetup in FB HQ (Menlo Park, California). We invited around 80 guests from 20+ local companies, including LinkedIn, Twitter, Dropbox, Square, Pinterest, MapR, Microsoft and IBM. Finally around 50 guests showed up, totaling around 60% show-up rate.
+
+<!--truncate-->
+
+[![Resize of 20140327_200754](/static/images/Resize-of-20140327_200754-300x225.jpg)](/static/images/Resize-of-20140327_200754-300x225.jpg)
+
+RocksDB team @ Facebook gave four talks about the latest progress and experience on RocksDB:
+
+
+
+
+  * [Supporting a 1PB In-Memory Workload](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Haobo-RocksDB-In-Memory.pdf)
+
+
+
+
+  * [Column Families in RocksDB](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Igor-Column-Families.pdf)
+
+
+
+
+  * ["Lockless" Get() in RocksDB?](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Lei-Lockless-Get.pdf)
+
+
+
+
+  * [Prefix Hashing in RocksDB](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Siying-Prefix-Hash.pdf)
+
+
+A very interesting question asked by a massive number of guests is: does RocksDB plan to provide replication functionality? Obviously, many applications need a resilient and distributed storage solution, not just single-node storage. We are considering how to approach this issue.
+
+When will be the next meetup? We haven't decided yet. We will see whether the community is interested in it and how it can help RocksDB grow.
+
+If you have any questions or feedback for the meetup or RocksDB, please let us know in [our Facebook group](https://www.facebook.com/groups/rocksdb.dev/).
+
+### Comments
+
+**[Rajiv](geetasen@gmail.com)**
+
+Have any of these talks been recorded and if so will they be published?
+
+**[Igor Canadi](icanadi@fb.com)**
+
+Yes, I think we plan to publish them soon.
diff --git a/src/rocksdb/docs/_posts/2014-04-07-rocksdb-2-8-release.markdown b/src/rocksdb/docs/_posts/2014-04-07-rocksdb-2-8-release.markdown
new file mode 100644
index 000000000..7be7842a5
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-04-07-rocksdb-2-8-release.markdown
@@ -0,0 +1,40 @@
+---
+title: RocksDB 2.8 release
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/371/rocksdb-2-8-release/
+---
+
+Check out the new RocksDB 2.8 release on [Github](https://github.com/facebook/rocksdb/releases/tag/2.8.fb).
+
+RocksDB 2.8. is mostly focused on improving performance for in-memory workloads. We are seeing read QPS as high as 5M (we will write a separate blog post on this).
+
+<!--truncate-->
+
+Here is the summary of new features:
+
+  * Added a new table format called PlainTable, which is optimized for RAM storage (ramfs or tmpfs). You can read more details about it on [our wiki](https://github.com/facebook/rocksdb/wiki/PlainTable-Format).
+
+
+  * New prefixed memtable format HashLinkedList, which is optimized for cases where there are only a few keys for each prefix.
+
+
+  * Merge operator supports a new function PartialMergeMulti() that allows users to do partial merges against multiple operands. This function enables big speedups for workloads that use merge operators.
+
+
+  * Added a V2 compaction filter interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB.
+
+
+  * Geo-spatial support for locations and radial-search.
+
+
+  * Improved read performance using thread local cache for frequently accessed data.
+
+
+  * Stability improvements -- we're now ignoring partially written tailing record to MANIFEST or WAL files.
+
+
+
+We have also introduced small incompatible API changes (mostly for advanced users). You can see full release notes in our [HISTORY.my](https://github.com/facebook/rocksdb/blob/2.8.fb/HISTORY.md) file.
diff --git a/src/rocksdb/docs/_posts/2014-04-21-indexing-sst-files-for-better-lookup-performance.markdown b/src/rocksdb/docs/_posts/2014-04-21-indexing-sst-files-for-better-lookup-performance.markdown
new file mode 100644
index 000000000..368055d2c
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-04-21-indexing-sst-files-for-better-lookup-performance.markdown
@@ -0,0 +1,28 @@
+---
+title: Indexing SST Files for Better Lookup Performance
+layout: post
+author: leijin
+category: blog
+redirect_from:
+  - /blog/431/indexing-sst-files-for-better-lookup-performance/
+---
+
+For a `Get()` request, RocksDB goes through mutable memtable, list of immutable memtables, and SST files to look up the target key. SST files are organized in levels.
+
+On level 0, files are sorted based on the time they are flushed. Their key range (as defined by FileMetaData.smallest and FileMetaData.largest) are mostly overlapped with each other. So it needs to look up every L0 file.
+
+<!--truncate-->
+
+Compaction is scheduled periodically to pick up files from an upper level and merges them with files from lower level. As a result, key/values are moved from L0 down the LSM tree gradually. Compaction sorts key/values and split them into files. From level 1 and below, SST files are sorted based on key. Their key range are mutually exclusive. Instead of scanning through each SST file and checking if a key falls into its range, RocksDB performs a binary search based on FileMetaData.largest to locate a candidate file that can potentially contain the target key. This reduces complexity from O(N) to O(log(N)). However, log(N) can still be large for bottom levels. For a fan-out ratio of 10, level 3 can have 1000 files. That requires 10 comparisons to locate a candidate file. This is a significant cost for an in-memory database when you can do [several million gets per second](https://github.com/facebook/rocksdb/wiki/RocksDB-In-Memory-Workload-Performance-Benchmarks).
+
+One observation to this problem is that: after the LSM tree is built, an SST file's position in its level is fixed. Furthermore, its order relative to files from the next level is also fixed. Based on this idea, we can perform [fractional cascading](http://en.wikipedia.org/wiki/Fractional_cascading) kind of optimization to narrow down the binary search range. Here is an example:
+
+[![tree_example](/static/images/tree_example1.png)](/static/images/tree_example1.png)
+
+Level 1 has 2 files and level 2 has 8 files. Now, we want to look up key 80. A binary search based FileMetaData.largest tells you file 1 is the candidate. Then key 80 is compared with its FileMetaData.smallest and FileMetaData.largest to decide if it falls into the range. The comparison shows 80 is less than FileMetaData.smallest (100), so file 1 does not possibly contain key 80. We to proceed to check level 2. Usually, we need to do binary search among all 8 files on level 2. But since we already know target key 80 is less than 100 and only file 1 to file 3 can contain key less than 100, we can safely exclude other files from the search. As a result we cut down the search space from 8 files to 3 files.
+
+Let's look at another example. We want to get key 230. A binary search on level 1 locates to file 2 (this also implies key 230 is larger than file 1's FileMetaData.largest 200). A comparison with file 2's range shows the target key is smaller than file 2's FileMetaData.smallest 300. Even though, we couldn't find key on level 1, we have derived hints that target key is in range between 200 and 300. Any files on level 2 that cannot overlap with [200, 300] can be safely excluded. As a result, we only need to look at file 5 and file 6 on level 2.
+
+Inspired by this concept, we pre-build pointers at compaction time on level 1 files that point to a range of files on level 2. For example, file 1 on level 1 points to file 3 (on level 2) on the left and file 4 on the right. File 2 will point to level 2 files 6 and 7. At query time, these pointers are used to determine the actual binary search range based on comparison result.
+
+Our benchmark shows that this optimization improves lookup QPS by ~5% for similar setup mentioned [here](https://github.com/facebook/rocksdb/wiki/RocksDB-In-Memory-Workload-Performance-Benchmarks).
diff --git a/src/rocksdb/docs/_posts/2014-05-14-lock.markdown b/src/rocksdb/docs/_posts/2014-05-14-lock.markdown
new file mode 100644
index 000000000..12009cc88
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-05-14-lock.markdown
@@ -0,0 +1,88 @@
+---
+title: Reducing Lock Contention in RocksDB
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/521/lock/
+---
+
+In this post, we briefly introduce the recent improvements we did to RocksDB to improve the issue of lock contention costs.
+
+RocksDB has a simple thread synchronization mechanism (See [RocksDB Architecture Guide](https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide)  to understand terms used below, like SST tables or mem tables). SST tables are immutable after being written and mem tables are lock-free data structures supporting single writer and multiple readers. There is only one single major lock, the DB mutex (DBImpl.mutex_) protecting all the meta operations, including:
+
+<!--truncate-->
+
+  * Increase or decrease reference counters of mem tables and SST tables
+
+
+  * Change and check meta data structures, before and after finishing compactions, flushes and new mem table creations
+
+
+  * Coordinating writers
+
+
+This DB mutex used to be scalability bottleneck preventing us from scaling to more than 16 threads. To address the issue, we improved RocksDB in several ways.
+
+1. Consolidate reference counters and introduce "super version". For every read operation, mutex was acquired, and reference counters for each mem table and each SST table were increased. One such operation is not expensive but if you are building a high throughput server with lots of reads, the lock contention will become the bottleneck. This is especially true if you store all your data in RAM.
+
+To solve this problem, we created a meta-meta data structure called “[super version](https://reviews.facebook.net/rROCKSDB1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c)”, which holds reference counters to all those mem table and SST tables, so that readers only need to increase the reference counters for this single data structure. In RocksDB, list of live mem tables and SST tables only changes infrequently, which would happen when new mem tables are created or flush/compaction happens. Now, at those times, a new super version is created with their reference counters increased. A super version lists live mem tables and SST tables so a reader only needs acquire the lock in order to find the latest super version and increase its reference counter. From the super version, the reader can find all the mem and SST tables which are safety accessible as long as the reader holds the reference count for the super version.
+
+2. We replace some reference counters to stc::atomic objects, so that decreasing reference count of an object usually doesn’t need to be inside the mutex any more.
+
+3. Make fetching super version and reference counting lock-free in read queries. After consolidating reference counting to one single super version and removing the locking for decreasing reference counts, in read case, we only acquire mutex for one thing: fetch the latest super version and increase the reference count for that (dereference the counter is done in an atomic decrease). We designed and implemented a (mostly) lock-free approach to do it. See [details](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Lei-Lockless-Get.pdf). We will write a separate blog post for that.
+
+4. Avoid disk I/O inside the mutex. As we know, each disk I/O to hard drives takes several milliseconds. It can be even longer if file system journal is involved or I/Os are queued. Even occasional disk I/O within mutex can cause huge performance outliers.
+We identified in two situations, we might do disk I/O inside mutex and we removed them:
+(1) Opening and closing transactional log files. We moved those operations out of the mutex.
+(2) Information logging. In multiple places we write to logs within mutex. There is a chance that file write will wait for disk I/O to finish before finishing, even if fsync() is not issued, especially in EXT systems. We occasionally see 100+ milliseconds write() latency on EXT. Instead of removing those logging, we came up with a solution of delay logging. When inside mutex, instead of directly writing to the log file, we write to a log buffer, with the timing information. As soon as mutex is released, we flush the log buffer to log files.
+
+5. Reduce object creation inside the mutex.
+Object creation can be slow because it involves malloc (in our case). Malloc sometimes is slow because it needs to lock some shared data structures. Allocating can also be slow because we sometimes do expensive operations in some of our classes' constructors. For these reasons, we try to reduce object creations inside the mutex. Here are two examples:
+
+(1) std::vector uses malloc inside. We introduced “[autovector](https://reviews.facebook.net/rROCKSDBc01676e46d3be08c3c140361ef1f5884f47d3b3c)” data structure, in which memory for first a few elements are pre-allocated as members of the autovector class. When an autovector is used as a stack variable, no malloc will be needed unless the pre-allocated buffer is used up. This autovector is quite useful for manipulating those meta data structures. Those meta operations are often locked inside DB mutex.
+
+(2) When building an iterator, we used to creating iterator of every live men table and SST table within the mutex and a merging iterator on top of them. Besides malloc, some of those iterators can be quite expensive to create, like sorting. Now, instead of doing that, we simply increase the reference counters of them, and release the mutex before creating any iterator.
+
+6. Deal with mutexes in LRU caches.
+When I said there was only one single major lock, I was lying. In RocksDB, all LRU caches had exclusive mutexes within to protect writes to the LRU lists, which are done in both of read and write operations. LRU caches are used in block cache and table cache. Both of them are accessed more frequently than DB data structures. Lock contention of these two locks are as intense as the DB mutex. Even if LRU cache is sharded into ShardedLRUCache, we can still see lock contentions, especially table caches. We further address this issue in two way:
+(1) Bypassing table caches. A table cache maintains list of SST table’s read handlers. Those handlers contain SST files’ descriptors, table metadata, and possibly data indexes, as well as bloom filters. When the table handler needs to be evicted based on LRU, those information is cleared. When the SST table needs to be read and its table handler is not in LRU cache, the table is opened and those metadata is loaded. In some cases, users want to tune the system in a way that table handler evictions should never happen. It is common for high-throughput, low-latency servers. We introduce a mode where table cache is bypassed in read queries. In this mode, all table handlers are cached and accessed directly, so there is no need to query and adjust table caches for reading the database. It is the users’ responsibility to reserve enough resource for it. This mode can be turned on by setting options.max_open_files=-1.
+
+(2) [New PlainTable format](//github.com/facebook/rocksdb/wiki/PlainTable-Format) (optimized for SST in ramfs/tmpfs) does not organize data by blocks. Data are located by memory addresses so no block cache is needed.
+
+With all of those improvements, lock contention is not a bottleneck anymore, which is shown in our [memory-only benchmark](https://github.com/facebook/rocksdb/wiki/RocksDB-In-Memory-Workload-Performance-Benchmarks) . Furthermore, lock contentions are not causing some huge (50 milliseconds+) latency outliers they used to cause.
+
+### Comments
+
+**[Lee Hounshell](lee@apsalar.com)**
+
+Please post an example of reading the same rocksdb concurrently.
+
+We are using the latest 3.0 rocksdb; however, when two separate processes
+try and open the same rocksdb for reading, only one of the open requests
+succeed. The other open always fails with “db/LOCK: Resource temporarily unavailable” So far we have not found an option that allows sharing the rocksdb for reads. An example would be most appreciated.
+
+**[Siying Dong](siying.d@fb.com)**
+
+Sorry for the delay. We don’t have feature support for this scenario yet. Here is an example you can work around this problem. You can build a snapshot of the DB by doing this:
+
+1. create a separate directory on the same host for a snapshot of the DB.
+1. call `DB::DisableFileDeletions()`
+1. call `DB::GetLiveFiles()` to get a full list of the files.
+1. for all the files except manifest, add a hardlink file in your new directory pointing to the original file
+1. copy the manifest file and truncate the size (you can read the comments of `DB::GetLiveFiles()` for more information)
+1. call `DB::EnableFileDeletions()`
+1. now you can open the snapshot directory in another process to access those files. Please remember to delete the directory after reading the data to allow those files to be recycled.
+
+By the way, the best way to ask those questions is in our [facebook group](https://www.facebook.com/groups/rocksdb.dev/). Let us know if you need any further help.
+
+**[Darshan](darshan.ghumare@gmail.com)**
+
+Will this consistency problem of RocksDB all occurs in case of single put/write?
+What all ACID properties is supported by RocksDB, only durability irrespective of single or batch write?
+
+**[Siying Dong](siying.d@fb.com)**
+
+We recently [introduced optimistic transaction](https://reviews.facebook.net/D33435) which can help you ensure all of ACID.
+
+This blog post is mainly about optimizations in implementation. The RocksDB consistency semantic is not changed.
diff --git a/src/rocksdb/docs/_posts/2014-05-19-rocksdb-3-0-release.markdown b/src/rocksdb/docs/_posts/2014-05-19-rocksdb-3-0-release.markdown
new file mode 100644
index 000000000..61c90dc93
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-05-19-rocksdb-3-0-release.markdown
@@ -0,0 +1,24 @@
+---
+title: RocksDB 3.0 release
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/557/rocksdb-3-0-release/
+---
+
+Check out new RocksDB release on [Github](https://github.com/facebook/rocksdb/releases/tag/3.0.fb)!
+
+New features in RocksDB 3.0:
+
+  * [Column Family support](https://github.com/facebook/rocksdb/wiki/Column-Families)
+
+
+  * [Ability to chose different checksum function](https://github.com/facebook/rocksdb/commit/0afc8bc29a5800e3212388c327c750d32e31f3d6)
+
+
+  * Deprecated ReadOptions::prefix_seek and ReadOptions::prefix
+
+<!--truncate-->
+
+Check out the full [change log](https://github.com/facebook/rocksdb/blob/3.0.fb/HISTORY.md).
diff --git a/src/rocksdb/docs/_posts/2014-05-22-rocksdb-3-1-release.markdown b/src/rocksdb/docs/_posts/2014-05-22-rocksdb-3-1-release.markdown
new file mode 100644
index 000000000..30156742b
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-05-22-rocksdb-3-1-release.markdown
@@ -0,0 +1,20 @@
+---
+title: RocksDB 3.1 release
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/575/rocksdb-3-1-release/
+---
+
+Check out the new release on [Github](https://github.com/facebook/rocksdb/releases/tag/rocksdb-3.1)!
+
+New features in RocksDB 3.1:
+
+  * [Materialized hash index](https://github.com/facebook/rocksdb/commit/0b3d03d026a7248e438341264b4c6df339edc1d7)
+
+
+  * [FIFO compaction style](https://github.com/facebook/rocksdb/wiki/FIFO-compaction-style)
+
+
+We released 3.1 so fast after 3.0 because one of our internal customers needed materialized hash index.
diff --git a/src/rocksdb/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown b/src/rocksdb/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
new file mode 100644
index 000000000..6a641f233
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
@@ -0,0 +1,47 @@
+---
+title: PlainTable — A New File Format
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/599/plaintable-a-new-file-format/
+---
+
+In this post, we are introducing "PlainTable" -- a file format we designed for RocksDB, initially to satisfy a production use case at Facebook.
+
+Design goals:
+
+1. All data stored in memory, in files stored in tmpfs/ramfs. Support DBs larger than 100GB (may be sharded across multiple RocksDB instance).
+1. Optimize for [prefix hashing](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Siying-Prefix-Hash.pdf)
+1. Less than or around 1 micro-second average latency for single Get() or Seek().
+1. Minimize memory consumption.
+1. Queries efficiently return empty results
+
+<!--truncate-->
+
+Notice that our priority was not to maximize query performance, but to strike a balance between query performance and memory consumption. PlainTable query performance is not as good as you would see with a nicely-designed hash table, but they are of the same order of magnitude, while keeping memory overhead to a minimum.
+
+Since we are targeting micro-second latency, it is on the level of the number of CPU cache misses (if they cannot be parallellized, which are usually the case for index look-ups). On our target hardware with Intel CPUs of multiple sockets with NUMA, we can only allow 4-5 CPU cache misses (including costs of data TLB).
+
+To meet our requirements, given that only hash prefix iterating is needed, we made two decisions:
+
+1. to use a hash index, which is
+1. directly addressed to rows, with no block structure.
+
+Having addressed our latency goal, the next task was to design a very compact hash index to minimize memory consumption. Some tricks we used to meet this goal:
+
+1. We only use 32-bit integers for data and index offsets.The first bit serves as a flag, so we can avoid using 8-byte pointers.
+1. We never copy keys or parts of keys to index search structures. We store only offsets from which keys can be retrieved, to make comparisons with search keys.
+1. Since our file is immutable, we can accurately estimate the number of hash buckets needed.
+
+To make sure the format works efficiently with empty queries, we added a bloom filter check before the query. This adds only one cache miss for non-empty cases [1], but avoids multiple cache misses for most empty results queries. This is a good trade-off for use cases with a large percentage of empty results.
+
+These are the design goals and basic ideas of PlainTable file format. For detailed information, see [this wiki page](https://github.com/facebook/rocksdb/wiki/PlainTable-Format).
+
+[1] Bloom filter checks typically require multiple memory access. However, because they are independent, they usually do not make the CPU pipeline stale. In any case, we improved the bloom filter to improve data locality - we may cover this further in a future blog post.
+
+### Comments
+
+**[Siying Dong](siying.d@fb.com)**
+
+Does [http://rocksdb.org/feed/](http://rocksdb.org/feed/) work?
diff --git a/src/rocksdb/docs/_posts/2014-06-27-avoid-expensive-locks-in-get.markdown b/src/rocksdb/docs/_posts/2014-06-27-avoid-expensive-locks-in-get.markdown
new file mode 100644
index 000000000..4411c7ae3
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-06-27-avoid-expensive-locks-in-get.markdown
@@ -0,0 +1,89 @@
+---
+title: Avoid Expensive Locks in Get()
+layout: post
+author: leijin
+category: blog
+redirect_from:
+  - /blog/677/avoid-expensive-locks-in-get/
+---
+
+As promised in the previous [blog post](blog/2014/05/14/lock.html)!
+
+RocksDB employs a multiversion concurrency control strategy. Before reading data, it needs to grab the current version, which is encapsulated in a data structure called [SuperVersion](https://reviews.facebook.net/rROCKSDB1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c).
+
+<!--truncate-->
+
+At the beginning of `GetImpl()`, it used to do this:
+
+
+    <span class="zw-portion">mutex_.Lock();
+    </span>auto* s = super_version_->Ref();
+    mutex_.Unlock();
+
+
+The lock is necessary because pointer super_version_ may be updated, the corresponding SuperVersion may be deleted while Ref() is in progress.
+
+
+`Ref()` simply increases the reference counter and returns “this” pointer. However, this simple operation posed big challenges for in-memory workload and stopped RocksDB from scaling read throughput beyond 8 cores. Running 32 read threads on a 32-core CPU leads to [70% system CPU usage](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Lei-Lockless-Get.pdf). This is outrageous!
+
+
+
+
+Luckily, we found a way to circumvent this problem by using [thread local storage](http://en.wikipedia.org/wiki/Thread-local_storage). Version change is a rare event comparable to millions of read requests. On the very first Get() request, each thread pays the mutex cost to acquire a reference to the new super version. Instead of releasing the reference after use, the reference is cached in thread’s local storage. An atomic variable is used to track global super version number. Subsequent reads simply compare the local super version number against the global super version number. If they are the same, the cached super version reference may be used directly, at no cost. If a version change is detected, mutex must be acquired to update the reference. The cost of mutex lock is amortized among millions of reads and becomes negligible.
+
+
+
+
+The code looks something like this:
+
+
+
+
+
+    SuperVersion* s = thread_local_->Get();
+    if (s->version_number != super_version_number_.load()) {
+      // slow path, cleanup of current super version is omitted
+      mutex_.Lock();
+      s = super_version_->Ref();
+      mutex_.Unlock();
+    }
+
+
+
+
+The result is quite amazing. RocksDB can nicely [scale to 32 cores](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Lei-Lockless-Get.pdf) and most CPU time is spent in user land.
+
+
+
+
+Daryl Grove gives a pretty good [comparison between mutex and atomic](https://blogs.oracle.com/d/entry/the_cost_of_mutexes). However, the real cost difference lies beyond what is shown in the assembly code. Mutex can keep threads spinning on CPU or even trigger thread context switches in which all readers compete to access the critical area. Our approach prevents mutual competition by directing threads to check against a global version which does not change at high frequency, and is therefore much more cache-friendly.
+
+
+
+
+The new approach entails one issue: a thread can visit GetImpl() once but can never come back again. SuperVersion is referenced and cached in its thread local storage. All resources (e.g., memtables, files) which belong to that version are frozen. A “supervisor” is required to visit each thread’s local storage and free its resources without incurring a lock. We designed a lockless sweep using CAS (compare and switch instruction). Here is how it works:
+
+
+
+
+(1) A reader thread uses CAS to acquire SuperVersion from its local storage and to put in a special flag (SuperVersion::kSVInUse).
+
+
+
+
+(2) Upon completion of GetImpl(), the reader thread tries to return SuperVersion to local storage by CAS, expecting the special flag (SuperVersion::kSVInUse) in its local storage. If it does not see SuperVersion::kSVInUse, that means a “sweep” was done and the reader thread is responsible for cleanup (this is expensive, but does not happen often on the hot path).
+
+
+
+
+(3) After any flush/compaction, the background thread performs a sweep (CAS) across all threads’ local storage and frees encountered SuperVersion. A reader thread must re-acquire a new SuperVersion reference on its next visit.
+
+### Comments
+
+**[David Barbour](dmbarbour@gmail.com)**
+
+Please post an example of reading the same rocksdb concurrently.
+
+We are using the latest 3.0 rocksdb; however, when two separate processes
+try and open the same rocksdb for reading, only one of the open requests
+succeed. The other open always fails with “db/LOCK: Resource temporarily unavailable” So far we have not found an option that allows sharing the rocksdb for reads. An example would be most appreciated.
diff --git a/src/rocksdb/docs/_posts/2014-06-27-rocksdb-3-2-release.markdown b/src/rocksdb/docs/_posts/2014-06-27-rocksdb-3-2-release.markdown
new file mode 100644
index 000000000..e4eba6af4
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-06-27-rocksdb-3-2-release.markdown
@@ -0,0 +1,30 @@
+---
+title: RocksDB 3.2 release
+layout: post
+author: leijin
+category: blog
+redirect_from:
+  - /blog/647/rocksdb-3-2-release/
+---
+
+Check out new RocksDB release on [GitHub](https://github.com/facebook/rocksdb/releases/tag/rocksdb-3.2)!
+
+New Features in RocksDB 3.2:
+
+  * PlainTable now supports a new key encoding: for keys of the same prefix, the prefix is only written once. It can be enabled through encoding_type paramter of NewPlainTableFactory()
+
+
+  * Add AdaptiveTableFactory, which is used to convert from a DB of PlainTable to BlockBasedTabe, or vise versa. It can be created using NewAdaptiveTableFactory()
+
+<!--truncate-->
+
+Public API changes:
+
+
+  * We removed seek compaction as a concept from RocksDB
+
+
+  * Add two paramters to NewHashLinkListRepFactory() for logging on too many entries in a hash bucket when flushing
+
+
+  * Added new option BlockBasedTableOptions::hash_index_allow_collision. When enabled, prefix hash index for block-based table will not store prefix and allow hash collision, reducing memory consumption
diff --git a/src/rocksdb/docs/_posts/2014-07-29-rocksdb-3-3-release.markdown b/src/rocksdb/docs/_posts/2014-07-29-rocksdb-3-3-release.markdown
new file mode 100644
index 000000000..d858e4faf
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-07-29-rocksdb-3-3-release.markdown
@@ -0,0 +1,34 @@
+---
+title: RocksDB 3.3 Release
+layout: post
+author: yhciang
+category: blog
+redirect_from:
+  - /blog/1301/rocksdb-3-3-release/
+---
+
+Check out new RocksDB release on [GitHub](https://github.com/facebook/rocksdb/releases/tag/rocksdb-3.3)!
+
+New Features in RocksDB 3.3:
+
+  * **JSON API prototype**.
+
+
+  * **Performance improvement on HashLinkList**:  We addressed performance outlier of HashLinkList caused by skewed bucket by switching data in the bucket from linked list to skip list. Add parameter threshold_use_skiplist in NewHashLinkListRepFactory().
+
+<!--truncate-->
+
+  * **More effective on storage space reclaim**:  RocksDB is now able to reclaim storage space more effectively during the compaction process.  This is done by compensating the size of each deletion entry by the 2X average value size, which makes compaction to be triggerred by deletion entries more easily.
+
+
+  * **TimeOut API to write**:  Now WriteOptions have a variable called timeout_hint_us.  With timeout_hint_us set to non-zero, any write associated with this timeout_hint_us may be aborted when it runs longer than the specified timeout_hint_us, and it is guaranteed that any write completes earlier than the specified time-out will not be aborted due to the time-out condition.
+
+
+  * **rate_limiter option**: We added an option that controls total throughput of flush and compaction. The throughput is specified in bytes/sec. Flush always has precedence over compaction when available bandwidth is constrained.
+
+
+
+Public API changes:
+
+
+  * Removed NewTotalOrderPlainTableFactory because it is not used and implemented semantically incorrect.
diff --git a/src/rocksdb/docs/_posts/2014-09-12-cuckoo.markdown b/src/rocksdb/docs/_posts/2014-09-12-cuckoo.markdown
new file mode 100644
index 000000000..22178f7ca
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-09-12-cuckoo.markdown
@@ -0,0 +1,74 @@
+---
+title: Cuckoo Hashing Table Format
+layout: post
+author: radheshyam
+category: blog
+redirect_from:
+  - /blog/1427/new-bloom-filter-format/
+---
+
+## Introduction
+
+We recently introduced a new [Cuckoo Hashing](http://en.wikipedia.org/wiki/Cuckoo_hashing) based SST file format which is optimized for fast point lookups. The new format was built for applications which require very high point lookup rates (~4Mqps) in read only mode but do not use operations like range scan, merge operator, etc. But, the existing RocksDB file formats were built to support range scan and other operations and the current best point lookup in RocksDB is 1.2 Mqps given by [PlainTable](https://github.com/facebook/rocksdb/wiki/PlainTable-Format)[ format](https://github.com/facebook/rocksdb/wiki/PlainTable-Format). This prompted a hashing based file format, which we present here. The new table format uses a cache friendly version of Cuckoo Hashing algorithm with only 1 or 2 memory accesses per lookup.
+
+<!--truncate-->
+
+Goals:
+
+  * Reduce memory accesses per lookup to 1 or 2
+
+
+  * Get an end to end point lookup rate of at least 4 Mqps
+
+
+  * Minimize database size
+
+
+Assumptions:
+
+  * Key length and value length are fixed
+
+
+  * The database is operated in read only mode
+
+
+Non-goals:
+
+
+  * While optimizing the performance of Get() operation was our primary goal, compaction and build times were secondary. We may work on improving them in future.
+
+
+Details for setting up the table format can be found in [GitHub](https://github.com/facebook/rocksdb/wiki/CuckooTable-Format).
+
+
+## Cuckoo Hashing Algorithm
+
+In order to achieve high lookup speeds, we did multiple optimizations, including a cache friendly cuckoo hash algorithm. Cuckoo Hashing uses multiple hash functions, _h1, ..., __hn._
+
+### Original Cuckoo Hashing
+
+To insert any new key _k_, we compute hashes of the key _h1(k), ..., __hn__(k)_. We insert the key in the first hash location that is free. If all the locations are blocked, we try to move one of the colliding keys to a different location by trying to re-insert it.
+
+Finding smallest set of keys to displace in order to accommodate the new key is naturally a shortest path problem in a directed graph where nodes are buckets of hash table and there is an edge from bucket _A_ to bucket _B_ if the element stored in bucket _A_ can be accommodated in bucket _B_ using one of the hash functions. The source nodes are the possible hash locations for the given key _k_ and destination is any one of the empty buckets. We use this algorithm to handle collision.
+
+To retrieve a key _k_, we compute hashes, _h1(k), ..., __hn__(k)_ and the key must be present in one of these locations.
+
+Our goal is to minimize average (and maximum) number of hash functions required and hence the number of memory accesses. In our experiments, with a hash utilization of 90%, we found that the average number of lookups is 1.8 and maximum is 3. Around 44% of keys are accommodated in first hash location and 33% in second location.
+
+
+### Cache Friendly Cuckoo Hashing
+
+We noticed the following two sub-optimal properties in original Cuckoo implementation:
+
+
+  * If the key is not present in first hash location, we jump to second hash location which may not be in cache. This results in many cache misses.
+
+
+  * Because only 44% of keys are located in first cuckoo block, we couldn't have an optimal prefetching strategy - prefetching all hash locations for a key is wasteful. But prefetching only the first hash location helps only 44% of cases.
+
+
+
+The solution is to insert more keys near first location. In case of collision in the first hash location - _h1(k)_, we try to insert it in next few buckets, _h1(k)+1, _h1(k)+2, _..., h1(k)+t-1_. If all of these _t_ locations are occupied, we skip over to next hash function _h2_ and repeat the process. We call the set of _t_ buckets as a _Cuckoo Block_. We chose _t_ such that size of a block is not bigger than a cache line and we prefetch the first cuckoo block.
+
+
+With the new algorithm, for 90% hash utilization, we found that 85% of keys are accommodated in first Cuckoo Block. Prefetching the first cuckoo block yields best results. For a database of 100 million keys with key length 8 and value length 4, the hash algorithm alone can achieve 9.6 Mqps and we are working on improving it further. End to end RocksDB performance results can be found [here](https://github.com/facebook/rocksdb/wiki/CuckooTable-Format).
diff --git a/src/rocksdb/docs/_posts/2014-09-12-new-bloom-filter-format.markdown b/src/rocksdb/docs/_posts/2014-09-12-new-bloom-filter-format.markdown
new file mode 100644
index 000000000..96fa50a40
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-09-12-new-bloom-filter-format.markdown
@@ -0,0 +1,52 @@
+---
+title: New Bloom Filter Format
+layout: post
+author: zagfox
+category: blog
+redirect_from:
+  - /blog/1367/cuckoo/
+---
+
+## Introduction
+
+In this post, we are introducing "full filter block" --- a new bloom filter format for [block based table](https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format). This could bring about 40% of improvement for key query under in-memory (all data stored in memory, files stored in tmpfs/ramfs, an [example](https://github.com/facebook/rocksdb/wiki/RocksDB-In-Memory-Workload-Performance-Benchmarks) workload. The main idea behind is to generate a big filter that covers all the keys in SST file to avoid lots of unnecessary memory look ups.
+
+
+<!--truncate-->
+
+## What is Bloom Filter
+
+In brief, [bloom filter](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter) is a bits array generated for a set of keys that could tell if an arbitrary key may exist in that set.
+
+In RocksDB, we generate such a bloom filter for each SST file. When we conduct a query for a key, we first goes to the bloom filter block of SST file. If key may exist in filter, we goes into data block in SST file to search for the key. If not, we would return directly. So it could help speed up point look up operation a lot.
+
+## Original Bloom Filter Format
+
+Original bloom filter creates filters for each individual data block in SST file. It has complex structure (ref [here](https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format#filter-meta-block)) which results in a lot of non-adjacent memory look ups.
+
+Here's the work flow for checking original bloom filter in block based table:
+
+1. Given the target key, we goes to the index block to get the "data block ID" where this key may reside.
+1. Using the "data block ID", we goes to the filter block and get the correct "offset of filter".
+1. Using the "offset of filter", we goes to the actual filter and do the checking.
+
+## New Bloom Filter Format
+
+New bloom filter creates filter for all keys in SST file and we name it "full filter". The data structure of full filter is very simple, there is just one big filter:
+
+    [ full filter ]
+
+In this way, the work flow of bloom filter checking is much simplified.
+
+(1) Given the target key, we goes directly to the filter block and conduct the filter checking.
+
+To be specific, there would be no checking for index block and no address jumping inside of filter block.
+
+Though it is a big filter, the total filter size would be the same as the original filter.
+
+One little draw back is that the new bloom filter introduces more memory consumption when building SST file because we need to buffer keys (or their hashes) before generating filter. Original filter just creates a bunch of small filters so it just buffer a small amount of keys. For full filter, we buffer hashes of all keys, which would take more memory when SST file size increases.
+
+
+## Usage & Customization
+
+You can refer to the document here for [usage](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#usage-of-new-bloom-filter) and [customization](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#customize-your-own-filterpolicy).
diff --git a/src/rocksdb/docs/_posts/2014-09-15-rocksdb-3-5-release.markdown b/src/rocksdb/docs/_posts/2014-09-15-rocksdb-3-5-release.markdown
new file mode 100644
index 000000000..1878a5a56
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2014-09-15-rocksdb-3-5-release.markdown
@@ -0,0 +1,38 @@
+---
+title: RocksDB 3.5 Release!
+layout: post
+author: leijin
+category: blog
+redirect_from:
+  - /blog/1547/rocksdb-3-5-release/
+---
+
+New RocksDB release - 3.5!
+
+
+**New Features**
+
+
+  1. Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it.
+
+
+  2. new ReadOptions.total_order_seek to force total order seek when block-based table is built with hash index.
+
+<!--truncate-->
+
+**Public API changes**
+
+
+  1. The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key.
+
+
+  2. Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include: no_block_cache, block_cache, block_cache_compressed, block_size, block_size_deviation, block_restart_interval, filter_policy, whole_key_filtering. filter_policy is changed to shared_ptr from a raw pointer.
+
+
+  3. Remove deprecated options: disable_seek_compaction and db_stats_log_interval
+
+
+  4. OptimizeForPointLookup() takes one parameter for block cache size. It now builds hash index, bloom filter, and block cache.
+
+
+[https://github.com/facebook/rocksdb/releases/tag/v3.5](https://github.com/facebook/rocksdb/releases/tag/rocksdb-3.5)
diff --git a/src/rocksdb/docs/_posts/2015-01-16-migrating-from-leveldb-to-rocksdb-2.markdown b/src/rocksdb/docs/_posts/2015-01-16-migrating-from-leveldb-to-rocksdb-2.markdown
new file mode 100644
index 000000000..f18de0bbc
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-01-16-migrating-from-leveldb-to-rocksdb-2.markdown
@@ -0,0 +1,112 @@
+---
+title: Migrating from LevelDB to RocksDB
+layout: post
+author: lgalanis
+category: blog
+redirect_from:
+  - /blog/1811/migrating-from-leveldb-to-rocksdb-2/
+---
+
+If you have an existing application that uses LevelDB and would like to migrate to using RocksDB, one problem you need to overcome is to map the options for LevelDB to proper options for RocksDB. As of release 3.9 this can be automatically done by using our option conversion utility found in rocksdb/utilities/leveldb_options.h. What is needed, is to first replace `leveldb::Options` with `rocksdb::LevelDBOptions`. Then, use `rocksdb::ConvertOptions( )` to convert the `LevelDBOptions` struct into appropriate RocksDB options. Here is an example:
+
+<!--truncate-->
+
+LevelDB code:
+
+```c++
+#include <string>
+#include "leveldb/db.h"
+
+using namespace leveldb;
+
+int main(int argc, char** argv) {
+  DB *db;
+
+  Options opt;
+  opt.create_if_missing = true;
+  opt.max_open_files = 1000;
+  opt.block_size = 4096;
+
+  Status s = DB::Open(opt, "/tmp/mydb", &db);
+
+  delete db;
+}
+```
+
+RocksDB code:
+
+```c++
+#include <string>  
+#include "rocksdb/db.h"  
+#include "rocksdb/utilities/leveldb_options.h"  
+
+using namespace rocksdb;  
+
+int main(int argc, char** argv) {  
+  DB *db;  
+
+  LevelDBOptions opt;  
+  opt.create_if_missing = true;  
+  opt.max_open_files = 1000;  
+  opt.block_size = 4096;  
+
+  Options rocksdb_options = ConvertOptions(opt);  
+  // add rocksdb specific options here  
+
+  Status s = DB::Open(rocksdb_options, "/tmp/mydb_rocks", &db);
+
+  delete db;  
+}  
+```
+
+The difference is:
+
+```diff
+-#include "leveldb/db.h"
++#include "rocksdb/db.h"
++#include "rocksdb/utilities/leveldb_options.h"
+
+-using namespace leveldb;
++using namespace rocksdb;
+
+-  Options opt;
++  LevelDBOptions opt;
+
+-  Status s = DB::Open(opt, "/tmp/mydb", &db);
++  Options rocksdb_options = ConvertOptions(opt);
++  // add rockdb specific options here
++
++  Status s = DB::Open(rocksdb_options, "/tmp/mydb_rocks", &db);
+```
+
+Once you get up and running with RocksDB you can then focus on tuning RocksDB further by modifying the converted options struct.
+
+The reason why ConvertOptions is handy is because a lot of individual options in RocksDB have moved to other structures in different components. For example, block_size is not available in struct rocksdb::Options. It resides in struct rocksdb::BlockBasedTableOptions, which is used to create a TableFactory object that RocksDB uses internally to create the proper TableBuilder objects. If you were to write your application from scratch it would look like this:
+
+RocksDB code from scratch:
+
+```c++
+#include <string>
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+
+using namespace rocksdb;
+
+int main(int argc, char** argv) {
+  DB *db;
+
+  Options opt;
+  opt.create_if_missing = true;
+  opt.max_open_files = 1000;
+
+  BlockBasedTableOptions topt;
+  topt.block_size = 4096;
+  opt.table_factory.reset(NewBlockBasedTableFactory(topt));
+
+  Status s = DB::Open(opt, "/tmp/mydb_rocks", &db);
+
+  delete db;
+}
+```
+
+The LevelDBOptions utility can ease migration to RocksDB from LevelDB and allows us to break down the various options across classes as it is needed.
diff --git a/src/rocksdb/docs/_posts/2015-02-24-reading-rocksdb-options-from-a-file.markdown b/src/rocksdb/docs/_posts/2015-02-24-reading-rocksdb-options-from-a-file.markdown
new file mode 100644
index 000000000..cddc0dd01
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-02-24-reading-rocksdb-options-from-a-file.markdown
@@ -0,0 +1,41 @@
+---
+title: Reading RocksDB options from a file
+layout: post
+author: lgalanis
+category: blog
+redirect_from:
+  - /blog/1883/reading-rocksdb-options-from-a-file/
+---
+
+RocksDB options can be provided using a file or any string to RocksDB. The format is straightforward: `write_buffer_size=1024;max_write_buffer_number=2`. Any whitespace around `=` and `;` is OK. Moreover, options can be nested as necessary. For example `BlockBasedTableOptions` can be nested as follows: `write_buffer_size=1024; max_write_buffer_number=2; block_based_table_factory={block_size=4k};`. Similarly any white space around `{` or `}` is ok. Here is what it looks like in code:
+
+<!--truncate-->
+
+```c++
+#include <string>
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/convenience.h"
+
+using namespace rocksdb;                                                                                           
+
+int main(int argc, char** argv) {                                                                                  
+  DB *db;
+
+  Options opt;
+
+  std::string options_string =                                                                                     
+    "create_if_missing=true;max_open_files=1000;"                                                                  
+    "block_based_table_factory={block_size=4096}";                                                                 
+
+  Status s = GetDBOptionsFromString(opt, options_string, &opt);
+
+  s = DB::Open(opt, "/tmp/mydb_rocks", &db);                                                                       
+
+  // use db
+
+  delete db;
+}
+```
+
+Using `GetDBOptionsFromString` is a convenient way of changing options for your RocksDB application without needing to resort to recompilation or tedious command line parsing.
diff --git a/src/rocksdb/docs/_posts/2015-02-27-write-batch-with-index.markdown b/src/rocksdb/docs/_posts/2015-02-27-write-batch-with-index.markdown
new file mode 100644
index 000000000..7f9f77653
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-02-27-write-batch-with-index.markdown
@@ -0,0 +1,20 @@
+---
+title: 'WriteBatchWithIndex: Utility for Implementing Read-Your-Own-Writes'
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/1901/write-batch-with-index/
+---
+
+RocksDB can be used as a storage engine of a higher level database. In fact, we are currently plugging RocksDB into MySQL and MongoDB as one of their storage engines. RocksDB can help with guaranteeing some of the ACID properties: durability is guaranteed by RocksDB by design; while consistency and isolation need to be enforced by concurrency controls on top of RocksDB; Atomicity can be implemented by committing a transaction's writes with one write batch to RocksDB in the end.
+
+<!--truncate-->
+
+However, if we enforce atomicity by only committing all writes in the end of the transaction in one batch, you cannot get the updated value from RocksDB previously written by the same transaction (read-your-own-write). To read the updated value, the databases on top of RocksDB need to maintain an internal buffer for all the written keys, and when a read happens they need to merge the result from RocksDB and from this buffer. This is a problem we faced when building the RocksDB storage engine in MongoDB. We solved it by creating a utility class, WriteBatchWithIndex (a write batch with a searchable index) and made it part of public API so that the community can also benefit from it.
+
+Before talking about the index part, let me introduce write batch first. The write batch class, `WriteBatch`, is a RocksDB data structure for atomic writes of multiple keys. Users can buffer their updates to a `WriteBatch` by calling `write_batch.Put("key1", "value1")` or `write_batch.Delete("key2")`, similar as calling RocksDB's functions of the same names. In the end, they call `db->Write(write_batch)` to atomically update all those batched operations to the DB. It is how a database can guarantee atomicity, as shown above. Adding a searchable index to `WriteBatch`, we now have `WriteBatchWithIndex`. Users can put updates to WriteBatchIndex in the same way as to `WriteBatch`. In the end, users can get a `WriteBatch` object from it and issue `db->Write()`. Additionally, users can create an iterator of a WriteBatchWithIndex, seek to any key location and iterate from there.
+
+To implement read-your-own-write using `WriteBatchWithIndex`, every time the user creates a transaction, we create a `WriteBatchWithIndex` attached to it. All the writes of the transaction go to the `WriteBatchWithIndex` first. When we commit the transaction, we atomically write the batch to RocksDB. When the user wants to call `Get()`, we first check if the value exists in the `WriteBatchWithIndex` and return the value if existing, by seeking and reading from an iterator of the write batch, before checking data in RocksDB. For example, here is the we implement it in MongoDB's RocksDB storage engine: [link](https://github.com/mongodb/mongo/blob/a31cc114a89a3645e97645805ba77db32c433dce/src/mongo/db/storage/rocks/rocks_recovery_unit.cpp#L245-L260). If a range query comes, we pass a DB's iterator to `WriteBatchWithIndex`, which creates a super iterator which combines the results from the DB iterator with the batch's iterator. Using this super iterator, we can iterate the DB with the transaction's own writes. Here is the iterator creation codes in MongoDB's RocksDB storage engine: [link](https://github.com/mongodb/mongo/blob/a31cc114a89a3645e97645805ba77db32c433dce/src/mongo/db/storage/rocks/rocks_recovery_unit.cpp#L266-L269). In this way, the database can solve the read-your-own-write problem by using RocksDB to handle a transaction's uncommitted writes.
+
+Using `WriteBatchWithIndex`, we successfully implemented read-your-own-writes in the RocksDB storage engine of MongoDB. If you also have a read-your-own-write problem, `WriteBatchWithIndex` can help you implement it quickly and correctly.
diff --git a/src/rocksdb/docs/_posts/2015-04-22-integrating-rocksdb-with-mongodb-2.markdown b/src/rocksdb/docs/_posts/2015-04-22-integrating-rocksdb-with-mongodb-2.markdown
new file mode 100644
index 000000000..1ffe2c532
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-04-22-integrating-rocksdb-with-mongodb-2.markdown
@@ -0,0 +1,16 @@
+---
+title: Integrating RocksDB with MongoDB
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/1967/integrating-rocksdb-with-mongodb-2/
+---
+
+Over the last couple of years, we have been busy integrating RocksDB with various services here at Facebook that needed to store key-value pairs locally. We have also seen other companies using RocksDB as local storage components of their distributed systems.
+
+<!--truncate-->
+
+The next big challenge for us is to bring RocksDB storage engine to general purpose databases. Today we have an exciting milestone to share with our community! We're running MongoDB with RocksDB in production and seeing great results! You can read more about it here: [http://blog.parse.com/announcements/mongodb-rocksdb-parse/](http://blog.parse.com/announcements/mongodb-rocksdb-parse/)
+
+Keep tuned for benchmarks and more stability and performance improvements.
diff --git a/src/rocksdb/docs/_posts/2015-06-12-rocksdb-in-osquery.markdown b/src/rocksdb/docs/_posts/2015-06-12-rocksdb-in-osquery.markdown
new file mode 100644
index 000000000..f3a55faae
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-06-12-rocksdb-in-osquery.markdown
@@ -0,0 +1,10 @@
+---
+title: RocksDB in osquery
+layout: post
+author: icanadi
+category: lgalanis
+redirect_from:
+  - /blog/1997/rocksdb-in-osquery/
+---
+
+Check out [this](https://code.facebook.com/posts/1411870269134471/how-rocksdb-is-used-in-osquery/) blog post by [Mike Arpaia](https://www.facebook.com/mike.arpaia) and [Ted Reed](https://www.facebook.com/treeded) about how osquery leverages RocksDB to build an embedded pub-sub system. This article is a great read and contains insights on how to properly use RocksDB.
diff --git a/src/rocksdb/docs/_posts/2015-07-15-rocksdb-2015-h2-roadmap.markdown b/src/rocksdb/docs/_posts/2015-07-15-rocksdb-2015-h2-roadmap.markdown
new file mode 100644
index 000000000..b3e2703fc
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-07-15-rocksdb-2015-h2-roadmap.markdown
@@ -0,0 +1,92 @@
+---
+title: RocksDB 2015 H2 roadmap
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/2015/rocksdb-2015-h2-roadmap/
+---
+
+Every 6 months, RocksDB team gets together to prioritize the work ahead of us. We just went through this exercise and we wanted to share the results with the community. Here's what RocksDB team will be focusing on for the next 6 months:
+
+<!--truncate-->
+
+**MyRocks**
+
+As you might know, we're working hard to integrate RocksDB as a storage engine for MySQL. This project is pretty important for us because we're heavy users of MySQL. We're already getting pretty good performance results, but there is more work to be done. We need to focus on both performance and stability. The most high priority items on are list are:
+
+
+
+
+  1. Reduce CPU costs of RocksDB as a MySQL storage engine
+
+
+  2. Implement pessimistic concurrency control to support repeatable read isolation level in MyRocks
+
+
+  3. Reduce P99 read latency, which is high mostly because of lingering tombstones
+
+
+  4. Port ZSTD compression
+
+
+**MongoRocks**
+
+Another database that we're working on is MongoDB. The project of integrating MongoDB with RocksDB storage engine is called MongoRocks. It's already running in production at Parse [1] and we're seeing surprisingly few issues. Our plans for the next half:
+
+
+
+
+  1. Keep improving performance and stability, possibly reuse work done on MyRocks (workloads are pretty similar).
+
+
+  2. Increase internal and external adoption.
+
+
+  3. Support new MongoDB 3.2.
+
+
+**RocksDB on cheaper storage media**
+
+Up to now, our mission was to build the best key-value store “for fast storage” (flash and in-memory). However, there are some use-cases at Facebook that don't need expensive high-end storage. In the next six months, we plan to deploy RocksDB on cheaper storage media. We will optimize performance to RocksDB on either or both:
+
+
+
+
+  1. Hard drive storage array.
+
+
+  2. Tiered Storage.
+
+
+**Quality of Service**
+
+When talking to our customers, there are couple of issues that keep reoccurring. We need to fix them to make our customers happy. We will improve RocksDB to provide better assurance of performance and resource usage. Non-exhaustive list includes:
+
+
+
+
+  1. Iterate P99 can be high due to the presence of tombstones.
+
+
+  2. Write stalls can happen during high write loads.
+
+
+  3. Better control of memory and disk usage.
+
+
+  4. Service quality and performance of backup engine.
+
+
+**Operation's user experience**
+
+As we increase deployment of RocksDB, engineers are spending more time on debugging RocksDB issues. We plan to improve user experience when running RocksDB. The goal is to reduce TTD (time-to-debug). The work includes monitoring, visualizations and documentations.
+
+[1]( http://blog.parse.com/announcements/mongodb-rocksdb-parse/](http://blog.parse.com/announcements/mongodb-rocksdb-parse/)
+
+
+### Comments
+
+**[Mike](allspace2012@outlook.com)**
+
+What’s the status of this roadmap? “RocksDB on cheaper storage media”, has this been implemented?
diff --git a/src/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown b/src/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown
new file mode 100644
index 000000000..53c1f5a90
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown
@@ -0,0 +1,78 @@
+---
+title: Spatial indexing in RocksDB
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/2039/spatial-indexing-in-rocksdb/
+---
+
+About a year ago, there was a need to develop a spatial database at Facebook. We needed to store and index Earth's map data. Before building our own, we looked at the existing spatial databases. They were all very good technology, but also general purpose. We could sacrifice a general-purpose API, so we thought we could build a more performant database, since it would be specifically designed for our use-case. Furthermore, we decided to build the spatial database on top of RocksDB, because we have a lot of operational experience with running and tuning RocksDB at a large scale.
+
+<!--truncate-->
+
+When we started looking at this project, the first thing that surprised us was that our planet is not that big. Earth's entire map data can fit in memory on a reasonably high-end machine. Thus, we also decided to build a spatial database optimized for memory-resident dataset.
+
+The first use-case of our spatial database was an experimental map renderer. As part of our project, we successfully loaded [Open Street Maps](https://www.openstreetmap.org/) dataset and hooked it up with [Mapnik](http://mapnik.org/), a map rendering engine.
+
+The usual Mapnik workflow is to load the map data into a SQL-based database and then define map layers with SQL statements. To render a tile, Mapnik needs to execute a couple of SQL queries. The benefit of this approach is that you don't need to reload your database when you change your map style. You can just change your SQL query and Mapnik picks it up. In our model, we decided to precompute the features we need for each tile. We need to know the map style before we create the database. However, when rendering the map tile, we only fetch the features that we need to render.
+
+We haven't open sourced the RocksDB Mapnik plugin or the database loading pipeline. However, the spatial indexing is available in RocksDB under a name [SpatialDB](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/utilities/spatial_db.h). The API is focused on map rendering use-case, but we hope that it can also be used for other spatial-based applications.
+
+Let's take a tour of the API. When you create a spatial database, you specify the spatial indexes that need to be built. Each spatial index is defined by a bounding box and granularity. For map rendering, we create a spatial index for each zoom levels. Higher zoom levels have more granularity.
+
+
+
+    SpatialDB::Create(
+      SpatialDBOptions(),
+      "/data/map", {
+        SpatialIndexOptions("zoom10", BoundingBox(0, 0, 100, 100), 10),
+        SpatialIndexOptions("zoom16", BoundingBox(0, 0, 100, 100), 16)
+      }
+    );
+
+
+
+
+When you insert a feature (building, street, country border) into SpatialDB, you need to specify the list of spatial indexes that will index the feature. In the loading phase we process the map style to determine the list of zoom levels on which we'll render the feature. For example, we will not render the building on zoom level that shows an entire country. Building will only be indexed on higher zoom level's index. Country borders will be indexes on all zoom levels.
+
+
+
+    FeatureSet feature;
+    feature.Set("type", "building");
+    feature.Set("height", 6);
+    db->Insert(WriteOptions(), BoundingBox<double>(5, 5, 10, 10),
+               well_known_binary_blob, feature, {"zoom16"});
+
+
+
+
+The indexing part is pretty simple. For each feature, we first find a list of index tiles that it intersects. Then, we add a link from the tile's [quad key](https://msdn.microsoft.com/en-us/library/bb259689.aspx) to the feature's primary key. Using quad keys improves data locality, i.e. features closer together geographically will have similar quad keys. Even though we're optimizing for a memory-resident dataset, data locality is still very important due to different caching effects.
+
+After you're done inserting all the features, you can call an API Compact() that will compact the dataset and speed up read queries.
+
+
+
+    db->Compact();
+
+
+
+
+SpatialDB's query specifies: 1) bounding box we're interested in, and 2) a zoom level. We find all tiles that intersect with the query's bounding box and return all features in those tiles.
+
+
+
+
+    Cursor* c = db_->Query(ReadOptions(), BoundingBox<double>(1, 1, 7, 7), "zoom16");
+    for (c->Valid(); c->Next()) {
+        Render(c->blob(), c->feature_set());
+    }
+
+
+
+
+Note: `Render()` function is not part of RocksDB. You will need to use one of many open source map renderers, for example check out [Mapnik](http://mapnik.org/).
+
+TL;DR If you need an embedded spatial database, check out RocksDB's SpatialDB. [Let us know](https://www.facebook.com/groups/rocksdb.dev/) how we can make it better.
+
+If you're interested in learning more, check out this [talk](https://www.youtube.com/watch?v=T1jWsDMONM8).
diff --git a/src/rocksdb/docs/_posts/2015-07-22-rocksdb-is-now-available-in-windows-platform.markdown b/src/rocksdb/docs/_posts/2015-07-22-rocksdb-is-now-available-in-windows-platform.markdown
new file mode 100644
index 000000000..b6bb47d53
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-07-22-rocksdb-is-now-available-in-windows-platform.markdown
@@ -0,0 +1,30 @@
+---
+title: RocksDB is now available in Windows Platform
+layout: post
+author: dmitrism
+category: blog
+redirect_from:
+  - /blog/2033/rocksdb-is-now-available-in-windows-platform/
+---
+
+Over the past 6 months we have seen a number of use cases where RocksDB is successfully used by the community and various companies to achieve high throughput and volume in a modern server environment.
+
+We at Microsoft Bing could not be left behind. As a result we are happy to [announce](http://bit.ly/1OmWBT9) the availability of the Windows Port created here at Microsoft which we intend to use as a storage option for one of our key/value data stores.
+
+<!--truncate-->
+
+We are happy to make this available for the community. Keep tuned for more announcements to come.
+
+### Comments
+
+**[Siying Dong](siying.d@fb.com)**
+
+Appreciate your contributions to RocksDB project! I believe it will benefits many users!
+
+**[empresas sevilla](oxofkx@gmail.com)**
+
+Magnifico artículo|, un placer leer el blog
+
+**[jak usunac](tomogedac@o2.pl)**
+
+I believe it will benefits too
diff --git a/src/rocksdb/docs/_posts/2015-07-23-dynamic-level.markdown b/src/rocksdb/docs/_posts/2015-07-23-dynamic-level.markdown
new file mode 100644
index 000000000..0ff3a0542
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-07-23-dynamic-level.markdown
@@ -0,0 +1,29 @@
+---
+title: Dynamic Level Size for Level-Based Compaction
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/2207/dynamic-level/
+---
+
+In this article, we follow up on the first part of an answer to one of the questions in our [AMA](https://www.reddit.com/r/IAmA/comments/3de3cv/we_are_rocksdb_engineering_team_ask_us_anything/ct4a8tb), the dynamic level size in level-based compaction.
+
+<!--truncate-->
+
+Level-based compaction is the original LevelDB compaction style and one of the two major compaction styles in RocksDB (See [our wiki](https://github.com/facebook/rocksdb/wiki/RocksDB-Basics#multi-threaded-compactions)). In RocksDB we introduced parallelism and more configurable options to it but the main algorithm stayed the same, until we recently introduced the dynamic level size mode.
+
+
+In level-based compaction, we organize data to different sorted runs, called levels. Each level has a target size.  Usually target size of levels increases by the same size multiplier. For example, you can set target size of level 1 to be 1GB, and size multiplier to be 10, and the target size of level 1, 2, 3, 4 will be 1GB, 10GB, 100GB and 1000GB. Before level 1, there will be some staging file flushed from mem tables, called Level 0 files, which will later be merged to level 1. Compactions will be triggered as soon as actual size of a level exceeds its target size. We will merge a subset of data of that level to next level, to reduce size of the level. More compactions will be triggered until sizes of all the levels are lower than their target sizes. In a steady state, the size of each level will be around the same size of the size of level targets.
+
+
+Level-based compaction’s advantage is its good space efficiency. We usually use the metric space amplification to measure the space efficiency. In this article ignore the effects of data compression so space amplification= size_on_file_system / size_of_user_data.
+
+
+How do we estimate space amplification of level-based compaction? We focus specifically on the databases in steady state, which means database size is stable or grows slowly over time. This means updates will add roughly the same or little more data than what is removed by deletes. Given that, if we compact all the data all to the last level, the size of level will be equal as the size of last level before the compaction. On the other hand, the size of user data will be approximately the size of DB if we compact all the levels down to the last level. So the size of the last level will be a good estimation of user data size. So total size of the DB divided by the size of the last level will be a good estimation of space amplification.
+
+
+Applying the equation, if we have four non-zero levels, their sizes are 1GB, 10GB, 100GB, 1000GB, the size amplification will be approximately (1000GB + 100GB + 10GB + 1GB) / 1000GB = 1.111, which is a very good number. However, there is a catch here: how to make sure the last level’s size is 1000GB, the same as the level’s size target? A user has to fine tune level sizes to achieve this number and will need to re-tune if DB size changes. The theoretic number 1.11 is hard to achieve in practice. In a worse case, if you have the target size of last level to be 1000GB but the user data is only 200GB, then the actual space amplification will be (200GB + 100GB + 10GB + 1GB) / 200GB = 1.555, a much worse number.
+
+
+To solve this problem, my colleague Igor Kabiljo came up with a solution of dynamic level size target mode. You can enable it by setting options.level_compaction_dynamic_level_bytes=true. In this mode, size target of levels are changed dynamically based on size of the last level. Suppose the level size multiplier to be 10, and the DB size is 200GB. The target size of the last level is automatically set to be the actual size of the level, which is 200GB, the second to last level’s size target will be automatically set to be size_last_level / 10 = 20GB, the third last level’s will be size_last_level/100 = 2GB, and next level to be size_last_level/1000 = 200MB. We stop here because 200MB is within the range of the first level. In this way, we can achieve the 1.111 space amplification, without fine tuning of the level size targets. More details can be found in [code comments of the option](https://github.com/facebook/rocksdb/blob/v3.11/include/rocksdb/options.h#L366-L423) in the header file.
diff --git a/src/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown b/src/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown
new file mode 100644
index 000000000..92f743adc
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown
@@ -0,0 +1,193 @@
+---
+title: GetThreadList
+layout: post
+author: yhciang
+category: blog
+redirect_from:
+  - /blog/2261/getthreadlist/
+---
+
+We recently added a new API, called `GetThreadList()`, that exposes the RocksDB background thread activity. With this feature, developers will be able to obtain the real-time information about the currently running compactions and flushes such as the input / output size, elapsed time, the number of bytes it has written. Below is an example output of `GetThreadList`.  To better illustrate the example, we have put a sample output of `GetThreadList` into a table where each column represents a thread status:
+
+<!--truncate-->
+
+<table width="637" >
+<tbody >
+<tr style="border:2px solid #000000" >
+
+<td style="padding:3px" >ThreadID
+</td>
+
+<td >140716395198208
+</td>
+
+<td >140716416169728
+</td>
+</tr>
+<tr >
+
+<td style="padding:3px" >DB
+</td>
+
+<td >db1
+</td>
+
+<td >db2
+</td>
+</tr>
+<tr >
+
+<td style="padding:3px" >CF
+</td>
+
+<td >default
+</td>
+
+<td >picachu
+</td>
+</tr>
+<tr >
+
+<td style="padding:3px" >ThreadType
+</td>
+
+<td >High Pri
+</td>
+
+<td >Low Pri
+</td>
+</tr>
+<tr >
+
+<td style="padding:3px" >Operation
+</td>
+
+<td >Flush
+</td>
+
+<td >Compaction
+</td>
+</tr>
+<tr >
+
+<td style="padding:3px" >ElapsedTime
+</td>
+
+<td >143.459 ms
+</td>
+
+<td >607.538 ms
+</td>
+</tr>
+<tr >
+
+<td style="padding:3px" >Stage
+</td>
+
+<td >FlushJob::WriteLevel0Table
+</td>
+
+<td >CompactionJob::Install
+</td>
+</tr>
+<tr >
+
+<td style="vertical-align:top;padding:3px" >OperationProperties
+</td>
+
+<td style="vertical-align:top;padding:3px" >
+BytesMemtables 4092938
+BytesWritten 1050701
+</td>
+
+<td style="vertical-align:top" >
+BaseInputLevel 1
+BytesRead 4876417
+BytesWritten 4140109
+IsDeletion 0
+IsManual 0
+IsTrivialMove 0
+JobID 146
+OutputLevel 2
+TotalInputBytes 4883044
+</td>
+</tr>
+</tbody>
+</table>
+
+In the above output, we can see `GetThreadList()` reports the activity of two threads: one thread running flush job (middle column) and the other thread running a compaction job (right-most column).  In each thread status, it shows basic information about the thread such as thread id, it's target db / column family, and the job it is currently doing and the current status of the job.  For instance, we can see thread 140716416169728 is doing compaction on the `picachu` column family in database `db2`.  In addition, we can see the compaction has been running for 600 ms, and it has read 4876417 bytes out of 4883044 bytes. This indicates the compaction is about to complete.  The stage property indicates which code block the thread is currently executing.  For instance, thread 140716416169728 is currently running `CompactionJob::Install`, which further indicates the compaction job is almost done.
+
+Below we briefly describe its API.
+
+
+## How to Enable it?
+
+
+To enable thread-tracking of a rocksdb instance, simply set `enable_thread_tracking` to true in its DBOptions:
+
+```c++
+// If true, then the status of the threads involved in this DB will
+// be tracked and available via GetThreadList() API.
+//
+// Default: false
+bool enable_thread_tracking;
+```
+
+
+
+## The API
+
+
+The GetThreadList API is defined in [include/rocksdb/env.h](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/env.h#L317-L318), which is an Env
+function:
+
+```c++
+virtual Status GetThreadList(std::vector* thread_list)
+```
+
+Since an Env can be shared across multiple rocksdb instances, the output of
+`GetThreadList()` include the background activity of all the rocksdb instances
+that using the same Env.
+
+The `GetThreadList()` API simply returns a vector of `ThreadStatus`, each describes
+the current status of a thread. The `ThreadStatus` structure, defined in
+[include/rocksdb/thread_status.h](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/thread_status.h), contains the following information:
+
+```c++
+// An unique ID for the thread.
+const uint64_t thread_id;
+
+// The type of the thread, it could be HIGH_PRIORITY,
+// LOW_PRIORITY, and USER
+const ThreadType thread_type;
+
+// The name of the DB instance where the thread is currently
+// involved with. It would be set to empty string if the thread
+// does not involve in any DB operation.
+const std::string db_name;
+
+// The name of the column family where the thread is currently
+// It would be set to empty string if the thread does not involve
+// in any column family.
+const std::string cf_name;
+
+// The operation (high-level action) that the current thread is involved.
+const OperationType operation_type;
+
+// The elapsed time in micros of the current thread operation.
+const uint64_t op_elapsed_micros;
+
+// An integer showing the current stage where the thread is involved
+// in the current operation.
+const OperationStage operation_stage;
+
+// A list of properties that describe some details about the current
+// operation. Same field in op_properties[] might have different
+// meanings for different operations.
+uint64_t op_properties[kNumOperationProperties];
+
+// The state (lower-level action) that the current thread is involved.
+const StateType state_type;
+```
+
+If you are interested in the background thread activity of your RocksDB application, please feel free to give `GetThreadList()` a try :)
diff --git a/src/rocksdb/docs/_posts/2015-11-10-use-checkpoints-for-efficient-snapshots.markdown b/src/rocksdb/docs/_posts/2015-11-10-use-checkpoints-for-efficient-snapshots.markdown
new file mode 100644
index 000000000..6852b8ffa
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-11-10-use-checkpoints-for-efficient-snapshots.markdown
@@ -0,0 +1,45 @@
+---
+title: Use Checkpoints for Efficient Snapshots
+layout: post
+author: rven2
+category: blog
+redirect_from:
+  - /blog/2609/use-checkpoints-for-efficient-snapshots/
+---
+
+**Checkpoint** is a feature in RocksDB which provides the ability to take a snapshot of a running RocksDB database in a separate directory. Checkpoints can be used as a point in time snapshot, which can be opened Read-only to query rows as of the point in time or as a Writeable snapshot by opening it Read-Write. Checkpoints can be used for both full and incremental backups.
+
+<!--truncate-->
+
+
+The Checkpoint feature enables RocksDB to create a consistent snapshot of a given RocksDB database in the specified directory. If the snapshot is on the same filesystem as the original database, the SST files will be hard-linked, otherwise SST files will be copied. The manifest and CURRENT files will be copied. In addition, if there are multiple column families, log files will be copied for the period covering the start and end of the checkpoint, in order to provide a consistent snapshot across column families.
+
+
+
+
+A Checkpoint object needs to be created for a database before checkpoints are created. The API is as follows:
+
+
+
+
+`Status Create(DB* db, Checkpoint** checkpoint_ptr);`
+
+
+
+
+Given a checkpoint object and a directory, the CreateCheckpoint function creates a consistent snapshot of the database in the given directory.
+
+
+
+
+`Status CreateCheckpoint(const std::string& checkpoint_dir);`
+
+
+
+
+The directory should not already exist and will be created by this API. The directory will be an absolute path. The checkpoint can be used as a read-only copy of the DB or can be opened as a standalone DB. When opened read/write, the SST files continue to be hard links and these links are removed when the files are obsoleted. When the user is done with the snapshot, the user can delete the directory to remove the snapshot.
+
+
+
+
+Checkpoints are used for online backup in MyRocks. which is MySQL using RocksDB as the storage engine . ([MySQL on RocksDB](https://github.com/facebook/mysql-5.6)) 
diff --git a/src/rocksdb/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown b/src/rocksdb/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
new file mode 100644
index 000000000..b21b04fe3
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
@@ -0,0 +1,244 @@
+---
+title: Analysis File Read Latency by Level
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/2537/analysis-file-read-latency-by-level/
+---
+
+In many use cases of RocksDB, people rely on OS page cache for caching compressed data. With this approach, verifying effective of the OS page caching is challenging, because file system is a black box to users.
+
+As an example, a user can tune the DB as following: use level-based compaction, with L1 - L4 sizes to be 1GB, 10GB, 100GB and 1TB. And they reserve about 20GB memory as OS page cache, expecting level 0, 1 and 2 are mostly cached in memory, leaving only reads from level 3 and 4 requiring disk I/Os. However, in practice, it's not easy to verify whether OS page cache does exactly what we expect. For example, if we end up with doing 4 instead of 2 I/Os per query, it's not easy for users to figure out whether the it's because of efficiency of OS page cache or reading multiple blocks for a level. Analysis like it is especially important if users run RocksDB on hard drive disks, for the gap of latency between hard drives and memory is much higher than flash-based SSDs.
+
+<!--truncate-->
+
+In order to make tuning easier, we added new instrumentation to help users analysis latency distribution of file reads in different levels. If users turn DB statistics on, we always keep track of distribution of file read latency for each level. Users can retrieve the information by querying DB property “rocksdb.stats” ( [https://github.com/facebook/rocksdb/blob/v3.13.1/include/rocksdb/db.h#L315-L316](https://github.com/facebook/rocksdb/blob/v3.13.1/include/rocksdb/db.h#L315-L316) ). It will also printed out as a part of compaction summary in info logs periodically.
+
+The output looks like this:
+
+
+```
+** Level 0 read latency histogram (micros):
+Count: 696 Average: 489.8118 StdDev: 222.40
+Min: 3.0000 Median: 452.3077 Max: 1896.0000
+Percentiles: P50: 452.31 P75: 641.30 P99: 1068.00 P99.9: 1860.80 P99.99: 1896.00
+------------------------------------------------------
+[ 2, 3 ) 1 0.144% 0.144%
+[ 18, 20 ) 1 0.144% 0.287%
+[ 45, 50 ) 5 0.718% 1.006%
+[ 50, 60 ) 26 3.736% 4.741% #
+[ 60, 70 ) 6 0.862% 5.603%
+[ 90, 100 ) 1 0.144% 5.747%
+[ 120, 140 ) 2 0.287% 6.034%
+[ 140, 160 ) 1 0.144% 6.178%
+[ 160, 180 ) 1 0.144% 6.322%
+[ 200, 250 ) 9 1.293% 7.615%
+[ 250, 300 ) 45 6.466% 14.080% #
+[ 300, 350 ) 88 12.644% 26.724% ###
+[ 350, 400 ) 88 12.644% 39.368% ###
+[ 400, 450 ) 71 10.201% 49.569% ##
+[ 450, 500 ) 65 9.339% 58.908% ##
+[ 500, 600 ) 74 10.632% 69.540% ##
+[ 600, 700 ) 92 13.218% 82.759% ###
+[ 700, 800 ) 64 9.195% 91.954% ##
+[ 800, 900 ) 35 5.029% 96.983% #
+[ 900, 1000 ) 12 1.724% 98.707%
+[ 1000, 1200 ) 6 0.862% 99.569%
+[ 1200, 1400 ) 2 0.287% 99.856%
+[ 1800, 2000 ) 1 0.144% 100.000%
+
+** Level 1 read latency histogram (micros):
+(......not pasted.....)
+
+** Level 2 read latency histogram (micros):
+(......not pasted.....)
+
+** Level 3 read latency histogram (micros):
+(......not pasted.....)
+
+** Level 4 read latency histogram (micros):
+(......not pasted.....)
+
+** Level 5 read latency histogram (micros):
+Count: 25583746 Average: 421.1326 StdDev: 385.11
+Min: 1.0000 Median: 376.0011 Max: 202444.0000
+Percentiles: P50: 376.00 P75: 438.00 P99: 1421.68 P99.9: 4164.43 P99.99: 9056.52
+------------------------------------------------------
+[ 0, 1 ) 2351 0.009% 0.009%
+[ 1, 2 ) 6077 0.024% 0.033%
+[ 2, 3 ) 8471 0.033% 0.066%
+[ 3, 4 ) 788 0.003% 0.069%
+[ 4, 5 ) 393 0.002% 0.071%
+[ 5, 6 ) 786 0.003% 0.074%
+[ 6, 7 ) 1709 0.007% 0.080%
+[ 7, 8 ) 1769 0.007% 0.087%
+[ 8, 9 ) 1573 0.006% 0.093%
+[ 9, 10 ) 1495 0.006% 0.099%
+[ 10, 12 ) 3043 0.012% 0.111%
+[ 12, 14 ) 2259 0.009% 0.120%
+[ 14, 16 ) 1233 0.005% 0.125%
+[ 16, 18 ) 762 0.003% 0.128%
+[ 18, 20 ) 451 0.002% 0.130%
+[ 20, 25 ) 794 0.003% 0.133%
+[ 25, 30 ) 1279 0.005% 0.138%
+[ 30, 35 ) 1172 0.005% 0.142%
+[ 35, 40 ) 1363 0.005% 0.148%
+[ 40, 45 ) 409 0.002% 0.149%
+[ 45, 50 ) 105 0.000% 0.150%
+[ 50, 60 ) 80 0.000% 0.150%
+[ 60, 70 ) 280 0.001% 0.151%
+[ 70, 80 ) 1583 0.006% 0.157%
+[ 80, 90 ) 4245 0.017% 0.174%
+[ 90, 100 ) 6572 0.026% 0.200%
+[ 100, 120 ) 9724 0.038% 0.238%
+[ 120, 140 ) 3713 0.015% 0.252%
+[ 140, 160 ) 2383 0.009% 0.261%
+[ 160, 180 ) 18344 0.072% 0.333%
+[ 180, 200 ) 51873 0.203% 0.536%
+[ 200, 250 ) 631722 2.469% 3.005%
+[ 250, 300 ) 2721970 10.639% 13.644% ##
+[ 300, 350 ) 5909249 23.098% 36.742% #####
+[ 350, 400 ) 6522507 25.495% 62.237% #####
+[ 400, 450 ) 4296332 16.793% 79.030% ###
+[ 450, 500 ) 2130323 8.327% 87.357% ##
+[ 500, 600 ) 1553208 6.071% 93.428% #
+[ 600, 700 ) 642129 2.510% 95.938% #
+[ 700, 800 ) 372428 1.456% 97.394%
+[ 800, 900 ) 187561 0.733% 98.127%
+[ 900, 1000 ) 85858 0.336% 98.462%
+[ 1000, 1200 ) 82730 0.323% 98.786%
+[ 1200, 1400 ) 50691 0.198% 98.984%
+[ 1400, 1600 ) 38026 0.149% 99.133%
+[ 1600, 1800 ) 32991 0.129% 99.261%
+[ 1800, 2000 ) 30200 0.118% 99.380%
+[ 2000, 2500 ) 62195 0.243% 99.623%
+[ 2500, 3000 ) 36684 0.143% 99.766%
+[ 3000, 3500 ) 21317 0.083% 99.849%
+[ 3500, 4000 ) 10216 0.040% 99.889%
+[ 4000, 4500 ) 8351 0.033% 99.922%
+[ 4500, 5000 ) 4152 0.016% 99.938%
+[ 5000, 6000 ) 6328 0.025% 99.963%
+[ 6000, 7000 ) 3253 0.013% 99.976%
+[ 7000, 8000 ) 2082 0.008% 99.984%
+[ 8000, 9000 ) 1546 0.006% 99.990%
+[ 9000, 10000 ) 1055 0.004% 99.994%
+[ 10000, 12000 ) 1566 0.006% 100.000%
+[ 12000, 14000 ) 761 0.003% 100.003%
+[ 14000, 16000 ) 462 0.002% 100.005%
+[ 16000, 18000 ) 226 0.001% 100.006%
+[ 18000, 20000 ) 126 0.000% 100.006%
+[ 20000, 25000 ) 107 0.000% 100.007%
+[ 25000, 30000 ) 43 0.000% 100.007%
+[ 30000, 35000 ) 15 0.000% 100.007%
+[ 35000, 40000 ) 14 0.000% 100.007%
+[ 40000, 45000 ) 16 0.000% 100.007%
+[ 45000, 50000 ) 1 0.000% 100.007%
+[ 50000, 60000 ) 22 0.000% 100.007%
+[ 60000, 70000 ) 10 0.000% 100.007%
+[ 70000, 80000 ) 5 0.000% 100.007%
+[ 80000, 90000 ) 14 0.000% 100.007%
+[ 90000, 100000 ) 11 0.000% 100.007%
+[ 100000, 120000 ) 33 0.000% 100.007%
+[ 120000, 140000 ) 6 0.000% 100.007%
+[ 140000, 160000 ) 3 0.000% 100.007%
+[ 160000, 180000 ) 7 0.000% 100.007%
+[ 200000, 250000 ) 2 0.000% 100.007%
+```
+
+
+In this example, you can see we only issued 696 reads from level 0 while issued 25 million reads from level 5. The latency distribution is also clearly shown among those reads. This will be helpful for users to analysis OS page cache efficiency.
+
+Currently the read latency per level includes reads from data blocks, index blocks, as well as bloom filter blocks. We are also working on a feature to break down those three type of blocks.
+
+### Comments
+
+**[Tao Feng](fengtao04@gmail.com)**
+
+Is this feature also included in RocksJava?
+
+**[Siying Dong](siying.d@fb.com)**
+
+Should be. As long as you enable statistics, you should be able to get the value from `RocksDB.getProperty()` with property `rocksdb.dbstats`. Let me know if you can’t find it.
+
+**[chiddu](cnbscience@gmail.com)**
+
+> In this example, you can see we only issued 696 reads from level 0 while issued 256K reads from level 5.
+
+Isn’t it 2.5 M of reads instead of 256K ? .
+
+Also could anyone please provide more description on the histogram ? especially
+
+> Count: 25583746 Average: 421.1326 StdDev: 385.11
+> Min: 1.0000 Median: 376.0011 Max: 202444.0000
+> Percentiles: P50: 376.00 P75: 438.00 P99: 1421.68 P99.9: 4164.43 P99.99: 9056.52
+
+and
+
+> [ 0, 1 ) 2351 0.009% 0.009%
+> [ 1, 2 ) 6077 0.024% 0.033%
+> [ 2, 3 ) 8471 0.033% 0.066%
+> [ 3, 4 ) 788 0.003% 0.069%”
+
+thanks in advance
+
+**[Siying Dong](siying.d@fb.com)**
+
+Thank you for pointing out the mistake. I fixed it now.
+
+In this output, there are 2.5 million samples, average latency is 421 micro seconds, with standard deviation 385. Median is 376, max value is 202 milliseconds. 0.009% has value of 1, 0.024% has value of 1, 0.033% has value of 2. Accumulated value from 0 to 2 is 0.066%.
+
+Hope it helps.
+
+**[chiddu](cnbscience@gmail.com)**
+
+Thank you Siying for the quick reply, I was running couple of benchmark testing to check the performance of rocksdb on SSD. One of the test is similar to what is mentioned in the wiki, TEST 4 : Random read , except the key_size is 10 and value_size is 20. I am inserting 1 billion hashes and reading 1 billion hashes with 32 threads. The histogram shows something like this
+
+```
+Level 5 read latency histogram (micros):
+Count: 7133903059 Average: 480.4357 StdDev: 309.18
+Min: 0.0000 Median: 551.1491 Max: 224142.0000
+Percentiles: P50: 551.15 P75: 651.44 P99: 996.52 P99.9: 2073.07 P99.99: 3196.32
+——————————————————
+[ 0, 1 ) 28587385 0.401% 0.401%
+[ 1, 2 ) 686572516 9.624% 10.025% ##
+[ 2, 3 ) 567317522 7.952% 17.977% ##
+[ 3, 4 ) 44979472 0.631% 18.608%
+[ 4, 5 ) 50379685 0.706% 19.314%
+[ 5, 6 ) 64930061 0.910% 20.224%
+[ 6, 7 ) 22613561 0.317% 20.541%
+…………more………….
+```
+
+If I understand your previous comment correctly,
+
+1. How is it that the count is around 7 billion when I have only inserted 1 billion hashes ? is the stat broken ?
+1. What does the percentiles and the numbers signify ?
+1. 0, 1 ) 28587385 0.401% 0.401% what does this “28587385” stand for in the histogram row ?
+
+**[Siying Dong](siying.d@fb.com)**
+
+If I remember correctly, with db_bench, if you specify –num=1000000000 –threads=32, it is every thread reading one billion keys, total of 32 billions. Is it the case you ran into?
+
+28,587,385 means that number of data points take the value [0,1)
+28,587,385 / 7,133,903,058 = 0.401% provides percentage.
+
+**[chiddu](cnbscience@gmail.com)**
+
+I do have `num=1000000000` and `t=32`. The script says reading 1 billion hashes and not 32 billion hashes.
+
+this is the script on which I have used
+
+```
+echo “Load 1B keys sequentially into database…..”
+bpl=10485760;overlap=10;mcz=2;del=300000000;levels=6;ctrig=4; delay=8; stop=12; wbn=3; mbc=20; mb=67108864;wbs=134217728; dds=1; sync=0; r=1000000000; t=1; vs=20; bs=4096; cs=1048576; of=500000; si=1000000; ./db_bench –benchmarks=fillseq –disable_seek_compaction=1 –mmap_read=0 –statistics=1 –histogram=1 –num=$r –threads=$t –value_size=$vs –block_size=$bs –cache_size=$cs –bloom_bits=10 –cache_numshardbits=6 –open_files=$of –verify_checksum=1 –db=/data/mysql/leveldb/test –sync=$sync –disable_wal=1 –compression_type=none –stats_interval=$si –compression_ratio=0.5 –disable_data_sync=$dds –write_buffer_size=$wbs –target_file_size_base=$mb –max_write_buffer_number=$wbn –max_background_compactions=$mbc –level0_file_num_compaction_trigger=$ctrig –level0_slowdown_writes_trigger=$delay –level0_stop_writes_trigger=$stop –num_levels=$levels –delete_obsolete_files_period_micros=$del –min_level_to_compress=$mcz –max_grandparent_overlap_factor=$overlap –stats_per_interval=1 –max_bytes_for_level_base=$bpl –use_existing_db=0 –key_size=10
+
+echo “Reading 1B keys in database in random order….”
+bpl=10485760;overlap=10;mcz=2;del=300000000;levels=6;ctrig=4; delay=8; stop=12; wbn=3; mbc=20; mb=67108864;wbs=134217728; dds=0; sync=0; r=1000000000; t=32; vs=20; bs=4096; cs=1048576; of=500000; si=1000000; ./db_bench –benchmarks=readrandom –disable_seek_compaction=1 –mmap_read=0 –statistics=1 –histogram=1 –num=$r –threads=$t –value_size=$vs –block_size=$bs –cache_size=$cs –bloom_bits=10 –cache_numshardbits=6 –open_files=$of –verify_checksum=1 –db=/some_data_base –sync=$sync –disable_wal=1 –compression_type=none –stats_interval=$si –compression_ratio=0.5 –disable_data_sync=$dds –write_buffer_size=$wbs –target_file_size_base=$mb –max_write_buffer_number=$wbn –max_background_compactions=$mbc –level0_file_num_compaction_trigger=$ctrig –level0_slowdown_writes_trigger=$delay –level0_stop_writes_trigger=$stop –num_levels=$levels –delete_obsolete_files_period_micros=$del –min_level_to_compress=$mcz –max_grandparent_overlap_factor=$overlap –stats_per_interval=1 –max_bytes_for_level_base=$bpl –use_existing_db=1 –key_size=10
+```
+
+After running this script, there were no issues wrt to loading billion hashes , but when it came to reading part, its been almost 4 days and still I have only read 7 billion hashes and have read 200 million hashes in 2 and half days. Is there something which is missing in db_bench or something which I am missing ?
+
+**[Siying Dong](siying.d@fb.com)**
+
+It’s a printing error then. If you have `num=1000000000` and `t=32`, it will be 32 threads, and each reads 1 billion keys.
diff --git a/src/rocksdb/docs/_posts/2016-01-29-compaction_pri.markdown b/src/rocksdb/docs/_posts/2016-01-29-compaction_pri.markdown
new file mode 100644
index 000000000..ba9ee627c
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2016-01-29-compaction_pri.markdown
@@ -0,0 +1,51 @@
+---
+title: Option of Compaction Priority
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/2921/compaction_pri/
+---
+
+The most popular compaction style of RocksDB is level-based compaction, which is an improved version of LevelDB's compaction algorithm. Page 9- 16 of this [slides](https://github.com/facebook/rocksdb/blob/gh-pages/talks/2015-09-29-HPTS-Siying-RocksDB.pdf) gives an illustrated introduction of this compaction style. The basic idea that: data is organized by multiple levels with exponential increasing target size. Except a special level 0, every level is key-range partitioned into many files. When size of a level exceeds its target size, we pick one or more of its files, and merge the file into the next level.
+
+<!--truncate-->
+
+Which file to pick to compact is an interesting question. LevelDB only uses one thread for compaction and it always picks files in round robin manner. We implemented multi-thread compaction in RocksDB by picking multiple files from the same level and compact them in parallel. We had to move away from LevelDB's file picking approach. Recently, we created an option [options.compaction_pri](https://github.com/facebook/rocksdb/blob/d6c838f1e130d8860407bc771fa6d4ac238859ba/include/rocksdb/options.h#L83-L93), which indicated three different algorithms to pick files to compact.
+
+Why do we need to multiple algorithms to choose from? Because there are different factors to consider when picking the files, and we now don't yet know how to balance them automatically, so we expose it to users to choose. Here are factors to consider:
+
+**Write amplification**
+
+When we estimate write amplification, we usually simplify the problem by assuming keys are uniformly distributed inside each level. In reality, it is not the case, even if user updates are uniformly distributed across the whole key range. For instance, when we compact one file of a level to the next level, it creates a hole. Over time, incoming compaction will fill data to the hole, but the density will still be lower for a while. Picking a file with keys least densely populated is more expensive to get the file to the next level, because there will be more overlapping files in the next level so we need to rewrite more data. For example, assume a file is 100MB, if an L2 file overlaps with 8 L3 files, we need to rewrite about 800MB of data to get the file to L3. If the file overlaps with 12 L3 files, we'll need to rewrite about 1200MB to get a file of the same size out of L2. It uses 50% more writes. (This analysis ignores the key density of the next level, because the range covers N times of files in that level so one hole only impacts write amplification by 1/N)
+
+If all the updates are uniformly distributed, LevelDB's approach optimizes write amplification, because a file being picked covers a range whose last compaction time to the next level is the oldest, so the range will accumulated keys from incoming compactions for the longest and the density is the highest.
+
+We created a compaction priority **kOldestSmallestSeqFirst** for the same effect. With this mode, we always pick the file covers the oldest updates in the level, which usually is contains the densest key range. If you have a use case where writes are uniformly distributed across the key space and you want to reduce write amplification, you should set options.compaction_pri=kOldestSmallestSeqFirst.
+
+**Optimize for small working set**
+
+We are assuming updates are uniformly distributed across the whole key space in previous analysis. However, in many use cases, there are subset of keys that are frequently updated while other key ranges are very cold. In this case, keeping hot key ranges from compacting to deeper levels will benefit write amplification, as well as space amplification. For example, if in a DB only key 150-160 are updated and other keys are seldom updated. If level 1 contains 20 keys, we want to keep 150-160 all stay in level 1. Because when next level 0 -> 1 compaction comes, it will simply overwrite existing keys so size level 1 doesn't increase, so no need to schedule further compaction for level 1->2. On the other hand, if we compact key 150-155 to level2, when a new Level 1->2 compaction comes, it increases the size of level 1, making size of level 1 exceed target size and more compactions will be needed, which generates more writes.
+
+The compaction priority **kOldestLargestSeqFirst** optimizes this use case. In this mode, we will pick a file whose latest update is the oldest. It means there is no incoming data for the range for the longest. Usually it is the coldest range. By compacting coldest range first, we leave the hot ranges in the level. If your use case is to overwrite existing keys in a small range, try options.compaction_pri=kOldestLargestSeqFirst**.**
+
+**Drop delete marker sooner**
+
+If one file contains a lot of delete markers, it may slow down iterating over this area, because we still need to iterate those deleted keys just to ignore them. Furthermore, the sooner we compact delete keys into the last level, the sooner the disk space is reclaimed, so it is good for space efficiency.
+
+Our default compaction priority **kByCompensatedSize** considers the case. If number of deletes in a file exceeds number of inserts, it is more likely to be picked for compaction. The more number of deletes exceed inserts, the more likely it is being compacted. The optimization is added to avoid the worst performance of space efficiency and query performance when a large percentage of the DB is deleted.
+
+**Efficiency of compaction filter**
+
+Usually people use [compaction filters](https://github.com/facebook/rocksdb/blob/v4.1/include/rocksdb/options.h#L201-L226) to clean up old data to free up space. Picking files to compact may impact space efficiency. We don't yet have a a compaction priority to optimize this case. In some of our use cases, we solved the problem in a different way: we have an external service checking modify time of all SST files. If any of the files is too old, we force the single file to compaction by calling DB::CompactFiles() using the single file. In this way, we can provide a time bound of data passing through compaction filters.
+
+
+In all, there three choices of compaction priority modes optimizing different scenarios. if you have a new use case, we suggest you start with `options.compaction_pri=kOldestSmallestSeqFirst` (note it is not the default one for backward compatible reason). If you want to further optimize your use case, you can try other two use cases if your use cases apply.
+
+If you have good ideas about better compaction picker approach, you are welcome to implement and benchmark it. We'll be glad to review and merge your a pull requests.
+
+### Comments
+
+**[Mark Callaghan](mdcallag@gmail.com)**
+
+Performance results for compaction_pri values and linkbench are explained at [http://smalldatum.blogspot.com/2016/02/compaction-priority-in-rocksdb.html](http://smalldatum.blogspot.com/2016/02/compaction-priority-in-rocksdb.html)
diff --git a/src/rocksdb/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown b/src/rocksdb/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
new file mode 100644
index 000000000..409015cc8
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
@@ -0,0 +1,41 @@
+---
+title: RocksDB 4.2 Release!
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/3017/rocksdb-4-2-release/
+---
+
+New RocksDB release - 4.2!
+
+
+**New Features**
+
+  1. Introduce CreateLoggerFromOptions(), this function create a Logger for provided DBOptions.
+
+
+  2. Add GetAggregatedIntProperty(), which returns the sum of the GetIntProperty of all the column families.
+
+
+  3. Add MemoryUtil in rocksdb/utilities/memory.h. It currently offers a way to get the memory usage by type from a list rocksdb instances.
+
+
+<!--truncate-->
+
+
+**Public API changes**
+
+  1. CompactionFilter::Context includes information of Column Family ID
+
+
+  2. The need-compaction hint given by TablePropertiesCollector::NeedCompact() will be persistent and recoverable after DB recovery. This introduces a breaking format change. If you use this experimental feature, including NewCompactOnDeletionCollectorFactory() in the new version, you may not be able to directly downgrade the DB back to version 4.0 or lower.
+
+
+  3. TablePropertiesCollectorFactory::CreateTablePropertiesCollector() now takes an option Context, containing the information of column family ID for the file being written.
+
+
+  4. Remove DefaultCompactionFilterFactory.
+
+
+[https://github.com/facebook/rocksdb/releases/tag/v4.2](https://github.com/facebook/rocksdb/releases/tag/v4.2)
diff --git a/src/rocksdb/docs/_posts/2016-02-25-rocksdb-ama.markdown b/src/rocksdb/docs/_posts/2016-02-25-rocksdb-ama.markdown
new file mode 100644
index 000000000..2ba04f39a
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2016-02-25-rocksdb-ama.markdown
@@ -0,0 +1,20 @@
+---
+title: RocksDB AMA
+layout: post
+author: yhchiang
+category: blog
+redirect_from:
+  - /blog/3065/rocksdb-ama/
+---
+
+RocksDB developers are doing a Reddit Ask-Me-Anything now at 10AM – 11AM PDT! We welcome you to stop by and ask any RocksDB related questions, including existing / upcoming features, tuning tips, or database design.
+
+Here are some enhancements that we'd like to focus on over the next six months:
+
+* 2-Phase Commit
+* Lua support in some custom functions
+* Backup and repair tools
+* Direct I/O to bypass OS cache
+* RocksDB Java API
+
+[https://www.reddit.com/r/IAmA/comments/47k1si/we_are_rocksdb_developers_ask_us_anything/](https://www.reddit.com/r/IAmA/comments/47k1si/we_are_rocksdb_developers_ask_us_anything/)
diff --git a/src/rocksdb/docs/_posts/2016-03-07-rocksdb-options-file.markdown b/src/rocksdb/docs/_posts/2016-03-07-rocksdb-options-file.markdown
new file mode 100644
index 000000000..703449b01
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2016-03-07-rocksdb-options-file.markdown
@@ -0,0 +1,24 @@
+---
+title: RocksDB Options File
+layout: post
+author: yhciang
+category: blog
+redirect_from:
+  - /blog/3089/rocksdb-options-file/
+---
+
+In RocksDB 4.3, we added a new set of features that makes managing RocksDB options easier.  Specifically:
+
+  * **Persisting Options Automatically**: Each RocksDB database will now automatically persist its current set of options into an INI file on every successful call of DB::Open(), SetOptions(), and CreateColumnFamily() / DropColumnFamily().
+
+
+
+  * **Load Options from File**: We added [LoadLatestOptions() / LoadOptionsFromFile()](https://github.com/facebook/rocksdb/blob/4.3.fb/include/rocksdb/utilities/options_util.h#L48-L58) that enables developers to construct RocksDB options object from an options file.
+
+
+
+  * **Sanity Check Options**: We added [CheckOptionsCompatibility](https://github.com/facebook/rocksdb/blob/4.3.fb/include/rocksdb/utilities/options_util.h#L64-L77) that performs compatibility check on two sets of RocksDB options.
+
+<!--truncate-->
+
+Want to know more about how to use this new features? Check out the [RocksDB Options File wiki page](https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File) and start using this new feature today!
diff --git a/src/rocksdb/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown b/src/rocksdb/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
new file mode 100644
index 000000000..247768d30
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
@@ -0,0 +1,60 @@
+---
+title: RocksDB 4.5.1 Released!
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/3179/rocksdb-4-5-1-released/
+---
+
+## 4.5.1 (3/25/2016)
+
+### Bug Fixes
+
+  *  Fix failures caused by the destorying order of singleton objects.
+
+<br/>
+
+## 4.5.0 (2/5/2016)
+
+### Public API Changes
+
+  * Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes.
+  * Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll.
+  * DBOptions::delete_scheduler and NewDeleteScheduler() are removed, please use DBOptions::sst_file_manager and NewSstFileManager() instead
+
+### New Features
+  * ldb tool now supports operations to non-default column families.
+  * Add kPersistedTier to ReadTier. This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true.
+  * Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate.
+
+<br/>
+
+<!--truncate-->
+
+## 4.4.0 (1/14/2016)
+
+### Public API Changes
+
+  * Change names in CompactionPri and add a new one.
+  * Deprecate options.soft_rate_limit and add options.soft_pending_compaction_bytes_limit.
+  * If options.max_write_buffer_number > 3, writes will be slowed down when writing to the last write buffer to delay a full stop.
+  * Introduce CompactionJobInfo::compaction_reason, this field include the reason to trigger the compaction.
+  * After slow down is triggered, if estimated pending compaction bytes keep increasing, slowdown more.
+  * Increase default options.delayed_write_rate to 2MB/s.
+  * Added a new parameter --path to ldb tool. --path accepts the name of either MANIFEST, SST or a WAL file. Either --db or --path can be used when calling ldb.
+
+<br/>
+
+## 4.3.0 (12/8/2015)
+
+### New Features
+
+  * CompactionFilter has new member function called IgnoreSnapshots which allows CompactionFilter to be called even if there are snapshots later than the key.
+  * RocksDB will now persist options under the same directory as the RocksDB database on successful DB::Open, CreateColumnFamily, DropColumnFamily, and SetOptions.
+  * Introduce LoadLatestOptions() in rocksdb/utilities/options_util.h. This function can construct the latest DBOptions / ColumnFamilyOptions used by the specified RocksDB intance.
+  * Introduce CheckOptionsCompatibility() in rocksdb/utilities/options_util.h. This function checks whether the input set of options is able to open the specified DB successfully.
+
+### Public API Changes
+
+  * When options.db_write_buffer_size triggers, only the column family with the largest column family size will be flushed, not all the column families.
diff --git a/src/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown b/src/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown
new file mode 100644
index 000000000..0db275ddf
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown
@@ -0,0 +1,48 @@
+---
+title: RocksDB 4.8 Released!
+layout: post
+author: yiwu
+category: blog
+redirect_from:
+  - /blog/3239/rocksdb-4-8-released/
+---
+
+## 4.8.0 (5/2/2016)
+
+### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#public-api-change-1)Public API Change
+
+  * Allow preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes.
+  * Delete deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now, BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see [https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
+  * Expose estimate of per-level compression ratio via DB property: "rocksdb.compression-ratio-at-levelN".
+  * Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will be called on failure case. User can check creation status via TableFileCreationInfo::status.
+
+### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#new-features-2)New Features
+
+  * Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size.
+
+<br/>
+
+<!--truncate-->
+
+## [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#470-482016)4.7.0 (4/8/2016)
+
+### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#public-api-change-2)Public API Change
+
+  * rename options compaction_measure_io_stats to report_bg_io_stats and include flush too.
+  * Change some default options. Now default options will optimize for server-workloads. Also enable slowdown and full stop triggers for pending compaction bytes. These changes may cause sub-optimal performance or significant increase of resource usage. To avoid these risks, users can open existing RocksDB with options extracted from RocksDB option files. See [https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File](https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File) for how to use RocksDB option files. Or you can call Options.OldDefaults() to recover old defaults. DEFAULT_OPTIONS_HISTORY.md will track change history of default options.
+
+<br/>
+
+## [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#460-3102016)4.6.0 (3/10/2016)
+
+### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#public-api-changes-1)Public API Changes
+
+  * Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier
+  * Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signature of Cache::Insert() is updated accordingly.
+  * Tickers [NUMBER_DB_NEXT, NUMBER_DB_PREV, NUMBER_DB_NEXT_FOUND, NUMBER_DB_PREV_FOUND, ITER_BYTES_READ] are not updated immediately. The are updated when the Iterator is deleted.
+  * Add monotonically increasing counter (DB property "rocksdb.current-super-version-number") that increments upon any change to the LSM tree.
+
+### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#new-features-3)New Features
+
+  * Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification.
+  * Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned"
diff --git a/src/rocksdb/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown b/src/rocksdb/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
new file mode 100644
index 000000000..87c20eb47
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
@@ -0,0 +1,49 @@
+---
+title: RocksDB 4.11.2 Released!
+layout: post
+author: sdong
+category: blog
+---
+We abandoned release candidates 4.10.x and directly go to 4.11.2 from 4.9, to make sure the latest release is stable. In 4.11.2, we fixed several data corruption related bugs introduced in 4.9.0.
+
+## 4.11.2 (9/15/2016)
+
+### Bug fixes
+
+  * Segfault when failing to open an SST file for read-ahead iterators.
+  * WAL without data for all CFs is not deleted after recovery.
+
+<!--truncate-->
+
+## 4.11.1 (8/30/2016)
+
+### Bug Fixes
+
+  * Mitigate the regression bug of deadlock condition during recovery when options.max_successive_merges hits.
+  * Fix data race condition related to hash index in block based table when putting indexes in the block cache.
+
+## 4.11.0 (8/1/2016)
+
+### Public API Change
+
+  * options.memtable_prefix_bloom_huge_page_tlb_size => memtable_huge_page_size. When it is set, RocksDB will try to allocate memory from huge page for memtable too, rather than just memtable bloom filter.
+
+### New Features
+
+  * A tool to migrate DB after options change. See include/rocksdb/utilities/option_change_migration.h.
+  * Add ReadOptions.background_purge_on_iterator_cleanup. If true, we avoid file deletion when destorying iterators.
+
+## 4.10.0 (7/5/2016)
+
+### Public API Change
+
+  * options.memtable_prefix_bloom_bits changes to options.memtable_prefix_bloom_bits_ratio and deprecate options.memtable_prefix_bloom_probes
+  * enum type CompressionType and PerfLevel changes from char to unsigned char. Value of all PerfLevel shift by one.
+  * Deprecate options.filter_deletes.
+
+### New Features
+
+  * Add avoid_flush_during_recovery option.
+  * Add a read option background_purge_on_iterator_cleanup to avoid deleting files in foreground when destroying iterators. Instead, a job is scheduled in high priority queue and would be executed in a separate background thread.
+  * RepairDB support for column families. RepairDB now associates data with non-default column families using information embedded in the SST/WAL files (4.7 or later). For data written by 4.6 or earlier, RepairDB associates it with the default column family.
+  * Add options.write_buffer_manager which allows users to control total memtable sizes across multiple DB instances.
diff --git a/src/rocksdb/docs/_posts/2017-01-06-rocksdb-5-0-1-released.markdown b/src/rocksdb/docs/_posts/2017-01-06-rocksdb-5-0-1-released.markdown
new file mode 100644
index 000000000..fb0413055
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-01-06-rocksdb-5-0-1-released.markdown
@@ -0,0 +1,26 @@
+---
+title: RocksDB 5.0.1 Released!
+layout: post
+author: yiwu
+category: blog
+---
+
+### Public API Change
+
+  * Options::max_bytes_for_level_multiplier is now a double along with all getters and setters.
+  * Support dynamically change `delayed_write_rate` and `max_total_wal_size` options via SetDBOptions().
+  * Introduce DB::DeleteRange for optimized deletion of large ranges of contiguous keys.
+  * Support dynamically change `delayed_write_rate` option via SetDBOptions().
+  * Options::allow_concurrent_memtable_write and Options::enable_write_thread_adaptive_yield are now true by default.
+  * Remove Tickers::SEQUENCE_NUMBER to avoid confusion if statistics object is shared among RocksDB instance. Alternatively DB::GetLatestSequenceNumber() can be used to get the same value.
+  * Options.level0_stop_writes_trigger default value changes from 24 to 32.
+  * New compaction filter API: CompactionFilter::FilterV2(). Allows to drop ranges of keys.
+  * Removed flashcache support.
+  * DB::AddFile() is deprecated and is replaced with DB::IngestExternalFile(). DB::IngestExternalFile() remove all the restrictions that existed for DB::AddFile.
+
+### New Features
+
+  * Add avoid_flush_during_shutdown option, which speeds up DB shutdown by not flushing unpersisted data (i.e. with disableWAL = true). Unpersisted data will be lost. The options is dynamically changeable via SetDBOptions().
+  * Add memtable_insert_with_hint_prefix_extractor option. The option is mean to reduce CPU usage for inserting keys into memtable, if keys can be group by prefix and insert for each prefix are sequential or almost sequential. See include/rocksdb/options.h for more details.
+  * Add LuaCompactionFilter in utilities.  This allows developers to write compaction filters in Lua.  To use this feature, LUA_PATH needs to be set to the root directory of Lua.
+  * No longer populate "LATEST_BACKUP" file in backup directory, which formerly contained the number of the latest backup. The latest backup can be determined by finding the highest numbered file in the "meta/" subdirectory.
diff --git a/src/rocksdb/docs/_posts/2017-02-07-rocksdb-5-1-2-released.markdown b/src/rocksdb/docs/_posts/2017-02-07-rocksdb-5-1-2-released.markdown
new file mode 100644
index 000000000..35bafb219
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-02-07-rocksdb-5-1-2-released.markdown
@@ -0,0 +1,15 @@
+---
+title: RocksDB 5.1.2 Released!
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+### Public API Change
+* Support dynamically change `delete_obsolete_files_period_micros` option via SetDBOptions().
+* Added EventListener::OnExternalFileIngested which will be called when IngestExternalFile() add a file successfully.
+* BackupEngine::Open and BackupEngineReadOnly::Open now always return error statuses matching those of the backup Env.
+
+### Bug Fixes
+* Fix the bug that if 2PC is enabled, checkpoints may loss some recent transactions.
+* When file copying is needed when creating checkpoints or bulk loading files, fsync the file after the file copying.
diff --git a/src/rocksdb/docs/_posts/2017-02-17-bulkoad-ingest-sst-file.markdown b/src/rocksdb/docs/_posts/2017-02-17-bulkoad-ingest-sst-file.markdown
new file mode 100644
index 000000000..9a43a846a
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-02-17-bulkoad-ingest-sst-file.markdown
@@ -0,0 +1,50 @@
+---
+title: Bulkloading by ingesting external SST files
+layout: post
+author: IslamAbdelRahman
+category: blog
+---
+
+## Introduction
+
+One of the basic operations of RocksDB is writing to RocksDB, Writes happen when user call (DB::Put, DB::Write, DB::Delete ... ), but what happens when you write to RocksDB ? .. this is a brief description of what happens.
+- User insert a new key/value by calling DB::Put() (or DB::Write())
+- We create a new entry for the new key/value in our in-memory structure (memtable / SkipList by default) and we assign it a new sequence number.
+- When the memtable exceeds a specific size (64 MB for example), we convert this memtable to a SST file, and put this file in level 0 of our LSM-Tree
+- Later, compaction will kick in and move data from level 0 to level 1, and then from level 1 to level 2 .. and so on 
+
+But what if we can skip these steps and add data to the lowest possible level directly ? This is what bulk-loading does
+
+## Bulkloading
+
+- Write all of our keys and values into SST file outside of the DB
+- Add the SST file into the LSM directly
+
+This is bulk-loading, and in specific use-cases it allow users to achieve faster data loading and better write-amplification.
+
+and doing it is as simple as 
+```cpp
+Options options;
+SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator);
+Status s = sst_file_writer.Open(file_path);
+assert(s.ok());
+
+// Insert rows into the SST file, note that inserted keys must be 
+// strictly increasing (based on options.comparator)
+for (...) {
+  s = sst_file_writer.Add(key, value);
+  assert(s.ok());
+}
+
+// Ingest the external SST file into the DB
+s = db_->IngestExternalFile({"/home/usr/file1.sst"}, IngestExternalFileOptions());
+assert(s.ok());
+```
+
+You can find more details about how to generate SST files and ingesting them into RocksDB in this [wiki page](https://github.com/facebook/rocksdb/wiki/Creating-and-Ingesting-SST-files)
+
+## Use cases
+There are multiple use cases where bulkloading could be useful, for example
+- Generating SST files in offline jobs in Hadoop, then downloading and ingesting the SST files into RocksDB
+- Migrating shards between machines by dumping key-range in SST File and loading the file in a different machine
+- Migrating from a different storage (InnoDB to RocksDB migration in MyRocks)
diff --git a/src/rocksdb/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown b/src/rocksdb/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
new file mode 100644
index 000000000..c6ce27d64
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
@@ -0,0 +1,22 @@
+---
+title: RocksDB 5.2.1 Released!
+layout: post
+author: sdong
+category: blog
+---
+
+### Public API Change
+* NewLRUCache() will determine number of shard bits automatically based on capacity, if the user doesn't pass one. This also impacts the default block cache when the user doesn't explict provide one.
+* Change the default of delayed slowdown value to 16MB/s and further increase the L0 stop condition to 36 files.
+
+### New Features
+* Added new overloaded function GetApproximateSizes that allows to specify if memtable stats should be computed only without computing SST files' stats approximations.
+* Added new function GetApproximateMemTableStats that approximates both number of records and size of memtables.
+* (Experimental) Two-level indexing that partition the index and creates a 2nd level index on the partitions. The feature can be enabled by setting kTwoLevelIndexSearch as IndexType and configuring index_per_partition.
+
+### Bug Fixes
+* RangeSync() should work if ROCKSDB_FALLOCATE_PRESENT is not set
+* Fix wrong results in a data race case in Get()
+* Some fixes related to 2PC.
+* Fix several bugs in Direct I/O supports.
+* Fix a regression bug which can cause Seek() to miss some keys if the return key has been updated many times after the snapshot which is used by the iterator.
diff --git a/src/rocksdb/docs/_posts/2017-05-12-partitioned-index-filter.markdown b/src/rocksdb/docs/_posts/2017-05-12-partitioned-index-filter.markdown
new file mode 100644
index 000000000..a537feb0c
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-05-12-partitioned-index-filter.markdown
@@ -0,0 +1,34 @@
+---
+title: Partitioned Index/Filters
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+As DB/mem ratio gets larger, the memory footprint of filter/index blocks becomes non-trivial. Although `cache_index_and_filter_blocks` allows storing only a subset of them in block cache, their relatively large size negatively affects the performance by i) occupying the block cache space that could otherwise be used for caching data, ii) increasing the load on the disk storage by loading them into the cache after a miss. Here we illustrate these problems in more detail and explain how partitioning index/filters alleviates the overhead.
+
+### How large are the index/filter blocks?
+
+RocksDB has by default one index/filter block per SST file. The size of the index/filter varies based on the configuration but for a SST of size 256MB the index/filter block of size 0.5/5MB is typical, which is much larger than the typical data block size of 4-32KB. That is fine when all index/filters fit perfectly into memory and hence are read once per SST lifetime, not so much when they compete with data blocks for the block cache space and are also likely to be re-read many times from the disk.
+
+### What is the big deal with large index/filter blocks?
+
+When index/filter blocks are stored in block cache they are effectively competing with data blocks (as well as with each other) on this scarce resource. A filter of size 5MB is occupying the space that could otherwise be used to cache 1000s of data blocks (of size 4KB). This would result in more cache misses for data blocks. The large index/filters also kick each other out of the block cache more often and exacerbate their own cache miss rate too. This is while only a small part of the index/filter block might have been actually used during its lifetime in the cache.
+
+After the cache miss of an index/filter, it has to be reloaded from the disk, and its large size is not helping in reducing the IO cost. While a simple point lookup might need at most a couple of data block reads (of size 4KB) one from each layer of LSM, it might end up also loading multiple megabytes of index/filter blocks. If that happens often then the disk is spending more time serving index/filters rather than the actual data blocks.
+
+## What is partitioned index/filters?
+
+With partitioning, the index/filter of a SST file is partitioned into smaller blocks with an additional top-level index on them. When reading an index/filter, only top-level index is loaded into memory. The partitioned index/filter then uses the top-level index to load on demand into the block cache the partitions that are required to perform the index/filter query. The top-level index, which has much smaller memory footprint, can be stored in heap or block cache depending on the `cache_index_and_filter_blocks` setting.
+
+### Success stories
+
+#### HDD, 100TB DB
+
+In this example we have a DB of size 86G on HDD and emulate the small memory that is present to a node with 100TB of data by using direct IO (skipping OS file cache) and a very small block cache of size 60MB. Partitioning improves throughput by 11x from 5 op/s to 55 op/s.
+
+#### SSD, Linkbench
+
+In this example we have a DB of size 300G on SSD and emulate the small memory that would be available in presence of other DBs on the same node by by using direct IO (skipping OS file cache) and block cache of size 6G and 2G. Without partitioning the linkbench throughput drops from 38k tps to 23k when reducing block cache size from 6G to 2G. With partitioning the throughput drops from 38k to only 30k.
+
+Learn more [here](https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters).
diff --git a/src/rocksdb/docs/_posts/2017-05-14-core-local-stats.markdown b/src/rocksdb/docs/_posts/2017-05-14-core-local-stats.markdown
new file mode 100644
index 000000000..a806541fc
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-05-14-core-local-stats.markdown
@@ -0,0 +1,106 @@
+---
+title: Core-local Statistics
+layout: post
+author: ajkr
+category: blog
+---
+
+## Origins: Global Atomics
+
+Until RocksDB 4.12, ticker/histogram statistics were implemented with std::atomic values shared across the entire program. A ticker consists of a single atomic, while a histogram consists of several atomics to represent things like min/max/per-bucket counters. These statistics could be updated by all user/background threads.
+
+For concurrent/high-throughput workloads, cache line bouncing of atomics caused high CPU utilization. For example, we have tickers that count block cache hits and misses. Almost every user read increments these tickers a few times. Many concurrent user reads would cause the cache lines containing these atomics to bounce between cores.
+
+### Performance
+
+Here are perf results for 32 reader threads where most reads (99%+) are served by uncompressed block cache. Such a scenario stresses the statistics code heavily.
+
+Benchmark command: `TEST_TMPDIR=/dev/shm/ perf record -g ./db_bench -statistics -use_existing_db=true -benchmarks=readrandom -threads=32 -cache_size=1048576000 -num=1000000 -reads=1000000 && perf report -g --children`
+
+Perf snippet for "cycles" event:
+
+```
+  Children  Self    Command   Shared Object  Symbol
++   30.33%  30.17%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
++    3.65%   0.98%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
+```
+
+Perf snippet for "cache-misses" event:
+
+```
+  Children  Self    Command   Shared Object  Symbol
++   19.54%  19.50%  db_bench  db_bench 	     [.] rocksdb::StatisticsImpl::recordTick
++    3.44%   0.57%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
+```
+
+The high CPU overhead for updating tickers and histograms corresponds well to the high cache misses.
+
+## Thread-locals: Faster Updates
+
+Since RocksDB 4.12, ticker/histogram statistics use thread-local storage. Each thread has a local set of atomic values that no other thread can update. This prevents the cache line bouncing problem described above. Even though updates to a given value are always made by the same thread, atomics are still useful to synchronize with aggregations for querying statistics.
+
+Implementing this approach involved a couple challenges. First, each query for a statistic's global value must aggregate all threads' local values. This adds some overhead, which may pass unnoticed if statistics are queried infrequently. Second, exited threads' local values are still needed to provide accurate statistics. We handle this by merging a thread's local values into process-wide variables upon thread exit.
+
+### Performance
+
+Update benchmark setup is same as before. CPU overhead improved 7.8x compared to global atomics, corresponding to a 17.8x reduction in cache-misses overhead.
+
+Perf snippet for "cycles" event:
+
+```
+  Children  Self    Command   Shared Object  Symbol
++    2.96%  0.87%   db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
++    1.37%  0.10%   db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
+```
+
+Perf snippet for "cache-misses" event:
+
+```
+  Children  Self    Command   Shared Object  Symbol
++    1.21%  0.65%   db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
+     0.08%  0.00%   db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
+```
+
+To measure statistics query latency, we ran sysbench with 4K OLTP clients concurrently with one client that queries statistics repeatedly. Times shown are in milliseconds.
+
+```
+ min: 18.45
+ avg: 27.91
+ max: 231.65
+ 95th percentile: 55.82
+```
+
+## Core-locals: Faster Querying
+
+The thread-local approach is working well for applications calling RocksDB from only a few threads, or polling statistics infrequently. Eventually, though, we found use cases where those assumptions do not hold. For example, one application has per-connection threads and typically runs into performance issues when connection count grows very high. For debugging such issues, they want high-frequency statistics polling to correlate issues in their application with changes in RocksDB's state.
+
+Once [PR #2258](https://github.com/facebook/rocksdb/pull/2258) lands, ticker/histogram statistics will be local to each CPU core. Similarly to thread-local, each core updates only its local values, thus avoiding cache line bouncing. Local values are still atomics to make aggregation possible. With this change, query work depends only on number of cores, not the number of threads. So, applications with many more threads than cores can no longer impact statistics query latency.
+
+### Performance
+
+Update benchmark setup is same as before. CPU overhead worsened ~23% compared to thread-local, while cache performance was unchanged.
+
+Perf snippet for "cycles" event:
+
+```
+  Children  Self    Command   Shared Object  Symbol
++    2.96%  0.87%   db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
++    1.37%  0.10%   db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
+```
+
+Perf snippet for "cache-misses" event:
+
+```
+  Children  Self    Command   Shared Object  Symbol
++    1.21%  0.65%   db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
+     0.08%  0.00%   db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
+```
+
+Query latency is measured same as before with times in milliseconds. Average latency improved by 6.3x compared to thread-local.
+
+```
+ min: 2.47
+ avg: 4.45
+ max: 91.13
+ 95th percentile: 7.56
+```
diff --git a/src/rocksdb/docs/_posts/2017-05-26-rocksdb-5-4-5-released.markdown b/src/rocksdb/docs/_posts/2017-05-26-rocksdb-5-4-5-released.markdown
new file mode 100644
index 000000000..561dab4c2
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-05-26-rocksdb-5-4-5-released.markdown
@@ -0,0 +1,39 @@
+---
+title: RocksDB 5.4.5 Released!
+layout: post
+author: sagar0
+category: blog
+---
+
+### Public API Change
+* Support dynamically changing `stats_dump_period_sec` option via SetDBOptions().
+* Added ReadOptions::max_skippable_internal_keys to set a threshold to fail a request as incomplete when too many keys are being skipped while using iterators.
+* DB::Get in place of std::string accepts PinnableSlice, which avoids the extra memcpy of value to std::string in most of cases.
+    * PinnableSlice releases the pinned resources that contain the value when it is destructed or when ::Reset() is called on it.
+    * The old API that accepts std::string, although discouraged, is still supported.
+* Replace Options::use_direct_writes with Options::use_direct_io_for_flush_and_compaction. See Direct IO wiki for details.
+
+### New Features
+* Memtable flush can be avoided during checkpoint creation if total log file size is smaller than a threshold specified by the user.
+* Introduce level-based L0->L0 compactions to reduce file count, so write delays are incurred less often.
+* (Experimental) Partitioning filters which creates an index on the partitions. The feature can be enabled by setting partition_filters when using kFullFilter. Currently the feature also requires two-level indexing to be enabled. Number of partitions is the same as the number of partitions for indexes, which is controlled by metadata_block_size.
+* DB::ResetStats() to reset internal stats.
+* Added CompactionEventListener and EventListener::OnFlushBegin interfaces.
+* Added DB::CreateColumnFamilie() and DB::DropColumnFamilies() to bulk create/drop column families.
+* Facility for cross-building RocksJava using Docker.
+
+### Bug Fixes
+* Fix WriteBatchWithIndex address use after scope error.
+* Fix WritableFile buffer size in direct IO.
+* Add prefetch to PosixRandomAccessFile in buffered io.
+* Fix PinnableSlice access invalid address when row cache is enabled.
+* Fix huge fallocate calls fail and make XFS unhappy.
+* Fix memory alignment with logical sector size.
+* Fix alignment in ReadaheadRandomAccessFile.
+* Fix bias with read amplification stats (READ_AMP_ESTIMATE_USEFUL_BYTES and READ_AMP_TOTAL_READ_BYTES).
+* Fix a manual / auto compaction data race.
+* Fix CentOS 5 cross-building of RocksJava.
+* Build and link with ZStd when creating the static RocksJava build.
+* Fix snprintf's usage to be cross-platform.
+* Fix build errors with blob DB.
+* Fix readamp test type inconsistency.
diff --git a/src/rocksdb/docs/_posts/2017-06-26-17-level-based-changes.markdown b/src/rocksdb/docs/_posts/2017-06-26-17-level-based-changes.markdown
new file mode 100644
index 000000000..9e838eb7f
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-06-26-17-level-based-changes.markdown
@@ -0,0 +1,60 @@
+---
+title: Level-based Compaction Changes
+layout: post
+author: ajkr
+category: blog
+---
+
+### Introduction
+
+RocksDB provides an option to limit the number of L0 files, which bounds read-amplification. Since L0 files (unlike files at lower levels) can span the entire key-range, a key might be in any file, thus reads need to check them one-by-one. Users often wish to configure a low limit to improve their read latency.
+
+Although, the mechanism with which we enforce L0's file count limit may be unappealing. When the limit is reached, RocksDB intentionally delays user writes. This slows down accumulation of files in L0, and frees up resources for compacting files down to lower levels. But adding delays will significantly increase user-visible write latency jitter.
+
+Also, due to how L0 files can span the entire key-range, compaction parallelization is limited. Files at L0 or L1 may be locked due to involvement in pending L0->L1 or L1->L2 compactions. We can only schedule a parallel L0->L1 compaction if it does not require any of the locked files, which is typically not the case.
+
+To handle these constraints better, we added a new type of compaction, L0->L0. It quickly reduces file count in L0 and can be scheduled even when L1 files are locked, unlike L0->L1. We also changed the L0->L1 picking algorithm to increase opportunities for parallelism.
+
+### Old L0->L1 Picking Logic
+
+Previously, our logic for picking which L0 file to compact was the same as every other level: pick the largest file in the level. One special property of L0->L1 compaction is that files can overlap in the input level, so those overlapping files must be pulled in as well. For example, a compaction may look like this:
+
+![full-range.png](/static/images/compaction/full-range.png)
+
+This compaction pulls in every L0 and L1 file. This happens regardless of which L0 file is initially chosen as each file overlaps with every other file.
+
+Users may insert their data less uniformly in the key-range. For example, a database may look like this during L0->L1 compaction:
+
+![part-range-old.png](/static/images/compaction/part-range-old.png)
+
+Let's say the third file from the top is the largest, and let's say the top two files are created after the compaction started. When the compaction is picked, the fourth L0 file and six rightmost L1 files are pulled in due to overlap. Notice this leaves the database in a state where we might not be able to schedule parallel compactions. For example, if the sixth file from the top is the next largest, we can't compact it because it overlaps with the top two files, which overlap with the locked L0 files.
+
+We can now see the high-level problems with this approach more clearly. First, locked files in L0 or L1 prevent us from parallelizing compactions. When locked files block L0->L1 compaction, there is nothing we can do to eliminate L0 files. Second, L0->L1 compactions are relatively slow. As we saw, when keys are uniformly distributed, L0->L1 compacts two entire levels. While this is happening, new files are being flushed to L0, advancing towards the file count limit.
+
+### New L0->L0 Algorithm
+
+We introduced compaction within L0 to improve both parallelization and speed of reducing L0 file count. An L0->L0 compaction may look like this:
+
+![l1-l2-contend.png](/static/images/compaction/l1-l2-contend.png)
+
+Say the L1->L2 compaction started first. Now L0->L1 is prevented by the locked L1 file. In this case, we compact files within L0. This allows us to start the work for eliminating L0 files earlier. It also lets us do less work since we don't pull in any L1 files, whereas L0->L1 compaction would've pulled in all of them. This lets us quickly reduce L0 file count to keep read-amp low while sustaining large bursts of writes (i.e., fast accumulation of L0 files).
+
+The tradeoff is this increases total compaction work, as we're now compacting files without contributing towards our eventual goal of moving them towards lower levels. Our benchmarks, though, consistently show less compaction stalls and improved write throughput. One justification is that L0 file data is highly likely in page cache and/or block cache due to it being recently written and frequently accessed. So, this type of compaction is relatively cheap compared to compactions at lower levels.
+
+This feature is available since RocksDB 5.4.
+
+### New L0->L1 Picking Logic
+
+Recall how the old L0->L1 picking algorithm chose the largest L0 file for compaction. This didn't fit well with L0->L0 compaction, which operates on a span of files. That span begins at the newest L0 file, and expands towards older files as long as they're not being compacted. Since the largest file may be anywhere, the old L0->L1 picking logic could arbitrarily prevent us from getting a long span of files. See the second illustration in this post for a scenario where this would happen.
+
+So, we changed the L0->L1 picking algorithm to start from the oldest file and expand towards newer files as long as they're not being compacted. For example:
+
+![l0-l1-contend.png](/static/images/compaction/l0-l1-contend.png)
+
+Now, there can never be L0 files unreachable for L0->L0 due to L0->L1 selecting files in the middle. When longer spans of files are available for L0->L0, we perform less compaction work per deleted L0 file, thus improving efficiency.
+
+This feature will be available in RocksDB 5.7.
+
+### Performance Changes
+
+Mark Callaghan did the most extensive benchmarking of this feature's impact on MyRocks. See his results [here](http://smalldatum.blogspot.com/2017/05/innodb-myrocks-and-tokudb-on-insert.html). Note the primary change between his March 17 and April 14 builds is the latter performs L0->L0 compaction.
diff --git a/src/rocksdb/docs/_posts/2017-06-29-rocksdb-5-5-1-released.markdown b/src/rocksdb/docs/_posts/2017-06-29-rocksdb-5-5-1-released.markdown
new file mode 100644
index 000000000..d7856088b
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-06-29-rocksdb-5-5-1-released.markdown
@@ -0,0 +1,22 @@
+---
+title: RocksDB 5.5.1 Released!
+layout: post
+author: lightmark
+category: blog
+---
+
+### New Features
+* FIFO compaction to support Intra L0 compaction too with CompactionOptionsFIFO.allow_compaction=true.
+* Statistics::Reset() to reset user stats.
+* ldb add option --try_load_options, which will open DB with its own option file.
+* Introduce WriteBatch::PopSavePoint to pop the most recent save point explicitly.
+* Support dynamically change `max_open_files` option via SetDBOptions()
+* Added DB::CreateColumnFamilie() and DB::DropColumnFamilies() to bulk create/drop column families.
+* Add debugging function `GetAllKeyVersions` to see internal versions of a range of keys.
+* Support file ingestion with universal compaction style
+* Support file ingestion behind with option `allow_ingest_behind`
+* New option enable_pipelined_write which may improve write throughput in case writing from multiple threads and WAL enabled.
+
+### Bug Fixes
+* Fix the bug that Direct I/O uses direct reads for non-SST file
+* Fix the bug that flush doesn't respond to fsync result
diff --git a/src/rocksdb/docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown b/src/rocksdb/docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown
new file mode 100644
index 000000000..3b54ffd5a
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown
@@ -0,0 +1,22 @@
+---
+title: RocksDB 5.6.1 Released!
+layout: post
+author: yiwu
+category: blog
+---
+
+### Public API Change
+* Scheduling flushes and compactions in the same thread pool is no longer supported by setting `max_background_flushes=0`. Instead, users can achieve this by configuring their high-pri thread pool to have zero threads. See https://github.com/facebook/rocksdb/wiki/Thread-Pool for more details.
+* Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction.
+* options.delayed_write_rate by default take the value of options.rate_limiter rate.
+* Replace global variable `IOStatsContext iostats_context` with `IOStatsContext* get_iostats_context()`; replace global variable `PerfContext perf_context` with `PerfContext* get_perf_context()`.
+
+### New Features
+* Change ticker/histogram statistics implementations to use core-local storage. This improves aggregation speed compared to our previous thread-local approach, particularly for applications with many threads. See http://rocksdb.org/blog/2017/05/14/core-local-stats.html for more details.
+* Users can pass a cache object to write buffer manager, so that they can cap memory usage for memtable and block cache using one single limit.
+* Flush will be triggered when 7/8 of the limit introduced by write_buffer_manager or db_write_buffer_size is triggered, so that the hard threshold is hard to hit. See https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager for more details.
+* Introduce WriteOptions.low_pri. If it is true, low priority writes will be throttled if the compaction is behind. See https://github.com/facebook/rocksdb/wiki/Low-Priority-Write for more details.
+* `DB::IngestExternalFile()` now supports ingesting files into a database containing range deletions.
+
+### Bug Fixes
+* Shouldn't ignore return value of fsync() in flush.
diff --git a/src/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown b/src/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown
new file mode 100644
index 000000000..06e0bcb2f
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown
@@ -0,0 +1,37 @@
+---
+title: PinnableSlice; less memcpy with point lookups
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+The classic API for [DB::Get](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L310) receives a std::string as argument to which it will copy the value. The memcpy overhead could be non-trivial when the value is large. The [new API](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L322) receives a PinnableSlice instead, which avoids memcpy in most of the cases.
+
+### What is PinnableSlice?
+
+Similarly to Slice, PinnableSlice refers to some in-memory data so it does not incur the memcpy cost. To ensure that the data will not be erased while it is being processed by the user, PinnableSlice, as its name suggests, has the data pinned in memory. The pinned data are released when PinnableSlice object is destructed or when ::Reset is invoked explicitly on it.
+
+### How good is it?
+
+Here are the improvements in throughput for an [in-memory benchmark](https://github.com/facebook/rocksdb/pull/1756#issuecomment-286201693):
+* value 1k byte: 14%
+* value 10k byte: 34%
+
+### Any limitations?
+
+PinnableSlice tries to avoid memcpy as much as possible. The primary gain is when reading large values from the block cache. There are however cases that it would still have to copy the data into its internal buffer. The reason is mainly the complexity of implementation and if there is enough motivation on the application side. the scope of PinnableSlice could be extended to such cases too. These include:
+* Merged values
+* Reads from memtables
+
+### How to use it?
+
+```cpp
+PinnableSlice pinnable_val;
+while (!stopped) { 
+   auto s = db->Get(opt, cf, key, &pinnable_val);
+   // ... use it
+   pinnable_val.Reset(); // then release it immediately
+}
+```
+
+You can also [initialize the internal buffer](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L314) of PinnableSlice by passing your own string in the constructor. [simple_example.cc](https://github.com/facebook/rocksdb/blob/main/examples/simple_example.cc) demonstrates that with more examples.
diff --git a/src/rocksdb/docs/_posts/2017-08-25-flushwal.markdown b/src/rocksdb/docs/_posts/2017-08-25-flushwal.markdown
new file mode 100644
index 000000000..751fe5249
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-08-25-flushwal.markdown
@@ -0,0 +1,26 @@
+---
+title: FlushWAL; less fwrite, faster writes
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+When `DB::Put` is called, the data is written to both memtable (to be flushed to SST files later) and the WAL (write-ahead log) if it is enabled. In the case of a crash, RocksDB can recover as much as the memtable state that is reflected into the WAL. By default RocksDB automatically flushes the WAL from the application memory to the OS buffer after each `::Put`. It however can be configured to perform the flush manually after an explicit call to `::FlushWAL`. Not doing fwrite syscall after each `::Put` offers a tradeoff between reliability and write latency for the general case. As we explain below, some applications such as MyRocks benefit from this API to gain higher write throughput with however no compromise in reliability.
+
+### How much is the gain?
+
+Using `::FlushWAL` API along with setting `DBOptions.concurrent_prepare`, MyRocks achieves 40% higher throughput in Sysbench's [update-nonindex](https://github.com/akopytov/sysbench/blob/master/src/lua/oltp_update_non_index.lua) benchmark.
+
+### Write, Flush, and Sync
+
+The write to the WAL is first written to the application memory buffer. The buffer in the next step is "flushed" to OS buffer by calling fwrite syscall. The OS buffer is later "synced" to the persistent storage. The data in the OS buffer, although not persisted yet, will survive the application crash. By default, the flush occurs automatically upon each call to `DB::Put` or `DB::Write`. The user can additionally request sync after each write by setting `WriteOptions::sync`.
+
+### FlushWAL API
+
+The user can turn off the automatic flush of the WAL by setting `DBOptions::manual_wal_flush`. In that case, the WAL buffer is flushed when it is either full or `DB::FlushWAL` is called by the user. The API also accepts a boolean argument should we want to sync right after the flush: `::FlushWAL(true)`.
+
+### Success story: MyRocks
+
+Some applications that use RocksDB, already have other machinsims in place to provide reliability. MySQL for example uses 2PC (two-phase commit) to write to both binlog as well as the storage engine such as InnoDB and MyRocks. The group commit logic in MySQL allows the 1st phase (Prepare) to be run in parallel but after a commit group is formed performs the 2nd phase (Commit) in a serial manner. This makes low commit latency in the storage engine essential for achieving high throughput. The commit in MyRocks includes writing to the RocksDB WAL, which as explaiend above, by default incures the latency of flushing the WAL new appends to the OS buffer.
+
+Since binlog helps in recovering from some failure scenarios, MySQL can provide reliability without however needing a storage WAL flush after each individual commit. MyRocks benefits from this property, disables automatic WAL flush in RocksDB, and manually calls `::FlushWAL` when requested by MySQL.
diff --git a/src/rocksdb/docs/_posts/2017-09-28-rocksdb-5-8-released.markdown b/src/rocksdb/docs/_posts/2017-09-28-rocksdb-5-8-released.markdown
new file mode 100644
index 000000000..a22dcaa1c
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-09-28-rocksdb-5-8-released.markdown
@@ -0,0 +1,25 @@
+---
+title: RocksDB 5.8 Released!
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+### Public API Change
+* Users of `Statistics::getHistogramString()` will see fewer histogram buckets and different bucket endpoints.
+* `Slice::compare` and BytewiseComparator `Compare` no longer accept `Slice`s containing nullptr.
+* `Transaction::Get` and `Transaction::GetForUpdate` variants with `PinnableSlice` added.
+
+### New Features
+* Add Iterator::Refresh(), which allows users to update the iterator state so that they can avoid some initialization costs of recreating iterators.
+* Replace dynamic_cast<> (except unit test) so people can choose to build with RTTI off. With make, release mode is by default built with -fno-rtti and debug mode is built without it. Users can override it by setting USE_RTTI=0 or 1.
+* Universal compactions including the bottom level can be executed in a dedicated thread pool. This alleviates head-of-line blocking in the compaction queue, which cause write stalling, particularly in multi-instance use cases. Users can enable this feature via `Env::SetBackgroundThreads(N, Env::Priority::BOTTOM)`, where `N > 0`.
+* Allow merge operator to be called even with a single merge operand during compactions, by appropriately overriding `MergeOperator::AllowSingleOperand`.
+* Add `DB::VerifyChecksum()`, which verifies the checksums in all SST files in a running DB.
+* Block-based table support for disabling checksums by setting `BlockBasedTableOptions::checksum = kNoChecksum`.
+
+### Bug Fixes
+* Fix wrong latencies in `rocksdb.db.get.micros`, `rocksdb.db.write.micros`, and `rocksdb.sst.read.micros`.
+* Fix incorrect dropping of deletions during intra-L0 compaction.
+* Fix transient reappearance of keys covered by range deletions when memtable prefix bloom filter is enabled.
+* Fix potentially wrong file smallest key when range deletions separated by snapshot are written together.
diff --git a/src/rocksdb/docs/_posts/2017-12-18-17-auto-tuned-rate-limiter.markdown b/src/rocksdb/docs/_posts/2017-12-18-17-auto-tuned-rate-limiter.markdown
new file mode 100644
index 000000000..d2e6204e1
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-12-18-17-auto-tuned-rate-limiter.markdown
@@ -0,0 +1,28 @@
+---
+title: Auto-tuned Rate Limiter
+layout: post
+author: ajkr
+category: blog
+---
+
+### Introduction
+
+Our rate limiter has been hard to configure since users need to pick a value that is low enough to prevent background I/O spikes, which can impact user-visible read/write latencies. Meanwhile, picking too low a value can cause memtables and L0 files to pile up, eventually leading to writes stalling. Tuning the rate limiter has been especially difficult for users whose DB instances have different workloads, or have workloads that vary over time, or commonly both.
+
+To address this, in RocksDB 5.9 we released a dynamic rate limiter that adjusts itself over time according to demand for background I/O. It can be enabled simply by passing `auto_tuned=true` in the `NewGenericRateLimiter()` call. In this case `rate_bytes_per_sec` will indicate the upper-bound of the window within which a rate limit will be picked dynamically. The chosen rate limit will be much lower unless absolutely necessary, so setting this to the device's maximum throughput is a reasonable choice on dedicated hosts.
+
+### Algorithm
+
+We use a simple multiplicative-increase, multiplicative-decrease algorithm. We measure demand for background I/O as the ratio of intervals where the rate limiter is drained. There are low and high watermarks for this ratio, which will trigger a change in rate limit when breached. The rate limit can move within a window bounded by the user-specified upper-bound, and a lower-bound that we derive internally. Users can expect this lower bound to be 1-2 orders of magnitude less than the provided upper-bound (so don't provide INT64_MAX as your upper-bound), although it's subject to change.
+
+### Benchmark Results
+
+Data is ingested at 10MB/s and the rate limiter was created with 1000MB/s as its upper bound. The dynamically chosen rate limit hovers around 125MB/s. The other clustering of points at 50MB/s is due to number of compaction threads being reduced to one when there's no compaction pressure. 
+
+![](/static/images/rate-limiter/write-KBps-series.png)
+
+![](/static/images/rate-limiter/auto-tuned-write-KBps-series.png)
+
+The following graph summarizes the above two time series graphs in CDF form. In particular, notice the p90 - p100 for background write rate are significantly lower with auto-tuned rate limiter enabled.
+
+![](/static/images/rate-limiter/write-KBps-cdf.png)
diff --git a/src/rocksdb/docs/_posts/2017-12-19-write-prepared-txn.markdown b/src/rocksdb/docs/_posts/2017-12-19-write-prepared-txn.markdown
new file mode 100644
index 000000000..439b3f83c
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2017-12-19-write-prepared-txn.markdown
@@ -0,0 +1,41 @@
+---
+title: WritePrepared Transactions
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+RocksDB supports both optimistic and pessimistic concurrency controls. The pessimistic transactions make use of locks to provide isolation between the transactions. The default write policy in pessimistic transactions is _WriteCommitted_, which means that the data is written to the DB, i.e., the memtable, only after the transaction is committed. This policy simplified the implementation but came with some limitations in throughput, transaction size, and variety in supported isolation levels. In the below, we explain these in detail and present the other write policies, _WritePrepared_ and _WriteUnprepared_. We then dive into the design of _WritePrepared_ transactions.
+
+### WriteCommitted, Pros and Cons
+
+With _WriteCommitted_ write policy, the data is written to the memtable only after the transaction commits. This greatly simplifies the read path as any data that is read by other transactions can be assumed to be committed. This write policy, however, implies that the writes are buffered in memory in the meanwhile. This makes memory a bottleneck for large transactions. The delay of the commit phase in 2PC (two-phase commit) also becomes noticeable since most of the work, i.e., writing to memtable, is done at the commit phase. When the commit of multiple transactions are done in a serial fashion, such as in 2PC implementation of MySQL, the lengthy commit latency becomes a major contributor to lower throughput. Moreover this write policy cannot provide weaker isolation levels, such as READ UNCOMMITTED, that could potentially provide higher throughput for some applications.
+
+### Alternatives: _WritePrepared_ and _WriteUnprepared_
+
+To tackle the lengthy commit issue, we should do memtable writes at earlier phases of 2PC so that the commit phase become lightweight and fast. 2PC is composed of Write stage, where the transaction `::Put` is invoked, the prepare phase, where `::Prepare` is invoked (upon which the DB promises to commit the transaction if later is requested), and commit phase, where `::Commit` is invoked and the transaction writes become visible to all readers. To make the commit phase lightweight, the memtable write could be done at either `::Prepare` or `::Put` stages, resulting into _WritePrepared_ and _WriteUnprepared_ write policies respectively. The downside is that when another transaction is reading data, it would need a way to tell apart which data is committed, and if they are, whether they are committed before the transaction's start, i.e., in the read snapshot of the transaction. _WritePrepared_ would still have the issue of buffering the data, which makes the memory the bottleneck for large transactions. It however provides a good milestone for transitioning from _WriteCommitted_ to _WriteUnprepared_ write policy. Here we explain the design of _WritePrepared_ policy. We will cover the changes that make the design to also supported _WriteUnprepared_ in an upcoming post.
+
+### _WritePrepared_ in a nutshell
+
+These are the primary design questions that needs to be addressed:
+1) How do we identify the key/values in the DB with transactions that wrote them?
+2) How do we figure if a key/value written by transaction Txn_w is in the read snapshot of the reading transaction Txn_r?
+3) How do we rollback the data written by aborted transactions?
+
+With _WritePrepared_, a transaction still buffers the writes in a write batch object in memory. When 2PC `::Prepare` is called, it writes the in-memory write batch to the WAL (write-ahead log) as well as to the memtable(s) (one memtable per column family); We reuse the existing notion of sequence numbers in RocksDB to tag all the key/values in the same write batch with the same sequence number, `prepare_seq`, which is also used as the identifier for the transaction. At commit time, it writes a commit marker to the WAL, whose sequence number, `commit_seq`, will be used as the commit timestamp of the transaction. Before releasing the commit sequence number to the readers, it stores a mapping from `prepare_seq` to `commit_seq` in an in-memory data structure that we call _CommitCache_. When a transaction reading values from the DB (tagged with `prepare_seq`) it makes use of the _CommitCache_ to figure if `commit_seq` of the value is in its read snapshot. To rollback an aborted transaction, we apply the status before the transaction by making another write that cancels out the writes of the aborted transaction.
+
+The _CommitCache_ is a lock-free data structure that caches the recent commit entries. Looking up the entries in the cache must be enough for almost all th transactions that commit in a timely manner. When evicting the older entries from the cache, it still maintains some other data structures to cover the corner cases for transactions that takes abnormally too long to finish. We will cover them in the design details below.
+
+### Benchmark Results
+Here we presents the improvements observed in MyRocks with sysbench and linkbench:
+* benchmark...........tps.........p95 latency....cpu/query
+* insert...................68%
+* update-noindex...30%......38%
+* update-index.......61%.......28%
+* read-write............6%........3.5%
+* read-only...........-1.2%.....-1.8%
+* linkbench.............1.9%......+overall........0.6%
+
+Here are also the detailed results for [In-Memory Sysbench](https://gist.github.com/maysamyabandeh/bdb868091b2929a6d938615fdcf58424) and [SSD Sysbench](https://gist.github.com/maysamyabandeh/ff94f378ab48925025c34c47eff99306) curtesy of [@mdcallag](https://github.com/mdcallag).
+
+Learn more [here](https://github.com/facebook/rocksdb/wiki/WritePrepared-Transactions).
diff --git a/src/rocksdb/docs/_posts/2018-02-05-rocksdb-5-10-2-released.markdown b/src/rocksdb/docs/_posts/2018-02-05-rocksdb-5-10-2-released.markdown
new file mode 100644
index 000000000..9f32d3f94
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2018-02-05-rocksdb-5-10-2-released.markdown
@@ -0,0 +1,22 @@
+---
+title: RocksDB 5.10.2 Released!
+layout: post
+author: siying
+category: blog
+---
+
+### Public API Change
+* When running `make` with environment variable `USE_SSE` set and `PORTABLE` unset, will use all machine features available locally. Previously this combination only compiled SSE-related features.
+
+### New Features
+* CRC32C is now using the 3-way pipelined SSE algorithm `crc32c_3way` on supported platforms to improve performance. The system will choose to use this algorithm on supported platforms automatically whenever possible. If PCLMULQDQ is not supported it will fall back to the old Fast_CRC32 algorithm.
+* Provide lifetime hints when writing files on Linux. This reduces hardware write-amp on storage devices supporting multiple streams.
+* Add a DB stat, `NUMBER_ITER_SKIP`, which returns how many internal keys were skipped during iterations (e.g., due to being tombstones or duplicate versions of a key).
+* Add PerfContext counters, `key_lock_wait_count` and `key_lock_wait_time`, which measure the number of times transactions wait on key locks and total amount of time waiting.
+
+### Bug Fixes
+* Fix IOError on WAL write doesn't propagate to write group follower
+* Make iterator invalid on merge error.
+* Fix performance issue in `IngestExternalFile()` affecting databases with large number of SST files.
+* Fix possible corruption to LSM structure when `DeleteFilesInRange()` deletes a subset of files spanned by a `DeleteRange()` marker.
+* Fix DB::Flush() keep waiting after flush finish under certain condition.
diff --git a/src/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown b/src/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown
new file mode 100644
index 000000000..ff9b1e464
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown
@@ -0,0 +1,58 @@
+---
+title: Rocksdb Tuning Advisor
+layout: post
+author: poojam23
+category: blog
+---
+
+The performance of Rocksdb is contingent on its tuning. However, because
+of the complexity of its underlying technology and a large number of
+configurable parameters, a good configuration is sometimes hard to obtain. The aim of
+the python command-line tool, Rocksdb Advisor, is to automate the process of
+suggesting improvements in the configuration based on advice from Rocksdb
+experts.
+
+### Overview
+
+Experts share their wisdom as rules comprising of conditions and suggestions in the INI format (refer
+[rules.ini](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rules.ini)).
+Users provide the Rocksdb configuration that they want to improve upon (as the
+familiar Rocksdb OPTIONS file —
+[example](https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini))
+and the path of the file which contains Rocksdb logs and statistics.
+The [Advisor](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser_example.py)
+creates appropriate DataSource objects (for Rocksdb
+[logs](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_log_parser.py),
+[options](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_options_parser.py),
+[statistics](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_stats_fetcher.py) etc.)
+and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser.py).
+The Rules uses rules from experts to parse data-sources and trigger appropriate rules.
+The Advisor's output gives information about which rules were triggered,
+why they were triggered and what each of them suggests. Each suggestion
+provided by a triggered rule advises some action on a Rocksdb
+configuration option, for example, increase CFOptions.write_buffer_size,
+set bloom_bits to 2 etc.
+
+### Usage
+
+An example command to run the tool:
+
+```shell
+cd rocksdb/tools/advisor
+python3 -m advisor.rule_parser_example --rules_spec=advisor/rules.ini --rocksdb_options=test/input_files/OPTIONS-000005 --log_files_path_prefix=test/input_files/LOG-0 --stats_dump_period_sec=20
+```
+
+Sample output where a Rocksdb log-based rule has been triggered :
+
+```shell
+Rule: stall-too-many-memtables
+LogCondition: stall-too-many-memtables regex: Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+Suggestion: inc-bg-flush option : DBOptions.max_background_flushes action : increase suggested_values : ['2']
+Suggestion: inc-write-buffer option : CFOptions.max_write_buffer_number action : increase
+scope: col_fam:
+{'default'}
+```
+
+### Read more
+
+For more information, refer to [advisor](https://github.com/facebook/rocksdb/tree/main/tools/advisor/README.md).
diff --git a/src/rocksdb/docs/_posts/2018-08-23-data-block-hash-index.markdown b/src/rocksdb/docs/_posts/2018-08-23-data-block-hash-index.markdown
new file mode 100644
index 000000000..c4b24ec2a
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2018-08-23-data-block-hash-index.markdown
@@ -0,0 +1,118 @@
+---
+title: Improving Point-Lookup Using Data Block Hash Index
+layout: post
+author: fgwu
+category: blog
+---
+We've designed and implemented a _data block hash index_ in RocksDB that has the benefit of both reducing the CPU util and increasing the throughput for point lookup queries with a reasonable and tunable space overhead. 
+
+Specifially, we append a compact hash table to the end of the data block for efficient indexing. It is backward compatible with the data base created without this feature. After turned on the hash index feature, existing data will be gradually converted to the hash index format.
+
+Benchmarks with `db_bench`  show the CPU utilization of one of the main functions in the point lookup code path, `DataBlockIter::Seek()`, is reduced by 21.8%, and the overall RocksDB throughput is increased by 10% under purely cached workloads, at an overhead of 4.6% more space. Shadow testing with Facebook production traffic shows good CPU improvements too.
+
+
+### How to use it
+Two new options are added as part of this feature: `BlockBasedTableOptions::data_block_index_type` and `BlockBasedTableOptions::data_block_hash_table_util_ratio`.
+
+The hash index is disabled by default unless `BlockBasedTableOptions::data_block_index_type` is set to `data_block_index_type = kDataBlockBinaryAndHash`. The hash table utilization ratio is adjustable using `BlockBasedTableOptions::data_block_hash_table_util_ratio`, which is valid only if `data_block_index_type = kDataBlockBinaryAndHash`.
+
+
+```
+// the definitions can be found in include/rocksdb/table.h
+
+// The index type that will be used for the data block.
+enum DataBlockIndexType : char {
+  kDataBlockBinarySearch = 0,  // traditional block type
+  kDataBlockBinaryAndHash = 1, // additional hash index
+};
+
+// Set to kDataBlockBinaryAndHash to enable hash index
+DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;
+
+// #entries/#buckets. It is valid only when data_block_hash_index_type is
+// kDataBlockBinaryAndHash.
+double data_block_hash_table_util_ratio = 0.75;
+
+```
+
+
+### Data Block Hash Index Design
+
+Current data block format groups adjacent keys together as a restart interval. One block consists of multiple restart intervals. The byte offset of the beginning of each restart interval, i.e. a restart point, is stored in an array called restart interval index or binary seek index. RocksDB does a binary search when performing point lookup for keys in data blocks to find the right restart interval the key may reside. We will use binary seek and binary search interchangeably in this post.
+
+In order to find the right location where the key may reside using binary search, multiple key parsing and comparison are needed. Each binary search branching triggers CPU cache miss, causing much CPU utilization. We have seen that this binary search takes up considerable CPU in production use-cases.
+
+![](/static/images/data-block-hash-index/block-format-binary-seek.png)
+
+We implemented a hash map at the end of the block to index the key to reduce the CPU overhead of the binary search. The hash index is just an array of pointers pointing into the binary seek index.
+
+![](/static/images/data-block-hash-index/block-format-hash-index.png)
+
+
+Each array element is considered as a hash bucket when storing the location of a key (or more precisely, the restart index of the restart interval where the key resides). When multiple keys happen to hash into the same bucket (hash collision), we just mark the bucket as “collision”. So that when later querying on that key, the hash table lookup knows that there was a hash collision happened so it can fall back to the traditional binary search to find the location of the key.
+
+We define hash table utilization ratio as the #keys/#buckets. If a utilization ratio is 0.5 and there are 100 buckets, 50 keys are stored in the bucket. The less the util ratio, the less hash collision, and the less chance for a point lookup falls back to binary seek (fall back ratio) due to the collision. So a small util ratio has more benefit to reduce the CPU time but introduces more space overhead.
+
+Space overhead depends on the util ratio. Each bucket is a `uint8_t`  (i.e. one byte). For a util ratio of 1, the space overhead is 1Byte per key, the fall back ratio observed is ~52%.
+
+![](/static/images/data-block-hash-index/hash-index-data-structure.png)
+
+### Things that Need Attention
+
+**Customized Comparator**
+
+Hash index will hash different keys (keys with different content, or byte sequence) into different hash values. This assumes the comparator will not treat different keys as equal if they have different content. 
+
+The default bytewise comparator orders the keys in alphabetical order and works well with hash index, as different keys will never be regarded as equal. However, some specially crafted comparators will do. For example, say, a `StringToIntComparator` can convert a string into an integer, and use the integer to perform the comparison. Key string “16” and “0x10” is equal to each other as seen by this `StringToIntComparator`, but they probably hash to different value. Later queries to one form of the key will not be able to find the existing key been stored in the other format.
+
+We add a new function member to the comparator interface: 
+
+```
+virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; }
+```
+
+
+Every comparator implementation should override this function and specify the behavior of the comparator. If a comparator can regard different keys equal, the function returns true, and as a result the hash index feature will not be enabled, and vice versa.
+
+NOTE: to use the hash index feature, one should 1) have a comparator that can never treat different keys as equal; and 2) override the `CanKeysWithDifferentByteContentsBeEqual()` function to return `false`, so the hash index can be enabled.
+
+
+**Util Ratio's Impact on Data Block Cache**
+
+Adding the hash index to the end of the data block essentially takes up the data block cache space, making the effective data block cache size smaller and increasing the data block cache miss ratio. Therefore, a very small util ratio will result in a large data block cache miss ratio, and the extra I/O may drag down the throughput gain achieved by the hash index lookup. Besides, when compression is enabled, cache miss also incurs data block decompression, which is CPU-consuming. Therefore the CPU may even increase if using a too small util ratio. The best util ratio depends on workloads, cache to data ratio, disk bandwidth/latency etc. In our experiment, we found util ratio = 0.5 ~ 1 is a good range to explore that brings both CPU and throughput gains.
+
+
+### Limitations
+
+As we use `uint8_t` to store binary seek index, i.e. restart interval index, the total number of restart intervals cannot be more than 253 (we reserved  255 and 254 as special flags). For blocks having a larger number of restart intervals, the hash index will not be created and the point lookup will be done by traditional binary seek.
+
+Data block hash index only supports point lookup. We do not support range lookup. Range lookup request will fall back to BinarySeek.
+
+RocksDB supports many types of records, such as `Put`, `Delete`, `Merge`, etc (visit [here](https://github.com/facebook/rocksdb/wiki/rocksdb-basics) for more information). Currently we only support `Put` and `Delete`, but not `Merge`. Internally we have a limited set of supported record types:
+
+
+```
+kPutRecord,          <=== supported
+kDeleteRecord,       <=== supported
+kSingleDeleteRecord, <=== supported
+kTypeBlobIndex,      <=== supported
+```
+
+For records not supported, the searching process will fall back to the traditional binary seek. 
+
+
+
+### Evaluation
+To evaluate the CPU util reduction and isolate other factors such as disk I/O and block decompression, we first evaluate the hash idnex in a purely cached workload. We observe that the CPU utilization of one of the main functions in the point lookup code path, DataBlockIter::Seek(), is reduced by 21.8% and the overall throughput is increased by 10% at an overhead of 4.6% more space.
+
+However, general worload is not always purely cached. So we also evaluate the performance under different cache space pressure. In the following test, we use `db_bench` with RocksDB deployed on SSDs. The total DB size is 5~6GB, and it is about 14GB if decompressed. Different block cache sizes are used, ranging from 14GB down to 2GB, with an increasing cache miss ratio.
+
+Orange bars are representing our hash index performance. We use a hash util ratio of 1.0 in this test. Block size are set to 16KiB with the restart interval as 16.
+
+![](/static/images/data-block-hash-index/perf-throughput.png)
+![](/static/images/data-block-hash-index/perf-cache-miss.png)
+
+We can see that if cache size is greater than 8GB, hash index can bring throughput gain. Cache size greater than 8GB can be translated to a cache miss ratio smaller than 40%. So if the workload has a cache miss ratio smaller than 40%, hash index is able to increase the throughput.
+
+Besides, shadow testing with Facebook production traffic shows good CPU improvements too.
+
diff --git a/src/rocksdb/docs/_posts/2018-11-21-delete-range.markdown b/src/rocksdb/docs/_posts/2018-11-21-delete-range.markdown
new file mode 100644
index 000000000..96fc3562d
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2018-11-21-delete-range.markdown
@@ -0,0 +1,292 @@
+---
+title: "DeleteRange: A New Native RocksDB Operation"
+layout: post
+author:
+- abhimadan
+- ajkr
+category: blog
+---
+## Motivation
+
+### Deletion patterns in LSM
+
+Deleting a range of keys is a common pattern in RocksDB. Most systems built on top of
+RocksDB have multi-component key schemas, where keys sharing a common prefix are
+logically related. Here are some examples.
+
+MyRocks is a MySQL fork using RocksDB as its storage engine. Each key's first
+four bytes identify the table or index to which that key belongs. Thus dropping
+a table or index involves deleting all the keys with that prefix.
+
+Rockssandra is a Cassandra variant that uses RocksDB as its storage engine. One
+of its admin tool commands, `nodetool cleanup`, removes key-ranges that have been migrated
+to other nodes in the cluster.
+
+Marketplace uses RocksDB to store product data. Its key begins with product ID,
+and it stores various data associated with the product in separate keys. When a
+product is removed, all these keys must be deleted.
+
+When we decide what to improve, we try to find a use case that's common across
+users, since we want to build a generally useful system, not one that has many
+one-off features for individual users. The range deletion pattern is common as
+illustrated above, so from this perspective it's a good target for optimization.
+
+### Existing mechanisms: challenges and opportunities
+
+The most common pattern we see is scan-and-delete, i.e., advance an iterator
+through the to-be-deleted range, and issue a `Delete` for each key. This is
+slow (involves read I/O) so cannot be done in any critical path. Additionally,
+it creates many tombstones, which slows down iterators and doesn't offer a deadline
+for space reclamation.
+
+Another common pattern is using a custom compaction filter that drops keys in
+the deleted range(s). This deletes the range asynchronously, so cannot be used
+in cases where readers must not see keys in deleted ranges. Further, it has the
+disadvantage of outputting tombstones to all but the bottom level. That's
+because compaction cannot detect whether dropping a key would cause an older
+version at a lower level to reappear.
+
+If space reclamation time is important, or it is important that the deleted
+range not affect iterators, the user can trigger `CompactRange` on the deleted
+range. This can involve arbitrarily long waits in the compaction queue, and
+increases write-amp. By the time it's finished, however, the range is completely
+gone from the LSM.
+
+`DeleteFilesInRange` can be used prior to compacting the deleted range as long
+as snapshot readers do not need to access them. It drops files that are
+completely contained in the deleted range. That saves write-amp because, in
+`CompactRange`, the file data would have to be rewritten several times before it
+reaches the bottom of the LSM, where tombstones can finally be dropped.
+
+In addition to the above approaches having various drawbacks, they are quite
+complicated to reason about and implement. In an ideal world, deleting a range
+of keys would be (1) simple, i.e., a single API call; (2) synchronous, i.e.,
+when the call finishes, the keys are guaranteed to be wiped from the DB; (3) low
+latency so it can be used in critical paths; and (4) a first-class operation
+with all the guarantees of any other write, like atomicity, crash-recovery, etc.
+
+## v1: Getting it to work
+
+### Where to persist them?
+
+The first place we thought about storing them is inline with the data blocks.
+We could not think of a good way to do it, however, since the start of a range
+tombstone covering a key could be anywhere, making binary search impossible.
+So, we decided to investigate segregated storage.
+
+A second solution we considered is appending to the manifest. This file is
+append-only, periodically compacted, and stores metadata like the level to which
+each SST belongs. This is tempting because it leverages an existing file, which
+is maintained in the background and fully read when the DB is opened. However,
+it conceptually violates the manifest's purpose, which is to store metadata. It
+also has no way to detect when a range tombstone no longer covers anything and
+is droppable. Further, it'd be possible for keys above a range tombstone to disappear
+when they have their seqnums zeroed upon compaction to the bottommost level.
+
+A third candidate is using a separate column family. This has similar problems
+to the manifest approach. That is, we cannot easily detect when a range
+tombstone is obsolete, and seqnum zeroing can cause a key
+to go from above a range tombstone to below, i.e., disappearing. The upside is
+we can reuse logic for memory buffering, consistent reads/writes, etc.
+
+The problems with the second and third solutions indicate a need for range
+tombstones to be aware of flush/compaction. An easy way to achieve this is put
+them in the SST files themselves - but not in the data blocks, as explained for
+the first solution. So, we introduced a separate meta-block for range tombstones.
+This resolved the problem of when to obsolete range tombstones, as it's simple:
+when they're compacted to the bottom level. We also reused the LSM invariants
+that newer versions of a key are always in a higher level to prevent the seqnum
+zeroing problem. This approach has the side benefit of constraining the range
+tombstones seen during reads to ones in a similar key-range.
+
+![](/static/images/delrange/delrange_sst_blocks.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+*When there are range tombstones in an SST, they are segregated in a separate meta-block*
+{: style="text-align: center"}
+
+![](/static/images/delrange/delrange_key_schema.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+*Logical range tombstones (left) and their corresponding physical key-value representation (right)*
+{: style="text-align: center"}
+
+### Write path
+
+`WriteBatch` stores range tombstones in its buffer which are logged to the WAL and
+then applied to a dedicated range tombstone memtable during `Write`. Later in
+the background the range tombstone memtable and its corresponding data memtable
+are flushed together into a single SST with a range tombstone meta-block. SSTs
+periodically undergo compaction which rewrites SSTs with point data and range
+tombstones dropped or merged wherever possible.
+
+We chose to use a dedicated memtable for range tombstones. The memtable
+representation is always skiplist in order to minimize overhead in the usual
+case, which is the memtable contains zero or a small number of range tombstones.
+The range tombstones are segregated to a separate memtable for the same reason
+we segregated range tombstones in SSTs. That is, we did not know how to
+interleave the range tombstone with point data in a way that we would be able to
+find it for arbitrary keys that it covers.
+
+![](/static/images/delrange/delrange_write_path.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 70%"}
+
+*Lifetime of point keys and range tombstones in RocksDB*
+{: style="text-align: center"}
+
+During flush and compaction, we chose to write out all non-obsolete range
+tombstones unsorted. Sorting by a single dimension is easy to implement, but
+doesn't bring asymptotic improvement to queries over range data. Ideally, we
+want to store skylines (see “Read Path” subsection below) computed over our ranges so we can binary search.
+However, a couple of concerns cause doing this in flush and compaction to feel
+unsatisfactory: (1) we need to store multiple skylines, one for each snapshot,
+which further complicates the range tombstone meta-block encoding; and (2) even
+if we implement this, the range tombstone memtable still needs to be linearly
+scanned. Given these concerns we decided to defer collapsing work to the read
+side, hoping a good caching strategy could optimize this at some future point.
+
+
+### Read path
+
+In point lookups, we aggregate range tombstones in an unordered vector as we
+search through live memtable, immutable memtables, and then SSTs. When a key is
+found that matches the lookup key, we do a scan through the vector, checking
+whether the key is deleted.
+
+In iterators, we aggregate range tombstones into a skyline as we visit live
+memtable, immutable memtables, and SSTs. The skyline is expensive to construct but fast to determine whether a key is covered. The skyline keeps track of the most recent range tombstone found to optimize `Next` and `Prev`.
+
+|![](/static/images/delrange/delrange_uncollapsed.png)	|![](/static/images/delrange/delrange_collapsed.png)	|
+
+*([Image source: Leetcode](https://leetcode.com/problems/the-skyline-problem/description/)) The skyline problem involves taking building location/height data in the
+unsearchable form of A and converting it to the form of B, which is
+binary-searchable. With overlapping range tombstones, to achieve efficient
+searching we need to solve an analogous problem, where the x-axis is the
+key-space and the y-axis is the sequence number.*
+{: style="text-align: center"}
+
+### Performance characteristics
+
+For the v1 implementation, writes are much faster compared to the scan and
+delete (optionally within a transaction) pattern. `DeleteRange` only logs to WAL
+and applies to memtable. Logging to WAL always `fflush`es, and optionally
+`fsync`s or `fdatasync`s. Applying to memtable is always an in-memory operation.
+Since range tombstones have a dedicated skiplist memtable, the complexity of inserting is O(log(T)), where T is the number of existing buffered range tombstones.
+
+Reading in the presence of v1 range tombstones, however, is much slower than reads
+in a database where scan-and-delete has happened, due to the linear scan over
+range tombstone memtables/meta-blocks.
+
+Iterating in a database with v1 range tombstones is usually slower than in a
+scan-and-delete database, although the gap lessens as iterations grow longer.
+When an iterator is first created and seeked, we construct a skyline over its
+tombstones. This operation is O(T\*log(T)) where T is the number of tombstones
+found across live memtable, immutable memtable, L0 files, and one file from each
+of the L1+ levels. However, moving the iterator forwards or backwards is simply
+a constant-time operation (excluding edge cases, e.g., many range tombstones
+between consecutive point keys).
+
+## v2: Making it fast
+
+`DeleteRange`’s negative impact on read perf is a barrier to its adoption. The
+root cause is range tombstones are not stored or cached in a format that can be
+efficiently searched. We needed to design DeleteRange so that we could maintain
+write performance while making read performance competitive with workarounds
+used in production (e.g., scan-and-delete).
+
+### Representations
+
+The key idea of the redesign is that, instead of globally collapsing range tombstones,
+ we can locally “fragment” them for each SST file and memtable to guarantee that:
+
+* no range tombstones overlap; and
+* range tombstones are ordered by start key.
+
+Combined, these properties make range tombstones binary searchable. This
+ fragmentation will happen on the read path, but unlike the previous design, we can
+ easily cache many of these range tombstone fragments on the read path.
+
+### Write path
+
+The write path remains unchanged.
+
+### Read path
+
+When an SST file is opened, its range tombstones are fragmented and cached. For point
+ lookups, we binary search each file's fragmented range tombstones for one that covers
+ the lookup key. Unlike the old design, once we find a tombstone, we no longer need to
+ search for the key in lower levels, since we know that any keys on those levels will be
+ covered (though we do still check the current level since there may be keys written after
+ the range tombstone).
+
+For range scans, we create iterators over all the fragmented range
+ tombstones and store them in a list, seeking each one to cover the start key of the range
+ scan (if possible), and query each encountered key in this structure as in the old design,
+ advancing range tombstone iterators as necessary. In effect, we implicitly create a skyline.
+ This requires significantly less work on iterator creation, but since each memtable/SST has
+its own range tombstone iterator, querying range tombstones requires key comparisons (and
+possibly iterator increments) for several iterators (as opposed to v1, where we had a global
+collapsed representation of all range tombstones). As a result, very long range scans may become
+ slower than before, but short range scans are an order of magnitude faster, which are the
+ more common class of range scan.
+
+## Benchmarks
+
+To understand the performance of this new design, we used `db_bench` to compare point lookup, short range scan,
+ and long range scan performance across:
+
+* the v1 DeleteRange design,
+* the scan-and-delete workaround, and
+* the v2 DeleteRange design.
+
+In these benchmarks, we used a database with 5 million data keys, and 10000 range tombstones (ignoring
+those dropped during compaction) that were written in regular intervals after 4.5 million data keys were written.
+Writing the range tombstones ensures that most of them are not compacted away, and we have more tombstones
+in higher levels that cover keys in lower levels, which allows the benchmarks to exercise more interesting behavior
+when reading deleted keys.
+
+Point lookup benchmarks read 100000 keys from a database using `readwhilewriting`. Range scan benchmarks used
+`seekrandomwhilewriting` and seeked 100000 times, and advanced up to 10 keys away from the seek position for short range scans, and advanced up to 1000 keys away from the seek position for long range scans.
+
+The results are summarized in the tables below, averaged over 10 runs (note the
+different SHAs for v1 benchmarks are due to a new `db_bench` flag that was added in order to compare performance with databases with no tombstones; for brevity, those results are not reported here). Also note that the block cache was large enough to hold the entire db, so the large throughput is due to limited I/Os and little time spent on decompression. The range tombstone blocks are always pinned uncompressed in memory. We believe these setup details should not affect relative performance between versions.
+
+### Point Lookups
+
+|Name	|SHA	|avg micros/op	|avg ops/sec	|
+|v1	|35cd754a6	|1.3179	|759,830.90	|
+|scan-del	|7528130e3	|0.6036	|1,667,237.70	|
+|v2	|7528130e3	|0.6128	|1,634,633.40	|
+
+### Short Range Scans
+
+|Name	|SHA	|avg micros/op	|avg ops/sec	|
+|v1	|0ed738fdd	|6.23	|176,562.00	|
+|scan-del	|PR 4677	|2.6844	|377,313.00	|
+|v2	|PR 4677	|2.8226	|361,249.70	|
+
+### Long Range scans
+
+|Name	|SHA	|avg micros/op	|avg ops/sec	|
+|v1	|0ed738fdd	|52.7066	|19,074.00	|
+|scan-del	|PR 4677	|38.0325	|26,648.60	|
+|v2	|PR 4677	|41.2882	|24,714.70	|
+
+## Future Work
+
+Note that memtable range tombstones are fragmented every read; for now this is acceptable,
+ since we expect there to be relatively few range tombstones in memtables (and users can
+ enforce this by keeping track of the number of memtable range deletions and manually flushing
+ after it passes a threshold). In the future, a specialized data structure can be used for storing
+ range tombstones in memory to avoid this work.
+
+Another future optimization is to create a new format version that requires range tombstones to
+ be stored in a fragmented form. This would save time when opening SST files, and when `max_open_files` 
+is not -1 (i.e., files may be opened several times).
+
+## Acknowledgements
+
+Special thanks to Peter Mattis and Nikhil Benesch from Cockroach Labs, who were early users of
+DeleteRange v1 in production, contributed the cleanest/most efficient v1 aggregation implementation, found and fixed bugs, and provided initial DeleteRange v2 design and continued help.
+
+Thanks to Huachao Huang and Jinpeng Zhang from PingCAP for early DeleteRange v1 adoption, bug reports, and fixes.
diff --git a/src/rocksdb/docs/_posts/2019-03-08-format-version-4.markdown b/src/rocksdb/docs/_posts/2019-03-08-format-version-4.markdown
new file mode 100644
index 000000000..ce657696c
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2019-03-08-format-version-4.markdown
@@ -0,0 +1,36 @@
+---
+title: format_version 4
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+The data blocks in RocksDB consist of a sequence of key/values pairs sorted by key, where the pairs are grouped into _restart intervals_ specified by `block_restart_interval`. Up to RocksDB version 5.14, where the latest and default value of `BlockBasedTableOptions::format_version` is 2, the format of index and data blocks are the same: index blocks use the same key format of <`user_key`,`seq`> and encode pointers to data blocks, <`offset`,`size`>, to a byte string and use them as values. The only difference is that the index blocks use `index_block_restart_interval` for the size of _restart intervals_. `format_version=`3,4 offer more optimized, backward-compatible, yet forward-incompatible format for index blocks. 
+
+### Pros
+
+Using `format_version`=4 significantly reduces the index block size, in some cases around 4-5x. This frees more space in block cache, which would result in higher hit rate for data and filter blocks, or offer the same performance with a smaller block cache size.
+
+### Cons
+
+Being _forward-incompatible_ means that if you enable `format_version=`4 you cannot downgrade to a RocksDB version lower than 5.16.
+
+### How to use it?
+
+- `BlockBasedTableOptions::format_version` = 4
+- `BlockBasedTableOptions::index_block_restart_interval` = 16
+
+### What is format_version 3?
+(Since RocksDB 5.15) In most cases, the sequence number `seq` is not necessary for keys in the index blocks. In such cases, `format_version`=3 skips encoding the sequence number and sets `index_key_is_user_key` in TableProperties, which is used by the reader to know how to decode the index block.
+
+### What is format_version 4?
+(Since RocksDB 5.16) Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of `BlockHandle::offset` of the non-head index entries in each restart interval. If used, `TableProperties::index_value_is_delta_encoded` is set, which is used by the reader to know how to decode the index block.  The format of each key is (shared_size, non_shared_size, shared, non_shared). The format of each value, i.e., block handle, is (offset, size) whenever the shared_size is 0, which included the first entry in each restart point. Otherwise the format is delta-size = block handle size - size of last block handle.
+
+The index format in `format_version=4` would be as follows:
+
+    restart_point   0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+    restart_point   1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+    ...
+    restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+    where, k is key, v is value, and its encoding is in parenthesis.
+
diff --git a/src/rocksdb/docs/_posts/2019-08-15-unordered-write.markdown b/src/rocksdb/docs/_posts/2019-08-15-unordered-write.markdown
new file mode 100644
index 000000000..5f0eb2880
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2019-08-15-unordered-write.markdown
@@ -0,0 +1,56 @@
+---
+title: Higher write throughput with `unordered_write` feature
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+Since RocksDB 6.3, The `unordered_write=`true option together with WritePrepared transactions offers 34-42% higher write throughput compared to vanilla RocksDB. If the application can handle more relaxed ordering guarantees, the gain in throughput would increase to 63-131%.
+
+### Background
+
+Currently RocksDB API delivers the following powerful guarantees:
+- Atomic reads: Either all of a write batch is visible to reads or none of it.
+- Read-your-own writes: When a write thread returns to the user, a subsequent read by the same thread will be able to see its own writes.
+- Immutable Snapshots: The reads visible to the snapshot are immutable in the sense that it will not be affected by any in-flight or future writes.
+
+### `unordered_write`
+
+The `unordered_write` feature, when turned on, relaxes the default guarantees of RocksDB. While it still gives read-your-own-write property, neither atomic reads nor the immutable snapshot properties are provided any longer. However, RocksDB users could still get read-your-own-write and immutable snapshots when using this feature in conjunction with TransactionDB configured with WritePrepared transactions and `two_write_queues`. You can read [here](https://github.com/facebook/rocksdb/wiki/unordered_write) to learn about the design of `unordered_write` and [here](https://github.com/facebook/rocksdb/wiki/WritePrepared-Transactions) to learn more about WritePrepared transactions.
+
+### How to use it?
+
+To get the same guarantees as vanilla RocksdB:
+
+    DBOptions db_options;
+    db_options.unordered_write = true;
+    db_options.two_write_queues = true;
+    DB* db;
+    {
+      TransactionDBOptions txn_db_options;
+      txn_db_options.write_policy = TxnDBWritePolicy::WRITE_PREPARED;
+      txn_db_options.skip_concurrency_control = true;
+      TransactionDB* txn_db;
+      TransactionDB::Open(options, txn_db_options, kDBPath, &txn_db);
+      db = txn_db;
+    }
+    db->Write(...);
+
+To get relaxed guarantees:
+
+    DBOptions db_options;
+    db_options.unordered_write = true;
+    DB* db;
+    DB::Open(db_options, kDBPath, &db);
+    db->Write(...);
+
+# Benchmarks
+
+    TEST_TMPDIR=/dev/shm/ ~/db_bench --benchmarks=fillrandom --threads=32 --num=10000000 -max_write_buffer_number=16 --max_background_jobs=64 --batch_size=8 --writes=3000000 -level0_file_num_compaction_trigger=99999 --level0_slowdown_writes_trigger=99999 --level0_stop_writes_trigger=99999 -enable_pipelined_write=false -disable_auto_compactions --transaction_db=true --unordered_write=1 --disable_wal=0
+
+Throughput with `unordered_write`=true and using WritePrepared transaction:
+- WAL: +42%
+- No-WAL: +34%
+Throughput with `unordered_write`=true
+- WAL: +63%
+- NoWAL: +131%
diff --git a/src/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown b/src/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown
new file mode 100644
index 000000000..fa4e9d463
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown
@@ -0,0 +1,46 @@
+---
+title: (Call For Contribution) Make Universal Compaction More Incremental
+layout: post
+author: sdong
+category: blog
+---
+
+### Motivation
+
+Universal Compaction is an important compaction style, but few changes were made after we made the structure multi-leveled. Yet the major restriction of always compacting full sorted run is not relaxed. Compared to Leveled Compaction, where we usually only compile several SST files together, in universal compaction, we frequently compact GBs of data. Two issues with this gap: 1. it makes it harder to unify universal and leveled compaction; 2. periodically data is fully compacted, and in the mean time space is doubled. To ease the problem, we can break the restriction and do similar as leveled compaction, and bring it closer to unified compaction.
+
+We call for help for making following improvements.
+
+
+### How Universal Compaction Works
+
+In universal, whole levels are compacted together to satisfy two conditions (See [wiki page](https://github.com/facebook/rocksdb/wiki/Universal-Compaction) for more details):
+
+1. total size / bottommost level size > a threshold, or
+2. total number of sorted runs (non-0 levels + L0 files) is within a threshold
+
+1 is to limit extra space overhead used for dead data and 2 is for read performance.
+
+If 1 is triggered, likely a full compaction will be triggered. If 2 is triggered, RocksDB compact some sorted runs to bring the number down. It does it by using a simple heuristic so that less writes needed for that purpose over time: it starts from compacting smaller files, but if total size to compact is similar to or larger than size of the next level, it will take that level together, as soon on (whether it is the best heuristic is another question and we’ve never seriously looked at it).
+
+### How We Can Improve?
+
+Let’s start from condition 1. Here we do full compaction but is not necessary.  A simple optimization would be to compact so that just enough files are merged into the bottommost level (Lmax) to satisfy condition 1. It would work if we only need to pick some files from Lmax-1, or if it is cheaper over time, we can pick some files from other levels too.
+
+Then condition 2. If we finish condition 1, there might be holes in some ranges in older levels. These holes might make it possible that only by compacting some sub ranges, we can fix the LSM-tree for condition 2. RocksDB can take single files into consideration and apply more sophisticated heuristic.
+
+This new approach makes universal compaction closer to leveled compaction. The operation for 1 is closer to how Leveled compaction triggeres Lmax-1 to Lmax compaction. And 2 can potentially be implemented as something similar to level picking in Leveled Compaction. In fact, all those file picking can co-existing in one single compaction style and there isn’t fundamental conflicts to that.
+
+### Limitation
+
+There are two limitations:
+
+* Periodic automatic full compaction is unpleasant but at the same time is pleasant in another way. Some users might uses it to reason that everything is periodically collapsed so dead data is gone and old data is rewritten. We need to make sure periodic compaction works to continue with that.
+* L0 to the first non-L0 level compaction is the first time data is partitioned in LSM-tree so that incremental compaction by range is possible. We might need to do more of these compactions in order to make incremental possible, which will increase compaction slightly.
+* Compacting subset of a level would introduce some extra overhead for unaligned files, just as in leveled compaction. More SST boundary cutting heuristic can reduce this overhead but it will be there.
+
+But I believe the benefits would outweight the limitations. Reducing temporary space doubling and moving towards to unified compaction would be important achievements.
+
+### Interested in Help?
+
+Compaction is the core of LSM-tree, but its improvements are far overdue. If you are a user of universal compaction and would be able to benefit from those improvements, we will be happy to work with you on speeding up the project and bring them to RocksDB sooner. Feel free to communicate with us in [this issue](https://github.com/facebook/rocksdb/issues/8181).
diff --git a/src/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown b/src/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown
new file mode 100644
index 000000000..9f3a22fa2
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown
@@ -0,0 +1,101 @@
+---
+title: Integrated BlobDB
+layout: post
+author: ltamasi
+category: blog
+---
+## Background
+
+BlobDB is essentially RocksDB for large-value use cases. The basic idea, which was proposed in the [WiscKey paper](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf), is key-value separation: by storing large values in dedicated blob files and storing only small pointers to them in the LSM tree, we avoid copying the values over and over again during compaction, thus reducing write amplification. Historically, BlobDB supported only FIFO and TTL based use cases that can tolerate some data loss. In addition, it was incompatible with many widely used RocksDB features, and required users to adopt a custom API. In 2020, we decided to rearchitect BlobDB from the ground up, taking the lessons learned from WiscKey and the original BlobDB but also drawing inspiration and incorporating ideas from other similar systems. Our goals were to eliminate the above limitations and to create a new integrated version that enables customers to use the well-known RocksDB API, has feature parity with the core of RocksDB, and offers better performance. This new implementation is now available and provides the following improvements over the original:
+
+* **API.** In contrast with the legacy BlobDB implementation, which had its own `StackableDB`-based interface (`rocksdb::blob_db::BlobDB`), the new version can be used via the well-known `rocksdb::DB` API, and can be configured simply by using a few column family options.
+* **Consistency.** With the integrated BlobDB implementation, RocksDB’s consistency guarantees and various write options (like using the WAL or synchronous writes) now apply to blobs as well. Moreover, the new BlobDB keeps track of blob files in the RocksDB MANIFEST.
+* **Write performance.** When using the old BlobDB, blobs are extracted and immediately written to blob files by the BlobDB layer *in the application thread*. This has multiple drawbacks from a performance perspective: first, it requires synchronization; second, it means that expensive operations like compression are performed in the application thread; and finally, it involves flushing the blob file after each blob. The new code takes a completely different approach by *offloading blob file building to RocksDB’s background jobs*, i.e. flushes and compactions. This means that similarly to SSTs, any given blob file is now written by a single background thread, eliminating the need for locking, flushing, or performing compression in the foreground. Note that this approach is also a better fit for network-based file systems where small writes might be expensive and opens up the possibility of file format optimizations that involve buffering (like dictionary compression).
+* **Read performance.** The old code relies on each read (i.e. `Get`, `MultiGet`, or iterator) taking a snapshot and uses those snapshots when deciding which obsolete blob files can be removed. The new BlobDB improves this by generalizing RocksDB’s Version concept, which historically referred to the set of live SST files at a given point in time, to include the set of live blob files as well. This has performance benefits like [making the read path mostly lock-free by utilizing thread-local storage](https://rocksdb.org/blog/2014/06/27/avoid-expensive-locks-in-get.html). We have also introduced a blob file cache that can be utilized to keep frequently accessed blob files open.
+* **Garbage collection.** Key-value separation means that if a key pointing to a blob gets overwritten or deleted, the blob becomes unreferenced garbage. To be able to reclaim this space, BlobDB now has garbage collection capabilities. GC is integrated into the compaction process and works by relocating valid blobs residing in old blob files as they are encountered during compaction. Blob files can be marked obsolete (and eventually deleted in one shot) once they contain nothing but garbage. This is more efficient than the method used by WiscKey, which involves performing a `Get` operation to find out whether a blob is still referenced followed by a `Put` to update the reference, which in turn results in garbage collection competing and potentially conflicting with the application’s writes.
+* **Feature parity with the RocksDB core.** The new BlobDB supports way more features than the original and is near feature parity with vanilla RocksDB. In particular, we support all basic read/write APIs (with the exception of `Merge`, which is coming soon), recovery, compression, atomic flush, column families, compaction filters, checkpoints, backup/restore, transactions, per-file checksums, and the SST file manager. In addition, the new BlobDB’s options can be dynamically adjusted using the `SetOptions` interface.
+
+## API
+
+The new BlobDB can be configured (on a per-column family basis if needed) simply by using the following options:
+
+* `enable_blob_files`: set it to `true` to enable key-value separation.
+* `min_blob_size`: values at or above this threshold will be written to blob files during flush or compaction.
+* `blob_file_size`: the size limit for blob files.
+* `blob_compression_type`: the compression type to use for blob files. All blobs in the same file are compressed using the same algorithm.
+* `enable_blob_garbage_collection`: set this to `true` to make BlobDB actively relocate valid blobs from the oldest blob files as they are encountered during compaction.
+* `blob_garbage_collection_age_cutoff`: the threshold that the GC logic uses to determine which blob files should be considered “old.” For example, the default value of 0.25 signals to RocksDB that blobs residing in the oldest 25% of blob files should be relocated by GC. This parameter can be tuned to adjust the trade-off between write amplification and space amplification.
+
+The above options are all dynamically adjustable via the `SetOptions` API; changing them will affect subsequent flushes and compactions but not ones that are already in progress.
+
+In terms of compaction styles, we recommend using leveled compaction with BlobDB. The rationale behind universal compaction in general is to provide lower write amplification at the expense of higher read amplification; however, as we will see later in the Performance section, BlobDB can provide very low write amp and good read performance with leveled compaction. Therefore, there is really no reason to take the hit in read performance that comes with universal compaction.
+
+In addition to the above, consider tuning the following non-BlobDB specific options:
+
+* `write_buffer_size`: this is the memtable size. You might want to increase it for large-value workloads to ensure that SST and blob files contain a decent number of keys.
+* `target_file_size_base`: the target size of SST files. Note that even when using BlobDB, it is important to have an LSM tree with a “nice” shape and multiple levels and files per level to prevent heavy compactions. Since BlobDB extracts and writes large values to blob files, it makes sense to make this parameter significantly smaller than the memtable size. One guideline is to set `blob_file_size` to the same value as `write_buffer_size` (adjusted for compression if needed) and make `target_file_size_base` proportionally smaller based on the ratio of key size to value size.
+* `max_bytes_for_level_base`: consider setting this to a multiple (e.g. 8x or 10x) of `target_file_size_base`.
+
+As mentioned above, the new BlobDB now also supports compaction filters. Key-value separation actually enables an optimization here: if the compaction filter of an application can make a decision about a key-value solely based on the key, it is unnecessary to read the value from the blob file. Applications can take advantage of this optimization by implementing the new `FilterBlobByKey` method of the `CompactionFilter` interface. This method gets called by RocksDB first whenever it encounters a key-value where the value is stored in a blob file. If this method returns a “final” decision like `kKeep`, `kRemove`, `kChangeValue`, or `kRemoveAndSkipUntil`, RocksDB will honor that decision; on the other hand, if the method returns `kUndetermined`, RocksDB will read the blob from the blob file and call `FilterV2` with the value in the usual fashion.
+
+## Performance
+
+We tested the performance of the new BlobDB for six different value sizes between 1 KB and 1 MB using a customized version of our [standard benchmark suite](https://github.com/facebook/rocksdb/wiki/Performance-Benchmarks) on a box with an 18-core Skylake DE CPU (running at 1.6 GHz, with hyperthreading enabled), 64 GB RAM, a 512 GB boot SSD, and two 1.88 TB M.2 SSDs in a RAID0 configuration for data. The RocksDB version used was equivalent to 6.18.1, with some benchmarking and statistics related enhancements. Leveled and universal compaction without key-value separation were used as reference points. Note that for simplicity, we use “leveled compaction” and “universal compaction” as shorthand for leveled and universal compaction without key-value separation, respectively, and “BlobDB” for BlobDB with leveled compaction.
+
+Our benchmarks cycled through six different workloads: two write-only ones (initial load and overwrite), two read/write ones (point lookup/write mix and range scan/write mix), and finally two read-only ones (point lookups and range scans). The first two phases performed a fixed amount of work (see below), while the final four were run for a fixed amount of time, namely 30 minutes each. Each phase other than the first one started with the database state left behind by the previous one. Here’s a brief description of the workloads:
+
+* **Initial load**: this workload has two distinct stages, a single-threaded random write stage during which compactions are disabled (so all data is flushed to L0, where it remains for the rest of the stage), followed by a full manual compaction. The random writes are performed with load-optimized settings, namely using the vector memtable implementation and with concurrent memtable writes and WAL disabled. This stage was used to populate the database with 1 TB worth of raw values, e.g. 2^30 (~1 billion) 1 KB values or 2^20 (~1 million) 1 MB values.
+* **Overwrite**: this is a multi-threaded random write workload using the usual skiplist memtable, with compactions, WAL, and concurrent memtable writes enabled. In our tests, 16 writer threads were used. The total number of writes was set to the same number as in the initial load stage and split up evenly between the writer threads. For instance, for the 1 MB value size, we had 2^20 writes divided up between the 16 threads, resulting in each thread performing 2^16 write operations. At the end of this phase, a “wait for compactions” step was added to prevent this workload from exhibiting artificially low write amp or conversely, the next phase showing inflated write amp.
+* **Point lookup/write mix**: a single writer thread performing random writes while N (in our case, 16) threads perform random point lookups. WAL is enabled and all writes are synced.
+* **Range scan/write mix**: similar to the above, with one writer thread and N reader threads (where N was again set to 16 in our tests). The reader threads perform random range scans, with 10 `Next` calls per `Seek`. Again, WAL is enabled, and sync writes are used.
+* **Point lookups (read-only)**: N=16 threads perform random point lookups.
+* **Range scans (read-only)**: N=16 threads execute random range scans, with 10 `Next`s per `Seek` like above.
+
+With that out of the way, let’s see how the new BlobDB performs against traditional leveled and universal compaction. In the next few sections, we’ll be looking at write amplification as well as read and write performance. We’ll also briefly compare the write performance of the new BlobDB with the legacy implementation.
+
+### Write amplification
+
+Reducing write amp is the original motivation for key-value separation. Here, we follow RocksDB’s definition of write amplification (as used in compaction statistics and the info log). That is, we define write amp as the total amount of data written by flushes and compactions divided by the amount of data written by flushes, where “data written” includes SST files and blob files as well (if applicable). The following charts show that BlobDB significantly reduces write amplification for all of our (non-read only) workloads.
+
+For the initial load, where due to the nature of the workload both leveled and universal already have a low write amp factor of 1.6, BlobDB has a write amp close to the theoretical minimum of 1.0, namely in the 1.0..1.02 range, depending on value size. How is this possible? Well, the trick is that when key-value separation is used, the full compaction step only has to sort the keys but not the values. This results in a write amp that is about **36% lower** than the already low write amp you get with either leveled or universal.
+
+In the case of the overwrite workload, BlobDB had a write amp between 1.4 and 1.7 depending on value size. This is around **75-78% lower** than the write amp of leveled compaction (6.1 to 6.8) and **70-77% lower** than universal (5.7 to 6.2); for this workload, there wasn’t a huge difference between the performance of leveled and universal.
+
+When it comes to the point lookup/write mix workload, BlobDB had a write amp between 1.4 and 1.8. This is **83-88% lower** than the write amp of leveled compaction, which had values between 10.8 and 12.5. Universal fared much better than leveled under this workload, and had write amp in the 2.2..6.6 range; however, BlobDB still provided significant gains for all value sizes we tested: namely, write amp was **18-77% lower** than that of universal, depending on value size.
+
+As for the range scan/write mix workload, BlobDB again had a write amp between 1.4 and 1.8, while leveled had values between 13.6 and 14.9, and universal was between 2.8 and 5.0. In other words, BlobDB’s write amp was **88-90% lower** than that of leveled, and **46-70% lower** than that of universal.
+
+![Write amplification](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### Write performance
+
+In terms of write performance, there are other factors to consider besides write amplification. The following charts show some interesting metrics for the two write-only workloads (initial load and overwrite). As discussed earlier, these two workloads perform a fixed amount of work; the two charts in the top row show how long it took BlobDB, leveled, and universal to complete that work. Note that each bar is broken down into two, corresponding to the two stages of each workload (random write and full compaction for initial load, and random write and waiting for compactions for overwrite).
+
+For initial load, note that the random write stage takes the same amount of time regardless of which algorithm is used. This is not surprising considering the fact that compactions are disabled during this stage and thus RocksDB is simply writing L0 files (and in BlobDB’s case, blob files) as fast as it can. The second stage, on the other hand, is very different: as mentioned above, BlobDB essentially only needs to read, sort, and rewrite the keys during compaction, which can be done much much faster (with 1 MB values, more than a hundred times faster) than doing the same for large key-values. Due to this, initial load completed **2.3x to 4.7x faster** overall when using BlobDB.
+
+As for the overwrite workload, BlobDB performs much better during both stages. The two charts in the bottom row help explain why. In the case of both leveled and universal compaction, compactions can’t keep up with the write rate, which eventually leads to back pressure in the form of write stalls. As shown in the chart below, both leveled and universal stall between ~40% and ~70% of the time; on the other hand, BlobDB is stall-free except for the largest value size tested (1 MB). This naturally leads to higher throughput, namely **2.1x to 3.5x higher** throughput compared to leveled, and **1.6x to 3.0x higher** throughput compared to universal. The overwrite time chart also shows that the catch-up stage that waits for all compactions to finish is much shorter (and in fact, at larger value sizes, negligible) with BlobDB.
+
+![Write performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### Read/write and read-only performance
+
+The charts below show the read performance (in terms of operations per second) of BlobDB versus leveled and universal compaction under the two read/write workloads and the two read-only workloads. BlobDB meets or exceeds the read performance of leveled compaction, except for workloads involving range scans at the two smallest value sizes tested (1 KB and 4 KB). It also provides better (in some cases, much better) read performance than universal across the board. In particular, BlobDB provides up **1.4x higher** read performance than leveled (for larger values), and up to **5.6x higher** than universal.
+
+![Read-write and read-only performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### Comparing the two BlobDB implementations
+
+To compare the write performance of the new BlobDB with the legacy implementation, we ran two versions of the first (single-threaded random write) stage of the initial load benchmark using 1 KB values: one with WAL disabled, and one with WAL enabled. The new implementation completed the load **4.6x faster** than the old one without WAL, and **2.3x faster** with WAL.
+
+![Comparing the two BlobDB implementations](/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+## Future work
+
+There are a few remaining features that are not yet supported by the new BlobDB. The most important one is `Merge` (and the related `GetMergeOperands` API); in addition, we don’t currently support the `EventListener` interface, the `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` APIs, secondary instances, and ingestion of blob files. We will continue to work on closing this gap.
+
+We also have further plans when it comes to performance. These include optimizing garbage collection, introducing a dedicated cache for blobs, improving iterator and `MultiGet` performance, and evolving the blob file format amongst others.
+
diff --git a/src/rocksdb/docs/_posts/2021-05-26-online-validation.markdown b/src/rocksdb/docs/_posts/2021-05-26-online-validation.markdown
new file mode 100644
index 000000000..33e9dfc15
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2021-05-26-online-validation.markdown
@@ -0,0 +1,17 @@
+---
+title: Online Validation
+layout: post
+author: sdong
+category: blog
+---
+To prevent or mitigate data corrution in RocksDB when some software or hardware issues happens, we keep adding online consistency checks and improving existing ones.
+
+We improved ColumnFamilyOptions::force_consistency_checks and enabled it by default. The option does some basic consistency checks to LSM-tree, e.g., files in one level are not overlapping. The DB will be frozen from new writes if a violation is detected. Previously, the feature’s check was too limited and didn’t always freeze the DB in a timely manner. Last year, we made the checking stricter so that it can [catch much more corrupted LSM-tree structures](https://github.com/facebook/rocksdb/pull/6901). We also fixed several issues where the checking failure was swallowed without freezing the DB. After making force_consistency_checks more reliable, we changed the default value to be on.
+
+ColumnFamilyOptions::paranoid_file_checks does some more expensive extra checking when generating a new SST file. Last year, we advanced coverage to this feature: after every SST file is generated, the SST file is created, read back keys one by one and check two things: (1) the keys are in comparator order (also available and enabled by default during file write via ColumnFamilyOptions::check_flush_compaction_key_order); (2) the hash of all the KVs is the same as calculated when we add KVs into it. These checks detect certain corruptions so we can prevent the corrupt files from being applied to the DB. We suggest users turn it on at least in shadow environments, and consider to run it in production too if you can afford the overheads.
+
+A recent feature is added to check the count of entries added into memtable while flushing it into an SST file. This feature is to have some online coverage to memtable corruption, caused by either software bug or hardware issue. This feature will be released in the coming release (6.21) and by default on. In the future, we will check more counters during memtables, e.g. number of puts or number of deletes.
+
+We also improved the reporting of online validation errors to improve debuggability. For example, failure to parse a corrupt key now reports details about the corrupt key. Since we did not want to expose key data in logs, error messages, etc., by default, this reporting is opt-in via DBOptions::allow_data_in_errors.
+
+More online checking features are planned and some are more sophisticated, including key/value checksums and sample based query validation.
diff --git a/src/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown b/src/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown
new file mode 100644
index 000000000..3ad1141bf
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown
@@ -0,0 +1,195 @@
+---
+title: RocksDB Secondary Cache
+layout: post
+author: anand1976
+category: blog
+---
+## Introduction
+
+The RocksDB team is implementing support for a block cache on non-volatile media, such as a local flash device or NVM/SCM. It can be viewed as an extension of RocksDB’s current volatile block cache (LRUCache or ClockCache). The non-volatile block cache acts as a second tier cache that contains blocks evicted from the volatile cache. Those blocks are then promoted to the volatile cache as they become hotter due to access.
+
+This feature is meant for cases where the DB is located on remote storage or cloud storage. The non-volatile cache is officially referred to in RocksDB as the SecondaryCache. By maintaining a SecondaryCache that’s an order of magnitude larger than DRAM, fewer reads would be required from remote storage, thus reducing read latency as well as network bandwidth consumption. 
+
+From the user point of view, the local flash cache will support the following requirements -
+
+1. Provide a pointer to a secondary cache when opening a DB
+2. Be able to share the secondary cache across DBs in the same process
+3. Have multiple secondary caches on a host
+4. Support persisting the cache across process restarts and reboots by ensuring repeatability of the cache key
+
+![Architecture](/static/images/rocksdb-secondary-cache/arch_diagram.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+## Design
+
+When designing the API for a SecondaryCache, we had a choice between making it visible to the RocksDB code (table reader) or hiding it behind the RocksDB block cache. There are several advantages of hiding it behind the block cache -
+
+* Allows flexibility in insertion of blocks into the secondary cache. A block can be inserted on eviction from the RAM tier, or it could be eagerly inserted.
+* It makes the rest of the RocksDB code less complex by providing a uniform interface regardless of whether a secondary cache is configured or not
+* Makes parallel reads, peeking in the cache for prefetching, failure handling etc. easier
+* Makes it easier to extend to compressed data if needed, and allows other persistent media, such as PM, to be added as an additional tier
+
+
+We decided to make the secondary cache transparent to the rest of RocksDB code by hiding it behind the block cache. A key issue that we needed to address was the allocation and ownership of memory of the cached items - insertion into the secondary cache may require that memory be allocated by the same. This means that parts of the cached object that can be transferred to the secondary cache needs to be copied out (referred to as **unpacking**), and on a lookup the data stored in the secondary cache needs to be provided to the object constructor (referred to as **packing**). For RocksDB cached objects such as data blocks, index and filter blocks, and compression dictionaries, unpacking involves copying out the raw uncompressed BlockContents of the block, and packing involves constructing the corresponding block/index/filter/dictionary object using the raw uncompressed data.
+
+Another alternative we considered was the existing PersistentCache interface. However, we decided to not pursue it and eventually deprecate it for the following reasons -
+* It is exposed directly to the table reader code, which makes it more difficult to implement different policies such as inclusive/exclusive cache, as well as extending it to more sophisticated admission control policies
+* The interface does not allow for custom memory allocation and object packing/unpacking, so new APIs would have to be defined anyway
+* The current PersistentCache implementation is very simple and does not have any admission control policies
+
+## API
+
+The interface between RocksDB’s block cache and the secondary cache is designed to allow pluggable implementations. For FB internal usage, we plan to use Cachelib  with a wrapper to provide the plug-in implementation and use folly and other fbcode libraries, which cannot be used directly by RocksDB, to efficiently implement the cache operations. The following diagrams show the flow of insertion  and lookup of a block.
+
+![Insert flow](/static/images/rocksdb-secondary-cache/insert_flow.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+![Lookup flow](/static/images/rocksdb-secondary-cache/lookup_flow.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+An item in the secondary cache is referenced by a SecondaryCacheHandle. The handle may not be immediately ready or have a valid  value. The caller can call IsReady() to determine if its ready, and can call Wait() in order to block until it becomes ready. The caller must call Value() after it becomes ready to determine if the item was successfully read. Value() must return nullptr on failure.
+
+```
+class SecondaryCacheHandle {
+ public:
+  virtual ~SecondaryCacheHandle() {}
+
+  // Returns whether the handle is ready or not
+  virtual bool IsReady() = 0;
+
+  // Block until handle becomes ready
+  virtual void Wait() = 0;
+
+  // Return the value. If nullptr, it means the lookup was unsuccessful
+  virtual void* Value() = 0;
+
+  // Return the size of value
+  virtual size_t Size() = 0;
+};
+```
+
+The user of the secondary cache (for example, BlockBasedTableReader indirectly through LRUCache) must implement the callbacks defined in CacheItemHelper, in order to facilitate the unpacking/packing of objects for saving to and restoring from the secondary cache. The CreateCallback must be implemented to construct a cacheable object from the raw data in secondary cache.
+
+```
+  // The SizeCallback takes a void* pointer to the object and returns the size
+  // of the persistable data. It can be used by the secondary cache to allocate
+  // memory if needed.
+  using SizeCallback = size_t (*)(void* obj);
+
+  // The SaveToCallback takes a void* object pointer and saves the persistable
+  // data into a buffer. The secondary cache may decide to not store it in a
+  // contiguous buffer, in which case this callback will be called multiple
+  // times with increasing offset
+  using SaveToCallback = Status (*)(void* from_obj, size_t from_offset,
+                                    size_t length, void* out);
+
+  // A function pointer type for custom destruction of an entry's
+  // value. The Cache is responsible for copying and reclaiming space
+  // for the key, but values are managed by the caller.
+  using DeleterFn = void (*)(const Slice& key, void* value);
+
+  // A struct with pointers to helper functions for spilling items from the
+  // cache into the secondary cache. May be extended in the future. An
+  // instance of this struct is expected to outlive the cache.
+  struct CacheItemHelper {
+    SizeCallback size_cb;
+    SaveToCallback saveto_cb;
+    DeleterFn del_cb;
+
+    CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {}
+    CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb,
+                    DeleterFn _del_cb)
+        : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {}
+  };
+
+  // The CreateCallback is passed by the block cache user to Lookup(). It
+  // takes in a buffer from the NVM cache and constructs an object using
+  // it. The callback doesn't have ownership of the buffer and should
+  // copy the contents into its own buffer.
+  // typedef std::function<Status(void* buf, size_t size, void** out_obj,
+  //                             size_t* charge)>
+  //    CreateCallback;
+  using CreateCallback = std::function<Status(void* buf, size_t size,
+                                              void** out_obj, size_t* charge)>;
+```
+
+The secondary cache provider must provide a concrete implementation of the SecondaryCache abstract class.
+
+```
+// SecondaryCache
+//
+// Cache interface for caching blocks on a secondary tier (which can include
+// non-volatile media, or alternate forms of caching such as compressed data)
+class SecondaryCache {
+ public:
+  virtual ~SecondaryCache() {}
+
+  virtual std::string Name() = 0;
+
+  static const std::string Type() { return "SecondaryCache"; }
+
+  // Insert the given value into this cache. The value is not written
+  // directly. Rather, the SaveToCallback provided by helper_cb will be
+  // used to extract the persistable data in value, which will be written
+  // to this tier. The implementation may or may not write it to cache
+  // depending on the admission control policy, even if the return status is
+  // success.
+  virtual Status Insert(const Slice& key, void* value,
+                        const Cache::CacheItemHelper* helper) = 0;
+
+  // Lookup the data for the given key in this cache. The create_cb
+  // will be used to create the object. The handle returned may not be
+  // ready yet, unless wait=true, in which case Lookup() will block until
+  // the handle is ready
+  virtual std::unique_ptr<SecondaryCacheHandle> Lookup(
+      const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0;
+
+  // At the discretion of the implementation, erase the data associated
+  // with key
+  virtual void Erase(const Slice& key) = 0;
+
+  // Wait for a collection of handles to become ready. This would be used
+  // by MultiGet, for example, to read multitple data blocks in parallel
+  virtual void WaitAll(std::vector<SecondaryCacheHandle*> handles) = 0;
+
+  virtual std::string GetPrintableOptions() const = 0;
+};
+```
+
+A SecondaryCache is configured by the user by providing a pointer to it in LRUCacheOptions -
+```
+struct LRUCacheOptions {
+  ...
+  // A SecondaryCache instance to use as an additional cache tier
+  std::shared_ptr<SecondaryCache> secondary_cache;
+  ...
+};
+```
+
+## Current Status
+
+The initial RocksDB support for the secondary cache has been merged into the main branch, and will be available in the 6.21 release. This includes providing a way for the user to configure a secondary cache when instantiating RocksDB’s LRU cache (volatile block cache), spilling blocks evicted from the LRU cache to the flash cache, promoting a block read from the SecondaryCache to the LRU cache, update tools such as cache_bench and db_bench to specify a flash cache. The relevant PRs are [#8271](https://github.com/facebook/rocksdb/pull/8271), [#8191](https://github.com/facebook/rocksdb/pull/8191), and [#8312](https://github.com/facebook/rocksdb/pull/8312).
+
+We prototyped an end-to-end solution, with the above PRs as well as a Cachelib based implementation of the SecondaryCache. We ran a mixgraph benchmark to simulate a realistic read/write workload. The results showed a 15% gain with the local flash cache over no local cache, and a ~25-30% reduction in network reads with a corresponding decrease in cache misses.
+
+![Throughput](/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+![Hit Rate](/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+## Future Work
+
+In the short term, we plan to do the following in order to fully integrate the SecondaryCache with RocksDB -
+
+1. Use DB session ID as the cache key prefix to ensure uniqueness and repeatability
+2. Optimize flash cache usage of MultiGet and iterator workloads
+3. Stress testing
+4. More benchmarking
+
+Longer term, we plan to deploy this in production at Facebook.
+
+## Call to Action
+
+We are hoping for a community contribution of a secondary cache implementation, which would make this feature usable by the broader RocksDB userbase. If you are interested in contributing, please reach out to us in [this issue](https://github.com/facebook/rocksdb/issues/8347).
+
diff --git a/src/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown b/src/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown
new file mode 100644
index 000000000..9b0f45293
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown
@@ -0,0 +1,157 @@
+---
+title: Preset Dictionary Compression
+layout: post
+author: ajkr
+category: blog
+---
+
+## Summary
+
+Compression algorithms relying on an adaptive dictionary, such as LZ4, zstd, and zlib, struggle to achieve good compression ratios on small inputs when using the basic compress API.
+With the basic compress API, the compressor starts with an empty dictionary.
+With small inputs, not much content gets added to the dictionary during the compression.
+Combined, these factors suggest the dictionary will never have enough contents to achieve great compression ratios.
+
+RocksDB groups key-value pairs into data blocks before storing them in files.
+For use cases that are heavy on random accesses, smaller data block size is sometimes desirable for reducing I/O and CPU spent reading blocks.
+However, as explained above, smaller data block size comes with the downside of worse compression ratio when using the basic compress API.
+
+Fortunately, zstd and other libraries offer advanced compress APIs that preset the dictionary.
+A preset dictionary makes it possible for the compressor to start from a useful state instead of from an empty one, making compression immediately effective.
+
+RocksDB now optionally takes advantage of these dictionary presetting APIs.
+The challenges in integrating this feature into the storage engine were more substantial than apparent on the surface.
+First, we need to target a preset dictionary to the relevant data.
+Second, preset dictionaries need to be trained from data samples, which need to be gathered.
+Third, preset dictionaries need to be persisted since they are needed at decompression time.
+Fourth, overhead in accessing the preset dictionary must be minimized to prevent regression in critical code paths.
+Fifth, we need easy-to-use measurement to evaluate candidate use cases and production impact.
+
+In production, we have deployed dictionary presetting to save space in multiple RocksDB use cases with data block size 8KB or smaller.
+We have measured meaningful benefit to compression ratio in use cases with data block size up to 16KB.
+We have also measured a use case that can save both CPU and space by reducing data block size and turning on dictionary presetting at the same time.
+
+## Feature design
+#### Targeting
+
+Over time we have considered a few possibilities for the scope of a dictionary.
+
+- Subcompaction
+- SST file
+- Column family
+
+The original choice was subcompaction scope.
+This enabled an approach with minimal buffering overhead because we could collect samples while generating the first output SST file.
+The dictionary could then be trained and applied to subsequent SST files in the same subcompaction.
+
+However, we found a large use case where the proximity of data in the keyspace was more correlated with its similarity than we had predicted.
+In particular, the approach of training a dictionary on an adjacent file yielded substantially worse ratios than training the dictionary on the same file it would be used to compress.
+In response to this finding, we changed the preset dictionary scope to per SST file.
+
+With this change in approach, we had to face the problem we had hoped to avoid: how can we compress all of an SST file's data blocks with the same preset dictionary while that dictionary can only be trained after many data blocks have been sampled?
+The solutions we considered both involved a new overhead.
+We could read the input more than once and introduce I/O overhead, or we could buffer the uncompressed output file data blocks until a dictionary is trained, introducing memory overhead.
+We chose to take the hit on memory overhead.
+
+Another approach that we considered was associating multiple dictionaries with a column family.
+For example, in MyRocks there could be a dictionary trained on data from each large table.
+When compressing a data block, we would look at the table to which its data belongs and pick the corresponding dictionary.
+However, this approach would introduce many challenges.
+RocksDB would need to be aware of the key schema to know where are the table boundaries.
+RocksDB would also need to periodically update the dictionaries to account for changes in data pattern.
+It would need somewhere to store dictionaries at column family scope.
+Overall, we thought these challenges were too difficult to pursue the approach.
+
+#### Training
+
+![](/static/images/dictcmp/dictcmp_raw_sampled.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+<p align="center"><i>
+Raw samples mode (`zstd_max_train_bytes == 0`)
+</i></p>
+
+As mentioned earlier, the approach we took is to build the dictionary from buffered uncompressed data blocks.
+The first row of data blocks in these diagrams illustrate this buffering.
+The second row illustrates training samples selected from the buffered blocks.
+In raw samples mode (above), the final dictionary is simply the concatenation of these samples.
+Whereas, in zstd training mode (below), these samples will be passed to the trainer to produce the final dictionary.
+
+![](/static/images/dictcmp/dictcmp_zstd_trained.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+<p align="center"><i>
+zstd training mode (`zstd_max_train_bytes > 0`)
+</i></p>
+
+#### Compression path
+
+Once the preset dictionary is generated by the above process, we apply it to the buffered data blocks and write them to the output file.
+Thereafter, newly generated data blocks are immediately compressed and written out.
+
+One optimization here is available to zstd v0.7.0+ users.
+Instead of deserializing the dictionary on each compress invocation, we can do that work once and reuse it.
+A `ZSTD_CDict` holds this digested dictionary state and is passed to the compress API.
+
+#### Persistence
+
+When an SST file's data blocks are compressed using a preset dictionary, that dictionary is stored inside the file for later use in decompression.
+
+![](/static/images/dictcmp/dictcmp_sst_blocks.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+<p align="center"><i>
+SST file layout with the preset dictionary in its own (uncompressed) block
+</i></p>
+
+#### Decompression path
+
+To decompress, we need to provide both the data block and the dictionary used to compress it.
+Since dictionaries are just blocks in a file, we access them through block cache.
+However this additional load on block cache can be problematic.
+It can be alleviated by pinning the dictionaries to avoid going through the LRU locks.
+
+An optimization analogous to the digested dictionary exists for certain zstd users (see User API section for details).
+When enabled, the block cache stores the digested dictionary state for decompression (`ZSTD_DDict`) instead of the block contents.
+In some cases we have seen decompression CPU decrease overall when enabling dictionary thanks to this optimization.
+
+#### Measurement
+
+Typically our first step in evaluating a candidate use case is an offline analysis of the data.
+This gives us a quick idea whether presetting dictionary will be beneficial without any code, config, or data changes.
+Our `sst_dump` tool reports what size SST files would have been using specified compression libraries and options.
+We can select random SST files and compare the size with vs. without dictionary.
+
+When that goes well, the next step is to see how it works in a live DB, like a production shadow or canary.
+There we can observe how it affects application/system metrics.
+
+Even after dictionary is enabled, there is the question of how much space was finally saved.
+We provide a way to A/B test size with vs. without dictionary while running in production.
+This feature picks a sample of data blocks to compress in multiple ways -- one of the outputs is stored, while the other outputs are thrown away after counting their size.
+Due to API limitations, the stored output always has to be the dictionary-compressed one, so this feature can only be used after enabling dictionary.
+The size with and without dictionary are stored in the SST file as table properties.
+These properties can be aggregated across all SST files in a DB (and across all DBs in a tier) to learn the final space saving.
+
+## User API
+
+RocksDB allows presetting compression dictionary for users of LZ4, zstd, and zlib.
+The most advanced capabilities are available to zstd v1.1.4+ users who statically link (see below).
+Newer versions of zstd (v1.3.6+) have internal changes to the dictionary trainer and digested dictionary management, which significantly improve memory and CPU efficiency.
+
+Run-time settings:
+
+- `CompressionOptions::max_dict_bytes`: Limit on per-SST file dictionary size. Increasing this causes dictionaries to consume more space and memory for the possibility of better data block compression. A typical value we use is 16KB.
+- (**zstd only**) `CompressionOptions::zstd_max_train_bytes`: Limit on training data passed to zstd dictionary trainer. Larger values cause the training to consume more CPU (and take longer) while generating more effective dictionaries. The starting point guidance we received from zstd team is to set it to 100x `CompressionOptions::max_dict_bytes`.
+- `CompressionOptions::max_dict_buffer_bytes`: Limit on data buffering from which training samples are gathered. By default we buffer up to the target file size per ongoing background job. If this amount of memory is concerning, this option can constrain the buffering with the downside that training samples will cover a smaller portion of the SST file. Work is ongoing to charge this memory usage to block cache so it will not need to be accounted for separately.
+- `BlockBasedTableOptions::cache_index_and_filter_blocks`: Controls whether metadata blocks including dictionary are accessed through block cache or held in table reader memory (yes, its name is outdated).
+- `BlockBasedTableOptions::metadata_cache_options`: Controls what metadata blocks are pinned in block cache. Pinning avoids LRU contention at the risk of cold blocks holding memory.
+- `ColumnFamilyOptions::sample_for_compression`: Controls frequency of measuring extra compressions on data blocks using various libraries with default settings (i.e., without preset dictionary).
+
+Compile-time setting:
+
+- (**zstd only**) `EXTRA_CXXFLAGS=-DZSTD_STATIC_LINKING_ONLY`: Hold digested dictionaries in block cache to save repetitive deserialization overhead. This saves a lot of CPU for read-heavy workloads. This compiler flag is necessary because one of the digested dictionary APIs we use is marked as experimental. We still use it in production, however.
+
+Function:
+
+- `DB::GetPropertiesOfAllTables()`: The properties `kSlowCompressionEstimatedDataSize` and `kFastCompressionEstimatedDataSize` estimate what the data block size (`kDataSize`) would have been if the corresponding compression library had been used. These properties are only present when `ColumnFamilyOptions::sample_for_compression` causes one or more samples to be measured, and they become more accurate with higher sampling frequency.
+
+Tool:
+
+- `sst_dump --command=recompress`: Offline analysis tool that reports what the SST file size would have been using the specified compression library and options.
diff --git a/src/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown b/src/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown
new file mode 100644
index 000000000..c6a52ce84
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown
@@ -0,0 +1,281 @@
+---
+title: Ribbon Filter
+layout: post
+author: pdillinger
+category: blog
+---
+
+## Summary
+Since version 6.15 last year, RocksDB supports Ribbon filters, a new
+alternative to Bloom filters that save space, especially memory, at
+the cost of more CPU usage, mostly in constructing the filters in the
+background. Most applications with long-lived data (many hours or
+longer) will likely benefit from adopting a Ribbon+Bloom hybrid filter
+policy. Here we explain why and how.
+
+[Ribbon filter on RocksDB wiki](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#ribbon-filter)
+
+[Ribbon filter paper](https://arxiv.org/abs/2103.02515)
+
+## Problem & background
+Bloom filters play a critical role in optimizing point queries and
+some range queries in LSM-tree storage systems like RocksDB. Very
+large DBs can use 10% or more of their RAM memory for (Bloom) filters,
+so that (average case) read performance can be very good despite high
+(worst case) read amplification, [which is useful for lowering write
+and/or space
+amplification](http://smalldatum.blogspot.com/2015/11/read-write-space-amplification-pick-2_23.html).
+Although the `format_version=5` Bloom filter in RocksDB is extremely
+fast, all Bloom filters use around 50% more space than is
+theoretically possible for a hashed structure configured for the same
+false positive (FP) rate and number of keys added. What would it take
+to save that significant share of “wasted” filter memory, and when
+does it make sense to use such a Bloom alternative?
+
+A number of alternatives to Bloom filters were known, especially for
+static filters (not modified after construction), but all the
+previously known structures were unsatisfying for SSTs because of some
+combination of
+* Not enough space savings for CPU increase. For example, [Xor
+  filters](https://arxiv.org/abs/1912.08258) use 3-4x more CPU than
+  Bloom but only save 15-20% of
+  space. [GOV](https://arxiv.org/pdf/1603.04330.pdf) can save around
+  30% space but requires around 10x more CPU than Bloom.
+* Inconsistent space savings. [Cuckoo
+  filters](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf)
+  and Xor+ filters offer significant space savings for very low FP
+  rates (high bits per key) but little or no savings for higher FP
+  rates (low bits per key). ([Higher FP rates are considered best for
+  largest levels of
+  LSM.](https://stratos.seas.harvard.edu/files/stratos/files/monkeykeyvaluestore.pdf))
+  [Spatially-coupled Xor
+  filters](https://arxiv.org/pdf/2001.10500.pdf) require very large
+  number of keys per filter for large space savings.
+* Inflexible configuration. No published alternatives offered the same
+  continuous configurability of Bloom filters, where any FP rate and
+  any fractional bits per key could be chosen. This flexibility
+  improves memory efficiency with the `optimize_filters_for_memory`
+  option that minimizes internal fragmentation on filters.
+
+## Ribbon filter development and implementation
+The Ribbon filter came about when I developed a faster, simpler, and
+more adaptable algorithm for constructing a little-known [Xor-based
+structure from Dietzfelbinger and
+Walzer](https://arxiv.org/pdf/1907.04750.pdf). It has very good space
+usage for required CPU time (~30% space savings for 3-4x CPU) and,
+with some engineering, Bloom-like configurability. The complications
+were managable for use in RocksDB:
+* Ribbon space efficiency does not naturally scale to very large
+  number of keys in a single filter (whole SST file or partition), but
+  with the current 128-bit Ribbon implementation in RocksDB, even 100
+  million keys in one filter saves 27% space vs. Bloom rather than 30%
+  for 100,000 keys in a filter.
+* More temporary memory is required during construction, ~230 bits per
+  key for 128-bit Ribbon vs. ~75 bits per key for Bloom filter. A
+  quick calculation shows that if you are saving 3 bits per key on the
+  generated filter, you only need about 50 generated filters in memory
+  to offset this temporary memory usage. (Thousands of filters in
+  memory is typical.) Starting in RocksDB version 6.27, this temporary
+  memory can be accounted for under block cache using
+  `BlockBasedTableOptions::reserve_table_builder_memory`.
+* Ribbon filter queries use relatively more CPU for lower FP rates
+  (but still O(1) relative to number of keys added to filter). This
+  should be OK because lower FP rates are only appropriate when then
+  cost of a false positive is very high (worth extra query time) or
+  memory is not so constrained (can use Bloom instead).
+
+Future: data in [the paper](https://arxiv.org/abs/2103.02515) suggests
+that 32-bit Balanced Ribbon (new name: [Bump-Once
+Ribbon](https://arxiv.org/pdf/2109.01892.pdf)) would improve all of
+these issues and be better all around (except for code complexity).
+
+## Ribbon vs. Bloom in RocksDB configuration
+Different applications and hardware configurations have different
+constraints, but we can use hardware costs to examine and better
+understand the trade-off between Bloom and Ribbon.
+
+### Same FP rate, RAM vs. CPU hardware cost
+Under ideal conditions where we can adjust our hardware to suit the
+application, in terms of dollars, how much does it cost to construct,
+query, and keep in memory a Bloom filter vs. a Ribbon filter?  The
+Ribbon filter costs more for CPU but less for RAM. Importantly, the
+RAM cost directly depends on how long the filter is kept in memory,
+which in RocksDB is essentially the lifetime of the filter.
+(Temporary RAM during construction is so short-lived that it is
+ignored.)  Using some consumer hardware and electricity prices and a
+predicted balance between construction and queries, we can compute a
+“break even” duration in memory. To minimize cost, filters with a
+lifetime shorter than this should be Bloom and filters with a lifetime
+longer than this should be Ribbon. (Python code)
+
+```
+# Commodity prices based roughly on consumer prices and rough guesses
+# Upfront cost of a CPU per hardware thread
+upfront_dollars_per_cpu_thread = 30.0
+
+# CPU average power usage per hardware thread
+watts_per_cpu_thread = 3.5
+
+# Upfront cost of a GB of RAM
+upfront_dollars_per_gb_ram = 8.0
+
+# RAM average power usage per GB
+# https://www.crucial.com/support/articles-faq-memory/how-much-power-does-memory-use
+watts_per_gb_ram = 0.375
+
+# Estimated price of power per kilowatt-hour, including overheads like conversion losses and cooling
+dollars_per_kwh = 0.35
+
+# Assume 3 year hardware lifetime
+hours_per_lifetime = 3 * 365 * 24
+seconds_per_lifetime = hours_per_lifetime * 60 * 60
+
+# Number of filter queries per key added in filter construction is heavily dependent on workload.
+# When replication is in layer above RocksDB, it will be low, likely < 1. When replication is in
+# storage layer below RocksDB, it will likely be > 1. Using a rough and general guesstimate.
+key_query_per_construct = 1.0
+
+#==================================
+# Bloom & Ribbon filter performance
+typical_bloom_bits_per_key = 10.0
+typical_ribbon_bits_per_key = 7.0
+
+# Speeds here are sensitive to many variables, especially query speed because it
+# is so dependent on memory latency. Using this benchmark here:
+# for IMPL in 2 3; do
+#   ./filter_bench -impl=$IMPL -quick -m_keys_total_max=200 -use_full_block_reader
+# done
+# and "Random filter" queries.
+nanoseconds_per_construct_bloom_key = 32.0
+nanoseconds_per_construct_ribbon_key = 140.0
+
+nanoseconds_per_query_bloom_key = 500.0
+nanoseconds_per_query_ribbon_key = 600.0
+
+#==================================
+# Some constants
+kwh_per_watt_lifetime = hours_per_lifetime / 1000.0
+bits_per_gb = 8 * 1024 * 1024 * 1024
+
+#==================================
+# Crunching the numbers
+# on CPU for constructing filters
+dollars_per_cpu_thread_lifetime = upfront_dollars_per_cpu_thread + watts_per_cpu_thread * kwh_per_watt_lifetime * dollars_per_kwh
+dollars_per_cpu_thread_second = dollars_per_cpu_thread_lifetime / seconds_per_lifetime
+
+dollars_per_construct_bloom_key = dollars_per_cpu_thread_second * nanoseconds_per_construct_bloom_key / 10**9
+dollars_per_construct_ribbon_key = dollars_per_cpu_thread_second * nanoseconds_per_construct_ribbon_key / 10**9
+
+dollars_per_query_bloom_key = dollars_per_cpu_thread_second * nanoseconds_per_query_bloom_key / 10**9
+dollars_per_query_ribbon_key = dollars_per_cpu_thread_second * nanoseconds_per_query_ribbon_key / 10**9
+
+dollars_per_bloom_key_cpu = dollars_per_construct_bloom_key + key_query_per_construct * dollars_per_query_bloom_key
+dollars_per_ribbon_key_cpu = dollars_per_construct_ribbon_key + key_query_per_construct * dollars_per_query_ribbon_key
+
+# on holding filters in RAM
+dollars_per_gb_ram_lifetime = upfront_dollars_per_gb_ram + watts_per_gb_ram * kwh_per_watt_lifetime * dollars_per_kwh
+dollars_per_gb_ram_second = dollars_per_gb_ram_lifetime / seconds_per_lifetime
+
+dollars_per_bloom_key_in_ram_second = dollars_per_gb_ram_second / bits_per_gb * typical_bloom_bits_per_key
+dollars_per_ribbon_key_in_ram_second = dollars_per_gb_ram_second / bits_per_gb * typical_ribbon_bits_per_key
+
+#==================================
+# How many seconds does it take for the added cost of constructing a ribbon filter instead
+# of bloom to be offset by the added cost of holding the bloom filter in memory?
+break_even_seconds = (dollars_per_ribbon_key_cpu - dollars_per_bloom_key_cpu) / (dollars_per_bloom_key_in_ram_second - dollars_per_ribbon_key_in_ram_second)
+print(break_even_seconds)
+# -> 3235.1647730256936
+```
+
+So roughly speaking, filters that live in memory for more than an hour
+should be Ribbon, and filters that live less than an hour should be
+Bloom. This is very interesting, but how long do filters live in
+RocksDB?
+
+First let's consider the average case. Write-heavy RocksDB loads are
+often backed by flash storage, which has some specified write
+endurance for its intended lifetime. This can be expressed as *device
+writes per day* (DWPD), and supported DWPD is typically < 10.0 even
+for high end devices (excluding NVRAM). Roughly speaking, the DB would
+need to be writing at a rate of 20+ DWPD for data to have an average
+lifetime of less than one hour. Thus, unless you are prematurely
+burning out your flash or massively under-utilizing available storage,
+using the Ribbon filter has the better cost profile *on average*.
+
+### Predictable lifetime
+But we can do even better than optimizing for the average case. LSM
+levels give us very strong data lifetime hints.  Data in L0 might live
+for minutes or a small number of hours. Data in Lmax might live for
+days or weeks. So even if Ribbon filters weren't the best choice on
+average for a workload, they almost certainly make sense for the
+larger, longer-lived levels of the LSM. As of RocksDB 6.24, you can
+specify a minimum LSM level for Ribbon filters with
+`NewRibbonFilterPolicy`, and earlier levels will use Bloom filters.
+
+### Resident filter memory
+The above analysis assumes that nearly all filters for all live SST
+files are resident in memory. This is true if using
+`cache_index_and_filter_blocks=0` and `max_open_files=-1` (defaults),
+but `cache_index_and_filter_blocks=1` is popular. In that case,
+if you use `optimize_filters_for_hits=1` and non-partitioned filters
+(a popular MyRocks configuration), it is also likely that nearly all
+live filters are in memory. However, if you don't use
+`optimize_filters_for_hits` and use partitioned filters, then
+cold data (by age or by key range) can lead to only a portion of
+filters being resident in memory. In that case, benefit from Ribbon
+filter is not as clear, though because Ribbon filters are smaller,
+they are more efficient to read into memory.
+
+RocksDB version 6.21 and later include a rough feature to determine
+block cache usage for data blocks, filter blocks, index blocks, etc.
+Data like this is periodically dumped to LOG file
+(`stats_dump_period_sec`):
+
+```
+Block cache entry stats(count,size,portion): DataBlock(441761,6.82 GB,75.765%) FilterBlock(3002,1.27 GB,14.1387%) IndexBlock(17777,887.75 MB,9.63267%) Misc(1,0.00 KB,0%)
+Block cache LRUCache@0x7fdd08104290#7004432 capacity: 9.00 GB collections: 2573 last_copies: 10 last_secs: 0.143248 secs_since: 0
+```
+
+This indicates that at this moment in time, the block cache object
+identified by `LRUCache@0x7fdd08104290#7004432` (potentially used
+by multiple DBs) uses roughly 14% of its 9GB, about 1.27 GB, on filter
+blocks. This same data is available through `DB::GetMapProperty` with
+`DB::Properties::kBlockCacheEntryStats`, and (with some effort) can
+be compared to total size of all filters (not necessarily in memory)
+using `rocksdb.filter.size` from
+`DB::Properties::kAggregatedTableProperties`.
+
+### Sanity checking lifetime
+Can we be sure that using filters even makes sense for such long-lived
+data? We can apply [the current 5 minute rule for caching SSD data in
+RAM](http://renata.borovica-gajic.com/data/adms2017_5minuterule.pdf). A
+4KB filter page holds data for roughly 4K keys. If we assume at least
+one negative (useful) filter query in its lifetime per added key, it
+can satisfy the 5 minute rule with a lifetime of up to about two
+weeks. Thus, the lifetime threshold for “no filter” is about 300x
+higher than the lifetime threshold for Ribbon filter.
+
+### What to do with saved memory
+The default way to improve overall RocksDB performance with more
+available memory is to use more space for caching, which improves
+latency, CPU load, read IOs, etc.  With
+`cache_index_and_filter_blocks=1`, savings in filters will
+automatically make room for caching more data blocks in block
+cache. With `cache_index_and_filter_blocks=0`, consider increasing
+block cache size.
+
+Using the space savings to lower filter FP rates is also an option,
+but there is less evidence for this commonly improving existing
+*optimized* configurations.
+
+## Generic recommendation
+If using `NewBloomFilterPolicy(bpk)` for a large persistent DB using
+compression, try using `NewRibbonFilterPolicy(bpk)` instead, which
+will generate Ribbon filters during compaction and Bloom filters
+for flush, both with the same FP rate as the old setting. Once new SST
+files are generated under the new policy, this should free up some
+memory for more caching without much effect on burst or sustained
+write speed. Both kinds of filters can be read under either policy, so
+there's always an option to adjust settings or gracefully roll back to
+using Bloom filter only (keeping in mind that SST files must be
+replaced to see effect of that change).
diff --git a/src/rocksdb/docs/_posts/2022-07-18-per-key-value-checksum.markdown b/src/rocksdb/docs/_posts/2022-07-18-per-key-value-checksum.markdown
new file mode 100644
index 000000000..6b9ad801c
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2022-07-18-per-key-value-checksum.markdown
@@ -0,0 +1,142 @@
+---
+title: "Per Key-Value Checksum"
+layout: post
+author:
+- cbi42
+- ajkr
+category: blog
+---
+
+## Summary
+
+Silent data corruptions can severely impact RocksDB users. As a key-value library, RocksDB resides at the bottom of the user space software stack for many diverse applications. Returning wrong query results can cause unpredictable consequences for our users so must be avoided.
+
+To prevent and detect corruption, RocksDB has several consistency checks [1], especially focusing on the storage layer. For example, SST files contain block checksums that are verified during reads, and each SST file has a full file checksum that can be verified when files are transferred.
+
+Other sources of corruptions, such as those from faulty CPU/memory or heap corruptions, pose risks for which protections are relatively underdeveloped. Meanwhile, recent work [2] suggests one per thousand machines in our fleet will at some point experience a hardware error that is exposed to an application. Additionally, software bugs can increase the risk of heap corruptions at any time.
+
+Hardware/heap corruptions are naturally difficult to detect in the application layer since they can compromise any data or control flow. Some factors we take into account when choosing where to add protection are the volume of data, the importance of the data, the CPU instructions that operate on the data, and the duration it resides in memory. One recently added protection, `detect_filter_construct_corruption`, has proven itself useful in preventing corrupt filters from being persisted. We have seen hardware encounter machine-check exceptions a few hours after we detected a corrupt filter.
+
+The next way we intend to detect hardware and heap corruptions before they cause queries to return wrong results is through developing a new feature: per key-value checksum. This feature will eventually provide optional end-to-end integrity protection for every key-value pair. RocksDB 7.4 offers substantial coverage of the user write and recovery paths with per key-value checksum protection.
+
+## User API
+
+For integrity protection during recovery, no change is required. Recovery is always protected.
+
+For user write protection, RocksDB allows the user to specify per key-value protection through `WriteOptions::protection_bytes_per_key`  or pass in `protection_bytes_per_key` to `WriteBatch`  constructor when creating a `WriteBatch` directly. Currently, only 0 (default, no protection) and 8 bytes per key are supported. This should be fine for write batches as they do not usually contain a huge number of keys. We are working on supporting more settings as 8 bytes per key might cause considerable memory overhead when the protection is extended to memtable entries.
+
+## Feature Design
+
+### Data Structures
+
+#### Protection info
+
+For protecting key-value pairs, we chose to use a hashing algorithm, xxh3 [3], for its good efficiency without relying on special hardware. While algorithms like crc32c can guarantee detection of certain patterns of bit flips, xxh3 offers no such guarantees. This is acceptable for us as we do not expect any particular error pattern [4], and even if we did, xxh3 can achieve a collision probability close enough to zero for us by tuning the number of protection bytes per key-value.
+
+Key-value pairs have multiple representations in RocksDB: in [WriteBatch](https://github.com/facebook/rocksdb/blob/7d0ecab570742c7280628b08ddc03cfd692f484f/db/write_batch.cc#L14-L31), in memtable [entries](https://github.com/facebook/rocksdb/blob/fc51b7f33adcba7ac725ed0e7fe8b8155aaeaee4/db/memtable.cc#L541-L545) and in [data blocks](https://github.com/facebook/rocksdb/blob/fc51b7f33adcba7ac725ed0e7fe8b8155aaeaee4/table/block_based/block_builder.cc#L21-L27). In this post we focus on key-values in write batches and memtable as in-memory data blocks are not yet protected.
+
+Besides user key and value, RocksDB includes internal metadata in the per key-value checksum calculation. Depending on the representation, internal metadata consists of some combination of sequence number, operation type, and column family ID. Note that since timestamp (when enabled) is part of the user key it is protected as well.
+
+The protection info consists of the XOR’d result of the xxh3 hash for all the protected components. This allows us to efficiently transform protection info for different representations. See below for an example converting WriteBatch protection info to memtable protection info.
+
+A risk of using XOR is the possibility of swapping corruptions (e.g., key becomes the value and the value becomes the key). To mitigate this risk, we use an independent seed for hashing each type of component.
+
+The following two figures illustrate how protection info in WriteBatch and memtable are calculated from a key-value’s components.
+
+![](/static/images/kv-checksum/ProtInfo-Writebatch.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+*Protection info for a key-value in a WriteBatch*
+{: style="text-align: center"}
+
+![](/static/images/kv-checksum/ProtInfo-Memtable.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+*Protection info for a key-value in a memtable*
+{: style="text-align: center"}
+
+The next figure illustrates how protection info for a key-value can be transformed to protect that same key-value in a different representation. Note this is done without recalculating the hash for all the key-value’s components.
+
+![](/static/images/kv-checksum/ProtInfo-Writebatch-to-Memtable.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+*Protection info for a key-value in a memtable derived from an existing WriteBatch protection info*
+{: style="text-align: center"}
+
+Above, we see two (small) components are hashed: column family ID and sequence number. When a key-value is inserted from WriteBatch into memtable, it is assigned a sequence number and drops the column family ID since each memtable is associated with one column family. Recall the xxh3 of column family ID was included in the WriteBatch protection info, which is canceled out by the column family ID xxh3 included in the XOR.
+
+#### WAL fragment
+
+WAL (Write-ahead-log) persists write batches that correspond to operations in memtables and enables consistent database recovery after restart. RocksDB writes to WAL in chunks of some [fixed block size](https://github.com/facebook/rocksdb/blob/fc51b7f33adcba7ac725ed0e7fe8b8155aaeaee4/db/log_writer.h#L44) for efficiency. It is possible that some write batch does not fit into the space left in the current block and/or is larger than the fixed block size. Thus, serialized write batches (WAL records) are divided into WAL fragments before being written to WAL. The format of a WAL fragment is in the following diagram (there is another legacy format detailed in code [comments](https://github.com/facebook/rocksdb/blob/fc51b7f33adcba7ac725ed0e7fe8b8155aaeaee4/db/log_writer.h#L47-L59)). Roughly, the `Type` field indicates whether a fragment is at the beginning, middle or end of a record, and is used to group fragments.
+
+![](/static/images/kv-checksum/WAL-fragment.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+Note that each fragment is prefixed by a crc32c checksum that is calculated over `Type`, `Log #` and `Payload`. This ensures that RocksDB can detect corruptions that happened to the WAL in the storage layer.
+
+#### Write batch
+
+As mentioned above, a WAL record is a serialized `WriteBatch` that is split into physical fragments during writes to WAL. During DB recovery, once a WAL record is reconstructed from one or more fragments, it is [copied](https://github.com/facebook/rocksdb/blob/fc51b7f33adcba7ac725ed0e7fe8b8155aaeaee4/db/db_impl/db_impl_open.cc#L1127) into the content of a `WriteBatch`. The write batch will then be used to restore the memtable states.
+
+Besides the recovery path, a write batch is always constructed during user writes. Firstly, RocksDB allows users to construct a write batch directly, and pass it to DB through `DB::Write()` API for execution. Higher-level buffered write APIs like Transaction rely on a write batch to buffer writes prior to executing them. For unbuffered write APIs like `DB::Put()`, RocksDB constructs a write batch internally with the input user key and value.
+
+![](/static/images/kv-checksum/Write-batch.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+The above diagram shows a rough representation of a write batch in memory. `Contents` is the concatenation of serialized user operations in this write batch. Each operation consists of user key, value, op_type and optionally column family ID. With per key-value checksum protection enabled, a vector of ProtectionInfo is stored in the write batch, one for each user operation.
+
+#### Memtable entry
+
+![](/static/images/kv-checksum/Memtable-entry.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+A memtable entry is similar to write batch content, except that it captures only a single user operation and that it does not contain column family ID (since memtable is per column family). User key and value are length-prefixed, and seqno and optype are combined in a fixed 8 bytes representation.
+
+### Processes
+
+In order to protect user writes and recovery, per key-value checksum is covered in the following code paths.
+
+#### WriteBatch write
+
+Per key-value checksum coverage starts with the user buffers that contain user key and/or value. When users call DB Write APIs (e.g., `DB::Put()`), or when users add operations into write batches directly (e.g. `WriteBatch::Put()`), RocksDB constructs `ProtectionInfo` from the user buffer (e.g. [here](https://github.com/facebook/rocksdb/blob/96206531bc0bb56d87012921c5458c8a3047a6b3/db/write_batch.cc#L813)) and [stores](https://github.com/facebook/rocksdb/blob/96206531bc0bb56d87012921c5458c8a3047a6b3/include/rocksdb/write_batch.h#L478) the protection information within the corresponding `WriteBatch` object as diagramed below. Then the user key and/or value are copied into the `WriteBatch`, thus starting per key-value checksum protection from user buffer.
+
+![](/static/images/kv-checksum/Writebatch-write.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+
+#### WAL write
+
+Before a `WriteBatch` leaves RocksDB and be persisted in a WAL file, it is verified against its `ProtectionInfo` to ensure its content is not corrupted. We added `WriteBatch::VerifyChecksum()` for this purpose. Once we verify the content of a `WriteBatch`, it is then divided into potentially multiple WAL fragments and persisted in the underlying file system. From that point on, the integrity protection is handed off to the per fragment crc32c checksum that is persisted in WAL too.
+
+![](/static/images/kv-checksum/WAL-write.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+#### Memtable write
+
+Similar to the WAL write path, `ProtectionInfo` is verified before an entry is inserted into a memtable. The difference here is that an memtable entry has its own buffer, and the content of a `WriteBatch` is copied into the memtable entry. So the `ProtectionInfo` is verified against the memtable entry buffer instead. The current per key-value checksum protection ends at this verification on the buffer containing a memtable entry, and one of the future work is to extend the coverage to key-value pairs in memtables.
+
+![](/static/images/kv-checksum/Memtable-write.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+#### WAL read
+
+This is for the DB recovery path: WAL fragments are read into memory, concatenated together to form WAL records, and then `WriteBatch`es are constructed from WAL records and added to memtables. In RocksDB 7.4, once a `WriteBatch` copies its content from a WAL record, `ProtectionInfo` is constructed from the `WriteBatch` content and per key-value protection starts. However, this copy operation is not protected, neither is the reconstruction of a WAL record from WAL fragments. To provide protection from silent data corruption during these memory copying operations, we added checksum handshake detailed below in RocksDB 7.5.
+
+When a WAL fragment is first read into memory, its crc32c checksum is [verified](https://github.com/facebook/rocksdb/blob/2f13f5f7d09c589d5adebf0cbc42fadf0da0f00e/db/log_reader.cc#L483). The WAL fragment is then appended to the buffer containing a WAL record. RocksDB uses xxh3’s streaming API to calculate the checksum of the WAL record and updates the streaming hash state with the new WAL fragment content whenever it is appended to the WAL record buffer (e.g. [here](https://github.com/facebook/rocksdb/blob/2f13f5f7d09c589d5adebf0cbc42fadf0da0f00e/db/log_reader.cc#L135)). After the WAL record is constructed, it is copied into a `WriteBatch` and `ProtectionInfo` is constructed from the write batch content. Then, the xxh3 checksum of the WAL record is [verified](https://github.com/facebook/rocksdb/blob/2f13f5f7d09c589d5adebf0cbc42fadf0da0f00e/db/write_batch.cc#L3081-L3085) against the write batch content to complete the checksum handshake. If the checksum verification succeeds, then we are more confident that `ProtectionInfo` is calculated based on uncorrupted data, and the protection coverage continues with the newly constructed `ProtectionInfo` along the write code paths mentioned above.
+
+![](/static/images/kv-checksum/WAL-read.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+## Future work
+
+Future coverage expansion will cover memtable KVs, flush, compaction and user reads etc.
+
+## References
+
+[1] http://rocksdb.org/blog/2021/05/26/online-validation.html
+
+[2] H. D. Dixit, L. Boyle, G. Vunnam, S. Pendharkar, M. Beadon, and S. Sankar, ‘Detecting silent data corruptions in the wild’. arXiv, 2022.
+
+[3] https://github.com/Cyan4973/xxHash
+
+[4] https://github.com/Cyan4973/xxHash/issues/229#issuecomment-511956403
diff --git a/src/rocksdb/docs/_posts/2022-10-05-lost-buffered-write-recovery.markdown b/src/rocksdb/docs/_posts/2022-10-05-lost-buffered-write-recovery.markdown
new file mode 100644
index 000000000..fca3ea739
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2022-10-05-lost-buffered-write-recovery.markdown
@@ -0,0 +1,123 @@
+---
+title: "Verifying crash-recovery with lost buffered writes"
+layout: post
+author:
+- ajkr
+category: blog
+---
+
+## Introduction
+
+Writes to a RocksDB instance go through multiple layers before they are fully persisted.
+Those layers may buffer writes, delaying their persistence.
+Depending on the layer, buffered writes may be lost in a process or system crash.
+A process crash loses writes buffered in process memory only.
+A system crash additionally loses writes buffered in OS memory.
+
+The new test coverage introduced in this post verifies there is no hole in the recovered data in either type of crash.
+A hole would exist if any recovered write were newer than any lost write, as illustrated below.
+This guarantee is important for many applications, such as those that use the newest recovered write to determine the starting point for replication.
+
+![](/static/images/lost-buffered-write-recovery/happy-cat.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+*Valid (no hole) recovery: all recovered writes (1 and 2) are older than all lost writes (3 and 4)*
+{: style="text-align: center"}
+
+![](/static/images/lost-buffered-write-recovery/angry-cat.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+*Invalid (hole) recovery: a recovered write (4) is newer than a lost write (3)*
+{: style="text-align: center"}
+
+The new test coverage assumes all writes use the same options related to buffering/persistence.
+For example, we do not cover the case of alternating writes with WAL disabled and WAL enabled (`WriteOptions::disableWAL`).
+It also assumes the crash does not have any unexpected consequences like corrupting persisted data.
+
+Testing for holes in the recovery is challenging because there are many valid recovery outcomes.
+Our solution involves tracing all the writes and then verifying the recovery matches a prefix of the trace.
+This proves there are no holes in the recovery.
+See "Extensions for lost buffered writes" subsection below for more details.
+
+Testing actual system crashes would be operationally difficult.
+Our solution simulates system crash by buffering written but unsynced data in process memory such that it is lost in a process crash.
+See "Simulating system crash" subsection below for more details.
+
+## Scenarios covered
+
+We began testing recovery has no hole in the following new scenarios.
+This coverage is included in our internal CI that periodically runs against the latest commit on the main branch.
+
+1. **Process crash with WAL disabled** (`WriteOptions::disableWAL=1`), which loses writes since the last memtable flush.
+2. **System crash with WAL enabled** (`WriteOptions::disableWAL=0`), which loses writes since the last memtable flush or WAL sync (`WriteOptions::sync=1`, `SyncWAL()`, or `FlushWAL(true /* sync */)`).
+3. **Process crash with manual WAL flush** (`DBOptions::manual_wal_flush=1`), which loses writes since the last memtable flush or manual WAL flush (`FlushWAL()`).
+4. **System crash with manual WAL flush** (`DBOptions::manual_wal_flush=1`), which loses writes since the last memtable flush or synced manual WAL flush (`FlushWAL(true /* sync */)`, or `FlushWAL(false /* sync */)` followed by WAL sync).
+
+## Issues found
+
+* [False detection of corruption after system crash due to race condition with WAL sync and `track_and_verify_wals_in_manifest](https://github.com/facebook/rocksdb/pull/10185)
+* [Undetected hole in recovery after system crash due to race condition in WAL sync](https://github.com/facebook/rocksdb/pull/10560)
+* [Recovery failure after system crash due to missing directory sync for critical metadata file](https://github.com/facebook/rocksdb/pull/10573)
+
+## Solution details
+
+### Basic setup
+
+![](/static/images/lost-buffered-write-recovery/basic-setup.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+Our correctness testing framework consists of a stress test program (`db_stress`) and a wrapper script (`db_crashtest.py`).
+`db_crashtest.py` manages instances of `db_stress`, starting them and injecting crashes.
+`db_stress` operates a DB and test oracle ("Latest values file").
+
+At startup,  `db_stress` verifies the DB using the test oracle, skipping keys that had pending writes when the last crash happened.
+`db_stress` then stresses the DB with random operations, keeping the test oracle up-to-date.
+
+As the name "Latest values file" implies, this test oracle only tracks the latest value for each key.
+As a result, this setup is unable to verify recoveries involving lost buffered writes, where recovering older values is tolerated as long as there is no hole.
+
+### Extensions for lost buffered writes
+
+To accommodate lost buffered writes, we extended the test oracle to include two new files: "`verifiedSeqno`.state" and "`verifiedSeqno`.trace".
+`verifiedSeqno` is the sequence number of the last successful verification.
+"`verifiedSeqno`.state" is the expected values file at that sequence number, and "`verifiedSeqno`.trace" is the trace file of all operations that happened after that sequence number.
+
+![](/static/images/lost-buffered-write-recovery/replay-extension.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+When buffered writes may have been lost by the previous `db_stress` instance, the current `db_stress` instance must reconstruct the latest values file before startup verification.
+M is the recovery sequence number of the current `db_stress` instance and N is the recovery sequence number of the previous `db_stress` instance.
+M is learned from the DB, while N is learned from the filesystem by parsing the "*.{trace,state}" filenames.
+Then, the latest values file ("LATEST.state") can be reconstructed by replaying the first M-N traced operations (in "N.trace") on top of the last instance's starting point ("N.state").
+
+![](/static/images/lost-buffered-write-recovery/trace-extension.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+When buffered writes may be lost by the current `db_stress` instance, we save the current expected values into "M.state" and begin tracing newer operations in "M.trace".
+
+### Simulating system crash
+
+When simulating system crash, we send file writes to a `TestFSWritableFile`, which buffers unsynced writes in process memory.
+That way, the existing `db_stress` process crash mechanism will lose unsynced writes.
+
+![](/static/images/lost-buffered-write-recovery/test-fs-writable-file.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+`TestFSWritableFile` is implemented as follows.
+
+* `Append()` buffers the write in a local `std::string` rather than calling `write()`.
+* `Sync()` transfers the local `std::string`s content to `PosixWritableFile::Append()`, which will then `write()` it to the OS page cache.
+
+## Next steps
+An untested guarantee is that RocksDB recovers all writes that the user explicitly flushed out of the buffers lost in the crash.
+We may recover more writes than these due to internal flushing of buffers, but never less.
+Our test oracle needs to be further extended to track the lower bound on the sequence number that is expected to survive a crash.
+
+We would also like to make our system crash simulation more realistic.
+Currently we only drop unsynced regular file data, but we should drop unsynced directory entries as well.
+
+## Acknowledgements
+
+Hui Xiao added the manual WAL flush coverage and compatibility with `TransactionDB`.
+Zhichao Cao added the system crash simulation.
+Several RocksDB team members contributed to this feature's dependencies.
diff --git a/src/rocksdb/docs/_posts/2022-10-07-asynchronous-io-in-rocksdb.markdown b/src/rocksdb/docs/_posts/2022-10-07-asynchronous-io-in-rocksdb.markdown
new file mode 100644
index 000000000..0586f1c3d
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2022-10-07-asynchronous-io-in-rocksdb.markdown
@@ -0,0 +1,133 @@
+---
+title: Asynchronous IO in RocksDB
+layout: post
+author:
+- akankshamahajan15
+- anand1976
+category: blog
+---
+## Summary
+
+RocksDB provides several APIs to read KV pairs from a database, including Get and MultiGet for point lookups and Iterator for sequential scanning. These APIs may result in RocksDB reading blocks from SST files on disk storage. The types of blocks and the frequency with which they are read from storage is workload dependent. Some workloads may have a small working set and thus may be able to cache most of the data required, while others may have large working sets and have to read from disk more often. In the latter case, the latency would be much higher and throughput would be lower than the former. They would also be dependent on the characteristics of the underlying storage media, making it difficult to migrate from one medium to another, for example, local flash to disaggregated flash.
+
+One way to mitigate the impact of storage latency is to read asynchronously and in parallel as much as possible, in order to hide IO latency. We have implemented this in RocksDB in Iterators and MultiGet. In Iterators, we prefetch data asynchronously in the background for each file being iterated on, unlike the current implementation that does prefetching synchronously, thus blocking the iterator thread. In MultiGet, we determine the set of files that a given batch of keys overlaps, and read the necessary data blocks from those files in parallel using an asynchronous file system API. These optimizations have significantly decreased the overall latency of the RocksDB MultiGet and iteration APIs on slower storage compared to local flash.
+
+The optimizations described here are in the internal implementation of Iterator and MultiGet in RocksDB. The user API is still synchronous, so existing code can easily benefit from it. We might consider async user APIs in the future.
+
+
+## Design
+
+### API
+
+A new flag in `ReadOptions`, `async_io`, controls the usage of async IO. This flag, when set, enables async IO in Iterators and MultiGet. For MultiGet, an additional `ReadOptions` flag, `optimize_multiget_for_io` (defaults to true), controls how aggressively to use async IO. If the flag is not set, files in the same level are read in parallel but not different levels. If the flag is set, the level restriction is removed and as many files as possible are read in parallel, regardless of level. The latter might have a higher CPU cost depending on the workload.
+
+At the FileSystem layer, we use the `FSRandomAccessFile::ReadAsync` API to start an async read, providing a completion callback.
+
+### Scan
+
+A RocksDB scan usually involves the allocation of a new iterator, followed by a Seek call with a target key to position the iterator, followed by multiple Next calls to iterate through the keys sequentially. Both the Seek and Next operations present opportunities to read asynchronously, thereby reducing the scan latency.
+
+A scan usually involves iterating through keys in multiple entities - the active memtable, sealed and unflushed memtables, every L0 file, and every non-empty non-zero level. The first two are completely in memory and thus not impacted by IO latency. The latter two involve reading from SST files. This means that an increase in IO latency has a multiplier effect, since multiple L0 files and levels have to be iterated on.
+
+Some factors, such as block cache and prefix bloom filters, can reduce the number of files to iterate and number of reads from the files. Nevertheless, even a few reads from disk can dominate the overall latency. RocksDB uses async IO in both Seek and Next to mitigate the latency impact, as described below.
+
+
+#### Seek
+
+A RocksDB iterator maintains a collection of child iterators, one for each L0 file and for each non-empty non-zero levels. For a Seek operation every child iterator has to Seek to the target key. This is normally done serially, by doing synchronous reads from SST files when the required data blocks are not in cache. When the async_io option is enabled, RocksDB performs the Seek in 2 phases - 1) Locate the data block required for Seek in each file/level and issue an async read, and 2) in the second phase, reseek with the same key, which will wait for the async read to finish at each level and position the table iterator. Phase 1 reads multiple blocks in parallel, reducing overall Seek latency.
+
+
+#### Next
+
+For the iterator Next operation, RocksDB tries to reduce the latency due to IO by prefetching data from the file. This prefetching occurs when a data block required by Next is not present in the cache. The reads from file and prefetching is managed by the FilePrefetchBuffer, which is an object that’s created per table iterator (BlockBasedTableIterator). The FilePrefetchBuffer reads the required data block, and an additional amount of data that varies depending on the options provided by the user in ReadOptions and BlockBasedTableOptions. The default behavior is to start prefetching on the third read from a file, with an initial prefetch size of 8KB and doubling it on every subsequent read, upto a max of 256KB.
+
+While the prefetching in the previous paragraph helps, it is still synchronous and contributes to the iterator latency. When the async_io option is enabled, RocksDB prefetches in the background, i.e while the iterator is scanning KV pairs. This is accomplished in FilePrefetchBuffer by maintaining two prefetch buffers. The prefetch size is calculated as usual, but its then split across the two buffers. As the iteration proceeds and data in the first buffer is consumed, the buffer is cleared and an async read is scheduled to prefetch additional data. This read continues in the background while the iterator continues to process data in the second buffer. At this point, the roles of the two buffers are reversed. This does not completely hide the IO latency, since the iterator would have to wait for an async read to complete after the data in memory has been consumed. However, it does hide some of it by overlapping CPU and IO, and async prefetch can be happening on multiple levels in parallel, further reducing the latency.
+
+![Scan flow](/static/images/asynchronous-io/scan_async.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### MultiGet
+
+The MultiGet API accepts a batch of keys as input. Its a more efficient way of looking up multiple keys compared to a loop of Gets. One way MultiGet is more efficient is by reading multiple data blocks from an SST file in a batch, for keys in the same file. This greatly reduces the latency of the request, compared to a loop of Gets. The MultiRead FileSystem API is used to read a batch of data blocks.
+
+![MultiGet flow](/static/images/asynchronous-io/mget_async.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+Even with the MultiRead optimization, subset of keys that are in different files still need to be read serially. We can take this one step further and read multiple files in parallel. In order to do this, a few fundamental changes were required in the MultiGet implementation -
+
+1. Coroutines - A MultiGet involves determining the set of keys in a batch that overlap an SST file, and then calling TableReader::MultiGet to do the actual lookup. The TableReader probes the bloom filter, traverses the index block, looks up the block cache for the necessary, reads the missing data blocks from the SST file, and then searches for the keys in the data blocks. There is a significant amount of context that’s accumulated at each stage, and it would be rather complex to interleave data blocks reads by multiple TableReaders. In order to simplify it, we used async IO with C++ coroutines. The TableReader::MultiGet is implemented as a coroutine, and the coroutine is suspended after issuing async reads for missing data blocks. This allows the top-level MultiGet to iterate through the TableReaders for all the keys, before waiting for the reads to finish and resuming the coroutines.
+2. Filtering - The downside of using coroutines is the CPU overhead, which is non-trivial. To minimize the overhead, its desirable to not use coroutines as much as possible. One scenario in which we can completely avoid the call to a TableReader::MultiGet coroutine is if we know that none of the overlapping keys are actually present in the SST file. This can easily determined by probing the bloom filter. In the previous implementation, the bloom filter lookup was embedded in TableReader::MultiGet. However, we could easily implement is as a separate step, before calling TableReader::MultiGet.
+3. Splitting batches - The default strategy of MultiGet is to lookup keys in one level (or L0 file), before moving on to the next. This limits the amount of IO parallelism we can exploit. For example, the keys in a batch may not be clustered together, and may be scattered over multiple files. Even if they are clustered together in the key space, they may not all be in the same level. In order to optimize for these situations, we determine the subset of keys that are likely to be in a given level, and then split the MultiGet batch into 2 - the subset in that level, and the remainder. The batch containing the remainder can then be processed in parallel. The subset of keys likely to be in a level is determined by the filtering step.
+
+Together, these changes enabled two types of latency optimization in MultiGet using async IO - single-level and multi-level. The former reads data blocks in parallel from multiple files in the same LSM level, while the latter reads in parallel from multiple files in multiple levels.
+
+## Results
+
+Command used to generate the database:
+
+`buck-out/opt/gen/rocks/tools/rocks_db_bench —db=/rocks_db_team/prefix_scan —env_uri=ws://ws.flash.ftw3preprod1 -logtostderr=false -benchmarks="fillseqdeterministic" -key_size=32 -value_size=512 -num=5000000 -num_levels=4 -multiread_batched=true -use_direct_reads=false -adaptive_readahead=true -threads=1 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -disable_auto_compactions=true -compaction_style=1 -bloom_bits=10`
+
+Structure of the database:
+
+`Level[0]: /000233.sst(size: 24828520 bytes)`
+`Level[0]: /000232.sst(size: 49874113 bytes)`
+`Level[0]: /000231.sst(size: 100243447 bytes)`
+`Level[0]: /000230.sst(size: 201507232 bytes)`
+`Level[1]: /000224.sst - /000229.sst(total size: 405046844 bytes)`
+`Level[2]: /000211.sst - /000223.sst(total size: 814190051 bytes)`
+`Level[3]: /000188.sst - /000210.sst(total size: 1515327216 bytes)`
+
+
+### MultiGet
+
+MultiGet benchmark command:
+
+`buck-out/opt/gen/rocks/tools/rocks_db_bench -use_existing_db=true —db=/rocks_db_team/prefix_scan -benchmarks="multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=8 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -threads=4 -cache_size=300000000 -async_io=true -multiread_stride=40000 -statistics —env_uri=ws://ws.flash.ftw3preprod1 -logtostderr=false -adaptive_readahead=true -bloom_bits=10`
+
+#### Single-file
+
+The default MultiGet implementation of reading from one file at a time had a latency of 1292 micros/op.
+
+`multireadrandom : 1291.992 micros/op 3095 ops/sec 60.007 seconds 185768 operations; 1.6 MB/s (46768 of 46768 found) `
+`rocksdb.db.multiget.micros P50 : 9664.419795 P95 : 20757.097056 P99 : 29329.444444 P100 : 46162.000000 COUNT : 23221 SUM : 239839394`
+
+#### Single-level
+
+MultiGet with async_io=true and optimize_multiget_for_io=false had a latency of 775 micros/op.
+
+`multireadrandom : 774.587 micros/op 5163 ops/sec 60.009 seconds 309864 operations; 2.7 MB/s (77816 of 77816 found)`
+`rocksdb.db.multiget.micros P50 : [6029.601964](tel:6029601964) P95 : 10727.467932 P99 : 13986.683940 P100 : 47466.000000 COUNT : 38733 SUM : 239750172`
+
+#### Multi-level
+
+With all optimizations turned on, MultiGet had the lowest latency of 508 micros/op.
+
+`multireadrandom : 507.533 micros/op 7881 ops/sec 60.003 seconds 472896 operations; 4.1 MB/s (117536 of 117536 found)`
+`rocksdb.db.multiget.micros P50 : 3923.819467 P95 : 7356.182075 P99 : 10880.728723 P100 : 28511.000000 COUNT : 59112 SUM : 239642721`
+
+### Scan
+
+Benchmark command:
+
+`buck-out/opt/gen/rocks/tools/rocks_db_bench -use_existing_db=true —db=/rocks_db_team/prefix_scan -ben``chmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=8 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_che``cks=1 -readonly=true -threads=4 -cache_size=300000000 -async_io=true -multiread_stride=40000 -statistics —env_uri=ws://ws.flash.ftw3preprod1 -logtostderr=false -a``daptive_readahead=true -bloom_bits=10 -seek_nexts=65536`
+
+### With async scan
+
+`seekrandom : 414442.303 micros/op 9 ops/sec 60.288 seconds 581 operations; 326.2 MB/s (145 of 145 found)`
+
+### Without async scan
+
+`seekrandom : 848858.669 micros/op 4 ops/sec 60.529 seconds 284 operations; 158.1 MB/s (74 of 74 found)`
+
+## Known Limitations
+
+These optimizations apply only to block based table SSTs. File system support for the `ReadAsync` and `Poll` interfaces is required. Currently, it is available only for `PosixFileSystem`.
+
+The MultiGet async IO optimization has a few additional limitations -
+
+1. Depends on folly, which introduces a few additional build steps
+2. Higher CPU overhead due to coroutines. The CPU overhead of MultiGet may increase 6-15%, with the worst case being a single threaded MultiGet batch of keys with 1 key/file intersection and 100% cache hit rate. A more realistic case of multiple threads with a few keys (~4) overlap per file should see ~6% higher CPU util.
+3. No parallelization of metadata reads. A metadata read will block the thread.
+4. A few other cases will also be in serial, such as additional block reads for merge operands.
+
+
diff --git a/src/rocksdb/docs/_posts/2022-10-31-align-compaction-output-file.markdown b/src/rocksdb/docs/_posts/2022-10-31-align-compaction-output-file.markdown
new file mode 100644
index 000000000..6df61b551
--- /dev/null
+++ b/src/rocksdb/docs/_posts/2022-10-31-align-compaction-output-file.markdown
@@ -0,0 +1,107 @@
+---
+title: Reduce Write Amplification by Aligning Compaction Output File Boundaries
+layout: post
+author:
+- zjay
+category: blog
+---
+## TL;DR
+By cutting the compaction output file earlier and allowing larger than targeted_file_size to align the compaction output files to the next level files, it can **reduce WA (Write Amplification) by more than 10%**. The feature is **enabled by default** after the user upgrades RocksDB to version `7.8.0+`.
+
+## Background
+RocksDB level compaction picks one file from the source level and compacts to the next level, which is a typical partial merge compaction algorithm. Compared to the full merge compaction strategy for example [universal compaction](https://github.com/facebook/rocksdb/wiki/Universal-Compaction), it has the benefits of smaller compaction size, better parallelism, etc. But it also has a larger write amplification (typically 20-30 times user data). One of the problems is wasted compaction at the beginning and ending:
+
+![](/static/images/align-compaction-output/file_cut_normal.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+In the diagram above, `SST11` is selected for the compaction, it overlaps with `SST20` to `SST23`, so all these files are selected for compaction. But the beginning and ending of the SST on Level 2 are wasted, which also means it will be compacted again when `SST10` is compacting down. If the file boundaries are aligned, then the wasted compaction size could be reduced. On average, the wasted compaction is `1` file size: `0.5` at the beginning, and `0.5` at the end. Typically the average compaction fan-out is about 6 (with the default max_bytes_for_level_multiplier = 10), then `1 / (6 + 1) ~= 14%` of compaction is wasted.
+## Implemtation
+To reduce such wasted compaction, RocksDB now tries to align the compaction output file to the next level's file. So future compactions will have fewer wasted compaction. For example, the above case might be cut like this:
+
+![](/static/images/align-compaction-output/file_cut_align.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+The trade-off is the file won't be cut exactly after it exceeds target_file_size_base, instead, it will be more likely cut when it's aligned with the next level file's boundary, so the file size might be more varied. It could be as small as 50% of `target_file_size` or as large as `2x target_file_size`. It will only impact non-bottommost-level files, which should be only `~11%` of the data.
+Internally, RocksDB tries to cut the file so its size is close to the `target_file_size` setting but also aligned with the next level boundary. When the compaction output file hit a next-level file boundary, either the beginning or ending boundary, it will cut if:
+```
+current_size > ((5 * min(bounderies_num, 8) + 50) / 100) * target_file_size
+```
+([details](https://github.com/facebook/rocksdb/blob/23fa5b7789d6acd0c211d6bdd41448bbf1513bb6/db/compaction/compaction_outputs.cc#L270-L290))
+
+The file size is also capped at `2x target_file_size`: [details](https://github.com/facebook/rocksdb/blob/f726d29a8268ae4e2ffeec09172383cff2ab4db9/db/compaction/compaction.cc#L273-L277).
+Another benefit of cutting the file earlier is having more trivial move compaction, which is moving the file from a high level to a low level without compacting anything. Based on a compaction simulator test, the trivial move data is increased by 30% (but still less than 1% compaction data is trivial move):
+
+![](/static/images/align-compaction-output/file_cut_trival_move.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+Based on the db_bench test, it can save `~12%` compaction load, here is the test command and result:
+```
+TEST_TMPDIR=/data/dbbench ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
+
+# baseline:
+Flush(GB): cumulative 25.882, interval 7.216
+Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
+
+# with this change:
+Flush(GB): cumulative 25.882, interval 7.753
+Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
+```
+
+The feature is enabled by default by upgrading to RocksDB 7.8 or later versions, as the feature should have a limited impact on the file size and have great write amplification improvements. If in a rare case, it needs to opt out, set
+```
+options.level_compaction_dynamic_file_size = false;
+```
+
+## Other Options and Benchmark
+We also tested a few other options, starting with a fixed threshold: 75% of the target_file_size and 50%. Then with a dynamic threshold that is explained, but still limiting file size smaller than the target_file_size.
+1. Baseline (main branch before [PR#10655](https://github.com/facebook/rocksdb/pull/10655));
+2. Fixed Threshold `75%`: after 75% of target file size, cut the file whenever it aligns with a low level file boundary;
+3. Fixed Threshold `50%`: reduce the threshold to 50% of target file size;
+4. Dynamic Threshold `(5*bounderies_num + 50)` percent of target file size and maxed at 90%;
+5. Dynamic Threshold + allow 2x the target file size (chosen option).
+
+### Test Environment and Data
+To speed up the benchmark, we introduced a compaction simulator within Rocksdb ([details](https://github.com/jay-zhuang/rocksdb/tree/compaction_sim)), which replaced the physical SST with in-memory data (a large bitset). Which can test compaction more consistently. As it's a simulator, it has its limitations:
+
+it assumes each key-value has the same size;
+1. no deletion (but has override);
+2. doesn't consider data compression;
+3. single-threaded and finish all compactions before the next flush (so no write stall). 
+
+We use 3 kinds of the dataset for tests:
+1. Random Data, has an override, evenly distributed;
+2. Zipf distribution with alpha = 1.01, moderately skewed;
+3. Zipf distribution with alpha = 1.2, highly skewed.
+
+#### Write Amplification
+
+![](/static/images/align-compaction-output/write_amp_compare.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 100%"}
+
+As we can see, all options are better than the baseline. Option5 (brown) and option3 (green) have similar WA improvements. (The sudden WA drop during ~40G Random Dataset is because we enabled `level_compaction_dynamic_level_bytes` and the level number was increased from 3 to 4, the similar test result without enabling `level_compaction_dynamic_level_bytes`).
+
+#### File Size Distribution at the End of Test
+This is the file size distribution at the end of the test, which loads about 100G data. As this change only impacts the non-bottommost file size, and the majority of the SST files are bottommost, there're no significant differences:
+
+![](/static/images/align-compaction-output/file_size_compare.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 100%"}
+
+#### All Compaction Generated File Sizes
+The high-level files are much more likely to be compacted, so all compaction-generated files size has more significant change:
+
+![](/static/images/align-compaction-output/compaction_output_file_size_compare.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 100%"}
+
+Overall option5 has most of the file size close to the target file size. vs. option3 has a much smaller size. Here are more detailed stats for compaction output file size:
+```
+              base           50p           75p       dynamic     2xdynamic
+count  1.656000e+03  1.960000e+03  1.770000e+03  1.687000e+03  1.705000e+03
+mean   3.116062e+07  2.634125e+07  2.917876e+07  3.060135e+07  3.028076e+07
+std    7.145242e+06  1.065134e+07  8.800474e+06  7.612939e+06  8.046139e+06
+```
+
+## Summary
+Allowing more dynamic file size and aligning the compaction output file to the next level file's boundary improves the RocksDB write amplification by more than 10%, which will be enabled by default in `7.8.0` release. We picked a simple algorithm to decide when to cut the output file, which can be further improved. For example, by estimating output file size with index information. Any suggestions or PR are welcomed.
+
+## Acknowledgements
+We thank Siying Dong for initializing the file-cutting idea and thank Andrew Kryczka, Mark Callaghan for contributing to the ideas. And Changyu Bi for the detailed code review.
diff --git a/src/rocksdb/docs/_sass/_base.scss b/src/rocksdb/docs/_sass/_base.scss
new file mode 100644
index 000000000..6d26d9feb
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_base.scss
@@ -0,0 +1,492 @@
+body {
+  background: $secondary-bg;
+  color: $text;
+	font: normal #{$base-font-size}/#{$base-line-height} $base-font-family;
+  height: 100vh;
+	text-align: left;
+	text-rendering: optimizeLegibility;
+}
+
+img {
+  max-width: 100%;
+}
+
+article {
+  p {
+    img {
+      max-width: 100%;
+      display:block;
+      margin-left: auto;
+      margin-right: auto;
+    }
+  }
+}
+
+a {
+  border-bottom: 1px dotted $primary-bg;
+  color: $text;
+  text-decoration: none;
+  -webkit-transition: background 0.3s, color 0.3s;
+  transition: background 0.3s, color 0.3s;
+}
+
+blockquote {
+  padding: 15px 30px 15px 15px;
+  margin: 20px 0 0 10px;
+  background-color: rgba(204, 122, 111, 0.1);
+  border-left: 10px solid rgba(191, 87, 73, 0.2);
+}
+
+#fb_oss a {
+  border: 0;
+}
+
+h1, h2, h3, h4 {
+  font-family: $header-font-family;
+  font-weight: 900;
+}
+
+.navPusher {
+  border-top: $header-height + $header-ptop + $header-pbot solid $primary-bg;
+	height: 100%;
+	left: 0;
+	position: relative;
+	z-index: 99;
+}
+
+.homeContainer {
+  background: $primary-bg;
+  color: $primary-overlay;
+
+  a {
+    color: $primary-overlay;
+  }
+
+  .homeSplashFade {
+    color: white;
+  }
+
+  .homeWrapper {
+    padding: 2em 10px;
+    text-align: left;
+
+      .wrapper {
+        margin: 0px auto;
+        max-width: $content-width;
+        padding: 0 20px;
+      }
+
+      .projectLogo {
+        img {
+          height: 100px;
+          margin-bottom: 0px;
+        }
+      }
+
+      h1#project_title {
+        font-family: $header-font-family;
+        font-size: 300%;
+        letter-spacing: -0.08em;
+        line-height: 1em;
+        margin-bottom: 80px;
+      }
+
+      h2#project_tagline {
+        font-family: $header-font-family;
+        font-size: 200%;
+        letter-spacing: -0.04em;
+        line-height: 1em;
+      }
+  }
+}
+
+.wrapper {
+	margin: 0px auto;
+	max-width: $content-width;
+	padding: 0 10px;
+}
+
+.projectLogo {
+  display: none;
+
+  img {
+    height: 100px;
+    margin-bottom: 0px;
+  }
+}
+
+section#intro {
+  margin: 40px 0;
+}
+
+.fbossFontLight {
+  font-family: $base-font-family;
+  font-weight: 300;
+  font-style: normal;
+}
+
+.fb-like {
+  display: block;
+  margin-bottom: 20px;
+  width: 100%;
+}
+
+.center {
+  display: block;
+  text-align: center;
+}
+
+.mainContainer {
+  background: $secondary-bg;
+  overflow: auto;
+
+  .mainWrapper {
+    padding: 4vh 10px;
+    text-align: left;
+
+    .allShareBlock {
+      padding: 10px 0;
+
+      .pluginBlock {
+        margin: 12px 0;
+        padding: 0;
+      }
+    }
+
+    a {
+      &:hover,
+      &:focus {
+        background: $primary-bg;
+        color: $primary-overlay;
+      }
+    }
+
+    em, i {
+      font-style: italic;
+    }
+
+    strong, b {
+      font-weight: bold;
+    }
+
+    h1 {
+      font-size: 300%;
+      line-height: 1em;
+      padding: 1.4em 0 1em;
+      text-align: left;
+    }
+
+    h2 {
+      font-size: 250%;
+      line-height: 1em;
+      margin-bottom: 20px;
+      padding: 1.4em 0 20px;
+      text-align: left;
+
+      & {
+        border-bottom: 1px solid darken($primary-bg, 10%);
+        color: darken($primary-bg, 10%);
+        font-size: 22px;
+        padding: 10px 0;
+      }
+
+      &.blockHeader {
+        border-bottom: 1px solid white;
+        color: white;
+        font-size: 22px;
+        margin-bottom: 20px;
+        padding: 10px 0;
+      }
+    }
+
+    h3 {
+      font-size: 150%;
+      line-height: 1.2em;
+      padding: 1em 0 0.8em;
+    }
+
+    h4 {
+      font-size: 130%;
+      line-height: 1.2em;
+      padding: 1em 0 0.8em;
+    }
+
+    p {
+      padding: 0.8em 0;
+    }
+
+    ul {
+      list-style: disc;
+    }
+
+    ol, ul {
+      padding-left: 24px;
+      li {
+        padding-bottom: 4px;
+        padding-left: 6px;
+      }
+    }
+
+    strong {
+      font-weight: bold;
+    }
+
+    .post {
+      position: relative;
+
+      .katex {
+        font-weight: 700;
+      }
+
+      &.basicPost {
+        margin-top: 30px;
+      }
+
+      a {
+        color: $primary-bg;
+
+        &:hover,
+        &:focus {
+          color: #fff;
+        }
+      }
+
+      h2 {
+        border-bottom: 4px solid $primary-bg;
+        font-size: 130%;
+      }
+
+      h3 {
+        border-bottom: 1px solid $primary-bg;
+        font-size: 110%;
+      }
+
+      ol {
+        list-style: decimal outside none;
+      }
+
+      .post-header {
+        padding: 1em 0;
+
+        h1 {
+          font-size: 150%;
+          line-height: 1em;
+          padding: 0.4em 0 0;
+
+          a {
+            border: none;
+          }
+        }
+
+        .post-meta {
+          color: $primary-bg;
+          font-family: $header-font-family;
+          text-align: center;
+        }
+      }
+
+      .postSocialPlugins {
+        padding-top: 1em;
+      }
+
+      .docPagination {
+        background: $primary-bg;
+        bottom: 0px;
+        left: 0px;
+        position: absolute;
+        right: 0px;
+
+        .pager {
+          display: inline-block;
+          width: 50%;
+        }
+
+        .pagingNext {
+          float: right;
+          text-align: right;
+        }
+
+        a {
+          border: none;
+          color: $primary-overlay;
+          display: block;
+          padding: 4px 12px;
+
+          &:hover {
+            background-color: $secondary-bg;
+            color: $text;
+          }
+
+          .pagerLabel {
+            display: inline;
+          }
+
+          .pagerTitle {
+            display: none;
+          }
+        }
+      }
+    }
+
+    .posts {
+      .post {
+        margin-bottom: 6vh;
+      }
+    }
+  }
+}
+
+#integrations_title  {
+  font-size: 250%;
+  margin: 80px 0;
+}
+
+.ytVideo {
+  height: 0;
+  overflow: hidden;
+  padding-bottom: 53.4%; /* 16:9 */
+  padding-top: 25px;
+  position: relative;
+}
+
+.ytVideo iframe,
+.ytVideo object,
+.ytVideo embed {
+  height: 100%;
+  left: 0;
+  position: absolute;
+  top: 0;
+  width: 100%;
+}
+
+@media only screen and (min-width: 480px) {
+  h1#project_title {
+    font-size: 500%;
+  }
+
+  h2#project_tagline {
+    font-size: 250%;
+  }
+
+  .projectLogo {
+    img {
+      margin-bottom: 10px;
+      height: 200px;
+    }
+  }
+
+  .homeContainer .homeWrapper {
+    padding-left: 10px;
+    padding-right: 10px;
+  }
+
+  .mainContainer {
+    .mainWrapper {
+      .post {
+        h2 {
+          font-size: 180%;
+        }
+
+        h3 {
+          font-size: 120%;
+        }
+
+        .docPagination {
+          a {
+            .pagerLabel {
+              display: none;
+            }
+            .pagerTitle {
+              display: inline;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+@media only screen and (min-width: 900px) {
+  .homeContainer {
+    .homeWrapper {
+      position: relative;
+
+      #inner {
+        box-sizing: border-box;
+        max-width: 600px;
+        padding-right: 40px;
+      }
+
+      .projectLogo {
+        align-items: center;
+        bottom: 0;
+        display: flex;
+        justify-content: flex-end;
+        left: 0;
+        padding: 2em 20px 4em;
+        position: absolute;
+        right: 20px;
+        top: 0;
+
+        img {
+          height: 100%;
+          max-height: 250px;
+        }
+      }
+    }
+  }
+}
+
+@media only screen and (min-width: 1024px) {
+  .mainContainer {
+    .mainWrapper {
+      .post {
+        box-sizing: border-box;
+        display: block;
+
+        .post-header {
+          h1 {
+            font-size: 250%;
+          }
+        }
+      }
+
+      .posts {
+        .post {
+          margin-bottom: 4vh;
+          width: 100%;
+        }
+      }
+    }
+  }
+}
+
+@media only screen and (min-width: 1200px) {
+  .homeContainer {
+    .homeWrapper {
+      #inner {
+        max-width: 750px;
+      }
+    }
+  }
+
+  .wrapper {
+    max-width: 1100px;
+  }
+}
+
+@media only screen and (min-width: 1500px) {
+  .homeContainer {
+    .homeWrapper {
+      #inner {
+        max-width: 1100px;
+        padding-bottom: 40px;
+        padding-top: 40px;
+      }
+    }
+  }
+
+  .wrapper {
+    max-width: 1400px;
+  }
+}
diff --git a/src/rocksdb/docs/_sass/_blog.scss b/src/rocksdb/docs/_sass/_blog.scss
new file mode 100644
index 000000000..12a73c1fc
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_blog.scss
@@ -0,0 +1,47 @@
+.blogContainer {
+  .posts {
+    margin-top: 60px;
+
+    .post {
+      border: 1px solid $primary-bg;
+      border-radius: 3px;
+      padding: 10px 20px 20px;
+    }
+  }
+
+  .lonePost {
+    margin-top: 60px;
+
+    .post {
+      padding: 10px 0px 0px;
+    }
+  }
+
+  .post-header {
+    h1 {
+      text-align: center;
+    }
+
+    .post-authorName {
+      color: rgba($text, 0.7);
+      font-size: 14px;
+      font-weight: 900;
+      margin-top: 0;
+      padding: 0;
+      text-align: center;
+    }
+
+    .authorPhoto {
+      border-radius: 50%;
+      height: 50px;
+      left: 50%;
+      margin-left: auto;
+      margin-right: auto;
+      display: inline-block;
+      overflow: hidden;
+      position: static;
+      top: -25px;
+      width: 50px;
+    }
+  }
+}
diff --git a/src/rocksdb/docs/_sass/_buttons.scss b/src/rocksdb/docs/_sass/_buttons.scss
new file mode 100644
index 000000000..a0371618f
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_buttons.scss
@@ -0,0 +1,47 @@
+.button {
+  border: 1px solid $primary-bg;
+  border-radius: 3px;
+  color: $primary-bg;
+  display: inline-block;
+  font-size: 14px;
+  font-weight: 900;
+  line-height: 1.2em;
+  padding: 10px;
+  text-transform: uppercase;
+  transition: background 0.3s, color 0.3s;
+
+  &:hover {
+    background: $primary-bg;
+    color: $primary-overlay;
+  }
+}
+
+.homeContainer {
+  .button {
+    border-color: $primary-overlay;
+    border-width: 1px;
+    color: $primary-overlay;
+
+    &:hover {
+      background: $primary-overlay;
+      color: $primary-bg;
+    }
+  }
+}
+
+.blockButton {
+  display: block;
+}
+
+.edit-page-link {
+    float: right;
+    font-size: 14px;
+    font-weight: normal;
+    line-height: 20px;
+    opacity: 0.6;
+    transition: opacity 0.5s;
+}
+
+.edit-page-link:hover {
+  opacity: 1;
+}
diff --git a/src/rocksdb/docs/_sass/_footer.scss b/src/rocksdb/docs/_sass/_footer.scss
new file mode 100644
index 000000000..5b7439517
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_footer.scss
@@ -0,0 +1,82 @@
+.footerContainer {
+  background: $secondary-bg;
+  color: $primary-bg;
+  overflow: hidden;
+  padding: 0 10px;
+  text-align: left;
+
+  .footerWrapper {
+    border-top: 1px solid $primary-bg;
+    padding: 0;
+
+    .footerBlocks {
+      align-items: center;
+      align-content: center;
+      display: flex;
+      flex-flow: row wrap;
+      margin: 0 -20px;
+      padding: 10px 0;
+    }
+
+    .footerSection {
+      box-sizing: border-box;
+      flex: 1 1 25%;
+      font-size: 14px;
+      min-width: 275px;
+      padding: 0px 20px;
+
+      a {
+        border: 0;
+        color: inherit;
+        display: inline-block;
+        line-height: 1.2em;
+      }
+
+      .footerLink {
+        padding-right: 20px;
+      }
+    }
+
+    .fbOpenSourceFooter {
+      align-items: center;
+      display: flex;
+      flex-flow: row nowrap;
+      max-width: 25%;
+
+      .facebookOSSLogoSvg {
+        flex: 0 0 31px;
+        height: 30px;
+        margin-right: 10px;
+        width: 31px;
+
+        path {
+          fill: $primary-bg;
+        }
+
+        .middleRing {
+          opacity: 0.7;
+        }
+
+        .innerRing {
+          opacity: 0.45;
+        }
+      }
+
+      h2 {
+        display: block;
+        font-weight: 900;
+        line-height: 1em;
+      }
+    }
+  }
+}
+
+@media only screen and (min-width: 900px) {
+  .footerSection {
+    &.rightAlign {
+      margin-left: auto;
+      max-width: 25%;
+      text-align: right;
+    }
+  }
+}
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_sass/_gridBlock.scss b/src/rocksdb/docs/_sass/_gridBlock.scss
new file mode 100644
index 000000000..679b31c14
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_gridBlock.scss
@@ -0,0 +1,115 @@
+.gridBlock {
+  margin: -5px 0;
+  padding: 0;
+  padding-bottom: 20px;
+
+  .blockElement {
+    padding: 5px 0;
+
+    img {
+      max-width: 100%;
+    }
+
+    h3 {
+      border-bottom: 1px solid rgba($primary-bg, 0.5);
+      color: $primary-bg;
+      font-size: 18px;
+      margin: 0;
+      padding: 10px 0;
+    }
+  }
+
+  .gridClear {
+    clear: both;
+  }
+
+}
+
+.gridBlock .alignCenter {
+	text-align: center;
+}
+.gridBlock .alignRight {
+	text-align: right;
+}
+.gridBlock .imageAlignSide {
+	align-items: center;
+	display: flex;
+	flex-flow: row wrap;
+}
+.blockImage {
+	max-width: 150px;
+	width: 50%;
+}
+.imageAlignTop .blockImage {
+	margin-bottom: 20px;
+}
+.imageAlignTop.alignCenter .blockImage {
+	margin-left: auto;
+	margin-right: auto;
+}
+.imageAlignSide .blockImage {
+	flex: 0 1 100px;
+	margin-right: 20px;
+}
+.imageAlignSide .blockContent {
+	flex: 1 1;
+}
+
+@media only screen and (max-width: 1023px) {
+	.responsiveList .blockContent {
+		position: relative;
+	}
+	.responsiveList .blockContent > div {
+		padding-left: 20px;
+	}
+	.responsiveList .blockContent::before {
+		content: "\2022";
+		position: absolute;
+	}
+}
+
+@media only screen and (min-width: 1024px) {
+  .gridBlock {
+    display: flex;
+    flex-direction: row;
+    flex-wrap: wrap;
+    margin: -10px -10px 10px -10px;
+
+    .twoByGridBlock {
+      box-sizing: border-box;
+      flex: 1 0 50%;
+      padding: 10px;
+    }
+
+    .fourByGridBlock {
+      box-sizing: border-box;
+      flex: 1 0 25%;
+      padding: 10px;
+    }
+  }
+
+  h2 + .gridBlock {
+    padding-top: 20px;
+  }
+}
+
+@media only screen and (min-width: 1400px) {
+  .gridBlock {
+    display: flex;
+    flex-direction: row;
+    flex-wrap: wrap;
+    margin: -10px -20px 10px -20px;
+
+    .twoByGridBlock {
+      box-sizing: border-box;
+      flex: 1 0 50%;
+      padding: 10px 20px;
+    }
+
+    .fourByGridBlock {
+      box-sizing: border-box;
+      flex: 1 0 25%;
+      padding: 10px 20px;
+    }
+  }
+}
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_sass/_header.scss b/src/rocksdb/docs/_sass/_header.scss
new file mode 100644
index 000000000..ac79390f4
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_header.scss
@@ -0,0 +1,139 @@
+.fixedHeaderContainer {
+  background: $primary-bg;
+  color: $primary-overlay;
+  height: $header-height;
+  padding: $header-ptop 0 $header-pbot;
+  position: sticky;
+  top: 0;
+  width: 100%;
+  z-index: 9999;
+
+  a {
+    align-items: center;
+    border: 0;
+    color: $primary-overlay;
+    display: flex;
+    flex-flow: row nowrap;
+    height: $header-height;
+  }
+
+  header {
+    display: flex;
+    flex-flow: row nowrap;
+    position: relative;
+    text-align: left;
+
+    img {
+      height: 24px;
+      margin-right: 10px;
+    }
+
+    h2 {
+      display: block;
+      font-family: $header-font-family;
+      font-weight: 900;
+      line-height: 18px;
+      position: relative;
+    }
+  }
+}
+
+.navigationFull {
+  height: 34px;
+  margin-left: auto;
+
+  nav {
+    position: relative;
+
+    ul {
+      display: flex;
+      flex-flow: row nowrap;
+      margin: 0 -10px;
+
+      li {
+        padding: 0 10px;
+        display: block;
+
+        a {
+          border: 0;
+          color: $primary-overlay-special;
+          font-size: 16px;
+          font-weight: 400;
+          line-height: 1.2em;
+
+          &:hover {
+            border-bottom: 2px solid $primary-overlay;
+            color: $primary-overlay;
+          }
+        }
+
+        &.navItemActive {
+          a {
+            color: $primary-overlay;
+          }
+        }
+      }
+    }
+  }
+}
+
+/* 900px
+
+
+  .fixedHeaderContainer {
+    .navigationWrapper {
+      nav {
+        padding: 0 1em;
+        position: relative;
+        top: -9px;
+
+        ul {
+          margin: 0 -0.4em;
+          li {
+            display: inline-block;
+
+            a {
+              padding: 14px 0.4em;
+              border: 0;
+              color: $primary-overlay-special;
+              display: inline-block;
+
+              &:hover {
+                color: $primary-overlay;
+              }
+            }
+
+            &.navItemActive {
+              a {
+                color: $primary-overlay;
+              }
+            }
+          }
+        }
+      }
+
+      &.navigationFull {
+        display: inline-block;
+      }
+
+      &.navigationSlider {
+        display: none;
+      }
+    }
+  }
+
+  1200px
+
+  .fixedHeaderContainer {
+    header {
+      max-width: 1100px;
+    }
+  }
+
+  1500px
+  .fixedHeaderContainer {
+    header {
+      max-width: 1400px;
+    }
+  }
+ */
diff --git a/src/rocksdb/docs/_sass/_poweredby.scss b/src/rocksdb/docs/_sass/_poweredby.scss
new file mode 100644
index 000000000..4155b6053
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_poweredby.scss
@@ -0,0 +1,69 @@
+.poweredByContainer {
+  background: $primary-bg;
+  color: $primary-overlay;
+  margin-bottom: 20px;
+
+  a {
+    color: $primary-overlay;
+  }
+
+  .poweredByWrapper {
+    h2 {
+      border-color: $primary-overlay-special;
+      color: $primary-overlay-special;
+    }
+  }
+
+  .poweredByMessage {
+    color: $primary-overlay-special;
+    font-size: 14px;
+    padding-top: 20px;
+  }
+}
+
+.poweredByItems {
+  display: flex;
+  flex-flow: row wrap;
+  margin: 0 -10px;
+}
+
+.poweredByItem {
+  box-sizing: border-box;
+  flex: 1 0 50%;
+  line-height: 1.1em;
+  padding: 5px 10px;
+
+  &.itemLarge {
+    flex-basis: 100%;
+    padding: 10px;
+    text-align: center;
+
+    &:nth-child(4) {
+      padding-bottom: 20px;
+    }
+
+    img {
+      max-height: 30px;
+    }
+  }
+}
+
+@media only screen and (min-width: 480px) {
+  .itemLarge {
+    flex-basis: 50%;
+    max-width: 50%;
+  }
+}
+
+@media only screen and (min-width: 1024px) {
+  .poweredByItem {
+    flex-basis: 25%;
+    max-width: 25%;
+
+    &.itemLarge {
+      padding-bottom: 20px;
+      text-align: left;
+    }
+  }
+}
+
diff --git a/src/rocksdb/docs/_sass/_promo.scss b/src/rocksdb/docs/_sass/_promo.scss
new file mode 100644
index 000000000..8c9a809dc
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_promo.scss
@@ -0,0 +1,55 @@
+.promoSection {
+  display: flex;
+  flex-flow: column wrap;
+  font-size: 125%;
+  line-height: 1.6em;
+  margin: -10px 0;
+  position: relative;
+  z-index: 99;
+
+  .promoRow {
+    padding: 10px 0;
+
+    .pluginWrapper {
+      display: block;
+
+      &.ghWatchWrapper, &.ghStarWrapper {
+        height: 28px;
+      }
+    }
+
+    .pluginRowBlock {
+      display: flex;
+      flex-flow: row wrap;
+      margin: 0 -2px;
+
+      .pluginWrapper {
+        padding: 0 2px;
+      }
+    }
+  }
+}
+
+iframe.pluginIframe {
+  height: 500px;
+  margin-top: 20px;
+  width: 100%;
+}
+
+.iframeContent {
+  display: none;
+}
+
+.iframePreview {
+  display: inline-block;
+  margin-top: 20px;
+}
+
+@media only screen and (min-width: 1024px) {
+  .iframeContent {
+    display: block;
+  }
+  .iframePreview {
+    display: none;
+  }
+}
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_sass/_react_docs_nav.scss b/src/rocksdb/docs/_sass/_react_docs_nav.scss
new file mode 100644
index 000000000..f0a651e7f
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_react_docs_nav.scss
@@ -0,0 +1,332 @@
+.docsNavContainer {
+  background: $sidenav;
+  height: 35px;
+  left: 0;
+  position: fixed;
+  width: 100%;
+  z-index: 100;
+}
+
+.docMainWrapper {
+  .wrapper {
+    &.mainWrapper {
+      padding-left: 0;
+      padding-right: 0;
+      padding-top: 10px;
+    }
+  }
+}
+
+.docsSliderActive {
+  .docsNavContainer {
+    box-sizing: border-box;
+    height: 100%;
+    overflow-y: auto;
+    -webkit-overflow-scrolling: touch;
+    padding-bottom: 50px;
+  }
+
+  .mainContainer {
+    display: none;
+  }
+}
+
+.navBreadcrumb {
+  box-sizing: border-box;
+  display: flex;
+  flex-flow: row nowrap;
+  font-size: 12px;
+  height: 35px;
+  overflow: hidden;
+  padding: 5px 10px;
+
+  a, span {
+    border: 0;
+    color: $sidenav-text;
+  }
+
+  i {
+    padding: 0 3px;
+  }
+}
+
+nav.toc {
+  position: relative;
+
+  section {
+    padding: 0px;
+    position: relative;
+
+    .navGroups {
+      display: none;
+      padding: 40px 10px 10px;
+    }
+  }
+
+  .toggleNav {
+    background: $sidenav;
+    color: $sidenav-text;
+    position: relative;
+    transition: background-color 0.3s, color 0.3s;
+
+    .navToggle {
+      cursor: pointer;
+      height: 24px;
+      margin-right: 10px;
+      position: relative;
+      text-align: left;
+      width: 18px;
+
+      &::before, &::after {
+        content: "";
+        position: absolute;
+        top: 50%;
+        left: 0;
+        left: 8px;
+        width: 3px;
+        height: 6px;
+        border: 5px solid $sidenav-text;
+        border-width: 5px 0;
+        margin-top: -8px;
+        transform: rotate(45deg);
+        z-index: 1;
+      }
+
+      &::after {
+        transform: rotate(-45deg);
+      }
+
+      i {
+        &::before, &::after {
+          content: "";
+          position: absolute;
+          top: 50%;
+          left: 2px;
+          background: transparent;
+          border-width: 0 5px 5px;
+          border-style: solid;
+          border-color: transparent $sidenav-text;
+          height: 0;
+          margin-top: -7px;
+          opacity: 1;
+          width: 5px;
+          z-index: 10;
+        }
+
+        &::after {
+          border-width: 5px 5px 0;
+          margin-top: 2px;
+        }
+      }
+    }
+
+    .navGroup {
+      background: $sidenav-overlay;
+      margin: 1px 0;
+
+      ul {
+        display: none;
+      }
+
+      h3 {
+        background: $sidenav-overlay;
+        color: $sidenav-text;
+        cursor: pointer;
+        font-size: 14px;
+        font-weight: 400;
+        line-height: 1.2em;
+        padding: 10px;
+        transition: color 0.2s;
+
+        i:not(:empty) {
+          width: 16px;
+          height: 16px;
+          display: inline-block;
+          box-sizing: border-box;
+          text-align: center;
+          color: rgba($sidenav-text, 0.5);
+          margin-right: 10px;
+          transition: color 0.2s;
+        }
+
+        &:hover {
+          color: $primary-bg;
+
+          i:not(:empty) {
+            color: $primary-bg;
+          }
+        }
+      }
+
+      &.navGroupActive {
+        background: $sidenav-active;
+        color: $sidenav-text;
+
+        ul {
+          display: block;
+          padding-bottom: 10px;
+          padding-top: 10px;
+        }
+
+        h3 {
+          background: $primary-bg;
+          color: $primary-overlay;
+
+          i {
+            display: none;
+          }
+        }
+      }
+    }
+
+    ul {
+      padding-left: 0;
+      padding-right: 24px;
+
+      li {
+        list-style-type: none;
+        padding-bottom: 0;
+        padding-left: 0;
+
+        a {
+          border: none;
+          color: $sidenav-text;
+          display: inline-block;
+          font-size: 14px;
+          line-height: 1.1em;
+          margin: 2px 10px 5px;
+          padding: 5px 0 2px;
+          transition: color 0.3s;
+
+          &:hover,
+          &:focus {
+            color: $primary-bg;
+          }
+
+          &.navItemActive {
+            color: $primary-bg;
+            font-weight: 900;
+          }
+        }
+      }
+    }
+  }
+
+  .toggleNavActive {
+    .navBreadcrumb {
+      background: $sidenav;
+      margin-bottom: 20px;
+      position: fixed;
+      width: 100%;
+    }
+
+    section {
+      .navGroups {
+        display: block;
+      }
+    }
+
+
+    .navToggle {
+      &::before, &::after {
+        border-width: 6px 0;
+        height: 0px;
+        margin-top: -6px;
+      }
+
+      i {
+        opacity: 0;
+      }
+    }
+  }
+}
+
+.docsNavVisible {
+  .navPusher {
+    .mainContainer {
+      padding-top: 35px;
+    }
+  }
+}
+
+@media only screen and (min-width: 900px) {
+  .navBreadcrumb {
+    padding: 5px 0;
+  }
+
+  nav.toc {
+    section {
+      .navGroups {
+        padding: 40px 0 0;
+      }
+    }
+  }
+}
+
+@media only screen and (min-width: 1024px) {
+  .navToggle {
+    display: none;
+  }
+
+  .docsSliderActive {
+    .mainContainer {
+      display: block;
+    }
+  }
+
+  .docsNavVisible {
+    .navPusher {
+      .mainContainer {
+        padding-top: 0;
+      }
+    }
+  }
+
+  .docsNavContainer {
+    background: none;
+    box-sizing: border-box;
+    height: auto;
+    margin: 40px 40px 0 0;
+    overflow-y: auto;
+    position: relative;
+    width: 300px;
+  }
+
+  nav.toc {
+    section {
+      .navGroups {
+        display: block;
+        padding-top: 0px;
+      }
+    }
+
+    .toggleNavActive {
+      .navBreadcrumb {
+        margin-bottom: 0;
+        position: relative;
+      }
+    }
+  }
+
+  .docMainWrapper {
+    display: flex;
+    flex-flow: row nowrap;
+    margin-bottom: 40px;
+
+    .wrapper {
+      padding-left: 0;
+      padding-right: 0;
+
+      &.mainWrapper {
+        padding-top: 0;
+      }
+    }
+  }
+
+  .navBreadcrumb {
+    display: none;
+    h2 {
+      padding: 0 10px;
+    }
+  }
+}
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_sass/_react_header_nav.scss b/src/rocksdb/docs/_sass/_react_header_nav.scss
new file mode 100644
index 000000000..13c0e562b
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_react_header_nav.scss
@@ -0,0 +1,141 @@
+.navigationFull {
+  display: none;
+}
+
+.navigationSlider {
+  position: absolute;
+  right: 0px;
+
+  .navSlideout {
+    cursor: pointer;
+    padding-top: 4px;
+    position: absolute;
+    right: 10px;
+    top: 0;
+    transition: top 0.3s;
+    z-index: 101;
+  }
+
+  .slidingNav {
+    background: $secondary-bg;
+    box-sizing: border-box;
+    height: 0px;
+    overflow-x: hidden;
+    padding: 0;
+    position: absolute;
+    right: 0px;
+    top: 0;
+    transition: height 0.3s cubic-bezier(0.68, -0.55, 0.265, 1.55), width 0.3s cubic-bezier(0.68, -0.55, 0.265, 1.55);
+    width: 0;
+
+    ul {
+      flex-flow: column nowrap;
+      list-style: none;
+      padding: 10px;
+
+      li {
+        margin: 0;
+        padding: 2px 0;
+
+        a {
+          color: $primary-bg;
+          display: inline;
+          margin: 3px 5px;
+          padding: 2px 0px;
+          transition: background-color 0.3s;
+
+          &:focus,
+          &:hover {
+            border-bottom: 2px solid $primary-bg;
+          }
+        }
+      }
+    }
+  }
+
+  .navSlideoutActive {
+    .slidingNav {
+      height: auto;
+      padding-top: $header-height + $header-pbot;
+      width: 300px;
+    }
+
+    .navSlideout {
+      top: -2px;
+      .menuExpand {
+        span:nth-child(1) {
+          background-color: $text;
+          top: 16px;
+          transform: rotate(45deg);
+        }
+        span:nth-child(2) {
+          opacity: 0;
+        }
+        span:nth-child(3) {
+          background-color: $text;
+          transform: rotate(-45deg);
+        }
+      }
+    }
+  }
+}
+
+.menuExpand {
+  display: flex;
+  flex-flow: column nowrap;
+  height: 20px;
+  justify-content: space-between;
+
+  span {
+    background: $primary-overlay;
+    border-radius: 3px;
+    display: block;
+    flex: 0 0 4px;
+    height: 4px;
+    position: relative;
+    top: 0;
+    transition: background-color 0.3s, top 0.3s, opacity 0.3s, transform 0.3s;
+    width: 20px;
+  }
+}
+
+.navPusher {
+  border-top: $header-height + $header-ptop + $header-pbot solid $primary-bg;
+	position: relative;
+	left: 0;
+	z-index: 99;
+	height: 100%;
+
+  &::after {
+    position: absolute;
+    top: 0;
+    right: 0;
+    width: 0;
+    height: 0;
+    background: rgba(0,0,0,0.4);
+    content: '';
+    opacity: 0;
+    -webkit-transition: opacity 0.5s, width 0.1s 0.5s, height 0.1s 0.5s;
+    transition: opacity 0.5s, width 0.1s 0.5s, height 0.1s 0.5s;
+  }
+
+  .sliderActive &::after {
+    width: 100%;
+    height: 100%;
+    opacity: 1;
+    -webkit-transition: opacity 0.5s;
+    transition: opacity 0.5s;
+    z-index: 100;
+  }
+}
+
+
+@media only screen and (min-width: 1024px) {
+  .navigationFull {
+    display: block;
+  }
+
+  .navigationSlider {
+    display: none;
+  }
+}
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_sass/_reset.scss b/src/rocksdb/docs/_sass/_reset.scss
new file mode 100644
index 000000000..0e5f2e0c1
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_reset.scss
@@ -0,0 +1,43 @@
+html, body, div, span, applet, object, iframe,
+h1, h2, h3, h4, h5, h6, p, blockquote, pre,
+a, abbr, acronym, address, big, cite, code,
+del, dfn, em, img, ins, kbd, q, s, samp,
+small, strike, strong, sub, sup, tt, var,
+b, u, i, center,
+dl, dt, dd, ol, ul, li,
+fieldset, form, label, legend,
+table, caption, tbody, tfoot, thead, tr, th, td,
+article, aside, canvas, details, embed,
+figure, figcaption, footer, header, hgroup,
+menu, nav, output, ruby, section, summary,
+time, mark, audio, video {
+	margin: 0;
+	padding: 0;
+	border: 0;
+	font-size: 100%;
+	font: inherit;
+	vertical-align: baseline;
+}
+/* HTML5 display-role reset for older browsers */
+article, aside, details, figcaption, figure,
+footer, header, hgroup, menu, nav, section {
+	display: block;
+}
+body {
+	line-height: 1;
+}
+ol, ul {
+	list-style: none;
+}
+blockquote, q {
+	quotes: none;
+}
+blockquote:before, blockquote:after,
+q:before, q:after {
+	content: '';
+	content: none;
+}
+table {
+	border-collapse: collapse;
+	border-spacing: 0;
+}
diff --git a/src/rocksdb/docs/_sass/_search.scss b/src/rocksdb/docs/_sass/_search.scss
new file mode 100644
index 000000000..eadfa11d1
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_search.scss
@@ -0,0 +1,142 @@
+input[type="search"] {
+    -moz-appearance:    none;
+    -webkit-appearance: none;
+}
+
+.navSearchWrapper {
+  align-self: center;
+  position: relative;
+
+  &::before {
+    border: 3px solid $primary-overlay-special;
+    border-radius: 50%;
+    content: " ";
+    display: block;
+    height: 6px;
+    left: 15px;
+    width: 6px;
+    position: absolute;
+    top: 4px;
+    z-index: 1;
+  }
+
+  &::after {
+    background: $primary-overlay-special;
+    content: " ";
+    height: 7px;
+    left: 24px;
+    position: absolute;
+    transform: rotate(-45deg);
+    top: 12px;
+    width: 3px;
+    z-index: 1;
+  }
+
+  .aa-dropdown-menu {
+    background: $secondary-bg;
+    border: 3px solid rgba($text, 0.25);
+    color: $text;
+    font-size: 14px;
+    left: auto !important;
+    line-height: 1.2em;
+    right: 0 !important;
+
+    .algolia-docsearch-suggestion--category-header {
+      background: $primary-overlay-special;
+      color: $primary-bg;
+
+      .algolia-docsearch-suggestion--highlight {
+        background-color: $primary-bg;
+        color: $primary-overlay;
+      }
+    }
+
+    .algolia-docsearch-suggestion--title .algolia-docsearch-suggestion--highlight,
+    .algolia-docsearch-suggestion--subcategory-column .algolia-docsearch-suggestion--highlight {
+        color: $primary-bg;
+    }
+
+    .algolia-docsearch-suggestion__secondary,
+    .algolia-docsearch-suggestion--subcategory-column {
+      border-color: rgba($text, 0.3);
+    }
+  }
+}
+
+input#search_input {
+  padding-left: 25px;
+  font-size: 14px;
+  line-height: 20px;
+  border-radius: 20px;
+  background-color: rgba($primary-overlay-special, 0.25);
+  border: none;
+  color: rgba($primary-overlay-special, 0);
+  outline: none;
+  position: relative;
+  transition: background-color .2s cubic-bezier(0.68, -0.55, 0.265, 1.55), width .2s cubic-bezier(0.68, -0.55, 0.265, 1.55), color .2s ease;
+  width: 60px;
+
+  &:focus, &:active {
+    background-color: $secondary-bg;
+    color: $text;
+    width: 240px;
+  }
+}
+
+.navigationSlider {
+  .navSearchWrapper {
+    &::before {
+      left: 6px;
+      top: 6px;
+    }
+
+    &::after {
+      left: 15px;
+      top: 14px;
+    }
+  }
+
+  input#search_input_react {
+    box-sizing: border-box;
+    padding-left: 25px;
+    font-size: 14px;
+    line-height: 20px;
+    border-radius: 20px;
+    background-color: rgba($primary-overlay-special, 0.25);
+    border: none;
+    color: $text;
+    outline: none;
+    position: relative;
+    transition: background-color .2s cubic-bezier(0.68, -0.55, 0.265, 1.55), width .2s cubic-bezier(0.68, -0.55, 0.265, 1.55), color .2s ease;
+    width: 100%;
+
+    &:focus, &:active {
+      background-color: $primary-bg;
+      color: $primary-overlay;
+    }
+  }
+
+  .algolia-docsearch-suggestion--subcategory-inline {
+    display: none;
+  }
+
+  & > span {
+    width: 100%;
+  }
+
+  .aa-dropdown-menu {
+    background: $secondary-bg;
+    border: 0px solid $secondary-bg;
+    color: $text;
+    font-size: 12px;
+    line-height: 2em;
+    max-height: 140px;
+    min-width: auto;
+    overflow-y: scroll;
+    -webkit-overflow-scrolling: touch;
+    padding: 0;
+    border-radius: 0;
+    position: relative !important;
+    width: 100%;
+  }
+}
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_sass/_slideshow.scss b/src/rocksdb/docs/_sass/_slideshow.scss
new file mode 100644
index 000000000..cd98a6cdb
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_slideshow.scss
@@ -0,0 +1,48 @@
+.slideshow {
+  position: relative;
+
+  .slide {
+    display: none;
+
+    img {
+      display: block;
+      margin: 0 auto;
+    }
+
+    &.slideActive {
+      display: block;
+    }
+
+    a {
+      border: none;
+      display: block;
+    }
+  }
+
+  .pagination {
+    display: block;
+    margin: -10px;
+    padding: 1em 0;
+    text-align: center;
+    width: 100%;
+
+    .pager {
+      background: transparent;
+      border: 2px solid rgba(255, 255, 255, 0.5);
+      border-radius: 50%;
+      cursor: pointer;
+      display: inline-block;
+      height: 12px;
+      margin: 10px;
+      transition: background-color 0.3s, border-color 0.3s;
+      width: 12px;
+
+      &.pagerActive {
+        background: rgba(255, 255, 255, 0.5);
+        border-width: 4px;
+        height: 8px;
+        width: 8px;
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/docs/_sass/_syntax-highlighting.scss b/src/rocksdb/docs/_sass/_syntax-highlighting.scss
new file mode 100644
index 000000000..e55c88a2e
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_syntax-highlighting.scss
@@ -0,0 +1,129 @@
+
+
+.rougeHighlight { background-color: $code-bg; color: #93a1a1 }
+.rougeHighlight .c { color: #586e75 } /* Comment */
+.rougeHighlight .err { color: #93a1a1 } /* Error */
+.rougeHighlight .g { color: #93a1a1 } /* Generic */
+.rougeHighlight .k { color: #859900 } /* Keyword */
+.rougeHighlight .l { color: #93a1a1 } /* Literal */
+.rougeHighlight .n { color: #93a1a1 } /* Name */
+.rougeHighlight .o { color: #859900 } /* Operator */
+.rougeHighlight .x { color: #cb4b16 } /* Other */
+.rougeHighlight .p { color: #93a1a1 } /* Punctuation */
+.rougeHighlight .cm { color: #586e75 } /* Comment.Multiline */
+.rougeHighlight .cp { color: #859900 } /* Comment.Preproc */
+.rougeHighlight .c1 { color: #72c02c; } /* Comment.Single */
+.rougeHighlight .cs { color: #859900 } /* Comment.Special */
+.rougeHighlight .gd { color: #2aa198 } /* Generic.Deleted */
+.rougeHighlight .ge { color: #93a1a1; font-style: italic } /* Generic.Emph */
+.rougeHighlight .gr { color: #dc322f } /* Generic.Error */
+.rougeHighlight .gh { color: #cb4b16 } /* Generic.Heading */
+.rougeHighlight .gi { color: #859900 } /* Generic.Inserted */
+.rougeHighlight .go { color: #93a1a1 } /* Generic.Output */
+.rougeHighlight .gp { color: #93a1a1 } /* Generic.Prompt */
+.rougeHighlight .gs { color: #93a1a1; font-weight: bold } /* Generic.Strong */
+.rougeHighlight .gu { color: #cb4b16 } /* Generic.Subheading */
+.rougeHighlight .gt { color: #93a1a1 } /* Generic.Traceback */
+.rougeHighlight .kc { color: #cb4b16 } /* Keyword.Constant */
+.rougeHighlight .kd { color: #268bd2 } /* Keyword.Declaration */
+.rougeHighlight .kn { color: #859900 } /* Keyword.Namespace */
+.rougeHighlight .kp { color: #859900 } /* Keyword.Pseudo */
+.rougeHighlight .kr { color: #268bd2 } /* Keyword.Reserved */
+.rougeHighlight .kt { color: #dc322f } /* Keyword.Type */
+.rougeHighlight .ld { color: #93a1a1 } /* Literal.Date */
+.rougeHighlight .m { color: #2aa198 } /* Literal.Number */
+.rougeHighlight .s { color: #2aa198 } /* Literal.String */
+.rougeHighlight .na { color: #93a1a1 } /* Name.Attribute */
+.rougeHighlight .nb { color: #B58900 } /* Name.Builtin */
+.rougeHighlight .nc { color: #268bd2 } /* Name.Class */
+.rougeHighlight .no { color: #cb4b16 } /* Name.Constant */
+.rougeHighlight .nd { color: #268bd2 } /* Name.Decorator */
+.rougeHighlight .ni { color: #cb4b16 } /* Name.Entity */
+.rougeHighlight .ne { color: #cb4b16 } /* Name.Exception */
+.rougeHighlight .nf { color: #268bd2 } /* Name.Function */
+.rougeHighlight .nl { color: #93a1a1 } /* Name.Label */
+.rougeHighlight .nn { color: #93a1a1 } /* Name.Namespace */
+.rougeHighlight .nx { color: #93a1a1 } /* Name.Other */
+.rougeHighlight .py { color: #93a1a1 } /* Name.Property */
+.rougeHighlight .nt { color: #268bd2 } /* Name.Tag */
+.rougeHighlight .nv { color: #268bd2 } /* Name.Variable */
+.rougeHighlight .ow { color: #859900 } /* Operator.Word */
+.rougeHighlight .w { color: #93a1a1 } /* Text.Whitespace */
+.rougeHighlight .mf { color: #2aa198 } /* Literal.Number.Float */
+.rougeHighlight .mh { color: #2aa198 } /* Literal.Number.Hex */
+.rougeHighlight .mi { color: #2aa198 } /* Literal.Number.Integer */
+.rougeHighlight .mo { color: #2aa198 } /* Literal.Number.Oct */
+.rougeHighlight .sb { color: #586e75 } /* Literal.String.Backtick */
+.rougeHighlight .sc { color: #2aa198 } /* Literal.String.Char */
+.rougeHighlight .sd { color: #93a1a1 } /* Literal.String.Doc */
+.rougeHighlight .s2 { color: #2aa198 } /* Literal.String.Double */
+.rougeHighlight .se { color: #cb4b16 } /* Literal.String.Escape */
+.rougeHighlight .sh { color: #93a1a1 } /* Literal.String.Heredoc */
+.rougeHighlight .si { color: #2aa198 } /* Literal.String.Interpol */
+.rougeHighlight .sx { color: #2aa198 } /* Literal.String.Other */
+.rougeHighlight .sr { color: #dc322f } /* Literal.String.Regex */
+.rougeHighlight .s1 { color: #2aa198 } /* Literal.String.Single */
+.rougeHighlight .ss { color: #2aa198 } /* Literal.String.Symbol */
+.rougeHighlight .bp { color: #268bd2 } /* Name.Builtin.Pseudo */
+.rougeHighlight .vc { color: #268bd2 } /* Name.Variable.Class */
+.rougeHighlight .vg { color: #268bd2 } /* Name.Variable.Global */
+.rougeHighlight .vi { color: #268bd2 } /* Name.Variable.Instance */
+.rougeHighlight .il { color: #2aa198 } /* Literal.Number.Integer.Long */
+
+.highlighter-rouge {
+  color: darken(#72c02c, 8%);
+  font: 800 12px/1.5em Hack, monospace;
+  max-width: 100%;
+
+  .rougeHighlight {
+    border-radius: 3px;
+    margin: 20px 0;
+    padding: 0px;
+    overflow-x: scroll;
+    -webkit-overflow-scrolling: touch;
+
+    table {
+      background: none;
+      border: none;
+
+      tbody {
+        tr {
+          background: none;
+          display: flex;
+          flex-flow: row nowrap;
+
+          td {
+            display: block;
+            flex: 1 1;
+
+            &.gutter {
+              border-right: 1px solid lighten($code-bg, 10%);
+              color: lighten($code-bg, 15%);
+              margin-right: 10px;
+              max-width: 40px;
+              padding-right: 10px;
+
+              pre {
+                max-width: 20px;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+p > .highlighter-rouge,
+li > .highlighter-rouge,
+a > .highlighter-rouge {
+  font-size: 16px;
+  font-weight: 400;
+  line-height: inherit;
+}
+
+a:hover {
+  .highlighter-rouge {
+    color: white;
+  }
+}
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_sass/_tables.scss b/src/rocksdb/docs/_sass/_tables.scss
new file mode 100644
index 000000000..f847c7013
--- /dev/null
+++ b/src/rocksdb/docs/_sass/_tables.scss
@@ -0,0 +1,47 @@
+table {
+  background: $lightergrey;
+  border: 1px solid $lightgrey;
+  border-collapse: collapse;
+  display:table;
+  margin: 20px 0;
+
+  thead {
+    border-bottom: 1px solid $lightgrey;
+    display: table-header-group;
+  }
+  tbody {
+    display: table-row-group;
+  }
+  tr {
+    display: table-row;
+    &:nth-of-type(odd) {
+      background: $greyish;
+    }
+
+    th, td {
+      border-right: 1px dotted $lightgrey;
+      display: table-cell;
+      font-size: 14px;
+      line-height: 1.3em;
+      padding: 10px;
+      text-align: left;
+
+      &:last-of-type {
+        border-right: 0;
+      }
+
+      code {
+        color: $green;
+        display: inline-block;
+        font-size: 12px;
+      }
+    }
+
+    th {
+      color: #000000;
+      font-weight: bold;
+      font-family: $header-font-family;
+      text-transform: uppercase;
+    }
+  }
+}
+\ No newline at end of file
diff --git a/src/rocksdb/docs/_top-level/support.md b/src/rocksdb/docs/_top-level/support.md
new file mode 100644
index 000000000..05c39befd
--- /dev/null
+++ b/src/rocksdb/docs/_top-level/support.md
@@ -0,0 +1,22 @@
+---
+layout: top-level
+title: Support
+id: support
+category: support
+---
+
+## Need help?
+
+Do not hesitate to ask questions if you are having trouble with RocksDB.
+
+### GitHub issues
+
+Use [GitHub issues](https://github.com/facebook/rocksdb/issues) to report bugs, issues and feature requests for the RocksDB codebase.
+
+### Facebook Group
+
+Use the [RocksDB Facebook group](https://www.facebook.com/groups/rocksdb.dev/) for general questions and discussion about RocksDB.
+
+### FAQ
+
+Check out a list of [commonly asked questions](https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ) about RocksDB.
diff --git a/src/rocksdb/docs/blog/all.html b/src/rocksdb/docs/blog/all.html
new file mode 100644
index 000000000..3be2d3bff
--- /dev/null
+++ b/src/rocksdb/docs/blog/all.html
@@ -0,0 +1,20 @@
+---
+id: all
+layout: blog
+category: blog
+---
+
+<div class="posts">
+  <div class="post">
+    <h1>All Posts</h1>
+    {% for post in site.posts %}
+      {% assign author = site.data.authors[post.author] %}
+      <p>
+        <strong>
+          <a href="{{ post.url }}">{{ post.title }}</a>
+        </strong>
+          on {{ post.date | date: "%B %e, %Y" }} by {{ author.display_name }}
+      </p>
+    {% endfor %}
+  </div>
+</div>
diff --git a/src/rocksdb/docs/blog/index.html b/src/rocksdb/docs/blog/index.html
new file mode 100644
index 000000000..9f6b25d03
--- /dev/null
+++ b/src/rocksdb/docs/blog/index.html
@@ -0,0 +1,12 @@
+---
+id: blog 
+title: Blog
+layout: blog 
+category: blog
+---
+
+<div class="posts">
+  {% for page in site.posts %}
+    {% include post.html truncate=true %}
+  {% endfor %}
+</div>
diff --git a/src/rocksdb/docs/css/main.scss b/src/rocksdb/docs/css/main.scss
new file mode 100644
index 000000000..88ab4e811
--- /dev/null
+++ b/src/rocksdb/docs/css/main.scss
@@ -0,0 +1,159 @@
+---
+# Only the main Sass file needs front matter (the dashes are enough)
+---
+@charset "utf-8";
+
+@font-face {
+	font-family: 'Lato';
+	src: url("{{ '/static/fonts/LatoLatin-Italic.woff2' }}") format('woff2'),
+		url("{{ '/static/fonts/LatoLatin-Italic.woff' }}") format('woff');
+	font-weight: normal;
+	font-style: italic;
+}
+
+@font-face {
+	font-family: 'Lato';
+	src: url("{{ '/static/fonts/LatoLatin-Black.woff2' }}") format('woff2'),
+		url("{{ '/static/fonts/LatoLatin-Black.woff' }}") format('woff');
+	font-weight: 900;
+	font-style: normal;
+}
+
+@font-face {
+	font-family: 'Lato';
+	src: url("{{ '/static/fonts/LatoLatin-BlackItalic.woff2' }}") format('woff2'),
+		url("{{ '/static/fonts/LatoLatin-BlackItalic.woff' }}") format('woff');
+	font-weight: 900;
+	font-style: italic;
+}
+
+@font-face {
+	font-family: 'Lato';
+	src: url("{{ '/static/fonts/LatoLatin-Light.woff2' }}") format('woff2'),
+		url("{{ '/static/fonts/LatoLatin-Light.woff' }}") format('woff');
+	font-weight: 300;
+	font-style: normal;
+}
+
+@font-face {
+	font-family: 'Lato';
+	src: url("{{ '/static/fonts/LatoLatin-Regular.woff2' }}") format('woff2'),
+		url("{{ '/static/fonts/LatoLatin-Regular.woff' }}") format('woff');
+	font-weight: normal;
+	font-style: normal;
+}
+
+// Our variables
+$base-font-family:  'Lato', Calibri, Arial, sans-serif;
+$header-font-family:  'Lato', 'Helvetica Neue', Arial, sans-serif;
+$base-font-size:    18px;
+$small-font-size:   $base-font-size * 0.875;
+$base-line-height:  1.4em;
+
+$spacing-unit:      12px;
+
+// Two configured colors (see _config.yml)
+$primary-bg:        				{{ site.color.primary }};
+$secondary-bg:      				{{ site.color.secondary }};
+
+// $primary-bg overlays
+{% if site.color.primary-overlay == 'light' %}
+$primary-overlay:       		darken($primary-bg, 70%);
+$primary-overlay-special:		darken($primary-bg, 40%);
+{% else %}
+$primary-overlay:      		 	#fff;
+$primary-overlay-special:		lighten($primary-bg, 30%);
+{% endif %}
+
+// $secondary-bg overlays
+{% if site.color.secondary-overlay == 'light' %}
+$text:              #393939;
+$sidenav: 				  darken($secondary-bg, 20%);
+$sidenav-text: 			$text;
+$sidenav-overlay: 	darken($sidenav, 10%);
+$sidenav-active: 		lighten($sidenav, 10%);
+{% else %}
+$text:              #fff;
+$sidenav: 				  lighten($secondary-bg, 20%);
+$sidenav-text: 			$text;
+$sidenav-overlay: 	lighten($sidenav, 10%);
+$sidenav-active: 		darken($sidenav, 10%);
+{% endif %}
+
+$code-bg: 					#002b36;
+
+$header-height: 34px;
+$header-ptop: 10px;
+$header-pbot: 8px;
+
+// Width of the content area
+$content-width:     900px;
+
+// Table setting variables
+$lightergrey: #F8F8F8;
+$greyish: #E8E8E8;
+$lightgrey: #B0B0B0;
+$green: #2db04b;
+
+// Using media queries with like this:
+// @include media-query($on-palm) {
+//     .wrapper {
+//         padding-right: $spacing-unit / 2;
+//         padding-left: $spacing-unit / 2;
+//     }
+// }
+@mixin media-query($device) {
+    @media screen and (max-width: $device) {
+        @content;
+    }
+}
+
+
+
+// Import partials from `sass_dir` (defaults to `_sass`)
+@import
+        "reset",
+        "base",
+				"header",
+        "search",
+        "syntax-highlighting",
+				"promo",
+				"buttons",
+				"gridBlock",
+				"poweredby",
+				"footer",
+				"react_header_nav",
+				"react_docs_nav",
+				"tables",
+				"blog"
+;
+
+// Anchor links
+// http://ben.balter.com/2014/03/13/pages-anchor-links/
+.header-link {
+  position: absolute;
+  margin-left: 0.2em;
+  opacity: 0;
+
+  -webkit-transition: opacity 0.2s ease-in-out 0.1s;
+  -moz-transition: opacity 0.2s ease-in-out 0.1s;
+  -ms-transition: opacity 0.2s ease-in-out 0.1s;
+}
+
+h2:hover .header-link,
+h3:hover .header-link,
+h4:hover .header-link,
+h5:hover .header-link,
+h6:hover .header-link {
+  opacity: 1;
+}
+
+/* Social Banner */
+.socialBanner {
+	font-weight: bold;
+	font-size: 20px;
+	padding: 20px;
+	max-width: 768px;
+	margin: 0 auto;
+	text-align: center;
+  }
diff --git a/src/rocksdb/docs/doc-type-examples/2016-04-07-blog-post-example.md b/src/rocksdb/docs/doc-type-examples/2016-04-07-blog-post-example.md
new file mode 100644
index 000000000..ef954d63a
--- /dev/null
+++ b/src/rocksdb/docs/doc-type-examples/2016-04-07-blog-post-example.md
@@ -0,0 +1,21 @@
+---
+title: Blog Post Example
+layout: post
+author: exampleauthor
+category: blog
+---
+
+Any local blog posts would go in the `_posts` directory.
+
+This is an example blog post introduction, try to keep it short and about a paragraph long, to encourage people to click through to read the entire post.
+
+<!--truncate-->
+
+Everything below the `<!--truncate-->` tag will only show on the actual blog post page, not on the `/blog/` index.
+
+Author is defined in `_data/authors.yml`
+
+
+## No posts?
+
+If you have no blog for your site, you can remove the entire `_posts` folder. Otherwise add markdown files in here. See CONTRIBUTING.md for details.
diff --git a/src/rocksdb/docs/doc-type-examples/docs-hello-world.md b/src/rocksdb/docs/doc-type-examples/docs-hello-world.md
new file mode 100644
index 000000000..c7094ba5a
--- /dev/null
+++ b/src/rocksdb/docs/doc-type-examples/docs-hello-world.md
@@ -0,0 +1,12 @@
+---
+docid: hello-world
+title: Hello, World!
+layout: docs
+permalink: /docs/hello-world.html
+---
+
+Any local docs would go in the `_docs` directory.
+
+## No documentation?
+
+If you have no documentation for your site, you can remove the entire `_docs` folder. Otherwise add markdown files in here. See CONTRIBUTING.md for details.
diff --git a/src/rocksdb/docs/doc-type-examples/top-level-example.md b/src/rocksdb/docs/doc-type-examples/top-level-example.md
new file mode 100644
index 000000000..67b1fa711
--- /dev/null
+++ b/src/rocksdb/docs/doc-type-examples/top-level-example.md
@@ -0,0 +1,8 @@
+---
+layout: top-level
+title: Support Example
+id: top-level-example
+category: top-level
+---
+
+This is a static page disconnected from the blog or docs collections that can be added at a top-level (i.e., the same level as `index.md`).
diff --git a/src/rocksdb/docs/docs/index.html b/src/rocksdb/docs/docs/index.html
new file mode 100644
index 000000000..fa6ec8b5a
--- /dev/null
+++ b/src/rocksdb/docs/docs/index.html
@@ -0,0 +1,6 @@
+---
+id: docs
+title: Docs
+layout: redirect
+destination: getting-started.html
+---
diff --git a/src/rocksdb/docs/feed.xml b/src/rocksdb/docs/feed.xml
new file mode 100644
index 000000000..725f00566
--- /dev/null
+++ b/src/rocksdb/docs/feed.xml
@@ -0,0 +1,30 @@
+---
+layout: null
+---
+<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
+  <channel>
+    <title>{{ site.title | xml_escape }}</title>
+    <description>{{ site.description | xml_escape }}</description>
+    <link>https://rocksdb.org/feed.xml</link>
+    <atom:link href="{{ '/feed.xml' | absolute_url }}" rel="self" type="application/rss+xml"/>
+    <pubDate>{{ site.time | date_to_rfc822 }}</pubDate>
+    <lastBuildDate>{{ site.time | date_to_rfc822 }}</lastBuildDate>
+    <generator>Jekyll v{{ jekyll.version }}</generator>
+    {% for post in site.posts limit:10 %}
+      <item>
+        <title>{{ post.title | xml_escape }}</title>
+        <description>{{ post.content | xml_escape }}</description>
+        <pubDate>{{ post.date | date_to_rfc822 }}</pubDate>
+        <link>{{ post.url | absolute_url }}</link>
+        <guid isPermaLink="true">{{ post.url | absolute_url }}</guid>
+        {% for tag in post.tags %}
+        <category>{{ tag | xml_escape }}</category>
+        {% endfor %}
+        {% for cat in post.categories %}
+        <category>{{ cat | xml_escape }}</category>
+        {% endfor %}
+      </item>
+    {% endfor %}
+  </channel>
+</rss>
diff --git a/src/rocksdb/docs/index.md b/src/rocksdb/docs/index.md
new file mode 100644
index 000000000..2b9570d23
--- /dev/null
+++ b/src/rocksdb/docs/index.md
@@ -0,0 +1,9 @@
+---
+layout: home
+title: RocksDB | A persistent key-value store
+id: home
+---
+
+## Features
+
+{% include content/gridblocks.html data_source=site.data.features align="center" %}
diff --git a/src/rocksdb/docs/static/favicon.png b/src/rocksdb/docs/static/favicon.png
new file mode 100644
index 000000000..7f668f38f
--- /dev/null
+++ b/src/rocksdb/docs/static/favicon.png
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-Black.woff b/src/rocksdb/docs/static/fonts/LatoLatin-Black.woff
new file mode 100644
index 000000000..d1e2579bf
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-Black.woff
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-Black.woff2 b/src/rocksdb/docs/static/fonts/LatoLatin-Black.woff2
new file mode 100644
index 000000000..4127b4d0b
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-Black.woff2
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff b/src/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff
new file mode 100644
index 000000000..142c1c9c4
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff2 b/src/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff2
new file mode 100644
index 000000000..e9862e690
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff2
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-Italic.woff b/src/rocksdb/docs/static/fonts/LatoLatin-Italic.woff
new file mode 100644
index 000000000..d8cf84c8b
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-Italic.woff
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-Italic.woff2 b/src/rocksdb/docs/static/fonts/LatoLatin-Italic.woff2
new file mode 100644
index 000000000..aaa5a35c3
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-Italic.woff2
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-Light.woff b/src/rocksdb/docs/static/fonts/LatoLatin-Light.woff
new file mode 100644
index 000000000..e7d4278cc
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-Light.woff
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-Light.woff2 b/src/rocksdb/docs/static/fonts/LatoLatin-Light.woff2
new file mode 100644
index 000000000..b6d028836
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-Light.woff2
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-Regular.woff b/src/rocksdb/docs/static/fonts/LatoLatin-Regular.woff
new file mode 100644
index 000000000..bf73a6d9f
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-Regular.woff
diff --git a/src/rocksdb/docs/static/fonts/LatoLatin-Regular.woff2 b/src/rocksdb/docs/static/fonts/LatoLatin-Regular.woff2
new file mode 100644
index 000000000..a4d084bfb
--- /dev/null
+++ b/src/rocksdb/docs/static/fonts/LatoLatin-Regular.woff2
diff --git a/src/rocksdb/docs/static/images/Resize-of-20140327_200754-300x225.jpg b/src/rocksdb/docs/static/images/Resize-of-20140327_200754-300x225.jpg
new file mode 100644
index 000000000..9f9315101
--- /dev/null
+++ b/src/rocksdb/docs/static/images/Resize-of-20140327_200754-300x225.jpg
diff --git a/src/rocksdb/docs/static/images/align-compaction-output/compaction_output_file_size_compare.png b/src/rocksdb/docs/static/images/align-compaction-output/compaction_output_file_size_compare.png
new file mode 100644
index 000000000..2ce86fb28
--- /dev/null
+++ b/src/rocksdb/docs/static/images/align-compaction-output/compaction_output_file_size_compare.png
diff --git a/src/rocksdb/docs/static/images/align-compaction-output/file_cut_align.png b/src/rocksdb/docs/static/images/align-compaction-output/file_cut_align.png
new file mode 100644
index 000000000..bc3e8990e
--- /dev/null
+++ b/src/rocksdb/docs/static/images/align-compaction-output/file_cut_align.png
diff --git a/src/rocksdb/docs/static/images/align-compaction-output/file_cut_normal.png b/src/rocksdb/docs/static/images/align-compaction-output/file_cut_normal.png
new file mode 100644
index 000000000..e17133ed2
--- /dev/null
+++ b/src/rocksdb/docs/static/images/align-compaction-output/file_cut_normal.png
diff --git a/src/rocksdb/docs/static/images/align-compaction-output/file_cut_trival_move.png b/src/rocksdb/docs/static/images/align-compaction-output/file_cut_trival_move.png
new file mode 100644
index 000000000..7aca9aeb5
--- /dev/null
+++ b/src/rocksdb/docs/static/images/align-compaction-output/file_cut_trival_move.png
diff --git a/src/rocksdb/docs/static/images/align-compaction-output/file_size_compare.png b/src/rocksdb/docs/static/images/align-compaction-output/file_size_compare.png
new file mode 100644
index 000000000..5f39a806f
--- /dev/null
+++ b/src/rocksdb/docs/static/images/align-compaction-output/file_size_compare.png
diff --git a/src/rocksdb/docs/static/images/align-compaction-output/write_amp_compare.png b/src/rocksdb/docs/static/images/align-compaction-output/write_amp_compare.png
new file mode 100644
index 000000000..8b20f2ae3
--- /dev/null
+++ b/src/rocksdb/docs/static/images/align-compaction-output/write_amp_compare.png
diff --git a/src/rocksdb/docs/static/images/asynchronous-io/mget_async.png b/src/rocksdb/docs/static/images/asynchronous-io/mget_async.png
new file mode 100644
index 000000000..79d1a851f
--- /dev/null
+++ b/src/rocksdb/docs/static/images/asynchronous-io/mget_async.png
diff --git a/src/rocksdb/docs/static/images/asynchronous-io/scan_async.png b/src/rocksdb/docs/static/images/asynchronous-io/scan_async.png
new file mode 100644
index 000000000..ee84189f4
--- /dev/null
+++ b/src/rocksdb/docs/static/images/asynchronous-io/scan_async.png
diff --git a/src/rocksdb/docs/static/images/binaryseek.png b/src/rocksdb/docs/static/images/binaryseek.png
new file mode 100644
index 000000000..0e213f048
--- /dev/null
+++ b/src/rocksdb/docs/static/images/binaryseek.png
diff --git a/src/rocksdb/docs/static/images/bloom_fp_vs_bpk.png b/src/rocksdb/docs/static/images/bloom_fp_vs_bpk.png
new file mode 100644
index 000000000..e83f4d085
--- /dev/null
+++ b/src/rocksdb/docs/static/images/bloom_fp_vs_bpk.png
diff --git a/src/rocksdb/docs/static/images/compaction/full-range.png b/src/rocksdb/docs/static/images/compaction/full-range.png
new file mode 100644
index 000000000..5b2c9fc61
--- /dev/null
+++ b/src/rocksdb/docs/static/images/compaction/full-range.png
diff --git a/src/rocksdb/docs/static/images/compaction/l0-l1-contend.png b/src/rocksdb/docs/static/images/compaction/l0-l1-contend.png
new file mode 100644
index 000000000..bcf8ec73a
--- /dev/null
+++ b/src/rocksdb/docs/static/images/compaction/l0-l1-contend.png
diff --git a/src/rocksdb/docs/static/images/compaction/l1-l2-contend.png b/src/rocksdb/docs/static/images/compaction/l1-l2-contend.png
new file mode 100644
index 000000000..6dafbbbf2
--- /dev/null
+++ b/src/rocksdb/docs/static/images/compaction/l1-l2-contend.png
diff --git a/src/rocksdb/docs/static/images/compaction/part-range-old.png b/src/rocksdb/docs/static/images/compaction/part-range-old.png
new file mode 100644
index 000000000..1cc723d13
--- /dev/null
+++ b/src/rocksdb/docs/static/images/compaction/part-range-old.png
diff --git a/src/rocksdb/docs/static/images/data-block-hash-index/block-format-binary-seek.png b/src/rocksdb/docs/static/images/data-block-hash-index/block-format-binary-seek.png
new file mode 100644
index 000000000..0e213f048
--- /dev/null
+++ b/src/rocksdb/docs/static/images/data-block-hash-index/block-format-binary-seek.png
diff --git a/src/rocksdb/docs/static/images/data-block-hash-index/block-format-hash-index.png b/src/rocksdb/docs/static/images/data-block-hash-index/block-format-hash-index.png
new file mode 100644
index 000000000..accb8639e
--- /dev/null
+++ b/src/rocksdb/docs/static/images/data-block-hash-index/block-format-hash-index.png
diff --git a/src/rocksdb/docs/static/images/data-block-hash-index/hash-index-data-structure.png b/src/rocksdb/docs/static/images/data-block-hash-index/hash-index-data-structure.png
new file mode 100644
index 000000000..9acc71d8e
--- /dev/null
+++ b/src/rocksdb/docs/static/images/data-block-hash-index/hash-index-data-structure.png
diff --git a/src/rocksdb/docs/static/images/data-block-hash-index/perf-cache-miss.png b/src/rocksdb/docs/static/images/data-block-hash-index/perf-cache-miss.png
new file mode 100644
index 000000000..71788735d
--- /dev/null
+++ b/src/rocksdb/docs/static/images/data-block-hash-index/perf-cache-miss.png
diff --git a/src/rocksdb/docs/static/images/data-block-hash-index/perf-throughput.png b/src/rocksdb/docs/static/images/data-block-hash-index/perf-throughput.png
new file mode 100644
index 000000000..54948af2f
--- /dev/null
+++ b/src/rocksdb/docs/static/images/data-block-hash-index/perf-throughput.png
diff --git a/src/rocksdb/docs/static/images/delrange/delrange_collapsed.png b/src/rocksdb/docs/static/images/delrange/delrange_collapsed.png
new file mode 100644
index 000000000..52246c2c1
--- /dev/null
+++ b/src/rocksdb/docs/static/images/delrange/delrange_collapsed.png
diff --git a/src/rocksdb/docs/static/images/delrange/delrange_key_schema.png b/src/rocksdb/docs/static/images/delrange/delrange_key_schema.png
new file mode 100644
index 000000000..0a14d4a3a
--- /dev/null
+++ b/src/rocksdb/docs/static/images/delrange/delrange_key_schema.png
diff --git a/src/rocksdb/docs/static/images/delrange/delrange_sst_blocks.png b/src/rocksdb/docs/static/images/delrange/delrange_sst_blocks.png
new file mode 100644
index 000000000..6003e42ae
--- /dev/null
+++ b/src/rocksdb/docs/static/images/delrange/delrange_sst_blocks.png
diff --git a/src/rocksdb/docs/static/images/delrange/delrange_uncollapsed.png b/src/rocksdb/docs/static/images/delrange/delrange_uncollapsed.png
new file mode 100644
index 000000000..39c7097af
--- /dev/null
+++ b/src/rocksdb/docs/static/images/delrange/delrange_uncollapsed.png
diff --git a/src/rocksdb/docs/static/images/delrange/delrange_write_path.png b/src/rocksdb/docs/static/images/delrange/delrange_write_path.png
new file mode 100644
index 000000000..229dfb349
--- /dev/null
+++ b/src/rocksdb/docs/static/images/delrange/delrange_write_path.png
diff --git a/src/rocksdb/docs/static/images/dictcmp/dictcmp_raw_sampled.png b/src/rocksdb/docs/static/images/dictcmp/dictcmp_raw_sampled.png
new file mode 100644
index 000000000..2eb6463c2
--- /dev/null
+++ b/src/rocksdb/docs/static/images/dictcmp/dictcmp_raw_sampled.png
diff --git a/src/rocksdb/docs/static/images/dictcmp/dictcmp_sst_blocks.png b/src/rocksdb/docs/static/images/dictcmp/dictcmp_sst_blocks.png
new file mode 100644
index 000000000..551860b2e
--- /dev/null
+++ b/src/rocksdb/docs/static/images/dictcmp/dictcmp_sst_blocks.png
diff --git a/src/rocksdb/docs/static/images/dictcmp/dictcmp_zstd_trained.png b/src/rocksdb/docs/static/images/dictcmp/dictcmp_zstd_trained.png
new file mode 100644
index 000000000..966c7fe0f
--- /dev/null
+++ b/src/rocksdb/docs/static/images/dictcmp/dictcmp_zstd_trained.png
diff --git a/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png b/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png
new file mode 100644
index 000000000..7215390cb
--- /dev/null
+++ b/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png
diff --git a/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png b/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png
new file mode 100644
index 000000000..f412ee60f
--- /dev/null
+++ b/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png
diff --git a/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png b/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png
new file mode 100644
index 000000000..19f40b035
--- /dev/null
+++ b/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png
diff --git a/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png b/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png
new file mode 100644
index 000000000..a1d43da0c
--- /dev/null
+++ b/src/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/Memtable-entry.png b/src/rocksdb/docs/static/images/kv-checksum/Memtable-entry.png
new file mode 100644
index 000000000..31eb7278a
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/Memtable-entry.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/Memtable-write.png b/src/rocksdb/docs/static/images/kv-checksum/Memtable-write.png
new file mode 100644
index 000000000..32f526fdf
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/Memtable-write.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/ProtInfo-Memtable.png b/src/rocksdb/docs/static/images/kv-checksum/ProtInfo-Memtable.png
new file mode 100644
index 000000000..c2e21bb15
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/ProtInfo-Memtable.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/ProtInfo-Writebatch-to-Memtable.png b/src/rocksdb/docs/static/images/kv-checksum/ProtInfo-Writebatch-to-Memtable.png
new file mode 100644
index 000000000..91ad93b2b
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/ProtInfo-Writebatch-to-Memtable.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/ProtInfo-Writebatch.png b/src/rocksdb/docs/static/images/kv-checksum/ProtInfo-Writebatch.png
new file mode 100644
index 000000000..b3cd5315b
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/ProtInfo-Writebatch.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/WAL-fragment.png b/src/rocksdb/docs/static/images/kv-checksum/WAL-fragment.png
new file mode 100644
index 000000000..9bbacca0d
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/WAL-fragment.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/WAL-read.png b/src/rocksdb/docs/static/images/kv-checksum/WAL-read.png
new file mode 100644
index 000000000..e130733d3
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/WAL-read.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/WAL-write.png b/src/rocksdb/docs/static/images/kv-checksum/WAL-write.png
new file mode 100644
index 000000000..fb9fd8fd5
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/WAL-write.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/Write-batch.png b/src/rocksdb/docs/static/images/kv-checksum/Write-batch.png
new file mode 100644
index 000000000..121d42555
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/Write-batch.png
diff --git a/src/rocksdb/docs/static/images/kv-checksum/Writebatch-write.png b/src/rocksdb/docs/static/images/kv-checksum/Writebatch-write.png
new file mode 100644
index 000000000..b10ab35ef
--- /dev/null
+++ b/src/rocksdb/docs/static/images/kv-checksum/Writebatch-write.png
diff --git a/src/rocksdb/docs/static/images/lost-buffered-write-recovery/angry-cat.png b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/angry-cat.png
new file mode 100644
index 000000000..e956fb6e0
--- /dev/null
+++ b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/angry-cat.png
diff --git a/src/rocksdb/docs/static/images/lost-buffered-write-recovery/basic-setup.png b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/basic-setup.png
new file mode 100644
index 000000000..f79831a29
--- /dev/null
+++ b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/basic-setup.png
diff --git a/src/rocksdb/docs/static/images/lost-buffered-write-recovery/happy-cat.png b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/happy-cat.png
new file mode 100644
index 000000000..155b5341d
--- /dev/null
+++ b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/happy-cat.png
diff --git a/src/rocksdb/docs/static/images/lost-buffered-write-recovery/replay-extension.png b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/replay-extension.png
new file mode 100644
index 000000000..5bedd949f
--- /dev/null
+++ b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/replay-extension.png
diff --git a/src/rocksdb/docs/static/images/lost-buffered-write-recovery/test-fs-writable-file.png b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/test-fs-writable-file.png
new file mode 100644
index 000000000..58db8e2a8
--- /dev/null
+++ b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/test-fs-writable-file.png
diff --git a/src/rocksdb/docs/static/images/lost-buffered-write-recovery/trace-extension.png b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/trace-extension.png
new file mode 100644
index 000000000..f782955b6
--- /dev/null
+++ b/src/rocksdb/docs/static/images/lost-buffered-write-recovery/trace-extension.png
diff --git a/src/rocksdb/docs/static/images/pcache-blockindex.jpg b/src/rocksdb/docs/static/images/pcache-blockindex.jpg
new file mode 100644
index 000000000..9c18bde93
--- /dev/null
+++ b/src/rocksdb/docs/static/images/pcache-blockindex.jpg
diff --git a/src/rocksdb/docs/static/images/pcache-fileindex.jpg b/src/rocksdb/docs/static/images/pcache-fileindex.jpg
new file mode 100644
index 000000000..51f4e095c
--- /dev/null
+++ b/src/rocksdb/docs/static/images/pcache-fileindex.jpg
diff --git a/src/rocksdb/docs/static/images/pcache-filelayout.jpg b/src/rocksdb/docs/static/images/pcache-filelayout.jpg
new file mode 100644
index 000000000..771ee60c1
--- /dev/null
+++ b/src/rocksdb/docs/static/images/pcache-filelayout.jpg
diff --git a/src/rocksdb/docs/static/images/pcache-readiopath.jpg b/src/rocksdb/docs/static/images/pcache-readiopath.jpg
new file mode 100644
index 000000000..4993f0072
--- /dev/null
+++ b/src/rocksdb/docs/static/images/pcache-readiopath.jpg
diff --git a/src/rocksdb/docs/static/images/pcache-tieredstorage.jpg b/src/rocksdb/docs/static/images/pcache-tieredstorage.jpg
new file mode 100644
index 000000000..c362a2d69
--- /dev/null
+++ b/src/rocksdb/docs/static/images/pcache-tieredstorage.jpg
diff --git a/src/rocksdb/docs/static/images/pcache-writeiopath.jpg b/src/rocksdb/docs/static/images/pcache-writeiopath.jpg
new file mode 100644
index 000000000..561b55181
--- /dev/null
+++ b/src/rocksdb/docs/static/images/pcache-writeiopath.jpg
diff --git a/src/rocksdb/docs/static/images/promo-adapt.svg b/src/rocksdb/docs/static/images/promo-adapt.svg
new file mode 100644
index 000000000..7cd44434d
--- /dev/null
+++ b/src/rocksdb/docs/static/images/promo-adapt.svg
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg id="svg4136" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://www.w3.org/2000/svg" height="256px" width="256px" version="1.1" xmlns:cc="http://creativecommons.org/ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" viewBox="0 0 2691.7869 1102.369">
+ <g id="layer1" transform="translate(2299.2 62.602)">
+  <path id="path4167" style="color-rendering:auto;text-decoration-color:#000000;color:#000000;isolation:auto;mix-blend-mode:normal;shape-rendering:auto;solid-color:#000000;block-progression:tb;text-decoration-line:none;text-decoration-style:solid;image-rendering:auto;white-space:normal;text-indent:0;text-transform:none" d="m392.59 123.17c0.0121-1.9688-0.10333-3.9361-0.34718-5.8898-0.15395-1.3644-0.36978-2.7214-0.64725-4.0662-0.12442-0.56729-0.25999-1.1325-0.40601-1.6946-0.17896-0.75016-0.37707-1.4955-0.59429-2.2357-0.34263-1.1286-0.72934-2.2438-1.1592-3.3423-0.0693-0.18679-0.14009-0.37234-0.21179-0.5591-4.65-11.643-13.94-20.829-25.63-25.358l-235.94-136.22c-21.65-12.935-49.703-5.641-62.314 16.201-12.61 21.841-4.902 49.778 17.123 62.066l95.659 55.229-590.22 0.00006c-278.88 0.00003-504.81 225.93-504.81 504.81l-0.68255 171.28 91.222 0.15305-0.15877-171.43c-0.00003-230.38 184.06-414.43 414.43-414.43l590.22 0.00003-95.659 55.229c-22.025 12.288-29.733 40.224-17.123 62.066s40.659 29.136 62.313 16.205l236.18-136.36c12.515-4.9182 22.19-15.154 26.396-27.926 0.17775-0.55981 0.34438-1.1222 0.5001-1.6886 0.22565-0.75133 0.43169-1.5088 0.61787-2.2712 0.26151-1.1286 0.47932-2.2671 0.65315-3.4128 0.0346-0.20366 0.0681-0.40725 0.10003-0.61218 0.28861-1.8998 0.45557-3.8159 0.50016-5.737zm-1219 630.38-493.8-0.00012c-199.29-0.00006-363.89-137.75-404.69-324.05l283.68-0.00003-95.659 55.228c-22.023 12.288-29.728 40.224-17.119 62.064 12.61 21.841 40.656 29.135 62.309 16.207l235.94-136.22c11.696-4.5293 20.985-13.716 25.642-25.361 0.072-0.18618 0.1416-0.37291 0.2109-0.56005 0.4299-1.0986 0.8167-2.2135 1.1593-3.3423 0.2169-0.74012 0.4154-1.4857 0.5944-2.236 0.1446-0.56217 0.2801-1.1271 0.4061-1.6945 0.2771-1.3448 0.4934-2.7015 0.6471-4.066 0.244-1.9537 0.36-3.9213 0.3474-5.8901-0.045-1.9211-0.2109-3.8374-0.5005-5.7372-0.033-0.20396-0.066-0.40801-0.099-0.61167-0.1747-1.1456-0.3919-2.2841-0.6534-3.413-0.1868-0.76233-0.3923-1.5197-0.6176-2.2713-0.1567-0.56618-0.3227-1.1292-0.5004-1.6889-4.206-12.772-13.881-23.008-26.396-27.926l-236.18-136.36c-21.654-12.936-49.706-5.6434-62.318 16.201s-4.9022 49.785 17.128 62.07l95.659 55.229-293.42-0.00006h-90.38l-428.37 0.00009c-25.208-0.35649-45.834 19.98-45.834 45.19s20.626 45.547 45.834 45.19l436.44-0.00015c42.578 235.73 248.71 414.43 496.74 414.43l910.97-0.00009-95.659 55.229c-22.025 12.287-29.727 40.224-17.117 62.066s40.654 29.136 62.307 16.205l236.16-136.23c11.697-4.529 20.985-13.715 25.643-25.361 0.0717-0.18589 0.1425-0.37219 0.21179-0.55904 0.42982-1.0985 0.81653-2.2134 1.1592-3.3422 0.21722-0.74013 0.41533-1.4857 0.59431-2.236 0.14612-0.56229 0.28169-1.1272 0.40602-1.6946 0.27747-1.3448 0.49336-2.7015 0.64725-4.066 0.24373-1.9537 0.35986-3.9212 0.34715-5.89-0.0446-1.921-0.21149-3.8373-0.50013-5.737-0.0319-0.20396-0.0654-0.40828-0.10003-0.61194-0.17383-1.1456-0.39174-2.284-0.65315-3.4128-0.18618-0.76233-0.39219-1.5197-0.61781-2.2713-0.15575-0.56609-0.32248-1.1291-0.50016-1.6887-4.2062-12.772-13.881-23.008-26.396-27.926l-236.18-136.36c-21.654-12.931-49.697-5.6367-62.307 16.205s-4.908 49.779 17.117 62.066l95.659 55.229h-417.17z" fill-rule="evenodd"/>
+  <path id="path6042" d="m-1139.3 565.22v-376.9" stroke="#000" stroke-linecap="round" stroke-width="90.381" fill="none"/>
+  <path id="path6044" d="m-111.15 993.93v-376.9" stroke="#000" stroke-linecap="round" stroke-width="90.381" fill="none"/>
+ </g>
+</svg>
diff --git a/src/rocksdb/docs/static/images/promo-flash.svg b/src/rocksdb/docs/static/images/promo-flash.svg
new file mode 100644
index 000000000..79810c30a
--- /dev/null
+++ b/src/rocksdb/docs/static/images/promo-flash.svg
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 15.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" [
+	<!ENTITY ns_extend "http://ns.adobe.com/Extensibility/1.0/">
+	<!ENTITY ns_ai "http://ns.adobe.com/AdobeIllustrator/10.0/">
+	<!ENTITY ns_graphs "http://ns.adobe.com/Graphs/1.0/">
+	<!ENTITY ns_vars "http://ns.adobe.com/Variables/1.0/">
+	<!ENTITY ns_imrep "http://ns.adobe.com/ImageReplacement/1.0/">
+	<!ENTITY ns_sfw "http://ns.adobe.com/SaveForWeb/1.0/">
+	<!ENTITY ns_custom "http://ns.adobe.com/GenericCustomNamespace/1.0/">
+	<!ENTITY ns_adobe_xpath "http://ns.adobe.com/XPath/1.0/">
+]>
+<svg version="1.1" id="Ebene_1" xmlns:x="&ns_extend;" xmlns:i="&ns_ai;" xmlns:graph="&ns_graphs;"
+	 xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" width="256px"
+	 height="256px" viewBox="0 0 815.261 584.491" enable-background="new 0 0 815.261 584.491" xml:space="preserve">
+<switch>
+	<foreignObject requiredExtensions="&ns_ai;" x="0" y="0" width="1" height="1">
+		<i:pgfRef  xlink:href="#adobe_illustrator_pgf">
+		</i:pgfRef>
+	</foreignObject>
+	<g i:extraneous="self">
+		<path fill-rule="evenodd" clip-rule="evenodd" d="M571.106,229.357l86.136,62.324l-500.458,292.81l15.751-116.254L0,392.661
+			l346.5-136.874l-79.521-80.771l332.31-123.92L538.864,0h276.397l-73.986,207.161l-59.189-36.993L571.106,229.357z M630.295,51.79
+			L312.155,170.168l96.182,81.385L60.602,384.729l133.175,51.79l-14.797,96.183l421.722-251.554l-81.385-66.588l170.168-96.182
+			l44.392,22.195l66.587-125.776H593.302L630.295,51.79z"/>
+	</g>
+</switch>
+</svg>
diff --git a/src/rocksdb/docs/static/images/promo-operations.svg b/src/rocksdb/docs/static/images/promo-operations.svg
new file mode 100644
index 000000000..3036294ab
--- /dev/null
+++ b/src/rocksdb/docs/static/images/promo-operations.svg
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg height="256px" width="256px" viewBox="0 0 154.875 154.91782" xmlns="http://www.w3.org/2000/svg">
+<g transform="translate(-263.3125,-563.76968)">
+<path d="m336.77535,565.51158c-1.59726.10936-3.15002.35671-4.71936.53629h-.1063l-3.75379,20.48605c-6.11896,1.39345-11.87541,3.75824-17.05404,6.97168l-16.83926-12.12002c-4.55215,3.53414-8.69458,7.65592-12.3345,12.12002l11.69091,17.05386c-3.54965,5.42465-6.21914,11.6188-7.72253,18.12639-.00018.031-.00018.10198 0,.10702l-20.37883,3.2177c-.3725,3.04312-.53624,6.18809-.53624,9.33134 0,2.57176.071,5.10917.32165,7.61524l20.37883,3.64673c1.44933,7.07687 4.20261,13.68602 8.04414,19.52075l-12.11991,16.6248c3.4711,4.30922 7.47839,8.23258 11.79812,11.69099l17.16106-11.79826c5.9977,3.82597 12.69269,6.50875 19.94983,7.82975l3.21758,20.27155c2.28662.20798 4.63161.2145 6.97192.2145 3.30389,0 6.46004-.12522 9.65312-.53628l3.86135-20.70056c6.89012-1.71472 13.36295-4.68941 18.98427-8.68781l16.51747,12.01276c4.28351-3.64433 8.20054-7.83321 11.5837-12.33452l-12.0127-17.37561c3.25344-5.61849 5.50726-11.8176 6.64976-18.44817l20.2718-3.21771c.17838-2.11543.21297-4.16701.21297-6.32815 0-3.75557-.43675-7.43787-.96556-11.04745l-20.59342-3.75397c-1.61384-5.95909-4.26171-11.51888-7.61497-16.51756l12.11974-16.6248c-3.75686-4.59442-8.04235-8.83858-12.76333-12.4418l-17.48303,12.01278c-5.02475-2.97177-10.43184-5.25192-16.30306-6.54268l-3.21759-20.37879c-2.92858-.34452-5.88149-.53628-8.90214-.53628-.81656,0-1.65672-.024-2.46715,0-.39495.0126-.78593-.024-1.17962,0-.1063.007-.21621-.007-.32269,0zm2.78876,52.1268c.39207-.0213.78323,0 1.17998,0 12.69611,0 23.06003,10.36401 23.06003,23.06023s-10.36392,22.95297-23.06003,22.95297-22.95301-10.25675-22.95301-22.95297c0-12.29946 9.6261-22.44383 21.77303-23.06023z" fill="#333" stroke="#333" stroke-width="3.43222"/>
+</g>
+</svg>
diff --git a/src/rocksdb/docs/static/images/promo-performance.svg b/src/rocksdb/docs/static/images/promo-performance.svg
new file mode 100644
index 000000000..be8a10120
--- /dev/null
+++ b/src/rocksdb/docs/static/images/promo-performance.svg
@@ -0,0 +1,134 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg viewBox="0 0 64 64" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<defs>
+<linearGradient id="linearGradient4121">
+<stop offset="0" stop-color="#fff"/>
+<stop offset="1" stop-color="#fff" stop-opacity="0"/>
+</linearGradient>
+<linearGradient gradientTransform="matrix(.30081,0,0,.20986,-1650.25,-38.87783)" gradientUnits="userSpaceOnUse" id="linearGradient4107" x1="-342.85715" x2="-376.17944" xlink:href="#linearGradient4121" y1="-397.01691" y2="-915.50836"/>
+<linearGradient gradientUnits="userSpaceOnUse" id="linearGradient15735" x1="797.06111" x2="788.0298" xlink:href="#linearGradient4121" y1="-667.70464" y2="-819.35937"/>
+</defs>
+<title>netalloy chequered flag</title>
+<g transform="matrix(.08343,-.03199,0,.08343,-73.61005,-8.84057)">
+<path d="m1688.925,115.67413h30.08058v20.9864h-30.08058z" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m1719.6503,115.67413h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m1658.8445,136.51062h30.08058v20.9864h-30.08058z" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m1689.5697,136.51062h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m136.5856-1749.1506h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m158.0217-1749.1506h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m156.97238-1718.8552h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m178.40849-1718.8552h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m1718.5759,177.7339h30.08058v20.9864h-30.08058z" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m1719.2205,198.5704h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m219.03217-1748.5061h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m1748.2268,157.64693h30.08058v20.9864h-30.08058z" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m1748.8713,178.48341h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m198.94519-1778.157h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m220.3813-1778.157h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m1777.8777,137.26013h30.08058v20.9864h-30.08058z" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m1778.5222,158.09662h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m177.95879-1808.6672h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m199.99451-1807.8077h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m1808.8176,157.34711h30.08058v20.9864h-30.08058z" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m1809.4622,178.18361h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m199.24498-1838.7478h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m220.08148-1838.7478h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m1657.9851,178.93314h30.08058v20.9864h-30.08058z" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m1658.6296,199.76964h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m220.2314-1687.9152h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m1688.0657,199.31993h30.08058v20.9864h-30.08058z" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m1688.7102,220.15643h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m220.00832-1868.8394h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m221.05762-1837.2548h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m-1807.9261-241.48938h30.08058v20.9864h-30.08058z" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m-1777.201-241.48938h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m115.67591-1837.6415h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m116.12562-1807.7758h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m-1778.4473-136.55739h30.08058v20.9864h-30.08058z" fill="url(#linearGradient4107)" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m-1776.9432-156.36673h27.94272v19.09188h-27.94272z" fill="#fff" opacity=".60096" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m-1869.0726-199.46429h30.08058v20.9864h-30.08058z" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m-1868.4281-178.62779h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m-158.16603,1839.1426h20.9864v30.08058h-20.9864z" transform="matrix(.59562,-.80326,.87909,.47665,0,0)"/>
+<path d="m-136.72992,1838.5409h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(.59562,-.80326,.87909,.47665,0,0)"/>
+<path d="m116.27553-1899.0918h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m-1899.7549-220.4507h30.08058v20.9864h-30.08058z" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m-1899.1104-199.6142h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m-179.15244,1869.8247h20.9864v30.08058h-20.9864z" transform="matrix(.59562,-.80326,.87909,.47665,0,0)"/>
+<path d="m-157.71632,1869.2231h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(.59562,-.80326,.87909,.47665,0,0)"/>
+<path d="m239.53888-1718.5974h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m1719.3923,239.46391h30.08058v20.9864h-30.08058z" fill="#fff" opacity=".60096" transform="matrix(.87909,.47665,-.59562,.80326,0,0)"/>
+<path d="m239.91541-1897.8026h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m240.36511-1867.9369h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m-1838.6084-260.79688h30.08058v20.9864h-30.08058z" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m240.21521-1778.3398h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m115.63581-1776.621h20.9864v30.08058h-20.9864z" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m-1837.9637-155.30208h27.82166v17.65242h-27.82166z" fill="#fff" opacity=".60096" transform="matrix(-.87909,-.47665,.59562,-.80326,0,0)"/>
+<path d="m1389.2368,874.53916-207.8251,263.56844" fill="none" stroke="#000" stroke-linecap="round" stroke-width="14.84459"/>
+<path d="m199.22183-1868.7964h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m156.79575-1690.0319h21.6532v29.62879h-21.6532z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m221.44484-1899.1632h18.00162v29.54019h-18.00162z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m241.58183-1687.8284h18.77428v29.87089h-18.77428z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m240.00063-1806.0569h20.9864v30.08058h-20.9864z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m114.93913-1688.511h21.01802v24.93378h-21.01802z" fill="#fff" opacity=".60096" transform="matrix(-.59562,.80326,-.87909,-.47665,0,0)"/>
+<path d="m-855.13867,658.61304h27.1029v29.24923h-27.1029z" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m-827.45502,658.61304h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m-882.24158,687.65332h27.1029v29.24923h-27.1029z" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m-854.55792,687.65332h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m687.75781,800.87488h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m717.63379,800.87488h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m716.17133,828.17133h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m746.0473,828.17133h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m-828.42303,745.10712h27.1029v29.24923h-27.1029z" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m-827.84222,774.14746h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m802.66547,801.45563h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m-801.70728,717.11145h27.1029v29.24923h-27.1029z" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m-801.12653,746.15173h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m774.66974,774.73993h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m804.54578,774.73993h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m-774.99152,688.69794h27.1029v29.24923h-27.1029z" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m-774.41077,717.73822h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m745.42053,747.24982h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m776.1322,748.02417h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m-747.11432,716.6936h27.1029v29.24923h-27.1029z" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m-746.53357,745.73389h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m775.08759,720.14691h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m804.12793,720.14691h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m-883.01599,746.7785h27.1029v29.24923h-27.1029z" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m-882.43518,775.81885h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m804.33685,856.04858h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m-855.91309,775.19208h27.1029v29.24923h-27.1029z" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m-855.33228,804.23236h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m804.02594,693.03412h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m805.48834,721.49219h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m747.91748-833.9646h27.1029v29.24923h-27.1029z" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m775.6012-833.9646h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m658.61548,721.14368h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m659.24225,748.05304h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m774.47833-687.71844h27.1029v29.24923h-27.1029z" fill="url(#linearGradient15735)" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m775.83344-715.32721h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m692.82385-775.39325h27.1029v29.24923h-27.1029z" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m693.40466-746.35297h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m-717.83496-719.79126h29.24923v27.1029h-29.24923z" transform="matrix(-.42736,-.90408,-.97568,-.21922,0,0)"/>
+<path d="m-687.95892-720.33331h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(-.42736,-.90408,-.97568,-.21922,0,0)"/>
+<path d="m659.45117,665.77631h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m665.17896-804.64252h27.1029v29.24923h-27.1029z" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m665.7597-775.60217h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m-747.08417-692.1463h29.24923v27.1029h-29.24923z" transform="matrix(-.42736,-.90408,-.97568,-.21922,0,0)"/>
+<path d="m-717.20813-692.68835h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(-.42736,-.90408,-.97568,-.21922,0,0)"/>
+<path d="m831.24615,828.40363h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m-827.68732,831.14166h27.1029v29.24923h-27.1029z" fill="#fff" opacity=".60096" transform="matrix(-.97568,-.21922,.42736,.90408,0,0)"/>
+<path d="m831.77087,666.93793h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m832.39764,693.84723h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m720.27252-860.8739h27.1029v29.24923h-27.1029z" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m832.18872,774.57513h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m658.55963,776.12384h29.24923v27.1029h-29.24923z" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m720.85333-713.84338h27.18274v24.15996h-27.18274z" fill="#fff" opacity=".60096" transform="matrix(.97568,.21922,-.42736,-.90408,0,0)"/>
+<path d="m1142.0758,779.44917 192.2989,413.58843" fill="none" stroke="#000" stroke-linecap="round" stroke-width="14.20834"/>
+<path d="m775.05536,693.07288h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m715.92511,854.14148h29.44084v23.17059h-29.44084z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m805.88055,665.00702h23.23064v27.43012h-23.23064z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m833.31165,857.23889h28.21643v22.16755h-28.21643z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m833.60077,748.08258h29.24923v27.1029h-29.24923z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+<path d="m657.4411,854.80676h28.65906v24.2827h-28.65906z" fill="#fff" opacity=".60096" transform="matrix(.42736,.90408,.97568,.21922,0,0)"/>
+</g>
+</svg>
diff --git a/src/rocksdb/docs/static/images/rate-limiter/auto-tuned-write-KBps-series.png b/src/rocksdb/docs/static/images/rate-limiter/auto-tuned-write-KBps-series.png
new file mode 100644
index 000000000..b4b24849c
--- /dev/null
+++ b/src/rocksdb/docs/static/images/rate-limiter/auto-tuned-write-KBps-series.png
diff --git a/src/rocksdb/docs/static/images/rate-limiter/write-KBps-cdf.png b/src/rocksdb/docs/static/images/rate-limiter/write-KBps-cdf.png
new file mode 100644
index 000000000..742f985bf
--- /dev/null
+++ b/src/rocksdb/docs/static/images/rate-limiter/write-KBps-cdf.png
diff --git a/src/rocksdb/docs/static/images/rate-limiter/write-KBps-series.png b/src/rocksdb/docs/static/images/rate-limiter/write-KBps-series.png
new file mode 100644
index 000000000..c7bdcb95a
--- /dev/null
+++ b/src/rocksdb/docs/static/images/rate-limiter/write-KBps-series.png
diff --git a/src/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png
new file mode 100644
index 000000000..10fa73728
--- /dev/null
+++ b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png
diff --git a/src/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png
new file mode 100644
index 000000000..df2e333f9
--- /dev/null
+++ b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png
diff --git a/src/rocksdb/docs/static/images/rocksdb-secondary-cache/arch_diagram.png b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/arch_diagram.png
new file mode 100644
index 000000000..696a376ed
--- /dev/null
+++ b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/arch_diagram.png
diff --git a/src/rocksdb/docs/static/images/rocksdb-secondary-cache/insert_flow.png b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/insert_flow.png
new file mode 100644
index 000000000..f02e7e4c5
--- /dev/null
+++ b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/insert_flow.png
diff --git a/src/rocksdb/docs/static/images/rocksdb-secondary-cache/lookup_flow.png b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/lookup_flow.png
new file mode 100644
index 000000000..2b3c70edb
--- /dev/null
+++ b/src/rocksdb/docs/static/images/rocksdb-secondary-cache/lookup_flow.png
diff --git a/src/rocksdb/docs/static/images/tree_example1.png b/src/rocksdb/docs/static/images/tree_example1.png
new file mode 100644
index 000000000..9f725860c
--- /dev/null
+++ b/src/rocksdb/docs/static/images/tree_example1.png
diff --git a/src/rocksdb/docs/static/logo.svg b/src/rocksdb/docs/static/logo.svg
new file mode 100644
index 000000000..e6e1e8afa
--- /dev/null
+++ b/src/rocksdb/docs/static/logo.svg
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 18.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+	 viewBox="0 0 841.9 595.3" enable-background="new 0 0 841.9 595.3" xml:space="preserve">
+<g id="Layer_2" display="none">
+	<g display="inline">
+		<path fill="#0F2647" d="M213,86.8h415.9c17,0,31,13.9,31,31v359.7c0,17-14,31-31,31H213c-17.1,0-31-14-31-31V117.8
+			C182,100.7,195.9,86.8,213,86.8z"/>
+	</g>
+</g>
+<g id="Layer_4">
+	<g>
+		<path fill="#FFBE00" d="M501.9,398l-64.7,1.6l-24.4,56l-63.6,16.6l-39.8,36.3h320.4c16.6,0,30.1-13.5,30.1-30.1V319.2l-113.6,28.4
+			L501.9,398z M268.5,402.7L214.6,449c-0.5-3.3-1.8-6.6-4.1-9.4l-28.3,23.8c6,7.1,16.1,8.5,23.8,3.8l52.7-23.9L268.5,402.7z
+			 M581.4,240.2l6.5,17.2c10.1-2.6,25.6-13.7,36.5-22.3c-0.4,3-0.2,6,0.9,9l34.7-12.8c-3.6-9.6-14.2-14.4-23.8-10.9
+			c-0.3,0.1-0.6,0.3-0.9,0.5l0.1-0.1L581.4,240.2z M641.2,269.2c-0.1,0-0.1,0-0.2,0l-63.6-5.5l-14.5-38.1v-40.4
+			c2.9,1.6,6.1,2.5,9.6,2.5c10.5,0,19.1-8.1,20.1-18.3l15.2-10.1l-0.7-20.5c-0.1-2.1-1.7-3.7-3.8-3.7l-19-0.3c-4-5.6-11.4-8-17.9-5
+			l-41.8,19.6c0.1,0.1,0.1,0.3,0.1,0.4c1.4,2.9,3.5,5.1,6,6.7c-50.9,26.3-72.5,61.8-72.5,61.8L263.6,323.1
+			c-11.4,6.1-25.7,1.8-31.8-9.6c-6.1-11.4-1.8-25.7,9.6-31.8l55.2-29.7c14-7.5,22.2-21.5,23.2-36.2l-33.1,17.8l-55.2,29.7
+			c-21.6,11.6-29.7,38.6-18.1,60.1c8,14.9,23.3,23.4,39.2,23.4c7.1,0,14.3-1.7,20.9-5.3l24.9-13.4c-1.8,9.8-1,20.3,2.9,30.2
+			c1.3,3.4,2.2,5.5,2.2,5.5c8.8,19.1-2.8,34.1-18,34.1h-4.8l-17.5,76.2c-2.3-2.4-5.3-4.2-8.8-5.1l-8.7,35.9
+			c8.8,2.1,17.5-2.4,21.1-10.4l39.7-71.2c50.2-4.9,76.6-38.9,75.4-86.8c52,3.2,121.3-29.5,152.1-74.2c23.4,29.7,67.6,25.5,87.3,18
+			l3.8-1.3c-1.4,2.6-2.2,5.6-2.2,8.8l37-0.4C659.7,277.3,651.4,269.1,641.2,269.2z M296.3,201.8c1.8,3.4,2.7,7,2.8,10.6l19.5-10.5
+			c-0.9-3.4-2.1-6.8-3.9-10l-1-1.9l-18.4,9.9L296.3,201.8z M289.2,188.6l18.4-9.9l-2.4-4.4c-2.7-5.1-9.1-7-14.2-4.3
+			c-5.1,2.7-7,9.1-4.3,14.2L289.2,188.6z"/>
+		<path fill="#0F2647" d="M571.4,139.9c-3.3,0-5.9,2.7-5.9,6c0,3.3,2.7,6,5.9,6c3.3,0,6-2.7,6-6
+			C577.4,142.6,574.7,139.9,571.4,139.9z M536.6,184.6c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9
+			s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3s-1.5-1.4-3.7-2.3c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9
+			s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3S534.4,183.8,536.6,184.6z M516.4,191.9c0,0-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9
+			s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9
+			s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3s-1.5-1.4-3.7-2.3C515.5,192.9,516.4,191.9,516.4,191.9z M543,220.2
+			c0,0-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3s-1.5-1.4-3.7-2.3c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9
+			s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9
+			C540.4,223.1,543,220.2,543,220.2z M512.7,230.3c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9
+			s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9
+			s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3S515,231.2,512.7,230.3z M484.1,221.6c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9
+			c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9
+			c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3S486.3,222.5,484.1,221.6z M473,255.6c0,0-0.9-1.1-2.3-1.9
+			c2.2-0.9,3.7-2.3,3.7-2.3s-1.5-1.4-3.7-2.3c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9
+			c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9C470.5,258.5,473,255.6,473,255.6z
+			 M423.9,260.6c0,0,2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3s-1.5-1.4-3.7-2.3
+			c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3
+			C424.8,259.6,423.9,260.6,423.9,260.6z M450.9,277.8c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9
+			s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9
+			s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3S453.1,278.6,450.9,277.8z M480.5,284.3c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9
+			c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9
+			c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3S482.7,285.2,480.5,284.3z M504.7,260.9c1.4-0.8,2.3-1.9,2.3-1.9
+			s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9
+			s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3S506.9,261.7,504.7,260.9z M548.1,241.4
+			c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3
+			c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3
+			S550.3,242.2,548.1,241.4z M405.7,278.2c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9
+			c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9
+			c2.2-0.9,3.7-2.3,3.7-2.3S407.9,279.1,405.7,278.2z M434.6,306.7c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9
+			c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9
+			c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3S436.8,307.5,434.6,306.7z M397.8,311.9c1.4-0.8,2.3-1.9,2.3-1.9
+			s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9
+			s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3S400.1,312.7,397.8,311.9z M367.8,290.7
+			c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3
+			c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3
+			S370,291.5,367.8,290.7z M325.2,318.1c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9
+			c2.2-0.9,3.7-2.3,3.7-2.3s-1.5-1.4-3.7-2.3c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9
+			c-2.2,0.9-3.7,2.3-3.7,2.3S323,317.2,325.2,318.1z M363.5,328.3c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9
+			s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9
+			s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3S365.7,329.1,363.5,328.3z M357.2,359.5c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9
+			c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9
+			c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3S359.4,360.4,357.2,359.5z M328,355c0,0-0.9-1.1-2.3-1.9
+			c2.2-0.9,3.7-2.3,3.7-2.3s-1.5-1.4-3.7-2.3c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9
+			c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9C325.4,358,328,355,328,355z M335.6,384.5
+			c1.4-0.8,2.3-1.9,2.3-1.9s-2.5-2.9-5.6-2.9c-3.1,0-5.6,2.9-5.6,2.9s0.9,1.1,2.3,1.9c-2.2,0.9-3.7,2.3-3.7,2.3s1.5,1.4,3.7,2.3
+			c-1.4,0.8-2.3,1.9-2.3,1.9s2.5,2.9,5.6,2.9c3.1,0,5.6-2.9,5.6-2.9s-0.9-1.1-2.3-1.9c2.2-0.9,3.7-2.3,3.7-2.3
+			S337.8,385.3,335.6,384.5z"/>
+	</g>
+</g>
+</svg>
diff --git a/src/rocksdb/docs/static/og_image.png b/src/rocksdb/docs/static/og_image.png
new file mode 100644
index 000000000..4e2759e61
--- /dev/null
+++ b/src/rocksdb/docs/static/og_image.png
diff --git a/src/rocksdb/env/composite_env.cc b/src/rocksdb/env/composite_env.cc
new file mode 100644
index 000000000..b93aa9fcb
--- /dev/null
+++ b/src/rocksdb/env/composite_env.cc
@@ -0,0 +1,544 @@
+// Copyright (c) 2019-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "env/composite_env_wrapper.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// The CompositeEnvWrapper class provides an interface that is compatible
+// with the old monolithic Env API, and an implementation that wraps around
+// the new Env that provides threading and other OS related functionality, and
+// the new FileSystem API that provides storage functionality. By
+// providing the old Env interface, it allows the rest of RocksDB code to
+// be agnostic of whether the underlying Env implementation is a monolithic
+// Env or an Env + FileSystem. In the former case, the user will specify
+// Options::env only, whereas in the latter case, the user will specify
+// Options::env and Options::file_system.
+
+class CompositeSequentialFileWrapper : public SequentialFile {
+ public:
+  explicit CompositeSequentialFileWrapper(
+      std::unique_ptr<FSSequentialFile>& target)
+      : target_(std::move(target)) {}
+
+  Status Read(size_t n, Slice* result, char* scratch) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Read(n, io_opts, result, scratch, &dbg);
+  }
+  Status Skip(uint64_t n) override { return target_->Skip(n); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+  Status PositionedRead(uint64_t offset, size_t n, Slice* result,
+                        char* scratch) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg);
+  }
+
+ private:
+  std::unique_ptr<FSSequentialFile> target_;
+};
+
+class CompositeRandomAccessFileWrapper : public RandomAccessFile {
+ public:
+  explicit CompositeRandomAccessFileWrapper(
+      std::unique_ptr<FSRandomAccessFile>& target)
+      : target_(std::move(target)) {}
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Read(offset, n, io_opts, result, scratch, &dbg);
+  }
+  Status MultiRead(ReadRequest* reqs, size_t num_reqs) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    std::vector<FSReadRequest> fs_reqs;
+    Status status;
+
+    fs_reqs.resize(num_reqs);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      fs_reqs[i].offset = reqs[i].offset;
+      fs_reqs[i].len = reqs[i].len;
+      fs_reqs[i].scratch = reqs[i].scratch;
+      fs_reqs[i].status = IOStatus::OK();
+    }
+    status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      reqs[i].result = fs_reqs[i].result;
+      reqs[i].status = fs_reqs[i].status;
+    }
+    return status;
+  }
+  Status Prefetch(uint64_t offset, size_t n) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Prefetch(offset, n, io_opts, &dbg);
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+  void Hint(AccessPattern pattern) override {
+    target_->Hint((FSRandomAccessFile::AccessPattern)pattern);
+  }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+
+ private:
+  std::unique_ptr<FSRandomAccessFile> target_;
+};
+
+class CompositeWritableFileWrapper : public WritableFile {
+ public:
+  explicit CompositeWritableFileWrapper(std::unique_ptr<FSWritableFile>& t)
+      : target_(std::move(t)) {}
+
+  Status Append(const Slice& data) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Append(data, io_opts, &dbg);
+  }
+  Status Append(const Slice& data,
+                const DataVerificationInfo& verification_info) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Append(data, io_opts, verification_info, &dbg);
+  }
+  Status PositionedAppend(const Slice& data, uint64_t offset) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->PositionedAppend(data, offset, io_opts, &dbg);
+  }
+  Status PositionedAppend(
+      const Slice& data, uint64_t offset,
+      const DataVerificationInfo& verification_info) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->PositionedAppend(data, offset, io_opts, verification_info,
+                                     &dbg);
+  }
+  Status Truncate(uint64_t size) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Truncate(size, io_opts, &dbg);
+  }
+  Status Close() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Close(io_opts, &dbg);
+  }
+  Status Flush() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Flush(io_opts, &dbg);
+  }
+  Status Sync() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Sync(io_opts, &dbg);
+  }
+  Status Fsync() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Fsync(io_opts, &dbg);
+  }
+  bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+
+  void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+    target_->SetWriteLifeTimeHint(hint);
+  }
+
+  Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+    return target_->GetWriteLifeTimeHint();
+  }
+
+  uint64_t GetFileSize() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->GetFileSize(io_opts, &dbg);
+  }
+
+  void SetPreallocationBlockSize(size_t size) override {
+    target_->SetPreallocationBlockSize(size);
+  }
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    target_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+
+  Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->RangeSync(offset, nbytes, io_opts, &dbg);
+  }
+
+  void PrepareWrite(size_t offset, size_t len) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    target_->PrepareWrite(offset, len, io_opts, &dbg);
+  }
+
+  Status Allocate(uint64_t offset, uint64_t len) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Allocate(offset, len, io_opts, &dbg);
+  }
+
+  std::unique_ptr<FSWritableFile>* target() { return &target_; }
+
+ private:
+  std::unique_ptr<FSWritableFile> target_;
+};
+
+class CompositeRandomRWFileWrapper : public RandomRWFile {
+ public:
+  explicit CompositeRandomRWFileWrapper(std::unique_ptr<FSRandomRWFile>& target)
+      : target_(std::move(target)) {}
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  Status Write(uint64_t offset, const Slice& data) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Write(offset, data, io_opts, &dbg);
+  }
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Read(offset, n, io_opts, result, scratch, &dbg);
+  }
+  Status Flush() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Flush(io_opts, &dbg);
+  }
+  Status Sync() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Sync(io_opts, &dbg);
+  }
+  Status Fsync() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Fsync(io_opts, &dbg);
+  }
+  Status Close() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Close(io_opts, &dbg);
+  }
+
+ private:
+  std::unique_ptr<FSRandomRWFile> target_;
+};
+
+class CompositeDirectoryWrapper : public Directory {
+ public:
+  explicit CompositeDirectoryWrapper(std::unique_ptr<FSDirectory>& target)
+      : target_(std::move(target)) {}
+
+  Status Fsync() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->FsyncWithDirOptions(io_opts, &dbg, DirFsyncOptions());
+  }
+
+  Status Close() override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return target_->Close(io_opts, &dbg);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+ private:
+  std::unique_ptr<FSDirectory> target_;
+};
+}  // namespace
+
+Status CompositeEnv::NewSequentialFile(const std::string& f,
+                                       std::unique_ptr<SequentialFile>* r,
+                                       const EnvOptions& options) {
+  IODebugContext dbg;
+  std::unique_ptr<FSSequentialFile> file;
+  Status status;
+  status =
+      file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg);
+  if (status.ok()) {
+    r->reset(new CompositeSequentialFileWrapper(file));
+  }
+  return status;
+}
+
+Status CompositeEnv::NewRandomAccessFile(const std::string& f,
+                                         std::unique_ptr<RandomAccessFile>* r,
+                                         const EnvOptions& options) {
+  IODebugContext dbg;
+  std::unique_ptr<FSRandomAccessFile> file;
+  Status status;
+  status =
+      file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg);
+  if (status.ok()) {
+    r->reset(new CompositeRandomAccessFileWrapper(file));
+  }
+  return status;
+}
+
+Status CompositeEnv::NewWritableFile(const std::string& f,
+                                     std::unique_ptr<WritableFile>* r,
+                                     const EnvOptions& options) {
+  IODebugContext dbg;
+  std::unique_ptr<FSWritableFile> file;
+  Status status;
+  status = file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg);
+  if (status.ok()) {
+    r->reset(new CompositeWritableFileWrapper(file));
+  }
+  return status;
+}
+
+Status CompositeEnv::ReopenWritableFile(const std::string& fname,
+                                        std::unique_ptr<WritableFile>* result,
+                                        const EnvOptions& options) {
+  IODebugContext dbg;
+  Status status;
+  std::unique_ptr<FSWritableFile> file;
+  status = file_system_->ReopenWritableFile(fname, FileOptions(options), &file,
+                                            &dbg);
+  if (status.ok()) {
+    result->reset(new CompositeWritableFileWrapper(file));
+  }
+  return status;
+}
+
+Status CompositeEnv::ReuseWritableFile(const std::string& fname,
+                                       const std::string& old_fname,
+                                       std::unique_ptr<WritableFile>* r,
+                                       const EnvOptions& options) {
+  IODebugContext dbg;
+  Status status;
+  std::unique_ptr<FSWritableFile> file;
+  status = file_system_->ReuseWritableFile(fname, old_fname,
+                                           FileOptions(options), &file, &dbg);
+  if (status.ok()) {
+    r->reset(new CompositeWritableFileWrapper(file));
+  }
+  return status;
+}
+
+Status CompositeEnv::NewRandomRWFile(const std::string& fname,
+                                     std::unique_ptr<RandomRWFile>* result,
+                                     const EnvOptions& options) {
+  IODebugContext dbg;
+  std::unique_ptr<FSRandomRWFile> file;
+  Status status;
+  status =
+      file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg);
+  if (status.ok()) {
+    result->reset(new CompositeRandomRWFileWrapper(file));
+  }
+  return status;
+}
+
+Status CompositeEnv::NewDirectory(const std::string& name,
+                                  std::unique_ptr<Directory>* result) {
+  IOOptions io_opts;
+  IODebugContext dbg;
+  std::unique_ptr<FSDirectory> dir;
+  Status status;
+  status = file_system_->NewDirectory(name, io_opts, &dir, &dbg);
+  if (status.ok()) {
+    result->reset(new CompositeDirectoryWrapper(dir));
+  }
+  return status;
+}
+
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo> env_wrapper_type_info = {
+#ifndef ROCKSDB_LITE
+    {"target",
+     OptionTypeInfo(0, OptionType::kUnknown, OptionVerificationType::kByName,
+                    OptionTypeFlags::kDontSerialize)
+         .SetParseFunc([](const ConfigOptions& opts,
+                          const std::string& /*name*/, const std::string& value,
+                          void* addr) {
+           auto target = static_cast<EnvWrapper::Target*>(addr);
+           return Env::CreateFromString(opts, value, &(target->env),
+                                        &(target->guard));
+         })
+         .SetEqualsFunc([](const ConfigOptions& opts,
+                           const std::string& /*name*/, const void* addr1,
+                           const void* addr2, std::string* mismatch) {
+           const auto target1 = static_cast<const EnvWrapper::Target*>(addr1);
+           const auto target2 = static_cast<const EnvWrapper::Target*>(addr2);
+           if (target1->env != nullptr) {
+             return target1->env->AreEquivalent(opts, target2->env, mismatch);
+           } else {
+             return (target2->env == nullptr);
+           }
+         })
+         .SetPrepareFunc([](const ConfigOptions& opts,
+                            const std::string& /*name*/, void* addr) {
+           auto target = static_cast<EnvWrapper::Target*>(addr);
+           if (target->guard.get() != nullptr) {
+             target->env = target->guard.get();
+           } else if (target->env == nullptr) {
+             target->env = Env::Default();
+           }
+           return target->env->PrepareOptions(opts);
+         })
+         .SetValidateFunc([](const DBOptions& db_opts,
+                             const ColumnFamilyOptions& cf_opts,
+                             const std::string& /*name*/, const void* addr) {
+           const auto target = static_cast<const EnvWrapper::Target*>(addr);
+           if (target->env == nullptr) {
+             return Status::InvalidArgument("Target Env not specified");
+           } else {
+             return target->env->ValidateOptions(db_opts, cf_opts);
+           }
+         })},
+#endif  // ROCKSDB_LITE
+};
+static std::unordered_map<std::string, OptionTypeInfo>
+    composite_fs_wrapper_type_info = {
+#ifndef ROCKSDB_LITE
+        {"file_system",
+         OptionTypeInfo::AsCustomSharedPtr<FileSystem>(
+             0, OptionVerificationType::kByName, OptionTypeFlags::kNone)},
+#endif  // ROCKSDB_LITE
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    composite_clock_wrapper_type_info = {
+#ifndef ROCKSDB_LITE
+        {"clock",
+         OptionTypeInfo::AsCustomSharedPtr<SystemClock>(
+             0, OptionVerificationType::kByName, OptionTypeFlags::kNone)},
+#endif  // ROCKSDB_LITE
+};
+
+}  // namespace
+
+std::unique_ptr<Env> NewCompositeEnv(const std::shared_ptr<FileSystem>& fs) {
+  return std::unique_ptr<Env>(new CompositeEnvWrapper(Env::Default(), fs));
+}
+
+CompositeEnvWrapper::CompositeEnvWrapper(Env* env,
+                                         const std::shared_ptr<FileSystem>& fs,
+                                         const std::shared_ptr<SystemClock>& sc)
+    : CompositeEnv(fs, sc), target_(env) {
+  RegisterOptions("", &target_, &env_wrapper_type_info);
+  RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info);
+  RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info);
+}
+
+CompositeEnvWrapper::CompositeEnvWrapper(const std::shared_ptr<Env>& env,
+                                         const std::shared_ptr<FileSystem>& fs,
+                                         const std::shared_ptr<SystemClock>& sc)
+    : CompositeEnv(fs, sc), target_(env) {
+  RegisterOptions("", &target_, &env_wrapper_type_info);
+  RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info);
+  RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info);
+}
+
+Status CompositeEnvWrapper::PrepareOptions(const ConfigOptions& options) {
+  target_.Prepare();
+  if (file_system_ == nullptr) {
+    file_system_ = target_.env->GetFileSystem();
+  }
+  if (system_clock_ == nullptr) {
+    system_clock_ = target_.env->GetSystemClock();
+  }
+  return Env::PrepareOptions(options);
+}
+
+#ifndef ROCKSDB_LITE
+std::string CompositeEnvWrapper::SerializeOptions(
+    const ConfigOptions& config_options, const std::string& header) const {
+  auto options = CompositeEnv::SerializeOptions(config_options, header);
+  if (target_.env != nullptr && target_.env != Env::Default()) {
+    options.append("target=");
+    options.append(target_.env->ToString(config_options));
+  }
+  return options;
+}
+#endif  // ROCKSDB_LITE
+
+EnvWrapper::EnvWrapper(Env* t) : target_(t) {
+  RegisterOptions("", &target_, &env_wrapper_type_info);
+}
+
+EnvWrapper::EnvWrapper(std::unique_ptr<Env>&& t) : target_(std::move(t)) {
+  RegisterOptions("", &target_, &env_wrapper_type_info);
+}
+
+EnvWrapper::EnvWrapper(const std::shared_ptr<Env>& t) : target_(t) {
+  RegisterOptions("", &target_, &env_wrapper_type_info);
+}
+
+EnvWrapper::~EnvWrapper() {}
+
+Status EnvWrapper::PrepareOptions(const ConfigOptions& options) {
+  target_.Prepare();
+  return Env::PrepareOptions(options);
+}
+
+#ifndef ROCKSDB_LITE
+std::string EnvWrapper::SerializeOptions(const ConfigOptions& config_options,
+                                         const std::string& header) const {
+  auto parent = Env::SerializeOptions(config_options, "");
+  if (config_options.IsShallow() || target_.env == nullptr ||
+      target_.env == Env::Default()) {
+    return parent;
+  } else {
+    std::string result = header;
+    if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) {
+      result.append(OptionTypeInfo::kIdPropName()).append("=");
+    }
+    result.append(parent);
+    if (!EndsWith(result, config_options.delimiter)) {
+      result.append(config_options.delimiter);
+    }
+    result.append("target=").append(target_.env->ToString(config_options));
+    return result;
+  }
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/composite_env_wrapper.h b/src/rocksdb/env/composite_env_wrapper.h
new file mode 100644
index 000000000..78da6f0ed
--- /dev/null
+++ b/src/rocksdb/env/composite_env_wrapper.h
@@ -0,0 +1,380 @@
+// Copyright (c) 2019-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#undef GetCurrentTime
+#undef LoadLibrary
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompositeEnv : public Env {
+ public:
+  // Initialize a CompositeEnvWrapper that delegates all thread/time related
+  // calls to env, and all file operations to fs
+  explicit CompositeEnv(const std::shared_ptr<FileSystem>& fs,
+                        const std::shared_ptr<SystemClock>& clock)
+      : Env(fs, clock) {}
+
+  Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+    return file_system_->RegisterDbPaths(paths);
+  }
+  Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+    return file_system_->UnregisterDbPaths(paths);
+  }
+
+  // The following text is boilerplate that forwards all methods to target()
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) override;
+
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) override;
+
+  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) override;
+
+  Status ReopenWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>* result,
+                            const EnvOptions& options) override;
+
+  Status ReuseWritableFile(const std::string& fname,
+                           const std::string& old_fname,
+                           std::unique_ptr<WritableFile>* r,
+                           const EnvOptions& options) override;
+
+  Status NewRandomRWFile(const std::string& fname,
+                         std::unique_ptr<RandomRWFile>* result,
+                         const EnvOptions& options) override;
+
+  Status NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    return file_system_->NewMemoryMappedFileBuffer(fname, result);
+  }
+
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override;
+
+  Status FileExists(const std::string& f) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->FileExists(f, io_opts, &dbg);
+  }
+  Status GetChildren(const std::string& dir,
+                     std::vector<std::string>* r) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->GetChildren(dir, io_opts, r, &dbg);
+  }
+  Status GetChildrenFileAttributes(
+      const std::string& dir, std::vector<FileAttributes>* result) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->GetChildrenFileAttributes(dir, io_opts, result, &dbg);
+  }
+  Status DeleteFile(const std::string& f) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->DeleteFile(f, io_opts, &dbg);
+  }
+  Status Truncate(const std::string& fname, size_t size) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->Truncate(fname, size, io_opts, &dbg);
+  }
+  Status CreateDir(const std::string& d) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->CreateDir(d, io_opts, &dbg);
+  }
+  Status CreateDirIfMissing(const std::string& d) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->CreateDirIfMissing(d, io_opts, &dbg);
+  }
+  Status DeleteDir(const std::string& d) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->DeleteDir(d, io_opts, &dbg);
+  }
+  Status GetFileSize(const std::string& f, uint64_t* s) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->GetFileSize(f, io_opts, s, &dbg);
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->GetFileModificationTime(fname, io_opts, file_mtime,
+                                                 &dbg);
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->RenameFile(s, t, io_opts, &dbg);
+  }
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->LinkFile(s, t, io_opts, &dbg);
+  }
+
+  Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->NumFileLinks(fname, io_opts, count, &dbg);
+  }
+
+  Status AreFilesSame(const std::string& first, const std::string& second,
+                      bool* res) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->AreFilesSame(first, second, io_opts, res, &dbg);
+  }
+
+  Status LockFile(const std::string& f, FileLock** l) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->LockFile(f, io_opts, l, &dbg);
+  }
+
+  Status UnlockFile(FileLock* l) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->UnlockFile(l, io_opts, &dbg);
+  }
+
+  Status GetAbsolutePath(const std::string& db_path,
+                         std::string* output_path) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->GetAbsolutePath(db_path, io_opts, output_path, &dbg);
+  }
+
+  Status NewLogger(const std::string& fname,
+                   std::shared_ptr<Logger>* result) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->NewLogger(fname, io_opts, result, &dbg);
+  }
+
+  Status IsDirectory(const std::string& path, bool* is_dir) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->IsDirectory(path, io_opts, is_dir, &dbg);
+  }
+
+  Status GetTestDirectory(std::string* path) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->GetTestDirectory(io_opts, path, &dbg);
+  }
+
+  EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override {
+    return file_system_->OptimizeForLogRead(FileOptions(env_options));
+  }
+
+  EnvOptions OptimizeForManifestRead(
+      const EnvOptions& env_options) const override {
+    return file_system_->OptimizeForManifestRead(FileOptions(env_options));
+  }
+
+  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                 const DBOptions& db_options) const override {
+    return file_system_->OptimizeForLogWrite(FileOptions(env_options),
+                                             db_options);
+  }
+
+  EnvOptions OptimizeForManifestWrite(
+      const EnvOptions& env_options) const override {
+    return file_system_->OptimizeForManifestWrite(FileOptions(env_options));
+  }
+
+  EnvOptions OptimizeForCompactionTableWrite(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& immutable_ops) const override {
+    return file_system_->OptimizeForCompactionTableWrite(
+        FileOptions(env_options), immutable_ops);
+  }
+  EnvOptions OptimizeForCompactionTableRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const override {
+    return file_system_->OptimizeForCompactionTableRead(
+        FileOptions(env_options), db_options);
+  }
+  EnvOptions OptimizeForBlobFileRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const override {
+    return file_system_->OptimizeForBlobFileRead(FileOptions(env_options),
+                                                 db_options);
+  }
+  // This seems to clash with a macro on Windows, so #undef it here
+#ifdef GetFreeSpace
+#undef GetFreeSpace
+#endif
+  Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override {
+    IOOptions io_opts;
+    IODebugContext dbg;
+    return file_system_->GetFreeSpace(path, io_opts, diskfree, &dbg);
+  }
+  uint64_t NowMicros() override { return system_clock_->NowMicros(); }
+  uint64_t NowNanos() override { return system_clock_->NowNanos(); }
+
+  uint64_t NowCPUNanos() override { return system_clock_->CPUNanos(); }
+
+  void SleepForMicroseconds(int micros) override {
+    system_clock_->SleepForMicroseconds(micros);
+  }
+
+  Status GetCurrentTime(int64_t* unix_time) override {
+    return system_clock_->GetCurrentTime(unix_time);
+  }
+  std::string TimeToString(uint64_t time) override {
+    return system_clock_->TimeToString(time);
+  }
+};
+
+class CompositeEnvWrapper : public CompositeEnv {
+ public:
+  // Initialize a CompositeEnvWrapper that delegates all thread/time related
+  // calls to env, and all file operations to fs
+  explicit CompositeEnvWrapper(Env* env)
+      : CompositeEnvWrapper(env, env->GetFileSystem(), env->GetSystemClock()) {}
+  explicit CompositeEnvWrapper(Env* env, const std::shared_ptr<FileSystem>& fs)
+      : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {}
+
+  explicit CompositeEnvWrapper(Env* env, const std::shared_ptr<SystemClock>& sc)
+      : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {}
+
+  explicit CompositeEnvWrapper(Env* env, const std::shared_ptr<FileSystem>& fs,
+                               const std::shared_ptr<SystemClock>& sc);
+
+  explicit CompositeEnvWrapper(const std::shared_ptr<Env>& env,
+                               const std::shared_ptr<FileSystem>& fs)
+      : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {}
+
+  explicit CompositeEnvWrapper(const std::shared_ptr<Env>& env,
+                               const std::shared_ptr<SystemClock>& sc)
+      : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {}
+
+  explicit CompositeEnvWrapper(const std::shared_ptr<Env>& env,
+                               const std::shared_ptr<FileSystem>& fs,
+                               const std::shared_ptr<SystemClock>& sc);
+
+  static const char* kClassName() { return "CompositeEnv"; }
+  const char* Name() const override { return kClassName(); }
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == kClassName()) {
+      return true;
+    } else {
+      return CompositeEnv::IsInstanceOf(name);
+    }
+  }
+  const Customizable* Inner() const override { return target_.env; }
+
+  Status PrepareOptions(const ConfigOptions& options) override;
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& config_options,
+                               const std::string& header) const override;
+#endif  // ROCKSDB_LITE
+
+  // Return the target to which this Env forwards all calls
+  Env* env_target() const { return target_.env; }
+
+#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION)
+  Status LoadLibrary(const std::string& lib_name,
+                     const std::string& search_path,
+                     std::shared_ptr<DynamicLibrary>* result) override {
+    return target_.env->LoadLibrary(lib_name, search_path, result);
+  }
+#endif
+
+  void Schedule(void (*f)(void* arg), void* a, Priority pri,
+                void* tag = nullptr, void (*u)(void* arg) = nullptr) override {
+    return target_.env->Schedule(f, a, pri, tag, u);
+  }
+
+  int UnSchedule(void* tag, Priority pri) override {
+    return target_.env->UnSchedule(tag, pri);
+  }
+
+  void StartThread(void (*f)(void*), void* a) override {
+    return target_.env->StartThread(f, a);
+  }
+  void WaitForJoin() override { return target_.env->WaitForJoin(); }
+  unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override {
+    return target_.env->GetThreadPoolQueueLen(pri);
+  }
+
+  int ReserveThreads(int threads_to_be_reserved, Priority pri) override {
+    return target_.env->ReserveThreads(threads_to_be_reserved, pri);
+  }
+
+  int ReleaseThreads(int threads_to_be_released, Priority pri) override {
+    return target_.env->ReleaseThreads(threads_to_be_released, pri);
+  }
+
+  Status GetHostName(char* name, uint64_t len) override {
+    return target_.env->GetHostName(name, len);
+  }
+  void SetBackgroundThreads(int num, Priority pri) override {
+    return target_.env->SetBackgroundThreads(num, pri);
+  }
+  int GetBackgroundThreads(Priority pri) override {
+    return target_.env->GetBackgroundThreads(pri);
+  }
+
+  Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
+    return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access);
+  }
+
+  void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+    return target_.env->IncBackgroundThreadsIfNeeded(num, pri);
+  }
+
+  void LowerThreadPoolIOPriority(Priority pool) override {
+    target_.env->LowerThreadPoolIOPriority(pool);
+  }
+
+  void LowerThreadPoolCPUPriority(Priority pool) override {
+    target_.env->LowerThreadPoolCPUPriority(pool);
+  }
+
+  Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
+    return target_.env->LowerThreadPoolCPUPriority(pool, pri);
+  }
+
+  Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+    return target_.env->GetThreadList(thread_list);
+  }
+
+  ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+    return target_.env->GetThreadStatusUpdater();
+  }
+
+  uint64_t GetThreadID() const override { return target_.env->GetThreadID(); }
+
+  std::string GenerateUniqueId() override {
+    return target_.env->GenerateUniqueId();
+  }
+
+ private:
+  EnvWrapper::Target target_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/emulated_clock.h b/src/rocksdb/env/emulated_clock.h
new file mode 100644
index 000000000..622737635
--- /dev/null
+++ b/src/rocksdb/env/emulated_clock.h
@@ -0,0 +1,114 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <string>
+
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A SystemClock that can "mock" sleep and counts its operations.
+class EmulatedSystemClock : public SystemClockWrapper {
+ private:
+  // Something to return when mocking current time
+  const int64_t maybe_starting_time_;
+  std::atomic<int> sleep_counter_{0};
+  std::atomic<int> cpu_counter_{0};
+  std::atomic<int64_t> addon_microseconds_{0};
+  // Do not modify in the env of a running DB (could cause deadlock)
+  std::atomic<bool> time_elapse_only_sleep_;
+  bool no_slowdown_;
+
+ public:
+  explicit EmulatedSystemClock(const std::shared_ptr<SystemClock>& base,
+                               bool time_elapse_only_sleep = false);
+
+  static const char* kClassName() { return "TimeEmulatedSystemClock"; }
+  const char* Name() const override { return kClassName(); }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    sleep_counter_++;
+    if (no_slowdown_ || time_elapse_only_sleep_) {
+      addon_microseconds_.fetch_add(micros);
+    }
+    if (!no_slowdown_) {
+      SystemClockWrapper::SleepForMicroseconds(micros);
+    }
+  }
+
+  void MockSleepForMicroseconds(int64_t micros) {
+    sleep_counter_++;
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(micros);
+  }
+
+  void MockSleepForSeconds(int64_t seconds) {
+    sleep_counter_++;
+    assert(no_slowdown_);
+    addon_microseconds_.fetch_add(seconds * 1000000);
+  }
+
+  void SetTimeElapseOnlySleep(bool enabled) {
+    // We cannot set these before destroying the last DB because they might
+    // cause a deadlock or similar without the appropriate options set in
+    // the DB.
+    time_elapse_only_sleep_ = enabled;
+    no_slowdown_ = enabled;
+  }
+
+  bool IsTimeElapseOnlySleep() const { return time_elapse_only_sleep_.load(); }
+  void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; }
+  bool IsMockSleepEnabled() const { return no_slowdown_; }
+
+  int GetSleepCounter() const { return sleep_counter_.load(); }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) override {
+    Status s;
+    if (time_elapse_only_sleep_) {
+      *unix_time = maybe_starting_time_;
+    } else {
+      s = SystemClockWrapper::GetCurrentTime(unix_time);
+    }
+    if (s.ok()) {
+      // mock microseconds elapsed to seconds of time
+      *unix_time += addon_microseconds_.load() / 1000000;
+    }
+    return s;
+  }
+
+  virtual uint64_t CPUNanos() override {
+    cpu_counter_++;
+    return SystemClockWrapper::CPUNanos();
+  }
+
+  virtual uint64_t CPUMicros() override {
+    cpu_counter_++;
+    return SystemClockWrapper::CPUMicros();
+  }
+
+  virtual uint64_t NowNanos() override {
+    return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowNanos()) +
+           addon_microseconds_.load() * 1000;
+  }
+
+  virtual uint64_t NowMicros() override {
+    return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowMicros()) +
+           addon_microseconds_.load();
+  }
+
+  int GetCpuCounter() const { return cpu_counter_.load(); }
+
+  void ResetCounters() {
+    cpu_counter_.store(0);
+    sleep_counter_.store(0);
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/env.cc b/src/rocksdb/env/env.cc
new file mode 100644
index 000000000..f70d1f067
--- /dev/null
+++ b/src/rocksdb/env/env.cc
@@ -0,0 +1,1264 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/env.h"
+
+#include <thread>
+
+#include "env/composite_env_wrapper.h"
+#include "env/emulated_clock.h"
+#include "env/mock_env.h"
+#include "env/unique_id_gen.h"
+#include "logging/env_logger.h"
+#include "memory/arena.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinEnvs(ObjectLibrary& library,
+                               const std::string& /*arg*/) {
+  library.AddFactory<Env>(MockEnv::kClassName(), [](const std::string& /*uri*/,
+                                                    std::unique_ptr<Env>* guard,
+                                                    std::string* /* errmsg */) {
+    guard->reset(MockEnv::Create(Env::Default()));
+    return guard->get();
+  });
+  library.AddFactory<Env>(
+      CompositeEnvWrapper::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<Env>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new CompositeEnvWrapper(Env::Default()));
+        return guard->get();
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+
+static void RegisterSystemEnvs() {
+#ifndef ROCKSDB_LITE
+  static std::once_flag loaded;
+  std::call_once(loaded, [&]() {
+    RegisterBuiltinEnvs(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+}
+
+class LegacySystemClock : public SystemClock {
+ private:
+  Env* env_;
+
+ public:
+  explicit LegacySystemClock(Env* env) : env_(env) {}
+  const char* Name() const override { return "LegacySystemClock"; }
+
+  // Returns the number of micro-seconds since some fixed point in time.
+  // It is often used as system time such as in GenericRateLimiter
+  // and other places so a port needs to return system time in order to work.
+  uint64_t NowMicros() override { return env_->NowMicros(); }
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros.
+  // In platform-specific implementations, NowNanos() should return time points
+  // that are MONOTONIC.
+  uint64_t NowNanos() override { return env_->NowNanos(); }
+
+  uint64_t CPUMicros() override { return CPUNanos() / 1000; }
+  uint64_t CPUNanos() override { return env_->NowCPUNanos(); }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  void SleepForMicroseconds(int micros) override {
+    env_->SleepForMicroseconds(micros);
+  }
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  // Only overwrites *unix_time on success.
+  Status GetCurrentTime(int64_t* unix_time) override {
+    return env_->GetCurrentTime(unix_time);
+  }
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  std::string TimeToString(uint64_t time) override {
+    return env_->TimeToString(time);
+  }
+
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& /*config_options*/,
+                               const std::string& /*prefix*/) const override {
+    // We do not want the LegacySystemClock to appear in the serialized output.
+    // This clock is an internal class for those who do not implement one and
+    // would be part of the Env.  As such, do not serialize it here.
+    return "";
+  }
+#endif  // ROCKSDB_LITE
+};
+
+class LegacySequentialFileWrapper : public FSSequentialFile {
+ public:
+  explicit LegacySequentialFileWrapper(
+      std::unique_ptr<SequentialFile>&& _target)
+      : target_(std::move(_target)) {}
+
+  IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result,
+                char* scratch, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Read(n, result, scratch));
+  }
+  IOStatus Skip(uint64_t n) override {
+    return status_to_io_status(target_->Skip(n));
+  }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return status_to_io_status(target_->InvalidateCache(offset, length));
+  }
+  IOStatus PositionedRead(uint64_t offset, size_t n,
+                          const IOOptions& /*options*/, Slice* result,
+                          char* scratch, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(
+        target_->PositionedRead(offset, n, result, scratch));
+  }
+
+ private:
+  std::unique_ptr<SequentialFile> target_;
+};
+
+class LegacyRandomAccessFileWrapper : public FSRandomAccessFile {
+ public:
+  explicit LegacyRandomAccessFileWrapper(
+      std::unique_ptr<RandomAccessFile>&& target)
+      : target_(std::move(target)) {}
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/,
+                Slice* result, char* scratch,
+                IODebugContext* /*dbg*/) const override {
+    return status_to_io_status(target_->Read(offset, n, result, scratch));
+  }
+
+  IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs,
+                     const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    std::vector<ReadRequest> reqs;
+    Status status;
+
+    reqs.reserve(num_reqs);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      ReadRequest req;
+
+      req.offset = fs_reqs[i].offset;
+      req.len = fs_reqs[i].len;
+      req.scratch = fs_reqs[i].scratch;
+      req.status = Status::OK();
+
+      reqs.emplace_back(req);
+    }
+    status = target_->MultiRead(reqs.data(), num_reqs);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      fs_reqs[i].result = reqs[i].result;
+      fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status));
+    }
+    return status_to_io_status(std::move(status));
+  }
+
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Prefetch(offset, n));
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+  void Hint(AccessPattern pattern) override {
+    target_->Hint((RandomAccessFile::AccessPattern)pattern);
+  }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return status_to_io_status(target_->InvalidateCache(offset, length));
+  }
+
+ private:
+  std::unique_ptr<RandomAccessFile> target_;
+};
+
+class LegacyRandomRWFileWrapper : public FSRandomRWFile {
+ public:
+  explicit LegacyRandomRWFileWrapper(std::unique_ptr<RandomRWFile>&& target)
+      : target_(std::move(target)) {}
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus Write(uint64_t offset, const Slice& data,
+                 const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Write(offset, data));
+  }
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/,
+                Slice* result, char* scratch,
+                IODebugContext* /*dbg*/) const override {
+    return status_to_io_status(target_->Read(offset, n, result, scratch));
+  }
+  IOStatus Flush(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Flush());
+  }
+  IOStatus Sync(const IOOptions& /*options*/,
+                IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Sync());
+  }
+  IOStatus Fsync(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Fsync());
+  }
+  IOStatus Close(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Close());
+  }
+
+ private:
+  std::unique_ptr<RandomRWFile> target_;
+};
+
+class LegacyWritableFileWrapper : public FSWritableFile {
+ public:
+  explicit LegacyWritableFileWrapper(std::unique_ptr<WritableFile>&& _target)
+      : target_(std::move(_target)) {}
+
+  IOStatus Append(const Slice& data, const IOOptions& /*options*/,
+                  IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Append(data));
+  }
+  IOStatus Append(const Slice& data, const IOOptions& /*options*/,
+                  const DataVerificationInfo& /*verification_info*/,
+                  IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Append(data));
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->PositionedAppend(data, offset));
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& /*options*/,
+                            const DataVerificationInfo& /*verification_info*/,
+                            IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->PositionedAppend(data, offset));
+  }
+  IOStatus Truncate(uint64_t size, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Truncate(size));
+  }
+  IOStatus Close(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Close());
+  }
+  IOStatus Flush(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Flush());
+  }
+  IOStatus Sync(const IOOptions& /*options*/,
+                IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Sync());
+  }
+  IOStatus Fsync(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Fsync());
+  }
+  bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+
+  void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+    target_->SetWriteLifeTimeHint(hint);
+  }
+
+  Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+    return target_->GetWriteLifeTimeHint();
+  }
+
+  uint64_t GetFileSize(const IOOptions& /*options*/,
+                       IODebugContext* /*dbg*/) override {
+    return target_->GetFileSize();
+  }
+
+  void SetPreallocationBlockSize(size_t size) override {
+    target_->SetPreallocationBlockSize(size);
+  }
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    target_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return status_to_io_status(target_->InvalidateCache(offset, length));
+  }
+
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes,
+                     const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->RangeSync(offset, nbytes));
+  }
+
+  void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    target_->PrepareWrite(offset, len);
+  }
+
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Allocate(offset, len));
+  }
+
+ private:
+  std::unique_ptr<WritableFile> target_;
+};
+
+class LegacyDirectoryWrapper : public FSDirectory {
+ public:
+  explicit LegacyDirectoryWrapper(std::unique_ptr<Directory>&& target)
+      : target_(std::move(target)) {}
+
+  IOStatus Fsync(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Fsync());
+  }
+  IOStatus Close(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Close());
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+ private:
+  std::unique_ptr<Directory> target_;
+};
+
+class LegacyFileSystemWrapper : public FileSystem {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit LegacyFileSystemWrapper(Env* t) : target_(t) {}
+  ~LegacyFileSystemWrapper() override {}
+
+  static const char* kClassName() { return "LegacyFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_; }
+
+  // The following text is boilerplate that forwards all methods to target()
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* /*dbg*/) override {
+    std::unique_ptr<SequentialFile> file;
+    Status s = target_->NewSequentialFile(f, &file, file_opts);
+    if (s.ok()) {
+      r->reset(new LegacySequentialFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* /*dbg*/) override {
+    std::unique_ptr<RandomAccessFile> file;
+    Status s = target_->NewRandomAccessFile(f, &file, file_opts);
+    if (s.ok()) {
+      r->reset(new LegacyRandomAccessFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* /*dbg*/) override {
+    std::unique_ptr<WritableFile> file;
+    Status s = target_->NewWritableFile(f, &file, file_opts);
+    if (s.ok()) {
+      r->reset(new LegacyWritableFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* /*dbg*/) override {
+    std::unique_ptr<WritableFile> file;
+    Status s = target_->ReopenWritableFile(fname, &file, file_opts);
+    if (s.ok()) {
+      result->reset(new LegacyWritableFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* r,
+                             IODebugContext* /*dbg*/) override {
+    std::unique_ptr<WritableFile> file;
+    Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts);
+    if (s.ok()) {
+      r->reset(new LegacyWritableFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* /*dbg*/) override {
+    std::unique_ptr<RandomRWFile> file;
+    Status s = target_->NewRandomRWFile(fname, &file, file_opts);
+    if (s.ok()) {
+      result->reset(new LegacyRandomRWFileWrapper(std::move(file)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    return status_to_io_status(
+        target_->NewMemoryMappedFileBuffer(fname, result));
+  }
+  IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* /*dbg*/) override {
+    std::unique_ptr<Directory> dir;
+    Status s = target_->NewDirectory(name, &dir);
+    if (s.ok()) {
+      result->reset(new LegacyDirectoryWrapper(std::move(dir)));
+    }
+    return status_to_io_status(std::move(s));
+  }
+  IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/,
+                      IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->FileExists(f));
+  }
+  IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/,
+                       std::vector<std::string>* r,
+                       IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetChildren(dir, r));
+  }
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& /*options*/,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetChildrenFileAttributes(dir, result));
+  }
+  IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->DeleteFile(f));
+  }
+  IOStatus Truncate(const std::string& fname, size_t size,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->Truncate(fname, size));
+  }
+  IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->CreateDir(d));
+  }
+  IOStatus CreateDirIfMissing(const std::string& d,
+                              const IOOptions& /*options*/,
+                              IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->CreateDirIfMissing(d));
+  }
+  IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->DeleteDir(d));
+  }
+  IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/,
+                       uint64_t* s, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetFileSize(f, s));
+  }
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& /*options*/,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* /*dbg*/) override {
+    return status_to_io_status(
+        target_->GetFileModificationTime(fname, file_mtime));
+  }
+
+  IOStatus GetAbsolutePath(const std::string& db_path,
+                           const IOOptions& /*options*/,
+                           std::string* output_path,
+                           IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetAbsolutePath(db_path, output_path));
+  }
+
+  IOStatus RenameFile(const std::string& s, const std::string& t,
+                      const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->RenameFile(s, t));
+  }
+
+  IOStatus LinkFile(const std::string& s, const std::string& t,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->LinkFile(s, t));
+  }
+
+  IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/,
+                        uint64_t* count, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->NumFileLinks(fname, count));
+  }
+
+  IOStatus AreFilesSame(const std::string& first, const std::string& second,
+                        const IOOptions& /*options*/, bool* res,
+                        IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->AreFilesSame(first, second, res));
+  }
+
+  IOStatus LockFile(const std::string& f, const IOOptions& /*options*/,
+                    FileLock** l, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->LockFile(f, l));
+  }
+
+  IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->UnlockFile(l));
+  }
+
+  IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path,
+                            IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetTestDirectory(path));
+  }
+  IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->NewLogger(fname, result));
+  }
+
+  void SanitizeFileOptions(FileOptions* opts) const override {
+    target_->SanitizeEnvOptions(opts);
+  }
+
+  FileOptions OptimizeForLogRead(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForLogRead(file_options);
+  }
+  FileOptions OptimizeForManifestRead(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForManifestRead(file_options);
+  }
+  FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                  const DBOptions& db_options) const override {
+    return target_->OptimizeForLogWrite(file_options, db_options);
+  }
+  FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForManifestWrite(file_options);
+  }
+  FileOptions OptimizeForCompactionTableWrite(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& immutable_ops) const override {
+    return target_->OptimizeForCompactionTableWrite(file_options,
+                                                    immutable_ops);
+  }
+  FileOptions OptimizeForCompactionTableRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_->OptimizeForCompactionTableRead(file_options, db_options);
+  }
+  FileOptions OptimizeForBlobFileRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_->OptimizeForBlobFileRead(file_options, db_options);
+  }
+
+#ifdef GetFreeSpace
+#undef GetFreeSpace
+#endif
+  IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/,
+                        uint64_t* diskfree, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->GetFreeSpace(path, diskfree));
+  }
+  IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/,
+                       bool* is_dir, IODebugContext* /*dbg*/) override {
+    return status_to_io_status(target_->IsDirectory(path, is_dir));
+  }
+
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& /*config_options*/,
+                               const std::string& /*prefix*/) const override {
+    // We do not want the LegacyFileSystem to appear in the serialized output.
+    // This clock is an internal class for those who do not implement one and
+    // would be part of the Env.  As such, do not serialize it here.
+    return "";
+  }
+#endif  // ROCKSDB_LITE
+ private:
+  Env* target_;
+};
+}  // end anonymous namespace
+
+Env::Env() : thread_status_updater_(nullptr) {
+  file_system_ = std::make_shared<LegacyFileSystemWrapper>(this);
+  system_clock_ = std::make_shared<LegacySystemClock>(this);
+}
+
+Env::Env(const std::shared_ptr<FileSystem>& fs)
+    : thread_status_updater_(nullptr), file_system_(fs) {
+  system_clock_ = std::make_shared<LegacySystemClock>(this);
+}
+
+Env::Env(const std::shared_ptr<FileSystem>& fs,
+         const std::shared_ptr<SystemClock>& clock)
+    : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {}
+
+Env::~Env() {}
+
+Status Env::NewLogger(const std::string& fname,
+                      std::shared_ptr<Logger>* result) {
+  return NewEnvLogger(fname, this, result);
+}
+
+Status Env::LoadEnv(const std::string& value, Env** result) {
+  return CreateFromString(ConfigOptions(), value, result);
+}
+
+Status Env::CreateFromString(const ConfigOptions& config_options,
+                             const std::string& value, Env** result) {
+  Env* base = Env::Default();
+  if (value.empty() || base->IsInstanceOf(value)) {
+    *result = base;
+    return Status::OK();
+  } else {
+    RegisterSystemEnvs();
+    Env* env = *result;
+    Status s = LoadStaticObject<Env>(config_options, value, nullptr, &env);
+    if (s.ok()) {
+      *result = env;
+    }
+    return s;
+  }
+}
+
+Status Env::LoadEnv(const std::string& value, Env** result,
+                    std::shared_ptr<Env>* guard) {
+  return CreateFromString(ConfigOptions(), value, result, guard);
+}
+
+Status Env::CreateFromString(const ConfigOptions& config_options,
+                             const std::string& value, Env** result,
+                             std::shared_ptr<Env>* guard) {
+  assert(result);
+  assert(guard != nullptr);
+  std::unique_ptr<Env> uniq;
+
+  Env* env = *result;
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+
+  Status status =
+      Customizable::GetOptionsMap(config_options, env, value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  }
+  Env* base = Env::Default();
+  if (id.empty() || base->IsInstanceOf(id)) {
+    env = base;
+    status = Status::OK();
+  } else {
+    RegisterSystemEnvs();
+#ifndef ROCKSDB_LITE
+    // First, try to load the Env as a unique object.
+    status = config_options.registry->NewObject<Env>(id, &env, &uniq);
+#else
+    status =
+        Status::NotSupported("Cannot load environment in LITE mode", value);
+#endif
+  }
+  if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+    status = Status::OK();
+  } else if (status.ok()) {
+    status = Customizable::ConfigureNewObject(config_options, env, opt_map);
+  }
+  if (status.ok()) {
+    guard->reset(uniq.release());
+    *result = env;
+  }
+  return status;
+}
+
+Status Env::CreateFromUri(const ConfigOptions& config_options,
+                          const std::string& env_uri, const std::string& fs_uri,
+                          Env** result, std::shared_ptr<Env>* guard) {
+  *result = config_options.env;
+  if (env_uri.empty() && fs_uri.empty()) {
+    // Neither specified.  Use the default
+    guard->reset();
+    return Status::OK();
+  } else if (!env_uri.empty() && !fs_uri.empty()) {
+    // Both specified.  Cannot choose.  Return Invalid
+    return Status::InvalidArgument("cannot specify both fs_uri and env_uri");
+  } else if (fs_uri.empty()) {  // Only have an ENV URI.  Create an Env from it
+    return CreateFromString(config_options, env_uri, result, guard);
+  } else {
+    std::shared_ptr<FileSystem> fs;
+    Status s = FileSystem::CreateFromString(config_options, fs_uri, &fs);
+    if (s.ok()) {
+      guard->reset(new CompositeEnvWrapper(*result, fs));
+      *result = guard->get();
+    }
+    return s;
+  }
+}
+
+std::string Env::PriorityToString(Env::Priority priority) {
+  switch (priority) {
+    case Env::Priority::BOTTOM:
+      return "Bottom";
+    case Env::Priority::LOW:
+      return "Low";
+    case Env::Priority::HIGH:
+      return "High";
+    case Env::Priority::USER:
+      return "User";
+    case Env::Priority::TOTAL:
+      assert(false);
+  }
+  return "Invalid";
+}
+
+uint64_t Env::GetThreadID() const {
+  std::hash<std::thread::id> hasher;
+  return hasher(std::this_thread::get_id());
+}
+
+Status Env::ReuseWritableFile(const std::string& fname,
+                              const std::string& old_fname,
+                              std::unique_ptr<WritableFile>* result,
+                              const EnvOptions& options) {
+  Status s = RenameFile(old_fname, fname);
+  if (!s.ok()) {
+    return s;
+  }
+  return NewWritableFile(fname, result, options);
+}
+
+Status Env::GetChildrenFileAttributes(const std::string& dir,
+                                      std::vector<FileAttributes>* result) {
+  assert(result != nullptr);
+  std::vector<std::string> child_fnames;
+  Status s = GetChildren(dir, &child_fnames);
+  if (!s.ok()) {
+    return s;
+  }
+  result->resize(child_fnames.size());
+  size_t result_size = 0;
+  for (size_t i = 0; i < child_fnames.size(); ++i) {
+    const std::string path = dir + "/" + child_fnames[i];
+    if (!(s = GetFileSize(path, &(*result)[result_size].size_bytes)).ok()) {
+      if (FileExists(path).IsNotFound()) {
+        // The file may have been deleted since we listed the directory
+        continue;
+      }
+      return s;
+    }
+    (*result)[result_size].name = std::move(child_fnames[i]);
+    result_size++;
+  }
+  result->resize(result_size);
+  return Status::OK();
+}
+
+Status Env::GetHostNameString(std::string* result) {
+  std::array<char, kMaxHostNameLen> hostname_buf{};
+  Status s = GetHostName(hostname_buf.data(), hostname_buf.size());
+  if (s.ok()) {
+    hostname_buf[hostname_buf.size() - 1] = '\0';
+    result->assign(hostname_buf.data());
+  }
+  return s;
+}
+
+std::string Env::GenerateUniqueId() {
+  std::string result;
+  bool success = port::GenerateRfcUuid(&result);
+  if (!success) {
+    // Fall back on our own way of generating a unique ID and adapt it to
+    // RFC 4122 variant 1 version 4 (a random ID).
+    // https://en.wikipedia.org/wiki/Universally_unique_identifier
+    // We already tried GenerateRfcUuid so no need to try it again in
+    // GenerateRawUniqueId
+    constexpr bool exclude_port_uuid = true;
+    uint64_t upper, lower;
+    GenerateRawUniqueId(&upper, &lower, exclude_port_uuid);
+
+    // Set 4-bit version to 4
+    upper = (upper & (~uint64_t{0xf000})) | 0x4000;
+    // Set unary-encoded variant to 1 (0b10)
+    lower = (lower & (~(uint64_t{3} << 62))) | (uint64_t{2} << 62);
+
+    // Use 36 character format of RFC 4122
+    result.resize(36U);
+    char* buf = &result[0];
+    PutBaseChars<16>(&buf, 8, upper >> 32, /*!uppercase*/ false);
+    *(buf++) = '-';
+    PutBaseChars<16>(&buf, 4, upper >> 16, /*!uppercase*/ false);
+    *(buf++) = '-';
+    PutBaseChars<16>(&buf, 4, upper, /*!uppercase*/ false);
+    *(buf++) = '-';
+    PutBaseChars<16>(&buf, 4, lower >> 48, /*!uppercase*/ false);
+    *(buf++) = '-';
+    PutBaseChars<16>(&buf, 12, lower, /*!uppercase*/ false);
+    assert(buf == &result[36]);
+
+    // Verify variant 1 version 4
+    assert(result[14] == '4');
+    assert(result[19] == '8' || result[19] == '9' || result[19] == 'a' ||
+           result[19] == 'b');
+  }
+  return result;
+}
+
+SequentialFile::~SequentialFile() {}
+
+RandomAccessFile::~RandomAccessFile() {}
+
+WritableFile::~WritableFile() {}
+
+MemoryMappedFileBuffer::~MemoryMappedFileBuffer() {}
+
+Logger::~Logger() {}
+
+Status Logger::Close() {
+  if (!closed_) {
+    closed_ = true;
+    return CloseImpl();
+  } else {
+    return Status::OK();
+  }
+}
+
+Status Logger::CloseImpl() { return Status::NotSupported(); }
+
+FileLock::~FileLock() {}
+
+void LogFlush(Logger* info_log) {
+  if (info_log) {
+    info_log->Flush();
+  }
+}
+
+static void Logv(Logger* info_log, const char* format, va_list ap) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) {
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+  }
+}
+
+void Log(Logger* info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Logv(info_log, format, ap);
+  va_end(ap);
+}
+
+void Logger::Logv(const InfoLogLevel log_level, const char* format,
+                  va_list ap) {
+  static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN", "ERROR",
+                                              "FATAL"};
+  if (log_level < log_level_) {
+    return;
+  }
+
+  if (log_level == InfoLogLevel::INFO_LEVEL) {
+    // Doesn't print log level if it is INFO level.
+    // This is to avoid unexpected performance regression after we add
+    // the feature of log level. All the logs before we add the feature
+    // are INFO level. We don't want to add extra costs to those existing
+    // logging.
+    Logv(format, ap);
+  } else if (log_level == InfoLogLevel::HEADER_LEVEL) {
+    LogHeader(format, ap);
+  } else {
+    char new_format[500];
+    snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",
+             kInfoLogLevelNames[log_level], format);
+    Logv(new_format, ap);
+  }
+
+  if (log_level >= InfoLogLevel::WARN_LEVEL &&
+      log_level != InfoLogLevel::HEADER_LEVEL) {
+    // Log messages with severity of warning or higher should be rare and are
+    // sometimes followed by an unclean crash. We want to be sure important
+    // messages are not lost in an application buffer when that happens.
+    Flush();
+  }
+}
+
+static void Logv(const InfoLogLevel log_level, Logger* info_log,
+                 const char* format, va_list ap) {
+  if (info_log && info_log->GetInfoLogLevel() <= log_level) {
+    if (log_level == InfoLogLevel::HEADER_LEVEL) {
+      info_log->LogHeader(format, ap);
+    } else {
+      info_log->Logv(log_level, format, ap);
+    }
+  }
+}
+
+void Log(const InfoLogLevel log_level, Logger* info_log, const char* format,
+         ...) {
+  va_list ap;
+  va_start(ap, format);
+  Logv(log_level, info_log, format, ap);
+  va_end(ap);
+}
+
+static void Headerv(Logger* info_log, const char* format, va_list ap) {
+  if (info_log) {
+    info_log->LogHeader(format, ap);
+  }
+}
+
+void Header(Logger* info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Headerv(info_log, format, ap);
+  va_end(ap);
+}
+
+static void Debugv(Logger* info_log, const char* format, va_list ap) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) {
+    info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap);
+  }
+}
+
+void Debug(Logger* info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Debugv(info_log, format, ap);
+  va_end(ap);
+}
+
+static void Infov(Logger* info_log, const char* format, va_list ap) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) {
+    info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+  }
+}
+
+void Info(Logger* info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Infov(info_log, format, ap);
+  va_end(ap);
+}
+
+static void Warnv(Logger* info_log, const char* format, va_list ap) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::WARN_LEVEL) {
+    info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap);
+  }
+}
+
+void Warn(Logger* info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Warnv(info_log, format, ap);
+  va_end(ap);
+}
+
+static void Errorv(Logger* info_log, const char* format, va_list ap) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::ERROR_LEVEL) {
+    info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap);
+  }
+}
+
+void Error(Logger* info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Errorv(info_log, format, ap);
+  va_end(ap);
+}
+
+static void Fatalv(Logger* info_log, const char* format, va_list ap) {
+  if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::FATAL_LEVEL) {
+    info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap);
+  }
+}
+
+void Fatal(Logger* info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Fatalv(info_log, format, ap);
+  va_end(ap);
+}
+
+void LogFlush(const std::shared_ptr<Logger>& info_log) {
+  LogFlush(info_log.get());
+}
+
+void Log(const InfoLogLevel log_level, const std::shared_ptr<Logger>& info_log,
+         const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Logv(log_level, info_log.get(), format, ap);
+  va_end(ap);
+}
+
+void Header(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Headerv(info_log.get(), format, ap);
+  va_end(ap);
+}
+
+void Debug(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Debugv(info_log.get(), format, ap);
+  va_end(ap);
+}
+
+void Info(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Infov(info_log.get(), format, ap);
+  va_end(ap);
+}
+
+void Warn(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Warnv(info_log.get(), format, ap);
+  va_end(ap);
+}
+
+void Error(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Errorv(info_log.get(), format, ap);
+  va_end(ap);
+}
+
+void Fatal(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Fatalv(info_log.get(), format, ap);
+  va_end(ap);
+}
+
+void Log(const std::shared_ptr<Logger>& info_log, const char* format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  Logv(info_log.get(), format, ap);
+  va_end(ap);
+}
+
+Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname,
+                         bool should_sync) {
+  const auto& fs = env->GetFileSystem();
+  return WriteStringToFile(fs.get(), data, fname, should_sync);
+}
+
+Status ReadFileToString(Env* env, const std::string& fname, std::string* data) {
+  const auto& fs = env->GetFileSystem();
+  return ReadFileToString(fs.get(), fname, data);
+}
+
+namespace {  // anonymous namespace
+
+void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
+  env_options->use_mmap_reads = options.allow_mmap_reads;
+  env_options->use_mmap_writes = options.allow_mmap_writes;
+  env_options->use_direct_reads = options.use_direct_reads;
+  env_options->set_fd_cloexec = options.is_fd_close_on_exec;
+  env_options->bytes_per_sync = options.bytes_per_sync;
+  env_options->compaction_readahead_size = options.compaction_readahead_size;
+  env_options->random_access_max_buffer_size =
+      options.random_access_max_buffer_size;
+  env_options->rate_limiter = options.rate_limiter.get();
+  env_options->writable_file_max_buffer_size =
+      options.writable_file_max_buffer_size;
+  env_options->allow_fallocate = options.allow_fallocate;
+  env_options->strict_bytes_per_sync = options.strict_bytes_per_sync;
+  options.env->SanitizeEnvOptions(env_options);
+}
+
+}  // namespace
+
+EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options,
+                                    const DBOptions& db_options) const {
+  EnvOptions optimized_env_options(env_options);
+  optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync;
+  optimized_env_options.writable_file_max_buffer_size =
+      db_options.writable_file_max_buffer_size;
+  return optimized_env_options;
+}
+
+EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
+  return env_options;
+}
+
+EnvOptions Env::OptimizeForLogRead(const EnvOptions& env_options) const {
+  EnvOptions optimized_env_options(env_options);
+  optimized_env_options.use_direct_reads = false;
+  return optimized_env_options;
+}
+
+EnvOptions Env::OptimizeForManifestRead(const EnvOptions& env_options) const {
+  EnvOptions optimized_env_options(env_options);
+  optimized_env_options.use_direct_reads = false;
+  return optimized_env_options;
+}
+
+EnvOptions Env::OptimizeForCompactionTableWrite(
+    const EnvOptions& env_options, const ImmutableDBOptions& db_options) const {
+  EnvOptions optimized_env_options(env_options);
+  optimized_env_options.use_direct_writes =
+      db_options.use_direct_io_for_flush_and_compaction;
+  return optimized_env_options;
+}
+
+EnvOptions Env::OptimizeForCompactionTableRead(
+    const EnvOptions& env_options, const ImmutableDBOptions& db_options) const {
+  EnvOptions optimized_env_options(env_options);
+  optimized_env_options.use_direct_reads = db_options.use_direct_reads;
+  return optimized_env_options;
+}
+EnvOptions Env::OptimizeForBlobFileRead(
+    const EnvOptions& env_options, const ImmutableDBOptions& db_options) const {
+  EnvOptions optimized_env_options(env_options);
+  optimized_env_options.use_direct_reads = db_options.use_direct_reads;
+  return optimized_env_options;
+}
+
+EnvOptions::EnvOptions(const DBOptions& options) {
+  AssignEnvOptions(this, options);
+}
+
+EnvOptions::EnvOptions() {
+  DBOptions options;
+  AssignEnvOptions(this, options);
+}
+
+Status NewEnvLogger(const std::string& fname, Env* env,
+                    std::shared_ptr<Logger>* result) {
+  FileOptions options;
+  // TODO: Tune the buffer size.
+  options.writable_file_max_buffer_size = 1024 * 1024;
+  std::unique_ptr<FSWritableFile> writable_file;
+  const auto status = env->GetFileSystem()->NewWritableFile(
+      fname, options, &writable_file, nullptr);
+  if (!status.ok()) {
+    return status;
+  }
+
+  *result = std::make_shared<EnvLogger>(std::move(writable_file), fname,
+                                        options, env);
+  return Status::OK();
+}
+
+const std::shared_ptr<FileSystem>& Env::GetFileSystem() const {
+  return file_system_;
+}
+
+const std::shared_ptr<SystemClock>& Env::GetSystemClock() const {
+  return system_clock_;
+}
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo> sc_wrapper_type_info = {
+#ifndef ROCKSDB_LITE
+    {"target",
+     OptionTypeInfo::AsCustomSharedPtr<SystemClock>(
+         0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)},
+#endif  // ROCKSDB_LITE
+};
+
+}  // namespace
+SystemClockWrapper::SystemClockWrapper(const std::shared_ptr<SystemClock>& t)
+    : target_(t) {
+  RegisterOptions("", &target_, &sc_wrapper_type_info);
+}
+
+Status SystemClockWrapper::PrepareOptions(const ConfigOptions& options) {
+  if (target_ == nullptr) {
+    target_ = SystemClock::Default();
+  }
+  return SystemClock::PrepareOptions(options);
+}
+
+#ifndef ROCKSDB_LITE
+std::string SystemClockWrapper::SerializeOptions(
+    const ConfigOptions& config_options, const std::string& header) const {
+  auto parent = SystemClock::SerializeOptions(config_options, "");
+  if (config_options.IsShallow() || target_ == nullptr ||
+      target_->IsInstanceOf(SystemClock::kDefaultName())) {
+    return parent;
+  } else {
+    std::string result = header;
+    if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) {
+      result.append(OptionTypeInfo::kIdPropName()).append("=");
+    }
+    result.append(parent);
+    if (!EndsWith(result, config_options.delimiter)) {
+      result.append(config_options.delimiter);
+    }
+    result.append("target=").append(target_->ToString(config_options));
+    return result;
+  }
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinSystemClocks(ObjectLibrary& library,
+                                       const std::string& /*arg*/) {
+  library.AddFactory<SystemClock>(
+      EmulatedSystemClock::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<SystemClock>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new EmulatedSystemClock(SystemClock::Default()));
+        return guard->get();
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+
+Status SystemClock::CreateFromString(const ConfigOptions& config_options,
+                                     const std::string& value,
+                                     std::shared_ptr<SystemClock>* result) {
+  auto clock = SystemClock::Default();
+  if (clock->IsInstanceOf(value)) {
+    *result = clock;
+    return Status::OK();
+  } else {
+#ifndef ROCKSDB_LITE
+    static std::once_flag once;
+    std::call_once(once, [&]() {
+      RegisterBuiltinSystemClocks(*(ObjectLibrary::Default().get()), "");
+    });
+#endif  // ROCKSDB_LITE
+    return LoadSharedObject<SystemClock>(config_options, value, nullptr,
+                                         result);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/env_basic_test.cc b/src/rocksdb/env/env_basic_test.cc
new file mode 100644
index 000000000..0f18b3218
--- /dev/null
+++ b/src/rocksdb/env/env_basic_test.cc
@@ -0,0 +1,401 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "env/mock_env.h"
+#include "file/file_util.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/env_encryption.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+using CreateEnvFunc = Env*();
+
+// These functions are used to create the various environments under which this
+// test can execute. These functions are used to allow the test cases to be
+// created without the Env being initialized, thereby eliminating a potential
+// static initialization fiasco/race condition when attempting to get a
+// custom/configured env prior to main being invoked.
+
+static Env* GetDefaultEnv() { return Env::Default(); }
+
+static Env* GetMockEnv() {
+  static std::unique_ptr<Env> mock_env(MockEnv::Create(Env::Default()));
+  return mock_env.get();
+}
+#ifndef ROCKSDB_LITE
+static Env* NewTestEncryptedEnv(Env* base, const std::string& provider_id) {
+  ConfigOptions config_opts;
+  config_opts.invoke_prepare_options = false;
+
+  std::shared_ptr<EncryptionProvider> provider;
+  EXPECT_OK(EncryptionProvider::CreateFromString(config_opts, provider_id,
+                                                 &provider));
+  return NewEncryptedEnv(base, provider);
+}
+
+static Env* GetCtrEncryptedEnv() {
+  static std::unique_ptr<Env> ctr_encrypt_env(
+      NewTestEncryptedEnv(Env::Default(), "CTR://test"));
+  return ctr_encrypt_env.get();
+}
+
+static Env* GetMemoryEnv() {
+  static std::unique_ptr<Env> mem_env(NewMemEnv(Env::Default()));
+  return mem_env.get();
+}
+
+static Env* GetTestEnv() {
+  static std::shared_ptr<Env> env_guard;
+  static Env* custom_env = nullptr;
+  if (custom_env == nullptr) {
+    const char* uri = getenv("TEST_ENV_URI");
+    if (uri != nullptr) {
+      EXPECT_OK(Env::CreateFromUri(ConfigOptions(), uri, "", &custom_env,
+                                   &env_guard));
+    }
+  }
+  EXPECT_NE(custom_env, nullptr);
+  return custom_env;
+}
+
+static Env* GetTestFS() {
+  static std::shared_ptr<Env> fs_env_guard;
+  static Env* fs_env = nullptr;
+  if (fs_env == nullptr) {
+    const char* uri = getenv("TEST_FS_URI");
+    if (uri != nullptr) {
+      EXPECT_OK(
+          Env::CreateFromUri(ConfigOptions(), uri, "", &fs_env, &fs_env_guard));
+    }
+  }
+  EXPECT_NE(fs_env, nullptr);
+  return fs_env;
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace
+class EnvBasicTestWithParam
+    : public testing::Test,
+      public ::testing::WithParamInterface<CreateEnvFunc*> {
+ public:
+  Env* env_;
+  const EnvOptions soptions_;
+  std::string test_dir_;
+
+  EnvBasicTestWithParam() : env_(GetParam()()) {
+    test_dir_ = test::PerThreadDBPath(env_, "env_basic_test");
+  }
+
+  void SetUp() override { ASSERT_OK(env_->CreateDirIfMissing(test_dir_)); }
+
+  void TearDown() override { ASSERT_OK(DestroyDir(env_, test_dir_)); }
+};
+
+class EnvMoreTestWithParam : public EnvBasicTestWithParam {};
+
+INSTANTIATE_TEST_CASE_P(EnvDefault, EnvBasicTestWithParam,
+                        ::testing::Values(&GetDefaultEnv));
+INSTANTIATE_TEST_CASE_P(EnvDefault, EnvMoreTestWithParam,
+                        ::testing::Values(&GetDefaultEnv));
+
+INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam,
+                        ::testing::Values(&GetMockEnv));
+
+#ifndef ROCKSDB_LITE
+// next statements run env test against default encryption code.
+INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvBasicTestWithParam,
+                        ::testing::Values(&GetCtrEncryptedEnv));
+INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvMoreTestWithParam,
+                        ::testing::Values(&GetCtrEncryptedEnv));
+
+INSTANTIATE_TEST_CASE_P(MemEnv, EnvBasicTestWithParam,
+                        ::testing::Values(&GetMemoryEnv));
+
+namespace {
+
+// Returns a vector of 0 or 1 Env*, depending whether an Env is registered for
+// TEST_ENV_URI.
+//
+// The purpose of returning an empty vector (instead of nullptr) is that gtest
+// ValuesIn() will skip running tests when given an empty collection.
+std::vector<CreateEnvFunc*> GetCustomEnvs() {
+  std::vector<CreateEnvFunc*> res;
+  const char* uri = getenv("TEST_ENV_URI");
+  if (uri != nullptr) {
+    res.push_back(&GetTestEnv);
+  }
+  uri = getenv("TEST_FS_URI");
+  if (uri != nullptr) {
+    res.push_back(&GetTestFS);
+  }
+  return res;
+}
+
+}  // anonymous namespace
+
+INSTANTIATE_TEST_CASE_P(CustomEnv, EnvBasicTestWithParam,
+                        ::testing::ValuesIn(GetCustomEnvs()));
+
+INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam,
+                        ::testing::ValuesIn(GetCustomEnvs()));
+#endif  // ROCKSDB_LITE
+
+TEST_P(EnvBasicTestWithParam, Basics) {
+  uint64_t file_size;
+  std::unique_ptr<WritableFile> writable_file;
+  std::vector<std::string> children;
+
+  // Check that the directory is empty.
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/non_existent"));
+  ASSERT_TRUE(!env_->GetFileSize(test_dir_ + "/non_existent", &file_size).ok());
+  ASSERT_OK(env_->GetChildren(test_dir_, &children));
+  ASSERT_EQ(0U, children.size());
+
+  // Create a file.
+  ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+
+  // Check that the file exists.
+  ASSERT_OK(env_->FileExists(test_dir_ + "/f"));
+  ASSERT_OK(env_->GetFileSize(test_dir_ + "/f", &file_size));
+  ASSERT_EQ(0U, file_size);
+  ASSERT_OK(env_->GetChildren(test_dir_, &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+  ASSERT_OK(env_->DeleteFile(test_dir_ + "/f"));
+
+  // Write to the file.
+  ASSERT_OK(
+      env_->NewWritableFile(test_dir_ + "/f1", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("abc"));
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+  ASSERT_OK(
+      env_->NewWritableFile(test_dir_ + "/f2", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+
+  // Check for expected size.
+  ASSERT_OK(env_->GetFileSize(test_dir_ + "/f1", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that renaming works.
+  ASSERT_TRUE(
+      !env_->RenameFile(test_dir_ + "/non_existent", test_dir_ + "/g").ok());
+  ASSERT_OK(env_->RenameFile(test_dir_ + "/f1", test_dir_ + "/g"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/f1"));
+  ASSERT_OK(env_->FileExists(test_dir_ + "/g"));
+  ASSERT_OK(env_->GetFileSize(test_dir_ + "/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that renaming overwriting works
+  ASSERT_OK(env_->RenameFile(test_dir_ + "/f2", test_dir_ + "/g"));
+  ASSERT_OK(env_->GetFileSize(test_dir_ + "/g", &file_size));
+  ASSERT_EQ(0U, file_size);
+
+  // Check that opening non-existent file fails.
+  std::unique_ptr<SequentialFile> seq_file;
+  std::unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_TRUE(!env_->NewSequentialFile(test_dir_ + "/non_existent", &seq_file,
+                                       soptions_)
+                   .ok());
+  ASSERT_TRUE(!seq_file);
+  ASSERT_NOK(env_->NewRandomAccessFile(test_dir_ + "/non_existent", &rand_file,
+                                       soptions_));
+  ASSERT_TRUE(!rand_file);
+
+  // Check that deleting works.
+  ASSERT_NOK(env_->DeleteFile(test_dir_ + "/non_existent"));
+  ASSERT_OK(env_->DeleteFile(test_dir_ + "/g"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/g"));
+  ASSERT_OK(env_->GetChildren(test_dir_, &children));
+  ASSERT_EQ(0U, children.size());
+  Status s = env_->GetChildren(test_dir_ + "/non_existent", &children);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(EnvBasicTestWithParam, ReadWrite) {
+  std::unique_ptr<WritableFile> writable_file;
+  std::unique_ptr<SequentialFile> seq_file;
+  std::unique_ptr<RandomAccessFile> rand_file;
+  Slice result;
+  char scratch[100];
+
+  ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("hello "));
+  ASSERT_OK(writable_file->Append("world"));
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+
+  // Read sequentially.
+  ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(seq_file->Skip(1));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Try reading past EOF.
+  ASSERT_EQ(0U, result.size());
+  ASSERT_OK(seq_file->Skip(100));  // Try to skip past end of file.
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));
+  ASSERT_EQ(0U, result.size());
+
+  // Random reads.
+  ASSERT_OK(env_->NewRandomAccessFile(test_dir_ + "/f", &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(6, 5, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(rand_file->Read(0, 5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(rand_file->Read(10, 100, &result, scratch));  // Read "d".
+  ASSERT_EQ(0, result.compare("d"));
+
+  // Too high offset.
+  ASSERT_TRUE(rand_file->Read(1000, 5, &result, scratch).ok());
+}
+
+TEST_P(EnvBasicTestWithParam, Misc) {
+  std::unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile(test_dir_ + "/b", &writable_file, soptions_));
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(writable_file->Sync());
+  ASSERT_OK(writable_file->Flush());
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+}
+
+TEST_P(EnvBasicTestWithParam, LargeWrite) {
+  const size_t kWriteSize = 300 * 1024;
+  char* scratch = new char[kWriteSize * 2];
+
+  std::string write_data;
+  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
+  }
+
+  std::unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("foo"));
+  ASSERT_OK(writable_file->Append(write_data));
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+
+  std::unique_ptr<SequentialFile> seq_file;
+  Slice result;
+  ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(3, &result, scratch));  // Read "foo".
+  ASSERT_EQ(0, result.compare("foo"));
+
+  size_t read = 0;
+  std::string read_data;
+  while (read < kWriteSize) {
+    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+    read_data.append(result.data(), result.size());
+    read += result.size();
+  }
+  ASSERT_TRUE(write_data == read_data);
+  delete[] scratch;
+}
+
+TEST_P(EnvMoreTestWithParam, GetModTime) {
+  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/dir1"));
+  uint64_t mtime1 = 0x0;
+  ASSERT_OK(env_->GetFileModificationTime(test_dir_ + "/dir1", &mtime1));
+}
+
+TEST_P(EnvMoreTestWithParam, MakeDir) {
+  ASSERT_OK(env_->CreateDir(test_dir_ + "/j"));
+  ASSERT_OK(env_->FileExists(test_dir_ + "/j"));
+  std::vector<std::string> children;
+  ASSERT_OK(env_->GetChildren(test_dir_, &children));
+  ASSERT_EQ(1U, children.size());
+  // fail because file already exists
+  ASSERT_TRUE(!env_->CreateDir(test_dir_ + "/j").ok());
+  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/j"));
+  ASSERT_OK(env_->DeleteDir(test_dir_ + "/j"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/j"));
+}
+
+TEST_P(EnvMoreTestWithParam, GetChildren) {
+  // empty folder returns empty vector
+  std::vector<std::string> children;
+  std::vector<Env::FileAttributes> childAttr;
+  ASSERT_OK(env_->CreateDirIfMissing(test_dir_));
+  ASSERT_OK(env_->GetChildren(test_dir_, &children));
+  ASSERT_OK(env_->FileExists(test_dir_));
+  ASSERT_OK(env_->GetChildrenFileAttributes(test_dir_, &childAttr));
+  ASSERT_EQ(0U, children.size());
+  ASSERT_EQ(0U, childAttr.size());
+
+  // folder with contents returns relative path to test dir
+  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/niu"));
+  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/you"));
+  ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/guo"));
+  ASSERT_OK(env_->GetChildren(test_dir_, &children));
+  ASSERT_OK(env_->GetChildrenFileAttributes(test_dir_, &childAttr));
+  ASSERT_EQ(3U, children.size());
+  ASSERT_EQ(3U, childAttr.size());
+  for (auto each : children) {
+    env_->DeleteDir(test_dir_ + "/" + each).PermitUncheckedError();
+  }  // necessary for default POSIX env
+
+  // non-exist directory returns IOError
+  ASSERT_OK(env_->DeleteDir(test_dir_));
+  ASSERT_NOK(env_->FileExists(test_dir_));
+  ASSERT_NOK(env_->GetChildren(test_dir_, &children));
+  ASSERT_NOK(env_->GetChildrenFileAttributes(test_dir_, &childAttr));
+
+  // if dir is a file, returns IOError
+  ASSERT_OK(env_->CreateDir(test_dir_));
+  std::unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(
+      env_->NewWritableFile(test_dir_ + "/file", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+  ASSERT_NOK(env_->GetChildren(test_dir_ + "/file", &children));
+  ASSERT_EQ(0U, children.size());
+}
+
+TEST_P(EnvMoreTestWithParam, GetChildrenIgnoresDotAndDotDot) {
+  auto* env = Env::Default();
+  ASSERT_OK(env->CreateDirIfMissing(test_dir_));
+
+  // Create a single file
+  std::string path = test_dir_;
+  const EnvOptions soptions;
+#ifdef OS_WIN
+  path.append("\\test_file");
+#else
+  path.append("/test_file");
+#endif
+  std::string data("test data");
+  std::unique_ptr<WritableFile> file;
+  ASSERT_OK(env->NewWritableFile(path, &file, soptions));
+  ASSERT_OK(file->Append("test data"));
+
+  // get the children
+  std::vector<std::string> result;
+  ASSERT_OK(env->GetChildren(test_dir_, &result));
+
+  // expect only one file named `test_data`, i.e. no `.` or `..` names
+  ASSERT_EQ(result.size(), 1);
+  ASSERT_EQ(result.at(0), "test_file");
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/env/env_chroot.cc b/src/rocksdb/env/env_chroot.cc
new file mode 100644
index 000000000..a64373517
--- /dev/null
+++ b/src/rocksdb/env/env_chroot.cc
@@ -0,0 +1,148 @@
+//  Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
+
+#include "env/env_chroot.h"
+
+#include <errno.h>   // errno
+#include <stdlib.h>  // realpath, free
+#include <unistd.h>  // geteuid
+
+#include "env/composite_env_wrapper.h"
+#include "env/fs_remap.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"  // errnoStr
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo> chroot_fs_type_info = {
+    {"chroot_dir", {0, OptionType::kString}}};
+}  // namespace
+ChrootFileSystem::ChrootFileSystem(const std::shared_ptr<FileSystem>& base,
+                                   const std::string& chroot_dir)
+    : RemapFileSystem(base), chroot_dir_(chroot_dir) {
+  RegisterOptions("chroot_dir", &chroot_dir_, &chroot_fs_type_info);
+}
+
+Status ChrootFileSystem::PrepareOptions(const ConfigOptions& options) {
+  Status s = FileSystemWrapper::PrepareOptions(options);
+  if (!s.ok()) {
+    return s;
+  } else if (chroot_dir_.empty()) {
+    s = Status::InvalidArgument("ChRootFileSystem requires a chroot dir");
+  } else {
+    s = target_->FileExists(chroot_dir_, IOOptions(), nullptr);
+  }
+  if (s.ok()) {
+#if defined(OS_AIX)
+    char resolvedName[PATH_MAX];
+    char* real_chroot_dir = realpath(chroot_dir_.c_str(), resolvedName);
+#else
+    char* real_chroot_dir = realpath(chroot_dir_.c_str(), nullptr);
+#endif
+    // chroot_dir must exist so realpath() returns non-nullptr.
+    assert(real_chroot_dir != nullptr);
+    chroot_dir_ = real_chroot_dir;
+#if !defined(OS_AIX)
+    free(real_chroot_dir);
+#endif
+  }
+  return s;
+}
+
+IOStatus ChrootFileSystem::GetTestDirectory(const IOOptions& options,
+                                            std::string* path,
+                                            IODebugContext* dbg) {
+  // Adapted from PosixEnv's implementation since it doesn't provide a way to
+  // create directory in the chroot.
+  char buf[256];
+  snprintf(buf, sizeof(buf), "/rocksdbtest-%d", static_cast<int>(geteuid()));
+  *path = buf;
+
+  // Directory may already exist, so ignore return
+  return CreateDirIfMissing(*path, options, dbg);
+}
+
+// Returns status and expanded absolute path including the chroot directory.
+// Checks whether the provided path breaks out of the chroot. If it returns
+// non-OK status, the returned path should not be used.
+std::pair<IOStatus, std::string> ChrootFileSystem::EncodePath(
+    const std::string& path) {
+  if (path.empty() || path[0] != '/') {
+    return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""};
+  }
+  std::pair<IOStatus, std::string> res;
+  res.second = chroot_dir_ + path;
+#if defined(OS_AIX)
+  char resolvedName[PATH_MAX];
+  char* normalized_path = realpath(res.second.c_str(), resolvedName);
+#else
+  char* normalized_path = realpath(res.second.c_str(), nullptr);
+#endif
+  if (normalized_path == nullptr) {
+    res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str());
+  } else if (strlen(normalized_path) < chroot_dir_.size() ||
+             strncmp(normalized_path, chroot_dir_.c_str(),
+                     chroot_dir_.size()) != 0) {
+    res.first = IOStatus::IOError(res.second,
+                                  "Attempted to access path outside chroot");
+  } else {
+    res.first = IOStatus::OK();
+  }
+#if !defined(OS_AIX)
+  free(normalized_path);
+#endif
+  return res;
+}
+
+// Similar to EncodePath() except assumes the basename in the path hasn't been
+// created yet.
+std::pair<IOStatus, std::string> ChrootFileSystem::EncodePathWithNewBasename(
+    const std::string& path) {
+  if (path.empty() || path[0] != '/') {
+    return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""};
+  }
+  // Basename may be followed by trailing slashes
+  size_t final_idx = path.find_last_not_of('/');
+  if (final_idx == std::string::npos) {
+    // It's only slashes so no basename to extract
+    return EncodePath(path);
+  }
+
+  // Pull off the basename temporarily since realname(3) (used by
+  // EncodePath()) requires a path that exists
+  size_t base_sep = path.rfind('/', final_idx);
+  auto status_and_enc_path = EncodePath(path.substr(0, base_sep + 1));
+  status_and_enc_path.second.append(path.substr(base_sep + 1));
+  return status_and_enc_path;
+}
+
+std::shared_ptr<FileSystem> NewChrootFileSystem(
+    const std::shared_ptr<FileSystem>& base, const std::string& chroot_dir) {
+  auto chroot_fs = std::make_shared<ChrootFileSystem>(base, chroot_dir);
+  Status s = chroot_fs->PrepareOptions(ConfigOptions());
+  if (s.ok()) {
+    return chroot_fs;
+  } else {
+    return nullptr;
+  }
+}
+
+Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir) {
+  if (!base_env->FileExists(chroot_dir).ok()) {
+    return nullptr;
+  }
+  auto chroot_fs = NewChrootFileSystem(base_env->GetFileSystem(), chroot_dir);
+  if (chroot_fs != nullptr) {
+    return new CompositeEnvWrapper(base_env, chroot_fs);
+  } else {
+    return nullptr;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
diff --git a/src/rocksdb/env/env_chroot.h b/src/rocksdb/env/env_chroot.h
new file mode 100644
index 000000000..9e5b9a1e9
--- /dev/null
+++ b/src/rocksdb/env/env_chroot.h
@@ -0,0 +1,55 @@
+//  Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
+
+#include <string>
+
+#include "env/fs_remap.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ChrootFileSystem : public RemapFileSystem {
+ public:
+  ChrootFileSystem(const std::shared_ptr<FileSystem>& base,
+                   const std::string& chroot_dir);
+
+  static const char* kClassName() { return "ChrootFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                            IODebugContext* dbg) override;
+
+  Status PrepareOptions(const ConfigOptions& options) override;
+
+ protected:
+  // Returns status and expanded absolute path including the chroot directory.
+  // Checks whether the provided path breaks out of the chroot. If it returns
+  // non-OK status, the returned path should not be used.
+  std::pair<IOStatus, std::string> EncodePath(const std::string& path) override;
+
+  // Similar to EncodePath() except assumes the basename in the path hasn't been
+  // created yet.
+  std::pair<IOStatus, std::string> EncodePathWithNewBasename(
+      const std::string& path) override;
+
+ private:
+  std::string chroot_dir_;
+};
+
+// Returns an Env that translates paths such that the root directory appears to
+// be chroot_dir. chroot_dir should refer to an existing directory.
+//
+// This class has not been fully analyzed for providing strong security
+// guarantees.
+Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir);
+std::shared_ptr<FileSystem> NewChrootFileSystem(
+    const std::shared_ptr<FileSystem>& base, const std::string& chroot_dir);
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
diff --git a/src/rocksdb/env/env_encryption.cc b/src/rocksdb/env/env_encryption.cc
new file mode 100644
index 000000000..c6b0a257d
--- /dev/null
+++ b/src/rocksdb/env/env_encryption.cc
@@ -0,0 +1,1351 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/env_encryption.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <iostream>
+
+#include "env/composite_env_wrapper.h"
+#include "env/env_encryption_ctr.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/aligned_buffer.h"
+#include "util/coding.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+#endif
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+std::shared_ptr<EncryptionProvider> EncryptionProvider::NewCTRProvider(
+    const std::shared_ptr<BlockCipher>& cipher) {
+  return std::make_shared<CTREncryptionProvider>(cipher);
+}
+
+// Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+// written by this routine.  Sets "*result" to the data that was
+// read (including if fewer than "n" bytes were successfully read).
+// May set "*result" to point at data in "scratch[0..n-1]", so
+// "scratch[0..n-1]" must be live when "*result" is used.
+// If an error was encountered, returns a non-OK status.
+//
+// REQUIRES: External synchronization
+IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options,
+                                       Slice* result, char* scratch,
+                                       IODebugContext* dbg) {
+  assert(scratch);
+  IOStatus io_s = file_->Read(n, options, result, scratch, dbg);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    io_s = status_to_io_status(
+        stream_->Decrypt(offset_, (char*)result->data(), result->size()));
+  }
+  if (io_s.ok()) {
+    offset_ += result->size();  // We've already ready data from disk, so update
+                                // offset_ even if decryption fails.
+  }
+  return io_s;
+}
+
+// Skip "n" bytes from the file. This is guaranteed to be no
+// slower that reading the same data, but may be faster.
+//
+// If end of file is reached, skipping will stop at the end of the
+// file, and Skip will return OK.
+//
+// REQUIRES: External synchronization
+IOStatus EncryptedSequentialFile::Skip(uint64_t n) {
+  auto status = file_->Skip(n);
+  if (!status.ok()) {
+    return status;
+  }
+  offset_ += n;
+  return status;
+}
+
+// Indicates the upper layers if the current SequentialFile implementation
+// uses direct IO.
+bool EncryptedSequentialFile::use_direct_io() const {
+  return file_->use_direct_io();
+}
+
+// Use the returned alignment value to allocate
+// aligned buffer for Direct I/O
+size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const {
+  return file_->GetRequiredBufferAlignment();
+}
+
+// Remove any kind of caching of data from the offset to offset+length
+// of this file. If the length is 0, then it refers to the end of file.
+// If the system is not caching the file contents, then this is a noop.
+IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset,
+                                                  size_t length) {
+  return file_->InvalidateCache(offset + prefixLength_, length);
+}
+
+// Positioned Read for direct I/O
+// If Direct I/O enabled, offset, n, and scratch should be properly aligned
+IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n,
+                                                 const IOOptions& options,
+                                                 Slice* result, char* scratch,
+                                                 IODebugContext* dbg) {
+  assert(scratch);
+  offset += prefixLength_;  // Skip prefix
+  auto io_s = file_->PositionedRead(offset, n, options, result, scratch, dbg);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  offset_ = offset + result->size();
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    io_s = status_to_io_status(
+        stream_->Decrypt(offset, (char*)result->data(), result->size()));
+  }
+  return io_s;
+}
+
+// Read up to "n" bytes from the file starting at "offset".
+// "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+// to the data that was read (including if fewer than "n" bytes were
+// successfully read).  May set "*result" to point at data in
+// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+// "*result" is used.  If an error was encountered, returns a non-OK
+// status.
+//
+// Safe for concurrent use by multiple threads.
+// If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n,
+                                         const IOOptions& options,
+                                         Slice* result, char* scratch,
+                                         IODebugContext* dbg) const {
+  assert(scratch);
+  offset += prefixLength_;
+  auto io_s = file_->Read(offset, n, options, result, scratch, dbg);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    io_s = status_to_io_status(
+        stream_->Decrypt(offset, (char*)result->data(), result->size()));
+  }
+  return io_s;
+}
+
+// Readahead the file starting from offset by n bytes for caching.
+IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  // return Status::OK();
+  return file_->Prefetch(offset + prefixLength_, n, options, dbg);
+}
+
+// Tries to get an unique ID for this file that will be the same each time
+// the file is opened (and will stay the same while the file is open).
+// Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+// ID can be created this function returns the length of the ID and places it
+// in "id"; otherwise, this function returns 0, in which case "id"
+// may not have been modified.
+//
+// This function guarantees, for IDs from a given environment, two unique ids
+// cannot be made equal to each other by adding arbitrary bytes to one of
+// them. That is, no unique ID is the prefix of another.
+//
+// This function guarantees that the returned ID will not be interpretable as
+// a single varint.
+//
+// Note: these IDs are only valid for the duration of the process.
+size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+  return file_->GetUniqueId(id, max_size);
+};
+
+void EncryptedRandomAccessFile::Hint(AccessPattern pattern) {
+  file_->Hint(pattern);
+}
+
+// Indicates the upper layers if the current RandomAccessFile implementation
+// uses direct IO.
+bool EncryptedRandomAccessFile::use_direct_io() const {
+  return file_->use_direct_io();
+}
+
+// Use the returned alignment value to allocate
+// aligned buffer for Direct I/O
+size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const {
+  return file_->GetRequiredBufferAlignment();
+}
+
+// Remove any kind of caching of data from the offset to offset+length
+// of this file. If the length is 0, then it refers to the end of file.
+// If the system is not caching the file contents, then this is a noop.
+IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset,
+                                                    size_t length) {
+  return file_->InvalidateCache(offset + prefixLength_, length);
+}
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+IOStatus EncryptedWritableFile::Append(const Slice& data,
+                                       const IOOptions& options,
+                                       IODebugContext* dbg) {
+  AlignedBuffer buf;
+  Slice dataToAppend(data);
+  if (data.size() > 0) {
+    auto offset = file_->GetFileSize(options, dbg);  // size including prefix
+    // Encrypt in cloned buffer
+    buf.Alignment(GetRequiredBufferAlignment());
+    buf.AllocateNewBuffer(data.size());
+    // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove
+    // so that the next two lines can be replaced with buf.Append().
+    memmove(buf.BufferStart(), data.data(), data.size());
+    buf.Size(data.size());
+    IOStatus io_s;
+    {
+      PERF_TIMER_GUARD(encrypt_data_nanos);
+      io_s = status_to_io_status(
+          stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()));
+    }
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize());
+  }
+  return file_->Append(dataToAppend, options, dbg);
+}
+
+IOStatus EncryptedWritableFile::PositionedAppend(const Slice& data,
+                                                 uint64_t offset,
+                                                 const IOOptions& options,
+                                                 IODebugContext* dbg) {
+  AlignedBuffer buf;
+  Slice dataToAppend(data);
+  offset += prefixLength_;
+  if (data.size() > 0) {
+    // Encrypt in cloned buffer
+    buf.Alignment(GetRequiredBufferAlignment());
+    buf.AllocateNewBuffer(data.size());
+    memmove(buf.BufferStart(), data.data(), data.size());
+    buf.Size(data.size());
+    IOStatus io_s;
+    {
+      PERF_TIMER_GUARD(encrypt_data_nanos);
+      io_s = status_to_io_status(
+          stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()));
+    }
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize());
+  }
+  return file_->PositionedAppend(dataToAppend, offset, options, dbg);
+}
+
+// Indicates the upper layers if the current WritableFile implementation
+// uses direct IO.
+bool EncryptedWritableFile::use_direct_io() const {
+  return file_->use_direct_io();
+}
+
+// true if Sync() and Fsync() are safe to call concurrently with Append()
+// and Flush().
+bool EncryptedWritableFile::IsSyncThreadSafe() const {
+  return file_->IsSyncThreadSafe();
+}
+
+// Use the returned alignment value to allocate
+// aligned buffer for Direct I/O
+size_t EncryptedWritableFile::GetRequiredBufferAlignment() const {
+  return file_->GetRequiredBufferAlignment();
+}
+
+/*
+ * Get the size of valid data in the file.
+ */
+uint64_t EncryptedWritableFile::GetFileSize(const IOOptions& options,
+                                            IODebugContext* dbg) {
+  return file_->GetFileSize(options, dbg) - prefixLength_;
+}
+
+// Truncate is necessary to trim the file to the correct size
+// before closing. It is not always possible to keep track of the file
+// size due to whole pages writes. The behavior is undefined if called
+// with other writes to follow.
+IOStatus EncryptedWritableFile::Truncate(uint64_t size,
+                                         const IOOptions& options,
+                                         IODebugContext* dbg) {
+  return file_->Truncate(size + prefixLength_, options, dbg);
+}
+
+// Remove any kind of caching of data from the offset to offset+length
+// of this file. If the length is 0, then it refers to the end of file.
+// If the system is not caching the file contents, then this is a noop.
+// This call has no effect on dirty pages in the cache.
+IOStatus EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) {
+  return file_->InvalidateCache(offset + prefixLength_, length);
+}
+
+// Sync a file range with disk.
+// offset is the starting byte of the file range to be synchronized.
+// nbytes specifies the length of the range to be synchronized.
+// This asks the OS to initiate flushing the cached data to disk,
+// without waiting for completion.
+// Default implementation does nothing.
+IOStatus EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
+                                          const IOOptions& options,
+                                          IODebugContext* dbg) {
+  return file_->RangeSync(offset + prefixLength_, nbytes, options, dbg);
+}
+
+// PrepareWrite performs any necessary preparation for a write
+// before the write actually occurs.  This allows for pre-allocation
+// of space on devices where it can result in less file
+// fragmentation and/or less waste from over-zealous filesystem
+// pre-allocation.
+void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len,
+                                         const IOOptions& options,
+                                         IODebugContext* dbg) {
+  file_->PrepareWrite(offset + prefixLength_, len, options, dbg);
+}
+
+void EncryptedWritableFile::SetPreallocationBlockSize(size_t size) {
+  // the size here doesn't need to include prefixLength_, as it's a
+  // configuration will be use for `PrepareWrite()`.
+  file_->SetPreallocationBlockSize(size);
+}
+
+void EncryptedWritableFile::GetPreallocationStatus(
+    size_t* block_size, size_t* last_allocated_block) {
+  file_->GetPreallocationStatus(block_size, last_allocated_block);
+}
+
+// Pre-allocates space for a file.
+IOStatus EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len,
+                                         const IOOptions& options,
+                                         IODebugContext* dbg) {
+  return file_->Allocate(offset + prefixLength_, len, options, dbg);
+}
+
+IOStatus EncryptedWritableFile::Flush(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Flush(options, dbg);
+}
+
+IOStatus EncryptedWritableFile::Sync(const IOOptions& options,
+                                     IODebugContext* dbg) {
+  return file_->Sync(options, dbg);
+}
+
+IOStatus EncryptedWritableFile::Close(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Close(options, dbg);
+}
+
+// A file abstraction for random reading and writing.
+
+// Indicates if the class makes use of direct I/O
+// If false you must pass aligned buffer to Write()
+bool EncryptedRandomRWFile::use_direct_io() const {
+  return file_->use_direct_io();
+}
+
+// Use the returned alignment value to allocate
+// aligned buffer for Direct I/O
+size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const {
+  return file_->GetRequiredBufferAlignment();
+}
+
+// Write bytes in `data` at  offset `offset`, Returns Status::OK() on success.
+// Pass aligned buffer when use_direct_io() returns true.
+IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data,
+                                      const IOOptions& options,
+                                      IODebugContext* dbg) {
+  AlignedBuffer buf;
+  Slice dataToWrite(data);
+  offset += prefixLength_;
+  if (data.size() > 0) {
+    // Encrypt in cloned buffer
+    buf.Alignment(GetRequiredBufferAlignment());
+    buf.AllocateNewBuffer(data.size());
+    memmove(buf.BufferStart(), data.data(), data.size());
+    buf.Size(data.size());
+    IOStatus io_s;
+    {
+      PERF_TIMER_GUARD(encrypt_data_nanos);
+      io_s = status_to_io_status(
+          stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()));
+    }
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize());
+  }
+  return file_->Write(offset, dataToWrite, options, dbg);
+}
+
+// Read up to `n` bytes starting from offset `offset` and store them in
+// result, provided `scratch` size should be at least `n`.
+// Returns Status::OK() on success.
+IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n,
+                                     const IOOptions& options, Slice* result,
+                                     char* scratch, IODebugContext* dbg) const {
+  assert(scratch);
+  offset += prefixLength_;
+  auto status = file_->Read(offset, n, options, result, scratch, dbg);
+  if (!status.ok()) {
+    return status;
+  }
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    status = status_to_io_status(
+        stream_->Decrypt(offset, (char*)result->data(), result->size()));
+  }
+  return status;
+}
+
+IOStatus EncryptedRandomRWFile::Flush(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Flush(options, dbg);
+}
+
+IOStatus EncryptedRandomRWFile::Sync(const IOOptions& options,
+                                     IODebugContext* dbg) {
+  return file_->Sync(options, dbg);
+}
+
+IOStatus EncryptedRandomRWFile::Fsync(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Fsync(options, dbg);
+}
+
+IOStatus EncryptedRandomRWFile::Close(const IOOptions& options,
+                                      IODebugContext* dbg) {
+  return file_->Close(options, dbg);
+}
+
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo> encrypted_fs_type_info =
+    {
+        {"provider",
+         OptionTypeInfo::AsCustomSharedPtr<EncryptionProvider>(
+             0 /* No offset, whole struct*/, OptionVerificationType::kByName,
+             OptionTypeFlags::kNone)},
+};
+// EncryptedFileSystemImpl implements an FileSystemWrapper that adds encryption
+// to files stored on disk.
+class EncryptedFileSystemImpl : public EncryptedFileSystem {
+ public:
+  const char* Name() const override {
+    return EncryptedFileSystem::kClassName();
+  }
+  // Returns the raw encryption provider that should be used to write the input
+  // encrypted file.  If there is no such provider, NotFound is returned.
+  IOStatus GetWritableProvider(const std::string& /*fname*/,
+                               EncryptionProvider** result) {
+    if (provider_) {
+      *result = provider_.get();
+      return IOStatus::OK();
+    } else {
+      *result = nullptr;
+      return IOStatus::NotFound("No WriteProvider specified");
+    }
+  }
+
+  // Returns the raw encryption provider that should be used to read the input
+  // encrypted file.  If there is no such provider, NotFound is returned.
+  IOStatus GetReadableProvider(const std::string& /*fname*/,
+                               EncryptionProvider** result) {
+    if (provider_) {
+      *result = provider_.get();
+      return IOStatus::OK();
+    } else {
+      *result = nullptr;
+      return IOStatus::NotFound("No Provider specified");
+    }
+  }
+
+  // Creates a CipherStream for the underlying file/name using the options
+  // If a writable provider is found and encryption is enabled, uses
+  // this provider to create a cipher stream.
+  // @param fname         Name of the writable file
+  // @param underlying    The underlying "raw" file
+  // @param options       Options for creating the file/cipher
+  // @param prefix_length Returns the length of the encryption prefix used for
+  // this file
+  // @param stream        Returns the cipher stream to use for this file if it
+  // should be encrypted
+  // @return OK on success, non-OK on failure.
+  template <class TypeFile>
+  IOStatus CreateWritableCipherStream(
+      const std::string& fname, const std::unique_ptr<TypeFile>& underlying,
+      const FileOptions& options, size_t* prefix_length,
+      std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) {
+    EncryptionProvider* provider = nullptr;
+    *prefix_length = 0;
+    IOStatus status = GetWritableProvider(fname, &provider);
+    if (!status.ok()) {
+      return status;
+    } else if (provider != nullptr) {
+      // Initialize & write prefix (if needed)
+      AlignedBuffer buffer;
+      Slice prefix;
+      *prefix_length = provider->GetPrefixLength();
+      if (*prefix_length > 0) {
+        // Initialize prefix
+        buffer.Alignment(underlying->GetRequiredBufferAlignment());
+        buffer.AllocateNewBuffer(*prefix_length);
+        status = status_to_io_status(provider->CreateNewPrefix(
+            fname, buffer.BufferStart(), *prefix_length));
+        if (status.ok()) {
+          buffer.Size(*prefix_length);
+          prefix = Slice(buffer.BufferStart(), buffer.CurrentSize());
+          // Write prefix
+          status = underlying->Append(prefix, options.io_options, dbg);
+        }
+        if (!status.ok()) {
+          return status;
+        }
+      }
+      // Create cipher stream
+      status = status_to_io_status(
+          provider->CreateCipherStream(fname, options, prefix, stream));
+    }
+    return status;
+  }
+
+  template <class TypeFile>
+  IOStatus CreateWritableEncryptedFile(const std::string& fname,
+                                       std::unique_ptr<TypeFile>& underlying,
+                                       const FileOptions& options,
+                                       std::unique_ptr<TypeFile>* result,
+                                       IODebugContext* dbg) {
+    // Create cipher stream
+    std::unique_ptr<BlockAccessCipherStream> stream;
+    size_t prefix_length;
+    IOStatus status = CreateWritableCipherStream(fname, underlying, options,
+                                                 &prefix_length, &stream, dbg);
+    if (status.ok()) {
+      if (stream) {
+        result->reset(new EncryptedWritableFile(
+            std::move(underlying), std::move(stream), prefix_length));
+      } else {
+        result->reset(underlying.release());
+      }
+    }
+    return status;
+  }
+
+  // Creates a CipherStream for the underlying file/name using the options
+  // If a writable provider is found and encryption is enabled, uses
+  // this provider to create a cipher stream.
+  // @param fname         Name of the writable file
+  // @param underlying    The underlying "raw" file
+  // @param options       Options for creating the file/cipher
+  // @param prefix_length Returns the length of the encryption prefix used for
+  // this file
+  // @param stream        Returns the cipher stream to use for this file if it
+  // should be encrypted
+  // @return OK on success, non-OK on failure.
+  template <class TypeFile>
+  IOStatus CreateRandomWriteCipherStream(
+      const std::string& fname, const std::unique_ptr<TypeFile>& underlying,
+      const FileOptions& options, size_t* prefix_length,
+      std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) {
+    EncryptionProvider* provider = nullptr;
+    *prefix_length = 0;
+    IOStatus io_s = GetWritableProvider(fname, &provider);
+    if (!io_s.ok()) {
+      return io_s;
+    } else if (provider != nullptr) {
+      // Initialize & write prefix (if needed)
+      AlignedBuffer buffer;
+      Slice prefix;
+      *prefix_length = provider->GetPrefixLength();
+      if (*prefix_length > 0) {
+        // Initialize prefix
+        buffer.Alignment(underlying->GetRequiredBufferAlignment());
+        buffer.AllocateNewBuffer(*prefix_length);
+        io_s = status_to_io_status(provider->CreateNewPrefix(
+            fname, buffer.BufferStart(), *prefix_length));
+        if (io_s.ok()) {
+          buffer.Size(*prefix_length);
+          prefix = Slice(buffer.BufferStart(), buffer.CurrentSize());
+          // Write prefix
+          io_s = underlying->Write(0, prefix, options.io_options, dbg);
+        }
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      }
+      // Create cipher stream
+      io_s = status_to_io_status(
+          provider->CreateCipherStream(fname, options, prefix, stream));
+    }
+    return io_s;
+  }
+
+  // Creates a CipherStream for the underlying file/name using the options
+  // If a readable provider is found and the file is encrypted, uses
+  // this provider to create a cipher stream.
+  // @param fname         Name of the writable file
+  // @param underlying    The underlying "raw" file
+  // @param options       Options for creating the file/cipher
+  // @param prefix_length Returns the length of the encryption prefix used for
+  // this file
+  // @param stream        Returns the cipher stream to use for this file if it
+  // is encrypted
+  // @return OK on success, non-OK on failure.
+  template <class TypeFile>
+  IOStatus CreateSequentialCipherStream(
+      const std::string& fname, const std::unique_ptr<TypeFile>& underlying,
+      const FileOptions& options, size_t* prefix_length,
+      std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) {
+    // Read prefix (if needed)
+    AlignedBuffer buffer;
+    Slice prefix;
+    *prefix_length = provider_->GetPrefixLength();
+    if (*prefix_length > 0) {
+      // Read prefix
+      buffer.Alignment(underlying->GetRequiredBufferAlignment());
+      buffer.AllocateNewBuffer(*prefix_length);
+      IOStatus status = underlying->Read(*prefix_length, options.io_options,
+                                         &prefix, buffer.BufferStart(), dbg);
+      if (!status.ok()) {
+        return status;
+      }
+      buffer.Size(*prefix_length);
+    }
+    return status_to_io_status(
+        provider_->CreateCipherStream(fname, options, prefix, stream));
+  }
+
+  // Creates a CipherStream for the underlying file/name using the options
+  // If a readable provider is found and the file is encrypted, uses
+  // this provider to create a cipher stream.
+  // @param fname         Name of the writable file
+  // @param underlying    The underlying "raw" file
+  // @param options       Options for creating the file/cipher
+  // @param prefix_length Returns the length of the encryption prefix used for
+  // this file
+  // @param stream        Returns the cipher stream to use for this file if it
+  // is encrypted
+  // @return OK on success, non-OK on failure.
+  template <class TypeFile>
+  IOStatus CreateRandomReadCipherStream(
+      const std::string& fname, const std::unique_ptr<TypeFile>& underlying,
+      const FileOptions& options, size_t* prefix_length,
+      std::unique_ptr<BlockAccessCipherStream>* stream, IODebugContext* dbg) {
+    // Read prefix (if needed)
+    AlignedBuffer buffer;
+    Slice prefix;
+    *prefix_length = provider_->GetPrefixLength();
+    if (*prefix_length > 0) {
+      // Read prefix
+      buffer.Alignment(underlying->GetRequiredBufferAlignment());
+      buffer.AllocateNewBuffer(*prefix_length);
+      IOStatus status = underlying->Read(0, *prefix_length, options.io_options,
+                                         &prefix, buffer.BufferStart(), dbg);
+      if (!status.ok()) {
+        return status;
+      }
+      buffer.Size(*prefix_length);
+    }
+    return status_to_io_status(
+        provider_->CreateCipherStream(fname, options, prefix, stream));
+  }
+
+ public:
+  EncryptedFileSystemImpl(const std::shared_ptr<FileSystem>& base,
+                          const std::shared_ptr<EncryptionProvider>& provider)
+      : EncryptedFileSystem(base) {
+    provider_ = provider;
+    RegisterOptions("EncryptionProvider", &provider_, &encrypted_fs_type_info);
+  }
+
+  Status AddCipher(const std::string& descriptor, const char* cipher,
+                   size_t len, bool for_write) override {
+    return provider_->AddCipher(descriptor, cipher, len, for_write);
+  }
+
+  // NewSequentialFile opens a file for sequential reading.
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_reads) {
+      return IOStatus::InvalidArgument();
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSSequentialFile> underlying;
+    auto status =
+        FileSystemWrapper::NewSequentialFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    uint64_t file_size;
+    status = FileSystemWrapper::GetFileSize(fname, options.io_options,
+                                            &file_size, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    if (!file_size) {
+      *result = std::move(underlying);
+      return status;
+    }
+    // Create cipher stream
+    std::unique_ptr<BlockAccessCipherStream> stream;
+    size_t prefix_length;
+    status = CreateSequentialCipherStream(fname, underlying, options,
+                                          &prefix_length, &stream, dbg);
+    if (status.ok()) {
+      result->reset(new EncryptedSequentialFile(
+          std::move(underlying), std::move(stream), prefix_length));
+    }
+    return status;
+  }
+
+  // NewRandomAccessFile opens a file for random read access.
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& options,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_reads) {
+      return IOStatus::InvalidArgument();
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSRandomAccessFile> underlying;
+    auto status = FileSystemWrapper::NewRandomAccessFile(fname, options,
+                                                         &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    std::unique_ptr<BlockAccessCipherStream> stream;
+    size_t prefix_length;
+    status = CreateRandomReadCipherStream(fname, underlying, options,
+                                          &prefix_length, &stream, dbg);
+    if (status.ok()) {
+      if (stream) {
+        result->reset(new EncryptedRandomAccessFile(
+            std::move(underlying), std::move(stream), prefix_length));
+      } else {
+        result->reset(underlying.release());
+      }
+    }
+    return status;
+  }
+
+  // NewWritableFile opens a file for sequential writing.
+  IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSWritableFile> underlying;
+    IOStatus status =
+        FileSystemWrapper::NewWritableFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    return CreateWritableEncryptedFile(fname, underlying, options, result, dbg);
+  }
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSWritableFile> underlying;
+    IOStatus status =
+        FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    return CreateWritableEncryptedFile(fname, underlying, options, result, dbg);
+  }
+
+  // Reuse an existing file by renaming it and opening it as writable.
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();
+    }
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSWritableFile> underlying;
+    auto status = FileSystemWrapper::ReuseWritableFile(
+        fname, old_fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    return CreateWritableEncryptedFile(fname, underlying, options, result, dbg);
+  }
+
+  // Open `fname` for random read and write, if file doesn't exist the file
+  // will be created.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override {
+    result->reset();
+    if (options.use_mmap_reads || options.use_mmap_writes) {
+      return IOStatus::InvalidArgument();
+    }
+    // Check file exists
+    bool isNewFile = !FileExists(fname, options.io_options, dbg).ok();
+
+    // Open file using underlying Env implementation
+    std::unique_ptr<FSRandomRWFile> underlying;
+    auto status =
+        FileSystemWrapper::NewRandomRWFile(fname, options, &underlying, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    // Create cipher stream
+    std::unique_ptr<BlockAccessCipherStream> stream;
+    size_t prefix_length = 0;
+    if (!isNewFile) {
+      // File already exists, read prefix
+      status = CreateRandomReadCipherStream(fname, underlying, options,
+                                            &prefix_length, &stream, dbg);
+    } else {
+      status = CreateRandomWriteCipherStream(fname, underlying, options,
+                                             &prefix_length, &stream, dbg);
+    }
+    if (status.ok()) {
+      if (stream) {
+        result->reset(new EncryptedRandomRWFile(
+            std::move(underlying), std::move(stream), prefix_length));
+      } else {
+        result->reset(underlying.release());
+      }
+    }
+    return status;
+  }
+
+  // Store in *result the attributes of the children of the specified
+  // directory.
+  // In case the implementation lists the directory prior to iterating the
+  // files
+  // and files are concurrently deleted, the deleted files will be omitted
+  // from
+  // result.
+  // The name attributes are relative to "dir".
+  // Original contents of *results are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not
+  //         have
+  //                  permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& options,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* dbg) override {
+    auto status =
+        FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+    for (auto it = std::begin(*result); it != std::end(*result); ++it) {
+      // assert(it->size_bytes >= prefixLength);
+      //  breaks env_basic_test when called on directory containing
+      //  directories
+      // which makes subtraction of prefixLength worrisome since
+      // FileAttributes does not identify directories
+      EncryptionProvider* provider;
+      status = GetReadableProvider(it->name, &provider);
+      if (!status.ok()) {
+        return status;
+      } else if (provider != nullptr) {
+        it->size_bytes -= provider->GetPrefixLength();
+      }
+    }
+    return IOStatus::OK();
+  }
+
+  // Store the size of fname in *file_size.
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override {
+    auto status =
+        FileSystemWrapper::GetFileSize(fname, options, file_size, dbg);
+    if (!status.ok() || !(*file_size)) {
+      return status;
+    }
+    EncryptionProvider* provider;
+    status = GetReadableProvider(fname, &provider);
+    if (provider != nullptr && status.ok()) {
+      size_t prefixLength = provider->GetPrefixLength();
+      assert(*file_size >= prefixLength);
+      *file_size -= prefixLength;
+    }
+    return status;
+  }
+
+ private:
+  std::shared_ptr<EncryptionProvider> provider_;
+};
+}  // namespace
+
+Status NewEncryptedFileSystemImpl(
+    const std::shared_ptr<FileSystem>& base,
+    const std::shared_ptr<EncryptionProvider>& provider,
+    std::unique_ptr<FileSystem>* result) {
+  result->reset(new EncryptedFileSystemImpl(base, provider));
+  return Status::OK();
+}
+
+std::shared_ptr<FileSystem> NewEncryptedFS(
+    const std::shared_ptr<FileSystem>& base,
+    const std::shared_ptr<EncryptionProvider>& provider) {
+  std::unique_ptr<FileSystem> efs;
+  Status s = NewEncryptedFileSystemImpl(base, provider, &efs);
+  if (s.ok()) {
+    s = efs->PrepareOptions(ConfigOptions());
+  }
+  if (s.ok()) {
+    std::shared_ptr<FileSystem> result(efs.release());
+    return result;
+  } else {
+    return nullptr;
+  }
+}
+// Returns an Env that encrypts data when stored on disk and decrypts data when
+// read from disk.
+Env* NewEncryptedEnv(Env* base_env,
+                     const std::shared_ptr<EncryptionProvider>& provider) {
+  return new CompositeEnvWrapper(
+      base_env, NewEncryptedFS(base_env->GetFileSystem(), provider));
+}
+
+// Encrypt one or more (partial) blocks of data at the file offset.
+// Length of data is given in dataSize.
+Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data,
+                                        size_t dataSize) {
+  // Calculate block index
+  auto blockSize = BlockSize();
+  uint64_t blockIndex = fileOffset / blockSize;
+  size_t blockOffset = fileOffset % blockSize;
+  std::unique_ptr<char[]> blockBuffer;
+
+  std::string scratch;
+  AllocateScratch(scratch);
+
+  // Encrypt individual blocks.
+  while (1) {
+    char* block = data;
+    size_t n = std::min(dataSize, blockSize - blockOffset);
+    if (n != blockSize) {
+      // We're not encrypting a full block.
+      // Copy data to blockBuffer
+      if (!blockBuffer.get()) {
+        // Allocate buffer
+        blockBuffer = std::unique_ptr<char[]>(new char[blockSize]);
+      }
+      block = blockBuffer.get();
+      // Copy plain data to block buffer
+      memmove(block + blockOffset, data, n);
+    }
+    auto status = EncryptBlock(blockIndex, block, (char*)scratch.data());
+    if (!status.ok()) {
+      return status;
+    }
+    if (block != data) {
+      // Copy encrypted data back to `data`.
+      memmove(data, block + blockOffset, n);
+    }
+    dataSize -= n;
+    if (dataSize == 0) {
+      return Status::OK();
+    }
+    data += n;
+    blockOffset = 0;
+    blockIndex++;
+  }
+}
+
+// Decrypt one or more (partial) blocks of data at the file offset.
+// Length of data is given in dataSize.
+Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data,
+                                        size_t dataSize) {
+  // Calculate block index
+  auto blockSize = BlockSize();
+  uint64_t blockIndex = fileOffset / blockSize;
+  size_t blockOffset = fileOffset % blockSize;
+  std::unique_ptr<char[]> blockBuffer;
+
+  std::string scratch;
+  AllocateScratch(scratch);
+
+  // Decrypt individual blocks.
+  while (1) {
+    char* block = data;
+    size_t n = std::min(dataSize, blockSize - blockOffset);
+    if (n != blockSize) {
+      // We're not decrypting a full block.
+      // Copy data to blockBuffer
+      if (!blockBuffer.get()) {
+        // Allocate buffer
+        blockBuffer = std::unique_ptr<char[]>(new char[blockSize]);
+      }
+      block = blockBuffer.get();
+      // Copy encrypted data to block buffer
+      memmove(block + blockOffset, data, n);
+    }
+    auto status = DecryptBlock(blockIndex, block, (char*)scratch.data());
+    if (!status.ok()) {
+      return status;
+    }
+    if (block != data) {
+      // Copy decrypted data back to `data`.
+      memmove(data, block + blockOffset, n);
+    }
+
+    // Simply decrementing dataSize by n could cause it to underflow,
+    // which will very likely make it read over the original bounds later
+    assert(dataSize >= n);
+    if (dataSize < n) {
+      return Status::Corruption("Cannot decrypt data at given offset");
+    }
+
+    dataSize -= n;
+    if (dataSize == 0) {
+      return Status::OK();
+    }
+    data += n;
+    blockOffset = 0;
+    blockIndex++;
+  }
+}
+
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo>
+    rot13_block_cipher_type_info = {
+        {"block_size",
+         {0 /* No offset, whole struct*/, OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+// Implements a BlockCipher using ROT13.
+//
+// Note: This is a sample implementation of BlockCipher,
+// it is NOT considered safe and should NOT be used in production.
+class ROT13BlockCipher : public BlockCipher {
+ private:
+  size_t blockSize_;
+
+ public:
+  explicit ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {
+    RegisterOptions("ROT13BlockCipherOptions", &blockSize_,
+                    &rot13_block_cipher_type_info);
+  }
+
+  static const char* kClassName() { return "ROT13"; }
+  const char* Name() const override { return kClassName(); }
+  // BlockSize returns the size of each block supported by this cipher stream.
+  size_t BlockSize() override { return blockSize_; }
+
+  // Encrypt a block of data.
+  // Length of data is equal to BlockSize().
+  Status Encrypt(char* data) override {
+    for (size_t i = 0; i < blockSize_; ++i) {
+      data[i] += 13;
+    }
+    return Status::OK();
+  }
+
+  // Decrypt a block of data.
+  // Length of data is equal to BlockSize().
+  Status Decrypt(char* data) override { return Encrypt(data); }
+};
+static const std::unordered_map<std::string, OptionTypeInfo>
+    ctr_encryption_provider_type_info = {
+        {"cipher",
+         OptionTypeInfo::AsCustomSharedPtr<BlockCipher>(
+             0 /* No offset, whole struct*/, OptionVerificationType::kByName,
+             OptionTypeFlags::kNone)},
+};
+}  // anonymous namespace
+
+// Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+void CTRCipherStream::AllocateScratch(std::string& scratch) {
+  auto blockSize = cipher_->BlockSize();
+  scratch.reserve(blockSize);
+}
+
+// Encrypt a block of data at the given block index.
+// Length of data is equal to BlockSize();
+Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data,
+                                     char* scratch) {
+  // Create nonce + counter
+  auto blockSize = cipher_->BlockSize();
+  memmove(scratch, iv_.data(), blockSize);
+  EncodeFixed64(scratch, blockIndex + initialCounter_);
+
+  // Encrypt nonce+counter
+  auto status = cipher_->Encrypt(scratch);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // XOR data with ciphertext.
+  for (size_t i = 0; i < blockSize; i++) {
+    data[i] = data[i] ^ scratch[i];
+  }
+  return Status::OK();
+}
+
+// Decrypt a block of data at the given block index.
+// Length of data is equal to BlockSize();
+Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char* data,
+                                     char* scratch) {
+  // For CTR decryption & encryption are the same
+  return EncryptBlock(blockIndex, data, scratch);
+}
+
+CTREncryptionProvider::CTREncryptionProvider(
+    const std::shared_ptr<BlockCipher>& c)
+    : cipher_(c) {
+  RegisterOptions("Cipher", &cipher_, &ctr_encryption_provider_type_info);
+}
+
+bool CTREncryptionProvider::IsInstanceOf(const std::string& name) const {
+  // Special case for test purposes.
+  if (name == "1://test" && cipher_ != nullptr) {
+    return cipher_->IsInstanceOf(ROT13BlockCipher::kClassName());
+  } else {
+    return EncryptionProvider::IsInstanceOf(name);
+  }
+}
+
+// GetPrefixLength returns the length of the prefix that is added to every file
+// and used for storing encryption options.
+// For optimal performance, the prefix length should be a multiple of
+// the page size.
+size_t CTREncryptionProvider::GetPrefixLength() const {
+  return defaultPrefixLength;
+}
+
+Status CTREncryptionProvider::AddCipher(const std::string& /*descriptor*/,
+                                        const char* cipher, size_t len,
+                                        bool /*for_write*/) {
+  if (cipher_) {
+    return Status::NotSupported("Cannot add keys to CTREncryptionProvider");
+  } else if (strcmp(ROT13BlockCipher::kClassName(), cipher) == 0) {
+    cipher_.reset(new ROT13BlockCipher(len));
+    return Status::OK();
+  } else {
+    return BlockCipher::CreateFromString(ConfigOptions(), std::string(cipher),
+                                         &cipher_);
+  }
+}
+
+// decodeCTRParameters decodes the initial counter & IV from the given
+// (plain text) prefix.
+static void decodeCTRParameters(const char* prefix, size_t blockSize,
+                                uint64_t& initialCounter, Slice& iv) {
+  // First block contains 64-bit initial counter
+  initialCounter = DecodeFixed64(prefix);
+  // Second block contains IV
+  iv = Slice(prefix + blockSize, blockSize);
+}
+
+// CreateNewPrefix initialized an allocated block of prefix memory
+// for a new file.
+Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/,
+                                              char* prefix,
+                                              size_t prefixLength) const {
+  if (!cipher_) {
+    return Status::InvalidArgument("Encryption Cipher is missing");
+  }
+  // Create & seed rnd.
+  Random rnd((uint32_t)SystemClock::Default()->NowMicros());
+  // Fill entire prefix block with random values.
+  for (size_t i = 0; i < prefixLength; i++) {
+    prefix[i] = rnd.Uniform(256) & 0xFF;
+  }
+  // Take random data to extract initial counter & IV
+  auto blockSize = cipher_->BlockSize();
+  uint64_t initialCounter;
+  Slice prefixIV;
+  decodeCTRParameters(prefix, blockSize, initialCounter, prefixIV);
+
+  // Now populate the rest of the prefix, starting from the third block.
+  PopulateSecretPrefixPart(prefix + (2 * blockSize),
+                           prefixLength - (2 * blockSize), blockSize);
+
+  // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial
+  // counter & IV unencrypted)
+  CTRCipherStream cipherStream(cipher_, prefixIV.data(), initialCounter);
+  Status status;
+  {
+    PERF_TIMER_GUARD(encrypt_data_nanos);
+    status = cipherStream.Encrypt(0, prefix + (2 * blockSize),
+                                  prefixLength - (2 * blockSize));
+  }
+  if (!status.ok()) {
+    return status;
+  }
+  return Status::OK();
+}
+
+// PopulateSecretPrefixPart initializes the data into a new prefix block
+// in plain text.
+// Returns the amount of space (starting from the start of the prefix)
+// that has been initialized.
+size_t CTREncryptionProvider::PopulateSecretPrefixPart(
+    char* /*prefix*/, size_t /*prefixLength*/, size_t /*blockSize*/) const {
+  // Nothing to do here, put in custom data in override when needed.
+  return 0;
+}
+
+Status CTREncryptionProvider::CreateCipherStream(
+    const std::string& fname, const EnvOptions& options, Slice& prefix,
+    std::unique_ptr<BlockAccessCipherStream>* result) {
+  if (!cipher_) {
+    return Status::InvalidArgument("Encryption Cipher is missing");
+  }
+  // Read plain text part of prefix.
+  auto blockSize = cipher_->BlockSize();
+  uint64_t initialCounter;
+  Slice iv;
+  decodeCTRParameters(prefix.data(), blockSize, initialCounter, iv);
+
+  // If the prefix is smaller than twice the block size, we would below read a
+  // very large chunk of the file (and very likely read over the bounds)
+  assert(prefix.size() >= 2 * blockSize);
+  if (prefix.size() < 2 * blockSize) {
+    return Status::Corruption("Unable to read from file " + fname +
+                              ": read attempt would read beyond file bounds");
+  }
+
+  // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1
+  // with initial counter & IV are unencrypted)
+  CTRCipherStream cipherStream(cipher_, iv.data(), initialCounter);
+  Status status;
+  {
+    PERF_TIMER_GUARD(decrypt_data_nanos);
+    status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize),
+                                  prefix.size() - (2 * blockSize));
+  }
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Create cipher stream
+  return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv,
+                                      prefix, result);
+}
+
+// CreateCipherStreamFromPrefix creates a block access cipher stream for a file
+// given given name and options. The given prefix is already decrypted.
+Status CTREncryptionProvider::CreateCipherStreamFromPrefix(
+    const std::string& /*fname*/, const EnvOptions& /*options*/,
+    uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/,
+    std::unique_ptr<BlockAccessCipherStream>* result) {
+  (*result) = std::unique_ptr<BlockAccessCipherStream>(
+      new CTRCipherStream(cipher_, iv.data(), initialCounter));
+  return Status::OK();
+}
+
+namespace {
+static void RegisterEncryptionBuiltins() {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    auto lib = ObjectRegistry::Default()->AddLibrary("encryption");
+    // Match "CTR" or "CTR://test"
+    lib->AddFactory<EncryptionProvider>(
+        ObjectLibrary::PatternEntry(CTREncryptionProvider::kClassName(), true)
+            .AddSuffix("://test"),
+        [](const std::string& uri, std::unique_ptr<EncryptionProvider>* guard,
+           std::string* /*errmsg*/) {
+          if (EndsWith(uri, "://test")) {
+            std::shared_ptr<BlockCipher> cipher =
+                std::make_shared<ROT13BlockCipher>(32);
+            guard->reset(new CTREncryptionProvider(cipher));
+          } else {
+            guard->reset(new CTREncryptionProvider());
+          }
+          return guard->get();
+        });
+
+    lib->AddFactory<EncryptionProvider>(
+        "1://test", [](const std::string& /*uri*/,
+                       std::unique_ptr<EncryptionProvider>* guard,
+                       std::string* /*errmsg*/) {
+          std::shared_ptr<BlockCipher> cipher =
+              std::make_shared<ROT13BlockCipher>(32);
+          guard->reset(new CTREncryptionProvider(cipher));
+          return guard->get();
+        });
+
+    // Match "ROT13" or "ROT13:[0-9]+"
+    lib->AddFactory<BlockCipher>(
+        ObjectLibrary::PatternEntry(ROT13BlockCipher::kClassName(), true)
+            .AddNumber(":"),
+        [](const std::string& uri, std::unique_ptr<BlockCipher>* guard,
+           std::string* /* errmsg */) {
+          size_t colon = uri.find(':');
+          if (colon != std::string::npos) {
+            size_t block_size = ParseSizeT(uri.substr(colon + 1));
+            guard->reset(new ROT13BlockCipher(block_size));
+          } else {
+            guard->reset(new ROT13BlockCipher(32));
+          }
+
+          return guard->get();
+        });
+  });
+}
+}  // namespace
+
+Status BlockCipher::CreateFromString(const ConfigOptions& config_options,
+                                     const std::string& value,
+                                     std::shared_ptr<BlockCipher>* result) {
+  RegisterEncryptionBuiltins();
+  return LoadSharedObject<BlockCipher>(config_options, value, nullptr, result);
+}
+
+Status EncryptionProvider::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<EncryptionProvider>* result) {
+  RegisterEncryptionBuiltins();
+  return LoadSharedObject<EncryptionProvider>(config_options, value, nullptr,
+                                              result);
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/env_encryption_ctr.h b/src/rocksdb/env/env_encryption_ctr.h
new file mode 100644
index 000000000..cfb440c72
--- /dev/null
+++ b/src/rocksdb/env/env_encryption_ctr.h
@@ -0,0 +1,116 @@
+//  Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#if !defined(ROCKSDB_LITE)
+
+#include "rocksdb/env_encryption.h"
+
+namespace ROCKSDB_NAMESPACE {
+// CTRCipherStream implements BlockAccessCipherStream using an
+// Counter operations mode.
+// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation
+//
+// Note: This is a possible implementation of BlockAccessCipherStream,
+// it is considered suitable for use.
+class CTRCipherStream final : public BlockAccessCipherStream {
+ private:
+  std::shared_ptr<BlockCipher> cipher_;
+  std::string iv_;
+  uint64_t initialCounter_;
+
+ public:
+  CTRCipherStream(const std::shared_ptr<BlockCipher>& c, const char* iv,
+                  uint64_t initialCounter)
+      : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter){};
+  virtual ~CTRCipherStream(){};
+
+  // BlockSize returns the size of each block supported by this cipher stream.
+  size_t BlockSize() override { return cipher_->BlockSize(); }
+
+ protected:
+  // Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+  void AllocateScratch(std::string&) override;
+
+  // Encrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  Status EncryptBlock(uint64_t blockIndex, char* data, char* scratch) override;
+
+  // Decrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  Status DecryptBlock(uint64_t blockIndex, char* data, char* scratch) override;
+};
+
+// This encryption provider uses a CTR cipher stream, with a given block cipher
+// and IV.
+//
+// Note: This is a possible implementation of EncryptionProvider,
+// it is considered suitable for use, provided a safe BlockCipher is used.
+class CTREncryptionProvider : public EncryptionProvider {
+ private:
+  std::shared_ptr<BlockCipher> cipher_;
+
+ protected:
+  // For optimal performance when using direct IO, the prefix length should be a
+  // multiple of the page size. This size is to ensure the first real data byte
+  // is placed at largest known alignment point for direct io.
+  const static size_t defaultPrefixLength = 4096;
+
+ public:
+  explicit CTREncryptionProvider(
+      const std::shared_ptr<BlockCipher>& c = nullptr);
+  virtual ~CTREncryptionProvider() {}
+
+  static const char* kClassName() { return "CTR"; }
+  const char* Name() const override { return kClassName(); }
+  bool IsInstanceOf(const std::string& name) const override;
+  // GetPrefixLength returns the length of the prefix that is added to every
+  // file
+  // and used for storing encryption options.
+  // For optimal performance when using direct IO, the prefix length should be a
+  // multiple of the page size.
+  size_t GetPrefixLength() const override;
+
+  // CreateNewPrefix initialized an allocated block of prefix memory
+  // for a new file.
+  Status CreateNewPrefix(const std::string& fname, char* prefix,
+                         size_t prefixLength) const override;
+
+  // CreateCipherStream creates a block access cipher stream for a file given
+  // given name and options.
+  Status CreateCipherStream(
+      const std::string& fname, const EnvOptions& options, Slice& prefix,
+      std::unique_ptr<BlockAccessCipherStream>* result) override;
+
+  Status AddCipher(const std::string& descriptor, const char* /*cipher*/,
+                   size_t /*len*/, bool /*for_write*/) override;
+
+ protected:
+  // PopulateSecretPrefixPart initializes the data into a new prefix block
+  // that will be encrypted. This function will store the data in plain text.
+  // It will be encrypted later (before written to disk).
+  // Returns the amount of space (starting from the start of the prefix)
+  // that has been initialized.
+  virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength,
+                                          size_t blockSize) const;
+
+  // CreateCipherStreamFromPrefix creates a block access cipher stream for a
+  // file given
+  // given name and options. The given prefix is already decrypted.
+  virtual Status CreateCipherStreamFromPrefix(
+      const std::string& fname, const EnvOptions& options,
+      uint64_t initialCounter, const Slice& iv, const Slice& prefix,
+      std::unique_ptr<BlockAccessCipherStream>* result);
+};
+
+Status NewEncryptedFileSystemImpl(
+    const std::shared_ptr<FileSystem>& base_fs,
+    const std::shared_ptr<EncryptionProvider>& provider,
+    std::unique_ptr<FileSystem>* fs);
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/env/env_posix.cc b/src/rocksdb/env/env_posix.cc
new file mode 100644
index 000000000..77f28e1f5
--- /dev/null
+++ b/src/rocksdb/env/env_posix.cc
@@ -0,0 +1,520 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors
+
+#include "port/lang.h"
+#if !defined(OS_WIN)
+
+#include <dirent.h>
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+#include <dlfcn.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <liburing.h>
+#endif
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
+#include <sys/statfs.h>
+#endif
+#include <sys/statvfs.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <sys/uio.h>
+#endif
+#include <time.h>
+#include <unistd.h>
+
+#include <algorithm>
+// Get nano time includes
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD)
+#elif defined(__MACH__)
+#include <Availability.h>
+#include <mach/clock.h>
+#include <mach/mach.h>
+#else
+#include <chrono>
+#endif
+#include <deque>
+#include <set>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "env/io_posix.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "port/port.h"
+#include "port/sys_time.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/compression_context_cache.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/thread_local.h"
+#include "util/threadpool_imp.h"
+
+#if !defined(TMPFS_MAGIC)
+#define TMPFS_MAGIC 0x01021994
+#endif
+#if !defined(XFS_SUPER_MAGIC)
+#define XFS_SUPER_MAGIC 0x58465342
+#endif
+#if !defined(EXT4_SUPER_MAGIC)
+#define EXT4_SUPER_MAGIC 0xEF53
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+#if defined(OS_WIN)
+static const std::string kSharedLibExt = ".dll";
+static const char kPathSeparator = ';';
+#else
+static const char kPathSeparator = ':';
+#if defined(OS_MACOSX)
+static const std::string kSharedLibExt = ".dylib";
+#else
+static const std::string kSharedLibExt = ".so";
+#endif
+#endif
+
+namespace {
+
+ThreadStatusUpdater* CreateThreadStatusUpdater() {
+  return new ThreadStatusUpdater();
+}
+
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+class PosixDynamicLibrary : public DynamicLibrary {
+ public:
+  PosixDynamicLibrary(const std::string& name, void* handle)
+      : name_(name), handle_(handle) {}
+  ~PosixDynamicLibrary() override { dlclose(handle_); }
+
+  Status LoadSymbol(const std::string& sym_name, void** func) override {
+    assert(nullptr != func);
+    dlerror();  // Clear any old error
+    *func = dlsym(handle_, sym_name.c_str());
+    if (*func != nullptr) {
+      return Status::OK();
+    } else {
+      char* err = dlerror();
+      return Status::NotFound("Error finding symbol: " + sym_name, err);
+    }
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+ private:
+  std::string name_;
+  void* handle_;
+};
+#endif  // !ROCKSDB_NO_DYNAMIC_EXTENSION
+
+class PosixClock : public SystemClock {
+ public:
+  static const char* kClassName() { return "PosixClock"; }
+  const char* Name() const override { return kDefaultName(); }
+  const char* NickName() const override { return kClassName(); }
+
+  uint64_t NowMicros() override {
+    port::TimeVal tv;
+    port::GetTimeOfDay(&tv, nullptr);
+    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  }
+
+  uint64_t NowNanos() override {
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
+    defined(OS_AIX)
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#elif defined(OS_SOLARIS)
+    return gethrtime();
+#elif defined(__MACH__)
+    clock_serv_t cclock;
+    mach_timespec_t ts;
+    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
+    clock_get_time(cclock, &ts);
+    mach_port_deallocate(mach_task_self(), cclock);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#else
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+               std::chrono::steady_clock::now().time_since_epoch())
+        .count();
+#endif
+  }
+
+  uint64_t CPUMicros() override {
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
+    defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12))
+    struct timespec ts;
+    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+    return (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000;
+#endif
+    return 0;
+  }
+
+  uint64_t CPUNanos() override {
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
+    defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12))
+    struct timespec ts;
+    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#endif
+    return 0;
+  }
+
+  void SleepForMicroseconds(int micros) override { usleep(micros); }
+
+  Status GetCurrentTime(int64_t* unix_time) override {
+    time_t ret = time(nullptr);
+    if (ret == (time_t)-1) {
+      return IOError("GetCurrentTime", "", errno);
+    }
+    *unix_time = (int64_t)ret;
+    return Status::OK();
+  }
+
+  std::string TimeToString(uint64_t secondsSince1970) override {
+    const time_t seconds = (time_t)secondsSince1970;
+    struct tm t;
+    int maxsize = 64;
+    std::string dummy;
+    dummy.reserve(maxsize);
+    dummy.resize(maxsize);
+    char* p = &dummy[0];
+    port::LocalTimeR(&seconds, &t);
+    snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900,
+             t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
+    return dummy;
+  }
+};
+
+class PosixEnv : public CompositeEnv {
+ public:
+  static const char* kClassName() { return "PosixEnv"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kDefaultName(); }
+
+  ~PosixEnv() override {
+    if (this == Env::Default()) {
+      for (const auto tid : threads_to_join_) {
+        pthread_join(tid, nullptr);
+      }
+      for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+        thread_pools_[pool_id].JoinAllThreads();
+      }
+      // Do not delete the thread_status_updater_ in order to avoid the
+      // free after use when Env::Default() is destructed while some other
+      // child threads are still trying to update thread status. All
+      // PosixEnv instances use the same thread_status_updater_, so never
+      // explicitly delete it.
+    }
+  }
+
+  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
+    if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
+      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+    }
+  }
+
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+  // Loads the named library into the result.
+  // If the input name is empty, the current executable is loaded
+  // On *nix systems, a "lib" prefix is added to the name if one is not supplied
+  // Comparably, the appropriate shared library extension is added to the name
+  // if not supplied. If search_path is not specified, the shared library will
+  // be loaded using the default path (LD_LIBRARY_PATH) If search_path is
+  // specified, the shared library will be searched for in the directories
+  // provided by the search path
+  Status LoadLibrary(const std::string& name, const std::string& path,
+                     std::shared_ptr<DynamicLibrary>* result) override {
+    assert(result != nullptr);
+    if (name.empty()) {
+      void* hndl = dlopen(NULL, RTLD_NOW);
+      if (hndl != nullptr) {
+        result->reset(new PosixDynamicLibrary(name, hndl));
+        return Status::OK();
+      }
+    } else {
+      std::string library_name = name;
+      if (library_name.find(kSharedLibExt) == std::string::npos) {
+        library_name = library_name + kSharedLibExt;
+      }
+#if !defined(OS_WIN)
+      if (library_name.find('/') == std::string::npos &&
+          library_name.compare(0, 3, "lib") != 0) {
+        library_name = "lib" + library_name;
+      }
+#endif
+      if (path.empty()) {
+        void* hndl = dlopen(library_name.c_str(), RTLD_NOW);
+        if (hndl != nullptr) {
+          result->reset(new PosixDynamicLibrary(library_name, hndl));
+          return Status::OK();
+        }
+      } else {
+        std::string local_path;
+        std::stringstream ss(path);
+        while (getline(ss, local_path, kPathSeparator)) {
+          if (!path.empty()) {
+            std::string full_name = local_path + "/" + library_name;
+            void* hndl = dlopen(full_name.c_str(), RTLD_NOW);
+            if (hndl != nullptr) {
+              result->reset(new PosixDynamicLibrary(full_name, hndl));
+              return Status::OK();
+            }
+          }
+        }
+      }
+    }
+    return Status::IOError(
+        IOErrorMsg("Failed to open shared library: xs", name), dlerror());
+  }
+#endif  // !ROCKSDB_NO_DYNAMIC_EXTENSION
+
+  void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW,
+                void* tag = nullptr,
+                void (*unschedFunction)(void* arg) = nullptr) override;
+
+  int UnSchedule(void* arg, Priority pri) override;
+
+  void StartThread(void (*function)(void* arg), void* arg) override;
+
+  void WaitForJoin() override;
+
+  unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;
+
+  int ReserveThreads(int threads_to_be_reserved, Priority pri) override;
+
+  int ReleaseThreads(int threads_to_be_released, Priority pri) override;
+
+  Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+    assert(thread_status_updater_);
+    return thread_status_updater_->GetThreadList(thread_list);
+  }
+
+  uint64_t GetThreadID() const override {
+    uint64_t thread_id = 0;
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 30)
+    thread_id = ::gettid();
+#else   // __GLIBC_PREREQ(2, 30)
+    pthread_t tid = pthread_self();
+    memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
+#endif  // __GLIBC_PREREQ(2, 30)
+#else   // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+    pthread_t tid = pthread_self();
+    memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
+#endif  // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+    return thread_id;
+  }
+
+  Status GetHostName(char* name, uint64_t len) override {
+    int ret = gethostname(name, static_cast<size_t>(len));
+    if (ret < 0) {
+      if (errno == EFAULT || errno == EINVAL) {
+        return Status::InvalidArgument(errnoStr(errno).c_str());
+      } else {
+        return IOError("GetHostName", name, errno);
+      }
+    }
+    return Status::OK();
+  }
+
+  ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+    return Env::GetThreadStatusUpdater();
+  }
+
+  std::string GenerateUniqueId() override { return Env::GenerateUniqueId(); }
+
+  // Allow increasing the number of worker threads.
+  void SetBackgroundThreads(int num, Priority pri) override {
+    assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+    thread_pools_[pri].SetBackgroundThreads(num);
+  }
+
+  int GetBackgroundThreads(Priority pri) override {
+    assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+    return thread_pools_[pri].GetBackgroundThreads();
+  }
+
+  Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
+    allow_non_owner_access_ = allow_non_owner_access;
+    return Status::OK();
+  }
+
+  // Allow increasing the number of worker threads.
+  void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+    assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+    thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
+  }
+
+  void LowerThreadPoolIOPriority(Priority pool) override {
+    assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
+#ifdef OS_LINUX
+    thread_pools_[pool].LowerIOPriority();
+#else
+    (void)pool;
+#endif
+  }
+
+  void LowerThreadPoolCPUPriority(Priority pool) override {
+    assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
+    thread_pools_[pool].LowerCPUPriority(CpuPriority::kLow);
+  }
+
+  Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
+    assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
+    thread_pools_[pool].LowerCPUPriority(pri);
+    return Status::OK();
+  }
+
+ private:
+  friend Env* Env::Default();
+  // Constructs the default Env, a singleton
+  PosixEnv();
+
+  // The below 4 members are only used by the default PosixEnv instance.
+  // Non-default instances simply maintain references to the backing
+  // members in te default instance
+  std::vector<ThreadPoolImpl> thread_pools_storage_;
+  pthread_mutex_t mu_storage_;
+  std::vector<pthread_t> threads_to_join_storage_;
+  bool allow_non_owner_access_storage_;
+
+  std::vector<ThreadPoolImpl>& thread_pools_;
+  pthread_mutex_t& mu_;
+  std::vector<pthread_t>& threads_to_join_;
+  // If true, allow non owner read access for db files. Otherwise, non-owner
+  //  has no access to db files.
+  bool& allow_non_owner_access_;
+};
+
+PosixEnv::PosixEnv()
+    : CompositeEnv(FileSystem::Default(), SystemClock::Default()),
+      thread_pools_storage_(Priority::TOTAL),
+      allow_non_owner_access_storage_(true),
+      thread_pools_(thread_pools_storage_),
+      mu_(mu_storage_),
+      threads_to_join_(threads_to_join_storage_),
+      allow_non_owner_access_(allow_non_owner_access_storage_) {
+  ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
+  for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+    thread_pools_[pool_id].SetThreadPriority(
+        static_cast<Env::Priority>(pool_id));
+    // This allows later initializing the thread-local-env of each thread.
+    thread_pools_[pool_id].SetHostEnv(this);
+  }
+  thread_status_updater_ = CreateThreadStatusUpdater();
+}
+
+void PosixEnv::Schedule(void (*function)(void* arg1), void* arg, Priority pri,
+                        void* tag, void (*unschedFunction)(void* arg)) {
+  assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+  thread_pools_[pri].Schedule(function, arg, tag, unschedFunction);
+}
+
+int PosixEnv::UnSchedule(void* arg, Priority pri) {
+  return thread_pools_[pri].UnSchedule(arg);
+}
+
+unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const {
+  assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+  return thread_pools_[pri].GetQueueLen();
+}
+
+int PosixEnv::ReserveThreads(int threads_to_reserved, Priority pri) {
+  assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+  return thread_pools_[pri].ReserveThreads(threads_to_reserved);
+}
+
+int PosixEnv::ReleaseThreads(int threads_to_released, Priority pri) {
+  assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
+  return thread_pools_[pri].ReleaseThreads(threads_to_released);
+}
+
+struct StartThreadState {
+  void (*user_function)(void*);
+  void* arg;
+};
+
+static void* StartThreadWrapper(void* arg) {
+  StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
+  state->user_function(state->arg);
+  delete state;
+  return nullptr;
+}
+
+void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
+  pthread_t t;
+  StartThreadState* state = new StartThreadState;
+  state->user_function = function;
+  state->arg = arg;
+  ThreadPoolImpl::PthreadCall(
+      "start thread", pthread_create(&t, nullptr, &StartThreadWrapper, state));
+  ThreadPoolImpl::PthreadCall("lock", pthread_mutex_lock(&mu_));
+  threads_to_join_.push_back(t);
+  ThreadPoolImpl::PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+void PosixEnv::WaitForJoin() {
+  for (const auto tid : threads_to_join_) {
+    pthread_join(tid, nullptr);
+  }
+  threads_to_join_.clear();
+}
+
+}  // namespace
+
+//
+// Default Posix Env
+//
+Env* Env::Default() {
+  // The following function call initializes the singletons of ThreadLocalPtr
+  // right before the static default_env.  This guarantees default_env will
+  // always being destructed before the ThreadLocalPtr singletons get
+  // destructed as C++ guarantees that the destructions of static variables
+  // is in the reverse order of their constructions.
+  //
+  // Since static members are destructed in the reverse order
+  // of their construction, having this call here guarantees that
+  // the destructor of static PosixEnv will go first, then the
+  // the singletons of ThreadLocalPtr.
+  ThreadLocalPtr::InitSingletons();
+  CompressionContextCache::InitSingleton();
+  INIT_SYNC_POINT_SINGLETONS();
+  // ~PosixEnv must be called on exit
+  //**TODO: Can we make this a STATIC_AVOID_DESTRUCTION?
+  static PosixEnv default_env;
+  return &default_env;
+}
+
+//
+// Default Posix SystemClock
+//
+const std::shared_ptr<SystemClock>& SystemClock::Default() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<SystemClock>, instance)
+  (std::make_shared<PosixClock>());
+  return instance;
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/env/env_test.cc b/src/rocksdb/env/env_test.cc
new file mode 100644
index 000000000..f4e9d50b2
--- /dev/null
+++ b/src/rocksdb/env/env_test.cc
@@ -0,0 +1,3562 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef OS_WIN
+#include <sys/ioctl.h>
+#endif
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <liburing.h>
+#include <sys/uio.h>
+#endif
+
+#include <sys/types.h>
+
+#include <atomic>
+#include <list>
+#include <mutex>
+#include <unordered_set>
+
+#ifdef OS_LINUX
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+#include <errno.h>
+#endif
+
+#include "db/db_impl/db_impl.h"
+#include "env/emulated_clock.h"
+#include "env/env_chroot.h"
+#include "env/env_encryption_ctr.h"
+#include "env/fs_readonly.h"
+#include "env/mock_env.h"
+#include "env/unique_id_gen.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "options/options_helper.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/mock_time_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/counted_fs.h"
+#include "utilities/env_timed.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using port::kPageSize;
+
+static const int kDelayMicros = 100000;
+
+struct Deleter {
+  explicit Deleter(void (*fn)(void*)) : fn_(fn) {}
+
+  void operator()(void* ptr) {
+    assert(fn_);
+    assert(ptr);
+    (*fn_)(ptr);
+  }
+
+  void (*fn_)(void*);
+};
+
+extern "C" bool RocksDbIOUringEnable() { return true; }
+
+std::unique_ptr<char, Deleter> NewAligned(const size_t size, const char ch) {
+  char* ptr = nullptr;
+#ifdef OS_WIN
+  if (nullptr ==
+      (ptr = reinterpret_cast<char*>(_aligned_malloc(size, kPageSize)))) {
+    return std::unique_ptr<char, Deleter>(nullptr, Deleter(_aligned_free));
+  }
+  std::unique_ptr<char, Deleter> uptr(ptr, Deleter(_aligned_free));
+#else
+  if (posix_memalign(reinterpret_cast<void**>(&ptr), kPageSize, size) != 0) {
+    return std::unique_ptr<char, Deleter>(nullptr, Deleter(free));
+  }
+  std::unique_ptr<char, Deleter> uptr(ptr, Deleter(free));
+#endif
+  memset(uptr.get(), ch, size);
+  return uptr;
+}
+
+class EnvPosixTest : public testing::Test {
+ private:
+  port::Mutex mu_;
+  std::string events_;
+
+ public:
+  Env* env_;
+  bool direct_io_;
+  EnvPosixTest() : env_(Env::Default()), direct_io_(false) {}
+  ~EnvPosixTest() {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->LoadDependency({});
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+};
+
+class EnvPosixTestWithParam
+    : public EnvPosixTest,
+      public ::testing::WithParamInterface<std::pair<Env*, bool>> {
+ public:
+  EnvPosixTestWithParam() {
+    std::pair<Env*, bool> param_pair = GetParam();
+    env_ = param_pair.first;
+    direct_io_ = param_pair.second;
+  }
+
+  void WaitThreadPoolsEmpty() {
+    // Wait until the thread pools are empty.
+    while (env_->GetThreadPoolQueueLen(Env::Priority::LOW) != 0) {
+      Env::Default()->SleepForMicroseconds(kDelayMicros);
+    }
+    while (env_->GetThreadPoolQueueLen(Env::Priority::HIGH) != 0) {
+      Env::Default()->SleepForMicroseconds(kDelayMicros);
+    }
+  }
+
+  ~EnvPosixTestWithParam() override { WaitThreadPoolsEmpty(); }
+};
+
+static void SetBool(void* ptr) {
+  reinterpret_cast<std::atomic<bool>*>(ptr)->store(true);
+}
+
+TEST_F(EnvPosixTest, DISABLED_RunImmediately) {
+  for (int pri = Env::BOTTOM; pri < Env::TOTAL; ++pri) {
+    std::atomic<bool> called(false);
+    env_->SetBackgroundThreads(1, static_cast<Env::Priority>(pri));
+    env_->Schedule(&SetBool, &called, static_cast<Env::Priority>(pri));
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+    ASSERT_TRUE(called.load());
+  }
+}
+
+TEST_F(EnvPosixTest, RunEventually) {
+  std::atomic<bool> called(false);
+  env_->StartThread(&SetBool, &called);
+  env_->WaitForJoin();
+  ASSERT_TRUE(called.load());
+}
+
+#ifdef OS_WIN
+TEST_F(EnvPosixTest, AreFilesSame) {
+  {
+    bool tmp;
+    if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) {
+      fprintf(stderr,
+              "skipping EnvBasicTestWithParam.AreFilesSame due to "
+              "unsupported Env::AreFilesSame\n");
+      return;
+    }
+  }
+
+  const EnvOptions soptions;
+  auto* env = Env::Default();
+  std::string same_file_name = test::PerThreadDBPath(env, "same_file");
+  std::string same_file_link_name = same_file_name + "_link";
+
+  std::unique_ptr<WritableFile> same_file;
+  ASSERT_OK(env->NewWritableFile(same_file_name, &same_file, soptions));
+  same_file->Append("random_data");
+  ASSERT_OK(same_file->Flush());
+  same_file.reset();
+
+  ASSERT_OK(env->LinkFile(same_file_name, same_file_link_name));
+  bool result = false;
+  ASSERT_OK(env->AreFilesSame(same_file_name, same_file_link_name, &result));
+  ASSERT_TRUE(result);
+}
+#endif
+
+#ifdef OS_LINUX
+TEST_F(EnvPosixTest, DISABLED_FilePermission) {
+  // Only works for Linux environment
+  if (env_ == Env::Default()) {
+    EnvOptions soptions;
+    std::vector<std::string> fileNames{
+        test::PerThreadDBPath(env_, "testfile"),
+        test::PerThreadDBPath(env_, "testfile1")};
+    std::unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fileNames[0], &wfile, soptions));
+    ASSERT_OK(env_->NewWritableFile(fileNames[1], &wfile, soptions));
+    wfile.reset();
+    std::unique_ptr<RandomRWFile> rwfile;
+    ASSERT_OK(env_->NewRandomRWFile(fileNames[1], &rwfile, soptions));
+
+    struct stat sb;
+    for (const auto& filename : fileNames) {
+      if (::stat(filename.c_str(), &sb) == 0) {
+        ASSERT_EQ(sb.st_mode & 0777, 0644);
+      }
+      ASSERT_OK(env_->DeleteFile(filename));
+    }
+
+    env_->SetAllowNonOwnerAccess(false);
+    ASSERT_OK(env_->NewWritableFile(fileNames[0], &wfile, soptions));
+    ASSERT_OK(env_->NewWritableFile(fileNames[1], &wfile, soptions));
+    wfile.reset();
+    ASSERT_OK(env_->NewRandomRWFile(fileNames[1], &rwfile, soptions));
+
+    for (const auto& filename : fileNames) {
+      if (::stat(filename.c_str(), &sb) == 0) {
+        ASSERT_EQ(sb.st_mode & 0777, 0600);
+      }
+      ASSERT_OK(env_->DeleteFile(filename));
+    }
+  }
+}
+
+TEST_F(EnvPosixTest, LowerThreadPoolCpuPriority) {
+  std::atomic<CpuPriority> from_priority(CpuPriority::kNormal);
+  std::atomic<CpuPriority> to_priority(CpuPriority::kNormal);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ThreadPoolImpl::BGThread::BeforeSetCpuPriority", [&](void* pri) {
+        from_priority.store(*reinterpret_cast<CpuPriority*>(pri));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "ThreadPoolImpl::BGThread::AfterSetCpuPriority", [&](void* pri) {
+        to_priority.store(*reinterpret_cast<CpuPriority*>(pri));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  env_->SetBackgroundThreads(1, Env::BOTTOM);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+
+  auto RunTask = [&](Env::Priority pool) {
+    std::atomic<bool> called(false);
+    env_->Schedule(&SetBool, &called, pool);
+    for (int i = 0; i < kDelayMicros; i++) {
+      if (called.load()) {
+        break;
+      }
+      Env::Default()->SleepForMicroseconds(1);
+    }
+    ASSERT_TRUE(called.load());
+  };
+
+  {
+    // Same priority, no-op.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM,
+                                     CpuPriority::kNormal)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::BOTTOM);
+    ASSERT_EQ(from_priority, CpuPriority::kNormal);
+    ASSERT_EQ(to_priority, CpuPriority::kNormal);
+  }
+
+  {
+    // Higher priority, no-op.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kHigh)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::BOTTOM);
+    ASSERT_EQ(from_priority, CpuPriority::kNormal);
+    ASSERT_EQ(to_priority, CpuPriority::kNormal);
+  }
+
+  {
+    // Lower priority from kNormal -> kLow.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kLow)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::BOTTOM);
+    ASSERT_EQ(from_priority, CpuPriority::kNormal);
+    ASSERT_EQ(to_priority, CpuPriority::kLow);
+  }
+
+  {
+    // Lower priority from kLow -> kIdle.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kIdle)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::BOTTOM);
+    ASSERT_EQ(from_priority, CpuPriority::kLow);
+    ASSERT_EQ(to_priority, CpuPriority::kIdle);
+  }
+
+  {
+    // Lower priority from kNormal -> kIdle for another pool.
+    env_->LowerThreadPoolCPUPriority(Env::Priority::HIGH, CpuPriority::kIdle)
+        .PermitUncheckedError();
+    RunTask(Env::Priority::HIGH);
+    ASSERT_EQ(from_priority, CpuPriority::kNormal);
+    ASSERT_EQ(to_priority, CpuPriority::kIdle);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif
+
+TEST_F(EnvPosixTest, MemoryMappedFileBuffer) {
+  const int kFileBytes = 1 << 15;  // 32 KB
+  std::string expected_data;
+  std::string fname = test::PerThreadDBPath(env_, "testfile");
+  {
+    std::unique_ptr<WritableFile> wfile;
+    const EnvOptions soptions;
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+
+    Random rnd(301);
+    expected_data = rnd.RandomString(kFileBytes);
+    ASSERT_OK(wfile->Append(expected_data));
+  }
+
+  std::unique_ptr<MemoryMappedFileBuffer> mmap_buffer;
+  Status status = env_->NewMemoryMappedFileBuffer(fname, &mmap_buffer);
+  // it should be supported at least on linux
+#if !defined(OS_LINUX)
+  if (status.IsNotSupported()) {
+    fprintf(stderr,
+            "skipping EnvPosixTest.MemoryMappedFileBuffer due to "
+            "unsupported Env::NewMemoryMappedFileBuffer\n");
+    return;
+  }
+#endif  // !defined(OS_LINUX)
+
+  ASSERT_OK(status);
+  ASSERT_NE(nullptr, mmap_buffer.get());
+  ASSERT_NE(nullptr, mmap_buffer->GetBase());
+  ASSERT_EQ(kFileBytes, mmap_buffer->GetLen());
+  std::string actual_data(reinterpret_cast<const char*>(mmap_buffer->GetBase()),
+                          mmap_buffer->GetLen());
+  ASSERT_EQ(expected_data, actual_data);
+}
+
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+TEST_F(EnvPosixTest, LoadRocksDBLibrary) {
+  std::shared_ptr<DynamicLibrary> library;
+  std::function<void*(void*, const char*)> function;
+  Status status = env_->LoadLibrary("no-such-library", "", &library);
+  ASSERT_NOK(status);
+  ASSERT_EQ(nullptr, library.get());
+  status = env_->LoadLibrary("rocksdb", "", &library);
+  if (status.ok()) {  // If we have can find a rocksdb shared library
+    ASSERT_NE(nullptr, library.get());
+    ASSERT_OK(library->LoadFunction("rocksdb_create_default_env",
+                                    &function));  // from C definition
+    ASSERT_NE(nullptr, function);
+    ASSERT_NOK(library->LoadFunction("no-such-method", &function));
+    ASSERT_EQ(nullptr, function);
+    ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library));
+  } else {
+    ASSERT_EQ(nullptr, library.get());
+  }
+}
+#endif  // !ROCKSDB_NO_DYNAMIC_EXTENSION
+
+#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION)
+TEST_F(EnvPosixTest, LoadRocksDBLibraryWithSearchPath) {
+  std::shared_ptr<DynamicLibrary> library;
+  std::function<void*(void*, const char*)> function;
+  ASSERT_NOK(env_->LoadLibrary("no-such-library", "/tmp", &library));
+  ASSERT_EQ(nullptr, library.get());
+  ASSERT_NOK(env_->LoadLibrary("dl", "/tmp", &library));
+  ASSERT_EQ(nullptr, library.get());
+  Status status = env_->LoadLibrary("rocksdb", "/tmp:./", &library);
+  if (status.ok()) {
+    ASSERT_NE(nullptr, library.get());
+    ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library));
+  }
+  char buff[1024];
+  std::string cwd = getcwd(buff, sizeof(buff));
+
+  status = env_->LoadLibrary("rocksdb", "/tmp:" + cwd, &library);
+  if (status.ok()) {
+    ASSERT_NE(nullptr, library.get());
+    ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library));
+  }
+}
+#endif  // !OS_WIN && !ROCKSDB_NO_DYNAMIC_EXTENSION
+
+TEST_P(EnvPosixTestWithParam, UnSchedule) {
+  std::atomic<bool> called(false);
+  env_->SetBackgroundThreads(1, Env::LOW);
+
+  /* Block the low priority queue */
+  test::SleepingBackgroundTask sleeping_task, sleeping_task1;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::LOW);
+
+  /* Schedule another task */
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task1,
+                 Env::Priority::LOW, &sleeping_task1);
+
+  /* Remove it with a different tag  */
+  ASSERT_EQ(0, env_->UnSchedule(&called, Env::Priority::LOW));
+
+  /* Remove it from the queue with the right tag */
+  ASSERT_EQ(1, env_->UnSchedule(&sleeping_task1, Env::Priority::LOW));
+
+  // Unblock background thread
+  sleeping_task.WakeUp();
+
+  /* Schedule another task */
+  env_->Schedule(&SetBool, &called);
+  for (int i = 0; i < kDelayMicros; i++) {
+    if (called.load()) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(1);
+  }
+  ASSERT_TRUE(called.load());
+
+  ASSERT_TRUE(!sleeping_task.IsSleeping() && !sleeping_task1.IsSleeping());
+  WaitThreadPoolsEmpty();
+}
+
+// This tests assumes that the last scheduled
+// task will run last. In fact, in the allotted
+// sleeping time nothing may actually run or they may
+// run in any order. The purpose of the test is unclear.
+#ifndef OS_WIN
+TEST_P(EnvPosixTestWithParam, RunMany) {
+  env_->SetBackgroundThreads(1, Env::LOW);
+  std::atomic<int> last_id(0);
+
+  struct CB {
+    std::atomic<int>* last_id_ptr;  // Pointer to shared slot
+    int id;                         // Order# for the execution of this callback
+
+    CB(std::atomic<int>* p, int i) : last_id_ptr(p), id(i) {}
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      int cur = cb->last_id_ptr->load();
+      ASSERT_EQ(cb->id - 1, cur);
+      cb->last_id_ptr->store(cb->id);
+    }
+  };
+
+  // Schedule in different order than start time
+  CB cb1(&last_id, 1);
+  CB cb2(&last_id, 2);
+  CB cb3(&last_id, 3);
+  CB cb4(&last_id, 4);
+  env_->Schedule(&CB::Run, &cb1);
+  env_->Schedule(&CB::Run, &cb2);
+  env_->Schedule(&CB::Run, &cb3);
+  env_->Schedule(&CB::Run, &cb4);
+  // thread-pool pops a thread function and then run the function, which may
+  // cause threadpool is empty but the last function is still running. Add a
+  // dummy function at the end, to make sure the last callback is finished
+  // before threadpool is empty.
+  struct DummyCB {
+    static void Run(void*) {}
+  };
+  env_->Schedule(&DummyCB::Run, nullptr);
+
+  WaitThreadPoolsEmpty();
+  ASSERT_EQ(4, last_id.load(std::memory_order_acquire));
+}
+#endif
+
+struct State {
+  port::Mutex mu;
+  int val;
+  int num_running;
+};
+
+static void ThreadBody(void* arg) {
+  State* s = reinterpret_cast<State*>(arg);
+  s->mu.Lock();
+  s->val += 1;
+  s->num_running -= 1;
+  s->mu.Unlock();
+}
+
+TEST_P(EnvPosixTestWithParam, StartThread) {
+  State state;
+  state.val = 0;
+  state.num_running = 3;
+  for (int i = 0; i < 3; i++) {
+    env_->StartThread(&ThreadBody, &state);
+  }
+  while (true) {
+    state.mu.Lock();
+    int num = state.num_running;
+    state.mu.Unlock();
+    if (num == 0) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+  }
+  ASSERT_EQ(state.val, 3);
+  WaitThreadPoolsEmpty();
+}
+
+TEST_P(EnvPosixTestWithParam, TwoPools) {
+  // Data structures to signal tasks to run.
+  port::Mutex mutex;
+  port::CondVar cv(&mutex);
+  bool should_start = false;
+
+  class CB {
+   public:
+    CB(const std::string& pool_name, int pool_size, port::Mutex* trigger_mu,
+       port::CondVar* trigger_cv, bool* _should_start)
+        : mu_(),
+          num_running_(0),
+          num_finished_(0),
+          pool_size_(pool_size),
+          pool_name_(pool_name),
+          trigger_mu_(trigger_mu),
+          trigger_cv_(trigger_cv),
+          should_start_(_should_start) {}
+
+    static void Run(void* v) {
+      CB* cb = reinterpret_cast<CB*>(v);
+      cb->Run();
+    }
+
+    void Run() {
+      {
+        MutexLock l(&mu_);
+        num_running_++;
+        // make sure we don't have more than pool_size_ jobs running.
+        ASSERT_LE(num_running_, pool_size_.load());
+      }
+
+      {
+        MutexLock l(trigger_mu_);
+        while (!(*should_start_)) {
+          trigger_cv_->Wait();
+        }
+      }
+
+      {
+        MutexLock l(&mu_);
+        num_running_--;
+        num_finished_++;
+      }
+    }
+
+    int NumFinished() {
+      MutexLock l(&mu_);
+      return num_finished_;
+    }
+
+    void Reset(int pool_size) {
+      pool_size_.store(pool_size);
+      num_finished_ = 0;
+    }
+
+   private:
+    port::Mutex mu_;
+    int num_running_;
+    int num_finished_;
+    std::atomic<int> pool_size_;
+    std::string pool_name_;
+    port::Mutex* trigger_mu_;
+    port::CondVar* trigger_cv_;
+    bool* should_start_;
+  };
+
+  const int kLowPoolSize = 2;
+  const int kHighPoolSize = 4;
+  const int kJobs = 8;
+
+  CB low_pool_job("low", kLowPoolSize, &mutex, &cv, &should_start);
+  CB high_pool_job("high", kHighPoolSize, &mutex, &cv, &should_start);
+
+  env_->SetBackgroundThreads(kLowPoolSize);
+  env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH);
+
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // schedule same number of jobs in each pool
+  for (int i = 0; i < kJobs; i++) {
+    env_->Schedule(&CB::Run, &low_pool_job);
+    env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH);
+  }
+  // Wait a short while for the jobs to be dispatched.
+  int sleep_count = 0;
+  while ((unsigned int)(kJobs - kLowPoolSize) !=
+             env_->GetThreadPoolQueueLen(Env::Priority::LOW) ||
+         (unsigned int)(kJobs - kHighPoolSize) !=
+             env_->GetThreadPoolQueueLen(Env::Priority::HIGH)) {
+    env_->SleepForMicroseconds(kDelayMicros);
+    if (++sleep_count > 100) {
+      break;
+    }
+  }
+
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen());
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ((unsigned int)(kJobs - kHighPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // Trigger jobs to run.
+  {
+    MutexLock l(&mutex);
+    should_start = true;
+    cv.SignalAll();
+  }
+
+  // wait for all jobs to finish
+  while (low_pool_job.NumFinished() < kJobs ||
+         high_pool_job.NumFinished() < kJobs) {
+    env_->SleepForMicroseconds(kDelayMicros);
+  }
+
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // Hold jobs to schedule;
+  should_start = false;
+
+  // call IncBackgroundThreadsIfNeeded to two pools. One increasing and
+  // the other decreasing
+  env_->IncBackgroundThreadsIfNeeded(kLowPoolSize - 1, Env::Priority::LOW);
+  env_->IncBackgroundThreadsIfNeeded(kHighPoolSize + 1, Env::Priority::HIGH);
+  high_pool_job.Reset(kHighPoolSize + 1);
+  low_pool_job.Reset(kLowPoolSize);
+
+  // schedule same number of jobs in each pool
+  for (int i = 0; i < kJobs; i++) {
+    env_->Schedule(&CB::Run, &low_pool_job);
+    env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH);
+  }
+  // Wait a short while for the jobs to be dispatched.
+  sleep_count = 0;
+  while ((unsigned int)(kJobs - kLowPoolSize) !=
+             env_->GetThreadPoolQueueLen(Env::Priority::LOW) ||
+         (unsigned int)(kJobs - (kHighPoolSize + 1)) !=
+             env_->GetThreadPoolQueueLen(Env::Priority::HIGH)) {
+    env_->SleepForMicroseconds(kDelayMicros);
+    if (++sleep_count > 100) {
+      break;
+    }
+  }
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen());
+  ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize),
+            env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+  ASSERT_EQ((unsigned int)(kJobs - (kHighPoolSize + 1)),
+            env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // Trigger jobs to run.
+  {
+    MutexLock l(&mutex);
+    should_start = true;
+    cv.SignalAll();
+  }
+
+  // wait for all jobs to finish
+  while (low_pool_job.NumFinished() < kJobs ||
+         high_pool_job.NumFinished() < kJobs) {
+    env_->SleepForMicroseconds(kDelayMicros);
+  }
+
+  env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH);
+  WaitThreadPoolsEmpty();
+}
+
+TEST_P(EnvPosixTestWithParam, DecreaseNumBgThreads) {
+  constexpr int kWaitMicros = 60000000;  // 1min
+
+  std::vector<test::SleepingBackgroundTask> tasks(10);
+
+  // Set number of thread to 1 first.
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+
+  // Schedule 3 tasks. 0 running; Task 1, 2 waiting.
+  for (size_t i = 0; i < 3; i++) {
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i],
+                   Env::Priority::HIGH);
+  }
+  ASSERT_FALSE(tasks[0].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Increase to 2 threads. Task 0, 1 running; 2 waiting
+  env_->SetBackgroundThreads(2, Env::Priority::HIGH);
+  ASSERT_FALSE(tasks[1].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Shrink back to 1 thread. Still task 0, 1 running, 2 waiting
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // The last task finishes. Task 0 running, 2 waiting.
+  tasks[1].WakeUp();
+  ASSERT_FALSE(tasks[1].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Increase to 5 threads. Task 0 and 2 running.
+  env_->SetBackgroundThreads(5, Env::Priority::HIGH);
+  ASSERT_FALSE(tasks[2].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(tasks[2].IsSleeping());
+
+  // Change number of threads a couple of times while there is no sufficient
+  // tasks.
+  env_->SetBackgroundThreads(7, Env::Priority::HIGH);
+  tasks[2].WakeUp();
+  ASSERT_FALSE(tasks[2].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(3, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(5, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros * 50);
+
+  // Enqueue 5 more tasks. Thread pool size now is 4.
+  // Task 0, 3, 4, 5 running;6, 7 waiting.
+  for (size_t i = 3; i < 8; i++) {
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i],
+                   Env::Priority::HIGH);
+  }
+  for (size_t i = 3; i <= 5; i++) {
+    ASSERT_FALSE(tasks[i].TimedWaitUntilSleeping(kWaitMicros));
+  }
+  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+  ASSERT_TRUE(tasks[3].IsSleeping());
+  ASSERT_TRUE(tasks[4].IsSleeping());
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(!tasks[6].IsSleeping());
+  ASSERT_TRUE(!tasks[7].IsSleeping());
+
+  // Wake up task 0, 3 and 4. Task 5, 6, 7 running.
+  tasks[0].WakeUp();
+  tasks[3].WakeUp();
+  tasks[4].WakeUp();
+
+  for (size_t i = 5; i < 8; i++) {
+    ASSERT_FALSE(tasks[i].TimedWaitUntilSleeping(kWaitMicros));
+  }
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  for (size_t i = 5; i < 8; i++) {
+    ASSERT_TRUE(tasks[i].IsSleeping());
+  }
+
+  // Shrink back to 1 thread. Still task 5, 6, 7 running
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(tasks[6].IsSleeping());
+  ASSERT_TRUE(tasks[7].IsSleeping());
+
+  // Wake up task  6. Task 5, 7 running
+  tasks[6].WakeUp();
+  ASSERT_FALSE(tasks[6].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_TRUE(tasks[5].IsSleeping());
+  ASSERT_TRUE(!tasks[6].IsSleeping());
+  ASSERT_TRUE(tasks[7].IsSleeping());
+
+  // Wake up threads 7. Task 5 running
+  tasks[7].WakeUp();
+  ASSERT_FALSE(tasks[7].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_TRUE(!tasks[7].IsSleeping());
+
+  // Enqueue thread 8 and 9. Task 5 running; one of 8, 9 might be running.
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[8],
+                 Env::Priority::HIGH);
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[9],
+                 Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_GT(env_->GetThreadPoolQueueLen(Env::Priority::HIGH), (unsigned int)0);
+  ASSERT_TRUE(!tasks[8].IsSleeping() || !tasks[9].IsSleeping());
+
+  // Increase to 4 threads. Task 5, 8, 9 running.
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ((unsigned int)0, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[8].IsSleeping());
+  ASSERT_TRUE(tasks[9].IsSleeping());
+
+  // Shrink to 1 thread
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+
+  // Wake up thread 9.
+  tasks[9].WakeUp();
+  ASSERT_FALSE(tasks[9].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_TRUE(!tasks[9].IsSleeping());
+  ASSERT_TRUE(tasks[8].IsSleeping());
+
+  // Wake up thread 8
+  tasks[8].WakeUp();
+  ASSERT_FALSE(tasks[8].TimedWaitUntilDone(kWaitMicros));
+  ASSERT_TRUE(!tasks[8].IsSleeping());
+
+  // Wake up the last thread
+  tasks[5].WakeUp();
+  ASSERT_FALSE(tasks[5].TimedWaitUntilDone(kWaitMicros));
+  WaitThreadPoolsEmpty();
+}
+
+TEST_P(EnvPosixTestWithParam, ReserveThreads) {
+  // Initialize the background thread to 1 in case other threads exist
+  // from the last unit test
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+  ASSERT_EQ(env_->GetBackgroundThreads(Env::HIGH), 1);
+  constexpr int kWaitMicros = 10000000;  // 10seconds
+  std::vector<test::SleepingBackgroundTask> tasks(4);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Set the sync point to ensure thread 0 can terminate
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ThreadPoolImpl::BGThread::Termination:th0",
+        "EnvTest::ReserveThreads:0"}});
+  // Empty the thread pool to ensure all the threads can start later
+  env_->SetBackgroundThreads(0, Env::Priority::HIGH);
+  TEST_SYNC_POINT("EnvTest::ReserveThreads:0");
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  // Set the sync point to ensure threads start and pass the sync point
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ThreadPoolImpl::BGThread::Start:th0", "EnvTest::ReserveThreads:1"},
+       {"ThreadPoolImpl::BGThread::Start:th1", "EnvTest::ReserveThreads:2"},
+       {"ThreadPoolImpl::BGThread::Start:th2", "EnvTest::ReserveThreads:3"},
+       {"ThreadPoolImpl::BGThread::Start:th3", "EnvTest::ReserveThreads:4"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Set number of thread to 3 first.
+  env_->SetBackgroundThreads(3, Env::Priority::HIGH);
+  ASSERT_EQ(env_->GetBackgroundThreads(Env::HIGH), 3);
+  // Add sync points to ensure all 3 threads start
+  TEST_SYNC_POINT("EnvTest::ReserveThreads:1");
+  TEST_SYNC_POINT("EnvTest::ReserveThreads:2");
+  TEST_SYNC_POINT("EnvTest::ReserveThreads:3");
+  // Reserve 2 threads
+  ASSERT_EQ(2, env_->ReserveThreads(2, Env::Priority::HIGH));
+
+  // Schedule 3 tasks. Task 0 running (in this context, doing
+  // SleepingBackgroundTask); Task 1, 2 waiting; 3 reserved threads.
+  for (size_t i = 0; i < 3; i++) {
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i],
+                   Env::Priority::HIGH);
+  }
+  ASSERT_FALSE(tasks[0].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_EQ(2U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[0].IsSleeping());
+  ASSERT_TRUE(!tasks[1].IsSleeping());
+  ASSERT_TRUE(!tasks[2].IsSleeping());
+
+  // Release 2 threads. Task 0, 1, 2 running; 0 reserved thread.
+  ASSERT_EQ(2, env_->ReleaseThreads(2, Env::Priority::HIGH));
+  ASSERT_FALSE(tasks[1].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_FALSE(tasks[2].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(tasks[1].IsSleeping());
+  ASSERT_TRUE(tasks[2].IsSleeping());
+  // No more threads can be reserved
+  ASSERT_EQ(0, env_->ReserveThreads(3, Env::Priority::HIGH));
+  // Expand the number of background threads so that the last thread
+  // is waiting
+  env_->SetBackgroundThreads(4, Env::Priority::HIGH);
+  // Add sync point to ensure the 4th thread starts
+  TEST_SYNC_POINT("EnvTest::ReserveThreads:4");
+  // As the thread pool is expanded, we can reserve one more thread
+  ASSERT_EQ(1, env_->ReserveThreads(3, Env::Priority::HIGH));
+  // No more threads can be reserved
+  ASSERT_EQ(0, env_->ReserveThreads(3, Env::Priority::HIGH));
+
+  // Reset the sync points for the next iteration in BGThread or the
+  // next time Submit() is called
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ThreadPoolImpl::BGThread::WaitingThreadsInc",
+        "EnvTest::ReserveThreads:5"},
+       {"ThreadPoolImpl::BGThread::Termination", "EnvTest::ReserveThreads:6"},
+       {"ThreadPoolImpl::Submit::Enqueue", "EnvTest::ReserveThreads:7"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  tasks[0].WakeUp();
+  ASSERT_FALSE(tasks[0].TimedWaitUntilDone(kWaitMicros));
+  // Add sync point to ensure the number of waiting threads increases
+  TEST_SYNC_POINT("EnvTest::ReserveThreads:5");
+  // 1 more thread can be reserved
+  ASSERT_EQ(1, env_->ReserveThreads(3, Env::Priority::HIGH));
+  // 2 reserved threads now
+
+  // Currently, two threads are blocked since the number of waiting
+  // threads is equal to the number of reserved threads (i.e., 2).
+  // If we reduce the number of background thread to 1, at least one thread
+  // will be the last excessive thread (here we have no control over the
+  // number of excessive threads because thread order does not
+  // necessarily follows the schedule order, but we ensure that the last thread
+  // shall not run any task by expanding the thread pool after we schedule
+  // the tasks), and thus they(it) become(s) unblocked, the number of waiting
+  // threads decreases to 0 or 1, but the number of reserved threads is still 2
+  env_->SetBackgroundThreads(1, Env::Priority::HIGH);
+
+  // Task 1,2 running; 2 reserved threads, however, in fact, we only have
+  // 0 or 1 waiting thread in the thread pool, proved by the
+  // following test, we CANNOT reserve 2 threads even though we just
+  // release 2
+  TEST_SYNC_POINT("EnvTest::ReserveThreads:6");
+  ASSERT_EQ(2, env_->ReleaseThreads(2, Env::Priority::HIGH));
+  ASSERT_GT(2, env_->ReserveThreads(2, Env::Priority::HIGH));
+
+  // Every new task will be put into the queue at this point
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[3],
+                 Env::Priority::HIGH);
+  TEST_SYNC_POINT("EnvTest::ReserveThreads:7");
+  ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+  ASSERT_TRUE(!tasks[3].IsSleeping());
+
+  // Set the number of threads to 3 so that Task 3 can dequeue
+  env_->SetBackgroundThreads(3, Env::Priority::HIGH);
+  // Wakup Task 1
+  tasks[1].WakeUp();
+  ASSERT_FALSE(tasks[1].TimedWaitUntilDone(kWaitMicros));
+  // Task 2, 3 running (Task 3 dequeue); 0 or 1 reserved thread
+  ASSERT_FALSE(tasks[3].TimedWaitUntilSleeping(kWaitMicros));
+  ASSERT_TRUE(tasks[3].IsSleeping());
+  ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+  // At most 1 thread can be released
+  ASSERT_GT(2, env_->ReleaseThreads(3, Env::Priority::HIGH));
+  tasks[2].WakeUp();
+  ASSERT_FALSE(tasks[2].TimedWaitUntilDone(kWaitMicros));
+  tasks[3].WakeUp();
+  ASSERT_FALSE(tasks[3].TimedWaitUntilDone(kWaitMicros));
+  WaitThreadPoolsEmpty();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#if (defined OS_LINUX || defined OS_WIN)
+namespace {
+bool IsSingleVarint(const std::string& s) {
+  Slice slice(s);
+
+  uint64_t v;
+  if (!GetVarint64(&slice, &v)) {
+    return false;
+  }
+
+  return slice.size() == 0;
+}
+
+bool IsUniqueIDValid(const std::string& s) {
+  return !s.empty() && !IsSingleVarint(s);
+}
+
+const size_t MAX_ID_SIZE = 100;
+char temp_id[MAX_ID_SIZE];
+
+}  // namespace
+
+// Determine whether we can use the FS_IOC_GETVERSION ioctl
+// on a file in directory DIR.  Create a temporary file therein,
+// try to apply the ioctl (save that result), cleanup and
+// return the result.  Return true if it is supported, and
+// false if anything fails.
+// Note that this function "knows" that dir has just been created
+// and is empty, so we create a simply-named test file: "f".
+bool ioctl_support__FS_IOC_GETVERSION(const std::string& dir) {
+#ifdef OS_WIN
+  return true;
+#else
+  const std::string file = dir + "/f";
+  int fd;
+  do {
+    fd = open(file.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+  } while (fd < 0 && errno == EINTR);
+  long int version;
+  bool ok = (fd >= 0 && ioctl(fd, FS_IOC_GETVERSION, &version) >= 0);
+
+  close(fd);
+  unlink(file.c_str());
+
+  return ok;
+#endif
+}
+
+// To ensure that Env::GetUniqueId-related tests work correctly, the files
+// should be stored in regular storage like "hard disk" or "flash device",
+// and not on a tmpfs file system (like /dev/shm and /tmp on some systems).
+// Otherwise we cannot get the correct id.
+//
+// This function serves as the replacement for test::TmpDir(), which may be
+// customized to be on a file system that doesn't work with GetUniqueId().
+
+class IoctlFriendlyTmpdir {
+ public:
+  explicit IoctlFriendlyTmpdir() {
+    char dir_buf[100];
+
+    const char* fmt = "%s/rocksdb.XXXXXX";
+    const char* tmp = getenv("TEST_IOCTL_FRIENDLY_TMPDIR");
+
+#ifdef OS_WIN
+#define rmdir _rmdir
+    if (tmp == nullptr) {
+      tmp = getenv("TMP");
+    }
+
+    snprintf(dir_buf, sizeof dir_buf, fmt, tmp);
+    auto result = _mktemp(dir_buf);
+    assert(result != nullptr);
+    BOOL ret = CreateDirectory(dir_buf, NULL);
+    assert(ret == TRUE);
+    dir_ = dir_buf;
+#else
+    std::list<std::string> candidate_dir_list = {"/var/tmp", "/tmp"};
+
+    // If $TEST_IOCTL_FRIENDLY_TMPDIR/rocksdb.XXXXXX fits, use
+    // $TEST_IOCTL_FRIENDLY_TMPDIR; subtract 2 for the "%s", and
+    // add 1 for the trailing NUL byte.
+    if (tmp && strlen(tmp) + strlen(fmt) - 2 + 1 <= sizeof dir_buf) {
+      // use $TEST_IOCTL_FRIENDLY_TMPDIR value
+      candidate_dir_list.push_front(tmp);
+    }
+
+    for (const std::string& d : candidate_dir_list) {
+      snprintf(dir_buf, sizeof dir_buf, fmt, d.c_str());
+      if (mkdtemp(dir_buf)) {
+        if (ioctl_support__FS_IOC_GETVERSION(dir_buf)) {
+          dir_ = dir_buf;
+          return;
+        } else {
+          // Diagnose ioctl-related failure only if this is the
+          // directory specified via that envvar.
+          if (tmp && tmp == d) {
+            fprintf(stderr,
+                    "TEST_IOCTL_FRIENDLY_TMPDIR-specified directory is "
+                    "not suitable: %s\n",
+                    d.c_str());
+          }
+          rmdir(dir_buf);  // ignore failure
+        }
+      } else {
+        // mkdtemp failed: diagnose it, but don't give up.
+        fprintf(stderr, "mkdtemp(%s/...) failed: %s\n", d.c_str(),
+                errnoStr(errno).c_str());
+      }
+    }
+
+    // check if it's running test within a docker container, in which case, the
+    // file system inside `overlayfs` may not support FS_IOC_GETVERSION
+    // skip the tests
+    struct stat buffer;
+    if (stat("/.dockerenv", &buffer) == 0) {
+      is_supported_ = false;
+      return;
+    }
+
+    fprintf(stderr,
+            "failed to find an ioctl-friendly temporary directory;"
+            " specify one via the TEST_IOCTL_FRIENDLY_TMPDIR envvar\n");
+    std::abort();
+#endif
+  }
+
+  ~IoctlFriendlyTmpdir() { rmdir(dir_.c_str()); }
+
+  const std::string& name() const { return dir_; }
+
+  bool is_supported() const { return is_supported_; }
+
+ private:
+  std::string dir_;
+
+  bool is_supported_ = true;
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(EnvPosixTest, PositionedAppend) {
+  std::unique_ptr<WritableFile> writable_file;
+  EnvOptions options;
+  options.use_direct_writes = true;
+  options.use_mmap_writes = false;
+  std::string fname = test::PerThreadDBPath(env_, "positioned_append");
+  SetupSyncPointsToMockDirectIO();
+
+  ASSERT_OK(env_->NewWritableFile(fname, &writable_file, options));
+  const size_t kBlockSize = 4096;
+  const size_t kDataSize = kPageSize;
+  // Write a page worth of 'a'
+  auto data_ptr = NewAligned(kDataSize, 'a');
+  Slice data_a(data_ptr.get(), kDataSize);
+  ASSERT_OK(writable_file->PositionedAppend(data_a, 0U));
+  // Write a page worth of 'b' right after the first sector
+  data_ptr = NewAligned(kDataSize, 'b');
+  Slice data_b(data_ptr.get(), kDataSize);
+  ASSERT_OK(writable_file->PositionedAppend(data_b, kBlockSize));
+  ASSERT_OK(writable_file->Close());
+  // The file now has 1 sector worth of a followed by a page worth of b
+
+  // Verify the above
+  std::unique_ptr<SequentialFile> seq_file;
+  ASSERT_OK(env_->NewSequentialFile(fname, &seq_file, options));
+  size_t scratch_len = kPageSize * 2;
+  std::unique_ptr<char[]> scratch(new char[scratch_len]);
+  Slice result;
+  ASSERT_OK(seq_file->Read(scratch_len, &result, scratch.get()));
+  ASSERT_EQ(kPageSize + kBlockSize, result.size());
+  ASSERT_EQ('a', result[kBlockSize - 1]);
+  ASSERT_EQ('b', result[kBlockSize]);
+}
+#endif  // !ROCKSDB_LITE
+
+// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can
+// handle a return value of zero but this test case cannot.
+#ifndef OS_WIN
+TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) {
+  // Create file.
+  if (env_ == Env::Default()) {
+    EnvOptions soptions;
+    soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
+    IoctlFriendlyTmpdir ift;
+    if (!ift.is_supported()) {
+      ROCKSDB_GTEST_BYPASS(
+          "FS_IOC_GETVERSION is not supported by the filesystem");
+      return;
+    }
+    std::string fname = ift.name() + "/testfile";
+    std::unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+
+    std::unique_ptr<RandomAccessFile> file;
+
+    // Get Unique ID
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+    ASSERT_TRUE(id_size > 0);
+    std::string unique_id1(temp_id, id_size);
+    ASSERT_TRUE(IsUniqueIDValid(unique_id1));
+
+    // Get Unique ID again
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+    ASSERT_TRUE(id_size > 0);
+    std::string unique_id2(temp_id, id_size);
+    ASSERT_TRUE(IsUniqueIDValid(unique_id2));
+
+    // Get Unique ID again after waiting some time.
+    env_->SleepForMicroseconds(1000000);
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+    ASSERT_TRUE(id_size > 0);
+    std::string unique_id3(temp_id, id_size);
+    ASSERT_TRUE(IsUniqueIDValid(unique_id3));
+
+    // Check IDs are the same.
+    ASSERT_EQ(unique_id1, unique_id2);
+    ASSERT_EQ(unique_id2, unique_id3);
+
+    // Delete the file
+    ASSERT_OK(env_->DeleteFile(fname));
+  }
+}
+#endif  // !defined(OS_WIN)
+
+// only works in linux platforms
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+TEST_P(EnvPosixTestWithParam, AllocateTest) {
+  if (env_ == Env::Default()) {
+    SetupSyncPointsToMockDirectIO();
+    std::string fname = test::PerThreadDBPath(env_, "preallocate_testfile");
+    // Try fallocate in a file to see whether the target file system supports
+    // it.
+    // Skip the test if fallocate is not supported.
+    std::string fname_test_fallocate =
+        test::PerThreadDBPath(env_, "preallocate_testfile_2");
+    int fd = -1;
+    do {
+      fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+    } while (fd < 0 && errno == EINTR);
+    ASSERT_GT(fd, 0);
+
+    int alloc_status = fallocate(fd, 0, 0, 1);
+
+    int err_number = 0;
+    if (alloc_status != 0) {
+      err_number = errno;
+      fprintf(stderr, "Warning: fallocate() fails, %s\n",
+              errnoStr(err_number).c_str());
+    }
+    close(fd);
+    ASSERT_OK(env_->DeleteFile(fname_test_fallocate));
+    if (alloc_status != 0 && err_number == EOPNOTSUPP) {
+      // The filesystem containing the file does not support fallocate
+      return;
+    }
+
+    EnvOptions soptions;
+    soptions.use_mmap_writes = false;
+    soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
+    std::unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+
+    // allocate 100 MB
+    size_t kPreallocateSize = 100 * 1024 * 1024;
+    size_t kBlockSize = 512;
+    size_t kDataSize = 1024 * 1024;
+    auto data_ptr = NewAligned(kDataSize, 'A');
+    Slice data(data_ptr.get(), kDataSize);
+    wfile->SetPreallocationBlockSize(kPreallocateSize);
+    wfile->PrepareWrite(wfile->GetFileSize(), kDataSize);
+    ASSERT_OK(wfile->Append(data));
+    ASSERT_OK(wfile->Flush());
+
+    struct stat f_stat;
+    ASSERT_EQ(stat(fname.c_str(), &f_stat), 0);
+    ASSERT_EQ((unsigned int)kDataSize, f_stat.st_size);
+    // verify that blocks are preallocated
+    // Note here that we don't check the exact number of blocks preallocated --
+    // we only require that number of allocated blocks is at least what we
+    // expect.
+    // It looks like some FS give us more blocks that we asked for. That's fine.
+    // It might be worth investigating further.
+    ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks);
+
+    // close the file, should deallocate the blocks
+    wfile.reset();
+
+    stat(fname.c_str(), &f_stat);
+    ASSERT_EQ((unsigned int)kDataSize, f_stat.st_size);
+    // verify that preallocated blocks were deallocated on file close
+    // Because the FS might give us more blocks, we add a full page to the size
+    // and expect the number of blocks to be less or equal to that.
+    ASSERT_GE((f_stat.st_size + kPageSize + kBlockSize - 1) / kBlockSize,
+              (unsigned int)f_stat.st_blocks);
+  }
+}
+#endif  // ROCKSDB_FALLOCATE_PRESENT
+
+// Returns true if any of the strings in ss are the prefix of another string.
+bool HasPrefix(const std::unordered_set<std::string>& ss) {
+  for (const std::string& s : ss) {
+    if (s.empty()) {
+      return true;
+    }
+    for (size_t i = 1; i < s.size(); ++i) {
+      if (ss.count(s.substr(0, i)) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can
+// handle a return value of zero but this test case cannot.
+#ifndef OS_WIN
+TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) {
+  if (env_ == Env::Default()) {
+    // Check whether a bunch of concurrently existing files have unique IDs.
+    EnvOptions soptions;
+    soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
+
+    // Create the files
+    IoctlFriendlyTmpdir ift;
+    if (!ift.is_supported()) {
+      ROCKSDB_GTEST_BYPASS(
+          "FS_IOC_GETVERSION is not supported by the filesystem");
+      return;
+    }
+    std::vector<std::string> fnames;
+    for (int i = 0; i < 1000; ++i) {
+      fnames.push_back(ift.name() + "/" + "testfile" + std::to_string(i));
+
+      // Create file.
+      std::unique_ptr<WritableFile> wfile;
+      ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions));
+    }
+
+    // Collect and check whether the IDs are unique.
+    std::unordered_set<std::string> ids;
+    for (const std::string& fname : fnames) {
+      std::unique_ptr<RandomAccessFile> file;
+      std::string unique_id;
+      ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+      size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+      ASSERT_TRUE(id_size > 0);
+      unique_id = std::string(temp_id, id_size);
+      ASSERT_TRUE(IsUniqueIDValid(unique_id));
+
+      ASSERT_TRUE(ids.count(unique_id) == 0);
+      ids.insert(unique_id);
+    }
+
+    // Delete the files
+    for (const std::string& fname : fnames) {
+      ASSERT_OK(env_->DeleteFile(fname));
+    }
+
+    ASSERT_TRUE(!HasPrefix(ids));
+  }
+}
+
+// TODO: Disable the flaky test, it's a known issue that ext4 may return same
+// key after file deletion. The issue is tracked in #7405, #7470.
+TEST_P(EnvPosixTestWithParam, DISABLED_RandomAccessUniqueIDDeletes) {
+  if (env_ == Env::Default()) {
+    EnvOptions soptions;
+    soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
+
+    IoctlFriendlyTmpdir ift;
+    if (!ift.is_supported()) {
+      ROCKSDB_GTEST_BYPASS(
+          "FS_IOC_GETVERSION is not supported by the filesystem");
+      return;
+    }
+    std::string fname = ift.name() + "/" + "testfile";
+
+    // Check that after file is deleted we don't get same ID again in a new
+    // file.
+    std::unordered_set<std::string> ids;
+    for (int i = 0; i < 1000; ++i) {
+      // Create file.
+      {
+        std::unique_ptr<WritableFile> wfile;
+        ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+      }
+
+      // Get Unique ID
+      std::string unique_id;
+      {
+        std::unique_ptr<RandomAccessFile> file;
+        ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+        size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE);
+        ASSERT_TRUE(id_size > 0);
+        unique_id = std::string(temp_id, id_size);
+      }
+
+      ASSERT_TRUE(IsUniqueIDValid(unique_id));
+      ASSERT_TRUE(ids.count(unique_id) == 0);
+      ids.insert(unique_id);
+
+      // Delete the file
+      ASSERT_OK(env_->DeleteFile(fname));
+    }
+
+    ASSERT_TRUE(!HasPrefix(ids));
+  }
+}
+#endif  // !defined(OS_WIN)
+
+TEST_P(EnvPosixTestWithParam, MultiRead) {
+  EnvOptions soptions;
+  soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
+  std::string fname = test::PerThreadDBPath(env_, "testfile");
+
+  const size_t kSectorSize = 4096;
+  const size_t kNumSectors = 8;
+
+  // Create file.
+  {
+    std::unique_ptr<WritableFile> wfile;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX)
+    if (soptions.use_direct_writes) {
+      soptions.use_direct_writes = false;
+    }
+#endif
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    for (size_t i = 0; i < kNumSectors; ++i) {
+      auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
+      Slice slice(data.get(), kSectorSize);
+      ASSERT_OK(wfile->Append(slice));
+    }
+    ASSERT_OK(wfile->Close());
+  }
+
+  // More attempts to simulate more partial result sequences.
+  for (uint32_t attempt = 0; attempt < 20; attempt++) {
+    // Random Read
+    Random rnd(301 + attempt);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "UpdateResults::io_uring_result", [&](void* arg) {
+          if (attempt > 0) {
+            // No failure in the first attempt.
+            size_t& bytes_read = *static_cast<size_t*>(arg);
+            if (rnd.OneIn(4)) {
+              bytes_read = 0;
+            } else if (rnd.OneIn(3)) {
+              bytes_read = static_cast<size_t>(
+                  rnd.Uniform(static_cast<int>(bytes_read)));
+            }
+          }
+        });
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    std::unique_ptr<RandomAccessFile> file;
+    std::vector<ReadRequest> reqs(3);
+    std::vector<std::unique_ptr<char, Deleter>> data;
+    uint64_t offset = 0;
+    for (size_t i = 0; i < reqs.size(); ++i) {
+      reqs[i].offset = offset;
+      offset += 2 * kSectorSize;
+      reqs[i].len = kSectorSize;
+      data.emplace_back(NewAligned(kSectorSize, 0));
+      reqs[i].scratch = data.back().get();
+    }
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX)
+    if (soptions.use_direct_reads) {
+      soptions.use_direct_reads = false;
+    }
+#endif
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    ASSERT_OK(file->MultiRead(reqs.data(), reqs.size()));
+    for (size_t i = 0; i < reqs.size(); ++i) {
+      auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i * 2 + 1));
+      ASSERT_OK(reqs[i].status);
+      ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0);
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) {
+  // In this test we don't do aligned read, so it doesn't work for
+  // direct I/O case.
+  EnvOptions soptions;
+  soptions.use_direct_reads = soptions.use_direct_writes = false;
+  std::string fname = test::PerThreadDBPath(env_, "testfile");
+
+  const size_t kTotalSize = 81920;
+  Random rnd(301);
+  std::string expected_data = rnd.RandomString(kTotalSize);
+
+  // Create file.
+  {
+    std::unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    ASSERT_OK(wfile->Append(expected_data));
+    ASSERT_OK(wfile->Close());
+  }
+
+  // More attempts to simulate more partial result sequences.
+  for (uint32_t attempt = 0; attempt < 25; attempt++) {
+    // Right now kIoUringDepth is hard coded as 256, so we need very large
+    // number of keys to cover the case of multiple rounds of submissions.
+    // Right now the test latency is still acceptable. If it ends up with
+    // too long, we can modify the io uring depth with SyncPoint here.
+    const int num_reads = rnd.Uniform(512) + 1;
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "UpdateResults::io_uring_result", [&](void* arg) {
+          if (attempt > 5) {
+            // Improve partial result rates in second half of the run to
+            // cover the case of repeated partial results.
+            int odd = (attempt < 15) ? num_reads / 2 : 4;
+            // No failure in first several attempts.
+            size_t& bytes_read = *static_cast<size_t*>(arg);
+            if (rnd.OneIn(odd)) {
+              bytes_read = 0;
+            } else if (rnd.OneIn(odd / 2)) {
+              bytes_read = static_cast<size_t>(
+                  rnd.Uniform(static_cast<int>(bytes_read)));
+            }
+          }
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    // Generate (offset, len) pairs
+    std::set<int> start_offsets;
+    for (int i = 0; i < num_reads; i++) {
+      int rnd_off;
+      // No repeat offsets.
+      while (start_offsets.find(rnd_off = rnd.Uniform(81920)) !=
+             start_offsets.end()) {
+      }
+      start_offsets.insert(rnd_off);
+    }
+    std::vector<size_t> offsets;
+    std::vector<size_t> lens;
+    // std::set already sorted the offsets.
+    for (int so : start_offsets) {
+      offsets.push_back(so);
+    }
+    for (size_t i = 0; i + 1 < offsets.size(); i++) {
+      lens.push_back(static_cast<size_t>(
+          rnd.Uniform(static_cast<int>(offsets[i + 1] - offsets[i])) + 1));
+    }
+    lens.push_back(static_cast<size_t>(
+        rnd.Uniform(static_cast<int>(kTotalSize - offsets.back())) + 1));
+    ASSERT_EQ(num_reads, lens.size());
+
+    // Create requests
+    std::vector<std::string> scratches;
+    scratches.reserve(num_reads);
+    std::vector<ReadRequest> reqs(num_reads);
+    for (size_t i = 0; i < reqs.size(); ++i) {
+      reqs[i].offset = offsets[i];
+      reqs[i].len = lens[i];
+      scratches.emplace_back(reqs[i].len, ' ');
+      reqs[i].scratch = const_cast<char*>(scratches.back().data());
+    }
+
+    // Query the data
+    std::unique_ptr<RandomAccessFile> file;
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    ASSERT_OK(file->MultiRead(reqs.data(), reqs.size()));
+
+    // Validate results
+    for (int i = 0; i < num_reads; ++i) {
+      ASSERT_OK(reqs[i].status);
+      ASSERT_EQ(
+          Slice(expected_data.data() + offsets[i], lens[i]).ToString(true),
+          reqs[i].result.ToString(true));
+    }
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(EnvPosixTest, NonAlignedDirectIOMultiReadBeyondFileSize) {
+  EnvOptions soptions;
+  soptions.use_direct_reads = true;
+  soptions.use_direct_writes = false;
+  std::string fname = test::PerThreadDBPath(env_, "testfile");
+
+  Random rnd(301);
+  std::unique_ptr<WritableFile> wfile;
+  size_t alignment = 0;
+  // Create file.
+  {
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    auto data_ptr = NewAligned(4095, 'b');
+    Slice data_b(data_ptr.get(), 4095);
+    ASSERT_OK(wfile->PositionedAppend(data_b, 0U));
+    ASSERT_OK(wfile->Close());
+  }
+
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD)
+  if (soptions.use_direct_reads) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "NewRandomAccessFile:O_DIRECT", [&](void* arg) {
+          int* val = static_cast<int*>(arg);
+          *val &= ~O_DIRECT;
+        });
+  }
+#endif
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  const int num_reads = 2;
+  // Create requests
+  std::vector<std::string> scratches;
+  scratches.reserve(num_reads);
+  std::vector<ReadRequest> reqs(num_reads);
+
+  std::unique_ptr<RandomAccessFile> file;
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+  alignment = file->GetRequiredBufferAlignment();
+  ASSERT_EQ(num_reads, reqs.size());
+
+  std::vector<std::unique_ptr<char, Deleter>> data;
+
+  std::vector<size_t> offsets = {0, 2047};
+  std::vector<size_t> lens = {2047, 4096 - 2047};
+
+  for (size_t i = 0; i < num_reads; i++) {
+    // Do alignment
+    reqs[i].offset = static_cast<uint64_t>(TruncateToPageBoundary(
+        alignment, static_cast<size_t>(/*offset=*/offsets[i])));
+    reqs[i].len =
+        Roundup(static_cast<size_t>(/*offset=*/offsets[i]) + /*length=*/lens[i],
+                alignment) -
+        reqs[i].offset;
+
+    size_t new_capacity = Roundup(reqs[i].len, alignment);
+    data.emplace_back(NewAligned(new_capacity, 0));
+    reqs[i].scratch = data.back().get();
+  }
+
+  // Query the data
+  ASSERT_OK(file->MultiRead(reqs.data(), reqs.size()));
+
+  // Validate results
+  for (size_t i = 0; i < num_reads; ++i) {
+    ASSERT_OK(reqs[i].status);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif  // ROCKSDB_LITE
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+void GenerateFilesAndRequest(Env* env, const std::string& fname,
+                             std::vector<ReadRequest>* ret_reqs,
+                             std::vector<std::string>* scratches) {
+  const size_t kTotalSize = 81920;
+  Random rnd(301);
+  std::string expected_data = rnd.RandomString(kTotalSize);
+
+  // Create file.
+  {
+    std::unique_ptr<WritableFile> wfile;
+    ASSERT_OK(env->NewWritableFile(fname, &wfile, EnvOptions()));
+    ASSERT_OK(wfile->Append(expected_data));
+    ASSERT_OK(wfile->Close());
+  }
+
+  // Right now kIoUringDepth is hard coded as 256, so we need very large
+  // number of keys to cover the case of multiple rounds of submissions.
+  // Right now the test latency is still acceptable. If it ends up with
+  // too long, we can modify the io uring depth with SyncPoint here.
+  const int num_reads = 3;
+  std::vector<size_t> offsets = {10000, 20000, 30000};
+  std::vector<size_t> lens = {3000, 200, 100};
+
+  // Create requests
+  scratches->reserve(num_reads);
+  std::vector<ReadRequest>& reqs = *ret_reqs;
+  reqs.resize(num_reads);
+  for (int i = 0; i < num_reads; ++i) {
+    reqs[i].offset = offsets[i];
+    reqs[i].len = lens[i];
+    scratches->emplace_back(reqs[i].len, ' ');
+    reqs[i].scratch = const_cast<char*>(scratches->back().data());
+  }
+}
+
+TEST_F(EnvPosixTest, MultiReadIOUringError) {
+  // In this test we don't do aligned read, so we can't do direct I/O.
+  EnvOptions soptions;
+  soptions.use_direct_reads = soptions.use_direct_writes = false;
+  std::string fname = test::PerThreadDBPath(env_, "testfile");
+
+  std::vector<std::string> scratches;
+  std::vector<ReadRequest> reqs;
+  GenerateFilesAndRequest(env_, fname, &reqs, &scratches);
+  // Query the data
+  std::unique_ptr<RandomAccessFile> file;
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+
+  bool io_uring_wait_cqe_called = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return",
+      [&](void* arg) {
+        if (!io_uring_wait_cqe_called) {
+          io_uring_wait_cqe_called = true;
+          ssize_t& ret = *(static_cast<ssize_t*>(arg));
+          ret = 1;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = file->MultiRead(reqs.data(), reqs.size());
+  if (io_uring_wait_cqe_called) {
+    ASSERT_NOK(s);
+  } else {
+    s.PermitUncheckedError();
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(EnvPosixTest, MultiReadIOUringError2) {
+  // In this test we don't do aligned read, so we can't do direct I/O.
+  EnvOptions soptions;
+  soptions.use_direct_reads = soptions.use_direct_writes = false;
+  std::string fname = test::PerThreadDBPath(env_, "testfile");
+
+  std::vector<std::string> scratches;
+  std::vector<ReadRequest> reqs;
+  GenerateFilesAndRequest(env_, fname, &reqs, &scratches);
+  // Query the data
+  std::unique_ptr<RandomAccessFile> file;
+  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+
+  bool io_uring_submit_and_wait_called = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
+      [&](void* arg) {
+        io_uring_submit_and_wait_called = true;
+        ssize_t* ret = static_cast<ssize_t*>(arg);
+        (*ret)--;
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
+      [&](void* arg) {
+        struct io_uring* iu = static_cast<struct io_uring*>(arg);
+        struct io_uring_cqe* cqe;
+        assert(io_uring_wait_cqe(iu, &cqe) == 0);
+        io_uring_cqe_seen(iu, cqe);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = file->MultiRead(reqs.data(), reqs.size());
+  if (io_uring_submit_and_wait_called) {
+    ASSERT_NOK(s);
+  } else {
+    s.PermitUncheckedError();
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif  // ROCKSDB_IOURING_PRESENT
+
+// Only works in linux platforms
+#ifdef OS_WIN
+TEST_P(EnvPosixTestWithParam, DISABLED_InvalidateCache) {
+#else
+TEST_P(EnvPosixTestWithParam, InvalidateCache) {
+#endif
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  EnvOptions soptions;
+  soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
+  std::string fname = test::PerThreadDBPath(env_, "testfile");
+
+  const size_t kSectorSize = 512;
+  auto data = NewAligned(kSectorSize, 0);
+  Slice slice(data.get(), kSectorSize);
+
+  // Create file.
+  {
+    std::unique_ptr<WritableFile> wfile;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX)
+    if (soptions.use_direct_writes) {
+      soptions.use_direct_writes = false;
+    }
+#endif
+    ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
+    ASSERT_OK(wfile->Append(slice));
+    ASSERT_OK(wfile->InvalidateCache(0, 0));
+    ASSERT_OK(wfile->Close());
+  }
+
+  // Random Read
+  {
+    std::unique_ptr<RandomAccessFile> file;
+    auto scratch = NewAligned(kSectorSize, 0);
+    Slice result;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX)
+    if (soptions.use_direct_reads) {
+      soptions.use_direct_reads = false;
+    }
+#endif
+    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
+    ASSERT_OK(file->Read(0, kSectorSize, &result, scratch.get()));
+    ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0);
+    ASSERT_OK(file->InvalidateCache(0, 11));
+    ASSERT_OK(file->InvalidateCache(0, 0));
+  }
+
+  // Sequential Read
+  {
+    std::unique_ptr<SequentialFile> file;
+    auto scratch = NewAligned(kSectorSize, 0);
+    Slice result;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX)
+    if (soptions.use_direct_reads) {
+      soptions.use_direct_reads = false;
+    }
+#endif
+    ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions));
+    if (file->use_direct_io()) {
+      ASSERT_OK(file->PositionedRead(0, kSectorSize, &result, scratch.get()));
+    } else {
+      ASSERT_OK(file->Read(kSectorSize, &result, scratch.get()));
+    }
+    ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0);
+    ASSERT_OK(file->InvalidateCache(0, 11));
+    ASSERT_OK(file->InvalidateCache(0, 0));
+  }
+  // Delete the file
+  ASSERT_OK(env_->DeleteFile(fname));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+}
+#endif  // OS_LINUX || OS_WIN
+
+class TestLogger : public Logger {
+ public:
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override {
+    log_count++;
+
+    char new_format[550];
+    std::fill_n(new_format, sizeof(new_format), '2');
+    {
+      va_list backup_ap;
+      va_copy(backup_ap, ap);
+      int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap);
+      // 48 bytes for extra information + bytes allocated
+
+// When we have n == -1 there is not a terminating zero expected
+#ifdef OS_WIN
+      if (n < 0) {
+        char_0_count++;
+      }
+#endif
+
+      if (new_format[0] == '[') {
+        // "[DEBUG] "
+        ASSERT_TRUE(n <= 56 + (512 - static_cast<int>(sizeof(port::TimeVal))));
+      } else {
+        ASSERT_TRUE(n <= 48 + (512 - static_cast<int>(sizeof(port::TimeVal))));
+      }
+      va_end(backup_ap);
+    }
+
+    for (size_t i = 0; i < sizeof(new_format); i++) {
+      if (new_format[i] == 'x') {
+        char_x_count++;
+      } else if (new_format[i] == '\0') {
+        char_0_count++;
+      }
+    }
+  }
+  int log_count;
+  int char_x_count;
+  int char_0_count;
+};
+
+TEST_P(EnvPosixTestWithParam, LogBufferTest) {
+  TestLogger test_logger;
+  test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+  test_logger.log_count = 0;
+  test_logger.char_x_count = 0;
+  test_logger.char_0_count = 0;
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger);
+  LogBuffer log_buffer_debug(DEBUG_LEVEL, &test_logger);
+
+  char bytes200[200];
+  std::fill_n(bytes200, sizeof(bytes200), '1');
+  bytes200[sizeof(bytes200) - 1] = '\0';
+  char bytes600[600];
+  std::fill_n(bytes600, sizeof(bytes600), '1');
+  bytes600[sizeof(bytes600) - 1] = '\0';
+  char bytes9000[9000];
+  std::fill_n(bytes9000, sizeof(bytes9000), '1');
+  bytes9000[sizeof(bytes9000) - 1] = '\0';
+
+  ROCKS_LOG_BUFFER(&log_buffer, "x%sx", bytes200);
+  ROCKS_LOG_BUFFER(&log_buffer, "x%sx", bytes600);
+  ROCKS_LOG_BUFFER(&log_buffer, "x%sx%sx%sx", bytes200, bytes200, bytes200);
+  ROCKS_LOG_BUFFER(&log_buffer, "x%sx%sx", bytes200, bytes600);
+  ROCKS_LOG_BUFFER(&log_buffer, "x%sx%sx", bytes600, bytes9000);
+
+  ROCKS_LOG_BUFFER(&log_buffer_debug, "x%sx", bytes200);
+  test_logger.SetInfoLogLevel(DEBUG_LEVEL);
+  ROCKS_LOG_BUFFER(&log_buffer_debug, "x%sx%sx%sx", bytes600, bytes9000,
+                   bytes200);
+
+  ASSERT_EQ(0, test_logger.log_count);
+  log_buffer.FlushBufferToLog();
+  log_buffer_debug.FlushBufferToLog();
+  ASSERT_EQ(6, test_logger.log_count);
+  ASSERT_EQ(6, test_logger.char_0_count);
+  ASSERT_EQ(10, test_logger.char_x_count);
+}
+
+class TestLogger2 : public Logger {
+ public:
+  explicit TestLogger2(size_t max_log_size) : max_log_size_(max_log_size) {}
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override {
+    char new_format[2000];
+    std::fill_n(new_format, sizeof(new_format), '2');
+    {
+      va_list backup_ap;
+      va_copy(backup_ap, ap);
+      int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap);
+      // 48 bytes for extra information + bytes allocated
+      ASSERT_TRUE(n <=
+                  48 + static_cast<int>(max_log_size_ - sizeof(port::TimeVal)));
+      ASSERT_TRUE(n > static_cast<int>(max_log_size_ - sizeof(port::TimeVal)));
+      va_end(backup_ap);
+    }
+  }
+  size_t max_log_size_;
+};
+
+TEST_P(EnvPosixTestWithParam, LogBufferMaxSizeTest) {
+  char bytes9000[9000];
+  std::fill_n(bytes9000, sizeof(bytes9000), '1');
+  bytes9000[sizeof(bytes9000) - 1] = '\0';
+
+  for (size_t max_log_size = 256; max_log_size <= 1024;
+       max_log_size += 1024 - 256) {
+    TestLogger2 test_logger(max_log_size);
+    test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+    LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger);
+    ROCKS_LOG_BUFFER_MAX_SZ(&log_buffer, max_log_size, "%s", bytes9000);
+    log_buffer.FlushBufferToLog();
+  }
+}
+
+TEST_P(EnvPosixTestWithParam, Preallocation) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  const std::string src = test::PerThreadDBPath(env_, "testfile");
+  std::unique_ptr<WritableFile> srcfile;
+  EnvOptions soptions;
+  soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD)
+  if (soptions.use_direct_writes) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "NewWritableFile:O_DIRECT", [&](void* arg) {
+          int* val = static_cast<int*>(arg);
+          *val &= ~O_DIRECT;
+        });
+  }
+#endif
+  ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions));
+  srcfile->SetPreallocationBlockSize(1024 * 1024);
+
+  // No writes should mean no preallocation
+  size_t block_size, last_allocated_block;
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 0UL);
+
+  // Small write should preallocate one block
+  size_t kStrSize = 4096;
+  auto data = NewAligned(kStrSize, 'A');
+  Slice str(data.get(), kStrSize);
+  srcfile->PrepareWrite(srcfile->GetFileSize(), kStrSize);
+  ASSERT_OK(srcfile->Append(str));
+  srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+  ASSERT_EQ(last_allocated_block, 1UL);
+
+  // Write an entire preallocation block, make sure we increased by two.
+  {
+    auto buf_ptr = NewAligned(block_size, ' ');
+    Slice buf(buf_ptr.get(), block_size);
+    srcfile->PrepareWrite(srcfile->GetFileSize(), block_size);
+    ASSERT_OK(srcfile->Append(buf));
+    srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+    ASSERT_EQ(last_allocated_block, 2UL);
+  }
+
+  // Write five more blocks at once, ensure we're where we need to be.
+  {
+    auto buf_ptr = NewAligned(block_size * 5, ' ');
+    Slice buf = Slice(buf_ptr.get(), block_size * 5);
+    srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size());
+    ASSERT_OK(srcfile->Append(buf));
+    srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+    ASSERT_EQ(last_allocated_block, 7UL);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+}
+
+// Test that the two ways to get children file attributes (in bulk or
+// individually) behave consistently.
+TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  EnvOptions soptions;
+  soptions.use_direct_reads = soptions.use_direct_writes = direct_io_;
+  const int kNumChildren = 10;
+
+  std::string data;
+  std::string test_base_dir = test::PerThreadDBPath(env_, "env_test_chr_attr");
+  env_->CreateDir(test_base_dir).PermitUncheckedError();
+  for (int i = 0; i < kNumChildren; ++i) {
+    const std::string path = test_base_dir + "/testfile_" + std::to_string(i);
+    std::unique_ptr<WritableFile> file;
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD)
+    if (soptions.use_direct_writes) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "NewWritableFile:O_DIRECT", [&](void* arg) {
+            int* val = static_cast<int*>(arg);
+            *val &= ~O_DIRECT;
+          });
+    }
+#endif
+    ASSERT_OK(env_->NewWritableFile(path, &file, soptions));
+    auto buf_ptr = NewAligned(data.size(), 'T');
+    Slice buf(buf_ptr.get(), data.size());
+    ASSERT_OK(file->Append(buf));
+    data.append(std::string(4096, 'T'));
+  }
+
+  std::vector<Env::FileAttributes> file_attrs;
+  ASSERT_OK(env_->GetChildrenFileAttributes(test_base_dir, &file_attrs));
+  for (int i = 0; i < kNumChildren; ++i) {
+    const std::string name = "testfile_" + std::to_string(i);
+    const std::string path = test_base_dir + "/" + name;
+
+    auto file_attrs_iter = std::find_if(
+        file_attrs.begin(), file_attrs.end(),
+        [&name](const Env::FileAttributes& fm) { return fm.name == name; });
+    ASSERT_TRUE(file_attrs_iter != file_attrs.end());
+    uint64_t size;
+    ASSERT_OK(env_->GetFileSize(path, &size));
+    ASSERT_EQ(size, 4096 * i);
+    ASSERT_EQ(size, file_attrs_iter->size_bytes);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+}
+
+// Test that all WritableFileWrapper forwards all calls to WritableFile.
+TEST_P(EnvPosixTestWithParam, WritableFileWrapper) {
+  class Base : public WritableFile {
+   public:
+    mutable int* step_;
+
+    void inc(int x) const { EXPECT_EQ(x, (*step_)++); }
+
+    explicit Base(int* step) : step_(step) { inc(0); }
+
+    Status Append(const Slice& /*data*/) override {
+      inc(1);
+      return Status::OK();
+    }
+
+    Status Append(
+        const Slice& /*data*/,
+        const DataVerificationInfo& /* verification_info */) override {
+      inc(1);
+      return Status::OK();
+    }
+
+    Status PositionedAppend(const Slice& /*data*/,
+                            uint64_t /*offset*/) override {
+      inc(2);
+      return Status::OK();
+    }
+
+    Status PositionedAppend(
+        const Slice& /*data*/, uint64_t /*offset*/,
+        const DataVerificationInfo& /* verification_info */) override {
+      inc(2);
+      return Status::OK();
+    }
+
+    Status Truncate(uint64_t /*size*/) override {
+      inc(3);
+      return Status::OK();
+    }
+
+    Status Close() override {
+      inc(4);
+      return Status::OK();
+    }
+
+    Status Flush() override {
+      inc(5);
+      return Status::OK();
+    }
+
+    Status Sync() override {
+      inc(6);
+      return Status::OK();
+    }
+
+    Status Fsync() override {
+      inc(7);
+      return Status::OK();
+    }
+
+    bool IsSyncThreadSafe() const override {
+      inc(8);
+      return true;
+    }
+
+    bool use_direct_io() const override {
+      inc(9);
+      return true;
+    }
+
+    size_t GetRequiredBufferAlignment() const override {
+      inc(10);
+      return 0;
+    }
+
+    void SetIOPriority(Env::IOPriority /*pri*/) override { inc(11); }
+
+    Env::IOPriority GetIOPriority() override {
+      inc(12);
+      return Env::IOPriority::IO_LOW;
+    }
+
+    void SetWriteLifeTimeHint(Env::WriteLifeTimeHint /*hint*/) override {
+      inc(13);
+    }
+
+    Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+      inc(14);
+      return Env::WriteLifeTimeHint::WLTH_NOT_SET;
+    }
+
+    uint64_t GetFileSize() override {
+      inc(15);
+      return 0;
+    }
+
+    void SetPreallocationBlockSize(size_t /*size*/) override { inc(16); }
+
+    void GetPreallocationStatus(size_t* /*block_size*/,
+                                size_t* /*last_allocated_block*/) override {
+      inc(17);
+    }
+
+    size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override {
+      inc(18);
+      return 0;
+    }
+
+    Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override {
+      inc(19);
+      return Status::OK();
+    }
+
+    Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) override {
+      inc(20);
+      return Status::OK();
+    }
+
+    void PrepareWrite(size_t /*offset*/, size_t /*len*/) override { inc(21); }
+
+    Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override {
+      inc(22);
+      return Status::OK();
+    }
+
+   public:
+    ~Base() override { inc(23); }
+  };
+
+  class Wrapper : public WritableFileWrapper {
+   public:
+    explicit Wrapper(WritableFile* target) : WritableFileWrapper(target) {}
+  };
+
+  int step = 0;
+
+  {
+    Base b(&step);
+    Wrapper w(&b);
+    ASSERT_OK(w.Append(Slice()));
+    ASSERT_OK(w.PositionedAppend(Slice(), 0));
+    ASSERT_OK(w.Truncate(0));
+    ASSERT_OK(w.Close());
+    ASSERT_OK(w.Flush());
+    ASSERT_OK(w.Sync());
+    ASSERT_OK(w.Fsync());
+    w.IsSyncThreadSafe();
+    w.use_direct_io();
+    w.GetRequiredBufferAlignment();
+    w.SetIOPriority(Env::IOPriority::IO_HIGH);
+    w.GetIOPriority();
+    w.SetWriteLifeTimeHint(Env::WriteLifeTimeHint::WLTH_NOT_SET);
+    w.GetWriteLifeTimeHint();
+    w.GetFileSize();
+    w.SetPreallocationBlockSize(0);
+    w.GetPreallocationStatus(nullptr, nullptr);
+    w.GetUniqueId(nullptr, 0);
+    ASSERT_OK(w.InvalidateCache(0, 0));
+    ASSERT_OK(w.RangeSync(0, 0));
+    w.PrepareWrite(0, 0);
+    ASSERT_OK(w.Allocate(0, 0));
+  }
+
+  EXPECT_EQ(24, step);
+}
+
+TEST_P(EnvPosixTestWithParam, PosixRandomRWFile) {
+  const std::string path = test::PerThreadDBPath(env_, "random_rw_file");
+
+  env_->DeleteFile(path).PermitUncheckedError();
+
+  std::unique_ptr<RandomRWFile> file;
+
+  // Cannot open non-existing file.
+  ASSERT_NOK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+
+  // Create the file using WritableFile
+  {
+    std::unique_ptr<WritableFile> wf;
+    ASSERT_OK(env_->NewWritableFile(path, &wf, EnvOptions()));
+  }
+
+  ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+
+  char buf[10000];
+  Slice read_res;
+
+  ASSERT_OK(file->Write(0, "ABCD"));
+  ASSERT_OK(file->Read(0, 10, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "ABCD");
+
+  ASSERT_OK(file->Write(2, "XXXX"));
+  ASSERT_OK(file->Read(0, 10, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "ABXXXX");
+
+  ASSERT_OK(file->Write(10, "ZZZ"));
+  ASSERT_OK(file->Read(10, 10, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "ZZZ");
+
+  ASSERT_OK(file->Write(11, "Y"));
+  ASSERT_OK(file->Read(10, 10, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "ZYZ");
+
+  ASSERT_OK(file->Write(200, "FFFFF"));
+  ASSERT_OK(file->Read(200, 10, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "FFFFF");
+
+  ASSERT_OK(file->Write(205, "XXXX"));
+  ASSERT_OK(file->Read(200, 10, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "FFFFFXXXX");
+
+  ASSERT_OK(file->Write(5, "QQQQ"));
+  ASSERT_OK(file->Read(0, 9, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "ABXXXQQQQ");
+
+  ASSERT_OK(file->Read(2, 4, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "XXXQ");
+
+  // Close file and reopen it
+  ASSERT_OK(file->Close());
+  ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+
+  ASSERT_OK(file->Read(0, 9, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "ABXXXQQQQ");
+
+  ASSERT_OK(file->Read(10, 3, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "ZYZ");
+
+  ASSERT_OK(file->Read(200, 9, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "FFFFFXXXX");
+
+  ASSERT_OK(file->Write(4, "TTTTTTTTTTTTTTTT"));
+  ASSERT_OK(file->Read(0, 10, &read_res, buf));
+  ASSERT_EQ(read_res.ToString(), "ABXXTTTTTT");
+
+  // Clean up
+  ASSERT_OK(env_->DeleteFile(path));
+}
+
+class RandomRWFileWithMirrorString {
+ public:
+  explicit RandomRWFileWithMirrorString(RandomRWFile* _file) : file_(_file) {}
+
+  void Write(size_t offset, const std::string& data) {
+    // Write to mirror string
+    StringWrite(offset, data);
+
+    // Write to file
+    Status s = file_->Write(offset, data);
+    ASSERT_OK(s) << s.ToString();
+  }
+
+  void Read(size_t offset = 0, size_t n = 1000000) {
+    Slice str_res(nullptr, 0);
+    if (offset < file_mirror_.size()) {
+      size_t str_res_sz = std::min(file_mirror_.size() - offset, n);
+      str_res = Slice(file_mirror_.data() + offset, str_res_sz);
+      StopSliceAtNull(&str_res);
+    }
+
+    Slice file_res;
+    Status s = file_->Read(offset, n, &file_res, buf_);
+    ASSERT_OK(s) << s.ToString();
+    StopSliceAtNull(&file_res);
+
+    ASSERT_EQ(str_res.ToString(), file_res.ToString()) << offset << " " << n;
+  }
+
+  void SetFile(RandomRWFile* _file) { file_ = _file; }
+
+ private:
+  void StringWrite(size_t offset, const std::string& src) {
+    if (offset + src.size() > file_mirror_.size()) {
+      file_mirror_.resize(offset + src.size(), '\0');
+    }
+
+    char* pos = const_cast<char*>(file_mirror_.data() + offset);
+    memcpy(pos, src.data(), src.size());
+  }
+
+  void StopSliceAtNull(Slice* slc) {
+    for (size_t i = 0; i < slc->size(); i++) {
+      if ((*slc)[i] == '\0') {
+        *slc = Slice(slc->data(), i);
+        break;
+      }
+    }
+  }
+
+  char buf_[10000];
+  RandomRWFile* file_;
+  std::string file_mirror_;
+};
+
+TEST_P(EnvPosixTestWithParam, PosixRandomRWFileRandomized) {
+  const std::string path = test::PerThreadDBPath(env_, "random_rw_file_rand");
+  env_->DeleteFile(path).PermitUncheckedError();
+
+  std::unique_ptr<RandomRWFile> file;
+
+#ifdef OS_LINUX
+  // Cannot open non-existing file.
+  ASSERT_NOK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+#endif
+
+  // Create the file using WritableFile
+  {
+    std::unique_ptr<WritableFile> wf;
+    ASSERT_OK(env_->NewWritableFile(path, &wf, EnvOptions()));
+  }
+
+  ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+  RandomRWFileWithMirrorString file_with_mirror(file.get());
+
+  Random rnd(301);
+  std::string buf;
+  for (int i = 0; i < 10000; i++) {
+    // Genrate random data
+    buf = rnd.RandomString(10);
+
+    // Pick random offset for write
+    size_t write_off = rnd.Next() % 1000;
+    file_with_mirror.Write(write_off, buf);
+
+    // Pick random offset for read
+    size_t read_off = rnd.Next() % 1000;
+    size_t read_sz = rnd.Next() % 20;
+    file_with_mirror.Read(read_off, read_sz);
+
+    if (i % 500 == 0) {
+      // Reopen the file every 500 iters
+      ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+      file_with_mirror.SetFile(file.get());
+    }
+  }
+
+  // clean up
+  ASSERT_OK(env_->DeleteFile(path));
+}
+
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv() : EnvWrapper(Env::Default()), close_count(0) {}
+  const char* Name() const override { return "TestEnv"; }
+  class TestLogger : public Logger {
+   public:
+    using Logger::Logv;
+    explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+    ~TestLogger() override {
+      if (!closed_) {
+        Status s = CloseHelper();
+        s.PermitUncheckedError();
+      }
+    }
+    void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+   protected:
+    Status CloseImpl() override { return CloseHelper(); }
+
+   private:
+    Status CloseHelper() {
+      env->CloseCountInc();
+      return Status::OK();
+    }
+    TestEnv* env;
+  };
+
+  void CloseCountInc() { close_count++; }
+
+  int GetCloseCount() { return close_count; }
+
+  Status NewLogger(const std::string& /*fname*/,
+                   std::shared_ptr<Logger>* result) override {
+    result->reset(new TestLogger(this));
+    return Status::OK();
+  }
+
+ private:
+  int close_count;
+};
+
+class EnvTest : public testing::Test {
+ public:
+  EnvTest() : test_directory_(test::PerThreadDBPath("env_test")) {}
+
+ protected:
+  const std::string test_directory_;
+};
+
+TEST_F(EnvTest, Close) {
+  TestEnv* env = new TestEnv();
+  std::shared_ptr<Logger> logger;
+  Status s;
+
+  s = env->NewLogger("", &logger);
+  ASSERT_OK(s);
+  ASSERT_OK(logger.get()->Close());
+  ASSERT_EQ(env->GetCloseCount(), 1);
+  // Call Close() again. CloseHelper() should not be called again
+  ASSERT_OK(logger.get()->Close());
+  ASSERT_EQ(env->GetCloseCount(), 1);
+  logger.reset();
+  ASSERT_EQ(env->GetCloseCount(), 1);
+
+  s = env->NewLogger("", &logger);
+  ASSERT_OK(s);
+  logger.reset();
+  ASSERT_EQ(env->GetCloseCount(), 2);
+
+  delete env;
+}
+
+class LogvWithInfoLogLevelLogger : public Logger {
+ public:
+  using Logger::Logv;
+  void Logv(const InfoLogLevel /* log_level */, const char* /* format */,
+            va_list /* ap */) override {}
+};
+
+TEST_F(EnvTest, LogvWithInfoLogLevel) {
+  // Verifies the log functions work on a `Logger` that only overrides the
+  // `Logv()` overload including `InfoLogLevel`.
+  const std::string kSampleMessage("sample log message");
+  LogvWithInfoLogLevelLogger logger;
+  ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str());
+  ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str());
+  ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str());
+  ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str());
+  ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str());
+  ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str());
+}
+
+INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam,
+                        ::testing::Values(std::pair<Env*, bool>(Env::Default(),
+                                                                false)));
+#if !defined(ROCKSDB_LITE)
+INSTANTIATE_TEST_CASE_P(DefaultEnvWithDirectIO, EnvPosixTestWithParam,
+                        ::testing::Values(std::pair<Env*, bool>(Env::Default(),
+                                                                true)));
+#endif  // !defined(ROCKSDB_LITE)
+
+#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
+static Env* GetChrootEnv() {
+  static std::unique_ptr<Env> chroot_env(
+      NewChrootEnv(Env::Default(), test::TmpDir(Env::Default())));
+  return chroot_env.get();
+}
+INSTANTIATE_TEST_CASE_P(ChrootEnvWithoutDirectIO, EnvPosixTestWithParam,
+                        ::testing::Values(std::pair<Env*, bool>(GetChrootEnv(),
+                                                                false)));
+INSTANTIATE_TEST_CASE_P(ChrootEnvWithDirectIO, EnvPosixTestWithParam,
+                        ::testing::Values(std::pair<Env*, bool>(GetChrootEnv(),
+                                                                true)));
+#endif  // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
+
+class EnvFSTestWithParam
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+ public:
+  EnvFSTestWithParam() {
+    bool env_non_null = std::get<0>(GetParam());
+    bool env_default = std::get<1>(GetParam());
+    bool fs_default = std::get<2>(GetParam());
+
+    env_ = env_non_null ? (env_default ? Env::Default() : nullptr) : nullptr;
+    fs_ = fs_default
+              ? FileSystem::Default()
+              : std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+    if (env_non_null && env_default && !fs_default) {
+      env_ptr_ = NewCompositeEnv(fs_);
+    }
+    if (env_non_null && !env_default && fs_default) {
+      env_ptr_ =
+          std::unique_ptr<Env>(new FaultInjectionTestEnv(Env::Default()));
+      fs_.reset();
+    }
+    if (env_non_null && !env_default && !fs_default) {
+      env_ptr_.reset(new FaultInjectionTestEnv(Env::Default()));
+      composite_env_ptr_.reset(new CompositeEnvWrapper(env_ptr_.get(), fs_));
+      env_ = composite_env_ptr_.get();
+    } else {
+      env_ = env_ptr_.get();
+    }
+
+    dbname1_ = test::PerThreadDBPath("env_fs_test1");
+    dbname2_ = test::PerThreadDBPath("env_fs_test2");
+  }
+
+  ~EnvFSTestWithParam() = default;
+
+  Env* env_;
+  std::unique_ptr<Env> env_ptr_;
+  std::unique_ptr<Env> composite_env_ptr_;
+  std::shared_ptr<FileSystem> fs_;
+  std::string dbname1_;
+  std::string dbname2_;
+};
+
+TEST_P(EnvFSTestWithParam, OptionsTest) {
+  Options opts;
+  opts.env = env_;
+  opts.create_if_missing = true;
+  std::string dbname = dbname1_;
+
+  if (env_) {
+    if (fs_) {
+      ASSERT_EQ(fs_.get(), env_->GetFileSystem().get());
+    } else {
+      ASSERT_NE(FileSystem::Default().get(), env_->GetFileSystem().get());
+    }
+  }
+  for (int i = 0; i < 2; ++i) {
+    DB* db;
+    Status s = DB::Open(opts, dbname, &db);
+    ASSERT_OK(s);
+
+    WriteOptions wo;
+    ASSERT_OK(db->Put(wo, "a", "a"));
+    ASSERT_OK(db->Flush(FlushOptions()));
+    ASSERT_OK(db->Put(wo, "b", "b"));
+    ASSERT_OK(db->Flush(FlushOptions()));
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    std::string val;
+    ASSERT_OK(db->Get(ReadOptions(), "a", &val));
+    ASSERT_EQ("a", val);
+    ASSERT_OK(db->Get(ReadOptions(), "b", &val));
+    ASSERT_EQ("b", val);
+
+    ASSERT_OK(db->Close());
+    delete db;
+    ASSERT_OK(DestroyDB(dbname, opts));
+
+    dbname = dbname2_;
+  }
+}
+
+// The parameters are as follows -
+// 1. True means Options::env is non-null, false means null
+// 2. True means use Env::Default, false means custom
+// 3. True means use FileSystem::Default, false means custom
+INSTANTIATE_TEST_CASE_P(EnvFSTest, EnvFSTestWithParam,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()));
+// This test ensures that default Env and those allocated by
+// NewCompositeEnv() all share the same threadpool
+TEST_F(EnvTest, MultipleCompositeEnv) {
+  std::shared_ptr<FaultInjectionTestFS> fs1 =
+      std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+  std::shared_ptr<FaultInjectionTestFS> fs2 =
+      std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+  std::unique_ptr<Env> env1 = NewCompositeEnv(fs1);
+  std::unique_ptr<Env> env2 = NewCompositeEnv(fs2);
+  Env::Default()->SetBackgroundThreads(8, Env::HIGH);
+  Env::Default()->SetBackgroundThreads(16, Env::LOW);
+  ASSERT_EQ(env1->GetBackgroundThreads(Env::LOW), 16);
+  ASSERT_EQ(env1->GetBackgroundThreads(Env::HIGH), 8);
+  ASSERT_EQ(env2->GetBackgroundThreads(Env::LOW), 16);
+  ASSERT_EQ(env2->GetBackgroundThreads(Env::HIGH), 8);
+}
+
+TEST_F(EnvTest, IsDirectory) {
+  Status s = Env::Default()->CreateDirIfMissing(test_directory_);
+  ASSERT_OK(s);
+  const std::string test_sub_dir = test_directory_ + "sub1";
+  const std::string test_file_path = test_directory_ + "file1";
+  ASSERT_OK(Env::Default()->CreateDirIfMissing(test_sub_dir));
+  bool is_dir = false;
+  ASSERT_OK(Env::Default()->IsDirectory(test_sub_dir, &is_dir));
+  ASSERT_TRUE(is_dir);
+  {
+    std::unique_ptr<FSWritableFile> wfile;
+    s = Env::Default()->GetFileSystem()->NewWritableFile(
+        test_file_path, FileOptions(), &wfile, /*dbg=*/nullptr);
+    ASSERT_OK(s);
+    std::unique_ptr<WritableFileWriter> fwriter;
+    fwriter.reset(new WritableFileWriter(std::move(wfile), test_file_path,
+                                         FileOptions(),
+                                         SystemClock::Default().get()));
+    constexpr char buf[] = "test";
+    s = fwriter->Append(buf);
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(Env::Default()->IsDirectory(test_file_path, &is_dir));
+  ASSERT_FALSE(is_dir);
+}
+
+TEST_F(EnvTest, EnvWriteVerificationTest) {
+  Status s = Env::Default()->CreateDirIfMissing(test_directory_);
+  const std::string test_file_path = test_directory_ + "file1";
+  ASSERT_OK(s);
+  std::shared_ptr<FaultInjectionTestFS> fault_fs(
+      new FaultInjectionTestFS(FileSystem::Default()));
+  fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  std::unique_ptr<WritableFile> file;
+  s = fault_fs_env->NewWritableFile(test_file_path, &file, EnvOptions());
+  ASSERT_OK(s);
+
+  DataVerificationInfo v_info;
+  std::string test_data = "test";
+  std::string checksum;
+  uint32_t v_crc32c = crc32c::Extend(0, test_data.c_str(), test_data.size());
+  PutFixed32(&checksum, v_crc32c);
+  v_info.checksum = Slice(checksum);
+  s = file->Append(Slice(test_data), v_info);
+  ASSERT_OK(s);
+}
+
+class CreateEnvTest : public testing::Test {
+ public:
+  CreateEnvTest() {
+    config_options_.ignore_unknown_options = false;
+    config_options_.ignore_unsupported_options = false;
+  }
+  ConfigOptions config_options_;
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(CreateEnvTest, LoadCTRProvider) {
+  config_options_.invoke_prepare_options = false;
+  std::string CTR = CTREncryptionProvider::kClassName();
+  std::shared_ptr<EncryptionProvider> provider;
+  // Test a provider with no cipher
+  ASSERT_OK(
+      EncryptionProvider::CreateFromString(config_options_, CTR, &provider));
+  ASSERT_NE(provider, nullptr);
+  ASSERT_EQ(provider->Name(), CTR);
+  ASSERT_NOK(provider->PrepareOptions(config_options_));
+  ASSERT_NOK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+  auto cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher");
+  ASSERT_NE(cipher, nullptr);
+  ASSERT_EQ(cipher->get(), nullptr);
+  provider.reset();
+
+  ASSERT_OK(EncryptionProvider::CreateFromString(config_options_,
+                                                 CTR + "://test", &provider));
+  ASSERT_NE(provider, nullptr);
+  ASSERT_EQ(provider->Name(), CTR);
+  ASSERT_OK(provider->PrepareOptions(config_options_));
+  ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+  cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher");
+  ASSERT_NE(cipher, nullptr);
+  ASSERT_NE(cipher->get(), nullptr);
+  ASSERT_STREQ(cipher->get()->Name(), "ROT13");
+  provider.reset();
+
+  ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, "1://test",
+                                                 &provider));
+  ASSERT_NE(provider, nullptr);
+  ASSERT_EQ(provider->Name(), CTR);
+  ASSERT_OK(provider->PrepareOptions(config_options_));
+  ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+  cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher");
+  ASSERT_NE(cipher, nullptr);
+  ASSERT_NE(cipher->get(), nullptr);
+  ASSERT_STREQ(cipher->get()->Name(), "ROT13");
+  provider.reset();
+
+  ASSERT_OK(EncryptionProvider::CreateFromString(
+      config_options_, "id=" + CTR + "; cipher=ROT13", &provider));
+  ASSERT_NE(provider, nullptr);
+  ASSERT_EQ(provider->Name(), CTR);
+  cipher = provider->GetOptions<std::shared_ptr<BlockCipher>>("Cipher");
+  ASSERT_NE(cipher, nullptr);
+  ASSERT_NE(cipher->get(), nullptr);
+  ASSERT_STREQ(cipher->get()->Name(), "ROT13");
+  provider.reset();
+}
+
+TEST_F(CreateEnvTest, LoadROT13Cipher) {
+  std::shared_ptr<BlockCipher> cipher;
+  // Test a provider with no cipher
+  ASSERT_OK(BlockCipher::CreateFromString(config_options_, "ROT13", &cipher));
+  ASSERT_NE(cipher, nullptr);
+  ASSERT_STREQ(cipher->Name(), "ROT13");
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(CreateEnvTest, CreateDefaultSystemClock) {
+  std::shared_ptr<SystemClock> clock, copy;
+  ASSERT_OK(SystemClock::CreateFromString(config_options_,
+                                          SystemClock::kDefaultName(), &clock));
+  ASSERT_NE(clock, nullptr);
+  ASSERT_EQ(clock, SystemClock::Default());
+#ifndef ROCKSDB_LITE
+  std::string opts_str = clock->ToString(config_options_);
+  std::string mismatch;
+  ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(clock->AreEquivalent(config_options_, copy.get(), &mismatch));
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(CreateEnvTest, CreateMockSystemClock) {
+  std::shared_ptr<SystemClock> mock, copy;
+
+  config_options_.registry->AddLibrary("test")->AddFactory<SystemClock>(
+      MockSystemClock::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<SystemClock>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockSystemClock(nullptr));
+        return guard->get();
+      });
+  ASSERT_OK(SystemClock::CreateFromString(
+      config_options_, EmulatedSystemClock::kClassName(), &mock));
+  ASSERT_NE(mock, nullptr);
+  ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName());
+  ASSERT_EQ(mock->Inner(), SystemClock::Default().get());
+  std::string opts_str = mock->ToString(config_options_);
+  std::string mismatch;
+  ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch));
+
+  std::string id = std::string("id=") + EmulatedSystemClock::kClassName() +
+                   ";target=" + MockSystemClock::kClassName();
+
+  ASSERT_OK(SystemClock::CreateFromString(config_options_, id, &mock));
+  ASSERT_NE(mock, nullptr);
+  ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName());
+  ASSERT_NE(mock->Inner(), nullptr);
+  ASSERT_STREQ(mock->Inner()->Name(), MockSystemClock::kClassName());
+  ASSERT_EQ(mock->Inner()->Inner(), SystemClock::Default().get());
+  opts_str = mock->ToString(config_options_);
+  ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_OK(SystemClock::CreateFromString(
+      config_options_, EmulatedSystemClock::kClassName(), &mock));
+}
+
+TEST_F(CreateEnvTest, CreateReadOnlyFileSystem) {
+  std::shared_ptr<FileSystem> fs, copy;
+
+  ASSERT_OK(FileSystem::CreateFromString(
+      config_options_, ReadOnlyFileSystem::kClassName(), &fs));
+  ASSERT_NE(fs, nullptr);
+  ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());
+
+  std::string opts_str = fs->ToString(config_options_);
+  std::string mismatch;
+
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+
+  ASSERT_OK(FileSystem::CreateFromString(
+      config_options_,
+      std::string("id=") + ReadOnlyFileSystem::kClassName() +
+          "; target=" + TimedFileSystem::kClassName(),
+      &fs));
+  ASSERT_NE(fs, nullptr);
+  opts_str = fs->ToString(config_options_);
+  ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName());
+  ASSERT_NE(fs->Inner(), nullptr);
+  ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+TEST_F(CreateEnvTest, CreateTimedFileSystem) {
+  std::shared_ptr<FileSystem> fs, copy;
+
+  ASSERT_OK(FileSystem::CreateFromString(config_options_,
+                                         TimedFileSystem::kClassName(), &fs));
+  ASSERT_NE(fs, nullptr);
+  ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());
+
+  std::string opts_str = fs->ToString(config_options_);
+  std::string mismatch;
+
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+
+  ASSERT_OK(FileSystem::CreateFromString(
+      config_options_,
+      std::string("id=") + TimedFileSystem::kClassName() +
+          "; target=" + ReadOnlyFileSystem::kClassName(),
+      &fs));
+  ASSERT_NE(fs, nullptr);
+  opts_str = fs->ToString(config_options_);
+  ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName());
+  ASSERT_NE(fs->Inner(), nullptr);
+  ASSERT_STREQ(fs->Inner()->Name(), ReadOnlyFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+TEST_F(CreateEnvTest, CreateCountedFileSystem) {
+  std::shared_ptr<FileSystem> fs, copy;
+
+  ASSERT_OK(FileSystem::CreateFromString(config_options_,
+                                         CountedFileSystem::kClassName(), &fs));
+  ASSERT_NE(fs, nullptr);
+  ASSERT_STREQ(fs->Name(), CountedFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());
+
+  std::string opts_str = fs->ToString(config_options_);
+  std::string mismatch;
+
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+
+  ASSERT_OK(FileSystem::CreateFromString(
+      config_options_,
+      std::string("id=") + CountedFileSystem::kClassName() +
+          "; target=" + ReadOnlyFileSystem::kClassName(),
+      &fs));
+  ASSERT_NE(fs, nullptr);
+  opts_str = fs->ToString(config_options_);
+  ASSERT_STREQ(fs->Name(), CountedFileSystem::kClassName());
+  ASSERT_NE(fs->Inner(), nullptr);
+  ASSERT_STREQ(fs->Inner()->Name(), ReadOnlyFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+#ifndef OS_WIN
+TEST_F(CreateEnvTest, CreateChrootFileSystem) {
+  std::shared_ptr<FileSystem> fs, copy;
+  auto tmp_dir = test::TmpDir(Env::Default());
+  // The Chroot FileSystem has a required "chroot_dir" option.
+  ASSERT_NOK(FileSystem::CreateFromString(config_options_,
+                                          ChrootFileSystem::kClassName(), &fs));
+
+  // ChrootFileSystem fails with an invalid directory
+  ASSERT_NOK(FileSystem::CreateFromString(
+      config_options_,
+      std::string("chroot_dir=/No/Such/Directory; id=") +
+          ChrootFileSystem::kClassName(),
+      &fs));
+  std::string chroot_opts = std::string("chroot_dir=") + tmp_dir +
+                            std::string("; id=") +
+                            ChrootFileSystem::kClassName();
+
+  // Create a valid ChrootFileSystem with an inner Default
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, chroot_opts, &fs));
+  ASSERT_NE(fs, nullptr);
+  ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());
+  std::string opts_str = fs->ToString(config_options_);
+  std::string mismatch;
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+
+  // Create a valid ChrootFileSystem with an inner TimedFileSystem
+  ASSERT_OK(FileSystem::CreateFromString(
+      config_options_,
+      chroot_opts + "; target=" + TimedFileSystem::kClassName(), &fs));
+  ASSERT_NE(fs, nullptr);
+  ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName());
+  ASSERT_NE(fs->Inner(), nullptr);
+  ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
+  opts_str = fs->ToString(config_options_);
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+
+  // Create a TimedFileSystem with an inner ChrootFileSystem
+  ASSERT_OK(FileSystem::CreateFromString(
+      config_options_,
+      "target={" + chroot_opts + "}; id=" + TimedFileSystem::kClassName(),
+      &fs));
+  ASSERT_NE(fs, nullptr);
+  ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName());
+  ASSERT_NE(fs->Inner(), nullptr);
+  ASSERT_STREQ(fs->Inner()->Name(), ChrootFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
+  opts_str = fs->ToString(config_options_);
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+#endif  // OS_WIN
+
+TEST_F(CreateEnvTest, CreateEncryptedFileSystem) {
+  std::shared_ptr<FileSystem> fs, copy;
+
+  std::string base_opts =
+      std::string("provider=1://test; id=") + EncryptedFileSystem::kClassName();
+  // The EncryptedFileSystem requires a "provider" option.
+  ASSERT_NOK(FileSystem::CreateFromString(
+      config_options_, EncryptedFileSystem::kClassName(), &fs));
+
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, base_opts, &fs));
+
+  ASSERT_NE(fs, nullptr);
+  ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner(), FileSystem::Default().get());
+  std::string opts_str = fs->ToString(config_options_);
+  std::string mismatch;
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_OK(FileSystem::CreateFromString(
+      config_options_, base_opts + "; target=" + TimedFileSystem::kClassName(),
+      &fs));
+  ASSERT_NE(fs, nullptr);
+  ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName());
+  ASSERT_NE(fs->Inner(), nullptr);
+  ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName());
+  ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get());
+  opts_str = fs->ToString(config_options_);
+  ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+#endif  // ROCKSDB_LITE
+
+namespace {
+
+constexpr size_t kThreads = 8;
+constexpr size_t kIdsPerThread = 1000;
+
+// This is a mini-stress test to check for duplicates in functions like
+// GenerateUniqueId()
+template <typename IdType, class Hash = std::hash<IdType>>
+struct NoDuplicateMiniStressTest {
+  std::unordered_set<IdType, Hash> ids;
+  std::mutex mutex;
+  Env* env;
+
+  NoDuplicateMiniStressTest() { env = Env::Default(); }
+
+  virtual ~NoDuplicateMiniStressTest() {}
+
+  void Run() {
+    std::array<std::thread, kThreads> threads;
+    for (size_t i = 0; i < kThreads; ++i) {
+      threads[i] = std::thread([&]() { ThreadFn(); });
+    }
+    for (auto& thread : threads) {
+      thread.join();
+    }
+    // All must be unique
+    ASSERT_EQ(ids.size(), kThreads * kIdsPerThread);
+  }
+
+  void ThreadFn() {
+    std::array<IdType, kIdsPerThread> my_ids;
+    // Generate in parallel threads as fast as possible
+    for (size_t i = 0; i < kIdsPerThread; ++i) {
+      my_ids[i] = Generate();
+    }
+    // Now collate
+    std::lock_guard<std::mutex> lock(mutex);
+    for (auto& id : my_ids) {
+      ids.insert(id);
+    }
+  }
+
+  virtual IdType Generate() = 0;
+};
+
+void VerifyRfcUuids(const std::unordered_set<std::string>& uuids) {
+  if (uuids.empty()) {
+    return;
+  }
+}
+
+using uint64_pair_t = std::pair<uint64_t, uint64_t>;
+struct HashUint64Pair {
+  std::size_t operator()(
+      std::pair<uint64_t, uint64_t> const& u) const noexcept {
+    // Assume suitable distribution already
+    return static_cast<size_t>(u.first ^ u.second);
+  }
+};
+
+}  // namespace
+
+TEST_F(EnvTest, GenerateUniqueId) {
+  struct MyStressTest : public NoDuplicateMiniStressTest<std::string> {
+    std::string Generate() override { return env->GenerateUniqueId(); }
+  };
+
+  MyStressTest t;
+  t.Run();
+
+  // Basically verify RFC-4122 format
+  for (auto& uuid : t.ids) {
+    ASSERT_EQ(36U, uuid.size());
+    ASSERT_EQ('-', uuid[8]);
+    ASSERT_EQ('-', uuid[13]);
+    ASSERT_EQ('-', uuid[18]);
+    ASSERT_EQ('-', uuid[23]);
+  }
+}
+
+TEST_F(EnvTest, GenerateDbSessionId) {
+  struct MyStressTest : public NoDuplicateMiniStressTest<std::string> {
+    std::string Generate() override { return DBImpl::GenerateDbSessionId(env); }
+  };
+
+  MyStressTest t;
+  t.Run();
+
+  // Basically verify session ID
+  for (auto& id : t.ids) {
+    ASSERT_EQ(20U, id.size());
+  }
+}
+
+constexpr bool kRequirePortGenerateRfcUuid =
+#if defined(OS_LINUX) || defined(OS_ANDROID) || defined(OS_WIN)
+    true;
+#else
+    false;
+#endif
+
+TEST_F(EnvTest, PortGenerateRfcUuid) {
+  if (!kRequirePortGenerateRfcUuid) {
+    ROCKSDB_GTEST_SKIP("Not supported/expected on this platform");
+    return;
+  }
+  struct MyStressTest : public NoDuplicateMiniStressTest<std::string> {
+    std::string Generate() override {
+      std::string u;
+      assert(port::GenerateRfcUuid(&u));
+      return u;
+    }
+  };
+
+  MyStressTest t;
+  t.Run();
+
+  // Extra verification on versions and variants
+  VerifyRfcUuids(t.ids);
+}
+
+// Test the atomic, linear generation of GenerateRawUuid
+TEST_F(EnvTest, GenerateRawUniqueId) {
+  struct MyStressTest
+      : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
+    uint64_pair_t Generate() override {
+      uint64_pair_t p;
+      GenerateRawUniqueId(&p.first, &p.second);
+      return p;
+    }
+  };
+
+  MyStressTest t;
+  t.Run();
+}
+
+// Test that each entropy source ("track") is at least adequate
+TEST_F(EnvTest, GenerateRawUniqueIdTrackPortUuidOnly) {
+  if (!kRequirePortGenerateRfcUuid) {
+    ROCKSDB_GTEST_SKIP("Not supported/expected on this platform");
+    return;
+  }
+
+  struct MyStressTest
+      : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
+    uint64_pair_t Generate() override {
+      uint64_pair_t p;
+      TEST_GenerateRawUniqueId(&p.first, &p.second, false, true, true);
+      return p;
+    }
+  };
+
+  MyStressTest t;
+  t.Run();
+}
+
+TEST_F(EnvTest, GenerateRawUniqueIdTrackEnvDetailsOnly) {
+  struct MyStressTest
+      : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
+    uint64_pair_t Generate() override {
+      uint64_pair_t p;
+      TEST_GenerateRawUniqueId(&p.first, &p.second, true, false, true);
+      return p;
+    }
+  };
+
+  MyStressTest t;
+  t.Run();
+}
+
+TEST_F(EnvTest, GenerateRawUniqueIdTrackRandomDeviceOnly) {
+  struct MyStressTest
+      : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
+    uint64_pair_t Generate() override {
+      uint64_pair_t p;
+      TEST_GenerateRawUniqueId(&p.first, &p.second, true, true, false);
+      return p;
+    }
+  };
+
+  MyStressTest t;
+  t.Run();
+}
+
+TEST_F(EnvTest, SemiStructuredUniqueIdGenTest) {
+  // Must be thread safe and usable as a static
+  static SemiStructuredUniqueIdGen gen;
+
+  struct MyStressTest
+      : public NoDuplicateMiniStressTest<uint64_pair_t, HashUint64Pair> {
+    uint64_pair_t Generate() override {
+      uint64_pair_t p;
+      gen.GenerateNext(&p.first, &p.second);
+      return p;
+    }
+  };
+
+  MyStressTest t;
+  t.Run();
+}
+
+TEST_F(EnvTest, FailureToCreateLockFile) {
+  auto env = Env::Default();
+  auto fs = env->GetFileSystem();
+  std::string dir = test::PerThreadDBPath(env, "lockdir");
+  std::string file = dir + "/lockfile";
+
+  // Ensure directory doesn't exist
+  ASSERT_OK(DestroyDir(env, dir));
+
+  // Make sure that we can acquire a file lock after the first attempt fails
+  FileLock* lock = nullptr;
+  ASSERT_NOK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr));
+  ASSERT_FALSE(lock);
+
+  ASSERT_OK(fs->CreateDir(dir, IOOptions(), /*dbg*/ nullptr));
+  ASSERT_OK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr));
+  ASSERT_OK(fs->UnlockFile(lock, IOOptions(), /*dbg*/ nullptr));
+
+  // Clean up
+  ASSERT_OK(DestroyDir(env, dir));
+}
+
+TEST_F(CreateEnvTest, CreateDefaultEnv) {
+  ConfigOptions options;
+  options.ignore_unsupported_options = false;
+
+  std::shared_ptr<Env> guard;
+  Env* env = nullptr;
+  ASSERT_OK(Env::CreateFromString(options, "", &env));
+  ASSERT_EQ(env, Env::Default());
+
+  env = nullptr;
+  ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env));
+  ASSERT_EQ(env, Env::Default());
+
+  env = nullptr;
+  ASSERT_OK(Env::CreateFromString(options, "", &env, &guard));
+  ASSERT_EQ(env, Env::Default());
+  ASSERT_EQ(guard, nullptr);
+
+  env = nullptr;
+  ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env, &guard));
+  ASSERT_EQ(env, Env::Default());
+  ASSERT_EQ(guard, nullptr);
+
+#ifndef ROCKSDB_LITE
+  std::string opt_str = env->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env));
+  ASSERT_EQ(env, Env::Default());
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard));
+  ASSERT_EQ(env, Env::Default());
+  ASSERT_EQ(guard, nullptr);
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+class WrappedEnv : public EnvWrapper {
+ public:
+  explicit WrappedEnv(Env* t) : EnvWrapper(t) {}
+  explicit WrappedEnv(const std::shared_ptr<Env>& t) : EnvWrapper(t) {}
+  static const char* kClassName() { return "WrappedEnv"; }
+  const char* Name() const override { return kClassName(); }
+  static void Register(ObjectLibrary& lib, const std::string& /*arg*/) {
+    lib.AddFactory<Env>(
+        WrappedEnv::kClassName(),
+        [](const std::string& /*uri*/, std::unique_ptr<Env>* guard,
+           std::string* /* errmsg */) {
+          guard->reset(new WrappedEnv(nullptr));
+          return guard->get();
+        });
+  }
+};
+}  // namespace
+TEST_F(CreateEnvTest, CreateMockEnv) {
+  ConfigOptions options;
+  options.ignore_unsupported_options = false;
+  WrappedEnv::Register(*(options.registry->AddLibrary("test")), "");
+  std::shared_ptr<Env> guard, copy;
+  std::string opt_str;
+
+  Env* env = nullptr;
+  ASSERT_NOK(Env::CreateFromString(options, MockEnv::kClassName(), &env));
+  ASSERT_OK(
+      Env::CreateFromString(options, MockEnv::kClassName(), &env, &guard));
+  ASSERT_NE(env, nullptr);
+  ASSERT_NE(env, Env::Default());
+  opt_str = env->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
+  ASSERT_NE(copy, guard);
+  std::string mismatch;
+  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));
+  guard.reset(MockEnv::Create(Env::Default(), SystemClock::Default()));
+  opt_str = guard->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
+  std::unique_ptr<Env> wrapped_env(new WrappedEnv(Env::Default()));
+  guard.reset(MockEnv::Create(wrapped_env.get(), SystemClock::Default()));
+  opt_str = guard->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
+  opt_str = copy->ToString(options);
+}
+
+TEST_F(CreateEnvTest, CreateWrappedEnv) {
+  ConfigOptions options;
+  options.ignore_unsupported_options = false;
+  WrappedEnv::Register(*(options.registry->AddLibrary("test")), "");
+  Env* env = nullptr;
+  std::shared_ptr<Env> guard, copy;
+  std::string opt_str;
+  std::string mismatch;
+
+  ASSERT_NOK(Env::CreateFromString(options, WrappedEnv::kClassName(), &env));
+  ASSERT_OK(
+      Env::CreateFromString(options, WrappedEnv::kClassName(), &env, &guard));
+  ASSERT_NE(env, nullptr);
+  ASSERT_NE(env, Env::Default());
+  ASSERT_FALSE(guard->AreEquivalent(options, Env::Default(), &mismatch));
+
+  opt_str = env->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
+  ASSERT_NE(copy, guard);
+  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));
+
+  guard.reset(new WrappedEnv(std::make_shared<WrappedEnv>(Env::Default())));
+  ASSERT_NE(guard.get(), env);
+  opt_str = guard->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
+  ASSERT_NE(copy, guard);
+  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));
+
+  guard.reset(new WrappedEnv(std::make_shared<WrappedEnv>(
+      std::make_shared<WrappedEnv>(Env::Default()))));
+  ASSERT_NE(guard.get(), env);
+  opt_str = guard->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
+  ASSERT_NE(copy, guard);
+  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));
+}
+
+TEST_F(CreateEnvTest, CreateCompositeEnv) {
+  ConfigOptions options;
+  options.ignore_unsupported_options = false;
+  std::shared_ptr<Env> guard, copy;
+  Env* env = nullptr;
+  std::string mismatch, opt_str;
+
+  WrappedEnv::Register(*(options.registry->AddLibrary("test")), "");
+  std::unique_ptr<Env> base(NewCompositeEnv(FileSystem::Default()));
+  std::unique_ptr<Env> wrapped(new WrappedEnv(Env::Default()));
+  std::shared_ptr<FileSystem> timed_fs =
+      std::make_shared<TimedFileSystem>(FileSystem::Default());
+  std::shared_ptr<SystemClock> clock =
+      std::make_shared<EmulatedSystemClock>(SystemClock::Default());
+
+  opt_str = base->ToString(options);
+  ASSERT_NOK(Env::CreateFromString(options, opt_str, &env));
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard));
+  ASSERT_NE(env, nullptr);
+  ASSERT_NE(env, Env::Default());
+  ASSERT_EQ(env->GetFileSystem(), FileSystem::Default());
+  ASSERT_EQ(env->GetSystemClock(), SystemClock::Default());
+
+  base = NewCompositeEnv(timed_fs);
+  opt_str = base->ToString(options);
+  ASSERT_NOK(Env::CreateFromString(options, opt_str, &env));
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard));
+  ASSERT_NE(env, nullptr);
+  ASSERT_NE(env, Env::Default());
+  ASSERT_NE(env->GetFileSystem(), FileSystem::Default());
+  ASSERT_EQ(env->GetSystemClock(), SystemClock::Default());
+
+  env = nullptr;
+  guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs));
+  opt_str = guard->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
+  ASSERT_NE(env, nullptr);
+  ASSERT_NE(env, Env::Default());
+  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));
+
+  env = nullptr;
+  guard.reset(new CompositeEnvWrapper(wrapped.get(), clock));
+  opt_str = guard->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
+  ASSERT_NE(env, nullptr);
+  ASSERT_NE(env, Env::Default());
+  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));
+
+  env = nullptr;
+  guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs, clock));
+  opt_str = guard->ToString(options);
+  ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &copy));
+  ASSERT_NE(env, nullptr);
+  ASSERT_NE(env, Env::Default());
+  ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch));
+
+  guard.reset(new CompositeEnvWrapper(nullptr, timed_fs, clock));
+  ColumnFamilyOptions cf_opts;
+  DBOptions db_opts;
+  db_opts.env = guard.get();
+  auto comp = db_opts.env->CheckedCast<CompositeEnvWrapper>();
+  ASSERT_NE(comp, nullptr);
+  ASSERT_EQ(comp->Inner(), nullptr);
+  ASSERT_NOK(ValidateOptions(db_opts, cf_opts));
+  ASSERT_OK(db_opts.env->PrepareOptions(options));
+  ASSERT_NE(comp->Inner(), nullptr);
+  ASSERT_OK(ValidateOptions(db_opts, cf_opts));
+}
+#endif  // ROCKSDB_LITE
+
+// Forward declaration
+class ReadAsyncFS;
+
+struct MockIOHandle {
+  std::function<void(const FSReadRequest&, void*)> cb;
+  void* cb_arg;
+  bool create_io_error;
+};
+
+// ReadAsyncFS and ReadAsyncRandomAccessFile mocks the FS doing asynchronous
+// reads by creating threads that submit read requests and then calling Poll API
+// to obtain those results.
+class ReadAsyncRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+  ReadAsyncRandomAccessFile(ReadAsyncFS& fs,
+                            std::unique_ptr<FSRandomAccessFile>& file)
+      : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
+
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override;
+
+ private:
+  ReadAsyncFS& fs_;
+  std::unique_ptr<FSRandomAccessFile> file_;
+  int counter = 0;
+};
+
+class ReadAsyncFS : public FileSystemWrapper {
+ public:
+  explicit ReadAsyncFS(const std::shared_ptr<FileSystem>& wrapped)
+      : FileSystemWrapper(wrapped) {}
+
+  static const char* kClassName() { return "ReadAsyncFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    std::unique_ptr<FSRandomAccessFile> file;
+    IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+    EXPECT_OK(s);
+    result->reset(new ReadAsyncRandomAccessFile(*this, file));
+    return s;
+  }
+
+  IOStatus Poll(std::vector<void*>& io_handles,
+                size_t /*min_completions*/) override {
+    // Wait for the threads completion.
+    for (auto& t : workers) {
+      t.join();
+    }
+
+    for (size_t i = 0; i < io_handles.size(); i++) {
+      MockIOHandle* handle = static_cast<MockIOHandle*>(io_handles[i]);
+      if (handle->create_io_error) {
+        FSReadRequest req;
+        req.status = IOStatus::IOError();
+        handle->cb(req, handle->cb_arg);
+      }
+    }
+    return IOStatus::OK();
+  }
+
+  std::vector<std::thread> workers;
+};
+
+IOStatus ReadAsyncRandomAccessFile::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+  IOHandleDeleter deletefn = [](void* args) -> void {
+    delete (static_cast<MockIOHandle*>(args));
+    args = nullptr;
+  };
+  *del_fn = deletefn;
+
+  // Allocate and populate io_handle.
+  MockIOHandle* mock_handle = new MockIOHandle();
+  bool create_io_error = false;
+  if (counter % 2) {
+    create_io_error = true;
+  }
+  mock_handle->create_io_error = create_io_error;
+  mock_handle->cb = cb;
+  mock_handle->cb_arg = cb_arg;
+  *io_handle = static_cast<void*>(mock_handle);
+  counter++;
+
+  // Submit read request asynchronously.
+  std::function<void(FSReadRequest)> submit_request =
+      [&opts, cb, cb_arg, dbg, create_io_error, this](FSReadRequest _req) {
+        if (!create_io_error) {
+          _req.status = target()->Read(_req.offset, _req.len, opts,
+                                       &(_req.result), _req.scratch, dbg);
+          cb(_req, cb_arg);
+        }
+      };
+
+  fs_.workers.emplace_back(submit_request, req);
+  return IOStatus::OK();
+}
+
+class TestAsyncRead : public testing::Test {
+ public:
+  TestAsyncRead() { env_ = Env::Default(); }
+  Env* env_;
+};
+
+// Tests the default implementation of ReadAsync API.
+TEST_F(TestAsyncRead, ReadAsync) {
+  EnvOptions soptions;
+  std::shared_ptr<ReadAsyncFS> fs =
+      std::make_shared<ReadAsyncFS>(env_->GetFileSystem());
+
+  std::string fname = test::PerThreadDBPath(env_, "testfile");
+
+  const size_t kSectorSize = 4096;
+  const size_t kNumSectors = 8;
+
+  // 1. create & write to a file.
+  {
+    std::unique_ptr<FSWritableFile> wfile;
+    ASSERT_OK(
+        fs->NewWritableFile(fname, FileOptions(), &wfile, nullptr /*dbg*/));
+
+    for (size_t i = 0; i < kNumSectors; ++i) {
+      auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
+      Slice slice(data.get(), kSectorSize);
+      ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
+    }
+    ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  }
+  // 2. Read file
+  {
+    std::unique_ptr<FSRandomAccessFile> file;
+    ASSERT_OK(fs->NewRandomAccessFile(fname, FileOptions(), &file, nullptr));
+
+    IOOptions opts;
+    std::vector<void*> io_handles(kNumSectors);
+    std::vector<FSReadRequest> reqs(kNumSectors);
+    std::vector<std::unique_ptr<char, Deleter>> data;
+    std::vector<size_t> vals;
+    IOHandleDeleter del_fn;
+    uint64_t offset = 0;
+
+    // Initialize read requests
+    for (size_t i = 0; i < kNumSectors; i++) {
+      reqs[i].offset = offset;
+      reqs[i].len = kSectorSize;
+      data.emplace_back(NewAligned(kSectorSize, 0));
+      reqs[i].scratch = data.back().get();
+      vals.push_back(i);
+      offset += kSectorSize;
+    }
+
+    // callback function passed to async read.
+    std::function<void(const FSReadRequest&, void*)> callback =
+        [&](const FSReadRequest& req, void* cb_arg) {
+          assert(cb_arg != nullptr);
+          size_t i = *(reinterpret_cast<size_t*>(cb_arg));
+          reqs[i].offset = req.offset;
+          reqs[i].result = req.result;
+          reqs[i].status = req.status;
+        };
+
+    // Submit asynchronous read requests.
+    for (size_t i = 0; i < kNumSectors; i++) {
+      void* cb_arg = static_cast<void*>(&(vals[i]));
+      ASSERT_OK(file->ReadAsync(reqs[i], opts, callback, cb_arg,
+                                &(io_handles[i]), &del_fn, nullptr));
+    }
+
+    // Poll for the submitted requests.
+    fs->Poll(io_handles, kNumSectors);
+
+    // Check the status of read requests.
+    for (size_t i = 0; i < kNumSectors; i++) {
+      if (i % 2) {
+        ASSERT_EQ(reqs[i].status, IOStatus::IOError());
+      } else {
+        auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
+        Slice expected_data(buf.get(), kSectorSize);
+
+        ASSERT_EQ(reqs[i].offset, i * kSectorSize);
+        ASSERT_OK(reqs[i].status);
+        ASSERT_EQ(expected_data.ToString(), reqs[i].result.ToString());
+      }
+    }
+
+    // Delete io_handles.
+    for (size_t i = 0; i < io_handles.size(); i++) {
+      del_fn(io_handles[i]);
+    }
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/env/file_system.cc b/src/rocksdb/env/file_system.cc
new file mode 100644
index 000000000..f9dda429a
--- /dev/null
+++ b/src/rocksdb/env/file_system.cc
@@ -0,0 +1,290 @@
+// Copyright (c) 2019-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "rocksdb/file_system.h"
+
+#include "env/composite_env_wrapper.h"
+#include "env/env_chroot.h"
+#include "env/env_encryption_ctr.h"
+#include "env/fs_readonly.h"
+#include "env/mock_env.h"
+#include "logging/env_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+#include "utilities/counted_fs.h"
+#include "utilities/env_timed.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FileSystem::FileSystem() {}
+
+FileSystem::~FileSystem() {}
+
+Status FileSystem::Load(const std::string& value,
+                        std::shared_ptr<FileSystem>* result) {
+  return CreateFromString(ConfigOptions(), value, result);
+}
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinFileSystems(ObjectLibrary& library,
+                                      const std::string& /*arg*/) {
+  library.AddFactory<FileSystem>(
+      TimedFileSystem::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TimedFileSystem(nullptr));
+        return guard->get();
+      });
+  library.AddFactory<FileSystem>(
+      ReadOnlyFileSystem::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new ReadOnlyFileSystem(nullptr));
+        return guard->get();
+      });
+  library.AddFactory<FileSystem>(
+      EncryptedFileSystem::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
+         std::string* errmsg) {
+        Status s = NewEncryptedFileSystemImpl(nullptr, nullptr, guard);
+        if (!s.ok()) {
+          *errmsg = s.ToString();
+        }
+        return guard->get();
+      });
+  library.AddFactory<FileSystem>(
+      CountedFileSystem::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(new CountedFileSystem(FileSystem::Default()));
+        return guard->get();
+      });
+  library.AddFactory<FileSystem>(
+      MockFileSystem::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(new MockFileSystem(SystemClock::Default()));
+        return guard->get();
+      });
+#ifndef OS_WIN
+  library.AddFactory<FileSystem>(
+      ChrootFileSystem::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new ChrootFileSystem(nullptr, ""));
+        return guard->get();
+      });
+#endif  // OS_WIN
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+
+Status FileSystem::CreateFromString(const ConfigOptions& config_options,
+                                    const std::string& value,
+                                    std::shared_ptr<FileSystem>* result) {
+  auto default_fs = FileSystem::Default();
+  if (default_fs->IsInstanceOf(value)) {
+    *result = default_fs;
+    return Status::OK();
+  } else {
+#ifndef ROCKSDB_LITE
+    static std::once_flag once;
+    std::call_once(once, [&]() {
+      RegisterBuiltinFileSystems(*(ObjectLibrary::Default().get()), "");
+    });
+#endif  // ROCKSDB_LITE
+    return LoadSharedObject<FileSystem>(config_options, value, nullptr, result);
+  }
+}
+
+IOStatus FileSystem::ReuseWritableFile(const std::string& fname,
+                                       const std::string& old_fname,
+                                       const FileOptions& opts,
+                                       std::unique_ptr<FSWritableFile>* result,
+                                       IODebugContext* dbg) {
+  IOStatus s = RenameFile(old_fname, fname, opts.io_options, dbg);
+  if (!s.ok()) {
+    return s;
+  }
+  return NewWritableFile(fname, opts, result, dbg);
+}
+
+IOStatus FileSystem::NewLogger(const std::string& fname,
+                               const IOOptions& io_opts,
+                               std::shared_ptr<Logger>* result,
+                               IODebugContext* dbg) {
+  FileOptions options;
+  options.io_options = io_opts;
+  // TODO: Tune the buffer size.
+  options.writable_file_max_buffer_size = 1024 * 1024;
+  std::unique_ptr<FSWritableFile> writable_file;
+  const IOStatus status = NewWritableFile(fname, options, &writable_file, dbg);
+  if (!status.ok()) {
+    return status;
+  }
+
+  *result = std::make_shared<EnvLogger>(std::move(writable_file), fname,
+                                        options, Env::Default());
+  return IOStatus::OK();
+}
+
+FileOptions FileSystem::OptimizeForLogRead(
+    const FileOptions& file_options) const {
+  FileOptions optimized_file_options(file_options);
+  optimized_file_options.use_direct_reads = false;
+  return optimized_file_options;
+}
+
+FileOptions FileSystem::OptimizeForManifestRead(
+    const FileOptions& file_options) const {
+  FileOptions optimized_file_options(file_options);
+  optimized_file_options.use_direct_reads = false;
+  return optimized_file_options;
+}
+
+FileOptions FileSystem::OptimizeForLogWrite(const FileOptions& file_options,
+                                            const DBOptions& db_options) const {
+  FileOptions optimized_file_options(file_options);
+  optimized_file_options.bytes_per_sync = db_options.wal_bytes_per_sync;
+  optimized_file_options.writable_file_max_buffer_size =
+      db_options.writable_file_max_buffer_size;
+  return optimized_file_options;
+}
+
+FileOptions FileSystem::OptimizeForManifestWrite(
+    const FileOptions& file_options) const {
+  return file_options;
+}
+
+FileOptions FileSystem::OptimizeForCompactionTableWrite(
+    const FileOptions& file_options,
+    const ImmutableDBOptions& db_options) const {
+  FileOptions optimized_file_options(file_options);
+  optimized_file_options.use_direct_writes =
+      db_options.use_direct_io_for_flush_and_compaction;
+  return optimized_file_options;
+}
+
+FileOptions FileSystem::OptimizeForCompactionTableRead(
+    const FileOptions& file_options,
+    const ImmutableDBOptions& db_options) const {
+  FileOptions optimized_file_options(file_options);
+  optimized_file_options.use_direct_reads = db_options.use_direct_reads;
+  return optimized_file_options;
+}
+
+FileOptions FileSystem::OptimizeForBlobFileRead(
+    const FileOptions& file_options,
+    const ImmutableDBOptions& db_options) const {
+  FileOptions optimized_file_options(file_options);
+  optimized_file_options.use_direct_reads = db_options.use_direct_reads;
+  return optimized_file_options;
+}
+
+IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
+                           const std::string& fname, bool should_sync) {
+  std::unique_ptr<FSWritableFile> file;
+  EnvOptions soptions;
+  IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  s = file->Append(data, IOOptions(), nullptr);
+  if (s.ok() && should_sync) {
+    s = file->Sync(IOOptions(), nullptr);
+  }
+  if (!s.ok()) {
+    fs->DeleteFile(fname, IOOptions(), nullptr);
+  }
+  return s;
+}
+
+IOStatus ReadFileToString(FileSystem* fs, const std::string& fname,
+                          std::string* data) {
+  FileOptions soptions;
+  data->clear();
+  std::unique_ptr<FSSequentialFile> file;
+  IOStatus s = status_to_io_status(
+      fs->NewSequentialFile(fname, soptions, &file, nullptr));
+  if (!s.ok()) {
+    return s;
+  }
+  static const int kBufferSize = 8192;
+  char* space = new char[kBufferSize];
+  while (true) {
+    Slice fragment;
+    s = file->Read(kBufferSize, IOOptions(), &fragment, space, nullptr);
+    if (!s.ok()) {
+      break;
+    }
+    data->append(fragment.data(), fragment.size());
+    if (fragment.empty()) {
+      break;
+    }
+  }
+  delete[] space;
+  return s;
+}
+
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo> fs_wrapper_type_info = {
+#ifndef ROCKSDB_LITE
+    {"target",
+     OptionTypeInfo::AsCustomSharedPtr<FileSystem>(
+         0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)},
+#endif  // ROCKSDB_LITE
+};
+}  // namespace
+FileSystemWrapper::FileSystemWrapper(const std::shared_ptr<FileSystem>& t)
+    : target_(t) {
+  RegisterOptions("", &target_, &fs_wrapper_type_info);
+}
+
+Status FileSystemWrapper::PrepareOptions(const ConfigOptions& options) {
+  if (target_ == nullptr) {
+    target_ = FileSystem::Default();
+  }
+  return FileSystem::PrepareOptions(options);
+}
+
+#ifndef ROCKSDB_LITE
+std::string FileSystemWrapper::SerializeOptions(
+    const ConfigOptions& config_options, const std::string& header) const {
+  auto parent = FileSystem::SerializeOptions(config_options, "");
+  if (config_options.IsShallow() || target_ == nullptr ||
+      target_->IsInstanceOf(FileSystem::kDefaultName())) {
+    return parent;
+  } else {
+    std::string result = header;
+    if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) {
+      result.append(OptionTypeInfo::kIdPropName()).append("=");
+    }
+    result.append(parent);
+    if (!EndsWith(result, config_options.delimiter)) {
+      result.append(config_options.delimiter);
+    }
+    result.append("target=").append(target_->ToString(config_options));
+    return result;
+  }
+}
+#endif  // ROCKSDB_LITE
+
+DirFsyncOptions::DirFsyncOptions() { reason = kDefault; }
+
+DirFsyncOptions::DirFsyncOptions(std::string file_renamed_new_name) {
+  reason = kFileRenamed;
+  renamed_new_name = file_renamed_new_name;
+}
+
+DirFsyncOptions::DirFsyncOptions(FsyncReason fsync_reason) {
+  assert(fsync_reason != kFileRenamed);
+  reason = fsync_reason;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/file_system_tracer.cc b/src/rocksdb/env/file_system_tracer.cc
new file mode 100644
index 000000000..d0c45c57e
--- /dev/null
+++ b/src/rocksdb/env/file_system_tracer.cc
@@ -0,0 +1,564 @@
+// Copyright (c) 2019-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "env/file_system_tracer.h"
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus FileSystemTracingWrapper::NewSequentialFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->NewSequentialFile(fname, file_opts, result, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          fname.substr(fname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          fname.substr(fname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::NewWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          fname.substr(fname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::ReopenWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->ReopenWritableFile(fname, file_opts, result, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          fname.substr(fname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::ReuseWritableFile(
+    const std::string& fname, const std::string& old_fname,
+    const FileOptions& file_opts, std::unique_ptr<FSWritableFile>* result,
+    IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s =
+      target()->ReuseWritableFile(fname, old_fname, file_opts, result, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          fname.substr(fname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::NewRandomRWFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->NewRandomRWFile(fname, file_opts, result, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          fname.substr(fname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::NewDirectory(
+    const std::string& name, const IOOptions& io_opts,
+    std::unique_ptr<FSDirectory>* result, IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->NewDirectory(name, io_opts, result, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          name.substr(name.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::GetChildren(const std::string& dir,
+                                               const IOOptions& io_opts,
+                                               std::vector<std::string>* r,
+                                               IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->GetChildren(dir, io_opts, r, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          dir.substr(dir.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::DeleteFile(const std::string& fname,
+                                              const IOOptions& options,
+                                              IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->DeleteFile(fname, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          fname.substr(fname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::CreateDir(const std::string& dirname,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->CreateDir(dirname, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          dirname.substr(dirname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::CreateDirIfMissing(
+    const std::string& dirname, const IOOptions& options, IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->CreateDirIfMissing(dirname, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          dirname.substr(dirname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::DeleteDir(const std::string& dirname,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->DeleteDir(dirname, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          dirname.substr(dirname.find_last_of("/\\") + 1));
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::GetFileSize(const std::string& fname,
+                                               const IOOptions& options,
+                                               uint64_t* file_size,
+                                               IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->GetFileSize(fname, options, file_size, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOFileSize);
+  IOTraceRecord io_record(
+      clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, elapsed,
+      s.ToString(), fname.substr(fname.find_last_of("/\\") + 1), *file_size);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FileSystemTracingWrapper::Truncate(const std::string& fname,
+                                            size_t size,
+                                            const IOOptions& options,
+                                            IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Truncate(fname, size, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOFileSize);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(),
+                          fname.substr(fname.find_last_of("/\\") + 1), size);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSSequentialFileTracingWrapper::Read(size_t n,
+                                              const IOOptions& options,
+                                              Slice* result, char* scratch,
+                                              IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Read(n, options, result, scratch, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_,
+                          result->size(), 0 /*Offset*/);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSSequentialFileTracingWrapper::InvalidateCache(size_t offset,
+                                                         size_t length) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->InvalidateCache(offset, length);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, length,
+                          offset);
+  io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+  return s;
+}
+
+IOStatus FSSequentialFileTracingWrapper::PositionedRead(
+    uint64_t offset, size_t n, const IOOptions& options, Slice* result,
+    char* scratch, IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s =
+      target()->PositionedRead(offset, n, options, result, scratch, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_,
+                          result->size(), offset);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::Read(uint64_t offset, size_t n,
+                                                const IOOptions& options,
+                                                Slice* result, char* scratch,
+                                                IODebugContext* dbg) const {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Read(offset, n, options, result, scratch, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, n,
+                          offset);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs,
+                                                     size_t num_reqs,
+                                                     const IOOptions& options,
+                                                     IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->MultiRead(reqs, num_reqs, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t latency = elapsed;
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  for (size_t i = 0; i < num_reqs; i++) {
+    IOTraceRecord io_record(
+        clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, latency,
+        reqs[i].status.ToString(), file_name_, reqs[i].len, reqs[i].offset);
+    io_tracer_->WriteIOOp(io_record, dbg);
+  }
+  return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::Prefetch(uint64_t offset, size_t n,
+                                                    const IOOptions& options,
+                                                    IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Prefetch(offset, n, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, n,
+                          offset);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset,
+                                                           size_t length) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->InvalidateCache(offset, length);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, length,
+                          static_cast<uint64_t>(offset));
+  io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+  return s;
+}
+
+IOStatus FSRandomAccessFileTracingWrapper::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+  // Create a callback and populate info.
+  auto read_async_callback =
+      std::bind(&FSRandomAccessFileTracingWrapper::ReadAsyncCallback, this,
+                std::placeholders::_1, std::placeholders::_2);
+  ReadAsyncCallbackInfo* read_async_cb_info = new ReadAsyncCallbackInfo;
+  read_async_cb_info->cb_ = cb;
+  read_async_cb_info->cb_arg_ = cb_arg;
+  read_async_cb_info->start_time_ = clock_->NowNanos();
+  read_async_cb_info->file_op_ = __func__;
+
+  IOStatus s = target()->ReadAsync(req, opts, read_async_callback,
+                                   read_async_cb_info, io_handle, del_fn, dbg);
+
+  if (!s.ok()) {
+    delete read_async_cb_info;
+  }
+  return s;
+}
+
+void FSRandomAccessFileTracingWrapper::ReadAsyncCallback(
+    const FSReadRequest& req, void* cb_arg) {
+  ReadAsyncCallbackInfo* read_async_cb_info =
+      static_cast<ReadAsyncCallbackInfo*>(cb_arg);
+  assert(read_async_cb_info);
+  assert(read_async_cb_info->cb_);
+
+  uint64_t elapsed = clock_->NowNanos() - read_async_cb_info->start_time_;
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          read_async_cb_info->file_op_, elapsed,
+                          req.status.ToString(), file_name_, req.result.size(),
+                          req.offset);
+  io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+
+  // call the underlying callback.
+  read_async_cb_info->cb_(req, read_async_cb_info->cb_arg_);
+  delete read_async_cb_info;
+}
+
+IOStatus FSWritableFileTracingWrapper::Append(const Slice& data,
+                                              const IOOptions& options,
+                                              IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Append(data, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_,
+                          data.size(), 0 /*Offset*/);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSWritableFileTracingWrapper::PositionedAppend(
+    const Slice& data, uint64_t offset, const IOOptions& options,
+    IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->PositionedAppend(data, offset, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_,
+                          data.size(), offset);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSWritableFileTracingWrapper::Truncate(uint64_t size,
+                                                const IOOptions& options,
+                                                IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Truncate(size, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, size,
+                          0 /*Offset*/);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSWritableFileTracingWrapper::Close(const IOOptions& options,
+                                             IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Close(options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          file_name_);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+uint64_t FSWritableFileTracingWrapper::GetFileSize(const IOOptions& options,
+                                                   IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  uint64_t file_size = target()->GetFileSize(options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOFileSize);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, "OK", file_name_, file_size);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return file_size;
+}
+
+IOStatus FSWritableFileTracingWrapper::InvalidateCache(size_t offset,
+                                                       size_t length) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->InvalidateCache(offset, length);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, length,
+                          static_cast<uint64_t>(offset));
+  io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
+  return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Write(uint64_t offset, const Slice& data,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Write(offset, data, options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_,
+                          data.size(), offset);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Read(uint64_t offset, size_t n,
+                                            const IOOptions& options,
+                                            Slice* result, char* scratch,
+                                            IODebugContext* dbg) const {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Read(offset, n, options, result, scratch, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  uint64_t io_op_data = 0;
+  io_op_data |= (1 << IOTraceOp::kIOLen);
+  io_op_data |= (1 << IOTraceOp::kIOOffset);
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
+                          __func__, elapsed, s.ToString(), file_name_, n,
+                          offset);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Flush(const IOOptions& options,
+                                             IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Flush(options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          file_name_);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Close(const IOOptions& options,
+                                             IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Close(options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          file_name_);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Sync(const IOOptions& options,
+                                            IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Sync(options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          file_name_);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+
+IOStatus FSRandomRWFileTracingWrapper::Fsync(const IOOptions& options,
+                                             IODebugContext* dbg) {
+  StopWatchNano timer(clock_);
+  timer.Start();
+  IOStatus s = target()->Fsync(options, dbg);
+  uint64_t elapsed = timer.ElapsedNanos();
+  IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer,
+                          0 /*io_op_data*/, __func__, elapsed, s.ToString(),
+                          file_name_);
+  io_tracer_->WriteIOOp(io_record, dbg);
+  return s;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/file_system_tracer.h b/src/rocksdb/env/file_system_tracer.h
new file mode 100644
index 000000000..979a0bf12
--- /dev/null
+++ b/src/rocksdb/env/file_system_tracer.h
@@ -0,0 +1,461 @@
+// Copyright (c) 2019-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "trace_replay/io_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FileSystemTracingWrapper is a wrapper class above FileSystem that forwards
+// the call to the underlying storage system. It then invokes IOTracer to record
+// file operations and other contextual information in a binary format for
+// tracing. It overrides methods we are interested in tracing and extends
+// FileSystemWrapper, which forwards all methods that are not explicitly
+// overridden.
+class FileSystemTracingWrapper : public FileSystemWrapper {
+ public:
+  FileSystemTracingWrapper(const std::shared_ptr<FileSystem>& t,
+                           const std::shared_ptr<IOTracer>& io_tracer)
+      : FileSystemWrapper(t),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()) {}
+
+  ~FileSystemTracingWrapper() override {}
+
+  static const char* kClassName() { return "FileSystemTracing"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override;
+
+  IOStatus NewWritableFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override;
+
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+
+  IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+                       std::vector<std::string>* r,
+                       IODebugContext* dbg) override;
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus CreateDirIfMissing(const std::string& dirname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override;
+
+  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override;
+
+  IOStatus Truncate(const std::string& fname, size_t size,
+                    const IOOptions& options, IODebugContext* dbg) override;
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+};
+
+// The FileSystemPtr is a wrapper class that takes pointer to storage systems
+// (such as posix filesystems). It overloads operator -> and returns a pointer
+// of either FileSystem or FileSystemTracingWrapper based on whether tracing is
+// enabled or  not. It is added to bypass FileSystemTracingWrapper when tracing
+// is disabled.
+class FileSystemPtr {
+ public:
+  FileSystemPtr(std::shared_ptr<FileSystem> fs,
+                const std::shared_ptr<IOTracer>& io_tracer)
+      : fs_(fs), io_tracer_(io_tracer) {
+    fs_tracer_ = std::make_shared<FileSystemTracingWrapper>(fs_, io_tracer_);
+  }
+
+  std::shared_ptr<FileSystem> operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return fs_tracer_;
+    } else {
+      return fs_;
+    }
+  }
+
+  /* Returns the underlying File System pointer */
+  FileSystem* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return fs_tracer_.get();
+    } else {
+      return fs_.get();
+    }
+  }
+
+ private:
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<IOTracer> io_tracer_;
+  std::shared_ptr<FileSystemTracingWrapper> fs_tracer_;
+};
+
+// FSSequentialFileTracingWrapper is a wrapper class above FSSequentialFile that
+// forwards the call to the underlying storage system. It then invokes IOTracer
+// to record file operations and other contextual information in a binary format
+// for tracing. It overrides methods we are interested in tracing and extends
+// FSSequentialFileWrapper, which forwards all methods that are not explicitly
+// overridden.
+class FSSequentialFileTracingWrapper : public FSSequentialFileOwnerWrapper {
+ public:
+  FSSequentialFileTracingWrapper(std::unique_ptr<FSSequentialFile>&& t,
+                                 std::shared_ptr<IOTracer> io_tracer,
+                                 const std::string& file_name)
+      : FSSequentialFileOwnerWrapper(std::move(t)),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()),
+        file_name_(file_name) {}
+
+  ~FSSequentialFileTracingWrapper() override {}
+
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override;
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override;
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+  std::string file_name_;
+};
+
+// The FSSequentialFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSSequentialFile or FSSequentialFileTracingWrapper based on
+// whether tracing is enabled or not. It is added to bypass
+// FSSequentialFileTracingWrapper when tracing is disabled.
+class FSSequentialFilePtr {
+ public:
+  FSSequentialFilePtr() = delete;
+  FSSequentialFilePtr(std::unique_ptr<FSSequentialFile>&& fs,
+                      const std::shared_ptr<IOTracer>& io_tracer,
+                      const std::string& file_name)
+      : io_tracer_(io_tracer),
+        fs_tracer_(std::move(fs), io_tracer_,
+                   file_name.substr(file_name.find_last_of("/\\") +
+                                    1) /* pass file name */) {}
+
+  FSSequentialFile* operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSSequentialFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+  FSSequentialFile* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSSequentialFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  FSSequentialFileTracingWrapper fs_tracer_;
+};
+
+// FSRandomAccessFileTracingWrapper is a wrapper class above FSRandomAccessFile
+// that forwards the call to the underlying storage system. It then invokes
+// IOTracer to record file operations and other contextual information in a
+// binary format for tracing. It overrides methods we are interested in tracing
+// and extends FSRandomAccessFileWrapper, which forwards all methods that are
+// not explicitly overridden.
+class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileOwnerWrapper {
+ public:
+  FSRandomAccessFileTracingWrapper(std::unique_ptr<FSRandomAccessFile>&& t,
+                                   std::shared_ptr<IOTracer> io_tracer,
+                                   const std::string& file_name)
+      : FSRandomAccessFileOwnerWrapper(std::move(t)),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()),
+        file_name_(file_name) {}
+
+  ~FSRandomAccessFileTracingWrapper() override {}
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override;
+
+  void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg);
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+  // Stores file name instead of full path.
+  std::string file_name_;
+
+  struct ReadAsyncCallbackInfo {
+    uint64_t start_time_;
+    std::function<void(const FSReadRequest&, void*)> cb_;
+    void* cb_arg_;
+    std::string file_op_;
+  };
+};
+
+// The FSRandomAccessFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSRandomAccessFile or FSRandomAccessFileTracingWrapper
+// based on whether tracing is enabled or not. It is added to bypass
+// FSRandomAccessFileTracingWrapper when tracing is disabled.
+class FSRandomAccessFilePtr {
+ public:
+  FSRandomAccessFilePtr(std::unique_ptr<FSRandomAccessFile>&& fs,
+                        const std::shared_ptr<IOTracer>& io_tracer,
+                        const std::string& file_name)
+      : io_tracer_(io_tracer),
+        fs_tracer_(std::move(fs), io_tracer_,
+                   file_name.substr(file_name.find_last_of("/\\") +
+                                    1) /* pass file name */) {}
+
+  FSRandomAccessFile* operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSRandomAccessFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+  FSRandomAccessFile* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSRandomAccessFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  FSRandomAccessFileTracingWrapper fs_tracer_;
+};
+
+// FSWritableFileTracingWrapper is a wrapper class above FSWritableFile that
+// forwards the call to the underlying storage system. It then invokes IOTracer
+// to record file operations and other contextual information in a binary format
+// for tracing. It overrides methods we are interested in tracing and extends
+// FSWritableFileWrapper, which forwards all methods that are not explicitly
+// overridden.
+class FSWritableFileTracingWrapper : public FSWritableFileOwnerWrapper {
+ public:
+  FSWritableFileTracingWrapper(std::unique_ptr<FSWritableFile>&& t,
+                               std::shared_ptr<IOTracer> io_tracer,
+                               const std::string& file_name)
+      : FSWritableFileOwnerWrapper(std::move(t)),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()),
+        file_name_(file_name) {}
+
+  ~FSWritableFileTracingWrapper() override {}
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override;
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& /*verification_info*/,
+                  IODebugContext* dbg) override {
+    return Append(data, options, dbg);
+  }
+
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& /*verification_info*/,
+                            IODebugContext* dbg) override {
+    return PositionedAppend(data, offset, options, dbg);
+  }
+
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+  // Stores file name instead of full path.
+  std::string file_name_;
+};
+
+// The FSWritableFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSWritableFile or FSWritableFileTracingWrapper based on
+// whether tracing is enabled or not. It is added to bypass
+// FSWritableFileTracingWrapper when tracing is disabled.
+class FSWritableFilePtr {
+ public:
+  FSWritableFilePtr(std::unique_ptr<FSWritableFile>&& fs,
+                    const std::shared_ptr<IOTracer>& io_tracer,
+                    const std::string& file_name)
+      : io_tracer_(io_tracer) {
+    fs_tracer_.reset(new FSWritableFileTracingWrapper(
+        std::move(fs), io_tracer_,
+        file_name.substr(file_name.find_last_of("/\\") +
+                         1) /* pass file name */));
+  }
+
+  FSWritableFile* operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return fs_tracer_.get();
+    } else {
+      return fs_tracer_->target();
+    }
+  }
+
+  FSWritableFile* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return fs_tracer_.get();
+    } else if (fs_tracer_) {
+      return fs_tracer_->target();
+    } else {
+      return nullptr;
+    }
+  }
+
+  void reset() {
+    fs_tracer_.reset();
+    io_tracer_ = nullptr;
+  }
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  std::unique_ptr<FSWritableFileTracingWrapper> fs_tracer_;
+};
+
+// FSRandomRWFileTracingWrapper is a wrapper class above FSRandomRWFile that
+// forwards the call to the underlying storage system. It then invokes IOTracer
+// to record file operations and other contextual information in a binary format
+// for tracing. It overrides methods we are interested in tracing and extends
+// FSRandomRWFileWrapper, which forwards all methods that are not explicitly
+// overridden.
+class FSRandomRWFileTracingWrapper : public FSRandomRWFileOwnerWrapper {
+ public:
+  FSRandomRWFileTracingWrapper(std::unique_ptr<FSRandomRWFile>&& t,
+                               std::shared_ptr<IOTracer> io_tracer,
+                               const std::string& file_name)
+      : FSRandomRWFileOwnerWrapper(std::move(t)),
+        io_tracer_(io_tracer),
+        clock_(SystemClock::Default().get()),
+        file_name_(file_name) {}
+
+  ~FSRandomRWFileTracingWrapper() override {}
+
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override;
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  SystemClock* clock_;
+  // Stores file name instead of full path.
+  std::string file_name_;
+};
+
+// The FSRandomRWFilePtr is a wrapper class that takes pointer to storage
+// systems (such as posix filesystems). It overloads operator -> and returns a
+// pointer of either FSRandomRWFile or FSRandomRWFileTracingWrapper based on
+// whether tracing is enabled or not. It is added to bypass
+// FSRandomRWFileTracingWrapper when tracing is disabled.
+class FSRandomRWFilePtr {
+ public:
+  FSRandomRWFilePtr(std::unique_ptr<FSRandomRWFile>&& fs,
+                    std::shared_ptr<IOTracer> io_tracer,
+                    const std::string& file_name)
+      : io_tracer_(io_tracer),
+        fs_tracer_(std::move(fs), io_tracer_,
+                   file_name.substr(file_name.find_last_of("/\\") +
+                                    1) /* pass file name */) {}
+
+  FSRandomRWFile* operator->() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSRandomRWFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+  FSRandomRWFile* get() const {
+    if (io_tracer_ && io_tracer_->is_tracing_enabled()) {
+      return const_cast<FSRandomRWFileTracingWrapper*>(&fs_tracer_);
+    } else {
+      return fs_tracer_.target();
+    }
+  }
+
+ private:
+  std::shared_ptr<IOTracer> io_tracer_;
+  FSRandomRWFileTracingWrapper fs_tracer_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/fs_posix.cc b/src/rocksdb/env/fs_posix.cc
new file mode 100644
index 000000000..e179a421d
--- /dev/null
+++ b/src/rocksdb/env/fs_posix.cc
@@ -0,0 +1,1294 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors
+
+#if !defined(OS_WIN)
+
+#include <dirent.h>
+#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
+#include <dlfcn.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
+#include <sys/statfs.h>
+#include <sys/sysmacros.h>
+#endif
+#include <sys/statvfs.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+
+#include <algorithm>
+// Get nano time includes
+#if defined(OS_LINUX) || defined(OS_FREEBSD)
+#elif defined(__MACH__)
+#include <Availability.h>
+#include <mach/clock.h>
+#include <mach/mach.h>
+#else
+#include <chrono>
+#endif
+#include <deque>
+#include <set>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "env/io_posix.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/compression_context_cache.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/thread_local.h"
+#include "util/threadpool_imp.h"
+
+#if !defined(TMPFS_MAGIC)
+#define TMPFS_MAGIC 0x01021994
+#endif
+#if !defined(XFS_SUPER_MAGIC)
+#define XFS_SUPER_MAGIC 0x58465342
+#endif
+#if !defined(EXT4_SUPER_MAGIC)
+#define EXT4_SUPER_MAGIC 0xEF53
+#endif
+
+extern "C" bool RocksDbIOUringEnable() __attribute__((__weak__));
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+inline mode_t GetDBFileMode(bool allow_non_owner_access) {
+  return allow_non_owner_access ? 0644 : 0600;
+}
+
+// list of pathnames that are locked
+// Only used for error message.
+struct LockHoldingInfo {
+  int64_t acquire_time;
+  uint64_t acquiring_thread;
+};
+static std::map<std::string, LockHoldingInfo> locked_files;
+static port::Mutex mutex_locked_files;
+
+static int LockOrUnlock(int fd, bool lock) {
+  errno = 0;
+  struct flock f;
+  memset(&f, 0, sizeof(f));
+  f.l_type = (lock ? F_WRLCK : F_UNLCK);
+  f.l_whence = SEEK_SET;
+  f.l_start = 0;
+  f.l_len = 0;  // Lock/unlock entire file
+  int value = fcntl(fd, F_SETLK, &f);
+
+  return value;
+}
+
+class PosixFileLock : public FileLock {
+ public:
+  int fd_ = /*invalid*/ -1;
+  std::string filename;
+
+  void Clear() {
+    fd_ = -1;
+    filename.clear();
+  }
+
+  virtual ~PosixFileLock() override {
+    // Check for destruction without UnlockFile
+    assert(fd_ == -1);
+  }
+};
+
+int cloexec_flags(int flags, const EnvOptions* options) {
+  // If the system supports opening the file with cloexec enabled,
+  // do so, as this avoids a race condition if a db is opened around
+  // the same time that a child process is forked
+#ifdef O_CLOEXEC
+  if (options == nullptr || options->set_fd_cloexec) {
+    flags |= O_CLOEXEC;
+  }
+#else
+  (void)options;
+#endif
+  return flags;
+}
+
+class PosixFileSystem : public FileSystem {
+ public:
+  PosixFileSystem();
+
+  static const char* kClassName() { return "PosixFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kDefaultName(); }
+
+  ~PosixFileSystem() override {}
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == "posix") {
+      return true;
+    } else {
+      return FileSystem::IsInstanceOf(name);
+    }
+  }
+
+  void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
+    if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
+      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
+    }
+  }
+
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* /*dbg*/) override {
+    result->reset();
+    int fd = -1;
+    int flags = cloexec_flags(O_RDONLY, &options);
+    FILE* file = nullptr;
+
+    if (options.use_direct_reads && !options.use_mmap_reads) {
+#ifdef ROCKSDB_LITE
+      return IOStatus::IOError(fname,
+                               "Direct I/O not supported in RocksDB lite");
+#endif  // !ROCKSDB_LITE
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
+      flags |= O_DIRECT;
+      TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
+#endif
+    }
+
+    do {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
+    } while (fd < 0 && errno == EINTR);
+    if (fd < 0) {
+      return IOError("While opening a file for sequentially reading", fname,
+                     errno);
+    }
+
+    SetFD_CLOEXEC(fd, &options);
+
+    if (options.use_direct_reads && !options.use_mmap_reads) {
+#ifdef OS_MACOSX
+      if (fcntl(fd, F_NOCACHE, 1) == -1) {
+        close(fd);
+        return IOError("While fcntl NoCache", fname, errno);
+      }
+#endif
+    } else {
+      do {
+        IOSTATS_TIMER_GUARD(open_nanos);
+        file = fdopen(fd, "r");
+      } while (file == nullptr && errno == EINTR);
+      if (file == nullptr) {
+        close(fd);
+        return IOError("While opening file for sequentially read", fname,
+                       errno);
+      }
+    }
+    result->reset(new PosixSequentialFile(
+        fname, file, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
+        options));
+    return IOStatus::OK();
+  }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& options,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* /*dbg*/) override {
+    result->reset();
+    IOStatus s = IOStatus::OK();
+    int fd;
+    int flags = cloexec_flags(O_RDONLY, &options);
+
+    if (options.use_direct_reads && !options.use_mmap_reads) {
+#ifdef ROCKSDB_LITE
+      return IOStatus::IOError(fname,
+                               "Direct I/O not supported in RocksDB lite");
+#endif  // !ROCKSDB_LITE
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
+      flags |= O_DIRECT;
+      TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags);
+#endif
+    }
+
+    do {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
+    } while (fd < 0 && errno == EINTR);
+    if (fd < 0) {
+      s = IOError("While open a file for random read", fname, errno);
+      return s;
+    }
+    SetFD_CLOEXEC(fd, &options);
+
+    if (options.use_mmap_reads) {
+      // Use of mmap for random reads has been removed because it
+      // kills performance when storage is fast.
+      // Use mmap when virtual address-space is plentiful.
+      uint64_t size;
+      IOOptions opts;
+      s = GetFileSize(fname, opts, &size, nullptr);
+      if (s.ok()) {
+        void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
+        if (base != MAP_FAILED) {
+          result->reset(
+              new PosixMmapReadableFile(fd, fname, base, size, options));
+        } else {
+          s = IOError("while mmap file for read", fname, errno);
+          close(fd);
+        }
+      } else {
+        close(fd);
+      }
+    } else {
+      if (options.use_direct_reads && !options.use_mmap_reads) {
+#ifdef OS_MACOSX
+        if (fcntl(fd, F_NOCACHE, 1) == -1) {
+          close(fd);
+          return IOError("while fcntl NoCache", fname, errno);
+        }
+#endif
+      }
+      result->reset(new PosixRandomAccessFile(
+          fname, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd),
+          options
+#if defined(ROCKSDB_IOURING_PRESENT)
+          ,
+          !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get()
+#endif
+              ));
+    }
+    return s;
+  }
+
+  virtual IOStatus OpenWritableFile(const std::string& fname,
+                                    const FileOptions& options, bool reopen,
+                                    std::unique_ptr<FSWritableFile>* result,
+                                    IODebugContext* /*dbg*/) {
+    result->reset();
+    IOStatus s;
+    int fd = -1;
+    int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC);
+    // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
+    if (options.use_direct_writes && !options.use_mmap_writes) {
+      // Note: we should avoid O_APPEND here due to ta the following bug:
+      // POSIX requires that opening a file with the O_APPEND flag should
+      // have no affect on the location at which pwrite() writes data.
+      // However, on Linux, if a file is opened with O_APPEND, pwrite()
+      // appends data to the end of the file, regardless of the value of
+      // offset.
+      // More info here: https://linux.die.net/man/2/pwrite
+#ifdef ROCKSDB_LITE
+      return IOStatus::IOError(fname,
+                               "Direct I/O not supported in RocksDB lite");
+#endif  // ROCKSDB_LITE
+      flags |= O_WRONLY;
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
+      flags |= O_DIRECT;
+#endif
+      TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
+    } else if (options.use_mmap_writes) {
+      // non-direct I/O
+      flags |= O_RDWR;
+    } else {
+      flags |= O_WRONLY;
+    }
+
+    flags = cloexec_flags(flags, &options);
+
+    do {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
+    } while (fd < 0 && errno == EINTR);
+
+    if (fd < 0) {
+      s = IOError("While open a file for appending", fname, errno);
+      return s;
+    }
+    SetFD_CLOEXEC(fd, &options);
+
+    if (options.use_mmap_writes) {
+      MaybeForceDisableMmap(fd);
+    }
+    if (options.use_mmap_writes && !forceMmapOff_) {
+      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+    } else if (options.use_direct_writes && !options.use_mmap_writes) {
+#ifdef OS_MACOSX
+      if (fcntl(fd, F_NOCACHE, 1) == -1) {
+        close(fd);
+        s = IOError("While fcntl NoCache an opened file for appending", fname,
+                    errno);
+        return s;
+      }
+#elif defined(OS_SOLARIS)
+      if (directio(fd, DIRECTIO_ON) == -1) {
+        if (errno != ENOTTY) {  // ZFS filesystems don't support DIRECTIO_ON
+          close(fd);
+          s = IOError("While calling directio()", fname, errno);
+          return s;
+        }
+      }
+#endif
+      result->reset(new PosixWritableFile(
+          fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
+          options));
+    } else {
+      // disable mmap writes
+      EnvOptions no_mmap_writes_options = options;
+      no_mmap_writes_options.use_mmap_writes = false;
+      result->reset(
+          new PosixWritableFile(fname, fd,
+                                GetLogicalBlockSizeForWriteIfNeeded(
+                                    no_mmap_writes_options, fname, fd),
+                                no_mmap_writes_options));
+    }
+    return s;
+  }
+
+  IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override {
+    return OpenWritableFile(fname, options, false, result, dbg);
+  }
+
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override {
+    return OpenWritableFile(fname, options, true, result, dbg);
+  }
+
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* /*dbg*/) override {
+    result->reset();
+    IOStatus s;
+    int fd = -1;
+
+    int flags = 0;
+    // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
+    if (options.use_direct_writes && !options.use_mmap_writes) {
+#ifdef ROCKSDB_LITE
+      return IOStatus::IOError(fname,
+                               "Direct I/O not supported in RocksDB lite");
+#endif  // !ROCKSDB_LITE
+      flags |= O_WRONLY;
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
+      flags |= O_DIRECT;
+#endif
+      TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
+    } else if (options.use_mmap_writes) {
+      // mmap needs O_RDWR mode
+      flags |= O_RDWR;
+    } else {
+      flags |= O_WRONLY;
+    }
+
+    flags = cloexec_flags(flags, &options);
+
+    do {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(old_fname.c_str(), flags,
+                GetDBFileMode(allow_non_owner_access_));
+    } while (fd < 0 && errno == EINTR);
+    if (fd < 0) {
+      s = IOError("while reopen file for write", fname, errno);
+      return s;
+    }
+
+    SetFD_CLOEXEC(fd, &options);
+    // rename into place
+    if (rename(old_fname.c_str(), fname.c_str()) != 0) {
+      s = IOError("while rename file to " + fname, old_fname, errno);
+      close(fd);
+      return s;
+    }
+
+    if (options.use_mmap_writes) {
+      MaybeForceDisableMmap(fd);
+    }
+    if (options.use_mmap_writes && !forceMmapOff_) {
+      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+    } else if (options.use_direct_writes && !options.use_mmap_writes) {
+#ifdef OS_MACOSX
+      if (fcntl(fd, F_NOCACHE, 1) == -1) {
+        close(fd);
+        s = IOError("while fcntl NoCache for reopened file for append", fname,
+                    errno);
+        return s;
+      }
+#elif defined(OS_SOLARIS)
+      if (directio(fd, DIRECTIO_ON) == -1) {
+        if (errno != ENOTTY) {  // ZFS filesystems don't support DIRECTIO_ON
+          close(fd);
+          s = IOError("while calling directio()", fname, errno);
+          return s;
+        }
+      }
+#endif
+      result->reset(new PosixWritableFile(
+          fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
+          options));
+    } else {
+      // disable mmap writes
+      FileOptions no_mmap_writes_options = options;
+      no_mmap_writes_options.use_mmap_writes = false;
+      result->reset(
+          new PosixWritableFile(fname, fd,
+                                GetLogicalBlockSizeForWriteIfNeeded(
+                                    no_mmap_writes_options, fname, fd),
+                                no_mmap_writes_options));
+    }
+    return s;
+  }
+
+  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* /*dbg*/) override {
+    int fd = -1;
+    int flags = cloexec_flags(O_RDWR, &options);
+
+    while (fd < 0) {
+      IOSTATS_TIMER_GUARD(open_nanos);
+
+      fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
+      if (fd < 0) {
+        // Error while opening the file
+        if (errno == EINTR) {
+          continue;
+        }
+        return IOError("While open file for random read/write", fname, errno);
+      }
+    }
+
+    SetFD_CLOEXEC(fd, &options);
+    result->reset(new PosixRandomRWFile(fname, fd, options));
+    return IOStatus::OK();
+  }
+
+  IOStatus NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    int fd = -1;
+    IOStatus status;
+    int flags = cloexec_flags(O_RDWR, nullptr);
+
+    while (fd < 0) {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, 0644);
+      if (fd < 0) {
+        // Error while opening the file
+        if (errno == EINTR) {
+          continue;
+        }
+        status =
+            IOError("While open file for raw mmap buffer access", fname, errno);
+        break;
+      }
+    }
+    uint64_t size;
+    if (status.ok()) {
+      IOOptions opts;
+      status = GetFileSize(fname, opts, &size, nullptr);
+    }
+    void* base = nullptr;
+    if (status.ok()) {
+      base = mmap(nullptr, static_cast<size_t>(size), PROT_READ | PROT_WRITE,
+                  MAP_SHARED, fd, 0);
+      if (base == MAP_FAILED) {
+        status = IOError("while mmap file for read", fname, errno);
+      }
+    }
+    if (status.ok()) {
+      result->reset(
+          new PosixMemoryMappedFileBuffer(base, static_cast<size_t>(size)));
+    }
+    if (fd >= 0) {
+      // don't need to keep it open after mmap has been called
+      close(fd);
+    }
+    return status;
+  }
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& /*opts*/,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* /*dbg*/) override {
+    result->reset();
+    int fd;
+    int flags = cloexec_flags(0, nullptr);
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(name.c_str(), flags);
+    }
+    if (fd < 0) {
+      return IOError("While open directory", name, errno);
+    } else {
+      result->reset(new PosixDirectory(fd, name));
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/,
+                      IODebugContext* /*dbg*/) override {
+    int result = access(fname.c_str(), F_OK);
+
+    if (result == 0) {
+      return IOStatus::OK();
+    }
+
+    int err = errno;
+    switch (err) {
+      case EACCES:
+      case ELOOP:
+      case ENAMETOOLONG:
+      case ENOENT:
+      case ENOTDIR:
+        return IOStatus::NotFound();
+      default:
+        assert(err == EIO || err == ENOMEM);
+        return IOStatus::IOError("Unexpected error(" + std::to_string(err) +
+                                 ") accessing file `" + fname + "' ");
+    }
+  }
+
+  IOStatus GetChildren(const std::string& dir, const IOOptions& opts,
+                       std::vector<std::string>* result,
+                       IODebugContext* /*dbg*/) override {
+    result->clear();
+
+    DIR* d = opendir(dir.c_str());
+    if (d == nullptr) {
+      switch (errno) {
+        case EACCES:
+        case ENOENT:
+        case ENOTDIR:
+          return IOStatus::NotFound();
+        default:
+          return IOError("While opendir", dir, errno);
+      }
+    }
+
+    // reset errno before calling readdir()
+    errno = 0;
+    struct dirent* entry;
+
+    while ((entry = readdir(d)) != nullptr) {
+      // filter out '.' and '..' directory entries
+      // which appear only on some platforms
+      const bool ignore =
+          entry->d_type == DT_DIR &&
+          (strcmp(entry->d_name, ".") == 0 ||
+           strcmp(entry->d_name, "..") == 0
+#ifndef ASSERT_STATUS_CHECKED
+           // In case of ASSERT_STATUS_CHECKED, GetChildren support older
+           // version of API for debugging purpose.
+           || opts.do_not_recurse
+#endif
+          );
+      if (!ignore) {
+        result->push_back(entry->d_name);
+      }
+      errno = 0;  // reset errno if readdir() success
+    }
+
+    // always attempt to close the dir
+    const auto pre_close_errno = errno;  // errno may be modified by closedir
+    const int close_result = closedir(d);
+
+    if (pre_close_errno != 0) {
+      // error occurred during readdir
+      return IOError("While readdir", dir, pre_close_errno);
+    }
+
+    if (close_result != 0) {
+      // error occurred during closedir
+      return IOError("While closedir", dir, errno);
+    }
+
+    return IOStatus::OK();
+  }
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& /*opts*/,
+                      IODebugContext* /*dbg*/) override {
+    IOStatus result;
+    if (unlink(fname.c_str()) != 0) {
+      result = IOError("while unlink() file", fname, errno);
+    }
+    return result;
+  }
+
+  IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/,
+                     IODebugContext* /*dbg*/) override {
+    if (mkdir(name.c_str(), 0755) != 0) {
+      return IOError("While mkdir", name, errno);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus CreateDirIfMissing(const std::string& name,
+                              const IOOptions& /*opts*/,
+                              IODebugContext* /*dbg*/) override {
+    if (mkdir(name.c_str(), 0755) != 0) {
+      if (errno != EEXIST) {
+        return IOError("While mkdir if missing", name, errno);
+      } else if (!DirExists(name)) {  // Check that name is actually a
+                                      // directory.
+        // Message is taken from mkdir
+        return IOStatus::IOError("`" + name +
+                                 "' exists but is not a directory");
+      }
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/,
+                     IODebugContext* /*dbg*/) override {
+    if (rmdir(name.c_str()) != 0) {
+      return IOError("file rmdir", name, errno);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
+                       uint64_t* size, IODebugContext* /*dbg*/) override {
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      *size = 0;
+      return IOError("while stat a file for size", fname, errno);
+    } else {
+      *size = sbuf.st_size;
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& /*opts*/,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* /*dbg*/) override {
+    struct stat s;
+    if (stat(fname.c_str(), &s) != 0) {
+      return IOError("while stat a file for modification time", fname, errno);
+    }
+    *file_mtime = static_cast<uint64_t>(s.st_mtime);
+    return IOStatus::OK();
+  }
+
+  IOStatus RenameFile(const std::string& src, const std::string& target,
+                      const IOOptions& /*opts*/,
+                      IODebugContext* /*dbg*/) override {
+    if (rename(src.c_str(), target.c_str()) != 0) {
+      return IOError("While renaming a file to " + target, src, errno);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus LinkFile(const std::string& src, const std::string& target,
+                    const IOOptions& /*opts*/,
+                    IODebugContext* /*dbg*/) override {
+    if (link(src.c_str(), target.c_str()) != 0) {
+      if (errno == EXDEV || errno == ENOTSUP) {
+        return IOStatus::NotSupported(errno == EXDEV
+                                          ? "No cross FS links allowed"
+                                          : "Links not supported by FS");
+      }
+      return IOError("while link file to " + target, src, errno);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/,
+                        uint64_t* count, IODebugContext* /*dbg*/) override {
+    struct stat s;
+    if (stat(fname.c_str(), &s) != 0) {
+      return IOError("while stat a file for num file links", fname, errno);
+    }
+    *count = static_cast<uint64_t>(s.st_nlink);
+    return IOStatus::OK();
+  }
+
+  IOStatus AreFilesSame(const std::string& first, const std::string& second,
+                        const IOOptions& /*opts*/, bool* res,
+                        IODebugContext* /*dbg*/) override {
+    struct stat statbuf[2];
+    if (stat(first.c_str(), &statbuf[0]) != 0) {
+      return IOError("stat file", first, errno);
+    }
+    if (stat(second.c_str(), &statbuf[1]) != 0) {
+      return IOError("stat file", second, errno);
+    }
+
+    if (major(statbuf[0].st_dev) != major(statbuf[1].st_dev) ||
+        minor(statbuf[0].st_dev) != minor(statbuf[1].st_dev) ||
+        statbuf[0].st_ino != statbuf[1].st_ino) {
+      *res = false;
+    } else {
+      *res = true;
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/,
+                    FileLock** lock, IODebugContext* /*dbg*/) override {
+    *lock = nullptr;
+
+    LockHoldingInfo lhi;
+    int64_t current_time = 0;
+    // Ignore status code as the time is only used for error message.
+    SystemClock::Default()
+        ->GetCurrentTime(&current_time)
+        .PermitUncheckedError();
+    lhi.acquire_time = current_time;
+    lhi.acquiring_thread = Env::Default()->GetThreadID();
+
+    mutex_locked_files.Lock();
+    // If it already exists in the locked_files set, then it is already locked,
+    // and fail this lock attempt. Otherwise, insert it into locked_files.
+    // This check is needed because fcntl() does not detect lock conflict
+    // if the fcntl is issued by the same thread that earlier acquired
+    // this lock.
+    // We must do this check *before* opening the file:
+    // Otherwise, we will open a new file descriptor. Locks are associated with
+    // a process, not a file descriptor and when *any* file descriptor is
+    // closed, all locks the process holds for that *file* are released
+    const auto it_success = locked_files.insert({fname, lhi});
+    if (it_success.second == false) {
+      LockHoldingInfo prev_info = it_success.first->second;
+      mutex_locked_files.Unlock();
+      errno = ENOLCK;
+      // Note that the thread ID printed is the same one as the one in
+      // posix logger, but posix logger prints it hex format.
+      return IOError("lock hold by current process, acquire time " +
+                         std::to_string(prev_info.acquire_time) +
+                         " acquiring thread " +
+                         std::to_string(prev_info.acquiring_thread),
+                     fname, errno);
+    }
+
+    IOStatus result = IOStatus::OK();
+    int fd;
+    int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr);
+
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), flags, 0644);
+    }
+    if (fd < 0) {
+      result = IOError("while open a file for lock", fname, errno);
+    } else if (LockOrUnlock(fd, true) == -1) {
+      result = IOError("While lock file", fname, errno);
+      close(fd);
+    } else {
+      SetFD_CLOEXEC(fd, nullptr);
+      PosixFileLock* my_lock = new PosixFileLock;
+      my_lock->fd_ = fd;
+      my_lock->filename = fname;
+      *lock = my_lock;
+    }
+    if (!result.ok()) {
+      // If there is an error in locking, then remove the pathname from
+      // locked_files. (If we got this far, it did not exist in locked_files
+      // before this call.)
+      locked_files.erase(fname);
+    }
+
+    mutex_locked_files.Unlock();
+    return result;
+  }
+
+  IOStatus UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
+                      IODebugContext* /*dbg*/) override {
+    PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
+    IOStatus result;
+    mutex_locked_files.Lock();
+    // If we are unlocking, then verify that we had locked it earlier,
+    // it should already exist in locked_files. Remove it from locked_files.
+    if (locked_files.erase(my_lock->filename) != 1) {
+      errno = ENOLCK;
+      result = IOError("unlock", my_lock->filename, errno);
+    } else if (LockOrUnlock(my_lock->fd_, false) == -1) {
+      result = IOError("unlock", my_lock->filename, errno);
+    }
+    close(my_lock->fd_);
+    my_lock->Clear();
+    delete my_lock;
+    mutex_locked_files.Unlock();
+    return result;
+  }
+
+  IOStatus GetAbsolutePath(const std::string& db_path,
+                           const IOOptions& /*opts*/, std::string* output_path,
+                           IODebugContext* /*dbg*/) override {
+    if (!db_path.empty() && db_path[0] == '/') {
+      *output_path = db_path;
+      return IOStatus::OK();
+    }
+
+    char the_path[4096];
+    char* ret = getcwd(the_path, 4096);
+    if (ret == nullptr) {
+      return IOStatus::IOError(errnoStr(errno).c_str());
+    }
+
+    *output_path = ret;
+    return IOStatus::OK();
+  }
+
+  IOStatus GetTestDirectory(const IOOptions& /*opts*/, std::string* result,
+                            IODebugContext* /*dbg*/) override {
+    const char* env = getenv("TEST_TMPDIR");
+    if (env && env[0] != '\0') {
+      *result = env;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
+      *result = buf;
+    }
+    // Directory may already exist
+    {
+      IOOptions opts;
+      return CreateDirIfMissing(*result, opts, nullptr);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/,
+                        uint64_t* free_space,
+                        IODebugContext* /*dbg*/) override {
+    struct statvfs sbuf;
+
+    if (statvfs(fname.c_str(), &sbuf) < 0) {
+      return IOError("While doing statvfs", fname, errno);
+    }
+
+    // sbuf.bfree is total free space available to root
+    // sbuf.bavail is total free space available to unprivileged user
+    //  sbuf.bavail <= sbuf.bfree ... pick correct based upon effective user id
+    if (geteuid()) {
+      // non-zero user is unprivileged, or -1 if error.  take more conservative
+      // size
+      *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail);
+    } else {
+      // root user can access all disk space
+      *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus IsDirectory(const std::string& path, const IOOptions& /*opts*/,
+                       bool* is_dir, IODebugContext* /*dbg*/) override {
+    // First open
+    int fd = -1;
+    int flags = cloexec_flags(O_RDONLY, nullptr);
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(path.c_str(), flags);
+    }
+    if (fd < 0) {
+      return IOError("While open for IsDirectory()", path, errno);
+    }
+    IOStatus io_s;
+    struct stat sbuf;
+    if (fstat(fd, &sbuf) < 0) {
+      io_s = IOError("While doing stat for IsDirectory()", path, errno);
+    }
+    close(fd);
+    if (io_s.ok() && nullptr != is_dir) {
+      *is_dir = S_ISDIR(sbuf.st_mode);
+    }
+    return io_s;
+  }
+
+  FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                  const DBOptions& db_options) const override {
+    FileOptions optimized = file_options;
+    optimized.use_mmap_writes = false;
+    optimized.use_direct_writes = false;
+    optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
+    // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
+    // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
+    // test and make this false
+    optimized.fallocate_with_keep_size = true;
+    optimized.writable_file_max_buffer_size =
+        db_options.writable_file_max_buffer_size;
+    return optimized;
+  }
+
+  FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const override {
+    FileOptions optimized = file_options;
+    optimized.use_mmap_writes = false;
+    optimized.use_direct_writes = false;
+    optimized.fallocate_with_keep_size = true;
+    return optimized;
+  }
+#ifdef OS_LINUX
+  Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+    return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths);
+  }
+  Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+    logical_block_size_cache_.UnrefAndTryRemoveCachedLogicalBlockSize(paths);
+    return Status::OK();
+  }
+#endif
+ private:
+  bool forceMmapOff_ = false;  // do we override Env options?
+
+  // Returns true iff the named directory exists and is a directory.
+  virtual bool DirExists(const std::string& dname) {
+    struct stat statbuf;
+    if (stat(dname.c_str(), &statbuf) == 0) {
+      return S_ISDIR(statbuf.st_mode);
+    }
+    return false;  // stat() failed return false
+  }
+
+  bool SupportsFastAllocate(int fd) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+    struct statfs s;
+    if (fstatfs(fd, &s)) {
+      return false;
+    }
+    switch (s.f_type) {
+      case EXT4_SUPER_MAGIC:
+        return true;
+      case XFS_SUPER_MAGIC:
+        return true;
+      case TMPFS_MAGIC:
+        return true;
+      default:
+        return false;
+    }
+#else
+    (void)fd;
+    return false;
+#endif
+  }
+
+  void MaybeForceDisableMmap(int fd) {
+    static std::once_flag s_check_disk_for_mmap_once;
+    assert(this == FileSystem::Default().get());
+    std::call_once(
+        s_check_disk_for_mmap_once,
+        [this](int fdesc) {
+          // this will be executed once in the program's lifetime.
+          // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
+          if (!SupportsFastAllocate(fdesc)) {
+            forceMmapOff_ = true;
+          }
+        },
+        fd);
+  }
+
+#ifdef ROCKSDB_IOURING_PRESENT
+  bool IsIOUringEnabled() {
+    if (RocksDbIOUringEnable && RocksDbIOUringEnable()) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+#endif  // ROCKSDB_IOURING_PRESENT
+
+  // EXPERIMENTAL
+  //
+  // TODO akankshamahajan:
+  // 1. Update Poll API to take into account min_completions
+  // and returns if number of handles in io_handles (any order) completed is
+  // equal to atleast min_completions.
+  // 2. Currently in case of direct_io, Read API is called because of which call
+  // to Poll API fails as it expects IOHandle to be populated.
+  virtual IOStatus Poll(std::vector<void*>& io_handles,
+                        size_t /*min_completions*/) override {
+#if defined(ROCKSDB_IOURING_PRESENT)
+    // io_uring_queue_init.
+    struct io_uring* iu = nullptr;
+    if (thread_local_io_urings_) {
+      iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    }
+
+    // Init failed, platform doesn't support io_uring.
+    if (iu == nullptr) {
+      return IOStatus::NotSupported("Poll");
+    }
+
+    for (size_t i = 0; i < io_handles.size(); i++) {
+      // The request has been completed in earlier runs.
+      if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) {
+        continue;
+      }
+      // Loop until IO for io_handles[i] is completed.
+      while (true) {
+        // io_uring_wait_cqe.
+        struct io_uring_cqe* cqe = nullptr;
+        ssize_t ret = io_uring_wait_cqe(iu, &cqe);
+        if (ret) {
+          // abort as it shouldn't be in indeterminate state and there is no
+          // good way currently to handle this error.
+          abort();
+        }
+
+        // Step 3: Populate the request.
+        assert(cqe != nullptr);
+        Posix_IOHandle* posix_handle =
+            static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
+        assert(posix_handle->iu == iu);
+        if (posix_handle->iu != iu) {
+          return IOStatus::IOError("");
+        }
+        // Reset cqe data to catch any stray reuse of it
+        static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
+
+        FSReadRequest req;
+        req.scratch = posix_handle->scratch;
+        req.offset = posix_handle->offset;
+        req.len = posix_handle->len;
+
+        size_t finished_len = 0;
+        size_t bytes_read = 0;
+        bool read_again = false;
+        UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len,
+                     true /*async_read*/, posix_handle->use_direct_io,
+                     posix_handle->alignment, finished_len, &req, bytes_read,
+                     read_again);
+        posix_handle->is_finished = true;
+        io_uring_cqe_seen(iu, cqe);
+        posix_handle->cb(req, posix_handle->cb_arg);
+
+        (void)finished_len;
+        (void)bytes_read;
+        (void)read_again;
+
+        if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
+          break;
+        }
+      }
+    }
+    return IOStatus::OK();
+#else
+    (void)io_handles;
+    return IOStatus::NotSupported("Poll");
+#endif
+  }
+
+  virtual IOStatus AbortIO(std::vector<void*>& io_handles) override {
+#if defined(ROCKSDB_IOURING_PRESENT)
+    // io_uring_queue_init.
+    struct io_uring* iu = nullptr;
+    if (thread_local_io_urings_) {
+      iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    }
+
+    // Init failed, platform doesn't support io_uring.
+    // If Poll is not supported then it didn't submit any request and it should
+    // return OK.
+    if (iu == nullptr) {
+      return IOStatus::OK();
+    }
+
+    for (size_t i = 0; i < io_handles.size(); i++) {
+      Posix_IOHandle* posix_handle =
+          static_cast<Posix_IOHandle*>(io_handles[i]);
+      if (posix_handle->is_finished == true) {
+        continue;
+      }
+      assert(posix_handle->iu == iu);
+      if (posix_handle->iu != iu) {
+        return IOStatus::IOError("");
+      }
+
+      // Prepare the cancel request.
+      struct io_uring_sqe* sqe;
+      sqe = io_uring_get_sqe(iu);
+
+      // In order to cancel the request, sqe->addr of cancel request should
+      // match with the read request submitted which is posix_handle->iov.
+      io_uring_prep_cancel(sqe, &posix_handle->iov, 0);
+      // Sets sqe->user_data to posix_handle.
+      io_uring_sqe_set_data(sqe, posix_handle);
+
+      // submit the request.
+      ssize_t ret = io_uring_submit(iu);
+      if (ret < 0) {
+        fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
+        return IOStatus::IOError("io_uring_submit() requested but returned " +
+                                 std::to_string(ret));
+      }
+    }
+
+    // After submitting the requests, wait for the requests.
+    for (size_t i = 0; i < io_handles.size(); i++) {
+      if ((static_cast<Posix_IOHandle*>(io_handles[i]))->is_finished) {
+        continue;
+      }
+
+      while (true) {
+        struct io_uring_cqe* cqe = nullptr;
+        ssize_t ret = io_uring_wait_cqe(iu, &cqe);
+        if (ret) {
+          // abort as it shouldn't be in indeterminate state and there is no
+          // good way currently to handle this error.
+          abort();
+        }
+        assert(cqe != nullptr);
+
+        // Returns cqe->user_data.
+        Posix_IOHandle* posix_handle =
+            static_cast<Posix_IOHandle*>(io_uring_cqe_get_data(cqe));
+        assert(posix_handle->iu == iu);
+        if (posix_handle->iu != iu) {
+          return IOStatus::IOError("");
+        }
+        posix_handle->req_count++;
+
+        // Reset cqe data to catch any stray reuse of it
+        static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
+        io_uring_cqe_seen(iu, cqe);
+
+        // - If the request is cancelled successfully, the original request is
+        //   completed with -ECANCELED and the cancel request is completed with
+        //   a result of 0.
+        // - If the request was already running, the original may or
+        //   may not complete in error. The cancel request will complete with
+        //  -EALREADY for that case.
+        // - And finally, if the request to cancel wasn't
+        //   found, the cancel request is completed with -ENOENT.
+        //
+        // Every handle has to wait for 2 requests completion: original one and
+        // the cancel request which is tracked by PosixHandle::req_count.
+        if (posix_handle->req_count == 2 &&
+            static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
+          posix_handle->is_finished = true;
+          FSReadRequest req;
+          req.status = IOStatus::Aborted();
+          posix_handle->cb(req, posix_handle->cb_arg);
+
+          break;
+        }
+      }
+    }
+    return IOStatus::OK();
+#else
+    // If Poll is not supported then it didn't submit any request and it should
+    // return OK.
+    (void)io_handles;
+    return IOStatus::OK();
+#endif
+  }
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+  // io_uring instance
+  std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
+#endif
+
+  size_t page_size_;
+
+  // If true, allow non owner read access for db files. Otherwise, non-owner
+  //  has no access to db files.
+  bool allow_non_owner_access_;
+
+#ifdef OS_LINUX
+  static LogicalBlockSizeCache logical_block_size_cache_;
+#endif
+  static size_t GetLogicalBlockSize(const std::string& fname, int fd);
+  // In non-direct IO mode, this directly returns kDefaultPageSize.
+  // Otherwise call GetLogicalBlockSize.
+  static size_t GetLogicalBlockSizeForReadIfNeeded(const EnvOptions& options,
+                                                   const std::string& fname,
+                                                   int fd);
+  static size_t GetLogicalBlockSizeForWriteIfNeeded(const EnvOptions& options,
+                                                    const std::string& fname,
+                                                    int fd);
+};
+
+#ifdef OS_LINUX
+LogicalBlockSizeCache PosixFileSystem::logical_block_size_cache_;
+#endif
+
+size_t PosixFileSystem::GetLogicalBlockSize(const std::string& fname, int fd) {
+#ifdef OS_LINUX
+  return logical_block_size_cache_.GetLogicalBlockSize(fname, fd);
+#else
+  (void)fname;
+  return PosixHelper::GetLogicalBlockSizeOfFd(fd);
+#endif
+}
+
+size_t PosixFileSystem::GetLogicalBlockSizeForReadIfNeeded(
+    const EnvOptions& options, const std::string& fname, int fd) {
+  return options.use_direct_reads
+             ? PosixFileSystem::GetLogicalBlockSize(fname, fd)
+             : kDefaultPageSize;
+}
+
+size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded(
+    const EnvOptions& options, const std::string& fname, int fd) {
+  return options.use_direct_writes
+             ? PosixFileSystem::GetLogicalBlockSize(fname, fd)
+             : kDefaultPageSize;
+}
+
+PosixFileSystem::PosixFileSystem()
+    : forceMmapOff_(false),
+      page_size_(getpagesize()),
+      allow_non_owner_access_(true) {
+#if defined(ROCKSDB_IOURING_PRESENT)
+  // Test whether IOUring is supported, and if it does, create a managing
+  // object for thread local point so that in the future thread-local
+  // io_uring can be created.
+  struct io_uring* new_io_uring = CreateIOUring();
+  if (new_io_uring != nullptr) {
+    thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
+    delete new_io_uring;
+  }
+#endif
+}
+
+}  // namespace
+
+//
+// Default Posix FileSystem
+//
+std::shared_ptr<FileSystem> FileSystem::Default() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<FileSystem>, instance)
+  (std::make_shared<PosixFileSystem>());
+  return instance;
+}
+
+#ifndef ROCKSDB_LITE
+static FactoryFunc<FileSystem> posix_filesystem_reg =
+    ObjectLibrary::Default()->AddFactory<FileSystem>(
+        ObjectLibrary::PatternEntry("posix").AddSeparator("://", false),
+        [](const std::string& /* uri */, std::unique_ptr<FileSystem>* f,
+           std::string* /* errmsg */) {
+          f->reset(new PosixFileSystem());
+          return f->get();
+        });
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/env/fs_readonly.h b/src/rocksdb/env/fs_readonly.h
new file mode 100644
index 000000000..1bbe60784
--- /dev/null
+++ b/src/rocksdb/env/fs_readonly.h
@@ -0,0 +1,107 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A FileSystem wrapper that only allows read-only operation.
+//
+// This class has not been fully analyzed for providing strong security
+// guarantees.
+class ReadOnlyFileSystem : public FileSystemWrapper {
+  static inline IOStatus FailReadOnly() {
+    IOStatus s = IOStatus::IOError("Attempted write to ReadOnlyFileSystem");
+    assert(s.GetRetryable() == false);
+    return s;
+  }
+
+ public:
+  explicit ReadOnlyFileSystem(const std::shared_ptr<FileSystem>& base)
+      : FileSystemWrapper(base) {}
+
+  static const char* kClassName() { return "ReadOnlyFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewWritableFile(const std::string& /*fname*/,
+                           const FileOptions& /*options*/,
+                           std::unique_ptr<FSWritableFile>* /*result*/,
+                           IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus ReuseWritableFile(const std::string& /*fname*/,
+                             const std::string& /*old_fname*/,
+                             const FileOptions& /*options*/,
+                             std::unique_ptr<FSWritableFile>* /*result*/,
+                             IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus NewRandomRWFile(const std::string& /*fname*/,
+                           const FileOptions& /*options*/,
+                           std::unique_ptr<FSRandomRWFile>* /*result*/,
+                           IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus NewDirectory(const std::string& /*dir*/,
+                        const IOOptions& /*options*/,
+                        std::unique_ptr<FSDirectory>* /*result*/,
+                        IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus DeleteFile(const std::string& /*fname*/,
+                      const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus CreateDir(const std::string& /*dirname*/,
+                     const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus CreateDirIfMissing(const std::string& dirname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override {
+    // Allow if dir already exists
+    bool is_dir = false;
+    IOStatus s = IsDirectory(dirname, options, &is_dir, dbg);
+    if (s.ok() && is_dir) {
+      return s;
+    } else {
+      return FailReadOnly();
+    }
+  }
+  IOStatus DeleteDir(const std::string& /*dirname*/,
+                     const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus RenameFile(const std::string& /*src*/, const std::string& /*dest*/,
+                      const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus LinkFile(const std::string& /*src*/, const std::string& /*dest*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus LockFile(const std::string& /*fname*/, const IOOptions& /*options*/,
+                    FileLock** /*lock*/, IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+  IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*options*/,
+                     std::shared_ptr<Logger>* /*result*/,
+                     IODebugContext* /*dbg*/) override {
+    return FailReadOnly();
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/env/fs_remap.cc b/src/rocksdb/env/fs_remap.cc
new file mode 100644
index 000000000..fd9241181
--- /dev/null
+++ b/src/rocksdb/env/fs_remap.cc
@@ -0,0 +1,343 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "env/fs_remap.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+RemapFileSystem::RemapFileSystem(const std::shared_ptr<FileSystem>& base)
+    : FileSystemWrapper(base) {}
+
+std::pair<IOStatus, std::string> RemapFileSystem::EncodePathWithNewBasename(
+    const std::string& path) {
+  // No difference by default
+  return EncodePath(path);
+}
+
+Status RemapFileSystem::RegisterDbPaths(const std::vector<std::string>& paths) {
+  std::vector<std::string> encoded_paths;
+  encoded_paths.reserve(paths.size());
+  for (auto& path : paths) {
+    auto status_and_enc_path = EncodePathWithNewBasename(path);
+    if (!status_and_enc_path.first.ok()) {
+      return status_and_enc_path.first;
+    }
+    encoded_paths.emplace_back(status_and_enc_path.second);
+  }
+  return FileSystemWrapper::RegisterDbPaths(encoded_paths);
+}
+
+Status RemapFileSystem::UnregisterDbPaths(
+    const std::vector<std::string>& paths) {
+  std::vector<std::string> encoded_paths;
+  encoded_paths.reserve(paths.size());
+  for (auto& path : paths) {
+    auto status_and_enc_path = EncodePathWithNewBasename(path);
+    if (!status_and_enc_path.first.ok()) {
+      return status_and_enc_path.first;
+    }
+    encoded_paths.emplace_back(status_and_enc_path.second);
+  }
+  return FileSystemWrapper::UnregisterDbPaths(encoded_paths);
+}
+
+IOStatus RemapFileSystem::NewSequentialFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::NewSequentialFile(status_and_enc_path.second,
+                                              options, result, dbg);
+}
+
+IOStatus RemapFileSystem::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::NewRandomAccessFile(status_and_enc_path.second,
+                                                options, result, dbg);
+}
+
+IOStatus RemapFileSystem::NewWritableFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::NewWritableFile(status_and_enc_path.second, options,
+                                            result, dbg);
+}
+
+IOStatus RemapFileSystem::ReuseWritableFile(
+    const std::string& fname, const std::string& old_fname,
+    const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+    IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  auto status_and_old_enc_path = EncodePath(old_fname);
+  if (!status_and_old_enc_path.first.ok()) {
+    return status_and_old_enc_path.first;
+  }
+  return FileSystemWrapper::ReuseWritableFile(status_and_old_enc_path.second,
+                                              status_and_old_enc_path.second,
+                                              options, result, dbg);
+}
+
+IOStatus RemapFileSystem::NewRandomRWFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::NewRandomRWFile(status_and_enc_path.second, options,
+                                            result, dbg);
+}
+
+IOStatus RemapFileSystem::NewDirectory(const std::string& dir,
+                                       const IOOptions& options,
+                                       std::unique_ptr<FSDirectory>* result,
+                                       IODebugContext* dbg) {
+  // A hassle to remap DirFsyncOptions::renamed_new_name
+  class RemapFSDirectory : public FSDirectoryWrapper {
+   public:
+    RemapFSDirectory(RemapFileSystem* fs, std::unique_ptr<FSDirectory>&& t)
+        : FSDirectoryWrapper(std::move(t)), fs_(fs) {}
+    IOStatus FsyncWithDirOptions(
+        const IOOptions& options, IODebugContext* dbg,
+        const DirFsyncOptions& dir_fsync_options) override {
+      if (dir_fsync_options.renamed_new_name.empty()) {
+        return FSDirectoryWrapper::FsyncWithDirOptions(options, dbg,
+                                                       dir_fsync_options);
+      } else {
+        auto status_and_enc_path =
+            fs_->EncodePath(dir_fsync_options.renamed_new_name);
+        if (status_and_enc_path.first.ok()) {
+          DirFsyncOptions mapped_options = dir_fsync_options;
+          mapped_options.renamed_new_name = status_and_enc_path.second;
+          return FSDirectoryWrapper::FsyncWithDirOptions(options, dbg,
+                                                         mapped_options);
+        } else {
+          return status_and_enc_path.first;
+        }
+      }
+    }
+
+   private:
+    RemapFileSystem* const fs_;
+  };
+
+  auto status_and_enc_path = EncodePathWithNewBasename(dir);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  IOStatus ios = FileSystemWrapper::NewDirectory(status_and_enc_path.second,
+                                                 options, result, dbg);
+  if (ios.ok()) {
+    *result = std::make_unique<RemapFSDirectory>(this, std::move(*result));
+  }
+  return ios;
+}
+
+IOStatus RemapFileSystem::FileExists(const std::string& fname,
+                                     const IOOptions& options,
+                                     IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::FileExists(status_and_enc_path.second, options,
+                                       dbg);
+}
+
+IOStatus RemapFileSystem::GetChildren(const std::string& dir,
+                                      const IOOptions& options,
+                                      std::vector<std::string>* result,
+                                      IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePath(dir);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::GetChildren(status_and_enc_path.second, options,
+                                        result, dbg);
+}
+
+IOStatus RemapFileSystem::GetChildrenFileAttributes(
+    const std::string& dir, const IOOptions& options,
+    std::vector<FileAttributes>* result, IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePath(dir);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::GetChildrenFileAttributes(
+      status_and_enc_path.second, options, result, dbg);
+}
+
+IOStatus RemapFileSystem::DeleteFile(const std::string& fname,
+                                     const IOOptions& options,
+                                     IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePath(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::DeleteFile(status_and_enc_path.second, options,
+                                       dbg);
+}
+
+IOStatus RemapFileSystem::CreateDir(const std::string& dirname,
+                                    const IOOptions& options,
+                                    IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(dirname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::CreateDir(status_and_enc_path.second, options, dbg);
+}
+
+IOStatus RemapFileSystem::CreateDirIfMissing(const std::string& dirname,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(dirname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::CreateDirIfMissing(status_and_enc_path.second,
+                                               options, dbg);
+}
+
+IOStatus RemapFileSystem::DeleteDir(const std::string& dirname,
+                                    const IOOptions& options,
+                                    IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePath(dirname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::DeleteDir(status_and_enc_path.second, options, dbg);
+}
+
+IOStatus RemapFileSystem::GetFileSize(const std::string& fname,
+                                      const IOOptions& options,
+                                      uint64_t* file_size,
+                                      IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePath(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::GetFileSize(status_and_enc_path.second, options,
+                                        file_size, dbg);
+}
+
+IOStatus RemapFileSystem::GetFileModificationTime(const std::string& fname,
+                                                  const IOOptions& options,
+                                                  uint64_t* file_mtime,
+                                                  IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePath(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::GetFileModificationTime(status_and_enc_path.second,
+                                                    options, file_mtime, dbg);
+}
+
+IOStatus RemapFileSystem::IsDirectory(const std::string& path,
+                                      const IOOptions& options, bool* is_dir,
+                                      IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePath(path);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::IsDirectory(status_and_enc_path.second, options,
+                                        is_dir, dbg);
+}
+
+IOStatus RemapFileSystem::RenameFile(const std::string& src,
+                                     const std::string& dest,
+                                     const IOOptions& options,
+                                     IODebugContext* dbg) {
+  auto status_and_src_enc_path = EncodePath(src);
+  if (!status_and_src_enc_path.first.ok()) {
+    if (status_and_src_enc_path.first.IsNotFound()) {
+      const IOStatus& s = status_and_src_enc_path.first;
+      status_and_src_enc_path.first = IOStatus::PathNotFound(s.ToString());
+    }
+    return status_and_src_enc_path.first;
+  }
+  auto status_and_dest_enc_path = EncodePathWithNewBasename(dest);
+  if (!status_and_dest_enc_path.first.ok()) {
+    return status_and_dest_enc_path.first;
+  }
+  return FileSystemWrapper::RenameFile(status_and_src_enc_path.second,
+                                       status_and_dest_enc_path.second, options,
+                                       dbg);
+}
+
+IOStatus RemapFileSystem::LinkFile(const std::string& src,
+                                   const std::string& dest,
+                                   const IOOptions& options,
+                                   IODebugContext* dbg) {
+  auto status_and_src_enc_path = EncodePath(src);
+  if (!status_and_src_enc_path.first.ok()) {
+    return status_and_src_enc_path.first;
+  }
+  auto status_and_dest_enc_path = EncodePathWithNewBasename(dest);
+  if (!status_and_dest_enc_path.first.ok()) {
+    return status_and_dest_enc_path.first;
+  }
+  return FileSystemWrapper::LinkFile(status_and_src_enc_path.second,
+                                     status_and_dest_enc_path.second, options,
+                                     dbg);
+}
+
+IOStatus RemapFileSystem::LockFile(const std::string& fname,
+                                   const IOOptions& options, FileLock** lock,
+                                   IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  // FileLock subclasses may store path (e.g., PosixFileLock stores it). We
+  // can skip stripping the chroot directory from this path because callers
+  // shouldn't use it.
+  return FileSystemWrapper::LockFile(status_and_enc_path.second, options, lock,
+                                     dbg);
+}
+
+IOStatus RemapFileSystem::NewLogger(const std::string& fname,
+                                    const IOOptions& options,
+                                    std::shared_ptr<Logger>* result,
+                                    IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(fname);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::NewLogger(status_and_enc_path.second, options,
+                                      result, dbg);
+}
+
+IOStatus RemapFileSystem::GetAbsolutePath(const std::string& db_path,
+                                          const IOOptions& options,
+                                          std::string* output_path,
+                                          IODebugContext* dbg) {
+  auto status_and_enc_path = EncodePathWithNewBasename(db_path);
+  if (!status_and_enc_path.first.ok()) {
+    return status_and_enc_path.first;
+  }
+  return FileSystemWrapper::GetAbsolutePath(status_and_enc_path.second, options,
+                                            output_path, dbg);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/env/fs_remap.h b/src/rocksdb/env/fs_remap.h
new file mode 100644
index 000000000..1f6e061fd
--- /dev/null
+++ b/src/rocksdb/env/fs_remap.h
@@ -0,0 +1,139 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <utility>
+
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An abstract FileSystem wrapper that creates a view of an existing
+// FileSystem by remapping names in some way.
+//
+// This class has not been fully analyzed for providing strong security
+// guarantees.
+class RemapFileSystem : public FileSystemWrapper {
+ public:
+  explicit RemapFileSystem(const std::shared_ptr<FileSystem>& base);
+
+ protected:
+  // Returns status and mapped-to path in the wrapped filesystem.
+  // If it returns non-OK status, the returned path should not be used.
+  virtual std::pair<IOStatus, std::string> EncodePath(
+      const std::string& path) = 0;
+
+  // Similar to EncodePath() except used in cases in which it is OK for
+  // no file or directory on 'path' to already exist, such as if the
+  // operation would create one. However, the parent of 'path' is expected
+  // to exist for the operation to succeed.
+  // Default implementation: call EncodePath
+  virtual std::pair<IOStatus, std::string> EncodePathWithNewBasename(
+      const std::string& path);
+
+ public:
+  // Left abstract:
+  // const char* Name() const override { ... }
+  static const char* kClassName() { return "RemapFileSystem"; }
+  bool IsInstanceOf(const std::string& id) const override {
+    if (id == kClassName()) {
+      return true;
+    } else {
+      return FileSystemWrapper::IsInstanceOf(id);
+    }
+  }
+
+  Status RegisterDbPaths(const std::vector<std::string>& paths) override;
+
+  Status UnregisterDbPaths(const std::vector<std::string>& paths) override;
+
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& options,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override;
+
+  IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus NewDirectory(const std::string& dir, const IOOptions& options,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+
+  IOStatus FileExists(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+                       std::vector<std::string>* result,
+                       IODebugContext* dbg) override;
+
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& options,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* dbg) override;
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus CreateDirIfMissing(const std::string& dirname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override;
+
+  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override;
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& options,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* dbg) override;
+
+  IOStatus IsDirectory(const std::string& path, const IOOptions& options,
+                       bool* is_dir, IODebugContext* dbg) override;
+
+  IOStatus RenameFile(const std::string& src, const std::string& dest,
+                      const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus LinkFile(const std::string& src, const std::string& dest,
+                    const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                    FileLock** lock, IODebugContext* dbg) override;
+
+  IOStatus NewLogger(const std::string& fname, const IOOptions& options,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* dbg) override;
+
+  IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options,
+                           std::string* output_path,
+                           IODebugContext* dbg) override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/env/io_posix.cc b/src/rocksdb/env/io_posix.cc
new file mode 100644
index 000000000..0ec0e9c83
--- /dev/null
+++ b/src/rocksdb/env/io_posix.cc
@@ -0,0 +1,1733 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef ROCKSDB_LIB_IO_POSIX
+#include "env/io_posix.h"
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include <algorithm>
+#if defined(OS_LINUX)
+#include <linux/fs.h>
+#ifndef FALLOC_FL_KEEP_SIZE
+#include <linux/falloc.h>
+#endif
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef OS_LINUX
+#include <sys/statfs.h>
+#include <sys/sysmacros.h>
+#endif
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
+#define F_LINUX_SPECIFIC_BASE 1024
+#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string IOErrorMsg(const std::string& context,
+                       const std::string& file_name) {
+  if (file_name.empty()) {
+    return context;
+  }
+  return context + ": " + file_name;
+}
+
+// file_name can be left empty if it is not unkown.
+IOStatus IOError(const std::string& context, const std::string& file_name,
+                 int err_number) {
+  switch (err_number) {
+    case ENOSPC: {
+      IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name),
+                                     errnoStr(err_number).c_str());
+      s.SetRetryable(true);
+      return s;
+    }
+    case ESTALE:
+      return IOStatus::IOError(IOStatus::kStaleFile);
+    case ENOENT:
+      return IOStatus::PathNotFound(IOErrorMsg(context, file_name),
+                                    errnoStr(err_number).c_str());
+    default:
+      return IOStatus::IOError(IOErrorMsg(context, file_name),
+                               errnoStr(err_number).c_str());
+  }
+}
+
+// A wrapper for fadvise, if the platform doesn't support fadvise,
+// it will simply return 0.
+int Fadvise(int fd, off_t offset, size_t len, int advice) {
+#ifdef OS_LINUX
+  return posix_fadvise(fd, offset, len, advice);
+#else
+  (void)fd;
+  (void)offset;
+  (void)len;
+  (void)advice;
+  return 0;  // simply do nothing.
+#endif
+}
+
+// A wrapper for fadvise, if the platform doesn't support fadvise,
+// it will simply return 0.
+int Madvise(void* addr, size_t len, int advice) {
+#ifdef OS_LINUX
+  return posix_madvise(addr, len, advice);
+#else
+  (void)addr;
+  (void)len;
+  (void)advice;
+  return 0;  // simply do nothing.
+#endif
+}
+
+namespace {
+
+// On MacOS (and probably *BSD), the posix write and pwrite calls do not support
+// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
+// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
+// the writes aligned.
+
+bool PosixWrite(int fd, const char* buf, size_t nbyte) {
+  const size_t kLimit1Gb = 1UL << 30;
+
+  const char* src = buf;
+  size_t left = nbyte;
+
+  while (left != 0) {
+    size_t bytes_to_write = std::min(left, kLimit1Gb);
+
+    ssize_t done = write(fd, src, bytes_to_write);
+    if (done < 0) {
+      if (errno == EINTR) {
+        continue;
+      }
+      return false;
+    }
+    left -= done;
+    src += done;
+  }
+  return true;
+}
+
+bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
+  const size_t kLimit1Gb = 1UL << 30;
+
+  const char* src = buf;
+  size_t left = nbyte;
+
+  while (left != 0) {
+    size_t bytes_to_write = std::min(left, kLimit1Gb);
+
+    ssize_t done = pwrite(fd, src, bytes_to_write, offset);
+    if (done < 0) {
+      if (errno == EINTR) {
+        continue;
+      }
+      return false;
+    }
+    left -= done;
+    offset += done;
+    src += done;
+  }
+
+  return true;
+}
+
+#ifdef ROCKSDB_RANGESYNC_PRESENT
+
+#if !defined(ZFS_SUPER_MAGIC)
+// The magic number for ZFS was not exposed until recently. It should be fixed
+// forever so we can just copy the magic number here.
+#define ZFS_SUPER_MAGIC 0x2fc12fc1
+#endif
+
+bool IsSyncFileRangeSupported(int fd) {
+  // This function tracks and checks for cases where we know `sync_file_range`
+  // definitely will not work properly despite passing the compile-time check
+  // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks
+  // fail in unexpected ways, we allow `sync_file_range` to be used. This way
+  // should minimize risk of impacting existing use cases.
+  struct statfs buf;
+  int ret = fstatfs(fd, &buf);
+  assert(ret == 0);
+  if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) {
+    // Testing on ZFS showed the writeback did not happen asynchronously when
+    // `sync_file_range` was called, even though it returned success. Avoid it
+    // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
+    // even though this'll incur extra I/O for metadata.
+    return false;
+  }
+
+  ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */);
+  assert(!(ret == -1 && errno != ENOSYS));
+  if (ret == -1 && errno == ENOSYS) {
+    // `sync_file_range` is not implemented on all platforms even if
+    // compile-time checks pass and a supported filesystem is in-use. For
+    // example, using ext4 on WSL (Windows Subsystem for Linux),
+    // `sync_file_range()` returns `ENOSYS`
+    // ("Function not implemented").
+    return false;
+  }
+  // None of the known cases matched, so allow `sync_file_range` use.
+  return true;
+}
+
+#undef ZFS_SUPER_MAGIC
+
+#endif  // ROCKSDB_RANGESYNC_PRESENT
+
+}  // anonymous namespace
+
+/*
+ * PosixSequentialFile
+ */
+PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
+                                         int fd, size_t logical_block_size,
+                                         const EnvOptions& options)
+    : filename_(fname),
+      file_(file),
+      fd_(fd),
+      use_direct_io_(options.use_direct_reads),
+      logical_sector_size_(logical_block_size) {
+  assert(!options.use_direct_reads || !options.use_mmap_reads);
+}
+
+PosixSequentialFile::~PosixSequentialFile() {
+  if (!use_direct_io()) {
+    assert(file_);
+    fclose(file_);
+  } else {
+    assert(fd_);
+    close(fd_);
+  }
+}
+
+IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
+                                   Slice* result, char* scratch,
+                                   IODebugContext* /*dbg*/) {
+  assert(result != nullptr && !use_direct_io());
+  IOStatus s;
+  size_t r = 0;
+  do {
+    clearerr(file_);
+    r = fread_unlocked(scratch, 1, n, file_);
+  } while (r == 0 && ferror(file_) && errno == EINTR);
+  *result = Slice(scratch, r);
+  if (r < n) {
+    if (feof(file_)) {
+      // We leave status as ok if we hit the end of the file
+      // We also clear the error so that the reads can continue
+      // if a new data is written to the file
+      clearerr(file_);
+    } else {
+      // A partial read with an error: return a non-ok status
+      s = IOError("While reading file sequentially", filename_, errno);
+    }
+  }
+  return s;
+}
+
+IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
+                                             const IOOptions& /*opts*/,
+                                             Slice* result, char* scratch,
+                                             IODebugContext* /*dbg*/) {
+  assert(use_direct_io());
+  assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
+  assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
+  assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
+
+  IOStatus s;
+  ssize_t r = -1;
+  size_t left = n;
+  char* ptr = scratch;
+  while (left > 0) {
+    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
+    if (r <= 0) {
+      if (r == -1 && errno == EINTR) {
+        continue;
+      }
+      break;
+    }
+    ptr += r;
+    offset += r;
+    left -= r;
+    if (!IsSectorAligned(r, GetRequiredBufferAlignment())) {
+      // Bytes reads don't fill sectors. Should only happen at the end
+      // of the file.
+      break;
+    }
+  }
+  if (r < 0) {
+    // An error: return a non-ok status
+    s = IOError("While pread " + std::to_string(n) + " bytes from offset " +
+                    std::to_string(offset),
+                filename_, errno);
+  }
+  *result = Slice(scratch, (r < 0) ? 0 : n - left);
+  return s;
+}
+
+IOStatus PosixSequentialFile::Skip(uint64_t n) {
+  if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
+    return IOError("While fseek to skip " + std::to_string(n) + " bytes",
+                   filename_, errno);
+  }
+  return IOStatus::OK();
+}
+
+IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  if (!use_direct_io()) {
+    // free OS pages
+    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+    if (ret != 0) {
+      return IOError("While fadvise NotNeeded offset " +
+                         std::to_string(offset) + " len " +
+                         std::to_string(length),
+                     filename_, errno);
+    }
+  }
+  return IOStatus::OK();
+#endif
+}
+
+/*
+ * PosixRandomAccessFile
+ */
+#if defined(OS_LINUX)
+size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
+  if (max_size < kMaxVarint64Length * 3) {
+    return 0;
+  }
+
+  struct stat buf;
+  int result = fstat(fd, &buf);
+  if (result == -1) {
+    return 0;
+  }
+
+  long version = 0;
+  result = ioctl(fd, FS_IOC_GETVERSION, &version);
+  TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
+  if (result == -1) {
+    return 0;
+  }
+  uint64_t uversion = (uint64_t)version;
+
+  char* rid = id;
+  rid = EncodeVarint64(rid, buf.st_dev);
+  rid = EncodeVarint64(rid, buf.st_ino);
+  rid = EncodeVarint64(rid, uversion);
+  assert(rid >= id);
+  return static_cast<size_t>(rid - id);
+}
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_AIX)
+size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
+  if (max_size < kMaxVarint64Length * 3) {
+    return 0;
+  }
+
+  struct stat buf;
+  int result = fstat(fd, &buf);
+  if (result == -1) {
+    return 0;
+  }
+
+  char* rid = id;
+  rid = EncodeVarint64(rid, buf.st_dev);
+  rid = EncodeVarint64(rid, buf.st_ino);
+  rid = EncodeVarint64(rid, buf.st_gen);
+  assert(rid >= id);
+  return static_cast<size_t>(rid - id);
+}
+#endif
+
+#ifdef OS_LINUX
+std::string RemoveTrailingSlash(const std::string& path) {
+  std::string p = path;
+  if (p.size() > 1 && p.back() == '/') {
+    p.pop_back();
+  }
+  return p;
+}
+
+Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize(
+    const std::vector<std::string>& directories) {
+  std::vector<std::string> dirs;
+  dirs.reserve(directories.size());
+  for (auto& d : directories) {
+    dirs.emplace_back(RemoveTrailingSlash(d));
+  }
+
+  std::map<std::string, size_t> dir_sizes;
+  {
+    ReadLock lock(&cache_mutex_);
+    for (const auto& dir : dirs) {
+      if (cache_.find(dir) == cache_.end()) {
+        dir_sizes.emplace(dir, 0);
+      }
+    }
+  }
+
+  Status s;
+  for (auto& dir_size : dir_sizes) {
+    s = get_logical_block_size_of_directory_(dir_size.first, &dir_size.second);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  WriteLock lock(&cache_mutex_);
+  for (const auto& dir : dirs) {
+    auto& v = cache_[dir];
+    v.ref++;
+    auto dir_size = dir_sizes.find(dir);
+    if (dir_size != dir_sizes.end()) {
+      v.size = dir_size->second;
+    }
+  }
+  return s;
+}
+
+void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize(
+    const std::vector<std::string>& directories) {
+  std::vector<std::string> dirs;
+  dirs.reserve(directories.size());
+  for (auto& dir : directories) {
+    dirs.emplace_back(RemoveTrailingSlash(dir));
+  }
+
+  WriteLock lock(&cache_mutex_);
+  for (const auto& dir : dirs) {
+    auto it = cache_.find(dir);
+    if (it != cache_.end() && !(--(it->second.ref))) {
+      cache_.erase(it);
+    }
+  }
+}
+
+size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
+                                                  int fd) {
+  std::string dir = fname.substr(0, fname.find_last_of("/"));
+  if (dir.empty()) {
+    dir = "/";
+  }
+  {
+    ReadLock lock(&cache_mutex_);
+    auto it = cache_.find(dir);
+    if (it != cache_.end()) {
+      return it->second.size;
+    }
+  }
+  return get_logical_block_size_of_fd_(fd);
+}
+#endif
+
+Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
+                                                   size_t* size) {
+  int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
+  if (fd == -1) {
+    close(fd);
+    return Status::IOError("Cannot open directory " + directory);
+  }
+  *size = PosixHelper::GetLogicalBlockSizeOfFd(fd);
+  close(fd);
+  return Status::OK();
+}
+
+size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
+#ifdef OS_LINUX
+  struct stat buf;
+  int result = fstat(fd, &buf);
+  if (result == -1) {
+    return kDefaultPageSize;
+  }
+  if (major(buf.st_dev) == 0) {
+    // Unnamed devices (e.g. non-device mounts), reserved as null device number.
+    // These don't have an entry in /sys/dev/block/. Return a sensible default.
+    return kDefaultPageSize;
+  }
+
+  // Reading queue/logical_block_size does not require special permissions.
+  const int kBufferSize = 100;
+  char path[kBufferSize];
+  char real_path[PATH_MAX + 1];
+  snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
+           minor(buf.st_dev));
+  if (realpath(path, real_path) == nullptr) {
+    return kDefaultPageSize;
+  }
+  std::string device_dir(real_path);
+  if (!device_dir.empty() && device_dir.back() == '/') {
+    device_dir.pop_back();
+  }
+  // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
+  // and nvme0n1 have it.
+  // $ ls -al '/sys/dev/block/8:3'
+  // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
+  // ../../block/sda/sda3
+  // $ ls -al '/sys/dev/block/259:4'
+  // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
+  // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
+  size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
+  if (parent_end == std::string::npos) {
+    return kDefaultPageSize;
+  }
+  size_t parent_begin = device_dir.rfind('/', parent_end - 1);
+  if (parent_begin == std::string::npos) {
+    return kDefaultPageSize;
+  }
+  std::string parent =
+      device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
+  std::string child = device_dir.substr(parent_end + 1, std::string::npos);
+  if (parent != "block" &&
+      (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
+    device_dir = device_dir.substr(0, parent_end);
+  }
+  std::string fname = device_dir + "/queue/logical_block_size";
+  FILE* fp;
+  size_t size = 0;
+  fp = fopen(fname.c_str(), "r");
+  if (fp != nullptr) {
+    char* line = nullptr;
+    size_t len = 0;
+    if (getline(&line, &len, fp) != -1) {
+      sscanf(line, "%zu", &size);
+    }
+    free(line);
+    fclose(fp);
+  }
+  if (size != 0 && (size & (size - 1)) == 0) {
+    return size;
+  }
+#endif
+  (void)fd;
+  return kDefaultPageSize;
+}
+
+/*
+ * PosixRandomAccessFile
+ *
+ * pread() based random-access
+ */
+PosixRandomAccessFile::PosixRandomAccessFile(
+    const std::string& fname, int fd, size_t logical_block_size,
+    const EnvOptions& options
+#if defined(ROCKSDB_IOURING_PRESENT)
+    ,
+    ThreadLocalPtr* thread_local_io_urings
+#endif
+    )
+    : filename_(fname),
+      fd_(fd),
+      use_direct_io_(options.use_direct_reads),
+      logical_sector_size_(logical_block_size)
+#if defined(ROCKSDB_IOURING_PRESENT)
+      ,
+      thread_local_io_urings_(thread_local_io_urings)
+#endif
+{
+  assert(!options.use_direct_reads || !options.use_mmap_reads);
+  assert(!options.use_mmap_reads);
+}
+
+PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
+
+IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
+                                     const IOOptions& /*opts*/, Slice* result,
+                                     char* scratch,
+                                     IODebugContext* /*dbg*/) const {
+  if (use_direct_io()) {
+    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
+  }
+  IOStatus s;
+  ssize_t r = -1;
+  size_t left = n;
+  char* ptr = scratch;
+  while (left > 0) {
+    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
+    if (r <= 0) {
+      if (r == -1 && errno == EINTR) {
+        continue;
+      }
+      break;
+    }
+    ptr += r;
+    offset += r;
+    left -= r;
+    if (use_direct_io() &&
+        r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
+      // Bytes reads don't fill sectors. Should only happen at the end
+      // of the file.
+      break;
+    }
+  }
+  if (r < 0) {
+    // An error: return a non-ok status
+    s = IOError("While pread offset " + std::to_string(offset) + " len " +
+                    std::to_string(n),
+                filename_, errno);
+  }
+  *result = Slice(scratch, (r < 0) ? 0 : n - left);
+  return s;
+}
+
+IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                                          const IOOptions& options,
+                                          IODebugContext* dbg) {
+  if (use_direct_io()) {
+    for (size_t i = 0; i < num_reqs; i++) {
+      assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment()));
+      assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment()));
+      assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment()));
+    }
+  }
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+  struct io_uring* iu = nullptr;
+  if (thread_local_io_urings_) {
+    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    if (iu == nullptr) {
+      iu = CreateIOUring();
+      if (iu != nullptr) {
+        thread_local_io_urings_->Reset(iu);
+      }
+    }
+  }
+
+  // Init failed, platform doesn't support io_uring. Fall back to
+  // serialized reads
+  if (iu == nullptr) {
+    return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
+  }
+
+  IOStatus ios = IOStatus::OK();
+
+  struct WrappedReadRequest {
+    FSReadRequest* req;
+    struct iovec iov;
+    size_t finished_len;
+    explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
+  };
+
+  autovector<WrappedReadRequest, 32> req_wraps;
+  autovector<WrappedReadRequest*, 4> incomplete_rq_list;
+  std::unordered_set<WrappedReadRequest*> wrap_cache;
+
+  for (size_t i = 0; i < num_reqs; i++) {
+    req_wraps.emplace_back(&reqs[i]);
+  }
+
+  size_t reqs_off = 0;
+  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
+    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();
+
+    // If requests exceed depth, split it into batches
+    if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth;
+
+    assert(incomplete_rq_list.size() <= this_reqs);
+    for (size_t i = 0; i < this_reqs; i++) {
+      WrappedReadRequest* rep_to_submit;
+      if (i < incomplete_rq_list.size()) {
+        rep_to_submit = incomplete_rq_list[i];
+      } else {
+        rep_to_submit = &req_wraps[reqs_off++];
+      }
+      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
+      rep_to_submit->iov.iov_base =
+          rep_to_submit->req->scratch + rep_to_submit->finished_len;
+      rep_to_submit->iov.iov_len =
+          rep_to_submit->req->len - rep_to_submit->finished_len;
+
+      struct io_uring_sqe* sqe;
+      sqe = io_uring_get_sqe(iu);
+      io_uring_prep_readv(
+          sqe, fd_, &rep_to_submit->iov, 1,
+          rep_to_submit->req->offset + rep_to_submit->finished_len);
+      io_uring_sqe_set_data(sqe, rep_to_submit);
+      wrap_cache.emplace(rep_to_submit);
+    }
+    incomplete_rq_list.clear();
+
+    ssize_t ret =
+        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
+    TEST_SYNC_POINT_CALLBACK(
+        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
+        &ret);
+    TEST_SYNC_POINT_CALLBACK(
+        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
+        iu);
+
+    if (static_cast<size_t>(ret) != this_reqs) {
+      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
+      // If error happens and we submitted fewer than expected, it is an
+      // exception case and we don't retry here. We should still consume
+      // what is is submitted in the ring.
+      for (ssize_t i = 0; i < ret; i++) {
+        struct io_uring_cqe* cqe = nullptr;
+        io_uring_wait_cqe(iu, &cqe);
+        if (cqe != nullptr) {
+          io_uring_cqe_seen(iu, cqe);
+        }
+      }
+      return IOStatus::IOError("io_uring_submit_and_wait() requested " +
+                               std::to_string(this_reqs) + " but returned " +
+                               std::to_string(ret));
+    }
+
+    for (size_t i = 0; i < this_reqs; i++) {
+      struct io_uring_cqe* cqe = nullptr;
+      WrappedReadRequest* req_wrap;
+
+      // We could use the peek variant here, but this seems safer in terms
+      // of our initial wait not reaping all completions
+      ret = io_uring_wait_cqe(iu, &cqe);
+      TEST_SYNC_POINT_CALLBACK(
+          "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret);
+      if (ret) {
+        ios = IOStatus::IOError("io_uring_wait_cqe() returns " +
+                                std::to_string(ret));
+
+        if (cqe != nullptr) {
+          io_uring_cqe_seen(iu, cqe);
+        }
+        continue;
+      }
+
+      req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
+      // Reset cqe data to catch any stray reuse of it
+      static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
+      // Check that we got a valid unique cqe data
+      auto wrap_check = wrap_cache.find(req_wrap);
+      if (wrap_check == wrap_cache.end()) {
+        fprintf(stderr,
+                "PosixRandomAccessFile::MultiRead: "
+                "Bad cqe data from IO uring - %p\n",
+                req_wrap);
+        port::PrintStack();
+        ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
+                                std::to_string((uint64_t)req_wrap));
+        continue;
+      }
+      wrap_cache.erase(wrap_check);
+
+      FSReadRequest* req = req_wrap->req;
+      size_t bytes_read = 0;
+      bool read_again = false;
+      UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
+                   false /*async_read*/, use_direct_io(),
+                   GetRequiredBufferAlignment(), req_wrap->finished_len, req,
+                   bytes_read, read_again);
+      int32_t res = cqe->res;
+      if (res >= 0) {
+        if (bytes_read == 0) {
+          if (read_again) {
+            Slice tmp_slice;
+            req->status =
+                Read(req->offset + req_wrap->finished_len,
+                     req->len - req_wrap->finished_len, options, &tmp_slice,
+                     req->scratch + req_wrap->finished_len, dbg);
+            req->result =
+                Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
+          }
+          // else It means EOF so no need to do anything.
+        } else if (bytes_read < req_wrap->iov.iov_len) {
+          incomplete_rq_list.push_back(req_wrap);
+        }
+      }
+      io_uring_cqe_seen(iu, cqe);
+    }
+    wrap_cache.clear();
+  }
+  return ios;
+#else
+  return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
+#endif
+}
+
+IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n,
+                                         const IOOptions& /*opts*/,
+                                         IODebugContext* /*dbg*/) {
+  IOStatus s;
+  if (!use_direct_io()) {
+    ssize_t r = 0;
+#ifdef OS_LINUX
+    r = readahead(fd_, offset, n);
+#endif
+#ifdef OS_MACOSX
+    radvisory advice;
+    advice.ra_offset = static_cast<off_t>(offset);
+    advice.ra_count = static_cast<int>(n);
+    r = fcntl(fd_, F_RDADVISE, &advice);
+#endif
+    if (r == -1) {
+      s = IOError("While prefetching offset " + std::to_string(offset) +
+                      " len " + std::to_string(n),
+                  filename_, errno);
+    }
+  }
+  return s;
+}
+
+#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
+size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
+}
+#endif
+
+void PosixRandomAccessFile::Hint(AccessPattern pattern) {
+  if (use_direct_io()) {
+    return;
+  }
+  switch (pattern) {
+    case kNormal:
+      Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
+      break;
+    case kRandom:
+      Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
+      break;
+    case kSequential:
+      Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
+      break;
+    case kWillNeed:
+      Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
+      break;
+    case kWontNeed:
+      Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
+      break;
+    default:
+      assert(false);
+      break;
+  }
+}
+
+IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
+  if (use_direct_io()) {
+    return IOStatus::OK();
+  }
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  // free OS pages
+  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+  if (ret == 0) {
+    return IOStatus::OK();
+  }
+  return IOError("While fadvise NotNeeded offset " + std::to_string(offset) +
+                     " len " + std::to_string(length),
+                 filename_, errno);
+#endif
+}
+
+IOStatus PosixRandomAccessFile::ReadAsync(
+    FSReadRequest& req, const IOOptions& /*opts*/,
+    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) {
+  if (use_direct_io()) {
+    assert(IsSectorAligned(req.offset, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(req.len, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(req.scratch, GetRequiredBufferAlignment()));
+  }
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+  // io_uring_queue_init.
+  struct io_uring* iu = nullptr;
+  if (thread_local_io_urings_) {
+    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    if (iu == nullptr) {
+      iu = CreateIOUring();
+      if (iu != nullptr) {
+        thread_local_io_urings_->Reset(iu);
+      }
+    }
+  }
+
+  // Init failed, platform doesn't support io_uring.
+  if (iu == nullptr) {
+    return IOStatus::NotSupported("ReadAsync");
+  }
+
+  // Allocate io_handle.
+  IOHandleDeleter deletefn = [](void* args) -> void {
+    delete (static_cast<Posix_IOHandle*>(args));
+    args = nullptr;
+  };
+
+  // Initialize Posix_IOHandle.
+  Posix_IOHandle* posix_handle =
+      new Posix_IOHandle(iu, cb, cb_arg, req.offset, req.len, req.scratch,
+                         use_direct_io(), GetRequiredBufferAlignment());
+  posix_handle->iov.iov_base = req.scratch;
+  posix_handle->iov.iov_len = req.len;
+
+  *io_handle = static_cast<void*>(posix_handle);
+  *del_fn = deletefn;
+
+  // Step 3: io_uring_sqe_set_data
+  struct io_uring_sqe* sqe;
+  sqe = io_uring_get_sqe(iu);
+
+  io_uring_prep_readv(sqe, fd_, /*sqe->addr=*/&posix_handle->iov,
+                      /*sqe->len=*/1, /*sqe->offset=*/posix_handle->offset);
+
+  // Sets sqe->user_data to posix_handle.
+  io_uring_sqe_set_data(sqe, posix_handle);
+
+  // Step 4: io_uring_submit
+  ssize_t ret = io_uring_submit(iu);
+  if (ret < 0) {
+    fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
+    return IOStatus::IOError("io_uring_submit() requested but returned " +
+                             std::to_string(ret));
+  }
+  return IOStatus::OK();
+#else
+  (void)req;
+  (void)cb;
+  (void)cb_arg;
+  (void)io_handle;
+  (void)del_fn;
+  return IOStatus::NotSupported("ReadAsync");
+#endif
+}
+
+/*
+ * PosixMmapReadableFile
+ *
+ * mmap() based random-access
+ */
+// base[0,length-1] contains the mmapped contents of the file.
+PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
+                                             const std::string& fname,
+                                             void* base, size_t length,
+                                             const EnvOptions& options)
+    : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
+#ifdef NDEBUG
+  (void)options;
+#endif
+  fd_ = fd_ + 0;  // suppress the warning for used variables
+  assert(options.use_mmap_reads);
+  assert(!options.use_direct_reads);
+}
+
+PosixMmapReadableFile::~PosixMmapReadableFile() {
+  int ret = munmap(mmapped_region_, length_);
+  if (ret != 0) {
+    fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
+            mmapped_region_, length_);
+  }
+  close(fd_);
+}
+
+IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
+                                     const IOOptions& /*opts*/, Slice* result,
+                                     char* /*scratch*/,
+                                     IODebugContext* /*dbg*/) const {
+  IOStatus s;
+  if (offset > length_) {
+    *result = Slice();
+    return IOError("While mmap read offset " + std::to_string(offset) +
+                       " larger than file length " + std::to_string(length_),
+                   filename_, EINVAL);
+  } else if (offset + n > length_) {
+    n = static_cast<size_t>(length_ - offset);
+  }
+  *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
+  return s;
+}
+
+void PosixMmapReadableFile::Hint(AccessPattern pattern) {
+  switch (pattern) {
+    case kNormal:
+      Madvise(mmapped_region_, length_, POSIX_MADV_NORMAL);
+      break;
+    case kRandom:
+      Madvise(mmapped_region_, length_, POSIX_MADV_RANDOM);
+      break;
+    case kSequential:
+      Madvise(mmapped_region_, length_, POSIX_MADV_SEQUENTIAL);
+      break;
+    case kWillNeed:
+      Madvise(mmapped_region_, length_, POSIX_MADV_WILLNEED);
+      break;
+    case kWontNeed:
+      Madvise(mmapped_region_, length_, POSIX_MADV_DONTNEED);
+      break;
+    default:
+      assert(false);
+      break;
+  }
+}
+
+IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  // free OS pages
+  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+  if (ret == 0) {
+    return IOStatus::OK();
+  }
+  return IOError("While fadvise not needed. Offset " + std::to_string(offset) +
+                     " len" + std::to_string(length),
+                 filename_, errno);
+#endif
+}
+
+/*
+ * PosixMmapFile
+ *
+ * We preallocate up to an extra megabyte and use memcpy to append new
+ * data to the file.  This is safe since we either properly close the
+ * file before reading from it, or for log files, the reading code
+ * knows enough to skip zero suffixes.
+ */
+IOStatus PosixMmapFile::UnmapCurrentRegion() {
+  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
+  if (base_ != nullptr) {
+    int munmap_status = munmap(base_, limit_ - base_);
+    if (munmap_status != 0) {
+      return IOError("While munmap", filename_, munmap_status);
+    }
+    file_offset_ += limit_ - base_;
+    base_ = nullptr;
+    limit_ = nullptr;
+    last_sync_ = nullptr;
+    dst_ = nullptr;
+
+    // Increase the amount we map the next time, but capped at 1MB
+    if (map_size_ < (1 << 20)) {
+      map_size_ *= 2;
+    }
+  }
+  return IOStatus::OK();
+}
+
+IOStatus PosixMmapFile::MapNewRegion() {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  assert(base_ == nullptr);
+  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
+  // we can't fallocate with FALLOC_FL_KEEP_SIZE here
+  if (allow_fallocate_) {
+    IOSTATS_TIMER_GUARD(allocate_nanos);
+    int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
+    if (alloc_status != 0) {
+      // fallback to posix_fallocate
+      alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
+    }
+    if (alloc_status != 0) {
+      return IOStatus::IOError("Error allocating space to file : " + filename_ +
+                               "Error : " + errnoStr(alloc_status).c_str());
+    }
+  }
+
+  TEST_KILL_RANDOM("PosixMmapFile::Append:1");
+  void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
+                   file_offset_);
+  if (ptr == MAP_FAILED) {
+    return IOStatus::IOError("MMap failed on " + filename_);
+  }
+  TEST_KILL_RANDOM("PosixMmapFile::Append:2");
+
+  base_ = reinterpret_cast<char*>(ptr);
+  limit_ = base_ + map_size_;
+  dst_ = base_;
+  last_sync_ = base_;
+  return IOStatus::OK();
+#else
+  return IOStatus::NotSupported("This platform doesn't support fallocate()");
+#endif
+}
+
+IOStatus PosixMmapFile::Msync() {
+  if (dst_ == last_sync_) {
+    return IOStatus::OK();
+  }
+  // Find the beginnings of the pages that contain the first and last
+  // bytes to be synced.
+  size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
+  size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
+  last_sync_ = dst_;
+  TEST_KILL_RANDOM("PosixMmapFile::Msync:0");
+  if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
+    return IOError("While msync", filename_, errno);
+  }
+  return IOStatus::OK();
+}
+
+PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
+                             const EnvOptions& options)
+    : filename_(fname),
+      fd_(fd),
+      page_size_(page_size),
+      map_size_(Roundup(65536, page_size)),
+      base_(nullptr),
+      limit_(nullptr),
+      dst_(nullptr),
+      last_sync_(nullptr),
+      file_offset_(0) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  allow_fallocate_ = options.allow_fallocate;
+  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#else
+  (void)options;
+#endif
+  assert((page_size & (page_size - 1)) == 0);
+  assert(options.use_mmap_writes);
+  assert(!options.use_direct_writes);
+}
+
+PosixMmapFile::~PosixMmapFile() {
+  if (fd_ >= 0) {
+    IOStatus s = PosixMmapFile::Close(IOOptions(), nullptr);
+    s.PermitUncheckedError();
+  }
+}
+
+IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/,
+                               IODebugContext* /*dbg*/) {
+  const char* src = data.data();
+  size_t left = data.size();
+  while (left > 0) {
+    assert(base_ <= dst_);
+    assert(dst_ <= limit_);
+    size_t avail = limit_ - dst_;
+    if (avail == 0) {
+      IOStatus s = UnmapCurrentRegion();
+      if (!s.ok()) {
+        return s;
+      }
+      s = MapNewRegion();
+      if (!s.ok()) {
+        return s;
+      }
+      TEST_KILL_RANDOM("PosixMmapFile::Append:0");
+    }
+
+    size_t n = (left <= avail) ? left : avail;
+    assert(dst_);
+    memcpy(dst_, src, n);
+    dst_ += n;
+    src += n;
+    left -= n;
+  }
+  return IOStatus::OK();
+}
+
+IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/,
+                              IODebugContext* /*dbg*/) {
+  IOStatus s;
+  size_t unused = limit_ - dst_;
+
+  s = UnmapCurrentRegion();
+  if (!s.ok()) {
+    s = IOError("While closing mmapped file", filename_, errno);
+  } else if (unused > 0) {
+    // Trim the extra space at the end of the file
+    if (ftruncate(fd_, file_offset_ - unused) < 0) {
+      s = IOError("While ftruncating mmaped file", filename_, errno);
+    }
+  }
+
+  if (close(fd_) < 0) {
+    if (s.ok()) {
+      s = IOError("While closing mmapped file", filename_, errno);
+    }
+  }
+
+  fd_ = -1;
+  base_ = nullptr;
+  limit_ = nullptr;
+  return s;
+}
+
+IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/,
+                              IODebugContext* /*dbg*/) {
+  return IOStatus::OK();
+}
+
+IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
+                             IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno);
+  }
+#else   // HAVE_FULLFSYNC
+  if (fdatasync(fd_) < 0) {
+    return IOError("While fdatasync mmapped file", filename_, errno);
+  }
+#endif  // HAVE_FULLFSYNC
+
+  return Msync();
+}
+
+/**
+ * Flush data as well as metadata to stable storage.
+ */
+IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/,
+                              IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno);
+  }
+#else   // HAVE_FULLFSYNC
+  if (fsync(fd_) < 0) {
+    return IOError("While fsync mmaped file", filename_, errno);
+  }
+#endif  // HAVE_FULLFSYNC
+
+  return Msync();
+}
+
+/**
+ * Get the size of valid data in the file. This will not match the
+ * size that is returned from the filesystem because we use mmap
+ * to extend file by map_size every time.
+ */
+uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/,
+                                    IODebugContext* /*dbg*/) {
+  size_t used = dst_ - base_;
+  return file_offset_ + used;
+}
+
+IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  // free OS pages
+  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+  if (ret == 0) {
+    return IOStatus::OK();
+  }
+  return IOError("While fadvise NotNeeded mmapped file", filename_, errno);
+#endif
+}
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
+                                 const IOOptions& /*opts*/,
+                                 IODebugContext* /*dbg*/) {
+  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0");
+  int alloc_status = 0;
+  if (allow_fallocate_) {
+    alloc_status =
+        fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
+                  static_cast<off_t>(offset), static_cast<off_t>(len));
+  }
+  if (alloc_status == 0) {
+    return IOStatus::OK();
+  } else {
+    return IOError("While fallocate offset " + std::to_string(offset) +
+                       " len " + std::to_string(len),
+                   filename_, errno);
+  }
+}
+#endif
+
+/*
+ * PosixWritableFile
+ *
+ * Use posix write to write data to a file.
+ */
+PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
+                                     size_t logical_block_size,
+                                     const EnvOptions& options)
+    : FSWritableFile(options),
+      filename_(fname),
+      use_direct_io_(options.use_direct_writes),
+      fd_(fd),
+      filesize_(0),
+      logical_sector_size_(logical_block_size) {
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  allow_fallocate_ = options.allow_fallocate;
+  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
+#endif
+#ifdef ROCKSDB_RANGESYNC_PRESENT
+  sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
+#endif  // ROCKSDB_RANGESYNC_PRESENT
+  assert(!options.use_mmap_writes);
+}
+
+PosixWritableFile::~PosixWritableFile() {
+  if (fd_ >= 0) {
+    IOStatus s = PosixWritableFile::Close(IOOptions(), nullptr);
+    s.PermitUncheckedError();
+  }
+}
+
+IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/,
+                                   IODebugContext* /*dbg*/) {
+  if (use_direct_io()) {
+    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
+  }
+  const char* src = data.data();
+  size_t nbytes = data.size();
+
+  if (!PosixWrite(fd_, src, nbytes)) {
+    return IOError("While appending to file", filename_, errno);
+  }
+
+  filesize_ += nbytes;
+  return IOStatus::OK();
+}
+
+IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
+                                             const IOOptions& /*opts*/,
+                                             IODebugContext* /*dbg*/) {
+  if (use_direct_io()) {
+    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
+    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
+  }
+  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  const char* src = data.data();
+  size_t nbytes = data.size();
+  if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
+    return IOError("While pwrite to file at offset " + std::to_string(offset),
+                   filename_, errno);
+  }
+  filesize_ = offset + nbytes;
+  return IOStatus::OK();
+}
+
+IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
+                                     IODebugContext* /*dbg*/) {
+  IOStatus s;
+  int r = ftruncate(fd_, size);
+  if (r < 0) {
+    s = IOError("While ftruncate file to size " + std::to_string(size),
+                filename_, errno);
+  } else {
+    filesize_ = size;
+  }
+  return s;
+}
+
+IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  size_t block_size;
+  size_t last_allocated_block;
+  GetPreallocationStatus(&block_size, &last_allocated_block);
+  TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block);
+  if (last_allocated_block > 0) {
+    // trim the extra space preallocated at the end of the file
+    // NOTE(ljin): we probably don't want to surface failure as an IOError,
+    // but it will be nice to log these errors.
+    int dummy __attribute__((__unused__));
+    dummy = ftruncate(fd_, filesize_);
+#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE)
+    // in some file systems, ftruncate only trims trailing space if the
+    // new file size is smaller than the current size. Calling fallocate
+    // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
+    // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
+    // filesystems:
+    //   XFS (since Linux 2.6.38)
+    //   ext4 (since Linux 3.0)
+    //   Btrfs (since Linux 3.7)
+    //   tmpfs (since Linux 3.5)
+    // We ignore error since failure of this operation does not affect
+    // correctness.
+    struct stat file_stats;
+    int result = fstat(fd_, &file_stats);
+    // After ftruncate, we check whether ftruncate has the correct behavior.
+    // If not, we should hack it with FALLOC_FL_PUNCH_HOLE
+    if (result == 0 &&
+        (file_stats.st_size + file_stats.st_blksize - 1) /
+                file_stats.st_blksize !=
+            file_stats.st_blocks / (file_stats.st_blksize / 512)) {
+      IOSTATS_TIMER_GUARD(allocate_nanos);
+      if (allow_fallocate_) {
+        fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
+                  block_size * last_allocated_block - filesize_);
+      }
+    }
+#endif
+  }
+
+  if (close(fd_) < 0) {
+    s = IOError("While closing file after writing", filename_, errno);
+  }
+  fd_ = -1;
+  return s;
+}
+
+// write out the cached data to the OS cache
+IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  return IOStatus::OK();
+}
+
+IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/,
+                                 IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
+  }
+#else   // HAVE_FULLFSYNC
+  if (fdatasync(fd_) < 0) {
+    return IOError("While fdatasync", filename_, errno);
+  }
+#endif  // HAVE_FULLFSYNC
+  return IOStatus::OK();
+}
+
+IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
+  }
+#else   // HAVE_FULLFSYNC
+  if (fsync(fd_) < 0) {
+    return IOError("While fsync", filename_, errno);
+  }
+#endif  // HAVE_FULLFSYNC
+  return IOStatus::OK();
+}
+
+bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
+
+uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/,
+                                        IODebugContext* /*dbg*/) {
+  return filesize_;
+}
+
+void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+#ifdef OS_LINUX
+// Suppress Valgrind "Unimplemented functionality" error.
+#ifndef ROCKSDB_VALGRIND_RUN
+  if (hint == write_hint_) {
+    return;
+  }
+  if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) {
+    write_hint_ = hint;
+  }
+#else
+  (void)hint;
+#endif  // ROCKSDB_VALGRIND_RUN
+#else
+  (void)hint;
+#endif  // OS_LINUX
+}
+
+IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
+  if (use_direct_io()) {
+    return IOStatus::OK();
+  }
+#ifndef OS_LINUX
+  (void)offset;
+  (void)length;
+  return IOStatus::OK();
+#else
+  // free OS pages
+  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
+  if (ret == 0) {
+    return IOStatus::OK();
+  }
+  return IOError("While fadvise NotNeeded", filename_, errno);
+#endif
+}
+
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
+                                     const IOOptions& /*opts*/,
+                                     IODebugContext* /*dbg*/) {
+  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0");
+  IOSTATS_TIMER_GUARD(allocate_nanos);
+  int alloc_status = 0;
+  if (allow_fallocate_) {
+    alloc_status =
+        fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
+                  static_cast<off_t>(offset), static_cast<off_t>(len));
+  }
+  if (alloc_status == 0) {
+    return IOStatus::OK();
+  } else {
+    return IOError("While fallocate offset " + std::to_string(offset) +
+                       " len " + std::to_string(len),
+                   filename_, errno);
+  }
+}
+#endif
+
+IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
+                                      const IOOptions& opts,
+                                      IODebugContext* dbg) {
+#ifdef ROCKSDB_RANGESYNC_PRESENT
+  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
+  if (sync_file_range_supported_) {
+    int ret;
+    if (strict_bytes_per_sync_) {
+      // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
+      // that spans all bytes written so far tells `sync_file_range` to wait for
+      // any outstanding writeback requests to finish before issuing a new one.
+      ret =
+          sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
+                          SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
+    } else {
+      ret = sync_file_range(fd_, static_cast<off_t>(offset),
+                            static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
+    }
+    if (ret != 0) {
+      return IOError("While sync_file_range returned " + std::to_string(ret),
+                     filename_, errno);
+    }
+    return IOStatus::OK();
+  }
+#endif  // ROCKSDB_RANGESYNC_PRESENT
+  return FSWritableFile::RangeSync(offset, nbytes, opts, dbg);
+}
+
+#ifdef OS_LINUX
+size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
+  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
+}
+#endif
+
+/*
+ * PosixRandomRWFile
+ */
+
+PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
+                                     const EnvOptions& /*options*/)
+    : filename_(fname), fd_(fd) {}
+
+PosixRandomRWFile::~PosixRandomRWFile() {
+  if (fd_ >= 0) {
+    IOStatus s = Close(IOOptions(), nullptr);
+    s.PermitUncheckedError();
+  }
+}
+
+IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data,
+                                  const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  const char* src = data.data();
+  size_t nbytes = data.size();
+  if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
+    return IOError("While write random read/write file at offset " +
+                       std::to_string(offset),
+                   filename_, errno);
+  }
+
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n,
+                                 const IOOptions& /*opts*/, Slice* result,
+                                 char* scratch, IODebugContext* /*dbg*/) const {
+  size_t left = n;
+  char* ptr = scratch;
+  while (left > 0) {
+    ssize_t done = pread(fd_, ptr, left, offset);
+    if (done < 0) {
+      // error while reading from file
+      if (errno == EINTR) {
+        // read was interrupted, try again.
+        continue;
+      }
+      return IOError("While reading random read/write file offset " +
+                         std::to_string(offset) + " len " + std::to_string(n),
+                     filename_, errno);
+    } else if (done == 0) {
+      // Nothing more to read
+      break;
+    }
+
+    // Read `done` bytes
+    ptr += done;
+    offset += done;
+    left -= done;
+  }
+
+  *result = Slice(scratch, n - left);
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
+                                 IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno);
+  }
+#else   // HAVE_FULLFSYNC
+  if (fdatasync(fd_) < 0) {
+    return IOError("While fdatasync random read/write file", filename_, errno);
+  }
+#endif  // HAVE_FULLFSYNC
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+#ifdef HAVE_FULLFSYNC
+  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno);
+  }
+#else   // HAVE_FULLFSYNC
+  if (fsync(fd_) < 0) {
+    return IOError("While fsync random read/write file", filename_, errno);
+  }
+#endif  // HAVE_FULLFSYNC
+  return IOStatus::OK();
+}
+
+IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  if (close(fd_) < 0) {
+    return IOError("While close random read/write file", filename_, errno);
+  }
+  fd_ = -1;
+  return IOStatus::OK();
+}
+
+PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
+  // TODO should have error handling though not much we can do...
+  munmap(this->base_, length_);
+}
+
+/*
+ * PosixDirectory
+ */
+#if !defined(BTRFS_SUPER_MAGIC)
+// The magic number for BTRFS is fixed, if it's not defined, define it here
+#define BTRFS_SUPER_MAGIC 0x9123683E
+#endif
+PosixDirectory::PosixDirectory(int fd, const std::string& directory_name)
+    : fd_(fd), directory_name_(directory_name) {
+  is_btrfs_ = false;
+#ifdef OS_LINUX
+  struct statfs buf;
+  int ret = fstatfs(fd, &buf);
+  is_btrfs_ = (ret == 0 && buf.f_type == static_cast<decltype(buf.f_type)>(
+                                             BTRFS_SUPER_MAGIC));
+#endif
+}
+
+PosixDirectory::~PosixDirectory() {
+  if (fd_ >= 0) {
+    IOStatus s = PosixDirectory::Close(IOOptions(), nullptr);
+    s.PermitUncheckedError();
+  }
+}
+
+IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) {
+  return FsyncWithDirOptions(opts, dbg, DirFsyncOptions());
+}
+
+// Users who want the file entries synced in Directory project must call a
+// Fsync or FsyncWithDirOptions function before Close
+IOStatus PosixDirectory::Close(const IOOptions& /*opts*/,
+                               IODebugContext* /*dbg*/) {
+  IOStatus s = IOStatus::OK();
+  if (close(fd_) < 0) {
+    s = IOError("While closing directory ", directory_name_, errno);
+  } else {
+    fd_ = -1;
+  }
+  return s;
+}
+
+IOStatus PosixDirectory::FsyncWithDirOptions(
+    const IOOptions& /*opts*/, IODebugContext* /*dbg*/,
+    const DirFsyncOptions& dir_fsync_options) {
+  assert(fd_ >= 0);  // Check use after close
+  IOStatus s = IOStatus::OK();
+#ifndef OS_AIX
+  if (is_btrfs_) {
+    // skip dir fsync for new file creation, which is not needed for btrfs
+    if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) {
+      return s;
+    }
+    // skip dir fsync for renaming file, only need to sync new file
+    if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) {
+      std::string new_name = dir_fsync_options.renamed_new_name;
+      assert(!new_name.empty());
+      int fd;
+      do {
+        IOSTATS_TIMER_GUARD(open_nanos);
+        fd = open(new_name.c_str(), O_RDONLY);
+      } while (fd < 0 && errno == EINTR);
+      if (fd < 0) {
+        s = IOError("While open renaming file", new_name, errno);
+      } else if (fsync(fd) < 0) {
+        s = IOError("While fsync renaming file", new_name, errno);
+      }
+      if (close(fd) < 0) {
+        s = IOError("While closing file after fsync", new_name, errno);
+      }
+      return s;
+    }
+    // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted
+  }
+
+  // skip fsync/fcntl when fd_ == -1 since this file descriptor has been closed
+  // in either the de-construction or the close function, data must have been
+  // fsync-ed before de-construction and close is called
+#ifdef HAVE_FULLFSYNC
+  // btrfs is a Linux file system, while currently F_FULLFSYNC is available on
+  // Mac OS.
+  assert(!is_btrfs_);
+  if (fd_ != -1 && ::fcntl(fd_, F_FULLFSYNC) < 0) {
+    return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno);
+  }
+#else   // HAVE_FULLFSYNC
+  if (fd_ != -1 && fsync(fd_) == -1) {
+    s = IOError("While fsync", "a directory", errno);
+  }
+#endif  // HAVE_FULLFSYNC
+#endif  // OS_AIX
+  return s;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/env/io_posix.h b/src/rocksdb/env/io_posix.h
new file mode 100644
index 000000000..f129668ea
--- /dev/null
+++ b/src/rocksdb/env/io_posix.h
@@ -0,0 +1,523 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <errno.h>
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include <liburing.h>
+#include <sys/uio.h>
+#endif
+#include <unistd.h>
+
+#include <atomic>
+#include <functional>
+#include <map>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+#include "util/thread_local.h"
+
+// For non linux platform, the following macros are used only as place
+// holder.
+#if !(defined OS_LINUX) && !(defined CYGWIN) && !(defined OS_AIX)
+#define POSIX_FADV_NORMAL 0     /* [MC1] no further special treatment */
+#define POSIX_FADV_RANDOM 1     /* [MC1] expect random page refs */
+#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
+#define POSIX_FADV_WILLNEED 3   /* [MC1] will need these pages */
+#define POSIX_FADV_DONTNEED 4   /* [MC1] don't need these pages */
+
+#define POSIX_MADV_NORMAL 0     /* [MC1] no further special treatment */
+#define POSIX_MADV_RANDOM 1     /* [MC1] expect random page refs */
+#define POSIX_MADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
+#define POSIX_MADV_WILLNEED 3   /* [MC1] will need these pages */
+#define POSIX_MADV_DONTNEED 4   /* [MC1] don't need these pages */
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+std::string IOErrorMsg(const std::string& context,
+                       const std::string& file_name);
+// file_name can be left empty if it is not unkown.
+IOStatus IOError(const std::string& context, const std::string& file_name,
+                 int err_number);
+
+class PosixHelper {
+ public:
+  static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
+  static size_t GetLogicalBlockSizeOfFd(int fd);
+  static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
+                                               size_t* size);
+};
+
+/*
+ * DirectIOHelper
+ */
+inline bool IsSectorAligned(const size_t off, size_t sector_size) {
+  assert((sector_size & (sector_size - 1)) == 0);
+  return (off & (sector_size - 1)) == 0;
+}
+
+#ifndef NDEBUG
+inline bool IsSectorAligned(const void* ptr, size_t sector_size) {
+  return uintptr_t(ptr) % sector_size == 0;
+}
+#endif
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+struct Posix_IOHandle {
+  Posix_IOHandle(struct io_uring* _iu,
+                 std::function<void(const FSReadRequest&, void*)> _cb,
+                 void* _cb_arg, uint64_t _offset, size_t _len, char* _scratch,
+                 bool _use_direct_io, size_t _alignment)
+      : iu(_iu),
+        cb(_cb),
+        cb_arg(_cb_arg),
+        offset(_offset),
+        len(_len),
+        scratch(_scratch),
+        use_direct_io(_use_direct_io),
+        alignment(_alignment),
+        is_finished(false),
+        req_count(0) {}
+
+  struct iovec iov;
+  struct io_uring* iu;
+  std::function<void(const FSReadRequest&, void*)> cb;
+  void* cb_arg;
+  uint64_t offset;
+  size_t len;
+  char* scratch;
+  bool use_direct_io;
+  size_t alignment;
+  bool is_finished;
+  // req_count is used by AbortIO API to keep track of number of requests.
+  uint32_t req_count;
+};
+
+inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name,
+                         size_t len, size_t iov_len, bool async_read,
+                         bool use_direct_io, size_t alignment,
+                         size_t& finished_len, FSReadRequest* req,
+                         size_t& bytes_read, bool& read_again) {
+  read_again = false;
+  if (cqe->res < 0) {
+    req->result = Slice(req->scratch, 0);
+    req->status = IOError("Req failed", file_name, cqe->res);
+  } else {
+    bytes_read = static_cast<size_t>(cqe->res);
+    TEST_SYNC_POINT_CALLBACK("UpdateResults::io_uring_result", &bytes_read);
+    if (bytes_read == iov_len) {
+      req->result = Slice(req->scratch, req->len);
+      req->status = IOStatus::OK();
+    } else if (bytes_read == 0) {
+      /// cqe->res == 0 can means EOF, or can mean partial results. See
+      // comment
+      // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435
+      // Fall back to pread in this case.
+      if (use_direct_io && !IsSectorAligned(finished_len, alignment)) {
+        // Bytes reads don't fill sectors. Should only happen at the end
+        // of the file.
+        req->result = Slice(req->scratch, finished_len);
+        req->status = IOStatus::OK();
+      } else {
+        if (async_read) {
+          // No  bytes read. It can means EOF. In case of partial results, it's
+          // caller responsibility to call read/readasync again.
+          req->result = Slice(req->scratch, 0);
+          req->status = IOStatus::OK();
+        } else {
+          read_again = true;
+        }
+      }
+    } else if (bytes_read < iov_len) {
+      assert(bytes_read > 0);
+      if (async_read) {
+        req->result = Slice(req->scratch, bytes_read);
+        req->status = IOStatus::OK();
+      } else {
+        assert(bytes_read + finished_len < len);
+        finished_len += bytes_read;
+      }
+    } else {
+      req->result = Slice(req->scratch, 0);
+      req->status = IOError("Req returned more bytes than requested", file_name,
+                            cqe->res);
+    }
+  }
+#ifdef NDEBUG
+  (void)len;
+#endif
+}
+#endif
+
+#ifdef OS_LINUX
+// Files under a specific directory have the same logical block size.
+// This class caches the logical block size for the specified directories to
+// save the CPU cost of computing the size.
+// Safe for concurrent access from multiple threads without any external
+// synchronization.
+class LogicalBlockSizeCache {
+ public:
+  LogicalBlockSizeCache(
+      std::function<size_t(int)> get_logical_block_size_of_fd =
+          PosixHelper::GetLogicalBlockSizeOfFd,
+      std::function<Status(const std::string&, size_t*)>
+          get_logical_block_size_of_directory =
+              PosixHelper::GetLogicalBlockSizeOfDirectory)
+      : get_logical_block_size_of_fd_(get_logical_block_size_of_fd),
+        get_logical_block_size_of_directory_(
+            get_logical_block_size_of_directory) {}
+
+  // Takes the following actions:
+  // 1. Increases reference count of the directories;
+  // 2. If the directory's logical block size is not cached,
+  //    compute the buffer size and cache the result.
+  Status RefAndCacheLogicalBlockSize(
+      const std::vector<std::string>& directories);
+
+  // Takes the following actions:
+  // 1. Decreases reference count of the directories;
+  // 2. If the reference count of a directory reaches 0, remove the directory
+  //    from the cache.
+  void UnrefAndTryRemoveCachedLogicalBlockSize(
+      const std::vector<std::string>& directories);
+
+  // Returns the logical block size for the file.
+  //
+  // If the file is under a cached directory, return the cached size.
+  // Otherwise, the size is computed.
+  size_t GetLogicalBlockSize(const std::string& fname, int fd);
+
+  int GetRefCount(const std::string& dir) {
+    ReadLock lock(&cache_mutex_);
+    auto it = cache_.find(dir);
+    if (it == cache_.end()) {
+      return 0;
+    }
+    return it->second.ref;
+  }
+
+  size_t Size() const { return cache_.size(); }
+
+  bool Contains(const std::string& dir) {
+    ReadLock lock(&cache_mutex_);
+    return cache_.find(dir) != cache_.end();
+  }
+
+ private:
+  struct CacheValue {
+    CacheValue() : size(0), ref(0) {}
+
+    // Logical block size of the directory.
+    size_t size;
+    // Reference count of the directory.
+    int ref;
+  };
+
+  std::function<size_t(int)> get_logical_block_size_of_fd_;
+  std::function<Status(const std::string&, size_t*)>
+      get_logical_block_size_of_directory_;
+
+  std::map<std::string, CacheValue> cache_;
+  port::RWMutex cache_mutex_;
+};
+#endif
+
+class PosixSequentialFile : public FSSequentialFile {
+ private:
+  std::string filename_;
+  FILE* file_;
+  int fd_;
+  bool use_direct_io_;
+  size_t logical_sector_size_;
+
+ public:
+  PosixSequentialFile(const std::string& fname, FILE* file, int fd,
+                      size_t logical_block_size, const EnvOptions& options);
+  virtual ~PosixSequentialFile();
+
+  virtual IOStatus Read(size_t n, const IOOptions& opts, Slice* result,
+                        char* scratch, IODebugContext* dbg) override;
+  virtual IOStatus PositionedRead(uint64_t offset, size_t n,
+                                  const IOOptions& opts, Slice* result,
+                                  char* scratch, IODebugContext* dbg) override;
+  virtual IOStatus Skip(uint64_t n) override;
+  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
+  virtual bool use_direct_io() const override { return use_direct_io_; }
+  virtual size_t GetRequiredBufferAlignment() const override {
+    return logical_sector_size_;
+  }
+};
+
+#if defined(ROCKSDB_IOURING_PRESENT)
+// io_uring instance queue depth
+const unsigned int kIoUringDepth = 256;
+
+inline void DeleteIOUring(void* p) {
+  struct io_uring* iu = static_cast<struct io_uring*>(p);
+  delete iu;
+}
+
+inline struct io_uring* CreateIOUring() {
+  struct io_uring* new_io_uring = new struct io_uring;
+  int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, 0);
+  if (ret) {
+    delete new_io_uring;
+    new_io_uring = nullptr;
+  }
+  return new_io_uring;
+}
+#endif  // defined(ROCKSDB_IOURING_PRESENT)
+
+class PosixRandomAccessFile : public FSRandomAccessFile {
+ protected:
+  std::string filename_;
+  int fd_;
+  bool use_direct_io_;
+  size_t logical_sector_size_;
+#if defined(ROCKSDB_IOURING_PRESENT)
+  ThreadLocalPtr* thread_local_io_urings_;
+#endif
+
+ public:
+  PosixRandomAccessFile(const std::string& fname, int fd,
+                        size_t logical_block_size, const EnvOptions& options
+#if defined(ROCKSDB_IOURING_PRESENT)
+                        ,
+                        ThreadLocalPtr* thread_local_io_urings
+#endif
+  );
+  virtual ~PosixRandomAccessFile();
+
+  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
+                        Slice* result, char* scratch,
+                        IODebugContext* dbg) const override;
+
+  virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                             const IOOptions& options,
+                             IODebugContext* dbg) override;
+
+  virtual IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts,
+                            IODebugContext* dbg) override;
+
+#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+#endif
+  virtual void Hint(AccessPattern pattern) override;
+  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
+  virtual bool use_direct_io() const override { return use_direct_io_; }
+  virtual size_t GetRequiredBufferAlignment() const override {
+    return logical_sector_size_;
+  }
+  // EXPERIMENTAL
+  virtual IOStatus ReadAsync(
+      FSReadRequest& req, const IOOptions& opts,
+      std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+      void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override;
+};
+
+class PosixWritableFile : public FSWritableFile {
+ protected:
+  const std::string filename_;
+  const bool use_direct_io_;
+  int fd_;
+  uint64_t filesize_;
+  size_t logical_sector_size_;
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool allow_fallocate_;
+  bool fallocate_with_keep_size_;
+#endif
+#ifdef ROCKSDB_RANGESYNC_PRESENT
+  // Even if the syscall is present, the filesystem may still not properly
+  // support it, so we need to do a dynamic check too.
+  bool sync_file_range_supported_;
+#endif  // ROCKSDB_RANGESYNC_PRESENT
+
+ public:
+  explicit PosixWritableFile(const std::string& fname, int fd,
+                             size_t logical_block_size,
+                             const EnvOptions& options);
+  virtual ~PosixWritableFile();
+
+  // Need to implement this so the file is truncated correctly
+  // with direct I/O
+  virtual IOStatus Truncate(uint64_t size, const IOOptions& opts,
+                            IODebugContext* dbg) override;
+  virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
+                          IODebugContext* dbg) override;
+  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
+                          const DataVerificationInfo& /* verification_info */,
+                          IODebugContext* dbg) override {
+    return Append(data, opts, dbg);
+  }
+  virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                                    const IOOptions& opts,
+                                    IODebugContext* dbg) override;
+  virtual IOStatus PositionedAppend(
+      const Slice& data, uint64_t offset, const IOOptions& opts,
+      const DataVerificationInfo& /* verification_info */,
+      IODebugContext* dbg) override {
+    return PositionedAppend(data, offset, opts, dbg);
+  }
+  virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual bool IsSyncThreadSafe() const override;
+  virtual bool use_direct_io() const override { return use_direct_io_; }
+  virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override;
+  virtual uint64_t GetFileSize(const IOOptions& opts,
+                               IODebugContext* dbg) override;
+  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
+  virtual size_t GetRequiredBufferAlignment() const override {
+    return logical_sector_size_;
+  }
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  virtual IOStatus Allocate(uint64_t offset, uint64_t len,
+                            const IOOptions& opts,
+                            IODebugContext* dbg) override;
+#endif
+  virtual IOStatus RangeSync(uint64_t offset, uint64_t nbytes,
+                             const IOOptions& opts,
+                             IODebugContext* dbg) override;
+#ifdef OS_LINUX
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+#endif
+};
+
+// mmap() based random-access
+class PosixMmapReadableFile : public FSRandomAccessFile {
+ private:
+  int fd_;
+  std::string filename_;
+  void* mmapped_region_;
+  size_t length_;
+
+ public:
+  PosixMmapReadableFile(const int fd, const std::string& fname, void* base,
+                        size_t length, const EnvOptions& options);
+  virtual ~PosixMmapReadableFile();
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result,
+                char* scratch, IODebugContext* dbg) const override;
+  void Hint(AccessPattern pattern) override;
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+};
+
+class PosixMmapFile : public FSWritableFile {
+ private:
+  std::string filename_;
+  int fd_;
+  size_t page_size_;
+  size_t map_size_;       // How much extra memory to map at a time
+  char* base_;            // The mapped region
+  char* limit_;           // Limit of the mapped region
+  char* dst_;             // Where to write next  (in range [base_,limit_])
+  char* last_sync_;       // Where have we synced up to
+  uint64_t file_offset_;  // Offset of base_ in file
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool allow_fallocate_;  // If false, fallocate calls are bypassed
+  bool fallocate_with_keep_size_;
+#endif
+
+  // Roundup x to a multiple of y
+  static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
+
+  size_t TruncateToPageBoundary(size_t s) {
+    s -= (s & (page_size_ - 1));
+    assert((s % page_size_) == 0);
+    return s;
+  }
+
+  IOStatus MapNewRegion();
+  IOStatus UnmapCurrentRegion();
+  IOStatus Msync();
+
+ public:
+  PosixMmapFile(const std::string& fname, int fd, size_t page_size,
+                const EnvOptions& options);
+  ~PosixMmapFile();
+
+  // Means Close() will properly take care of truncate
+  // and it does not need any additional information
+  virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/,
+                            IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+  virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
+                          IODebugContext* dbg) override;
+  virtual IOStatus Append(const Slice& data, const IOOptions& opts,
+                          const DataVerificationInfo& /* verification_info */,
+                          IODebugContext* dbg) override {
+    return Append(data, opts, dbg);
+  }
+  virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual uint64_t GetFileSize(const IOOptions& opts,
+                               IODebugContext* dbg) override;
+  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+  virtual IOStatus Allocate(uint64_t offset, uint64_t len,
+                            const IOOptions& opts,
+                            IODebugContext* dbg) override;
+#endif
+};
+
+class PosixRandomRWFile : public FSRandomRWFile {
+ public:
+  explicit PosixRandomRWFile(const std::string& fname, int fd,
+                             const EnvOptions& options);
+  virtual ~PosixRandomRWFile();
+
+  virtual IOStatus Write(uint64_t offset, const Slice& data,
+                         const IOOptions& opts, IODebugContext* dbg) override;
+
+  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts,
+                        Slice* result, char* scratch,
+                        IODebugContext* dbg) const override;
+
+  virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
+  virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
+
+ private:
+  const std::string filename_;
+  int fd_;
+};
+
+struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer {
+  PosixMemoryMappedFileBuffer(void* _base, size_t _length)
+      : MemoryMappedFileBuffer(_base, _length) {}
+  virtual ~PosixMemoryMappedFileBuffer();
+};
+
+class PosixDirectory : public FSDirectory {
+ public:
+  explicit PosixDirectory(int fd, const std::string& directory_name);
+  ~PosixDirectory();
+  virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
+
+  virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
+
+  virtual IOStatus FsyncWithDirOptions(
+      const IOOptions&, IODebugContext*,
+      const DirFsyncOptions& dir_fsync_options) override;
+
+ private:
+  int fd_;
+  bool is_btrfs_;
+  const std::string directory_name_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/io_posix_test.cc b/src/rocksdb/env/io_posix_test.cc
new file mode 100644
index 000000000..81ce50587
--- /dev/null
+++ b/src/rocksdb/env/io_posix_test.cc
@@ -0,0 +1,141 @@
+// Copyright (c) 2020-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+
+#ifdef ROCKSDB_LIB_IO_POSIX
+#include "env/io_posix.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef OS_LINUX
+class LogicalBlockSizeCacheTest : public testing::Test {};
+
+// Tests the caching behavior.
+TEST_F(LogicalBlockSizeCacheTest, Cache) {
+  int ncall = 0;
+  auto get_fd_block_size = [&](int fd) {
+    ncall++;
+    return fd;
+  };
+  std::map<std::string, int> dir_fds{
+      {"/", 0},
+      {"/db", 1},
+      {"/db1", 2},
+      {"/db2", 3},
+  };
+  auto get_dir_block_size = [&](const std::string& dir, size_t* size) {
+    ncall++;
+    *size = dir_fds[dir];
+    return Status::OK();
+  };
+  LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size);
+  ASSERT_EQ(0, ncall);
+  ASSERT_EQ(0, cache.Size());
+
+  ASSERT_EQ(6, cache.GetLogicalBlockSize("/sst", 6));
+  ASSERT_EQ(1, ncall);
+  ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7));
+  ASSERT_EQ(2, ncall);
+  ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8));
+  ASSERT_EQ(3, ncall);
+
+  ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/", "/db1/", "/db2"}));
+  ASSERT_EQ(3, cache.Size());
+  ASSERT_TRUE(cache.Contains("/"));
+  ASSERT_TRUE(cache.Contains("/db1"));
+  ASSERT_TRUE(cache.Contains("/db2"));
+  ASSERT_EQ(6, ncall);
+  // Block size for / is cached.
+  ASSERT_EQ(0, cache.GetLogicalBlockSize("/sst", 6));
+  ASSERT_EQ(6, ncall);
+  // No cached size for /db.
+  ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7));
+  ASSERT_EQ(7, ncall);
+  ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8));
+  ASSERT_EQ(8, ncall);
+  // Block size for /db1 is cached.
+  ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst1", 4));
+  ASSERT_EQ(8, ncall);
+  ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst2", 5));
+  ASSERT_EQ(8, ncall);
+  // Block size for /db2 is cached.
+  ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst1", 6));
+  ASSERT_EQ(8, ncall);
+  ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst2", 7));
+  ASSERT_EQ(8, ncall);
+
+  ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"}));
+  ASSERT_EQ(4, cache.Size());
+  ASSERT_TRUE(cache.Contains("/"));
+  ASSERT_TRUE(cache.Contains("/db1"));
+  ASSERT_TRUE(cache.Contains("/db2"));
+  ASSERT_TRUE(cache.Contains("/db"));
+
+  ASSERT_EQ(9, ncall);
+  // Block size for /db is cached.
+  ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst1", 7));
+  ASSERT_EQ(9, ncall);
+  ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst2", 8));
+  ASSERT_EQ(9, ncall);
+}
+
+// Tests the reference counting behavior.
+TEST_F(LogicalBlockSizeCacheTest, Ref) {
+  int ncall = 0;
+  auto get_fd_block_size = [&](int fd) {
+    ncall++;
+    return fd;
+  };
+  std::map<std::string, int> dir_fds{
+      {"/db", 0},
+  };
+  auto get_dir_block_size = [&](const std::string& dir, size_t* size) {
+    ncall++;
+    *size = dir_fds[dir];
+    return Status::OK();
+  };
+  LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size);
+
+  ASSERT_EQ(0, ncall);
+
+  ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1));
+  ASSERT_EQ(1, ncall);
+
+  ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"}));
+  ASSERT_EQ(2, ncall);
+  ASSERT_EQ(1, cache.GetRefCount("/db"));
+  // Block size for /db is cached. Ref count = 1.
+  ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst1", 1));
+  ASSERT_EQ(2, ncall);
+
+  // Ref count = 2, but won't recompute the cached buffer size.
+  ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"}));
+  ASSERT_EQ(2, cache.GetRefCount("/db"));
+  ASSERT_EQ(2, ncall);
+
+  // Ref count = 1.
+  cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"});
+  ASSERT_EQ(1, cache.GetRefCount("/db"));
+  // Block size for /db is still cached.
+  ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst2", 1));
+  ASSERT_EQ(2, ncall);
+
+  // Ref count = 0 and cached buffer size for /db is removed.
+  cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"});
+  ASSERT_EQ(0, cache.Size());
+  ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1));
+  ASSERT_EQ(3, ncall);
+}
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/env/mock_env.cc b/src/rocksdb/env/mock_env.cc
new file mode 100644
index 000000000..bfa7dc2f4
--- /dev/null
+++ b/src/rocksdb/env/mock_env.cc
@@ -0,0 +1,1070 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "env/mock_env.h"
+
+#include <algorithm>
+#include <chrono>
+
+#include "env/emulated_clock.h"
+#include "file/filename.h"
+#include "port/sys_time.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/utilities/options_type.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+int64_t MaybeCurrentTime(const std::shared_ptr<SystemClock>& clock) {
+  int64_t time = 1337346000;  // arbitrary fallback default
+  clock->GetCurrentTime(&time).PermitUncheckedError();
+  return time;
+}
+
+static std::unordered_map<std::string, OptionTypeInfo> time_elapse_type_info = {
+#ifndef ROCKSDB_LITE
+    {"time_elapse_only_sleep",
+     {0, OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kCompareNever,
+      [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto clock = static_cast<EmulatedSystemClock*>(addr);
+        clock->SetTimeElapseOnlySleep(ParseBoolean("", value));
+        return Status::OK();
+      },
+      [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto clock = static_cast<const EmulatedSystemClock*>(addr);
+        *value = clock->IsTimeElapseOnlySleep() ? "true" : "false";
+        return Status::OK();
+      },
+      nullptr}},
+#endif  // ROCKSDB_LITE
+};
+static std::unordered_map<std::string, OptionTypeInfo> mock_sleep_type_info = {
+#ifndef ROCKSDB_LITE
+    {"mock_sleep",
+     {0, OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kCompareNever,
+      [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+         const std::string& value, void* addr) {
+        auto clock = static_cast<EmulatedSystemClock*>(addr);
+        clock->SetMockSleep(ParseBoolean("", value));
+        return Status::OK();
+      },
+      [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+         const void* addr, std::string* value) {
+        const auto clock = static_cast<const EmulatedSystemClock*>(addr);
+        *value = clock->IsMockSleepEnabled() ? "true" : "false";
+        return Status::OK();
+      },
+      nullptr}},
+#endif  // ROCKSDB_LITE
+};
+}  // namespace
+
+EmulatedSystemClock::EmulatedSystemClock(
+    const std::shared_ptr<SystemClock>& base, bool time_elapse_only_sleep)
+    : SystemClockWrapper(base),
+      maybe_starting_time_(MaybeCurrentTime(base)),
+      time_elapse_only_sleep_(time_elapse_only_sleep),
+      no_slowdown_(time_elapse_only_sleep) {
+  RegisterOptions("", this, &time_elapse_type_info);
+  RegisterOptions("", this, &mock_sleep_type_info);
+}
+
+class MemFile {
+ public:
+  explicit MemFile(SystemClock* clock, const std::string& fn,
+                   bool _is_lock_file = false)
+      : clock_(clock),
+        fn_(fn),
+        refs_(0),
+        is_lock_file_(_is_lock_file),
+        locked_(false),
+        size_(0),
+        modified_time_(Now()),
+        rnd_(Lower32of64(GetSliceNPHash64(fn))),
+        fsynced_bytes_(0) {}
+  // No copying allowed.
+  MemFile(const MemFile&) = delete;
+  void operator=(const MemFile&) = delete;
+
+  void Ref() {
+    MutexLock lock(&mutex_);
+    ++refs_;
+  }
+
+  bool is_lock_file() const { return is_lock_file_; }
+
+  bool Lock() {
+    assert(is_lock_file_);
+    MutexLock lock(&mutex_);
+    if (locked_) {
+      return false;
+    } else {
+      locked_ = true;
+      return true;
+    }
+  }
+
+  void Unlock() {
+    assert(is_lock_file_);
+    MutexLock lock(&mutex_);
+    locked_ = false;
+  }
+
+  void Unref() {
+    bool do_delete = false;
+    {
+      MutexLock lock(&mutex_);
+      --refs_;
+      assert(refs_ >= 0);
+      if (refs_ <= 0) {
+        do_delete = true;
+      }
+    }
+
+    if (do_delete) {
+      delete this;
+    }
+  }
+
+  uint64_t Size() const { return size_; }
+
+  void Truncate(size_t size, const IOOptions& /*options*/,
+                IODebugContext* /*dbg*/) {
+    MutexLock lock(&mutex_);
+    if (size < size_) {
+      data_.resize(size);
+      size_ = size;
+    }
+  }
+
+  void CorruptBuffer() {
+    if (fsynced_bytes_ >= size_) {
+      return;
+    }
+    uint64_t buffered_bytes = size_ - fsynced_bytes_;
+    uint64_t start =
+        fsynced_bytes_ + rnd_.Uniform(static_cast<int>(buffered_bytes));
+    uint64_t end = std::min(start + 512, size_.load());
+    MutexLock lock(&mutex_);
+    for (uint64_t pos = start; pos < end; ++pos) {
+      data_[static_cast<size_t>(pos)] = static_cast<char>(rnd_.Uniform(256));
+    }
+  }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/,
+                Slice* result, char* scratch, IODebugContext* /*dbg*/) const {
+    {
+      IOStatus s;
+      TEST_SYNC_POINT_CALLBACK("MemFile::Read:IOStatus", &s);
+      if (!s.ok()) {
+        // with sync point only
+        *result = Slice();
+        return s;
+      }
+    }
+    MutexLock lock(&mutex_);
+    const uint64_t available = Size() - std::min(Size(), offset);
+    size_t offset_ = static_cast<size_t>(offset);
+    if (n > available) {
+      n = static_cast<size_t>(available);
+    }
+    if (n == 0) {
+      *result = Slice();
+      return IOStatus::OK();
+    }
+    if (scratch) {
+      memcpy(scratch, &(data_[offset_]), n);
+      *result = Slice(scratch, n);
+    } else {
+      *result = Slice(&(data_[offset_]), n);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus Write(uint64_t offset, const Slice& data,
+                 const IOOptions& /*options*/, IODebugContext* /*dbg*/) {
+    MutexLock lock(&mutex_);
+    size_t offset_ = static_cast<size_t>(offset);
+    if (offset + data.size() > data_.size()) {
+      data_.resize(offset_ + data.size());
+    }
+    data_.replace(offset_, data.size(), data.data(), data.size());
+    size_ = data_.size();
+    modified_time_ = Now();
+    return IOStatus::OK();
+  }
+
+  IOStatus Append(const Slice& data, const IOOptions& /*options*/,
+                  IODebugContext* /*dbg*/) {
+    MutexLock lock(&mutex_);
+    data_.append(data.data(), data.size());
+    size_ = data_.size();
+    modified_time_ = Now();
+    return IOStatus::OK();
+  }
+
+  IOStatus Fsync(const IOOptions& /*options*/, IODebugContext* /*dbg*/) {
+    fsynced_bytes_ = size_.load();
+    return IOStatus::OK();
+  }
+
+  uint64_t ModifiedTime() const { return modified_time_; }
+
+ private:
+  uint64_t Now() {
+    int64_t unix_time = 0;
+    auto s = clock_->GetCurrentTime(&unix_time);
+    assert(s.ok());
+    return static_cast<uint64_t>(unix_time);
+  }
+
+  // Private since only Unref() should be used to delete it.
+  ~MemFile() { assert(refs_ == 0); }
+
+  SystemClock* clock_;
+  const std::string fn_;
+  mutable port::Mutex mutex_;
+  int refs_;
+  bool is_lock_file_;
+  bool locked_;
+
+  // Data written into this file, all bytes before fsynced_bytes are
+  // persistent.
+  std::string data_;
+  std::atomic<uint64_t> size_;
+  std::atomic<uint64_t> modified_time_;
+
+  Random rnd_;
+  std::atomic<uint64_t> fsynced_bytes_;
+};
+
+namespace {
+
+class MockSequentialFile : public FSSequentialFile {
+ public:
+  explicit MockSequentialFile(MemFile* file, const FileOptions& opts)
+      : file_(file),
+        use_direct_io_(opts.use_direct_reads),
+        use_mmap_read_(opts.use_mmap_reads),
+        pos_(0) {
+    file_->Ref();
+  }
+
+  ~MockSequentialFile() override { file_->Unref(); }
+
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override {
+    IOStatus s = file_->Read(pos_, n, options, result,
+                             (use_mmap_read_) ? nullptr : scratch, dbg);
+    if (s.ok()) {
+      pos_ += result->size();
+    }
+    return s;
+  }
+
+  bool use_direct_io() const override { return use_direct_io_; }
+  IOStatus Skip(uint64_t n) override {
+    if (pos_ > file_->Size()) {
+      return IOStatus::IOError("pos_ > file_->Size()");
+    }
+    const uint64_t available = file_->Size() - pos_;
+    if (n > available) {
+      n = available;
+    }
+    pos_ += static_cast<size_t>(n);
+    return IOStatus::OK();
+  }
+
+ private:
+  MemFile* file_;
+  bool use_direct_io_;
+  bool use_mmap_read_;
+  size_t pos_;
+};
+
+class MockRandomAccessFile : public FSRandomAccessFile {
+ public:
+  explicit MockRandomAccessFile(MemFile* file, const FileOptions& opts)
+      : file_(file),
+        use_direct_io_(opts.use_direct_reads),
+        use_mmap_read_(opts.use_mmap_reads) {
+    file_->Ref();
+  }
+
+  ~MockRandomAccessFile() override { file_->Unref(); }
+
+  bool use_direct_io() const override { return use_direct_io_; }
+
+  IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    if (use_mmap_read_) {
+      return file_->Read(offset, n, options, result, nullptr, dbg);
+    } else {
+      return file_->Read(offset, n, options, result, scratch, dbg);
+    }
+  }
+
+ private:
+  MemFile* file_;
+  bool use_direct_io_;
+  bool use_mmap_read_;
+};
+
+class MockRandomRWFile : public FSRandomRWFile {
+ public:
+  explicit MockRandomRWFile(MemFile* file) : file_(file) { file_->Ref(); }
+
+  ~MockRandomRWFile() override { file_->Unref(); }
+
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override {
+    return file_->Write(offset, data, options, dbg);
+  }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    return file_->Read(offset, n, options, result, scratch, dbg);
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    return file_->Fsync(options, dbg);
+  }
+
+  IOStatus Flush(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    return file_->Fsync(options, dbg);
+  }
+
+ private:
+  MemFile* file_;
+};
+
+class MockWritableFile : public FSWritableFile {
+ public:
+  MockWritableFile(MemFile* file, const FileOptions& opts)
+      : file_(file),
+        use_direct_io_(opts.use_direct_writes),
+        rate_limiter_(opts.rate_limiter) {
+    file_->Ref();
+  }
+
+  ~MockWritableFile() override { file_->Unref(); }
+
+  bool use_direct_io() const override { return false && use_direct_io_; }
+
+  using FSWritableFile::Append;
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override {
+    size_t bytes_written = 0;
+    while (bytes_written < data.size()) {
+      auto bytes = RequestToken(data.size() - bytes_written);
+      IOStatus s = file_->Append(Slice(data.data() + bytes_written, bytes),
+                                 options, dbg);
+      if (!s.ok()) {
+        return s;
+      }
+      bytes_written += bytes;
+    }
+    return IOStatus::OK();
+  }
+
+  using FSWritableFile::PositionedAppend;
+  IOStatus PositionedAppend(const Slice& data, uint64_t /*offset*/,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override {
+    assert(use_direct_io_);
+    return Append(data, options, dbg);
+  }
+
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    file_->Truncate(static_cast<size_t>(size), options, dbg);
+    return IOStatus::OK();
+  }
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    return file_->Fsync(options, dbg);
+  }
+
+  IOStatus Flush(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    return file_->Fsync(options, dbg);
+  }
+
+  uint64_t GetFileSize(const IOOptions& /*options*/,
+                       IODebugContext* /*dbg*/) override {
+    return file_->Size();
+  }
+
+ private:
+  inline size_t RequestToken(size_t bytes) {
+    if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) {
+      bytes = std::min(
+          bytes, static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));
+      rate_limiter_->Request(bytes, io_priority_);
+    }
+    return bytes;
+  }
+
+  MemFile* file_;
+  bool use_direct_io_;
+  RateLimiter* rate_limiter_;
+};
+
+class MockEnvDirectory : public FSDirectory {
+ public:
+  IOStatus Fsync(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  IOStatus Close(const IOOptions& /*options*/,
+                 IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+};
+
+class MockEnvFileLock : public FileLock {
+ public:
+  explicit MockEnvFileLock(const std::string& fname) : fname_(fname) {}
+
+  std::string FileName() const { return fname_; }
+
+ private:
+  const std::string fname_;
+};
+
+class TestMemLogger : public Logger {
+ private:
+  std::unique_ptr<FSWritableFile> file_;
+  std::atomic_size_t log_size_;
+  static const uint64_t flush_every_seconds_ = 5;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  SystemClock* clock_;
+  IOOptions options_;
+  IODebugContext* dbg_;
+  std::atomic<bool> flush_pending_;
+
+ public:
+  TestMemLogger(std::unique_ptr<FSWritableFile> f, SystemClock* clock,
+                const IOOptions& options, IODebugContext* dbg,
+                const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL)
+      : Logger(log_level),
+        file_(std::move(f)),
+        log_size_(0),
+        last_flush_micros_(0),
+        clock_(clock),
+        options_(options),
+        dbg_(dbg),
+        flush_pending_(false) {}
+  ~TestMemLogger() override {}
+
+  void Flush() override {
+    if (flush_pending_) {
+      flush_pending_ = false;
+    }
+    last_flush_micros_ = clock_->NowMicros();
+  }
+
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override {
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      port::TimeVal now_tv;
+      port::GetTimeOfDay(&now_tv, nullptr);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      memset(&t, 0, sizeof(t));
+      struct tm* ret __attribute__((__unused__));
+      ret = port::LocalTimeR(&seconds, &t);
+      assert(ret);
+      p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d ",
+                    t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
+                    t.tm_min, t.tm_sec, static_cast<int>(now_tv.tv_usec));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;  // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      const size_t write_size = p - base;
+
+      Status s = file_->Append(Slice(base, write_size), options_, dbg_);
+      if (s.ok()) {
+        flush_pending_ = true;
+        log_size_ += write_size;
+      }
+      uint64_t now_micros =
+          static_cast<uint64_t>(now_tv.tv_sec) * 1000000 + now_tv.tv_usec;
+      if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+        flush_pending_ = false;
+        last_flush_micros_ = now_micros;
+      }
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+  size_t GetLogFileSize() const override { return log_size_; }
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> mock_fs_type_info = {
+#ifndef ROCKSDB_LITE
+    {"supports_direct_io",
+     {0, OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+}  // namespace
+
+MockFileSystem::MockFileSystem(const std::shared_ptr<SystemClock>& clock,
+                               bool supports_direct_io)
+    : system_clock_(clock), supports_direct_io_(supports_direct_io) {
+  clock_ = system_clock_.get();
+  RegisterOptions("", &supports_direct_io_, &mock_fs_type_info);
+}
+
+MockFileSystem::~MockFileSystem() {
+  for (auto i = file_map_.begin(); i != file_map_.end(); ++i) {
+    i->second->Unref();
+  }
+}
+
+Status MockFileSystem::PrepareOptions(const ConfigOptions& options) {
+  Status s = FileSystem::PrepareOptions(options);
+  if (s.ok() && system_clock_ == SystemClock::Default()) {
+    system_clock_ = options.env->GetSystemClock();
+    clock_ = system_clock_.get();
+  }
+  return s;
+}
+
+IOStatus MockFileSystem::GetAbsolutePath(const std::string& db_path,
+                                         const IOOptions& /*options*/,
+                                         std::string* output_path,
+                                         IODebugContext* /*dbg*/) {
+  *output_path = NormalizeMockPath(db_path);
+  if (output_path->at(0) != '/') {
+    return IOStatus::NotSupported("GetAbsolutePath");
+  } else {
+    return IOStatus::OK();
+  }
+}
+
+std::string MockFileSystem::NormalizeMockPath(const std::string& path) {
+  std::string p = NormalizePath(path);
+  if (p.back() == kFilePathSeparator && p.size() > 1) {
+    p.pop_back();
+  }
+  return p;
+}
+
+// Partial implementation of the FileSystem interface.
+IOStatus MockFileSystem::NewSequentialFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSSequentialFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = nullptr;
+    return IOStatus::PathNotFound(fn);
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return IOStatus::InvalidArgument(fn, "Cannot open a lock file.");
+  } else if (file_opts.use_direct_reads && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockSequentialFile(f, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = nullptr;
+    return IOStatus::PathNotFound(fn);
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return IOStatus::InvalidArgument(fn, "Cannot open a lock file.");
+  } else if (file_opts.use_direct_reads && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockRandomAccessFile(f, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::NewRandomRWFile(
+    const std::string& fname, const FileOptions& /*file_opts*/,
+    std::unique_ptr<FSRandomRWFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    *result = nullptr;
+    return IOStatus::PathNotFound(fn);
+  }
+  auto* f = file_map_[fn];
+  if (f->is_lock_file()) {
+    return IOStatus::InvalidArgument(fn, "Cannot open a lock file.");
+  }
+  result->reset(new MockRandomRWFile(f));
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::ReuseWritableFile(
+    const std::string& fname, const std::string& old_fname,
+    const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+    IODebugContext* dbg) {
+  auto s = RenameFile(old_fname, fname, IOOptions(), dbg);
+  if (!s.ok()) {
+    return s;
+  } else {
+    result->reset();
+    return NewWritableFile(fname, options, result, dbg);
+  }
+}
+
+IOStatus MockFileSystem::NewWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) != file_map_.end()) {
+    DeleteFileInternal(fn);
+  }
+  MemFile* file = new MemFile(clock_, fn, false);
+  file->Ref();
+  file_map_[fn] = file;
+  if (file_opts.use_direct_writes && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockWritableFile(file, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::ReopenWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  MemFile* file = nullptr;
+  if (file_map_.find(fn) == file_map_.end()) {
+    file = new MemFile(clock_, fn, false);
+    // Only take a reference when we create the file objectt
+    file->Ref();
+    file_map_[fn] = file;
+  } else {
+    file = file_map_[fn];
+  }
+  if (file_opts.use_direct_writes && !supports_direct_io_) {
+    return IOStatus::NotSupported("Direct I/O Not Supported");
+  } else {
+    result->reset(new MockWritableFile(file, file_opts));
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::NewDirectory(const std::string& /*name*/,
+                                      const IOOptions& /*io_opts*/,
+                                      std::unique_ptr<FSDirectory>* result,
+                                      IODebugContext* /*dbg*/) {
+  result->reset(new MockEnvDirectory());
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::FileExists(const std::string& fname,
+                                    const IOOptions& /*io_opts*/,
+                                    IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) != file_map_.end()) {
+    // File exists
+    return IOStatus::OK();
+  }
+  // Now also check if fn exists as a dir
+  for (const auto& iter : file_map_) {
+    const std::string& filename = iter.first;
+    if (filename.size() >= fn.size() + 1 && filename[fn.size()] == '/' &&
+        Slice(filename).starts_with(Slice(fn))) {
+      return IOStatus::OK();
+    }
+  }
+  return IOStatus::NotFound();
+}
+
+bool MockFileSystem::GetChildrenInternal(const std::string& dir,
+                                         std::vector<std::string>* result) {
+  auto d = NormalizeMockPath(dir);
+  bool found_dir = false;
+  result->clear();
+  for (const auto& iter : file_map_) {
+    const std::string& filename = iter.first;
+
+    if (filename == d) {
+      found_dir = true;
+    } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' &&
+               Slice(filename).starts_with(Slice(d))) {
+      found_dir = true;
+      size_t next_slash = filename.find('/', d.size() + 1);
+      if (next_slash != std::string::npos) {
+        result->push_back(
+            filename.substr(d.size() + 1, next_slash - d.size() - 1));
+      } else {
+        result->push_back(filename.substr(d.size() + 1));
+      }
+    }
+  }
+  result->erase(std::unique(result->begin(), result->end()), result->end());
+  return found_dir;
+}
+
+IOStatus MockFileSystem::GetChildren(const std::string& dir,
+                                     const IOOptions& /*options*/,
+                                     std::vector<std::string>* result,
+                                     IODebugContext* /*dbg*/) {
+  MutexLock lock(&mutex_);
+  bool found_dir = GetChildrenInternal(dir, result);
+#ifndef __clang_analyzer__
+  return found_dir ? IOStatus::OK() : IOStatus::NotFound(dir);
+#else
+  return found_dir ? IOStatus::OK() : IOStatus::NotFound();
+#endif
+}
+
+void MockFileSystem::DeleteFileInternal(const std::string& fname) {
+  assert(fname == NormalizeMockPath(fname));
+  const auto& pair = file_map_.find(fname);
+  if (pair != file_map_.end()) {
+    pair->second->Unref();
+    file_map_.erase(fname);
+  }
+}
+
+IOStatus MockFileSystem::DeleteFile(const std::string& fname,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(fn) == file_map_.end()) {
+    return IOStatus::PathNotFound(fn);
+  }
+
+  DeleteFileInternal(fn);
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::Truncate(const std::string& fname, size_t size,
+                                  const IOOptions& options,
+                                  IODebugContext* dbg) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return IOStatus::PathNotFound(fn);
+  }
+  iter->second->Truncate(size, options, dbg);
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::CreateDir(const std::string& dirname,
+                                   const IOOptions& /*options*/,
+                                   IODebugContext* /*dbg*/) {
+  auto dn = NormalizeMockPath(dirname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(dn) == file_map_.end()) {
+    MemFile* file = new MemFile(clock_, dn, false);
+    file->Ref();
+    file_map_[dn] = file;
+  } else {
+    return IOStatus::IOError();
+  }
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::CreateDirIfMissing(const std::string& dirname,
+                                            const IOOptions& options,
+                                            IODebugContext* dbg) {
+  CreateDir(dirname, options, dbg).PermitUncheckedError();
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::DeleteDir(const std::string& dirname,
+                                   const IOOptions& /*options*/,
+                                   IODebugContext* /*dbg*/) {
+  auto dir = NormalizeMockPath(dirname);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(dir) == file_map_.end()) {
+    return IOStatus::PathNotFound(dir);
+  } else {
+    std::vector<std::string> children;
+    if (GetChildrenInternal(dir, &children)) {
+      for (const auto& child : children) {
+        DeleteFileInternal(child);
+      }
+    }
+    DeleteFileInternal(dir);
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::GetFileSize(const std::string& fname,
+                                     const IOOptions& /*options*/,
+                                     uint64_t* file_size,
+                                     IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  TEST_SYNC_POINT_CALLBACK("MockFileSystem::GetFileSize:CheckFileType", &fn);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return IOStatus::PathNotFound(fn);
+  }
+
+  *file_size = iter->second->Size();
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::GetFileModificationTime(const std::string& fname,
+                                                 const IOOptions& /*options*/,
+                                                 uint64_t* time,
+                                                 IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return IOStatus::PathNotFound(fn);
+  }
+  *time = iter->second->ModifiedTime();
+  return IOStatus::OK();
+}
+
+bool MockFileSystem::RenameFileInternal(const std::string& src,
+                                        const std::string& dest) {
+  if (file_map_.find(src) == file_map_.end()) {
+    return false;
+  } else {
+    std::vector<std::string> children;
+    if (GetChildrenInternal(src, &children)) {
+      for (const auto& child : children) {
+        RenameFileInternal(src + "/" + child, dest + "/" + child);
+      }
+    }
+    DeleteFileInternal(dest);
+    file_map_[dest] = file_map_[src];
+    file_map_.erase(src);
+    return true;
+  }
+}
+
+IOStatus MockFileSystem::RenameFile(const std::string& src,
+                                    const std::string& dest,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+  auto s = NormalizeMockPath(src);
+  auto t = NormalizeMockPath(dest);
+  MutexLock lock(&mutex_);
+  bool found = RenameFileInternal(s, t);
+  if (!found) {
+    return IOStatus::PathNotFound(s);
+  } else {
+    return IOStatus::OK();
+  }
+}
+
+IOStatus MockFileSystem::LinkFile(const std::string& src,
+                                  const std::string& dest,
+                                  const IOOptions& /*options*/,
+                                  IODebugContext* /*dbg*/) {
+  auto s = NormalizeMockPath(src);
+  auto t = NormalizeMockPath(dest);
+  MutexLock lock(&mutex_);
+  if (file_map_.find(s) == file_map_.end()) {
+    return IOStatus::PathNotFound(s);
+  }
+
+  DeleteFileInternal(t);
+  file_map_[t] = file_map_[s];
+  file_map_[t]->Ref();  // Otherwise it might get deleted when noone uses s
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::NewLogger(const std::string& fname,
+                                   const IOOptions& io_opts,
+                                   std::shared_ptr<Logger>* result,
+                                   IODebugContext* dbg) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  MemFile* file = nullptr;
+  if (iter == file_map_.end()) {
+    file = new MemFile(clock_, fn, false);
+    file->Ref();
+    file_map_[fn] = file;
+  } else {
+    file = iter->second;
+  }
+  std::unique_ptr<FSWritableFile> f(new MockWritableFile(file, FileOptions()));
+  result->reset(new TestMemLogger(std::move(f), clock_, io_opts, dbg));
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::LockFile(const std::string& fname,
+                                  const IOOptions& /*options*/,
+                                  FileLock** flock, IODebugContext* /*dbg*/) {
+  auto fn = NormalizeMockPath(fname);
+  {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fn) != file_map_.end()) {
+      if (!file_map_[fn]->is_lock_file()) {
+        return IOStatus::InvalidArgument(fname, "Not a lock file.");
+      }
+      if (!file_map_[fn]->Lock()) {
+        return IOStatus::IOError(fn, "lock is already held.");
+      }
+    } else {
+      auto* file = new MemFile(clock_, fn, true);
+      file->Ref();
+      file->Lock();
+      file_map_[fn] = file;
+    }
+  }
+  *flock = new MockEnvFileLock(fn);
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::UnlockFile(FileLock* flock,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+  std::string fn = static_cast_with_check<MockEnvFileLock>(flock)->FileName();
+  {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fn) != file_map_.end()) {
+      if (!file_map_[fn]->is_lock_file()) {
+        return IOStatus::InvalidArgument(fn, "Not a lock file.");
+      }
+      file_map_[fn]->Unlock();
+    }
+  }
+  delete flock;
+  return IOStatus::OK();
+}
+
+IOStatus MockFileSystem::GetTestDirectory(const IOOptions& /*options*/,
+                                          std::string* path,
+                                          IODebugContext* /*dbg*/) {
+  *path = "/test";
+  return IOStatus::OK();
+}
+
+Status MockFileSystem::CorruptBuffer(const std::string& fname) {
+  auto fn = NormalizeMockPath(fname);
+  MutexLock lock(&mutex_);
+  auto iter = file_map_.find(fn);
+  if (iter == file_map_.end()) {
+    return Status::IOError(fn, "File not found");
+  }
+  iter->second->CorruptBuffer();
+  return Status::OK();
+}
+
+MockEnv::MockEnv(Env* env, const std::shared_ptr<FileSystem>& fs,
+                 const std::shared_ptr<SystemClock>& clock)
+    : CompositeEnvWrapper(env, fs, clock) {}
+
+MockEnv* MockEnv::Create(Env* env) {
+  auto clock =
+      std::make_shared<EmulatedSystemClock>(env->GetSystemClock(), true);
+  return MockEnv::Create(env, clock);
+}
+
+MockEnv* MockEnv::Create(Env* env, const std::shared_ptr<SystemClock>& clock) {
+  auto fs = std::make_shared<MockFileSystem>(clock);
+  return new MockEnv(env, fs, clock);
+}
+
+Status MockEnv::CorruptBuffer(const std::string& fname) {
+  auto mock = static_cast_with_check<MockFileSystem>(GetFileSystem().get());
+  return mock->CorruptBuffer(fname);
+}
+
+#ifndef ROCKSDB_LITE
+// This is to maintain the behavior before swithcing from InMemoryEnv to MockEnv
+Env* NewMemEnv(Env* base_env) { return MockEnv::Create(base_env); }
+
+#else  // ROCKSDB_LITE
+
+Env* NewMemEnv(Env* /*base_env*/) { return nullptr; }
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/mock_env.h b/src/rocksdb/env/mock_env.h
new file mode 100644
index 000000000..406a31f63
--- /dev/null
+++ b/src/rocksdb/env/mock_env.h
@@ -0,0 +1,144 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+class MemFile;
+class MockFileSystem : public FileSystem {
+ public:
+  explicit MockFileSystem(const std::shared_ptr<SystemClock>& clock,
+                          bool supports_direct_io = true);
+  ~MockFileSystem() override;
+
+  static const char* kClassName() { return "MemoryFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* dbg) override;
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* dbg) override;
+
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override;
+  IOStatus NewWritableFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override;
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override;
+  IOStatus NewDirectory(const std::string& /*name*/, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+  IOStatus FileExists(const std::string& fname, const IOOptions& /*io_opts*/,
+                      IODebugContext* /*dbg*/) override;
+  IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+                       std::vector<std::string>* result,
+                       IODebugContext* dbg) override;
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+  IOStatus Truncate(const std::string& fname, size_t size,
+                    const IOOptions& options, IODebugContext* dbg) override;
+  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+  IOStatus CreateDirIfMissing(const std::string& dirname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override;
+  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override;
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& options,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* dbg) override;
+  IOStatus RenameFile(const std::string& src, const std::string& target,
+                      const IOOptions& options, IODebugContext* dbg) override;
+  IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override;
+  IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                    FileLock** lock, IODebugContext* dbg) override;
+  IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+                      IODebugContext* dbg) override;
+  IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                            IODebugContext* dbg) override;
+  IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* dbg) override;
+  // Get full directory name for this db.
+  IOStatus GetAbsolutePath(const std::string& db_path,
+                           const IOOptions& /*options*/,
+                           std::string* output_path,
+                           IODebugContext* /*dbg*/) override;
+  IOStatus IsDirectory(const std::string& /*path*/,
+                       const IOOptions& /*options*/, bool* /*is_dir*/,
+                       IODebugContext* /*dgb*/) override {
+    return IOStatus::NotSupported("IsDirectory");
+  }
+
+  Status CorruptBuffer(const std::string& fname);
+  Status PrepareOptions(const ConfigOptions& options) override;
+
+ private:
+  bool RenameFileInternal(const std::string& src, const std::string& dest);
+  void DeleteFileInternal(const std::string& fname);
+  bool GetChildrenInternal(const std::string& fname,
+                           std::vector<std::string>* results);
+
+  std::string NormalizeMockPath(const std::string& path);
+
+ private:
+  // Map from filenames to MemFile objects, representing a simple file system.
+  port::Mutex mutex_;
+  std::map<std::string, MemFile*> file_map_;  // Protected by mutex_.
+  std::shared_ptr<SystemClock> system_clock_;
+  SystemClock* clock_;
+  bool supports_direct_io_;
+};
+
+class MockEnv : public CompositeEnvWrapper {
+ public:
+  static MockEnv* Create(Env* base);
+  static MockEnv* Create(Env* base, const std::shared_ptr<SystemClock>& clock);
+
+  static const char* kClassName() { return "MockEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  Status CorruptBuffer(const std::string& fname);
+
+ private:
+  MockEnv(Env* env, const std::shared_ptr<FileSystem>& fs,
+          const std::shared_ptr<SystemClock>& clock);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/mock_env_test.cc b/src/rocksdb/env/mock_env_test.cc
new file mode 100644
index 000000000..be174bd73
--- /dev/null
+++ b/src/rocksdb/env/mock_env_test.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include "env/mock_env.h"
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MockEnvTest : public testing::Test {
+ public:
+  MockEnv* env_;
+  const EnvOptions soptions_;
+
+  MockEnvTest() : env_(MockEnv::Create(Env::Default())) {}
+  ~MockEnvTest() override { delete env_; }
+};
+
+TEST_F(MockEnvTest, Corrupt) {
+  const std::string kGood = "this is a good string, synced to disk";
+  const std::string kCorrupted = "this part may be corrupted";
+  const std::string kFileName = "/dir/f";
+  std::unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile(kFileName, &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append(kGood));
+  ASSERT_TRUE(writable_file->GetFileSize() == kGood.size());
+
+  std::string scratch;
+  scratch.resize(kGood.size() + kCorrupted.size() + 16);
+  Slice result;
+  std::unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_OK(env_->NewRandomAccessFile(kFileName, &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0])));
+  ASSERT_EQ(result.compare(kGood), 0);
+
+  // Sync + corrupt => no change
+  ASSERT_OK(writable_file->Fsync());
+  ASSERT_OK(dynamic_cast<MockEnv*>(env_)->CorruptBuffer(kFileName));
+  result.clear();
+  ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0])));
+  ASSERT_EQ(result.compare(kGood), 0);
+
+  // Add new data and corrupt it
+  ASSERT_OK(writable_file->Append(kCorrupted));
+  ASSERT_TRUE(writable_file->GetFileSize() == kGood.size() + kCorrupted.size());
+  result.clear();
+  ASSERT_OK(
+      rand_file->Read(kGood.size(), kCorrupted.size(), &result, &(scratch[0])));
+  ASSERT_EQ(result.compare(kCorrupted), 0);
+  // Corrupted
+  ASSERT_OK(dynamic_cast<MockEnv*>(env_)->CorruptBuffer(kFileName));
+  result.clear();
+  ASSERT_OK(
+      rand_file->Read(kGood.size(), kCorrupted.size(), &result, &(scratch[0])));
+  ASSERT_NE(result.compare(kCorrupted), 0);
+}
+
+TEST_F(MockEnvTest, FakeSleeping) {
+  int64_t now = 0;
+  auto s = env_->GetCurrentTime(&now);
+  ASSERT_OK(s);
+  env_->SleepForMicroseconds(3 * 1000 * 1000);
+  int64_t after_sleep = 0;
+  s = env_->GetCurrentTime(&after_sleep);
+  ASSERT_OK(s);
+  auto delta = after_sleep - now;
+  // this will be true unless test runs for 2 seconds
+  ASSERT_TRUE(delta == 3 || delta == 4);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/env/unique_id_gen.cc b/src/rocksdb/env/unique_id_gen.cc
new file mode 100644
index 000000000..a1986fa15
--- /dev/null
+++ b/src/rocksdb/env/unique_id_gen.cc
@@ -0,0 +1,164 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "env/unique_id_gen.h"
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <random>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/version.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+struct GenerateRawUniqueIdOpts {
+  Env* env = Env::Default();
+  bool exclude_port_uuid = false;
+  bool exclude_env_details = false;
+  bool exclude_random_device = false;
+};
+
+// Each of these "tracks" below should be sufficient for generating 128 bits
+// of entropy, after hashing the raw bytes. The tracks are separable for
+// testing purposes, but in production we combine as many tracks as possible
+// to ensure quality results even if some environments have degraded
+// capabilities or quality in some APIs.
+//
+// This approach has not been validated for use in cryptography. The goal is
+// generating globally unique values with high probability without coordination
+// between instances.
+//
+// Linux performance: EntropyTrackRandomDevice is much faster than
+// EntropyTrackEnvDetails, which is much faster than EntropyTrackPortUuid.
+
+struct EntropyTrackPortUuid {
+  std::array<char, 36> uuid;
+
+  void Populate(const GenerateRawUniqueIdOpts& opts) {
+    if (opts.exclude_port_uuid) {
+      return;
+    }
+    std::string s;
+    port::GenerateRfcUuid(&s);
+    if (s.size() >= uuid.size()) {
+      std::copy_n(s.begin(), uuid.size(), uuid.begin());
+    }
+  }
+};
+
+struct EntropyTrackEnvDetails {
+  std::array<char, 64> hostname_buf;
+  int64_t process_id;
+  uint64_t thread_id;
+  int64_t unix_time;
+  uint64_t nano_time;
+
+  void Populate(const GenerateRawUniqueIdOpts& opts) {
+    if (opts.exclude_env_details) {
+      return;
+    }
+    opts.env->GetHostName(hostname_buf.data(), hostname_buf.size())
+        .PermitUncheckedError();
+    process_id = port::GetProcessID();
+    thread_id = opts.env->GetThreadID();
+    opts.env->GetCurrentTime(&unix_time).PermitUncheckedError();
+    nano_time = opts.env->NowNanos();
+  }
+};
+
+struct EntropyTrackRandomDevice {
+  using RandType = std::random_device::result_type;
+  static constexpr size_t kNumRandVals =
+      /* generous bits */ 192U / (8U * sizeof(RandType));
+  std::array<RandType, kNumRandVals> rand_vals;
+
+  void Populate(const GenerateRawUniqueIdOpts& opts) {
+    if (opts.exclude_random_device) {
+      return;
+    }
+    std::random_device r;
+    for (auto& val : rand_vals) {
+      val = r();
+    }
+  }
+};
+
+struct Entropy {
+  uint64_t version_identifier;
+  EntropyTrackRandomDevice et1;
+  EntropyTrackEnvDetails et2;
+  EntropyTrackPortUuid et3;
+
+  void Populate(const GenerateRawUniqueIdOpts& opts) {
+    // If we change the format of what goes into the entropy inputs, it's
+    // conceivable there could be a physical collision in the hash input
+    // even though they are logically different. This value should change
+    // if there's a change to the "schema" here, including byte order.
+    version_identifier = (uint64_t{ROCKSDB_MAJOR} << 32) +
+                         (uint64_t{ROCKSDB_MINOR} << 16) +
+                         uint64_t{ROCKSDB_PATCH};
+    et1.Populate(opts);
+    et2.Populate(opts);
+    et3.Populate(opts);
+  }
+};
+
+void GenerateRawUniqueIdImpl(uint64_t* a, uint64_t* b,
+                             const GenerateRawUniqueIdOpts& opts) {
+  Entropy e;
+  std::memset(&e, 0, sizeof(e));
+  e.Populate(opts);
+  Hash2x64(reinterpret_cast<const char*>(&e), sizeof(e), a, b);
+}
+
+}  // namespace
+
+void GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid) {
+  GenerateRawUniqueIdOpts opts;
+  opts.exclude_port_uuid = exclude_port_uuid;
+  assert(!opts.exclude_env_details);
+  assert(!opts.exclude_random_device);
+  GenerateRawUniqueIdImpl(a, b, opts);
+}
+
+#ifndef NDEBUG
+void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
+                              bool exclude_env_details,
+                              bool exclude_random_device) {
+  GenerateRawUniqueIdOpts opts;
+  opts.exclude_port_uuid = exclude_port_uuid;
+  opts.exclude_env_details = exclude_env_details;
+  opts.exclude_random_device = exclude_random_device;
+  GenerateRawUniqueIdImpl(a, b, opts);
+}
+#endif
+
+void SemiStructuredUniqueIdGen::Reset() {
+  saved_process_id_ = port::GetProcessID();
+  GenerateRawUniqueId(&base_upper_, &base_lower_);
+  counter_ = 0;
+}
+
+void SemiStructuredUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) {
+  if (port::GetProcessID() == saved_process_id_) {
+    // Safe to increment the atomic for guaranteed uniqueness within this
+    // process lifetime. Xor slightly better than +. See
+    // https://github.com/pdillinger/unique_id
+    *lower = base_lower_ ^ counter_.fetch_add(1);
+    *upper = base_upper_;
+  } else {
+    // There must have been a fork() or something. Rather than attempting to
+    // update in a thread-safe way, simply fall back on GenerateRawUniqueId.
+    GenerateRawUniqueId(upper, lower);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/env/unique_id_gen.h b/src/rocksdb/env/unique_id_gen.h
new file mode 100644
index 000000000..17e71e622
--- /dev/null
+++ b/src/rocksdb/env/unique_id_gen.h
@@ -0,0 +1,71 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// This file is for functions that generate unique identifiers by
+// (at least in part) by extracting novel entropy or sources of uniqueness
+// from the execution environment. (By contrast, random.h is for algorithmic
+// pseudorandomness.)
+//
+// These functions could eventually migrate to public APIs, such as in Env.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Generates a new 128-bit identifier that is universally unique
+// (with high probability) for each call. The result is split into
+// two 64-bit pieces. This function has NOT been validated for use in
+// cryptography.
+//
+// This is used in generating DB session IDs and by Env::GenerateUniqueId
+// (used for DB IDENTITY) if the platform does not provide a generator of
+// RFC 4122 UUIDs or fails somehow. (Set exclude_port_uuid=true if this
+// function is used as a fallback for GenerateRfcUuid, because no need
+// trying it again.)
+void GenerateRawUniqueId(uint64_t* a, uint64_t* b,
+                         bool exclude_port_uuid = false);
+
+#ifndef NDEBUG
+// A version of above with options for challenge testing
+void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid,
+                              bool exclude_env_details,
+                              bool exclude_random_device);
+#endif
+
+// Generates globally unique ids with lower probability of any collisions
+// vs. each unique id being independently random (GenerateRawUniqueId).
+// We call this "semi-structured" because between different
+// SemiStructuredUniqueIdGen objects, the IDs are separated by random
+// intervals (unstructured), but within a single SemiStructuredUniqueIdGen
+// object, the generated IDs are trivially related (structured). See
+// https://github.com/pdillinger/unique_id for how this improves probability
+// of no collision. In short, if we have n SemiStructuredUniqueIdGen
+// objects each generating m IDs, the first collision is expected at
+// around n = sqrt(2^128 / m), equivalently n * sqrt(m) = 2^64,
+// rather than n * m = 2^64 for fully random IDs.
+class SemiStructuredUniqueIdGen {
+ public:
+  // Initializes with random starting state (from GenerateRawUniqueId)
+  SemiStructuredUniqueIdGen() { Reset(); }
+  // Re-initializes, but not thread safe
+  void Reset();
+
+  // Assuming no fork(), `lower` is guaranteed unique from one call
+  // to the next (thread safe).
+  void GenerateNext(uint64_t* upper, uint64_t* lower);
+
+ private:
+  uint64_t base_upper_;
+  uint64_t base_lower_;
+  std::atomic<uint64_t> counter_;
+  int64_t saved_process_id_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/examples/.gitignore b/src/rocksdb/examples/.gitignore
new file mode 100644
index 000000000..39da06a85
--- /dev/null
+++ b/src/rocksdb/examples/.gitignore
@@ -0,0 +1,10 @@
+c_simple_example
+column_families_example
+compact_files_example
+compaction_filter_example
+multi_processes_example
+optimistic_transaction_example
+options_file_example
+rocksdb_backup_restore_example
+simple_example
+transaction_example
diff --git a/src/rocksdb/examples/CMakeLists.txt b/src/rocksdb/examples/CMakeLists.txt
new file mode 100644
index 000000000..0b93a6d8d
--- /dev/null
+++ b/src/rocksdb/examples/CMakeLists.txt
@@ -0,0 +1,45 @@
+add_executable(simple_example
+  simple_example.cc)
+target_link_libraries(simple_example
+  ${ROCKSDB_LIB})
+
+add_executable(column_families_example
+  column_families_example.cc)
+target_link_libraries(column_families_example
+  ${ROCKSDB_LIB})
+
+add_executable(compact_files_example
+  compact_files_example.cc)
+target_link_libraries(compact_files_example
+  ${ROCKSDB_LIB})
+
+add_executable(c_simple_example
+  c_simple_example.c)
+target_link_libraries(c_simple_example
+  ${ROCKSDB_LIB})
+
+add_executable(optimistic_transaction_example
+  optimistic_transaction_example.cc)
+target_link_libraries(optimistic_transaction_example
+  ${ROCKSDB_LIB})
+
+add_executable(transaction_example
+  transaction_example.cc)
+target_link_libraries(transaction_example
+  ${ROCKSDB_LIB})
+
+add_executable(compaction_filter_example
+  compaction_filter_example.cc)
+target_link_libraries(compaction_filter_example
+  ${ROCKSDB_LIB})
+
+add_executable(options_file_example
+  options_file_example.cc)
+target_link_libraries(options_file_example
+  ${ROCKSDB_LIB})
+
+add_executable(multi_processes_example
+  EXCLUDE_FROM_ALL
+  multi_processes_example.cc)
+target_link_libraries(multi_processes_example
+  ${ROCKSDB_LIB})
diff --git a/src/rocksdb/examples/Makefile b/src/rocksdb/examples/Makefile
new file mode 100644
index 000000000..b056508a6
--- /dev/null
+++ b/src/rocksdb/examples/Makefile
@@ -0,0 +1,58 @@
+include ../make_config.mk
+
+ifndef DISABLE_JEMALLOC
+	ifdef JEMALLOC
+		PLATFORM_CXXFLAGS += -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE
+	endif
+	EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS) -lpthread
+	PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE)
+endif
+
+ifneq ($(USE_RTTI), 1)
+	CXXFLAGS += -fno-rtti
+endif
+
+CFLAGS += -Wstrict-prototypes
+
+.PHONY: clean librocksdb
+
+all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example rocksdb_backup_restore_example
+
+simple_example: librocksdb simple_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+column_families_example: librocksdb column_families_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+compaction_filter_example: librocksdb compaction_filter_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+compact_files_example: librocksdb compact_files_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+.c.o:
+	$(CC) $(CFLAGS) -c $< -o $@ -I../include
+
+c_simple_example: librocksdb c_simple_example.o
+	$(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS)
+
+optimistic_transaction_example: librocksdb optimistic_transaction_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+transaction_example: librocksdb transaction_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+options_file_example: librocksdb options_file_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+multi_processes_example: librocksdb multi_processes_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+rocksdb_backup_restore_example: librocksdb rocksdb_backup_restore_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+clean:
+	rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example ./multi_processes_example ./rocksdb_backup_restore_example
+
+librocksdb:
+	cd .. && $(MAKE) static_lib
diff --git a/src/rocksdb/examples/README.md b/src/rocksdb/examples/README.md
new file mode 100644
index 000000000..f4ba2384b
--- /dev/null
+++ b/src/rocksdb/examples/README.md
@@ -0,0 +1,2 @@
+1. Compile RocksDB first by executing `make static_lib` in parent dir
+2. Compile all examples: `cd examples/; make all`
diff --git a/src/rocksdb/examples/c_simple_example.c b/src/rocksdb/examples/c_simple_example.c
new file mode 100644
index 000000000..fe2f917b4
--- /dev/null
+++ b/src/rocksdb/examples/c_simple_example.c
@@ -0,0 +1,96 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "rocksdb/c.h"
+
+#if defined(OS_WIN)
+#include <Windows.h>
+#else
+#include <unistd.h>  // sysconf() - get CPU count
+#endif
+
+#if defined(OS_WIN)
+const char DBPath[] = "C:\\Windows\\TEMP\\rocksdb_c_simple_example";
+const char DBBackupPath[] =
+    "C:\\Windows\\TEMP\\rocksdb_c_simple_example_backup";
+#else
+const char DBPath[] = "/tmp/rocksdb_c_simple_example";
+const char DBBackupPath[] = "/tmp/rocksdb_c_simple_example_backup";
+#endif
+
+int main(int argc, char **argv) {
+  rocksdb_t *db;
+  rocksdb_backup_engine_t *be;
+  rocksdb_options_t *options = rocksdb_options_create();
+  // Optimize RocksDB. This is the easiest way to
+  // get RocksDB to perform well.
+#if defined(OS_WIN)
+  SYSTEM_INFO system_info;
+  GetSystemInfo(&system_info);
+  long cpus = system_info.dwNumberOfProcessors;
+#else
+  long cpus = sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+  // Set # of online cores
+  rocksdb_options_increase_parallelism(options, (int)(cpus));
+  rocksdb_options_optimize_level_style_compaction(options, 0);
+  // create the DB if it's not already present
+  rocksdb_options_set_create_if_missing(options, 1);
+
+  // open DB
+  char *err = NULL;
+  db = rocksdb_open(options, DBPath, &err);
+  assert(!err);
+
+  // open Backup Engine that we will use for backing up our database
+  be = rocksdb_backup_engine_open(options, DBBackupPath, &err);
+  assert(!err);
+
+  // Put key-value
+  rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create();
+  const char key[] = "key";
+  const char *value = "value";
+  rocksdb_put(db, writeoptions, key, strlen(key), value, strlen(value) + 1,
+              &err);
+  assert(!err);
+  // Get value
+  rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create();
+  size_t len;
+  char *returned_value =
+      rocksdb_get(db, readoptions, key, strlen(key), &len, &err);
+  assert(!err);
+  assert(strcmp(returned_value, "value") == 0);
+  free(returned_value);
+
+  // create new backup in a directory specified by DBBackupPath
+  rocksdb_backup_engine_create_new_backup(be, db, &err);
+  assert(!err);
+
+  rocksdb_close(db);
+
+  // If something is wrong, you might want to restore data from last backup
+  rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create();
+  rocksdb_backup_engine_restore_db_from_latest_backup(be, DBPath, DBPath,
+                                                      restore_options, &err);
+  assert(!err);
+  rocksdb_restore_options_destroy(restore_options);
+
+  db = rocksdb_open(options, DBPath, &err);
+  assert(!err);
+
+  // cleanup
+  rocksdb_writeoptions_destroy(writeoptions);
+  rocksdb_readoptions_destroy(readoptions);
+  rocksdb_options_destroy(options);
+  rocksdb_backup_engine_close(be);
+  rocksdb_close(db);
+
+  return 0;
+}
diff --git a/src/rocksdb/examples/column_families_example.cc b/src/rocksdb/examples/column_families_example.cc
new file mode 100644
index 000000000..3828d3fb3
--- /dev/null
+++ b/src/rocksdb/examples/column_families_example.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include <cstdio>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+
+#if defined(OS_WIN)
+std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_column_families_example";
+#else
+std::string kDBPath = "/tmp/rocksdb_column_families_example";
+#endif
+
+using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
+using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
+using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DBOptions;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+int main() {
+  // open DB
+  Options options;
+  options.create_if_missing = true;
+  DB* db;
+  Status s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+
+  // create column family
+  ColumnFamilyHandle* cf;
+  s = db->CreateColumnFamily(ColumnFamilyOptions(), "new_cf", &cf);
+  assert(s.ok());
+
+  // close DB
+  s = db->DestroyColumnFamilyHandle(cf);
+  assert(s.ok());
+  delete db;
+
+  // open DB with two column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(ColumnFamilyDescriptor(
+      ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new one, too
+  column_families.push_back(
+      ColumnFamilyDescriptor("new_cf", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  s = DB::Open(DBOptions(), kDBPath, column_families, &handles, &db);
+  assert(s.ok());
+
+  // put and get from non-default column family
+  s = db->Put(WriteOptions(), handles[1], Slice("key"), Slice("value"));
+  assert(s.ok());
+  std::string value;
+  s = db->Get(ReadOptions(), handles[1], Slice("key"), &value);
+  assert(s.ok());
+
+  // atomic write
+  WriteBatch batch;
+  batch.Put(handles[0], Slice("key2"), Slice("value2"));
+  batch.Put(handles[1], Slice("key3"), Slice("value3"));
+  batch.Delete(handles[0], Slice("key"));
+  s = db->Write(WriteOptions(), &batch);
+  assert(s.ok());
+
+  // drop column family
+  s = db->DropColumnFamily(handles[1]);
+  assert(s.ok());
+
+  // close db
+  for (auto handle : handles) {
+    s = db->DestroyColumnFamilyHandle(handle);
+    assert(s.ok());
+  }
+  delete db;
+
+  return 0;
+}
diff --git a/src/rocksdb/examples/compact_files_example.cc b/src/rocksdb/examples/compact_files_example.cc
new file mode 100644
index 000000000..1ecf8c794
--- /dev/null
+++ b/src/rocksdb/examples/compact_files_example.cc
@@ -0,0 +1,177 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// An example code demonstrating how to use CompactFiles, EventListener,
+// and GetColumnFamilyMetaData APIs to implement custom compaction algorithm.
+
+#include <mutex>
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+
+using ROCKSDB_NAMESPACE::ColumnFamilyMetaData;
+using ROCKSDB_NAMESPACE::CompactionOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::EventListener;
+using ROCKSDB_NAMESPACE::FlushJobInfo;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+#if defined(OS_WIN)
+std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_compact_files_example";
+#else
+std::string kDBPath = "/tmp/rocksdb_compact_files_example";
+#endif
+
+struct CompactionTask;
+
+// This is an example interface of external-compaction algorithm.
+// Compaction algorithm can be implemented outside the core-RocksDB
+// code by using the pluggable compaction APIs that RocksDb provides.
+class Compactor : public EventListener {
+ public:
+  // Picks and returns a compaction task given the specified DB
+  // and column family.  It is the caller's responsibility to
+  // destroy the returned CompactionTask.  Returns "nullptr"
+  // if it cannot find a proper compaction task.
+  virtual CompactionTask* PickCompaction(DB* db,
+                                         const std::string& cf_name) = 0;
+
+  // Schedule and run the specified compaction task in background.
+  virtual void ScheduleCompaction(CompactionTask* task) = 0;
+};
+
+// Example structure that describes a compaction task.
+struct CompactionTask {
+  CompactionTask(DB* _db, Compactor* _compactor,
+                 const std::string& _column_family_name,
+                 const std::vector<std::string>& _input_file_names,
+                 const int _output_level,
+                 const CompactionOptions& _compact_options, bool _retry_on_fail)
+      : db(_db),
+        compactor(_compactor),
+        column_family_name(_column_family_name),
+        input_file_names(_input_file_names),
+        output_level(_output_level),
+        compact_options(_compact_options),
+        retry_on_fail(_retry_on_fail) {}
+  DB* db;
+  Compactor* compactor;
+  const std::string& column_family_name;
+  std::vector<std::string> input_file_names;
+  int output_level;
+  CompactionOptions compact_options;
+  bool retry_on_fail;
+};
+
+// A simple compaction algorithm that always compacts everything
+// to the highest level whenever possible.
+class FullCompactor : public Compactor {
+ public:
+  explicit FullCompactor(const Options options) : options_(options) {
+    compact_options_.compression = options_.compression;
+    compact_options_.output_file_size_limit = options_.target_file_size_base;
+  }
+
+  // When flush happens, it determines whether to trigger compaction. If
+  // triggered_writes_stop is true, it will also set the retry flag of
+  // compaction-task to true.
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    CompactionTask* task = PickCompaction(db, info.cf_name);
+    if (task != nullptr) {
+      if (info.triggered_writes_stop) {
+        task->retry_on_fail = true;
+      }
+      // Schedule compaction in a different thread.
+      ScheduleCompaction(task);
+    }
+  }
+
+  // Always pick a compaction which includes all files whenever possible.
+  CompactionTask* PickCompaction(DB* db, const std::string& cf_name) override {
+    ColumnFamilyMetaData cf_meta;
+    db->GetColumnFamilyMetaData(&cf_meta);
+
+    std::vector<std::string> input_file_names;
+    for (auto level : cf_meta.levels) {
+      for (auto file : level.files) {
+        if (file.being_compacted) {
+          return nullptr;
+        }
+        input_file_names.push_back(file.name);
+      }
+    }
+    return new CompactionTask(db, this, cf_name, input_file_names,
+                              options_.num_levels - 1, compact_options_, false);
+  }
+
+  // Schedule the specified compaction task in background.
+  void ScheduleCompaction(CompactionTask* task) override {
+    options_.env->Schedule(&FullCompactor::CompactFiles, task);
+  }
+
+  static void CompactFiles(void* arg) {
+    std::unique_ptr<CompactionTask> task(
+        reinterpret_cast<CompactionTask*>(arg));
+    assert(task);
+    assert(task->db);
+    Status s = task->db->CompactFiles(
+        task->compact_options, task->input_file_names, task->output_level);
+    printf("CompactFiles() finished with status %s\n", s.ToString().c_str());
+    if (!s.ok() && !s.IsIOError() && task->retry_on_fail) {
+      // If a compaction task with its retry_on_fail=true failed,
+      // try to schedule another compaction in case the reason
+      // is not an IO error.
+      CompactionTask* new_task =
+          task->compactor->PickCompaction(task->db, task->column_family_name);
+      task->compactor->ScheduleCompaction(new_task);
+    }
+  }
+
+ private:
+  Options options_;
+  CompactionOptions compact_options_;
+};
+
+int main() {
+  Options options;
+  options.create_if_missing = true;
+  // Disable RocksDB background compaction.
+  options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleNone;
+  // Small slowdown and stop trigger for experimental purpose.
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 5;
+  options.IncreaseParallelism(5);
+  options.listeners.emplace_back(new FullCompactor(options));
+
+  DB* db = nullptr;
+  ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options);
+  Status s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+  assert(db);
+
+  // if background compaction is not working, write will stall
+  // because of options.level0_stop_writes_trigger
+  for (int i = 1000; i < 99999; ++i) {
+    db->Put(WriteOptions(), std::to_string(i),
+            std::string(500, 'a' + (i % 26)));
+  }
+
+  // verify the values are still there
+  std::string value;
+  for (int i = 1000; i < 99999; ++i) {
+    db->Get(ReadOptions(), std::to_string(i), &value);
+    assert(value == std::string(500, 'a' + (i % 26)));
+  }
+
+  // close the db.
+  delete db;
+
+  return 0;
+}
diff --git a/src/rocksdb/examples/compaction_filter_example.cc b/src/rocksdb/examples/compaction_filter_example.cc
new file mode 100644
index 000000000..ed1ada823
--- /dev/null
+++ b/src/rocksdb/examples/compaction_filter_example.cc
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+
+class MyMerge : public ROCKSDB_NAMESPACE::MergeOperator {
+ public:
+  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const override {
+    merge_out->new_value.clear();
+    if (merge_in.existing_value != nullptr) {
+      merge_out->new_value.assign(merge_in.existing_value->data(),
+                                  merge_in.existing_value->size());
+    }
+    for (const ROCKSDB_NAMESPACE::Slice& m : merge_in.operand_list) {
+      fprintf(stderr, "Merge(%s)\n", m.ToString().c_str());
+      // the compaction filter filters out bad values
+      assert(m.ToString() != "bad");
+      merge_out->new_value.assign(m.data(), m.size());
+    }
+    return true;
+  }
+
+  const char* Name() const override { return "MyMerge"; }
+};
+
+class MyFilter : public ROCKSDB_NAMESPACE::CompactionFilter {
+ public:
+  bool Filter(int level, const ROCKSDB_NAMESPACE::Slice& key,
+              const ROCKSDB_NAMESPACE::Slice& existing_value,
+              std::string* new_value, bool* value_changed) const override {
+    fprintf(stderr, "Filter(%s)\n", key.ToString().c_str());
+    ++count_;
+    assert(*value_changed == false);
+    return false;
+  }
+
+  bool FilterMergeOperand(
+      int level, const ROCKSDB_NAMESPACE::Slice& key,
+      const ROCKSDB_NAMESPACE::Slice& existing_value) const override {
+    fprintf(stderr, "FilterMerge(%s)\n", key.ToString().c_str());
+    ++merge_count_;
+    return existing_value == "bad";
+  }
+
+  const char* Name() const override { return "MyFilter"; }
+
+  mutable int count_ = 0;
+  mutable int merge_count_ = 0;
+};
+
+#if defined(OS_WIN)
+std::string kDBPath = "C:\\Windows\\TEMP\\rocksmergetest";
+std::string kRemoveDirCommand = "rmdir /Q /S ";
+#else
+std::string kDBPath = "/tmp/rocksmergetest";
+std::string kRemoveDirCommand = "rm -rf ";
+#endif
+
+int main() {
+  ROCKSDB_NAMESPACE::DB* raw_db;
+  ROCKSDB_NAMESPACE::Status status;
+
+  MyFilter filter;
+
+  std::string rm_cmd = kRemoveDirCommand + kDBPath;
+  int ret = system(rm_cmd.c_str());
+  if (ret != 0) {
+    fprintf(stderr, "Error deleting %s, code: %d\n", kDBPath.c_str(), ret);
+  }
+  ROCKSDB_NAMESPACE::Options options;
+  options.create_if_missing = true;
+  options.merge_operator.reset(new MyMerge);
+  options.compaction_filter = &filter;
+  status = ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &raw_db);
+  assert(status.ok());
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(raw_db);
+
+  ROCKSDB_NAMESPACE::WriteOptions wopts;
+  db->Merge(wopts, "0", "bad");  // This is filtered out
+  db->Merge(wopts, "1", "data1");
+  db->Merge(wopts, "1", "bad");
+  db->Merge(wopts, "1", "data2");
+  db->Merge(wopts, "1", "bad");
+  db->Merge(wopts, "3", "data3");
+  db->CompactRange(ROCKSDB_NAMESPACE::CompactRangeOptions(), nullptr, nullptr);
+  fprintf(stderr, "filter.count_ = %d\n", filter.count_);
+  assert(filter.count_ == 0);
+  fprintf(stderr, "filter.merge_count_ = %d\n", filter.merge_count_);
+  assert(filter.merge_count_ == 6);
+}
diff --git a/src/rocksdb/examples/multi_processes_example.cc b/src/rocksdb/examples/multi_processes_example.cc
new file mode 100644
index 000000000..93c54d755
--- /dev/null
+++ b/src/rocksdb/examples/multi_processes_example.cc
@@ -0,0 +1,393 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// How to use this example
+// Open two terminals, in one of them, run `./multi_processes_example 0` to
+// start a process running the primary instance. This will create a new DB in
+// kDBPath. The process will run for a while inserting keys to the normal
+// RocksDB database.
+// Next, go to the other terminal and run `./multi_processes_example 1` to
+// start a process running the secondary instance. This will create a secondary
+// instance following the aforementioned primary instance. This process will
+// run for a while, tailing the logs of the primary. After process with primary
+// instance exits, this process will keep running until you hit 'CTRL+C'.
+
+#include <chrono>
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <string>
+#include <thread>
+#include <vector>
+
+// TODO: port this example to other systems. It should be straightforward for
+// POSIX-compliant systems.
+#if defined(OS_LINUX)
+#include <dirent.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+
+using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
+using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
+using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+const std::string kDBPath = "/tmp/rocksdb_multi_processes_example";
+const std::string kPrimaryStatusFile =
+    "/tmp/rocksdb_multi_processes_example_primary_status";
+const uint64_t kMaxKey = 600000;
+const size_t kMaxValueLength = 256;
+const size_t kNumKeysPerFlush = 1000;
+
+const std::vector<std::string>& GetColumnFamilyNames() {
+  static std::vector<std::string> column_family_names = {
+      ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, "pikachu"};
+  return column_family_names;
+}
+
+inline bool IsLittleEndian() {
+  uint32_t x = 1;
+  return *reinterpret_cast<char*>(&x) != 0;
+}
+
+static std::atomic<int>& ShouldSecondaryWait() {
+  static std::atomic<int> should_secondary_wait{1};
+  return should_secondary_wait;
+}
+
+static std::string Key(uint64_t k) {
+  std::string ret;
+  if (IsLittleEndian()) {
+    ret.append(reinterpret_cast<char*>(&k), sizeof(k));
+  } else {
+    char buf[sizeof(k)];
+    buf[0] = k & 0xff;
+    buf[1] = (k >> 8) & 0xff;
+    buf[2] = (k >> 16) & 0xff;
+    buf[3] = (k >> 24) & 0xff;
+    buf[4] = (k >> 32) & 0xff;
+    buf[5] = (k >> 40) & 0xff;
+    buf[6] = (k >> 48) & 0xff;
+    buf[7] = (k >> 56) & 0xff;
+    ret.append(buf, sizeof(k));
+  }
+  size_t i = 0, j = ret.size() - 1;
+  while (i < j) {
+    char tmp = ret[i];
+    ret[i] = ret[j];
+    ret[j] = tmp;
+    ++i;
+    --j;
+  }
+  return ret;
+}
+
+static uint64_t Key(std::string key) {
+  assert(key.size() == sizeof(uint64_t));
+  size_t i = 0, j = key.size() - 1;
+  while (i < j) {
+    char tmp = key[i];
+    key[i] = key[j];
+    key[j] = tmp;
+    ++i;
+    --j;
+  }
+  uint64_t ret = 0;
+  if (IsLittleEndian()) {
+    memcpy(&ret, key.c_str(), sizeof(uint64_t));
+  } else {
+    const char* buf = key.c_str();
+    ret |= static_cast<uint64_t>(buf[0]);
+    ret |= (static_cast<uint64_t>(buf[1]) << 8);
+    ret |= (static_cast<uint64_t>(buf[2]) << 16);
+    ret |= (static_cast<uint64_t>(buf[3]) << 24);
+    ret |= (static_cast<uint64_t>(buf[4]) << 32);
+    ret |= (static_cast<uint64_t>(buf[5]) << 40);
+    ret |= (static_cast<uint64_t>(buf[6]) << 48);
+    ret |= (static_cast<uint64_t>(buf[7]) << 56);
+  }
+  return ret;
+}
+
+static Slice GenerateRandomValue(const size_t max_length, char scratch[]) {
+  size_t sz = 1 + (std::rand() % max_length);
+  int rnd = std::rand();
+  for (size_t i = 0; i != sz; ++i) {
+    scratch[i] = static_cast<char>(rnd ^ i);
+  }
+  return Slice(scratch, sz);
+}
+
+static bool ShouldCloseDB() { return true; }
+
+void CreateDB() {
+  long my_pid = static_cast<long>(getpid());
+  Options options;
+  Status s = ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options);
+  if (!s.ok()) {
+    fprintf(stderr, "[process %ld] Failed to destroy DB: %s\n", my_pid,
+            s.ToString().c_str());
+    assert(false);
+  }
+  options.create_if_missing = true;
+  DB* db = nullptr;
+  s = DB::Open(options, kDBPath, &db);
+  if (!s.ok()) {
+    fprintf(stderr, "[process %ld] Failed to open DB: %s\n", my_pid,
+            s.ToString().c_str());
+    assert(false);
+  }
+  std::vector<ColumnFamilyHandle*> handles;
+  ColumnFamilyOptions cf_opts(options);
+  for (const auto& cf_name : GetColumnFamilyNames()) {
+    if (ROCKSDB_NAMESPACE::kDefaultColumnFamilyName != cf_name) {
+      ColumnFamilyHandle* handle = nullptr;
+      s = db->CreateColumnFamily(cf_opts, cf_name, &handle);
+      if (!s.ok()) {
+        fprintf(stderr, "[process %ld] Failed to create CF %s: %s\n", my_pid,
+                cf_name.c_str(), s.ToString().c_str());
+        assert(false);
+      }
+      handles.push_back(handle);
+    }
+  }
+  fprintf(stdout, "[process %ld] Column families created\n", my_pid);
+  for (auto h : handles) {
+    delete h;
+  }
+  handles.clear();
+  delete db;
+}
+
+void RunPrimary() {
+  long my_pid = static_cast<long>(getpid());
+  fprintf(stdout, "[process %ld] Primary instance starts\n", my_pid);
+  CreateDB();
+  std::srand(time(nullptr));
+  DB* db = nullptr;
+  Options options;
+  options.create_if_missing = false;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : GetColumnFamilyNames()) {
+    column_families.push_back(ColumnFamilyDescriptor(cf_name, options));
+  }
+  std::vector<ColumnFamilyHandle*> handles;
+  WriteOptions write_opts;
+  char val_buf[kMaxValueLength] = {0};
+  uint64_t curr_key = 0;
+  while (curr_key < kMaxKey) {
+    Status s;
+    if (nullptr == db) {
+      s = DB::Open(options, kDBPath, column_families, &handles, &db);
+      if (!s.ok()) {
+        fprintf(stderr, "[process %ld] Failed to open DB: %s\n", my_pid,
+                s.ToString().c_str());
+        assert(false);
+      }
+    }
+    assert(nullptr != db);
+    assert(handles.size() == GetColumnFamilyNames().size());
+    for (auto h : handles) {
+      assert(nullptr != h);
+      for (size_t i = 0; i != kNumKeysPerFlush; ++i) {
+        Slice key = Key(curr_key + static_cast<uint64_t>(i));
+        Slice value = GenerateRandomValue(kMaxValueLength, val_buf);
+        s = db->Put(write_opts, h, key, value);
+        if (!s.ok()) {
+          fprintf(stderr, "[process %ld] Failed to insert\n", my_pid);
+          assert(false);
+        }
+      }
+      s = db->Flush(FlushOptions(), h);
+      if (!s.ok()) {
+        fprintf(stderr, "[process %ld] Failed to flush\n", my_pid);
+        assert(false);
+      }
+    }
+    curr_key += static_cast<uint64_t>(kNumKeysPerFlush);
+    if (ShouldCloseDB()) {
+      for (auto h : handles) {
+        delete h;
+      }
+      handles.clear();
+      delete db;
+      db = nullptr;
+    }
+  }
+  if (nullptr != db) {
+    for (auto h : handles) {
+      delete h;
+    }
+    handles.clear();
+    delete db;
+    db = nullptr;
+  }
+  fprintf(stdout, "[process %ld] Finished adding keys\n", my_pid);
+}
+
+void secondary_instance_sigint_handler(int signal) {
+  ShouldSecondaryWait().store(0, std::memory_order_relaxed);
+  fprintf(stdout, "\n");
+  fflush(stdout);
+};
+
+void RunSecondary() {
+  ::signal(SIGINT, secondary_instance_sigint_handler);
+  long my_pid = static_cast<long>(getpid());
+  const std::string kSecondaryPath =
+      "/tmp/rocksdb_multi_processes_example_secondary";
+  // Create directory if necessary
+  if (nullptr == opendir(kSecondaryPath.c_str())) {
+    int ret =
+        mkdir(kSecondaryPath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+    if (ret < 0) {
+      perror("failed to create directory for secondary instance");
+      exit(0);
+    }
+  }
+  DB* db = nullptr;
+  Options options;
+  options.create_if_missing = false;
+  options.max_open_files = -1;
+  Status s = DB::OpenAsSecondary(options, kDBPath, kSecondaryPath, &db);
+  if (!s.ok()) {
+    fprintf(stderr, "[process %ld] Failed to open in secondary mode: %s\n",
+            my_pid, s.ToString().c_str());
+    assert(false);
+  } else {
+    fprintf(stdout, "[process %ld] Secondary instance starts\n", my_pid);
+  }
+
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  ropts.total_order_seek = true;
+
+  std::vector<std::thread> test_threads;
+  test_threads.emplace_back([&]() {
+    while (1 == ShouldSecondaryWait().load(std::memory_order_relaxed)) {
+      std::unique_ptr<Iterator> iter(db->NewIterator(ropts));
+      iter->SeekToFirst();
+      size_t count = 0;
+      for (; iter->Valid(); iter->Next()) {
+        ++count;
+      }
+    }
+    fprintf(stdout, "[process %ld] Range_scan thread finished\n", my_pid);
+  });
+
+  test_threads.emplace_back([&]() {
+    std::srand(time(nullptr));
+    while (1 == ShouldSecondaryWait().load(std::memory_order_relaxed)) {
+      Slice key = Key(std::rand() % kMaxKey);
+      std::string value;
+      db->Get(ropts, key, &value);
+    }
+    fprintf(stdout, "[process %ld] Point lookup thread finished\n", my_pid);
+  });
+
+  uint64_t curr_key = 0;
+  while (1 == ShouldSecondaryWait().load(std::memory_order_relaxed)) {
+    s = db->TryCatchUpWithPrimary();
+    if (!s.ok()) {
+      fprintf(stderr,
+              "[process %ld] error while trying to catch up with "
+              "primary %s\n",
+              my_pid, s.ToString().c_str());
+      assert(false);
+    }
+    {
+      std::unique_ptr<Iterator> iter(db->NewIterator(ropts));
+      if (!iter) {
+        fprintf(stderr, "[process %ld] Failed to create iterator\n", my_pid);
+        assert(false);
+      }
+      iter->SeekToLast();
+      if (iter->Valid()) {
+        uint64_t curr_max_key = Key(iter->key().ToString());
+        if (curr_max_key != curr_key) {
+          fprintf(stdout, "[process %ld] Observed key %" PRIu64 "\n", my_pid,
+                  curr_key);
+          curr_key = curr_max_key;
+        }
+      }
+    }
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+  }
+  s = db->TryCatchUpWithPrimary();
+  if (!s.ok()) {
+    fprintf(stderr,
+            "[process %ld] error while trying to catch up with "
+            "primary %s\n",
+            my_pid, s.ToString().c_str());
+    assert(false);
+  }
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : GetColumnFamilyNames()) {
+    column_families.push_back(ColumnFamilyDescriptor(cf_name, options));
+  }
+  std::vector<ColumnFamilyHandle*> handles;
+  DB* verification_db = nullptr;
+  s = DB::OpenForReadOnly(options, kDBPath, column_families, &handles,
+                          &verification_db);
+  assert(s.ok());
+  Iterator* iter1 = verification_db->NewIterator(ropts);
+  iter1->SeekToFirst();
+
+  Iterator* iter = db->NewIterator(ropts);
+  iter->SeekToFirst();
+  for (; iter->Valid() && iter1->Valid(); iter->Next(), iter1->Next()) {
+    if (iter->key().ToString() != iter1->key().ToString()) {
+      fprintf(stderr, "%" PRIu64 "!= %" PRIu64 "\n",
+              Key(iter->key().ToString()), Key(iter1->key().ToString()));
+      assert(false);
+    } else if (iter->value().ToString() != iter1->value().ToString()) {
+      fprintf(stderr, "Value mismatch\n");
+      assert(false);
+    }
+  }
+  fprintf(stdout, "[process %ld] Verification succeeded\n", my_pid);
+  for (auto& thr : test_threads) {
+    thr.join();
+  }
+  delete iter;
+  delete iter1;
+  delete db;
+  delete verification_db;
+}
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    fprintf(stderr, "%s <0 for primary, 1 for secondary>\n", argv[0]);
+    return 0;
+  }
+  if (atoi(argv[1]) == 0) {
+    RunPrimary();
+  } else {
+    RunSecondary();
+  }
+  return 0;
+}
+#else   // OS_LINUX
+int main() {
+  fprintf(stderr, "Not implemented.\n");
+  return 0;
+}
+#endif  // !OS_LINUX
diff --git a/src/rocksdb/examples/optimistic_transaction_example.cc b/src/rocksdb/examples/optimistic_transaction_example.cc
new file mode 100644
index 000000000..fb0514a69
--- /dev/null
+++ b/src/rocksdb/examples/optimistic_transaction_example.cc
@@ -0,0 +1,192 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
+using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::Transaction;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+#if defined(OS_WIN)
+std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_transaction_example";
+#else
+std::string kDBPath = "/tmp/rocksdb_transaction_example";
+#endif
+
+int main() {
+  // open DB
+  Options options;
+  options.create_if_missing = true;
+  DB* db;
+  OptimisticTransactionDB* txn_db;
+
+  Status s = OptimisticTransactionDB::Open(options, kDBPath, &txn_db);
+  assert(s.ok());
+  db = txn_db->GetBaseDB();
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  ////////////////////////////////////////////////////////
+  //
+  // Simple OptimisticTransaction Example ("Read Committed")
+  //
+  ////////////////////////////////////////////////////////
+
+  // Start a transaction
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  assert(txn);
+
+  // Read a key in this transaction
+  s = txn->Get(read_options, "abc", &value);
+  assert(s.IsNotFound());
+
+  // Write a key in this transaction
+  s = txn->Put("abc", "xyz");
+  assert(s.ok());
+
+  // Read a key OUTSIDE this transaction. Does not affect txn.
+  s = db->Get(read_options, "abc", &value);
+  assert(s.IsNotFound());
+
+  // Write a key OUTSIDE of this transaction.
+  // Does not affect txn since this is an unrelated key.  If we wrote key 'abc'
+  // here, the transaction would fail to commit.
+  s = db->Put(write_options, "xyz", "zzz");
+  assert(s.ok());
+  s = db->Put(write_options, "abc", "def");
+  assert(s.ok());
+
+  // Commit transaction
+  s = txn->Commit();
+  assert(s.IsBusy());
+  delete txn;
+
+  s = db->Get(read_options, "xyz", &value);
+  assert(s.ok());
+  assert(value == "zzz");
+
+  s = db->Get(read_options, "abc", &value);
+  assert(s.ok());
+  assert(value == "def");
+
+  ////////////////////////////////////////////////////////
+  //
+  // "Repeatable Read" (Snapshot Isolation) Example
+  //   -- Using a single Snapshot
+  //
+  ////////////////////////////////////////////////////////
+
+  // Set a snapshot at start of transaction by setting set_snapshot=true
+  txn_options.set_snapshot = true;
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+
+  const Snapshot* snapshot = txn->GetSnapshot();
+
+  // Write a key OUTSIDE of transaction
+  s = db->Put(write_options, "abc", "xyz");
+  assert(s.ok());
+
+  // Read a key using the snapshot
+  read_options.snapshot = snapshot;
+  s = txn->GetForUpdate(read_options, "abc", &value);
+  assert(s.ok());
+  assert(value == "def");
+
+  // Attempt to commit transaction
+  s = txn->Commit();
+
+  // Transaction could not commit since the write outside of the txn conflicted
+  // with the read!
+  assert(s.IsBusy());
+
+  delete txn;
+  // Clear snapshot from read options since it is no longer valid
+  read_options.snapshot = nullptr;
+  snapshot = nullptr;
+
+  s = db->Get(read_options, "abc", &value);
+  assert(s.ok());
+  assert(value == "xyz");
+
+  ////////////////////////////////////////////////////////
+  //
+  // "Read Committed" (Monotonic Atomic Views) Example
+  //   --Using multiple Snapshots
+  //
+  ////////////////////////////////////////////////////////
+
+  // In this example, we set the snapshot multiple times.  This is probably
+  // only necessary if you have very strict isolation requirements to
+  // implement.
+
+  // Set a snapshot at start of transaction
+  txn_options.set_snapshot = true;
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+
+  // Do some reads and writes to key "x"
+  read_options.snapshot = db->GetSnapshot();
+  s = txn->Get(read_options, "x", &value);
+  assert(s.IsNotFound());
+  s = txn->Put("x", "x");
+  assert(s.ok());
+
+  // The transaction hasn't committed, so the write is not visible
+  // outside of txn.
+  s = db->Get(read_options, "x", &value);
+  assert(s.IsNotFound());
+
+  // Do a write outside of the transaction to key "y"
+  s = db->Put(write_options, "y", "z");
+  assert(s.ok());
+
+  // Set a new snapshot in the transaction
+  txn->SetSnapshot();
+  read_options.snapshot = db->GetSnapshot();
+
+  // Do some reads and writes to key "y"
+  s = txn->GetForUpdate(read_options, "y", &value);
+  assert(s.ok());
+  assert(value == "z");
+  txn->Put("y", "y");
+
+  // Commit.  Since the snapshot was advanced, the write done outside of the
+  // transaction does not prevent this transaction from Committing.
+  s = txn->Commit();
+  assert(s.ok());
+  delete txn;
+  // Clear snapshot from read options since it is no longer valid
+  read_options.snapshot = nullptr;
+
+  // txn is committed, read the latest values.
+  s = db->Get(read_options, "x", &value);
+  assert(s.ok());
+  assert(value == "x");
+
+  s = db->Get(read_options, "y", &value);
+  assert(s.ok());
+  assert(value == "y");
+
+  // Cleanup
+  delete txn_db;
+  DestroyDB(kDBPath, options);
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/examples/options_file_example.cc b/src/rocksdb/examples/options_file_example.cc
new file mode 100644
index 000000000..00632f391
--- /dev/null
+++ b/src/rocksdb/examples/options_file_example.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file demonstrates how to use the utility functions defined in
+// rocksdb/utilities/options_util.h to open a rocksdb database without
+// remembering all the rocksdb options.
+#include <cstdio>
+#include <string>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/options_util.h"
+
+using ROCKSDB_NAMESPACE::BlockBasedTableOptions;
+using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
+using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
+using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::ConfigOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DBOptions;
+using ROCKSDB_NAMESPACE::NewLRUCache;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::Status;
+
+#if defined(OS_WIN)
+std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_options_file_example";
+#else
+std::string kDBPath = "/tmp/rocksdb_options_file_example";
+#endif
+
+namespace {
+// A dummy compaction filter
+class DummyCompactionFilter : public CompactionFilter {
+ public:
+  virtual ~DummyCompactionFilter() {}
+  virtual bool Filter(int level, const Slice& key, const Slice& existing_value,
+                      std::string* new_value, bool* value_changed) const {
+    return false;
+  }
+  virtual const char* Name() const { return "DummyCompactionFilter"; }
+};
+
+}  // namespace
+
+int main() {
+  DBOptions db_opt;
+  db_opt.create_if_missing = true;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.push_back(
+      {ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, ColumnFamilyOptions()});
+  cf_descs.push_back({"new_cf", ColumnFamilyOptions()});
+
+  // initialize BlockBasedTableOptions
+  auto cache = NewLRUCache(1 * 1024 * 1024 * 1024);
+  BlockBasedTableOptions bbt_opts;
+  bbt_opts.block_size = 32 * 1024;
+  bbt_opts.block_cache = cache;
+
+  // initialize column families options
+  std::unique_ptr<CompactionFilter> compaction_filter;
+  compaction_filter.reset(new DummyCompactionFilter());
+  cf_descs[0].options.table_factory.reset(NewBlockBasedTableFactory(bbt_opts));
+  cf_descs[0].options.compaction_filter = compaction_filter.get();
+  cf_descs[1].options.table_factory.reset(NewBlockBasedTableFactory(bbt_opts));
+
+  // destroy and open DB
+  DB* db;
+  Status s = ROCKSDB_NAMESPACE::DestroyDB(kDBPath,
+                                          Options(db_opt, cf_descs[0].options));
+  assert(s.ok());
+  s = DB::Open(Options(db_opt, cf_descs[0].options), kDBPath, &db);
+  assert(s.ok());
+
+  // Create column family, and rocksdb will persist the options.
+  ColumnFamilyHandle* cf;
+  s = db->CreateColumnFamily(ColumnFamilyOptions(), "new_cf", &cf);
+  assert(s.ok());
+
+  // close DB
+  delete cf;
+  delete db;
+
+  // In the following code, we will reopen the rocksdb instance using
+  // the options file stored in the db directory.
+
+  // Load the options file.
+  DBOptions loaded_db_opt;
+  std::vector<ColumnFamilyDescriptor> loaded_cf_descs;
+  ConfigOptions config_options;
+  s = LoadLatestOptions(config_options, kDBPath, &loaded_db_opt,
+                        &loaded_cf_descs);
+  assert(s.ok());
+  assert(loaded_db_opt.create_if_missing == db_opt.create_if_missing);
+
+  // Initialize pointer options for each column family
+  for (size_t i = 0; i < loaded_cf_descs.size(); ++i) {
+    auto* loaded_bbt_opt =
+        loaded_cf_descs[0]
+            .options.table_factory->GetOptions<BlockBasedTableOptions>();
+    // Expect the same as BlockBasedTableOptions will be loaded form file.
+    assert(loaded_bbt_opt->block_size == bbt_opts.block_size);
+    // However, block_cache needs to be manually initialized as documented
+    // in rocksdb/utilities/options_util.h.
+    loaded_bbt_opt->block_cache = cache;
+  }
+  // In addition, as pointer options are initialized with default value,
+  // we need to properly initialized all the pointer options if non-defalut
+  // values are used before calling DB::Open().
+  assert(loaded_cf_descs[0].options.compaction_filter == nullptr);
+  loaded_cf_descs[0].options.compaction_filter = compaction_filter.get();
+
+  // reopen the db using the loaded options.
+  std::vector<ColumnFamilyHandle*> handles;
+  s = DB::Open(loaded_db_opt, kDBPath, loaded_cf_descs, &handles, &db);
+  assert(s.ok());
+
+  // close DB
+  for (auto* handle : handles) {
+    delete handle;
+  }
+  delete db;
+}
diff --git a/src/rocksdb/examples/rocksdb_backup_restore_example.cc b/src/rocksdb/examples/rocksdb_backup_restore_example.cc
new file mode 100644
index 000000000..c833ed1c2
--- /dev/null
+++ b/src/rocksdb/examples/rocksdb_backup_restore_example.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/backup_engine.h"
+
+using ROCKSDB_NAMESPACE::BackupEngine;
+using ROCKSDB_NAMESPACE::BackupEngineOptions;
+using ROCKSDB_NAMESPACE::BackupEngineReadOnly;
+using ROCKSDB_NAMESPACE::BackupInfo;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::Env;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+#if defined(OS_WIN)
+std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_example";
+#else
+std::string kDBPath = "/tmp/rocksdb_example";
+#endif
+
+int main() {
+  DB* db;
+  Options options;
+  // Optimize RocksDB. This is the easiest way to get RocksDB to perform well
+  options.IncreaseParallelism();
+  options.OptimizeLevelStyleCompaction();
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // open DB
+  Status s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+
+  // Put key-value
+  db->Put(WriteOptions(), "key1", "value1");
+  assert(s.ok());
+
+  // create backup
+  BackupEngine* backup_engine;
+  s = BackupEngine::Open(Env::Default(),
+                         BackupEngineOptions("/tmp/rocksdb_example_backup"),
+                         &backup_engine);
+  assert(s.ok());
+
+  backup_engine->CreateNewBackup(db);
+  assert(s.ok());
+
+  std::vector<BackupInfo> backup_info;
+  backup_engine->GetBackupInfo(&backup_info);
+
+  s = backup_engine->VerifyBackup(1);
+  assert(s.ok());
+
+  // Put key-value
+  db->Put(WriteOptions(), "key2", "value2");
+  assert(s.ok());
+
+  db->Close();
+  delete db;
+  db = nullptr;
+
+  // restore db to backup 1
+  BackupEngineReadOnly* backup_engine_ro;
+  s = BackupEngineReadOnly::Open(
+      Env::Default(), BackupEngineOptions("/tmp/rocksdb_example_backup"),
+      &backup_engine_ro);
+  assert(s.ok());
+
+  s = backup_engine_ro->RestoreDBFromBackup(1, "/tmp/rocksdb_example",
+                                            "/tmp/rocksdb_example");
+  assert(s.ok());
+
+  // open db again
+  s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+
+  std::string value;
+  s = db->Get(ReadOptions(), "key1", &value);
+  assert(!s.IsNotFound());
+
+  s = db->Get(ReadOptions(), "key2", &value);
+  assert(s.IsNotFound());
+
+  delete backup_engine;
+  delete backup_engine_ro;
+  delete db;
+
+  return 0;
+}
diff --git a/src/rocksdb/examples/rocksdb_option_file_example.ini b/src/rocksdb/examples/rocksdb_option_file_example.ini
new file mode 100644
index 000000000..351890e51
--- /dev/null
+++ b/src/rocksdb/examples/rocksdb_option_file_example.ini
@@ -0,0 +1,142 @@
+# This is a RocksDB option file.
+#
+# A typical RocksDB options file has four sections, which are
+# Version section, DBOptions section, at least one CFOptions
+# section, and one TableOptions section for each column family.
+# The RocksDB options file in general follows the basic INI
+# file format with the following extensions / modifications:
+#
+#  * Escaped characters
+#    We escaped the following characters:
+#     - \n -- line feed - new line
+#     - \r -- carriage return
+#     - \\ -- backslash \
+#     - \: -- colon symbol :
+#     - \# -- hash tag #
+#  * Comments
+#    We support # style comments.  Comments can appear at the ending
+#    part of a line.
+#  * Statements
+#    A statement is of the form option_name = value.
+#    Each statement contains a '=', where extra white-spaces
+#    are supported. However, we don't support multi-lined statement.
+#    Furthermore, each line can only contain at most one statement.
+#  * Sections
+#    Sections are of the form [SecitonTitle "SectionArgument"],
+#    where section argument is optional.
+#  * List
+#    We use colon-separated string to represent a list.
+#    For instance, n1:n2:n3:n4 is a list containing four values.
+#
+# Below is an example of a RocksDB options file:
+[Version]
+  rocksdb_version=4.3.0
+  options_file_version=1.1
+
+[DBOptions]
+  stats_dump_period_sec=600
+  max_manifest_file_size=18446744073709551615
+  bytes_per_sync=8388608
+  delayed_write_rate=2097152
+  WAL_ttl_seconds=0
+  WAL_size_limit_MB=0
+  max_subcompactions=1
+  wal_dir=
+  wal_bytes_per_sync=0
+  db_write_buffer_size=0
+  keep_log_file_num=1000
+  table_cache_numshardbits=4
+  max_file_opening_threads=1
+  writable_file_max_buffer_size=1048576
+  random_access_max_buffer_size=1048576
+  use_fsync=false
+  max_total_wal_size=0
+  max_open_files=-1
+  skip_stats_update_on_db_open=false
+  max_background_compactions=16
+  manifest_preallocation_size=4194304
+  max_background_flushes=7
+  is_fd_close_on_exec=true
+  max_log_file_size=0
+  advise_random_on_open=true
+  create_missing_column_families=false
+  paranoid_checks=true
+  delete_obsolete_files_period_micros=21600000000
+  log_file_time_to_roll=0
+  compaction_readahead_size=0
+  create_if_missing=false
+  use_adaptive_mutex=false
+  enable_thread_tracking=false
+  allow_fallocate=true
+  error_if_exists=false
+  recycle_log_file_num=0
+  db_log_dir=
+  skip_log_error_on_recovery=false
+  new_table_reader_for_compaction_inputs=true
+  allow_mmap_reads=false
+  allow_mmap_writes=false
+  use_direct_reads=false
+  use_direct_writes=false
+
+
+[CFOptions "default"]
+  compaction_style=kCompactionStyleLevel
+  compaction_filter=nullptr
+  num_levels=6
+  table_factory=BlockBasedTable
+  comparator=leveldb.BytewiseComparator
+  max_sequential_skip_in_iterations=8
+  max_bytes_for_level_base=1073741824
+  memtable_prefix_bloom_probes=6
+  memtable_prefix_bloom_bits=0
+  memtable_prefix_bloom_huge_page_tlb_size=0
+  max_successive_merges=0
+  arena_block_size=16777216
+  min_write_buffer_number_to_merge=1
+  target_file_size_multiplier=1
+  source_compaction_factor=1
+  max_bytes_for_level_multiplier=8
+  max_bytes_for_level_multiplier_additional=2:3:5
+  compaction_filter_factory=nullptr
+  max_write_buffer_number=8
+  level0_stop_writes_trigger=20
+  compression=kSnappyCompression
+  level0_file_num_compaction_trigger=4
+  purge_redundant_kvs_while_flush=true
+  max_write_buffer_size_to_maintain=0
+  memtable_factory=SkipListFactory
+  max_grandparent_overlap_factor=8
+  expanded_compaction_factor=25
+  hard_pending_compaction_bytes_limit=137438953472
+  inplace_update_num_locks=10000
+  level_compaction_dynamic_level_bytes=true
+  level0_slowdown_writes_trigger=12
+  filter_deletes=false
+  verify_checksums_in_compaction=true
+  min_partial_merge_operands=2
+  paranoid_file_checks=false
+  target_file_size_base=134217728
+  optimize_filters_for_hits=false
+  merge_operator=PutOperator
+  compression_per_level=kNoCompression:kNoCompression:kNoCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression
+  compaction_measure_io_stats=false
+  prefix_extractor=nullptr
+  bloom_locality=0
+  write_buffer_size=134217728
+  disable_auto_compactions=false
+  inplace_update_support=false
+
+[TableOptions/BlockBasedTable "default"]
+  format_version=2
+  whole_key_filtering=true
+  no_block_cache=false
+  checksum=kCRC32c
+  filter_policy=rocksdb.BuiltinBloomFilter
+  block_size_deviation=10
+  block_size=8192
+  block_restart_interval=16
+  cache_index_and_filter_blocks=false
+  pin_l0_filter_and_index_blocks_in_cache=false
+  pin_top_level_index_and_filter=false
+  index_type=kBinarySearch
+  flush_block_policy_factory=FlushBlockBySizePolicyFactory
diff --git a/src/rocksdb/examples/simple_example.cc b/src/rocksdb/examples/simple_example.cc
new file mode 100644
index 000000000..2d49c4d14
--- /dev/null
+++ b/src/rocksdb/examples/simple_example.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdio>
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::PinnableSlice;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+#if defined(OS_WIN)
+std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_simple_example";
+#else
+std::string kDBPath = "/tmp/rocksdb_simple_example";
+#endif
+
+int main() {
+  DB* db;
+  Options options;
+  // Optimize RocksDB. This is the easiest way to get RocksDB to perform well
+  options.IncreaseParallelism();
+  options.OptimizeLevelStyleCompaction();
+  // create the DB if it's not already present
+  options.create_if_missing = true;
+
+  // open DB
+  Status s = DB::Open(options, kDBPath, &db);
+  assert(s.ok());
+
+  // Put key-value
+  s = db->Put(WriteOptions(), "key1", "value");
+  assert(s.ok());
+  std::string value;
+  // get value
+  s = db->Get(ReadOptions(), "key1", &value);
+  assert(s.ok());
+  assert(value == "value");
+
+  // atomically apply a set of updates
+  {
+    WriteBatch batch;
+    batch.Delete("key1");
+    batch.Put("key2", value);
+    s = db->Write(WriteOptions(), &batch);
+  }
+
+  s = db->Get(ReadOptions(), "key1", &value);
+  assert(s.IsNotFound());
+
+  db->Get(ReadOptions(), "key2", &value);
+  assert(value == "value");
+
+  {
+    PinnableSlice pinnable_val;
+    db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &pinnable_val);
+    assert(pinnable_val == "value");
+  }
+
+  {
+    std::string string_val;
+    // If it cannot pin the value, it copies the value to its internal buffer.
+    // The intenral buffer could be set during construction.
+    PinnableSlice pinnable_val(&string_val);
+    db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &pinnable_val);
+    assert(pinnable_val == "value");
+    // If the value is not pinned, the internal buffer must have the value.
+    assert(pinnable_val.IsPinned() || string_val == "value");
+  }
+
+  PinnableSlice pinnable_val;
+  s = db->Get(ReadOptions(), db->DefaultColumnFamily(), "key1", &pinnable_val);
+  assert(s.IsNotFound());
+  // Reset PinnableSlice after each use and before each reuse
+  pinnable_val.Reset();
+  db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &pinnable_val);
+  assert(pinnable_val == "value");
+  pinnable_val.Reset();
+  // The Slice pointed by pinnable_val is not valid after this point
+
+  delete db;
+
+  return 0;
+}
diff --git a/src/rocksdb/examples/transaction_example.cc b/src/rocksdb/examples/transaction_example.cc
new file mode 100644
index 000000000..08bcca1b6
--- /dev/null
+++ b/src/rocksdb/examples/transaction_example.cc
@@ -0,0 +1,198 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::Transaction;
+using ROCKSDB_NAMESPACE::TransactionDB;
+using ROCKSDB_NAMESPACE::TransactionDBOptions;
+using ROCKSDB_NAMESPACE::TransactionOptions;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+#if defined(OS_WIN)
+std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_transaction_example";
+#else
+std::string kDBPath = "/tmp/rocksdb_transaction_example";
+#endif
+
+int main() {
+  // open DB
+  Options options;
+  TransactionDBOptions txn_db_options;
+  options.create_if_missing = true;
+  TransactionDB* txn_db;
+
+  Status s = TransactionDB::Open(options, txn_db_options, kDBPath, &txn_db);
+  assert(s.ok());
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+
+  ////////////////////////////////////////////////////////
+  //
+  // Simple Transaction Example ("Read Committed")
+  //
+  ////////////////////////////////////////////////////////
+
+  // Start a transaction
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  assert(txn);
+
+  // Read a key in this transaction
+  s = txn->Get(read_options, "abc", &value);
+  assert(s.IsNotFound());
+
+  // Write a key in this transaction
+  s = txn->Put("abc", "def");
+  assert(s.ok());
+
+  // Read a key OUTSIDE this transaction. Does not affect txn.
+  s = txn_db->Get(read_options, "abc", &value);
+  assert(s.IsNotFound());
+
+  // Write a key OUTSIDE of this transaction.
+  // Does not affect txn since this is an unrelated key.
+  s = txn_db->Put(write_options, "xyz", "zzz");
+  assert(s.ok());
+
+  // Write a key OUTSIDE of this transaction.
+  // Fail because the key conflicts with the key written in txn.
+  s = txn_db->Put(write_options, "abc", "def");
+  assert(s.subcode() == Status::kLockTimeout);
+
+  // Value for key "xyz" has been committed, can be read in txn.
+  s = txn->Get(read_options, "xyz", &value);
+  assert(s.ok());
+  assert(value == "zzz");
+
+  // Commit transaction
+  s = txn->Commit();
+  assert(s.ok());
+  delete txn;
+
+  // Value is committed, can be read now.
+  s = txn_db->Get(read_options, "abc", &value);
+  assert(s.ok());
+  assert(value == "def");
+
+  ////////////////////////////////////////////////////////
+  //
+  // "Repeatable Read" (Snapshot Isolation) Example
+  //   -- Using a single Snapshot
+  //
+  ////////////////////////////////////////////////////////
+
+  // Set a snapshot at start of transaction by setting set_snapshot=true
+  txn_options.set_snapshot = true;
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+
+  const Snapshot* snapshot = txn->GetSnapshot();
+
+  // Write a key OUTSIDE of transaction
+  s = txn_db->Put(write_options, "abc", "xyz");
+  assert(s.ok());
+
+  // Read the latest committed value.
+  s = txn->Get(read_options, "abc", &value);
+  assert(s.ok());
+  assert(value == "xyz");
+
+  // Read the snapshotted value.
+  read_options.snapshot = snapshot;
+  s = txn->Get(read_options, "abc", &value);
+  assert(s.ok());
+  assert(value == "def");
+
+  // Attempt to read a key using the snapshot.  This will fail since
+  // the previous write outside this txn conflicts with this read.
+  s = txn->GetForUpdate(read_options, "abc", &value);
+  assert(s.IsBusy());
+
+  txn->Rollback();
+
+  // Snapshot will be released upon deleting the transaction.
+  delete txn;
+  // Clear snapshot from read options since it is no longer valid
+  read_options.snapshot = nullptr;
+  snapshot = nullptr;
+
+  ////////////////////////////////////////////////////////
+  //
+  // "Read Committed" (Monotonic Atomic Views) Example
+  //   --Using multiple Snapshots
+  //
+  ////////////////////////////////////////////////////////
+
+  // In this example, we set the snapshot multiple times.  This is probably
+  // only necessary if you have very strict isolation requirements to
+  // implement.
+
+  // Set a snapshot at start of transaction
+  txn_options.set_snapshot = true;
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+
+  // Do some reads and writes to key "x"
+  read_options.snapshot = txn_db->GetSnapshot();
+  s = txn->Get(read_options, "x", &value);
+  assert(s.IsNotFound());
+  s = txn->Put("x", "x");
+  assert(s.ok());
+
+  // Do a write outside of the transaction to key "y"
+  s = txn_db->Put(write_options, "y", "y1");
+  assert(s.ok());
+
+  // Set a new snapshot in the transaction
+  txn->SetSnapshot();
+  txn->SetSavePoint();
+  read_options.snapshot = txn_db->GetSnapshot();
+
+  // Do some reads and writes to key "y"
+  // Since the snapshot was advanced, the write done outside of the
+  // transaction does not conflict.
+  s = txn->GetForUpdate(read_options, "y", &value);
+  assert(s.ok());
+  assert(value == "y1");
+  s = txn->Put("y", "y2");
+  assert(s.ok());
+
+  // Decide we want to revert the last write from this transaction.
+  txn->RollbackToSavePoint();
+
+  // Commit.
+  s = txn->Commit();
+  assert(s.ok());
+  delete txn;
+  // Clear snapshot from read options since it is no longer valid
+  read_options.snapshot = nullptr;
+
+  // db state is at the save point.
+  s = txn_db->Get(read_options, "x", &value);
+  assert(s.ok());
+  assert(value == "x");
+
+  s = txn_db->Get(read_options, "y", &value);
+  assert(s.ok());
+  assert(value == "y1");
+
+  // Cleanup
+  delete txn_db;
+  ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options);
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/file/delete_scheduler.cc b/src/rocksdb/file/delete_scheduler.cc
new file mode 100644
index 000000000..b97a0f224
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler.cc
@@ -0,0 +1,411 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "file/delete_scheduler.h"
+
+#include <cinttypes>
+#include <thread>
+#include <vector>
+
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs,
+                                 int64_t rate_bytes_per_sec, Logger* info_log,
+                                 SstFileManagerImpl* sst_file_manager,
+                                 double max_trash_db_ratio,
+                                 uint64_t bytes_max_delete_chunk)
+    : clock_(clock),
+      fs_(fs),
+      total_trash_size_(0),
+      rate_bytes_per_sec_(rate_bytes_per_sec),
+      pending_files_(0),
+      bytes_max_delete_chunk_(bytes_max_delete_chunk),
+      closing_(false),
+      cv_(&mu_),
+      bg_thread_(nullptr),
+      info_log_(info_log),
+      sst_file_manager_(sst_file_manager),
+      max_trash_db_ratio_(max_trash_db_ratio) {
+  assert(sst_file_manager != nullptr);
+  assert(max_trash_db_ratio >= 0);
+  MaybeCreateBackgroundThread();
+}
+
+DeleteScheduler::~DeleteScheduler() {
+  {
+    InstrumentedMutexLock l(&mu_);
+    closing_ = true;
+    cv_.SignalAll();
+  }
+  if (bg_thread_) {
+    bg_thread_->join();
+  }
+  for (const auto& it : bg_errors_) {
+    it.second.PermitUncheckedError();
+  }
+}
+
+Status DeleteScheduler::DeleteFile(const std::string& file_path,
+                                   const std::string& dir_to_sync,
+                                   const bool force_bg) {
+  if (rate_bytes_per_sec_.load() <= 0 ||
+      (!force_bg &&
+       total_trash_size_.load() >
+           sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) {
+    // Rate limiting is disabled or trash size makes up more than
+    // max_trash_db_ratio_ (default 25%) of the total DB size
+    TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
+    Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+    if (s.ok()) {
+      s = sst_file_manager_->OnDeleteFile(file_path);
+      ROCKS_LOG_INFO(info_log_,
+                     "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64
+                     ", total_trash_size %" PRIu64 " max_trash_db_ratio %lf",
+                     file_path.c_str(), rate_bytes_per_sec_.load(),
+                     total_trash_size_.load(), max_trash_db_ratio_.load());
+      InstrumentedMutexLock l(&mu_);
+      RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
+    }
+    return s;
+  }
+
+  // Move file to trash
+  std::string trash_file;
+  Status s = MarkAsTrash(file_path, &trash_file);
+  ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(),
+                 s.ToString().c_str());
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s",
+                    file_path.c_str(), s.ToString().c_str());
+    s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
+    if (s.ok()) {
+      s = sst_file_manager_->OnDeleteFile(file_path);
+      ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately",
+                     trash_file.c_str());
+      InstrumentedMutexLock l(&mu_);
+      RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
+    }
+    return s;
+  }
+
+  // Update the total trash size
+  uint64_t trash_file_size = 0;
+  IOStatus io_s =
+      fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
+  if (io_s.ok()) {
+    total_trash_size_.fetch_add(trash_file_size);
+  }
+  //**TODO: What should we do if we failed to
+  // get the file size?
+
+  // Add file to delete queue
+  {
+    InstrumentedMutexLock l(&mu_);
+    RecordTick(stats_.get(), FILES_MARKED_TRASH);
+    queue_.emplace(trash_file, dir_to_sync);
+    pending_files_++;
+    if (pending_files_ == 1) {
+      cv_.SignalAll();
+    }
+  }
+  return s;
+}
+
+std::map<std::string, Status> DeleteScheduler::GetBackgroundErrors() {
+  InstrumentedMutexLock l(&mu_);
+  return bg_errors_;
+}
+
+const std::string DeleteScheduler::kTrashExtension = ".trash";
+bool DeleteScheduler::IsTrashFile(const std::string& file_path) {
+  return (file_path.size() >= kTrashExtension.size() &&
+          file_path.rfind(kTrashExtension) ==
+              file_path.size() - kTrashExtension.size());
+}
+
+Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+                                         const std::string& path) {
+  Status s;
+  // Check if there are any files marked as trash in this path
+  std::vector<std::string> files_in_path;
+  const auto& fs = env->GetFileSystem();
+  IOOptions io_opts;
+  io_opts.do_not_recurse = true;
+  s = fs->GetChildren(path, io_opts, &files_in_path,
+                      /*IODebugContext*=*/nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  for (const std::string& current_file : files_in_path) {
+    if (!DeleteScheduler::IsTrashFile(current_file)) {
+      // not a trash file, skip
+      continue;
+    }
+
+    Status file_delete;
+    std::string trash_file = path + "/" + current_file;
+    if (sfm) {
+      // We have an SstFileManager that will schedule the file delete
+      s = sfm->OnAddFile(trash_file);
+      file_delete = sfm->ScheduleFileDeletion(trash_file, path);
+    } else {
+      // Delete the file immediately
+      file_delete = env->DeleteFile(trash_file);
+    }
+
+    if (s.ok() && !file_delete.ok()) {
+      s = file_delete;
+    }
+  }
+
+  return s;
+}
+
+Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
+                                    std::string* trash_file) {
+  // Sanity check of the path
+  size_t idx = file_path.rfind("/");
+  if (idx == std::string::npos || idx == file_path.size() - 1) {
+    return Status::InvalidArgument("file_path is corrupted");
+  }
+
+  if (DeleteScheduler::IsTrashFile(file_path)) {
+    // This is already a trash file
+    *trash_file = file_path;
+    return Status::OK();
+  }
+
+  *trash_file = file_path + kTrashExtension;
+  // TODO(tec) : Implement Env::RenameFileIfNotExist and remove
+  //             file_move_mu mutex.
+  int cnt = 0;
+  Status s;
+  InstrumentedMutexLock l(&file_move_mu_);
+  while (true) {
+    s = fs_->FileExists(*trash_file, IOOptions(), nullptr);
+    if (s.IsNotFound()) {
+      // We found a path for our file in trash
+      s = fs_->RenameFile(file_path, *trash_file, IOOptions(), nullptr);
+      break;
+    } else if (s.ok()) {
+      // Name conflict, generate new random suffix
+      *trash_file = file_path + std::to_string(cnt) + kTrashExtension;
+    } else {
+      // Error during FileExists call, we cannot continue
+      break;
+    }
+    cnt++;
+  }
+  if (s.ok()) {
+    s = sst_file_manager_->OnMoveFile(file_path, *trash_file);
+  }
+  return s;
+}
+
+void DeleteScheduler::BackgroundEmptyTrash() {
+  TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash");
+
+  while (true) {
+    InstrumentedMutexLock l(&mu_);
+    while (queue_.empty() && !closing_) {
+      cv_.Wait();
+    }
+
+    if (closing_) {
+      return;
+    }
+
+    // Delete all files in queue_
+    uint64_t start_time = clock_->NowMicros();
+    uint64_t total_deleted_bytes = 0;
+    int64_t current_delete_rate = rate_bytes_per_sec_.load();
+    while (!queue_.empty() && !closing_) {
+      if (current_delete_rate != rate_bytes_per_sec_.load()) {
+        // User changed the delete rate
+        current_delete_rate = rate_bytes_per_sec_.load();
+        start_time = clock_->NowMicros();
+        total_deleted_bytes = 0;
+        ROCKS_LOG_INFO(info_log_, "rate_bytes_per_sec is changed to %" PRIi64,
+                       current_delete_rate);
+      }
+
+      // Get new file to delete
+      const FileAndDir& fad = queue_.front();
+      std::string path_in_trash = fad.fname;
+
+      // We don't need to hold the lock while deleting the file
+      mu_.Unlock();
+      uint64_t deleted_bytes = 0;
+      bool is_complete = true;
+      // Delete file from trash and update total_penlty value
+      Status s =
+          DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
+      total_deleted_bytes += deleted_bytes;
+      mu_.Lock();
+      if (is_complete) {
+        queue_.pop();
+      }
+
+      if (!s.ok()) {
+        bg_errors_[path_in_trash] = s;
+      }
+
+      // Apply penalty if necessary
+      uint64_t total_penalty;
+      if (current_delete_rate > 0) {
+        // rate limiting is enabled
+        total_penalty =
+            ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate);
+        ROCKS_LOG_INFO(info_log_,
+                       "Rate limiting is enabled with penalty %" PRIu64
+                       " after deleting file %s",
+                       total_penalty, path_in_trash.c_str());
+        while (!closing_ && !cv_.TimedWait(start_time + total_penalty)) {
+        }
+      } else {
+        // rate limiting is disabled
+        total_penalty = 0;
+        ROCKS_LOG_INFO(info_log_,
+                       "Rate limiting is disabled after deleting file %s",
+                       path_in_trash.c_str());
+      }
+      TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
+                               &total_penalty);
+
+      if (is_complete) {
+        pending_files_--;
+      }
+      if (pending_files_ == 0) {
+        // Unblock WaitForEmptyTrash since there are no more files waiting
+        // to be deleted
+        cv_.SignalAll();
+      }
+    }
+  }
+}
+
+Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
+                                        const std::string& dir_to_sync,
+                                        uint64_t* deleted_bytes,
+                                        bool* is_complete) {
+  uint64_t file_size;
+  Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
+  *is_complete = true;
+  TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
+  if (s.ok()) {
+    bool need_full_delete = true;
+    if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
+      uint64_t num_hard_links = 2;
+      // We don't have to worry aobut data race between linking a new
+      // file after the number of file link check and ftruncte because
+      // the file is now in trash and no hardlink is supposed to create
+      // to trash files by RocksDB.
+      Status my_status = fs_->NumFileLinks(path_in_trash, IOOptions(),
+                                           &num_hard_links, nullptr);
+      if (my_status.ok()) {
+        if (num_hard_links == 1) {
+          std::unique_ptr<FSWritableFile> wf;
+          my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(), &wf,
+                                              nullptr);
+          if (my_status.ok()) {
+            my_status = wf->Truncate(file_size - bytes_max_delete_chunk_,
+                                     IOOptions(), nullptr);
+            if (my_status.ok()) {
+              TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
+              my_status = wf->Fsync(IOOptions(), nullptr);
+            }
+          }
+          if (my_status.ok()) {
+            *deleted_bytes = bytes_max_delete_chunk_;
+            need_full_delete = false;
+            *is_complete = false;
+          } else {
+            ROCKS_LOG_WARN(info_log_,
+                           "Failed to partially delete %s from trash -- %s",
+                           path_in_trash.c_str(), my_status.ToString().c_str());
+          }
+        } else {
+          ROCKS_LOG_INFO(info_log_,
+                         "Cannot delete %s slowly through ftruncate from trash "
+                         "as it has other links",
+                         path_in_trash.c_str());
+        }
+      } else if (!num_link_error_printed_) {
+        ROCKS_LOG_INFO(
+            info_log_,
+            "Cannot delete files slowly through ftruncate from trash "
+            "as Env::NumFileLinks() returns error: %s",
+            my_status.ToString().c_str());
+        num_link_error_printed_ = true;
+      }
+    }
+
+    if (need_full_delete) {
+      s = fs_->DeleteFile(path_in_trash, IOOptions(), nullptr);
+      if (!dir_to_sync.empty()) {
+        std::unique_ptr<FSDirectory> dir_obj;
+        if (s.ok()) {
+          s = fs_->NewDirectory(dir_to_sync, IOOptions(), &dir_obj, nullptr);
+        }
+        if (s.ok()) {
+          s = dir_obj->FsyncWithDirOptions(
+              IOOptions(), nullptr,
+              DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted));
+          TEST_SYNC_POINT_CALLBACK(
+              "DeleteScheduler::DeleteTrashFile::AfterSyncDir",
+              reinterpret_cast<void*>(const_cast<std::string*>(&dir_to_sync)));
+        }
+      }
+      if (s.ok()) {
+        *deleted_bytes = file_size;
+        s = sst_file_manager_->OnDeleteFile(path_in_trash);
+      }
+    }
+  }
+  if (!s.ok()) {
+    // Error while getting file size or while deleting
+    ROCKS_LOG_ERROR(info_log_, "Failed to delete %s from trash -- %s",
+                    path_in_trash.c_str(), s.ToString().c_str());
+    *deleted_bytes = 0;
+  } else {
+    total_trash_size_.fetch_sub(*deleted_bytes);
+  }
+
+  return s;
+}
+
+void DeleteScheduler::WaitForEmptyTrash() {
+  InstrumentedMutexLock l(&mu_);
+  while (pending_files_ > 0 && !closing_) {
+    cv_.Wait();
+  }
+}
+
+void DeleteScheduler::MaybeCreateBackgroundThread() {
+  if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) {
+    bg_thread_.reset(
+        new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this));
+    ROCKS_LOG_INFO(info_log_,
+                   "Created background thread for deletion scheduler with "
+                   "rate_bytes_per_sec: %" PRIi64,
+                   rate_bytes_per_sec_.load());
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/file/delete_scheduler.h b/src/rocksdb/file/delete_scheduler.h
new file mode 100644
index 000000000..2904ec621
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler.h
@@ -0,0 +1,149 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <queue>
+#include <string>
+#include <thread>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/port.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class FileSystem;
+class Logger;
+class SstFileManagerImpl;
+class SystemClock;
+
+// DeleteScheduler allows the DB to enforce a rate limit on file deletion,
+// Instead of deleteing files immediately, files are marked as trash
+// and deleted in a background thread that apply sleep penalty between deletes
+// if they are happening in a rate faster than rate_bytes_per_sec,
+//
+// Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this
+// case DeleteScheduler will delete files immediately.
+class DeleteScheduler {
+ public:
+  DeleteScheduler(SystemClock* clock, FileSystem* fs,
+                  int64_t rate_bytes_per_sec, Logger* info_log,
+                  SstFileManagerImpl* sst_file_manager,
+                  double max_trash_db_ratio, uint64_t bytes_max_delete_chunk);
+
+  ~DeleteScheduler();
+
+  // Return delete rate limit in bytes per second
+  int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_.load(); }
+
+  // Set delete rate limit in bytes per second
+  void SetRateBytesPerSecond(int64_t bytes_per_sec) {
+    rate_bytes_per_sec_.store(bytes_per_sec);
+    MaybeCreateBackgroundThread();
+  }
+
+  // Mark file as trash directory and schedule its deletion. If force_bg is
+  // set, it forces the file to always be deleted in the background thread,
+  // except when rate limiting is disabled
+  Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
+                    const bool force_bg = false);
+
+  // Wait for all files being deleteing in the background to finish or for
+  // destructor to be called.
+  void WaitForEmptyTrash();
+
+  // Return a map containing errors that happened in BackgroundEmptyTrash
+  // file_path => error status
+  std::map<std::string, Status> GetBackgroundErrors();
+
+  uint64_t GetTotalTrashSize() { return total_trash_size_.load(); }
+
+  // Return trash/DB size ratio where new files will be deleted immediately
+  double GetMaxTrashDBRatio() { return max_trash_db_ratio_.load(); }
+
+  // Update trash/DB size ratio where new files will be deleted immediately
+  void SetMaxTrashDBRatio(double r) {
+    assert(r >= 0);
+    max_trash_db_ratio_.store(r);
+  }
+
+  static const std::string kTrashExtension;
+  static bool IsTrashFile(const std::string& file_path);
+
+  // Check if there are any .trash files in path, and schedule their deletion
+  // Or delete immediately if sst_file_manager is nullptr
+  static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
+                                 const std::string& path);
+
+  void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) {
+    InstrumentedMutexLock l(&mu_);
+    stats_ = stats;
+  }
+
+ private:
+  Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
+
+  Status DeleteTrashFile(const std::string& path_in_trash,
+                         const std::string& dir_to_sync,
+                         uint64_t* deleted_bytes, bool* is_complete);
+
+  void BackgroundEmptyTrash();
+
+  void MaybeCreateBackgroundThread();
+
+  SystemClock* clock_;
+  FileSystem* fs_;
+
+  // total size of trash files
+  std::atomic<uint64_t> total_trash_size_;
+  // Maximum number of bytes that should be deleted per second
+  std::atomic<int64_t> rate_bytes_per_sec_;
+  // Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_
+  InstrumentedMutex mu_;
+
+  struct FileAndDir {
+    FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
+    std::string fname;
+    std::string dir;  // empty will be skipped.
+  };
+
+  // Queue of trash files that need to be deleted
+  std::queue<FileAndDir> queue_;
+  // Number of trash files that are waiting to be deleted
+  int32_t pending_files_;
+  uint64_t bytes_max_delete_chunk_;
+  // Errors that happened in BackgroundEmptyTrash (file_path => error)
+  std::map<std::string, Status> bg_errors_;
+
+  bool num_link_error_printed_ = false;
+  // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
+  bool closing_;
+  // Condition variable signaled in these conditions
+  //    - pending_files_ value change from 0 => 1
+  //    - pending_files_ value change from 1 => 0
+  //    - closing_ value is set to true
+  InstrumentedCondVar cv_;
+  // Background thread running BackgroundEmptyTrash
+  std::unique_ptr<port::Thread> bg_thread_;
+  // Mutex to protect threads from file name conflicts
+  InstrumentedMutex file_move_mu_;
+  Logger* info_log_;
+  SstFileManagerImpl* sst_file_manager_;
+  // If the trash size constitutes for more than this fraction of the total DB
+  // size we will start deleting new files passed to DeleteScheduler
+  // immediately
+  std::atomic<double> max_trash_db_ratio_;
+  static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+  std::shared_ptr<Statistics> stats_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/file/delete_scheduler_test.cc b/src/rocksdb/file/delete_scheduler_test.cc
new file mode 100644
index 000000000..d825da32a
--- /dev/null
+++ b/src/rocksdb/file/delete_scheduler_test.cc
@@ -0,0 +1,724 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "file/delete_scheduler.h"
+
+#include <atomic>
+#include <cinttypes>
+#include <thread>
+#include <vector>
+
+#include "file/file_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteSchedulerTest : public testing::Test {
+ public:
+  DeleteSchedulerTest() : env_(Env::Default()) {
+    const int kNumDataDirs = 3;
+    dummy_files_dirs_.reserve(kNumDataDirs);
+    for (size_t i = 0; i < kNumDataDirs; ++i) {
+      dummy_files_dirs_.emplace_back(
+          test::PerThreadDBPath(env_, "delete_scheduler_dummy_data_dir") +
+          std::to_string(i));
+      DestroyAndCreateDir(dummy_files_dirs_.back());
+    }
+    stats_ = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  }
+
+  ~DeleteSchedulerTest() override {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    for (const auto& dummy_files_dir : dummy_files_dirs_) {
+      DestroyDir(env_, dummy_files_dir);
+    }
+  }
+
+  void DestroyAndCreateDir(const std::string& dir) {
+    ASSERT_OK(DestroyDir(env_, dir));
+    EXPECT_OK(env_->CreateDir(dir));
+  }
+
+  int CountNormalFiles(size_t dummy_files_dirs_idx = 0) {
+    std::vector<std::string> files_in_dir;
+    EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+                                &files_in_dir));
+
+    int normal_cnt = 0;
+    for (auto& f : files_in_dir) {
+      if (!DeleteScheduler::IsTrashFile(f)) {
+        normal_cnt++;
+      }
+    }
+    return normal_cnt;
+  }
+
+  int CountTrashFiles(size_t dummy_files_dirs_idx = 0) {
+    std::vector<std::string> files_in_dir;
+    EXPECT_OK(env_->GetChildren(dummy_files_dirs_[dummy_files_dirs_idx],
+                                &files_in_dir));
+
+    int trash_cnt = 0;
+    for (auto& f : files_in_dir) {
+      if (DeleteScheduler::IsTrashFile(f)) {
+        trash_cnt++;
+      }
+    }
+    return trash_cnt;
+  }
+
+  std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
+                           size_t dummy_files_dirs_idx = 0) {
+    std::string file_path =
+        dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
+    std::unique_ptr<WritableFile> f;
+    env_->NewWritableFile(file_path, &f, EnvOptions());
+    std::string data(size, 'A');
+    EXPECT_OK(f->Append(data));
+    EXPECT_OK(f->Close());
+    sst_file_mgr_->OnAddFile(file_path);
+    return file_path;
+  }
+
+  void NewDeleteScheduler() {
+    // Tests in this file are for DeleteScheduler component and don't create any
+    // DBs, so we need to set max_trash_db_ratio to 100% (instead of default
+    // 25%)
+    sst_file_mgr_.reset(
+        new SstFileManagerImpl(env_->GetSystemClock(), env_->GetFileSystem(),
+                               nullptr, rate_bytes_per_sec_,
+                               /* max_trash_db_ratio= */ 1.1, 128 * 1024));
+    delete_scheduler_ = sst_file_mgr_->delete_scheduler();
+    sst_file_mgr_->SetStatisticsPtr(stats_);
+  }
+
+  Env* env_;
+  std::vector<std::string> dummy_files_dirs_;
+  int64_t rate_bytes_per_sec_;
+  DeleteScheduler* delete_scheduler_;
+  std::unique_ptr<SstFileManagerImpl> sst_file_mgr_;
+  std::shared_ptr<Statistics> stats_;
+};
+
+// Test the basic functionality of DeleteScheduler (Rate Limiting).
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in trash
+// 4- Verify that BackgroundEmptyTrash used to correct penlties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::BasicRateLimiting:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+
+  std::vector<uint64_t> penalties;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+  int dir_synced = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile::AfterSyncDir", [&](void* arg) {
+        dir_synced++;
+        std::string* dir = reinterpret_cast<std::string*>(arg);
+        EXPECT_EQ(dummy_files_dirs_[0], *dir);
+      });
+
+  int num_files = 100;        // 100 files
+  uint64_t file_size = 1024;  // every file is 1 kb
+  std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+
+  for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+    penalties.clear();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndCreateDir(dummy_files_dirs_[0]);
+    rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+    NewDeleteScheduler();
+
+    dir_synced = 0;
+    // Create 100 dummy files, every file is 1 Kb
+    std::vector<std::string> generated_files;
+    for (int i = 0; i < num_files; i++) {
+      std::string file_name = "file" + std::to_string(i) + ".data";
+      generated_files.push_back(NewDummyFile(file_name, file_size));
+    }
+
+    // Delete dummy files and measure time spent to empty trash
+    for (int i = 0; i < num_files; i++) {
+      ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i],
+                                              dummy_files_dirs_[0]));
+    }
+    ASSERT_EQ(CountNormalFiles(), 0);
+
+    uint64_t delete_start_time = env_->NowMicros();
+    TEST_SYNC_POINT("DeleteSchedulerTest::BasicRateLimiting:1");
+    delete_scheduler_->WaitForEmptyTrash();
+    uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+    auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+    ASSERT_EQ(bg_errors.size(), 0);
+
+    uint64_t total_files_size = 0;
+    uint64_t expected_penlty = 0;
+    ASSERT_EQ(penalties.size(), num_files);
+    for (int i = 0; i < num_files; i++) {
+      total_files_size += file_size;
+      expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+      ASSERT_EQ(expected_penlty, penalties[i]);
+    }
+    ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+
+    ASSERT_EQ(num_files, dir_synced);
+
+    ASSERT_EQ(CountTrashFiles(), 0);
+    ASSERT_EQ(num_files, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+    ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DeleteSchedulerTest, MultiDirectoryDeletionsScheduled) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  rate_bytes_per_sec_ = 1 << 20;  // 1MB
+  NewDeleteScheduler();
+
+  // Generate dummy files in multiple directories
+  const size_t kNumFiles = dummy_files_dirs_.size();
+  const size_t kFileSize = 1 << 10;  // 1KB
+  std::vector<std::string> generated_files;
+  for (size_t i = 0; i < kNumFiles; i++) {
+    generated_files.push_back(NewDummyFile("file", kFileSize, i));
+    ASSERT_EQ(1, CountNormalFiles(i));
+  }
+
+  // Mark dummy files as trash
+  for (size_t i = 0; i < kNumFiles; i++) {
+    ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+    ASSERT_EQ(0, CountNormalFiles(i));
+    ASSERT_EQ(1, CountTrashFiles(i));
+  }
+  TEST_SYNC_POINT("DeleteSchedulerTest::MultiDbPathDeletionsScheduled:1");
+  delete_scheduler_->WaitForEmptyTrash();
+
+  // Verify dummy files eventually got deleted
+  for (size_t i = 0; i < kNumFiles; i++) {
+    ASSERT_EQ(0, CountNormalFiles(i));
+    ASSERT_EQ(0, CountTrashFiles(i));
+  }
+
+  ASSERT_EQ(kNumFiles, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+  ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Same as the BasicRateLimiting test but delete files in multiple threads.
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler using 10 threads
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Verify that BackgroundEmptyTrash used to correct penlties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::RateLimitingMultiThreaded:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+
+  std::vector<uint64_t> penalties;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+  int thread_cnt = 10;
+  int num_files = 10;         // 10 files per thread
+  uint64_t file_size = 1024;  // every file is 1 kb
+
+  std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+  for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+    penalties.clear();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndCreateDir(dummy_files_dirs_[0]);
+    rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+    NewDeleteScheduler();
+
+    // Create 100 dummy files, every file is 1 Kb
+    std::vector<std::string> generated_files;
+    for (int i = 0; i < num_files * thread_cnt; i++) {
+      std::string file_name = "file" + std::to_string(i) + ".data";
+      generated_files.push_back(NewDummyFile(file_name, file_size));
+    }
+
+    // Delete dummy files using 10 threads and measure time spent to empty trash
+    std::atomic<int> thread_num(0);
+    std::vector<port::Thread> threads;
+    std::function<void()> delete_thread = [&]() {
+      int idx = thread_num.fetch_add(1);
+      int range_start = idx * num_files;
+      int range_end = range_start + num_files;
+      for (int j = range_start; j < range_end; j++) {
+        ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[j], ""));
+      }
+    };
+
+    for (int i = 0; i < thread_cnt; i++) {
+      threads.emplace_back(delete_thread);
+    }
+
+    for (size_t i = 0; i < threads.size(); i++) {
+      threads[i].join();
+    }
+
+    uint64_t delete_start_time = env_->NowMicros();
+    TEST_SYNC_POINT("DeleteSchedulerTest::RateLimitingMultiThreaded:1");
+    delete_scheduler_->WaitForEmptyTrash();
+    uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+    auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+    ASSERT_EQ(bg_errors.size(), 0);
+
+    uint64_t total_files_size = 0;
+    uint64_t expected_penlty = 0;
+    ASSERT_EQ(penalties.size(), num_files * thread_cnt);
+    for (int i = 0; i < num_files * thread_cnt; i++) {
+      total_files_size += file_size;
+      expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+      ASSERT_EQ(expected_penlty, penalties[i]);
+    }
+    ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+
+    ASSERT_EQ(CountNormalFiles(), 0);
+    ASSERT_EQ(CountTrashFiles(), 0);
+    ASSERT_EQ(num_files * thread_cnt,
+              stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+    ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+// Disable rate limiting by setting rate_bytes_per_sec_ to 0 and make sure
+// that when DeleteScheduler delete a file it delete it immediately and don't
+// move it to trash
+TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
+  int bg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 0;
+  NewDeleteScheduler();
+  constexpr int num_files = 10;
+
+  for (int i = 0; i < num_files; i++) {
+    // Every file we delete will be deleted immediately
+    std::string dummy_file = NewDummyFile("dummy.data");
+    ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+    ASSERT_TRUE(env_->FileExists(dummy_file).IsNotFound());
+    ASSERT_EQ(CountNormalFiles(), 0);
+    ASSERT_EQ(CountTrashFiles(), 0);
+  }
+
+  ASSERT_EQ(bg_delete_file, 0);
+  ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+  ASSERT_EQ(num_files,
+            stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Testing that moving files to trash with the same name is not a problem
+// 1- Create 10 files with the same name "conflict.data"
+// 2- Delete the 10 files using DeleteScheduler
+// 3- Make sure that trash directory contain 10 files ("conflict.data" x 10)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that files are deleted from trash
+TEST_F(DeleteSchedulerTest, ConflictNames) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::ConflictNames:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 Mb/sec
+  NewDeleteScheduler();
+
+  // Create "conflict.data" and move it to trash 10 times
+  for (int i = 0; i < 10; i++) {
+    std::string dummy_file = NewDummyFile("conflict.data");
+    ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, ""));
+  }
+  ASSERT_EQ(CountNormalFiles(), 0);
+  // 10 files ("conflict.data" x 10) in trash
+  ASSERT_EQ(CountTrashFiles(), 10);
+
+  // Hold BackgroundEmptyTrash
+  TEST_SYNC_POINT("DeleteSchedulerTest::ConflictNames:1");
+  delete_scheduler_->WaitForEmptyTrash();
+  ASSERT_EQ(CountTrashFiles(), 0);
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+  ASSERT_EQ(10, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+  ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete the 10 files using DeleteScheduler (move them to trsah)
+// 3- Delete the 10 files directly (using env_->DeleteFile)
+// --- Hold DeleteScheduler::BackgroundEmptyTrash ---
+// 4- Make sure that DeleteScheduler failed to delete the 10 files and
+//    reported 10 background errors
+TEST_F(DeleteSchedulerTest, BackgroundError) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::BackgroundError:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 Mb/sec
+  NewDeleteScheduler();
+
+  // Generate 10 dummy files and move them to trash
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = "data_" + std::to_string(i) + ".data";
+    ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+  }
+  ASSERT_EQ(CountNormalFiles(), 0);
+  ASSERT_EQ(CountTrashFiles(), 10);
+
+  // Delete 10 files from trash, this will cause background errors in
+  // BackgroundEmptyTrash since we already deleted the files it was
+  // goind to delete
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = "data_" + std::to_string(i) + ".data.trash";
+    ASSERT_OK(env_->DeleteFile(dummy_files_dirs_[0] + "/" + file_name));
+  }
+
+  // Hold BackgroundEmptyTrash
+  TEST_SYNC_POINT("DeleteSchedulerTest::BackgroundError:1");
+  delete_scheduler_->WaitForEmptyTrash();
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 10);
+  for (const auto& it : bg_errors) {
+    ASSERT_TRUE(it.second.IsPathNotFound());
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create kTestFileNum dummy files
+// 2- Delete kTestFileNum dummy files using DeleteScheduler
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Make sure all files in trash directory were deleted
+// 5- Repeat previous steps 5 times
+TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) {
+  constexpr int kTestFileNum = 10;
+  std::atomic_int bg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  NewDeleteScheduler();
+
+  // If trash file is generated faster than deleting, delete_scheduler will
+  // delete it directly instead of waiting for background trash empty thread to
+  // clean it. Set the ratio higher to avoid that.
+  sst_file_mgr_->SetMaxTrashDBRatio(kTestFileNum + 1);
+
+  // Move files to trash, wait for empty trash, start again
+  for (int run = 1; run <= 5; run++) {
+    // Generate kTestFileNum dummy files and move them to trash
+    for (int i = 0; i < kTestFileNum; i++) {
+      std::string file_name = "data_" + std::to_string(i) + ".data";
+      ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+    }
+    ASSERT_EQ(CountNormalFiles(), 0);
+    delete_scheduler_->WaitForEmptyTrash();
+    ASSERT_EQ(bg_delete_file, kTestFileNum * run);
+    ASSERT_EQ(CountTrashFiles(), 0);
+
+    auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+    ASSERT_EQ(bg_errors.size(), 0);
+    ASSERT_EQ(kTestFileNum, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+    ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+  }
+
+  ASSERT_EQ(bg_delete_file, 5 * kTestFileNum);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DeletePartialFile) {
+  int bg_delete_file = 0;
+  int bg_fsync = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void*) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  NewDeleteScheduler();
+
+  // Should delete in 4 batch
+  ASSERT_OK(
+      delete_scheduler_->DeleteFile(NewDummyFile("data_1", 500 * 1024), ""));
+  ASSERT_OK(
+      delete_scheduler_->DeleteFile(NewDummyFile("data_2", 100 * 1024), ""));
+  // Should delete in 2 batch
+  ASSERT_OK(
+      delete_scheduler_->DeleteFile(NewDummyFile("data_2", 200 * 1024), ""));
+
+  delete_scheduler_->WaitForEmptyTrash();
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+  ASSERT_EQ(7, bg_delete_file);
+  ASSERT_EQ(4, bg_fsync);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+}
+
+#ifdef OS_LINUX
+TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
+  int bg_delete_file = 0;
+  int bg_fsync = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void*) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  NewDeleteScheduler();
+
+  std::string file1 = NewDummyFile("data_1", 500 * 1024);
+  std::string file2 = NewDummyFile("data_2", 100 * 1024);
+
+  ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
+  ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
+
+  // Should delete in 4 batch if there is no hardlink
+  ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
+  ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));
+
+  delete_scheduler_->WaitForEmptyTrash();
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+  ASSERT_EQ(2, bg_delete_file);
+  ASSERT_EQ(0, bg_fsync);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+}
+#endif
+
+// 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
+// 2- Delete 100 files using DeleteScheduler
+// 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
+// 4- Make sure that not all files were deleted from trash and that
+//    DeleteScheduler background thread did not delete all files
+TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) {
+  int bg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1;  // 1 Byte / sec
+  NewDeleteScheduler();
+
+  for (int i = 0; i < 100; i++) {
+    std::string file_name = "data_" + std::to_string(i) + ".data";
+    ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name), ""));
+  }
+
+  // Deleting 100 files will need >28 hours to delete
+  // we will delete the DeleteScheduler while delete queue is not empty
+  sst_file_mgr_.reset();
+
+  ASSERT_LT(bg_delete_file, 100);
+  ASSERT_GT(CountTrashFiles(), 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, DISABLED_DynamicRateLimiting1) {
+  std::vector<uint64_t> penalties;
+  int bg_delete_file = 0;
+  int fg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::DynamicRateLimiting1:1",
+       "DeleteScheduler::BackgroundEmptyTrash"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 0;  // Disable rate limiting initially
+  NewDeleteScheduler();
+
+  int num_files = 10;         // 10 files
+  uint64_t file_size = 1024;  // every file is 1 kb
+
+  std::vector<int64_t> delete_kbs_per_sec = {512, 200, 0, 100, 50, -2, 25};
+  for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+    penalties.clear();
+    bg_delete_file = 0;
+    fg_delete_file = 0;
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndCreateDir(dummy_files_dirs_[0]);
+    rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+    delete_scheduler_->SetRateBytesPerSecond(rate_bytes_per_sec_);
+
+    // Create 100 dummy files, every file is 1 Kb
+    std::vector<std::string> generated_files;
+    for (int i = 0; i < num_files; i++) {
+      std::string file_name = "file" + std::to_string(i) + ".data";
+      generated_files.push_back(NewDummyFile(file_name, file_size));
+    }
+
+    // Delete dummy files and measure time spent to empty trash
+    for (int i = 0; i < num_files; i++) {
+      ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
+    }
+    ASSERT_EQ(CountNormalFiles(), 0);
+
+    if (rate_bytes_per_sec_ > 0) {
+      uint64_t delete_start_time = env_->NowMicros();
+      TEST_SYNC_POINT("DeleteSchedulerTest::DynamicRateLimiting1:1");
+      delete_scheduler_->WaitForEmptyTrash();
+      uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+      auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+      ASSERT_EQ(bg_errors.size(), 0);
+
+      uint64_t total_files_size = 0;
+      uint64_t expected_penlty = 0;
+      ASSERT_EQ(penalties.size(), num_files);
+      for (int i = 0; i < num_files; i++) {
+        total_files_size += file_size;
+        expected_penlty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+        ASSERT_EQ(expected_penlty, penalties[i]);
+      }
+      ASSERT_GT(time_spent_deleting, expected_penlty * 0.9);
+      ASSERT_EQ(bg_delete_file, num_files);
+      ASSERT_EQ(fg_delete_file, 0);
+    } else {
+      ASSERT_EQ(penalties.size(), 0);
+      ASSERT_EQ(bg_delete_file, 0);
+      ASSERT_EQ(fg_delete_file, num_files);
+    }
+
+    ASSERT_EQ(CountTrashFiles(), 0);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) {
+  int bg_delete_file = 0;
+  int fg_delete_file = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void* /*arg*/) { bg_delete_file++; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  int num_files = 100;             // 100 files
+  uint64_t file_size = 1024 * 10;  // 100 KB as a file size
+  rate_bytes_per_sec_ = 1;         // 1 byte per sec (very slow trash delete)
+
+  NewDeleteScheduler();
+  delete_scheduler_->SetMaxTrashDBRatio(0.25);
+
+  std::vector<std::string> generated_files;
+  for (int i = 0; i < num_files; i++) {
+    std::string file_name = "file" + std::to_string(i) + ".data";
+    generated_files.push_back(NewDummyFile(file_name, file_size));
+  }
+
+  for (std::string& file_name : generated_files) {
+    ASSERT_OK(delete_scheduler_->DeleteFile(file_name, ""));
+  }
+
+  // When we end up with 26 files in trash we will start
+  // deleting new files immediately
+  ASSERT_EQ(fg_delete_file, 74);
+  ASSERT_EQ(26, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
+  ASSERT_EQ(74, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DeleteSchedulerTest, IsTrashCheck) {
+  // Trash files
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("x.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile(".trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.sst.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("/a/b/c/abc..sst.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("log.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("^^^^^.log.trash"));
+  ASSERT_TRUE(DeleteScheduler::IsTrashFile("abc.t.trash"));
+
+  // Not trash files
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.sst"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.txt"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sst"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("/a/b/c/abc.sstrash"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("^^^^^.trashh"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.ttrash"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile(".ttrash"));
+  ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+int main(int /*argc*/, char** /*argv*/) {
+  printf("DeleteScheduler is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/file/file_prefetch_buffer.cc b/src/rocksdb/file/file_prefetch_buffer.cc
new file mode 100644
index 000000000..4ac0d0504
--- /dev/null
+++ b/src/rocksdb/file/file_prefetch_buffer.cc
@@ -0,0 +1,918 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/file_prefetch_buffer.h"
+
+#include <algorithm>
+#include <cassert>
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment,
+                                               uint64_t offset,
+                                               size_t roundup_len,
+                                               uint32_t index, bool refit_tail,
+                                               uint64_t& chunk_len) {
+  uint64_t chunk_offset_in_buffer = 0;
+  bool copy_data_to_new_buffer = false;
+  // Check if requested bytes are in the existing buffer_.
+  // If only a few bytes exist -- reuse them & read only what is really needed.
+  //     This is typically the case of incremental reading of data.
+  // If no bytes exist in buffer -- full pread.
+  if (DoesBufferContainData(index) && IsOffsetInBuffer(offset, index)) {
+    // Only a few requested bytes are in the buffer. memmove those chunk of
+    // bytes to the beginning, and memcpy them back into the new buffer if a
+    // new buffer is created.
+    chunk_offset_in_buffer = Rounddown(
+        static_cast<size_t>(offset - bufs_[index].offset_), alignment);
+    chunk_len = static_cast<uint64_t>(bufs_[index].buffer_.CurrentSize()) -
+                chunk_offset_in_buffer;
+    assert(chunk_offset_in_buffer % alignment == 0);
+    assert(chunk_len % alignment == 0);
+    assert(chunk_offset_in_buffer + chunk_len <=
+           bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
+    if (chunk_len > 0) {
+      copy_data_to_new_buffer = true;
+    } else {
+      // this reset is not necessary, but just to be safe.
+      chunk_offset_in_buffer = 0;
+    }
+  }
+
+  // Create a new buffer only if current capacity is not sufficient, and memcopy
+  // bytes from old buffer if needed (i.e., if chunk_len is greater than 0).
+  if (bufs_[index].buffer_.Capacity() < roundup_len) {
+    bufs_[index].buffer_.Alignment(alignment);
+    bufs_[index].buffer_.AllocateNewBuffer(
+        static_cast<size_t>(roundup_len), copy_data_to_new_buffer,
+        chunk_offset_in_buffer, static_cast<size_t>(chunk_len));
+  } else if (chunk_len > 0 && refit_tail) {
+    // New buffer not needed. But memmove bytes from tail to the beginning since
+    // chunk_len is greater than 0.
+    bufs_[index].buffer_.RefitTail(static_cast<size_t>(chunk_offset_in_buffer),
+                                   static_cast<size_t>(chunk_len));
+  } else if (chunk_len > 0) {
+    // For async prefetching, it doesn't call RefitTail with chunk_len > 0.
+    // Allocate new buffer if needed because aligned buffer calculate remaining
+    // buffer as capacity_ - cursize_ which might not be the case in this as we
+    // are not refitting.
+    // TODO akanksha: Update the condition when asynchronous prefetching is
+    // stable.
+    bufs_[index].buffer_.Alignment(alignment);
+    bufs_[index].buffer_.AllocateNewBuffer(
+        static_cast<size_t>(roundup_len), copy_data_to_new_buffer,
+        chunk_offset_in_buffer, static_cast<size_t>(chunk_len));
+  }
+}
+
+Status FilePrefetchBuffer::Read(const IOOptions& opts,
+                                RandomAccessFileReader* reader,
+                                Env::IOPriority rate_limiter_priority,
+                                uint64_t read_len, uint64_t chunk_len,
+                                uint64_t rounddown_start, uint32_t index) {
+  Slice result;
+  Status s = reader->Read(opts, rounddown_start + chunk_len, read_len, &result,
+                          bufs_[index].buffer_.BufferStart() + chunk_len,
+                          /*aligned_buf=*/nullptr, rate_limiter_priority);
+#ifndef NDEBUG
+  if (result.size() < read_len) {
+    // Fake an IO error to force db_stress fault injection to ignore
+    // truncated read errors
+    IGNORE_STATUS_IF_ERROR(Status::IOError());
+  }
+#endif
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Update the buffer offset and size.
+  bufs_[index].offset_ = rounddown_start;
+  bufs_[index].buffer_.Size(static_cast<size_t>(chunk_len) + result.size());
+  return s;
+}
+
+Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts,
+                                     RandomAccessFileReader* reader,
+                                     uint64_t read_len,
+                                     uint64_t rounddown_start, uint32_t index) {
+  // callback for async read request.
+  auto fp = std::bind(&FilePrefetchBuffer::PrefetchAsyncCallback, this,
+                      std::placeholders::_1, std::placeholders::_2);
+  FSReadRequest req;
+  Slice result;
+  req.len = read_len;
+  req.offset = rounddown_start;
+  req.result = result;
+  req.scratch = bufs_[index].buffer_.BufferStart();
+  bufs_[index].async_req_len_ = req.len;
+
+  Status s =
+      reader->ReadAsync(req, opts, fp, &(bufs_[index].pos_),
+                        &(bufs_[index].io_handle_), &(bufs_[index].del_fn_),
+                        /*aligned_buf=*/nullptr);
+  req.status.PermitUncheckedError();
+  if (s.ok()) {
+    bufs_[index].async_read_in_progress_ = true;
+  }
+  return s;
+}
+
+Status FilePrefetchBuffer::Prefetch(const IOOptions& opts,
+                                    RandomAccessFileReader* reader,
+                                    uint64_t offset, size_t n,
+                                    Env::IOPriority rate_limiter_priority) {
+  if (!enable_ || reader == nullptr) {
+    return Status::OK();
+  }
+  TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start");
+
+  if (offset + n <= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
+    // All requested bytes are already in the curr_ buffer. So no need to Read
+    // again.
+    return Status::OK();
+  }
+
+  size_t alignment = reader->file()->GetRequiredBufferAlignment();
+  size_t offset_ = static_cast<size_t>(offset);
+  uint64_t rounddown_offset = Rounddown(offset_, alignment);
+  uint64_t roundup_end = Roundup(offset_ + n, alignment);
+  uint64_t roundup_len = roundup_end - rounddown_offset;
+  assert(roundup_len >= alignment);
+  assert(roundup_len % alignment == 0);
+
+  uint64_t chunk_len = 0;
+  CalculateOffsetAndLen(alignment, offset, roundup_len, curr_,
+                        true /*refit_tail*/, chunk_len);
+  size_t read_len = static_cast<size_t>(roundup_len - chunk_len);
+
+  Status s = Read(opts, reader, rate_limiter_priority, read_len, chunk_len,
+                  rounddown_offset, curr_);
+  return s;
+}
+
+// Copy data from src to third buffer.
+void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset,
+                                          size_t& length) {
+  if (length == 0) {
+    return;
+  }
+  uint64_t copy_offset = (offset - bufs_[src].offset_);
+  size_t copy_len = 0;
+  if (IsDataBlockInBuffer(offset, length, src)) {
+    // All the bytes are in src.
+    copy_len = length;
+  } else {
+    copy_len = bufs_[src].buffer_.CurrentSize() - copy_offset;
+  }
+
+  memcpy(bufs_[2].buffer_.BufferStart() + bufs_[2].buffer_.CurrentSize(),
+         bufs_[src].buffer_.BufferStart() + copy_offset, copy_len);
+
+  bufs_[2].buffer_.Size(bufs_[2].buffer_.CurrentSize() + copy_len);
+
+  // Update offset and length.
+  offset += copy_len;
+  length -= copy_len;
+
+  // length > 0 indicates it has consumed all data from the src buffer and it
+  // still needs to read more other buffer.
+  if (length > 0) {
+    bufs_[src].buffer_.Clear();
+  }
+}
+
+// Clear the buffers if it contains outdated data. Outdated data can be
+// because previous sequential reads were read from the cache instead of these
+// buffer. In that case outdated IOs should be aborted.
+void FilePrefetchBuffer::AbortIOIfNeeded(uint64_t offset) {
+  uint32_t second = curr_ ^ 1;
+  std::vector<void*> handles;
+  autovector<uint32_t> buf_pos;
+  if (IsBufferOutdatedWithAsyncProgress(offset, curr_)) {
+    handles.emplace_back(bufs_[curr_].io_handle_);
+    buf_pos.emplace_back(curr_);
+  }
+  if (IsBufferOutdatedWithAsyncProgress(offset, second)) {
+    handles.emplace_back(bufs_[second].io_handle_);
+    buf_pos.emplace_back(second);
+  }
+  if (!handles.empty()) {
+    StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+    Status s = fs_->AbortIO(handles);
+    assert(s.ok());
+  }
+
+  for (auto& pos : buf_pos) {
+    // Release io_handle.
+    DestroyAndClearIOHandle(pos);
+  }
+
+  if (bufs_[second].io_handle_ == nullptr) {
+    bufs_[second].async_read_in_progress_ = false;
+  }
+
+  if (bufs_[curr_].io_handle_ == nullptr) {
+    bufs_[curr_].async_read_in_progress_ = false;
+  }
+}
+
+void FilePrefetchBuffer::AbortAllIOs() {
+  uint32_t second = curr_ ^ 1;
+  std::vector<void*> handles;
+  for (uint32_t i = 0; i < 2; i++) {
+    if (bufs_[i].async_read_in_progress_ && bufs_[i].io_handle_ != nullptr) {
+      handles.emplace_back(bufs_[i].io_handle_);
+    }
+  }
+  if (!handles.empty()) {
+    StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+    Status s = fs_->AbortIO(handles);
+    assert(s.ok());
+  }
+
+  // Release io_handles.
+  if (bufs_[curr_].io_handle_ != nullptr && bufs_[curr_].del_fn_ != nullptr) {
+    DestroyAndClearIOHandle(curr_);
+  } else {
+    bufs_[curr_].async_read_in_progress_ = false;
+  }
+
+  if (bufs_[second].io_handle_ != nullptr && bufs_[second].del_fn_ != nullptr) {
+    DestroyAndClearIOHandle(second);
+  } else {
+    bufs_[second].async_read_in_progress_ = false;
+  }
+}
+
+// Clear the buffers if it contains outdated data. Outdated data can be
+// because previous sequential reads were read from the cache instead of these
+// buffer.
+void FilePrefetchBuffer::UpdateBuffersIfNeeded(uint64_t offset) {
+  uint32_t second = curr_ ^ 1;
+  if (IsBufferOutdated(offset, curr_)) {
+    bufs_[curr_].buffer_.Clear();
+  }
+  if (IsBufferOutdated(offset, second)) {
+    bufs_[second].buffer_.Clear();
+  }
+
+  {
+    // In case buffers do not align, reset second buffer. This can happen in
+    // case readahead_size is set.
+    if (!bufs_[second].async_read_in_progress_ &&
+        !bufs_[curr_].async_read_in_progress_) {
+      if (DoesBufferContainData(curr_)) {
+        if (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() !=
+            bufs_[second].offset_) {
+          bufs_[second].buffer_.Clear();
+        }
+      } else {
+        if (!IsOffsetInBuffer(offset, second)) {
+          bufs_[second].buffer_.Clear();
+        }
+      }
+    }
+  }
+
+  // If data starts from second buffer, make it curr_. Second buffer can be
+  // either partial filled, full or async read is in progress.
+  if (bufs_[second].async_read_in_progress_) {
+    if (IsOffsetInBufferWithAsyncProgress(offset, second)) {
+      curr_ = curr_ ^ 1;
+    }
+  } else {
+    if (DoesBufferContainData(second) && IsOffsetInBuffer(offset, second)) {
+      assert(bufs_[curr_].async_read_in_progress_ ||
+             bufs_[curr_].buffer_.CurrentSize() == 0);
+      curr_ = curr_ ^ 1;
+    }
+  }
+}
+
+void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
+  if (bufs_[curr_].async_read_in_progress_ && fs_ != nullptr) {
+    if (bufs_[curr_].io_handle_ != nullptr) {
+      // Wait for prefetch data to complete.
+      // No mutex is needed as async_read_in_progress behaves as mutex and is
+      // updated by main thread only.
+      std::vector<void*> handles;
+      handles.emplace_back(bufs_[curr_].io_handle_);
+      StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
+      fs_->Poll(handles, 1).PermitUncheckedError();
+    }
+
+    // Reset and Release io_handle after the Poll API as request has been
+    // completed.
+    DestroyAndClearIOHandle(curr_);
+  }
+  UpdateBuffersIfNeeded(offset);
+}
+
+Status FilePrefetchBuffer::HandleOverlappingData(
+    const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
+    size_t length, size_t readahead_size,
+    Env::IOPriority /*rate_limiter_priority*/, bool& copy_to_third_buffer,
+    uint64_t& tmp_offset, size_t& tmp_length) {
+  Status s;
+  size_t alignment = reader->file()->GetRequiredBufferAlignment();
+  uint32_t second;
+
+  // Check if the first buffer has the required offset and the async read is
+  // still in progress. This should only happen if a prefetch was initiated
+  // by Seek, but the next access is at another offset.
+  if (bufs_[curr_].async_read_in_progress_ &&
+      IsOffsetInBufferWithAsyncProgress(offset, curr_)) {
+    PollAndUpdateBuffersIfNeeded(offset);
+  }
+  second = curr_ ^ 1;
+
+  // If data is overlapping over two buffers, copy the data from curr_ and
+  // call ReadAsync on curr_.
+  if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
+      IsOffsetInBuffer(offset, curr_) &&
+      (/*Data extends over curr_ buffer and second buffer either has data or in
+         process of population=*/
+       (offset + length > bufs_[second].offset_) &&
+       (bufs_[second].async_read_in_progress_ ||
+        DoesBufferContainData(second)))) {
+    // Allocate new buffer to third buffer;
+    bufs_[2].buffer_.Clear();
+    bufs_[2].buffer_.Alignment(alignment);
+    bufs_[2].buffer_.AllocateNewBuffer(length);
+    bufs_[2].offset_ = offset;
+    copy_to_third_buffer = true;
+
+    CopyDataToBuffer(curr_, tmp_offset, tmp_length);
+
+    // Call async prefetching on curr_ since data has been consumed in curr_
+    // only if data lies within second buffer.
+    size_t second_size = bufs_[second].async_read_in_progress_
+                             ? bufs_[second].async_req_len_
+                             : bufs_[second].buffer_.CurrentSize();
+    if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size) {
+      uint64_t rounddown_start = bufs_[second].offset_ + second_size;
+      uint64_t roundup_end =
+          Roundup(rounddown_start + readahead_size, alignment);
+      uint64_t roundup_len = roundup_end - rounddown_start;
+      uint64_t chunk_len = 0;
+      CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, curr_,
+                            false, chunk_len);
+      assert(chunk_len == 0);
+      assert(roundup_len >= chunk_len);
+
+      bufs_[curr_].offset_ = rounddown_start;
+      uint64_t read_len = static_cast<size_t>(roundup_len - chunk_len);
+      s = ReadAsync(opts, reader, read_len, rounddown_start, curr_);
+      if (!s.ok()) {
+        DestroyAndClearIOHandle(curr_);
+        bufs_[curr_].buffer_.Clear();
+        return s;
+      }
+    }
+    curr_ = curr_ ^ 1;
+  }
+  return s;
+}
+// If async_io is enabled in case of sequential reads, PrefetchAsyncInternal is
+// called. When buffers are switched, we clear the curr_ buffer as we assume the
+// data has been consumed because of sequential reads.
+// Data in buffers will always be sequential with curr_ following second and
+// not vice versa.
+//
+// Scenarios for prefetching asynchronously:
+// Case1: If both buffers are empty, prefetch n + readahead_size_/2 bytes
+//        synchronously in curr_ and prefetch readahead_size_/2 async in second
+//        buffer.
+// Case2: If second buffer has partial or full data, make it current and
+//        prefetch readahead_size_/2 async in second buffer. In case of
+//        partial data, prefetch remaining bytes from size n synchronously to
+//        fulfill the requested bytes request.
+// Case3: If curr_ has partial data, prefetch remaining bytes from size n
+//        synchronously in curr_ to fulfill the requested bytes request and
+//        prefetch readahead_size_/2 bytes async in second buffer.
+// Case4: (Special case) If data is in both buffers, copy requested data from
+//        curr_, send async request on curr_, wait for poll to fill second
+//        buffer (if any), and copy remaining data from second buffer to third
+//        buffer.
+Status FilePrefetchBuffer::PrefetchAsyncInternal(
+    const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
+    size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority,
+    bool& copy_to_third_buffer) {
+  if (!enable_) {
+    return Status::OK();
+  }
+
+  TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start");
+
+  size_t alignment = reader->file()->GetRequiredBufferAlignment();
+  Status s;
+  uint64_t tmp_offset = offset;
+  size_t tmp_length = length;
+
+  // 1. Abort IO and swap buffers if needed to point curr_ to first buffer with
+  // data.
+  if (!explicit_prefetch_submitted_) {
+    AbortIOIfNeeded(offset);
+  }
+  UpdateBuffersIfNeeded(offset);
+
+  // 2. Handle overlapping data over two buffers. If data is overlapping then
+  //    during this call:
+  //   - data from curr_ is copied into third buffer,
+  //   - curr_ is send for async prefetching of further data if second buffer
+  //     contains remaining requested data or in progress for async prefetch,
+  //   - switch buffers and curr_ now points to second buffer to copy remaining
+  //     data.
+  s = HandleOverlappingData(opts, reader, offset, length, readahead_size,
+                            rate_limiter_priority, copy_to_third_buffer,
+                            tmp_offset, tmp_length);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // 3. Call Poll only if data is needed for the second buffer.
+  //    - Return if whole data is in curr_ and second buffer is in progress or
+  //      already full.
+  //    - If second buffer is empty, it will go for ReadAsync for second buffer.
+  if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
+      IsDataBlockInBuffer(offset, length, curr_)) {
+    // Whole data is in curr_.
+    UpdateBuffersIfNeeded(offset);
+    if (!IsSecondBuffEligibleForPrefetching()) {
+      return s;
+    }
+  } else {
+    // After poll request, curr_ might be empty because of IOError in
+    // callback while reading or may contain required data.
+    PollAndUpdateBuffersIfNeeded(offset);
+  }
+
+  if (copy_to_third_buffer) {
+    offset = tmp_offset;
+    length = tmp_length;
+  }
+
+  // 4. After polling and swapping buffers, if all the requested bytes are in
+  // curr_, it will only go for async prefetching.
+  // copy_to_third_buffer is a special case so it will be handled separately.
+  if (!copy_to_third_buffer && DoesBufferContainData(curr_) &&
+      IsDataBlockInBuffer(offset, length, curr_)) {
+    offset += length;
+    length = 0;
+
+    // Since async request was submitted directly by calling PrefetchAsync in
+    // last call, we don't need to prefetch further as this call is to poll
+    // the data submitted in previous call.
+    if (explicit_prefetch_submitted_) {
+      return s;
+    }
+    if (!IsSecondBuffEligibleForPrefetching()) {
+      return s;
+    }
+  }
+
+  uint32_t second = curr_ ^ 1;
+  assert(!bufs_[curr_].async_read_in_progress_);
+
+  // In case because of some IOError curr_ got empty, abort IO for second as
+  // well. Otherwise data might not align if more data needs to be read in curr_
+  // which might overlap with second buffer.
+  if (!DoesBufferContainData(curr_) && bufs_[second].async_read_in_progress_) {
+    if (bufs_[second].io_handle_ != nullptr) {
+      std::vector<void*> handles;
+      handles.emplace_back(bufs_[second].io_handle_);
+      {
+        StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+        Status status = fs_->AbortIO(handles);
+        assert(status.ok());
+      }
+    }
+    DestroyAndClearIOHandle(second);
+    bufs_[second].buffer_.Clear();
+  }
+
+  // 5. Data is overlapping i.e. some of the data has been copied to third
+  // buffer and remaining will be updated below.
+  if (copy_to_third_buffer && DoesBufferContainData(curr_)) {
+    CopyDataToBuffer(curr_, offset, length);
+
+    // Length == 0: All the requested data has been copied to third buffer and
+    // it has already gone for async prefetching. It can return without doing
+    // anything further.
+    // Length > 0: More data needs to be consumed so it will continue async
+    // and sync prefetching and copy the remaining data to third buffer in the
+    // end.
+    if (length == 0) {
+      return s;
+    }
+  }
+
+  // 6. Go for ReadAsync and Read (if needed).
+  size_t prefetch_size = length + readahead_size;
+  size_t _offset = static_cast<size_t>(offset);
+
+  // offset and size alignment for curr_ buffer with synchronous prefetching
+  uint64_t rounddown_start1 = Rounddown(_offset, alignment);
+  uint64_t roundup_end1 = Roundup(_offset + prefetch_size, alignment);
+  uint64_t roundup_len1 = roundup_end1 - rounddown_start1;
+  assert(roundup_len1 >= alignment);
+  assert(roundup_len1 % alignment == 0);
+  uint64_t chunk_len1 = 0;
+  uint64_t read_len1 = 0;
+
+  assert(!bufs_[second].async_read_in_progress_ &&
+         !DoesBufferContainData(second));
+
+  // For length == 0, skip the synchronous prefetching. read_len1 will be 0.
+  if (length > 0) {
+    CalculateOffsetAndLen(alignment, offset, roundup_len1, curr_,
+                          false /*refit_tail*/, chunk_len1);
+    assert(roundup_len1 >= chunk_len1);
+    read_len1 = static_cast<size_t>(roundup_len1 - chunk_len1);
+  }
+  {
+    // offset and size alignment for second buffer for asynchronous
+    // prefetching
+    uint64_t rounddown_start2 = roundup_end1;
+    uint64_t roundup_end2 =
+        Roundup(rounddown_start2 + readahead_size, alignment);
+
+    // For length == 0, do the asynchronous prefetching in second instead of
+    // synchronous prefetching in curr_.
+    if (length == 0) {
+      rounddown_start2 =
+          bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
+      roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment);
+    }
+
+    uint64_t roundup_len2 = roundup_end2 - rounddown_start2;
+    uint64_t chunk_len2 = 0;
+    CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second,
+                          false /*refit_tail*/, chunk_len2);
+    assert(chunk_len2 == 0);
+    // Update the buffer offset.
+    bufs_[second].offset_ = rounddown_start2;
+    assert(roundup_len2 >= chunk_len2);
+    uint64_t read_len2 = static_cast<size_t>(roundup_len2 - chunk_len2);
+    Status tmp_s = ReadAsync(opts, reader, read_len2, rounddown_start2, second);
+    if (!tmp_s.ok()) {
+      DestroyAndClearIOHandle(second);
+      bufs_[second].buffer_.Clear();
+    }
+  }
+
+  if (read_len1 > 0) {
+    s = Read(opts, reader, rate_limiter_priority, read_len1, chunk_len1,
+             rounddown_start1, curr_);
+    if (!s.ok()) {
+      if (bufs_[second].io_handle_ != nullptr) {
+        std::vector<void*> handles;
+        handles.emplace_back(bufs_[second].io_handle_);
+        {
+          StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+          Status status = fs_->AbortIO(handles);
+          assert(status.ok());
+        }
+      }
+      DestroyAndClearIOHandle(second);
+      bufs_[second].buffer_.Clear();
+      bufs_[curr_].buffer_.Clear();
+      return s;
+    }
+  }
+  // Copy remaining requested bytes to third_buffer.
+  if (copy_to_third_buffer && length > 0) {
+    CopyDataToBuffer(curr_, offset, length);
+  }
+  return s;
+}
+
+bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts,
+                                          RandomAccessFileReader* reader,
+                                          uint64_t offset, size_t n,
+                                          Slice* result, Status* status,
+                                          Env::IOPriority rate_limiter_priority,
+                                          bool for_compaction /* = false */) {
+  if (track_min_offset_ && offset < min_offset_read_) {
+    min_offset_read_ = static_cast<size_t>(offset);
+  }
+  if (!enable_ || (offset < bufs_[curr_].offset_)) {
+    return false;
+  }
+
+  // If the buffer contains only a few of the requested bytes:
+  //    If readahead is enabled: prefetch the remaining bytes + readahead bytes
+  //        and satisfy the request.
+  //    If readahead is not enabled: return false.
+  TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
+                           &readahead_size_);
+  if (offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
+    if (readahead_size_ > 0) {
+      Status s;
+      assert(reader != nullptr);
+      assert(max_readahead_size_ >= readahead_size_);
+      if (for_compaction) {
+        s = Prefetch(opts, reader, offset, std::max(n, readahead_size_),
+                     rate_limiter_priority);
+      } else {
+        if (implicit_auto_readahead_) {
+          if (!IsEligibleForPrefetch(offset, n)) {
+            // Ignore status as Prefetch is not called.
+            s.PermitUncheckedError();
+            return false;
+          }
+        }
+        s = Prefetch(opts, reader, offset, n + readahead_size_,
+                     rate_limiter_priority);
+      }
+      if (!s.ok()) {
+        if (status) {
+          *status = s;
+        }
+#ifndef NDEBUG
+        IGNORE_STATUS_IF_ERROR(s);
+#endif
+        return false;
+      }
+      readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
+    } else {
+      return false;
+    }
+  }
+  UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+
+  uint64_t offset_in_buffer = offset - bufs_[curr_].offset_;
+  *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n);
+  return true;
+}
+
+bool FilePrefetchBuffer::TryReadFromCacheAsync(
+    const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
+    size_t n, Slice* result, Status* status,
+    Env::IOPriority rate_limiter_priority) {
+  if (track_min_offset_ && offset < min_offset_read_) {
+    min_offset_read_ = static_cast<size_t>(offset);
+  }
+
+  if (!enable_) {
+    return false;
+  }
+
+  if (explicit_prefetch_submitted_) {
+    // explicit_prefetch_submitted_ is special case where it expects request
+    // submitted in PrefetchAsync should match with this request. Otherwise
+    // buffers will be outdated.
+    // Random offset called. So abort the IOs.
+    if (prev_offset_ != offset) {
+      AbortAllIOs();
+      bufs_[curr_].buffer_.Clear();
+      bufs_[curr_ ^ 1].buffer_.Clear();
+      explicit_prefetch_submitted_ = false;
+      return false;
+    }
+  }
+
+  if (!explicit_prefetch_submitted_ && offset < bufs_[curr_].offset_) {
+    return false;
+  }
+
+  bool prefetched = false;
+  bool copy_to_third_buffer = false;
+  // If the buffer contains only a few of the requested bytes:
+  //    If readahead is enabled: prefetch the remaining bytes + readahead bytes
+  //        and satisfy the request.
+  //    If readahead is not enabled: return false.
+  TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
+                           &readahead_size_);
+
+  if (explicit_prefetch_submitted_ ||
+      (bufs_[curr_].async_read_in_progress_ ||
+       offset + n >
+           bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize())) {
+    if (readahead_size_ > 0) {
+      Status s;
+      assert(reader != nullptr);
+      assert(max_readahead_size_ >= readahead_size_);
+
+      if (implicit_auto_readahead_) {
+        if (!IsEligibleForPrefetch(offset, n)) {
+          // Ignore status as Prefetch is not called.
+          s.PermitUncheckedError();
+          return false;
+        }
+      }
+      // Prefetch n + readahead_size_/2 synchronously as remaining
+      // readahead_size_/2 will be prefetched asynchronously.
+      s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2,
+                                rate_limiter_priority, copy_to_third_buffer);
+      explicit_prefetch_submitted_ = false;
+      if (!s.ok()) {
+        if (status) {
+          *status = s;
+        }
+#ifndef NDEBUG
+        IGNORE_STATUS_IF_ERROR(s);
+#endif
+        return false;
+      }
+      prefetched = explicit_prefetch_submitted_ ? false : true;
+    } else {
+      return false;
+    }
+  }
+
+  UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+
+  uint32_t index = curr_;
+  if (copy_to_third_buffer) {
+    index = 2;
+  }
+  uint64_t offset_in_buffer = offset - bufs_[index].offset_;
+  *result = Slice(bufs_[index].buffer_.BufferStart() + offset_in_buffer, n);
+  if (prefetched) {
+    readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
+  }
+  return true;
+}
+
+void FilePrefetchBuffer::PrefetchAsyncCallback(const FSReadRequest& req,
+                                               void* cb_arg) {
+  uint32_t index = *(static_cast<uint32_t*>(cb_arg));
+#ifndef NDEBUG
+  if (req.result.size() < req.len) {
+    // Fake an IO error to force db_stress fault injection to ignore
+    // truncated read errors
+    IGNORE_STATUS_IF_ERROR(Status::IOError());
+  }
+  IGNORE_STATUS_IF_ERROR(req.status);
+#endif
+
+  if (req.status.ok()) {
+    if (req.offset + req.result.size() <=
+        bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()) {
+      // All requested bytes are already in the buffer or no data is read
+      // because of EOF. So no need to update.
+      return;
+    }
+    if (req.offset < bufs_[index].offset_) {
+      // Next block to be read has changed (Recent read was not a sequential
+      // read). So ignore this read.
+      return;
+    }
+    size_t current_size = bufs_[index].buffer_.CurrentSize();
+    bufs_[index].buffer_.Size(current_size + req.result.size());
+  }
+}
+
+Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
+                                         RandomAccessFileReader* reader,
+                                         uint64_t offset, size_t n,
+                                         Slice* result) {
+  assert(reader != nullptr);
+  if (!enable_) {
+    return Status::NotSupported();
+  }
+
+  TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");
+
+  num_file_reads_ = 0;
+  explicit_prefetch_submitted_ = false;
+  bool is_eligible_for_prefetching = false;
+  if (readahead_size_ > 0 &&
+      (!implicit_auto_readahead_ ||
+       num_file_reads_ + 1 >= num_file_reads_for_auto_readahead_)) {
+    is_eligible_for_prefetching = true;
+  }
+
+  // 1. Cancel any pending async read to make code simpler as buffers can be out
+  // of sync.
+  AbortAllIOs();
+
+  // 2. Clear outdated data.
+  UpdateBuffersIfNeeded(offset);
+  uint32_t second = curr_ ^ 1;
+  // Since PrefetchAsync can be called on non sequential reads. So offset can
+  // be less than curr_ buffers' offset. In that case also it clears both
+  // buffers.
+  if (DoesBufferContainData(curr_) && !IsOffsetInBuffer(offset, curr_)) {
+    bufs_[curr_].buffer_.Clear();
+    bufs_[second].buffer_.Clear();
+  }
+
+  UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
+
+  bool data_found = false;
+
+  // 3. If curr_ has full data.
+  if (DoesBufferContainData(curr_) && IsDataBlockInBuffer(offset, n, curr_)) {
+    uint64_t offset_in_buffer = offset - bufs_[curr_].offset_;
+    *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n);
+    data_found = true;
+    // Update num_file_reads_ as TryReadFromCacheAsync won't be called for
+    // poll and update num_file_reads_ if data is found.
+    num_file_reads_++;
+
+    // 3.1 If second also has some data or is not eligible for prefetching,
+    // return.
+    if (!is_eligible_for_prefetching || DoesBufferContainData(second)) {
+      return Status::OK();
+    }
+  } else {
+    // Partial data in curr_.
+    bufs_[curr_].buffer_.Clear();
+  }
+  bufs_[second].buffer_.Clear();
+
+  Status s;
+  size_t alignment = reader->file()->GetRequiredBufferAlignment();
+  size_t prefetch_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0;
+  size_t offset_to_read = static_cast<size_t>(offset);
+  uint64_t rounddown_start1 = 0;
+  uint64_t roundup_end1 = 0;
+  uint64_t rounddown_start2 = 0;
+  uint64_t roundup_end2 = 0;
+  uint64_t chunk_len1 = 0;
+  uint64_t chunk_len2 = 0;
+  size_t read_len1 = 0;
+  size_t read_len2 = 0;
+
+  // - If curr_ is empty.
+  //   - Call async read for full data +  prefetch_size on curr_.
+  //   - Call async read for prefetch_size on second if eligible.
+  // - If curr_ is filled.
+  //   - prefetch_size on second.
+  // Calculate length and offsets for reading.
+  if (!DoesBufferContainData(curr_)) {
+    // Prefetch full data + prefetch_size in curr_.
+    rounddown_start1 = Rounddown(offset_to_read, alignment);
+    roundup_end1 = Roundup(offset_to_read + n + prefetch_size, alignment);
+    uint64_t roundup_len1 = roundup_end1 - rounddown_start1;
+    assert(roundup_len1 >= alignment);
+    assert(roundup_len1 % alignment == 0);
+
+    CalculateOffsetAndLen(alignment, rounddown_start1, roundup_len1, curr_,
+                          false, chunk_len1);
+    assert(chunk_len1 == 0);
+    assert(roundup_len1 >= chunk_len1);
+    read_len1 = static_cast<size_t>(roundup_len1 - chunk_len1);
+    bufs_[curr_].offset_ = rounddown_start1;
+  }
+
+  if (is_eligible_for_prefetching) {
+    if (DoesBufferContainData(curr_)) {
+      rounddown_start2 =
+          bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
+    } else {
+      rounddown_start2 = roundup_end1;
+    }
+
+    roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment);
+    uint64_t roundup_len2 = roundup_end2 - rounddown_start2;
+
+    assert(roundup_len2 >= alignment);
+    CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second,
+                          false, chunk_len2);
+    assert(chunk_len2 == 0);
+    assert(roundup_len2 >= chunk_len2);
+    read_len2 = static_cast<size_t>(roundup_len2 - chunk_len2);
+    // Update the buffer offset.
+    bufs_[second].offset_ = rounddown_start2;
+  }
+
+  if (read_len1) {
+    s = ReadAsync(opts, reader, read_len1, rounddown_start1, curr_);
+    if (!s.ok()) {
+      DestroyAndClearIOHandle(curr_);
+      bufs_[curr_].buffer_.Clear();
+      return s;
+    }
+    explicit_prefetch_submitted_ = true;
+    prev_len_ = 0;
+  }
+  if (read_len2) {
+    s = ReadAsync(opts, reader, read_len2, rounddown_start2, second);
+    if (!s.ok()) {
+      DestroyAndClearIOHandle(second);
+      bufs_[second].buffer_.Clear();
+      return s;
+    }
+    readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
+  }
+  return (data_found ? Status::OK() : Status::TryAgain());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/file_prefetch_buffer.h b/src/rocksdb/file/file_prefetch_buffer.h
new file mode 100644
index 000000000..a4a75fe2b
--- /dev/null
+++ b/src/rocksdb/file/file_prefetch_buffer.h
@@ -0,0 +1,446 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <sstream>
+#include <string>
+
+#include "file/readahead_file_info.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "util/aligned_buffer.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#define DEFAULT_DECREMENT 8 * 1024
+
+struct IOOptions;
+class RandomAccessFileReader;
+
+struct BufferInfo {
+  AlignedBuffer buffer_;
+
+  uint64_t offset_ = 0;
+
+  // Below parameters are used in case of async read flow.
+  // Length requested for in ReadAsync.
+  size_t async_req_len_ = 0;
+
+  // async_read_in_progress can be used as mutex. Callback can update the buffer
+  // and its size but async_read_in_progress is only set by main thread.
+  bool async_read_in_progress_ = false;
+
+  // io_handle is allocated and used by underlying file system in case of
+  // asynchronous reads.
+  void* io_handle_ = nullptr;
+
+  IOHandleDeleter del_fn_ = nullptr;
+
+  // pos represents the index of this buffer in vector of BufferInfo.
+  uint32_t pos_ = 0;
+};
+
+// FilePrefetchBuffer is a smart buffer to store and read data from a file.
+class FilePrefetchBuffer {
+ public:
+  // Constructor.
+  //
+  // All arguments are optional.
+  // readahead_size     : the initial readahead size.
+  // max_readahead_size : the maximum readahead size.
+  //   If max_readahead_size > readahead_size, the readahead size will be
+  //   doubled on every IO until max_readahead_size is hit.
+  //   Typically this is set as a multiple of readahead_size.
+  //   max_readahead_size should be greater than equal to readahead_size.
+  // enable : controls whether reading from the buffer is enabled.
+  //   If false, TryReadFromCache() always return false, and we only take stats
+  //   for the minimum offset if track_min_offset = true.
+  // track_min_offset : Track the minimum offset ever read and collect stats on
+  //   it. Used for adaptable readahead of the file footer/metadata.
+  // implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after
+  //   doing sequential scans for two times.
+  //
+  // Automatic readhead is enabled for a file if readahead_size
+  // and max_readahead_size are passed in.
+  // A user can construct a FilePrefetchBuffer without any arguments, but use
+  // `Prefetch` to load data into the buffer.
+  FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0,
+                     bool enable = true, bool track_min_offset = false,
+                     bool implicit_auto_readahead = false,
+                     uint64_t num_file_reads = 0,
+                     uint64_t num_file_reads_for_auto_readahead = 0,
+                     FileSystem* fs = nullptr, SystemClock* clock = nullptr,
+                     Statistics* stats = nullptr)
+      : curr_(0),
+        readahead_size_(readahead_size),
+        initial_auto_readahead_size_(readahead_size),
+        max_readahead_size_(max_readahead_size),
+        min_offset_read_(std::numeric_limits<size_t>::max()),
+        enable_(enable),
+        track_min_offset_(track_min_offset),
+        implicit_auto_readahead_(implicit_auto_readahead),
+        prev_offset_(0),
+        prev_len_(0),
+        num_file_reads_for_auto_readahead_(num_file_reads_for_auto_readahead),
+        num_file_reads_(num_file_reads),
+        explicit_prefetch_submitted_(false),
+        fs_(fs),
+        clock_(clock),
+        stats_(stats) {
+    assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) ||
+           (num_file_reads_ == 0));
+    // If ReadOptions.async_io is enabled, data is asynchronously filled in
+    // second buffer while curr_ is being consumed. If data is overlapping in
+    // two buffers, data is copied to third buffer to return continuous buffer.
+    bufs_.resize(3);
+    for (uint32_t i = 0; i < 2; i++) {
+      bufs_[i].pos_ = i;
+    }
+  }
+
+  ~FilePrefetchBuffer() {
+    // Abort any pending async read request before destroying the class object.
+    if (fs_ != nullptr) {
+      std::vector<void*> handles;
+      for (uint32_t i = 0; i < 2; i++) {
+        if (bufs_[i].async_read_in_progress_ &&
+            bufs_[i].io_handle_ != nullptr) {
+          handles.emplace_back(bufs_[i].io_handle_);
+        }
+      }
+      if (!handles.empty()) {
+        StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
+        Status s = fs_->AbortIO(handles);
+        assert(s.ok());
+      }
+    }
+
+    // Prefetch buffer bytes discarded.
+    uint64_t bytes_discarded = 0;
+    // Iterated over 2 buffers.
+    for (int i = 0; i < 2; i++) {
+      int first = i;
+      int second = i ^ 1;
+
+      if (DoesBufferContainData(first)) {
+        // If last block was read completely from first and some bytes in
+        // first buffer are still unconsumed.
+        if (prev_offset_ >= bufs_[first].offset_ &&
+            prev_offset_ + prev_len_ <
+                bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize()) {
+          bytes_discarded += bufs_[first].buffer_.CurrentSize() -
+                             (prev_offset_ + prev_len_ - bufs_[first].offset_);
+        }
+        // If data was in second buffer and some/whole block bytes were read
+        // from second buffer.
+        else if (prev_offset_ < bufs_[first].offset_ &&
+                 !DoesBufferContainData(second)) {
+          // If last block read was completely from different buffer, this
+          // buffer is unconsumed.
+          if (prev_offset_ + prev_len_ <= bufs_[first].offset_) {
+            bytes_discarded += bufs_[first].buffer_.CurrentSize();
+          }
+          // If last block read overlaps with this buffer and some data is
+          // still unconsumed and previous buffer (second) is not cleared.
+          else if (prev_offset_ + prev_len_ > bufs_[first].offset_ &&
+                   bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize() ==
+                       bufs_[second].offset_) {
+            bytes_discarded += bufs_[first].buffer_.CurrentSize() -
+                               (/*bytes read from this buffer=*/prev_len_ -
+                                (bufs_[first].offset_ - prev_offset_));
+          }
+        }
+      }
+    }
+
+    for (uint32_t i = 0; i < 2; i++) {
+      // Release io_handle.
+      DestroyAndClearIOHandle(i);
+    }
+    RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded);
+  }
+
+  // Load data into the buffer from a file.
+  // reader                : the file reader.
+  // offset                : the file offset to start reading from.
+  // n                     : the number of bytes to read.
+  // rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
+  //                         bypass.
+  Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader,
+                  uint64_t offset, size_t n,
+                  Env::IOPriority rate_limiter_priority);
+
+  // Request for reading the data from a file asynchronously.
+  // If data already exists in the buffer, result will be updated.
+  // reader                : the file reader.
+  // offset                : the file offset to start reading from.
+  // n                     : the number of bytes to read.
+  // result                : if data already exists in the buffer, result will
+  //                         be updated with the data.
+  //
+  // If data already exist in the buffer, it will return Status::OK, otherwise
+  // it will send asynchronous request and return Status::TryAgain.
+  Status PrefetchAsync(const IOOptions& opts, RandomAccessFileReader* reader,
+                       uint64_t offset, size_t n, Slice* result);
+
+  // Tries returning the data for a file read from this buffer if that data is
+  // in the buffer.
+  // It handles tracking the minimum read offset if track_min_offset = true.
+  // It also does the exponential readahead when readahead_size is set as part
+  // of the constructor.
+  //
+  // opts                  : the IO options to use.
+  // reader                : the file reader.
+  // offset                : the file offset.
+  // n                     : the number of bytes.
+  // result                : output buffer to put the data into.
+  // s                     : output status.
+  // rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
+  //                         bypass.
+  // for_compaction        : true if cache read is done for compaction read.
+  bool TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader,
+                        uint64_t offset, size_t n, Slice* result, Status* s,
+                        Env::IOPriority rate_limiter_priority,
+                        bool for_compaction = false);
+
+  bool TryReadFromCacheAsync(const IOOptions& opts,
+                             RandomAccessFileReader* reader, uint64_t offset,
+                             size_t n, Slice* result, Status* status,
+                             Env::IOPriority rate_limiter_priority);
+
+  // The minimum `offset` ever passed to TryReadFromCache(). This will nly be
+  // tracked if track_min_offset = true.
+  size_t min_offset_read() const { return min_offset_read_; }
+
+  // Called in case of implicit auto prefetching.
+  void UpdateReadPattern(const uint64_t& offset, const size_t& len,
+                         bool decrease_readaheadsize) {
+    if (decrease_readaheadsize) {
+      // Since this block was eligible for prefetch but it was found in
+      // cache, so check and decrease the readahead_size by 8KB (default)
+      // if eligible.
+      DecreaseReadAheadIfEligible(offset, len);
+    }
+    prev_offset_ = offset;
+    prev_len_ = len;
+    explicit_prefetch_submitted_ = false;
+  }
+
+  void GetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) {
+    readahead_info->readahead_size = readahead_size_;
+    readahead_info->num_file_reads = num_file_reads_;
+  }
+
+  void DecreaseReadAheadIfEligible(uint64_t offset, size_t size,
+                                   size_t value = DEFAULT_DECREMENT) {
+    // Decrease the readahead_size if
+    // - its enabled internally by RocksDB (implicit_auto_readahead_) and,
+    // - readahead_size is greater than 0 and,
+    // - this block would have called prefetch API if not found in cache for
+    //   which conditions are:
+    //   - few/no bytes are in buffer and,
+    //   - block is sequential with the previous read and,
+    //   - num_file_reads_ + 1 (including this read) >
+    //   num_file_reads_for_auto_readahead_
+    size_t curr_size = bufs_[curr_].async_read_in_progress_
+                           ? bufs_[curr_].async_req_len_
+                           : bufs_[curr_].buffer_.CurrentSize();
+    if (implicit_auto_readahead_ && readahead_size_ > 0) {
+      if ((offset + size > bufs_[curr_].offset_ + curr_size) &&
+          IsBlockSequential(offset) &&
+          (num_file_reads_ + 1 > num_file_reads_for_auto_readahead_)) {
+        readahead_size_ =
+            std::max(initial_auto_readahead_size_,
+                     (readahead_size_ >= value ? readahead_size_ - value : 0));
+      }
+    }
+  }
+
+  // Callback function passed to underlying FS in case of asynchronous reads.
+  void PrefetchAsyncCallback(const FSReadRequest& req, void* cb_arg);
+
+ private:
+  // Calculates roundoff offset and length to be prefetched based on alignment
+  // and data present in buffer_. It also allocates new buffer or refit tail if
+  // required.
+  void CalculateOffsetAndLen(size_t alignment, uint64_t offset,
+                             size_t roundup_len, uint32_t index,
+                             bool refit_tail, uint64_t& chunk_len);
+
+  void AbortIOIfNeeded(uint64_t offset);
+
+  void AbortAllIOs();
+
+  void UpdateBuffersIfNeeded(uint64_t offset);
+
+  // It calls Poll API if any there is any pending asynchronous request. It then
+  // checks if data is in any buffer. It clears the outdated data and swaps the
+  // buffers if required.
+  void PollAndUpdateBuffersIfNeeded(uint64_t offset);
+
+  Status PrefetchAsyncInternal(const IOOptions& opts,
+                               RandomAccessFileReader* reader, uint64_t offset,
+                               size_t length, size_t readahead_size,
+                               Env::IOPriority rate_limiter_priority,
+                               bool& copy_to_third_buffer);
+
+  Status Read(const IOOptions& opts, RandomAccessFileReader* reader,
+              Env::IOPriority rate_limiter_priority, uint64_t read_len,
+              uint64_t chunk_len, uint64_t rounddown_start, uint32_t index);
+
+  Status ReadAsync(const IOOptions& opts, RandomAccessFileReader* reader,
+                   uint64_t read_len, uint64_t rounddown_start, uint32_t index);
+
+  // Copy the data from src to third buffer.
+  void CopyDataToBuffer(uint32_t src, uint64_t& offset, size_t& length);
+
+  bool IsBlockSequential(const size_t& offset) {
+    return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset));
+  }
+
+  // Called in case of implicit auto prefetching.
+  void ResetValues() {
+    num_file_reads_ = 1;
+    readahead_size_ = initial_auto_readahead_size_;
+  }
+
+  // Called in case of implicit auto prefetching.
+  bool IsEligibleForPrefetch(uint64_t offset, size_t n) {
+    // Prefetch only if this read is sequential otherwise reset readahead_size_
+    // to initial value.
+    if (!IsBlockSequential(offset)) {
+      UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+      ResetValues();
+      return false;
+    }
+    num_file_reads_++;
+
+    // Since async request was submitted in last call directly by calling
+    // PrefetchAsync, it skips num_file_reads_ check as this call is to poll the
+    // data submitted in previous call.
+    if (explicit_prefetch_submitted_) {
+      return true;
+    }
+    if (num_file_reads_ <= num_file_reads_for_auto_readahead_) {
+      UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
+      return false;
+    }
+    return true;
+  }
+
+  // Helper functions.
+  bool IsDataBlockInBuffer(uint64_t offset, size_t length, uint32_t index) {
+    return (offset >= bufs_[index].offset_ &&
+            offset + length <=
+                bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
+  }
+  bool IsOffsetInBuffer(uint64_t offset, uint32_t index) {
+    return (offset >= bufs_[index].offset_ &&
+            offset < bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
+  }
+  bool DoesBufferContainData(uint32_t index) {
+    return bufs_[index].buffer_.CurrentSize() > 0;
+  }
+  bool IsBufferOutdated(uint64_t offset, uint32_t index) {
+    return (
+        !bufs_[index].async_read_in_progress_ && DoesBufferContainData(index) &&
+        offset >= bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
+  }
+  bool IsBufferOutdatedWithAsyncProgress(uint64_t offset, uint32_t index) {
+    return (bufs_[index].async_read_in_progress_ &&
+            bufs_[index].io_handle_ != nullptr &&
+            offset >= bufs_[index].offset_ + bufs_[index].async_req_len_);
+  }
+  bool IsOffsetInBufferWithAsyncProgress(uint64_t offset, uint32_t index) {
+    return (bufs_[index].async_read_in_progress_ &&
+            offset >= bufs_[index].offset_ &&
+            offset < bufs_[index].offset_ + bufs_[index].async_req_len_);
+  }
+
+  bool IsSecondBuffEligibleForPrefetching() {
+    uint32_t second = curr_ ^ 1;
+    if (bufs_[second].async_read_in_progress_) {
+      return false;
+    }
+    assert(!bufs_[curr_].async_read_in_progress_);
+
+    if (DoesBufferContainData(curr_) && DoesBufferContainData(second) &&
+        (bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() ==
+         bufs_[second].offset_)) {
+      return false;
+    }
+    bufs_[second].buffer_.Clear();
+    return true;
+  }
+
+  void DestroyAndClearIOHandle(uint32_t index) {
+    if (bufs_[index].io_handle_ != nullptr && bufs_[index].del_fn_ != nullptr) {
+      bufs_[index].del_fn_(bufs_[index].io_handle_);
+      bufs_[index].io_handle_ = nullptr;
+      bufs_[index].del_fn_ = nullptr;
+    }
+    bufs_[index].async_read_in_progress_ = false;
+  }
+
+  Status HandleOverlappingData(const IOOptions& opts,
+                               RandomAccessFileReader* reader, uint64_t offset,
+                               size_t length, size_t readahead_size,
+                               Env::IOPriority rate_limiter_priority,
+                               bool& copy_to_third_buffer, uint64_t& tmp_offset,
+                               size_t& tmp_length);
+
+  std::vector<BufferInfo> bufs_;
+  // curr_ represents the index for bufs_ indicating which buffer is being
+  // consumed currently.
+  uint32_t curr_;
+
+  size_t readahead_size_;
+  size_t initial_auto_readahead_size_;
+  // FilePrefetchBuffer object won't be created from Iterator flow if
+  // max_readahead_size_ = 0.
+  size_t max_readahead_size_;
+
+  // The minimum `offset` ever passed to TryReadFromCache().
+  size_t min_offset_read_;
+  // if false, TryReadFromCache() always return false, and we only take stats
+  // for track_min_offset_ if track_min_offset_ = true
+  bool enable_;
+  // If true, track minimum `offset` ever passed to TryReadFromCache(), which
+  // can be fetched from min_offset_read().
+  bool track_min_offset_;
+
+  // implicit_auto_readahead is enabled by rocksdb internally after 2
+  // sequential IOs.
+  bool implicit_auto_readahead_;
+  uint64_t prev_offset_;
+  size_t prev_len_;
+  // num_file_reads_ and num_file_reads_for_auto_readahead_ is only used when
+  // implicit_auto_readahead_ is set.
+  uint64_t num_file_reads_for_auto_readahead_;
+  uint64_t num_file_reads_;
+
+  // If explicit_prefetch_submitted_ is set then it indicates RocksDB called
+  // PrefetchAsync to submit request. It needs to call TryReadFromCacheAsync to
+  // poll the submitted request without checking if data is sequential and
+  // num_file_reads_.
+  bool explicit_prefetch_submitted_;
+
+  FileSystem* fs_;
+  SystemClock* clock_;
+  Statistics* stats_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/file_util.cc b/src/rocksdb/file/file_util.cc
new file mode 100644
index 000000000..7997d6e11
--- /dev/null
+++ b/src/rocksdb/file/file_util.cc
@@ -0,0 +1,282 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "file/file_util.h"
+
+#include <algorithm>
+#include <string>
+
+#include "file/random_access_file_reader.h"
+#include "file/sequence_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Utility function to copy a file up to a specified length
+IOStatus CopyFile(FileSystem* fs, const std::string& source,
+                  std::unique_ptr<WritableFileWriter>& dest_writer,
+                  uint64_t size, bool use_fsync,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  const Temperature temperature) {
+  FileOptions soptions;
+  IOStatus io_s;
+  std::unique_ptr<SequentialFileReader> src_reader;
+
+  {
+    soptions.temperature = temperature;
+    std::unique_ptr<FSSequentialFile> srcfile;
+    io_s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+
+    if (size == 0) {
+      // default argument means copy everything
+      io_s = fs->GetFileSize(source, IOOptions(), &size, nullptr);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+    }
+    src_reader.reset(
+        new SequentialFileReader(std::move(srcfile), source, io_tracer));
+  }
+
+  char buffer[4096];
+  Slice slice;
+  while (size > 0) {
+    size_t bytes_to_read = std::min(sizeof(buffer), static_cast<size_t>(size));
+    // TODO: rate limit copy file
+    io_s = status_to_io_status(
+        src_reader->Read(bytes_to_read, &slice, buffer,
+                         Env::IO_TOTAL /* rate_limiter_priority */));
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    if (slice.size() == 0) {
+      return IOStatus::Corruption("file too small");
+    }
+    io_s = dest_writer->Append(slice);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    size -= slice.size();
+  }
+  return dest_writer->Sync(use_fsync);
+}
+
+IOStatus CopyFile(FileSystem* fs, const std::string& source,
+                  const std::string& destination, uint64_t size, bool use_fsync,
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  const Temperature temperature) {
+  FileOptions options;
+  IOStatus io_s;
+  std::unique_ptr<WritableFileWriter> dest_writer;
+
+  {
+    options.temperature = temperature;
+    std::unique_ptr<FSWritableFile> destfile;
+    io_s = fs->NewWritableFile(destination, options, &destfile, nullptr);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+
+    dest_writer.reset(
+        new WritableFileWriter(std::move(destfile), destination, options));
+  }
+
+  return CopyFile(fs, source, dest_writer, size, use_fsync, io_tracer,
+                  temperature);
+}
+
+// Utility function to create a file with the provided contents
+IOStatus CreateFile(FileSystem* fs, const std::string& destination,
+                    const std::string& contents, bool use_fsync) {
+  const EnvOptions soptions;
+  IOStatus io_s;
+  std::unique_ptr<WritableFileWriter> dest_writer;
+
+  std::unique_ptr<FSWritableFile> destfile;
+  io_s = fs->NewWritableFile(destination, soptions, &destfile, nullptr);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  dest_writer.reset(
+      new WritableFileWriter(std::move(destfile), destination, soptions));
+  io_s = dest_writer->Append(Slice(contents));
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  return dest_writer->Sync(use_fsync);
+}
+
+Status DeleteDBFile(const ImmutableDBOptions* db_options,
+                    const std::string& fname, const std::string& dir_to_sync,
+                    const bool force_bg, const bool force_fg) {
+#ifndef ROCKSDB_LITE
+  SstFileManagerImpl* sfm =
+      static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
+  if (sfm && !force_fg) {
+    return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
+  } else {
+    return db_options->env->DeleteFile(fname);
+  }
+#else
+  (void)dir_to_sync;
+  (void)force_bg;
+  (void)force_fg;
+  // SstFileManager is not supported in ROCKSDB_LITE
+  // Delete file immediately
+  return db_options->env->DeleteFile(fname);
+#endif
+}
+
+// requested_checksum_func_name brings the function name of the checksum
+// generator in checksum_factory. Empty string is permitted, in which case the
+// name of the generator created by the factory is unchecked. When
+// `requested_checksum_func_name` is non-empty, however, the created generator's
+// name must match it, otherwise an `InvalidArgument` error is returned.
+IOStatus GenerateOneFileChecksum(
+    FileSystem* fs, const std::string& file_path,
+    FileChecksumGenFactory* checksum_factory,
+    const std::string& requested_checksum_func_name, std::string* file_checksum,
+    std::string* file_checksum_func_name,
+    size_t verify_checksums_readahead_size, bool allow_mmap_reads,
+    std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
+    Env::IOPriority rate_limiter_priority) {
+  if (checksum_factory == nullptr) {
+    return IOStatus::InvalidArgument("Checksum factory is invalid");
+  }
+  assert(file_checksum != nullptr);
+  assert(file_checksum_func_name != nullptr);
+
+  FileChecksumGenContext gen_context;
+  gen_context.requested_checksum_func_name = requested_checksum_func_name;
+  gen_context.file_name = file_path;
+  std::unique_ptr<FileChecksumGenerator> checksum_generator =
+      checksum_factory->CreateFileChecksumGenerator(gen_context);
+  if (checksum_generator == nullptr) {
+    std::string msg =
+        "Cannot get the file checksum generator based on the requested "
+        "checksum function name: " +
+        requested_checksum_func_name +
+        " from checksum factory: " + checksum_factory->Name();
+    return IOStatus::InvalidArgument(msg);
+  } else {
+    // For backward compatibility and use in file ingestion clients where there
+    // is no stored checksum function name, `requested_checksum_func_name` can
+    // be empty. If we give the requested checksum function name, we expect it
+    // is the same name of the checksum generator.
+    if (!requested_checksum_func_name.empty() &&
+        checksum_generator->Name() != requested_checksum_func_name) {
+      std::string msg = "Expected file checksum generator named '" +
+                        requested_checksum_func_name +
+                        "', while the factory created one "
+                        "named '" +
+                        checksum_generator->Name() + "'";
+      return IOStatus::InvalidArgument(msg);
+    }
+  }
+
+  uint64_t size;
+  IOStatus io_s;
+  std::unique_ptr<RandomAccessFileReader> reader;
+  {
+    std::unique_ptr<FSRandomAccessFile> r_file;
+    io_s = fs->NewRandomAccessFile(file_path, FileOptions(), &r_file, nullptr);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    io_s = fs->GetFileSize(file_path, IOOptions(), &size, nullptr);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    reader.reset(new RandomAccessFileReader(std::move(r_file), file_path,
+                                            nullptr /*Env*/, io_tracer, nullptr,
+                                            0, nullptr, rate_limiter));
+  }
+
+  // Found that 256 KB readahead size provides the best performance, based on
+  // experiments, for auto readahead. Experiment data is in PR #3282.
+  size_t default_max_read_ahead_size = 256 * 1024;
+  size_t readahead_size = (verify_checksums_readahead_size != 0)
+                              ? verify_checksums_readahead_size
+                              : default_max_read_ahead_size;
+
+  FilePrefetchBuffer prefetch_buffer(readahead_size /* readahead_size */,
+                                     readahead_size /* max_readahead_size */,
+                                     !allow_mmap_reads /* enable */);
+
+  Slice slice;
+  uint64_t offset = 0;
+  IOOptions opts;
+  while (size > 0) {
+    size_t bytes_to_read =
+        static_cast<size_t>(std::min(uint64_t{readahead_size}, size));
+    if (!prefetch_buffer.TryReadFromCache(
+            opts, reader.get(), offset, bytes_to_read, &slice,
+            nullptr /* status */, rate_limiter_priority,
+            false /* for_compaction */)) {
+      return IOStatus::Corruption("file read failed");
+    }
+    if (slice.size() == 0) {
+      return IOStatus::Corruption("file too small");
+    }
+    checksum_generator->Update(slice.data(), slice.size());
+    size -= slice.size();
+    offset += slice.size();
+  }
+  checksum_generator->Finalize();
+  *file_checksum = checksum_generator->GetChecksum();
+  *file_checksum_func_name = checksum_generator->Name();
+  return IOStatus::OK();
+}
+
+Status DestroyDir(Env* env, const std::string& dir) {
+  Status s;
+  if (env->FileExists(dir).IsNotFound()) {
+    return s;
+  }
+  std::vector<std::string> files_in_dir;
+  s = env->GetChildren(dir, &files_in_dir);
+  if (s.ok()) {
+    for (auto& file_in_dir : files_in_dir) {
+      std::string path = dir + "/" + file_in_dir;
+      bool is_dir = false;
+      s = env->IsDirectory(path, &is_dir);
+      if (s.ok()) {
+        if (is_dir) {
+          s = DestroyDir(env, path);
+        } else {
+          s = env->DeleteFile(path);
+        }
+      } else if (s.IsNotSupported()) {
+        s = Status::OK();
+      }
+      if (!s.ok()) {
+        // IsDirectory, etc. might not report NotFound
+        if (s.IsNotFound() || env->FileExists(path).IsNotFound()) {
+          // Allow files to be deleted externally
+          s = Status::OK();
+        } else {
+          break;
+        }
+      }
+    }
+  }
+
+  if (s.ok()) {
+    s = env->DeleteDir(dir);
+    // DeleteDir might or might not report NotFound
+    if (!s.ok() && (s.IsNotFound() || env->FileExists(dir).IsNotFound())) {
+      // Allow to be deleted externally
+      s = Status::OK();
+    }
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/file_util.h b/src/rocksdb/file/file_util.h
new file mode 100644
index 000000000..d46a7ba0e
--- /dev/null
+++ b/src/rocksdb/file/file_util.h
@@ -0,0 +1,89 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <string>
+
+#include "file/filename.h"
+#include "options/db_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/types.h"
+#include "trace_replay/io_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+// use_fsync maps to options.use_fsync, which determines the way that
+// the file is synced after copying.
+extern IOStatus CopyFile(FileSystem* fs, const std::string& source,
+                         std::unique_ptr<WritableFileWriter>& dest_writer,
+                         uint64_t size, bool use_fsync,
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         const Temperature temperature);
+extern IOStatus CopyFile(FileSystem* fs, const std::string& source,
+                         const std::string& destination, uint64_t size,
+                         bool use_fsync,
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         const Temperature temperature);
+inline IOStatus CopyFile(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& source,
+                         const std::string& destination, uint64_t size,
+                         bool use_fsync,
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         const Temperature temperature) {
+  return CopyFile(fs.get(), source, destination, size, use_fsync, io_tracer,
+                  temperature);
+}
+extern IOStatus CreateFile(FileSystem* fs, const std::string& destination,
+                           const std::string& contents, bool use_fsync);
+
+inline IOStatus CreateFile(const std::shared_ptr<FileSystem>& fs,
+                           const std::string& destination,
+                           const std::string& contents, bool use_fsync) {
+  return CreateFile(fs.get(), destination, contents, use_fsync);
+}
+
+extern Status DeleteDBFile(const ImmutableDBOptions* db_options,
+                           const std::string& fname,
+                           const std::string& path_to_sync, const bool force_bg,
+                           const bool force_fg);
+
+extern IOStatus GenerateOneFileChecksum(
+    FileSystem* fs, const std::string& file_path,
+    FileChecksumGenFactory* checksum_factory,
+    const std::string& requested_checksum_func_name, std::string* file_checksum,
+    std::string* file_checksum_func_name,
+    size_t verify_checksums_readahead_size, bool allow_mmap_reads,
+    std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
+    Env::IOPriority rate_limiter_priority);
+
+inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
+                                         SystemClock* clock, IOOptions& opts) {
+  if (ro.deadline.count()) {
+    std::chrono::microseconds now =
+        std::chrono::microseconds(clock->NowMicros());
+    // Ensure there is atleast 1us available. We don't want to pass a value of
+    // 0 as that means no timeout
+    if (now >= ro.deadline) {
+      return IOStatus::TimedOut("Deadline exceeded");
+    }
+    opts.timeout = ro.deadline - now;
+  }
+
+  if (ro.io_timeout.count() &&
+      (!opts.timeout.count() || ro.io_timeout < opts.timeout)) {
+    opts.timeout = ro.io_timeout;
+  }
+
+  opts.rate_limiter_priority = ro.rate_limiter_priority;
+  return IOStatus::OK();
+}
+
+// Test method to delete the input directory and all of its contents.
+// This method is destructive and is meant for use only in tests!!!
+Status DestroyDir(Env* env, const std::string& dir);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/filename.cc b/src/rocksdb/file/filename.cc
new file mode 100644
index 000000000..1e04c7339
--- /dev/null
+++ b/src/rocksdb/file/filename.cc
@@ -0,0 +1,523 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "file/filename.h"
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include <cinttypes>
+#include <vector>
+
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kCurrentFileName = "CURRENT";
+const std::string kOptionsFileNamePrefix = "OPTIONS-";
+const std::string kTempFileNameSuffix = "dbtmp";
+
+static const std::string kRocksDbTFileExt = "sst";
+static const std::string kLevelDbTFileExt = "ldb";
+static const std::string kRocksDBBlobFileExt = "blob";
+static const std::string kArchivalDirName = "archive";
+
+// Given a path, flatten the path name by replacing all chars not in
+// {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end.
+// Return the number of chars stored in dest not including the trailing '\0'.
+static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) {
+  const char suffix[] = "_LOG";
+
+  size_t write_idx = 0;
+  size_t i = 0;
+  size_t src_len = path.size();
+
+  while (i < src_len && write_idx < len - sizeof(suffix)) {
+    if ((path[i] >= 'a' && path[i] <= 'z') ||
+        (path[i] >= '0' && path[i] <= '9') ||
+        (path[i] >= 'A' && path[i] <= 'Z') || path[i] == '-' ||
+        path[i] == '.' || path[i] == '_') {
+      dest[write_idx++] = path[i];
+    } else {
+      if (i > 0) {
+        dest[write_idx++] = '_';
+      }
+    }
+    i++;
+  }
+  assert(sizeof(suffix) <= len - write_idx);
+  // "\0" is automatically added by snprintf
+  snprintf(dest + write_idx, len - write_idx, suffix);
+  write_idx += sizeof(suffix) - 1;
+  return write_idx;
+}
+
+static std::string MakeFileName(uint64_t number, const char* suffix) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "%06llu.%s",
+           static_cast<unsigned long long>(number), suffix);
+  return buf;
+}
+
+static std::string MakeFileName(const std::string& name, uint64_t number,
+                                const char* suffix) {
+  return name + "/" + MakeFileName(number, suffix);
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "log");
+}
+
+std::string LogFileName(uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(number, "log");
+}
+
+std::string BlobFileName(uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(number, kRocksDBBlobFileExt.c_str());
+}
+
+std::string BlobFileName(const std::string& blobdirname, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str());
+}
+
+std::string BlobFileName(const std::string& dbname, const std::string& blob_dir,
+                         uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(dbname + "/" + blob_dir, number,
+                      kRocksDBBlobFileExt.c_str());
+}
+
+std::string ArchivalDirectory(const std::string& dir) {
+  return dir + "/" + kArchivalDirName;
+}
+std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name + "/" + kArchivalDirName, number, "log");
+}
+
+std::string MakeTableFileName(const std::string& path, uint64_t number) {
+  return MakeFileName(path, number, kRocksDbTFileExt.c_str());
+}
+
+std::string MakeTableFileName(uint64_t number) {
+  return MakeFileName(number, kRocksDbTFileExt.c_str());
+}
+
+std::string Rocks2LevelTableFileName(const std::string& fullname) {
+  assert(fullname.size() > kRocksDbTFileExt.size() + 1);
+  if (fullname.size() <= kRocksDbTFileExt.size() + 1) {
+    return "";
+  }
+  return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) +
+         kLevelDbTFileExt;
+}
+
+uint64_t TableFileNameToNumber(const std::string& name) {
+  uint64_t number = 0;
+  uint64_t base = 1;
+  int pos = static_cast<int>(name.find_last_of('.'));
+  while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') {
+    number += (name[pos] - '0') * base;
+    base *= 10;
+  }
+  return number;
+}
+
+std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
+                          uint32_t path_id) {
+  assert(number > 0);
+  std::string path;
+  if (path_id >= db_paths.size()) {
+    path = db_paths.back().path;
+  } else {
+    path = db_paths[path_id].path;
+  }
+  return MakeTableFileName(path, number);
+}
+
+void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+                      size_t out_buf_size) {
+  if (path_id == 0) {
+    snprintf(out_buf, out_buf_size, "%" PRIu64, number);
+  } else {
+    snprintf(out_buf, out_buf_size,
+             "%" PRIu64
+             "(path "
+             "%" PRIu32 ")",
+             number, path_id);
+  }
+}
+
+std::string DescriptorFileName(uint64_t number) {
+  assert(number > 0);
+  char buf[100];
+  snprintf(buf, sizeof(buf), "MANIFEST-%06llu",
+           static_cast<unsigned long long>(number));
+  return buf;
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+  return dbname + "/" + DescriptorFileName(number);
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+  return dbname + "/" + kCurrentFileName;
+}
+
+std::string LockFileName(const std::string& dbname) { return dbname + "/LOCK"; }
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  return MakeFileName(dbname, number, kTempFileNameSuffix.c_str());
+}
+
+InfoLogPrefix::InfoLogPrefix(bool has_log_dir,
+                             const std::string& db_absolute_path) {
+  if (!has_log_dir) {
+    const char kInfoLogPrefix[] = "LOG";
+    // "\0" is automatically added to the end
+    snprintf(buf, sizeof(buf), kInfoLogPrefix);
+    prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1);
+  } else {
+    size_t len =
+        GetInfoLogPrefix(NormalizePath(db_absolute_path), buf, sizeof(buf));
+    prefix = Slice(buf, len);
+  }
+}
+
+std::string InfoLogFileName(const std::string& dbname,
+                            const std::string& db_path,
+                            const std::string& log_dir) {
+  if (log_dir.empty()) {
+    return dbname + "/LOG";
+  }
+
+  InfoLogPrefix info_log_prefix(true, db_path);
+  return log_dir + "/" + info_log_prefix.buf;
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+                               const std::string& db_path,
+                               const std::string& log_dir) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
+
+  if (log_dir.empty()) {
+    return dbname + "/LOG.old." + buf;
+  }
+
+  InfoLogPrefix info_log_prefix(true, db_path);
+  return log_dir + "/" + info_log_prefix.buf + ".old." + buf;
+}
+
+std::string OptionsFileName(uint64_t file_num) {
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%s%06" PRIu64,
+           kOptionsFileNamePrefix.c_str(), file_num);
+  return buffer;
+}
+std::string OptionsFileName(const std::string& dbname, uint64_t file_num) {
+  return dbname + "/" + OptionsFileName(file_num);
+}
+
+std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) {
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%s%06" PRIu64 ".%s",
+           kOptionsFileNamePrefix.c_str(), file_num,
+           kTempFileNameSuffix.c_str());
+  return dbname + "/" + buffer;
+}
+
+std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/METADB-%llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string IdentityFileName(const std::string& dbname) {
+  return dbname + "/IDENTITY";
+}
+
+// Owned filenames have the form:
+//    dbname/IDENTITY
+//    dbname/CURRENT
+//    dbname/LOCK
+//    dbname/<info_log_name_prefix>
+//    dbname/<info_log_name_prefix>.old.[0-9]+
+//    dbname/MANIFEST-[0-9]+
+//    dbname/[0-9]+.(log|sst|blob)
+//    dbname/METADB-[0-9]+
+//    dbname/OPTIONS-[0-9]+
+//    dbname/OPTIONS-[0-9]+.dbtmp
+//    Disregards / at the beginning
+bool ParseFileName(const std::string& fname, uint64_t* number, FileType* type,
+                   WalFileType* log_type) {
+  return ParseFileName(fname, number, "", type, log_type);
+}
+
+bool ParseFileName(const std::string& fname, uint64_t* number,
+                   const Slice& info_log_name_prefix, FileType* type,
+                   WalFileType* log_type) {
+  Slice rest(fname);
+  if (fname.length() > 1 && fname[0] == '/') {
+    rest.remove_prefix(1);
+  }
+  if (rest == "IDENTITY") {
+    *number = 0;
+    *type = kIdentityFile;
+  } else if (rest == "CURRENT") {
+    *number = 0;
+    *type = kCurrentFile;
+  } else if (rest == "LOCK") {
+    *number = 0;
+    *type = kDBLockFile;
+  } else if (info_log_name_prefix.size() > 0 &&
+             rest.starts_with(info_log_name_prefix)) {
+    rest.remove_prefix(info_log_name_prefix.size());
+    if (rest == "" || rest == ".old") {
+      *number = 0;
+      *type = kInfoLogFile;
+    } else if (rest.starts_with(".old.")) {
+      uint64_t ts_suffix;
+      // sizeof also counts the trailing '\0'.
+      rest.remove_prefix(sizeof(".old.") - 1);
+      if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+        return false;
+      }
+      *number = ts_suffix;
+      *type = kInfoLogFile;
+    }
+  } else if (rest.starts_with("MANIFEST-")) {
+    rest.remove_prefix(strlen("MANIFEST-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kDescriptorFile;
+    *number = num;
+  } else if (rest.starts_with("METADB-")) {
+    rest.remove_prefix(strlen("METADB-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kMetaDatabase;
+    *number = num;
+  } else if (rest.starts_with(kOptionsFileNamePrefix)) {
+    uint64_t ts_suffix;
+    bool is_temp_file = false;
+    rest.remove_prefix(kOptionsFileNamePrefix.size());
+    const std::string kTempFileNameSuffixWithDot =
+        std::string(".") + kTempFileNameSuffix;
+    if (rest.ends_with(kTempFileNameSuffixWithDot)) {
+      rest.remove_suffix(kTempFileNameSuffixWithDot.size());
+      is_temp_file = true;
+    }
+    if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+      return false;
+    }
+    *number = ts_suffix;
+    *type = is_temp_file ? kTempFile : kOptionsFile;
+  } else {
+    // Avoid strtoull() to keep filename format independent of the
+    // current locale
+    bool archive_dir_found = false;
+    if (rest.starts_with(kArchivalDirName)) {
+      if (rest.size() <= kArchivalDirName.size()) {
+        return false;
+      }
+      rest.remove_prefix(kArchivalDirName.size() +
+                         1);  // Add 1 to remove / also
+      if (log_type) {
+        *log_type = kArchivedLogFile;
+      }
+      archive_dir_found = true;
+    }
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (rest.size() <= 1 || rest[0] != '.') {
+      return false;
+    }
+    rest.remove_prefix(1);
+
+    Slice suffix = rest;
+    if (suffix == Slice("log")) {
+      *type = kWalFile;
+      if (log_type && !archive_dir_found) {
+        *log_type = kAliveLogFile;
+      }
+    } else if (archive_dir_found) {
+      return false;  // Archive dir can contain only log files
+    } else if (suffix == Slice(kRocksDbTFileExt) ||
+               suffix == Slice(kLevelDbTFileExt)) {
+      *type = kTableFile;
+    } else if (suffix == Slice(kRocksDBBlobFileExt)) {
+      *type = kBlobFile;
+    } else if (suffix == Slice(kTempFileNameSuffix)) {
+      *type = kTempFile;
+    } else {
+      return false;
+    }
+    *number = num;
+  }
+  return true;
+}
+
+IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
+                        uint64_t descriptor_number,
+                        FSDirectory* dir_contains_current_file) {
+  // Remove leading "dbname/" and add newline to manifest file name
+  std::string manifest = DescriptorFileName(dbname, descriptor_number);
+  Slice contents = manifest;
+  assert(contents.starts_with(dbname + "/"));
+  contents.remove_prefix(dbname.size() + 1);
+  std::string tmp = TempFileName(dbname, descriptor_number);
+  IOStatus s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true);
+  TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s);
+  if (s.ok()) {
+    TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2);
+    s = fs->RenameFile(tmp, CurrentFileName(dbname), IOOptions(), nullptr);
+    TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2);
+    TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s);
+  }
+  if (s.ok()) {
+    if (dir_contains_current_file != nullptr) {
+      s = dir_contains_current_file->FsyncWithDirOptions(
+          IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname)));
+    }
+  } else {
+    fs->DeleteFile(tmp, IOOptions(), nullptr)
+        .PermitUncheckedError();  // NOTE: PermitUncheckedError is acceptable
+                                  // here as we are already handling an error
+                                  // case, and this is just a best-attempt
+                                  // effort at some cleanup
+  }
+  return s;
+}
+
+Status SetIdentityFile(Env* env, const std::string& dbname,
+                       const std::string& db_id) {
+  std::string id;
+  if (db_id.empty()) {
+    id = env->GenerateUniqueId();
+  } else {
+    id = db_id;
+  }
+  assert(!id.empty());
+  // Reserve the filename dbname/000000.dbtmp for the temporary identity file
+  std::string tmp = TempFileName(dbname, 0);
+  std::string identify_file_name = IdentityFileName(dbname);
+  Status s = WriteStringToFile(env, id, tmp, true);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, identify_file_name);
+  }
+  std::unique_ptr<FSDirectory> dir_obj;
+  if (s.ok()) {
+    s = env->GetFileSystem()->NewDirectory(dbname, IOOptions(), &dir_obj,
+                                           nullptr);
+  }
+  if (s.ok()) {
+    s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
+                                     DirFsyncOptions(identify_file_name));
+  }
+
+  // The default Close() could return "NotSupported" and we bypass it
+  // if it is not impelmented. Detailed explanations can be found in
+  // db/db_impl/db_impl.h
+  if (s.ok()) {
+    Status temp_s = dir_obj->Close(IOOptions(), nullptr);
+    if (!temp_s.ok()) {
+      if (temp_s.IsNotSupported()) {
+        temp_s.PermitUncheckedError();
+      } else {
+        s = temp_s;
+      }
+    }
+  }
+  if (!s.ok()) {
+    env->DeleteFile(tmp).PermitUncheckedError();
+  }
+  return s;
+}
+
+IOStatus SyncManifest(const ImmutableDBOptions* db_options,
+                      WritableFileWriter* file) {
+  TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2);
+  StopWatch sw(db_options->clock, db_options->stats, MANIFEST_FILE_SYNC_MICROS);
+  return file->Sync(db_options->use_fsync);
+}
+
+Status GetInfoLogFiles(const std::shared_ptr<FileSystem>& fs,
+                       const std::string& db_log_dir, const std::string& dbname,
+                       std::string* parent_dir,
+                       std::vector<std::string>* info_log_list) {
+  assert(parent_dir != nullptr);
+  assert(info_log_list != nullptr);
+  uint64_t number = 0;
+  FileType type = kWalFile;
+
+  if (!db_log_dir.empty()) {
+    *parent_dir = db_log_dir;
+  } else {
+    *parent_dir = dbname;
+  }
+
+  InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname);
+
+  std::vector<std::string> file_names;
+  Status s = fs->GetChildren(*parent_dir, IOOptions(), &file_names, nullptr);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (auto& f : file_names) {
+    if (ParseFileName(f, &number, info_log_prefix.prefix, &type) &&
+        (type == kInfoLogFile)) {
+      info_log_list->push_back(f);
+    }
+  }
+  return Status::OK();
+}
+
+std::string NormalizePath(const std::string& path) {
+  std::string dst;
+
+  if (path.length() > 2 && path[0] == kFilePathSeparator &&
+      path[1] == kFilePathSeparator) {  // Handle UNC names
+    dst.append(2, kFilePathSeparator);
+  }
+
+  for (auto c : path) {
+    if (!dst.empty() && (c == kFilePathSeparator || c == '/') &&
+        (dst.back() == kFilePathSeparator || dst.back() == '/')) {
+      continue;
+    }
+    dst.push_back(c);
+  }
+  return dst;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/filename.h b/src/rocksdb/file/filename.h
new file mode 100644
index 000000000..2eb125b6a
--- /dev/null
+++ b/src/rocksdb/file/filename.h
@@ -0,0 +1,188 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#pragma once
+#include <stdint.h>
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Directory;
+class SystemClock;
+class WritableFileWriter;
+
+#ifdef OS_WIN
+constexpr char kFilePathSeparator = '\\';
+#else
+constexpr char kFilePathSeparator = '/';
+#endif
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+extern std::string LogFileName(uint64_t number);
+
+extern std::string BlobFileName(uint64_t number);
+
+extern std::string BlobFileName(const std::string& bdirname, uint64_t number);
+
+extern std::string BlobFileName(const std::string& dbname,
+                                const std::string& blob_dir, uint64_t number);
+
+extern std::string ArchivalDirectory(const std::string& dbname);
+
+//  Return the name of the archived log file with the specified number
+//  in the db named by "dbname". The result will be prefixed with "dbname".
+extern std::string ArchivedLogFileName(const std::string& dbname, uint64_t num);
+
+extern std::string MakeTableFileName(const std::string& name, uint64_t number);
+
+extern std::string MakeTableFileName(uint64_t number);
+
+// Return the name of sstable with LevelDB suffix
+// created from RocksDB sstable suffixed name
+extern std::string Rocks2LevelTableFileName(const std::string& fullname);
+
+// the reverse function of MakeTableFileName
+// TODO(yhchiang): could merge this function with ParseFileName()
+extern uint64_t TableFileNameToNumber(const std::string& name);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::vector<DbPath>& db_paths,
+                                 uint64_t number, uint32_t path_id);
+
+// Sufficient buffer size for FormatFileNumber.
+const size_t kFormatFileNumberBufSize = 38;
+
+extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
+                             size_t out_buf_size);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number.  The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+extern std::string DescriptorFileName(uint64_t number);
+
+extern const std::string kCurrentFileName;  // = "CURRENT"
+
+// Return the name of the current file.  This file contains the name
+// of the current manifest file.  The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname".  The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// A helper structure for prefix of info log names.
+struct InfoLogPrefix {
+  char buf[260];
+  Slice prefix;
+  // Prefix with DB absolute path encoded
+  explicit InfoLogPrefix(bool has_log_dir, const std::string& db_absolute_path);
+  // Default Prefix
+  explicit InfoLogPrefix();
+};
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+                                   const std::string& db_path = "",
+                                   const std::string& log_dir = "");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+                                      const std::string& db_path = "",
+                                      const std::string& log_dir = "");
+
+extern const std::string kOptionsFileNamePrefix;  // = "OPTIONS-"
+extern const std::string kTempFileNameSuffix;     // = "dbtmp"
+
+// Return a options file name given the "dbname" and file number.
+// Format:  OPTIONS-[number].dbtmp
+extern std::string OptionsFileName(const std::string& dbname,
+                                   uint64_t file_num);
+extern std::string OptionsFileName(uint64_t file_num);
+
+// Return a temp options file name given the "dbname" and file number.
+// Format:  OPTIONS-[number]
+extern std::string TempOptionsFileName(const std::string& dbname,
+                                       uint64_t file_num);
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname, uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh
+// either from a backup-image or empty
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number.  If the
+// filename was successfully parsed, returns true.  Else return false.
+// info_log_name_prefix is the path of info logs.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          const Slice& info_log_name_prefix, FileType* type,
+                          WalFileType* log_type = nullptr);
+// Same as previous function, but skip info log files.
+extern bool ParseFileName(const std::string& filename, uint64_t* number,
+                          FileType* type, WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number. On its success and when dir_contains_current_file is not
+// nullptr, the function will fsync the directory containing the CURRENT file
+// when
+extern IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname,
+                               uint64_t descriptor_number,
+                               FSDirectory* dir_contains_current_file);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname,
+                              const std::string& db_id = {});
+
+// Sync manifest file `file`.
+extern IOStatus SyncManifest(const ImmutableDBOptions* db_options,
+                             WritableFileWriter* file);
+
+// Return list of file names of info logs in `file_names`.
+// The list only contains file name. The parent directory name is stored
+// in `parent_dir`.
+// `db_log_dir` should be the one as in options.db_log_dir
+extern Status GetInfoLogFiles(const std::shared_ptr<FileSystem>& fs,
+                              const std::string& db_log_dir,
+                              const std::string& dbname,
+                              std::string* parent_dir,
+                              std::vector<std::string>* file_names);
+
+extern std::string NormalizePath(const std::string& path);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/line_file_reader.cc b/src/rocksdb/file/line_file_reader.cc
new file mode 100644
index 000000000..50c415dc6
--- /dev/null
+++ b/src/rocksdb/file/line_file_reader.cc
@@ -0,0 +1,73 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "file/line_file_reader.h"
+
+#include <cstring>
+
+#include "monitoring/iostats_context_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus LineFileReader::Create(const std::shared_ptr<FileSystem>& fs,
+                                const std::string& fname,
+                                const FileOptions& file_opts,
+                                std::unique_ptr<LineFileReader>* reader,
+                                IODebugContext* dbg,
+                                RateLimiter* rate_limiter) {
+  std::unique_ptr<FSSequentialFile> file;
+  IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg);
+  if (io_s.ok()) {
+    reader->reset(new LineFileReader(
+        std::move(file), fname, nullptr,
+        std::vector<std::shared_ptr<EventListener>>{}, rate_limiter));
+  }
+  return io_s;
+}
+
+bool LineFileReader::ReadLine(std::string* out,
+                              Env::IOPriority rate_limiter_priority) {
+  assert(out);
+  if (!io_status_.ok()) {
+    // Status should be checked (or permit unchecked) any time we return false.
+    io_status_.MustCheck();
+    return false;
+  }
+  out->clear();
+  for (;;) {
+    // Look for line delimiter
+    const char* found = static_cast<const char*>(
+        std::memchr(buf_begin_, '\n', buf_end_ - buf_begin_));
+    if (found) {
+      size_t len = found - buf_begin_;
+      out->append(buf_begin_, len);
+      buf_begin_ += len + /*delim*/ 1;
+      ++line_number_;
+      return true;
+    }
+    if (at_eof_) {
+      io_status_.MustCheck();
+      return false;
+    }
+    // else flush and reload buffer
+    out->append(buf_begin_, buf_end_ - buf_begin_);
+    Slice result;
+    io_status_ =
+        sfr_.Read(buf_.size(), &result, buf_.data(), rate_limiter_priority);
+    IOSTATS_ADD(bytes_read, result.size());
+    if (!io_status_.ok()) {
+      io_status_.MustCheck();
+      return false;
+    }
+    if (result.size() != buf_.size()) {
+      // The obscure way of indicating EOF
+      at_eof_ = true;
+    }
+    buf_begin_ = result.data();
+    buf_end_ = result.data() + result.size();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/line_file_reader.h b/src/rocksdb/file/line_file_reader.h
new file mode 100644
index 000000000..cc302d311
--- /dev/null
+++ b/src/rocksdb/file/line_file_reader.h
@@ -0,0 +1,60 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <array>
+
+#include "file/sequence_file_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper on top of Env::SequentialFile for reading text lines from a file.
+// Lines are delimited by '\n'. The last line may or may not include a
+// trailing newline. Uses SequentialFileReader internally.
+class LineFileReader {
+ private:
+  std::array<char, 8192> buf_;
+  SequentialFileReader sfr_;
+  IOStatus io_status_;
+  const char* buf_begin_ = buf_.data();
+  const char* buf_end_ = buf_.data();
+  size_t line_number_ = 0;
+  bool at_eof_ = false;
+
+ public:
+  // See SequentialFileReader constructors
+  template <typename... Args>
+  explicit LineFileReader(Args&&... args)
+      : sfr_(std::forward<Args&&>(args)...) {}
+
+  static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& fname, const FileOptions& file_opts,
+                         std::unique_ptr<LineFileReader>* reader,
+                         IODebugContext* dbg, RateLimiter* rate_limiter);
+
+  LineFileReader(const LineFileReader&) = delete;
+  LineFileReader& operator=(const LineFileReader&) = delete;
+
+  // Reads another line from the file, returning true on success and saving
+  // the line to `out`, without delimiter, or returning false on failure. You
+  // must check GetStatus() to determine whether the failure was just
+  // end-of-file (OK status) or an I/O error (another status).
+  // The internal rate limiter will be charged at the specified priority.
+  bool ReadLine(std::string* out, Env::IOPriority rate_limiter_priority);
+
+  // Returns the number of the line most recently returned from ReadLine.
+  // Return value is unspecified if ReadLine has returned false due to
+  // I/O error. After ReadLine returns false due to end-of-file, return
+  // value is the last returned line number, or equivalently the total
+  // number of lines returned.
+  size_t GetLineNumber() const { return line_number_; }
+
+  // Returns any error encountered during read. The error is considered
+  // permanent and no retry or recovery is attempted with the same
+  // LineFileReader.
+  const IOStatus& GetStatus() const { return io_status_; }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/prefetch_test.cc b/src/rocksdb/file/prefetch_test.cc
new file mode 100644
index 000000000..438286bfc
--- /dev/null
+++ b/src/rocksdb/file/prefetch_test.cc
@@ -0,0 +1,2109 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "file/file_prefetch_buffer.h"
+#include "file/file_util.h"
+#include "rocksdb/file_system.h"
+#include "test_util/sync_point.h"
+#ifdef GFLAGS
+#include "tools/io_tracer_parser_tool.h"
+#endif
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MockFS;
+
+class MockRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+  MockRandomAccessFile(std::unique_ptr<FSRandomAccessFile>& file,
+                       bool support_prefetch, std::atomic_int& prefetch_count)
+      : FSRandomAccessFileOwnerWrapper(std::move(file)),
+        support_prefetch_(support_prefetch),
+        prefetch_count_(prefetch_count) {}
+
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    if (support_prefetch_) {
+      prefetch_count_.fetch_add(1);
+      return target()->Prefetch(offset, n, options, dbg);
+    } else {
+      return IOStatus::NotSupported("Prefetch not supported");
+    }
+  }
+
+ private:
+  const bool support_prefetch_;
+  std::atomic_int& prefetch_count_;
+};
+
+class MockFS : public FileSystemWrapper {
+ public:
+  explicit MockFS(const std::shared_ptr<FileSystem>& wrapped,
+                  bool support_prefetch)
+      : FileSystemWrapper(wrapped), support_prefetch_(support_prefetch) {}
+
+  static const char* kClassName() { return "MockFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    std::unique_ptr<FSRandomAccessFile> file;
+    IOStatus s;
+    s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+    result->reset(
+        new MockRandomAccessFile(file, support_prefetch_, prefetch_count_));
+    return s;
+  }
+
+  void ClearPrefetchCount() { prefetch_count_ = 0; }
+
+  bool IsPrefetchCalled() { return prefetch_count_ > 0; }
+
+  int GetPrefetchCount() {
+    return prefetch_count_.load(std::memory_order_relaxed);
+  }
+
+ private:
+  const bool support_prefetch_;
+  std::atomic_int prefetch_count_{0};
+};
+
+class PrefetchTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  PrefetchTest() : DBTestBase("prefetch_test", true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest,
+                        ::testing::Combine(::testing::Bool(),
+                                           ::testing::Bool()));
+
+std::string BuildKey(int num, std::string postfix = "") {
+  return "my_key_" + std::to_string(num) + postfix;
+}
+
+TEST_P(PrefetchTest, Basic) {
+  // First param is if the mockFS support_prefetch or not
+  bool support_prefetch =
+      std::get<0>(GetParam()) &&
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  // Second param is if directIO is enabled or not
+  bool use_direct_io = std::get<1>(GetParam());
+  const int kNumKeys = 1100;
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+
+  int buff_prefetch_count = 0;
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  // create first key range
+  WriteBatch batch;
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(BuildKey(i), "value for range 1 key"));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  // create second key range
+  batch.Clear();
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(BuildKey(i, "key2"), "value for range 2 key"));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  // delete second key range
+  batch.Clear();
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Delete(BuildKey(i, "key2")));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  // compact database
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+
+  // commenting out the line below causes the example to work correctly
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  if (support_prefetch && !use_direct_io) {
+    // If underline file system supports prefetch, and directIO is not enabled
+    // make sure prefetch() is called and FilePrefetchBuffer is not used.
+    ASSERT_TRUE(fs->IsPrefetchCalled());
+    fs->ClearPrefetchCount();
+    ASSERT_EQ(0, buff_prefetch_count);
+  } else {
+    // If underline file system doesn't support prefetch, or directIO is
+    // enabled, make sure prefetch() is not called and FilePrefetchBuffer is
+    // used.
+    ASSERT_FALSE(fs->IsPrefetchCalled());
+    ASSERT_GT(buff_prefetch_count, 0);
+    buff_prefetch_count = 0;
+  }
+
+  // count the keys
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    int num_keys = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      num_keys++;
+    }
+  }
+
+  // Make sure prefetch is called only if file system support prefetch.
+  if (support_prefetch && !use_direct_io) {
+    ASSERT_TRUE(fs->IsPrefetchCalled());
+    fs->ClearPrefetchCount();
+    ASSERT_EQ(0, buff_prefetch_count);
+  } else {
+    ASSERT_FALSE(fs->IsPrefetchCalled());
+    ASSERT_GT(buff_prefetch_count, 0);
+    buff_prefetch_count = 0;
+  }
+  Close();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
+  // First param is if the mockFS support_prefetch or not
+  bool support_prefetch =
+      std::get<0>(GetParam()) &&
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  // Second param is if directIO is enabled or not
+  bool use_direct_io = std::get<1>(GetParam());
+
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  options.disable_auto_compactions = true;
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  table_options.max_auto_readahead_size = 0;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  int buff_prefetch_count = 0;
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+
+  // DB open will create table readers unless we reduce the table cache
+  // capacity. SanitizeOptions will set max_open_files to minimum of 20. Table
+  // cache is allocated with max_open_files - 10 as capacity. So override
+  // max_open_files to 10 so table cache capacity will become 0. This will
+  // prevent file open during DB open and force the file to be opened during
+  // Iteration.
+  SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = (int*)arg;
+        *max_open_files = 11;
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  Random rnd(309);
+  int key_count = 0;
+  const int num_keys_per_level = 100;
+  // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
+  for (int level = 2; level >= 0; level--) {
+    key_count = level * num_keys_per_level;
+    for (int i = 0; i < num_keys_per_level; ++i) {
+      ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
+    }
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(level);
+  }
+  Close();
+  std::vector<int> buff_prefectch_level_count = {0, 0, 0};
+  TryReopen(options);
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    fs->ClearPrefetchCount();
+    buff_prefetch_count = 0;
+
+    for (int level = 2; level >= 0; level--) {
+      key_count = level * num_keys_per_level;
+      switch (level) {
+        case 0:
+          // max_auto_readahead_size is set 0 so data and index blocks are not
+          // prefetched.
+          ASSERT_OK(db_->SetOptions(
+              {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}}));
+          break;
+        case 1:
+          // max_auto_readahead_size is set less than
+          // initial_auto_readahead_size. So readahead_size remains equal to
+          // max_auto_readahead_size.
+          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                                      "{max_auto_readahead_size=4096;}"}}));
+          break;
+        case 2:
+          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                                      "{max_auto_readahead_size=65536;}"}}));
+          break;
+        default:
+          assert(false);
+      }
+
+      for (int i = 0; i < num_keys_per_level; ++i) {
+        iter->Seek(Key(key_count++));
+        iter->Next();
+      }
+
+      buff_prefectch_level_count[level] = buff_prefetch_count;
+      if (support_prefetch && !use_direct_io) {
+        if (level == 0) {
+          ASSERT_FALSE(fs->IsPrefetchCalled());
+        } else {
+          ASSERT_TRUE(fs->IsPrefetchCalled());
+        }
+        fs->ClearPrefetchCount();
+      } else {
+        ASSERT_FALSE(fs->IsPrefetchCalled());
+        if (level == 0) {
+          ASSERT_EQ(buff_prefetch_count, 0);
+        } else {
+          ASSERT_GT(buff_prefetch_count, 0);
+        }
+        buff_prefetch_count = 0;
+      }
+    }
+  }
+
+  if (!support_prefetch) {
+    ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+
+TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) {
+  // First param is if the mockFS support_prefetch or not
+  bool support_prefetch =
+      std::get<0>(GetParam()) &&
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  // Second param is if directIO is enabled or not
+  bool use_direct_io = std::get<1>(GetParam());
+
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  options.disable_auto_compactions = true;
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  table_options.initial_auto_readahead_size = 0;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  int buff_prefetch_count = 0;
+  // DB open will create table readers unless we reduce the table cache
+  // capacity. SanitizeOptions will set max_open_files to minimum of 20.
+  // Table cache is allocated with max_open_files - 10 as capacity. So
+  // override max_open_files to 10 so table cache capacity will become 0.
+  // This will prevent file open during DB open and force the file to be
+  // opened during Iteration.
+  SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = (int*)arg;
+        *max_open_files = 11;
+      });
+
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  Random rnd(309);
+  int key_count = 0;
+  const int num_keys_per_level = 100;
+  // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
+  for (int level = 2; level >= 0; level--) {
+    key_count = level * num_keys_per_level;
+    for (int i = 0; i < num_keys_per_level; ++i) {
+      ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
+    }
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(level);
+  }
+  Close();
+
+  TryReopen(options);
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    fs->ClearPrefetchCount();
+    buff_prefetch_count = 0;
+    std::vector<int> buff_prefetch_level_count = {0, 0, 0};
+
+    for (int level = 2; level >= 0; level--) {
+      key_count = level * num_keys_per_level;
+      switch (level) {
+        case 0:
+          // initial_auto_readahead_size is set 0 so data and index blocks are
+          // not prefetched.
+          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                                      "{initial_auto_readahead_size=0;}"}}));
+          break;
+        case 1:
+          // intial_auto_readahead_size and max_auto_readahead_size are set same
+          // so readahead_size remains same.
+          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                                      "{initial_auto_readahead_size=4096;max_"
+                                      "auto_readahead_size=4096;}"}}));
+          break;
+        case 2:
+          ASSERT_OK(
+              db_->SetOptions({{"block_based_table_factory",
+                                "{initial_auto_readahead_size=65536;}"}}));
+          break;
+        default:
+          assert(false);
+      }
+
+      for (int i = 0; i < num_keys_per_level; ++i) {
+        iter->Seek(Key(key_count++));
+        iter->Next();
+      }
+
+      buff_prefetch_level_count[level] = buff_prefetch_count;
+      if (support_prefetch && !use_direct_io) {
+        if (level == 0) {
+          ASSERT_FALSE(fs->IsPrefetchCalled());
+        } else {
+          ASSERT_TRUE(fs->IsPrefetchCalled());
+        }
+        fs->ClearPrefetchCount();
+      } else {
+        ASSERT_FALSE(fs->IsPrefetchCalled());
+        if (level == 0) {
+          ASSERT_EQ(buff_prefetch_count, 0);
+        } else {
+          ASSERT_GT(buff_prefetch_count, 0);
+        }
+        buff_prefetch_count = 0;
+      }
+    }
+    if (!support_prefetch) {
+      ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]);
+    }
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+
+TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) {
+  // First param is if the mockFS support_prefetch or not
+  bool support_prefetch =
+      std::get<0>(GetParam()) &&
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  const int kNumKeys = 2000;
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  // Second param is if directIO is enabled or not
+  bool use_direct_io = std::get<1>(GetParam());
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  table_options.num_file_reads_for_auto_readahead = 0;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+
+  int buff_prefetch_count = 0;
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  WriteBatch batch;
+  Random rnd(309);
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  Close();
+  TryReopen(options);
+
+  fs->ClearPrefetchCount();
+  buff_prefetch_count = 0;
+
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    /*
+     * Reseek keys from sequential Data Blocks within same partitioned
+     * index. It will prefetch the data block at the first seek since
+     * num_file_reads_for_auto_readahead = 0. Data Block size is nearly 4076 so
+     * readahead will fetch 8 * 1024 data more initially (2 more data blocks).
+     */
+    iter->Seek(BuildKey(0));  // Prefetch data + index block since
+                              // num_file_reads_for_auto_readahead = 0.
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1000));  // In buffer
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1004));  // In buffer
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1008));  // Prefetch Data
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1011));  // In buffer
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1015));  // In buffer
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1019));  // In buffer
+    ASSERT_TRUE(iter->Valid());
+    // Missed 2 blocks but they are already in buffer so no reset.
+    iter->Seek(BuildKey(103));  // Already in buffer.
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1033));  // Prefetch Data.
+    ASSERT_TRUE(iter->Valid());
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 4);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 4);
+      buff_prefetch_count = 0;
+    }
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(PrefetchTest, PrefetchWhenReseek) {
+  // First param is if the mockFS support_prefetch or not
+  bool support_prefetch =
+      std::get<0>(GetParam()) &&
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  const int kNumKeys = 2000;
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  // Second param is if directIO is enabled or not
+  bool use_direct_io = std::get<1>(GetParam());
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+
+  int buff_prefetch_count = 0;
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  WriteBatch batch;
+  Random rnd(309);
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  fs->ClearPrefetchCount();
+  buff_prefetch_count = 0;
+
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    /*
+     * Reseek keys from sequential Data Blocks within same partitioned
+     * index. After 2 sequential reads it will prefetch the data block.
+     * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more
+     * initially (2 more data blocks).
+     */
+    iter->Seek(BuildKey(0));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1000));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1004));  // Prefetch Data
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1008));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1011));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1015));  // Prefetch Data
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1019));
+    ASSERT_TRUE(iter->Valid());
+    // Missed 2 blocks but they are already in buffer so no reset.
+    iter->Seek(BuildKey(103));  // Already in buffer.
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1033));  // Prefetch Data
+    ASSERT_TRUE(iter->Valid());
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 3);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 3);
+      buff_prefetch_count = 0;
+    }
+  }
+  {
+    /*
+     * Reseek keys from  non sequential data blocks within same partitioned
+     * index. buff_prefetch_count will be 0 in that case.
+     */
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    iter->Seek(BuildKey(0));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1008));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1019));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1033));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1048));
+    ASSERT_TRUE(iter->Valid());
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 0);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 0);
+      buff_prefetch_count = 0;
+    }
+  }
+  {
+    /*
+     * Reesek keys from Single Data Block.
+     */
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    iter->Seek(BuildKey(0));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(10));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(100));
+    ASSERT_TRUE(iter->Valid());
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 0);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 0);
+      buff_prefetch_count = 0;
+    }
+  }
+  {
+    /*
+     * Reseek keys from  sequential data blocks to set implicit auto readahead
+     * and prefetch data but after that iterate over different (non sequential)
+     * data blocks which won't prefetch any data further. So buff_prefetch_count
+     * will be 1 for the first one.
+     */
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    iter->Seek(BuildKey(0));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1000));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1004));  // This iteration will prefetch buffer
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1008));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(
+        BuildKey(996));  // Reseek won't prefetch any data and
+                         // readahead_size will be initiallized to 8*1024.
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(992));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(989));
+    ASSERT_TRUE(iter->Valid());
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 1);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 1);
+      buff_prefetch_count = 0;
+    }
+
+    // Read sequentially to confirm readahead_size is reset to initial value (2
+    // more data blocks)
+    iter->Seek(BuildKey(1011));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1015));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1019));  // Prefetch Data
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1022));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1026));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(103));  // Prefetch Data
+    ASSERT_TRUE(iter->Valid());
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 2);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 2);
+      buff_prefetch_count = 0;
+    }
+  }
+  {
+    /* Reseek keys from sequential partitioned index block. Since partitioned
+     * index fetch are sequential, buff_prefetch_count will be 1.
+     */
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    iter->Seek(BuildKey(0));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1167));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1334));  // This iteration will prefetch buffer
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1499));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1667));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1847));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1999));
+    ASSERT_TRUE(iter->Valid());
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 1);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 1);
+      buff_prefetch_count = 0;
+    }
+  }
+  {
+    /*
+     * Reseek over different keys from different blocks. buff_prefetch_count is
+     * set 0.
+     */
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    int i = 0;
+    int j = 1000;
+    do {
+      iter->Seek(BuildKey(i));
+      if (!iter->Valid()) {
+        break;
+      }
+      i = i + 100;
+      iter->Seek(BuildKey(j));
+      j = j + 100;
+    } while (i < 1000 && j < kNumKeys && iter->Valid());
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 0);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 0);
+      buff_prefetch_count = 0;
+    }
+  }
+  {
+    /* Iterates sequentially over all keys. It will prefetch the buffer.*/
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    }
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 13);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 13);
+      buff_prefetch_count = 0;
+    }
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+
+TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) {
+  // First param is if the mockFS support_prefetch or not
+  bool support_prefetch =
+      std::get<0>(GetParam()) &&
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  const int kNumKeys = 2000;
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  // Second param is if directIO is enabled or not
+  bool use_direct_io = std::get<1>(GetParam());
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+
+  BlockBasedTableOptions table_options;
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);  // 8MB
+  table_options.block_cache = cache;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+
+  int buff_prefetch_count = 0;
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  WriteBatch batch;
+  Random rnd(309);
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  fs->ClearPrefetchCount();
+  buff_prefetch_count = 0;
+
+  {
+    /*
+     * Reseek keys from sequential Data Blocks within same partitioned
+     * index. After 2 sequential reads it will prefetch the data block.
+     * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more
+     * initially (2 more data blocks).
+     */
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    // Warm up the cache
+    iter->Seek(BuildKey(1011));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1015));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1019));
+    ASSERT_TRUE(iter->Valid());
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 1);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 1);
+      buff_prefetch_count = 0;
+    }
+  }
+  {
+    // After caching, blocks will be read from cache (Sequential blocks)
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    iter->Seek(BuildKey(0));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1000));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1004));  // Prefetch data (not in cache).
+    ASSERT_TRUE(iter->Valid());
+    // Missed one sequential block but next is in already in buffer so readahead
+    // will not be reset.
+    iter->Seek(BuildKey(1011));
+    ASSERT_TRUE(iter->Valid());
+    // Prefetch data but blocks are in cache so no prefetch and reset.
+    iter->Seek(BuildKey(1015));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1019));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1022));
+    ASSERT_TRUE(iter->Valid());
+    // Prefetch data with readahead_size = 4 blocks.
+    iter->Seek(BuildKey(1026));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(103));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1033));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1037));
+    ASSERT_TRUE(iter->Valid());
+
+    if (support_prefetch && !use_direct_io) {
+      ASSERT_EQ(fs->GetPrefetchCount(), 3);
+      fs->ClearPrefetchCount();
+    } else {
+      ASSERT_EQ(buff_prefetch_count, 2);
+      buff_prefetch_count = 0;
+    }
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(PrefetchTest, DBIterLevelReadAhead) {
+  const int kNumKeys = 1000;
+  // Set options
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  bool use_direct_io = std::get<0>(GetParam());
+  bool is_adaptive_readahead = std::get<1>(GetParam());
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.statistics = CreateDBStatistics();
+  options.env = env.get();
+
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  WriteBatch batch;
+  Random rnd(309);
+  int total_keys = 0;
+  for (int j = 0; j < 5; j++) {
+    for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+      ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+      total_keys++;
+    }
+    ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(2);
+  int buff_prefetch_count = 0;
+  int buff_async_prefetch_count = 0;
+  int readahead_carry_over_count = 0;
+  int num_sst_files = NumTableFilesAtLevel(2);
+  size_t current_readahead_size = 0;
+
+  // Test - Iterate over the keys sequentially.
+  {
+    SyncPoint::GetInstance()->SetCallBack(
+        "FilePrefetchBuffer::Prefetch:Start",
+        [&](void*) { buff_prefetch_count++; });
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+        [&](void*) { buff_async_prefetch_count++; });
+
+    // The callback checks, since reads are sequential, readahead_size doesn't
+    // start from 8KB when iterator moves to next file and its called
+    // num_sst_files-1 times (excluding for first file).
+    SyncPoint::GetInstance()->SetCallBack(
+        "BlockPrefetcher::SetReadaheadState", [&](void* arg) {
+          readahead_carry_over_count++;
+          size_t readahead_size = *reinterpret_cast<size_t*>(arg);
+          if (readahead_carry_over_count) {
+            ASSERT_GT(readahead_size, 8 * 1024);
+          }
+        });
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) {
+          current_readahead_size = *reinterpret_cast<size_t*>(arg);
+          ASSERT_GT(current_readahead_size, 0);
+        });
+
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    ReadOptions ro;
+    if (is_adaptive_readahead) {
+      ro.adaptive_readahead = true;
+      ro.async_io = true;
+    }
+
+    ASSERT_OK(options.statistics->Reset());
+
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    int num_keys = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      num_keys++;
+    }
+    ASSERT_EQ(num_keys, total_keys);
+
+    // For index and data blocks.
+    if (is_adaptive_readahead) {
+      ASSERT_EQ(readahead_carry_over_count, 2 * (num_sst_files - 1));
+      ASSERT_GT(buff_async_prefetch_count, 0);
+    } else {
+      ASSERT_GT(buff_prefetch_count, 0);
+      ASSERT_EQ(readahead_carry_over_count, 0);
+    }
+
+    // Check stats to make sure async prefetch is done.
+    {
+      HistogramData async_read_bytes;
+      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+      if (ro.async_io) {
+        ASSERT_GT(async_read_bytes.count, 0);
+      } else {
+        ASSERT_EQ(async_read_bytes.count, 0);
+      }
+    }
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+  Close();
+}
+#endif  //! ROCKSDB_LITE
+
+class PrefetchTest1 : public DBTestBase,
+                      public ::testing::WithParamInterface<bool> {
+ public:
+  PrefetchTest1() : DBTestBase("prefetch_test1", true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(PrefetchTest1, PrefetchTest1, ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) {
+  const int kNumKeys = 1000;
+  // Set options
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  if (GetParam()) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  WriteBatch batch;
+  Random rnd(309);
+  for (int j = 0; j < 5; j++) {
+    for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+      ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+    }
+    ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    ASSERT_OK(Flush());
+  }
+  MoveFilesToLevel(2);
+
+  int buff_prefetch_count = 0;
+  int set_readahead = 0;
+  size_t readahead_size = 0;
+
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockPrefetcher::SetReadaheadState",
+      [&](void* /*arg*/) { set_readahead++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::TryReadFromCache",
+      [&](void* arg) { readahead_size = *reinterpret_cast<size_t*>(arg); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  {
+    // Iterate until prefetch is done.
+    ReadOptions ro;
+    ro.adaptive_readahead = true;
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+
+    while (iter->Valid() && buff_prefetch_count == 0) {
+      iter->Next();
+    }
+
+    ASSERT_EQ(readahead_size, 8 * 1024);
+    ASSERT_EQ(buff_prefetch_count, 1);
+    ASSERT_EQ(set_readahead, 0);
+    buff_prefetch_count = 0;
+
+    // Move to last file and check readahead size fallbacks to 8KB. So next
+    // readahead size after prefetch should be 8 * 1024;
+    iter->Seek(BuildKey(4004));
+    ASSERT_TRUE(iter->Valid());
+
+    while (iter->Valid() && buff_prefetch_count == 0) {
+      iter->Next();
+    }
+
+    ASSERT_EQ(readahead_size, 8 * 1024);
+    ASSERT_EQ(set_readahead, 0);
+    ASSERT_EQ(buff_prefetch_count, 1);
+  }
+  Close();
+}
+#endif  //! ROCKSDB_LITE
+
+TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) {
+  const int kNumKeys = 2000;
+  // Set options
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  if (GetParam()) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);  // 8MB
+  table_options.block_cache = cache;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  WriteBatch batch;
+  Random rnd(309);
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  int buff_prefetch_count = 0;
+  size_t current_readahead_size = 0;
+  size_t expected_current_readahead_size = 8 * 1024;
+  size_t decrease_readahead_size = 8 * 1024;
+
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) {
+        current_readahead_size = *reinterpret_cast<size_t*>(arg);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  {
+    /*
+     * Reseek keys from sequential Data Blocks within same partitioned
+     * index. After 2 sequential reads it will prefetch the data block.
+     * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data
+     * more initially (2 more data blocks).
+     */
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    // Warm up the cache
+    iter->Seek(BuildKey(1011));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1015));
+    ASSERT_TRUE(iter->Valid());
+    iter->Seek(BuildKey(1019));
+    ASSERT_TRUE(iter->Valid());
+    buff_prefetch_count = 0;
+  }
+
+  {
+    ASSERT_OK(options.statistics->Reset());
+    // After caching, blocks will be read from cache (Sequential blocks)
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    iter->Seek(
+        BuildKey(0));  // In cache so it will decrease the readahead_size.
+    ASSERT_TRUE(iter->Valid());
+    expected_current_readahead_size = std::max(
+        decrease_readahead_size,
+        (expected_current_readahead_size >= decrease_readahead_size
+             ? (expected_current_readahead_size - decrease_readahead_size)
+             : 0));
+
+    iter->Seek(BuildKey(1000));  // Won't prefetch the block.
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
+
+    iter->Seek(BuildKey(1004));  // Prefetch the block.
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
+    expected_current_readahead_size *= 2;
+
+    iter->Seek(BuildKey(1011));
+    ASSERT_TRUE(iter->Valid());
+
+    // Eligible to Prefetch data (not in buffer) but block is in cache so no
+    // prefetch will happen and will result in decrease in readahead_size.
+    // readahead_size will be 8 * 1024
+    iter->Seek(BuildKey(1015));
+    ASSERT_TRUE(iter->Valid());
+    expected_current_readahead_size = std::max(
+        decrease_readahead_size,
+        (expected_current_readahead_size >= decrease_readahead_size
+             ? (expected_current_readahead_size - decrease_readahead_size)
+             : 0));
+
+    // 1016 is the same block as 1015. So no change in readahead_size.
+    iter->Seek(BuildKey(1016));
+    ASSERT_TRUE(iter->Valid());
+
+    // Prefetch data (not in buffer) but found in cache. So decrease
+    // readahead_size. Since it will 0 after decrementing so readahead_size will
+    // be set to initial value.
+    iter->Seek(BuildKey(1019));
+    ASSERT_TRUE(iter->Valid());
+    expected_current_readahead_size = std::max(
+        decrease_readahead_size,
+        (expected_current_readahead_size >= decrease_readahead_size
+             ? (expected_current_readahead_size - decrease_readahead_size)
+             : 0));
+
+    // Prefetch next sequential data.
+    iter->Seek(BuildKey(1022));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
+    ASSERT_EQ(buff_prefetch_count, 2);
+
+    buff_prefetch_count = 0;
+  }
+  Close();
+}
+
+TEST_P(PrefetchTest1, SeekParallelizationTest) {
+  const int kNumKeys = 2000;
+  // Set options
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  if (GetParam()) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  WriteBatch batch;
+  Random rnd(309);
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  int buff_prefetch_count = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+      [&](void*) { buff_prefetch_count++; });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  ro.async_io = true;
+
+  {
+    ASSERT_OK(options.statistics->Reset());
+    // Each block contains around 4 keys.
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    iter->Seek(BuildKey(0));  // Prefetch data because of seek parallelization.
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+
+    // New data block. Since num_file_reads in FilePrefetch after this read is
+    // 2, it won't go for prefetching.
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+
+    // Prefetch data.
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+
+    ASSERT_EQ(buff_prefetch_count, 2);
+
+    // Check stats to make sure async prefetch is done.
+    {
+      HistogramData async_read_bytes;
+      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+      ASSERT_GT(async_read_bytes.count, 0);
+      ASSERT_GT(get_perf_context()->number_async_seek, 0);
+    }
+
+    buff_prefetch_count = 0;
+  }
+  Close();
+}
+
+extern "C" bool RocksDbIOUringEnable() { return true; }
+
+namespace {
+#ifndef ROCKSDB_LITE
+#ifdef GFLAGS
+const int kMaxArgCount = 100;
+const size_t kArgBufferSize = 100000;
+
+void RunIOTracerParserTool(std::string trace_file) {
+  std::vector<std::string> params = {"./io_tracer_parser",
+                                     "-io_trace_file=" + trace_file};
+
+  char arg_buffer[kArgBufferSize];
+  char* argv[kMaxArgCount];
+  int argc = 0;
+  int cursor = 0;
+  for (const auto& arg : params) {
+    ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
+    ASSERT_LE(argc + 1, kMaxArgCount);
+
+    snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());
+
+    argv[argc++] = arg_buffer + cursor;
+    cursor += static_cast<int>(arg.size()) + 1;
+  }
+  ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv));
+}
+#endif  // GFLAGS
+#endif  // ROCKSDB_LITE
+}  // namespace
+
+// Tests the default implementation of ReadAsync API with PosixFileSystem.
+TEST_P(PrefetchTest, ReadAsyncWithPosixFS) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+
+  const int kNumKeys = 1000;
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  bool use_direct_io = std::get<0>(GetParam());
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  options.statistics = CreateDBStatistics();
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  int total_keys = 0;
+  // Write the keys.
+  {
+    WriteBatch batch;
+    Random rnd(309);
+    for (int j = 0; j < 5; j++) {
+      for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+        ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+        total_keys++;
+      }
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(2);
+  }
+
+  int buff_prefetch_count = 0;
+  bool read_async_called = false;
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  ro.async_io = true;
+
+  if (std::get<1>(GetParam())) {
+    ro.readahead_size = 16 * 1024;
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+      [&](void*) { buff_prefetch_count++; });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Read the keys.
+  {
+    ASSERT_OK(options.statistics->Reset());
+    get_perf_context()->Reset();
+
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    int num_keys = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      num_keys++;
+    }
+
+    ASSERT_EQ(num_keys, total_keys);
+    ASSERT_GT(buff_prefetch_count, 0);
+
+    // Check stats to make sure async prefetch is done.
+    {
+      HistogramData async_read_bytes;
+      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+      HistogramData prefetched_bytes_discarded;
+      options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
+                                        &prefetched_bytes_discarded);
+
+      // Not all platforms support iouring. In that case, ReadAsync in posix
+      // won't submit async requests.
+      if (read_async_called) {
+        ASSERT_GT(async_read_bytes.count, 0);
+      } else {
+        ASSERT_EQ(async_read_bytes.count, 0);
+      }
+      ASSERT_GT(prefetched_bytes_discarded.count, 0);
+    }
+    ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Close();
+}
+
+TEST_P(PrefetchTest, MultipleSeekWithPosixFS) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+
+  const int kNumKeys = 1000;
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  bool use_direct_io = std::get<0>(GetParam());
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  options.statistics = CreateDBStatistics();
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  int total_keys = 0;
+  // Write the keys.
+  {
+    WriteBatch batch;
+    Random rnd(309);
+    for (int j = 0; j < 5; j++) {
+      for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+        ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+        total_keys++;
+      }
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(2);
+  }
+
+  int num_keys_first_batch = 0;
+  int num_keys_second_batch = 0;
+  // Calculate number of keys without async_io for correctness validation.
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    // First Seek.
+    iter->Seek(BuildKey(450));
+    while (iter->Valid() && num_keys_first_batch < 100) {
+      ASSERT_OK(iter->status());
+      num_keys_first_batch++;
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+
+    iter->Seek(BuildKey(942));
+    while (iter->Valid()) {
+      ASSERT_OK(iter->status());
+      num_keys_second_batch++;
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+  }
+
+  int buff_prefetch_count = 0;
+  bool read_async_called = false;
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  ro.async_io = true;
+
+  if (std::get<1>(GetParam())) {
+    ro.readahead_size = 16 * 1024;
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+      [&](void*) { buff_prefetch_count++; });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Read the keys using seek.
+  {
+    ASSERT_OK(options.statistics->Reset());
+    get_perf_context()->Reset();
+
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    int num_keys = 0;
+    // First Seek.
+    {
+      iter->Seek(BuildKey(450));
+      while (iter->Valid() && num_keys < 100) {
+        ASSERT_OK(iter->status());
+        num_keys++;
+        iter->Next();
+      }
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(num_keys, num_keys_first_batch);
+      // Check stats to make sure async prefetch is done.
+      {
+        HistogramData async_read_bytes;
+        options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+
+        // Not all platforms support iouring. In that case, ReadAsync in posix
+        // won't submit async requests.
+        if (read_async_called) {
+          ASSERT_GT(async_read_bytes.count, 0);
+          ASSERT_GT(get_perf_context()->number_async_seek, 0);
+        } else {
+          ASSERT_EQ(async_read_bytes.count, 0);
+          ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+        }
+      }
+    }
+
+    // Second Seek.
+    {
+      num_keys = 0;
+      ASSERT_OK(options.statistics->Reset());
+      get_perf_context()->Reset();
+
+      iter->Seek(BuildKey(942));
+      while (iter->Valid()) {
+        ASSERT_OK(iter->status());
+        num_keys++;
+        iter->Next();
+      }
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(num_keys, num_keys_second_batch);
+
+      ASSERT_GT(buff_prefetch_count, 0);
+
+      // Check stats to make sure async prefetch is done.
+      {
+        HistogramData async_read_bytes;
+        options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+        HistogramData prefetched_bytes_discarded;
+        options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
+                                          &prefetched_bytes_discarded);
+
+        // Not all platforms support iouring. In that case, ReadAsync in posix
+        // won't submit async requests.
+        if (read_async_called) {
+          ASSERT_GT(async_read_bytes.count, 0);
+          ASSERT_GT(get_perf_context()->number_async_seek, 0);
+        } else {
+          ASSERT_EQ(async_read_bytes.count, 0);
+          ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+        }
+        ASSERT_GT(prefetched_bytes_discarded.count, 0);
+      }
+    }
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+
+TEST_P(PrefetchTest, SeekParallelizationTest1) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  const int kNumKeys = 2000;
+  // Set options
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  bool use_direct_io = std::get<0>(GetParam());
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  WriteBatch batch;
+  Random rnd(309);
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  int buff_prefetch_count = 0;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+      [&](void*) { buff_prefetch_count++; });
+
+  bool read_async_called = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  ro.async_io = true;
+
+  if (std::get<1>(GetParam())) {
+    ro.readahead_size = 16 * 1024;
+  }
+
+  {
+    ASSERT_OK(options.statistics->Reset());
+    // Each block contains around 4 keys.
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    iter->Seek(BuildKey(0));  // Prefetch data because of seek parallelization.
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+
+    // New data block. Since num_file_reads in FilePrefetch after this read is
+    // 2, it won't go for prefetching.
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+
+    // Prefetch data.
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+
+    // Check stats to make sure async prefetch is done.
+    {
+      HistogramData async_read_bytes;
+      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+      // Not all platforms support iouring. In that case, ReadAsync in posix
+      // won't submit async requests.
+      if (read_async_called) {
+        ASSERT_GT(async_read_bytes.count, 0);
+        ASSERT_GT(get_perf_context()->number_async_seek, 0);
+        if (std::get<1>(GetParam())) {
+          ASSERT_EQ(buff_prefetch_count, 1);
+        } else {
+          ASSERT_EQ(buff_prefetch_count, 2);
+        }
+      } else {
+        ASSERT_EQ(async_read_bytes.count, 0);
+        ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+        ASSERT_EQ(buff_prefetch_count, 1);
+      }
+    }
+
+    buff_prefetch_count = 0;
+  }
+  Close();
+}
+
+#ifndef ROCKSDB_LITE
+#ifdef GFLAGS
+TEST_P(PrefetchTest, TraceReadAsyncWithCallbackWrapper) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+
+  const int kNumKeys = 1000;
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  bool use_direct_io = std::get<0>(GetParam());
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  options.statistics = CreateDBStatistics();
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  int total_keys = 0;
+  // Write the keys.
+  {
+    WriteBatch batch;
+    Random rnd(309);
+    for (int j = 0; j < 5; j++) {
+      for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+        ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+        total_keys++;
+      }
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+      ASSERT_OK(Flush());
+    }
+    MoveFilesToLevel(2);
+  }
+
+  int buff_prefetch_count = 0;
+  bool read_async_called = false;
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  ro.async_io = true;
+
+  if (std::get<1>(GetParam())) {
+    ro.readahead_size = 16 * 1024;
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+      [&](void*) { buff_prefetch_count++; });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Read the keys.
+  {
+    // Start io_tracing.
+    WriteOptions write_opt;
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    std::string trace_file_path = dbname_ + "/io_trace_file";
+
+    ASSERT_OK(
+        NewFileTraceWriter(env_, EnvOptions(), trace_file_path, &trace_writer));
+    ASSERT_OK(db_->StartIOTrace(trace_opt, std::move(trace_writer)));
+    ASSERT_OK(options.statistics->Reset());
+
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    int num_keys = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      num_keys++;
+    }
+
+    // End the tracing.
+    ASSERT_OK(db_->EndIOTrace());
+    ASSERT_OK(env_->FileExists(trace_file_path));
+
+    ASSERT_EQ(num_keys, total_keys);
+    ASSERT_GT(buff_prefetch_count, 0);
+
+    // Check stats to make sure async prefetch is done.
+    {
+      HistogramData async_read_bytes;
+      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+      // Not all platforms support iouring. In that case, ReadAsync in posix
+      // won't submit async requests.
+      if (read_async_called) {
+        ASSERT_GT(async_read_bytes.count, 0);
+      } else {
+        ASSERT_EQ(async_read_bytes.count, 0);
+      }
+    }
+
+    // Check the file to see if ReadAsync is logged.
+    RunIOTracerParserTool(trace_file_path);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Close();
+}
+#endif  // GFLAGS
+
+class FilePrefetchBufferTest : public testing::Test {
+ public:
+  void SetUp() override {
+    SetupSyncPointsToMockDirectIO();
+    env_ = Env::Default();
+    fs_ = FileSystem::Default();
+    test_dir_ = test::PerThreadDBPath("file_prefetch_buffer_test");
+    ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+  }
+
+  void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+
+  void Write(const std::string& fname, const std::string& content) {
+    std::unique_ptr<FSWritableFile> f;
+    ASSERT_OK(fs_->NewWritableFile(Path(fname), FileOptions(), &f, nullptr));
+    ASSERT_OK(f->Append(content, IOOptions(), nullptr));
+    ASSERT_OK(f->Close(IOOptions(), nullptr));
+  }
+
+  void Read(const std::string& fname, const FileOptions& opts,
+            std::unique_ptr<RandomAccessFileReader>* reader) {
+    std::string fpath = Path(fname);
+    std::unique_ptr<FSRandomAccessFile> f;
+    ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr));
+    reader->reset(new RandomAccessFileReader(std::move(f), fpath,
+                                             env_->GetSystemClock().get()));
+  }
+
+  void AssertResult(const std::string& content,
+                    const std::vector<FSReadRequest>& reqs) {
+    for (const auto& r : reqs) {
+      ASSERT_OK(r.status);
+      ASSERT_EQ(r.len, r.result.size());
+      ASSERT_EQ(content.substr(r.offset, r.len), r.result.ToString());
+    }
+  }
+
+  FileSystem* fs() { return fs_.get(); }
+
+ private:
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  std::string test_dir_;
+
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+};
+
+TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) {
+  std::string fname = "seek-with-block-cache-hit";
+  Random rand(0);
+  std::string content = rand.RandomString(32768);
+  Write(fname, content);
+
+  FileOptions opts;
+  std::unique_ptr<RandomAccessFileReader> r;
+  Read(fname, opts, &r);
+
+  FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, fs());
+  Slice result;
+  // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings,
+  // it will do two reads of 4096+8192 and 8192
+  Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 0, 4096, &result);
+  ASSERT_EQ(s, Status::TryAgain());
+  // Simulate a block cache hit
+  fpb.UpdateReadPattern(0, 4096, false);
+  // Now read some data that straddles the two prefetch buffers - offset 8192 to
+  // 16384
+  ASSERT_TRUE(fpb.TryReadFromCacheAsync(IOOptions(), r.get(), 8192, 8192,
+                                        &result, &s, Env::IOPriority::IO_LOW));
+}
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/file/random_access_file_reader.cc b/src/rocksdb/file/random_access_file_reader.cc
new file mode 100644
index 000000000..030cd8d07
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader.cc
@@ -0,0 +1,602 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/random_access_file_reader.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/file_util.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "table/format.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
+                          bool is_last_level, size_t size) {
+  IOSTATS_ADD(bytes_read, size);
+  // record for last/non-last level
+  if (is_last_level) {
+    RecordTick(stats, LAST_LEVEL_READ_BYTES, size);
+    RecordTick(stats, LAST_LEVEL_READ_COUNT, 1);
+  } else {
+    RecordTick(stats, NON_LAST_LEVEL_READ_BYTES, size);
+    RecordTick(stats, NON_LAST_LEVEL_READ_COUNT, 1);
+  }
+
+  // record for temperature file
+  if (file_temperature != Temperature::kUnknown) {
+    switch (file_temperature) {
+      case Temperature::kHot:
+        IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size);
+        IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1);
+        RecordTick(stats, HOT_FILE_READ_BYTES, size);
+        RecordTick(stats, HOT_FILE_READ_COUNT, 1);
+        break;
+      case Temperature::kWarm:
+        IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size);
+        IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1);
+        RecordTick(stats, WARM_FILE_READ_BYTES, size);
+        RecordTick(stats, WARM_FILE_READ_COUNT, 1);
+        break;
+      case Temperature::kCold:
+        IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size);
+        IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1);
+        RecordTick(stats, COLD_FILE_READ_BYTES, size);
+        RecordTick(stats, COLD_FILE_READ_COUNT, 1);
+        break;
+      default:
+        break;
+    }
+  }
+}
+
+IOStatus RandomAccessFileReader::Create(
+    const std::shared_ptr<FileSystem>& fs, const std::string& fname,
+    const FileOptions& file_opts,
+    std::unique_ptr<RandomAccessFileReader>* reader, IODebugContext* dbg) {
+  std::unique_ptr<FSRandomAccessFile> file;
+  IOStatus io_s = fs->NewRandomAccessFile(fname, file_opts, &file, dbg);
+  if (io_s.ok()) {
+    reader->reset(new RandomAccessFileReader(std::move(file), fname));
+  }
+  return io_s;
+}
+
+IOStatus RandomAccessFileReader::Read(
+    const IOOptions& opts, uint64_t offset, size_t n, Slice* result,
+    char* scratch, AlignedBuf* aligned_buf,
+    Env::IOPriority rate_limiter_priority) const {
+  (void)aligned_buf;
+
+  TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr);
+
+  // To be paranoid: modify scratch a little bit, so in case underlying
+  // FileSystem doesn't fill the buffer but return success and `scratch` returns
+  // contains a previous block, returned value will not pass checksum.
+  if (n > 0 && scratch != nullptr) {
+    // This byte might not change anything for direct I/O case, but it's OK.
+    scratch[0]++;
+  }
+
+  IOStatus io_s;
+  uint64_t elapsed = 0;
+  {
+    StopWatch sw(clock_, stats_, hist_type_,
+                 (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
+                 true /*delay_enabled*/);
+    auto prev_perf_level = GetPerfLevel();
+    IOSTATS_TIMER_GUARD(read_nanos);
+    if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+      size_t alignment = file_->GetRequiredBufferAlignment();
+      size_t aligned_offset =
+          TruncateToPageBoundary(alignment, static_cast<size_t>(offset));
+      size_t offset_advance = static_cast<size_t>(offset) - aligned_offset;
+      size_t read_size =
+          Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset;
+      AlignedBuffer buf;
+      buf.Alignment(alignment);
+      buf.AllocateNewBuffer(read_size);
+      while (buf.CurrentSize() < read_size) {
+        size_t allowed;
+        if (rate_limiter_priority != Env::IO_TOTAL &&
+            rate_limiter_ != nullptr) {
+          allowed = rate_limiter_->RequestToken(
+              buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
+              rate_limiter_priority, stats_, RateLimiter::OpType::kRead);
+        } else {
+          assert(buf.CurrentSize() == 0);
+          allowed = read_size;
+        }
+        Slice tmp;
+
+        FileOperationInfo::StartTimePoint start_ts;
+        uint64_t orig_offset = 0;
+        if (ShouldNotifyListeners()) {
+          start_ts = FileOperationInfo::StartNow();
+          orig_offset = aligned_offset + buf.CurrentSize();
+        }
+
+        {
+          IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
+          // Only user reads are expected to specify a timeout. And user reads
+          // are not subjected to rate_limiter and should go through only
+          // one iteration of this loop, so we don't need to check and adjust
+          // the opts.timeout before calling file_->Read
+          assert(!opts.timeout.count() || allowed == read_size);
+          io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts,
+                             &tmp, buf.Destination(), nullptr);
+        }
+        if (ShouldNotifyListeners()) {
+          auto finish_ts = FileOperationInfo::FinishNow();
+          NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
+                                 io_s);
+          if (!io_s.ok()) {
+            NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
+                            tmp.size(), orig_offset);
+          }
+        }
+
+        buf.Size(buf.CurrentSize() + tmp.size());
+        if (!io_s.ok() || tmp.size() < allowed) {
+          break;
+        }
+      }
+      size_t res_len = 0;
+      if (io_s.ok() && offset_advance < buf.CurrentSize()) {
+        res_len = std::min(buf.CurrentSize() - offset_advance, n);
+        if (aligned_buf == nullptr) {
+          buf.Read(scratch, offset_advance, res_len);
+        } else {
+          scratch = buf.BufferStart() + offset_advance;
+          aligned_buf->reset(buf.Release());
+        }
+      }
+      *result = Slice(scratch, res_len);
+#endif  // !ROCKSDB_LITE
+    } else {
+      size_t pos = 0;
+      const char* res_scratch = nullptr;
+      while (pos < n) {
+        size_t allowed;
+        if (rate_limiter_priority != Env::IO_TOTAL &&
+            rate_limiter_ != nullptr) {
+          if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+            sw.DelayStart();
+          }
+          allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */,
+                                                rate_limiter_priority, stats_,
+                                                RateLimiter::OpType::kRead);
+          if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
+            sw.DelayStop();
+          }
+        } else {
+          allowed = n;
+        }
+        Slice tmp_result;
+
+#ifndef ROCKSDB_LITE
+        FileOperationInfo::StartTimePoint start_ts;
+        if (ShouldNotifyListeners()) {
+          start_ts = FileOperationInfo::StartNow();
+        }
+#endif
+
+        {
+          IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
+          // Only user reads are expected to specify a timeout. And user reads
+          // are not subjected to rate_limiter and should go through only
+          // one iteration of this loop, so we don't need to check and adjust
+          // the opts.timeout before calling file_->Read
+          assert(!opts.timeout.count() || allowed == n);
+          io_s = file_->Read(offset + pos, allowed, opts, &tmp_result,
+                             scratch + pos, nullptr);
+        }
+#ifndef ROCKSDB_LITE
+        if (ShouldNotifyListeners()) {
+          auto finish_ts = FileOperationInfo::FinishNow();
+          NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts,
+                                 finish_ts, io_s);
+
+          if (!io_s.ok()) {
+            NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
+                            tmp_result.size(), offset + pos);
+          }
+        }
+#endif
+        if (res_scratch == nullptr) {
+          // we can't simply use `scratch` because reads of mmap'd files return
+          // data in a different buffer.
+          res_scratch = tmp_result.data();
+        } else {
+          // make sure chunks are inserted contiguously into `res_scratch`.
+          assert(tmp_result.data() == res_scratch + pos);
+        }
+        pos += tmp_result.size();
+        if (!io_s.ok() || tmp_result.size() < allowed) {
+          break;
+        }
+      }
+      *result = Slice(res_scratch, io_s.ok() ? pos : 0);
+    }
+    RecordIOStats(stats_, file_temperature_, is_last_level_, result->size());
+    SetPerfLevel(prev_perf_level);
+  }
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    file_read_hist_->Add(elapsed);
+  }
+
+  return io_s;
+}
+
+size_t End(const FSReadRequest& r) {
+  return static_cast<size_t>(r.offset) + r.len;
+}
+
+FSReadRequest Align(const FSReadRequest& r, size_t alignment) {
+  FSReadRequest req;
+  req.offset = static_cast<uint64_t>(
+      TruncateToPageBoundary(alignment, static_cast<size_t>(r.offset)));
+  req.len = Roundup(End(r), alignment) - req.offset;
+  req.scratch = nullptr;
+  return req;
+}
+
+bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) {
+  size_t dest_offset = static_cast<size_t>(dest->offset);
+  size_t src_offset = static_cast<size_t>(src.offset);
+  size_t dest_end = End(*dest);
+  size_t src_end = End(src);
+  if (std::max(dest_offset, src_offset) > std::min(dest_end, src_end)) {
+    return false;
+  }
+  dest->offset = static_cast<uint64_t>(std::min(dest_offset, src_offset));
+  dest->len = std::max(dest_end, src_end) - dest->offset;
+  return true;
+}
+
+IOStatus RandomAccessFileReader::MultiRead(
+    const IOOptions& opts, FSReadRequest* read_reqs, size_t num_reqs,
+    AlignedBuf* aligned_buf, Env::IOPriority rate_limiter_priority) const {
+  (void)aligned_buf;  // suppress warning of unused variable in LITE mode
+  assert(num_reqs > 0);
+
+#ifndef NDEBUG
+  for (size_t i = 0; i < num_reqs - 1; ++i) {
+    assert(read_reqs[i].offset <= read_reqs[i + 1].offset);
+  }
+#endif  // !NDEBUG
+
+  // To be paranoid modify scratch a little bit, so in case underlying
+  // FileSystem doesn't fill the buffer but return success and `scratch` returns
+  // contains a previous block, returned value will not pass checksum.
+  // This byte might not change anything for direct I/O case, but it's OK.
+  for (size_t i = 0; i < num_reqs; i++) {
+    FSReadRequest& r = read_reqs[i];
+    if (r.len > 0 && r.scratch != nullptr) {
+      r.scratch[0]++;
+    }
+  }
+
+  IOStatus io_s;
+  uint64_t elapsed = 0;
+  {
+    StopWatch sw(clock_, stats_, hist_type_,
+                 (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
+                 true /*delay_enabled*/);
+    auto prev_perf_level = GetPerfLevel();
+    IOSTATS_TIMER_GUARD(read_nanos);
+
+    FSReadRequest* fs_reqs = read_reqs;
+    size_t num_fs_reqs = num_reqs;
+#ifndef ROCKSDB_LITE
+    std::vector<FSReadRequest> aligned_reqs;
+    if (use_direct_io()) {
+      // num_reqs is the max possible size,
+      // this can reduce std::vecector's internal resize operations.
+      aligned_reqs.reserve(num_reqs);
+      // Align and merge the read requests.
+      size_t alignment = file_->GetRequiredBufferAlignment();
+      for (size_t i = 0; i < num_reqs; i++) {
+        const auto& r = Align(read_reqs[i], alignment);
+        if (i == 0) {
+          // head
+          aligned_reqs.push_back(r);
+
+        } else if (!TryMerge(&aligned_reqs.back(), r)) {
+          // head + n
+          aligned_reqs.push_back(r);
+
+        } else {
+          // unused
+          r.status.PermitUncheckedError();
+        }
+      }
+      TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::MultiRead:AlignedReqs",
+                               &aligned_reqs);
+
+      // Allocate aligned buffer and let scratch buffers point to it.
+      size_t total_len = 0;
+      for (const auto& r : aligned_reqs) {
+        total_len += r.len;
+      }
+      AlignedBuffer buf;
+      buf.Alignment(alignment);
+      buf.AllocateNewBuffer(total_len);
+      char* scratch = buf.BufferStart();
+      for (auto& r : aligned_reqs) {
+        r.scratch = scratch;
+        scratch += r.len;
+      }
+
+      aligned_buf->reset(buf.Release());
+      fs_reqs = aligned_reqs.data();
+      num_fs_reqs = aligned_reqs.size();
+    }
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+    FileOperationInfo::StartTimePoint start_ts;
+    if (ShouldNotifyListeners()) {
+      start_ts = FileOperationInfo::StartNow();
+    }
+#endif  // ROCKSDB_LITE
+
+    {
+      IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
+      if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
+        // TODO: ideally we should call `RateLimiter::RequestToken()` for
+        // allowed bytes to multi-read and then consume those bytes by
+        // satisfying as many requests in `MultiRead()` as possible, instead of
+        // what we do here, which can cause burst when the
+        // `total_multi_read_size` is big.
+        size_t total_multi_read_size = 0;
+        assert(fs_reqs != nullptr);
+        for (size_t i = 0; i < num_fs_reqs; ++i) {
+          FSReadRequest& req = fs_reqs[i];
+          total_multi_read_size += req.len;
+        }
+        size_t remaining_bytes = total_multi_read_size;
+        size_t request_bytes = 0;
+        while (remaining_bytes > 0) {
+          request_bytes = std::min(
+              static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()),
+              remaining_bytes);
+          rate_limiter_->Request(request_bytes, rate_limiter_priority,
+                                 nullptr /* stats */,
+                                 RateLimiter::OpType::kRead);
+          remaining_bytes -= request_bytes;
+        }
+      }
+      io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr);
+      RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs);
+    }
+
+#ifndef ROCKSDB_LITE
+    if (use_direct_io()) {
+      // Populate results in the unaligned read requests.
+      size_t aligned_i = 0;
+      for (size_t i = 0; i < num_reqs; i++) {
+        auto& r = read_reqs[i];
+        if (static_cast<size_t>(r.offset) > End(aligned_reqs[aligned_i])) {
+          aligned_i++;
+        }
+        const auto& fs_r = fs_reqs[aligned_i];
+        r.status = fs_r.status;
+        if (r.status.ok()) {
+          uint64_t offset = r.offset - fs_r.offset;
+          if (fs_r.result.size() <= offset) {
+            // No byte in the read range is returned.
+            r.result = Slice();
+          } else {
+            size_t len = std::min(
+                r.len, static_cast<size_t>(fs_r.result.size() - offset));
+            r.result = Slice(fs_r.scratch + offset, len);
+          }
+        } else {
+          r.result = Slice();
+        }
+      }
+    }
+#endif  // ROCKSDB_LITE
+
+    for (size_t i = 0; i < num_reqs; ++i) {
+#ifndef ROCKSDB_LITE
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = FileOperationInfo::FinishNow();
+        NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(),
+                               start_ts, finish_ts, read_reqs[i].status);
+      }
+      if (!read_reqs[i].status.ok()) {
+        NotifyOnIOError(read_reqs[i].status, FileOperationType::kRead,
+                        file_name(), read_reqs[i].result.size(),
+                        read_reqs[i].offset);
+      }
+
+#endif  // ROCKSDB_LITE
+      RecordIOStats(stats_, file_temperature_, is_last_level_,
+                    read_reqs[i].result.size());
+    }
+    SetPerfLevel(prev_perf_level);
+  }
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    file_read_hist_->Add(elapsed);
+  }
+
+  return io_s;
+}
+
+IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro,
+                                                  IOOptions& opts) {
+  if (clock_ != nullptr) {
+    return PrepareIOFromReadOptions(ro, clock_, opts);
+  } else {
+    return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts);
+  }
+}
+
+IOStatus RandomAccessFileReader::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) {
+  IOStatus s;
+  // Create a callback and populate info.
+  auto read_async_callback =
+      std::bind(&RandomAccessFileReader::ReadAsyncCallback, this,
+                std::placeholders::_1, std::placeholders::_2);
+  ReadAsyncInfo* read_async_info =
+      new ReadAsyncInfo(cb, cb_arg, clock_->NowMicros());
+
+#ifndef ROCKSDB_LITE
+  if (ShouldNotifyListeners()) {
+    read_async_info->fs_start_ts_ = FileOperationInfo::StartNow();
+  }
+#endif
+
+  size_t alignment = file_->GetRequiredBufferAlignment();
+  bool is_aligned = (req.offset & (alignment - 1)) == 0 &&
+                    (req.len & (alignment - 1)) == 0 &&
+                    (uintptr_t(req.scratch) & (alignment - 1)) == 0;
+  read_async_info->is_aligned_ = is_aligned;
+
+  uint64_t elapsed = 0;
+  if (use_direct_io() && is_aligned == false) {
+    FSReadRequest aligned_req = Align(req, alignment);
+    aligned_req.status.PermitUncheckedError();
+
+    // Allocate aligned buffer.
+    read_async_info->buf_.Alignment(alignment);
+    read_async_info->buf_.AllocateNewBuffer(aligned_req.len);
+
+    // Set rem fields in aligned FSReadRequest.
+    aligned_req.scratch = read_async_info->buf_.BufferStart();
+
+    // Set user provided fields to populate back in callback.
+    read_async_info->user_scratch_ = req.scratch;
+    read_async_info->user_aligned_buf_ = aligned_buf;
+    read_async_info->user_len_ = req.len;
+    read_async_info->user_offset_ = req.offset;
+    read_async_info->user_result_ = req.result;
+
+    assert(read_async_info->buf_.CurrentSize() == 0);
+
+    StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed,
+                 true /*overwrite*/, true /*delay_enabled*/);
+    s = file_->ReadAsync(aligned_req, opts, read_async_callback,
+                         read_async_info, io_handle, del_fn, nullptr /*dbg*/);
+  } else {
+    StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed,
+                 true /*overwrite*/, true /*delay_enabled*/);
+    s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
+                         io_handle, del_fn, nullptr /*dbg*/);
+  }
+  RecordTick(stats_, READ_ASYNC_MICROS, elapsed);
+
+// Suppress false positive clang analyzer warnings.
+// Memory is not released if file_->ReadAsync returns !s.ok(), because
+// ReadAsyncCallback is never called in that case. If ReadAsyncCallback is
+// called then ReadAsync should always return IOStatus::OK().
+#ifndef __clang_analyzer__
+  if (!s.ok()) {
+    delete read_async_info;
+  }
+#endif  // __clang_analyzer__
+
+  return s;
+}
+
+void RandomAccessFileReader::ReadAsyncCallback(const FSReadRequest& req,
+                                               void* cb_arg) {
+  ReadAsyncInfo* read_async_info = static_cast<ReadAsyncInfo*>(cb_arg);
+  assert(read_async_info);
+  assert(read_async_info->cb_);
+
+  if (use_direct_io() && read_async_info->is_aligned_ == false) {
+    // Create FSReadRequest with user provided fields.
+    FSReadRequest user_req;
+    user_req.scratch = read_async_info->user_scratch_;
+    user_req.offset = read_async_info->user_offset_;
+    user_req.len = read_async_info->user_len_;
+
+    // Update results in user_req.
+    user_req.result = req.result;
+    user_req.status = req.status;
+
+    read_async_info->buf_.Size(read_async_info->buf_.CurrentSize() +
+                               req.result.size());
+
+    size_t offset_advance_len = static_cast<size_t>(
+        /*offset_passed_by_user=*/read_async_info->user_offset_ -
+        /*aligned_offset=*/req.offset);
+
+    size_t res_len = 0;
+    if (req.status.ok() &&
+        offset_advance_len < read_async_info->buf_.CurrentSize()) {
+      res_len =
+          std::min(read_async_info->buf_.CurrentSize() - offset_advance_len,
+                   read_async_info->user_len_);
+      if (read_async_info->user_aligned_buf_ == nullptr) {
+        // Copy the data into user's scratch.
+// Clang analyzer assumes that it will take use_direct_io() == false in
+// ReadAsync and use_direct_io() == true in Callback which cannot be true.
+#ifndef __clang_analyzer__
+        read_async_info->buf_.Read(user_req.scratch, offset_advance_len,
+                                   res_len);
+#endif  // __clang_analyzer__
+      } else {
+        // Set aligned_buf provided by user without additional copy.
+        user_req.scratch =
+            read_async_info->buf_.BufferStart() + offset_advance_len;
+        read_async_info->user_aligned_buf_->reset(
+            read_async_info->buf_.Release());
+      }
+      user_req.result = Slice(user_req.scratch, res_len);
+    } else {
+      // Either req.status is not ok or data was not read.
+      user_req.result = Slice();
+    }
+    read_async_info->cb_(user_req, read_async_info->cb_arg_);
+  } else {
+    read_async_info->cb_(req, read_async_info->cb_arg_);
+  }
+
+  // Update stats and notify listeners.
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    // elapsed doesn't take into account delay and overwrite as StopWatch does
+    // in Read.
+    uint64_t elapsed = clock_->NowMicros() - read_async_info->start_time_;
+    file_read_hist_->Add(elapsed);
+  }
+  if (req.status.ok()) {
+    RecordInHistogram(stats_, ASYNC_READ_BYTES, req.result.size());
+  } else if (!req.status.IsAborted()) {
+    RecordTick(stats_, ASYNC_READ_ERROR_COUNT, 1);
+  }
+#ifndef ROCKSDB_LITE
+  if (ShouldNotifyListeners()) {
+    auto finish_ts = FileOperationInfo::FinishNow();
+    NotifyOnFileReadFinish(req.offset, req.result.size(),
+                           read_async_info->fs_start_ts_, finish_ts,
+                           req.status);
+  }
+  if (!req.status.ok()) {
+    NotifyOnIOError(req.status, FileOperationType::kRead, file_name(),
+                    req.result.size(), req.offset);
+  }
+#endif
+  RecordIOStats(stats_, file_temperature_, is_last_level_, req.result.size());
+  delete read_async_info;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/random_access_file_reader.h b/src/rocksdb/file/random_access_file_reader.h
new file mode 100644
index 000000000..ea7cfd234
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader.h
@@ -0,0 +1,217 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <sstream>
+#include <string>
+
+#include "env/file_system_tracer.h"
+#include "port/port.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/rate_limiter.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class HistogramImpl;
+class SystemClock;
+
+using AlignedBuf = std::unique_ptr<char[]>;
+
+// Align the request r according to alignment and return the aligned result.
+FSReadRequest Align(const FSReadRequest& r, size_t alignment);
+
+// Try to merge src to dest if they have overlap.
+//
+// Each request represents an inclusive interval [offset, offset + len].
+// If the intervals have overlap, update offset and len to represent the
+// merged interval, and return true.
+// Otherwise, do nothing and return false.
+bool TryMerge(FSReadRequest* dest, const FSReadRequest& src);
+
+// RandomAccessFileReader is a wrapper on top of FSRandomAccessFile. It is
+// responsible for:
+// - Handling Buffered and Direct reads appropriately.
+// - Rate limiting compaction reads.
+// - Notifying any interested listeners on the completion of a read.
+// - Updating IO stats.
+class RandomAccessFileReader {
+ private:
+#ifndef ROCKSDB_LITE
+  void NotifyOnFileReadFinish(
+      uint64_t offset, size_t length,
+      const FileOperationInfo::StartTimePoint& start_ts,
+      const FileOperationInfo::FinishTimePoint& finish_ts,
+      const Status& status) const {
+    FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts,
+                           finish_ts, status, file_temperature_);
+    info.offset = offset;
+    info.length = length;
+
+    for (auto& listener : listeners_) {
+      listener->OnFileReadFinish(info);
+    }
+    info.status.PermitUncheckedError();
+  }
+
+  void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation,
+                       const std::string& file_path, size_t length,
+                       uint64_t offset) const {
+    if (listeners_.empty()) {
+      return;
+    }
+    IOErrorInfo io_error_info(io_status, operation, file_path, length, offset);
+
+    for (auto& listener : listeners_) {
+      listener->OnIOError(io_error_info);
+    }
+    io_status.PermitUncheckedError();
+  }
+
+#endif  // ROCKSDB_LITE
+
+  bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+
+  FSRandomAccessFilePtr file_;
+  std::string file_name_;
+  SystemClock* clock_;
+  Statistics* stats_;
+  uint32_t hist_type_;
+  HistogramImpl* file_read_hist_;
+  RateLimiter* rate_limiter_;
+  std::vector<std::shared_ptr<EventListener>> listeners_;
+  const Temperature file_temperature_;
+  const bool is_last_level_;
+
+  struct ReadAsyncInfo {
+    ReadAsyncInfo(std::function<void(const FSReadRequest&, void*)> cb,
+                  void* cb_arg, uint64_t start_time)
+        : cb_(cb),
+          cb_arg_(cb_arg),
+          start_time_(start_time),
+          user_scratch_(nullptr),
+          user_aligned_buf_(nullptr),
+          user_offset_(0),
+          user_len_(0),
+          is_aligned_(false) {}
+
+    std::function<void(const FSReadRequest&, void*)> cb_;
+    void* cb_arg_;
+    uint64_t start_time_;
+#ifndef ROCKSDB_LITE
+    FileOperationInfo::StartTimePoint fs_start_ts_;
+#endif
+    // Below fields stores the parameters passed by caller in case of direct_io.
+    char* user_scratch_;
+    AlignedBuf* user_aligned_buf_;
+    uint64_t user_offset_;
+    size_t user_len_;
+    Slice user_result_;
+    // Used in case of direct_io
+    AlignedBuffer buf_;
+    bool is_aligned_;
+  };
+
+ public:
+  explicit RandomAccessFileReader(
+      std::unique_ptr<FSRandomAccessFile>&& raf, const std::string& _file_name,
+      SystemClock* clock = nullptr,
+      const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+      Statistics* stats = nullptr, uint32_t hist_type = 0,
+      HistogramImpl* file_read_hist = nullptr,
+      RateLimiter* rate_limiter = nullptr,
+      const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+      Temperature file_temperature = Temperature::kUnknown,
+      bool is_last_level = false)
+      : file_(std::move(raf), io_tracer, _file_name),
+        file_name_(std::move(_file_name)),
+        clock_(clock),
+        stats_(stats),
+        hist_type_(hist_type),
+        file_read_hist_(file_read_hist),
+        rate_limiter_(rate_limiter),
+        listeners_(),
+        file_temperature_(file_temperature),
+        is_last_level_(is_last_level) {
+#ifndef ROCKSDB_LITE
+    std::for_each(listeners.begin(), listeners.end(),
+                  [this](const std::shared_ptr<EventListener>& e) {
+                    if (e->ShouldBeNotifiedOnFileIO()) {
+                      listeners_.emplace_back(e);
+                    }
+                  });
+#else  // !ROCKSDB_LITE
+    (void)listeners;
+#endif
+  }
+
+  static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& fname, const FileOptions& file_opts,
+                         std::unique_ptr<RandomAccessFileReader>* reader,
+                         IODebugContext* dbg);
+  RandomAccessFileReader(const RandomAccessFileReader&) = delete;
+  RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;
+
+  // In non-direct IO mode,
+  // 1. if using mmap, result is stored in a buffer other than scratch;
+  // 2. if not using mmap, result is stored in the buffer starting from scratch.
+  //
+  // In direct IO mode, an aligned buffer is allocated internally.
+  // 1. If aligned_buf is null, then results are copied to the buffer
+  // starting from scratch;
+  // 2. Otherwise, scratch is not used and can be null, the aligned_buf owns
+  // the internally allocated buffer on return, and the result refers to a
+  // region in aligned_buf.
+  //
+  // `rate_limiter_priority` is used to charge the internal rate limiter when
+  // enabled. The special value `Env::IO_TOTAL` makes this operation bypass the
+  // rate limiter.
+  IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result,
+                char* scratch, AlignedBuf* aligned_buf,
+                Env::IOPriority rate_limiter_priority) const;
+
+  // REQUIRES:
+  // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing.
+  // In non-direct IO mode, aligned_buf should be null;
+  // In direct IO mode, aligned_buf stores the aligned buffer allocated inside
+  // MultiRead, the result Slices in reqs refer to aligned_buf.
+  //
+  // `rate_limiter_priority` will be used to charge the internal rate limiter.
+  // It is not yet supported so the client must provide the special value
+  // `Env::IO_TOTAL` to bypass the rate limiter.
+  IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs,
+                     size_t num_reqs, AlignedBuf* aligned_buf,
+                     Env::IOPriority rate_limiter_priority) const;
+
+  IOStatus Prefetch(uint64_t offset, size_t n,
+                    const Env::IOPriority rate_limiter_priority) const {
+    IOOptions opts;
+    opts.rate_limiter_priority = rate_limiter_priority;
+    return file_->Prefetch(offset, n, opts, nullptr);
+  }
+
+  FSRandomAccessFile* file() { return file_.get(); }
+
+  const std::string& file_name() const { return file_name_; }
+
+  bool use_direct_io() const { return file_->use_direct_io(); }
+
+  IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts);
+
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     AlignedBuf* aligned_buf);
+
+  void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg);
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/random_access_file_reader_test.cc b/src/rocksdb/file/random_access_file_reader_test.cc
new file mode 100644
index 000000000..ac0e9e57a
--- /dev/null
+++ b/src/rocksdb/file/random_access_file_reader_test.cc
@@ -0,0 +1,481 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "file/random_access_file_reader.h"
+
+#include <algorithm>
+
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/file_system.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReaderTest : public testing::Test {
+ public:
+  void SetUp() override {
+    SetupSyncPointsToMockDirectIO();
+    env_ = Env::Default();
+    fs_ = FileSystem::Default();
+    test_dir_ = test::PerThreadDBPath("random_access_file_reader_test");
+    ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+  }
+
+  void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+
+  void Write(const std::string& fname, const std::string& content) {
+    std::unique_ptr<FSWritableFile> f;
+    ASSERT_OK(fs_->NewWritableFile(Path(fname), FileOptions(), &f, nullptr));
+    ASSERT_OK(f->Append(content, IOOptions(), nullptr));
+    ASSERT_OK(f->Close(IOOptions(), nullptr));
+  }
+
+  void Read(const std::string& fname, const FileOptions& opts,
+            std::unique_ptr<RandomAccessFileReader>* reader) {
+    std::string fpath = Path(fname);
+    std::unique_ptr<FSRandomAccessFile> f;
+    ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr));
+    reader->reset(new RandomAccessFileReader(std::move(f), fpath,
+                                             env_->GetSystemClock().get()));
+  }
+
+  void AssertResult(const std::string& content,
+                    const std::vector<FSReadRequest>& reqs) {
+    for (const auto& r : reqs) {
+      ASSERT_OK(r.status);
+      ASSERT_EQ(r.len, r.result.size());
+      ASSERT_EQ(content.substr(r.offset, r.len), r.result.ToString());
+    }
+  }
+
+ private:
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  std::string test_dir_;
+
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+};
+
+// Skip the following tests in lite mode since direct I/O is unsupported.
+#ifndef ROCKSDB_LITE
+
+TEST_F(RandomAccessFileReaderTest, ReadDirectIO) {
+  std::string fname = "read-direct-io";
+  Random rand(0);
+  std::string content = rand.RandomString(kDefaultPageSize);
+  Write(fname, content);
+
+  FileOptions opts;
+  opts.use_direct_reads = true;
+  std::unique_ptr<RandomAccessFileReader> r;
+  Read(fname, opts, &r);
+  ASSERT_TRUE(r->use_direct_io());
+
+  const size_t page_size = r->file()->GetRequiredBufferAlignment();
+  size_t offset = page_size / 2;
+  size_t len = page_size / 3;
+  Slice result;
+  AlignedBuf buf;
+  for (Env::IOPriority rate_limiter_priority : {Env::IO_LOW, Env::IO_TOTAL}) {
+    ASSERT_OK(r->Read(IOOptions(), offset, len, &result, nullptr, &buf,
+                      rate_limiter_priority));
+    ASSERT_EQ(result.ToString(), content.substr(offset, len));
+  }
+}
+
+TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
+  std::vector<FSReadRequest> aligned_reqs;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* reqs) {
+        // Copy reqs, since it's allocated on stack inside MultiRead, which will
+        // be deallocated after MultiRead returns.
+        aligned_reqs = *reinterpret_cast<std::vector<FSReadRequest>*>(reqs);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Creates a file with 3 pages.
+  std::string fname = "multi-read-direct-io";
+  Random rand(0);
+  std::string content = rand.RandomString(3 * kDefaultPageSize);
+  Write(fname, content);
+
+  FileOptions opts;
+  opts.use_direct_reads = true;
+  std::unique_ptr<RandomAccessFileReader> r;
+  Read(fname, opts, &r);
+  ASSERT_TRUE(r->use_direct_io());
+
+  const size_t page_size = r->file()->GetRequiredBufferAlignment();
+
+  {
+    // Reads 2 blocks in the 1st page.
+    // The results should be SharedSlices of the same underlying buffer.
+    //
+    // Illustration (each x is a 1/4 page)
+    // First page: xxxx
+    // 1st block:  x
+    // 2nd block:    xx
+    FSReadRequest r0;
+    r0.offset = 0;
+    r0.len = page_size / 4;
+    r0.scratch = nullptr;
+
+    FSReadRequest r1;
+    r1.offset = page_size / 2;
+    r1.len = page_size / 2;
+    r1.scratch = nullptr;
+
+    std::vector<FSReadRequest> reqs;
+    reqs.push_back(std::move(r0));
+    reqs.push_back(std::move(r1));
+    AlignedBuf aligned_buf;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           Env::IO_TOTAL /* rate_limiter_priority */));
+
+    AssertResult(content, reqs);
+
+    // Reads the first page internally.
+    ASSERT_EQ(aligned_reqs.size(), 1);
+    const FSReadRequest& aligned_r = aligned_reqs[0];
+    ASSERT_OK(aligned_r.status);
+    ASSERT_EQ(aligned_r.offset, 0);
+    ASSERT_EQ(aligned_r.len, page_size);
+  }
+
+  {
+    // Reads 3 blocks:
+    // 1st block in the 1st page;
+    // 2nd block from the middle of the 1st page to the middle of the 2nd page;
+    // 3rd block in the 2nd page.
+    // The results should be SharedSlices of the same underlying buffer.
+    //
+    // Illustration (each x is a 1/4 page)
+    // 2 pages:   xxxxxxxx
+    // 1st block: x
+    // 2nd block:   xxxx
+    // 3rd block:        x
+    FSReadRequest r0;
+    r0.offset = 0;
+    r0.len = page_size / 4;
+    r0.scratch = nullptr;
+
+    FSReadRequest r1;
+    r1.offset = page_size / 2;
+    r1.len = page_size;
+    r1.scratch = nullptr;
+
+    FSReadRequest r2;
+    r2.offset = 2 * page_size - page_size / 4;
+    r2.len = page_size / 4;
+    r2.scratch = nullptr;
+
+    std::vector<FSReadRequest> reqs;
+    reqs.push_back(std::move(r0));
+    reqs.push_back(std::move(r1));
+    reqs.push_back(std::move(r2));
+    AlignedBuf aligned_buf;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           Env::IO_TOTAL /* rate_limiter_priority */));
+
+    AssertResult(content, reqs);
+
+    // Reads the first two pages in one request internally.
+    ASSERT_EQ(aligned_reqs.size(), 1);
+    const FSReadRequest& aligned_r = aligned_reqs[0];
+    ASSERT_OK(aligned_r.status);
+    ASSERT_EQ(aligned_r.offset, 0);
+    ASSERT_EQ(aligned_r.len, 2 * page_size);
+  }
+
+  {
+    // Reads 3 blocks:
+    // 1st block in the middle of the 1st page;
+    // 2nd block in the middle of the 2nd page;
+    // 3rd block in the middle of the 3rd page.
+    // The results should be SharedSlices of the same underlying buffer.
+    //
+    // Illustration (each x is a 1/4 page)
+    // 3 pages:   xxxxxxxxxxxx
+    // 1st block:  xx
+    // 2nd block:      xx
+    // 3rd block:          xx
+    FSReadRequest r0;
+    r0.offset = page_size / 4;
+    r0.len = page_size / 2;
+    r0.scratch = nullptr;
+
+    FSReadRequest r1;
+    r1.offset = page_size + page_size / 4;
+    r1.len = page_size / 2;
+    r1.scratch = nullptr;
+
+    FSReadRequest r2;
+    r2.offset = 2 * page_size + page_size / 4;
+    r2.len = page_size / 2;
+    r2.scratch = nullptr;
+
+    std::vector<FSReadRequest> reqs;
+    reqs.push_back(std::move(r0));
+    reqs.push_back(std::move(r1));
+    reqs.push_back(std::move(r2));
+    AlignedBuf aligned_buf;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           Env::IO_TOTAL /* rate_limiter_priority */));
+
+    AssertResult(content, reqs);
+
+    // Reads the first 3 pages in one request internally.
+    ASSERT_EQ(aligned_reqs.size(), 1);
+    const FSReadRequest& aligned_r = aligned_reqs[0];
+    ASSERT_OK(aligned_r.status);
+    ASSERT_EQ(aligned_r.offset, 0);
+    ASSERT_EQ(aligned_r.len, 3 * page_size);
+  }
+
+  {
+    // Reads 2 blocks:
+    // 1st block in the middle of the 1st page;
+    // 2nd block in the middle of the 3rd page.
+    // The results are two different buffers.
+    //
+    // Illustration (each x is a 1/4 page)
+    // 3 pages:   xxxxxxxxxxxx
+    // 1st block:  xx
+    // 2nd block:          xx
+    FSReadRequest r0;
+    r0.offset = page_size / 4;
+    r0.len = page_size / 2;
+    r0.scratch = nullptr;
+
+    FSReadRequest r1;
+    r1.offset = 2 * page_size + page_size / 4;
+    r1.len = page_size / 2;
+    r1.scratch = nullptr;
+
+    std::vector<FSReadRequest> reqs;
+    reqs.push_back(std::move(r0));
+    reqs.push_back(std::move(r1));
+    AlignedBuf aligned_buf;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           Env::IO_TOTAL /* rate_limiter_priority */));
+
+    AssertResult(content, reqs);
+
+    // Reads the 1st and 3rd pages in two requests internally.
+    ASSERT_EQ(aligned_reqs.size(), 2);
+    const FSReadRequest& aligned_r0 = aligned_reqs[0];
+    const FSReadRequest& aligned_r1 = aligned_reqs[1];
+    ASSERT_OK(aligned_r0.status);
+    ASSERT_EQ(aligned_r0.offset, 0);
+    ASSERT_EQ(aligned_r0.len, page_size);
+    ASSERT_OK(aligned_r1.status);
+    ASSERT_EQ(aligned_r1.offset, 2 * page_size);
+    ASSERT_EQ(aligned_r1.len, page_size);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST(FSReadRequest, Align) {
+  FSReadRequest r;
+  r.offset = 2000;
+  r.len = 2000;
+  r.scratch = nullptr;
+  ASSERT_OK(r.status);
+
+  FSReadRequest aligned_r = Align(r, 1024);
+  ASSERT_OK(r.status);
+  ASSERT_OK(aligned_r.status);
+  ASSERT_EQ(aligned_r.offset, 1024);
+  ASSERT_EQ(aligned_r.len, 3072);
+}
+
+TEST(FSReadRequest, TryMerge) {
+  // reverse means merging dest into src.
+  for (bool reverse : {true, false}) {
+    {
+      // dest: [ ]
+      //  src:      [ ]
+      FSReadRequest dest;
+      dest.offset = 0;
+      dest.len = 10;
+      dest.scratch = nullptr;
+      ASSERT_OK(dest.status);
+
+      FSReadRequest src;
+      src.offset = 15;
+      src.len = 10;
+      src.scratch = nullptr;
+      ASSERT_OK(src.status);
+
+      if (reverse) {
+        std::swap(dest, src);
+      }
+      ASSERT_FALSE(TryMerge(&dest, src));
+      ASSERT_OK(dest.status);
+      ASSERT_OK(src.status);
+    }
+
+    {
+      // dest: [ ]
+      //  src:   [ ]
+      FSReadRequest dest;
+      dest.offset = 0;
+      dest.len = 10;
+      dest.scratch = nullptr;
+      ASSERT_OK(dest.status);
+
+      FSReadRequest src;
+      src.offset = 10;
+      src.len = 10;
+      src.scratch = nullptr;
+      ASSERT_OK(src.status);
+
+      if (reverse) {
+        std::swap(dest, src);
+      }
+      ASSERT_TRUE(TryMerge(&dest, src));
+      ASSERT_EQ(dest.offset, 0);
+      ASSERT_EQ(dest.len, 20);
+      ASSERT_OK(dest.status);
+      ASSERT_OK(src.status);
+    }
+
+    {
+      // dest: [    ]
+      //  src:   [    ]
+      FSReadRequest dest;
+      dest.offset = 0;
+      dest.len = 10;
+      dest.scratch = nullptr;
+      ASSERT_OK(dest.status);
+
+      FSReadRequest src;
+      src.offset = 5;
+      src.len = 10;
+      src.scratch = nullptr;
+      ASSERT_OK(src.status);
+
+      if (reverse) {
+        std::swap(dest, src);
+      }
+      ASSERT_TRUE(TryMerge(&dest, src));
+      ASSERT_EQ(dest.offset, 0);
+      ASSERT_EQ(dest.len, 15);
+      ASSERT_OK(dest.status);
+      ASSERT_OK(src.status);
+    }
+
+    {
+      // dest: [    ]
+      //  src:   [  ]
+      FSReadRequest dest;
+      dest.offset = 0;
+      dest.len = 10;
+      dest.scratch = nullptr;
+      ASSERT_OK(dest.status);
+
+      FSReadRequest src;
+      src.offset = 5;
+      src.len = 5;
+      src.scratch = nullptr;
+      ASSERT_OK(src.status);
+
+      if (reverse) {
+        std::swap(dest, src);
+      }
+      ASSERT_TRUE(TryMerge(&dest, src));
+      ASSERT_EQ(dest.offset, 0);
+      ASSERT_EQ(dest.len, 10);
+      ASSERT_OK(dest.status);
+      ASSERT_OK(src.status);
+    }
+
+    {
+      // dest: [     ]
+      //  src:   [ ]
+      FSReadRequest dest;
+      dest.offset = 0;
+      dest.len = 10;
+      dest.scratch = nullptr;
+      ASSERT_OK(dest.status);
+
+      FSReadRequest src;
+      src.offset = 5;
+      src.len = 1;
+      src.scratch = nullptr;
+      ASSERT_OK(src.status);
+
+      if (reverse) std::swap(dest, src);
+      ASSERT_TRUE(TryMerge(&dest, src));
+      ASSERT_EQ(dest.offset, 0);
+      ASSERT_EQ(dest.len, 10);
+      ASSERT_OK(dest.status);
+      ASSERT_OK(src.status);
+    }
+
+    {
+      // dest: [ ]
+      //  src: [ ]
+      FSReadRequest dest;
+      dest.offset = 0;
+      dest.len = 10;
+      dest.scratch = nullptr;
+      ASSERT_OK(dest.status);
+
+      FSReadRequest src;
+      src.offset = 0;
+      src.len = 10;
+      src.scratch = nullptr;
+      ASSERT_OK(src.status);
+
+      if (reverse) std::swap(dest, src);
+      ASSERT_TRUE(TryMerge(&dest, src));
+      ASSERT_EQ(dest.offset, 0);
+      ASSERT_EQ(dest.len, 10);
+      ASSERT_OK(dest.status);
+      ASSERT_OK(src.status);
+    }
+
+    {
+      // dest: [   ]
+      //  src: [ ]
+      FSReadRequest dest;
+      dest.offset = 0;
+      dest.len = 10;
+      dest.scratch = nullptr;
+      ASSERT_OK(dest.status);
+
+      FSReadRequest src;
+      src.offset = 0;
+      src.len = 5;
+      src.scratch = nullptr;
+      ASSERT_OK(src.status);
+
+      if (reverse) std::swap(dest, src);
+      ASSERT_TRUE(TryMerge(&dest, src));
+      ASSERT_EQ(dest.offset, 0);
+      ASSERT_EQ(dest.len, 10);
+      ASSERT_OK(dest.status);
+      ASSERT_OK(src.status);
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/file/read_write_util.cc b/src/rocksdb/file/read_write_util.cc
new file mode 100644
index 000000000..3617a35e3
--- /dev/null
+++ b/src/rocksdb/file/read_write_util.cc
@@ -0,0 +1,33 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/read_write_util.h"
+
+#include <sstream>
+
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus NewWritableFile(FileSystem* fs, const std::string& fname,
+                         std::unique_ptr<FSWritableFile>* result,
+                         const FileOptions& options) {
+  TEST_SYNC_POINT_CALLBACK("NewWritableFile::FileOptions.temperature",
+                           const_cast<Temperature*>(&options.temperature));
+  IOStatus s = fs->NewWritableFile(fname, options, result, nullptr);
+  TEST_KILL_RANDOM_WITH_WEIGHT("NewWritableFile:0", REDUCE_ODDS2);
+  return s;
+}
+
+#ifndef NDEBUG
+bool IsFileSectorAligned(const size_t off, size_t sector_size) {
+  return off % sector_size == 0;
+}
+#endif  // NDEBUG
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/read_write_util.h b/src/rocksdb/file/read_write_util.h
new file mode 100644
index 000000000..9f034b705
--- /dev/null
+++ b/src/rocksdb/file/read_write_util.h
@@ -0,0 +1,31 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+
+#include "file/sequence_file_reader.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Returns a WritableFile.
+//
+// env     : the Env.
+// fname   : the file name.
+// result  : output arg. A WritableFile based on `fname` returned.
+// options : the Env Options.
+extern IOStatus NewWritableFile(FileSystem* fs, const std::string& fname,
+                                std::unique_ptr<FSWritableFile>* result,
+                                const FileOptions& options);
+
+#ifndef NDEBUG
+bool IsFileSectorAligned(const size_t off, size_t sector_size);
+#endif  // NDEBUG
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_file_info.h b/src/rocksdb/file/readahead_file_info.h
new file mode 100644
index 000000000..f0208bf2d
--- /dev/null
+++ b/src/rocksdb/file/readahead_file_info.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// struct ReadaheadFileInfo contains readahead information that is passed from
+// one file to another file per level during iterations. This information helps
+// iterators to carry forward the internal automatic prefetching readahead value
+// to next file during sequential reads instead of starting from the scratch.
+
+struct ReadaheadFileInfo {
+  struct ReadaheadInfo {
+    size_t readahead_size = 0;
+    int64_t num_file_reads = 0;
+  };
+
+  // Used by Data block iterators to update readahead info.
+  ReadaheadInfo data_block_readahead_info;
+
+  // Used by Index block iterators to update readahead info.
+  ReadaheadInfo index_block_readahead_info;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_raf.cc b/src/rocksdb/file/readahead_raf.cc
new file mode 100644
index 000000000..6d346432e
--- /dev/null
+++ b/src/rocksdb/file/readahead_raf.cc
@@ -0,0 +1,169 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/readahead_raf.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/read_write_util.h"
+#include "rocksdb/file_system.h"
+#include "util/aligned_buffer.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class ReadaheadRandomAccessFile : public FSRandomAccessFile {
+ public:
+  ReadaheadRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
+                            size_t readahead_size)
+      : file_(std::move(file)),
+        alignment_(file_->GetRequiredBufferAlignment()),
+        readahead_size_(Roundup(readahead_size, alignment_)),
+        buffer_(),
+        buffer_offset_(0) {
+    buffer_.Alignment(alignment_);
+    buffer_.AllocateNewBuffer(readahead_size_);
+  }
+
+  ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete;
+
+  ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) =
+      delete;
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    // Read-ahead only make sense if we have some slack left after reading
+    if (n + alignment_ >= readahead_size_) {
+      return file_->Read(offset, n, options, result, scratch, dbg);
+    }
+
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t cached_len = 0;
+    // Check if there is a cache hit, meaning that [offset, offset + n) is
+    // either completely or partially in the buffer. If it's completely cached,
+    // including end of file case when offset + n is greater than EOF, then
+    // return.
+    if (TryReadFromCache(offset, n, &cached_len, scratch) &&
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
+      *result = Slice(scratch, cached_len);
+      return IOStatus::OK();
+    }
+    size_t advanced_offset = static_cast<size_t>(offset + cached_len);
+    // In the case of cache hit advanced_offset is already aligned, means that
+    // chunk_offset equals to advanced_offset
+    size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
+
+    IOStatus s = ReadIntoBuffer(chunk_offset, readahead_size_, options, dbg);
+    if (s.ok()) {
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(advanced_offset, n - cached_len, &remaining_len,
+                       scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
+    }
+    return s;
+  }
+
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    if (n < readahead_size_) {
+      // Don't allow smaller prefetches than the configured `readahead_size_`.
+      // `Read()` assumes a smaller prefetch buffer indicates EOF was reached.
+      return IOStatus::OK();
+    }
+
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t offset_ = static_cast<size_t>(offset);
+    size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_);
+    if (prefetch_offset == buffer_offset_) {
+      return IOStatus::OK();
+    }
+    return ReadIntoBuffer(prefetch_offset,
+                          Roundup(offset_ + n, alignment_) - prefetch_offset,
+                          options, dbg);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return file_->GetUniqueId(id, max_size);
+  }
+
+  void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.Clear();
+    return file_->InvalidateCache(offset, length);
+  }
+
+  bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read from buffer_ n bytes starting at offset. If anything was read
+  // from the cache, it sets cached_len to the number of bytes actually read,
+  // copies these number of bytes to scratch and returns true.
+  // If nothing was read sets cached_len to 0 and returns false.
+  bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
+                        char* scratch) const {
+    if (offset < buffer_offset_ ||
+        offset >= buffer_offset_ + buffer_.CurrentSize()) {
+      *cached_len = 0;
+      return false;
+    }
+    uint64_t offset_in_buffer = offset - buffer_offset_;
+    *cached_len = std::min(
+        buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
+    memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+    return true;
+  }
+
+  // Reads into buffer_ the next n bytes from file_ starting at offset.
+  // Can actually read less if EOF was reached.
+  // Returns the status of the read operastion on the file.
+  IOStatus ReadIntoBuffer(uint64_t offset, size_t n, const IOOptions& options,
+                          IODebugContext* dbg) const {
+    if (n > buffer_.Capacity()) {
+      n = buffer_.Capacity();
+    }
+    assert(IsFileSectorAligned(offset, alignment_));
+    assert(IsFileSectorAligned(n, alignment_));
+    Slice result;
+    IOStatus s =
+        file_->Read(offset, n, options, &result, buffer_.BufferStart(), dbg);
+    if (s.ok()) {
+      buffer_offset_ = offset;
+      buffer_.Size(result.size());
+      assert(result.size() == 0 || buffer_.BufferStart() == result.data());
+    }
+    return s;
+  }
+
+  const std::unique_ptr<FSRandomAccessFile> file_;
+  const size_t alignment_;
+  const size_t readahead_size_;
+
+  mutable std::mutex lock_;
+  // The buffer storing the prefetched data
+  mutable AlignedBuffer buffer_;
+  // The offset in file_, corresponding to data stored in buffer_
+  mutable uint64_t buffer_offset_;
+};
+}  // namespace
+
+std::unique_ptr<FSRandomAccessFile> NewReadaheadRandomAccessFile(
+    std::unique_ptr<FSRandomAccessFile>&& file, size_t readahead_size) {
+  std::unique_ptr<FSRandomAccessFile> result(
+      new ReadaheadRandomAccessFile(std::move(file), readahead_size));
+  return result;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/readahead_raf.h b/src/rocksdb/file/readahead_raf.h
new file mode 100644
index 000000000..dfaf2b4fa
--- /dev/null
+++ b/src/rocksdb/file/readahead_raf.h
@@ -0,0 +1,29 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FSRandomAccessFile;
+// This file provides the following main abstractions:
+// SequentialFileReader : wrapper over Env::SequentialFile
+// RandomAccessFileReader : wrapper over Env::RandomAccessFile
+// WritableFileWriter : wrapper over Env::WritableFile
+// In addition, it also exposed NewReadaheadRandomAccessFile, NewWritableFile,
+// and ReadOneLine primitives.
+
+// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to
+// always prefetch additional data with every read. This is mainly used in
+// Compaction Table Readers.
+std::unique_ptr<FSRandomAccessFile> NewReadaheadRandomAccessFile(
+    std::unique_ptr<FSRandomAccessFile>&& file, size_t readahead_size);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sequence_file_reader.cc b/src/rocksdb/file/sequence_file_reader.cc
new file mode 100644
index 000000000..d51d5be46
--- /dev/null
+++ b/src/rocksdb/file/sequence_file_reader.cc
@@ -0,0 +1,328 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/sequence_file_reader.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "file/read_write_util.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+IOStatus SequentialFileReader::Create(
+    const std::shared_ptr<FileSystem>& fs, const std::string& fname,
+    const FileOptions& file_opts, std::unique_ptr<SequentialFileReader>* reader,
+    IODebugContext* dbg, RateLimiter* rate_limiter) {
+  std::unique_ptr<FSSequentialFile> file;
+  IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg);
+  if (io_s.ok()) {
+    reader->reset(new SequentialFileReader(std::move(file), fname, nullptr, {},
+                                           rate_limiter));
+  }
+  return io_s;
+}
+
+IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch,
+                                    Env::IOPriority rate_limiter_priority) {
+  IOStatus io_s;
+  if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+    //
+    //    |-offset_advance-|---bytes returned--|
+    //    |----------------------buf size-------------------------|
+    //    |                |                   |                  |
+    // aligned           offset          offset + n  Roundup(offset + n,
+    // offset                                             alignment)
+    //
+    size_t offset = offset_.fetch_add(n);
+    size_t alignment = file_->GetRequiredBufferAlignment();
+    size_t aligned_offset = TruncateToPageBoundary(alignment, offset);
+    size_t offset_advance = offset - aligned_offset;
+    size_t size = Roundup(offset + n, alignment) - aligned_offset;
+    size_t r = 0;
+    AlignedBuffer buf;
+    buf.Alignment(alignment);
+    buf.AllocateNewBuffer(size);
+
+    while (buf.CurrentSize() < size) {
+      size_t allowed;
+      if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
+        allowed = rate_limiter_->RequestToken(
+            buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
+            rate_limiter_priority, nullptr /* stats */,
+            RateLimiter::OpType::kRead);
+      } else {
+        assert(buf.CurrentSize() == 0);
+        allowed = size;
+      }
+
+      Slice tmp;
+      uint64_t orig_offset = 0;
+      FileOperationInfo::StartTimePoint start_ts;
+      if (ShouldNotifyListeners()) {
+        orig_offset = aligned_offset + buf.CurrentSize();
+        start_ts = FileOperationInfo::StartNow();
+      }
+      io_s = file_->PositionedRead(aligned_offset + buf.CurrentSize(), allowed,
+                                   IOOptions(), &tmp, buf.Destination(),
+                                   nullptr /* dbg */);
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = FileOperationInfo::FinishNow();
+        NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
+                               io_s);
+      }
+      buf.Size(buf.CurrentSize() + tmp.size());
+      if (!io_s.ok() || tmp.size() < allowed) {
+        break;
+      }
+    }
+
+    if (io_s.ok() && offset_advance < buf.CurrentSize()) {
+      r = buf.Read(scratch, offset_advance,
+                   std::min(buf.CurrentSize() - offset_advance, n));
+    }
+    *result = Slice(scratch, r);
+#endif  // !ROCKSDB_LITE
+  } else {
+    // To be paranoid, modify scratch a little bit, so in case underlying
+    // FileSystem doesn't fill the buffer but return success and `scratch`
+    // returns contains a previous block, returned value will not pass
+    // checksum.
+    // It's hard to find useful byte for direct I/O case, so we skip it.
+    if (n > 0 && scratch != nullptr) {
+      scratch[0]++;
+    }
+
+    size_t read = 0;
+    while (read < n) {
+      size_t allowed;
+      if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
+        allowed = rate_limiter_->RequestToken(
+            n - read, 0 /* alignment */, rate_limiter_priority,
+            nullptr /* stats */, RateLimiter::OpType::kRead);
+      } else {
+        allowed = n;
+      }
+#ifndef ROCKSDB_LITE
+      FileOperationInfo::StartTimePoint start_ts;
+      if (ShouldNotifyListeners()) {
+        start_ts = FileOperationInfo::StartNow();
+      }
+#endif
+      Slice tmp;
+      io_s = file_->Read(allowed, IOOptions(), &tmp, scratch + read,
+                         nullptr /* dbg */);
+#ifndef ROCKSDB_LITE
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = FileOperationInfo::FinishNow();
+        size_t offset = offset_.fetch_add(tmp.size());
+        NotifyOnFileReadFinish(offset, tmp.size(), start_ts, finish_ts, io_s);
+      }
+#endif
+      read += tmp.size();
+      if (!io_s.ok() || tmp.size() < allowed) {
+        break;
+      }
+    }
+    *result = Slice(scratch, read);
+  }
+  IOSTATS_ADD(bytes_read, result->size());
+  return io_s;
+}
+
+IOStatus SequentialFileReader::Skip(uint64_t n) {
+#ifndef ROCKSDB_LITE
+  if (use_direct_io()) {
+    offset_ += static_cast<size_t>(n);
+    return IOStatus::OK();
+  }
+#endif  // !ROCKSDB_LITE
+  return file_->Skip(n);
+}
+
+namespace {
+// This class wraps a SequentialFile, exposing same API, with the differenece
+// of being able to prefetch up to readahead_size bytes and then serve them
+// from memory, avoiding the entire round-trip if, for example, the data for the
+// file is actually remote.
+class ReadaheadSequentialFile : public FSSequentialFile {
+ public:
+  ReadaheadSequentialFile(std::unique_ptr<FSSequentialFile>&& file,
+                          size_t readahead_size)
+      : file_(std::move(file)),
+        alignment_(file_->GetRequiredBufferAlignment()),
+        readahead_size_(Roundup(readahead_size, alignment_)),
+        buffer_(),
+        buffer_offset_(0),
+        read_offset_(0) {
+    buffer_.Alignment(alignment_);
+    buffer_.AllocateNewBuffer(readahead_size_);
+  }
+
+  ReadaheadSequentialFile(const ReadaheadSequentialFile&) = delete;
+
+  ReadaheadSequentialFile& operator=(const ReadaheadSequentialFile&) = delete;
+
+  IOStatus Read(size_t n, const IOOptions& opts, Slice* result, char* scratch,
+                IODebugContext* dbg) override {
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t cached_len = 0;
+    // Check if there is a cache hit, meaning that [offset, offset + n) is
+    // either completely or partially in the buffer. If it's completely cached,
+    // including end of file case when offset + n is greater than EOF, then
+    // return.
+    if (TryReadFromCache(n, &cached_len, scratch) &&
+        (cached_len == n || buffer_.CurrentSize() < readahead_size_)) {
+      // We read exactly what we needed, or we hit end of file - return.
+      *result = Slice(scratch, cached_len);
+      return IOStatus::OK();
+    }
+    n -= cached_len;
+
+    IOStatus s;
+    // Read-ahead only make sense if we have some slack left after reading
+    if (n + alignment_ >= readahead_size_) {
+      s = file_->Read(n, opts, result, scratch + cached_len, dbg);
+      if (s.ok()) {
+        read_offset_ += result->size();
+        *result = Slice(scratch, cached_len + result->size());
+      }
+      buffer_.Clear();
+      return s;
+    }
+
+    s = ReadIntoBuffer(readahead_size_, opts, dbg);
+    if (s.ok()) {
+      // The data we need is now in cache, so we can safely read it
+      size_t remaining_len;
+      TryReadFromCache(n, &remaining_len, scratch + cached_len);
+      *result = Slice(scratch, cached_len + remaining_len);
+    }
+    return s;
+  }
+
+  IOStatus Skip(uint64_t n) override {
+    std::unique_lock<std::mutex> lk(lock_);
+    IOStatus s = IOStatus::OK();
+    // First check if we need to skip already cached data
+    if (buffer_.CurrentSize() > 0) {
+      // Do we need to skip beyond cached data?
+      if (read_offset_ + n >= buffer_offset_ + buffer_.CurrentSize()) {
+        // Yes. Skip whaterver is in memory and adjust offset accordingly
+        n -= buffer_offset_ + buffer_.CurrentSize() - read_offset_;
+        read_offset_ = buffer_offset_ + buffer_.CurrentSize();
+      } else {
+        // No. The entire section to be skipped is entirely i cache.
+        read_offset_ += n;
+        n = 0;
+      }
+    }
+    if (n > 0) {
+      // We still need to skip more, so call the file API for skipping
+      s = file_->Skip(n);
+      if (s.ok()) {
+        read_offset_ += n;
+      }
+      buffer_.Clear();
+    }
+    return s;
+  }
+
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& opts,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override {
+    return file_->PositionedRead(offset, n, opts, result, scratch, dbg);
+  }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.Clear();
+    return file_->InvalidateCache(offset, length);
+  }
+
+  bool use_direct_io() const override { return file_->use_direct_io(); }
+
+ private:
+  // Tries to read from buffer_ n bytes. If anything was read from the cache, it
+  // sets cached_len to the number of bytes actually read, copies these number
+  // of bytes to scratch and returns true.
+  // If nothing was read sets cached_len to 0 and returns false.
+  bool TryReadFromCache(size_t n, size_t* cached_len, char* scratch) {
+    if (read_offset_ < buffer_offset_ ||
+        read_offset_ >= buffer_offset_ + buffer_.CurrentSize()) {
+      *cached_len = 0;
+      return false;
+    }
+    uint64_t offset_in_buffer = read_offset_ - buffer_offset_;
+    *cached_len = std::min(
+        buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
+    memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
+    read_offset_ += *cached_len;
+    return true;
+  }
+
+  // Reads into buffer_ the next n bytes from file_.
+  // Can actually read less if EOF was reached.
+  // Returns the status of the read operastion on the file.
+  IOStatus ReadIntoBuffer(size_t n, const IOOptions& opts,
+                          IODebugContext* dbg) {
+    if (n > buffer_.Capacity()) {
+      n = buffer_.Capacity();
+    }
+    assert(IsFileSectorAligned(n, alignment_));
+    Slice result;
+    IOStatus s = file_->Read(n, opts, &result, buffer_.BufferStart(), dbg);
+    if (s.ok()) {
+      buffer_offset_ = read_offset_;
+      buffer_.Size(result.size());
+      assert(result.size() == 0 || buffer_.BufferStart() == result.data());
+    }
+    return s;
+  }
+
+  const std::unique_ptr<FSSequentialFile> file_;
+  const size_t alignment_;
+  const size_t readahead_size_;
+
+  std::mutex lock_;
+  // The buffer storing the prefetched data
+  AlignedBuffer buffer_;
+  // The offset in file_, corresponding to data stored in buffer_
+  uint64_t buffer_offset_;
+  // The offset up to which data was read from file_. In fact, it can be larger
+  // than the actual file size, since the file_->Skip(n) call doesn't return the
+  // actual number of bytes that were skipped, which can be less than n.
+  // This is not a problemm since read_offset_ is monotonically increasing and
+  // its only use is to figure out if next piece of data should be read from
+  // buffer_ or file_ directly.
+  uint64_t read_offset_;
+};
+}  // namespace
+
+std::unique_ptr<FSSequentialFile>
+SequentialFileReader::NewReadaheadSequentialFile(
+    std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size) {
+  if (file->GetRequiredBufferAlignment() >= readahead_size) {
+    // Short-circuit and return the original file if readahead_size is
+    // too small and hence doesn't make sense to be used for prefetching.
+    return std::move(file);
+  }
+  std::unique_ptr<FSSequentialFile> result(
+      new ReadaheadSequentialFile(std::move(file), readahead_size));
+  return result;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sequence_file_reader.h b/src/rocksdb/file/sequence_file_reader.h
new file mode 100644
index 000000000..baea10eb7
--- /dev/null
+++ b/src/rocksdb/file/sequence_file_reader.h
@@ -0,0 +1,129 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <string>
+
+#include "env/file_system_tracer.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SequentialFileReader is a wrapper on top of Env::SequentialFile. It handles
+// Buffered (i.e when page cache is enabled) and Direct (with O_DIRECT / page
+// cache disabled) reads appropriately, and also updates the IO stats.
+class SequentialFileReader {
+ private:
+#ifndef ROCKSDB_LITE
+  void NotifyOnFileReadFinish(
+      uint64_t offset, size_t length,
+      const FileOperationInfo::StartTimePoint& start_ts,
+      const FileOperationInfo::FinishTimePoint& finish_ts,
+      const Status& status) const {
+    FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts,
+                           finish_ts, status);
+    info.offset = offset;
+    info.length = length;
+
+    for (auto& listener : listeners_) {
+      listener->OnFileReadFinish(info);
+    }
+    info.status.PermitUncheckedError();
+  }
+
+  void AddFileIOListeners(
+      const std::vector<std::shared_ptr<EventListener>>& listeners) {
+    std::for_each(listeners.begin(), listeners.end(),
+                  [this](const std::shared_ptr<EventListener>& e) {
+                    if (e->ShouldBeNotifiedOnFileIO()) {
+                      listeners_.emplace_back(e);
+                    }
+                  });
+  }
+#endif  // ROCKSDB_LITE
+
+  bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+
+  std::string file_name_;
+  FSSequentialFilePtr file_;
+  std::atomic<size_t> offset_{0};  // read offset
+  std::vector<std::shared_ptr<EventListener>> listeners_{};
+  RateLimiter* rate_limiter_;
+
+ public:
+  explicit SequentialFileReader(
+      std::unique_ptr<FSSequentialFile>&& _file, const std::string& _file_name,
+      const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+      const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+      RateLimiter* rate_limiter =
+          nullptr)  // TODO: migrate call sites to provide rate limiter
+      : file_name_(_file_name),
+        file_(std::move(_file), io_tracer, _file_name),
+        listeners_(),
+        rate_limiter_(rate_limiter) {
+#ifndef ROCKSDB_LITE
+    AddFileIOListeners(listeners);
+#else
+    (void)listeners;
+#endif
+  }
+
+  explicit SequentialFileReader(
+      std::unique_ptr<FSSequentialFile>&& _file, const std::string& _file_name,
+      size_t _readahead_size,
+      const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+      const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+      RateLimiter* rate_limiter =
+          nullptr)  // TODO: migrate call sites to provide rate limiter
+      : file_name_(_file_name),
+        file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size),
+              io_tracer, _file_name),
+        listeners_(),
+        rate_limiter_(rate_limiter) {
+#ifndef ROCKSDB_LITE
+    AddFileIOListeners(listeners);
+#else
+    (void)listeners;
+#endif
+  }
+  static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& fname, const FileOptions& file_opts,
+                         std::unique_ptr<SequentialFileReader>* reader,
+                         IODebugContext* dbg, RateLimiter* rate_limiter);
+
+  SequentialFileReader(const SequentialFileReader&) = delete;
+  SequentialFileReader& operator=(const SequentialFileReader&) = delete;
+
+  // `rate_limiter_priority` is used to charge the internal rate limiter when
+  // enabled. The special value `Env::IO_TOTAL` makes this operation bypass the
+  // rate limiter. The amount charged to the internal rate limiter is n, even
+  // when less than n bytes are actually read (e.g. at end of file). To avoid
+  // overcharging the rate limiter, the caller can use file size to cap n to
+  // read until end of file.
+  IOStatus Read(size_t n, Slice* result, char* scratch,
+                Env::IOPriority rate_limiter_priority);
+
+  IOStatus Skip(uint64_t n);
+
+  FSSequentialFile* file() { return file_.get(); }
+
+  std::string file_name() { return file_name_; }
+
+  bool use_direct_io() const { return file_->use_direct_io(); }
+
+ private:
+  // NewReadaheadSequentialFile provides a wrapper over SequentialFile to
+  // always prefetch additional data with every read.
+  static std::unique_ptr<FSSequentialFile> NewReadaheadSequentialFile(
+      std::unique_ptr<FSSequentialFile>&& file, size_t readahead_size);
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sst_file_manager_impl.cc b/src/rocksdb/file/sst_file_manager_impl.cc
new file mode 100644
index 000000000..7053e6a07
--- /dev/null
+++ b/src/rocksdb/file/sst_file_manager_impl.cc
@@ -0,0 +1,525 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "file/sst_file_manager_impl.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/sst_file_manager.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+SstFileManagerImpl::SstFileManagerImpl(
+    const std::shared_ptr<SystemClock>& clock,
+    const std::shared_ptr<FileSystem>& fs,
+    const std::shared_ptr<Logger>& logger, int64_t rate_bytes_per_sec,
+    double max_trash_db_ratio, uint64_t bytes_max_delete_chunk)
+    : clock_(clock),
+      fs_(fs),
+      logger_(logger),
+      total_files_size_(0),
+      compaction_buffer_size_(0),
+      cur_compactions_reserved_size_(0),
+      max_allowed_space_(0),
+      delete_scheduler_(clock_.get(), fs_.get(), rate_bytes_per_sec,
+                        logger.get(), this, max_trash_db_ratio,
+                        bytes_max_delete_chunk),
+      cv_(&mu_),
+      closing_(false),
+      bg_thread_(nullptr),
+      reserved_disk_buffer_(0),
+      free_space_trigger_(0),
+      cur_instance_(nullptr) {}
+
+SstFileManagerImpl::~SstFileManagerImpl() {
+  Close();
+  bg_err_.PermitUncheckedError();
+}
+
+void SstFileManagerImpl::Close() {
+  {
+    MutexLock l(&mu_);
+    if (closing_) {
+      return;
+    }
+    closing_ = true;
+    cv_.SignalAll();
+  }
+  if (bg_thread_) {
+    bg_thread_->join();
+  }
+}
+
+Status SstFileManagerImpl::OnAddFile(const std::string& file_path) {
+  uint64_t file_size;
+  Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+  if (s.ok()) {
+    MutexLock l(&mu_);
+    OnAddFileImpl(file_path, file_size);
+  }
+  TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile",
+                           const_cast<std::string*>(&file_path));
+  return s;
+}
+
+Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
+                                     uint64_t file_size) {
+  MutexLock l(&mu_);
+  OnAddFileImpl(file_path, file_size);
+  TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile",
+                           const_cast<std::string*>(&file_path));
+  return Status::OK();
+}
+
+Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) {
+  {
+    MutexLock l(&mu_);
+    OnDeleteFileImpl(file_path);
+  }
+  TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnDeleteFile",
+                           const_cast<std::string*>(&file_path));
+  return Status::OK();
+}
+
+void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) {
+  MutexLock l(&mu_);
+  uint64_t size_added_by_compaction = 0;
+  for (size_t i = 0; i < c->num_input_levels(); i++) {
+    for (size_t j = 0; j < c->num_input_files(i); j++) {
+      FileMetaData* filemeta = c->input(i, j);
+      size_added_by_compaction += filemeta->fd.GetFileSize();
+    }
+  }
+  cur_compactions_reserved_size_ -= size_added_by_compaction;
+}
+
+Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
+                                      const std::string& new_path,
+                                      uint64_t* file_size) {
+  {
+    MutexLock l(&mu_);
+    if (file_size != nullptr) {
+      *file_size = tracked_files_[old_path];
+    }
+    OnAddFileImpl(new_path, tracked_files_[old_path]);
+    OnDeleteFileImpl(old_path);
+  }
+  TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
+  return Status::OK();
+}
+
+void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) {
+  MutexLock l(&mu_);
+  max_allowed_space_ = max_allowed_space;
+}
+
+void SstFileManagerImpl::SetCompactionBufferSize(
+    uint64_t compaction_buffer_size) {
+  MutexLock l(&mu_);
+  compaction_buffer_size_ = compaction_buffer_size;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReached() {
+  MutexLock l(&mu_);
+  if (max_allowed_space_ <= 0) {
+    return false;
+  }
+  return total_files_size_ >= max_allowed_space_;
+}
+
+bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() {
+  MutexLock l(&mu_);
+  if (max_allowed_space_ <= 0) {
+    return false;
+  }
+  return total_files_size_ + cur_compactions_reserved_size_ >=
+         max_allowed_space_;
+}
+
+bool SstFileManagerImpl::EnoughRoomForCompaction(
+    ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+    const Status& bg_error) {
+  MutexLock l(&mu_);
+  uint64_t size_added_by_compaction = 0;
+  // First check if we even have the space to do the compaction
+  for (size_t i = 0; i < inputs.size(); i++) {
+    for (size_t j = 0; j < inputs[i].size(); j++) {
+      FileMetaData* filemeta = inputs[i][j];
+      size_added_by_compaction += filemeta->fd.GetFileSize();
+    }
+  }
+
+  // Update cur_compactions_reserved_size_ so concurrent compaction
+  // don't max out space
+  size_t needed_headroom = cur_compactions_reserved_size_ +
+                           size_added_by_compaction + compaction_buffer_size_;
+  if (max_allowed_space_ != 0 &&
+      (needed_headroom + total_files_size_ > max_allowed_space_)) {
+    return false;
+  }
+
+  // Implement more aggressive checks only if this DB instance has already
+  // seen a NoSpace() error. This is tin order to contain a single potentially
+  // misbehaving DB instance and prevent it from slowing down compactions of
+  // other DB instances
+  if (bg_error.IsNoSpace() && CheckFreeSpace()) {
+    auto fn =
+        TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(),
+                      inputs[0][0]->fd.GetPathId());
+    uint64_t free_space = 0;
+    Status s = fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr);
+    s.PermitUncheckedError();  // TODO: Check the status
+    // needed_headroom is based on current size reserved by compactions,
+    // minus any files created by running compactions as they would count
+    // against the reserved size. If user didn't specify any compaction
+    // buffer, add reserved_disk_buffer_ that's calculated by default so the
+    // compaction doesn't end up leaving nothing for logs and flush SSTs
+    if (compaction_buffer_size_ == 0) {
+      needed_headroom += reserved_disk_buffer_;
+    }
+    if (free_space < needed_headroom + size_added_by_compaction) {
+      // We hit the condition of not enough disk space
+      ROCKS_LOG_ERROR(logger_,
+                      "free space [%" PRIu64
+                      " bytes] is less than "
+                      "needed headroom [%" ROCKSDB_PRIszt " bytes]\n",
+                      free_space, needed_headroom);
+      return false;
+    }
+  }
+
+  cur_compactions_reserved_size_ += size_added_by_compaction;
+  // Take a snapshot of cur_compactions_reserved_size_ for when we encounter
+  // a NoSpace error.
+  free_space_trigger_ = cur_compactions_reserved_size_;
+  return true;
+}
+
+uint64_t SstFileManagerImpl::GetCompactionsReservedSize() {
+  MutexLock l(&mu_);
+  return cur_compactions_reserved_size_;
+}
+
+uint64_t SstFileManagerImpl::GetTotalSize() {
+  MutexLock l(&mu_);
+  return total_files_size_;
+}
+
+std::unordered_map<std::string, uint64_t>
+SstFileManagerImpl::GetTrackedFiles() {
+  MutexLock l(&mu_);
+  return tracked_files_;
+}
+
+int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
+  return delete_scheduler_.GetRateBytesPerSecond();
+}
+
+void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) {
+  return delete_scheduler_.SetRateBytesPerSecond(delete_rate);
+}
+
+double SstFileManagerImpl::GetMaxTrashDBRatio() {
+  return delete_scheduler_.GetMaxTrashDBRatio();
+}
+
+void SstFileManagerImpl::SetMaxTrashDBRatio(double r) {
+  return delete_scheduler_.SetMaxTrashDBRatio(r);
+}
+
+uint64_t SstFileManagerImpl::GetTotalTrashSize() {
+  return delete_scheduler_.GetTotalTrashSize();
+}
+
+void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size,
+                                           const std::string& path) {
+  MutexLock l(&mu_);
+
+  reserved_disk_buffer_ += size;
+  if (path_.empty()) {
+    path_ = path;
+  }
+}
+
+void SstFileManagerImpl::ClearError() {
+  while (true) {
+    MutexLock l(&mu_);
+
+    if (error_handler_list_.empty() || closing_) {
+      return;
+    }
+
+    uint64_t free_space = 0;
+    Status s = fs_->GetFreeSpace(path_, IOOptions(), &free_space, nullptr);
+    free_space = max_allowed_space_ > 0
+                     ? std::min(max_allowed_space_, free_space)
+                     : free_space;
+    if (s.ok()) {
+      // In case of multi-DB instances, some of them may have experienced a
+      // soft error and some a hard error. In the SstFileManagerImpl, a hard
+      // error will basically override previously reported soft errors. Once
+      // we clear the hard error, we don't keep track of previous errors for
+      // now
+      if (bg_err_.severity() == Status::Severity::kHardError) {
+        if (free_space < reserved_disk_buffer_) {
+          ROCKS_LOG_ERROR(logger_,
+                          "free space [%" PRIu64
+                          " bytes] is less than "
+                          "required disk buffer [%" PRIu64 " bytes]\n",
+                          free_space, reserved_disk_buffer_);
+          ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n");
+          s = Status::NoSpace();
+        }
+      } else if (bg_err_.severity() == Status::Severity::kSoftError) {
+        if (free_space < free_space_trigger_) {
+          ROCKS_LOG_WARN(logger_,
+                         "free space [%" PRIu64
+                         " bytes] is less than "
+                         "free space for compaction trigger [%" PRIu64
+                         " bytes]\n",
+                         free_space, free_space_trigger_);
+          ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n");
+          s = Status::NoSpace();
+        }
+      }
+    }
+
+    // Someone could have called CancelErrorRecovery() and the list could have
+    // become empty, so check again here
+    if (s.ok()) {
+      assert(!error_handler_list_.empty());
+      auto error_handler = error_handler_list_.front();
+      // Since we will release the mutex, set cur_instance_ to signal to the
+      // shutdown thread, if it calls // CancelErrorRecovery() the meantime,
+      // to indicate that this DB instance is busy. The DB instance is
+      // guaranteed to not be deleted before RecoverFromBGError() returns,
+      // since the ErrorHandler::recovery_in_prog_ flag would be true
+      cur_instance_ = error_handler;
+      mu_.Unlock();
+      s = error_handler->RecoverFromBGError();
+      TEST_SYNC_POINT("SstFileManagerImpl::ErrorCleared");
+      mu_.Lock();
+      // The DB instance might have been deleted while we were
+      // waiting for the mutex, so check cur_instance_ to make sure its
+      // still non-null
+      if (cur_instance_) {
+        // Check for error again, since the instance may have recovered but
+        // immediately got another error. If that's the case, and the new
+        // error is also a NoSpace() non-fatal error, leave the instance in
+        // the list
+        Status err = cur_instance_->GetBGError();
+        if (s.ok() && err.subcode() == IOStatus::SubCode::kNoSpace &&
+            err.severity() < Status::Severity::kFatalError) {
+          s = err;
+        }
+        cur_instance_ = nullptr;
+      }
+
+      if (s.ok() || s.IsShutdownInProgress() ||
+          (!s.ok() && s.severity() >= Status::Severity::kFatalError)) {
+        // If shutdown is in progress, abandon this handler instance
+        // and continue with the others
+        error_handler_list_.pop_front();
+      }
+    }
+
+    if (!error_handler_list_.empty()) {
+      // If there are more instances to be recovered, reschedule after 5
+      // seconds
+      int64_t wait_until = clock_->NowMicros() + 5000000;
+      cv_.TimedWait(wait_until);
+    }
+
+    // Check again for error_handler_list_ empty, as a DB instance shutdown
+    // could have removed it from the queue while we were in timed wait
+    if (error_handler_list_.empty()) {
+      ROCKS_LOG_INFO(logger_, "Clearing error\n");
+      bg_err_ = Status::OK();
+      return;
+    }
+  }
+}
+
+void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler,
+                                            Status bg_error) {
+  MutexLock l(&mu_);
+  if (bg_error.severity() == Status::Severity::kSoftError) {
+    if (bg_err_.ok()) {
+      // Setting bg_err_ basically means we're in degraded mode
+      // Assume that all pending compactions will fail similarly. The trigger
+      // for clearing this condition is set to current compaction reserved
+      // size, so we stop checking disk space available in
+      // EnoughRoomForCompaction once this much free space is available
+      bg_err_ = bg_error;
+    }
+  } else if (bg_error.severity() == Status::Severity::kHardError) {
+    bg_err_ = bg_error;
+  } else {
+    assert(false);
+  }
+
+  // If this is the first instance of this error, kick of a thread to poll
+  // and recover from this condition
+  if (error_handler_list_.empty()) {
+    error_handler_list_.push_back(handler);
+    // Release lock before calling join. Its ok to do so because
+    // error_handler_list_ is now non-empty, so no other invocation of this
+    // function will execute this piece of code
+    mu_.Unlock();
+    if (bg_thread_) {
+      bg_thread_->join();
+    }
+    // Start a new thread. The previous one would have exited.
+    bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this));
+    mu_.Lock();
+  } else {
+    // Check if this DB instance is already in the list
+    for (auto iter = error_handler_list_.begin();
+         iter != error_handler_list_.end(); ++iter) {
+      if ((*iter) == handler) {
+        return;
+      }
+    }
+    error_handler_list_.push_back(handler);
+  }
+}
+
+bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) {
+  MutexLock l(&mu_);
+
+  if (cur_instance_ == handler) {
+    // This instance is currently busy attempting to recover
+    // Nullify it so the recovery thread doesn't attempt to access it again
+    cur_instance_ = nullptr;
+    return false;
+  }
+
+  for (auto iter = error_handler_list_.begin();
+       iter != error_handler_list_.end(); ++iter) {
+    if ((*iter) == handler) {
+      error_handler_list_.erase(iter);
+      return true;
+    }
+  }
+  return false;
+}
+
+Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path,
+                                                const std::string& path_to_sync,
+                                                const bool force_bg) {
+  TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::ScheduleFileDeletion",
+                           const_cast<std::string*>(&file_path));
+  return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg);
+}
+
+void SstFileManagerImpl::WaitForEmptyTrash() {
+  delete_scheduler_.WaitForEmptyTrash();
+}
+
+void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
+                                       uint64_t file_size) {
+  auto tracked_file = tracked_files_.find(file_path);
+  if (tracked_file != tracked_files_.end()) {
+    // File was added before, we will just update the size
+    total_files_size_ -= tracked_file->second;
+    total_files_size_ += file_size;
+    cur_compactions_reserved_size_ -= file_size;
+  } else {
+    total_files_size_ += file_size;
+  }
+  tracked_files_[file_path] = file_size;
+}
+
+void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) {
+  auto tracked_file = tracked_files_.find(file_path);
+  if (tracked_file == tracked_files_.end()) {
+    // File is not tracked
+    return;
+  }
+
+  total_files_size_ -= tracked_file->second;
+  tracked_files_.erase(tracked_file);
+}
+
+SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log,
+                                  std::string trash_dir,
+                                  int64_t rate_bytes_per_sec,
+                                  bool delete_existing_trash, Status* status,
+                                  double max_trash_db_ratio,
+                                  uint64_t bytes_max_delete_chunk) {
+  const auto& fs = env->GetFileSystem();
+  return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec,
+                           delete_existing_trash, status, max_trash_db_ratio,
+                           bytes_max_delete_chunk);
+}
+
+SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<FileSystem> fs,
+                                  std::shared_ptr<Logger> info_log,
+                                  const std::string& trash_dir,
+                                  int64_t rate_bytes_per_sec,
+                                  bool delete_existing_trash, Status* status,
+                                  double max_trash_db_ratio,
+                                  uint64_t bytes_max_delete_chunk) {
+  const auto& clock = env->GetSystemClock();
+  SstFileManagerImpl* res =
+      new SstFileManagerImpl(clock, fs, info_log, rate_bytes_per_sec,
+                             max_trash_db_ratio, bytes_max_delete_chunk);
+
+  // trash_dir is deprecated and not needed anymore, but if user passed it
+  // we will still remove files in it.
+  Status s = Status::OK();
+  if (delete_existing_trash && trash_dir != "") {
+    std::vector<std::string> files_in_trash;
+    s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr);
+    if (s.ok()) {
+      for (const std::string& trash_file : files_in_trash) {
+        std::string path_in_trash = trash_dir + "/" + trash_file;
+        res->OnAddFile(path_in_trash);
+        Status file_delete =
+            res->ScheduleFileDeletion(path_in_trash, trash_dir);
+        if (s.ok() && !file_delete.ok()) {
+          s = file_delete;
+        }
+      }
+    }
+  }
+
+  if (status) {
+    *status = s;
+  } else {
+    // No one passed us a Status, so they must not care about the error...
+    s.PermitUncheckedError();
+  }
+
+  return res;
+}
+
+#else
+
+SstFileManager* NewSstFileManager(Env* /*env*/,
+                                  std::shared_ptr<Logger> /*info_log*/,
+                                  std::string /*trash_dir*/,
+                                  int64_t /*rate_bytes_per_sec*/,
+                                  bool /*delete_existing_trash*/,
+                                  Status* status, double /*max_trash_db_ratio*/,
+                                  uint64_t /*bytes_max_delete_chunk*/) {
+  if (status) {
+    *status =
+        Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE");
+  }
+  return nullptr;
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/sst_file_manager_impl.h b/src/rocksdb/file/sst_file_manager_impl.h
new file mode 100644
index 000000000..548eb57f8
--- /dev/null
+++ b/src/rocksdb/file/sst_file_manager_impl.h
@@ -0,0 +1,195 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "db/compaction/compaction.h"
+#include "file/delete_scheduler.h"
+#include "port/port.h"
+#include "rocksdb/sst_file_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ErrorHandler;
+class FileSystem;
+class SystemClock;
+class Logger;
+
+// SstFileManager is used to track SST and blob files in the DB and control
+// their deletion rate. All SstFileManager public functions are thread-safe.
+class SstFileManagerImpl : public SstFileManager {
+ public:
+  explicit SstFileManagerImpl(const std::shared_ptr<SystemClock>& clock,
+                              const std::shared_ptr<FileSystem>& fs,
+                              const std::shared_ptr<Logger>& logger,
+                              int64_t rate_bytes_per_sec,
+                              double max_trash_db_ratio,
+                              uint64_t bytes_max_delete_chunk);
+
+  ~SstFileManagerImpl();
+
+  // DB will call OnAddFile whenever a new sst/blob file is added.
+  Status OnAddFile(const std::string& file_path);
+
+  // Overload where size of the file is provided by the caller rather than
+  // queried from the filesystem. This is an optimization.
+  Status OnAddFile(const std::string& file_path, uint64_t file_size);
+
+  // DB will call OnDeleteFile whenever a sst/blob file is deleted.
+  Status OnDeleteFile(const std::string& file_path);
+
+  // DB will call OnMoveFile whenever a sst/blob file is move to a new path.
+  Status OnMoveFile(const std::string& old_path, const std::string& new_path,
+                    uint64_t* file_size = nullptr);
+
+  // Update the maximum allowed space that should be used by RocksDB, if
+  // the total size of the SST and blob files exceeds max_allowed_space, writes
+  // to RocksDB will fail.
+  //
+  // Setting max_allowed_space to 0 will disable this feature, maximum allowed
+  // space will be infinite (Default value).
+  //
+  // thread-safe.
+  void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override;
+
+  void SetCompactionBufferSize(uint64_t compaction_buffer_size) override;
+
+  // Return true if the total size of SST and blob files exceeded the maximum
+  // allowed space usage.
+  //
+  // thread-safe.
+  bool IsMaxAllowedSpaceReached() override;
+
+  bool IsMaxAllowedSpaceReachedIncludingCompactions() override;
+
+  // Returns true is there is enough (approximate) space for the specified
+  // compaction. Space is approximate because this function conservatively
+  // estimates how much space is currently being used by compactions (i.e.
+  // if a compaction has started, this function bumps the used space by
+  // the full compaction size).
+  bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+                               const std::vector<CompactionInputFiles>& inputs,
+                               const Status& bg_error);
+
+  // Bookkeeping so total_file_sizes_ goes back to normal after compaction
+  // finishes
+  void OnCompactionCompletion(Compaction* c);
+
+  uint64_t GetCompactionsReservedSize();
+
+  // Return the total size of all tracked files.
+  uint64_t GetTotalSize() override;
+
+  // Return a map containing all tracked files and there corresponding sizes.
+  std::unordered_map<std::string, uint64_t> GetTrackedFiles() override;
+
+  // Return delete rate limit in bytes per second.
+  virtual int64_t GetDeleteRateBytesPerSecond() override;
+
+  // Update the delete rate limit in bytes per second.
+  virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) override;
+
+  // Return trash/DB size ratio where new files will be deleted immediately
+  virtual double GetMaxTrashDBRatio() override;
+
+  // Update trash/DB size ratio where new files will be deleted immediately
+  virtual void SetMaxTrashDBRatio(double ratio) override;
+
+  // Return the total size of trash files
+  uint64_t GetTotalTrashSize() override;
+
+  // Called by each DB instance using this sst file manager to reserve
+  // disk buffer space for recovery from out of space errors
+  void ReserveDiskBuffer(uint64_t buffer, const std::string& path);
+
+  // Set a flag upon encountering disk full. May enqueue the ErrorHandler
+  // instance for background polling and recovery
+  void StartErrorRecovery(ErrorHandler* db, Status bg_error);
+
+  // Remove the given Errorhandler instance from the recovery queue. Its
+  // not guaranteed
+  bool CancelErrorRecovery(ErrorHandler* db);
+
+  // Mark file as trash and schedule it's deletion. If force_bg is set, it
+  // forces the file to be deleting in the background regardless of DB size,
+  // except when rate limited delete is disabled
+  virtual Status ScheduleFileDeletion(const std::string& file_path,
+                                      const std::string& dir_to_sync,
+                                      const bool force_bg = false);
+
+  // Wait for all files being deleteing in the background to finish or for
+  // destructor to be called.
+  virtual void WaitForEmptyTrash();
+
+  DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
+
+  // Stop the error recovery background thread. This should be called only
+  // once in the object's lifetime, and before the destructor
+  void Close();
+
+  void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) override {
+    stats_ = stats;
+    delete_scheduler_.SetStatisticsPtr(stats);
+  }
+
+ private:
+  // REQUIRES: mutex locked
+  void OnAddFileImpl(const std::string& file_path, uint64_t file_size);
+  // REQUIRES: mutex locked
+  void OnDeleteFileImpl(const std::string& file_path);
+
+  void ClearError();
+  bool CheckFreeSpace() {
+    return bg_err_.severity() == Status::Severity::kSoftError;
+  }
+
+  std::shared_ptr<SystemClock> clock_;
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<Logger> logger_;
+  // Mutex to protect tracked_files_, total_files_size_
+  port::Mutex mu_;
+  // The summation of the sizes of all files in tracked_files_ map
+  uint64_t total_files_size_;
+  // Compactions should only execute if they can leave at least
+  // this amount of buffer space for logs and flushes
+  uint64_t compaction_buffer_size_;
+  // Estimated size of the current ongoing compactions
+  uint64_t cur_compactions_reserved_size_;
+  // A map containing all tracked files and there sizes
+  //  file_path => file_size
+  std::unordered_map<std::string, uint64_t> tracked_files_;
+  // The maximum allowed space (in bytes) for sst and blob files.
+  uint64_t max_allowed_space_;
+  // DeleteScheduler used to throttle file deletion.
+  DeleteScheduler delete_scheduler_;
+  port::CondVar cv_;
+  // Flag to force error recovery thread to exit
+  bool closing_;
+  // Background error recovery thread
+  std::unique_ptr<port::Thread> bg_thread_;
+  // A path in the filesystem corresponding to this SFM. This is used for
+  // calling Env::GetFreeSpace. Posix requires a path in the filesystem
+  std::string path_;
+  // Save the current background error
+  Status bg_err_;
+  // Amount of free disk headroom before allowing recovery from hard errors
+  uint64_t reserved_disk_buffer_;
+  // For soft errors, amount of free disk space before we can allow
+  // compactions to run full throttle. If disk space is below this trigger,
+  // compactions will be gated by free disk space > input size
+  uint64_t free_space_trigger_;
+  // List of database error handler instances tracked by this SstFileManager.
+  std::list<ErrorHandler*> error_handler_list_;
+  // Pointer to ErrorHandler instance that is currently processing recovery
+  ErrorHandler* cur_instance_;
+  std::shared_ptr<Statistics> stats_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/file/writable_file_writer.cc b/src/rocksdb/file/writable_file_writer.cc
new file mode 100644
index 000000000..3afc51c56
--- /dev/null
+++ b/src/rocksdb/file/writable_file_writer.cc
@@ -0,0 +1,1025 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/writable_file_writer.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "db/version_edit.h"
+#include "monitoring/histogram.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/port.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+IOStatus WritableFileWriter::Create(const std::shared_ptr<FileSystem>& fs,
+                                    const std::string& fname,
+                                    const FileOptions& file_opts,
+                                    std::unique_ptr<WritableFileWriter>* writer,
+                                    IODebugContext* dbg) {
+  if (file_opts.use_direct_writes &&
+      0 == file_opts.writable_file_max_buffer_size) {
+    return IOStatus::InvalidArgument(
+        "Direct write requires writable_file_max_buffer_size > 0");
+  }
+  std::unique_ptr<FSWritableFile> file;
+  IOStatus io_s = fs->NewWritableFile(fname, file_opts, &file, dbg);
+  if (io_s.ok()) {
+    writer->reset(new WritableFileWriter(std::move(file), fname, file_opts));
+  }
+  return io_s;
+}
+
+IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum,
+                                    Env::IOPriority op_rate_limiter_priority) {
+  if (seen_error()) {
+    return AssertFalseAndGetStatusForPrevError();
+  }
+
+  const char* src = data.data();
+  size_t left = data.size();
+  IOStatus s;
+  pending_sync_ = true;
+
+  TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Append:0", REDUCE_ODDS2);
+
+  // Calculate the checksum of appended data
+  UpdateFileChecksum(data);
+
+  {
+    IOOptions io_options;
+    io_options.rate_limiter_priority =
+        WritableFileWriter::DecideRateLimiterPriority(
+            writable_file_->GetIOPriority(), op_rate_limiter_priority);
+    IOSTATS_TIMER_GUARD(prepare_write_nanos);
+    TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
+    writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left,
+                                 io_options, nullptr);
+  }
+
+  // See whether we need to enlarge the buffer to avoid the flush
+  if (buf_.Capacity() - buf_.CurrentSize() < left) {
+    for (size_t cap = buf_.Capacity();
+         cap < max_buffer_size_;  // There is still room to increase
+         cap *= 2) {
+      // See whether the next available size is large enough.
+      // Buffer will never be increased to more than max_buffer_size_.
+      size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
+      if (desired_capacity - buf_.CurrentSize() >= left ||
+          (use_direct_io() && desired_capacity == max_buffer_size_)) {
+        buf_.AllocateNewBuffer(desired_capacity, true);
+        break;
+      }
+    }
+  }
+
+  // Flush only when buffered I/O
+  if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
+    if (buf_.CurrentSize() > 0) {
+      s = Flush(op_rate_limiter_priority);
+      if (!s.ok()) {
+        set_seen_error();
+        return s;
+      }
+    }
+    assert(buf_.CurrentSize() == 0);
+  }
+
+  if (perform_data_verification_ && buffered_data_with_checksum_ &&
+      crc32c_checksum != 0) {
+    // Since we want to use the checksum of the input data, we cannot break it
+    // into several pieces. We will only write them in the buffer when buffer
+    // size is enough. Otherwise, we will directly write it down.
+    if (use_direct_io() || (buf_.Capacity() - buf_.CurrentSize()) >= left) {
+      if ((buf_.Capacity() - buf_.CurrentSize()) >= left) {
+        size_t appended = buf_.Append(src, left);
+        if (appended != left) {
+          s = IOStatus::Corruption("Write buffer append failure");
+        }
+        buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine(
+            buffered_data_crc32c_checksum_, crc32c_checksum, appended);
+      } else {
+        while (left > 0) {
+          size_t appended = buf_.Append(src, left);
+          buffered_data_crc32c_checksum_ =
+              crc32c::Extend(buffered_data_crc32c_checksum_, src, appended);
+          left -= appended;
+          src += appended;
+
+          if (left > 0) {
+            s = Flush(op_rate_limiter_priority);
+            if (!s.ok()) {
+              break;
+            }
+          }
+        }
+      }
+    } else {
+      assert(buf_.CurrentSize() == 0);
+      buffered_data_crc32c_checksum_ = crc32c_checksum;
+      s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority);
+    }
+  } else {
+    // In this case, either we do not need to do the data verification or
+    // caller does not provide the checksum of the data (crc32c_checksum = 0).
+    //
+    // We never write directly to disk with direct I/O on.
+    // or we simply use it for its original purpose to accumulate many small
+    // chunks
+    if (use_direct_io() || (buf_.Capacity() >= left)) {
+      while (left > 0) {
+        size_t appended = buf_.Append(src, left);
+        if (perform_data_verification_ && buffered_data_with_checksum_) {
+          buffered_data_crc32c_checksum_ =
+              crc32c::Extend(buffered_data_crc32c_checksum_, src, appended);
+        }
+        left -= appended;
+        src += appended;
+
+        if (left > 0) {
+          s = Flush(op_rate_limiter_priority);
+          if (!s.ok()) {
+            break;
+          }
+        }
+      }
+    } else {
+      // Writing directly to file bypassing the buffer
+      assert(buf_.CurrentSize() == 0);
+      if (perform_data_verification_ && buffered_data_with_checksum_) {
+        buffered_data_crc32c_checksum_ = crc32c::Value(src, left);
+        s = WriteBufferedWithChecksum(src, left, op_rate_limiter_priority);
+      } else {
+        s = WriteBuffered(src, left, op_rate_limiter_priority);
+      }
+    }
+  }
+
+  TEST_KILL_RANDOM("WritableFileWriter::Append:1");
+  if (s.ok()) {
+    uint64_t cur_size = filesize_.load(std::memory_order_acquire);
+    filesize_.store(cur_size + data.size(), std::memory_order_release);
+  } else {
+    set_seen_error();
+  }
+  return s;
+}
+
+IOStatus WritableFileWriter::Pad(const size_t pad_bytes,
+                                 Env::IOPriority op_rate_limiter_priority) {
+  if (seen_error()) {
+    return AssertFalseAndGetStatusForPrevError();
+  }
+  assert(pad_bytes < kDefaultPageSize);
+  size_t left = pad_bytes;
+  size_t cap = buf_.Capacity() - buf_.CurrentSize();
+  size_t pad_start = buf_.CurrentSize();
+
+  // Assume pad_bytes is small compared to buf_ capacity. So we always
+  // use buf_ rather than write directly to file in certain cases like
+  // Append() does.
+  while (left) {
+    size_t append_bytes = std::min(cap, left);
+    buf_.PadWith(append_bytes, 0);
+    left -= append_bytes;
+    if (left > 0) {
+      IOStatus s = Flush(op_rate_limiter_priority);
+      if (!s.ok()) {
+        set_seen_error();
+        return s;
+      }
+    }
+    cap = buf_.Capacity() - buf_.CurrentSize();
+  }
+  pending_sync_ = true;
+  uint64_t cur_size = filesize_.load(std::memory_order_acquire);
+  filesize_.store(cur_size + pad_bytes, std::memory_order_release);
+  if (perform_data_verification_) {
+    buffered_data_crc32c_checksum_ =
+        crc32c::Extend(buffered_data_crc32c_checksum_,
+                       buf_.BufferStart() + pad_start, pad_bytes);
+  }
+  return IOStatus::OK();
+}
+
+IOStatus WritableFileWriter::Close() {
+  if (seen_error()) {
+    IOStatus interim;
+    if (writable_file_.get() != nullptr) {
+      interim = writable_file_->Close(IOOptions(), nullptr);
+      writable_file_.reset();
+    }
+    if (interim.ok()) {
+      return IOStatus::IOError(
+          "File is closed but data not flushed as writer has previous error.");
+    } else {
+      return interim;
+    }
+  }
+
+  // Do not quit immediately on failure the file MUST be closed
+
+  // Possible to close it twice now as we MUST close
+  // in __dtor, simply flushing is not enough
+  // Windows when pre-allocating does not fill with zeros
+  // also with unbuffered access we also set the end of data.
+  if (writable_file_.get() == nullptr) {
+    return IOStatus::OK();
+  }
+
+  IOStatus s;
+  s = Flush();  // flush cache to OS
+
+  IOStatus interim;
+  IOOptions io_options;
+  io_options.rate_limiter_priority = writable_file_->GetIOPriority();
+  // In direct I/O mode we write whole pages so
+  // we need to let the file know where data ends.
+  if (use_direct_io()) {
+    {
+#ifndef ROCKSDB_LITE
+      FileOperationInfo::StartTimePoint start_ts;
+      if (ShouldNotifyListeners()) {
+        start_ts = FileOperationInfo::StartNow();
+      }
+#endif
+      uint64_t filesz = filesize_.load(std::memory_order_acquire);
+      interim = writable_file_->Truncate(filesz, io_options, nullptr);
+#ifndef ROCKSDB_LITE
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = FileOperationInfo::FinishNow();
+        NotifyOnFileTruncateFinish(start_ts, finish_ts, s);
+        if (!interim.ok()) {
+          NotifyOnIOError(interim, FileOperationType::kTruncate, file_name(),
+                          filesz);
+        }
+      }
+#endif
+    }
+    if (interim.ok()) {
+      {
+#ifndef ROCKSDB_LITE
+        FileOperationInfo::StartTimePoint start_ts;
+        if (ShouldNotifyListeners()) {
+          start_ts = FileOperationInfo::StartNow();
+        }
+#endif
+        interim = writable_file_->Fsync(io_options, nullptr);
+#ifndef ROCKSDB_LITE
+        if (ShouldNotifyListeners()) {
+          auto finish_ts = FileOperationInfo::FinishNow();
+          NotifyOnFileSyncFinish(start_ts, finish_ts, s,
+                                 FileOperationType::kFsync);
+          if (!interim.ok()) {
+            NotifyOnIOError(interim, FileOperationType::kFsync, file_name());
+          }
+        }
+#endif
+      }
+    }
+    if (!interim.ok() && s.ok()) {
+      s = interim;
+    }
+  }
+
+  TEST_KILL_RANDOM("WritableFileWriter::Close:0");
+  {
+#ifndef ROCKSDB_LITE
+    FileOperationInfo::StartTimePoint start_ts;
+    if (ShouldNotifyListeners()) {
+      start_ts = FileOperationInfo::StartNow();
+    }
+#endif
+    interim = writable_file_->Close(io_options, nullptr);
+#ifndef ROCKSDB_LITE
+    if (ShouldNotifyListeners()) {
+      auto finish_ts = FileOperationInfo::FinishNow();
+      NotifyOnFileCloseFinish(start_ts, finish_ts, s);
+      if (!interim.ok()) {
+        NotifyOnIOError(interim, FileOperationType::kClose, file_name());
+      }
+    }
+#endif
+  }
+  if (!interim.ok() && s.ok()) {
+    s = interim;
+  }
+
+  writable_file_.reset();
+  TEST_KILL_RANDOM("WritableFileWriter::Close:1");
+
+  if (s.ok()) {
+    if (checksum_generator_ != nullptr && !checksum_finalized_) {
+      checksum_generator_->Finalize();
+      checksum_finalized_ = true;
+    }
+  } else {
+    set_seen_error();
+  }
+
+  return s;
+}
+
+// write out the cached data to the OS cache or storage if direct I/O
+// enabled
+IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) {
+  if (seen_error()) {
+    return AssertFalseAndGetStatusForPrevError();
+  }
+
+  IOStatus s;
+  TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2);
+
+  if (buf_.CurrentSize() > 0) {
+    if (use_direct_io()) {
+#ifndef ROCKSDB_LITE
+      if (pending_sync_) {
+        if (perform_data_verification_ && buffered_data_with_checksum_) {
+          s = WriteDirectWithChecksum(op_rate_limiter_priority);
+        } else {
+          s = WriteDirect(op_rate_limiter_priority);
+        }
+      }
+#endif  // !ROCKSDB_LITE
+    } else {
+      if (perform_data_verification_ && buffered_data_with_checksum_) {
+        s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize(),
+                                      op_rate_limiter_priority);
+      } else {
+        s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize(),
+                          op_rate_limiter_priority);
+      }
+    }
+    if (!s.ok()) {
+      set_seen_error();
+      return s;
+    }
+  }
+
+  {
+#ifndef ROCKSDB_LITE
+    FileOperationInfo::StartTimePoint start_ts;
+    if (ShouldNotifyListeners()) {
+      start_ts = FileOperationInfo::StartNow();
+    }
+#endif
+    IOOptions io_options;
+    io_options.rate_limiter_priority =
+        WritableFileWriter::DecideRateLimiterPriority(
+            writable_file_->GetIOPriority(), op_rate_limiter_priority);
+    s = writable_file_->Flush(io_options, nullptr);
+#ifndef ROCKSDB_LITE
+    if (ShouldNotifyListeners()) {
+      auto finish_ts = std::chrono::steady_clock::now();
+      NotifyOnFileFlushFinish(start_ts, finish_ts, s);
+      if (!s.ok()) {
+        NotifyOnIOError(s, FileOperationType::kFlush, file_name());
+      }
+    }
+#endif
+  }
+
+  if (!s.ok()) {
+    set_seen_error();
+    return s;
+  }
+
+  // sync OS cache to disk for every bytes_per_sync_
+  // TODO: give log file and sst file different options (log
+  // files could be potentially cached in OS for their whole
+  // life time, thus we might not want to flush at all).
+
+  // We try to avoid sync to the last 1MB of data. For two reasons:
+  // (1) avoid rewrite the same page that is modified later.
+  // (2) for older version of OS, write can block while writing out
+  //     the page.
+  // Xfs does neighbor page flushing outside of the specified ranges. We
+  // need to make sure sync range is far from the write offset.
+  if (!use_direct_io() && bytes_per_sync_) {
+    const uint64_t kBytesNotSyncRange =
+        1024 * 1024;                                // recent 1MB is not synced.
+    const uint64_t kBytesAlignWhenSync = 4 * 1024;  // Align 4KB.
+    uint64_t cur_size = filesize_.load(std::memory_order_acquire);
+    if (cur_size > kBytesNotSyncRange) {
+      uint64_t offset_sync_to = cur_size - kBytesNotSyncRange;
+      offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
+      assert(offset_sync_to >= last_sync_size_);
+      if (offset_sync_to > 0 &&
+          offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
+        s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
+        if (!s.ok()) {
+          set_seen_error();
+        }
+        last_sync_size_ = offset_sync_to;
+      }
+    }
+  }
+
+  return s;
+}
+
+std::string WritableFileWriter::GetFileChecksum() {
+  if (checksum_generator_ != nullptr) {
+    assert(checksum_finalized_);
+    return checksum_generator_->GetChecksum();
+  } else {
+    return kUnknownFileChecksum;
+  }
+}
+
+const char* WritableFileWriter::GetFileChecksumFuncName() const {
+  if (checksum_generator_ != nullptr) {
+    return checksum_generator_->Name();
+  } else {
+    return kUnknownFileChecksumFuncName;
+  }
+}
+
+IOStatus WritableFileWriter::Sync(bool use_fsync) {
+  if (seen_error()) {
+    return AssertFalseAndGetStatusForPrevError();
+  }
+
+  IOStatus s = Flush();
+  if (!s.ok()) {
+    set_seen_error();
+    return s;
+  }
+  TEST_KILL_RANDOM("WritableFileWriter::Sync:0");
+  if (!use_direct_io() && pending_sync_) {
+    s = SyncInternal(use_fsync);
+    if (!s.ok()) {
+      set_seen_error();
+      return s;
+    }
+  }
+  TEST_KILL_RANDOM("WritableFileWriter::Sync:1");
+  pending_sync_ = false;
+  return IOStatus::OK();
+}
+
+IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) {
+  if (seen_error()) {
+    return AssertFalseAndGetStatusForPrevError();
+  }
+  if (!writable_file_->IsSyncThreadSafe()) {
+    return IOStatus::NotSupported(
+        "Can't WritableFileWriter::SyncWithoutFlush() because "
+        "WritableFile::IsSyncThreadSafe() is false");
+  }
+  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
+  IOStatus s = SyncInternal(use_fsync);
+  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
+  if (!s.ok()) {
+#ifndef NDEBUG
+    sync_without_flush_called_ = true;
+#endif  // NDEBUG
+    set_seen_error();
+  }
+  return s;
+}
+
+IOStatus WritableFileWriter::SyncInternal(bool use_fsync) {
+  // Caller is supposed to check seen_error_
+  IOStatus s;
+  IOSTATS_TIMER_GUARD(fsync_nanos);
+  TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
+  auto prev_perf_level = GetPerfLevel();
+
+  IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
+
+#ifndef ROCKSDB_LITE
+  FileOperationInfo::StartTimePoint start_ts;
+  if (ShouldNotifyListeners()) {
+    start_ts = FileOperationInfo::StartNow();
+  }
+#endif
+
+  IOOptions io_options;
+  io_options.rate_limiter_priority = writable_file_->GetIOPriority();
+  if (use_fsync) {
+    s = writable_file_->Fsync(io_options, nullptr);
+  } else {
+    s = writable_file_->Sync(io_options, nullptr);
+  }
+#ifndef ROCKSDB_LITE
+  if (ShouldNotifyListeners()) {
+    auto finish_ts = std::chrono::steady_clock::now();
+    NotifyOnFileSyncFinish(
+        start_ts, finish_ts, s,
+        use_fsync ? FileOperationType::kFsync : FileOperationType::kSync);
+    if (!s.ok()) {
+      NotifyOnIOError(
+          s, (use_fsync ? FileOperationType::kFsync : FileOperationType::kSync),
+          file_name());
+    }
+  }
+#endif
+  SetPerfLevel(prev_perf_level);
+
+  // The caller will be responsible to call set_seen_error() if s is not OK.
+  return s;
+}
+
+IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) {
+  if (seen_error()) {
+    return AssertFalseAndGetStatusForPrevError();
+  }
+
+  IOSTATS_TIMER_GUARD(range_sync_nanos);
+  TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
+#ifndef ROCKSDB_LITE
+  FileOperationInfo::StartTimePoint start_ts;
+  if (ShouldNotifyListeners()) {
+    start_ts = FileOperationInfo::StartNow();
+  }
+#endif
+  IOOptions io_options;
+  io_options.rate_limiter_priority = writable_file_->GetIOPriority();
+  IOStatus s = writable_file_->RangeSync(offset, nbytes, io_options, nullptr);
+  if (!s.ok()) {
+    set_seen_error();
+  }
+#ifndef ROCKSDB_LITE
+  if (ShouldNotifyListeners()) {
+    auto finish_ts = std::chrono::steady_clock::now();
+    NotifyOnFileRangeSyncFinish(offset, nbytes, start_ts, finish_ts, s);
+    if (!s.ok()) {
+      NotifyOnIOError(s, FileOperationType::kRangeSync, file_name(), nbytes,
+                      offset);
+    }
+  }
+#endif
+  return s;
+}
+
+// This method writes to disk the specified data and makes use of the rate
+// limiter if available
+IOStatus WritableFileWriter::WriteBuffered(
+    const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) {
+  if (seen_error()) {
+    return AssertFalseAndGetStatusForPrevError();
+  }
+
+  IOStatus s;
+  assert(!use_direct_io());
+  const char* src = data;
+  size_t left = size;
+  DataVerificationInfo v_info;
+  char checksum_buf[sizeof(uint32_t)];
+  Env::IOPriority rate_limiter_priority_used =
+      WritableFileWriter::DecideRateLimiterPriority(
+          writable_file_->GetIOPriority(), op_rate_limiter_priority);
+  IOOptions io_options;
+  io_options.rate_limiter_priority = rate_limiter_priority_used;
+
+  while (left > 0) {
+    size_t allowed = left;
+    if (rate_limiter_ != nullptr &&
+        rate_limiter_priority_used != Env::IO_TOTAL) {
+      allowed = rate_limiter_->RequestToken(left, 0 /* alignment */,
+                                            rate_limiter_priority_used, stats_,
+                                            RateLimiter::OpType::kWrite);
+    }
+
+    {
+      IOSTATS_TIMER_GUARD(write_nanos);
+      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+
+#ifndef ROCKSDB_LITE
+      FileOperationInfo::StartTimePoint start_ts;
+      uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr);
+      if (ShouldNotifyListeners()) {
+        start_ts = FileOperationInfo::StartNow();
+        old_size = next_write_offset_;
+      }
+#endif
+      {
+        auto prev_perf_level = GetPerfLevel();
+
+        IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
+        if (perform_data_verification_) {
+          Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf);
+          v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
+          s = writable_file_->Append(Slice(src, allowed), io_options, v_info,
+                                     nullptr);
+        } else {
+          s = writable_file_->Append(Slice(src, allowed), io_options, nullptr);
+        }
+        if (!s.ok()) {
+          // If writable_file_->Append() failed, then the data may or may not
+          // exist in the underlying memory buffer, OS page cache, remote file
+          // system's buffer, etc. If WritableFileWriter keeps the data in
+          // buf_, then a future Close() or write retry may send the data to
+          // the underlying file again. If the data does exist in the
+          // underlying buffer and gets written to the file eventually despite
+          // returning error, the file may end up with two duplicate pieces of
+          // data. Therefore, clear the buf_ at the WritableFileWriter layer
+          // and let caller determine error handling.
+          buf_.Size(0);
+          buffered_data_crc32c_checksum_ = 0;
+        }
+        SetPerfLevel(prev_perf_level);
+      }
+#ifndef ROCKSDB_LITE
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = std::chrono::steady_clock::now();
+        NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s);
+        if (!s.ok()) {
+          NotifyOnIOError(s, FileOperationType::kAppend, file_name(), allowed,
+                          old_size);
+        }
+      }
+#endif
+      if (!s.ok()) {
+        set_seen_error();
+        return s;
+      }
+    }
+
+    IOSTATS_ADD(bytes_written, allowed);
+    TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0");
+
+    left -= allowed;
+    src += allowed;
+    uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+    flushed_size_.store(cur_size + allowed, std::memory_order_release);
+  }
+  buf_.Size(0);
+  buffered_data_crc32c_checksum_ = 0;
+  if (!s.ok()) {
+    set_seen_error();
+  }
+  return s;
+}
+
+IOStatus WritableFileWriter::WriteBufferedWithChecksum(
+    const char* data, size_t size, Env::IOPriority op_rate_limiter_priority) {
+  if (seen_error()) {
+    return AssertFalseAndGetStatusForPrevError();
+  }
+
+  IOStatus s;
+  assert(!use_direct_io());
+  assert(perform_data_verification_ && buffered_data_with_checksum_);
+  const char* src = data;
+  size_t left = size;
+  DataVerificationInfo v_info;
+  char checksum_buf[sizeof(uint32_t)];
+  Env::IOPriority rate_limiter_priority_used =
+      WritableFileWriter::DecideRateLimiterPriority(
+          writable_file_->GetIOPriority(), op_rate_limiter_priority);
+  IOOptions io_options;
+  io_options.rate_limiter_priority = rate_limiter_priority_used;
+  // Check how much is allowed. Here, we loop until the rate limiter allows to
+  // write the entire buffer.
+  // TODO: need to be improved since it sort of defeats the purpose of the rate
+  // limiter
+  size_t data_size = left;
+  if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
+    while (data_size > 0) {
+      size_t tmp_size;
+      tmp_size = rate_limiter_->RequestToken(data_size, buf_.Alignment(),
+                                             rate_limiter_priority_used, stats_,
+                                             RateLimiter::OpType::kWrite);
+      data_size -= tmp_size;
+    }
+  }
+
+  {
+    IOSTATS_TIMER_GUARD(write_nanos);
+    TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+
+#ifndef ROCKSDB_LITE
+    FileOperationInfo::StartTimePoint start_ts;
+    uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr);
+    if (ShouldNotifyListeners()) {
+      start_ts = FileOperationInfo::StartNow();
+      old_size = next_write_offset_;
+    }
+#endif
+    {
+      auto prev_perf_level = GetPerfLevel();
+
+      IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
+
+      EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_);
+      v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
+      s = writable_file_->Append(Slice(src, left), io_options, v_info, nullptr);
+      SetPerfLevel(prev_perf_level);
+    }
+#ifndef ROCKSDB_LITE
+    if (ShouldNotifyListeners()) {
+      auto finish_ts = std::chrono::steady_clock::now();
+      NotifyOnFileWriteFinish(old_size, left, start_ts, finish_ts, s);
+      if (!s.ok()) {
+        NotifyOnIOError(s, FileOperationType::kAppend, file_name(), left,
+                        old_size);
+      }
+    }
+#endif
+    if (!s.ok()) {
+      // If writable_file_->Append() failed, then the data may or may not
+      // exist in the underlying memory buffer, OS page cache, remote file
+      // system's buffer, etc. If WritableFileWriter keeps the data in
+      // buf_, then a future Close() or write retry may send the data to
+      // the underlying file again. If the data does exist in the
+      // underlying buffer and gets written to the file eventually despite
+      // returning error, the file may end up with two duplicate pieces of
+      // data. Therefore, clear the buf_ at the WritableFileWriter layer
+      // and let caller determine error handling.
+      buf_.Size(0);
+      buffered_data_crc32c_checksum_ = 0;
+      set_seen_error();
+      return s;
+    }
+  }
+
+  IOSTATS_ADD(bytes_written, left);
+  TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0");
+
+  // Buffer write is successful, reset the buffer current size to 0 and reset
+  // the corresponding checksum value
+  buf_.Size(0);
+  buffered_data_crc32c_checksum_ = 0;
+  uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+  flushed_size_.store(cur_size + left, std::memory_order_release);
+  if (!s.ok()) {
+    set_seen_error();
+  }
+  return s;
+}
+
+void WritableFileWriter::UpdateFileChecksum(const Slice& data) {
+  if (checksum_generator_ != nullptr) {
+    checksum_generator_->Update(data.data(), data.size());
+  }
+}
+
+// Currently, crc32c checksum is used to calculate the checksum value of the
+// content in the input buffer for handoff. In the future, the checksum might be
+// calculated from the existing crc32c checksums of the in WAl and Manifest
+// records, or even SST file blocks.
+// TODO: effectively use the existing checksum of the data being writing to
+// generate the crc32c checksum instead of a raw calculation.
+void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data,
+                                                          size_t size,
+                                                          char* buf) {
+  uint32_t v_crc32c = crc32c::Extend(0, data, size);
+  EncodeFixed32(buf, v_crc32c);
+}
+
+// This flushes the accumulated data in the buffer. We pad data with zeros if
+// necessary to the whole page.
+// However, during automatic flushes padding would not be necessary.
+// We always use RateLimiter if available. We move (Refit) any buffer bytes
+// that are left over the
+// whole number of pages to be written again on the next flush because we can
+// only write on aligned
+// offsets.
+#ifndef ROCKSDB_LITE
+IOStatus WritableFileWriter::WriteDirect(
+    Env::IOPriority op_rate_limiter_priority) {
+  if (seen_error()) {
+    assert(false);
+
+    return IOStatus::IOError("Writer has previous error.");
+  }
+
+  assert(use_direct_io());
+  IOStatus s;
+  const size_t alignment = buf_.Alignment();
+  assert((next_write_offset_ % alignment) == 0);
+
+  // Calculate whole page final file advance if all writes succeed
+  const size_t file_advance =
+      TruncateToPageBoundary(alignment, buf_.CurrentSize());
+
+  // Calculate the leftover tail, we write it here padded with zeros BUT we
+  // will write it again in the future either on Close() OR when the current
+  // whole page fills out.
+  const size_t leftover_tail = buf_.CurrentSize() - file_advance;
+
+  // Round up and pad
+  buf_.PadToAlignmentWith(0);
+
+  const char* src = buf_.BufferStart();
+  uint64_t write_offset = next_write_offset_;
+  size_t left = buf_.CurrentSize();
+  DataVerificationInfo v_info;
+  char checksum_buf[sizeof(uint32_t)];
+  Env::IOPriority rate_limiter_priority_used =
+      WritableFileWriter::DecideRateLimiterPriority(
+          writable_file_->GetIOPriority(), op_rate_limiter_priority);
+  IOOptions io_options;
+  io_options.rate_limiter_priority = rate_limiter_priority_used;
+
+  while (left > 0) {
+    // Check how much is allowed
+    size_t size = left;
+    if (rate_limiter_ != nullptr &&
+        rate_limiter_priority_used != Env::IO_TOTAL) {
+      size = rate_limiter_->RequestToken(left, buf_.Alignment(),
+                                         rate_limiter_priority_used, stats_,
+                                         RateLimiter::OpType::kWrite);
+    }
+
+    {
+      IOSTATS_TIMER_GUARD(write_nanos);
+      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+      FileOperationInfo::StartTimePoint start_ts;
+      if (ShouldNotifyListeners()) {
+        start_ts = FileOperationInfo::StartNow();
+      }
+      // direct writes must be positional
+      if (perform_data_verification_) {
+        Crc32cHandoffChecksumCalculation(src, size, checksum_buf);
+        v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
+        s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
+                                             io_options, v_info, nullptr);
+      } else {
+        s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
+                                             io_options, nullptr);
+      }
+
+      if (ShouldNotifyListeners()) {
+        auto finish_ts = std::chrono::steady_clock::now();
+        NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s);
+        if (!s.ok()) {
+          NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(),
+                          size, write_offset);
+        }
+      }
+      if (!s.ok()) {
+        buf_.Size(file_advance + leftover_tail);
+        set_seen_error();
+        return s;
+      }
+    }
+
+    IOSTATS_ADD(bytes_written, size);
+    left -= size;
+    src += size;
+    write_offset += size;
+    uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+    flushed_size_.store(cur_size + size, std::memory_order_release);
+    assert((next_write_offset_ % alignment) == 0);
+  }
+
+  if (s.ok()) {
+    // Move the tail to the beginning of the buffer
+    // This never happens during normal Append but rather during
+    // explicit call to Flush()/Sync() or Close()
+    buf_.RefitTail(file_advance, leftover_tail);
+    // This is where we start writing next time which may or not be
+    // the actual file size on disk. They match if the buffer size
+    // is a multiple of whole pages otherwise filesize_ is leftover_tail
+    // behind
+    next_write_offset_ += file_advance;
+  } else {
+    set_seen_error();
+  }
+  return s;
+}
+
+IOStatus WritableFileWriter::WriteDirectWithChecksum(
+    Env::IOPriority op_rate_limiter_priority) {
+  if (seen_error()) {
+    return AssertFalseAndGetStatusForPrevError();
+  }
+
+  assert(use_direct_io());
+  assert(perform_data_verification_ && buffered_data_with_checksum_);
+  IOStatus s;
+  const size_t alignment = buf_.Alignment();
+  assert((next_write_offset_ % alignment) == 0);
+
+  // Calculate whole page final file advance if all writes succeed
+  const size_t file_advance =
+      TruncateToPageBoundary(alignment, buf_.CurrentSize());
+
+  // Calculate the leftover tail, we write it here padded with zeros BUT we
+  // will write it again in the future either on Close() OR when the current
+  // whole page fills out.
+  const size_t leftover_tail = buf_.CurrentSize() - file_advance;
+
+  // Round up, pad, and combine the checksum.
+  size_t last_cur_size = buf_.CurrentSize();
+  buf_.PadToAlignmentWith(0);
+  size_t padded_size = buf_.CurrentSize() - last_cur_size;
+  const char* padded_start = buf_.BufferStart() + last_cur_size;
+  uint32_t padded_checksum = crc32c::Value(padded_start, padded_size);
+  buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine(
+      buffered_data_crc32c_checksum_, padded_checksum, padded_size);
+
+  const char* src = buf_.BufferStart();
+  uint64_t write_offset = next_write_offset_;
+  size_t left = buf_.CurrentSize();
+  DataVerificationInfo v_info;
+  char checksum_buf[sizeof(uint32_t)];
+
+  Env::IOPriority rate_limiter_priority_used =
+      WritableFileWriter::DecideRateLimiterPriority(
+          writable_file_->GetIOPriority(), op_rate_limiter_priority);
+  IOOptions io_options;
+  io_options.rate_limiter_priority = rate_limiter_priority_used;
+  // Check how much is allowed. Here, we loop until the rate limiter allows to
+  // write the entire buffer.
+  // TODO: need to be improved since it sort of defeats the purpose of the rate
+  // limiter
+  size_t data_size = left;
+  if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
+    while (data_size > 0) {
+      size_t size;
+      size = rate_limiter_->RequestToken(data_size, buf_.Alignment(),
+                                         rate_limiter_priority_used, stats_,
+                                         RateLimiter::OpType::kWrite);
+      data_size -= size;
+    }
+  }
+
+  {
+    IOSTATS_TIMER_GUARD(write_nanos);
+    TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+    FileOperationInfo::StartTimePoint start_ts;
+    if (ShouldNotifyListeners()) {
+      start_ts = FileOperationInfo::StartNow();
+    }
+    // direct writes must be positional
+    EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_);
+    v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
+    s = writable_file_->PositionedAppend(Slice(src, left), write_offset,
+                                         io_options, v_info, nullptr);
+
+    if (ShouldNotifyListeners()) {
+      auto finish_ts = std::chrono::steady_clock::now();
+      NotifyOnFileWriteFinish(write_offset, left, start_ts, finish_ts, s);
+      if (!s.ok()) {
+        NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(),
+                        left, write_offset);
+      }
+    }
+    if (!s.ok()) {
+      // In this case, we do not change buffered_data_crc32c_checksum_ because
+      // it still aligns with the data in the buffer.
+      buf_.Size(file_advance + leftover_tail);
+      buffered_data_crc32c_checksum_ =
+          crc32c::Value(buf_.BufferStart(), buf_.CurrentSize());
+      set_seen_error();
+      return s;
+    }
+  }
+
+  IOSTATS_ADD(bytes_written, left);
+  assert((next_write_offset_ % alignment) == 0);
+  uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
+  flushed_size_.store(cur_size + left, std::memory_order_release);
+
+  if (s.ok()) {
+    // Move the tail to the beginning of the buffer
+    // This never happens during normal Append but rather during
+    // explicit call to Flush()/Sync() or Close(). Also the buffer checksum will
+    // recalculated accordingly.
+    buf_.RefitTail(file_advance, leftover_tail);
+    // Adjust the checksum value to align with the data in the buffer
+    buffered_data_crc32c_checksum_ =
+        crc32c::Value(buf_.BufferStart(), buf_.CurrentSize());
+    // This is where we start writing next time which may or not be
+    // the actual file size on disk. They match if the buffer size
+    // is a multiple of whole pages otherwise filesize_ is leftover_tail
+    // behind
+    next_write_offset_ += file_advance;
+  } else {
+    set_seen_error();
+  }
+  return s;
+}
+#endif  // !ROCKSDB_LITE
+Env::IOPriority WritableFileWriter::DecideRateLimiterPriority(
+    Env::IOPriority writable_file_io_priority,
+    Env::IOPriority op_rate_limiter_priority) {
+  if (writable_file_io_priority == Env::IO_TOTAL &&
+      op_rate_limiter_priority == Env::IO_TOTAL) {
+    return Env::IO_TOTAL;
+  } else if (writable_file_io_priority == Env::IO_TOTAL) {
+    return op_rate_limiter_priority;
+  } else if (op_rate_limiter_priority == Env::IO_TOTAL) {
+    return writable_file_io_priority;
+  } else {
+    return op_rate_limiter_priority;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/file/writable_file_writer.h b/src/rocksdb/file/writable_file_writer.h
new file mode 100644
index 000000000..b3985eb20
--- /dev/null
+++ b/src/rocksdb/file/writable_file_writer.h
@@ -0,0 +1,336 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <string>
+
+#include "db/version_edit.h"
+#include "env/file_system_tracer.h"
+#include "port/port.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/rate_limiter.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class SystemClock;
+
+// WritableFileWriter is a wrapper on top of Env::WritableFile. It provides
+// facilities to:
+// - Handle Buffered and Direct writes.
+// - Rate limit writes.
+// - Flush and Sync the data to the underlying filesystem.
+// - Notify any interested listeners on the completion of a write.
+// - Update IO stats.
+class WritableFileWriter {
+ private:
+#ifndef ROCKSDB_LITE
+  void NotifyOnFileWriteFinish(
+      uint64_t offset, size_t length,
+      const FileOperationInfo::StartTimePoint& start_ts,
+      const FileOperationInfo::FinishTimePoint& finish_ts,
+      const IOStatus& io_status) {
+    FileOperationInfo info(FileOperationType::kWrite, file_name_, start_ts,
+                           finish_ts, io_status, temperature_);
+    info.offset = offset;
+    info.length = length;
+
+    for (auto& listener : listeners_) {
+      listener->OnFileWriteFinish(info);
+    }
+    info.status.PermitUncheckedError();
+  }
+  void NotifyOnFileFlushFinish(
+      const FileOperationInfo::StartTimePoint& start_ts,
+      const FileOperationInfo::FinishTimePoint& finish_ts,
+      const IOStatus& io_status) {
+    FileOperationInfo info(FileOperationType::kFlush, file_name_, start_ts,
+                           finish_ts, io_status, temperature_);
+
+    for (auto& listener : listeners_) {
+      listener->OnFileFlushFinish(info);
+    }
+    info.status.PermitUncheckedError();
+  }
+  void NotifyOnFileSyncFinish(
+      const FileOperationInfo::StartTimePoint& start_ts,
+      const FileOperationInfo::FinishTimePoint& finish_ts,
+      const IOStatus& io_status,
+      FileOperationType type = FileOperationType::kSync) {
+    FileOperationInfo info(type, file_name_, start_ts, finish_ts, io_status,
+                           temperature_);
+
+    for (auto& listener : listeners_) {
+      listener->OnFileSyncFinish(info);
+    }
+    info.status.PermitUncheckedError();
+  }
+  void NotifyOnFileRangeSyncFinish(
+      uint64_t offset, size_t length,
+      const FileOperationInfo::StartTimePoint& start_ts,
+      const FileOperationInfo::FinishTimePoint& finish_ts,
+      const IOStatus& io_status) {
+    FileOperationInfo info(FileOperationType::kRangeSync, file_name_, start_ts,
+                           finish_ts, io_status, temperature_);
+    info.offset = offset;
+    info.length = length;
+
+    for (auto& listener : listeners_) {
+      listener->OnFileRangeSyncFinish(info);
+    }
+    info.status.PermitUncheckedError();
+  }
+  void NotifyOnFileTruncateFinish(
+      const FileOperationInfo::StartTimePoint& start_ts,
+      const FileOperationInfo::FinishTimePoint& finish_ts,
+      const IOStatus& io_status) {
+    FileOperationInfo info(FileOperationType::kTruncate, file_name_, start_ts,
+                           finish_ts, io_status, temperature_);
+
+    for (auto& listener : listeners_) {
+      listener->OnFileTruncateFinish(info);
+    }
+    info.status.PermitUncheckedError();
+  }
+  void NotifyOnFileCloseFinish(
+      const FileOperationInfo::StartTimePoint& start_ts,
+      const FileOperationInfo::FinishTimePoint& finish_ts,
+      const IOStatus& io_status) {
+    FileOperationInfo info(FileOperationType::kClose, file_name_, start_ts,
+                           finish_ts, io_status, temperature_);
+
+    for (auto& listener : listeners_) {
+      listener->OnFileCloseFinish(info);
+    }
+    info.status.PermitUncheckedError();
+  }
+
+  void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation,
+                       const std::string& file_path, size_t length = 0,
+                       uint64_t offset = 0) {
+    if (listeners_.empty()) {
+      return;
+    }
+    IOErrorInfo io_error_info(io_status, operation, file_path, length, offset);
+    for (auto& listener : listeners_) {
+      listener->OnIOError(io_error_info);
+    }
+    io_error_info.io_status.PermitUncheckedError();
+  }
+#endif  // ROCKSDB_LITE
+
+  bool ShouldNotifyListeners() const { return !listeners_.empty(); }
+  void UpdateFileChecksum(const Slice& data);
+  void Crc32cHandoffChecksumCalculation(const char* data, size_t size,
+                                        char* buf);
+
+  std::string file_name_;
+  FSWritableFilePtr writable_file_;
+  SystemClock* clock_;
+  AlignedBuffer buf_;
+  size_t max_buffer_size_;
+  // Actually written data size can be used for truncate
+  // not counting padding data
+  std::atomic<uint64_t> filesize_;
+  std::atomic<uint64_t> flushed_size_;
+#ifndef ROCKSDB_LITE
+  // This is necessary when we use unbuffered access
+  // and writes must happen on aligned offsets
+  // so we need to go back and write that page again
+  uint64_t next_write_offset_;
+#endif  // ROCKSDB_LITE
+  bool pending_sync_;
+  std::atomic<bool> seen_error_;
+#ifndef NDEBUG
+  // SyncWithoutFlush() is the function that is allowed to be called
+  // concurrently with other function. One of the concurrent call
+  // could set seen_error_, and the other one would hit assertion
+  // in debug mode.
+  std::atomic<bool> sync_without_flush_called_ = false;
+#endif  // NDEBUG
+  uint64_t last_sync_size_;
+  uint64_t bytes_per_sync_;
+  RateLimiter* rate_limiter_;
+  Statistics* stats_;
+  std::vector<std::shared_ptr<EventListener>> listeners_;
+  std::unique_ptr<FileChecksumGenerator> checksum_generator_;
+  bool checksum_finalized_;
+  bool perform_data_verification_;
+  uint32_t buffered_data_crc32c_checksum_;
+  bool buffered_data_with_checksum_;
+#ifndef ROCKSDB_LITE
+  Temperature temperature_;
+#endif  // ROCKSDB_LITE
+
+ public:
+  WritableFileWriter(
+      std::unique_ptr<FSWritableFile>&& file, const std::string& _file_name,
+      const FileOptions& options, SystemClock* clock = nullptr,
+      const std::shared_ptr<IOTracer>& io_tracer = nullptr,
+      Statistics* stats = nullptr,
+      const std::vector<std::shared_ptr<EventListener>>& listeners = {},
+      FileChecksumGenFactory* file_checksum_gen_factory = nullptr,
+      bool perform_data_verification = false,
+      bool buffered_data_with_checksum = false)
+      : file_name_(_file_name),
+        writable_file_(std::move(file), io_tracer, _file_name),
+        clock_(clock),
+        buf_(),
+        max_buffer_size_(options.writable_file_max_buffer_size),
+        filesize_(0),
+        flushed_size_(0),
+#ifndef ROCKSDB_LITE
+        next_write_offset_(0),
+#endif  // ROCKSDB_LITE
+        pending_sync_(false),
+        seen_error_(false),
+        last_sync_size_(0),
+        bytes_per_sync_(options.bytes_per_sync),
+        rate_limiter_(options.rate_limiter),
+        stats_(stats),
+        listeners_(),
+        checksum_generator_(nullptr),
+        checksum_finalized_(false),
+        perform_data_verification_(perform_data_verification),
+        buffered_data_crc32c_checksum_(0),
+        buffered_data_with_checksum_(buffered_data_with_checksum) {
+#ifndef ROCKSDB_LITE
+    temperature_ = options.temperature;
+#endif  // ROCKSDB_LITE
+    assert(!use_direct_io() || max_buffer_size_ > 0);
+    TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0",
+                             reinterpret_cast<void*>(max_buffer_size_));
+    buf_.Alignment(writable_file_->GetRequiredBufferAlignment());
+    buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_));
+#ifndef ROCKSDB_LITE
+    std::for_each(listeners.begin(), listeners.end(),
+                  [this](const std::shared_ptr<EventListener>& e) {
+                    if (e->ShouldBeNotifiedOnFileIO()) {
+                      listeners_.emplace_back(e);
+                    }
+                  });
+#else  // !ROCKSDB_LITE
+    (void)listeners;
+#endif
+    if (file_checksum_gen_factory != nullptr) {
+      FileChecksumGenContext checksum_gen_context;
+      checksum_gen_context.file_name = _file_name;
+      checksum_generator_ =
+          file_checksum_gen_factory->CreateFileChecksumGenerator(
+              checksum_gen_context);
+    }
+  }
+
+  static IOStatus Create(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& fname, const FileOptions& file_opts,
+                         std::unique_ptr<WritableFileWriter>* writer,
+                         IODebugContext* dbg);
+  WritableFileWriter(const WritableFileWriter&) = delete;
+
+  WritableFileWriter& operator=(const WritableFileWriter&) = delete;
+
+  ~WritableFileWriter() {
+    auto s = Close();
+    s.PermitUncheckedError();
+  }
+
+  std::string file_name() const { return file_name_; }
+
+  // When this Append API is called, if the crc32c_checksum is not provided, we
+  // will calculate the checksum internally.
+  IOStatus Append(const Slice& data, uint32_t crc32c_checksum = 0,
+                  Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL);
+
+  IOStatus Pad(const size_t pad_bytes,
+               Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL);
+
+  IOStatus Flush(Env::IOPriority op_rate_limiter_priority = Env::IO_TOTAL);
+
+  IOStatus Close();
+
+  IOStatus Sync(bool use_fsync);
+
+  // Sync only the data that was already Flush()ed. Safe to call concurrently
+  // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(),
+  // returns NotSupported status.
+  IOStatus SyncWithoutFlush(bool use_fsync);
+
+  uint64_t GetFileSize() const {
+    return filesize_.load(std::memory_order_acquire);
+  }
+
+  // Returns the size of data flushed to the underlying `FSWritableFile`.
+  // Expected to match `writable_file()->GetFileSize()`.
+  // The return value can serve as a lower-bound for the amount of data synced
+  // by a future call to `SyncWithoutFlush()`.
+  uint64_t GetFlushedSize() const {
+    return flushed_size_.load(std::memory_order_acquire);
+  }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) {
+    return writable_file_->InvalidateCache(offset, length);
+  }
+
+  FSWritableFile* writable_file() const { return writable_file_.get(); }
+
+  bool use_direct_io() { return writable_file_->use_direct_io(); }
+
+  bool BufferIsEmpty() { return buf_.CurrentSize() == 0; }
+
+  void TEST_SetFileChecksumGenerator(
+      FileChecksumGenerator* checksum_generator) {
+    checksum_generator_.reset(checksum_generator);
+  }
+
+  std::string GetFileChecksum();
+
+  const char* GetFileChecksumFuncName() const;
+
+  bool seen_error() const {
+    return seen_error_.load(std::memory_order_relaxed);
+  }
+  // For options of relaxed consistency, users might hope to continue
+  // operating on the file after an error happens.
+  void reset_seen_error() {
+    seen_error_.store(false, std::memory_order_relaxed);
+  }
+  void set_seen_error() { seen_error_.store(true, std::memory_order_relaxed); }
+
+  IOStatus AssertFalseAndGetStatusForPrevError() {
+    // This should only happen if SyncWithoutFlush() was called.
+    assert(sync_without_flush_called_);
+    return IOStatus::IOError("Writer has previous error.");
+  }
+
+ private:
+  // Decide the Rate Limiter priority.
+  static Env::IOPriority DecideRateLimiterPriority(
+      Env::IOPriority writable_file_io_priority,
+      Env::IOPriority op_rate_limiter_priority);
+
+  // Used when os buffering is OFF and we are writing
+  // DMA such as in Direct I/O mode
+#ifndef ROCKSDB_LITE
+  IOStatus WriteDirect(Env::IOPriority op_rate_limiter_priority);
+  IOStatus WriteDirectWithChecksum(Env::IOPriority op_rate_limiter_priority);
+#endif  // !ROCKSDB_LITE
+  // Normal write.
+  IOStatus WriteBuffered(const char* data, size_t size,
+                         Env::IOPriority op_rate_limiter_priority);
+  IOStatus WriteBufferedWithChecksum(const char* data, size_t size,
+                                     Env::IOPriority op_rate_limiter_priority);
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes);
+  IOStatus SyncInternal(bool use_fsync);
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/fuzz/.gitignore b/src/rocksdb/fuzz/.gitignore
new file mode 100644
index 000000000..9dab42105
--- /dev/null
+++ b/src/rocksdb/fuzz/.gitignore
@@ -0,0 +1,5 @@
+db_fuzzer
+db_map_fuzzer
+sst_file_writer_fuzzer
+
+proto/gen/*
diff --git a/src/rocksdb/fuzz/Makefile b/src/rocksdb/fuzz/Makefile
new file mode 100644
index 000000000..b83040504
--- /dev/null
+++ b/src/rocksdb/fuzz/Makefile
@@ -0,0 +1,67 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+ROOT_DIR = $(abspath $(shell pwd)/../)
+
+include $(ROOT_DIR)/make_config.mk
+
+PROTOBUF_CFLAGS = `pkg-config --cflags protobuf`
+PROTOBUF_LDFLAGS = `pkg-config --libs protobuf`
+
+PROTOBUF_MUTATOR_CFLAGS = `pkg-config --cflags libprotobuf-mutator`
+PROTOBUF_MUTATOR_LDFLAGS = `pkg-config --libs libprotobuf-mutator`
+
+ROCKSDB_INCLUDE_DIR = $(ROOT_DIR)/include
+ROCKSDB_LIB_DIR = $(ROOT_DIR)
+
+PROTO_IN = $(ROOT_DIR)/fuzz/proto
+PROTO_OUT = $(ROOT_DIR)/fuzz/proto/gen
+
+ifneq ($(FUZZ_ENV), ossfuzz)
+CC = $(CXX)
+CCFLAGS += -Wall -fsanitize=address,fuzzer
+CFLAGS += $(PLATFORM_CXXFLAGS) $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR)
+LDFLAGS += $(PLATFORM_LDFLAGS) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb
+else
+# OSS-Fuzz sets various environment flags that are used for compilation.
+# These environment flags depend on which type of sanitizer build is being
+# used, however, an ASan build would set the environment flags as follows:
+# CFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \
+         -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \
+         -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link"
+# CXXFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \
+           -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \
+           -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link \
+           -stdlib=libc++"
+# LIB_FUZZING_ENGINE="-fsanitize=fuzzer"
+CC = $(CXX)
+CCFLAGS = $(CXXFLAGS)
+CFLAGS += $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR)
+LDFLAGS += $(PLATFORM_LDFLAGS) $(LIB_FUZZING_ENGINE) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb
+endif
+
+.PHONY: gen_proto clean
+
+# Set PROTOC_BIN when invoking `make` if a custom protoc is required.
+PROTOC_BIN ?= protoc
+
+gen_proto:
+	mkdir -p $(PROTO_OUT)
+	$(PROTOC_BIN) \
+		--proto_path=$(PROTO_IN) \
+		--cpp_out=$(PROTO_OUT) \
+		$(PROTO_IN)/*.proto
+
+clean:
+	rm -rf db_fuzzer db_map_fuzzer sst_file_writer_fuzzer $(PROTO_OUT)
+
+db_fuzzer: db_fuzzer.cc
+	$(CC) $(CCFLAGS) -o db_fuzzer db_fuzzer.cc $(CFLAGS) $(LDFLAGS)
+
+db_map_fuzzer: gen_proto db_map_fuzzer.cc proto/gen/db_operation.pb.cc
+	$(CC) $(CCFLAGS) -o db_map_fuzzer db_map_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS)
+
+sst_file_writer_fuzzer: gen_proto sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc
+	$(CC) $(CCFLAGS) -o sst_file_writer_fuzzer sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS)
diff --git a/src/rocksdb/fuzz/README.md b/src/rocksdb/fuzz/README.md
new file mode 100644
index 000000000..238b283a2
--- /dev/null
+++ b/src/rocksdb/fuzz/README.md
@@ -0,0 +1,165 @@
+# Fuzzing RocksDB
+
+## Overview
+
+This directory contains [fuzz tests](https://en.wikipedia.org/wiki/Fuzzing) for RocksDB.
+RocksDB testing infrastructure currently includes unit tests and [stress tests](https://github.com/facebook/rocksdb/wiki/Stress-test),
+we hope fuzz testing can catch more bugs.
+
+## Prerequisite
+
+We use [LLVM libFuzzer](http://llvm.org/docs/LibFuzzer.html) as the fuzzying engine,
+so make sure you have [clang](https://clang.llvm.org/get_started.html) as your compiler.
+
+Some tests rely on [structure aware fuzzing](https://github.com/google/fuzzing/blob/master/docs/structure-aware-fuzzing.md).
+We use [protobuf](https://developers.google.com/protocol-buffers) to define structured input to the fuzzer,
+and use [libprotobuf-mutator](https://github.com/google/libprotobuf-mutator) as the custom libFuzzer mutator.
+So make sure you have protobuf and libprotobuf-mutator installed, and make sure `pkg-config` can find them.
+On some systems, there are both protobuf2 and protobuf3 in the package management system,
+make sure protobuf3 is installed.
+
+If you do not want to install protobuf library yourself, you can rely on libprotobuf-mutator to download protobuf
+for you. For details about installation, please refer to [libprotobuf-mutator README](https://github.com/google/libprotobuf-mutator#readme)
+
+## Example
+
+This example shows you how to do structure aware fuzzing to `rocksdb::SstFileWriter`.
+
+After walking through the steps to create the fuzzer, we'll introduce a bug into `rocksdb::SstFileWriter::Put`,
+then show that the fuzzer can catch the bug.
+
+### Design the test
+
+We want the fuzzing engine to automatically generate a list of database operations,
+then we apply these operations to `SstFileWriter` in sequence,
+finally, after the SST file is generated, we use `SstFileReader` to check the file's checksum.
+
+### Define input
+
+We define the database operations in protobuf, each operation has a type of operation and a key value pair,
+see [proto/db_operation.proto](proto/db_operation.proto) for details.
+
+### Define tests with the input
+
+In [sst_file_writer_fuzzer.cc](sst_file_writer_fuzzer.cc),
+we define the tests to be run on the generated input:
+
+```
+DEFINE_PROTO_FUZZER(DBOperations& input) {
+  // apply the operations to SstFileWriter and use SstFileReader to verify checksum.
+  // ...
+}
+```
+
+`SstFileWriter` requires the keys of the operations to be unique and be in ascending order,
+but the fuzzing engine generates the input randomly, so we need to process the generated input before
+passing it to `DEFINE_PROTO_FUZZER`, this is accomplished by registering a post processor:
+
+```
+protobuf_mutator::libfuzzer::PostProcessorRegistration<DBOperations>
+```
+
+### Compile and link the fuzzer
+
+In the rocksdb root directory, compile rocksdb library by `make static_lib`.
+
+Go to the `fuzz` directory,
+run `make sst_file_writer_fuzzer` to generate the fuzzer,
+it will compile rocksdb static library, generate protobuf, then compile and link `sst_file_writer_fuzzer`.
+
+### Introduce a bug
+
+Manually introduce a bug to `SstFileWriter::Put`:
+
+```
+diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc
+index ab1ee7c4e..c7da9ffa0 100644
+--- a/table/sst_file_writer.cc
++++ b/table/sst_file_writer.cc
+@@ -277,6 +277,11 @@ Status SstFileWriter::Add(const Slice& user_key, const Slice& value) {
+ }
+
+ Status SstFileWriter::Put(const Slice& user_key, const Slice& value) {
++  if (user_key.starts_with("!")) {
++    if (value.ends_with("!")) {
++      return Status::Corruption("bomb");
++    }
++  }
+   return rep_->Add(user_key, value, ValueType::kTypeValue);
+ }
+```
+
+The bug is that for `Put`, if `user_key` starts with `!` and `value` ends with `!`, then corrupt.
+
+### Run fuzz testing to catch the bug
+
+Run the fuzzer by `time ./sst_file_writer_fuzzer`.
+
+Here is the output on my machine:
+
+```
+Corruption: bomb
+==59680== ERROR: libFuzzer: deadly signal
+    #0 0x109487315 in __sanitizer_print_stack_trace+0x35 (libclang_rt.asan_osx_dynamic.dylib:x86_64+0x4d315)
+    #1 0x108d63f18 in fuzzer::PrintStackTrace() FuzzerUtil.cpp:205
+    #2 0x108d47613 in fuzzer::Fuzzer::CrashCallback() FuzzerLoop.cpp:232
+    #3 0x7fff6af535fc in _sigtramp+0x1c (libsystem_platform.dylib:x86_64+0x35fc)
+    #4 0x7ffee720f3ef  (<unknown module>)
+    #5 0x7fff6ae29807 in abort+0x77 (libsystem_c.dylib:x86_64+0x7f807)
+    #6 0x108cf1c4c in TestOneProtoInput(DBOperations&)+0x113c (sst_file_writer_fuzzer:x86_64+0x100302c4c)
+    #7 0x108cf09be in LLVMFuzzerTestOneInput+0x16e (sst_file_writer_fuzzer:x86_64+0x1003019be)
+    #8 0x108d48ce0 in fuzzer::Fuzzer::ExecuteCallback(unsigned char const*, unsigned long) FuzzerLoop.cpp:556
+    #9 0x108d48425 in fuzzer::Fuzzer::RunOne(unsigned char const*, unsigned long, bool, fuzzer::InputInfo*, bool*) FuzzerLoop.cpp:470
+    #10 0x108d4a626 in fuzzer::Fuzzer::MutateAndTestOne() FuzzerLoop.cpp:698
+    #11 0x108d4b325 in fuzzer::Fuzzer::Loop(std::__1::vector<fuzzer::SizedFile, fuzzer::fuzzer_allocator<fuzzer::SizedFile> >&) FuzzerLoop.cpp:830
+    #12 0x108d37fcd in fuzzer::FuzzerDriver(int*, char***, int (*)(unsigned char const*, unsigned long)) FuzzerDriver.cpp:829
+    #13 0x108d652b2 in main FuzzerMain.cpp:19
+    #14 0x7fff6ad5acc8 in start+0x0 (libdyld.dylib:x86_64+0x1acc8)
+
+NOTE: libFuzzer has rudimentary signal handlers.
+      Combine libFuzzer with AddressSanitizer or similar for better crash reports.
+SUMMARY: libFuzzer: deadly signal
+MS: 7 Custom-CustomCrossOver-InsertByte-Custom-ChangeBit-Custom-CustomCrossOver-; base unit: 90863b4d83c3f994bba0a417d0c2ee3b68f9e795
+0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x76,0x61,0x6c,0x75,0x65,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2b,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2e,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x5c,0x32,0x35,0x33,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,
+operations {\x0a  key: \"!\"\x0a  value: \"!\"\x0a  type: PUT\x0a}\x0aoperations {\x0a  key: \"+\"\x0a  type: PUT\x0a}\x0aoperations {\x0a  key: \".\"\x0a  type: PUT\x0a}\x0aoperations {\x0a  key: \"\\253\"\x0a  type: PUT\x0a}\x0a
+artifact_prefix='./'; Test unit written to ./crash-a1460be302d09b548e61787178d9edaa40aea467
+Base64: b3BlcmF0aW9ucyB7CiAga2V5OiAiISIKICB2YWx1ZTogIiEiCiAgdHlwZTogUFVUCn0Kb3BlcmF0aW9ucyB7CiAga2V5OiAiKyIKICB0eXBlOiBQVVQKfQpvcGVyYXRpb25zIHsKICBrZXk6ICIuIgogIHR5cGU6IFBVVAp9Cm9wZXJhdGlvbnMgewogIGtleTogIlwyNTMiCiAgdHlwZTogUFVUCn0K
+./sst_file_writer_fuzzer  5.97s user 4.40s system 64% cpu 16.195 total
+```
+
+Within 6 seconds, it catches the bug.
+
+The input that triggers the bug is persisted in `./crash-a1460be302d09b548e61787178d9edaa40aea467`:
+
+```
+$ cat ./crash-a1460be302d09b548e61787178d9edaa40aea467
+operations {
+  key: "!"
+  value: "!"
+  type: PUT
+}
+operations {
+  key: "+"
+  type: PUT
+}
+operations {
+  key: "."
+  type: PUT
+}
+operations {
+  key: "\253"
+  type: PUT
+}
+```
+
+### Reproduce the crash to debug
+
+The above crash can be reproduced by `./sst_file_writer_fuzzer ./crash-a1460be302d09b548e61787178d9edaa40aea467`,
+so you can debug the crash.
+
+## Future Work
+
+According to [OSS-Fuzz](https://github.com/google/oss-fuzz),
+`as of June 2020, OSS-Fuzz has found over 20,000 bugs in 300 open source projects.`
+
+RocksDB can join OSS-Fuzz together with other open source projects such as sqlite.
diff --git a/src/rocksdb/fuzz/db_fuzzer.cc b/src/rocksdb/fuzz/db_fuzzer.cc
new file mode 100644
index 000000000..e6d5bb63c
--- /dev/null
+++ b/src/rocksdb/fuzz/db_fuzzer.cc
@@ -0,0 +1,172 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <fuzzer/FuzzedDataProvider.h>
+
+#include "rocksdb/db.h"
+
+enum OperationType {
+  kPut,
+  kGet,
+  kDelete,
+  kGetProperty,
+  kIterator,
+  kSnapshot,
+  kOpenClose,
+  kColumn,
+  kCompactRange,
+  kSeekForPrev,
+  OP_COUNT
+};
+
+constexpr char db_path[] = "/tmp/testdb";
+
+// Fuzzes DB operations by doing interpretations on the data. Both the
+// sequence of API calls to be called on the DB as well as the arguments
+// to each of these APIs are interpreted by way of the data buffer.
+// The operations that the fuzzer supports are given by the OperationType
+// enum. The goal is to capture sanitizer bugs, so the code should be
+// compiled with a given sanitizer (ASan, UBSan, MSan).
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  ROCKSDB_NAMESPACE::DB* db;
+  ROCKSDB_NAMESPACE::Options options;
+  options.create_if_missing = true;
+  ROCKSDB_NAMESPACE::Status status =
+      ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db);
+  if (!status.ok()) {
+    return 0;
+  }
+  FuzzedDataProvider fuzzed_data(data, size);
+
+  // perform a sequence of calls on our db instance
+  int max_iter = static_cast<int>(data[0]);
+  for (int i = 0; i < max_iter && i < size; i++) {
+    OperationType op = static_cast<OperationType>(data[i] % OP_COUNT);
+
+    switch (op) {
+      case kPut: {
+        std::string key = fuzzed_data.ConsumeRandomLengthString();
+        std::string val = fuzzed_data.ConsumeRandomLengthString();
+        db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, val);
+        break;
+      }
+      case kGet: {
+        std::string key = fuzzed_data.ConsumeRandomLengthString();
+        std::string value;
+        db->Get(ROCKSDB_NAMESPACE::ReadOptions(), key, &value);
+        break;
+      }
+      case kDelete: {
+        std::string key = fuzzed_data.ConsumeRandomLengthString();
+        db->Delete(ROCKSDB_NAMESPACE::WriteOptions(), key);
+        break;
+      }
+      case kGetProperty: {
+        std::string prop;
+        std::string property_name = fuzzed_data.ConsumeRandomLengthString();
+        db->GetProperty(property_name, &prop);
+        break;
+      }
+      case kIterator: {
+        ROCKSDB_NAMESPACE::Iterator* it =
+            db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions());
+        for (it->SeekToFirst(); it->Valid(); it->Next()) {
+        }
+        delete it;
+        break;
+      }
+      case kSnapshot: {
+        ROCKSDB_NAMESPACE::ReadOptions snapshot_options;
+        snapshot_options.snapshot = db->GetSnapshot();
+        ROCKSDB_NAMESPACE::Iterator* it = db->NewIterator(snapshot_options);
+        db->ReleaseSnapshot(snapshot_options.snapshot);
+        delete it;
+        break;
+      }
+      case kOpenClose: {
+        db->Close();
+        delete db;
+        status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db);
+        if (!status.ok()) {
+          ROCKSDB_NAMESPACE::DestroyDB(db_path, options);
+          return 0;
+        }
+
+        break;
+      }
+      case kColumn: {
+        ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf;
+        ROCKSDB_NAMESPACE::Status s;
+        s = db->CreateColumnFamily(ROCKSDB_NAMESPACE::ColumnFamilyOptions(),
+                                   "new_cf", &cf);
+        s = db->DestroyColumnFamilyHandle(cf);
+        db->Close();
+        delete db;
+
+        // open DB with two column families
+        std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> column_families;
+        // have to open default column family
+        column_families.push_back(ROCKSDB_NAMESPACE::ColumnFamilyDescriptor(
+            ROCKSDB_NAMESPACE::kDefaultColumnFamilyName,
+            ROCKSDB_NAMESPACE::ColumnFamilyOptions()));
+        // open the new one, too
+        column_families.push_back(ROCKSDB_NAMESPACE::ColumnFamilyDescriptor(
+            "new_cf", ROCKSDB_NAMESPACE::ColumnFamilyOptions()));
+        std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> handles;
+        s = ROCKSDB_NAMESPACE::DB::Open(ROCKSDB_NAMESPACE::DBOptions(), db_path,
+                                        column_families, &handles, &db);
+
+        if (s.ok()) {
+          std::string key1 = fuzzed_data.ConsumeRandomLengthString();
+          std::string val1 = fuzzed_data.ConsumeRandomLengthString();
+          std::string key2 = fuzzed_data.ConsumeRandomLengthString();
+          s = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), handles[1], key1,
+                      val1);
+          std::string value;
+          s = db->Get(ROCKSDB_NAMESPACE::ReadOptions(), handles[1], key2,
+                      &value);
+          s = db->DropColumnFamily(handles[1]);
+          for (auto handle : handles) {
+            s = db->DestroyColumnFamilyHandle(handle);
+          }
+        } else {
+          status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db);
+          if (!status.ok()) {
+            // At this point there is no saving to do. So we exit
+            ROCKSDB_NAMESPACE::DestroyDB(db_path, ROCKSDB_NAMESPACE::Options());
+            return 0;
+          }
+        }
+        break;
+      }
+      case kCompactRange: {
+        std::string slice_start = fuzzed_data.ConsumeRandomLengthString();
+        std::string slice_end = fuzzed_data.ConsumeRandomLengthString();
+
+        ROCKSDB_NAMESPACE::Slice begin(slice_start);
+        ROCKSDB_NAMESPACE::Slice end(slice_end);
+        ROCKSDB_NAMESPACE::CompactRangeOptions options;
+        ROCKSDB_NAMESPACE::Status s = db->CompactRange(options, &begin, &end);
+        break;
+      }
+      case kSeekForPrev: {
+        std::string key = fuzzed_data.ConsumeRandomLengthString();
+        auto iter = db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions());
+        iter->SeekForPrev(key);
+        delete iter;
+        break;
+      }
+      case OP_COUNT:
+        break;
+    }
+  }
+
+  // Cleanup DB
+  db->Close();
+  delete db;
+  ROCKSDB_NAMESPACE::DestroyDB(db_path, options);
+  return 0;
+}
diff --git a/src/rocksdb/fuzz/db_map_fuzzer.cc b/src/rocksdb/fuzz/db_map_fuzzer.cc
new file mode 100644
index 000000000..ed9df8f84
--- /dev/null
+++ b/src/rocksdb/fuzz/db_map_fuzzer.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <string>
+
+#include "proto/gen/db_operation.pb.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "src/libfuzzer/libfuzzer_macro.h"
+#include "util.h"
+
+protobuf_mutator::libfuzzer::PostProcessorRegistration<DBOperations> reg = {
+    [](DBOperations* input, unsigned int /* seed */) {
+      const ROCKSDB_NAMESPACE::Comparator* comparator =
+          ROCKSDB_NAMESPACE::BytewiseComparator();
+      auto ops = input->mutable_operations();
+      // Make sure begin <= end for DELETE_RANGE.
+      for (DBOperation& op : *ops) {
+        if (op.type() == OpType::DELETE_RANGE) {
+          auto begin = op.key();
+          auto end = op.value();
+          if (comparator->Compare(begin, end) > 0) {
+            std::swap(begin, end);
+            op.set_key(begin);
+            op.set_value(end);
+          }
+        }
+      }
+    }};
+
+// Execute randomly generated operations on both a DB and a std::map,
+// then reopen the DB and make sure that iterating the DB produces the
+// same key-value pairs as iterating through the std::map.
+DEFINE_PROTO_FUZZER(DBOperations& input) {
+  if (input.operations().empty()) {
+    return;
+  }
+
+  const std::string kDbPath = "/tmp/db_map_fuzzer_test";
+  auto fs = ROCKSDB_NAMESPACE::FileSystem::Default();
+  if (fs->FileExists(kDbPath, ROCKSDB_NAMESPACE::IOOptions(), /*dbg=*/nullptr)
+          .ok()) {
+    std::cerr << "db path " << kDbPath << " already exists" << std::endl;
+    abort();
+  }
+
+  std::map<std::string, std::string> kv;
+  ROCKSDB_NAMESPACE::DB* db = nullptr;
+  ROCKSDB_NAMESPACE::Options options;
+  options.create_if_missing = true;
+  CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db));
+
+  for (const DBOperation& op : input.operations()) {
+    switch (op.type()) {
+      case OpType::PUT: {
+        CHECK_OK(
+            db->Put(ROCKSDB_NAMESPACE::WriteOptions(), op.key(), op.value()));
+        kv[op.key()] = op.value();
+        break;
+      }
+      case OpType::MERGE: {
+        break;
+      }
+      case OpType::DELETE: {
+        CHECK_OK(db->Delete(ROCKSDB_NAMESPACE::WriteOptions(), op.key()));
+        kv.erase(op.key());
+        break;
+      }
+      case OpType::DELETE_RANGE: {
+        // [op.key(), op.value()) corresponds to [begin, end).
+        CHECK_OK(db->DeleteRange(ROCKSDB_NAMESPACE::WriteOptions(),
+                                 db->DefaultColumnFamily(), op.key(),
+                                 op.value()));
+        kv.erase(kv.lower_bound(op.key()), kv.lower_bound(op.value()));
+        break;
+      }
+      default: {
+        std::cerr << "Unsupported operation" << static_cast<int>(op.type());
+        return;
+      }
+    }
+  }
+  CHECK_OK(db->Close());
+  delete db;
+  db = nullptr;
+
+  CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db));
+  auto kv_it = kv.begin();
+  ROCKSDB_NAMESPACE::Iterator* it =
+      db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions());
+  for (it->SeekToFirst(); it->Valid(); it->Next(), kv_it++) {
+    CHECK_TRUE(kv_it != kv.end());
+    CHECK_EQ(it->key().ToString(), kv_it->first);
+    CHECK_EQ(it->value().ToString(), kv_it->second);
+  }
+  CHECK_TRUE(kv_it == kv.end());
+  delete it;
+
+  CHECK_OK(db->Close());
+  delete db;
+  CHECK_OK(ROCKSDB_NAMESPACE::DestroyDB(kDbPath, options));
+}
diff --git a/src/rocksdb/fuzz/proto/db_operation.proto b/src/rocksdb/fuzz/proto/db_operation.proto
new file mode 100644
index 000000000..20a55eaa5
--- /dev/null
+++ b/src/rocksdb/fuzz/proto/db_operation.proto
@@ -0,0 +1,28 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// Defines database operations.
+// Each operation is a key-value pair and an operation type.
+
+syntax = "proto2";
+
+enum OpType {
+  PUT = 0;
+  MERGE = 1;
+  DELETE = 2;
+  DELETE_RANGE = 3;
+}
+
+message DBOperation {
+  required string key = 1;
+  // value is ignored for DELETE.
+  // [key, value] is the range for DELETE_RANGE.
+  optional string value = 2;
+  required OpType type = 3;
+}
+
+message DBOperations {
+  repeated DBOperation operations = 1;
+}
diff --git a/src/rocksdb/fuzz/sst_file_writer_fuzzer.cc b/src/rocksdb/fuzz/sst_file_writer_fuzzer.cc
new file mode 100644
index 000000000..e93b9a3f5
--- /dev/null
+++ b/src/rocksdb/fuzz/sst_file_writer_fuzzer.cc
@@ -0,0 +1,209 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "proto/gen/db_operation.pb.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_writer.h"
+#include "src/libfuzzer/libfuzzer_macro.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "util.h"
+
+using ROCKSDB_NAMESPACE::BytewiseComparator;
+using ROCKSDB_NAMESPACE::Comparator;
+using ROCKSDB_NAMESPACE::EnvOptions;
+using ROCKSDB_NAMESPACE::ExternalSstFileInfo;
+using ROCKSDB_NAMESPACE::FileOptions;
+using ROCKSDB_NAMESPACE::FileSystem;
+using ROCKSDB_NAMESPACE::ImmutableCFOptions;
+using ROCKSDB_NAMESPACE::ImmutableOptions;
+using ROCKSDB_NAMESPACE::InternalIterator;
+using ROCKSDB_NAMESPACE::IOOptions;
+using ROCKSDB_NAMESPACE::kMaxSequenceNumber;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ParsedInternalKey;
+using ROCKSDB_NAMESPACE::ParseInternalKey;
+using ROCKSDB_NAMESPACE::RandomAccessFileReader;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::TableReader;
+using ROCKSDB_NAMESPACE::TableReaderCaller;
+using ROCKSDB_NAMESPACE::TableReaderOptions;
+using ROCKSDB_NAMESPACE::ValueType;
+
+// Keys in SST file writer operations must be unique and in ascending order.
+// For each DBOperation generated by the fuzzer, this function is called on
+// it to deduplicate and sort the keys in the DBOperations.
+protobuf_mutator::libfuzzer::PostProcessorRegistration<DBOperations> reg = {
+    [](DBOperations* input, unsigned int /* seed */) {
+      const Comparator* comparator = BytewiseComparator();
+      auto ops = input->mutable_operations();
+
+      // Make sure begin <= end for DELETE_RANGE.
+      for (DBOperation& op : *ops) {
+        if (op.type() == OpType::DELETE_RANGE) {
+          auto begin = op.key();
+          auto end = op.value();
+          if (comparator->Compare(begin, end) > 0) {
+            std::swap(begin, end);
+            op.set_key(begin);
+            op.set_value(end);
+          }
+        }
+      }
+
+      std::sort(ops->begin(), ops->end(),
+                [&comparator](const DBOperation& a, const DBOperation& b) {
+                  return comparator->Compare(a.key(), b.key()) < 0;
+                });
+
+      auto last = std::unique(
+          ops->begin(), ops->end(),
+          [&comparator](const DBOperation& a, const DBOperation& b) {
+            return comparator->Compare(a.key(), b.key()) == 0;
+          });
+      ops->erase(last, ops->end());
+    }};
+
+TableReader* NewTableReader(const std::string& sst_file_path,
+                            const Options& options,
+                            const EnvOptions& env_options,
+                            const ImmutableCFOptions& cf_ioptions) {
+  // This code block is similar to SstFileReader::Open.
+
+  uint64_t file_size = 0;
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  std::unique_ptr<TableReader> table_reader;
+  const auto& fs = options.env->GetFileSystem();
+  FileOptions fopts(env_options);
+  Status s = options.env->GetFileSize(sst_file_path, &file_size);
+  if (s.ok()) {
+    s = RandomAccessFileReader::Create(fs, sst_file_path, fopts, &file_reader,
+                                       nullptr);
+  }
+  if (s.ok()) {
+    ImmutableOptions iopts(options, cf_ioptions);
+    TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, env_options,
+                             cf_ioptions.internal_comparator);
+    t_opt.largest_seqno = kMaxSequenceNumber;
+    s = options.table_factory->NewTableReader(t_opt, std::move(file_reader),
+                                              file_size, &table_reader,
+                                              /*prefetch=*/false);
+  }
+  if (!s.ok()) {
+    std::cerr << "Failed to create TableReader for " << sst_file_path << ": "
+              << s.ToString() << std::endl;
+    abort();
+  }
+  return table_reader.release();
+}
+
+ValueType ToValueType(OpType op_type) {
+  switch (op_type) {
+    case OpType::PUT:
+      return ValueType::kTypeValue;
+    case OpType::MERGE:
+      return ValueType::kTypeMerge;
+    case OpType::DELETE:
+      return ValueType::kTypeDeletion;
+    case OpType::DELETE_RANGE:
+      return ValueType::kTypeRangeDeletion;
+    default:
+      std::cerr << "Unknown operation type " << static_cast<int>(op_type)
+                << std::endl;
+      abort();
+  }
+}
+
+// Fuzzes DB operations as input, let SstFileWriter generate a SST file
+// according to the operations, then let TableReader read and check all the
+// key-value pairs from the generated SST file.
+DEFINE_PROTO_FUZZER(DBOperations& input) {
+  if (input.operations().empty()) {
+    return;
+  }
+
+  std::string sstfile;
+  {
+    auto fs = FileSystem::Default();
+    std::string dir;
+    IOOptions opt;
+    CHECK_OK(fs->GetTestDirectory(opt, &dir, nullptr));
+    sstfile = dir + "/SstFileWriterFuzzer.sst";
+  }
+
+  Options options;
+  EnvOptions env_options(options);
+  ImmutableCFOptions cf_ioptions(options);
+
+  // Generate sst file.
+  SstFileWriter writer(env_options, options);
+  CHECK_OK(writer.Open(sstfile));
+  for (const DBOperation& op : input.operations()) {
+    switch (op.type()) {
+      case OpType::PUT: {
+        CHECK_OK(writer.Put(op.key(), op.value()));
+        break;
+      }
+      case OpType::MERGE: {
+        CHECK_OK(writer.Merge(op.key(), op.value()));
+        break;
+      }
+      case OpType::DELETE: {
+        CHECK_OK(writer.Delete(op.key()));
+        break;
+      }
+      case OpType::DELETE_RANGE: {
+        CHECK_OK(writer.DeleteRange(op.key(), op.value()));
+        break;
+      }
+      default: {
+        std::cerr << "Unsupported operation" << static_cast<int>(op.type())
+                  << std::endl;
+        abort();
+      }
+    }
+  }
+  ExternalSstFileInfo info;
+  CHECK_OK(writer.Finish(&info));
+
+  // Iterate and verify key-value pairs.
+  std::unique_ptr<TableReader> table_reader(
+      ::NewTableReader(sstfile, options, env_options, cf_ioptions));
+  ReadOptions roptions;
+  CHECK_OK(table_reader->VerifyChecksum(roptions,
+                                        TableReaderCaller::kUncategorized));
+  std::unique_ptr<InternalIterator> it(
+      table_reader->NewIterator(roptions, /*prefix_extractor=*/nullptr,
+                                /*arena=*/nullptr, /*skip_filters=*/true,
+                                TableReaderCaller::kUncategorized));
+  it->SeekToFirst();
+  for (const DBOperation& op : input.operations()) {
+    if (op.type() == OpType::DELETE_RANGE) {
+      // InternalIterator cannot iterate over DELETE_RANGE entries.
+      continue;
+    }
+    CHECK_TRUE(it->Valid());
+    ParsedInternalKey ikey;
+    CHECK_OK(ParseInternalKey(it->key(), &ikey, /*log_err_key=*/true));
+    CHECK_EQ(ikey.user_key.ToString(), op.key());
+    CHECK_EQ(ikey.sequence, 0);
+    CHECK_EQ(ikey.type, ToValueType(op.type()));
+    if (op.type() != OpType::DELETE) {
+      CHECK_EQ(op.value(), it->value().ToString());
+    }
+    it->Next();
+  }
+  CHECK_TRUE(!it->Valid());
+
+  // Delete sst file.
+  remove(sstfile.c_str());
+}
diff --git a/src/rocksdb/fuzz/util.h b/src/rocksdb/fuzz/util.h
new file mode 100644
index 000000000..97011823a
--- /dev/null
+++ b/src/rocksdb/fuzz/util.h
@@ -0,0 +1,29 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#define CHECK_OK(expression)                       \
+  do {                                             \
+    auto status = (expression);                    \
+    if (!status.ok()) {                            \
+      std::cerr << status.ToString() << std::endl; \
+      abort();                                     \
+    }                                              \
+  } while (0)
+
+#define CHECK_EQ(a, b)                                                      \
+  if (a != b) {                                                             \
+    std::cerr << "(" << #a << "=" << a << ") != (" << #b << "=" << b << ")" \
+              << std::endl;                                                 \
+    abort();                                                                \
+  }
+
+#define CHECK_TRUE(cond)                                      \
+  if (!(cond)) {                                              \
+    std::cerr << "\"" << #cond << "\" is false" << std::endl; \
+    abort();                                                  \
+  }
diff --git a/src/rocksdb/include/rocksdb/advanced_options.h b/src/rocksdb/include/rocksdb/advanced_options.h
new file mode 100644
index 000000000..258cf82a1
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/advanced_options.h
@@ -0,0 +1,1098 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/universal_compaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class SliceTransform;
+class TablePropertiesCollectorFactory;
+class TableFactory;
+struct Options;
+
+enum CompactionStyle : char {
+  // level based compaction style
+  kCompactionStyleLevel = 0x0,
+  // Universal compaction style
+  // Not supported in ROCKSDB_LITE.
+  kCompactionStyleUniversal = 0x1,
+  // FIFO compaction style
+  // Not supported in ROCKSDB_LITE
+  kCompactionStyleFIFO = 0x2,
+  // Disable background compaction. Compaction jobs are submitted
+  // via CompactFiles().
+  // Not supported in ROCKSDB_LITE
+  kCompactionStyleNone = 0x3,
+};
+
+// In Level-based compaction, it Determines which file from a level to be
+// picked to merge to the next level. We suggest people try
+// kMinOverlappingRatio first when you tune your database.
+enum CompactionPri : char {
+  // Slightly prioritize larger files by size compensated by #deletes
+  kByCompensatedSize = 0x0,
+  // First compact files whose data's latest update time is oldest.
+  // Try this if you only update some hot keys in small ranges.
+  kOldestLargestSeqFirst = 0x1,
+  // First compact files whose range hasn't been compacted to the next level
+  // for the longest. If your updates are random across the key space,
+  // write amplification is slightly better with this option.
+  kOldestSmallestSeqFirst = 0x2,
+  // First compact files whose ratio between overlapping size in next level
+  // and its size is the smallest. It in many cases can optimize write
+  // amplification.
+  kMinOverlappingRatio = 0x3,
+  // Keeps a cursor(s) of the successor of the file (key range) was/were
+  // compacted before, and always picks the next files (key range) in that
+  // level. The file picking process will cycle through all the files in a
+  // round-robin manner.
+  kRoundRobin = 0x4,
+};
+
+struct CompactionOptionsFIFO {
+  // once the total sum of table files reaches this, we will delete the oldest
+  // table file
+  // Default: 1GB
+  uint64_t max_table_files_size;
+
+  // If true, try to do compaction to compact smaller files into larger ones.
+  // Minimum files to compact follows options.level0_file_num_compaction_trigger
+  // and compaction won't trigger if average compact bytes per del file is
+  // larger than options.write_buffer_size. This is to protect large files
+  // from being compacted again.
+  // Default: false;
+  bool allow_compaction = false;
+
+  // When not 0, if the data in the file is older than this threshold, RocksDB
+  // will soon move the file to warm temperature.
+  uint64_t age_for_warm = 0;
+
+  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
+  CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
+      : max_table_files_size(_max_table_files_size),
+        allow_compaction(_allow_compaction) {}
+};
+
+// Compression options for different compression algorithms like Zlib
+struct CompressionOptions {
+  // RocksDB's generic default compression level. Internally it'll be translated
+  // to the default compression level specific to the library being used (see
+  // comment above `ColumnFamilyOptions::compression`).
+  //
+  // The default value is the max 16-bit int as it'll be written out in OPTIONS
+  // file, which should be portable.
+  const static int kDefaultCompressionLevel = 32767;
+
+  int window_bits;
+  int level;
+  int strategy;
+
+  // Maximum size of dictionaries used to prime the compression library.
+  // Enabling dictionary can improve compression ratios when there are
+  // repetitions across data blocks.
+  //
+  // The dictionary is created by sampling the SST file data. If
+  // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's
+  // dictionary generator (see comments for option `use_zstd_dict_trainer` for
+  // detail on dictionary generator). If `zstd_max_train_bytes` is zero, the
+  // random samples are used directly as the dictionary.
+  //
+  // When compression dictionary is disabled, we compress and write each block
+  // before buffering data for the next one. When compression dictionary is
+  // enabled, we buffer SST file data in-memory so we can sample it, as data
+  // can only be compressed and written after the dictionary has been finalized.
+  //
+  // The amount of data buffered can be limited by `max_dict_buffer_bytes`. This
+  // buffered memory is charged to the block cache when there is a block cache.
+  // If block cache insertion fails with `Status::MemoryLimit` (i.e., it is
+  // full), we finalize the dictionary with whatever data we have and then stop
+  // buffering.
+  //
+  // Default: 0.
+  uint32_t max_dict_bytes;
+
+  // Maximum size of training data passed to zstd's dictionary trainer. Using
+  // zstd's dictionary trainer can achieve even better compression ratio
+  // improvements than using `max_dict_bytes` alone.
+  //
+  // The training data will be used to generate a dictionary of max_dict_bytes.
+  //
+  // Default: 0.
+  uint32_t zstd_max_train_bytes;
+
+  // Number of threads for parallel compression.
+  // Parallel compression is enabled only if threads > 1.
+  // THE FEATURE IS STILL EXPERIMENTAL
+  //
+  // This option is valid only when BlockBasedTable is used.
+  //
+  // When parallel compression is enabled, SST size file sizes might be
+  // more inflated compared to the target size, because more data of unknown
+  // compressed size is in flight when compression is parallelized. To be
+  // reasonably accurate, this inflation is also estimated by using historical
+  // compression ratio and current bytes inflight.
+  //
+  // Default: 1.
+  uint32_t parallel_threads;
+
+  // When the compression options are set by the user, it will be set to "true".
+  // For bottommost_compression_opts, to enable it, user must set enabled=true.
+  // Otherwise, bottommost compression will use compression_opts as default
+  // compression options.
+  //
+  // For compression_opts, if compression_opts.enabled=false, it is still
+  // used as compression options for compression process.
+  //
+  // Default: false.
+  bool enabled;
+
+  // Limit on data buffering when gathering samples to build a dictionary. Zero
+  // means no limit. When dictionary is disabled (`max_dict_bytes == 0`),
+  // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect.
+  //
+  // In compaction, the buffering is limited to the target file size (see
+  // `target_file_size_base` and `target_file_size_multiplier`) even if this
+  // setting permits more buffering. Since we cannot determine where the file
+  // should be cut until data blocks are compressed with dictionary, buffering
+  // more than the target file size could lead to selecting samples that belong
+  // to a later output SST.
+  //
+  // Limiting too strictly may harm dictionary effectiveness since it forces
+  // RocksDB to pick samples from the initial portion of the output SST, which
+  // may not be representative of the whole file. Configuring this limit below
+  // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can
+  // pass to the dictionary trainer. Configuring it below `max_dict_bytes` can
+  // restrict the size of the final dictionary.
+  //
+  // Default: 0 (unlimited)
+  uint64_t max_dict_buffer_bytes;
+
+  // Use zstd trainer to generate dictionaries. When this option is set to true,
+  // zstd_max_train_bytes of training data sampled from max_dict_buffer_bytes
+  // buffered data will be passed to zstd dictionary trainer to generate a
+  // dictionary of size max_dict_bytes.
+  //
+  // When this option is false, zstd's API ZDICT_finalizeDictionary() will be
+  // called to generate dictionaries. zstd_max_train_bytes of training sampled
+  // data will be passed to this API. Using this API should save CPU time on
+  // dictionary training, but the compression ratio may not be as good as using
+  // a dictionary trainer.
+  //
+  // Default: true
+  bool use_zstd_dict_trainer;
+
+  CompressionOptions()
+      : window_bits(-14),
+        level(kDefaultCompressionLevel),
+        strategy(0),
+        max_dict_bytes(0),
+        zstd_max_train_bytes(0),
+        parallel_threads(1),
+        enabled(false),
+        max_dict_buffer_bytes(0),
+        use_zstd_dict_trainer(true) {}
+  CompressionOptions(int wbits, int _lev, int _strategy,
+                     uint32_t _max_dict_bytes, uint32_t _zstd_max_train_bytes,
+                     uint32_t _parallel_threads, bool _enabled,
+                     uint64_t _max_dict_buffer_bytes,
+                     bool _use_zstd_dict_trainer)
+      : window_bits(wbits),
+        level(_lev),
+        strategy(_strategy),
+        max_dict_bytes(_max_dict_bytes),
+        zstd_max_train_bytes(_zstd_max_train_bytes),
+        parallel_threads(_parallel_threads),
+        enabled(_enabled),
+        max_dict_buffer_bytes(_max_dict_buffer_bytes),
+        use_zstd_dict_trainer(_use_zstd_dict_trainer) {}
+};
+
+// Temperature of a file. Used to pass to FileSystem for a different
+// placement and/or coding.
+// Reserve some numbers in the middle, in case we need to insert new tier
+// there.
+enum class Temperature : uint8_t {
+  kUnknown = 0,
+  kHot = 0x04,
+  kWarm = 0x08,
+  kCold = 0x0C,
+  kLastTemperature,
+};
+
+// The control option of how the cache tiers will be used. Currently rocksdb
+// support block cache (volatile tier), secondary cache (non-volatile tier).
+// In the future, we may add more caching layers.
+enum class CacheTier : uint8_t {
+  kVolatileTier = 0,
+  kNonVolatileBlockTier = 0x01,
+};
+
+enum UpdateStatus {     // Return status For inplace update callback
+  UPDATE_FAILED = 0,    // Nothing to update
+  UPDATED_INPLACE = 1,  // Value updated inplace
+  UPDATED = 2,          // No inplace update. Merged value set
+};
+
+enum class PrepopulateBlobCache : uint8_t {
+  kDisable = 0x0,    // Disable prepopulate blob cache
+  kFlushOnly = 0x1,  // Prepopulate blobs during flush only
+};
+
+struct AdvancedColumnFamilyOptions {
+  // The maximum number of write buffers that are built up in memory.
+  // The default and the minimum number is 2, so that when 1 write buffer
+  // is being flushed to storage, new writes can continue to the other
+  // write buffer.
+  // If max_write_buffer_number > 3, writing will be slowed down to
+  // options.delayed_write_rate if we are writing to the last write buffer
+  // allowed.
+  //
+  // Default: 2
+  //
+  // Dynamically changeable through SetOptions() API
+  int max_write_buffer_number = 2;
+
+  // The minimum number of write buffers that will be merged together
+  // before writing to storage.  If set to 1, then
+  // all write buffers are flushed to L0 as individual files and this increases
+  // read amplification because a get request has to check in all of these
+  // files. Also, an in-memory merge may result in writing lesser
+  // data to storage if there are duplicate records in each of these
+  // individual write buffers.
+  // If atomic flush is enabled (options.atomic_flush == true), then this
+  // option will be sanitized to 1.
+  // Default: 1
+  int min_write_buffer_number_to_merge = 1;
+
+  // DEPRECATED
+  // The total maximum number of write buffers to maintain in memory including
+  // copies of buffers that have already been flushed.  Unlike
+  // max_write_buffer_number, this parameter does not affect flushing.
+  // This parameter is being replaced by max_write_buffer_size_to_maintain.
+  // If both parameters are set to non-zero values, this parameter will be
+  // ignored.
+  int max_write_buffer_number_to_maintain = 0;
+
+  // The target number of write history bytes to hold in memory. Write history
+  // comprises the latest write buffers (memtables). To reach the target, write
+  // buffers that were most recently flushed to SST files may be retained in
+  // memory.
+  //
+  // This controls the target amount of write history that will be available
+  // in memory for conflict checking when Transactions are used.
+  //
+  // This target may be undershot when the CF first opens and has not recovered
+  // or received enough writes to reach the target. After reaching the target
+  // once, it is guaranteed to never undershoot again. That guarantee is
+  // implemented by retaining flushed write buffers in-memory until the oldest
+  // one can be trimmed without dropping below the target.
+  //
+  // Examples with `max_write_buffer_size_to_maintain` set to 32MB:
+  //
+  // - One mutable memtable of 64MB, one unflushed immutable memtable of 64MB,
+  //   and zero flushed immutable memtables. Nothing trimmable exists.
+  // - One mutable memtable of 16MB, zero unflushed immutable memtables, and
+  //   one flushed immutable memtable of 64MB. Trimming is disallowed because
+  //   dropping the earliest (only) flushed immutable memtable would result in
+  //   write history of 16MB < 32MB.
+  // - One mutable memtable of 24MB, one unflushed immutable memtable of 16MB,
+  //   and one flushed immutable memtable of 16MB. The earliest (only) flushed
+  //   immutable memtable is trimmed because without it we still have
+  //   16MB + 24MB = 40MB > 32MB of write history.
+  //
+  // When using an OptimisticTransactionDB:
+  // If this value is too low, some transactions may fail at commit time due
+  // to not being able to determine whether there were any write conflicts.
+  //
+  // When using a TransactionDB:
+  // If Transaction::SetSnapshot is used, TransactionDB will read either
+  // in-memory write buffers or SST files to do write-conflict checking.
+  // Increasing this value can reduce the number of reads to SST files
+  // done for conflict detection.
+  //
+  // Setting this value to 0 will cause write buffers to be freed immediately
+  // after they are flushed. If this value is set to -1,
+  // 'max_write_buffer_number * write_buffer_size' will be used.
+  //
+  // Default:
+  // If using a TransactionDB/OptimisticTransactionDB, the default value will
+  // be set to the value of 'max_write_buffer_number * write_buffer_size'
+  // if it is not explicitly set by the user.  Otherwise, the default is 0.
+  int64_t max_write_buffer_size_to_maintain = 0;
+
+  // Allows thread-safe inplace updates. If this is true, there is no way to
+  // achieve point-in-time consistency using snapshot or iterator (assuming
+  // concurrent updates). Hence iterator and multi-get will return results
+  // which are not consistent as of any point-in-time.
+  // Backward iteration on memtables will not work either.
+  // If inplace_callback function is not set,
+  //   Put(key, new_value) will update inplace the existing_value iff
+  //   * key exists in current memtable
+  //   * new sizeof(new_value) <= sizeof(existing_value)
+  //   * existing_value for that key is a put i.e. kTypeValue
+  // If inplace_callback function is set, check doc for inplace_callback.
+  // Default: false.
+  bool inplace_update_support = false;
+
+  // Number of locks used for inplace update
+  // Default: 10000, if inplace_update_support = true, else 0.
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t inplace_update_num_locks = 10000;
+
+  // [experimental]
+  // Used to activate or deactive the Mempurge feature (memtable garbage
+  // collection). (deactivated by default). At every flush, the total useful
+  // payload (total entries minus garbage entries) is estimated as a ratio
+  // [useful payload bytes]/[size of a memtable (in bytes)]. This ratio is then
+  // compared to this `threshold` value:
+  //     - if ratio<threshold: the flush is replaced by a mempurge operation
+  //     - else: a regular flush operation takes place.
+  // Threshold values:
+  //   0.0: mempurge deactivated (default).
+  //   1.0: recommended threshold value.
+  //   >1.0 : aggressive mempurge.
+  //   0 < threshold < 1.0: mempurge triggered only for very low useful payload
+  //   ratios.
+  // [experimental]
+  double experimental_mempurge_threshold = 0.0;
+
+  // existing_value - pointer to previous value (from both memtable and sst).
+  //                  nullptr if key doesn't exist
+  // existing_value_size - pointer to size of existing_value).
+  //                       nullptr if key doesn't exist
+  // delta_value - Delta value to be merged with the existing_value.
+  //               Stored in transaction logs.
+  // merged_value - Set when delta is applied on the previous value.
+  //
+  // Applicable only when inplace_update_support is true,
+  // this callback function is called at the time of updating the memtable
+  // as part of a Put operation, lets say Put(key, delta_value). It allows the
+  // 'delta_value' specified as part of the Put operation to be merged with
+  // an 'existing_value' of the key in the database.
+  //
+  // If the merged value is smaller in size that the 'existing_value',
+  // then this function can update the 'existing_value' buffer inplace and
+  // the corresponding 'existing_value'_size pointer, if it wishes to.
+  // The callback should return UpdateStatus::UPDATED_INPLACE.
+  // In this case. (In this case, the snapshot-semantics of the rocksdb
+  // Iterator is not atomic anymore).
+  //
+  // If the merged value is larger in size than the 'existing_value' or the
+  // application does not wish to modify the 'existing_value' buffer inplace,
+  // then the merged value should be returned via *merge_value. It is set by
+  // merging the 'existing_value' and the Put 'delta_value'. The callback should
+  // return UpdateStatus::UPDATED in this case. This merged value will be added
+  // to the memtable.
+  //
+  // If merging fails or the application does not wish to take any action,
+  // then the callback should return UpdateStatus::UPDATE_FAILED.
+  //
+  // Please remember that the original call from the application is Put(key,
+  // delta_value). So the transaction log (if enabled) will still contain (key,
+  // delta_value). The 'merged_value' is not stored in the transaction log.
+  // Hence the inplace_callback function should be consistent across db reopens.
+  //
+  // RocksDB callbacks are NOT exception-safe. A callback completing with an
+  // exception can lead to undefined behavior in RocksDB, including data loss,
+  // unreported corruption, deadlocks, and more.
+  //
+  // Default: nullptr
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value) = nullptr;
+
+  // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
+  // Bloom filter in memtable to optimize many queries that must go beyond
+  // the memtable. The size in bytes of the filter is
+  // write_buffer_size * memtable_prefix_bloom_size_ratio.
+  // * If prefix_extractor is set, the filter includes prefixes.
+  // * If memtable_whole_key_filtering, the filter includes whole keys.
+  // * If both, the filter includes both.
+  // * If neither, the feature is disabled.
+  //
+  // If this value is larger than 0.25, it is sanitized to 0.25.
+  //
+  // Default: 0 (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
+  double memtable_prefix_bloom_size_ratio = 0.0;
+
+  // Enable whole key bloom filter in memtable. Note this will only take effect
+  // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering
+  // can potentially reduce CPU usage for point-look-ups.
+  //
+  // Default: false (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
+  bool memtable_whole_key_filtering = false;
+
+  // Page size for huge page for the arena used by the memtable. If <=0, it
+  // won't allocate from huge page but from malloc.
+  // Users are responsible to reserve huge pages for it to be allocated. For
+  // example:
+  //      sysctl -w vm.nr_hugepages=20
+  // See linux doc Documentation/vm/hugetlbpage.txt
+  // If there isn't enough free huge page available, it will fall back to
+  // malloc.
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t memtable_huge_page_size = 0;
+
+  // If non-nullptr, memtable will use the specified function to extract
+  // prefixes for keys, and for each prefix maintain a hint of insert location
+  // to reduce CPU usage for inserting keys with the prefix. Keys out of
+  // domain of the prefix extractor will be insert without using hints.
+  //
+  // Currently only the default skiplist based memtable implements the feature.
+  // All other memtable implementation will ignore the option. It incurs ~250
+  // additional bytes of memory overhead to store a hint for each prefix.
+  // Also concurrent writes (when allow_concurrent_memtable_write is true) will
+  // ignore the option.
+  //
+  // The option is best suited for workloads where keys will likely to insert
+  // to a location close the last inserted key with the same prefix.
+  // One example could be inserting keys of the form (prefix + timestamp),
+  // and keys of the same prefix always comes in with time order. Another
+  // example would be updating the same key over and over again, in which case
+  // the prefix can be the key itself.
+  //
+  // Default: nullptr (disabled)
+  std::shared_ptr<const SliceTransform>
+      memtable_insert_with_hint_prefix_extractor = nullptr;
+
+  // Control locality of bloom filter probes to improve CPU cache hit rate.
+  // This option now only applies to plaintable prefix bloom. This
+  // optimization is turned off when set to 0, and positive number to turn
+  // it on.
+  // Default: 0
+  uint32_t bloom_locality = 0;
+
+  // size of one block in arena memory allocation.
+  // If <= 0, a proper value is automatically calculated (usually 1/8 of
+  // writer_buffer_size, rounded up to a multiple of 4KB, or 1MB which ever is
+  // smaller).
+  //
+  // There are two additional restriction of the specified size:
+  // (1) size should be in the range of [4096, 2 << 30] and
+  // (2) be the multiple of the CPU word (which helps with the memory
+  // alignment).
+  //
+  // We'll automatically check and adjust the size number to make sure it
+  // conforms to the restrictions.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t arena_block_size = 0;
+
+  // Different levels can have different compression policies. There
+  // are cases where most lower levels would like to use quick compression
+  // algorithms while the higher levels (which have more data) use
+  // compression algorithms that have better compression but could
+  // be slower. This array, if non-empty, should have an entry for
+  // each level of the database; these override the value specified in
+  // the previous field 'compression'.
+  //
+  // NOTICE if level_compaction_dynamic_level_bytes=true,
+  // compression_per_level[0] still determines L0, but other elements
+  // of the array are based on base level (the level L0 files are merged
+  // to), and may not match the level users see from info log for metadata.
+  // If L0 files are merged to level-n, then, for i>0, compression_per_level[i]
+  // determines compaction type for level n+i-1.
+  // For example, if we have three 5 levels, and we determine to merge L0
+  // data to L4 (which means L1..L3 will be empty), then the new files go to
+  // L4 uses compression type compression_per_level[1].
+  // If now L0 is merged to L2. Data goes to L2 will be compressed
+  // according to compression_per_level[1], L3 using compression_per_level[2]
+  // and L4 using compression_per_level[3]. Compaction for each level can
+  // change when data grows.
+  //
+  // NOTE: if the vector size is smaller than the level number, the undefined
+  // lower level uses the last option in the vector, for example, for 3 level
+  // LSM tree the following settings are the same:
+  // {kNoCompression, kSnappyCompression}
+  // {kNoCompression, kSnappyCompression, kSnappyCompression}
+  //
+  // Dynamically changeable through SetOptions() API
+  std::vector<CompressionType> compression_per_level;
+
+  // Number of levels for this database
+  int num_levels = 7;
+
+  // Soft limit on number of level-0 files. We start slowing down writes at this
+  // point. A value <0 means that no writing slow down will be triggered by
+  // number of files in level-0.
+  //
+  // Default: 20
+  //
+  // Dynamically changeable through SetOptions() API
+  int level0_slowdown_writes_trigger = 20;
+
+  // Maximum number of level-0 files.  We stop writes at this point.
+  //
+  // Default: 36
+  //
+  // Dynamically changeable through SetOptions() API
+  int level0_stop_writes_trigger = 36;
+
+  // Target file size for compaction.
+  // target_file_size_base is per-file size for level-1.
+  // Target file size for level L can be calculated by
+  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
+  // For example, if target_file_size_base is 2MB and
+  // target_file_size_multiplier is 10, then each file on level-1 will
+  // be 2MB, and each file on level 2 will be 20MB,
+  // and each file on level-3 will be 200MB.
+  //
+  // Default: 64MB.
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t target_file_size_base = 64 * 1048576;
+
+  // By default target_file_size_multiplier is 1, which means
+  // by default files in different levels will have similar size.
+  //
+  // Dynamically changeable through SetOptions() API
+  int target_file_size_multiplier = 1;
+
+  // If true, RocksDB will pick target size of each level dynamically.
+  // We will pick a base level b >= 1. L0 will be directly merged into level b,
+  // instead of always into level 1. Level 1 to b-1 need to be empty.
+  // We try to pick b and its target size so that
+  // 1. target size is in the range of
+  //   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
+  //    max_bytes_for_level_base]
+  // 2. target size of the last level (level num_levels-1) equals to extra size
+  //    of the level.
+  // At the same time max_bytes_for_level_multiplier and
+  // max_bytes_for_level_multiplier_additional are still satisfied.
+  // (When L0 is too large, we make some adjustment. See below.)
+  //
+  // With this option on, from an empty DB, we make last level the base level,
+  // which means merging L0 data into the last level, until it exceeds
+  // max_bytes_for_level_base. And then we make the second last level to be
+  // base level, to start to merge L0 data to second last level, with its
+  // target size to be 1/max_bytes_for_level_multiplier of the last level's
+  // extra size. After the data accumulates more so that we need to move the
+  // base level to the third last one, and so on.
+  //
+  // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
+  // and max_bytes_for_level_base=10MB.
+  // Target sizes of level 1 to 5 starts with:
+  // [- - - - 10MB]
+  // with base level is level. Target sizes of level 1 to 4 are not applicable
+  // because they will not be used.
+  // Until the size of Level 5 grows to more than 10MB, say 11MB, we make
+  // base target to level 4 and now the targets looks like:
+  // [- - - 1.1MB 11MB]
+  // While data are accumulated, size targets are tuned based on actual data
+  // of level 5. When level 5 has 50MB of data, the target is like:
+  // [- - - 5MB 50MB]
+  // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep
+  // level 4 to be the base level, its target size needs to be 10.1MB, which
+  // doesn't satisfy the target size range. So now we make level 3 the target
+  // size and the target sizes of the levels look like:
+  // [- - 1.01MB 10.1MB 101MB]
+  // In the same way, while level 5 further grows, all levels' targets grow,
+  // like
+  // [- - 5MB 50MB 500MB]
+  // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
+  // base level and make levels' target sizes like this:
+  // [- 1.001MB 10.01MB 100.1MB 1001MB]
+  // and go on...
+  //
+  // By doing it, we give max_bytes_for_level_multiplier a priority against
+  // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
+  // useful to limit worse case space amplification.
+  //
+  //
+  // If the compaction from L0 is lagged behind, a special mode will be turned
+  // on to prioritize write amplification against max_bytes_for_level_multiplier
+  // or max_bytes_for_level_base. The L0 compaction is lagged behind by looking
+  // at number of L0 files and total L0 size. If number of L0 files is at least
+  // the double of level0_file_num_compaction_trigger, or the total size is
+  // at least max_bytes_for_level_base, this mode is on. The target of L1 grows
+  // to the actual data size in L0, and then determine the target for each level
+  // so that each level will have the same level multiplier.
+  //
+  // For example, when L0 size is 100MB, the size of last level is 1600MB,
+  // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10.
+  // Since L0 size is larger than max_bytes_for_level_base, this is a L0
+  // compaction backlogged mode. So that the L1 size is determined to be 100MB.
+  // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will
+  // be needed. The level multiplier will be calculated to be 4 and the three
+  // levels' target to be [100MB, 400MB, 1600MB].
+  //
+  // In this mode, The number of levels will be no more than the normal mode,
+  // and the level multiplier will be lower. The write amplification will
+  // likely to be reduced.
+  //
+  //
+  // max_bytes_for_level_multiplier_additional is ignored with this flag on.
+  //
+  // Turning this feature on or off for an existing DB can cause unexpected
+  // LSM tree structure so it's not recommended.
+  //
+  // Default: false
+  bool level_compaction_dynamic_level_bytes = false;
+
+  // Allows RocksDB to generate files that are not exactly the target_file_size
+  // only for the non-bottommost files. Which can reduce the write-amplification
+  // from compaction. The file size could be from 0 to 2x target_file_size.
+  // Once enabled, non-bottommost compaction will try to cut the files align
+  // with the next level file boundaries (grandparent level).
+  //
+  // Default: true
+  bool level_compaction_dynamic_file_size = true;
+
+  // Default: 10.
+  //
+  // Dynamically changeable through SetOptions() API
+  double max_bytes_for_level_multiplier = 10;
+
+  // Different max-size multipliers for different levels.
+  // These are multiplied by max_bytes_for_level_multiplier to arrive
+  // at the max-size of each level.
+  //
+  // Default: 1
+  //
+  // Dynamically changeable through SetOptions() API
+  std::vector<int> max_bytes_for_level_multiplier_additional =
+      std::vector<int>(num_levels, 1);
+
+  // We try to limit number of bytes in one compaction to be lower than this
+  // threshold. But it's not guaranteed.
+  // Value 0 will be sanitized.
+  //
+  // Default: target_file_size_base * 25
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t max_compaction_bytes = 0;
+
+  // When setting up compaction input files, we ignore the
+  // `max_compaction_bytes` limit when pulling in input files that are entirely
+  // within output key range.
+  //
+  // Default: true
+  //
+  // Dynamically changeable through SetOptions() API
+  // We could remove this knob and always ignore the limit once it is proven
+  // safe.
+  bool ignore_max_compaction_bytes_for_input = true;
+
+  // All writes will be slowed down to at least delayed_write_rate if estimated
+  // bytes needed to be compaction exceed this threshold.
+  //
+  // Default: 64GB
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;
+
+  // All writes are stopped if estimated bytes needed to be compaction exceed
+  // this threshold.
+  //
+  // Default: 256GB
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;
+
+  // The compaction style. Default: kCompactionStyleLevel
+  CompactionStyle compaction_style = kCompactionStyleLevel;
+
+  // If level compaction_style = kCompactionStyleLevel, for each level,
+  // which files are prioritized to be picked to compact.
+  // Default: kMinOverlappingRatio
+  CompactionPri compaction_pri = kMinOverlappingRatio;
+
+  // The options needed to support Universal Style compactions
+  //
+  // Dynamically changeable through SetOptions() API
+  // Dynamic change example:
+  // SetOptions("compaction_options_universal", "{size_ratio=2;}")
+  CompactionOptionsUniversal compaction_options_universal;
+
+  // The options for FIFO compaction style
+  //
+  // Dynamically changeable through SetOptions() API
+  // Dynamic change example:
+  // SetOptions("compaction_options_fifo", "{max_table_files_size=100;}")
+  CompactionOptionsFIFO compaction_options_fifo;
+
+  // An iteration->Next() sequentially skips over keys with the same
+  // user-key unless this option is set. This number specifies the number
+  // of keys (with the same userkey) that will be sequentially
+  // skipped before a reseek is issued.
+  //
+  // Default: 8
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t max_sequential_skip_in_iterations = 8;
+
+  // This is a factory that provides MemTableRep objects.
+  // Default: a factory that provides a skip-list-based implementation of
+  // MemTableRep.
+  std::shared_ptr<MemTableRepFactory> memtable_factory =
+      std::shared_ptr<SkipListFactory>(new SkipListFactory);
+
+  // Block-based table related options are moved to BlockBasedTableOptions.
+  // Related options that were originally here but now moved include:
+  //   no_block_cache
+  //   block_cache
+  //   block_cache_compressed
+  //   block_size
+  //   block_size_deviation
+  //   block_restart_interval
+  //   filter_policy
+  //   whole_key_filtering
+  // If you'd like to customize some of these options, you will need to
+  // use NewBlockBasedTableFactory() to construct a new table factory.
+
+  // This option allows user to collect their own interested statistics of
+  // the tables.
+  // Default: empty vector -- no user-defined statistics collection will be
+  // performed.
+  using TablePropertiesCollectorFactories =
+      std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>;
+  TablePropertiesCollectorFactories table_properties_collector_factories;
+
+  // Maximum number of successive merge operations on a key in the memtable.
+  //
+  // When a merge operation is added to the memtable and the maximum number of
+  // successive merges is reached, the value of the key will be calculated and
+  // inserted into the memtable instead of the merge operation. This will
+  // ensure that there are never more than max_successive_merges merge
+  // operations in the memtable.
+  //
+  // Default: 0 (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t max_successive_merges = 0;
+
+  // This flag specifies that the implementation should optimize the filters
+  // mainly for cases where keys are found rather than also optimize for keys
+  // missed. This would be used in cases where the application knows that
+  // there are very few misses or the performance in the case of misses is not
+  // important.
+  //
+  // For now, this flag allows us to not store filters for the last level i.e
+  // the largest level which contains data of the LSM store. For keys which
+  // are hits, the filters in this level are not useful because we will search
+  // for the data anyway. NOTE: the filters in other levels are still useful
+  // even for key hit because they tell us whether to look in that level or go
+  // to the higher level.
+  //
+  // Default: false
+  bool optimize_filters_for_hits = false;
+
+  // During flush or compaction, check whether keys inserted to output files
+  // are in order.
+  //
+  // Default: true
+  //
+  // Dynamically changeable through SetOptions() API
+  bool check_flush_compaction_key_order = true;
+
+  // After writing every SST file, reopen it and read all the keys.
+  // Checks the hash of all of the keys and values written versus the
+  // keys in the file and signals a corruption if they do not match
+  //
+  // Default: false
+  //
+  // Dynamically changeable through SetOptions() API
+  bool paranoid_file_checks = false;
+
+  // In debug mode, RocksDB runs consistency checks on the LSM every time the
+  // LSM changes (Flush, Compaction, AddFile). When this option is true, these
+  // checks are also enabled in release mode. These checks were historically
+  // disabled in release mode, but are now enabled by default for proactive
+  // corruption detection. The CPU overhead is negligible for normal mixed
+  // operations but can slow down saturated writing. See
+  // Options::DisableExtraChecks().
+  // Default: true
+  bool force_consistency_checks = true;
+
+  // Measure IO stats in compactions and flushes, if true.
+  //
+  // Default: false
+  //
+  // Dynamically changeable through SetOptions() API
+  bool report_bg_io_stats = false;
+
+  // Files containing updates older than TTL will go through the compaction
+  // process. This usually happens in a cascading way so that those entries
+  // will be compacted to bottommost level/file.
+  // The feature is used to remove stale entries that have been deleted or
+  // updated from the file system.
+  // Pre-req: This needs max_open_files to be set to -1.
+  // In Level: Non-bottom-level files older than TTL will go through the
+  //           compaction process.
+  // In FIFO: Files older than TTL will be deleted.
+  // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60
+  // In FIFO, this option will have the same meaning as
+  // periodic_compaction_seconds. Whichever stricter will be used.
+  // 0 means disabling.
+  // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to
+  // pick default.
+  //
+  // Default: 30 days for leveled compaction + block based table. disable
+  //          otherwise.
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t ttl = 0xfffffffffffffffe;
+
+  // Files older than this value will be picked up for compaction, and
+  // re-written to the same level as they were before.
+  // One main use of the feature is to make sure a file goes through compaction
+  // filters periodically. Users can also use the feature to clear up SST
+  // files using old format.
+  //
+  // A file's age is computed by looking at file_creation_time or creation_time
+  // table properties in order, if they have valid non-zero values; if not, the
+  // age is based on the file's last modified time (given by the underlying
+  // Env).
+  //
+  // Supported in Level and FIFO compaction.
+  // In FIFO compaction, this option has the same meaning as TTL and whichever
+  // stricter will be used.
+  // Pre-req: max_open_file == -1.
+  // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60
+  //
+  // Values:
+  // 0: Turn off Periodic compactions.
+  // UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature
+  //     as needed. For now, RocksDB will change this value to 30 days
+  //     (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction
+  //     process at least once every 30 days if not compacted sooner.
+  //     In FIFO compaction, since the option has the same meaning as ttl,
+  //     when this value is left default, and ttl is left to 0, 30 days will be
+  //     used. Otherwise, min(ttl, periodic_compaction_seconds) will be used.
+  //
+  // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune)
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t periodic_compaction_seconds = 0xfffffffffffffffe;
+
+  // If this option is set then 1 in N blocks are compressed
+  // using a fast (lz4) and slow (zstd) compression algorithm.
+  // The compressibility is reported as stats and the stored
+  // data is left uncompressed (unless compression is also requested).
+  uint64_t sample_for_compression = 0;
+
+  // EXPERIMENTAL
+  // The feature is still in development and is incomplete.
+  // If this option is set, when creating the last level files, pass this
+  // temperature to FileSystem used. Should be no-op for default FileSystem
+  // and users need to plug in their own FileSystem to take advantage of it.
+  //
+  // Note: the feature is changed from `bottommost_temperature` to
+  //  `last_level_temperature` which now only apply for the last level files.
+  //  The option name `bottommost_temperature` is kept only for migration, the
+  //  behavior is the same as `last_level_temperature`. Please stop using
+  //  `bottommost_temperature` and will be removed in next release.
+  //
+  // Dynamically changeable through the SetOptions() API
+  Temperature bottommost_temperature = Temperature::kUnknown;
+  Temperature last_level_temperature = Temperature::kUnknown;
+
+  // EXPERIMENTAL
+  // The feature is still in development and is incomplete.
+  // If this option is set, when data insert time is within this time range, it
+  // will be precluded from the last level.
+  // 0 means no key will be precluded from the last level.
+  //
+  // Note: when enabled, universal size amplification (controlled by option
+  //  `compaction_options_universal.max_size_amplification_percent`) calculation
+  //  will exclude the last level. As the feature is designed for tiered storage
+  //  and a typical setting is the last level is cold tier which is likely not
+  //  size constrained, the size amp is going to be only for non-last levels.
+  //
+  // Default: 0 (disable the feature)
+  //
+  // Not dynamically changeable, change it requires db restart.
+  uint64_t preclude_last_level_data_seconds = 0;
+
+  // EXPERIMENTAL
+  // If this option is set, it will preserve the internal time information about
+  // the data until it's older than the specified time here.
+  // Internally the time information is a map between sequence number and time,
+  // which is the same as `preclude_last_level_data_seconds`. But it won't
+  // preclude the data from the last level and the data in the last level won't
+  // have the sequence number zeroed out.
+  // Internally, rocksdb would sample the sequence number to time pair and store
+  // that in SST property "rocksdb.seqno.time.map". The information is currently
+  // only used for tiered storage compaction (option
+  // `preclude_last_level_data_seconds`).
+  //
+  // Note: if both `preclude_last_level_data_seconds` and this option is set, it
+  //  will preserve the max time of the 2 options and compaction still preclude
+  //  the data based on `preclude_last_level_data_seconds`.
+  //  The higher the preserve_time is, the less the sampling frequency will be (
+  //  which means less accuracy of the time estimation).
+  //
+  // Default: 0 (disable the feature)
+  //
+  // Not dynamically changeable, change it requires db restart.
+  uint64_t preserve_internal_time_seconds = 0;
+
+  // When set, large values (blobs) are written to separate blob files, and
+  // only pointers to them are stored in SST files. This can reduce write
+  // amplification for large-value use cases at the cost of introducing a level
+  // of indirection for reads. See also the options min_blob_size,
+  // blob_file_size, blob_compression_type, enable_blob_garbage_collection,
+  // blob_garbage_collection_age_cutoff,
+  // blob_garbage_collection_force_threshold, and blob_compaction_readahead_size
+  // below.
+  //
+  // Default: false
+  //
+  // Dynamically changeable through the SetOptions() API
+  bool enable_blob_files = false;
+
+  // The size of the smallest value to be stored separately in a blob file.
+  // Values which have an uncompressed size smaller than this threshold are
+  // stored alongside the keys in SST files in the usual fashion. A value of
+  // zero for this option means that all values are stored in blob files. Note
+  // that enable_blob_files has to be set in order for this option to have any
+  // effect.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through the SetOptions() API
+  uint64_t min_blob_size = 0;
+
+  // The size limit for blob files. When writing blob files, a new file is
+  // opened once this limit is reached. Note that enable_blob_files has to be
+  // set in order for this option to have any effect.
+  //
+  // Default: 256 MB
+  //
+  // Dynamically changeable through the SetOptions() API
+  uint64_t blob_file_size = 1ULL << 28;
+
+  // The compression algorithm to use for large values stored in blob files.
+  // Note that enable_blob_files has to be set in order for this option to have
+  // any effect.
+  //
+  // Default: no compression
+  //
+  // Dynamically changeable through the SetOptions() API
+  CompressionType blob_compression_type = kNoCompression;
+
+  // Enables garbage collection of blobs. Blob GC is performed as part of
+  // compaction. Valid blobs residing in blob files older than a cutoff get
+  // relocated to new files as they are encountered during compaction, which
+  // makes it possible to clean up blob files once they contain nothing but
+  // obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff and
+  // blob_garbage_collection_force_threshold below.
+  //
+  // Default: false
+  //
+  // Dynamically changeable through the SetOptions() API
+  bool enable_blob_garbage_collection = false;
+
+  // The cutoff in terms of blob file age for garbage collection. Blobs in
+  // the oldest N blob files will be relocated when encountered during
+  // compaction, where N = garbage_collection_cutoff * number_of_blob_files.
+  // Note that enable_blob_garbage_collection has to be set in order for this
+  // option to have any effect.
+  //
+  // Default: 0.25
+  //
+  // Dynamically changeable through the SetOptions() API
+  double blob_garbage_collection_age_cutoff = 0.25;
+
+  // If the ratio of garbage in the oldest blob files exceeds this threshold,
+  // targeted compactions are scheduled in order to force garbage collecting
+  // the blob files in question, assuming they are all eligible based on the
+  // value of blob_garbage_collection_age_cutoff above. This option is
+  // currently only supported with leveled compactions.
+  // Note that enable_blob_garbage_collection has to be set in order for this
+  // option to have any effect.
+  //
+  // Default: 1.0
+  //
+  // Dynamically changeable through the SetOptions() API
+  double blob_garbage_collection_force_threshold = 1.0;
+
+  // Compaction readahead for blob files.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through the SetOptions() API
+  uint64_t blob_compaction_readahead_size = 0;
+
+  // Enable blob files starting from a certain LSM tree level.
+  //
+  // For certain use cases that have a mix of short-lived and long-lived values,
+  // it might make sense to support extracting large values only during
+  // compactions whose output level is greater than or equal to a specified LSM
+  // tree level (e.g. compactions into L1/L2/... or above). This could reduce
+  // the space amplification caused by large values that are turned into garbage
+  // shortly after being written at the price of some write amplification
+  // incurred by long-lived values whose extraction to blob files is delayed.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through the SetOptions() API
+  int blob_file_starting_level = 0;
+
+  // The Cache object to use for blobs. Using a dedicated object for blobs and
+  // using the same object for the block and blob caches are both supported. In
+  // the latter case, note that blobs are less valuable from a caching
+  // perspective than SST blocks, and some cache implementations have
+  // configuration options that can be used to prioritize items accordingly (see
+  // Cache::Priority and LRUCacheOptions::{high,low}_pri_pool_ratio).
+  //
+  // Default: nullptr (disabled)
+  std::shared_ptr<Cache> blob_cache = nullptr;
+
+  // Enable/disable prepopulating the blob cache. When set to kFlushOnly, BlobDB
+  // will insert newly written blobs into the blob cache during flush. This can
+  // improve performance when reading back these blobs would otherwise be
+  // expensive (e.g. when using direct I/O or remote storage), or when the
+  // workload has a high temporal locality.
+  //
+  // Default: disabled
+  //
+  // Dynamically changeable through the SetOptions() API
+  PrepopulateBlobCache prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
+
+  // Enable memtable per key-value checksum protection.
+  //
+  // Each entry in memtable will be suffixed by a per key-value checksum.
+  // This options determines the size of such checksums.
+  //
+  // It is suggested to turn on write batch per key-value
+  // checksum protection together with this option, so that the checksum
+  // computation is done outside of writer threads (memtable kv checksum can be
+  // computed from write batch checksum) See
+  // WriteOptions::protection_bytes_per_key for more detail.
+  //
+  // Default: 0 (no protection)
+  // Supported values: 0, 1, 2, 4, 8.
+  uint32_t memtable_protection_bytes_per_key = 0;
+
+  // Create ColumnFamilyOptions with default values for all fields
+  AdvancedColumnFamilyOptions();
+  // Create ColumnFamilyOptions from Options
+  explicit AdvancedColumnFamilyOptions(const Options& options);
+
+  // ---------------- OPTIONS NOT SUPPORTED ANYMORE ----------------
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/block_cache_trace_writer.h b/src/rocksdb/include/rocksdb/block_cache_trace_writer.h
new file mode 100644
index 000000000..18d28685b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/block_cache_trace_writer.h
@@ -0,0 +1,149 @@
+// Copyright (c) 2022, Meta Platforms, Inc. and affiliates.  All rights
+// reserved. This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table_reader_caller.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A record for block cache lookups/inserts. This is passed by the table
+// reader to the BlockCacheTraceWriter for every block cache op.
+struct BlockCacheTraceRecord {
+  // Required fields for all accesses.
+  uint64_t access_timestamp = 0;
+
+  // Info related to the block being looked up or inserted
+  //
+  // 1. The cache key for the block
+  std::string block_key;
+
+  // 2. The type of block
+  TraceType block_type = TraceType::kTraceMax;
+
+  // 3. Size of the block
+  uint64_t block_size = 0;
+
+  // Info about the SST file the block is in
+  //
+  // 1. Column family ID
+  uint64_t cf_id = 0;
+
+  // 2. Column family name
+  std::string cf_name;
+
+  // 3. LSM level of the file
+  uint32_t level = 0;
+
+  // 4. SST file number
+  uint64_t sst_fd_number = 0;
+
+  // Info about the calling context
+  //
+  // 1. The higher level request triggering the block cache request
+  TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller;
+
+  // 2. Cache lookup hit/miss. Not relevant for inserts
+  bool is_cache_hit = false;
+
+  // 3. Whether this request is a lookup
+  bool no_insert = false;
+
+  // Get/MultiGet specific info
+  //
+  // 1. A unique ID for Get/MultiGet
+  uint64_t get_id = kReservedGetId;
+
+  // 2. Whether the Get/MultiGet is from a user-specified snapshot
+  bool get_from_user_specified_snapshot = false;
+
+  // 3. The target user key in the block
+  std::string referenced_key;
+
+  // Required fields for data block and user Get/Multi-Get only.
+  //
+  // 1. Size of te useful data in the block
+  uint64_t referenced_data_size = 0;
+
+  // 2. Only for MultiGet, number of keys from the batch found in the block
+  uint64_t num_keys_in_block = 0;
+
+  // 3. Whether the key was found in the block or not (false positive)
+  bool referenced_key_exist_in_block = false;
+
+  static const uint64_t kReservedGetId;
+
+  BlockCacheTraceRecord() {}
+
+  BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key,
+                        TraceType _block_type, uint64_t _block_size,
+                        uint64_t _cf_id, std::string _cf_name, uint32_t _level,
+                        uint64_t _sst_fd_number, TableReaderCaller _caller,
+                        bool _is_cache_hit, bool _no_insert, uint64_t _get_id,
+                        bool _get_from_user_specified_snapshot = false,
+                        std::string _referenced_key = "",
+                        uint64_t _referenced_data_size = 0,
+                        uint64_t _num_keys_in_block = 0,
+                        bool _referenced_key_exist_in_block = false)
+      : access_timestamp(_access_timestamp),
+        block_key(_block_key),
+        block_type(_block_type),
+        block_size(_block_size),
+        cf_id(_cf_id),
+        cf_name(_cf_name),
+        level(_level),
+        sst_fd_number(_sst_fd_number),
+        caller(_caller),
+        is_cache_hit(_is_cache_hit),
+        no_insert(_no_insert),
+        get_id(_get_id),
+        get_from_user_specified_snapshot(_get_from_user_specified_snapshot),
+        referenced_key(_referenced_key),
+        referenced_data_size(_referenced_data_size),
+        num_keys_in_block(_num_keys_in_block),
+        referenced_key_exist_in_block(_referenced_key_exist_in_block) {}
+};
+
+// Options for tracing block cache accesses
+struct BlockCacheTraceOptions {
+  // Specify trace sampling option, i.e. capture one per how many requests.
+  // Default to 1 (capture every request).
+  uint64_t sampling_frequency = 1;
+};
+
+// Options for the built-in implementation of BlockCacheTraceWriter
+struct BlockCacheTraceWriterOptions {
+  uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+};
+
+// BlockCacheTraceWriter is an abstract class that captures all RocksDB block
+// cache accesses. Every RocksDB operation is passed to WriteBlockAccess()
+// with a BlockCacheTraceRecord.
+class BlockCacheTraceWriter {
+ public:
+  virtual ~BlockCacheTraceWriter() {}
+
+  // Pass Slice references to avoid copy.
+  virtual Status WriteBlockAccess(const BlockCacheTraceRecord& record,
+                                  const Slice& block_key, const Slice& cf_name,
+                                  const Slice& referenced_key) = 0;
+
+  // Write a trace header at the beginning, typically on initiating a trace,
+  // with some metadata like a magic number and RocksDB version.
+  virtual Status WriteHeader() = 0;
+};
+
+// Allocate an instance of the built-in BlockCacheTraceWriter implementation,
+// that traces all block cache accesses to a user-provided TraceWriter. Each
+// access is traced to a file with a timestamp and type, followed by the
+// payload.
+std::unique_ptr<BlockCacheTraceWriter> NewBlockCacheTraceWriter(
+    SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options,
+    std::unique_ptr<TraceWriter>&& trace_writer);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/c.h b/src/rocksdb/include/rocksdb/c.h
new file mode 100644
index 000000000..1639f3cd3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/c.h
@@ -0,0 +1,2793 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+  Use of this source code is governed by a BSD-style license that can be
+  found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+  C bindings for rocksdb.  May be useful as a stable ABI that can be
+  used by programs that keep rocksdb in a shared library, or for
+  a JNI api.
+
+  Does not support:
+  . getters for the option types
+  . custom comparators that implement key shortening
+  . capturing post-write-snapshot
+  . custom iter, db, env, cache implementations using just the C bindings
+
+  Some conventions:
+
+  (1) We expose just opaque struct pointers and functions to clients.
+  This allows us to change internal representations without having to
+  recompile clients.
+
+  (2) For simplicity, there is no equivalent to the Slice type.  Instead,
+  the caller has to pass the pointer and length as separate
+  arguments.
+
+  (3) Errors are represented by a null-terminated c string.  NULL
+  means no error.  All operations that can raise an error are passed
+  a "char** errptr" as the last argument.  One of the following must
+  be true on entry:
+     *errptr == NULL
+     *errptr points to a malloc()ed null-terminated error message
+  On success, a leveldb routine leaves *errptr unchanged.
+  On failure, leveldb frees the old value of *errptr and
+  set *errptr to a malloc()ed error message.
+
+  (4) Bools have the type unsigned char (0 == false; rest == true)
+
+  (5) All of the pointer arguments must be non-NULL.
+*/
+
+#pragma once
+
+#ifdef _WIN32
+#ifdef ROCKSDB_DLL
+#ifdef ROCKSDB_LIBRARY_EXPORTS
+#define ROCKSDB_LIBRARY_API __declspec(dllexport)
+#else
+#define ROCKSDB_LIBRARY_API __declspec(dllimport)
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t;
+typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t;
+typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t;
+typedef struct rocksdb_restore_options_t rocksdb_restore_options_t;
+typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t;
+typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t;
+typedef struct rocksdb_cache_t rocksdb_cache_t;
+typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
+typedef struct rocksdb_compactionfiltercontext_t
+    rocksdb_compactionfiltercontext_t;
+typedef struct rocksdb_compactionfilterfactory_t
+    rocksdb_compactionfilterfactory_t;
+typedef struct rocksdb_comparator_t rocksdb_comparator_t;
+typedef struct rocksdb_dbpath_t rocksdb_dbpath_t;
+typedef struct rocksdb_env_t rocksdb_env_t;
+typedef struct rocksdb_fifo_compaction_options_t
+    rocksdb_fifo_compaction_options_t;
+typedef struct rocksdb_filelock_t rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t rocksdb_iterator_t;
+typedef struct rocksdb_logger_t rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
+typedef struct rocksdb_options_t rocksdb_options_t;
+typedef struct rocksdb_compactoptions_t rocksdb_compactoptions_t;
+typedef struct rocksdb_block_based_table_options_t
+    rocksdb_block_based_table_options_t;
+typedef struct rocksdb_cuckoo_table_options_t rocksdb_cuckoo_table_options_t;
+typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
+typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t;
+typedef struct rocksdb_snapshot_t rocksdb_snapshot_t;
+typedef struct rocksdb_writablefile_t rocksdb_writablefile_t;
+typedef struct rocksdb_writebatch_t rocksdb_writebatch_t;
+typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t;
+typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
+typedef struct rocksdb_universal_compaction_options_t
+    rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
+typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
+typedef struct rocksdb_column_family_metadata_t
+    rocksdb_column_family_metadata_t;
+typedef struct rocksdb_level_metadata_t rocksdb_level_metadata_t;
+typedef struct rocksdb_sst_file_metadata_t rocksdb_sst_file_metadata_t;
+typedef struct rocksdb_envoptions_t rocksdb_envoptions_t;
+typedef struct rocksdb_ingestexternalfileoptions_t
+    rocksdb_ingestexternalfileoptions_t;
+typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t;
+typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t;
+typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t;
+typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t;
+typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t;
+typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t;
+typedef struct rocksdb_transaction_options_t rocksdb_transaction_options_t;
+typedef struct rocksdb_optimistictransactiondb_t
+    rocksdb_optimistictransactiondb_t;
+typedef struct rocksdb_optimistictransaction_options_t
+    rocksdb_optimistictransaction_options_t;
+typedef struct rocksdb_transaction_t rocksdb_transaction_t;
+typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t;
+typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t;
+typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t;
+typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t;
+typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t;
+
+/* DB operations */
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options, const char* name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_with_ttl(
+    const rocksdb_options_t* options, const char* name, int ttl, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only(
+    const rocksdb_options_t* options, const char* name,
+    unsigned char error_if_wal_file_exists, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary(
+    const rocksdb_options_t* options, const char* name,
+    const char* secondary_path, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+    const rocksdb_options_t* options, const char* path, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t*
+rocksdb_backup_engine_open_opts(const rocksdb_backup_engine_options_t* options,
+                                rocksdb_env_t* env, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup(
+    rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup_flush(
+    rocksdb_backup_engine_t* be, rocksdb_t* db,
+    unsigned char flush_before_backup, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups(
+    rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t*
+rocksdb_restore_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy(
+    rocksdb_restore_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files(
+    rocksdb_restore_options_t* opt, int v);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_verify_backup(
+    rocksdb_backup_engine_t* be, uint32_t backup_id, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_restore_db_from_latest_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_restore_db_from_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, const uint32_t backup_id,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t*
+rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_backup_engine_info_count(
+    const rocksdb_backup_engine_info_t* info);
+
+extern ROCKSDB_LIBRARY_API int64_t rocksdb_backup_engine_info_timestamp(
+    const rocksdb_backup_engine_info_t* info, int index);
+
+extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_backup_id(
+    const rocksdb_backup_engine_info_t* info, int index);
+
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_backup_engine_info_size(
+    const rocksdb_backup_engine_info_t* info, int index);
+
+extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_number_files(
+    const rocksdb_backup_engine_info_t* info, int index);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy(
+    const rocksdb_backup_engine_info_t* info);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close(
+    rocksdb_backup_engine_t* be);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put_with_ts(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put_cf_with_ts(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* ts, size_t tslen, const char* val, size_t vallen,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_with_ts(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* ts, size_t tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf_with_ts(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* ts, size_t tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_singledelete(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_with_ts(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* ts, size_t tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_singledelete_cf_with_ts(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* ts, size_t tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_increase_full_history_ts_low(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* ts_low, size_t ts_lowlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_full_history_ts_low(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    size_t* ts_lowlen, char** errptr);
+
+/* BackupEngineOptions */
+
+extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_options_t*
+rocksdb_backup_engine_options_create(const char* backup_dir);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_backup_dir(
+    rocksdb_backup_engine_options_t* options, const char* backup_dir);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_env(
+    rocksdb_backup_engine_options_t* options, rocksdb_env_t* env);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_share_table_files(
+    rocksdb_backup_engine_options_t* options, unsigned char val);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_backup_engine_options_get_share_table_files(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_set_sync(
+    rocksdb_backup_engine_options_t* options, unsigned char val);
+
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_backup_engine_options_get_sync(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_destroy_old_data(
+    rocksdb_backup_engine_options_t* options, unsigned char val);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_backup_engine_options_get_destroy_old_data(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_backup_log_files(
+    rocksdb_backup_engine_options_t* options, unsigned char val);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_backup_engine_options_get_backup_log_files(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_backup_rate_limit(
+    rocksdb_backup_engine_options_t* options, uint64_t limit);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_backup_engine_options_get_backup_rate_limit(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_restore_rate_limit(
+    rocksdb_backup_engine_options_t* options, uint64_t limit);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_backup_engine_options_get_restore_rate_limit(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_max_background_operations(
+    rocksdb_backup_engine_options_t* options, int val);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_backup_engine_options_get_max_background_operations(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_callback_trigger_interval_size(
+    rocksdb_backup_engine_options_t* options, uint64_t size);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_backup_engine_options_get_callback_trigger_interval_size(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_max_valid_backups_to_open(
+    rocksdb_backup_engine_options_t* options, int val);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_backup_engine_options_get_max_valid_backups_to_open(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_options_set_share_files_with_checksum_naming(
+    rocksdb_backup_engine_options_t* options, int val);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+    rocksdb_backup_engine_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_options_destroy(
+    rocksdb_backup_engine_options_t*);
+
+/* Checkpoint */
+
+extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t*
+rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create(
+    rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir,
+    uint64_t log_size_for_flush, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy(
+    rocksdb_checkpoint_t* checkpoint);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_and_trim_history(
+    const rocksdb_options_t* options, const char* name, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char* trim_ts,
+    size_t trim_tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families(
+    const rocksdb_options_t* options, const char* name, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families_with_ttl(
+    const rocksdb_options_t* options, const char* name, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, const int* ttls,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t*
+rocksdb_open_for_read_only_column_families(
+    const rocksdb_options_t* options, const char* name, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles,
+    unsigned char error_if_wal_file_exists, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families(
+    const rocksdb_options_t* options, const char* name,
+    const char* secondary_path, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families(
+    const rocksdb_options_t* options, const char* name, size_t* lencf,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_list_column_families_destroy(
+    char** list, size_t len);
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_create_column_family(rocksdb_t* db,
+                             const rocksdb_options_t* column_family_options,
+                             const char* column_family_name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_create_column_family_with_ttl(
+    rocksdb_t* db, const rocksdb_options_t* column_family_options,
+    const char* column_family_name, int ttl, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_column_family_handle_destroy(
+    rocksdb_column_family_handle_t*);
+
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_column_family_handle_get_id(rocksdb_column_family_handle_t* handle);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_handle_get_name(
+    rocksdb_column_family_handle_t* handle, size_t* name_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_close(rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_put_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_range_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* start_key,
+    size_t start_key_len, const char* end_key, size_t end_key_len,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_merge_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_write(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch, char** errptr);
+
+/* Returns NULL if not found.  A malloc()ed array otherwise.
+   Stores the length of the array in *vallen. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_get(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** ts, size_t* tslen, char** errptr);
+
+// if values_list[i] == NULL and errs[i] == NULL,
+// then we got status.IsNotFound(), which we will not return.
+// all errors except status status.ok() and status.IsNotFound() are returned.
+//
+// errs, values_list and values_list_sizes must be num_keys in length,
+// allocated by the caller.
+// errs is a list of strings as opposed to the conventional one error,
+// where errs[i] is the status for retrieval of keys_list[i].
+// each non-NULL errs entry is a malloc()ed, null terminated string.
+// each non-NULL values_list entry is a malloc()ed array, with
+// the length for each stored in values_list_sizes[i].
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes, char** timestamp_list,
+    size_t* timestamp_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf_with_ts(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** timestamps_list,
+    size_t* timestamps_list_sizes, char** errs);
+
+// The MultiGet API that improves performance by batching operations
+// in the read path for greater efficiency. Currently, only the block based
+// table format with full filters are supported. Other table formats such
+// as plain table, block based table with block based filters and
+// partitioned indexes will still work, but will not get any performance
+// benefits.
+//
+// Note that all the keys passed to this API are restricted to a single
+// column family.
+//
+// Parameters -
+// db - the RocksDB instance.
+// options - ReadOptions
+// column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+//                 passed to the API are restricted to a single column family
+// num_keys - Number of keys to lookup
+// keys_list - Pointer to C style array of keys with num_keys elements
+// keys_list_sizes - Pointer to C style array of the size of corresponding key
+//   in key_list with num_keys elements.
+// values - Pointer to C style array of PinnableSlices with num_keys elements
+// statuses - Pointer to C style array of Status with num_keys elements
+// sorted_input - If true, it means the input keys are already sorted by key
+//                order, so the MultiGet() API doesn't have to sort them
+//                again. If false, the keys will be copied and sorted
+//                internally by the API - the input array will not be
+//                modified
+extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    rocksdb_pinnableslice_t** values, char** errs, const bool sorted_input);
+
+// The value is only allocated (using malloc) and returned if it is found and
+// value_found isn't NULL. In that case the user is responsible for freeing it.
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t key_len, char** value, size_t* val_len, const char* timestamp,
+    size_t timestamp_len, unsigned char* value_found);
+
+// The value is only allocated (using malloc) and returned if it is found and
+// value_found isn't NULL. In that case the user is responsible for freeing it.
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t key_len, char** value, size_t* val_len, const char* timestamp,
+    size_t timestamp_len, unsigned char* value_found);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db, const rocksdb_readoptions_t* options);
+
+extern ROCKSDB_LIBRARY_API rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+    rocksdb_t* db, uint64_t seq_number,
+    const rocksdb_wal_readoptions_t* options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_create_iterators(
+    rocksdb_t* db, rocksdb_readoptions_t* opts,
+    rocksdb_column_family_handle_t** column_families,
+    rocksdb_iterator_t** iterators, size_t size, char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot(
+    rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot(
+    rocksdb_t* db, const rocksdb_snapshot_t* snapshot);
+
+/* Returns NULL if property name is unknown.
+   Else returns a pointer to a malloc()-ed null-terminated value. */
+extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db,
+                                                        const char* propname);
+/* returns 0 on success, -1 otherwise */
+int rocksdb_property_int(rocksdb_t* db, const char* propname,
+                         uint64_t* out_val);
+
+/* returns 0 on success, -1 otherwise */
+int rocksdb_property_int_cf(rocksdb_t* db,
+                            rocksdb_column_family_handle_t* column_family,
+                            const char* propname, uint64_t* out_val);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* propname);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes(
+    rocksdb_t* db, int num_ranges, const char* const* range_start_key,
+    const size_t* range_start_key_len, const char* const* range_limit_key,
+    const size_t* range_limit_key_len, uint64_t* sizes, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    int num_ranges, const char* const* range_start_key,
+    const size_t* range_start_key_len, const char* const* range_limit_key,
+    const size_t* range_limit_key_len, uint64_t* sizes, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db,
+                                                      const char* start_key,
+                                                      size_t start_key_len,
+                                                      const char* limit_key,
+                                                      size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* limit_key,
+    size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range(
+    rocksdb_t* db, const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_suggest_compact_range_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* limit_key,
+    size_t limit_key_len, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_opt(
+    rocksdb_t* db, rocksdb_compactoptions_t* opt, const char* start_key,
+    size_t start_key_len, const char* limit_key, size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf_opt(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    rocksdb_compactoptions_t* opt, const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file(rocksdb_t* db,
+                                                    const char* name);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_livefiles_t* rocksdb_livefiles(
+    rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_flush(
+    rocksdb_t* db, const rocksdb_flushoptions_t* options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf(
+    rocksdb_t* db, const rocksdb_flushoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db,
+                                                  unsigned char sync,
+                                                  char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db,
+                                                               char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions(
+    rocksdb_t* db, unsigned char force, char** errptr);
+
+/* Management operations */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_destroy_db(
+    const rocksdb_options_t* options, const char* name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_repair_db(
+    const rocksdb_options_t* options, const char* name, char** errptr);
+
+/* Iterator */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_destroy(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_iter_valid(
+    const rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek(rocksdb_iterator_t*,
+                                                  const char* k, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_for_prev(rocksdb_iterator_t*,
+                                                           const char* k,
+                                                           size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_next(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_prev(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_key(
+    const rocksdb_iterator_t*, size_t* klen);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_value(
+    const rocksdb_iterator_t*, size_t* vlen);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_timestamp(
+    const rocksdb_iterator_t*, size_t* tslen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error(
+    const rocksdb_iterator_t*, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next(
+    rocksdb_wal_iterator_t* iter);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_wal_iter_valid(
+    const rocksdb_wal_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status(
+    const rocksdb_wal_iterator_t* iter, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+    const rocksdb_wal_iterator_t* iter, uint64_t* seq);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_get_latest_sequence_number(rocksdb_t* db);
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy(
+    const rocksdb_wal_iterator_t* iter);
+
+/* Write batch */
+
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create(
+    void);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from(
+    const char* rep, size_t size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy(
+    rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_count(rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put(rocksdb_writebatch_t*,
+                                                       const char* key,
+                                                       size_t klen,
+                                                       const char* val,
+                                                       size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf(
+    rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf_with_ts(
+    rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen, const char* val,
+    size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv(
+    rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, int num_values,
+    const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge(rocksdb_writebatch_t*,
+                                                         const char* key,
+                                                         size_t klen,
+                                                         const char* val,
+                                                         size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge_cf(
+    rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev(
+    rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, int num_values,
+    const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*,
+                                                          const char* key,
+                                                          size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete(
+    rocksdb_writebatch_t* b, const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf(
+    rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf_with_ts(
+    rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf_with_ts(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* ts, size_t tslen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev(
+    rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range(
+    rocksdb_writebatch_t* b, const char* start_key, size_t start_key_len,
+    const char* end_key, size_t end_key_len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_range_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev(
+    rocksdb_writebatch_t* b, int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_rangev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_log_data(
+    rocksdb_writebatch_t*, const char* blob, size_t len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t*, void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen));
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data(
+    rocksdb_writebatch_t*, size_t* size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point(
+    rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_rollback_to_save_point(
+    rocksdb_writebatch_t*, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_pop_save_point(
+    rocksdb_writebatch_t*, char** errptr);
+
+/* Write batch with index */
+
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t*
+rocksdb_writebatch_wi_create(size_t reserved_bytes,
+                             unsigned char overwrite_keys);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t*
+rocksdb_writebatch_wi_create_from(const char* rep, size_t size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_destroy(
+    rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_clear(
+    rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_wi_count(
+    rocksdb_writebatch_wi_t* b);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put(
+    rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val,
+    size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_cf(
+    rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv(
+    rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, int num_values,
+    const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge(
+    rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val,
+    size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge_cf(
+    rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev(
+    rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, int num_values,
+    const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete(
+    rocksdb_writebatch_wi_t*, const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete(
+    rocksdb_writebatch_wi_t*, const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf(
+    rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete_cf(
+    rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev(
+    rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes);
+// DO NOT USE - rocksdb_writebatch_wi_delete_range is not yet supported
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range(
+    rocksdb_writebatch_wi_t* b, const char* start_key, size_t start_key_len,
+    const char* end_key, size_t end_key_len);
+// DO NOT USE - rocksdb_writebatch_wi_delete_range_cf is not yet supported
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* end_key,
+    size_t end_key_len);
+// DO NOT USE - rocksdb_writebatch_wi_delete_rangev is not yet supported
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev(
+    rocksdb_writebatch_wi_t* b, int num_keys,
+    const char* const* start_keys_list, const size_t* start_keys_list_sizes,
+    const char* const* end_keys_list, const size_t* end_keys_list_sizes);
+// DO NOT USE - rocksdb_writebatch_wi_delete_rangev_cf is not yet supported
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev_cf(
+    rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* start_keys_list,
+    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+    const size_t* end_keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_log_data(
+    rocksdb_writebatch_wi_t*, const char* blob, size_t len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_iterate(
+    rocksdb_writebatch_wi_t* b, void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen));
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_wi_data(
+    rocksdb_writebatch_wi_t* b, size_t* size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_set_save_point(
+    rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_rollback_to_save_point(
+    rocksdb_writebatch_wi_t*, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch(
+    rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+    const char* key, size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_cf(
+    rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+    size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_wi_t* wbwi, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    rocksdb_column_family_handle_t* cf);
+
+/* Options utils */
+
+// Load the latest rocksdb options from the specified db_path.
+//
+// On success, num_column_families will be updated with a non-zero
+// number indicating the number of column families.
+// The returned db_options, column_family_names, and column_family_options
+// should be released via rocksdb_load_latest_options_destroy().
+//
+// On error, a non-null errptr that includes the error message will be
+// returned.  db_options, column_family_names, and column_family_options
+// will be set to NULL.
+extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options(
+    const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options,
+    rocksdb_cache_t* cache, rocksdb_options_t** db_options,
+    size_t* num_column_families, char*** column_family_names,
+    rocksdb_options_t*** column_family_options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_load_latest_options_destroy(
+    rocksdb_options_t* db_options, char** list_column_family_names,
+    rocksdb_options_t** list_column_family_options, size_t len);
+
+/* Block based table options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t*
+rocksdb_block_based_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy(
+    rocksdb_block_based_table_options_t* options);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_checksum(
+    rocksdb_block_based_table_options_t*, char);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size(
+    rocksdb_block_based_table_options_t* options, size_t block_size);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_size_deviation(
+    rocksdb_block_based_table_options_t* options, int block_size_deviation);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_restart_interval(
+    rocksdb_block_based_table_options_t* options, int block_restart_interval);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_index_block_restart_interval(
+    rocksdb_block_based_table_options_t* options,
+    int index_block_restart_interval);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_metadata_block_size(
+    rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_partition_filters(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char partition_filters);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_use_delta_encoding(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char use_delta_encoding);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_filterpolicy_t* filter_policy);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_no_block_cache(
+    rocksdb_block_based_table_options_t* options, unsigned char no_block_cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_cache(
+    rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_cache_compressed(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache_compressed);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_whole_key_filtering(
+    rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_format_version(
+    rocksdb_block_based_table_options_t*, int);
+enum {
+  rocksdb_block_based_table_index_type_binary_search = 0,
+  rocksdb_block_based_table_index_type_hash_search = 1,
+  rocksdb_block_based_table_index_type_two_level_index_search = 2,
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_index_type(
+    rocksdb_block_based_table_options_t*, int);  // uses one of the above enums
+enum {
+  rocksdb_block_based_table_data_block_index_type_binary_search = 0,
+  rocksdb_block_based_table_data_block_index_type_binary_search_and_hash = 1,
+};
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_data_block_index_type(
+    rocksdb_block_based_table_options_t*, int);  // uses one of the above enums
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_data_block_hash_ratio(
+    rocksdb_block_based_table_options_t* options, double v);
+// rocksdb_block_based_options_set_hash_index_allow_collision()
+// is removed since BlockBasedTableOptions.hash_index_allow_collision()
+// is removed
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+    rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+    rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+    rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+    rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory(
+    rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options);
+
+/* Cuckoo table options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t*
+rocksdb_cuckoo_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy(
+    rocksdb_cuckoo_table_options_t* options);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio(
+    rocksdb_cuckoo_table_options_t* options, double v);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_max_search_depth(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_cuckoo_block_size(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_cuckoo_options_set_identity_as_first_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v);
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_use_module_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory(
+    rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options);
+
+/* Options */
+extern ROCKSDB_LIBRARY_API void rocksdb_set_options(rocksdb_t* db, int count,
+                                                    const char* const keys[],
+                                                    const char* const values[],
+                                                    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count,
+    const char* const keys[], const char* const values[], char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create_copy(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism(
+    rocksdb_options_t* opt, int total_threads);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup(
+    rocksdb_options_t* opt, uint64_t block_cache_size_mb);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_level_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_optimize_universal_style_compaction(
+    rocksdb_options_t* opt, uint64_t memtable_memory_budget);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_ingest_behind(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_allow_ingest_behind(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter(
+    rocksdb_options_t*, rocksdb_compactionfilter_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory(
+    rocksdb_options_t*, rocksdb_compactionfilterfactory_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_compaction_readahead_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator(
+    rocksdb_options_t*, rocksdb_comparator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator(
+    rocksdb_options_t*, rocksdb_mergeoperator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level(
+    rocksdb_options_t* opt, const int* level_values, size_t num_levels);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_create_if_missing(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_create_missing_column_families(rocksdb_options_t*,
+                                                   unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_create_missing_column_families(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_error_if_exists(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_paranoid_checks(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(
+    rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*,
+                                                        rocksdb_env_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*,
+                                                             rocksdb_logger_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_info_log_level(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_write_buffer_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_db_write_buffer_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_open_files(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_file_opening_threads(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_file_opening_threads(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size(
+    rocksdb_options_t* opt, uint64_t n);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options(
+    rocksdb_options_t*, int, int, int, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_compression_options_zstd_max_train_bytes(rocksdb_options_t*,
+                                                             int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_compression_options_zstd_max_train_bytes(
+    rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*,
+                                                         int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_compression_options_parallel_threads(
+    rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_compression_options_max_dict_buffer_bytes(
+    rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_compression_options_max_dict_buffer_bytes(
+    rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_bottommost_compression_options(rocksdb_options_t*, int, int,
+                                                   int, int, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes(
+    rocksdb_options_t*, int, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t*, unsigned char, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer(
+    rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes(
+    rocksdb_options_t*, uint64_t, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor(
+    rocksdb_options_t*, rocksdb_slicetransform_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_num_levels(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_level0_file_num_compaction_trigger(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_stop_writes_trigger(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_level0_stop_writes_trigger(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base(
+    rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_target_file_size_base(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_target_file_size_multiplier(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base(
+    rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*,
+                                                         unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_level_compaction_dynamic_level_bytes(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API double
+rocksdb_options_get_max_bytes_for_level_multiplier(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+    rocksdb_options_t*, int* level_values, size_t num_levels);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
+                                                 unsigned char val);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
+    rocksdb_options_t* opt, unsigned char val);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
+    rocksdb_options_t* opt);
+
+/* Blob Options Settings */
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files(
+    rocksdb_options_t* opt, unsigned char val);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files(
+    rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_blob_size(
+    rocksdb_options_t* opt, uint64_t val);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_min_blob_size(rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_size(
+    rocksdb_options_t* opt, uint64_t val);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_blob_file_size(rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_compression_type(
+    rocksdb_options_t* opt, int val);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_compression_type(
+    rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_gc(
+    rocksdb_options_t* opt, unsigned char val);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_gc(
+    rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff(
+    rocksdb_options_t* opt, double val);
+extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff(
+    rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold(
+    rocksdb_options_t* opt, double val);
+extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_force_threshold(
+    rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt,
+                                                   uint64_t val);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_blob_compaction_readahead_size(rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_starting_level(
+    rocksdb_options_t* opt, int val);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_file_starting_level(
+    rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_cache(
+    rocksdb_options_t* opt, rocksdb_cache_t* blob_cache);
+
+enum {
+  rocksdb_prepopulate_blob_disable = 0,
+  rocksdb_prepopulate_blob_flush_only = 1
+};
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prepopulate_blob_cache(
+    rocksdb_options_t* opt, int val);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_prepopulate_blob_cache(
+    rocksdb_options_t* opt);
+
+/* returns a pointer to a malloc()-ed, null terminated string */
+extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string(
+    rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_write_buffer_number(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*,
+                                                        int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*,
+                                                      int64_t);
+extern ROCKSDB_LIBRARY_API int64_t
+rocksdb_options_get_max_write_buffer_size_to_maintain(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_enable_pipelined_write(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_unordered_write(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions(
+    rocksdb_options_t*, uint32_t);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_options_get_max_subcompactions(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_jobs(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_compactions(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_flushes(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_max_log_file_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_keep_log_file_num(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_recycle_log_file_num(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt,
+                                                        size_t v);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt,
+                                                        size_t v);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_max_manifest_file_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_table_cache_numshardbits(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_arena_block_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_use_fsync(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir(
+    rocksdb_options_t*, const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*,
+                                                            const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_ttl_seconds(
+    rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB(
+    rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_reads(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_writes(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_direct_reads(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_use_direct_io_for_flush_and_compaction(rocksdb_options_t*,
+                                                           unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_use_direct_io_for_flush_and_compaction(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec(
+    rocksdb_options_t*, unsigned int);
+extern ROCKSDB_LIBRARY_API unsigned int
+rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_persist_period_sec(
+    rocksdb_options_t*, unsigned int);
+extern ROCKSDB_LIBRARY_API unsigned int
+rocksdb_options_get_stats_persist_period_sec(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_advise_random_on_open(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_options_get_access_hint_on_compaction_start(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_adaptive_mutex(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync(
+    rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_bytes_per_sync(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync(
+    rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_writable_file_max_buffer_size(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*,
+                                                    unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_allow_concurrent_memtable_write(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_enable_write_thread_adaptive_yield(rocksdb_options_t*,
+                                                       unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_enable_write_thread_adaptive_yield(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*,
+                                                      uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_max_sequential_skip_in_iterations(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_disable_auto_compactions(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_optimize_filters_for_hits(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_optimize_filters_for_hits(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*,
+                                                        uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_delete_obsolete_files_period_micros(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_prefix_bloom_size_ratio(rocksdb_options_t*,
+                                                     double);
+extern ROCKSDB_LIBRARY_API double
+rocksdb_options_get_memtable_prefix_bloom_size_ratio(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes(
+    rocksdb_options_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_options_get_max_compaction_bytes(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep(
+    rocksdb_options_t*, size_t, int32_t, int32_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_link_list_rep(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_plain_table_factory(
+    rocksdb_options_t*, uint32_t, int, double, size_t);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_level_to_compress(
+    rocksdb_options_t* opt, int level);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_huge_page_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t*);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_max_successive_merges(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality(
+    rocksdb_options_t*, uint32_t);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_options_get_bloom_locality(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support(
+    rocksdb_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_inplace_update_support(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_report_bg_io_stats(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t*,
+                                                  unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_options_get_avoid_unnecessary_blocking_io(rocksdb_options_t*);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API double
+rocksdb_options_get_experimental_mempurge_threshold(rocksdb_options_t*);
+
+enum {
+  rocksdb_tolerate_corrupted_tail_records_recovery = 0,
+  rocksdb_absolute_consistency_recovery = 1,
+  rocksdb_point_in_time_recovery = 2,
+  rocksdb_skip_any_corrupted_records_recovery = 3
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_recovery_mode(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_recovery_mode(
+    rocksdb_options_t*);
+
+enum {
+  rocksdb_no_compression = 0,
+  rocksdb_snappy_compression = 1,
+  rocksdb_zlib_compression = 2,
+  rocksdb_bz2_compression = 3,
+  rocksdb_lz4_compression = 4,
+  rocksdb_lz4hc_compression = 5,
+  rocksdb_xpress_compression = 6,
+  rocksdb_zstd_compression = 7
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compression(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_bottommost_compression(
+    rocksdb_options_t*);
+
+enum {
+  rocksdb_level_compaction = 0,
+  rocksdb_universal_compaction = 1,
+  rocksdb_fifo_compaction = 2
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compaction_style(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_universal_compaction_options(
+    rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_fifo_compaction_options(
+    rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_ratelimiter(
+    rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_atomic_flush(
+    rocksdb_options_t* opt, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_atomic_flush(
+    rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache(
+    rocksdb_options_t* opt, rocksdb_cache_t* cache);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_add_compact_on_deletion_collector_factory(
+    rocksdb_options_t*, size_t window_size, size_t num_dels_trigger);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush(
+    rocksdb_options_t* opt, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush(
+    rocksdb_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_compression(
+    rocksdb_options_t* opt, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_compression(
+    rocksdb_options_t* opt);
+
+/* RateLimiter */
+extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(
+    int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness);
+extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy(
+    rocksdb_ratelimiter_t*);
+
+/* PerfContext */
+enum {
+  rocksdb_uninitialized = 0,
+  rocksdb_disable = 1,
+  rocksdb_enable_count = 2,
+  rocksdb_enable_time_except_for_mutex = 3,
+  rocksdb_enable_time = 4,
+  rocksdb_out_of_bounds = 5
+};
+
+enum {
+  rocksdb_user_key_comparison_count = 0,
+  rocksdb_block_cache_hit_count,
+  rocksdb_block_read_count,
+  rocksdb_block_read_byte,
+  rocksdb_block_read_time,
+  rocksdb_block_checksum_time,
+  rocksdb_block_decompress_time,
+  rocksdb_get_read_bytes,
+  rocksdb_multiget_read_bytes,
+  rocksdb_iter_read_bytes,
+  rocksdb_internal_key_skipped_count,
+  rocksdb_internal_delete_skipped_count,
+  rocksdb_internal_recent_skipped_count,
+  rocksdb_internal_merge_count,
+  rocksdb_get_snapshot_time,
+  rocksdb_get_from_memtable_time,
+  rocksdb_get_from_memtable_count,
+  rocksdb_get_post_process_time,
+  rocksdb_get_from_output_files_time,
+  rocksdb_seek_on_memtable_time,
+  rocksdb_seek_on_memtable_count,
+  rocksdb_next_on_memtable_count,
+  rocksdb_prev_on_memtable_count,
+  rocksdb_seek_child_seek_time,
+  rocksdb_seek_child_seek_count,
+  rocksdb_seek_min_heap_time,
+  rocksdb_seek_max_heap_time,
+  rocksdb_seek_internal_seek_time,
+  rocksdb_find_next_user_entry_time,
+  rocksdb_write_wal_time,
+  rocksdb_write_memtable_time,
+  rocksdb_write_delay_time,
+  rocksdb_write_pre_and_post_process_time,
+  rocksdb_db_mutex_lock_nanos,
+  rocksdb_db_condition_wait_nanos,
+  rocksdb_merge_operator_time_nanos,
+  rocksdb_read_index_block_nanos,
+  rocksdb_read_filter_block_nanos,
+  rocksdb_new_table_block_iter_nanos,
+  rocksdb_new_table_iterator_nanos,
+  rocksdb_block_seek_nanos,
+  rocksdb_find_table_nanos,
+  rocksdb_bloom_memtable_hit_count,
+  rocksdb_bloom_memtable_miss_count,
+  rocksdb_bloom_sst_hit_count,
+  rocksdb_bloom_sst_miss_count,
+  rocksdb_key_lock_wait_time,
+  rocksdb_key_lock_wait_count,
+  rocksdb_env_new_sequential_file_nanos,
+  rocksdb_env_new_random_access_file_nanos,
+  rocksdb_env_new_writable_file_nanos,
+  rocksdb_env_reuse_writable_file_nanos,
+  rocksdb_env_new_random_rw_file_nanos,
+  rocksdb_env_new_directory_nanos,
+  rocksdb_env_file_exists_nanos,
+  rocksdb_env_get_children_nanos,
+  rocksdb_env_get_children_file_attributes_nanos,
+  rocksdb_env_delete_file_nanos,
+  rocksdb_env_create_dir_nanos,
+  rocksdb_env_create_dir_if_missing_nanos,
+  rocksdb_env_delete_dir_nanos,
+  rocksdb_env_get_file_size_nanos,
+  rocksdb_env_get_file_modification_time_nanos,
+  rocksdb_env_rename_file_nanos,
+  rocksdb_env_link_file_nanos,
+  rocksdb_env_lock_file_nanos,
+  rocksdb_env_unlock_file_nanos,
+  rocksdb_env_new_logger_nanos,
+  rocksdb_number_async_seek,
+  rocksdb_blob_cache_hit_count,
+  rocksdb_blob_read_count,
+  rocksdb_blob_read_byte,
+  rocksdb_blob_read_time,
+  rocksdb_blob_checksum_time,
+  rocksdb_blob_decompress_time,
+  rocksdb_internal_range_del_reseek_count,
+  rocksdb_total_metric_count = 78
+};
+
+extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int);
+extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(
+    void);
+extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset(
+    rocksdb_perfcontext_t* context);
+extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report(
+    rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, int metric);
+extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_destroy(
+    rocksdb_perfcontext_t* context);
+
+/* Compaction Filter */
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t*
+rocksdb_compactionfilter_create(
+    void* state, void (*destructor)(void*),
+    unsigned char (*filter)(void*, int level, const char* key,
+                            size_t key_length, const char* existing_value,
+                            size_t value_length, char** new_value,
+                            size_t* new_value_length,
+                            unsigned char* value_changed),
+    const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_set_ignore_snapshots(
+    rocksdb_compactionfilter_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_destroy(
+    rocksdb_compactionfilter_t*);
+
+/* Compaction Filter Context */
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionfiltercontext_is_full_compaction(
+    rocksdb_compactionfiltercontext_t* context);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionfiltercontext_is_manual_compaction(
+    rocksdb_compactionfiltercontext_t* context);
+
+/* Compaction Filter Factory */
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionfilterfactory_t*
+rocksdb_compactionfilterfactory_create(
+    void* state, void (*destructor)(void*),
+    rocksdb_compactionfilter_t* (*create_compaction_filter)(
+        void*, rocksdb_compactionfiltercontext_t* context),
+    const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilterfactory_destroy(
+    rocksdb_compactionfilterfactory_t*);
+
+/* Comparator */
+
+extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state, void (*destructor)(void*),
+    int (*compare)(void*, const char* a, size_t alen, const char* b,
+                   size_t blen),
+    const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_comparator_destroy(
+    rocksdb_comparator_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_comparator_t*
+rocksdb_comparator_with_ts_create(
+    void* state, void (*destructor)(void*),
+    int (*compare)(void*, const char* a, size_t alen, const char* b,
+                   size_t blen),
+    int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
+                      size_t b_tslen),
+    int (*compare_without_ts)(void*, const char* a, size_t alen,
+                              unsigned char a_has_ts, const char* b,
+                              size_t blen, unsigned char b_has_ts),
+    const char* (*name)(void*), size_t timestamp_size);
+
+/* Filter policy */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_filterpolicy_destroy(
+    rocksdb_filterpolicy_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_bloom(double bits_per_key);
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_bloom_full(double bits_per_key);
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_ribbon(double bloom_equivalent_bits_per_key);
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_ribbon_hybrid(double bloom_equivalent_bits_per_key,
+                                          int bloom_before_level);
+
+/* Merge Operator */
+
+extern ROCKSDB_LIBRARY_API rocksdb_mergeoperator_t*
+rocksdb_mergeoperator_create(
+    void* state, void (*destructor)(void*),
+    char* (*full_merge)(void*, const char* key, size_t key_length,
+                        const char* existing_value,
+                        size_t existing_value_length,
+                        const char* const* operands_list,
+                        const size_t* operands_list_length, int num_operands,
+                        unsigned char* success, size_t* new_value_length),
+    char* (*partial_merge)(void*, const char* key, size_t key_length,
+                           const char* const* operands_list,
+                           const size_t* operands_list_length, int num_operands,
+                           unsigned char* success, size_t* new_value_length),
+    void (*delete_value)(void*, const char* value, size_t value_length),
+    const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy(
+    rocksdb_mergeoperator_t*);
+
+/* Read options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create(
+    void);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy(
+    rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_verify_checksums(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_fill_cache(
+    rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot(
+    rocksdb_readoptions_t*, const rocksdb_snapshot_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound(
+    rocksdb_readoptions_t*, const char* key, size_t keylen);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_lower_bound(
+    rocksdb_readoptions_t*, const char* key, size_t keylen);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier(
+    rocksdb_readoptions_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_readoptions_get_read_tier(
+    rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing(
+    rocksdb_readoptions_t*);
+// The functionality that this option controlled has been removed.
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size(
+    rocksdb_readoptions_t*, size_t);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_prefix_same_as_start(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_pin_data(
+    rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_total_order_seek(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_readoptions_set_max_skippable_internal_keys(rocksdb_readoptions_t*,
+                                                    uint64_t);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_readoptions_get_max_skippable_internal_keys(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_background_purge_on_iterator_cleanup(
+    rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_readoptions_get_ignore_range_deletions(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_deadline(
+    rocksdb_readoptions_t*, uint64_t microseconds);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_readoptions_get_deadline(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout(
+    rocksdb_readoptions_t*, uint64_t microseconds);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_timestamp(
+    rocksdb_readoptions_t*, const char* ts, size_t tslen);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iter_start_ts(
+    rocksdb_readoptions_t*, const char* ts, size_t tslen);
+
+/* Write options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* rocksdb_writeoptions_create(
+    void);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy(
+    rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync(
+    rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_sync(
+    rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL(
+    rocksdb_writeoptions_t* opt, int disable);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_disable_WAL(
+    rocksdb_writeoptions_t* opt);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_writeoptions_set_ignore_missing_column_families(rocksdb_writeoptions_t*,
+                                                        unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_writeoptions_get_ignore_missing_column_families(
+    rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown(
+    rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_no_slowdown(
+    rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri(
+    rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_low_pri(
+    rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*,
+                                                        unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_writeoptions_get_memtable_insert_hint_per_batch(
+    rocksdb_writeoptions_t*);
+
+/* Compact range options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t*
+rocksdb_compactoptions_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy(
+    rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compactoptions_set_exclusive_manual_compaction(
+    rocksdb_compactoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactoptions_get_exclusive_manual_compaction(
+    rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compactoptions_set_bottommost_level_compaction(
+    rocksdb_compactoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactoptions_get_bottommost_level_compaction(
+    rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level(
+    rocksdb_compactoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactoptions_get_change_level(rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level(
+    rocksdb_compactoptions_t*, int);
+extern ROCKSDB_LIBRARY_API int rocksdb_compactoptions_get_target_level(
+    rocksdb_compactoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_full_history_ts_low(
+    rocksdb_compactoptions_t*, char* ts, size_t tslen);
+
+/* Flush options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* rocksdb_flushoptions_create(
+    void);
+extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy(
+    rocksdb_flushoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait(
+    rocksdb_flushoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_flushoptions_get_wait(
+    rocksdb_flushoptions_t*);
+
+/* Memory allocator */
+
+extern ROCKSDB_LIBRARY_API rocksdb_memory_allocator_t*
+rocksdb_jemalloc_nodump_allocator_create(char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_allocator_destroy(
+    rocksdb_memory_allocator_t*);
+
+/* Cache */
+
+extern ROCKSDB_LIBRARY_API rocksdb_lru_cache_options_t*
+rocksdb_lru_cache_options_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy(
+    rocksdb_lru_cache_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_capacity(
+    rocksdb_lru_cache_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_num_shard_bits(
+    rocksdb_lru_cache_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator(
+    rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru(
+    size_t capacity);
+extern ROCKSDB_LIBRARY_API rocksdb_cache_t*
+rocksdb_cache_create_lru_with_strict_capacity_limit(size_t capacity);
+extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts(
+    rocksdb_lru_cache_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data(
+    rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity(
+    rocksdb_cache_t* cache, size_t capacity);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_cache_get_capacity(rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_cache_get_usage(rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache);
+
+/* DBPath */
+
+extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create(
+    const char* path, uint64_t target_size);
+extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*);
+
+/* Env */
+
+extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void);
+extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads(
+    rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API int rocksdb_env_get_background_threads(
+    rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API int rocksdb_env_get_high_priority_background_threads(
+    rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_set_low_priority_background_threads(
+    rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API int rocksdb_env_get_low_priority_background_threads(
+    rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads(
+    rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(
+    rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority(
+    rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(
+    void);
+extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy(
+    rocksdb_envoptions_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_create_dir_if_missing(
+    rocksdb_env_t* env, const char* path, char** errptr);
+
+/* SstFile */
+
+extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t*
+rocksdb_sstfilewriter_create(const rocksdb_envoptions_t* env,
+                             const rocksdb_options_t* io_options);
+extern ROCKSDB_LIBRARY_API rocksdb_sstfilewriter_t*
+rocksdb_sstfilewriter_create_with_comparator(
+    const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options,
+    const rocksdb_comparator_t* comparator);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_open(
+    rocksdb_sstfilewriter_t* writer, const char* name, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_add(
+    rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+    const char* val, size_t vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put(
+    rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+    const char* val, size_t vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_put_with_ts(
+    rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+    const char* ts, size_t tslen, const char* val, size_t vallen,
+    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_merge(
+    rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+    const char* val, size_t vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete(
+    rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_with_ts(
+    rocksdb_sstfilewriter_t* writer, const char* key, size_t keylen,
+    const char* ts, size_t tslen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete_range(
+    rocksdb_sstfilewriter_t* writer, const char* begin_key, size_t begin_keylen,
+    const char* end_key, size_t end_keylen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_finish(
+    rocksdb_sstfilewriter_t* writer, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_file_size(
+    rocksdb_sstfilewriter_t* writer, uint64_t* file_size);
+extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy(
+    rocksdb_sstfilewriter_t* writer);
+extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t*
+rocksdb_ingestexternalfileoptions_create(void);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_move_files(
+    rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char snapshot_consistency);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
+    rocksdb_ingestexternalfileoptions_t* opt, unsigned char allow_global_seqno);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char allow_blocking_flush);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_ingestexternalfileoptions_set_ingest_behind(
+    rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind);
+extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy(
+    rocksdb_ingestexternalfileoptions_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file(
+    rocksdb_t* db, const char* const* file_list, const size_t list_len,
+    const rocksdb_ingestexternalfileoptions_t* opt, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle,
+    const char* const* file_list, const size_t list_len,
+    const rocksdb_ingestexternalfileoptions_t* opt, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary(
+    rocksdb_t* db, char** errptr);
+
+/* SliceTransform */
+
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+rocksdb_slicetransform_create(
+    void* state, void (*destructor)(void*),
+    char* (*transform)(void*, const char* key, size_t length,
+                       size_t* dst_length),
+    unsigned char (*in_domain)(void*, const char* key, size_t length),
+    unsigned char (*in_range)(void*, const char* key, size_t length),
+    const char* (*name)(void*));
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+    rocksdb_slicetransform_create_fixed_prefix(size_t);
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+rocksdb_slicetransform_create_noop(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy(
+    rocksdb_slicetransform_t*);
+
+/* Universal Compaction options */
+
+enum {
+  rocksdb_similar_size_compaction_stop_style = 0,
+  rocksdb_total_size_compaction_stop_style = 1
+};
+
+extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t*
+rocksdb_universal_compaction_options_create(void);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_size_ratio(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_size_ratio(
+    rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_min_merge_width(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_min_merge_width(
+    rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_max_merge_width(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_max_merge_width(
+    rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_max_size_amplification_percent(
+    rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_compression_size_percent(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_compression_size_percent(
+    rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_stop_style(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API int
+rocksdb_universal_compaction_options_get_stop_style(
+    rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy(
+    rocksdb_universal_compaction_options_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t*
+rocksdb_fifo_compaction_options_create(void);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_fifo_compaction_options_set_max_table_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_fifo_compaction_options_get_max_table_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts);
+extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy(
+    rocksdb_fifo_compaction_options_t* fifo_opts);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count(
+    const rocksdb_livefiles_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name(
+    const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name(
+    const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level(
+    const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_livefiles_size(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey(
+    const rocksdb_livefiles_t*, int index, size_t* size);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey(
+    const rocksdb_livefiles_t*, int index, size_t* size);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_livefiles_entries(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_livefiles_deletions(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy(
+    const rocksdb_livefiles_t*);
+
+/* Utility Helpers */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string(
+    const rocksdb_options_t* base_options, const char* opts_str,
+    rocksdb_options_t* new_options, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range(
+    rocksdb_t* db, const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* limit_key,
+    size_t limit_key_len, char** errptr);
+
+/* MetaData */
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t*
+rocksdb_get_column_family_metadata(rocksdb_t* db);
+
+/**
+ * Returns the rocksdb_column_family_metadata_t of the specified
+ * column family.
+ *
+ * Note that the caller is responsible to release the returned memory
+ * using rocksdb_column_family_metadata_destroy.
+ */
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t*
+rocksdb_get_column_family_metadata_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_column_family_metadata_destroy(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_column_family_metadata_get_size(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API size_t rocksdb_column_family_metadata_get_file_count(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_column_family_metadata_get_name(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_column_family_metadata_get_level_count(
+    rocksdb_column_family_metadata_t* cf_meta);
+
+/**
+ * Returns the rocksdb_level_metadata_t of the ith level from the specified
+ * column family metadata.
+ *
+ * If the specified i is greater than or equal to the number of levels
+ * in the specified column family, then NULL will be returned.
+ *
+ * Note that the caller is responsible to release the returned memory
+ * using rocksdb_level_metadata_destroy before releasing its parent
+ * rocksdb_column_family_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API rocksdb_level_metadata_t*
+rocksdb_column_family_metadata_get_level_metadata(
+    rocksdb_column_family_metadata_t* cf_meta, size_t i);
+
+/**
+ * Releases the specified rocksdb_level_metadata_t.
+ *
+ * Note that the specified rocksdb_level_metadata_t must be released
+ * before the release of its parent rocksdb_column_family_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API void rocksdb_level_metadata_destroy(
+    rocksdb_level_metadata_t* level_meta);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_level_metadata_get_level(
+    rocksdb_level_metadata_t* level_meta);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta);
+
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_level_metadata_get_file_count(rocksdb_level_metadata_t* level_meta);
+
+/**
+ * Returns the sst_file_metadata_t of the ith file from the specified level
+ * metadata.
+ *
+ * If the specified i is greater than or equal to the number of files
+ * in the specified level, then NULL will be returned.
+ *
+ * Note that the caller is responsible to release the returned memory
+ * using rocksdb_sst_file_metadata_destroy before releasing its
+ * parent rocksdb_level_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API rocksdb_sst_file_metadata_t*
+rocksdb_level_metadata_get_sst_file_metadata(
+    rocksdb_level_metadata_t* level_meta, size_t i);
+
+/**
+ * Releases the specified rocksdb_sst_file_metadata_t.
+ *
+ * Note that the specified rocksdb_sst_file_metadata_t must be released
+ * before the release of its parent rocksdb_level_metadata_t.
+ */
+extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_metadata_destroy(
+    rocksdb_sst_file_metadata_t* file_meta);
+
+extern ROCKSDB_LIBRARY_API char*
+rocksdb_sst_file_metadata_get_relative_filename(
+    rocksdb_sst_file_metadata_t* file_meta);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_sst_file_metadata_get_size(rocksdb_sst_file_metadata_t* file_meta);
+
+/**
+ * Returns the smallest key of the specified sst file.
+ * The caller is responsible for releasing the returned memory.
+ *
+ * @param file_meta the metadata of an SST file to obtain its smallest key.
+ * @param len the out value which will contain the length of the returned key
+ *     after the function call.
+ */
+extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_smallestkey(
+    rocksdb_sst_file_metadata_t* file_meta, size_t* len);
+
+/**
+ * Returns the smallest key of the specified sst file.
+ * The caller is responsible for releasing the returned memory.
+ *
+ * @param file_meta the metadata of an SST file to obtain its smallest key.
+ * @param len the out value which will contain the length of the returned key
+ *     after the function call.
+ */
+extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_largestkey(
+    rocksdb_sst_file_metadata_t* file_meta, size_t* len);
+
+/* Transactions */
+
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_transactiondb_create_column_family(
+    rocksdb_transactiondb_t* txn_db,
+    const rocksdb_options_t* column_family_options,
+    const char* column_family_name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+    const rocksdb_options_t* options,
+    const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t*
+rocksdb_transactiondb_open_column_families(
+    const rocksdb_options_t* options,
+    const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+    int num_column_families, const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t*
+rocksdb_transactiondb_create_snapshot(rocksdb_transactiondb_t* txn_db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_release_snapshot(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_property_value(
+    rocksdb_transactiondb_t* db, const char* propname);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_transactiondb_property_int(
+    rocksdb_transactiondb_t* db, const char* propname, uint64_t* out_val);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin(
+    rocksdb_transactiondb_t* txn_db,
+    const rocksdb_writeoptions_t* write_options,
+    const rocksdb_transaction_options_t* txn_options,
+    rocksdb_transaction_t* old_txn);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t**
+rocksdb_transactiondb_get_prepared_transactions(rocksdb_transactiondb_t* txn_db,
+                                                size_t* cnt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_name(
+    rocksdb_transaction_t* txn, const char* name, size_t name_len,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_name(
+    rocksdb_transaction_t* txn, size_t* name_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_prepare(
+    rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_commit(
+    rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback(
+    rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_savepoint(
+    rocksdb_transaction_t* txn);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback_to_savepoint(
+    rocksdb_transaction_t* txn, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_destroy(
+    rocksdb_transaction_t* txn);
+
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t*
+rocksdb_transaction_get_writebatch_wi(rocksdb_transaction_t* txn);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch(
+    rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch,
+    char** errptr);
+
+// This rocksdb_writebatch_wi_t should be freed with rocksdb_free
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rebuild_from_writebatch_wi(
+    rocksdb_transaction_t* txn, rocksdb_writebatch_wi_t* wi, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_commit_timestamp(
+    rocksdb_transaction_t* txn, uint64_t commit_timestamp);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transaction_set_read_timestamp_for_validation(
+    rocksdb_transaction_t* txn, uint64_t read_timestamp);
+
+// This snapshot should be freed using rocksdb_free
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t*
+rocksdb_transaction_get_snapshot(rocksdb_transaction_t* txn);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    const char* key, size_t klen, size_t* vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transaction_get_pinned(rocksdb_transaction_t* txn,
+                               const rocksdb_readoptions_t* options,
+                               const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+    size_t* vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transaction_get_pinned_cf(rocksdb_transaction_t* txn,
+                                  const rocksdb_readoptions_t* options,
+                                  rocksdb_column_family_handle_t* column_family,
+                                  const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    const char* key, size_t klen, size_t* vlen, unsigned char exclusive,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transaction_get_pinned_for_update(rocksdb_transaction_t* txn,
+                                          const rocksdb_readoptions_t* options,
+                                          const char* key, size_t klen,
+                                          unsigned char exclusive,
+                                          char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+    size_t* vlen, unsigned char exclusive, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transaction_get_pinned_for_update_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+    unsigned char exclusive, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    const char* key, size_t klen, size_t* vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transactiondb_get_pinned(rocksdb_transactiondb_t* txn_db,
+                                 const rocksdb_readoptions_t* options,
+                                 const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_transactiondb_get_pinned_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_multi_get_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put(
+    rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val,
+    size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put_cf(
+    rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+    const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_write(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge(
+    rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val,
+    size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge_cf(
+    rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+    const char* key, size_t klen, const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+    const char* val, size_t vlen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete(
+    rocksdb_transaction_t* txn, const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete_cf(
+    rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+    const char* key, size_t klen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transaction_create_iterator(rocksdb_transaction_t* txn,
+                                    const rocksdb_readoptions_t* options);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transaction_create_iterator_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transactiondb_create_iterator(rocksdb_transactiondb_t* txn_db,
+                                      const rocksdb_readoptions_t* options);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_transactiondb_create_iterator_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close(
+    rocksdb_transactiondb_t* txn_db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cf(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_wal(
+    rocksdb_transactiondb_t* txn_db, unsigned char sync, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t*
+rocksdb_transactiondb_checkpoint_object_create(rocksdb_transactiondb_t* txn_db,
+                                               char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open(const rocksdb_options_t* options,
+                                     const char* name, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open_column_families(
+    const rocksdb_options_t* options, const char* name, int num_column_families,
+    const char* const* column_family_names,
+    const rocksdb_options_t* const* column_family_options,
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_t*
+rocksdb_optimistictransactiondb_get_base_db(
+    rocksdb_optimistictransactiondb_t* otxn_db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close_base_db(
+    rocksdb_t* base_db);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_t*
+rocksdb_optimistictransaction_begin(
+    rocksdb_optimistictransactiondb_t* otxn_db,
+    const rocksdb_writeoptions_t* write_options,
+    const rocksdb_optimistictransaction_options_t* otxn_options,
+    rocksdb_transaction_t* old_txn);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_write(
+    rocksdb_optimistictransactiondb_t* otxn_db,
+    const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close(
+    rocksdb_optimistictransactiondb_t* otxn_db);
+
+extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t*
+rocksdb_optimistictransactiondb_checkpoint_object_create(
+    rocksdb_optimistictransactiondb_t* otxn_db, char** errptr);
+
+/* Transaction Options */
+
+extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t*
+rocksdb_transactiondb_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy(
+    rocksdb_transactiondb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_max_num_locks(
+    rocksdb_transactiondb_options_t* opt, int64_t max_num_locks);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_set_num_stripes(
+    rocksdb_transactiondb_options_t* opt, size_t num_stripes);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transactiondb_options_set_transaction_lock_timeout(
+    rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transactiondb_options_set_default_lock_timeout(
+    rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout);
+
+extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t*
+rocksdb_transaction_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy(
+    rocksdb_transaction_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_set_snapshot(
+    rocksdb_transaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_deadlock_detect(
+    rocksdb_transaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_lock_timeout(
+    rocksdb_transaction_options_t* opt, int64_t lock_timeout);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_expiration(
+    rocksdb_transaction_options_t* opt, int64_t expiration);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transaction_options_set_deadlock_detect_depth(
+    rocksdb_transaction_options_t* opt, int64_t depth);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_transaction_options_set_max_write_batch_size(
+    rocksdb_transaction_options_t* opt, size_t size);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_skip_prepare(
+    rocksdb_transaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t*
+rocksdb_optimistictransaction_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy(
+    rocksdb_optimistictransaction_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_optimistictransaction_options_set_set_snapshot(
+    rocksdb_optimistictransaction_options_t* opt, unsigned char v);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_optimistictransactiondb_property_value(
+    rocksdb_optimistictransactiondb_t* db, const char* propname);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_optimistictransactiondb_property_int(
+    rocksdb_optimistictransactiondb_t* db, const char* propname,
+    uint64_t* out_val);
+
+// referring to convention (3), this should be used by client
+// to free memory that was malloc()ed
+extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_pinnableslice_destroy(
+    rocksdb_pinnableslice_t* v);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value(
+    const rocksdb_pinnableslice_t* t, size_t* vlen);
+
+extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t*
+rocksdb_memory_consumers_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db(
+    rocksdb_memory_consumers_t* consumers, rocksdb_t* db);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache(
+    rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy(
+    rocksdb_memory_consumers_t* consumers);
+extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t*
+rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers,
+                                        char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy(
+    rocksdb_memory_usage_t* usage);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_total(
+    rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+    rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+    rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_cache_total(
+    rocksdb_memory_usage_t* memory_usage);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_dump_malloc_stats(
+    rocksdb_options_t*, unsigned char);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t*,
+                                                 unsigned char);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_cancel_all_background_work(
+    rocksdb_t* db, unsigned char wait);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_disable_manual_compaction(
+    rocksdb_t* db);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_enable_manual_compaction(rocksdb_t* db);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
diff --git a/src/rocksdb/include/rocksdb/cache.h b/src/rocksdb/include/rocksdb/cache.h
new file mode 100644
index 000000000..575d276b5
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cache.h
@@ -0,0 +1,775 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/memory_allocator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+struct ConfigOptions;
+class Logger;
+class SecondaryCache;
+
+// Classifications of block cache entries.
+//
+// Developer notes: Adding a new enum to this class requires corresponding
+// updates to `kCacheEntryRoleToCamelString` and
+// `kCacheEntryRoleToHyphenString`. Do not add to this enum after `kMisc` since
+// `kNumCacheEntryRoles` assumes `kMisc` comes last.
+enum class CacheEntryRole {
+  // Block-based table data block
+  kDataBlock,
+  // Block-based table filter block (full or partitioned)
+  kFilterBlock,
+  // Block-based table metadata block for partitioned filter
+  kFilterMetaBlock,
+  // OBSOLETE / DEPRECATED: old/removed block-based filter
+  kDeprecatedFilterBlock,
+  // Block-based table index block
+  kIndexBlock,
+  // Other kinds of block-based table block
+  kOtherBlock,
+  // WriteBufferManager's charge to account for its memtable usage
+  kWriteBuffer,
+  // Compression dictionary building buffer's charge to account for
+  // its memory usage
+  kCompressionDictionaryBuildingBuffer,
+  // Filter's charge to account for
+  // (new) bloom and ribbon filter construction's memory usage
+  kFilterConstruction,
+  // BlockBasedTableReader's charge to account for its memory usage
+  kBlockBasedTableReader,
+  // FileMetadata's charge to account for its memory usage
+  kFileMetadata,
+  // Blob value (when using the same cache as block cache and blob cache)
+  kBlobValue,
+  // Blob cache's charge to account for its memory usage (when using a
+  // separate block cache and blob cache)
+  kBlobCache,
+  // Default bucket, for miscellaneous cache entries. Do not use for
+  // entries that could potentially add up to large usage.
+  kMisc,
+};
+constexpr uint32_t kNumCacheEntryRoles =
+    static_cast<uint32_t>(CacheEntryRole::kMisc) + 1;
+
+// Obtain a hyphen-separated, lowercase name of a `CacheEntryRole`.
+const std::string& GetCacheEntryRoleName(CacheEntryRole);
+
+// For use with `GetMapProperty()` for property
+// `DB::Properties::kBlockCacheEntryStats`. On success, the map will
+// be populated with all keys that can be obtained from these functions.
+struct BlockCacheEntryStatsMapKeys {
+  static const std::string& CacheId();
+  static const std::string& CacheCapacityBytes();
+  static const std::string& LastCollectionDurationSeconds();
+  static const std::string& LastCollectionAgeSeconds();
+
+  static std::string EntryCount(CacheEntryRole);
+  static std::string UsedBytes(CacheEntryRole);
+  static std::string UsedPercent(CacheEntryRole);
+};
+
+extern const bool kDefaultToAdaptiveMutex;
+
+enum CacheMetadataChargePolicy {
+  // Only the `charge` of each entry inserted into a Cache counts against
+  // the `capacity`
+  kDontChargeCacheMetadata,
+  // In addition to the `charge`, the approximate space overheads in the
+  // Cache (in bytes) also count against `capacity`. These space overheads
+  // are for supporting fast Lookup and managing the lifetime of entries.
+  kFullChargeCacheMetadata
+};
+const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy =
+    kFullChargeCacheMetadata;
+
+// Options shared betweeen various cache implementations that
+// divide the key space into shards using hashing.
+struct ShardedCacheOptions {
+  // Capacity of the cache, in the same units as the `charge` of each entry.
+  // This is typically measured in bytes, but can be a different unit if using
+  // kDontChargeCacheMetadata.
+  size_t capacity = 0;
+
+  // Cache is sharded into 2^num_shard_bits shards, by hash of key.
+  // If < 0, a good default is chosen based on the capacity and the
+  // implementation. (Mutex-based implementations are much more reliant
+  // on many shards for parallel scalability.)
+  int num_shard_bits = -1;
+
+  // If strict_capacity_limit is set, Insert() will fail if there is not
+  // enough capacity for the new entry along with all the existing referenced
+  // (pinned) cache entries. (Unreferenced cache entries are evicted as
+  // needed, sometimes immediately.) If strict_capacity_limit == false
+  // (default), Insert() never fails.
+  bool strict_capacity_limit = false;
+
+  // If non-nullptr, RocksDB will use this allocator instead of system
+  // allocator when allocating memory for cache blocks.
+  //
+  // Caveat: when the cache is used as block cache, the memory allocator is
+  // ignored when dealing with compression libraries that allocate memory
+  // internally (currently only XPRESS).
+  std::shared_ptr<MemoryAllocator> memory_allocator;
+
+  // See CacheMetadataChargePolicy
+  CacheMetadataChargePolicy metadata_charge_policy =
+      kDefaultCacheMetadataChargePolicy;
+
+  ShardedCacheOptions() {}
+  ShardedCacheOptions(
+      size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
+      std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+      CacheMetadataChargePolicy _metadata_charge_policy =
+          kDefaultCacheMetadataChargePolicy)
+      : capacity(_capacity),
+        num_shard_bits(_num_shard_bits),
+        strict_capacity_limit(_strict_capacity_limit),
+        memory_allocator(std::move(_memory_allocator)),
+        metadata_charge_policy(_metadata_charge_policy) {}
+};
+
+struct LRUCacheOptions : public ShardedCacheOptions {
+  // Ratio of cache reserved for high-priority and low-priority entries,
+  // respectively. (See Cache::Priority below more information on the levels.)
+  // Valid values are between 0 and 1 (inclusive), and the sum of the two
+  // values cannot exceed 1.
+  //
+  // If high_pri_pool_ratio is greater than zero, a dedicated high-priority LRU
+  // list is maintained by the cache. Similarly, if low_pri_pool_ratio is
+  // greater than zero, a dedicated low-priority LRU list is maintained.
+  // There is also a bottom-priority LRU list, which is always enabled and not
+  // explicitly configurable. Entries are spilled over to the next available
+  // lower-priority pool if a certain pool's capacity is exceeded.
+  //
+  // Entries with cache hits are inserted into the highest priority LRU list
+  // available regardless of the entry's priority. Entries without hits
+  // are inserted into highest priority LRU list available whose priority
+  // does not exceed the entry's priority. (For example, high-priority items
+  // with no hits are placed in the high-priority pool if available;
+  // otherwise, they are placed in the low-priority pool if available;
+  // otherwise, they are placed in the bottom-priority pool.) This results
+  // in lower-priority entries without hits getting evicted from the cache
+  // sooner.
+  //
+  // Default values: high_pri_pool_ratio = 0.5 (which is referred to as
+  // "midpoint insertion"), low_pri_pool_ratio = 0
+  double high_pri_pool_ratio = 0.5;
+  double low_pri_pool_ratio = 0.0;
+
+  // Whether to use adaptive mutexes for cache shards. Note that adaptive
+  // mutexes need to be supported by the platform in order for this to have any
+  // effect. The default value is true if RocksDB is compiled with
+  // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise.
+  bool use_adaptive_mutex = kDefaultToAdaptiveMutex;
+
+  // A SecondaryCache instance to use a the non-volatile tier.
+  std::shared_ptr<SecondaryCache> secondary_cache;
+
+  LRUCacheOptions() {}
+  LRUCacheOptions(size_t _capacity, int _num_shard_bits,
+                  bool _strict_capacity_limit, double _high_pri_pool_ratio,
+                  std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+                  bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
+                  CacheMetadataChargePolicy _metadata_charge_policy =
+                      kDefaultCacheMetadataChargePolicy,
+                  double _low_pri_pool_ratio = 0.0)
+      : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit,
+                            std::move(_memory_allocator),
+                            _metadata_charge_policy),
+        high_pri_pool_ratio(_high_pri_pool_ratio),
+        low_pri_pool_ratio(_low_pri_pool_ratio),
+        use_adaptive_mutex(_use_adaptive_mutex) {}
+};
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// to 2^num_shard_bits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. If strict_capacity_limit
+// is set, insert to the cache will fail when cache is full. User can also
+// set percentage of the cache reserves for high priority entries via
+// high_pri_pool_pct.
+// num_shard_bits = -1 means it is automatically determined: every shard
+// will be at least 512KB and number of shard bits will not exceed 6.
+extern std::shared_ptr<Cache> NewLRUCache(
+    size_t capacity, int num_shard_bits = -1,
+    bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
+    std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+    bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+    CacheMetadataChargePolicy metadata_charge_policy =
+        kDefaultCacheMetadataChargePolicy,
+    double low_pri_pool_ratio = 0.0);
+
+extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts);
+
+// EXPERIMENTAL
+// Options structure for configuring a SecondaryCache instance based on
+// LRUCache. The LRUCacheOptions.secondary_cache is not used and
+// should not be set.
+struct CompressedSecondaryCacheOptions : LRUCacheOptions {
+  // The compression method (if any) that is used to compress data.
+  CompressionType compression_type = CompressionType::kLZ4Compression;
+
+  // compress_format_version can have two values:
+  // compress_format_version == 1 -- decompressed size is not included in the
+  // block header.
+  // compress_format_version == 2 -- decompressed size is included in the block
+  // header in varint32 format.
+  uint32_t compress_format_version = 2;
+
+  // Enable the custom split and merge feature, which split the compressed value
+  // into chunks so that they may better fit jemalloc bins.
+  bool enable_custom_split_merge = false;
+
+  CompressedSecondaryCacheOptions() {}
+  CompressedSecondaryCacheOptions(
+      size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit,
+      double _high_pri_pool_ratio, double _low_pri_pool_ratio = 0.0,
+      std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+      bool _use_adaptive_mutex = kDefaultToAdaptiveMutex,
+      CacheMetadataChargePolicy _metadata_charge_policy =
+          kDefaultCacheMetadataChargePolicy,
+      CompressionType _compression_type = CompressionType::kLZ4Compression,
+      uint32_t _compress_format_version = 2,
+      bool _enable_custom_split_merge = false)
+      : LRUCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit,
+                        _high_pri_pool_ratio, std::move(_memory_allocator),
+                        _use_adaptive_mutex, _metadata_charge_policy,
+                        _low_pri_pool_ratio),
+        compression_type(_compression_type),
+        compress_format_version(_compress_format_version),
+        enable_custom_split_merge(_enable_custom_split_merge) {}
+};
+
+// EXPERIMENTAL
+// Create a new Secondary Cache that is implemented on top of LRUCache.
+extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+    size_t capacity, int num_shard_bits = -1,
+    bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5,
+    double low_pri_pool_ratio = 0.0,
+    std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+    bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
+    CacheMetadataChargePolicy metadata_charge_policy =
+        kDefaultCacheMetadataChargePolicy,
+    CompressionType compression_type = CompressionType::kLZ4Compression,
+    uint32_t compress_format_version = 2,
+    bool enable_custom_split_merge = false);
+
+extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
+    const CompressedSecondaryCacheOptions& opts);
+
+// HyperClockCache - A lock-free Cache alternative for RocksDB block cache
+// that offers much improved CPU efficiency vs. LRUCache under high parallel
+// load or high contention, with some caveats:
+// * Not a general Cache implementation: can only be used for
+// BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is
+// compatible with HyperClockCache.
+// * Requires an extra tuning parameter: see estimated_entry_charge below.
+// Similarly, substantially changing the capacity with SetCapacity could
+// harm efficiency.
+// * SecondaryCache is not yet supported.
+// * Cache priorities are less aggressively enforced, which could cause
+// cache dilution from long range scans (unless they use fill_cache=false).
+// * Can be worse for small caches, because if almost all of a cache shard is
+// pinned (more likely with non-partitioned filters), then CLOCK eviction
+// becomes very CPU intensive.
+//
+// See internal cache/clock_cache.h for full description.
+struct HyperClockCacheOptions : public ShardedCacheOptions {
+  // The estimated average `charge` associated with cache entries. This is a
+  // critical configuration parameter for good performance from the hyper
+  // cache, because having a table size that is fixed at creation time greatly
+  // reduces the required synchronization between threads.
+  // * If the estimate is substantially too low (e.g. less than half the true
+  // average) then metadata space overhead with be substantially higher (e.g.
+  // 200 bytes per entry rather than 100). With kFullChargeCacheMetadata, this
+  // can slightly reduce cache hit rates, and slightly reduce access times due
+  // to the larger working memory size.
+  // * If the estimate is substantially too high (e.g. 25% higher than the true
+  // average) then there might not be sufficient slots in the hash table for
+  // both efficient operation and capacity utilization (hit rate). The hyper
+  // cache will evict entries to prevent load factors that could dramatically
+  // affect lookup times, instead letting the hit rate suffer by not utilizing
+  // the full capacity.
+  //
+  // A reasonable choice is the larger of block_size and metadata_block_size.
+  // When WriteBufferManager (and similar) charge memory usage to the block
+  // cache, this can lead to the same effect as estimate being too low, which
+  // is better than the opposite. Therefore, the general recommendation is to
+  // assume that other memory charged to block cache could be negligible, and
+  // ignore it in making the estimate.
+  //
+  // The best parameter choice based on a cache in use is given by
+  // GetUsage() / GetOccupancyCount(), ignoring metadata overheads such as
+  // with kDontChargeCacheMetadata. More precisely with
+  // kFullChargeCacheMetadata is (GetUsage() - 64 * GetTableAddressCount()) /
+  // GetOccupancyCount(). However, when the average value size might vary
+  // (e.g. balance between metadata and data blocks in cache), it is better
+  // to estimate toward the lower side than the higher side.
+  size_t estimated_entry_charge;
+
+  HyperClockCacheOptions(
+      size_t _capacity, size_t _estimated_entry_charge,
+      int _num_shard_bits = -1, bool _strict_capacity_limit = false,
+      std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+      CacheMetadataChargePolicy _metadata_charge_policy =
+          kDefaultCacheMetadataChargePolicy)
+      : ShardedCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit,
+                            std::move(_memory_allocator),
+                            _metadata_charge_policy),
+        estimated_entry_charge(_estimated_entry_charge) {}
+
+  // Construct an instance of HyperClockCache using these options
+  std::shared_ptr<Cache> MakeSharedCache() const;
+};
+
+// DEPRECATED - The old Clock Cache implementation had an unresolved bug and
+// has been removed. The new HyperClockCache requires an additional
+// configuration parameter that is not provided by this API. This function
+// simply returns a new LRUCache for functional compatibility.
+extern std::shared_ptr<Cache> NewClockCache(
+    size_t capacity, int num_shard_bits = -1,
+    bool strict_capacity_limit = false,
+    CacheMetadataChargePolicy metadata_charge_policy =
+        kDefaultCacheMetadataChargePolicy);
+
+class Cache {
+ public:  // opaque types
+  // Opaque handle to an entry stored in the cache.
+  struct Handle {};
+
+ public:  // type defs
+  // Depending on implementation, cache entries with higher priority levels
+  // could be less likely to get evicted than entries with lower priority
+  // levels. The "high" priority level applies to certain SST metablocks (e.g.
+  // index and filter blocks) if the option
+  // cache_index_and_filter_blocks_with_high_priority is set. The "low" priority
+  // level is used for other kinds of SST blocks (most importantly, data
+  // blocks), as well as the above metablocks in case
+  // cache_index_and_filter_blocks_with_high_priority is
+  // not set. The "bottom" priority level is for BlobDB's blob values.
+  enum class Priority { HIGH, LOW, BOTTOM };
+
+  // A set of callbacks to allow objects in the primary block cache to be
+  // be persisted in a secondary cache. The purpose of the secondary cache
+  // is to support other ways of caching the object, such as persistent or
+  // compressed data, that may require the object to be parsed and transformed
+  // in some way. Since the primary cache holds C++ objects and the secondary
+  // cache may only hold flat data that doesn't need relocation, these
+  // callbacks need to be provided by the user of the block
+  // cache to do the conversion.
+  // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers
+  // to callback functions for size, saving and deletion of the
+  // object. The callbacks are defined in C-style in order to make them
+  // stateless and not add to the cache metadata size.
+  // Saving multiple std::function objects will take up 32 bytes per
+  // function, even if its not bound to an object and does no capture.
+  //
+  // All the callbacks are C-style function pointers in order to simplify
+  // lifecycle management. Objects in the cache can outlive the parent DB,
+  // so anything required for these operations should be contained in the
+  // object itself.
+  //
+  // The SizeCallback takes a void* pointer to the object and returns the size
+  // of the persistable data. It can be used by the secondary cache to allocate
+  // memory if needed.
+  //
+  // RocksDB callbacks are NOT exception-safe. A callback completing with an
+  // exception can lead to undefined behavior in RocksDB, including data loss,
+  // unreported corruption, deadlocks, and more.
+  using SizeCallback = size_t (*)(void* obj);
+
+  // The SaveToCallback takes a void* object pointer and saves the persistable
+  // data into a buffer. The secondary cache may decide to not store it in a
+  // contiguous buffer, in which case this callback will be called multiple
+  // times with increasing offset
+  using SaveToCallback = Status (*)(void* from_obj, size_t from_offset,
+                                    size_t length, void* out);
+
+  // A function pointer type for custom destruction of an entry's
+  // value. The Cache is responsible for copying and reclaiming space
+  // for the key, but values are managed by the caller.
+  using DeleterFn = void (*)(const Slice& key, void* value);
+
+  // A struct with pointers to helper functions for spilling items from the
+  // cache into the secondary cache. May be extended in the future. An
+  // instance of this struct is expected to outlive the cache.
+  struct CacheItemHelper {
+    SizeCallback size_cb;
+    SaveToCallback saveto_cb;
+    DeleterFn del_cb;
+
+    CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {}
+    CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb,
+                    DeleterFn _del_cb)
+        : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {}
+  };
+
+  // The CreateCallback is passed by the block cache user to Lookup(). It
+  // takes in a buffer from the NVM cache and constructs an object using
+  // it. The callback doesn't have ownership of the buffer and should
+  // copy the contents into its own buffer.
+  using CreateCallback = std::function<Status(const void* buf, size_t size,
+                                              void** out_obj, size_t* charge)>;
+
+ public:  // ctor/dtor/create
+  Cache(std::shared_ptr<MemoryAllocator> allocator = nullptr)
+      : memory_allocator_(std::move(allocator)) {}
+  // No copying allowed
+  Cache(const Cache&) = delete;
+  Cache& operator=(const Cache&) = delete;
+
+  // Destroys all remaining entries by calling the associated "deleter"
+  virtual ~Cache() {}
+
+  // Creates a new Cache based on the input value string and returns the result.
+  // Currently, this method can be used to create LRUCaches only
+  // @param config_options
+  // @param value  The value might be:
+  //   - an old-style cache ("1M") -- equivalent to NewLRUCache(1024*102(
+  //   - Name-value option pairs -- "capacity=1M; num_shard_bits=4;
+  //     For the LRUCache, the values are defined in LRUCacheOptions.
+  // @param result The new Cache object
+  // @return OK if the cache was successfully created
+  // @return NotFound if an invalid name was specified in the value
+  // @return InvalidArgument if either the options were not valid
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<Cache>* result);
+
+ public:  // functions
+  // The type of the Cache
+  virtual const char* Name() const = 0;
+
+  // EXPERIMENTAL SecondaryCache support:
+  // Some APIs here are experimental and might change in the future.
+  // The Insert and Lookup APIs below are intended to allow cached objects
+  // to be demoted/promoted between the primary block cache and a secondary
+  // cache. The secondary cache could be a non-volatile cache, and will
+  // likely store the object in a different representation. They rely on a
+  // per object CacheItemHelper to do the conversions.
+  // The secondary cache may persist across process and system restarts,
+  // and may even be moved between hosts. Therefore, the cache key must
+  // be repeatable across restarts/reboots, and globally unique if
+  // multiple DBs share the same cache and the set of DBs can change
+  // over time.
+
+  // Insert a mapping from key->value into the volatile cache only
+  // and assign it with the specified charge against the total cache capacity.
+  // If strict_capacity_limit is true and cache reaches its full capacity,
+  // return Status::MemoryLimit.
+  //
+  // If handle is not nullptr, returns a handle that corresponds to the
+  // mapping. The caller must call this->Release(handle) when the returned
+  // mapping is no longer needed. In case of error caller is responsible to
+  // cleanup the value (i.e. calling "deleter").
+  //
+  // If handle is nullptr, it is as if Release is called immediately after
+  // insert. In case of error value will be cleanup.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter" which must delete the value.
+  // (The Cache is responsible for copying and reclaiming space for
+  // the key.)
+  virtual Status Insert(const Slice& key, void* value, size_t charge,
+                        DeleterFn deleter, Handle** handle = nullptr,
+                        Priority priority = Priority::LOW) = 0;
+
+  // EXPERIMENTAL
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity. If
+  // strict_capacity_limit is true and cache reaches its full capacity,
+  // return Status::MemoryLimit. `value` must be non-nullptr for this
+  // Insert() because Value() == nullptr is reserved for indicating failure
+  // with secondary-cache-compatible mappings.
+  //
+  // The helper argument is saved by the cache and will be used when the
+  // inserted object is evicted or promoted to the secondary cache. It,
+  // therefore, must outlive the cache.
+  //
+  // If handle is not nullptr, returns a handle that corresponds to the
+  // mapping. The caller must call this->Release(handle) when the returned
+  // mapping is no longer needed. In case of error caller is responsible to
+  // cleanup the value (i.e. calling "deleter").
+  //
+  // If handle is nullptr, it is as if Release is called immediately after
+  // insert. In case of error value will be cleanup.
+  //
+  // Regardless of whether the item was inserted into the cache,
+  // it will attempt to insert it into the secondary cache if one is
+  // configured, and the helper supports it.
+  // The cache implementation must support a secondary cache, otherwise
+  // the item is only inserted into the primary cache. It may
+  // defer the insertion to the secondary cache as it sees fit.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Status Insert(const Slice& key, void* value,
+                        const CacheItemHelper* helper, size_t charge,
+                        Handle** handle = nullptr,
+                        Priority priority = Priority::LOW) {
+    if (!helper) {
+      return Status::InvalidArgument();
+    }
+    return Insert(key, value, charge, helper->del_cb, handle, priority);
+  }
+
+  // If the cache has no mapping for "key", returns nullptr.
+  //
+  // Else return a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  // If stats is not nullptr, relative tickers could be used inside the
+  // function.
+  virtual Handle* Lookup(const Slice& key, Statistics* stats = nullptr) = 0;
+
+  // EXPERIMENTAL
+  // Lookup the key in the primary and secondary caches (if one is configured).
+  // The create_cb callback function object will be used to contruct the
+  // cached object.
+  // If none of the caches have the mapping for the key, returns nullptr.
+  // Else, returns a handle that corresponds to the mapping.
+  //
+  // This call may promote the object from the secondary cache (if one is
+  // configured, and has the given key) to the primary cache.
+  //
+  // The helper argument should be provided if the caller wants the lookup
+  // to include the secondary cache (if one is configured) and the object,
+  // if it exists, to be promoted to the primary cache. The helper may be
+  // saved and used later when the object is evicted. Therefore, it must
+  // outlive the cache.
+  //
+  // ======================== Async Lookup (wait=false) ======================
+  // When wait=false, the handle returned might be in any of three states:
+  // * Present - If Value() != nullptr, then the result is present and
+  // the handle can be used just as if wait=true.
+  // * Pending, not ready (IsReady() == false) - secondary cache is still
+  // working to retrieve the value. Might become ready any time.
+  // * Pending, ready (IsReady() == true) - secondary cache has the value
+  // but it has not been loaded into primary cache. Call to Wait()/WaitAll()
+  // will not block.
+  //
+  // IMPORTANT: Pending handles are not thread-safe, and only these functions
+  // are allowed on them: Value(), IsReady(), Wait(), WaitAll(). Even Release()
+  // can only come after Wait() or WaitAll() even though a reference is held.
+  //
+  // Only Wait()/WaitAll() gets a Handle out of a Pending state. (Waiting is
+  // safe and has no effect on other handle states.) After waiting on a Handle,
+  // it is in one of two states:
+  // * Present - if Value() != nullptr
+  // * Failed - if Value() == nullptr, such as if the secondary cache
+  // initially thought it had the value but actually did not.
+  //
+  // Note that given an arbitrary Handle, the only way to distinguish the
+  // Pending+ready state from the Failed state is to Wait() on it. A cache
+  // entry not compatible with secondary cache can also have Value()==nullptr
+  // like the Failed state, but this is not generally a concern.
+  virtual Handle* Lookup(const Slice& key, const CacheItemHelper* /*helper_cb*/,
+                         const CreateCallback& /*create_cb*/,
+                         Priority /*priority*/, bool /*wait*/,
+                         Statistics* stats = nullptr) {
+    return Lookup(key, stats);
+  }
+
+  // Increments the reference count for the handle if it refers to an entry in
+  // the cache. Returns true if refcount was incremented; otherwise, returns
+  // false.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual bool Ref(Handle* handle) = 0;
+
+  /**
+   * Release a mapping returned by a previous Lookup(). A released entry might
+   * still remain in cache in case it is later looked up by others. If
+   * erase_if_last_ref is set then it also erases it from the cache if there is
+   * no other reference to  it. Erasing it should call the deleter function that
+   * was provided when the entry was inserted.
+   *
+   * Returns true if the entry was also erased.
+   */
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0;
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains the entry for the key, erase it.  Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharding the same cache to partition the key space.  Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+  // sets the maximum configured capacity of the cache. When the new
+  // capacity is less than the old capacity and the existing usage is
+  // greater than new capacity, the implementation will do its best job to
+  // purge the released entries from the cache in order to lower the usage
+  virtual void SetCapacity(size_t capacity) = 0;
+
+  // Set whether to return error on insertion when cache reaches its full
+  // capacity.
+  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
+
+  // Get the flag whether to return error on insertion when cache reaches its
+  // full capacity.
+  virtual bool HasStrictCapacityLimit() const = 0;
+
+  // Returns the maximum configured capacity of the cache
+  virtual size_t GetCapacity() const = 0;
+
+  // Returns the memory size for the entries residing in the cache.
+  virtual size_t GetUsage() const = 0;
+
+  // Returns the number of entries currently tracked in the table. SIZE_MAX
+  // means "not supported." This is used for inspecting the load factor, along
+  // with GetTableAddressCount().
+  virtual size_t GetOccupancyCount() const { return SIZE_MAX; }
+
+  // Returns the number of ways the hash function is divided for addressing
+  // entries. Zero means "not supported." This is used for inspecting the load
+  // factor, along with GetOccupancyCount().
+  virtual size_t GetTableAddressCount() const { return 0; }
+
+  // Returns the memory size for a specific entry in the cache.
+  virtual size_t GetUsage(Handle* handle) const = 0;
+
+  // Returns the memory size for the entries in use by the system
+  virtual size_t GetPinnedUsage() const = 0;
+
+  // Returns the charge for the specific entry in the cache.
+  virtual size_t GetCharge(Handle* handle) const = 0;
+
+  // Returns the deleter for the specified entry. This might seem useless
+  // as the Cache itself is responsible for calling the deleter, but
+  // the deleter can essentially verify that a cache entry is of an
+  // expected type from an expected code source.
+  virtual DeleterFn GetDeleter(Handle* handle) const = 0;
+
+  // Call this on shutdown if you want to speed it up. Cache will disown
+  // any underlying data and will not free it on delete. This call will leak
+  // memory - call this only if you're shutting down the process.
+  // Any attempts of using cache after this call will fail terribly.
+  // Always delete the DB object before calling this method!
+  virtual void DisownData() {
+    // default implementation is noop
+  }
+
+  struct ApplyToAllEntriesOptions {
+    // If the Cache uses locks, setting `average_entries_per_lock` to
+    // a higher value suggests iterating over more entries each time a lock
+    // is acquired, likely reducing the time for ApplyToAllEntries but
+    // increasing latency for concurrent users of the Cache. Setting
+    // `average_entries_per_lock` to a smaller value could be helpful if
+    // callback is relatively expensive, such as using large data structures.
+    size_t average_entries_per_lock = 256;
+  };
+
+  // Apply a callback to all entries in the cache. The Cache must ensure
+  // thread safety but does not guarantee that a consistent snapshot of all
+  // entries is iterated over if other threads are operating on the Cache
+  // also.
+  virtual void ApplyToAllEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               DeleterFn deleter)>& callback,
+      const ApplyToAllEntriesOptions& opts) = 0;
+
+  // DEPRECATED version of above. (Default implementation uses above.)
+  virtual void ApplyToAllCacheEntries(void (*callback)(void* value,
+                                                       size_t charge),
+                                      bool /*thread_safe*/) {
+    ApplyToAllEntries([callback](const Slice&, void* value, size_t charge,
+                                 DeleterFn) { callback(value, charge); },
+                      {});
+  }
+
+  // Remove all entries.
+  // Prerequisite: no entry is referenced.
+  virtual void EraseUnRefEntries() = 0;
+
+  virtual std::string GetPrintableOptions() const { return ""; }
+
+  // Check for any warnings or errors in the operation of the cache and
+  // report them to the logger. This is intended only to be called
+  // periodically so does not need to be very efficient. (Obscure calling
+  // conventions for Logger inherited from env.h)
+  virtual void ReportProblems(
+      const std::shared_ptr<Logger>& /*info_log*/) const {}
+
+  MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); }
+
+  // EXPERIMENTAL
+  // Release a mapping returned by a previous Lookup(). The "useful"
+  // parameter specifies whether the data was actually used or not,
+  // which may be used by the cache implementation to decide whether
+  // to consider it as a hit for retention purposes. As noted elsewhere,
+  // "pending" handles require Wait()/WaitAll() before Release().
+  virtual bool Release(Handle* handle, bool /*useful*/,
+                       bool erase_if_last_ref) {
+    return Release(handle, erase_if_last_ref);
+  }
+
+  // EXPERIMENTAL
+  // Determines if the handle returned by Lookup() can give a value without
+  // blocking, though Wait()/WaitAll() might be required to publish it to
+  // Value(). See secondary cache compatible Lookup() above for details.
+  // This call is not thread safe on "pending" handles.
+  virtual bool IsReady(Handle* /*handle*/) { return true; }
+
+  // EXPERIMENTAL
+  // Convert a "pending" handle into a full thread-shareable handle by
+  // * If necessary, wait until secondary cache finishes loading the value.
+  // * Construct the value for primary cache and set it in the handle.
+  // Even after Wait() on a pending handle, the caller must check for
+  // Value() == nullptr in case of failure. This call is not thread-safe
+  // on pending handles. This call has no effect on non-pending handles.
+  // See secondary cache compatible Lookup() above for details.
+  virtual void Wait(Handle* /*handle*/) {}
+
+  // EXPERIMENTAL
+  // Wait for a vector of handles to become ready. As with Wait(), the user
+  // should check the Value() of each handle for nullptr. This call is not
+  // thread-safe on pending handles.
+  virtual void WaitAll(std::vector<Handle*>& /*handles*/) {}
+
+ private:
+  std::shared_ptr<MemoryAllocator> memory_allocator_;
+};
+
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/cache_bench_tool.h b/src/rocksdb/include/rocksdb/cache_bench_tool.h
new file mode 100644
index 000000000..413ce1593
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cache_bench_tool.h
@@ -0,0 +1,14 @@
+// Copyright (c) 2013-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+int cache_bench_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/cleanable.h b/src/rocksdb/include/rocksdb/cleanable.h
new file mode 100644
index 000000000..afc736673
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/cleanable.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cleanable {
+ public:
+  Cleanable();
+  // No copy constructor and copy assignment allowed.
+  Cleanable(Cleanable&) = delete;
+  Cleanable& operator=(Cleanable&) = delete;
+
+  // Executes all the registered cleanups
+  ~Cleanable();
+
+  // Move constructor and move assignment is allowed.
+  Cleanable(Cleanable&&) noexcept;
+  Cleanable& operator=(Cleanable&&) noexcept;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this iterator is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not abstract and therefore clients should not override it.
+  using CleanupFunction = void (*)(void* arg1, void* arg2);
+
+  // Add another Cleanup to the list
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+  // Move the cleanups owned by this Cleanable to another Cleanable, adding to
+  // any existing cleanups it has
+  void DelegateCleanupsTo(Cleanable* other);
+
+  // DoCleanup and also resets the pointers for reuse
+  inline void Reset() {
+    DoCleanup();
+    cleanup_.function = nullptr;
+    cleanup_.next = nullptr;
+  }
+
+  inline bool HasCleanups() { return cleanup_.function != nullptr; }
+
+ protected:
+  struct Cleanup {
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;
+  };
+  Cleanup cleanup_;
+  // It also becomes the owner of c
+  void RegisterCleanup(Cleanup* c);
+
+ private:
+  // Performs all the cleanups. It does not reset the pointers. Making it
+  // private
+  // to prevent misuse
+  inline void DoCleanup() {
+    if (cleanup_.function != nullptr) {
+      (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+      for (Cleanup* c = cleanup_.next; c != nullptr;) {
+        (*c->function)(c->arg1, c->arg2);
+        Cleanup* next = c->next;
+        delete c;
+        c = next;
+      }
+    }
+  }
+};
+
+// A copyable, reference-counted pointer to a simple Cleanable that only
+// performs registered cleanups after all copies are destroy. This is like
+// shared_ptr<Cleanable> but works more efficiently with wrapping the pointer
+// in an outer Cleanable (see RegisterCopyWith() and MoveAsCleanupTo()).
+// WARNING: if you create a reference cycle, for example:
+//   SharedCleanablePtr scp;
+//   scp.Allocate();
+//   scp.RegisterCopyWith(&*scp);
+// It will prevent cleanups from ever happening!
+class SharedCleanablePtr {
+ public:
+  // Empy/null pointer
+  SharedCleanablePtr() {}
+  // Copy and move constructors and assignment
+  SharedCleanablePtr(const SharedCleanablePtr& from);
+  SharedCleanablePtr(SharedCleanablePtr&& from) noexcept;
+  SharedCleanablePtr& operator=(const SharedCleanablePtr& from);
+  SharedCleanablePtr& operator=(SharedCleanablePtr&& from) noexcept;
+  // Destructor (decrement refcount if non-null)
+  ~SharedCleanablePtr();
+  // Create a new simple Cleanable and make this assign this pointer to it.
+  // (Reset()s first if necessary.)
+  void Allocate();
+  // Reset to empty/null (decrement refcount if previously non-null)
+  void Reset();
+  // Dereference to pointed-to Cleanable
+  Cleanable& operator*();
+  Cleanable* operator->();
+  // Get as raw pointer to Cleanable
+  Cleanable* get();
+
+  // Creates a (virtual) copy of this SharedCleanablePtr and registers its
+  // destruction with target, so that the cleanups registered with the
+  // Cleanable pointed to by this can only happen after the cleanups in the
+  // target Cleanable are run.
+  // No-op if this is empty (nullptr).
+  void RegisterCopyWith(Cleanable* target);
+
+  // Moves (virtually) this shared pointer to a new cleanup in the target.
+  // This is essentilly a move semantics version of RegisterCopyWith(), for
+  // performance optimization. No-op if this is empty (nullptr).
+  void MoveAsCleanupTo(Cleanable* target);
+
+ private:
+  struct Impl;
+  Impl* ptr_ = nullptr;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/compaction_filter.h b/src/rocksdb/include/rocksdb/compaction_filter.h
new file mode 100644
index 000000000..9c6a9c30d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compaction_filter.h
@@ -0,0 +1,256 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class SliceTransform;
+
+// CompactionFilter allows an application to modify/delete a key-value during
+// table file creation.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionFilter : public Customizable {
+ public:
+  enum ValueType {
+    kValue,
+    kMergeOperand,
+    kBlobIndex,  // used internally by BlobDB.
+  };
+
+  enum class Decision {
+    kKeep,
+    kRemove,
+    kChangeValue,
+    kRemoveAndSkipUntil,
+    kChangeBlobIndex,  // used internally by BlobDB.
+    kIOError,          // used internally by BlobDB.
+    kPurge,            // used for keys that can only be SingleDelete'ed
+    kUndetermined,
+  };
+
+  enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError };
+
+  // Context information for a table file creation.
+  struct Context {
+    // Whether this table file is created as part of a compaction including all
+    // table files.
+    bool is_full_compaction;
+    // Whether this table file is created as part of a compaction requested by
+    // the client.
+    bool is_manual_compaction;
+    // The column family that will contain the created table file.
+    uint32_t column_family_id;
+    // Reason this table file is being created.
+    TableFileCreationReason reason;
+  };
+
+  virtual ~CompactionFilter() {}
+  static const char* Type() { return "CompactionFilter"; }
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& name,
+                                 const CompactionFilter** result);
+
+  // The table file creation process invokes this method before adding a kv to
+  // the table file. A return value of false indicates that the kv should be
+  // preserved in the new table file and a return value of true indicates
+  // that this key-value should be removed from the new table file. The
+  // application can inspect the existing value of the key and make decision
+  // based on it.
+  //
+  // Key-Values that are results of merge operation during table file creation
+  // are not passed into this function. Currently, when you have a mix of Put()s
+  // and Merge()s on a same key, we only guarantee to process the merge operands
+  // through the `CompactionFilter`s. Put()s might be processed, or might not.
+  //
+  // When the value is to be preserved, the application has the option
+  // to modify the existing_value and pass it back through new_value.
+  // value_changed needs to be set to true in this case.
+  //
+  // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a
+  // DB* object) will not guarantee to preserve the state of the DB with
+  // CompactionFilter. Data seen from a snapshot might disappear after a
+  // table file created with a `CompactionFilter` is installed. If you use
+  // snapshots, think twice about whether you want to use `CompactionFilter` and
+  // whether you are using it in a safe way.
+  //
+  // If multithreaded compaction is being used *and* a single CompactionFilter
+  // instance was supplied via Options::compaction_filter, this method may be
+  // called from different threads concurrently.  The application must ensure
+  // that the call is thread-safe.
+  //
+  // If the CompactionFilter was created by a factory, then it will only ever
+  // be used by a single thread that is doing the table file creation, and this
+  // call does not need to be thread-safe.  However, multiple filters may be
+  // in existence and operating concurrently.
+  virtual bool Filter(int /*level*/, const Slice& /*key*/,
+                      const Slice& /*existing_value*/,
+                      std::string* /*new_value*/,
+                      bool* /*value_changed*/) const {
+    return false;
+  }
+
+  // The table file creation process invokes this method on every merge operand.
+  // If this method returns true, the merge operand will be ignored and not
+  // written out in the new table file.
+  //
+  // Note: If you are using a TransactionDB, it is not recommended to implement
+  // FilterMergeOperand().  If a Merge operation is filtered out, TransactionDB
+  // may not realize there is a write conflict and may allow a Transaction to
+  // Commit that should have failed.  Instead, it is better to implement any
+  // Merge filtering inside the MergeOperator.
+  virtual bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
+                                  const Slice& /*operand*/) const {
+    return false;
+  }
+
+  // An extended API. Called for both values and merge operands.
+  // Allows changing value and skipping ranges of keys.
+  // The default implementation uses Filter() and FilterMergeOperand().
+  // If you're overriding this method, no need to override the other two.
+  // `value_type` indicates whether this key-value corresponds to a normal
+  // value (e.g. written with Put())  or a merge operand (written with Merge()).
+  //
+  // Possible return values:
+  //  * kKeep - keep the key-value pair.
+  //  * kRemove - remove the key-value pair or merge operand.
+  //  * kChangeValue - keep the key and change the value/operand to *new_value.
+  //  * kRemoveAndSkipUntil - remove this key-value pair, and also remove
+  //      all key-value pairs with key in [key, *skip_until). This range
+  //      of keys will be skipped without reading, potentially saving some
+  //      IO operations compared to removing the keys one by one.
+  //
+  //      *skip_until <= key is treated the same as Decision::kKeep
+  //      (since the range [key, *skip_until) is empty).
+  //
+  //      Caveats:
+  //       - The keys are skipped even if there are snapshots containing them,
+  //         i.e. values removed by kRemoveAndSkipUntil can disappear from a
+  //         snapshot - beware if you're using TransactionDB or
+  //         DB::GetSnapshot().
+  //       - If value for a key was overwritten or merged into (multiple Put()s
+  //         or Merge()s), and `CompactionFilter` skips this key with
+  //         kRemoveAndSkipUntil, it's possible that it will remove only
+  //         the new value, exposing the old value that was supposed to be
+  //         overwritten.
+  //       - Doesn't work with PlainTableFactory in prefix mode.
+  //       - If you use kRemoveAndSkipUntil for table files created by
+  //         compaction, consider also reducing compaction_readahead_size
+  //         option.
+  //
+  // Should never return kUndetermined.
+  // Note: If you are using a TransactionDB, it is not recommended to filter
+  // out or modify merge operands (ValueType::kMergeOperand).
+  // If a merge operation is filtered out, TransactionDB may not realize there
+  // is a write conflict and may allow a Transaction to Commit that should have
+  // failed. Instead, it is better to implement any Merge filtering inside the
+  // MergeOperator.
+  // key includes timestamp if user-defined timestamp is enabled.
+  virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
+                            const Slice& existing_value, std::string* new_value,
+                            std::string* /*skip_until*/) const {
+    switch (value_type) {
+      case ValueType::kValue: {
+        bool value_changed = false;
+        bool rv = Filter(level, key, existing_value, new_value, &value_changed);
+        if (rv) {
+          return Decision::kRemove;
+        }
+        return value_changed ? Decision::kChangeValue : Decision::kKeep;
+      }
+      case ValueType::kMergeOperand: {
+        bool rv = FilterMergeOperand(level, key, existing_value);
+        return rv ? Decision::kRemove : Decision::kKeep;
+      }
+      case ValueType::kBlobIndex:
+        return Decision::kKeep;
+    }
+    assert(false);
+    return Decision::kKeep;
+  }
+
+  // Internal (BlobDB) use only. Do not override in application code.
+  virtual BlobDecision PrepareBlobOutput(const Slice& /* key */,
+                                         const Slice& /* existing_value */,
+                                         std::string* /* new_value */) const {
+    return BlobDecision::kKeep;
+  }
+
+  // This function is deprecated. Snapshots will always be ignored for
+  // `CompactionFilter`s, because we realized that not ignoring snapshots
+  // doesn't provide the guarantee we initially thought it would provide.
+  // Repeatable reads will not be guaranteed anyway. If you override the
+  // function and returns false, we will fail the table file creation.
+  virtual bool IgnoreSnapshots() const { return true; }
+
+  // Returns a name that identifies this `CompactionFilter`.
+  // The name will be printed to LOG file on start up for diagnosis.
+  const char* Name() const override = 0;
+
+  // Internal (BlobDB) use only. Do not override in application code.
+  virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; }
+
+  // In the case of BlobDB, it may be possible to reach a decision with only
+  // the key without reading the actual value. Keys whose value_type is
+  // kBlobIndex will be checked by this method.
+  // Returning kUndetermined will cause FilterV2() to be called to make a
+  // decision as usual.
+  virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/,
+                                   std::string* /*new_value*/,
+                                   std::string* /*skip_until*/) const {
+    return Decision::kUndetermined;
+  }
+};
+
+// Each thread of work involving creating table files will create a new
+// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This
+// allows the application to know about the different ongoing threads of work
+// and makes it unnecessary for `CompactionFilter` to provide thread-safety.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionFilterFactory : public Customizable {
+ public:
+  virtual ~CompactionFilterFactory() {}
+  static const char* Type() { return "CompactionFilterFactory"; }
+  static Status CreateFromString(
+      const ConfigOptions& config_options, const std::string& name,
+      std::shared_ptr<CompactionFilterFactory>* result);
+
+  // Returns whether a thread creating table files for the specified `reason`
+  // should invoke `CreateCompactionFilter()` and pass KVs through the returned
+  // filter.
+  virtual bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const {
+    // For backward compatibility, default implementation only applies
+    // `CompactionFilter` to files generated by compaction.
+    return reason == TableFileCreationReason::kCompaction;
+  }
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) = 0;
+
+  // Returns a name that identifies this `CompactionFilter` factory.
+  virtual const char* Name() const override = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/compaction_job_stats.h b/src/rocksdb/include/rocksdb/compaction_job_stats.h
new file mode 100644
index 000000000..5ff8eccc8
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compaction_job_stats.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct CompactionJobStats {
+  CompactionJobStats() { Reset(); }
+  void Reset();
+  // Aggregate the CompactionJobStats from another instance with this one
+  void Add(const CompactionJobStats& stats);
+
+  // the elapsed time of this compaction in microseconds.
+  uint64_t elapsed_micros;
+
+  // the elapsed CPU time of this compaction in microseconds.
+  uint64_t cpu_micros;
+
+  // the number of compaction input records.
+  uint64_t num_input_records;
+  // the number of blobs read from blob files
+  uint64_t num_blobs_read;
+  // the number of compaction input files (table files)
+  size_t num_input_files;
+  // the number of compaction input files at the output level (table files)
+  size_t num_input_files_at_output_level;
+
+  // the number of compaction output records.
+  uint64_t num_output_records;
+  // the number of compaction output files (table files)
+  size_t num_output_files;
+  // the number of compaction output files (blob files)
+  size_t num_output_files_blob;
+
+  // true if the compaction is a full compaction (all live SST files input)
+  bool is_full_compaction;
+  // true if the compaction is a manual compaction
+  bool is_manual_compaction;
+
+  // the total size of table files in the compaction input
+  uint64_t total_input_bytes;
+  // the total size of blobs read from blob files
+  uint64_t total_blob_bytes_read;
+  // the total size of table files in the compaction output
+  uint64_t total_output_bytes;
+  // the total size of blob files in the compaction output
+  uint64_t total_output_bytes_blob;
+
+  // number of records being replaced by newer record associated with same key.
+  // this could be a new value or a deletion entry for that key so this field
+  // sums up all updated and deleted keys
+  uint64_t num_records_replaced;
+
+  // the sum of the uncompressed input keys in bytes.
+  uint64_t total_input_raw_key_bytes;
+  // the sum of the uncompressed input values in bytes.
+  uint64_t total_input_raw_value_bytes;
+
+  // the number of deletion entries before compaction. Deletion entries
+  // can disappear after compaction because they expired
+  uint64_t num_input_deletion_records;
+  // number of deletion records that were found obsolete and discarded
+  // because it is not possible to delete any more keys with this entry
+  // (i.e. all possible deletions resulting from it have been completed)
+  uint64_t num_expired_deletion_records;
+
+  // number of corrupt keys (ParseInternalKey returned false when applied to
+  // the key) encountered and written out.
+  uint64_t num_corrupt_keys;
+
+  // Following counters are only populated if
+  // options.report_bg_io_stats = true;
+
+  // Time spent on file's Append() call.
+  uint64_t file_write_nanos;
+
+  // Time spent on sync file range.
+  uint64_t file_range_sync_nanos;
+
+  // Time spent on file fsync.
+  uint64_t file_fsync_nanos;
+
+  // Time spent on preparing file write (fallocate, etc)
+  uint64_t file_prepare_write_nanos;
+
+  // 0-terminated strings storing the first 8 bytes of the smallest and
+  // largest key in the output.
+  static const size_t kMaxPrefixLength = 8;
+
+  std::string smallest_output_key_prefix;
+  std::string largest_output_key_prefix;
+
+  // number of single-deletes which do not meet a put
+  uint64_t num_single_del_fallthru;
+
+  // number of single-deletes which meet something other than a put
+  uint64_t num_single_del_mismatch;
+
+  // TODO: Add output_to_penultimate_level output information
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/comparator.h b/src/rocksdb/include/rocksdb/comparator.h
new file mode 100644
index 000000000..ad1e71a11
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/comparator.h
@@ -0,0 +1,164 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// The general interface for comparing two Slices are defined for both of
+// Comparator and some internal data structures.
+class CompareInterface {
+ public:
+  virtual ~CompareInterface() {}
+
+  // Three-way comparison.  Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  // Note that Compare(a, b) also compares timestamp if timestamp size is
+  // non-zero. For the same user key with different timestamps, larger (newer)
+  // timestamp comes first.
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+};
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.  A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Comparator : public Customizable, public CompareInterface {
+ public:
+  Comparator() : timestamp_size_(0) {}
+
+  Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {}
+
+  Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {}
+
+  Comparator& operator=(const Comparator& rhs) {
+    if (this != &rhs) {
+      timestamp_size_ = rhs.timestamp_size_;
+    }
+    return *this;
+  }
+
+  ~Comparator() override {}
+
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& id,
+                                 const Comparator** comp);
+  static const char* Type() { return "Comparator"; }
+
+  // The name of the comparator.  Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator.
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  const char* Name() const override = 0;
+
+  // Compares two slices for equality. The following invariant should always
+  // hold (and is the default implementation):
+  //   Equal(a, b) iff Compare(a, b) == 0
+  // Overwrite only if equality comparisons can be done more efficiently than
+  // three-way comparisons.
+  virtual bool Equal(const Slice& a, const Slice& b) const {
+    return Compare(a, b) == 0;
+  }
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+
+  // given two keys, determine if t is the successor of s
+  // BUG: only return true if no other keys starting with `t` are ordered
+  // before `t`. Otherwise, the auto_prefix_mode can omit entries within
+  // iterator bounds that have same prefix as upper bound but different
+  // prefix from seek key.
+  virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/,
+                                              const Slice& /*t*/) const {
+    return false;
+  }
+
+  // return true if two keys with different byte sequences can be regarded
+  // as equal by this comparator.
+  // The major use case is to determine if DataBlockHashIndex is compatible
+  // with the customized comparator.
+  virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; }
+
+  // if it is a wrapped comparator, may return the root one.
+  // return itself it is not wrapped.
+  virtual const Comparator* GetRootComparator() const { return this; }
+
+  inline size_t timestamp_size() const { return timestamp_size_; }
+
+  int CompareWithoutTimestamp(const Slice& a, const Slice& b) const {
+    return CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true);
+  }
+
+  // For two events e1 and e2 whose timestamps are t1 and t2 respectively,
+  // Returns value:
+  // < 0  iff t1 < t2
+  // == 0 iff t1 == t2
+  // > 0  iff t1 > t2
+  // Note that an all-zero byte array will be the smallest (oldest) timestamp
+  // of the same length, and a byte array with all bits 1 will be the largest.
+  // In the future, we can extend Comparator so that subclasses can specify
+  // both largest and smallest timestamps.
+  virtual int CompareTimestamp(const Slice& /*ts1*/,
+                               const Slice& /*ts2*/) const {
+    return 0;
+  }
+
+  virtual int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/,
+                                      const Slice& b, bool /*b_has_ts*/) const {
+    return Compare(a, b);
+  }
+
+  virtual bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const {
+    return 0 ==
+           CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true);
+  }
+
+ private:
+  size_t timestamp_size_;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering.  The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+// Return a builtin comparator that uses reverse lexicographic byte-wise
+// ordering.
+extern const Comparator* ReverseBytewiseComparator();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/compression_type.h b/src/rocksdb/include/rocksdb/compression_type.h
new file mode 100644
index 000000000..bfeb00bde
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compression_type.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs.  Each block may be compressed before
+// being stored in a file.  The following enum describes which
+// compression method (if any) is used to compress a block.
+
+enum CompressionType : unsigned char {
+  // NOTE: do not change the values of existing entries, as these are
+  // part of the persistent format on disk.
+  kNoCompression = 0x0,
+  kSnappyCompression = 0x1,
+  kZlibCompression = 0x2,
+  kBZip2Compression = 0x3,
+  kLZ4Compression = 0x4,
+  kLZ4HCCompression = 0x5,
+  kXpressCompression = 0x6,
+  kZSTD = 0x7,
+
+  // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than
+  // 0.8.0 or consider a possibility of downgrading the service or copying
+  // the database files to another service running with an older version of
+  // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will
+  // eventually remove the option from the public API.
+  kZSTDNotFinalCompression = 0x40,
+
+  // kDisableCompressionOption is used to disable some compression options.
+  kDisableCompressionOption = 0xff,
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/concurrent_task_limiter.h b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h
new file mode 100644
index 000000000..9ad741f98
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/concurrent_task_limiter.h
@@ -0,0 +1,51 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is NOT an extensible interface but a public interface for result of
+// NewConcurrentTaskLimiter. Any derived classes must be RocksDB internal.
+class ConcurrentTaskLimiter {
+ public:
+  virtual ~ConcurrentTaskLimiter() {}
+
+  // Returns a name that identifies this concurrent task limiter.
+  virtual const std::string& GetName() const = 0;
+
+  // Set max concurrent tasks.
+  // limit = 0 means no new task allowed.
+  // limit < 0 means no limitation.
+  virtual void SetMaxOutstandingTask(int32_t limit) = 0;
+
+  // Reset to unlimited max concurrent task.
+  virtual void ResetMaxOutstandingTask() = 0;
+
+  // Returns current outstanding task count.
+  virtual int32_t GetOutstandingTask() const = 0;
+};
+
+// Create a ConcurrentTaskLimiter that can be shared with multiple CFs
+// across RocksDB instances to control concurrent tasks.
+//
+// @param name: Name of the limiter.
+// @param limit: max concurrent tasks.
+//        limit = 0 means no new task allowed.
+//        limit < 0 means no limitation.
+extern ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name,
+                                                       int32_t limit);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/configurable.h b/src/rocksdb/include/rocksdb/configurable.h
new file mode 100644
index 000000000..60ae89f97
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/configurable.h
@@ -0,0 +1,400 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+class ObjectRegistry;
+class OptionTypeInfo;
+struct ColumnFamilyOptions;
+struct ConfigOptions;
+struct DBOptions;
+
+// Configurable is a base class used by the rocksdb that describes a
+// standard way of configuring objects.  A Configurable object can:
+//   -> Populate itself given:
+//        - One or more "name/value" pair strings
+//        - A string representing the set of name=value properties
+//        - A map of name/value properties.
+//   -> Convert itself into its string representation
+//   -> Dump itself to a Logger
+//   -> Compare itself to another Configurable object to see if the two objects
+// have equivalent options settings
+//
+// If a derived class calls RegisterOptions to register (by name) how its
+// options objects are to be processed, this functionality can typically be
+// handled by this class without additional overrides. Otherwise, the derived
+// class will need to implement the methods for handling the corresponding
+// functionality.
+class Configurable {
+ protected:
+  friend class ConfigurableHelper;
+  struct RegisteredOptions {
+    // The name of the options being registered
+    std::string name;
+    // Pointer to the object being registered
+    void* opt_ptr;
+#ifndef ROCKSDB_LITE
+    // The map of options being registered
+    const std::unordered_map<std::string, OptionTypeInfo>* type_map;
+#endif
+  };
+
+ public:
+  virtual ~Configurable() {}
+
+  // Returns the raw pointer of the named options that is used by this
+  // object, or nullptr if this function is not supported.
+  // Since the return value is a raw pointer, the object owns the
+  // pointer and the caller should not delete the pointer.
+  //
+  // Note that changing the underlying options while the object
+  // is currently used by any open DB is undefined behavior.
+  // Developers should use DB::SetOption() instead to dynamically change
+  // options while the DB is open.
+  template <typename T>
+  const T* GetOptions() const {
+    return GetOptions<T>(T::kName());
+  }
+  template <typename T>
+  T* GetOptions() {
+    return GetOptions<T>(T::kName());
+  }
+  template <typename T>
+  const T* GetOptions(const std::string& name) const {
+    return reinterpret_cast<const T*>(GetOptionsPtr(name));
+  }
+  template <typename T>
+  T* GetOptions(const std::string& name) {
+    return reinterpret_cast<T*>(const_cast<void*>(GetOptionsPtr(name)));
+  }
+
+  // Configures the options for this class based on the input parameters.
+  // On successful completion, the object is updated with the settings from
+  // the opt_map.
+  // If this method fails, an attempt is made to revert the object to original
+  // state. Note that the revert may not be the original state but may be an
+  // equivalent. For example, if the object contains an option that is a
+  // shared_ptr, the shared_ptr may not be the original one but a copy (e.g. not
+  // the Cache object that was passed in, but a Cache object of the same size).
+  //
+  // The acceptable values of the name/value pairs are documented with the
+  // specific class/instance.
+  //
+  // @param config_options Controls how the arguments are processed.
+  // @param opt_map Name/value pairs of the options to update
+  // @param unused If specified, this value will return the name/value
+  //      pairs from opt_map that were NotFound for this object.
+  // @return OK If all values in the map were successfully updated
+  //      If invoke_prepare_options is true, OK also implies
+  //      PrepareOptions ran successfully.
+  // @return NotFound If any of the names in the opt_map were not valid
+  //      for this object.  If unused is specified, it will contain the
+  //      collection of NotFound names.
+  // @return NotSupported  If any of the names are valid but the object does
+  //       not know how to convert the value.  This can happen if, for example,
+  //       there is some nested Configurable that cannot be created.
+  // @return InvalidArgument If any of the values cannot be successfully
+  //       parsed.  This can also be returned if PrepareOptions encounters an
+  //       error.
+  // @see ConfigOptions for a description of the controls.
+  Status ConfigureFromMap(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opt_map);
+  Status ConfigureFromMap(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opt_map,
+      std::unordered_map<std::string, std::string>* unused);
+
+#ifndef ROCKSDB_LITE
+  // Updates the named option to the input value, returning OK if successful.
+  // Note that ConfigureOption does not cause PrepareOptions to be invoked.
+  // @param config_options Controls how the name/value is processed.
+  // @param name The name of the option to update
+  // @param value The value to set for the named option
+  // @return OK If the named field was successfully updated to value.
+  // @return NotFound If the name is not valid for this object.
+  // @return NotSupported  If the name is valid but the object does
+  //       not know how to convert the value.  This can happen if, for example,
+  //       there is some nested Configurable that cannot be created.
+  // @return InvalidArgument If the value cannot be successfully  parsed.
+  Status ConfigureOption(const ConfigOptions& config_options,
+                         const std::string& name, const std::string& value);
+#endif  // ROCKSDB_LITE
+
+  // Configures the options for this class based on the input parameters.
+  // On successful completion, the object is updated with the settings from
+  // the opt_map.  If this method fails, an attempt is made to revert the
+  // object to original state.  Note that the revert may not be the original
+  // state but may be an equivalent.
+  // @see ConfigureFromMap for more details
+  // @param config_options Controls how the arguments are processed.
+  // @param opt_str string containing the values to update.
+  // @param unused If specified, this value will return the name/value
+  //      pairs from opt_map that were NotFound for this object.
+  // @return OK If all specified values were successfully updated
+  //      If invoke_prepare_options is true, OK also implies
+  //      PrepareOptions ran successfully.
+  // @return NotFound If any of the names were not valid for this object.
+  //      If unused is specified, it will contain the collection of NotFound
+  //      names.
+  // @return NotSupported  If any of the names are valid but the object does
+  //       not know how to convert the value.  This can happen if, for example,
+  //       there is some nested Configurable that cannot be created.
+  // @return InvalidArgument If any of the values cannot be successfully
+  //       parsed.  This can also be returned if PrepareOptions encounters an
+  //       error.
+  Status ConfigureFromString(const ConfigOptions& config_options,
+                             const std::string& opts);
+
+  // Fills in result with the serialized options for this object.
+  // This is the inverse of ConfigureFromString.
+  // @param config_options Controls how serialization happens.
+  // @param result The string representation of this object.
+  // @return OK If the options for this object were successfully serialized.
+  // @return InvalidArgument If one or more of the options could not be
+  // serialized.
+  Status GetOptionString(const ConfigOptions& config_options,
+                         std::string* result) const;
+#ifndef ROCKSDB_LITE
+  // Returns the serialized options for this object.
+  // This method is similar to GetOptionString with no errors.
+  // @param config_options Controls how serialization happens.
+  // @param prefix A string to prepend to every option.
+  // @return The serialized representation of the options for this object
+  std::string ToString(const ConfigOptions& config_options) const {
+    return ToString(config_options, "");
+  }
+  std::string ToString(const ConfigOptions& config_options,
+                       const std::string& prefix) const;
+
+  // Returns the list of option names associated with this configurable
+  // @param config_options Controls how the names are returned
+  // @param result The set of option names for this object. Note that
+  //      options that are deprecated or aliases are not returned.
+  // @return OK on success.
+  Status GetOptionNames(const ConfigOptions& config_options,
+                        std::unordered_set<std::string>* result) const;
+
+  // Returns the value of the option associated with the input name
+  // This method is the functional inverse of ConfigureOption
+  // @param config_options Controls how the value is returned
+  // @param name The name of the option to return a value for.
+  // @param value The returned value associated with the named option.
+  // @return OK If the named field was successfully updated to value.
+  // @return NotFound If the name is not valid for this object.
+  // @param InvalidArgument If the name is valid for this object but
+  //      its value cannot be serialized.
+  virtual Status GetOption(const ConfigOptions& config_options,
+                           const std::string& name, std::string* value) const;
+#endif  // ROCKSDB_LITE
+
+  // Checks to see if this Configurable is equivalent to other.
+  // This method assumes that the two objects are of the same class.
+  // @param config_options Controls how the options are compared.
+  // @param other The other object to compare to.
+  // @param mismatch If the objects do not match, this parameter contains
+  //      the name of the option that triggered the match failure.
+  // @param True if the objects match, false otherwise.
+  virtual bool AreEquivalent(const ConfigOptions& config_options,
+                             const Configurable* other,
+                             std::string* name) const;
+
+  // Returns a pretty-printed, human-readable version of the options.
+  // This method is typically used to dump the options to a log file.
+  // Classes should override this method
+  virtual std::string GetPrintableOptions() const { return ""; }
+
+  // Validates that the settings are valid/consistent and performs any object
+  // initialization required by this object.  This method may be called as part
+  // of Configure (if invoke_prepare_options is set), or may be invoked
+  // separately.
+  //
+  // Once an object has been prepared, non-mutable options can no longer be
+  // updated.
+  //
+  // Classes must override this method to provide any implementation-specific
+  // initialization, such as opening log files or setting up cache parameters.
+  // Implementations should be idempotent (e.g. don't re-open the log file or
+  // reconfigure the cache), as there is the potential this method can be called
+  // more than once.
+  //
+  // By default, this method will also prepare all nested (Inner and
+  // OptionType::kConfigurable) objects.
+  //
+  // @param config_options Controls how the object is prepared.  Also contains
+  //      a Logger and Env that can be used to initialize this object.
+  // @return OK If the object was successfully initialized.
+  // @return InvalidArgument If this object could not be successfully
+  // initialized.
+  virtual Status PrepareOptions(const ConfigOptions& config_options);
+
+  // Checks to see if the settings are valid for this object.
+  // This method checks to see if the input DBOptions and ColumnFamilyOptions
+  // are valid for the settings of this object.  For example, an Env might not
+  // support certain mmap modes or a TableFactory might require certain
+  // settings.
+  //
+  // By default, this method will also validate all nested (Inner and
+  // OptionType::kConfigurable) objects.
+  //
+  // @param db_opts The DBOptions to validate
+  // @param cf_opts The ColumnFamilyOptions to validate
+  // @return OK if the options are valid
+  // @return InvalidArgument If the arguments are not valid for the options
+  //       of the current object.
+  virtual Status ValidateOptions(const DBOptions& db_opts,
+                                 const ColumnFamilyOptions& cf_opts) const;
+
+  // Splits the input opt_value into the ID field and the remaining options.
+  // The input opt_value can be in the form of "name" or "name=value
+  // [;name=value]". The first form uses the "name" as an id with no options The
+  // latter form converts the input into a map of name=value pairs and sets "id"
+  // to the "id" value from the map.
+  // @param opt_value The value to split into id and options
+  // @param id The id field from the opt_value
+  // @param options The remaining name/value pairs from the opt_value
+  // @param default_id If specified and there is no id field in the map, this
+  // value is returned as the ID
+  // @return OK if the value was converted to a map successfully and an ID was
+  // found.
+  // @return InvalidArgument if the value could not be converted to a map or
+  // there was or there is no id property in the map.
+  static Status GetOptionsMap(
+      const std::string& opt_value, const std::string& default_id,
+      std::string* id, std::unordered_map<std::string, std::string>* options);
+
+ protected:
+  // Returns the raw pointer for the associated named option.
+  // The name is typically the name of an option registered via the
+  // Classes may override this method to provide further specialization (such as
+  // returning a sub-option)
+  //
+  // The default implementation looks at the registered options.  If the
+  // input name matches that of a registered option, the pointer registered
+  // with that name is returned.
+  // e.g,, RegisterOptions("X", &my_ptr, ...); GetOptionsPtr("X") returns
+  // "my_ptr"
+  virtual const void* GetOptionsPtr(const std::string& name) const;
+
+  // Method for allowing options to be configured outside of the normal
+  // registered options framework.  Classes may override this method if they
+  // wish to support non-standard options implementations (such as configuring
+  // themselves from constant or simple ":"-separated strings.
+  //
+  // The default implementation does nothing and returns OK
+  virtual Status ParseStringOptions(const ConfigOptions& config_options,
+                                    const std::string& opts_str);
+
+  // Internal method to configure an object from a map of name-value options.
+  // This method uses the input config_options to drive the configuration of
+  // the options in opt_map.  Any option name that cannot be found from the
+  // input set will be returned in "unused".
+  //
+  // Classes may override this method to extend the functionality if required.
+  // @param config_options Controls how the options are configured and errors
+  // handled.
+  // @param opts_map The set of options to configure
+  // @param unused Any options from opt_map that were not configured.
+  // @returns a Status based on the rules outlined in ConfigureFromMap
+  virtual Status ConfigureOptions(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opts_map,
+      std::unordered_map<std::string, std::string>* unused);
+
+#ifndef ROCKSDB_LITE
+  // Method that configures a the specific opt_name from opt_value.
+  // By default, this method calls opt_info.ParseOption with the
+  // input parameters.
+  // Classes may override this method to extend the functionality, or
+  // change the returned Status.
+  virtual Status ParseOption(const ConfigOptions& config_options,
+                             const OptionTypeInfo& opt_info,
+                             const std::string& opt_name,
+                             const std::string& opt_value, void* opt_ptr);
+
+  // Internal method to see if the single option name/info matches for this and
+  // that Classes may override this value to change its behavior.
+  // @param config_options Controls how the options are being matched
+  // @param opt_info The OptionTypeInfo registered for this option name
+  //      that controls what field is matched (offset) and how (type).
+  // @param name The name associated with this opt_info.
+  // @param this_ptr The base pointer to compare to.  This is the object
+  // registered for
+  //      for this OptionTypeInfo.
+  // @param that_ptr The other pointer to compare to.  This is the object
+  // registered for
+  //      for this OptionTypeInfo.
+  // @param bad_name  If the match fails, the name of the option that failed to
+  // match.
+  virtual bool OptionsAreEqual(const ConfigOptions& config_options,
+                               const OptionTypeInfo& opt_info,
+                               const std::string& name,
+                               const void* const this_ptr,
+                               const void* const that_ptr,
+                               std::string* bad_name) const;
+#endif
+#ifndef ROCKSDB_LITE
+  // Internal method to serialize options (ToString)
+  // Classes may override this value to change its behavior.
+  virtual std::string SerializeOptions(const ConfigOptions& config_options,
+                                       const std::string& header) const;
+#endif  // ROCKSDB_LITE
+
+  //  Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt)
+  virtual std::string GetOptionName(const std::string& long_name) const;
+
+  // Registers the input name with the options and associated map.
+  // When classes register their options in this manner, most of the
+  // functionality (excluding unknown options and validate/prepare) is
+  // implemented by the base class.
+  //
+  // This method should be called in the class constructor to register the
+  // option set for this object.  For example, to register the options
+  // associated with the BlockBasedTableFactory, the constructor calls this
+  // method passing in:
+  // - the name of the options ("BlockBasedTableOptions");
+  // - the options object (the BlockBasedTableOptions object for this object;
+  // - the options type map for the BlockBasedTableOptions.
+  // This registration allows the Configurable class to process the option
+  // values associated with the BlockBasedTableOptions without further code in
+  // the derived class.
+  //
+  // @param name    The name of this set of options (@see GetOptionsPtr)
+  // @param opt_ptr Pointer to the options to associate with this name
+  // @param opt_map Options map that controls how this option is configured.
+  template <typename T>
+  void RegisterOptions(
+      T* opt_ptr,
+      const std::unordered_map<std::string, OptionTypeInfo>* opt_map) {
+    RegisterOptions(T::kName(), opt_ptr, opt_map);
+  }
+  void RegisterOptions(
+      const std::string& name, void* opt_ptr,
+      const std::unordered_map<std::string, OptionTypeInfo>* opt_map);
+
+  // Returns true if there are registered options for this Configurable object
+  inline bool HasRegisteredOptions() const { return !options_.empty(); }
+
+ private:
+  // Contains the collection of options (name, opt_ptr, opt_map) associated with
+  // this object. This collection is typically set in the constructor of the
+  // Configurable option via
+  std::vector<RegisteredOptions> options_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/convenience.h b/src/rocksdb/include/rocksdb/convenience.h
new file mode 100644
index 000000000..921ec221b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/convenience.h
@@ -0,0 +1,525 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Env;
+class Logger;
+class ObjectRegistry;
+
+struct ColumnFamilyOptions;
+struct DBOptions;
+struct Options;
+
+// ConfigOptions containing the parameters/controls for
+// comparing objects and converting to/from strings.
+// These settings control how the methods
+// treat errors (e.g. ignore_unknown_objects), the format
+// of the serialization (e.g. delimiter), and how to compare
+// options (sanity_level).
+struct ConfigOptions {
+  // Constructs a new ConfigOptions with a new object registry.
+  // This method should only be used when a DBOptions is not available,
+  // else registry settings may be lost
+  ConfigOptions();
+
+  // Constructs a new ConfigOptions using the settings from
+  // the input DBOptions.  Currently constructs a new object registry.
+  explicit ConfigOptions(const DBOptions&);
+
+  // This enum defines the RocksDB options sanity level.
+  enum SanityLevel : unsigned char {
+    kSanityLevelNone = 0x01,  // Performs no sanity check at all.
+    // Performs minimum check to ensure the RocksDB instance can be
+    // opened without corrupting / mis-interpreting the data.
+    kSanityLevelLooselyCompatible = 0x02,
+    // Perform exact match sanity check.
+    kSanityLevelExactMatch = 0xFF,
+  };
+
+  enum Depth {
+    kDepthDefault,  // Traverse nested options that are not flagged as "shallow"
+    kDepthShallow,  // Do not traverse into any nested options
+    kDepthDetailed,  // Traverse nested options, overriding the options shallow
+                     // setting
+  };
+
+  // When true, any unused options will be ignored and OK will be returned
+  bool ignore_unknown_options = false;
+
+  // When true, any unsupported options will be ignored and OK will be returned
+  bool ignore_unsupported_options = true;
+
+  // If the strings are escaped (old-style?)
+  bool input_strings_escaped = true;
+
+  // Whether or not to invoke PrepareOptions after configure is called.
+  bool invoke_prepare_options = true;
+
+  // Options can be marked as Mutable (OptionTypeInfo::IsMutable()) or not.
+  // When "mutable_options_only=false", all options are evaluated.
+  // When "mutable_options_only="true", any option not marked as Mutable is
+  // either ignored (in the case of string/equals methods) or results in an
+  // error (in the case of Configure).
+  bool mutable_options_only = false;
+
+  // The separator between options when converting to a string
+  std::string delimiter = ";";
+
+  // Controls how to traverse options during print/match stages
+  Depth depth = Depth::kDepthDefault;
+
+  // Controls how options are serialized
+  // Controls how pedantic the comparison must be for equivalency
+  SanityLevel sanity_level = SanityLevel::kSanityLevelExactMatch;
+  // `file_readahead_size` is used for readahead for the option file.
+  size_t file_readahead_size = 512 * 1024;
+
+  // The environment to use for this option
+  Env* env = Env::Default();
+
+#ifndef ROCKSDB_LITE
+  // The object registry to use for this options
+  std::shared_ptr<ObjectRegistry> registry;
+#endif
+
+  bool IsShallow() const { return depth == Depth::kDepthShallow; }
+  bool IsDetailed() const { return depth == Depth::kDepthDetailed; }
+
+  bool IsCheckDisabled() const {
+    return sanity_level == SanityLevel::kSanityLevelNone;
+  }
+
+  bool IsCheckEnabled(SanityLevel level) const {
+    return (level > SanityLevel::kSanityLevelNone && level <= sanity_level);
+  }
+};
+
+#ifndef ROCKSDB_LITE
+
+// The following set of functions provide a way to construct RocksDB Options
+// from a string or a string-to-string map.  Here is the general rule of
+// setting option values from strings by type.  Some RocksDB types are also
+// supported in these APIs.  Please refer to the comment of the function itself
+// to find more information about how to config those RocksDB types.
+//
+// * Strings:
+//   Strings will be used as values directly without any truncating or
+//   trimming.
+//
+// * Booleans:
+//   - "true" or "1" => true
+//   - "false" or "0" => false.
+//   [Example]:
+//   - {"optimize_filters_for_hits", "1"} in GetColumnFamilyOptionsFromMap, or
+//   - "optimize_filters_for_hits=true" in GetColumnFamilyOptionsFromString.
+//
+// * Integers:
+//   Integers are converted directly from string, in addition to the following
+//   units that we support:
+//   - 'k' or 'K' => 2^10
+//   - 'm' or 'M' => 2^20
+//   - 'g' or 'G' => 2^30
+//   - 't' or 'T' => 2^40  // only for unsigned int with sufficient bits.
+//   [Example]:
+//   - {"arena_block_size", "19G"} in GetColumnFamilyOptionsFromMap, or
+//   - "arena_block_size=19G" in GetColumnFamilyOptionsFromString.
+//
+// * Doubles / Floating Points:
+//   Doubles / Floating Points are converted directly from string.  Note that
+//   currently we do not support units.
+//   [Example]:
+//   - {"memtable_prefix_bloom_size_ratio", "0.1"} in
+//   GetColumnFamilyOptionsFromMap, or
+//   - "memtable_prefix_bloom_size_ratio=0.1" in
+//   GetColumnFamilyOptionsFromString.
+// * Array / Vectors:
+//   An array is specified by a list of values, where ':' is used as
+//   the delimiter to separate each value.
+//   [Example]:
+//   - {"compression_per_level", "kNoCompression:kSnappyCompression"}
+//     in GetColumnFamilyOptionsFromMap, or
+//   - "compression_per_level=kNoCompression:kSnappyCompression" in
+//     GetColumnFamilyOptionsFromMapString
+// * Enums:
+//   The valid values of each enum are identical to the names of its constants.
+//   [Example]:
+//   - CompressionType: valid values are "kNoCompression",
+//     "kSnappyCompression", "kZlibCompression", "kBZip2Compression", ...
+//   - CompactionStyle: valid values are "kCompactionStyleLevel",
+//     "kCompactionStyleUniversal", "kCompactionStyleFIFO", and
+//     "kCompactionStyleNone".
+//
+
+// Take a default ColumnFamilyOptions "base_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// ColumnFamilyOptions "new_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in ColumnFamilyOptions:
+//
+// * table_factory:
+//   table_factory can be configured using our custom nested-option syntax.
+//
+//   {option_a=value_a; option_b=value_b; option_c=value_c; ... }
+//
+//   A nested option is enclosed by two curly braces, within which there are
+//   multiple option assignments.  Each assignment is of the form
+//   "variable_name=value;".
+//
+//   Currently we support the following types of TableFactory:
+//   - BlockBasedTableFactory:
+//     Use name "block_based_table_factory" to initialize table_factory with
+//     BlockBasedTableFactory.  Its BlockBasedTableFactoryOptions can be
+//     configured using the nested-option syntax.
+//     [Example]:
+//     * {"block_based_table_factory", "{block_cache=1M;block_size=4k;}"}
+//       is equivalent to assigning table_factory with a BlockBasedTableFactory
+//       that has 1M LRU block-cache with block size equals to 4k:
+//         ColumnFamilyOptions cf_opt;
+//         BlockBasedTableOptions blk_opt;
+//         blk_opt.block_cache = NewLRUCache(1 * 1024 * 1024);
+//         blk_opt.block_size = 4 * 1024;
+//         cf_opt.table_factory.reset(NewBlockBasedTableFactory(blk_opt));
+//   - PlainTableFactory:
+//     Use name "plain_table_factory" to initialize table_factory with
+//     PlainTableFactory.  Its PlainTableFactoryOptions can be configured using
+//     the nested-option syntax.
+//     [Example]:
+//     * {"plain_table_factory", "{user_key_len=66;bloom_bits_per_key=20;}"}
+//
+// * memtable_factory:
+//   Use "memtable" to config memtable_factory.  Here are the supported
+//   memtable factories:
+//   - SkipList:
+//     Pass "skip_list:<lookahead>" to config memtable to use SkipList,
+//     or simply "skip_list" to use the default SkipList.
+//     [Example]:
+//     * {"memtable", "skip_list:5"} is equivalent to setting
+//       memtable to SkipListFactory(5).
+//   - PrefixHash:
+//     Pass "prefix_hash:<hash_bucket_count>" to config memtable
+//     to use PrefixHash, or simply "prefix_hash" to use the default
+//     PrefixHash.
+//     [Example]:
+//     * {"memtable", "prefix_hash:1000"} is equivalent to setting
+//       memtable to NewHashSkipListRepFactory(hash_bucket_count).
+//   - HashLinkedList:
+//     Pass "hash_linkedlist:<hash_bucket_count>" to config memtable
+//     to use HashLinkedList, or simply "hash_linkedlist" to use the default
+//     HashLinkedList.
+//     [Example]:
+//     * {"memtable", "hash_linkedlist:1000"} is equivalent to
+//       setting memtable to NewHashLinkListRepFactory(1000).
+//   - VectorRepFactory:
+//     Pass "vector:<count>" to config memtable to use VectorRepFactory,
+//     or simply "vector" to use the default Vector memtable.
+//     [Example]:
+//     * {"memtable", "vector:1024"} is equivalent to setting memtable
+//       to VectorRepFactory(1024).
+//
+//  * compression_opts:
+//    Use "compression_opts" to config compression_opts.  The value format
+//    is of the form "<window_bits>:<level>:<strategy>:<max_dict_bytes>".
+//    [Example]:
+//    * {"compression_opts", "4:5:6:7"} is equivalent to setting:
+//        ColumnFamilyOptions cf_opt;
+//        cf_opt.compression_opts.window_bits = 4;
+//        cf_opt.compression_opts.level = 5;
+//        cf_opt.compression_opts.strategy = 6;
+//        cf_opt.compression_opts.max_dict_bytes = 7;
+//
+// The GetColumnFamilyOptionsFromMap(ConfigOptions, ...) should be used; the
+// alternative signature may be deprecated in a future release.  The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param base_options the default options of the output "new_options".
+// @param opts_map an option name to value map for specifying how "new_options"
+//     should be set.
+// @param new_options the resulting options based on "base_options" with the
+//     change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped characters
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success.  Otherwise, a non-ok status indicating
+//     error will be returned, and "new_options" will be set to "base_options".
+// @return Status::NotFound means the one (or more) of the option name in
+//     the opts_map is not valid for this option
+// @return Status::NotSupported means we do not know how to parse one of the
+//     value for this option
+// @return Status::InvalidArgument means the one of the option values is not
+//     valid for this option.
+Status GetColumnFamilyOptionsFromMap(
+    const ConfigOptions& config_options,
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options);
+Status GetColumnFamilyOptionsFromMap(
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options, bool input_strings_escaped = false,
+    bool ignore_unknown_options = false);
+
+// Take a default DBOptions "base_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// DBOptions "new_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in DBOptions:
+//
+// * rate_limiter_bytes_per_sec:
+//   RateLimiter can be configured directly by specifying its bytes_per_sec.
+//   [Example]:
+//   - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to
+//     passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec.
+//
+// The GetDBOptionsFromMap(ConfigOptions, ...) should be used; the
+// alternative signature may be deprecated in a future release. The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param base_options the default options of the output "new_options".
+// @param opts_map an option name to value map for specifying how "new_options"
+//     should be set.
+// @param new_options the resulting options based on "base_options" with the
+//     change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped characters
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success.  Otherwise, a non-ok status indicating
+//     error will be returned, and "new_options" will be set to "base_options".
+// @return Status::NotFound means the one (or more) of the option name in
+//     the opts_map is not valid for this option
+// @return Status::NotSupported means we do not know how to parse one of the
+//     value for this option
+// @return Status::InvalidArgument means the one of the option values is not
+//     valid for this option.
+Status GetDBOptionsFromMap(
+    const ConfigOptions& cfg_options, const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options);
+Status GetDBOptionsFromMap(
+    const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options, bool input_strings_escaped = false,
+    bool ignore_unknown_options = false);
+
+// Take a default BlockBasedTableOptions "table_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// BlockBasedTableOptions "new_table_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in BlockBasedTableOptions:
+//
+// * filter_policy:
+//   We currently only support the following FilterPolicy in the convenience
+//   functions:
+//   - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
+//     to specify BloomFilter.  The above string is equivalent to calling
+//     NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
+//     [Example]:
+//     - Pass {"filter_policy", "bloomfilter:4:true"} in
+//       GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
+//       per key and use_block_based_builder enabled.
+//
+// * block_cache / block_cache_compressed:
+//   We currently only support LRU cache in the GetOptions API.  The LRU
+//   cache can be set by directly specifying its size.
+//   [Example]:
+//   - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
+//     equivalent to setting block_cache using NewLRUCache(1024 * 1024).
+//
+// The GetBlockBasedTableOptionsFromMap(ConfigOptions, ...) should be used;
+// the alternative signature may be deprecated in a future release. The
+// equivalent functionality can be achieved by setting the corresponding
+// options in the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+//     "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+//     with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped characters
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success.  Otherwise, a non-ok status indicating
+//     error will be returned, and "new_table_options" will be set to
+//     "table_options".
+Status GetBlockBasedTableOptionsFromMap(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options);
+Status GetBlockBasedTableOptionsFromMap(
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options,
+    bool input_strings_escaped = false, bool ignore_unknown_options = false);
+
+// Take a default PlainTableOptions "table_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// PlainTableOptions "new_table_options".
+//
+// The GetPlainTableOptionsFromMap(ConfigOptions, ...) should be used; the
+// alternative signature may be deprecated in a future release. The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+//
+// @param config_options controls how the map is processed.
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+//     "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+//     with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped characters
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success.  Otherwise, a non-ok status indicating
+//     error will be returned, and "new_table_options" will be set to
+//     "table_options".
+Status GetPlainTableOptionsFromMap(
+    const ConfigOptions& config_options, const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options);
+Status GetPlainTableOptionsFromMap(
+    const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options, bool input_strings_escaped = false,
+    bool ignore_unknown_options = false);
+
+// Take a string representation of option names and values, apply them into the
+// base_options, and return the new options as a result. The string has the
+// following format:
+//   "write_buffer_size=1024;max_write_buffer_number=2"
+// Nested options config is also possible. For example, you can define
+// BlockBasedTableOptions as part of the string for block-based table factory:
+//   "write_buffer_size=1024;block_based_table_factory={block_size=4k};"
+//   "max_write_buffer_num=2"
+//
+//
+// The GetColumnFamilyOptionsFromString(ConfigOptions, ...) should be used; the
+// alternative signature may be deprecated in a future release. The equivalent
+// functionality can be achieved by setting the corresponding options in
+// the ConfigOptions parameter.
+Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options,
+                                        const ColumnFamilyOptions& base_options,
+                                        const std::string& opts_str,
+                                        ColumnFamilyOptions* new_options);
+Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options,
+                                        const std::string& opts_str,
+                                        ColumnFamilyOptions* new_options);
+
+Status GetDBOptionsFromString(const ConfigOptions& config_options,
+                              const DBOptions& base_options,
+                              const std::string& opts_str,
+                              DBOptions* new_options);
+
+Status GetDBOptionsFromString(const DBOptions& base_options,
+                              const std::string& opts_str,
+                              DBOptions* new_options);
+
+Status GetStringFromDBOptions(const ConfigOptions& config_options,
+                              const DBOptions& db_options,
+                              std::string* opts_str);
+
+Status GetStringFromDBOptions(std::string* opts_str,
+                              const DBOptions& db_options,
+                              const std::string& delimiter = ";  ");
+
+Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options,
+                                        const ColumnFamilyOptions& cf_options,
+                                        std::string* opts_str);
+Status GetStringFromColumnFamilyOptions(std::string* opts_str,
+                                        const ColumnFamilyOptions& cf_options,
+                                        const std::string& delimiter = ";  ");
+Status GetStringFromCompressionType(std::string* compression_str,
+                                    CompressionType compression_type);
+
+std::vector<CompressionType> GetSupportedCompressions();
+
+Status GetBlockBasedTableOptionsFromString(
+    const BlockBasedTableOptions& table_options, const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options);
+Status GetBlockBasedTableOptionsFromString(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options, const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options);
+
+Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options,
+                                      const std::string& opts_str,
+                                      PlainTableOptions* new_table_options);
+Status GetPlainTableOptionsFromString(const ConfigOptions& config_options,
+                                      const PlainTableOptions& table_options,
+                                      const std::string& opts_str,
+                                      PlainTableOptions* new_table_options);
+
+Status GetMemTableRepFactoryFromString(
+    const std::string& opts_str,
+    std::unique_ptr<MemTableRepFactory>* new_mem_factory);
+
+Status GetOptionsFromString(const Options& base_options,
+                            const std::string& opts_str, Options* new_options);
+Status GetOptionsFromString(const ConfigOptions& config_options,
+                            const Options& base_options,
+                            const std::string& opts_str, Options* new_options);
+
+Status StringToMap(const std::string& opts_str,
+                   std::unordered_map<std::string, std::string>* opts_map);
+
+// Request stopping background work, if wait is true wait until it's done
+void CancelAllBackgroundWork(DB* db, bool wait = false);
+
+// Delete files which are entirely in the given range
+// Could leave some keys in the range which are in files which are not
+// entirely in the range. Also leaves L0 files regardless of whether they're
+// in the range.
+// Snapshots before the delete might not see the data in the given range.
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+                          const Slice* begin, const Slice* end,
+                          bool include_end = true);
+
+// Delete files in multiple ranges at once
+// Delete files in a lot of ranges one at a time can be slow, use this API for
+// better performance in that case.
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangePtr* ranges, size_t n,
+                           bool include_end = true);
+
+// Verify the checksum of file
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const std::string& file_path);
+
+// Verify the checksum of file
+Status VerifySstFileChecksum(const Options& options,
+                             const EnvOptions& env_options,
+                             const ReadOptions& read_options,
+                             const std::string& file_path,
+                             const SequenceNumber& largest_seqno = 0);
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/customizable.h b/src/rocksdb/include/rocksdb/customizable.h
new file mode 100644
index 000000000..92f7504ae
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/customizable.h
@@ -0,0 +1,233 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/configurable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+/**
+ * Customizable a base class used by the rocksdb that describes a
+ * standard way of configuring and creating objects.  Customizable objects
+ * are configurable objects that can be created from an ObjectRegistry.
+ *
+ * Customizable classes are used when there are multiple potential
+ * implementations of a class for use by RocksDB (e.g. Table, Cache,
+ * MergeOperator, etc).  The abstract base class is expected to define a method
+ * declaring its type and a factory method for creating one of these, such as:
+ * static const char *Type() { return "Table"; }
+ * static Status CreateFromString(const ConfigOptions& options,
+ *                                const std::string& id,
+ *                                std::shared_ptr<TableFactory>* result);
+ * The "Type" string is expected to be unique (no two base classes are the same
+ * type). This factory is expected, based on the options and id, create and
+ * return the appropriate derived type of the customizable class (e.g.
+ * BlockBasedTableFactory, PlainTableFactory, etc). For extension developers,
+ * helper classes and methods are provided for writing this factory.
+ *
+ * Instances of a Customizable class need to define:
+ * - A "static const char *kClassName()" method.  This method defines the name
+ * of the class instance (e.g. BlockBasedTable, LRUCache) and is used by the
+ * CheckedCast method.
+ * - The Name() of the object.  This name is used when creating and saving
+ * instances of this class.  Typically this name will be the same as
+ * kClassName().
+ *
+ * Additionally, Customizable classes should register any options used to
+ * configure themselves with the Configurable subsystem.
+ *
+ * When a Customizable is being created, the "name" property specifies
+ * the name of the instance being created.
+ * For custom objects, their configuration and name can be specified by:
+ * [prop]={name=X;option 1 = value1[; option2=value2...]}
+ *
+ * [prop].name=X
+ * [prop].option1 = value1
+ *
+ * [prop].name=X
+ * X.option1 =value1
+ */
+class Customizable : public Configurable {
+ public:
+  ~Customizable() override {}
+
+  // Returns the name of this class of Customizable
+  virtual const char* Name() const = 0;
+
+  // Returns an identifier for this Customizable.
+  // This could be its name or something more complex (like its URL/pattern).
+  // Used for pretty printing.
+  virtual std::string GetId() const {
+    std::string id = Name();
+    return id;
+  }
+
+  // This is typically determined by if the input name matches the
+  // name of this object.
+  // This method is typically used in conjunction with CheckedCast to find the
+  // derived class instance from its base.  For example, if you have an Env
+  // and want the "Default" env, you would IsInstanceOf("Default") to get
+  // the default implementation.  This method should be used when you need a
+  // specific derivative or implementation of a class.
+  //
+  // Intermediary caches (such as SharedCache) may wish to override this method
+  // to check for the intermediary name (SharedCache).  Classes with multiple
+  // potential names (e.g. "PosixEnv", "DefaultEnv") may also wish to override
+  // this method.
+  //
+  // Note that IsInstanceOf only uses the "is-a" relationship and not "has-a".
+  // Wrapped classes that have an Inner "has-a" should not be returned.
+  //
+  // @param name The name of the instance to find.
+  // Returns true if the class is an instance of the input name.
+  virtual bool IsInstanceOf(const std::string& name) const {
+    if (name.empty()) {
+      return false;
+    } else if (name == Name()) {
+      return true;
+    } else {
+      const char* nickname = NickName();
+      if (nickname != nullptr && name == nickname) {
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  const void* GetOptionsPtr(const std::string& name) const override {
+    const void* ptr = Configurable::GetOptionsPtr(name);
+    if (ptr != nullptr) {
+      return ptr;
+    } else {
+      const auto inner = Inner();
+      if (inner != nullptr) {
+        return inner->GetOptionsPtr(name);
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  // Returns the named instance of the Customizable as a T*, or nullptr if not
+  // found. This method uses IsInstanceOf/Inner to find the appropriate class
+  // instance and then casts it to the expected return type.
+  template <typename T>
+  const T* CheckedCast() const {
+    if (IsInstanceOf(T::kClassName())) {
+      return static_cast<const T*>(this);
+    } else {
+      const auto inner = Inner();
+      if (inner != nullptr) {
+        return inner->CheckedCast<T>();
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  template <typename T>
+  T* CheckedCast() {
+    if (IsInstanceOf(T::kClassName())) {
+      return static_cast<T*>(this);
+    } else {
+      auto inner = const_cast<Customizable*>(Inner());
+      if (inner != nullptr) {
+        return inner->CheckedCast<T>();
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  // Checks to see if this Customizable is equivalent to other.
+  // This method assumes that the two objects are of the same class.
+  // @param config_options Controls how the options are compared.
+  // @param other The other object to compare to.
+  // @param mismatch If the objects do not match, this parameter contains
+  //      the name of the option that triggered the match failure.
+  // @param True if the objects match, false otherwise.
+  // @see Configurable::AreEquivalent for more details
+  bool AreEquivalent(const ConfigOptions& config_options,
+                     const Configurable* other,
+                     std::string* mismatch) const override;
+#ifndef ROCKSDB_LITE
+  // Gets the value of the option associated with the input name
+  // @see Configurable::GetOption for more details
+  Status GetOption(const ConfigOptions& config_options, const std::string& name,
+                   std::string* value) const override;
+#endif  // ROCKSDB_LITE
+  // Helper method for getting for parsing the opt_value into the corresponding
+  // options for use in potentially creating a new Customizable object (this
+  // method is primarily a support method for LoadSharedObject et al for new
+  // Customizable objects). The opt_value may be either name-value pairs
+  // separated by ";" (a=b; c=d), or a simple name (a). In order to create a new
+  // Customizable, the ID is determined by:
+  // - If the value is a simple name (e.g. "BlockBasedTable"), the id is this
+  // name;
+  // - Otherwise, if there is a "id=value", the id is set to "value"
+  // - Otherwise, if the input customizable is not null, custom->GetId is used
+  // - Otherwise, an error is returned.
+  //
+  // If the opt_value is name-value pairs, these pairs will be returned in
+  // options (without the id pair). If the ID being returned matches the ID of
+  // the input custom object, then the options from the input object will also
+  // be added to the returned options.
+  //
+  // This method returns non-OK if the ID could not be found, or if the
+  // opt_value could not be parsed into name-value pairs.
+  static Status GetOptionsMap(
+      const ConfigOptions& config_options, const Customizable* custom,
+      const std::string& opt_value, std::string* id,
+      std::unordered_map<std::string, std::string>* options);
+
+  // Helper method to configure a new object with the supplied options.
+  // If the object is not null and invoke_prepare_options=true, the object
+  // will be configured and prepared.
+  // Returns success if the object is properly configured and (optionally)
+  // prepared Returns InvalidArgument if the object is nullptr and there are
+  // options in the map Returns the result of the ConfigureFromMap or
+  // PrepareOptions
+  static Status ConfigureNewObject(
+      const ConfigOptions& config_options, Customizable* object,
+      const std::unordered_map<std::string, std::string>& options);
+
+  // Returns the inner class when a Customizable implements a has-a (wrapped)
+  // relationship.  Derived classes that implement a has-a must override this
+  // method in order to get CheckedCast to function properly.
+  virtual const Customizable* Inner() const { return nullptr; }
+
+ protected:
+  // Generates a ID specific for this instance of the customizable.
+  // The unique ID is of the form <name>:<addr>#pid, where:
+  // - name is the Name() of this object;
+  // - addr is the memory address of this object;
+  // - pid is the process ID of this process ID for this process.
+  // Note that if obj1 and obj2 have the same unique IDs, they must be the
+  // same.  However, if an object is deleted and recreated, it may have the
+  // same unique ID as a predecessor
+  //
+  // This method is useful for objects (especially ManagedObjects) that
+  // wish to generate an ID that is specific for this instance and wish to
+  // override the GetId() method.
+  std::string GenerateIndividualId() const;
+
+  // Some classes have both a class name (e.g. PutOperator) and a nickname
+  // (e.g. put). Classes can override this method to return a
+  // nickname.  Nicknames can be used by InstanceOf and object creation.
+  virtual const char* NickName() const { return ""; }
+  //  Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt)
+  std::string GetOptionName(const std::string& long_name) const override;
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& options,
+                               const std::string& prefix) const override;
+#endif  // ROCKSDB_LITE
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/data_structure.h b/src/rocksdb/include/rocksdb/data_structure.h
new file mode 100644
index 000000000..f868a6be5
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/data_structure.h
@@ -0,0 +1,51 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is a data structure specifically designed as a "Set" for a
+// pretty small scale of Enum structure. For now, it can support up
+// to 64 element, and it is expandable in the future.
+template <typename ENUM_TYPE, ENUM_TYPE MAX_VALUE>
+class SmallEnumSet {
+ public:
+  SmallEnumSet() : state_(0) {}
+
+  ~SmallEnumSet() {}
+
+  // Return true if the input enum is included in the "Set" (i.e., changes the
+  // internal scalar state successfully), otherwise, it will return false.
+  bool Add(const ENUM_TYPE value) {
+    static_assert(MAX_VALUE <= 63, "Size currently limited to 64");
+    assert(value >= 0 && value <= MAX_VALUE);
+    uint64_t old_state = state_;
+    uint64_t tmp = 1;
+    state_ |= (tmp << value);
+    return old_state != state_;
+  }
+
+  // Return true if the input enum is contained in the "Set".
+  bool Contains(const ENUM_TYPE value) {
+    static_assert(MAX_VALUE <= 63, "Size currently limited to 64");
+    assert(value >= 0 && value <= MAX_VALUE);
+    uint64_t tmp = 1;
+    return state_ & (tmp << value);
+  }
+
+ private:
+  uint64_t state_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db.h b/src/rocksdb/include/rocksdb/db.h
new file mode 100644
index 000000000..26c07c19f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db.h
@@ -0,0 +1,1859 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/block_cache_trace_writer.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/version.h"
+#include "rocksdb/wide_columns.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif _WIN32
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ColumnFamilyOptions;
+struct CompactionOptions;
+struct CompactRangeOptions;
+struct DBOptions;
+struct ExternalSstFileInfo;
+struct FlushOptions;
+struct Options;
+struct ReadOptions;
+struct TableProperties;
+struct WriteOptions;
+#ifdef ROCKSDB_LITE
+class CompactionJobInfo;
+#endif
+class Env;
+class EventListener;
+class FileSystem;
+#ifndef ROCKSDB_LITE
+class Replayer;
+#endif
+class StatsHistoryIterator;
+#ifndef ROCKSDB_LITE
+class TraceReader;
+class TraceWriter;
+#endif
+class WriteBatch;
+
+extern const std::string kDefaultColumnFamilyName;
+extern const std::string kPersistentStatsColumnFamilyName;
+struct ColumnFamilyDescriptor {
+  std::string name;
+  ColumnFamilyOptions options;
+  ColumnFamilyDescriptor()
+      : name(kDefaultColumnFamilyName), options(ColumnFamilyOptions()) {}
+  ColumnFamilyDescriptor(const std::string& _name,
+                         const ColumnFamilyOptions& _options)
+      : name(_name), options(_options) {}
+};
+
+class ColumnFamilyHandle {
+ public:
+  virtual ~ColumnFamilyHandle() {}
+  // Returns the name of the column family associated with the current handle.
+  virtual const std::string& GetName() const = 0;
+  // Returns the ID of the column family associated with the current handle.
+  virtual uint32_t GetID() const = 0;
+  // Fills "*desc" with the up-to-date descriptor of the column family
+  // associated with this handle. Since it fills "*desc" with the up-to-date
+  // information, this call might internally lock and release DB mutex to
+  // access the up-to-date CF options.  In addition, all the pointer-typed
+  // options cannot be referenced any longer than the original options exist.
+  //
+  // Note that this function is not supported in RocksDBLite.
+  virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) = 0;
+  // Returns the comparator of the column family associated with the
+  // current handle.
+  virtual const Comparator* GetComparator() const = 0;
+};
+
+static const int kMajorVersion = __ROCKSDB_MAJOR__;
+static const int kMinorVersion = __ROCKSDB_MINOR__;
+
+// A range of keys
+struct Range {
+  Slice start;
+  Slice limit;
+
+  Range() {}
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
+};
+
+struct RangePtr {
+  const Slice* start;
+  const Slice* limit;
+
+  RangePtr() : start(nullptr), limit(nullptr) {}
+  RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
+};
+
+// It is valid that files_checksums and files_checksum_func_names are both
+// empty (no checksum information is provided for ingestion). Otherwise,
+// their sizes should be the same as external_files. The file order should
+// be the same in three vectors and guaranteed by the caller.
+// Note that, we assume the temperatures of this batch of files to be
+// ingested are the same.
+struct IngestExternalFileArg {
+  ColumnFamilyHandle* column_family = nullptr;
+  std::vector<std::string> external_files;
+  IngestExternalFileOptions options;
+  std::vector<std::string> files_checksums;
+  std::vector<std::string> files_checksum_func_names;
+  Temperature file_temperature = Temperature::kUnknown;
+};
+
+struct GetMergeOperandsOptions {
+  int expected_max_number_of_operands = 0;
+};
+
+// A collections of table properties objects, where
+//  key: is the table's file name.
+//  value: the table properties object of the given table.
+using TablePropertiesCollection =
+    std::unordered_map<std::string, std::shared_ptr<const TableProperties>>;
+
+// A DB is a persistent, versioned ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+// DB is an abstract base class with one primary implementation (DBImpl)
+// and a number of wrapper implementations.
+class DB {
+ public:
+  // Open the database with the specified "name" for reads and writes.
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores nullptr in *dbptr and returns a non-OK status on error, including
+  // if the DB is already open (read-write) by another DB object. (This
+  // guarantee depends on options.env->LockFile(), which might not provide
+  // this guarantee in a custom Env implementation.)
+  //
+  // Caller must delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options, const std::string& name,
+                     DB** dbptr);
+
+  // Open DB with column families.
+  // db_options specify database specific options
+  // column_families is the vector of all column families in the database,
+  // containing column family name and options. You need to open ALL column
+  // families in the database. To get the list of column families, you can use
+  // ListColumnFamilies().
+  //
+  // The default column family name is 'default' and it's stored
+  // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName.
+  // If everything is OK, handles will on return be the same size
+  // as column_families --- handles[i] will be a handle that you
+  // will use to operate on column family column_family[i].
+  // Before delete DB, you have to close All column families by calling
+  // DestroyColumnFamilyHandle() with all the handles.
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+  // OpenForReadOnly() creates a Read-only instance that supports reads alone.
+  //
+  // All DB interfaces that modify data, like put/delete, will return error.
+  // Automatic Flush and Compactions are disabled and any manual calls
+  // to Flush/Compaction will return error.
+  //
+  // While a given DB can be simultaneously opened via OpenForReadOnly
+  // by any number of readers, if a DB is simultaneously opened by Open
+  // and OpenForReadOnly, the read-only instance has undefined behavior
+  // (though can often succeed if quickly closed) and the read-write
+  // instance is unaffected. See also OpenAsSecondary.
+
+  // Open the database for read only.
+  //
+  // Not supported in ROCKSDB_LITE, in which case the function will
+  // return Status::NotSupported.
+  static Status OpenForReadOnly(const Options& options, const std::string& name,
+                                DB** dbptr,
+                                bool error_if_wal_file_exists = false);
+
+  // Open the database for read only with column families.
+  //
+  // When opening DB with read only, you can specify only a subset of column
+  // families in the database that should be opened. However, you always need
+  // to specify default column family. The default column family name is
+  // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
+  //
+  // Not supported in ROCKSDB_LITE, in which case the function will
+  // return Status::NotSupported.
+  static Status OpenForReadOnly(
+      const DBOptions& db_options, const std::string& name,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      bool error_if_wal_file_exists = false);
+
+  // OpenAsSecondary() creates a secondary instance that supports read-only
+  // operations and supports dynamic catch up with the primary (through a
+  // call to TryCatchUpWithPrimary()).
+  //
+  // All DB interfaces that modify data, like put/delete, will return error.
+  // Automatic Flush and Compactions are disabled and any manual calls
+  // to Flush/Compaction will return error.
+  //
+  // Multiple secondary instances can co-exist at the same time.
+  //
+
+  // Open DB as secondary instance
+  //
+  // The options argument specifies the options to open the secondary instance.
+  // Options.max_open_files should be set to -1.
+  // The name argument specifies the name of the primary db that you have used
+  // to open the primary instance.
+  // The secondary_path argument points to a directory where the secondary
+  // instance stores its info log.
+  // The dbptr is an out-arg corresponding to the opened secondary instance.
+  // The pointer points to a heap-allocated database, and the caller should
+  // delete it after use.
+  //
+  // Return OK on success, non-OK on failures.
+  static Status OpenAsSecondary(const Options& options, const std::string& name,
+                                const std::string& secondary_path, DB** dbptr);
+
+  // Open DB as secondary instance with specified column families
+  //
+  // When opening DB in secondary mode, you can specify only a subset of column
+  // families in the database that should be opened. However, you always need
+  // to specify default column family. The default column family name is
+  // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName
+  //
+  // Column families created by the primary after the secondary instance starts
+  // are currently ignored by the secondary instance.  Column families opened
+  // by secondary and dropped by the primary will be dropped by secondary as
+  // well (on next invocation of TryCatchUpWithPrimary()). However the user
+  // of the secondary instance can still access the data of such dropped column
+  // family as long as they do not destroy the corresponding column family
+  // handle.
+  //
+  // The options argument specifies the options to open the secondary instance.
+  // Options.max_open_files should be set to -1.
+  // The name argument specifies the name of the primary db that you have used
+  // to open the primary instance.
+  // The secondary_path argument points to a directory where the secondary
+  // instance stores its info log.
+  // The column_families argument specifies a list of column families to open.
+  // If default column family is not specified or if any specified column
+  // families does not exist, the function returns non-OK status.
+  // The handles is an out-arg corresponding to the opened database column
+  // family handles.
+  // The dbptr is an out-arg corresponding to the opened secondary instance.
+  // The pointer points to a heap-allocated database, and the caller should
+  // delete it after use. Before deleting the dbptr, the user should also
+  // delete the pointers stored in handles vector.
+  //
+  // Return OK on success, non-OK on failures.
+  static Status OpenAsSecondary(
+      const DBOptions& db_options, const std::string& name,
+      const std::string& secondary_path,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
+
+  // Open DB and run the compaction.
+  // It's a read-only operation, the result won't be installed to the DB, it
+  // will be output to the `output_directory`. The API should only be used with
+  // `options.CompactionService` to run compaction triggered by
+  // `CompactionService`.
+  static Status OpenAndCompact(
+      const std::string& name, const std::string& output_directory,
+      const std::string& input, std::string* output,
+      const CompactionServiceOptionsOverride& override_options);
+
+  static Status OpenAndCompact(
+      const OpenAndCompactOptions& options, const std::string& name,
+      const std::string& output_directory, const std::string& input,
+      std::string* output,
+      const CompactionServiceOptionsOverride& override_options);
+
+  // Experimental and subject to change
+  // Open DB and trim data newer than specified timestamp.
+  // The trim_ts specified the user-defined timestamp trim bound.
+  // This API should only be used at timestamp enabled column families recovery.
+  // If some input column families do not support timestamp, nothing will
+  // be happened to them. The data with timestamp > trim_ts
+  // will be removed after this API returns successfully.
+  static Status OpenAndTrimHistory(
+      const DBOptions& db_options, const std::string& dbname,
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+      std::string trim_ts);
+
+  virtual Status Resume() { return Status::NotSupported(); }
+
+  // Close the DB by releasing resources, closing files etc. This should be
+  // called before calling the destructor so that the caller can get back a
+  // status in case there are any errors. This will not fsync the WAL files.
+  // If syncing is required, the caller must first call SyncWAL(), or Write()
+  // using an empty write batch with WriteOptions.sync=true.
+  // Regardless of the return status, the DB must be freed.
+  // If the return status is Aborted(), closing fails because there is
+  // unreleased snapshot in the system. In this case, users can release
+  // the unreleased snapshots and try again and expect it to succeed. For
+  // other status, re-calling Close() will be no-op and return the original
+  // close status. If the return status is NotSupported(), then the DB
+  // implementation does cleanup in the destructor
+  virtual Status Close() { return Status::NotSupported(); }
+
+  // ListColumnFamilies will open the DB specified by argument name
+  // and return the list of all column families in that DB
+  // through column_families argument. The ordering of
+  // column families in column_families is unspecified.
+  static Status ListColumnFamilies(const DBOptions& db_options,
+                                   const std::string& name,
+                                   std::vector<std::string>* column_families);
+
+  // Abstract class ctor
+  DB() {}
+  // No copying allowed
+  DB(const DB&) = delete;
+  void operator=(const DB&) = delete;
+
+  virtual ~DB();
+
+  // Create a column_family and return the handle of column family
+  // through the argument handle.
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle);
+
+  // Bulk create column families with the same column family options.
+  // Return the handles of the column families through the argument handles.
+  // In case of error, the request may succeed partially, and handles will
+  // contain column family handles that it managed to create, and have size
+  // equal to the number of created column families.
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles);
+
+  // Bulk create column families.
+  // Return the handles of the column families through the argument handles.
+  // In case of error, the request may succeed partially, and handles will
+  // contain column family handles that it managed to create, and have size
+  // equal to the number of created column families.
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles);
+
+  // Drop a column family specified by column_family handle. This call
+  // only records a drop record in the manifest and prevents the column
+  // family from flushing and compacting.
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
+
+  // Bulk drop column families. This call only records drop records in the
+  // manifest and prevents the column families from flushing and compacting.
+  // In case of error, the request may succeed partially. User may call
+  // ListColumnFamilies to check the result.
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families);
+
+  // Release and deallocate a column family handle. A column family is only
+  // removed once it is dropped (DropColumnFamily) and all handles have been
+  // destroyed (DestroyColumnFamilyHandle). Use this method to destroy
+  // column family handles (except for DefaultColumnFamily()!) before closing
+  // a DB.
+  virtual Status DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family);
+
+  // Set the database entry for "key" to "value".
+  // If "key" already exists, it will be overwritten.
+  // Returns OK on success, and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& ts, const Slice& value) = 0;
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& value) {
+    return Put(options, DefaultColumnFamily(), key, value);
+  }
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& ts, const Slice& value) {
+    return Put(options, DefaultColumnFamily(), key, ts, value);
+  }
+
+  // Set the database entry for "key" in the column family specified by
+  // "column_family" to the wide-column entity defined by "columns". If the key
+  // already exists in the column family, it will be overwritten.
+  //
+  // Returns OK on success, and a non-OK status on error.
+  virtual Status PutEntity(const WriteOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           const WideColumns& columns);
+
+  // Remove the database entry (if any) for "key".  Returns OK on
+  // success, and a non-OK status on error.  It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) = 0;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family, const Slice& key,
+                        const Slice& ts) = 0;
+  virtual Status Delete(const WriteOptions& options, const Slice& key) {
+    return Delete(options, DefaultColumnFamily(), key);
+  }
+  virtual Status Delete(const WriteOptions& options, const Slice& key,
+                        const Slice& ts) {
+    return Delete(options, DefaultColumnFamily(), key, ts);
+  }
+
+  // Remove the database entry for "key". Requires that the key exists
+  // and was not overwritten. Returns OK on success, and a non-OK status
+  // on error.  It is not an error if "key" did not exist in the database.
+  //
+  // If a key is overwritten (by calling Put() multiple times), then the result
+  // of calling SingleDelete() on this key is undefined.  SingleDelete() only
+  // behaves correctly if there has been only one Put() for this key since the
+  // previous call to SingleDelete() for this key.
+  //
+  // This feature is currently an experimental performance optimization
+  // for a very specific workload.  It is up to the caller to ensure that
+  // SingleDelete is only used for a key that is not deleted using Delete() or
+  // written using Merge().  Mixing SingleDelete operations with Deletes and
+  // Merges can result in undefined behavior.
+  //
+  // Note: consider setting options.sync = true.
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) = 0;
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key, const Slice& ts) = 0;
+  virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
+    return SingleDelete(options, DefaultColumnFamily(), key);
+  }
+  virtual Status SingleDelete(const WriteOptions& options, const Slice& key,
+                              const Slice& ts) {
+    return SingleDelete(options, DefaultColumnFamily(), key, ts);
+  }
+
+  // Removes the database entries in the range ["begin_key", "end_key"), i.e.,
+  // including "begin_key" and excluding "end_key". Returns OK on success, and
+  // a non-OK status on error. It is not an error if the database does not
+  // contain any existing data in the range ["begin_key", "end_key").
+  //
+  // If "end_key" comes before "start_key" according to the user's comparator,
+  // a `Status::InvalidArgument` is returned.
+  //
+  // This feature is now usable in production, with the following caveats:
+  // 1) Accumulating too many range tombstones in the memtable will degrade read
+  // performance; this can be avoided by manually flushing occasionally.
+  // 2) Limiting the maximum number of open files in the presence of range
+  // tombstones can degrade read performance. To avoid this problem, set
+  // max_open_files to -1 whenever possible.
+  virtual Status DeleteRange(const WriteOptions& options,
+                             ColumnFamilyHandle* column_family,
+                             const Slice& begin_key, const Slice& end_key);
+  virtual Status DeleteRange(const WriteOptions& options,
+                             ColumnFamilyHandle* column_family,
+                             const Slice& begin_key, const Slice& end_key,
+                             const Slice& ts);
+
+  // Merge the database entry for "key" with "value".  Returns OK on success,
+  // and a non-OK status on error. The semantics of this operation is
+  // determined by the user provided merge_operator when opening DB.
+  // Note: consider setting options.sync = true.
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) = 0;
+  virtual Status Merge(const WriteOptions& options, const Slice& key,
+                       const Slice& value) {
+    return Merge(options, DefaultColumnFamily(), key, value);
+  }
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*ts*/,
+                       const Slice& /*value*/);
+
+  // Apply the specified updates to the database.
+  // If `updates` contains no update, WAL will still be synced if
+  // options.sync=true.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the column family specified by "column_family" contains an entry for
+  // "key", return the corresponding value in "*value". If the entry is a plain
+  // key-value, return the value as-is; if it is a wide-column entity, return
+  // the value of its default anonymous column (see kDefaultWideColumnName) if
+  // any, or an empty value otherwise.
+  //
+  // If timestamp is enabled and a non-null timestamp pointer is passed in,
+  // timestamp is returned.
+  //
+  // Returns OK on success. Returns NotFound and an empty value in "*value" if
+  // there is no entry for "key". Returns some other non-OK status on error.
+  virtual inline Status Get(const ReadOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            std::string* value) {
+    assert(value != nullptr);
+    PinnableSlice pinnable_val(value);
+    assert(!pinnable_val.IsPinned());
+    auto s = Get(options, column_family, key, &pinnable_val);
+    if (s.ok() && pinnable_val.IsPinned()) {
+      value->assign(pinnable_val.data(), pinnable_val.size());
+    }  // else value is already assigned
+    return s;
+  }
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) = 0;
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value) {
+    return Get(options, DefaultColumnFamily(), key, value);
+  }
+
+  // Get() methods that return timestamp. Derived DB classes don't need to worry
+  // about this group of methods if they don't care about timestamp feature.
+  virtual inline Status Get(const ReadOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            std::string* value, std::string* timestamp) {
+    assert(value != nullptr);
+    PinnableSlice pinnable_val(value);
+    assert(!pinnable_val.IsPinned());
+    auto s = Get(options, column_family, key, &pinnable_val, timestamp);
+    if (s.ok() && pinnable_val.IsPinned()) {
+      value->assign(pinnable_val.data(), pinnable_val.size());
+    }  // else value is already assigned
+    return s;
+  }
+  virtual Status Get(const ReadOptions& /*options*/,
+                     ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*key*/, PinnableSlice* /*value*/,
+                     std::string* /*timestamp*/) {
+    return Status::NotSupported(
+        "Get() that returns timestamp is not implemented.");
+  }
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value, std::string* timestamp) {
+    return Get(options, DefaultColumnFamily(), key, value, timestamp);
+  }
+
+  // If the column family specified by "column_family" contains an entry for
+  // "key", return it as a wide-column entity in "*columns". If the entry is a
+  // wide-column entity, return it as-is; if it is a plain key-value, return it
+  // as an entity with a single anonymous column (see kDefaultWideColumnName)
+  // which contains the value.
+  //
+  // Returns OK on success. Returns NotFound and an empty wide-column entity in
+  // "*columns" if there is no entry for "key". Returns some other non-OK status
+  // on error.
+  virtual Status GetEntity(const ReadOptions& /* options */,
+                           ColumnFamilyHandle* /* column_family */,
+                           const Slice& /* key */,
+                           PinnableWideColumns* /* columns */) {
+    return Status::NotSupported("GetEntity not supported");
+  }
+
+  // Populates the `merge_operands` array with all the merge operands in the DB
+  // for `key`. The `merge_operands` array will be populated in the order of
+  // insertion. The number of entries populated in `merge_operands` will be
+  // assigned to `*number_of_operands`.
+  //
+  // If the number of merge operands in DB for `key` is greater than
+  // `merge_operands_options.expected_max_number_of_operands`,
+  // `merge_operands` is not populated and the return value is
+  // `Status::Incomplete`. In that case, `*number_of_operands` will be assigned
+  // the number of merge operands found in the DB for `key`.
+  //
+  // `merge_operands`- Points to an array of at-least
+  //             merge_operands_options.expected_max_number_of_operands and the
+  //             caller is responsible for allocating it.
+  //
+  // The caller should delete or `Reset()` the `merge_operands` entries when
+  // they are no longer needed. All `merge_operands` entries must be destroyed
+  // or `Reset()` before this DB is closed or destroyed.
+  virtual Status GetMergeOperands(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      const Slice& key, PinnableSlice* merge_operands,
+      GetMergeOperandsOptions* get_merge_operands_options,
+      int* number_of_operands) = 0;
+
+  // Consistent Get of many keys across column families without the need
+  // for an explicit snapshot. NOTE: the implementation of this MultiGet API
+  // does not have the performance benefits of the void-returning MultiGet
+  // functions.
+  //
+  // If keys[i] does not exist in the database, then the i'th returned
+  // status will be one for which Status::IsNotFound() is true, and
+  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+  // the i'th returned status will have Status::ok() true, and (*values)[i]
+  // will store the value associated with keys[i].
+  //
+  // (*values) will always be resized to be the same size as (keys).
+  // Similarly, the number of returned statuses will be the number of keys.
+  // Note: keys will not be "de-duplicated". Duplicate keys will return
+  // duplicate values in order.
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) {
+    return MultiGet(
+        options,
+        std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
+        keys, values);
+  }
+
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& /*options*/,
+      const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+      const std::vector<Slice>& keys, std::vector<std::string>* /*values*/,
+      std::vector<std::string>* /*timestamps*/) {
+    return std::vector<Status>(
+        keys.size(), Status::NotSupported(
+                         "MultiGet() returning timestamps not implemented."));
+  }
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values,
+                                       std::vector<std::string>* timestamps) {
+    return MultiGet(
+        options,
+        std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()),
+        keys, values, timestamps);
+  }
+
+  // Overloaded MultiGet API that improves performance by batching operations
+  // in the read path for greater efficiency. Currently, only the block based
+  // table format with full filters are supported. Other table formats such
+  // as plain table, block based table with block based filters and
+  // partitioned indexes will still work, but will not get any performance
+  // benefits.
+  // Parameters -
+  // options - ReadOptions
+  // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+  //                 passed to the API are restricted to a single column family
+  // num_keys - Number of keys to lookup
+  // keys - Pointer to C style array of key Slices with num_keys elements
+  // values - Pointer to C style array of PinnableSlices with num_keys elements
+  // statuses - Pointer to C style array of Status with num_keys elements
+  // sorted_input - If true, it means the input keys are already sorted by key
+  //                order, so the MultiGet() API doesn't have to sort them
+  //                again. If false, the keys will be copied and sorted
+  //                internally by the API - the input array will not be
+  //                modified
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_family);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals);
+    std::copy(status.begin(), status.end(), statuses);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, std::string* timestamps,
+                        Status* statuses, const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+    std::vector<std::string> tss;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_family);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals, &tss);
+    std::copy(status.begin(), status.end(), statuses);
+    std::copy(tss.begin(), tss.end(), timestamps);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+
+  // Overloaded MultiGet API that improves performance by batching operations
+  // in the read path for greater efficiency. Currently, only the block based
+  // table format with full filters are supported. Other table formats such
+  // as plain table, block based table with block based filters and
+  // partitioned indexes will still work, but will not get any performance
+  // benefits.
+  // Parameters -
+  // options - ReadOptions
+  // column_family - ColumnFamilyHandle* that the keys belong to. All the keys
+  //                 passed to the API are restricted to a single column family
+  // num_keys - Number of keys to lookup
+  // keys - Pointer to C style array of key Slices with num_keys elements
+  // values - Pointer to C style array of PinnableSlices with num_keys elements
+  // statuses - Pointer to C style array of Status with num_keys elements
+  // sorted_input - If true, it means the input keys are already sorted by key
+  //                order, so the MultiGet() API doesn't have to sort them
+  //                again. If false, the keys will be copied and sorted
+  //                internally by the API - the input array will not be
+  //                modified
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_families[i]);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals);
+    std::copy(status.begin(), status.end(), statuses);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+  virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+                        ColumnFamilyHandle** column_families, const Slice* keys,
+                        PinnableSlice* values, std::string* timestamps,
+                        Status* statuses, const bool /*sorted_input*/ = false) {
+    std::vector<ColumnFamilyHandle*> cf;
+    std::vector<Slice> user_keys;
+    std::vector<Status> status;
+    std::vector<std::string> vals;
+    std::vector<std::string> tss;
+
+    for (size_t i = 0; i < num_keys; ++i) {
+      cf.emplace_back(column_families[i]);
+      user_keys.emplace_back(keys[i]);
+    }
+    status = MultiGet(options, cf, user_keys, &vals, &tss);
+    std::copy(status.begin(), status.end(), statuses);
+    std::copy(tss.begin(), tss.end(), timestamps);
+    for (auto& value : vals) {
+      values->PinSelf(value);
+      values++;
+    }
+  }
+
+  // If the key definitely does not exist in the database, then this method
+  // returns false, else true. If the caller wants to obtain value when the key
+  // is found in memory, a bool for 'value_found' must be passed. 'value_found'
+  // will be true on return if value has been set properly.
+  // This check is potentially lighter-weight than invoking DB::Get(). One way
+  // to make this lighter weight is to avoid doing any IOs.
+  // Default implementation here returns true and sets 'value_found' to false
+  virtual bool KeyMayExist(const ReadOptions& /*options*/,
+                           ColumnFamilyHandle* /*column_family*/,
+                           const Slice& /*key*/, std::string* /*value*/,
+                           std::string* /*timestamp*/,
+                           bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;
+  }
+
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    return KeyMayExist(options, column_family, key, value,
+                       /*timestamp=*/nullptr, value_found);
+  }
+
+  virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+                           std::string* value, bool* value_found = nullptr) {
+    return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
+  }
+
+  virtual bool KeyMayExist(const ReadOptions& options, const Slice& key,
+                           std::string* value, std::string* timestamp,
+                           bool* value_found = nullptr) {
+    return KeyMayExist(options, DefaultColumnFamily(), key, value, timestamp,
+                       value_found);
+  }
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) = 0;
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    return NewIterator(options, DefaultColumnFamily());
+  }
+  // Returns iterators from a consistent database state across multiple
+  // column families. Iterators are heap allocated and need to be deleted
+  // before the db is deleted
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) = 0;
+
+  // Return a handle to the current DB state.  Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state.  The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  //
+  // nullptr will be returned if the DB fails to take a snapshot or does
+  // not support snapshot (eg: inplace_update_support enabled).
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot.  The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+#ifndef ROCKSDB_LITE
+  // Contains all valid property arguments for GetProperty() or
+  // GetMapProperty(). Each is a "string" property for retrieval with
+  // GetProperty() unless noted as a "map" property, for GetMapProperty().
+  //
+  // NOTE: Property names cannot end in numbers since those are interpreted as
+  //       arguments, e.g., see kNumFilesAtLevelPrefix.
+  struct Properties {
+    //  "rocksdb.num-files-at-level<N>" - returns string containing the number
+    //      of files at level <N>, where <N> is an ASCII representation of a
+    //      level number (e.g., "0").
+    static const std::string kNumFilesAtLevelPrefix;
+
+    //  "rocksdb.compression-ratio-at-level<N>" - returns string containing the
+    //      compression ratio of data at level <N>, where <N> is an ASCII
+    //      representation of a level number (e.g., "0"). Here, compression
+    //      ratio is defined as uncompressed data size / compressed file size.
+    //      Returns "-1.0" if no open files at level <N>.
+    static const std::string kCompressionRatioAtLevelPrefix;
+
+    //  "rocksdb.stats" - returns a multi-line string containing the data
+    //      described by kCFStats followed by the data described by kDBStats.
+    static const std::string kStats;
+
+    //  "rocksdb.sstables" - returns a multi-line string summarizing current
+    //      SST files.
+    static const std::string kSSTables;
+
+    //  "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram"
+    //      and "rocksdb.cf-file-histogram" as a "map" property.
+    static const std::string kCFStats;
+
+    //  "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with
+    //      general column family stats per-level over db's lifetime ("L<n>"),
+    //      aggregated over db's lifetime ("Sum"), and aggregated over the
+    //      interval since the last retrieval ("Int").
+    static const std::string kCFStatsNoFileHistogram;
+
+    //  "rocksdb.cf-file-histogram" - print out how many file reads to every
+    //      level, as well as the histogram of latency of single requests.
+    static const std::string kCFFileHistogram;
+
+    //  "rocksdb.dbstats" - As a string property, returns a multi-line string
+    //      with general database stats, both cumulative (over the db's
+    //      lifetime) and interval (since the last retrieval of kDBStats).
+    //      As a map property, returns cumulative stats only and does not
+    //      update the baseline for the interval stats.
+    static const std::string kDBStats;
+
+    //  "rocksdb.levelstats" - returns multi-line string containing the number
+    //      of files per level and total size of each level (MB).
+    static const std::string kLevelStats;
+
+    //  "rocksdb.block-cache-entry-stats" - returns a multi-line string or
+    //      map with statistics on block cache usage. See
+    //      `BlockCacheEntryStatsMapKeys` for structured representation of keys
+    //      available in the map form.
+    static const std::string kBlockCacheEntryStats;
+
+    //  "rocksdb.fast-block-cache-entry-stats" - same as above, but returns
+    //      stale values more frequently to reduce overhead and latency.
+    static const std::string kFastBlockCacheEntryStats;
+
+    //  "rocksdb.num-immutable-mem-table" - returns number of immutable
+    //      memtables that have not yet been flushed.
+    static const std::string kNumImmutableMemTable;
+
+    //  "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable
+    //      memtables that have already been flushed.
+    static const std::string kNumImmutableMemTableFlushed;
+
+    //  "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is
+    //      pending; otherwise, returns 0.
+    static const std::string kMemTableFlushPending;
+
+    //  "rocksdb.num-running-flushes" - returns the number of currently running
+    //      flushes.
+    static const std::string kNumRunningFlushes;
+
+    //  "rocksdb.compaction-pending" - returns 1 if at least one compaction is
+    //      pending; otherwise, returns 0.
+    static const std::string kCompactionPending;
+
+    //  "rocksdb.num-running-compactions" - returns the number of currently
+    //      running compactions.
+    static const std::string kNumRunningCompactions;
+
+    //  "rocksdb.background-errors" - returns accumulated number of background
+    //      errors.
+    static const std::string kBackgroundErrors;
+
+    //  "rocksdb.cur-size-active-mem-table" - returns approximate size of active
+    //      memtable (bytes).
+    static const std::string kCurSizeActiveMemTable;
+
+    //  "rocksdb.cur-size-all-mem-tables" - returns approximate size of active
+    //      and unflushed immutable memtables (bytes).
+    static const std::string kCurSizeAllMemTables;
+
+    //  "rocksdb.size-all-mem-tables" - returns approximate size of active,
+    //      unflushed immutable, and pinned immutable memtables (bytes).
+    static const std::string kSizeAllMemTables;
+
+    //  "rocksdb.num-entries-active-mem-table" - returns total number of entries
+    //      in the active memtable.
+    static const std::string kNumEntriesActiveMemTable;
+
+    //  "rocksdb.num-entries-imm-mem-tables" - returns total number of entries
+    //      in the unflushed immutable memtables.
+    static const std::string kNumEntriesImmMemTables;
+
+    //  "rocksdb.num-deletes-active-mem-table" - returns total number of delete
+    //      entries in the active memtable.
+    static const std::string kNumDeletesActiveMemTable;
+
+    //  "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete
+    //      entries in the unflushed immutable memtables.
+    static const std::string kNumDeletesImmMemTables;
+
+    //  "rocksdb.estimate-num-keys" - returns estimated number of total keys in
+    //      the active and unflushed immutable memtables and storage.
+    static const std::string kEstimateNumKeys;
+
+    //  "rocksdb.estimate-table-readers-mem" - returns estimated memory used for
+    //      reading SST tables, excluding memory used in block cache (e.g.,
+    //      filter and index blocks).
+    static const std::string kEstimateTableReadersMem;
+
+    //  "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete
+    //      files is enabled; otherwise, returns a non-zero number.
+    //  This name may be misleading because true(non-zero) means disable,
+    //  but we keep the name for backward compatibility.
+    static const std::string kIsFileDeletionsEnabled;
+
+    //  "rocksdb.num-snapshots" - returns number of unreleased snapshots of the
+    //      database.
+    static const std::string kNumSnapshots;
+
+    //  "rocksdb.oldest-snapshot-time" - returns number representing unix
+    //      timestamp of oldest unreleased snapshot.
+    static const std::string kOldestSnapshotTime;
+
+    //  "rocksdb.oldest-snapshot-sequence" - returns number representing
+    //      sequence number of oldest unreleased snapshot.
+    static const std::string kOldestSnapshotSequence;
+
+    //  "rocksdb.num-live-versions" - returns number of live versions. `Version`
+    //      is an internal data structure. See version_set.h for details. More
+    //      live versions often mean more SST files are held from being deleted,
+    //      by iterators or unfinished compactions.
+    static const std::string kNumLiveVersions;
+
+    //  "rocksdb.current-super-version-number" - returns number of current LSM
+    //  version. It is a uint64_t integer number, incremented after there is
+    //  any change to the LSM tree. The number is not preserved after restarting
+    //  the DB. After DB restart, it will start from 0 again.
+    static const std::string kCurrentSuperVersionNumber;
+
+    //  "rocksdb.estimate-live-data-size" - returns an estimate of the amount of
+    //      live data in bytes. For BlobDB, it also includes the exact value of
+    //      live bytes in the blob files of the version.
+    static const std::string kEstimateLiveDataSize;
+
+    //  "rocksdb.min-log-number-to-keep" - return the minimum log number of the
+    //      log files that should be kept.
+    static const std::string kMinLogNumberToKeep;
+
+    //  "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file
+    //      number for an obsolete SST to be kept. The max value of `uint64_t`
+    //      will be returned if all obsolete files can be deleted.
+    static const std::string kMinObsoleteSstNumberToKeep;
+
+    //  "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST
+    //      files.
+    //  WARNING: may slow down online queries if there are too many files.
+    static const std::string kTotalSstFilesSize;
+
+    //  "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST
+    //      files belong to the latest LSM tree.
+    static const std::string kLiveSstFilesSize;
+
+    // "rocksdb.live_sst_files_size_at_temperature" - returns total size (bytes)
+    //      of SST files at all certain file temperature
+    static const std::string kLiveSstFilesSizeAtTemperature;
+
+    //  "rocksdb.base-level" - returns number of level to which L0 data will be
+    //      compacted.
+    static const std::string kBaseLevel;
+
+    //  "rocksdb.estimate-pending-compaction-bytes" - returns estimated total
+    //      number of bytes compaction needs to rewrite to get all levels down
+    //      to under target size. Not valid for other compactions than level-
+    //      based.
+    static const std::string kEstimatePendingCompactionBytes;
+
+    //  "rocksdb.aggregated-table-properties" - returns a string or map
+    //      representation of the aggregated table properties of the target
+    //      column family. Only properties that make sense for aggregation
+    //      are included.
+    static const std::string kAggregatedTableProperties;
+
+    //  "rocksdb.aggregated-table-properties-at-level<N>", same as the previous
+    //      one but only returns the aggregated table properties of the
+    //      specified level "N" at the target column family.
+    static const std::string kAggregatedTablePropertiesAtLevel;
+
+    //  "rocksdb.actual-delayed-write-rate" - returns the current actual delayed
+    //      write rate. 0 means no delay.
+    static const std::string kActualDelayedWriteRate;
+
+    //  "rocksdb.is-write-stopped" - Return 1 if write has been stopped.
+    static const std::string kIsWriteStopped;
+
+    //  "rocksdb.estimate-oldest-key-time" - returns an estimation of
+    //      oldest key timestamp in the DB. Currently only available for
+    //      FIFO compaction with
+    //      compaction_options_fifo.allow_compaction = false.
+    static const std::string kEstimateOldestKeyTime;
+
+    //  "rocksdb.block-cache-capacity" - returns block cache capacity.
+    static const std::string kBlockCacheCapacity;
+
+    //  "rocksdb.block-cache-usage" - returns the memory size for the entries
+    //      residing in block cache.
+    static const std::string kBlockCacheUsage;
+
+    // "rocksdb.block-cache-pinned-usage" - returns the memory size for the
+    //      entries being pinned.
+    static const std::string kBlockCachePinnedUsage;
+
+    // "rocksdb.options-statistics" - returns multi-line string
+    //      of options.statistics
+    static const std::string kOptionsStatistics;
+
+    // "rocksdb.num-blob-files" - returns number of blob files in the current
+    //      version.
+    static const std::string kNumBlobFiles;
+
+    // "rocksdb.blob-stats" - return the total number and size of all blob
+    //      files, and total amount of garbage (bytes) in the blob files in
+    //      the current version.
+    static const std::string kBlobStats;
+
+    // "rocksdb.total-blob-file-size" - returns the total size of all blob
+    //      files over all versions.
+    static const std::string kTotalBlobFileSize;
+
+    // "rocksdb.live-blob-file-size" - returns the total size of all blob
+    //      files in the current version.
+    static const std::string kLiveBlobFileSize;
+
+    // "rocksdb.live-blob-file-garbage-size" - returns the total amount of
+    // garbage in the blob files in the current version.
+    static const std::string kLiveBlobFileGarbageSize;
+
+    //  "rocksdb.blob-cache-capacity" - returns blob cache capacity.
+    static const std::string kBlobCacheCapacity;
+
+    //  "rocksdb.blob-cache-usage" - returns the memory size for the entries
+    //      residing in blob cache.
+    static const std::string kBlobCacheUsage;
+
+    // "rocksdb.blob-cache-pinned-usage" - returns the memory size for the
+    //      entries being pinned in blob cache.
+    static const std::string kBlobCachePinnedUsage;
+  };
+#endif /* ROCKSDB_LITE */
+
+  // DB implementations export properties about their state via this method.
+  // If "property" is a valid "string" property understood by this DB
+  // implementation (see Properties struct above for valid options), fills
+  // "*value" with its current value and returns true.  Otherwise, returns
+  // false.
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) = 0;
+  virtual bool GetProperty(const Slice& property, std::string* value) {
+    return GetProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // Like GetProperty but for valid "map" properties. (Some properties can be
+  // accessed as either "string" properties or "map" properties.)
+  virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property,
+                              std::map<std::string, std::string>* value) = 0;
+  virtual bool GetMapProperty(const Slice& property,
+                              std::map<std::string, std::string>* value) {
+    return GetMapProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // Similar to GetProperty(), but only works for a subset of properties whose
+  // return value is an integer. Return the value by integer. Supported
+  // properties:
+  //  "rocksdb.num-immutable-mem-table"
+  //  "rocksdb.mem-table-flush-pending"
+  //  "rocksdb.compaction-pending"
+  //  "rocksdb.background-errors"
+  //  "rocksdb.cur-size-active-mem-table"
+  //  "rocksdb.cur-size-all-mem-tables"
+  //  "rocksdb.size-all-mem-tables"
+  //  "rocksdb.num-entries-active-mem-table"
+  //  "rocksdb.num-entries-imm-mem-tables"
+  //  "rocksdb.num-deletes-active-mem-table"
+  //  "rocksdb.num-deletes-imm-mem-tables"
+  //  "rocksdb.estimate-num-keys"
+  //  "rocksdb.estimate-table-readers-mem"
+  //  "rocksdb.is-file-deletions-enabled"
+  //  "rocksdb.num-snapshots"
+  //  "rocksdb.oldest-snapshot-time"
+  //  "rocksdb.num-live-versions"
+  //  "rocksdb.current-super-version-number"
+  //  "rocksdb.estimate-live-data-size"
+  //  "rocksdb.min-log-number-to-keep"
+  //  "rocksdb.min-obsolete-sst-number-to-keep"
+  //  "rocksdb.total-sst-files-size"
+  //  "rocksdb.live-sst-files-size"
+  //  "rocksdb.base-level"
+  //  "rocksdb.estimate-pending-compaction-bytes"
+  //  "rocksdb.num-running-compactions"
+  //  "rocksdb.num-running-flushes"
+  //  "rocksdb.actual-delayed-write-rate"
+  //  "rocksdb.is-write-stopped"
+  //  "rocksdb.estimate-oldest-key-time"
+  //  "rocksdb.block-cache-capacity"
+  //  "rocksdb.block-cache-usage"
+  //  "rocksdb.block-cache-pinned-usage"
+  //
+  //  Properties dedicated for BlobDB:
+  //  "rocksdb.num-blob-files"
+  //  "rocksdb.total-blob-file-size"
+  //  "rocksdb.live-blob-file-size"
+  //  "rocksdb.blob-cache-capacity"
+  //  "rocksdb.blob-cache-usage"
+  //  "rocksdb.blob-cache-pinned-usage"
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) = 0;
+  virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
+    return GetIntProperty(DefaultColumnFamily(), property, value);
+  }
+
+  // Reset internal stats for DB and all column families.
+  // Note this doesn't reset options.statistics as it is not owned by
+  // DB.
+  virtual Status ResetStats() {
+    return Status::NotSupported("Not implemented");
+  }
+
+  // Same as GetIntProperty(), but this one returns the aggregated int
+  // property from all column families.
+  virtual bool GetAggregatedIntProperty(const Slice& property,
+                                        uint64_t* value) = 0;
+
+  // Flags for DB::GetSizeApproximation that specify whether memtable
+  // stats should be included, or file stats approximation or both
+  enum class SizeApproximationFlags : uint8_t {
+    NONE = 0,
+    INCLUDE_MEMTABLES = 1 << 0,
+    INCLUDE_FILES = 1 << 1
+  };
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)"
+  // in a single column family.
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* ranges, int n,
+                                     uint64_t* sizes) = 0;
+
+  // Simpler versions of the GetApproximateSizes() method above.
+  // The include_flags argument must of type DB::SizeApproximationFlags
+  // and can not be NONE.
+  virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                     const Range* ranges, int n,
+                                     uint64_t* sizes,
+                                     SizeApproximationFlags include_flags =
+                                         SizeApproximationFlags::INCLUDE_FILES);
+
+  virtual Status GetApproximateSizes(
+      const Range* ranges, int n, uint64_t* sizes,
+      SizeApproximationFlags include_flags =
+          SizeApproximationFlags::INCLUDE_FILES) {
+    return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes,
+                               include_flags);
+  }
+
+  // The method is similar to GetApproximateSizes, except it
+  // returns approximate number of records in memtables.
+  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+                                           const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) = 0;
+  virtual void GetApproximateMemTableStats(const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) {
+    GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size);
+  }
+
+  // Compact the underlying storage for the key range [*begin,*end].
+  // The actual compaction interval might be superset of [*begin, *end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data.  This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  // This call blocks until the operation completes successfully, fails,
+  // or is aborted (Status::Incomplete). See DisableManualCompaction.
+  //
+  // begin==nullptr is treated as a key before all keys in the database.
+  // end==nullptr is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(options, nullptr, nullptr);
+  // Note that after the entire database is compacted, all data are pushed
+  // down to the last level containing any data. If the total data size after
+  // compaction is reduced, that level might not be appropriate for hosting all
+  // the files. In this case, client could set options.change_level to true, to
+  // move the files back to the minimum level capable of holding the data set
+  // or a given level (specified by non-negative options.target_level).
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) = 0;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              const Slice* begin, const Slice* end) {
+    return CompactRange(options, DefaultColumnFamily(), begin, end);
+  }
+
+  // Dynamically change column family options or table factory options in a
+  // running DB, for the specified column family. Only options internally
+  // marked as "mutable" can be changed. Options not listed in `opts_map` will
+  // keep their current values. See GetColumnFamilyOptionsFromMap() in
+  // convenience.h for the details of `opts_map`. Not supported in LITE mode.
+  //
+  // USABILITY NOTE: SetOptions is intended only for expert users, and does
+  // not apply the same sanitization to options as the standard DB::Open code
+  // path does. Use with caution.
+  //
+  // RELIABILITY & PERFORMANCE NOTE: SetOptions is not fully stress-tested for
+  // reliability, and this is a slow call because a new OPTIONS file is
+  // serialized and persisted for each call. Use only infrequently.
+  //
+  // EXAMPLES:
+  //  s = db->SetOptions(cfh, {{"ttl", "36000"}});
+  //  s = db->SetOptions(cfh, {{"block_based_table_factory",
+  //                            "{prepopulate_block_cache=kDisable;}"}});
+  virtual Status SetOptions(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::unordered_map<std::string, std::string>& /*opts_map*/) {
+    return Status::NotSupported("Not implemented");
+  }
+  // Shortcut for SetOptions on the default column family handle.
+  virtual Status SetOptions(
+      const std::unordered_map<std::string, std::string>& new_options) {
+    return SetOptions(DefaultColumnFamily(), new_options);
+  }
+
+  // Like SetOptions but for DBOptions, including the same caveats for
+  // usability, reliability, and performance. See GetDBOptionsFromMap() (and
+  // GetColumnFamilyOptionsFromMap()) in convenience.h for details on
+  // `opts_map`. Note supported in LITE mode.
+  //
+  // EXAMPLES:
+  //  s = db->SetDBOptions({{"max_subcompactions", "2"}});
+  //  s = db->SetDBOptions({{"stats_dump_period_sec", "0"},
+  //                        {"stats_persist_period_sec", "0"}});
+  virtual Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& new_options) = 0;
+
+  // CompactFiles() inputs a list of files specified by file numbers and
+  // compacts them to the specified level. A small difference compared to
+  // CompactRange() is that CompactFiles() performs the compaction job
+  // using the CURRENT thread, so is not considered a "background" job.
+  //
+  // @see GetDataBaseMetaData
+  // @see GetColumnFamilyMetaData
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) = 0;
+
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) {
+    return CompactFiles(compact_options, DefaultColumnFamily(),
+                        input_file_names, output_level, output_path_id,
+                        output_file_names, compaction_job_info);
+  }
+
+  // This function will wait until all currently running background processes
+  // finish. After it returns, no background process will be run until
+  // ContinueBackgroundWork is called, once for each preceding OK-returning
+  // call to PauseBackgroundWork.
+  virtual Status PauseBackgroundWork() = 0;
+  virtual Status ContinueBackgroundWork() = 0;
+
+  // This function will enable automatic compactions for the given column
+  // families if they were previously disabled. The function will first set the
+  // disable_auto_compactions option for each column family to 'false', after
+  // which it will schedule a flush/compaction.
+  //
+  // NOTE: Setting disable_auto_compactions to 'false' through SetOptions() API
+  // does NOT schedule a flush/compaction afterwards, and only changes the
+  // parameter itself within the column family option.
+  //
+  virtual Status EnableAutoCompaction(
+      const std::vector<ColumnFamilyHandle*>& column_family_handles) = 0;
+
+  // After this function call, CompactRange() or CompactFiles() will not
+  // run compactions and fail. Calling this function will tell outstanding
+  // manual compactions to abort and will wait for them to finish or abort
+  // before returning.
+  virtual void DisableManualCompaction() = 0;
+  // Re-enable CompactRange() and ComapctFiles() that are disabled by
+  // DisableManualCompaction(). This function must be called as many times
+  // as DisableManualCompaction() has been called in order to re-enable
+  // manual compactions, and must not be called more times than
+  // DisableManualCompaction() has been called.
+  virtual void EnableManualCompaction() = 0;
+
+  // Number of levels used for this DB.
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
+  virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
+  virtual int MaxMemCompactionLevel() {
+    return MaxMemCompactionLevel(DefaultColumnFamily());
+  }
+
+  // Number of files in level-0 that would stop writes.
+  virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
+  virtual int Level0StopWriteTrigger() {
+    return Level0StopWriteTrigger(DefaultColumnFamily());
+  }
+
+  // Get DB name -- the exact same name that was provided as an argument to
+  // DB::Open()
+  virtual const std::string& GetName() const = 0;
+
+  // Get Env object from the DB
+  virtual Env* GetEnv() const = 0;
+
+  // A shortcut for GetEnv()->->GetFileSystem().get(), possibly cached for
+  // efficiency.
+  virtual FileSystem* GetFileSystem() const;
+
+  // Get DB Options that we use.  During the process of opening the
+  // column family, the options provided when calling DB::Open() or
+  // DB::CreateColumnFamily() will have been "sanitized" and transformed
+  // in an implementation-defined manner.
+  virtual Options GetOptions(ColumnFamilyHandle* column_family) const = 0;
+  virtual Options GetOptions() const {
+    return GetOptions(DefaultColumnFamily());
+  }
+
+  virtual DBOptions GetDBOptions() const = 0;
+
+  // Flush all mem-table data.
+  // Flush a single column family, even when atomic flush is enabled. To flush
+  // multiple column families, use Flush(options, column_families).
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) = 0;
+  virtual Status Flush(const FlushOptions& options) {
+    return Flush(options, DefaultColumnFamily());
+  }
+  // Flushes multiple column families.
+  // If atomic flush is not enabled, Flush(options, column_families) is
+  // equivalent to calling Flush(options, column_family) multiple times.
+  // If atomic flush is enabled, Flush(options, column_families) will flush all
+  // column families specified in 'column_families' up to the latest sequence
+  // number at the time when flush is requested.
+  // Note that RocksDB 5.15 and earlier may not be able to open later versions
+  // with atomic flush enabled.
+  virtual Status Flush(
+      const FlushOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families) = 0;
+
+  // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL
+  // afterwards.
+  virtual Status FlushWAL(bool /*sync*/) {
+    return Status::NotSupported("FlushWAL not implemented");
+  }
+  // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the
+  // same as Write() with sync=true: in the latter case the changes won't be
+  // visible until the sync is done.
+  // Currently only works if allow_mmap_writes = false in Options.
+  virtual Status SyncWAL() = 0;
+
+  // Lock the WAL. Also flushes the WAL after locking.
+  virtual Status LockWAL() {
+    return Status::NotSupported("LockWAL not implemented");
+  }
+
+  // Unlock the WAL.
+  virtual Status UnlockWAL() {
+    return Status::NotSupported("UnlockWAL not implemented");
+  }
+
+  // The sequence number of the most recent transaction.
+  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+  // Prevent file deletions. Compactions will continue to occur,
+  // but no obsolete files will be deleted. Calling this multiple
+  // times have the same effect as calling it once.
+  virtual Status DisableFileDeletions() = 0;
+
+  // Increase the full_history_ts of column family. The new ts_low value should
+  // be newer than current full_history_ts value.
+  // If another thread updates full_history_ts_low concurrently to a higher
+  // timestamp than the requested ts_low, a try again error will be returned.
+  virtual Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                          std::string ts_low) = 0;
+
+  // Get current full_history_ts value.
+  virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                     std::string* ts_low) = 0;
+
+  // Allow compactions to delete obsolete files.
+  // If force == true, the call to EnableFileDeletions() will guarantee that
+  // file deletions are enabled after the call, even if DisableFileDeletions()
+  // was called multiple times before.
+  // If force == false, EnableFileDeletions will only enable file deletion
+  // after it's been called at least as many times as DisableFileDeletions(),
+  // enabling the two methods to be called by two threads concurrently without
+  // synchronization -- i.e., file deletions will be enabled only after both
+  // threads call EnableFileDeletions()
+  virtual Status EnableFileDeletions(bool force = true) = 0;
+
+#ifndef ROCKSDB_LITE
+  // Retrieves the creation time of the oldest file in the DB.
+  // This API only works if max_open_files = -1, if it is not then
+  // Status returned is Status::NotSupported()
+  // The file creation time is set using the env provided to the DB.
+  // If the DB was created from a very old release then its possible that
+  // the SST files might not have file_creation_time property and even after
+  // moving to a newer release its possible that some files never got compacted
+  // and may not have file_creation_time property. In both the cases
+  // file_creation_time is considered 0 which means this API will return
+  // creation_time = 0 as there wouldn't be a timestamp lower than 0.
+  virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0;
+
+  // Note: this API is not yet consistent with WritePrepared transactions.
+  //
+  // Sets iter to an iterator that is positioned at a write-batch whose
+  // sequence number range [start_seq, end_seq] covers seq_number. If no such
+  // write-batch exists, then iter is positioned at the next write-batch whose
+  // start_seq > seq_number.
+  //
+  // Returns Status::OK if iterator is valid
+  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+  // use this api, else the WAL files will get
+  // cleared aggressively and the iterator might keep getting invalid before
+  // an update is read.
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options =
+          TransactionLogIterator::ReadOptions()) = 0;
+
+// Windows API macro interference
+#undef DeleteFile
+  // WARNING: This API is planned for removal in RocksDB 7.0 since it does not
+  // operate at the proper level of abstraction for a key-value store, and its
+  // contract/restrictions are poorly documented. For example, it returns non-OK
+  // `Status` for non-bottommost files and files undergoing compaction. Since we
+  // do not plan to maintain it, the contract will likely remain underspecified
+  // until its removal. Any user is encouraged to read the implementation
+  // carefully and migrate away from it when possible.
+  //
+  // Delete the file name from the db directory and update the internal state to
+  // reflect that. Supports deletion of sst and log files only. 'name' must be
+  // path relative to the db directory. eg. 000001.sst, /archive/000003.log
+  virtual Status DeleteFile(std::string name) = 0;
+
+  // Obtains a list of all live table (SST) files and how they fit into the
+  // LSM-trees, such as column family, level, key range, etc.
+  // This builds a de-normalized form of GetAllColumnFamilyMetaData().
+  // For information about all files in a DB, use GetLiveFilesStorageInfo().
+  virtual void GetLiveFilesMetaData(
+      std::vector<LiveFileMetaData>* /*metadata*/) {}
+
+  // Return a list of all table (SST) and blob files checksum info.
+  // Note: This function might be of limited use because it cannot be
+  // synchronized with other "live files" APIs. GetLiveFilesStorageInfo()
+  // is recommended instead.
+  virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0;
+
+  // Get information about all live files that make up a DB, for making
+  // live copies (Checkpoint, backups, etc.) or other storage-related purposes.
+  // If creating a live copy, use DisableFileDeletions() before and
+  // EnableFileDeletions() after to prevent deletions.
+  // For LSM-tree metadata, use Get*MetaData() functions instead.
+  virtual Status GetLiveFilesStorageInfo(
+      const LiveFilesStorageInfoOptions& opts,
+      std::vector<LiveFileStorageInfo>* files) = 0;
+
+  // Obtains the LSM-tree meta data of the specified column family of the DB,
+  // including metadata for each live table (SST) file in that column family.
+  virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+                                       ColumnFamilyMetaData* /*metadata*/) {}
+
+  // Get the metadata of the default column family.
+  void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
+    GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
+  }
+
+  // Obtains the LSM-tree meta data of all column families of the DB, including
+  // metadata for each live table (SST) file and each blob file in the DB.
+  virtual void GetAllColumnFamilyMetaData(
+      std::vector<ColumnFamilyMetaData>* /*metadata*/) {}
+
+  // Retrieve the list of all files in the database except WAL files. The files
+  // are relative to the dbname (or db_paths/cf_paths), not absolute paths.
+  // (Not recommended with db_paths/cf_paths because that information is not
+  // returned.) Despite being relative paths, the file names begin with "/".
+  // The valid size of the manifest file is returned in manifest_file_size.
+  // The manifest file is an ever growing file, but only the portion specified
+  // by manifest_file_size is valid for this snapshot. Setting flush_memtable
+  // to true does Flush before recording the live files (unless DB is
+  // read-only). Setting flush_memtable to false is useful when we don't want
+  // to wait for flush which may have to wait for compaction to complete
+  // taking an indeterminate time.
+  //
+  // NOTE: Although GetLiveFiles() followed by GetSortedWalFiles() can generate
+  // a lossless backup, GetLiveFilesStorageInfo() is strongly recommended
+  // instead, because it ensures a single consistent view of all files is
+  // captured in one call.
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) = 0;
+
+  // Retrieve the sorted list of all wal files with earliest file first
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+  // Retrieve information about the current wal file
+  //
+  // Note that the log might have rolled after this call in which case
+  // the current_log_file would not point to the current log file.
+  //
+  // Additionally, for the sake of optimization current_log_file->StartSequence
+  // would always be set to 0
+  virtual Status GetCurrentWalFile(
+      std::unique_ptr<LogFile>* current_log_file) = 0;
+
+  // IngestExternalFile() will load a list of external SST files (1) into the DB
+  // Two primary modes are supported:
+  // - Duplicate keys in the new files will overwrite exiting keys (default)
+  // - Duplicate keys will be skipped (set ingest_behind=true)
+  // In the first mode we will try to find the lowest possible level that
+  // the file can fit in, and ingest the file into this level (2). A file that
+  // have a key range that overlap with the memtable key range will require us
+  // to Flush the memtable first before ingesting the file.
+  // In the second mode we will always ingest in the bottom most level (see
+  // docs to IngestExternalFileOptions::ingest_behind).
+  //
+  // (1) External SST files can be created using SstFileWriter
+  // (2) We will try to ingest the files to the lowest possible level
+  //     even if the file compression doesn't match the level compression
+  // (3) If IngestExternalFileOptions->ingest_behind is set to true,
+  //     we always ingest at the bottommost level, which should be reserved
+  //     for this purpose (see DBOPtions::allow_ingest_behind flag).
+  // (4) If IngestExternalFileOptions->fail_if_not_bottommost_level is set to
+  //     true, then this method can return Status:TryAgain() indicating that
+  //     the files cannot be ingested to the bottommost level, and it is the
+  //     user's responsibility to clear the bottommost level in the overlapping
+  //     range before re-attempting the ingestion.
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& options) = 0;
+
+  virtual Status IngestExternalFile(
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& options) {
+    return IngestExternalFile(DefaultColumnFamily(), external_files, options);
+  }
+
+  // IngestExternalFiles() will ingest files for multiple column families, and
+  // record the result atomically to the MANIFEST.
+  // If this function returns OK, all column families' ingestion must succeed.
+  // If this function returns NOK, or the process crashes, then non-of the
+  // files will be ingested into the database after recovery.
+  // Note that it is possible for application to observe a mixed state during
+  // the execution of this function. If the user performs range scan over the
+  // column families with iterators, iterator on one column family may return
+  // ingested data, while iterator on other column family returns old data.
+  // Users can use snapshot for a consistent view of data.
+  // If your db ingests multiple SST files using this API, i.e. args.size()
+  // > 1, then RocksDB 5.15 and earlier will not be able to open it.
+  //
+  // REQUIRES: each arg corresponds to a different column family: namely, for
+  // 0 <= i < j < len(args), args[i].column_family != args[j].column_family.
+  virtual Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& args) = 0;
+
+  // CreateColumnFamilyWithImport() will create a new column family with
+  // column_family_name and import external SST files specified in metadata into
+  // this column family.
+  // (1) External SST files can be created using SstFileWriter.
+  // (2) External SST files can be exported from a particular column family in
+  //     an existing DB using Checkpoint::ExportColumnFamily.
+  // Option in import_options specifies whether the external files are copied or
+  // moved (default is copy). When option specifies copy, managing files at
+  // external_file_path is caller's responsibility. When option specifies a
+  // move, the call makes a best effort to delete the specified files at
+  // external_file_path on successful return, logging any failure to delete
+  // rather than returning in Status. Files are not modified on any error
+  // return, and a best effort is made to remove any newly-created files.
+  // On error return, column family handle returned will be nullptr.
+  // ColumnFamily will be present on successful return and will not be present
+  // on error return. ColumnFamily may be present on any crash during this call.
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) = 0;
+
+  // Verify the checksums of files in db. Currently the whole-file checksum of
+  // table files are checked.
+  virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) {
+    return Status::NotSupported("File verification not supported");
+  }
+
+  // Verify the block checksums of files in db. The block checksums of table
+  // files are checked.
+  virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
+
+  virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
+
+#endif  // ROCKSDB_LITE
+
+  // Returns the unique ID which is read from IDENTITY file during the opening
+  // of database by setting in the identity variable
+  // Returns Status::OK if identity could be set properly
+  virtual Status GetDbIdentity(std::string& identity) const = 0;
+
+  // Return a unique identifier for each DB object that is opened
+  // This DB session ID should be unique among all open DB instances on all
+  // hosts, and should be unique among re-openings of the same or other DBs.
+  // (Two open DBs have the same identity from other function GetDbIdentity when
+  // one is physically copied from the other.)
+  virtual Status GetDbSessionId(std::string& session_id) const = 0;
+
+  // Returns default column family handle
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
+
+#ifndef ROCKSDB_LITE
+
+  virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+                                          TablePropertiesCollection* props) = 0;
+  virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+    return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
+  }
+  virtual Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+      TablePropertiesCollection* props) = 0;
+
+  virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
+                                     const Slice* /*begin*/,
+                                     const Slice* /*end*/) {
+    return Status::NotSupported("SuggestCompactRange() is not implemented.");
+  }
+
+  virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/,
+                           int /*target_level*/) {
+    return Status::NotSupported("PromoteL0() is not implemented.");
+  }
+
+  // Trace DB operations. Use EndTrace() to stop tracing.
+  virtual Status StartTrace(const TraceOptions& /*options*/,
+                            std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartTrace() is not implemented.");
+  }
+
+  virtual Status EndTrace() {
+    return Status::NotSupported("EndTrace() is not implemented.");
+  }
+
+  // IO Tracing operations. Use EndIOTrace() to stop tracing.
+  virtual Status StartIOTrace(const TraceOptions& /*options*/,
+                              std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartIOTrace() is not implemented.");
+  }
+
+  virtual Status EndIOTrace() {
+    return Status::NotSupported("EndIOTrace() is not implemented.");
+  }
+
+  // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
+  virtual Status StartBlockCacheTrace(
+      const TraceOptions& /*trace_options*/,
+      std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
+  }
+
+  virtual Status StartBlockCacheTrace(
+      const BlockCacheTraceOptions& /*options*/,
+      std::unique_ptr<BlockCacheTraceWriter>&& /*trace_writer*/) {
+    return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
+  }
+
+  virtual Status EndBlockCacheTrace() {
+    return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
+  }
+
+  // Create a default trace replayer.
+  virtual Status NewDefaultReplayer(
+      const std::vector<ColumnFamilyHandle*>& /*handles*/,
+      std::unique_ptr<TraceReader>&& /*reader*/,
+      std::unique_ptr<Replayer>* /*replayer*/) {
+    return Status::NotSupported("NewDefaultReplayer() is not implemented.");
+  }
+
+#endif  // ROCKSDB_LITE
+
+  // Needed for StackableDB
+  virtual DB* GetRootDB() { return this; }
+
+  // Given a window [start_time, end_time), setup a StatsHistoryIterator
+  // to access stats history. Note the start_time and end_time are epoch
+  // time measured in seconds, and end_time is an exclusive bound.
+  virtual Status GetStatsHistory(
+      uint64_t /*start_time*/, uint64_t /*end_time*/,
+      std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) {
+    return Status::NotSupported("GetStatsHistory() is not implemented.");
+  }
+
+#ifndef ROCKSDB_LITE
+  // Make the secondary instance catch up with the primary by tailing and
+  // replaying the MANIFEST and WAL of the primary.
+  // Column families created by the primary after the secondary instance starts
+  // will be ignored unless the secondary instance closes and restarts with the
+  // newly created column families.
+  // Column families that exist before secondary instance starts and dropped by
+  // the primary afterwards will be marked as dropped. However, as long as the
+  // secondary instance does not delete the corresponding column family
+  // handles, the data of the column family is still accessible to the
+  // secondary.
+  virtual Status TryCatchUpWithPrimary() {
+    return Status::NotSupported("Supported only by secondary instance");
+  }
+#endif  // !ROCKSDB_LITE
+};
+
+// Overloaded operators for enum class SizeApproximationFlags.
+inline DB::SizeApproximationFlags operator&(DB::SizeApproximationFlags lhs,
+                                            DB::SizeApproximationFlags rhs) {
+  return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) &
+                                                 static_cast<uint8_t>(rhs));
+}
+inline DB::SizeApproximationFlags operator|(DB::SizeApproximationFlags lhs,
+                                            DB::SizeApproximationFlags rhs) {
+  return static_cast<DB::SizeApproximationFlags>(static_cast<uint8_t>(lhs) |
+                                                 static_cast<uint8_t>(rhs));
+}
+
+inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                      const Range* ranges, int n,
+                                      uint64_t* sizes,
+                                      SizeApproximationFlags include_flags) {
+  SizeApproximationOptions options;
+  options.include_memtables =
+      ((include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) !=
+       SizeApproximationFlags::NONE);
+  options.include_files =
+      ((include_flags & SizeApproximationFlags::INCLUDE_FILES) !=
+       SizeApproximationFlags::NONE);
+  return GetApproximateSizes(options, column_family, ranges, n, sizes);
+}
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options,
+                 const std::vector<ColumnFamilyDescriptor>& column_families =
+                     std::vector<ColumnFamilyDescriptor>());
+
+#ifndef ROCKSDB_LITE
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+//
+// With this API, we will warn and skip data associated with column families not
+// specified in column_families.
+//
+// @param column_families Descriptors for known column families
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families);
+
+// @param unknown_cf_opts Options for column families encountered during the
+//                        repair that were not specified in column_families.
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+                const std::vector<ColumnFamilyDescriptor>& column_families,
+                const ColumnFamilyOptions& unknown_cf_opts);
+
+// @param options These options will be used for the database and for ALL column
+//                families encountered during the repair
+Status RepairDB(const std::string& dbname, const Options& options);
+
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db_bench_tool.h b/src/rocksdb/include/rocksdb/db_bench_tool.h
new file mode 100644
index 000000000..17f4e6bde
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_bench_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_bench_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/db_dump_tool.h b/src/rocksdb/include/rocksdb/db_dump_tool.h
new file mode 100644
index 000000000..b7d4766a2
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_dump_tool.h
@@ -0,0 +1,45 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct DumpOptions {
+  // Database that will be dumped
+  std::string db_path;
+  // File location that will contain dump output
+  std::string dump_location;
+  // Don't include db information header in the dump
+  bool anonymous = false;
+};
+
+class DbDumpTool {
+ public:
+  bool Run(const DumpOptions& dump_options,
+           ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
+
+struct UndumpOptions {
+  // Database that we will load the dumped file into
+  std::string db_path;
+  // File location of the dumped file that will be loaded
+  std::string dump_location;
+  // Compact the db after loading the dumped file
+  bool compact_db = false;
+};
+
+class DbUndumpTool {
+ public:
+  bool Run(const UndumpOptions& undump_options,
+           ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options());
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/db_stress_tool.h b/src/rocksdb/include/rocksdb/db_stress_tool.h
new file mode 100644
index 000000000..7d3d42c9d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_stress_tool.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+int db_stress_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/env.h b/src/rocksdb/include/rocksdb/env.h
new file mode 100644
index 000000000..bef60a212
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/env.h
@@ -0,0 +1,1893 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine gain control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <cstdarg>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/functor_wrapper.h"
+#include "rocksdb/status.h"
+#include "rocksdb/thread_status.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#undef GetCurrentTime
+#undef LoadLibrary
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \
+  __attribute__((__format__(__printf__, format_param, dots_param)))
+#else
+#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DynamicLibrary;
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+struct DataVerificationInfo;
+class WritableFile;
+class RandomRWFile;
+class MemoryMappedFileBuffer;
+class Directory;
+struct DBOptions;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+class RateLimiter;
+class ThreadStatusUpdater;
+struct ThreadStatus;
+class FileSystem;
+class SystemClock;
+struct ConfigOptions;
+
+const size_t kDefaultPageSize = 4 * 1024;
+
+enum class CpuPriority {
+  kIdle = 0,
+  kLow = 1,
+  kNormal = 2,
+  kHigh = 3,
+};
+
+// Options while opening a file to read/write
+struct EnvOptions {
+  // Construct with default Options
+  EnvOptions();
+
+  // Construct from Options
+  explicit EnvOptions(const DBOptions& options);
+
+  // If true, then use mmap to read data.
+  // Not recommended for 32-bit OS.
+  bool use_mmap_reads = false;
+
+  // If true, then use mmap to write data
+  bool use_mmap_writes = true;
+
+  // If true, then use O_DIRECT for reading data
+  bool use_direct_reads = false;
+
+  // If true, then use O_DIRECT for writing data
+  bool use_direct_writes = false;
+
+  // If false, fallocate() calls are bypassed
+  bool allow_fallocate = true;
+
+  // If true, set the FD_CLOEXEC on open fd.
+  bool set_fd_cloexec = true;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, in the background. Issue one request for every bytes_per_sync
+  // written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync = 0;
+
+  // When true, guarantees the file has at most `bytes_per_sync` bytes submitted
+  // for writeback at any given time.
+  //
+  //  - If `sync_file_range` is supported it achieves this by waiting for any
+  //    prior `sync_file_range`s to finish before proceeding. In this way,
+  //    processing (compression, etc.) can proceed uninhibited in the gap
+  //    between `sync_file_range`s, and we block only when I/O falls behind.
+  //  - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
+  //    always blocks, thus preventing the interleaving of I/O and processing.
+  //
+  // Note: Enabling this option does not provide any additional persistence
+  // guarantees, as it may use `sync_file_range`, which does not write out
+  // metadata.
+  //
+  // Default: false
+  bool strict_bytes_per_sync = false;
+
+  // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
+  // means that file size won't change as part of preallocation.
+  // If false, preallocation will also change the file size. This option will
+  // improve the performance in workloads where you sync the data on every
+  // write. By default, we set it to true for MANIFEST writes and false for
+  // WAL writes
+  bool fallocate_with_keep_size = true;
+
+  // See DBOptions doc
+  size_t compaction_readahead_size = 0;
+
+  // See DBOptions doc
+  size_t random_access_max_buffer_size = 0;
+
+  // See DBOptions doc
+  size_t writable_file_max_buffer_size = 1024 * 1024;
+
+  // If not nullptr, write rate limiting is enabled for flush and compaction
+  RateLimiter* rate_limiter = nullptr;
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Env : public Customizable {
+ public:
+  static const char* kDefaultName() { return "DefaultEnv"; }
+  struct FileAttributes {
+    // File name
+    std::string name;
+
+    // Size of file in bytes
+    uint64_t size_bytes;
+  };
+
+  Env();
+  // Construct an Env with a separate FileSystem and/or SystemClock
+  // implementation
+  explicit Env(const std::shared_ptr<FileSystem>& fs);
+  Env(const std::shared_ptr<FileSystem>& fs,
+      const std::shared_ptr<SystemClock>& clock);
+  // No copying allowed
+  Env(const Env&) = delete;
+  void operator=(const Env&) = delete;
+
+  ~Env() override;
+
+  static const char* Type() { return "Environment"; }
+
+  // Deprecated. Will be removed in a major release. Derived classes
+  // should implement this method.
+  const char* Name() const override { return ""; }
+
+  // Loads the environment specified by the input value into the result
+  // The CreateFromString alternative should be used; this method may be
+  // deprecated in a future release.
+  static Status LoadEnv(const std::string& value, Env** result);
+
+  // Loads the environment specified by the input value into the result
+  // The CreateFromString alternative should be used; this method may be
+  // deprecated in a future release.
+  static Status LoadEnv(const std::string& value, Env** result,
+                        std::shared_ptr<Env>* guard);
+
+  // Loads the environment specified by the input value into the result
+  // @see Customizable for a more detailed description of the parameters and
+  // return codes
+  //
+  // @param config_options Controls how the environment is loaded.
+  // @param value the name and associated properties for the environment.
+  // @param result On success, the environment that was loaded.
+  // @param guard If specified and the loaded environment is not static,
+  //      this value will contain the loaded environment (guard.get() ==
+  //      result).
+  // @return OK If the environment was successfully loaded (and optionally
+  // prepared)
+  // @return not-OK if the load failed.
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value, Env** result);
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value, Env** result,
+                                 std::shared_ptr<Env>* guard);
+
+  // Loads the environment specified by the env and fs uri.
+  // If both are specified, an error is returned.
+  // Otherwise, the environment is created by loading (via CreateFromString)
+  // the appropriate env/fs from the corresponding values.
+  static Status CreateFromUri(const ConfigOptions& options,
+                              const std::string& env_uri,
+                              const std::string& fs_uri, Env** result,
+                              std::shared_ptr<Env>* guard);
+
+  // Return a default environment suitable for the current operating
+  // system.  Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to rocksdb and must never be deleted.
+  static Env* Default();
+
+  // See FileSystem::RegisterDbPaths.
+  virtual Status RegisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
+  // See FileSystem::UnregisterDbPaths.
+  virtual Status UnregisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK.  If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   std::unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options) = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     std::unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) = 0;
+  // These values match Linux definition
+  // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+  enum WriteLifeTimeHint {
+    WLTH_NOT_SET = 0,  // No hint information set
+    WLTH_NONE,         // No hints about write life time
+    WLTH_SHORT,        // Data written has a short life time
+    WLTH_MEDIUM,       // Data written has a medium life time
+    WLTH_LONG,         // Data written has a long life time
+    WLTH_EXTREME,      // Data written has an extremely long life time
+  };
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 std::unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that writes to a file with the specified name.
+  // `WritableFile::Append()`s will append after any existing content.  If the
+  // file does not already exist, creates it.
+  //
+  // On success, stores a pointer to the file in *result and returns OK.  On
+  // failure stores nullptr in *result and returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status ReopenWritableFile(const std::string& /*fname*/,
+                                    std::unique_ptr<WritableFile>* /*result*/,
+                                    const EnvOptions& /*options*/) {
+    return Status::NotSupported("Env::ReopenWritableFile() not supported.");
+  }
+
+  // Reuse an existing file by renaming it and opening it as writable.
+  virtual Status ReuseWritableFile(const std::string& fname,
+                                   const std::string& old_fname,
+                                   std::unique_ptr<WritableFile>* result,
+                                   const EnvOptions& options);
+
+  // Open `fname` for random read and write, if file doesn't exist the file
+  // will be created.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewRandomRWFile(const std::string& /*fname*/,
+                                 std::unique_ptr<RandomRWFile>* /*result*/,
+                                 const EnvOptions& /*options*/) {
+    return Status::NotSupported("RandomRWFile is not implemented in this Env");
+  }
+
+  // Opens `fname` as a memory-mapped file for read and write (in-place updates
+  // only, i.e., no appends). On success, stores a raw buffer covering the whole
+  // file in `*result`. The file must exist prior to this call.
+  virtual Status NewMemoryMappedFileBuffer(
+      const std::string& /*fname*/,
+      std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+    return Status::NotSupported(
+        "MemoryMappedFileBuffer is not implemented in this Env");
+  }
+
+  // Create an object that represents a directory. Will fail if directory
+  // doesn't exist. If the directory exists, it will open the directory
+  // and create a new Directory object.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  virtual Status NewDirectory(const std::string& name,
+                              std::unique_ptr<Directory>* result) = 0;
+
+  // Returns OK if the named file exists.
+  //         NotFound if the named file does not exist,
+  //                  the calling process does not have permission to determine
+  //                  whether this file exists, or if the path is invalid.
+  //         IOError if an IO Error was encountered
+  virtual Status FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir", and shall never include the
+  // names `.` or `..`.
+  // Original contents of *results are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not have
+  //                  permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Store in *result the attributes of the children of the specified directory.
+  // In case the implementation lists the directory prior to iterating the files
+  // and files are concurrently deleted, the deleted files will be omitted from
+  // result.
+  // The name attributes are relative to "dir", and shall never include the
+  // names `.` or `..`.
+  // Original contents of *results are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not have
+  //                  permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual Status GetChildrenFileAttributes(const std::string& dir,
+                                           std::vector<FileAttributes>* result);
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Truncate the named file to the specified size.
+  virtual Status Truncate(const std::string& /*fname*/, size_t /*size*/) {
+    return Status::NotSupported("Truncate is not supported for this Env");
+  }
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates directory if missing. Return Ok if it exists, or successful in
+  // Creating.
+  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  // Many implementations of this function will only delete a directory if it is
+  // empty.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) = 0;
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Hard Link file src to target.
+  virtual Status LinkFile(const std::string& /*src*/,
+                          const std::string& /*target*/) {
+    return Status::NotSupported("LinkFile is not supported for this Env");
+  }
+
+  virtual Status NumFileLinks(const std::string& /*fname*/,
+                              uint64_t* /*count*/) {
+    return Status::NotSupported(
+        "Getting number of file links is not supported for this Env");
+  }
+
+  virtual Status AreFilesSame(const std::string& /*first*/,
+                              const std::string& /*second*/, bool* /*res*/) {
+    return Status::NotSupported("AreFilesSame is not supported for this Env");
+  }
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  // Opens `lib_name` as a dynamic library.
+  // If the 'search_path' is specified, breaks the path into its components
+  // based on the appropriate platform separator (";" or ";") and looks for the
+  // library in those directories.  If 'search path is not specified, uses the
+  // default library path search mechanism (such as LD_LIBRARY_PATH). On
+  // success, stores a dynamic library in `*result`.
+  virtual Status LoadLibrary(const std::string& /*lib_name*/,
+                             const std::string& /*search_path */,
+                             std::shared_ptr<DynamicLibrary>* /*result*/) {
+    return Status::NotSupported("LoadLibrary is not implemented in this Env");
+  }
+
+  // Priority for scheduling job in thread pool
+  enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL };
+
+  static std::string PriorityToString(Priority priority);
+
+  // Priority for requesting bytes in rate limiter scheduler
+  enum IOPriority {
+    IO_LOW = 0,
+    IO_MID = 1,
+    IO_HIGH = 2,
+    IO_USER = 3,
+    IO_TOTAL = 4
+  };
+
+  // Arrange to run "(*function)(arg)" once in a background thread, in
+  // the thread pool specified by pri. By default, jobs go to the 'LOW'
+  // priority thread pool.
+
+  // "function" may run in an unspecified thread.  Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  // When the UnSchedule function is called, the unschedFunction
+  // registered at the time of Schedule is invoked with arg as a parameter.
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW, void* tag = nullptr,
+                        void (*unschedFunction)(void* arg) = nullptr) = 0;
+
+  // Arrange to remove jobs for given arg from the queue_ if they are not
+  // already scheduled. Caller is expected to have exclusive lock on arg.
+  virtual int UnSchedule(void* /*arg*/, Priority /*pri*/) { return 0; }
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // Start a new thread, invoking "function(args...)" within the new thread.
+  // When "function(args...)" returns, the thread will be destroyed.
+  template <typename FunctionT, typename... Args>
+  void StartThreadTyped(FunctionT function, Args&&... args) {
+    using FWType = FunctorWrapper<Args...>;
+    StartThread(
+        [](void* arg) {
+          auto* functor = static_cast<FWType*>(arg);
+          functor->invoke();
+          delete functor;
+        },
+        new FWType(std::function<void(Args...)>(function),
+                   std::forward<Args>(args)...));
+  }
+
+  // Wait for all threads started by StartThread to terminate.
+  virtual void WaitForJoin() {}
+
+  // Reserve available background threads in the specified thread pool.
+  virtual int ReserveThreads(int /*threads_to_be_reserved*/, Priority /*pri*/) {
+    return 0;
+  }
+
+  // Release a specific number of reserved threads from the specified thread
+  // pool
+  virtual int ReleaseThreads(int /*threads_to_be_released*/, Priority /*pri*/) {
+    return 0;
+  }
+
+  // Get thread pool queue length for specific thread pool.
+  virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const {
+    return 0;
+  }
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or many not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Create and returns a default logger (an instance of EnvLogger) for storing
+  // informational messages. Derived classes can override to provide custom
+  // logger.
+  virtual Status NewLogger(const std::string& fname,
+                           std::shared_ptr<Logger>* result);
+
+  // Returns the number of micro-seconds since some fixed point in time.
+  // It is often used as system time such as in GenericRateLimiter
+  // and other places so a port needs to return system time in order to work.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros.
+  // In platform-specific implementations, NowNanos() should return time points
+  // that are MONOTONIC.
+  virtual uint64_t NowNanos() { return NowMicros() * 1000; }
+
+  // 0 indicates not supported.
+  virtual uint64_t NowCPUNanos() { return 0; }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name as a null terminated string iff the string
+  // length is < len. The hostname should otherwise be truncated to len.
+  virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+  // Get the current hostname from the given env as a std::string in result.
+  // The result may be truncated if the hostname is too
+  // long
+  virtual Status GetHostNameString(std::string* result);
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  // Only overwrites *unix_time on success.
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Get full directory name for this db.
+  virtual Status GetAbsolutePath(const std::string& db_path,
+                                 std::string* output_path) = 0;
+
+  // The number of background worker threads of a specific thread pool
+  // for this environment. 'LOW' is the default pool.
+  // default number: 1
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+  virtual int GetBackgroundThreads(Priority pri = LOW) = 0;
+
+  virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) {
+    return Status::NotSupported("Env::SetAllowNonOwnerAccess() not supported.");
+  }
+
+  // Enlarge number of background worker threads of a specific thread pool
+  // for this environment if it is smaller than specified. 'LOW' is the default
+  // pool.
+  virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0;
+
+  // Lower IO priority for threads from the specified pool.
+  virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {}
+
+  // Lower CPU priority for threads from the specified pool.
+  virtual Status LowerThreadPoolCPUPriority(Priority /*pool*/,
+                                            CpuPriority /*pri*/) {
+    return Status::NotSupported(
+        "Env::LowerThreadPoolCPUPriority(Priority, CpuPriority) not supported");
+  }
+
+  // Lower CPU priority for threads from the specified pool.
+  virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {}
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+
+  // Generates a human-readable unique ID that can be used to identify a DB.
+  // In built-in implementations, this is an RFC-4122 UUID string, but might
+  // not be in all implementations. Overriding is not recommended.
+  // NOTE: this has not be validated for use in cryptography
+  virtual std::string GenerateUniqueId();
+
+  // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for reading log files.
+  virtual EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const;
+
+  // OptimizeForManifestRead will create a new EnvOptions object that is a copy
+  // of the EnvOptions in the parameters, but is optimized for reading manifest
+  // files.
+  virtual EnvOptions OptimizeForManifestRead(
+      const EnvOptions& env_options) const;
+
+  // OptimizeForLogWrite will create a new EnvOptions object that is a copy of
+  // the EnvOptions in the parameters, but is optimized for writing log files.
+  // Default implementation returns the copy of the same object.
+  virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                         const DBOptions& db_options) const;
+  // OptimizeForManifestWrite will create a new EnvOptions object that is a copy
+  // of the EnvOptions in the parameters, but is optimized for writing manifest
+  // files. Default implementation returns the copy of the same object.
+  virtual EnvOptions OptimizeForManifestWrite(
+      const EnvOptions& env_options) const;
+
+  // OptimizeForCompactionTableWrite will create a new EnvOptions object that is
+  // a copy of the EnvOptions in the parameters, but is optimized for writing
+  // table files.
+  virtual EnvOptions OptimizeForCompactionTableWrite(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& immutable_ops) const;
+
+  // OptimizeForCompactionTableWrite will create a new EnvOptions object that
+  // is a copy of the EnvOptions in the parameters, but is optimized for reading
+  // table files.
+  virtual EnvOptions OptimizeForCompactionTableRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const;
+
+  // OptimizeForBlobFileRead will create a new EnvOptions object that
+  // is a copy of the EnvOptions in the parameters, but is optimized for reading
+  // blob files.
+  virtual EnvOptions OptimizeForBlobFileRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const;
+
+  // Returns the status of all threads that belong to the current Env.
+  virtual Status GetThreadList(std::vector<ThreadStatus>* /*thread_list*/) {
+    return Status::NotSupported("Env::GetThreadList() not supported.");
+  }
+
+  // Returns the pointer to ThreadStatusUpdater.  This function will be
+  // used in RocksDB internally to update thread status and supports
+  // GetThreadList().
+  virtual ThreadStatusUpdater* GetThreadStatusUpdater() const {
+    return thread_status_updater_;
+  }
+
+  // Returns the ID of the current thread.
+  virtual uint64_t GetThreadID() const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#undef GetFreeSpace
+
+  // Get the amount of free disk space
+  virtual Status GetFreeSpace(const std::string& /*path*/,
+                              uint64_t* /*diskfree*/) {
+    return Status::NotSupported("Env::GetFreeSpace() not supported.");
+  }
+
+  // Check whether the specified path is a directory
+  virtual Status IsDirectory(const std::string& /*path*/, bool* /*is_dir*/) {
+    return Status::NotSupported("Env::IsDirectory() not supported.");
+  }
+
+  virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {}
+
+  // Get the FileSystem implementation this Env was constructed with. It
+  // could be a fully implemented one, or a wrapper class around the Env
+  const std::shared_ptr<FileSystem>& GetFileSystem() const;
+
+  // Get the SystemClock implementation this Env was constructed with. It
+  // could be a fully implemented one, or a wrapper class around the Env
+  const std::shared_ptr<SystemClock>& GetSystemClock() const;
+
+  // If you're adding methods here, remember to add them to EnvWrapper too.
+
+ protected:
+  // The pointer to an internal structure that will update the
+  // status of each thread.
+  ThreadStatusUpdater* thread_status_updater_;
+
+  // Pointer to the underlying FileSystem implementation
+  std::shared_ptr<FileSystem> file_system_;
+
+  // Pointer to the underlying SystemClock implementation
+  std::shared_ptr<SystemClock> system_clock_;
+
+ private:
+  static const size_t kMaxHostNameLen = 256;
+};
+
+// The factory function to construct a ThreadStatusUpdater.  Any Env
+// that supports GetThreadList() feature should call this function in its
+// constructor to initialize thread_status_updater_.
+ThreadStatusUpdater* CreateThreadStatusUpdater();
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() {}
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower that reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Skip(uint64_t n) = 0;
+
+  // Indicates the upper layers if the current SequentialFile implementation
+  // uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return Status::NotSupported(
+        "SequentialFile::InvalidateCache not supported.");
+  }
+
+  // Positioned Read for direct I/O
+  // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+  virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/,
+                                Slice* /*result*/, char* /*scratch*/) {
+    return Status::NotSupported(
+        "SequentialFile::PositionedRead() not supported.");
+  }
+
+  // If you're adding methods here, remember to add them to
+  // SequentialFileWrapper too.
+};
+
+// A read IO request structure for use in MultiRead
+struct ReadRequest {
+  // File offset in bytes
+  uint64_t offset;
+
+  // Length to read in bytes. `result` only returns fewer bytes if end of file
+  // is hit (or `status` is not OK).
+  size_t len;
+
+  // A buffer that MultiRead()  can optionally place data in. It can
+  // ignore this and allocate its own buffer
+  char* scratch;
+
+  // Output parameter set by MultiRead() to point to the data buffer, and
+  // the number of valid bytes
+  Slice result;
+
+  // Status of read
+  Status status;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() {}
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Safe for concurrent use by multiple threads.
+  // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  // Readahead the file starting from offset by n bytes for caching.
+  virtual Status Prefetch(uint64_t /*offset*/, size_t /*n*/) {
+    return Status::OK();
+  }
+
+  // Read a bunch of blocks as described by reqs. The blocks can
+  // optionally be read in parallel. This is a synchronous call, i.e it
+  // should return after all reads have completed. The reads will be
+  // non-overlapping. If the function return Status is not ok, status of
+  // individual requests will be ignored and return status will be assumed
+  // for all read requests. The function return status is only meant for
+  // any errors that occur before even processing specific read requests
+  virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) {
+    assert(reqs != nullptr);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      ReadRequest& req = reqs[i];
+      req.status = Read(req.offset, req.len, &req.result, req.scratch);
+    }
+    return Status::OK();
+  }
+
+  // Tries to get an unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  }
+
+  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+  virtual void Hint(AccessPattern /*pattern*/) {}
+
+  // Indicates the upper layers if the current RandomAccessFile implementation
+  // uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return Status::NotSupported(
+        "RandomAccessFile::InvalidateCache not supported.");
+  }
+
+  // If you're adding methods here, remember to add them to
+  // RandomAccessFileWrapper too.
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile()
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(false) {}
+
+  explicit WritableFile(const EnvOptions& options)
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+  // No copying allowed
+  WritableFile(const WritableFile&) = delete;
+  void operator=(const WritableFile&) = delete;
+
+  virtual ~WritableFile();
+
+  // Append data to the end of the file
+  // Note: A WritableFile object must support either Append or
+  // PositionedAppend, so the users cannot mix the two.
+  virtual Status Append(const Slice& data) = 0;
+
+  // Append data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if currently ChecksumType::kCRC32C is not supported by
+  // WritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual Status Append(const Slice& data,
+                        const DataVerificationInfo& /* verification_info */) {
+    return Append(data);
+  }
+
+  // PositionedAppend data to the specified offset. The new EOF after append
+  // must be larger than the previous EOF. This is to be used when writes are
+  // not backed by OS buffers and hence has to always start from the start of
+  // the sector. The implementation thus needs to also rewrite the last
+  // partial sector.
+  // Note: PositionAppend does not guarantee moving the file offset after the
+  // write. A WritableFile object must support either Append or
+  // PositionedAppend, so the users cannot mix the two.
+  //
+  // PositionedAppend() can only happen on the page/sector boundaries. For that
+  // reason, if the last write was an incomplete sector we still need to rewind
+  // back to the nearest sector/page and rewrite the portion of it with whatever
+  // we need to add. We need to keep where we stop writing.
+  //
+  // PositionedAppend() can only write whole sectors. For that reason we have to
+  // pad with zeros for the last write and trim the file when closing according
+  // to the position we keep in the previous step.
+  //
+  // PositionedAppend() requires aligned buffer to be passed in. The alignment
+  // required is queried via GetRequiredBufferAlignment()
+  virtual Status PositionedAppend(const Slice& /* data */,
+                                  uint64_t /* offset */) {
+    return Status::NotSupported(
+        "WritableFile::PositionedAppend() not supported.");
+  }
+
+  // PositionedAppend data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if currently ChecksumType::kCRC32C is not supported by
+  // WritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual Status PositionedAppend(
+      const Slice& /* data */, uint64_t /* offset */,
+      const DataVerificationInfo& /* verification_info */) {
+    return Status::NotSupported("PositionedAppend");
+  }
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole pages writes. The behavior is undefined if called
+  // with other writes to follow.
+  virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); }
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0;  // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() { return Sync(); }
+
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  virtual bool IsSyncThreadSafe() const { return false; }
+
+  // Indicates the upper layers if the current WritableFile implementation
+  // uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  /*
+   * If rate limiting is enabled, change the file-granularity priority used in
+   * rate-limiting writes.
+   *
+   * In the presence of finer-granularity priority such as
+   * `WriteOptions::rate_limiter_priority`, this file-granularity priority may
+   * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used as
+   * a fallback for Env::IO_TOTAL finer-granularity priority.
+   *
+   * If rate limiting is not enabled, this call has no effect.
+   */
+  virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+  virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+  virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+    write_hint_ = hint;
+  }
+
+  virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize() { return 0; }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file.  If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  virtual void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return Status::NotSupported("WritableFile::InvalidateCache not supported.");
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) {
+    if (strict_bytes_per_sync_) {
+      return Sync();
+    }
+    return Status::OK();
+  }
+
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs.  This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  virtual void PrepareWrite(size_t offset, size_t len) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+        (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+          new_last_preallocated_block - last_preallocated_block_;
+      // TODO: Don't ignore errors from allocate
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks)
+          .PermitUncheckedError();
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  // Pre-allocates space for a file.
+  virtual Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) {
+    return Status::OK();
+  }
+
+  // If you're adding methods here, remember to add them to
+  // WritableFileWrapper too.
+
+ protected:
+  size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+
+ protected:
+  Env::IOPriority io_priority_;
+  Env::WriteLifeTimeHint write_hint_;
+  const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+  RandomRWFile() {}
+  // No copying allowed
+  RandomRWFile(const RandomRWFile&) = delete;
+  RandomRWFile& operator=(const RandomRWFile&) = delete;
+
+  virtual ~RandomRWFile() {}
+
+  // Indicates if the class makes use of direct I/O
+  // If false you must pass aligned buffer to Write()
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Write bytes in `data` at  offset `offset`, Returns Status::OK() on success.
+  // Pass aligned buffer when use_direct_io() returns true.
+  virtual Status Write(uint64_t offset, const Slice& data) = 0;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result, provided `scratch` size should be at least `n`.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Returns Status::OK() on success.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  virtual Status Flush() = 0;
+
+  virtual Status Sync() = 0;
+
+  virtual Status Fsync() { return Sync(); }
+
+  virtual Status Close() = 0;
+
+  // If you're adding methods here, remember to add them to
+  // RandomRWFileWrapper too.
+};
+
+// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer.
+// Subclasses should release the mapping upon destruction.
+class MemoryMappedFileBuffer {
+ public:
+  MemoryMappedFileBuffer(void* _base, size_t _length)
+      : base_(_base), length_(_length) {}
+
+  virtual ~MemoryMappedFileBuffer() = 0;
+
+  // We do not want to unmap this twice. We can make this class
+  // movable if desired, however, since
+  MemoryMappedFileBuffer(const MemoryMappedFileBuffer&) = delete;
+  MemoryMappedFileBuffer& operator=(const MemoryMappedFileBuffer&) = delete;
+
+  void* GetBase() const { return base_; }
+  size_t GetLen() const { return length_; }
+
+ protected:
+  void* base_;
+  const size_t length_;
+};
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class Directory {
+ public:
+  virtual ~Directory() {}
+  // Fsync directory. Can be called concurrently from multiple threads.
+  virtual Status Fsync() = 0;
+  // Close directory.
+  virtual Status Close() { return Status::NotSupported("Close"); }
+
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;
+  }
+
+  // If you're adding methods here, remember to add them to
+  // DirectoryWrapper too.
+};
+
+enum InfoLogLevel : unsigned char {
+  DEBUG_LEVEL = 0,
+  INFO_LEVEL,
+  WARN_LEVEL,
+  ERROR_LEVEL,
+  FATAL_LEVEL,
+  HEADER_LEVEL,
+  NUM_INFO_LOG_LEVELS,
+};
+
+// An interface for writing log messages.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Logger {
+ public:
+  size_t kDoNotSupportGetLogFileSize = (std::numeric_limits<size_t>::max)();
+
+  explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+      : closed_(false), log_level_(log_level) {}
+  // No copying allowed
+  Logger(const Logger&) = delete;
+  void operator=(const Logger&) = delete;
+
+  virtual ~Logger();
+
+  // Close the log file. Must be called before destructor. If the return
+  // status is NotSupported(), it means the implementation does cleanup in
+  // the destructor
+  virtual Status Close();
+
+  // Write a header to the log file with the specified format
+  // It is recommended that you log all header information at the start of the
+  // application. But it is not enforced.
+  virtual void LogHeader(const char* format, va_list ap) {
+    // Default implementation does a simple INFO level log write.
+    // Please override as per the logger class requirement.
+    Logv(InfoLogLevel::INFO_LEVEL, format, ap);
+  }
+
+  // Write an entry to the log file with the specified format.
+  //
+  // Users who override the `Logv()` overload taking `InfoLogLevel` do not need
+  // to implement this, unless they explicitly invoke it in
+  // `Logv(InfoLogLevel, ...)`.
+  virtual void Logv(const char* /* format */, va_list /* ap */) {
+    assert(false);
+  }
+
+  // Write an entry to the log file with the specified log level
+  // and format.  Any log with level under the internal log level
+  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+  // printed.
+  virtual void Logv(const InfoLogLevel log_level, const char* format,
+                    va_list ap);
+
+  virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; }
+  // Flush to the OS buffers
+  virtual void Flush() {}
+  virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
+  virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
+    log_level_ = log_level;
+  }
+
+  // If you're adding methods here, remember to add them to LoggerWrapper too.
+
+ protected:
+  virtual Status CloseImpl();
+  bool closed_;
+
+ private:
+  InfoLogLevel log_level_;
+};
+
+// Identifies a locked file. Except in custom Env/Filesystem implementations,
+// the lifetime of a FileLock object should be managed only by LockFile() and
+// UnlockFile().
+class FileLock {
+ public:
+  FileLock() {}
+  virtual ~FileLock();
+
+ private:
+  // No copying allowed
+  FileLock(const FileLock&) = delete;
+  void operator=(const FileLock&) = delete;
+};
+
+class DynamicLibrary {
+ public:
+  virtual ~DynamicLibrary() {}
+
+  // Returns the name of the dynamic library.
+  virtual const char* Name() const = 0;
+
+  // Loads the symbol for sym_name from the library and updates the input
+  // function. Returns the loaded symbol.
+  template <typename T>
+  Status LoadFunction(const std::string& sym_name, std::function<T>* function) {
+    assert(nullptr != function);
+    void* ptr = nullptr;
+    Status s = LoadSymbol(sym_name, &ptr);
+    *function = reinterpret_cast<T*>(ptr);
+    return s;
+  }
+  // Loads and returns the symbol for sym_name from the library.
+  virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0;
+};
+
+extern void LogFlush(const std::shared_ptr<Logger>& info_log);
+
+extern void Log(const InfoLogLevel log_level,
+                const std::shared_ptr<Logger>& info_log, const char* format,
+                ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// a set of log functions with different log levels.
+extern void Header(const std::shared_ptr<Logger>& info_log, const char* format,
+                   ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(const std::shared_ptr<Logger>& info_log, const char* format,
+                  ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(const std::shared_ptr<Logger>& info_log, const char* format,
+                 ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(const std::shared_ptr<Logger>& info_log, const char* format,
+                 ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(const std::shared_ptr<Logger>& info_log, const char* format,
+                  ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(const std::shared_ptr<Logger>& info_log, const char* format,
+                  ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(const std::shared_ptr<Logger>& info_log, const char* format,
+                ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+extern void LogFlush(Logger* info_log);
+
+extern void Log(const InfoLogLevel log_level, Logger* info_log,
+                const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4);
+
+// The default info log level is InfoLogLevel::INFO_LEVEL.
+extern void Log(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// a set of log functions with different log levels.
+extern void Header(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Debug(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Info(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Warn(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Error(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+extern void Fatal(Logger* info_log, const char* format, ...)
+    ROCKSDB_PRINTF_FORMAT_ATTR(2, 3);
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname,
+                                bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// Below are helpers for wrapping most of the classes in this file.
+// They forward all calls to another instance of the class.
+// Useful when wrapping the default implementations.
+// Typical usage is to inherit your wrapper from *Wrapper, e.g.:
+//
+// class MySequentialFileWrapper : public
+// ROCKSDB_NAMESPACE::SequentialFileWrapper {
+//  public:
+//   MySequentialFileWrapper(ROCKSDB_NAMESPACE::SequentialFile* target):
+//     ROCKSDB_NAMESPACE::SequentialFileWrapper(target) {}
+//   Status Read(size_t n, Slice* result, char* scratch) override {
+//     cout << "Doing a read of size " << n << "!" << endl;
+//     return ROCKSDB_NAMESPACE::SequentialFileWrapper::Read(n, result,
+//     scratch);
+//   }
+//   // All other methods are forwarded to target_ automatically.
+// };
+//
+// This is often more convenient than inheriting the class directly because
+// (a) Don't have to override and forward all methods - the Wrapper will
+//     forward everything you're not explicitly overriding.
+// (b) Don't need to update the wrapper when more methods are added to the
+//     rocksdb class. Unless you actually want to override the behavior.
+//     (And unless rocksdb people forgot to update the *Wrapper class.)
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // The Target struct allows an Env to be stored as a raw (Env*) or
+  // std::shared_ptr<Env>.  By using this struct, the wrapping/calling
+  // class does not need to worry about the ownership/lifetime of the
+  // wrapped target env.  If the guard is set, then the Env will point
+  // to the guard.get().
+  struct Target {
+    Env* env;                    // The raw Env
+    std::shared_ptr<Env> guard;  // The guarded Env
+
+    // Creates a Target without assuming ownership of the target Env
+    explicit Target(Env* t) : env(t) {}
+
+    // Creates a Target from the guarded env, assuming ownership
+    explicit Target(std::unique_ptr<Env>&& t) : guard(t.release()) {
+      env = guard.get();
+    }
+
+    // Creates a Target from the guarded env, assuming ownership
+    explicit Target(const std::shared_ptr<Env>& t) : guard(t) {
+      env = guard.get();
+    }
+
+    // Makes sure the raw Env is not nullptr
+    void Prepare() {
+      if (guard.get() != nullptr) {
+        env = guard.get();
+      } else if (env == nullptr) {
+        env = Env::Default();
+      }
+    }
+  };
+
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit EnvWrapper(Env* t);
+  explicit EnvWrapper(std::unique_ptr<Env>&& t);
+  explicit EnvWrapper(const std::shared_ptr<Env>& t);
+  ~EnvWrapper() override;
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_.env; }
+
+  // Deprecated. Will be removed in a major release. Derived classes
+  // should implement this method.
+  const char* Name() const override { return target_.env->Name(); }
+
+  // The following text is boilerplate that forwards all methods to target()
+  Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+    return target_.env->RegisterDbPaths(paths);
+  }
+
+  Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+    return target_.env->UnregisterDbPaths(paths);
+  }
+
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) override {
+    return target_.env->NewSequentialFile(f, r, options);
+  }
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) override {
+    return target_.env->NewRandomAccessFile(f, r, options);
+  }
+  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) override {
+    return target_.env->NewWritableFile(f, r, options);
+  }
+  Status ReopenWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>* result,
+                            const EnvOptions& options) override {
+    return target_.env->ReopenWritableFile(fname, result, options);
+  }
+  Status ReuseWritableFile(const std::string& fname,
+                           const std::string& old_fname,
+                           std::unique_ptr<WritableFile>* r,
+                           const EnvOptions& options) override {
+    return target_.env->ReuseWritableFile(fname, old_fname, r, options);
+  }
+  Status NewRandomRWFile(const std::string& fname,
+                         std::unique_ptr<RandomRWFile>* result,
+                         const EnvOptions& options) override {
+    return target_.env->NewRandomRWFile(fname, result, options);
+  }
+  Status NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    return target_.env->NewMemoryMappedFileBuffer(fname, result);
+  }
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override {
+    return target_.env->NewDirectory(name, result);
+  }
+  Status FileExists(const std::string& f) override {
+    return target_.env->FileExists(f);
+  }
+  Status GetChildren(const std::string& dir,
+                     std::vector<std::string>* r) override {
+    return target_.env->GetChildren(dir, r);
+  }
+  Status GetChildrenFileAttributes(
+      const std::string& dir, std::vector<FileAttributes>* result) override {
+    return target_.env->GetChildrenFileAttributes(dir, result);
+  }
+  Status DeleteFile(const std::string& f) override {
+    return target_.env->DeleteFile(f);
+  }
+  Status Truncate(const std::string& fname, size_t size) override {
+    return target_.env->Truncate(fname, size);
+  }
+  Status CreateDir(const std::string& d) override {
+    return target_.env->CreateDir(d);
+  }
+  Status CreateDirIfMissing(const std::string& d) override {
+    return target_.env->CreateDirIfMissing(d);
+  }
+  Status DeleteDir(const std::string& d) override {
+    return target_.env->DeleteDir(d);
+  }
+  Status GetFileSize(const std::string& f, uint64_t* s) override {
+    return target_.env->GetFileSize(f, s);
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) override {
+    return target_.env->GetFileModificationTime(fname, file_mtime);
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) override {
+    return target_.env->RenameFile(s, t);
+  }
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    return target_.env->LinkFile(s, t);
+  }
+
+  Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+    return target_.env->NumFileLinks(fname, count);
+  }
+
+  Status AreFilesSame(const std::string& first, const std::string& second,
+                      bool* res) override {
+    return target_.env->AreFilesSame(first, second, res);
+  }
+
+  Status LockFile(const std::string& f, FileLock** l) override {
+    return target_.env->LockFile(f, l);
+  }
+
+  Status UnlockFile(FileLock* l) override { return target_.env->UnlockFile(l); }
+
+  Status IsDirectory(const std::string& path, bool* is_dir) override {
+    return target_.env->IsDirectory(path, is_dir);
+  }
+
+  Status LoadLibrary(const std::string& lib_name,
+                     const std::string& search_path,
+                     std::shared_ptr<DynamicLibrary>* result) override {
+    return target_.env->LoadLibrary(lib_name, search_path, result);
+  }
+
+  void Schedule(void (*f)(void* arg), void* a, Priority pri,
+                void* tag = nullptr, void (*u)(void* arg) = nullptr) override {
+    return target_.env->Schedule(f, a, pri, tag, u);
+  }
+
+  int UnSchedule(void* tag, Priority pri) override {
+    return target_.env->UnSchedule(tag, pri);
+  }
+
+  void StartThread(void (*f)(void*), void* a) override {
+    return target_.env->StartThread(f, a);
+  }
+  void WaitForJoin() override { return target_.env->WaitForJoin(); }
+  unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override {
+    return target_.env->GetThreadPoolQueueLen(pri);
+  }
+
+  int ReserveThreads(int threads_to_be_reserved, Priority pri) override {
+    return target_.env->ReserveThreads(threads_to_be_reserved, pri);
+  }
+
+  int ReleaseThreads(int threads_to_be_released, Priority pri) override {
+    return target_.env->ReleaseThreads(threads_to_be_released, pri);
+  }
+
+  Status GetTestDirectory(std::string* path) override {
+    return target_.env->GetTestDirectory(path);
+  }
+  Status NewLogger(const std::string& fname,
+                   std::shared_ptr<Logger>* result) override {
+    return target_.env->NewLogger(fname, result);
+  }
+  uint64_t NowMicros() override { return target_.env->NowMicros(); }
+  uint64_t NowNanos() override { return target_.env->NowNanos(); }
+  uint64_t NowCPUNanos() override { return target_.env->NowCPUNanos(); }
+
+  void SleepForMicroseconds(int micros) override {
+    target_.env->SleepForMicroseconds(micros);
+  }
+  Status GetHostName(char* name, uint64_t len) override {
+    return target_.env->GetHostName(name, len);
+  }
+  Status GetCurrentTime(int64_t* unix_time) override {
+    return target_.env->GetCurrentTime(unix_time);
+  }
+  Status GetAbsolutePath(const std::string& db_path,
+                         std::string* output_path) override {
+    return target_.env->GetAbsolutePath(db_path, output_path);
+  }
+  void SetBackgroundThreads(int num, Priority pri) override {
+    return target_.env->SetBackgroundThreads(num, pri);
+  }
+  int GetBackgroundThreads(Priority pri) override {
+    return target_.env->GetBackgroundThreads(pri);
+  }
+
+  Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
+    return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access);
+  }
+
+  void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+    return target_.env->IncBackgroundThreadsIfNeeded(num, pri);
+  }
+
+  void LowerThreadPoolIOPriority(Priority pool) override {
+    target_.env->LowerThreadPoolIOPriority(pool);
+  }
+
+  void LowerThreadPoolCPUPriority(Priority pool) override {
+    target_.env->LowerThreadPoolCPUPriority(pool);
+  }
+
+  Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
+    return target_.env->LowerThreadPoolCPUPriority(pool, pri);
+  }
+
+  std::string TimeToString(uint64_t time) override {
+    return target_.env->TimeToString(time);
+  }
+
+  Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
+    return target_.env->GetThreadList(thread_list);
+  }
+
+  ThreadStatusUpdater* GetThreadStatusUpdater() const override {
+    return target_.env->GetThreadStatusUpdater();
+  }
+
+  uint64_t GetThreadID() const override { return target_.env->GetThreadID(); }
+
+  std::string GenerateUniqueId() override {
+    return target_.env->GenerateUniqueId();
+  }
+
+  EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override {
+    return target_.env->OptimizeForLogRead(env_options);
+  }
+  EnvOptions OptimizeForManifestRead(
+      const EnvOptions& env_options) const override {
+    return target_.env->OptimizeForManifestRead(env_options);
+  }
+  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                 const DBOptions& db_options) const override {
+    return target_.env->OptimizeForLogWrite(env_options, db_options);
+  }
+  EnvOptions OptimizeForManifestWrite(
+      const EnvOptions& env_options) const override {
+    return target_.env->OptimizeForManifestWrite(env_options);
+  }
+  EnvOptions OptimizeForCompactionTableWrite(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& immutable_ops) const override {
+    return target_.env->OptimizeForCompactionTableWrite(env_options,
+                                                        immutable_ops);
+  }
+  EnvOptions OptimizeForCompactionTableRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_.env->OptimizeForCompactionTableRead(env_options, db_options);
+  }
+  EnvOptions OptimizeForBlobFileRead(
+      const EnvOptions& env_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_.env->OptimizeForBlobFileRead(env_options, db_options);
+  }
+  Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override {
+    return target_.env->GetFreeSpace(path, diskfree);
+  }
+  void SanitizeEnvOptions(EnvOptions* env_opts) const override {
+    target_.env->SanitizeEnvOptions(env_opts);
+  }
+  Status PrepareOptions(const ConfigOptions& options) override;
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& config_options,
+                               const std::string& header) const override;
+#endif  // ROCKSDB_LITE
+
+ private:
+  Target target_;
+};
+
+class SequentialFileWrapper : public SequentialFile {
+ public:
+  explicit SequentialFileWrapper(SequentialFile* target) : target_(target) {}
+
+  Status Read(size_t n, Slice* result, char* scratch) override {
+    return target_->Read(n, result, scratch);
+  }
+  Status Skip(uint64_t n) override { return target_->Skip(n); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+  Status PositionedRead(uint64_t offset, size_t n, Slice* result,
+                        char* scratch) override {
+    return target_->PositionedRead(offset, n, result, scratch);
+  }
+
+ private:
+  SequentialFile* target_;
+};
+
+class RandomAccessFileWrapper : public RandomAccessFile {
+ public:
+  explicit RandomAccessFileWrapper(RandomAccessFile* target)
+      : target_(target) {}
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override {
+    return target_->Read(offset, n, result, scratch);
+  }
+  Status MultiRead(ReadRequest* reqs, size_t num_reqs) override {
+    return target_->MultiRead(reqs, num_reqs);
+  }
+  Status Prefetch(uint64_t offset, size_t n) override {
+    return target_->Prefetch(offset, n);
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+  void Hint(AccessPattern pattern) override { target_->Hint(pattern); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+
+ private:
+  RandomAccessFile* target_;
+};
+
+class WritableFileWrapper : public WritableFile {
+ public:
+  explicit WritableFileWrapper(WritableFile* t) : target_(t) {}
+
+  Status Append(const Slice& data) override { return target_->Append(data); }
+  Status Append(const Slice& data,
+                const DataVerificationInfo& verification_info) override {
+    return target_->Append(data, verification_info);
+  }
+  Status PositionedAppend(const Slice& data, uint64_t offset) override {
+    return target_->PositionedAppend(data, offset);
+  }
+  Status PositionedAppend(
+      const Slice& data, uint64_t offset,
+      const DataVerificationInfo& verification_info) override {
+    return target_->PositionedAppend(data, offset, verification_info);
+  }
+  Status Truncate(uint64_t size) override { return target_->Truncate(size); }
+  Status Close() override { return target_->Close(); }
+  Status Flush() override { return target_->Flush(); }
+  Status Sync() override { return target_->Sync(); }
+  Status Fsync() override { return target_->Fsync(); }
+  bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+
+  void SetIOPriority(Env::IOPriority pri) override {
+    target_->SetIOPriority(pri);
+  }
+
+  Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); }
+
+  void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+    target_->SetWriteLifeTimeHint(hint);
+  }
+
+  Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+    return target_->GetWriteLifeTimeHint();
+  }
+
+  uint64_t GetFileSize() override { return target_->GetFileSize(); }
+
+  void SetPreallocationBlockSize(size_t size) override {
+    target_->SetPreallocationBlockSize(size);
+  }
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    target_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+
+  Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+    return target_->RangeSync(offset, nbytes);
+  }
+
+  void PrepareWrite(size_t offset, size_t len) override {
+    target_->PrepareWrite(offset, len);
+  }
+
+  Status Allocate(uint64_t offset, uint64_t len) override {
+    return target_->Allocate(offset, len);
+  }
+
+ private:
+  WritableFile* target_;
+};
+
+class RandomRWFileWrapper : public RandomRWFile {
+ public:
+  explicit RandomRWFileWrapper(RandomRWFile* target) : target_(target) {}
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  Status Write(uint64_t offset, const Slice& data) override {
+    return target_->Write(offset, data);
+  }
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override {
+    return target_->Read(offset, n, result, scratch);
+  }
+  Status Flush() override { return target_->Flush(); }
+  Status Sync() override { return target_->Sync(); }
+  Status Fsync() override { return target_->Fsync(); }
+  Status Close() override { return target_->Close(); }
+
+ private:
+  RandomRWFile* target_;
+};
+
+class DirectoryWrapper : public Directory {
+ public:
+  explicit DirectoryWrapper(Directory* target) : target_(target) {}
+
+  Status Fsync() override { return target_->Fsync(); }
+  Status Close() override { return target_->Close(); }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+ private:
+  Directory* target_;
+};
+
+class LoggerWrapper : public Logger {
+ public:
+  explicit LoggerWrapper(Logger* target) : target_(target) {}
+
+  Status Close() override { return target_->Close(); }
+  void LogHeader(const char* format, va_list ap) override {
+    return target_->LogHeader(format, ap);
+  }
+  void Logv(const char* format, va_list ap) override {
+    return target_->Logv(format, ap);
+  }
+  void Logv(const InfoLogLevel log_level, const char* format,
+            va_list ap) override {
+    return target_->Logv(log_level, format, ap);
+  }
+  size_t GetLogFileSize() const override { return target_->GetLogFileSize(); }
+  void Flush() override { return target_->Flush(); }
+  InfoLogLevel GetInfoLogLevel() const override {
+    return target_->GetInfoLogLevel();
+  }
+  void SetInfoLogLevel(const InfoLogLevel log_level) override {
+    return target_->SetInfoLogLevel(log_level);
+  }
+
+ private:
+  Logger* target_;
+};
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
+
+// Returns a new environment that measures function call times for filesystem
+// operations, reporting results to variables in PerfContext.
+// This is a factory method for TimedEnv defined in utilities/env_timed.cc.
+Env* NewTimedEnv(Env* base_env);
+
+// Returns an instance of logger that can be used for storing informational
+// messages.
+// This is a factory method for EnvLogger declared in logging/env_logging.h
+Status NewEnvLogger(const std::string& fname, Env* env,
+                    std::shared_ptr<Logger>* result);
+
+// Creates a new Env based on Env::Default() but modified to use the specified
+// FileSystem.
+std::unique_ptr<Env> NewCompositeEnv(const std::shared_ptr<FileSystem>& fs);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/env_encryption.h b/src/rocksdb/include/rocksdb/env_encryption.h
new file mode 100644
index 000000000..282db6ed4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/env_encryption.h
@@ -0,0 +1,465 @@
+//  Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#if !defined(ROCKSDB_LITE)
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EncryptionProvider;
+
+struct ConfigOptions;
+
+// Returns an Env that encrypts data when stored on disk and decrypts data when
+// read from disk.
+Env* NewEncryptedEnv(Env* base_env,
+                     const std::shared_ptr<EncryptionProvider>& provider);
+std::shared_ptr<FileSystem> NewEncryptedFS(
+    const std::shared_ptr<FileSystem>& base_fs,
+    const std::shared_ptr<EncryptionProvider>& provider);
+
+// BlockAccessCipherStream is the base class for any cipher stream that
+// supports random access at block level (without requiring data from other
+// blocks). E.g. CTR (Counter operation mode) supports this requirement.
+class BlockAccessCipherStream {
+ public:
+  virtual ~BlockAccessCipherStream(){};
+
+  // BlockSize returns the size of each block supported by this cipher stream.
+  virtual size_t BlockSize() = 0;
+
+  // Encrypt one or more (partial) blocks of data at the file offset.
+  // Length of data is given in dataSize.
+  virtual Status Encrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+  // Decrypt one or more (partial) blocks of data at the file offset.
+  // Length of data is given in dataSize.
+  virtual Status Decrypt(uint64_t fileOffset, char* data, size_t dataSize);
+
+ protected:
+  // Allocate scratch space which is passed to EncryptBlock/DecryptBlock.
+  virtual void AllocateScratch(std::string&) = 0;
+
+  // Encrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  virtual Status EncryptBlock(uint64_t blockIndex, char* data,
+                              char* scratch) = 0;
+
+  // Decrypt a block of data at the given block index.
+  // Length of data is equal to BlockSize();
+  virtual Status DecryptBlock(uint64_t blockIndex, char* data,
+                              char* scratch) = 0;
+};
+
+// BlockCipher
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class BlockCipher : public Customizable {
+ public:
+  virtual ~BlockCipher(){};
+
+  // Creates a new BlockCipher from the input config_options and value
+  // The value describes the type of provider (and potentially optional
+  // configuration parameters) used to create this provider.
+  // For example, if the value is "ROT13", a ROT13BlockCipher is created.
+  //
+  // @param config_options  Options to control how this cipher is created
+  //                        and initialized.
+  // @param value  The value might be:
+  //   - ROT13         Create a ROT13 Cipher
+  //   - ROT13:nn      Create a ROT13 Cipher with block size of nn
+  // @param result The new cipher object
+  // @return OK if the cipher was successfully created
+  // @return NotFound if an invalid name was specified in the value
+  // @return InvalidArgument if either the options were not valid
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<BlockCipher>* result);
+
+  static const char* Type() { return "BlockCipher"; }
+  // Short-cut method to create a ROT13 BlockCipher.
+  // This cipher is only suitable for test purposes and should not be used in
+  // production!!!
+  static std::shared_ptr<BlockCipher> NewROT13Cipher(size_t block_size);
+
+  // BlockSize returns the size of each block supported by this cipher stream.
+  virtual size_t BlockSize() = 0;
+
+  // Encrypt a block of data.
+  // Length of data is equal to BlockSize().
+  virtual Status Encrypt(char* data) = 0;
+
+  // Decrypt a block of data.
+  // Length of data is equal to BlockSize().
+  virtual Status Decrypt(char* data) = 0;
+};
+
+// The encryption provider is used to create a cipher stream for a specific
+// file. The returned cipher stream will be used for actual
+// encryption/decryption actions.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class EncryptionProvider : public Customizable {
+ public:
+  virtual ~EncryptionProvider(){};
+
+  // Creates a new EncryptionProvider from the input config_options and value
+  // The value describes the type of provider (and potentially optional
+  // configuration parameters) used to create this provider.
+  // For example, if the value is "CTR", a CTREncryptionProvider will be
+  // created. If the value is ends with "://test" (e.g CTR://test"), the
+  // provider will be initialized in "TEST" mode prior to being returned.
+  //
+  // @param config_options  Options to control how this provider is created
+  //                        and initialized.
+  // @param value  The value might be:
+  //   - CTR         Create a CTR provider
+  //   - CTR://test Create a CTR provider and initialize it for tests.
+  // @param result The new provider object
+  // @return OK if the provider was successfully created
+  // @return NotFound if an invalid name was specified in the value
+  // @return InvalidArgument if either the options were not valid
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<EncryptionProvider>* result);
+
+  static const char* Type() { return "EncryptionProvider"; }
+
+  // Short-cut method to create a CTR-provider
+  static std::shared_ptr<EncryptionProvider> NewCTRProvider(
+      const std::shared_ptr<BlockCipher>& cipher);
+
+  // GetPrefixLength returns the length of the prefix that is added to every
+  // file and used for storing encryption options. For optimal performance, the
+  // prefix length should be a multiple of the page size.
+  virtual size_t GetPrefixLength() const = 0;
+
+  // CreateNewPrefix initialized an allocated block of prefix memory
+  // for a new file.
+  virtual Status CreateNewPrefix(const std::string& fname, char* prefix,
+                                 size_t prefixLength) const = 0;
+
+  // Method to add a new cipher key for use by the EncryptionProvider.
+  // @param description  Descriptor for this key.
+  // @param cipher       The cryptographic key to use
+  // @param len          The length of the cipher key
+  // @param for_write If true, this cipher should be used for writing files.
+  //                  If false, this cipher should only be used for reading
+  //                  files
+  // @return OK if the cipher was successfully added to the provider, non-OK
+  // otherwise
+  virtual Status AddCipher(const std::string& descriptor, const char* cipher,
+                           size_t len, bool for_write) = 0;
+
+  // CreateCipherStream creates a block access cipher stream for a file given
+  // given name and options.
+  virtual Status CreateCipherStream(
+      const std::string& fname, const EnvOptions& options, Slice& prefix,
+      std::unique_ptr<BlockAccessCipherStream>* result) = 0;
+
+  // Returns a string representing an encryption marker prefix for this
+  // provider. If a marker is provided, this marker can be used to tell whether
+  // or not a file is encrypted by this provider.  The maker will also be part
+  // of any encryption prefix for this provider.
+  virtual std::string GetMarker() const { return ""; }
+};
+
+class EncryptedSequentialFile : public FSSequentialFile {
+ protected:
+  std::unique_ptr<FSSequentialFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  uint64_t offset_;
+  size_t prefixLength_;
+
+ public:
+  // Default ctor. Given underlying sequential file is supposed to be at
+  // offset == prefixLength.
+  EncryptedSequentialFile(std::unique_ptr<FSSequentialFile>&& f,
+                          std::unique_ptr<BlockAccessCipherStream>&& s,
+                          size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        offset_(prefixLength),
+        prefixLength_(prefixLength) {}
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower that reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  IOStatus Skip(uint64_t n) override;
+
+  // Indicates the upper layers if the current SequentialFile implementation
+  // uses direct IO.
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  // Positioned Read for direct I/O
+  // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class EncryptedRandomAccessFile : public FSRandomAccessFile {
+ protected:
+  std::unique_ptr<FSRandomAccessFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  size_t prefixLength_;
+
+ public:
+  EncryptedRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& f,
+                            std::unique_ptr<BlockAccessCipherStream>&& s,
+                            size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        prefixLength_(prefixLength) {}
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  // Readahead the file starting from offset by n bytes for caching.
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  // Tries to get an unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  size_t GetUniqueId(char* id, size_t max_size) const override;
+
+  void Hint(AccessPattern pattern) override;
+
+  // Indicates the upper layers if the current RandomAccessFile implementation
+  // uses direct IO.
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class EncryptedWritableFile : public FSWritableFile {
+ protected:
+  std::unique_ptr<FSWritableFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  size_t prefixLength_;
+
+ public:
+  // Default ctor. Prefix is assumed to be written already.
+  EncryptedWritableFile(std::unique_ptr<FSWritableFile>&& f,
+                        std::unique_ptr<BlockAccessCipherStream>&& s,
+                        size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        prefixLength_(prefixLength) {}
+
+  using FSWritableFile::Append;
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override;
+
+  using FSWritableFile::PositionedAppend;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override;
+
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  bool IsSyncThreadSafe() const override;
+
+  // Indicates the upper layers if the current WritableFile implementation
+  // uses direct IO.
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  /*
+   * Get the size of valid data in the file.
+   */
+  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole pages writes. The behavior is undefined if called
+  // with other writes to follow.
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs.  This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  void SetPreallocationBlockSize(size_t size) override;
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override;
+
+  // Pre-allocates space for a file.
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+// A file abstraction for random reading and writing.
+class EncryptedRandomRWFile : public FSRandomRWFile {
+ protected:
+  std::unique_ptr<FSRandomRWFile> file_;
+  std::unique_ptr<BlockAccessCipherStream> stream_;
+  size_t prefixLength_;
+
+ public:
+  EncryptedRandomRWFile(std::unique_ptr<FSRandomRWFile>&& f,
+                        std::unique_ptr<BlockAccessCipherStream>&& s,
+                        size_t prefixLength)
+      : file_(std::move(f)),
+        stream_(std::move(s)),
+        prefixLength_(prefixLength) {}
+
+  // Indicates if the class makes use of direct I/O
+  // If false you must pass aligned buffer to Write()
+  bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  size_t GetRequiredBufferAlignment() const override;
+
+  // Write bytes in `data` at  offset `offset`, Returns Status::OK() on success.
+  // Pass aligned buffer when use_direct_io() returns true.
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result, provided `scratch` size should be at least `n`.
+  // Returns Status::OK() on success.
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+class EncryptedFileSystem : public FileSystemWrapper {
+ public:
+  explicit EncryptedFileSystem(const std::shared_ptr<FileSystem>& base)
+      : FileSystemWrapper(base) {}
+  // Method to add a new cipher key for use by the EncryptionProvider.
+  // @param description  Descriptor for this key.
+  // @param cipher       The cryptographic key to use
+  // @param len          The length of the cipher key
+  // @param for_write If true, this cipher should be used for writing files.
+  //                  If false, this cipher should only be used for reading
+  //                  files
+  // @return OK if the cipher was successfully added to the provider, non-OK
+  // otherwise
+  virtual Status AddCipher(const std::string& descriptor, const char* cipher,
+                           size_t len, bool for_write) = 0;
+  static const char* kClassName() { return "EncryptedFileSystem"; }
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == kClassName()) {
+      return true;
+    } else {
+      return FileSystemWrapper::IsInstanceOf(name);
+    }
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/include/rocksdb/experimental.h b/src/rocksdb/include/rocksdb/experimental.h
new file mode 100644
index 000000000..b59395255
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/experimental.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+// Supported only for Leveled compaction
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+                           const Slice* begin, const Slice* end);
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end);
+
+// Move all L0 files to target_level skipping compaction.
+// This operation succeeds only if the files in L0 have disjoint ranges; this
+// is guaranteed to happen, for instance, if keys are inserted in sorted
+// order. Furthermore, all levels between 1 and target_level must be empty.
+// If any of the above condition is violated, InvalidArgument will be
+// returned.
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family,
+                 int target_level = 1);
+
+struct UpdateManifestForFilesStateOptions {
+  // When true, read current file temperatures from FileSystem and update in
+  // DB manifest when a temperature other than Unknown is reported and
+  // inconsistent with manifest.
+  bool update_temperatures = true;
+
+  // TODO: new_checksums: to update files to latest file checksum algorithm
+};
+
+// Utility for updating manifest of DB directory (not open) for current state
+// of files on filesystem. See UpdateManifestForFilesStateOptions.
+//
+// To minimize interference with ongoing DB operations, only the following
+// guarantee is provided, assuming no IO error encountered:
+// * Only files live in DB at start AND end of call to
+// UpdateManifestForFilesState() are guaranteed to be updated (as needed) in
+// manifest.
+//   * For example, new files after start of call to
+//   UpdateManifestForFilesState() might not be updated, but that is not
+//   typically required to achieve goal of manifest consistency/completeness
+//   (because current DB configuration would ensure new files get the desired
+//   consistent metadata).
+Status UpdateManifestForFilesState(
+    const DBOptions& db_opts, const std::string& db_name,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    const UpdateManifestForFilesStateOptions& opts = {});
+
+}  // namespace experimental
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/file_checksum.h b/src/rocksdb/include/rocksdb/file_checksum.h
new file mode 100644
index 000000000..758bae4ac
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/file_checksum.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The unknown file checksum.
+constexpr char kUnknownFileChecksum[] = "";
+// The unknown sst file checksum function name.
+constexpr char kUnknownFileChecksumFuncName[] = "Unknown";
+// The standard DB file checksum function name.
+// This is the name of the checksum function returned by
+// GetFileChecksumGenCrc32cFactory();
+constexpr char kStandardDbFileChecksumFuncName[] = "FileChecksumCrc32c";
+
+struct FileChecksumGenContext {
+  std::string file_name;
+  // The name of the requested checksum generator.
+  // Checksum factories may use or ignore requested_checksum_func_name,
+  // and checksum factories written before this field was available are still
+  // compatible.
+  std::string requested_checksum_func_name;
+};
+
+// FileChecksumGenerator is the class to generates the checksum value
+// for each file when the file is written to the file system.
+// Implementations may assume that
+// * Finalize is called at most once during the life of the object
+// * All calls to Update come before Finalize
+// * All calls to GetChecksum come after Finalize
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumGenerator {
+ public:
+  virtual ~FileChecksumGenerator() {}
+
+  // Update the current result after process the data. For different checksum
+  // functions, the temporal results may be stored and used in Update to
+  // include the new data.
+  virtual void Update(const char* data, size_t n) = 0;
+
+  // Generate the final results if no further new data will be updated.
+  virtual void Finalize() = 0;
+
+  // Get the checksum. The result should not be the empty string and may
+  // include arbitrary bytes, including non-printable characters.
+  virtual std::string GetChecksum() const = 0;
+
+  // Returns a name that identifies the current file checksum function.
+  virtual const char* Name() const = 0;
+};
+
+// Create the FileChecksumGenerator object for each SST file.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumGenFactory : public Customizable {
+ public:
+  ~FileChecksumGenFactory() override {}
+  static const char* Type() { return "FileChecksumGenFactory"; }
+  static Status CreateFromString(
+      const ConfigOptions& options, const std::string& value,
+      std::shared_ptr<FileChecksumGenFactory>* result);
+
+  // Create a new FileChecksumGenerator.
+  virtual std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+      const FileChecksumGenContext& context) = 0;
+
+  // Return the name of this FileChecksumGenFactory.
+  const char* Name() const override = 0;
+};
+
+// FileChecksumList stores the checksum information of a list of files (e.g.,
+// SST files). The FileChecksumList can be used to store the checksum
+// information of all SST file getting  from the MANIFEST, which are
+// the checksum information of all valid SST file of a DB instance. It can
+// also be used to store the checksum information of a list of SST files to
+// be ingested.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileChecksumList {
+ public:
+  virtual ~FileChecksumList() {}
+
+  // Clean the previously stored file checksum information.
+  virtual void reset() = 0;
+
+  // Get the number of checksums in the checksum list
+  virtual size_t size() const = 0;
+
+  // Return all the file checksum information being stored in a unordered_map.
+  // File_number is the key, the first part of the value is checksum value,
+  // and the second part of the value is checksum function name.
+  virtual Status GetAllFileChecksums(
+      std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums,
+      std::vector<std::string>* checksum_func_names) = 0;
+
+  // Given the file_number, it searches if the file checksum information is
+  // stored.
+  virtual Status SearchOneFileChecksum(uint64_t file_number,
+                                       std::string* checksum,
+                                       std::string* checksum_func_name) = 0;
+
+  // Insert the checksum information of one file to the FileChecksumList.
+  virtual Status InsertOneFileChecksum(
+      uint64_t file_number, const std::string& checksum,
+      const std::string& checksum_func_name) = 0;
+
+  // Remove the checksum information of one SST file.
+  virtual Status RemoveOneFileChecksum(uint64_t file_number) = 0;
+};
+
+// Create a new file checksum list.
+extern FileChecksumList* NewFileChecksumList();
+
+// Return a shared_ptr of the builtin Crc32c based file checksum generator
+// factory object, which can be shared to create the Crc32c based checksum
+// generator object.
+// Note: this implementation is compatible with many other crc32c checksum
+// implementations and uses big-endian encoding of the result, unlike most
+// other crc32c checksums in RocksDB, which alter the result with
+// crc32c::Mask and use little-endian encoding.
+extern std::shared_ptr<FileChecksumGenFactory>
+GetFileChecksumGenCrc32cFactory();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/file_system.h b/src/rocksdb/include/rocksdb/file_system.h
new file mode 100644
index 000000000..91ad47218
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/file_system.h
@@ -0,0 +1,1849 @@
+// Copyright (c) 2019-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// A FileSystem is an interface used by the rocksdb implementation to access
+// storage functionality like the filesystem etc.  Callers
+// may wish to provide a custom FileSystem object when opening a database to
+// get fine gain control; e.g., to rate limit file system operations.
+//
+// All FileSystem implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+//
+// WARNING: Since this is a new interface, it is expected that there will be
+// some changes as storage systems are ported over.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <chrono>
+#include <cstdarg>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "rocksdb/thread_status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileLock;
+class FSDirectory;
+class FSRandomAccessFile;
+class FSRandomRWFile;
+class FSSequentialFile;
+class FSWritableFile;
+class Logger;
+class Slice;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+class RateLimiter;
+struct ConfigOptions;
+
+using AccessPattern = RandomAccessFile::AccessPattern;
+using FileAttributes = Env::FileAttributes;
+
+// DEPRECATED
+// Priority of an IO request. This is a hint and does not guarantee any
+// particular QoS.
+// IO_LOW - Typically background reads/writes such as compaction/flush
+// IO_HIGH - Typically user reads/synchronous WAL writes
+enum class IOPriority : uint8_t {
+  kIOLow,
+  kIOHigh,
+  kIOTotal,
+};
+
+// Type of the data begin read/written. It can be passed down as a flag
+// for the FileSystem implementation to optionally handle different types in
+// different ways
+enum class IOType : uint8_t {
+  kData,
+  kFilter,
+  kIndex,
+  kMetadata,
+  kWAL,
+  kManifest,
+  kLog,
+  kUnknown,
+  kInvalid,
+};
+
+// Per-request options that can be passed down to the FileSystem
+// implementation. These are hints and are not necessarily guaranteed to be
+// honored. More hints can be added here in the future to indicate things like
+// storage media (HDD/SSD) to be used, replication level etc.
+struct IOOptions {
+  // Timeout for the operation in microseconds
+  std::chrono::microseconds timeout;
+
+  // DEPRECATED
+  // Priority - high or low
+  IOPriority prio;
+
+  // Priority used to charge rate limiter configured in file system level (if
+  // any)
+  // Limitation: right now RocksDB internal does not consider this
+  // rate_limiter_priority
+  Env::IOPriority rate_limiter_priority;
+
+  // Type of data being read/written
+  IOType type;
+
+  // EXPERIMENTAL
+  // An option map that's opaque to RocksDB. It can be used to implement a
+  // custom contract between a FileSystem user and the provider. This is only
+  // useful in cases where a RocksDB user directly uses the FileSystem or file
+  // object for their own purposes, and wants to pass extra options to APIs
+  // such as NewRandomAccessFile and NewWritableFile.
+  std::unordered_map<std::string, std::string> property_bag;
+
+  // Force directory fsync, some file systems like btrfs may skip directory
+  // fsync, set this to force the fsync
+  bool force_dir_fsync;
+
+  // Can be used by underlying file systems to skip recursing through sub
+  // directories and list only files in GetChildren API.
+  bool do_not_recurse;
+
+  IOOptions() : IOOptions(false) {}
+
+  explicit IOOptions(bool force_dir_fsync_)
+      : timeout(std::chrono::microseconds::zero()),
+        prio(IOPriority::kIOLow),
+        rate_limiter_priority(Env::IO_TOTAL),
+        type(IOType::kUnknown),
+        force_dir_fsync(force_dir_fsync_),
+        do_not_recurse(false) {}
+};
+
+struct DirFsyncOptions {
+  enum FsyncReason : uint8_t {
+    kNewFileSynced,
+    kFileRenamed,
+    kDirRenamed,
+    kFileDeleted,
+    kDefault,
+  } reason;
+
+  std::string renamed_new_name;  // for kFileRenamed
+  // add other options for other FsyncReason
+
+  DirFsyncOptions();
+
+  explicit DirFsyncOptions(std::string file_renamed_new_name);
+
+  explicit DirFsyncOptions(FsyncReason fsync_reason);
+};
+
+// File scope options that control how a file is opened/created and accessed
+// while its open. We may add more options here in the future such as
+// redundancy level, media to use etc.
+struct FileOptions : EnvOptions {
+  // Embedded IOOptions to control the parameters for any IOs that need
+  // to be issued for the file open/creation
+  IOOptions io_options;
+
+  // EXPERIMENTAL
+  // The feature is in development and is subject to change.
+  // When creating a new file, set the temperature of the file so that
+  // underlying file systems can put it with appropriate storage media and/or
+  // coding.
+  Temperature temperature = Temperature::kUnknown;
+
+  // The checksum type that is used to calculate the checksum value for
+  // handoff during file writes.
+  ChecksumType handoff_checksum_type;
+
+  FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+  FileOptions(const DBOptions& opts)
+      : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+  FileOptions(const EnvOptions& opts)
+      : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
+
+  FileOptions(const FileOptions& opts)
+      : EnvOptions(opts),
+        io_options(opts.io_options),
+        temperature(opts.temperature),
+        handoff_checksum_type(opts.handoff_checksum_type) {}
+
+  FileOptions& operator=(const FileOptions&) = default;
+};
+
+// A structure to pass back some debugging information from the FileSystem
+// implementation to RocksDB in case of an IO error
+struct IODebugContext {
+  // file_path to be filled in by RocksDB in case of an error
+  std::string file_path;
+
+  // A map of counter names to values - set by the FileSystem implementation
+  std::map<std::string, uint64_t> counters;
+
+  // To be set by the FileSystem implementation
+  std::string msg;
+
+  // To be set by the underlying FileSystem implementation.
+  std::string request_id;
+
+  // In order to log required information in IO tracing for different
+  // operations, Each bit in trace_data stores which corresponding info from
+  // IODebugContext will be added in the trace. Foreg, if trace_data = 1, it
+  // means bit at position 0 is set so TraceData::kRequestID (request_id) will
+  // be logged in the trace record.
+  //
+  enum TraceData : char {
+    // The value of each enum represents the bitwise position for
+    // that information in trace_data which will be used by IOTracer for
+    // tracing. Make sure to add them sequentially.
+    kRequestID = 0,
+  };
+  uint64_t trace_data = 0;
+
+  IODebugContext() {}
+
+  void AddCounter(std::string& name, uint64_t value) {
+    counters.emplace(name, value);
+  }
+
+  // Called by underlying file system to set request_id and log request_id in
+  // IOTracing.
+  void SetRequestId(const std::string& _request_id) {
+    request_id = _request_id;
+    trace_data |= (1 << TraceData::kRequestID);
+  }
+
+  std::string ToString() {
+    std::ostringstream ss;
+    ss << file_path << ", ";
+    for (auto counter : counters) {
+      ss << counter.first << " = " << counter.second << ",";
+    }
+    ss << msg;
+    return ss.str();
+  }
+};
+
+// A function pointer type for custom destruction of void pointer passed to
+// ReadAsync API. RocksDB/caller is responsible for deleting the void pointer
+// allocated by FS in ReadAsync API.
+using IOHandleDeleter = std::function<void(void*)>;
+
+// The FileSystem, FSSequentialFile, FSRandomAccessFile, FSWritableFile,
+// FSRandomRWFileclass, and FSDIrectory classes define the interface between
+// RocksDB and storage systems, such as Posix filesystems,
+// remote filesystems etc.
+// The interface allows for fine grained control of individual IO operations,
+// such as setting a timeout, prioritization, hints on data placement,
+// different handling based on type of IO etc.
+// This is accomplished by passing an instance of IOOptions to every
+// API call that can potentially perform IO. Additionally, each such API is
+// passed a pointer to a IODebugContext structure that can be used by the
+// storage system to include troubleshooting information. The return values
+// of the APIs is of type IOStatus, which can indicate an error code/sub-code,
+// as well as metadata about the error such as its scope and whether its
+// retryable.
+// NewCompositeEnv can be used to create an Env with a custom FileSystem for
+// DBOptions::env.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FileSystem : public Customizable {
+ public:
+  FileSystem();
+
+  // No copying allowed
+  FileSystem(const FileSystem&) = delete;
+
+  virtual ~FileSystem();
+
+  static const char* Type() { return "FileSystem"; }
+  static const char* kDefaultName() { return "DefaultFileSystem"; }
+
+  // Loads the FileSystem specified by the input value into the result
+  // The CreateFromString alternative should be used; this method may be
+  // deprecated in a future release.
+  static Status Load(const std::string& value,
+                     std::shared_ptr<FileSystem>* result);
+
+  // Loads the FileSystem specified by the input value into the result
+  // @see Customizable for a more detailed description of the parameters and
+  // return codes
+  // @param config_options Controls how the FileSystem is loaded
+  // @param value The name and optional properties describing the file system
+  //      to load.
+  // @param result On success, returns the loaded FileSystem
+  // @return OK if the FileSystem was successfully loaded.
+  // @return not-OK if the load failed.
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value,
+                                 std::shared_ptr<FileSystem>* result);
+
+  // Return a default FileSystem suitable for the current operating
+  // system.
+  static std::shared_ptr<FileSystem> Default();
+
+  // Handles the event when a new DB or a new ColumnFamily starts using the
+  // specified data paths.
+  //
+  // The data paths might be shared by different DBs or ColumnFamilies,
+  // so RegisterDbPaths might be called with the same data paths.
+  // For example, when CreateColumnFamily is called multiple times with the same
+  // data path, RegisterDbPaths will also be called with the same data path.
+  //
+  // If the return status is ok, then the paths must be correspondingly
+  // called in UnregisterDbPaths;
+  // otherwise this method should have no side effect, and UnregisterDbPaths
+  // do not need to be called for the paths.
+  //
+  // Different implementations may take different actions.
+  // By default, it's a no-op and returns Status::OK.
+  virtual Status RegisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
+  // Handles the event a DB or a ColumnFamily stops using the specified data
+  // paths.
+  //
+  // It should be called corresponding to each successful RegisterDbPaths.
+  //
+  // Different implementations may take different actions.
+  // By default, it's a no-op and returns Status::OK.
+  virtual Status UnregisterDbPaths(const std::vector<std::string>& /*paths*/) {
+    return Status::OK();
+  }
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK.  If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus NewSequentialFile(const std::string& fname,
+                                     const FileOptions& file_opts,
+                                     std::unique_ptr<FSSequentialFile>* result,
+                                     IODebugContext* dbg) = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual IOStatus NewRandomAccessFile(
+      const std::string& fname, const FileOptions& file_opts,
+      std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) = 0;
+  // These values match Linux definition
+  // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56
+  enum WriteLifeTimeHint {
+    kWLTHNotSet = 0,  // No hint information set
+    kWLTHNone,        // No hints about write life time
+    kWLTHShort,       // Data written has a short life time
+    kWLTHMedium,      // Data written has a medium life time
+    kWLTHLong,        // Data written has a long life time
+    kWLTHExtreme,     // Data written has an extremely long life time
+  };
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus NewWritableFile(const std::string& fname,
+                                   const FileOptions& file_opts,
+                                   std::unique_ptr<FSWritableFile>* result,
+                                   IODebugContext* dbg) = 0;
+
+  // Create an object that writes to a file with the specified name.
+  // `FSWritableFile::Append()`s will append after any existing content.  If the
+  // file does not already exist, creates it.
+  //
+  // On success, stores a pointer to the file in *result and returns OK.  On
+  // failure stores nullptr in *result and returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus ReopenWritableFile(
+      const std::string& /*fname*/, const FileOptions& /*options*/,
+      std::unique_ptr<FSWritableFile>* /*result*/, IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("ReopenWritableFile");
+  }
+
+  // Reuse an existing file by renaming it and opening it as writable.
+  virtual IOStatus ReuseWritableFile(const std::string& fname,
+                                     const std::string& old_fname,
+                                     const FileOptions& file_opts,
+                                     std::unique_ptr<FSWritableFile>* result,
+                                     IODebugContext* dbg);
+
+  // Open `fname` for random read and write, if file doesn't exist the file
+  // will be created.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual IOStatus NewRandomRWFile(const std::string& /*fname*/,
+                                   const FileOptions& /*options*/,
+                                   std::unique_ptr<FSRandomRWFile>* /*result*/,
+                                   IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "RandomRWFile is not implemented in this FileSystem");
+  }
+
+  // Opens `fname` as a memory-mapped file for read and write (in-place updates
+  // only, i.e., no appends). On success, stores a raw buffer covering the whole
+  // file in `*result`. The file must exist prior to this call.
+  virtual IOStatus NewMemoryMappedFileBuffer(
+      const std::string& /*fname*/,
+      std::unique_ptr<MemoryMappedFileBuffer>* /*result*/) {
+    return IOStatus::NotSupported(
+        "MemoryMappedFileBuffer is not implemented in this FileSystem");
+  }
+
+  // Create an object that represents a directory. Will fail if directory
+  // doesn't exist. If the directory exists, it will open the directory
+  // and create a new Directory object.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure stores nullptr in *result and
+  // returns non-OK.
+  virtual IOStatus NewDirectory(const std::string& name,
+                                const IOOptions& io_opts,
+                                std::unique_ptr<FSDirectory>* result,
+                                IODebugContext* dbg) = 0;
+
+  // Returns OK if the named file exists.
+  //         NotFound if the named file does not exist,
+  //                  the calling process does not have permission to determine
+  //                  whether this file exists, or if the path is invalid.
+  //         IOError if an IO Error was encountered
+  virtual IOStatus FileExists(const std::string& fname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *results are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not have
+  //                  permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+                               std::vector<std::string>* result,
+                               IODebugContext* dbg) = 0;
+
+  // Store in *result the attributes of the children of the specified directory.
+  // In case the implementation lists the directory prior to iterating the files
+  // and files are concurrently deleted, the deleted files will be omitted from
+  // result.
+  // The name attributes are relative to "dir".
+  // Original contents of *results are dropped.
+  // Returns OK if "dir" exists and "*result" contains its children.
+  //         NotFound if "dir" does not exist, the calling process does not have
+  //                  permission to access "dir", or if "dir" is invalid.
+  //         IOError if an IO Error was encountered
+  virtual IOStatus GetChildrenFileAttributes(
+      const std::string& dir, const IOOptions& options,
+      std::vector<FileAttributes>* result, IODebugContext* dbg) {
+    assert(result != nullptr);
+    std::vector<std::string> child_fnames;
+    IOStatus s = GetChildren(dir, options, &child_fnames, dbg);
+    if (!s.ok()) {
+      return s;
+    }
+    result->resize(child_fnames.size());
+    size_t result_size = 0;
+    for (size_t i = 0; i < child_fnames.size(); ++i) {
+      const std::string path = dir + "/" + child_fnames[i];
+      if (!(s = GetFileSize(path, options, &(*result)[result_size].size_bytes,
+                            dbg))
+               .ok()) {
+        if (FileExists(path, options, dbg).IsNotFound()) {
+          // The file may have been deleted since we listed the directory
+          continue;
+        }
+        return s;
+      }
+      (*result)[result_size].name = std::move(child_fnames[i]);
+      result_size++;
+    }
+    result->resize(result_size);
+    return IOStatus::OK();
+  }
+
+// This seems to clash with a macro on Windows, so #undef it here
+#ifdef DeleteFile
+#undef DeleteFile
+#endif
+  // Delete the named file.
+  virtual IOStatus DeleteFile(const std::string& fname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // Truncate the named file to the specified size.
+  virtual IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "Truncate is not supported for this FileSystem");
+  }
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual IOStatus CreateDir(const std::string& dirname,
+                             const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Creates directory if missing. Return Ok if it exists, or successful in
+  // Creating.
+  virtual IOStatus CreateDirIfMissing(const std::string& dirname,
+                                      const IOOptions& options,
+                                      IODebugContext* dbg) = 0;
+
+  // Delete the specified directory.
+  virtual IOStatus DeleteDir(const std::string& dirname,
+                             const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual IOStatus GetFileSize(const std::string& fname,
+                               const IOOptions& options, uint64_t* file_size,
+                               IODebugContext* dbg) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual IOStatus GetFileModificationTime(const std::string& fname,
+                                           const IOOptions& options,
+                                           uint64_t* file_mtime,
+                                           IODebugContext* dbg) = 0;
+  // Rename file src to target.
+  virtual IOStatus RenameFile(const std::string& src, const std::string& target,
+                              const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // Hard Link file src to target.
+  virtual IOStatus LinkFile(const std::string& /*src*/,
+                            const std::string& /*target*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "LinkFile is not supported for this FileSystem");
+  }
+
+  virtual IOStatus NumFileLinks(const std::string& /*fname*/,
+                                const IOOptions& /*options*/,
+                                uint64_t* /*count*/, IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "Getting number of file links is not supported for this FileSystem");
+  }
+
+  virtual IOStatus AreFilesSame(const std::string& /*first*/,
+                                const std::string& /*second*/,
+                                const IOOptions& /*options*/, bool* /*res*/,
+                                IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported(
+        "AreFilesSame is not supported for this FileSystem");
+  }
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                            FileLock** lock, IODebugContext* dbg) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+                              IODebugContext* dbg) = 0;
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or many not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                                    IODebugContext* dbg) = 0;
+
+  // Create and returns a default logger (an instance of EnvLogger) for storing
+  // informational messages. Derived classes can override to provide custom
+  // logger.
+  virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+                             std::shared_ptr<Logger>* result,
+                             IODebugContext* dbg);
+
+  // Get full directory name for this db.
+  virtual IOStatus GetAbsolutePath(const std::string& db_path,
+                                   const IOOptions& options,
+                                   std::string* output_path,
+                                   IODebugContext* dbg) = 0;
+
+  // Sanitize the FileOptions. Typically called by a FileOptions/EnvOptions
+  // copy constructor
+  virtual void SanitizeFileOptions(FileOptions* /*opts*/) const {}
+
+  // OptimizeForLogRead will create a new FileOptions object that is a copy of
+  // the FileOptions in the parameters, but is optimized for reading log files.
+  virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const;
+
+  // OptimizeForManifestRead will create a new FileOptions object that is a copy
+  // of the FileOptions in the parameters, but is optimized for reading manifest
+  // files.
+  virtual FileOptions OptimizeForManifestRead(
+      const FileOptions& file_options) const;
+
+  // OptimizeForLogWrite will create a new FileOptions object that is a copy of
+  // the FileOptions in the parameters, but is optimized for writing log files.
+  // Default implementation returns the copy of the same object.
+  virtual FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                          const DBOptions& db_options) const;
+
+  // OptimizeForManifestWrite will create a new FileOptions object that is a
+  // copy of the FileOptions in the parameters, but is optimized for writing
+  // manifest files. Default implementation returns the copy of the same
+  // object.
+  virtual FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const;
+
+  // OptimizeForCompactionTableWrite will create a new FileOptions object that
+  // is a copy of the FileOptions in the parameters, but is optimized for
+  // writing table files.
+  virtual FileOptions OptimizeForCompactionTableWrite(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& immutable_ops) const;
+
+  // OptimizeForCompactionTableRead will create a new FileOptions object that
+  // is a copy of the FileOptions in the parameters, but is optimized for
+  // reading table files.
+  virtual FileOptions OptimizeForCompactionTableRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const;
+
+  // OptimizeForBlobFileRead will create a new FileOptions object that
+  // is a copy of the FileOptions in the parameters, but is optimized for
+  // reading blob files.
+  virtual FileOptions OptimizeForBlobFileRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const;
+
+// This seems to clash with a macro on Windows, so #undef it here
+#ifdef GetFreeSpace
+#undef GetFreeSpace
+#endif
+
+  // Get the amount of free disk space
+  virtual IOStatus GetFreeSpace(const std::string& /*path*/,
+                                const IOOptions& /*options*/,
+                                uint64_t* /*diskfree*/,
+                                IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("GetFreeSpace");
+  }
+
+  virtual IOStatus IsDirectory(const std::string& /*path*/,
+                               const IOOptions& options, bool* is_dir,
+                               IODebugContext* /*dgb*/) = 0;
+
+  // EXPERIMENTAL
+  // Poll for completion of read IO requests. The Poll() method should call the
+  // callback functions to indicate completion of read requests.
+  // Underlying FS is required to support Poll API. Poll implementation should
+  // ensure that the callback gets called at IO completion, and return only
+  // after the callback has been called.
+  // If Poll returns partial results for any reads, its caller reponsibility to
+  // call Read or ReadAsync in order to get the remaining bytes.
+  //
+  // Default implementation is to return IOStatus::OK.
+
+  virtual IOStatus Poll(std::vector<void*>& /*io_handles*/,
+                        size_t /*min_completions*/) {
+    return IOStatus::OK();
+  }
+
+  // EXPERIMENTAL
+  // Abort the read IO requests submitted asynchronously. Underlying FS is
+  // required to support AbortIO API. AbortIO implementation should ensure that
+  // the all the read requests related to io_handles should be aborted and
+  // it shouldn't call the callback for these io_handles.
+  //
+  // Default implementation is to return IOStatus::OK.
+  virtual IOStatus AbortIO(std::vector<void*>& /*io_handles*/) {
+    return IOStatus::OK();
+  }
+
+  // If you're adding methods here, remember to add them to EnvWrapper too.
+
+ private:
+  void operator=(const FileSystem&);
+};
+
+// A file abstraction for reading sequentially through a file
+class FSSequentialFile {
+ public:
+  FSSequentialFile() {}
+
+  virtual ~FSSequentialFile() {}
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // REQUIRES: External synchronization
+  virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                        char* scratch, IODebugContext* dbg) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower that reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual IOStatus Skip(uint64_t n) = 0;
+
+  // Indicates the upper layers if the current SequentialFile implementation
+  // uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return IOStatus::NotSupported("InvalidateCache not supported.");
+  }
+
+  // Positioned Read for direct I/O
+  // If Direct I/O enabled, offset, n, and scratch should be properly aligned
+  virtual IOStatus PositionedRead(uint64_t /*offset*/, size_t /*n*/,
+                                  const IOOptions& /*options*/,
+                                  Slice* /*result*/, char* /*scratch*/,
+                                  IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("PositionedRead");
+  }
+
+  // EXPERIMENTAL
+  // When available, returns the actual temperature for the file. This is
+  // useful in case some outside process moves a file from one tier to another,
+  // though the temperature is generally expected not to change while a file is
+  // open.
+  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+  // If you're adding methods here, remember to add them to
+  // SequentialFileWrapper too.
+};
+
+// A read IO request structure for use in MultiRead and asynchronous Read APIs.
+struct FSReadRequest {
+  // Input parameter that represents the file offset in bytes.
+  uint64_t offset;
+
+  // Input parameter that represents the length to read in bytes. `result` only
+  // returns fewer bytes if end of file is hit (or `status` is not OK).
+  size_t len;
+
+  // A buffer that MultiRead() can optionally place data in. It can
+  // ignore this and allocate its own buffer.
+  // The lifecycle of scratch will be until IO is completed.
+  //
+  // In case of asynchronous reads, its an output parameter and it will be
+  // maintained until callback has been called. Scratch is allocated by RocksDB
+  // and will be passed to underlying FileSystem.
+  char* scratch;
+
+  // Output parameter set by MultiRead() to point to the data buffer, and
+  // the number of valid bytes
+  //
+  // In case of asynchronous reads, this output parameter is set by Async Read
+  // APIs to point to the data buffer, and
+  // the number of valid bytes.
+  // Slice result should point to scratch i.e the data should
+  // always be read into scratch.
+  Slice result;
+
+  // Output parameter set by underlying FileSystem that represents status of
+  // read request.
+  IOStatus status;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class FSRandomAccessFile {
+ public:
+  FSRandomAccessFile() {}
+
+  virtual ~FSRandomAccessFile() {}
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Safe for concurrent use by multiple threads.
+  // If Direct I/O enabled, offset, n, and scratch should be aligned properly.
+  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                        Slice* result, char* scratch,
+                        IODebugContext* dbg) const = 0;
+
+  // Readahead the file starting from offset by n bytes for caching.
+  // If it's not implemented (default: `NotSupported`), RocksDB will create
+  // internal prefetch buffer to improve read performance.
+  virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("Prefetch");
+  }
+
+  // Read a bunch of blocks as described by reqs. The blocks can
+  // optionally be read in parallel. This is a synchronous call, i.e it
+  // should return after all reads have completed. The reads will be
+  // non-overlapping but can be in any order. If the function return Status
+  // is not ok, status of individual requests will be ignored and return
+  // status will be assumed for all read requests. The function return status
+  // is only meant for errors that occur before processing individual read
+  // requests.
+  virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                             const IOOptions& options, IODebugContext* dbg) {
+    assert(reqs != nullptr);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      FSReadRequest& req = reqs[i];
+      req.status =
+          Read(req.offset, req.len, options, &req.result, req.scratch, dbg);
+    }
+    return IOStatus::OK();
+  }
+
+  // Tries to get an unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+               // compatibility.
+  };
+
+  enum AccessPattern { kNormal, kRandom, kSequential, kWillNeed, kWontNeed };
+
+  virtual void Hint(AccessPattern /*pattern*/) {}
+
+  // Indicates the upper layers if the current RandomAccessFile implementation
+  // uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return IOStatus::NotSupported("InvalidateCache not supported.");
+  }
+
+  // EXPERIMENTAL
+  // This API reads the requested data in FSReadRequest asynchronously. This is
+  // a asynchronous call, i.e it should return after submitting the request.
+  //
+  // When the read request is completed, callback function specified in cb
+  // should be called with arguments cb_arg and the result populated in
+  // FSReadRequest with result and status fileds updated by FileSystem.
+  // cb_arg should be used by the callback to track the original request
+  // submitted.
+  //
+  // This API should also populate io_handle which should be used by
+  // underlying FileSystem to store the context in order to distinguish the read
+  // requests at their side and provide the custom deletion function in del_fn.
+  // RocksDB guarantees that the del_fn for io_handle will be called after
+  // receiving the callback. Furthermore, RocksDB guarantees that if it calls
+  // the Poll API for this io_handle, del_fn will be called after the Poll
+  // returns. RocksDB is responsible for managing the lifetime of io_handle.
+  //
+  // req contains the request offset and size passed as input parameter of read
+  // request and result and status fields are output parameter set by underlying
+  // FileSystem. The data should always be read into scratch field.
+  //
+  // Default implementation is to read the data synchronously.
+  virtual IOStatus ReadAsync(
+      FSReadRequest& req, const IOOptions& opts,
+      std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+      void** /*io_handle*/, IOHandleDeleter* /*del_fn*/, IODebugContext* dbg) {
+    req.status =
+        Read(req.offset, req.len, opts, &(req.result), req.scratch, dbg);
+    cb(req, cb_arg);
+    return IOStatus::OK();
+  }
+
+  // EXPERIMENTAL
+  // When available, returns the actual temperature for the file. This is
+  // useful in case some outside process moves a file from one tier to another,
+  // though the temperature is generally expected not to change while a file is
+  // open.
+  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+  // If you're adding methods here, remember to add them to
+  // RandomAccessFileWrapper too.
+};
+
+// A data structure brings the data verification information, which is
+// used together with data being written to a file.
+struct DataVerificationInfo {
+  // checksum of the data being written.
+  Slice checksum;
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class FSWritableFile {
+ public:
+  FSWritableFile()
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(false) {}
+
+  explicit FSWritableFile(const FileOptions& options)
+      : last_preallocated_block_(0),
+        preallocation_block_size_(0),
+        io_priority_(Env::IO_TOTAL),
+        write_hint_(Env::WLTH_NOT_SET),
+        strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
+
+  virtual ~FSWritableFile() {}
+
+  // Append data to the end of the file
+  // Note: A WritableFile object must support either Append or
+  // PositionedAppend, so the users cannot mix the two.
+  virtual IOStatus Append(const Slice& data, const IOOptions& options,
+                          IODebugContext* dbg) = 0;
+
+  // Append data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if the handoff_checksum_type in FileOptions (currently,
+  // ChecksumType::kCRC32C is set as default) is not supported by this
+  // FSWritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual IOStatus Append(const Slice& data, const IOOptions& options,
+                          const DataVerificationInfo& /* verification_info */,
+                          IODebugContext* dbg) {
+    return Append(data, options, dbg);
+  }
+
+  // PositionedAppend data to the specified offset. The new EOF after append
+  // must be larger than the previous EOF. This is to be used when writes are
+  // not backed by OS buffers and hence has to always start from the start of
+  // the sector. The implementation thus needs to also rewrite the last
+  // partial sector.
+  // Note: PositionAppend does not guarantee moving the file offset after the
+  // write. A WritableFile object must support either Append or
+  // PositionedAppend, so the users cannot mix the two.
+  //
+  // PositionedAppend() can only happen on the page/sector boundaries. For that
+  // reason, if the last write was an incomplete sector we still need to rewind
+  // back to the nearest sector/page and rewrite the portion of it with whatever
+  // we need to add. We need to keep where we stop writing.
+  //
+  // PositionedAppend() can only write whole sectors. For that reason we have to
+  // pad with zeros for the last write and trim the file when closing according
+  // to the position we keep in the previous step.
+  //
+  // PositionedAppend() requires aligned buffer to be passed in. The alignment
+  // required is queried via GetRequiredBufferAlignment()
+  virtual IOStatus PositionedAppend(const Slice& /* data */,
+                                    uint64_t /* offset */,
+                                    const IOOptions& /*options*/,
+                                    IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("PositionedAppend");
+  }
+
+  // PositionedAppend data with verification information.
+  // Note that this API change is experimental and it might be changed in
+  // the future. Currently, RocksDB only generates crc32c based checksum for
+  // the file writes when the checksum handoff option is set.
+  // Expected behavior: if the handoff_checksum_type in FileOptions (currently,
+  // ChecksumType::kCRC32C is set as default) is not supported by this
+  // FSWritableFile, the information in DataVerificationInfo can be ignored
+  // (i.e. does not perform checksum verification).
+  virtual IOStatus PositionedAppend(
+      const Slice& /* data */, uint64_t /* offset */,
+      const IOOptions& /*options*/,
+      const DataVerificationInfo& /* verification_info */,
+      IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("PositionedAppend");
+  }
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole pages writes. The behavior is undefined if called
+  // with other writes to follow.
+  virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::OK();
+  }
+  virtual IOStatus Close(const IOOptions& /*options*/,
+                         IODebugContext* /*dbg*/) = 0;
+
+  virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+  virtual IOStatus Sync(const IOOptions& options,
+                        IODebugContext* dbg) = 0;  // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+    return Sync(options, dbg);
+  }
+
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  virtual bool IsSyncThreadSafe() const { return false; }
+
+  // Indicates the upper layers if the current WritableFile implementation
+  // uses direct IO.
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
+    write_hint_ = hint;
+  }
+
+  /*
+   * If rate limiting is enabled, change the file-granularity priority used in
+   * rate-limiting writes.
+   *
+   * In the presence of finer-granularity priority such as
+   * `WriteOptions::rate_limiter_priority`, this file-granularity priority may
+   * be overridden by a non-Env::IO_TOTAL finer-granularity priority and used as
+   * a fallback for Env::IO_TOTAL finer-granularity priority.
+   *
+   * If rate limiting is not enabled, this call has no effect.
+   */
+  virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; }
+
+  virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
+  virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; }
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize(const IOOptions& /*options*/,
+                               IODebugContext* /*dbg*/) {
+    return 0;
+  }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file.  If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  virtual void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;  // Default implementation to prevent issues with backwards
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) {
+    return IOStatus::NotSupported("InvalidateCache not supported.");
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/,
+                             const IOOptions& options, IODebugContext* dbg) {
+    if (strict_bytes_per_sync_) {
+      return Sync(options, dbg);
+    }
+    return IOStatus::OK();
+  }
+
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs.  This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  virtual void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                            IODebugContext* dbg) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+        (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+          new_last_preallocated_block - last_preallocated_block_;
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks, options, dbg)
+          .PermitUncheckedError();
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  // Pre-allocates space for a file.
+  virtual IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/,
+                            const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+    return IOStatus::OK();
+  }
+
+  // If you're adding methods here, remember to add them to
+  // WritableFileWrapper too.
+
+ protected:
+  size_t preallocation_block_size() { return preallocation_block_size_; }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+  // No copying allowed
+  FSWritableFile(const FSWritableFile&);
+  void operator=(const FSWritableFile&);
+
+ protected:
+  Env::IOPriority io_priority_;
+  Env::WriteLifeTimeHint write_hint_;
+  const bool strict_bytes_per_sync_;
+};
+
+// A file abstraction for random reading and writing.
+class FSRandomRWFile {
+ public:
+  FSRandomRWFile() {}
+
+  virtual ~FSRandomRWFile() {}
+
+  // Indicates if the class makes use of direct I/O
+  // If false you must pass aligned buffer to Write()
+  virtual bool use_direct_io() const { return false; }
+
+  // Use the returned alignment value to allocate
+  // aligned buffer for Direct I/O
+  virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
+
+  // Write bytes in `data` at  offset `offset`, Returns Status::OK() on success.
+  // Pass aligned buffer when use_direct_io() returns true.
+  virtual IOStatus Write(uint64_t offset, const Slice& data,
+                         const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result, provided `scratch` size should be at least `n`.
+  //
+  // After call, result->size() < n only if end of file has been
+  // reached (or non-OK status). Read might fail if called again after
+  // first result->size() < n.
+  //
+  // Returns Status::OK() on success.
+  virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                        Slice* result, char* scratch,
+                        IODebugContext* dbg) const = 0;
+
+  virtual IOStatus Flush(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) {
+    return Sync(options, dbg);
+  }
+
+  virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // EXPERIMENTAL
+  // When available, returns the actual temperature for the file. This is
+  // useful in case some outside process moves a file from one tier to another,
+  // though the temperature is generally expected not to change while a file is
+  // open.
+  virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
+
+  // If you're adding methods here, remember to add them to
+  // RandomRWFileWrapper too.
+
+  // No copying allowed
+  FSRandomRWFile(const RandomRWFile&) = delete;
+  FSRandomRWFile& operator=(const RandomRWFile&) = delete;
+};
+
+// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer.
+// Subclasses should release the mapping upon destruction.
+class FSMemoryMappedFileBuffer {
+ public:
+  FSMemoryMappedFileBuffer(void* _base, size_t _length)
+      : base_(_base), length_(_length) {}
+
+  virtual ~FSMemoryMappedFileBuffer() = 0;
+
+  // We do not want to unmap this twice. We can make this class
+  // movable if desired, however, since
+  FSMemoryMappedFileBuffer(const FSMemoryMappedFileBuffer&) = delete;
+  FSMemoryMappedFileBuffer& operator=(const FSMemoryMappedFileBuffer&) = delete;
+
+  void* GetBase() const { return base_; }
+  size_t GetLen() const { return length_; }
+
+ protected:
+  void* base_;
+  const size_t length_;
+};
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class FSDirectory {
+ public:
+  virtual ~FSDirectory() {}
+  // Fsync directory. Can be called concurrently from multiple threads.
+  virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0;
+
+  // FsyncWithDirOptions after renaming a file. Depends on the filesystem, it
+  // may fsync directory or just the renaming file (e.g. btrfs). By default, it
+  // just calls directory fsync.
+  virtual IOStatus FsyncWithDirOptions(
+      const IOOptions& options, IODebugContext* dbg,
+      const DirFsyncOptions& /*dir_fsync_options*/) {
+    return Fsync(options, dbg);
+  }
+
+  // Close directory
+  virtual IOStatus Close(const IOOptions& /*options*/,
+                         IODebugContext* /*dbg*/) {
+    return IOStatus::NotSupported("Close");
+  }
+
+  virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const {
+    return 0;
+  }
+
+  // If you're adding methods here, remember to add them to
+  // DirectoryWrapper too.
+};
+
+// Below are helpers for wrapping most of the classes in this file.
+// They forward all calls to another instance of the class.
+// Useful when wrapping the default implementations.
+// Typical usage is to inherit your wrapper from *Wrapper, e.g.:
+//
+// class MySequentialFileWrapper : public
+// ROCKSDB_NAMESPACE::FSSequentialFileWrapper {
+//  public:
+//   MySequentialFileWrapper(ROCKSDB_NAMESPACE::FSSequentialFile* target):
+//     ROCKSDB_NAMESPACE::FSSequentialFileWrapper(target) {}
+//   Status Read(size_t n, FileSystem::IOOptions& options, Slice* result,
+//               char* scratch, FileSystem::IODebugContext* dbg) override {
+//     cout << "Doing a read of size " << n << "!" << endl;
+//     return ROCKSDB_NAMESPACE::FSSequentialFileWrapper::Read(n, options,
+//     result,
+//                                                 scratch, dbg);
+//   }
+//   // All other methods are forwarded to target_ automatically.
+// };
+//
+// This is often more convenient than inheriting the class directly because
+// (a) Don't have to override and forward all methods - the Wrapper will
+//     forward everything you're not explicitly overriding.
+// (b) Don't need to update the wrapper when more methods are added to the
+//     rocksdb class. Unless you actually want to override the behavior.
+//     (And unless rocksdb people forgot to update the *Wrapper class.)
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class FileSystemWrapper : public FileSystem {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit FileSystemWrapper(const std::shared_ptr<FileSystem>& t);
+  ~FileSystemWrapper() override {}
+
+  // Return the target to which this Env forwards all calls
+  FileSystem* target() const { return target_.get(); }
+
+  // The following text is boilerplate that forwards all methods to target()
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* dbg) override {
+    return target_->NewSequentialFile(f, file_opts, r, dbg);
+  }
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* dbg) override {
+    return target_->NewRandomAccessFile(f, file_opts, r, dbg);
+  }
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* dbg) override {
+    return target_->NewWritableFile(f, file_opts, r, dbg);
+  }
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override {
+    return target_->ReopenWritableFile(fname, file_opts, result, dbg);
+  }
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* r,
+                             IODebugContext* dbg) override {
+    return target_->ReuseWritableFile(fname, old_fname, file_opts, r, dbg);
+  }
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override {
+    return target_->NewRandomRWFile(fname, file_opts, result, dbg);
+  }
+  IOStatus NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override {
+    return target_->NewMemoryMappedFileBuffer(fname, result);
+  }
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override {
+    return target_->NewDirectory(name, io_opts, result, dbg);
+  }
+  IOStatus FileExists(const std::string& f, const IOOptions& io_opts,
+                      IODebugContext* dbg) override {
+    return target_->FileExists(f, io_opts, dbg);
+  }
+  IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+                       std::vector<std::string>* r,
+                       IODebugContext* dbg) override {
+    return target_->GetChildren(dir, io_opts, r, dbg);
+  }
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& options,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* dbg) override {
+    return target_->GetChildrenFileAttributes(dir, options, result, dbg);
+  }
+  IOStatus DeleteFile(const std::string& f, const IOOptions& options,
+                      IODebugContext* dbg) override {
+    return target_->DeleteFile(f, options, dbg);
+  }
+  IOStatus Truncate(const std::string& fname, size_t size,
+                    const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Truncate(fname, size, options, dbg);
+  }
+  IOStatus CreateDir(const std::string& d, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    return target_->CreateDir(d, options, dbg);
+  }
+  IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options,
+                              IODebugContext* dbg) override {
+    return target_->CreateDirIfMissing(d, options, dbg);
+  }
+  IOStatus DeleteDir(const std::string& d, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    return target_->DeleteDir(d, options, dbg);
+  }
+  IOStatus GetFileSize(const std::string& f, const IOOptions& options,
+                       uint64_t* s, IODebugContext* dbg) override {
+    return target_->GetFileSize(f, options, s, dbg);
+  }
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& options,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* dbg) override {
+    return target_->GetFileModificationTime(fname, options, file_mtime, dbg);
+  }
+
+  IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options,
+                           std::string* output_path,
+                           IODebugContext* dbg) override {
+    return target_->GetAbsolutePath(db_path, options, output_path, dbg);
+  }
+
+  IOStatus RenameFile(const std::string& s, const std::string& t,
+                      const IOOptions& options, IODebugContext* dbg) override {
+    return target_->RenameFile(s, t, options, dbg);
+  }
+
+  IOStatus LinkFile(const std::string& s, const std::string& t,
+                    const IOOptions& options, IODebugContext* dbg) override {
+    return target_->LinkFile(s, t, options, dbg);
+  }
+
+  IOStatus NumFileLinks(const std::string& fname, const IOOptions& options,
+                        uint64_t* count, IODebugContext* dbg) override {
+    return target_->NumFileLinks(fname, options, count, dbg);
+  }
+
+  IOStatus AreFilesSame(const std::string& first, const std::string& second,
+                        const IOOptions& options, bool* res,
+                        IODebugContext* dbg) override {
+    return target_->AreFilesSame(first, second, options, res, dbg);
+  }
+
+  IOStatus LockFile(const std::string& f, const IOOptions& options,
+                    FileLock** l, IODebugContext* dbg) override {
+    return target_->LockFile(f, options, l, dbg);
+  }
+
+  IOStatus UnlockFile(FileLock* l, const IOOptions& options,
+                      IODebugContext* dbg) override {
+    return target_->UnlockFile(l, options, dbg);
+  }
+
+  IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                            IODebugContext* dbg) override {
+    return target_->GetTestDirectory(options, path, dbg);
+  }
+  IOStatus NewLogger(const std::string& fname, const IOOptions& options,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* dbg) override {
+    return target_->NewLogger(fname, options, result, dbg);
+  }
+
+  void SanitizeFileOptions(FileOptions* opts) const override {
+    target_->SanitizeFileOptions(opts);
+  }
+
+  FileOptions OptimizeForLogRead(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForLogRead(file_options);
+  }
+  FileOptions OptimizeForManifestRead(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForManifestRead(file_options);
+  }
+  FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                  const DBOptions& db_options) const override {
+    return target_->OptimizeForLogWrite(file_options, db_options);
+  }
+  FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const override {
+    return target_->OptimizeForManifestWrite(file_options);
+  }
+  FileOptions OptimizeForCompactionTableWrite(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& immutable_ops) const override {
+    return target_->OptimizeForCompactionTableWrite(file_options,
+                                                    immutable_ops);
+  }
+  FileOptions OptimizeForCompactionTableRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_->OptimizeForCompactionTableRead(file_options, db_options);
+  }
+  FileOptions OptimizeForBlobFileRead(
+      const FileOptions& file_options,
+      const ImmutableDBOptions& db_options) const override {
+    return target_->OptimizeForBlobFileRead(file_options, db_options);
+  }
+  IOStatus GetFreeSpace(const std::string& path, const IOOptions& options,
+                        uint64_t* diskfree, IODebugContext* dbg) override {
+    return target_->GetFreeSpace(path, options, diskfree, dbg);
+  }
+  IOStatus IsDirectory(const std::string& path, const IOOptions& options,
+                       bool* is_dir, IODebugContext* dbg) override {
+    return target_->IsDirectory(path, options, is_dir, dbg);
+  }
+
+  const Customizable* Inner() const override { return target_.get(); }
+  Status PrepareOptions(const ConfigOptions& options) override;
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& config_options,
+                               const std::string& header) const override;
+#endif  // ROCKSDB_LITE
+
+  virtual IOStatus Poll(std::vector<void*>& io_handles,
+                        size_t min_completions) override {
+    return target_->Poll(io_handles, min_completions);
+  }
+
+  virtual IOStatus AbortIO(std::vector<void*>& io_handles) override {
+    return target_->AbortIO(io_handles);
+  }
+
+ protected:
+  std::shared_ptr<FileSystem> target_;
+};
+
+class FSSequentialFileWrapper : public FSSequentialFile {
+ public:
+  // Creates a FileWrapper around the input File object and without
+  // taking ownership of the object
+  explicit FSSequentialFileWrapper(FSSequentialFile* t) : target_(t) {}
+
+  FSSequentialFile* target() const { return target_; }
+
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override {
+    return target_->Read(n, options, result, scratch, dbg);
+  }
+  IOStatus Skip(uint64_t n) override { return target_->Skip(n); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override {
+    return target_->PositionedRead(offset, n, options, result, scratch, dbg);
+  }
+  Temperature GetTemperature() const override {
+    return target_->GetTemperature();
+  }
+
+ private:
+  FSSequentialFile* target_;
+};
+
+class FSSequentialFileOwnerWrapper : public FSSequentialFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSSequentialFileOwnerWrapper(std::unique_ptr<FSSequentialFile>&& t)
+      : FSSequentialFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSSequentialFile> guard_;
+};
+
+class FSRandomAccessFileWrapper : public FSRandomAccessFile {
+ public:
+  // Creates a FileWrapper around the input File object and without
+  // taking ownership of the object
+  explicit FSRandomAccessFileWrapper(FSRandomAccessFile* t) : target_(t) {}
+
+  FSRandomAccessFile* target() const { return target_; }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    return target_->Read(offset, n, options, result, scratch, dbg);
+  }
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override {
+    return target_->MultiRead(reqs, num_reqs, options, dbg);
+  }
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    return target_->Prefetch(offset, n, options, dbg);
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  };
+  void Hint(AccessPattern pattern) override { target_->Hint(pattern); }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(const FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override {
+    return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg);
+  }
+  Temperature GetTemperature() const override {
+    return target_->GetTemperature();
+  }
+
+ private:
+  std::unique_ptr<FSRandomAccessFile> guard_;
+  FSRandomAccessFile* target_;
+};
+
+class FSRandomAccessFileOwnerWrapper : public FSRandomAccessFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSRandomAccessFileOwnerWrapper(
+      std::unique_ptr<FSRandomAccessFile>&& t)
+      : FSRandomAccessFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSRandomAccessFile> guard_;
+};
+
+class FSWritableFileWrapper : public FSWritableFile {
+ public:
+  // Creates a FileWrapper around the input File object and without
+  // taking ownership of the object
+  explicit FSWritableFileWrapper(FSWritableFile* t) : target_(t) {}
+
+  FSWritableFile* target() const { return target_; }
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override {
+    return target_->Append(data, options, dbg);
+  }
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& verification_info,
+                  IODebugContext* dbg) override {
+    return target_->Append(data, options, verification_info, dbg);
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override {
+    return target_->PositionedAppend(data, offset, options, dbg);
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& verification_info,
+                            IODebugContext* dbg) override {
+    return target_->PositionedAppend(data, offset, options, verification_info,
+                                     dbg);
+  }
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    return target_->Truncate(size, options, dbg);
+  }
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Close(options, dbg);
+  }
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Flush(options, dbg);
+  }
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Sync(options, dbg);
+  }
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Fsync(options, dbg);
+  }
+  bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+
+  void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override {
+    target_->SetWriteLifeTimeHint(hint);
+  }
+
+  Env::WriteLifeTimeHint GetWriteLifeTimeHint() override {
+    return target_->GetWriteLifeTimeHint();
+  }
+
+  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->GetFileSize(options, dbg);
+  }
+
+  void SetPreallocationBlockSize(size_t size) override {
+    target_->SetPreallocationBlockSize(size);
+  }
+
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    target_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    return target_->RangeSync(offset, nbytes, options, dbg);
+  }
+
+  void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    target_->PrepareWrite(offset, len, options, dbg);
+  }
+
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+                    IODebugContext* dbg) override {
+    return target_->Allocate(offset, len, options, dbg);
+  }
+
+ private:
+  FSWritableFile* target_;
+};
+
+class FSWritableFileOwnerWrapper : public FSWritableFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSWritableFileOwnerWrapper(std::unique_ptr<FSWritableFile>&& t)
+      : FSWritableFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSWritableFile> guard_;
+};
+
+class FSRandomRWFileWrapper : public FSRandomRWFile {
+ public:
+  // Creates a FileWrapper around the input File object and without
+  // taking ownership of the object
+  explicit FSRandomRWFileWrapper(FSRandomRWFile* t) : target_(t) {}
+
+  FSRandomRWFile* target() const { return target_; }
+
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override {
+    return target_->Write(offset, data, options, dbg);
+  }
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    return target_->Read(offset, n, options, result, scratch, dbg);
+  }
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Flush(options, dbg);
+  }
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Sync(options, dbg);
+  }
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Fsync(options, dbg);
+  }
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Close(options, dbg);
+  }
+  Temperature GetTemperature() const override {
+    return target_->GetTemperature();
+  }
+
+ private:
+  FSRandomRWFile* target_;
+};
+
+class FSRandomRWFileOwnerWrapper : public FSRandomRWFileWrapper {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSRandomRWFileOwnerWrapper(std::unique_ptr<FSRandomRWFile>&& t)
+      : FSRandomRWFileWrapper(t.get()), guard_(std::move(t)) {}
+
+ private:
+  std::unique_ptr<FSRandomRWFile> guard_;
+};
+
+class FSDirectoryWrapper : public FSDirectory {
+ public:
+  // Creates a FileWrapper around the input File object and takes
+  // ownership of the object
+  explicit FSDirectoryWrapper(std::unique_ptr<FSDirectory>&& t)
+      : guard_(std::move(t)) {
+    target_ = guard_.get();
+  }
+
+  // Creates a FileWrapper around the input File object and without
+  // taking ownership of the object
+  explicit FSDirectoryWrapper(FSDirectory* t) : target_(t) {}
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Fsync(options, dbg);
+  }
+
+  IOStatus FsyncWithDirOptions(
+      const IOOptions& options, IODebugContext* dbg,
+      const DirFsyncOptions& dir_fsync_options) override {
+    return target_->FsyncWithDirOptions(options, dbg, dir_fsync_options);
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    return target_->Close(options, dbg);
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+
+ private:
+  std::unique_ptr<FSDirectory> guard_;
+  FSDirectory* target_;
+};
+
+// A utility routine: write "data" to the named file.
+extern IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
+                                  const std::string& fname,
+                                  bool should_sync = false);
+
+// A utility routine: read contents of named file into *data
+extern IOStatus ReadFileToString(FileSystem* fs, const std::string& fname,
+                                 std::string* data);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/filter_policy.h b/src/rocksdb/include/rocksdb/filter_policy.h
new file mode 100644
index 000000000..954d15b4a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/filter_policy.h
@@ -0,0 +1,206 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys.  These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks form a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#pragma once
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+struct BlockBasedTableOptions;
+struct ConfigOptions;
+
+// As of RocksDB 7.0, the details of these classes are internal
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// Contextual information passed to BloomFilterPolicy at filter building time.
+// Used in overriding FilterPolicy::GetBuilderWithContext(). References other
+// structs because this is expected to be a temporary, stack-allocated object.
+struct FilterBuildingContext {
+  // This constructor is for internal use only and subject to change.
+  FilterBuildingContext(const BlockBasedTableOptions& table_options);
+
+  // Options for the table being built
+  const BlockBasedTableOptions& table_options;
+
+  // BEGIN from (DB|ColumnFamily)Options in effect at table creation time
+  CompactionStyle compaction_style = kCompactionStyleLevel;
+
+  // Number of LSM levels, or -1 if unknown
+  int num_levels = -1;
+
+  // An optional logger for reporting errors, warnings, etc.
+  Logger* info_log = nullptr;
+  // END from (DB|ColumnFamily)Options
+
+  // Name of the column family for the table (or empty string if unknown)
+  // TODO: consider changing to Slice
+  std::string column_family_name;
+
+  // The table level at time of constructing the SST file, or -1 if unknown
+  // or N/A as in SstFileWriter. (The table file could later be used at a
+  // different level.)
+  int level_at_creation = -1;
+
+  // True if known to be going into bottommost sorted run for applicable
+  // key range (which might not even be last level with data). False
+  // otherwise.
+  bool is_bottommost = false;
+
+  // Reason for creating the file with the filter
+  TableFileCreationReason reason = TableFileCreationReason::kMisc;
+};
+
+// Determines what kind of filter (if any) to generate in SST files, and under
+// which conditions. API users can create custom filter policies that
+// defer to other built-in policies (see NewBloomFilterPolicy and
+// NewRibbonFilterPolicy) based on the context provided to
+// GetBuilderWithContext.
+class FilterPolicy : public Customizable {
+ public:
+  virtual ~FilterPolicy();
+  static const char* Type() { return "FilterPolicy"; }
+
+  // The name used for identifying whether a filter on disk is readable
+  // by this FilterPolicy. If this FilterPolicy is part of a family that
+  // can read each others filters, such as built-in BloomFilterPolcy and
+  // RibbonFilterPolicy, the CompatibilityName is a shared family name,
+  // while kinds of filters in the family can have distinct Customizable
+  // Names. This function is pure virtual so that wrappers around built-in
+  // policies are prompted to defer to CompatibilityName() of the wrapped
+  // policy, which is important for compatibility.
+  //
+  // For custom filter policies that are not part of a read-compatible
+  // family (rare), implementations may return Name().
+  virtual const char* CompatibilityName() const = 0;
+
+  // Creates a new FilterPolicy based on the input value string and returns the
+  // result The value might be an ID, and ID with properties, or an old-style
+  // policy string.
+  // The value describes the FilterPolicy being created.
+  // For BloomFilters, value may be a ":"-delimited value of the form:
+  //   "bloomfilter:[bits_per_key]",
+  //   e.g. ""bloomfilter:4"
+  //   The above string is equivalent to calling NewBloomFilterPolicy(4).
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& value,
+                                 std::shared_ptr<const FilterPolicy>* result);
+
+  // Return a new FilterBitsBuilder for constructing full or partitioned
+  // filter blocks, or return nullptr to indicate "no filter". Custom
+  // implementations should defer to a built-in FilterPolicy to get a
+  // new FilterBitsBuilder, but the FilterBuildingContext can be used
+  // to decide which built-in FilterPolicy to defer to.
+  virtual FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const = 0;
+
+  // Return a new FilterBitsReader for full or partitioned filter blocks.
+  // Caller retains ownership of any buffer pointed to by the input Slice.
+  // Custom implementation should defer to GetFilterBitsReader on any
+  // built-in FilterPolicy, which can read filters generated by any other
+  // built-in FilterPolicy.
+  virtual FilterBitsReader* GetFilterBitsReader(
+      const Slice& /*contents*/) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key. See
+// https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
+//
+// bits_per_key: average bits allocated per key in bloom filter. A good
+// choice is 9.9, which yields a filter with ~ 1% false positive rate.
+// When format_version < 5, the value will be rounded to the nearest
+// integer. Recommend using no more than three decimal digits after the
+// decimal point, as in 6.667.
+//
+// To avoid configurations that are unlikely to produce good filtering
+// value for the CPU overhead, bits_per_key < 0.5 is rounded down to 0.0
+// which means "generate no filter", and 0.5 <= bits_per_key < 1.0 is
+// rounded up to 1.0, for a 62% FP rate.
+//
+// The caller is responsible for eventually deleting the result, though
+// this is typically handled automatically with BlockBasedTableOptions:
+//   table_options.filter_policy.reset(NewBloomFilterPolicy(...));
+//
+// As of RocksDB 7.0, the use_block_based_builder parameter is ignored.
+// (The old, inefficient block-based filter is no longer accessible in
+// the public API.)
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys.  For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(
+    double bits_per_key, bool IGNORED_use_block_based_builder = false);
+
+// A new Bloom alternative that saves about 30% space compared to
+// Bloom filters, with similar query times but roughly 3-4x CPU time
+// and 3x temporary space usage during construction.  For example, if
+// you pass in 10 for bloom_equivalent_bits_per_key, you'll get the same
+// 0.95% FP rate as Bloom filter but only using about 7 bits per key.
+//
+// The space savings of Ribbon filters makes sense for lower (higher
+// numbered; larger; longer-lived) levels of LSM, whereas the speed of
+// Bloom filters make sense for highest levels of LSM. Setting
+// bloom_before_level allows for this design with Level and Universal
+// compaction styles. For example, bloom_before_level=1 means that Bloom
+// filters will be used in level 0, including flushes, and Ribbon
+// filters elsewhere, including FIFO compaction and external SST files.
+// For this option, memtable flushes are considered level -1 (so that
+// flushes can be distinguished from intra-L0 compaction).
+// bloom_before_level=0 (default) -> Generate Bloom filters only for
+// flushes under Level and Universal compaction styles.
+// bloom_before_level=-1 -> Always generate Ribbon filters (except in
+// some extreme or exceptional cases).
+//
+// Ribbon filters are compatible with RocksDB >= 6.15.0. Earlier
+// versions reading the data will behave as if no filter was used
+// (degraded performance until compaction rebuilds filters). All
+// built-in FilterPolicies (Bloom or Ribbon) are able to read other
+// kinds of built-in filters.
+//
+// Note: the current Ribbon filter schema uses some extra resources
+// when constructing very large filters. For example, for 100 million
+// keys in a single filter (one SST file without partitioned filters),
+// 3GB of temporary, untracked memory is used, vs. 1GB for Bloom.
+// However, the savings in filter space from just ~60 open SST files
+// makes up for the additional temporary memory use.
+//
+// Also consider using optimize_filters_for_memory to save filter
+// memory.
+extern const FilterPolicy* NewRibbonFilterPolicy(
+    double bloom_equivalent_bits_per_key, int bloom_before_level = 0);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/flush_block_policy.h b/src/rocksdb/include/rocksdb/flush_block_policy.h
new file mode 100644
index 000000000..7a5dd957e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/flush_block_policy.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class BlockBuilder;
+struct ConfigOptions;
+struct Options;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class FlushBlockPolicy {
+ public:
+  // Keep track of the key/value sequences and return the boolean value to
+  // determine if table builder should flush current data block.
+  virtual bool Update(const Slice& key, const Slice& value) = 0;
+
+  virtual ~FlushBlockPolicy() {}
+};
+
+class FlushBlockPolicyFactory : public Customizable {
+ public:
+  static const char* Type() { return "FlushBlockPolicyFactory"; }
+
+  // Creates a FlushBlockPolicyFactory based on the input value.
+  // By default, this method can create EveryKey or BySize PolicyFactory,
+  // which take now config_options.
+  static Status CreateFromString(
+      const ConfigOptions& config_options, const std::string& value,
+      std::shared_ptr<FlushBlockPolicyFactory>* result);
+
+  // Return a new block flush policy that flushes data blocks by data size.
+  // FlushBlockPolicy may need to access the metadata of the data block
+  // builder to determine when to flush the blocks.
+  //
+  // Callers must delete the result after any database that is using the
+  // result has been closed.
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& table_options,
+      const BlockBuilder& data_block_builder) const = 0;
+
+  virtual ~FlushBlockPolicyFactory() {}
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  FlushBlockBySizePolicyFactory();
+
+  static const char* kClassName() { return "FlushBlockBySizePolicyFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& table_options,
+      const BlockBuilder& data_block_builder) const override;
+
+  static FlushBlockPolicy* NewFlushBlockPolicy(
+      const uint64_t size, const int deviation,
+      const BlockBuilder& data_block_builder);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/functor_wrapper.h b/src/rocksdb/include/rocksdb/functor_wrapper.h
new file mode 100644
index 000000000..17b021bf7
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/functor_wrapper.h
@@ -0,0 +1,56 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace detail {
+template <std::size_t...>
+struct IndexSequence {};
+
+template <std::size_t N, std::size_t... Next>
+struct IndexSequenceHelper
+    : public IndexSequenceHelper<N - 1U, N - 1U, Next...> {};
+
+template <std::size_t... Next>
+struct IndexSequenceHelper<0U, Next...> {
+  using type = IndexSequence<Next...>;
+};
+
+template <std::size_t N>
+using make_index_sequence = typename IndexSequenceHelper<N>::type;
+
+template <typename Function, typename Tuple, size_t... I>
+void call(Function f, Tuple t, IndexSequence<I...>) {
+  f(std::get<I>(t)...);
+}
+
+template <typename Function, typename Tuple>
+void call(Function f, Tuple t) {
+  static constexpr auto size = std::tuple_size<Tuple>::value;
+  call(f, t, make_index_sequence<size>{});
+}
+}  // namespace detail
+
+template <typename... Args>
+class FunctorWrapper {
+ public:
+  explicit FunctorWrapper(std::function<void(Args...)> functor, Args &&...args)
+      : functor_(std::move(functor)), args_(std::forward<Args>(args)...) {}
+
+  void invoke() { detail::call(functor_, args_); }
+
+ private:
+  std::function<void(Args...)> functor_;
+  std::tuple<Args...> args_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/io_status.h b/src/rocksdb/include/rocksdb/io_status.h
new file mode 100644
index 000000000..0bf5e939a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/io_status.h
@@ -0,0 +1,244 @@
+// Copyright (c) 2019-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// An IOStatus encapsulates the result of an operation.  It may indicate
+// success, or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on an IOStatus without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same IOStatus must use
+// external synchronization.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#ifdef OS_WIN
+#include <string.h>
+#endif
+#include <cstring>
+
+#include "status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IOStatus : public Status {
+ public:
+  using Code = Status::Code;
+  using SubCode = Status::SubCode;
+
+  enum IOErrorScope : unsigned char {
+    kIOErrorScopeFileSystem,
+    kIOErrorScopeFile,
+    kIOErrorScopeRange,
+    kIOErrorScopeMax,
+  };
+
+  // Create a success status.
+  IOStatus() : IOStatus(kOk, kNone) {}
+  ~IOStatus() {}
+
+  // Copy the specified status.
+  IOStatus(const IOStatus& s);
+  IOStatus& operator=(const IOStatus& s);
+  IOStatus(IOStatus&& s) noexcept;
+  IOStatus& operator=(IOStatus&& s) noexcept;
+  bool operator==(const IOStatus& rhs) const;
+  bool operator!=(const IOStatus& rhs) const;
+
+  void SetRetryable(bool retryable) { retryable_ = retryable; }
+  void SetDataLoss(bool data_loss) { data_loss_ = data_loss; }
+  void SetScope(IOErrorScope scope) {
+    scope_ = static_cast<unsigned char>(scope);
+  }
+
+  bool GetRetryable() const { return retryable_; }
+  bool GetDataLoss() const { return data_loss_; }
+  IOErrorScope GetScope() const { return static_cast<IOErrorScope>(scope_); }
+
+  // Return a success status.
+  static IOStatus OK() { return IOStatus(); }
+
+  static IOStatus NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kNotSupported, msg, msg2);
+  }
+  static IOStatus NotSupported(SubCode msg = kNone) {
+    return IOStatus(kNotSupported, msg);
+  }
+
+  // Return error status of an appropriate type.
+  static IOStatus NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kNotFound, msg, msg2);
+  }
+  // Fast path for not found without malloc;
+  static IOStatus NotFound(SubCode msg = kNone) {
+    return IOStatus(kNotFound, msg);
+  }
+
+  static IOStatus Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kCorruption, msg, msg2);
+  }
+  static IOStatus Corruption(SubCode msg = kNone) {
+    return IOStatus(kCorruption, msg);
+  }
+
+  static IOStatus InvalidArgument(const Slice& msg,
+                                  const Slice& msg2 = Slice()) {
+    return IOStatus(kInvalidArgument, msg, msg2);
+  }
+  static IOStatus InvalidArgument(SubCode msg = kNone) {
+    return IOStatus(kInvalidArgument, msg);
+  }
+
+  static IOStatus IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kIOError, msg, msg2);
+  }
+  static IOStatus IOError(SubCode msg = kNone) {
+    return IOStatus(kIOError, msg);
+  }
+
+  static IOStatus Busy(SubCode msg = kNone) { return IOStatus(kBusy, msg); }
+  static IOStatus Busy(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kBusy, msg, msg2);
+  }
+
+  static IOStatus TimedOut(SubCode msg = kNone) {
+    return IOStatus(kTimedOut, msg);
+  }
+  static IOStatus TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kTimedOut, msg, msg2);
+  }
+
+  static IOStatus NoSpace() { return IOStatus(kIOError, kNoSpace); }
+  static IOStatus NoSpace(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kIOError, kNoSpace, msg, msg2);
+  }
+
+  static IOStatus PathNotFound() { return IOStatus(kIOError, kPathNotFound); }
+  static IOStatus PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kIOError, kPathNotFound, msg, msg2);
+  }
+
+  static IOStatus IOFenced() { return IOStatus(kIOError, kIOFenced); }
+  static IOStatus IOFenced(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kIOError, kIOFenced, msg, msg2);
+  }
+
+  static IOStatus Aborted(SubCode msg = kNone) {
+    return IOStatus(kAborted, msg);
+  }
+  static IOStatus Aborted(const Slice& msg, const Slice& msg2 = Slice()) {
+    return IOStatus(kAborted, msg, msg2);
+  }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  // std::string ToString() const;
+
+ private:
+  friend IOStatus status_to_io_status(Status&&);
+
+  explicit IOStatus(Code _code, SubCode _subcode = kNone)
+      : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) {}
+
+  IOStatus(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2);
+  IOStatus(Code _code, const Slice& msg, const Slice& msg2)
+      : IOStatus(_code, kNone, msg, msg2) {}
+};
+
+inline IOStatus::IOStatus(Code _code, SubCode _subcode, const Slice& msg,
+                          const Slice& msg2)
+    : Status(_code, _subcode, false, false, kIOErrorScopeFileSystem) {
+  assert(code_ != kOk);
+  assert(subcode_ != kMaxSubCode);
+  const size_t len1 = msg.size();
+  const size_t len2 = msg2.size();
+  const size_t size = len1 + (len2 ? (2 + len2) : 0);
+  char* const result = new char[size + 1];  // +1 for null terminator
+  memcpy(result, msg.data(), len1);
+  if (len2) {
+    result[len1] = ':';
+    result[len1 + 1] = ' ';
+    memcpy(result + len1 + 2, msg2.data(), len2);
+  }
+  result[size] = '\0';  // null terminator for C style string
+  state_.reset(result);
+}
+
+inline IOStatus::IOStatus(const IOStatus& s) : Status(s.code_, s.subcode_) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  s.checked_ = true;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  retryable_ = s.retryable_;
+  data_loss_ = s.data_loss_;
+  scope_ = s.scope_;
+  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+}
+inline IOStatus& IOStatus::operator=(const IOStatus& s) {
+  // The following condition catches both aliasing (when this == &s),
+  // and the common case where both s and *this are ok.
+  if (this != &s) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    s.checked_ = true;
+    checked_ = false;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+    code_ = s.code_;
+    subcode_ = s.subcode_;
+    retryable_ = s.retryable_;
+    data_loss_ = s.data_loss_;
+    scope_ = s.scope_;
+    state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+  }
+  return *this;
+}
+
+inline IOStatus::IOStatus(IOStatus&& s) noexcept : IOStatus() {
+  *this = std::move(s);
+}
+
+inline IOStatus& IOStatus::operator=(IOStatus&& s) noexcept {
+  if (this != &s) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    s.checked_ = true;
+    checked_ = false;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+    code_ = std::move(s.code_);
+    s.code_ = kOk;
+    subcode_ = std::move(s.subcode_);
+    s.subcode_ = kNone;
+    retryable_ = s.retryable_;
+    data_loss_ = s.data_loss_;
+    scope_ = s.scope_;
+    s.scope_ = kIOErrorScopeFileSystem;
+    state_ = std::move(s.state_);
+  }
+  return *this;
+}
+
+inline bool IOStatus::operator==(const IOStatus& rhs) const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  checked_ = true;
+  rhs.checked_ = true;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  return (code_ == rhs.code_);
+}
+
+inline bool IOStatus::operator!=(const IOStatus& rhs) const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  checked_ = true;
+  rhs.checked_ = true;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  return !(*this == rhs);
+}
+
+inline IOStatus status_to_io_status(Status&& status) {
+  IOStatus io_s;
+  Status& s = io_s;
+  s = std::move(status);
+  return io_s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/iostats_context.h b/src/rocksdb/include/rocksdb/iostats_context.h
new file mode 100644
index 000000000..559d44c57
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/iostats_context.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/perf_level.h"
+
+// A thread local context for gathering io-stats efficiently and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
+
+namespace ROCKSDB_NAMESPACE {
+
+// EXPERIMENTAL: the IO statistics for tiered storage. It matches with each
+// item in Temperature class.
+struct FileIOByTemperature {
+  // the number of bytes read to Temperature::kHot file
+  uint64_t hot_file_bytes_read;
+  // the number of bytes read to Temperature::kWarm file
+  uint64_t warm_file_bytes_read;
+  // the number of bytes read to Temperature::kCold file
+  uint64_t cold_file_bytes_read;
+  // total number of reads to Temperature::kHot file
+  uint64_t hot_file_read_count;
+  // total number of reads to Temperature::kWarm file
+  uint64_t warm_file_read_count;
+  // total number of reads to Temperature::kCold file
+  uint64_t cold_file_read_count;
+  // reset all the statistics to 0.
+  void Reset() {
+    hot_file_bytes_read = 0;
+    warm_file_bytes_read = 0;
+    cold_file_bytes_read = 0;
+    hot_file_read_count = 0;
+    warm_file_read_count = 0;
+    cold_file_read_count = 0;
+  }
+};
+
+struct IOStatsContext {
+  // reset all io-stats counter to zero
+  void Reset();
+
+  std::string ToString(bool exclude_zero_counters = false) const;
+
+  // the thread pool id
+  uint64_t thread_pool_id;
+
+  // number of bytes that has been written.
+  uint64_t bytes_written;
+  // number of bytes that has been read.
+  uint64_t bytes_read;
+
+  // time spent in open() and fopen().
+  uint64_t open_nanos;
+  // time spent in fallocate().
+  uint64_t allocate_nanos;
+  // time spent in write() and pwrite().
+  uint64_t write_nanos;
+  // time spent in read() and pread()
+  uint64_t read_nanos;
+  // time spent in sync_file_range().
+  uint64_t range_sync_nanos;
+  // time spent in fsync
+  uint64_t fsync_nanos;
+  // time spent in preparing write (fallocate etc).
+  uint64_t prepare_write_nanos;
+  // time spent in Logger::Logv().
+  uint64_t logger_nanos;
+  // CPU time spent in write() and pwrite()
+  uint64_t cpu_write_nanos;
+  // CPU time spent in read() and pread()
+  uint64_t cpu_read_nanos;
+
+  FileIOByTemperature file_io_stats_by_temperature;
+
+  // It is not consistent that whether iostats follows PerfLevel.Timer counters
+  // follows it but BackupEngine relies on counter metrics to always be there.
+  // Here we create a backdoor option to disable some counters, so that some
+  // existing stats are not polluted by file operations, such as logging, by
+  // turning this off.
+  bool disable_iostats = false;
+};
+
+// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global,
+// non-thread-local IOStatsContext object will be returned. Attempts to update
+// this object will be ignored, and reading from it will also be no-op.
+// Otherwise, a pointer to a thread-local IOStatsContext object will be
+// returned.
+//
+// This function never returns nullptr.
+IOStatsContext* get_iostats_context();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/iterator.h b/src/rocksdb/include/rocksdb/iterator.h
new file mode 100644
index 000000000..9d4c9f73a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/iterator.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface.  Multiple implementations
+// are provided by this library.  In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/cleanable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator : public Cleanable {
+ public:
+  Iterator() {}
+  // No copying allowed
+  Iterator(const Iterator&) = delete;
+  void operator=(const Iterator&) = delete;
+
+  virtual ~Iterator() {}
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  // Always returns false if !status().ok().
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that at or past target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  // All Seek*() methods clear any error status() that the iterator had prior to
+  // the call; after the seek, status() indicates only the error (if any) that
+  // happened during the seek, not any past errors.
+  // Target does not contain timestamp.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Position at the last key in the source that at or before target.
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or before target.
+  // Target does not contain timestamp.
+  virtual void SeekForPrev(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of the
+  // iterator (i.e. the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev
+  // operation).
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry.  If the entry is a plain key-value,
+  // return the value as-is; if it is a wide-column entity, return the value of
+  // the default anonymous column (see kDefaultWideColumnName) if any, or an
+  // empty value otherwise.  The underlying storage for the returned slice is
+  // valid only until the next modification of the iterator (i.e. the next
+  // SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation).
+  // REQUIRES: Valid()
+  virtual Slice value() const = 0;
+
+  // Return the wide columns for the current entry.  If the entry is a
+  // wide-column entity, return it as-is; if it is a plain key-value, return it
+  // as an entity with a single anonymous column (see kDefaultWideColumnName)
+  // which contains the value.  The underlying storage for the returned
+  // structure is valid only until the next modification of the iterator (i.e.
+  // the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operation).
+  // REQUIRES: Valid()
+  virtual const WideColumns& columns() const {
+    assert(false);
+    return kNoWideColumns;
+  }
+
+  // If an error has occurred, return it.  Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // If supported, renew the iterator to represent the latest state. The
+  // iterator will be invalidated after the call. Not supported if
+  // ReadOptions.snapshot is given when creating the iterator.
+  virtual Status Refresh() {
+    return Status::NotSupported("Refresh() is not supported");
+  }
+
+  // Property "rocksdb.iterator.is-key-pinned":
+  //   If returning "1", this means that the Slice returned by key() is valid
+  //   as long as the iterator is not deleted.
+  //   It is guaranteed to always return "1" if
+  //      - Iterator created with ReadOptions::pin_data = true
+  //      - DB tables were created with
+  //        BlockBasedTableOptions::use_delta_encoding = false.
+  // Property "rocksdb.iterator.super-version-number":
+  //   LSM version used by the iterator. The same format as DB Property
+  //   kCurrentSuperVersionNumber. See its comment for more information.
+  // Property "rocksdb.iterator.internal-key":
+  //   Get the user-key portion of the internal key at which the iteration
+  //   stopped.
+  virtual Status GetProperty(std::string prop_name, std::string* prop);
+
+  virtual Slice timestamp() const {
+    assert(false);
+    return Slice();
+  }
+};
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/ldb_tool.h b/src/rocksdb/include/rocksdb/ldb_tool.h
new file mode 100644
index 000000000..7408cbc87
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/ldb_tool.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An interface for converting a slice to a readable string
+class SliceFormatter {
+ public:
+  virtual ~SliceFormatter() {}
+  virtual std::string Format(const Slice& s) const = 0;
+};
+
+// Options for customizing ldb tool (beyond the DB Options)
+struct LDBOptions {
+  // Create LDBOptions with default values for all fields
+  LDBOptions();
+
+  // Key formatter that converts a slice to a readable string.
+  // Default: Slice::ToString()
+  std::shared_ptr<SliceFormatter> key_formatter;
+
+  std::string print_help_header = "ldb - RocksDB Tool";
+};
+
+class LDBTool {
+ public:
+  void Run(
+      int argc, char** argv, Options db_options = Options(),
+      const LDBOptions& ldb_options = LDBOptions(),
+      const std::vector<ColumnFamilyDescriptor>* column_families = nullptr);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/listener.h b/src/rocksdb/include/rocksdb/listener.h
new file mode 100644
index 000000000..8644fcf3f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/listener.h
@@ -0,0 +1,847 @@
+// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using TablePropertiesCollection =
+    std::unordered_map<std::string, std::shared_ptr<const TableProperties>>;
+
+class DB;
+class ColumnFamilyHandle;
+class Status;
+struct CompactionJobStats;
+
+struct FileCreationBriefInfo {
+  FileCreationBriefInfo() = default;
+  FileCreationBriefInfo(const std::string& _db_name,
+                        const std::string& _cf_name,
+                        const std::string& _file_path, int _job_id)
+      : db_name(_db_name),
+        cf_name(_cf_name),
+        file_path(_file_path),
+        job_id(_job_id) {}
+  // the name of the database where the file was created.
+  std::string db_name;
+  // the name of the column family where the file was created.
+  std::string cf_name;
+  // the path to the created file.
+  std::string file_path;
+  // the id of the job (which could be flush or compaction) that
+  // created the file.
+  int job_id = 0;
+};
+
+struct TableFileCreationBriefInfo : public FileCreationBriefInfo {
+  // reason of creating the table.
+  TableFileCreationReason reason;
+};
+
+struct TableFileCreationInfo : public TableFileCreationBriefInfo {
+  TableFileCreationInfo() = default;
+  explicit TableFileCreationInfo(TableProperties&& prop)
+      : table_properties(prop) {}
+  // the size of the file.
+  uint64_t file_size;
+  // Detailed properties of the created file.
+  TableProperties table_properties;
+  // The status indicating whether the creation was successful or not.
+  Status status;
+  // The checksum of the table file being created
+  std::string file_checksum;
+  // The checksum function name of checksum generator used for this table file
+  std::string file_checksum_func_name;
+};
+
+struct BlobFileCreationBriefInfo : public FileCreationBriefInfo {
+  BlobFileCreationBriefInfo(const std::string& _db_name,
+                            const std::string& _cf_name,
+                            const std::string& _file_path, int _job_id,
+                            BlobFileCreationReason _reason)
+      : FileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id),
+        reason(_reason) {}
+  // reason of creating the blob file.
+  BlobFileCreationReason reason;
+};
+
+struct BlobFileCreationInfo : public BlobFileCreationBriefInfo {
+  BlobFileCreationInfo(const std::string& _db_name, const std::string& _cf_name,
+                       const std::string& _file_path, int _job_id,
+                       BlobFileCreationReason _reason,
+                       uint64_t _total_blob_count, uint64_t _total_blob_bytes,
+                       Status _status, const std::string& _file_checksum,
+                       const std::string& _file_checksum_func_name)
+      : BlobFileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id,
+                                  _reason),
+        total_blob_count(_total_blob_count),
+        total_blob_bytes(_total_blob_bytes),
+        status(_status),
+        file_checksum(_file_checksum),
+        file_checksum_func_name(_file_checksum_func_name) {}
+
+  // the number of blob in a file.
+  uint64_t total_blob_count;
+  // the total bytes in a file.
+  uint64_t total_blob_bytes;
+  // The status indicating whether the creation was successful or not.
+  Status status;
+  // The checksum of the blob file being created.
+  std::string file_checksum;
+  // The checksum function name of checksum generator used for this blob file.
+  std::string file_checksum_func_name;
+};
+
+enum class CompactionReason : int {
+  kUnknown = 0,
+  // [Level] number of L0 files > level0_file_num_compaction_trigger
+  kLevelL0FilesNum,
+  // [Level] total size of level > MaxBytesForLevel()
+  kLevelMaxLevelSize,
+  // [Universal] Compacting for size amplification
+  kUniversalSizeAmplification,
+  // [Universal] Compacting for size ratio
+  kUniversalSizeRatio,
+  // [Universal] number of sorted runs > level0_file_num_compaction_trigger
+  kUniversalSortedRunNum,
+  // [FIFO] total size > max_table_files_size
+  kFIFOMaxSize,
+  // [FIFO] reduce number of files.
+  kFIFOReduceNumFiles,
+  // [FIFO] files with creation time < (current_time - interval)
+  kFIFOTtl,
+  // Manual compaction
+  kManualCompaction,
+  // DB::SuggestCompactRange() marked files for compaction
+  kFilesMarkedForCompaction,
+  // [Level] Automatic compaction within bottommost level to cleanup duplicate
+  // versions of same user key, usually due to a released snapshot.
+  kBottommostFiles,
+  // Compaction based on TTL
+  kTtl,
+  // According to the comments in flush_job.cc, RocksDB treats flush as
+  // a level 0 compaction in internal stats.
+  kFlush,
+  // Compaction caused by external sst file ingestion
+  kExternalSstIngestion,
+  // Compaction due to SST file being too old
+  kPeriodicCompaction,
+  // Compaction in order to move files to temperature
+  kChangeTemperature,
+  // Compaction scheduled to force garbage collection of blob files
+  kForcedBlobGC,
+  // A special TTL compaction for RoundRobin policy, which basically the same as
+  // kLevelMaxLevelSize, but the goal is to compact TTLed files.
+  kRoundRobinTtl,
+  // total number of compaction reasons, new reasons must be added above this.
+  kNumOfReasons,
+};
+
+enum class FlushReason : int {
+  kOthers = 0x00,
+  kGetLiveFiles = 0x01,
+  kShutDown = 0x02,
+  kExternalFileIngestion = 0x03,
+  kManualCompaction = 0x04,
+  kWriteBufferManager = 0x05,
+  kWriteBufferFull = 0x06,
+  kTest = 0x07,
+  kDeleteFiles = 0x08,
+  kAutoCompaction = 0x09,
+  kManualFlush = 0x0a,
+  kErrorRecovery = 0xb,
+  // When set the flush reason to kErrorRecoveryRetryFlush, SwitchMemtable
+  // will not be called to avoid many small immutable memtables.
+  kErrorRecoveryRetryFlush = 0xc,
+  kWalFull = 0xd,
+};
+
+// TODO: In the future, BackgroundErrorReason will only be used to indicate
+// why the BG Error is happening (e.g., flush, compaction). We may introduce
+// other data structure to indicate other essential information such as
+// the file type (e.g., Manifest, SST) and special context.
+enum class BackgroundErrorReason {
+  kFlush,
+  kCompaction,
+  kWriteCallback,
+  kMemTable,
+  kManifestWrite,
+  kFlushNoWAL,
+  kManifestWriteNoWAL,
+};
+
+enum class WriteStallCondition {
+  kNormal,
+  kDelayed,
+  kStopped,
+};
+
+struct WriteStallInfo {
+  // the name of the column family
+  std::string cf_name;
+  // state of the write controller
+  struct {
+    WriteStallCondition cur;
+    WriteStallCondition prev;
+  } condition;
+};
+
+#ifndef ROCKSDB_LITE
+
+struct FileDeletionInfo {
+  FileDeletionInfo() = default;
+
+  FileDeletionInfo(const std::string& _db_name, const std::string& _file_path,
+                   int _job_id, Status _status)
+      : db_name(_db_name),
+        file_path(_file_path),
+        job_id(_job_id),
+        status(_status) {}
+  // The name of the database where the file was deleted.
+  std::string db_name;
+  // The path to the deleted file.
+  std::string file_path;
+  // The id of the job which deleted the file.
+  int job_id = 0;
+  // The status indicating whether the deletion was successful or not.
+  Status status;
+};
+
+struct TableFileDeletionInfo : public FileDeletionInfo {};
+
+struct BlobFileDeletionInfo : public FileDeletionInfo {
+  BlobFileDeletionInfo(const std::string& _db_name,
+                       const std::string& _file_path, int _job_id,
+                       Status _status)
+      : FileDeletionInfo(_db_name, _file_path, _job_id, _status) {}
+};
+
+enum class FileOperationType {
+  kRead,
+  kWrite,
+  kTruncate,
+  kClose,
+  kFlush,
+  kSync,
+  kFsync,
+  kRangeSync,
+  kAppend,
+  kPositionedAppend,
+  kOpen
+};
+
+struct FileOperationInfo {
+  using Duration = std::chrono::nanoseconds;
+  using SteadyTimePoint =
+      std::chrono::time_point<std::chrono::steady_clock, Duration>;
+  using SystemTimePoint =
+      std::chrono::time_point<std::chrono::system_clock, Duration>;
+  using StartTimePoint = std::pair<SystemTimePoint, SteadyTimePoint>;
+  using FinishTimePoint = SteadyTimePoint;
+
+  FileOperationType type;
+  const std::string& path;
+  // Rocksdb try to provide file temperature information, but it's not
+  // guaranteed.
+  Temperature temperature;
+  uint64_t offset;
+  size_t length;
+  const Duration duration;
+  const SystemTimePoint& start_ts;
+  Status status;
+
+  FileOperationInfo(const FileOperationType _type, const std::string& _path,
+                    const StartTimePoint& _start_ts,
+                    const FinishTimePoint& _finish_ts, const Status& _status,
+                    const Temperature _temperature = Temperature::kUnknown)
+      : type(_type),
+        path(_path),
+        temperature(_temperature),
+        duration(std::chrono::duration_cast<std::chrono::nanoseconds>(
+            _finish_ts - _start_ts.second)),
+        start_ts(_start_ts.first),
+        status(_status) {}
+  static StartTimePoint StartNow() {
+    return std::make_pair<SystemTimePoint, SteadyTimePoint>(
+        std::chrono::system_clock::now(), std::chrono::steady_clock::now());
+  }
+  static FinishTimePoint FinishNow() {
+    return std::chrono::steady_clock::now();
+  }
+};
+
+struct BlobFileInfo {
+  BlobFileInfo(const std::string& _blob_file_path,
+               const uint64_t _blob_file_number)
+      : blob_file_path(_blob_file_path), blob_file_number(_blob_file_number) {}
+
+  std::string blob_file_path;
+  uint64_t blob_file_number;
+};
+
+struct BlobFileAdditionInfo : public BlobFileInfo {
+  BlobFileAdditionInfo(const std::string& _blob_file_path,
+                       const uint64_t _blob_file_number,
+                       const uint64_t _total_blob_count,
+                       const uint64_t _total_blob_bytes)
+      : BlobFileInfo(_blob_file_path, _blob_file_number),
+        total_blob_count(_total_blob_count),
+        total_blob_bytes(_total_blob_bytes) {}
+  uint64_t total_blob_count;
+  uint64_t total_blob_bytes;
+};
+
+struct BlobFileGarbageInfo : public BlobFileInfo {
+  BlobFileGarbageInfo(const std::string& _blob_file_path,
+                      const uint64_t _blob_file_number,
+                      const uint64_t _garbage_blob_count,
+                      const uint64_t _garbage_blob_bytes)
+      : BlobFileInfo(_blob_file_path, _blob_file_number),
+        garbage_blob_count(_garbage_blob_count),
+        garbage_blob_bytes(_garbage_blob_bytes) {}
+  uint64_t garbage_blob_count;
+  uint64_t garbage_blob_bytes;
+};
+
+struct FlushJobInfo {
+  // the id of the column family
+  uint32_t cf_id;
+  // the name of the column family
+  std::string cf_name;
+  // the path to the newly created file
+  std::string file_path;
+  // the file number of the newly created file
+  uint64_t file_number;
+  // the oldest blob file referenced by the newly created file
+  uint64_t oldest_blob_file_number;
+  // the id of the thread that completed this flush job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+  // If true, then rocksdb is currently slowing-down all writes to prevent
+  // creating too many Level 0 files as compaction seems not able to
+  // catch up the write request speed.  This indicates that there are
+  // too many files in Level 0.
+  bool triggered_writes_slowdown;
+  // If true, then rocksdb is currently blocking any writes to prevent
+  // creating more L0 files.  This indicates that there are too many
+  // files in level 0.  Compactions should try to compact L0 files down
+  // to lower levels as soon as possible.
+  bool triggered_writes_stop;
+  // The smallest sequence number in the newly created file
+  SequenceNumber smallest_seqno;
+  // The largest sequence number in the newly created file
+  SequenceNumber largest_seqno;
+  // Table properties of the table being flushed
+  TableProperties table_properties;
+
+  FlushReason flush_reason;
+
+  // Compression algorithm used for blob output files
+  CompressionType blob_compression_type;
+
+  // Information about blob files created during flush in Integrated BlobDB.
+  std::vector<BlobFileAdditionInfo> blob_file_addition_infos;
+};
+
+struct CompactionFileInfo {
+  // The level of the file.
+  int level;
+
+  // The file number of the file.
+  uint64_t file_number;
+
+  // The file number of the oldest blob file this SST file references.
+  uint64_t oldest_blob_file_number;
+};
+
+struct SubcompactionJobInfo {
+  ~SubcompactionJobInfo() { status.PermitUncheckedError(); }
+  // the id of the column family where the compaction happened.
+  uint32_t cf_id;
+  // the name of the column family where the compaction happened.
+  std::string cf_name;
+  // the status indicating whether the compaction was successful or not.
+  Status status;
+  // the id of the thread that completed this compaction job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+
+  // sub-compaction job id, which is only unique within the same compaction, so
+  // use both 'job_id' and 'subcompaction_job_id' to identify a subcompaction
+  // within an instance.
+  // For non subcompaction job, it's set to -1.
+  int subcompaction_job_id;
+  // the smallest input level of the compaction.
+  int base_input_level;
+  // the output level of the compaction.
+  int output_level;
+
+  // Reason to run the compaction
+  CompactionReason compaction_reason;
+
+  // Compression algorithm used for output files
+  CompressionType compression;
+
+  // Statistics and other additional details on the compaction
+  CompactionJobStats stats;
+
+  // Compression algorithm used for blob output files.
+  CompressionType blob_compression_type;
+};
+
+struct CompactionJobInfo {
+  ~CompactionJobInfo() { status.PermitUncheckedError(); }
+  // the id of the column family where the compaction happened.
+  uint32_t cf_id;
+  // the name of the column family where the compaction happened.
+  std::string cf_name;
+  // the status indicating whether the compaction was successful or not.
+  Status status;
+  // the id of the thread that completed this compaction job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+
+  // the smallest input level of the compaction.
+  int base_input_level;
+  // the output level of the compaction.
+  int output_level;
+
+  // The following variables contain information about compaction inputs
+  // and outputs. A file may appear in both the input and output lists
+  // if it was simply moved to a different level. The order of elements
+  // is the same across input_files and input_file_infos; similarly, it is
+  // the same across output_files and output_file_infos.
+
+  // The names of the compaction input files.
+  std::vector<std::string> input_files;
+
+  // Additional information about the compaction input files.
+  std::vector<CompactionFileInfo> input_file_infos;
+
+  // The names of the compaction output files.
+  std::vector<std::string> output_files;
+
+  // Additional information about the compaction output files.
+  std::vector<CompactionFileInfo> output_file_infos;
+
+  // Table properties for input and output tables.
+  // The map is keyed by values from input_files and output_files.
+  TablePropertiesCollection table_properties;
+
+  // Reason to run the compaction
+  CompactionReason compaction_reason;
+
+  // Compression algorithm used for output files
+  CompressionType compression;
+
+  // Statistics and other additional details on the compaction
+  CompactionJobStats stats;
+
+  // Compression algorithm used for blob output files.
+  CompressionType blob_compression_type;
+
+  // Information about blob files created during compaction in Integrated
+  // BlobDB.
+  std::vector<BlobFileAdditionInfo> blob_file_addition_infos;
+
+  // Information about blob files deleted during compaction in Integrated
+  // BlobDB.
+  std::vector<BlobFileGarbageInfo> blob_file_garbage_infos;
+};
+
+struct MemTableInfo {
+  // the name of the column family to which memtable belongs
+  std::string cf_name;
+  // Sequence number of the first element that was inserted
+  // into the memtable.
+  SequenceNumber first_seqno;
+  // Sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into this
+  // memtable. It can then be assumed that any write with a larger(or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  SequenceNumber earliest_seqno;
+  // Total number of entries in memtable
+  uint64_t num_entries;
+  // Total number of deletes in memtable
+  uint64_t num_deletes;
+};
+
+struct ExternalFileIngestionInfo {
+  // the name of the column family
+  std::string cf_name;
+  // Path of the file outside the DB
+  std::string external_file_path;
+  // Path of the file inside the DB
+  std::string internal_file_path;
+  // The global sequence number assigned to keys in this file
+  SequenceNumber global_seqno;
+  // Table properties of the table being flushed
+  TableProperties table_properties;
+};
+
+// Result of auto background error recovery
+struct BackgroundErrorRecoveryInfo {
+  // The original error that triggered the recovery
+  Status old_bg_error;
+
+  // The final bg_error after all recovery attempts. Status::OK() means
+  // the recovery was successful and the database is fully operational.
+  Status new_bg_error;
+};
+
+struct IOErrorInfo {
+  IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation,
+              const std::string& _file_path, size_t _length, uint64_t _offset)
+      : io_status(_io_status),
+        operation(_operation),
+        file_path(_file_path),
+        length(_length),
+        offset(_offset) {}
+
+  IOStatus io_status;
+  FileOperationType operation;
+  std::string file_path;
+  size_t length;
+  uint64_t offset;
+};
+
+// EventListener class contains a set of callback functions that will
+// be called when specific RocksDB event happens such as flush.  It can
+// be used as a building block for developing custom features such as
+// stats-collector or external compaction algorithm.
+//
+// IMPORTANT
+// Because compaction is needed to resolve a "writes stopped" condition,
+// calling or waiting for any blocking DB write function (no_slowdown=false)
+// from a compaction-related listener callback can hang RocksDB. For DB
+// writes from a callback we recommend a WriteBatch and no_slowdown=true,
+// because the WriteBatch can accumulate writes for later in case DB::Write
+// returns Status::Incomplete. Similarly, calling CompactRange or similar
+// could hang by waiting for a background worker that is occupied until the
+// callback returns.
+//
+// Otherwise, callback functions should not run for an extended period of
+// time before the function returns, because this will slow RocksDB.
+//
+// [Threading] All EventListener callback will be called using the
+// actual thread that involves in that specific event.   For example, it
+// is the RocksDB background flush thread that does the actual flush to
+// call EventListener::OnFlushCompleted().
+//
+// [Locking] All EventListener callbacks are designed to be called without
+// the current thread holding any DB mutex. This is to prevent potential
+// deadlock and performance issue when using EventListener callback
+// in a complex way.
+//
+// [Exceptions] Exceptions MUST NOT propagate out of overridden functions into
+// RocksDB, because RocksDB is not exception-safe. This could cause undefined
+// behavior including data loss, unreported corruption, deadlocks, and more.
+class EventListener : public Customizable {
+ public:
+  static const char* Type() { return "EventListener"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& id,
+                                 std::shared_ptr<EventListener>* result);
+  const char* Name() const override {
+    // Since EventListeners did not have a name previously, we will assume
+    // an empty name.  Instances should override this method.
+    return "";
+  }
+  // A callback function to RocksDB which will be called whenever a
+  // registered RocksDB flushes a file.  The default implementation is
+  // no-op.
+  //
+  // Note that the this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns.  Otherwise, RocksDB may be blocked.
+  virtual void OnFlushCompleted(DB* /*db*/,
+                                const FlushJobInfo& /*flush_job_info*/) {}
+
+  // A callback function to RocksDB which will be called before a
+  // RocksDB starts to flush memtables.  The default implementation is
+  // no-op.
+  //
+  // Note that the this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns.  Otherwise, RocksDB may be blocked.
+  virtual void OnFlushBegin(DB* /*db*/,
+                            const FlushJobInfo& /*flush_job_info*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a SST file is deleted.  Different from OnCompactionCompleted and
+  // OnFlushCompleted, this callback is designed for external logging
+  // service and thus only provide string parameters instead
+  // of a pointer to DB.  Applications that build logic basic based
+  // on file creations and deletions is suggested to implement
+  // OnFlushCompleted and OnCompactionCompleted.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from the
+  // returned value.
+  virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}
+
+  // A callback function to RocksDB which will be called before a
+  // RocksDB starts to compact.  The default implementation is
+  // no-op.
+  //
+  // Note that the this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns.  Otherwise, RocksDB may be blocked.
+  virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a registered RocksDB compacts a file. The default implementation
+  // is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns. Otherwise, RocksDB may be blocked.
+  //
+  // @param db a pointer to the rocksdb instance which just compacted
+  //   a file.
+  // @param ci a reference to a CompactionJobInfo struct. 'ci' is released
+  //  after this function is returned, and must be copied if it is needed
+  //  outside of this function.
+  virtual void OnCompactionCompleted(DB* /*db*/,
+                                     const CompactionJobInfo& /*ci*/) {}
+
+  // A callback function to RocksDB which will be called before a sub-compaction
+  // begins. If a compaction is split to 2 sub-compactions, it will trigger one
+  // `OnCompactionBegin()` first, then two `OnSubcompactionBegin()`.
+  // If compaction is not split, it will still trigger one
+  // `OnSubcompactionBegin()`, as internally, compaction is always handled by
+  // sub-compaction. The default implementation is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns.  Otherwise, RocksDB may be blocked.
+  //
+  // @param ci a reference to a CompactionJobInfo struct, it contains a
+  //  `sub_job_id` which is only unique within the specified compaction (which
+  //  can be identified by `job_id`). 'ci' is released after this function is
+  //  returned, and must be copied if it's needed outside this function.
+  //  Note: `table_properties` is not set for sub-compaction, the information
+  //  could be got from `OnCompactionBegin()`.
+  virtual void OnSubcompactionBegin(const SubcompactionJobInfo& /*si*/) {}
+
+  // A callback function to RocksDB which will be called whenever a
+  // sub-compaction completed. The same as `OnSubcompactionBegin()`, if a
+  // compaction is split to 2 sub-compactions, it will be triggered twice. If
+  // a compaction is not split, it will still be triggered once.
+  // The default implementation is a no-op.
+  //
+  // Note that this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns.  Otherwise, RocksDB may be blocked.
+  //
+  // @param ci a reference to a CompactionJobInfo struct, it contains a
+  //  `sub_job_id` which is only unique within the specified compaction (which
+  //  can be identified by `job_id`). 'ci' is released after this function is
+  //  returned, and must be copied if it's needed outside this function.
+  //  Note: `table_properties` is not set for sub-compaction, the information
+  //  could be got from `OnCompactionCompleted()`.
+  virtual void OnSubcompactionCompleted(const SubcompactionJobInfo& /*si*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a SST file is created.  Different from OnCompactionCompleted and
+  // OnFlushCompleted, this callback is designed for external logging
+  // service and thus only provide string parameters instead
+  // of a pointer to DB.  Applications that build logic basic based
+  // on file creations and deletions is suggested to implement
+  // OnFlushCompleted and OnCompactionCompleted.
+  //
+  // Historically it will only be called if the file is successfully created.
+  // Now it will also be called on failure case. User can check info.status
+  // to see if it succeeded or not.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned value.
+  virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before
+  // a SST file is being created. It will follow by OnTableFileCreated after
+  // the creation finishes.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned value.
+  virtual void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before
+  // a memtable is made immutable.
+  //
+  // Note that the this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns.  Otherwise, RocksDB may be blocked.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned value.
+  virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before
+  // a column family handle is deleted.
+  //
+  // Note that the this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns.  Otherwise, RocksDB may be blocked.
+  // @param handle is a pointer to the column family handle to be deleted
+  // which will become a dangling pointer after the deletion.
+  virtual void OnColumnFamilyHandleDeletionStarted(
+      ColumnFamilyHandle* /*handle*/) {}
+
+  // A callback function for RocksDB which will be called after an external
+  // file is ingested using IngestExternalFile.
+  //
+  // Note that the this function will run on the same thread as
+  // IngestExternalFile(), if this function is blocked, IngestExternalFile()
+  // will be blocked from finishing.
+  virtual void OnExternalFileIngested(
+      DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called before setting the
+  // background error status to a non-OK value. The new background error status
+  // is provided in `bg_error` and can be modified by the callback. E.g., a
+  // callback can suppress errors by resetting it to Status::OK(), thus
+  // preventing the database from entering read-only mode. We do not provide any
+  // guarantee when failed flushes/compactions will be rescheduled if the user
+  // suppresses an error.
+  //
+  // Note that this function can run on the same threads as flush, compaction,
+  // and user writes. So, it is extremely important not to perform heavy
+  // computations or blocking calls in this function.
+  virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
+                                 Status* /* bg_error */) {}
+
+  // A callback function for RocksDB which will be called whenever a change
+  // of superversion triggers a change of the stall conditions.
+  //
+  // Note that the this function must be implemented in a way such that
+  // it should not run for an extended period of time before the function
+  // returns.  Otherwise, RocksDB may be blocked.
+  virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever a file read
+  // operation finishes.
+  virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file write
+  // operation finishes.
+  virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file flush
+  // operation finishes.
+  virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file sync
+  // operation finishes.
+  virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file
+  // rangeSync operation finishes.
+  virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file
+  // truncate operation finishes.
+  virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {}
+
+  // A callback function for RocksDB which will be called whenever a file close
+  // operation finishes.
+  virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {}
+
+  // If true, the OnFile*Finish functions will be called. If
+  // false, then they won't be called.
+  virtual bool ShouldBeNotifiedOnFileIO() { return false; }
+
+  // A callback function for RocksDB which will be called just before
+  // starting the automatic recovery process for recoverable background
+  // errors, such as NoSpace(). The callback can suppress the automatic
+  // recovery by setting *auto_recovery to false. The database will then
+  // have to be transitioned out of read-only mode by calling DB::Resume()
+  virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
+                                    Status /* bg_error */,
+                                    bool* /* auto_recovery */) {}
+
+  // DEPRECATED
+  // A callback function for RocksDB which will be called once the database
+  // is recovered from read-only mode after an error. When this is called, it
+  // means normal writes to the database can be issued and the user can
+  // initiate any further recovery actions needed
+  virtual void OnErrorRecoveryCompleted(Status old_bg_error) {
+    old_bg_error.PermitUncheckedError();
+  }
+
+  // A callback function for RocksDB which will be called once the recovery
+  // attempt from a background retryable error is completed. The recovery
+  // may have been successful or not. In either case, the callback is called
+  // with the old and new error. If info.new_bg_error is Status::OK(), that
+  // means the recovery succeeded.
+  virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) {
+  }
+
+  // A callback function for RocksDB which will be called before
+  // a blob file is being created. It will follow by OnBlobFileCreated after
+  // the creation finishes.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned value.
+  virtual void OnBlobFileCreationStarted(
+      const BlobFileCreationBriefInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a blob file is created.
+  // It will be called whether the file is successfully created or not. User can
+  // check info.status to see if it succeeded or not.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned value.
+  virtual void OnBlobFileCreated(const BlobFileCreationInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever
+  // a blob file is deleted.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from these
+  // returned value.
+  virtual void OnBlobFileDeleted(const BlobFileDeletionInfo& /*info*/) {}
+
+  // A callback function for RocksDB which will be called whenever an IO error
+  // happens. ShouldBeNotifiedOnFileIO should be set to true to get a callback.
+  virtual void OnIOError(const IOErrorInfo& /*info*/) {}
+
+  ~EventListener() override {}
+};
+
+#else
+
+class EventListener {};
+struct FlushJobInfo {};
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/memory_allocator.h b/src/rocksdb/include/rocksdb/memory_allocator.h
new file mode 100644
index 000000000..5cb799e42
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/memory_allocator.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// MemoryAllocator is an interface that a client can implement to supply custom
+// memory allocation and deallocation methods. See rocksdb/cache.h for more
+// information.
+// All methods should be thread-safe.
+class MemoryAllocator : public Customizable {
+ public:
+  static const char* Type() { return "MemoryAllocator"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value,
+                                 std::shared_ptr<MemoryAllocator>* result);
+
+  // Allocate a block of at least size. Has to be thread-safe.
+  virtual void* Allocate(size_t size) = 0;
+
+  // Deallocate previously allocated block. Has to be thread-safe.
+  virtual void Deallocate(void* p) = 0;
+
+  // Returns the memory size of the block allocated at p. The default
+  // implementation that just returns the original allocation_size is fine.
+  virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const {
+    // default implementation just returns the allocation size
+    return allocation_size;
+  }
+
+  std::string GetId() const override { return GenerateIndividualId(); }
+};
+
+struct JemallocAllocatorOptions {
+  static const char* kName() { return "JemallocAllocatorOptions"; }
+  // Jemalloc tcache cache allocations by size class. For each size class,
+  // it caches between 20 (for large size classes) to 200 (for small size
+  // classes). To reduce tcache memory usage in case the allocator is access
+  // by large number of threads, we can control whether to cache an allocation
+  // by its size.
+  bool limit_tcache_size = false;
+
+  // Lower bound of allocation size to use tcache, if limit_tcache_size=true.
+  // When used with block cache, it is recommended to set it to block_size/4.
+  size_t tcache_size_lower_bound = 1024;
+
+  // Upper bound of allocation size to use tcache, if limit_tcache_size=true.
+  // When used with block cache, it is recommended to set it to block_size.
+  size_t tcache_size_upper_bound = 16 * 1024;
+};
+
+// Generate memory allocator which allocates through Jemalloc and utilize
+// MADV_DONTDUMP through madvise to exclude cache items from core dump.
+// Applications can use the allocator with block cache to exclude block cache
+// usage from core dump.
+//
+// Implementation details:
+// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all
+// allocations of the JemallocNodumpAllocator are through the same arena.
+// The memory allocator hooks memory allocation of the arena, and calls
+// madvise() with MADV_DONTDUMP flag to exclude the piece of memory from
+// core dump. Side benefit of using single arena would be reduction of jemalloc
+// metadata for some workloads.
+//
+// To mitigate mutex contention for using one single arena, jemalloc tcache
+// (thread-local cache) is enabled to cache unused allocations for future use.
+// The tcache normally incurs 0.5M extra memory usage per-thread. The usage
+// can be reduced by limiting allocation sizes to cache.
+extern Status NewJemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::shared_ptr<MemoryAllocator>* memory_allocator);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/memtablerep.h b/src/rocksdb/include/rocksdb/memtablerep.h
new file mode 100644
index 000000000..cb5444dca
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/memtablerep.h
@@ -0,0 +1,423 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains the interface that must be implemented by any collection
+// to be used as the backing store for a MemTable. Such a collection must
+// satisfy the following properties:
+//  (1) It does not store duplicate items.
+//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
+//     equality.
+//  (3) It can be accessed concurrently by multiple readers and can support
+//     during reads. However, it needn't support multiple concurrent writes.
+//  (4) Items are never deleted.
+// The liberal use of assertions is encouraged to enforce (1).
+//
+// The factory will be passed an MemTableAllocator object when a new MemTableRep
+// is requested.
+//
+// Users can implement their own memtable representations. We include three
+// types built in:
+//  - SkipListRep: This is the default; it is backed by a skip list.
+//  - HashSkipListRep: The memtable rep that is best used for keys that are
+//  structured like "prefix:suffix" where iteration within a prefix is
+//  common and iteration across different prefixes is rare. It is backed by
+//  a hash map where each bucket is a skip list.
+//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
+// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
+// has been called, the vector will only be sorted once. It is optimized for
+// random-write-heavy workloads.
+//
+// The last four implementations are designed for situations in which
+// iteration over the entire collection is rare since doing so requires all the
+// keys to be copied into a sorted data structure.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <stdexcept>
+#include <unordered_set>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class Allocator;
+class LookupKey;
+class SliceTransform;
+class Logger;
+struct DBOptions;
+
+using KeyHandle = void*;
+
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+class MemTableRep {
+ public:
+  // KeyComparator provides a means to compare keys, which are internal keys
+  // concatenated with values.
+  class KeyComparator {
+   public:
+    using DecodedType = ROCKSDB_NAMESPACE::Slice;
+
+    virtual DecodedType decode_key(const char* key) const {
+      // The format of key is frozen and can be treated as a part of the API
+      // contract. Refer to MemTable::Add for details.
+      return GetLengthPrefixedSlice(key);
+    }
+
+    // Compare a and b. Return a negative value if a is less than b, 0 if they
+    // are equal, and a positive value if a is greater than b
+    virtual int operator()(const char* prefix_len_key1,
+                           const char* prefix_len_key2) const = 0;
+
+    virtual int operator()(const char* prefix_len_key,
+                           const Slice& key) const = 0;
+
+    virtual ~KeyComparator() {}
+  };
+
+  explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}
+
+  // Allocate a buf of len size for storing key. The idea is that a
+  // specific memtable representation knows its underlying data structure
+  // better. By allowing it to allocate memory, it can possibly put
+  // correlated stuff in consecutive memory area to make processor
+  // prefetching more efficient.
+  virtual KeyHandle Allocate(const size_t len, char** buf);
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert).
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection, and no concurrent modifications to the table in progress
+  virtual void Insert(KeyHandle handle) = 0;
+
+  // Same as ::Insert
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  virtual bool InsertKey(KeyHandle handle) {
+    Insert(handle);
+    return true;
+  }
+
+  // Same as Insert(), but in additional pass a hint to insert location for
+  // the key. If hint points to nullptr, a new hint will be populated.
+  // otherwise the hint will be updated to reflect the last insert location.
+  //
+  // Currently only skip-list based memtable implement the interface. Other
+  // implementations will fallback to Insert() by default.
+  virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) {
+    // Ignore the hint by default.
+    Insert(handle);
+  }
+
+  // Same as ::InsertWithHint
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) {
+    InsertWithHint(handle, hint);
+    return true;
+  }
+
+  // Same as ::InsertWithHint, but allow concurrent write
+  //
+  // If hint points to nullptr, a new hint will be allocated on heap, otherwise
+  // the hint will be updated to reflect the last insert location. The hint is
+  // owned by the caller and it is the caller's responsibility to delete the
+  // hint later.
+  //
+  // Currently only skip-list based memtable implement the interface. Other
+  // implementations will fallback to InsertConcurrently() by default.
+  virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) {
+    // Ignore the hint by default.
+    InsertConcurrently(handle);
+  }
+
+  // Same as ::InsertWithHintConcurrently
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) {
+    InsertWithHintConcurrently(handle, hint);
+    return true;
+  }
+
+  // Like Insert(handle), but may be called concurrent with other calls
+  // to InsertConcurrently for other handles.
+  //
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  virtual void InsertConcurrently(KeyHandle handle);
+
+  // Same as ::InsertConcurrently
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  virtual bool InsertKeyConcurrently(KeyHandle handle) {
+    InsertConcurrently(handle);
+    return true;
+  }
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  virtual bool Contains(const char* key) const = 0;
+
+  // Notify this table rep that it will no longer be added to. By default,
+  // does nothing.  After MarkReadOnly() is called, this table rep will
+  // not be written to (ie No more calls to Allocate(), Insert(),
+  // or any writes done directly to entries accessed through the iterator.)
+  virtual void MarkReadOnly() {}
+
+  // Notify this table rep that it has been flushed to stable storage.
+  // By default, does nothing.
+  //
+  // Invariant: MarkReadOnly() is called, before MarkFlushed().
+  // Note that this method if overridden, should not run for an extended period
+  // of time. Otherwise, RocksDB may be blocked.
+  virtual void MarkFlushed() {}
+
+  // Look up key from the mem table, since the first key in the mem table whose
+  // user_key matches the one given k, call the function callback_func(), with
+  // callback_args directly forwarded as the first parameter, and the mem table
+  // key as the second parameter. If the return value is false, then terminates.
+  // Otherwise, go through the next key.
+  //
+  // It's safe for Get() to terminate after having finished all the potential
+  // key for the k.user_key(), or not.
+  //
+  // Default:
+  // Get() function with a default value of dynamically construct an iterator,
+  // seek and call the call back function.
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg, const char* entry));
+
+  virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
+                                         const Slice& /*end_key*/) {
+    return 0;
+  }
+
+  // Returns a vector of unique random memtable entries of approximate
+  // size 'target_sample_size' (this size is not strictly enforced).
+  virtual void UniqueRandomSample(const uint64_t num_entries,
+                                  const uint64_t target_sample_size,
+                                  std::unordered_set<const char*>* entries) {
+    (void)num_entries;
+    (void)target_sample_size;
+    (void)entries;
+    assert(false);
+  }
+
+  // Report an approximation of how much memory has been used other than memory
+  // that was allocated through the allocator.  Safe to call from any thread.
+  virtual size_t ApproximateMemoryUsage() = 0;
+
+  virtual ~MemTableRep() {}
+
+  // Iteration over the contents of a skip collection
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    virtual ~Iterator() {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    virtual bool Valid() const = 0;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    virtual const char* key() const = 0;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    virtual void Next() = 0;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    virtual void Prev() = 0;
+
+    // Advance to the first entry with a key >= target
+    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
+
+    // retreat to the first entry with a key <= target
+    virtual void SeekForPrev(const Slice& internal_key,
+                             const char* memtable_key) = 0;
+
+    virtual void RandomSeek() {}
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToFirst() = 0;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    virtual void SeekToLast() = 0;
+  };
+
+  // Return an iterator over the keys in this representation.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
+
+  // Return an iterator that has a special Seek semantics. The result of
+  // a Seek might only include keys with the same prefix as the target key.
+  // arena: If not null, the arena is used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
+    return GetIterator(arena);
+  }
+
+  // Return true if the current MemTableRep supports merge operator.
+  // Default: true
+  virtual bool IsMergeOperatorSupported() const { return true; }
+
+  // Return true if the current MemTableRep supports snapshot
+  // Default: true
+  virtual bool IsSnapshotSupported() const { return true; }
+
+ protected:
+  // When *key is an internal key concatenated with the value, returns the
+  // user key.
+  virtual Slice UserKey(const char* key) const;
+
+  Allocator* allocator_;
+};
+
+// This is the base class for all factories that are used by RocksDB to create
+// new MemTableRep objects
+class MemTableRepFactory : public Customizable {
+ public:
+  ~MemTableRepFactory() override {}
+
+  static const char* Type() { return "MemTableRepFactory"; }
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::unique_ptr<MemTableRepFactory>* factory);
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<MemTableRepFactory>* factory);
+
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) = 0;
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
+      const SliceTransform* slice_transform, Logger* logger,
+      uint32_t /* column_family_id */) {
+    return CreateMemTableRep(key_cmp, allocator, slice_transform, logger);
+  }
+
+  const char* Name() const override = 0;
+
+  // Return true if the current MemTableRep supports concurrent inserts
+  // Default: false
+  virtual bool IsInsertConcurrentlySupported() const { return false; }
+
+  // Return true if the current MemTableRep supports detecting duplicate
+  // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns
+  // false when if the <key,seq> already exists.
+  // Default: false
+  virtual bool CanHandleDuplicatedKey() const { return false; }
+};
+
+// This uses a skip list to store keys. It is the default.
+//
+// Parameters:
+//   lookahead: If non-zero, each iterator's seek operation will start the
+//     search from the previously visited record (doing at most 'lookahead'
+//     steps). This is an optimization for the access pattern including many
+//     seeks with consecutive keys.
+class SkipListFactory : public MemTableRepFactory {
+ public:
+  explicit SkipListFactory(size_t lookahead = 0);
+
+  // Methods for Configurable/Customizable class overrides
+  static const char* kClassName() { return "SkipListFactory"; }
+  static const char* kNickName() { return "skip_list"; }
+  virtual const char* Name() const override { return kClassName(); }
+  virtual const char* NickName() const override { return kNickName(); }
+  std::string GetId() const override;
+
+  // Methods for MemTableRepFactory class overrides
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) override;
+
+  bool IsInsertConcurrentlySupported() const override { return true; }
+
+  bool CanHandleDuplicatedKey() const override { return true; }
+
+ private:
+  size_t lookahead_;
+};
+
+#ifndef ROCKSDB_LITE
+// This creates MemTableReps that are backed by an std::vector. On iteration,
+// the vector is sorted. This is useful for workloads where iteration is very
+// rare and writes are generally not issued after reads begin.
+//
+// Parameters:
+//   count: Passed to the constructor of the underlying std::vector of each
+//     VectorRep. On initialization, the underlying array will be at least count
+//     bytes reserved for usage.
+class VectorRepFactory : public MemTableRepFactory {
+  size_t count_;
+
+ public:
+  explicit VectorRepFactory(size_t count = 0);
+
+  // Methods for Configurable/Customizable class overrides
+  static const char* kClassName() { return "VectorRepFactory"; }
+  static const char* kNickName() { return "vector"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+
+  // Methods for MemTableRepFactory class overrides
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
+                                         Allocator*, const SliceTransform*,
+                                         Logger* logger) override;
+};
+
+// This class contains a fixed array of buckets, each
+// pointing to a skiplist (null if the bucket is empty).
+// bucket_count: number of fixed array buckets
+// skiplist_height: the max height of the skiplist
+// skiplist_branching_factor: probabilistic size ratio between adjacent
+//                            link lists in the skiplist
+extern MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
+    int32_t skiplist_branching_factor = 4);
+
+// The factory is to create memtables based on a hash table:
+// it contains a fixed array of buckets, each pointing to either a linked list
+// or a skip list if number of entries inside the bucket exceeds
+// threshold_use_skiplist.
+// @bucket_count: number of fixed array buckets
+// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
+//                      Otherwise from huge page TLB. The user needs to reserve
+//                      huge pages for it to be allocated, like:
+//                          sysctl -w vm.nr_hugepages=20
+//                      See linux doc Documentation/vm/hugetlbpage.txt
+// @bucket_entries_logging_threshold: if number of entries in one bucket
+//                                    exceeds this number, log about it.
+// @if_log_bucket_dist_when_flash: if true, log distribution of number of
+//                                 entries when flushing.
+// @threshold_use_skiplist: a bucket switches to skip list if number of
+//                          entries exceed this parameter.
+extern MemTableRepFactory* NewHashLinkListRepFactory(
+    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
+    int bucket_entries_logging_threshold = 4096,
+    bool if_log_bucket_dist_when_flash = true,
+    uint32_t threshold_use_skiplist = 256);
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/merge_operator.h b/src/rocksdb/include/rocksdb/merge_operator.h
new file mode 100644
index 000000000..ae795220b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/merge_operator.h
@@ -0,0 +1,265 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Logger;
+
+// The Merge Operator
+//
+// Essentially, a MergeOperator specifies the SEMANTICS of a merge, which only
+// client knows. It could be numeric addition, list append, string
+// concatenation, edit data structure, ... , anything.
+// The library, on the other hand, is concerned with the exercise of this
+// interface, at the right time (during get, iteration, compaction...)
+//
+// To use merge, the client needs to provide an object implementing one of
+// the following interfaces:
+//  a) AssociativeMergeOperator - for most simple semantics (always take
+//    two values, and merge them into one value, which is then put back
+//    into rocksdb); numeric addition and string concatenation are examples;
+//
+//  b) MergeOperator - the generic class for all the more abstract / complex
+//    operations; one method (FullMergeV2) to merge a Put/Delete value with a
+//    merge operand; and another method (PartialMerge) that merges multiple
+//    operands together. this is especially useful if your key values have
+//    complex structures but you would still like to support client-specific
+//    incremental updates.
+//
+// AssociativeMergeOperator is simpler to implement. MergeOperator is simply
+// more powerful.
+//
+// Refer to rocksdb-merge wiki for more details and example implementations.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class MergeOperator : public Customizable {
+ public:
+  virtual ~MergeOperator() {}
+  static const char* Type() { return "MergeOperator"; }
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& id,
+                                 std::shared_ptr<MergeOperator>* result);
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:      (IN)    The key that's associated with this merge operation.
+  //                   Client could multiplex the merge operator based on it
+  //                   if the key space is partitioned and different subspaces
+  //                   refer to different types of data which have different
+  //                   merge operation semantics
+  // existing: (IN)    null indicates that the key does not exist before this op
+  // operand_list:(IN) the sequence of merge operations to apply, front() first.
+  // new_value:(OUT)   Client is responsible for filling the merge result here.
+  // The string that new_value is pointing to will be empty.
+  // logger:   (IN)    Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. This will be treated as an error by the library.
+  //
+  // Also make use of the *logger for error messages.
+  virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+                         const std::deque<std::string>& /*operand_list*/,
+                         std::string* /*new_value*/, Logger* /*logger*/) const {
+    // deprecated, please use FullMergeV2()
+    assert(false);
+    return false;
+  }
+
+  struct MergeOperationInput {
+    // If user-defined timestamp is enabled, `_key` includes timestamp.
+    explicit MergeOperationInput(const Slice& _key,
+                                 const Slice* _existing_value,
+                                 const std::vector<Slice>& _operand_list,
+                                 Logger* _logger)
+        : key(_key),
+          existing_value(_existing_value),
+          operand_list(_operand_list),
+          logger(_logger) {}
+
+    // The key associated with the merge operation.
+    const Slice& key;
+    // The existing value of the current key, nullptr means that the
+    // value doesn't exist.
+    const Slice* existing_value;
+    // A list of operands to apply.
+    const std::vector<Slice>& operand_list;
+    // Logger could be used by client to log any errors that happen during
+    // the merge operation.
+    Logger* logger;
+  };
+
+  struct MergeOperationOutput {
+    explicit MergeOperationOutput(std::string& _new_value,
+                                  Slice& _existing_operand)
+        : new_value(_new_value), existing_operand(_existing_operand) {}
+
+    // Client is responsible for filling the merge result here.
+    std::string& new_value;
+    // If the merge result is one of the existing operands (or existing_value),
+    // client can set this field to the operand (or existing_value) instead of
+    // using new_value.
+    Slice& existing_operand;
+  };
+
+  // This function applies a stack of merge operands in chronological order
+  // on top of an existing value. There are two ways in which this method is
+  // being used:
+  // a) During Get() operation, it used to calculate the final value of a key
+  // b) During compaction, in order to collapse some operands with the based
+  //    value.
+  //
+  // Note: The name of the method is somewhat misleading, as both in the cases
+  // of Get() or compaction it may be called on a subset of operands:
+  // K:    0    +1    +2    +7    +4     +5      2     +1     +2
+  //                              ^
+  //                              |
+  //                          snapshot
+  // In the example above, Get(K) operation will call FullMerge with a base
+  // value of 2 and operands [+1, +2]. Compaction process might decide to
+  // collapse the beginning of the history up to the snapshot by performing
+  // full Merge with base value of 0 and operands [+1, +2, +7, +4].
+  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const;
+
+  // This function performs merge(left_op, right_op)
+  // when both the operands are themselves merge operation types
+  // that you would have passed to a DB::Merge() call in the same order
+  // (i.e.: DB::Merge(key,left_op), followed by DB::Merge(key,right_op)).
+  //
+  // PartialMerge should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.
+  // *new_value should be constructed such that a call to
+  // DB::Merge(key, *new_value) would yield the same result as a call
+  // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
+  //
+  // The string that new_value is pointing to will be empty.
+  //
+  // The default implementation of PartialMergeMulti will use this function
+  // as a helper, for backward compatibility.  Any successor class of
+  // MergeOperator should either implement PartialMerge or PartialMergeMulti,
+  // although implementing PartialMergeMulti is suggested as it is in general
+  // more effective to merge multiple operands at a time instead of two
+  // operands at a time.
+  //
+  // If it is impossible or infeasible to combine the two operations,
+  // leave new_value unchanged and return false. The library will
+  // internally keep track of the operations, and apply them in the
+  // correct order once a base-value (a Put/Delete/End-of-Database) is seen.
+  //
+  // TODO: Presently there is no way to differentiate between error/corruption
+  // and simply "return false". For now, the client should simply return
+  // false in any case it cannot perform partial-merge, regardless of reason.
+  // If there is corruption in the data, handle it in the FullMergeV2() function
+  // and return false there.  The default implementation of PartialMerge will
+  // always return false.
+  virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/,
+                            const Slice& /*right_operand*/,
+                            std::string* /*new_value*/,
+                            Logger* /*logger*/) const {
+    return false;
+  }
+
+  // This function performs merge when all the operands are themselves merge
+  // operation types that you would have passed to a DB::Merge() call in the
+  // same order (front() first)
+  // (i.e. DB::Merge(key, operand_list[0]), followed by
+  //  DB::Merge(key, operand_list[1]), ...)
+  //
+  // PartialMergeMulti should combine them into a single merge operation that is
+  // saved into *new_value, and then it should return true.  *new_value should
+  // be constructed such that a call to DB::Merge(key, *new_value) would yield
+  // the same result as sequential individual calls to DB::Merge(key, operand)
+  // for each operand in operand_list from front() to back().
+  //
+  // The string that new_value is pointing to will be empty.
+  //
+  // The PartialMergeMulti function will be called when there are at least two
+  // operands.
+  //
+  // In the default implementation, PartialMergeMulti will invoke PartialMerge
+  // multiple times, where each time it only merges two operands.  Developers
+  // should either implement PartialMergeMulti, or implement PartialMerge which
+  // is served as the helper function of the default PartialMergeMulti.
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value, Logger* logger) const;
+
+  // The name of the MergeOperator. Used to check for MergeOperator
+  // mismatches (i.e., a DB created with one MergeOperator is
+  // accessed using a different MergeOperator)
+  // TODO: the name is currently not stored persistently and thus
+  //       no checking is enforced. Client is responsible for providing
+  //       consistent MergeOperator between DB opens.
+  virtual const char* Name() const override = 0;
+
+  // Determines whether the PartialMerge can be called with just a single
+  // merge operand.
+  // Override and return true for allowing a single operand. PartialMerge
+  // and PartialMergeMulti should be overridden and implemented
+  // correctly to properly handle a single operand.
+  virtual bool AllowSingleOperand() const { return false; }
+
+  // Allows to control when to invoke a full merge during Get.
+  // This could be used to limit the number of merge operands that are looked at
+  // during a point lookup, thereby helping in limiting the number of levels to
+  // read from.
+  // Doesn't help with iterators.
+  //
+  // Note: the merge operands are passed to this function in the reversed order
+  // relative to how they were merged (passed to FullMerge or FullMergeV2)
+  // for performance reasons, see also:
+  // https://github.com/facebook/rocksdb/issues/3865
+  virtual bool ShouldMerge(const std::vector<Slice>& /*operands*/) const {
+    return false;
+  }
+};
+
+// The simpler, associative merge operator.
+class AssociativeMergeOperator : public MergeOperator {
+ public:
+  ~AssociativeMergeOperator() override {}
+
+  // Gives the client a way to express the read -> modify -> write semantics
+  // key:           (IN) The key that's associated with this merge operation.
+  // existing_value:(IN) null indicates the key does not exist before this op
+  // value:         (IN) the value to update/merge the existing_value with
+  // new_value:    (OUT) Client is responsible for filling the merge result
+  // here. The string that new_value is pointing to will be empty.
+  // logger:        (IN) Client could use this to log errors during merge.
+  //
+  // Return true on success.
+  // All values passed in will be client-specific values. So if this method
+  // returns false, it is because client specified bad data or there was
+  // internal corruption. The client should assume that this will be treated
+  // as an error by the library.
+  virtual bool Merge(const Slice& key, const Slice* existing_value,
+                     const Slice& value, std::string* new_value,
+                     Logger* logger) const = 0;
+
+ private:
+  // Default implementations of the MergeOperator functions
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override;
+
+  bool PartialMerge(const Slice& key, const Slice& left_operand,
+                    const Slice& right_operand, std::string* new_value,
+                    Logger* logger) const override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/metadata.h b/src/rocksdb/include/rocksdb/metadata.h
new file mode 100644
index 000000000..0cdffcd5f
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/metadata.h
@@ -0,0 +1,245 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Basic identifiers and metadata for a file in a DB. This only includes
+// information considered relevant for taking backups, checkpoints, or other
+// services relating to DB file storage.
+// This is only appropriate for immutable files, such as SST files or all
+// files in a backup. See also LiveFileStorageInfo.
+struct FileStorageInfo {
+  // The name of the file within its directory (e.g. "123456.sst")
+  std::string relative_filename;
+  // The directory containing the file, without a trailing '/'. This could be
+  // a DB path, wal_dir, etc.
+  std::string directory;
+
+  // The id of the file within a single DB. Set to 0 if the file does not have
+  // a number (e.g. CURRENT)
+  uint64_t file_number = 0;
+  // The type of the file as part of a DB.
+  FileType file_type = kTempFile;
+
+  // File size in bytes. See also `trim_to_size`.
+  uint64_t size = 0;
+
+  // This feature is experimental and subject to change.
+  Temperature temperature = Temperature::kUnknown;
+
+  // The checksum of a SST file, the value is decided by the file content and
+  // the checksum algorithm used for this SST file. The checksum function is
+  // identified by the file_checksum_func_name. If the checksum function is
+  // not specified, file_checksum is "0" by default.
+  std::string file_checksum;
+
+  // The name of the checksum function used to generate the file checksum
+  // value. If file checksum is not enabled (e.g., sst_file_checksum_func is
+  // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is
+  // "Unknown".
+  std::string file_checksum_func_name;
+};
+
+// Adds to FileStorageInfo the ability to capture the state of files that
+// might change in a running DB.
+struct LiveFileStorageInfo : public FileStorageInfo {
+  // If non-empty, this string represents the "saved" contents of the file
+  // for the current context. (This field is used for checkpointing CURRENT
+  // file.) In that case, size == replacement_contents.size() and file on disk
+  // should be ignored. If empty string, the file on disk should still have
+  // "saved" contents. (See trim_to_size.)
+  std::string replacement_contents;
+
+  // If true, the file on disk is allowed to be larger than `size` but only
+  // the first `size` bytes should be used for the current context. If false,
+  // the file is corrupt if size on disk does not equal `size`.
+  bool trim_to_size = false;
+};
+
+// The metadata that describes an SST file. (Does not need to extend
+// LiveFileStorageInfo because SST files are always immutable.)
+struct SstFileMetaData : public FileStorageInfo {
+  SstFileMetaData() { file_type = kTableFile; }
+
+  SstFileMetaData(const std::string& _file_name, uint64_t _file_number,
+                  const std::string& _directory, uint64_t _size,
+                  SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno,
+                  const std::string& _smallestkey,
+                  const std::string& _largestkey, uint64_t _num_reads_sampled,
+                  bool _being_compacted, Temperature _temperature,
+                  uint64_t _oldest_blob_file_number,
+                  uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+                  std::string& _file_checksum,
+                  std::string& _file_checksum_func_name)
+      : smallest_seqno(_smallest_seqno),
+        largest_seqno(_largest_seqno),
+        smallestkey(_smallestkey),
+        largestkey(_largestkey),
+        num_reads_sampled(_num_reads_sampled),
+        being_compacted(_being_compacted),
+        num_entries(0),
+        num_deletions(0),
+        oldest_blob_file_number(_oldest_blob_file_number),
+        oldest_ancester_time(_oldest_ancester_time),
+        file_creation_time(_file_creation_time) {
+    if (!_file_name.empty()) {
+      if (_file_name[0] == '/') {
+        relative_filename = _file_name.substr(1);
+        name = _file_name;  // Deprecated field
+      } else {
+        relative_filename = _file_name;
+        name = std::string("/") + _file_name;  // Deprecated field
+      }
+      assert(relative_filename.size() + 1 == name.size());
+      assert(relative_filename[0] != '/');
+      assert(name[0] == '/');
+    }
+    directory = _directory;
+    db_path = _directory;  // Deprecated field
+    file_number = _file_number;
+    file_type = kTableFile;
+    size = _size;
+    temperature = _temperature;
+    file_checksum = _file_checksum;
+    file_checksum_func_name = _file_checksum_func_name;
+  }
+
+  SequenceNumber smallest_seqno = 0;  // Smallest sequence number in file.
+  SequenceNumber largest_seqno = 0;   // Largest sequence number in file.
+  std::string smallestkey;            // Smallest user defined key in the file.
+  std::string largestkey;             // Largest user defined key in the file.
+  uint64_t num_reads_sampled = 0;     // How many times the file is read.
+  bool being_compacted =
+      false;  // true if the file is currently being compacted.
+
+  uint64_t num_entries = 0;
+  uint64_t num_deletions = 0;
+
+  uint64_t oldest_blob_file_number = 0;  // The id of the oldest blob file
+                                         // referenced by the file.
+  // An SST file may be generated by compactions whose input files may
+  // in turn be generated by earlier compactions. The creation time of the
+  // oldest SST file that is the compaction ancestor of this file.
+  // The timestamp is provided SystemClock::GetCurrentTime().
+  // 0 if the information is not available.
+  //
+  // Note: for TTL blob files, it contains the start of the expiration range.
+  uint64_t oldest_ancester_time = 0;
+  // Timestamp when the SST file is created, provided by
+  // SystemClock::GetCurrentTime(). 0 if the information is not available.
+  uint64_t file_creation_time = 0;
+
+  // DEPRECATED: The name of the file within its directory with a
+  // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct
+  // instead.
+  std::string name;
+
+  // DEPRECATED: replaced by `directory` in base struct
+  std::string db_path;
+};
+
+// The full set of metadata associated with each SST file.
+struct LiveFileMetaData : SstFileMetaData {
+  std::string column_family_name;  // Name of the column family
+  int level;                       // Level at which this file resides.
+  LiveFileMetaData() : column_family_name(), level(0) {}
+};
+
+// The MetaData that describes a Blob file
+struct BlobMetaData {
+  BlobMetaData()
+      : blob_file_number(0),
+        blob_file_size(0),
+        total_blob_count(0),
+        total_blob_bytes(0),
+        garbage_blob_count(0),
+        garbage_blob_bytes(0) {}
+
+  BlobMetaData(uint64_t _file_number, const std::string& _file_name,
+               const std::string& _file_path, uint64_t _file_size,
+               uint64_t _total_blob_count, uint64_t _total_blob_bytes,
+               uint64_t _garbage_blob_count, uint64_t _garbage_blob_bytes,
+               const std::string& _file_checksum,
+               const std::string& _file_checksum_func_name)
+      : blob_file_number(_file_number),
+        blob_file_name(_file_name),
+        blob_file_path(_file_path),
+        blob_file_size(_file_size),
+        total_blob_count(_total_blob_count),
+        total_blob_bytes(_total_blob_bytes),
+        garbage_blob_count(_garbage_blob_count),
+        garbage_blob_bytes(_garbage_blob_bytes),
+        checksum_method(_file_checksum),
+        checksum_value(_file_checksum_func_name) {}
+  uint64_t blob_file_number;
+  std::string blob_file_name;
+  std::string blob_file_path;
+  uint64_t blob_file_size;
+  uint64_t total_blob_count;
+  uint64_t total_blob_bytes;
+  uint64_t garbage_blob_count;
+  uint64_t garbage_blob_bytes;
+  std::string checksum_method;
+  std::string checksum_value;
+};
+
+// The metadata that describes a level.
+struct LevelMetaData {
+  LevelMetaData(int _level, uint64_t _size,
+                const std::vector<SstFileMetaData>&& _files)
+      : level(_level), size(_size), files(_files) {}
+
+  // The level which this meta data describes.
+  const int level;
+  // The size of this level in bytes, which is equal to the sum of
+  // the file size of its "files".
+  const uint64_t size;
+  // The metadata of all sst files in this level.
+  const std::vector<SstFileMetaData> files;
+};
+
+// The metadata that describes a column family.
+struct ColumnFamilyMetaData {
+  ColumnFamilyMetaData() : size(0), file_count(0), name("") {}
+  ColumnFamilyMetaData(const std::string& _name, uint64_t _size,
+                       const std::vector<LevelMetaData>&& _levels)
+      : size(_size), name(_name), levels(_levels) {}
+
+  // The size of this column family in bytes, which is equal to the sum of
+  // the file size of its "levels".
+  uint64_t size;
+  // The number of files in this column family.
+  size_t file_count;
+  // The name of the column family.
+  std::string name;
+  // The metadata of all levels in this column family.
+  std::vector<LevelMetaData> levels;
+
+  // The total size of all blob files
+  uint64_t blob_file_size = 0;
+  // The number of blob files in this column family.
+  size_t blob_file_count = 0;
+  // The metadata of the blobs in this column family.
+  std::vector<BlobMetaData> blob_files;
+};
+
+// Metadata returned as output from ExportColumnFamily() and used as input to
+// CreateColumnFamiliesWithImport().
+struct ExportImportFilesMetaData {
+  std::string db_comparator_name;       // Used to safety check at import.
+  std::vector<LiveFileMetaData> files;  // Vector of file metadata.
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/options.h b/src/rocksdb/include/rocksdb/options.h
new file mode 100644
index 000000000..7a4d8b5a6
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/options.h
@@ -0,0 +1,2113 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/data_structure.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/sst_partitioner.h"
+#include "rocksdb/types.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/version.h"
+#include "rocksdb/write_buffer_manager.h"
+
+#ifdef max
+#undef max
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class CompactionFilter;
+class CompactionFilterFactory;
+class Comparator;
+class ConcurrentTaskLimiter;
+class Env;
+enum InfoLogLevel : unsigned char;
+class SstFileManager;
+class FilterPolicy;
+class Logger;
+class MergeOperator;
+class Snapshot;
+class MemTableRepFactory;
+class RateLimiter;
+class Slice;
+class Statistics;
+class InternalKeyComparator;
+class WalFilter;
+class FileSystem;
+
+struct Options;
+struct DbPath;
+
+using FileTypeSet = SmallEnumSet<FileType, FileType::kBlobFile>;
+
+struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
+  // The function recovers options to a previous version. Only 4.6 or later
+  // versions are supported.
+  // NOT MAINTAINED: This function has not been and is not maintained.
+  // DEPRECATED: This function might be removed in a future release.
+  // In general, defaults are changed to suit broad interests. Opting
+  // out of a change on upgrade should be deliberate and considered.
+  ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
+                                   int rocksdb_minor_version = 6);
+
+  // Some functions that make it easier to optimize RocksDB
+  // Use this if your DB is very small (like under 1GB) and you don't want to
+  // spend lots of memory for memtables.
+  // An optional cache object is passed in to be used as the block cache
+  ColumnFamilyOptions* OptimizeForSmallDb(
+      std::shared_ptr<Cache>* cache = nullptr);
+
+  // Use this if you don't need to keep the data sorted, i.e. you'll never use
+  // an iterator, only Put() and Get() API calls
+  //
+  // Not supported in ROCKSDB_LITE
+  ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
+
+  // Default values for some parameters in ColumnFamilyOptions are not
+  // optimized for heavy workloads and big datasets, which means you might
+  // observe write stalls under some conditions. As a starting point for tuning
+  // RocksDB options, use the following two functions:
+  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
+  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
+  // Universal style compaction is focused on reducing Write Amplification
+  // Factor for big data sets, but increases Space Amplification. You can learn
+  // more about the different styles here:
+  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
+  // Make sure to also call IncreaseParallelism(), which will provide the
+  // biggest performance gains.
+  // Note: we might use more memory than memtable_memory_budget during high
+  // write rate period
+  //
+  // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
+  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
+      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
+
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator = BytewiseComparator();
+
+  // REQUIRES: The client must provide a merge operator if Merge operation
+  // needs to be accessed. Calling Merge on a DB without a merge operator
+  // would result in Status::NotSupported. The client must ensure that the
+  // merge operator supplied here has the same name and *exactly* the same
+  // semantics as the merge operator provided to previous open calls on
+  // the same DB. The only exception is reserved for upgrade, where a DB
+  // previously without a merge operator is introduced to Merge operation
+  // for the first time. It's necessary to specify a merge operator when
+  // opening the DB in this case.
+  // Default: nullptr
+  std::shared_ptr<MergeOperator> merge_operator = nullptr;
+
+  // A single CompactionFilter instance to call into during compaction.
+  // Allows an application to modify/delete a key-value during background
+  // compaction.
+  //
+  // If the client requires a new `CompactionFilter` to be used for different
+  // compaction runs and/or requires a `CompactionFilter` for table file
+  // creations outside of compaction, it can specify compaction_filter_factory
+  // instead of this option.  The client should specify only one of the two.
+  // compaction_filter takes precedence over compaction_filter_factory if
+  // client specifies both.
+  //
+  // If multithreaded compaction is being used, the supplied CompactionFilter
+  // instance may be used from different threads concurrently and so should be
+  // thread-safe.
+  //
+  // Default: nullptr
+  const CompactionFilter* compaction_filter = nullptr;
+
+  // This is a factory that provides `CompactionFilter` objects which allow
+  // an application to modify/delete a key-value during table file creation.
+  //
+  // Unlike the `compaction_filter` option, which is used when compaction
+  // creates a table file, this factory allows using a `CompactionFilter` when a
+  // table file is created for various reasons. The factory can decide what
+  // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by
+  // default the decision is to use a `CompactionFilter` for
+  // `TableFileCreationReason::kCompaction` only.
+  //
+  // Each thread of work involving creating table files will create a new
+  // `CompactionFilter` when it will be used according to the above
+  // `TableFileCreationReason`-based decision. This allows the application to
+  // know about the different ongoing threads of work and makes it unnecessary
+  // for `CompactionFilter` to provide thread-safety.
+  //
+  // Default: nullptr
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to max_write_buffer_number write buffers may be held in memory
+  // at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  // Also, a larger write buffer will result in a longer recovery time
+  // the next time the database is opened.
+  //
+  // Note that write_buffer_size is enforced per column family.
+  // See db_write_buffer_size for sharing memory across column families.
+  //
+  // Default: 64MB
+  //
+  // Dynamically changeable through SetOptions() API
+  size_t write_buffer_size = 64 << 20;
+
+  // Compress blocks using the specified compression algorithm.
+  //
+  // Default: kSnappyCompression, if it's supported. If snappy is not linked
+  // with the library, the default is kNoCompression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  //
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression.  Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  //
+  // If you do not set `compression_opts.level`, or set it to
+  // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the
+  // default corresponding to `compression` as follows:
+  //
+  // - kZSTD: 3
+  // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
+  // - kLZ4HCCompression: 0
+  // - For all others, we do not specify a compression level
+  //
+  // Dynamically changeable through SetOptions() API
+  CompressionType compression;
+
+  // Compression algorithm that will be used for the bottommost level that
+  // contain files. The behavior for num_levels = 1 is not well defined.
+  // Right now, with num_levels = 1,  all compaction outputs will use
+  // bottommost_compression and all flush outputs still use options.compression,
+  // but the behavior is subject to change.
+  //
+  // Default: kDisableCompressionOption (Disabled)
+  CompressionType bottommost_compression = kDisableCompressionOption;
+
+  // different options for compression algorithms used by bottommost_compression
+  // if it is enabled. To enable it, please see the definition of
+  // CompressionOptions. Behavior for num_levels = 1 is the same as
+  // options.bottommost_compression.
+  CompressionOptions bottommost_compression_opts;
+
+  // different options for compression algorithms
+  CompressionOptions compression_opts;
+
+  // Number of files to trigger level-0 compaction. A value <0 means that
+  // level-0 compaction will not be triggered by number of files at all.
+  //
+  // Default: 4
+  //
+  // Dynamically changeable through SetOptions() API
+  int level0_file_num_compaction_trigger = 4;
+
+  // If non-nullptr, use the specified function to put keys in contiguous
+  // groups called "prefixes". These prefixes are used to place one
+  // representative entry for the group into the Bloom filter
+  // rather than an entry for each key (see whole_key_filtering).
+  // Under certain conditions, this enables optimizing some range queries
+  // (Iterators) in addition to some point lookups (Get/MultiGet).
+  //
+  // Together `prefix_extractor` and `comparator` must satisfy one essential
+  // property for valid prefix filtering of range queries:
+  //   If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and
+  //      InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3),
+  //   Then InDomain(k2) and prefix(k2) == prefix(k1)
+  //
+  // In other words, all keys with the same prefix must be in a contiguous
+  // group by comparator order, and cannot be interrupted by keys with no
+  // prefix ("out of domain"). (This makes it valid to conclude that no
+  // entries within some bounds are present if the upper and lower bounds
+  // have a common prefix and no entries with that same prefix are present.)
+  //
+  // Some other properties are recommended but not strictly required. Under
+  // most sensible comparators, the following will need to hold true to
+  // satisfy the essential property above:
+  // * "Prefix is a prefix": key.starts_with(prefix(key))
+  // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then
+  //   Compare(prefix(k1), prefix(k2)) <= 0
+  //
+  // The next two properties ensure that seeking to a prefix allows
+  // enumerating all entries with that prefix:
+  // * "Prefix starts the group": Compare(prefix(key), key) <= 0
+  // * "Prefix idempotent": prefix(prefix(key)) == prefix(key)
+  //
+  // Default: nullptr
+  std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
+
+  // Control maximum total data size for a level.
+  // max_bytes_for_level_base is the max total for level-1.
+  // Maximum number of bytes for level L can be calculated as
+  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
+  // For example, if max_bytes_for_level_base is 200MB, and if
+  // max_bytes_for_level_multiplier is 10, total data size for level-1
+  // will be 200MB, total file size for level-2 will be 2GB,
+  // and total file size for level-3 will be 20GB.
+  //
+  // Default: 256MB.
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t max_bytes_for_level_base = 256 * 1048576;
+
+  // Deprecated.
+  uint64_t snap_refresh_nanos = 0;
+
+  // Disable automatic compactions. Manual compactions can still
+  // be issued on this column family
+  //
+  // Dynamically changeable through SetOptions() API
+  bool disable_auto_compactions = false;
+
+  // This is a factory that provides TableFactory objects.
+  // Default: a block-based table factory that provides a default
+  // implementation of TableBuilder and TableReader with default
+  // BlockBasedTableOptions.
+  std::shared_ptr<TableFactory> table_factory;
+
+  // A list of paths where SST files for this column family
+  // can be put into, with its target size. Similar to db_paths,
+  // newer data is placed into paths specified earlier in the
+  // vector while older data gradually moves to paths specified
+  // later in the vector.
+  // Note that, if a path is supplied to multiple column
+  // families, it would have files and total size from all
+  // the column families combined. User should provision for the
+  // total size(from all the column families) in such cases.
+  //
+  // If left empty, db_paths will be used.
+  // Default: empty
+  std::vector<DbPath> cf_paths;
+
+  // Compaction concurrent thread limiter for the column family.
+  // If non-nullptr, use given concurrent thread limiter to control
+  // the max outstanding compaction tasks. Limiter can be shared with
+  // multiple column families across db instances.
+  //
+  // Default: nullptr
+  std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;
+
+  // If non-nullptr, use the specified factory for a function to determine the
+  // partitioning of sst files. This helps compaction to split the files
+  // on interesting boundaries (key prefixes) to make propagation of sst
+  // files less write amplifying (covering the whole key space).
+  // THE FEATURE IS STILL EXPERIMENTAL
+  //
+  // Default: nullptr
+  std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+  // Create ColumnFamilyOptions with default values for all fields
+  ColumnFamilyOptions();
+  // Create ColumnFamilyOptions from Options
+  explicit ColumnFamilyOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+};
+
+enum class WALRecoveryMode : char {
+  // Original levelDB recovery
+  //
+  // We tolerate the last record in any log to be incomplete due to a crash
+  // while writing it. Zeroed bytes from preallocation are also tolerated in the
+  // trailing data of any log.
+  //
+  // Use case: Applications for which updates, once applied, must not be rolled
+  // back even after a crash-recovery. In this recovery mode, RocksDB guarantees
+  // this as long as `WritableFile::Append()` writes are durable. In case the
+  // user needs the guarantee in more situations (e.g., when
+  // `WritableFile::Append()` writes to page cache, but the user desires this
+  // guarantee in face of power-loss crash-recovery), RocksDB offers various
+  // mechanisms to additionally invoke `WritableFile::Sync()` in order to
+  // strengthen the guarantee.
+  //
+  // This differs from `kPointInTimeRecovery` in that, in case a corruption is
+  // detected during recovery, this mode will refuse to open the DB. Whereas,
+  // `kPointInTimeRecovery` will stop recovery just before the corruption since
+  // that is a valid point-in-time to which to recover.
+  kTolerateCorruptedTailRecords = 0x00,
+  // Recover from clean shutdown
+  // We don't expect to find any corruption in the WAL
+  // Use case : This is ideal for unit tests and rare applications that
+  // can require high consistency guarantee
+  kAbsoluteConsistency = 0x01,
+  // Recover to point-in-time consistency (default)
+  // We stop the WAL playback on discovering WAL inconsistency
+  // Use case : Ideal for systems that have disk controller cache like
+  // hard disk, SSD without super capacitor that store related data
+  kPointInTimeRecovery = 0x02,
+  // Recovery after a disaster
+  // We ignore any corruption in the WAL and try to salvage as much data as
+  // possible
+  // Use case : Ideal for last ditch effort to recover data or systems that
+  // operate with low grade unrelated data
+  kSkipAnyCorruptedRecords = 0x03,
+};
+
+struct DbPath {
+  std::string path;
+  uint64_t target_size;  // Target size of total files under the path, in byte.
+
+  DbPath() : target_size(0) {}
+  DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
+};
+
+extern const char* kHostnameForDbHostId;
+
+enum class CompactionServiceJobStatus : char {
+  kSuccess,
+  kFailure,
+  kUseLocal,
+};
+
+struct CompactionServiceJobInfo {
+  std::string db_name;
+  std::string db_id;
+  std::string db_session_id;
+  uint64_t job_id;  // job_id is only unique within the current DB and session,
+                    // restart DB will reset the job_id. `db_id` and
+                    // `db_session_id` could help you build unique id across
+                    // different DBs and sessions.
+
+  Env::Priority priority;
+
+  CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
+                           std::string db_session_id_, uint64_t job_id_,
+                           Env::Priority priority_)
+      : db_name(std::move(db_name_)),
+        db_id(std::move(db_id_)),
+        db_session_id(std::move(db_session_id_)),
+        job_id(job_id_),
+        priority(priority_) {}
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionService : public Customizable {
+ public:
+  static const char* Type() { return "CompactionService"; }
+
+  // Returns the name of this compaction service.
+  const char* Name() const override = 0;
+
+  // Start the remote compaction with `compaction_service_input`, which can be
+  // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the
+  // information the user might want to know, which includes `job_id`.
+  virtual CompactionServiceJobStatus StartV2(
+      const CompactionServiceJobInfo& /*info*/,
+      const std::string& /*compaction_service_input*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  // Wait for remote compaction to finish.
+  virtual CompactionServiceJobStatus WaitForCompleteV2(
+      const CompactionServiceJobInfo& /*info*/,
+      std::string* /*compaction_service_result*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  ~CompactionService() override = default;
+};
+
+struct DBOptions {
+  // The function recovers options to the option as in version 4.6.
+  // NOT MAINTAINED: This function has not been and is not maintained.
+  // DEPRECATED: This function might be removed in a future release.
+  // In general, defaults are changed to suit broad interests. Opting
+  // out of a change on upgrade should be deliberate and considered.
+  DBOptions* OldDefaults(int rocksdb_major_version = 4,
+                         int rocksdb_minor_version = 6);
+
+  // Some functions that make it easier to optimize RocksDB
+
+  // Use this if your DB is very small (like under 1GB) and you don't want to
+  // spend lots of memory for memtables.
+  // An optional cache object is passed in for the memory of the
+  // memtable to cost to
+  DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);
+
+#ifndef ROCKSDB_LITE
+  // By default, RocksDB uses only one background thread for flush and
+  // compaction. Calling this function will set it up such that total of
+  // `total_threads` is used. Good value for `total_threads` is the number of
+  // cores. You almost definitely want to call this function if your system is
+  // bottlenecked by RocksDB.
+  DBOptions* IncreaseParallelism(int total_threads = 16);
+#endif  // ROCKSDB_LITE
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing = false;
+
+  // If true, missing column families will be automatically created.
+  // Default: false
+  bool create_missing_column_families = false;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists = false;
+
+  // If true, RocksDB will aggressively check consistency of the data.
+  // Also, if any of the  writes to the database fails (Put, Delete, Merge,
+  // Write), the database will switch to read-only mode and fail all other
+  // Write operations.
+  // In most cases you want this to be set to true.
+  // Default: true
+  bool paranoid_checks = true;
+
+  // If true, during memtable flush, RocksDB will validate total entries
+  // read in flush, and compare with counter inserted into it.
+  // The option is here to turn the feature off in case this new validation
+  // feature has a bug.
+  // Default: true
+  bool flush_verify_memtable_count = true;
+
+  // If true, the log numbers and sizes of the synced WALs are tracked
+  // in MANIFEST. During DB recovery, if a synced WAL is missing
+  // from disk, or the WAL's size does not match the recorded size in
+  // MANIFEST, an error will be reported and the recovery will be aborted.
+  //
+  // This is one additional protection against WAL corruption besides the
+  // per-WAL-entry checksum.
+  //
+  // Note that this option does not work with secondary instance.
+  // Currently, only syncing closed WALs are tracked. Calling `DB::SyncWAL()`,
+  // etc. or writing with `WriteOptions::sync=true` to sync the live WAL is not
+  // tracked for performance/efficiency reasons.
+  //
+  // Default: false
+  bool track_and_verify_wals_in_manifest = false;
+
+  // If true, verifies the SST unique id between MANIFEST and actual file
+  // each time an SST file is opened. This check ensures an SST file is not
+  // overwritten or misplaced. A corruption error will be reported if mismatch
+  // detected, but only when MANIFEST tracks the unique id, which starts from
+  // RocksDB version 7.3. Although the tracked internal unique id is related
+  // to the one returned by GetUniqueIdFromTableProperties, that is subject to
+  // change.
+  // NOTE: verification is currently only done on SST files using block-based
+  // table format.
+  //
+  // Setting to false should only be needed in case of unexpected problems.
+  //
+  // Although an early version of this option opened all SST files for
+  // verification on DB::Open, that is no longer guaranteed. However, as
+  // documented in an above option, if max_open_files is -1, DB will open all
+  // files on DB::Open().
+  //
+  // Default: true
+  bool verify_sst_unique_id_in_manifest = true;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc. In the near
+  // future, support for doing storage operations such as read/write files
+  // through env will be deprecated in favor of file_system (see below)
+  // Default: Env::Default()
+  Env* env = Env::Default();
+
+  // Limits internal file read/write bandwidth:
+  //
+  // - Flush requests write bandwidth at `Env::IOPriority::IO_HIGH`
+  // - Compaction requests read and write bandwidth at
+  //   `Env::IOPriority::IO_LOW`
+  // - Reads associated with a `ReadOptions` can be charged at
+  //   `ReadOptions::rate_limiter_priority` (see that option's API doc for usage
+  //   and limitations).
+  // - Writes associated with a `WriteOptions` can be charged at
+  //   `WriteOptions::rate_limiter_priority` (see that option's API doc for
+  //   usage and limitations).
+  //
+  // Rate limiting is disabled if nullptr. If rate limiter is enabled,
+  // bytes_per_sync is set to 1MB by default.
+  //
+  // Default: nullptr
+  std::shared_ptr<RateLimiter> rate_limiter = nullptr;
+
+  // Use to track SST files and control their file deletion rate.
+  //
+  // Features:
+  //  - Throttle the deletion rate of the SST files.
+  //  - Keep track the total size of all SST files.
+  //  - Set a maximum allowed space limit for SST files that when reached
+  //    the DB wont do any further flushes or compactions and will set the
+  //    background error.
+  //  - Can be shared between multiple dbs.
+  // Limitations:
+  //  - Only track and throttle deletes of SST files in
+  //    first db_path (db_name if db_paths is empty).
+  //
+  // Default: nullptr
+  std::shared_ptr<SstFileManager> sst_file_manager = nullptr;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-nullptr, or to a file stored
+  // in the same directory as the DB contents if info_log is nullptr.
+  // Default: nullptr
+  std::shared_ptr<Logger> info_log = nullptr;
+
+#ifdef NDEBUG
+  InfoLogLevel info_log_level = INFO_LEVEL;
+#else
+  InfoLogLevel info_log_level = DEBUG_LEVEL;
+#endif  // NDEBUG
+
+  // Number of open files that can be used by the DB.  You may need to
+  // increase this if your database has a large working set. Value -1 means
+  // files opened are always kept open. You can estimate number of files based
+  // on target_file_size_base and target_file_size_multiplier for level-based
+  // compaction. For universal-style compaction, you can usually set it to -1.
+  //
+  // A high value or -1 for this option can cause high memory usage.
+  // See BlockBasedTableOptions::cache_usage_options to constrain
+  // memory usage in case of block based table format.
+  //
+  // Default: -1
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  int max_open_files = -1;
+
+  // If max_open_files is -1, DB will open all files on DB::Open(). You can
+  // use this option to increase the number of threads used to open the files.
+  // Default: 16
+  int max_file_opening_threads = 16;
+
+  // Once write-ahead logs exceed this size, we will start forcing the flush of
+  // column families whose memtables are backed by the oldest live WAL file
+  // (i.e. the ones that are causing all the space amplification). If set to 0
+  // (default), we will dynamically choose the WAL size limit to be
+  // [sum of all write_buffer_size * max_write_buffer_number] * 4
+  //
+  // For example, with 15 column families, each with
+  // write_buffer_size = 128 MB
+  // max_write_buffer_number = 6
+  // max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = 45GB
+  //
+  // The RocksDB wiki has some discussion about how the WAL interacts
+  // with memtables and flushing of column families.
+  // https://github.com/facebook/rocksdb/wiki/Column-Families
+  //
+  // This option takes effect only when there are more than one column
+  // family as otherwise the wal size is dictated by the write_buffer_size.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t max_total_wal_size = 0;
+
+  // If non-null, then we should collect metrics about database operations
+  std::shared_ptr<Statistics> statistics = nullptr;
+
+  // By default, writes to stable storage use fdatasync (on platforms
+  // where this function is available). If this option is true,
+  // fsync is used instead.
+  //
+  // fsync and fdatasync are equally safe for our purposes and fdatasync is
+  // faster, so it is rarely necessary to set this option. It is provided
+  // as a workaround for kernel/filesystem bugs, such as one that affected
+  // fdatasync with ext4 in kernel versions prior to 3.7.
+  bool use_fsync = false;
+
+  // A list of paths where SST files can be put into, with its target size.
+  // Newer data is placed into paths specified earlier in the vector while
+  // older data gradually moves to paths specified later in the vector.
+  //
+  // For example, you have a flash device with 10GB allocated for the DB,
+  // as well as a hard drive of 2TB, you should config it to be:
+  //   [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
+  //
+  // The system will try to guarantee data under each path is close to but
+  // not larger than the target size. But current and future file sizes used
+  // by determining where to place a file are based on best-effort estimation,
+  // which means there is a chance that the actual size under the directory
+  // is slightly more than target size under some workloads. User should give
+  // some buffer room for those cases.
+  //
+  // If none of the paths has sufficient room to place a file, the file will
+  // be placed to the last path anyway, despite to the target size.
+  //
+  // Placing newer data to earlier paths is also best-efforts. User should
+  // expect user files to be placed in higher levels in some extreme cases.
+  //
+  // If left empty, only one path will be used, which is db_name passed when
+  // opening the DB.
+  // Default: empty
+  std::vector<DbPath> db_paths;
+
+  // This specifies the info LOG dir.
+  // If it is empty, the log files will be in the same dir as data.
+  // If it is non empty, the log files will be in the specified dir,
+  // and the db data dir's absolute path will be used as the log file
+  // name's prefix.
+  std::string db_log_dir = "";
+
+  // This specifies the absolute dir path for write-ahead logs (WAL).
+  // If it is empty, the log files will be in the same dir as data,
+  //   dbname is used as the data dir by default
+  // If it is non empty, the log files will be in kept the specified dir.
+  // When destroying the db,
+  //   all log files in wal_dir and the dir itself is deleted
+  std::string wal_dir = "";
+
+  // The periodicity when obsolete files get deleted. The default
+  // value is 6 hours. The files that get out of scope by compaction
+  // process will still get automatically delete on every compaction,
+  // regardless of this setting
+  //
+  // Default: 6 hours
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;
+
+  // Maximum number of concurrent background jobs (compactions and flushes).
+  //
+  // Default: 2
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  int max_background_jobs = 2;
+
+  // DEPRECATED: RocksDB automatically decides this based on the
+  // value of max_background_jobs. For backwards compatibility we will set
+  // `max_background_jobs = max_background_compactions + max_background_flushes`
+  // in the case where user sets at least one of `max_background_compactions` or
+  // `max_background_flushes` (we replace -1 by 1 in case one option is unset).
+  //
+  // Maximum number of concurrent background compaction jobs, submitted to
+  // the default LOW priority thread pool.
+  //
+  // If you're increasing this, also consider increasing number of threads in
+  // LOW priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  //
+  // Default: -1
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  int max_background_compactions = -1;
+
+  // This value represents the maximum number of threads that will
+  // concurrently perform a compaction job by breaking it into multiple,
+  // smaller ones that are run simultaneously.
+  // Default: 1 (i.e. no subcompactions)
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint32_t max_subcompactions = 1;
+
+  // DEPRECATED: RocksDB automatically decides this based on the
+  // value of max_background_jobs. For backwards compatibility we will set
+  // `max_background_jobs = max_background_compactions + max_background_flushes`
+  // in the case where user sets at least one of `max_background_compactions` or
+  // `max_background_flushes`.
+  //
+  // Maximum number of concurrent background memtable flush jobs, submitted by
+  // default to the HIGH priority thread pool. If the HIGH priority thread pool
+  // is configured to have zero threads, flush jobs will share the LOW priority
+  // thread pool with compaction jobs.
+  //
+  // It is important to use both thread pools when the same Env is shared by
+  // multiple db instances. Without a separate pool, long running compaction
+  // jobs could potentially block memtable flush jobs of other db instances,
+  // leading to unnecessary Put stalls.
+  //
+  // If you're increasing this, also consider increasing number of threads in
+  // HIGH priority thread pool. For more information, see
+  // Env::SetBackgroundThreads
+  // Default: -1
+  int max_background_flushes = -1;
+
+  // Specify the maximal size of the info log file. If the log file
+  // is larger than `max_log_file_size`, a new info log file will
+  // be created.
+  // If max_log_file_size == 0, all logs will be written to one
+  // log file.
+  size_t max_log_file_size = 0;
+
+  // Time for the info log file to roll (in seconds).
+  // If specified with non-zero value, log file will be rolled
+  // if it has been active longer than `log_file_time_to_roll`.
+  // Default: 0 (disabled)
+  // Not supported in ROCKSDB_LITE mode!
+  size_t log_file_time_to_roll = 0;
+
+  // Maximal info log files to be kept.
+  // Default: 1000
+  size_t keep_log_file_num = 1000;
+
+  // Recycle log files.
+  // If non-zero, we will reuse previously written log files for new
+  // logs, overwriting the old data.  The value indicates how many
+  // such files we will keep around at any point in time for later
+  // use.  This is more efficient because the blocks are already
+  // allocated and fdatasync does not need to update the inode after
+  // each write.
+  // Default: 0
+  size_t recycle_log_file_num = 0;
+
+  // manifest file is rolled over on reaching this limit.
+  // The older manifest file be deleted.
+  // The default value is 1GB so that the manifest file can grow, but not
+  // reach the limit of storage capacity.
+  uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
+
+  // Number of shards used for table cache.
+  int table_cache_numshardbits = 6;
+
+  // The following two fields affect how archived logs will be deleted.
+  // 1. If both set to 0, logs will be deleted asap and will not get into
+  //    the archive.
+  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+  //    WAL files will be checked every 10 min and if total size is greater
+  //    then WAL_size_limit_MB, they will be deleted starting with the
+  //    earliest until size_limit is met. All empty files will be deleted.
+  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+  //    are older than WAL_ttl_seconds will be deleted.
+  // 4. If both are not 0, WAL files will be checked every 10 min and both
+  //    checks will be performed with ttl being first.
+  uint64_t WAL_ttl_seconds = 0;
+  uint64_t WAL_size_limit_MB = 0;
+
+  // Number of bytes to preallocate (via fallocate) the manifest
+  // files.  Default is 4mb, which is reasonable to reduce random IO
+  // as well as prevent overallocation for mounts that preallocate
+  // large amounts of data (such as xfs's allocsize option).
+  size_t manifest_preallocation_size = 4 * 1024 * 1024;
+
+  // Allow the OS to mmap file for reading sst tables.
+  // Not recommended for 32-bit OS.
+  // When the option is set to true and compression is disabled, the blocks
+  // will not be copied and will be read directly from the mmap-ed memory
+  // area, and the block will not be inserted into the block cache. However,
+  // checksums will still be checked if ReadOptions.verify_checksums is set
+  // to be true. It means a checksum check every time a block is read, more
+  // than the setup where the option is set to false and the block cache is
+  // used. The common use of the options is to run RocksDB on ramfs, where
+  // checksum verification is usually not needed.
+  // Default: false
+  bool allow_mmap_reads = false;
+
+  // Allow the OS to mmap file for writing.
+  // DB::SyncWAL() only works if this is set to false.
+  // Default: false
+  bool allow_mmap_writes = false;
+
+  // Enable direct I/O mode for read/write
+  // they may or may not improve performance depending on the use case
+  //
+  // Files will be opened in "direct I/O" mode
+  // which means that data r/w from the disk will not be cached or
+  // buffered. The hardware buffer of the devices may however still
+  // be used. Memory mapped files are not impacted by these parameters.
+
+  // Use O_DIRECT for user and compaction reads.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool use_direct_reads = false;
+
+  // Use O_DIRECT for writes in background flush and compactions.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool use_direct_io_for_flush_and_compaction = false;
+
+  // If false, fallocate() calls are bypassed, which disables file
+  // preallocation. The file space preallocation is used to increase the file
+  // write/append performance. By default, RocksDB preallocates space for WAL,
+  // SST, Manifest files, the extra space is truncated when the file is written.
+  // Warning: if you're using btrfs, we would recommend setting
+  // `allow_fallocate=false` to disable preallocation. As on btrfs, the extra
+  // allocated space cannot be freed, which could be significant if you have
+  // lots of files. More details about this limitation:
+  // https://github.com/btrfs/btrfs-dev-docs/blob/471c5699336e043114d4bca02adcd57d9dab9c44/data-extent-reference-counts.md
+  bool allow_fallocate = true;
+
+  // Disable child process inherit open files. Default: true
+  bool is_fd_close_on_exec = true;
+
+  // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+  //
+  // Default: 600 (10 min)
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  unsigned int stats_dump_period_sec = 600;
+
+  // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
+  // Default: 600
+  unsigned int stats_persist_period_sec = 600;
+
+  // If true, automatically persist stats to a hidden column family (column
+  // family name: ___rocksdb_stats_history___) every
+  // stats_persist_period_sec seconds; otherwise, write to an in-memory
+  // struct. User can query through `GetStatsHistory` API.
+  // If user attempts to create a column family with the same name on a DB
+  // which have previously set persist_stats_to_disk to true, the column family
+  // creation will fail, but the hidden column family will survive, as well as
+  // the previously persisted statistics.
+  // When peristing stats to disk, the stat name will be limited at 100 bytes.
+  // Default: false
+  bool persist_stats_to_disk = false;
+
+  // if not zero, periodically take stats snapshots and store in memory, the
+  // memory size for stats snapshots is capped at stats_history_buffer_size
+  // Default: 1MB
+  size_t stats_history_buffer_size = 1024 * 1024;
+
+  // If set true, will hint the underlying file system that the file
+  // access pattern is random, when a sst file is opened.
+  // Default: true
+  bool advise_random_on_open = true;
+
+  // Amount of data to build up in memtables across all column
+  // families before writing to disk.
+  //
+  // This is distinct from write_buffer_size, which enforces a limit
+  // for a single memtable.
+  //
+  // This feature is disabled by default. Specify a non-zero value
+  // to enable it.
+  //
+  // Default: 0 (disabled)
+  size_t db_write_buffer_size = 0;
+
+  // The memory usage of memtable will report to this object. The same object
+  // can be passed into multiple DBs and it will track the sum of size of all
+  // the DBs. If the total size of all live memtables of all the DBs exceeds
+  // a limit, a flush will be triggered in the next DB to which the next write
+  // is issued, as long as there is one or more column family not already
+  // flushing.
+  //
+  // If the object is only passed to one DB, the behavior is the same as
+  // db_write_buffer_size. When write_buffer_manager is set, the value set will
+  // override db_write_buffer_size.
+  //
+  // This feature is disabled by default. Specify a non-zero value
+  // to enable it.
+  //
+  // Default: null
+  std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;
+
+  // Specify the file access pattern once a compaction is started.
+  // It will be applied to all input files of a compaction.
+  // Default: NORMAL
+  enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
+  AccessHint access_hint_on_compaction_start = NORMAL;
+
+  // If non-zero, we perform bigger reads when doing compaction. If you're
+  // running RocksDB on spinning disks, you should set this to at least 2MB.
+  // That way RocksDB's compaction is doing sequential instead of random reads.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  size_t compaction_readahead_size = 0;
+
+  // This is a maximum buffer size that is used by WinMmapReadableFile in
+  // unbuffered disk I/O mode. We need to maintain an aligned buffer for
+  // reads. We allow the buffer to grow until the specified value and then
+  // for bigger requests allocate one shot buffers. In unbuffered mode we
+  // always bypass read-ahead buffer at ReadaheadRandomAccessFile
+  // When read-ahead is required we then make use of compaction_readahead_size
+  // value and always try to read ahead. With read-ahead we always
+  // pre-allocate buffer to the size instead of growing it up to a limit.
+  //
+  // This option is currently honored only on Windows
+  //
+  // Default: 1 Mb
+  //
+  // Special value: 0 - means do not maintain per instance buffer. Allocate
+  //                per request buffer and avoid locking.
+  size_t random_access_max_buffer_size = 1024 * 1024;
+
+  // This is the maximum buffer size that is used by WritableFileWriter.
+  // With direct IO, we need to maintain an aligned buffer for writes.
+  // We allow the buffer to grow until it's size hits the limit in buffered
+  // IO and fix the buffer size when using direct IO to ensure alignment of
+  // write requests if the logical sector size is unusual
+  //
+  // Default: 1024 * 1024 (1 MB)
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  size_t writable_file_max_buffer_size = 1024 * 1024;
+
+  // Use adaptive mutex, which spins in the user space before resorting
+  // to kernel. This could reduce context switch when the mutex is not
+  // heavily contended. However, if the mutex is hot, we could end up
+  // wasting spin time.
+  // Default: false
+  bool use_adaptive_mutex = false;
+
+  // Create DBOptions with default values for all fields
+  DBOptions();
+  // Create DBOptions from Options
+  explicit DBOptions(const Options& options);
+
+  void Dump(Logger* log) const;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, asynchronously, in the background. This operation can be used
+  // to smooth out write I/Os over time. Users shouldn't rely on it for
+  // persistence guarantee.
+  // Issue one request for every bytes_per_sync written. 0 turns it off.
+  //
+  // You may consider using rate_limiter to regulate write rate to device.
+  // When rate limiter is enabled, it automatically enables bytes_per_sync
+  // to 1MB.
+  //
+  // This option applies to table files
+  //
+  // Default: 0, turned off
+  //
+  // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t bytes_per_sync = 0;
+
+  // Same as bytes_per_sync, but applies to WAL files
+  //
+  // Default: 0, turned off
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t wal_bytes_per_sync = 0;
+
+  // When true, guarantees WAL files have at most `wal_bytes_per_sync`
+  // bytes submitted for writeback at any given time, and SST files have at most
+  // `bytes_per_sync` bytes pending writeback at any given time. This can be
+  // used to handle cases where processing speed exceeds I/O speed during file
+  // generation, which can lead to a huge sync when the file is finished, even
+  // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
+  //
+  //  - If `sync_file_range` is supported it achieves this by waiting for any
+  //    prior `sync_file_range`s to finish before proceeding. In this way,
+  //    processing (compression, etc.) can proceed uninhibited in the gap
+  //    between `sync_file_range`s, and we block only when I/O falls behind.
+  //  - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
+  //    always blocks, thus preventing the interleaving of I/O and processing.
+  //
+  // Note: Enabling this option does not provide any additional persistence
+  // guarantees, as it may use `sync_file_range`, which does not write out
+  // metadata.
+  //
+  // Default: false
+  bool strict_bytes_per_sync = false;
+
+  // A vector of EventListeners whose callback functions will be called
+  // when specific RocksDB event happens.
+  std::vector<std::shared_ptr<EventListener>> listeners;
+
+  // If true, then the status of the threads involved in this DB will
+  // be tracked and available via GetThreadList() API.
+  //
+  // Default: false
+  bool enable_thread_tracking = false;
+
+  // The limited write rate to DB if soft_pending_compaction_bytes_limit or
+  // level0_slowdown_writes_trigger is triggered, or we are writing to the
+  // last mem table allowed and we allow more than 3 mem tables. It is
+  // calculated using size of user write requests before compression.
+  // RocksDB may decide to slow down more if the compaction still
+  // gets behind further.
+  // If the value is 0, we will infer a value from `rater_limiter` value
+  // if it is not empty, or 16MB if `rater_limiter` is empty. Note that
+  // if users change the rate in `rate_limiter` after DB is opened,
+  // `delayed_write_rate` won't be adjusted.
+  //
+  // Unit: byte per second.
+  //
+  // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  uint64_t delayed_write_rate = 0;
+
+  // By default, a single write thread queue is maintained. The thread gets
+  // to the head of the queue becomes write batch group leader and responsible
+  // for writing to WAL and memtable for the batch group.
+  //
+  // If enable_pipelined_write is true, separate write thread queue is
+  // maintained for WAL write and memtable write. A write thread first enter WAL
+  // writer queue and then memtable writer queue. Pending thread on the WAL
+  // writer queue thus only have to wait for previous writers to finish their
+  // WAL writing but not the memtable writing. Enabling the feature may improve
+  // write throughput and reduce latency of the prepare phase of two-phase
+  // commit.
+  //
+  // Default: false
+  bool enable_pipelined_write = false;
+
+  // Setting unordered_write to true trades higher write throughput with
+  // relaxing the immutability guarantee of snapshots. This violates the
+  // repeatability one expects from ::Get from a snapshot, as well as
+  // ::MultiGet and Iterator's consistent-point-in-time view property.
+  // If the application cannot tolerate the relaxed guarantees, it can implement
+  // its own mechanisms to work around that and yet benefit from the higher
+  // throughput. Using TransactionDB with WRITE_PREPARED write policy and
+  // two_write_queues=true is one way to achieve immutable snapshots despite
+  // unordered_write.
+  //
+  // By default, i.e., when it is false, rocksdb does not advance the sequence
+  // number for new snapshots unless all the writes with lower sequence numbers
+  // are already finished. This provides the immutability that we except from
+  // snapshots. Moreover, since Iterator and MultiGet internally depend on
+  // snapshots, the snapshot immutability results into Iterator and MultiGet
+  // offering consistent-point-in-time view. If set to true, although
+  // Read-Your-Own-Write property is still provided, the snapshot immutability
+  // property is relaxed: the writes issued after the snapshot is obtained (with
+  // larger sequence numbers) will be still not visible to the reads from that
+  // snapshot, however, there still might be pending writes (with lower sequence
+  // number) that will change the state visible to the snapshot after they are
+  // landed to the memtable.
+  //
+  // Default: false
+  bool unordered_write = false;
+
+  // If true, allow multi-writers to update mem tables in parallel.
+  // Only some memtable_factory-s support concurrent writes; currently it
+  // is implemented only for SkipListFactory.  Concurrent memtable writes
+  // are not compatible with inplace_update_support or filter_deletes.
+  // It is strongly recommended to set enable_write_thread_adaptive_yield
+  // if you are going to use this feature.
+  //
+  // Default: true
+  bool allow_concurrent_memtable_write = true;
+
+  // If true, threads synchronizing with the write batch group leader will
+  // wait for up to write_thread_max_yield_usec before blocking on a mutex.
+  // This can substantially improve throughput for concurrent workloads,
+  // regardless of whether allow_concurrent_memtable_write is enabled.
+  //
+  // Default: true
+  bool enable_write_thread_adaptive_yield = true;
+
+  // The maximum limit of number of bytes that are written in a single batch
+  // of WAL or memtable write. It is followed when the leader write size
+  // is larger than 1/8 of this limit.
+  //
+  // Default: 1 MB
+  uint64_t max_write_batch_group_size_bytes = 1 << 20;
+
+  // The maximum number of microseconds that a write operation will use
+  // a yielding spin loop to coordinate with other write threads before
+  // blocking on a mutex.  (Assuming write_thread_slow_yield_usec is
+  // set properly) increasing this value is likely to increase RocksDB
+  // throughput at the expense of increased CPU usage.
+  //
+  // Default: 100
+  uint64_t write_thread_max_yield_usec = 100;
+
+  // The latency in microseconds after which a std::this_thread::yield
+  // call (sched_yield on Linux) is considered to be a signal that
+  // other processes or threads would like to use the current core.
+  // Increasing this makes writer threads more likely to take CPU
+  // by spinning, which will show up as an increase in the number of
+  // involuntary context switches.
+  //
+  // Default: 3
+  uint64_t write_thread_slow_yield_usec = 3;
+
+  // If true, then DB::Open() will not update the statistics used to optimize
+  // compaction decision by loading table properties from many files.
+  // Turning off this feature will improve DBOpen time especially in
+  // disk environment.
+  //
+  // Default: false
+  bool skip_stats_update_on_db_open = false;
+
+  // If true, then DB::Open() will not fetch and check sizes of all sst files.
+  // This may significantly speed up startup if there are many sst files,
+  // especially when using non-default Env with expensive GetFileSize().
+  // We'll still check that all required sst files exist.
+  // If paranoid_checks is false, this option is ignored, and sst files are
+  // not checked at all.
+  //
+  // Default: false
+  bool skip_checking_sst_file_sizes_on_db_open = false;
+
+  // Recovery mode to control the consistency while replaying WAL
+  // Default: kPointInTimeRecovery
+  WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+  // if set to false then recovery will fail when a prepared
+  // transaction is encountered in the WAL
+  bool allow_2pc = false;
+
+  // A global cache for table-level rows.
+  // Default: nullptr (disabled)
+  // Not supported in ROCKSDB_LITE mode!
+  std::shared_ptr<Cache> row_cache = nullptr;
+
+#ifndef ROCKSDB_LITE
+  // A filter object supplied to be invoked while processing write-ahead-logs
+  // (WALs) during recovery. The filter provides a way to inspect log
+  // records, ignoring a particular record or skipping replay.
+  // The filter is invoked at startup and is invoked from a single-thread
+  // currently.
+  WalFilter* wal_filter = nullptr;
+#endif  // ROCKSDB_LITE
+
+  // If true, then DB::Open / CreateColumnFamily / DropColumnFamily
+  // SetOptions will fail if options file is not properly persisted.
+  //
+  // DEFAULT: false
+  bool fail_if_options_file_error = false;
+
+  // If true, then print malloc stats together with rocksdb.stats
+  // when printing to LOG.
+  // DEFAULT: false
+  bool dump_malloc_stats = false;
+
+  // By default RocksDB replay WAL logs and flush them on DB open, which may
+  // create very small SST files. If this option is enabled, RocksDB will try
+  // to avoid (but not guarantee not to) flush during recovery. Also, existing
+  // WAL logs will be kept, so that if crash happened before flush, we still
+  // have logs to recover from.
+  //
+  // DEFAULT: false
+  bool avoid_flush_during_recovery = false;
+
+  // By default RocksDB will flush all memtables on DB close if there are
+  // unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup
+  // DB close. Unpersisted data WILL BE LOST.
+  //
+  // DEFAULT: false
+  //
+  // Dynamically changeable through SetDBOptions() API.
+  bool avoid_flush_during_shutdown = false;
+
+  // Set this option to true during creation of database if you want
+  // to be able to ingest behind (call IngestExternalFile() skipping keys
+  // that already exist, rather than overwriting matching keys).
+  // Setting this option to true will affect 2 things:
+  // 1) Disable some internal optimizations around SST file compression
+  // 2) Reserve bottom-most level for ingested files only.
+  // 3) Note that num_levels should be >= 3 if this option is turned on.
+  //
+  // DEFAULT: false
+  // Immutable.
+  bool allow_ingest_behind = false;
+
+  // If enabled it uses two queues for writes, one for the ones with
+  // disable_memtable and one for the ones that also write to memtable. This
+  // allows the memtable writes not to lag behind other writes. It can be used
+  // to optimize MySQL 2PC in which only the commits, which are serial, write to
+  // memtable.
+  bool two_write_queues = false;
+
+  // If true WAL is not flushed automatically after each write. Instead it
+  // relies on manual invocation of FlushWAL to write the WAL buffer to its
+  // file.
+  bool manual_wal_flush = false;
+
+  // This feature is WORK IN PROGRESS
+  // If enabled WAL records will be compressed before they are written.
+  // Only zstd is supported. Compressed WAL records will be read in supported
+  // versions regardless of the wal_compression settings.
+  CompressionType wal_compression = kNoCompression;
+
+  // If true, RocksDB supports flushing multiple column families and committing
+  // their results atomically to MANIFEST. Note that it is not
+  // necessary to set atomic_flush to true if WAL is always enabled since WAL
+  // allows the database to be restored to the last persistent state in WAL.
+  // This option is useful when there are column families with writes NOT
+  // protected by WAL.
+  // For manual flush, application has to specify which column families to
+  // flush atomically in DB::Flush.
+  // For auto-triggered flush, RocksDB atomically flushes ALL column families.
+  //
+  // Currently, any WAL-enabled writes after atomic flush may be replayed
+  // independently if the process crashes later and tries to recover.
+  bool atomic_flush = false;
+
+  // If true, working thread may avoid doing unnecessary and long-latency
+  // operation (such as deleting obsolete files directly or deleting memtable)
+  // and will instead schedule a background job to do it.
+  // Use it if you're latency-sensitive.
+  // If set to true, takes precedence over
+  // ReadOptions::background_purge_on_iterator_cleanup.
+  bool avoid_unnecessary_blocking_io = false;
+
+  // Historically DB ID has always been stored in Identity File in DB folder.
+  // If this flag is true, the DB ID is written to Manifest file in addition
+  // to the Identity file. By doing this 2 problems are solved
+  // 1. We don't checksum the Identity file where as Manifest file is.
+  // 2. Since the source of truth for DB is Manifest file DB ID will sit with
+  //    the source of truth. Previously the Identity file could be copied
+  //    independent of Manifest and that can result in wrong DB ID.
+  // We recommend setting this flag to true.
+  // Default: false
+  bool write_dbid_to_manifest = false;
+
+  // The number of bytes to prefetch when reading the log. This is mostly useful
+  // for reading a remotely located log, as it can save the number of
+  // round-trips. If 0, then the prefetching is disabled.
+  //
+  // Default: 0
+  size_t log_readahead_size = 0;
+
+  // If user does NOT provide the checksum generator factory, the file checksum
+  // will NOT be used. A new file checksum generator object will be created
+  // when a SST file is created. Therefore, each created FileChecksumGenerator
+  // will only be used from a single thread and so does not need to be
+  // thread-safe.
+  //
+  // Default: nullptr
+  std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+  // By default, RocksDB recovery fails if any table/blob file referenced in the
+  // final version reconstructed from the
+  // MANIFEST are missing after scanning the MANIFEST pointed to by the
+  // CURRENT file. It can also fail if verification of unique SST id fails.
+  // Best-efforts recovery is another recovery mode that does not necessarily
+  // fail when certain table/blob files are missing/corrupted or have mismatched
+  // unique id table property. Instead, best-efforts recovery recovers each
+  // column family to a point in the MANIFEST that corresponds to a version. In
+  // such a version, all valid table/blob files referenced have the expected
+  // file size. For table files, their unique id table property match the
+  // MANIFEST.
+  //
+  // Best-efforts recovery does not need a valid CURRENT file, and tries to
+  // recover the database using one of the available MANIFEST files in the db
+  // directory.
+  // Best-efforts recovery tries the available MANIFEST files from high file
+  // numbers (newer) to low file numbers (older), and stops after finding the
+  // first MANIFEST file from which the db can be recovered to a state without
+  // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob
+  // files. It is possible that the database can be restored to an empty state
+  // with no table or blob files.
+  //
+  // Regardless of this option, the IDENTITY file
+  // is updated if needed during recovery to match the DB ID in the MANIFEST (if
+  // previously using write_dbid_to_manifest) or to be in some valid state
+  // (non-empty DB ID). Currently, not compatible with atomic flush.
+  // Furthermore, WAL files will not be used for recovery if
+  // best_efforts_recovery is true. Also requires either 1) LOCK file exists or
+  // 2) underlying env's LockFile() call returns ok even for non-existing LOCK
+  // file.
+  //
+  // Default: false
+  bool best_efforts_recovery = false;
+
+  // It defines how many times db resume is called by a separate thread when
+  // background retryable IO Error happens. When background retryable IO
+  // Error happens, SetBGError is called to deal with the error. If the error
+  // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write),
+  // then db resume is called in background to recover from the error. If this
+  // value is 0 or negative, db resume will not be called.
+  //
+  // Default: INT_MAX
+  int max_bgerror_resume_count = INT_MAX;
+
+  // If max_bgerror_resume_count is >= 2, db resume is called multiple times.
+  // This option decides how long to wait to retry the next resume if the
+  // previous resume fails and satisfy redo resume conditions.
+  //
+  // Default: 1000000 (microseconds).
+  uint64_t bgerror_resume_retry_interval = 1000000;
+
+  // It allows user to opt-in to get error messages containing corrupted
+  // keys/values. Corrupt keys, values will be logged in the
+  // messages/logs/status that will help users with the useful information
+  // regarding affected data. By default value is set false to prevent users
+  // data to be exposed in the logs/messages etc.
+  //
+  // Default: false
+  bool allow_data_in_errors = false;
+
+  // A string identifying the machine hosting the DB. This
+  // will be written as a property in every SST file written by the DB (or
+  // by offline writers such as SstFileWriter and RepairDB). It can be useful
+  // for troubleshooting in memory corruption caused by a failing host when
+  // writing a file, by tracing back to the writing host. These corruptions
+  // may not be caught by the checksum since they happen before checksumming.
+  // If left as default, the table writer will substitute it with the actual
+  // hostname when writing the SST file. If set to an empty string, the
+  // property will not be written to the SST file.
+  //
+  // Default: hostname
+  std::string db_host_id = kHostnameForDbHostId;
+
+  // Use this if your DB want to enable checksum handoff for specific file
+  // types writes. Make sure that the File_system you use support the
+  // crc32c checksum verification
+  // Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile.
+  // NOTE: currently RocksDB only generates crc32c based checksum for the
+  // handoff. If the storage layer has different checksum support, user
+  // should enble this set as empty. Otherwise,it may cause unexpected
+  // write failures.
+  FileTypeSet checksum_handoff_file_types;
+
+  // EXPERIMENTAL
+  // CompactionService is a feature allows the user to run compactions on a
+  // different host or process, which offloads the background load from the
+  // primary host.
+  // It's an experimental feature, the interface will be changed without
+  // backward/forward compatibility support for now. Some known issues are still
+  // under development.
+  std::shared_ptr<CompactionService> compaction_service = nullptr;
+
+  // It indicates, which lowest cache tier we want to
+  // use for a certain DB. Currently we support volatile_tier and
+  // non_volatile_tier. They are layered. By setting it to kVolatileTier, only
+  // the block cache (current implemented volatile_tier) is used. So
+  // cache entries will not spill to secondary cache (current
+  // implemented non_volatile_tier), and block cache lookup misses will not
+  // lookup in the secondary cache. When kNonVolatileBlockTier is used, we use
+  // both block cache and secondary cache.
+  //
+  // Default: kNonVolatileBlockTier
+  CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+  // If set to false, when compaction or flush sees a SingleDelete followed by
+  // a Delete for the same user key, compaction job will not fail.
+  // Otherwise, compaction job will fail.
+  // This is a temporary option to help existing use cases migrate, and
+  // will be removed in a future release.
+  // Warning: do not set to false unless you are trying to migrate existing
+  // data in which the contract of single delete
+  // (https://github.com/facebook/rocksdb/wiki/Single-Delete) is not enforced,
+  // thus has Delete mixed with SingleDelete for the same user key. Violation
+  // of the contract leads to undefined behaviors with high possibility of data
+  // inconsistency, e.g. deleted old data become visible again, etc.
+  bool enforce_single_del_contracts = true;
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options : public DBOptions, public ColumnFamilyOptions {
+  // Create an Options object with default values for all fields.
+  Options() : DBOptions(), ColumnFamilyOptions() {}
+
+  Options(const DBOptions& db_options,
+          const ColumnFamilyOptions& column_family_options)
+      : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
+
+  // Change to some default settings from an older version.
+  // NOT MAINTAINED: This function has not been and is not maintained.
+  // DEPRECATED: This function might be removed in a future release.
+  // In general, defaults are changed to suit broad interests. Opting
+  // out of a change on upgrade should be deliberate and considered.
+  Options* OldDefaults(int rocksdb_major_version = 4,
+                       int rocksdb_minor_version = 6);
+
+  void Dump(Logger* log) const;
+
+  void DumpCFOptions(Logger* log) const;
+
+  // Some functions that make it easier to optimize RocksDB
+
+  // Set appropriate parameters for bulk loading.
+  // The reason that this is a function that returns "this" instead of a
+  // constructor is to enable chaining of multiple similar calls in the future.
+  //
+
+  // All data will be in level 0 without any automatic compaction.
+  // It's recommended to manually call CompactRange(NULL, NULL) before reading
+  // from the database, because otherwise the read can be very slow.
+  Options* PrepareForBulkLoad();
+
+  // Use this if your DB is very small (like under 1GB) and you don't want to
+  // spend lots of memory for memtables.
+  Options* OptimizeForSmallDb();
+
+  // Disable some checks that should not be necessary in the absence of
+  // software logic errors or CPU+memory hardware errors. This can improve
+  // write speeds but is only recommended for temporary use. Does not
+  // change protection against corrupt storage (e.g. verify_checksums).
+  Options* DisableExtraChecks();
+};
+
+// An application can issue a read request (via Get/Iterators) and specify
+// if that read should process data that ALREADY resides on a specified cache
+// level. For example, if an application specifies kBlockCacheTier then the
+// Get call will process data that is already processed in the memtable or
+// the block cache. It will not page in data from the OS cache or data that
+// resides in storage.
+enum ReadTier {
+  kReadAllTier = 0x0,     // data in memtable, block cache, OS cache or storage
+  kBlockCacheTier = 0x1,  // data in memtable or block cache
+  kPersistedTier = 0x2,   // persisted data.  When WAL is disabled, this option
+                          // will skip data in memtable.
+                          // Note that this ReadTier currently only supports
+                          // Get and MultiGet and does not support iterators.
+  kMemtableTier = 0x3     // data in memtable. used for memtable-only iterators.
+};
+
+// Options that control read operations
+struct ReadOptions {
+  // If "snapshot" is non-nullptr, read as of the supplied snapshot
+  // (which must belong to the DB that is being read and which must
+  // not have been released).  If "snapshot" is nullptr, use an implicit
+  // snapshot of the state at the beginning of this read operation.
+  // Default: nullptr
+  const Snapshot* snapshot;
+
+  // `iterate_lower_bound` defines the smallest key at which the backward
+  // iterator can return an entry. Once the bound is passed, Valid() will be
+  // false. `iterate_lower_bound` is inclusive ie the bound value is a valid
+  // entry.
+  //
+  // If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
+  // need to have the same prefix. This is because ordering is not guaranteed
+  // outside of prefix domain.
+  //
+  // In case of user_defined timestamp, if enabled, iterate_lower_bound should
+  // point to key without timestamp part.
+  // Default: nullptr
+  const Slice* iterate_lower_bound;
+
+  // "iterate_upper_bound" defines the extent up to which the forward iterator
+  // can return entries. Once the bound is reached, Valid() will be false.
+  // "iterate_upper_bound" is exclusive ie the bound value is
+  // not a valid entry. If prefix_extractor is not null:
+  // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used
+  //    to infer whether prefix iterating (e.g. applying prefix bloom filter)
+  //    can be used within RocksDB. This is done by comparing
+  //    iterate_upper_bound with the seek key.
+  // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes
+  //    effect if it shares the same prefix as the seek key. If
+  //    iterate_upper_bound is outside the prefix of the seek key, then keys
+  //    returned outside the prefix range will be undefined, just as if
+  //    iterate_upper_bound = null.
+  // If iterate_upper_bound is not null, SeekToLast() will position the iterator
+  // at the first key smaller than iterate_upper_bound.
+  //
+  // In case of user_defined timestamp, if enabled, iterate_upper_bound should
+  // point to key without timestamp part.
+  // Default: nullptr
+  const Slice* iterate_upper_bound;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file. The readahead starts at 8KB and doubles on every
+  // additional read up to 256KB.
+  // This option can help if most of the range scans are large, and if it is
+  // determined that a larger readahead than that enabled by auto-readahead is
+  // needed.
+  // Using a large readahead size (> 2MB) can typically improve the performance
+  // of forward iteration on spinning disks.
+  // Default: 0
+  size_t readahead_size;
+
+  // A threshold for the number of keys that can be skipped before failing an
+  // iterator seek as incomplete. The default value of 0 should be used to
+  // never fail a request as incomplete, even on skipping too many keys.
+  // Default: 0
+  uint64_t max_skippable_internal_keys;
+
+  // Specify if this read request should process data that ALREADY
+  // resides on a particular cache. If the required data is not
+  // found at the specified cache, then Status::Incomplete is returned.
+  // Default: kReadAllTier
+  ReadTier read_tier;
+
+  // If true, all data read from underlying storage will be
+  // verified against corresponding checksums.
+  // Default: true
+  bool verify_checksums;
+
+  // Should the "data block"/"index block" read for this iteration be placed in
+  // block cache?
+  // Callers may wish to set this field to false for bulk scans.
+  // This would help not to the change eviction order of existing items in the
+  // block cache.
+  // Default: true
+  bool fill_cache;
+
+  // Specify to create a tailing iterator -- a special iterator that has a
+  // view of the complete database (i.e. it can also be used to read newly
+  // added data) and is optimized for sequential reads. It will return records
+  // that were inserted into the database after the creation of the iterator.
+  // Default: false
+  // Not supported in ROCKSDB_LITE mode!
+  bool tailing;
+
+  // This options is not used anymore. It was to turn on a functionality that
+  // has been removed.
+  bool managed;
+
+  // Enable a total order seek regardless of index format (e.g. hash index)
+  // used in the table. Some table format (e.g. plain table) may not support
+  // this option.
+  // If true when calling Get(), we also skip prefix bloom when reading from
+  // block based table, which only affects Get() performance.
+  // Default: false
+  bool total_order_seek;
+
+  // When true, by default use total_order_seek = true, and RocksDB can
+  // selectively enable prefix seek mode if won't generate a different result
+  // from total_order_seek, based on seek key, and iterator upper bound.
+  // Not supported in ROCKSDB_LITE mode, in the way that even with value true
+  // prefix mode is not used.
+  // BUG: Using Comparator::IsSameLengthImmediateSuccessor and
+  // SliceTransform::FullLengthEnabled to enable prefix mode in cases where
+  // prefix of upper bound differs from prefix of seek key has a flaw.
+  // If present in the DB, "short keys" (shorter than "full length" prefix)
+  // can be omitted from auto_prefix_mode iteration when they would be present
+  // in total_order_seek iteration, regardless of whether the short keys are
+  // "in domain" of the prefix extractor. This is not an issue if no short
+  // keys are added to DB or are not expected to be returned by such
+  // iterators. (We are also assuming the new condition on
+  // IsSameLengthImmediateSuccessor is satisfied; see its BUG section).
+  // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG".
+  // Default: false
+  bool auto_prefix_mode;
+
+  // Enforce that the iterator only iterates over the same prefix as the seek.
+  // This option is effective only for prefix seeks, i.e. prefix_extractor is
+  // non-null for the column family and total_order_seek is false.  Unlike
+  // iterate_upper_bound, prefix_same_as_start only works within a prefix
+  // but in both directions.
+  // Default: false
+  bool prefix_same_as_start;
+
+  // Keep the blocks loaded by the iterator pinned in memory as long as the
+  // iterator is not deleted, If used when reading from tables created with
+  // BlockBasedTableOptions::use_delta_encoding = false,
+  // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
+  // return 1.
+  // Default: false
+  bool pin_data;
+
+  // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
+  // schedule a background job in the flush job queue and delete obsolete files
+  // in background.
+  // Default: false
+  bool background_purge_on_iterator_cleanup;
+
+  // If true, range tombstones handling will be skipped in key lookup paths.
+  // For DB instances that don't use DeleteRange() calls, this setting can
+  // be used to optimize the read performance.
+  // Note that, if this assumption (of no previous DeleteRange() calls) is
+  // broken, stale keys could be served in read paths.
+  // Default: false
+  bool ignore_range_deletions;
+
+  // A callback to determine whether relevant keys for this scan exist in a
+  // given table based on the table's properties. The callback is passed the
+  // properties of each table during iteration. If the callback returns false,
+  // the table will not be scanned. This option only affects Iterators and has
+  // no impact on point lookups.
+  // Default: empty (every table will be scanned)
+  std::function<bool(const TableProperties&)> table_filter;
+
+  // Timestamp of operation. Read should return the latest data visible to the
+  // specified timestamp. All timestamps of the same database must be of the
+  // same length and format. The user is responsible for providing a customized
+  // compare function via Comparator to order <key, timestamp> tuples.
+  // For iterator, iter_start_ts is the lower bound (older) and timestamp
+  // serves as the upper bound. Versions of the same record that fall in
+  // the timestamp range will be returned. If iter_start_ts is nullptr,
+  // only the most recent version visible to timestamp is returned.
+  // The user-specified timestamp feature is still under active development,
+  // and the API is subject to change.
+  // Default: nullptr
+  const Slice* timestamp;
+  const Slice* iter_start_ts;
+
+  // Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
+  // in microseconds.
+  // It should be set to microseconds since epoch, i.e, gettimeofday or
+  // equivalent plus allowed duration in microseconds. The best way is to use
+  // env->NowMicros() + some timeout.
+  // This is best efforts. The call may exceed the deadline if there is IO
+  // involved and the file system doesn't support deadlines, or due to
+  // checking for deadline periodically rather than for every key if
+  // processing a batch
+  std::chrono::microseconds deadline;
+
+  // A timeout in microseconds to be passed to the underlying FileSystem for
+  // reads. As opposed to deadline, this determines the timeout for each
+  // individual file read request. If a MultiGet/Get/Seek/Next etc call
+  // results in multiple reads, each read can last up to io_timeout us.
+  std::chrono::microseconds io_timeout;
+
+  // It limits the maximum cumulative value size of the keys in batch while
+  // reading through MultiGet. Once the cumulative value size exceeds this
+  // soft limit then all the remaining keys are returned with status Aborted.
+  //
+  // Default: std::numeric_limits<uint64_t>::max()
+  uint64_t value_size_soft_limit;
+
+  // For iterators, RocksDB does auto-readahead on noticing more than two
+  // sequential reads for a table file if user doesn't provide readahead_size.
+  // The readahead starts at 8KB and doubles on every additional read upto
+  // max_auto_readahead_size only when reads are sequential. However at each
+  // level, if iterator moves over next file, readahead_size starts again from
+  // 8KB.
+  //
+  // By enabling this option, RocksDB will do some enhancements for
+  // prefetching the data.
+  //
+  // Default: false
+  bool adaptive_readahead;
+
+  // For file reads associated with this option, charge the internal rate
+  // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+  // special value `Env::IO_TOTAL` disables charging the rate limiter.
+  //
+  // The rate limiting is bypassed no matter this option's value for file reads
+  // on plain tables (these can exist when `ColumnFamilyOptions::table_factory`
+  // is a `PlainTableFactory`) and cuckoo tables (these can exist when
+  // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`).
+  //
+  // The bytes charged to rate limiter may not exactly match the file read bytes
+  // since there are some seemingly insignificant reads, like for file
+  // headers/footers, that we currently do not charge to rate limiter.
+  //
+  // Default: `Env::IO_TOTAL`.
+  Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+
+  // Experimental
+  //
+  // If async_io is enabled, RocksDB will prefetch some of data asynchronously.
+  // RocksDB apply it if reads are sequential and its internal automatic
+  // prefetching.
+  //
+  // Default: false
+  bool async_io;
+
+  // Experimental
+  //
+  // If async_io is set, then this flag controls whether we read SST files
+  // in multiple levels asynchronously. Enabling this flag can help reduce
+  // MultiGet latency by maximizing the number of SST files read in
+  // parallel if the keys in the MultiGet batch are in different levels. It
+  // comes at the expense of slightly higher CPU overhead.
+  //
+  // Default: true
+  bool optimize_multiget_for_io;
+
+  ReadOptions();
+  ReadOptions(bool cksum, bool cache);
+};
+
+// Options that control write operations
+struct WriteOptions {
+  // If true, the write will be flushed from the operating system
+  // buffer cache (by calling WritableFile::Sync()) before the write
+  // is considered complete.  If this flag is true, writes will be
+  // slower.
+  //
+  // If this flag is false, and the machine crashes, some recent
+  // writes may be lost.  Note that if it is just the process that
+  // crashes (i.e., the machine does not reboot), no writes will be
+  // lost even if sync==false.
+  //
+  // In other words, a DB write with sync==false has similar
+  // crash semantics as the "write()" system call.  A DB write
+  // with sync==true has similar crash semantics to a "write()"
+  // system call followed by "fdatasync()".
+  //
+  // Default: false
+  bool sync;
+
+  // If true, writes will not first go to the write ahead log,
+  // and the write may get lost after a crash. The backup engine
+  // relies on write-ahead logs to back up the memtable, so if
+  // you disable write-ahead logs, you must create backups with
+  // flush_before_backup=true to avoid losing unflushed memtable data.
+  // Default: false
+  bool disableWAL;
+
+  // If true and if user is trying to write to column families that don't exist
+  // (they were dropped),  ignore the write (don't return an error). If there
+  // are multiple writes in a WriteBatch, other writes will succeed.
+  // Default: false
+  bool ignore_missing_column_families;
+
+  // If true and we need to wait or sleep for the write request, fails
+  // immediately with Status::Incomplete().
+  // Default: false
+  bool no_slowdown;
+
+  // If true, this write request is of lower priority if compaction is
+  // behind. In this case, no_slowdown = true, the request will be canceled
+  // immediately with Status::Incomplete() returned. Otherwise, it will be
+  // slowed down. The slowdown value is determined by RocksDB to guarantee
+  // it introduces minimum impacts to high priority writes.
+  //
+  // Default: false
+  bool low_pri;
+
+  // If true, this writebatch will maintain the last insert positions of each
+  // memtable as hints in concurrent write. It can improve write performance
+  // in concurrent writes if keys in one writebatch are sequential. In
+  // non-concurrent writes (when concurrent_memtable_writes is false) this
+  // option will be ignored.
+  //
+  // Default: false
+  bool memtable_insert_hint_per_batch;
+
+  // For writes associated with this option, charge the internal rate
+  // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+  // special value `Env::IO_TOTAL` disables charging the rate limiter.
+  //
+  // Currently the support covers automatic WAL flushes, which happen during
+  // live updates (`Put()`, `Write()`, `Delete()`, etc.)
+  // when `WriteOptions::disableWAL == false`
+  // and `DBOptions::manual_wal_flush == false`.
+  //
+  // Only `Env::IO_USER` and `Env::IO_TOTAL` are allowed
+  // due to implementation constraints.
+  //
+  // Default: `Env::IO_TOTAL`
+  Env::IOPriority rate_limiter_priority;
+
+  // `protection_bytes_per_key` is the number of bytes used to store
+  // protection information for each key entry. Currently supported values are
+  // zero (disabled) and eight.
+  //
+  // Default: zero (disabled).
+  size_t protection_bytes_per_key;
+
+  WriteOptions()
+      : sync(false),
+        disableWAL(false),
+        ignore_missing_column_families(false),
+        no_slowdown(false),
+        low_pri(false),
+        memtable_insert_hint_per_batch(false),
+        rate_limiter_priority(Env::IO_TOTAL),
+        protection_bytes_per_key(0) {}
+};
+
+// Options that control flush operations
+struct FlushOptions {
+  // If true, the flush will wait until the flush is done.
+  // Default: true
+  bool wait;
+  // If true, the flush would proceed immediately even it means writes will
+  // stall for the duration of the flush; if false the operation will wait
+  // until it's possible to do flush w/o causing stall or until required flush
+  // is performed by someone else (foreground call or background thread).
+  // Default: false
+  bool allow_write_stall;
+  FlushOptions() : wait(true), allow_write_stall(false) {}
+};
+
+// Create a Logger from provided DBOptions
+extern Status CreateLoggerFromOptions(const std::string& dbname,
+                                      const DBOptions& options,
+                                      std::shared_ptr<Logger>* logger);
+
+// CompactionOptions are used in CompactFiles() call.
+struct CompactionOptions {
+  // Compaction output compression type
+  // Default: snappy
+  // If set to `kDisableCompressionOption`, RocksDB will choose compression type
+  // according to the `ColumnFamilyOptions`, taking into account the output
+  // level if `compression_per_level` is specified.
+  CompressionType compression;
+  // Compaction will create files of size `output_file_size_limit`.
+  // Default: MAX, which means that compaction will create a single file
+  uint64_t output_file_size_limit;
+  // If > 0, it will replace the option in the DBOptions for this compaction.
+  uint32_t max_subcompactions;
+
+  CompactionOptions()
+      : compression(kSnappyCompression),
+        output_file_size_limit(std::numeric_limits<uint64_t>::max()),
+        max_subcompactions(0) {}
+};
+
+// For level based compaction, we can configure if we want to skip/force
+// bottommost level compaction.
+enum class BottommostLevelCompaction {
+  // Skip bottommost level compaction
+  kSkip,
+  // Only compact bottommost level if there is a compaction filter
+  // This is the default option
+  kIfHaveCompactionFilter,
+  // Always compact bottommost level
+  kForce,
+  // Always compact bottommost level but in bottommost level avoid
+  // double-compacting files created in the same compaction
+  kForceOptimized,
+};
+
+// For manual compaction, we can configure if we want to skip/force garbage
+// collection of blob files.
+enum class BlobGarbageCollectionPolicy {
+  // Force blob file garbage collection.
+  kForce,
+  // Skip blob file garbage collection.
+  kDisable,
+  // Inherit blob file garbage collection policy from ColumnFamilyOptions.
+  kUseDefault,
+};
+
+// CompactRangeOptions is used by CompactRange() call.
+struct CompactRangeOptions {
+  // If true, no other compaction will run at the same time as this
+  // manual compaction.
+  //
+  // Default: false
+  bool exclusive_manual_compaction = false;
+
+  // If true, compacted files will be moved to the minimum level capable
+  // of holding the data or given level (specified non-negative target_level).
+  bool change_level = false;
+  // If change_level is true and target_level have non-negative value, compacted
+  // files will be moved to target_level.
+  int target_level = -1;
+  // Compaction outputs will be placed in options.db_paths[target_path_id].
+  // Behavior is undefined if target_path_id is out of range.
+  uint32_t target_path_id = 0;
+  // By default level based compaction will only compact the bottommost level
+  // if there is a compaction filter
+  BottommostLevelCompaction bottommost_level_compaction =
+      BottommostLevelCompaction::kIfHaveCompactionFilter;
+  // If true, will execute immediately even if doing so would cause the DB to
+  // enter write stall mode. Otherwise, it'll sleep until load is low enough.
+  bool allow_write_stall = false;
+  // If > 0, it will replace the option in the DBOptions for this compaction.
+  uint32_t max_subcompactions = 0;
+  // Set user-defined timestamp low bound, the data with older timestamp than
+  // low bound maybe GCed by compaction. Default: nullptr
+  const Slice* full_history_ts_low = nullptr;
+
+  // Allows cancellation of an in-progress manual compaction.
+  //
+  // Cancellation can be delayed waiting on automatic compactions when used
+  // together with `exclusive_manual_compaction == true`.
+  std::atomic<bool>* canceled = nullptr;
+  // NOTE: Calling DisableManualCompaction() overwrites the uer-provided
+  // canceled variable in CompactRangeOptions.
+  // Typically, when CompactRange is being called in one thread (t1) with
+  // canceled = false, and DisableManualCompaction is being called in the
+  // other thread (t2), manual compaction is disabled normally, even if the
+  // compaction iterator may still scan a few items before *canceled is
+  // set to true
+
+  // If set to kForce, RocksDB will override enable_blob_file_garbage_collection
+  // to true; if set to kDisable, RocksDB will override it to false, and
+  // kUseDefault leaves the setting in effect. This enables customers to both
+  // force-enable and force-disable GC when calling CompactRange.
+  BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+      BlobGarbageCollectionPolicy::kUseDefault;
+
+  // If set to < 0 or > 1, RocksDB leaves blob_garbage_collection_age_cutoff
+  // from ColumnFamilyOptions in effect. Otherwise, it will override the
+  // user-provided setting. This enables customers to selectively override the
+  // age cutoff.
+  double blob_garbage_collection_age_cutoff = -1;
+};
+
+// IngestExternalFileOptions is used by IngestExternalFile()
+struct IngestExternalFileOptions {
+  // Can be set to true to move the files instead of copying them.
+  bool move_files = false;
+  // If set to true, ingestion falls back to copy when move fails.
+  bool failed_move_fall_back_to_copy = true;
+  // If set to false, an ingested file keys could appear in existing snapshots
+  // that where created before the file was ingested.
+  bool snapshot_consistency = true;
+  // If set to false, IngestExternalFile() will fail if the file key range
+  // overlaps with existing keys or tombstones in the DB.
+  bool allow_global_seqno = true;
+  // If set to false and the file key range overlaps with the memtable key range
+  // (memtable flush required), IngestExternalFile will fail.
+  bool allow_blocking_flush = true;
+  // Set to true if you would like duplicate keys in the file being ingested
+  // to be skipped rather than overwriting existing data under that key.
+  // Use case: back-fill of some historical data in the database without
+  // over-writing existing newer version of data.
+  // This option could only be used if the DB has been running
+  // with allow_ingest_behind=true since the dawn of time.
+  // All files will be ingested at the bottommost level with seqno=0.
+  bool ingest_behind = false;
+  // Set to true if you would like to write global_seqno to a given offset in
+  // the external SST file for backward compatibility. Older versions of
+  // RocksDB writes a global_seqno to a given offset within ingested SST files,
+  // and new versions of RocksDB do not. If you ingest an external SST using
+  // new version of RocksDB and would like to be able to downgrade to an
+  // older version of RocksDB, you should set 'write_global_seqno' to true. If
+  // your service is just starting to use the new RocksDB, we recommend that
+  // you set this option to false, which brings two benefits:
+  // 1. No extra random write for global_seqno during ingestion.
+  // 2. Without writing external SST file, it's possible to do checksum.
+  // We have a plan to set this option to false by default in the future.
+  bool write_global_seqno = true;
+  // Set to true if you would like to verify the checksums of each block of the
+  // external SST file before ingestion.
+  // Warning: setting this to true causes slowdown in file ingestion because
+  // the external SST file has to be read.
+  bool verify_checksums_before_ingest = false;
+  // When verify_checksums_before_ingest = true, RocksDB uses default
+  // readahead setting to scan the file while verifying checksums before
+  // ingestion.
+  // Users can override the default value using this option.
+  // Using a large readahead size (> 2MB) can typically improve the performance
+  // of forward iteration on spinning disks.
+  size_t verify_checksums_readahead_size = 0;
+  // Set to TRUE if user wants to verify the sst file checksum of ingested
+  // files. The DB checksum function will generate the checksum of each
+  // ingested file (if file_checksum_gen_factory is set) and compare the
+  // checksum function name and checksum with the ingested checksum information.
+  //
+  // If this option is set to True: 1) if DB does not enable checksum
+  // (file_checksum_gen_factory == nullptr), the ingested checksum information
+  // will be ignored; 2) If DB enable the checksum function, we calculate the
+  // sst file checksum after the file is moved or copied and compare the
+  // checksum and checksum name. If checksum or checksum function name does
+  // not match, ingestion will be failed. If the verification is successful,
+  // checksum and checksum function name will be stored in Manifest.
+  // If this option is set to FALSE, 1) if DB does not enable checksum,
+  // the ingested checksum information will be ignored; 2) if DB enable the
+  // checksum, we only verify the ingested checksum function name and we
+  // trust the ingested checksum. If the checksum function name matches, we
+  // store the checksum in Manifest. DB does not calculate the checksum during
+  // ingestion. However, if no checksum information is provided with the
+  // ingested files, DB will generate the checksum and store in the Manifest.
+  bool verify_file_checksum = true;
+  // Set to TRUE if user wants file to be ingested to the bottommost level. An
+  // error of Status::TryAgain() will be returned if a file cannot fit in the
+  // bottommost level when calling
+  // DB::IngestExternalFile()/DB::IngestExternalFiles(). The user should clear
+  // the bottommost level in the overlapping range before re-attempt.
+  //
+  // ingest_behind takes precedence over fail_if_not_bottommost_level.
+  bool fail_if_not_bottommost_level = false;
+};
+
+enum TraceFilterType : uint64_t {
+  // Trace all the operations
+  kTraceFilterNone = 0x0,
+  // Do not trace the get operations
+  kTraceFilterGet = 0x1 << 0,
+  // Do not trace the write operations
+  kTraceFilterWrite = 0x1 << 1,
+  // Do not trace the `Iterator::Seek()` operations
+  kTraceFilterIteratorSeek = 0x1 << 2,
+  // Do not trace the `Iterator::SeekForPrev()` operations
+  kTraceFilterIteratorSeekForPrev = 0x1 << 3,
+  // Do not trace the `MultiGet()` operations
+  kTraceFilterMultiGet = 0x1 << 4,
+};
+
+// TraceOptions is used for StartTrace
+struct TraceOptions {
+  // To avoid the trace file size grows large than the storage space,
+  // user can set the max trace file size in Bytes. Default is 64GB
+  uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+  // Specify trace sampling option, i.e. capture one per how many requests.
+  // Default to 1 (capture every request).
+  uint64_t sampling_frequency = 1;
+  // Note: The filtering happens before sampling.
+  uint64_t filter = kTraceFilterNone;
+  // When true, the order of write records in the trace will match the order of
+  // the corresponding write records in the WAL and applied to the DB. There may
+  // be a performance penalty associated with preserving this ordering.
+  //
+  // Default: false. This means write records in the trace may be in an order
+  // different from the WAL's order.
+  bool preserve_write_order = false;
+};
+
+// ImportColumnFamilyOptions is used by ImportColumnFamily()
+struct ImportColumnFamilyOptions {
+  // Can be set to true to move the files instead of copying them.
+  bool move_files = false;
+};
+
+// Options used with DB::GetApproximateSizes()
+struct SizeApproximationOptions {
+  // Defines whether the returned size should include the recently written
+  // data in the memtables. If set to false, include_files must be true.
+  bool include_memtables = false;
+  // Defines whether the returned size should include data serialized to disk.
+  // If set to false, include_memtables must be true.
+  bool include_files = true;
+  // When approximating the files total size that is used to store a keys range
+  // using DB::GetApproximateSizes, allow approximation with an error margin of
+  // up to total_files_size * files_size_error_margin. This allows to take some
+  // shortcuts in files size approximation, resulting in better performance,
+  // while guaranteeing the resulting error is within a reasonable margin.
+  // E.g., if the value is 0.1, then the error margin of the returned files size
+  // approximation will be within 10%.
+  // If the value is non-positive - a more precise yet more CPU intensive
+  // estimation is performed.
+  double files_size_error_margin = -1.0;
+};
+
+struct CompactionServiceOptionsOverride {
+  // Currently pointer configurations are not passed to compaction service
+  // compaction so the user needs to set it. It will be removed once pointer
+  // configuration passing is supported.
+  Env* env = Env::Default();
+  std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+  const Comparator* comparator = BytewiseComparator();
+  std::shared_ptr<MergeOperator> merge_operator = nullptr;
+  const CompactionFilter* compaction_filter = nullptr;
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+  std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
+  std::shared_ptr<TableFactory> table_factory;
+  std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+  // Only subsets of events are triggered in remote compaction worker, like:
+  // `OnTableFileCreated`, `OnTableFileCreationStarted`,
+  // `ShouldBeNotifiedOnFileIO` `OnSubcompactionBegin`,
+  // `OnSubcompactionCompleted`, etc. Worth mentioning, `OnCompactionBegin` and
+  // `OnCompactionCompleted` won't be triggered. They will be triggered on the
+  // primary DB side.
+  std::vector<std::shared_ptr<EventListener>> listeners;
+
+  // statistics is used to collect DB operation metrics, the metrics won't be
+  // returned to CompactionService primary host, to collect that, the user needs
+  // to set it here.
+  std::shared_ptr<Statistics> statistics = nullptr;
+
+  // Only compaction generated SST files use this user defined table properties
+  // collector.
+  std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+      table_properties_collector_factories;
+};
+
+struct OpenAndCompactOptions {
+  // Allows cancellation of an in-progress compaction.
+  std::atomic<bool>* canceled = nullptr;
+};
+
+#ifndef ROCKSDB_LITE
+struct LiveFilesStorageInfoOptions {
+  // Whether to populate FileStorageInfo::file_checksum* or leave blank
+  bool include_checksum_info = false;
+  // Flushes memtables if total size in bytes of live WAL files is >= this
+  // number (and DB is not read-only).
+  // Default: always force a flush without checking sizes.
+  uint64_t wal_size_for_flush = 0;
+};
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/perf_context.h b/src/rocksdb/include/rocksdb/perf_context.h
new file mode 100644
index 000000000..cd1dd99f0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/perf_context.h
@@ -0,0 +1,274 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+
+#include "rocksdb/perf_level.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A thread local context for gathering performance counter efficiently
+// and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
+
+// Break down performance counters by level and store per-level perf context in
+// PerfContextByLevel
+struct PerfContextByLevel {
+  // # of times bloom filter has avoided file reads, i.e., negatives.
+  uint64_t bloom_filter_useful = 0;
+  // # of times bloom FullFilter has not avoided the reads.
+  uint64_t bloom_filter_full_positive = 0;
+  // # of times bloom FullFilter has not avoided the reads and data actually
+  // exist.
+  uint64_t bloom_filter_full_true_positive = 0;
+
+  // total number of user key returned (only include keys that are found, does
+  // not include keys that are deleted or merged without a final put
+  uint64_t user_key_return_count = 0;
+
+  // total nanos spent on reading data from SST files
+  uint64_t get_from_table_nanos = 0;
+
+  uint64_t block_cache_hit_count = 0;   // total number of block cache hits
+  uint64_t block_cache_miss_count = 0;  // total number of block cache misses
+
+  void Reset();  // reset all performance counters to zero
+};
+
+struct PerfContext {
+  ~PerfContext();
+
+  PerfContext() {}
+
+  PerfContext(const PerfContext&);
+  PerfContext& operator=(const PerfContext&);
+  PerfContext(PerfContext&&) noexcept;
+
+  void Reset();  // reset all performance counters to zero
+
+  std::string ToString(bool exclude_zero_counters = false) const;
+
+  // enable per level perf context and allocate storage for PerfContextByLevel
+  void EnablePerLevelPerfContext();
+
+  // temporarily disable per level perf context by setting the flag to false
+  void DisablePerLevelPerfContext();
+
+  // free the space for PerfContextByLevel, also disable per level perf context
+  void ClearPerLevelPerfContext();
+
+  uint64_t user_key_comparison_count;  // total number of user key comparisons
+  uint64_t block_cache_hit_count;      // total number of block cache hits
+  uint64_t block_read_count;           // total number of block reads (with IO)
+  uint64_t block_read_byte;            // total number of bytes from block reads
+  uint64_t block_read_time;            // total nanos spent on block reads
+  uint64_t block_cache_index_hit_count;  // total number of index block hits
+  // total number of standalone handles lookup from secondary cache
+  uint64_t block_cache_standalone_handle_count;
+  // total number of real handles lookup from secondary cache that are inserted
+  // into primary cache
+  uint64_t block_cache_real_handle_count;
+  uint64_t index_block_read_count;        // total number of index block reads
+  uint64_t block_cache_filter_hit_count;  // total number of filter block hits
+  uint64_t filter_block_read_count;       // total number of filter block reads
+  uint64_t compression_dict_block_read_count;  // total number of compression
+                                               // dictionary block reads
+
+  uint64_t secondary_cache_hit_count;  // total number of secondary cache hits
+  // total number of real handles inserted into secondary cache
+  uint64_t compressed_sec_cache_insert_real_count;
+  // total number of dummy handles inserted into secondary cache
+  uint64_t compressed_sec_cache_insert_dummy_count;
+  // bytes for vals before compression in secondary cache
+  uint64_t compressed_sec_cache_uncompressed_bytes;
+  // bytes for vals after compression in secondary cache
+  uint64_t compressed_sec_cache_compressed_bytes;
+
+  uint64_t block_checksum_time;    // total nanos spent on block checksum
+  uint64_t block_decompress_time;  // total nanos spent on block decompression
+
+  uint64_t get_read_bytes;       // bytes for vals returned by Get
+  uint64_t multiget_read_bytes;  // bytes for vals returned by MultiGet
+  uint64_t iter_read_bytes;      // bytes for keys/vals decoded by iterator
+
+  uint64_t blob_cache_hit_count;  // total number of blob cache hits
+  uint64_t blob_read_count;       // total number of blob reads (with IO)
+  uint64_t blob_read_byte;        // total number of bytes from blob reads
+  uint64_t blob_read_time;        // total nanos spent on blob reads
+  uint64_t blob_checksum_time;    // total nanos spent on blob checksum
+  uint64_t blob_decompress_time;  // total nanos spent on blob decompression
+
+  // total number of internal keys skipped over during iteration.
+  // There are several reasons for it:
+  // 1. when calling Next(), the iterator is in the position of the previous
+  //    key, so that we'll need to skip it. It means this counter will always
+  //    be incremented in Next().
+  // 2. when calling Next(), we need to skip internal entries for the previous
+  //    keys that are overwritten.
+  // 3. when calling Next(), Seek() or SeekToFirst(), after previous key
+  //    before calling Next(), the seek key in Seek() or the beginning for
+  //    SeekToFirst(), there may be one or more deleted keys before the next
+  //    valid key that the operation should place the iterator to. We need
+  //    to skip both of the tombstone and updates hidden by the tombstones. The
+  //    tombstones are not included in this counter, while previous updates
+  //    hidden by the tombstones will be included here.
+  // 4. symmetric cases for Prev() and SeekToLast()
+  // internal_recent_skipped_count is not included in this counter.
+  //
+  uint64_t internal_key_skipped_count;
+  // Total number of deletes and single deletes skipped over during iteration
+  // When calling Next(), Seek() or SeekToFirst(), after previous position
+  // before calling Next(), the seek key in Seek() or the beginning for
+  // SeekToFirst(), there may be one or more deleted keys before the next valid
+  // key. Every deleted key is counted once. We don't recount here if there are
+  // still older updates invalidated by the tombstones.
+  //
+  uint64_t internal_delete_skipped_count;
+  // How many times iterators skipped over internal keys that are more recent
+  // than the snapshot that iterator is using.
+  //
+  uint64_t internal_recent_skipped_count;
+  // How many values were fed into merge operator by iterators.
+  //
+  uint64_t internal_merge_count;
+  // Number of times we reseeked inside a merging iterator, specifically to skip
+  // after or before a range of keys covered by a range deletion in a newer LSM
+  // component.
+  uint64_t internal_range_del_reseek_count;
+
+  uint64_t get_snapshot_time;        // total nanos spent on getting snapshot
+  uint64_t get_from_memtable_time;   // total nanos spent on querying memtables
+  uint64_t get_from_memtable_count;  // number of mem tables queried
+  // total nanos spent after Get() finds a key
+  uint64_t get_post_process_time;
+  uint64_t get_from_output_files_time;  // total nanos reading from output files
+  // total nanos spent on seeking memtable
+  uint64_t seek_on_memtable_time;
+  // number of seeks issued on memtable
+  // (including SeekForPrev but not SeekToFirst and SeekToLast)
+  uint64_t seek_on_memtable_count;
+  // number of Next()s issued on memtable
+  uint64_t next_on_memtable_count;
+  // number of Prev()s issued on memtable
+  uint64_t prev_on_memtable_count;
+  // total nanos spent on seeking child iters
+  uint64_t seek_child_seek_time;
+  // number of seek issued in child iterators
+  uint64_t seek_child_seek_count;
+  uint64_t seek_min_heap_time;  // total nanos spent on the merge min heap
+  uint64_t seek_max_heap_time;  // total nanos spent on the merge max heap
+  // total nanos spent on seeking the internal entries
+  uint64_t seek_internal_seek_time;
+  // total nanos spent on iterating internal entries to find the next user entry
+  uint64_t find_next_user_entry_time;
+
+  // This group of stats provide a breakdown of time spent by Write().
+  // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write
+  // are enabled.
+  //
+  // total nanos spent on writing to WAL
+  uint64_t write_wal_time;
+  // total nanos spent on writing to mem tables
+  uint64_t write_memtable_time;
+  // total nanos spent on delaying or throttling write
+  uint64_t write_delay_time;
+  // total nanos spent on switching memtable/wal and scheduling
+  // flushes/compactions.
+  uint64_t write_scheduling_flushes_compactions_time;
+  // total nanos spent on writing a record, excluding the above four things
+  uint64_t write_pre_and_post_process_time;
+
+  // time spent waiting for other threads of the batch group
+  uint64_t write_thread_wait_nanos;
+
+  // time spent on acquiring DB mutex.
+  uint64_t db_mutex_lock_nanos;
+  // Time spent on waiting with a condition variable created with DB mutex.
+  uint64_t db_condition_wait_nanos;
+  // Time spent on merge operator.
+  uint64_t merge_operator_time_nanos;
+
+  // Time spent on reading index block from block cache or SST file
+  uint64_t read_index_block_nanos;
+  // Time spent on reading filter block from block cache or SST file
+  uint64_t read_filter_block_nanos;
+  // Time spent on creating data block iterator
+  uint64_t new_table_block_iter_nanos;
+  // Time spent on creating a iterator of an SST file.
+  uint64_t new_table_iterator_nanos;
+  // Time spent on seeking a key in data/index blocks
+  uint64_t block_seek_nanos;
+  // Time spent on finding or creating a table reader
+  uint64_t find_table_nanos;
+  // total number of mem table bloom hits
+  uint64_t bloom_memtable_hit_count;
+  // total number of mem table bloom misses
+  uint64_t bloom_memtable_miss_count;
+  // total number of SST table bloom hits
+  uint64_t bloom_sst_hit_count;
+  // total number of SST table bloom misses
+  uint64_t bloom_sst_miss_count;
+
+  // Time spent waiting on key locks in transaction lock manager.
+  uint64_t key_lock_wait_time;
+  // number of times acquiring a lock was blocked by another transaction.
+  uint64_t key_lock_wait_count;
+
+  // Total time spent in Env filesystem operations. These are only populated
+  // when TimedEnv is used.
+  uint64_t env_new_sequential_file_nanos;
+  uint64_t env_new_random_access_file_nanos;
+  uint64_t env_new_writable_file_nanos;
+  uint64_t env_reuse_writable_file_nanos;
+  uint64_t env_new_random_rw_file_nanos;
+  uint64_t env_new_directory_nanos;
+  uint64_t env_file_exists_nanos;
+  uint64_t env_get_children_nanos;
+  uint64_t env_get_children_file_attributes_nanos;
+  uint64_t env_delete_file_nanos;
+  uint64_t env_create_dir_nanos;
+  uint64_t env_create_dir_if_missing_nanos;
+  uint64_t env_delete_dir_nanos;
+  uint64_t env_get_file_size_nanos;
+  uint64_t env_get_file_modification_time_nanos;
+  uint64_t env_rename_file_nanos;
+  uint64_t env_link_file_nanos;
+  uint64_t env_lock_file_nanos;
+  uint64_t env_unlock_file_nanos;
+  uint64_t env_new_logger_nanos;
+
+  uint64_t get_cpu_nanos;
+  uint64_t iter_next_cpu_nanos;
+  uint64_t iter_prev_cpu_nanos;
+  uint64_t iter_seek_cpu_nanos;
+
+  // Time spent in encrypting data. Populated when EncryptedEnv is used.
+  uint64_t encrypt_data_nanos;
+  // Time spent in decrypting data. Populated when EncryptedEnv is used.
+  uint64_t decrypt_data_nanos;
+
+  uint64_t number_async_seek;
+
+  std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
+  bool per_level_perf_context_enabled = false;
+};
+
+// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global,
+// non-thread-local PerfContext object will be returned. Attempts to update
+// this object will be ignored, and reading from it will also be no-op.
+// Otherwise,
+// a) if thread-local is supported on the platform, then a pointer to
+//    a thread-local PerfContext object will be returned.
+// b) if thread-local is NOT supported, then compilation will fail.
+//
+// This function never returns nullptr.
+PerfContext* get_perf_context();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/perf_level.h b/src/rocksdb/include/rocksdb/perf_level.h
new file mode 100644
index 000000000..e7dded0e3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/perf_level.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// How much perf stats to collect. Affects perf_context and iostats_context.
+enum PerfLevel : unsigned char {
+  kUninitialized = 0,             // unknown setting
+  kDisable = 1,                   // disable perf stats
+  kEnableCount = 2,               // enable only count stats
+  kEnableTimeExceptForMutex = 3,  // Other than count stats, also enable time
+                                  // stats except for mutexes
+  // Other than time, also measure CPU time counters. Still don't measure
+  // time (neither wall time nor CPU time) for mutexes.
+  kEnableTimeAndCPUTimeExceptForMutex = 4,
+  kEnableTime = 5,  // enable count and time stats
+  kOutOfBounds = 6  // N.B. Must always be the last value!
+};
+
+// set the perf stats level for current thread
+void SetPerfLevel(PerfLevel level);
+
+// get current perf stats level for current thread
+PerfLevel GetPerfLevel();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/persistent_cache.h b/src/rocksdb/include/rocksdb/persistent_cache.h
new file mode 100644
index 000000000..f14f01999
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/persistent_cache.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PersistentCache
+//
+// Persistent cache interface for caching IO pages on a persistent medium. The
+// cache interface is specifically designed for persistent read cache.
+class PersistentCache {
+ public:
+  using StatsType = std::vector<std::map<std::string, double>>;
+
+  virtual ~PersistentCache() {}
+
+  // Insert to page cache
+  //
+  // page_key   Identifier to identify a page uniquely across restarts
+  // data       Page data to copy (caller retains ownership)
+  // size       Size of the page
+  virtual Status Insert(const Slice& key, const char* data,
+                        const size_t size) = 0;
+
+  // Lookup page cache by page identifier
+  //
+  // page_key   Page identifier
+  // buf        Buffer where the data should be copied
+  // size       Size of the page
+  virtual Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+                        size_t* size) = 0;
+
+  // True if the cache is configured to store serialized blocks, which are
+  // potentially compressed and include a trailer (when SST format calls for
+  // one). False if the cache stores uncompressed blocks (no trailer).
+  virtual bool IsCompressed() = 0;
+
+  // Return stats as map of {string, double} per-tier
+  //
+  // Persistent cache can be initialized as a tier of caches. The stats are per
+  // tire top-down
+  virtual StatsType Stats() = 0;
+
+  virtual std::string GetPrintableOptions() const = 0;
+
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharding the same persistent cache to partition the key space.  Typically
+  // the client will allocate a new id at startup and prepend the id to its
+  // cache keys.
+  virtual uint64_t NewId() = 0;
+};
+
+// Factor method to create a new persistent cache
+Status NewPersistentCache(Env* const env, const std::string& path,
+                          const uint64_t size,
+                          const std::shared_ptr<Logger>& log,
+                          const bool optimized_for_nvm,
+                          std::shared_ptr<PersistentCache>* cache);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/rate_limiter.h b/src/rocksdb/include/rocksdb/rate_limiter.h
new file mode 100644
index 000000000..9cad6edf4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/rate_limiter.h
@@ -0,0 +1,159 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class RateLimiter {
+ public:
+  enum class OpType {
+    kRead,
+    kWrite,
+  };
+
+  enum class Mode {
+    kReadsOnly,
+    kWritesOnly,
+    kAllIo,
+  };
+
+  // For API compatibility, default to rate-limiting writes only.
+  explicit RateLimiter(Mode mode = Mode::kWritesOnly) : mode_(mode) {}
+
+  virtual ~RateLimiter() {}
+
+  // This API allows user to dynamically change rate limiter's bytes per second.
+  // REQUIRED: bytes_per_second > 0
+  virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0;
+
+  // Deprecated. New RateLimiter derived classes should override
+  // Request(const int64_t, const Env::IOPriority, Statistics*) or
+  // Request(const int64_t, const Env::IOPriority, Statistics*, OpType)
+  // instead.
+  //
+  // Request for token for bytes. If this request can not be satisfied, the call
+  // is blocked. Caller is responsible to make sure
+  // bytes <= GetSingleBurstBytes()
+  // and bytes >= 0.
+  virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) {
+    assert(false);
+  }
+
+  // Request for token for bytes and potentially update statistics. If this
+  // request can not be satisfied, the call is blocked. Caller is responsible to
+  // make sure bytes <= GetSingleBurstBytes()
+  // and bytes >= 0.
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+                       Statistics* /* stats */) {
+    // For API compatibility, default implementation calls the older API in
+    // which statistics are unsupported.
+    Request(bytes, pri);
+  }
+
+  // Requests token to read or write bytes and potentially updates statistics.
+  //
+  // If this request can not be satisfied, the call is blocked. Caller is
+  // responsible to make sure bytes <= GetSingleBurstBytes()
+  // and bytes >= 0.
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+                       Statistics* stats, OpType op_type) {
+    if (IsRateLimited(op_type)) {
+      Request(bytes, pri, stats);
+    }
+  }
+
+  // Requests token to read or write bytes and potentially updates statistics.
+  // Takes into account GetSingleBurstBytes() and alignment (e.g., in case of
+  // direct I/O) to allocate an appropriate number of bytes, which may be less
+  // than the number of bytes requested.
+  virtual size_t RequestToken(size_t bytes, size_t alignment,
+                              Env::IOPriority io_priority, Statistics* stats,
+                              RateLimiter::OpType op_type);
+
+  // Max bytes can be granted in a single burst
+  virtual int64_t GetSingleBurstBytes() const = 0;
+
+  // Total bytes that go through rate limiter
+  virtual int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+
+  // Total # of requests that go through rate limiter
+  virtual int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
+
+  // Total # of requests that are pending for bytes in rate limiter
+  // For convenience, this function is supported by the RateLimiter returned
+  // by NewGenericRateLimiter but is not required by RocksDB.
+  //
+  // REQUIRED: total_pending_request != nullptr
+  virtual Status GetTotalPendingRequests(
+      int64_t* total_pending_requests,
+      const Env::IOPriority pri = Env::IO_TOTAL) const {
+    assert(total_pending_requests != nullptr);
+    (void)total_pending_requests;
+    (void)pri;
+    return Status::NotSupported();
+  }
+
+  virtual int64_t GetBytesPerSecond() const = 0;
+
+  virtual bool IsRateLimited(OpType op_type) {
+    if ((mode_ == RateLimiter::Mode::kWritesOnly &&
+         op_type == RateLimiter::OpType::kRead) ||
+        (mode_ == RateLimiter::Mode::kReadsOnly &&
+         op_type == RateLimiter::OpType::kWrite)) {
+      return false;
+    }
+    return true;
+  }
+
+ protected:
+  Mode GetMode() { return mode_; }
+
+ private:
+  const Mode mode_;
+};
+
+// Create a RateLimiter object, which can be shared among RocksDB instances to
+// control write rate of flush and compaction.
+// @rate_bytes_per_sec: this is the only parameter you want to set most of the
+// time. It controls the total write rate of compaction and flush in bytes per
+// second. Currently, RocksDB does not enforce rate limit for anything other
+// than flush and compaction, e.g. write to WAL.
+// @refill_period_us: this controls how often tokens are refilled. For example,
+// when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
+// 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to
+// burstier writes while smaller value introduces more CPU overhead.
+// The default should work for most cases.
+// @fairness: RateLimiter accepts high-pri requests and low-pri requests.
+// A low-pri request is usually blocked in favor of hi-pri request. Currently,
+// RocksDB assigns low-pri to request from compaction and high-pri to request
+// from flush. Low-pri requests can get blocked if flush requests come in
+// continuously. This fairness parameter grants low-pri requests permission by
+// 1/fairness chance even though high-pri requests exist to avoid starvation.
+// You should be good by leaving it at default 10.
+// @mode: Mode indicates which types of operations count against the limit.
+// @auto_tuned: Enables dynamic adjustment of rate limit within the range
+//              `[rate_bytes_per_sec / 20, rate_bytes_per_sec]`, according to
+//              the recent demand for background I/O.
+extern RateLimiter* NewGenericRateLimiter(
+    int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000,
+    int32_t fairness = 10,
+    RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly,
+    bool auto_tuned = false);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/rocksdb_namespace.h b/src/rocksdb/include/rocksdb/rocksdb_namespace.h
new file mode 100644
index 000000000..a339ec2aa
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/rocksdb_namespace.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// For testing purposes
+#if ROCKSDB_NAMESPACE == 42
+#undef ROCKSDB_NAMESPACE
+#endif
+
+// Normal logic
+#ifndef ROCKSDB_NAMESPACE
+#define ROCKSDB_NAMESPACE rocksdb
+#endif
diff --git a/src/rocksdb/include/rocksdb/secondary_cache.h b/src/rocksdb/include/rocksdb/secondary_cache.h
new file mode 100644
index 000000000..a6a8c8b1d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/secondary_cache.h
@@ -0,0 +1,133 @@
+// Copyright (c) 2021, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A handle for lookup result. The handle may not be immediately ready or
+// have a valid value. The caller must call isReady() to determine if its
+// ready, and call Wait() in order to block until it becomes ready.
+// The caller must call value() after it becomes ready to determine if the
+// handle successfullly read the item.
+class SecondaryCacheResultHandle {
+ public:
+  virtual ~SecondaryCacheResultHandle() = default;
+
+  // Returns whether the handle is ready or not
+  virtual bool IsReady() = 0;
+
+  // Block until handle becomes ready
+  virtual void Wait() = 0;
+
+  // Return the value. If nullptr, it means the lookup was unsuccessful
+  virtual void* Value() = 0;
+
+  // Return the size of value
+  virtual size_t Size() = 0;
+};
+
+// SecondaryCache
+//
+// Cache interface for caching blocks on a secondary tier (which can include
+// non-volatile media, or alternate forms of caching such as compressed data)
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class SecondaryCache : public Customizable {
+ public:
+  ~SecondaryCache() override = default;
+
+  static const char* Type() { return "SecondaryCache"; }
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<SecondaryCache>* result);
+
+  // Insert the given value into this cache. Ownership of `value` is
+  // transferred to the callee, who is reponsible for deleting the value
+  // with helper->del_cb if del_cb is not nullptr. Unlike Cache::Insert(),
+  // the callee is responsible for such cleanup even in case of non-OK
+  // Status.
+  // Typically, the value is not saved directly but the implementation
+  // uses the SaveToCallback provided by helper to extract value's
+  // persistable data (typically uncompressed block), which will be written
+  // to this tier. The implementation may or may not write it to cache
+  // depending on the admission control policy, even if the return status
+  // is success (OK).
+  //
+  // If the implementation is asynchronous or otherwise uses `value` after
+  // the call returns, then InsertSaved() must be overridden not to rely on
+  // Insert(). For example, there could be a "holding area" in memory where
+  // Lookup() might return the same parsed value back. But more typically, if
+  // the implementation only uses `value` for getting persistable data during
+  // the call, then the default implementation of `InsertSaved()` suffices.
+  virtual Status Insert(const Slice& key, void* value,
+                        const Cache::CacheItemHelper* helper) = 0;
+
+  // Insert a value from its saved/persistable data (typically uncompressed
+  // block), as if generated by SaveToCallback/SizeCallback. This can be used
+  // in "warming up" the cache from some auxiliary source, and like Insert()
+  // may or may not write it to cache depending on the admission control
+  // policy, even if the return status is success.
+  //
+  // The default implementation assumes synchronous, non-escaping Insert(),
+  // wherein `value` is not used after return of Insert(). See Insert().
+  virtual Status InsertSaved(const Slice& key, const Slice& saved);
+
+  // Lookup the data for the given key in this cache. The create_cb
+  // will be used to create the object. The handle returned may not be
+  // ready yet, unless wait=true, in which case Lookup() will block until
+  // the handle is ready.
+  //
+  // advise_erase is a hint from the primary cache indicating that the handle
+  // will be cached there, so the secondary cache is advised to drop it from
+  // the cache as an optimization. To use this feature, SupportForceErase()
+  // needs to return true.
+  // This hint can also be safely ignored.
+  //
+  // is_in_sec_cache is to indicate whether the handle is possibly erased
+  // from the secondary cache after the Lookup.
+  virtual std::unique_ptr<SecondaryCacheResultHandle> Lookup(
+      const Slice& key, const Cache::CreateCallback& create_cb, bool wait,
+      bool advise_erase, bool& is_in_sec_cache) = 0;
+
+  // Indicate whether a handle can be erased in this secondary cache.
+  [[nodiscard]] virtual bool SupportForceErase() const = 0;
+
+  // At the discretion of the implementation, erase the data associated
+  // with key.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Wait for a collection of handles to become ready.
+  virtual void WaitAll(std::vector<SecondaryCacheResultHandle*> handles) = 0;
+
+  // Set the maximum configured capacity of the cache.
+  // When the new capacity is less than the old capacity and the existing usage
+  // is greater than new capacity, the implementation will do its best job to
+  // purge the released entries from the cache in order to lower the usage.
+  //
+  // The derived class can make this function no-op and return NotSupported().
+  virtual Status SetCapacity(size_t /* capacity */) {
+    return Status::NotSupported();
+  }
+
+  // The derived class can make this function no-op and return NotSupported().
+  virtual Status GetCapacity(size_t& /* capacity */) {
+    return Status::NotSupported();
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/slice.h b/src/rocksdb/include/rocksdb/slice.h
new file mode 100644
index 000000000..0d7eb5949
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/slice.h
@@ -0,0 +1,264 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size.  The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+//
+// Multiple threads can invoke const methods on a Slice without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Slice must use
+// external synchronization.
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <string_view>  // RocksDB now requires C++17 support
+
+#include "rocksdb/cleanable.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice {
+ public:
+  // Create an empty slice.
+  Slice() : data_(""), size_(0) {}
+
+  // Create a slice that refers to d[0,n-1].
+  Slice(const char* d, size_t n) : data_(d), size_(n) {}
+
+  // Create a slice that refers to the contents of "s"
+  /* implicit */
+  Slice(const std::string& s) : data_(s.data()), size_(s.size()) {}
+
+  // Create a slice that refers to the same contents as "sv"
+  /* implicit */
+  Slice(const std::string_view& sv) : data_(sv.data()), size_(sv.size()) {}
+
+  // Create a slice that refers to s[0,strlen(s)-1]
+  /* implicit */
+  Slice(const char* s) : data_(s) { size_ = (s == nullptr) ? 0 : strlen(s); }
+
+  // Create a single slice from SliceParts using buf as storage.
+  // buf must exist as long as the returned Slice exists.
+  Slice(const struct SliceParts& parts, std::string* buf);
+
+  // Return a pointer to the beginning of the referenced data
+  const char* data() const { return data_; }
+
+  // Return the length (in bytes) of the referenced data
+  size_t size() const { return size_; }
+
+  // Return true iff the length of the referenced data is zero
+  bool empty() const { return size_ == 0; }
+
+  // Return the ith byte in the referenced data.
+  // REQUIRES: n < size()
+  char operator[](size_t n) const {
+    assert(n < size());
+    return data_[n];
+  }
+
+  // Change this slice to refer to an empty array
+  void clear() {
+    data_ = "";
+    size_ = 0;
+  }
+
+  // Drop the first "n" bytes from this slice.
+  void remove_prefix(size_t n) {
+    assert(n <= size());
+    data_ += n;
+    size_ -= n;
+  }
+
+  void remove_suffix(size_t n) {
+    assert(n <= size());
+    size_ -= n;
+  }
+
+  // Return a string that contains the copy of the referenced data.
+  // when hex is true, returns a string of twice the length hex encoded (0-9A-F)
+  std::string ToString(bool hex = false) const;
+
+  // Return a string_view that references the same data as this slice.
+  std::string_view ToStringView() const {
+    return std::string_view(data_, size_);
+  }
+
+  // Decodes the current slice interpreted as an hexadecimal string into result,
+  // if successful returns true, if this isn't a valid hex string
+  // (e.g not coming from Slice::ToString(true)) DecodeHex returns false.
+  // This slice is expected to have an even number of 0-9A-F characters
+  // also accepts lowercase (a-f)
+  bool DecodeHex(std::string* result) const;
+
+  // Three-way comparison.  Returns value:
+  //   <  0 iff "*this" <  "b",
+  //   == 0 iff "*this" == "b",
+  //   >  0 iff "*this" >  "b"
+  int compare(const Slice& b) const;
+
+  // Return true iff "x" is a prefix of "*this"
+  bool starts_with(const Slice& x) const {
+    return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
+  }
+
+  bool ends_with(const Slice& x) const {
+    return ((size_ >= x.size_) &&
+            (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0));
+  }
+
+  // Compare two slices and returns the first byte where they differ
+  size_t difference_offset(const Slice& b) const;
+
+  // private: make these public for rocksdbjni access
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+/**
+ * A Slice that can be pinned with some cleanup tasks, which will be run upon
+ * ::Reset() or object destruction, whichever is invoked first. This can be used
+ * to avoid memcpy by having the PinnableSlice object referring to the data
+ * that is locked in the memory and release them after the data is consumed.
+ */
+class PinnableSlice : public Slice, public Cleanable {
+ public:
+  PinnableSlice() { buf_ = &self_space_; }
+  explicit PinnableSlice(std::string* buf) { buf_ = buf; }
+
+  PinnableSlice(PinnableSlice&& other);
+  PinnableSlice& operator=(PinnableSlice&& other);
+
+  // No copy constructor and copy assignment allowed.
+  PinnableSlice(PinnableSlice&) = delete;
+  PinnableSlice& operator=(PinnableSlice&) = delete;
+
+  inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1,
+                       void* arg2) {
+    assert(!pinned_);
+    pinned_ = true;
+    data_ = s.data();
+    size_ = s.size();
+    RegisterCleanup(f, arg1, arg2);
+    assert(pinned_);
+  }
+
+  inline void PinSlice(const Slice& s, Cleanable* cleanable) {
+    assert(!pinned_);
+    pinned_ = true;
+    data_ = s.data();
+    size_ = s.size();
+    if (cleanable != nullptr) {
+      cleanable->DelegateCleanupsTo(this);
+    }
+    assert(pinned_);
+  }
+
+  inline void PinSelf(const Slice& slice) {
+    assert(!pinned_);
+    buf_->assign(slice.data(), slice.size());
+    data_ = buf_->data();
+    size_ = buf_->size();
+    assert(!pinned_);
+  }
+
+  inline void PinSelf() {
+    assert(!pinned_);
+    data_ = buf_->data();
+    size_ = buf_->size();
+    assert(!pinned_);
+  }
+
+  void remove_suffix(size_t n) {
+    assert(n <= size());
+    if (pinned_) {
+      size_ -= n;
+    } else {
+      buf_->erase(size() - n, n);
+      PinSelf();
+    }
+  }
+
+  void remove_prefix(size_t n) {
+    assert(n <= size());
+    if (pinned_) {
+      data_ += n;
+      size_ -= n;
+    } else {
+      buf_->erase(0, n);
+      PinSelf();
+    }
+  }
+
+  void Reset() {
+    Cleanable::Reset();
+    pinned_ = false;
+    size_ = 0;
+  }
+
+  inline std::string* GetSelf() { return buf_; }
+
+  inline bool IsPinned() const { return pinned_; }
+
+ private:
+  friend class PinnableSlice4Test;
+  std::string self_space_;
+  std::string* buf_;
+  bool pinned_ = false;
+};
+
+// A set of Slices that are virtually concatenated together.  'parts' points
+// to an array of Slices.  The number of elements in the array is 'num_parts'.
+struct SliceParts {
+  SliceParts(const Slice* _parts, int _num_parts)
+      : parts(_parts), num_parts(_num_parts) {}
+  SliceParts() : parts(nullptr), num_parts(0) {}
+
+  const Slice* parts;
+  int num_parts;
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+  return ((x.size() == y.size()) &&
+          (memcmp(x.data(), y.data(), x.size()) == 0));
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) { return !(x == y); }
+
+inline int Slice::compare(const Slice& b) const {
+  assert(data_ != nullptr && b.data_ != nullptr);
+  const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
+  int r = memcmp(data_, b.data_, min_len);
+  if (r == 0) {
+    if (size_ < b.size_)
+      r = -1;
+    else if (size_ > b.size_)
+      r = +1;
+  }
+  return r;
+}
+
+inline size_t Slice::difference_offset(const Slice& b) const {
+  size_t off = 0;
+  const size_t len = (size_ < b.size_) ? size_ : b.size_;
+  for (; off < len; off++) {
+    if (data_[off] != b.data_[off]) break;
+  }
+  return off;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/slice_transform.h b/src/rocksdb/include/rocksdb/slice_transform.h
new file mode 100644
index 000000000..8909b9c53
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/slice_transform.h
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Class for specifying user-defined functions which perform a
+// transformation on a slice.  It is not required that every slice
+// belong to the domain and/or range of a function.  Subclasses should
+// define InDomain and InRange to determine which slices are in either
+// of these sets respectively.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+struct ConfigOptions;
+
+// A SliceTransform is a generic pluggable way of transforming one string
+// to another. Its primary use-case is in configuring RocksDB prefix Bloom
+// filters, by setting prefix_extractor in ColumnFamilyOptions.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class SliceTransform : public Customizable {
+ public:
+  virtual ~SliceTransform(){};
+
+  // Return the name of this transformation.
+  virtual const char* Name() const override = 0;
+  static const char* Type() { return "SliceTransform"; }
+
+  // Creates and configures a new SliceTransform from the input options and id.
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<const SliceTransform>* result);
+
+  // Returns a string representation of this SliceTransform, representing the ID
+  // and any additional properties.
+  std::string AsString() const;
+
+  // Extract a prefix from a specified key, partial key, iterator upper bound,
+  // etc. This is normally used for building and checking prefix Bloom filters
+  // but should accept any string for which InDomain() returns true.
+  // See ColumnFamilyOptions::prefix_extractor for specific properties that
+  // must be satisfied by prefix extractors.
+  virtual Slice Transform(const Slice& key) const = 0;
+
+  // Determine whether the specified key is compatible with the logic
+  // specified in the Transform method. Keys for which InDomain returns
+  // false will not be added to or queried against prefix Bloom filters.
+  //
+  // For example, if the Transform method returns a fixed length
+  // prefix of size 4, then an invocation to InDomain("abc") returns
+  // false because the specified key length(3) is shorter than the
+  // prefix size of 4.
+  //
+  // Wiki documentation here:
+  // https://github.com/facebook/rocksdb/wiki/Prefix-Seek
+  //
+  virtual bool InDomain(const Slice& key) const = 0;
+
+  // DEPRECATED: This is currently not used and remains here for backward
+  // compatibility.
+  virtual bool InRange(const Slice& /*dst*/) const { return false; }
+
+  // Returns information on maximum prefix length, if there is one.
+  // If Transform(x).size() == n for some keys and otherwise < n,
+  // should return true and set *len = n. Returning false is safe but
+  // currently disables some auto_prefix_mode filtering.
+  // Specifically, if the iterate_upper_bound is the immediate successor (see
+  // Comparator::IsSameLengthImmediateSuccessor) of the seek key's prefix,
+  // we require this function return true and iterate_upper_bound.size() == n
+  // to recognize and optimize the prefix seek.
+  // Otherwise (including FullLengthEnabled returns false, or prefix length is
+  // less than maximum), Seek with auto_prefix_mode is only optimized if the
+  // iterate_upper_bound and seek key have the same prefix.
+  // BUG: Despite all these conditions and even with the extra condition on
+  // IsSameLengthImmediateSuccessor (see it's "BUG" section), it is not
+  // sufficient to ensure auto_prefix_mode returns all entries that
+  // total_order_seek would return. See auto_prefix_mode "BUG" section.
+  virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; }
+
+  // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix.
+  //
+  // This function is not used by RocksDB, but for users. If users pass
+  // Options by string to RocksDB, they might not know what prefix extractor
+  // they are using. This function is to help users can determine:
+  //   if they want to iterate all keys prefixing `prefix`, whether it is
+  //   safe to use prefix bloom filter and seek to key `prefix`.
+  // If this function returns true, this means a user can Seek() to a prefix
+  // using the bloom filter. Otherwise, user needs to skip the bloom filter
+  // by setting ReadOptions.total_order_seek = true.
+  //
+  // Here is an example: Suppose we implement a slice transform that returns
+  // the first part of the string up to and including first ",":
+  // 1. SameResultWhenAppended("abc,") should return true. If applying prefix
+  //    bloom filter using it, all slices matching "abc,.*" will be extracted
+  //    to "abc,", so any SST file or memtable containing any of those key
+  //    will not be filtered out.
+  // 2. SameResultWhenAppended("abc") should return false. A user will not be
+  //    guaranteed to see all the keys matching "abc.*" if a user prefix
+  //    seeks to "abc" against a DB with the same setting. If one SST file
+  //    only contains "abcd,e", the file can be filtered out and the key will
+  //    be invisible, because the prefix according to the configured extractor
+  //    is "abcd,".
+  //
+  // i.e., an implementation always returning false is safe.
+  virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const {
+    return false;
+  }
+};
+
+// The prefix is the first `prefix_len` bytes of the key, and keys shorter
+// then `prefix_len` are not InDomain.
+extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len);
+
+// The prefix is the first min(length(key),`cap_len`) bytes of the key, and
+// all keys are InDomain.
+extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len);
+
+// Prefix is equal to key. All keys are InDomain.
+extern const SliceTransform* NewNoopTransform();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/snapshot.h b/src/rocksdb/include/rocksdb/snapshot.h
new file mode 100644
index 000000000..1ea56e71e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/snapshot.h
@@ -0,0 +1,53 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+//
+// To Create a Snapshot, call DB::GetSnapshot().
+// To Destroy a Snapshot, call DB::ReleaseSnapshot(snapshot).
+class Snapshot {
+ public:
+  virtual SequenceNumber GetSequenceNumber() const = 0;
+
+  // Returns unix time i.e. the number of seconds since the Epoch, 1970-01-01
+  // 00:00:00 (UTC).
+  virtual int64_t GetUnixTime() const = 0;
+
+  virtual uint64_t GetTimestamp() const = 0;
+
+ protected:
+  virtual ~Snapshot();
+};
+
+// Simple RAII wrapper class for Snapshot.
+// Constructing this object will create a snapshot.  Destructing will
+// release the snapshot.
+class ManagedSnapshot {
+ public:
+  explicit ManagedSnapshot(DB* db);
+
+  // Instead of creating a snapshot, take ownership of the input snapshot.
+  ManagedSnapshot(DB* db, const Snapshot* _snapshot);
+
+  ~ManagedSnapshot();
+
+  const Snapshot* snapshot();
+
+ private:
+  DB* db_;
+  const Snapshot* snapshot_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/sst_dump_tool.h b/src/rocksdb/include/rocksdb/sst_dump_tool.h
new file mode 100644
index 000000000..9261ba47d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_dump_tool.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+#pragma once
+
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SSTDumpTool {
+ public:
+  int Run(int argc, char const* const* argv, Options options = Options());
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_file_manager.h b/src/rocksdb/include/rocksdb/sst_file_manager.h
new file mode 100644
index 000000000..613292151
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_manager.h
@@ -0,0 +1,136 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Logger;
+
+// SstFileManager is used to track SST and blob files in the DB and control
+// their deletion rate. All SstFileManager public functions are thread-safe.
+// SstFileManager is NOT an extensible interface but a public interface for
+// result of NewSstFileManager. Any derived classes must be RocksDB internal.
+class SstFileManager {
+ public:
+  virtual ~SstFileManager() {}
+
+  // Update the maximum allowed space that should be used by RocksDB, if
+  // the total size of the SST and blob files exceeds max_allowed_space, writes
+  // to RocksDB will fail.
+  //
+  // Setting max_allowed_space to 0 will disable this feature; maximum allowed
+  // space will be infinite (Default value).
+  //
+  // thread-safe.
+  virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0;
+
+  // Set the amount of buffer room each compaction should be able to leave.
+  // In other words, at its maximum disk space consumption, the compaction
+  // should still leave compaction_buffer_size available on the disk so that
+  // other background functions may continue, such as logging and flushing.
+  virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0;
+
+  // Return true if the total size of SST  and blob files exceeded the maximum
+  // allowed space usage.
+  //
+  // thread-safe.
+  virtual bool IsMaxAllowedSpaceReached() = 0;
+
+  // Returns true if the total size of SST and blob files as well as estimated
+  // size of ongoing compactions exceeds the maximums allowed space usage.
+  virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0;
+
+  // Return the total size of all tracked files.
+  // thread-safe
+  virtual uint64_t GetTotalSize() = 0;
+
+  // Return a map containing all tracked files and their corresponding sizes.
+  // thread-safe
+  virtual std::unordered_map<std::string, uint64_t> GetTrackedFiles() = 0;
+
+  // Return delete rate limit in bytes per second.
+  // thread-safe
+  virtual int64_t GetDeleteRateBytesPerSecond() = 0;
+
+  // Update the delete rate limit in bytes per second.
+  // zero means disable delete rate limiting and delete files immediately
+  // thread-safe
+  virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) = 0;
+
+  // Return trash/DB size ratio where new files will be deleted immediately
+  // thread-safe
+  virtual double GetMaxTrashDBRatio() = 0;
+
+  // Update trash/DB size ratio where new files will be deleted immediately
+  // thread-safe
+  virtual void SetMaxTrashDBRatio(double ratio) = 0;
+
+  // Return the total size of trash files
+  // thread-safe
+  virtual uint64_t GetTotalTrashSize() = 0;
+
+  // Set the statistics ptr to dump the stat information
+  virtual void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) = 0;
+};
+
+// Create a new SstFileManager that can be shared among multiple RocksDB
+// instances to track SST and blob files and control there deletion rate.
+// Even though SstFileManager don't track WAL files but it still control
+// there deletion rate.
+//
+// @param env: Pointer to Env object, please see "rocksdb/env.h".
+// @param fs: Pointer to FileSystem object (rocksdb/file_system.h"
+// @param info_log: If not nullptr, info_log will be used to log errors.
+//
+// == Deletion rate limiting specific arguments ==
+// @param trash_dir: Deprecated, this argument have no effect
+// @param rate_bytes_per_sec: How many bytes should be deleted per second, If
+//    this value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb
+//    in 1 second, we will wait for another 3 seconds before we delete other
+//    files, Set to 0 to disable deletion rate limiting.
+//    This option also affect the delete rate of WAL files in the DB.
+// @param delete_existing_trash: Deprecated, this argument have no effect, but
+//    if user provide trash_dir we will schedule deletes for files in the dir
+// @param status: If not nullptr, status will contain any errors that happened
+//    during creating the missing trash_dir or deleting existing files in trash.
+// @param max_trash_db_ratio: If the trash size constitutes for more than this
+//    fraction of the total DB size we will start deleting new files passed to
+//    DeleteScheduler immediately
+// @param bytes_max_delete_chunk: if a file to delete is larger than delete
+//    chunk, ftruncate the file by this size each time, rather than dropping the
+//    whole file. 0 means to always delete the whole file. If the file has more
+//    than one linked names, the file will be deleted as a whole. Either way,
+//    `rate_bytes_per_sec` will be appreciated. NOTE that with this option,
+//    files already renamed as a trash may be partial, so users should not
+//    directly recover them without checking.
+extern SstFileManager* NewSstFileManager(
+    Env* env, std::shared_ptr<FileSystem> fs,
+    std::shared_ptr<Logger> info_log = nullptr,
+    const std::string& trash_dir = "", int64_t rate_bytes_per_sec = 0,
+    bool delete_existing_trash = true, Status* status = nullptr,
+    double max_trash_db_ratio = 0.25,
+    uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
+
+// Same as above, but takes a pointer to a legacy Env object, instead of
+// Env and FileSystem objects
+extern SstFileManager* NewSstFileManager(
+    Env* env, std::shared_ptr<Logger> info_log = nullptr,
+    std::string trash_dir = "", int64_t rate_bytes_per_sec = 0,
+    bool delete_existing_trash = true, Status* status = nullptr,
+    double max_trash_db_ratio = 0.25,
+    uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/sst_file_reader.h b/src/rocksdb/include/rocksdb/sst_file_reader.h
new file mode 100644
index 000000000..4b8642480
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_reader.h
@@ -0,0 +1,47 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SstFileReader is used to read sst files that are generated by DB or
+// SstFileWriter.
+class SstFileReader {
+ public:
+  SstFileReader(const Options& options);
+
+  ~SstFileReader();
+
+  // Prepares to read from the file located at "file_path".
+  Status Open(const std::string& file_path);
+
+  // Returns a new iterator over the table contents.
+  // Most read options provide the same control as we read from DB.
+  // If "snapshot" is nullptr, the iterator returns only the latest keys.
+  Iterator* NewIterator(const ReadOptions& options);
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const;
+
+  // Verifies whether there is corruption in this table.
+  Status VerifyChecksum(const ReadOptions& /*read_options*/);
+
+  Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
+
+ private:
+  struct Rep;
+  std::unique_ptr<Rep> rep_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_file_writer.h b/src/rocksdb/include/rocksdb/sst_file_writer.h
new file mode 100644
index 000000000..c85f097a5
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_writer.h
@@ -0,0 +1,174 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+
+#if defined(__GNUC__) || defined(__clang__)
+#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
+#elif _WIN32
+#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+
+// ExternalSstFileInfo include information about sst files created
+// using SstFileWriter.
+struct ExternalSstFileInfo {
+  ExternalSstFileInfo()
+      : file_path(""),
+        smallest_key(""),
+        largest_key(""),
+        smallest_range_del_key(""),
+        largest_range_del_key(""),
+        file_checksum(""),
+        file_checksum_func_name(""),
+        sequence_number(0),
+        file_size(0),
+        num_entries(0),
+        num_range_del_entries(0),
+        version(0) {}
+
+  ExternalSstFileInfo(const std::string& _file_path,
+                      const std::string& _smallest_key,
+                      const std::string& _largest_key,
+                      SequenceNumber _sequence_number, uint64_t _file_size,
+                      int32_t _num_entries, int32_t _version)
+      : file_path(_file_path),
+        smallest_key(_smallest_key),
+        largest_key(_largest_key),
+        smallest_range_del_key(""),
+        largest_range_del_key(""),
+        file_checksum(""),
+        file_checksum_func_name(""),
+        sequence_number(_sequence_number),
+        file_size(_file_size),
+        num_entries(_num_entries),
+        num_range_del_entries(0),
+        version(_version) {}
+
+  std::string file_path;     // external sst file path
+  std::string smallest_key;  // smallest user key in file
+  std::string largest_key;   // largest user key in file
+  std::string
+      smallest_range_del_key;  // smallest range deletion user key in file
+  std::string largest_range_del_key;  // largest range deletion user key in file
+  std::string file_checksum;          // sst file checksum;
+  std::string file_checksum_func_name;  // The name of file checksum function
+  SequenceNumber sequence_number;       // sequence number of all keys in file
+  uint64_t file_size;                   // file size in bytes
+  uint64_t num_entries;                 // number of entries in file
+  uint64_t num_range_del_entries;  // number of range deletion entries in file
+  int32_t version;                 // file version
+};
+
+// SstFileWriter is used to create sst files that can be added to database later
+// All keys in files generated by SstFileWriter will have sequence number = 0.
+class SstFileWriter {
+ public:
+  // User can pass `column_family` to specify that the generated file will
+  // be ingested into this column_family, note that passing nullptr means that
+  // the column_family is unknown.
+  // If invalidate_page_cache is set to true, SstFileWriter will give the OS a
+  // hint that this file pages is not needed every time we write 1MB to the
+  // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be
+  // passed.
+  // The `skip_filters` option is DEPRECATED and could be removed in the
+  // future. Use `BlockBasedTableOptions::filter_policy` to control filter
+  // generation.
+  SstFileWriter(const EnvOptions& env_options, const Options& options,
+                ColumnFamilyHandle* column_family = nullptr,
+                bool invalidate_page_cache = true,
+                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
+                bool skip_filters = false)
+      : SstFileWriter(env_options, options, options.comparator, column_family,
+                      invalidate_page_cache, io_priority, skip_filters) {}
+
+  // Deprecated API
+  SstFileWriter(const EnvOptions& env_options, const Options& options,
+                const Comparator* user_comparator,
+                ColumnFamilyHandle* column_family = nullptr,
+                bool invalidate_page_cache = true,
+                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
+                bool skip_filters = false);
+
+  ~SstFileWriter();
+
+  // Prepare SstFileWriter to write into file located at "file_path".
+  Status Open(const std::string& file_path);
+
+  // Add a Put key with value to currently opened file (deprecated)
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: comparator is *not* timestamp-aware.
+  ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value);
+
+  // Add a Put key with value to currently opened file
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: comparator is *not* timestamp-aware.
+  Status Put(const Slice& user_key, const Slice& value);
+
+  // Add a Put (key with timestamp, value) to the currently opened file
+  // REQUIRES: key is after any previously added key according to the
+  // comparator.
+  // REQUIRES: the timestamp's size is equal to what is expected by
+  // the comparator.
+  Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value);
+
+  // Add a Merge key with value to currently opened file
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: comparator is *not* timestamp-aware.
+  Status Merge(const Slice& user_key, const Slice& value);
+
+  // Add a deletion key to currently opened file
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: comparator is *not* timestamp-aware.
+  Status Delete(const Slice& user_key);
+
+  // Add a deletion key with timestamp to the currently opened file
+  // REQUIRES: key is after any previously added key according to the
+  // comparator.
+  // REQUIRES: the timestamp's size is equal to what is expected by
+  // the comparator.
+  Status Delete(const Slice& user_key, const Slice& timestamp);
+
+  // Add a range deletion tombstone to currently opened file
+  // REQUIRES: comparator is *not* timestamp-aware.
+  Status DeleteRange(const Slice& begin_key, const Slice& end_key);
+
+  // Add a range deletion tombstone to currently opened file.
+  // REQUIRES: begin_key and end_key are user keys without timestamp.
+  // REQUIRES: the timestamp's size is equal to what is expected by
+  // the comparator.
+  Status DeleteRange(const Slice& begin_key, const Slice& end_key,
+                     const Slice& timestamp);
+
+  // Finalize writing to sst file and close file.
+  //
+  // An optional ExternalSstFileInfo pointer can be passed to the function
+  // which will be populated with information about the created sst file.
+  Status Finish(ExternalSstFileInfo* file_info = nullptr);
+
+  // Return the current file size.
+  uint64_t FileSize();
+
+ private:
+  void InvalidatePageCache(bool closing);
+  struct Rep;
+  std::unique_ptr<Rep> rep_;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/sst_partitioner.h b/src/rocksdb/include/rocksdb/sst_partitioner.h
new file mode 100644
index 000000000..3af8e9492
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_partitioner.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+enum PartitionerResult : char {
+  // Partitioner does not require to create new file
+  kNotRequired = 0x0,
+  // Partitioner is requesting forcefully to create new file
+  kRequired = 0x1
+  // Additional constants can be added
+};
+
+struct PartitionerRequest {
+  PartitionerRequest(const Slice& prev_user_key_,
+                     const Slice& current_user_key_,
+                     uint64_t current_output_file_size_)
+      : prev_user_key(&prev_user_key_),
+        current_user_key(&current_user_key_),
+        current_output_file_size(current_output_file_size_) {}
+  const Slice* prev_user_key;
+  const Slice* current_user_key;
+  uint64_t current_output_file_size;
+};
+
+/*
+ * A SstPartitioner is a generic pluggable way of defining the partition
+ * of SST files. Compaction job will split the SST files on partition boundary
+ * to lower the write amplification during SST file promote to higher level.
+ */
+class SstPartitioner {
+ public:
+  virtual ~SstPartitioner() {}
+
+  // Return the name of this partitioner.
+  virtual const char* Name() const = 0;
+
+  // It is called for all keys in compaction. When partitioner want to create
+  // new SST file it needs to return true. It means compaction job will finish
+  // current SST file where last key is "prev_user_key" parameter and start new
+  // SST file where first key is "current_user_key". Returns decision if
+  // partition boundary was detected and compaction should create new file.
+  virtual PartitionerResult ShouldPartition(
+      const PartitionerRequest& request) = 0;
+
+  // Called with smallest and largest keys in SST file when compaction try to do
+  // trivial move. Returns true is partitioner allows to do trivial move.
+  virtual bool CanDoTrivialMove(const Slice& smallest_user_key,
+                                const Slice& largest_user_key) = 0;
+
+  // Context information of a compaction run
+  struct Context {
+    // Does this compaction run include all data files
+    bool is_full_compaction;
+    // Is this compaction requested by the client (true),
+    // or is it occurring as an automatic compaction process
+    bool is_manual_compaction;
+    // Output level for this compaction
+    int output_level;
+    // Smallest key for compaction
+    Slice smallest_user_key;
+    // Largest key for compaction
+    Slice largest_user_key;
+  };
+};
+
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class SstPartitionerFactory : public Customizable {
+ public:
+  ~SstPartitionerFactory() override {}
+  static const char* Type() { return "SstPartitionerFactory"; }
+  static Status CreateFromString(
+      const ConfigOptions& options, const std::string& value,
+      std::shared_ptr<SstPartitionerFactory>* result);
+
+  virtual std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& context) const = 0;
+
+  // Returns a name that identifies this partitioner factory.
+  const char* Name() const override = 0;
+};
+
+/*
+ * Fixed key prefix partitioner. It splits the output SST files when prefix
+ * defined by size changes.
+ */
+class SstPartitionerFixedPrefix : public SstPartitioner {
+ public:
+  explicit SstPartitionerFixedPrefix(size_t len) : len_(len) {}
+
+  virtual ~SstPartitionerFixedPrefix() override {}
+
+  const char* Name() const override { return "SstPartitionerFixedPrefix"; }
+
+  PartitionerResult ShouldPartition(const PartitionerRequest& request) override;
+
+  bool CanDoTrivialMove(const Slice& smallest_user_key,
+                        const Slice& largest_user_key) override;
+
+ private:
+  size_t len_;
+};
+
+/*
+ * Factory for fixed prefix partitioner.
+ */
+class SstPartitionerFixedPrefixFactory : public SstPartitionerFactory {
+ public:
+  explicit SstPartitionerFixedPrefixFactory(size_t len);
+
+  ~SstPartitionerFixedPrefixFactory() override {}
+
+  static const char* kClassName() { return "SstPartitionerFixedPrefixFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /* context */) const override;
+
+ private:
+  size_t len_;
+};
+
+extern std::shared_ptr<SstPartitionerFactory>
+NewSstPartitionerFixedPrefixFactory(size_t prefix_len);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/statistics.h b/src/rocksdb/include/rocksdb/statistics.h
new file mode 100644
index 000000000..42a938f30
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/statistics.h
@@ -0,0 +1,707 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/**
+ * Keep adding tickers here.
+ *  1. Any ticker should be added immediately before TICKER_ENUM_MAX, taking
+ *     over its old value.
+ *  2. Add a readable string in TickersNameMap below for the newly added ticker.
+ *  3. Add a corresponding enum value to TickerType.java in the java API
+ *  4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType
+ *     and toCppTickers
+ */
+enum Tickers : uint32_t {
+  // total block cache misses
+  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+  //                               BLOCK_CACHE_FILTER_MISS +
+  //                               BLOCK_CACHE_DATA_MISS;
+  BLOCK_CACHE_MISS = 0,
+  // total block cache hit
+  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+  //                              BLOCK_CACHE_FILTER_HIT +
+  //                              BLOCK_CACHE_DATA_HIT;
+  BLOCK_CACHE_HIT,
+  // # of blocks added to block cache.
+  BLOCK_CACHE_ADD,
+  // # of failures when adding blocks to block cache.
+  BLOCK_CACHE_ADD_FAILURES,
+  // # of times cache miss when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_MISS,
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT,
+  // # of index blocks added to block cache.
+  BLOCK_CACHE_INDEX_ADD,
+  // # of bytes of index blocks inserted into cache
+  BLOCK_CACHE_INDEX_BYTES_INSERT,
+  // # of bytes of index block erased from cache
+  BLOCK_CACHE_INDEX_BYTES_EVICT,
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS,
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT,
+  // # of filter blocks added to block cache.
+  BLOCK_CACHE_FILTER_ADD,
+  // # of bytes of bloom filter blocks inserted into cache
+  BLOCK_CACHE_FILTER_BYTES_INSERT,
+  // # of bytes of bloom filter block erased from cache
+  BLOCK_CACHE_FILTER_BYTES_EVICT,
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS,
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT,
+  // # of data blocks added to block cache.
+  BLOCK_CACHE_DATA_ADD,
+  // # of bytes of data blocks inserted into cache
+  BLOCK_CACHE_DATA_BYTES_INSERT,
+  // # of bytes read from cache.
+  BLOCK_CACHE_BYTES_READ,
+  // # of bytes written into cache.
+  BLOCK_CACHE_BYTES_WRITE,
+
+  // # of times bloom filter has avoided file reads, i.e., negatives.
+  BLOOM_FILTER_USEFUL,
+  // # of times bloom FullFilter has not avoided the reads.
+  BLOOM_FILTER_FULL_POSITIVE,
+  // # of times bloom FullFilter has not avoided the reads and data actually
+  // exist.
+  BLOOM_FILTER_FULL_TRUE_POSITIVE,
+
+  BLOOM_FILTER_MICROS,
+
+  // # persistent cache hit
+  PERSISTENT_CACHE_HIT,
+  // # persistent cache miss
+  PERSISTENT_CACHE_MISS,
+
+  // # total simulation block cache hits
+  SIM_BLOCK_CACHE_HIT,
+  // # total simulation block cache misses
+  SIM_BLOCK_CACHE_MISS,
+
+  // # of memtable hits.
+  MEMTABLE_HIT,
+  // # of memtable misses.
+  MEMTABLE_MISS,
+
+  // # of Get() queries served by L0
+  GET_HIT_L0,
+  // # of Get() queries served by L1
+  GET_HIT_L1,
+  // # of Get() queries served by L2 and up
+  GET_HIT_L2_AND_UP,
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
+   * There are 4 reasons currently.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY,  // key was written with a newer value.
+                                    // Also includes keys dropped for range del.
+  COMPACTION_KEY_DROP_OBSOLETE,     // The key is obsolete.
+  COMPACTION_KEY_DROP_RANGE_DEL,    // key was covered by a range tombstone.
+  COMPACTION_KEY_DROP_USER,  // user compaction function has dropped the key.
+  COMPACTION_RANGE_DEL_DROP_OBSOLETE,  // all keys in range were deleted.
+  // Deletions obsoleted before bottom level due to file gap optimization.
+  COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+  // If a compaction was canceled in sfm to prevent ENOSPC
+  COMPACTION_CANCELLED,
+
+  // Number of keys written to the database via the Put and Write call's
+  NUMBER_KEYS_WRITTEN,
+  // Number of Keys read,
+  NUMBER_KEYS_READ,
+  // Number keys updated, if inplace update is enabled
+  NUMBER_KEYS_UPDATED,
+  // The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
+  // DB::Merge(), and DB::Write().
+  BYTES_WRITTEN,
+  // The number of uncompressed bytes read from DB::Get().  It could be
+  // either from memtables, cache, or table files.
+  // For the number of logical bytes read from DB::MultiGet(),
+  // please use NUMBER_MULTIGET_BYTES_READ.
+  BYTES_READ,
+  // The number of calls to seek/next/prev
+  NUMBER_DB_SEEK,
+  NUMBER_DB_NEXT,
+  NUMBER_DB_PREV,
+  // The number of calls to seek/next/prev that returned data
+  NUMBER_DB_SEEK_FOUND,
+  NUMBER_DB_NEXT_FOUND,
+  NUMBER_DB_PREV_FOUND,
+  // The number of uncompressed bytes read from an iterator.
+  // Includes size of key and value.
+  ITER_BYTES_READ,
+  NO_FILE_CLOSES,
+  NO_FILE_OPENS,
+  NO_FILE_ERRORS,
+  // DEPRECATED Time system had to wait to do LO-L1 compactions
+  STALL_L0_SLOWDOWN_MICROS,
+  // DEPRECATED Time system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS,
+  // DEPRECATED write throttle because of too many files in L0
+  STALL_L0_NUM_FILES_MICROS,
+  // Writer has to wait for compaction or flush to finish.
+  STALL_MICROS,
+  // The wait time for db mutex.
+  // Disabled by default. To enable it set stats level to kAll
+  DB_MUTEX_WAIT_MICROS,
+  RATE_LIMIT_DELAY_MILLIS,
+  // DEPRECATED number of iterators currently open
+  NO_ITERATORS,
+
+  // Number of MultiGet calls, keys read, and bytes read
+  NUMBER_MULTIGET_CALLS,
+  NUMBER_MULTIGET_KEYS_READ,
+  NUMBER_MULTIGET_BYTES_READ,
+
+  // Number of deletes records that were not required to be
+  // written to storage because key does not exist
+  NUMBER_FILTERED_DELETES,
+  NUMBER_MERGE_FAILURES,
+
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED,
+  BLOOM_FILTER_PREFIX_USEFUL,
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over large number of keys with same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION,
+
+  // Record the number of calls to GetUpdatesSince. Useful to keep track of
+  // transaction log iterator refreshes
+  GET_UPDATES_SINCE_CALLS,
+  BLOCK_CACHE_COMPRESSED_MISS,  // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT,   // hit in the compressed block cache
+  // Number of blocks added to compressed block cache
+  BLOCK_CACHE_COMPRESSED_ADD,
+  // Number of failures when adding blocks to compressed block cache
+  BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
+  WAL_FILE_SYNCED,  // Number of times WAL sync is done
+  WAL_FILE_BYTES,   // Number of bytes written to WAL
+
+  // Writes can be processed by requesting thread or by the thread at the
+  // head of the writers queue.
+  WRITE_DONE_BY_SELF,
+  WRITE_DONE_BY_OTHER,  // Equivalent to writes done for others
+  WRITE_TIMEDOUT,       // Number of writes ending up with timed-out.
+  WRITE_WITH_WAL,       // Number of Write calls that request WAL
+  COMPACT_READ_BYTES,   // Bytes read during compaction
+  COMPACT_WRITE_BYTES,  // Bytes written during compaction
+  FLUSH_WRITE_BYTES,    // Bytes written during flush
+
+  // Compaction read and write statistics broken down by CompactionReason
+  COMPACT_READ_BYTES_MARKED,
+  COMPACT_READ_BYTES_PERIODIC,
+  COMPACT_READ_BYTES_TTL,
+  COMPACT_WRITE_BYTES_MARKED,
+  COMPACT_WRITE_BYTES_PERIODIC,
+  COMPACT_WRITE_BYTES_TTL,
+
+  // Number of table's properties loaded directly from file, without creating
+  // table reader object.
+  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+  NUMBER_SUPERVERSION_ACQUIRES,
+  NUMBER_SUPERVERSION_RELEASES,
+  NUMBER_SUPERVERSION_CLEANUPS,
+
+  // # of compressions/decompressions executed
+  NUMBER_BLOCK_COMPRESSED,
+  NUMBER_BLOCK_DECOMPRESSED,
+
+  NUMBER_BLOCK_NOT_COMPRESSED,
+  MERGE_OPERATION_TOTAL_TIME,
+  FILTER_OPERATION_TOTAL_TIME,
+
+  // Row cache.
+  ROW_CACHE_HIT,
+  ROW_CACHE_MISS,
+
+  // Read amplification statistics.
+  // Read amplification can be calculated using this formula
+  // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+  //
+  // REQUIRES: ReadOptions::read_amp_bytes_per_bit to be enabled
+  READ_AMP_ESTIMATE_USEFUL_BYTES,  // Estimate of total bytes actually used.
+  READ_AMP_TOTAL_READ_BYTES,       // Total size of loaded data blocks.
+
+  // Number of refill intervals where rate limiter's bytes are fully consumed.
+  NUMBER_RATE_LIMITER_DRAINS,
+
+  // Number of internal keys skipped by Iterator
+  NUMBER_ITER_SKIP,
+
+  // BlobDB specific stats
+  // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_PUT,
+  // # of Write to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_WRITE,
+  // # of Get to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_GET,
+  // # of MultiGet to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_MULTIGET,
+  // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. Only
+  // applicable to legacy BlobDB.
+  BLOB_DB_NUM_SEEK,
+  // # of Next to BlobDB iterator. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_NEXT,
+  // # of Prev to BlobDB iterator. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_PREV,
+  // # of keys written to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_KEYS_WRITTEN,
+  // # of keys read from BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_NUM_KEYS_READ,
+  // # of bytes (key + value) written to BlobDB. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_BYTES_WRITTEN,
+  // # of bytes (keys + value) read from BlobDB. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_BYTES_READ,
+  // # of keys written by BlobDB as non-TTL inlined value. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_WRITE_INLINED,
+  // # of keys written by BlobDB as TTL inlined value. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_WRITE_INLINED_TTL,
+  // # of keys written by BlobDB as non-TTL blob value. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_WRITE_BLOB,
+  // # of keys written by BlobDB as TTL blob value. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_WRITE_BLOB_TTL,
+  // # of bytes written to blob file.
+  BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+  // # of bytes read from blob file.
+  BLOB_DB_BLOB_FILE_BYTES_READ,
+  // # of times a blob files being synced.
+  BLOB_DB_BLOB_FILE_SYNCED,
+  // # of blob index evicted from base DB by BlobDB compaction filter because
+  // of expiration. Only applicable to legacy BlobDB.
+  BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
+  // size of blob index evicted from base DB by BlobDB compaction filter
+  // because of expiration. Only applicable to legacy BlobDB.
+  BLOB_DB_BLOB_INDEX_EXPIRED_SIZE,
+  // # of blob index evicted from base DB by BlobDB compaction filter because
+  // of corresponding file deleted. Only applicable to legacy BlobDB.
+  BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
+  // size of blob index evicted from base DB by BlobDB compaction filter
+  // because of corresponding file deleted. Only applicable to legacy BlobDB.
+  BLOB_DB_BLOB_INDEX_EVICTED_SIZE,
+  // # of blob files that were obsoleted by garbage collection. Only applicable
+  // to legacy BlobDB.
+  BLOB_DB_GC_NUM_FILES,
+  // # of blob files generated by garbage collection. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_GC_NUM_NEW_FILES,
+  // # of BlobDB garbage collection failures. Only applicable to legacy BlobDB.
+  BLOB_DB_GC_FAILURES,
+  // # of keys dropped by BlobDB garbage collection because they had been
+  // overwritten. DEPRECATED.
+  BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
+  // # of keys dropped by BlobDB garbage collection because of expiration.
+  // DEPRECATED.
+  BLOB_DB_GC_NUM_KEYS_EXPIRED,
+  // # of keys relocated to new blob file by garbage collection.
+  BLOB_DB_GC_NUM_KEYS_RELOCATED,
+  // # of bytes dropped by BlobDB garbage collection because they had been
+  // overwritten. DEPRECATED.
+  BLOB_DB_GC_BYTES_OVERWRITTEN,
+  // # of bytes dropped by BlobDB garbage collection because of expiration.
+  // DEPRECATED.
+  BLOB_DB_GC_BYTES_EXPIRED,
+  // # of bytes relocated to new blob file by garbage collection.
+  BLOB_DB_GC_BYTES_RELOCATED,
+  // # of blob files evicted because of BlobDB is full. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_FIFO_NUM_FILES_EVICTED,
+  // # of keys in the blob files evicted because of BlobDB is full. Only
+  // applicable to legacy BlobDB.
+  BLOB_DB_FIFO_NUM_KEYS_EVICTED,
+  // # of bytes in the blob files evicted because of BlobDB is full. Only
+  // applicable to legacy BlobDB.
+  BLOB_DB_FIFO_BYTES_EVICTED,
+
+  // These counters indicate a performance issue in WritePrepared transactions.
+  // We should not seem them ticking them much.
+  // # of times prepare_mutex_ is acquired in the fast path.
+  TXN_PREPARE_MUTEX_OVERHEAD,
+  // # of times old_commit_map_mutex_ is acquired in the fast path.
+  TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+  // # of times we checked a batch for duplicate keys.
+  TXN_DUPLICATE_KEY_OVERHEAD,
+  // # of times snapshot_mutex_ is acquired in the fast path.
+  TXN_SNAPSHOT_MUTEX_OVERHEAD,
+  // # of times ::Get returned TryAgain due to expired snapshot seq
+  TXN_GET_TRY_AGAIN,
+
+  // Number of keys actually found in MultiGet calls (vs number requested by
+  // caller)
+  // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller
+  NUMBER_MULTIGET_KEYS_FOUND,
+
+  NO_ITERATOR_CREATED,  // number of iterators created
+  NO_ITERATOR_DELETED,  // number of iterators deleted
+
+  BLOCK_CACHE_COMPRESSION_DICT_MISS,
+  BLOCK_CACHE_COMPRESSION_DICT_HIT,
+  BLOCK_CACHE_COMPRESSION_DICT_ADD,
+  BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
+  BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
+
+  // # of blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD
+  BLOCK_CACHE_ADD_REDUNDANT,
+  // # of index blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_INDEX_ADD_REDUNDANT <= BLOCK_CACHE_INDEX_ADD
+  BLOCK_CACHE_INDEX_ADD_REDUNDANT,
+  // # of filter blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_FILTER_ADD_REDUNDANT <= BLOCK_CACHE_FILTER_ADD
+  BLOCK_CACHE_FILTER_ADD_REDUNDANT,
+  // # of data blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_DATA_ADD_REDUNDANT <= BLOCK_CACHE_DATA_ADD
+  BLOCK_CACHE_DATA_ADD_REDUNDANT,
+  // # of dict blocks redundantly inserted into block cache.
+  // REQUIRES: BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT
+  //           <= BLOCK_CACHE_COMPRESSION_DICT_ADD
+  BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT,
+
+  // # of files marked as trash by sst file manager and will be deleted
+  // later by background thread.
+  FILES_MARKED_TRASH,
+  // # of files deleted immediately by sst file manger through delete scheduler.
+  FILES_DELETED_IMMEDIATELY,
+
+  // The counters for error handler, not that, bg_io_error is the subset of
+  // bg_error and bg_retryable_io_error is the subset of bg_io_error
+  ERROR_HANDLER_BG_ERROR_COUNT,
+  ERROR_HANDLER_BG_IO_ERROR_COUNT,
+  ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
+  ERROR_HANDLER_AUTORESUME_COUNT,
+  ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT,
+  ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT,
+
+  // Statistics for memtable garbage collection:
+  // Raw bytes of data (payload) present on memtable at flush time.
+  MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
+  // Outdated bytes of data present on memtable at flush time.
+  MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+
+  // Secondary cache statistics
+  SECONDARY_CACHE_HITS,
+
+  // Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs.
+  VERIFY_CHECKSUM_READ_BYTES,
+
+  // Bytes read/written while creating backups
+  BACKUP_READ_BYTES,
+  BACKUP_WRITE_BYTES,
+
+  // Remote compaction read/write statistics
+  REMOTE_COMPACT_READ_BYTES,
+  REMOTE_COMPACT_WRITE_BYTES,
+
+  // Tiered storage related statistics
+  HOT_FILE_READ_BYTES,
+  WARM_FILE_READ_BYTES,
+  COLD_FILE_READ_BYTES,
+  HOT_FILE_READ_COUNT,
+  WARM_FILE_READ_COUNT,
+  COLD_FILE_READ_COUNT,
+
+  // Last level and non-last level read statistics
+  LAST_LEVEL_READ_BYTES,
+  LAST_LEVEL_READ_COUNT,
+  NON_LAST_LEVEL_READ_BYTES,
+  NON_LAST_LEVEL_READ_COUNT,
+
+  BLOCK_CHECKSUM_COMPUTE_COUNT,
+  MULTIGET_COROUTINE_COUNT,
+
+  // Integrated BlobDB specific stats
+  // # of times cache miss when accessing blob from blob cache.
+  BLOB_DB_CACHE_MISS,
+  // # of times cache hit when accessing blob from blob cache.
+  BLOB_DB_CACHE_HIT,
+  // # of data blocks added to blob cache.
+  BLOB_DB_CACHE_ADD,
+  // # of failures when adding blobs to blob cache.
+  BLOB_DB_CACHE_ADD_FAILURES,
+  // # of bytes read from blob cache.
+  BLOB_DB_CACHE_BYTES_READ,
+  // # of bytes written into blob cache.
+  BLOB_DB_CACHE_BYTES_WRITE,
+
+  // Time spent in the ReadAsync file system call
+  READ_ASYNC_MICROS,
+  // Number of errors returned to the async read callback
+  ASYNC_READ_ERROR_COUNT,
+
+  TICKER_ENUM_MAX
+};
+
+// The order of items listed in  Tickers should be the same as
+// the order listed in TickersNameMap
+extern const std::vector<std::pair<Tickers, std::string>> TickersNameMap;
+
+/**
+ * Keep adding histogram's here.
+ * Any histogram should have value less than HISTOGRAM_ENUM_MAX
+ * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
+ * Add a string representation in HistogramsNameMap below
+ * And increment HISTOGRAM_ENUM_MAX
+ * Add a corresponding enum value to HistogramType.java in the java API
+ */
+enum Histograms : uint32_t {
+  DB_GET = 0,
+  DB_WRITE,
+  COMPACTION_TIME,
+  COMPACTION_CPU_TIME,
+  SUBCOMPACTION_SETUP_TIME,
+  TABLE_SYNC_MICROS,
+  COMPACTION_OUTFILE_SYNC_MICROS,
+  WAL_FILE_SYNC_MICROS,
+  MANIFEST_FILE_SYNC_MICROS,
+  // TIME SPENT IN IO DURING TABLE OPEN
+  TABLE_OPEN_IO_MICROS,
+  DB_MULTIGET,
+  READ_BLOCK_COMPACTION_MICROS,
+  READ_BLOCK_GET_MICROS,
+  WRITE_RAW_BLOCK_MICROS,
+  STALL_L0_SLOWDOWN_COUNT,
+  STALL_MEMTABLE_COMPACTION_COUNT,
+  STALL_L0_NUM_FILES_COUNT,
+  HARD_RATE_LIMIT_DELAY_COUNT,
+  SOFT_RATE_LIMIT_DELAY_COUNT,
+  NUM_FILES_IN_SINGLE_COMPACTION,
+  DB_SEEK,
+  WRITE_STALL,
+  SST_READ_MICROS,
+  // The number of subcompactions actually scheduled during a compaction
+  NUM_SUBCOMPACTIONS_SCHEDULED,
+  // Value size distribution in each operation
+  BYTES_PER_READ,
+  BYTES_PER_WRITE,
+  BYTES_PER_MULTIGET,
+
+  // number of bytes compressed/decompressed
+  // number of bytes is when uncompressed; i.e. before/after respectively
+  BYTES_COMPRESSED,
+  BYTES_DECOMPRESSED,
+  COMPRESSION_TIMES_NANOS,
+  DECOMPRESSION_TIMES_NANOS,
+  // Number of merge operands passed to the merge operator in user read
+  // requests.
+  READ_NUM_MERGE_OPERANDS,
+
+  // BlobDB specific stats
+  // Size of keys written to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_KEY_SIZE,
+  // Size of values written to BlobDB. Only applicable to legacy BlobDB.
+  BLOB_DB_VALUE_SIZE,
+  // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy
+  // BlobDB.
+  BLOB_DB_WRITE_MICROS,
+  // BlobDB Get latency. Only applicable to legacy BlobDB.
+  BLOB_DB_GET_MICROS,
+  // BlobDB MultiGet latency. Only applicable to legacy BlobDB.
+  BLOB_DB_MULTIGET_MICROS,
+  // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. Only applicable to
+  // legacy BlobDB.
+  BLOB_DB_SEEK_MICROS,
+  // BlobDB Next latency. Only applicable to legacy BlobDB.
+  BLOB_DB_NEXT_MICROS,
+  // BlobDB Prev latency. Only applicable to legacy BlobDB.
+  BLOB_DB_PREV_MICROS,
+  // Blob file write latency.
+  BLOB_DB_BLOB_FILE_WRITE_MICROS,
+  // Blob file read latency.
+  BLOB_DB_BLOB_FILE_READ_MICROS,
+  // Blob file sync latency.
+  BLOB_DB_BLOB_FILE_SYNC_MICROS,
+  // BlobDB garbage collection time. DEPRECATED.
+  BLOB_DB_GC_MICROS,
+  // BlobDB compression time.
+  BLOB_DB_COMPRESSION_MICROS,
+  // BlobDB decompression time.
+  BLOB_DB_DECOMPRESSION_MICROS,
+  // Time spent flushing memtable to disk
+  FLUSH_TIME,
+  SST_BATCH_SIZE,
+
+  // MultiGet stats logged per level
+  // Num of index and filter blocks read from file system per level.
+  NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+  // Num of data blocks read from file system per level.
+  // Obsolete
+  NUM_DATA_BLOCKS_READ_PER_LEVEL,
+  // Num of sst files read from file system per level.
+  NUM_SST_READ_PER_LEVEL,
+
+  // Error handler statistics
+  ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+
+  // Stats related to asynchronous read requests.
+  ASYNC_READ_BYTES,
+  POLL_WAIT_MICROS,
+
+  // Number of prefetched bytes discarded by RocksDB.
+  PREFETCHED_BYTES_DISCARDED,
+
+  // Number of IOs issued in parallel in a MultiGet batch
+  MULTIGET_IO_BATCH_SIZE,
+
+  // Number of levels requiring IO for MultiGet
+  NUM_LEVEL_READ_PER_MULTIGET,
+
+  // Wait time for aborting async read in FilePrefetchBuffer destructor
+  ASYNC_PREFETCH_ABORT_MICROS,
+
+  HISTOGRAM_ENUM_MAX,
+};
+
+extern const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap;
+
+struct HistogramData {
+  double median;
+  double percentile95;
+  double percentile99;
+  double average;
+  double standard_deviation;
+  // zero-initialize new members since old Statistics::histogramData()
+  // implementations won't write them.
+  double max = 0.0;
+  uint64_t count = 0;
+  uint64_t sum = 0;
+  double min = 0.0;
+};
+
+// StatsLevel can be used to reduce statistics overhead by skipping certain
+// types of stats in the stats collection process.
+// Usage:
+//   options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+enum StatsLevel : uint8_t {
+  // Disable all metrics
+  kDisableAll,
+  // Disable tickers
+  kExceptTickers = kDisableAll,
+  // Disable timer stats, and skip histogram stats
+  kExceptHistogramOrTimers,
+  // Skip timer stats
+  kExceptTimers,
+  // Collect all stats except time inside mutex lock AND time spent on
+  // compression.
+  kExceptDetailedTimers,
+  // Collect all stats except the counters requiring to get time inside the
+  // mutex lock.
+  kExceptTimeForMutex,
+  // Collect all stats, including measuring duration of mutex operations.
+  // If getting time is expensive on the platform to run, it can
+  // reduce scalability to more threads, especially for writes.
+  kAll,
+};
+
+// Analyze the performance of a db by providing cumulative stats over time.
+// Usage:
+//  Options options;
+//  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+//  Status s = DB::Open(options, kDBPath, &db);
+//  ...
+//  options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+//  HistogramData hist;
+//  options.statistics->histogramData(FLUSH_TIME, &hist);
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Statistics : public Customizable {
+ public:
+  ~Statistics() override {}
+  static const char* Type() { return "Statistics"; }
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& value,
+                                 std::shared_ptr<Statistics>* result);
+  // Default name of empty, for backwards compatibility.  Derived classes should
+  // override this method.
+  // This default implementation will likely be removed in a future release
+  const char* Name() const override { return ""; }
+  virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
+  virtual void histogramData(uint32_t type,
+                             HistogramData* const data) const = 0;
+  virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; }
+  virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
+  virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
+  virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0;
+  virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) {
+    if (get_stats_level() <= StatsLevel::kExceptTimers) {
+      return;
+    }
+    recordInHistogram(histogramType, time);
+  }
+  // The function is here only for backward compatibility reason.
+  // Users implementing their own Statistics class should override
+  // recordInHistogram() instead and leave measureTime() as it is.
+  virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) {
+    // This is not supposed to be called.
+    assert(false);
+  }
+  virtual void recordInHistogram(uint32_t histogramType, uint64_t time) {
+    // measureTime() is the old and inaccurate function name.
+    // To keep backward compatible. If users implement their own
+    // statistics, which overrides measureTime() but doesn't override
+    // this function. We forward to measureTime().
+    measureTime(histogramType, time);
+  }
+
+  // Resets all ticker and histogram stats
+  virtual Status Reset() { return Status::NotSupported("Not implemented"); }
+
+#ifndef ROCKSDB_LITE
+  using Customizable::ToString;
+#endif  // ROCKSDB_LITE
+  // String representation of the statistic object. Must be thread-safe.
+  virtual std::string ToString() const {
+    // Do nothing by default
+    return std::string("ToString(): not implemented");
+  }
+
+  virtual bool getTickerMap(std::map<std::string, uint64_t>*) const {
+    // Do nothing by default
+    return false;
+  }
+
+  // Override this function to disable particular histogram collection
+  virtual bool HistEnabledForType(uint32_t type) const {
+    return type < HISTOGRAM_ENUM_MAX;
+  }
+  void set_stats_level(StatsLevel sl) {
+    stats_level_.store(sl, std::memory_order_relaxed);
+  }
+  StatsLevel get_stats_level() const {
+    return stats_level_.load(std::memory_order_relaxed);
+  }
+
+ private:
+  std::atomic<StatsLevel> stats_level_{kExceptDetailedTimers};
+};
+
+// Create a concrete DBStatistics object
+std::shared_ptr<Statistics> CreateDBStatistics();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/stats_history.h b/src/rocksdb/include/rocksdb/stats_history.h
new file mode 100644
index 000000000..57e469295
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/stats_history.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// StatsHistoryIterator is the main interface for users to programmatically
+// access statistics snapshots that was automatically stored by RocksDB.
+// Depending on options, the stats can be in memory or on disk.
+// The stats snapshots are indexed by time that they were recorded, and each
+// stats snapshot contains individual stat name and value at the time of
+// recording.
+// Example:
+//   std::unique_ptr<StatsHistoryIterator> stats_iter;
+//   Status s = db->GetStatsHistory(0 /* start_time */,
+//                                  env->NowMicros() /* end_time*/,
+//                                  &stats_iter);
+//   if (s.ok) {
+//     for (; stats_iter->Valid(); stats_iter->Next()) {
+//       uint64_t stats_time = stats_iter->GetStatsTime();
+//       const std::map<std::string, uint64_t>& stats_map =
+//           stats_iter->GetStatsMap();
+//       process(stats_time, stats_map);
+//     }
+//   }
+class StatsHistoryIterator {
+ public:
+  StatsHistoryIterator() {}
+  virtual ~StatsHistoryIterator() {}
+
+  virtual bool Valid() const = 0;
+
+  // Moves to the next stats history record.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Return the time stamp (in seconds) when stats history is recorded.
+  // REQUIRES: Valid()
+  virtual uint64_t GetStatsTime() const = 0;
+
+  // DEPRECATED (was never used)
+  virtual int GetFormatVersion() const { return -1; }
+
+  // Return the current stats history as an std::map which specifies the
+  // mapping from stats name to stats value . The underlying storage
+  // for the returned map is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual const std::map<std::string, uint64_t>& GetStatsMap() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  virtual Status status() const = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/status.h b/src/rocksdb/include/rocksdb/status.h
new file mode 100644
index 000000000..1ab3dc4cb
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/status.h
@@ -0,0 +1,570 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+#pragma once
+
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#include <memory>
+#include <string>
+
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+#include "port/stack_trace.h"
+#endif
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status {
+ public:
+  // Create a success status.
+  Status()
+      : code_(kOk),
+        subcode_(kNone),
+        sev_(kNoError),
+        retryable_(false),
+        data_loss_(false),
+        scope_(0),
+        state_(nullptr) {}
+  ~Status() {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    if (!checked_) {
+      fprintf(stderr, "Failed to check Status %p\n", this);
+      port::PrintStack();
+      std::abort();
+    }
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  }
+
+  // Copy the specified status.
+  Status(const Status& s);
+  Status& operator=(const Status& s);
+  Status(Status&& s) noexcept;
+  Status& operator=(Status&& s) noexcept;
+  bool operator==(const Status& rhs) const;
+  bool operator!=(const Status& rhs) const;
+
+  // In case of intentionally swallowing an error, user must explicitly call
+  // this function. That way we are easily able to search the code to find where
+  // error swallowing occurs.
+  inline void PermitUncheckedError() const { MarkChecked(); }
+
+  inline void MustCheck() const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    checked_ = false;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  }
+
+  enum Code : unsigned char {
+    kOk = 0,
+    kNotFound = 1,
+    kCorruption = 2,
+    kNotSupported = 3,
+    kInvalidArgument = 4,
+    kIOError = 5,
+    kMergeInProgress = 6,
+    kIncomplete = 7,
+    kShutdownInProgress = 8,
+    kTimedOut = 9,
+    kAborted = 10,
+    kBusy = 11,
+    kExpired = 12,
+    kTryAgain = 13,
+    kCompactionTooLarge = 14,
+    kColumnFamilyDropped = 15,
+    kMaxCode
+  };
+
+  Code code() const {
+    MarkChecked();
+    return code_;
+  }
+
+  enum SubCode : unsigned char {
+    kNone = 0,
+    kMutexTimeout = 1,
+    kLockTimeout = 2,
+    kLockLimit = 3,
+    kNoSpace = 4,
+    kDeadlock = 5,
+    kStaleFile = 6,
+    kMemoryLimit = 7,
+    kSpaceLimit = 8,
+    kPathNotFound = 9,
+    KMergeOperandsInsufficientCapacity = 10,
+    kManualCompactionPaused = 11,
+    kOverwritten = 12,
+    kTxnNotPrepared = 13,
+    kIOFenced = 14,
+    kMaxSubCode
+  };
+
+  SubCode subcode() const {
+    MarkChecked();
+    return subcode_;
+  }
+
+  enum Severity : unsigned char {
+    kNoError = 0,
+    kSoftError = 1,
+    kHardError = 2,
+    kFatalError = 3,
+    kUnrecoverableError = 4,
+    kMaxSeverity
+  };
+
+  Status(const Status& s, Severity sev);
+
+  Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg)
+      : Status(_code, _subcode, msg, "", _sev) {}
+
+  Severity severity() const {
+    MarkChecked();
+    return sev_;
+  }
+
+  // Returns a C style string indicating the message of the Status
+  const char* getState() const {
+    MarkChecked();
+    return state_.get();
+  }
+
+  // Return a success status.
+  static Status OK() { return Status(); }
+
+  // Successful, though an existing something was overwritten
+  // Note: using variants of OK status for program logic is discouraged,
+  // but it can be useful for communicating statistical information without
+  // changing public APIs.
+  static Status OkOverwritten() { return Status(kOk, kOverwritten); }
+
+  // Return error status of an appropriate type.
+  static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotFound, msg, msg2);
+  }
+
+  // Fast path for not found without malloc;
+  static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); }
+
+  static Status NotFound(SubCode sc, const Slice& msg,
+                         const Slice& msg2 = Slice()) {
+    return Status(kNotFound, sc, msg, msg2);
+  }
+
+  static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kCorruption, msg, msg2);
+  }
+  static Status Corruption(SubCode msg = kNone) {
+    return Status(kCorruption, msg);
+  }
+
+  static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kNotSupported, msg, msg2);
+  }
+  static Status NotSupported(SubCode msg = kNone) {
+    return Status(kNotSupported, msg);
+  }
+
+  static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, msg, msg2);
+  }
+  static Status InvalidArgument(SubCode msg = kNone) {
+    return Status(kInvalidArgument, msg);
+  }
+
+  static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, msg, msg2);
+  }
+  static Status IOError(SubCode msg = kNone) { return Status(kIOError, msg); }
+
+  static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kMergeInProgress, msg, msg2);
+  }
+  static Status MergeInProgress(SubCode msg = kNone) {
+    return Status(kMergeInProgress, msg);
+  }
+
+  static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIncomplete, msg, msg2);
+  }
+  static Status Incomplete(SubCode msg = kNone) {
+    return Status(kIncomplete, msg);
+  }
+
+  static Status ShutdownInProgress(SubCode msg = kNone) {
+    return Status(kShutdownInProgress, msg);
+  }
+  static Status ShutdownInProgress(const Slice& msg,
+                                   const Slice& msg2 = Slice()) {
+    return Status(kShutdownInProgress, msg, msg2);
+  }
+  static Status Aborted(SubCode msg = kNone) { return Status(kAborted, msg); }
+  static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kAborted, msg, msg2);
+  }
+
+  static Status Busy(SubCode msg = kNone) { return Status(kBusy, msg); }
+  static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kBusy, msg, msg2);
+  }
+
+  static Status TimedOut(SubCode msg = kNone) { return Status(kTimedOut, msg); }
+  static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kTimedOut, msg, msg2);
+  }
+
+  static Status Expired(SubCode msg = kNone) { return Status(kExpired, msg); }
+  static Status Expired(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kExpired, msg, msg2);
+  }
+
+  static Status TryAgain(SubCode msg = kNone) { return Status(kTryAgain, msg); }
+  static Status TryAgain(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kTryAgain, msg, msg2);
+  }
+
+  static Status CompactionTooLarge(SubCode msg = kNone) {
+    return Status(kCompactionTooLarge, msg);
+  }
+  static Status CompactionTooLarge(const Slice& msg,
+                                   const Slice& msg2 = Slice()) {
+    return Status(kCompactionTooLarge, msg, msg2);
+  }
+
+  static Status ColumnFamilyDropped(SubCode msg = kNone) {
+    return Status(kColumnFamilyDropped, msg);
+  }
+
+  static Status ColumnFamilyDropped(const Slice& msg,
+                                    const Slice& msg2 = Slice()) {
+    return Status(kColumnFamilyDropped, msg, msg2);
+  }
+
+  static Status NoSpace() { return Status(kIOError, kNoSpace); }
+  static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, kNoSpace, msg, msg2);
+  }
+
+  static Status MemoryLimit() { return Status(kAborted, kMemoryLimit); }
+  static Status MemoryLimit(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kAborted, kMemoryLimit, msg, msg2);
+  }
+
+  static Status SpaceLimit() { return Status(kIOError, kSpaceLimit); }
+  static Status SpaceLimit(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, kSpaceLimit, msg, msg2);
+  }
+
+  static Status PathNotFound() { return Status(kIOError, kPathNotFound); }
+  static Status PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kIOError, kPathNotFound, msg, msg2);
+  }
+
+  static Status TxnNotPrepared() {
+    return Status(kInvalidArgument, kTxnNotPrepared);
+  }
+  static Status TxnNotPrepared(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const {
+    MarkChecked();
+    return code() == kOk;
+  }
+
+  // Returns true iff the status indicates success *with* something
+  // overwritten
+  bool IsOkOverwritten() const {
+    MarkChecked();
+    return code() == kOk && subcode() == kOverwritten;
+  }
+
+  // Returns true iff the status indicates a NotFound error.
+  bool IsNotFound() const {
+    MarkChecked();
+    return code() == kNotFound;
+  }
+
+  // Returns true iff the status indicates a Corruption error.
+  bool IsCorruption() const {
+    MarkChecked();
+    return code() == kCorruption;
+  }
+
+  // Returns true iff the status indicates a NotSupported error.
+  bool IsNotSupported() const {
+    MarkChecked();
+    return code() == kNotSupported;
+  }
+
+  // Returns true iff the status indicates an InvalidArgument error.
+  bool IsInvalidArgument() const {
+    MarkChecked();
+    return code() == kInvalidArgument;
+  }
+
+  // Returns true iff the status indicates an IOError.
+  bool IsIOError() const {
+    MarkChecked();
+    return code() == kIOError;
+  }
+
+  // Returns true iff the status indicates an MergeInProgress.
+  bool IsMergeInProgress() const {
+    MarkChecked();
+    return code() == kMergeInProgress;
+  }
+
+  // Returns true iff the status indicates Incomplete
+  bool IsIncomplete() const {
+    MarkChecked();
+    return code() == kIncomplete;
+  }
+
+  // Returns true iff the status indicates Shutdown In progress
+  bool IsShutdownInProgress() const {
+    MarkChecked();
+    return code() == kShutdownInProgress;
+  }
+
+  bool IsTimedOut() const {
+    MarkChecked();
+    return code() == kTimedOut;
+  }
+
+  bool IsAborted() const {
+    MarkChecked();
+    return code() == kAborted;
+  }
+
+  bool IsLockLimit() const {
+    MarkChecked();
+    return code() == kAborted && subcode() == kLockLimit;
+  }
+
+  // Returns true iff the status indicates that a resource is Busy and
+  // temporarily could not be acquired.
+  bool IsBusy() const {
+    MarkChecked();
+    return code() == kBusy;
+  }
+
+  bool IsDeadlock() const {
+    MarkChecked();
+    return code() == kBusy && subcode() == kDeadlock;
+  }
+
+  // Returns true iff the status indicated that the operation has Expired.
+  bool IsExpired() const {
+    MarkChecked();
+    return code() == kExpired;
+  }
+
+  // Returns true iff the status indicates a TryAgain error.
+  // This usually means that the operation failed, but may succeed if
+  // re-attempted.
+  bool IsTryAgain() const {
+    MarkChecked();
+    return code() == kTryAgain;
+  }
+
+  // Returns true iff the status indicates the proposed compaction is too large
+  bool IsCompactionTooLarge() const {
+    MarkChecked();
+    return code() == kCompactionTooLarge;
+  }
+
+  // Returns true iff the status indicates Column Family Dropped
+  bool IsColumnFamilyDropped() const {
+    MarkChecked();
+    return code() == kColumnFamilyDropped;
+  }
+
+  // Returns true iff the status indicates a NoSpace error
+  // This is caused by an I/O error returning the specific "out of space"
+  // error condition. Stricto sensu, an NoSpace error is an I/O error
+  // with a specific subcode, enabling users to take the appropriate action
+  // if needed
+  bool IsNoSpace() const {
+    MarkChecked();
+    return (code() == kIOError) && (subcode() == kNoSpace);
+  }
+
+  // Returns true iff the status indicates a memory limit error.  There may be
+  // cases where we limit the memory used in certain operations (eg. the size
+  // of a write batch) in order to avoid out of memory exceptions.
+  bool IsMemoryLimit() const {
+    MarkChecked();
+    return (code() == kAborted) && (subcode() == kMemoryLimit);
+  }
+
+  // Returns true iff the status indicates a PathNotFound error
+  // This is caused by an I/O error returning the specific "no such file or
+  // directory" error condition. A PathNotFound error is an I/O error with
+  // a specific subcode, enabling users to take appropriate action if necessary
+  bool IsPathNotFound() const {
+    MarkChecked();
+    return (code() == kIOError || code() == kNotFound) &&
+           (subcode() == kPathNotFound);
+  }
+
+  // Returns true iff the status indicates manual compaction paused. This
+  // is caused by a call to PauseManualCompaction
+  bool IsManualCompactionPaused() const {
+    MarkChecked();
+    return (code() == kIncomplete) && (subcode() == kManualCompactionPaused);
+  }
+
+  // Returns true iff the status indicates a TxnNotPrepared error.
+  bool IsTxnNotPrepared() const {
+    MarkChecked();
+    return (code() == kInvalidArgument) && (subcode() == kTxnNotPrepared);
+  }
+
+  // Returns true iff the status indicates a IOFenced error.
+  bool IsIOFenced() const {
+    MarkChecked();
+    return (code() == kIOError) && (subcode() == kIOFenced);
+  }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+ protected:
+  Code code_;
+  SubCode subcode_;
+  Severity sev_;
+  bool retryable_;
+  bool data_loss_;
+  unsigned char scope_;
+  // A nullptr state_ (which is at least the case for OK) means the extra
+  // message is empty.
+  std::unique_ptr<const char[]> state_;
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  mutable bool checked_ = false;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+
+  explicit Status(Code _code, SubCode _subcode = kNone)
+      : code_(_code),
+        subcode_(_subcode),
+        sev_(kNoError),
+        retryable_(false),
+        data_loss_(false),
+        scope_(0) {}
+
+  explicit Status(Code _code, SubCode _subcode, bool retryable, bool data_loss,
+                  unsigned char scope)
+      : code_(_code),
+        subcode_(_subcode),
+        sev_(kNoError),
+        retryable_(retryable),
+        data_loss_(data_loss),
+        scope_(scope) {}
+
+  Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2,
+         Severity sev = kNoError);
+  Status(Code _code, const Slice& msg, const Slice& msg2)
+      : Status(_code, kNone, msg, msg2) {}
+
+  static std::unique_ptr<const char[]> CopyState(const char* s);
+
+  inline void MarkChecked() const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    checked_ = true;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  }
+};
+
+inline Status::Status(const Status& s)
+    : code_(s.code_),
+      subcode_(s.subcode_),
+      sev_(s.sev_),
+      retryable_(s.retryable_),
+      data_loss_(s.data_loss_),
+      scope_(s.scope_) {
+  s.MarkChecked();
+  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+}
+inline Status::Status(const Status& s, Severity sev)
+    : code_(s.code_),
+      subcode_(s.subcode_),
+      sev_(sev),
+      retryable_(s.retryable_),
+      data_loss_(s.data_loss_),
+      scope_(s.scope_) {
+  s.MarkChecked();
+  state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+}
+inline Status& Status::operator=(const Status& s) {
+  if (this != &s) {
+    s.MarkChecked();
+    MustCheck();
+    code_ = s.code_;
+    subcode_ = s.subcode_;
+    sev_ = s.sev_;
+    retryable_ = s.retryable_;
+    data_loss_ = s.data_loss_;
+    scope_ = s.scope_;
+    state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_.get());
+  }
+  return *this;
+}
+
+inline Status::Status(Status&& s) noexcept : Status() {
+  s.MarkChecked();
+  *this = std::move(s);
+}
+
+inline Status& Status::operator=(Status&& s) noexcept {
+  if (this != &s) {
+    s.MarkChecked();
+    MustCheck();
+    code_ = std::move(s.code_);
+    s.code_ = kOk;
+    subcode_ = std::move(s.subcode_);
+    s.subcode_ = kNone;
+    sev_ = std::move(s.sev_);
+    s.sev_ = kNoError;
+    retryable_ = std::move(s.retryable_);
+    s.retryable_ = false;
+    data_loss_ = std::move(s.data_loss_);
+    s.data_loss_ = false;
+    scope_ = std::move(s.scope_);
+    s.scope_ = 0;
+    state_ = std::move(s.state_);
+  }
+  return *this;
+}
+
+inline bool Status::operator==(const Status& rhs) const {
+  MarkChecked();
+  rhs.MarkChecked();
+  return (code_ == rhs.code_);
+}
+
+inline bool Status::operator!=(const Status& rhs) const {
+  MarkChecked();
+  rhs.MarkChecked();
+  return !(*this == rhs);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/system_clock.h b/src/rocksdb/include/rocksdb/system_clock.h
new file mode 100644
index 000000000..486183d60
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/system_clock.h
@@ -0,0 +1,116 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <memory>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef GetCurrentTime
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+
+// A SystemClock is an interface used by the rocksdb implementation to access
+// operating system time-related functionality.
+class SystemClock : public Customizable {
+ public:
+  ~SystemClock() override {}
+
+  static const char* Type() { return "SystemClock"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value,
+                                 std::shared_ptr<SystemClock>* result);
+  // The name of this system clock
+  virtual const char* Name() const override = 0;
+
+  // The name/nickname for the Default SystemClock.  This name can be used
+  // to determine if the clock is the default one.
+  static const char* kDefaultName() { return "DefaultClock"; }
+
+  // Return a default SystemClock suitable for the current operating
+  // system.
+  static const std::shared_ptr<SystemClock>& Default();
+
+  // Returns the number of micro-seconds since some fixed point in time.
+  // It is often used as system time such as in GenericRateLimiter
+  // and other places so a port needs to return system time in order to work.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros.
+  // In platform-specific implementations, NowNanos() should return time points
+  // that are MONOTONIC.
+  virtual uint64_t NowNanos() { return NowMicros() * 1000; }
+
+  // Returns the number of micro-seconds of CPU time used by the current thread.
+  // 0 indicates not supported.
+  virtual uint64_t CPUMicros() { return 0; }
+
+  // Returns the number of nano-seconds of CPU time used by the current thread.
+  // Default implementation simply relies on CPUMicros.
+  // 0 indicates not supported.
+  virtual uint64_t CPUNanos() { return CPUMicros() * 1000; }
+
+  // Sleep/delay the thread for the prescribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  // Only overwrites *unix_time on success.
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+};
+
+// Wrapper class for a SystemClock.  Redirects all methods (except Name)
+// of the SystemClock interface to the target/wrapped class.
+class SystemClockWrapper : public SystemClock {
+ public:
+  explicit SystemClockWrapper(const std::shared_ptr<SystemClock>& t);
+
+  uint64_t NowMicros() override { return target_->NowMicros(); }
+
+  uint64_t NowNanos() override { return target_->NowNanos(); }
+
+  uint64_t CPUMicros() override { return target_->CPUMicros(); }
+
+  uint64_t CPUNanos() override { return target_->CPUNanos(); }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    return target_->SleepForMicroseconds(micros);
+  }
+
+  Status GetCurrentTime(int64_t* unix_time) override {
+    return target_->GetCurrentTime(unix_time);
+  }
+
+  std::string TimeToString(uint64_t time) override {
+    return target_->TimeToString(time);
+  }
+
+  Status PrepareOptions(const ConfigOptions& options) override;
+#ifndef ROCKSDB_LITE
+  std::string SerializeOptions(const ConfigOptions& config_options,
+                               const std::string& header) const override;
+#endif  // ROCKSDB_LITE
+  const Customizable* Inner() const override { return target_.get(); }
+
+ protected:
+  std::shared_ptr<SystemClock> target_;
+};
+
+}  // end namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table.h b/src/rocksdb/include/rocksdb/table.h
new file mode 100644
index 000000000..3a2bf2629
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table.h
@@ -0,0 +1,940 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Currently we support two types of tables: plain table and block-based table.
+//   1. Block-based table: this is the default table type that we inherited from
+//      LevelDB, which was designed for storing data in hard disk or flash
+//      device.
+//   2. Plain table: it is one of RocksDB's SST file format optimized
+//      for low query latency on pure-memory or really low-latency media.
+//
+// A tutorial of rocksdb table formats is available here:
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
+//
+// Example code is also available
+//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Block-based Table
+class Cache;
+class FilterPolicy;
+class FlushBlockPolicyFactory;
+class PersistentCache;
+class RandomAccessFile;
+struct TableReaderOptions;
+struct TableBuilderOptions;
+class TableBuilder;
+class TableFactory;
+class TableReader;
+class WritableFileWriter;
+struct ConfigOptions;
+struct EnvOptions;
+
+// Types of checksums to use for checking integrity of logical blocks within
+// files. All checksums currently use 32 bits of checking power (1 in 4B
+// chance of failing to detect random corruption).
+enum ChecksumType : char {
+  kNoChecksum = 0x0,
+  kCRC32c = 0x1,
+  kxxHash = 0x2,
+  kxxHash64 = 0x3,
+  kXXH3 = 0x4,  // Supported since RocksDB 6.27
+};
+
+// `PinningTier` is used to specify which tier of block-based tables should
+// be affected by a block cache pinning setting (see
+// `MetadataCacheOptions` below).
+enum class PinningTier {
+  // For compatibility, this value specifies to fallback to the behavior
+  // indicated by the deprecated options,
+  // `pin_l0_filter_and_index_blocks_in_cache` and
+  // `pin_top_level_index_and_filter`.
+  kFallback,
+
+  // This tier contains no block-based tables.
+  kNone,
+
+  // This tier contains block-based tables that may have originated from a
+  // memtable flush. In particular, it includes tables from L0 that are smaller
+  // than 1.5 times the current `write_buffer_size`. Note these criteria imply
+  // it can include intra-L0 compaction outputs and ingested files, as long as
+  // they are not abnormally large compared to flushed files in L0.
+  kFlushedAndSimilar,
+
+  // This tier contains all block-based tables.
+  kAll,
+};
+
+// `MetadataCacheOptions` contains members indicating the desired caching
+// behavior for the different categories of metadata blocks.
+struct MetadataCacheOptions {
+  // The tier of block-based tables whose top-level index into metadata
+  // partitions will be pinned. Currently indexes and filters may be
+  // partitioned.
+  //
+  // Note `cache_index_and_filter_blocks` must be true for this option to have
+  // any effect. Otherwise any top-level index into metadata partitions would be
+  // held in table reader memory, outside the block cache.
+  PinningTier top_level_index_pinning = PinningTier::kFallback;
+
+  // The tier of block-based tables whose metadata partitions will be pinned.
+  // Currently indexes and filters may be partitioned.
+  PinningTier partition_pinning = PinningTier::kFallback;
+
+  // The tier of block-based tables whose unpartitioned metadata blocks will be
+  // pinned.
+  //
+  // Note `cache_index_and_filter_blocks` must be true for this option to have
+  // any effect. Otherwise the unpartitioned meta-blocks would be held in table
+  // reader memory, outside the block cache.
+  PinningTier unpartitioned_pinning = PinningTier::kFallback;
+};
+
+struct CacheEntryRoleOptions {
+  enum class Decision {
+    kEnabled,
+    kDisabled,
+    kFallback,
+  };
+  Decision charged = Decision::kFallback;
+  bool operator==(const CacheEntryRoleOptions& other) const {
+    return charged == other.charged;
+  }
+};
+
+struct CacheUsageOptions {
+  CacheEntryRoleOptions options;
+  std::map<CacheEntryRole, CacheEntryRoleOptions> options_overrides;
+};
+
+// For advanced user only
+struct BlockBasedTableOptions {
+  static const char* kName() { return "BlockTableOptions"; };
+  // @flush_block_policy_factory creates the instances of flush block policy.
+  // which provides a configurable way to determine when to flush a block in
+  // the block based tables.  If not set, table builder will use the default
+  // block flush policy, which cut blocks by block size (please refer to
+  // `FlushBlockBySizePolicy`).
+  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
+
+  // TODO(kailiu) Temporarily disable this feature by making the default value
+  // to be false.
+  //
+  // TODO(ajkr) we need to update names of variables controlling meta-block
+  // caching as they should now apply to range tombstone and compression
+  // dictionary meta-blocks, in addition to index and filter meta-blocks.
+  //
+  // Whether to put index/filter blocks in the block cache. When false,
+  // each "table reader" object will pre-load index/filter blocks during
+  // table initialization. Index and filter partition blocks always use
+  // block cache regardless of this option.
+  bool cache_index_and_filter_blocks = false;
+
+  // If cache_index_and_filter_blocks is enabled, cache index and filter
+  // blocks with high priority. If set to true, depending on implementation of
+  // block cache, index, filter, and other metadata blocks may be less likely
+  // to be evicted than data blocks.
+  bool cache_index_and_filter_blocks_with_high_priority = true;
+
+  // DEPRECATED: This option will be removed in a future version. For now, this
+  // option still takes effect by updating each of the following variables that
+  // has the default value, `PinningTier::kFallback`:
+  //
+  // - `MetadataCacheOptions::partition_pinning`
+  // - `MetadataCacheOptions::unpartitioned_pinning`
+  //
+  // The updated value is chosen as follows:
+  //
+  // - `pin_l0_filter_and_index_blocks_in_cache == false` ->
+  //   `PinningTier::kNone`
+  // - `pin_l0_filter_and_index_blocks_in_cache == true` ->
+  //   `PinningTier::kFlushedAndSimilar`
+  //
+  // To migrate away from this flag, explicitly configure
+  // `MetadataCacheOptions` as described above.
+  //
+  // if cache_index_and_filter_blocks is true and the below is true, then
+  // filter and index blocks are stored in the cache, but a reference is
+  // held in the "table reader" object so the blocks are pinned and only
+  // evicted from cache when the table reader is freed.
+  bool pin_l0_filter_and_index_blocks_in_cache = false;
+
+  // DEPRECATED: This option will be removed in a future version. For now, this
+  // option still takes effect by updating
+  // `MetadataCacheOptions::top_level_index_pinning` when it has the
+  // default value, `PinningTier::kFallback`.
+  //
+  // The updated value is chosen as follows:
+  //
+  // - `pin_top_level_index_and_filter == false` ->
+  //   `PinningTier::kNone`
+  // - `pin_top_level_index_and_filter == true` ->
+  //   `PinningTier::kAll`
+  //
+  // To migrate away from this flag, explicitly configure
+  // `MetadataCacheOptions` as described above.
+  //
+  // If cache_index_and_filter_blocks is true and the below is true, then
+  // the top-level index of partitioned filter and index blocks are stored in
+  // the cache, but a reference is held in the "table reader" object so the
+  // blocks are pinned and only evicted from cache when the table reader is
+  // freed. This is not limited to l0 in LSM tree.
+  bool pin_top_level_index_and_filter = true;
+
+  // The desired block cache pinning behavior for the different categories of
+  // metadata blocks. While pinning can reduce block cache contention, users
+  // must take care not to pin excessive amounts of data, which risks
+  // overflowing block cache.
+  MetadataCacheOptions metadata_cache_options;
+
+  // The index type that will be used for this table.
+  enum IndexType : char {
+    // A space efficient index block that is optimized for
+    // binary-search-based index.
+    kBinarySearch = 0x00,
+
+    // The hash index, if enabled, will do the hash lookup when
+    // `Options.prefix_extractor` is provided.
+    kHashSearch = 0x01,
+
+    // A two-level index implementation. Both levels are binary search indexes.
+    // Second level index blocks ("partitions") use block cache even when
+    // cache_index_and_filter_blocks=false.
+    kTwoLevelIndexSearch = 0x02,
+
+    // Like kBinarySearch, but index also contains first key of each block.
+    // This allows iterators to defer reading the block until it's actually
+    // needed. May significantly reduce read amplification of short range scans.
+    // Without it, iterator seek usually reads one block from each level-0 file
+    // and from each level, which may be expensive.
+    // Works best in combination with:
+    //  - IndexShorteningMode::kNoShortening,
+    //  - custom FlushBlockPolicy to cut blocks at some meaningful boundaries,
+    //    e.g. when prefix changes.
+    // Makes the index significantly bigger (2x or more), especially when keys
+    // are long.
+    kBinarySearchWithFirstKey = 0x03,
+  };
+
+  IndexType index_type = kBinarySearch;
+
+  // The index type that will be used for the data block.
+  enum DataBlockIndexType : char {
+    kDataBlockBinarySearch = 0,   // traditional block type
+    kDataBlockBinaryAndHash = 1,  // additional hash index
+  };
+
+  DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;
+
+  // #entries/#buckets. It is valid only when data_block_hash_index_type is
+  // kDataBlockBinaryAndHash.
+  double data_block_hash_table_util_ratio = 0.75;
+
+  // Option hash_index_allow_collision is now deleted.
+  // It will behave as if hash_index_allow_collision=true.
+
+  // Use the specified checksum type. Newly created table files will be
+  // protected with this checksum type. Old table files will still be readable,
+  // even though they have different checksum type.
+  ChecksumType checksum = kXXH3;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  bool no_block_cache = false;
+
+  // If non-NULL use the specified cache for blocks.
+  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+  std::shared_ptr<Cache> block_cache = nullptr;
+
+  // If non-NULL use the specified cache for pages read from device
+  // IF NULL, no page cache is used
+  std::shared_ptr<PersistentCache> persistent_cache = nullptr;
+
+  // DEPRECATED: This feature is planned for removal in a future release.
+  // Use SecondaryCache instead.
+  //
+  // If non-NULL use the specified cache for compressed blocks.
+  // If NULL, rocksdb will not use a compressed block cache.
+  // Note: though it looks similar to `block_cache`, RocksDB doesn't put the
+  //       same type of object there.
+  std::shared_ptr<Cache> block_cache_compressed = nullptr;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  uint64_t block_size = 4 * 1024;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  int block_size_deviation = 10;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.  The minimum value allowed is 1.  Any smaller
+  // value will be silently overwritten with 1.
+  int block_restart_interval = 16;
+
+  // Same as block_restart_interval but used for the index block.
+  int index_block_restart_interval = 1;
+
+  // Block size for partitioned metadata. Currently applied to indexes when
+  // kTwoLevelIndexSearch is used and to filters when partition_filters is used.
+  // Note: Since in the current implementation the filters and index partitions
+  // are aligned, an index/filter block is created when either index or filter
+  // block size reaches the specified limit.
+  // Note: this limit is currently applied to only index blocks; a filter
+  // partition is cut right after an index block is cut
+  // TODO(myabandeh): remove the note above when filter partitions are cut
+  // separately
+  uint64_t metadata_block_size = 4096;
+
+  // `cache_usage_options` allows users to specify the default
+  // options (`cache_usage_options.options`) and the overriding
+  // options (`cache_usage_options.options_overrides`)
+  // for different `CacheEntryRole` under various features related to cache
+  // usage.
+  //
+  // For a certain `CacheEntryRole role` and a certain feature `f` of
+  // `CacheEntryRoleOptions`:
+  // 1. If `options_overrides` has an entry for `role` and
+  // `options_overrides[role].f != kFallback`, we use
+  // `options_overrides[role].f`
+  // 2. Otherwise, if `options[role].f != kFallback`, we use `options[role].f`
+  // 3. Otherwise, we follow the compatible existing behavior for `f` (see
+  // each feature's comment for more)
+  //
+  // `cache_usage_options` currently supports specifying options for the
+  // following features:
+  //
+  // 1. Memory charging to block cache (`CacheEntryRoleOptions::charged`)
+  // Memory charging is a feature of accounting memory usage of specific area
+  // (represented by `CacheEntryRole`) toward usage in block cache (if
+  // available), by updating a dynamical charge to the block cache loosely based
+  // on the actual memory usage of that area.
+  //
+  // (a) CacheEntryRole::kCompressionDictionaryBuildingBuffer
+  // (i) If kEnabled:
+  // Charge memory usage of the buffered data used as training samples for
+  // dictionary compression.
+  // If such memory usage exceeds the avaible space left in the block cache
+  // at some point (i.e, causing a cache full under
+  // `LRUCacheOptions::strict_capacity_limit` = true), the data will then be
+  // unbuffered.
+  // (ii) If kDisabled:
+  // Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  // Same as kEnabled.
+  //
+  // (b) CacheEntryRole::kFilterConstruction
+  // (i) If kEnabled:
+  // Charge memory usage of Bloom Filter
+  // (format_version >= 5) and Ribbon Filter construction.
+  // If additional temporary memory of Ribbon Filter exceeds the avaible
+  // space left in the block cache at some point (i.e, causing a cache full
+  // under `LRUCacheOptions::strict_capacity_limit` = true),
+  // construction will fall back to Bloom Filter.
+  // (ii) If kDisabled:
+  // Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  // Same as kDisabled.
+  //
+  // (c) CacheEntryRole::kBlockBasedTableReader
+  // (i) If kEnabled:
+  // Charge memory usage of table properties +
+  // index block/filter block/uncompression dictionary (when stored in table
+  // reader i.e, BlockBasedTableOptions::cache_index_and_filter_blocks ==
+  // false) + some internal data structures during table reader creation.
+  // If such a table reader exceeds
+  // the avaible space left in the block cache at some point (i.e, causing
+  // a cache full under `LRUCacheOptions::strict_capacity_limit` = true),
+  // creation will fail with Status::MemoryLimit().
+  // (ii) If kDisabled:
+  // Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  // Same as kDisabled.
+  //
+  // (d) CacheEntryRole::kFileMetadata
+  // (i) If kEnabled:
+  // Charge memory usage of file metadata. RocksDB holds one file metadata
+  // structure in-memory per on-disk table file.
+  // If such file metadata's
+  // memory exceeds the avaible space left in the block cache at some point
+  // (i.e, causing a cache full under `LRUCacheOptions::strict_capacity_limit` =
+  // true), creation will fail with Status::MemoryLimit().
+  // (ii) If kDisabled:
+  // Does not charge the memory usage mentioned above.
+  // (iii) Compatible existing behavior:
+  // Same as kDisabled.
+  //
+  // (e) Other CacheEntryRole
+  // Not supported.
+  // `Status::kNotSupported` will be returned if
+  // `CacheEntryRoleOptions::charged` is set to {`kEnabled`, `kDisabled`}.
+  //
+  //
+  // 2. More to come ...
+  //
+  CacheUsageOptions cache_usage_options;
+
+  // Note: currently this option requires kTwoLevelIndexSearch to be set as
+  // well.
+  // TODO(myabandeh): remove the note above once the limitation is lifted
+  // Use partitioned full filters for each SST file. This option is
+  // incompatible with block-based filters. Filter partition blocks use
+  // block cache even when cache_index_and_filter_blocks=false.
+  bool partition_filters = false;
+
+  // Option to generate Bloom/Ribbon filters that minimize memory
+  // internal fragmentation.
+  //
+  // When false, malloc_usable_size is not available, or format_version < 5,
+  // filters are generated without regard to internal fragmentation when
+  // loaded into memory (historical behavior). When true (and
+  // malloc_usable_size is available and format_version >= 5), then
+  // filters are generated to "round up" and "round down" their sizes to
+  // minimize internal fragmentation when loaded into memory, assuming the
+  // reading DB has the same memory allocation characteristics as the
+  // generating DB. This option does not break forward or backward
+  // compatibility.
+  //
+  // While individual filters will vary in bits/key and false positive rate
+  // when setting is true, the implementation attempts to maintain a weighted
+  // average FP rate for filters consistent with this option set to false.
+  //
+  // With Jemalloc for example, this setting is expected to save about 10% of
+  // the memory footprint and block cache charge of filters, while increasing
+  // disk usage of filters by about 1-2% due to encoding efficiency losses
+  // with variance in bits/key.
+  //
+  // NOTE: Because some memory counted by block cache might be unmapped pages
+  // within internal fragmentation, this option can increase observed RSS
+  // memory usage. With cache_index_and_filter_blocks=true, this option makes
+  // the block cache better at using space it is allowed. (These issues
+  // should not arise with partitioned filters.)
+  //
+  // NOTE: Do not set to true if you do not trust malloc_usable_size. With
+  // this option, RocksDB might access an allocated memory object beyond its
+  // original size if malloc_usable_size says it is safe to do so. While this
+  // can be considered bad practice, it should not produce undefined behavior
+  // unless malloc_usable_size is buggy or broken.
+  bool optimize_filters_for_memory = false;
+
+  // Use delta encoding to compress keys in blocks.
+  // ReadOptions::pin_data requires this option to be disabled.
+  //
+  // Default: true
+  bool use_delta_encoding = true;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
+  bool whole_key_filtering = true;
+
+  // If true, detect corruption during Bloom Filter (format_version >= 5)
+  // and Ribbon Filter construction.
+  //
+  // This is an extra check that is only
+  // useful in detecting software bugs or CPU+memory malfunction.
+  // Turning on this feature increases filter construction time by 30%.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{detect_filter_construct_corruption=true;}"}});
+  //
+  // TODO: optimize this performance
+  bool detect_filter_construct_corruption = false;
+
+  // Verify that decompressing the compressed block gives back the input. This
+  // is a verification mode that we use to detect bugs in compression
+  // algorithms.
+  bool verify_compression = false;
+
+  // If used, For every data block we load into memory, we will create a bitmap
+  // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap
+  // will be used to figure out the percentage we actually read of the blocks.
+  //
+  // When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
+  // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
+  // read amplification using this formula
+  // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+  //
+  // value  =>  memory usage (percentage of loaded blocks memory)
+  // 1      =>  12.50 %
+  // 2      =>  06.25 %
+  // 4      =>  03.12 %
+  // 8      =>  01.56 %
+  // 16     =>  00.78 %
+  //
+  // Note: This number must be a power of 2, if not it will be sanitized
+  // to be the next lowest power of 2, for example a value of 7 will be
+  // treated as 4, a value of 19 will be treated as 16.
+  //
+  // Default: 0 (disabled)
+  uint32_t read_amp_bytes_per_bit = 0;
+
+  // We currently have these versions:
+  // 0 -- This version can be read by really old RocksDB's. Doesn't support
+  // changing checksum type (default is CRC32).
+  // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
+  // checksum, like xxHash. It is written by RocksDB when
+  // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
+  // 0 is silently upconverted)
+  // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
+  // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
+  // don't plan to run RocksDB before version 3.10, you should probably use
+  // this.
+  // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we
+  // encode the keys in index blocks. If you don't plan to run RocksDB before
+  // version 5.15, you should probably use this.
+  // This option only affects newly written tables. When reading existing
+  // tables, the information about version is read from the footer.
+  // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we
+  // encode the values in index blocks. If you don't plan to run RocksDB before
+  // version 5.16 and you are using index_block_restart_interval > 1, you should
+  // probably use this as it would reduce the index size.
+  // This option only affects newly written tables. When reading existing
+  // tables, the information about version is read from the footer.
+  // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned
+  // filters use a generally faster and more accurate Bloom filter
+  // implementation, with a different schema.
+  uint32_t format_version = 5;
+
+  // Store index blocks on disk in compressed format. Changing this option to
+  // false  will avoid the overhead of decompression if index blocks are evicted
+  // and read back
+  bool enable_index_compression = true;
+
+  // Align data blocks on lesser of page size and block size
+  bool block_align = false;
+
+  // This enum allows trading off increased index size for improved iterator
+  // seek performance in some situations, particularly when block cache is
+  // disabled (ReadOptions::fill_cache = false) and direct IO is
+  // enabled (DBOptions::use_direct_reads = true).
+  // The default mode is the best tradeoff for most use cases.
+  // This option only affects newly written tables.
+  //
+  // The index contains a key separating each pair of consecutive blocks.
+  // Let A be the highest key in one block, B the lowest key in the next block,
+  // and I the index entry separating these two blocks:
+  // [ ... A] I [B ...]
+  // I is allowed to be anywhere in [A, B).
+  // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the
+  // first block, then immediately fall through to the second block.
+  // However, if I=A, this can't happen, and we'll read only the second block.
+  // In kNoShortening mode, we use I=A. In other modes, we use the shortest
+  // key in [A, B), which usually significantly reduces index size.
+  //
+  // There's a similar story for the last index entry, which is an upper bound
+  // of the highest key in the file. If it's shortened and therefore
+  // overestimated, iterator is likely to unnecessarily read the last data block
+  // from each file on each seek.
+  enum class IndexShorteningMode : char {
+    // Use full keys.
+    kNoShortening,
+    // Shorten index keys between blocks, but use full key for the last index
+    // key, which is the upper bound of the whole file.
+    kShortenSeparators,
+    // Shorten both keys between blocks and key after last block.
+    kShortenSeparatorsAndSuccessor,
+  };
+
+  IndexShorteningMode index_shortening =
+      IndexShorteningMode::kShortenSeparators;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if user doesn't provide readahead_size. The readahead
+  // starts at BlockBasedTableOptions.initial_auto_readahead_size (default: 8KB)
+  // and doubles on every additional read upto max_auto_readahead_size and
+  // max_auto_readahead_size can be configured.
+  //
+  // Special Value: 0 - If max_auto_readahead_size is set 0 then it will disable
+  // the implicit auto prefetching.
+  // If max_auto_readahead_size provided is less
+  // than initial_auto_readahead_size, then RocksDB will sanitize the
+  // initial_auto_readahead_size and set it to max_auto_readahead_size.
+  //
+  // Value should be provided along with KB i.e. 256 * 1024 as it will prefetch
+  // the blocks.
+  //
+  // Found that 256 KB readahead size provides the best performance, based on
+  // experiments, for auto readahead. Experiment data is in PR #3282.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{max_auto_readahead_size=0;}"}}));
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 256 KB (256 * 1024).
+  size_t max_auto_readahead_size = 256 * 1024;
+
+  // If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index and
+  // filter blocks) which are already in memory into block cache at the time of
+  // flush. On a flush, the block that is in memory (in memtables) get flushed
+  // to the device. If using Direct IO, additional IO is incurred to read this
+  // data back into memory again, which is avoided by enabling this option. This
+  // further helps if the workload exhibits high temporal locality, where most
+  // of the reads go to recently written data. This also helps in case of
+  // Distributed FileSystem.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{prepopulate_block_cache=kFlushOnly;}"}}));
+  enum class PrepopulateBlockCache : char {
+    // Disable prepopulate block cache.
+    kDisable,
+    // Prepopulate blocks during flush only.
+    kFlushOnly,
+  };
+
+  PrepopulateBlockCache prepopulate_block_cache =
+      PrepopulateBlockCache::kDisable;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if user doesn't provide readahead_size. The readahead size
+  // starts at initial_auto_readahead_size and doubles on every additional read
+  // upto BlockBasedTableOptions.max_auto_readahead_size.
+  // max_auto_readahead_size can also be configured.
+  //
+  // Scenarios:
+  // - If initial_auto_readahead_size is set 0 then it will disabled the
+  //   implicit auto prefetching irrespective of max_auto_readahead_size.
+  // - If max_auto_readahead_size is set 0, it will disable the internal
+  //    prefetching irrespective of initial_auto_readahead_size.
+  // - If initial_auto_readahead_size > max_auto_readahead_size, then RocksDB
+  //   will sanitize the value of initial_auto_readahead_size to
+  //   max_auto_readahead_size and readahead_size will be
+  //   max_auto_readahead_size.
+  //
+  // Value should be provided along with KB i.e. 8 * 1024 as it will prefetch
+  // the blocks.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{initial_auto_readahead_size=0;}"}}));
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 8 KB (8 * 1024).
+  size_t initial_auto_readahead_size = 8 * 1024;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if user doesn't provide readahead_size and reads are
+  // sequential.
+  // num_file_reads_for_auto_readahead indicates after how many
+  // sequential reads internal auto prefetching should be start.
+  //
+  // For example, if value is 2 then after reading 2 sequential data blocks on
+  // third data block prefetching will start.
+  // If set 0, it will start prefetching from the first read.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{num_file_reads_for_auto_readahead=0;}"}}));
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 2
+  uint64_t num_file_reads_for_auto_readahead = 2;
+};
+
+// Table Properties that are specific to block-based table properties.
+struct BlockBasedTablePropertyNames {
+  // value of this properties is a fixed int32 number.
+  static const std::string kIndexType;
+  // value is "1" for true and "0" for false.
+  static const std::string kWholeKeyFiltering;
+  // value is "1" for true and "0" for false.
+  static const std::string kPrefixFiltering;
+};
+
+// Create default block based table factory.
+extern TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+#ifndef ROCKSDB_LITE
+
+enum EncodingType : char {
+  // Always write full keys without any special encoding.
+  kPlain,
+  // Find opportunity to write the same prefix once for multiple rows.
+  // In some cases, when a key follows a previous key with the same prefix,
+  // instead of writing out the full key, it just writes out the size of the
+  // shared prefix, as well as other bytes, to save some bytes.
+  //
+  // When using this option, the user is required to use the same prefix
+  // extractor to make sure the same prefix will be extracted from the same key.
+  // The Name() value of the prefix extractor will be stored in the file. When
+  // reopening the file, the name of the options.prefix_extractor given will be
+  // bitwise compared to the prefix extractors stored in the file. An error
+  // will be returned if the two don't match.
+  kPrefix,
+};
+
+// Table Properties that are specific to plain table properties.
+struct PlainTablePropertyNames {
+  static const std::string kEncodingType;
+  static const std::string kBloomVersion;
+  static const std::string kNumBloomBlocks;
+};
+
+const uint32_t kPlainTableVariableLength = 0;
+
+struct PlainTableOptions {
+  static const char* kName() { return "PlainTableOptions"; };
+  // @user_key_len: plain table has optimization for fix-sized keys, which can
+  //                be specified via user_key_len.  Alternatively, you can pass
+  //                `kPlainTableVariableLength` if your keys have variable
+  //                lengths.
+  uint32_t user_key_len = kPlainTableVariableLength;
+
+  // @bloom_bits_per_key: the number of bits used for bloom filer per prefix.
+  //                      You may disable it by passing a zero.
+  int bloom_bits_per_key = 10;
+
+  // @hash_table_ratio: the desired utilization of the hash table used for
+  //                    prefix hashing.
+  //                    hash_table_ratio = number of prefixes / #buckets in the
+  //                    hash table
+  double hash_table_ratio = 0.75;
+
+  // @index_sparseness: inside each prefix, need to build one index record for
+  //                    how many keys for binary search inside each hash bucket.
+  //                    For encoding type kPrefix, the value will be used when
+  //                    writing to determine an interval to rewrite the full
+  //                    key. It will also be used as a suggestion and satisfied
+  //                    when possible.
+  size_t index_sparseness = 16;
+
+  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+  //                      Otherwise from huge page TLB. The user needs to
+  //                      reserve huge pages for it to be allocated, like:
+  //                          sysctl -w vm.nr_hugepages=20
+  //                      See linux doc Documentation/vm/hugetlbpage.txt
+  size_t huge_page_tlb_size = 0;
+
+  // @encoding_type: how to encode the keys. See enum EncodingType above for
+  //                 the choices. The value will determine how to encode keys
+  //                 when writing to a new SST file. This value will be stored
+  //                 inside the SST file which will be used when reading from
+  //                 the file, which makes it possible for users to choose
+  //                 different encoding type when reopening a DB. Files with
+  //                 different encoding types can co-exist in the same DB and
+  //                 can be read.
+  EncodingType encoding_type = kPlain;
+
+  // @full_scan_mode: mode for reading the whole file one record by one without
+  //                  using the index.
+  bool full_scan_mode = false;
+
+  // @store_index_in_file: compute plain table index and bloom filter during
+  //                       file building and store it in file. When reading
+  //                       file, index will be mapped instead of recomputation.
+  bool store_index_in_file = false;
+};
+
+// -- Plain Table with prefix-only seek
+// For this factory, you need to set Options.prefix_extractor properly to make
+// it work. Look-up will starts with prefix hash lookup for key prefix. Inside
+// the hash bucket found, a binary search is executed for hash conflicts.
+// Finally, a linear search is used.
+
+extern TableFactory* NewPlainTableFactory(
+    const PlainTableOptions& options = PlainTableOptions());
+
+struct CuckooTablePropertyNames {
+  // The key that is used to fill empty buckets.
+  static const std::string kEmptyKey;
+  // Fixed length of value.
+  static const std::string kValueLength;
+  // Number of hash functions used in Cuckoo Hash.
+  static const std::string kNumHashFunc;
+  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
+  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
+  // where starting bucket id is given by the hash function on the key. In case
+  // of a collision during inserting the key, the builder tries to insert the
+  // key in other locations of the cuckoo block before using the next hash
+  // function. This reduces cache miss during read operation in case of
+  // collision.
+  static const std::string kCuckooBlockSize;
+  // Size of the hash table. Use this number to compute the modulo of hash
+  // function. The actual number of buckets will be kMaxHashTableSize +
+  // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
+  // accommodate the Cuckoo Block from end of hash table, due to cache friendly
+  // implementation.
+  static const std::string kHashTableSize;
+  // Denotes if the key sorted in the file is Internal Key (if false)
+  // or User Key only (if true).
+  static const std::string kIsLastLevel;
+  // Indicate if using identity function for the first hash function.
+  static const std::string kIdentityAsFirstHash;
+  // Indicate if using module or bit and to calculate hash value
+  static const std::string kUseModuleHash;
+  // Fixed user key length
+  static const std::string kUserKeyLength;
+};
+
+struct CuckooTableOptions {
+  static const char* kName() { return "CuckooTableOptions"; };
+
+  // Determines the utilization of hash tables. Smaller values
+  // result in larger hash tables with fewer collisions.
+  double hash_table_ratio = 0.9;
+  // A property used by builder to determine the depth to go to
+  // to search for a path to displace elements in case of
+  // collision. See Builder.MakeSpaceForKey method. Higher
+  // values result in more efficient hash tables with fewer
+  // lookups but take more time to build.
+  uint32_t max_search_depth = 100;
+  // In case of collision while inserting, the builder
+  // attempts to insert in the next cuckoo_block_size
+  // locations before skipping over to the next Cuckoo hash
+  // function. This makes lookups more cache friendly in case
+  // of collisions.
+  uint32_t cuckoo_block_size = 5;
+  // If this option is enabled, user key is treated as uint64_t and its value
+  // is used as hash value directly. This option changes builder's behavior.
+  // Reader ignore this option and behave according to what specified in table
+  // property.
+  bool identity_as_first_hash = false;
+  // If this option is set to true, module is used during hash calculation.
+  // This often yields better space efficiency at the cost of performance.
+  // If this option is set to false, # of entries in table is constrained to be
+  // power of two, and bit and is used to calculate hash, which is faster in
+  // general.
+  bool use_module_hash = true;
+};
+
+// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing
+extern TableFactory* NewCuckooTableFactory(
+    const CuckooTableOptions& table_options = CuckooTableOptions());
+
+#endif  // ROCKSDB_LITE
+
+class RandomAccessFileReader;
+
+// A base class for table factories.
+class TableFactory : public Customizable {
+ public:
+  virtual ~TableFactory() override {}
+
+  static const char* kBlockCacheOpts() { return "BlockCache"; };
+  static const char* kBlockBasedTableName() { return "BlockBasedTable"; };
+  static const char* kPlainTableName() { return "PlainTable"; }
+  static const char* kCuckooTableName() { return "CuckooTable"; };
+
+  // Creates and configures a new TableFactory from the input options and id.
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<TableFactory>* factory);
+
+  static const char* Type() { return "TableFactory"; }
+
+  // Returns a Table object table that can fetch data from file specified
+  // in parameter file. It's the caller's responsibility to make sure
+  // file is in the correct format.
+  //
+  // NewTableReader() is called in three places:
+  // (1) TableCache::FindTable() calls the function when table cache miss
+  //     and cache the table object returned.
+  // (2) SstFileDumper (for SST Dump) opens the table and dump the table
+  //     contents using the iterator of the table.
+  // (3) DBImpl::IngestExternalFile() calls this function to read the contents
+  //     of the sst file it's attempting to add
+  //
+  // table_reader_options is a TableReaderOptions which contain all the
+  //    needed parameters and configuration to open the table.
+  // file is a file handler to handle the file for the table.
+  // file_size is the physical file size of the file.
+  // table_reader is the output table reader.
+  virtual Status NewTableReader(
+      const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      bool prefetch_index_and_filter_in_cache = true) const {
+    ReadOptions ro;
+    return NewTableReader(ro, table_reader_options, std::move(file), file_size,
+                          table_reader, prefetch_index_and_filter_in_cache);
+  }
+
+  // Overload of the above function that allows the caller to pass in a
+  // ReadOptions
+  virtual Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      bool prefetch_index_and_filter_in_cache) const = 0;
+
+  // Return a table builder to write to a file for this table type.
+  //
+  // It is called in several places:
+  // (1) When flushing memtable to a level-0 output file, it creates a table
+  //     builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
+  // (2) During compaction, it gets the builder for writing compaction output
+  //     files in DBImpl::OpenCompactionOutputFile().
+  // (3) When recovering from transaction logs, it creates a table builder to
+  //     write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
+  //     by calling BuildTable())
+  // (4) When running Repairer, it creates a table builder to convert logs to
+  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
+  //
+  // Multiple configured can be accessed from there, including and not limited
+  // to compression options. file is a handle of a writable file.
+  // It is the caller's responsibility to keep the file open and close the file
+  // after closing the table builder. compression_type is the compression type
+  // to use in this table.
+  virtual TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const = 0;
+
+  // Return is delete range supported
+  virtual bool IsDeleteRangeSupported() const { return false; }
+};
+
+#ifndef ROCKSDB_LITE
+// Create a special table factory that can open either of the supported
+// table formats, based on setting inside the SST files. It should be used to
+// convert a DB from one table format to another.
+// @table_factory_to_write: the table factory used when writing to new files.
+// @block_based_table_factory:  block based table factory to use. If NULL, use
+//                              a default one.
+// @plain_table_factory: plain table factory to use. If NULL, use a default one.
+// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
+// one.
+extern TableFactory* NewAdaptiveTableFactory(
+    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
+    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
+    std::shared_ptr<TableFactory> plain_table_factory = nullptr,
+    std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table_properties.h b/src/rocksdb/include/rocksdb/table_properties.h
new file mode 100644
index 000000000..cbe87fa3a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table_properties.h
@@ -0,0 +1,327 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include <stdint.h>
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// -- Table Properties
+// Other than basic table properties, each table may also have the user
+// collected properties.
+// The value of the user-collected properties are encoded as raw bytes --
+// users have to interpret these values by themselves.
+// Note: To do prefix seek/scan in `UserCollectedProperties`, you can do
+// something similar to:
+//
+// UserCollectedProperties props = ...;
+// for (auto pos = props.lower_bound(prefix);
+//      pos != props.end() && pos->first.compare(0, prefix.size(), prefix) == 0;
+//      ++pos) {
+//   ...
+// }
+using UserCollectedProperties = std::map<std::string, std::string>;
+
+// table properties' human-readable names in the property block.
+struct TablePropertiesNames {
+  static const std::string kDbId;
+  static const std::string kDbSessionId;
+  static const std::string kDbHostId;
+  static const std::string kOriginalFileNumber;
+  static const std::string kDataSize;
+  static const std::string kIndexSize;
+  static const std::string kIndexPartitions;
+  static const std::string kTopLevelIndexSize;
+  static const std::string kIndexKeyIsUserKey;
+  static const std::string kIndexValueIsDeltaEncoded;
+  static const std::string kFilterSize;
+  static const std::string kRawKeySize;
+  static const std::string kRawValueSize;
+  static const std::string kNumDataBlocks;
+  static const std::string kNumEntries;
+  static const std::string kNumFilterEntries;
+  static const std::string kDeletedKeys;
+  static const std::string kMergeOperands;
+  static const std::string kNumRangeDeletions;
+  static const std::string kFormatVersion;
+  static const std::string kFixedKeyLen;
+  static const std::string kFilterPolicy;
+  static const std::string kColumnFamilyName;
+  static const std::string kColumnFamilyId;
+  static const std::string kComparator;
+  static const std::string kMergeOperator;
+  static const std::string kPrefixExtractorName;
+  static const std::string kPropertyCollectors;
+  static const std::string kCompression;
+  static const std::string kCompressionOptions;
+  static const std::string kCreationTime;
+  static const std::string kOldestKeyTime;
+  static const std::string kFileCreationTime;
+  static const std::string kSlowCompressionEstimatedDataSize;
+  static const std::string kFastCompressionEstimatedDataSize;
+  static const std::string kSequenceNumberTimeMapping;
+};
+
+// `TablePropertiesCollector` provides the mechanism for users to collect
+// their own properties that they are interested in. This class is essentially
+// a collection of callback functions that will be invoked during table
+// building. It is constructed with TablePropertiesCollectorFactory. The methods
+// don't need to be thread-safe, as we will create exactly one
+// TablePropertiesCollector object per table and then call it sequentially.
+//
+// Statuses from these callbacks are currently logged when not OK, but
+// otherwise ignored by RocksDB.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class TablePropertiesCollector {
+ public:
+  virtual ~TablePropertiesCollector() {}
+
+  // DEPRECATE User defined collector should implement AddUserKey(), though
+  //           this old function still works for backward compatible reason.
+  // Add() will be called when a new key/value pair is inserted into the table.
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  virtual Status Add(const Slice& /*key*/, const Slice& /*value*/) {
+    return Status::InvalidArgument(
+        "TablePropertiesCollector::Add() deprecated.");
+  }
+
+  // AddUserKey() will be called when a new key/value pair is inserted into the
+  // table.
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  virtual Status AddUserKey(const Slice& key, const Slice& value,
+                            EntryType /*type*/, SequenceNumber /*seq*/,
+                            uint64_t /*file_size*/) {
+    // For backwards-compatibility.
+    return Add(key, value);
+  }
+
+  // Called after each new block is cut
+  virtual void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                        uint64_t /* block_compressed_bytes_fast */,
+                        uint64_t /* block_compressed_bytes_slow */) {
+    // Nothing to do here. Callback registers can override.
+    return;
+  }
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @params properties  User will add their collected statistics to
+  // `properties`.
+  virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+  // Return the human-readable properties, where the key is property name and
+  // the value is the human-readable form of value.
+  virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const = 0;
+
+  // EXPERIMENTAL Return whether the output file should be further compacted
+  virtual bool NeedCompact() const { return false; }
+};
+
+// Constructs TablePropertiesCollector. Internals create a new
+// TablePropertiesCollector for each new table
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class TablePropertiesCollectorFactory : public Customizable {
+ public:
+  struct Context {
+    uint32_t column_family_id;
+    // The level at creating the SST file (i.e, table), of which the
+    // properties are being collected.
+    int level_at_creation = kUnknownLevelAtCreation;
+    static const uint32_t kUnknownColumnFamily;
+    static const int kUnknownLevelAtCreation = -1;
+  };
+
+  ~TablePropertiesCollectorFactory() override {}
+  static const char* Type() { return "TablePropertiesCollectorFactory"; }
+  static Status CreateFromString(
+      const ConfigOptions& options, const std::string& value,
+      std::shared_ptr<TablePropertiesCollectorFactory>* result);
+
+  // has to be thread-safe
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) = 0;
+
+  // The name of the properties collector can be used for debugging purpose.
+  const char* Name() const override = 0;
+
+  // Can be overridden by sub-classes to return the Name, followed by
+  // configuration info that will // be logged to the info log when the
+  // DB is opened
+  virtual std::string ToString() const { return Name(); }
+};
+
+// TableProperties contains a bunch of read-only properties of its associated
+// table.
+struct TableProperties {
+ public:
+  // the file number at creation time, or 0 for unknown. When known,
+  // combining with db_session_id must uniquely identify an SST file.
+  uint64_t orig_file_number = 0;
+  // the total size of all data blocks.
+  uint64_t data_size = 0;
+  // the size of index block.
+  uint64_t index_size = 0;
+  // Total number of index partitions if kTwoLevelIndexSearch is used
+  uint64_t index_partitions = 0;
+  // Size of the top-level index if kTwoLevelIndexSearch is used
+  uint64_t top_level_index_size = 0;
+  // Whether the index key is user key. Otherwise it includes 8 byte of sequence
+  // number added by internal key format.
+  uint64_t index_key_is_user_key = 0;
+  // Whether delta encoding is used to encode the index values.
+  uint64_t index_value_is_delta_encoded = 0;
+  // the size of filter block.
+  uint64_t filter_size = 0;
+  // total raw (uncompressed, undelineated) key size
+  uint64_t raw_key_size = 0;
+  // total raw (uncompressed, undelineated) value size
+  uint64_t raw_value_size = 0;
+  // the number of blocks in this table
+  uint64_t num_data_blocks = 0;
+  // the number of entries in this table
+  uint64_t num_entries = 0;
+  // the number of unique entries (keys or prefixes) added to filters
+  uint64_t num_filter_entries = 0;
+  // the number of deletions in the table
+  uint64_t num_deletions = 0;
+  // the number of merge operands in the table
+  uint64_t num_merge_operands = 0;
+  // the number of range deletions in this table
+  uint64_t num_range_deletions = 0;
+  // format version, reserved for backward compatibility
+  uint64_t format_version = 0;
+  // If 0, key is variable length. Otherwise number of bytes for each key.
+  uint64_t fixed_key_len = 0;
+  // ID of column family for this SST file, corresponding to the CF identified
+  // by column_family_name.
+  uint64_t column_family_id = ROCKSDB_NAMESPACE::
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+  // Timestamp of the latest key. 0 means unknown.
+  // TODO(sagar0): Should be changed to latest_key_time ... but don't know the
+  // full implications of backward compatibility. Hence retaining for now.
+  uint64_t creation_time = 0;
+
+  // Timestamp of the earliest key. 0 means unknown.
+  uint64_t oldest_key_time = 0;
+  // Actual SST file creation time. 0 means unknown.
+  uint64_t file_creation_time = 0;
+  // Estimated size of data blocks if compressed using a relatively slower
+  // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`).
+  // 0 means unknown.
+  uint64_t slow_compression_estimated_data_size = 0;
+  // Estimated size of data blocks if compressed using a relatively faster
+  // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`).
+  // 0 means unknown.
+  uint64_t fast_compression_estimated_data_size = 0;
+  // Offset of the value of the property "external sst file global seqno" in the
+  // file if the property exists.
+  // 0 means not exists.
+  uint64_t external_sst_file_global_seqno_offset = 0;
+
+  // DB identity
+  // db_id is an identifier generated the first time the DB is created
+  // If DB identity is unset or unassigned, `db_id` will be an empty string.
+  std::string db_id;
+
+  // DB session identity
+  // db_session_id is an identifier that gets reset every time the DB is opened
+  // If DB session identity is unset or unassigned, `db_session_id` will be an
+  // empty string.
+  std::string db_session_id;
+
+  // Location of the machine hosting the DB instance
+  // db_host_id identifies the location of the host in some form
+  // (hostname by default, but can also be any string of the user's choosing).
+  // It can potentially change whenever the DB is opened
+  std::string db_host_id;
+
+  // Name of the column family with which this SST file is associated.
+  // If column family is unknown, `column_family_name` will be an empty string.
+  std::string column_family_name;
+
+  // The name of the filter policy used in this table.
+  // If no filter policy is used, `filter_policy_name` will be an empty string.
+  std::string filter_policy_name;
+
+  // The name of the comparator used in this table.
+  std::string comparator_name;
+
+  // The name of the merge operator used in this table.
+  // If no merge operator is used, `merge_operator_name` will be "nullptr".
+  std::string merge_operator_name;
+
+  // The name of the prefix extractor used in this table
+  // If no prefix extractor is used, `prefix_extractor_name` will be "nullptr".
+  std::string prefix_extractor_name;
+
+  // The names of the property collectors factories used in this table
+  // separated by commas
+  // {collector_name[1]},{collector_name[2]},{collector_name[3]} ..
+  std::string property_collectors_names;
+
+  // The compression algo used to compress the SST files.
+  std::string compression_name;
+
+  // Compression options used to compress the SST files.
+  std::string compression_options;
+
+  // Sequence number to time mapping, delta encoded.
+  std::string seqno_to_time_mapping;
+
+  // user collected properties
+  UserCollectedProperties user_collected_properties;
+  UserCollectedProperties readable_properties;
+
+  // convert this object to a human readable form
+  //   @prop_delim: delimiter for each property.
+  std::string ToString(const std::string& prop_delim = "; ",
+                       const std::string& kv_delim = "=") const;
+
+  // Aggregate the numerical member variables of the specified
+  // TableProperties.
+  void Add(const TableProperties& tp);
+
+  // Subset of properties that make sense when added together
+  // between tables. Keys match field names in this class instead
+  // of using full property names.
+  std::map<std::string, uint64_t> GetAggregatablePropertiesAsMap() const;
+
+  // Return the approximated memory usage of this TableProperties object,
+  // including memory used by the string properties and UserCollectedProperties
+  std::size_t ApproximateMemoryUsage() const;
+};
+
+// Extra properties
+// Below is a list of non-basic properties that are collected by database
+// itself. Especially some properties regarding to the internal keys (which
+// is unknown to `table`).
+//
+// DEPRECATED: these properties now belong as TableProperties members. Please
+// use TableProperties::num_deletions and TableProperties::num_merge_operands,
+// respectively.
+extern uint64_t GetDeletedKeys(const UserCollectedProperties& props);
+extern uint64_t GetMergeOperands(const UserCollectedProperties& props,
+                                 bool* property_present);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/table_reader_caller.h b/src/rocksdb/include/rocksdb/table_reader_caller.h
new file mode 100644
index 000000000..10ec08130
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/table_reader_caller.h
@@ -0,0 +1,41 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A list of callers for a table reader. It is used to trace the caller that
+// accesses on a block. This is only used for block cache tracing and analysis.
+// A user may use kUncategorized if the caller is not interesting for analysis
+// or the table reader is called in the test environment, e.g., unit test, table
+// reader benchmark, etc.
+enum TableReaderCaller : char {
+  kUserGet = 1,
+  kUserMultiGet = 2,
+  kUserIterator = 3,
+  kUserApproximateSize = 4,
+  kUserVerifyChecksum = 5,
+  kSSTDumpTool = 6,
+  kExternalSSTIngestion = 7,
+  kRepair = 8,
+  kPrefetch = 9,
+  kCompaction = 10,
+  // A compaction job may refill the block cache with blocks in the new SST
+  // files if paranoid_file_checks is true.
+  kCompactionRefill = 11,
+  // After building a table, it may load all its blocks into the block cache if
+  // paranoid_file_checks is true.
+  kFlush = 12,
+  // sst_file_reader.
+  kSSTFileReader = 13,
+  // A list of callers that are either not interesting for analysis or are
+  // calling from a test environment, e.g., unit test, benchmark, etc.
+  kUncategorized = 14,
+  // All callers should be added before kMaxBlockCacheLookupCaller.
+  kMaxBlockCacheLookupCaller
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/thread_status.h b/src/rocksdb/include/rocksdb/thread_status.h
new file mode 100644
index 000000000..1b5f8c046
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/thread_status.h
@@ -0,0 +1,189 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines the structures for exposing run-time status of any
+// rocksdb-related thread.  Such run-time status can be obtained via
+// GetThreadList() API.
+//
+// Note that all thread-status features are still under-development, and
+// thus APIs and class definitions might subject to change at this point.
+// Will remove this comment once the APIs have been finalized.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS)
+#define ROCKSDB_USING_THREAD_STATUS
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(yhchiang): remove this function once c++14 is available
+//                 as std::max will be able to cover this.
+// Current MS compiler does not support constexpr
+template <int A, int B>
+struct constexpr_max {
+  static const int result = (A > B) ? A : B;
+};
+
+// A structure that describes the current status of a thread.
+// The status of active threads can be fetched using
+// ROCKSDB_NAMESPACE::GetThreadList().
+struct ThreadStatus {
+  // The type of a thread.
+  enum ThreadType : int {
+    HIGH_PRIORITY = 0,  // RocksDB BG thread in high-pri thread pool
+    LOW_PRIORITY,       // RocksDB BG thread in low-pri thread pool
+    USER,               // User thread (Non-RocksDB BG thread)
+    BOTTOM_PRIORITY,    // RocksDB BG thread in bottom-pri thread pool
+    NUM_THREAD_TYPES
+  };
+
+  // The type used to refer to a thread operation.
+  // A thread operation describes high-level action of a thread.
+  // Examples include compaction and flush.
+  enum OperationType : int {
+    OP_UNKNOWN = 0,
+    OP_COMPACTION,
+    OP_FLUSH,
+    NUM_OP_TYPES
+  };
+
+  enum OperationStage : int {
+    STAGE_UNKNOWN = 0,
+    STAGE_FLUSH_RUN,
+    STAGE_FLUSH_WRITE_L0,
+    STAGE_COMPACTION_PREPARE,
+    STAGE_COMPACTION_RUN,
+    STAGE_COMPACTION_PROCESS_KV,
+    STAGE_COMPACTION_INSTALL,
+    STAGE_COMPACTION_SYNC_FILE,
+    STAGE_PICK_MEMTABLES_TO_FLUSH,
+    STAGE_MEMTABLE_ROLLBACK,
+    STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS,
+    NUM_OP_STAGES
+  };
+
+  enum CompactionPropertyType : int {
+    COMPACTION_JOB_ID = 0,
+    COMPACTION_INPUT_OUTPUT_LEVEL,
+    COMPACTION_PROP_FLAGS,
+    COMPACTION_TOTAL_INPUT_BYTES,
+    COMPACTION_BYTES_READ,
+    COMPACTION_BYTES_WRITTEN,
+    NUM_COMPACTION_PROPERTIES
+  };
+
+  enum FlushPropertyType : int {
+    FLUSH_JOB_ID = 0,
+    FLUSH_BYTES_MEMTABLES,
+    FLUSH_BYTES_WRITTEN,
+    NUM_FLUSH_PROPERTIES
+  };
+
+  // The maximum number of properties of an operation.
+  // This number should be set to the biggest NUM_XXX_PROPERTIES.
+  static const int kNumOperationProperties =
+      constexpr_max<NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES>::result;
+
+  // The type used to refer to a thread state.
+  // A state describes lower-level action of a thread
+  // such as reading / writing a file or waiting for a mutex.
+  enum StateType : int {
+    STATE_UNKNOWN = 0,
+    STATE_MUTEX_WAIT = 1,
+    NUM_STATE_TYPES
+  };
+
+  ThreadStatus(const uint64_t _id, const ThreadType _thread_type,
+               const std::string& _db_name, const std::string& _cf_name,
+               const OperationType _operation_type,
+               const uint64_t _op_elapsed_micros,
+               const OperationStage _operation_stage,
+               const uint64_t _op_props[], const StateType _state_type)
+      : thread_id(_id),
+        thread_type(_thread_type),
+        db_name(_db_name),
+        cf_name(_cf_name),
+        operation_type(_operation_type),
+        op_elapsed_micros(_op_elapsed_micros),
+        operation_stage(_operation_stage),
+        state_type(_state_type) {
+    for (int i = 0; i < kNumOperationProperties; ++i) {
+      op_properties[i] = _op_props[i];
+    }
+  }
+
+  // An unique ID for the thread.
+  const uint64_t thread_id;
+
+  // The type of the thread, it could be HIGH_PRIORITY,
+  // LOW_PRIORITY, and USER
+  const ThreadType thread_type;
+
+  // The name of the DB instance where the thread is currently
+  // involved with.  It would be set to empty string if the thread
+  // does not involve in any DB operation.
+  const std::string db_name;
+
+  // The name of the column family where the thread is currently
+  // It would be set to empty string if the thread does not involve
+  // in any column family.
+  const std::string cf_name;
+
+  // The operation (high-level action) that the current thread is involved.
+  const OperationType operation_type;
+
+  // The elapsed time of the current thread operation in microseconds.
+  const uint64_t op_elapsed_micros;
+
+  // An integer showing the current stage where the thread is involved
+  // in the current operation.
+  const OperationStage operation_stage;
+
+  // A list of properties that describe some details about the current
+  // operation.  Same field in op_properties[] might have different
+  // meanings for different operations.
+  uint64_t op_properties[kNumOperationProperties];
+
+  // The state (lower-level action) that the current thread is involved.
+  const StateType state_type;
+
+  // The followings are a set of utility functions for interpreting
+  // the information of ThreadStatus
+
+  static std::string GetThreadTypeName(ThreadType thread_type);
+
+  // Obtain the name of an operation given its type.
+  static const std::string& GetOperationName(OperationType op_type);
+
+  static const std::string MicrosToString(uint64_t op_elapsed_time);
+
+  // Obtain a human-readable string describing the specified operation stage.
+  static const std::string& GetOperationStageName(OperationStage stage);
+
+  // Obtain the name of the "i"th operation property of the
+  // specified operation.
+  static const std::string& GetOperationPropertyName(OperationType op_type,
+                                                     int i);
+
+  // Translate the "i"th property of the specified operation given
+  // a property value.
+  static std::map<std::string, uint64_t> InterpretOperationProperties(
+      OperationType op_type, const uint64_t* op_properties);
+
+  // Obtain the name of a state given its type.
+  static const std::string& GetStateName(StateType state_type);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/threadpool.h b/src/rocksdb/include/rocksdb/threadpool.h
new file mode 100644
index 000000000..f1cc55752
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/threadpool.h
@@ -0,0 +1,67 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <functional>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+ * ThreadPool is a component that will spawn N background threads that will
+ * be used to execute scheduled work, The number of background threads could
+ * be modified by calling SetBackgroundThreads().
+ * */
+class ThreadPool {
+ public:
+  virtual ~ThreadPool() {}
+
+  // Wait for all threads to finish.
+  // Discard those threads that did not start
+  // executing
+  virtual void JoinAllThreads() = 0;
+
+  // Set the number of background threads that will be executing the
+  // scheduled jobs.
+  virtual void SetBackgroundThreads(int num) = 0;
+  virtual int GetBackgroundThreads() = 0;
+
+  // Get the number of jobs scheduled in the ThreadPool queue.
+  virtual unsigned int GetQueueLen() const = 0;
+
+  // Waits for all jobs to complete those
+  // that already started running and those that did not
+  // start yet. This ensures that everything that was thrown
+  // on the TP runs even though
+  // we may not have specified enough threads for the amount
+  // of jobs
+  virtual void WaitForJobsAndJoinAllThreads() = 0;
+
+  // Submit a fire and forget jobs
+  // This allows to submit the same job multiple times
+  virtual void SubmitJob(const std::function<void()>&) = 0;
+  // This moves the function in for efficiency
+  virtual void SubmitJob(std::function<void()>&&) = 0;
+
+  // Reserve available background threads. This function does not ensure
+  // so many threads can be reserved, instead it will return the number of
+  // threads that can be reserved against the desired one. In other words,
+  // the number of available threads could be less than the input.
+  virtual int ReserveThreads(int /*threads_to_be_reserved*/) { return 0; }
+
+  // Release a specific number of reserved threads
+  virtual int ReleaseThreads(int /*threads_to_be_released*/) { return 0; }
+};
+
+// NewThreadPool() is a function that could be used to create a ThreadPool
+// with `num_threads` background threads.
+extern ThreadPool* NewThreadPool(int num_threads);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/trace_reader_writer.h b/src/rocksdb/include/rocksdb/trace_reader_writer.h
new file mode 100644
index 000000000..335e091dc
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/trace_reader_writer.h
@@ -0,0 +1,52 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Allow custom implementations of TraceWriter and TraceReader.
+// By default, RocksDB provides a way to capture the traces to a file using the
+// factory NewFileTraceWriter(). But users could also choose to export traces to
+// any other system by providing custom implementations of TraceWriter and
+// TraceReader.
+
+// TraceWriter allows exporting RocksDB traces to any system, one operation at
+// a time.
+class TraceWriter {
+ public:
+  virtual ~TraceWriter() = default;
+
+  virtual Status Write(const Slice& data) = 0;
+  virtual Status Close() = 0;
+  virtual uint64_t GetFileSize() = 0;
+};
+
+// TraceReader allows reading RocksDB traces from any system, one operation at
+// a time. A RocksDB Replayer could depend on this to replay operations.
+class TraceReader {
+ public:
+  virtual ~TraceReader() = default;
+
+  virtual Status Read(std::string* data) = 0;
+  virtual Status Close() = 0;
+
+  // Seek back to the trace header. Replayer can call this method to restart
+  // replaying. Note this method may fail if the reader is already closed.
+  virtual Status Reset() = 0;
+};
+
+// Factory methods to write/read traces to/from a file.
+// The implementations may not be thread-safe.
+Status NewFileTraceWriter(Env* env, const EnvOptions& env_options,
+                          const std::string& trace_filename,
+                          std::unique_ptr<TraceWriter>* trace_writer);
+Status NewFileTraceReader(Env* env, const EnvOptions& env_options,
+                          const std::string& trace_filename,
+                          std::unique_ptr<TraceReader>* trace_reader);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/trace_record.h b/src/rocksdb/include/rocksdb/trace_record.h
new file mode 100644
index 000000000..c00f5cafb
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/trace_record.h
@@ -0,0 +1,248 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+class DB;
+
+// Supported trace record types.
+enum TraceType : char {
+  kTraceNone = 0,
+  kTraceBegin = 1,
+  kTraceEnd = 2,
+  // Query level tracing related trace types.
+  kTraceWrite = 3,
+  kTraceGet = 4,
+  kTraceIteratorSeek = 5,
+  kTraceIteratorSeekForPrev = 6,
+  // Block cache tracing related trace types.
+  kBlockTraceIndexBlock = 7,
+  // TODO: split out kinds of filter blocks?
+  kBlockTraceFilterBlock = 8,
+  kBlockTraceDataBlock = 9,
+  kBlockTraceUncompressionDictBlock = 10,
+  kBlockTraceRangeDeletionBlock = 11,
+  // IO tracing related trace type.
+  kIOTracer = 12,
+  // Query level tracing related trace type.
+  kTraceMultiGet = 13,
+  // All trace types should be added before kTraceMax
+  kTraceMax,
+};
+
+class GetQueryTraceRecord;
+class IteratorSeekQueryTraceRecord;
+class MultiGetQueryTraceRecord;
+class TraceRecordResult;
+class WriteQueryTraceRecord;
+
+// Base class for all types of trace records.
+class TraceRecord {
+ public:
+  explicit TraceRecord(uint64_t timestamp);
+
+  virtual ~TraceRecord() = default;
+
+  // Type of the trace record.
+  virtual TraceType GetTraceType() const = 0;
+
+  // Timestamp (in microseconds) of this trace.
+  virtual uint64_t GetTimestamp() const;
+
+  class Handler {
+   public:
+    virtual ~Handler() = default;
+
+    virtual Status Handle(const WriteQueryTraceRecord& record,
+                          std::unique_ptr<TraceRecordResult>* result) = 0;
+
+    virtual Status Handle(const GetQueryTraceRecord& record,
+                          std::unique_ptr<TraceRecordResult>* result) = 0;
+
+    virtual Status Handle(const IteratorSeekQueryTraceRecord& record,
+                          std::unique_ptr<TraceRecordResult>* result) = 0;
+
+    virtual Status Handle(const MultiGetQueryTraceRecord& record,
+                          std::unique_ptr<TraceRecordResult>* result) = 0;
+  };
+
+  // Accept the handler and report the corresponding result in `result`.
+  virtual Status Accept(Handler* handler,
+                        std::unique_ptr<TraceRecordResult>* result) = 0;
+
+  // Create a handler for the exeution of TraceRecord.
+  static Handler* NewExecutionHandler(
+      DB* db, const std::vector<ColumnFamilyHandle*>& handles);
+
+ private:
+  uint64_t timestamp_;
+};
+
+// Base class for all query types of trace records.
+class QueryTraceRecord : public TraceRecord {
+ public:
+  explicit QueryTraceRecord(uint64_t timestamp);
+};
+
+// Trace record for DB::Write() operation.
+class WriteQueryTraceRecord : public QueryTraceRecord {
+ public:
+  WriteQueryTraceRecord(PinnableSlice&& write_batch_rep, uint64_t timestamp);
+
+  WriteQueryTraceRecord(const std::string& write_batch_rep, uint64_t timestamp);
+
+  virtual ~WriteQueryTraceRecord() override;
+
+  TraceType GetTraceType() const override { return kTraceWrite; }
+
+  // rep string for the WriteBatch.
+  virtual Slice GetWriteBatchRep() const;
+
+  Status Accept(Handler* handler,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  PinnableSlice rep_;
+};
+
+// Trace record for DB::Get() operation
+class GetQueryTraceRecord : public QueryTraceRecord {
+ public:
+  GetQueryTraceRecord(uint32_t column_family_id, PinnableSlice&& key,
+                      uint64_t timestamp);
+
+  GetQueryTraceRecord(uint32_t column_family_id, const std::string& key,
+                      uint64_t timestamp);
+
+  virtual ~GetQueryTraceRecord() override;
+
+  TraceType GetTraceType() const override { return kTraceGet; }
+
+  // Column family ID.
+  virtual uint32_t GetColumnFamilyID() const;
+
+  // Key to get.
+  virtual Slice GetKey() const;
+
+  Status Accept(Handler* handler,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  uint32_t cf_id_;
+  PinnableSlice key_;
+};
+
+// Base class for all Iterator related operations.
+class IteratorQueryTraceRecord : public QueryTraceRecord {
+ public:
+  explicit IteratorQueryTraceRecord(uint64_t timestamp);
+
+  IteratorQueryTraceRecord(PinnableSlice&& lower_bound,
+                           PinnableSlice&& upper_bound, uint64_t timestamp);
+
+  IteratorQueryTraceRecord(const std::string& lower_bound,
+                           const std::string& upper_bound, uint64_t timestamp);
+
+  virtual ~IteratorQueryTraceRecord() override;
+
+  // Get the iterator's lower/upper bound. They may be used in ReadOptions to
+  // create an Iterator instance.
+  virtual Slice GetLowerBound() const;
+  virtual Slice GetUpperBound() const;
+
+ private:
+  PinnableSlice lower_;
+  PinnableSlice upper_;
+};
+
+// Trace record for Iterator::Seek() and Iterator::SeekForPrev() operation.
+class IteratorSeekQueryTraceRecord : public IteratorQueryTraceRecord {
+ public:
+  // Currently we only support Seek() and SeekForPrev().
+  enum SeekType {
+    kSeek = kTraceIteratorSeek,
+    kSeekForPrev = kTraceIteratorSeekForPrev
+  };
+
+  IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+                               PinnableSlice&& key, uint64_t timestamp);
+
+  IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+                               const std::string& key, uint64_t timestamp);
+
+  IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+                               PinnableSlice&& key, PinnableSlice&& lower_bound,
+                               PinnableSlice&& upper_bound, uint64_t timestamp);
+
+  IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id,
+                               const std::string& key,
+                               const std::string& lower_bound,
+                               const std::string& upper_bound,
+                               uint64_t timestamp);
+
+  virtual ~IteratorSeekQueryTraceRecord() override;
+
+  // Trace type matches the seek type.
+  TraceType GetTraceType() const override;
+
+  // Type of seek, Seek or SeekForPrev.
+  virtual SeekType GetSeekType() const;
+
+  // Column family ID.
+  virtual uint32_t GetColumnFamilyID() const;
+
+  // Key to seek to.
+  virtual Slice GetKey() const;
+
+  Status Accept(Handler* handler,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  SeekType type_;
+  uint32_t cf_id_;
+  PinnableSlice key_;
+};
+
+// Trace record for DB::MultiGet() operation.
+class MultiGetQueryTraceRecord : public QueryTraceRecord {
+ public:
+  MultiGetQueryTraceRecord(std::vector<uint32_t> column_family_ids,
+                           std::vector<PinnableSlice>&& keys,
+                           uint64_t timestamp);
+
+  MultiGetQueryTraceRecord(std::vector<uint32_t> column_family_ids,
+                           const std::vector<std::string>& keys,
+                           uint64_t timestamp);
+
+  virtual ~MultiGetQueryTraceRecord() override;
+
+  TraceType GetTraceType() const override { return kTraceMultiGet; }
+
+  // Column familiy IDs.
+  virtual std::vector<uint32_t> GetColumnFamilyIDs() const;
+
+  // Keys to get.
+  virtual std::vector<Slice> GetKeys() const;
+
+  Status Accept(Handler* handler,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  std::vector<uint32_t> cf_ids_;
+  std::vector<PinnableSlice> keys_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/trace_record_result.h b/src/rocksdb/include/rocksdb/trace_record_result.h
new file mode 100644
index 000000000..0cd0004a6
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/trace_record_result.h
@@ -0,0 +1,187 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IteratorTraceExecutionResult;
+class MultiValuesTraceExecutionResult;
+class SingleValueTraceExecutionResult;
+class StatusOnlyTraceExecutionResult;
+
+// Base class for the results of all types of trace records.
+// Theses classes can be used to report the execution result of
+// TraceRecord::Handler::Handle() or TraceRecord::Accept().
+class TraceRecordResult {
+ public:
+  explicit TraceRecordResult(TraceType trace_type);
+
+  virtual ~TraceRecordResult() = default;
+
+  // Trace type of the corresponding TraceRecord.
+  virtual TraceType GetTraceType() const;
+
+  class Handler {
+   public:
+    virtual ~Handler() = default;
+
+    virtual Status Handle(const StatusOnlyTraceExecutionResult& result) = 0;
+
+    virtual Status Handle(const SingleValueTraceExecutionResult& result) = 0;
+
+    virtual Status Handle(const MultiValuesTraceExecutionResult& result) = 0;
+
+    virtual Status Handle(const IteratorTraceExecutionResult& result) = 0;
+  };
+
+  // Accept the handler.
+  virtual Status Accept(Handler* handler) = 0;
+
+ private:
+  TraceType trace_type_;
+};
+
+// Base class for the results from the trace record execution handler (created
+// by TraceRecord::NewExecutionHandler()).
+//
+// The actual execution status or returned values may be hidden from
+// TraceRecord::Handler::Handle and TraceRecord::Accept. For example, a
+// GetQueryTraceRecord's execution calls DB::Get() internally. DB::Get() may
+// return Status::NotFound() but TraceRecord::Handler::Handle() or
+// TraceRecord::Accept() will still return Status::OK(). The actual status from
+// DB::Get() and the returned value string may be saved in a
+// SingleValueTraceExecutionResult.
+class TraceExecutionResult : public TraceRecordResult {
+ public:
+  TraceExecutionResult(uint64_t start_timestamp, uint64_t end_timestamp,
+                       TraceType trace_type);
+
+  // Execution start/end timestamps and request latency in microseconds.
+  virtual uint64_t GetStartTimestamp() const;
+  virtual uint64_t GetEndTimestamp() const;
+  inline uint64_t GetLatency() const {
+    return GetEndTimestamp() - GetStartTimestamp();
+  }
+
+ private:
+  uint64_t ts_start_;
+  uint64_t ts_end_;
+};
+
+// Result for operations that only return a single Status.
+// Example operation: DB::Write()
+class StatusOnlyTraceExecutionResult : public TraceExecutionResult {
+ public:
+  StatusOnlyTraceExecutionResult(Status status, uint64_t start_timestamp,
+                                 uint64_t end_timestamp, TraceType trace_type);
+
+  virtual ~StatusOnlyTraceExecutionResult() override = default;
+
+  // Return value of DB::Write(), etc.
+  virtual const Status& GetStatus() const;
+
+  virtual Status Accept(Handler* handler) override;
+
+ private:
+  Status status_;
+};
+
+// Result for operations that return a Status and a value.
+// Example operation: DB::Get()
+class SingleValueTraceExecutionResult : public TraceExecutionResult {
+ public:
+  SingleValueTraceExecutionResult(Status status, const std::string& value,
+                                  uint64_t start_timestamp,
+                                  uint64_t end_timestamp, TraceType trace_type);
+
+  SingleValueTraceExecutionResult(Status status, std::string&& value,
+                                  uint64_t start_timestamp,
+                                  uint64_t end_timestamp, TraceType trace_type);
+
+  virtual ~SingleValueTraceExecutionResult() override;
+
+  // Return status of DB::Get().
+  virtual const Status& GetStatus() const;
+
+  // Value for the searched key.
+  virtual const std::string& GetValue() const;
+
+  virtual Status Accept(Handler* handler) override;
+
+ private:
+  Status status_;
+  std::string value_;
+};
+
+// Result for operations that return multiple Status(es) and values as vectors.
+// Example operation: DB::MultiGet()
+class MultiValuesTraceExecutionResult : public TraceExecutionResult {
+ public:
+  MultiValuesTraceExecutionResult(std::vector<Status> multi_status,
+                                  std::vector<std::string> values,
+                                  uint64_t start_timestamp,
+                                  uint64_t end_timestamp, TraceType trace_type);
+
+  virtual ~MultiValuesTraceExecutionResult() override;
+
+  // Returned Status(es) of DB::MultiGet().
+  virtual const std::vector<Status>& GetMultiStatus() const;
+
+  // Returned values for the searched keys.
+  virtual const std::vector<std::string>& GetValues() const;
+
+  virtual Status Accept(Handler* handler) override;
+
+ private:
+  std::vector<Status> multi_status_;
+  std::vector<std::string> values_;
+};
+
+// Result for Iterator operations.
+// Example operations: Iterator::Seek(), Iterator::SeekForPrev()
+class IteratorTraceExecutionResult : public TraceExecutionResult {
+ public:
+  IteratorTraceExecutionResult(bool valid, Status status, PinnableSlice&& key,
+                               PinnableSlice&& value, uint64_t start_timestamp,
+                               uint64_t end_timestamp, TraceType trace_type);
+
+  IteratorTraceExecutionResult(bool valid, Status status,
+                               const std::string& key, const std::string& value,
+                               uint64_t start_timestamp, uint64_t end_timestamp,
+                               TraceType trace_type);
+
+  virtual ~IteratorTraceExecutionResult() override;
+
+  // Return if the Iterator is valid.
+  virtual bool GetValid() const;
+
+  // Return the status of the Iterator.
+  virtual const Status& GetStatus() const;
+
+  // Key of the current iterating entry, empty if GetValid() is false.
+  virtual Slice GetKey() const;
+
+  // Value of the current iterating entry, empty if GetValid() is false.
+  virtual Slice GetValue() const;
+
+  virtual Status Accept(Handler* handler) override;
+
+ private:
+  bool valid_;
+  Status status_;
+  PinnableSlice key_;
+  PinnableSlice value_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/transaction_log.h b/src/rocksdb/include/rocksdb/transaction_log.h
new file mode 100644
index 000000000..e13ad8f80
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/transaction_log.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LogFile;
+using VectorLogPtr = std::vector<std::unique_ptr<LogFile>>;
+
+enum WalFileType {
+  /* Indicates that WAL file is in archive directory. WAL files are moved from
+   * the main db directory to archive directory once they are not live and stay
+   * there until cleaned up. Files are cleaned depending on archive size
+   * (Options::WAL_size_limit_MB) and time since last cleaning
+   * (Options::WAL_ttl_seconds).
+   */
+  kArchivedLogFile = 0,
+
+  /* Indicates that WAL file is live and resides in the main db directory */
+  kAliveLogFile = 1
+};
+
+class LogFile {
+ public:
+  LogFile() {}
+  virtual ~LogFile() {}
+
+  // Returns log file's pathname relative to the main db dir
+  // Eg. For a live-log-file = /000003.log
+  //     For an archived-log-file = /archive/000003.log
+  virtual std::string PathName() const = 0;
+
+  // Primary identifier for log file.
+  // This is directly proportional to creation time of the log file
+  virtual uint64_t LogNumber() const = 0;
+
+  // Log file can be either alive or archived
+  virtual WalFileType Type() const = 0;
+
+  // Starting sequence number of writebatch written in this log file
+  virtual SequenceNumber StartSequence() const = 0;
+
+  // Size of log file on disk in Bytes
+  virtual uint64_t SizeFileBytes() const = 0;
+};
+
+struct BatchResult {
+  SequenceNumber sequence = 0;
+  std::unique_ptr<WriteBatch> writeBatchPtr;
+
+  // Add empty __ctor and __dtor for the rule of five
+  // However, preserve the original semantics and prohibit copying
+  // as the std::unique_ptr member does not copy.
+  BatchResult() {}
+
+  ~BatchResult() {}
+
+  BatchResult(const BatchResult&) = delete;
+
+  BatchResult& operator=(const BatchResult&) = delete;
+
+  BatchResult(BatchResult&& bResult)
+      : sequence(std::move(bResult.sequence)),
+        writeBatchPtr(std::move(bResult.writeBatchPtr)) {}
+
+  BatchResult& operator=(BatchResult&& bResult) {
+    sequence = std::move(bResult.sequence);
+    writeBatchPtr = std::move(bResult.writeBatchPtr);
+    return *this;
+  }
+};
+
+// A TransactionLogIterator is used to iterate over the transactions in a db.
+// One run of the iterator is continuous, i.e. the iterator will stop at the
+// beginning of any gap in sequences
+class TransactionLogIterator {
+ public:
+  TransactionLogIterator() {}
+  virtual ~TransactionLogIterator() {}
+
+  // An iterator is either positioned at a WriteBatch or not valid.
+  // This method returns true if the iterator is valid.
+  // Can read data from a valid iterator.
+  virtual bool Valid() = 0;
+
+  // Moves the iterator to the next WriteBatch.
+  // REQUIRES: Valid() to be true.
+  virtual void Next() = 0;
+
+  // Returns ok if the iterator is valid.
+  // Returns the Error when something has gone wrong.
+  virtual Status status() = 0;
+
+  // If valid return's the current write_batch and the sequence number of the
+  // earliest transaction contained in the batch.
+  // ONLY use if Valid() is true and status() is OK.
+  virtual BatchResult GetBatch() = 0;
+
+  // The read options for TransactionLogIterator.
+  struct ReadOptions {
+    // If true, all data read from underlying storage will be
+    // verified against corresponding checksums.
+    // Default: true
+    bool verify_checksums_;
+
+    ReadOptions() : verify_checksums_(true) {}
+
+    explicit ReadOptions(bool verify_checksums)
+        : verify_checksums_(verify_checksums) {}
+  };
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/types.h b/src/rocksdb/include/rocksdb/types.h
new file mode 100644
index 000000000..6fb53d846
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/types.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Define all public custom types here.
+
+using ColumnFamilyId = uint32_t;
+
+// Represents a sequence number in a WAL file.
+using SequenceNumber = uint64_t;
+
+const SequenceNumber kMinUnCommittedSeq = 1;  // 0 is always committed
+
+enum class TableFileCreationReason {
+  kFlush,
+  kCompaction,
+  kRecovery,
+  kMisc,
+};
+
+enum class BlobFileCreationReason {
+  kFlush,
+  kCompaction,
+  kRecovery,
+};
+
+// The types of files RocksDB uses in a DB directory. (Available for
+// advanced options.)
+enum FileType {
+  kWalFile,
+  kDBLockFile,
+  kTableFile,
+  kDescriptorFile,
+  kCurrentFile,
+  kTempFile,
+  kInfoLogFile,  // Either the current one, or an old one
+  kMetaDatabase,
+  kIdentityFile,
+  kOptionsFile,
+  kBlobFile
+};
+
+// User-oriented representation of internal key types.
+// Ordering of this enum entries should not change.
+enum EntryType {
+  kEntryPut,
+  kEntryDelete,
+  kEntrySingleDelete,
+  kEntryMerge,
+  kEntryRangeDeletion,
+  kEntryBlobIndex,
+  kEntryDeleteWithTimestamp,
+  kEntryWideColumnEntity,
+  kEntryOther,
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/unique_id.h b/src/rocksdb/include/rocksdb/unique_id.h
new file mode 100644
index 000000000..eb0c77826
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/unique_id.h
@@ -0,0 +1,55 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Computes a stable, universally unique 128-bit (16 binary char) identifier
+// for an SST file from TableProperties. This is supported for table (SST)
+// files created with RocksDB 6.24 and later. NotSupported will be returned
+// for other cases. The first 16 bytes (128 bits) is of sufficient quality
+// for almost all applications, and shorter prefixes are usable as a
+// hash of the full unique id.
+//
+// Note: .c_str() is not compatible with binary char strings, so using
+// .c_str() on the result will often result in information loss and very
+// poor uniqueness probability.
+//
+// More detail: the value is *guaranteed* unique for SST files
+// generated in the same process (even different DBs, RocksDB >= 6.26),
+// and first 128 bits are guaranteed not "all zeros" (RocksDB >= 6.26)
+// so that the "all zeros" value can be used reliably for a null ID.
+// These IDs are more than sufficient for SST uniqueness within each of
+// many DBs or hosts. For an extreme example assuming random IDs, consider
+// 10^9 hosts each with 10^9 live SST files being replaced at 10^6/second.
+// Such a service would need to run for 10 million years to see an ID
+// collision among live SST files on any host.
+//
+// And assuming one generates many SST files in the lifetime of each process,
+// the probability of ID collisions is much "better than random"; see
+// https://github.com/pdillinger/unique_id
+Status GetUniqueIdFromTableProperties(const TableProperties &props,
+                                      std::string *out_id);
+
+// Computes a 192-bit (24 binary char) stable, universally unique ID
+// with an extra 64 bits of uniqueness compared to the standard ID. It is only
+// appropriate to use this ID instead of the 128-bit ID if ID collisions
+// between files among any hosts in a vast fleet is a problem, such as a shared
+// global namespace for SST file backups. Under this criteria, the extreme
+// example above would expect a global file ID collision every 4 days with
+// 128-bit IDs (using some worst-case assumptions about process lifetime).
+// It's 10^17 years with 192-bit IDs.
+Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props,
+                                              std::string *out_id);
+
+// Converts a binary string (unique id) to hexadecimal, with each 64 bits
+// separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B
+// Also works on unique id prefix.
+std::string UniqueIdToHumanString(const std::string &id);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/universal_compaction.h b/src/rocksdb/include/rocksdb/universal_compaction.h
new file mode 100644
index 000000000..0b0a85e1c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/universal_compaction.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <climits>
+#include <cstdint>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Algorithm used to make a compaction request stop picking new files
+// into a single compaction run
+//
+enum CompactionStopStyle {
+  kCompactionStopStyleSimilarSize,  // pick files of similar size
+  kCompactionStopStyleTotalSize     // total size of picked files > next file
+};
+
+class CompactionOptionsUniversal {
+ public:
+  // Percentage flexibility while comparing file size. If the candidate file(s)
+  // size is 1% smaller than the next file's size, then include next file into
+  // this candidate set. // Default: 1
+  unsigned int size_ratio;
+
+  // The minimum number of files in a single compaction run. Default: 2
+  unsigned int min_merge_width;
+
+  // The maximum number of files in a single compaction run. Default: UINT_MAX
+  unsigned int max_merge_width;
+
+  // The size amplification is defined as the amount (in percentage) of
+  // additional storage needed to store a single byte of data in the database.
+  // For example, a size amplification of 2% means that a database that
+  // contains 100 bytes of user-data may occupy up to 102 bytes of
+  // physical storage. By this definition, a fully compacted database has
+  // a size amplification of 0%. Rocksdb uses the following heuristic
+  // to calculate size amplification: it assumes that all files excluding
+  // the earliest file contribute to the size amplification.
+  // Default: 200, which means that a 100 byte database could require up to
+  // 300 bytes of storage.
+  unsigned int max_size_amplification_percent;
+
+  // If this option is set to be -1 (the default value), all the output files
+  // will follow compression type specified.
+  //
+  // If this option is not negative, we will try to make sure compressed
+  // size is just above this value. In normal cases, at least this percentage
+  // of data will be compressed.
+  // When we are compacting to a new file, here is the criteria whether
+  // it needs to be compressed: assuming here are the list of files sorted
+  // by generation time:
+  //    A1...An B1...Bm C1...Ct
+  // where A1 is the newest and Ct is the oldest, and we are going to compact
+  // B1...Bm, we calculate the total size of all the files as total_size, as
+  // well as the total size of C1...Ct as total_C, the compaction output file
+  // will be compressed iff
+  //   total_C / total_size < this percentage
+  // Default: -1
+  int compression_size_percent;
+
+  // The algorithm used to stop picking files into a single compaction run
+  // Default: kCompactionStopStyleTotalSize
+  CompactionStopStyle stop_style;
+
+  // Option to optimize the universal multi level compaction by enabling
+  // trivial move for non overlapping files.
+  // Default: false
+  bool allow_trivial_move;
+
+  // EXPERIMENTAL
+  // If true, try to limit compaction size under max_compaction_bytes.
+  // This might cause higher write amplification, but can prevent some
+  // problem caused by large compactions.
+  // Default: false
+  bool incremental;
+
+  // Default set of parameters
+  CompactionOptionsUniversal()
+      : size_ratio(1),
+        min_merge_width(2),
+        max_merge_width(UINT_MAX),
+        max_size_amplification_percent(200),
+        compression_size_percent(-1),
+        stop_style(kCompactionStopStyleTotalSize),
+        allow_trivial_move(false),
+        incremental(false) {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/agg_merge.h b/src/rocksdb/include/rocksdb/utilities/agg_merge.h
new file mode 100644
index 000000000..4e21082db
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/agg_merge.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The feature is still in development so the encoding format is subject
+// to change.
+//
+// Aggregation Merge Operator is a merge operator that allows users to
+// aggregate merge operands of different keys with different registered
+// aggregation functions. The aggregation can also change for the same
+// key if the functions store the data in the same format.
+// The target application highly overlaps with merge operator in general
+// but we try to provide a better interface so that users are more likely
+// to use pre-implemented plug-in functions and connect with existing
+// third-party aggregation functions (such as those from SQL engines).
+// In this case, the need for users to write customized C++ plug-in code
+// is reduced.
+// If the idea proves to useful, we might consider to move it to be
+// a core functionality of RocksDB, and reduce the support of merge
+// operators.
+//
+// Users can implement aggregation functions by implementing abstract
+// class Aggregator, and register it using AddAggregator().
+// The merge operator can be retrieved from GetAggMergeOperator() and
+// it is a singleton.
+//
+// Users can push values to be updated with a merge operand encoded with
+// registered function name and payload using EncodeAggFuncAndPayload(),
+// and the merge operator will invoke the aggregation function.
+// An example:
+//
+//    // Assume class ExampleSumAggregator is implemented to do simple sum.
+//    AddAggregator("sum", std::make_unique<ExampleSumAggregator>());
+//    std::shared_ptr<MergeOperator> mp_guard = CreateAggMergeOperator();
+//    options.merge_operator = mp_guard.get();
+//    ...... // Creating DB
+//
+//
+//    std::string encoded_value;
+//    s = EncodeAggFuncAndPayload(kUnamedFuncName, "200", encoded_value);
+//    assert(s.ok());
+//    db->Put(WriteOptions(), "foo", encoded_value);
+//    s = EncodeAggFuncAndPayload("sum", "200", encoded_value);
+//    assert(s.ok());
+//    db->Merge(WriteOptions(), "foo", encoded_value);
+//    s = EncodeAggFuncAndPayload("sum", "200", encoded_value);
+//    assert(s.ok());
+//    db->Merge(WriteOptions(), "foo", encoded_value);
+//
+//    std::string value;
+//    Status s = db->Get(ReadOptions, "foo", &value);
+//    assert(s.ok());
+//    Slice func, aggregated_value;
+//    assert(ExtractAggFuncAndValue(value, func, aggregated_value));
+//    assert(func == "sum");
+//    assert(aggregated_value == "600");
+//
+//
+// DB::Put() can also be used to add a payloadin the same way as Merge().
+//
+// kUnamedFuncName can be used as a placeholder function name. This will
+// be aggregated with merge operands inserted later based on function
+// name given there.
+//
+// If the aggregation function is not registered or there is an error
+// returned by aggregation function, the result will be encoded with a fake
+// aggregation function kErrorFuncName, with each merge operands to be encoded
+// into a list that can be extracted using ExtractList();
+//
+// If users add a merge operand using a different aggregation function from
+// the previous one, the merge operands for the previous one is aggregated
+// and the payload part of the result is treated as the first payload of
+// the items for the new aggregation function. For example, users can
+// Merge("plus, 1"), merge("plus 2"), merge("minus 3") and the aggregation
+// result would be "minus 0".
+//
+
+// A class used to aggregate data per key/value. The plug-in function is
+// implemented and registered using AddAggregator(). And then use it
+// with merge operator created using CreateAggMergeOperator().
+class Aggregator {
+ public:
+  virtual ~Aggregator() {}
+  // The input list is in reverse insertion order, with values[0] to be
+  // the one inserted last and values.back() to be the one inserted first.
+  // The oldest one might be from Get().
+  // Return whether aggregation succeeded. False for aggregation error.
+  virtual bool Aggregate(const std::vector<Slice>& values,
+                         std::string& result) const = 0;
+
+  // True if a partial aggregation should be invoked. Some aggregators
+  // might opt to skip partial aggregation if possible.
+  virtual bool DoPartialAggregate() const { return true; }
+};
+
+// The function adds aggregation plugin by function name. It is used
+// by all the aggregation operator created using CreateAggMergeOperator().
+// It's currently not thread safe to run concurrently with the aggregation
+// merge operator. It is recommended that all the aggregation function
+// is added before calling CreateAggMergeOperator().
+Status AddAggregator(const std::string& function_name,
+                     std::unique_ptr<Aggregator>&& agg);
+
+// Get the singleton instance of merge operator for aggregation.
+// Always the same one is returned with a shared_ptr is hold as a
+// static variable by the function.
+// This is done so because options.merge_operator is shared_ptr.
+std::shared_ptr<MergeOperator> GetAggMergeOperator();
+
+// Encode aggregation function and payload that can be consumed by aggregation
+// merge operator.
+Status EncodeAggFuncAndPayload(const Slice& function_name, const Slice& payload,
+                               std::string& output);
+// Helper function to extract aggregation function name and payload.
+// Return false if it fails to decode.
+bool ExtractAggFuncAndValue(const Slice& op, Slice& func, Slice& value);
+
+// Extract encoded list. This can be used to extract error merge operands when
+// the returned function name is kErrorFuncName.
+bool ExtractList(const Slice& encoded_list, std::vector<Slice>& decoded_list);
+
+// Special function name that allows it to be merged to subsequent type.
+extern const std::string kUnnamedFuncName;
+
+// Special error function name reserved for merging or aggregation error.
+extern const std::string kErrorFuncName;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/backup_engine.h b/src/rocksdb/include/rocksdb/utilities/backup_engine.h
new file mode 100644
index 000000000..f28ad9618
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/backup_engine.h
@@ -0,0 +1,631 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The default DB file checksum function name.
+constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c";
+// The default BackupEngine file checksum function name.
+constexpr char kBackupFileChecksumFuncName[] = "crc32c";
+
+struct BackupEngineOptions {
+  // Where to keep the backup files. Has to be different than dbname_
+  // Best to set this to dbname_ + "/backups"
+  // Required
+  std::string backup_dir;
+
+  // Backup Env object. It will be used for backup file I/O. If it's
+  // nullptr, backups will be written out using DBs Env. If it's
+  // non-nullptr, backup's I/O will be performed using this object.
+  // Default: nullptr
+  Env* backup_env;
+
+  // share_table_files supports table and blob files.
+  //
+  // If share_table_files == true, the backup directory will share table and
+  // blob files among backups, to save space among backups of the same DB and to
+  // enable incremental backups by only copying new files.
+  // If share_table_files == false, each backup will be on its own and will not
+  // share any data with other backups.
+  //
+  // default: true
+  bool share_table_files;
+
+  // Backup info and error messages will be written to info_log
+  // if non-nullptr.
+  // Default: nullptr
+  Logger* info_log;
+
+  // If sync == true, we can guarantee you'll get consistent backup and
+  // restore even on a machine crash/reboot. Backup and restore processes are
+  // slower with sync enabled. If sync == false, we can only guarantee that
+  // other previously synced backups and restores are not modified while
+  // creating a new one.
+  // Default: true
+  bool sync;
+
+  // If true, it will delete whatever backups there are already
+  // Default: false
+  bool destroy_old_data;
+
+  // If false, we won't backup log files. This option can be useful for backing
+  // up in-memory databases where log file are persisted, but table files are in
+  // memory.
+  // Default: true
+  bool backup_log_files;
+
+  // Max bytes that can be transferred in a second during backup.
+  // If 0, go as fast as you can
+  // This limit only applies to writes. To also limit reads,
+  // a rate limiter able to also limit reads (e.g, its mode = kAllIo)
+  // have to be passed in through the option "backup_rate_limiter"
+  // Default: 0
+  uint64_t backup_rate_limit;
+
+  // Backup rate limiter. Used to control transfer speed for backup. If this is
+  // not null, backup_rate_limit is ignored.
+  // Default: nullptr
+  std::shared_ptr<RateLimiter> backup_rate_limiter{nullptr};
+
+  // Max bytes that can be transferred in a second during restore.
+  // If 0, go as fast as you can
+  // This limit only applies to writes. To also limit reads,
+  // a rate limiter able to also limit reads (e.g, its mode = kAllIo)
+  // have to be passed in through the option "restore_rate_limiter"
+  // Default: 0
+  uint64_t restore_rate_limit;
+
+  // Restore rate limiter. Used to control transfer speed during restore. If
+  // this is not null, restore_rate_limit is ignored.
+  // Default: nullptr
+  std::shared_ptr<RateLimiter> restore_rate_limiter{nullptr};
+
+  // share_files_with_checksum supports table and blob files.
+  //
+  // Only used if share_table_files is set to true. Setting to false is
+  // DEPRECATED and potentially dangerous because in that case BackupEngine
+  // can lose data if backing up databases with distinct or divergent
+  // history, for example if restoring from a backup other than the latest,
+  // writing to the DB, and creating another backup. Setting to true (default)
+  // prevents these issues by ensuring that different table files (SSTs) and
+  // blob files with the same number are treated as distinct. See
+  // share_files_with_checksum_naming and ShareFilesNaming.
+  //
+  // Default: true
+  bool share_files_with_checksum;
+
+  // Up to this many background threads will copy files for CreateNewBackup()
+  // and RestoreDBFromBackup()
+  // Default: 1
+  int max_background_operations;
+
+  // During backup user can get callback every time next
+  // callback_trigger_interval_size bytes being copied.
+  // Default: 4194304
+  uint64_t callback_trigger_interval_size;
+
+  // For BackupEngineReadOnly, Open() will open at most this many of the
+  // latest non-corrupted backups.
+  //
+  // Note: this setting is ignored (behaves like INT_MAX) for any kind of
+  // writable BackupEngine because it would inhibit accounting for shared
+  // files for proper backup deletion, including purging any incompletely
+  // created backups on creation of a new backup.
+  //
+  // Default: INT_MAX
+  int max_valid_backups_to_open;
+
+  // ShareFilesNaming describes possible naming schemes for backup
+  // table and blob file names when they are stored in the
+  // shared_checksum directory (i.e., both share_table_files and
+  // share_files_with_checksum are true).
+  enum ShareFilesNaming : uint32_t {
+    // Backup blob filenames are <file_number>_<crc32c>_<file_size>.blob and
+    // backup SST filenames are <file_number>_<crc32c>_<file_size>.sst
+    // where <crc32c> is an unsigned decimal integer. This is the
+    // original/legacy naming scheme for share_files_with_checksum,
+    // with two problems:
+    // * At massive scale, collisions on this triple with different file
+    //   contents is plausible.
+    // * Determining the name to use requires computing the checksum,
+    //   so generally requires reading the whole file even if the file
+    //   is already backed up.
+    //
+    // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR **
+    kLegacyCrc32cAndFileSize = 1U,
+
+    // Backup SST filenames are <file_number>_s<db_session_id>.sst. This
+    // pair of values should be very strongly unique for a given SST file
+    // and easily determined before computing a checksum. The 's' indicates
+    // the value is a DB session id, not a checksum.
+    //
+    // Exceptions:
+    // * For blob files, kLegacyCrc32cAndFileSize is used as currently
+    //   db_session_id is not supported by the blob file format.
+    // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize
+    //   will be used instead, matching the names assigned by RocksDB versions
+    //   not supporting the newer naming scheme.
+    // * See also flags below.
+    kUseDbSessionId = 2U,
+
+    kMaskNoNamingFlags = 0xffffU,
+
+    // If not already part of the naming scheme, insert
+    //   _<file_size>
+    // before .sst and .blob in the name. In case of user code actually parsing
+    // the last _<whatever> before the .sst  and .blob as the file size, this
+    // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this
+    // option makes official that unofficial feature of the backup metadata.
+    //
+    // We do not consider SST and blob file sizes to have sufficient entropy to
+    // contribute significantly to naming uniqueness.
+    kFlagIncludeFileSize = 1U << 31,
+
+    kMaskNamingFlags = ~kMaskNoNamingFlags,
+  };
+
+  // Naming option for share_files_with_checksum table and blob files. See
+  // ShareFilesNaming for details.
+  //
+  // Modifying this option cannot introduce a downgrade compatibility issue
+  // because RocksDB can read, restore, and delete backups using different file
+  // names, and it's OK for a backup directory to use a mixture of table and
+  // blob files naming schemes.
+  //
+  // However, modifying this option and saving more backups to the same
+  // directory can lead to the same file getting saved again to that
+  // directory, under the new shared name in addition to the old shared
+  // name.
+  //
+  // Default: kUseDbSessionId | kFlagIncludeFileSize
+  //
+  // Note: This option comes into effect only if both share_files_with_checksum
+  // and share_table_files are true.
+  ShareFilesNaming share_files_with_checksum_naming;
+
+  // Major schema version to use when writing backup meta files
+  // 1 (default) - compatible with very old versions of RocksDB.
+  // 2 - can be read by RocksDB versions >= 6.19.0. Minimum schema version for
+  //   * (Experimental) saving and restoring file temperature metadata
+  int schema_version = 1;
+
+  // (Experimental - subject to change or removal) When taking a backup and
+  // saving file temperature info (minimum schema_version is 2), there are
+  // two potential sources of truth for the placement of files into temperature
+  // tiers: (a) the current file temperature reported by the FileSystem or
+  // (b) the expected file temperature recorded in DB manifest. When this
+  // option is false (default), (b) overrides (a) if both are not UNKNOWN.
+  // When true, (a) overrides (b) if both are not UNKNOWN. Regardless of this
+  // setting, a known temperature overrides UNKNOWN.
+  bool current_temperatures_override_manifest = false;
+
+  void Dump(Logger* logger) const;
+
+  explicit BackupEngineOptions(
+      const std::string& _backup_dir, Env* _backup_env = nullptr,
+      bool _share_table_files = true, Logger* _info_log = nullptr,
+      bool _sync = true, bool _destroy_old_data = false,
+      bool _backup_log_files = true, uint64_t _backup_rate_limit = 0,
+      uint64_t _restore_rate_limit = 0, int _max_background_operations = 1,
+      uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024,
+      int _max_valid_backups_to_open = INT_MAX,
+      ShareFilesNaming _share_files_with_checksum_naming =
+          static_cast<ShareFilesNaming>(kUseDbSessionId | kFlagIncludeFileSize))
+      : backup_dir(_backup_dir),
+        backup_env(_backup_env),
+        share_table_files(_share_table_files),
+        info_log(_info_log),
+        sync(_sync),
+        destroy_old_data(_destroy_old_data),
+        backup_log_files(_backup_log_files),
+        backup_rate_limit(_backup_rate_limit),
+        restore_rate_limit(_restore_rate_limit),
+        share_files_with_checksum(true),
+        max_background_operations(_max_background_operations),
+        callback_trigger_interval_size(_callback_trigger_interval_size),
+        max_valid_backups_to_open(_max_valid_backups_to_open),
+        share_files_with_checksum_naming(_share_files_with_checksum_naming) {
+    assert(share_table_files || !share_files_with_checksum);
+    assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0);
+  }
+};
+
+inline BackupEngineOptions::ShareFilesNaming operator&(
+    BackupEngineOptions::ShareFilesNaming lhs,
+    BackupEngineOptions::ShareFilesNaming rhs) {
+  uint32_t l = static_cast<uint32_t>(lhs);
+  uint32_t r = static_cast<uint32_t>(rhs);
+  assert(r == BackupEngineOptions::kMaskNoNamingFlags ||
+         (r & BackupEngineOptions::kMaskNoNamingFlags) == 0);
+  return static_cast<BackupEngineOptions::ShareFilesNaming>(l & r);
+}
+
+inline BackupEngineOptions::ShareFilesNaming operator|(
+    BackupEngineOptions::ShareFilesNaming lhs,
+    BackupEngineOptions::ShareFilesNaming rhs) {
+  uint32_t l = static_cast<uint32_t>(lhs);
+  uint32_t r = static_cast<uint32_t>(rhs);
+  assert((r & BackupEngineOptions::kMaskNoNamingFlags) == 0);
+  return static_cast<BackupEngineOptions::ShareFilesNaming>(l | r);
+}
+
+struct CreateBackupOptions {
+  // Flush will always trigger if 2PC is enabled.
+  // If write-ahead logs are disabled, set flush_before_backup=true to
+  // avoid losing unflushed key/value pairs from the memtable.
+  bool flush_before_backup = false;
+
+  // Callback for reporting progress, based on callback_trigger_interval_size.
+  //
+  // RocksDB callbacks are NOT exception-safe. A callback completing with an
+  // exception can lead to undefined behavior in RocksDB, including data loss,
+  // unreported corruption, deadlocks, and more.
+  std::function<void()> progress_callback = []() {};
+
+  // If false, background_thread_cpu_priority is ignored.
+  // Otherwise, the cpu priority can be decreased,
+  // if you try to increase the priority, the priority will not change.
+  // The initial priority of the threads is CpuPriority::kNormal,
+  // so you can decrease to priorities lower than kNormal.
+  bool decrease_background_thread_cpu_priority = false;
+  CpuPriority background_thread_cpu_priority = CpuPriority::kNormal;
+};
+
+struct RestoreOptions {
+  // If true, restore won't overwrite the existing log files in wal_dir. It will
+  // also move all log files from archive directory to wal_dir. Use this option
+  // in combination with BackupEngineOptions::backup_log_files = false for
+  // persisting in-memory databases.
+  // Default: false
+  bool keep_log_files;
+
+  explicit RestoreOptions(bool _keep_log_files = false)
+      : keep_log_files(_keep_log_files) {}
+};
+
+using BackupID = uint32_t;
+
+using BackupFileInfo = FileStorageInfo;
+
+struct BackupInfo {
+  BackupID backup_id = 0U;
+  // Creation time, according to GetCurrentTime
+  int64_t timestamp = 0;
+
+  // Total size in bytes (based on file payloads, not including filesystem
+  // overheads or backup meta file)
+  uint64_t size = 0U;
+
+  // Number of backed up files, some of which might be shared with other
+  // backups. Does not include backup meta file.
+  uint32_t number_files = 0U;
+
+  // Backup API user metadata
+  std::string app_metadata;
+
+  // Backup file details, if requested with include_file_details=true
+  std::vector<BackupFileInfo> file_details;
+
+  // DB "name" (a directory in the backup_env) for opening this backup as a
+  // read-only DB. This should also be used as the DBOptions::wal_dir, such
+  // as by default setting wal_dir="". See also env_for_open.
+  // This field is only set if include_file_details=true
+  std::string name_for_open;
+
+  // An Env(+FileSystem) for opening this backup as a read-only DB, with
+  // DB::OpenForReadOnly or similar. This field is only set if
+  // include_file_details=true. (The FileSystem in this Env takes care
+  // of making shared backup files openable from the `name_for_open` DB
+  // directory.) See also name_for_open.
+  //
+  // This Env might or might not be shared with other backups. To work
+  // around DBOptions::env being a raw pointer, this is a shared_ptr so
+  // that keeping either this BackupInfo, the BackupEngine, or a copy of
+  // this shared_ptr alive is sufficient to keep the Env alive for use by
+  // a read-only DB.
+  std::shared_ptr<Env> env_for_open;
+
+  BackupInfo() {}
+
+  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size,
+             uint32_t _number_files, const std::string& _app_metadata)
+      : backup_id(_backup_id),
+        timestamp(_timestamp),
+        size(_size),
+        number_files(_number_files),
+        app_metadata(_app_metadata) {}
+};
+
+class BackupStatistics {
+ public:
+  BackupStatistics() {
+    number_success_backup = 0;
+    number_fail_backup = 0;
+  }
+
+  BackupStatistics(uint32_t _number_success_backup,
+                   uint32_t _number_fail_backup)
+      : number_success_backup(_number_success_backup),
+        number_fail_backup(_number_fail_backup) {}
+
+  ~BackupStatistics() {}
+
+  void IncrementNumberSuccessBackup();
+  void IncrementNumberFailBackup();
+
+  uint32_t GetNumberSuccessBackup() const;
+  uint32_t GetNumberFailBackup() const;
+
+  std::string ToString() const;
+
+ private:
+  uint32_t number_success_backup;
+  uint32_t number_fail_backup;
+};
+
+// Read-only functions of a BackupEngine. (Restore writes to another directory
+// not the backup directory.) See BackupEngine comments for details on
+// safe concurrent operations.
+class BackupEngineReadOnlyBase {
+ public:
+  virtual ~BackupEngineReadOnlyBase() {}
+
+  // Returns info about the latest good backup in backup_info, or NotFound
+  // no good backup exists.
+  // Setting include_file_details=true provides information about each
+  // backed-up file in BackupInfo::file_details and more.
+  virtual Status GetLatestBackupInfo(
+      BackupInfo* backup_info, bool include_file_details = false) const = 0;
+
+  // Returns info about a specific backup in backup_info, or NotFound
+  // or Corruption status if the requested backup id does not exist or is
+  // known corrupt.
+  // Setting include_file_details=true provides information about each
+  // backed-up file in BackupInfo::file_details and more.
+  virtual Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info,
+                               bool include_file_details = false) const = 0;
+
+  // Returns info about non-corrupt backups in backup_infos.
+  // Setting include_file_details=true provides information about each
+  // backed-up file in BackupInfo::file_details and more.
+  virtual void GetBackupInfo(std::vector<BackupInfo>* backup_infos,
+                             bool include_file_details = false) const = 0;
+
+  // Returns info about corrupt backups in corrupt_backups.
+  // WARNING: Any write to the BackupEngine could trigger automatic
+  // GarbageCollect(), which could delete files that would be needed to
+  // manually recover a corrupt backup or to preserve an unrecognized (e.g.
+  // incompatible future version) backup.
+  virtual void GetCorruptedBackups(
+      std::vector<BackupID>* corrupt_backup_ids) const = 0;
+
+  // Restore to specified db_dir and wal_dir from backup_id.
+  virtual IOStatus RestoreDBFromBackup(const RestoreOptions& options,
+                                       BackupID backup_id,
+                                       const std::string& db_dir,
+                                       const std::string& wal_dir) const = 0;
+
+  // keep for backward compatibility.
+  virtual IOStatus RestoreDBFromBackup(
+      BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& options = RestoreOptions()) const {
+    return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir);
+  }
+
+  // Like RestoreDBFromBackup but restores from latest non-corrupt backup_id
+  virtual IOStatus RestoreDBFromLatestBackup(
+      const RestoreOptions& options, const std::string& db_dir,
+      const std::string& wal_dir) const = 0;
+
+  // keep for backward compatibility.
+  virtual IOStatus RestoreDBFromLatestBackup(
+      const std::string& db_dir, const std::string& wal_dir,
+      const RestoreOptions& options = RestoreOptions()) const {
+    return RestoreDBFromLatestBackup(options, db_dir, wal_dir);
+  }
+
+  // If verify_with_checksum is true, this function
+  // inspects the current checksums and file sizes of backup files to see if
+  // they match our expectation.
+  //
+  // If verify_with_checksum is false, this function
+  // checks that each file exists and that the size of the file matches our
+  // expectation. It does not check file checksum.
+  //
+  // If this BackupEngine created the backup, it compares the files' current
+  // sizes (and current checksum) against the number of bytes written to
+  // them (and the checksum calculated) during creation.
+  // Otherwise, it compares the files' current sizes (and checksums) against
+  // their sizes (and checksums) when the BackupEngine was opened.
+  //
+  // Returns Status::OK() if all checks are good
+  virtual IOStatus VerifyBackup(BackupID backup_id,
+                                bool verify_with_checksum = false) const = 0;
+};
+
+// Append-only functions of a BackupEngine. See BackupEngine comment for
+// details on distinction between Append and Write operations and safe
+// concurrent operations.
+class BackupEngineAppendOnlyBase {
+ public:
+  virtual ~BackupEngineAppendOnlyBase() {}
+
+  // same as CreateNewBackup, but stores extra application metadata.
+  virtual IOStatus CreateNewBackupWithMetadata(
+      const CreateBackupOptions& options, DB* db,
+      const std::string& app_metadata, BackupID* new_backup_id = nullptr) = 0;
+
+  // keep here for backward compatibility.
+  virtual IOStatus CreateNewBackupWithMetadata(
+      DB* db, const std::string& app_metadata, bool flush_before_backup = false,
+      std::function<void()> progress_callback = []() {}) {
+    CreateBackupOptions options;
+    options.flush_before_backup = flush_before_backup;
+    options.progress_callback = progress_callback;
+    return CreateNewBackupWithMetadata(options, db, app_metadata);
+  }
+
+  // Captures the state of the database by creating a new (latest) backup.
+  // On success (OK status), the BackupID of the new backup is saved to
+  // *new_backup_id when not nullptr.
+  // NOTE: db_paths and cf_paths are not supported for creating backups,
+  // and NotSupported will be returned when the DB (without WALs) uses more
+  // than one directory.
+  virtual IOStatus CreateNewBackup(const CreateBackupOptions& options, DB* db,
+                                   BackupID* new_backup_id = nullptr) {
+    return CreateNewBackupWithMetadata(options, db, "", new_backup_id);
+  }
+
+  // keep here for backward compatibility.
+  virtual IOStatus CreateNewBackup(
+      DB* db, bool flush_before_backup = false,
+      std::function<void()> progress_callback = []() {}) {
+    CreateBackupOptions options;
+    options.flush_before_backup = flush_before_backup;
+    options.progress_callback = progress_callback;
+    return CreateNewBackup(options, db);
+  }
+
+  // Call this from another thread if you want to stop the backup
+  // that is currently happening. It will return immediately, will
+  // not wait for the backup to stop.
+  // The backup will stop ASAP and the call to CreateNewBackup will
+  // return Status::Incomplete(). It will not clean up after itself, but
+  // the state will remain consistent. The state will be cleaned up the
+  // next time you call CreateNewBackup or GarbageCollect.
+  virtual void StopBackup() = 0;
+
+  // Will delete any files left over from incomplete creation or deletion of
+  // a backup. This is not normally needed as those operations also clean up
+  // after prior incomplete calls to the same kind of operation (create or
+  // delete). This does not delete corrupt backups but can delete files that
+  // would be needed to manually recover a corrupt backup or to preserve an
+  // unrecognized (e.g. incompatible future version) backup.
+  // NOTE: This is not designed to delete arbitrary files added to the backup
+  // directory outside of BackupEngine, and clean-up is always subject to
+  // permissions on and availability of the underlying filesystem.
+  // NOTE2: For concurrency and interference purposes (see BackupEngine
+  // comment), GarbageCollect (GC) is like other Append operations, even
+  // though it seems different. Although GC can delete physical data, it does
+  // not delete any logical data read by Read operations. GC can interfere
+  // with Append or Write operations in another BackupEngine on the same
+  // backup_dir, because temporary files will be treated as obsolete and
+  // deleted.
+  virtual IOStatus GarbageCollect() = 0;
+};
+
+// A backup engine for organizing and managing backups.
+// This class is not user-extensible.
+//
+// This class declaration adds "Write" operations in addition to the
+// operations from BackupEngineAppendOnlyBase and BackupEngineReadOnlyBase.
+//
+// # Concurrency between threads on the same BackupEngine* object
+//
+// As of version 6.20, BackupEngine* operations are generally thread-safe,
+// using a read-write lock, though single-thread operation is still
+// recommended to avoid TOCTOU bugs. Specifically, particular kinds of
+// concurrent operations behave like this:
+//
+// op1\op2| Read  | Append | Write
+// -------|-------|--------|--------
+//   Read | conc  | block  | block
+// Append | block | block  | block
+//  Write | block | block  | block
+//
+// conc = operations safely proceed concurrently
+// block = one of the operations safely blocks until the other completes.
+//   There is generally no guarantee as to which completes first.
+//
+// StopBackup is the only operation that affects an ongoing operation.
+//
+// # Interleaving operations between BackupEngine* objects open on the
+// same backup_dir
+//
+// It is recommended only to have one BackupEngine* object open for a given
+// backup_dir, but it is possible to mix / interleave some operations
+// (regardless of whether they are concurrent) with these caveats:
+//
+// op1\op2|  Open  |  Read  | Append | Write
+// -------|--------|--------|--------|--------
+//   Open | conc   | conc   | atomic | unspec
+//   Read | conc   | conc   | old    | unspec
+// Append | atomic | old    | unspec | unspec
+//  Write | unspec | unspec | unspec | unspec
+//
+// Special case: Open with destroy_old_data=true is really a Write
+//
+// conc = operations safely proceed, concurrently when applicable
+// atomic = operations are effectively atomic; if a concurrent Append
+//   operation has not completed at some key point during Open, the
+//   opened BackupEngine* will never see the result of the Append op.
+// old = Read operations do not include any state changes from other
+//   BackupEngine* objects; they return the state at their Open time.
+// unspec = Behavior is unspecified, including possibly trashing the
+//   backup_dir, but is "memory safe" (no C++ undefined behavior)
+//
+class BackupEngine : public BackupEngineReadOnlyBase,
+                     public BackupEngineAppendOnlyBase {
+ public:
+  virtual ~BackupEngine() {}
+
+  // BackupEngineOptions have to be the same as the ones used in previous
+  // BackupEngines for the same backup directory.
+  static IOStatus Open(const BackupEngineOptions& options, Env* db_env,
+                       BackupEngine** backup_engine_ptr);
+
+  // keep for backward compatibility.
+  static IOStatus Open(Env* db_env, const BackupEngineOptions& options,
+                       BackupEngine** backup_engine_ptr) {
+    return BackupEngine::Open(options, db_env, backup_engine_ptr);
+  }
+
+  // Deletes old backups, keeping latest num_backups_to_keep alive.
+  // See also DeleteBackup.
+  virtual IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
+
+  // Deletes a specific backup. If this operation (or PurgeOldBackups)
+  // is not completed due to crash, power failure, etc. the state
+  // will be cleaned up the next time you call DeleteBackup,
+  // PurgeOldBackups, or GarbageCollect.
+  virtual IOStatus DeleteBackup(BackupID backup_id) = 0;
+};
+
+// A variant of BackupEngine that only allows "Read" operations. See
+// BackupEngine comment for details. This class is not user-extensible.
+class BackupEngineReadOnly : public BackupEngineReadOnlyBase {
+ public:
+  virtual ~BackupEngineReadOnly() {}
+
+  static IOStatus Open(const BackupEngineOptions& options, Env* db_env,
+                       BackupEngineReadOnly** backup_engine_ptr);
+  // keep for backward compatibility.
+  static IOStatus Open(Env* db_env, const BackupEngineOptions& options,
+                       BackupEngineReadOnly** backup_engine_ptr) {
+    return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h b/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h
new file mode 100644
index 000000000..fde03db7e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/cache_dump_load.h
@@ -0,0 +1,142 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <set>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The classes and functions in this header file is used for dumping out the
+// blocks in a block cache, storing or transfering the blocks to another
+// destination host, and load these blocks to the secondary cache at destination
+// host.
+// NOTE that: The classes, functions, and data structures are EXPERIMENTAL! They
+// my be changed in the future when the development continues.
+
+// The major and minor version number of the data format to be stored/trandfered
+// via CacheDumpWriter and read out via CacheDumpReader
+static const int kCacheDumpMajorVersion = 0;
+static const int kCacheDumpMinorVersion = 1;
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is an abstract class to write or transfer the data that is created by
+// CacheDumper. We pack one block with its block type, dump time, block key in
+// the block cache, block len, block crc32c checksum and block itself as a unit
+// and it is stored via WritePacket. Before we call WritePacket, we must call
+// WriteMetadata once, which stores the sequence number, block unit checksum,
+// and block unit size.
+// We provide file based CacheDumpWriter to store the metadata and its package
+// sequentially in a file as the defualt implementation. Users can implement
+// their own CacheDumpWriter to store/transfer the data. For example, user can
+// create a subclass which transfer the metadata and package on the fly.
+class CacheDumpWriter {
+ public:
+  virtual ~CacheDumpWriter() = default;
+
+  // Called ONCE before the calls to WritePacket
+  virtual IOStatus WriteMetadata(const Slice& metadata) = 0;
+  virtual IOStatus WritePacket(const Slice& data) = 0;
+  virtual IOStatus Close() = 0;
+};
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is an abstract class to read or receive the data that is stored
+// or transfered by CacheDumpWriter. Note that, ReadMetadata must be called
+// once before we call a ReadPacket.
+class CacheDumpReader {
+ public:
+  virtual ~CacheDumpReader() = default;
+  // Called ONCE before the calls to ReadPacket
+  virtual IOStatus ReadMetadata(std::string* metadata) = 0;
+  // Sets data to empty string on EOF
+  virtual IOStatus ReadPacket(std::string* data) = 0;
+  // (Close not needed)
+};
+
+// CacheDumpOptions is the option for CacheDumper and CacheDumpedLoader. Any
+// dump or load process related control variables can be added here.
+struct CacheDumpOptions {
+  SystemClock* clock;
+};
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This the class to dump out the block in the block cache, store/transfer them
+// via CacheDumpWriter. In order to dump out the blocks belonging to a certain
+// DB or a list of DB (block cache can be shared by many DB), user needs to call
+// SetDumpFilter to specify a list of DB to filter out the blocks that do not
+// belong to those DB.
+// A typical use case is: when we migrate a DB instance from host A to host B.
+// We need to reopen the DB at host B after all the files are copied to host B.
+// At this moment, the block cache at host B does not have any block from this
+// migrated DB. Therefore, the read performance can be low due to cache warm up.
+// By using CacheDumper before we shut down the DB at host A and using
+// CacheDumpedLoader at host B before we reopen the DB, we can warmup the cache
+// ahead. This function can be used in other use cases also.
+class CacheDumper {
+ public:
+  virtual ~CacheDumper() = default;
+  // Only dump the blocks in the block cache that belong to the DBs in this list
+  virtual Status SetDumpFilter(std::vector<DB*> db_list) {
+    (void)db_list;
+    return Status::NotSupported("SetDumpFilter is not supported");
+  }
+  // The main function to dump out all the blocks that satisfy the filter
+  // condition from block cache to a certain CacheDumpWriter in one shot. This
+  // process may take some time.
+  virtual IOStatus DumpCacheEntriesToWriter() {
+    return IOStatus::NotSupported("DumpCacheEntriesToWriter is not supported");
+  }
+};
+
+// NOTE that: this class is EXPERIMENTAL! May be changed in the future!
+// This is the class to load the dumped blocks to the destination cache. For now
+// we only load the blocks to the SecondaryCache. In the future, we may plan to
+// support loading to the block cache.
+class CacheDumpedLoader {
+ public:
+  virtual ~CacheDumpedLoader() = default;
+  virtual IOStatus RestoreCacheEntriesToSecondaryCache() {
+    return IOStatus::NotSupported(
+        "RestoreCacheEntriesToSecondaryCache is not supported");
+  }
+};
+
+// Get the writer which stores all the metadata and data sequentially to a file
+IOStatus NewToFileCacheDumpWriter(const std::shared_ptr<FileSystem>& fs,
+                                  const FileOptions& file_opts,
+                                  const std::string& file_name,
+                                  std::unique_ptr<CacheDumpWriter>* writer);
+
+// Get the reader which read out the metadata and data sequentially from a file
+IOStatus NewFromFileCacheDumpReader(const std::shared_ptr<FileSystem>& fs,
+                                    const FileOptions& file_opts,
+                                    const std::string& file_name,
+                                    std::unique_ptr<CacheDumpReader>* reader);
+
+// Get the default cache dumper
+Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options,
+                             const std::shared_ptr<Cache>& cache,
+                             std::unique_ptr<CacheDumpWriter>&& writer,
+                             std::unique_ptr<CacheDumper>* cache_dumper);
+
+// Get the default cache dump loader
+Status NewDefaultCacheDumpedLoader(
+    const CacheDumpOptions& dump_options,
+    const BlockBasedTableOptions& toptions,
+    const std::shared_ptr<SecondaryCache>& secondary_cache,
+    std::unique_ptr<CacheDumpReader>&& reader,
+    std::unique_ptr<CacheDumpedLoader>* cache_dump_loader);
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/checkpoint.h b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
new file mode 100644
index 000000000..ecf920616
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// A checkpoint is an openable snapshot of a database at a point in time.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+class ColumnFamilyHandle;
+struct LiveFileMetaData;
+struct ExportImportFilesMetaData;
+
+class Checkpoint {
+ public:
+  // Creates a Checkpoint object to be used for creating openable snapshots
+  static Status Create(DB* db, Checkpoint** checkpoint_ptr);
+
+  // Builds an openable snapshot of RocksDB. checkpoint_dir should contain an
+  // absolute path. The specified directory should not exist, since it will be
+  // created by the API.
+  // When a checkpoint is created,
+  // (1) SST and blob files are hard linked if the output directory is on the
+  // same filesystem as the database, and copied otherwise.
+  // (2) other required files (like MANIFEST) are always copied.
+  // log_size_for_flush: if the total log file size is equal or larger than
+  // this value, then a flush is triggered for all the column families. The
+  // default value is 0, which means flush is always triggered. If you move
+  // away from the default, the checkpoint may not contain up-to-date data
+  // if WAL writing is not always enabled.
+  // Flush will always trigger if it is 2PC.
+  // sequence_number_ptr: if it is not nullptr, the value it points to will be
+  // set to a sequence number guaranteed to be part of the DB, not necessarily
+  // the latest. The default value of this parameter is nullptr.
+  // NOTE: db_paths and cf_paths are not supported for creating checkpoints
+  // and NotSupported will be returned when the DB (without WALs) uses more
+  // than one directory.
+  virtual Status CreateCheckpoint(const std::string& checkpoint_dir,
+                                  uint64_t log_size_for_flush = 0,
+                                  uint64_t* sequence_number_ptr = nullptr);
+
+  // Exports all live SST files of a specified Column Family onto export_dir,
+  // returning SST files information in metadata.
+  // - SST files will be created as hard links when the directory specified
+  //   is in the same partition as the db directory, copied otherwise.
+  // - export_dir should not already exist and will be created by this API.
+  // - Always triggers a flush.
+  virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
+                                    const std::string& export_dir,
+                                    ExportImportFilesMetaData** metadata);
+
+  virtual ~Checkpoint() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/convenience.h b/src/rocksdb/include/rocksdb/utilities/convenience.h
new file mode 100644
index 000000000..f61afd69e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/convenience.h
@@ -0,0 +1,10 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// This file was moved to rocksdb/convenience.h"
+
+#include "rocksdb/convenience.h"
diff --git a/src/rocksdb/include/rocksdb/utilities/customizable_util.h b/src/rocksdb/include/rocksdb/utilities/customizable_util.h
new file mode 100644
index 000000000..62240763b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/customizable_util.h
@@ -0,0 +1,377 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// The methods in this file are used to instantiate new Customizable
+// instances of objects.  These methods are most typically used by
+// the "CreateFromString" method of a customizable class.
+// If not developing a new Type of customizable class, you probably
+// do not need the methods in this file.
+//
+// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects
+// for more information on how to develop and use customizable objects
+
+#pragma once
+#include <functional>
+#include <memory>
+#include <unordered_map>
+
+#include "options/configurable_helper.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The FactoryFunc functions are used to create a new customizable object
+// without going through the ObjectRegistry.  This methodology is especially
+// useful in LITE mode, where there is no ObjectRegistry.  The methods take
+// in an ID of the object to create and a pointer to store the created object.
+// If the factory successfully recognized the input ID, the method should return
+// success; otherwise false should be returned.  On success, the object
+// parameter contains the new object.
+template <typename T>
+using SharedFactoryFunc =
+    std::function<bool(const std::string&, std::shared_ptr<T>*)>;
+
+template <typename T>
+using UniqueFactoryFunc =
+    std::function<bool(const std::string&, std::unique_ptr<T>*)>;
+
+template <typename T>
+using StaticFactoryFunc = std::function<bool(const std::string&, T**)>;
+
+// Creates a new shared customizable instance object based on the
+// input parameters using the object registry.
+//
+// The id parameter specifies the instance class of the object to create.
+// The opt_map parameter specifies the configuration of the new instance.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param id The identifier of the new object being created.  This string
+// will be used by the object registry to locate the appropriate object to
+// create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+// created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewSharedObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    std::shared_ptr<T>* result) {
+  if (!id.empty()) {
+    Status status;
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->NewSharedObject(id, result);
+#else
+    status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif  // ROCKSDB_LITE
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      status = Status::OK();
+    } else if (status.ok()) {
+      status = Customizable::ConfigureNewObject(config_options, result->get(),
+                                                opt_map);
+    }
+    return status;
+  } else if (opt_map.empty()) {
+    // There was no ID and no map (everything empty), so reset/clear the result
+    result->reset();
+    return Status::OK();
+  } else {
+    return Status::NotSupported("Cannot reset object ");
+  }
+}
+
+// Creates a new managed customizable instance object based on the
+// input parameters using the object registry.  Unlike "shared" objects,
+// managed objects are limited to a single instance per ID.
+//
+// The id parameter specifies the instance class of the object to create.
+// If an object with this id exists in the registry, the existing object
+// will be returned.  If the object does not exist, a new one will be created.
+//
+// The opt_map parameter specifies the configuration of the new instance.
+// If the object already exists, the existing object is returned "as is" and
+// this parameter is ignored.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param id The identifier of the object.  This string
+// will be used by the object registry to locate the appropriate object to
+// create or return.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+// created object
+// @param result The managed instance.
+template <typename T>
+static Status NewManagedObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    std::shared_ptr<T>* result) {
+  Status status;
+  if (!id.empty()) {
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->GetOrCreateManagedObject<T>(
+        id, result, [config_options, opt_map](T* object) {
+          return object->ConfigureFromMap(config_options, opt_map);
+        });
+#else
+    (void)result;
+    (void)opt_map;
+    status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif  // ROCKSDB_LITE
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      return Status::OK();
+    }
+  } else {
+    status = Status::NotSupported("Cannot reset object ");
+  }
+  return status;
+}
+
+// Creates a new shared Customizable object based on the input parameters.
+// This method parses the input value to determine the type of instance to
+// create. If there is an existing instance (in result) and it is the same ID
+// as the object being created, the existing configuration is stored and used as
+// the default for the new object.
+//
+// The value parameter specified the instance class of the object to create.
+// If it is a simple string (e.g. BlockBasedTable), then the instance will be
+// created using the default settings.  If the value is a set of name-value
+// pairs, then the "id" value is used to determine the instance to create and
+// the remaining parameters are used to configure the object.  Id name-value
+// pairs are specified, there should be an "id=value" pairing or an error may
+// result.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param value Either the simple name of the instance to create, or a set of
+// name-value pairs to create and initailize the object
+// @param func  Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadSharedObject(const ConfigOptions& config_options,
+                               const std::string& value,
+                               const SharedFactoryFunc<T>& func,
+                               std::shared_ptr<T>* result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (func == nullptr ||
+             !func(id, result)) {  // No factory, or it failed
+    return NewSharedObject(config_options, id, opt_map, result);
+  } else {
+    return Customizable::ConfigureNewObject(config_options, result->get(),
+                                            opt_map);
+  }
+}
+
+// Creates a new shared Customizable object based on the input parameters.
+//
+// The value parameter specified the instance class of the object to create.
+// If it is a simple string (e.g. BlockBasedTable), then the instance will be
+// created using the default settings.  If the value is a set of name-value
+// pairs, then the "id" value is used to determine the instance to create and
+// the remaining parameters are used to configure the object.  Id name-value
+// pairs are specified, there should be an "id=value" pairing or an error may
+// result.
+//
+// The "id" field from the value (either the whole field or "id=XX") is used
+// to determine the type/id of the object to return.  For a given id, there
+// the same instance of the object will be returned from this method (as opposed
+// to LoadSharedObject which would create different objects for the same id.
+//
+// The config_options parameter controls the process and how errors are
+// returned. If ignore_unknown_options=true, unknown values are ignored during
+// the configuration. If ignore_unsupported_options=true, unknown instance types
+// are ignored. If invoke_prepare_options=true, the resulting instance will be
+// initialized (via PrepareOptions)
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param value Either the simple name of the instance to create, or a set of
+// name-value pairs to create and initailize the object
+// @param func  Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadManagedObject(const ConfigOptions& config_options,
+                                const std::string& value,
+                                std::shared_ptr<T>* result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, nullptr, value,
+                                              &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (value.empty()) {  // No Id and no options.  Clear the object
+    *result = nullptr;
+    return Status::OK();
+  } else {
+    return NewManagedObject(config_options, id, opt_map, result);
+  }
+}
+
+// Creates a new unique pointer customizable instance object based on the
+// input parameters using the object registry.
+// @see NewSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param id The identifier of the new object being created.  This string
+// will be used by the object registry to locate the appropriate object to
+// create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+// created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewUniqueObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    std::unique_ptr<T>* result) {
+  if (!id.empty()) {
+    Status status;
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->NewUniqueObject(id, result);
+#else
+    status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif  // ROCKSDB_LITE
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      status = Status::OK();
+    } else if (status.ok()) {
+      status = Customizable::ConfigureNewObject(config_options, result->get(),
+                                                opt_map);
+    }
+    return status;
+  } else if (opt_map.empty()) {
+    // There was no ID and no map (everything empty), so reset/clear the result
+    result->reset();
+    return Status::OK();
+  } else {
+    return Status::NotSupported("Cannot reset object ");
+  }
+}
+
+// Creates a new unique customizable instance object based on the input
+// parameters.
+// @see LoadSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param value Either the simple name of the instance to create, or a set of
+// name-value pairs to create and initailize the object
+// @param func  Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadUniqueObject(const ConfigOptions& config_options,
+                               const std::string& value,
+                               const UniqueFactoryFunc<T>& func,
+                               std::unique_ptr<T>* result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (func == nullptr ||
+             !func(id, result)) {  // No factory, or it failed
+    return NewUniqueObject(config_options, id, opt_map, result);
+  } else {
+    return Customizable::ConfigureNewObject(config_options, result->get(),
+                                            opt_map);
+  }
+}
+
+// Creates a new static (raw pointer) customizable instance object based on the
+// input parameters using the object registry.
+// @see NewSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param id The identifier of the new object being created.  This string
+// will be used by the object registry to locate the appropriate object to
+// create.
+// @param opt_map Optional name-value pairs of properties to set for the newly
+// created object
+// @param result The newly created and configured instance.
+template <typename T>
+static Status NewStaticObject(
+    const ConfigOptions& config_options, const std::string& id,
+    const std::unordered_map<std::string, std::string>& opt_map, T** result) {
+  if (!id.empty()) {
+    Status status;
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->NewStaticObject(id, result);
+#else
+    status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif  // ROCKSDB_LITE
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      status = Status::OK();
+    } else if (status.ok()) {
+      status =
+          Customizable::ConfigureNewObject(config_options, *result, opt_map);
+    }
+    return status;
+  } else if (opt_map.empty()) {
+    // There was no ID and no map (everything empty), so reset/clear the result
+    *result = nullptr;
+    return Status::OK();
+  } else {
+    return Status::NotSupported("Cannot reset object ");
+  }
+}
+
+// Creates a new static (raw pointer) customizable instance object based on the
+// input parameters.
+// @see LoadSharedObject for more information on the inner workings of this
+// method.
+//
+// @param config_options Controls how the instance is created and errors are
+// handled
+// @param value Either the simple name of the instance to create, or a set of
+// name-value pairs to create and initailize the object
+// @param func  Optional function to call to attempt to create an instance
+// @param result The newly created instance.
+template <typename T>
+static Status LoadStaticObject(const ConfigOptions& config_options,
+                               const std::string& value,
+                               const StaticFactoryFunc<T>& func, T** result) {
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, *result, value,
+                                              &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (func == nullptr ||
+             !func(id, result)) {  // No factory, or it failed
+    return NewStaticObject(config_options, id, opt_map, result);
+  } else {
+    return Customizable::ConfigureNewObject(config_options, *result, opt_map);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/db_ttl.h b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
new file mode 100644
index 000000000..d57e7473a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/db_ttl.h
@@ -0,0 +1,72 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Database with TTL support.
+//
+// USE-CASES:
+// This API should be used to open the db when key-values inserted are
+//  meant to be removed from the db in a non-strict 'ttl' amount of time
+//  Therefore, this guarantees that key-values inserted will remain in the
+//  db for >= ttl amount of time and the db will make efforts to remove the
+//  key-values as soon as possible after ttl seconds of their insertion.
+//
+// BEHAVIOUR:
+// TTL is accepted in seconds
+// (int32_t)Timestamp(creation) is suffixed to values in Put internally
+// Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
+// Get/Iterator may return expired entries(compaction not run on them yet)
+// Different TTL may be used during different Opens
+// Example: Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
+//          Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
+// read_only=true opens in the usual read-only mode. Compactions will not be
+//  triggered(neither manual nor automatic), so no expired entries removed
+//
+// CONSTRAINTS:
+// Not specifying/passing or non-positive TTL behaves like TTL = infinity
+//
+// !!!WARNING!!!:
+// Calling DB::Open directly to re-open a db created by this API will get
+//  corrupt values(timestamp suffixed) and no ttl effect will be there
+//  during the second Open, so use this API consistently to open the db
+// Be careful when passing ttl with a small positive value because the
+//  whole database may be deleted in a small amount of time
+
+class DBWithTTL : public StackableDB {
+ public:
+  virtual Status CreateColumnFamilyWithTtl(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      ColumnFamilyHandle** handle, int ttl) = 0;
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DBWithTTL** dbptr, int32_t ttl = 0,
+                     bool read_only = false);
+
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     DBWithTTL** dbptr, const std::vector<int32_t>& ttls,
+                     bool read_only = false);
+
+  virtual void SetTtl(int32_t ttl) = 0;
+
+  virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0;
+
+ protected:
+  explicit DBWithTTL(DB* db) : StackableDB(db) {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/debug.h b/src/rocksdb/include/rocksdb/utilities/debug.h
new file mode 100644
index 000000000..0e0526557
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/debug.h
@@ -0,0 +1,48 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Data associated with a particular version of a key. A database may internally
+// store multiple versions of a same user key due to snapshots, compaction not
+// happening yet, etc.
+struct KeyVersion {
+  KeyVersion() : user_key(""), value(""), sequence(0), type(0) {}
+
+  KeyVersion(const std::string& _user_key, const std::string& _value,
+             SequenceNumber _sequence, int _type)
+      : user_key(_user_key), value(_value), sequence(_sequence), type(_type) {}
+
+  std::string user_key;
+  std::string value;
+  SequenceNumber sequence;
+  int type;
+  std::string GetTypeName() const;
+};
+
+// Returns listing of all versions of keys in the provided user key range.
+// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`], or
+// `max_num_ikeys` has been reached. Since all those keys returned will be
+// copied to memory, if the range covers too many keys, the memory usage
+// may be huge. `max_num_ikeys` can be used to cap the memory usage.
+// The result is inserted into the provided vector, `key_versions`.
+Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+                         size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions);
+
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
+                         Slice end_key, size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions);
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/env_mirror.h b/src/rocksdb/include/rocksdb/utilities/env_mirror.h
new file mode 100644
index 000000000..ffde5effa
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/env_mirror.h
@@ -0,0 +1,181 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015, Red Hat, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// MirrorEnv is an Env implementation that mirrors all file-related
+// operations to two backing Env's (provided at construction time).
+// Writes are mirrored.  For read operations, we do the read from both
+// backends and assert that the results match.
+//
+// This is useful when implementing a new Env and ensuring that the
+// semantics and behavior are correct (in that they match that of an
+// existing, stable Env, like the default POSIX one).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SequentialFileMirror;
+class RandomAccessFileMirror;
+class WritableFileMirror;
+
+class EnvMirror : public EnvWrapper {
+  Env *a_, *b_;
+  bool free_a_, free_b_;
+
+ public:
+  EnvMirror(Env* a, Env* b, bool free_a = false, bool free_b = false)
+      : EnvWrapper(a), a_(a), b_(b), free_a_(free_a), free_b_(free_b) {}
+  ~EnvMirror() {
+    if (free_a_) delete a_;
+    if (free_b_) delete b_;
+  }
+
+  Status NewSequentialFile(const std::string& f,
+                           std::unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) override;
+  Status NewRandomAccessFile(const std::string& f,
+                             std::unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) override;
+  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) override;
+  Status ReuseWritableFile(const std::string& fname,
+                           const std::string& old_fname,
+                           std::unique_ptr<WritableFile>* r,
+                           const EnvOptions& options) override;
+  virtual Status NewDirectory(const std::string& name,
+                              std::unique_ptr<Directory>* result) override {
+    std::unique_ptr<Directory> br;
+    Status as = a_->NewDirectory(name, result);
+    Status bs = b_->NewDirectory(name, &br);
+    assert(as == bs);
+    return as;
+  }
+  Status FileExists(const std::string& f) override {
+    Status as = a_->FileExists(f);
+    Status bs = b_->FileExists(f);
+    assert(as == bs);
+    return as;
+  }
+#if defined(_MSC_VER)
+#pragma warning(push)
+// logical operation on address of string constant
+#pragma warning(disable : 4130)
+#endif
+  Status GetChildren(const std::string& dir,
+                     std::vector<std::string>* r) override {
+    std::vector<std::string> ar, br;
+    Status as = a_->GetChildren(dir, &ar);
+    Status bs = b_->GetChildren(dir, &br);
+    assert(as == bs);
+    std::sort(ar.begin(), ar.end());
+    std::sort(br.begin(), br.end());
+    if (!as.ok() || ar != br) {
+      assert(0 == "getchildren results don't match");
+    }
+    *r = ar;
+    return as;
+  }
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+  Status DeleteFile(const std::string& f) override {
+    Status as = a_->DeleteFile(f);
+    Status bs = b_->DeleteFile(f);
+    assert(as == bs);
+    return as;
+  }
+  Status CreateDir(const std::string& d) override {
+    Status as = a_->CreateDir(d);
+    Status bs = b_->CreateDir(d);
+    assert(as == bs);
+    return as;
+  }
+  Status CreateDirIfMissing(const std::string& d) override {
+    Status as = a_->CreateDirIfMissing(d);
+    Status bs = b_->CreateDirIfMissing(d);
+    assert(as == bs);
+    return as;
+  }
+  Status DeleteDir(const std::string& d) override {
+    Status as = a_->DeleteDir(d);
+    Status bs = b_->DeleteDir(d);
+    assert(as == bs);
+    return as;
+  }
+  Status GetFileSize(const std::string& f, uint64_t* s) override {
+    uint64_t asize, bsize;
+    Status as = a_->GetFileSize(f, &asize);
+    Status bs = b_->GetFileSize(f, &bsize);
+    assert(as == bs);
+    assert(!as.ok() || asize == bsize);
+    *s = asize;
+    return as;
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) override {
+    uint64_t amtime, bmtime;
+    Status as = a_->GetFileModificationTime(fname, &amtime);
+    Status bs = b_->GetFileModificationTime(fname, &bmtime);
+    assert(as == bs);
+    assert(!as.ok() || amtime - bmtime < 10000 || bmtime - amtime < 10000);
+    *file_mtime = amtime;
+    return as;
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) override {
+    Status as = a_->RenameFile(s, t);
+    Status bs = b_->RenameFile(s, t);
+    assert(as == bs);
+    return as;
+  }
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    Status as = a_->LinkFile(s, t);
+    Status bs = b_->LinkFile(s, t);
+    assert(as == bs);
+    return as;
+  }
+
+  class FileLockMirror : public FileLock {
+   public:
+    FileLock *a_, *b_;
+    FileLockMirror(FileLock* a, FileLock* b) : a_(a), b_(b) {}
+  };
+
+  Status LockFile(const std::string& f, FileLock** l) override {
+    FileLock *al, *bl;
+    Status as = a_->LockFile(f, &al);
+    Status bs = b_->LockFile(f, &bl);
+    assert(as == bs);
+    if (as.ok()) *l = new FileLockMirror(al, bl);
+    return as;
+  }
+
+  Status UnlockFile(FileLock* l) override {
+    FileLockMirror* ml = static_cast<FileLockMirror*>(l);
+    Status as = a_->UnlockFile(ml->a_);
+    Status bs = b_->UnlockFile(ml->b_);
+    assert(as == bs);
+    delete ml;
+    return as;
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/info_log_finder.h b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h
new file mode 100644
index 000000000..824f8a3df
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This function can be used to list the Information logs,
+// given the db pointer.
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h
new file mode 100644
index 000000000..007638192
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd.h
@@ -0,0 +1,318 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/ldb_tool.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/ldb_cmd_execute_result.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LDBCommand {
+ public:
+  // Command-line arguments
+  static const std::string ARG_ENV_URI;
+  static const std::string ARG_FS_URI;
+  static const std::string ARG_DB;
+  static const std::string ARG_PATH;
+  static const std::string ARG_SECONDARY_PATH;
+  static const std::string ARG_HEX;
+  static const std::string ARG_KEY_HEX;
+  static const std::string ARG_VALUE_HEX;
+  static const std::string ARG_CF_NAME;
+  static const std::string ARG_TTL;
+  static const std::string ARG_TTL_START;
+  static const std::string ARG_TTL_END;
+  static const std::string ARG_TIMESTAMP;
+  static const std::string ARG_TRY_LOAD_OPTIONS;
+  static const std::string ARG_IGNORE_UNKNOWN_OPTIONS;
+  static const std::string ARG_FROM;
+  static const std::string ARG_TO;
+  static const std::string ARG_MAX_KEYS;
+  static const std::string ARG_BLOOM_BITS;
+  static const std::string ARG_FIX_PREFIX_LEN;
+  static const std::string ARG_COMPRESSION_TYPE;
+  static const std::string ARG_COMPRESSION_MAX_DICT_BYTES;
+  static const std::string ARG_BLOCK_SIZE;
+  static const std::string ARG_AUTO_COMPACTION;
+  static const std::string ARG_DB_WRITE_BUFFER_SIZE;
+  static const std::string ARG_WRITE_BUFFER_SIZE;
+  static const std::string ARG_FILE_SIZE;
+  static const std::string ARG_CREATE_IF_MISSING;
+  static const std::string ARG_NO_VALUE;
+  static const std::string ARG_DISABLE_CONSISTENCY_CHECKS;
+  static const std::string ARG_ENABLE_BLOB_FILES;
+  static const std::string ARG_MIN_BLOB_SIZE;
+  static const std::string ARG_BLOB_FILE_SIZE;
+  static const std::string ARG_BLOB_COMPRESSION_TYPE;
+  static const std::string ARG_ENABLE_BLOB_GARBAGE_COLLECTION;
+  static const std::string ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF;
+  static const std::string ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD;
+  static const std::string ARG_BLOB_COMPACTION_READAHEAD_SIZE;
+  static const std::string ARG_BLOB_FILE_STARTING_LEVEL;
+  static const std::string ARG_PREPOPULATE_BLOB_CACHE;
+  static const std::string ARG_DECODE_BLOB_INDEX;
+  static const std::string ARG_DUMP_UNCOMPRESSED_BLOBS;
+
+  struct ParsedParams {
+    std::string cmd;
+    std::vector<std::string> cmd_params;
+    std::map<std::string, std::string> option_map;
+    std::vector<std::string> flags;
+  };
+
+  static LDBCommand* SelectCommand(const ParsedParams& parsed_parms);
+
+  static LDBCommand* InitFromCmdLineArgs(
+      const std::vector<std::string>& args, const Options& options,
+      const LDBOptions& ldb_options,
+      const std::vector<ColumnFamilyDescriptor>* column_families,
+      const std::function<LDBCommand*(const ParsedParams&)>& selector =
+          SelectCommand);
+
+  static LDBCommand* InitFromCmdLineArgs(
+      int argc, char const* const* argv, const Options& options,
+      const LDBOptions& ldb_options,
+      const std::vector<ColumnFamilyDescriptor>* column_families);
+
+  bool ValidateCmdLineOptions();
+
+  virtual void PrepareOptions();
+
+  virtual void OverrideBaseOptions();
+
+  virtual void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts);
+
+  virtual void SetDBOptions(Options options) { options_ = options; }
+
+  virtual void SetColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>* column_families) {
+    if (column_families != nullptr) {
+      column_families_ = *column_families;
+    } else {
+      column_families_.clear();
+    }
+  }
+
+  void SetLDBOptions(const LDBOptions& ldb_options) {
+    ldb_options_ = ldb_options;
+  }
+
+  const std::map<std::string, std::string>& TEST_GetOptionMap() {
+    return option_map_;
+  }
+
+  const std::vector<std::string>& TEST_GetFlags() { return flags_; }
+
+  virtual bool NoDBOpen() { return false; }
+
+  virtual ~LDBCommand() { CloseDB(); }
+
+  /* Run the command, and return the execute result. */
+  void Run();
+
+  virtual void DoCommand() = 0;
+
+  LDBCommandExecuteResult GetExecuteState() { return exec_state_; }
+
+  void ClearPreviousRunState() { exec_state_.Reset(); }
+
+  // Consider using Slice::DecodeHex directly instead if you don't need the
+  // 0x prefix
+  static std::string HexToString(const std::string& str);
+
+  // Consider using Slice::ToString(true) directly instead if
+  // you don't need the 0x prefix
+  static std::string StringToHex(const std::string& str);
+
+  static const char* DELIM;
+
+ protected:
+  LDBCommandExecuteResult exec_state_;
+  std::string env_uri_;
+  std::string fs_uri_;
+  std::string db_path_;
+  // If empty, open DB as primary. If non-empty, open the DB as secondary
+  // with this secondary path. When running against a database opened by
+  // another process, ldb wll leave the source directory completely intact.
+  std::string secondary_path_;
+  std::string column_family_name_;
+  DB* db_;
+  DBWithTTL* db_ttl_;
+  std::map<std::string, ColumnFamilyHandle*> cf_handles_;
+
+  /**
+   * true implies that this command can work if the db is opened in read-only
+   * mode.
+   */
+  bool is_read_only_;
+
+  /** If true, the key is input/output as hex in get/put/scan/delete etc. */
+  bool is_key_hex_;
+
+  /** If true, the value is input/output as hex in get/put/scan/delete etc. */
+  bool is_value_hex_;
+
+  /** If true, the value is treated as timestamp suffixed */
+  bool is_db_ttl_;
+
+  // If true, the kvs are output with their insert/modify timestamp in a ttl db
+  bool timestamp_;
+
+  // If true, try to construct options from DB's option files.
+  bool try_load_options_;
+
+  // The value passed to options.force_consistency_checks.
+  bool force_consistency_checks_;
+
+  bool enable_blob_files_;
+
+  bool enable_blob_garbage_collection_;
+
+  bool create_if_missing_;
+
+  /**
+   * Map of options passed on the command-line.
+   */
+  const std::map<std::string, std::string> option_map_;
+
+  /**
+   * Flags passed on the command-line.
+   */
+  const std::vector<std::string> flags_;
+
+  /** List of command-line options valid for this command */
+  const std::vector<std::string> valid_cmd_line_options_;
+
+  /** Shared pointer to underlying environment if applicable **/
+  std::shared_ptr<Env> env_guard_;
+
+  bool ParseKeyValue(const std::string& line, std::string* key,
+                     std::string* value, bool is_key_hex, bool is_value_hex);
+
+  LDBCommand(const std::map<std::string, std::string>& options,
+             const std::vector<std::string>& flags, bool is_read_only,
+             const std::vector<std::string>& valid_cmd_line_options);
+
+  void OpenDB();
+
+  void CloseDB();
+
+  ColumnFamilyHandle* GetCfHandle();
+
+  static std::string PrintKeyValue(const std::string& key,
+                                   const std::string& value, bool is_key_hex,
+                                   bool is_value_hex);
+
+  static std::string PrintKeyValue(const std::string& key,
+                                   const std::string& value, bool is_hex);
+
+  /**
+   * Return true if the specified flag is present in the specified flags vector
+   */
+  static bool IsFlagPresent(const std::vector<std::string>& flags,
+                            const std::string& flag) {
+    return (std::find(flags.begin(), flags.end(), flag) != flags.end());
+  }
+
+  static std::string HelpRangeCmdArgs();
+
+  /**
+   * A helper function that returns a list of command line options
+   * used by this command.  It includes the common options and the ones
+   * passed in.
+   */
+  static std::vector<std::string> BuildCmdLineOptions(
+      std::vector<std::string> options);
+
+  bool ParseIntOption(const std::map<std::string, std::string>& options,
+                      const std::string& option, int& value,
+                      LDBCommandExecuteResult& exec_state);
+
+  bool ParseDoubleOption(const std::map<std::string, std::string>& options,
+                         const std::string& option, double& value,
+                         LDBCommandExecuteResult& exec_state);
+
+  bool ParseStringOption(const std::map<std::string, std::string>& options,
+                         const std::string& option, std::string* value);
+
+  bool ParseCompressionTypeOption(
+      const std::map<std::string, std::string>& options,
+      const std::string& option, CompressionType& value,
+      LDBCommandExecuteResult& exec_state);
+
+  /**
+   * Returns the value of the specified option as a boolean.
+   * default_val is used if the option is not found in options.
+   * Throws an exception if the value of the option is not
+   * "true" or "false" (case insensitive).
+   */
+  bool ParseBooleanOption(const std::map<std::string, std::string>& options,
+                          const std::string& option, bool default_val);
+
+  Options options_;
+  std::vector<ColumnFamilyDescriptor> column_families_;
+  ConfigOptions config_options_;
+  LDBOptions ldb_options_;
+
+ private:
+  /**
+   * Interpret command line options and flags to determine if the key
+   * should be input/output in hex.
+   */
+  bool IsKeyHex(const std::map<std::string, std::string>& options,
+                const std::vector<std::string>& flags);
+
+  /**
+   * Interpret command line options and flags to determine if the value
+   * should be input/output in hex.
+   */
+  bool IsValueHex(const std::map<std::string, std::string>& options,
+                  const std::vector<std::string>& flags);
+
+  bool IsTryLoadOptions(const std::map<std::string, std::string>& options,
+                        const std::vector<std::string>& flags);
+
+  /**
+   * Converts val to a boolean.
+   * val must be either true or false (case insensitive).
+   * Otherwise an exception is thrown.
+   */
+  bool StringToBool(std::string val);
+};
+
+class LDBCommandRunner {
+ public:
+  static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name,
+                        bool to_stderr = true);
+
+  // Returns the status code to return. 0 is no error.
+  static int RunCommand(
+      int argc, char const* const* argv, Options options,
+      const LDBOptions& ldb_options,
+      const std::vector<ColumnFamilyDescriptor>* column_families);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h
new file mode 100644
index 000000000..57bac3346
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h
@@ -0,0 +1,75 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#ifdef FAILED
+#undef FAILED
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class LDBCommandExecuteResult {
+ public:
+  enum State {
+    EXEC_NOT_STARTED = 0,
+    EXEC_SUCCEED = 1,
+    EXEC_FAILED = 2,
+  };
+
+  LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {}
+
+  LDBCommandExecuteResult(State state, std::string& msg)
+      : state_(state), message_(msg) {}
+
+  std::string ToString() {
+    std::string ret;
+    switch (state_) {
+      case EXEC_SUCCEED:
+        break;
+      case EXEC_FAILED:
+        ret.append("Failed: ");
+        break;
+      case EXEC_NOT_STARTED:
+        ret.append("Not started: ");
+    }
+    if (!message_.empty()) {
+      ret.append(message_);
+    }
+    return ret;
+  }
+
+  void Reset() {
+    state_ = EXEC_NOT_STARTED;
+    message_ = "";
+  }
+
+  bool IsSucceed() { return state_ == EXEC_SUCCEED; }
+
+  bool IsNotStarted() { return state_ == EXEC_NOT_STARTED; }
+
+  bool IsFailed() { return state_ == EXEC_FAILED; }
+
+  static LDBCommandExecuteResult Succeed(std::string msg) {
+    return LDBCommandExecuteResult(EXEC_SUCCEED, msg);
+  }
+
+  static LDBCommandExecuteResult Failed(std::string msg) {
+    return LDBCommandExecuteResult(EXEC_FAILED, msg);
+  }
+
+ private:
+  State state_;
+  std::string message_;
+
+  bool operator==(const LDBCommandExecuteResult&);
+  bool operator!=(const LDBCommandExecuteResult&);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/leveldb_options.h b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h
new file mode 100644
index 000000000..7e4a6faa4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/leveldb_options.h
@@ -0,0 +1,145 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stddef.h>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class Comparator;
+class Env;
+class FilterPolicy;
+class Logger;
+struct Options;
+class Snapshot;
+
+// Options to control the behavior of a database (passed to
+// DB::Open). A LevelDBOptions object can be initialized as though
+// it were a LevelDB Options object, and then it can be converted into
+// a RocksDB Options object.
+struct LevelDBOptions {
+  // -------------------
+  // Parameters that affect behavior
+
+  // Comparator used to define the order of keys in the table.
+  // Default: a comparator that uses lexicographic byte-wise ordering
+  //
+  // REQUIRES: The client must ensure that the comparator supplied
+  // here has the same name and orders keys *exactly* the same as the
+  // comparator provided to previous open calls on the same DB.
+  const Comparator* comparator;
+
+  // If true, the database will be created if it is missing.
+  // Default: false
+  bool create_if_missing;
+
+  // If true, an error is raised if the database already exists.
+  // Default: false
+  bool error_if_exists;
+
+  // If true, the implementation will do aggressive checking of the
+  // data it is processing and will stop early if it detects any
+  // errors.  This may have unforeseen ramifications: for example, a
+  // corruption of one DB entry may cause a large number of entries to
+  // become unreadable or for the entire DB to become unopenable.
+  // Default: false
+  bool paranoid_checks;
+
+  // Use the specified object to interact with the environment,
+  // e.g. to read/write files, schedule background work, etc.
+  // Default: Env::Default()
+  Env* env;
+
+  // Any internal progress/error information generated by the db will
+  // be written to info_log if it is non-NULL, or to a file stored
+  // in the same directory as the DB contents if info_log is NULL.
+  // Default: NULL
+  Logger* info_log;
+
+  // -------------------
+  // Parameters that affect performance
+
+  // Amount of data to build up in memory (backed by an unsorted log
+  // on disk) before converting to a sorted on-disk file.
+  //
+  // Larger values increase performance, especially during bulk loads.
+  // Up to two write buffers may be held in memory at the same time,
+  // so you may wish to adjust this parameter to control memory usage.
+  // Also, a larger write buffer will result in a longer recovery time
+  // the next time the database is opened.
+  //
+  // Default: 4MB
+  size_t write_buffer_size;
+
+  // Number of open files that can be used by the DB.  You may need to
+  // increase this if your database has a large working set (budget
+  // one open file per 2MB of working set).
+  //
+  // Default: 1000
+  int max_open_files;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-NULL, use the specified cache for blocks.
+  // If NULL, leveldb will automatically create and use an 8MB internal cache.
+  // Default: NULL
+  Cache* block_cache;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  //
+  // Default: 4K
+  size_t block_size;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  //
+  // Default: 16
+  int block_restart_interval;
+
+  // Compress blocks using the specified compression algorithm.  This
+  // parameter can be changed dynamically.
+  //
+  // Default: kSnappyCompression, which gives lightweight but fast
+  // compression.
+  //
+  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
+  //    ~200-500MB/s compression
+  //    ~400-800MB/s decompression
+  // Note that these speeds are significantly faster than most
+  // persistent storage speeds, and therefore it is typically never
+  // worth switching to kNoCompression.  Even if the input data is
+  // incompressible, the kSnappyCompression implementation will
+  // efficiently detect that and will switch to uncompressed mode.
+  CompressionType compression;
+
+  // If non-NULL, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  //
+  // Default: NULL
+  const FilterPolicy* filter_policy;
+
+  // Create a LevelDBOptions object with default values for all fields.
+  LevelDBOptions();
+};
+
+// Converts a LevelDBOptions object into a RocksDB Options object.
+Options ConvertOptions(const LevelDBOptions& leveldb_options);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
new file mode 100644
index 000000000..f617da02b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
@@ -0,0 +1,43 @@
+//  Copyright (c) 2016, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifdef LUA
+
+// lua headers
+extern "C" {
+#include <lauxlib.h>
+#include <lua.h>
+#include <lualib.h>
+}
+
+namespace ROCKSDB_NAMESPACE {
+namespace lua {
+// A class that used to define custom C Library that is callable
+// from Lua script
+class RocksLuaCustomLibrary {
+ public:
+  virtual ~RocksLuaCustomLibrary() {}
+  // The name of the C library.  This name will also be used as the table
+  // (namespace) in Lua that contains the C library.
+  virtual const char* Name() const = 0;
+
+  // Returns a "static const struct luaL_Reg[]", which includes a list of
+  // C functions.  Note that the last entry of this static array must be
+  // {nullptr, nullptr} as required by Lua.
+  //
+  // More details about how to implement Lua C libraries can be found
+  // in the official Lua document http://www.lua.org/pil/26.2.html
+  virtual const struct luaL_Reg* Lib() const = 0;
+
+  // A function that will be called right after the library has been created
+  // and pushed on the top of the lua_State.  This custom setup function
+  // allows developers to put additional table or constant values inside
+  // the same table / namespace.
+  virtual void CustomSetup(lua_State* /*L*/) const {}
+};
+}  // namespace lua
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // LUA
diff --git a/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h
new file mode 100644
index 000000000..3427b65ef
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/lua/rocks_lua_util.h
@@ -0,0 +1,55 @@
+//  Copyright (c) 2016, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+// lua headers
+extern "C" {
+#include <lauxlib.h>
+#include <lua.h>
+#include <lualib.h>
+}
+
+#ifdef LUA
+#include <string>
+#include <vector>
+
+#include "rocksdb/utilities/lua/rocks_lua_custom_library.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace lua {
+class LuaStateWrapper {
+ public:
+  explicit LuaStateWrapper(const std::string& lua_script) {
+    lua_state_ = luaL_newstate();
+    Init(lua_script, {});
+  }
+  LuaStateWrapper(
+      const std::string& lua_script,
+      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
+    lua_state_ = luaL_newstate();
+    Init(lua_script, libraries);
+  }
+  lua_State* GetLuaState() const { return lua_state_; }
+  ~LuaStateWrapper() { lua_close(lua_state_); }
+
+ private:
+  void Init(
+      const std::string& lua_script,
+      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
+    if (lua_state_) {
+      luaL_openlibs(lua_state_);
+      for (const auto& library : libraries) {
+        luaL_openlib(lua_state_, library->Name(), library->Lib(), 0);
+        library->CustomSetup(lua_state_);
+      }
+      luaL_dostring(lua_state_, lua_script.c_str());
+    }
+  }
+
+  lua_State* lua_state_;
+};
+}  // namespace lua
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // LUA
diff --git a/src/rocksdb/include/rocksdb/utilities/memory_util.h b/src/rocksdb/include/rocksdb/utilities/memory_util.h
new file mode 100644
index 000000000..4f1606b51
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/memory_util.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Returns the current memory usage of the specified DB instances.
+class MemoryUtil {
+ public:
+  enum UsageType : int {
+    // Memory usage of all the mem-tables.
+    kMemTableTotal = 0,
+    // Memory usage of those un-flushed mem-tables.
+    kMemTableUnFlushed = 1,
+    // Memory usage of all the table readers.
+    kTableReadersTotal = 2,
+    // Memory usage by Cache.
+    kCacheTotal = 3,
+    kNumUsageTypes = 4
+  };
+
+  // Returns the approximate memory usage of different types in the input
+  // list of DBs and Cache set.  For instance, in the output map
+  // usage_by_type, usage_by_type[kMemTableTotal] will store the memory
+  // usage of all the mem-tables from all the input rocksdb instances.
+  //
+  // Note that for memory usage inside Cache class, we will
+  // only report the usage of the input "cache_set" without
+  // including those Cache usage inside the input list "dbs"
+  // of DBs.
+  static Status GetApproximateMemoryUsageByType(
+      const std::vector<DB*>& dbs,
+      const std::unordered_set<const Cache*> cache_set,
+      std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/object_registry.h b/src/rocksdb/include/rocksdb/utilities/object_registry.h
new file mode 100644
index 000000000..3bafb837c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/object_registry.h
@@ -0,0 +1,585 @@
+// Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Customizable;
+class Logger;
+class ObjectLibrary;
+
+// Returns a new T when called with a string. Populates the std::unique_ptr
+// argument if granting ownership to caller.
+template <typename T>
+using FactoryFunc =
+    std::function<T*(const std::string&, std::unique_ptr<T>*, std::string*)>;
+
+// The signature of the function for loading factories
+// into an object library.  This method is expected to register
+// factory functions in the supplied ObjectLibrary.
+// The ObjectLibrary is the library in which the factories will be loaded.
+// The std::string is the argument passed to the loader function.
+// The RegistrarFunc should return the number of objects loaded into this
+// library
+using RegistrarFunc = std::function<int(ObjectLibrary&, const std::string&)>;
+
+template <typename T>
+using ConfigureFunc = std::function<Status(T*)>;
+
+class ObjectLibrary {
+ private:
+  // Base class for an Entry in the Registry.
+  class Entry {
+   public:
+    virtual ~Entry() {}
+    virtual bool Matches(const std::string& target) const = 0;
+    virtual const char* Name() const = 0;
+  };
+
+ public:
+  // Class for matching target strings to a pattern.
+  // Entries consist of a name that starts the pattern and attributes
+  // The following attributes can be added to the entry:
+  //   -Suffix: Comparable to name(suffix)
+  //   -Separator: Comparable to name(separator).+ or name(separator).*
+  //   -Number: Comparable to name(separator).[0-9]+
+  //   -AltName: Comparable to (name|alt)
+  //   -Optional: Comparable to name(separator)?
+  // Multiple separators can be combined and cause multiple matches.
+  // For example, Pattern("A").AnotherName("B").AddSeparator("@").AddNumber("#")
+  // is roughly equivalent to "(A|B)@.+#.+"
+  //
+  // Note that though this class does provide some regex-style matching,
+  // it is not a full regex parser and has some key differences:
+  //   - Separators are matched left-most.  For example, an entry
+  //     Name("Hello").AddSeparator(" ").AddSuffix("!") would match
+  //     "Hello world!", but not "Hello world!!"
+  //   - No backtracking is necessary, enabling reliably efficient matching
+  class PatternEntry : public Entry {
+   private:
+    enum Quantifier {
+      kMatchZeroOrMore,  // [suffix].*
+      kMatchAtLeastOne,  // [suffix].+
+      kMatchExact,       // [suffix]
+      kMatchInteger,     // [suffix][0-9]+
+      kMatchDecimal,     // [suffix][0-9]+[.][0-9]+
+    };
+
+   public:
+    // Short-cut for creating an entry that matches to a
+    // Customizable::IndividualId
+    static PatternEntry AsIndividualId(const std::string& name) {
+      PatternEntry entry(name, true);
+      entry.AddSeparator("@");
+      entry.AddSeparator("#");
+      return entry;
+    }
+
+    // Creates a new PatternEntry for "name".  If optional is true,
+    // Matches will also return true if name==target
+    explicit PatternEntry(const std::string& name, bool optional = true)
+        : name_(name), optional_(optional), slength_(0) {
+      nlength_ = name_.size();
+    }
+
+    // Adds a suffix (exact match of separator with no trailing characters) to
+    // the separator
+    PatternEntry& AddSuffix(const std::string& suffix) {
+      separators_.emplace_back(suffix, kMatchExact);
+      slength_ += suffix.size();
+      return *this;
+    }
+
+    // Adds a separator (exact match of separator with trailing characters) to
+    // the entry
+    // If at_least_one is true, the separator must be followed by at least
+    // one character (e.g. separator.+).
+    // If at_least_one is false, the separator may be followed by zero or
+    // more characters (e.g. separator.*).
+    PatternEntry& AddSeparator(const std::string& separator,
+                               bool at_least_one = true) {
+      slength_ += separator.size();
+      if (at_least_one) {
+        separators_.emplace_back(separator, kMatchAtLeastOne);
+        ++slength_;
+      } else {
+        separators_.emplace_back(separator, kMatchZeroOrMore);
+      }
+      return *this;
+    }
+
+    // Adds a separator (exact match of separator with trailing numbers) to the
+    // entry
+    PatternEntry& AddNumber(const std::string& separator, bool is_int = true) {
+      separators_.emplace_back(separator,
+                               (is_int) ? kMatchInteger : kMatchDecimal);
+      slength_ += separator.size() + 1;
+      return *this;
+    }
+
+    // Sets another name that this entry will match, similar to (name|alt)
+    PatternEntry& AnotherName(const std::string& alt) {
+      names_.emplace_back(alt);
+      return *this;
+    }
+
+    // Sets whether the separators are required -- similar to name(separator)?
+    // If optional is true, then name(separator)? would match
+    // If optional is false, then the separators must also match
+    PatternEntry& SetOptional(bool optional) {
+      optional_ = optional;
+      return *this;
+    }
+
+    // Checks to see if the target matches this entry
+    bool Matches(const std::string& target) const override;
+    const char* Name() const override { return name_.c_str(); }
+
+   private:
+    size_t MatchSeparatorAt(size_t start, Quantifier mode,
+                            const std::string& target, size_t tlen,
+                            const std::string& pattern) const;
+
+    bool MatchesTarget(const std::string& name, size_t nlen,
+                       const std::string& target, size_t ylen) const;
+    std::string name_;                // The base name for this entry
+    size_t nlength_;                  // The length of name_
+    std::vector<std::string> names_;  // Alternative names for this entry
+    bool optional_;   // Whether matching of separators is required
+    size_t slength_;  // The minimum required length to match the separators
+    std::vector<std::pair<std::string, Quantifier>>
+        separators_;  // What to match
+  };                  // End class Entry
+
+ private:
+  // An Entry containing a FactoryFunc for creating new Objects
+  template <typename T>
+  class FactoryEntry : public Entry {
+   public:
+    FactoryEntry(Entry* e, FactoryFunc<T> f)
+        : entry_(e), factory_(std::move(f)) {}
+    bool Matches(const std::string& target) const override {
+      return entry_->Matches(target);
+    }
+    const char* Name() const override { return entry_->Name(); }
+
+    // Creates a new T object.
+    T* NewFactoryObject(const std::string& target, std::unique_ptr<T>* guard,
+                        std::string* msg) const {
+      return factory_(target, guard, msg);
+    }
+    const FactoryFunc<T>& GetFactory() const { return factory_; }
+
+   private:
+    std::unique_ptr<Entry> entry_;  // What to match for this entry
+    FactoryFunc<T> factory_;
+  };  // End class FactoryEntry
+ public:
+  explicit ObjectLibrary(const std::string& id) { id_ = id; }
+
+  const std::string& GetID() const { return id_; }
+
+  // Finds the factory function for the input target.
+  // @see PatternEntry for the matching rules to target
+  // @return If matched, the FactoryFunc for this target, else nullptr
+  template <typename T>
+  FactoryFunc<T> FindFactory(const std::string& target) const {
+    std::unique_lock<std::mutex> lock(mu_);
+    auto factories = factories_.find(T::Type());
+    if (factories != factories_.end()) {
+      for (const auto& e : factories->second) {
+        if (e->Matches(target)) {
+          const auto* fe =
+              static_cast<const ObjectLibrary::FactoryEntry<T>*>(e.get());
+          return fe->GetFactory();
+        }
+      }
+    }
+    return nullptr;
+  }
+
+  // Returns the total number of factories registered for this library.
+  // This method returns the sum of all factories registered for all types.
+  // @param num_types returns how many unique types are registered.
+  size_t GetFactoryCount(size_t* num_types) const;
+
+  // Returns the number of factories registered for this library
+  // for the input type.
+  // @param num_types returns how many unique types are registered.
+  size_t GetFactoryCount(const std::string& type) const;
+
+  // Returns the registered factory names for the input type
+  // names is updated to include the names for the type
+  void GetFactoryNames(const std::string& type,
+                       std::vector<std::string>* names) const;
+
+  void GetFactoryTypes(std::unordered_set<std::string>* types) const;
+
+  void Dump(Logger* logger) const;
+
+  // Registers the factory with the library for the name.
+  // If name==target, the factory may be used to create a new object.
+  template <typename T>
+  const FactoryFunc<T>& AddFactory(const std::string& name,
+                                   const FactoryFunc<T>& func) {
+    std::unique_ptr<Entry> entry(
+        new FactoryEntry<T>(new PatternEntry(name), func));
+    AddFactoryEntry(T::Type(), std::move(entry));
+    return func;
+  }
+
+  // Registers the factory with the library for the entry.
+  // If the entry matches the target, the factory may be used to create a new
+  // object.
+  // @see PatternEntry for the matching rules.
+  // NOTE: This function replaces the old ObjectLibrary::Register()
+  template <typename T>
+  const FactoryFunc<T>& AddFactory(const PatternEntry& entry,
+                                   const FactoryFunc<T>& func) {
+    std::unique_ptr<Entry> factory(
+        new FactoryEntry<T>(new PatternEntry(entry), func));
+    AddFactoryEntry(T::Type(), std::move(factory));
+    return func;
+  }
+
+  // Invokes the registrar function with the supplied arg for this library.
+  int Register(const RegistrarFunc& registrar, const std::string& arg) {
+    return registrar(*this, arg);
+  }
+
+  // Returns the default ObjectLibrary
+  static std::shared_ptr<ObjectLibrary>& Default();
+
+ private:
+  void AddFactoryEntry(const char* type, std::unique_ptr<Entry>&& entry) {
+    std::unique_lock<std::mutex> lock(mu_);
+    auto& factories = factories_[type];
+    factories.emplace_back(std::move(entry));
+  }
+
+  // Protects the entry map
+  mutable std::mutex mu_;
+  // ** FactoryFunctions for this loader, organized by type
+  std::unordered_map<std::string, std::vector<std::unique_ptr<Entry>>>
+      factories_;
+
+  // The name for this library
+  std::string id_;
+};
+
+// The ObjectRegistry is used to register objects that can be created by a
+// name/pattern at run-time where the specific implementation of the object may
+// not be known in advance.
+class ObjectRegistry {
+ public:
+  static std::shared_ptr<ObjectRegistry> NewInstance();
+  static std::shared_ptr<ObjectRegistry> NewInstance(
+      const std::shared_ptr<ObjectRegistry>& parent);
+  static std::shared_ptr<ObjectRegistry> Default();
+  explicit ObjectRegistry(const std::shared_ptr<ObjectRegistry>& parent)
+      : parent_(parent) {}
+  explicit ObjectRegistry(const std::shared_ptr<ObjectLibrary>& library);
+
+  std::shared_ptr<ObjectLibrary> AddLibrary(const std::string& id) {
+    auto library = std::make_shared<ObjectLibrary>(id);
+    AddLibrary(library);
+    return library;
+  }
+
+  void AddLibrary(const std::shared_ptr<ObjectLibrary>& library) {
+    std::unique_lock<std::mutex> lock(library_mutex_);
+    libraries_.push_back(library);
+  }
+
+  void AddLibrary(const std::string& id, const RegistrarFunc& registrar,
+                  const std::string& arg) {
+    auto library = AddLibrary(id);
+    library->Register(registrar, arg);
+  }
+
+  // Finds the factory for target and instantiates a new T.
+  // Returns NotSupported if no factory is found
+  // Returns InvalidArgument if a factory is found but the factory failed.
+  template <typename T>
+  Status NewObject(const std::string& target, T** object,
+                   std::unique_ptr<T>* guard) {
+    assert(guard != nullptr);
+    guard->reset();
+    auto factory = FindFactory<T>(target);
+    if (factory != nullptr) {
+      std::string errmsg;
+      *object = factory(target, guard, &errmsg);
+      if (*object != nullptr) {
+        return Status::OK();
+      } else if (errmsg.empty()) {
+        return Status::InvalidArgument(
+            std::string("Could not load ") + T::Type(), target);
+      } else {
+        return Status::InvalidArgument(errmsg, target);
+      }
+    } else {
+      return Status::NotSupported(std::string("Could not load ") + T::Type(),
+                                  target);
+    }
+  }
+  // Creates a new unique T using the input factory functions.
+  // Returns OK if a new unique T was successfully created
+  // Returns NotSupported if the type/target could not be created
+  // Returns InvalidArgument if the factory return an unguarded object
+  //                      (meaning it cannot be managed by a unique ptr)
+  template <typename T>
+  Status NewUniqueObject(const std::string& target,
+                         std::unique_ptr<T>* result) {
+    T* ptr = nullptr;
+    std::unique_ptr<T> guard;
+    Status s = NewObject(target, &ptr, &guard);
+    if (!s.ok()) {
+      return s;
+    } else if (guard) {
+      result->reset(guard.release());
+      return Status::OK();
+    } else {
+      return Status::InvalidArgument(std::string("Cannot make a unique ") +
+                                         T::Type() + " from unguarded one ",
+                                     target);
+    }
+  }
+
+  // Creates a new shared T using the input factory functions.
+  // Returns OK if a new shared T was successfully created
+  // Returns NotSupported if the type/target could not be created
+  // Returns InvalidArgument if the factory return an unguarded object
+  //                      (meaning it cannot be managed by a shared ptr)
+  template <typename T>
+  Status NewSharedObject(const std::string& target,
+                         std::shared_ptr<T>* result) {
+    std::unique_ptr<T> guard;
+    T* ptr = nullptr;
+    Status s = NewObject(target, &ptr, &guard);
+    if (!s.ok()) {
+      return s;
+    } else if (guard) {
+      result->reset(guard.release());
+      return Status::OK();
+    } else {
+      return Status::InvalidArgument(std::string("Cannot make a shared ") +
+                                         T::Type() + " from unguarded one ",
+                                     target);
+    }
+  }
+
+  // Creates a new static T using the input factory functions.
+  // Returns OK if a new static T was successfully created
+  // Returns NotSupported if the type/target could not be created
+  // Returns InvalidArgument if the factory return a guarded object
+  //                      (meaning it is managed by a unique ptr)
+  template <typename T>
+  Status NewStaticObject(const std::string& target, T** result) {
+    std::unique_ptr<T> guard;
+    T* ptr = nullptr;
+    Status s = NewObject(target, &ptr, &guard);
+    if (!s.ok()) {
+      return s;
+    } else if (guard.get()) {
+      return Status::InvalidArgument(std::string("Cannot make a static ") +
+                                         T::Type() + " from a guarded one ",
+                                     target);
+    } else {
+      *result = ptr;
+      return Status::OK();
+    }
+  }
+
+  // Sets the object for the given id/type to be the input object
+  // If the registry does not contain this id/type, the object is added and OK
+  // is returned. If the registry contains a different object, an error is
+  // returned. If the registry contains the input object, OK is returned.
+  template <typename T>
+  Status SetManagedObject(const std::shared_ptr<T>& object) {
+    assert(object != nullptr);
+    return SetManagedObject(object->GetId(), object);
+  }
+
+  template <typename T>
+  Status SetManagedObject(const std::string& id,
+                          const std::shared_ptr<T>& object) {
+    const auto c = std::static_pointer_cast<Customizable>(object);
+    return SetManagedObject(T::Type(), id, c);
+  }
+
+  // Returns the object for the given id, if one exists.
+  // If the object is not found in the registry, a nullptr is returned
+  template <typename T>
+  std::shared_ptr<T> GetManagedObject(const std::string& id) const {
+    auto c = GetManagedObject(T::Type(), id);
+    return std::static_pointer_cast<T>(c);
+  }
+
+  // Returns the set of managed objects found in the registry matching
+  // the input type and ID.
+  // If the input id is not empty, then only objects of that class
+  // (IsInstanceOf(id)) will be returned (for example, only return LRUCache
+  // objects) If the input id is empty, then all objects of that type (all Cache
+  // objects)
+  template <typename T>
+  Status ListManagedObjects(const std::string& id,
+                            std::vector<std::shared_ptr<T>>* results) const {
+    std::vector<std::shared_ptr<Customizable>> customizables;
+    results->clear();
+    Status s = ListManagedObjects(T::Type(), id, &customizables);
+    if (s.ok()) {
+      for (const auto& c : customizables) {
+        results->push_back(std::static_pointer_cast<T>(c));
+      }
+    }
+    return s;
+  }
+
+  template <typename T>
+  Status ListManagedObjects(std::vector<std::shared_ptr<T>>* results) const {
+    return ListManagedObjects("", results);
+  }
+
+  // Creates a new ManagedObject in the registry for the id if one does not
+  // currently exist.  If an object with that ID already exists, the current
+  // object is returned.
+  //
+  // The ID is the identifier of the object to be returned/created and returned
+  // in result
+  // If a new object is created (using the object factories), the cfunc
+  // parameter will be invoked to configure the new object.
+  template <typename T>
+  Status GetOrCreateManagedObject(const std::string& id,
+                                  std::shared_ptr<T>* result,
+                                  const ConfigureFunc<T>& cfunc = nullptr) {
+    if (parent_ != nullptr) {
+      auto object = parent_->GetManagedObject(T::Type(), id);
+      if (object != nullptr) {
+        *result = std::static_pointer_cast<T>(object);
+        return Status::OK();
+      }
+    }
+    {
+      std::unique_lock<std::mutex> lock(objects_mutex_);
+      auto key = ToManagedObjectKey(T::Type(), id);
+      auto iter = managed_objects_.find(key);
+      if (iter != managed_objects_.end()) {
+        auto object = iter->second.lock();
+        if (object != nullptr) {
+          *result = std::static_pointer_cast<T>(object);
+          return Status::OK();
+        }
+      }
+      std::shared_ptr<T> object;
+      Status s = NewSharedObject(id, &object);
+      if (s.ok() && cfunc != nullptr) {
+        s = cfunc(object.get());
+      }
+      if (s.ok()) {
+        auto c = std::static_pointer_cast<Customizable>(object);
+        if (id != c->Name()) {
+          // If the ID is not the base name of the class, add the new
+          // object under the input ID
+          managed_objects_[key] = c;
+        }
+        if (id != c->GetId() && c->GetId() != c->Name()) {
+          // If the input and current ID do not match, and the
+          // current ID is not the base bame, add the new object under
+          // its new ID
+          key = ToManagedObjectKey(T::Type(), c->GetId());
+          managed_objects_[key] = c;
+        }
+        *result = object;
+      }
+      return s;
+    }
+  }
+
+  // Returns the number of factories registered for this library
+  // for the input type.
+  // @param num_types returns how many unique types are registered.
+  size_t GetFactoryCount(const std::string& type) const;
+
+  // Returns the names of registered factories for the input type.
+  // names is updated to include the names for the type
+  void GetFactoryNames(const std::string& type,
+                       std::vector<std::string>* names) const;
+
+  void GetFactoryTypes(std::unordered_set<std::string>* types) const;
+
+  // Dump the contents of the registry to the logger
+  void Dump(Logger* logger) const;
+
+  // Invokes the input function to retrieve the properties for this plugin.
+  int RegisterPlugin(const std::string& name, const RegistrarFunc& func);
+
+ private:
+  static std::string ToManagedObjectKey(const std::string& type,
+                                        const std::string& id) {
+    return type + "://" + id;
+  }
+
+  // Returns the Customizable managed object associated with the key (Type/ID).
+  // If not found, nullptr is returned.
+  std::shared_ptr<Customizable> GetManagedObject(const std::string& type,
+                                                 const std::string& id) const;
+  Status ListManagedObjects(
+      const std::string& type, const std::string& pattern,
+      std::vector<std::shared_ptr<Customizable>>* results) const;
+  // Sets the managed object associated with the key (Type/ID) to c.
+  // If the named managed object does not exist, the object is added and OK is
+  // returned If the object exists and is the same as c, OK is returned
+  // Otherwise, an error status is returned.
+  Status SetManagedObject(const std::string& type, const std::string& id,
+                          const std::shared_ptr<Customizable>& c);
+
+  // Searches (from back to front) the libraries looking for the
+  // factory that matches this name.
+  // Returns the factory if it is found, and nullptr otherwise
+  template <typename T>
+  const FactoryFunc<T> FindFactory(const std::string& name) const {
+    {
+      std::unique_lock<std::mutex> lock(library_mutex_);
+      for (auto iter = libraries_.crbegin(); iter != libraries_.crend();
+           ++iter) {
+        const auto factory = iter->get()->FindFactory<T>(name);
+        if (factory != nullptr) {
+          return factory;
+        }
+      }
+    }
+    if (parent_ == nullptr) {
+      return nullptr;
+    } else {
+      return parent_->FindFactory<T>(name);
+    }
+  }
+
+  // The set of libraries to search for factories for this registry.
+  // The libraries are searched in reverse order (back to front) when
+  // searching for entries.
+  std::vector<std::shared_ptr<ObjectLibrary>> libraries_;
+  std::vector<std::string> plugins_;
+  static std::unordered_map<std::string, RegistrarFunc> builtins_;
+  std::map<std::string, std::weak_ptr<Customizable>> managed_objects_;
+  std::shared_ptr<ObjectRegistry> parent_;
+  mutable std::mutex objects_mutex_;  // Mutex for managed objects
+  mutable std::mutex library_mutex_;  // Mutex for managed libraries
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
new file mode 100644
index 000000000..c070e49a3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
@@ -0,0 +1,100 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Transaction;
+
+// Database with Transaction support.
+//
+// See optimistic_transaction.h and examples/transaction_example.cc
+
+// Options to use when starting an Optimistic Transaction
+struct OptimisticTransactionOptions {
+  // Setting set_snapshot=true is the same as calling SetSnapshot().
+  bool set_snapshot = false;
+
+  // Should be set if the DB has a non-default comparator.
+  // See comment in WriteBatchWithIndex constructor.
+  const Comparator* cmp = BytewiseComparator();
+};
+
+enum class OccValidationPolicy {
+  // Validate serially at commit stage, AFTER entering the write-group.
+  // Isolation validation is processed single-threaded(since in the
+  // write-group).
+  // May suffer from high mutex contention, as per this link:
+  // https://github.com/facebook/rocksdb/issues/4402
+  kValidateSerial = 0,
+  // Validate parallelly before commit stage, BEFORE entering the write-group to
+  // reduce mutex contention. Each txn acquires locks for its write-set
+  // records in some well-defined order.
+  kValidateParallel = 1
+};
+
+struct OptimisticTransactionDBOptions {
+  OccValidationPolicy validate_policy = OccValidationPolicy::kValidateParallel;
+
+  // works only if validate_policy == OccValidationPolicy::kValidateParallel
+  uint32_t occ_lock_buckets = (1 << 20);
+};
+
+// Range deletions (including those in `WriteBatch`es passed to `Write()`) are
+// incompatible with `OptimisticTransactionDB` and will return a non-OK `Status`
+class OptimisticTransactionDB : public StackableDB {
+ public:
+  // Open an OptimisticTransactionDB similar to DB::Open().
+  static Status Open(const Options& options, const std::string& dbname,
+                     OptimisticTransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     OptimisticTransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options,
+                     const OptimisticTransactionDBOptions& occ_options,
+                     const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     OptimisticTransactionDB** dbptr);
+
+  virtual ~OptimisticTransactionDB() {}
+
+  // Starts a new Transaction.
+  //
+  // Caller is responsible for deleting the returned transaction when no
+  // longer needed.
+  //
+  // If old_txn is not null, BeginTransaction will reuse this Transaction
+  // handle instead of allocating a new one.  This is an optimization to avoid
+  // extra allocations when repeatedly creating transactions.
+  virtual Transaction* BeginTransaction(
+      const WriteOptions& write_options,
+      const OptimisticTransactionOptions& txn_options =
+          OptimisticTransactionOptions(),
+      Transaction* old_txn = nullptr) = 0;
+
+  OptimisticTransactionDB(const OptimisticTransactionDB&) = delete;
+  void operator=(const OptimisticTransactionDB&) = delete;
+
+ protected:
+  // To Create an OptimisticTransactionDB, call Open()
+  explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/option_change_migration.h b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h
new file mode 100644
index 000000000..a73324a9e
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/option_change_migration.h
@@ -0,0 +1,24 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Try to migrate DB created with old_opts to be use new_opts.
+// Multiple column families is not supported.
+// It is best-effort. No guarantee to succeed.
+// A full compaction may be executed.
+// If the target options use FIFO compaction, the FIFO condition might be
+// sacrificed: for data migrated, data inserted later might be dropped
+// earlier. This is to gurantee FIFO compaction won't drop all the
+// migrated data to fit max_table_files_size.
+Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+                             const Options& new_opts);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/options_type.h b/src/rocksdb/include/rocksdb/utilities/options_type.h
new file mode 100644
index 000000000..cd340ed59
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/options_type.h
@@ -0,0 +1,1221 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// The OptionTypeInfo and related classes provide a framework for
+// configuring and validating RocksDB classes via the Options framework.
+// This file is part of the public API to allow developers who wish to
+// write their own extensions and plugins to take use the Options
+// framework in their custom implementations.
+//
+// See https://github.com/facebook/rocksdb/wiki/RocksDB-Configurable-Objects
+// for more information on how to develop and use custom extensions
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <unordered_map>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionTypeInfo;
+struct ColumnFamilyOptions;
+struct DBOptions;
+
+// The underlying "class/type" of the option.
+// This enum is used to determine how the option should
+// be converted to/from strings and compared.
+enum class OptionType {
+  kBoolean,
+  kInt,
+  kInt32T,
+  kInt64T,
+  kUInt,
+  kUInt8T,
+  kUInt32T,
+  kUInt64T,
+  kSizeT,
+  kString,
+  kDouble,
+  kCompactionStyle,
+  kCompactionPri,
+  kCompressionType,
+  kCompactionStopStyle,
+  kChecksumType,
+  kEncodingType,
+  kEnv,
+  kEnum,
+  kStruct,
+  kVector,
+  kConfigurable,
+  kCustomizable,
+  kEncodedString,
+  kTemperature,
+  kArray,
+  kUnknown,
+};
+
+enum class OptionVerificationType {
+  kNormal,
+  kByName,               // The option is pointer typed so we can only verify
+                         // based on it's name.
+  kByNameAllowNull,      // Same as kByName, but it also allows the case
+                         // where one of them is a nullptr.
+  kByNameAllowFromNull,  // Same as kByName, but it also allows the case
+                         // where the old option is nullptr.
+  kDeprecated,           // The option is no longer used in rocksdb. The RocksDB
+                         // OptionsParser will still accept this option if it
+                         // happen to exists in some Options file.  However,
+                         // the parser will not include it in serialization
+                         // and verification processes.
+  kAlias,                // This option represents is a name/shortcut for
+                         // another option and should not be written or verified
+                         // independently
+};
+
+// A set of modifier flags used to alter how an option is evaluated or
+// processed. These flags can be combined together (e.g. kMutable | kShared).
+// The kCompare flags can be used to control if/when options are compared.
+// If kCompareNever is set, two related options would never be compared (always
+// equal) If kCompareExact is set, the options will only be compared if the
+// sanity mode
+//                  is exact
+// kMutable       means the option can be changed after it is prepared
+// kShared        means the option is contained in a std::shared_ptr
+// kUnique        means the option is contained in a std::uniqued_ptr
+// kRawPointer    means the option is a raw pointer value.
+// kAllowNull     means that an option is allowed to be null for verification
+//                purposes.
+// kDontSerialize means this option should not be serialized and included in
+//                the string representation.
+// kDontPrepare   means do not call PrepareOptions for this pointer value.
+enum class OptionTypeFlags : uint32_t {
+  kNone = 0x00,  // No flags
+  kCompareDefault = 0x0,
+  kCompareNever = ConfigOptions::kSanityLevelNone,
+  kCompareLoose = ConfigOptions::kSanityLevelLooselyCompatible,
+  kCompareExact = ConfigOptions::kSanityLevelExactMatch,
+
+  kMutable = 0x0100,         // Option is mutable
+  kRawPointer = 0x0200,      // The option is stored as a raw pointer
+  kShared = 0x0400,          // The option is stored as a shared_ptr
+  kUnique = 0x0800,          // The option is stored as a unique_ptr
+  kAllowNull = 0x1000,       // The option can be null
+  kDontSerialize = 0x2000,   // Don't serialize the option
+  kDontPrepare = 0x4000,     // Don't prepare or sanitize this option
+  kStringNameOnly = 0x8000,  // The option serializes to a name only
+};
+
+inline OptionTypeFlags operator|(const OptionTypeFlags& a,
+                                 const OptionTypeFlags& b) {
+  return static_cast<OptionTypeFlags>(static_cast<uint32_t>(a) |
+                                      static_cast<uint32_t>(b));
+}
+
+inline OptionTypeFlags operator&(const OptionTypeFlags& a,
+                                 const OptionTypeFlags& b) {
+  return static_cast<OptionTypeFlags>(static_cast<uint32_t>(a) &
+                                      static_cast<uint32_t>(b));
+}
+
+// Converts an string into its enumerated value.
+// @param type_map Mapping between strings and enum values
+// @param type The string representation of the enum
+// @param value Returns the enum value represented by the string
+// @return true if the string was found in the enum map, false otherwise.
+template <typename T>
+bool ParseEnum(const std::unordered_map<std::string, T>& type_map,
+               const std::string& type, T* value) {
+  auto iter = type_map.find(type);
+  if (iter != type_map.end()) {
+    *value = iter->second;
+    return true;
+  }
+  return false;
+}
+
+// Converts an enum into its string representation.
+// @param type_map Mapping between strings and enum values
+// @param type The enum
+// @param value Returned as the string representation of the enum
+// @return true if the enum was found in the enum map, false otherwise.
+template <typename T>
+bool SerializeEnum(const std::unordered_map<std::string, T>& type_map,
+                   const T& type, std::string* value) {
+  for (const auto& pair : type_map) {
+    if (pair.second == type) {
+      *value = pair.first;
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename T, size_t kSize>
+Status ParseArray(const ConfigOptions& config_options,
+                  const OptionTypeInfo& elem_info, char separator,
+                  const std::string& name, const std::string& value,
+                  std::array<T, kSize>* result);
+
+template <typename T, size_t kSize>
+Status SerializeArray(const ConfigOptions& config_options,
+                      const OptionTypeInfo& elem_info, char separator,
+                      const std::string& name, const std::array<T, kSize>& vec,
+                      std::string* value);
+
+template <typename T, size_t kSize>
+bool ArraysAreEqual(const ConfigOptions& config_options,
+                    const OptionTypeInfo& elem_info, const std::string& name,
+                    const std::array<T, kSize>& array1,
+                    const std::array<T, kSize>& array2, std::string* mismatch);
+
+template <typename T>
+Status ParseVector(const ConfigOptions& config_options,
+                   const OptionTypeInfo& elem_info, char separator,
+                   const std::string& name, const std::string& value,
+                   std::vector<T>* result);
+
+template <typename T>
+Status SerializeVector(const ConfigOptions& config_options,
+                       const OptionTypeInfo& elem_info, char separator,
+                       const std::string& name, const std::vector<T>& vec,
+                       std::string* value);
+template <typename T>
+bool VectorsAreEqual(const ConfigOptions& config_options,
+                     const OptionTypeInfo& elem_info, const std::string& name,
+                     const std::vector<T>& vec1, const std::vector<T>& vec2,
+                     std::string* mismatch);
+
+// Function for converting a option string value into its underlying
+// representation in "addr"
+// On success, Status::OK is returned and addr is set to the parsed form
+// On failure, a non-OK status is returned
+// @param opts  The ConfigOptions controlling how the value is parsed
+// @param name  The name of the options being parsed
+// @param value The string representation of the option
+// @param addr  Pointer to the object
+using ParseFunc = std::function<Status(
+    const ConfigOptions& /*opts*/, const std::string& /*name*/,
+    const std::string& /*value*/, void* /*addr*/)>;
+
+// Function for converting an option "addr" into its string representation.
+// On success, Status::OK is returned and value is the serialized form.
+// On failure, a non-OK status is returned
+// @param opts  The ConfigOptions controlling how the values are serialized
+// @param name  The name of the options being serialized
+// @param addr  Pointer to the value being serialized
+// @param value The result of the serialization.
+using SerializeFunc = std::function<Status(
+    const ConfigOptions& /*opts*/, const std::string& /*name*/,
+    const void* /*addr*/, std::string* /*value*/)>;
+
+// Function for comparing two option values
+// If they are not equal, updates "mismatch" with the name of the bad option
+// @param opts  The ConfigOptions controlling how the values are compared
+// @param name  The name of the options being compared
+// @param addr1 The first address to compare
+// @param addr2 The address to compare to
+// @param mismatch If the values are not equal, the name of the option that
+// first differs
+using EqualsFunc = std::function<bool(
+    const ConfigOptions& /*opts*/, const std::string& /*name*/,
+    const void* /*addr1*/, const void* /*addr2*/, std::string* mismatch)>;
+
+// Function for preparing/initializing an option.
+using PrepareFunc =
+    std::function<Status(const ConfigOptions& /*opts*/,
+                         const std::string& /*name*/, void* /*addr*/)>;
+
+// Function for validating an option.
+using ValidateFunc = std::function<Status(
+    const DBOptions& /*db_opts*/, const ColumnFamilyOptions& /*cf_opts*/,
+    const std::string& /*name*/, const void* /*addr*/)>;
+
+// A struct for storing constant option information such as option name,
+// option type, and offset.
+class OptionTypeInfo {
+ public:
+  // A simple "normal", non-mutable Type "type" at offset
+  OptionTypeInfo(int offset, OptionType type)
+      : offset_(offset),
+        parse_func_(nullptr),
+        serialize_func_(nullptr),
+        equals_func_(nullptr),
+        type_(type),
+        verification_(OptionVerificationType::kNormal),
+        flags_(OptionTypeFlags::kNone) {}
+
+  OptionTypeInfo(int offset, OptionType type,
+                 OptionVerificationType verification, OptionTypeFlags flags)
+      : offset_(offset),
+        parse_func_(nullptr),
+        serialize_func_(nullptr),
+        equals_func_(nullptr),
+        type_(type),
+        verification_(verification),
+        flags_(flags) {}
+
+  OptionTypeInfo(int offset, OptionType type,
+                 OptionVerificationType verification, OptionTypeFlags flags,
+                 const ParseFunc& parse_func)
+      : offset_(offset),
+        parse_func_(parse_func),
+        serialize_func_(nullptr),
+        equals_func_(nullptr),
+        type_(type),
+        verification_(verification),
+        flags_(flags) {}
+
+  OptionTypeInfo(int offset, OptionType type,
+                 OptionVerificationType verification, OptionTypeFlags flags,
+                 const ParseFunc& parse_func,
+                 const SerializeFunc& serialize_func,
+                 const EqualsFunc& equals_func)
+      : offset_(offset),
+        parse_func_(parse_func),
+        serialize_func_(serialize_func),
+        equals_func_(equals_func),
+        type_(type),
+        verification_(verification),
+        flags_(flags) {}
+
+  // Creates an OptionTypeInfo for an enum type.  Enums use an additional
+  // map to convert the enums to/from their string representation.
+  // To create an OptionTypeInfo that is an Enum, one should:
+  // - Create a static map of string values to the corresponding enum value
+  // - Call this method passing the static map in as a parameter.
+  // Note that it is not necessary to add a new OptionType or make any
+  // other changes -- the returned object handles parsing, serialization, and
+  // comparisons.
+  //
+  // @param offset The offset in the option object for this enum
+  // @param map The string to enum mapping for this enum
+  template <typename T>
+  static OptionTypeInfo Enum(
+      int offset, const std::unordered_map<std::string, T>* const map,
+      OptionTypeFlags flags = OptionTypeFlags::kNone) {
+    OptionTypeInfo info(offset, OptionType::kEnum,
+                        OptionVerificationType::kNormal, flags);
+    info.SetParseFunc(
+        // Uses the map argument to convert the input string into
+        // its corresponding enum value.  If value is found in the map,
+        // addr is updated to the corresponding map entry.
+        // @return OK if the value is found in the map
+        // @return InvalidArgument if the value is not found in the map
+        [map](const ConfigOptions&, const std::string& name,
+              const std::string& value, void* addr) {
+          if (map == nullptr) {
+            return Status::NotSupported("No enum mapping ", name);
+          } else if (ParseEnum<T>(*map, value, static_cast<T*>(addr))) {
+            return Status::OK();
+          } else {
+            return Status::InvalidArgument("No mapping for enum ", name);
+          }
+        });
+    info.SetSerializeFunc(
+        // Uses the map argument to convert the input enum into
+        // its corresponding string value.  If enum value is found in the map,
+        // value is updated to the corresponding string value in the map.
+        // @return OK if the enum is found in the map
+        // @return InvalidArgument if the enum is not found in the map
+        [map](const ConfigOptions&, const std::string& name, const void* addr,
+              std::string* value) {
+          if (map == nullptr) {
+            return Status::NotSupported("No enum mapping ", name);
+          } else if (SerializeEnum<T>(*map, (*static_cast<const T*>(addr)),
+                                      value)) {
+            return Status::OK();
+          } else {
+            return Status::InvalidArgument("No mapping for enum ", name);
+          }
+        });
+    info.SetEqualsFunc(
+        // Casts addr1 and addr2 to the enum type and returns true if
+        // they are equal, false otherwise.
+        [](const ConfigOptions&, const std::string&, const void* addr1,
+           const void* addr2, std::string*) {
+          return (*static_cast<const T*>(addr1) ==
+                  *static_cast<const T*>(addr2));
+        });
+    return info;
+  }  // End OptionTypeInfo::Enum
+
+  // Creates an OptionTypeInfo for a Struct type.  Structs have a
+  // map of string-OptionTypeInfo associated with them that describes how
+  // to process the object for parsing, serializing, and matching.
+  // Structs also have a struct_name, which is the name of the object
+  // as registered in the parent map.
+  // When processing a struct, the option name can be specified as:
+  //   - <struct_name>       Meaning to process the entire struct.
+  //   - <struct_name.field> Meaning to process the single field
+  //   - <field>             Process the single fields
+  // The CompactionOptionsFIFO, CompactionOptionsUniversal, and LRUCacheOptions
+  // are all examples of Struct options.
+  //
+  // To create an OptionTypeInfo that is a Struct, one should:
+  // - Create a static map of string-OptionTypeInfo corresponding to the
+  //   properties of the object that can be set via the options.
+  // - Call this method passing the name and map in as parameters.
+  // Note that it is not necessary to add a new OptionType or make any
+  // other changes -- the returned object handles parsing, serialization, and
+  // comparisons.
+  //
+  // @param offset The offset in the option object for this enum
+  // @param map The string to enum mapping for this enum
+  static OptionTypeInfo Struct(
+      const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+      int offset, OptionVerificationType verification, OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kStruct, verification, flags);
+    info.SetParseFunc(
+        // Parses the struct and updates the fields at addr
+        [struct_name, struct_map](const ConfigOptions& opts,
+                                  const std::string& name,
+                                  const std::string& value, void* addr) {
+          return ParseStruct(opts, struct_name, struct_map, name, value, addr);
+        });
+    info.SetSerializeFunc(
+        // Serializes the struct options into value
+        [struct_name, struct_map](const ConfigOptions& opts,
+                                  const std::string& name, const void* addr,
+                                  std::string* value) {
+          return SerializeStruct(opts, struct_name, struct_map, name, addr,
+                                 value);
+        });
+    info.SetEqualsFunc(
+        // Compares the struct fields of addr1 and addr2 for equality
+        [struct_name, struct_map](const ConfigOptions& opts,
+                                  const std::string& name, const void* addr1,
+                                  const void* addr2, std::string* mismatch) {
+          return StructsAreEqual(opts, struct_name, struct_map, name, addr1,
+                                 addr2, mismatch);
+        });
+    return info;
+  }
+  static OptionTypeInfo Struct(
+      const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+      int offset, OptionVerificationType verification, OptionTypeFlags flags,
+      const ParseFunc& parse_func) {
+    OptionTypeInfo info(
+        Struct(struct_name, struct_map, offset, verification, flags));
+    return info.SetParseFunc(parse_func);
+  }
+
+  template <typename T, size_t kSize>
+  static OptionTypeInfo Array(int _offset, OptionVerificationType _verification,
+                              OptionTypeFlags _flags,
+                              const OptionTypeInfo& elem_info,
+                              char separator = ':') {
+    OptionTypeInfo info(_offset, OptionType::kArray, _verification, _flags);
+    info.SetParseFunc([elem_info, separator](
+                          const ConfigOptions& opts, const std::string& name,
+                          const std::string& value, void* addr) {
+      auto result = static_cast<std::array<T, kSize>*>(addr);
+      return ParseArray<T, kSize>(opts, elem_info, separator, name, value,
+                                  result);
+    });
+    info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts,
+                                                 const std::string& name,
+                                                 const void* addr,
+                                                 std::string* value) {
+      const auto& array = *(static_cast<const std::array<T, kSize>*>(addr));
+      return SerializeArray<T, kSize>(opts, elem_info, separator, name, array,
+                                      value);
+    });
+    info.SetEqualsFunc([elem_info](const ConfigOptions& opts,
+                                   const std::string& name, const void* addr1,
+                                   const void* addr2, std::string* mismatch) {
+      const auto& array1 = *(static_cast<const std::array<T, kSize>*>(addr1));
+      const auto& array2 = *(static_cast<const std::array<T, kSize>*>(addr2));
+      return ArraysAreEqual<T, kSize>(opts, elem_info, name, array1, array2,
+                                      mismatch);
+    });
+    return info;
+  }
+
+  template <typename T>
+  static OptionTypeInfo Vector(int _offset,
+                               OptionVerificationType _verification,
+                               OptionTypeFlags _flags,
+                               const OptionTypeInfo& elem_info,
+                               char separator = ':') {
+    OptionTypeInfo info(_offset, OptionType::kVector, _verification, _flags);
+    info.SetParseFunc([elem_info, separator](
+                          const ConfigOptions& opts, const std::string& name,
+                          const std::string& value, void* addr) {
+      auto result = static_cast<std::vector<T>*>(addr);
+      return ParseVector<T>(opts, elem_info, separator, name, value, result);
+    });
+    info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts,
+                                                 const std::string& name,
+                                                 const void* addr,
+                                                 std::string* value) {
+      const auto& vec = *(static_cast<const std::vector<T>*>(addr));
+      return SerializeVector<T>(opts, elem_info, separator, name, vec, value);
+    });
+    info.SetEqualsFunc([elem_info](const ConfigOptions& opts,
+                                   const std::string& name, const void* addr1,
+                                   const void* addr2, std::string* mismatch) {
+      const auto& vec1 = *(static_cast<const std::vector<T>*>(addr1));
+      const auto& vec2 = *(static_cast<const std::vector<T>*>(addr2));
+      return VectorsAreEqual<T>(opts, elem_info, name, vec1, vec2, mismatch);
+    });
+    return info;
+  }
+
+  // Create a new std::shared_ptr<Customizable> OptionTypeInfo
+  // This function will call the T::CreateFromString method to create a new
+  // std::shared_ptr<T> object.
+  //
+  // @param offset The offset for the Customizable from the base pointer
+  // @param ovt How to verify this option
+  // @param flags, Extra flags specifying the behavior of this option
+  // @param _sfunc Optional function for serializing this option
+  // @param _efunc Optional function for comparing this option
+  template <typename T>
+  static OptionTypeInfo AsCustomSharedPtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+                        flags | OptionTypeFlags::kShared);
+    return info.SetParseFunc([](const ConfigOptions& opts,
+                                const std::string& name,
+                                const std::string& value, void* addr) {
+      auto* shared = static_cast<std::shared_ptr<T>*>(addr);
+      if (name == kIdPropName() && value.empty()) {
+        shared->reset();
+        return Status::OK();
+      } else {
+        return T::CreateFromString(opts, value, shared);
+      }
+    });
+  }
+
+  template <typename T>
+  static OptionTypeInfo AsCustomSharedPtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags,
+                                          const SerializeFunc& serialize_func,
+                                          const EqualsFunc& equals_func) {
+    OptionTypeInfo info(AsCustomSharedPtr<T>(offset, ovt, flags));
+    info.SetSerializeFunc(serialize_func);
+    info.SetEqualsFunc(equals_func);
+    return info;
+  }
+
+  // Create a new std::unique_ptr<Customizable> OptionTypeInfo
+  // This function will call the T::CreateFromString method to create a new
+  // std::unique_ptr<T> object.
+  //
+  // @param offset The offset for the Customizable from the base pointer
+  // @param ovt How to verify this option
+  // @param flags, Extra flags specifying the behavior of this option
+  // @param _sfunc Optional function for serializing this option
+  // @param _efunc Optional function for comparing this option
+  template <typename T>
+  static OptionTypeInfo AsCustomUniquePtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+                        flags | OptionTypeFlags::kUnique);
+    return info.SetParseFunc([](const ConfigOptions& opts,
+                                const std::string& name,
+                                const std::string& value, void* addr) {
+      auto* unique = static_cast<std::unique_ptr<T>*>(addr);
+      if (name == kIdPropName() && value.empty()) {
+        unique->reset();
+        return Status::OK();
+      } else {
+        return T::CreateFromString(opts, value, unique);
+      }
+    });
+  }
+
+  template <typename T>
+  static OptionTypeInfo AsCustomUniquePtr(int offset,
+                                          OptionVerificationType ovt,
+                                          OptionTypeFlags flags,
+                                          const SerializeFunc& serialize_func,
+                                          const EqualsFunc& equals_func) {
+    OptionTypeInfo info(AsCustomUniquePtr<T>(offset, ovt, flags));
+    info.SetSerializeFunc(serialize_func);
+    info.SetEqualsFunc(equals_func);
+    return info;
+  }
+
+  // Create a new Customizable* OptionTypeInfo
+  // This function will call the T::CreateFromString method to create a new
+  // T object.
+  //
+  // @param _offset The offset for the Customizable from the base pointer
+  // @param ovt How to verify this option
+  // @param flags, Extra flags specifying the behavior of this option
+  // @param _sfunc Optional function for serializing this option
+  // @param _efunc Optional function for comparing this option
+  template <typename T>
+  static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt,
+                                       OptionTypeFlags flags) {
+    OptionTypeInfo info(offset, OptionType::kCustomizable, ovt,
+                        flags | OptionTypeFlags::kRawPointer);
+    return info.SetParseFunc([](const ConfigOptions& opts,
+                                const std::string& name,
+                                const std::string& value, void* addr) {
+      auto** pointer = static_cast<T**>(addr);
+      if (name == kIdPropName() && value.empty()) {
+        *pointer = nullptr;
+        return Status::OK();
+      } else {
+        return T::CreateFromString(opts, value, pointer);
+      }
+    });
+  }
+
+  template <typename T>
+  static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt,
+                                       OptionTypeFlags flags,
+                                       const SerializeFunc& serialize_func,
+                                       const EqualsFunc& equals_func) {
+    OptionTypeInfo info(AsCustomRawPtr<T>(offset, ovt, flags));
+    info.SetSerializeFunc(serialize_func);
+    info.SetEqualsFunc(equals_func);
+    return info;
+  }
+
+  OptionTypeInfo& SetParseFunc(const ParseFunc& f) {
+    parse_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetSerializeFunc(const SerializeFunc& f) {
+    serialize_func_ = f;
+    return *this;
+  }
+  OptionTypeInfo& SetEqualsFunc(const EqualsFunc& f) {
+    equals_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetPrepareFunc(const PrepareFunc& f) {
+    prepare_func_ = f;
+    return *this;
+  }
+
+  OptionTypeInfo& SetValidateFunc(const ValidateFunc& f) {
+    validate_func_ = f;
+    return *this;
+  }
+
+  bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; }
+
+  bool IsEditable(const ConfigOptions& opts) const {
+    if (opts.mutable_options_only) {
+      return IsMutable();
+    } else {
+      return true;
+    }
+  }
+  bool IsMutable() const { return IsEnabled(OptionTypeFlags::kMutable); }
+
+  bool IsDeprecated() const {
+    return IsEnabled(OptionVerificationType::kDeprecated);
+  }
+
+  // Returns true if the option is marked as an Alias.
+  // Aliases are valid options that are parsed but are not converted to strings
+  // or compared.
+  bool IsAlias() const { return IsEnabled(OptionVerificationType::kAlias); }
+
+  bool IsEnabled(OptionVerificationType ovf) const {
+    return verification_ == ovf;
+  }
+
+  // Returns the sanity level for comparing the option.
+  // If the options should not be compared, returns None
+  // If the option has a compare flag, returns it.
+  // Otherwise, returns "exact"
+  ConfigOptions::SanityLevel GetSanityLevel() const {
+    if (IsDeprecated() || IsAlias()) {
+      return ConfigOptions::SanityLevel::kSanityLevelNone;
+    } else {
+      auto match = (flags_ & OptionTypeFlags::kCompareExact);
+      if (match == OptionTypeFlags::kCompareDefault) {
+        return ConfigOptions::SanityLevel::kSanityLevelExactMatch;
+      } else {
+        return (ConfigOptions::SanityLevel)match;
+      }
+    }
+  }
+
+  // Returns true if the option should be serialized.
+  // Options should be serialized if the are not deprecated, aliases,
+  // or marked as "Don't Serialize".
+  bool ShouldSerialize() const {
+    if (IsDeprecated() || IsAlias()) {
+      return false;
+    } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
+  bool ShouldPrepare() const {
+    if (IsDeprecated() || IsAlias()) {
+      return false;
+    } else if (IsEnabled(OptionTypeFlags::kDontPrepare)) {
+      return false;
+    } else {
+      return (prepare_func_ != nullptr || IsConfigurable());
+    }
+  }
+
+  bool ShouldValidate() const {
+    if (IsDeprecated() || IsAlias()) {
+      return false;
+    } else {
+      return (validate_func_ != nullptr || IsConfigurable());
+    }
+  }
+
+  // Returns true if the option is allowed to be null.
+  // Options can be null if the verification type is allow from null
+  // or if the flags specify allow null.
+  bool CanBeNull() const {
+    return (IsEnabled(OptionTypeFlags::kAllowNull) ||
+            IsEnabled(OptionVerificationType::kByNameAllowNull) ||
+            IsEnabled(OptionVerificationType::kByNameAllowFromNull));
+  }
+
+  bool IsSharedPtr() const { return IsEnabled(OptionTypeFlags::kShared); }
+
+  bool IsUniquePtr() const { return IsEnabled(OptionTypeFlags::kUnique); }
+
+  bool IsRawPtr() const { return IsEnabled(OptionTypeFlags::kRawPointer); }
+
+  bool IsByName() const {
+    return (verification_ == OptionVerificationType::kByName ||
+            verification_ == OptionVerificationType::kByNameAllowNull ||
+            verification_ == OptionVerificationType::kByNameAllowFromNull);
+  }
+
+  bool IsStruct() const { return (type_ == OptionType::kStruct); }
+
+  bool IsConfigurable() const {
+    return (type_ == OptionType::kConfigurable ||
+            type_ == OptionType::kCustomizable);
+  }
+
+  bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); }
+
+  inline const void* GetOffset(const void* base) const {
+    return static_cast<const char*>(base) + offset_;
+  }
+
+  inline void* GetOffset(void* base) const {
+    return static_cast<char*>(base) + offset_;
+  }
+
+  template <typename T>
+  const T* GetOffsetAs(const void* base) const {
+    const void* addr = GetOffset(base);
+    return static_cast<const T*>(addr);
+  }
+
+  template <typename T>
+  T* GetOffsetAs(void* base) const {
+    void* addr = GetOffset(base);
+    return static_cast<T*>(addr);
+  }
+
+  // Returns the underlying pointer for the type at base_addr
+  // The value returned is the underlying "raw" pointer, offset from base.
+  template <typename T>
+  const T* AsRawPointer(const void* const base_addr) const {
+    if (base_addr == nullptr) {
+      return nullptr;
+    }
+    if (IsUniquePtr()) {
+      const auto ptr = GetOffsetAs<std::unique_ptr<T>>(base_addr);
+      return ptr->get();
+    } else if (IsSharedPtr()) {
+      const auto ptr = GetOffsetAs<std::shared_ptr<T>>(base_addr);
+      return ptr->get();
+    } else if (IsRawPtr()) {
+      const T* const* ptr = GetOffsetAs<T* const>(base_addr);
+      return *ptr;
+    } else {
+      return GetOffsetAs<T>(base_addr);
+    }
+  }
+
+  // Returns the underlying pointer for the type at base_addr
+  // The value returned is the underlying "raw" pointer, offset from base.
+  template <typename T>
+  T* AsRawPointer(void* base_addr) const {
+    if (base_addr == nullptr) {
+      return nullptr;
+    }
+    if (IsUniquePtr()) {
+      auto ptr = GetOffsetAs<std::unique_ptr<T>>(base_addr);
+      return ptr->get();
+    } else if (IsSharedPtr()) {
+      auto ptr = GetOffsetAs<std::shared_ptr<T>>(base_addr);
+      return ptr->get();
+    } else if (IsRawPtr()) {
+      auto ptr = GetOffsetAs<T*>(base_addr);
+      return *ptr;
+    } else {
+      return GetOffsetAs<T>(base_addr);
+    }
+  }
+
+  // Parses the option in "opt_value" according to the rules of this class
+  // and updates the value at "opt_ptr".
+  // On success, Status::OK() is returned.  On failure:
+  // NotFound means the opt_name is not valid for this option
+  // NotSupported means we do not know how to parse the value for this option
+  // InvalidArgument means the opt_value is not valid for this option.
+  Status Parse(const ConfigOptions& config_options, const std::string& opt_name,
+               const std::string& opt_value, void* const opt_ptr) const;
+
+  // Serializes the option in "opt_addr" according to the rules of this class
+  // into the value at "opt_value".
+  Status Serialize(const ConfigOptions& config_options,
+                   const std::string& opt_name, const void* const opt_ptr,
+                   std::string* opt_value) const;
+
+  // Compares the "addr1" and "addr2" values according to the rules of this
+  // class and returns true if they match.  On a failed match, mismatch is the
+  // name of the option that failed to match.
+  bool AreEqual(const ConfigOptions& config_options,
+                const std::string& opt_name, const void* const addr1,
+                const void* const addr2, std::string* mismatch) const;
+
+  // Used to override the match rules for "ByName" options.
+  bool AreEqualByName(const ConfigOptions& config_options,
+                      const std::string& opt_name, const void* const this_ptr,
+                      const void* const that_ptr) const;
+  bool AreEqualByName(const ConfigOptions& config_options,
+                      const std::string& opt_name, const void* const this_ptr,
+                      const std::string& that_value) const;
+
+  Status Prepare(const ConfigOptions& config_options, const std::string& name,
+                 void* opt_ptr) const;
+  Status Validate(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts,
+                  const std::string& name, const void* opt_ptr) const;
+
+  // Parses the input opts_map according to the type_map for the opt_addr
+  // For each name-value pair in opts_map, find the corresponding name in
+  // type_map If the name is found:
+  //    - set the corresponding value in opt_addr, returning the status on
+  //    failure;
+  // If the name is not found:
+  //    - If unused is specified, add the name-value to unused and continue
+  //    - If ingore_unknown_options is false, return NotFound
+  // Returns OK if all options were either:
+  //    - Successfully set
+  //    - options were not found and ignore_unknown_options=true
+  //    - options were not found and unused was specified
+  // Note that this method is much less sophisticated than the comparable
+  // Configurable::Configure methods.  For example, on error, there is no
+  // attempt to return opt_addr to the initial state.  Additionally, there
+  // is no effort to initialize (Configurable::PrepareOptions) the object
+  // on success.  This method should typically only be used for simpler,
+  // standalone structures and not those that contain shared and embedded
+  // objects.
+  static Status ParseType(
+      const ConfigOptions& config_options, const std::string& opts_str,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      void* opt_addr,
+      std::unordered_map<std::string, std::string>* unused = nullptr);
+  static Status ParseType(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opts_map,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      void* opt_addr,
+      std::unordered_map<std::string, std::string>* unused = nullptr);
+
+  // Parses the input value according to the map for the struct at opt_addr
+  // struct_name is the name of the struct option as registered
+  // opt_name is the name of the option being evaluated.  This may
+  // be the whole struct or a sub-element of it, based on struct_name and
+  // opt_name.
+  static Status ParseStruct(
+      const ConfigOptions& config_options, const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* map,
+      const std::string& opt_name, const std::string& value, void* opt_addr);
+
+  // Serializes the values from opt_addr using the rules in type_map.
+  // Returns the serialized form in result.
+  // Returns OK on success or non-OK if some option could not be serialized.
+  static Status SerializeType(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      const void* opt_addr, std::string* value);
+
+  // Serializes the input addr according to the map for the struct to value.
+  // struct_name is the name of the struct option as registered
+  // opt_name is the name of the option being evaluated.  This may
+  // be the whole struct or a sub-element of it
+  static Status SerializeStruct(
+      const ConfigOptions& config_options, const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* map,
+      const std::string& opt_name, const void* opt_addr, std::string* value);
+
+  // Compares the values in this_addr and that_addr using the rules in type_map.
+  // If the values are equal, returns true
+  // If the values are not equal, returns false and sets mismatch to the name
+  // of the first value that did not match.
+  static bool TypesAreEqual(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, OptionTypeInfo>& map,
+      const void* this_addr, const void* that_addr, std::string* mismatch);
+
+  // Compares the input offsets according to the map for the struct and returns
+  // true if they are equivalent, false otherwise.
+  // struct_name is the name of the struct option as registered
+  // opt_name is the name of the option being evaluated.  This may
+  // be the whole struct or a sub-element of it
+  static bool StructsAreEqual(
+      const ConfigOptions& config_options, const std::string& struct_name,
+      const std::unordered_map<std::string, OptionTypeInfo>* map,
+      const std::string& opt_name, const void* this_offset,
+      const void* that_offset, std::string* mismatch);
+
+  // Finds the entry for the opt_name in the opt_map, returning
+  // nullptr if not found.
+  // If found, elem_name will be the name of option to find.
+  // This may be opt_name, or a substring of opt_name.
+  // For "simple" options, opt_name will be equal to elem_name.  Given the
+  // opt_name "opt", elem_name will equal "opt".
+  // For "embedded" options (like structs), elem_name may be opt_name
+  // or a field within the opt_name.  For example, given the struct "struct",
+  // and opt_name of "struct.field", elem_name will be "field"
+  static const OptionTypeInfo* Find(
+      const std::string& opt_name,
+      const std::unordered_map<std::string, OptionTypeInfo>& opt_map,
+      std::string* elem_name);
+
+  // Returns the next token marked by the delimiter from "opts" after start in
+  // token and updates end to point to where that token stops. Delimiters inside
+  // of braces are ignored. Returns OK if a token is found and an error if the
+  // input opts string is mis-formatted.
+  // Given "a=AA;b=BB;" start=2 and delimiter=";", token is "AA" and end points
+  // to "b" Given "{a=A;b=B}", the token would be "a=A;b=B"
+  //
+  // @param opts The string in which to find the next token
+  // @param delimiter The delimiter between tokens
+  // @param start     The position in opts to start looking for the token
+  // @param ed        Returns the end position in opts of the token
+  // @param token     Returns the token
+  // @returns OK if a token was found
+  // @return InvalidArgument if the braces mismatch
+  //          (e.g. "{a={b=c;}" ) -- missing closing brace
+  // @return InvalidArgument if an expected delimiter is not found
+  //        e.g. "{a=b}c=d;" -- missing delimiter before "c"
+  static Status NextToken(const std::string& opts, char delimiter, size_t start,
+                          size_t* end, std::string* token);
+
+  constexpr static const char* kIdPropName() { return "id"; }
+  constexpr static const char* kIdPropSuffix() { return ".id"; }
+
+ private:
+  int offset_;
+
+  // The optional function to convert a string to its representation
+  ParseFunc parse_func_;
+
+  // The optional function to convert a value to its string representation
+  SerializeFunc serialize_func_;
+
+  // The optional function to match two option values
+  EqualsFunc equals_func_;
+
+  PrepareFunc prepare_func_;
+  ValidateFunc validate_func_;
+  OptionType type_;
+  OptionVerificationType verification_;
+  OptionTypeFlags flags_;
+};
+
+// Parses the input value into elements of the result array, which has fixed
+// array size. For example, if the value=1:2:3 and elem_info parses integers,
+// the result array will be {1,2,3}. Array size is defined in the OptionTypeInfo
+// the input value has to match with that.
+// @param config_options Controls how the option value is parsed.
+// @param elem_info Controls how individual tokens in value are parsed
+// @param separator Character separating tokens in values (':' in the above
+// example)
+// @param name      The name associated with this array option
+// @param value     The input string to parse into tokens
+// @param result    Returns the results of parsing value into its elements.
+// @return OK if the value was successfully parse
+// @return InvalidArgument if the value is improperly formed or element number
+//                          doesn't match array size defined in OptionTypeInfo
+//                          or if the token could not be parsed
+// @return NotFound         If the tokenized value contains unknown options for
+// its type
+template <typename T, size_t kSize>
+Status ParseArray(const ConfigOptions& config_options,
+                  const OptionTypeInfo& elem_info, char separator,
+                  const std::string& name, const std::string& value,
+                  std::array<T, kSize>* result) {
+  Status status;
+
+  ConfigOptions copy = config_options;
+  copy.ignore_unsupported_options = false;
+  size_t i = 0, start = 0, end = 0;
+  for (; status.ok() && i < kSize && start < value.size() &&
+         end != std::string::npos;
+       i++, start = end + 1) {
+    std::string token;
+    status = OptionTypeInfo::NextToken(value, separator, start, &end, &token);
+    if (status.ok()) {
+      status = elem_info.Parse(copy, name, token, &((*result)[i]));
+      if (config_options.ignore_unsupported_options &&
+          status.IsNotSupported()) {
+        // If we were ignoring unsupported options and this one should be
+        // ignored, ignore it by setting the status to OK
+        status = Status::OK();
+      }
+    }
+  }
+  if (!status.ok()) {
+    return status;
+  }
+  // make sure the element number matches the array size
+  if (i < kSize) {
+    return Status::InvalidArgument(
+        "Serialized value has less elements than array size", name);
+  }
+  if (start < value.size() && end != std::string::npos) {
+    return Status::InvalidArgument(
+        "Serialized value has more elements than array size", name);
+  }
+  return status;
+}
+
+// Serializes the fixed size input array into its output value.  Elements are
+// separated by the separator character.  This element will convert all of the
+// elements in array into their serialized form, using elem_info to perform the
+// serialization.
+// For example, if the array contains the integers 1,2,3 and elem_info
+// serializes the output would be 1:2:3 for separator ":".
+// @param config_options Controls how the option value is serialized.
+// @param elem_info Controls how individual tokens in value are serialized
+// @param separator Character separating tokens in value (':' in the above
+// example)
+// @param name      The name associated with this array option
+// @param array     The input array to serialize
+// @param value     The output string of serialized options
+// @return OK if the value was successfully parse
+// @return InvalidArgument if the value is improperly formed or if the token
+//                          could not be parsed
+// @return NotFound        If the tokenized value contains unknown options for
+//                          its type
+template <typename T, size_t kSize>
+Status SerializeArray(const ConfigOptions& config_options,
+                      const OptionTypeInfo& elem_info, char separator,
+                      const std::string& name,
+                      const std::array<T, kSize>& array, std::string* value) {
+  std::string result;
+  ConfigOptions embedded = config_options;
+  embedded.delimiter = ";";
+  int printed = 0;
+  for (const auto& elem : array) {
+    std::string elem_str;
+    Status s = elem_info.Serialize(embedded, name, &elem, &elem_str);
+    if (!s.ok()) {
+      return s;
+    } else if (!elem_str.empty()) {
+      if (printed++ > 0) {
+        result += separator;
+      }
+      // If the element contains embedded separators, put it inside of brackets
+      if (elem_str.find(separator) != std::string::npos) {
+        result += "{" + elem_str + "}";
+      } else {
+        result += elem_str;
+      }
+    }
+  }
+  if (result.find("=") != std::string::npos) {
+    *value = "{" + result + "}";
+  } else if (printed > 1 && result.at(0) == '{') {
+    *value = "{" + result + "}";
+  } else {
+    *value = result;
+  }
+  return Status::OK();
+}
+
+// Compares the input arrays array1 and array2 for equality
+// Elements of the array are compared one by one using elem_info to perform the
+// comparison.
+//
+// @param config_options Controls how the arrays are compared.
+// @param elem_info  Controls how individual elements in the arrays are compared
+// @param name          The name associated with this array option
+// @param array1,array2 The arrays to compare.
+// @param mismatch      If the arrays are not equivalent, mismatch will point to
+//                       the first element of the comparison that did not match.
+// @return true         If vec1 and vec2 are "equal", false otherwise
+template <typename T, size_t kSize>
+bool ArraysAreEqual(const ConfigOptions& config_options,
+                    const OptionTypeInfo& elem_info, const std::string& name,
+                    const std::array<T, kSize>& array1,
+                    const std::array<T, kSize>& array2, std::string* mismatch) {
+  assert(array1.size() == kSize);
+  assert(array2.size() == kSize);
+  for (size_t i = 0; i < kSize; ++i) {
+    if (!elem_info.AreEqual(config_options, name, &array1[i], &array2[i],
+                            mismatch)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Parses the input value into elements of the result vector.  This method
+// will break the input value into the individual tokens (based on the
+// separator), where each of those tokens will be parsed based on the rules of
+// elem_info. The result vector will be populated with elements based on the
+// input tokens. For example, if the value=1:2:3:4:5 and elem_info parses
+// integers, the result vector will contain the integers 1,2,3,4,5
+// @param config_options Controls how the option value is parsed.
+// @param elem_info Controls how individual tokens in value are parsed
+// @param separator Character separating tokens in values (':' in the above
+// example)
+// @param name      The name associated with this vector option
+// @param value     The input string to parse into tokens
+// @param result    Returns the results of parsing value into its elements.
+// @return OK if the value was successfully parse
+// @return InvalidArgument if the value is improperly formed or if the token
+//                          could not be parsed
+// @return NotFound         If the tokenized value contains unknown options for
+// its type
+template <typename T>
+Status ParseVector(const ConfigOptions& config_options,
+                   const OptionTypeInfo& elem_info, char separator,
+                   const std::string& name, const std::string& value,
+                   std::vector<T>* result) {
+  result->clear();
+  Status status;
+
+  // Turn off ignore_unknown_objects so we can tell if the returned
+  // object is valid or not.
+  ConfigOptions copy = config_options;
+  copy.ignore_unsupported_options = false;
+  for (size_t start = 0, end = 0;
+       status.ok() && start < value.size() && end != std::string::npos;
+       start = end + 1) {
+    std::string token;
+    status = OptionTypeInfo::NextToken(value, separator, start, &end, &token);
+    if (status.ok()) {
+      T elem;
+      status = elem_info.Parse(copy, name, token, &elem);
+      if (status.ok()) {
+        result->emplace_back(elem);
+      } else if (config_options.ignore_unsupported_options &&
+                 status.IsNotSupported()) {
+        // If we were ignoring unsupported options and this one should be
+        // ignored, ignore it by setting the status to OK
+        status = Status::OK();
+      }
+    }
+  }
+  return status;
+}
+
+// Serializes the input vector into its output value.  Elements are
+// separated by the separator character.  This element will convert all of the
+// elements in vec into their serialized form, using elem_info to perform the
+// serialization.
+// For example, if the vec contains the integers 1,2,3,4,5 and elem_info
+// serializes the output would be 1:2:3:4:5 for separator ":".
+// @param config_options Controls how the option value is serialized.
+// @param elem_info Controls how individual tokens in value are serialized
+// @param separator Character separating tokens in value (':' in the above
+// example)
+// @param name      The name associated with this vector option
+// @param vec       The input vector to serialize
+// @param value     The output string of serialized options
+// @return OK if the value was successfully parse
+// @return InvalidArgument if the value is improperly formed or if the token
+//                          could not be parsed
+// @return NotFound         If the tokenized value contains unknown options for
+// its type
+template <typename T>
+Status SerializeVector(const ConfigOptions& config_options,
+                       const OptionTypeInfo& elem_info, char separator,
+                       const std::string& name, const std::vector<T>& vec,
+                       std::string* value) {
+  std::string result;
+  ConfigOptions embedded = config_options;
+  embedded.delimiter = ";";
+  int printed = 0;
+  for (const auto& elem : vec) {
+    std::string elem_str;
+    Status s = elem_info.Serialize(embedded, name, &elem, &elem_str);
+    if (!s.ok()) {
+      return s;
+    } else if (!elem_str.empty()) {
+      if (printed++ > 0) {
+        result += separator;
+      }
+      // If the element contains embedded separators, put it inside of brackets
+      if (elem_str.find(separator) != std::string::npos) {
+        result += "{" + elem_str + "}";
+      } else {
+        result += elem_str;
+      }
+    }
+  }
+  if (result.find("=") != std::string::npos) {
+    *value = "{" + result + "}";
+  } else if (printed > 1 && result.at(0) == '{') {
+    *value = "{" + result + "}";
+  } else {
+    *value = result;
+  }
+  return Status::OK();
+}
+
+// Compares the input vectors vec1 and vec2 for equality
+// If the vectors are the same size, elements of the vectors are compared one by
+// one using elem_info to perform the comparison.
+//
+// @param config_options Controls how the vectors are compared.
+// @param elem_info Controls how individual elements in the vectors are compared
+// @param name      The name associated with this vector option
+// @param vec1,vec2 The vectors to compare.
+// @param mismatch  If the vectors are not equivalent, mismatch will point to
+// the first
+//                  element of the comparison that did not match.
+// @return true     If vec1 and vec2 are "equal", false otherwise
+template <typename T>
+bool VectorsAreEqual(const ConfigOptions& config_options,
+                     const OptionTypeInfo& elem_info, const std::string& name,
+                     const std::vector<T>& vec1, const std::vector<T>& vec2,
+                     std::string* mismatch) {
+  if (vec1.size() != vec2.size()) {
+    *mismatch = name;
+    return false;
+  } else {
+    for (size_t i = 0; i < vec1.size(); ++i) {
+      if (!elem_info.AreEqual(
+              config_options, name, reinterpret_cast<const char*>(&vec1[i]),
+              reinterpret_cast<const char*>(&vec2[i]), mismatch)) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/options_util.h b/src/rocksdb/include/rocksdb/utilities/options_util.h
new file mode 100644
index 000000000..064c087f0
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/options_util.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// This file contains utility functions for RocksDB Options.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+// Constructs the DBOptions and ColumnFamilyDescriptors by loading the
+// latest RocksDB options file stored in the specified rocksdb database.
+//
+// Note that the all the pointer options (except table_factory, which will
+// be described in more details below) will be initialized with the default
+// values.  Developers can further initialize them after this function call.
+// Below is an example list of pointer options which will be initialized
+//
+// * env
+// * memtable_factory
+// * compaction_filter_factory
+// * prefix_extractor
+// * comparator
+// * merge_operator
+// * compaction_filter
+//
+// User can also choose to load customized comparator, env, and/or
+// merge_operator through object registry:
+// * comparator needs to be registered through Registrar<const Comparator>
+// * env needs to be registered through Registrar<Env>
+// * merge operator needs to be registered through
+//     Registrar<std::shared_ptr<MergeOperator>>.
+//
+// For table_factory, this function further supports deserializing
+// BlockBasedTableFactory and its BlockBasedTableOptions except the
+// pointer options of BlockBasedTableOptions (flush_block_policy_factory,
+// block_cache, and block_cache_compressed), which will be initialized with
+// default values.  Developers can further specify these three options by
+// casting the return value of TableFactory::GetOptions() to
+// BlockBasedTableOptions and making necessary changes.
+//
+// ignore_unknown_options can be set to true if you want to ignore options
+// that are from a newer version of the db, essentially for forward
+// compatibility.
+//
+// config_options contains a set of options that controls the processing
+// of the options.  The LoadLatestOptions(ConfigOptions...) should be preferred;
+// the alternative signature may be deprecated in a future release. The
+// equivalent functionality can be achieved by setting the corresponding options
+// in the ConfigOptions parameter.
+//
+// examples/options_file_example.cc demonstrates how to use this function
+// to open a RocksDB instance.
+//
+// @return the function returns an OK status when it went successfully.  If
+//     the specified "dbpath" does not contain any option file, then a
+//     Status::NotFound will be returned.  A return value other than
+//     Status::OK or Status::NotFound indicates there is some error related
+//     to the options file itself.
+//
+// @see LoadOptionsFromFile
+Status LoadLatestOptions(const std::string& dbpath, Env* env,
+                         DBOptions* db_options,
+                         std::vector<ColumnFamilyDescriptor>* cf_descs,
+                         bool ignore_unknown_options = false,
+                         std::shared_ptr<Cache>* cache = {});
+Status LoadLatestOptions(const ConfigOptions& config_options,
+                         const std::string& dbpath, DBOptions* db_options,
+                         std::vector<ColumnFamilyDescriptor>* cf_descs,
+                         std::shared_ptr<Cache>* cache = {});
+
+// Similar to LoadLatestOptions, this function constructs the DBOptions
+// and ColumnFamilyDescriptors based on the specified RocksDB Options file.
+//
+// The LoadOptionsFile(ConfigOptions...) should be preferred;
+// the alternative signature may be deprecated in a future release. The
+// equivalent functionality can be achieved by setting the corresponding
+// options in the ConfigOptions parameter.
+//
+// @see LoadLatestOptions
+Status LoadOptionsFromFile(const std::string& options_file_name, Env* env,
+                           DBOptions* db_options,
+                           std::vector<ColumnFamilyDescriptor>* cf_descs,
+                           bool ignore_unknown_options = false,
+                           std::shared_ptr<Cache>* cache = {});
+Status LoadOptionsFromFile(const ConfigOptions& config_options,
+                           const std::string& options_file_name,
+                           DBOptions* db_options,
+                           std::vector<ColumnFamilyDescriptor>* cf_descs,
+                           std::shared_ptr<Cache>* cache = {});
+
+// Returns the latest options file name under the specified db path.
+Status GetLatestOptionsFileName(const std::string& dbpath, Env* env,
+                                std::string* options_file_name);
+
+// Returns Status::OK if the input DBOptions and ColumnFamilyDescriptors
+// are compatible with the latest options stored in the specified DB path.
+//
+// If the return status is non-ok, it means the specified RocksDB instance
+// might not be correctly opened with the input set of options.  Currently,
+// changing one of the following options will fail the compatibility check:
+//
+// * comparator
+// * prefix_extractor
+// * table_factory
+// * merge_operator
+Status CheckOptionsCompatibility(
+    const std::string& dbpath, Env* env, const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& cf_descs,
+    bool ignore_unknown_options = false);
+Status CheckOptionsCompatibility(
+    const ConfigOptions& config_options, const std::string& dbpath,
+    const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& cf_descs);
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/replayer.h b/src/rocksdb/include/rocksdb/utilities/replayer.h
new file mode 100644
index 000000000..4fdd8d73a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/replayer.h
@@ -0,0 +1,87 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TraceRecord;
+class TraceRecordResult;
+
+struct ReplayOptions {
+  // Number of threads used for replaying. If 0 or 1, replay using
+  // single thread.
+  uint32_t num_threads;
+
+  // Enables fast forwarding a replay by increasing/reducing the delay between
+  // the ingested traces.
+  //   If > 0.0 and < 1.0, slow down the replay by this amount.
+  //   If 1.0, replay the operations at the same rate as in the trace stream.
+  //   If > 1, speed up the replay by this amount.
+  double fast_forward;
+
+  ReplayOptions() : num_threads(1), fast_forward(1.0) {}
+
+  ReplayOptions(uint32_t num_of_threads, double fast_forward_ratio)
+      : num_threads(num_of_threads), fast_forward(fast_forward_ratio) {}
+};
+
+// Replayer helps to replay the captured RocksDB query level operations.
+// The Replayer can either be created from DB::NewReplayer method, or be
+// instantiated via db_bench today, on using "replay" benchmark.
+class Replayer {
+ public:
+  virtual ~Replayer() = default;
+
+  // Make some preparation before replaying the trace. This will also reset the
+  // replayer in order to restart replaying.
+  virtual Status Prepare() = 0;
+
+  // Return the timestamp when the trace recording was started.
+  virtual uint64_t GetHeaderTimestamp() const = 0;
+
+  // Atomically read one trace into a TraceRecord (excluding the header and
+  // footer traces).
+  // Return Status::OK() on success;
+  // Status::Incomplete() if Prepare() was not called or no more available
+  // trace;
+  // Status::NotSupported() if the read trace type is not supported.
+  virtual Status Next(std::unique_ptr<TraceRecord>* record) = 0;
+
+  // Execute one TraceRecord.
+  // Return Status::OK() if the execution was successful. Get/MultiGet traces
+  // will still return Status::OK() even if they got Status::NotFound()
+  // from DB::Get() or DB::MultiGet();
+  // Status::Incomplete() if Prepare() was not called or no more available
+  // trace;
+  // Status::NotSupported() if the operation is not supported;
+  // Otherwise, return the corresponding error status.
+  //
+  // The actual operation execution status and result(s) will be saved in
+  // result. For example, a GetQueryTraceRecord will have its DB::Get() status
+  // and the returned value saved in a SingleValueTraceExecutionResult.
+  virtual Status Execute(const std::unique_ptr<TraceRecord>& record,
+                         std::unique_ptr<TraceRecordResult>* result) = 0;
+
+  // Replay all the traces from the provided trace stream, taking the delay
+  // between the traces into consideration.
+  //
+  // result_callback reports the status of executing a trace record, and the
+  // actual operation execution result (See the description for Execute()).
+  virtual Status Replay(
+      const ReplayOptions& options,
+      const std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)>&
+          result_callback) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/sim_cache.h b/src/rocksdb/include/rocksdb/utilities/sim_cache.h
new file mode 100644
index 000000000..a682c7748
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/sim_cache.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SimCache;
+
+// For instrumentation purpose, use NewSimCache instead of NewLRUCache API
+// NewSimCache is a wrapper function returning a SimCache instance that can
+// have additional interface provided in Simcache class besides Cache interface
+// to predict block cache hit rate without actually allocating the memory. It
+// can help users tune their current block cache size, and determine how
+// efficient they are using the memory.
+//
+// Since GetSimCapacity() returns the capacity for simulation, it differs from
+// actual memory usage, which can be estimated as:
+// sim_capacity * entry_size / (entry_size + block_size),
+// where 76 <= entry_size <= 104,
+// BlockBasedTableOptions.block_size = 4096 by default but is configurable,
+// Therefore, generally the actual memory overhead of SimCache is Less than
+// sim_capacity * 2%
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache,
+                                             size_t sim_capacity,
+                                             int num_shard_bits);
+
+extern std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> sim_cache,
+                                             std::shared_ptr<Cache> cache,
+                                             int num_shard_bits);
+
+class SimCache : public Cache {
+ public:
+  SimCache() {}
+
+  ~SimCache() override {}
+
+  const char* Name() const override { return "SimCache"; }
+
+  // returns the maximum configured capacity of the simcache for simulation
+  virtual size_t GetSimCapacity() const = 0;
+
+  // simcache doesn't provide internal handler reference to user, so always
+  // PinnedUsage = 0 and the behavior will be not exactly consistent the
+  // with real cache.
+  // returns the memory size for the entries residing in the simcache.
+  virtual size_t GetSimUsage() const = 0;
+
+  // sets the maximum configured capacity of the simcache. When the new
+  // capacity is less than the old capacity and the existing usage is
+  // greater than new capacity, the implementation will purge old entries
+  // to fit new capacity.
+  virtual void SetSimCapacity(size_t capacity) = 0;
+
+  // returns the lookup times of simcache
+  virtual uint64_t get_miss_counter() const = 0;
+  // returns the hit times of simcache
+  virtual uint64_t get_hit_counter() const = 0;
+  // reset the lookup and hit counters
+  virtual void reset_counter() = 0;
+  // String representation of the statistics of the simcache
+  virtual std::string ToString() const = 0;
+
+  // Start storing logs of the cache activity (Add/Lookup) into
+  // a file located at activity_log_file, max_logging_size option can be used to
+  // stop logging to the file automatically after reaching a specific size in
+  // bytes, a values of 0 disable this feature
+  virtual Status StartActivityLogging(const std::string& activity_log_file,
+                                      Env* env,
+                                      uint64_t max_logging_size = 0) = 0;
+
+  // Stop cache activity logging if any
+  virtual void StopActivityLogging() = 0;
+
+  // Status of cache logging happening in background
+  virtual Status GetActivityLoggingStatus() = 0;
+
+ private:
+  SimCache(const SimCache&);
+  SimCache& operator=(const SimCache&);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/stackable_db.h b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
new file mode 100644
index 000000000..9b13c3bdf
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
@@ -0,0 +1,566 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/db.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class contains APIs to stack rocksdb wrappers.Eg. Stack TTL over base d
+class StackableDB : public DB {
+ public:
+  // StackableDB take sole ownership of the underlying db.
+  explicit StackableDB(DB* db) : db_(db) {}
+
+  // StackableDB take shared ownership of the underlying db.
+  explicit StackableDB(std::shared_ptr<DB> db)
+      : db_(db.get()), shared_db_ptr_(db) {}
+
+  ~StackableDB() {
+    if (shared_db_ptr_ == nullptr) {
+      delete db_;
+    } else {
+      assert(shared_db_ptr_.get() == db_);
+    }
+    db_ = nullptr;
+  }
+
+  virtual Status Close() override { return db_->Close(); }
+
+  virtual DB* GetBaseDB() { return db_; }
+
+  virtual DB* GetRootDB() override { return db_->GetRootDB(); }
+
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle) override {
+    return db_->CreateColumnFamily(options, column_family_name, handle);
+  }
+
+  virtual Status CreateColumnFamilies(
+      const ColumnFamilyOptions& options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles) override {
+    return db_->CreateColumnFamilies(options, column_family_names, handles);
+  }
+
+  virtual Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles) override {
+    return db_->CreateColumnFamilies(column_families, handles);
+  }
+
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override {
+    return db_->DropColumnFamily(column_family);
+  }
+
+  virtual Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families) override {
+    return db_->DropColumnFamilies(column_families);
+  }
+
+  virtual Status DestroyColumnFamilyHandle(
+      ColumnFamilyHandle* column_family) override {
+    return db_->DestroyColumnFamilyHandle(column_family);
+  }
+
+  using DB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override {
+    return db_->Put(options, column_family, key, val);
+  }
+  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, const Slice& ts, const Slice& val) override {
+    return db_->Put(options, column_family, key, ts, val);
+  }
+
+  using DB::PutEntity;
+  Status PutEntity(const WriteOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   const WideColumns& columns) override {
+    return db_->PutEntity(options, column_family, key, columns);
+  }
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override {
+    return db_->Get(options, column_family, key, value);
+  }
+
+  using DB::GetEntity;
+  Status GetEntity(const ReadOptions& options,
+                   ColumnFamilyHandle* column_family, const Slice& key,
+                   PinnableWideColumns* columns) override {
+    return db_->GetEntity(options, column_family, key, columns);
+  }
+
+  using DB::GetMergeOperands;
+  virtual Status GetMergeOperands(
+      const ReadOptions& options, ColumnFamilyHandle* column_family,
+      const Slice& key, PinnableSlice* slice,
+      GetMergeOperandsOptions* get_merge_operands_options,
+      int* number_of_operands) override {
+    return db_->GetMergeOperands(options, column_family, key, slice,
+                                 get_merge_operands_options,
+                                 number_of_operands);
+  }
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    return db_->MultiGet(options, column_family, keys, values);
+  }
+
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool sorted_input = false) override {
+    return db_->MultiGet(options, column_family, num_keys, keys, values,
+                         statuses, sorted_input);
+  }
+
+  using DB::IngestExternalFile;
+  virtual Status IngestExternalFile(
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& external_files,
+      const IngestExternalFileOptions& options) override {
+    return db_->IngestExternalFile(column_family, external_files, options);
+  }
+
+  using DB::IngestExternalFiles;
+  virtual Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& args) override {
+    return db_->IngestExternalFiles(args);
+  }
+
+  using DB::CreateColumnFamilyWithImport;
+  virtual Status CreateColumnFamilyWithImport(
+      const ColumnFamilyOptions& options, const std::string& column_family_name,
+      const ImportColumnFamilyOptions& import_options,
+      const ExportImportFilesMetaData& metadata,
+      ColumnFamilyHandle** handle) override {
+    return db_->CreateColumnFamilyWithImport(options, column_family_name,
+                                             import_options, metadata, handle);
+  }
+
+  using DB::VerifyFileChecksums;
+  Status VerifyFileChecksums(const ReadOptions& read_opts) override {
+    return db_->VerifyFileChecksums(read_opts);
+  }
+
+  virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); }
+
+  virtual Status VerifyChecksum(const ReadOptions& options) override {
+    return db_->VerifyChecksum(options);
+  }
+
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override {
+    return db_->KeyMayExist(options, column_family, key, value, value_found);
+  }
+
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& wopts,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override {
+    return db_->Delete(wopts, column_family, key);
+  }
+  Status Delete(const WriteOptions& wopts, ColumnFamilyHandle* column_family,
+                const Slice& key, const Slice& ts) override {
+    return db_->Delete(wopts, column_family, key, ts);
+  }
+
+  using DB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& wopts,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) override {
+    return db_->SingleDelete(wopts, column_family, key);
+  }
+  Status SingleDelete(const WriteOptions& wopts,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& ts) override {
+    return db_->SingleDelete(wopts, column_family, key, ts);
+  }
+
+  using DB::DeleteRange;
+  Status DeleteRange(const WriteOptions& wopts,
+                     ColumnFamilyHandle* column_family, const Slice& start_key,
+                     const Slice& end_key) override {
+    return db_->DeleteRange(wopts, column_family, start_key, end_key);
+  }
+
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override {
+    return db_->Merge(options, column_family, key, value);
+  }
+  Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+               const Slice& key, const Slice& ts, const Slice& value) override {
+    return db_->Merge(options, column_family, key, ts, value);
+  }
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override {
+    return db_->Write(opts, updates);
+  }
+
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& opts,
+                                ColumnFamilyHandle* column_family) override {
+    return db_->NewIterator(opts, column_family);
+  }
+
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) override {
+    return db_->NewIterators(options, column_families, iterators);
+  }
+
+  virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+    return db_->ReleaseSnapshot(snapshot);
+  }
+
+  using DB::GetMapProperty;
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) override {
+    return db_->GetProperty(column_family, property, value);
+  }
+  virtual bool GetMapProperty(
+      ColumnFamilyHandle* column_family, const Slice& property,
+      std::map<std::string, std::string>* value) override {
+    return db_->GetMapProperty(column_family, property, value);
+  }
+
+  using DB::GetIntProperty;
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) override {
+    return db_->GetIntProperty(column_family, property, value);
+  }
+
+  using DB::GetAggregatedIntProperty;
+  virtual bool GetAggregatedIntProperty(const Slice& property,
+                                        uint64_t* value) override {
+    return db_->GetAggregatedIntProperty(property, value);
+  }
+
+  using DB::GetApproximateSizes;
+  virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Range* r, int n,
+                                     uint64_t* sizes) override {
+    return db_->GetApproximateSizes(options, column_family, r, n, sizes);
+  }
+
+  using DB::GetApproximateMemTableStats;
+  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+                                           const Range& range,
+                                           uint64_t* const count,
+                                           uint64_t* const size) override {
+    return db_->GetApproximateMemTableStats(column_family, range, count, size);
+  }
+
+  using DB::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) override {
+    return db_->CompactRange(options, column_family, begin, end);
+  }
+
+  using DB::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) override {
+    return db_->CompactFiles(compact_options, column_family, input_file_names,
+                             output_level, output_path_id, output_file_names,
+                             compaction_job_info);
+  }
+
+  virtual Status PauseBackgroundWork() override {
+    return db_->PauseBackgroundWork();
+  }
+  virtual Status ContinueBackgroundWork() override {
+    return db_->ContinueBackgroundWork();
+  }
+
+  virtual Status EnableAutoCompaction(
+      const std::vector<ColumnFamilyHandle*>& column_family_handles) override {
+    return db_->EnableAutoCompaction(column_family_handles);
+  }
+
+  virtual void EnableManualCompaction() override {
+    return db_->EnableManualCompaction();
+  }
+  virtual void DisableManualCompaction() override {
+    return db_->DisableManualCompaction();
+  }
+
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
+    return db_->NumberLevels(column_family);
+  }
+
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(
+      ColumnFamilyHandle* column_family) override {
+    return db_->MaxMemCompactionLevel(column_family);
+  }
+
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(
+      ColumnFamilyHandle* column_family) override {
+    return db_->Level0StopWriteTrigger(column_family);
+  }
+
+  virtual const std::string& GetName() const override { return db_->GetName(); }
+
+  virtual Env* GetEnv() const override { return db_->GetEnv(); }
+
+  virtual FileSystem* GetFileSystem() const override {
+    return db_->GetFileSystem();
+  }
+
+  using DB::GetOptions;
+  virtual Options GetOptions(ColumnFamilyHandle* column_family) const override {
+    return db_->GetOptions(column_family);
+  }
+
+  using DB::GetDBOptions;
+  virtual DBOptions GetDBOptions() const override {
+    return db_->GetDBOptions();
+  }
+
+  using DB::Flush;
+  virtual Status Flush(const FlushOptions& fopts,
+                       ColumnFamilyHandle* column_family) override {
+    return db_->Flush(fopts, column_family);
+  }
+  virtual Status Flush(
+      const FlushOptions& fopts,
+      const std::vector<ColumnFamilyHandle*>& column_families) override {
+    return db_->Flush(fopts, column_families);
+  }
+
+  virtual Status SyncWAL() override { return db_->SyncWAL(); }
+
+  virtual Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); }
+
+  virtual Status LockWAL() override { return db_->LockWAL(); }
+
+  virtual Status UnlockWAL() override { return db_->UnlockWAL(); }
+
+#ifndef ROCKSDB_LITE
+
+  virtual Status DisableFileDeletions() override {
+    return db_->DisableFileDeletions();
+  }
+
+  virtual Status EnableFileDeletions(bool force) override {
+    return db_->EnableFileDeletions(force);
+  }
+
+  virtual void GetLiveFilesMetaData(
+      std::vector<LiveFileMetaData>* metadata) override {
+    db_->GetLiveFilesMetaData(metadata);
+  }
+
+  virtual Status GetLiveFilesChecksumInfo(
+      FileChecksumList* checksum_list) override {
+    return db_->GetLiveFilesChecksumInfo(checksum_list);
+  }
+
+  virtual Status GetLiveFilesStorageInfo(
+      const LiveFilesStorageInfoOptions& opts,
+      std::vector<LiveFileStorageInfo>* files) override {
+    return db_->GetLiveFilesStorageInfo(opts, files);
+  }
+
+  virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+                                       ColumnFamilyMetaData* cf_meta) override {
+    db_->GetColumnFamilyMetaData(column_family, cf_meta);
+  }
+
+  using DB::StartBlockCacheTrace;
+  Status StartBlockCacheTrace(
+      const TraceOptions& trace_options,
+      std::unique_ptr<TraceWriter>&& trace_writer) override {
+    return db_->StartBlockCacheTrace(trace_options, std::move(trace_writer));
+  }
+
+  Status StartBlockCacheTrace(
+      const BlockCacheTraceOptions& options,
+      std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override {
+    return db_->StartBlockCacheTrace(options, std::move(trace_writer));
+  }
+
+  using DB::EndBlockCacheTrace;
+  Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); }
+
+  using DB::StartIOTrace;
+  Status StartIOTrace(const TraceOptions& options,
+                      std::unique_ptr<TraceWriter>&& trace_writer) override {
+    return db_->StartIOTrace(options, std::move(trace_writer));
+  }
+
+  using DB::EndIOTrace;
+  Status EndIOTrace() override { return db_->EndIOTrace(); }
+
+  using DB::StartTrace;
+  Status StartTrace(const TraceOptions& options,
+                    std::unique_ptr<TraceWriter>&& trace_writer) override {
+    return db_->StartTrace(options, std::move(trace_writer));
+  }
+
+  using DB::EndTrace;
+  Status EndTrace() override { return db_->EndTrace(); }
+
+  using DB::NewDefaultReplayer;
+  Status NewDefaultReplayer(const std::vector<ColumnFamilyHandle*>& handles,
+                            std::unique_ptr<TraceReader>&& reader,
+                            std::unique_ptr<Replayer>* replayer) override {
+    return db_->NewDefaultReplayer(handles, std::move(reader), replayer);
+  }
+
+#endif  // ROCKSDB_LITE
+
+  virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs,
+                              bool flush_memtable = true) override {
+    return db_->GetLiveFiles(vec, mfs, flush_memtable);
+  }
+
+  virtual SequenceNumber GetLatestSequenceNumber() const override {
+    return db_->GetLatestSequenceNumber();
+  }
+
+  Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                                  std::string ts_low) override {
+    return db_->IncreaseFullHistoryTsLow(column_family, ts_low);
+  }
+
+  Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+                             std::string* ts_low) override {
+    return db_->GetFullHistoryTsLow(column_family, ts_low);
+  }
+
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    return db_->GetSortedWalFiles(files);
+  }
+
+  virtual Status GetCurrentWalFile(
+      std::unique_ptr<LogFile>* current_log_file) override {
+    return db_->GetCurrentWalFile(current_log_file);
+  }
+
+  virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override {
+    return db_->GetCreationTimeOfOldestFile(creation_time);
+  }
+
+  // WARNING: This API is planned for removal in RocksDB 7.0 since it does not
+  // operate at the proper level of abstraction for a key-value store, and its
+  // contract/restrictions are poorly documented. For example, it returns non-OK
+  // `Status` for non-bottommost files and files undergoing compaction. Since we
+  // do not plan to maintain it, the contract will likely remain underspecified
+  // until its removal. Any user is encouraged to read the implementation
+  // carefully and migrate away from it when possible.
+  virtual Status DeleteFile(std::string name) override {
+    return db_->DeleteFile(name);
+  }
+
+  virtual Status GetDbIdentity(std::string& identity) const override {
+    return db_->GetDbIdentity(identity);
+  }
+
+  virtual Status GetDbSessionId(std::string& session_id) const override {
+    return db_->GetDbSessionId(session_id);
+  }
+
+  using DB::SetOptions;
+  virtual Status SetOptions(ColumnFamilyHandle* column_family_handle,
+                            const std::unordered_map<std::string, std::string>&
+                                new_options) override {
+    return db_->SetOptions(column_family_handle, new_options);
+  }
+
+  virtual Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& new_options)
+      override {
+    return db_->SetDBOptions(new_options);
+  }
+
+  using DB::ResetStats;
+  virtual Status ResetStats() override { return db_->ResetStats(); }
+
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* column_family,
+      TablePropertiesCollection* props) override {
+    return db_->GetPropertiesOfAllTables(column_family, props);
+  }
+
+  using DB::GetPropertiesOfTablesInRange;
+  virtual Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+      TablePropertiesCollection* props) override {
+    return db_->GetPropertiesOfTablesInRange(column_family, range, n, props);
+  }
+
+  virtual Status GetUpdatesSince(
+      SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+      const TransactionLogIterator::ReadOptions& read_options) override {
+    return db_->GetUpdatesSince(seq_number, iter, read_options);
+  }
+
+  virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+                                     const Slice* begin,
+                                     const Slice* end) override {
+    return db_->SuggestCompactRange(column_family, begin, end);
+  }
+
+  virtual Status PromoteL0(ColumnFamilyHandle* column_family,
+                           int target_level) override {
+    return db_->PromoteL0(column_family, target_level);
+  }
+
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+    return db_->DefaultColumnFamily();
+  }
+
+#ifndef ROCKSDB_LITE
+  Status TryCatchUpWithPrimary() override {
+    return db_->TryCatchUpWithPrimary();
+  }
+#endif  // ROCKSDB_LITE
+
+ protected:
+  DB* db_;
+  std::shared_ptr<DB> shared_db_ptr_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h
new file mode 100644
index 000000000..f3a4ba005
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h
@@ -0,0 +1,90 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A factory of a table property collector that marks a SST
+// file as need-compaction when it observe at least "D" deletion
+// entries in any "N" consecutive entries or the ratio of tombstone
+// entries in the whole file >= the specified deletion ratio.
+class CompactOnDeletionCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  // A factory of a table property collector that marks a SST
+  // file as need-compaction when it observe at least "D" deletion
+  // entries in any "N" consecutive entries, or the ratio of tombstone
+  // entries >= deletion_ratio.
+  //
+  // @param sliding_window_size "N"
+  // @param deletion_trigger "D"
+  // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction
+  //     based on deletion ratio.
+  CompactOnDeletionCollectorFactory(size_t sliding_window_size,
+                                    size_t deletion_trigger,
+                                    double deletion_ratio);
+
+  ~CompactOnDeletionCollectorFactory() {}
+
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) override;
+
+  // Change the value of sliding_window_size "N"
+  // Setting it to 0 disables the delete triggered compaction
+  void SetWindowSize(size_t sliding_window_size) {
+    sliding_window_size_.store(sliding_window_size);
+  }
+  size_t GetWindowSize() const { return sliding_window_size_.load(); }
+
+  // Change the value of deletion_trigger "D"
+  void SetDeletionTrigger(size_t deletion_trigger) {
+    deletion_trigger_.store(deletion_trigger);
+  }
+
+  size_t GetDeletionTrigger() const { return deletion_trigger_.load(); }
+  // Change deletion ratio.
+  // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction
+  //     based on deletion ratio.
+  void SetDeletionRatio(double deletion_ratio) {
+    deletion_ratio_.store(deletion_ratio);
+  }
+
+  double GetDeletionRatio() const { return deletion_ratio_.load(); }
+  static const char* kClassName() { return "CompactOnDeletionCollector"; }
+  const char* Name() const override { return kClassName(); }
+
+  std::string ToString() const override;
+
+ private:
+  std::atomic<size_t> sliding_window_size_;
+  std::atomic<size_t> deletion_trigger_;
+  std::atomic<double> deletion_ratio_;
+};
+
+// Creates a factory of a table property collector that marks a SST
+// file as need-compaction when it observe at least "D" deletion
+// entries in any "N" consecutive entries, or the ratio of tombstone
+// entries >= deletion_ratio.
+//
+// @param sliding_window_size "N". Note that this number will be
+//     round up to the smallest multiple of 128 that is no less
+//     than the specified size.
+// @param deletion_trigger "D".  Note that even when "N" is changed,
+//     the specified number for "D" will not be changed.
+// @param deletion_ratio, if <= 0 or > 1, disable triggering compaction
+//     based on deletion ratio. Disabled by default.
+extern std::shared_ptr<CompactOnDeletionCollectorFactory>
+NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
+                                     size_t deletion_trigger,
+                                     double deletion_ratio = 0);
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction.h b/src/rocksdb/include/rocksdb/utilities/transaction.h
new file mode 100644
index 000000000..1d2822988
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction.h
@@ -0,0 +1,686 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator;
+class TransactionDB;
+class WriteBatchWithIndex;
+
+using TransactionName = std::string;
+
+using TransactionID = uint64_t;
+
+using TxnTimestamp = uint64_t;
+
+constexpr TxnTimestamp kMaxTxnTimestamp =
+    std::numeric_limits<TxnTimestamp>::max();
+
+/*
+  class Endpoint allows to define prefix ranges.
+
+  Prefix ranges are introduced below.
+
+  == Basic Ranges ==
+  Let's start from basic ranges. Key Comparator defines ordering of rowkeys.
+  Then, one can specify finite closed ranges by just providing rowkeys of their
+  endpoints:
+
+    lower_endpoint <= X <= upper_endpoint
+
+  However our goal is to provide a richer set of endpoints. Read on.
+
+  == Lexicographic ordering ==
+  A lexicographic (or dictionary) ordering satisfies these criteria: If there
+  are two keys in form
+    key_a = {prefix_a, suffix_a}
+    key_b = {prefix_b, suffix_b}
+  and
+    prefix_a < prefix_b
+  then
+    key_a < key_b.
+
+  == Prefix ranges ==
+  With lexicographic ordering, one may want to define ranges in form
+
+     "prefix is $PREFIX"
+
+  which translates to a range in form
+
+    {$PREFIX, -infinity} < X < {$PREFIX, +infinity}
+
+  where -infinity will compare less than any possible suffix, and +infinity
+  will compare as greater than any possible suffix.
+
+  class Endpoint allows to define these kind of rangtes.
+
+  == Notes ==
+  BytewiseComparator and ReverseBytewiseComparator produce lexicographic
+  ordering.
+
+  The row comparison function is able to compare key prefixes. If the data
+  domain includes keys A and B, then the comparison function is able to compare
+  equal-length prefixes:
+
+    min_len= min(byte_length(A), byte_length(B));
+    cmp(Slice(A, min_len), Slice(B, min_len));  // this call is valid
+
+  == Other options ==
+  As far as MyRocks is concerned, the alternative to prefix ranges would be to
+  support both open (non-inclusive) and closed (inclusive) range endpoints.
+*/
+
+class Endpoint {
+ public:
+  Slice slice;
+
+  /*
+    true  : the key has a "+infinity" suffix. A suffix that would compare as
+            greater than any other suffix
+    false : otherwise
+  */
+  bool inf_suffix;
+
+  explicit Endpoint(const Slice& slice_arg, bool inf_suffix_arg = false)
+      : slice(slice_arg), inf_suffix(inf_suffix_arg) {}
+
+  explicit Endpoint(const char* s, bool inf_suffix_arg = false)
+      : slice(s), inf_suffix(inf_suffix_arg) {}
+
+  Endpoint(const char* s, size_t size, bool inf_suffix_arg = false)
+      : slice(s, size), inf_suffix(inf_suffix_arg) {}
+
+  Endpoint() : inf_suffix(false) {}
+};
+
+// Provides notification to the caller of SetSnapshotOnNextOperation when
+// the actual snapshot gets created
+class TransactionNotifier {
+ public:
+  virtual ~TransactionNotifier() {}
+
+  // Implement this method to receive notification when a snapshot is
+  // requested via SetSnapshotOnNextOperation.
+  // Do not take exclusive ownership of `newSnapshot` because it is shared with
+  // the underlying transaction.
+  virtual void SnapshotCreated(const Snapshot* newSnapshot) = 0;
+};
+
+// Provides BEGIN/COMMIT/ROLLBACK transactions.
+//
+// To use transactions, you must first create either an OptimisticTransactionDB
+// or a TransactionDB.  See examples/[optimistic_]transaction_example.cc for
+// more information.
+//
+// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
+//
+// It is up to the caller to synchronize access to this object.
+//
+// See examples/transaction_example.cc for some simple examples.
+//
+// TODO(agiardullo): Not yet implemented
+//  -PerfContext statistics
+//  -Support for using Transactions with DBWithTTL
+class Transaction {
+ public:
+  // No copying allowed
+  Transaction(const Transaction&) = delete;
+  void operator=(const Transaction&) = delete;
+
+  virtual ~Transaction() {}
+
+  // If a transaction has a snapshot set, the transaction will ensure that
+  // any keys successfully written(or fetched via GetForUpdate()) have not
+  // been modified outside of this transaction since the time the snapshot was
+  // set.
+  // If a snapshot has not been set, the transaction guarantees that keys have
+  // not been modified since the time each key was first written (or fetched via
+  // GetForUpdate()).
+  //
+  // Using SetSnapshot() will provide stricter isolation guarantees at the
+  // expense of potentially more transaction failures due to conflicts with
+  // other writes.
+  //
+  // Calling SetSnapshot() has no effect on keys written before this function
+  // has been called.
+  //
+  // SetSnapshot() may be called multiple times if you would like to change
+  // the snapshot used for different operations in this transaction.
+  //
+  // Calling SetSnapshot will not affect the version of Data returned by Get()
+  // methods.  See Transaction::Get() for more details.
+  virtual void SetSnapshot() = 0;
+
+  // Similar to SetSnapshot(), but will not change the current snapshot
+  // until Put/Merge/Delete/GetForUpdate/MultigetForUpdate is called.
+  // By calling this function, the transaction will essentially call
+  // SetSnapshot() for you right before performing the next write/GetForUpdate.
+  //
+  // Calling SetSnapshotOnNextOperation() will not affect what snapshot is
+  // returned by GetSnapshot() until the next write/GetForUpdate is executed.
+  //
+  // When the snapshot is created the notifier's SnapshotCreated method will
+  // be called so that the caller can get access to the snapshot.
+  //
+  // This is an optimization to reduce the likelihood of conflicts that
+  // could occur in between the time SetSnapshot() is called and the first
+  // write/GetForUpdate operation.  Eg, this prevents the following
+  // race-condition:
+  //
+  //   txn1->SetSnapshot();
+  //                             txn2->Put("A", ...);
+  //                             txn2->Commit();
+  //   txn1->GetForUpdate(opts, "A", ...);  // FAIL!
+  //
+  // WriteCommittedTxn only: a new snapshot will be taken upon next operation,
+  // and next operation can be a Commit.
+  // TODO(yanqin) remove the "write-committed only" limitation.
+  virtual void SetSnapshotOnNextOperation(
+      std::shared_ptr<TransactionNotifier> notifier = nullptr) = 0;
+
+  // Returns the Snapshot created by the last call to SetSnapshot().
+  //
+  // REQUIRED: The returned Snapshot is only valid up until the next time
+  // SetSnapshot()/SetSnapshotOnNextSavePoint() is called, ClearSnapshot()
+  // is called, or the Transaction is deleted.
+  virtual const Snapshot* GetSnapshot() const = 0;
+
+  // Returns the Snapshot created by the last call to SetSnapshot().
+  // The returned snapshot can outlive the transaction.
+  virtual std::shared_ptr<const Snapshot> GetTimestampedSnapshot() const = 0;
+
+  // Clears the current snapshot (i.e. no snapshot will be 'set')
+  //
+  // This removes any snapshot that currently exists or is set to be created
+  // on the next update operation (SetSnapshotOnNextOperation).
+  //
+  // Calling ClearSnapshot() has no effect on keys written before this function
+  // has been called.
+  //
+  // If a reference to a snapshot was retrieved via GetSnapshot(), it will no
+  // longer be valid and should be discarded after a call to ClearSnapshot().
+  virtual void ClearSnapshot() = 0;
+
+  // Prepare the current transaction for 2PC
+  virtual Status Prepare() = 0;
+
+  // Write all batched keys to the db atomically.
+  //
+  // Returns OK on success.
+  //
+  // May return any error status that could be returned by DB:Write().
+  //
+  // If this transaction was created by an OptimisticTransactionDB(),
+  // Status::Busy() may be returned if the transaction could not guarantee
+  // that there are no write conflicts.  Status::TryAgain() may be returned
+  // if the memtable history size is not large enough
+  //  (See max_write_buffer_size_to_maintain).
+  //
+  // If this transaction was created by a TransactionDB(), Status::Expired()
+  // may be returned if this transaction has lived for longer than
+  // TransactionOptions.expiration. Status::TxnNotPrepared() may be returned if
+  // TransactionOptions.skip_prepare is false and Prepare is not called on this
+  // transaction before Commit.
+  virtual Status Commit() = 0;
+
+  // In addition to Commit(), also creates a snapshot of the db after all
+  // writes by this txn are visible to other readers.
+  // Caller is responsible for ensuring that
+  // snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
+  // in which snapshot1 and snapshot2 are created by this API.
+  //
+  // Currently only supported by WriteCommittedTxn. Calling this method on
+  // other types of transactions will return non-ok Status resulting from
+  // Commit() or a `NotSupported` error.
+  // This method returns OK if and only if the transaction successfully
+  // commits. It is possible that transaction commits successfully but fails to
+  // create a timestamped snapshot. Therefore, the caller should check that the
+  // snapshot is created.
+  // notifier will be notified upon next snapshot creation. Nullable.
+  // ret non-null output argument storing a shared_ptr to the newly created
+  // snapshot.
+  Status CommitAndTryCreateSnapshot(
+      std::shared_ptr<TransactionNotifier> notifier =
+          std::shared_ptr<TransactionNotifier>(),
+      TxnTimestamp ts = kMaxTxnTimestamp,
+      std::shared_ptr<const Snapshot>* snapshot = nullptr);
+
+  // Discard all batched writes in this transaction.
+  virtual Status Rollback() = 0;
+
+  // Records the state of the transaction for future calls to
+  // RollbackToSavePoint().  May be called multiple times to set multiple save
+  // points.
+  virtual void SetSavePoint() = 0;
+
+  // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
+  // since the most recent call to SetSavePoint() and removes the most recent
+  // SetSavePoint().
+  // If there is no previous call to SetSavePoint(), returns Status::NotFound()
+  virtual Status RollbackToSavePoint() = 0;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  virtual Status PopSavePoint() = 0;
+
+  // This function is similar to DB::Get() except it will also read pending
+  // changes in this transaction.  Currently, this function will return
+  // Status::MergeInProgress if the most recent write to the queried key in
+  // this batch is a Merge.
+  //
+  // If read_options.snapshot is not set, the current version of the key will
+  // be read.  Calling SetSnapshot() does not affect the version of the data
+  // returned.
+  //
+  // Note that setting read_options.snapshot will affect what is read from the
+  // DB but will NOT change which keys are read from this transaction (the keys
+  // in this transaction do not yet belong to any snapshot and will be fetched
+  // regardless).
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) = 0;
+
+  // An overload of the above method that receives a PinnableSlice
+  // For backward compatibility a default implementation is provided
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* pinnable_val) {
+    assert(pinnable_val != nullptr);
+    auto s = Get(options, column_family, key, pinnable_val->GetSelf());
+    pinnable_val->PinSelf();
+    return s;
+  }
+
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value) = 0;
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     PinnableSlice* pinnable_val) {
+    assert(pinnable_val != nullptr);
+    auto s = Get(options, key, pinnable_val->GetSelf());
+    pinnable_val->PinSelf();
+    return s;
+  }
+
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) = 0;
+
+  // Batched version of MultiGet - see DBImpl::MultiGet(). Sub-classes are
+  // expected to override this with an implementation that calls
+  // DBImpl::MultiGet()
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool /*sorted_input*/ = false) {
+    for (size_t i = 0; i < num_keys; ++i) {
+      statuses[i] = Get(options, column_family, keys[i], &values[i]);
+    }
+  }
+
+  // Read this key and ensure that this transaction will only
+  // be able to be committed if this key is not written outside this
+  // transaction after it has first been read (or after the snapshot if a
+  // snapshot is set in this transaction and do_validate is true). If
+  // do_validate is false, ReadOptions::snapshot is expected to be nullptr so
+  // that GetForUpdate returns the latest committed value. The transaction
+  // behavior is the same regardless of whether the key exists or not.
+  //
+  // Note: Currently, this function will return Status::MergeInProgress
+  // if the most recent write to the queried key in this batch is a Merge.
+  //
+  // The values returned by this function are similar to Transaction::Get().
+  // If value==nullptr, then this function will not read any data, but will
+  // still ensure that this key cannot be written to by outside of this
+  // transaction.
+  //
+  // If this transaction was created by an OptimisticTransaction, GetForUpdate()
+  // could cause commit() to fail.  Otherwise, it could return any error
+  // that could be returned by DB::Get().
+  //
+  // If this transaction was created by a TransactionDB, it can return
+  // Status::OK() on success,
+  // Status::Busy() if there is a write conflict,
+  // Status::TimedOut() if a lock could not be acquired,
+  // Status::TryAgain() if the memtable history size is not large enough
+  //  (See max_write_buffer_size_to_maintain)
+  // Status::MergeInProgress() if merge operations cannot be resolved.
+  // or other errors if this key could not be read.
+  virtual Status GetForUpdate(const ReadOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key, std::string* value,
+                              bool exclusive = true,
+                              const bool do_validate = true) = 0;
+
+  // An overload of the above method that receives a PinnableSlice
+  // For backward compatibility a default implementation is provided
+  virtual Status GetForUpdate(const ReadOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key, PinnableSlice* pinnable_val,
+                              bool exclusive = true,
+                              const bool do_validate = true) {
+    if (pinnable_val == nullptr) {
+      std::string* null_str = nullptr;
+      return GetForUpdate(options, column_family, key, null_str, exclusive,
+                          do_validate);
+    } else {
+      auto s = GetForUpdate(options, column_family, key,
+                            pinnable_val->GetSelf(), exclusive, do_validate);
+      pinnable_val->PinSelf();
+      return s;
+    }
+  }
+
+  // Get a range lock on [start_endpoint; end_endpoint].
+  virtual Status GetRangeLock(ColumnFamilyHandle*, const Endpoint&,
+                              const Endpoint&) {
+    return Status::NotSupported();
+  }
+
+  virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
+                              std::string* value, bool exclusive = true,
+                              const bool do_validate = true) = 0;
+
+  virtual std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+  virtual std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) = 0;
+
+  // Returns an iterator that will iterate on all keys in the default
+  // column family including both keys in the DB and uncommitted keys in this
+  // transaction.
+  //
+  // Setting read_options.snapshot will affect what is read from the
+  // DB but will NOT change which keys are read from this transaction (the keys
+  // in this transaction do not yet belong to any snapshot and will be fetched
+  // regardless).
+  //
+  // Caller is responsible for deleting the returned Iterator.
+  //
+  // The returned iterator is only valid until Commit(), Rollback(), or
+  // RollbackToSavePoint() is called.
+  virtual Iterator* GetIterator(const ReadOptions& read_options) = 0;
+
+  virtual Iterator* GetIterator(const ReadOptions& read_options,
+                                ColumnFamilyHandle* column_family) = 0;
+
+  // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
+  // functions in WriteBatch, but will also do conflict checking on the
+  // keys being written.
+  //
+  // assume_tracked=true expects the key be already tracked. More
+  // specifically, it means the the key was previous tracked in the same
+  // savepoint, with the same exclusive flag, and at a lower sequence number.
+  // If valid then it skips ValidateSnapshot.  Returns error otherwise.
+  //
+  // If this Transaction was created on an OptimisticTransactionDB, these
+  // functions should always return Status::OK().
+  //
+  // If this Transaction was created on a TransactionDB, the status returned
+  // can be:
+  // Status::OK() on success,
+  // Status::Busy() if there is a write conflict,
+  // Status::TimedOut() if a lock could not be acquired,
+  // Status::TryAgain() if the memtable history size is not large enough
+  //  (See max_write_buffer_size_to_maintain)
+  // or other errors on unexpected failures.
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value, const bool assume_tracked = false) = 0;
+  virtual Status Put(const Slice& key, const Slice& value) = 0;
+  virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+                     const SliceParts& value,
+                     const bool assume_tracked = false) = 0;
+  virtual Status Put(const SliceParts& key, const SliceParts& value) = 0;
+
+  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value,
+                       const bool assume_tracked = false) = 0;
+  virtual Status Merge(const Slice& key, const Slice& value) = 0;
+
+  virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                        const bool assume_tracked = false) = 0;
+  virtual Status Delete(const Slice& key) = 0;
+  virtual Status Delete(ColumnFamilyHandle* column_family,
+                        const SliceParts& key,
+                        const bool assume_tracked = false) = 0;
+  virtual Status Delete(const SliceParts& key) = 0;
+
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const Slice& key,
+                              const bool assume_tracked = false) = 0;
+  virtual Status SingleDelete(const Slice& key) = 0;
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const SliceParts& key,
+                              const bool assume_tracked = false) = 0;
+  virtual Status SingleDelete(const SliceParts& key) = 0;
+
+  // PutUntracked() will write a Put to the batch of operations to be committed
+  // in this transaction.  This write will only happen if this transaction
+  // gets committed successfully.  But unlike Transaction::Put(),
+  // no conflict checking will be done for this key.
+  //
+  // If this Transaction was created on a PessimisticTransactionDB, this
+  // function will still acquire locks necessary to make sure this write doesn't
+  // cause conflicts in other transactions and may return Status::Busy().
+  virtual Status PutUntracked(ColumnFamilyHandle* column_family,
+                              const Slice& key, const Slice& value) = 0;
+  virtual Status PutUntracked(const Slice& key, const Slice& value) = 0;
+  virtual Status PutUntracked(ColumnFamilyHandle* column_family,
+                              const SliceParts& key,
+                              const SliceParts& value) = 0;
+  virtual Status PutUntracked(const SliceParts& key,
+                              const SliceParts& value) = 0;
+
+  virtual Status MergeUntracked(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& value) = 0;
+  virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0;
+
+  virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                                 const Slice& key) = 0;
+
+  virtual Status DeleteUntracked(const Slice& key) = 0;
+  virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                                 const SliceParts& key) = 0;
+  virtual Status DeleteUntracked(const SliceParts& key) = 0;
+  virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
+                                       const Slice& key) = 0;
+
+  virtual Status SingleDeleteUntracked(const Slice& key) = 0;
+
+  // Similar to WriteBatch::PutLogData
+  virtual void PutLogData(const Slice& blob) = 0;
+
+  // By default, all Put/Merge/Delete operations will be indexed in the
+  // transaction so that Get/GetForUpdate/GetIterator can search for these
+  // keys.
+  //
+  // If the caller does not want to fetch the keys about to be written,
+  // they may want to avoid indexing as a performance optimization.
+  // Calling DisableIndexing() will turn off indexing for all future
+  // Put/Merge/Delete operations until EnableIndexing() is called.
+  //
+  // If a key is Put/Merge/Deleted after DisableIndexing is called and then
+  // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
+  // undefined.
+  virtual void DisableIndexing() = 0;
+  virtual void EnableIndexing() = 0;
+
+  // Returns the number of distinct Keys being tracked by this transaction.
+  // If this transaction was created by a TransactionDB, this is the number of
+  // keys that are currently locked by this transaction.
+  // If this transaction was created by an OptimisticTransactionDB, this is the
+  // number of keys that need to be checked for conflicts at commit time.
+  virtual uint64_t GetNumKeys() const = 0;
+
+  // Returns the number of Puts/Deletes/Merges that have been applied to this
+  // transaction so far.
+  virtual uint64_t GetNumPuts() const = 0;
+  virtual uint64_t GetNumDeletes() const = 0;
+  virtual uint64_t GetNumMerges() const = 0;
+
+  // Returns the elapsed time in milliseconds since this Transaction began.
+  virtual uint64_t GetElapsedTime() const = 0;
+
+  // Fetch the underlying write batch that contains all pending changes to be
+  // committed.
+  //
+  // Note:  You should not write or delete anything from the batch directly and
+  // should only use the functions in the Transaction class to
+  // write to this transaction.
+  virtual WriteBatchWithIndex* GetWriteBatch() = 0;
+
+  // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
+  // this transaction.
+  // Has no effect on OptimisticTransactions.
+  virtual void SetLockTimeout(int64_t timeout) = 0;
+
+  // Return the WriteOptions that will be used during Commit()
+  virtual WriteOptions* GetWriteOptions() = 0;
+
+  // Reset the WriteOptions that will be used during Commit().
+  virtual void SetWriteOptions(const WriteOptions& write_options) = 0;
+
+  // If this key was previously fetched in this transaction using
+  // GetForUpdate/MultigetForUpdate(), calling UndoGetForUpdate will tell
+  // the transaction that it no longer needs to do any conflict checking
+  // for this key.
+  //
+  // If a key has been fetched N times via GetForUpdate/MultigetForUpdate(),
+  // then UndoGetForUpdate will only have an effect if it is also called N
+  // times.  If this key has been written to in this transaction,
+  // UndoGetForUpdate() will have no effect.
+  //
+  // If SetSavePoint() has been called after the GetForUpdate(),
+  // UndoGetForUpdate() will not have any effect.
+  //
+  // If this Transaction was created by an OptimisticTransactionDB,
+  // calling UndoGetForUpdate can affect whether this key is conflict checked
+  // at commit time.
+  // If this Transaction was created by a TransactionDB,
+  // calling UndoGetForUpdate may release any held locks for this key.
+  virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family,
+                                const Slice& key) = 0;
+  virtual void UndoGetForUpdate(const Slice& key) = 0;
+
+  virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) = 0;
+
+  // Note: data in the commit-time-write-batch bypasses concurrency control,
+  // thus should be used with great caution.
+  // For write-prepared/write-unprepared transactions,
+  // GetCommitTimeWriteBatch() can be used only if the transaction is started
+  // with
+  // `TransactionOptions::use_only_the_last_commit_time_batch_for_recovery` set
+  // to true. Otherwise, it is possible that two uncommitted versions of the
+  // same key exist in the database due to the current implementation (see the
+  // explanation in WritePreparedTxn::CommitInternal).
+  // During bottommost compaction, RocksDB may
+  // set the sequence numbers of both to zero once becoming committed, causing
+  // output SST file to have two identical internal keys.
+  virtual WriteBatch* GetCommitTimeWriteBatch() = 0;
+
+  virtual void SetLogNumber(uint64_t log) { log_number_ = log; }
+
+  virtual uint64_t GetLogNumber() const { return log_number_; }
+
+  virtual Status SetName(const TransactionName& name) = 0;
+
+  virtual TransactionName GetName() const { return name_; }
+
+  virtual TransactionID GetID() const { return 0; }
+
+  virtual bool IsDeadlockDetect() const { return false; }
+
+  virtual std::vector<TransactionID> GetWaitingTxns(
+      uint32_t* /*column_family_id*/, std::string* /*key*/) const {
+    assert(false);
+    return std::vector<TransactionID>();
+  }
+
+  enum TransactionState {
+    STARTED = 0,
+    AWAITING_PREPARE = 1,
+    PREPARED = 2,
+    AWAITING_COMMIT = 3,
+    COMMITTED = 4,
+    COMMITED = COMMITTED,  // old misspelled name
+    AWAITING_ROLLBACK = 5,
+    ROLLEDBACK = 6,
+    LOCKS_STOLEN = 7,
+  };
+
+  TransactionState GetState() const { return txn_state_; }
+  void SetState(TransactionState state) { txn_state_ = state; }
+
+  // NOTE: Experimental feature
+  // The globally unique id with which the transaction is identified. This id
+  // might or might not be set depending on the implementation. Similarly the
+  // implementation decides the point in lifetime of a transaction at which it
+  // assigns the id. Although currently it is the case, the id is not guaranteed
+  // to remain the same across restarts.
+  uint64_t GetId() { return id_; }
+
+  virtual Status SetReadTimestampForValidation(TxnTimestamp /*ts*/) {
+    return Status::NotSupported("timestamp not supported");
+  }
+
+  virtual Status SetCommitTimestamp(TxnTimestamp /*ts*/) {
+    return Status::NotSupported("timestamp not supported");
+  }
+
+  virtual TxnTimestamp GetCommitTimestamp() const { return kMaxTxnTimestamp; }
+
+ protected:
+  explicit Transaction(const TransactionDB* /*db*/) {}
+  Transaction() : log_number_(0), txn_state_(STARTED) {}
+
+  // the log in which the prepared section for this txn resides
+  // (for two phase commit)
+  uint64_t log_number_;
+  TransactionName name_;
+
+  // Execution status of the transaction.
+  std::atomic<TransactionState> txn_state_;
+
+  uint64_t id_ = 0;
+  virtual void SetId(uint64_t id) {
+    assert(id_ == 0);
+    id_ = id;
+  }
+
+  virtual uint64_t GetLastLogNumber() const { return log_number_; }
+
+ private:
+  friend class PessimisticTransactionDB;
+  friend class WriteUnpreparedTxnDB;
+  friend class TransactionTest_TwoPhaseLogRollingTest_Test;
+  friend class TransactionTest_TwoPhaseLogRollingTest2_Test;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db.h b/src/rocksdb/include/rocksdb/utilities/transaction_db.h
new file mode 100644
index 000000000..741c59574
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction_db.h
@@ -0,0 +1,508 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/utilities/transaction.h"
+
+// Database with Transaction support.
+//
+// See transaction.h and examples/transaction_example.cc
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionDBMutexFactory;
+
+enum TxnDBWritePolicy {
+  WRITE_COMMITTED = 0,  // write only the committed data
+  WRITE_PREPARED,       // write data after the prepare phase of 2pc
+  WRITE_UNPREPARED      // write data before the prepare phase of 2pc
+};
+
+constexpr uint32_t kInitialMaxDeadlocks = 5;
+
+class LockManager;
+struct RangeLockInfo;
+
+// A lock manager handle
+// The workflow is as follows:
+//  * Use a factory method (like NewRangeLockManager()) to create a lock
+//    manager and get its handle.
+//  * A Handle for a particular kind of lock manager will have extra
+//    methods and parameters to control the lock manager
+//  * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It
+//    will be used to perform locking.
+class LockManagerHandle {
+ public:
+  // PessimisticTransactionDB will call this to get the Lock Manager it's going
+  // to use.
+  virtual LockManager* getLockManager() = 0;
+
+  virtual ~LockManagerHandle() {}
+};
+
+// Same as class Endpoint, but use std::string to manage the buffer allocation
+struct EndpointWithString {
+  std::string slice;
+  bool inf_suffix;
+};
+
+struct RangeDeadlockInfo {
+  TransactionID m_txn_id;
+  uint32_t m_cf_id;
+  bool m_exclusive;
+
+  EndpointWithString m_start;
+  EndpointWithString m_end;
+};
+
+struct RangeDeadlockPath {
+  std::vector<RangeDeadlockInfo> path;
+  bool limit_exceeded;
+  int64_t deadlock_time;
+
+  explicit RangeDeadlockPath(std::vector<RangeDeadlockInfo> path_entry,
+                             const int64_t& dl_time)
+      : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
+
+  // empty path, limit exceeded constructor and default constructor
+  explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false)
+      : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
+
+  bool empty() { return path.empty() && !limit_exceeded; }
+};
+
+// A handle to control RangeLockManager (Range-based lock manager) from outside
+// RocksDB
+class RangeLockManagerHandle : public LockManagerHandle {
+ public:
+  // Set total amount of lock memory to use.
+  //
+  //  @return 0 Ok
+  //  @return EDOM Failed to set because currently using more memory than
+  //        specified
+  virtual int SetMaxLockMemory(size_t max_lock_memory) = 0;
+  virtual size_t GetMaxLockMemory() = 0;
+
+  using RangeLockStatus =
+      std::unordered_multimap<ColumnFamilyId, RangeLockInfo>;
+
+  // Lock Escalation barrier check function.
+  // It is called for a couple of endpoints A and B, such that A < B.
+  // If escalation_barrier_check_func(A, B)==true, then there's a lock
+  // escalation barrier between A and B, and lock escalation is not allowed
+  // to bridge the gap between A and B.
+  //
+  // The function may be called from any thread that acquires or releases
+  // locks. It should not throw exceptions. There is currently no way to return
+  // an error.
+  using EscalationBarrierFunc =
+      std::function<bool(const Endpoint& a, const Endpoint& b)>;
+
+  // Set the user-provided barrier check function
+  virtual void SetEscalationBarrierFunc(EscalationBarrierFunc func) = 0;
+
+  virtual RangeLockStatus GetRangeLockStatusData() = 0;
+
+  class Counters {
+   public:
+    // Number of times lock escalation was triggered (for all column families)
+    uint64_t escalation_count;
+
+    // Number of times lock acquisition had to wait for a conflicting lock
+    // to be released. This counts both successful waits (where the desired
+    // lock was acquired) and waits that timed out or got other error.
+    uint64_t lock_wait_count;
+
+    // How much memory is currently used for locks (total for all column
+    // families)
+    uint64_t current_lock_memory;
+  };
+
+  // Get the current counter values
+  virtual Counters GetStatus() = 0;
+
+  // Functions for range-based Deadlock reporting.
+  virtual std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() = 0;
+  virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0;
+
+  virtual ~RangeLockManagerHandle() {}
+};
+
+// A factory function to create a Range Lock Manager. The created object should
+// be:
+//  1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in
+//     range-locking mode
+//  2. Used to control the lock manager when the DB is already open.
+RangeLockManagerHandle* NewRangeLockManager(
+    std::shared_ptr<TransactionDBMutexFactory> mutex_factory);
+
+struct TransactionDBOptions {
+  // Specifies the maximum number of keys that can be locked at the same time
+  // per column family.
+  // If the number of locked keys is greater than max_num_locks, transaction
+  // writes (or GetForUpdate) will return an error.
+  // If this value is not positive, no limit will be enforced.
+  int64_t max_num_locks = -1;
+
+  // Stores the number of latest deadlocks to track
+  uint32_t max_num_deadlocks = kInitialMaxDeadlocks;
+
+  // Increasing this value will increase the concurrency by dividing the lock
+  // table (per column family) into more sub-tables, each with their own
+  // separate mutex.
+  size_t num_stripes = 16;
+
+  // If positive, specifies the default wait timeout in milliseconds when
+  // a transaction attempts to lock a key if not specified by
+  // TransactionOptions::lock_timeout.
+  //
+  // If 0, no waiting is done if a lock cannot instantly be acquired.
+  // If negative, there is no timeout.  Not using a timeout is not recommended
+  // as it can lead to deadlocks.  Currently, there is no deadlock-detection to
+  // recover from a deadlock.
+  int64_t transaction_lock_timeout = 1000;  // 1 second
+
+  // If positive, specifies the wait timeout in milliseconds when writing a key
+  // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
+  // directly).
+  // If 0, no waiting is done if a lock cannot instantly be acquired.
+  // If negative, there is no timeout and will block indefinitely when acquiring
+  // a lock.
+  //
+  // Not using a timeout can lead to deadlocks.  Currently, there
+  // is no deadlock-detection to recover from a deadlock.  While DB writes
+  // cannot deadlock with other DB writes, they can deadlock with a transaction.
+  // A negative timeout should only be used if all transactions have a small
+  // expiration set.
+  int64_t default_lock_timeout = 1000;  // 1 second
+
+  // If set, the TransactionDB will use this implementation of a mutex and
+  // condition variable for all transaction locking instead of the default
+  // mutex/condvar implementation.
+  std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
+
+  // The policy for when to write the data into the DB. The default policy is to
+  // write only the committed data (WRITE_COMMITTED). The data could be written
+  // before the commit phase. The DB then needs to provide the mechanisms to
+  // tell apart committed from uncommitted data.
+  TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+
+  // TODO(myabandeh): remove this option
+  // Note: this is a temporary option as a hot fix in rollback of writeprepared
+  // txns in myrocks. MyRocks uses merge operands for autoinc column id without
+  // however obtaining locks. This breaks the assumption behind the rollback
+  // logic in myrocks. This hack of simply not rolling back merge operands works
+  // for the special way that myrocks uses this operands.
+  bool rollback_merge_operands = false;
+
+  // nullptr means use default lock manager.
+  // Other value means the user provides a custom lock manager.
+  std::shared_ptr<LockManagerHandle> lock_mgr_handle;
+
+  // If true, the TransactionDB implementation might skip concurrency control
+  // unless it is overridden by TransactionOptions or
+  // TransactionDBWriteOptimizations. This can be used in conjunction with
+  // DBOptions::unordered_write when the TransactionDB is used solely for write
+  // ordering rather than concurrency control.
+  bool skip_concurrency_control = false;
+
+  // This option is only valid for write unprepared. If a write batch exceeds
+  // this threshold, then the transaction will implicitly flush the currently
+  // pending writes into the database. A value of 0 or less means no limit.
+  int64_t default_write_batch_flush_threshold = 0;
+
+  // This option is valid only for write-prepared/write-unprepared. Transaction
+  // will rely on this callback to determine if a key should be rolled back
+  // with Delete or SingleDelete when necessary. If the callback returns true,
+  // then SingleDelete should be used. If the callback is not callable or the
+  // callback returns false, then a Delete is used.
+  // The application should ensure thread-safety of this callback.
+  // The callback should not throw because RocksDB is not exception-safe.
+  // The callback may be removed if we allow mixing Delete and SingleDelete in
+  // the future.
+  std::function<bool(TransactionDB* /*db*/,
+                     ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*key*/)>
+      rollback_deletion_type_callback;
+
+ private:
+  // 128 entries
+  // Should the default value change, please also update wp_snapshot_cache_bits
+  // in db_stress_gflags.cc
+  size_t wp_snapshot_cache_bits = static_cast<size_t>(7);
+  // 8m entry, 64MB size
+  // Should the default value change, please also update wp_commit_cache_bits
+  // in db_stress_gflags.cc
+  size_t wp_commit_cache_bits = static_cast<size_t>(23);
+
+  // For testing, whether transaction name should be auto-generated or not. This
+  // is useful for write unprepared which requires named transactions.
+  bool autogenerate_name = false;
+
+  friend class WritePreparedTxnDB;
+  friend class WriteUnpreparedTxn;
+  friend class WritePreparedTransactionTestBase;
+  friend class TransactionTestBase;
+  friend class MySQLStyleTransactionTest;
+  friend class StressTest;
+};
+
+struct TransactionOptions {
+  // Setting set_snapshot=true is the same as calling
+  // Transaction::SetSnapshot().
+  bool set_snapshot = false;
+
+  // Setting to true means that before acquiring locks, this transaction will
+  // check if doing so will cause a deadlock. If so, it will return with
+  // Status::Busy.  The user should retry their transaction.
+  bool deadlock_detect = false;
+
+  // If set, it states that the CommitTimeWriteBatch represents the latest state
+  // of the application, has only one sub-batch, i.e., no duplicate keys,  and
+  // meant to be used later during recovery. It enables an optimization to
+  // postpone updating the memtable with CommitTimeWriteBatch to only
+  // SwitchMemtable or recovery.
+  // This option does not affect write-committed. Only
+  // write-prepared/write-unprepared transactions will be affected.
+  bool use_only_the_last_commit_time_batch_for_recovery = false;
+
+  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
+  // two non-equal keys to be equivalent.  Ie, cmp->Compare(a,b) should only
+  // return 0 if
+  // a.compare(b) returns 0.
+
+  // If positive, specifies the wait timeout in milliseconds when
+  // a transaction attempts to lock a key.
+  //
+  // If 0, no waiting is done if a lock cannot instantly be acquired.
+  // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
+  int64_t lock_timeout = -1;
+
+  // Expiration duration in milliseconds.  If non-negative, transactions that
+  // last longer than this many milliseconds will fail to commit.  If not set,
+  // a forgotten transaction that is never committed, rolled back, or deleted
+  // will never relinquish any locks it holds.  This could prevent keys from
+  // being written by other writers.
+  int64_t expiration = -1;
+
+  // The number of traversals to make during deadlock detection.
+  int64_t deadlock_detect_depth = 50;
+
+  // The maximum number of bytes used for the write batch. 0 means no limit.
+  size_t max_write_batch_size = 0;
+
+  // Skip Concurrency Control. This could be as an optimization if the
+  // application knows that the transaction would not have any conflict with
+  // concurrent transactions. It could also be used during recovery if (i)
+  // application guarantees no conflict between prepared transactions in the WAL
+  // (ii) application guarantees that recovered transactions will be rolled
+  // back/commit before new transactions start.
+  // Default: false
+  bool skip_concurrency_control = false;
+
+  // In pessimistic transaction, if this is true, then you can skip Prepare
+  // before Commit, otherwise, you must Prepare before Commit.
+  bool skip_prepare = true;
+
+  // See TransactionDBOptions::default_write_batch_flush_threshold for
+  // description. If a negative value is specified, then the default value from
+  // TransactionDBOptions is used.
+  int64_t write_batch_flush_threshold = -1;
+};
+
+// The per-write optimizations that do not involve transactions. TransactionDB
+// implementation might or might not make use of the specified optimizations.
+struct TransactionDBWriteOptimizations {
+  // If it is true it means that the application guarantees that the
+  // key-set in the write batch do not conflict with any concurrent transaction
+  // and hence the concurrency control mechanism could be skipped for this
+  // write.
+  bool skip_concurrency_control = false;
+  // If true, the application guarantees that there is no duplicate <column
+  // family, key> in the write batch and any employed mechanism to handle
+  // duplicate keys could be skipped.
+  bool skip_duplicate_key_check = false;
+};
+
+struct KeyLockInfo {
+  std::string key;
+  std::vector<TransactionID> ids;
+  bool exclusive;
+};
+
+struct RangeLockInfo {
+  EndpointWithString start;
+  EndpointWithString end;
+  std::vector<TransactionID> ids;
+  bool exclusive;
+};
+
+struct DeadlockInfo {
+  TransactionID m_txn_id;
+  uint32_t m_cf_id;
+  bool m_exclusive;
+  std::string m_waiting_key;
+};
+
+struct DeadlockPath {
+  std::vector<DeadlockInfo> path;
+  bool limit_exceeded;
+  int64_t deadlock_time;
+
+  explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
+                        const int64_t& dl_time)
+      : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
+
+  // empty path, limit exceeded constructor and default constructor
+  explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false)
+      : path(0), limit_exceeded(limit), deadlock_time(dl_time) {}
+
+  bool empty() { return path.empty() && !limit_exceeded; }
+};
+
+class TransactionDB : public StackableDB {
+ public:
+  // Optimized version of ::Write that receives more optimization request such
+  // as skip_concurrency_control.
+  using StackableDB::Write;
+  virtual Status Write(const WriteOptions& opts,
+                       const TransactionDBWriteOptimizations&,
+                       WriteBatch* updates) {
+    // The default implementation ignores TransactionDBWriteOptimizations and
+    // falls back to the un-optimized version of ::Write
+    return Write(opts, updates);
+  }
+  // Transactional `DeleteRange()` is not yet supported.
+  // However, users who know their deleted range does not conflict with
+  // anything can still use it via the `Write()` API. In all cases, the
+  // `Write()` overload specifying `TransactionDBWriteOptimizations` must be
+  // used and `skip_concurrency_control` must be set. When using either
+  // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must
+  // additionally be set.
+  using StackableDB::DeleteRange;
+  virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*,
+                             const Slice&, const Slice&) override {
+    return Status::NotSupported();
+  }
+  // Open a TransactionDB similar to DB::Open().
+  // Internally call PrepareWrap() and WrapDB()
+  // If the return status is not ok, then dbptr is set to nullptr.
+  static Status Open(const Options& options,
+                     const TransactionDBOptions& txn_db_options,
+                     const std::string& dbname, TransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options,
+                     const TransactionDBOptions& txn_db_options,
+                     const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     TransactionDB** dbptr);
+  // Note: PrepareWrap() may change parameters, make copies before the
+  // invocation if needed.
+  static void PrepareWrap(DBOptions* db_options,
+                          std::vector<ColumnFamilyDescriptor>* column_families,
+                          std::vector<size_t>* compaction_enabled_cf_indices);
+  // If the return status is not ok, then dbptr will bet set to nullptr. The
+  // input db parameter might or might not be deleted as a result of the
+  // failure. If it is properly deleted it will be set to nullptr. If the return
+  // status is ok, the ownership of db is transferred to dbptr.
+  static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options,
+                       const std::vector<size_t>& compaction_enabled_cf_indices,
+                       const std::vector<ColumnFamilyHandle*>& handles,
+                       TransactionDB** dbptr);
+  // If the return status is not ok, then dbptr will bet set to nullptr. The
+  // input db parameter might or might not be deleted as a result of the
+  // failure. If it is properly deleted it will be set to nullptr. If the return
+  // status is ok, the ownership of db is transferred to dbptr.
+  static Status WrapStackableDB(
+      StackableDB* db, const TransactionDBOptions& txn_db_options,
+      const std::vector<size_t>& compaction_enabled_cf_indices,
+      const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr);
+  // Since the destructor in StackableDB is virtual, this destructor is virtual
+  // too. The root db will be deleted by the base's destructor.
+  ~TransactionDB() override {}
+
+  // Starts a new Transaction.
+  //
+  // Caller is responsible for deleting the returned transaction when no
+  // longer needed.
+  //
+  // If old_txn is not null, BeginTransaction will reuse this Transaction
+  // handle instead of allocating a new one.  This is an optimization to avoid
+  // extra allocations when repeatedly creating transactions.
+  virtual Transaction* BeginTransaction(
+      const WriteOptions& write_options,
+      const TransactionOptions& txn_options = TransactionOptions(),
+      Transaction* old_txn = nullptr) = 0;
+
+  virtual Transaction* GetTransactionByName(const TransactionName& name) = 0;
+  virtual void GetAllPreparedTransactions(std::vector<Transaction*>* trans) = 0;
+
+  // Returns set of all locks held.
+  //
+  // The mapping is column family id -> KeyLockInfo
+  virtual std::unordered_multimap<uint32_t, KeyLockInfo>
+  GetLockStatusData() = 0;
+
+  virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
+  virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0;
+
+  // Create a snapshot and assign ts to it. Return the snapshot to caller. The
+  // snapshot-timestamp mapping is also tracked by the database.
+  // Caller must ensure there are no active writes when this API is called.
+  virtual std::pair<Status, std::shared_ptr<const Snapshot>>
+  CreateTimestampedSnapshot(TxnTimestamp ts) = 0;
+
+  // Return the latest timestamped snapshot if present.
+  std::shared_ptr<const Snapshot> GetLatestTimestampedSnapshot() const {
+    return GetTimestampedSnapshot(kMaxTxnTimestamp);
+  }
+  // Return the snapshot correponding to given timestamp. If ts is
+  // kMaxTxnTimestamp, then we return the latest timestamped snapshot if
+  // present. Othersise, we return the snapshot whose timestamp is equal to
+  // `ts`. If no such snapshot exists, then we return null.
+  virtual std::shared_ptr<const Snapshot> GetTimestampedSnapshot(
+      TxnTimestamp ts) const = 0;
+  // Release timestamped snapshots whose timestamps are less than or equal to
+  // ts.
+  virtual void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) = 0;
+
+  // Get all timestamped snapshots which will be stored in
+  // timestamped_snapshots.
+  Status GetAllTimestampedSnapshots(
+      std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
+      const {
+    return GetTimestampedSnapshots(/*ts_lb=*/0, /*ts_ub=*/kMaxTxnTimestamp,
+                                   timestamped_snapshots);
+  }
+
+  // Get all timestamped snapshots whose timestamps fall within [ts_lb, ts_ub).
+  // timestamped_snapshots will be cleared and contain returned snapshots.
+  virtual Status GetTimestampedSnapshots(
+      TxnTimestamp ts_lb, TxnTimestamp ts_ub,
+      std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots)
+      const = 0;
+
+ protected:
+  // To Create an TransactionDB, call Open()
+  // The ownership of db is transferred to the base StackableDB
+  explicit TransactionDB(DB* db) : StackableDB(db) {}
+  // No copying allowed
+  TransactionDB(const TransactionDB&) = delete;
+  void operator=(const TransactionDB&) = delete;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
new file mode 100644
index 000000000..e352f325a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
@@ -0,0 +1,91 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TransactionDBMutex and TransactionDBCondVar APIs allows applications to
+// implement custom mutexes and condition variables to be used by a
+// TransactionDB when locking keys.
+//
+// To open a TransactionDB with a custom TransactionDBMutexFactory, set
+// TransactionDBOptions.custom_mutex_factory.
+class TransactionDBMutex {
+ public:
+  virtual ~TransactionDBMutex() {}
+
+  // Attempt to acquire lock.  Return OK on success, or other Status on failure.
+  // If returned status is OK, TransactionDB will eventually call UnLock().
+  virtual Status Lock() = 0;
+
+  // Attempt to acquire lock.  If timeout is non-negative, operation may be
+  // failed after this many microseconds.
+  // Returns OK on success,
+  //         TimedOut if timed out,
+  //         or other Status on failure.
+  // If returned status is OK, TransactionDB will eventually call UnLock().
+  virtual Status TryLockFor(int64_t timeout_time) = 0;
+
+  // Unlock Mutex that was successfully locked by Lock() or TryLockUntil()
+  virtual void UnLock() = 0;
+};
+
+class TransactionDBCondVar {
+ public:
+  virtual ~TransactionDBCondVar() {}
+
+  // Block current thread until condition variable is notified by a call to
+  // Notify() or NotifyAll().  Wait() will be called with mutex locked.
+  // Returns OK if notified.
+  // Returns non-OK if TransactionDB should stop waiting and fail the operation.
+  // May return OK spuriously even if not notified.
+  virtual Status Wait(std::shared_ptr<TransactionDBMutex> mutex) = 0;
+
+  // Block current thread until condition variable is notified by a call to
+  // Notify() or NotifyAll(), or if the timeout is reached.
+  // Wait() will be called with mutex locked.
+  //
+  // If timeout is non-negative, operation should be failed after this many
+  // microseconds.
+  // If implementing a custom version of this class, the implementation may
+  // choose to ignore the timeout.
+  //
+  // Returns OK if notified.
+  // Returns TimedOut if timeout is reached.
+  // Returns other status if TransactionDB should otherwise stop waiting and
+  //  fail the operation.
+  // May return OK spuriously even if not notified.
+  virtual Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+                         int64_t timeout_time) = 0;
+
+  // If any threads are waiting on *this, unblock at least one of the
+  // waiting threads.
+  virtual void Notify() = 0;
+
+  // Unblocks all threads waiting on *this.
+  virtual void NotifyAll() = 0;
+};
+
+// Factory class that can allocate mutexes and condition variables.
+class TransactionDBMutexFactory {
+ public:
+  // Create a TransactionDBMutex object.
+  virtual std::shared_ptr<TransactionDBMutex> AllocateMutex() = 0;
+
+  // Create a TransactionDBCondVar object.
+  virtual std::shared_ptr<TransactionDBCondVar> AllocateCondVar() = 0;
+
+  virtual ~TransactionDBMutexFactory() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
new file mode 100644
index 000000000..84dc11a31
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
@@ -0,0 +1,309 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+class Comparator;
+class DB;
+class ReadCallback;
+struct ReadOptions;
+struct DBOptions;
+
+enum WriteType {
+  kPutRecord,
+  kMergeRecord,
+  kDeleteRecord,
+  kSingleDeleteRecord,
+  kDeleteRangeRecord,
+  kLogDataRecord,
+  kXIDRecord,
+  kUnknownRecord,
+};
+
+// an entry for Put, Merge, Delete, or SingleDelete entry for write batches.
+// Used in WBWIIterator.
+struct WriteEntry {
+  WriteType type = kUnknownRecord;
+  Slice key;
+  Slice value;
+};
+
+// Iterator of one column family out of a WriteBatchWithIndex.
+class WBWIIterator {
+ public:
+  virtual ~WBWIIterator() {}
+
+  virtual bool Valid() const = 0;
+
+  virtual void SeekToFirst() = 0;
+
+  virtual void SeekToLast() = 0;
+
+  virtual void Seek(const Slice& key) = 0;
+
+  virtual void SeekForPrev(const Slice& key) = 0;
+
+  virtual void Next() = 0;
+
+  virtual void Prev() = 0;
+
+  // the return WriteEntry is only valid until the next mutation of
+  // WriteBatchWithIndex
+  virtual WriteEntry Entry() const = 0;
+
+  virtual Status status() const = 0;
+};
+
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+// In Put(), Merge() Delete(), or SingleDelete(), the same function of the
+// wrapped will be called. At the same time, indexes will be built.
+// By calling GetWriteBatch(), a user will get the WriteBatch for the data
+// they inserted, which can be used for DB::Write().
+// A user can call NewIterator() to create an iterator.
+class WriteBatchWithIndex : public WriteBatchBase {
+ public:
+  // backup_index_comparator: the backup comparator used to compare keys
+  // within the same column family, if column family is not given in the
+  // interface, or we can't find a column family from the column family handle
+  // passed in, backup_index_comparator will be used for the column family.
+  // reserved_bytes: reserved bytes in underlying WriteBatch
+  // max_bytes: maximum size of underlying WriteBatch in bytes
+  // overwrite_key: if true, overwrite the key in the index when inserting
+  //                the same key as previously, so iterator will never
+  //                show two entries with the same key.
+  explicit WriteBatchWithIndex(
+      const Comparator* backup_index_comparator = BytewiseComparator(),
+      size_t reserved_bytes = 0, bool overwrite_key = false,
+      size_t max_bytes = 0, size_t protection_bytes_per_key = 0);
+
+  ~WriteBatchWithIndex() override;
+  WriteBatchWithIndex(WriteBatchWithIndex&&);
+  WriteBatchWithIndex& operator=(WriteBatchWithIndex&&);
+
+  using WriteBatchBase::Put;
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value) override;
+
+  Status Put(const Slice& key, const Slice& value) override;
+
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& ts, const Slice& value) override;
+
+  Status PutEntity(ColumnFamilyHandle* column_family, const Slice& /* key */,
+                   const WideColumns& /* columns */) override {
+    if (!column_family) {
+      return Status::InvalidArgument(
+          "Cannot call this method without a column family handle");
+    }
+
+    return Status::NotSupported(
+        "PutEntity not supported by WriteBatchWithIndex");
+  }
+
+  using WriteBatchBase::Merge;
+  Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+               const Slice& value) override;
+
+  Status Merge(const Slice& key, const Slice& value) override;
+  Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+               const Slice& /*ts*/, const Slice& /*value*/) override {
+    return Status::NotSupported(
+        "Merge does not support user-defined timestamp");
+  }
+
+  using WriteBatchBase::Delete;
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+  Status Delete(const Slice& key) override;
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                const Slice& ts) override;
+
+  using WriteBatchBase::SingleDelete;
+  Status SingleDelete(ColumnFamilyHandle* column_family,
+                      const Slice& key) override;
+  Status SingleDelete(const Slice& key) override;
+  Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& ts) override;
+
+  using WriteBatchBase::DeleteRange;
+  Status DeleteRange(ColumnFamilyHandle* /* column_family */,
+                     const Slice& /* begin_key */,
+                     const Slice& /* end_key */) override {
+    return Status::NotSupported(
+        "DeleteRange unsupported in WriteBatchWithIndex");
+  }
+  Status DeleteRange(const Slice& /* begin_key */,
+                     const Slice& /* end_key */) override {
+    return Status::NotSupported(
+        "DeleteRange unsupported in WriteBatchWithIndex");
+  }
+  Status DeleteRange(ColumnFamilyHandle* /*column_family*/,
+                     const Slice& /*begin_key*/, const Slice& /*end_key*/,
+                     const Slice& /*ts*/) override {
+    return Status::NotSupported(
+        "DeleteRange unsupported in WriteBatchWithIndex");
+  }
+
+  using WriteBatchBase::PutLogData;
+  Status PutLogData(const Slice& blob) override;
+
+  using WriteBatchBase::Clear;
+  void Clear() override;
+
+  using WriteBatchBase::GetWriteBatch;
+  WriteBatch* GetWriteBatch() override;
+
+  // Create an iterator of a column family. User can call iterator.Seek() to
+  // search to the next entry of or after a key. Keys will be iterated in the
+  // order given by index_comparator. For multiple updates on the same key,
+  // each update will be returned as a separate entry, in the order of update
+  // time.
+  //
+  // The returned iterator should be deleted by the caller.
+  WBWIIterator* NewIterator(ColumnFamilyHandle* column_family);
+  // Create an iterator of the default column family.
+  WBWIIterator* NewIterator();
+
+  // Will create a new Iterator that will use WBWIIterator as a delta and
+  // base_iterator as base.
+  //
+  // This function is only supported if the WriteBatchWithIndex was
+  // constructed with overwrite_key=true.
+  //
+  // The returned iterator should be deleted by the caller.
+  // The base_iterator is now 'owned' by the returned iterator. Deleting the
+  // returned iterator will also delete the base_iterator.
+  //
+  // Updating write batch with the current key of the iterator is not safe.
+  // We strongly recommend users not to do it. It will invalidate the current
+  // key() and value() of the iterator. This invalidation happens even before
+  // the write batch update finishes. The state may recover after Next() is
+  // called.
+  Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family,
+                                Iterator* base_iterator,
+                                const ReadOptions* opts = nullptr);
+  // default column family
+  Iterator* NewIteratorWithBase(Iterator* base_iterator);
+
+  // Similar to DB::Get() but will only read the key from this batch.
+  // If the batch does not have enough data to resolve Merge operations,
+  // MergeInProgress status may be returned.
+  Status GetFromBatch(ColumnFamilyHandle* column_family,
+                      const DBOptions& options, const Slice& key,
+                      std::string* value);
+
+  // Similar to previous function but does not require a column_family.
+  // Note:  An InvalidArgument status will be returned if there are any Merge
+  // operators for this key.  Use previous method instead.
+  Status GetFromBatch(const DBOptions& options, const Slice& key,
+                      std::string* value) {
+    return GetFromBatch(nullptr, options, key, value);
+  }
+
+  // Similar to DB::Get() but will also read writes from this batch.
+  //
+  // This function will query both this batch and the DB and then merge
+  // the results using the DB's merge operator (if the batch contains any
+  // merge requests).
+  //
+  // Setting read_options.snapshot will affect what is read from the DB
+  // but will NOT change which keys are read from the batch (the keys in
+  // this batch do not yet belong to any snapshot and will be fetched
+  // regardless).
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           const Slice& key, std::string* value);
+
+  // An overload of the above method that receives a PinnableSlice
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           const Slice& key, PinnableSlice* value);
+
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value);
+
+  // An overload of the above method that receives a PinnableSlice
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* value);
+
+  void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                              ColumnFamilyHandle* column_family,
+                              const size_t num_keys, const Slice* keys,
+                              PinnableSlice* values, Status* statuses,
+                              bool sorted_input);
+
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  void SetSavePoint() override;
+
+  // Remove all entries in this batch (Put, Merge, Delete, SingleDelete,
+  // PutLogData) since the most recent call to SetSavePoint() and removes the
+  // most recent save point.
+  // If there is no previous call to SetSavePoint(), behaves the same as
+  // Clear().
+  //
+  // Calling RollbackToSavePoint invalidates any open iterators on this batch.
+  //
+  // Returns Status::OK() on success,
+  //         Status::NotFound() if no previous call to SetSavePoint(),
+  //         or other Status on corruption.
+  Status RollbackToSavePoint() override;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  Status PopSavePoint() override;
+
+  void SetMaxBytes(size_t max_bytes) override;
+  size_t GetDataSize() const;
+
+ private:
+  friend class PessimisticTransactionDB;
+  friend class WritePreparedTxn;
+  friend class WriteUnpreparedTxn;
+  friend class WriteBatchWithIndex_SubBatchCnt_Test;
+  friend class WriteBatchWithIndexInternal;
+  // Returns the number of sub-batches inside the write batch. A sub-batch
+  // starts right before inserting a key that is a duplicate of a key in the
+  // last sub-batch.
+  size_t SubBatchCnt();
+
+  Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* value, ReadCallback* callback);
+  void MultiGetFromBatchAndDB(DB* db, const ReadOptions& read_options,
+                              ColumnFamilyHandle* column_family,
+                              const size_t num_keys, const Slice* keys,
+                              PinnableSlice* values, Status* statuses,
+                              bool sorted_input, ReadCallback* callback);
+  struct Rep;
+  std::unique_ptr<Rep> rep;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/version.h b/src/rocksdb/include/rocksdb/version.h
new file mode 100644
index 000000000..c54f3a2c3
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/version.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+// NOTE: in 'main' development branch, this should be the *next*
+// minor or major version number planned for release.
+#define ROCKSDB_MAJOR 7
+#define ROCKSDB_MINOR 9
+#define ROCKSDB_PATCH 2
+
+// Do not use these. We made the mistake of declaring macros starting with
+// double underscore. Now we have to live with our choice. We'll deprecate these
+// at some point
+#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
+#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
+#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
+
+namespace ROCKSDB_NAMESPACE {
+// Returns a set of properties indicating how/when/where this version of RocksDB
+// was created.
+const std::unordered_map<std::string, std::string>& GetRocksBuildProperties();
+
+// Returns the current version of RocksDB as a string (e.g. "6.16.0").
+// If with_patch is true, the patch is included (6.16.x).
+// Otherwise, only major and minor version is included (6.16)
+std::string GetRocksVersionAsString(bool with_patch = true);
+
+// Gets the set of build properties (@see GetRocksBuildProperties) into a
+// string. Properties are returned one-per-line, with the first line being:
+// "<program> from RocksDB <version>.
+// If verbose is true, the full set of properties is
+// printed. If verbose is false, only the version information (@see
+// GetRocksVersionString) is printed.
+std::string GetRocksBuildInfoAsString(const std::string& program,
+                                      bool verbose = false);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/wal_filter.h b/src/rocksdb/include/rocksdb/wal_filter.h
new file mode 100644
index 000000000..3e66c39e4
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/wal_filter.h
@@ -0,0 +1,111 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteBatch;
+struct ConfigOptions;
+
+// WALFilter allows an application to inspect write-ahead-log (WAL)
+// records or modify their processing on recovery.
+// Please see the details below.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class WalFilter : public Customizable {
+ public:
+  static const char* Type() { return "WalFilter"; }
+  static Status CreateFromString(const ConfigOptions& options,
+                                 const std::string& value, WalFilter** result);
+  enum class WalProcessingOption {
+    // Continue processing as usual
+    kContinueProcessing = 0,
+    // Ignore the current record but continue processing of log(s)
+    kIgnoreCurrentRecord = 1,
+    // Stop replay of logs and discard logs
+    // Logs won't be replayed on subsequent recovery
+    kStopReplay = 2,
+    // Corrupted record detected by filter
+    kCorruptedRecord = 3,
+    // Marker for enum count
+    kWalProcessingOptionMax = 4
+  };
+
+  virtual ~WalFilter() {}
+
+  // Provide ColumnFamily->LogNumber map to filter
+  // so that filter can determine whether a log number applies to a given
+  // column family (i.e. that log hasn't been flushed to SST already for the
+  // column family).
+  // We also pass in name->id map as only name is known during
+  // recovery (as handles are opened post-recovery).
+  // while write batch callbacks happen in terms of column family id.
+  //
+  // @params cf_lognumber_map column_family_id to lognumber map
+  // @params cf_name_id_map   column_family_name to column_family_id map
+
+  virtual void ColumnFamilyLogNumberMap(
+      const std::map<uint32_t, uint64_t>& /*cf_lognumber_map*/,
+      const std::map<std::string, uint32_t>& /*cf_name_id_map*/) {}
+
+  // LogRecord is invoked for each log record encountered for all the logs
+  // during replay on logs on recovery. This method can be used to:
+  //  * inspect the record (using the batch parameter)
+  //  * ignoring current record
+  //    (by returning WalProcessingOption::kIgnoreCurrentRecord)
+  //  * reporting corrupted record
+  //    (by returning WalProcessingOption::kCorruptedRecord)
+  //  * stop log replay
+  //    (by returning kStop replay) - please note that this implies
+  //    discarding the logs from current record onwards.
+  //
+  // @params log_number     log_number of the current log.
+  //                        Filter might use this to determine if the log
+  //                        record is applicable to a certain column family.
+  // @params log_file_name  log file name - only for informational purposes
+  // @params batch          batch encountered in the log during recovery
+  // @params new_batch      new_batch to populate if filter wants to change
+  //                        the batch (for example to filter some records out,
+  //                        or alter some records).
+  //                        Please note that the new batch MUST NOT contain
+  //                        more records than original, else recovery would
+  //                        be failed.
+  // @params batch_changed  Whether batch was changed by the filter.
+  //                        It must be set to true if new_batch was populated,
+  //                        else new_batch has no effect.
+  // @returns               Processing option for the current record.
+  //                        Please see WalProcessingOption enum above for
+  //                        details.
+  virtual WalProcessingOption LogRecordFound(
+      unsigned long long /*log_number*/, const std::string& /*log_file_name*/,
+      const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) {
+    // Default implementation falls back to older function for compatibility
+    return LogRecord(batch, new_batch, batch_changed);
+  }
+
+  // Please see the comments for LogRecord above. This function is for
+  // compatibility only and contains a subset of parameters.
+  // New code should use the function above.
+  virtual WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+                                        WriteBatch* /*new_batch*/,
+                                        bool* /*batch_changed*/) const {
+    return WalProcessingOption::kContinueProcessing;
+  }
+
+  // Returns a name that identifies this WAL filter.
+  // The name will be printed to LOG file on start up for diagnosis.
+  virtual const char* Name() const override = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/wide_columns.h b/src/rocksdb/include/rocksdb/wide_columns.h
new file mode 100644
index 000000000..7ddc61f03
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/wide_columns.h
@@ -0,0 +1,171 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <ostream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Class representing a wide column, which is defined as a pair of column name
+// and column value.
+class WideColumn {
+ public:
+  WideColumn() = default;
+
+  // Initializes a WideColumn object by forwarding the name and value
+  // arguments to the corresponding member Slices. This makes it possible to
+  // construct a WideColumn using combinations of const char*, const
+  // std::string&, const Slice& etc., for example:
+  //
+  // constexpr char foo[] = "foo";
+  // const std::string bar("bar");
+  // WideColumn column(foo, bar);
+  template <typename N, typename V>
+  WideColumn(N&& name, V&& value)
+      : name_(std::forward<N>(name)), value_(std::forward<V>(value)) {}
+
+  // Initializes a WideColumn object by forwarding the elements of
+  // name_tuple and value_tuple to the constructors of the corresponding member
+  // Slices. This makes it possible to initialize the Slices using the Slice
+  // constructors that take more than one argument, for example:
+  //
+  // constexpr char foo_name[] = "foo_name";
+  // constexpr char bar_value[] = "bar_value";
+  // WideColumn column(std::piecewise_construct,
+  //                   std::forward_as_tuple(foo_name, 3),
+  //                   std::forward_as_tuple(bar_value, 3));
+  template <typename NTuple, typename VTuple>
+  WideColumn(std::piecewise_construct_t, NTuple&& name_tuple,
+             VTuple&& value_tuple)
+      : name_(std::make_from_tuple<Slice>(std::forward<NTuple>(name_tuple))),
+        value_(std::make_from_tuple<Slice>(std::forward<VTuple>(value_tuple))) {
+  }
+
+  const Slice& name() const { return name_; }
+  const Slice& value() const { return value_; }
+
+  Slice& name() { return name_; }
+  Slice& value() { return value_; }
+
+ private:
+  Slice name_;
+  Slice value_;
+};
+
+// Note: column names and values are compared bytewise.
+inline bool operator==(const WideColumn& lhs, const WideColumn& rhs) {
+  return lhs.name() == rhs.name() && lhs.value() == rhs.value();
+}
+
+inline bool operator!=(const WideColumn& lhs, const WideColumn& rhs) {
+  return !(lhs == rhs);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const WideColumn& column) {
+  const bool hex =
+      (os.flags() & std::ios_base::basefield) == std::ios_base::hex;
+  os << column.name().ToString(hex) << ':' << column.value().ToString(hex);
+
+  return os;
+}
+
+// A collection of wide columns.
+using WideColumns = std::vector<WideColumn>;
+
+// The anonymous default wide column (an empty Slice).
+extern const Slice kDefaultWideColumnName;
+
+// An empty set of wide columns.
+extern const WideColumns kNoWideColumns;
+
+// A self-contained collection of wide columns. Used for the results of
+// wide-column queries.
+class PinnableWideColumns {
+ public:
+  const WideColumns& columns() const { return columns_; }
+  size_t serialized_size() const { return value_.size(); }
+
+  void SetPlainValue(const Slice& value);
+  void SetPlainValue(const Slice& value, Cleanable* cleanable);
+
+  Status SetWideColumnValue(const Slice& value);
+  Status SetWideColumnValue(const Slice& value, Cleanable* cleanable);
+
+  void Reset();
+
+ private:
+  void CopyValue(const Slice& value);
+  void PinOrCopyValue(const Slice& value, Cleanable* cleanable);
+  void CreateIndexForPlainValue();
+  Status CreateIndexForWideColumns();
+
+  PinnableSlice value_;
+  WideColumns columns_;
+};
+
+inline void PinnableWideColumns::CopyValue(const Slice& value) {
+  value_.PinSelf(value);
+}
+
+inline void PinnableWideColumns::PinOrCopyValue(const Slice& value,
+                                                Cleanable* cleanable) {
+  if (!cleanable) {
+    CopyValue(value);
+    return;
+  }
+
+  value_.PinSlice(value, cleanable);
+}
+
+inline void PinnableWideColumns::CreateIndexForPlainValue() {
+  columns_ = WideColumns{{kDefaultWideColumnName, value_}};
+}
+
+inline void PinnableWideColumns::SetPlainValue(const Slice& value) {
+  CopyValue(value);
+  CreateIndexForPlainValue();
+}
+
+inline void PinnableWideColumns::SetPlainValue(const Slice& value,
+                                               Cleanable* cleanable) {
+  PinOrCopyValue(value, cleanable);
+  CreateIndexForPlainValue();
+}
+
+inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value) {
+  CopyValue(value);
+  return CreateIndexForWideColumns();
+}
+
+inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value,
+                                                      Cleanable* cleanable) {
+  PinOrCopyValue(value, cleanable);
+  return CreateIndexForWideColumns();
+}
+
+inline void PinnableWideColumns::Reset() {
+  value_.Reset();
+  columns_.clear();
+}
+
+inline bool operator==(const PinnableWideColumns& lhs,
+                       const PinnableWideColumns& rhs) {
+  return lhs.columns() == rhs.columns();
+}
+
+inline bool operator!=(const PinnableWideColumns& lhs,
+                       const PinnableWideColumns& rhs) {
+  return !(lhs == rhs);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_batch.h b/src/rocksdb/include/rocksdb/write_batch.h
new file mode 100644
index 000000000..61ba5a739
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_batch.h
@@ -0,0 +1,494 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch.  For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+//    batch.Put("key", "v1");
+//    batch.Delete("key");
+//    batch.Put("key", "v2");
+//    batch.Put("key", "v3");
+//
+// Multiple threads can invoke const methods on a WriteBatch without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same WriteBatch must use
+// external synchronization.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class ColumnFamilyHandle;
+struct SavePoints;
+struct SliceParts;
+
+struct SavePoint {
+  size_t size;  // size of rep_
+  int count;    // count of elements in rep_
+  uint32_t content_flags;
+
+  SavePoint() : size(0), count(0), content_flags(0) {}
+
+  SavePoint(size_t _size, int _count, uint32_t _flags)
+      : size(_size), count(_count), content_flags(_flags) {}
+
+  void clear() {
+    size = 0;
+    count = 0;
+    content_flags = 0;
+  }
+
+  bool is_cleared() const { return (size | count | content_flags) == 0; }
+};
+
+class WriteBatch : public WriteBatchBase {
+ public:
+  explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0)
+      : WriteBatch(reserved_bytes, max_bytes, 0, 0) {}
+
+  // `protection_bytes_per_key` is the number of bytes used to store
+  // protection information for each key entry. Currently supported values are
+  // zero (disabled) and eight.
+  explicit WriteBatch(size_t reserved_bytes, size_t max_bytes,
+                      size_t protection_bytes_per_key, size_t default_cf_ts_sz);
+  ~WriteBatch() override;
+
+  using WriteBatchBase::Put;
+  // Store the mapping "key->value" in the database.
+  // The following Put(..., const Slice& key, ...) API can also be used when
+  // user-defined timestamp is enabled as long as `key` points to a contiguous
+  // buffer with timestamp appended after user key. The caller is responsible
+  // for setting up the memory buffer pointed to by `key`.
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value) override;
+  Status Put(const Slice& key, const Slice& value) override {
+    return Put(nullptr, key, value);
+  }
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& ts, const Slice& value) override;
+
+  // Variant of Put() that gathers output like writev(2).  The key and value
+  // that will be written to the database are concatenations of arrays of
+  // slices.
+  // The following Put(..., const SliceParts& key, ...) API can be used when
+  // user-defined timestamp is enabled as long as the timestamp is the last
+  // Slice in `key`, a SliceParts (array of Slices). The caller is responsible
+  // for setting up the `key` SliceParts object.
+  Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+             const SliceParts& value) override;
+  Status Put(const SliceParts& key, const SliceParts& value) override {
+    return Put(nullptr, key, value);
+  }
+
+  // Store the mapping "key->{column1:value1, column2:value2, ...}" in the
+  // column family specified by "column_family".
+  using WriteBatchBase::PutEntity;
+  Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key,
+                   const WideColumns& columns) override;
+
+  using WriteBatchBase::Delete;
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  // The following Delete(..., const Slice& key) can be used when user-defined
+  // timestamp is enabled as long as `key` points to a contiguous buffer with
+  // timestamp appended after user key. The caller is responsible for setting
+  // up the memory buffer pointed to by `key`.
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+  Status Delete(const Slice& key) override { return Delete(nullptr, key); }
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                const Slice& ts) override;
+
+  // variant that takes SliceParts
+  // These two variants of Delete(..., const SliceParts& key) can be used when
+  // user-defined timestamp is enabled as long as the timestamp is the last
+  // Slice in `key`, a SliceParts (array of Slices). The caller is responsible
+  // for setting up the `key` SliceParts object.
+  Status Delete(ColumnFamilyHandle* column_family,
+                const SliceParts& key) override;
+  Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
+
+  using WriteBatchBase::SingleDelete;
+  // WriteBatch implementation of DB::SingleDelete().  See db.h.
+  Status SingleDelete(ColumnFamilyHandle* column_family,
+                      const Slice& key) override;
+  Status SingleDelete(const Slice& key) override {
+    return SingleDelete(nullptr, key);
+  }
+  Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& ts) override;
+
+  // variant that takes SliceParts
+  Status SingleDelete(ColumnFamilyHandle* column_family,
+                      const SliceParts& key) override;
+  Status SingleDelete(const SliceParts& key) override {
+    return SingleDelete(nullptr, key);
+  }
+
+  using WriteBatchBase::DeleteRange;
+  // WriteBatch implementation of DB::DeleteRange().  See db.h.
+  Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key,
+                     const Slice& end_key) override;
+  Status DeleteRange(const Slice& begin_key, const Slice& end_key) override {
+    return DeleteRange(nullptr, begin_key, end_key);
+  }
+  // begin_key and end_key should be user keys without timestamp.
+  Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key,
+                     const Slice& end_key, const Slice& ts) override;
+
+  // variant that takes SliceParts
+  Status DeleteRange(ColumnFamilyHandle* column_family,
+                     const SliceParts& begin_key,
+                     const SliceParts& end_key) override;
+  Status DeleteRange(const SliceParts& begin_key,
+                     const SliceParts& end_key) override {
+    return DeleteRange(nullptr, begin_key, end_key);
+  }
+
+  using WriteBatchBase::Merge;
+  // Merge "value" with the existing value of "key" in the database.
+  // "key->merge(existing, value)"
+  Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+               const Slice& value) override;
+  Status Merge(const Slice& key, const Slice& value) override {
+    return Merge(nullptr, key, value);
+  }
+  Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+               const Slice& /*ts*/, const Slice& /*value*/) override;
+
+  // variant that takes SliceParts
+  Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
+               const SliceParts& value) override;
+  Status Merge(const SliceParts& key, const SliceParts& value) override {
+    return Merge(nullptr, key, value);
+  }
+
+  using WriteBatchBase::PutLogData;
+  // Append a blob of arbitrary size to the records in this batch. The blob will
+  // be stored in the transaction log but not in any other file. In particular,
+  // it will not be persisted to the SST files. When iterating over this
+  // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+  // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+  // encountered in the same order in which they were inserted. The blob will
+  // NOT consume sequence number(s) and will NOT increase the count of the batch
+  //
+  // Example application: add timestamps to the transaction log for use in
+  // replication.
+  Status PutLogData(const Slice& blob) override;
+
+  using WriteBatchBase::Clear;
+  // Clear all updates buffered in this batch.
+  void Clear() override;
+
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  void SetSavePoint() override;
+
+  // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the
+  // most recent call to SetSavePoint() and removes the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  Status RollbackToSavePoint() override;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  Status PopSavePoint() override;
+
+  // Support for iterating over the contents of a batch.
+  // Objects of subclasses of Handler will be used by WriteBatch::Iterate().
+  class Handler {
+   public:
+    virtual ~Handler();
+    // All handler functions in this class provide default implementations so
+    // we won't break existing clients of Handler on a source code level when
+    // adding a new member function.
+
+    // default implementation will just call Put without column family for
+    // backwards compatibility. If the column family is not default,
+    // the function is noop
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) {
+      if (column_family_id == 0) {
+        // Put() historically doesn't return status. We didn't want to be
+        // backwards incompatible so we didn't change the return status
+        // (this is a public API). We do an ordinary get and return Status::OK()
+        Put(key, value);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and PutCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {}
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status PutEntityCF(uint32_t /* column_family_id */,
+                               const Slice& /* key */,
+                               const Slice& /* entity */) {
+      return Status::NotSupported("PutEntityCF not implemented");
+    }
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        Delete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and DeleteCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void Delete(const Slice& /*key*/) {}
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        SingleDelete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and SingleDeleteCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void SingleDelete(const Slice& /*key*/) {}
+
+    // If user-defined timestamp is enabled, then `begin_key` and `end_key`
+    // both include timestamp.
+    virtual Status DeleteRangeCF(uint32_t /*column_family_id*/,
+                                 const Slice& /*begin_key*/,
+                                 const Slice& /*end_key*/) {
+      return Status::InvalidArgument("DeleteRangeCF not implemented");
+    }
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                           const Slice& value) {
+      if (column_family_id == 0) {
+        Merge(key, value);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and MergeCF not implemented");
+    }
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {}
+
+    // If user-defined timestamp is enabled, then `key` includes timestamp.
+    virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/,
+                                  const Slice& /*key*/,
+                                  const Slice& /*value*/) {
+      return Status::InvalidArgument("PutBlobIndexCF not implemented");
+    }
+
+    // The default implementation of LogData does nothing.
+    virtual void LogData(const Slice& blob);
+
+    virtual Status MarkBeginPrepare(bool = false) {
+      return Status::InvalidArgument("MarkBeginPrepare() handler not defined.");
+    }
+
+    virtual Status MarkEndPrepare(const Slice& /*xid*/) {
+      return Status::InvalidArgument("MarkEndPrepare() handler not defined.");
+    }
+
+    virtual Status MarkNoop(bool /*empty_batch*/) {
+      return Status::InvalidArgument("MarkNoop() handler not defined.");
+    }
+
+    virtual Status MarkRollback(const Slice& /*xid*/) {
+      return Status::InvalidArgument(
+          "MarkRollbackPrepare() handler not defined.");
+    }
+
+    virtual Status MarkCommit(const Slice& /*xid*/) {
+      return Status::InvalidArgument("MarkCommit() handler not defined.");
+    }
+
+    virtual Status MarkCommitWithTimestamp(const Slice& /*xid*/,
+                                           const Slice& /*commit_ts*/) {
+      return Status::InvalidArgument(
+          "MarkCommitWithTimestamp() handler not defined.");
+    }
+
+    // Continue is called by WriteBatch::Iterate. If it returns false,
+    // iteration is halted. Otherwise, it continues iterating. The default
+    // implementation always returns true.
+    virtual bool Continue();
+
+   protected:
+    friend class WriteBatchInternal;
+    enum class OptionState {
+      kUnknown,
+      kDisabled,
+      kEnabled,
+    };
+    virtual OptionState WriteAfterCommit() const {
+      return OptionState::kUnknown;
+    }
+    virtual OptionState WriteBeforePrepare() const {
+      return OptionState::kUnknown;
+    }
+  };
+  Status Iterate(Handler* handler) const;
+
+  // Retrieve the serialized version of this batch.
+  const std::string& Data() const { return rep_; }
+
+  // Retrieve data size of the batch.
+  size_t GetDataSize() const { return rep_.size(); }
+
+  // Returns the number of updates in the batch
+  uint32_t Count() const;
+
+  // Returns true if PutCF will be called during Iterate
+  bool HasPut() const;
+
+  // Returns true if PutEntityCF will be called during Iterate
+  bool HasPutEntity() const;
+
+  // Returns true if DeleteCF will be called during Iterate
+  bool HasDelete() const;
+
+  // Returns true if SingleDeleteCF will be called during Iterate
+  bool HasSingleDelete() const;
+
+  // Returns true if DeleteRangeCF will be called during Iterate
+  bool HasDeleteRange() const;
+
+  // Returns true if MergeCF will be called during Iterate
+  bool HasMerge() const;
+
+  // Returns true if MarkBeginPrepare will be called during Iterate
+  bool HasBeginPrepare() const;
+
+  // Returns true if MarkEndPrepare will be called during Iterate
+  bool HasEndPrepare() const;
+
+  // Returns true if MarkCommit will be called during Iterate
+  bool HasCommit() const;
+
+  // Returns true if MarkRollback will be called during Iterate
+  bool HasRollback() const;
+
+  // Experimental.
+  //
+  // Update timestamps of existing entries in the write batch if
+  // applicable. If a key is intended for a column family that disables
+  // timestamp, then this API won't set the timestamp for this key.
+  // This requires that all keys, if enable timestamp, (possibly from multiple
+  // column families) in the write batch have timestamps of the same format.
+  //
+  // ts_sz_func: callable object to obtain the timestamp sizes of column
+  // families. If ts_sz_func() accesses data structures, then the caller of this
+  // API must guarantee thread-safety. Like other parts of RocksDB, this API is
+  // not exception-safe. Therefore, ts_sz_func() must not throw.
+  //
+  // in: cf, the column family id.
+  // ret: timestamp size of the given column family. Return
+  //      std::numeric_limits<size_t>::max() indicating "don't know or column
+  //      family info not found", this will cause UpdateTimestamps() to fail.
+  // size_t ts_sz_func(uint32_t cf);
+  Status UpdateTimestamps(const Slice& ts,
+                          std::function<size_t(uint32_t /*cf*/)> ts_sz_func);
+
+  // Verify the per-key-value checksums of this write batch.
+  // Corruption status will be returned if the verification fails.
+  // If this write batch does not have per-key-value checksum,
+  // OK status will be returned.
+  Status VerifyChecksum() const;
+
+  using WriteBatchBase::GetWriteBatch;
+  WriteBatch* GetWriteBatch() override { return this; }
+
+  // Constructor with a serialized string object
+  explicit WriteBatch(const std::string& rep);
+  explicit WriteBatch(std::string&& rep);
+
+  WriteBatch(const WriteBatch& src);
+  WriteBatch(WriteBatch&& src) noexcept;
+  WriteBatch& operator=(const WriteBatch& src);
+  WriteBatch& operator=(WriteBatch&& src);
+
+  // marks this point in the WriteBatch as the last record to
+  // be inserted into the WAL, provided the WAL is enabled
+  void MarkWalTerminationPoint();
+  const SavePoint& GetWalTerminationPoint() const { return wal_term_point_; }
+
+  void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; }
+
+  struct ProtectionInfo;
+  size_t GetProtectionBytesPerKey() const;
+
+ private:
+  friend class WriteBatchInternal;
+  friend class LocalSavePoint;
+  // TODO(myabandeh): this is needed for a hack to collapse the write batch and
+  // remove duplicate keys. Remove it when the hack is replaced with a proper
+  // solution.
+  friend class WriteBatchWithIndex;
+  std::unique_ptr<SavePoints> save_points_;
+
+  // When sending a WriteBatch through WriteImpl we might want to
+  // specify that only the first x records of the batch be written to
+  // the WAL.
+  SavePoint wal_term_point_;
+
+  // Is the content of the batch the application's latest state that meant only
+  // to be used for recovery? Refer to
+  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for
+  // more details.
+  bool is_latest_persistent_state_ = false;
+
+  // False if all keys are from column families that disable user-defined
+  // timestamp OR UpdateTimestamps() has been called at least once.
+  // This flag will be set to true if any of the above Put(), Delete(),
+  // SingleDelete(), etc. APIs are called at least once.
+  // Calling Put(ts), Delete(ts), SingleDelete(ts), etc. will not set this flag
+  // to true because the assumption is that these APIs have already set the
+  // timestamps to desired values.
+  bool needs_in_place_update_ts_ = false;
+
+  // True if the write batch contains at least one key from a column family
+  // that enables user-defined timestamp.
+  bool has_key_with_ts_ = false;
+
+  // For HasXYZ.  Mutable to allow lazy computation of results
+  mutable std::atomic<uint32_t> content_flags_;
+
+  // Performs deferred computation of content_flags if necessary
+  uint32_t ComputeContentFlags() const;
+
+  // Maximum size of rep_.
+  size_t max_bytes_;
+
+  std::unique_ptr<ProtectionInfo> prot_info_;
+
+  size_t default_cf_ts_sz_ = 0;
+
+ protected:
+  std::string rep_;  // See comment in write_batch.cc for the format of rep_
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_batch_base.h b/src/rocksdb/include/rocksdb/write_batch_base.h
new file mode 100644
index 000000000..f6f39ef0b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_batch_base.h
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cstddef>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Status;
+class ColumnFamilyHandle;
+class WriteBatch;
+struct SliceParts;
+
+// Abstract base class that defines the basic interface for a write batch.
+// See WriteBatch for a basic implementation and WrithBatchWithIndex for an
+// indexed implementation.
+class WriteBatchBase {
+ public:
+  virtual ~WriteBatchBase() {}
+
+  // Store the mapping "key->value" in the database.
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const Slice& key, const Slice& value) = 0;
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& ts, const Slice& value) = 0;
+
+  // Variant of Put() that gathers output like writev(2).  The key and value
+  // that will be written to the database are concatenations of arrays of
+  // slices.
+  virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+                     const SliceParts& value);
+  virtual Status Put(const SliceParts& key, const SliceParts& value);
+
+  // Store the mapping "key->{column1:value1, column2:value2, ...}" in the
+  // column family specified by "column_family".
+  virtual Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key,
+                           const WideColumns& columns) = 0;
+
+  // Merge "value" with the existing value of "key" in the database.
+  // "key->merge(existing, value)"
+  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) = 0;
+  virtual Status Merge(const Slice& key, const Slice& value) = 0;
+  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& ts, const Slice& value) = 0;
+
+  // variant that takes SliceParts
+  virtual Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
+                       const SliceParts& value);
+  virtual Status Merge(const SliceParts& key, const SliceParts& value);
+
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  virtual Status Delete(ColumnFamilyHandle* column_family,
+                        const Slice& key) = 0;
+  virtual Status Delete(const Slice& key) = 0;
+  virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                        const Slice& ts) = 0;
+
+  // variant that takes SliceParts
+  virtual Status Delete(ColumnFamilyHandle* column_family,
+                        const SliceParts& key);
+  virtual Status Delete(const SliceParts& key);
+
+  // If the database contains a mapping for "key", erase it. Expects that the
+  // key was not overwritten. Else do nothing.
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const Slice& key) = 0;
+  virtual Status SingleDelete(const Slice& key) = 0;
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const Slice& key, const Slice& ts) = 0;
+
+  // variant that takes SliceParts
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const SliceParts& key);
+  virtual Status SingleDelete(const SliceParts& key);
+
+  // If the database contains mappings in the range ["begin_key", "end_key"),
+  // erase them. Else do nothing.
+  virtual Status DeleteRange(ColumnFamilyHandle* column_family,
+                             const Slice& begin_key, const Slice& end_key) = 0;
+  virtual Status DeleteRange(const Slice& begin_key, const Slice& end_key) = 0;
+  virtual Status DeleteRange(ColumnFamilyHandle* column_family,
+                             const Slice& begin_key, const Slice& end_key,
+                             const Slice& ts) = 0;
+
+  // variant that takes SliceParts
+  virtual Status DeleteRange(ColumnFamilyHandle* column_family,
+                             const SliceParts& begin_key,
+                             const SliceParts& end_key);
+  virtual Status DeleteRange(const SliceParts& begin_key,
+                             const SliceParts& end_key);
+
+  // Append a blob of arbitrary size to the records in this batch. The blob will
+  // be stored in the transaction log but not in any other file. In particular,
+  // it will not be persisted to the SST files. When iterating over this
+  // WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+  // of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+  // encountered in the same order in which they were inserted. The blob will
+  // NOT consume sequence number(s) and will NOT increase the count of the batch
+  //
+  // Example application: add timestamps to the transaction log for use in
+  // replication.
+  virtual Status PutLogData(const Slice& blob) = 0;
+
+  // Clear all updates buffered in this batch.
+  virtual void Clear() = 0;
+
+  // Covert this batch into a WriteBatch.  This is an abstracted way of
+  // converting any WriteBatchBase(eg WriteBatchWithIndex) into a basic
+  // WriteBatch.
+  virtual WriteBatch* GetWriteBatch() = 0;
+
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  virtual void SetSavePoint() = 0;
+
+  // Remove all entries in this batch (Put, Merge, Delete, PutLogData) since the
+  // most recent call to SetSavePoint() and removes the most recent save point.
+  // If there is no previous call to SetSavePoint(), behaves the same as
+  // Clear().
+  virtual Status RollbackToSavePoint() = 0;
+
+  // Pop the most recent save point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.
+  // Otherwise returns Status::OK().
+  virtual Status PopSavePoint() = 0;
+
+  // Sets the maximum size of the write batch in bytes. 0 means no limit.
+  virtual void SetMaxBytes(size_t max_bytes) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/include/rocksdb/write_buffer_manager.h b/src/rocksdb/include/rocksdb/write_buffer_manager.h
new file mode 100644
index 000000000..7fb18196d
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/write_buffer_manager.h
@@ -0,0 +1,176 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBufferManager is for managing memory allocation for one or more
+// MemTables.
+
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <list>
+#include <mutex>
+
+#include "rocksdb/cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+class CacheReservationManager;
+
+// Interface to block and signal DB instances, intended for RocksDB
+// internal use only. Each DB instance contains ptr to StallInterface.
+class StallInterface {
+ public:
+  virtual ~StallInterface() {}
+
+  virtual void Block() = 0;
+
+  virtual void Signal() = 0;
+};
+
+class WriteBufferManager final {
+ public:
+  // Parameters:
+  // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped.
+  // memory_usage() won't be valid and ShouldFlush() will always return true.
+  //
+  // cache_: if `cache` is provided, we'll put dummy entries in the cache and
+  // cost the memory allocated to the cache. It can be used even if _buffer_size
+  // = 0.
+  //
+  // allow_stall: if set true, it will enable stalling of writes when
+  // memory_usage() exceeds buffer_size. It will wait for flush to complete and
+  // memory usage to drop down.
+  explicit WriteBufferManager(size_t _buffer_size,
+                              std::shared_ptr<Cache> cache = {},
+                              bool allow_stall = false);
+  // No copying allowed
+  WriteBufferManager(const WriteBufferManager&) = delete;
+  WriteBufferManager& operator=(const WriteBufferManager&) = delete;
+
+  ~WriteBufferManager();
+
+  // Returns true if buffer_limit is passed to limit the total memory usage and
+  // is greater than 0.
+  bool enabled() const { return buffer_size() > 0; }
+
+  // Returns true if pointer to cache is passed.
+  bool cost_to_cache() const { return cache_res_mgr_ != nullptr; }
+
+  // Returns the total memory used by memtables.
+  // Only valid if enabled()
+  size_t memory_usage() const {
+    return memory_used_.load(std::memory_order_relaxed);
+  }
+
+  // Returns the total memory used by active memtables.
+  size_t mutable_memtable_memory_usage() const {
+    return memory_active_.load(std::memory_order_relaxed);
+  }
+
+  size_t dummy_entries_in_cache_usage() const;
+
+  // Returns the buffer_size.
+  size_t buffer_size() const {
+    return buffer_size_.load(std::memory_order_relaxed);
+  }
+
+  void SetBufferSize(size_t new_size) {
+    buffer_size_.store(new_size, std::memory_order_relaxed);
+    mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed);
+    // Check if stall is active and can be ended.
+    MaybeEndWriteStall();
+  }
+
+  // Below functions should be called by RocksDB internally.
+
+  // Should only be called from write thread
+  bool ShouldFlush() const {
+    if (enabled()) {
+      if (mutable_memtable_memory_usage() >
+          mutable_limit_.load(std::memory_order_relaxed)) {
+        return true;
+      }
+      size_t local_size = buffer_size();
+      if (memory_usage() >= local_size &&
+          mutable_memtable_memory_usage() >= local_size / 2) {
+        // If the memory exceeds the buffer size, we trigger more aggressive
+        // flush. But if already more than half memory is being flushed,
+        // triggering more flush may not help. We will hold it instead.
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Returns true if total memory usage exceeded buffer_size.
+  // We stall the writes untill memory_usage drops below buffer_size. When the
+  // function returns true, all writer threads (including one checking this
+  // condition) across all DBs will be stalled. Stall is allowed only if user
+  // pass allow_stall = true during WriteBufferManager instance creation.
+  //
+  // Should only be called by RocksDB internally .
+  bool ShouldStall() const {
+    if (!allow_stall_ || !enabled()) {
+      return false;
+    }
+
+    return IsStallActive() || IsStallThresholdExceeded();
+  }
+
+  // Returns true if stall is active.
+  bool IsStallActive() const {
+    return stall_active_.load(std::memory_order_relaxed);
+  }
+
+  // Returns true if stalling condition is met.
+  bool IsStallThresholdExceeded() const {
+    return memory_usage() >= buffer_size_;
+  }
+
+  void ReserveMem(size_t mem);
+
+  // We are in the process of freeing `mem` bytes, so it is not considered
+  // when checking the soft limit.
+  void ScheduleFreeMem(size_t mem);
+
+  void FreeMem(size_t mem);
+
+  // Add the DB instance to the queue and block the DB.
+  // Should only be called by RocksDB internally.
+  void BeginWriteStall(StallInterface* wbm_stall);
+
+  // If stall conditions have resolved, remove DB instances from queue and
+  // signal them to continue.
+  void MaybeEndWriteStall();
+
+  void RemoveDBFromQueue(StallInterface* wbm_stall);
+
+ private:
+  std::atomic<size_t> buffer_size_;
+  std::atomic<size_t> mutable_limit_;
+  std::atomic<size_t> memory_used_;
+  // Memory that hasn't been scheduled to free.
+  std::atomic<size_t> memory_active_;
+  std::shared_ptr<CacheReservationManager> cache_res_mgr_;
+  // Protects cache_res_mgr_
+  std::mutex cache_res_mgr_mu_;
+
+  std::list<StallInterface*> queue_;
+  // Protects the queue_ and stall_active_.
+  std::mutex mu_;
+  bool allow_stall_;
+  // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall()
+  // while holding mu_, but it can be read without a lock.
+  std::atomic<bool> stall_active_;
+
+  void ReserveMemWithCache(size_t mem);
+  void FreeMemWithCache(size_t mem);
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/issue_template.md b/src/rocksdb/issue_template.md
new file mode 100644
index 000000000..ca52f5ead
--- /dev/null
+++ b/src/rocksdb/issue_template.md
@@ -0,0 +1,7 @@
+> Note: Please use Issues only for bug reports. For questions, discussions, feature requests, etc. post to dev group: https://groups.google.com/forum/#!forum/rocksdb or https://www.facebook.com/groups/rocksdb.dev
+
+### Expected behavior
+
+### Actual behavior
+
+### Steps to reproduce the behavior
diff --git a/src/rocksdb/java/CMakeLists.txt b/src/rocksdb/java/CMakeLists.txt
new file mode 100644
index 000000000..5d62630fd
--- /dev/null
+++ b/src/rocksdb/java/CMakeLists.txt
@@ -0,0 +1,549 @@
+cmake_minimum_required(VERSION 3.4)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.11.4")
+    message("Please consider switching to CMake 3.11.4 or newer")
+endif()
+
+set(CMAKE_JAVA_COMPILE_FLAGS -source 7)
+
+set(JNI_NATIVE_SOURCES
+        rocksjni/backup_engine_options.cc
+        rocksjni/backupenginejni.cc
+        rocksjni/cassandra_compactionfilterjni.cc
+        rocksjni/cassandra_value_operator.cc
+        rocksjni/checkpoint.cc
+        rocksjni/clock_cache.cc
+        rocksjni/cache.cc
+        rocksjni/columnfamilyhandle.cc
+        rocksjni/compaction_filter.cc
+        rocksjni/compaction_filter_factory.cc
+        rocksjni/compaction_filter_factory_jnicallback.cc
+        rocksjni/compaction_job_info.cc
+        rocksjni/compaction_job_stats.cc
+        rocksjni/compaction_options.cc
+        rocksjni/compaction_options_fifo.cc
+        rocksjni/compaction_options_universal.cc
+        rocksjni/compact_range_options.cc
+        rocksjni/comparator.cc
+        rocksjni/comparatorjnicallback.cc
+        rocksjni/compression_options.cc
+        rocksjni/concurrent_task_limiter.cc
+        rocksjni/config_options.cc
+        rocksjni/env.cc
+        rocksjni/env_options.cc
+        rocksjni/event_listener.cc
+        rocksjni/event_listener_jnicallback.cc
+        rocksjni/filter.cc
+        rocksjni/ingest_external_file_options.cc
+        rocksjni/iterator.cc
+        rocksjni/jnicallback.cc
+        rocksjni/loggerjnicallback.cc
+        rocksjni/lru_cache.cc
+        rocksjni/memory_util.cc
+        rocksjni/memtablejni.cc
+        rocksjni/merge_operator.cc
+        rocksjni/native_comparator_wrapper_test.cc
+        rocksjni/optimistic_transaction_db.cc
+        rocksjni/optimistic_transaction_options.cc
+        rocksjni/options.cc
+        rocksjni/options_util.cc
+        rocksjni/persistent_cache.cc
+        rocksjni/ratelimiterjni.cc
+        rocksjni/remove_emptyvalue_compactionfilterjni.cc
+        rocksjni/restorejni.cc
+        rocksjni/rocks_callback_object.cc
+        rocksjni/rocksdb_exception_test.cc
+        rocksjni/rocksjni.cc
+        rocksjni/slice.cc
+        rocksjni/snapshot.cc
+        rocksjni/sst_file_manager.cc
+        rocksjni/sst_file_writerjni.cc
+        rocksjni/sst_file_readerjni.cc
+        rocksjni/sst_file_reader_iterator.cc
+        rocksjni/sst_partitioner.cc
+        rocksjni/statistics.cc
+        rocksjni/statisticsjni.cc
+        rocksjni/table.cc
+        rocksjni/table_filter.cc
+        rocksjni/table_filter_jnicallback.cc
+        rocksjni/testable_event_listener.cc
+        rocksjni/thread_status.cc
+        rocksjni/trace_writer.cc
+        rocksjni/trace_writer_jnicallback.cc
+        rocksjni/transaction.cc
+        rocksjni/transaction_db.cc
+        rocksjni/transaction_db_options.cc
+        rocksjni/transaction_log.cc
+        rocksjni/transaction_notifier.cc
+        rocksjni/transaction_notifier_jnicallback.cc
+        rocksjni/transaction_options.cc
+        rocksjni/ttl.cc
+        rocksjni/wal_filter.cc
+        rocksjni/wal_filter_jnicallback.cc
+        rocksjni/write_batch.cc
+        rocksjni/writebatchhandlerjnicallback.cc
+        rocksjni/write_batch_test.cc
+        rocksjni/write_batch_with_index.cc
+        rocksjni/write_buffer_manager.cc
+)
+
+set(JAVA_MAIN_CLASSES
+  src/main/java/org/rocksdb/AbstractCompactionFilter.java
+  src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java
+  src/main/java/org/rocksdb/AbstractComparator.java
+  src/main/java/org/rocksdb/AbstractEventListener.java
+  src/main/java/org/rocksdb/AbstractImmutableNativeReference.java
+  src/main/java/org/rocksdb/AbstractMutableOptions.java
+  src/main/java/org/rocksdb/AbstractNativeReference.java
+  src/main/java/org/rocksdb/AbstractRocksIterator.java
+  src/main/java/org/rocksdb/AbstractSlice.java
+  src/main/java/org/rocksdb/AbstractTableFilter.java
+  src/main/java/org/rocksdb/AbstractTraceWriter.java
+  src/main/java/org/rocksdb/AbstractTransactionNotifier.java
+  src/main/java/org/rocksdb/AbstractWalFilter.java
+  src/main/java/org/rocksdb/AbstractWriteBatch.java
+  src/main/java/org/rocksdb/AccessHint.java
+  src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
+  src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java
+  src/main/java/org/rocksdb/BackgroundErrorReason.java
+  src/main/java/org/rocksdb/BackupEngineOptions.java
+  src/main/java/org/rocksdb/BackupEngine.java
+  src/main/java/org/rocksdb/BackupInfo.java
+  src/main/java/org/rocksdb/BlockBasedTableConfig.java
+  src/main/java/org/rocksdb/BloomFilter.java
+  src/main/java/org/rocksdb/BuiltinComparator.java
+  src/main/java/org/rocksdb/ByteBufferGetStatus.java
+  src/main/java/org/rocksdb/Cache.java
+  src/main/java/org/rocksdb/CassandraCompactionFilter.java
+  src/main/java/org/rocksdb/CassandraValueMergeOperator.java
+  src/main/java/org/rocksdb/Checkpoint.java
+  src/main/java/org/rocksdb/ChecksumType.java
+  src/main/java/org/rocksdb/ClockCache.java
+  src/main/java/org/rocksdb/ColumnFamilyDescriptor.java
+  src/main/java/org/rocksdb/ColumnFamilyHandle.java
+  src/main/java/org/rocksdb/ColumnFamilyMetaData.java
+  src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
+  src/main/java/org/rocksdb/ColumnFamilyOptions.java
+  src/main/java/org/rocksdb/CompactionJobInfo.java
+  src/main/java/org/rocksdb/CompactionJobStats.java
+  src/main/java/org/rocksdb/CompactionOptions.java
+  src/main/java/org/rocksdb/CompactionOptionsFIFO.java
+  src/main/java/org/rocksdb/CompactionOptionsUniversal.java
+  src/main/java/org/rocksdb/CompactionPriority.java
+  src/main/java/org/rocksdb/CompactionReason.java
+  src/main/java/org/rocksdb/CompactRangeOptions.java
+  src/main/java/org/rocksdb/CompactionStopStyle.java
+  src/main/java/org/rocksdb/CompactionStyle.java
+  src/main/java/org/rocksdb/ComparatorOptions.java
+  src/main/java/org/rocksdb/ComparatorType.java
+  src/main/java/org/rocksdb/CompressionOptions.java
+  src/main/java/org/rocksdb/CompressionType.java
+  src/main/java/org/rocksdb/ConfigOptions.java
+  src/main/java/org/rocksdb/DataBlockIndexType.java
+  src/main/java/org/rocksdb/DBOptionsInterface.java
+  src/main/java/org/rocksdb/DBOptions.java
+  src/main/java/org/rocksdb/DbPath.java
+  src/main/java/org/rocksdb/DirectSlice.java
+  src/main/java/org/rocksdb/EncodingType.java
+  src/main/java/org/rocksdb/Env.java
+  src/main/java/org/rocksdb/EnvOptions.java
+  src/main/java/org/rocksdb/EventListener.java
+  src/main/java/org/rocksdb/Experimental.java
+  src/main/java/org/rocksdb/ExternalFileIngestionInfo.java
+  src/main/java/org/rocksdb/Filter.java
+  src/main/java/org/rocksdb/FileOperationInfo.java
+  src/main/java/org/rocksdb/FlushJobInfo.java
+  src/main/java/org/rocksdb/FlushReason.java
+  src/main/java/org/rocksdb/FlushOptions.java
+  src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java
+  src/main/java/org/rocksdb/HashSkipListMemTableConfig.java
+  src/main/java/org/rocksdb/HistogramData.java
+  src/main/java/org/rocksdb/HistogramType.java
+  src/main/java/org/rocksdb/Holder.java
+  src/main/java/org/rocksdb/IndexShorteningMode.java
+  src/main/java/org/rocksdb/IndexType.java
+  src/main/java/org/rocksdb/InfoLogLevel.java
+  src/main/java/org/rocksdb/IngestExternalFileOptions.java
+  src/main/java/org/rocksdb/LevelMetaData.java
+  src/main/java/org/rocksdb/ConcurrentTaskLimiter.java
+  src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java
+  src/main/java/org/rocksdb/KeyMayExist.java
+  src/main/java/org/rocksdb/LiveFileMetaData.java
+  src/main/java/org/rocksdb/LogFile.java
+  src/main/java/org/rocksdb/Logger.java
+  src/main/java/org/rocksdb/LRUCache.java
+  src/main/java/org/rocksdb/MemoryUsageType.java
+  src/main/java/org/rocksdb/MemoryUtil.java
+  src/main/java/org/rocksdb/MemTableConfig.java
+  src/main/java/org/rocksdb/MemTableInfo.java
+  src/main/java/org/rocksdb/MergeOperator.java
+  src/main/java/org/rocksdb/MutableColumnFamilyOptions.java
+  src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java
+  src/main/java/org/rocksdb/MutableDBOptions.java
+  src/main/java/org/rocksdb/MutableDBOptionsInterface.java
+  src/main/java/org/rocksdb/MutableOptionKey.java
+  src/main/java/org/rocksdb/MutableOptionValue.java
+  src/main/java/org/rocksdb/NativeComparatorWrapper.java
+  src/main/java/org/rocksdb/NativeLibraryLoader.java
+  src/main/java/org/rocksdb/OperationStage.java
+  src/main/java/org/rocksdb/OperationType.java
+  src/main/java/org/rocksdb/OptimisticTransactionDB.java
+  src/main/java/org/rocksdb/OptimisticTransactionOptions.java
+  src/main/java/org/rocksdb/Options.java
+  src/main/java/org/rocksdb/OptionString.java
+  src/main/java/org/rocksdb/OptionsUtil.java
+  src/main/java/org/rocksdb/PersistentCache.java
+  src/main/java/org/rocksdb/PlainTableConfig.java
+  src/main/java/org/rocksdb/PrepopulateBlobCache.java
+  src/main/java/org/rocksdb/Priority.java
+  src/main/java/org/rocksdb/Range.java
+  src/main/java/org/rocksdb/RateLimiter.java
+  src/main/java/org/rocksdb/RateLimiterMode.java
+  src/main/java/org/rocksdb/ReadOptions.java
+  src/main/java/org/rocksdb/ReadTier.java
+  src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java
+  src/main/java/org/rocksdb/RestoreOptions.java
+  src/main/java/org/rocksdb/ReusedSynchronisationType.java
+  src/main/java/org/rocksdb/RocksCallbackObject.java
+  src/main/java/org/rocksdb/RocksDBException.java
+  src/main/java/org/rocksdb/RocksDB.java
+  src/main/java/org/rocksdb/RocksEnv.java
+  src/main/java/org/rocksdb/RocksIteratorInterface.java
+  src/main/java/org/rocksdb/RocksIterator.java
+  src/main/java/org/rocksdb/RocksMemEnv.java
+  src/main/java/org/rocksdb/RocksMutableObject.java
+  src/main/java/org/rocksdb/RocksObject.java
+  src/main/java/org/rocksdb/SanityLevel.java
+  src/main/java/org/rocksdb/SizeApproximationFlag.java
+  src/main/java/org/rocksdb/SkipListMemTableConfig.java
+  src/main/java/org/rocksdb/Slice.java
+  src/main/java/org/rocksdb/Snapshot.java
+  src/main/java/org/rocksdb/SstFileManager.java
+  src/main/java/org/rocksdb/SstFileMetaData.java
+  src/main/java/org/rocksdb/SstFileReader.java
+  src/main/java/org/rocksdb/SstFileReaderIterator.java
+  src/main/java/org/rocksdb/SstFileWriter.java
+  src/main/java/org/rocksdb/SstPartitionerFactory.java
+  src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java
+  src/main/java/org/rocksdb/StateType.java
+  src/main/java/org/rocksdb/StatisticsCollectorCallback.java
+  src/main/java/org/rocksdb/StatisticsCollector.java
+  src/main/java/org/rocksdb/Statistics.java
+  src/main/java/org/rocksdb/StatsCollectorInput.java
+  src/main/java/org/rocksdb/StatsLevel.java
+  src/main/java/org/rocksdb/Status.java
+  src/main/java/org/rocksdb/StringAppendOperator.java
+  src/main/java/org/rocksdb/TableFileCreationBriefInfo.java
+  src/main/java/org/rocksdb/TableFileCreationInfo.java
+  src/main/java/org/rocksdb/TableFileCreationReason.java
+  src/main/java/org/rocksdb/TableFileDeletionInfo.java
+  src/main/java/org/rocksdb/TableFilter.java
+  src/main/java/org/rocksdb/TableProperties.java
+  src/main/java/org/rocksdb/TableFormatConfig.java
+  src/main/java/org/rocksdb/ThreadType.java
+  src/main/java/org/rocksdb/ThreadStatus.java
+  src/main/java/org/rocksdb/TickerType.java
+  src/main/java/org/rocksdb/TimedEnv.java
+  src/main/java/org/rocksdb/TraceOptions.java
+  src/main/java/org/rocksdb/TraceWriter.java
+  src/main/java/org/rocksdb/TransactionalDB.java
+  src/main/java/org/rocksdb/TransactionalOptions.java
+  src/main/java/org/rocksdb/TransactionDB.java
+  src/main/java/org/rocksdb/TransactionDBOptions.java
+  src/main/java/org/rocksdb/Transaction.java
+  src/main/java/org/rocksdb/TransactionLogIterator.java
+  src/main/java/org/rocksdb/TransactionOptions.java
+  src/main/java/org/rocksdb/TtlDB.java
+  src/main/java/org/rocksdb/TxnDBWritePolicy.java
+  src/main/java/org/rocksdb/VectorMemTableConfig.java
+  src/main/java/org/rocksdb/WalFileType.java
+  src/main/java/org/rocksdb/WalFilter.java
+  src/main/java/org/rocksdb/WalProcessingOption.java
+  src/main/java/org/rocksdb/WALRecoveryMode.java
+  src/main/java/org/rocksdb/WBWIRocksIterator.java
+  src/main/java/org/rocksdb/WriteBatch.java
+  src/main/java/org/rocksdb/WriteBatchInterface.java
+  src/main/java/org/rocksdb/WriteBatchWithIndex.java
+  src/main/java/org/rocksdb/WriteOptions.java
+  src/main/java/org/rocksdb/WriteBufferManager.java
+  src/main/java/org/rocksdb/WriteStallCondition.java
+  src/main/java/org/rocksdb/WriteStallInfo.java
+  src/main/java/org/rocksdb/util/ByteUtil.java
+  src/main/java/org/rocksdb/util/BytewiseComparator.java
+  src/main/java/org/rocksdb/util/Environment.java
+  src/main/java/org/rocksdb/util/IntComparator.java
+  src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java
+  src/main/java/org/rocksdb/util/SizeUnit.java
+  src/main/java/org/rocksdb/UInt64AddOperator.java
+)
+
+set(JAVA_TEST_CLASSES
+  src/test/java/org/rocksdb/BackupEngineTest.java
+  src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java
+  src/test/java/org/rocksdb/NativeComparatorWrapperTest.java
+  src/test/java/org/rocksdb/PlatformRandomHelper.java
+  src/test/java/org/rocksdb/RocksDBExceptionTest.java
+  src/test/java/org/rocksdb/RocksNativeLibraryResource.java
+  src/test/java/org/rocksdb/SnapshotTest.java
+  src/test/java/org/rocksdb/WriteBatchTest.java
+  src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java
+  src/test/java/org/rocksdb/util/WriteBatchGetter.java
+  src/test/java/org/rocksdb/test/TestableEventListener.java
+)
+
+include(FindJava)
+include(UseJava)
+find_package(JNI)
+
+include_directories(${JNI_INCLUDE_DIRS})
+include_directories(${PROJECT_SOURCE_DIR}/java)
+
+set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs)
+set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar)
+set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar)
+set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar)
+set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar)
+set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar)
+set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar)
+set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR})
+
+set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include)
+file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR})
+
+if(${Java_VERSION_MINOR} VERSION_LESS_EQUAL "7" AND ${Java_VERSION_MAJOR} STREQUAL "1")
+  message(FATAL_ERROR "Detected Java 7 or older (${Java_VERSION_STRING}), minimum required version in now Java 8")
+endif()
+
+if(${Java_VERSION_MAJOR} VERSION_GREATER_EQUAL "10" AND ${CMAKE_VERSION} VERSION_LESS "3.11.4")
+  # Java 10 and newer don't have javah, but the alternative GENERATE_NATIVE_HEADERS requires CMake 3.11.4 or newer
+  message(FATAL_ERROR "Detected Java 10 or newer (${Java_VERSION_STRING}), to build with CMake please upgrade CMake to 3.11.4 or newer")
+
+elseif(${CMAKE_VERSION} VERSION_LESS "3.11.4")
+  # Old CMake
+  message("Using an old CMAKE (${CMAKE_VERSION}) - JNI headers generated in separate step")
+  add_jar(
+      rocksdbjni_classes
+      SOURCES
+      ${JAVA_MAIN_CLASSES}
+      ${JAVA_TEST_CLASSES}
+      INCLUDE_JARS ${JAVA_TESTCLASSPATH}
+  )
+
+else ()
+  # Java 1.8 or newer prepare the JAR...
+  message("Preparing Jar for JDK ${Java_VERSION_STRING}")
+  add_jar(
+      rocksdbjni_classes
+      SOURCES
+      ${JAVA_MAIN_CLASSES}
+      ${JAVA_TEST_CLASSES}
+      INCLUDE_JARS ${JAVA_TESTCLASSPATH}
+      GENERATE_NATIVE_HEADERS rocksdbjni_headers DESTINATION ${JNI_OUTPUT_DIR}
+  )
+
+endif()
+
+if(NOT EXISTS ${PROJECT_SOURCE_DIR}/java/classes)
+  file(MAKE_DIRECTORY ${PROJECT_SOURCE_DIR}/java/classes)
+endif()
+
+if(NOT EXISTS ${JAVA_TEST_LIBDIR})
+  file(MAKE_DIRECTORY mkdir ${JAVA_TEST_LIBDIR})
+endif()
+
+if (DEFINED CUSTOM_DEPS_URL)
+  set(DEPS_URL ${CUSTOM_DEPS_URL}/)
+else ()
+  # Using a Facebook AWS account for S3 storage. (maven.org has a history
+  # of failing in Travis builds.)
+  set(DEPS_URL "https://rocksdb-deps.s3-us-west-2.amazonaws.com/jars")
+endif()
+
+if(NOT EXISTS ${JAVA_JUNIT_JAR})
+  message("Downloading ${JAVA_JUNIT_JAR}")
+  file(DOWNLOAD ${DEPS_URL}/junit-4.12.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
+  list(GET downloadStatus 0 error_code)
+  list(GET downloadStatus 1 error_message)
+  if(NOT error_code EQUAL 0)
+    message(FATAL_ERROR "Failed downloading ${JAVA_JUNIT_JAR}: ${error_message}")
+  endif()
+  file(RENAME ${JAVA_TMP_JAR} ${JAVA_JUNIT_JAR})
+endif()
+if(NOT EXISTS ${JAVA_HAMCR_JAR})
+  message("Downloading ${JAVA_HAMCR_JAR}")
+  file(DOWNLOAD ${DEPS_URL}/hamcrest-core-1.3.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
+  list(GET downloadStatus 0 error_code)
+  list(GET downloadStatus 1 error_message)
+  if(NOT error_code EQUAL 0)
+    message(FATAL_ERROR "Failed downloading ${JAVA_HAMCR_JAR}: ${error_message}")
+  endif()
+  file(RENAME ${JAVA_TMP_JAR} ${JAVA_HAMCR_JAR})
+endif()
+if(NOT EXISTS ${JAVA_MOCKITO_JAR})
+  message("Downloading ${JAVA_MOCKITO_JAR}")
+  file(DOWNLOAD ${DEPS_URL}/mockito-all-1.10.19.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
+  list(GET downloadStatus 0 error_code)
+  list(GET downloadStatus 1 error_message)
+  if(NOT error_code EQUAL 0)
+    message(FATAL_ERROR "Failed downloading ${JAVA_MOCKITO_JAR}: ${error_message}")
+  endif()
+  file(RENAME ${JAVA_TMP_JAR} ${JAVA_MOCKITO_JAR})
+endif()
+if(NOT EXISTS ${JAVA_CGLIB_JAR})
+  message("Downloading ${JAVA_CGLIB_JAR}")
+  file(DOWNLOAD ${DEPS_URL}/cglib-2.2.2.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
+  list(GET downloadStatus 0 error_code)
+  list(GET downloadStatus 1 error_message)
+  if(NOT error_code EQUAL 0)
+    message(FATAL_ERROR "Failed downloading ${JAVA_CGLIB_JAR}: ${error_message}")
+  endif()
+  file(RENAME ${JAVA_TMP_JAR} ${JAVA_CGLIB_JAR})
+endif()
+if(NOT EXISTS ${JAVA_ASSERTJ_JAR})
+  message("Downloading ${JAVA_ASSERTJ_JAR}")
+  file(DOWNLOAD ${DEPS_URL}/assertj-core-1.7.1.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
+  list(GET downloadStatus 0 error_code)
+  list(GET downloadStatus 1 error_message)
+  if(NOT error_code EQUAL 0)
+    message(FATAL_ERROR "Failed downloading ${JAVA_ASSERTJ_JAR}: ${error_message}")
+  endif()
+  file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR})
+endif()
+
+if(${CMAKE_VERSION} VERSION_LESS "3.11.4")
+  # Old CMake ONLY generate JNI headers, otherwise JNI is handled in add_jar step above
+  message("Preparing JNI headers for old CMake (${CMAKE_VERSION})")
+  set(NATIVE_JAVA_CLASSES
+          org.rocksdb.AbstractCompactionFilter
+          org.rocksdb.AbstractCompactionFilterFactory
+          org.rocksdb.AbstractComparator
+          org.rocksdb.AbstractEventListener
+          org.rocksdb.AbstractImmutableNativeReference
+          org.rocksdb.AbstractNativeReference
+          org.rocksdb.AbstractRocksIterator
+          org.rocksdb.AbstractSlice
+          org.rocksdb.AbstractTableFilter
+          org.rocksdb.AbstractTraceWriter
+          org.rocksdb.AbstractTransactionNotifier
+          org.rocksdb.AbstractWalFilter
+          org.rocksdb.BackupEngineOptions
+          org.rocksdb.BackupEngine
+          org.rocksdb.BlockBasedTableConfig
+          org.rocksdb.BloomFilter
+          org.rocksdb.CassandraCompactionFilter
+          org.rocksdb.CassandraValueMergeOperator
+          org.rocksdb.Checkpoint
+          org.rocksdb.ClockCache
+          org.rocksdb.Cache
+          org.rocksdb.ColumnFamilyHandle
+          org.rocksdb.ColumnFamilyOptions
+          org.rocksdb.CompactionJobInfo
+          org.rocksdb.CompactionJobStats
+          org.rocksdb.CompactionOptions
+          org.rocksdb.CompactionOptionsFIFO
+          org.rocksdb.CompactionOptionsUniversal
+          org.rocksdb.CompactRangeOptions
+          org.rocksdb.ComparatorOptions
+          org.rocksdb.CompressionOptions
+          org.rocksdb.ConcurrentTaskLimiterImpl
+          org.rocksdb.ConfigOptions
+          org.rocksdb.DBOptions
+          org.rocksdb.DirectSlice
+          org.rocksdb.Env
+          org.rocksdb.EnvOptions
+          org.rocksdb.Filter
+          org.rocksdb.FlushOptions
+          org.rocksdb.HashLinkedListMemTableConfig
+          org.rocksdb.HashSkipListMemTableConfig
+          org.rocksdb.IngestExternalFileOptions
+          org.rocksdb.Logger
+          org.rocksdb.LRUCache
+          org.rocksdb.MemoryUtil
+          org.rocksdb.MemTableConfig
+          org.rocksdb.NativeComparatorWrapper
+          org.rocksdb.NativeLibraryLoader
+          org.rocksdb.OptimisticTransactionDB
+          org.rocksdb.OptimisticTransactionOptions
+          org.rocksdb.Options
+          org.rocksdb.OptionsUtil
+          org.rocksdb.PersistentCache
+          org.rocksdb.PlainTableConfig
+          org.rocksdb.RateLimiter
+          org.rocksdb.ReadOptions
+          org.rocksdb.RemoveEmptyValueCompactionFilter
+          org.rocksdb.RestoreOptions
+          org.rocksdb.RocksCallbackObject
+          org.rocksdb.RocksDB
+          org.rocksdb.RocksEnv
+          org.rocksdb.RocksIterator
+          org.rocksdb.RocksIteratorInterface
+          org.rocksdb.RocksMemEnv
+          org.rocksdb.RocksMutableObject
+          org.rocksdb.RocksObject
+          org.rocksdb.SkipListMemTableConfig
+          org.rocksdb.Slice
+          org.rocksdb.Snapshot
+          org.rocksdb.SstFileManager
+          org.rocksdb.SstFileWriter
+          org.rocksdb.SstFileReader
+          org.rocksdb.SstFileReaderIterator
+          org.rocksdb.SstPartitionerFactory
+          org.rocksdb.SstPartitionerFixedPrefixFactory
+          org.rocksdb.Statistics
+          org.rocksdb.StringAppendOperator
+          org.rocksdb.TableFormatConfig
+          org.rocksdb.ThreadStatus
+          org.rocksdb.TimedEnv
+          org.rocksdb.Transaction
+          org.rocksdb.TransactionDB
+          org.rocksdb.TransactionDBOptions
+          org.rocksdb.TransactionLogIterator
+          org.rocksdb.TransactionOptions
+          org.rocksdb.TtlDB
+          org.rocksdb.UInt64AddOperator
+          org.rocksdb.VectorMemTableConfig
+          org.rocksdb.WBWIRocksIterator
+          org.rocksdb.WriteBatch
+          org.rocksdb.WriteBatch.Handler
+          org.rocksdb.WriteBatchInterface
+          org.rocksdb.WriteBatchWithIndex
+          org.rocksdb.WriteOptions
+          org.rocksdb.NativeComparatorWrapperTest
+          org.rocksdb.RocksDBExceptionTest
+          org.rocksdb.SnapshotTest
+          org.rocksdb.WriteBatchTest
+          org.rocksdb.WriteBatchTestInternalHelper
+          org.rocksdb.WriteBufferManager
+          org.rocksdb.test.TestableEventListener
+  )
+
+  create_javah(
+    TARGET rocksdbjni_headers
+    CLASSES ${NATIVE_JAVA_CLASSES}
+    CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH}
+    OUTPUT_DIR ${JNI_OUTPUT_DIR}
+  )
+endif()
+
+if(NOT MSVC)
+  set_property(TARGET ${ROCKSDB_STATIC_LIB} PROPERTY POSITION_INDEPENDENT_CODE ON)
+endif()
+
+set(ROCKSDBJNI_STATIC_LIB rocksdbjni${ARTIFACT_SUFFIX})
+add_library(${ROCKSDBJNI_STATIC_LIB} ${JNI_NATIVE_SOURCES})
+add_dependencies(${ROCKSDBJNI_STATIC_LIB} rocksdbjni_headers)
+target_link_libraries(${ROCKSDBJNI_STATIC_LIB} ${ROCKSDB_STATIC_LIB} ${ROCKSDB_LIB})
+
+if(NOT MINGW)
+  set(ROCKSDBJNI_SHARED_LIB rocksdbjni-shared${ARTIFACT_SUFFIX})
+  add_library(${ROCKSDBJNI_SHARED_LIB} SHARED ${JNI_NATIVE_SOURCES})
+  add_dependencies(${ROCKSDBJNI_SHARED_LIB} rocksdbjni_headers)
+  target_link_libraries(${ROCKSDBJNI_SHARED_LIB} ${ROCKSDB_STATIC_LIB} ${ROCKSDB_LIB})
+
+  set_target_properties(
+    ${ROCKSDBJNI_SHARED_LIB}
+    PROPERTIES
+    COMPILE_PDB_OUTPUT_DIRECTORY ${CMAKE_CFG_INTDIR}
+    COMPILE_PDB_NAME ${ROCKSDBJNI_STATIC_LIB}.pdb
+  )
+endif()
diff --git a/src/rocksdb/java/HISTORY-JAVA.md b/src/rocksdb/java/HISTORY-JAVA.md
new file mode 100644
index 000000000..731886a61
--- /dev/null
+++ b/src/rocksdb/java/HISTORY-JAVA.md
@@ -0,0 +1,86 @@
+# RocksJava Change Log
+
+## 3.13 (8/4/2015)
+### New Features
+* Exposed BackupEngine API.
+* Added CappedPrefixExtractor support.  To use such extractor, simply call useCappedPrefixExtractor in either Options or ColumnFamilyOptions.
+* Added RemoveEmptyValueCompactionFilter.
+
+## 3.10.0 (3/24/2015)
+### New Features
+* Added compression per level API.
+* MemEnv is now available in RocksJava via RocksMemEnv class.
+* lz4 compression is now included in rocksjava static library when running `make rocksdbjavastatic`.
+
+### Public API Changes
+* Overflowing a size_t when setting rocksdb options now throws an IllegalArgumentException, which removes the necessity for a developer to catch these Exceptions explicitly.
+* The set and get functions for tableCacheRemoveScanCountLimit are deprecated.
+
+
+## By 01/31/2015
+### New Features
+* WriteBatchWithIndex support.
+* Iterator support for WriteBatch and WriteBatchWithIndex
+* GetUpdatesSince support.
+* Snapshots carry now information about the related sequence number.
+* TTL DB support.
+
+## By 11/14/2014
+### New Features
+* Full support for Column Family.
+* Slice and Comparator support.
+* Default merge operator support.
+* RateLimiter support.
+
+## By 06/15/2014
+### New Features
+* Added basic Java binding for rocksdb::Env such that multiple RocksDB can share the same thread pool and environment.
+* Added RestoreBackupableDB
+
+## By 05/30/2014
+### Internal Framework Improvement
+* Added disOwnNativeHandle to RocksObject, which allows a RocksObject to give-up the ownership of its native handle.  This method is useful when sharing and transferring the ownership of RocksDB C++ resources.
+
+## By 05/15/2014
+### New Features
+* Added RocksObject --- the base class of all RocksDB classes which holds some RocksDB resources in the C++ side.
+* Use environmental variable JAVA_HOME in Makefile for RocksJava
+### Public API changes
+* Renamed org.rocksdb.Iterator to org.rocksdb.RocksIterator to avoid potential confliction with Java built-in Iterator.
+
+## By 04/30/2014
+### New Features
+* Added Java binding for MultiGet.
+* Added static method RocksDB.loadLibrary(), which loads necessary library files.
+* Added Java bindings for 60+ rocksdb::Options.
+* Added Java binding for BloomFilter.
+* Added Java binding for ReadOptions.
+* Added Java binding for memtables.
+* Added Java binding for sst formats.
+* Added Java binding for RocksDB Iterator which enables sequential scan operation.
+* Added Java binding for Statistics
+* Added Java binding for BackupableDB.
+
+### DB Benchmark
+* Added filluniquerandom, readseq benchmark.
+* 70+ command-line options.
+* Enabled BloomFilter configuration.
+
+## By 04/15/2014
+### New Features
+* Added Java binding for WriteOptions.
+* Added Java binding for WriteBatch, which enables batch-write.
+* Added Java binding for rocksdb::Options.
+* Added Java binding for block cache.
+* Added Java version DB Benchmark.
+
+### DB Benchmark
+* Added readwhilewriting benchmark.
+
+### Internal Framework Improvement
+* Avoid a potential byte-array-copy between c++ and Java in RocksDB.get.
+* Added SizeUnit in org.rocksdb.util to store consts like KB and GB.
+
+### 03/28/2014
+* RocksJava project started.
+* Added Java binding for RocksDB, which supports Open, Close, Get and Put.
diff --git a/src/rocksdb/java/Makefile b/src/rocksdb/java/Makefile
new file mode 100644
index 000000000..bc7e121c4
--- /dev/null
+++ b/src/rocksdb/java/Makefile
@@ -0,0 +1,452 @@
+NATIVE_JAVA_CLASSES = \
+	org.rocksdb.AbstractCompactionFilter\
+	org.rocksdb.AbstractCompactionFilterFactory\
+	org.rocksdb.AbstractComparator\
+	org.rocksdb.AbstractEventListener\
+	org.rocksdb.AbstractSlice\
+	org.rocksdb.AbstractTableFilter\
+	org.rocksdb.AbstractTraceWriter\
+	org.rocksdb.AbstractTransactionNotifier\
+	org.rocksdb.AbstractWalFilter\
+	org.rocksdb.BackupEngine\
+	org.rocksdb.BackupEngineOptions\
+	org.rocksdb.BlockBasedTableConfig\
+	org.rocksdb.BloomFilter\
+	org.rocksdb.Checkpoint\
+	org.rocksdb.ClockCache\
+	org.rocksdb.Cache\
+	org.rocksdb.CassandraCompactionFilter\
+	org.rocksdb.CassandraValueMergeOperator\
+	org.rocksdb.ColumnFamilyHandle\
+	org.rocksdb.ColumnFamilyOptions\
+	org.rocksdb.CompactionJobInfo\
+	org.rocksdb.CompactionJobStats\
+	org.rocksdb.CompactionOptions\
+	org.rocksdb.CompactionOptionsFIFO\
+	org.rocksdb.CompactionOptionsUniversal\
+	org.rocksdb.CompactRangeOptions\
+	org.rocksdb.ComparatorOptions\
+	org.rocksdb.CompressionOptions\
+	org.rocksdb.ConfigOptions\
+	org.rocksdb.DBOptions\
+	org.rocksdb.DirectSlice\
+	org.rocksdb.Env\
+	org.rocksdb.EnvOptions\
+	org.rocksdb.FlushOptions\
+	org.rocksdb.Filter\
+	org.rocksdb.IngestExternalFileOptions\
+	org.rocksdb.HashLinkedListMemTableConfig\
+	org.rocksdb.HashSkipListMemTableConfig\
+	org.rocksdb.ConcurrentTaskLimiter\
+	org.rocksdb.ConcurrentTaskLimiterImpl\
+	org.rocksdb.KeyMayExist\
+	org.rocksdb.Logger\
+	org.rocksdb.LRUCache\
+	org.rocksdb.MemoryUsageType\
+	org.rocksdb.MemoryUtil\
+	org.rocksdb.MergeOperator\
+	org.rocksdb.NativeComparatorWrapper\
+	org.rocksdb.OptimisticTransactionDB\
+	org.rocksdb.OptimisticTransactionOptions\
+	org.rocksdb.Options\
+	org.rocksdb.OptionsUtil\
+	org.rocksdb.PersistentCache\
+	org.rocksdb.PlainTableConfig\
+	org.rocksdb.RateLimiter\
+	org.rocksdb.ReadOptions\
+	org.rocksdb.RemoveEmptyValueCompactionFilter\
+	org.rocksdb.RestoreOptions\
+	org.rocksdb.RocksCallbackObject\
+	org.rocksdb.RocksDB\
+	org.rocksdb.RocksEnv\
+	org.rocksdb.RocksIterator\
+	org.rocksdb.RocksMemEnv\
+	org.rocksdb.SkipListMemTableConfig\
+	org.rocksdb.Slice\
+	org.rocksdb.SstFileManager\
+	org.rocksdb.SstFileWriter\
+	org.rocksdb.SstFileReader\
+	org.rocksdb.SstFileReaderIterator\
+	org.rocksdb.SstPartitionerFactory\
+	org.rocksdb.SstPartitionerFixedPrefixFactory\
+	org.rocksdb.Statistics\
+	org.rocksdb.ThreadStatus\
+	org.rocksdb.TimedEnv\
+	org.rocksdb.Transaction\
+	org.rocksdb.TransactionDB\
+	org.rocksdb.TransactionDBOptions\
+	org.rocksdb.TransactionOptions\
+	org.rocksdb.TransactionLogIterator\
+	org.rocksdb.TtlDB\
+	org.rocksdb.VectorMemTableConfig\
+	org.rocksdb.Snapshot\
+	org.rocksdb.StringAppendOperator\
+	org.rocksdb.UInt64AddOperator\
+	org.rocksdb.WriteBatch\
+	org.rocksdb.WriteBatch.Handler\
+	org.rocksdb.WriteOptions\
+	org.rocksdb.WriteBatchWithIndex\
+	org.rocksdb.WriteBufferManager\
+	org.rocksdb.WBWIRocksIterator
+
+NATIVE_JAVA_TEST_CLASSES = \
+    org.rocksdb.RocksDBExceptionTest\
+    org.rocksdb.test.TestableEventListener\
+    org.rocksdb.NativeComparatorWrapperTest.NativeStringComparatorWrapper\
+    org.rocksdb.WriteBatchTest\
+    org.rocksdb.WriteBatchTestInternalHelper
+
+ROCKSDB_MAJOR = $(shell grep -E "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_MINOR = $(shell grep -E "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_PATCH = $(shell grep -E "ROCKSDB_PATCH.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3)
+
+NATIVE_INCLUDE = ./include
+ARCH := $(shell getconf LONG_BIT)
+SHA256_CMD ?= sha256sum
+
+JAVA_TESTS = \
+	org.rocksdb.BackupEngineOptionsTest\
+	org.rocksdb.BackupEngineTest\
+	org.rocksdb.BlobOptionsTest\
+	org.rocksdb.BlockBasedTableConfigTest\
+	org.rocksdb.BuiltinComparatorTest\
+	org.rocksdb.BytewiseComparatorRegressionTest\
+	org.rocksdb.util.BytewiseComparatorTest\
+	org.rocksdb.util.BytewiseComparatorIntTest\
+	org.rocksdb.CheckPointTest\
+	org.rocksdb.ClockCacheTest\
+	org.rocksdb.ColumnFamilyOptionsTest\
+	org.rocksdb.ColumnFamilyTest\
+	org.rocksdb.CompactionFilterFactoryTest\
+	org.rocksdb.CompactionJobInfoTest\
+	org.rocksdb.CompactionJobStatsTest\
+	org.rocksdb.CompactionOptionsTest\
+	org.rocksdb.CompactionOptionsFIFOTest\
+	org.rocksdb.CompactionOptionsUniversalTest\
+	org.rocksdb.CompactionPriorityTest\
+	org.rocksdb.CompactionStopStyleTest\
+	org.rocksdb.ComparatorOptionsTest\
+	org.rocksdb.CompressionOptionsTest\
+	org.rocksdb.CompressionTypesTest\
+	org.rocksdb.DBOptionsTest\
+	org.rocksdb.DirectSliceTest\
+	org.rocksdb.util.EnvironmentTest\
+	org.rocksdb.EnvOptionsTest\
+	org.rocksdb.EventListenerTest\
+	org.rocksdb.IngestExternalFileOptionsTest\
+	org.rocksdb.util.IntComparatorTest\
+	org.rocksdb.util.JNIComparatorTest\
+	org.rocksdb.FilterTest\
+	org.rocksdb.FlushTest\
+	org.rocksdb.InfoLogLevelTest\
+	org.rocksdb.KeyMayExistTest\
+	org.rocksdb.ConcurrentTaskLimiterTest\
+	org.rocksdb.LoggerTest\
+	org.rocksdb.LRUCacheTest\
+	org.rocksdb.MemoryUtilTest\
+	org.rocksdb.MemTableTest\
+	org.rocksdb.MergeTest\
+	org.rocksdb.MultiColumnRegressionTest \
+	org.rocksdb.MultiGetManyKeysTest\
+	org.rocksdb.MultiGetTest\
+	org.rocksdb.MixedOptionsTest\
+	org.rocksdb.MutableColumnFamilyOptionsTest\
+	org.rocksdb.MutableDBOptionsTest\
+	org.rocksdb.MutableOptionsGetSetTest \
+	org.rocksdb.NativeComparatorWrapperTest\
+	org.rocksdb.NativeLibraryLoaderTest\
+	org.rocksdb.OptimisticTransactionTest\
+	org.rocksdb.OptimisticTransactionDBTest\
+	org.rocksdb.OptimisticTransactionOptionsTest\
+	org.rocksdb.OptionsUtilTest\
+	org.rocksdb.OptionsTest\
+	org.rocksdb.PlainTableConfigTest\
+	org.rocksdb.RateLimiterTest\
+	org.rocksdb.ReadOnlyTest\
+	org.rocksdb.ReadOptionsTest\
+	org.rocksdb.util.ReverseBytewiseComparatorIntTest\
+	org.rocksdb.RocksDBTest\
+	org.rocksdb.RocksDBExceptionTest\
+	org.rocksdb.DefaultEnvTest\
+	org.rocksdb.RocksIteratorTest\
+	org.rocksdb.RocksMemEnvTest\
+	org.rocksdb.util.SizeUnitTest\
+	org.rocksdb.SecondaryDBTest\
+	org.rocksdb.SliceTest\
+	org.rocksdb.SnapshotTest\
+	org.rocksdb.SstFileManagerTest\
+	org.rocksdb.SstFileWriterTest\
+	org.rocksdb.SstFileReaderTest\
+	org.rocksdb.SstPartitionerTest\
+	org.rocksdb.TableFilterTest\
+	org.rocksdb.TimedEnvTest\
+	org.rocksdb.TransactionTest\
+	org.rocksdb.TransactionDBTest\
+	org.rocksdb.TransactionOptionsTest\
+	org.rocksdb.TransactionDBOptionsTest\
+	org.rocksdb.TransactionLogIteratorTest\
+	org.rocksdb.TtlDBTest\
+	org.rocksdb.StatisticsTest\
+	org.rocksdb.StatisticsCollectorTest\
+	org.rocksdb.VerifyChecksumsTest\
+	org.rocksdb.WalFilterTest\
+	org.rocksdb.WALRecoveryModeTest\
+	org.rocksdb.WriteBatchHandlerTest\
+	org.rocksdb.WriteBatchTest\
+	org.rocksdb.WriteBatchThreadedTest\
+	org.rocksdb.WriteOptionsTest\
+	org.rocksdb.WriteBatchWithIndexTest
+
+MAIN_SRC = src/main/java
+TEST_SRC = src/test/java
+OUTPUT = target
+MAIN_CLASSES = $(OUTPUT)/classes
+TEST_CLASSES = $(OUTPUT)/test-classes
+JAVADOC = $(OUTPUT)/apidocs
+
+BENCHMARK_MAIN_SRC = benchmark/src/main/java
+BENCHMARK_OUTPUT = benchmark/target
+BENCHMARK_MAIN_CLASSES = $(BENCHMARK_OUTPUT)/classes
+
+SAMPLES_MAIN_SRC = samples/src/main/java
+SAMPLES_OUTPUT = samples/target
+SAMPLES_MAIN_CLASSES = $(SAMPLES_OUTPUT)/classes
+
+JAVA_TEST_LIBDIR = test-libs
+JAVA_JUNIT_VER = 4.13.1
+JAVA_JUNIT_SHA256 = c30719db974d6452793fe191b3638a5777005485bae145924044530ffa5f6122
+JAVA_JUNIT_JAR = junit-$(JAVA_JUNIT_VER).jar
+JAVA_JUNIT_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_JUNIT_JAR)
+JAVA_HAMCREST_VER = 2.2
+JAVA_HAMCREST_SHA256 = 5e62846a89f05cd78cd9c1a553f340d002458380c320455dd1f8fc5497a8a1c1
+JAVA_HAMCREST_JAR = hamcrest-$(JAVA_HAMCREST_VER).jar
+JAVA_HAMCREST_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_HAMCREST_JAR)
+JAVA_MOCKITO_VER = 1.10.19
+JAVA_MOCKITO_SHA256 = d1a7a7ef14b3db5c0fc3e0a63a81b374b510afe85add9f7984b97911f4c70605
+JAVA_MOCKITO_JAR = mockito-all-$(JAVA_MOCKITO_VER).jar
+JAVA_MOCKITO_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_MOCKITO_JAR)
+JAVA_CGLIB_VER = 3.3.0
+JAVA_CGLIB_SHA256 = 9fe0c26d7464140ccdfe019ac687be1fb906122b508ab54beb810db0f09a9212
+JAVA_CGLIB_JAR = cglib-$(JAVA_CGLIB_VER).jar
+JAVA_CGLIB_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_CGLIB_JAR)
+JAVA_ASSERTJ_VER = 2.9.0
+JAVA_ASSERTJ_SHA256 = 5e88ea3ecbe3c48aa1346fec76c84979fa9c8d22499f11479011691230e8babf
+JAVA_ASSERTJ_JAR = assertj-core-$(JAVA_ASSERTJ_VER).jar
+JAVA_ASSERTJ_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_ASSERTJ_JAR)
+JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR_PATH):$(JAVA_HAMCREST_JAR_PATH):$(JAVA_MOCKITO_JAR_PATH):$(JAVA_CGLIB_JAR_PATH):$(JAVA_ASSERTJ_JAR_PATH)
+
+MVN_LOCAL = ~/.m2/repository
+
+# Set the path of the java commands
+ifeq ($(JAVA_CMD),)
+ifneq ($(JAVA_HOME),)
+JAVA_CMD := $(JAVA_HOME)/bin/java
+else
+JAVA_CMD := java
+endif
+endif
+
+ifeq ($(JAVAC_CMD),)
+ifneq ($(JAVA_HOME),)
+JAVAC_CMD := $(JAVA_HOME)/bin/javac
+else
+JAVAC_CMD := javac
+endif
+endif
+
+ifeq ($(JAVADOC_CMD),)
+ifneq ($(JAVA_HOME),)
+JAVADOC_CMD := $(JAVA_HOME)/bin/javadoc
+else
+JAVADOC_CMD := javadoc
+endif
+endif
+
+# Look for the Java version (1.6->6, 1.7->7, 1.8->8, 11.0->11, 13.0->13, 15.0->15 etc..)
+JAVAC_VERSION := $(shell $(JAVAC_CMD) -version 2>&1)
+JAVAC_MAJOR_VERSION := $(word 2,$(subst ., ,$(JAVAC_VERSION)))
+ifeq ($(JAVAC_MAJOR_VERSION),1)
+JAVAC_MAJOR_VERSION := $(word 3,$(subst ., ,$(JAVAC_VERSION)))
+endif
+
+# Test whether the version we see meets our minimum
+MIN_JAVAC_MAJOR_VERSION := 8
+JAVAC_VERSION_GE_MIN := $(shell [ $(JAVAC_MAJOR_VERSION) -ge $(MIN_JAVAC_MAJOR_VERSION) ] > /dev/null 2>&1 && echo true)
+
+# Set the default JAVA_ARGS to "" for DEBUG_LEVEL=0
+JAVA_ARGS ?=
+
+JAVAC_ARGS ?=
+
+# Read plugin configuration
+PLUGIN_PATH = ../plugin
+ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), $(PLUGIN_PATH)/$(plugin)/*.mk)
+include $(ROCKSDB_PLUGIN_MKS)
+
+# Add paths to Java sources in plugins
+ROCKSDB_PLUGIN_JAVA_ROOTS = $(foreach plugin, $(ROCKSDB_PLUGINS), $(PLUGIN_PATH)/$(plugin)/java)
+PLUGIN_SOURCES = $(foreach root, $(ROCKSDB_PLUGIN_JAVA_ROOTS), $(foreach pkg, org/rocksdb/util org/rocksdb, $(root)/$(MAIN_SRC)/$(pkg)/*.java))
+CORE_SOURCES = $(foreach pkg, org/rocksdb/util org/rocksdb, $(MAIN_SRC)/$(pkg)/*.java)
+SOURCES = $(wildcard $(CORE_SOURCES) $(PLUGIN_SOURCES))
+PLUGIN_TEST_SOURCES = $(foreach root, $(ROCKSDB_PLUGIN_JAVA_ROOTS), $(foreach pkg, org/rocksdb/test org/rocksdb/util org/rocksdb, $(root)/$(TEST_SRC)/$(pkg)/*.java))
+CORE_TEST_SOURCES = $(foreach pkg, org/rocksdb/test org/rocksdb/util org/rocksdb, $(TEST_SRC)/$(pkg)/*.java)
+TEST_SOURCES = $(wildcard $(CORE_TEST_SOURCES) $(PLUGIN_TEST_SOURCES))
+
+# Configure the plugin tests and java classes
+ROCKSDB_PLUGIN_NATIVE_JAVA_CLASSES = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach class, $($(plugin)_NATIVE_JAVA_CLASSES), $(class)))
+NATIVE_JAVA_CLASSES = $(NATIVE_JAVA_CLASSES) $(ROCKSDB_PLUGIN_NATIVE_JAVA_CLASSES)
+ROCKSDB_PLUGIN_JAVA_TESTS = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach testclass, $($(plugin)_JAVA_TESTS), $(testclass)))
+ALL_JAVA_TESTS = $(JAVA_TESTS) $(ROCKSDB_PLUGIN_JAVA_TESTS)
+
+# When debugging add -Xcheck:jni to the java args
+ifneq ($(DEBUG_LEVEL),0)
+	JAVA_ARGS += -ea -Xcheck:jni
+	JAVAC_ARGS += -Xlint:deprecation -Xlint:unchecked
+endif
+
+# Using a Facebook AWS account for S3 storage. (maven.org has a history
+# of failing in Travis builds.)
+DEPS_URL?=https://rocksdb-deps.s3-us-west-2.amazonaws.com/jars
+
+java-version:
+ifneq ($(JAVAC_VERSION_GE_MIN),true)
+	echo 'Java version is $(JAVAC_VERSION), minimum required version is $(MIN_JAVAC_MAJOR_VERSION)'
+	exit 1
+endif
+
+clean: clean-not-downloaded clean-downloaded
+
+clean-not-downloaded:
+	$(AM_V_at)rm -rf $(NATIVE_INCLUDE)
+	$(AM_V_at)rm -rf $(OUTPUT)
+	$(AM_V_at)rm -rf $(BENCHMARK_OUTPUT)
+	$(AM_V_at)rm -rf $(SAMPLES_OUTPUT)
+
+clean-downloaded:
+	$(AM_V_at)rm -rf $(JAVA_TEST_LIBDIR)
+
+
+javadocs: java
+	$(AM_V_GEN)mkdir -p $(JAVADOC)
+	$(AM_V_at)$(JAVADOC_CMD) -d $(JAVADOC) -sourcepath $(MAIN_SRC) -subpackages org
+
+javalib: java java_test javadocs
+
+java: java-version
+	$(AM_V_GEN)mkdir -p $(MAIN_CLASSES)
+	$(AM_V_at) $(JAVAC_CMD) $(JAVAC_ARGS) -h $(NATIVE_INCLUDE) -d $(MAIN_CLASSES) $(SOURCES)
+	$(AM_V_at)@cp ../HISTORY.md ./HISTORY-CPP.md
+	$(AM_V_at)@rm -f ./HISTORY-CPP.md
+
+sample: java
+	$(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES)
+	$(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBSample.java
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found
+	$(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found
+
+column_family_sample: java
+	$(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES)
+	$(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBColumnFamilySample.java
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+	$(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+
+transaction_sample: java
+	$(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES)
+	$(AM_V_at)$(JAVAC_CMD) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/TransactionSample.java
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+	$(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) TransactionSample /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+
+optimistic_transaction_sample: java
+	$(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES)
+	$(AM_V_at)$(JAVAC_CMD) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/OptimisticTransactionSample.java
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+	$(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+
+$(JAVA_TEST_LIBDIR):
+	mkdir -p "$(JAVA_TEST_LIBDIR)"
+
+$(JAVA_JUNIT_JAR_PATH): $(JAVA_TEST_LIBDIR)
+ifneq (,$(wildcard $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR)))
+	cp -v $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR) $(JAVA_TEST_LIBDIR)
+else
+	curl --fail --insecure --output $(JAVA_JUNIT_JAR_PATH) --location $(DEPS_URL)/$(JAVA_JUNIT_JAR)
+	JAVA_JUNIT_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_JUNIT_JAR_PATH) | cut -d ' ' -f 1`; \
+	if [ "$(JAVA_JUNIT_SHA256)" != "$$JAVA_JUNIT_SHA256_ACTUAL" ]; then \
+		echo $(JAVA_JUNIT_JAR_PATH) checksum mismatch, expected=\"$(JAVA_JUNIT_SHA256)\" actual=\"$$JAVA_JUNIT_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+endif
+
+$(JAVA_HAMCREST_JAR_PATH): $(JAVA_TEST_LIBDIR)
+ifneq (,$(wildcard $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR)))
+	cp -v $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR) $(JAVA_TEST_LIBDIR)
+else
+	curl --fail --insecure --output $(JAVA_HAMCREST_JAR_PATH) --location $(DEPS_URL)/$(JAVA_HAMCREST_JAR)
+	JAVA_HAMCREST_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_HAMCREST_JAR_PATH) | cut -d ' ' -f 1`; \
+	if [ "$(JAVA_HAMCREST_SHA256)" != "$$JAVA_HAMCREST_SHA256_ACTUAL" ]; then \
+		echo $(JAVA_HAMCREST_JAR_PATH) checksum mismatch, expected=\"$(JAVA_HAMCREST_SHA256)\" actual=\"$$JAVA_HAMCREST_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+endif
+
+$(JAVA_MOCKITO_JAR_PATH): $(JAVA_TEST_LIBDIR)
+ifneq (,$(wildcard $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR)))
+	cp -v $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR) $(JAVA_TEST_LIBDIR)
+else
+	curl --fail --insecure --output "$(JAVA_MOCKITO_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_MOCKITO_JAR)
+	JAVA_MOCKITO_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_MOCKITO_JAR_PATH) | cut -d ' ' -f 1`; \
+	if [ "$(JAVA_MOCKITO_SHA256)" != "$$JAVA_MOCKITO_SHA256_ACTUAL" ]; then \
+		echo $(JAVA_MOCKITO_JAR_PATH) checksum mismatch, expected=\"$(JAVA_MOCKITO_SHA256)\" actual=\"$$JAVA_MOCKITO_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+endif
+
+$(JAVA_CGLIB_JAR_PATH): $(JAVA_TEST_LIBDIR)
+ifneq (,$(wildcard $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR)))
+	cp -v $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR) $(JAVA_TEST_LIBDIR)
+else
+	curl --fail --insecure --output "$(JAVA_CGLIB_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_CGLIB_JAR)
+	JAVA_CGLIB_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_CGLIB_JAR_PATH) | cut -d ' ' -f 1`; \
+	if [ "$(JAVA_CGLIB_SHA256)" != "$$JAVA_CGLIB_SHA256_ACTUAL" ]; then \
+		echo $(JAVA_CGLIB_JAR_PATH) checksum mismatch, expected=\"$(JAVA_CGLIB_SHA256)\" actual=\"$$JAVA_CGLIB_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+endif
+
+$(JAVA_ASSERTJ_JAR_PATH): $(JAVA_TEST_LIBDIR)
+ifneq (,$(wildcard $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR)))
+	cp -v $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR) $(JAVA_TEST_LIBDIR)
+else
+	curl --fail --insecure --output "$(JAVA_ASSERTJ_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_ASSERTJ_JAR)
+	JAVA_ASSERTJ_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_ASSERTJ_JAR_PATH) | cut -d ' ' -f 1`; \
+	if [ "$(JAVA_ASSERTJ_SHA256)" != "$$JAVA_ASSERTJ_SHA256_ACTUAL" ]; then \
+		echo $(JAVA_ASSERTJ_JAR_PATH) checksum mismatch, expected=\"$(JAVA_ASSERTJ_SHA256)\" actual=\"$$JAVA_ASSERTJ_SHA256_ACTUAL\"; \
+		exit 1; \
+	fi
+endif
+
+resolve_test_deps: $(JAVA_JUNIT_JAR_PATH) $(JAVA_HAMCREST_JAR_PATH) $(JAVA_MOCKITO_JAR_PATH) $(JAVA_CGLIB_JAR_PATH) $(JAVA_ASSERTJ_JAR_PATH)
+
+java_test: java resolve_test_deps
+	$(AM_V_GEN)mkdir -p $(TEST_CLASSES)
+	$(AM_V_at) $(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\
+		$(TEST_SOURCES)
+
+test: java java_test
+	$(MAKE) run_test
+
+run_test:
+	$(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(ALL_JAVA_TESTS)
+
+run_plugin_test:
+	$(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(ROCKSDB_PLUGIN_JAVA_TESTS)
+
+db_bench: java
+	$(AM_V_GEN)mkdir -p $(BENCHMARK_MAIN_CLASSES)
+	$(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(BENCHMARK_MAIN_CLASSES) $(BENCHMARK_MAIN_SRC)/org/rocksdb/benchmark/*.java
diff --git a/src/rocksdb/java/RELEASE.md b/src/rocksdb/java/RELEASE.md
new file mode 100644
index 000000000..dda19455f
--- /dev/null
+++ b/src/rocksdb/java/RELEASE.md
@@ -0,0 +1,59 @@
+## Cross-building
+
+RocksDB can be built as a single self contained cross-platform JAR. The cross-platform jar can be used on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system.
+
+Building a cross-platform JAR requires:
+
+ * [Docker](https://www.docker.com/docker-community)
+ * A Mac OSX machine that can compile RocksDB.
+ * Java 7 set as JAVA_HOME.
+
+Once you have these items, run this make command from RocksDB's root source directory:
+
+    make jclean clean rocksdbjavastaticreleasedocker
+
+This command will build RocksDB natively on OSX, and will then spin up docker containers to build RocksDB for 32-bit and 64-bit Linux with glibc, and 32-bit and 64-bit Linux with musl libc.
+
+You can find all native binaries and JARs in the java/target directory upon completion:
+
+    librocksdbjni-linux32.so
+    librocksdbjni-linux64.so
+    librocksdbjni-linux64-musl.so
+    librocksdbjni-linux32-musl.so
+    librocksdbjni-osx.jnilib
+    rocksdbjni-x.y.z-javadoc.jar
+    rocksdbjni-x.y.z-linux32.jar
+    rocksdbjni-x.y.z-linux64.jar
+    rocksdbjni-x.y.z-linux64-musl.jar
+    rocksdbjni-x.y.z-linux32-musl.jar
+    rocksdbjni-x.y.z-osx.jar
+    rocksdbjni-x.y.z-sources.jar
+    rocksdbjni-x.y.z.jar
+
+Where x.y.z is the built version number of RocksDB.
+
+## Maven publication
+
+Set ~/.m2/settings.xml to contain:
+
+    <settings xmlns="http://maven.apache.org/SETTINGS/1.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
+      <servers>
+        <server>
+          <id>sonatype-nexus-staging</id>
+          <username>your-sonatype-jira-username</username>
+          <password>your-sonatype-jira-password</password>
+        </server>
+      </servers>
+    </settings>
+
+From RocksDB's root directory, first build the Java static JARs:
+
+    make jclean clean rocksdbjavastaticpublish
+
+This command will [stage the JAR artifacts on the Sonatype staging repository](http://central.sonatype.org/pages/manual-staging-bundle-creation-and-deployment.html). To release the staged artifacts.
+
+1. Go to [https://oss.sonatype.org/#stagingRepositories](https://oss.sonatype.org/#stagingRepositories) and search for "rocksdb" in the upper right hand search box.
+2. Select the rocksdb staging repository, and inspect its contents.
+3. If all is well, follow [these steps](https://oss.sonatype.org/#stagingRepositories) to close the repository and release it.
+
+After the release has occurred, the artifacts will be synced to Maven central within 24-48 hours.
diff --git a/src/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java b/src/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java
new file mode 100644
index 000000000..070f0fe75
--- /dev/null
+++ b/src/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java
@@ -0,0 +1,1640 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+/**
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.rocksdb.benchmark;
+
+import java.io.IOException;
+import java.lang.Runnable;
+import java.lang.Math;
+import java.io.File;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.util.Collection;
+import java.util.Date;
+import java.util.EnumMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import org.rocksdb.*;
+import org.rocksdb.RocksMemEnv;
+import org.rocksdb.util.SizeUnit;
+
+class Stats {
+  int id_;
+  long start_;
+  long finish_;
+  double seconds_;
+  long done_;
+  long found_;
+  long lastOpTime_;
+  long nextReport_;
+  long bytes_;
+  StringBuilder message_;
+  boolean excludeFromMerge_;
+
+  // TODO(yhchiang): use the following arguments:
+  //   (Long)Flag.stats_interval
+  //   (Integer)Flag.stats_per_interval
+
+  Stats(int id) {
+    id_ = id;
+    nextReport_ = 100;
+    done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = System.nanoTime();
+    lastOpTime_ = start_;
+    finish_ = start_;
+    found_ = 0;
+    message_ = new StringBuilder("");
+    excludeFromMerge_ = false;
+  }
+
+  void merge(final Stats other) {
+    if (other.excludeFromMerge_) {
+      return;
+    }
+
+    done_ += other.done_;
+    found_ += other.found_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread
+    if (message_.length() == 0) {
+      message_ = other.message_;
+    }
+  }
+
+  void stop() {
+    finish_ = System.nanoTime();
+    seconds_ = (double) (finish_ - start_) * 1e-9;
+  }
+
+  void addMessage(String msg) {
+    if (message_.length() > 0) {
+      message_.append(" ");
+    }
+    message_.append(msg);
+  }
+
+  void setId(int id) { id_ = id; }
+  void setExcludeFromMerge() { excludeFromMerge_ = true; }
+
+  void finishedSingleOp(int bytes) {
+    done_++;
+    lastOpTime_ = System.nanoTime();
+    bytes_ += bytes;
+    if (done_ >= nextReport_) {
+      if (nextReport_ < 1000) {
+        nextReport_ += 100;
+      } else if (nextReport_ < 5000) {
+        nextReport_ += 500;
+      } else if (nextReport_ < 10000) {
+        nextReport_ += 1000;
+      } else if (nextReport_ < 50000) {
+        nextReport_ += 5000;
+      } else if (nextReport_ < 100000) {
+        nextReport_ += 10000;
+      } else if (nextReport_ < 500000) {
+        nextReport_ += 50000;
+      } else {
+        nextReport_ += 100000;
+      }
+      System.err.printf("... Task %s finished %d ops%30s\r", id_, done_, "");
+    }
+  }
+
+  void report(String name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    StringBuilder extra = new StringBuilder("");
+    if (bytes_ > 0) {
+      // Rate is computed on actual elapsed time, not the sum of per-thread
+      // elapsed times.
+      double elapsed = (finish_ - start_) * 1e-9;
+      extra.append(String.format("%6.1f MB/s", (bytes_ / 1048576.0) / elapsed));
+    }
+    extra.append(message_.toString());
+    double elapsed = (finish_ - start_);
+    double throughput = (double) done_ / (elapsed * 1e-9);
+
+    System.out.format("%-12s : %11.3f micros/op %d ops/sec;%s%s\n",
+            name, (elapsed * 1e-6) / done_,
+            (long) throughput, (extra.length() == 0 ? "" : " "), extra.toString());
+  }
+}
+
+public class DbBenchmark {
+  enum Order {
+    SEQUENTIAL,
+    RANDOM
+  }
+
+  enum DBState {
+    FRESH,
+    EXISTING
+  }
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  abstract class BenchmarkTask implements Callable<Stats> {
+    // TODO(yhchiang): use (Integer)Flag.perf_level.
+    public BenchmarkTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      tid_ = tid;
+      rand_ = new Random(randSeed + tid * 1000);
+      numEntries_ = numEntries;
+      keyRange_ = keyRange;
+      stats_ = new Stats(tid);
+    }
+
+    @Override public Stats call() throws RocksDBException {
+      stats_.start_ = System.nanoTime();
+      runTask();
+      stats_.finish_ = System.nanoTime();
+      return stats_;
+    }
+
+    abstract protected void runTask() throws RocksDBException;
+
+    protected int tid_;
+    protected Random rand_;
+    protected long numEntries_;
+    protected long keyRange_;
+    protected Stats stats_;
+
+    protected void getFixedKey(byte[] key, long sn) {
+      generateKeyFromLong(key, sn);
+    }
+
+    protected void getRandomKey(byte[] key, long range) {
+      generateKeyFromLong(key, Math.abs(rand_.nextLong() % range));
+    }
+  }
+
+  abstract class WriteTask extends BenchmarkTask {
+    public WriteTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange);
+      writeOpt_ = writeOpt;
+      entriesPerBatch_ = entriesPerBatch;
+      maxWritesPerSecond_ = -1;
+    }
+
+    public WriteTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch, long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange);
+      writeOpt_ = writeOpt;
+      entriesPerBatch_ = entriesPerBatch;
+      maxWritesPerSecond_ = maxWritesPerSecond;
+    }
+
+    @Override public void runTask() throws RocksDBException {
+      if (numEntries_ != DbBenchmark.this.num_) {
+        stats_.message_.append(String.format(" (%d ops)", numEntries_));
+      }
+      byte[] key = new byte[keySize_];
+      byte[] value = new byte[valueSize_];
+
+      try {
+        if (entriesPerBatch_ == 1) {
+          for (long i = 0; i < numEntries_; ++i) {
+            getKey(key, i, keyRange_);
+            DbBenchmark.this.gen_.generate(value);
+            db_.put(writeOpt_, key, value);
+            stats_.finishedSingleOp(keySize_ + valueSize_);
+            writeRateControl(i);
+            if (isFinished()) {
+              return;
+            }
+          }
+        } else {
+          for (long i = 0; i < numEntries_; i += entriesPerBatch_) {
+            WriteBatch batch = new WriteBatch();
+            for (long j = 0; j < entriesPerBatch_; j++) {
+              getKey(key, i + j, keyRange_);
+              DbBenchmark.this.gen_.generate(value);
+              batch.put(key, value);
+              stats_.finishedSingleOp(keySize_ + valueSize_);
+            }
+            db_.write(writeOpt_, batch);
+            batch.dispose();
+            writeRateControl(i);
+            if (isFinished()) {
+              return;
+            }
+          }
+        }
+      } catch (InterruptedException e) {
+        // thread has been terminated.
+      }
+    }
+
+    protected void writeRateControl(long writeCount)
+        throws InterruptedException {
+      if (maxWritesPerSecond_ <= 0) return;
+      long minInterval =
+          writeCount * TimeUnit.SECONDS.toNanos(1) / maxWritesPerSecond_;
+      long interval = System.nanoTime() - stats_.start_;
+      if (minInterval - interval > TimeUnit.MILLISECONDS.toNanos(1)) {
+        TimeUnit.NANOSECONDS.sleep(minInterval - interval);
+      }
+    }
+
+    abstract protected void getKey(byte[] key, long id, long range);
+    protected WriteOptions writeOpt_;
+    protected long entriesPerBatch_;
+    protected long maxWritesPerSecond_;
+  }
+
+  class WriteSequentialTask extends WriteTask {
+    public WriteSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch);
+    }
+    public WriteSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch,
+        long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch,
+            maxWritesPerSecond);
+    }
+    @Override protected void getKey(byte[] key, long id, long range) {
+      getFixedKey(key, id);
+    }
+  }
+
+  class WriteRandomTask extends WriteTask {
+    public WriteRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch);
+    }
+    public WriteRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch,
+        long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch,
+            maxWritesPerSecond);
+    }
+    @Override protected void getKey(byte[] key, long id, long range) {
+      getRandomKey(key, range);
+    }
+  }
+
+  class WriteUniqueRandomTask extends WriteTask {
+    static final int MAX_BUFFER_SIZE = 10000000;
+    public WriteUniqueRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch);
+      initRandomKeySequence();
+    }
+    public WriteUniqueRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch,
+        long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch,
+            maxWritesPerSecond);
+      initRandomKeySequence();
+    }
+    @Override protected void getKey(byte[] key, long id, long range) {
+      generateKeyFromLong(key, nextUniqueRandom());
+    }
+
+    protected void initRandomKeySequence() {
+      bufferSize_ = MAX_BUFFER_SIZE;
+      if (bufferSize_ > keyRange_) {
+        bufferSize_ = (int) keyRange_;
+      }
+      currentKeyCount_ = bufferSize_;
+      keyBuffer_ = new long[MAX_BUFFER_SIZE];
+      for (int k = 0; k < bufferSize_; ++k) {
+        keyBuffer_[k] = k;
+      }
+    }
+
+    /**
+     * Semi-randomly return the next unique key.  It is guaranteed to be
+     * fully random if keyRange_ <= MAX_BUFFER_SIZE.
+     */
+    long nextUniqueRandom() {
+      if (bufferSize_ == 0) {
+        System.err.println("bufferSize_ == 0.");
+        return 0;
+      }
+      int r = rand_.nextInt(bufferSize_);
+      // randomly pick one from the keyBuffer
+      long randKey = keyBuffer_[r];
+      if (currentKeyCount_ < keyRange_) {
+        // if we have not yet inserted all keys, insert next new key to [r].
+        keyBuffer_[r] = currentKeyCount_++;
+      } else {
+        // move the last element to [r] and decrease the size by 1.
+        keyBuffer_[r] = keyBuffer_[--bufferSize_];
+      }
+      return randKey;
+    }
+
+    int bufferSize_;
+    long currentKeyCount_;
+    long[] keyBuffer_;
+  }
+
+  class ReadRandomTask extends BenchmarkTask {
+    public ReadRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      super(tid, randSeed, numEntries, keyRange);
+    }
+    @Override public void runTask() throws RocksDBException {
+      byte[] key = new byte[keySize_];
+      byte[] value = new byte[valueSize_];
+      for (long i = 0; i < numEntries_; i++) {
+        getRandomKey(key, keyRange_);
+        int len = db_.get(key, value);
+        if (len != RocksDB.NOT_FOUND) {
+          stats_.found_++;
+          stats_.finishedSingleOp(keySize_ + valueSize_);
+        } else {
+          stats_.finishedSingleOp(keySize_);
+        }
+        if (isFinished()) {
+          return;
+        }
+      }
+    }
+  }
+
+  class ReadSequentialTask extends BenchmarkTask {
+    public ReadSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      super(tid, randSeed, numEntries, keyRange);
+    }
+    @Override public void runTask() throws RocksDBException {
+      RocksIterator iter = db_.newIterator();
+      long i;
+      for (iter.seekToFirst(), i = 0;
+           iter.isValid() && i < numEntries_;
+           iter.next(), ++i) {
+        stats_.found_++;
+        stats_.finishedSingleOp(iter.key().length + iter.value().length);
+        if (isFinished()) {
+          iter.dispose();
+          return;
+        }
+      }
+      iter.dispose();
+    }
+  }
+
+  public DbBenchmark(Map<Flag, Object> flags) throws Exception {
+    benchmarks_ = (List<String>) flags.get(Flag.benchmarks);
+    num_ = (Integer) flags.get(Flag.num);
+    threadNum_ = (Integer) flags.get(Flag.threads);
+    reads_ = (Integer) (flags.get(Flag.reads) == null ?
+        flags.get(Flag.num) : flags.get(Flag.reads));
+    keySize_ = (Integer) flags.get(Flag.key_size);
+    valueSize_ = (Integer) flags.get(Flag.value_size);
+    compressionRatio_ = (Double) flags.get(Flag.compression_ratio);
+    useExisting_ = (Boolean) flags.get(Flag.use_existing_db);
+    randSeed_ = (Long) flags.get(Flag.seed);
+    databaseDir_ = (String) flags.get(Flag.db);
+    writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second);
+    memtable_ = (String) flags.get(Flag.memtablerep);
+    maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number);
+    prefixSize_ = (Integer) flags.get(Flag.prefix_size);
+    keysPerPrefix_ = (Integer) flags.get(Flag.keys_per_prefix);
+    hashBucketCount_ = (Long) flags.get(Flag.hash_bucket_count);
+    usePlainTable_ = (Boolean) flags.get(Flag.use_plain_table);
+    useMemenv_ = (Boolean) flags.get(Flag.use_mem_env);
+    flags_ = flags;
+    finishLock_ = new Object();
+    // options.setPrefixSize((Integer)flags_.get(Flag.prefix_size));
+    // options.setKeysPerPrefix((Long)flags_.get(Flag.keys_per_prefix));
+    compressionType_ = (String) flags.get(Flag.compression_type);
+    compression_ = CompressionType.NO_COMPRESSION;
+    try {
+      if (compressionType_!=null) {
+          final CompressionType compressionType =
+              CompressionType.getCompressionType(compressionType_);
+          if (compressionType != null &&
+              compressionType != CompressionType.NO_COMPRESSION) {
+            System.loadLibrary(compressionType.getLibraryName());
+          }
+
+      }
+    } catch (UnsatisfiedLinkError e) {
+      System.err.format("Unable to load %s library:%s%n" +
+                        "No compression is used.%n",
+          compressionType_, e.toString());
+      compressionType_ = "none";
+    }
+    gen_ = new RandomGenerator(randSeed_, compressionRatio_);
+  }
+
+  private void prepareReadOptions(ReadOptions options) {
+    options.setVerifyChecksums((Boolean)flags_.get(Flag.verify_checksum));
+    options.setTailing((Boolean)flags_.get(Flag.use_tailing_iterator));
+  }
+
+  private void prepareWriteOptions(WriteOptions options) {
+    options.setSync((Boolean)flags_.get(Flag.sync));
+    options.setDisableWAL((Boolean)flags_.get(Flag.disable_wal));
+  }
+
+  private void prepareOptions(Options options) throws RocksDBException {
+    if (!useExisting_) {
+      options.setCreateIfMissing(true);
+    } else {
+      options.setCreateIfMissing(false);
+    }
+    if (useMemenv_) {
+      options.setEnv(new RocksMemEnv(Env.getDefault()));
+    }
+    switch (memtable_) {
+      case "skip_list":
+        options.setMemTableConfig(new SkipListMemTableConfig());
+        break;
+      case "vector":
+        options.setMemTableConfig(new VectorMemTableConfig());
+        break;
+      case "hash_linkedlist":
+        options.setMemTableConfig(
+            new HashLinkedListMemTableConfig()
+                .setBucketCount(hashBucketCount_));
+        options.useFixedLengthPrefixExtractor(prefixSize_);
+        break;
+      case "hash_skiplist":
+      case "prefix_hash":
+        options.setMemTableConfig(
+            new HashSkipListMemTableConfig()
+                .setBucketCount(hashBucketCount_));
+        options.useFixedLengthPrefixExtractor(prefixSize_);
+        break;
+      default:
+        System.err.format(
+            "unable to detect the specified memtable, " +
+                "use the default memtable factory %s%n",
+            options.memTableFactoryName());
+        break;
+    }
+    if (usePlainTable_) {
+      options.setTableFormatConfig(
+          new PlainTableConfig().setKeySize(keySize_));
+    } else {
+      BlockBasedTableConfig table_options = new BlockBasedTableConfig();
+      table_options.setBlockSize((Long)flags_.get(Flag.block_size))
+                   .setBlockCacheSize((Long)flags_.get(Flag.cache_size))
+                   .setCacheNumShardBits(
+                      (Integer)flags_.get(Flag.cache_numshardbits));
+      options.setTableFormatConfig(table_options);
+    }
+    options.setWriteBufferSize(
+        (Long)flags_.get(Flag.write_buffer_size));
+    options.setMaxWriteBufferNumber(
+        (Integer)flags_.get(Flag.max_write_buffer_number));
+    options.setMaxBackgroundCompactions(
+        (Integer)flags_.get(Flag.max_background_compactions));
+    options.getEnv().setBackgroundThreads(
+        (Integer)flags_.get(Flag.max_background_compactions));
+    options.setMaxBackgroundFlushes(
+        (Integer)flags_.get(Flag.max_background_flushes));
+    options.setMaxBackgroundJobs((Integer) flags_.get(Flag.max_background_jobs));
+    options.setMaxOpenFiles(
+        (Integer)flags_.get(Flag.open_files));
+    options.setUseFsync(
+        (Boolean)flags_.get(Flag.use_fsync));
+    options.setWalDir(
+        (String)flags_.get(Flag.wal_dir));
+    options.setDeleteObsoleteFilesPeriodMicros(
+        (Integer)flags_.get(Flag.delete_obsolete_files_period_micros));
+    options.setTableCacheNumshardbits(
+        (Integer)flags_.get(Flag.table_cache_numshardbits));
+    options.setAllowMmapReads(
+        (Boolean)flags_.get(Flag.mmap_read));
+    options.setAllowMmapWrites(
+        (Boolean)flags_.get(Flag.mmap_write));
+    options.setAdviseRandomOnOpen(
+        (Boolean)flags_.get(Flag.advise_random_on_open));
+    options.setUseAdaptiveMutex(
+        (Boolean)flags_.get(Flag.use_adaptive_mutex));
+    options.setBytesPerSync(
+        (Long)flags_.get(Flag.bytes_per_sync));
+    options.setBloomLocality(
+        (Integer)flags_.get(Flag.bloom_locality));
+    options.setMinWriteBufferNumberToMerge(
+        (Integer)flags_.get(Flag.min_write_buffer_number_to_merge));
+    options.setMemtablePrefixBloomSizeRatio((Double) flags_.get(Flag.memtable_bloom_size_ratio));
+    options.setMemtableWholeKeyFiltering((Boolean) flags_.get(Flag.memtable_whole_key_filtering));
+    options.setNumLevels(
+        (Integer)flags_.get(Flag.num_levels));
+    options.setTargetFileSizeBase(
+        (Integer)flags_.get(Flag.target_file_size_base));
+    options.setTargetFileSizeMultiplier((Integer)flags_.get(Flag.target_file_size_multiplier));
+    options.setMaxBytesForLevelBase(
+        (Integer)flags_.get(Flag.max_bytes_for_level_base));
+    options.setMaxBytesForLevelMultiplier((Double) flags_.get(Flag.max_bytes_for_level_multiplier));
+    options.setLevelZeroStopWritesTrigger(
+        (Integer)flags_.get(Flag.level0_stop_writes_trigger));
+    options.setLevelZeroSlowdownWritesTrigger(
+        (Integer)flags_.get(Flag.level0_slowdown_writes_trigger));
+    options.setLevelZeroFileNumCompactionTrigger(
+        (Integer)flags_.get(Flag.level0_file_num_compaction_trigger));
+    options.setMaxCompactionBytes(
+        (Long) flags_.get(Flag.max_compaction_bytes));
+    options.setDisableAutoCompactions(
+        (Boolean)flags_.get(Flag.disable_auto_compactions));
+    options.setMaxSuccessiveMerges(
+        (Integer)flags_.get(Flag.max_successive_merges));
+    options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds));
+    options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB));
+    if(flags_.get(Flag.java_comparator) != null) {
+      options.setComparator(
+          (AbstractComparator)flags_.get(Flag.java_comparator));
+    }
+
+    /* TODO(yhchiang): enable the following parameters
+    options.setCompressionType((String)flags_.get(Flag.compression_type));
+    options.setCompressionLevel((Integer)flags_.get(Flag.compression_level));
+    options.setMinLevelToCompress((Integer)flags_.get(Flag.min_level_to_compress));
+    options.setStatistics((Boolean)flags_.get(Flag.statistics));
+    options.setUniversalSizeRatio(
+        (Integer)flags_.get(Flag.universal_size_ratio));
+    options.setUniversalMinMergeWidth(
+        (Integer)flags_.get(Flag.universal_min_merge_width));
+    options.setUniversalMaxMergeWidth(
+        (Integer)flags_.get(Flag.universal_max_merge_width));
+    options.setUniversalMaxSizeAmplificationPercent(
+        (Integer)flags_.get(Flag.universal_max_size_amplification_percent));
+    options.setUniversalCompressionSizePercent(
+        (Integer)flags_.get(Flag.universal_compression_size_percent));
+    // TODO(yhchiang): add RocksDB.openForReadOnly() to enable Flag.readonly
+    // TODO(yhchiang): enable Flag.merge_operator by switch
+    options.setAccessHintOnCompactionStart(
+        (String)flags_.get(Flag.compaction_fadvice));
+    // available values of fadvice are "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" for fadvice
+    */
+  }
+
+  private void run() throws RocksDBException {
+    if (!useExisting_) {
+      destroyDb();
+    }
+    Options options = new Options();
+    prepareOptions(options);
+    open(options);
+
+    printHeader(options);
+
+    for (String benchmark : benchmarks_) {
+      List<Callable<Stats>> tasks = new ArrayList<Callable<Stats>>();
+      List<Callable<Stats>> bgTasks = new ArrayList<Callable<Stats>>();
+      WriteOptions writeOpt = new WriteOptions();
+      prepareWriteOptions(writeOpt);
+      ReadOptions readOpt = new ReadOptions();
+      prepareReadOptions(readOpt);
+      int currentTaskId = 0;
+      boolean known = true;
+
+      switch (benchmark) {
+        case "fillseq":
+          tasks.add(new WriteSequentialTask(
+              currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+          break;
+        case "fillbatch":
+          tasks.add(
+              new WriteSequentialTask(currentTaskId++, randSeed_, num_, num_, writeOpt, 1000));
+          break;
+        case "fillrandom":
+          tasks.add(new WriteRandomTask(
+              currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+          break;
+        case "filluniquerandom":
+          tasks.add(new WriteUniqueRandomTask(
+              currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+          break;
+        case "fillsync":
+          writeOpt.setSync(true);
+          tasks.add(new WriteRandomTask(
+              currentTaskId++, randSeed_, num_ / 1000, num_ / 1000,
+              writeOpt, 1));
+          break;
+        case "readseq":
+          for (int t = 0; t < threadNum_; ++t) {
+            tasks.add(new ReadSequentialTask(
+                currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+          }
+          break;
+        case "readrandom":
+          for (int t = 0; t < threadNum_; ++t) {
+            tasks.add(new ReadRandomTask(
+                currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+          }
+          break;
+        case "readwhilewriting":
+          WriteTask writeTask = new WriteRandomTask(
+              -1, randSeed_, Long.MAX_VALUE, num_, writeOpt, 1, writesPerSeconds_);
+          writeTask.stats_.setExcludeFromMerge();
+          bgTasks.add(writeTask);
+          for (int t = 0; t < threadNum_; ++t) {
+            tasks.add(new ReadRandomTask(
+                currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+          }
+          break;
+        case "readhot":
+          for (int t = 0; t < threadNum_; ++t) {
+            tasks.add(new ReadRandomTask(
+                currentTaskId++, randSeed_, reads_ / threadNum_, num_ / 100));
+          }
+          break;
+        case "delete":
+          destroyDb();
+          open(options);
+          break;
+        default:
+          known = false;
+          System.err.println("Unknown benchmark: " + benchmark);
+          break;
+      }
+      if (known) {
+        ExecutorService executor = Executors.newCachedThreadPool();
+        ExecutorService bgExecutor = Executors.newCachedThreadPool();
+        try {
+          // measure only the main executor time
+          List<Future<Stats>> bgResults = new ArrayList<Future<Stats>>();
+          for (Callable bgTask : bgTasks) {
+            bgResults.add(bgExecutor.submit(bgTask));
+          }
+          start();
+          List<Future<Stats>> results = executor.invokeAll(tasks);
+          executor.shutdown();
+          boolean finished = executor.awaitTermination(10, TimeUnit.SECONDS);
+          if (!finished) {
+            System.out.format(
+                "Benchmark %s was not finished before timeout.",
+                benchmark);
+            executor.shutdownNow();
+          }
+          setFinished(true);
+          bgExecutor.shutdown();
+          finished = bgExecutor.awaitTermination(10, TimeUnit.SECONDS);
+          if (!finished) {
+            System.out.format(
+                "Benchmark %s was not finished before timeout.",
+                benchmark);
+            bgExecutor.shutdownNow();
+          }
+
+          stop(benchmark, results, currentTaskId);
+        } catch (InterruptedException e) {
+          System.err.println(e);
+        }
+      }
+      writeOpt.dispose();
+      readOpt.dispose();
+    }
+    options.dispose();
+    db_.close();
+  }
+
+  private void printHeader(Options options) {
+    int kKeySize = 16;
+    System.out.printf("Keys:     %d bytes each\n", kKeySize);
+    System.out.printf("Values:   %d bytes each (%d bytes after compression)\n",
+        valueSize_,
+        (int) (valueSize_ * compressionRatio_ + 0.5));
+    System.out.printf("Entries:  %d\n", num_);
+    System.out.printf("RawSize:  %.1f MB (estimated)\n",
+        ((double)(kKeySize + valueSize_) * num_) / SizeUnit.MB);
+    System.out.printf("FileSize:   %.1f MB (estimated)\n",
+        (((kKeySize + valueSize_ * compressionRatio_) * num_) / SizeUnit.MB));
+    System.out.format("Memtable Factory: %s%n", options.memTableFactoryName());
+    System.out.format("Prefix:   %d bytes%n", prefixSize_);
+    System.out.format("Compression: %s%n", compressionType_);
+    printWarnings();
+    System.out.printf("------------------------------------------------\n");
+  }
+
+  void printWarnings() {
+    boolean assertsEnabled = false;
+    assert assertsEnabled = true; // Intentional side effect!!!
+    if (assertsEnabled) {
+      System.out.printf(
+          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+    }
+  }
+
+  private void open(Options options) throws RocksDBException {
+    System.out.println("Using database directory: " + databaseDir_);
+    db_ = RocksDB.open(options, databaseDir_);
+  }
+
+  private void start() {
+    setFinished(false);
+    startTime_ = System.nanoTime();
+  }
+
+  private void stop(
+      String benchmark, List<Future<Stats>> results, int concurrentThreads) {
+    long endTime = System.nanoTime();
+    double elapsedSeconds =
+        1.0d * (endTime - startTime_) / TimeUnit.SECONDS.toNanos(1);
+
+    Stats stats = new Stats(-1);
+    int taskFinishedCount = 0;
+    for (Future<Stats> result : results) {
+      if (result.isDone()) {
+        try {
+          Stats taskStats = result.get(3, TimeUnit.SECONDS);
+          if (!result.isCancelled()) {
+            taskFinishedCount++;
+          }
+          stats.merge(taskStats);
+        } catch (Exception e) {
+          // then it's not successful, the output will indicate this
+        }
+      }
+    }
+    String extra = "";
+    if (benchmark.indexOf("read") >= 0) {
+      extra = String.format(" %d / %d found; ", stats.found_, stats.done_);
+    } else {
+      extra = String.format(" %d ops done; ", stats.done_);
+    }
+
+    System.out.printf(
+        "%-16s : %11.5f micros/op; %6.1f MB/s;%s %d / %d task(s) finished.\n",
+        benchmark, elapsedSeconds / stats.done_ * 1e6,
+        (stats.bytes_ / 1048576.0) / elapsedSeconds, extra,
+        taskFinishedCount, concurrentThreads);
+  }
+
+  public void generateKeyFromLong(byte[] slice, long n) {
+    assert(n >= 0);
+    int startPos = 0;
+
+    if (keysPerPrefix_ > 0) {
+      long numPrefix = (num_ + keysPerPrefix_ - 1) / keysPerPrefix_;
+      long prefix = n % numPrefix;
+      int bytesToFill = Math.min(prefixSize_, 8);
+      for (int i = 0; i < bytesToFill; ++i) {
+        slice[i] = (byte) (prefix % 256);
+        prefix /= 256;
+      }
+      for (int i = 8; i < bytesToFill; ++i) {
+        slice[i] = '0';
+      }
+      startPos = bytesToFill;
+    }
+
+    for (int i = slice.length - 1; i >= startPos; --i) {
+      slice[i] = (byte) ('0' + (n % 10));
+      n /= 10;
+    }
+  }
+
+  private void destroyDb() {
+    if (db_ != null) {
+      db_.close();
+    }
+    // TODO(yhchiang): develop our own FileUtil
+    // FileUtil.deleteDir(databaseDir_);
+  }
+
+  private void printStats() {
+  }
+
+  static void printHelp() {
+    System.out.println("usage:");
+    for (Flag flag : Flag.values()) {
+      System.out.format("  --%s%n\t%s%n",
+          flag.name(),
+          flag.desc());
+      if (flag.getDefaultValue() != null) {
+        System.out.format("\tDEFAULT: %s%n",
+            flag.getDefaultValue().toString());
+      }
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    Map<Flag, Object> flags = new EnumMap<Flag, Object>(Flag.class);
+    for (Flag flag : Flag.values()) {
+      if (flag.getDefaultValue() != null) {
+        flags.put(flag, flag.getDefaultValue());
+      }
+    }
+    for (String arg : args) {
+      boolean valid = false;
+      if (arg.equals("--help") || arg.equals("-h")) {
+        printHelp();
+        System.exit(0);
+      }
+      if (arg.startsWith("--")) {
+        try {
+          String[] parts = arg.substring(2).split("=");
+          if (parts.length >= 1) {
+            Flag key = Flag.valueOf(parts[0]);
+            if (key != null) {
+              Object value = null;
+              if (parts.length >= 2) {
+                value = key.parseValue(parts[1]);
+              }
+              flags.put(key, value);
+              valid = true;
+            }
+          }
+        }
+        catch (Exception e) {
+        }
+      }
+      if (!valid) {
+        System.err.println("Invalid argument " + arg);
+        System.exit(1);
+      }
+    }
+    new DbBenchmark(flags).run();
+  }
+
+  private enum Flag {
+    benchmarks(Arrays.asList("fillseq", "readrandom", "fillrandom"),
+        "Comma-separated list of operations to run in the specified order\n"
+            + "\tActual benchmarks:\n"
+            + "\t\tfillseq          -- write N values in sequential key order in async mode.\n"
+            + "\t\tfillrandom       -- write N values in random key order in async mode.\n"
+            + "\t\tfillbatch        -- write N/1000 batch where each batch has 1000 values\n"
+            + "\t\t                   in sequential key order in sync mode.\n"
+            + "\t\tfillsync         -- write N/100 values in random key order in sync mode.\n"
+            + "\t\tfill100K         -- write N/1000 100K values in random order in async mode.\n"
+            + "\t\treadseq          -- read N times sequentially.\n"
+            + "\t\treadrandom       -- read N times in random order.\n"
+            + "\t\treadhot          -- read N times in random order from 1% section of DB.\n"
+            + "\t\treadwhilewriting -- measure the read performance of multiple readers\n"
+            + "\t\t                   with a bg single writer.  The write rate of the bg\n"
+            + "\t\t                   is capped by --writes_per_second.\n"
+            + "\tMeta Operations:\n"
+            + "\t\tdelete            -- delete DB") {
+      @Override public Object parseValue(String value) {
+        return new ArrayList<String>(Arrays.asList(value.split(",")));
+      }
+    },
+    compression_ratio(0.5d,
+        "Arrange to generate values that shrink to this fraction of\n" +
+        "\ttheir original size after compression.") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    use_existing_db(false,
+        "If true, do not destroy the existing database.  If you set this\n" +
+        "\tflag and also specify a benchmark that wants a fresh database,\n" +
+        "\tthat benchmark will fail.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    num(1000000,
+        "Number of key/values to place in database.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    threads(1,
+        "Number of concurrent threads to run.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    reads(null,
+        "Number of read operations to do.  If negative, do --nums reads.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    key_size(16,
+        "The size of each key in bytes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    value_size(100,
+        "The size of each value in bytes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    write_buffer_size(4L * SizeUnit.MB,
+        "Number of bytes to buffer in memtable before compacting\n" +
+        "\t(initialized to default value by 'main'.)") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    max_write_buffer_number(2,
+             "The number of in-memory memtables. Each memtable is of size\n" +
+             "\twrite_buffer_size.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    prefix_size(0, "Controls the prefix size for HashSkipList, HashLinkedList,\n" +
+                   "\tand plain table.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    keys_per_prefix(0, "Controls the average number of keys generated\n" +
+             "\tper prefix, 0 means no special handling of the prefix,\n" +
+             "\ti.e. use the prefix comes with the generated random number.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    memtablerep("skip_list",
+        "The memtable format.  Available options are\n" +
+        "\tskip_list,\n" +
+        "\tvector,\n" +
+        "\thash_linkedlist,\n" +
+        "\thash_skiplist (prefix_hash.)") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    hash_bucket_count(SizeUnit.MB,
+        "The number of hash buckets used in the hash-bucket-based\n" +
+        "\tmemtables.  Memtables that currently support this argument are\n" +
+        "\thash_linkedlist and hash_skiplist.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    writes_per_second(10000,
+        "The write-rate of the background writer used in the\n" +
+        "\t`readwhilewriting` benchmark.  Non-positive number indicates\n" +
+        "\tusing an unbounded write-rate in `readwhilewriting` benchmark.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    use_plain_table(false,
+        "Use plain-table sst format.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    cache_size(-1L,
+        "Number of bytes to use as a cache of uncompressed data.\n" +
+        "\tNegative means use default settings.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    seed(0L,
+        "Seed base for random number generators.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    num_levels(7,
+        "The total number of levels.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    numdistinct(1000L,
+        "Number of distinct keys to use. Used in RandomWithVerify to\n" +
+        "\tread/write on fewer keys so that gets are more likely to find the\n" +
+        "\tkey and puts are more likely to update the same key.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    merge_keys(-1L,
+        "Number of distinct keys to use for MergeRandom and\n" +
+        "\tReadRandomMergeRandom.\n" +
+        "\tIf negative, there will be FLAGS_num keys.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    bloom_locality(0,"Control bloom filter probes locality.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    duration(0,"Time in seconds for the random-ops tests to run.\n" +
+        "\tWhen 0 then num & reads determine the test duration.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    num_multi_db(0,
+        "Number of DBs used in the benchmark. 0 means single DB.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    histogram(false,"Print histogram of operation timings.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    min_write_buffer_number_to_merge(
+        defaultOptions_.minWriteBufferNumberToMerge(),
+        "The minimum number of write buffers that will be merged together\n" +
+        "\tbefore writing to storage. This is cheap because it is an\n" +
+        "\tin-memory merge. If this feature is not enabled, then all these\n" +
+        "\twrite buffers are flushed to L0 as separate files and this\n" +
+        "\tincreases read amplification because a get request has to check\n" +
+        "\tin all of these files. Also, an in-memory merge may result in\n" +
+        "\twriting less data to storage if there are duplicate records\n" +
+        "\tin each of these individual write buffers.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_background_compactions(
+        defaultOptions_.maxBackgroundCompactions(),
+        "The maximum number of concurrent background compactions\n" +
+        "\tthat can occur in parallel.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_background_flushes(
+        defaultOptions_.maxBackgroundFlushes(),
+        "The maximum number of concurrent background flushes\n" +
+        "\tthat can occur in parallel.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_background_jobs(defaultOptions_.maxBackgroundJobs(),
+        "The maximum number of concurrent background jobs\n"
+            + "\tthat can occur in parallel.") {
+      @Override
+      public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    /* TODO(yhchiang): enable the following
+    compaction_style((int32_t) defaultOptions_.compactionStyle(),
+        "style of compaction: level-based vs universal.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },*/
+    universal_size_ratio(0,
+        "Percentage flexibility while comparing file size\n" +
+        "\t(for universal compaction only).") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_min_merge_width(0,"The minimum number of files in a\n" +
+        "\tsingle compaction run (for universal compaction only).") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_max_merge_width(0,"The max number of files to compact\n" +
+        "\tin universal style compaction.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_max_size_amplification_percent(0,
+        "The max size amplification for universal style compaction.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_compression_size_percent(-1,
+        "The percentage of the database to compress for universal\n" +
+        "\tcompaction. -1 means compress everything.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    block_size(defaultBlockBasedTableOptions_.blockSize(),
+        "Number of bytes in a block.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    compressed_cache_size(-1L,
+        "Number of bytes to use as a cache of compressed data.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    open_files(defaultOptions_.maxOpenFiles(),
+        "Maximum number of files to keep open at the same time\n" +
+        "\t(use default if == 0)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    bloom_bits(-1,"Bloom filter bits per key. Negative means\n" +
+        "\tuse default settings.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    memtable_bloom_size_ratio(0.0d, "Ratio of memtable used by the bloom filter.\n"
+            + "\t0 means no bloom filter.") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    memtable_whole_key_filtering(false, "Enable whole key bloom filter in memtable.") {
+      @Override
+      public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    cache_numshardbits(-1,"Number of shards for the block cache\n" +
+        "\tis 2 ** cache_numshardbits. Negative means use default settings.\n" +
+        "\tThis is applied only if FLAGS_cache_size is non-negative.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    verify_checksum(false,"Verify checksum for every block read\n" +
+        "\tfrom storage.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    statistics(false,"Database statistics.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    writes(-1L, "Number of write operations to do. If negative, do\n" +
+        "\t--num reads.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    sync(false,"Sync all writes to disk.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    use_fsync(false,"If true, issue fsync instead of fdatasync.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    disable_wal(false,"If true, do not write WAL for write.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    wal_dir("", "If not empty, use the given dir for WAL.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    target_file_size_base(2 * 1048576,"Target file size at level-1") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    target_file_size_multiplier(1,
+        "A multiplier to compute target level-N file size (N >= 2)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_bytes_for_level_base(10 * 1048576,
+      "Max bytes for level-1") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_bytes_for_level_multiplier(10.0d,
+        "A multiplier to compute max bytes for level-N (N >= 2)") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    level0_stop_writes_trigger(12,"Number of files in level-0\n" +
+        "\tthat will trigger put stop.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    level0_slowdown_writes_trigger(8,"Number of files in level-0\n" +
+        "\tthat will slow down writes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    level0_file_num_compaction_trigger(4,"Number of files in level-0\n" +
+        "\twhen compactions start.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    readwritepercent(90,"Ratio of reads to reads/writes (expressed\n" +
+        "\tas percentage) for the ReadRandomWriteRandom workload. The\n" +
+        "\tdefault value 90 means 90% operations out of all reads and writes\n" +
+        "\toperations are reads. In other words, 9 gets for every 1 put.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    mergereadpercent(70,"Ratio of merges to merges&reads (expressed\n" +
+        "\tas percentage) for the ReadRandomMergeRandom workload. The\n" +
+        "\tdefault value 70 means 70% out of all read and merge operations\n" +
+        "\tare merges. In other words, 7 merges for every 3 gets.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    deletepercent(2,"Percentage of deletes out of reads/writes/\n" +
+        "\tdeletes (used in RandomWithVerify only). RandomWithVerify\n" +
+        "\tcalculates writepercent as (100 - FLAGS_readwritepercent -\n" +
+        "\tdeletepercent), so deletepercent must be smaller than (100 -\n" +
+        "\tFLAGS_readwritepercent)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    delete_obsolete_files_period_micros(0,"Option to delete\n" +
+        "\tobsolete files periodically. 0 means that obsolete files are\n" +
+        "\tdeleted after every compaction run.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    compression_type("snappy",
+        "Algorithm used to compress the database.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    compression_level(-1,
+        "Compression level. For zlib this should be -1 for the\n" +
+        "\tdefault level, or between 0 and 9.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    min_level_to_compress(-1,"If non-negative, compression starts\n" +
+        "\tfrom this level. Levels with number < min_level_to_compress are\n" +
+        "\tnot compressed. Otherwise, apply compression_type to\n" +
+        "\tall levels.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    table_cache_numshardbits(4,"") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    stats_interval(0L, "Stats are reported every N operations when\n" +
+        "\tthis is greater than zero. When 0 the interval grows over time.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    stats_per_interval(0,"Reports additional stats per interval when\n" +
+        "\tthis is greater than 0.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    perf_level(0,"Level of perf collection.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_compaction_bytes(0L, "Limit number of bytes in one compaction to be lower than this\n" +
+            "\threshold. But it's not guaranteed.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    readonly(false,"Run read only benchmarks.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    disable_auto_compactions(false,"Do not auto trigger compactions.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    wal_ttl_seconds(0L,"Set the TTL for the WAL Files in seconds.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    wal_size_limit_MB(0L,"Set the size limit for the WAL Files\n" +
+        "\tin MB.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    /* TODO(yhchiang): enable the following
+    direct_reads(rocksdb::EnvOptions().use_direct_reads,
+        "Allow direct I/O reads.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+      },
+    direct_writes(rocksdb::EnvOptions().use_direct_reads,
+      "Allow direct I/O reads.") {
+      @Override public Object parseValue(String value) {
+      return parseBoolean(value);
+      }
+      },
+    */
+    mmap_read(false,
+        "Allow reads to occur via mmap-ing files.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    mmap_write(false,
+        "Allow writes to occur via mmap-ing files.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    advise_random_on_open(defaultOptions_.adviseRandomOnOpen(),
+        "Advise random access on table file open.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    compaction_fadvice("NORMAL",
+      "Access pattern advice when a file is compacted.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    use_tailing_iterator(false,
+        "Use tailing iterator to access a series of keys instead of get.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    use_adaptive_mutex(defaultOptions_.useAdaptiveMutex(),
+        "Use adaptive mutex.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    bytes_per_sync(defaultOptions_.bytesPerSync(),
+        "Allows OS to incrementally sync files to disk while they are\n" +
+        "\tbeing written, in the background. Issue one request for every\n" +
+        "\tbytes_per_sync written. 0 turns it off.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    filter_deletes(false," On true, deletes use bloom-filter and drop\n" +
+        "\tthe delete if key not present.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    max_successive_merges(0,"Maximum number of successive merge\n" +
+        "\toperations on a key in the memtable.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    db(getTempDir("rocksdb-jni"),
+       "Use the db with the following name.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    use_mem_env(false, "Use RocksMemEnv instead of default filesystem based\n" +
+        "environment.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    java_comparator(null, "Class name of a Java Comparator to use instead\n" +
+        "\tof the default C++ ByteWiseComparatorImpl. Must be available on\n" +
+        "\tthe classpath") {
+      @Override
+      protected Object parseValue(final String value) {
+        try {
+          final ComparatorOptions copt = new ComparatorOptions();
+          final Class<AbstractComparator> clsComparator =
+              (Class<AbstractComparator>)Class.forName(value);
+          final Constructor cstr =
+              clsComparator.getConstructor(ComparatorOptions.class);
+          return cstr.newInstance(copt);
+        } catch(final ClassNotFoundException cnfe) {
+          throw new IllegalArgumentException("Java Comparator '" + value + "'" +
+              " not found on the classpath", cnfe);
+        } catch(final NoSuchMethodException nsme) {
+          throw new IllegalArgumentException("Java Comparator '" + value + "'" +
+              " does not have a public ComparatorOptions constructor", nsme);
+        } catch(final IllegalAccessException | InstantiationException
+            | InvocationTargetException ie) {
+          throw new IllegalArgumentException("Unable to construct Java" +
+              " Comparator '" + value + "'", ie);
+        }
+      }
+    };
+
+    private Flag(Object defaultValue, String desc) {
+      defaultValue_ = defaultValue;
+      desc_ = desc;
+    }
+
+    public Object getDefaultValue() {
+      return defaultValue_;
+    }
+
+    public String desc() {
+      return desc_;
+    }
+
+    public boolean parseBoolean(String value) {
+      if (value.equals("1")) {
+        return true;
+      } else if (value.equals("0")) {
+        return false;
+      }
+      return Boolean.parseBoolean(value);
+    }
+
+    protected abstract Object parseValue(String value);
+
+    private final Object defaultValue_;
+    private final String desc_;
+  }
+
+  private final static String DEFAULT_TEMP_DIR = "/tmp";
+
+  private static String getTempDir(final String dirName) {
+    try {
+      return Files.createTempDirectory(dirName).toAbsolutePath().toString();
+    } catch(final IOException ioe) {
+      System.err.println("Unable to create temp directory, defaulting to: " +
+          DEFAULT_TEMP_DIR);
+      return DEFAULT_TEMP_DIR + File.pathSeparator + dirName;
+    }
+  }
+
+  private static class RandomGenerator {
+    private final byte[] data_;
+    private int dataLength_;
+    private int position_;
+    private double compressionRatio_;
+    Random rand_;
+
+    private RandomGenerator(long seed, double compressionRatio) {
+      // We use a limited amount of data over and over again and ensure
+      // that it is larger than the compression window (32KB), and also
+      byte[] value = new byte[100];
+      // large enough to serve all typical value sizes we want to write.
+      rand_ = new Random(seed);
+      dataLength_ = value.length * 10000;
+      data_ = new byte[dataLength_];
+      compressionRatio_ = compressionRatio;
+      int pos = 0;
+      while (pos < dataLength_) {
+        compressibleBytes(value);
+        System.arraycopy(value, 0, data_, pos,
+                         Math.min(value.length, dataLength_ - pos));
+        pos += value.length;
+      }
+    }
+
+    private void compressibleBytes(byte[] value) {
+      int baseLength = value.length;
+      if (compressionRatio_ < 1.0d) {
+        baseLength = (int) (compressionRatio_ * value.length + 0.5);
+      }
+      if (baseLength <= 0) {
+        baseLength = 1;
+      }
+      int pos;
+      for (pos = 0; pos < baseLength; ++pos) {
+        value[pos] = (byte) (' ' + rand_.nextInt(95));  // ' ' .. '~'
+      }
+      while (pos < value.length) {
+        System.arraycopy(value, 0, value, pos,
+                         Math.min(baseLength, value.length - pos));
+        pos += baseLength;
+      }
+    }
+
+    private void generate(byte[] value) {
+      if (position_ + value.length > data_.length) {
+        position_ = 0;
+        assert(value.length <= data_.length);
+      }
+      position_ += value.length;
+      System.arraycopy(data_, position_ - value.length,
+                       value, 0, value.length);
+    }
+  }
+
+  boolean isFinished() {
+    synchronized(finishLock_) {
+      return isFinished_;
+    }
+  }
+
+  void setFinished(boolean flag) {
+    synchronized(finishLock_) {
+      isFinished_ = flag;
+    }
+  }
+
+  RocksDB db_;
+  final List<String> benchmarks_;
+  final int num_;
+  final int reads_;
+  final int keySize_;
+  final int valueSize_;
+  final int threadNum_;
+  final int writesPerSeconds_;
+  final long randSeed_;
+  final boolean useExisting_;
+  final String databaseDir_;
+  double compressionRatio_;
+  RandomGenerator gen_;
+  long startTime_;
+
+  // env
+  boolean useMemenv_;
+
+  // memtable related
+  final int maxWriteBufferNumber_;
+  final int prefixSize_;
+  final int keysPerPrefix_;
+  final String memtable_;
+  final long hashBucketCount_;
+
+  // sst format related
+  boolean usePlainTable_;
+
+  Object finishLock_;
+  boolean isFinished_;
+  Map<Flag, Object> flags_;
+  // as the scope of a static member equals to the scope of the problem,
+  // we let its c++ pointer to be disposed in its finalizer.
+  static Options defaultOptions_ = new Options();
+  static BlockBasedTableConfig defaultBlockBasedTableOptions_ =
+    new BlockBasedTableConfig();
+  String compressionType_;
+  CompressionType compression_;
+}
diff --git a/src/rocksdb/java/crossbuild/Vagrantfile b/src/rocksdb/java/crossbuild/Vagrantfile
new file mode 100644
index 000000000..0ee50de2c
--- /dev/null
+++ b/src/rocksdb/java/crossbuild/Vagrantfile
@@ -0,0 +1,51 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+
+# Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
+VAGRANTFILE_API_VERSION = "2"
+
+Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
+
+  config.vm.define "linux32" do |linux32|
+    linux32.vm.box = "bento/centos-6.10-i386"
+    linux32.vm.provision :shell, path: "build-linux-centos.sh"
+  end
+
+  config.vm.define "linux64" do |linux64|
+    linux64.vm.box = "bento/centos-6.10"
+    linux64.vm.provision :shell, path: "build-linux-centos.sh"
+  end
+
+  config.vm.define "linux32-musl" do |musl32|
+    musl32.vm.box = "alpine/alpine32"
+    musl32.vm.box_version = "3.6.0"
+    musl32.vm.provision :shell, path: "build-linux-alpine.sh"
+  end
+
+  config.vm.define "linux64-musl" do |musl64|
+    musl64.vm.box = "generic/alpine36"
+
+    ##  Should use the alpine/alpine64 box, but this issue needs to be fixed first - https://github.com/hashicorp/vagrant/issues/11218 
+    # musl64.vm.box = "alpine/alpine64"
+    # musl64.vm.box_version = "3.6.0"
+
+    musl64.vm.provision :shell, path: "build-linux-alpine.sh"
+  end
+
+  config.vm.provider "virtualbox" do |v|
+    v.memory = 2048
+    v.cpus = 4
+    v.customize ["modifyvm", :id, "--nictype1", "virtio" ]
+  end
+
+  if Vagrant.has_plugin?("vagrant-cachier")
+    config.cache.scope = :box
+  end
+  if Vagrant.has_plugin?("vagrant-vbguest")
+    config.vbguest.no_install = true
+  end
+
+  config.vm.synced_folder "../target", "/rocksdb-build"
+  config.vm.synced_folder "../..", "/rocksdb", type: "rsync"
+  config.vm.boot_timeout = 1200
+end
diff --git a/src/rocksdb/java/crossbuild/build-linux-alpine.sh b/src/rocksdb/java/crossbuild/build-linux-alpine.sh
new file mode 100755
index 000000000..561d34141
--- /dev/null
+++ b/src/rocksdb/java/crossbuild/build-linux-alpine.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+set -e
+
+# update Alpine with latest versions
+echo '@edge http://nl.alpinelinux.org/alpine/edge/main' >> /etc/apk/repositories
+echo '@community http://nl.alpinelinux.org/alpine/edge/community' >> /etc/apk/repositories
+apk update
+apk upgrade
+
+# install CA certificates
+apk add ca-certificates
+
+# install build tools
+apk add \
+  build-base \
+  coreutils \
+  file \
+  git \
+  perl \
+  automake \
+  autoconf \
+  cmake
+
+# install tool dependencies for building RocksDB static library
+apk add \
+  curl \
+  bash \
+  wget \
+  tar \
+  openssl
+
+# install RocksDB dependencies
+apk add \
+  snappy snappy-dev \
+  zlib zlib-dev \
+  bzip2 bzip2-dev \
+  lz4 lz4-dev \
+  zstd zstd-dev \
+  linux-headers \
+  jemalloc jemalloc-dev
+
+# install OpenJDK7
+apk add openjdk7 \
+  && apk add java-cacerts \
+  && rm /usr/lib/jvm/java-1.7-openjdk/jre/lib/security/cacerts \
+  && ln -s /etc/ssl/certs/java/cacerts /usr/lib/jvm/java-1.7-openjdk/jre/lib/security/cacerts
+
+# cleanup
+rm -rf /var/cache/apk/*
+
+# puts javac in the PATH
+export JAVA_HOME=/usr/lib/jvm/java-1.7-openjdk
+export PATH=/usr/lib/jvm/java-1.7-openjdk/bin:$PATH
+
+# gflags from source
+cd /tmp &&\
+  git clone -b v2.0 --single-branch https://github.com/gflags/gflags.git &&\
+  cd gflags &&\
+  ./configure --prefix=/usr && make && make install &&\
+  rm -rf /tmp/*
+
+
+# build rocksdb
+cd /rocksdb
+make jclean clean
+PORTABLE=1 make -j8 rocksdbjavastatic
+cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build
+cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build
diff --git a/src/rocksdb/java/crossbuild/build-linux-centos.sh b/src/rocksdb/java/crossbuild/build-linux-centos.sh
new file mode 100755
index 000000000..176e3456c
--- /dev/null
+++ b/src/rocksdb/java/crossbuild/build-linux-centos.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+set -e
+
+# remove fixed relesever variable present in the hanscode boxes
+sudo rm -f /etc/yum/vars/releasever
+
+# enable EPEL
+sudo yum -y install epel-release
+
+# install all required packages for rocksdb that are available through yum
+sudo yum -y install openssl java-1.7.0-openjdk-devel zlib-devel bzip2-devel lz4-devel snappy-devel libzstd-devel jemalloc-devel cmake3
+
+# set up cmake3 as cmake binary
+sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake 10 --slave /usr/local/bin/ctest ctest /usr/bin/ctest --slave /usr/local/bin/cpack cpack /usr/bin/cpack --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake
+sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake3 20 --slave /usr/local/bin/ctest ctest /usr/bin/ctest3 --slave /usr/local/bin/cpack cpack /usr/bin/cpack3 --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake3
+
+# install gcc/g++ 4.8.2 from tru/devtools-2
+sudo wget -O /etc/yum.repos.d/devtools-2.repo https://people.centos.org/tru/devtools-2/devtools-2.repo
+sudo yum -y install devtoolset-2-binutils devtoolset-2-gcc devtoolset-2-gcc-c++
+
+# install gflags
+wget https://github.com/gflags/gflags/archive/v2.0.tar.gz -O gflags-2.0.tar.gz
+tar xvfz gflags-2.0.tar.gz; cd gflags-2.0; scl enable devtoolset-2 ./configure; scl enable devtoolset-2 make; sudo make install
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
+
+# set java home so we can build rocksdb jars
+export JAVA_HOME=/usr/lib/jvm/java-1.7.0
+
+export PATH=$JAVA_HOME:/usr/local/bin:$PATH
+
+# build rocksdb
+cd /rocksdb
+scl enable devtoolset-2 'make clean-not-downloaded'
+scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic'
+cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build
+cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build
diff --git a/src/rocksdb/java/crossbuild/build-linux.sh b/src/rocksdb/java/crossbuild/build-linux.sh
new file mode 100755
index 000000000..74178adb5
--- /dev/null
+++ b/src/rocksdb/java/crossbuild/build-linux.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# install all required packages for rocksdb
+sudo apt-get update
+sudo apt-get -y install git make gcc g++ libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev default-jdk
+
+# set java home so we can build rocksdb jars
+export JAVA_HOME=$(echo /usr/lib/jvm/java-7-openjdk*)
+cd /rocksdb
+make jclean clean
+make -j 4 rocksdbjavastatic
+cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build
+cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build
+sudo shutdown -h now
+
diff --git a/src/rocksdb/java/crossbuild/docker-build-linux-alpine.sh b/src/rocksdb/java/crossbuild/docker-build-linux-alpine.sh
new file mode 100755
index 000000000..e3e852efe
--- /dev/null
+++ b/src/rocksdb/java/crossbuild/docker-build-linux-alpine.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+set -e
+#set -x
+
+# just in-case this is run outside Docker
+mkdir -p /rocksdb-local-build
+
+rm -rf /rocksdb-local-build/*
+cp -r /rocksdb-host/* /rocksdb-local-build
+cd /rocksdb-local-build
+
+make clean-not-downloaded
+PORTABLE=1 make -j2 rocksdbjavastatic
+
+cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target
diff --git a/src/rocksdb/java/crossbuild/docker-build-linux-centos.sh b/src/rocksdb/java/crossbuild/docker-build-linux-centos.sh
new file mode 100755
index 000000000..16581dec7
--- /dev/null
+++ b/src/rocksdb/java/crossbuild/docker-build-linux-centos.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+set -e
+#set -x
+
+# just in-case this is run outside Docker
+mkdir -p /rocksdb-local-build
+
+rm -rf /rocksdb-local-build/*
+cp -r /rocksdb-host/* /rocksdb-local-build
+cd /rocksdb-local-build
+
+# Use scl devtoolset if available
+if hash scl 2>/dev/null; then
+  if scl --list | grep -q 'devtoolset-8'; then
+                # CentOS 6+
+                scl enable devtoolset-8 'make clean-not-downloaded'
+                scl enable devtoolset-8 'PORTABLE=1 make -j2 rocksdbjavastatic'
+  elif scl --list | grep -q 'devtoolset-7'; then
+    # CentOS 6+
+    scl enable devtoolset-7 'make clean-not-downloaded'
+    scl enable devtoolset-7 'PORTABLE=1 make -j2 rocksdbjavastatic'
+  elif scl --list | grep -q 'devtoolset-2'; then
+    # CentOS 5 or 6
+    scl enable devtoolset-2 'make clean-not-downloaded'
+    scl enable devtoolset-2 'PORTABLE=1 make -j2 rocksdbjavastatic'
+  else
+    echo "Could not find devtoolset"
+    exit 1;
+  fi
+else
+  make clean-not-downloaded
+  PORTABLE=1 make -j2 rocksdbjavastatic
+fi
+
+cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target
+
diff --git a/src/rocksdb/java/jdb_bench.sh b/src/rocksdb/java/jdb_bench.sh
new file mode 100755
index 000000000..5dfc385e3
--- /dev/null
+++ b/src/rocksdb/java/jdb_bench.sh
@@ -0,0 +1,13 @@
+# shellcheck disable=SC2148
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+PLATFORM=64
+if [ `getconf LONG_BIT` != "64" ]
+then
+  PLATFORM=32
+fi
+
+ROCKS_JAR=`find target -name rocksdbjni*.jar`
+
+echo "Running benchmark in $PLATFORM-Bit mode."
+# shellcheck disable=SC2068
+java -server -d$PLATFORM -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=target -cp "${ROCKS_JAR}:benchmark/target/classes" org.rocksdb.benchmark.DbBenchmark $@
diff --git a/src/rocksdb/java/jmh/LICENSE-HEADER.txt b/src/rocksdb/java/jmh/LICENSE-HEADER.txt
new file mode 100644
index 000000000..365ee653b
--- /dev/null
+++ b/src/rocksdb/java/jmh/LICENSE-HEADER.txt
@@ -0,0 +1,5 @@
+Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+ This source code is licensed under both the GPLv2 (found in the
+ COPYING file in the root directory) and Apache 2.0 License
+ (found in the LICENSE.Apache file in the root directory).
+
diff --git a/src/rocksdb/java/jmh/README.md b/src/rocksdb/java/jmh/README.md
new file mode 100644
index 000000000..1575ab517
--- /dev/null
+++ b/src/rocksdb/java/jmh/README.md
@@ -0,0 +1,24 @@
+# JMH Benchmarks for RocksJava
+
+These are micro-benchmarks for RocksJava functionality, using [JMH (Java Microbenchmark Harness)](https://openjdk.java.net/projects/code-tools/jmh/).
+
+## Compiling
+
+**Note**: This uses a specific build of RocksDB that is set in the `<version>` element of the `dependencies` section of the `pom.xml` file. If you are testing local changes you should build and install a SNAPSHOT version of rocksdbjni, and update the `pom.xml` of rocksdbjni-jmh file to test with this.
+
+For instance, this is how to install the OSX jar you just built for 6.26.0
+
+```bash
+$ mvn install:install-file -Dfile=./java/target/rocksdbjni-6.26.0-SNAPSHOT-osx.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=6.26.0-SNAPSHOT -Dpackaging=jar
+```
+
+```bash
+$ mvn package
+```
+
+## Running
+```bash
+$ java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar
+```
+
+NOTE: you can append `-help` to the command above to see all of the JMH runtime options.
diff --git a/src/rocksdb/java/jmh/pom.xml b/src/rocksdb/java/jmh/pom.xml
new file mode 100644
index 000000000..26615da86
--- /dev/null
+++ b/src/rocksdb/java/jmh/pom.xml
@@ -0,0 +1,138 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>org.rocksdb</groupId>
+    <artifactId>rocksdbjni-jmh</artifactId>
+    <version>1.0-SNAPSHOT</version>
+
+    <url>http://rocksdb.org/</url>
+
+    <name>rocksdbjni-jmh</name>
+    <description>JMH Benchmarks for RocksDB Java API</description>
+
+    <organization>
+        <name>Facebook, Inc.</name>
+        <url>https://www.facebook.com</url>
+    </organization>
+
+    <licenses>
+        <license>
+            <name>Apache License 2.0</name>
+            <url>http://www.apache.org/licenses/LICENSE-2.0.html</url>
+            <distribution>repo</distribution>
+        </license>
+        <license>
+            <name>GNU General Public License, version 2</name>
+            <url>http://www.gnu.org/licenses/gpl-2.0.html</url>
+            <distribution>repo</distribution>
+        </license>
+    </licenses>
+
+    <scm>
+        <connection>scm:git:git://github.com/facebook/rocksdb.git</connection>
+        <developerConnection>scm:git:git@github.com:facebook/rocksdb.git</developerConnection>
+        <url>http://github.com/facebook/rocksdb/</url>
+    </scm>
+
+    <properties>
+        <project.build.source>1.7</project.build.source>
+        <project.build.target>1.7</project.build.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+
+        <jmh.version>1.22</jmh.version>
+        <uberjar.name>benchmarks</uberjar.name>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.rocksdb</groupId>
+            <artifactId>rocksdbjni</artifactId>
+            <version>6.27.0-SNAPSHOT</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.openjdk.jmh</groupId>
+            <artifactId>jmh-core</artifactId>
+            <version>${jmh.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.openjdk.jmh</groupId>
+            <artifactId>jmh-generator-annprocess</artifactId>
+            <version>${jmh.version}</version>
+            <scope>provided</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.8.1</version>
+                <configuration>
+                    <source>${project.build.source}</source>
+                    <target>${project.build.target}</target>
+                    <encoding>${project.build.sourceEncoding}</encoding>
+                </configuration>
+            </plugin>
+
+            <plugin>
+                <groupId>com.mycila</groupId>
+                <artifactId>license-maven-plugin</artifactId>
+                <version>3.0</version>
+                <inherited>true</inherited>
+                <configuration>
+                    <header>LICENSE-HEADER.txt</header>
+                    <failIfMissing>true</failIfMissing>
+                    <aggregate>true</aggregate>
+                    <strictCheck>true</strictCheck>
+                    <excludes>
+                        <exclude>pom.xml</exclude>
+                    </excludes>
+                    <encoding>${project.build.sourceEncoding}</encoding>
+                </configuration>
+            </plugin>
+
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>3.2.1</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <finalName>${project.artifactId}-${project.version}-${uberjar.name}</finalName>
+                            <transformers>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>org.openjdk.jmh.Main</mainClass>
+                                </transformer>
+                            </transformers>
+                            <filters>
+                                <filter>
+                                    <!--
+                                        Shading signed JARs will fail without this.
+                                        http://stackoverflow.com/questions/999489/invalid-signature-file-when-attempting-to-run-a-jar
+                                    -->
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+        </plugins>
+    </build>
+
+</project>
+\ No newline at end of file
diff --git a/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/ComparatorBenchmarks.java b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/ComparatorBenchmarks.java
new file mode 100644
index 000000000..1973b5487
--- /dev/null
+++ b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/ComparatorBenchmarks.java
@@ -0,0 +1,139 @@
+/**
+ * Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+ *  This source code is licensed under both the GPLv2 (found in the
+ *  COPYING file in the root directory) and Apache 2.0 License
+ *  (found in the LICENSE.Apache file in the root directory).
+ */
+package org.rocksdb.jmh;
+
+import org.openjdk.jmh.annotations.*;
+import org.rocksdb.*;
+import org.rocksdb.util.BytewiseComparator;
+import org.rocksdb.util.FileUtils;
+import org.rocksdb.util.ReverseBytewiseComparator;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.rocksdb.util.KVUtils.ba;
+
+@State(Scope.Benchmark)
+public class ComparatorBenchmarks {
+
+  @Param({
+      "native_bytewise",
+      "native_reverse_bytewise",
+
+      "java_bytewise_non-direct_reused-64_adaptive-mutex",
+      "java_bytewise_non-direct_reused-64_non-adaptive-mutex",
+      "java_bytewise_non-direct_reused-64_thread-local",
+      "java_bytewise_direct_reused-64_adaptive-mutex",
+      "java_bytewise_direct_reused-64_non-adaptive-mutex",
+      "java_bytewise_direct_reused-64_thread-local",
+      "java_bytewise_non-direct_no-reuse",
+      "java_bytewise_direct_no-reuse",
+
+      "java_reverse_bytewise_non-direct_reused-64_adaptive-mutex",
+      "java_reverse_bytewise_non-direct_reused-64_non-adaptive-mutex",
+      "java_reverse_bytewise_non-direct_reused-64_thread-local",
+      "java_reverse_bytewise_direct_reused-64_adaptive-mutex",
+      "java_reverse_bytewise_direct_reused-64_non-adaptive-mutex",
+      "java_reverse_bytewise_direct_reused-64_thread-local",
+      "java_reverse_bytewise_non-direct_no-reuse",
+      "java_reverse_bytewise_direct_no-reuse"
+  })
+  public String comparatorName;
+
+  Path dbDir;
+  ComparatorOptions comparatorOptions;
+  AbstractComparator comparator;
+  Options options;
+  RocksDB db;
+
+  @Setup(Level.Trial)
+  public void setup() throws IOException, RocksDBException {
+    RocksDB.loadLibrary();
+
+    dbDir = Files.createTempDirectory("rocksjava-comparator-benchmarks");
+
+    options = new Options()
+            .setCreateIfMissing(true);
+
+    if ("native_bytewise".equals(comparatorName)) {
+      options.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR);
+
+    } else if ("native_reverse_bytewise".equals(comparatorName)) {
+      options.setComparator(BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR);
+
+    } else if (comparatorName.startsWith("java_")) {
+      comparatorOptions = new ComparatorOptions();
+
+      if (comparatorName.indexOf("non-direct") > -1) {
+        comparatorOptions.setUseDirectBuffer(false);
+      } else if (comparatorName.indexOf("direct") > -1) {
+        comparatorOptions.setUseDirectBuffer(true);
+      }
+
+      if (comparatorName.indexOf("no-reuse") > -1) {
+        comparatorOptions.setMaxReusedBufferSize(-1);
+      } else if (comparatorName.indexOf("_reused-") > -1) {
+        final int idx = comparatorName.indexOf("_reused-");
+        String s = comparatorName.substring(idx + 8);
+        s = s.substring(0, s.indexOf('_'));
+        comparatorOptions.setMaxReusedBufferSize(Integer.parseInt(s));
+      }
+
+      if (comparatorName.indexOf("non-adaptive-mutex") > -1) {
+        comparatorOptions.setReusedSynchronisationType(ReusedSynchronisationType.MUTEX);
+      } else if (comparatorName.indexOf("adaptive-mutex") > -1) {
+        comparatorOptions.setReusedSynchronisationType(ReusedSynchronisationType.ADAPTIVE_MUTEX);
+      } else if (comparatorName.indexOf("thread-local") > -1) {
+        comparatorOptions.setReusedSynchronisationType(ReusedSynchronisationType.THREAD_LOCAL);
+      }
+
+      if (comparatorName.startsWith("java_bytewise")) {
+        comparator = new BytewiseComparator(comparatorOptions);
+      } else if (comparatorName.startsWith("java_reverse_bytewise")) {
+        comparator = new ReverseBytewiseComparator(comparatorOptions);
+      }
+
+      options.setComparator(comparator);
+
+    } else {
+      throw new IllegalArgumentException("Unknown comparatorName: " + comparatorName);
+    }
+
+    db = RocksDB.open(options, dbDir.toAbsolutePath().toString());
+  }
+
+  @TearDown(Level.Trial)
+  public void cleanup() throws IOException {
+    db.close();
+    if (comparator != null) {
+      comparator.close();
+    }
+    if (comparatorOptions != null) {
+      comparatorOptions.close();
+    }
+    options.close();
+    FileUtils.delete(dbDir);
+  }
+
+  @State(Scope.Benchmark)
+  public static class Counter {
+    private final AtomicInteger count = new AtomicInteger();
+
+    public int next() {
+      return count.getAndIncrement();
+    }
+  }
+
+
+  @Benchmark
+  public void put(final Counter counter) throws RocksDBException {
+    final int i = counter.next();
+    db.put(ba("key" + i), ba("value" + i));
+  }
+}
diff --git a/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/GetBenchmarks.java b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/GetBenchmarks.java
new file mode 100644
index 000000000..e34005c2f
--- /dev/null
+++ b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/GetBenchmarks.java
@@ -0,0 +1,139 @@
+/**
+ * Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+ *  This source code is licensed under both the GPLv2 (found in the
+ *  COPYING file in the root directory) and Apache 2.0 License
+ *  (found in the LICENSE.Apache file in the root directory).
+ */
+package org.rocksdb.jmh;
+
+import org.openjdk.jmh.annotations.*;
+import org.rocksdb.*;
+import org.rocksdb.util.FileUtils;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.rocksdb.util.KVUtils.ba;
+
+@State(Scope.Benchmark)
+public class GetBenchmarks {
+
+  @Param({
+      "no_column_family",
+      "1_column_family",
+      "20_column_families",
+      "100_column_families"
+  })
+  String columnFamilyTestType;
+
+  @Param("100000")
+  int keyCount;
+
+  Path dbDir;
+  DBOptions options;
+  int cfs = 0;  // number of column families
+  private AtomicInteger cfHandlesIdx;
+  ColumnFamilyHandle[] cfHandles;
+  RocksDB db;
+  private final AtomicInteger keyIndex = new AtomicInteger();
+
+  @Setup(Level.Trial)
+  public void setup() throws IOException, RocksDBException {
+    RocksDB.loadLibrary();
+
+    dbDir = Files.createTempDirectory("rocksjava-get-benchmarks");
+
+    options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+
+    final List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+
+    if ("1_column_family".equals(columnFamilyTestType)) {
+      cfs = 1;
+    } else if ("20_column_families".equals(columnFamilyTestType)) {
+      cfs = 20;
+    } else if ("100_column_families".equals(columnFamilyTestType)) {
+      cfs = 100;
+    }
+
+    if (cfs > 0) {
+      cfHandlesIdx = new AtomicInteger(1);
+      for (int i = 1; i <= cfs; i++) {
+        cfDescriptors.add(new ColumnFamilyDescriptor(ba("cf" + i)));
+      }
+    }
+
+    final List<ColumnFamilyHandle> cfHandlesList = new ArrayList<>(cfDescriptors.size());
+    db = RocksDB.open(options, dbDir.toAbsolutePath().toString(), cfDescriptors, cfHandlesList);
+    cfHandles = cfHandlesList.toArray(new ColumnFamilyHandle[0]);
+
+    // store initial data for retrieving via get
+    for (int i = 0; i < cfs; i++) {
+      for (int j = 0; j < keyCount; j++) {
+        db.put(cfHandles[i], ba("key" + j), ba("value" + j));
+      }
+    }
+
+    try (final FlushOptions flushOptions = new FlushOptions()
+            .setWaitForFlush(true)) {
+      db.flush(flushOptions);
+    }
+  }
+
+  @TearDown(Level.Trial)
+  public void cleanup() throws IOException {
+    for (final ColumnFamilyHandle cfHandle : cfHandles) {
+      cfHandle.close();
+    }
+    db.close();
+    options.close();
+    FileUtils.delete(dbDir);
+  }
+
+  private ColumnFamilyHandle getColumnFamily() {
+    if (cfs == 0) {
+      return cfHandles[0];
+    }  else if (cfs == 1) {
+      return cfHandles[1];
+    } else {
+      int idx = cfHandlesIdx.getAndIncrement();
+      if (idx > cfs) {
+        cfHandlesIdx.set(1); // doesn't ensure a perfect distribution, but it's ok
+        idx = 0;
+      }
+      return cfHandles[idx];
+    }
+  }
+
+  /**
+   * Takes the next position in the index.
+   */
+  private int next() {
+    int idx;
+    int nextIdx;
+    while (true) {
+      idx = keyIndex.get();
+      nextIdx = idx + 1;
+      if (nextIdx >= keyCount) {
+        nextIdx = 0;
+      }
+
+      if (keyIndex.compareAndSet(idx, nextIdx)) {
+        break;
+      }
+    }
+    return idx;
+  }
+
+  @Benchmark
+  public byte[] get() throws RocksDBException {
+    final int keyIdx = next();
+    return db.get(getColumnFamily(), ba("key" + keyIdx));
+  }
+}
diff --git a/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java
new file mode 100644
index 000000000..c8c827444
--- /dev/null
+++ b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java
@@ -0,0 +1,232 @@
+/**
+ * Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+ *  This source code is licensed under both the GPLv2 (found in the
+ *  COPYING file in the root directory) and Apache 2.0 License
+ *  (found in the LICENSE.Apache file in the root directory).
+ */
+package org.rocksdb.jmh;
+
+import static org.rocksdb.util.KVUtils.ba;
+import static org.rocksdb.util.KVUtils.keys;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.rocksdb.*;
+import org.rocksdb.util.FileUtils;
+
+@State(Scope.Thread)
+public class MultiGetBenchmarks {
+  @Param({
+      "no_column_family",
+      "1_column_family",
+      "20_column_families",
+      "100_column_families"
+  })
+  String columnFamilyTestType;
+
+  @Param({"10000", "25000", "100000"}) int keyCount;
+
+  @Param({
+          "10",
+          "100",
+          "1000",
+          "10000",
+  })
+  int multiGetSize;
+
+  @Param({"16", "64", "250", "1000", "4000", "16000"}) int valueSize;
+  @Param({"16"}) int keySize; // big enough
+
+  Path dbDir;
+  DBOptions options;
+  int cfs = 0;  // number of column families
+  private AtomicInteger cfHandlesIdx;
+  ColumnFamilyHandle[] cfHandles;
+  RocksDB db;
+  private final AtomicInteger keyIndex = new AtomicInteger();
+
+  @Setup(Level.Trial)
+  public void setup() throws IOException, RocksDBException {
+    RocksDB.loadLibrary();
+
+    dbDir = Files.createTempDirectory("rocksjava-multiget-benchmarks");
+
+    options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+
+    final List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+
+    if ("1_column_family".equals(columnFamilyTestType)) {
+      cfs = 1;
+    } else if ("20_column_families".equals(columnFamilyTestType)) {
+      cfs = 20;
+    } else if ("100_column_families".equals(columnFamilyTestType)) {
+      cfs = 100;
+    }
+
+    if (cfs > 0) {
+      cfHandlesIdx = new AtomicInteger(1);
+      for (int i = 1; i <= cfs; i++) {
+        cfDescriptors.add(new ColumnFamilyDescriptor(ba("cf" + i)));
+      }
+    }
+
+    final List<ColumnFamilyHandle> cfHandlesList = new ArrayList<>(cfDescriptors.size());
+    db = RocksDB.open(options, dbDir.toAbsolutePath().toString(), cfDescriptors, cfHandlesList);
+    cfHandles = cfHandlesList.toArray(new ColumnFamilyHandle[0]);
+
+    // store initial data for retrieving via get
+    for (int i = 0; i < cfs; i++) {
+      for (int j = 0; j < keyCount; j++) {
+        final byte[] paddedValue = Arrays.copyOf(ba("value" + j), valueSize);
+        db.put(cfHandles[i], ba("key" + j), paddedValue);
+      }
+    }
+
+    try (final FlushOptions flushOptions = new FlushOptions()
+            .setWaitForFlush(true)) {
+      db.flush(flushOptions);
+    }
+  }
+
+  @TearDown(Level.Trial)
+  public void cleanup() throws IOException {
+    for (final ColumnFamilyHandle cfHandle : cfHandles) {
+      cfHandle.close();
+    }
+    db.close();
+    options.close();
+    FileUtils.delete(dbDir);
+  }
+
+  private ColumnFamilyHandle getColumnFamily() {
+    if (cfs == 0) {
+      return cfHandles[0];
+    }  else if (cfs == 1) {
+      return cfHandles[1];
+    } else {
+      int idx = cfHandlesIdx.getAndIncrement();
+      if (idx > cfs) {
+        cfHandlesIdx.set(1); // doesn't ensure a perfect distribution, but it's ok
+        idx = 0;
+      }
+      return cfHandles[idx];
+    }
+  }
+
+  /**
+   * Reserves the next {@inc} positions in the index.
+   *
+   * @param inc the number by which to increment the index
+   * @param limit the limit for the index
+   * @return the index before {@code inc} is added
+   */
+  private int next(final int inc, final int limit) {
+    int idx;
+    int nextIdx;
+    while (true) {
+      idx = keyIndex.get();
+      nextIdx = idx + inc;
+      if (nextIdx >= limit) {
+          nextIdx = inc;
+      }
+
+      if (keyIndex.compareAndSet(idx, nextIdx)) {
+        break;
+      }
+    }
+
+    if (nextIdx >= limit) {
+      return -1;
+    } else {
+      return idx;
+    }
+  }
+
+  ByteBuffer keysBuffer;
+  ByteBuffer valuesBuffer;
+
+  List<ByteBuffer> valueBuffersList;
+  List<ByteBuffer> keyBuffersList;
+
+  @Setup
+  public void allocateSliceBuffers() {
+    keysBuffer = ByteBuffer.allocateDirect(keyCount * valueSize);
+    valuesBuffer = ByteBuffer.allocateDirect(keyCount * valueSize);
+    valueBuffersList = new ArrayList<>();
+    keyBuffersList = new ArrayList<>();
+    for (int i = 0; i < keyCount; i++) {
+      valueBuffersList.add(valuesBuffer.slice());
+      valuesBuffer.position(i * valueSize);
+      keyBuffersList.add(keysBuffer.slice());
+      keysBuffer.position(i * keySize);
+    }
+  }
+
+  @TearDown
+  public void freeSliceBuffers() {
+    valueBuffersList.clear();
+  }
+
+  @Benchmark
+  public List<byte[]> multiGet10() throws RocksDBException {
+    final int fromKeyIdx = next(multiGetSize, keyCount);
+    if (fromKeyIdx >= 0) {
+      final List<byte[]> keys = keys(fromKeyIdx, fromKeyIdx + multiGetSize);
+      final List<byte[]> valueResults = db.multiGetAsList(keys);
+      for (final byte[] result : valueResults) {
+        if (result.length != valueSize)
+          throw new RuntimeException("Test valueSize assumption wrong");
+      }
+    }
+    return new ArrayList<>();
+  }
+
+  @Benchmark
+  public List<RocksDB.MultiGetInstance> multiGetDirect10() throws RocksDBException {
+    final int fromKeyIdx = next(multiGetSize, keyCount);
+    if (fromKeyIdx >= 0) {
+      final List<ByteBuffer> keys = keys(keyBuffersList, fromKeyIdx, fromKeyIdx + multiGetSize);
+      final List<RocksDB.MultiGetInstance> results = db.multiGetByteBuffers(
+          keys, valueBuffersList.subList(fromKeyIdx, fromKeyIdx + multiGetSize));
+      for (final RocksDB.MultiGetInstance result : results) {
+        if (result.status.getCode() != Status.Code.Ok)
+          throw new RuntimeException("Test status assumption wrong");
+        if (result.valueSize != valueSize)
+          throw new RuntimeException("Test valueSize assumption wrong");
+      }
+      return results;
+    }
+    return new ArrayList<>();
+  }
+
+  public static void main(final String[] args) throws RunnerException {
+    final org.openjdk.jmh.runner.options.Options opt =
+        new OptionsBuilder()
+            .include(MultiGetBenchmarks.class.getSimpleName())
+            .forks(1)
+            .jvmArgs("-ea")
+            .warmupIterations(1)
+            .measurementIterations(2)
+            .forks(2)
+            .param("columnFamilyTestType=", "1_column_family")
+            .param("multiGetSize=", "10", "1000")
+            .param("keyCount=", "1000")
+            .output("jmh_output")
+            .build();
+
+    new Runner(opt).run();
+  }
+}
diff --git a/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/PutBenchmarks.java b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/PutBenchmarks.java
new file mode 100644
index 000000000..5aae21cb9
--- /dev/null
+++ b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/PutBenchmarks.java
@@ -0,0 +1,112 @@
+/**
+ * Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+ *  This source code is licensed under both the GPLv2 (found in the
+ *  COPYING file in the root directory) and Apache 2.0 License
+ *  (found in the LICENSE.Apache file in the root directory).
+ */
+package org.rocksdb.jmh;
+
+import org.openjdk.jmh.annotations.*;
+import org.rocksdb.*;
+import org.rocksdb.util.FileUtils;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.rocksdb.util.KVUtils.ba;
+
+@State(Scope.Benchmark)
+public class PutBenchmarks {
+
+  @Param({
+      "no_column_family",
+      "1_column_family",
+      "20_column_families",
+      "100_column_families"
+  })
+  String columnFamilyTestType;
+
+  Path dbDir;
+  DBOptions options;
+  int cfs = 0;  // number of column families
+  private AtomicInteger cfHandlesIdx;
+  ColumnFamilyHandle[] cfHandles;
+  RocksDB db;
+
+  @Setup(Level.Trial)
+  public void setup() throws IOException, RocksDBException {
+    RocksDB.loadLibrary();
+
+    dbDir = Files.createTempDirectory("rocksjava-put-benchmarks");
+
+    options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+
+    final List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+
+    if ("1_column_family".equals(columnFamilyTestType)) {
+      cfs = 1;
+    } else if ("20_column_families".equals(columnFamilyTestType)) {
+      cfs = 20;
+    } else if ("100_column_families".equals(columnFamilyTestType)) {
+      cfs = 100;
+    }
+
+    if (cfs > 0) {
+      cfHandlesIdx = new AtomicInteger(1);
+      for (int i = 1; i <= cfs; i++) {
+        cfDescriptors.add(new ColumnFamilyDescriptor(ba("cf" + i)));
+      }
+    }
+
+    final List<ColumnFamilyHandle> cfHandlesList = new ArrayList<>(cfDescriptors.size());
+    db = RocksDB.open(options, dbDir.toAbsolutePath().toString(), cfDescriptors, cfHandlesList);
+    cfHandles = cfHandlesList.toArray(new ColumnFamilyHandle[0]);
+  }
+
+  @TearDown(Level.Trial)
+  public void cleanup() throws IOException {
+    for (final ColumnFamilyHandle cfHandle : cfHandles) {
+      cfHandle.close();
+    }
+    db.close();
+    options.close();
+    FileUtils.delete(dbDir);
+  }
+
+  private ColumnFamilyHandle getColumnFamily() {
+    if (cfs == 0) {
+      return cfHandles[0];
+    }  else if (cfs == 1) {
+      return cfHandles[1];
+    } else {
+      int idx = cfHandlesIdx.getAndIncrement();
+      if (idx > cfs) {
+        cfHandlesIdx.set(1); // doesn't ensure a perfect distribution, but it's ok
+        idx = 0;
+      }
+      return cfHandles[idx];
+    }
+  }
+
+  @State(Scope.Benchmark)
+  public static class Counter {
+    private final AtomicInteger count = new AtomicInteger();
+
+    public int next() {
+      return count.getAndIncrement();
+    }
+  }
+
+  @Benchmark
+  public void put(final ComparatorBenchmarks.Counter counter) throws RocksDBException {
+    final int i = counter.next();
+    db.put(getColumnFamily(), ba("key" + i), ba("value" + i));
+  }
+}
diff --git a/src/rocksdb/java/jmh/src/main/java/org/rocksdb/util/FileUtils.java b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/util/FileUtils.java
new file mode 100644
index 000000000..63744a14f
--- /dev/null
+++ b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/util/FileUtils.java
@@ -0,0 +1,59 @@
+/**
+ * Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+ *  This source code is licensed under both the GPLv2 (found in the
+ *  COPYING file in the root directory) and Apache 2.0 License
+ *  (found in the LICENSE.Apache file in the root directory).
+ */
+package org.rocksdb.util;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
+
+public final class FileUtils {
+    private static final SimpleFileVisitor<Path> DELETE_DIR_VISITOR = new DeleteDirVisitor();
+
+    /**
+     * Deletes a path from the filesystem
+     *
+     * If the path is a directory its contents
+     * will be recursively deleted before it itself
+     * is deleted.
+     *
+     * Note that removal of a directory is not an atomic-operation
+     * and so if an error occurs during removal, some of the directories
+     * descendants may have already been removed
+     *
+     * @param path the path to delete.
+     *
+     * @throws IOException if an error occurs whilst removing a file or directory
+     */
+    public static void delete(final Path path) throws IOException {
+        if (!Files.isDirectory(path)) {
+            Files.deleteIfExists(path);
+        } else {
+            Files.walkFileTree(path, DELETE_DIR_VISITOR);
+        }
+    }
+
+    private static class DeleteDirVisitor extends SimpleFileVisitor<Path> {
+        @Override
+        public FileVisitResult visitFile(final Path file, final BasicFileAttributes attrs) throws IOException {
+            Files.deleteIfExists(file);
+            return FileVisitResult.CONTINUE;
+        }
+
+        @Override
+        public FileVisitResult postVisitDirectory(final Path dir, final IOException exc) throws IOException {
+            if (exc != null) {
+                throw exc;
+            }
+
+            Files.deleteIfExists(dir);
+            return FileVisitResult.CONTINUE;
+        }
+    }
+}
diff --git a/src/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java
new file mode 100644
index 000000000..5077291c8
--- /dev/null
+++ b/src/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java
@@ -0,0 +1,72 @@
+/**
+ * Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+ *  This source code is licensed under both the GPLv2 (found in the
+ *  COPYING file in the root directory) and Apache 2.0 License
+ *  (found in the LICENSE.Apache file in the root directory).
+ */
+package org.rocksdb.util;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+public final class KVUtils {
+
+  /**
+   * Get a byte array from a string.
+   *
+   * Assumes UTF-8 encoding
+   *
+   * @param string the string
+   *
+   * @return the bytes.
+   */
+  public static byte[] ba(final String string) {
+    return string.getBytes(UTF_8);
+  }
+
+  /**
+   * Get a string from a byte array.
+   *
+   * Assumes UTF-8 encoding
+   *
+   * @param bytes the bytes
+   *
+   * @return the string.
+   */
+  public static String str(final byte[] bytes) {
+    return new String(bytes, UTF_8);
+  }
+
+  /**
+   * Get a list of keys where the keys are named key1..key1+N
+   * in the range of {@code from} to {@code to} i.e. keyFrom..keyTo.
+   *
+   * @param from the first key
+   * @param to the last key
+   *
+   * @return the array of keys
+   */
+  public static List<byte[]> keys(final int from, final int to) {
+    final List<byte[]> keys = new ArrayList<>(to - from);
+    for (int i = from; i < to; i++) {
+      keys.add(ba("key" + i));
+    }
+    return keys;
+  }
+
+  public static List<ByteBuffer> keys(
+      final List<ByteBuffer> keyBuffers, final int from, final int to) {
+    final List<ByteBuffer> keys = new ArrayList<>(to - from);
+    for (int i = from; i < to; i++) {
+      final ByteBuffer key = keyBuffers.get(i);
+      key.clear();
+      key.put(ba("key" + i));
+      key.flip();
+      keys.add(key);
+    }
+    return keys;
+  }
+}
diff --git a/src/rocksdb/java/pom.xml.template b/src/rocksdb/java/pom.xml.template
new file mode 100644
index 000000000..4abff4768
--- /dev/null
+++ b/src/rocksdb/java/pom.xml.template
@@ -0,0 +1,178 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>org.rocksdb</groupId>
+    <artifactId>rocksdbjni</artifactId>
+    <version>${ROCKSDB_JAVA_VERSION}</version>  <!-- this will be replaced by the Makefile's rocksdbjavageneratepom target -->
+
+    <name>RocksDB JNI</name>
+    <description>RocksDB fat jar that contains .so files for linux32 and linux64 (glibc and musl-libc), jnilib files
+        for Mac OSX, and a .dll for Windows x64.
+    </description>
+    <url>https://rocksdb.org</url>
+    <inceptionYear>2012</inceptionYear>
+
+    <licenses>
+        <license>
+            <name>Apache License 2.0</name>
+            <url>http://www.apache.org/licenses/LICENSE-2.0.html</url>
+            <distribution>repo</distribution>
+        </license>
+        <license>
+            <name>GNU General Public License, version 2</name>
+            <url>http://www.gnu.org/licenses/gpl-2.0.html</url>
+            <distribution>repo</distribution>
+        </license>
+    </licenses>
+
+    <scm>
+        <connection>scm:git:https://github.com/facebook/rocksdb.git</connection>
+        <developerConnection>scm:git:https://github.com/facebook/rocksdb.git</developerConnection>
+        <url>scm:git:https://github.com/facebook/rocksdb.git</url>
+    </scm>
+
+    <organization>
+        <name>Facebook</name>
+        <url>https://www.facebook.com</url>
+    </organization>
+
+    <developers>
+        <developer>
+            <name>Facebook</name>
+            <email>help@facebook.com</email>
+            <timezone>America/New_York</timezone>
+            <roles>
+                <role>architect</role>
+            </roles>
+        </developer>
+    </developers>
+
+    <mailingLists>
+        <mailingList>
+            <name>rocksdb - Google Groups</name>
+            <subscribe>rocksdb-subscribe@googlegroups.com</subscribe>
+            <unsubscribe>rocksdb-unsubscribe@googlegroups.com</unsubscribe>
+            <post>rocksdb@googlegroups.com</post>
+            <archive>https://groups.google.com/forum/#!forum/rocksdb</archive>
+        </mailingList>
+    </mailingLists>
+
+    <properties>
+        <project.build.source>1.7</project.build.source>
+        <project.build.target>1.7</project.build.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.2</version>
+                <configuration>
+                    <source>${project.build.source}</source>
+                    <target>${project.build.target}</target>
+                    <encoding>${project.build.sourceEncoding}</encoding>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.18.1</version>
+                <configuration>
+                    <argLine>${argLine} -ea -Xcheck:jni -Djava.library.path=${project.build.directory}</argLine>
+                    <useManifestOnlyJar>false</useManifestOnlyJar>  
+                    <useSystemClassLoader>false</useSystemClassLoader>
+                    <additionalClasspathElements>
+                        <additionalClasspathElement>${project.build.directory}/*</additionalClasspathElement>
+                    </additionalClasspathElements>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.jacoco</groupId>
+                <artifactId>jacoco-maven-plugin</artifactId>
+                <version>0.7.2.201409121644</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>prepare-agent</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>report</id>
+                        <phase>prepare-package</phase>
+                        <goals>
+                            <goal>report</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.codehaus.gmaven</groupId>
+                <artifactId>groovy-maven-plugin</artifactId>
+                <version>2.0</version>
+                <executions>
+                    <execution>
+                        <phase>process-classes</phase>
+                        <goals>
+                            <goal>execute</goal>
+                        </goals>
+                        <configuration>
+                            <defaults>
+                                <name>Xenu</name>
+                            </defaults>
+                            <source>
+                                String fileContents = new File(project.basedir.absolutePath + '/../include/rocksdb/version.h').getText('UTF-8')
+                                matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/)
+                                String major_version = matcher.getAt(0).getAt(1)
+                                matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/)
+                                String minor_version = matcher.getAt(0).getAt(1)
+                                matcher = (fileContents =~ /(?s).*ROCKSDB_PATCH ([0-9]+).*?/)
+                                String patch_version = matcher.getAt(0).getAt(1)
+                                String version = String.format('%s.%s.%s', major_version, minor_version, patch_version)
+                                // Set version to be used in pom.properties
+                                project.version = version
+                                // Set version to be set as jar name
+                                project.build.finalName = project.artifactId + "-" + version
+                            </source>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+    <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.13.1</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.hamcrest</groupId>
+            <artifactId>hamcrest</artifactId>
+            <version>2.2</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>cglib</groupId>
+            <artifactId>cglib</artifactId>
+            <version>3.3.0</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.assertj</groupId>
+            <artifactId>assertj-core</artifactId>
+            <version>2.9.0</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-all</artifactId>
+            <version>1.10.19</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>
diff --git a/src/rocksdb/java/rocksjni/backup_engine_options.cc b/src/rocksdb/java/rocksjni/backup_engine_options.cc
new file mode 100644
index 000000000..25bfb6720
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/backup_engine_options.cc
@@ -0,0 +1,365 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::BackupEnginge and
+// ROCKSDB_NAMESPACE::BackupEngineOptions methods from Java side.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+#include <vector>
+
+#include "include/org_rocksdb_BackupEngineOptions.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+///////////////////////////////////////////////////////////////////////////
+// BackupDBOptions
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    newBackupEngineOptions
+ * Signature: (Ljava/lang/String;)J
+ */
+jlong Java_org_rocksdb_BackupEngineOptions_newBackupEngineOptions(
+    JNIEnv* env, jclass /*jcls*/, jstring jpath) {
+  const char* cpath = env->GetStringUTFChars(jpath, nullptr);
+  if (cpath == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  auto* bopt = new ROCKSDB_NAMESPACE::BackupEngineOptions(cpath);
+  env->ReleaseStringUTFChars(jpath, cpath);
+  return GET_CPLUSPLUS_POINTER(bopt);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    backupDir
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_BackupEngineOptions_backupDir(JNIEnv* env,
+                                                       jobject /*jopt*/,
+                                                       jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return env->NewStringUTF(bopt->backup_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setBackupEnv
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setBackupEnv(
+    JNIEnv* /*env*/, jobject /*jopt*/, jlong jhandle, jlong jrocks_env_handle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  auto* rocks_env =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jrocks_env_handle);
+  bopt->backup_env = rocks_env;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setShareTableFiles
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setShareTableFiles(JNIEnv* /*env*/,
+                                                             jobject /*jobj*/,
+                                                             jlong jhandle,
+                                                             jboolean flag) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  bopt->share_table_files = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    shareTableFiles
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupEngineOptions_shareTableFiles(JNIEnv* /*env*/,
+                                                              jobject /*jobj*/,
+                                                              jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return bopt->share_table_files;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setInfoLog
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setInfoLog(JNIEnv* /*env*/,
+                                                     jobject /*jobj*/,
+                                                     jlong jhandle,
+                                                     jlong /*jlogger_handle*/) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  auto* sptr_logger =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>*>(
+          jhandle);
+  bopt->info_log = sptr_logger->get();
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setSync(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle,
+                                                  jboolean flag) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  bopt->sync = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    sync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupEngineOptions_sync(JNIEnv* /*env*/,
+                                                   jobject /*jobj*/,
+                                                   jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return bopt->sync;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setDestroyOldData
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setDestroyOldData(JNIEnv* /*env*/,
+                                                            jobject /*jobj*/,
+                                                            jlong jhandle,
+                                                            jboolean flag) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  bopt->destroy_old_data = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    destroyOldData
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupEngineOptions_destroyOldData(JNIEnv* /*env*/,
+                                                             jobject /*jobj*/,
+                                                             jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return bopt->destroy_old_data;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setBackupLogFiles
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setBackupLogFiles(JNIEnv* /*env*/,
+                                                            jobject /*jobj*/,
+                                                            jlong jhandle,
+                                                            jboolean flag) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  bopt->backup_log_files = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    backupLogFiles
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupEngineOptions_backupLogFiles(JNIEnv* /*env*/,
+                                                             jobject /*jobj*/,
+                                                             jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return bopt->backup_log_files;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setBackupRateLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setBackupRateLimit(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jbackup_rate_limit) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  bopt->backup_rate_limit = jbackup_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    backupRateLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_BackupEngineOptions_backupRateLimit(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return bopt->backup_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setBackupRateLimiter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setBackupRateLimiter(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jrate_limiter_handle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  auto* sptr_rate_limiter =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
+          jrate_limiter_handle);
+  bopt->backup_rate_limiter = *sptr_rate_limiter;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setRestoreRateLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setRestoreRateLimit(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jrestore_rate_limit) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  bopt->restore_rate_limit = jrestore_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    restoreRateLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_BackupEngineOptions_restoreRateLimit(JNIEnv* /*env*/,
+                                                            jobject /*jobj*/,
+                                                            jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return bopt->restore_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setRestoreRateLimiter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setRestoreRateLimiter(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jrate_limiter_handle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  auto* sptr_rate_limiter =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
+          jrate_limiter_handle);
+  bopt->restore_rate_limiter = *sptr_rate_limiter;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setShareFilesWithChecksum
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setShareFilesWithChecksum(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean flag) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  bopt->share_files_with_checksum = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    shareFilesWithChecksum
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupEngineOptions_shareFilesWithChecksum(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return bopt->share_files_with_checksum;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setMaxBackgroundOperations
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setMaxBackgroundOperations(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jint max_background_operations) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  bopt->max_background_operations = static_cast<int>(max_background_operations);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    maxBackgroundOperations
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_BackupEngineOptions_maxBackgroundOperations(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return static_cast<jint>(bopt->max_background_operations);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    setCallbackTriggerIntervalSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_setCallbackTriggerIntervalSize(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jcallback_trigger_interval_size) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  bopt->callback_trigger_interval_size =
+      static_cast<uint64_t>(jcallback_trigger_interval_size);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    callbackTriggerIntervalSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_BackupEngineOptions_callbackTriggerIntervalSize(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  return static_cast<jlong>(bopt->callback_trigger_interval_size);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngineOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_BackupEngineOptions_disposeInternal(JNIEnv* /*env*/,
+                                                          jobject /*jopt*/,
+                                                          jlong jhandle) {
+  auto* bopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(jhandle);
+  assert(bopt != nullptr);
+  delete bopt;
+}
diff --git a/src/rocksdb/java/rocksjni/backupenginejni.cc b/src/rocksdb/java/rocksjni/backupenginejni.cc
new file mode 100644
index 000000000..1ba7ea286
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/backupenginejni.cc
@@ -0,0 +1,279 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ ROCKSDB_NAMESPACE::BackupEngine methods from the Java side.
+
+#include <jni.h>
+
+#include <vector>
+
+#include "include/org_rocksdb_BackupEngine.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    open
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_BackupEngine_open(JNIEnv* env, jclass /*jcls*/,
+                                         jlong env_handle,
+                                         jlong backup_engine_options_handle) {
+  auto* rocks_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(env_handle);
+  auto* backup_engine_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngineOptions*>(
+          backup_engine_options_handle);
+  ROCKSDB_NAMESPACE::BackupEngine* backup_engine;
+  auto status = ROCKSDB_NAMESPACE::BackupEngine::Open(
+      rocks_env, *backup_engine_options, &backup_engine);
+
+  if (status.ok()) {
+    return GET_CPLUSPLUS_POINTER(backup_engine);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+    return 0;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    createNewBackup
+ * Signature: (JJZ)V
+ */
+void Java_org_rocksdb_BackupEngine_createNewBackup(
+    JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jlong db_handle,
+    jboolean jflush_before_backup) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  auto* backup_engine =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+  auto status = backup_engine->CreateNewBackup(
+      db, static_cast<bool>(jflush_before_backup));
+
+  if (status.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    createNewBackupWithMetadata
+ * Signature: (JJLjava/lang/String;Z)V
+ */
+void Java_org_rocksdb_BackupEngine_createNewBackupWithMetadata(
+    JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jlong db_handle,
+    jstring japp_metadata, jboolean jflush_before_backup) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  auto* backup_engine =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+
+  jboolean has_exception = JNI_FALSE;
+  std::string app_metadata = ROCKSDB_NAMESPACE::JniUtil::copyStdString(
+      env, japp_metadata, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, "Could not copy jstring to std::string");
+    return;
+  }
+
+  auto status = backup_engine->CreateNewBackupWithMetadata(
+      db, app_metadata, static_cast<bool>(jflush_before_backup));
+
+  if (status.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    getBackupInfo
+ * Signature: (J)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_BackupEngine_getBackupInfo(JNIEnv* env,
+                                                    jobject /*jbe*/,
+                                                    jlong jbe_handle) {
+  auto* backup_engine =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+  std::vector<ROCKSDB_NAMESPACE::BackupInfo> backup_infos;
+  backup_engine->GetBackupInfo(&backup_infos);
+  return ROCKSDB_NAMESPACE::BackupInfoListJni::getBackupInfo(env, backup_infos);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    getCorruptedBackups
+ * Signature: (J)[I
+ */
+jintArray Java_org_rocksdb_BackupEngine_getCorruptedBackups(JNIEnv* env,
+                                                            jobject /*jbe*/,
+                                                            jlong jbe_handle) {
+  auto* backup_engine =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+  std::vector<ROCKSDB_NAMESPACE::BackupID> backup_ids;
+  backup_engine->GetCorruptedBackups(&backup_ids);
+  // store backupids in int array
+  std::vector<jint> int_backup_ids(backup_ids.begin(), backup_ids.end());
+
+  // Store ints in java array
+  // Its ok to loose precision here (64->32)
+  jsize ret_backup_ids_size = static_cast<jsize>(backup_ids.size());
+  jintArray ret_backup_ids = env->NewIntArray(ret_backup_ids_size);
+  if (ret_backup_ids == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size,
+                         int_backup_ids.data());
+  return ret_backup_ids;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    garbageCollect
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_BackupEngine_garbageCollect(JNIEnv* env, jobject /*jbe*/,
+                                                  jlong jbe_handle) {
+  auto* backup_engine =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+  auto status = backup_engine->GarbageCollect();
+
+  if (status.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    purgeOldBackups
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_BackupEngine_purgeOldBackups(JNIEnv* env, jobject /*jbe*/,
+                                                   jlong jbe_handle,
+                                                   jint jnum_backups_to_keep) {
+  auto* backup_engine =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+  auto status = backup_engine->PurgeOldBackups(
+      static_cast<uint32_t>(jnum_backups_to_keep));
+
+  if (status.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    deleteBackup
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_BackupEngine_deleteBackup(JNIEnv* env, jobject /*jbe*/,
+                                                jlong jbe_handle,
+                                                jint jbackup_id) {
+  auto* backup_engine =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+  auto status = backup_engine->DeleteBackup(
+      static_cast<ROCKSDB_NAMESPACE::BackupID>(jbackup_id));
+
+  if (status.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    restoreDbFromBackup
+ * Signature: (JILjava/lang/String;Ljava/lang/String;J)V
+ */
+void Java_org_rocksdb_BackupEngine_restoreDbFromBackup(
+    JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jint jbackup_id,
+    jstring jdb_dir, jstring jwal_dir, jlong jrestore_options_handle) {
+  auto* backup_engine =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+  const char* db_dir = env->GetStringUTFChars(jdb_dir, nullptr);
+  if (db_dir == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  const char* wal_dir = env->GetStringUTFChars(jwal_dir, nullptr);
+  if (wal_dir == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseStringUTFChars(jdb_dir, db_dir);
+    return;
+  }
+  auto* restore_options = reinterpret_cast<ROCKSDB_NAMESPACE::RestoreOptions*>(
+      jrestore_options_handle);
+  auto status = backup_engine->RestoreDBFromBackup(
+      static_cast<ROCKSDB_NAMESPACE::BackupID>(jbackup_id), db_dir, wal_dir,
+      *restore_options);
+
+  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
+  env->ReleaseStringUTFChars(jdb_dir, db_dir);
+
+  if (status.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    restoreDbFromLatestBackup
+ * Signature: (JLjava/lang/String;Ljava/lang/String;J)V
+ */
+void Java_org_rocksdb_BackupEngine_restoreDbFromLatestBackup(
+    JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jstring jdb_dir,
+    jstring jwal_dir, jlong jrestore_options_handle) {
+  auto* backup_engine =
+      reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+  const char* db_dir = env->GetStringUTFChars(jdb_dir, nullptr);
+  if (db_dir == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  const char* wal_dir = env->GetStringUTFChars(jwal_dir, nullptr);
+  if (wal_dir == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseStringUTFChars(jdb_dir, db_dir);
+    return;
+  }
+  auto* restore_options = reinterpret_cast<ROCKSDB_NAMESPACE::RestoreOptions*>(
+      jrestore_options_handle);
+  auto status = backup_engine->RestoreDBFromLatestBackup(db_dir, wal_dir,
+                                                         *restore_options);
+
+  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
+  env->ReleaseStringUTFChars(jdb_dir, db_dir);
+
+  if (status.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_BackupEngine_disposeInternal(JNIEnv* /*env*/,
+                                                   jobject /*jbe*/,
+                                                   jlong jbe_handle) {
+  auto* be = reinterpret_cast<ROCKSDB_NAMESPACE::BackupEngine*>(jbe_handle);
+  assert(be != nullptr);
+  delete be;
+}
diff --git a/src/rocksdb/java/rocksjni/cache.cc b/src/rocksdb/java/rocksjni/cache.cc
new file mode 100644
index 000000000..33c0a2f0b
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/cache.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Cache.
+
+#include "rocksdb/cache.h"
+
+#include <jni.h>
+
+#include "include/org_rocksdb_Cache.h"
+
+/*
+ * Class:     org_rocksdb_Cache
+ * Method:    getUsage
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Cache_getUsage(JNIEnv*, jclass, jlong jhandle) {
+  auto* sptr_cache =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(jhandle);
+  return static_cast<jlong>(sptr_cache->get()->GetUsage());
+}
+
+/*
+ * Class:     org_rocksdb_Cache
+ * Method:    getPinnedUsage
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Cache_getPinnedUsage(JNIEnv*, jclass, jlong jhandle) {
+  auto* sptr_cache =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(jhandle);
+  return static_cast<jlong>(sptr_cache->get()->GetPinnedUsage());
+}
diff --git a/src/rocksdb/java/rocksjni/cassandra_compactionfilterjni.cc b/src/rocksdb/java/rocksjni/cassandra_compactionfilterjni.cc
new file mode 100644
index 000000000..25817aeca
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/cassandra_compactionfilterjni.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CassandraCompactionFilter.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "utilities/cassandra/cassandra_compaction_filter.h"
+
+/*
+ * Class:     org_rocksdb_CassandraCompactionFilter
+ * Method:    createNewCassandraCompactionFilter0
+ * Signature: (ZI)J
+ */
+jlong Java_org_rocksdb_CassandraCompactionFilter_createNewCassandraCompactionFilter0(
+    JNIEnv* /*env*/, jclass /*jcls*/, jboolean purge_ttl_on_expiration,
+    jint gc_grace_period_in_seconds) {
+  auto* compaction_filter =
+      new ROCKSDB_NAMESPACE::cassandra::CassandraCompactionFilter(
+          purge_ttl_on_expiration, gc_grace_period_in_seconds);
+  // set the native handle to our native compaction filter
+  return GET_CPLUSPLUS_POINTER(compaction_filter);
+}
diff --git a/src/rocksdb/java/rocksjni/cassandra_value_operator.cc b/src/rocksdb/java/rocksjni/cassandra_value_operator.cc
new file mode 100644
index 000000000..6de28c1b1
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/cassandra_value_operator.cc
@@ -0,0 +1,50 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <string>
+
+#include "include/org_rocksdb_CassandraValueMergeOperator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+#include "utilities/cassandra/merge_operator.h"
+
+/*
+ * Class:     org_rocksdb_CassandraValueMergeOperator
+ * Method:    newSharedCassandraValueMergeOperator
+ * Signature: (II)J
+ */
+jlong Java_org_rocksdb_CassandraValueMergeOperator_newSharedCassandraValueMergeOperator(
+    JNIEnv* /*env*/, jclass /*jclazz*/, jint gcGracePeriodInSeconds,
+    jint operands_limit) {
+  auto* op = new std::shared_ptr<ROCKSDB_NAMESPACE::MergeOperator>(
+      new ROCKSDB_NAMESPACE::cassandra::CassandraValueMergeOperator(
+          gcGracePeriodInSeconds, operands_limit));
+  return GET_CPLUSPLUS_POINTER(op);
+}
+
+/*
+ * Class:     org_rocksdb_CassandraValueMergeOperator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CassandraValueMergeOperator_disposeInternal(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* op =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::MergeOperator>*>(
+          jhandle);
+  delete op;
+}
diff --git a/src/rocksdb/java/rocksjni/checkpoint.cc b/src/rocksdb/java/rocksjni/checkpoint.cc
new file mode 100644
index 000000000..d7cfd813b
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/checkpoint.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::Checkpoint methods from Java side.
+
+#include "rocksdb/utilities/checkpoint.h"
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+
+#include "include/org_rocksdb_Checkpoint.h"
+#include "rocksdb/db.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+/*
+ * Class:     org_rocksdb_Checkpoint
+ * Method:    newCheckpoint
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Checkpoint_newCheckpoint(JNIEnv* /*env*/,
+                                                jclass /*jclazz*/,
+                                                jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::Checkpoint* checkpoint;
+  ROCKSDB_NAMESPACE::Checkpoint::Create(db, &checkpoint);
+  return GET_CPLUSPLUS_POINTER(checkpoint);
+}
+
+/*
+ * Class:     org_rocksdb_Checkpoint
+ * Method:    dispose
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Checkpoint_disposeInternal(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle) {
+  auto* checkpoint = reinterpret_cast<ROCKSDB_NAMESPACE::Checkpoint*>(jhandle);
+  assert(checkpoint != nullptr);
+  delete checkpoint;
+}
+
+/*
+ * Class:     org_rocksdb_Checkpoint
+ * Method:    createCheckpoint
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_Checkpoint_createCheckpoint(JNIEnv* env, jobject /*jobj*/,
+                                                  jlong jcheckpoint_handle,
+                                                  jstring jcheckpoint_path) {
+  const char* checkpoint_path = env->GetStringUTFChars(jcheckpoint_path, 0);
+  if (checkpoint_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  auto* checkpoint =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Checkpoint*>(jcheckpoint_handle);
+  ROCKSDB_NAMESPACE::Status s = checkpoint->CreateCheckpoint(checkpoint_path);
+
+  env->ReleaseStringUTFChars(jcheckpoint_path, checkpoint_path);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
diff --git a/src/rocksdb/java/rocksjni/clock_cache.cc b/src/rocksdb/java/rocksjni/clock_cache.cc
new file mode 100644
index 000000000..e04991aa9
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/clock_cache.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::ClockCache.
+
+#include "cache/clock_cache.h"
+
+#include <jni.h>
+
+#include "include/org_rocksdb_ClockCache.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_ClockCache
+ * Method:    newClockCache
+ * Signature: (JIZ)J
+ */
+jlong Java_org_rocksdb_ClockCache_newClockCache(
+    JNIEnv* /*env*/, jclass /*jcls*/, jlong jcapacity, jint jnum_shard_bits,
+    jboolean jstrict_capacity_limit) {
+  auto* sptr_clock_cache = new std::shared_ptr<ROCKSDB_NAMESPACE::Cache>(
+      ROCKSDB_NAMESPACE::NewClockCache(
+          static_cast<size_t>(jcapacity), static_cast<int>(jnum_shard_bits),
+          static_cast<bool>(jstrict_capacity_limit)));
+  return GET_CPLUSPLUS_POINTER(sptr_clock_cache);
+}
+
+/*
+ * Class:     org_rocksdb_ClockCache
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ClockCache_disposeInternal(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle) {
+  auto* sptr_clock_cache =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(jhandle);
+  delete sptr_clock_cache;  // delete std::shared_ptr
+}
diff --git a/src/rocksdb/java/rocksjni/columnfamilyhandle.cc b/src/rocksdb/java/rocksjni/columnfamilyhandle.cc
new file mode 100644
index 000000000..4140580f0
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/columnfamilyhandle.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::ColumnFamilyHandle.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "include/org_rocksdb_ColumnFamilyHandle.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyHandle
+ * Method:    getName
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_ColumnFamilyHandle_getName(JNIEnv* env,
+                                                       jobject /*jobj*/,
+                                                       jlong jhandle) {
+  auto* cfh = reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jhandle);
+  std::string cf_name = cfh->GetName();
+  return ROCKSDB_NAMESPACE::JniUtil::copyBytes(env, cf_name);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyHandle
+ * Method:    getID
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyHandle_getID(JNIEnv* /*env*/,
+                                               jobject /*jobj*/,
+                                               jlong jhandle) {
+  auto* cfh = reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jhandle);
+  const int32_t id = cfh->GetID();
+  return static_cast<jint>(id);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyHandle
+ * Method:    getDescriptor
+ * Signature: (J)Lorg/rocksdb/ColumnFamilyDescriptor;
+ */
+jobject Java_org_rocksdb_ColumnFamilyHandle_getDescriptor(JNIEnv* env,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle) {
+  auto* cfh = reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jhandle);
+  ROCKSDB_NAMESPACE::ColumnFamilyDescriptor desc;
+  ROCKSDB_NAMESPACE::Status s = cfh->GetDescriptor(&desc);
+  if (s.ok()) {
+    return ROCKSDB_NAMESPACE::ColumnFamilyDescriptorJni::construct(env, &desc);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyHandle
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ColumnFamilyHandle_disposeInternal(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle) {
+  auto* cfh = reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jhandle);
+  assert(cfh != nullptr);
+  delete cfh;
+}
diff --git a/src/rocksdb/java/rocksjni/compact_range_options.cc b/src/rocksdb/java/rocksjni/compact_range_options.cc
new file mode 100644
index 000000000..77fbb8890
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compact_range_options.cc
@@ -0,0 +1,222 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactRangeOptions.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CompactRangeOptions.h"
+#include "rocksdb/options.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    newCompactRangeOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_CompactRangeOptions_newCompactRangeOptions(
+    JNIEnv* /*env*/, jclass /*jclazz*/) {
+  auto* options = new ROCKSDB_NAMESPACE::CompactRangeOptions();
+  return GET_CPLUSPLUS_POINTER(options);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    exclusiveManualCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactRangeOptions_exclusiveManualCompaction(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  return static_cast<jboolean>(options->exclusive_manual_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    setExclusiveManualCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompactRangeOptions_setExclusiveManualCompaction(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jboolean exclusive_manual_compaction) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  options->exclusive_manual_compaction =
+      static_cast<bool>(exclusive_manual_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    bottommostLevelCompaction
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactRangeOptions_bottommostLevelCompaction(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::BottommostLevelCompactionJni::
+      toJavaBottommostLevelCompaction(options->bottommost_level_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    setBottommostLevelCompaction
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactRangeOptions_setBottommostLevelCompaction(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jint bottommost_level_compaction) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  options->bottommost_level_compaction =
+      ROCKSDB_NAMESPACE::BottommostLevelCompactionJni::
+          toCppBottommostLevelCompaction(bottommost_level_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    changeLevel
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel(JNIEnv* /*env*/,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  return static_cast<jboolean>(options->change_level);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    setChangeLevel
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompactRangeOptions_setChangeLevel(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean change_level) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  options->change_level = static_cast<bool>(change_level);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    targetLevel
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactRangeOptions_targetLevel(JNIEnv* /*env*/,
+                                                      jobject /*jobj*/,
+                                                      jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  return static_cast<jint>(options->target_level);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    setTargetLevel
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactRangeOptions_setTargetLevel(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle,
+                                                         jint target_level) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  options->target_level = static_cast<int>(target_level);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    targetPathId
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactRangeOptions_targetPathId(JNIEnv* /*env*/,
+                                                       jobject /*jobj*/,
+                                                       jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  return static_cast<jint>(options->target_path_id);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    setTargetPathId
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactRangeOptions_setTargetPathId(JNIEnv* /*env*/,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle,
+                                                          jint target_path_id) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  options->target_path_id = static_cast<uint32_t>(target_path_id);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    allowWriteStall
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall(JNIEnv* /*env*/,
+                                                              jobject /*jobj*/,
+                                                              jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  return static_cast<jboolean>(options->allow_write_stall);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    setAllowWriteStall
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompactRangeOptions_setAllowWriteStall(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jboolean allow_write_stall) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  options->allow_write_stall = static_cast<bool>(allow_write_stall);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    maxSubcompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions(JNIEnv* /*env*/,
+                                                            jobject /*jobj*/,
+                                                            jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  return static_cast<jint>(options->max_subcompactions);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    setMaxSubcompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint max_subcompactions) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  options->max_subcompactions = static_cast<uint32_t>(max_subcompactions);
+}
+
+/*
+ * Class:     org_rocksdb_CompactRangeOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactRangeOptions_disposeInternal(JNIEnv* /*env*/,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(jhandle);
+  delete options;
+}
diff --git a/src/rocksdb/java/rocksjni/compaction_filter.cc b/src/rocksdb/java/rocksjni/compaction_filter.cc
new file mode 100644
index 000000000..ea04996ac
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_filter.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionFilter.
+
+#include "rocksdb/compaction_filter.h"
+
+#include <jni.h>
+
+#include "include/org_rocksdb_AbstractCompactionFilter.h"
+
+// <editor-fold desc="org.rocksdb.AbstractCompactionFilter">
+
+/*
+ * Class:     org_rocksdb_AbstractCompactionFilter
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractCompactionFilter_disposeInternal(JNIEnv* /*env*/,
+                                                               jobject /*jobj*/,
+                                                               jlong handle) {
+  auto* cf = reinterpret_cast<ROCKSDB_NAMESPACE::CompactionFilter*>(handle);
+  assert(cf != nullptr);
+  delete cf;
+}
+// </editor-fold>
diff --git a/src/rocksdb/java/rocksjni/compaction_filter_factory.cc b/src/rocksdb/java/rocksjni/compaction_filter_factory.cc
new file mode 100644
index 000000000..16fbdbbdd
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_filter_factory.cc
@@ -0,0 +1,42 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionFilterFactory.
+
+#include <jni.h>
+
+#include <memory>
+
+#include "include/org_rocksdb_AbstractCompactionFilterFactory.h"
+#include "rocksjni/compaction_filter_factory_jnicallback.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_AbstractCompactionFilterFactory
+ * Method:    createNewCompactionFilterFactory0
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_AbstractCompactionFilterFactory_createNewCompactionFilterFactory0(
+    JNIEnv* env, jobject jobj) {
+  auto* cff =
+      new ROCKSDB_NAMESPACE::CompactionFilterFactoryJniCallback(env, jobj);
+  auto* ptr_sptr_cff = new std::shared_ptr<
+      ROCKSDB_NAMESPACE::CompactionFilterFactoryJniCallback>(cff);
+  return GET_CPLUSPLUS_POINTER(ptr_sptr_cff);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractCompactionFilterFactory
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractCompactionFilterFactory_disposeInternal(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* ptr_sptr_cff = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::CompactionFilterFactoryJniCallback>*>(
+      jhandle);
+  delete ptr_sptr_cff;
+}
diff --git a/src/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.cc b/src/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.cc
new file mode 100644
index 000000000..14285526f
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.cc
@@ -0,0 +1,79 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionFilterFactory.
+
+#include "rocksjni/compaction_filter_factory_jnicallback.h"
+
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+CompactionFilterFactoryJniCallback::CompactionFilterFactoryJniCallback(
+    JNIEnv* env, jobject jcompaction_filter_factory)
+    : JniCallback(env, jcompaction_filter_factory) {
+  // Note: The name of a CompactionFilterFactory will not change during
+  // it's lifetime, so we cache it in a global var
+  jmethodID jname_method_id =
+      AbstractCompactionFilterFactoryJni::getNameMethodId(env);
+  if (jname_method_id == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  jstring jname =
+      (jstring)env->CallObjectMethod(m_jcallback_obj, jname_method_id);
+  if (env->ExceptionCheck()) {
+    // exception thrown
+    return;
+  }
+  jboolean has_exception = JNI_FALSE;
+  m_name =
+      JniUtil::copyString(env, jname, &has_exception);  // also releases jname
+  if (has_exception == JNI_TRUE) {
+    // exception thrown
+    return;
+  }
+
+  m_jcreate_compaction_filter_methodid =
+      AbstractCompactionFilterFactoryJni::getCreateCompactionFilterMethodId(
+          env);
+  if (m_jcreate_compaction_filter_methodid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+}
+
+const char* CompactionFilterFactoryJniCallback::Name() const {
+  return m_name.get();
+}
+
+std::unique_ptr<CompactionFilter>
+CompactionFilterFactoryJniCallback::CreateCompactionFilter(
+    const CompactionFilter::Context& context) {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  jlong addr_compaction_filter =
+      env->CallLongMethod(m_jcallback_obj, m_jcreate_compaction_filter_methodid,
+                          static_cast<jboolean>(context.is_full_compaction),
+                          static_cast<jboolean>(context.is_manual_compaction));
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallLongMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return nullptr;
+  }
+
+  auto* cff = reinterpret_cast<CompactionFilter*>(addr_compaction_filter);
+
+  releaseJniEnv(attached_thread);
+
+  return std::unique_ptr<CompactionFilter>(cff);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.h b/src/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.h
new file mode 100644
index 000000000..2f26f8dbe
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.h
@@ -0,0 +1,37 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionFilterFactory.
+
+#ifndef JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionFilterFactoryJniCallback : public JniCallback,
+                                           public CompactionFilterFactory {
+ public:
+  CompactionFilterFactoryJniCallback(JNIEnv* env,
+                                     jobject jcompaction_filter_factory);
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context);
+  virtual const char* Name() const;
+
+ private:
+  std::unique_ptr<const char[]> m_name;
+  jmethodID m_jcreate_compaction_filter_methodid;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/compaction_job_info.cc b/src/rocksdb/java/rocksjni/compaction_job_info.cc
new file mode 100644
index 000000000..fb292f59c
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_job_info.cc
@@ -0,0 +1,230 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionJobInfo.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CompactionJobInfo.h"
+#include "rocksdb/listener.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    newCompactionJobInfo
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_CompactionJobInfo_newCompactionJobInfo(JNIEnv*, jclass) {
+  auto* compact_job_info = new ROCKSDB_NAMESPACE::CompactionJobInfo();
+  return GET_CPLUSPLUS_POINTER(compact_job_info);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactionJobInfo_disposeInternal(JNIEnv*, jobject,
+                                                        jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  delete compact_job_info;
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    columnFamilyName
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_CompactionJobInfo_columnFamilyName(JNIEnv* env,
+                                                               jclass,
+                                                               jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return ROCKSDB_NAMESPACE::JniUtil::copyBytes(env, compact_job_info->cf_name);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    status
+ * Signature: (J)Lorg/rocksdb/Status;
+ */
+jobject Java_org_rocksdb_CompactionJobInfo_status(JNIEnv* env, jclass,
+                                                  jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return ROCKSDB_NAMESPACE::StatusJni::construct(env, compact_job_info->status);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    threadId
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobInfo_threadId(JNIEnv*, jclass,
+                                                  jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return static_cast<jlong>(compact_job_info->thread_id);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    jobId
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionJobInfo_jobId(JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return static_cast<jint>(compact_job_info->job_id);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    baseInputLevel
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionJobInfo_baseInputLevel(JNIEnv*, jclass,
+                                                       jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return static_cast<jint>(compact_job_info->base_input_level);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    outputLevel
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionJobInfo_outputLevel(JNIEnv*, jclass,
+                                                    jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return static_cast<jint>(compact_job_info->output_level);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    inputFiles
+ * Signature: (J)[Ljava/lang/String;
+ */
+jobjectArray Java_org_rocksdb_CompactionJobInfo_inputFiles(JNIEnv* env, jclass,
+                                                           jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaStrings(
+      env, &compact_job_info->input_files);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    outputFiles
+ * Signature: (J)[Ljava/lang/String;
+ */
+jobjectArray Java_org_rocksdb_CompactionJobInfo_outputFiles(JNIEnv* env, jclass,
+                                                            jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaStrings(
+      env, &compact_job_info->output_files);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    tableProperties
+ * Signature: (J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_CompactionJobInfo_tableProperties(JNIEnv* env, jclass,
+                                                           jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  auto* map = &compact_job_info->table_properties;
+
+  jobject jhash_map = ROCKSDB_NAMESPACE::HashMapJni::construct(
+      env, static_cast<uint32_t>(map->size()));
+  if (jhash_map == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+      const std::string,
+      std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties>, jobject,
+      jobject>
+      fn_map_kv =
+          [env](const std::pair<
+                const std::string,
+                std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties>>&
+                    kv) {
+            jstring jkey = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                env, &(kv.first), false);
+            if (env->ExceptionCheck()) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+
+            jobject jtable_properties =
+                ROCKSDB_NAMESPACE::TablePropertiesJni::fromCppTableProperties(
+                    env, *(kv.second.get()));
+            if (env->ExceptionCheck()) {
+              // an error occurred
+              env->DeleteLocalRef(jkey);
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+
+            return std::unique_ptr<std::pair<jobject, jobject>>(
+                new std::pair<jobject, jobject>(static_cast<jobject>(jkey),
+                                                jtable_properties));
+          };
+
+  if (!ROCKSDB_NAMESPACE::HashMapJni::putAll(env, jhash_map, map->begin(),
+                                             map->end(), fn_map_kv)) {
+    // exception occurred
+    return nullptr;
+  }
+
+  return jhash_map;
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    compactionReason
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_CompactionJobInfo_compactionReason(JNIEnv*, jclass,
+                                                          jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompactionReasonJni::toJavaCompactionReason(
+      compact_job_info->compaction_reason);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    compression
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_CompactionJobInfo_compression(JNIEnv*, jclass,
+                                                     jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType(
+      compact_job_info->compression);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobInfo
+ * Method:    stats
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobInfo_stats(JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_info =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(jhandle);
+  auto* stats = new ROCKSDB_NAMESPACE::CompactionJobStats();
+  stats->Add(compact_job_info->stats);
+  return GET_CPLUSPLUS_POINTER(stats);
+}
diff --git a/src/rocksdb/java/rocksjni/compaction_job_stats.cc b/src/rocksdb/java/rocksjni/compaction_job_stats.cc
new file mode 100644
index 000000000..a2599c132
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_job_stats.cc
@@ -0,0 +1,345 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionJobStats.
+
+#include "rocksdb/compaction_job_stats.h"
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CompactionJobStats.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    newCompactionJobStats
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_newCompactionJobStats(JNIEnv*,
+                                                                jclass) {
+  auto* compact_job_stats = new ROCKSDB_NAMESPACE::CompactionJobStats();
+  return GET_CPLUSPLUS_POINTER(compact_job_stats);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactionJobStats_disposeInternal(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  delete compact_job_stats;
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    reset
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactionJobStats_reset(JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  compact_job_stats->Reset();
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    add
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_CompactionJobStats_add(JNIEnv*, jclass, jlong jhandle,
+                                             jlong jother_handle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  auto* other_compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jother_handle);
+  compact_job_stats->Add(*other_compact_job_stats);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    elapsedMicros
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_elapsedMicros(JNIEnv*, jclass,
+                                                        jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->elapsed_micros);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numInputRecords
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numInputRecords(JNIEnv*, jclass,
+                                                          jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_input_records);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numInputFiles
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numInputFiles(JNIEnv*, jclass,
+                                                        jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_input_files);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numInputFilesAtOutputLevel
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numInputFilesAtOutputLevel(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_input_files_at_output_level);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numOutputRecords
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numOutputRecords(JNIEnv*, jclass,
+                                                           jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_output_records);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numOutputFiles
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numOutputFiles(JNIEnv*, jclass,
+                                                         jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_output_files);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    isManualCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactionJobStats_isManualCompaction(JNIEnv*, jclass,
+                                                                jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  if (compact_job_stats->is_manual_compaction) {
+    return JNI_TRUE;
+  } else {
+    return JNI_FALSE;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    totalInputBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_totalInputBytes(JNIEnv*, jclass,
+                                                          jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->total_input_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    totalOutputBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_totalOutputBytes(JNIEnv*, jclass,
+                                                           jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->total_output_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numRecordsReplaced
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numRecordsReplaced(JNIEnv*, jclass,
+                                                             jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_records_replaced);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    totalInputRawKeyBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_totalInputRawKeyBytes(JNIEnv*, jclass,
+                                                                jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->total_input_raw_key_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    totalInputRawValueBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_totalInputRawValueBytes(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->total_input_raw_value_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numInputDeletionRecords
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numInputDeletionRecords(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_input_deletion_records);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numExpiredDeletionRecords
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numExpiredDeletionRecords(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_expired_deletion_records);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numCorruptKeys
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numCorruptKeys(JNIEnv*, jclass,
+                                                         jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_corrupt_keys);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    fileWriteNanos
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_fileWriteNanos(JNIEnv*, jclass,
+                                                         jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->file_write_nanos);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    fileRangeSyncNanos
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_fileRangeSyncNanos(JNIEnv*, jclass,
+                                                             jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->file_range_sync_nanos);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    fileFsyncNanos
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_fileFsyncNanos(JNIEnv*, jclass,
+                                                         jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->file_fsync_nanos);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    filePrepareWriteNanos
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_filePrepareWriteNanos(JNIEnv*, jclass,
+                                                                jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->file_prepare_write_nanos);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    smallestOutputKeyPrefix
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_CompactionJobStats_smallestOutputKeyPrefix(
+    JNIEnv* env, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+      env, compact_job_stats->smallest_output_key_prefix);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    largestOutputKeyPrefix
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_CompactionJobStats_largestOutputKeyPrefix(
+    JNIEnv* env, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+      env, compact_job_stats->largest_output_key_prefix);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numSingleDelFallthru
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numSingleDelFallthru(JNIEnv*, jclass,
+                                                               jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_single_del_fallthru);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionJobStats
+ * Method:    numSingleDelMismatch
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numSingleDelMismatch(JNIEnv*, jclass,
+                                                               jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_single_del_mismatch);
+}
diff --git a/src/rocksdb/java/rocksjni/compaction_options.cc b/src/rocksdb/java/rocksjni/compaction_options.cc
new file mode 100644
index 000000000..bbbde0313
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_options.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionOptions.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CompactionOptions.h"
+#include "rocksdb/options.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_CompactionOptions
+ * Method:    newCompactionOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_CompactionOptions_newCompactionOptions(JNIEnv*, jclass) {
+  auto* compact_opts = new ROCKSDB_NAMESPACE::CompactionOptions();
+  return GET_CPLUSPLUS_POINTER(compact_opts);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactionOptions_disposeInternal(JNIEnv*, jobject,
+                                                        jlong jhandle) {
+  auto* compact_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptions*>(jhandle);
+  delete compact_opts;
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptions
+ * Method:    compression
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_CompactionOptions_compression(JNIEnv*, jclass,
+                                                     jlong jhandle) {
+  auto* compact_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType(
+      compact_opts->compression);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptions
+ * Method:    setCompression
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_CompactionOptions_setCompression(
+    JNIEnv*, jclass, jlong jhandle, jbyte jcompression_type_value) {
+  auto* compact_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptions*>(jhandle);
+  compact_opts->compression =
+      ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType(
+          jcompression_type_value);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptions
+ * Method:    outputFileSizeLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionOptions_outputFileSizeLimit(JNIEnv*, jclass,
+                                                             jlong jhandle) {
+  auto* compact_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptions*>(jhandle);
+  return static_cast<jlong>(compact_opts->output_file_size_limit);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptions
+ * Method:    setOutputFileSizeLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_CompactionOptions_setOutputFileSizeLimit(
+    JNIEnv*, jclass, jlong jhandle, jlong joutput_file_size_limit) {
+  auto* compact_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptions*>(jhandle);
+  compact_opts->output_file_size_limit =
+      static_cast<uint64_t>(joutput_file_size_limit);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptions
+ * Method:    maxSubcompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionOptions_maxSubcompactions(JNIEnv*, jclass,
+                                                          jlong jhandle) {
+  auto* compact_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptions*>(jhandle);
+  return static_cast<jint>(compact_opts->max_subcompactions);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptions
+ * Method:    setMaxSubcompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactionOptions_setMaxSubcompactions(
+    JNIEnv*, jclass, jlong jhandle, jint jmax_subcompactions) {
+  auto* compact_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptions*>(jhandle);
+  compact_opts->max_subcompactions = static_cast<uint32_t>(jmax_subcompactions);
+}
diff --git a/src/rocksdb/java/rocksjni/compaction_options_fifo.cc b/src/rocksdb/java/rocksjni/compaction_options_fifo.cc
new file mode 100644
index 000000000..f6a47fec5
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_options_fifo.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionOptionsFIFO.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CompactionOptionsFIFO.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    newCompactionOptionsFIFO
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_CompactionOptionsFIFO_newCompactionOptionsFIFO(JNIEnv*,
+                                                                      jclass) {
+  const auto* opt = new ROCKSDB_NAMESPACE::CompactionOptionsFIFO();
+  return GET_CPLUSPLUS_POINTER(opt);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    setMaxTableFilesSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_CompactionOptionsFIFO_setMaxTableFilesSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_table_files_size) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  opt->max_table_files_size = static_cast<uint64_t>(jmax_table_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    maxTableFilesSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionOptionsFIFO_maxTableFilesSize(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  return static_cast<jlong>(opt->max_table_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    setAllowCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompactionOptionsFIFO_setAllowCompaction(
+    JNIEnv*, jobject, jlong jhandle, jboolean allow_compaction) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  opt->allow_compaction = static_cast<bool>(allow_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    allowCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  return static_cast<jboolean>(opt->allow_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactionOptionsFIFO_disposeInternal(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+}
diff --git a/src/rocksdb/java/rocksjni/compaction_options_universal.cc b/src/rocksdb/java/rocksjni/compaction_options_universal.cc
new file mode 100644
index 000000000..9fc6f3158
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_options_universal.cc
@@ -0,0 +1,209 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionOptionsUniversal.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CompactionOptionsUniversal.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    newCompactionOptionsUniversal
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_CompactionOptionsUniversal_newCompactionOptionsUniversal(
+    JNIEnv*, jclass) {
+  const auto* opt = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal();
+  return GET_CPLUSPLUS_POINTER(opt);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    setSizeRatio
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactionOptionsUniversal_setSizeRatio(
+    JNIEnv*, jobject, jlong jhandle, jint jsize_ratio) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  opt->size_ratio = static_cast<unsigned int>(jsize_ratio);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    sizeRatio
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionOptionsUniversal_sizeRatio(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  return static_cast<jint>(opt->size_ratio);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    setMinMergeWidth
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactionOptionsUniversal_setMinMergeWidth(
+    JNIEnv*, jobject, jlong jhandle, jint jmin_merge_width) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  opt->min_merge_width = static_cast<unsigned int>(jmin_merge_width);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    minMergeWidth
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionOptionsUniversal_minMergeWidth(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  return static_cast<jint>(opt->min_merge_width);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    setMaxMergeWidth
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactionOptionsUniversal_setMaxMergeWidth(
+    JNIEnv*, jobject, jlong jhandle, jint jmax_merge_width) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  opt->max_merge_width = static_cast<unsigned int>(jmax_merge_width);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    maxMergeWidth
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionOptionsUniversal_maxMergeWidth(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  return static_cast<jint>(opt->max_merge_width);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    setMaxSizeAmplificationPercent
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactionOptionsUniversal_setMaxSizeAmplificationPercent(
+    JNIEnv*, jobject, jlong jhandle, jint jmax_size_amplification_percent) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  opt->max_size_amplification_percent =
+      static_cast<unsigned int>(jmax_size_amplification_percent);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    maxSizeAmplificationPercent
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionOptionsUniversal_maxSizeAmplificationPercent(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  return static_cast<jint>(opt->max_size_amplification_percent);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    setCompressionSizePercent
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactionOptionsUniversal_setCompressionSizePercent(
+    JNIEnv*, jobject, jlong jhandle, jint jcompression_size_percent) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  opt->compression_size_percent =
+      static_cast<unsigned int>(jcompression_size_percent);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    compressionSizePercent
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionOptionsUniversal_compressionSizePercent(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  return static_cast<jint>(opt->compression_size_percent);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    setStopStyle
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_CompactionOptionsUniversal_setStopStyle(
+    JNIEnv*, jobject, jlong jhandle, jbyte jstop_style_value) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  opt->stop_style =
+      ROCKSDB_NAMESPACE::CompactionStopStyleJni::toCppCompactionStopStyle(
+          jstop_style_value);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    stopStyle
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_CompactionOptionsUniversal_stopStyle(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompactionStopStyleJni::toJavaCompactionStopStyle(
+      opt->stop_style);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    setAllowTrivialMove
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompactionOptionsUniversal_setAllowTrivialMove(
+    JNIEnv*, jobject, jlong jhandle, jboolean jallow_trivial_move) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  opt->allow_trivial_move = static_cast<bool>(jallow_trivial_move);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    allowTrivialMove
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactionOptionsUniversal_allowTrivialMove(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
+  return opt->allow_trivial_move;
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsUniversal
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactionOptionsUniversal_disposeInternal(
+    JNIEnv*, jobject, jlong jhandle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(
+      jhandle);
+}
diff --git a/src/rocksdb/java/rocksjni/comparator.cc b/src/rocksdb/java/rocksjni/comparator.cc
new file mode 100644
index 000000000..11279c4ce
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/comparator.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Comparator.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <functional>
+#include <string>
+
+#include "include/org_rocksdb_AbstractComparator.h"
+#include "include/org_rocksdb_NativeComparatorWrapper.h"
+#include "rocksjni/comparatorjnicallback.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_AbstractComparator
+ * Method:    createNewComparator
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_AbstractComparator_createNewComparator(
+    JNIEnv* env, jobject jcomparator, jlong copt_handle) {
+  auto* copt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*>(
+          copt_handle);
+  auto* c =
+      new ROCKSDB_NAMESPACE::ComparatorJniCallback(env, jcomparator, copt);
+  return GET_CPLUSPLUS_POINTER(c);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractComparator
+ * Method:    usingDirectBuffers
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_AbstractComparator_usingDirectBuffers(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  auto* c =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback*>(jhandle);
+  return static_cast<jboolean>(c->m_options->direct_buffer);
+}
+
+/*
+ * Class:     org_rocksdb_NativeComparatorWrapper
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_NativeComparatorWrapper_disposeInternal(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jcomparator_handle) {
+  auto* comparator =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Comparator*>(jcomparator_handle);
+  delete comparator;
+}
diff --git a/src/rocksdb/java/rocksjni/comparatorjnicallback.cc b/src/rocksdb/java/rocksjni/comparatorjnicallback.cc
new file mode 100644
index 000000000..07ab9fa41
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/comparatorjnicallback.cc
@@ -0,0 +1,646 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Comparator.
+
+#include "rocksjni/comparatorjnicallback.h"
+
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+ComparatorJniCallback::ComparatorJniCallback(
+    JNIEnv* env, jobject jcomparator,
+    const ComparatorJniCallbackOptions* options)
+    : JniCallback(env, jcomparator), m_options(options) {
+  // cache the AbstractComparatorJniBridge class as we will reuse it many times
+  // for each callback
+  m_abstract_comparator_jni_bridge_clazz = static_cast<jclass>(
+      env->NewGlobalRef(AbstractComparatorJniBridge::getJClass(env)));
+
+  // Note: The name of a Comparator will not change during it's lifetime,
+  // so we cache it in a global var
+  jmethodID jname_mid = AbstractComparatorJni::getNameMethodId(env);
+  if (jname_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+  jstring js_name = (jstring)env->CallObjectMethod(m_jcallback_obj, jname_mid);
+  if (env->ExceptionCheck()) {
+    // exception thrown
+    return;
+  }
+  jboolean has_exception = JNI_FALSE;
+  m_name = JniUtil::copyString(env, js_name,
+                               &has_exception);  // also releases jsName
+  if (has_exception == JNI_TRUE) {
+    // exception thrown
+    return;
+  }
+
+  // cache the ByteBuffer class as we will reuse it many times for each callback
+  m_jbytebuffer_clazz =
+      static_cast<jclass>(env->NewGlobalRef(ByteBufferJni::getJClass(env)));
+
+  m_jcompare_mid = AbstractComparatorJniBridge::getCompareInternalMethodId(
+      env, m_abstract_comparator_jni_bridge_clazz);
+  if (m_jcompare_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  m_jshortest_mid =
+      AbstractComparatorJniBridge::getFindShortestSeparatorInternalMethodId(
+          env, m_abstract_comparator_jni_bridge_clazz);
+  if (m_jshortest_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  m_jshort_mid =
+      AbstractComparatorJniBridge::getFindShortSuccessorInternalMethodId(
+          env, m_abstract_comparator_jni_bridge_clazz);
+  if (m_jshort_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  // do we need reusable buffers?
+  if (m_options->max_reused_buffer_size > -1) {
+    if (m_options->reused_synchronisation_type ==
+        ReusedSynchronisationType::THREAD_LOCAL) {
+      // buffers reused per thread
+      UnrefHandler unref = [](void* ptr) {
+        ThreadLocalBuf* tlb = reinterpret_cast<ThreadLocalBuf*>(ptr);
+        jboolean attached_thread = JNI_FALSE;
+        JNIEnv* _env = JniUtil::getJniEnv(tlb->jvm, &attached_thread);
+        if (_env != nullptr) {
+          if (tlb->direct_buffer) {
+            void* buf = _env->GetDirectBufferAddress(tlb->jbuf);
+            delete[] static_cast<char*>(buf);
+          }
+          _env->DeleteGlobalRef(tlb->jbuf);
+          JniUtil::releaseJniEnv(tlb->jvm, attached_thread);
+        }
+      };
+
+      m_tl_buf_a = new ThreadLocalPtr(unref);
+      m_tl_buf_b = new ThreadLocalPtr(unref);
+
+      m_jcompare_buf_a = nullptr;
+      m_jcompare_buf_b = nullptr;
+      m_jshortest_buf_start = nullptr;
+      m_jshortest_buf_limit = nullptr;
+      m_jshort_buf_key = nullptr;
+
+    } else {
+      // buffers reused and shared across threads
+      const bool adaptive = m_options->reused_synchronisation_type ==
+                            ReusedSynchronisationType::ADAPTIVE_MUTEX;
+      mtx_compare = std::unique_ptr<port::Mutex>(new port::Mutex(adaptive));
+      mtx_shortest = std::unique_ptr<port::Mutex>(new port::Mutex(adaptive));
+      mtx_short = std::unique_ptr<port::Mutex>(new port::Mutex(adaptive));
+
+      m_jcompare_buf_a = env->NewGlobalRef(ByteBufferJni::construct(
+          env, m_options->direct_buffer, m_options->max_reused_buffer_size,
+          m_jbytebuffer_clazz));
+      if (m_jcompare_buf_a == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return;
+      }
+
+      m_jcompare_buf_b = env->NewGlobalRef(ByteBufferJni::construct(
+          env, m_options->direct_buffer, m_options->max_reused_buffer_size,
+          m_jbytebuffer_clazz));
+      if (m_jcompare_buf_b == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return;
+      }
+
+      m_jshortest_buf_start = env->NewGlobalRef(ByteBufferJni::construct(
+          env, m_options->direct_buffer, m_options->max_reused_buffer_size,
+          m_jbytebuffer_clazz));
+      if (m_jshortest_buf_start == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return;
+      }
+
+      m_jshortest_buf_limit = env->NewGlobalRef(ByteBufferJni::construct(
+          env, m_options->direct_buffer, m_options->max_reused_buffer_size,
+          m_jbytebuffer_clazz));
+      if (m_jshortest_buf_limit == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return;
+      }
+
+      m_jshort_buf_key = env->NewGlobalRef(ByteBufferJni::construct(
+          env, m_options->direct_buffer, m_options->max_reused_buffer_size,
+          m_jbytebuffer_clazz));
+      if (m_jshort_buf_key == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return;
+      }
+
+      m_tl_buf_a = nullptr;
+      m_tl_buf_b = nullptr;
+    }
+
+  } else {
+    m_jcompare_buf_a = nullptr;
+    m_jcompare_buf_b = nullptr;
+    m_jshortest_buf_start = nullptr;
+    m_jshortest_buf_limit = nullptr;
+    m_jshort_buf_key = nullptr;
+
+    m_tl_buf_a = nullptr;
+    m_tl_buf_b = nullptr;
+  }
+}
+
+ComparatorJniCallback::~ComparatorJniCallback() {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  env->DeleteGlobalRef(m_abstract_comparator_jni_bridge_clazz);
+
+  env->DeleteGlobalRef(m_jbytebuffer_clazz);
+
+  if (m_jcompare_buf_a != nullptr) {
+    if (m_options->direct_buffer) {
+      void* buf = env->GetDirectBufferAddress(m_jcompare_buf_a);
+      delete[] static_cast<char*>(buf);
+    }
+    env->DeleteGlobalRef(m_jcompare_buf_a);
+  }
+
+  if (m_jcompare_buf_b != nullptr) {
+    if (m_options->direct_buffer) {
+      void* buf = env->GetDirectBufferAddress(m_jcompare_buf_b);
+      delete[] static_cast<char*>(buf);
+    }
+    env->DeleteGlobalRef(m_jcompare_buf_b);
+  }
+
+  if (m_jshortest_buf_start != nullptr) {
+    if (m_options->direct_buffer) {
+      void* buf = env->GetDirectBufferAddress(m_jshortest_buf_start);
+      delete[] static_cast<char*>(buf);
+    }
+    env->DeleteGlobalRef(m_jshortest_buf_start);
+  }
+
+  if (m_jshortest_buf_limit != nullptr) {
+    if (m_options->direct_buffer) {
+      void* buf = env->GetDirectBufferAddress(m_jshortest_buf_limit);
+      delete[] static_cast<char*>(buf);
+    }
+    env->DeleteGlobalRef(m_jshortest_buf_limit);
+  }
+
+  if (m_jshort_buf_key != nullptr) {
+    if (m_options->direct_buffer) {
+      void* buf = env->GetDirectBufferAddress(m_jshort_buf_key);
+      delete[] static_cast<char*>(buf);
+    }
+    env->DeleteGlobalRef(m_jshort_buf_key);
+  }
+
+  if (m_tl_buf_a != nullptr) {
+    delete m_tl_buf_a;
+  }
+
+  if (m_tl_buf_b != nullptr) {
+    delete m_tl_buf_b;
+  }
+
+  releaseJniEnv(attached_thread);
+}
+
+const char* ComparatorJniCallback::Name() const { return m_name.get(); }
+
+int ComparatorJniCallback::Compare(const Slice& a, const Slice& b) const {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  const bool reuse_jbuf_a =
+      static_cast<int64_t>(a.size()) <= m_options->max_reused_buffer_size;
+  const bool reuse_jbuf_b =
+      static_cast<int64_t>(b.size()) <= m_options->max_reused_buffer_size;
+
+  MaybeLockForReuse(mtx_compare, reuse_jbuf_a || reuse_jbuf_b);
+
+  jobject jcompare_buf_a =
+      GetBuffer(env, a, reuse_jbuf_a, m_tl_buf_a, m_jcompare_buf_a);
+  if (jcompare_buf_a == nullptr) {
+    // exception occurred
+    MaybeUnlockForReuse(mtx_compare, reuse_jbuf_a || reuse_jbuf_b);
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return 0;
+  }
+
+  jobject jcompare_buf_b =
+      GetBuffer(env, b, reuse_jbuf_b, m_tl_buf_b, m_jcompare_buf_b);
+  if (jcompare_buf_b == nullptr) {
+    // exception occurred
+    if (!reuse_jbuf_a) {
+      DeleteBuffer(env, jcompare_buf_a);
+    }
+    MaybeUnlockForReuse(mtx_compare, reuse_jbuf_a || reuse_jbuf_b);
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return 0;
+  }
+
+  jint result = env->CallStaticIntMethod(
+      m_abstract_comparator_jni_bridge_clazz, m_jcompare_mid, m_jcallback_obj,
+      jcompare_buf_a, reuse_jbuf_a ? a.size() : -1, jcompare_buf_b,
+      reuse_jbuf_b ? b.size() : -1);
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallIntMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+    result = 0;  // we could not get a result from java callback so use 0
+  }
+
+  if (!reuse_jbuf_a) {
+    DeleteBuffer(env, jcompare_buf_a);
+  }
+  if (!reuse_jbuf_b) {
+    DeleteBuffer(env, jcompare_buf_b);
+  }
+
+  MaybeUnlockForReuse(mtx_compare, reuse_jbuf_a || reuse_jbuf_b);
+
+  releaseJniEnv(attached_thread);
+
+  return result;
+}
+
+void ComparatorJniCallback::FindShortestSeparator(std::string* start,
+                                                  const Slice& limit) const {
+  if (start == nullptr) {
+    return;
+  }
+
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  const bool reuse_jbuf_start = static_cast<int64_t>(start->length()) <=
+                                m_options->max_reused_buffer_size;
+  const bool reuse_jbuf_limit =
+      static_cast<int64_t>(limit.size()) <= m_options->max_reused_buffer_size;
+
+  MaybeLockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit);
+
+  Slice sstart(start->data(), start->length());
+  jobject j_start_buf = GetBuffer(env, sstart, reuse_jbuf_start, m_tl_buf_a,
+                                  m_jshortest_buf_start);
+  if (j_start_buf == nullptr) {
+    // exception occurred
+    MaybeUnlockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit);
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  jobject j_limit_buf = GetBuffer(env, limit, reuse_jbuf_limit, m_tl_buf_b,
+                                  m_jshortest_buf_limit);
+  if (j_limit_buf == nullptr) {
+    // exception occurred
+    if (!reuse_jbuf_start) {
+      DeleteBuffer(env, j_start_buf);
+    }
+    MaybeUnlockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit);
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  jint jstart_len = env->CallStaticIntMethod(
+      m_abstract_comparator_jni_bridge_clazz, m_jshortest_mid, m_jcallback_obj,
+      j_start_buf, reuse_jbuf_start ? start->length() : -1, j_limit_buf,
+      reuse_jbuf_limit ? limit.size() : -1);
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallIntMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+
+  } else if (static_cast<size_t>(jstart_len) != start->length()) {
+    // start buffer has changed in Java, so update `start` with the result
+    bool copy_from_non_direct = false;
+    if (reuse_jbuf_start) {
+      // reused a buffer
+      if (m_options->direct_buffer) {
+        // reused direct buffer
+        void* start_buf = env->GetDirectBufferAddress(j_start_buf);
+        if (start_buf == nullptr) {
+          if (!reuse_jbuf_start) {
+            DeleteBuffer(env, j_start_buf);
+          }
+          if (!reuse_jbuf_limit) {
+            DeleteBuffer(env, j_limit_buf);
+          }
+          MaybeUnlockForReuse(mtx_shortest,
+                              reuse_jbuf_start || reuse_jbuf_limit);
+          ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+              env, "Unable to get Direct Buffer Address");
+          env->ExceptionDescribe();  // print out exception to stderr
+          releaseJniEnv(attached_thread);
+          return;
+        }
+        start->assign(static_cast<const char*>(start_buf), jstart_len);
+
+      } else {
+        // reused non-direct buffer
+        copy_from_non_direct = true;
+      }
+    } else {
+      // there was a new buffer
+      if (m_options->direct_buffer) {
+        // it was direct... don't forget to potentially truncate the `start`
+        // string
+        start->resize(jstart_len);
+      } else {
+        // it was non-direct
+        copy_from_non_direct = true;
+      }
+    }
+
+    if (copy_from_non_direct) {
+      jbyteArray jarray =
+          ByteBufferJni::array(env, j_start_buf, m_jbytebuffer_clazz);
+      if (jarray == nullptr) {
+        if (!reuse_jbuf_start) {
+          DeleteBuffer(env, j_start_buf);
+        }
+        if (!reuse_jbuf_limit) {
+          DeleteBuffer(env, j_limit_buf);
+        }
+        MaybeUnlockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit);
+        env->ExceptionDescribe();  // print out exception to stderr
+        releaseJniEnv(attached_thread);
+        return;
+      }
+      jboolean has_exception = JNI_FALSE;
+      JniUtil::byteString<std::string>(
+          env, jarray,
+          [start, jstart_len](const char* data, const size_t) {
+            return start->assign(data, static_cast<size_t>(jstart_len));
+          },
+          &has_exception);
+      env->DeleteLocalRef(jarray);
+      if (has_exception == JNI_TRUE) {
+        if (!reuse_jbuf_start) {
+          DeleteBuffer(env, j_start_buf);
+        }
+        if (!reuse_jbuf_limit) {
+          DeleteBuffer(env, j_limit_buf);
+        }
+        env->ExceptionDescribe();  // print out exception to stderr
+        MaybeUnlockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit);
+        releaseJniEnv(attached_thread);
+        return;
+      }
+    }
+  }
+
+  if (!reuse_jbuf_start) {
+    DeleteBuffer(env, j_start_buf);
+  }
+  if (!reuse_jbuf_limit) {
+    DeleteBuffer(env, j_limit_buf);
+  }
+
+  MaybeUnlockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit);
+
+  releaseJniEnv(attached_thread);
+}
+
+void ComparatorJniCallback::FindShortSuccessor(std::string* key) const {
+  if (key == nullptr) {
+    return;
+  }
+
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  const bool reuse_jbuf_key =
+      static_cast<int64_t>(key->length()) <= m_options->max_reused_buffer_size;
+
+  MaybeLockForReuse(mtx_short, reuse_jbuf_key);
+
+  Slice skey(key->data(), key->length());
+  jobject j_key_buf =
+      GetBuffer(env, skey, reuse_jbuf_key, m_tl_buf_a, m_jshort_buf_key);
+  if (j_key_buf == nullptr) {
+    // exception occurred
+    MaybeUnlockForReuse(mtx_short, reuse_jbuf_key);
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  jint jkey_len = env->CallStaticIntMethod(
+      m_abstract_comparator_jni_bridge_clazz, m_jshort_mid, m_jcallback_obj,
+      j_key_buf, reuse_jbuf_key ? key->length() : -1);
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallObjectMethod
+    if (!reuse_jbuf_key) {
+      DeleteBuffer(env, j_key_buf);
+    }
+    MaybeUnlockForReuse(mtx_short, reuse_jbuf_key);
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  if (static_cast<size_t>(jkey_len) != key->length()) {
+    // key buffer has changed in Java, so update `key` with the result
+    bool copy_from_non_direct = false;
+    if (reuse_jbuf_key) {
+      // reused a buffer
+      if (m_options->direct_buffer) {
+        // reused direct buffer
+        void* key_buf = env->GetDirectBufferAddress(j_key_buf);
+        if (key_buf == nullptr) {
+          ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+              env, "Unable to get Direct Buffer Address");
+          if (!reuse_jbuf_key) {
+            DeleteBuffer(env, j_key_buf);
+          }
+          MaybeUnlockForReuse(mtx_short, reuse_jbuf_key);
+          env->ExceptionDescribe();  // print out exception to stderr
+          releaseJniEnv(attached_thread);
+          return;
+        }
+        key->assign(static_cast<const char*>(key_buf), jkey_len);
+      } else {
+        // reused non-direct buffer
+        copy_from_non_direct = true;
+      }
+    } else {
+      // there was a new buffer
+      if (m_options->direct_buffer) {
+        // it was direct... don't forget to potentially truncate the `key`
+        // string
+        key->resize(jkey_len);
+      } else {
+        // it was non-direct
+        copy_from_non_direct = true;
+      }
+    }
+
+    if (copy_from_non_direct) {
+      jbyteArray jarray =
+          ByteBufferJni::array(env, j_key_buf, m_jbytebuffer_clazz);
+      if (jarray == nullptr) {
+        if (!reuse_jbuf_key) {
+          DeleteBuffer(env, j_key_buf);
+        }
+        MaybeUnlockForReuse(mtx_short, reuse_jbuf_key);
+        env->ExceptionDescribe();  // print out exception to stderr
+        releaseJniEnv(attached_thread);
+        return;
+      }
+      jboolean has_exception = JNI_FALSE;
+      JniUtil::byteString<std::string>(
+          env, jarray,
+          [key, jkey_len](const char* data, const size_t) {
+            return key->assign(data, static_cast<size_t>(jkey_len));
+          },
+          &has_exception);
+      env->DeleteLocalRef(jarray);
+      if (has_exception == JNI_TRUE) {
+        if (!reuse_jbuf_key) {
+          DeleteBuffer(env, j_key_buf);
+        }
+        MaybeUnlockForReuse(mtx_short, reuse_jbuf_key);
+        env->ExceptionDescribe();  // print out exception to stderr
+        releaseJniEnv(attached_thread);
+        return;
+      }
+    }
+  }
+
+  if (!reuse_jbuf_key) {
+    DeleteBuffer(env, j_key_buf);
+  }
+
+  MaybeUnlockForReuse(mtx_short, reuse_jbuf_key);
+
+  releaseJniEnv(attached_thread);
+}
+
+inline void ComparatorJniCallback::MaybeLockForReuse(
+    const std::unique_ptr<port::Mutex>& mutex, const bool cond) const {
+  // no need to lock if using thread_local
+  if (m_options->reused_synchronisation_type !=
+          ReusedSynchronisationType::THREAD_LOCAL &&
+      cond) {
+    mutex.get()->Lock();
+  }
+}
+
+inline void ComparatorJniCallback::MaybeUnlockForReuse(
+    const std::unique_ptr<port::Mutex>& mutex, const bool cond) const {
+  // no need to unlock if using thread_local
+  if (m_options->reused_synchronisation_type !=
+          ReusedSynchronisationType::THREAD_LOCAL &&
+      cond) {
+    mutex.get()->Unlock();
+  }
+}
+
+jobject ComparatorJniCallback::GetBuffer(JNIEnv* env, const Slice& src,
+                                         bool reuse_buffer,
+                                         ThreadLocalPtr* tl_buf,
+                                         jobject jreuse_buffer) const {
+  if (reuse_buffer) {
+    if (m_options->reused_synchronisation_type ==
+        ReusedSynchronisationType::THREAD_LOCAL) {
+      // reuse thread-local bufffer
+      ThreadLocalBuf* tlb = reinterpret_cast<ThreadLocalBuf*>(tl_buf->Get());
+      if (tlb == nullptr) {
+        // thread-local buffer has not yet been created, so create it
+        jobject jtl_buf = env->NewGlobalRef(ByteBufferJni::construct(
+            env, m_options->direct_buffer, m_options->max_reused_buffer_size,
+            m_jbytebuffer_clazz));
+        if (jtl_buf == nullptr) {
+          // exception thrown: OutOfMemoryError
+          return nullptr;
+        }
+        tlb = new ThreadLocalBuf(m_jvm, m_options->direct_buffer, jtl_buf);
+        tl_buf->Reset(tlb);
+      }
+      return ReuseBuffer(env, src, tlb->jbuf);
+    } else {
+      // reuse class member buffer
+      return ReuseBuffer(env, src, jreuse_buffer);
+    }
+  } else {
+    // new buffer
+    return NewBuffer(env, src);
+  }
+}
+
+jobject ComparatorJniCallback::ReuseBuffer(JNIEnv* env, const Slice& src,
+                                           jobject jreuse_buffer) const {
+  // we can reuse the buffer
+  if (m_options->direct_buffer) {
+    // copy into direct buffer
+    void* buf = env->GetDirectBufferAddress(jreuse_buffer);
+    if (buf == nullptr) {
+      // either memory region is undefined, given object is not a direct
+      // java.nio.Buffer, or JNI access to direct buffers is not supported by
+      // this virtual machine.
+      ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+          env, "Unable to get Direct Buffer Address");
+      return nullptr;
+    }
+    memcpy(buf, src.data(), src.size());
+  } else {
+    // copy into non-direct buffer
+    const jbyteArray jarray =
+        ByteBufferJni::array(env, jreuse_buffer, m_jbytebuffer_clazz);
+    if (jarray == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    env->SetByteArrayRegion(
+        jarray, 0, static_cast<jsize>(src.size()),
+        const_cast<jbyte*>(reinterpret_cast<const jbyte*>(src.data())));
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(jarray);
+      return nullptr;
+    }
+    env->DeleteLocalRef(jarray);
+  }
+  return jreuse_buffer;
+}
+
+jobject ComparatorJniCallback::NewBuffer(JNIEnv* env, const Slice& src) const {
+  // we need a new buffer
+  jobject jbuf =
+      ByteBufferJni::constructWith(env, m_options->direct_buffer, src.data(),
+                                   src.size(), m_jbytebuffer_clazz);
+  if (jbuf == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+  return jbuf;
+}
+
+void ComparatorJniCallback::DeleteBuffer(JNIEnv* env, jobject jbuffer) const {
+  env->DeleteLocalRef(jbuffer);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/comparatorjnicallback.h b/src/rocksdb/java/rocksjni/comparatorjnicallback.h
new file mode 100644
index 000000000..a983ce4b5
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/comparatorjnicallback.h
@@ -0,0 +1,141 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Comparator
+
+#ifndef JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
+#define JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksjni/jnicallback.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum ReusedSynchronisationType {
+  /**
+   * Standard mutex.
+   */
+  MUTEX,
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to kernel. This could reduce context switch when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   */
+  ADAPTIVE_MUTEX,
+
+  /**
+   * There is a reused buffer per-thread.
+   */
+  THREAD_LOCAL
+};
+
+struct ComparatorJniCallbackOptions {
+  // Set the synchronisation type used to guard the reused buffers.
+  // Only used if max_reused_buffer_size > 0.
+  // Default: ADAPTIVE_MUTEX
+  ReusedSynchronisationType reused_synchronisation_type =
+      ReusedSynchronisationType::ADAPTIVE_MUTEX;
+
+  // Indicates if a direct byte buffer (i.e. outside of the normal
+  // garbage-collected heap) is used for the callbacks to Java,
+  // as opposed to a non-direct byte buffer which is a wrapper around
+  // an on-heap byte[].
+  // Default: true
+  bool direct_buffer = true;
+
+  // Maximum size of a buffer (in bytes) that will be reused.
+  // Comparators will use 5 of these buffers,
+  // so the retained memory size will be 5 * max_reused_buffer_size.
+  // When a buffer is needed for transferring data to a callback,
+  // if it requires less than max_reused_buffer_size, then an
+  // existing buffer will be reused, else a new buffer will be
+  // allocated just for that callback. -1 to disable.
+  // Default: 64 bytes
+  int32_t max_reused_buffer_size = 64;
+};
+
+/**
+ * This class acts as a bridge between C++
+ * and Java. The methods in this class will be
+ * called back from the RocksDB storage engine (C++)
+ * we then callback to the appropriate Java method
+ * this enables Comparators to be implemented in Java.
+ *
+ * The design of this Comparator caches the Java Slice
+ * objects that are used in the compare and findShortestSeparator
+ * method callbacks. Instead of creating new objects for each callback
+ * of those functions, by reuse via setHandle we are a lot
+ * faster; Unfortunately this means that we have to
+ * introduce independent locking in regions of each of those methods
+ * via the mutexs mtx_compare and mtx_findShortestSeparator respectively
+ */
+class ComparatorJniCallback : public JniCallback, public Comparator {
+ public:
+  ComparatorJniCallback(JNIEnv* env, jobject jcomparator,
+                        const ComparatorJniCallbackOptions* options);
+  ~ComparatorJniCallback();
+  virtual const char* Name() const;
+  virtual int Compare(const Slice& a, const Slice& b) const;
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const;
+  virtual void FindShortSuccessor(std::string* key) const;
+  const ComparatorJniCallbackOptions* m_options;
+
+ private:
+  struct ThreadLocalBuf {
+    ThreadLocalBuf(JavaVM* _jvm, bool _direct_buffer, jobject _jbuf)
+        : jvm(_jvm), direct_buffer(_direct_buffer), jbuf(_jbuf) {}
+    JavaVM* jvm;
+    bool direct_buffer;
+    jobject jbuf;
+  };
+  inline void MaybeLockForReuse(const std::unique_ptr<port::Mutex>& mutex,
+                                const bool cond) const;
+  inline void MaybeUnlockForReuse(const std::unique_ptr<port::Mutex>& mutex,
+                                  const bool cond) const;
+  jobject GetBuffer(JNIEnv* env, const Slice& src, bool reuse_buffer,
+                    ThreadLocalPtr* tl_buf, jobject jreuse_buffer) const;
+  jobject ReuseBuffer(JNIEnv* env, const Slice& src,
+                      jobject jreuse_buffer) const;
+  jobject NewBuffer(JNIEnv* env, const Slice& src) const;
+  void DeleteBuffer(JNIEnv* env, jobject jbuffer) const;
+  // used for synchronisation in compare method
+  std::unique_ptr<port::Mutex> mtx_compare;
+  // used for synchronisation in findShortestSeparator method
+  std::unique_ptr<port::Mutex> mtx_shortest;
+  // used for synchronisation in findShortSuccessor method
+  std::unique_ptr<port::Mutex> mtx_short;
+  std::unique_ptr<const char[]> m_name;
+  jclass m_abstract_comparator_jni_bridge_clazz;  // TODO(AR) could we make this
+                                                  // static somehow?
+  jclass m_jbytebuffer_clazz;  // TODO(AR) we could cache this globally for the
+                               // entire VM if we switch more APIs to use
+                               // ByteBuffer // TODO(AR) could we make this
+                               // static somehow?
+  jmethodID m_jcompare_mid;    // TODO(AR) could we make this static somehow?
+  jmethodID m_jshortest_mid;   // TODO(AR) could we make this static somehow?
+  jmethodID m_jshort_mid;      // TODO(AR) could we make this static somehow?
+  jobject m_jcompare_buf_a;
+  jobject m_jcompare_buf_b;
+  jobject m_jshortest_buf_start;
+  jobject m_jshortest_buf_limit;
+  jobject m_jshort_buf_key;
+  ThreadLocalPtr* m_tl_buf_a;
+  ThreadLocalPtr* m_tl_buf_b;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/compression_options.cc b/src/rocksdb/java/rocksjni/compression_options.cc
new file mode 100644
index 000000000..53f240560
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compression_options.cc
@@ -0,0 +1,214 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompressionOptions.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CompressionOptions.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    newCompressionOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_CompressionOptions_newCompressionOptions(JNIEnv*,
+                                                                jclass) {
+  const auto* opt = new ROCKSDB_NAMESPACE::CompressionOptions();
+  return GET_CPLUSPLUS_POINTER(opt);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    setWindowBits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompressionOptions_setWindowBits(JNIEnv*, jobject,
+                                                       jlong jhandle,
+                                                       jint jwindow_bits) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  opt->window_bits = static_cast<int>(jwindow_bits);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    windowBits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompressionOptions_windowBits(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  return static_cast<jint>(opt->window_bits);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    setLevel
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompressionOptions_setLevel(JNIEnv*, jobject,
+                                                  jlong jhandle, jint jlevel) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  opt->level = static_cast<int>(jlevel);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    level
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompressionOptions_level(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  return static_cast<jint>(opt->level);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    setStrategy
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompressionOptions_setStrategy(JNIEnv*, jobject,
+                                                     jlong jhandle,
+                                                     jint jstrategy) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  opt->strategy = static_cast<int>(jstrategy);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    strategy
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompressionOptions_strategy(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  return static_cast<jint>(opt->strategy);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    setMaxDictBytes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompressionOptions_setMaxDictBytes(JNIEnv*, jobject,
+                                                         jlong jhandle,
+                                                         jint jmax_dict_bytes) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  opt->max_dict_bytes = static_cast<uint32_t>(jmax_dict_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    maxDictBytes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompressionOptions_maxDictBytes(JNIEnv*, jobject,
+                                                      jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  return static_cast<jint>(opt->max_dict_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    setZstdMaxTrainBytes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompressionOptions_setZstdMaxTrainBytes(
+    JNIEnv*, jobject, jlong jhandle, jint jzstd_max_train_bytes) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  opt->zstd_max_train_bytes = static_cast<uint32_t>(jzstd_max_train_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    zstdMaxTrainBytes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompressionOptions_zstdMaxTrainBytes(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  return static_cast<jint>(opt->zstd_max_train_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    setMaxDictBufferBytes
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_CompressionOptions_setMaxDictBufferBytes(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_dict_buffer_bytes) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  opt->max_dict_buffer_bytes = static_cast<uint64_t>(jmax_dict_buffer_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    maxDictBufferBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompressionOptions_maxDictBufferBytes(JNIEnv*, jobject,
+                                                             jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  return static_cast<jlong>(opt->max_dict_buffer_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    setZstdMaxTrainBytes
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompressionOptions_setUseZstdDictTrainer(
+    JNIEnv*, jobject, jlong jhandle, jboolean juse_zstd_dict_trainer) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  opt->use_zstd_dict_trainer = juse_zstd_dict_trainer == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    zstdMaxTrainBytes
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompressionOptions_useZstdDictTrainer(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  return static_cast<bool>(opt->use_zstd_dict_trainer);
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    setEnabled
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompressionOptions_setEnabled(JNIEnv*, jobject,
+                                                    jlong jhandle,
+                                                    jboolean jenabled) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  opt->enabled = jenabled == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    enabled
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompressionOptions_enabled(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+  return static_cast<bool>(opt->enabled);
+}
+/*
+ * Class:     org_rocksdb_CompressionOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompressionOptions_disposeInternal(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(jhandle);
+}
diff --git a/src/rocksdb/java/rocksjni/concurrent_task_limiter.cc b/src/rocksdb/java/rocksjni/concurrent_task_limiter.cc
new file mode 100644
index 000000000..0b0b2d271
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/concurrent_task_limiter.cc
@@ -0,0 +1,97 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/concurrent_task_limiter.h"
+
+#include <jni.h>
+
+#include <memory>
+#include <string>
+
+#include "include/org_rocksdb_ConcurrentTaskLimiterImpl.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_ConcurrentTaskLimiterImpl
+ * Method:    newConcurrentTaskLimiterImpl0
+ * Signature: (Ljava/lang/String;I)J
+ */
+jlong Java_org_rocksdb_ConcurrentTaskLimiterImpl_newConcurrentTaskLimiterImpl0(
+    JNIEnv* env, jclass, jstring jname, jint limit) {
+  jboolean has_exception = JNI_FALSE;
+  std::string name =
+      ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jname, &has_exception);
+  if (JNI_TRUE == has_exception) {
+    return 0;
+  }
+
+  auto* ptr = new std::shared_ptr<ROCKSDB_NAMESPACE::ConcurrentTaskLimiter>(
+      ROCKSDB_NAMESPACE::NewConcurrentTaskLimiter(name, limit));
+
+  return GET_CPLUSPLUS_POINTER(ptr);
+}
+
+/*
+ * Class:     org_rocksdb_ConcurrentTaskLimiterImpl
+ * Method:    name
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_ConcurrentTaskLimiterImpl_name(JNIEnv* env, jclass,
+                                                        jlong handle) {
+  const auto& limiter = *reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::ConcurrentTaskLimiter>*>(handle);
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &limiter->GetName());
+}
+
+/*
+ * Class:     org_rocksdb_ConcurrentTaskLimiterImpl
+ * Method:    setMaxOutstandingTask
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ConcurrentTaskLimiterImpl_setMaxOutstandingTask(
+    JNIEnv*, jclass, jlong handle, jint max_outstanding_task) {
+  const auto& limiter = *reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::ConcurrentTaskLimiter>*>(handle);
+  limiter->SetMaxOutstandingTask(static_cast<int32_t>(max_outstanding_task));
+}
+
+/*
+ * Class:     org_rocksdb_ConcurrentTaskLimiterImpl
+ * Method:    resetMaxOutstandingTask
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ConcurrentTaskLimiterImpl_resetMaxOutstandingTask(
+    JNIEnv*, jclass, jlong handle) {
+  const auto& limiter = *reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::ConcurrentTaskLimiter>*>(handle);
+  limiter->ResetMaxOutstandingTask();
+}
+
+/*
+ * Class:     org_rocksdb_ConcurrentTaskLimiterImpl
+ * Method:    outstandingTask
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ConcurrentTaskLimiterImpl_outstandingTask(JNIEnv*, jclass,
+                                                                jlong handle) {
+  const auto& limiter = *reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::ConcurrentTaskLimiter>*>(handle);
+  return static_cast<jint>(limiter->GetOutstandingTask());
+}
+
+/*
+ * Class:     org_rocksdb_ConcurrentTaskLimiterImpl
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ConcurrentTaskLimiterImpl_disposeInternal(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  auto* ptr = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::ConcurrentTaskLimiter>*>(jhandle);
+  delete ptr;  // delete std::shared_ptr
+}
diff --git a/src/rocksdb/java/rocksjni/config_options.cc b/src/rocksdb/java/rocksjni/config_options.cc
new file mode 100644
index 000000000..e62111323
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/config_options.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ ROCKSDB_NAMESPACE::ConfigOptions methods
+// from Java side.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_ConfigOptions.h"
+#include "rocksdb/convenience.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_ConfigOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ConfigOptions_disposeInternal(JNIEnv *, jobject,
+                                                    jlong jhandle) {
+  auto *co = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(jhandle);
+  assert(co != nullptr);
+  delete co;
+}
+
+/*
+ * Class:     org_rocksdb_ConfigOptions
+ * Method:    newConfigOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) {
+  auto *cfg_opt = new ROCKSDB_NAMESPACE::ConfigOptions();
+  return GET_CPLUSPLUS_POINTER(cfg_opt);
+}
+
+/*
+ * Class:     org_rocksdb_ConfigOptions
+ * Method:    setDelimiter
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv *env, jclass,
+                                                 jlong handle, jstring s) {
+  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  const char *delim = env->GetStringUTFChars(s, nullptr);
+  if (delim == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  cfg_opt->delimiter = delim;
+  env->ReleaseStringUTFChars(s, delim);
+}
+
+/*
+ * Class:     org_rocksdb_ConfigOptions
+ * Method:    setIgnoreUnknownOptions
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv *, jclass,
+                                                            jlong handle,
+                                                            jboolean b) {
+  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  cfg_opt->ignore_unknown_options = static_cast<bool>(b);
+}
+
+/*
+ * Class:     org_rocksdb_ConfigOptions
+ * Method:    setInputStringsEscaped
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass,
+                                                           jlong handle,
+                                                           jboolean b) {
+  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  cfg_opt->input_strings_escaped = static_cast<bool>(b);
+}
+
+/*
+ * Class:     org_rocksdb_ConfigOptions
+ * Method:    setSanityLevel
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv *, jclass,
+                                                   jlong handle, jbyte level) {
+  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  cfg_opt->sanity_level =
+      ROCKSDB_NAMESPACE::SanityLevelJni::toCppSanityLevel(level);
+}
diff --git a/src/rocksdb/java/rocksjni/cplusplus_to_java_convert.h b/src/rocksdb/java/rocksjni/cplusplus_to_java_convert.h
new file mode 100644
index 000000000..0eea6fa2c
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/cplusplus_to_java_convert.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+/*
+ * This macro is used for 32 bit OS. In 32 bit OS, the result number is a
+ negative number if we use reinterpret_cast<jlong>(pointer).
+ * For example, jlong ptr = reinterpret_cast<jlong>(pointer), ptr is a negative
+ number in 32 bit OS.
+ * If we check ptr using ptr > 0, it fails. For example, the following code is
+ not correct.
+ * if (jblock_cache_handle > 0) {
+      std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *pCache =
+          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+              jblock_cache_handle);
+      options.block_cache = *pCache;
+    }
+ * But the result number is positive number if we do
+ reinterpret_cast<size_t>(pointer) first and then cast it to jlong. size_t is 4
+ bytes long in 32 bit OS and 8 bytes long in 64 bit OS.
+ static_cast<jlong>(reinterpret_cast<size_t>(_pointer)) is also working in 64
+ bit OS.
+ *
+ * We don't need an opposite cast because it works from jlong to c++ pointer in
+ both 32 bit and 64 bit OS.
+ * For example, the following code is working in both 32 bit and 64 bit OS.
+ jblock_cache_handle is jlong.
+ *   std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *pCache =
+          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+              jblock_cache_handle);
+*/
+
+#define GET_CPLUSPLUS_POINTER(_pointer) \
+  static_cast<jlong>(reinterpret_cast<size_t>(_pointer))
diff --git a/src/rocksdb/java/rocksjni/env.cc b/src/rocksdb/java/rocksjni/env.cc
new file mode 100644
index 000000000..bb739fe2b
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/env.cc
@@ -0,0 +1,205 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::Env methods from Java side.
+
+#include "rocksdb/env.h"
+
+#include <jni.h>
+
+#include <vector>
+
+#include "include/org_rocksdb_Env.h"
+#include "include/org_rocksdb_RocksEnv.h"
+#include "include/org_rocksdb_RocksMemEnv.h"
+#include "include/org_rocksdb_TimedEnv.h"
+#include "portal.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    getDefaultEnvInternal
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_Env_getDefaultEnvInternal(JNIEnv*, jclass) {
+  return GET_CPLUSPLUS_POINTER(ROCKSDB_NAMESPACE::Env::Default());
+}
+
+/*
+ * Class:     org_rocksdb_RocksEnv
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksEnv_disposeInternal(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  auto* e = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  assert(e != nullptr);
+  delete e;
+}
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    setBackgroundThreads
+ * Signature: (JIB)V
+ */
+void Java_org_rocksdb_Env_setBackgroundThreads(JNIEnv*, jobject, jlong jhandle,
+                                               jint jnum,
+                                               jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  rocks_env->SetBackgroundThreads(
+      static_cast<int>(jnum),
+      ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value));
+}
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    getBackgroundThreads
+ * Signature: (JB)I
+ */
+jint Java_org_rocksdb_Env_getBackgroundThreads(JNIEnv*, jobject, jlong jhandle,
+                                               jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  const int num = rocks_env->GetBackgroundThreads(
+      ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value));
+  return static_cast<jint>(num);
+}
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    getThreadPoolQueueLen
+ * Signature: (JB)I
+ */
+jint Java_org_rocksdb_Env_getThreadPoolQueueLen(JNIEnv*, jobject, jlong jhandle,
+                                                jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  const int queue_len = rocks_env->GetThreadPoolQueueLen(
+      ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value));
+  return static_cast<jint>(queue_len);
+}
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    incBackgroundThreadsIfNeeded
+ * Signature: (JIB)V
+ */
+void Java_org_rocksdb_Env_incBackgroundThreadsIfNeeded(JNIEnv*, jobject,
+                                                       jlong jhandle, jint jnum,
+                                                       jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  rocks_env->IncBackgroundThreadsIfNeeded(
+      static_cast<int>(jnum),
+      ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value));
+}
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    lowerThreadPoolIOPriority
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Env_lowerThreadPoolIOPriority(JNIEnv*, jobject,
+                                                    jlong jhandle,
+                                                    jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  rocks_env->LowerThreadPoolIOPriority(
+      ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value));
+}
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    lowerThreadPoolCPUPriority
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Env_lowerThreadPoolCPUPriority(JNIEnv*, jobject,
+                                                     jlong jhandle,
+                                                     jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  rocks_env->LowerThreadPoolCPUPriority(
+      ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value));
+}
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    getThreadList
+ * Signature: (J)[Lorg/rocksdb/ThreadStatus;
+ */
+jobjectArray Java_org_rocksdb_Env_getThreadList(JNIEnv* env, jobject,
+                                                jlong jhandle) {
+  auto* rocks_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  std::vector<ROCKSDB_NAMESPACE::ThreadStatus> thread_status;
+  ROCKSDB_NAMESPACE::Status s = rocks_env->GetThreadList(&thread_status);
+  if (!s.ok()) {
+    // error, throw exception
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  // object[]
+  const jsize len = static_cast<jsize>(thread_status.size());
+  jobjectArray jthread_status = env->NewObjectArray(
+      len, ROCKSDB_NAMESPACE::ThreadStatusJni::getJClass(env), nullptr);
+  if (jthread_status == nullptr) {
+    // an exception occurred
+    return nullptr;
+  }
+  for (jsize i = 0; i < len; ++i) {
+    jobject jts =
+        ROCKSDB_NAMESPACE::ThreadStatusJni::construct(env, &(thread_status[i]));
+    env->SetObjectArrayElement(jthread_status, i, jts);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(jthread_status);
+      return nullptr;
+    }
+  }
+
+  return jthread_status;
+}
+
+/*
+ * Class:     org_rocksdb_RocksMemEnv
+ * Method:    createMemEnv
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksMemEnv_createMemEnv(JNIEnv*, jclass,
+                                                jlong jbase_env_handle) {
+  auto* base_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jbase_env_handle);
+  return GET_CPLUSPLUS_POINTER(ROCKSDB_NAMESPACE::NewMemEnv(base_env));
+}
+
+/*
+ * Class:     org_rocksdb_RocksMemEnv
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksMemEnv_disposeInternal(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  auto* e = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  assert(e != nullptr);
+  delete e;
+}
+
+/*
+ * Class:     org_rocksdb_TimedEnv
+ * Method:    createTimedEnv
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TimedEnv_createTimedEnv(JNIEnv*, jclass,
+                                               jlong jbase_env_handle) {
+  auto* base_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jbase_env_handle);
+  return GET_CPLUSPLUS_POINTER(ROCKSDB_NAMESPACE::NewTimedEnv(base_env));
+}
+
+/*
+ * Class:     org_rocksdb_TimedEnv
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TimedEnv_disposeInternal(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  auto* e = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jhandle);
+  assert(e != nullptr);
+  delete e;
+}
diff --git a/src/rocksdb/java/rocksjni/env_options.cc b/src/rocksdb/java/rocksjni/env_options.cc
new file mode 100644
index 000000000..3237e2775
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/env_options.cc
@@ -0,0 +1,305 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ ROCKSDB_NAMESPACE::EnvOptions methods
+// from Java side.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_EnvOptions.h"
+#include "rocksdb/env.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+#define ENV_OPTIONS_SET_BOOL(_jhandle, _opt)                          \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt = \
+      static_cast<bool>(_opt)
+
+#define ENV_OPTIONS_SET_SIZE_T(_jhandle, _opt)                        \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt = \
+      static_cast<size_t>(_opt)
+
+#define ENV_OPTIONS_SET_UINT64_T(_jhandle, _opt)                      \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt = \
+      static_cast<uint64_t>(_opt)
+
+#define ENV_OPTIONS_GET(_jhandle, _opt) \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    newEnvOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv *, jclass) {
+  auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions();
+  return GET_CPLUSPLUS_POINTER(env_opt);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    newEnvOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv *, jclass,
+                                                   jlong jdboptions_handle) {
+  auto *db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions *>(jdboptions_handle);
+  auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions(*db_options);
+  return GET_CPLUSPLUS_POINTER(env_opt);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_EnvOptions_disposeInternal(JNIEnv *, jobject,
+                                                 jlong jhandle) {
+  auto *eo = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(jhandle);
+  assert(eo != nullptr);
+  delete eo;
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setUseMmapReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jobject,
+                                                 jlong jhandle,
+                                                 jboolean use_mmap_reads) {
+  ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_reads);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    useMmapReads
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jobject,
+                                                  jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, use_mmap_reads);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setUseMmapWrites
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jobject,
+                                                  jlong jhandle,
+                                                  jboolean use_mmap_writes) {
+  ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_writes);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    useMmapWrites
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jobject,
+                                                   jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, use_mmap_writes);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setUseDirectReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jobject,
+                                                   jlong jhandle,
+                                                   jboolean use_direct_reads) {
+  ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    useDirectReads
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jobject,
+                                                    jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, use_direct_reads);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setUseDirectWrites
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
+    JNIEnv *, jobject, jlong jhandle, jboolean use_direct_writes) {
+  ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    useDirectWrites
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jobject,
+                                                     jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, use_direct_writes);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setAllowFallocate
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jobject,
+                                                   jlong jhandle,
+                                                   jboolean allow_fallocate) {
+  ENV_OPTIONS_SET_BOOL(jhandle, allow_fallocate);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    allowFallocate
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jobject,
+                                                    jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, allow_fallocate);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setSetFdCloexec
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jobject,
+                                                 jlong jhandle,
+                                                 jboolean set_fd_cloexec) {
+  ENV_OPTIONS_SET_BOOL(jhandle, set_fd_cloexec);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setFdCloexec
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jobject,
+                                                  jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, set_fd_cloexec);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setBytesPerSync
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jobject,
+                                                 jlong jhandle,
+                                                 jlong bytes_per_sync) {
+  ENV_OPTIONS_SET_UINT64_T(jhandle, bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    bytesPerSync
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jobject,
+                                               jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setFallocateWithKeepSize
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize(
+    JNIEnv *, jobject, jlong jhandle, jboolean fallocate_with_keep_size) {
+  ENV_OPTIONS_SET_BOOL(jhandle, fallocate_with_keep_size);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    fallocateWithKeepSize
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jobject,
+                                                           jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, fallocate_with_keep_size);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setCompactionReadaheadSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize(
+    JNIEnv *, jobject, jlong jhandle, jlong compaction_readahead_size) {
+  ENV_OPTIONS_SET_SIZE_T(jhandle, compaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    compactionReadaheadSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jobject,
+                                                          jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, compaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setRandomAccessMaxBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_EnvOptions_setRandomAccessMaxBufferSize(
+    JNIEnv *, jobject, jlong jhandle, jlong random_access_max_buffer_size) {
+  ENV_OPTIONS_SET_SIZE_T(jhandle, random_access_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    randomAccessMaxBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize(JNIEnv *, jobject,
+                                                            jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, random_access_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setWritableFileMaxBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize(
+    JNIEnv *, jobject, jlong jhandle, jlong writable_file_max_buffer_size) {
+  ENV_OPTIONS_SET_SIZE_T(jhandle, writable_file_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    writableFileMaxBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jobject,
+                                                            jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, writable_file_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_EnvOptions
+ * Method:    setRateLimiter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv *, jobject,
+                                                jlong jhandle,
+                                                jlong rl_handle) {
+  auto *sptr_rate_limiter =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter> *>(
+          rl_handle);
+  auto *env_opt = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(jhandle);
+  env_opt->rate_limiter = sptr_rate_limiter->get();
+}
diff --git a/src/rocksdb/java/rocksjni/event_listener.cc b/src/rocksdb/java/rocksjni/event_listener.cc
new file mode 100644
index 000000000..965932c9c
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/event_listener.cc
@@ -0,0 +1,44 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::EventListener.
+
+#include <jni.h>
+
+#include <memory>
+
+#include "include/org_rocksdb_AbstractEventListener.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/event_listener_jnicallback.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_AbstractEventListener
+ * Method:    createNewEventListener
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_AbstractEventListener_createNewEventListener(
+    JNIEnv* env, jobject jobj, jlong jenabled_event_callback_values) {
+  auto enabled_event_callbacks =
+      ROCKSDB_NAMESPACE::EnabledEventCallbackJni::toCppEnabledEventCallbacks(
+          jenabled_event_callback_values);
+  auto* sptr_event_listener =
+      new std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>(
+          new ROCKSDB_NAMESPACE::EventListenerJniCallback(
+              env, jobj, enabled_event_callbacks));
+  return GET_CPLUSPLUS_POINTER(sptr_event_listener);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractEventListener
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractEventListener_disposeInternal(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  delete reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>*>(
+      jhandle);
+}
diff --git a/src/rocksdb/java/rocksjni/event_listener_jnicallback.cc b/src/rocksdb/java/rocksjni/event_listener_jnicallback.cc
new file mode 100644
index 000000000..342d938b4
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/event_listener_jnicallback.cc
@@ -0,0 +1,502 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::EventListener.
+
+#include "rocksjni/event_listener_jnicallback.h"
+
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+EventListenerJniCallback::EventListenerJniCallback(
+    JNIEnv* env, jobject jevent_listener,
+    const std::set<EnabledEventCallback>& enabled_event_callbacks)
+    : JniCallback(env, jevent_listener),
+      m_enabled_event_callbacks(enabled_event_callbacks) {
+  InitCallbackMethodId(
+      m_on_flush_completed_proxy_mid, EnabledEventCallback::ON_FLUSH_COMPLETED,
+      env, AbstractEventListenerJni::getOnFlushCompletedProxyMethodId);
+
+  InitCallbackMethodId(m_on_flush_begin_proxy_mid,
+                       EnabledEventCallback::ON_FLUSH_BEGIN, env,
+                       AbstractEventListenerJni::getOnFlushBeginProxyMethodId);
+
+  InitCallbackMethodId(m_on_table_file_deleted_mid,
+                       EnabledEventCallback::ON_TABLE_FILE_DELETED, env,
+                       AbstractEventListenerJni::getOnTableFileDeletedMethodId);
+
+  InitCallbackMethodId(
+      m_on_compaction_begin_proxy_mid,
+      EnabledEventCallback::ON_COMPACTION_BEGIN, env,
+      AbstractEventListenerJni::getOnCompactionBeginProxyMethodId);
+
+  InitCallbackMethodId(
+      m_on_compaction_completed_proxy_mid,
+      EnabledEventCallback::ON_COMPACTION_COMPLETED, env,
+      AbstractEventListenerJni::getOnCompactionCompletedProxyMethodId);
+
+  InitCallbackMethodId(m_on_table_file_created_mid,
+                       EnabledEventCallback::ON_TABLE_FILE_CREATED, env,
+                       AbstractEventListenerJni::getOnTableFileCreatedMethodId);
+
+  InitCallbackMethodId(
+      m_on_table_file_creation_started_mid,
+      EnabledEventCallback::ON_TABLE_FILE_CREATION_STARTED, env,
+      AbstractEventListenerJni::getOnTableFileCreationStartedMethodId);
+
+  InitCallbackMethodId(m_on_mem_table_sealed_mid,
+                       EnabledEventCallback::ON_MEMTABLE_SEALED, env,
+                       AbstractEventListenerJni::getOnMemTableSealedMethodId);
+
+  InitCallbackMethodId(
+      m_on_column_family_handle_deletion_started_mid,
+      EnabledEventCallback::ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, env,
+      AbstractEventListenerJni::getOnColumnFamilyHandleDeletionStartedMethodId);
+
+  InitCallbackMethodId(
+      m_on_external_file_ingested_proxy_mid,
+      EnabledEventCallback::ON_EXTERNAL_FILE_INGESTED, env,
+      AbstractEventListenerJni::getOnExternalFileIngestedProxyMethodId);
+
+  InitCallbackMethodId(
+      m_on_background_error_proxy_mid,
+      EnabledEventCallback::ON_BACKGROUND_ERROR, env,
+      AbstractEventListenerJni::getOnBackgroundErrorProxyMethodId);
+
+  InitCallbackMethodId(
+      m_on_stall_conditions_changed_mid,
+      EnabledEventCallback::ON_STALL_CONDITIONS_CHANGED, env,
+      AbstractEventListenerJni::getOnStallConditionsChangedMethodId);
+
+  InitCallbackMethodId(m_on_file_read_finish_mid,
+                       EnabledEventCallback::ON_FILE_READ_FINISH, env,
+                       AbstractEventListenerJni::getOnFileReadFinishMethodId);
+
+  InitCallbackMethodId(m_on_file_write_finish_mid,
+                       EnabledEventCallback::ON_FILE_WRITE_FINISH, env,
+                       AbstractEventListenerJni::getOnFileWriteFinishMethodId);
+
+  InitCallbackMethodId(m_on_file_flush_finish_mid,
+                       EnabledEventCallback::ON_FILE_FLUSH_FINISH, env,
+                       AbstractEventListenerJni::getOnFileFlushFinishMethodId);
+
+  InitCallbackMethodId(m_on_file_sync_finish_mid,
+                       EnabledEventCallback::ON_FILE_SYNC_FINISH, env,
+                       AbstractEventListenerJni::getOnFileSyncFinishMethodId);
+
+  InitCallbackMethodId(
+      m_on_file_range_sync_finish_mid,
+      EnabledEventCallback::ON_FILE_RANGE_SYNC_FINISH, env,
+      AbstractEventListenerJni::getOnFileRangeSyncFinishMethodId);
+
+  InitCallbackMethodId(
+      m_on_file_truncate_finish_mid,
+      EnabledEventCallback::ON_FILE_TRUNCATE_FINISH, env,
+      AbstractEventListenerJni::getOnFileTruncateFinishMethodId);
+
+  InitCallbackMethodId(m_on_file_close_finish_mid,
+                       EnabledEventCallback::ON_FILE_CLOSE_FINISH, env,
+                       AbstractEventListenerJni::getOnFileCloseFinishMethodId);
+
+  InitCallbackMethodId(
+      m_should_be_notified_on_file_io,
+      EnabledEventCallback::SHOULD_BE_NOTIFIED_ON_FILE_IO, env,
+      AbstractEventListenerJni::getShouldBeNotifiedOnFileIOMethodId);
+
+  InitCallbackMethodId(
+      m_on_error_recovery_begin_proxy_mid,
+      EnabledEventCallback::ON_ERROR_RECOVERY_BEGIN, env,
+      AbstractEventListenerJni::getOnErrorRecoveryBeginProxyMethodId);
+
+  InitCallbackMethodId(
+      m_on_error_recovery_completed_mid,
+      EnabledEventCallback::ON_ERROR_RECOVERY_COMPLETED, env,
+      AbstractEventListenerJni::getOnErrorRecoveryCompletedMethodId);
+}
+
+EventListenerJniCallback::~EventListenerJniCallback() {}
+
+void EventListenerJniCallback::OnFlushCompleted(
+    DB* db, const FlushJobInfo& flush_job_info) {
+  if (m_on_flush_completed_proxy_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jflush_job_info = SetupCallbackInvocation<FlushJobInfo>(
+      env, attached_thread, flush_job_info,
+      FlushJobInfoJni::fromCppFlushJobInfo);
+
+  if (jflush_job_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_flush_completed_proxy_mid,
+                        reinterpret_cast<jlong>(db), jflush_job_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info});
+}
+
+void EventListenerJniCallback::OnFlushBegin(
+    DB* db, const FlushJobInfo& flush_job_info) {
+  if (m_on_flush_begin_proxy_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jflush_job_info = SetupCallbackInvocation<FlushJobInfo>(
+      env, attached_thread, flush_job_info,
+      FlushJobInfoJni::fromCppFlushJobInfo);
+
+  if (jflush_job_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_flush_begin_proxy_mid,
+                        reinterpret_cast<jlong>(db), jflush_job_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info});
+}
+
+void EventListenerJniCallback::OnTableFileDeleted(
+    const TableFileDeletionInfo& info) {
+  if (m_on_table_file_deleted_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jdeletion_info = SetupCallbackInvocation<TableFileDeletionInfo>(
+      env, attached_thread, info,
+      TableFileDeletionInfoJni::fromCppTableFileDeletionInfo);
+
+  if (jdeletion_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_table_file_deleted_mid,
+                        jdeletion_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jdeletion_info});
+}
+
+void EventListenerJniCallback::OnCompactionBegin(DB* db,
+                                                 const CompactionJobInfo& ci) {
+  if (m_on_compaction_begin_proxy_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jcompaction_job_info = SetupCallbackInvocation<CompactionJobInfo>(
+      env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo);
+
+  if (jcompaction_job_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_compaction_begin_proxy_mid,
+                        reinterpret_cast<jlong>(db), jcompaction_job_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info});
+}
+
+void EventListenerJniCallback::OnCompactionCompleted(
+    DB* db, const CompactionJobInfo& ci) {
+  if (m_on_compaction_completed_proxy_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jcompaction_job_info = SetupCallbackInvocation<CompactionJobInfo>(
+      env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo);
+
+  if (jcompaction_job_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_compaction_completed_proxy_mid,
+                        reinterpret_cast<jlong>(db), jcompaction_job_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info});
+}
+
+void EventListenerJniCallback::OnTableFileCreated(
+    const TableFileCreationInfo& info) {
+  if (m_on_table_file_created_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jfile_creation_info = SetupCallbackInvocation<TableFileCreationInfo>(
+      env, attached_thread, info,
+      TableFileCreationInfoJni::fromCppTableFileCreationInfo);
+
+  if (jfile_creation_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_table_file_created_mid,
+                        jfile_creation_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jfile_creation_info});
+}
+
+void EventListenerJniCallback::OnTableFileCreationStarted(
+    const TableFileCreationBriefInfo& info) {
+  if (m_on_table_file_creation_started_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jcreation_brief_info =
+      SetupCallbackInvocation<TableFileCreationBriefInfo>(
+          env, attached_thread, info,
+          TableFileCreationBriefInfoJni::fromCppTableFileCreationBriefInfo);
+
+  if (jcreation_brief_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_table_file_creation_started_mid,
+                        jcreation_brief_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jcreation_brief_info});
+}
+
+void EventListenerJniCallback::OnMemTableSealed(const MemTableInfo& info) {
+  if (m_on_mem_table_sealed_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jmem_table_info = SetupCallbackInvocation<MemTableInfo>(
+      env, attached_thread, info, MemTableInfoJni::fromCppMemTableInfo);
+
+  if (jmem_table_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_mem_table_sealed_mid,
+                        jmem_table_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jmem_table_info});
+}
+
+void EventListenerJniCallback::OnColumnFamilyHandleDeletionStarted(
+    ColumnFamilyHandle* handle) {
+  if (m_on_column_family_handle_deletion_started_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jcf_handle = SetupCallbackInvocation<ColumnFamilyHandle>(
+      env, attached_thread, *handle,
+      ColumnFamilyHandleJni::fromCppColumnFamilyHandle);
+
+  if (jcf_handle != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj,
+                        m_on_column_family_handle_deletion_started_mid,
+                        jcf_handle);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jcf_handle});
+}
+
+void EventListenerJniCallback::OnExternalFileIngested(
+    DB* db, const ExternalFileIngestionInfo& info) {
+  if (m_on_external_file_ingested_proxy_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jingestion_info = SetupCallbackInvocation<ExternalFileIngestionInfo>(
+      env, attached_thread, info,
+      ExternalFileIngestionInfoJni::fromCppExternalFileIngestionInfo);
+
+  if (jingestion_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_external_file_ingested_proxy_mid,
+                        reinterpret_cast<jlong>(db), jingestion_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jingestion_info});
+}
+
+void EventListenerJniCallback::OnBackgroundError(BackgroundErrorReason reason,
+                                                 Status* bg_error) {
+  if (m_on_background_error_proxy_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jstatus = SetupCallbackInvocation<Status>(
+      env, attached_thread, *bg_error, StatusJni::construct);
+
+  if (jstatus != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_background_error_proxy_mid,
+                        static_cast<jbyte>(reason), jstatus);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jstatus});
+}
+
+void EventListenerJniCallback::OnStallConditionsChanged(
+    const WriteStallInfo& info) {
+  if (m_on_stall_conditions_changed_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jwrite_stall_info = SetupCallbackInvocation<WriteStallInfo>(
+      env, attached_thread, info, WriteStallInfoJni::fromCppWriteStallInfo);
+
+  if (jwrite_stall_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_stall_conditions_changed_mid,
+                        jwrite_stall_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jwrite_stall_info});
+}
+
+void EventListenerJniCallback::OnFileReadFinish(const FileOperationInfo& info) {
+  OnFileOperation(m_on_file_read_finish_mid, info);
+}
+
+void EventListenerJniCallback::OnFileWriteFinish(
+    const FileOperationInfo& info) {
+  OnFileOperation(m_on_file_write_finish_mid, info);
+}
+
+void EventListenerJniCallback::OnFileFlushFinish(
+    const FileOperationInfo& info) {
+  OnFileOperation(m_on_file_flush_finish_mid, info);
+}
+
+void EventListenerJniCallback::OnFileSyncFinish(const FileOperationInfo& info) {
+  OnFileOperation(m_on_file_sync_finish_mid, info);
+}
+
+void EventListenerJniCallback::OnFileRangeSyncFinish(
+    const FileOperationInfo& info) {
+  OnFileOperation(m_on_file_range_sync_finish_mid, info);
+}
+
+void EventListenerJniCallback::OnFileTruncateFinish(
+    const FileOperationInfo& info) {
+  OnFileOperation(m_on_file_truncate_finish_mid, info);
+}
+
+void EventListenerJniCallback::OnFileCloseFinish(
+    const FileOperationInfo& info) {
+  OnFileOperation(m_on_file_close_finish_mid, info);
+}
+
+bool EventListenerJniCallback::ShouldBeNotifiedOnFileIO() {
+  if (m_should_be_notified_on_file_io == nullptr) {
+    return false;
+  }
+
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  jboolean jshould_be_notified =
+      env->CallBooleanMethod(m_jcallback_obj, m_should_be_notified_on_file_io);
+
+  CleanupCallbackInvocation(env, attached_thread, {});
+
+  return static_cast<bool>(jshould_be_notified);
+}
+
+void EventListenerJniCallback::OnErrorRecoveryBegin(
+    BackgroundErrorReason reason, Status bg_error, bool* auto_recovery) {
+  if (m_on_error_recovery_begin_proxy_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jbg_error = SetupCallbackInvocation<Status>(
+      env, attached_thread, bg_error, StatusJni::construct);
+
+  if (jbg_error != nullptr) {
+    jboolean jauto_recovery = env->CallBooleanMethod(
+        m_jcallback_obj, m_on_error_recovery_begin_proxy_mid,
+        static_cast<jbyte>(reason), jbg_error);
+    *auto_recovery = jauto_recovery == JNI_TRUE;
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jbg_error});
+}
+
+void EventListenerJniCallback::OnErrorRecoveryCompleted(Status old_bg_error) {
+  if (m_on_error_recovery_completed_mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jold_bg_error = SetupCallbackInvocation<Status>(
+      env, attached_thread, old_bg_error, StatusJni::construct);
+
+  if (jold_bg_error != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, m_on_error_recovery_completed_mid,
+                        jold_bg_error);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jold_bg_error});
+}
+
+void EventListenerJniCallback::InitCallbackMethodId(
+    jmethodID& mid, EnabledEventCallback eec, JNIEnv* env,
+    jmethodID (*get_id)(JNIEnv* env)) {
+  if (m_enabled_event_callbacks.count(eec) == 1) {
+    mid = get_id(env);
+  } else {
+    mid = nullptr;
+  }
+}
+
+template <class T>
+jobject EventListenerJniCallback::SetupCallbackInvocation(
+    JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj,
+    jobject (*convert)(JNIEnv* env, const T* cpp_obj)) {
+  attached_thread = JNI_FALSE;
+  env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  return convert(env, &cpp_obj);
+}
+
+void EventListenerJniCallback::CleanupCallbackInvocation(
+    JNIEnv* env, jboolean attached_thread,
+    std::initializer_list<jobject*> refs) {
+  for (auto* ref : refs) {
+    if (*ref == nullptr) continue;
+    env->DeleteLocalRef(*ref);
+  }
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallVoidMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+  }
+
+  releaseJniEnv(attached_thread);
+}
+
+void EventListenerJniCallback::OnFileOperation(const jmethodID& mid,
+                                               const FileOperationInfo& info) {
+  if (mid == nullptr) {
+    return;
+  }
+
+  JNIEnv* env;
+  jboolean attached_thread;
+  jobject jop_info = SetupCallbackInvocation<FileOperationInfo>(
+      env, attached_thread, info,
+      FileOperationInfoJni::fromCppFileOperationInfo);
+
+  if (jop_info != nullptr) {
+    env->CallVoidMethod(m_jcallback_obj, mid, jop_info);
+  }
+
+  CleanupCallbackInvocation(env, attached_thread, {&jop_info});
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/event_listener_jnicallback.h b/src/rocksdb/java/rocksjni/event_listener_jnicallback.h
new file mode 100644
index 000000000..f4a235a23
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/event_listener_jnicallback.h
@@ -0,0 +1,122 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::EventListener.
+
+#ifndef JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+#include <set>
+
+#include "rocksdb/listener.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum EnabledEventCallback {
+  ON_FLUSH_COMPLETED = 0x0,
+  ON_FLUSH_BEGIN = 0x1,
+  ON_TABLE_FILE_DELETED = 0x2,
+  ON_COMPACTION_BEGIN = 0x3,
+  ON_COMPACTION_COMPLETED = 0x4,
+  ON_TABLE_FILE_CREATED = 0x5,
+  ON_TABLE_FILE_CREATION_STARTED = 0x6,
+  ON_MEMTABLE_SEALED = 0x7,
+  ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED = 0x8,
+  ON_EXTERNAL_FILE_INGESTED = 0x9,
+  ON_BACKGROUND_ERROR = 0xA,
+  ON_STALL_CONDITIONS_CHANGED = 0xB,
+  ON_FILE_READ_FINISH = 0xC,
+  ON_FILE_WRITE_FINISH = 0xD,
+  ON_FILE_FLUSH_FINISH = 0xE,
+  ON_FILE_SYNC_FINISH = 0xF,
+  ON_FILE_RANGE_SYNC_FINISH = 0x10,
+  ON_FILE_TRUNCATE_FINISH = 0x11,
+  ON_FILE_CLOSE_FINISH = 0x12,
+  SHOULD_BE_NOTIFIED_ON_FILE_IO = 0x13,
+  ON_ERROR_RECOVERY_BEGIN = 0x14,
+  ON_ERROR_RECOVERY_COMPLETED = 0x15,
+
+  NUM_ENABLED_EVENT_CALLBACK = 0x16,
+};
+
+class EventListenerJniCallback : public JniCallback, public EventListener {
+ public:
+  EventListenerJniCallback(
+      JNIEnv* env, jobject jevent_listener,
+      const std::set<EnabledEventCallback>& enabled_event_callbacks);
+  virtual ~EventListenerJniCallback();
+  virtual void OnFlushCompleted(DB* db, const FlushJobInfo& flush_job_info);
+  virtual void OnFlushBegin(DB* db, const FlushJobInfo& flush_job_info);
+  virtual void OnTableFileDeleted(const TableFileDeletionInfo& info);
+  virtual void OnCompactionBegin(DB* db, const CompactionJobInfo& ci);
+  virtual void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci);
+  virtual void OnTableFileCreated(const TableFileCreationInfo& info);
+  virtual void OnTableFileCreationStarted(
+      const TableFileCreationBriefInfo& info);
+  virtual void OnMemTableSealed(const MemTableInfo& info);
+  virtual void OnColumnFamilyHandleDeletionStarted(ColumnFamilyHandle* handle);
+  virtual void OnExternalFileIngested(DB* db,
+                                      const ExternalFileIngestionInfo& info);
+  virtual void OnBackgroundError(BackgroundErrorReason reason,
+                                 Status* bg_error);
+  virtual void OnStallConditionsChanged(const WriteStallInfo& info);
+  virtual void OnFileReadFinish(const FileOperationInfo& info);
+  virtual void OnFileWriteFinish(const FileOperationInfo& info);
+  virtual void OnFileFlushFinish(const FileOperationInfo& info);
+  virtual void OnFileSyncFinish(const FileOperationInfo& info);
+  virtual void OnFileRangeSyncFinish(const FileOperationInfo& info);
+  virtual void OnFileTruncateFinish(const FileOperationInfo& info);
+  virtual void OnFileCloseFinish(const FileOperationInfo& info);
+  virtual bool ShouldBeNotifiedOnFileIO();
+  virtual void OnErrorRecoveryBegin(BackgroundErrorReason reason,
+                                    Status bg_error, bool* auto_recovery);
+  virtual void OnErrorRecoveryCompleted(Status old_bg_error);
+
+ private:
+  inline void InitCallbackMethodId(jmethodID& mid, EnabledEventCallback eec,
+                                   JNIEnv* env,
+                                   jmethodID (*get_id)(JNIEnv* env));
+  template <class T>
+  inline jobject SetupCallbackInvocation(
+      JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj,
+      jobject (*convert)(JNIEnv* env, const T* cpp_obj));
+  inline void CleanupCallbackInvocation(JNIEnv* env, jboolean attached_thread,
+                                        std::initializer_list<jobject*> refs);
+  inline void OnFileOperation(const jmethodID& mid,
+                              const FileOperationInfo& info);
+
+  const std::set<EnabledEventCallback> m_enabled_event_callbacks;
+  jmethodID m_on_flush_completed_proxy_mid;
+  jmethodID m_on_flush_begin_proxy_mid;
+  jmethodID m_on_table_file_deleted_mid;
+  jmethodID m_on_compaction_begin_proxy_mid;
+  jmethodID m_on_compaction_completed_proxy_mid;
+  jmethodID m_on_table_file_created_mid;
+  jmethodID m_on_table_file_creation_started_mid;
+  jmethodID m_on_mem_table_sealed_mid;
+  jmethodID m_on_column_family_handle_deletion_started_mid;
+  jmethodID m_on_external_file_ingested_proxy_mid;
+  jmethodID m_on_background_error_proxy_mid;
+  jmethodID m_on_stall_conditions_changed_mid;
+  jmethodID m_on_file_read_finish_mid;
+  jmethodID m_on_file_write_finish_mid;
+  jmethodID m_on_file_flush_finish_mid;
+  jmethodID m_on_file_sync_finish_mid;
+  jmethodID m_on_file_range_sync_finish_mid;
+  jmethodID m_on_file_truncate_finish_mid;
+  jmethodID m_on_file_close_finish_mid;
+  jmethodID m_should_be_notified_on_file_io;
+  jmethodID m_on_error_recovery_begin_proxy_mid;
+  jmethodID m_on_error_recovery_completed_mid;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/filter.cc b/src/rocksdb/java/rocksjni/filter.cc
new file mode 100644
index 000000000..ed22016d2
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/filter.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::FilterPolicy.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+
+#include "include/org_rocksdb_BloomFilter.h"
+#include "include/org_rocksdb_Filter.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_BloomFilter
+ * Method:    createBloomFilter
+ * Signature: (DZ)J
+ */
+jlong Java_org_rocksdb_BloomFilter_createNewBloomFilter(JNIEnv* /*env*/,
+                                                        jclass /*jcls*/,
+                                                        jdouble bits_per_key) {
+  auto* sptr_filter =
+      new std::shared_ptr<const ROCKSDB_NAMESPACE::FilterPolicy>(
+          ROCKSDB_NAMESPACE::NewBloomFilterPolicy(bits_per_key));
+  return GET_CPLUSPLUS_POINTER(sptr_filter);
+}
+
+/*
+ * Class:     org_rocksdb_Filter
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Filter_disposeInternal(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong jhandle) {
+  auto* handle =
+      reinterpret_cast<std::shared_ptr<const ROCKSDB_NAMESPACE::FilterPolicy>*>(
+          jhandle);
+  delete handle;  // delete std::shared_ptr
+}
diff --git a/src/rocksdb/java/rocksjni/ingest_external_file_options.cc b/src/rocksdb/java/rocksjni/ingest_external_file_options.cc
new file mode 100644
index 000000000..052cf3325
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/ingest_external_file_options.cc
@@ -0,0 +1,199 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::FilterPolicy.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_IngestExternalFileOptions.h"
+#include "rocksdb/options.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    newIngestExternalFileOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_IngestExternalFileOptions_newIngestExternalFileOptions__(
+    JNIEnv*, jclass) {
+  auto* options = new ROCKSDB_NAMESPACE::IngestExternalFileOptions();
+  return GET_CPLUSPLUS_POINTER(options);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    newIngestExternalFileOptions
+ * Signature: (ZZZZ)J
+ */
+jlong Java_org_rocksdb_IngestExternalFileOptions_newIngestExternalFileOptions__ZZZZ(
+    JNIEnv*, jclass, jboolean jmove_files, jboolean jsnapshot_consistency,
+    jboolean jallow_global_seqno, jboolean jallow_blocking_flush) {
+  auto* options = new ROCKSDB_NAMESPACE::IngestExternalFileOptions();
+  options->move_files = static_cast<bool>(jmove_files);
+  options->snapshot_consistency = static_cast<bool>(jsnapshot_consistency);
+  options->allow_global_seqno = static_cast<bool>(jallow_global_seqno);
+  options->allow_blocking_flush = static_cast<bool>(jallow_blocking_flush);
+  return GET_CPLUSPLUS_POINTER(options);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    moveFiles
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_IngestExternalFileOptions_moveFiles(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  return static_cast<jboolean>(options->move_files);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    setMoveFiles
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_IngestExternalFileOptions_setMoveFiles(
+    JNIEnv*, jobject, jlong jhandle, jboolean jmove_files) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  options->move_files = static_cast<bool>(jmove_files);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    snapshotConsistency
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_IngestExternalFileOptions_snapshotConsistency(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  return static_cast<jboolean>(options->snapshot_consistency);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    setSnapshotConsistency
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_IngestExternalFileOptions_setSnapshotConsistency(
+    JNIEnv*, jobject, jlong jhandle, jboolean jsnapshot_consistency) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  options->snapshot_consistency = static_cast<bool>(jsnapshot_consistency);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    allowGlobalSeqNo
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_IngestExternalFileOptions_allowGlobalSeqNo(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  return static_cast<jboolean>(options->allow_global_seqno);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    setAllowGlobalSeqNo
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_IngestExternalFileOptions_setAllowGlobalSeqNo(
+    JNIEnv*, jobject, jlong jhandle, jboolean jallow_global_seqno) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  options->allow_global_seqno = static_cast<bool>(jallow_global_seqno);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    allowBlockingFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_IngestExternalFileOptions_allowBlockingFlush(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  return static_cast<jboolean>(options->allow_blocking_flush);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    setAllowBlockingFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_IngestExternalFileOptions_setAllowBlockingFlush(
+    JNIEnv*, jobject, jlong jhandle, jboolean jallow_blocking_flush) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  options->allow_blocking_flush = static_cast<bool>(jallow_blocking_flush);
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    ingestBehind
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_IngestExternalFileOptions_ingestBehind(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  return options->ingest_behind == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    setIngestBehind
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_IngestExternalFileOptions_setIngestBehind(
+    JNIEnv*, jobject, jlong jhandle, jboolean jingest_behind) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  options->ingest_behind = jingest_behind == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    writeGlobalSeqno
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_org_rocksdb_IngestExternalFileOptions_writeGlobalSeqno(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  return options->write_global_seqno == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    setWriteGlobalSeqno
+ * Signature: (JZ)V
+ */
+JNIEXPORT void JNICALL
+Java_org_rocksdb_IngestExternalFileOptions_setWriteGlobalSeqno(
+    JNIEnv*, jobject, jlong jhandle, jboolean jwrite_global_seqno) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  options->write_global_seqno = jwrite_global_seqno == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_IngestExternalFileOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_IngestExternalFileOptions_disposeInternal(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(jhandle);
+  delete options;
+}
diff --git a/src/rocksdb/java/rocksjni/iterator.cc b/src/rocksdb/java/rocksjni/iterator.cc
new file mode 100644
index 000000000..3ddb9778b
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/iterator.cc
@@ -0,0 +1,340 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::Iterator methods from Java side.
+
+#include "rocksdb/iterator.h"
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+
+#include "include/org_rocksdb_RocksIterator.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_disposeInternal(JNIEnv* /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  assert(it != nullptr);
+  delete it;
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    isValid0
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_RocksIterator_isValid0(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong handle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->Valid();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seekToFirst0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_seekToFirst0(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->SeekToFirst();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seekToLast0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_seekToLast0(JNIEnv* /*env*/,
+                                                jobject /*jobj*/,
+                                                jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->SeekToLast();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    next0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_next0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                          jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->Next();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    prev0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_prev0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                          jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->Prev();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    refresh0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_refresh0(JNIEnv* env, jobject /*jobj*/,
+                                             jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Status s = it->Refresh();
+
+  if (s.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seek0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_RocksIterator_seek0(JNIEnv* env, jobject /*jobj*/,
+                                          jlong handle, jbyteArray jtarget,
+                                          jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->Seek(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_region(seek, env, jtarget, 0, jtarget_len);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ * In this case, the buffer offset of the key may be non-zero.
+ *
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seek0
+ * Signature: (J[BII)V
+ */
+void Java_org_rocksdb_RocksIterator_seekByteArray0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->Seek(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_region(seek, env, jtarget, jtarget_off,
+                                          jtarget_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seekDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)V
+ */
+void Java_org_rocksdb_RocksIterator_seekDirect0(JNIEnv* env, jobject /*jobj*/,
+                                                jlong handle, jobject jtarget,
+                                                jint jtarget_off,
+                                                jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->Seek(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_direct(seek, env, jtarget, jtarget_off,
+                                          jtarget_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seekForPrevDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)V
+ */
+void Java_org_rocksdb_RocksIterator_seekForPrevDirect0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  auto seekPrev = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->SeekForPrev(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_direct(seekPrev, env, jtarget, jtarget_off,
+                                          jtarget_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seekForPrev0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_RocksIterator_seekForPrev0(JNIEnv* env, jobject /*jobj*/,
+                                                 jlong handle,
+                                                 jbyteArray jtarget,
+                                                 jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->SeekForPrev(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_region(seek, env, jtarget, 0, jtarget_len);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ * In this case, the buffer offset of the key may be non-zero.
+ *
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seek0
+ * Signature: (J[BII)V
+ */
+void Java_org_rocksdb_RocksIterator_seekForPrevByteArray0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->SeekForPrev(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_region(seek, env, jtarget, jtarget_off,
+                                          jtarget_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    status0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_status0(JNIEnv* env, jobject /*jobj*/,
+                                            jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Status s = it->status();
+
+  if (s.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    key0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_RocksIterator_key0(JNIEnv* env, jobject /*jobj*/,
+                                               jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice key_slice = it->key();
+
+  jbyteArray jkey = env->NewByteArray(static_cast<jsize>(key_slice.size()));
+  if (jkey == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  env->SetByteArrayRegion(
+      jkey, 0, static_cast<jsize>(key_slice.size()),
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(key_slice.data())));
+  return jkey;
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    keyDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)I
+ */
+jint Java_org_rocksdb_RocksIterator_keyDirect0(JNIEnv* env, jobject /*jobj*/,
+                                               jlong handle, jobject jtarget,
+                                               jint jtarget_off,
+                                               jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice key_slice = it->key();
+  return ROCKSDB_NAMESPACE::JniUtil::copyToDirect(env, key_slice, jtarget,
+                                                  jtarget_off, jtarget_len);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ *
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    keyByteArray0
+ * Signature: (J[BII)I
+ */
+jint Java_org_rocksdb_RocksIterator_keyByteArray0(JNIEnv* env, jobject /*jobj*/,
+                                                  jlong handle, jbyteArray jkey,
+                                                  jint jkey_off,
+                                                  jint jkey_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice key_slice = it->key();
+  jsize copy_size = std::min(static_cast<uint32_t>(key_slice.size()),
+                             static_cast<uint32_t>(jkey_len));
+  env->SetByteArrayRegion(
+      jkey, jkey_off, copy_size,
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(key_slice.data())));
+
+  return static_cast<jsize>(key_slice.size());
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    value0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_RocksIterator_value0(JNIEnv* env, jobject /*jobj*/,
+                                                 jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice value_slice = it->value();
+
+  jbyteArray jkeyValue =
+      env->NewByteArray(static_cast<jsize>(value_slice.size()));
+  if (jkeyValue == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  env->SetByteArrayRegion(
+      jkeyValue, 0, static_cast<jsize>(value_slice.size()),
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value_slice.data())));
+  return jkeyValue;
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    valueDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)I
+ */
+jint Java_org_rocksdb_RocksIterator_valueDirect0(JNIEnv* env, jobject /*jobj*/,
+                                                 jlong handle, jobject jtarget,
+                                                 jint jtarget_off,
+                                                 jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice value_slice = it->value();
+  return ROCKSDB_NAMESPACE::JniUtil::copyToDirect(env, value_slice, jtarget,
+                                                  jtarget_off, jtarget_len);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ *
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    valueByteArray0
+ * Signature: (J[BII)I
+ */
+jint Java_org_rocksdb_RocksIterator_valueByteArray0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jvalue_target,
+    jint jvalue_off, jint jvalue_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice value_slice = it->value();
+  jsize copy_size = std::min(static_cast<uint32_t>(value_slice.size()),
+                             static_cast<uint32_t>(jvalue_len));
+  env->SetByteArrayRegion(
+      jvalue_target, jvalue_off, copy_size,
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value_slice.data())));
+
+  return static_cast<jsize>(value_slice.size());
+}
diff --git a/src/rocksdb/java/rocksjni/jnicallback.cc b/src/rocksdb/java/rocksjni/jnicallback.cc
new file mode 100644
index 000000000..f2742cd88
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/jnicallback.cc
@@ -0,0 +1,54 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// JNI Callbacks from C++ to sub-classes or org.rocksdb.RocksCallbackObject
+
+#include "rocksjni/jnicallback.h"
+
+#include <assert.h>
+
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+JniCallback::JniCallback(JNIEnv* env, jobject jcallback_obj) {
+  // Note: jcallback_obj may be accessed by multiple threads,
+  // so we ref the jvm not the env
+  const jint rs = env->GetJavaVM(&m_jvm);
+  if (rs != JNI_OK) {
+    // exception thrown
+    return;
+  }
+
+  // Note: we may want to access the Java callback object instance
+  // across multiple method calls, so we create a global ref
+  assert(jcallback_obj != nullptr);
+  m_jcallback_obj = env->NewGlobalRef(jcallback_obj);
+  if (jcallback_obj == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+}
+
+JNIEnv* JniCallback::getJniEnv(jboolean* attached) const {
+  return JniUtil::getJniEnv(m_jvm, attached);
+}
+
+void JniCallback::releaseJniEnv(jboolean& attached) const {
+  JniUtil::releaseJniEnv(m_jvm, attached);
+}
+
+JniCallback::~JniCallback() {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  if (m_jcallback_obj != nullptr) {
+    env->DeleteGlobalRef(m_jcallback_obj);
+  }
+
+  releaseJniEnv(attached_thread);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/jnicallback.h b/src/rocksdb/java/rocksjni/jnicallback.h
new file mode 100644
index 000000000..a03a04128
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/jnicallback.h
@@ -0,0 +1,32 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// JNI Callbacks from C++ to sub-classes or org.rocksdb.RocksCallbackObject
+
+#ifndef JAVA_ROCKSJNI_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class JniCallback {
+ public:
+  JniCallback(JNIEnv* env, jobject jcallback_obj);
+  virtual ~JniCallback();
+
+  const jobject& GetJavaObject() const { return m_jcallback_obj; }
+
+ protected:
+  JavaVM* m_jvm;
+  jobject m_jcallback_obj;
+  JNIEnv* getJniEnv(jboolean* attached) const;
+  void releaseJniEnv(jboolean& attached) const;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_JNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/loggerjnicallback.cc b/src/rocksdb/java/rocksjni/loggerjnicallback.cc
new file mode 100644
index 000000000..aa9f95cd4
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/loggerjnicallback.cc
@@ -0,0 +1,299 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Logger.
+
+#include "rocksjni/loggerjnicallback.h"
+
+#include <cstdarg>
+#include <cstdio>
+
+#include "include/org_rocksdb_Logger.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+LoggerJniCallback::LoggerJniCallback(JNIEnv* env, jobject jlogger)
+    : JniCallback(env, jlogger) {
+  m_jLogMethodId = LoggerJni::getLogMethodId(env);
+  if (m_jLogMethodId == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  jobject jdebug_level = InfoLogLevelJni::DEBUG_LEVEL(env);
+  if (jdebug_level == nullptr) {
+    // exception thrown: NoSuchFieldError, ExceptionInInitializerError
+    // or OutOfMemoryError
+    return;
+  }
+  m_jdebug_level = env->NewGlobalRef(jdebug_level);
+  if (m_jdebug_level == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  jobject jinfo_level = InfoLogLevelJni::INFO_LEVEL(env);
+  if (jinfo_level == nullptr) {
+    // exception thrown: NoSuchFieldError, ExceptionInInitializerError
+    // or OutOfMemoryError
+    return;
+  }
+  m_jinfo_level = env->NewGlobalRef(jinfo_level);
+  if (m_jinfo_level == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  jobject jwarn_level = InfoLogLevelJni::WARN_LEVEL(env);
+  if (jwarn_level == nullptr) {
+    // exception thrown: NoSuchFieldError, ExceptionInInitializerError
+    // or OutOfMemoryError
+    return;
+  }
+  m_jwarn_level = env->NewGlobalRef(jwarn_level);
+  if (m_jwarn_level == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  jobject jerror_level = InfoLogLevelJni::ERROR_LEVEL(env);
+  if (jerror_level == nullptr) {
+    // exception thrown: NoSuchFieldError, ExceptionInInitializerError
+    // or OutOfMemoryError
+    return;
+  }
+  m_jerror_level = env->NewGlobalRef(jerror_level);
+  if (m_jerror_level == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  jobject jfatal_level = InfoLogLevelJni::FATAL_LEVEL(env);
+  if (jfatal_level == nullptr) {
+    // exception thrown: NoSuchFieldError, ExceptionInInitializerError
+    // or OutOfMemoryError
+    return;
+  }
+  m_jfatal_level = env->NewGlobalRef(jfatal_level);
+  if (m_jfatal_level == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  jobject jheader_level = InfoLogLevelJni::HEADER_LEVEL(env);
+  if (jheader_level == nullptr) {
+    // exception thrown: NoSuchFieldError, ExceptionInInitializerError
+    // or OutOfMemoryError
+    return;
+  }
+  m_jheader_level = env->NewGlobalRef(jheader_level);
+  if (m_jheader_level == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+}
+
+void LoggerJniCallback::Logv(const char* /*format*/, va_list /*ap*/) {
+  // We implement this method because it is virtual but we don't
+  // use it because we need to know about the log level.
+}
+
+void LoggerJniCallback::Logv(const InfoLogLevel log_level, const char* format,
+                             va_list ap) {
+  if (GetInfoLogLevel() <= log_level) {
+    // determine InfoLogLevel java enum instance
+    jobject jlog_level;
+    switch (log_level) {
+      case ROCKSDB_NAMESPACE::InfoLogLevel::DEBUG_LEVEL:
+        jlog_level = m_jdebug_level;
+        break;
+      case ROCKSDB_NAMESPACE::InfoLogLevel::INFO_LEVEL:
+        jlog_level = m_jinfo_level;
+        break;
+      case ROCKSDB_NAMESPACE::InfoLogLevel::WARN_LEVEL:
+        jlog_level = m_jwarn_level;
+        break;
+      case ROCKSDB_NAMESPACE::InfoLogLevel::ERROR_LEVEL:
+        jlog_level = m_jerror_level;
+        break;
+      case ROCKSDB_NAMESPACE::InfoLogLevel::FATAL_LEVEL:
+        jlog_level = m_jfatal_level;
+        break;
+      case ROCKSDB_NAMESPACE::InfoLogLevel::HEADER_LEVEL:
+        jlog_level = m_jheader_level;
+        break;
+      default:
+        jlog_level = m_jfatal_level;
+        break;
+    }
+
+    assert(format != nullptr);
+    const std::unique_ptr<char[]> msg = format_str(format, ap);
+
+    // pass msg to java callback handler
+    jboolean attached_thread = JNI_FALSE;
+    JNIEnv* env = getJniEnv(&attached_thread);
+    assert(env != nullptr);
+
+    jstring jmsg = env->NewStringUTF(msg.get());
+    if (jmsg == nullptr) {
+      // unable to construct string
+      if (env->ExceptionCheck()) {
+        env->ExceptionDescribe();  // print out exception to stderr
+      }
+      releaseJniEnv(attached_thread);
+      return;
+    }
+    if (env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      env->ExceptionDescribe();  // print out exception to stderr
+      env->DeleteLocalRef(jmsg);
+      releaseJniEnv(attached_thread);
+      return;
+    }
+
+    env->CallVoidMethod(m_jcallback_obj, m_jLogMethodId, jlog_level, jmsg);
+    if (env->ExceptionCheck()) {
+      // exception thrown
+      env->ExceptionDescribe();  // print out exception to stderr
+      env->DeleteLocalRef(jmsg);
+      releaseJniEnv(attached_thread);
+      return;
+    }
+
+    env->DeleteLocalRef(jmsg);
+    releaseJniEnv(attached_thread);
+  }
+}
+
+std::unique_ptr<char[]> LoggerJniCallback::format_str(const char* format,
+                                                      va_list ap) const {
+  va_list ap_copy;
+
+  va_copy(ap_copy, ap);
+  const size_t required =
+      vsnprintf(nullptr, 0, format, ap_copy) + 1;  // Extra space for '\0'
+  va_end(ap_copy);
+
+  std::unique_ptr<char[]> buf(new char[required]);
+
+  va_copy(ap_copy, ap);
+  vsnprintf(buf.get(), required, format, ap_copy);
+  va_end(ap_copy);
+
+  return buf;
+}
+LoggerJniCallback::~LoggerJniCallback() {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  if (m_jdebug_level != nullptr) {
+    env->DeleteGlobalRef(m_jdebug_level);
+  }
+
+  if (m_jinfo_level != nullptr) {
+    env->DeleteGlobalRef(m_jinfo_level);
+  }
+
+  if (m_jwarn_level != nullptr) {
+    env->DeleteGlobalRef(m_jwarn_level);
+  }
+
+  if (m_jerror_level != nullptr) {
+    env->DeleteGlobalRef(m_jerror_level);
+  }
+
+  if (m_jfatal_level != nullptr) {
+    env->DeleteGlobalRef(m_jfatal_level);
+  }
+
+  if (m_jheader_level != nullptr) {
+    env->DeleteGlobalRef(m_jheader_level);
+  }
+
+  releaseJniEnv(attached_thread);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    createNewLoggerOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Logger_createNewLoggerOptions(JNIEnv* env, jobject jobj,
+                                                     jlong joptions) {
+  auto* sptr_logger = new std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>(
+      new ROCKSDB_NAMESPACE::LoggerJniCallback(env, jobj));
+
+  // set log level
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(joptions);
+  sptr_logger->get()->SetInfoLogLevel(options->info_log_level);
+
+  return GET_CPLUSPLUS_POINTER(sptr_logger);
+}
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    createNewLoggerDbOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Logger_createNewLoggerDbOptions(JNIEnv* env,
+                                                       jobject jobj,
+                                                       jlong jdb_options) {
+  auto* sptr_logger = new std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>(
+      new ROCKSDB_NAMESPACE::LoggerJniCallback(env, jobj));
+
+  // set log level
+  auto* db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdb_options);
+  sptr_logger->get()->SetInfoLogLevel(db_options->info_log_level);
+
+  return GET_CPLUSPLUS_POINTER(sptr_logger);
+}
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    setInfoLogLevel
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Logger_setInfoLogLevel(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong jhandle, jbyte jlog_level) {
+  auto* handle =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>*>(
+          jhandle);
+  handle->get()->SetInfoLogLevel(
+      static_cast<ROCKSDB_NAMESPACE::InfoLogLevel>(jlog_level));
+}
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    infoLogLevel
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Logger_infoLogLevel(JNIEnv* /*env*/, jobject /*jobj*/,
+                                           jlong jhandle) {
+  auto* handle =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>*>(
+          jhandle);
+  return static_cast<jbyte>(handle->get()->GetInfoLogLevel());
+}
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Logger_disposeInternal(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong jhandle) {
+  auto* handle =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>*>(
+          jhandle);
+  delete handle;  // delete std::shared_ptr
+}
diff --git a/src/rocksdb/java/rocksjni/loggerjnicallback.h b/src/rocksdb/java/rocksjni/loggerjnicallback.h
new file mode 100644
index 000000000..57774988c
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/loggerjnicallback.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Logger
+
+#ifndef JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
+#define JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LoggerJniCallback : public JniCallback, public Logger {
+ public:
+  LoggerJniCallback(JNIEnv* env, jobject jLogger);
+  ~LoggerJniCallback();
+
+  using Logger::GetInfoLogLevel;
+  using Logger::SetInfoLogLevel;
+  // Write an entry to the log file with the specified format.
+  virtual void Logv(const char* format, va_list ap);
+  // Write an entry to the log file with the specified log level
+  // and format.  Any log with level under the internal log level
+  // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+  // printed.
+  virtual void Logv(const InfoLogLevel log_level, const char* format,
+                    va_list ap);
+
+ private:
+  jmethodID m_jLogMethodId;
+  jobject m_jdebug_level;
+  jobject m_jinfo_level;
+  jobject m_jwarn_level;
+  jobject m_jerror_level;
+  jobject m_jfatal_level;
+  jobject m_jheader_level;
+  std::unique_ptr<char[]> format_str(const char* format, va_list ap) const;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/lru_cache.cc b/src/rocksdb/java/rocksjni/lru_cache.cc
new file mode 100644
index 000000000..56dffa2f0
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/lru_cache.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::LRUCache.
+
+#include "cache/lru_cache.h"
+
+#include <jni.h>
+
+#include "include/org_rocksdb_LRUCache.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_LRUCache
+ * Method:    newLRUCache
+ * Signature: (JIZD)J
+ */
+jlong Java_org_rocksdb_LRUCache_newLRUCache(JNIEnv* /*env*/, jclass /*jcls*/,
+                                            jlong jcapacity,
+                                            jint jnum_shard_bits,
+                                            jboolean jstrict_capacity_limit,
+                                            jdouble jhigh_pri_pool_ratio,
+                                            jdouble jlow_pri_pool_ratio) {
+  auto* sptr_lru_cache = new std::shared_ptr<ROCKSDB_NAMESPACE::Cache>(
+      ROCKSDB_NAMESPACE::NewLRUCache(
+          static_cast<size_t>(jcapacity), static_cast<int>(jnum_shard_bits),
+          static_cast<bool>(jstrict_capacity_limit),
+          static_cast<double>(jhigh_pri_pool_ratio),
+          nullptr /* memory_allocator */, rocksdb::kDefaultToAdaptiveMutex,
+          rocksdb::kDefaultCacheMetadataChargePolicy,
+          static_cast<double>(jlow_pri_pool_ratio)));
+  return GET_CPLUSPLUS_POINTER(sptr_lru_cache);
+}
+
+/*
+ * Class:     org_rocksdb_LRUCache
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_LRUCache_disposeInternal(JNIEnv* /*env*/,
+                                               jobject /*jobj*/,
+                                               jlong jhandle) {
+  auto* sptr_lru_cache =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(jhandle);
+  delete sptr_lru_cache;  // delete std::shared_ptr
+}
diff --git a/src/rocksdb/java/rocksjni/memory_util.cc b/src/rocksdb/java/rocksjni/memory_util.cc
new file mode 100644
index 000000000..c87c4f403
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/memory_util.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/memory_util.h"
+
+#include <jni.h>
+
+#include <map>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "include/org_rocksdb_MemoryUtil.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_MemoryUtil
+ * Method:    getApproximateMemoryUsageByType
+ * Signature: ([J[J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
+    JNIEnv *env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) {
+  jboolean has_exception = JNI_FALSE;
+  std::vector<ROCKSDB_NAMESPACE::DB *> dbs =
+      ROCKSDB_NAMESPACE::JniUtil::fromJPointers<ROCKSDB_NAMESPACE::DB>(
+          env, jdb_handles, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  std::unordered_set<const ROCKSDB_NAMESPACE::Cache *> cache_set;
+  jsize cache_handle_count = env->GetArrayLength(jcache_handles);
+  if (cache_handle_count > 0) {
+    jlong *ptr_jcache_handles =
+        env->GetLongArrayElements(jcache_handles, nullptr);
+    if (ptr_jcache_handles == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+    for (jsize i = 0; i < cache_handle_count; i++) {
+      auto *cache_ptr =
+          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+              ptr_jcache_handles[i]);
+      cache_set.insert(cache_ptr->get());
+    }
+    env->ReleaseLongArrayElements(jcache_handles, ptr_jcache_handles,
+                                  JNI_ABORT);
+  }
+
+  std::map<ROCKSDB_NAMESPACE::MemoryUtil::UsageType, uint64_t> usage_by_type;
+  if (ROCKSDB_NAMESPACE::MemoryUtil::GetApproximateMemoryUsageByType(
+          dbs, cache_set, &usage_by_type) != ROCKSDB_NAMESPACE::Status::OK()) {
+    // Non-OK status
+    return nullptr;
+  }
+
+  jobject jusage_by_type = ROCKSDB_NAMESPACE::HashMapJni::construct(
+      env, static_cast<uint32_t>(usage_by_type.size()));
+  if (jusage_by_type == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+  const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+      const ROCKSDB_NAMESPACE::MemoryUtil::UsageType, const uint64_t, jobject,
+      jobject>
+      fn_map_kv = [env](
+                      const std::pair<ROCKSDB_NAMESPACE::MemoryUtil::UsageType,
+                                      uint64_t> &pair) {
+        // Construct key
+        const jobject jusage_type = ROCKSDB_NAMESPACE::ByteJni::valueOf(
+            env, ROCKSDB_NAMESPACE::MemoryUsageTypeJni::toJavaMemoryUsageType(
+                     pair.first));
+        if (jusage_type == nullptr) {
+          // an error occurred
+          return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+        }
+        // Construct value
+        const jobject jusage_value =
+            ROCKSDB_NAMESPACE::LongJni::valueOf(env, pair.second);
+        if (jusage_value == nullptr) {
+          // an error occurred
+          return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+        }
+        // Construct and return pointer to pair of jobjects
+        return std::unique_ptr<std::pair<jobject, jobject>>(
+            new std::pair<jobject, jobject>(jusage_type, jusage_value));
+      };
+
+  if (!ROCKSDB_NAMESPACE::HashMapJni::putAll(env, jusage_by_type,
+                                             usage_by_type.begin(),
+                                             usage_by_type.end(), fn_map_kv)) {
+    // exception occcurred
+    jusage_by_type = nullptr;
+  }
+
+  return jusage_by_type;
+}
diff --git a/src/rocksdb/java/rocksjni/memtablejni.cc b/src/rocksdb/java/rocksjni/memtablejni.cc
new file mode 100644
index 000000000..a4d02f354
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/memtablejni.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for MemTables.
+
+#include "include/org_rocksdb_HashLinkedListMemTableConfig.h"
+#include "include/org_rocksdb_HashSkipListMemTableConfig.h"
+#include "include/org_rocksdb_SkipListMemTableConfig.h"
+#include "include/org_rocksdb_VectorMemTableConfig.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_HashSkipListMemTableConfig
+ * Method:    newMemTableFactoryHandle
+ * Signature: (JII)J
+ */
+jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle(
+    JNIEnv* env, jobject /*jobj*/, jlong jbucket_count, jint jheight,
+    jint jbranching_factor) {
+  ROCKSDB_NAMESPACE::Status s =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jbucket_count);
+  if (s.ok()) {
+    return GET_CPLUSPLUS_POINTER(ROCKSDB_NAMESPACE::NewHashSkipListRepFactory(
+        static_cast<size_t>(jbucket_count), static_cast<int32_t>(jheight),
+        static_cast<int32_t>(jbranching_factor)));
+  }
+  ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_HashLinkedListMemTableConfig
+ * Method:    newMemTableFactoryHandle
+ * Signature: (JJIZI)J
+ */
+jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle(
+    JNIEnv* env, jobject /*jobj*/, jlong jbucket_count,
+    jlong jhuge_page_tlb_size, jint jbucket_entries_logging_threshold,
+    jboolean jif_log_bucket_dist_when_flash, jint jthreshold_use_skiplist) {
+  ROCKSDB_NAMESPACE::Status statusBucketCount =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jbucket_count);
+  ROCKSDB_NAMESPACE::Status statusHugePageTlb =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+          jhuge_page_tlb_size);
+  if (statusBucketCount.ok() && statusHugePageTlb.ok()) {
+    return GET_CPLUSPLUS_POINTER(ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(
+        static_cast<size_t>(jbucket_count),
+        static_cast<size_t>(jhuge_page_tlb_size),
+        static_cast<int32_t>(jbucket_entries_logging_threshold),
+        static_cast<bool>(jif_log_bucket_dist_when_flash),
+        static_cast<int32_t>(jthreshold_use_skiplist)));
+  }
+  ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(
+      env, !statusBucketCount.ok() ? statusBucketCount : statusHugePageTlb);
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_VectorMemTableConfig
+ * Method:    newMemTableFactoryHandle
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle(
+    JNIEnv* env, jobject /*jobj*/, jlong jreserved_size) {
+  ROCKSDB_NAMESPACE::Status s =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jreserved_size);
+  if (s.ok()) {
+    return GET_CPLUSPLUS_POINTER(new ROCKSDB_NAMESPACE::VectorRepFactory(
+        static_cast<size_t>(jreserved_size)));
+  }
+  ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_SkipListMemTableConfig
+ * Method:    newMemTableFactoryHandle0
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0(
+    JNIEnv* env, jobject /*jobj*/, jlong jlookahead) {
+  ROCKSDB_NAMESPACE::Status s =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jlookahead);
+  if (s.ok()) {
+    return GET_CPLUSPLUS_POINTER(new ROCKSDB_NAMESPACE::SkipListFactory(
+        static_cast<size_t>(jlookahead)));
+  }
+  ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  return 0;
+}
diff --git a/src/rocksdb/java/rocksjni/merge_operator.cc b/src/rocksdb/java/rocksjni/merge_operator.cc
new file mode 100644
index 000000000..ce3c5df56
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/merge_operator.cc
@@ -0,0 +1,98 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2014, Vlad Balan (vlad.gm@gmail.com).  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for ROCKSDB_NAMESPACE::MergeOperator.
+
+#include "rocksdb/merge_operator.h"
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <string>
+
+#include "include/org_rocksdb_StringAppendOperator.h"
+#include "include/org_rocksdb_UInt64AddOperator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+#include "utilities/merge_operators.h"
+
+/*
+ * Class:     org_rocksdb_StringAppendOperator
+ * Method:    newSharedStringAppendOperator
+ * Signature: (C)J
+ */
+jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator__C(
+    JNIEnv* /*env*/, jclass /*jclazz*/, jchar jdelim) {
+  auto* sptr_string_append_op =
+      new std::shared_ptr<ROCKSDB_NAMESPACE::MergeOperator>(
+          ROCKSDB_NAMESPACE::MergeOperators::CreateStringAppendOperator(
+              (char)jdelim));
+  return GET_CPLUSPLUS_POINTER(sptr_string_append_op);
+}
+
+jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator__Ljava_lang_String_2(
+    JNIEnv* env, jclass /*jclass*/, jstring jdelim) {
+  jboolean has_exception = JNI_FALSE;
+  auto delim =
+      ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jdelim, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    return 0;
+  }
+  auto* sptr_string_append_op =
+      new std::shared_ptr<ROCKSDB_NAMESPACE::MergeOperator>(
+          ROCKSDB_NAMESPACE::MergeOperators::CreateStringAppendOperator(delim));
+  return GET_CPLUSPLUS_POINTER(sptr_string_append_op);
+}
+
+/*
+ * Class:     org_rocksdb_StringAppendOperator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_StringAppendOperator_disposeInternal(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* sptr_string_append_op =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::MergeOperator>*>(
+          jhandle);
+  delete sptr_string_append_op;  // delete std::shared_ptr
+}
+
+/*
+ * Class:     org_rocksdb_UInt64AddOperator
+ * Method:    newSharedUInt64AddOperator
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_UInt64AddOperator_newSharedUInt64AddOperator(
+    JNIEnv* /*env*/, jclass /*jclazz*/) {
+  auto* sptr_uint64_add_op =
+      new std::shared_ptr<ROCKSDB_NAMESPACE::MergeOperator>(
+          ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator());
+  return GET_CPLUSPLUS_POINTER(sptr_uint64_add_op);
+}
+
+/*
+ * Class:     org_rocksdb_UInt64AddOperator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_UInt64AddOperator_disposeInternal(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle) {
+  auto* sptr_uint64_add_op =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::MergeOperator>*>(
+          jhandle);
+  delete sptr_uint64_add_op;  // delete std::shared_ptr
+}
diff --git a/src/rocksdb/java/rocksjni/native_comparator_wrapper_test.cc b/src/rocksdb/java/rocksjni/native_comparator_wrapper_test.cc
new file mode 100644
index 000000000..ac33ca22d
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/native_comparator_wrapper_test.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <jni.h>
+
+#include <string>
+
+#include "include/org_rocksdb_NativeComparatorWrapperTest_NativeStringComparatorWrapper.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class NativeComparatorWrapperTestStringComparator : public Comparator {
+  const char* Name() const {
+    return "NativeComparatorWrapperTestStringComparator";
+  }
+
+  int Compare(const Slice& a, const Slice& b) const {
+    return a.ToString().compare(b.ToString());
+  }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const {
+    return;
+  }
+
+  void FindShortSuccessor(std::string* /*key*/) const { return; }
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+/*
+ * Class: org_rocksdb_NativeComparatorWrapperTest_NativeStringComparatorWrapper
+ * Method:    newStringComparator
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_NativeComparatorWrapperTest_00024NativeStringComparatorWrapper_newStringComparator(
+    JNIEnv* /*env*/, jobject /*jobj*/) {
+  auto* comparator =
+      new ROCKSDB_NAMESPACE::NativeComparatorWrapperTestStringComparator();
+  return GET_CPLUSPLUS_POINTER(comparator);
+}
diff --git a/src/rocksdb/java/rocksjni/optimistic_transaction_db.cc b/src/rocksdb/java/rocksjni/optimistic_transaction_db.cc
new file mode 100644
index 000000000..238224f58
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/optimistic_transaction_db.cc
@@ -0,0 +1,270 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for ROCKSDB_NAMESPACE::TransactionDB.
+
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+
+#include <jni.h>
+
+#include "include/org_rocksdb_OptimisticTransactionDB.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2(
+    JNIEnv* env, jclass, jlong joptions_handle, jstring jdb_path) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(joptions_handle);
+  ROCKSDB_NAMESPACE::OptimisticTransactionDB* otdb = nullptr;
+  ROCKSDB_NAMESPACE::Status s =
+      ROCKSDB_NAMESPACE::OptimisticTransactionDB::Open(*options, db_path,
+                                                       &otdb);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  if (s.ok()) {
+    return GET_CPLUSPLUS_POINTER(otdb);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return 0;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;[[B[J)[J
+ */
+jlongArray
+Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2_3_3B_3J(
+    JNIEnv* env, jclass, jlong jdb_options_handle, jstring jdb_path,
+    jobjectArray jcolumn_names, jlongArray jcolumn_options_handles) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> column_families;
+  const jsize len_cols = env->GetArrayLength(jcolumn_names);
+  if (len_cols > 0) {
+    jlong* jco = env->GetLongArrayElements(jcolumn_options_handles, nullptr);
+    if (jco == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->ReleaseStringUTFChars(jdb_path, db_path);
+      return nullptr;
+    }
+
+    for (int i = 0; i < len_cols; i++) {
+      const jobject jcn = env->GetObjectArrayElement(jcolumn_names, i);
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+        env->ReleaseStringUTFChars(jdb_path, db_path);
+        return nullptr;
+      }
+
+      const jbyteArray jcn_ba = reinterpret_cast<jbyteArray>(jcn);
+      const jsize jcf_name_len = env->GetArrayLength(jcn_ba);
+      jbyte* jcf_name = env->GetByteArrayElements(jcn_ba, nullptr);
+      if (jcf_name == nullptr) {
+        // exception thrown: OutOfMemoryError
+        env->DeleteLocalRef(jcn);
+        env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+        env->ReleaseStringUTFChars(jdb_path, db_path);
+        return nullptr;
+      }
+
+      const std::string cf_name(reinterpret_cast<char*>(jcf_name),
+                                jcf_name_len);
+      const ROCKSDB_NAMESPACE::ColumnFamilyOptions* cf_options =
+          reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jco[i]);
+      column_families.push_back(
+          ROCKSDB_NAMESPACE::ColumnFamilyDescriptor(cf_name, *cf_options));
+
+      env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT);
+      env->DeleteLocalRef(jcn);
+    }
+    env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+  }
+
+  auto* db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdb_options_handle);
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> handles;
+  ROCKSDB_NAMESPACE::OptimisticTransactionDB* otdb = nullptr;
+  const ROCKSDB_NAMESPACE::Status s =
+      ROCKSDB_NAMESPACE::OptimisticTransactionDB::Open(
+          *db_options, db_path, column_families, &handles, &otdb);
+
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  // check if open operation was successful
+  if (s.ok()) {
+    const jsize resultsLen = 1 + len_cols;  // db handle + column family handles
+    std::unique_ptr<jlong[]> results =
+        std::unique_ptr<jlong[]>(new jlong[resultsLen]);
+    results[0] = reinterpret_cast<jlong>(otdb);
+    for (int i = 1; i <= len_cols; i++) {
+      results[i] = reinterpret_cast<jlong>(handles[i - 1]);
+    }
+
+    jlongArray jresults = env->NewLongArray(resultsLen);
+    if (jresults == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+    env->SetLongArrayRegion(jresults, 0, resultsLen, results.get());
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      return nullptr;
+    }
+    return jresults;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  return nullptr;
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_OptimisticTransactionDB_disposeInternal(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
+  assert(optimistic_txn_db != nullptr);
+  delete optimistic_txn_db;
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    closeDatabase
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_OptimisticTransactionDB_closeDatabase(JNIEnv* env, jclass,
+                                                            jlong jhandle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
+  assert(optimistic_txn_db != nullptr);
+  ROCKSDB_NAMESPACE::Status s = optimistic_txn_db->Close();
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    beginTransaction
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  ROCKSDB_NAMESPACE::Transaction* txn =
+      optimistic_txn_db->BeginTransaction(*write_options);
+  return GET_CPLUSPLUS_POINTER(txn);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    beginTransaction
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJJ(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jwrite_options_handle, jlong joptimistic_txn_options_handle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* optimistic_txn_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionOptions*>(
+          joptimistic_txn_options_handle);
+  ROCKSDB_NAMESPACE::Transaction* txn = optimistic_txn_db->BeginTransaction(
+      *write_options, *optimistic_txn_options);
+  return GET_CPLUSPLUS_POINTER(txn);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    beginTransaction_withOld
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong jold_txn_handle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* old_txn =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jold_txn_handle);
+  ROCKSDB_NAMESPACE::OptimisticTransactionOptions optimistic_txn_options;
+  ROCKSDB_NAMESPACE::Transaction* txn = optimistic_txn_db->BeginTransaction(
+      *write_options, optimistic_txn_options, old_txn);
+
+  // RocksJava relies on the assumption that
+  // we do not allocate a new Transaction object
+  // when providing an old_optimistic_txn
+  assert(txn == old_txn);
+
+  return GET_CPLUSPLUS_POINTER(txn);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    beginTransaction_withOld
+ * Signature: (JJJJ)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong joptimistic_txn_options_handle, jlong jold_txn_handle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* optimistic_txn_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionOptions*>(
+          joptimistic_txn_options_handle);
+  auto* old_txn =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jold_txn_handle);
+  ROCKSDB_NAMESPACE::Transaction* txn = optimistic_txn_db->BeginTransaction(
+      *write_options, *optimistic_txn_options, old_txn);
+
+  // RocksJava relies on the assumption that
+  // we do not allocate a new Transaction object
+  // when providing an old_optimisic_txn
+  assert(txn == old_txn);
+
+  return GET_CPLUSPLUS_POINTER(txn);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    getBaseDB
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_getBaseDB(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
+  return GET_CPLUSPLUS_POINTER(optimistic_txn_db->GetBaseDB());
+}
diff --git a/src/rocksdb/java/rocksjni/optimistic_transaction_options.cc b/src/rocksdb/java/rocksjni/optimistic_transaction_options.cc
new file mode 100644
index 000000000..501c6c4fb
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/optimistic_transaction_options.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for ROCKSDB_NAMESPACE::OptimisticTransactionOptions.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_OptimisticTransactionOptions.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    newOptimisticTransactionOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionOptions_newOptimisticTransactionOptions(
+    JNIEnv* /*env*/, jclass /*jcls*/) {
+  ROCKSDB_NAMESPACE::OptimisticTransactionOptions* opts =
+      new ROCKSDB_NAMESPACE::OptimisticTransactionOptions();
+  return GET_CPLUSPLUS_POINTER(opts);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    isSetSnapshot
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_OptimisticTransactionOptions_isSetSnapshot(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionOptions*>(
+          jhandle);
+  return opts->set_snapshot;
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    setSetSnapshot
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_OptimisticTransactionOptions_setSetSnapshot(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean jset_snapshot) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionOptions*>(
+          jhandle);
+  opts->set_snapshot = jset_snapshot;
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    setComparator
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_OptimisticTransactionOptions_setComparator(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jcomparator_handle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionOptions*>(
+          jhandle);
+  opts->cmp =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Comparator*>(jcomparator_handle);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_OptimisticTransactionOptions_disposeInternal(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionOptions*>(
+      jhandle);
+}
diff --git a/src/rocksdb/java/rocksjni/options.cc b/src/rocksdb/java/rocksjni/options.cc
new file mode 100644
index 000000000..b848ea9cf
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/options.cc
@@ -0,0 +1,8687 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Options.
+
+#include "rocksdb/options.h"
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <vector>
+
+#include "include/org_rocksdb_ColumnFamilyOptions.h"
+#include "include/org_rocksdb_ComparatorOptions.h"
+#include "include/org_rocksdb_DBOptions.h"
+#include "include/org_rocksdb_FlushOptions.h"
+#include "include/org_rocksdb_Options.h"
+#include "include/org_rocksdb_ReadOptions.h"
+#include "include/org_rocksdb_WriteOptions.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/sst_partitioner.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksjni/comparatorjnicallback.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+#include "rocksjni/statisticsjni.h"
+#include "rocksjni/table_filter_jnicallback.h"
+#include "utilities/merge_operators.h"
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    newOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_Options_newOptions__(JNIEnv*, jclass) {
+  auto* op = new ROCKSDB_NAMESPACE::Options();
+  return GET_CPLUSPLUS_POINTER(op);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    newOptions
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_Options_newOptions__JJ(JNIEnv*, jclass, jlong jdboptions,
+                                              jlong jcfoptions) {
+  auto* dbOpt =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::DBOptions*>(jdboptions);
+  auto* cfOpt = reinterpret_cast<const ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(
+      jcfoptions);
+  auto* op = new ROCKSDB_NAMESPACE::Options(*dbOpt, *cfOpt);
+  return GET_CPLUSPLUS_POINTER(op);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    copyOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_copyOptions(JNIEnv*, jclass, jlong jhandle) {
+  auto new_opt = new ROCKSDB_NAMESPACE::Options(
+      *(reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)));
+  return GET_CPLUSPLUS_POINTER(new_opt);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Options_disposeInternal(JNIEnv*, jobject, jlong handle) {
+  auto* op = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(handle);
+  assert(op != nullptr);
+  delete op;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setIncreaseParallelism
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setIncreaseParallelism(JNIEnv*, jobject,
+                                                     jlong jhandle,
+                                                     jint totalThreads) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->IncreaseParallelism(
+      static_cast<int>(totalThreads));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCreateIfMissing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setCreateIfMissing(JNIEnv*, jobject,
+                                                 jlong jhandle, jboolean flag) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->create_if_missing =
+      flag;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    createIfMissing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_createIfMissing(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->create_if_missing;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCreateMissingColumnFamilies
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setCreateMissingColumnFamilies(JNIEnv*, jobject,
+                                                             jlong jhandle,
+                                                             jboolean flag) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->create_missing_column_families = flag;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    createMissingColumnFamilies
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_createMissingColumnFamilies(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->create_missing_column_families;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setComparatorHandle
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setComparatorHandle__JI(JNIEnv*, jobject,
+                                                      jlong jhandle,
+                                                      jint builtinComparator) {
+  switch (builtinComparator) {
+    case 1:
+      reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->comparator =
+          ROCKSDB_NAMESPACE::ReverseBytewiseComparator();
+      break;
+    default:
+      reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->comparator =
+          ROCKSDB_NAMESPACE::BytewiseComparator();
+      break;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setComparatorHandle
+ * Signature: (JJB)V
+ */
+void Java_org_rocksdb_Options_setComparatorHandle__JJB(JNIEnv*, jobject,
+                                                       jlong jopt_handle,
+                                                       jlong jcomparator_handle,
+                                                       jbyte jcomparator_type) {
+  ROCKSDB_NAMESPACE::Comparator* comparator = nullptr;
+  switch (jcomparator_type) {
+    // JAVA_COMPARATOR
+    case 0x0:
+      comparator = reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback*>(
+          jcomparator_handle);
+      break;
+
+    // JAVA_NATIVE_COMPARATOR_WRAPPER
+    case 0x1:
+      comparator =
+          reinterpret_cast<ROCKSDB_NAMESPACE::Comparator*>(jcomparator_handle);
+      break;
+  }
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jopt_handle);
+  opt->comparator = comparator;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMergeOperatorName
+ * Signature: (JJjava/lang/String)V
+ */
+void Java_org_rocksdb_Options_setMergeOperatorName(JNIEnv* env, jobject,
+                                                   jlong jhandle,
+                                                   jstring jop_name) {
+  const char* op_name = env->GetStringUTFChars(jop_name, nullptr);
+  if (op_name == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  options->merge_operator =
+      ROCKSDB_NAMESPACE::MergeOperators::CreateFromStringId(op_name);
+
+  env->ReleaseStringUTFChars(jop_name, op_name);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMergeOperator
+ * Signature: (JJjava/lang/String)V
+ */
+void Java_org_rocksdb_Options_setMergeOperator(JNIEnv*, jobject, jlong jhandle,
+                                               jlong mergeOperatorHandle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->merge_operator =
+      *(reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::MergeOperator>*>(
+          mergeOperatorHandle));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompactionFilterHandle
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setCompactionFilterHandle(
+    JNIEnv*, jobject, jlong jopt_handle, jlong jcompactionfilter_handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jopt_handle)
+      ->compaction_filter =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionFilter*>(
+          jcompactionfilter_handle);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompactionFilterFactoryHandle
+ * Signature: (JJ)V
+ */
+void JNICALL Java_org_rocksdb_Options_setCompactionFilterFactoryHandle(
+    JNIEnv*, jobject, jlong jopt_handle,
+    jlong jcompactionfilterfactory_handle) {
+  auto* cff_factory = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::CompactionFilterFactory>*>(
+      jcompactionfilterfactory_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jopt_handle)
+      ->compaction_filter_factory = *cff_factory;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWriteBufferSize
+ * Signature: (JJ)I
+ */
+void Java_org_rocksdb_Options_setWriteBufferSize(JNIEnv* env, jobject,
+                                                 jlong jhandle,
+                                                 jlong jwrite_buffer_size) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      jwrite_buffer_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->write_buffer_size =
+        jwrite_buffer_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWriteBufferManager
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWriteBufferManager(
+    JNIEnv*, jobject, jlong joptions_handle,
+    jlong jwrite_buffer_manager_handle) {
+  auto* write_buffer_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::WriteBufferManager>*>(
+          jwrite_buffer_manager_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(joptions_handle)
+      ->write_buffer_manager = *write_buffer_manager;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    writeBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_writeBufferSize(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->write_buffer_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxWriteBufferNumber
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxWriteBufferNumber(
+    JNIEnv*, jobject, jlong jhandle, jint jmax_write_buffer_number) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_write_buffer_number = jmax_write_buffer_number;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setStatistics
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setStatistics(JNIEnv*, jobject, jlong jhandle,
+                                            jlong jstatistics_handle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* pSptr =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::StatisticsJni>*>(
+          jstatistics_handle);
+  opt->statistics = *pSptr;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    statistics
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_statistics(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> sptr = opt->statistics;
+  if (sptr == nullptr) {
+    return 0;
+  } else {
+    std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>* pSptr =
+        new std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>(sptr);
+    return GET_CPLUSPLUS_POINTER(pSptr);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxWriteBufferNumber
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxWriteBufferNumber(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_write_buffer_number;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    errorIfExists
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_errorIfExists(JNIEnv*, jobject,
+                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->error_if_exists;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setErrorIfExists
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setErrorIfExists(JNIEnv*, jobject, jlong jhandle,
+                                               jboolean error_if_exists) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->error_if_exists =
+      static_cast<bool>(error_if_exists);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    paranoidChecks
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_paranoidChecks(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->paranoid_checks;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setParanoidChecks
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setParanoidChecks(JNIEnv*, jobject, jlong jhandle,
+                                                jboolean paranoid_checks) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->paranoid_checks =
+      static_cast<bool>(paranoid_checks);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setEnv
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setEnv(JNIEnv*, jobject, jlong jhandle,
+                                     jlong jenv) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->env =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jenv);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxTotalWalSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxTotalWalSize(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jlong jmax_total_wal_size) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->max_total_wal_size =
+      static_cast<jlong>(jmax_total_wal_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxTotalWalSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxTotalWalSize(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_total_wal_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxOpenFiles
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxOpenFiles(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->max_open_files;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxOpenFiles
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxOpenFiles(JNIEnv*, jobject, jlong jhandle,
+                                              jint max_open_files) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->max_open_files =
+      static_cast<int>(max_open_files);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxFileOpeningThreads
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxFileOpeningThreads(
+    JNIEnv*, jobject, jlong jhandle, jint jmax_file_opening_threads) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_file_opening_threads = static_cast<int>(jmax_file_opening_threads);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxFileOpeningThreads
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxFileOpeningThreads(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<int>(opt->max_file_opening_threads);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useFsync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_useFsync(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->use_fsync;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setUseFsync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setUseFsync(JNIEnv*, jobject, jlong jhandle,
+                                          jboolean use_fsync) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->use_fsync =
+      static_cast<bool>(use_fsync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDbPaths
+ * Signature: (J[Ljava/lang/String;[J)V
+ */
+void Java_org_rocksdb_Options_setDbPaths(JNIEnv* env, jobject, jlong jhandle,
+                                         jobjectArray jpaths,
+                                         jlongArray jtarget_sizes) {
+  std::vector<ROCKSDB_NAMESPACE::DbPath> db_paths;
+  jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr);
+  if (ptr_jtarget_size == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  jboolean has_exception = JNI_FALSE;
+  const jsize len = env->GetArrayLength(jpaths);
+  for (jsize i = 0; i < len; i++) {
+    jobject jpath =
+        reinterpret_cast<jstring>(env->GetObjectArrayElement(jpaths, i));
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+    std::string path = ROCKSDB_NAMESPACE::JniUtil::copyStdString(
+        env, static_cast<jstring>(jpath), &has_exception);
+    env->DeleteLocalRef(jpath);
+
+    if (has_exception == JNI_TRUE) {
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+
+    jlong jtarget_size = ptr_jtarget_size[i];
+
+    db_paths.push_back(
+        ROCKSDB_NAMESPACE::DbPath(path, static_cast<uint64_t>(jtarget_size)));
+  }
+
+  env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->db_paths = db_paths;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    dbPathsLen
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_dbPathsLen(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->db_paths.size());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    dbPaths
+ * Signature: (J[Ljava/lang/String;[J)V
+ */
+void Java_org_rocksdb_Options_dbPaths(JNIEnv* env, jobject, jlong jhandle,
+                                      jobjectArray jpaths,
+                                      jlongArray jtarget_sizes) {
+  jboolean is_copy;
+  jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy);
+  if (ptr_jtarget_size == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  const jsize len = env->GetArrayLength(jpaths);
+  for (jsize i = 0; i < len; i++) {
+    ROCKSDB_NAMESPACE::DbPath db_path = opt->db_paths[i];
+
+    jstring jpath = env->NewStringUTF(db_path.path.c_str());
+    if (jpath == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+    env->SetObjectArrayElement(jpaths, i, jpath);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jpath);
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+
+    ptr_jtarget_size[i] = static_cast<jint>(db_path.target_size);
+  }
+
+  env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size,
+                                is_copy == JNI_TRUE ? 0 : JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    dbLogDir
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_Options_dbLogDir(JNIEnv* env, jobject, jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+          ->db_log_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDbLogDir
+ * Signature: (JLjava/lang/String)V
+ */
+void Java_org_rocksdb_Options_setDbLogDir(JNIEnv* env, jobject, jlong jhandle,
+                                          jstring jdb_log_dir) {
+  const char* log_dir = env->GetStringUTFChars(jdb_log_dir, nullptr);
+  if (log_dir == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->db_log_dir.assign(
+      log_dir);
+  env->ReleaseStringUTFChars(jdb_log_dir, log_dir);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walDir
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_Options_walDir(JNIEnv* env, jobject, jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->wal_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalDir
+ * Signature: (JLjava/lang/String)V
+ */
+void Java_org_rocksdb_Options_setWalDir(JNIEnv* env, jobject, jlong jhandle,
+                                        jstring jwal_dir) {
+  const char* wal_dir = env->GetStringUTFChars(jwal_dir, nullptr);
+  if (wal_dir == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->wal_dir.assign(
+      wal_dir);
+  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    deleteObsoleteFilesPeriodMicros
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->delete_obsolete_files_period_micros;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDeleteObsoleteFilesPeriodMicros
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros(JNIEnv*,
+                                                                 jobject,
+                                                                 jlong jhandle,
+                                                                 jlong micros) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->delete_obsolete_files_period_micros = static_cast<int64_t>(micros);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBackgroundCompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBackgroundCompactions(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_background_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBackgroundCompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBackgroundCompactions(JNIEnv*, jobject,
+                                                          jlong jhandle,
+                                                          jint max) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_background_compactions = static_cast<int>(max);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxSubcompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxSubcompactions(JNIEnv*, jobject,
+                                                   jlong jhandle, jint max) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->max_subcompactions =
+      static_cast<int32_t>(max);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxSubcompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxSubcompactions(JNIEnv*, jobject,
+                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_subcompactions;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBackgroundFlushes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBackgroundFlushes(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_background_flushes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBackgroundFlushes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBackgroundFlushes(
+    JNIEnv*, jobject, jlong jhandle, jint max_background_flushes) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_background_flushes = static_cast<int>(max_background_flushes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBackgroundJobs
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBackgroundJobs(JNIEnv*, jobject,
+                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_background_jobs;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBackgroundJobs
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBackgroundJobs(JNIEnv*, jobject,
+                                                   jlong jhandle,
+                                                   jint max_background_jobs) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->max_background_jobs =
+      static_cast<int>(max_background_jobs);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxLogFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxLogFileSize(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_log_file_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxLogFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxLogFileSize(JNIEnv* env, jobject,
+                                                jlong jhandle,
+                                                jlong max_log_file_size) {
+  auto s =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(max_log_file_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->max_log_file_size =
+        max_log_file_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    logFileTimeToRoll
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_logFileTimeToRoll(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->log_file_time_to_roll;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLogFileTimeToRoll
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setLogFileTimeToRoll(
+    JNIEnv* env, jobject, jlong jhandle, jlong log_file_time_to_roll) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      log_file_time_to_roll);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+        ->log_file_time_to_roll = log_file_time_to_roll;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    keepLogFileNum
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_keepLogFileNum(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->keep_log_file_num;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setKeepLogFileNum
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setKeepLogFileNum(JNIEnv* env, jobject,
+                                                jlong jhandle,
+                                                jlong keep_log_file_num) {
+  auto s =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(keep_log_file_num);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->keep_log_file_num =
+        keep_log_file_num;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    recycleLogFileNum
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_recycleLogFileNum(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->recycle_log_file_num;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setRecycleLogFileNum
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setRecycleLogFileNum(JNIEnv* env, jobject,
+                                                   jlong jhandle,
+                                                   jlong recycle_log_file_num) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      recycle_log_file_num);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+        ->recycle_log_file_num = recycle_log_file_num;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxManifestFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxManifestFileSize(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_manifest_file_size;
+}
+
+/*
+ * Method:    memTableFactoryName
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_Options_memTableFactoryName(JNIEnv* env, jobject,
+                                                     jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  ROCKSDB_NAMESPACE::MemTableRepFactory* tf = opt->memtable_factory.get();
+
+  // Should never be nullptr.
+  // Default memtable factory is SkipListFactory
+  assert(tf);
+
+  // temporarly fix for the historical typo
+  if (strcmp(tf->Name(), "HashLinkListRepFactory") == 0) {
+    return env->NewStringUTF("HashLinkedListRepFactory");
+  }
+
+  return env->NewStringUTF(tf->Name());
+}
+
+static std::vector<ROCKSDB_NAMESPACE::DbPath>
+rocksdb_convert_cf_paths_from_java_helper(JNIEnv* env, jobjectArray path_array,
+                                          jlongArray size_array,
+                                          jboolean* has_exception) {
+  jboolean copy_str_has_exception;
+  std::vector<std::string> paths = ROCKSDB_NAMESPACE::JniUtil::copyStrings(
+      env, path_array, &copy_str_has_exception);
+  if (JNI_TRUE == copy_str_has_exception) {
+    // Exception thrown
+    *has_exception = JNI_TRUE;
+    return {};
+  }
+
+  if (static_cast<size_t>(env->GetArrayLength(size_array)) != paths.size()) {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(
+        env,
+        ROCKSDB_NAMESPACE::Status::InvalidArgument(
+            ROCKSDB_NAMESPACE::Slice("There should be a corresponding target "
+                                     "size for every path and vice versa.")));
+    *has_exception = JNI_TRUE;
+    return {};
+  }
+
+  jlong* size_array_ptr = env->GetLongArrayElements(size_array, nullptr);
+  if (nullptr == size_array_ptr) {
+    // exception thrown: OutOfMemoryError
+    *has_exception = JNI_TRUE;
+    return {};
+  }
+  std::vector<ROCKSDB_NAMESPACE::DbPath> cf_paths;
+  for (size_t i = 0; i < paths.size(); ++i) {
+    jlong target_size = size_array_ptr[i];
+    if (target_size < 0) {
+      ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(
+          env,
+          ROCKSDB_NAMESPACE::Status::InvalidArgument(ROCKSDB_NAMESPACE::Slice(
+              "Path target size has to be positive.")));
+      *has_exception = JNI_TRUE;
+      env->ReleaseLongArrayElements(size_array, size_array_ptr, JNI_ABORT);
+      return {};
+    }
+    cf_paths.push_back(ROCKSDB_NAMESPACE::DbPath(
+        paths[i], static_cast<uint64_t>(target_size)));
+  }
+
+  env->ReleaseLongArrayElements(size_array, size_array_ptr, JNI_ABORT);
+
+  return cf_paths;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCfPaths
+ * Signature: (J[Ljava/lang/String;[J)V
+ */
+void Java_org_rocksdb_Options_setCfPaths(JNIEnv* env, jclass, jlong jhandle,
+                                         jobjectArray path_array,
+                                         jlongArray size_array) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  jboolean has_exception = JNI_FALSE;
+  std::vector<ROCKSDB_NAMESPACE::DbPath> cf_paths =
+      rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array,
+                                                &has_exception);
+  if (JNI_FALSE == has_exception) {
+    options->cf_paths = std::move(cf_paths);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    cfPathsLen
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_cfPathsLen(JNIEnv*, jclass, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->cf_paths.size());
+}
+
+template <typename T>
+static void rocksdb_convert_cf_paths_to_java_helper(JNIEnv* env, jlong jhandle,
+                                                    jobjectArray jpaths,
+                                                    jlongArray jtarget_sizes) {
+  jboolean is_copy;
+  jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy);
+  if (ptr_jtarget_size == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  auto* opt = reinterpret_cast<T*>(jhandle);
+  const jsize len = env->GetArrayLength(jpaths);
+  for (jsize i = 0; i < len; i++) {
+    ROCKSDB_NAMESPACE::DbPath cf_path = opt->cf_paths[i];
+
+    jstring jpath = env->NewStringUTF(cf_path.path.c_str());
+    if (jpath == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+    env->SetObjectArrayElement(jpaths, i, jpath);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jpath);
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+
+    ptr_jtarget_size[i] = static_cast<jint>(cf_path.target_size);
+
+    env->DeleteLocalRef(jpath);
+  }
+
+  env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size,
+                                is_copy ? 0 : JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    cfPaths
+ * Signature: (J[Ljava/lang/String;[J)V
+ */
+void Java_org_rocksdb_Options_cfPaths(JNIEnv* env, jclass, jlong jhandle,
+                                      jobjectArray jpaths,
+                                      jlongArray jtarget_sizes) {
+  rocksdb_convert_cf_paths_to_java_helper<ROCKSDB_NAMESPACE::Options>(
+      env, jhandle, jpaths, jtarget_sizes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxManifestFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxManifestFileSize(
+    JNIEnv*, jobject, jlong jhandle, jlong max_manifest_file_size) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_manifest_file_size = static_cast<int64_t>(max_manifest_file_size);
+}
+
+/*
+ * Method:    setMemTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMemTableFactory(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jlong jfactory_handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->memtable_factory.reset(
+          reinterpret_cast<ROCKSDB_NAMESPACE::MemTableRepFactory*>(
+              jfactory_handle));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setRateLimiter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setRateLimiter(JNIEnv*, jobject, jlong jhandle,
+                                             jlong jrate_limiter_handle) {
+  std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>* pRateLimiter =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
+          jrate_limiter_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->rate_limiter =
+      *pRateLimiter;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setSstFileManager
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setSstFileManager(
+    JNIEnv*, jobject, jlong jhandle, jlong jsst_file_manager_handle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jsst_file_manager_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->sst_file_manager =
+      *sptr_sst_file_manager;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLogger
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setLogger(JNIEnv*, jobject, jlong jhandle,
+                                        jlong jlogger_handle) {
+  std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>* pLogger =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>*>(
+          jlogger_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->info_log = *pLogger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setInfoLogLevel
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setInfoLogLevel(JNIEnv*, jobject, jlong jhandle,
+                                              jbyte jlog_level) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->info_log_level =
+      static_cast<ROCKSDB_NAMESPACE::InfoLogLevel>(jlog_level);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    infoLogLevel
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_infoLogLevel(JNIEnv*, jobject, jlong jhandle) {
+  return static_cast<jbyte>(
+      reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->info_log_level);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    tableCacheNumshardbits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_tableCacheNumshardbits(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->table_cache_numshardbits;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTableCacheNumshardbits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTableCacheNumshardbits(
+    JNIEnv*, jobject, jlong jhandle, jint table_cache_numshardbits) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->table_cache_numshardbits = static_cast<int>(table_cache_numshardbits);
+}
+
+/*
+ * Method:    useFixedLengthPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor(
+    JNIEnv*, jobject, jlong jhandle, jint jprefix_length) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->prefix_extractor.reset(ROCKSDB_NAMESPACE::NewFixedPrefixTransform(
+          static_cast<int>(jprefix_length)));
+}
+
+/*
+ * Method:    useCappedPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_useCappedPrefixExtractor(JNIEnv*, jobject,
+                                                       jlong jhandle,
+                                                       jint jprefix_length) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->prefix_extractor.reset(ROCKSDB_NAMESPACE::NewCappedPrefixTransform(
+          static_cast<int>(jprefix_length)));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walTtlSeconds
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_walTtlSeconds(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->WAL_ttl_seconds;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalTtlSeconds
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalTtlSeconds(JNIEnv*, jobject, jlong jhandle,
+                                               jlong WAL_ttl_seconds) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->WAL_ttl_seconds =
+      static_cast<int64_t>(WAL_ttl_seconds);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walTtlSeconds
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_walSizeLimitMB(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->WAL_size_limit_MB;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalSizeLimitMB
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalSizeLimitMB(JNIEnv*, jobject, jlong jhandle,
+                                                jlong WAL_size_limit_MB) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->WAL_size_limit_MB =
+      static_cast<int64_t>(WAL_size_limit_MB);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxWriteBatchGroupSizeBytes
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxWriteBatchGroupSizeBytes(
+    JNIEnv*, jclass, jlong jhandle, jlong jmax_write_batch_group_size_bytes) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->max_write_batch_group_size_bytes =
+      static_cast<uint64_t>(jmax_write_batch_group_size_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxWriteBatchGroupSizeBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxWriteBatchGroupSizeBytes(JNIEnv*, jclass,
+                                                           jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->max_write_batch_group_size_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    manifestPreallocationSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_manifestPreallocationSize(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->manifest_preallocation_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setManifestPreallocationSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setManifestPreallocationSize(
+    JNIEnv* env, jobject, jlong jhandle, jlong preallocation_size) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      preallocation_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+        ->manifest_preallocation_size = preallocation_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Method:    setTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setTableFactory(JNIEnv*, jobject, jlong jhandle,
+                                              jlong jtable_factory_handle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* table_factory =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TableFactory*>(jtable_factory_handle);
+  options->table_factory.reset(table_factory);
+}
+
+/*
+ * Method:    setSstPartitionerFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setSstPartitionerFactory(JNIEnv*, jobject,
+                                                       jlong jhandle,
+                                                       jlong factory_handle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto factory = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::SstPartitionerFactory>*>(
+      factory_handle);
+  options->sst_partitioner_factory = *factory;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompactionThreadLimiter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setCompactionThreadLimiter(
+    JNIEnv*, jclass, jlong jhandle, jlong jlimiter_handle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* limiter = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::ConcurrentTaskLimiter>*>(
+      jlimiter_handle);
+  options->compaction_thread_limiter = *limiter;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowMmapReads
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowMmapReads(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->allow_mmap_reads;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowMmapReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowMmapReads(JNIEnv*, jobject, jlong jhandle,
+                                                jboolean allow_mmap_reads) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->allow_mmap_reads =
+      static_cast<bool>(allow_mmap_reads);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowMmapWrites
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowMmapWrites(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->allow_mmap_writes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowMmapWrites
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowMmapWrites(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jboolean allow_mmap_writes) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->allow_mmap_writes =
+      static_cast<bool>(allow_mmap_writes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useDirectReads
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_useDirectReads(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->use_direct_reads;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setUseDirectReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setUseDirectReads(JNIEnv*, jobject, jlong jhandle,
+                                                jboolean use_direct_reads) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->use_direct_reads =
+      static_cast<bool>(use_direct_reads);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useDirectIoForFlushAndCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_useDirectIoForFlushAndCompaction(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->use_direct_io_for_flush_and_compaction;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setUseDirectIoForFlushAndCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setUseDirectIoForFlushAndCompaction(
+    JNIEnv*, jobject, jlong jhandle,
+    jboolean use_direct_io_for_flush_and_compaction) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->use_direct_io_for_flush_and_compaction =
+      static_cast<bool>(use_direct_io_for_flush_and_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowFAllocate
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowFAllocate(JNIEnv*, jobject, jlong jhandle,
+                                                jboolean jallow_fallocate) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->allow_fallocate =
+      static_cast<bool>(jallow_fallocate);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowFAllocate
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowFAllocate(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->allow_fallocate);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    isFdCloseOnExec
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_isFdCloseOnExec(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->is_fd_close_on_exec;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setIsFdCloseOnExec
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setIsFdCloseOnExec(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jboolean is_fd_close_on_exec) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->is_fd_close_on_exec =
+      static_cast<bool>(is_fd_close_on_exec);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    statsDumpPeriodSec
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_statsDumpPeriodSec(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->stats_dump_period_sec;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setStatsDumpPeriodSec
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setStatsDumpPeriodSec(
+    JNIEnv*, jobject, jlong jhandle, jint jstats_dump_period_sec) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->stats_dump_period_sec =
+      static_cast<unsigned int>(jstats_dump_period_sec);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    statsPersistPeriodSec
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_statsPersistPeriodSec(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->stats_persist_period_sec;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setStatsPersistPeriodSec
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setStatsPersistPeriodSec(
+    JNIEnv*, jobject, jlong jhandle, jint jstats_persist_period_sec) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->stats_persist_period_sec =
+      static_cast<unsigned int>(jstats_persist_period_sec);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    statsHistoryBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_statsHistoryBufferSize(JNIEnv*, jobject,
+                                                      jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->stats_history_buffer_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setStatsHistoryBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setStatsHistoryBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jstats_history_buffer_size) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->stats_history_buffer_size =
+      static_cast<size_t>(jstats_history_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    adviseRandomOnOpen
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_adviseRandomOnOpen(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->advise_random_on_open;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAdviseRandomOnOpen
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAdviseRandomOnOpen(
+    JNIEnv*, jobject, jlong jhandle, jboolean advise_random_on_open) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->advise_random_on_open = static_cast<bool>(advise_random_on_open);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDbWriteBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setDbWriteBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jdb_write_buffer_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->db_write_buffer_size = static_cast<size_t>(jdb_write_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    dbWriteBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_dbWriteBufferSize(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->db_write_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAccessHintOnCompactionStart
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setAccessHintOnCompactionStart(
+    JNIEnv*, jobject, jlong jhandle, jbyte jaccess_hint_value) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->access_hint_on_compaction_start =
+      ROCKSDB_NAMESPACE::AccessHintJni::toCppAccessHint(jaccess_hint_value);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    accessHintOnCompactionStart
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_accessHintOnCompactionStart(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return ROCKSDB_NAMESPACE::AccessHintJni::toJavaAccessHint(
+      opt->access_hint_on_compaction_start);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompactionReadaheadSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setCompactionReadaheadSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jcompaction_readahead_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->compaction_readahead_size =
+      static_cast<size_t>(jcompaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    compactionReadaheadSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_compactionReadaheadSize(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->compaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setRandomAccessMaxBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setRandomAccessMaxBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jrandom_access_max_buffer_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->random_access_max_buffer_size =
+      static_cast<size_t>(jrandom_access_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    randomAccessMaxBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_randomAccessMaxBufferSize(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->random_access_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWritableFileMaxBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWritableFileMaxBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jwritable_file_max_buffer_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->writable_file_max_buffer_size =
+      static_cast<size_t>(jwritable_file_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    writableFileMaxBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_writableFileMaxBufferSize(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->writable_file_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useAdaptiveMutex
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_useAdaptiveMutex(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->use_adaptive_mutex;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setUseAdaptiveMutex
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setUseAdaptiveMutex(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jboolean use_adaptive_mutex) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->use_adaptive_mutex =
+      static_cast<bool>(use_adaptive_mutex);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    bytesPerSync
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_bytesPerSync(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->bytes_per_sync;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBytesPerSync
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setBytesPerSync(JNIEnv*, jobject, jlong jhandle,
+                                              jlong bytes_per_sync) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->bytes_per_sync =
+      static_cast<int64_t>(bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalBytesPerSync
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalBytesPerSync(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jlong jwal_bytes_per_sync) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->wal_bytes_per_sync =
+      static_cast<int64_t>(jwal_bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walBytesPerSync
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_walBytesPerSync(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->wal_bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setStrictBytesPerSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setStrictBytesPerSync(
+    JNIEnv*, jobject, jlong jhandle, jboolean jstrict_bytes_per_sync) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->strict_bytes_per_sync = jstrict_bytes_per_sync == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    strictBytesPerSync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_strictBytesPerSync(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->strict_bytes_per_sync);
+}
+
+// Note: the RocksJava API currently only supports EventListeners implemented in
+// Java. It could be extended in future to also support adding/removing
+// EventListeners implemented in C++.
+static void rocksdb_set_event_listeners_helper(
+    JNIEnv* env, jlongArray jlistener_array,
+    std::vector<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>>&
+        listener_sptr_vec) {
+  jlong* ptr_jlistener_array =
+      env->GetLongArrayElements(jlistener_array, nullptr);
+  if (ptr_jlistener_array == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  const jsize array_size = env->GetArrayLength(jlistener_array);
+  listener_sptr_vec.clear();
+  for (jsize i = 0; i < array_size; ++i) {
+    const auto& listener_sptr =
+        *reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>*>(
+            ptr_jlistener_array[i]);
+    listener_sptr_vec.push_back(listener_sptr);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setEventListeners
+ * Signature: (J[J)V
+ */
+void Java_org_rocksdb_Options_setEventListeners(JNIEnv* env, jclass,
+                                                jlong jhandle,
+                                                jlongArray jlistener_array) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners);
+}
+
+// Note: the RocksJava API currently only supports EventListeners implemented in
+// Java. It could be extended in future to also support adding/removing
+// EventListeners implemented in C++.
+static jobjectArray rocksdb_get_event_listeners_helper(
+    JNIEnv* env,
+    const std::vector<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>>&
+        listener_sptr_vec) {
+  jsize sz = static_cast<jsize>(listener_sptr_vec.size());
+  jclass jlistener_clazz =
+      ROCKSDB_NAMESPACE::AbstractEventListenerJni::getJClass(env);
+  jobjectArray jlisteners = env->NewObjectArray(sz, jlistener_clazz, nullptr);
+  if (jlisteners == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  for (jsize i = 0; i < sz; ++i) {
+    const auto* jni_cb =
+        static_cast<ROCKSDB_NAMESPACE::EventListenerJniCallback*>(
+            listener_sptr_vec[i].get());
+    env->SetObjectArrayElement(jlisteners, i, jni_cb->GetJavaObject());
+  }
+  return jlisteners;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    eventListeners
+ * Signature: (J)[Lorg/rocksdb/AbstractEventListener;
+ */
+jobjectArray Java_org_rocksdb_Options_eventListeners(JNIEnv* env, jclass,
+                                                     jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return rocksdb_get_event_listeners_helper(env, opt->listeners);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setEnableThreadTracking
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setEnableThreadTracking(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenable_thread_tracking) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->enable_thread_tracking = static_cast<bool>(jenable_thread_tracking);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    enableThreadTracking
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_enableThreadTracking(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->enable_thread_tracking);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDelayedWriteRate
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setDelayedWriteRate(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jlong jdelayed_write_rate) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->delayed_write_rate = static_cast<uint64_t>(jdelayed_write_rate);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    delayedWriteRate
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_delayedWriteRate(JNIEnv*, jobject,
+                                                jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->delayed_write_rate);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setEnablePipelinedWrite
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setEnablePipelinedWrite(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenable_pipelined_write) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->enable_pipelined_write = jenable_pipelined_write == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    enablePipelinedWrite
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_enablePipelinedWrite(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->enable_pipelined_write);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setUnorderedWrite
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setUnorderedWrite(JNIEnv*, jobject, jlong jhandle,
+                                                jboolean unordered_write) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->unordered_write =
+      static_cast<bool>(unordered_write);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    unorderedWrite
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_unorderedWrite(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->unordered_write;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowConcurrentMemtableWrite
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowConcurrentMemtableWrite(JNIEnv*, jobject,
+                                                              jlong jhandle,
+                                                              jboolean allow) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->allow_concurrent_memtable_write = static_cast<bool>(allow);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowConcurrentMemtableWrite
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowConcurrentMemtableWrite(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->allow_concurrent_memtable_write;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setEnableWriteThreadAdaptiveYield
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setEnableWriteThreadAdaptiveYield(
+    JNIEnv*, jobject, jlong jhandle, jboolean yield) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->enable_write_thread_adaptive_yield = static_cast<bool>(yield);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    enableWriteThreadAdaptiveYield
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_enableWriteThreadAdaptiveYield(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->enable_write_thread_adaptive_yield;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWriteThreadMaxYieldUsec
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWriteThreadMaxYieldUsec(JNIEnv*, jobject,
+                                                         jlong jhandle,
+                                                         jlong max) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->write_thread_max_yield_usec = static_cast<int64_t>(max);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    writeThreadMaxYieldUsec
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_writeThreadMaxYieldUsec(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->write_thread_max_yield_usec;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWriteThreadSlowYieldUsec
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWriteThreadSlowYieldUsec(JNIEnv*, jobject,
+                                                          jlong jhandle,
+                                                          jlong slow) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->write_thread_slow_yield_usec = static_cast<int64_t>(slow);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    writeThreadSlowYieldUsec
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_writeThreadSlowYieldUsec(JNIEnv*, jobject,
+                                                        jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->write_thread_slow_yield_usec;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setSkipStatsUpdateOnDbOpen
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setSkipStatsUpdateOnDbOpen(
+    JNIEnv*, jobject, jlong jhandle, jboolean jskip_stats_update_on_db_open) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->skip_stats_update_on_db_open =
+      static_cast<bool>(jskip_stats_update_on_db_open);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    skipStatsUpdateOnDbOpen
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(JNIEnv*, jobject,
+                                                          jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setSkipCheckingSstFileSizesOnDbOpen
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen(
+    JNIEnv*, jclass, jlong jhandle,
+    jboolean jskip_checking_sst_file_sizes_on_db_open) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->skip_checking_sst_file_sizes_on_db_open =
+      static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    skipCheckingSstFileSizesOnDbOpen
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalRecoveryMode
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setWalRecoveryMode(
+    JNIEnv*, jobject, jlong jhandle, jbyte jwal_recovery_mode_value) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->wal_recovery_mode =
+      ROCKSDB_NAMESPACE::WALRecoveryModeJni::toCppWALRecoveryMode(
+          jwal_recovery_mode_value);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walRecoveryMode
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_walRecoveryMode(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return ROCKSDB_NAMESPACE::WALRecoveryModeJni::toJavaWALRecoveryMode(
+      opt->wal_recovery_mode);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllow2pc
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllow2pc(JNIEnv*, jobject, jlong jhandle,
+                                          jboolean jallow_2pc) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->allow_2pc = static_cast<bool>(jallow_2pc);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allow2pc
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allow2pc(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->allow_2pc);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setRowCache
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setRowCache(JNIEnv*, jobject, jlong jhandle,
+                                          jlong jrow_cache_handle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* row_cache =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
+          jrow_cache_handle);
+  opt->row_cache = *row_cache;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalFilter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalFilter(JNIEnv*, jobject, jlong jhandle,
+                                           jlong jwal_filter_handle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* wal_filter = reinterpret_cast<ROCKSDB_NAMESPACE::WalFilterJniCallback*>(
+      jwal_filter_handle);
+  opt->wal_filter = wal_filter;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setFailIfOptionsFileError
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setFailIfOptionsFileError(
+    JNIEnv*, jobject, jlong jhandle, jboolean jfail_if_options_file_error) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->fail_if_options_file_error =
+      static_cast<bool>(jfail_if_options_file_error);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    failIfOptionsFileError
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_failIfOptionsFileError(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->fail_if_options_file_error);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDumpMallocStats
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setDumpMallocStats(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jboolean jdump_malloc_stats) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->dump_malloc_stats = static_cast<bool>(jdump_malloc_stats);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    dumpMallocStats
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_dumpMallocStats(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->dump_malloc_stats);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAvoidFlushDuringRecovery
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAvoidFlushDuringRecovery(
+    JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_recovery) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->avoid_flush_during_recovery =
+      static_cast<bool>(javoid_flush_during_recovery);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    avoidFlushDuringRecovery
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_avoidFlushDuringRecovery(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->avoid_flush_during_recovery);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAvoidUnnecessaryBlockingIO
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAvoidUnnecessaryBlockingIO(
+    JNIEnv*, jclass, jlong jhandle, jboolean avoid_blocking_io) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->avoid_unnecessary_blocking_io = static_cast<bool>(avoid_blocking_io);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    avoidUnnecessaryBlockingIO
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_avoidUnnecessaryBlockingIO(JNIEnv*, jclass,
+                                                             jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->avoid_unnecessary_blocking_io);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setPersistStatsToDisk
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setPersistStatsToDisk(
+    JNIEnv*, jclass, jlong jhandle, jboolean persist_stats_to_disk) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->persist_stats_to_disk = static_cast<bool>(persist_stats_to_disk);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    persistStatsToDisk
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_persistStatsToDisk(JNIEnv*, jclass,
+                                                     jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->persist_stats_to_disk);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWriteDbidToManifest
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setWriteDbidToManifest(
+    JNIEnv*, jclass, jlong jhandle, jboolean jwrite_dbid_to_manifest) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->write_dbid_to_manifest = static_cast<bool>(jwrite_dbid_to_manifest);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    writeDbidToManifest
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_writeDbidToManifest(JNIEnv*, jclass,
+                                                      jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->write_dbid_to_manifest);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLogReadaheadSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setLogReadaheadSize(JNIEnv*, jclass,
+                                                  jlong jhandle,
+                                                  jlong jlog_readahead_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->log_readahead_size = static_cast<size_t>(jlog_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    logReasaheadSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_logReadaheadSize(JNIEnv*, jclass,
+                                                jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->log_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBestEffortsRecovery
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setBestEffortsRecovery(
+    JNIEnv*, jclass, jlong jhandle, jboolean jbest_efforts_recovery) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->best_efforts_recovery = static_cast<bool>(jbest_efforts_recovery);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    bestEffortsRecovery
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_bestEffortsRecovery(JNIEnv*, jclass,
+                                                      jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->best_efforts_recovery);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBgErrorResumeCount
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBgErrorResumeCount(
+    JNIEnv*, jclass, jlong jhandle, jint jmax_bgerror_resume_count) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->max_bgerror_resume_count = static_cast<int>(jmax_bgerror_resume_count);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBgerrorResumeCount
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBgerrorResumeCount(JNIEnv*, jclass,
+                                                    jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jint>(opt->max_bgerror_resume_count);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBgerrorResumeRetryInterval
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setBgerrorResumeRetryInterval(
+    JNIEnv*, jclass, jlong jhandle, jlong jbgerror_resume_retry_interval) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->bgerror_resume_retry_interval =
+      static_cast<uint64_t>(jbgerror_resume_retry_interval);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    bgerrorResumeRetryInterval
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_bgerrorResumeRetryInterval(JNIEnv*, jclass,
+                                                          jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opt->bgerror_resume_retry_interval);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAvoidFlushDuringShutdown
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAvoidFlushDuringShutdown(
+    JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_shutdown) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->avoid_flush_during_shutdown =
+      static_cast<bool>(javoid_flush_during_shutdown);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    avoidFlushDuringShutdown
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_avoidFlushDuringShutdown(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->avoid_flush_during_shutdown);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowIngestBehind
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowIngestBehind(
+    JNIEnv*, jobject, jlong jhandle, jboolean jallow_ingest_behind) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->allow_ingest_behind = jallow_ingest_behind == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowIngestBehind
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowIngestBehind(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->allow_ingest_behind);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTwoWriteQueues
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setTwoWriteQueues(JNIEnv*, jobject, jlong jhandle,
+                                                jboolean jtwo_write_queues) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->two_write_queues = jtwo_write_queues == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    twoWriteQueues
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_twoWriteQueues(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->two_write_queues);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setManualWalFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setManualWalFlush(JNIEnv*, jobject, jlong jhandle,
+                                                jboolean jmanual_wal_flush) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->manual_wal_flush = jmanual_wal_flush == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    manualWalFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_manualWalFlush(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->manual_wal_flush);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAtomicFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAtomicFlush(JNIEnv*, jobject, jlong jhandle,
+                                             jboolean jatomic_flush) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->atomic_flush = jatomic_flush == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    atomicFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_atomicFlush(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opt->atomic_flush);
+}
+
+/*
+ * Method:    tableFactoryName
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_Options_tableFactoryName(JNIEnv* env, jobject,
+                                                  jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  ROCKSDB_NAMESPACE::TableFactory* tf = opt->table_factory.get();
+
+  // Should never be nullptr.
+  // Default memtable factory is SkipListFactory
+  assert(tf);
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    minWriteBufferNumberToMerge
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge(JNIEnv*, jobject,
+                                                          jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->min_write_buffer_number_to_merge;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMinWriteBufferNumberToMerge
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge(
+    JNIEnv*, jobject, jlong jhandle, jint jmin_write_buffer_number_to_merge) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->min_write_buffer_number_to_merge =
+      static_cast<int>(jmin_write_buffer_number_to_merge);
+}
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxWriteBufferNumberToMaintain
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv*, jobject,
+                                                             jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_write_buffer_number_to_maintain;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxWriteBufferNumberToMaintain
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxWriteBufferNumberToMaintain(
+    JNIEnv*, jobject, jlong jhandle,
+    jint jmax_write_buffer_number_to_maintain) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_write_buffer_number_to_maintain =
+      static_cast<int>(jmax_write_buffer_number_to_maintain);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompressionType
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setCompressionType(
+    JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->compression =
+      ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType(
+          jcompression_type_value);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    compressionType
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_compressionType(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType(
+      opts->compression);
+}
+
+/**
+ * Helper method to convert a Java byte array of compression levels
+ * to a C++ vector of ROCKSDB_NAMESPACE::CompressionType
+ *
+ * @param env A pointer to the Java environment
+ * @param jcompression_levels A reference to a java byte array
+ *     where each byte indicates a compression level
+ *
+ * @return A std::unique_ptr to the vector, or std::unique_ptr(nullptr) if a JNI
+ * exception occurs
+ */
+std::unique_ptr<std::vector<ROCKSDB_NAMESPACE::CompressionType>>
+rocksdb_compression_vector_helper(JNIEnv* env, jbyteArray jcompression_levels) {
+  jsize len = env->GetArrayLength(jcompression_levels);
+  jbyte* jcompression_level =
+      env->GetByteArrayElements(jcompression_levels, nullptr);
+  if (jcompression_level == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return std::unique_ptr<std::vector<ROCKSDB_NAMESPACE::CompressionType>>();
+  }
+
+  auto* compression_levels =
+      new std::vector<ROCKSDB_NAMESPACE::CompressionType>();
+  std::unique_ptr<std::vector<ROCKSDB_NAMESPACE::CompressionType>>
+      uptr_compression_levels(compression_levels);
+
+  for (jsize i = 0; i < len; i++) {
+    jbyte jcl = jcompression_level[i];
+    compression_levels->push_back(
+        static_cast<ROCKSDB_NAMESPACE::CompressionType>(jcl));
+  }
+
+  env->ReleaseByteArrayElements(jcompression_levels, jcompression_level,
+                                JNI_ABORT);
+
+  return uptr_compression_levels;
+}
+
+/**
+ * Helper method to convert a C++ vector of ROCKSDB_NAMESPACE::CompressionType
+ * to a Java byte array of compression levels
+ *
+ * @param env A pointer to the Java environment
+ * @param jcompression_levels A reference to a java byte array
+ *     where each byte indicates a compression level
+ *
+ * @return A jbytearray or nullptr if an exception occurs
+ */
+jbyteArray rocksdb_compression_list_helper(
+    JNIEnv* env,
+    std::vector<ROCKSDB_NAMESPACE::CompressionType> compression_levels) {
+  const size_t len = compression_levels.size();
+  jbyte* jbuf = new jbyte[len];
+
+  for (size_t i = 0; i < len; i++) {
+    jbuf[i] = compression_levels[i];
+  }
+
+  // insert in java array
+  jbyteArray jcompression_levels = env->NewByteArray(static_cast<jsize>(len));
+  if (jcompression_levels == nullptr) {
+    // exception thrown: OutOfMemoryError
+    delete[] jbuf;
+    return nullptr;
+  }
+  env->SetByteArrayRegion(jcompression_levels, 0, static_cast<jsize>(len),
+                          jbuf);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jcompression_levels);
+    delete[] jbuf;
+    return nullptr;
+  }
+
+  delete[] jbuf;
+
+  return jcompression_levels;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompressionPerLevel
+ * Signature: (J[B)V
+ */
+void Java_org_rocksdb_Options_setCompressionPerLevel(
+    JNIEnv* env, jobject, jlong jhandle, jbyteArray jcompressionLevels) {
+  auto uptr_compression_levels =
+      rocksdb_compression_vector_helper(env, jcompressionLevels);
+  if (!uptr_compression_levels) {
+    // exception occurred
+    return;
+  }
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  options->compression_per_level = *(uptr_compression_levels.get());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    compressionPerLevel
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_Options_compressionPerLevel(JNIEnv* env, jobject,
+                                                        jlong jhandle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return rocksdb_compression_list_helper(env, options->compression_per_level);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBottommostCompressionType
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setBottommostCompressionType(
+    JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  options->bottommost_compression =
+      ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType(
+          jcompression_type_value);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    bottommostCompressionType
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_bottommostCompressionType(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType(
+      options->bottommost_compression);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBottommostCompressionOptions
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setBottommostCompressionOptions(
+    JNIEnv*, jobject, jlong jhandle,
+    jlong jbottommost_compression_options_handle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* bottommost_compression_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(
+          jbottommost_compression_options_handle);
+  options->bottommost_compression_opts = *bottommost_compression_options;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompressionOptions
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setCompressionOptions(
+    JNIEnv*, jobject, jlong jhandle, jlong jcompression_options_handle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* compression_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(
+          jcompression_options_handle);
+  options->compression_opts = *compression_options;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompactionStyle
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setCompactionStyle(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jbyte jcompaction_style) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  options->compaction_style =
+      ROCKSDB_NAMESPACE::CompactionStyleJni::toCppCompactionStyle(
+          jcompaction_style);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    compactionStyle
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_compactionStyle(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompactionStyleJni::toJavaCompactionStyle(
+      options->compaction_style);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxTableFilesSizeFIFO
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxTableFilesSizeFIFO(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_table_files_size) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->compaction_options_fifo.max_table_files_size =
+      static_cast<uint64_t>(jmax_table_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxTableFilesSizeFIFO
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->compaction_options_fifo.max_table_files_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    numLevels
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_numLevels(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->num_levels;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setNumLevels
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setNumLevels(JNIEnv*, jobject, jlong jhandle,
+                                           jint jnum_levels) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->num_levels =
+      static_cast<int>(jnum_levels);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroFileNumCompactionTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_file_num_compaction_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelZeroFileNumCompactionTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_file_num_compaction_trigger =
+      static_cast<int>(jlevel0_file_num_compaction_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroSlowdownWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger(JNIEnv*, jobject,
+                                                             jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_slowdown_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelSlowdownWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_slowdown_writes_trigger =
+      static_cast<int>(jlevel0_slowdown_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroStopWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_stop_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelStopWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_stop_writes_trigger =
+      static_cast<int>(jlevel0_stop_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    targetFileSizeBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_targetFileSizeBase(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->target_file_size_base;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTargetFileSizeBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setTargetFileSizeBase(
+    JNIEnv*, jobject, jlong jhandle, jlong jtarget_file_size_base) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->target_file_size_base = static_cast<uint64_t>(jtarget_file_size_base);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    targetFileSizeMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_targetFileSizeMultiplier(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->target_file_size_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTargetFileSizeMultiplier
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTargetFileSizeMultiplier(
+    JNIEnv*, jobject, jlong jhandle, jint jtarget_file_size_multiplier) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->target_file_size_multiplier =
+      static_cast<int>(jtarget_file_size_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBytesForLevelBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxBytesForLevelBase(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_bytes_for_level_base;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBytesForLevelBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxBytesForLevelBase(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_bytes_for_level_base) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_bytes_for_level_base =
+      static_cast<int64_t>(jmax_bytes_for_level_base);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelCompactionDynamicLevelBytes
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_levelCompactionDynamicLevelBytes(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level_compaction_dynamic_level_bytes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelCompactionDynamicLevelBytes
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setLevelCompactionDynamicLevelBytes(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenable_dynamic_level_bytes) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level_compaction_dynamic_level_bytes = (jenable_dynamic_level_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBytesForLevelMultiplier
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_Options_maxBytesForLevelMultiplier(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_bytes_for_level_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBytesForLevelMultiplier
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier(
+    JNIEnv*, jobject, jlong jhandle, jdouble jmax_bytes_for_level_multiplier) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_bytes_for_level_multiplier =
+      static_cast<double>(jmax_bytes_for_level_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxCompactionBytes
+ * Signature: (J)I
+ */
+jlong Java_org_rocksdb_Options_maxCompactionBytes(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return static_cast<jlong>(
+      reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+          ->max_compaction_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxCompactionBytes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxCompactionBytes(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_compaction_bytes) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->max_compaction_bytes =
+      static_cast<uint64_t>(jmax_compaction_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    arenaBlockSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_arenaBlockSize(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->arena_block_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setArenaBlockSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setArenaBlockSize(JNIEnv* env, jobject,
+                                                jlong jhandle,
+                                                jlong jarena_block_size) {
+  auto s =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jarena_block_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->arena_block_size =
+        jarena_block_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    disableAutoCompactions
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_disableAutoCompactions(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->disable_auto_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDisableAutoCompactions
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setDisableAutoCompactions(
+    JNIEnv*, jobject, jlong jhandle, jboolean jdisable_auto_compactions) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->disable_auto_compactions = static_cast<bool>(jdisable_auto_compactions);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxSequentialSkipInIterations
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations(JNIEnv*, jobject,
+                                                             jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_sequential_skip_in_iterations;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxSequentialSkipInIterations
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_sequential_skip_in_iterations) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_sequential_skip_in_iterations =
+      static_cast<int64_t>(jmax_sequential_skip_in_iterations);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    inplaceUpdateSupport
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_inplaceUpdateSupport(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->inplace_update_support;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setInplaceUpdateSupport
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setInplaceUpdateSupport(
+    JNIEnv*, jobject, jlong jhandle, jboolean jinplace_update_support) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->inplace_update_support = static_cast<bool>(jinplace_update_support);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    inplaceUpdateNumLocks
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->inplace_update_num_locks;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setInplaceUpdateNumLocks
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setInplaceUpdateNumLocks(
+    JNIEnv* env, jobject, jlong jhandle, jlong jinplace_update_num_locks) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      jinplace_update_num_locks);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+        ->inplace_update_num_locks = jinplace_update_num_locks;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    memtablePrefixBloomSizeRatio
+ * Signature: (J)I
+ */
+jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->memtable_prefix_bloom_size_ratio;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMemtablePrefixBloomSizeRatio
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMemtablePrefixBloomSizeRatio(
+    JNIEnv*, jobject, jlong jhandle,
+    jdouble jmemtable_prefix_bloom_size_ratio) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->memtable_prefix_bloom_size_ratio =
+      static_cast<double>(jmemtable_prefix_bloom_size_ratio);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    experimentalMempurgeThreshold
+ * Signature: (J)I
+ */
+jdouble Java_org_rocksdb_Options_experimentalMempurgeThreshold(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->experimental_mempurge_threshold;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setExperimentalMempurgeThreshold
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setExperimentalMempurgeThreshold(
+    JNIEnv*, jobject, jlong jhandle, jdouble jexperimental_mempurge_threshold) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->experimental_mempurge_threshold =
+      static_cast<double>(jexperimental_mempurge_threshold);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    memtableWholeKeyFiltering
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_memtableWholeKeyFiltering(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->memtable_whole_key_filtering;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMemtableWholeKeyFiltering
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setMemtableWholeKeyFiltering(
+    JNIEnv*, jobject, jlong jhandle, jboolean jmemtable_whole_key_filtering) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->memtable_whole_key_filtering =
+      static_cast<bool>(jmemtable_whole_key_filtering);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    bloomLocality
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_bloomLocality(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->bloom_locality;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBloomLocality
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setBloomLocality(JNIEnv*, jobject, jlong jhandle,
+                                               jint jbloom_locality) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->bloom_locality =
+      static_cast<int32_t>(jbloom_locality);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxSuccessiveMerges
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxSuccessiveMerges(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->max_successive_merges;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxSuccessiveMerges
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxSuccessiveMerges(
+    JNIEnv* env, jobject, jlong jhandle, jlong jmax_successive_merges) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      jmax_successive_merges);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+        ->max_successive_merges = jmax_successive_merges;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeFiltersForHits
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_optimizeFiltersForHits(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->optimize_filters_for_hits;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setOptimizeFiltersForHits
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setOptimizeFiltersForHits(
+    JNIEnv*, jobject, jlong jhandle, jboolean joptimize_filters_for_hits) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->optimize_filters_for_hits =
+      static_cast<bool>(joptimize_filters_for_hits);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    oldDefaults
+ * Signature: (JII)V
+ */
+void Java_org_rocksdb_Options_oldDefaults(JNIEnv*, jclass, jlong jhandle,
+                                          jint major_version,
+                                          jint minor_version) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->OldDefaults(
+      major_version, minor_version);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeForSmallDb
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Options_optimizeForSmallDb__J(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->OptimizeForSmallDb();
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeForSmallDb
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_optimizeForSmallDb__JJ(JNIEnv*, jclass,
+                                                     jlong jhandle,
+                                                     jlong cache_handle) {
+  auto* cache_sptr_ptr =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
+          cache_handle);
+  auto* options_ptr = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* cf_options_ptr =
+      static_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(options_ptr);
+  cf_options_ptr->OptimizeForSmallDb(cache_sptr_ptr);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeForPointLookup
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_optimizeForPointLookup(
+    JNIEnv*, jobject, jlong jhandle, jlong block_cache_size_mb) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->OptimizeForPointLookup(block_cache_size_mb);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeLevelStyleCompaction
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_optimizeLevelStyleCompaction(
+    JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeUniversalStyleCompaction
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_optimizeUniversalStyleCompaction(
+    JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    prepareForBulkLoad
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Options_prepareForBulkLoad(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->PrepareForBulkLoad();
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    memtableHugePageSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_memtableHugePageSize(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->memtable_huge_page_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMemtableHugePageSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMemtableHugePageSize(
+    JNIEnv* env, jobject, jlong jhandle, jlong jmemtable_huge_page_size) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      jmemtable_huge_page_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+        ->memtable_huge_page_size = jmemtable_huge_page_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    softPendingCompactionBytesLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_softPendingCompactionBytesLimit(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->soft_pending_compaction_bytes_limit;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setSoftPendingCompactionBytesLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setSoftPendingCompactionBytesLimit(
+    JNIEnv*, jobject, jlong jhandle,
+    jlong jsoft_pending_compaction_bytes_limit) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->soft_pending_compaction_bytes_limit =
+      static_cast<int64_t>(jsoft_pending_compaction_bytes_limit);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    softHardCompactionBytesLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_hardPendingCompactionBytesLimit(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->hard_pending_compaction_bytes_limit;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setHardPendingCompactionBytesLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setHardPendingCompactionBytesLimit(
+    JNIEnv*, jobject, jlong jhandle,
+    jlong jhard_pending_compaction_bytes_limit) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->hard_pending_compaction_bytes_limit =
+      static_cast<int64_t>(jhard_pending_compaction_bytes_limit);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    level0FileNumCompactionTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_level0FileNumCompactionTrigger(JNIEnv*, jobject,
+                                                             jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_file_num_compaction_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevel0FileNumCompactionTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevel0FileNumCompactionTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_file_num_compaction_trigger =
+      static_cast<int32_t>(jlevel0_file_num_compaction_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    level0SlowdownWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_level0SlowdownWritesTrigger(JNIEnv*, jobject,
+                                                          jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_slowdown_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevel0SlowdownWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevel0SlowdownWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_slowdown_writes_trigger =
+      static_cast<int32_t>(jlevel0_slowdown_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    level0StopWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_level0StopWritesTrigger(JNIEnv*, jobject,
+                                                      jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_stop_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevel0StopWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevel0StopWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->level0_stop_writes_trigger =
+      static_cast<int32_t>(jlevel0_stop_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBytesForLevelMultiplierAdditional
+ * Signature: (J)[I
+ */
+jintArray Java_org_rocksdb_Options_maxBytesForLevelMultiplierAdditional(
+    JNIEnv* env, jobject, jlong jhandle) {
+  auto mbflma = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+                    ->max_bytes_for_level_multiplier_additional;
+
+  const size_t size = mbflma.size();
+
+  jint* additionals = new jint[size];
+  for (size_t i = 0; i < size; i++) {
+    additionals[i] = static_cast<jint>(mbflma[i]);
+  }
+
+  jsize jlen = static_cast<jsize>(size);
+  jintArray result = env->NewIntArray(jlen);
+  if (result == nullptr) {
+    // exception thrown: OutOfMemoryError
+    delete[] additionals;
+    return nullptr;
+  }
+
+  env->SetIntArrayRegion(result, 0, jlen, additionals);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(result);
+    delete[] additionals;
+    return nullptr;
+  }
+
+  delete[] additionals;
+
+  return result;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBytesForLevelMultiplierAdditional
+ * Signature: (J[I)V
+ */
+void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplierAdditional(
+    JNIEnv* env, jobject, jlong jhandle,
+    jintArray jmax_bytes_for_level_multiplier_additional) {
+  jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional);
+  jint* additionals = env->GetIntArrayElements(
+      jmax_bytes_for_level_multiplier_additional, nullptr);
+  if (additionals == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opt->max_bytes_for_level_multiplier_additional.clear();
+  for (jsize i = 0; i < len; i++) {
+    opt->max_bytes_for_level_multiplier_additional.push_back(
+        static_cast<int32_t>(additionals[i]));
+  }
+
+  env->ReleaseIntArrayElements(jmax_bytes_for_level_multiplier_additional,
+                               additionals, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    paranoidFileChecks
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_paranoidFileChecks(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
+      ->paranoid_file_checks;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setParanoidFileChecks
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setParanoidFileChecks(
+    JNIEnv*, jobject, jlong jhandle, jboolean jparanoid_file_checks) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->paranoid_file_checks =
+      static_cast<bool>(jparanoid_file_checks);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompactionPriority
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setCompactionPriority(
+    JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_priority_value) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->compaction_pri =
+      ROCKSDB_NAMESPACE::CompactionPriorityJni::toCppCompactionPriority(
+          jcompaction_priority_value);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    compactionPriority
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_compactionPriority(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompactionPriorityJni::toJavaCompactionPriority(
+      opts->compaction_pri);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setReportBgIoStats
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setReportBgIoStats(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jboolean jreport_bg_io_stats) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->report_bg_io_stats = static_cast<bool>(jreport_bg_io_stats);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    reportBgIoStats
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_reportBgIoStats(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<bool>(opts->report_bg_io_stats);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTtl
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setTtl(JNIEnv*, jobject, jlong jhandle,
+                                     jlong jttl) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->ttl = static_cast<uint64_t>(jttl);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    ttl
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_ttl(JNIEnv*, jobject, jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opts->ttl);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setPeriodicCompactionSeconds
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setPeriodicCompactionSeconds(
+    JNIEnv*, jobject, jlong jhandle, jlong jperiodicCompactionSeconds) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->periodic_compaction_seconds =
+      static_cast<uint64_t>(jperiodicCompactionSeconds);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    periodicCompactionSeconds
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_periodicCompactionSeconds(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opts->periodic_compaction_seconds);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompactionOptionsUniversal
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setCompactionOptionsUniversal(
+    JNIEnv*, jobject, jlong jhandle,
+    jlong jcompaction_options_universal_handle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* opts_uni =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(
+          jcompaction_options_universal_handle);
+  opts->compaction_options_universal = *opts_uni;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompactionOptionsFIFO
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setCompactionOptionsFIFO(
+    JNIEnv*, jobject, jlong jhandle, jlong jcompaction_options_fifo_handle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  auto* opts_fifo = reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(
+      jcompaction_options_fifo_handle);
+  opts->compaction_options_fifo = *opts_fifo;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setForceConsistencyChecks
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setForceConsistencyChecks(
+    JNIEnv*, jobject, jlong jhandle, jboolean jforce_consistency_checks) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->force_consistency_checks = static_cast<bool>(jforce_consistency_checks);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    forceConsistencyChecks
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_forceConsistencyChecks(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<bool>(opts->force_consistency_checks);
+}
+
+/// BLOB options
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setEnableBlobFiles
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setEnableBlobFiles(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jboolean jenable_blob_files) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->enable_blob_files = static_cast<bool>(jenable_blob_files);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    enableBlobFiles
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_enableBlobFiles(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opts->enable_blob_files);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMinBlobSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMinBlobSize(JNIEnv*, jobject, jlong jhandle,
+                                             jlong jmin_blob_size) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->min_blob_size = static_cast<uint64_t>(jmin_blob_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    minBlobSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_minBlobSize(JNIEnv*, jobject, jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opts->min_blob_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlobFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setBlobFileSize(JNIEnv*, jobject, jlong jhandle,
+                                              jlong jblob_file_size) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->blob_file_size = static_cast<uint64_t>(jblob_file_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blobFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_blobFileSize(JNIEnv*, jobject, jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opts->blob_file_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlobCompressionType
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setBlobCompressionType(
+    JNIEnv*, jobject, jlong jhandle, jbyte jblob_compression_type_value) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->blob_compression_type =
+      ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType(
+          jblob_compression_type_value);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blobCompressionType
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_blobCompressionType(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType(
+      opts->blob_compression_type);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setEnableBlobGarbageCollection
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setEnableBlobGarbageCollection(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_garbage_collection) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->enable_blob_garbage_collection =
+      static_cast<bool>(jenable_blob_garbage_collection);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    enableBlobGarbageCollection
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_enableBlobGarbageCollection(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jboolean>(opts->enable_blob_garbage_collection);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlobGarbageCollectionAgeCutoff
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_Options_setBlobGarbageCollectionAgeCutoff(
+    JNIEnv*, jobject, jlong jhandle,
+    jdouble jblob_garbage_collection_age_cutoff) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->blob_garbage_collection_age_cutoff =
+      static_cast<double>(jblob_garbage_collection_age_cutoff);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blobGarbageCollectionAgeCutoff
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_Options_blobGarbageCollectionAgeCutoff(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jdouble>(opts->blob_garbage_collection_age_cutoff);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlobGarbageCollectionForceThreshold
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_Options_setBlobGarbageCollectionForceThreshold(
+    JNIEnv*, jobject, jlong jhandle,
+    jdouble jblob_garbage_collection_force_threshold) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->blob_garbage_collection_force_threshold =
+      static_cast<double>(jblob_garbage_collection_force_threshold);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blobGarbageCollectionForceThreshold
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_Options_blobGarbageCollectionForceThreshold(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jdouble>(opts->blob_garbage_collection_force_threshold);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlobCompactionReadaheadSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setBlobCompactionReadaheadSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jblob_compaction_readahead_size) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->blob_compaction_readahead_size =
+      static_cast<uint64_t>(jblob_compaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blobCompactionReadaheadSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_blobCompactionReadaheadSize(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jlong>(opts->blob_compaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBlobFileStartingLevel
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setBlobFileStartingLevel(
+    JNIEnv*, jobject, jlong jhandle, jint jblob_file_starting_level) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->blob_file_starting_level = jblob_file_starting_level;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    blobFileStartingLevel
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_blobFileStartingLevel(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return static_cast<jint>(opts->blob_file_starting_level);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setPrepopulateBlobCache
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setPrepopulateBlobCache(
+    JNIEnv*, jobject, jlong jhandle, jbyte jprepopulate_blob_cache_value) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  opts->prepopulate_blob_cache =
+      ROCKSDB_NAMESPACE::PrepopulateBlobCacheJni::toCppPrepopulateBlobCache(
+          jprepopulate_blob_cache_value);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    prepopulateBlobCache
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_prepopulateBlobCache(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
+  return ROCKSDB_NAMESPACE::PrepopulateBlobCacheJni::toJavaPrepopulateBlobCache(
+      opts->prepopulate_blob_cache);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::ColumnFamilyOptions
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    newColumnFamilyOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions(JNIEnv*,
+                                                                  jclass) {
+  auto* op = new ROCKSDB_NAMESPACE::ColumnFamilyOptions();
+  return GET_CPLUSPLUS_POINTER(op);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    copyColumnFamilyOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_copyColumnFamilyOptions(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto new_opt = new ROCKSDB_NAMESPACE::ColumnFamilyOptions(
+      *(reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)));
+  return GET_CPLUSPLUS_POINTER(new_opt);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    newColumnFamilyOptionsFromOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptionsFromOptions(
+    JNIEnv*, jclass, jlong joptions_handle) {
+  auto new_opt = new ROCKSDB_NAMESPACE::ColumnFamilyOptions(
+      *reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(joptions_handle));
+  return GET_CPLUSPLUS_POINTER(new_opt);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    getColumnFamilyOptionsFromProps
+ * Signature: (JLjava/lang/String;)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps__JLjava_lang_String_2(
+    JNIEnv* env, jclass, jlong cfg_handle, jstring jopt_string) {
+  const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr);
+  if (opt_string == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  auto* config_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(cfg_handle);
+  auto* cf_options = new ROCKSDB_NAMESPACE::ColumnFamilyOptions();
+  ROCKSDB_NAMESPACE::Status status =
+      ROCKSDB_NAMESPACE::GetColumnFamilyOptionsFromString(
+          *config_options, ROCKSDB_NAMESPACE::ColumnFamilyOptions(), opt_string,
+          cf_options);
+
+  env->ReleaseStringUTFChars(jopt_string, opt_string);
+
+  // Check if ColumnFamilyOptions creation was possible.
+  jlong ret_value = 0;
+  if (status.ok()) {
+    ret_value = GET_CPLUSPLUS_POINTER(cf_options);
+  } else {
+    // if operation failed the ColumnFamilyOptions need to be deleted
+    // again to prevent a memory leak.
+    delete cf_options;
+  }
+  return ret_value;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    getColumnFamilyOptionsFromProps
+ * Signature: (Ljava/util/String;)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps__Ljava_lang_String_2(
+    JNIEnv* env, jclass, jstring jopt_string) {
+  const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr);
+  if (opt_string == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+
+  auto* cf_options = new ROCKSDB_NAMESPACE::ColumnFamilyOptions();
+  ROCKSDB_NAMESPACE::Status status =
+      ROCKSDB_NAMESPACE::GetColumnFamilyOptionsFromString(
+          ROCKSDB_NAMESPACE::ColumnFamilyOptions(), opt_string, cf_options);
+
+  env->ReleaseStringUTFChars(jopt_string, opt_string);
+
+  // Check if ColumnFamilyOptions creation was possible.
+  jlong ret_value = 0;
+  if (status.ok()) {
+    ret_value = GET_CPLUSPLUS_POINTER(cf_options);
+  } else {
+    // if operation failed the ColumnFamilyOptions need to be deleted
+    // again to prevent a memory leak.
+    delete cf_options;
+  }
+  return ret_value;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_disposeInternal(JNIEnv*, jobject,
+                                                          jlong handle) {
+  auto* cfo = reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(handle);
+  assert(cfo != nullptr);
+  delete cfo;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    oldDefaults
+ * Signature: (JII)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_oldDefaults(JNIEnv*, jclass,
+                                                      jlong jhandle,
+                                                      jint major_version,
+                                                      jint minor_version) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->OldDefaults(major_version, minor_version);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeForSmallDb
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb__J(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->OptimizeForSmallDb();
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeForSmallDb
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb__JJ(
+    JNIEnv*, jclass, jlong jhandle, jlong cache_handle) {
+  auto* cache_sptr_ptr =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
+          cache_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->OptimizeForSmallDb(cache_sptr_ptr);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeForPointLookup
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_optimizeForPointLookup(
+    JNIEnv*, jobject, jlong jhandle, jlong block_cache_size_mb) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->OptimizeForPointLookup(block_cache_size_mb);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeLevelStyleCompaction
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_optimizeLevelStyleCompaction(
+    JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeUniversalStyleCompaction
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_optimizeUniversalStyleCompaction(
+    JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setComparatorHandle
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI(
+    JNIEnv*, jobject, jlong jhandle, jint builtinComparator) {
+  switch (builtinComparator) {
+    case 1:
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+          ->comparator = ROCKSDB_NAMESPACE::ReverseBytewiseComparator();
+      break;
+    default:
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+          ->comparator = ROCKSDB_NAMESPACE::BytewiseComparator();
+      break;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setComparatorHandle
+ * Signature: (JJB)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJB(
+    JNIEnv*, jobject, jlong jopt_handle, jlong jcomparator_handle,
+    jbyte jcomparator_type) {
+  ROCKSDB_NAMESPACE::Comparator* comparator = nullptr;
+  switch (jcomparator_type) {
+    // JAVA_COMPARATOR
+    case 0x0:
+      comparator = reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback*>(
+          jcomparator_handle);
+      break;
+
+    // JAVA_NATIVE_COMPARATOR_WRAPPER
+    case 0x1:
+      comparator =
+          reinterpret_cast<ROCKSDB_NAMESPACE::Comparator*>(jcomparator_handle);
+      break;
+  }
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jopt_handle);
+  opt->comparator = comparator;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMergeOperatorName
+ * Signature: (JJjava/lang/String)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperatorName(
+    JNIEnv* env, jobject, jlong jhandle, jstring jop_name) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  const char* op_name = env->GetStringUTFChars(jop_name, nullptr);
+  if (op_name == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  options->merge_operator =
+      ROCKSDB_NAMESPACE::MergeOperators::CreateFromStringId(op_name);
+  env->ReleaseStringUTFChars(jop_name, op_name);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMergeOperator
+ * Signature: (JJjava/lang/String)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperator(
+    JNIEnv*, jobject, jlong jhandle, jlong mergeOperatorHandle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->merge_operator =
+      *(reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::MergeOperator>*>(
+          mergeOperatorHandle));
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompactionFilterHandle
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterHandle(
+    JNIEnv*, jobject, jlong jopt_handle, jlong jcompactionfilter_handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jopt_handle)
+      ->compaction_filter =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionFilter*>(
+          jcompactionfilter_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompactionFilterFactoryHandle
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterFactoryHandle(
+    JNIEnv*, jobject, jlong jopt_handle,
+    jlong jcompactionfilterfactory_handle) {
+  auto* cff_factory = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::CompactionFilterFactoryJniCallback>*>(
+      jcompactionfilterfactory_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jopt_handle)
+      ->compaction_filter_factory = *cff_factory;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setWriteBufferSize
+ * Signature: (JJ)I
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setWriteBufferSize(
+    JNIEnv* env, jobject, jlong jhandle, jlong jwrite_buffer_size) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      jwrite_buffer_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+        ->write_buffer_size = jwrite_buffer_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    writeBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->write_buffer_size;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxWriteBufferNumber
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumber(
+    JNIEnv*, jobject, jlong jhandle, jint jmax_write_buffer_number) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_write_buffer_number = jmax_write_buffer_number;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxWriteBufferNumber
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_write_buffer_number;
+}
+
+/*
+ * Method:    setMemTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMemTableFactory(
+    JNIEnv*, jobject, jlong jhandle, jlong jfactory_handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->memtable_factory.reset(
+          reinterpret_cast<ROCKSDB_NAMESPACE::MemTableRepFactory*>(
+              jfactory_handle));
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    memTableFactoryName
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_ColumnFamilyOptions_memTableFactoryName(
+    JNIEnv* env, jobject, jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  ROCKSDB_NAMESPACE::MemTableRepFactory* tf = opt->memtable_factory.get();
+
+  // Should never be nullptr.
+  // Default memtable factory is SkipListFactory
+  assert(tf);
+
+  // temporarly fix for the historical typo
+  if (strcmp(tf->Name(), "HashLinkListRepFactory") == 0) {
+    return env->NewStringUTF("HashLinkedListRepFactory");
+  }
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Method:    useFixedLengthPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_useFixedLengthPrefixExtractor(
+    JNIEnv*, jobject, jlong jhandle, jint jprefix_length) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->prefix_extractor.reset(ROCKSDB_NAMESPACE::NewFixedPrefixTransform(
+          static_cast<int>(jprefix_length)));
+}
+
+/*
+ * Method:    useCappedPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_useCappedPrefixExtractor(
+    JNIEnv*, jobject, jlong jhandle, jint jprefix_length) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->prefix_extractor.reset(ROCKSDB_NAMESPACE::NewCappedPrefixTransform(
+          static_cast<int>(jprefix_length)));
+}
+
+/*
+ * Method:    setTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setTableFactory(
+    JNIEnv*, jobject, jlong jhandle, jlong jfactory_handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->table_factory.reset(
+          reinterpret_cast<ROCKSDB_NAMESPACE::TableFactory*>(jfactory_handle));
+}
+
+/*
+ * Method:    setSstPartitionerFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setSstPartitionerFactory(
+    JNIEnv*, jobject, jlong jhandle, jlong factory_handle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  auto factory = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::SstPartitionerFactory>*>(
+      factory_handle);
+  options->sst_partitioner_factory = *factory;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompactionThreadLimiter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompactionThreadLimiter(
+    JNIEnv*, jclass, jlong jhandle, jlong jlimiter_handle) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  auto* limiter = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::ConcurrentTaskLimiter>*>(
+      jlimiter_handle);
+  options->compaction_thread_limiter = *limiter;
+}
+
+/*
+ * Method:    tableFactoryName
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_ColumnFamilyOptions_tableFactoryName(JNIEnv* env,
+                                                              jobject,
+                                                              jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  ROCKSDB_NAMESPACE::TableFactory* tf = opt->table_factory.get();
+
+  // Should never be nullptr.
+  // Default memtable factory is SkipListFactory
+  assert(tf);
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCfPaths
+ * Signature: (J[Ljava/lang/String;[J)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCfPaths(JNIEnv* env, jclass,
+                                                     jlong jhandle,
+                                                     jobjectArray path_array,
+                                                     jlongArray size_array) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  jboolean has_exception = JNI_FALSE;
+  std::vector<ROCKSDB_NAMESPACE::DbPath> cf_paths =
+      rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array,
+                                                &has_exception);
+  if (JNI_FALSE == has_exception) {
+    options->cf_paths = std::move(cf_paths);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    cfPathsLen
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_cfPathsLen(JNIEnv*, jclass,
+                                                      jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jlong>(opt->cf_paths.size());
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    cfPaths
+ * Signature: (J[Ljava/lang/String;[J)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_cfPaths(JNIEnv* env, jclass,
+                                                  jlong jhandle,
+                                                  jobjectArray jpaths,
+                                                  jlongArray jtarget_sizes) {
+  rocksdb_convert_cf_paths_to_java_helper<
+      ROCKSDB_NAMESPACE::ColumnFamilyOptions>(env, jhandle, jpaths,
+                                              jtarget_sizes);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    minWriteBufferNumberToMerge
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_minWriteBufferNumberToMerge(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->min_write_buffer_number_to_merge;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMinWriteBufferNumberToMerge
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge(
+    JNIEnv*, jobject, jlong jhandle, jint jmin_write_buffer_number_to_merge) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->min_write_buffer_number_to_merge =
+      static_cast<int>(jmin_write_buffer_number_to_merge);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxWriteBufferNumberToMaintain
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumberToMaintain(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_write_buffer_number_to_maintain;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxWriteBufferNumberToMaintain
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumberToMaintain(
+    JNIEnv*, jobject, jlong jhandle,
+    jint jmax_write_buffer_number_to_maintain) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_write_buffer_number_to_maintain =
+      static_cast<int>(jmax_write_buffer_number_to_maintain);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompressionType
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompressionType(
+    JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  cf_opts->compression =
+      ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType(
+          jcompression_type_value);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    compressionType
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType(
+      cf_opts->compression);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompressionPerLevel
+ * Signature: (J[B)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompressionPerLevel(
+    JNIEnv* env, jobject, jlong jhandle, jbyteArray jcompressionLevels) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  auto uptr_compression_levels =
+      rocksdb_compression_vector_helper(env, jcompressionLevels);
+  if (!uptr_compression_levels) {
+    // exception occurred
+    return;
+  }
+  options->compression_per_level = *(uptr_compression_levels.get());
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    compressionPerLevel
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_ColumnFamilyOptions_compressionPerLevel(
+    JNIEnv* env, jobject, jlong jhandle) {
+  auto* cf_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return rocksdb_compression_list_helper(env,
+                                         cf_options->compression_per_level);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBottommostCompressionType
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBottommostCompressionType(
+    JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) {
+  auto* cf_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  cf_options->bottommost_compression =
+      ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType(
+          jcompression_type_value);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    bottommostCompressionType
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ColumnFamilyOptions_bottommostCompressionType(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* cf_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType(
+      cf_options->bottommost_compression);
+}
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBottommostCompressionOptions
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBottommostCompressionOptions(
+    JNIEnv*, jobject, jlong jhandle,
+    jlong jbottommost_compression_options_handle) {
+  auto* cf_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  auto* bottommost_compression_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(
+          jbottommost_compression_options_handle);
+  cf_options->bottommost_compression_opts = *bottommost_compression_options;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompressionOptions
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompressionOptions(
+    JNIEnv*, jobject, jlong jhandle, jlong jcompression_options_handle) {
+  auto* cf_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  auto* compression_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompressionOptions*>(
+          jcompression_options_handle);
+  cf_options->compression_opts = *compression_options;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompactionStyle
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompactionStyle(
+    JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_style) {
+  auto* cf_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  cf_options->compaction_style =
+      ROCKSDB_NAMESPACE::CompactionStyleJni::toCppCompactionStyle(
+          jcompaction_style);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    compactionStyle
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* cf_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompactionStyleJni::toJavaCompactionStyle(
+      cf_options->compaction_style);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxTableFilesSizeFIFO
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxTableFilesSizeFIFO(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_table_files_size) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->compaction_options_fifo.max_table_files_size =
+      static_cast<uint64_t>(jmax_table_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxTableFilesSizeFIFO
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxTableFilesSizeFIFO(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->compaction_options_fifo.max_table_files_size;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    numLevels
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_numLevels(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->num_levels;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setNumLevels
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setNumLevels(JNIEnv*, jobject,
+                                                       jlong jhandle,
+                                                       jint jnum_levels) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->num_levels = static_cast<int>(jnum_levels);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    levelZeroFileNumCompactionTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroFileNumCompactionTrigger(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_file_num_compaction_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevelZeroFileNumCompactionTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroFileNumCompactionTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_file_num_compaction_trigger =
+      static_cast<int>(jlevel0_file_num_compaction_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    levelZeroSlowdownWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroSlowdownWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_slowdown_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevelSlowdownWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroSlowdownWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_slowdown_writes_trigger =
+      static_cast<int>(jlevel0_slowdown_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    levelZeroStopWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroStopWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_stop_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevelStopWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroStopWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_stop_writes_trigger =
+      static_cast<int>(jlevel0_stop_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    targetFileSizeBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->target_file_size_base;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setTargetFileSizeBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeBase(
+    JNIEnv*, jobject, jlong jhandle, jlong jtarget_file_size_base) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->target_file_size_base = static_cast<uint64_t>(jtarget_file_size_base);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    targetFileSizeMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeMultiplier(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->target_file_size_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setTargetFileSizeMultiplier
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeMultiplier(
+    JNIEnv*, jobject, jlong jhandle, jint jtarget_file_size_multiplier) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->target_file_size_multiplier =
+      static_cast<int>(jtarget_file_size_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxBytesForLevelBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_bytes_for_level_base;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxBytesForLevelBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelBase(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_bytes_for_level_base) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_bytes_for_level_base =
+      static_cast<int64_t>(jmax_bytes_for_level_base);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    levelCompactionDynamicLevelBytes
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_levelCompactionDynamicLevelBytes(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level_compaction_dynamic_level_bytes;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevelCompactionDynamicLevelBytes
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevelCompactionDynamicLevelBytes(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenable_dynamic_level_bytes) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level_compaction_dynamic_level_bytes = (jenable_dynamic_level_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxBytesForLevelMultiplier
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplier(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_bytes_for_level_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxBytesForLevelMultiplier
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplier(
+    JNIEnv*, jobject, jlong jhandle, jdouble jmax_bytes_for_level_multiplier) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_bytes_for_level_multiplier =
+      static_cast<double>(jmax_bytes_for_level_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxCompactionBytes
+ * Signature: (J)I
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxCompactionBytes(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  return static_cast<jlong>(
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+          ->max_compaction_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxCompactionBytes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxCompactionBytes(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_compaction_bytes) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_compaction_bytes = static_cast<uint64_t>(jmax_compaction_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    arenaBlockSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize(JNIEnv*, jobject,
+                                                          jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->arena_block_size;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setArenaBlockSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setArenaBlockSize(
+    JNIEnv* env, jobject, jlong jhandle, jlong jarena_block_size) {
+  auto s =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jarena_block_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+        ->arena_block_size = jarena_block_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    disableAutoCompactions
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_disableAutoCompactions(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->disable_auto_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setDisableAutoCompactions
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setDisableAutoCompactions(
+    JNIEnv*, jobject, jlong jhandle, jboolean jdisable_auto_compactions) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->disable_auto_compactions = static_cast<bool>(jdisable_auto_compactions);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxSequentialSkipInIterations
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxSequentialSkipInIterations(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_sequential_skip_in_iterations;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxSequentialSkipInIterations
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxSequentialSkipInIterations(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_sequential_skip_in_iterations) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_sequential_skip_in_iterations =
+      static_cast<int64_t>(jmax_sequential_skip_in_iterations);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    inplaceUpdateSupport
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateSupport(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->inplace_update_support;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setInplaceUpdateSupport
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateSupport(
+    JNIEnv*, jobject, jlong jhandle, jboolean jinplace_update_support) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->inplace_update_support = static_cast<bool>(jinplace_update_support);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    inplaceUpdateNumLocks
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateNumLocks(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->inplace_update_num_locks;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setInplaceUpdateNumLocks
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateNumLocks(
+    JNIEnv* env, jobject, jlong jhandle, jlong jinplace_update_num_locks) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      jinplace_update_num_locks);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+        ->inplace_update_num_locks = jinplace_update_num_locks;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    memtablePrefixBloomSizeRatio
+ * Signature: (J)I
+ */
+jdouble Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomSizeRatio(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->memtable_prefix_bloom_size_ratio;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMemtablePrefixBloomSizeRatio
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomSizeRatio(
+    JNIEnv*, jobject, jlong jhandle,
+    jdouble jmemtable_prefix_bloom_size_ratio) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->memtable_prefix_bloom_size_ratio =
+      static_cast<double>(jmemtable_prefix_bloom_size_ratio);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    experimentalMempurgeThreshold
+ * Signature: (J)I
+ */
+jdouble Java_org_rocksdb_ColumnFamilyOptions_experimentalMempurgeThreshold(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->experimental_mempurge_threshold;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setExperimentalMempurgeThreshold
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setExperimentalMempurgeThreshold(
+    JNIEnv*, jobject, jlong jhandle, jdouble jexperimental_mempurge_threshold) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->experimental_mempurge_threshold =
+      static_cast<double>(jexperimental_mempurge_threshold);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    memtableWholeKeyFiltering
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_memtableWholeKeyFiltering(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->memtable_whole_key_filtering;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMemtableWholeKeyFiltering
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMemtableWholeKeyFiltering(
+    JNIEnv*, jobject, jlong jhandle, jboolean jmemtable_whole_key_filtering) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->memtable_whole_key_filtering =
+      static_cast<bool>(jmemtable_whole_key_filtering);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    bloomLocality
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality(JNIEnv*, jobject,
+                                                        jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->bloom_locality;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBloomLocality
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBloomLocality(
+    JNIEnv*, jobject, jlong jhandle, jint jbloom_locality) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->bloom_locality = static_cast<int32_t>(jbloom_locality);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxSuccessiveMerges
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->max_successive_merges;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxSuccessiveMerges
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxSuccessiveMerges(
+    JNIEnv* env, jobject, jlong jhandle, jlong jmax_successive_merges) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      jmax_successive_merges);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+        ->max_successive_merges = jmax_successive_merges;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeFiltersForHits
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_optimizeFiltersForHits(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->optimize_filters_for_hits;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setOptimizeFiltersForHits
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setOptimizeFiltersForHits(
+    JNIEnv*, jobject, jlong jhandle, jboolean joptimize_filters_for_hits) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->optimize_filters_for_hits =
+      static_cast<bool>(joptimize_filters_for_hits);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    memtableHugePageSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_memtableHugePageSize(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->memtable_huge_page_size;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMemtableHugePageSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMemtableHugePageSize(
+    JNIEnv* env, jobject, jlong jhandle, jlong jmemtable_huge_page_size) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      jmemtable_huge_page_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+        ->memtable_huge_page_size = jmemtable_huge_page_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    softPendingCompactionBytesLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_softPendingCompactionBytesLimit(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->soft_pending_compaction_bytes_limit;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setSoftPendingCompactionBytesLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setSoftPendingCompactionBytesLimit(
+    JNIEnv*, jobject, jlong jhandle,
+    jlong jsoft_pending_compaction_bytes_limit) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->soft_pending_compaction_bytes_limit =
+      static_cast<int64_t>(jsoft_pending_compaction_bytes_limit);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    softHardCompactionBytesLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_hardPendingCompactionBytesLimit(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->hard_pending_compaction_bytes_limit;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setHardPendingCompactionBytesLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setHardPendingCompactionBytesLimit(
+    JNIEnv*, jobject, jlong jhandle,
+    jlong jhard_pending_compaction_bytes_limit) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->hard_pending_compaction_bytes_limit =
+      static_cast<int64_t>(jhard_pending_compaction_bytes_limit);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    level0FileNumCompactionTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_level0FileNumCompactionTrigger(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_file_num_compaction_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevel0FileNumCompactionTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevel0FileNumCompactionTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_file_num_compaction_trigger =
+      static_cast<int32_t>(jlevel0_file_num_compaction_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    level0SlowdownWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_level0SlowdownWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_slowdown_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevel0SlowdownWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevel0SlowdownWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_slowdown_writes_trigger =
+      static_cast<int32_t>(jlevel0_slowdown_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    level0StopWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_level0StopWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_stop_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevel0StopWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevel0StopWritesTrigger(
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->level0_stop_writes_trigger =
+      static_cast<int32_t>(jlevel0_stop_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxBytesForLevelMultiplierAdditional
+ * Signature: (J)[I
+ */
+jintArray
+Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplierAdditional(
+    JNIEnv* env, jobject, jlong jhandle) {
+  auto mbflma =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+          ->max_bytes_for_level_multiplier_additional;
+
+  const size_t size = mbflma.size();
+
+  jint* additionals = new jint[size];
+  for (size_t i = 0; i < size; i++) {
+    additionals[i] = static_cast<jint>(mbflma[i]);
+  }
+
+  jsize jlen = static_cast<jsize>(size);
+  jintArray result = env->NewIntArray(jlen);
+  if (result == nullptr) {
+    // exception thrown: OutOfMemoryError
+    delete[] additionals;
+    return nullptr;
+  }
+  env->SetIntArrayRegion(result, 0, jlen, additionals);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(result);
+    delete[] additionals;
+    return nullptr;
+  }
+
+  delete[] additionals;
+
+  return result;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxBytesForLevelMultiplierAdditional
+ * Signature: (J[I)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplierAdditional(
+    JNIEnv* env, jobject, jlong jhandle,
+    jintArray jmax_bytes_for_level_multiplier_additional) {
+  jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional);
+  jint* additionals = env->GetIntArrayElements(
+      jmax_bytes_for_level_multiplier_additional, nullptr);
+  if (additionals == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  auto* cf_opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  cf_opt->max_bytes_for_level_multiplier_additional.clear();
+  for (jsize i = 0; i < len; i++) {
+    cf_opt->max_bytes_for_level_multiplier_additional.push_back(
+        static_cast<int32_t>(additionals[i]));
+  }
+
+  env->ReleaseIntArrayElements(jmax_bytes_for_level_multiplier_additional,
+                               additionals, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    paranoidFileChecks
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_paranoidFileChecks(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->paranoid_file_checks;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setParanoidFileChecks
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setParanoidFileChecks(
+    JNIEnv*, jobject, jlong jhandle, jboolean jparanoid_file_checks) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
+      ->paranoid_file_checks = static_cast<bool>(jparanoid_file_checks);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompactionPriority
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompactionPriority(
+    JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_priority_value) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  cf_opts->compaction_pri =
+      ROCKSDB_NAMESPACE::CompactionPriorityJni::toCppCompactionPriority(
+          jcompaction_priority_value);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    compactionPriority
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionPriority(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompactionPriorityJni::toJavaCompactionPriority(
+      cf_opts->compaction_pri);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setReportBgIoStats
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setReportBgIoStats(
+    JNIEnv*, jobject, jlong jhandle, jboolean jreport_bg_io_stats) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  cf_opts->report_bg_io_stats = static_cast<bool>(jreport_bg_io_stats);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    reportBgIoStats
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_reportBgIoStats(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<bool>(cf_opts->report_bg_io_stats);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setTtl
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setTtl(JNIEnv*, jobject,
+                                                 jlong jhandle, jlong jttl) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  cf_opts->ttl = static_cast<uint64_t>(jttl);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    ttl
+ * Signature: (J)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_rocksdb_ColumnFamilyOptions_ttl(JNIEnv*, jobject, jlong jhandle) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jlong>(cf_opts->ttl);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setPeriodicCompactionSeconds
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setPeriodicCompactionSeconds(
+    JNIEnv*, jobject, jlong jhandle, jlong jperiodicCompactionSeconds) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  cf_opts->periodic_compaction_seconds =
+      static_cast<uint64_t>(jperiodicCompactionSeconds);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    periodicCompactionSeconds
+ * Signature: (J)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_rocksdb_ColumnFamilyOptions_periodicCompactionSeconds(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jlong>(cf_opts->periodic_compaction_seconds);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompactionOptionsUniversal
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsUniversal(
+    JNIEnv*, jobject, jlong jhandle,
+    jlong jcompaction_options_universal_handle) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  auto* opts_uni =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(
+          jcompaction_options_universal_handle);
+  cf_opts->compaction_options_universal = *opts_uni;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompactionOptionsFIFO
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsFIFO(
+    JNIEnv*, jobject, jlong jhandle, jlong jcompaction_options_fifo_handle) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  auto* opts_fifo = reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(
+      jcompaction_options_fifo_handle);
+  cf_opts->compaction_options_fifo = *opts_fifo;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setForceConsistencyChecks
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setForceConsistencyChecks(
+    JNIEnv*, jobject, jlong jhandle, jboolean jforce_consistency_checks) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  cf_opts->force_consistency_checks =
+      static_cast<bool>(jforce_consistency_checks);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    forceConsistencyChecks
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_forceConsistencyChecks(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* cf_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jboolean>(cf_opts->force_consistency_checks);
+}
+
+/// BLOB options
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setEnableBlobFiles
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobFiles(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_files) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->enable_blob_files = static_cast<bool>(jenable_blob_files);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    enableBlobFiles
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobFiles(JNIEnv*, jobject,
+                                                              jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jboolean>(opts->enable_blob_files);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMinBlobSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMinBlobSize(JNIEnv*, jobject,
+                                                         jlong jhandle,
+                                                         jlong jmin_blob_size) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->min_blob_size = static_cast<uint64_t>(jmin_blob_size);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    minBlobSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_minBlobSize(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jlong>(opts->min_blob_size);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBlobFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBlobFileSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jblob_file_size) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->blob_file_size = static_cast<uint64_t>(jblob_file_size);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    blobFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_blobFileSize(JNIEnv*, jobject,
+                                                        jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jlong>(opts->blob_file_size);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBlobCompressionType
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBlobCompressionType(
+    JNIEnv*, jobject, jlong jhandle, jbyte jblob_compression_type_value) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->blob_compression_type =
+      ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType(
+          jblob_compression_type_value);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    blobCompressionType
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ColumnFamilyOptions_blobCompressionType(JNIEnv*, jobject,
+                                                               jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType(
+      opts->blob_compression_type);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setEnableBlobGarbageCollection
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobGarbageCollection(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_garbage_collection) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->enable_blob_garbage_collection =
+      static_cast<bool>(jenable_blob_garbage_collection);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    enableBlobGarbageCollection
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobGarbageCollection(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jboolean>(opts->enable_blob_garbage_collection);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBlobGarbageCollectionAgeCutoff
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionAgeCutoff(
+    JNIEnv*, jobject, jlong jhandle,
+    jdouble jblob_garbage_collection_age_cutoff) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->blob_garbage_collection_age_cutoff =
+      static_cast<double>(jblob_garbage_collection_age_cutoff);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    blobGarbageCollectionAgeCutoff
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionAgeCutoff(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jdouble>(opts->blob_garbage_collection_age_cutoff);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBlobGarbageCollectionForceThreshold
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionForceThreshold(
+    JNIEnv*, jobject, jlong jhandle,
+    jdouble jblob_garbage_collection_force_threshold) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->blob_garbage_collection_force_threshold =
+      static_cast<double>(jblob_garbage_collection_force_threshold);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    blobGarbageCollectionForceThreshold
+ * Signature: (J)D
+ */
+jdouble
+Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionForceThreshold(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jdouble>(opts->blob_garbage_collection_force_threshold);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBlobCompactionReadaheadSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBlobCompactionReadaheadSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jblob_compaction_readahead_size) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->blob_compaction_readahead_size =
+      static_cast<uint64_t>(jblob_compaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    blobCompactionReadaheadSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_blobCompactionReadaheadSize(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jlong>(opts->blob_compaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBlobFileStartingLevel
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBlobFileStartingLevel(
+    JNIEnv*, jobject, jlong jhandle, jint jblob_file_starting_level) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->blob_file_starting_level = jblob_file_starting_level;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    blobFileStartingLevel
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_blobFileStartingLevel(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return static_cast<jint>(opts->blob_file_starting_level);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setPrepopulateBlobCache
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setPrepopulateBlobCache(
+    JNIEnv*, jobject, jlong jhandle, jbyte jprepopulate_blob_cache_value) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  opts->prepopulate_blob_cache =
+      ROCKSDB_NAMESPACE::PrepopulateBlobCacheJni::toCppPrepopulateBlobCache(
+          jprepopulate_blob_cache_value);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    prepopulateBlobCache
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ColumnFamilyOptions_prepopulateBlobCache(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::PrepopulateBlobCacheJni::toJavaPrepopulateBlobCache(
+      opts->prepopulate_blob_cache);
+}
+
+/////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::DBOptions
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    newDBOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv*, jclass) {
+  auto* dbop = new ROCKSDB_NAMESPACE::DBOptions();
+  return GET_CPLUSPLUS_POINTER(dbop);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    copyDBOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_copyDBOptions(JNIEnv*, jclass, jlong jhandle) {
+  auto new_opt = new ROCKSDB_NAMESPACE::DBOptions(
+      *(reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)));
+  return GET_CPLUSPLUS_POINTER(new_opt);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    newDBOptionsFromOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_newDBOptionsFromOptions(
+    JNIEnv*, jclass, jlong joptions_handle) {
+  auto new_opt = new ROCKSDB_NAMESPACE::DBOptions(
+      *reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(joptions_handle));
+  return GET_CPLUSPLUS_POINTER(new_opt);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    getDBOptionsFromProps
+ * Signature: (JLjava/lang/String;)J
+ */
+jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps__JLjava_lang_String_2(
+    JNIEnv* env, jclass, jlong config_handle, jstring jopt_string) {
+  const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr);
+  if (opt_string == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+
+  auto* config_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(config_handle);
+  auto* db_options = new ROCKSDB_NAMESPACE::DBOptions();
+  ROCKSDB_NAMESPACE::Status status = ROCKSDB_NAMESPACE::GetDBOptionsFromString(
+      *config_options, ROCKSDB_NAMESPACE::DBOptions(), opt_string, db_options);
+
+  env->ReleaseStringUTFChars(jopt_string, opt_string);
+
+  // Check if DBOptions creation was possible.
+  jlong ret_value = 0;
+  if (status.ok()) {
+    ret_value = GET_CPLUSPLUS_POINTER(db_options);
+  } else {
+    // if operation failed the DBOptions need to be deleted
+    // again to prevent a memory leak.
+    delete db_options;
+  }
+  return ret_value;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    getDBOptionsFromProps
+ * Signature: (Ljava/util/String;)J
+ */
+jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps__Ljava_lang_String_2(
+    JNIEnv* env, jclass, jstring jopt_string) {
+  const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr);
+  if (opt_string == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+
+  auto* db_options = new ROCKSDB_NAMESPACE::DBOptions();
+  ROCKSDB_NAMESPACE::Status status = ROCKSDB_NAMESPACE::GetDBOptionsFromString(
+      ROCKSDB_NAMESPACE::DBOptions(), opt_string, db_options);
+
+  env->ReleaseStringUTFChars(jopt_string, opt_string);
+
+  // Check if DBOptions creation was possible.
+  jlong ret_value = 0;
+  if (status.ok()) {
+    ret_value = GET_CPLUSPLUS_POINTER(db_options);
+  } else {
+    // if operation failed the DBOptions need to be deleted
+    // again to prevent a memory leak.
+    delete db_options;
+  }
+  return ret_value;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_DBOptions_disposeInternal(JNIEnv*, jobject,
+                                                jlong handle) {
+  auto* dbo = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(handle);
+  assert(dbo != nullptr);
+  delete dbo;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    optimizeForSmallDb
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_DBOptions_optimizeForSmallDb(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->OptimizeForSmallDb();
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setEnv
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setEnv(JNIEnv*, jobject, jlong jhandle,
+                                       jlong jenv_handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->env =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jenv_handle);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setIncreaseParallelism
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setIncreaseParallelism(JNIEnv*, jobject,
+                                                       jlong jhandle,
+                                                       jint totalThreads) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->IncreaseParallelism(
+      static_cast<int>(totalThreads));
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setCreateIfMissing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setCreateIfMissing(JNIEnv*, jobject,
+                                                   jlong jhandle,
+                                                   jboolean flag) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->create_if_missing =
+      flag;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    createIfMissing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_createIfMissing(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->create_if_missing;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setCreateMissingColumnFamilies
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies(JNIEnv*, jobject,
+                                                               jlong jhandle,
+                                                               jboolean flag) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->create_missing_column_families = flag;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    createMissingColumnFamilies
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->create_missing_column_families;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setErrorIfExists
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setErrorIfExists(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jboolean error_if_exists) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->error_if_exists =
+      static_cast<bool>(error_if_exists);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    errorIfExists
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_errorIfExists(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->error_if_exists;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setParanoidChecks
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setParanoidChecks(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jboolean paranoid_checks) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->paranoid_checks =
+      static_cast<bool>(paranoid_checks);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    paranoidChecks
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_paranoidChecks(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->paranoid_checks;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setRateLimiter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setRateLimiter(JNIEnv*, jobject, jlong jhandle,
+                                               jlong jrate_limiter_handle) {
+  std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>* pRateLimiter =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
+          jrate_limiter_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->rate_limiter =
+      *pRateLimiter;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setSstFileManager
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setSstFileManager(
+    JNIEnv*, jobject, jlong jhandle, jlong jsst_file_manager_handle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jsst_file_manager_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->sst_file_manager =
+      *sptr_sst_file_manager;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setLogger
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setLogger(JNIEnv*, jobject, jlong jhandle,
+                                          jlong jlogger_handle) {
+  std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>* pLogger =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>*>(
+          jlogger_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->info_log = *pLogger;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setInfoLogLevel
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_DBOptions_setInfoLogLevel(JNIEnv*, jobject, jlong jhandle,
+                                                jbyte jlog_level) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->info_log_level =
+      static_cast<ROCKSDB_NAMESPACE::InfoLogLevel>(jlog_level);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    infoLogLevel
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_DBOptions_infoLogLevel(JNIEnv*, jobject, jlong jhandle) {
+  return static_cast<jbyte>(
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->info_log_level);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxTotalWalSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxTotalWalSize(JNIEnv*, jobject,
+                                                   jlong jhandle,
+                                                   jlong jmax_total_wal_size) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->max_total_wal_size =
+      static_cast<jlong>(jmax_total_wal_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxTotalWalSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_maxTotalWalSize(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_total_wal_size;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxOpenFiles
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxOpenFiles(JNIEnv*, jobject, jlong jhandle,
+                                                jint max_open_files) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->max_open_files =
+      static_cast<int>(max_open_files);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxOpenFiles
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxOpenFiles(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_open_files;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxFileOpeningThreads
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxFileOpeningThreads(
+    JNIEnv*, jobject, jlong jhandle, jint jmax_file_opening_threads) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_file_opening_threads = static_cast<int>(jmax_file_opening_threads);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxFileOpeningThreads
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxFileOpeningThreads(JNIEnv*, jobject,
+                                                      jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<int>(opt->max_file_opening_threads);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setStatistics
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setStatistics(JNIEnv*, jobject, jlong jhandle,
+                                              jlong jstatistics_handle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  auto* pSptr =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::StatisticsJni>*>(
+          jstatistics_handle);
+  opt->statistics = *pSptr;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    statistics
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_statistics(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> sptr = opt->statistics;
+  if (sptr == nullptr) {
+    return 0;
+  } else {
+    std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>* pSptr =
+        new std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>(sptr);
+    return GET_CPLUSPLUS_POINTER(pSptr);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setUseFsync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setUseFsync(JNIEnv*, jobject, jlong jhandle,
+                                            jboolean use_fsync) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->use_fsync =
+      static_cast<bool>(use_fsync);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    useFsync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_useFsync(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->use_fsync;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setDbPaths
+ * Signature: (J[Ljava/lang/String;[J)V
+ */
+void Java_org_rocksdb_DBOptions_setDbPaths(JNIEnv* env, jobject, jlong jhandle,
+                                           jobjectArray jpaths,
+                                           jlongArray jtarget_sizes) {
+  std::vector<ROCKSDB_NAMESPACE::DbPath> db_paths;
+  jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr);
+  if (ptr_jtarget_size == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  jboolean has_exception = JNI_FALSE;
+  const jsize len = env->GetArrayLength(jpaths);
+  for (jsize i = 0; i < len; i++) {
+    jobject jpath =
+        reinterpret_cast<jstring>(env->GetObjectArrayElement(jpaths, i));
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+    std::string path = ROCKSDB_NAMESPACE::JniUtil::copyStdString(
+        env, static_cast<jstring>(jpath), &has_exception);
+    env->DeleteLocalRef(jpath);
+
+    if (has_exception == JNI_TRUE) {
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+
+    jlong jtarget_size = ptr_jtarget_size[i];
+
+    db_paths.push_back(
+        ROCKSDB_NAMESPACE::DbPath(path, static_cast<uint64_t>(jtarget_size)));
+  }
+
+  env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->db_paths = db_paths;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    dbPathsLen
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_dbPathsLen(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->db_paths.size());
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    dbPaths
+ * Signature: (J[Ljava/lang/String;[J)V
+ */
+void Java_org_rocksdb_DBOptions_dbPaths(JNIEnv* env, jobject, jlong jhandle,
+                                        jobjectArray jpaths,
+                                        jlongArray jtarget_sizes) {
+  jboolean is_copy;
+  jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy);
+  if (ptr_jtarget_size == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  const jsize len = env->GetArrayLength(jpaths);
+  for (jsize i = 0; i < len; i++) {
+    ROCKSDB_NAMESPACE::DbPath db_path = opt->db_paths[i];
+
+    jstring jpath = env->NewStringUTF(db_path.path.c_str());
+    if (jpath == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+    env->SetObjectArrayElement(jpaths, i, jpath);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jpath);
+      env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT);
+      return;
+    }
+
+    ptr_jtarget_size[i] = static_cast<jint>(db_path.target_size);
+  }
+
+  env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size,
+                                is_copy == JNI_TRUE ? 0 : JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setDbLogDir
+ * Signature: (JLjava/lang/String)V
+ */
+void Java_org_rocksdb_DBOptions_setDbLogDir(JNIEnv* env, jobject, jlong jhandle,
+                                            jstring jdb_log_dir) {
+  const char* log_dir = env->GetStringUTFChars(jdb_log_dir, nullptr);
+  if (log_dir == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->db_log_dir.assign(
+      log_dir);
+  env->ReleaseStringUTFChars(jdb_log_dir, log_dir);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    dbLogDir
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_DBOptions_dbLogDir(JNIEnv* env, jobject,
+                                            jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+          ->db_log_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWalDir
+ * Signature: (JLjava/lang/String)V
+ */
+void Java_org_rocksdb_DBOptions_setWalDir(JNIEnv* env, jobject, jlong jhandle,
+                                          jstring jwal_dir) {
+  const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0);
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->wal_dir.assign(
+      wal_dir);
+  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    walDir
+ * Signature: (J)Ljava/lang/String
+ */
+jstring Java_org_rocksdb_DBOptions_walDir(JNIEnv* env, jobject, jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+          ->wal_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setDeleteObsoleteFilesPeriodMicros
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setDeleteObsoleteFilesPeriodMicros(
+    JNIEnv*, jobject, jlong jhandle, jlong micros) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->delete_obsolete_files_period_micros = static_cast<int64_t>(micros);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    deleteObsoleteFilesPeriodMicros
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_deleteObsoleteFilesPeriodMicros(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->delete_obsolete_files_period_micros;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxBackgroundCompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions(JNIEnv*, jobject,
+                                                            jlong jhandle,
+                                                            jint max) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_background_compactions = static_cast<int>(max);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxBackgroundCompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_background_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxSubcompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxSubcompactions(JNIEnv*, jobject,
+                                                     jlong jhandle, jint max) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->max_subcompactions =
+      static_cast<int32_t>(max);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxSubcompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxSubcompactions(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_subcompactions;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxBackgroundFlushes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxBackgroundFlushes(
+    JNIEnv*, jobject, jlong jhandle, jint max_background_flushes) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_background_flushes = static_cast<int>(max_background_flushes);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxBackgroundFlushes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_background_flushes;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxBackgroundJobs
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxBackgroundJobs(JNIEnv*, jobject,
+                                                     jlong jhandle,
+                                                     jint max_background_jobs) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_background_jobs = static_cast<int>(max_background_jobs);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxBackgroundJobs
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxBackgroundJobs(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_background_jobs;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxLogFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxLogFileSize(JNIEnv* env, jobject,
+                                                  jlong jhandle,
+                                                  jlong max_log_file_size) {
+  auto s =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(max_log_file_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+        ->max_log_file_size = max_log_file_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxLogFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_maxLogFileSize(JNIEnv*, jobject,
+                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_log_file_size;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setLogFileTimeToRoll
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setLogFileTimeToRoll(
+    JNIEnv* env, jobject, jlong jhandle, jlong log_file_time_to_roll) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      log_file_time_to_roll);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+        ->log_file_time_to_roll = log_file_time_to_roll;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    logFileTimeToRoll
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->log_file_time_to_roll;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setKeepLogFileNum
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setKeepLogFileNum(JNIEnv* env, jobject,
+                                                  jlong jhandle,
+                                                  jlong keep_log_file_num) {
+  auto s =
+      ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(keep_log_file_num);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+        ->keep_log_file_num = keep_log_file_num;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    keepLogFileNum
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_keepLogFileNum(JNIEnv*, jobject,
+                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->keep_log_file_num;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setRecycleLogFileNum
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setRecycleLogFileNum(
+    JNIEnv* env, jobject, jlong jhandle, jlong recycle_log_file_num) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      recycle_log_file_num);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+        ->recycle_log_file_num = recycle_log_file_num;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    recycleLogFileNum
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->recycle_log_file_num;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxManifestFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxManifestFileSize(
+    JNIEnv*, jobject, jlong jhandle, jlong max_manifest_file_size) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_manifest_file_size = static_cast<int64_t>(max_manifest_file_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxManifestFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_maxManifestFileSize(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->max_manifest_file_size;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setTableCacheNumshardbits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setTableCacheNumshardbits(
+    JNIEnv*, jobject, jlong jhandle, jint table_cache_numshardbits) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->table_cache_numshardbits = static_cast<int>(table_cache_numshardbits);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    tableCacheNumshardbits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->table_cache_numshardbits;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWalTtlSeconds
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWalTtlSeconds(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jlong WAL_ttl_seconds) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->WAL_ttl_seconds =
+      static_cast<int64_t>(WAL_ttl_seconds);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    walTtlSeconds
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_walTtlSeconds(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->WAL_ttl_seconds;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWalSizeLimitMB
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWalSizeLimitMB(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jlong WAL_size_limit_MB) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->WAL_size_limit_MB =
+      static_cast<int64_t>(WAL_size_limit_MB);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    walTtlSeconds
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_walSizeLimitMB(JNIEnv*, jobject,
+                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->WAL_size_limit_MB;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxWriteBatchGroupSizeBytes
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxWriteBatchGroupSizeBytes(
+    JNIEnv*, jclass, jlong jhandle, jlong jmax_write_batch_group_size_bytes) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->max_write_batch_group_size_bytes =
+      static_cast<uint64_t>(jmax_write_batch_group_size_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxWriteBatchGroupSizeBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_maxWriteBatchGroupSizeBytes(JNIEnv*, jclass,
+                                                             jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->max_write_batch_group_size_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setManifestPreallocationSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setManifestPreallocationSize(
+    JNIEnv* env, jobject, jlong jhandle, jlong preallocation_size) {
+  auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(
+      preallocation_size);
+  if (s.ok()) {
+    reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+        ->manifest_preallocation_size = preallocation_size;
+  } else {
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    manifestPreallocationSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->manifest_preallocation_size;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    useDirectReads
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_useDirectReads(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->use_direct_reads;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setUseDirectReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setUseDirectReads(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jboolean use_direct_reads) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->use_direct_reads =
+      static_cast<bool>(use_direct_reads);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    useDirectIoForFlushAndCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_useDirectIoForFlushAndCompaction(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->use_direct_io_for_flush_and_compaction;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setUseDirectReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setUseDirectIoForFlushAndCompaction(
+    JNIEnv*, jobject, jlong jhandle,
+    jboolean use_direct_io_for_flush_and_compaction) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->use_direct_io_for_flush_and_compaction =
+      static_cast<bool>(use_direct_io_for_flush_and_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAllowFAllocate
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAllowFAllocate(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jboolean jallow_fallocate) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->allow_fallocate =
+      static_cast<bool>(jallow_fallocate);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    allowFAllocate
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_allowFAllocate(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->allow_fallocate);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAllowMmapReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAllowMmapReads(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jboolean allow_mmap_reads) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->allow_mmap_reads =
+      static_cast<bool>(allow_mmap_reads);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    allowMmapReads
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_allowMmapReads(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->allow_mmap_reads;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAllowMmapWrites
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAllowMmapWrites(JNIEnv*, jobject,
+                                                   jlong jhandle,
+                                                   jboolean allow_mmap_writes) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->allow_mmap_writes =
+      static_cast<bool>(allow_mmap_writes);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    allowMmapWrites
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_allowMmapWrites(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->allow_mmap_writes;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setIsFdCloseOnExec
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setIsFdCloseOnExec(
+    JNIEnv*, jobject, jlong jhandle, jboolean is_fd_close_on_exec) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->is_fd_close_on_exec = static_cast<bool>(is_fd_close_on_exec);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    isFdCloseOnExec
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->is_fd_close_on_exec;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setStatsDumpPeriodSec
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setStatsDumpPeriodSec(
+    JNIEnv*, jobject, jlong jhandle, jint jstats_dump_period_sec) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->stats_dump_period_sec =
+      static_cast<unsigned int>(jstats_dump_period_sec);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    statsDumpPeriodSec
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->stats_dump_period_sec;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setStatsPersistPeriodSec
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setStatsPersistPeriodSec(
+    JNIEnv*, jobject, jlong jhandle, jint jstats_persist_period_sec) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->stats_persist_period_sec =
+      static_cast<unsigned int>(jstats_persist_period_sec);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    statsPersistPeriodSec
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_statsPersistPeriodSec(JNIEnv*, jobject,
+                                                      jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->stats_persist_period_sec;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setStatsHistoryBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setStatsHistoryBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jstats_history_buffer_size) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->stats_history_buffer_size =
+      static_cast<size_t>(jstats_history_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    statsHistoryBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_statsHistoryBufferSize(JNIEnv*, jobject,
+                                                        jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->stats_history_buffer_size;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAdviseRandomOnOpen
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAdviseRandomOnOpen(
+    JNIEnv*, jobject, jlong jhandle, jboolean advise_random_on_open) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->advise_random_on_open = static_cast<bool>(advise_random_on_open);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    adviseRandomOnOpen
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->advise_random_on_open;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setDbWriteBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setDbWriteBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jdb_write_buffer_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->db_write_buffer_size = static_cast<size_t>(jdb_write_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWriteBufferManager
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWriteBufferManager(
+    JNIEnv*, jobject, jlong jdb_options_handle,
+    jlong jwrite_buffer_manager_handle) {
+  auto* write_buffer_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::WriteBufferManager>*>(
+          jwrite_buffer_manager_handle);
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdb_options_handle)
+      ->write_buffer_manager = *write_buffer_manager;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    dbWriteBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_dbWriteBufferSize(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->db_write_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAccessHintOnCompactionStart
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_DBOptions_setAccessHintOnCompactionStart(
+    JNIEnv*, jobject, jlong jhandle, jbyte jaccess_hint_value) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->access_hint_on_compaction_start =
+      ROCKSDB_NAMESPACE::AccessHintJni::toCppAccessHint(jaccess_hint_value);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    accessHintOnCompactionStart
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_DBOptions_accessHintOnCompactionStart(JNIEnv*, jobject,
+                                                             jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::AccessHintJni::toJavaAccessHint(
+      opt->access_hint_on_compaction_start);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setCompactionReadaheadSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setCompactionReadaheadSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jcompaction_readahead_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->compaction_readahead_size =
+      static_cast<size_t>(jcompaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    compactionReadaheadSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_compactionReadaheadSize(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->compaction_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setRandomAccessMaxBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setRandomAccessMaxBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jrandom_access_max_buffer_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->random_access_max_buffer_size =
+      static_cast<size_t>(jrandom_access_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    randomAccessMaxBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_randomAccessMaxBufferSize(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->random_access_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWritableFileMaxBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWritableFileMaxBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jlong jwritable_file_max_buffer_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->writable_file_max_buffer_size =
+      static_cast<size_t>(jwritable_file_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    writableFileMaxBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_writableFileMaxBufferSize(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->writable_file_max_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setUseAdaptiveMutex
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setUseAdaptiveMutex(
+    JNIEnv*, jobject, jlong jhandle, jboolean use_adaptive_mutex) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->use_adaptive_mutex =
+      static_cast<bool>(use_adaptive_mutex);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    useAdaptiveMutex
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->use_adaptive_mutex;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setBytesPerSync
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setBytesPerSync(JNIEnv*, jobject, jlong jhandle,
+                                                jlong bytes_per_sync) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->bytes_per_sync =
+      static_cast<int64_t>(bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    bytesPerSync
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_bytesPerSync(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->bytes_per_sync;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWalBytesPerSync
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWalBytesPerSync(JNIEnv*, jobject,
+                                                   jlong jhandle,
+                                                   jlong jwal_bytes_per_sync) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->wal_bytes_per_sync =
+      static_cast<int64_t>(jwal_bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    walBytesPerSync
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_walBytesPerSync(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->wal_bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setStrictBytesPerSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setStrictBytesPerSync(
+    JNIEnv*, jobject, jlong jhandle, jboolean jstrict_bytes_per_sync) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->strict_bytes_per_sync = jstrict_bytes_per_sync == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    strictBytesPerSync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_strictBytesPerSync(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  return static_cast<jboolean>(
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+          ->strict_bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setEventListeners
+ * Signature: (J[J)V
+ */
+void Java_org_rocksdb_DBOptions_setEventListeners(JNIEnv* env, jclass,
+                                                  jlong jhandle,
+                                                  jlongArray jlistener_array) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    eventListeners
+ * Signature: (J)[Lorg/rocksdb/AbstractEventListener;
+ */
+jobjectArray Java_org_rocksdb_DBOptions_eventListeners(JNIEnv* env, jclass,
+                                                       jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return rocksdb_get_event_listeners_helper(env, opt->listeners);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setDelayedWriteRate
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setDelayedWriteRate(JNIEnv*, jobject,
+                                                    jlong jhandle,
+                                                    jlong jdelayed_write_rate) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->delayed_write_rate = static_cast<uint64_t>(jdelayed_write_rate);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    delayedWriteRate
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_delayedWriteRate(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->delayed_write_rate);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setEnablePipelinedWrite
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setEnablePipelinedWrite(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenable_pipelined_write) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->enable_pipelined_write = jenable_pipelined_write == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    enablePipelinedWrite
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->enable_pipelined_write);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setUnorderedWrite
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setUnorderedWrite(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jboolean junordered_write) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->unordered_write = junordered_write == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    unorderedWrite
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_unorderedWrite(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->unordered_write);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setEnableThreadTracking
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setEnableThreadTracking(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenable_thread_tracking) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->enable_thread_tracking = jenable_thread_tracking == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    enableThreadTracking
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_enableThreadTracking(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->enable_thread_tracking);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAllowConcurrentMemtableWrite
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAllowConcurrentMemtableWrite(
+    JNIEnv*, jobject, jlong jhandle, jboolean allow) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->allow_concurrent_memtable_write = static_cast<bool>(allow);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    allowConcurrentMemtableWrite
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_allowConcurrentMemtableWrite(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->allow_concurrent_memtable_write;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setEnableWriteThreadAdaptiveYield
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setEnableWriteThreadAdaptiveYield(
+    JNIEnv*, jobject, jlong jhandle, jboolean yield) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->enable_write_thread_adaptive_yield = static_cast<bool>(yield);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    enableWriteThreadAdaptiveYield
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_enableWriteThreadAdaptiveYield(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->enable_write_thread_adaptive_yield;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWriteThreadMaxYieldUsec
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWriteThreadMaxYieldUsec(JNIEnv*, jobject,
+                                                           jlong jhandle,
+                                                           jlong max) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->write_thread_max_yield_usec = static_cast<int64_t>(max);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    writeThreadMaxYieldUsec
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_writeThreadMaxYieldUsec(JNIEnv*, jobject,
+                                                         jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->write_thread_max_yield_usec;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWriteThreadSlowYieldUsec
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWriteThreadSlowYieldUsec(JNIEnv*, jobject,
+                                                            jlong jhandle,
+                                                            jlong slow) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->write_thread_slow_yield_usec = static_cast<int64_t>(slow);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    writeThreadSlowYieldUsec
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_writeThreadSlowYieldUsec(JNIEnv*, jobject,
+                                                          jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
+      ->write_thread_slow_yield_usec;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setSkipStatsUpdateOnDbOpen
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setSkipStatsUpdateOnDbOpen(
+    JNIEnv*, jobject, jlong jhandle, jboolean jskip_stats_update_on_db_open) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->skip_stats_update_on_db_open =
+      static_cast<bool>(jskip_stats_update_on_db_open);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    skipStatsUpdateOnDbOpen
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setSkipCheckingSstFileSizesOnDbOpen
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen(
+    JNIEnv*, jclass, jlong jhandle,
+    jboolean jskip_checking_sst_file_sizes_on_db_open) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->skip_checking_sst_file_sizes_on_db_open =
+      static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    skipCheckingSstFileSizesOnDbOpen
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWalRecoveryMode
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_DBOptions_setWalRecoveryMode(
+    JNIEnv*, jobject, jlong jhandle, jbyte jwal_recovery_mode_value) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->wal_recovery_mode =
+      ROCKSDB_NAMESPACE::WALRecoveryModeJni::toCppWALRecoveryMode(
+          jwal_recovery_mode_value);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    walRecoveryMode
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_DBOptions_walRecoveryMode(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::WALRecoveryModeJni::toJavaWALRecoveryMode(
+      opt->wal_recovery_mode);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAllow2pc
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAllow2pc(JNIEnv*, jobject, jlong jhandle,
+                                            jboolean jallow_2pc) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->allow_2pc = static_cast<bool>(jallow_2pc);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    allow2pc
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_allow2pc(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->allow_2pc);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setRowCache
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setRowCache(JNIEnv*, jobject, jlong jhandle,
+                                            jlong jrow_cache_handle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  auto* row_cache =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
+          jrow_cache_handle);
+  opt->row_cache = *row_cache;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWalFilter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWalFilter(JNIEnv*, jobject, jlong jhandle,
+                                             jlong jwal_filter_handle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  auto* wal_filter = reinterpret_cast<ROCKSDB_NAMESPACE::WalFilterJniCallback*>(
+      jwal_filter_handle);
+  opt->wal_filter = wal_filter;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setFailIfOptionsFileError
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setFailIfOptionsFileError(
+    JNIEnv*, jobject, jlong jhandle, jboolean jfail_if_options_file_error) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->fail_if_options_file_error =
+      static_cast<bool>(jfail_if_options_file_error);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    failIfOptionsFileError
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->fail_if_options_file_error);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setDumpMallocStats
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setDumpMallocStats(
+    JNIEnv*, jobject, jlong jhandle, jboolean jdump_malloc_stats) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->dump_malloc_stats = static_cast<bool>(jdump_malloc_stats);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    dumpMallocStats
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_dumpMallocStats(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->dump_malloc_stats);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAvoidFlushDuringRecovery
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAvoidFlushDuringRecovery(
+    JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_recovery) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->avoid_flush_during_recovery =
+      static_cast<bool>(javoid_flush_during_recovery);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    avoidFlushDuringRecovery
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringRecovery(JNIEnv*, jobject,
+                                                             jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->avoid_flush_during_recovery);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAllowIngestBehind
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAllowIngestBehind(
+    JNIEnv*, jobject, jlong jhandle, jboolean jallow_ingest_behind) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->allow_ingest_behind = jallow_ingest_behind == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    allowIngestBehind
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_allowIngestBehind(JNIEnv*, jobject,
+                                                      jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->allow_ingest_behind);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setTwoWriteQueues
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setTwoWriteQueues(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jboolean jtwo_write_queues) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->two_write_queues = jtwo_write_queues == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    twoWriteQueues
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_twoWriteQueues(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->two_write_queues);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setManualWalFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setManualWalFlush(JNIEnv*, jobject,
+                                                  jlong jhandle,
+                                                  jboolean jmanual_wal_flush) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->manual_wal_flush = jmanual_wal_flush == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    manualWalFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_manualWalFlush(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->manual_wal_flush);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAtomicFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAtomicFlush(JNIEnv*, jobject, jlong jhandle,
+                                               jboolean jatomic_flush) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->atomic_flush = jatomic_flush == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    atomicFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_atomicFlush(JNIEnv*, jobject,
+                                                jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->atomic_flush);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAvoidFlushDuringShutdown
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAvoidFlushDuringShutdown(
+    JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_shutdown) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->avoid_flush_during_shutdown =
+      static_cast<bool>(javoid_flush_during_shutdown);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    avoidFlushDuringShutdown
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringShutdown(JNIEnv*, jobject,
+                                                             jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->avoid_flush_during_shutdown);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAvoidUnnecessaryBlockingIO
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAvoidUnnecessaryBlockingIO(
+    JNIEnv*, jclass, jlong jhandle, jboolean avoid_blocking_io) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->avoid_unnecessary_blocking_io = static_cast<bool>(avoid_blocking_io);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    avoidUnnecessaryBlockingIO
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_avoidUnnecessaryBlockingIO(JNIEnv*, jclass,
+                                                               jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->avoid_unnecessary_blocking_io);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setPersistStatsToDisk
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setPersistStatsToDisk(
+    JNIEnv*, jclass, jlong jhandle, jboolean persist_stats_to_disk) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->persist_stats_to_disk = static_cast<bool>(persist_stats_to_disk);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    persistStatsToDisk
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_persistStatsToDisk(JNIEnv*, jclass,
+                                                       jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->persist_stats_to_disk);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWriteDbidToManifest
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setWriteDbidToManifest(
+    JNIEnv*, jclass, jlong jhandle, jboolean jwrite_dbid_to_manifest) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->write_dbid_to_manifest = static_cast<bool>(jwrite_dbid_to_manifest);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    writeDbidToManifest
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_writeDbidToManifest(JNIEnv*, jclass,
+                                                        jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jboolean>(opt->write_dbid_to_manifest);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setLogReadaheadSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setLogReadaheadSize(JNIEnv*, jclass,
+                                                    jlong jhandle,
+                                                    jlong jlog_readahead_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->log_readahead_size = static_cast<size_t>(jlog_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    logReasaheadSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_logReadaheadSize(JNIEnv*, jclass,
+                                                  jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->log_readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setBestEffortsRecovery
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setBestEffortsRecovery(
+    JNIEnv*, jclass, jlong jhandle, jboolean jbest_efforts_recovery) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->best_efforts_recovery = static_cast<bool>(jbest_efforts_recovery);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    bestEffortsRecovery
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_bestEffortsRecovery(JNIEnv*, jclass,
+                                                        jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->best_efforts_recovery);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxBgErrorResumeCount
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxBgErrorResumeCount(
+    JNIEnv*, jclass, jlong jhandle, jint jmax_bgerror_resume_count) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->max_bgerror_resume_count = static_cast<int>(jmax_bgerror_resume_count);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxBgerrorResumeCount
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxBgerrorResumeCount(JNIEnv*, jclass,
+                                                      jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jint>(opt->max_bgerror_resume_count);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setBgerrorResumeRetryInterval
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setBgerrorResumeRetryInterval(
+    JNIEnv*, jclass, jlong jhandle, jlong jbgerror_resume_retry_interval) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  opt->bgerror_resume_retry_interval =
+      static_cast<uint64_t>(jbgerror_resume_retry_interval);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    bgerrorResumeRetryInterval
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_bgerrorResumeRetryInterval(JNIEnv*, jclass,
+                                                            jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
+  return static_cast<jlong>(opt->bgerror_resume_retry_interval);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::WriteOptions
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    newWriteOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_WriteOptions_newWriteOptions(JNIEnv*, jclass) {
+  auto* op = new ROCKSDB_NAMESPACE::WriteOptions();
+  return GET_CPLUSPLUS_POINTER(op);
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    copyWriteOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_WriteOptions_copyWriteOptions(JNIEnv*, jclass,
+                                                     jlong jhandle) {
+  auto new_opt = new ROCKSDB_NAMESPACE::WriteOptions(
+      *(reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)));
+  return GET_CPLUSPLUS_POINTER(new_opt);
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    disposeInternal
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteOptions_disposeInternal(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle);
+  assert(write_options != nullptr);
+  delete write_options;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setSync(JNIEnv*, jobject, jlong jhandle,
+                                           jboolean jflag) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)->sync = jflag;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    sync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_sync(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)->sync;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setDisableWAL
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setDisableWAL(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jboolean jflag) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)->disableWAL =
+      jflag;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    disableWAL
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_disableWAL(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)
+      ->disableWAL;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setIgnoreMissingColumnFamilies
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setIgnoreMissingColumnFamilies(
+    JNIEnv*, jobject, jlong jhandle, jboolean jignore_missing_column_families) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)
+      ->ignore_missing_column_families =
+      static_cast<bool>(jignore_missing_column_families);
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    ignoreMissingColumnFamilies
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_ignoreMissingColumnFamilies(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)
+      ->ignore_missing_column_families;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setNoSlowdown
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setNoSlowdown(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jboolean jno_slowdown) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)->no_slowdown =
+      static_cast<bool>(jno_slowdown);
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    noSlowdown
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_noSlowdown(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)
+      ->no_slowdown;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setLowPri
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setLowPri(JNIEnv*, jobject, jlong jhandle,
+                                             jboolean jlow_pri) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)->low_pri =
+      static_cast<bool>(jlow_pri);
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    lowPri
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_lowPri(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)->low_pri;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    memtableInsertHintPerBatch
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_memtableInsertHintPerBatch(
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)
+      ->memtable_insert_hint_per_batch;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setMemtableInsertHintPerBatch
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setMemtableInsertHintPerBatch(
+    JNIEnv*, jobject, jlong jhandle, jboolean jmemtable_insert_hint_per_batch) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jhandle)
+      ->memtable_insert_hint_per_batch =
+      static_cast<bool>(jmemtable_insert_hint_per_batch);
+}
+
+/////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::ReadOptions
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    newReadOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_ReadOptions_newReadOptions__(JNIEnv*, jclass) {
+  auto* read_options = new ROCKSDB_NAMESPACE::ReadOptions();
+  return GET_CPLUSPLUS_POINTER(read_options);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    newReadOptions
+ * Signature: (ZZ)J
+ */
+jlong Java_org_rocksdb_ReadOptions_newReadOptions__ZZ(
+    JNIEnv*, jclass, jboolean jverify_checksums, jboolean jfill_cache) {
+  auto* read_options = new ROCKSDB_NAMESPACE::ReadOptions(
+      static_cast<bool>(jverify_checksums), static_cast<bool>(jfill_cache));
+  return GET_CPLUSPLUS_POINTER(read_options);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    copyReadOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_copyReadOptions(JNIEnv*, jclass,
+                                                   jlong jhandle) {
+  auto new_opt = new ROCKSDB_NAMESPACE::ReadOptions(
+      *(reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)));
+  return GET_CPLUSPLUS_POINTER(new_opt);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ReadOptions_disposeInternal(JNIEnv*, jobject,
+                                                  jlong jhandle) {
+  auto* read_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  assert(read_options != nullptr);
+  delete read_options;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setVerifyChecksums
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setVerifyChecksums(
+    JNIEnv*, jobject, jlong jhandle, jboolean jverify_checksums) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->verify_checksums =
+      static_cast<bool>(jverify_checksums);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    verifyChecksums
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_verifyChecksums(JNIEnv*, jobject,
+                                                      jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)
+      ->verify_checksums;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setFillCache
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setFillCache(JNIEnv*, jobject, jlong jhandle,
+                                               jboolean jfill_cache) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->fill_cache =
+      static_cast<bool>(jfill_cache);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    fillCache
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_fillCache(JNIEnv*, jobject,
+                                                jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->fill_cache;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setTailing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setTailing(JNIEnv*, jobject, jlong jhandle,
+                                             jboolean jtailing) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->tailing =
+      static_cast<bool>(jtailing);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    tailing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_tailing(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->tailing;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    managed
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_managed(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->managed;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setManaged
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setManaged(JNIEnv*, jobject, jlong jhandle,
+                                             jboolean jmanaged) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->managed =
+      static_cast<bool>(jmanaged);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    totalOrderSeek
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_totalOrderSeek(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)
+      ->total_order_seek;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setTotalOrderSeek
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setTotalOrderSeek(
+    JNIEnv*, jobject, jlong jhandle, jboolean jtotal_order_seek) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->total_order_seek =
+      static_cast<bool>(jtotal_order_seek);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    prefixSameAsStart
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_prefixSameAsStart(JNIEnv*, jobject,
+                                                        jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)
+      ->prefix_same_as_start;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setPrefixSameAsStart
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setPrefixSameAsStart(
+    JNIEnv*, jobject, jlong jhandle, jboolean jprefix_same_as_start) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)
+      ->prefix_same_as_start = static_cast<bool>(jprefix_same_as_start);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    pinData
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_pinData(JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->pin_data;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setPinData
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setPinData(JNIEnv*, jobject, jlong jhandle,
+                                             jboolean jpin_data) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->pin_data =
+      static_cast<bool>(jpin_data);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    backgroundPurgeOnIteratorCleanup
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_backgroundPurgeOnIteratorCleanup(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  return static_cast<jboolean>(opt->background_purge_on_iterator_cleanup);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setBackgroundPurgeOnIteratorCleanup
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setBackgroundPurgeOnIteratorCleanup(
+    JNIEnv*, jobject, jlong jhandle,
+    jboolean jbackground_purge_on_iterator_cleanup) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->background_purge_on_iterator_cleanup =
+      static_cast<bool>(jbackground_purge_on_iterator_cleanup);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    readaheadSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_readaheadSize(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  return static_cast<jlong>(opt->readahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setReadaheadSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setReadaheadSize(JNIEnv*, jobject,
+                                                   jlong jhandle,
+                                                   jlong jreadahead_size) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->readahead_size = static_cast<size_t>(jreadahead_size);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    maxSkippableInternalKeys
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_maxSkippableInternalKeys(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  return static_cast<jlong>(opt->max_skippable_internal_keys);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setMaxSkippableInternalKeys
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setMaxSkippableInternalKeys(
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_skippable_internal_keys) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->max_skippable_internal_keys =
+      static_cast<uint64_t>(jmax_skippable_internal_keys);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    ignoreRangeDeletions
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_ignoreRangeDeletions(JNIEnv*, jobject,
+                                                           jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  return static_cast<jboolean>(opt->ignore_range_deletions);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setIgnoreRangeDeletions
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setIgnoreRangeDeletions(
+    JNIEnv*, jobject, jlong jhandle, jboolean jignore_range_deletions) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->ignore_range_deletions = static_cast<bool>(jignore_range_deletions);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setSnapshot
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setSnapshot(JNIEnv*, jobject, jlong jhandle,
+                                              jlong jsnapshot) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->snapshot =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Snapshot*>(jsnapshot);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    snapshot
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_snapshot(JNIEnv*, jobject, jlong jhandle) {
+  auto& snapshot =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->snapshot;
+  return GET_CPLUSPLUS_POINTER(snapshot);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    readTier
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ReadOptions_readTier(JNIEnv*, jobject, jlong jhandle) {
+  return static_cast<jbyte>(
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->read_tier);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setReadTier
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ReadOptions_setReadTier(JNIEnv*, jobject, jlong jhandle,
+                                              jbyte jread_tier) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->read_tier =
+      static_cast<ROCKSDB_NAMESPACE::ReadTier>(jread_tier);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setIterateUpperBound
+ * Signature: (JJ)I
+ */
+void Java_org_rocksdb_ReadOptions_setIterateUpperBound(
+    JNIEnv*, jobject, jlong jhandle, jlong jupper_bound_slice_handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)
+      ->iterate_upper_bound =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jupper_bound_slice_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    iterateUpperBound
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_iterateUpperBound(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  auto& upper_bound_slice_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)
+          ->iterate_upper_bound;
+  return GET_CPLUSPLUS_POINTER(upper_bound_slice_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setIterateLowerBound
+ * Signature: (JJ)I
+ */
+void Java_org_rocksdb_ReadOptions_setIterateLowerBound(
+    JNIEnv*, jobject, jlong jhandle, jlong jlower_bound_slice_handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)
+      ->iterate_lower_bound =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jlower_bound_slice_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    iterateLowerBound
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_iterateLowerBound(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  auto& lower_bound_slice_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)
+          ->iterate_lower_bound;
+  return GET_CPLUSPLUS_POINTER(lower_bound_slice_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setTableFilter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setTableFilter(
+    JNIEnv*, jobject, jlong jhandle, jlong jjni_table_filter_handle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  auto* jni_table_filter =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TableFilterJniCallback*>(
+          jjni_table_filter_handle);
+  opt->table_filter = jni_table_filter->GetTableFilterFunction();
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    autoPrefixMode
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_autoPrefixMode(JNIEnv*, jobject,
+                                                     jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  return static_cast<jboolean>(opt->auto_prefix_mode);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setAutoPrefixMode
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setAutoPrefixMode(
+    JNIEnv*, jobject, jlong jhandle, jboolean jauto_prefix_mode) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->auto_prefix_mode = static_cast<bool>(jauto_prefix_mode);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    timestamp
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_timestamp(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  auto& timestamp_slice_handle = opt->timestamp;
+  return reinterpret_cast<jlong>(timestamp_slice_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setTimestamp
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setTimestamp(JNIEnv*, jobject, jlong jhandle,
+                                               jlong jtimestamp_slice_handle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->timestamp =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jtimestamp_slice_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    iterStartTs
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_iterStartTs(JNIEnv*, jobject,
+                                               jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  auto& iter_start_ts_handle = opt->iter_start_ts;
+  return reinterpret_cast<jlong>(iter_start_ts_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setIterStartTs
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setIterStartTs(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jlong jiter_start_ts_handle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->iter_start_ts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jiter_start_ts_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    deadline
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_deadline(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  return static_cast<jlong>(opt->deadline.count());
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setDeadline
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setDeadline(JNIEnv*, jobject, jlong jhandle,
+                                              jlong jdeadline) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->deadline = std::chrono::microseconds(static_cast<int64_t>(jdeadline));
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    ioTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_ioTimeout(JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  return static_cast<jlong>(opt->io_timeout.count());
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setIoTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setIoTimeout(JNIEnv*, jobject, jlong jhandle,
+                                               jlong jio_timeout) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->io_timeout =
+      std::chrono::microseconds(static_cast<int64_t>(jio_timeout));
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    valueSizeSofLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_valueSizeSoftLimit(JNIEnv*, jobject,
+                                                      jlong jhandle) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  return static_cast<jlong>(opt->value_size_soft_limit);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setValueSizeSofLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setValueSizeSoftLimit(
+    JNIEnv*, jobject, jlong jhandle, jlong jvalue_size_soft_limit) {
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle);
+  opt->value_size_soft_limit = static_cast<uint64_t>(jvalue_size_soft_limit);
+}
+
+/////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::ComparatorOptions
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    newComparatorOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_ComparatorOptions_newComparatorOptions(JNIEnv*, jclass) {
+  auto* comparator_opt = new ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions();
+  return GET_CPLUSPLUS_POINTER(comparator_opt);
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    reusedSynchronisationType
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ComparatorOptions_reusedSynchronisationType(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* comparator_opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*>(
+          jhandle);
+  return ROCKSDB_NAMESPACE::ReusedSynchronisationTypeJni::
+      toJavaReusedSynchronisationType(
+          comparator_opt->reused_synchronisation_type);
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    setReusedSynchronisationType
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ComparatorOptions_setReusedSynchronisationType(
+    JNIEnv*, jobject, jlong jhandle, jbyte jreused_synhcronisation_type) {
+  auto* comparator_opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*>(
+          jhandle);
+  comparator_opt->reused_synchronisation_type =
+      ROCKSDB_NAMESPACE::ReusedSynchronisationTypeJni::
+          toCppReusedSynchronisationType(jreused_synhcronisation_type);
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    useDirectBuffer
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ComparatorOptions_useDirectBuffer(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  return static_cast<jboolean>(
+      reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*>(
+          jhandle)
+          ->direct_buffer);
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    setUseDirectBuffer
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ComparatorOptions_setUseDirectBuffer(
+    JNIEnv*, jobject, jlong jhandle, jboolean jdirect_buffer) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*>(jhandle)
+      ->direct_buffer = jdirect_buffer == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    maxReusedBufferSize
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ComparatorOptions_maxReusedBufferSize(JNIEnv*, jobject,
+                                                            jlong jhandle) {
+  return static_cast<jint>(
+      reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*>(
+          jhandle)
+          ->max_reused_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    setMaxReusedBufferSize
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ComparatorOptions_setMaxReusedBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jint jmax_reused_buffer_size) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*>(jhandle)
+      ->max_reused_buffer_size = static_cast<int32_t>(jmax_reused_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ComparatorOptions_disposeInternal(JNIEnv*, jobject,
+                                                        jlong jhandle) {
+  auto* comparator_opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*>(
+          jhandle);
+  assert(comparator_opt != nullptr);
+  delete comparator_opt;
+}
+
+/////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::FlushOptions
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    newFlushOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_FlushOptions_newFlushOptions(JNIEnv*, jclass) {
+  auto* flush_opt = new ROCKSDB_NAMESPACE::FlushOptions();
+  return GET_CPLUSPLUS_POINTER(flush_opt);
+}
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    setWaitForFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_FlushOptions_setWaitForFlush(JNIEnv*, jobject,
+                                                   jlong jhandle,
+                                                   jboolean jwait) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::FlushOptions*>(jhandle)->wait =
+      static_cast<bool>(jwait);
+}
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    waitForFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_FlushOptions_waitForFlush(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::FlushOptions*>(jhandle)->wait;
+}
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    setAllowWriteStall
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_FlushOptions_setAllowWriteStall(
+    JNIEnv*, jobject, jlong jhandle, jboolean jallow_write_stall) {
+  auto* flush_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::FlushOptions*>(jhandle);
+  flush_options->allow_write_stall = jallow_write_stall == JNI_TRUE;
+}
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    allowWriteStall
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_FlushOptions_allowWriteStall(JNIEnv*, jobject,
+                                                       jlong jhandle) {
+  auto* flush_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::FlushOptions*>(jhandle);
+  return static_cast<jboolean>(flush_options->allow_write_stall);
+}
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_FlushOptions_disposeInternal(JNIEnv*, jobject,
+                                                   jlong jhandle) {
+  auto* flush_opt = reinterpret_cast<ROCKSDB_NAMESPACE::FlushOptions*>(jhandle);
+  assert(flush_opt != nullptr);
+  delete flush_opt;
+}
diff --git a/src/rocksdb/java/rocksjni/options_util.cc b/src/rocksdb/java/rocksjni/options_util.cc
new file mode 100644
index 000000000..1a5fb9bb5
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/options_util.cc
@@ -0,0 +1,195 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ ROCKSDB_NAMESPACE::OptionsUtil methods from Java side.
+
+#include "rocksdb/utilities/options_util.h"
+
+#include <jni.h>
+
+#include <string>
+
+#include "include/org_rocksdb_OptionsUtil.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksjni/portal.h"
+
+void build_column_family_descriptor_list(
+    JNIEnv* env, jobject jcfds,
+    std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>& cf_descs) {
+  jmethodID add_mid = ROCKSDB_NAMESPACE::ListJni::getListAddMethodId(env);
+  if (add_mid == nullptr) {
+    // exception occurred accessing method
+    return;
+  }
+
+  // Column family descriptor
+  for (ROCKSDB_NAMESPACE::ColumnFamilyDescriptor& cfd : cf_descs) {
+    // Construct a ColumnFamilyDescriptor java object
+    jobject jcfd =
+        ROCKSDB_NAMESPACE::ColumnFamilyDescriptorJni::construct(env, &cfd);
+    if (env->ExceptionCheck()) {
+      // exception occurred constructing object
+      if (jcfd != nullptr) {
+        env->DeleteLocalRef(jcfd);
+      }
+      return;
+    }
+
+    // Add the object to java list.
+    jboolean rs = env->CallBooleanMethod(jcfds, add_mid, jcfd);
+    if (env->ExceptionCheck() || rs == JNI_FALSE) {
+      // exception occurred calling method, or could not add
+      if (jcfd != nullptr) {
+        env->DeleteLocalRef(jcfd);
+      }
+      return;
+    }
+  }
+}
+
+/*
+ * Class:     org_rocksdb_OptionsUtil
+ * Method:    loadLatestOptions
+ * Signature: (Ljava/lang/String;JLjava/util/List;Z)V
+ */
+void Java_org_rocksdb_OptionsUtil_loadLatestOptions__Ljava_lang_String_2JJLjava_util_List_2Z(
+    JNIEnv* env, jclass /*jcls*/, jstring jdbpath, jlong jenv_handle,
+    jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) {
+  jboolean has_exception = JNI_FALSE;
+  auto db_path =
+      ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jdbpath, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> cf_descs;
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadLatestOptions(
+      db_path, reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jenv_handle),
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdb_opts_handle),
+      &cf_descs, ignore_unknown_options);
+  if (!s.ok()) {
+    // error, raise an exception
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  } else {
+    build_column_family_descriptor_list(env, jcfds, cf_descs);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_OptionsUtil
+ * Method:    loadLatestOptions_1
+ * Signature: (JLjava/lang/String;JLjava/util/List;)V
+ */
+void Java_org_rocksdb_OptionsUtil_loadLatestOptions__JLjava_lang_String_2JLjava_util_List_2(
+    JNIEnv* env, jclass /*jcls*/, jlong cfg_handle, jstring jdbpath,
+    jlong jdb_opts_handle, jobject jcfds) {
+  jboolean has_exception = JNI_FALSE;
+  auto db_path =
+      ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jdbpath, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> cf_descs;
+  auto* config_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(cfg_handle);
+  auto* db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdb_opts_handle);
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadLatestOptions(
+      *config_options, db_path, db_options, &cf_descs);
+  if (!s.ok()) {
+    // error, raise an exception
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  } else {
+    build_column_family_descriptor_list(env, jcfds, cf_descs);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_OptionsUtil
+ * Method:    loadOptionsFromFile
+ * Signature: (Ljava/lang/String;JJLjava/util/List;Z)V
+ */
+void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile__Ljava_lang_String_2JJLjava_util_List_2Z(
+    JNIEnv* env, jclass /*jcls*/, jstring jopts_file_name, jlong jenv_handle,
+    jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) {
+  jboolean has_exception = JNI_FALSE;
+  auto opts_file_name = ROCKSDB_NAMESPACE::JniUtil::copyStdString(
+      env, jopts_file_name, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> cf_descs;
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadOptionsFromFile(
+      opts_file_name, reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jenv_handle),
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdb_opts_handle),
+      &cf_descs, ignore_unknown_options);
+  if (!s.ok()) {
+    // error, raise an exception
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  } else {
+    build_column_family_descriptor_list(env, jcfds, cf_descs);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_OptionsUtil
+ * Method:    loadOptionsFromFile
+ * Signature: (JLjava/lang/String;JLjava/util/List;)V
+ */
+void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile__JLjava_lang_String_2JLjava_util_List_2(
+    JNIEnv* env, jclass /*jcls*/, jlong cfg_handle, jstring jopts_file_name,
+    jlong jdb_opts_handle, jobject jcfds) {
+  jboolean has_exception = JNI_FALSE;
+  auto opts_file_name = ROCKSDB_NAMESPACE::JniUtil::copyStdString(
+      env, jopts_file_name, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> cf_descs;
+  auto* config_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(cfg_handle);
+  auto* db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdb_opts_handle);
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadOptionsFromFile(
+      *config_options, opts_file_name, db_options, &cf_descs);
+  if (!s.ok()) {
+    // error, raise an exception
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  } else {
+    build_column_family_descriptor_list(env, jcfds, cf_descs);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_OptionsUtil
+ * Method:    getLatestOptionsFileName
+ * Signature: (Ljava/lang/String;J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_OptionsUtil_getLatestOptionsFileName(
+    JNIEnv* env, jclass /*jcls*/, jstring jdbpath, jlong jenv_handle) {
+  jboolean has_exception = JNI_FALSE;
+  auto db_path =
+      ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jdbpath, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return nullptr;
+  }
+  std::string options_file_name;
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::GetLatestOptionsFileName(
+      db_path, reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jenv_handle),
+      &options_file_name);
+  if (!s.ok()) {
+    // error, raise an exception
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  } else {
+    return env->NewStringUTF(options_file_name.c_str());
+  }
+}
diff --git a/src/rocksdb/java/rocksjni/persistent_cache.cc b/src/rocksdb/java/rocksjni/persistent_cache.cc
new file mode 100644
index 000000000..295d91798
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/persistent_cache.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::PersistentCache.
+
+#include "rocksdb/persistent_cache.h"
+
+#include <jni.h>
+
+#include <string>
+
+#include "include/org_rocksdb_PersistentCache.h"
+#include "loggerjnicallback.h"
+#include "portal.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_PersistentCache
+ * Method:    newPersistentCache
+ * Signature: (JLjava/lang/String;JJZ)J
+ */
+jlong Java_org_rocksdb_PersistentCache_newPersistentCache(
+    JNIEnv* env, jclass, jlong jenv_handle, jstring jpath, jlong jsz,
+    jlong jlogger_handle, jboolean joptimized_for_nvm) {
+  auto* rocks_env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jenv_handle);
+  jboolean has_exception = JNI_FALSE;
+  std::string path =
+      ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jpath, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    return 0;
+  }
+  auto* logger =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>*>(
+          jlogger_handle);
+  auto* cache =
+      new std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache>(nullptr);
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::NewPersistentCache(
+      rocks_env, path, static_cast<uint64_t>(jsz), *logger,
+      static_cast<bool>(joptimized_for_nvm), cache);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+  return GET_CPLUSPLUS_POINTER(cache);
+}
+
+/*
+ * Class:     org_rocksdb_PersistentCache
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_PersistentCache_disposeInternal(JNIEnv*, jobject,
+                                                      jlong jhandle) {
+  auto* cache =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache>*>(
+          jhandle);
+  delete cache;  // delete std::shared_ptr
+}
diff --git a/src/rocksdb/java/rocksjni/portal.h b/src/rocksdb/java/rocksjni/portal.h
new file mode 100644
index 000000000..340199507
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/portal.h
@@ -0,0 +1,8745 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// This file is designed for caching those frequently used IDs and provide
+// efficient portal (i.e, a set of static functions) to access java code
+// from c++.
+
+#ifndef JAVA_ROCKSJNI_PORTAL_H_
+#define JAVA_ROCKSJNI_PORTAL_H_
+
+#include <jni.h>
+
+#include <algorithm>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksjni/compaction_filter_factory_jnicallback.h"
+#include "rocksjni/comparatorjnicallback.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/event_listener_jnicallback.h"
+#include "rocksjni/loggerjnicallback.h"
+#include "rocksjni/table_filter_jnicallback.h"
+#include "rocksjni/trace_writer_jnicallback.h"
+#include "rocksjni/transaction_notifier_jnicallback.h"
+#include "rocksjni/wal_filter_jnicallback.h"
+#include "rocksjni/writebatchhandlerjnicallback.h"
+
+// Remove macro on windows
+#ifdef DELETE
+#undef DELETE
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class JavaClass {
+ public:
+  /**
+   * Gets and initializes a Java Class
+   *
+   * @param env A pointer to the Java environment
+   * @param jclazz_name The fully qualified JNI name of the Java Class
+   *     e.g. "java/lang/String"
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env, const char* jclazz_name) {
+    jclass jclazz = env->FindClass(jclazz_name);
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+};
+
+// Native class template
+template <class PTR, class DERIVED>
+class RocksDBNativeClass : public JavaClass {};
+
+// Native class template for sub-classes of RocksMutableObject
+template <class PTR, class DERIVED>
+class NativeRocksMutableObject : public RocksDBNativeClass<PTR, DERIVED> {
+ public:
+  /**
+   * Gets the Java Method ID for the
+   * RocksMutableObject#setNativeHandle(long, boolean) method
+   *
+   * @param env A pointer to the Java environment
+   * @return The Java Method ID or nullptr the RocksMutableObject class cannot
+   *     be accessed, or if one of the NoSuchMethodError,
+   *     ExceptionInInitializerError or OutOfMemoryError exceptions is thrown
+   */
+  static jmethodID getSetNativeHandleMethod(JNIEnv* env) {
+    static jclass jclazz = DERIVED::getJClass(env);
+    if (jclazz == nullptr) {
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "setNativeHandle", "(JZ)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Sets the C++ object pointer handle in the Java object
+   *
+   * @param env A pointer to the Java environment
+   * @param jobj The Java object on which to set the pointer handle
+   * @param ptr The C++ object pointer
+   * @param java_owns_handle JNI_TRUE if ownership of the C++ object is
+   *     managed by the Java object
+   *
+   * @return true if a Java exception is pending, false otherwise
+   */
+  static bool setHandle(JNIEnv* env, jobject jobj, PTR ptr,
+                        jboolean java_owns_handle) {
+    assert(jobj != nullptr);
+    static jmethodID mid = getSetNativeHandleMethod(env);
+    if (mid == nullptr) {
+      return true;  // signal exception
+    }
+
+    env->CallVoidMethod(jobj, mid, GET_CPLUSPLUS_POINTER(ptr),
+                        java_owns_handle);
+    if (env->ExceptionCheck()) {
+      return true;  // signal exception
+    }
+
+    return false;
+  }
+};
+
+// Java Exception template
+template <class DERIVED>
+class JavaException : public JavaClass {
+ public:
+  /**
+   * Create and throw a java exception with the provided message
+   *
+   * @param env A pointer to the Java environment
+   * @param msg The message for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const std::string& msg) {
+    jclass jclazz = DERIVED::getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      std::cerr << "JavaException::ThrowNew - Error: unexpected exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    const jint rs = env->ThrowNew(jclazz, msg.c_str());
+    if (rs != JNI_OK) {
+      // exception could not be thrown
+      std::cerr << "JavaException::ThrowNew - Fatal: could not throw exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    return true;
+  }
+};
+
+// The portal class for java.lang.IllegalArgumentException
+class IllegalArgumentExceptionJni
+    : public JavaException<IllegalArgumentExceptionJni> {
+ public:
+  /**
+   * Get the Java Class java.lang.IllegalArgumentException
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaException::getJClass(env, "java/lang/IllegalArgumentException");
+  }
+
+  /**
+   * Create and throw a Java IllegalArgumentException with the provided status
+   *
+   * If s.ok() == true, then this function will not throw any exception.
+   *
+   * @param env A pointer to the Java environment
+   * @param s The status for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const Status& s) {
+    assert(!s.ok());
+    if (s.ok()) {
+      return false;
+    }
+
+    // get the IllegalArgumentException class
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      std::cerr << "IllegalArgumentExceptionJni::ThrowNew/class - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    return JavaException::ThrowNew(env, s.ToString());
+  }
+};
+
+// The portal class for org.rocksdb.Status.Code
+class CodeJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Status.Code
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/Status$Code");
+  }
+
+  /**
+   * Get the Java Method: Status.Code#getValue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getValueMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "getValue", "()b");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.Status.SubCode
+class SubCodeJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Status.SubCode
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/Status$SubCode");
+  }
+
+  /**
+   * Get the Java Method: Status.SubCode#getValue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getValueMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "getValue", "()b");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  static ROCKSDB_NAMESPACE::Status::SubCode toCppSubCode(
+      const jbyte jsub_code) {
+    switch (jsub_code) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kNone;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kMutexTimeout;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kLockTimeout;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kLockLimit;
+      case 0x4:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kNoSpace;
+      case 0x5:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kDeadlock;
+      case 0x6:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kStaleFile;
+      case 0x7:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kMemoryLimit;
+
+      case 0x7F:
+      default:
+        return ROCKSDB_NAMESPACE::Status::SubCode::kNone;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.Status
+class StatusJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::Status*, StatusJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Status
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Status");
+  }
+
+  /**
+   * Get the Java Method: Status#getCode
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getCodeMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getCode", "()Lorg/rocksdb/Status$Code;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Status#getSubCode
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getSubCodeMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "getSubCode",
+                                            "()Lorg/rocksdb/Status$SubCode;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Status#getState
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getStateMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getState", "()Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Create a new Java org.rocksdb.Status object with the same properties as
+   * the provided C++ ROCKSDB_NAMESPACE::Status object
+   *
+   * @param env A pointer to the Java environment
+   * @param status The ROCKSDB_NAMESPACE::Status object
+   *
+   * @return A reference to a Java org.rocksdb.Status object, or nullptr
+   *     if an an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const Status& status) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(BBLjava/lang/String;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    // convert the Status state for Java
+    jstring jstate = nullptr;
+    if (status.getState() != nullptr) {
+      const char* const state = status.getState();
+      jstate = env->NewStringUTF(state);
+      if (env->ExceptionCheck()) {
+        if (jstate != nullptr) {
+          env->DeleteLocalRef(jstate);
+        }
+        return nullptr;
+      }
+    }
+
+    jobject jstatus =
+        env->NewObject(jclazz, mid, toJavaStatusCode(status.code()),
+                       toJavaStatusSubCode(status.subcode()), jstate);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      if (jstate != nullptr) {
+        env->DeleteLocalRef(jstate);
+      }
+      return nullptr;
+    }
+
+    if (jstate != nullptr) {
+      env->DeleteLocalRef(jstate);
+    }
+
+    return jstatus;
+  }
+
+  static jobject construct(JNIEnv* env, const Status* status) {
+    return construct(env, *status);
+  }
+
+  // Returns the equivalent org.rocksdb.Status.Code for the provided
+  // C++ ROCKSDB_NAMESPACE::Status::Code enum
+  static jbyte toJavaStatusCode(const ROCKSDB_NAMESPACE::Status::Code& code) {
+    switch (code) {
+      case ROCKSDB_NAMESPACE::Status::Code::kOk:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::Status::Code::kNotFound:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::Status::Code::kCorruption:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::Status::Code::kNotSupported:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::Status::Code::kInvalidArgument:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::Status::Code::kIOError:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::Status::Code::kMergeInProgress:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::Status::Code::kIncomplete:
+        return 0x7;
+      case ROCKSDB_NAMESPACE::Status::Code::kShutdownInProgress:
+        return 0x8;
+      case ROCKSDB_NAMESPACE::Status::Code::kTimedOut:
+        return 0x9;
+      case ROCKSDB_NAMESPACE::Status::Code::kAborted:
+        return 0xA;
+      case ROCKSDB_NAMESPACE::Status::Code::kBusy:
+        return 0xB;
+      case ROCKSDB_NAMESPACE::Status::Code::kExpired:
+        return 0xC;
+      case ROCKSDB_NAMESPACE::Status::Code::kTryAgain:
+        return 0xD;
+      case ROCKSDB_NAMESPACE::Status::Code::kColumnFamilyDropped:
+        return 0xE;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent org.rocksdb.Status.SubCode for the provided
+  // C++ ROCKSDB_NAMESPACE::Status::SubCode enum
+  static jbyte toJavaStatusSubCode(
+      const ROCKSDB_NAMESPACE::Status::SubCode& subCode) {
+    switch (subCode) {
+      case ROCKSDB_NAMESPACE::Status::SubCode::kNone:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kMutexTimeout:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kLockTimeout:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kLockLimit:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kNoSpace:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kDeadlock:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kStaleFile:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::Status::SubCode::kMemoryLimit:
+        return 0x7;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  static std::unique_ptr<ROCKSDB_NAMESPACE::Status> toCppStatus(
+      const jbyte jcode_value, const jbyte jsub_code_value) {
+    std::unique_ptr<ROCKSDB_NAMESPACE::Status> status;
+    switch (jcode_value) {
+      case 0x0:
+        // Ok
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::OK()));
+        break;
+      case 0x1:
+        // NotFound
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::NotFound(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0x2:
+        // Corruption
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Corruption(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0x3:
+        // NotSupported
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::NotSupported(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x4:
+        // InvalidArgument
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x5:
+        // IOError
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::IOError(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0x6:
+        // MergeInProgress
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::MergeInProgress(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x7:
+        // Incomplete
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Incomplete(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0x8:
+        // ShutdownInProgress
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::ShutdownInProgress(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x9:
+        // TimedOut
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::TimedOut(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xA:
+        // Aborted
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Aborted(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xB:
+        // Busy
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Busy(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xC:
+        // Expired
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Expired(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xD:
+        // TryAgain
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::TryAgain(
+                ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value))));
+        break;
+      case 0xE:
+        // ColumnFamilyDropped
+        status = std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+            new ROCKSDB_NAMESPACE::Status(
+                ROCKSDB_NAMESPACE::Status::ColumnFamilyDropped(
+                    ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(
+                        jsub_code_value))));
+        break;
+      case 0x7F:
+      default:
+        return nullptr;
+    }
+    return status;
+  }
+
+  // Returns the equivalent ROCKSDB_NAMESPACE::Status for the Java
+  // org.rocksdb.Status
+  static std::unique_ptr<ROCKSDB_NAMESPACE::Status> toCppStatus(
+      JNIEnv* env, const jobject jstatus) {
+    jmethodID mid_code = getCodeMethod(env);
+    if (mid_code == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    jobject jcode = env->CallObjectMethod(jstatus, mid_code);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+
+    jmethodID mid_code_value = ROCKSDB_NAMESPACE::CodeJni::getValueMethod(env);
+    if (mid_code_value == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    jbyte jcode_value = env->CallByteMethod(jcode, mid_code_value);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      if (jcode != nullptr) {
+        env->DeleteLocalRef(jcode);
+      }
+      return nullptr;
+    }
+
+    jmethodID mid_subCode = getSubCodeMethod(env);
+    if (mid_subCode == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    jobject jsubCode = env->CallObjectMethod(jstatus, mid_subCode);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      if (jcode != nullptr) {
+        env->DeleteLocalRef(jcode);
+      }
+      return nullptr;
+    }
+
+    jbyte jsub_code_value = 0x0;  // None
+    if (jsubCode != nullptr) {
+      jmethodID mid_subCode_value =
+          ROCKSDB_NAMESPACE::SubCodeJni::getValueMethod(env);
+      if (mid_subCode_value == nullptr) {
+        // exception occurred
+        return nullptr;
+      }
+      jsub_code_value = env->CallByteMethod(jsubCode, mid_subCode_value);
+      if (env->ExceptionCheck()) {
+        // exception occurred
+        if (jcode != nullptr) {
+          env->DeleteLocalRef(jcode);
+        }
+        return nullptr;
+      }
+    }
+
+    jmethodID mid_state = getStateMethod(env);
+    if (mid_state == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    jobject jstate = env->CallObjectMethod(jstatus, mid_state);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      if (jsubCode != nullptr) {
+        env->DeleteLocalRef(jsubCode);
+      }
+      if (jcode != nullptr) {
+        env->DeleteLocalRef(jcode);
+      }
+      return nullptr;
+    }
+
+    std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+        toCppStatus(jcode_value, jsub_code_value);
+
+    // delete all local refs
+    if (jstate != nullptr) {
+      env->DeleteLocalRef(jstate);
+    }
+    if (jsubCode != nullptr) {
+      env->DeleteLocalRef(jsubCode);
+    }
+    if (jcode != nullptr) {
+      env->DeleteLocalRef(jcode);
+    }
+
+    return status;
+  }
+};
+
+// The portal class for org.rocksdb.RocksDBException
+class RocksDBExceptionJni : public JavaException<RocksDBExceptionJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.RocksDBException
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaException::getJClass(env, "org/rocksdb/RocksDBException");
+  }
+
+  /**
+   * Create and throw a Java RocksDBException with the provided message
+   *
+   * @param env A pointer to the Java environment
+   * @param msg The message for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const std::string& msg) {
+    return JavaException::ThrowNew(env, msg);
+  }
+
+  /**
+   * Create and throw a Java RocksDBException with the provided status
+   *
+   * If s->ok() == true, then this function will not throw any exception.
+   *
+   * @param env A pointer to the Java environment
+   * @param s The status for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, std::unique_ptr<Status>& s) {
+    return ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, *(s.get()));
+  }
+
+  /**
+   * Create and throw a Java RocksDBException with the provided status
+   *
+   * If s.ok() == true, then this function will not throw any exception.
+   *
+   * @param env A pointer to the Java environment
+   * @param s The status for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const Status& s) {
+    if (s.ok()) {
+      return false;
+    }
+
+    // get the RocksDBException class
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      std::cerr << "RocksDBExceptionJni::ThrowNew/class - Error: unexpected "
+                   "exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // get the constructor of org.rocksdb.RocksDBException
+    jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(Lorg/rocksdb/Status;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew/cstr - Error: unexpected exception!"
+          << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // get the Java status object
+    jobject jstatus = StatusJni::construct(env, s);
+    if (jstatus == nullptr) {
+      // exception occcurred
+      std::cerr << "RocksDBExceptionJni::ThrowNew/StatusJni - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // construct the RocksDBException
+    jthrowable rocksdb_exception =
+        reinterpret_cast<jthrowable>(env->NewObject(jclazz, mid, jstatus));
+    if (env->ExceptionCheck()) {
+      if (jstatus != nullptr) {
+        env->DeleteLocalRef(jstatus);
+      }
+      if (rocksdb_exception != nullptr) {
+        env->DeleteLocalRef(rocksdb_exception);
+      }
+      std::cerr << "RocksDBExceptionJni::ThrowNew/NewObject - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      return true;
+    }
+
+    // throw the RocksDBException
+    const jint rs = env->Throw(rocksdb_exception);
+    if (rs != JNI_OK) {
+      // exception could not be thrown
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew - Fatal: could not throw exception!"
+          << std::endl;
+      if (jstatus != nullptr) {
+        env->DeleteLocalRef(jstatus);
+      }
+      if (rocksdb_exception != nullptr) {
+        env->DeleteLocalRef(rocksdb_exception);
+      }
+      return env->ExceptionCheck();
+    }
+
+    if (jstatus != nullptr) {
+      env->DeleteLocalRef(jstatus);
+    }
+    if (rocksdb_exception != nullptr) {
+      env->DeleteLocalRef(rocksdb_exception);
+    }
+
+    return true;
+  }
+
+  /**
+   * Create and throw a Java RocksDBException with the provided message
+   * and status
+   *
+   * If s.ok() == true, then this function will not throw any exception.
+   *
+   * @param env A pointer to the Java environment
+   * @param msg The message for the exception
+   * @param s The status for the exception
+   *
+   * @return true if an exception was thrown, false otherwise
+   */
+  static bool ThrowNew(JNIEnv* env, const std::string& msg, const Status& s) {
+    assert(!s.ok());
+    if (s.ok()) {
+      return false;
+    }
+
+    // get the RocksDBException class
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      std::cerr << "RocksDBExceptionJni::ThrowNew/class - Error: unexpected "
+                   "exception!"
+                << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // get the constructor of org.rocksdb.RocksDBException
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "(Ljava/lang/String;Lorg/rocksdb/Status;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew/cstr - Error: unexpected exception!"
+          << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    jstring jmsg = env->NewStringUTF(msg.c_str());
+    if (jmsg == nullptr) {
+      // exception thrown: OutOfMemoryError
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew/msg - Error: unexpected exception!"
+          << std::endl;
+      return env->ExceptionCheck();
+    }
+
+    // get the Java status object
+    jobject jstatus = StatusJni::construct(env, s);
+    if (jstatus == nullptr) {
+      // exception occcurred
+      std::cerr << "RocksDBExceptionJni::ThrowNew/StatusJni - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      if (jmsg != nullptr) {
+        env->DeleteLocalRef(jmsg);
+      }
+      return env->ExceptionCheck();
+    }
+
+    // construct the RocksDBException
+    jthrowable rocksdb_exception = reinterpret_cast<jthrowable>(
+        env->NewObject(jclazz, mid, jmsg, jstatus));
+    if (env->ExceptionCheck()) {
+      if (jstatus != nullptr) {
+        env->DeleteLocalRef(jstatus);
+      }
+      if (jmsg != nullptr) {
+        env->DeleteLocalRef(jmsg);
+      }
+      if (rocksdb_exception != nullptr) {
+        env->DeleteLocalRef(rocksdb_exception);
+      }
+      std::cerr << "RocksDBExceptionJni::ThrowNew/NewObject - Error: "
+                   "unexpected exception!"
+                << std::endl;
+      return true;
+    }
+
+    // throw the RocksDBException
+    const jint rs = env->Throw(rocksdb_exception);
+    if (rs != JNI_OK) {
+      // exception could not be thrown
+      std::cerr
+          << "RocksDBExceptionJni::ThrowNew - Fatal: could not throw exception!"
+          << std::endl;
+      if (jstatus != nullptr) {
+        env->DeleteLocalRef(jstatus);
+      }
+      if (jmsg != nullptr) {
+        env->DeleteLocalRef(jmsg);
+      }
+      if (rocksdb_exception != nullptr) {
+        env->DeleteLocalRef(rocksdb_exception);
+      }
+      return env->ExceptionCheck();
+    }
+
+    if (jstatus != nullptr) {
+      env->DeleteLocalRef(jstatus);
+    }
+    if (jmsg != nullptr) {
+      env->DeleteLocalRef(jmsg);
+    }
+    if (rocksdb_exception != nullptr) {
+      env->DeleteLocalRef(rocksdb_exception);
+    }
+
+    return true;
+  }
+
+  /**
+   * Get the Java Method: RocksDBException#getStatus
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getStatusMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getStatus", "()Lorg/rocksdb/Status;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  static std::unique_ptr<ROCKSDB_NAMESPACE::Status> toCppStatus(
+      JNIEnv* env, jthrowable jrocksdb_exception) {
+    if (!env->IsInstanceOf(jrocksdb_exception, getJClass(env))) {
+      // not an instance of RocksDBException
+      return nullptr;
+    }
+
+    // get the java status object
+    jmethodID mid = getStatusMethod(env);
+    if (mid == nullptr) {
+      // exception occurred accessing class or method
+      return nullptr;
+    }
+
+    jobject jstatus = env->CallObjectMethod(jrocksdb_exception, mid);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+
+    if (jstatus == nullptr) {
+      return nullptr;  // no status available
+    }
+
+    return ROCKSDB_NAMESPACE::StatusJni::toCppStatus(env, jstatus);
+  }
+};
+
+// The portal class for java.util.List
+class ListJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.util.List
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getListClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/util/List");
+  }
+
+  /**
+   * Get the Java Class java.util.ArrayList
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getArrayListClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/util/ArrayList");
+  }
+
+  /**
+   * Get the Java Class java.util.Iterator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getIteratorClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/util/Iterator");
+  }
+
+  /**
+   * Get the Java Method: List#iterator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getIteratorMethod(JNIEnv* env) {
+    jclass jlist_clazz = getListClass(env);
+    if (jlist_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jlist_clazz, "iterator", "()Ljava/util/Iterator;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Iterator#hasNext
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getHasNextMethod(JNIEnv* env) {
+    jclass jiterator_clazz = getIteratorClass(env);
+    if (jiterator_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jiterator_clazz, "hasNext", "()Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Iterator#next
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getNextMethod(JNIEnv* env) {
+    jclass jiterator_clazz = getIteratorClass(env);
+    if (jiterator_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jiterator_clazz, "next", "()Ljava/lang/Object;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: ArrayList constructor
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getArrayListConstructorMethodId(JNIEnv* env) {
+    jclass jarray_list_clazz = getArrayListClass(env);
+    if (jarray_list_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+    static jmethodID mid =
+        env->GetMethodID(jarray_list_clazz, "<init>", "(I)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: List#add
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getListAddMethodId(JNIEnv* env) {
+    jclass jlist_clazz = getListClass(env);
+    if (jlist_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jlist_clazz, "add", "(Ljava/lang/Object;)Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for java.lang.Byte
+class ByteJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.lang.Byte
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/lang/Byte");
+  }
+
+  /**
+   * Get the Java Class byte[]
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getArrayJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "[B");
+  }
+
+  /**
+   * Creates a new 2-dimensional Java Byte Array byte[][]
+   *
+   * @param env A pointer to the Java environment
+   * @param len The size of the first dimension
+   *
+   * @return A reference to the Java byte[][] or nullptr if an exception occurs
+   */
+  static jobjectArray new2dByteArray(JNIEnv* env, const jsize len) {
+    jclass clazz = getArrayJClass(env);
+    if (clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    return env->NewObjectArray(len, clazz, nullptr);
+  }
+
+  /**
+   * Get the Java Method: Byte#byteValue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getByteValueMethod(JNIEnv* env) {
+    jclass clazz = getJClass(env);
+    if (clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(clazz, "byteValue", "()B");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Calls the Java Method: Byte#valueOf, returning a constructed Byte jobject
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A constructing Byte object or nullptr if the class or method id
+   * could not be retrieved, or an exception occurred
+   */
+  static jobject valueOf(JNIEnv* env, jbyte jprimitive_byte) {
+    jclass clazz = getJClass(env);
+    if (clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetStaticMethodID(clazz, "valueOf", "(B)Ljava/lang/Byte;");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    const jobject jbyte_obj =
+        env->CallStaticObjectMethod(clazz, mid, jprimitive_byte);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jbyte_obj;
+  }
+};
+
+// The portal class for java.nio.ByteBuffer
+class ByteBufferJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.nio.ByteBuffer
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/nio/ByteBuffer");
+  }
+
+  /**
+   * Get the Java Method: ByteBuffer#allocate
+   *
+   * @param env A pointer to the Java environment
+   * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or
+   * nullptr
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getAllocateMethodId(JNIEnv* env,
+                                       jclass jbytebuffer_clazz = nullptr) {
+    const jclass jclazz =
+        jbytebuffer_clazz == nullptr ? getJClass(env) : jbytebuffer_clazz;
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetStaticMethodID(jclazz, "allocate", "(I)Ljava/nio/ByteBuffer;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: ByteBuffer#array
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getArrayMethodId(JNIEnv* env,
+                                    jclass jbytebuffer_clazz = nullptr) {
+    const jclass jclazz =
+        jbytebuffer_clazz == nullptr ? getJClass(env) : jbytebuffer_clazz;
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "array", "()[B");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  static jobject construct(JNIEnv* env, const bool direct,
+                           const size_t capacity,
+                           jclass jbytebuffer_clazz = nullptr) {
+    return constructWith(env, direct, nullptr, capacity, jbytebuffer_clazz);
+  }
+
+  static jobject constructWith(JNIEnv* env, const bool direct, const char* buf,
+                               const size_t capacity,
+                               jclass jbytebuffer_clazz = nullptr) {
+    if (direct) {
+      bool allocated = false;
+      if (buf == nullptr) {
+        buf = new char[capacity];
+        allocated = true;
+      }
+      jobject jbuf = env->NewDirectByteBuffer(const_cast<char*>(buf),
+                                              static_cast<jlong>(capacity));
+      if (jbuf == nullptr) {
+        // exception occurred
+        if (allocated) {
+          delete[] static_cast<const char*>(buf);
+        }
+        return nullptr;
+      }
+      return jbuf;
+    } else {
+      const jclass jclazz =
+          jbytebuffer_clazz == nullptr ? getJClass(env) : jbytebuffer_clazz;
+      if (jclazz == nullptr) {
+        // exception occurred accessing class
+        return nullptr;
+      }
+      const jmethodID jmid_allocate =
+          getAllocateMethodId(env, jbytebuffer_clazz);
+      if (jmid_allocate == nullptr) {
+        // exception occurred accessing class, or NoSuchMethodException or
+        // OutOfMemoryError
+        return nullptr;
+      }
+      const jobject jbuf = env->CallStaticObjectMethod(
+          jclazz, jmid_allocate, static_cast<jint>(capacity));
+      if (env->ExceptionCheck()) {
+        // exception occurred
+        return nullptr;
+      }
+
+      // set buffer data?
+      if (buf != nullptr) {
+        jbyteArray jarray = array(env, jbuf, jbytebuffer_clazz);
+        if (jarray == nullptr) {
+          // exception occurred
+          env->DeleteLocalRef(jbuf);
+          return nullptr;
+        }
+
+        jboolean is_copy = JNI_FALSE;
+        jbyte* ja = reinterpret_cast<jbyte*>(
+            env->GetPrimitiveArrayCritical(jarray, &is_copy));
+        if (ja == nullptr) {
+          // exception occurred
+          env->DeleteLocalRef(jarray);
+          env->DeleteLocalRef(jbuf);
+          return nullptr;
+        }
+
+        memcpy(ja, const_cast<char*>(buf), capacity);
+
+        env->ReleasePrimitiveArrayCritical(jarray, ja, 0);
+
+        env->DeleteLocalRef(jarray);
+      }
+
+      return jbuf;
+    }
+  }
+
+  static jbyteArray array(JNIEnv* env, const jobject& jbyte_buffer,
+                          jclass jbytebuffer_clazz = nullptr) {
+    const jmethodID mid = getArrayMethodId(env, jbytebuffer_clazz);
+    if (mid == nullptr) {
+      // exception occurred accessing class, or NoSuchMethodException or
+      // OutOfMemoryError
+      return nullptr;
+    }
+    const jobject jarray = env->CallObjectMethod(jbyte_buffer, mid);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+    return static_cast<jbyteArray>(jarray);
+  }
+};
+
+// The portal class for java.lang.Integer
+class IntegerJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.lang.Integer
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/lang/Integer");
+  }
+
+  static jobject valueOf(JNIEnv* env, jint jprimitive_int) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid =
+        env->GetStaticMethodID(jclazz, "valueOf", "(I)Ljava/lang/Integer;");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    const jobject jinteger_obj =
+        env->CallStaticObjectMethod(jclazz, mid, jprimitive_int);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jinteger_obj;
+  }
+};
+
+// The portal class for java.lang.Long
+class LongJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.lang.Long
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/lang/Long");
+  }
+
+  static jobject valueOf(JNIEnv* env, jlong jprimitive_long) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid =
+        env->GetStaticMethodID(jclazz, "valueOf", "(J)Ljava/lang/Long;");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    const jobject jlong_obj =
+        env->CallStaticObjectMethod(jclazz, mid, jprimitive_long);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jlong_obj;
+  }
+};
+
+// The portal class for java.lang.StringBuilder
+class StringBuilderJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.lang.StringBuilder
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/lang/StringBuilder");
+  }
+
+  /**
+   * Get the Java Method: StringBuilder#append
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getListAddMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "append", "(Ljava/lang/String;)Ljava/lang/StringBuilder;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Appends a C-style string to a StringBuilder
+   *
+   * @param env A pointer to the Java environment
+   * @param jstring_builder Reference to a java.lang.StringBuilder
+   * @param c_str A C-style string to append to the StringBuilder
+   *
+   * @return A reference to the updated StringBuilder, or a nullptr if
+   *     an exception occurs
+   */
+  static jobject append(JNIEnv* env, jobject jstring_builder,
+                        const char* c_str) {
+    jmethodID mid = getListAddMethodId(env);
+    if (mid == nullptr) {
+      // exception occurred accessing class or method
+      return nullptr;
+    }
+
+    jstring new_value_str = env->NewStringUTF(c_str);
+    if (new_value_str == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jresult_string_builder =
+        env->CallObjectMethod(jstring_builder, mid, new_value_str);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(new_value_str);
+      return nullptr;
+    }
+
+    return jresult_string_builder;
+  }
+};
+
+// various utility functions for working with RocksDB and JNI
+class JniUtil {
+ public:
+  /**
+   * Detect if jlong overflows size_t
+   *
+   * @param jvalue the jlong value
+   *
+   * @return
+   */
+  inline static Status check_if_jlong_fits_size_t(const jlong& jvalue) {
+    Status s = Status::OK();
+    if (static_cast<uint64_t>(jvalue) > std::numeric_limits<size_t>::max()) {
+      s = Status::InvalidArgument(Slice("jlong overflows 32 bit value."));
+    }
+    return s;
+  }
+
+  /**
+   * Obtains a reference to the JNIEnv from
+   * the JVM
+   *
+   * If the current thread is not attached to the JavaVM
+   * then it will be attached so as to retrieve the JNIEnv
+   *
+   * If a thread is attached, it must later be manually
+   * released by calling JavaVM::DetachCurrentThread.
+   * This can be handled by always matching calls to this
+   * function with calls to {@link JniUtil::releaseJniEnv(JavaVM*, jboolean)}
+   *
+   * @param jvm (IN) A pointer to the JavaVM instance
+   * @param attached (OUT) A pointer to a boolean which
+   *     will be set to JNI_TRUE if we had to attach the thread
+   *
+   * @return A pointer to the JNIEnv or nullptr if a fatal error
+   *     occurs and the JNIEnv cannot be retrieved
+   */
+  static JNIEnv* getJniEnv(JavaVM* jvm, jboolean* attached) {
+    assert(jvm != nullptr);
+
+    JNIEnv* env;
+    const jint env_rs =
+        jvm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6);
+
+    if (env_rs == JNI_OK) {
+      // current thread is already attached, return the JNIEnv
+      *attached = JNI_FALSE;
+      return env;
+    } else if (env_rs == JNI_EDETACHED) {
+      // current thread is not attached, attempt to attach
+      const jint rs_attach =
+          jvm->AttachCurrentThread(reinterpret_cast<void**>(&env), NULL);
+      if (rs_attach == JNI_OK) {
+        *attached = JNI_TRUE;
+        return env;
+      } else {
+        // error, could not attach the thread
+        std::cerr << "JniUtil::getJniEnv - Fatal: could not attach current "
+                     "thread to JVM!"
+                  << std::endl;
+        return nullptr;
+      }
+    } else if (env_rs == JNI_EVERSION) {
+      // error, JDK does not support JNI_VERSION_1_6+
+      std::cerr
+          << "JniUtil::getJniEnv - Fatal: JDK does not support JNI_VERSION_1_6"
+          << std::endl;
+      return nullptr;
+    } else {
+      std::cerr << "JniUtil::getJniEnv - Fatal: Unknown error: env_rs="
+                << env_rs << std::endl;
+      return nullptr;
+    }
+  }
+
+  /**
+   * Counterpart to {@link JniUtil::getJniEnv(JavaVM*, jboolean*)}
+   *
+   * Detachess the current thread from the JVM if it was previously
+   * attached
+   *
+   * @param jvm (IN) A pointer to the JavaVM instance
+   * @param attached (IN) JNI_TRUE if we previously had to attach the thread
+   *     to the JavaVM to get the JNIEnv
+   */
+  static void releaseJniEnv(JavaVM* jvm, jboolean& attached) {
+    assert(jvm != nullptr);
+    if (attached == JNI_TRUE) {
+      const jint rs_detach = jvm->DetachCurrentThread();
+      assert(rs_detach == JNI_OK);
+      if (rs_detach != JNI_OK) {
+        std::cerr << "JniUtil::getJniEnv - Warn: Unable to detach current "
+                     "thread from JVM!"
+                  << std::endl;
+      }
+    }
+  }
+
+  /**
+   * Copies a Java String[] to a C++ std::vector<std::string>
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param jss (IN) The Java String array to copy
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an OutOfMemoryError or ArrayIndexOutOfBoundsException
+   *     exception occurs
+   *
+   * @return A std::vector<std:string> containing copies of the Java strings
+   */
+  static std::vector<std::string> copyStrings(JNIEnv* env, jobjectArray jss,
+                                              jboolean* has_exception) {
+    return ROCKSDB_NAMESPACE::JniUtil::copyStrings(
+        env, jss, env->GetArrayLength(jss), has_exception);
+  }
+
+  /**
+   * Copies a Java String[] to a C++ std::vector<std::string>
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param jss (IN) The Java String array to copy
+   * @param jss_len (IN) The length of the Java String array to copy
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an OutOfMemoryError or ArrayIndexOutOfBoundsException
+   *     exception occurs
+   *
+   * @return A std::vector<std:string> containing copies of the Java strings
+   */
+  static std::vector<std::string> copyStrings(JNIEnv* env, jobjectArray jss,
+                                              const jsize jss_len,
+                                              jboolean* has_exception) {
+    std::vector<std::string> strs;
+    strs.reserve(jss_len);
+    for (jsize i = 0; i < jss_len; i++) {
+      jobject js = env->GetObjectArrayElement(jss, i);
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        *has_exception = JNI_TRUE;
+        return strs;
+      }
+
+      jstring jstr = static_cast<jstring>(js);
+      const char* str = env->GetStringUTFChars(jstr, nullptr);
+      if (str == nullptr) {
+        // exception thrown: OutOfMemoryError
+        env->DeleteLocalRef(js);
+        *has_exception = JNI_TRUE;
+        return strs;
+      }
+
+      strs.push_back(std::string(str));
+
+      env->ReleaseStringUTFChars(jstr, str);
+      env->DeleteLocalRef(js);
+    }
+
+    *has_exception = JNI_FALSE;
+    return strs;
+  }
+
+  /**
+   * Copies a jstring to a C-style null-terminated byte string
+   * and releases the original jstring
+   *
+   * The jstring is copied as UTF-8
+   *
+   * If an exception occurs, then JNIEnv::ExceptionCheck()
+   * will have been called
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param js (IN) The java string to copy
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an OutOfMemoryError exception occurs
+   *
+   * @return A pointer to the copied string, or a
+   *     nullptr if has_exception == JNI_TRUE
+   */
+  static std::unique_ptr<char[]> copyString(JNIEnv* env, jstring js,
+                                            jboolean* has_exception) {
+    const char* utf = env->GetStringUTFChars(js, nullptr);
+    if (utf == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->ExceptionCheck();
+      *has_exception = JNI_TRUE;
+      return nullptr;
+    } else if (env->ExceptionCheck()) {
+      // exception thrown
+      env->ReleaseStringUTFChars(js, utf);
+      *has_exception = JNI_TRUE;
+      return nullptr;
+    }
+
+    const jsize utf_len = env->GetStringUTFLength(js);
+    std::unique_ptr<char[]> str(
+        new char[utf_len +
+                 1]);  // Note: + 1 is needed for the c_str null terminator
+    std::strcpy(str.get(), utf);
+    env->ReleaseStringUTFChars(js, utf);
+    *has_exception = JNI_FALSE;
+    return str;
+  }
+
+  /**
+   * Copies a jstring to a std::string
+   * and releases the original jstring
+   *
+   * If an exception occurs, then JNIEnv::ExceptionCheck()
+   * will have been called
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param js (IN) The java string to copy
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an OutOfMemoryError exception occurs
+   *
+   * @return A std:string copy of the jstring, or an
+   *     empty std::string if has_exception == JNI_TRUE
+   */
+  static std::string copyStdString(JNIEnv* env, jstring js,
+                                   jboolean* has_exception) {
+    const char* utf = env->GetStringUTFChars(js, nullptr);
+    if (utf == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->ExceptionCheck();
+      *has_exception = JNI_TRUE;
+      return std::string();
+    } else if (env->ExceptionCheck()) {
+      // exception thrown
+      env->ReleaseStringUTFChars(js, utf);
+      *has_exception = JNI_TRUE;
+      return std::string();
+    }
+
+    std::string name(utf);
+    env->ReleaseStringUTFChars(js, utf);
+    *has_exception = JNI_FALSE;
+    return name;
+  }
+
+  /**
+   * Copies bytes from a std::string to a jByteArray
+   *
+   * @param env A pointer to the java environment
+   * @param bytes The bytes to copy
+   *
+   * @return the Java byte[], or nullptr if an exception occurs
+   *
+   * @throws RocksDBException thrown
+   *   if memory size to copy exceeds general java specific array size
+   * limitation.
+   */
+  static jbyteArray copyBytes(JNIEnv* env, std::string bytes) {
+    return createJavaByteArrayWithSizeCheck(env, bytes.c_str(), bytes.size());
+  }
+
+  /**
+   * Given a Java byte[][] which is an array of java.lang.Strings
+   * where each String is a byte[], the passed function `string_fn`
+   * will be called on each String, the result is the collected by
+   * calling the passed function `collector_fn`
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param jbyte_strings (IN) A Java array of Strings expressed as bytes
+   * @param string_fn (IN) A transform function to call for each String
+   * @param collector_fn (IN) A collector which is called for the result
+   *     of each `string_fn`
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an ArrayIndexOutOfBoundsException or OutOfMemoryError
+   *     exception occurs
+   */
+  template <typename T>
+  static void byteStrings(JNIEnv* env, jobjectArray jbyte_strings,
+                          std::function<T(const char*, const size_t)> string_fn,
+                          std::function<void(size_t, T)> collector_fn,
+                          jboolean* has_exception) {
+    const jsize jlen = env->GetArrayLength(jbyte_strings);
+
+    for (jsize i = 0; i < jlen; i++) {
+      jobject jbyte_string_obj = env->GetObjectArrayElement(jbyte_strings, i);
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        *has_exception = JNI_TRUE;  // signal error
+        return;
+      }
+
+      jbyteArray jbyte_string_ary =
+          reinterpret_cast<jbyteArray>(jbyte_string_obj);
+      T result = byteString(env, jbyte_string_ary, string_fn, has_exception);
+
+      env->DeleteLocalRef(jbyte_string_obj);
+
+      if (*has_exception == JNI_TRUE) {
+        // exception thrown: OutOfMemoryError
+        return;
+      }
+
+      collector_fn(i, result);
+    }
+
+    *has_exception = JNI_FALSE;
+  }
+
+  /**
+   * Given a Java String which is expressed as a Java Byte Array byte[],
+   * the passed function `string_fn` will be called on the String
+   * and the result returned
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param jbyte_string_ary (IN) A Java String expressed in bytes
+   * @param string_fn (IN) A transform function to call on the String
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an OutOfMemoryError exception occurs
+   */
+  template <typename T>
+  static T byteString(JNIEnv* env, jbyteArray jbyte_string_ary,
+                      std::function<T(const char*, const size_t)> string_fn,
+                      jboolean* has_exception) {
+    const jsize jbyte_string_len = env->GetArrayLength(jbyte_string_ary);
+    return byteString<T>(env, jbyte_string_ary, jbyte_string_len, string_fn,
+                         has_exception);
+  }
+
+  /**
+   * Given a Java String which is expressed as a Java Byte Array byte[],
+   * the passed function `string_fn` will be called on the String
+   * and the result returned
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param jbyte_string_ary (IN) A Java String expressed in bytes
+   * @param jbyte_string_len (IN) The length of the Java String
+   *     expressed in bytes
+   * @param string_fn (IN) A transform function to call on the String
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an OutOfMemoryError exception occurs
+   */
+  template <typename T>
+  static T byteString(JNIEnv* env, jbyteArray jbyte_string_ary,
+                      const jsize jbyte_string_len,
+                      std::function<T(const char*, const size_t)> string_fn,
+                      jboolean* has_exception) {
+    jbyte* jbyte_string = env->GetByteArrayElements(jbyte_string_ary, nullptr);
+    if (jbyte_string == nullptr) {
+      // exception thrown: OutOfMemoryError
+      *has_exception = JNI_TRUE;
+      return nullptr;  // signal error
+    }
+
+    T result =
+        string_fn(reinterpret_cast<char*>(jbyte_string), jbyte_string_len);
+
+    env->ReleaseByteArrayElements(jbyte_string_ary, jbyte_string, JNI_ABORT);
+
+    *has_exception = JNI_FALSE;
+    return result;
+  }
+
+  /**
+   * Converts a std::vector<string> to a Java byte[][] where each Java String
+   * is expressed as a Java Byte Array byte[].
+   *
+   * @param env A pointer to the java environment
+   * @param strings A vector of Strings
+   *
+   * @return A Java array of Strings expressed as bytes,
+   *     or nullptr if an exception is thrown
+   */
+  static jobjectArray stringsBytes(JNIEnv* env,
+                                   std::vector<std::string> strings) {
+    jclass jcls_ba = ByteJni::getArrayJClass(env);
+    if (jcls_ba == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const jsize len = static_cast<jsize>(strings.size());
+
+    jobjectArray jbyte_strings = env->NewObjectArray(len, jcls_ba, nullptr);
+    if (jbyte_strings == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    for (jsize i = 0; i < len; i++) {
+      std::string* str = &strings[i];
+      const jsize str_len = static_cast<jsize>(str->size());
+
+      jbyteArray jbyte_string_ary = env->NewByteArray(str_len);
+      if (jbyte_string_ary == nullptr) {
+        // exception thrown: OutOfMemoryError
+        env->DeleteLocalRef(jbyte_strings);
+        return nullptr;
+      }
+
+      env->SetByteArrayRegion(
+          jbyte_string_ary, 0, str_len,
+          const_cast<jbyte*>(reinterpret_cast<const jbyte*>(str->c_str())));
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        env->DeleteLocalRef(jbyte_string_ary);
+        env->DeleteLocalRef(jbyte_strings);
+        return nullptr;
+      }
+
+      env->SetObjectArrayElement(jbyte_strings, i, jbyte_string_ary);
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        // or ArrayStoreException
+        env->DeleteLocalRef(jbyte_string_ary);
+        env->DeleteLocalRef(jbyte_strings);
+        return nullptr;
+      }
+
+      env->DeleteLocalRef(jbyte_string_ary);
+    }
+
+    return jbyte_strings;
+  }
+
+  /**
+   * Converts a std::vector<std::string> to a Java String[].
+   *
+   * @param env A pointer to the java environment
+   * @param strings A vector of Strings
+   *
+   * @return A Java array of Strings,
+   *     or nullptr if an exception is thrown
+   */
+  static jobjectArray toJavaStrings(JNIEnv* env,
+                                    const std::vector<std::string>* strings) {
+    jclass jcls_str = env->FindClass("java/lang/String");
+    if (jcls_str == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const jsize len = static_cast<jsize>(strings->size());
+
+    jobjectArray jstrings = env->NewObjectArray(len, jcls_str, nullptr);
+    if (jstrings == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    for (jsize i = 0; i < len; i++) {
+      const std::string* str = &((*strings)[i]);
+      jstring js = ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, str);
+      if (js == nullptr) {
+        env->DeleteLocalRef(jstrings);
+        return nullptr;
+      }
+
+      env->SetObjectArrayElement(jstrings, i, js);
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        // or ArrayStoreException
+        env->DeleteLocalRef(js);
+        env->DeleteLocalRef(jstrings);
+        return nullptr;
+      }
+    }
+
+    return jstrings;
+  }
+
+  /**
+   * Creates a Java UTF String from a C++ std::string
+   *
+   * @param env A pointer to the java environment
+   * @param string the C++ std::string
+   * @param treat_empty_as_null true if empty strings should be treated as null
+   *
+   * @return the Java UTF string, or nullptr if the provided string
+   *     is null (or empty and treat_empty_as_null is set), or if an
+   *     exception occurs allocating the Java String.
+   */
+  static jstring toJavaString(JNIEnv* env, const std::string* string,
+                              const bool treat_empty_as_null = false) {
+    if (string == nullptr) {
+      return nullptr;
+    }
+
+    if (treat_empty_as_null && string->empty()) {
+      return nullptr;
+    }
+
+    return env->NewStringUTF(string->c_str());
+  }
+
+  /**
+   * Copies bytes to a new jByteArray with the check of java array size
+   * limitation.
+   *
+   * @param bytes pointer to memory to copy to a new jByteArray
+   * @param size number of bytes to copy
+   *
+   * @return the Java byte[], or nullptr if an exception occurs
+   *
+   * @throws RocksDBException thrown
+   *   if memory size to copy exceeds general java array size limitation to
+   * avoid overflow.
+   */
+  static jbyteArray createJavaByteArrayWithSizeCheck(JNIEnv* env,
+                                                     const char* bytes,
+                                                     const size_t size) {
+    // Limitation for java array size is vm specific
+    // In general it cannot exceed Integer.MAX_VALUE (2^31 - 1)
+    // Current HotSpot VM limitation for array size is Integer.MAX_VALUE - 5
+    // (2^31 - 1 - 5) It means that the next call to env->NewByteArray can still
+    // end with OutOfMemoryError("Requested array size exceeds VM limit") coming
+    // from VM
+    static const size_t MAX_JARRAY_SIZE = (static_cast<size_t>(1)) << 31;
+    if (size > MAX_JARRAY_SIZE) {
+      ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+          env, "Requested array size exceeds VM limit");
+      return nullptr;
+    }
+
+    const jsize jlen = static_cast<jsize>(size);
+    jbyteArray jbytes = env->NewByteArray(jlen);
+    if (jbytes == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    env->SetByteArrayRegion(
+        jbytes, 0, jlen,
+        const_cast<jbyte*>(reinterpret_cast<const jbyte*>(bytes)));
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jbytes);
+      return nullptr;
+    }
+
+    return jbytes;
+  }
+
+  /**
+   * Copies bytes from a ROCKSDB_NAMESPACE::Slice to a jByteArray
+   *
+   * @param env A pointer to the java environment
+   * @param bytes The bytes to copy
+   *
+   * @return the Java byte[] or nullptr if an exception occurs
+   *
+   * @throws RocksDBException thrown
+   *   if memory size to copy exceeds general java specific array size
+   * limitation.
+   */
+  static jbyteArray copyBytes(JNIEnv* env, const Slice& bytes) {
+    return createJavaByteArrayWithSizeCheck(env, bytes.data(), bytes.size());
+  }
+
+  /*
+   * Helper for operations on a key and value
+   * for example WriteBatch->Put
+   *
+   * TODO(AR) could be used for RocksDB->Put etc.
+   */
+  static std::unique_ptr<ROCKSDB_NAMESPACE::Status> kv_op(
+      std::function<ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Slice,
+                                              ROCKSDB_NAMESPACE::Slice)>
+          op,
+      JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len,
+      jbyteArray jvalue, jint jvalue_len) {
+    jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+    if (env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    jbyte* value = env->GetByteArrayElements(jvalue, nullptr);
+    if (env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      if (key != nullptr) {
+        env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+      }
+      return nullptr;
+    }
+
+    ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+    ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char*>(value),
+                                         jvalue_len);
+
+    auto status = op(key_slice, value_slice);
+
+    if (value != nullptr) {
+      env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
+    }
+    if (key != nullptr) {
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    }
+
+    return std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+        new ROCKSDB_NAMESPACE::Status(status));
+  }
+
+  /*
+   * Helper for operations on a key
+   * for example WriteBatch->Delete
+   *
+   * TODO(AR) could be used for RocksDB->Delete etc.
+   */
+  static std::unique_ptr<ROCKSDB_NAMESPACE::Status> k_op(
+      std::function<ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Slice)> op,
+      JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len) {
+    jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+    if (env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+    auto status = op(key_slice);
+
+    if (key != nullptr) {
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    }
+
+    return std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+        new ROCKSDB_NAMESPACE::Status(status));
+  }
+
+  /*
+   * Helper for operations on a key which is a region of an array
+   * Used to extract the common code from seek/seekForPrev.
+   * Possible that it can be generalised from that.
+   *
+   * We use GetByteArrayRegion to copy the key region of the whole array into
+   * a char[] We suspect this is not much slower than GetByteArrayElements,
+   * which probably copies anyway.
+   */
+  static void k_op_region(std::function<void(ROCKSDB_NAMESPACE::Slice&)> op,
+                          JNIEnv* env, jbyteArray jkey, jint jkey_off,
+                          jint jkey_len) {
+    const std::unique_ptr<char[]> key(new char[jkey_len]);
+    if (key == nullptr) {
+      jclass oom_class = env->FindClass("/lang/java/OutOfMemoryError");
+      env->ThrowNew(oom_class,
+                    "Memory allocation failed in RocksDB JNI function");
+      return;
+    }
+    env->GetByteArrayRegion(jkey, jkey_off, jkey_len,
+                            reinterpret_cast<jbyte*>(key.get()));
+    if (env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      return;
+    }
+
+    ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key.get()),
+                                       jkey_len);
+    op(key_slice);
+  }
+
+  /*
+   * Helper for operations on a value
+   * for example WriteBatchWithIndex->GetFromBatch
+   */
+  static jbyteArray v_op(std::function<ROCKSDB_NAMESPACE::Status(
+                             ROCKSDB_NAMESPACE::Slice, std::string*)>
+                             op,
+                         JNIEnv* env, jbyteArray jkey, jint jkey_len) {
+    jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+    if (env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+    std::string value;
+    ROCKSDB_NAMESPACE::Status s = op(key_slice, &value);
+
+    if (key != nullptr) {
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    }
+
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+
+    if (s.ok()) {
+      jbyteArray jret_value =
+          env->NewByteArray(static_cast<jsize>(value.size()));
+      if (jret_value == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return nullptr;
+      }
+
+      env->SetByteArrayRegion(
+          jret_value, 0, static_cast<jsize>(value.size()),
+          const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value.c_str())));
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        if (jret_value != nullptr) {
+          env->DeleteLocalRef(jret_value);
+        }
+        return nullptr;
+      }
+
+      return jret_value;
+    }
+
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  /**
+   * Creates a vector<T*> of C++ pointers from
+   *     a Java array of C++ pointer addresses.
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param pointers (IN) A Java array of C++ pointer addresses
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an ArrayIndexOutOfBoundsException or OutOfMemoryError
+   *     exception occurs.
+   *
+   * @return A vector of C++ pointers.
+   */
+  template <typename T>
+  static std::vector<T*> fromJPointers(JNIEnv* env, jlongArray jptrs,
+                                       jboolean* has_exception) {
+    const jsize jptrs_len = env->GetArrayLength(jptrs);
+    std::vector<T*> ptrs;
+    jlong* jptr = env->GetLongArrayElements(jptrs, nullptr);
+    if (jptr == nullptr) {
+      // exception thrown: OutOfMemoryError
+      *has_exception = JNI_TRUE;
+      return ptrs;
+    }
+    ptrs.reserve(jptrs_len);
+    for (jsize i = 0; i < jptrs_len; i++) {
+      ptrs.push_back(reinterpret_cast<T*>(jptr[i]));
+    }
+    env->ReleaseLongArrayElements(jptrs, jptr, JNI_ABORT);
+    return ptrs;
+  }
+
+  /**
+   * Creates a Java array of C++ pointer addresses
+   *     from a vector of C++ pointers.
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param pointers (IN) A vector of C++ pointers
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an ArrayIndexOutOfBoundsException or OutOfMemoryError
+   *     exception occurs
+   *
+   * @return Java array of C++ pointer addresses.
+   */
+  template <typename T>
+  static jlongArray toJPointers(JNIEnv* env, const std::vector<T*>& pointers,
+                                jboolean* has_exception) {
+    const jsize len = static_cast<jsize>(pointers.size());
+    std::unique_ptr<jlong[]> results(new jlong[len]);
+    std::transform(
+        pointers.begin(), pointers.end(), results.get(),
+        [](T* pointer) -> jlong { return GET_CPLUSPLUS_POINTER(pointer); });
+
+    jlongArray jpointers = env->NewLongArray(len);
+    if (jpointers == nullptr) {
+      // exception thrown: OutOfMemoryError
+      *has_exception = JNI_TRUE;
+      return nullptr;
+    }
+
+    env->SetLongArrayRegion(jpointers, 0, len, results.get());
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      *has_exception = JNI_TRUE;
+      env->DeleteLocalRef(jpointers);
+      return nullptr;
+    }
+
+    *has_exception = JNI_FALSE;
+
+    return jpointers;
+  }
+
+  /*
+   * Helper for operations on a key and value
+   * for example WriteBatch->Put
+   *
+   * TODO(AR) could be extended to cover returning ROCKSDB_NAMESPACE::Status
+   * from `op` and used for RocksDB->Put etc.
+   */
+  static void kv_op_direct(
+      std::function<void(ROCKSDB_NAMESPACE::Slice&, ROCKSDB_NAMESPACE::Slice&)>
+          op,
+      JNIEnv* env, jobject jkey, jint jkey_off, jint jkey_len, jobject jval,
+      jint jval_off, jint jval_len) {
+    char* key = reinterpret_cast<char*>(env->GetDirectBufferAddress(jkey));
+    if (key == nullptr ||
+        env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) {
+      ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env,
+                                                       "Invalid key argument");
+      return;
+    }
+
+    char* value = reinterpret_cast<char*>(env->GetDirectBufferAddress(jval));
+    if (value == nullptr ||
+        env->GetDirectBufferCapacity(jval) < (jval_off + jval_len)) {
+      ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+          env, "Invalid value argument");
+      return;
+    }
+
+    key += jkey_off;
+    value += jval_off;
+
+    ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len);
+    ROCKSDB_NAMESPACE::Slice value_slice(value, jval_len);
+
+    op(key_slice, value_slice);
+  }
+
+  /*
+   * Helper for operations on a key and value
+   * for example WriteBatch->Delete
+   *
+   * TODO(AR) could be extended to cover returning ROCKSDB_NAMESPACE::Status
+   * from `op` and used for RocksDB->Delete etc.
+   */
+  static void k_op_direct(std::function<void(ROCKSDB_NAMESPACE::Slice&)> op,
+                          JNIEnv* env, jobject jkey, jint jkey_off,
+                          jint jkey_len) {
+    char* key = reinterpret_cast<char*>(env->GetDirectBufferAddress(jkey));
+    if (key == nullptr ||
+        env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) {
+      ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env,
+                                                       "Invalid key argument");
+      return;
+    }
+
+    key += jkey_off;
+
+    ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len);
+
+    return op(key_slice);
+  }
+
+  template <class T>
+  static jint copyToDirect(JNIEnv* env, T& source, jobject jtarget,
+                           jint jtarget_off, jint jtarget_len) {
+    char* target =
+        reinterpret_cast<char*>(env->GetDirectBufferAddress(jtarget));
+    if (target == nullptr ||
+        env->GetDirectBufferCapacity(jtarget) < (jtarget_off + jtarget_len)) {
+      ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+          env, "Invalid target argument");
+      return 0;
+    }
+
+    target += jtarget_off;
+
+    const jint cvalue_len = static_cast<jint>(source.size());
+    const jint length = std::min(jtarget_len, cvalue_len);
+
+    memcpy(target, source.data(), length);
+
+    return cvalue_len;
+  }
+};
+
+class MapJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.util.Map
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/util/Map");
+  }
+
+  /**
+   * Get the Java Method: Map#put
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMapPutMethodId(JNIEnv* env) {
+    jclass jlist_clazz = getJClass(env);
+    if (jlist_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(
+        jlist_clazz, "put",
+        "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+class HashMapJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class java.util.HashMap
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "java/util/HashMap");
+  }
+
+  /**
+   * Create a new Java java.util.HashMap object.
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java java.util.HashMap object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const uint32_t initial_capacity = 16) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "(I)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jhash_map =
+        env->NewObject(jclazz, mid, static_cast<jint>(initial_capacity));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+
+  /**
+   * A function which maps a std::pair<K,V> to a std::pair<JK, JV>
+   *
+   * @return Either a pointer to a std::pair<jobject, jobject>, or nullptr
+   *     if an error occurs during the mapping
+   */
+  template <typename K, typename V, typename JK, typename JV>
+  using FnMapKV =
+      std::function<std::unique_ptr<std::pair<JK, JV>>(const std::pair<K, V>&)>;
+
+  // template <class I, typename K, typename V, typename K1, typename V1,
+  // typename std::enable_if<std::is_same<typename
+  // std::iterator_traits<I>::value_type, std::pair<const K,V>>::value,
+  // int32_t>::type = 0> static void putAll(JNIEnv* env, const jobject
+  // jhash_map, I iterator, const FnMapKV<const K,V,K1,V1> &fn_map_kv) {
+  /**
+   * Returns true if it succeeds, false if an error occurs
+   */
+  template <class iterator_type, typename K, typename V>
+  static bool putAll(JNIEnv* env, const jobject jhash_map,
+                     iterator_type iterator, iterator_type end,
+                     const FnMapKV<K, V, jobject, jobject>& fn_map_kv) {
+    const jmethodID jmid_put =
+        ROCKSDB_NAMESPACE::MapJni::getMapPutMethodId(env);
+    if (jmid_put == nullptr) {
+      return false;
+    }
+
+    for (auto it = iterator; it != end; ++it) {
+      const std::unique_ptr<std::pair<jobject, jobject>> result =
+          fn_map_kv(*it);
+      if (result == nullptr) {
+        // an error occurred during fn_map_kv
+        return false;
+      }
+      env->CallObjectMethod(jhash_map, jmid_put, result->first, result->second);
+      if (env->ExceptionCheck()) {
+        // exception occurred
+        env->DeleteLocalRef(result->second);
+        env->DeleteLocalRef(result->first);
+        return false;
+      }
+
+      // release local references
+      env->DeleteLocalRef(result->second);
+      env->DeleteLocalRef(result->first);
+    }
+
+    return true;
+  }
+
+  /**
+   * Creates a java.util.Map<String, String> from a std::map<std::string,
+   * std::string>
+   *
+   * @param env A pointer to the Java environment
+   * @param map the Cpp map
+   *
+   * @return a reference to the Java java.util.Map object, or nullptr if an
+   * exception occcurred
+   */
+  static jobject fromCppMap(JNIEnv* env,
+                            const std::map<std::string, std::string>* map) {
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+        const std::string, const std::string, jobject, jobject>
+        fn_map_kv =
+            [env](const std::pair<const std::string, const std::string>& kv) {
+              jstring jkey = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                  env, &(kv.first), false);
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              jstring jvalue = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                  env, &(kv.second), true);
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                env->DeleteLocalRef(jkey);
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              return std::unique_ptr<std::pair<jobject, jobject>>(
+                  new std::pair<jobject, jobject>(
+                      static_cast<jobject>(jkey),
+                      static_cast<jobject>(jvalue)));
+            };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+
+  /**
+   * Creates a java.util.Map<String, Long> from a std::map<std::string,
+   * uint32_t>
+   *
+   * @param env A pointer to the Java environment
+   * @param map the Cpp map
+   *
+   * @return a reference to the Java java.util.Map object, or nullptr if an
+   * exception occcurred
+   */
+  static jobject fromCppMap(JNIEnv* env,
+                            const std::map<std::string, uint32_t>* map) {
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+        const std::string, const uint32_t, jobject, jobject>
+        fn_map_kv =
+            [env](const std::pair<const std::string, const uint32_t>& kv) {
+              jstring jkey = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                  env, &(kv.first), false);
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              jobject jvalue = ROCKSDB_NAMESPACE::IntegerJni::valueOf(
+                  env, static_cast<jint>(kv.second));
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                env->DeleteLocalRef(jkey);
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              return std::unique_ptr<std::pair<jobject, jobject>>(
+                  new std::pair<jobject, jobject>(static_cast<jobject>(jkey),
+                                                  jvalue));
+            };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+
+  /**
+   * Creates a java.util.Map<String, Long> from a std::map<std::string,
+   * uint64_t>
+   *
+   * @param env A pointer to the Java environment
+   * @param map the Cpp map
+   *
+   * @return a reference to the Java java.util.Map object, or nullptr if an
+   * exception occcurred
+   */
+  static jobject fromCppMap(JNIEnv* env,
+                            const std::map<std::string, uint64_t>* map) {
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+        const std::string, const uint64_t, jobject, jobject>
+        fn_map_kv =
+            [env](const std::pair<const std::string, const uint64_t>& kv) {
+              jstring jkey = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                  env, &(kv.first), false);
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              jobject jvalue = ROCKSDB_NAMESPACE::LongJni::valueOf(
+                  env, static_cast<jlong>(kv.second));
+              if (env->ExceptionCheck()) {
+                // an error occurred
+                env->DeleteLocalRef(jkey);
+                return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+              }
+
+              return std::unique_ptr<std::pair<jobject, jobject>>(
+                  new std::pair<jobject, jobject>(static_cast<jobject>(jkey),
+                                                  jvalue));
+            };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+
+  /**
+   * Creates a java.util.Map<String, Long> from a std::map<uint32_t, uint64_t>
+   *
+   * @param env A pointer to the Java environment
+   * @param map the Cpp map
+   *
+   * @return a reference to the Java java.util.Map object, or nullptr if an
+   * exception occcurred
+   */
+  static jobject fromCppMap(JNIEnv* env,
+                            const std::map<uint32_t, uint64_t>* map) {
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<const uint32_t, const uint64_t,
+                                                 jobject, jobject>
+        fn_map_kv = [env](const std::pair<const uint32_t, const uint64_t>& kv) {
+          jobject jkey = ROCKSDB_NAMESPACE::IntegerJni::valueOf(
+              env, static_cast<jint>(kv.first));
+          if (env->ExceptionCheck()) {
+            // an error occurred
+            return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+          }
+
+          jobject jvalue = ROCKSDB_NAMESPACE::LongJni::valueOf(
+              env, static_cast<jlong>(kv.second));
+          if (env->ExceptionCheck()) {
+            // an error occurred
+            env->DeleteLocalRef(jkey);
+            return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+          }
+
+          return std::unique_ptr<std::pair<jobject, jobject>>(
+              new std::pair<jobject, jobject>(static_cast<jobject>(jkey),
+                                              jvalue));
+        };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
+  }
+};
+
+// The portal class for org.rocksdb.RocksDB
+class RocksDBJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::DB*, RocksDBJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.RocksDB
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksDB");
+  }
+};
+
+// The portal class for org.rocksdb.Options
+class OptionsJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::Options*, OptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Options
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Options");
+  }
+};
+
+// The portal class for org.rocksdb.DBOptions
+class DBOptionsJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::DBOptions*, DBOptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.DBOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/DBOptions");
+  }
+};
+
+// The portal class for org.rocksdb.ColumnFamilyOptions
+class ColumnFamilyOptionsJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::ColumnFamilyOptions*,
+                                ColumnFamilyOptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.ColumnFamilyOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+                                         "org/rocksdb/ColumnFamilyOptions");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.ColumnFamilyOptions object with the same
+   * properties as the provided C++ ROCKSDB_NAMESPACE::ColumnFamilyOptions
+   * object
+   *
+   * @param env A pointer to the Java environment
+   * @param cfoptions A pointer to ROCKSDB_NAMESPACE::ColumnFamilyOptions object
+   *
+   * @return A reference to a Java org.rocksdb.ColumnFamilyOptions object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const ColumnFamilyOptions* cfoptions) {
+    auto* cfo = new ROCKSDB_NAMESPACE::ColumnFamilyOptions(*cfoptions);
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "(J)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jcfd = env->NewObject(jclazz, mid, GET_CPLUSPLUS_POINTER(cfo));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jcfd;
+  }
+};
+
+// The portal class for org.rocksdb.WriteOptions
+class WriteOptionsJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::WriteOptions*,
+                                WriteOptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WriteOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteOptions");
+  }
+};
+
+// The portal class for org.rocksdb.ReadOptions
+class ReadOptionsJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::ReadOptions*,
+                                ReadOptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.ReadOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/ReadOptions");
+  }
+};
+
+// The portal class for org.rocksdb.WriteBatch
+class WriteBatchJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::WriteBatch*, WriteBatchJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WriteBatch
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteBatch");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.WriteBatch object
+   *
+   * @param env A pointer to the Java environment
+   * @param wb A pointer to ROCKSDB_NAMESPACE::WriteBatch object
+   *
+   * @return A reference to a Java org.rocksdb.WriteBatch object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const WriteBatch* wb) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "(J)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jwb = env->NewObject(jclazz, mid, GET_CPLUSPLUS_POINTER(wb));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jwb;
+  }
+};
+
+// The portal class for org.rocksdb.WriteBatch.Handler
+class WriteBatchHandlerJni
+    : public RocksDBNativeClass<
+          const ROCKSDB_NAMESPACE::WriteBatchHandlerJniCallback*,
+          WriteBatchHandlerJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WriteBatch.Handler
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteBatch$Handler");
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#put
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getPutCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "put", "(I[B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#put
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getPutMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "put", "([B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#merge
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMergeCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "merge", "(I[B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#merge
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMergeMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "merge", "([B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#delete
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getDeleteCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "delete", "(I[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#delete
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getDeleteMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "delete", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#singleDelete
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getSingleDeleteCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "singleDelete", "(I[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#singleDelete
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getSingleDeleteMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "singleDelete", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#deleteRange
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getDeleteRangeCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "deleteRange", "(I[B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#deleteRange
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getDeleteRangeMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "deleteRange", "([B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#logData
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getLogDataMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "logData", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#putBlobIndex
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getPutBlobIndexCfMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "putBlobIndex", "(I[B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markBeginPrepare
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkBeginPrepareMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markBeginPrepare", "()V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markEndPrepare
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkEndPrepareMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markEndPrepare", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markNoop
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkNoopMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markNoop", "(Z)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markRollback
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkRollbackMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markRollback", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markCommit
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkCommitMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "markCommit", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#markCommitWithTimestamp
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getMarkCommitWithTimestampMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "markCommitWithTimestamp", "([B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: WriteBatch.Handler#shouldContinue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getContinueMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "shouldContinue", "()Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+class WriteBatchSavePointJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WriteBatch.SavePoint
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/WriteBatch$SavePoint");
+  }
+
+  /**
+   * Get the Java Method: HistogramData constructor
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getConstructorMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JJJ)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Create a new Java org.rocksdb.WriteBatch.SavePoint object
+   *
+   * @param env A pointer to the Java environment
+   * @param savePoint A pointer to ROCKSDB_NAMESPACE::WriteBatch::SavePoint
+   * object
+   *
+   * @return A reference to a Java org.rocksdb.WriteBatch.SavePoint object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const SavePoint& save_point) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = getConstructorMethodId(env);
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jsave_point =
+        env->NewObject(jclazz, mid, static_cast<jlong>(save_point.size),
+                       static_cast<jlong>(save_point.count),
+                       static_cast<jlong>(save_point.content_flags));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jsave_point;
+  }
+};
+
+// The portal class for org.rocksdb.WriteBatchWithIndex
+class WriteBatchWithIndexJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::WriteBatchWithIndex*,
+                                WriteBatchWithIndexJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WriteBatchWithIndex
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+                                         "org/rocksdb/WriteBatchWithIndex");
+  }
+};
+
+// The portal class for org.rocksdb.HistogramData
+class HistogramDataJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.HistogramData
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/HistogramData");
+  }
+
+  /**
+   * Get the Java Method: HistogramData constructor
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getConstructorMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "(DDDDDDJJD)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.BackupEngineOptions
+class BackupEngineOptionsJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::BackupEngineOptions*,
+                                BackupEngineOptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.BackupEngineOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+                                         "org/rocksdb/BackupEngineOptions");
+  }
+};
+
+// The portal class for org.rocksdb.BackupEngine
+class BackupEngineJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::BackupEngine*,
+                                BackupEngineJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.BackupEngine
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/BackupEngine");
+  }
+};
+
+// The portal class for org.rocksdb.RocksIterator
+class IteratorJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::Iterator*, IteratorJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.RocksIterator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksIterator");
+  }
+};
+
+// The portal class for org.rocksdb.Filter
+class FilterJni
+    : public RocksDBNativeClass<
+          std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy>*, FilterJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Filter
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Filter");
+  }
+};
+
+// The portal class for org.rocksdb.ColumnFamilyHandle
+class ColumnFamilyHandleJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+                                ColumnFamilyHandleJni> {
+ public:
+  static jobject fromCppColumnFamilyHandle(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::ColumnFamilyHandle* info) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    return env->NewObject(jclazz, ctor, GET_CPLUSPLUS_POINTER(info));
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(clazz, "<init>", "(J)V");
+  }
+
+  /**
+   * Get the Java Class org.rocksdb.ColumnFamilyHandle
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/ColumnFamilyHandle");
+  }
+};
+
+// The portal class for org.rocksdb.FlushOptions
+class FlushOptionsJni
+    : public RocksDBNativeClass<ROCKSDB_NAMESPACE::FlushOptions*,
+                                FlushOptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.FlushOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/FlushOptions");
+  }
+};
+
+// The portal class for org.rocksdb.ComparatorOptions
+class ComparatorOptionsJni
+    : public RocksDBNativeClass<
+          ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions*,
+          ComparatorOptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.ComparatorOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/ComparatorOptions");
+  }
+};
+
+// The portal class for org.rocksdb.AbstractCompactionFilterFactory
+class AbstractCompactionFilterFactoryJni
+    : public RocksDBNativeClass<
+          const ROCKSDB_NAMESPACE::CompactionFilterFactoryJniCallback*,
+          AbstractCompactionFilterFactoryJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.AbstractCompactionFilterFactory
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(
+        env, "org/rocksdb/AbstractCompactionFilterFactory");
+  }
+
+  /**
+   * Get the Java Method: AbstractCompactionFilterFactory#name
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getNameMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "name", "()Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractCompactionFilterFactory#createCompactionFilter
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getCreateCompactionFilterMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "createCompactionFilter", "(ZZ)J");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.AbstractTransactionNotifier
+class AbstractTransactionNotifierJni
+    : public RocksDBNativeClass<
+          const ROCKSDB_NAMESPACE::TransactionNotifierJniCallback*,
+          AbstractTransactionNotifierJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(
+        env, "org/rocksdb/AbstractTransactionNotifier");
+  }
+
+  // Get the java method `snapshotCreated`
+  // of org.rocksdb.AbstractTransactionNotifier.
+  static jmethodID getSnapshotCreatedMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "snapshotCreated", "(J)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.AbstractComparatorJniBridge
+class AbstractComparatorJniBridge : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.AbstractComparatorJniBridge
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/AbstractComparatorJniBridge");
+  }
+
+  /**
+   * Get the Java Method: Comparator#compareInternal
+   *
+   * @param env A pointer to the Java environment
+   * @param jclazz the AbstractComparatorJniBridge class
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getCompareInternalMethodId(JNIEnv* env, jclass jclazz) {
+    static jmethodID mid =
+        env->GetStaticMethodID(jclazz, "compareInternal",
+                               "(Lorg/rocksdb/AbstractComparator;Ljava/nio/"
+                               "ByteBuffer;ILjava/nio/ByteBuffer;I)I");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Comparator#findShortestSeparatorInternal
+   *
+   * @param env A pointer to the Java environment
+   * @param jclazz the AbstractComparatorJniBridge class
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getFindShortestSeparatorInternalMethodId(JNIEnv* env,
+                                                            jclass jclazz) {
+    static jmethodID mid =
+        env->GetStaticMethodID(jclazz, "findShortestSeparatorInternal",
+                               "(Lorg/rocksdb/AbstractComparator;Ljava/nio/"
+                               "ByteBuffer;ILjava/nio/ByteBuffer;I)I");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Comparator#findShortSuccessorInternal
+   *
+   * @param env A pointer to the Java environment
+   * @param jclazz the AbstractComparatorJniBridge class
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getFindShortSuccessorInternalMethodId(JNIEnv* env,
+                                                         jclass jclazz) {
+    static jmethodID mid = env->GetStaticMethodID(
+        jclazz, "findShortSuccessorInternal",
+        "(Lorg/rocksdb/AbstractComparator;Ljava/nio/ByteBuffer;I)I");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.AbstractComparator
+class AbstractComparatorJni
+    : public RocksDBNativeClass<const ROCKSDB_NAMESPACE::ComparatorJniCallback*,
+                                AbstractComparatorJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.AbstractComparator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractComparator");
+  }
+
+  /**
+   * Get the Java Method: Comparator#name
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getNameMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "name", "()Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.AbstractSlice
+class AbstractSliceJni
+    : public NativeRocksMutableObject<const ROCKSDB_NAMESPACE::Slice*,
+                                      AbstractSliceJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.AbstractSlice
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractSlice");
+  }
+};
+
+// The portal class for org.rocksdb.Slice
+class SliceJni
+    : public NativeRocksMutableObject<const ROCKSDB_NAMESPACE::Slice*,
+                                      AbstractSliceJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Slice
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Slice");
+  }
+
+  /**
+   * Constructs a Slice object
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java Slice object, or a nullptr if an
+   *     exception occurs
+   */
+  static jobject construct0(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "()V");
+    if (mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jobject jslice = env->NewObject(jclazz, mid);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jslice;
+  }
+};
+
+// The portal class for org.rocksdb.DirectSlice
+class DirectSliceJni
+    : public NativeRocksMutableObject<const ROCKSDB_NAMESPACE::Slice*,
+                                      AbstractSliceJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.DirectSlice
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/DirectSlice");
+  }
+
+  /**
+   * Constructs a DirectSlice object
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java DirectSlice object, or a nullptr if an
+   *     exception occurs
+   */
+  static jobject construct0(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "()V");
+    if (mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jobject jdirect_slice = env->NewObject(jclazz, mid);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jdirect_slice;
+  }
+};
+
+// The portal class for org.rocksdb.BackupInfo
+class BackupInfoJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.BackupInfo
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/BackupInfo");
+  }
+
+  /**
+   * Constructs a BackupInfo object
+   *
+   * @param env A pointer to the Java environment
+   * @param backup_id id of the backup
+   * @param timestamp timestamp of the backup
+   * @param size size of the backup
+   * @param number_files number of files related to the backup
+   * @param app_metadata application specific metadata
+   *
+   * @return A reference to a Java BackupInfo object, or a nullptr if an
+   *     exception occurs
+   */
+  static jobject construct0(JNIEnv* env, uint32_t backup_id, int64_t timestamp,
+                            uint64_t size, uint32_t number_files,
+                            const std::string& app_metadata) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(IJJILjava/lang/String;)V");
+    if (mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jstring japp_metadata = nullptr;
+    if (app_metadata != nullptr) {
+      japp_metadata = env->NewStringUTF(app_metadata.c_str());
+      if (japp_metadata == nullptr) {
+        // exception occurred creating java string
+        return nullptr;
+      }
+    }
+
+    jobject jbackup_info = env->NewObject(jclazz, mid, backup_id, timestamp,
+                                          size, number_files, japp_metadata);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(japp_metadata);
+      return nullptr;
+    }
+
+    return jbackup_info;
+  }
+};
+
+class BackupInfoListJni {
+ public:
+  /**
+   * Converts a C++ std::vector<BackupInfo> object to
+   * a Java ArrayList<org.rocksdb.BackupInfo> object
+   *
+   * @param env A pointer to the Java environment
+   * @param backup_infos A vector of BackupInfo
+   *
+   * @return Either a reference to a Java ArrayList object, or a nullptr
+   *     if an exception occurs
+   */
+  static jobject getBackupInfo(JNIEnv* env,
+                               std::vector<BackupInfo> backup_infos) {
+    jclass jarray_list_clazz =
+        ROCKSDB_NAMESPACE::ListJni::getArrayListClass(env);
+    if (jarray_list_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID cstr_mid =
+        ROCKSDB_NAMESPACE::ListJni::getArrayListConstructorMethodId(env);
+    if (cstr_mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jmethodID add_mid = ROCKSDB_NAMESPACE::ListJni::getListAddMethodId(env);
+    if (add_mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    // create java list
+    jobject jbackup_info_handle_list =
+        env->NewObject(jarray_list_clazz, cstr_mid, backup_infos.size());
+    if (env->ExceptionCheck()) {
+      // exception occurred constructing object
+      return nullptr;
+    }
+
+    // insert in java list
+    auto end = backup_infos.end();
+    for (auto it = backup_infos.begin(); it != end; ++it) {
+      auto backup_info = *it;
+
+      jobject obj = ROCKSDB_NAMESPACE::BackupInfoJni::construct0(
+          env, backup_info.backup_id, backup_info.timestamp, backup_info.size,
+          backup_info.number_files, backup_info.app_metadata);
+      if (env->ExceptionCheck()) {
+        // exception occurred constructing object
+        if (obj != nullptr) {
+          env->DeleteLocalRef(obj);
+        }
+        if (jbackup_info_handle_list != nullptr) {
+          env->DeleteLocalRef(jbackup_info_handle_list);
+        }
+        return nullptr;
+      }
+
+      jboolean rs =
+          env->CallBooleanMethod(jbackup_info_handle_list, add_mid, obj);
+      if (env->ExceptionCheck() || rs == JNI_FALSE) {
+        // exception occurred calling method, or could not add
+        if (obj != nullptr) {
+          env->DeleteLocalRef(obj);
+        }
+        if (jbackup_info_handle_list != nullptr) {
+          env->DeleteLocalRef(jbackup_info_handle_list);
+        }
+        return nullptr;
+      }
+    }
+
+    return jbackup_info_handle_list;
+  }
+};
+
+// The portal class for org.rocksdb.WBWIRocksIterator
+class WBWIRocksIteratorJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WBWIRocksIterator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator");
+  }
+
+  /**
+   * Get the Java Field: WBWIRocksIterator#entry
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Field ID or nullptr if the class or field id could not
+   *     be retrieved
+   */
+  static jfieldID getWriteEntryField(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jfieldID fid = env->GetFieldID(
+        jclazz, "entry", "Lorg/rocksdb/WBWIRocksIterator$WriteEntry;");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  /**
+   * Gets the value of the WBWIRocksIterator#entry
+   *
+   * @param env A pointer to the Java environment
+   * @param jwbwi_rocks_iterator A reference to a WBWIIterator
+   *
+   * @return A reference to a Java WBWIRocksIterator.WriteEntry object, or
+   *     a nullptr if an exception occurs
+   */
+  static jobject getWriteEntry(JNIEnv* env, jobject jwbwi_rocks_iterator) {
+    assert(jwbwi_rocks_iterator != nullptr);
+
+    jfieldID jwrite_entry_field = getWriteEntryField(env);
+    if (jwrite_entry_field == nullptr) {
+      // exception occurred accessing the field
+      return nullptr;
+    }
+
+    jobject jwe = env->GetObjectField(jwbwi_rocks_iterator, jwrite_entry_field);
+    assert(jwe != nullptr);
+    return jwe;
+  }
+};
+
+// The portal class for org.rocksdb.WBWIRocksIterator.WriteType
+class WriteTypeJni : public JavaClass {
+ public:
+  /**
+   * Get the PUT enum field value of WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject PUT(JNIEnv* env) { return getEnum(env, "PUT"); }
+
+  /**
+   * Get the MERGE enum field value of WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject MERGE(JNIEnv* env) { return getEnum(env, "MERGE"); }
+
+  /**
+   * Get the DELETE enum field value of WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject DELETE(JNIEnv* env) { return getEnum(env, "DELETE"); }
+
+  /**
+   * Get the LOG enum field value of WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject LOG(JNIEnv* env) { return getEnum(env, "LOG"); }
+
+  // Returns the equivalent org.rocksdb.WBWIRocksIterator.WriteType for the
+  // provided C++ ROCKSDB_NAMESPACE::WriteType enum
+  static jbyte toJavaWriteType(const ROCKSDB_NAMESPACE::WriteType& writeType) {
+    switch (writeType) {
+      case ROCKSDB_NAMESPACE::WriteType::kPutRecord:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::WriteType::kMergeRecord:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::WriteType::kDeleteRecord:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::WriteType::kSingleDeleteRecord:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::WriteType::kDeleteRangeRecord:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::WriteType::kLogDataRecord:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::WriteType::kXIDRecord:
+        return 0x6;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+ private:
+  /**
+   * Get the Java Class org.rocksdb.WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator$WriteType");
+  }
+
+  /**
+   * Get an enum field of org.rocksdb.WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   * @param name The name of the enum field
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject getEnum(JNIEnv* env, const char name[]) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jfieldID jfid = env->GetStaticFieldID(
+        jclazz, name, "Lorg/rocksdb/WBWIRocksIterator$WriteType;");
+    if (env->ExceptionCheck()) {
+      // exception occurred while getting field
+      return nullptr;
+    } else if (jfid == nullptr) {
+      return nullptr;
+    }
+
+    jobject jwrite_type = env->GetStaticObjectField(jclazz, jfid);
+    assert(jwrite_type != nullptr);
+    return jwrite_type;
+  }
+};
+
+// The portal class for org.rocksdb.WBWIRocksIterator.WriteEntry
+class WriteEntryJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WBWIRocksIterator.WriteEntry
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env,
+                                "org/rocksdb/WBWIRocksIterator$WriteEntry");
+  }
+};
+
+// The portal class for org.rocksdb.InfoLogLevel
+class InfoLogLevelJni : public JavaClass {
+ public:
+  /**
+   * Get the DEBUG_LEVEL enum field value of InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject DEBUG_LEVEL(JNIEnv* env) {
+    return getEnum(env, "DEBUG_LEVEL");
+  }
+
+  /**
+   * Get the INFO_LEVEL enum field value of InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject INFO_LEVEL(JNIEnv* env) { return getEnum(env, "INFO_LEVEL"); }
+
+  /**
+   * Get the WARN_LEVEL enum field value of InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject WARN_LEVEL(JNIEnv* env) { return getEnum(env, "WARN_LEVEL"); }
+
+  /**
+   * Get the ERROR_LEVEL enum field value of InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject ERROR_LEVEL(JNIEnv* env) {
+    return getEnum(env, "ERROR_LEVEL");
+  }
+
+  /**
+   * Get the FATAL_LEVEL enum field value of InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject FATAL_LEVEL(JNIEnv* env) {
+    return getEnum(env, "FATAL_LEVEL");
+  }
+
+  /**
+   * Get the HEADER_LEVEL enum field value of InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject HEADER_LEVEL(JNIEnv* env) {
+    return getEnum(env, "HEADER_LEVEL");
+  }
+
+ private:
+  /**
+   * Get the Java Class org.rocksdb.InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/InfoLogLevel");
+  }
+
+  /**
+   * Get an enum field of org.rocksdb.InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   * @param name The name of the enum field
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject getEnum(JNIEnv* env, const char name[]) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jfieldID jfid =
+        env->GetStaticFieldID(jclazz, name, "Lorg/rocksdb/InfoLogLevel;");
+    if (env->ExceptionCheck()) {
+      // exception occurred while getting field
+      return nullptr;
+    } else if (jfid == nullptr) {
+      return nullptr;
+    }
+
+    jobject jinfo_log_level = env->GetStaticObjectField(jclazz, jfid);
+    assert(jinfo_log_level != nullptr);
+    return jinfo_log_level;
+  }
+};
+
+// The portal class for org.rocksdb.Logger
+class LoggerJni
+    : public RocksDBNativeClass<
+          std::shared_ptr<ROCKSDB_NAMESPACE::LoggerJniCallback>*, LoggerJni> {
+ public:
+  /**
+   * Get the Java Class org/rocksdb/Logger
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Logger");
+  }
+
+  /**
+   * Get the Java Method: Logger#log
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getLogMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "log", "(Lorg/rocksdb/InfoLogLevel;Ljava/lang/String;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.TransactionLogIterator.BatchResult
+class BatchResultJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionLogIterator.BatchResult
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(
+        env, "org/rocksdb/TransactionLogIterator$BatchResult");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.TransactionLogIterator.BatchResult object
+   * with the same properties as the provided C++ ROCKSDB_NAMESPACE::BatchResult
+   * object
+   *
+   * @param env A pointer to the Java environment
+   * @param batch_result The ROCKSDB_NAMESPACE::BatchResult object
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.TransactionLogIterator.BatchResult object,
+   *     or nullptr if an an exception occurs
+   */
+  static jobject construct(JNIEnv* env,
+                           ROCKSDB_NAMESPACE::BatchResult& batch_result) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JJ)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jbatch_result = env->NewObject(jclazz, mid, batch_result.sequence,
+                                           batch_result.writeBatchPtr.get());
+    if (jbatch_result == nullptr) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      return nullptr;
+    }
+
+    batch_result.writeBatchPtr.release();
+    return jbatch_result;
+  }
+};
+
+// The portal class for org.rocksdb.BottommostLevelCompaction
+class BottommostLevelCompactionJni {
+ public:
+  // Returns the equivalent org.rocksdb.BottommostLevelCompaction for the
+  // provided C++ ROCKSDB_NAMESPACE::BottommostLevelCompaction enum
+  static jint toJavaBottommostLevelCompaction(
+      const ROCKSDB_NAMESPACE::BottommostLevelCompaction&
+          bottommost_level_compaction) {
+    switch (bottommost_level_compaction) {
+      case ROCKSDB_NAMESPACE::BottommostLevelCompaction::kSkip:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::BottommostLevelCompaction::
+          kIfHaveCompactionFilter:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::BottommostLevelCompaction::kForce:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::BottommostLevelCompaction::kForceOptimized:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::BottommostLevelCompaction
+  // enum for the provided Java org.rocksdb.BottommostLevelCompaction
+  static ROCKSDB_NAMESPACE::BottommostLevelCompaction
+  toCppBottommostLevelCompaction(jint bottommost_level_compaction) {
+    switch (bottommost_level_compaction) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::BottommostLevelCompaction::kSkip;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::BottommostLevelCompaction::
+            kIfHaveCompactionFilter;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::BottommostLevelCompaction::kForce;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::BottommostLevelCompaction::kForceOptimized;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::BottommostLevelCompaction::
+            kIfHaveCompactionFilter;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.CompactionStopStyle
+class CompactionStopStyleJni {
+ public:
+  // Returns the equivalent org.rocksdb.CompactionStopStyle for the provided
+  // C++ ROCKSDB_NAMESPACE::CompactionStopStyle enum
+  static jbyte toJavaCompactionStopStyle(
+      const ROCKSDB_NAMESPACE::CompactionStopStyle& compaction_stop_style) {
+    switch (compaction_stop_style) {
+      case ROCKSDB_NAMESPACE::CompactionStopStyle::
+          kCompactionStopStyleSimilarSize:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::CompactionStopStyle::
+          kCompactionStopStyleTotalSize:
+        return 0x1;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompactionStopStyle enum for
+  // the provided Java org.rocksdb.CompactionStopStyle
+  static ROCKSDB_NAMESPACE::CompactionStopStyle toCppCompactionStopStyle(
+      jbyte jcompaction_stop_style) {
+    switch (jcompaction_stop_style) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::CompactionStopStyle::
+            kCompactionStopStyleSimilarSize;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::CompactionStopStyle::
+            kCompactionStopStyleTotalSize;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::CompactionStopStyle::
+            kCompactionStopStyleSimilarSize;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.CompressionType
+class CompressionTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.CompressionType for the provided
+  // C++ ROCKSDB_NAMESPACE::CompressionType enum
+  static jbyte toJavaCompressionType(
+      const ROCKSDB_NAMESPACE::CompressionType& compression_type) {
+    switch (compression_type) {
+      case ROCKSDB_NAMESPACE::CompressionType::kNoCompression:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::CompressionType::kSnappyCompression:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::CompressionType::kZlibCompression:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::CompressionType::kBZip2Compression:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::CompressionType::kLZ4Compression:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::CompressionType::kLZ4HCCompression:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::CompressionType::kXpressCompression:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::CompressionType::kZSTD:
+        return 0x7;
+      case ROCKSDB_NAMESPACE::CompressionType::kDisableCompressionOption:
+      default:
+        return 0x7F;
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompressionType enum for the
+  // provided Java org.rocksdb.CompressionType
+  static ROCKSDB_NAMESPACE::CompressionType toCppCompressionType(
+      jbyte jcompression_type) {
+    switch (jcompression_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::CompressionType::kSnappyCompression;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::CompressionType::kZlibCompression;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::CompressionType::kBZip2Compression;
+      case 0x4:
+        return ROCKSDB_NAMESPACE::CompressionType::kLZ4Compression;
+      case 0x5:
+        return ROCKSDB_NAMESPACE::CompressionType::kLZ4HCCompression;
+      case 0x6:
+        return ROCKSDB_NAMESPACE::CompressionType::kXpressCompression;
+      case 0x7:
+        return ROCKSDB_NAMESPACE::CompressionType::kZSTD;
+      case 0x7F:
+      default:
+        return ROCKSDB_NAMESPACE::CompressionType::kDisableCompressionOption;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.CompactionPriority
+class CompactionPriorityJni {
+ public:
+  // Returns the equivalent org.rocksdb.CompactionPriority for the provided
+  // C++ ROCKSDB_NAMESPACE::CompactionPri enum
+  static jbyte toJavaCompactionPriority(
+      const ROCKSDB_NAMESPACE::CompactionPri& compaction_priority) {
+    switch (compaction_priority) {
+      case ROCKSDB_NAMESPACE::CompactionPri::kByCompensatedSize:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::CompactionPri::kOldestLargestSeqFirst:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::CompactionPri::kOldestSmallestSeqFirst:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::CompactionPri::kMinOverlappingRatio:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::CompactionPri::kRoundRobin:
+        return 0x4;
+      default:
+        return 0x0;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompactionPri enum for the
+  // provided Java org.rocksdb.CompactionPriority
+  static ROCKSDB_NAMESPACE::CompactionPri toCppCompactionPriority(
+      jbyte jcompaction_priority) {
+    switch (jcompaction_priority) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::CompactionPri::kByCompensatedSize;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::CompactionPri::kOldestLargestSeqFirst;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::CompactionPri::kOldestSmallestSeqFirst;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::CompactionPri::kMinOverlappingRatio;
+      case 0x4:
+        return ROCKSDB_NAMESPACE::CompactionPri::kRoundRobin;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::CompactionPri::kByCompensatedSize;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.AccessHint
+class AccessHintJni {
+ public:
+  // Returns the equivalent org.rocksdb.AccessHint for the provided
+  // C++ ROCKSDB_NAMESPACE::DBOptions::AccessHint enum
+  static jbyte toJavaAccessHint(
+      const ROCKSDB_NAMESPACE::DBOptions::AccessHint& access_hint) {
+    switch (access_hint) {
+      case ROCKSDB_NAMESPACE::DBOptions::AccessHint::NONE:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::DBOptions::AccessHint::SEQUENTIAL:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::DBOptions::AccessHint::WILLNEED:
+        return 0x3;
+      default:
+        // undefined/default
+        return 0x1;
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::DBOptions::AccessHint enum
+  // for the provided Java org.rocksdb.AccessHint
+  static ROCKSDB_NAMESPACE::DBOptions::AccessHint toCppAccessHint(
+      jbyte jaccess_hint) {
+    switch (jaccess_hint) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NONE;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::DBOptions::AccessHint::SEQUENTIAL;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::DBOptions::AccessHint::WILLNEED;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.WALRecoveryMode
+class WALRecoveryModeJni {
+ public:
+  // Returns the equivalent org.rocksdb.WALRecoveryMode for the provided
+  // C++ ROCKSDB_NAMESPACE::WALRecoveryMode enum
+  static jbyte toJavaWALRecoveryMode(
+      const ROCKSDB_NAMESPACE::WALRecoveryMode& wal_recovery_mode) {
+    switch (wal_recovery_mode) {
+      case ROCKSDB_NAMESPACE::WALRecoveryMode::kTolerateCorruptedTailRecords:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::WALRecoveryMode::kAbsoluteConsistency:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::WALRecoveryMode::kPointInTimeRecovery:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::WALRecoveryMode::kSkipAnyCorruptedRecords:
+        return 0x3;
+      default:
+        // undefined/default
+        return 0x2;
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::WALRecoveryMode enum for the
+  // provided Java org.rocksdb.WALRecoveryMode
+  static ROCKSDB_NAMESPACE::WALRecoveryMode toCppWALRecoveryMode(
+      jbyte jwal_recovery_mode) {
+    switch (jwal_recovery_mode) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::WALRecoveryMode::
+            kTolerateCorruptedTailRecords;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::WALRecoveryMode::kAbsoluteConsistency;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::WALRecoveryMode::kPointInTimeRecovery;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::WALRecoveryMode::kSkipAnyCorruptedRecords;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::WALRecoveryMode::kPointInTimeRecovery;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.TickerType
+class TickerTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.TickerType for the provided
+  // C++ ROCKSDB_NAMESPACE::Tickers enum
+  static jbyte toJavaTickerType(const ROCKSDB_NAMESPACE::Tickers& tickers) {
+    switch (tickers) {
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_MISS:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_HIT:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD_FAILURES:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_MISS:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_HIT:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_ADD:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT:
+        return 0x7;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_EVICT:
+        return 0x8;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_MISS:
+        return 0x9;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_HIT:
+        return 0xA;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_ADD:
+        return 0xB;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT:
+        return 0xC;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_EVICT:
+        return 0xD;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_MISS:
+        return 0xE;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_HIT:
+        return 0xF;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_ADD:
+        return 0x10;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT:
+        return 0x11;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_READ:
+        return 0x12;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_WRITE:
+        return 0x13;
+      case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_USEFUL:
+        return 0x14;
+      case ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_HIT:
+        return 0x15;
+      case ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_MISS:
+        return 0x16;
+      case ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_HIT:
+        return 0x17;
+      case ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_MISS:
+        return 0x18;
+      case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_HIT:
+        return 0x19;
+      case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_MISS:
+        return 0x1A;
+      case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L0:
+        return 0x1B;
+      case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L1:
+        return 0x1C;
+      case ROCKSDB_NAMESPACE::Tickers::GET_HIT_L2_AND_UP:
+        return 0x1D;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY:
+        return 0x1E;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_OBSOLETE:
+        return 0x1F;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_RANGE_DEL:
+        return 0x20;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_USER:
+        return 0x21;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE:
+        return 0x22;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_WRITTEN:
+        return 0x23;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_READ:
+        return 0x24;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_UPDATED:
+        return 0x25;
+      case ROCKSDB_NAMESPACE::Tickers::BYTES_WRITTEN:
+        return 0x26;
+      case ROCKSDB_NAMESPACE::Tickers::BYTES_READ:
+        return 0x27;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK:
+        return 0x28;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT:
+        return 0x29;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV:
+        return 0x2A;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK_FOUND:
+        return 0x2B;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT_FOUND:
+        return 0x2C;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV_FOUND:
+        return 0x2D;
+      case ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ:
+        return 0x2E;
+      case ROCKSDB_NAMESPACE::Tickers::NO_FILE_CLOSES:
+        return 0x2F;
+      case ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS:
+        return 0x30;
+      case ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS:
+        return 0x31;
+      case ROCKSDB_NAMESPACE::Tickers::STALL_L0_SLOWDOWN_MICROS:
+        return 0x32;
+      case ROCKSDB_NAMESPACE::Tickers::STALL_MEMTABLE_COMPACTION_MICROS:
+        return 0x33;
+      case ROCKSDB_NAMESPACE::Tickers::STALL_L0_NUM_FILES_MICROS:
+        return 0x34;
+      case ROCKSDB_NAMESPACE::Tickers::STALL_MICROS:
+        return 0x35;
+      case ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS:
+        return 0x36;
+      case ROCKSDB_NAMESPACE::Tickers::RATE_LIMIT_DELAY_MILLIS:
+        return 0x37;
+      case ROCKSDB_NAMESPACE::Tickers::NO_ITERATORS:
+        return 0x38;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS:
+        return 0x39;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ:
+        return 0x3A;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ:
+        return 0x3B;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_FILTERED_DELETES:
+        return 0x3C;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES:
+        return 0x3D;
+      case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_CHECKED:
+        return 0x3E;
+      case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_USEFUL:
+        return 0x3F;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION:
+        return 0x40;
+      case ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS:
+        return 0x41;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_MISS:
+        return 0x42;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_HIT:
+        return 0x43;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_ADD:
+        return 0x44;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_ADD_FAILURES:
+        return 0x45;
+      case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED:
+        return 0x46;
+      case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES:
+        return 0x47;
+      case ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_SELF:
+        return 0x48;
+      case ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER:
+        return 0x49;
+      case ROCKSDB_NAMESPACE::Tickers::WRITE_TIMEDOUT:
+        return 0x4A;
+      case ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL:
+        return 0x4B;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES:
+        return 0x4C;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES:
+        return 0x4D;
+      case ROCKSDB_NAMESPACE::Tickers::FLUSH_WRITE_BYTES:
+        return 0x4E;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES:
+        return 0x4F;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_ACQUIRES:
+        return 0x50;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_RELEASES:
+        return 0x51;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_CLEANUPS:
+        return 0x52;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSED:
+        return 0x53;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_DECOMPRESSED:
+        return 0x54;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_NOT_COMPRESSED:
+        return 0x55;
+      case ROCKSDB_NAMESPACE::Tickers::MERGE_OPERATION_TOTAL_TIME:
+        return 0x56;
+      case ROCKSDB_NAMESPACE::Tickers::FILTER_OPERATION_TOTAL_TIME:
+        return 0x57;
+      case ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_HIT:
+        return 0x58;
+      case ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_MISS:
+        return 0x59;
+      case ROCKSDB_NAMESPACE::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES:
+        return 0x5A;
+      case ROCKSDB_NAMESPACE::Tickers::READ_AMP_TOTAL_READ_BYTES:
+        return 0x5B;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_RATE_LIMITER_DRAINS:
+        return 0x5C;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_ITER_SKIP:
+        return 0x5D;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND:
+        return 0x5E;
+      case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED:
+        // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX).
+        return -0x01;
+      case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED:
+        return 0x60;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE:
+        return 0x61;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACTION_CANCELLED:
+        return 0x62;
+      case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_POSITIVE:
+        return 0x63;
+      case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE:
+        return 0x64;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PUT:
+        return 0x65;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_WRITE:
+        return 0x66;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_GET:
+        return 0x67;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_MULTIGET:
+        return 0x68;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_SEEK:
+        return 0x69;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_NEXT:
+        return 0x6A;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PREV:
+        return 0x6B;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_WRITTEN:
+        return 0x6C;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_READ:
+        return 0x6D;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_WRITTEN:
+        return 0x6E;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ:
+        return 0x6F;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED:
+        return 0x70;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL:
+        return 0x71;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB:
+        return 0x72;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB_TTL:
+        return 0x73;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN:
+        return 0x74;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ:
+        return 0x75;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_SYNCED:
+        return 0x76;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT:
+        return 0x77;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE:
+        return 0x78;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT:
+        return 0x79;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE:
+        return 0x7A;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_FILES:
+        return 0x7B;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_NEW_FILES:
+        return 0x7C;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES:
+        return 0x7D;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_OVERWRITTEN:
+        return 0x7E;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_EXPIRED:
+        return 0x7F;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED:
+        return -0x02;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_OVERWRITTEN:
+        return -0x03;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_EXPIRED:
+        return -0x04;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED:
+        return -0x05;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED:
+        return -0x06;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED:
+        return -0x07;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_BYTES_EVICTED:
+        return -0x08;
+      case ROCKSDB_NAMESPACE::Tickers::TXN_PREPARE_MUTEX_OVERHEAD:
+        return -0x09;
+      case ROCKSDB_NAMESPACE::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD:
+        return -0x0A;
+      case ROCKSDB_NAMESPACE::Tickers::TXN_DUPLICATE_KEY_OVERHEAD:
+        return -0x0B;
+      case ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD:
+        return -0x0C;
+      case ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN:
+        return -0x0D;
+      case ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH:
+        return -0x0E;
+      case ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY:
+        return -0X0F;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED:
+        return -0x10;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC:
+        return -0x11;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL:
+        return -0x12;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED:
+        return -0x13;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC:
+        return -0x14;
+      case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL:
+        return -0x15;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT:
+        return -0x16;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT:
+        return -0x17;
+      case ROCKSDB_NAMESPACE::Tickers::
+          ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT:
+        return -0x18;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT:
+        return -0x19;
+      case ROCKSDB_NAMESPACE::Tickers::
+          ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT:
+        return -0x1A;
+      case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT:
+        return -0x1B;
+      case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH:
+        return -0x1C;
+      case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH:
+        return -0x1D;
+      case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS:
+        return -0x1E;
+      case ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES:
+        return -0x1F;
+      case ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES:
+        return -0x20;
+      case ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES:
+        return -0x21;
+      case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES:
+        return -0x22;
+      case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES:
+        return -0x23;
+      case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES:
+        return -0x24;
+      case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES:
+        return -0x25;
+      case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES:
+        return -0x26;
+      case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT:
+        return -0x27;
+      case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT:
+        return -0x28;
+      case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT:
+        return -0x29;
+      case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES:
+        return -0x2A;
+      case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT:
+        return -0x2B;
+      case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES:
+        return -0x2C;
+      case ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT:
+        return -0x2D;
+      case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT:
+        return -0x2E;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_MISS:
+        return -0x2F;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_HIT:
+        return -0x30;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD:
+        return -0x31;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD_FAILURES:
+        return -0x32;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_READ:
+        return -0x33;
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_WRITE:
+        return -0x34;
+      case ROCKSDB_NAMESPACE::Tickers::READ_ASYNC_MICROS:
+        return -0x35;
+      case ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT:
+        return -0x36;
+      case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
+        // 0x5F was the max value in the initial copy of tickers to Java.
+        // Since these values are exposed directly to Java clients, we keep
+        // the value the same forever.
+        //
+        // TODO: This particular case seems confusing and unnecessary to pin the
+        // value since it's meant to be the number of tickers, not an actual
+        // ticker value. But we aren't yet in a position to fix it since the
+        // number of tickers doesn't fit in the Java representation (jbyte).
+        return 0x5F;
+      default:
+        // undefined/default
+        return 0x0;
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::Tickers enum for the
+  // provided Java org.rocksdb.TickerType
+  static ROCKSDB_NAMESPACE::Tickers toCppTickers(jbyte jticker_type) {
+    switch (jticker_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_MISS;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_HIT;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_ADD_FAILURES;
+      case 0x4:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_MISS;
+      case 0x5:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_HIT;
+      case 0x6:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_ADD;
+      case 0x7:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT;
+      case 0x8:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_EVICT;
+      case 0x9:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_MISS;
+      case 0xA:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_HIT;
+      case 0xB:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_ADD;
+      case 0xC:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT;
+      case 0xD:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_EVICT;
+      case 0xE:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_MISS;
+      case 0xF:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_HIT;
+      case 0x10:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_ADD;
+      case 0x11:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT;
+      case 0x12:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_READ;
+      case 0x13:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_BYTES_WRITE;
+      case 0x14:
+        return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_USEFUL;
+      case 0x15:
+        return ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_HIT;
+      case 0x16:
+        return ROCKSDB_NAMESPACE::Tickers::PERSISTENT_CACHE_MISS;
+      case 0x17:
+        return ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_HIT;
+      case 0x18:
+        return ROCKSDB_NAMESPACE::Tickers::SIM_BLOCK_CACHE_MISS;
+      case 0x19:
+        return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_HIT;
+      case 0x1A:
+        return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_MISS;
+      case 0x1B:
+        return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L0;
+      case 0x1C:
+        return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L1;
+      case 0x1D:
+        return ROCKSDB_NAMESPACE::Tickers::GET_HIT_L2_AND_UP;
+      case 0x1E:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY;
+      case 0x1F:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_OBSOLETE;
+      case 0x20:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_RANGE_DEL;
+      case 0x21:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACTION_KEY_DROP_USER;
+      case 0x22:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE;
+      case 0x23:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_WRITTEN;
+      case 0x24:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_READ;
+      case 0x25:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_KEYS_UPDATED;
+      case 0x26:
+        return ROCKSDB_NAMESPACE::Tickers::BYTES_WRITTEN;
+      case 0x27:
+        return ROCKSDB_NAMESPACE::Tickers::BYTES_READ;
+      case 0x28:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK;
+      case 0x29:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT;
+      case 0x2A:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV;
+      case 0x2B:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_SEEK_FOUND;
+      case 0x2C:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_NEXT_FOUND;
+      case 0x2D:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV_FOUND;
+      case 0x2E:
+        return ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ;
+      case 0x2F:
+        return ROCKSDB_NAMESPACE::Tickers::NO_FILE_CLOSES;
+      case 0x30:
+        return ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS;
+      case 0x31:
+        return ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS;
+      case 0x32:
+        return ROCKSDB_NAMESPACE::Tickers::STALL_L0_SLOWDOWN_MICROS;
+      case 0x33:
+        return ROCKSDB_NAMESPACE::Tickers::STALL_MEMTABLE_COMPACTION_MICROS;
+      case 0x34:
+        return ROCKSDB_NAMESPACE::Tickers::STALL_L0_NUM_FILES_MICROS;
+      case 0x35:
+        return ROCKSDB_NAMESPACE::Tickers::STALL_MICROS;
+      case 0x36:
+        return ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS;
+      case 0x37:
+        return ROCKSDB_NAMESPACE::Tickers::RATE_LIMIT_DELAY_MILLIS;
+      case 0x38:
+        return ROCKSDB_NAMESPACE::Tickers::NO_ITERATORS;
+      case 0x39:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS;
+      case 0x3A:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ;
+      case 0x3B:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ;
+      case 0x3C:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_FILTERED_DELETES;
+      case 0x3D:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES;
+      case 0x3E:
+        return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_CHECKED;
+      case 0x3F:
+        return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_USEFUL;
+      case 0x40:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION;
+      case 0x41:
+        return ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS;
+      case 0x42:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_MISS;
+      case 0x43:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_HIT;
+      case 0x44:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_ADD;
+      case 0x45:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_ADD_FAILURES;
+      case 0x46:
+        return ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED;
+      case 0x47:
+        return ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES;
+      case 0x48:
+        return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_SELF;
+      case 0x49:
+        return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER;
+      case 0x4A:
+        return ROCKSDB_NAMESPACE::Tickers::WRITE_TIMEDOUT;
+      case 0x4B:
+        return ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL;
+      case 0x4C:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES;
+      case 0x4D:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES;
+      case 0x4E:
+        return ROCKSDB_NAMESPACE::Tickers::FLUSH_WRITE_BYTES;
+      case 0x4F:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES;
+      case 0x50:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_ACQUIRES;
+      case 0x51:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_RELEASES;
+      case 0x52:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_SUPERVERSION_CLEANUPS;
+      case 0x53:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_COMPRESSED;
+      case 0x54:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_DECOMPRESSED;
+      case 0x55:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_BLOCK_NOT_COMPRESSED;
+      case 0x56:
+        return ROCKSDB_NAMESPACE::Tickers::MERGE_OPERATION_TOTAL_TIME;
+      case 0x57:
+        return ROCKSDB_NAMESPACE::Tickers::FILTER_OPERATION_TOTAL_TIME;
+      case 0x58:
+        return ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_HIT;
+      case 0x59:
+        return ROCKSDB_NAMESPACE::Tickers::ROW_CACHE_MISS;
+      case 0x5A:
+        return ROCKSDB_NAMESPACE::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES;
+      case 0x5B:
+        return ROCKSDB_NAMESPACE::Tickers::READ_AMP_TOTAL_READ_BYTES;
+      case 0x5C:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_RATE_LIMITER_DRAINS;
+      case 0x5D:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_ITER_SKIP;
+      case 0x5E:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND;
+      case -0x01:
+        // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX).
+        return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED;
+      case 0x60:
+        return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED;
+      case 0x61:
+        return ROCKSDB_NAMESPACE::Tickers::
+            COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE;
+      case 0x62:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACTION_CANCELLED;
+      case 0x63:
+        return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_POSITIVE;
+      case 0x64:
+        return ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE;
+      case 0x65:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PUT;
+      case 0x66:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_WRITE;
+      case 0x67:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_GET;
+      case 0x68:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_MULTIGET;
+      case 0x69:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_SEEK;
+      case 0x6A:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_NEXT;
+      case 0x6B:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_PREV;
+      case 0x6C:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_WRITTEN;
+      case 0x6D:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_NUM_KEYS_READ;
+      case 0x6E:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_WRITTEN;
+      case 0x6F:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ;
+      case 0x70:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED;
+      case 0x71:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL;
+      case 0x72:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB;
+      case 0x73:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB_TTL;
+      case 0x74:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN;
+      case 0x75:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ;
+      case 0x76:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_FILE_SYNCED;
+      case 0x77:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT;
+      case 0x78:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE;
+      case 0x79:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT;
+      case 0x7A:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE;
+      case 0x7B:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_FILES;
+      case 0x7C:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_NEW_FILES;
+      case 0x7D:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES;
+      case 0x7E:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_OVERWRITTEN;
+      case 0x7F:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_EXPIRED;
+      case -0x02:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED;
+      case -0x03:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_OVERWRITTEN;
+      case -0x04:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_EXPIRED;
+      case -0x05:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED;
+      case -0x06:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED;
+      case -0x07:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED;
+      case -0x08:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_BYTES_EVICTED;
+      case -0x09:
+        return ROCKSDB_NAMESPACE::Tickers::TXN_PREPARE_MUTEX_OVERHEAD;
+      case -0x0A:
+        return ROCKSDB_NAMESPACE::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD;
+      case -0x0B:
+        return ROCKSDB_NAMESPACE::Tickers::TXN_DUPLICATE_KEY_OVERHEAD;
+      case -0x0C:
+        return ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD;
+      case -0x0D:
+        return ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN;
+      case -0x0E:
+        return ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH;
+      case -0x0F:
+        return ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY;
+      case -0x10:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED;
+      case -0x11:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC;
+      case -0x12:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL;
+      case -0x13:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED;
+      case -0x14:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC;
+      case -0x15:
+        return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL;
+      case -0x16:
+        return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT;
+      case -0x17:
+        return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT;
+      case -0x18:
+        return ROCKSDB_NAMESPACE::Tickers::
+            ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT;
+      case -0x19:
+        return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT;
+      case -0x1A:
+        return ROCKSDB_NAMESPACE::Tickers::
+            ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT;
+      case -0x1B:
+        return ROCKSDB_NAMESPACE::Tickers::
+            ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT;
+      case -0x1C:
+        return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH;
+      case -0x1D:
+        return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+      case -0x1E:
+        return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS;
+      case -0x1F:
+        return ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES;
+      case -0x20:
+        return ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES;
+      case -0x21:
+        return ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES;
+      case -0x22:
+        return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES;
+      case -0x23:
+        return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES;
+      case -0x24:
+        return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES;
+      case -0x25:
+        return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES;
+      case -0x26:
+        return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES;
+      case -0x27:
+        return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT;
+      case -0x28:
+        return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT;
+      case -0x29:
+        return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT;
+      case -0x2A:
+        return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES;
+      case -0x2B:
+        return ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_COUNT;
+      case -0x2C:
+        return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_BYTES;
+      case -0x2D:
+        return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT;
+      case -0x2E:
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT;
+      case -0x2F:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_MISS;
+      case -0x30:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_HIT;
+      case -0x31:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD;
+      case -0x32:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_ADD_FAILURES;
+      case -0x33:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_READ;
+      case -0x34:
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_CACHE_BYTES_WRITE;
+      case -0x35:
+        return ROCKSDB_NAMESPACE::Tickers::READ_ASYNC_MICROS;
+      case -0x36:
+        return ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT;
+      case 0x5F:
+        // 0x5F was the max value in the initial copy of tickers to Java.
+        // Since these values are exposed directly to Java clients, we keep
+        // the value the same forever.
+        //
+        // TODO: This particular case seems confusing and unnecessary to pin the
+        // value since it's meant to be the number of tickers, not an actual
+        // ticker value. But we aren't yet in a position to fix it since the
+        // number of tickers doesn't fit in the Java representation (jbyte).
+        return ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX;
+
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_MISS;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.HistogramType
+class HistogramTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.HistogramType for the provided
+  // C++ ROCKSDB_NAMESPACE::Histograms enum
+  static jbyte toJavaHistogramsType(
+      const ROCKSDB_NAMESPACE::Histograms& histograms) {
+    switch (histograms) {
+      case ROCKSDB_NAMESPACE::Histograms::DB_GET:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::Histograms::DB_WRITE:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::Histograms::COMPACTION_TIME:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::Histograms::SUBCOMPACTION_SETUP_TIME:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::Histograms::TABLE_SYNC_MICROS:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::Histograms::COMPACTION_OUTFILE_SYNC_MICROS:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::Histograms::WAL_FILE_SYNC_MICROS:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::Histograms::MANIFEST_FILE_SYNC_MICROS:
+        return 0x7;
+      case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_IO_MICROS:
+        return 0x8;
+      case ROCKSDB_NAMESPACE::Histograms::DB_MULTIGET:
+        return 0x9;
+      case ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_COMPACTION_MICROS:
+        return 0xA;
+      case ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_GET_MICROS:
+        return 0xB;
+      case ROCKSDB_NAMESPACE::Histograms::WRITE_RAW_BLOCK_MICROS:
+        return 0xC;
+      case ROCKSDB_NAMESPACE::Histograms::STALL_L0_SLOWDOWN_COUNT:
+        return 0xD;
+      case ROCKSDB_NAMESPACE::Histograms::STALL_MEMTABLE_COMPACTION_COUNT:
+        return 0xE;
+      case ROCKSDB_NAMESPACE::Histograms::STALL_L0_NUM_FILES_COUNT:
+        return 0xF;
+      case ROCKSDB_NAMESPACE::Histograms::HARD_RATE_LIMIT_DELAY_COUNT:
+        return 0x10;
+      case ROCKSDB_NAMESPACE::Histograms::SOFT_RATE_LIMIT_DELAY_COUNT:
+        return 0x11;
+      case ROCKSDB_NAMESPACE::Histograms::NUM_FILES_IN_SINGLE_COMPACTION:
+        return 0x12;
+      case ROCKSDB_NAMESPACE::Histograms::DB_SEEK:
+        return 0x13;
+      case ROCKSDB_NAMESPACE::Histograms::WRITE_STALL:
+        return 0x14;
+      case ROCKSDB_NAMESPACE::Histograms::SST_READ_MICROS:
+        return 0x15;
+      case ROCKSDB_NAMESPACE::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED:
+        return 0x16;
+      case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_READ:
+        return 0x17;
+      case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_WRITE:
+        return 0x18;
+      case ROCKSDB_NAMESPACE::Histograms::BYTES_PER_MULTIGET:
+        return 0x19;
+      case ROCKSDB_NAMESPACE::Histograms::BYTES_COMPRESSED:
+        return 0x1A;
+      case ROCKSDB_NAMESPACE::Histograms::BYTES_DECOMPRESSED:
+        return 0x1B;
+      case ROCKSDB_NAMESPACE::Histograms::COMPRESSION_TIMES_NANOS:
+        return 0x1C;
+      case ROCKSDB_NAMESPACE::Histograms::DECOMPRESSION_TIMES_NANOS:
+        return 0x1D;
+      case ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS:
+        return 0x1E;
+      // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor
+      // version compatibility.
+      case ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME:
+        return 0x20;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_KEY_SIZE:
+        return 0x21;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_VALUE_SIZE:
+        return 0x22;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_WRITE_MICROS:
+        return 0x23;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GET_MICROS:
+        return 0x24;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_MULTIGET_MICROS:
+        return 0x25;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_SEEK_MICROS:
+        return 0x26;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_NEXT_MICROS:
+        return 0x27;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_PREV_MICROS:
+        return 0x28;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS:
+        return 0x29;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS:
+        return 0x2A;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS:
+        return 0x2B;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GC_MICROS:
+        return 0x2C;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS:
+        return 0x2D;
+      case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS:
+        return 0x2E;
+      case ROCKSDB_NAMESPACE::Histograms::
+          NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL:
+        return 0x2F;
+      case ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL:
+        return 0x30;
+      case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL:
+        return 0x31;
+      case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT:
+        return 0x32;
+      case ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES:
+        return 0x33;
+      case ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS:
+        return 0x34;
+      case ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED:
+        return 0x35;
+      case ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE:
+        return 0x36;
+      case NUM_LEVEL_READ_PER_MULTIGET:
+        return 0x37;
+      case ASYNC_PREFETCH_ABORT_MICROS:
+        return 0x38;
+      case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
+        // 0x1F for backwards compatibility on current minor version.
+        return 0x1F;
+
+      default:
+        // undefined/default
+        return 0x0;
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::Histograms enum for the
+  // provided Java org.rocksdb.HistogramsType
+  static ROCKSDB_NAMESPACE::Histograms toCppHistograms(jbyte jhistograms_type) {
+    switch (jhistograms_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::Histograms::DB_GET;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::Histograms::DB_WRITE;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::Histograms::COMPACTION_TIME;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::Histograms::SUBCOMPACTION_SETUP_TIME;
+      case 0x4:
+        return ROCKSDB_NAMESPACE::Histograms::TABLE_SYNC_MICROS;
+      case 0x5:
+        return ROCKSDB_NAMESPACE::Histograms::COMPACTION_OUTFILE_SYNC_MICROS;
+      case 0x6:
+        return ROCKSDB_NAMESPACE::Histograms::WAL_FILE_SYNC_MICROS;
+      case 0x7:
+        return ROCKSDB_NAMESPACE::Histograms::MANIFEST_FILE_SYNC_MICROS;
+      case 0x8:
+        return ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_IO_MICROS;
+      case 0x9:
+        return ROCKSDB_NAMESPACE::Histograms::DB_MULTIGET;
+      case 0xA:
+        return ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_COMPACTION_MICROS;
+      case 0xB:
+        return ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_GET_MICROS;
+      case 0xC:
+        return ROCKSDB_NAMESPACE::Histograms::WRITE_RAW_BLOCK_MICROS;
+      case 0xD:
+        return ROCKSDB_NAMESPACE::Histograms::STALL_L0_SLOWDOWN_COUNT;
+      case 0xE:
+        return ROCKSDB_NAMESPACE::Histograms::STALL_MEMTABLE_COMPACTION_COUNT;
+      case 0xF:
+        return ROCKSDB_NAMESPACE::Histograms::STALL_L0_NUM_FILES_COUNT;
+      case 0x10:
+        return ROCKSDB_NAMESPACE::Histograms::HARD_RATE_LIMIT_DELAY_COUNT;
+      case 0x11:
+        return ROCKSDB_NAMESPACE::Histograms::SOFT_RATE_LIMIT_DELAY_COUNT;
+      case 0x12:
+        return ROCKSDB_NAMESPACE::Histograms::NUM_FILES_IN_SINGLE_COMPACTION;
+      case 0x13:
+        return ROCKSDB_NAMESPACE::Histograms::DB_SEEK;
+      case 0x14:
+        return ROCKSDB_NAMESPACE::Histograms::WRITE_STALL;
+      case 0x15:
+        return ROCKSDB_NAMESPACE::Histograms::SST_READ_MICROS;
+      case 0x16:
+        return ROCKSDB_NAMESPACE::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED;
+      case 0x17:
+        return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_READ;
+      case 0x18:
+        return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_WRITE;
+      case 0x19:
+        return ROCKSDB_NAMESPACE::Histograms::BYTES_PER_MULTIGET;
+      case 0x1A:
+        return ROCKSDB_NAMESPACE::Histograms::BYTES_COMPRESSED;
+      case 0x1B:
+        return ROCKSDB_NAMESPACE::Histograms::BYTES_DECOMPRESSED;
+      case 0x1C:
+        return ROCKSDB_NAMESPACE::Histograms::COMPRESSION_TIMES_NANOS;
+      case 0x1D:
+        return ROCKSDB_NAMESPACE::Histograms::DECOMPRESSION_TIMES_NANOS;
+      case 0x1E:
+        return ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS;
+      // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor
+      // version compatibility.
+      case 0x20:
+        return ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME;
+      case 0x21:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_KEY_SIZE;
+      case 0x22:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_VALUE_SIZE;
+      case 0x23:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_WRITE_MICROS;
+      case 0x24:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GET_MICROS;
+      case 0x25:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_MULTIGET_MICROS;
+      case 0x26:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_SEEK_MICROS;
+      case 0x27:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_NEXT_MICROS;
+      case 0x28:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_PREV_MICROS;
+      case 0x29:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS;
+      case 0x2A:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS;
+      case 0x2B:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS;
+      case 0x2C:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GC_MICROS;
+      case 0x2D:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS;
+      case 0x2E:
+        return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS;
+      case 0x2F:
+        return ROCKSDB_NAMESPACE::Histograms::
+            NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL;
+      case 0x30:
+        return ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL;
+      case 0x31:
+        return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL;
+      case 0x32:
+        return ROCKSDB_NAMESPACE::Histograms::
+            ERROR_HANDLER_AUTORESUME_RETRY_COUNT;
+      case 0x33:
+        return ROCKSDB_NAMESPACE::Histograms::ASYNC_READ_BYTES;
+      case 0x34:
+        return ROCKSDB_NAMESPACE::Histograms::POLL_WAIT_MICROS;
+      case 0x35:
+        return ROCKSDB_NAMESPACE::Histograms::PREFETCHED_BYTES_DISCARDED;
+      case 0x36:
+        return ROCKSDB_NAMESPACE::Histograms::MULTIGET_IO_BATCH_SIZE;
+      case 0x37:
+        return ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET;
+      case 0x38:
+        return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS;
+      case 0x1F:
+        // 0x1F for backwards compatibility on current minor version.
+        return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;
+
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::Histograms::DB_GET;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.StatsLevel
+class StatsLevelJni {
+ public:
+  // Returns the equivalent org.rocksdb.StatsLevel for the provided
+  // C++ ROCKSDB_NAMESPACE::StatsLevel enum
+  static jbyte toJavaStatsLevel(
+      const ROCKSDB_NAMESPACE::StatsLevel& stats_level) {
+    switch (stats_level) {
+      case ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::StatsLevel::kExceptTimeForMutex:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::StatsLevel::kAll:
+        return 0x2;
+
+      default:
+        // undefined/default
+        return 0x0;
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::StatsLevel enum for the
+  // provided Java org.rocksdb.StatsLevel
+  static ROCKSDB_NAMESPACE::StatsLevel toCppStatsLevel(jbyte jstats_level) {
+    switch (jstats_level) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::StatsLevel::kExceptTimeForMutex;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::StatsLevel::kAll;
+
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.RateLimiterMode
+class RateLimiterModeJni {
+ public:
+  // Returns the equivalent org.rocksdb.RateLimiterMode for the provided
+  // C++ ROCKSDB_NAMESPACE::RateLimiter::Mode enum
+  static jbyte toJavaRateLimiterMode(
+      const ROCKSDB_NAMESPACE::RateLimiter::Mode& rate_limiter_mode) {
+    switch (rate_limiter_mode) {
+      case ROCKSDB_NAMESPACE::RateLimiter::Mode::kReadsOnly:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::RateLimiter::Mode::kWritesOnly:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::RateLimiter::Mode::kAllIo:
+        return 0x2;
+
+      default:
+        // undefined/default
+        return 0x1;
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::RateLimiter::Mode enum for
+  // the provided Java org.rocksdb.RateLimiterMode
+  static ROCKSDB_NAMESPACE::RateLimiter::Mode toCppRateLimiterMode(
+      jbyte jrate_limiter_mode) {
+    switch (jrate_limiter_mode) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::RateLimiter::Mode::kReadsOnly;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::RateLimiter::Mode::kWritesOnly;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::RateLimiter::Mode::kAllIo;
+
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::RateLimiter::Mode::kWritesOnly;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.MemoryUsageType
+class MemoryUsageTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.MemoryUsageType for the provided
+  // C++ ROCKSDB_NAMESPACE::MemoryUtil::UsageType enum
+  static jbyte toJavaMemoryUsageType(
+      const ROCKSDB_NAMESPACE::MemoryUtil::UsageType& usage_type) {
+    switch (usage_type) {
+      case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableTotal:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableUnFlushed:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kTableReadersTotal:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kCacheTotal:
+        return 0x3;
+      default:
+        // undefined: use kNumUsageTypes
+        return 0x4;
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::MemoryUtil::UsageType enum
+  // for the provided Java org.rocksdb.MemoryUsageType
+  static ROCKSDB_NAMESPACE::MemoryUtil::UsageType toCppMemoryUsageType(
+      jbyte usage_type) {
+    switch (usage_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableTotal;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableUnFlushed;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kTableReadersTotal;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kCacheTotal;
+      default:
+        // undefined/default: use kNumUsageTypes
+        return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kNumUsageTypes;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.Transaction
+class TransactionJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Transaction
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/Transaction");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.Transaction.WaitingTransactions object
+   *
+   * @param env A pointer to the Java environment
+   * @param jtransaction A Java org.rocksdb.Transaction object
+   * @param column_family_id The id of the column family
+   * @param key The key
+   * @param transaction_ids The transaction ids
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.Transaction.WaitingTransactions object,
+   *     or nullptr if an an exception occurs
+   */
+  static jobject newWaitingTransactions(
+      JNIEnv* env, jobject jtransaction, const uint32_t column_family_id,
+      const std::string& key,
+      const std::vector<TransactionID>& transaction_ids) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "newWaitingTransactions",
+        "(JLjava/lang/String;[J)Lorg/rocksdb/Transaction$WaitingTransactions;");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jstring jkey = env->NewStringUTF(key.c_str());
+    if (jkey == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    const size_t len = transaction_ids.size();
+    jlongArray jtransaction_ids = env->NewLongArray(static_cast<jsize>(len));
+    if (jtransaction_ids == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jkey);
+      return nullptr;
+    }
+
+    jboolean is_copy;
+    jlong* body = env->GetLongArrayElements(jtransaction_ids, &is_copy);
+    if (body == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jkey);
+      env->DeleteLocalRef(jtransaction_ids);
+      return nullptr;
+    }
+    for (size_t i = 0; i < len; ++i) {
+      body[i] = static_cast<jlong>(transaction_ids[i]);
+    }
+    env->ReleaseLongArrayElements(jtransaction_ids, body,
+                                  is_copy == JNI_TRUE ? 0 : JNI_ABORT);
+
+    jobject jwaiting_transactions = env->CallObjectMethod(
+        jtransaction, mid, static_cast<jlong>(column_family_id), jkey,
+        jtransaction_ids);
+    if (env->ExceptionCheck()) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      env->DeleteLocalRef(jkey);
+      env->DeleteLocalRef(jtransaction_ids);
+      return nullptr;
+    }
+
+    return jwaiting_transactions;
+  }
+};
+
+// The portal class for org.rocksdb.TransactionDB
+class TransactionDBJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TransactionDB");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.TransactionDB.DeadlockInfo object
+   *
+   * @param env A pointer to the Java environment
+   * @param jtransaction A Java org.rocksdb.Transaction object
+   * @param column_family_id The id of the column family
+   * @param key The key
+   * @param transaction_ids The transaction ids
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.Transaction.WaitingTransactions object,
+   *     or nullptr if an an exception occurs
+   */
+  static jobject newDeadlockInfo(
+      JNIEnv* env, jobject jtransaction_db,
+      const ROCKSDB_NAMESPACE::TransactionID transaction_id,
+      const uint32_t column_family_id, const std::string& waiting_key,
+      const bool exclusive) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "newDeadlockInfo",
+        "(JJLjava/lang/String;Z)Lorg/rocksdb/TransactionDB$DeadlockInfo;");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jstring jwaiting_key = env->NewStringUTF(waiting_key.c_str());
+    if (jwaiting_key == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    // resolve the column family id to a ColumnFamilyHandle
+    jobject jdeadlock_info = env->CallObjectMethod(
+        jtransaction_db, mid, transaction_id,
+        static_cast<jlong>(column_family_id), jwaiting_key, exclusive);
+    if (env->ExceptionCheck()) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      env->DeleteLocalRef(jwaiting_key);
+      return nullptr;
+    }
+
+    return jdeadlock_info;
+  }
+};
+
+// The portal class for org.rocksdb.TxnDBWritePolicy
+class TxnDBWritePolicyJni {
+ public:
+  // Returns the equivalent org.rocksdb.TxnDBWritePolicy for the provided
+  // C++ ROCKSDB_NAMESPACE::TxnDBWritePolicy enum
+  static jbyte toJavaTxnDBWritePolicy(
+      const ROCKSDB_NAMESPACE::TxnDBWritePolicy& txndb_write_policy) {
+    switch (txndb_write_policy) {
+      case ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_COMMITTED:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_PREPARED:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_UNPREPARED:
+        return 0x2;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::TxnDBWritePolicy enum for the
+  // provided Java org.rocksdb.TxnDBWritePolicy
+  static ROCKSDB_NAMESPACE::TxnDBWritePolicy toCppTxnDBWritePolicy(
+      jbyte jtxndb_write_policy) {
+    switch (jtxndb_write_policy) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_COMMITTED;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_PREPARED;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_UNPREPARED;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::TxnDBWritePolicy::WRITE_COMMITTED;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.TransactionDB.KeyLockInfo
+class KeyLockInfoJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB.KeyLockInfo
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$KeyLockInfo");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.TransactionDB.KeyLockInfo object
+   * with the same properties as the provided C++ ROCKSDB_NAMESPACE::KeyLockInfo
+   * object
+   *
+   * @param env A pointer to the Java environment
+   * @param key_lock_info The ROCKSDB_NAMESPACE::KeyLockInfo object
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.TransactionDB.KeyLockInfo object,
+   *     or nullptr if an an exception occurs
+   */
+  static jobject construct(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::KeyLockInfo& key_lock_info) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;[JZ)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jstring jkey = env->NewStringUTF(key_lock_info.key.c_str());
+    if (jkey == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    const jsize jtransaction_ids_len =
+        static_cast<jsize>(key_lock_info.ids.size());
+    jlongArray jtransactions_ids = env->NewLongArray(jtransaction_ids_len);
+    if (jtransactions_ids == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jkey);
+      return nullptr;
+    }
+
+    const jobject jkey_lock_info = env->NewObject(
+        jclazz, mid, jkey, jtransactions_ids, key_lock_info.exclusive);
+    if (jkey_lock_info == nullptr) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      env->DeleteLocalRef(jtransactions_ids);
+      env->DeleteLocalRef(jkey);
+      return nullptr;
+    }
+
+    return jkey_lock_info;
+  }
+};
+
+// The portal class for org.rocksdb.TransactionDB.DeadlockInfo
+class DeadlockInfoJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB.DeadlockInfo
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$DeadlockInfo");
+  }
+};
+
+// The portal class for org.rocksdb.TransactionDB.DeadlockPath
+class DeadlockPathJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionDB.DeadlockPath
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$DeadlockPath");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.TransactionDB.DeadlockPath object
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.TransactionDB.DeadlockPath object,
+   *     or nullptr if an an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const jobjectArray jdeadlock_infos,
+                           const bool limit_exceeded) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "([LDeadlockInfo;Z)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    const jobject jdeadlock_path =
+        env->NewObject(jclazz, mid, jdeadlock_infos, limit_exceeded);
+    if (jdeadlock_path == nullptr) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      return nullptr;
+    }
+
+    return jdeadlock_path;
+  }
+};
+
+class AbstractTableFilterJni
+    : public RocksDBNativeClass<
+          const ROCKSDB_NAMESPACE::TableFilterJniCallback*,
+          AbstractTableFilterJni> {
+ public:
+  /**
+   * Get the Java Method: TableFilter#filter(TableProperties)
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getFilterMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "filter", "(Lorg/rocksdb/TableProperties;)Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+ private:
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TableFilter");
+  }
+};
+
+class TablePropertiesJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.TableProperties object.
+   *
+   * @param env A pointer to the Java environment
+   * @param table_properties A Cpp table properties object
+   *
+   * @return A reference to a Java org.rocksdb.TableProperties object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject fromCppTableProperties(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::TableProperties& table_properties) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>",
+        "(JJJJJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/"
+        "lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/"
+        "String;Ljava/util/Map;Ljava/util/Map;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jbyteArray jcolumn_family_name = ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+        env, table_properties.column_family_name);
+    if (jcolumn_family_name == nullptr) {
+      // exception occurred creating java string
+      return nullptr;
+    }
+
+    jstring jfilter_policy_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.filter_policy_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      return nullptr;
+    }
+
+    jstring jcomparator_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.comparator_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      return nullptr;
+    }
+
+    jstring jmerge_operator_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.merge_operator_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      return nullptr;
+    }
+
+    jstring jprefix_extractor_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.prefix_extractor_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      return nullptr;
+    }
+
+    jstring jproperty_collectors_names =
+        ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+            env, &table_properties.property_collectors_names, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      env->DeleteLocalRef(jprefix_extractor_name);
+      return nullptr;
+    }
+
+    jstring jcompression_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &table_properties.compression_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      env->DeleteLocalRef(jprefix_extractor_name);
+      env->DeleteLocalRef(jproperty_collectors_names);
+      return nullptr;
+    }
+
+    // Map<String, String>
+    jobject juser_collected_properties =
+        ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(
+            env, &table_properties.user_collected_properties);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java map
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      env->DeleteLocalRef(jprefix_extractor_name);
+      env->DeleteLocalRef(jproperty_collectors_names);
+      env->DeleteLocalRef(jcompression_name);
+      return nullptr;
+    }
+
+    // Map<String, String>
+    jobject jreadable_properties = ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(
+        env, &table_properties.readable_properties);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java map
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfilter_policy_name);
+      env->DeleteLocalRef(jcomparator_name);
+      env->DeleteLocalRef(jmerge_operator_name);
+      env->DeleteLocalRef(jprefix_extractor_name);
+      env->DeleteLocalRef(jproperty_collectors_names);
+      env->DeleteLocalRef(jcompression_name);
+      env->DeleteLocalRef(juser_collected_properties);
+      return nullptr;
+    }
+
+    jobject jtable_properties = env->NewObject(
+        jclazz, mid, static_cast<jlong>(table_properties.data_size),
+        static_cast<jlong>(table_properties.index_size),
+        static_cast<jlong>(table_properties.index_partitions),
+        static_cast<jlong>(table_properties.top_level_index_size),
+        static_cast<jlong>(table_properties.index_key_is_user_key),
+        static_cast<jlong>(table_properties.index_value_is_delta_encoded),
+        static_cast<jlong>(table_properties.filter_size),
+        static_cast<jlong>(table_properties.raw_key_size),
+        static_cast<jlong>(table_properties.raw_value_size),
+        static_cast<jlong>(table_properties.num_data_blocks),
+        static_cast<jlong>(table_properties.num_entries),
+        static_cast<jlong>(table_properties.num_deletions),
+        static_cast<jlong>(table_properties.num_merge_operands),
+        static_cast<jlong>(table_properties.num_range_deletions),
+        static_cast<jlong>(table_properties.format_version),
+        static_cast<jlong>(table_properties.fixed_key_len),
+        static_cast<jlong>(table_properties.column_family_id),
+        static_cast<jlong>(table_properties.creation_time),
+        static_cast<jlong>(table_properties.oldest_key_time),
+        static_cast<jlong>(
+            table_properties.slow_compression_estimated_data_size),
+        static_cast<jlong>(
+            table_properties.fast_compression_estimated_data_size),
+        static_cast<jlong>(
+            table_properties.external_sst_file_global_seqno_offset),
+        jcolumn_family_name, jfilter_policy_name, jcomparator_name,
+        jmerge_operator_name, jprefix_extractor_name,
+        jproperty_collectors_names, jcompression_name,
+        juser_collected_properties, jreadable_properties);
+
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jtable_properties;
+  }
+
+ private:
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TableProperties");
+  }
+};
+
+class ColumnFamilyDescriptorJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.ColumnFamilyDescriptor
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/ColumnFamilyDescriptor");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.ColumnFamilyDescriptor object with the same
+   * properties as the provided C++ ROCKSDB_NAMESPACE::ColumnFamilyDescriptor
+   * object
+   *
+   * @param env A pointer to the Java environment
+   * @param cfd A pointer to ROCKSDB_NAMESPACE::ColumnFamilyDescriptor object
+   *
+   * @return A reference to a Java org.rocksdb.ColumnFamilyDescriptor object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject construct(JNIEnv* env, ColumnFamilyDescriptor* cfd) {
+    jbyteArray jcf_name = JniUtil::copyBytes(env, cfd->name);
+    jobject cfopts = ColumnFamilyOptionsJni::construct(env, &(cfd->options));
+
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>",
+                                     "([BLorg/rocksdb/ColumnFamilyOptions;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+
+    jobject jcfd = env->NewObject(jclazz, mid, jcf_name, cfopts);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+
+    return jcfd;
+  }
+
+  /**
+   * Get the Java Method: ColumnFamilyDescriptor#columnFamilyName
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getColumnFamilyNameMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "columnFamilyName", "()[B");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: ColumnFamilyDescriptor#columnFamilyOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "columnFamilyOptions", "()Lorg/rocksdb/ColumnFamilyOptions;");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.IndexType
+class IndexTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.IndexType for the provided
+  // C++ ROCKSDB_NAMESPACE::IndexType enum
+  static jbyte toJavaIndexType(
+      const ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType& index_type) {
+    switch (index_type) {
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::kBinarySearch:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::kHashSearch:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+          kTwoLevelIndexSearch:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+          kBinarySearchWithFirstKey:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::IndexType enum for the
+  // provided Java org.rocksdb.IndexType
+  static ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType toCppIndexType(
+      jbyte jindex_type) {
+    switch (jindex_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kBinarySearch;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kHashSearch;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kTwoLevelIndexSearch;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kBinarySearchWithFirstKey;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexType::
+            kBinarySearch;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.DataBlockIndexType
+class DataBlockIndexTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.DataBlockIndexType for the provided
+  // C++ ROCKSDB_NAMESPACE::DataBlockIndexType enum
+  static jbyte toJavaDataBlockIndexType(
+      const ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType&
+          index_type) {
+    switch (index_type) {
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType::
+          kDataBlockBinarySearch:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType::
+          kDataBlockBinaryAndHash:
+        return 0x1;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::DataBlockIndexType enum for
+  // the provided Java org.rocksdb.DataBlockIndexType
+  static ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType
+  toCppDataBlockIndexType(jbyte jindex_type) {
+    switch (jindex_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType::
+            kDataBlockBinarySearch;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType::
+            kDataBlockBinaryAndHash;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::DataBlockIndexType::
+            kDataBlockBinarySearch;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.ChecksumType
+class ChecksumTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.ChecksumType for the provided
+  // C++ ROCKSDB_NAMESPACE::ChecksumType enum
+  static jbyte toJavaChecksumType(
+      const ROCKSDB_NAMESPACE::ChecksumType& checksum_type) {
+    switch (checksum_type) {
+      case ROCKSDB_NAMESPACE::ChecksumType::kNoChecksum:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ChecksumType::kCRC32c:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::ChecksumType::kxxHash:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::ChecksumType::kxxHash64:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::ChecksumType::kXXH3:
+        return 0x4;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::ChecksumType enum for the
+  // provided Java org.rocksdb.ChecksumType
+  static ROCKSDB_NAMESPACE::ChecksumType toCppChecksumType(
+      jbyte jchecksum_type) {
+    switch (jchecksum_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::ChecksumType::kNoChecksum;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::ChecksumType::kCRC32c;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::ChecksumType::kxxHash;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::ChecksumType::kxxHash64;
+      case 0x4:
+        return ROCKSDB_NAMESPACE::ChecksumType::kXXH3;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ChecksumType::kCRC32c;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.IndexShorteningMode
+class IndexShorteningModeJni {
+ public:
+  // Returns the equivalent org.rocksdb.IndexShorteningMode for the provided
+  // C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum
+  static jbyte toJavaIndexShorteningMode(
+      const ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode&
+          index_shortening_mode) {
+    switch (index_shortening_mode) {
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+          kNoShortening:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+          kShortenSeparators:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+          kShortenSeparatorsAndSuccessor:
+        return 0x2;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum for
+  // the provided Java org.rocksdb.IndexShorteningMode
+  static ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode
+  toCppIndexShorteningMode(jbyte jindex_shortening_mode) {
+    switch (jindex_shortening_mode) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+            kNoShortening;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+            kShortenSeparators;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+            kShortenSeparatorsAndSuccessor;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode::
+            kShortenSeparators;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.Priority
+class PriorityJni {
+ public:
+  // Returns the equivalent org.rocksdb.Priority for the provided
+  // C++ ROCKSDB_NAMESPACE::Env::Priority enum
+  static jbyte toJavaPriority(
+      const ROCKSDB_NAMESPACE::Env::Priority& priority) {
+    switch (priority) {
+      case ROCKSDB_NAMESPACE::Env::Priority::BOTTOM:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::Env::Priority::LOW:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::Env::Priority::HIGH:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::Env::Priority::TOTAL:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::env::Priority enum for the
+  // provided Java org.rocksdb.Priority
+  static ROCKSDB_NAMESPACE::Env::Priority toCppPriority(jbyte jpriority) {
+    switch (jpriority) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::Env::Priority::BOTTOM;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::Env::Priority::LOW;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::Env::Priority::HIGH;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::Env::Priority::TOTAL;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::Env::Priority::LOW;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.ThreadType
+class ThreadTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.ThreadType for the provided
+  // C++ ROCKSDB_NAMESPACE::ThreadStatus::ThreadType enum
+  static jbyte toJavaThreadType(
+      const ROCKSDB_NAMESPACE::ThreadStatus::ThreadType& thread_type) {
+    switch (thread_type) {
+      case ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::HIGH_PRIORITY:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::LOW_PRIORITY:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::USER:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::BOTTOM_PRIORITY:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::ThreadStatus::ThreadType enum
+  // for the provided Java org.rocksdb.ThreadType
+  static ROCKSDB_NAMESPACE::ThreadStatus::ThreadType toCppThreadType(
+      jbyte jthread_type) {
+    switch (jthread_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::HIGH_PRIORITY;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::LOW_PRIORITY;
+      case 0x2:
+        return ThreadStatus::ThreadType::USER;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::BOTTOM_PRIORITY;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ThreadStatus::ThreadType::LOW_PRIORITY;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.OperationType
+class OperationTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.OperationType for the provided
+  // C++ ROCKSDB_NAMESPACE::ThreadStatus::OperationType enum
+  static jbyte toJavaOperationType(
+      const ROCKSDB_NAMESPACE::ThreadStatus::OperationType& operation_type) {
+    switch (operation_type) {
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_UNKNOWN:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_COMPACTION:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH:
+        return 0x2;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::ThreadStatus::OperationType
+  // enum for the provided Java org.rocksdb.OperationType
+  static ROCKSDB_NAMESPACE::ThreadStatus::OperationType toCppOperationType(
+      jbyte joperation_type) {
+    switch (joperation_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_UNKNOWN;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_COMPACTION;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_UNKNOWN;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.OperationStage
+class OperationStageJni {
+ public:
+  // Returns the equivalent org.rocksdb.OperationStage for the provided
+  // C++ ROCKSDB_NAMESPACE::ThreadStatus::OperationStage enum
+  static jbyte toJavaOperationStage(
+      const ROCKSDB_NAMESPACE::ThreadStatus::OperationStage& operation_stage) {
+    switch (operation_stage) {
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_UNKNOWN:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_FLUSH_RUN:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+          STAGE_FLUSH_WRITE_L0:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+          STAGE_COMPACTION_PREPARE:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+          STAGE_COMPACTION_RUN:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+          STAGE_COMPACTION_PROCESS_KV:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+          STAGE_COMPACTION_INSTALL:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+          STAGE_COMPACTION_SYNC_FILE:
+        return 0x7;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+          STAGE_PICK_MEMTABLES_TO_FLUSH:
+        return 0x8;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+          STAGE_MEMTABLE_ROLLBACK:
+        return 0x9;
+      case ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+          STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS:
+        return 0xA;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::ThreadStatus::OperationStage
+  // enum for the provided Java org.rocksdb.OperationStage
+  static ROCKSDB_NAMESPACE::ThreadStatus::OperationStage toCppOperationStage(
+      jbyte joperation_stage) {
+    switch (joperation_stage) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_UNKNOWN;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_FLUSH_RUN;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_FLUSH_WRITE_L0;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_COMPACTION_PREPARE;
+      case 0x4:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_COMPACTION_RUN;
+      case 0x5:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_COMPACTION_PROCESS_KV;
+      case 0x6:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_COMPACTION_INSTALL;
+      case 0x7:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_COMPACTION_SYNC_FILE;
+      case 0x8:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_PICK_MEMTABLES_TO_FLUSH;
+      case 0x9:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_MEMTABLE_ROLLBACK;
+      case 0xA:
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::
+            STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ThreadStatus::OperationStage::STAGE_UNKNOWN;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.StateType
+class StateTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.StateType for the provided
+  // C++ ROCKSDB_NAMESPACE::ThreadStatus::StateType enum
+  static jbyte toJavaStateType(
+      const ROCKSDB_NAMESPACE::ThreadStatus::StateType& state_type) {
+    switch (state_type) {
+      case ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_UNKNOWN:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_MUTEX_WAIT:
+        return 0x1;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::ThreadStatus::StateType enum
+  // for the provided Java org.rocksdb.StateType
+  static ROCKSDB_NAMESPACE::ThreadStatus::StateType toCppStateType(
+      jbyte jstate_type) {
+    switch (jstate_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_UNKNOWN;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_MUTEX_WAIT;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ThreadStatus::StateType::STATE_UNKNOWN;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.ThreadStatus
+class ThreadStatusJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.ThreadStatus
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/ThreadStatus");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.ThreadStatus object with the same
+   * properties as the provided C++ ROCKSDB_NAMESPACE::ThreadStatus object
+   *
+   * @param env A pointer to the Java environment
+   * @param thread_status A pointer to ROCKSDB_NAMESPACE::ThreadStatus object
+   *
+   * @return A reference to a Java org.rocksdb.ColumnFamilyOptions object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject construct(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::ThreadStatus* thread_status) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "(JBLjava/lang/String;Ljava/lang/String;BJB[JB)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jstring jdb_name =
+        JniUtil::toJavaString(env, &(thread_status->db_name), true);
+    if (env->ExceptionCheck()) {
+      // an error occurred
+      return nullptr;
+    }
+
+    jstring jcf_name =
+        JniUtil::toJavaString(env, &(thread_status->cf_name), true);
+    if (env->ExceptionCheck()) {
+      // an error occurred
+      env->DeleteLocalRef(jdb_name);
+      return nullptr;
+    }
+
+    // long[]
+    const jsize len = static_cast<jsize>(
+        ROCKSDB_NAMESPACE::ThreadStatus::kNumOperationProperties);
+    jlongArray joperation_properties = env->NewLongArray(len);
+    if (joperation_properties == nullptr) {
+      // an exception occurred
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+    jboolean is_copy;
+    jlong* body = env->GetLongArrayElements(joperation_properties, &is_copy);
+    if (body == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      env->DeleteLocalRef(joperation_properties);
+      return nullptr;
+    }
+    for (size_t i = 0; i < len; ++i) {
+      body[i] = static_cast<jlong>(thread_status->op_properties[i]);
+    }
+    env->ReleaseLongArrayElements(joperation_properties, body,
+                                  is_copy == JNI_TRUE ? 0 : JNI_ABORT);
+
+    jobject jcfd = env->NewObject(
+        jclazz, mid, static_cast<jlong>(thread_status->thread_id),
+        ThreadTypeJni::toJavaThreadType(thread_status->thread_type), jdb_name,
+        jcf_name,
+        OperationTypeJni::toJavaOperationType(thread_status->operation_type),
+        static_cast<jlong>(thread_status->op_elapsed_micros),
+        OperationStageJni::toJavaOperationStage(thread_status->operation_stage),
+        joperation_properties,
+        StateTypeJni::toJavaStateType(thread_status->state_type));
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      env->DeleteLocalRef(joperation_properties);
+      return nullptr;
+    }
+
+    // cleanup
+    env->DeleteLocalRef(jdb_name);
+    env->DeleteLocalRef(jcf_name);
+    env->DeleteLocalRef(joperation_properties);
+
+    return jcfd;
+  }
+};
+
+// The portal class for org.rocksdb.CompactionStyle
+class CompactionStyleJni {
+ public:
+  // Returns the equivalent org.rocksdb.CompactionStyle for the provided
+  // C++ ROCKSDB_NAMESPACE::CompactionStyle enum
+  static jbyte toJavaCompactionStyle(
+      const ROCKSDB_NAMESPACE::CompactionStyle& compaction_style) {
+    switch (compaction_style) {
+      case ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleLevel:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleUniversal:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleFIFO:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompactionStyle enum for the
+  // provided Java org.rocksdb.CompactionStyle
+  static ROCKSDB_NAMESPACE::CompactionStyle toCppCompactionStyle(
+      jbyte jcompaction_style) {
+    switch (jcompaction_style) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleLevel;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleUniversal;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleFIFO;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleLevel;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.CompactionReason
+class CompactionReasonJni {
+ public:
+  // Returns the equivalent org.rocksdb.CompactionReason for the provided
+  // C++ ROCKSDB_NAMESPACE::CompactionReason enum
+  static jbyte toJavaCompactionReason(
+      const ROCKSDB_NAMESPACE::CompactionReason& compaction_reason) {
+    switch (compaction_reason) {
+      case ROCKSDB_NAMESPACE::CompactionReason::kUnknown:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::CompactionReason::kLevelL0FilesNum:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::CompactionReason::kLevelMaxLevelSize:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::CompactionReason::kUniversalSizeAmplification:
+        return 0x3;
+      case ROCKSDB_NAMESPACE::CompactionReason::kUniversalSizeRatio:
+        return 0x4;
+      case ROCKSDB_NAMESPACE::CompactionReason::kUniversalSortedRunNum:
+        return 0x5;
+      case ROCKSDB_NAMESPACE::CompactionReason::kFIFOMaxSize:
+        return 0x6;
+      case ROCKSDB_NAMESPACE::CompactionReason::kFIFOReduceNumFiles:
+        return 0x7;
+      case ROCKSDB_NAMESPACE::CompactionReason::kFIFOTtl:
+        return 0x8;
+      case ROCKSDB_NAMESPACE::CompactionReason::kManualCompaction:
+        return 0x9;
+      case ROCKSDB_NAMESPACE::CompactionReason::kFilesMarkedForCompaction:
+        return 0x10;
+      case ROCKSDB_NAMESPACE::CompactionReason::kBottommostFiles:
+        return 0x0A;
+      case ROCKSDB_NAMESPACE::CompactionReason::kTtl:
+        return 0x0B;
+      case ROCKSDB_NAMESPACE::CompactionReason::kFlush:
+        return 0x0C;
+      case ROCKSDB_NAMESPACE::CompactionReason::kExternalSstIngestion:
+        return 0x0D;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::CompactionReason enum for the
+  // provided Java org.rocksdb.CompactionReason
+  static ROCKSDB_NAMESPACE::CompactionReason toCppCompactionReason(
+      jbyte jcompaction_reason) {
+    switch (jcompaction_reason) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::CompactionReason::kUnknown;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::CompactionReason::kLevelL0FilesNum;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::CompactionReason::kLevelMaxLevelSize;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::CompactionReason::kUniversalSizeAmplification;
+      case 0x4:
+        return ROCKSDB_NAMESPACE::CompactionReason::kUniversalSizeRatio;
+      case 0x5:
+        return ROCKSDB_NAMESPACE::CompactionReason::kUniversalSortedRunNum;
+      case 0x6:
+        return ROCKSDB_NAMESPACE::CompactionReason::kFIFOMaxSize;
+      case 0x7:
+        return ROCKSDB_NAMESPACE::CompactionReason::kFIFOReduceNumFiles;
+      case 0x8:
+        return ROCKSDB_NAMESPACE::CompactionReason::kFIFOTtl;
+      case 0x9:
+        return ROCKSDB_NAMESPACE::CompactionReason::kManualCompaction;
+      case 0x10:
+        return ROCKSDB_NAMESPACE::CompactionReason::kFilesMarkedForCompaction;
+      case 0x0A:
+        return ROCKSDB_NAMESPACE::CompactionReason::kBottommostFiles;
+      case 0x0B:
+        return ROCKSDB_NAMESPACE::CompactionReason::kTtl;
+      case 0x0C:
+        return ROCKSDB_NAMESPACE::CompactionReason::kFlush;
+      case 0x0D:
+        return ROCKSDB_NAMESPACE::CompactionReason::kExternalSstIngestion;
+      case 0x0E:
+        return ROCKSDB_NAMESPACE::CompactionReason::kPeriodicCompaction;
+      case 0x0F:
+        return ROCKSDB_NAMESPACE::CompactionReason::kChangeTemperature;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::CompactionReason::kUnknown;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.WalFileType
+class WalFileTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.WalFileType for the provided
+  // C++ ROCKSDB_NAMESPACE::WalFileType enum
+  static jbyte toJavaWalFileType(
+      const ROCKSDB_NAMESPACE::WalFileType& wal_file_type) {
+    switch (wal_file_type) {
+      case ROCKSDB_NAMESPACE::WalFileType::kArchivedLogFile:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::WalFileType::kAliveLogFile:
+        return 0x1;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::WalFileType enum for the
+  // provided Java org.rocksdb.WalFileType
+  static ROCKSDB_NAMESPACE::WalFileType toCppWalFileType(jbyte jwal_file_type) {
+    switch (jwal_file_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::WalFileType::kArchivedLogFile;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::WalFileType::kAliveLogFile;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::WalFileType::kAliveLogFile;
+    }
+  }
+};
+
+class LogFileJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.LogFile object.
+   *
+   * @param env A pointer to the Java environment
+   * @param log_file A Cpp log file object
+   *
+   * @return A reference to a Java org.rocksdb.LogFile object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject fromCppLogFile(JNIEnv* env,
+                                ROCKSDB_NAMESPACE::LogFile* log_file) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;JBJJ)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    std::string path_name = log_file->PathName();
+    jstring jpath_name =
+        ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &path_name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      return nullptr;
+    }
+
+    jobject jlog_file = env->NewObject(
+        jclazz, mid, jpath_name, static_cast<jlong>(log_file->LogNumber()),
+        ROCKSDB_NAMESPACE::WalFileTypeJni::toJavaWalFileType(log_file->Type()),
+        static_cast<jlong>(log_file->StartSequence()),
+        static_cast<jlong>(log_file->SizeFileBytes()));
+
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jpath_name);
+      return nullptr;
+    }
+
+    // cleanup
+    env->DeleteLocalRef(jpath_name);
+
+    return jlog_file;
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/LogFile");
+  }
+};
+
+class LiveFileMetaDataJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.LiveFileMetaData object.
+   *
+   * @param env A pointer to the Java environment
+   * @param live_file_meta_data A Cpp live file meta data object
+   *
+   * @return A reference to a Java org.rocksdb.LiveFileMetaData object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject fromCppLiveFileMetaData(
+      JNIEnv* env, ROCKSDB_NAMESPACE::LiveFileMetaData* live_file_meta_data) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>",
+        "([BILjava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jbyteArray jcolumn_family_name = ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+        env, live_file_meta_data->column_family_name);
+    if (jcolumn_family_name == nullptr) {
+      // exception occurred creating java byte array
+      return nullptr;
+    }
+
+    jstring jfile_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &live_file_meta_data->name, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      return nullptr;
+    }
+
+    jstring jpath = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &live_file_meta_data->db_path, true);
+    if (env->ExceptionCheck()) {
+      // exception occurred creating java string
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfile_name);
+      return nullptr;
+    }
+
+    jbyteArray jsmallest_key = ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+        env, live_file_meta_data->smallestkey);
+    if (jsmallest_key == nullptr) {
+      // exception occurred creating java byte array
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfile_name);
+      env->DeleteLocalRef(jpath);
+      return nullptr;
+    }
+
+    jbyteArray jlargest_key = ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+        env, live_file_meta_data->largestkey);
+    if (jlargest_key == nullptr) {
+      // exception occurred creating java byte array
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfile_name);
+      env->DeleteLocalRef(jpath);
+      env->DeleteLocalRef(jsmallest_key);
+      return nullptr;
+    }
+
+    jobject jlive_file_meta_data = env->NewObject(
+        jclazz, mid, jcolumn_family_name,
+        static_cast<jint>(live_file_meta_data->level), jfile_name, jpath,
+        static_cast<jlong>(live_file_meta_data->size),
+        static_cast<jlong>(live_file_meta_data->smallest_seqno),
+        static_cast<jlong>(live_file_meta_data->largest_seqno), jsmallest_key,
+        jlargest_key,
+        static_cast<jlong>(live_file_meta_data->num_reads_sampled),
+        static_cast<jboolean>(live_file_meta_data->being_compacted),
+        static_cast<jlong>(live_file_meta_data->num_entries),
+        static_cast<jlong>(live_file_meta_data->num_deletions));
+
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jcolumn_family_name);
+      env->DeleteLocalRef(jfile_name);
+      env->DeleteLocalRef(jpath);
+      env->DeleteLocalRef(jsmallest_key);
+      env->DeleteLocalRef(jlargest_key);
+      return nullptr;
+    }
+
+    // cleanup
+    env->DeleteLocalRef(jcolumn_family_name);
+    env->DeleteLocalRef(jfile_name);
+    env->DeleteLocalRef(jpath);
+    env->DeleteLocalRef(jsmallest_key);
+    env->DeleteLocalRef(jlargest_key);
+
+    return jlive_file_meta_data;
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/LiveFileMetaData");
+  }
+};
+
+class SstFileMetaDataJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.SstFileMetaData object.
+   *
+   * @param env A pointer to the Java environment
+   * @param sst_file_meta_data A Cpp sst file meta data object
+   *
+   * @return A reference to a Java org.rocksdb.SstFileMetaData object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject fromCppSstFileMetaData(
+      JNIEnv* env,
+      const ROCKSDB_NAMESPACE::SstFileMetaData* sst_file_meta_data) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "(Ljava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jstring jfile_name = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &sst_file_meta_data->name, true);
+    if (jfile_name == nullptr) {
+      // exception occurred creating java byte array
+      return nullptr;
+    }
+
+    jstring jpath = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+        env, &sst_file_meta_data->db_path, true);
+    if (jpath == nullptr) {
+      // exception occurred creating java byte array
+      env->DeleteLocalRef(jfile_name);
+      return nullptr;
+    }
+
+    jbyteArray jsmallest_key = ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+        env, sst_file_meta_data->smallestkey);
+    if (jsmallest_key == nullptr) {
+      // exception occurred creating java byte array
+      env->DeleteLocalRef(jfile_name);
+      env->DeleteLocalRef(jpath);
+      return nullptr;
+    }
+
+    jbyteArray jlargest_key = ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+        env, sst_file_meta_data->largestkey);
+    if (jlargest_key == nullptr) {
+      // exception occurred creating java byte array
+      env->DeleteLocalRef(jfile_name);
+      env->DeleteLocalRef(jpath);
+      env->DeleteLocalRef(jsmallest_key);
+      return nullptr;
+    }
+
+    jobject jsst_file_meta_data = env->NewObject(
+        jclazz, mid, jfile_name, jpath,
+        static_cast<jlong>(sst_file_meta_data->size),
+        static_cast<jint>(sst_file_meta_data->smallest_seqno),
+        static_cast<jlong>(sst_file_meta_data->largest_seqno), jsmallest_key,
+        jlargest_key, static_cast<jlong>(sst_file_meta_data->num_reads_sampled),
+        static_cast<jboolean>(sst_file_meta_data->being_compacted),
+        static_cast<jlong>(sst_file_meta_data->num_entries),
+        static_cast<jlong>(sst_file_meta_data->num_deletions));
+
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jfile_name);
+      env->DeleteLocalRef(jpath);
+      env->DeleteLocalRef(jsmallest_key);
+      env->DeleteLocalRef(jlargest_key);
+      return nullptr;
+    }
+
+    // cleanup
+    env->DeleteLocalRef(jfile_name);
+    env->DeleteLocalRef(jpath);
+    env->DeleteLocalRef(jsmallest_key);
+    env->DeleteLocalRef(jlargest_key);
+
+    return jsst_file_meta_data;
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/SstFileMetaData");
+  }
+};
+
+class LevelMetaDataJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.LevelMetaData object.
+   *
+   * @param env A pointer to the Java environment
+   * @param level_meta_data A Cpp level meta data object
+   *
+   * @return A reference to a Java org.rocksdb.LevelMetaData object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject fromCppLevelMetaData(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::LevelMetaData* level_meta_data) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>",
+                                     "(IJ[Lorg/rocksdb/SstFileMetaData;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    const jsize jlen = static_cast<jsize>(level_meta_data->files.size());
+    jobjectArray jfiles =
+        env->NewObjectArray(jlen, SstFileMetaDataJni::getJClass(env), nullptr);
+    if (jfiles == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    jsize i = 0;
+    for (auto it = level_meta_data->files.begin();
+         it != level_meta_data->files.end(); ++it) {
+      jobject jfile = SstFileMetaDataJni::fromCppSstFileMetaData(env, &(*it));
+      if (jfile == nullptr) {
+        // exception occurred
+        env->DeleteLocalRef(jfiles);
+        return nullptr;
+      }
+      env->SetObjectArrayElement(jfiles, i++, jfile);
+    }
+
+    jobject jlevel_meta_data =
+        env->NewObject(jclazz, mid, static_cast<jint>(level_meta_data->level),
+                       static_cast<jlong>(level_meta_data->size), jfiles);
+
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jfiles);
+      return nullptr;
+    }
+
+    // cleanup
+    env->DeleteLocalRef(jfiles);
+
+    return jlevel_meta_data;
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/LevelMetaData");
+  }
+};
+
+class ColumnFamilyMetaDataJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.ColumnFamilyMetaData object.
+   *
+   * @param env A pointer to the Java environment
+   * @param column_famly_meta_data A Cpp live file meta data object
+   *
+   * @return A reference to a Java org.rocksdb.ColumnFamilyMetaData object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject fromCppColumnFamilyMetaData(
+      JNIEnv* env,
+      const ROCKSDB_NAMESPACE::ColumnFamilyMetaData* column_famly_meta_data) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(jclazz, "<init>",
+                                     "(JJ[B[Lorg/rocksdb/LevelMetaData;)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jbyteArray jname = ROCKSDB_NAMESPACE::JniUtil::copyBytes(
+        env, column_famly_meta_data->name);
+    if (jname == nullptr) {
+      // exception occurred creating java byte array
+      return nullptr;
+    }
+
+    const jsize jlen =
+        static_cast<jsize>(column_famly_meta_data->levels.size());
+    jobjectArray jlevels =
+        env->NewObjectArray(jlen, LevelMetaDataJni::getJClass(env), nullptr);
+    if (jlevels == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jname);
+      return nullptr;
+    }
+
+    jsize i = 0;
+    for (auto it = column_famly_meta_data->levels.begin();
+         it != column_famly_meta_data->levels.end(); ++it) {
+      jobject jlevel = LevelMetaDataJni::fromCppLevelMetaData(env, &(*it));
+      if (jlevel == nullptr) {
+        // exception occurred
+        env->DeleteLocalRef(jname);
+        env->DeleteLocalRef(jlevels);
+        return nullptr;
+      }
+      env->SetObjectArrayElement(jlevels, i++, jlevel);
+    }
+
+    jobject jcolumn_family_meta_data = env->NewObject(
+        jclazz, mid, static_cast<jlong>(column_famly_meta_data->size),
+        static_cast<jlong>(column_famly_meta_data->file_count), jname, jlevels);
+
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jname);
+      env->DeleteLocalRef(jlevels);
+      return nullptr;
+    }
+
+    // cleanup
+    env->DeleteLocalRef(jname);
+    env->DeleteLocalRef(jlevels);
+
+    return jcolumn_family_meta_data;
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/ColumnFamilyMetaData");
+  }
+};
+
+// The portal class for org.rocksdb.AbstractTraceWriter
+class AbstractTraceWriterJni
+    : public RocksDBNativeClass<
+          const ROCKSDB_NAMESPACE::TraceWriterJniCallback*,
+          AbstractTraceWriterJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.AbstractTraceWriter
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+                                         "org/rocksdb/AbstractTraceWriter");
+  }
+
+  /**
+   * Get the Java Method: AbstractTraceWriter#write
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getWriteProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "writeProxy", "(J)S");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractTraceWriter#closeWriter
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getCloseWriterProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "closeWriterProxy", "()S");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractTraceWriter#getFileSize
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getGetFileSizeMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "getFileSize", "()J");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.AbstractWalFilter
+class AbstractWalFilterJni
+    : public RocksDBNativeClass<const ROCKSDB_NAMESPACE::WalFilterJniCallback*,
+                                AbstractWalFilterJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.AbstractWalFilter
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractWalFilter");
+  }
+
+  /**
+   * Get the Java Method: AbstractWalFilter#columnFamilyLogNumberMap
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getColumnFamilyLogNumberMapMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "columnFamilyLogNumberMap",
+                         "(Ljava/util/Map;Ljava/util/Map;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractTraceWriter#logRecordFoundProxy
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getLogRecordFoundProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "logRecordFoundProxy",
+                                            "(JLjava/lang/String;JJ)S");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractTraceWriter#name
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getNameMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "name", "()Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.WalProcessingOption
+class WalProcessingOptionJni {
+ public:
+  // Returns the equivalent org.rocksdb.WalProcessingOption for the provided
+  // C++ ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption enum
+  static jbyte toJavaWalProcessingOption(
+      const ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption&
+          wal_processing_option) {
+    switch (wal_processing_option) {
+      case ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::
+          kContinueProcessing:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::
+          kIgnoreCurrentRecord:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::kStopReplay:
+        return 0x2;
+      case ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::kCorruptedRecord:
+        return 0x3;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++
+  // ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption enum for the provided
+  // Java org.rocksdb.WalProcessingOption
+  static ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption
+  toCppWalProcessingOption(jbyte jwal_processing_option) {
+    switch (jwal_processing_option) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::
+            kContinueProcessing;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::
+            kIgnoreCurrentRecord;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::kStopReplay;
+      case 0x3:
+        return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::
+            kCorruptedRecord;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::WalFilter::WalProcessingOption::
+            kCorruptedRecord;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.ReusedSynchronisationType
+class ReusedSynchronisationTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.ReusedSynchronisationType for the
+  // provided C++ ROCKSDB_NAMESPACE::ReusedSynchronisationType enum
+  static jbyte toJavaReusedSynchronisationType(
+      const ROCKSDB_NAMESPACE::ReusedSynchronisationType&
+          reused_synchronisation_type) {
+    switch (reused_synchronisation_type) {
+      case ROCKSDB_NAMESPACE::ReusedSynchronisationType::MUTEX:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ReusedSynchronisationType::ADAPTIVE_MUTEX:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::ReusedSynchronisationType::THREAD_LOCAL:
+        return 0x2;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::ReusedSynchronisationType
+  // enum for the provided Java org.rocksdb.ReusedSynchronisationType
+  static ROCKSDB_NAMESPACE::ReusedSynchronisationType
+  toCppReusedSynchronisationType(jbyte reused_synchronisation_type) {
+    switch (reused_synchronisation_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::ReusedSynchronisationType::MUTEX;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::ReusedSynchronisationType::ADAPTIVE_MUTEX;
+      case 0x2:
+        return ROCKSDB_NAMESPACE::ReusedSynchronisationType::THREAD_LOCAL;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ReusedSynchronisationType::ADAPTIVE_MUTEX;
+    }
+  }
+};
+// The portal class for org.rocksdb.SanityLevel
+class SanityLevelJni {
+ public:
+  // Returns the equivalent org.rocksdb.SanityLevel for the provided
+  // C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel enum
+  static jbyte toJavaSanityLevel(
+      const ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel& sanity_level) {
+    switch (sanity_level) {
+      case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel::kSanityLevelNone:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel::
+          kSanityLevelLooselyCompatible:
+        return 0x1;
+      case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel::
+          kSanityLevelExactMatch:
+        return -0x01;
+      default:
+        return -0x01;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel
+  // enum for the provided Java org.rocksdb.SanityLevel
+  static ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel toCppSanityLevel(
+      jbyte sanity_level) {
+    switch (sanity_level) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelNone;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelLooselyCompatible;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelExactMatch;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.PrepopulateBlobCache
+class PrepopulateBlobCacheJni {
+ public:
+  // Returns the equivalent org.rocksdb.PrepopulateBlobCache for the provided
+  // C++ ROCKSDB_NAMESPACE::PrepopulateBlobCache enum
+  static jbyte toJavaPrepopulateBlobCache(
+      ROCKSDB_NAMESPACE::PrepopulateBlobCache prepopulate_blob_cache) {
+    switch (prepopulate_blob_cache) {
+      case ROCKSDB_NAMESPACE::PrepopulateBlobCache::kDisable:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::PrepopulateBlobCache::kFlushOnly:
+        return 0x1;
+      default:
+        return 0x7f;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::PrepopulateBlobCache enum for
+  // the provided Java org.rocksdb.PrepopulateBlobCache
+  static ROCKSDB_NAMESPACE::PrepopulateBlobCache toCppPrepopulateBlobCache(
+      jbyte jprepopulate_blob_cache) {
+    switch (jprepopulate_blob_cache) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::PrepopulateBlobCache::kDisable;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::PrepopulateBlobCache::kFlushOnly;
+      case 0x7F:
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::PrepopulateBlobCache::kDisable;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.AbstractListener.EnabledEventCallback
+class EnabledEventCallbackJni {
+ public:
+  // Returns the set of equivalent C++
+  // ROCKSDB_NAMESPACE::EnabledEventCallbackJni::EnabledEventCallback enums for
+  // the provided Java jenabled_event_callback_values
+  static std::set<EnabledEventCallback> toCppEnabledEventCallbacks(
+      jlong jenabled_event_callback_values) {
+    std::set<EnabledEventCallback> enabled_event_callbacks;
+    for (size_t i = 0; i < EnabledEventCallback::NUM_ENABLED_EVENT_CALLBACK;
+         ++i) {
+      if (((1ULL << i) & jenabled_event_callback_values) > 0) {
+        enabled_event_callbacks.emplace(static_cast<EnabledEventCallback>(i));
+      }
+    }
+    return enabled_event_callbacks;
+  }
+};
+
+// The portal class for org.rocksdb.AbstractEventListener
+class AbstractEventListenerJni
+    : public RocksDBNativeClass<
+          const ROCKSDB_NAMESPACE::EventListenerJniCallback*,
+          AbstractEventListenerJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.AbstractEventListener
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+                                         "org/rocksdb/AbstractEventListener");
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onFlushCompletedProxy
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnFlushCompletedProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(jclazz, "onFlushCompletedProxy",
+                                            "(JLorg/rocksdb/FlushJobInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onFlushBeginProxy
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnFlushBeginProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(jclazz, "onFlushBeginProxy",
+                                            "(JLorg/rocksdb/FlushJobInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onTableFileDeleted
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnTableFileDeletedMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "onTableFileDeleted", "(Lorg/rocksdb/TableFileDeletionInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onCompactionBeginProxy
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnCompactionBeginProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "onCompactionBeginProxy",
+                         "(JLorg/rocksdb/CompactionJobInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onCompactionCompletedProxy
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnCompactionCompletedProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "onCompactionCompletedProxy",
+                         "(JLorg/rocksdb/CompactionJobInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onTableFileCreated
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnTableFileCreatedMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "onTableFileCreated", "(Lorg/rocksdb/TableFileCreationInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onTableFileCreationStarted
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnTableFileCreationStartedMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "onTableFileCreationStarted",
+                         "(Lorg/rocksdb/TableFileCreationBriefInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onMemTableSealed
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnMemTableSealedMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(jclazz, "onMemTableSealed",
+                                            "(Lorg/rocksdb/MemTableInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method:
+   * AbstractEventListener#onColumnFamilyHandleDeletionStarted
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnColumnFamilyHandleDeletionStartedMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "onColumnFamilyHandleDeletionStarted",
+                         "(Lorg/rocksdb/ColumnFamilyHandle;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onExternalFileIngestedProxy
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnExternalFileIngestedProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "onExternalFileIngestedProxy",
+                         "(JLorg/rocksdb/ExternalFileIngestionInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onBackgroundError
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnBackgroundErrorProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(jclazz, "onBackgroundErrorProxy",
+                                            "(BLorg/rocksdb/Status;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onStallConditionsChanged
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnStallConditionsChangedMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(jclazz, "onStallConditionsChanged",
+                                            "(Lorg/rocksdb/WriteStallInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onFileReadFinish
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnFileReadFinishMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "onFileReadFinish", "(Lorg/rocksdb/FileOperationInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onFileWriteFinish
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnFileWriteFinishMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "onFileWriteFinish", "(Lorg/rocksdb/FileOperationInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onFileFlushFinish
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnFileFlushFinishMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "onFileFlushFinish", "(Lorg/rocksdb/FileOperationInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onFileSyncFinish
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnFileSyncFinishMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "onFileSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onFileRangeSyncFinish
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnFileRangeSyncFinishMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "onFileRangeSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onFileTruncateFinish
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnFileTruncateFinishMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "onFileTruncateFinish", "(Lorg/rocksdb/FileOperationInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onFileCloseFinish
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnFileCloseFinishMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "onFileCloseFinish", "(Lorg/rocksdb/FileOperationInfo;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#shouldBeNotifiedOnFileIO
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getShouldBeNotifiedOnFileIOMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "shouldBeNotifiedOnFileIO", "()Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onErrorRecoveryBeginProxy
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnErrorRecoveryBeginProxyMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryBeginProxy",
+                                            "(BLorg/rocksdb/Status;)Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: AbstractEventListener#onErrorRecoveryCompleted
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID
+   */
+  static jmethodID getOnErrorRecoveryCompletedMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryCompleted",
+                                            "(Lorg/rocksdb/Status;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+class FlushJobInfoJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.FlushJobInfo object.
+   *
+   * @param env A pointer to the Java environment
+   * @param flush_job_info A Cpp flush job info object
+   *
+   * @return A reference to a Java org.rocksdb.FlushJobInfo object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject fromCppFlushJobInfo(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::FlushJobInfo* flush_job_info) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    jstring jcf_name = JniUtil::toJavaString(env, &flush_job_info->cf_name);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jstring jfile_path = JniUtil::toJavaString(env, &flush_job_info->file_path);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jfile_path);
+      return nullptr;
+    }
+    jobject jtable_properties = TablePropertiesJni::fromCppTableProperties(
+        env, flush_job_info->table_properties);
+    if (jtable_properties == nullptr) {
+      env->DeleteLocalRef(jcf_name);
+      env->DeleteLocalRef(jfile_path);
+      return nullptr;
+    }
+    return env->NewObject(
+        jclazz, ctor, static_cast<jlong>(flush_job_info->cf_id), jcf_name,
+        jfile_path, static_cast<jlong>(flush_job_info->thread_id),
+        static_cast<jint>(flush_job_info->job_id),
+        static_cast<jboolean>(flush_job_info->triggered_writes_slowdown),
+        static_cast<jboolean>(flush_job_info->triggered_writes_stop),
+        static_cast<jlong>(flush_job_info->smallest_seqno),
+        static_cast<jlong>(flush_job_info->largest_seqno), jtable_properties,
+        static_cast<jbyte>(flush_job_info->flush_reason));
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/FlushJobInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(clazz, "<init>",
+                            "(JLjava/lang/String;Ljava/lang/String;JIZZJJLorg/"
+                            "rocksdb/TableProperties;B)V");
+  }
+};
+
+class TableFileDeletionInfoJni : public JavaClass {
+ public:
+  /**
+   * Create a new Java org.rocksdb.TableFileDeletionInfo object.
+   *
+   * @param env A pointer to the Java environment
+   * @param file_del_info A Cpp table file deletion info object
+   *
+   * @return A reference to a Java org.rocksdb.TableFileDeletionInfo object, or
+   * nullptr if an an exception occurs
+   */
+  static jobject fromCppTableFileDeletionInfo(
+      JNIEnv* env,
+      const ROCKSDB_NAMESPACE::TableFileDeletionInfo* file_del_info) {
+    jclass jclazz = getJClass(env);
+    if (jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    jstring jdb_name = JniUtil::toJavaString(env, &file_del_info->db_name);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jobject jstatus = StatusJni::construct(env, file_del_info->status);
+    if (jstatus == nullptr) {
+      env->DeleteLocalRef(jdb_name);
+      return nullptr;
+    }
+    return env->NewObject(jclazz, ctor, jdb_name,
+                          JniUtil::toJavaString(env, &file_del_info->file_path),
+                          static_cast<jint>(file_del_info->job_id), jstatus);
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TableFileDeletionInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(
+        clazz, "<init>",
+        "(Ljava/lang/String;Ljava/lang/String;ILorg/rocksdb/Status;)V");
+  }
+};
+
+class CompactionJobInfoJni : public JavaClass {
+ public:
+  static jobject fromCppCompactionJobInfo(
+      JNIEnv* env,
+      const ROCKSDB_NAMESPACE::CompactionJobInfo* compaction_job_info) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    return env->NewObject(jclazz, ctor,
+                          GET_CPLUSPLUS_POINTER(compaction_job_info));
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/CompactionJobInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(clazz, "<init>", "(J)V");
+  }
+};
+
+class TableFileCreationInfoJni : public JavaClass {
+ public:
+  static jobject fromCppTableFileCreationInfo(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationInfo* info) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    jstring jdb_name = JniUtil::toJavaString(env, &info->db_name);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jdb_name);
+      return nullptr;
+    }
+    jstring jfile_path = JniUtil::toJavaString(env, &info->file_path);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+    jobject jtable_properties =
+        TablePropertiesJni::fromCppTableProperties(env, info->table_properties);
+    if (jtable_properties == nullptr) {
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+    jobject jstatus = StatusJni::construct(env, info->status);
+    if (jstatus == nullptr) {
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      env->DeleteLocalRef(jtable_properties);
+      return nullptr;
+    }
+    return env->NewObject(jclazz, ctor, static_cast<jlong>(info->file_size),
+                          jtable_properties, jstatus, jdb_name, jcf_name,
+                          jfile_path, static_cast<jint>(info->job_id),
+                          static_cast<jbyte>(info->reason));
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(
+        clazz, "<init>",
+        "(JLorg/rocksdb/TableProperties;Lorg/rocksdb/Status;Ljava/lang/"
+        "String;Ljava/lang/String;Ljava/lang/String;IB)V");
+  }
+};
+
+class TableFileCreationBriefInfoJni : public JavaClass {
+ public:
+  static jobject fromCppTableFileCreationBriefInfo(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationBriefInfo* info) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    jstring jdb_name = JniUtil::toJavaString(env, &info->db_name);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jdb_name);
+      return nullptr;
+    }
+    jstring jfile_path = JniUtil::toJavaString(env, &info->file_path);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jdb_name);
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+    return env->NewObject(jclazz, ctor, jdb_name, jcf_name, jfile_path,
+                          static_cast<jint>(info->job_id),
+                          static_cast<jbyte>(info->reason));
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationBriefInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(
+        clazz, "<init>",
+        "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IB)V");
+  }
+};
+
+class MemTableInfoJni : public JavaClass {
+ public:
+  static jobject fromCppMemTableInfo(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::MemTableInfo* info) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    return env->NewObject(jclazz, ctor, jcf_name,
+                          static_cast<jlong>(info->first_seqno),
+                          static_cast<jlong>(info->earliest_seqno),
+                          static_cast<jlong>(info->num_entries),
+                          static_cast<jlong>(info->num_deletes));
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/MemTableInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(clazz, "<init>", "(Ljava/lang/String;JJJJ)V");
+  }
+};
+
+class ExternalFileIngestionInfoJni : public JavaClass {
+ public:
+  static jobject fromCppExternalFileIngestionInfo(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::ExternalFileIngestionInfo* info) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jstring jexternal_file_path =
+        JniUtil::toJavaString(env, &info->external_file_path);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jcf_name);
+      return nullptr;
+    }
+    jstring jinternal_file_path =
+        JniUtil::toJavaString(env, &info->internal_file_path);
+    if (env->ExceptionCheck()) {
+      env->DeleteLocalRef(jcf_name);
+      env->DeleteLocalRef(jexternal_file_path);
+      return nullptr;
+    }
+    jobject jtable_properties =
+        TablePropertiesJni::fromCppTableProperties(env, info->table_properties);
+    if (jtable_properties == nullptr) {
+      env->DeleteLocalRef(jcf_name);
+      env->DeleteLocalRef(jexternal_file_path);
+      env->DeleteLocalRef(jinternal_file_path);
+      return nullptr;
+    }
+    return env->NewObject(
+        jclazz, ctor, jcf_name, jexternal_file_path, jinternal_file_path,
+        static_cast<jlong>(info->global_seqno), jtable_properties);
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/ExternalFileIngestionInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(clazz, "<init>",
+                            "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/"
+                            "String;JLorg/rocksdb/TableProperties;)V");
+  }
+};
+
+class WriteStallInfoJni : public JavaClass {
+ public:
+  static jobject fromCppWriteStallInfo(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::WriteStallInfo* info) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    return env->NewObject(jclazz, ctor, jcf_name,
+                          static_cast<jbyte>(info->condition.cur),
+                          static_cast<jbyte>(info->condition.prev));
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/WriteStallInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(clazz, "<init>", "(Ljava/lang/String;BB)V");
+  }
+};
+
+class FileOperationInfoJni : public JavaClass {
+ public:
+  static jobject fromCppFileOperationInfo(
+      JNIEnv* env, const ROCKSDB_NAMESPACE::FileOperationInfo* info) {
+    jclass jclazz = getJClass(env);
+    assert(jclazz != nullptr);
+    static jmethodID ctor = getConstructorMethodId(env, jclazz);
+    assert(ctor != nullptr);
+    jstring jpath = JniUtil::toJavaString(env, &info->path);
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jobject jstatus = StatusJni::construct(env, info->status);
+    if (jstatus == nullptr) {
+      env->DeleteLocalRef(jpath);
+      return nullptr;
+    }
+    return env->NewObject(
+        jclazz, ctor, jpath, static_cast<jlong>(info->offset),
+        static_cast<jlong>(info->length),
+        static_cast<jlong>(info->start_ts.time_since_epoch().count()),
+        static_cast<jlong>(info->duration.count()), jstatus);
+  }
+
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/FileOperationInfo");
+  }
+
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) {
+    return env->GetMethodID(clazz, "<init>",
+                            "(Ljava/lang/String;JJJJLorg/rocksdb/Status;)V");
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // JAVA_ROCKSJNI_PORTAL_H_
diff --git a/src/rocksdb/java/rocksjni/ratelimiterjni.cc b/src/rocksdb/java/rocksjni/ratelimiterjni.cc
new file mode 100644
index 000000000..7a17f367e
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/ratelimiterjni.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for RateLimiter.
+
+#include "include/org_rocksdb_RateLimiter.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_RateLimiter
+ * Method:    newRateLimiterHandle
+ * Signature: (JJIBZ)J
+ */
+jlong Java_org_rocksdb_RateLimiter_newRateLimiterHandle(
+    JNIEnv* /*env*/, jclass /*jclazz*/, jlong jrate_bytes_per_second,
+    jlong jrefill_period_micros, jint jfairness, jbyte jrate_limiter_mode,
+    jboolean jauto_tune) {
+  auto rate_limiter_mode =
+      ROCKSDB_NAMESPACE::RateLimiterModeJni::toCppRateLimiterMode(
+          jrate_limiter_mode);
+  auto* sptr_rate_limiter = new std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>(
+      ROCKSDB_NAMESPACE::NewGenericRateLimiter(
+          static_cast<int64_t>(jrate_bytes_per_second),
+          static_cast<int64_t>(jrefill_period_micros),
+          static_cast<int32_t>(jfairness), rate_limiter_mode, jauto_tune));
+
+  return GET_CPLUSPLUS_POINTER(sptr_rate_limiter);
+}
+
+/*
+ * Class:     org_rocksdb_RateLimiter
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RateLimiter_disposeInternal(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle) {
+  auto* handle =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
+          jhandle);
+  delete handle;  // delete std::shared_ptr
+}
+
+/*
+ * Class:     org_rocksdb_RateLimiter
+ * Method:    setBytesPerSecond
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_RateLimiter_setBytesPerSecond(JNIEnv* /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong handle,
+                                                    jlong jbytes_per_second) {
+  reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(handle)
+      ->get()
+      ->SetBytesPerSecond(jbytes_per_second);
+}
+
+/*
+ * Class:     org_rocksdb_RateLimiter
+ * Method:    getBytesPerSecond
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RateLimiter_getBytesPerSecond(JNIEnv* /*env*/,
+                                                     jobject /*jobj*/,
+                                                     jlong handle) {
+  return reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
+             handle)
+      ->get()
+      ->GetBytesPerSecond();
+}
+
+/*
+ * Class:     org_rocksdb_RateLimiter
+ * Method:    request
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_RateLimiter_request(JNIEnv* /*env*/, jobject /*jobj*/,
+                                          jlong handle, jlong jbytes) {
+  reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(handle)
+      ->get()
+      ->Request(jbytes, ROCKSDB_NAMESPACE::Env::IO_TOTAL);
+}
+
+/*
+ * Class:     org_rocksdb_RateLimiter
+ * Method:    getSingleBurstBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RateLimiter_getSingleBurstBytes(JNIEnv* /*env*/,
+                                                       jobject /*jobj*/,
+                                                       jlong handle) {
+  return reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
+             handle)
+      ->get()
+      ->GetSingleBurstBytes();
+}
+
+/*
+ * Class:     org_rocksdb_RateLimiter
+ * Method:    getTotalBytesThrough
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RateLimiter_getTotalBytesThrough(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong handle) {
+  return reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
+             handle)
+      ->get()
+      ->GetTotalBytesThrough();
+}
+
+/*
+ * Class:     org_rocksdb_RateLimiter
+ * Method:    getTotalRequests
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RateLimiter_getTotalRequests(JNIEnv* /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong handle) {
+  return reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
+             handle)
+      ->get()
+      ->GetTotalRequests();
+}
diff --git a/src/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc b/src/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc
new file mode 100644
index 000000000..c0b09e151
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <jni.h>
+
+#include "include/org_rocksdb_RemoveEmptyValueCompactionFilter.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
+/*
+ * Class:     org_rocksdb_RemoveEmptyValueCompactionFilter
+ * Method:    createNewRemoveEmptyValueCompactionFilter0
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_RemoveEmptyValueCompactionFilter_createNewRemoveEmptyValueCompactionFilter0(
+    JNIEnv* /*env*/, jclass /*jcls*/) {
+  auto* compaction_filter =
+      new ROCKSDB_NAMESPACE::RemoveEmptyValueCompactionFilter();
+
+  // set the native handle to our native compaction filter
+  return GET_CPLUSPLUS_POINTER(compaction_filter);
+}
diff --git a/src/rocksdb/java/rocksjni/restorejni.cc b/src/rocksdb/java/rocksjni/restorejni.cc
new file mode 100644
index 000000000..aadc86128
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/restorejni.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ ROCKSDB_NAMESPACE::RestoreOptions methods
+// from Java side.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+
+#include "include/org_rocksdb_RestoreOptions.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+/*
+ * Class:     org_rocksdb_RestoreOptions
+ * Method:    newRestoreOptions
+ * Signature: (Z)J
+ */
+jlong Java_org_rocksdb_RestoreOptions_newRestoreOptions(
+    JNIEnv* /*env*/, jclass /*jcls*/, jboolean keep_log_files) {
+  auto* ropt = new ROCKSDB_NAMESPACE::RestoreOptions(keep_log_files);
+  return GET_CPLUSPLUS_POINTER(ropt);
+}
+
+/*
+ * Class:     org_rocksdb_RestoreOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RestoreOptions_disposeInternal(JNIEnv* /*env*/,
+                                                     jobject /*jobj*/,
+                                                     jlong jhandle) {
+  auto* ropt = reinterpret_cast<ROCKSDB_NAMESPACE::RestoreOptions*>(jhandle);
+  assert(ropt);
+  delete ropt;
+}
diff --git a/src/rocksdb/java/rocksjni/rocks_callback_object.cc b/src/rocksdb/java/rocksjni/rocks_callback_object.cc
new file mode 100644
index 000000000..35513e151
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/rocks_callback_object.cc
@@ -0,0 +1,30 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// JNI Callbacks from C++ to sub-classes or org.rocksdb.RocksCallbackObject
+
+#include <jni.h>
+
+#include "include/org_rocksdb_RocksCallbackObject.h"
+#include "jnicallback.h"
+
+/*
+ * Class:     org_rocksdb_RocksCallbackObject
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksCallbackObject_disposeInternal(JNIEnv* /*env*/,
+                                                          jobject /*jobj*/,
+                                                          jlong handle) {
+  // TODO(AR) is deleting from the super class JniCallback OK, or must we delete
+  // the subclass? Example hierarchies:
+  //   1) Comparator -> BaseComparatorJniCallback + JniCallback ->
+  //   DirectComparatorJniCallback 2) Comparator -> BaseComparatorJniCallback +
+  //   JniCallback -> ComparatorJniCallback
+  // I think this is okay, as Comparator and JniCallback both have virtual
+  // destructors...
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::JniCallback*>(handle);
+}
diff --git a/src/rocksdb/java/rocksjni/rocksdb_exception_test.cc b/src/rocksdb/java/rocksjni/rocksdb_exception_test.cc
new file mode 100644
index 000000000..67e62f726
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/rocksdb_exception_test.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <jni.h>
+
+#include "include/org_rocksdb_RocksDBExceptionTest.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_RocksDBExceptionTest
+ * Method:    raiseException
+ * Signature: ()V
+ */
+void Java_org_rocksdb_RocksDBExceptionTest_raiseException(JNIEnv* env,
+                                                          jobject /*jobj*/) {
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env,
+                                                   std::string("test message"));
+}
+
+/*
+ * Class:     org_rocksdb_RocksDBExceptionTest
+ * Method:    raiseExceptionWithStatusCode
+ * Signature: ()V
+ */
+void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionWithStatusCode(
+    JNIEnv* env, jobject /*jobj*/) {
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+      env, "test message", ROCKSDB_NAMESPACE::Status::NotSupported());
+}
+
+/*
+ * Class:     org_rocksdb_RocksDBExceptionTest
+ * Method:    raiseExceptionNoMsgWithStatusCode
+ * Signature: ()V
+ */
+void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionNoMsgWithStatusCode(
+    JNIEnv* env, jobject /*jobj*/) {
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+      env, ROCKSDB_NAMESPACE::Status::NotSupported());
+}
+
+/*
+ * Class:     org_rocksdb_RocksDBExceptionTest
+ * Method:    raiseExceptionWithStatusCodeSubCode
+ * Signature: ()V
+ */
+void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionWithStatusCodeSubCode(
+    JNIEnv* env, jobject /*jobj*/) {
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+      env, "test message",
+      ROCKSDB_NAMESPACE::Status::TimedOut(
+          ROCKSDB_NAMESPACE::Status::SubCode::kLockTimeout));
+}
+
+/*
+ * Class:     org_rocksdb_RocksDBExceptionTest
+ * Method:    raiseExceptionNoMsgWithStatusCodeSubCode
+ * Signature: ()V
+ */
+void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionNoMsgWithStatusCodeSubCode(
+    JNIEnv* env, jobject /*jobj*/) {
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+      env, ROCKSDB_NAMESPACE::Status::TimedOut(
+               ROCKSDB_NAMESPACE::Status::SubCode::kLockTimeout));
+}
+
+/*
+ * Class:     org_rocksdb_RocksDBExceptionTest
+ * Method:    raiseExceptionWithStatusCodeState
+ * Signature: ()V
+ */
+void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionWithStatusCodeState(
+    JNIEnv* env, jobject /*jobj*/) {
+  ROCKSDB_NAMESPACE::Slice state("test state");
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+      env, "test message", ROCKSDB_NAMESPACE::Status::NotSupported(state));
+}
diff --git a/src/rocksdb/java/rocksjni/rocksjni.cc b/src/rocksdb/java/rocksjni/rocksjni.cc
new file mode 100644
index 000000000..23fa60467
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/rocksjni.cc
@@ -0,0 +1,3947 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::DB methods from Java side.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "include/org_rocksdb_RocksDB.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/version.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+#ifdef min
+#undef min
+#endif
+
+jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path,
+                          std::function<ROCKSDB_NAMESPACE::Status(
+                              const ROCKSDB_NAMESPACE::Options&,
+                              const std::string&, ROCKSDB_NAMESPACE::DB**)>
+                              open_fn) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jopt_handle);
+  ROCKSDB_NAMESPACE::DB* db = nullptr;
+  ROCKSDB_NAMESPACE::Status s = open_fn(*opt, db_path, &db);
+
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  if (s.ok()) {
+    return GET_CPLUSPLUS_POINTER(db);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return 0;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;)J
+ */
+jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2(JNIEnv* env, jclass,
+                                                          jlong jopt_handle,
+                                                          jstring jdb_path) {
+  return rocksdb_open_helper(env, jopt_handle, jdb_path,
+                             (ROCKSDB_NAMESPACE::Status(*)(
+                                 const ROCKSDB_NAMESPACE::Options&,
+                                 const std::string&, ROCKSDB_NAMESPACE::DB**)) &
+                                 ROCKSDB_NAMESPACE::DB::Open);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    openROnly
+ * Signature: (JLjava/lang/String;Z)J
+ */
+jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Z(
+    JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path,
+    jboolean jerror_if_wal_file_exists) {
+  const bool error_if_wal_file_exists = jerror_if_wal_file_exists == JNI_TRUE;
+  return rocksdb_open_helper(
+      env, jopt_handle, jdb_path,
+      [error_if_wal_file_exists](const ROCKSDB_NAMESPACE::Options& options,
+                                 const std::string& db_path,
+                                 ROCKSDB_NAMESPACE::DB** db) {
+        return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, db_path, db,
+                                                      error_if_wal_file_exists);
+      });
+}
+
+jlongArray rocksdb_open_helper(
+    JNIEnv* env, jlong jopt_handle, jstring jdb_path,
+    jobjectArray jcolumn_names, jlongArray jcolumn_options,
+    std::function<ROCKSDB_NAMESPACE::Status(
+        const ROCKSDB_NAMESPACE::DBOptions&, const std::string&,
+        const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&,
+        std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>*,
+        ROCKSDB_NAMESPACE::DB**)>
+        open_fn) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  const jsize len_cols = env->GetArrayLength(jcolumn_names);
+  jlong* jco = env->GetLongArrayElements(jcolumn_options, nullptr);
+  if (jco == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseStringUTFChars(jdb_path, db_path);
+    return nullptr;
+  }
+
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> column_families;
+  jboolean has_exception = JNI_FALSE;
+  ROCKSDB_NAMESPACE::JniUtil::byteStrings<std::string>(
+      env, jcolumn_names,
+      [](const char* str_data, const size_t str_len) {
+        return std::string(str_data, str_len);
+      },
+      [&jco, &column_families](size_t idx, std::string cf_name) {
+        ROCKSDB_NAMESPACE::ColumnFamilyOptions* cf_options =
+            reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jco[idx]);
+        column_families.push_back(
+            ROCKSDB_NAMESPACE::ColumnFamilyDescriptor(cf_name, *cf_options));
+      },
+      &has_exception);
+
+  env->ReleaseLongArrayElements(jcolumn_options, jco, JNI_ABORT);
+
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    env->ReleaseStringUTFChars(jdb_path, db_path);
+    return nullptr;
+  }
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jopt_handle);
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  ROCKSDB_NAMESPACE::DB* db = nullptr;
+  ROCKSDB_NAMESPACE::Status s =
+      open_fn(*opt, db_path, column_families, &cf_handles, &db);
+
+  // we have now finished with db_path
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  // check if open operation was successful
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  const jsize resultsLen = 1 + len_cols;  // db handle + column family handles
+  std::unique_ptr<jlong[]> results =
+      std::unique_ptr<jlong[]>(new jlong[resultsLen]);
+  results[0] = GET_CPLUSPLUS_POINTER(db);
+  for (int i = 1; i <= len_cols; i++) {
+    results[i] = GET_CPLUSPLUS_POINTER(cf_handles[i - 1]);
+  }
+
+  jlongArray jresults = env->NewLongArray(resultsLen);
+  if (jresults == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  env->SetLongArrayRegion(jresults, 0, resultsLen, results.get());
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jresults);
+    return nullptr;
+  }
+
+  return jresults;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    openROnly
+ * Signature: (JLjava/lang/String;[[B[JZ)[J
+ */
+jlongArray Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3JZ(
+    JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path,
+    jobjectArray jcolumn_names, jlongArray jcolumn_options,
+    jboolean jerror_if_wal_file_exists) {
+  const bool error_if_wal_file_exists = jerror_if_wal_file_exists == JNI_TRUE;
+  return rocksdb_open_helper(
+      env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options,
+      [error_if_wal_file_exists](
+          const ROCKSDB_NAMESPACE::DBOptions& options,
+          const std::string& db_path,
+          const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&
+              column_families,
+          std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>* handles,
+          ROCKSDB_NAMESPACE::DB** db) {
+        return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(
+            options, db_path, column_families, handles, db,
+            error_if_wal_file_exists);
+      });
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;[[B[J)[J
+ */
+jlongArray Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2_3_3B_3J(
+    JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path,
+    jobjectArray jcolumn_names, jlongArray jcolumn_options) {
+  return rocksdb_open_helper(
+      env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options,
+      (ROCKSDB_NAMESPACE::Status(*)(
+          const ROCKSDB_NAMESPACE::DBOptions&, const std::string&,
+          const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&,
+          std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>*,
+          ROCKSDB_NAMESPACE::DB**)) &
+          ROCKSDB_NAMESPACE::DB::Open);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    openAsSecondary
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)J
+ */
+jlong Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_2(
+    JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path,
+    jstring jsecondary_db_path) {
+  const char* secondary_db_path =
+      env->GetStringUTFChars(jsecondary_db_path, nullptr);
+  if (secondary_db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+
+  jlong db_handle = rocksdb_open_helper(
+      env, jopt_handle, jdb_path,
+      [secondary_db_path](const ROCKSDB_NAMESPACE::Options& options,
+                          const std::string& db_path,
+                          ROCKSDB_NAMESPACE::DB** db) {
+        return ROCKSDB_NAMESPACE::DB::OpenAsSecondary(options, db_path,
+                                                      secondary_db_path, db);
+      });
+
+  // we have now finished with secondary_db_path
+  env->ReleaseStringUTFChars(jsecondary_db_path, secondary_db_path);
+
+  return db_handle;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    openAsSecondary
+ * Signature: (JLjava/lang/String;Ljava/lang/String;[[B[J)[J
+ */
+jlongArray
+Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_2_3_3B_3J(
+    JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path,
+    jstring jsecondary_db_path, jobjectArray jcolumn_names,
+    jlongArray jcolumn_options) {
+  const char* secondary_db_path =
+      env->GetStringUTFChars(jsecondary_db_path, nullptr);
+  if (secondary_db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  jlongArray jhandles = rocksdb_open_helper(
+      env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options,
+      [secondary_db_path](
+          const ROCKSDB_NAMESPACE::DBOptions& options,
+          const std::string& db_path,
+          const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&
+              column_families,
+          std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>* handles,
+          ROCKSDB_NAMESPACE::DB** db) {
+        return ROCKSDB_NAMESPACE::DB::OpenAsSecondary(
+            options, db_path, secondary_db_path, column_families, handles, db);
+      });
+
+  // we have now finished with secondary_db_path
+  env->ReleaseStringUTFChars(jsecondary_db_path, secondary_db_path);
+
+  return jhandles;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_disposeInternal(JNIEnv*, jobject, jlong jhandle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jhandle);
+  assert(db != nullptr);
+  delete db;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    closeDatabase
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_closeDatabase(JNIEnv* env, jclass,
+                                            jlong jhandle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jhandle);
+  assert(db != nullptr);
+  ROCKSDB_NAMESPACE::Status s = db->Close();
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    listColumnFamilies
+ * Signature: (JLjava/lang/String;)[[B
+ */
+jobjectArray Java_org_rocksdb_RocksDB_listColumnFamilies(JNIEnv* env, jclass,
+                                                         jlong jopt_handle,
+                                                         jstring jdb_path) {
+  std::vector<std::string> column_family_names;
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jopt_handle);
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DB::ListColumnFamilies(
+      *opt, db_path, &column_family_names);
+
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  jobjectArray jcolumn_family_names =
+      ROCKSDB_NAMESPACE::JniUtil::stringsBytes(env, column_family_names);
+
+  return jcolumn_family_names;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    createColumnFamily
+ * Signature: (J[BIJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_createColumnFamily(JNIEnv* env, jobject,
+                                                  jlong jhandle,
+                                                  jbyteArray jcf_name,
+                                                  jint jcf_name_len,
+                                                  jlong jcf_options_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jhandle);
+  jboolean has_exception = JNI_FALSE;
+  const std::string cf_name =
+      ROCKSDB_NAMESPACE::JniUtil::byteString<std::string>(
+          env, jcf_name, jcf_name_len,
+          [](const char* str, const size_t len) {
+            return std::string(str, len);
+          },
+          &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return 0;
+  }
+  auto* cf_options = reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(
+      jcf_options_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  ROCKSDB_NAMESPACE::Status s =
+      db->CreateColumnFamily(*cf_options, cf_name, &cf_handle);
+  if (!s.ok()) {
+    // error occurred
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return 0;
+  }
+  return GET_CPLUSPLUS_POINTER(cf_handle);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    createColumnFamilies
+ * Signature: (JJ[[B)[J
+ */
+jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__JJ_3_3B(
+    JNIEnv* env, jobject, jlong jhandle, jlong jcf_options_handle,
+    jobjectArray jcf_names) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jhandle);
+  auto* cf_options = reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(
+      jcf_options_handle);
+  jboolean has_exception = JNI_FALSE;
+  std::vector<std::string> cf_names;
+  ROCKSDB_NAMESPACE::JniUtil::byteStrings<std::string>(
+      env, jcf_names,
+      [](const char* str, const size_t len) { return std::string(str, len); },
+      [&cf_names](const size_t, std::string str) { cf_names.push_back(str); },
+      &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return nullptr;
+  }
+
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  ROCKSDB_NAMESPACE::Status s =
+      db->CreateColumnFamilies(*cf_options, cf_names, &cf_handles);
+  if (!s.ok()) {
+    // error occurred
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  jlongArray jcf_handles = ROCKSDB_NAMESPACE::JniUtil::toJPointers<
+      ROCKSDB_NAMESPACE::ColumnFamilyHandle>(env, cf_handles, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return nullptr;
+  }
+  return jcf_handles;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    createColumnFamilies
+ * Signature: (J[J[[B)[J
+ */
+jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B(
+    JNIEnv* env, jobject, jlong jhandle, jlongArray jcf_options_handles,
+    jobjectArray jcf_names) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jhandle);
+  const jsize jlen = env->GetArrayLength(jcf_options_handles);
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> cf_descriptors;
+  cf_descriptors.reserve(jlen);
+
+  jlong* jcf_options_handles_elems =
+      env->GetLongArrayElements(jcf_options_handles, nullptr);
+  if (jcf_options_handles_elems == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  // extract the column family descriptors
+  jboolean has_exception = JNI_FALSE;
+  for (jsize i = 0; i < jlen; i++) {
+    auto* cf_options =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(
+            jcf_options_handles_elems[i]);
+    jbyteArray jcf_name =
+        static_cast<jbyteArray>(env->GetObjectArrayElement(jcf_names, i));
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->ReleaseLongArrayElements(jcf_options_handles,
+                                    jcf_options_handles_elems, JNI_ABORT);
+      return nullptr;
+    }
+    const std::string cf_name =
+        ROCKSDB_NAMESPACE::JniUtil::byteString<std::string>(
+            env, jcf_name,
+            [](const char* str, const size_t len) {
+              return std::string(str, len);
+            },
+            &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      env->DeleteLocalRef(jcf_name);
+      env->ReleaseLongArrayElements(jcf_options_handles,
+                                    jcf_options_handles_elems, JNI_ABORT);
+      return nullptr;
+    }
+
+    cf_descriptors.push_back(
+        ROCKSDB_NAMESPACE::ColumnFamilyDescriptor(cf_name, *cf_options));
+
+    env->DeleteLocalRef(jcf_name);
+  }
+
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  ROCKSDB_NAMESPACE::Status s =
+      db->CreateColumnFamilies(cf_descriptors, &cf_handles);
+
+  env->ReleaseLongArrayElements(jcf_options_handles, jcf_options_handles_elems,
+                                JNI_ABORT);
+
+  if (!s.ok()) {
+    // error occurred
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  jlongArray jcf_handles = ROCKSDB_NAMESPACE::JniUtil::toJPointers<
+      ROCKSDB_NAMESPACE::ColumnFamilyHandle>(env, cf_handles, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return nullptr;
+  }
+  return jcf_handles;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    dropColumnFamily
+ * Signature: (JJ)V;
+ */
+void Java_org_rocksdb_RocksDB_dropColumnFamily(JNIEnv* env, jobject,
+                                               jlong jdb_handle,
+                                               jlong jcf_handle) {
+  auto* db_handle = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  ROCKSDB_NAMESPACE::Status s = db_handle->DropColumnFamily(cf_handle);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    dropColumnFamilies
+ * Signature: (J[J)V
+ */
+void Java_org_rocksdb_RocksDB_dropColumnFamilies(
+    JNIEnv* env, jobject, jlong jdb_handle, jlongArray jcolumn_family_handles) {
+  auto* db_handle = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  if (jcolumn_family_handles != nullptr) {
+    const jsize len_cols = env->GetArrayLength(jcolumn_family_handles);
+
+    jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr);
+    if (jcfh == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return;
+    }
+
+    for (jsize i = 0; i < len_cols; i++) {
+      auto* cf_handle =
+          reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcfh[i]);
+      cf_handles.push_back(cf_handle);
+    }
+    env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT);
+  }
+
+  ROCKSDB_NAMESPACE::Status s = db_handle->DropColumnFamilies(cf_handles);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::DB::Put
+
+/**
+ * @return true if the put succeeded, false if a Java Exception was thrown
+ */
+bool rocksdb_put_helper(JNIEnv* env, ROCKSDB_NAMESPACE::DB* db,
+                        const ROCKSDB_NAMESPACE::WriteOptions& write_options,
+                        ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle,
+                        jbyteArray jkey, jint jkey_off, jint jkey_len,
+                        jbyteArray jval, jint jval_off, jint jval_len) {
+  jbyte* key = new jbyte[jkey_len];
+  env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] key;
+    return false;
+  }
+
+  jbyte* value = new jbyte[jval_len];
+  env->GetByteArrayRegion(jval, jval_off, jval_len, value);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] value;
+    delete[] key;
+    return false;
+  }
+
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char*>(value),
+                                       jval_len);
+
+  ROCKSDB_NAMESPACE::Status s;
+  if (cf_handle != nullptr) {
+    s = db->Put(write_options, cf_handle, key_slice, value_slice);
+  } else {
+    // backwards compatibility
+    s = db->Put(write_options, key_slice, value_slice);
+  }
+
+  // cleanup
+  delete[] value;
+  delete[] key;
+
+  if (s.ok()) {
+    return true;
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return false;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (J[BII[BII)V
+ */
+void Java_org_rocksdb_RocksDB_put__J_3BII_3BII(JNIEnv* env, jobject,
+                                               jlong jdb_handle,
+                                               jbyteArray jkey, jint jkey_off,
+                                               jint jkey_len, jbyteArray jval,
+                                               jint jval_off, jint jval_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  rocksdb_put_helper(env, db, default_write_options, nullptr, jkey, jkey_off,
+                     jkey_len, jval, jval_off, jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (J[BII[BIIJ)V
+ */
+void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ(JNIEnv* env, jobject,
+                                                jlong jdb_handle,
+                                                jbyteArray jkey, jint jkey_off,
+                                                jint jkey_len, jbyteArray jval,
+                                                jint jval_off, jint jval_len,
+                                                jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_put_helper(env, db, default_write_options, cf_handle, jkey,
+                       jkey_off, jkey_len, jval, jval_off, jval_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (JJ[BII[BII)V
+ */
+void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII(JNIEnv* env, jobject,
+                                                jlong jdb_handle,
+                                                jlong jwrite_options_handle,
+                                                jbyteArray jkey, jint jkey_off,
+                                                jint jkey_len, jbyteArray jval,
+                                                jint jval_off, jint jval_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  rocksdb_put_helper(env, db, *write_options, nullptr, jkey, jkey_off, jkey_len,
+                     jval, jval_off, jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (JJ[BII[BIIJ)V
+ */
+void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BIIJ(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle,
+    jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval,
+    jint jval_off, jint jval_len, jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_put_helper(env, db, *write_options, cf_handle, jkey, jkey_off,
+                       jkey_len, jval, jval_off, jval_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    putDirect
+ * Signature: (JJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)V
+ */
+void Java_org_rocksdb_RocksDB_putDirect(
+    JNIEnv* env, jobject /*jdb*/, jlong jdb_handle, jlong jwrite_options_handle,
+    jobject jkey, jint jkey_off, jint jkey_len, jobject jval, jint jval_off,
+    jint jval_len, jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto put = [&env, &db, &cf_handle, &write_options](
+                 ROCKSDB_NAMESPACE::Slice& key,
+                 ROCKSDB_NAMESPACE::Slice& value) {
+    ROCKSDB_NAMESPACE::Status s;
+    if (cf_handle == nullptr) {
+      s = db->Put(*write_options, key, value);
+    } else {
+      s = db->Put(*write_options, cf_handle, key, value);
+    }
+    if (s.ok()) {
+      return;
+    }
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::kv_op_direct(put, env, jkey, jkey_off, jkey_len,
+                                           jval, jval_off, jval_len);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::DB::Delete()
+
+/**
+ * @return true if the delete succeeded, false if a Java Exception was thrown
+ */
+bool rocksdb_delete_helper(JNIEnv* env, ROCKSDB_NAMESPACE::DB* db,
+                           const ROCKSDB_NAMESPACE::WriteOptions& write_options,
+                           ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle,
+                           jbyteArray jkey, jint jkey_off, jint jkey_len) {
+  jbyte* key = new jbyte[jkey_len];
+  env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] key;
+    return false;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+  ROCKSDB_NAMESPACE::Status s;
+  if (cf_handle != nullptr) {
+    s = db->Delete(write_options, cf_handle, key_slice);
+  } else {
+    // backwards compatibility
+    s = db->Delete(write_options, key_slice);
+  }
+
+  // cleanup
+  delete[] key;
+
+  if (s.ok()) {
+    return true;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  return false;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    delete
+ * Signature: (J[BII)V
+ */
+void Java_org_rocksdb_RocksDB_delete__J_3BII(JNIEnv* env, jobject,
+                                             jlong jdb_handle, jbyteArray jkey,
+                                             jint jkey_off, jint jkey_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  rocksdb_delete_helper(env, db, default_write_options, nullptr, jkey, jkey_off,
+                        jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    delete
+ * Signature: (J[BIIJ)V
+ */
+void Java_org_rocksdb_RocksDB_delete__J_3BIIJ(JNIEnv* env, jobject,
+                                              jlong jdb_handle, jbyteArray jkey,
+                                              jint jkey_off, jint jkey_len,
+                                              jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_delete_helper(env, db, default_write_options, cf_handle, jkey,
+                          jkey_off, jkey_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    delete
+ * Signature: (JJ[BII)V
+ */
+void Java_org_rocksdb_RocksDB_delete__JJ_3BII(JNIEnv* env, jobject,
+                                              jlong jdb_handle,
+                                              jlong jwrite_options,
+                                              jbyteArray jkey, jint jkey_off,
+                                              jint jkey_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options);
+  rocksdb_delete_helper(env, db, *write_options, nullptr, jkey, jkey_off,
+                        jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    delete
+ * Signature: (JJ[BIIJ)V
+ */
+void Java_org_rocksdb_RocksDB_delete__JJ_3BIIJ(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options,
+    jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_delete_helper(env, db, *write_options, cf_handle, jkey, jkey_off,
+                          jkey_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::DB::SingleDelete()
+/**
+ * @return true if the single delete succeeded, false if a Java Exception
+ *     was thrown
+ */
+bool rocksdb_single_delete_helper(
+    JNIEnv* env, ROCKSDB_NAMESPACE::DB* db,
+    const ROCKSDB_NAMESPACE::WriteOptions& write_options,
+    ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle, jbyteArray jkey,
+    jint jkey_len) {
+  jbyte* key = new jbyte[jkey_len];
+  env->GetByteArrayRegion(jkey, 0, jkey_len, key);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] key;
+    return false;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+  ROCKSDB_NAMESPACE::Status s;
+  if (cf_handle != nullptr) {
+    s = db->SingleDelete(write_options, cf_handle, key_slice);
+  } else {
+    // backwards compatibility
+    s = db->SingleDelete(write_options, key_slice);
+  }
+
+  delete[] key;
+
+  if (s.ok()) {
+    return true;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  return false;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    singleDelete
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_RocksDB_singleDelete__J_3BI(JNIEnv* env, jobject,
+                                                  jlong jdb_handle,
+                                                  jbyteArray jkey,
+                                                  jint jkey_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  rocksdb_single_delete_helper(env, db, default_write_options, nullptr, jkey,
+                               jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    singleDelete
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ(JNIEnv* env, jobject,
+                                                   jlong jdb_handle,
+                                                   jbyteArray jkey,
+                                                   jint jkey_len,
+                                                   jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_single_delete_helper(env, db, default_write_options, cf_handle,
+                                 jkey, jkey_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    singleDelete
+ * Signature: (JJ[BIJ)V
+ */
+void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BI(JNIEnv* env, jobject,
+                                                   jlong jdb_handle,
+                                                   jlong jwrite_options,
+                                                   jbyteArray jkey,
+                                                   jint jkey_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options);
+  rocksdb_single_delete_helper(env, db, *write_options, nullptr, jkey,
+                               jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    singleDelete
+ * Signature: (JJ[BIJ)V
+ */
+void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BIJ(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options,
+    jbyteArray jkey, jint jkey_len, jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_single_delete_helper(env, db, *write_options, cf_handle, jkey,
+                                 jkey_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::DB::DeleteRange()
+/**
+ * @return true if the delete range succeeded, false if a Java Exception
+ *     was thrown
+ */
+bool rocksdb_delete_range_helper(
+    JNIEnv* env, ROCKSDB_NAMESPACE::DB* db,
+    const ROCKSDB_NAMESPACE::WriteOptions& write_options,
+    ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle, jbyteArray jbegin_key,
+    jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key,
+    jint jend_key_off, jint jend_key_len) {
+  jbyte* begin_key = new jbyte[jbegin_key_len];
+  env->GetByteArrayRegion(jbegin_key, jbegin_key_off, jbegin_key_len,
+                          begin_key);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] begin_key;
+    return false;
+  }
+  ROCKSDB_NAMESPACE::Slice begin_key_slice(reinterpret_cast<char*>(begin_key),
+                                           jbegin_key_len);
+
+  jbyte* end_key = new jbyte[jend_key_len];
+  env->GetByteArrayRegion(jend_key, jend_key_off, jend_key_len, end_key);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] begin_key;
+    delete[] end_key;
+    return false;
+  }
+  ROCKSDB_NAMESPACE::Slice end_key_slice(reinterpret_cast<char*>(end_key),
+                                         jend_key_len);
+
+  ROCKSDB_NAMESPACE::Status s =
+      db->DeleteRange(write_options, cf_handle, begin_key_slice, end_key_slice);
+
+  // cleanup
+  delete[] begin_key;
+  delete[] end_key;
+
+  if (s.ok()) {
+    return true;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  return false;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    deleteRange
+ * Signature: (J[BII[BII)V
+ */
+void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BII(
+    JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jbegin_key,
+    jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key,
+    jint jend_key_off, jint jend_key_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  rocksdb_delete_range_helper(env, db, default_write_options, nullptr,
+                              jbegin_key, jbegin_key_off, jbegin_key_len,
+                              jend_key, jend_key_off, jend_key_len);
+}
+
+jint rocksdb_get_helper_direct(
+    JNIEnv* env, ROCKSDB_NAMESPACE::DB* db,
+    const ROCKSDB_NAMESPACE::ReadOptions& read_options,
+    ROCKSDB_NAMESPACE::ColumnFamilyHandle* column_family_handle, jobject jkey,
+    jint jkey_off, jint jkey_len, jobject jval, jint jval_off, jint jval_len,
+    bool* has_exception) {
+  static const int kNotFound = -1;
+  static const int kStatusError = -2;
+  static const int kArgumentError = -3;
+
+  char* key = reinterpret_cast<char*>(env->GetDirectBufferAddress(jkey));
+  if (key == nullptr) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env,
+        "Invalid key argument (argument is not a valid direct ByteBuffer)");
+    *has_exception = true;
+    return kArgumentError;
+  }
+  if (env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env,
+        "Invalid key argument. Capacity is less than requested region (offset "
+        "+ length).");
+    *has_exception = true;
+    return kArgumentError;
+  }
+
+  char* value = reinterpret_cast<char*>(env->GetDirectBufferAddress(jval));
+  if (value == nullptr) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env,
+        "Invalid value argument (argument is not a valid direct ByteBuffer)");
+    *has_exception = true;
+    return kArgumentError;
+  }
+
+  if (env->GetDirectBufferCapacity(jval) < (jval_off + jval_len)) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env,
+        "Invalid value argument. Capacity is less than requested region "
+        "(offset + length).");
+    *has_exception = true;
+    return kArgumentError;
+  }
+
+  key += jkey_off;
+  value += jval_off;
+
+  ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len);
+
+  // TODO(yhchiang): we might save one memory allocation here by adding
+  // a DB::Get() function which takes preallocated jbyte* as input.
+  std::string cvalue;
+  ROCKSDB_NAMESPACE::Status s;
+  if (column_family_handle != nullptr) {
+    s = db->Get(read_options, column_family_handle, key_slice, &cvalue);
+  } else {
+    // backwards compatibility
+    s = db->Get(read_options, key_slice, &cvalue);
+  }
+
+  if (s.IsNotFound()) {
+    *has_exception = false;
+    return kNotFound;
+  } else if (!s.ok()) {
+    *has_exception = true;
+    // Here since we are throwing a Java exception from c++ side.
+    // As a result, c++ does not know calling this function will in fact
+    // throwing an exception.  As a result, the execution flow will
+    // not stop here, and codes after this throw will still be
+    // executed.
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+
+    // Return a dummy const value to avoid compilation error, although
+    // java side might not have a chance to get the return value :)
+    return kStatusError;
+  }
+
+  const jint cvalue_len = static_cast<jint>(cvalue.size());
+  const jint length = std::min(jval_len, cvalue_len);
+
+  memcpy(value, cvalue.c_str(), length);
+
+  *has_exception = false;
+  return cvalue_len;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    deleteRange
+ * Signature: (J[BII[BIIJ)V
+ */
+void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BIIJ(
+    JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jbegin_key,
+    jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key,
+    jint jend_key_off, jint jend_key_len, jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_delete_range_helper(env, db, default_write_options, cf_handle,
+                                jbegin_key, jbegin_key_off, jbegin_key_len,
+                                jend_key, jend_key_off, jend_key_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    deleteRange
+ * Signature: (JJ[BII[BII)V
+ */
+void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BII(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options,
+    jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len,
+    jbyteArray jend_key, jint jend_key_off, jint jend_key_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options);
+  rocksdb_delete_range_helper(env, db, *write_options, nullptr, jbegin_key,
+                              jbegin_key_off, jbegin_key_len, jend_key,
+                              jend_key_off, jend_key_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    deleteRange
+ * Signature: (JJ[BII[BIIJ)V
+ */
+void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BIIJ(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options,
+    jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len,
+    jbyteArray jend_key, jint jend_key_off, jint jend_key_len,
+    jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_delete_range_helper(env, db, *write_options, cf_handle, jbegin_key,
+                                jbegin_key_off, jbegin_key_len, jend_key,
+                                jend_key_off, jend_key_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getDirect
+ * Signature: (JJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)I
+ */
+jint Java_org_rocksdb_RocksDB_getDirect(JNIEnv* env, jobject /*jdb*/,
+                                        jlong jdb_handle, jlong jropt_handle,
+                                        jobject jkey, jint jkey_off,
+                                        jint jkey_len, jobject jval,
+                                        jint jval_off, jint jval_len,
+                                        jlong jcf_handle) {
+  auto* db_handle = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* ro_opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  bool has_exception = false;
+  return rocksdb_get_helper_direct(
+      env, db_handle,
+      ro_opt == nullptr ? ROCKSDB_NAMESPACE::ReadOptions() : *ro_opt, cf_handle,
+      jkey, jkey_off, jkey_len, jval, jval_off, jval_len, &has_exception);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::DB::Merge
+
+/**
+ * @return true if the merge succeeded, false if a Java Exception was thrown
+ */
+bool rocksdb_merge_helper(JNIEnv* env, ROCKSDB_NAMESPACE::DB* db,
+                          const ROCKSDB_NAMESPACE::WriteOptions& write_options,
+                          ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle,
+                          jbyteArray jkey, jint jkey_off, jint jkey_len,
+                          jbyteArray jval, jint jval_off, jint jval_len) {
+  jbyte* key = new jbyte[jkey_len];
+  env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] key;
+    return false;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+  jbyte* value = new jbyte[jval_len];
+  env->GetByteArrayRegion(jval, jval_off, jval_len, value);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] value;
+    delete[] key;
+    return false;
+  }
+  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char*>(value),
+                                       jval_len);
+
+  ROCKSDB_NAMESPACE::Status s;
+  if (cf_handle != nullptr) {
+    s = db->Merge(write_options, cf_handle, key_slice, value_slice);
+  } else {
+    s = db->Merge(write_options, key_slice, value_slice);
+  }
+
+  // cleanup
+  delete[] value;
+  delete[] key;
+
+  if (s.ok()) {
+    return true;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  return false;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    merge
+ * Signature: (J[BII[BII)V
+ */
+void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII(JNIEnv* env, jobject,
+                                                 jlong jdb_handle,
+                                                 jbyteArray jkey, jint jkey_off,
+                                                 jint jkey_len, jbyteArray jval,
+                                                 jint jval_off, jint jval_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  rocksdb_merge_helper(env, db, default_write_options, nullptr, jkey, jkey_off,
+                       jkey_len, jval, jval_off, jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    merge
+ * Signature: (J[BII[BIIJ)V
+ */
+void Java_org_rocksdb_RocksDB_merge__J_3BII_3BIIJ(
+    JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jkey, jint jkey_off,
+    jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len,
+    jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  static const ROCKSDB_NAMESPACE::WriteOptions default_write_options =
+      ROCKSDB_NAMESPACE::WriteOptions();
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_merge_helper(env, db, default_write_options, cf_handle, jkey,
+                         jkey_off, jkey_len, jval, jval_off, jval_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    merge
+ * Signature: (JJ[BII[BII)V
+ */
+void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BII(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle,
+    jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval,
+    jint jval_off, jint jval_len) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  rocksdb_merge_helper(env, db, *write_options, nullptr, jkey, jkey_off,
+                       jkey_len, jval, jval_off, jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    merge
+ * Signature: (JJ[BII[BIIJ)V
+ */
+void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BIIJ(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle,
+    jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval,
+    jint jval_off, jint jval_len, jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_merge_helper(env, db, *write_options, cf_handle, jkey, jkey_off,
+                         jkey_len, jval, jval_off, jval_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+  }
+}
+
+jlong rocksdb_iterator_helper(
+    ROCKSDB_NAMESPACE::DB* db, ROCKSDB_NAMESPACE::ReadOptions read_options,
+    ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle) {
+  ROCKSDB_NAMESPACE::Iterator* iterator = nullptr;
+  if (cf_handle != nullptr) {
+    iterator = db->NewIterator(read_options, cf_handle);
+  } else {
+    iterator = db->NewIterator(read_options);
+  }
+  return GET_CPLUSPLUS_POINTER(iterator);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    deleteDirect
+ * Signature: (JJLjava/nio/ByteBuffer;IIJ)V
+ */
+void Java_org_rocksdb_RocksDB_deleteDirect(JNIEnv* env, jobject /*jdb*/,
+                                           jlong jdb_handle,
+                                           jlong jwrite_options, jobject jkey,
+                                           jint jkey_offset, jint jkey_len,
+                                           jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto remove = [&env, &db, &write_options,
+                 &cf_handle](ROCKSDB_NAMESPACE::Slice& key) {
+    ROCKSDB_NAMESPACE::Status s;
+    if (cf_handle == nullptr) {
+      s = db->Delete(*write_options, key);
+    } else {
+      s = db->Delete(*write_options, cf_handle, key);
+    }
+    if (s.ok()) {
+      return;
+    }
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_direct(remove, env, jkey, jkey_offset,
+                                          jkey_len);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::DB::Write
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    write0
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_RocksDB_write0(JNIEnv* env, jobject, jlong jdb_handle,
+                                     jlong jwrite_options_handle,
+                                     jlong jwb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+
+  ROCKSDB_NAMESPACE::Status s = db->Write(*write_options, wb);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    write1
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_RocksDB_write1(JNIEnv* env, jobject, jlong jdb_handle,
+                                     jlong jwrite_options_handle,
+                                     jlong jwbwi_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  auto* wb = wbwi->GetWriteBatch();
+
+  ROCKSDB_NAMESPACE::Status s = db->Write(*write_options, wb);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::DB::Get
+
+jbyteArray rocksdb_get_helper(
+    JNIEnv* env, ROCKSDB_NAMESPACE::DB* db,
+    const ROCKSDB_NAMESPACE::ReadOptions& read_opt,
+    ROCKSDB_NAMESPACE::ColumnFamilyHandle* column_family_handle,
+    jbyteArray jkey, jint jkey_off, jint jkey_len) {
+  jbyte* key = new jbyte[jkey_len];
+  env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] key;
+    return nullptr;
+  }
+
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+  std::string value;
+  ROCKSDB_NAMESPACE::Status s;
+  if (column_family_handle != nullptr) {
+    s = db->Get(read_opt, column_family_handle, key_slice, &value);
+  } else {
+    // backwards compatibility
+    s = db->Get(read_opt, key_slice, &value);
+  }
+
+  // cleanup
+  delete[] key;
+
+  if (s.IsNotFound()) {
+    return nullptr;
+  }
+
+  if (s.ok()) {
+    jbyteArray jret_value = ROCKSDB_NAMESPACE::JniUtil::copyBytes(env, value);
+    if (jret_value == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+    return jret_value;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  return nullptr;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BII)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII(JNIEnv* env, jobject,
+                                                jlong jdb_handle,
+                                                jbyteArray jkey, jint jkey_off,
+                                                jint jkey_len) {
+  return rocksdb_get_helper(
+      env, reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle),
+      ROCKSDB_NAMESPACE::ReadOptions(), nullptr, jkey, jkey_off, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BIIJ)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ(JNIEnv* env, jobject,
+                                                 jlong jdb_handle,
+                                                 jbyteArray jkey, jint jkey_off,
+                                                 jint jkey_len,
+                                                 jlong jcf_handle) {
+  auto db_handle = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    return rocksdb_get_helper(env, db_handle, ROCKSDB_NAMESPACE::ReadOptions(),
+                              cf_handle, jkey, jkey_off, jkey_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+    return nullptr;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BII)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII(JNIEnv* env, jobject,
+                                                 jlong jdb_handle,
+                                                 jlong jropt_handle,
+                                                 jbyteArray jkey, jint jkey_off,
+                                                 jint jkey_len) {
+  return rocksdb_get_helper(
+      env, reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle),
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle), nullptr,
+      jkey, jkey_off, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BIIJ)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIIJ(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey,
+    jint jkey_off, jint jkey_len, jlong jcf_handle) {
+  auto* db_handle = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto& ro_opt =
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, jkey_off,
+                              jkey_len);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+    return nullptr;
+  }
+}
+
+jint rocksdb_get_helper(
+    JNIEnv* env, ROCKSDB_NAMESPACE::DB* db,
+    const ROCKSDB_NAMESPACE::ReadOptions& read_options,
+    ROCKSDB_NAMESPACE::ColumnFamilyHandle* column_family_handle,
+    jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval,
+    jint jval_off, jint jval_len, bool* has_exception) {
+  static const int kNotFound = -1;
+  static const int kStatusError = -2;
+
+  jbyte* key = new jbyte[jkey_len];
+  env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key);
+  if (env->ExceptionCheck()) {
+    // exception thrown: OutOfMemoryError
+    delete[] key;
+    *has_exception = true;
+    return kStatusError;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+  // TODO(yhchiang): we might save one memory allocation here by adding
+  // a DB::Get() function which takes preallocated jbyte* as input.
+  std::string cvalue;
+  ROCKSDB_NAMESPACE::Status s;
+  if (column_family_handle != nullptr) {
+    s = db->Get(read_options, column_family_handle, key_slice, &cvalue);
+  } else {
+    // backwards compatibility
+    s = db->Get(read_options, key_slice, &cvalue);
+  }
+
+  // cleanup
+  delete[] key;
+
+  if (s.IsNotFound()) {
+    *has_exception = false;
+    return kNotFound;
+  } else if (!s.ok()) {
+    *has_exception = true;
+    // Here since we are throwing a Java exception from c++ side.
+    // As a result, c++ does not know calling this function will in fact
+    // throwing an exception.  As a result, the execution flow will
+    // not stop here, and codes after this throw will still be
+    // executed.
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+
+    // Return a dummy const value to avoid compilation error, although
+    // java side might not have a chance to get the return value :)
+    return kStatusError;
+  }
+
+  const jint cvalue_len = static_cast<jint>(cvalue.size());
+  const jint length = std::min(jval_len, cvalue_len);
+
+  env->SetByteArrayRegion(
+      jval, jval_off, length,
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(cvalue.c_str())));
+  if (env->ExceptionCheck()) {
+    // exception thrown: OutOfMemoryError
+    *has_exception = true;
+    return kStatusError;
+  }
+
+  *has_exception = false;
+  return cvalue_len;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BII[BII)I
+ */
+jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII(JNIEnv* env, jobject,
+                                               jlong jdb_handle,
+                                               jbyteArray jkey, jint jkey_off,
+                                               jint jkey_len, jbyteArray jval,
+                                               jint jval_off, jint jval_len) {
+  bool has_exception = false;
+  return rocksdb_get_helper(
+      env, reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle),
+      ROCKSDB_NAMESPACE::ReadOptions(), nullptr, jkey, jkey_off, jkey_len, jval,
+      jval_off, jval_len, &has_exception);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BII[BIIJ)I
+ */
+jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ(JNIEnv* env, jobject,
+                                                jlong jdb_handle,
+                                                jbyteArray jkey, jint jkey_off,
+                                                jint jkey_len, jbyteArray jval,
+                                                jint jval_off, jint jval_len,
+                                                jlong jcf_handle) {
+  auto* db_handle = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    bool has_exception = false;
+    return rocksdb_get_helper(env, db_handle, ROCKSDB_NAMESPACE::ReadOptions(),
+                              cf_handle, jkey, jkey_off, jkey_len, jval,
+                              jval_off, jval_len, &has_exception);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+    // will never be evaluated
+    return 0;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BII[BII)I
+ */
+jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII(JNIEnv* env, jobject,
+                                                jlong jdb_handle,
+                                                jlong jropt_handle,
+                                                jbyteArray jkey, jint jkey_off,
+                                                jint jkey_len, jbyteArray jval,
+                                                jint jval_off, jint jval_len) {
+  bool has_exception = false;
+  return rocksdb_get_helper(
+      env, reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle),
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle), nullptr,
+      jkey, jkey_off, jkey_len, jval, jval_off, jval_len, &has_exception);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BII[BIIJ)I
+ */
+jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey,
+    jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len,
+    jlong jcf_handle) {
+  auto* db_handle = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto& ro_opt =
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    bool has_exception = false;
+    return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, jkey_off,
+                              jkey_len, jval, jval_off, jval_len,
+                              &has_exception);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Invalid ColumnFamilyHandle."));
+    // will never be evaluated
+    return 0;
+  }
+}
+
+inline void multi_get_helper_release_keys(std::vector<jbyte*>& keys_to_free) {
+  auto end = keys_to_free.end();
+  for (auto it = keys_to_free.begin(); it != end; ++it) {
+    delete[] * it;
+  }
+  keys_to_free.clear();
+}
+
+/**
+ * @brief fill a native array of cf handles from java handles
+ *
+ * @param env
+ * @param cf_handles to fill from the java variants
+ * @param jcolumn_family_handles
+ * @return true if the copy succeeds
+ * @return false if a JNI exception is generated
+ */
+inline bool cf_handles_from_jcf_handles(
+    JNIEnv* env,
+    std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>& cf_handles,
+    jlongArray jcolumn_family_handles) {
+  if (jcolumn_family_handles != nullptr) {
+    const jsize len_cols = env->GetArrayLength(jcolumn_family_handles);
+
+    jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr);
+    if (jcfh == nullptr) {
+      // exception thrown: OutOfMemoryError
+      jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError");
+      (env)->ThrowNew(exception_cls,
+                      "Insufficient Memory for CF handle array.");
+      return false;
+    }
+
+    for (jsize i = 0; i < len_cols; i++) {
+      auto* cf_handle =
+          reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcfh[i]);
+      cf_handles.push_back(cf_handle);
+    }
+    env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT);
+  }
+  return true;
+}
+
+/**
+ * @brief copy keys from JNI into vector of slices for Rocks API
+ *
+ * @param keys to instantiate
+ * @param jkeys
+ * @param jkey_offs
+ * @param jkey_lens
+ * @return true if the copy succeeds
+ * @return false if a JNI exception is raised
+ */
+inline bool keys_from_jkeys(JNIEnv* env,
+                            std::vector<ROCKSDB_NAMESPACE::Slice>& keys,
+                            std::vector<jbyte*>& keys_to_free,
+                            jobjectArray jkeys, jintArray jkey_offs,
+                            jintArray jkey_lens) {
+  jint* jkey_off = env->GetIntArrayElements(jkey_offs, nullptr);
+  if (jkey_off == nullptr) {
+    // exception thrown: OutOfMemoryError
+    jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError");
+    (env)->ThrowNew(exception_cls, "Insufficient Memory for key offset array.");
+    return false;
+  }
+
+  jint* jkey_len = env->GetIntArrayElements(jkey_lens, nullptr);
+  if (jkey_len == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT);
+    jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError");
+    (env)->ThrowNew(exception_cls, "Insufficient Memory for key length array.");
+    return false;
+  }
+
+  const jsize len_keys = env->GetArrayLength(jkeys);
+  for (jsize i = 0; i < len_keys; i++) {
+    jobject jkey = env->GetObjectArrayElement(jkeys, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT);
+      env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT);
+      multi_get_helper_release_keys(keys_to_free);
+      jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError");
+      (env)->ThrowNew(exception_cls,
+                      "Insufficient Memory for key object array.");
+      return false;
+    }
+
+    jbyteArray jkey_ba = reinterpret_cast<jbyteArray>(jkey);
+
+    const jint len_key = jkey_len[i];
+    jbyte* key = new jbyte[len_key];
+    env->GetByteArrayRegion(jkey_ba, jkey_off[i], len_key, key);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      delete[] key;
+      env->DeleteLocalRef(jkey);
+      env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT);
+      env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT);
+      multi_get_helper_release_keys(keys_to_free);
+      jclass exception_cls =
+          (env)->FindClass("java/lang/ArrayIndexOutOfBoundsException");
+      (env)->ThrowNew(exception_cls, "Invalid byte array region index.");
+      return false;
+    }
+
+    ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), len_key);
+    keys.push_back(key_slice);
+
+    env->DeleteLocalRef(jkey);
+    keys_to_free.push_back(key);
+  }
+
+  // cleanup jkey_off and jken_len
+  env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT);
+  env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT);
+
+  return true;
+}
+
+inline bool keys_from_bytebuffers(JNIEnv* env,
+                                  std::vector<ROCKSDB_NAMESPACE::Slice>& keys,
+                                  jobjectArray jkeys, jintArray jkey_offs,
+                                  jintArray jkey_lens) {
+  jint* jkey_off = env->GetIntArrayElements(jkey_offs, nullptr);
+  if (jkey_off == nullptr) {
+    // exception thrown: OutOfMemoryError
+    jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError");
+    (env)->ThrowNew(exception_cls, "Insufficient Memory for key offset array.");
+    return false;
+  }
+
+  jint* jkey_len = env->GetIntArrayElements(jkey_lens, nullptr);
+  if (jkey_len == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT);
+    jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError");
+    (env)->ThrowNew(exception_cls, "Insufficient Memory for key length array.");
+    return false;
+  }
+
+  const jsize len_keys = env->GetArrayLength(jkeys);
+  for (jsize i = 0; i < len_keys; i++) {
+    jobject jkey = env->GetObjectArrayElement(jkeys, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      return false;
+    }
+    char* key = reinterpret_cast<char*>(env->GetDirectBufferAddress(jkey));
+    ROCKSDB_NAMESPACE::Slice key_slice(key + jkey_off[i], jkey_len[i]);
+    keys.push_back(key_slice);
+
+    env->DeleteLocalRef(jkey);
+  }
+  return true;
+}
+
+/**
+ * cf multi get
+ *
+ * @return byte[][] of values or nullptr if an
+ * exception occurs
+ */
+jobjectArray multi_get_helper(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db,
+                              const ROCKSDB_NAMESPACE::ReadOptions& rOpt,
+                              jobjectArray jkeys, jintArray jkey_offs,
+                              jintArray jkey_lens,
+                              jlongArray jcolumn_family_handles) {
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  if (!cf_handles_from_jcf_handles(env, cf_handles, jcolumn_family_handles)) {
+    return nullptr;
+  }
+
+  std::vector<ROCKSDB_NAMESPACE::Slice> keys;
+  std::vector<jbyte*> keys_to_free;
+  if (!keys_from_jkeys(env, keys, keys_to_free, jkeys, jkey_offs, jkey_lens)) {
+    return nullptr;
+  }
+
+  std::vector<std::string> values;
+  std::vector<ROCKSDB_NAMESPACE::Status> s;
+  if (cf_handles.size() == 0) {
+    s = db->MultiGet(rOpt, keys, &values);
+  } else {
+    s = db->MultiGet(rOpt, cf_handles, keys, &values);
+  }
+
+  // free up allocated byte arrays
+  multi_get_helper_release_keys(keys_to_free);
+
+  // prepare the results
+  jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray(
+      env, static_cast<jsize>(s.size()));
+  if (jresults == nullptr) {
+    // exception occurred
+    jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError");
+    (env)->ThrowNew(exception_cls, "Insufficient Memory for results.");
+    return nullptr;
+  }
+
+  // add to the jresults
+  for (std::vector<ROCKSDB_NAMESPACE::Status>::size_type i = 0; i != s.size();
+       i++) {
+    if (s[i].ok()) {
+      std::string* value = &values[i];
+      const jsize jvalue_len = static_cast<jsize>(value->size());
+      jbyteArray jentry_value = env->NewByteArray(jvalue_len);
+      if (jentry_value == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return nullptr;
+      }
+
+      env->SetByteArrayRegion(
+          jentry_value, 0, static_cast<jsize>(jvalue_len),
+          const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value->c_str())));
+      if (env->ExceptionCheck()) {
+        // exception thrown:
+        // ArrayIndexOutOfBoundsException
+        env->DeleteLocalRef(jentry_value);
+        return nullptr;
+      }
+
+      env->SetObjectArrayElement(jresults, static_cast<jsize>(i), jentry_value);
+      if (env->ExceptionCheck()) {
+        // exception thrown:
+        // ArrayIndexOutOfBoundsException
+        env->DeleteLocalRef(jentry_value);
+        return nullptr;
+      }
+
+      env->DeleteLocalRef(jentry_value);
+    }
+  }
+
+  return jresults;
+}
+
+/**
+ * cf multi get
+ *
+ * fill supplied native buffers, or raise JNI
+ * exception on a problem
+ */
+
+/**
+ * @brief multi_get_helper_direct for fast-path multiget (io_uring) on Linux
+ *
+ * @param env
+ * @param db
+ * @param rOpt read options
+ * @param jcolumn_family_handles 0, 1, or n column family handles
+ * @param jkeys
+ * @param jkey_offsets
+ * @param jkey_lengths
+ * @param jvalues byte buffers to receive values
+ * @param jvalue_sizes returned actual sizes of data values for keys
+ * @param jstatuses returned java RocksDB status values for per key
+ */
+void multi_get_helper_direct(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db,
+                             const ROCKSDB_NAMESPACE::ReadOptions& rOpt,
+                             jlongArray jcolumn_family_handles,
+                             jobjectArray jkeys, jintArray jkey_offsets,
+                             jintArray jkey_lengths, jobjectArray jvalues,
+                             jintArray jvalue_sizes, jobjectArray jstatuses) {
+  const jsize num_keys = env->GetArrayLength(jkeys);
+
+  std::vector<ROCKSDB_NAMESPACE::Slice> keys;
+  if (!keys_from_bytebuffers(env, keys, jkeys, jkey_offsets, jkey_lengths)) {
+    return;
+  }
+
+  std::vector<ROCKSDB_NAMESPACE::PinnableSlice> values(num_keys);
+
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  if (!cf_handles_from_jcf_handles(env, cf_handles, jcolumn_family_handles)) {
+    return;
+  }
+
+  std::vector<ROCKSDB_NAMESPACE::Status> s(num_keys);
+  if (cf_handles.size() == 0) {
+    // we can use the more efficient call here
+    auto cf_handle = db->DefaultColumnFamily();
+    db->MultiGet(rOpt, cf_handle, num_keys, keys.data(), values.data(),
+                 s.data());
+  } else if (cf_handles.size() == 1) {
+    // we can use the more efficient call here
+    auto cf_handle = cf_handles[0];
+    db->MultiGet(rOpt, cf_handle, num_keys, keys.data(), values.data(),
+                 s.data());
+  } else {
+    // multiple CFs version
+    db->MultiGet(rOpt, num_keys, cf_handles.data(), keys.data(), values.data(),
+                 s.data());
+  }
+
+  // prepare the results
+  jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray(
+      env, static_cast<jsize>(s.size()));
+  if (jresults == nullptr) {
+    // exception occurred
+    jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError");
+    (env)->ThrowNew(exception_cls, "Insufficient Memory for results.");
+    return;
+  }
+
+  std::vector<jint> value_size;
+  for (int i = 0; i < num_keys; i++) {
+    auto jstatus = ROCKSDB_NAMESPACE::StatusJni::construct(env, s[i]);
+    if (jstatus == nullptr) {
+      // exception in context
+      return;
+    }
+    env->SetObjectArrayElement(jstatuses, i, jstatus);
+
+    if (s[i].ok()) {
+      jobject jvalue_bytebuf = env->GetObjectArrayElement(jvalues, i);
+      if (env->ExceptionCheck()) {
+        // ArrayIndexOutOfBoundsException is thrown
+        return;
+      }
+      jlong jvalue_capacity = env->GetDirectBufferCapacity(jvalue_bytebuf);
+      if (jvalue_capacity == -1) {
+        ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+            env,
+            "Invalid value(s) argument (argument is not a valid direct "
+            "ByteBuffer)");
+        return;
+      }
+      void* jvalue_address = env->GetDirectBufferAddress(jvalue_bytebuf);
+      if (jvalue_address == nullptr) {
+        ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+            env,
+            "Invalid value(s) argument (argument is not a valid direct "
+            "ByteBuffer)");
+        return;
+      }
+
+      // record num returned, push back that number, which may be bigger then
+      // the ByteBuffer supplied. then copy as much as fits in the ByteBuffer.
+      value_size.push_back(static_cast<jint>(values[i].size()));
+      auto copy_bytes =
+          std::min(static_cast<jlong>(values[i].size()), jvalue_capacity);
+      memcpy(jvalue_address, values[i].data(), copy_bytes);
+    } else {
+      // bad status for this
+      value_size.push_back(0);
+    }
+  }
+
+  env->SetIntArrayRegion(jvalue_sizes, 0, num_keys, value_size.data());
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (J[[B[I[I)[[B
+ */
+jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys,
+    jintArray jkey_offs, jintArray jkey_lens) {
+  return multi_get_helper(
+      env, jdb, reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle),
+      ROCKSDB_NAMESPACE::ReadOptions(), jkeys, jkey_offs, jkey_lens, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (J[[B[I[I[J)[[B
+ */
+jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I_3J(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys,
+    jintArray jkey_offs, jintArray jkey_lens,
+    jlongArray jcolumn_family_handles) {
+  return multi_get_helper(env, jdb,
+                          reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle),
+                          ROCKSDB_NAMESPACE::ReadOptions(), jkeys, jkey_offs,
+                          jkey_lens, jcolumn_family_handles);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JJ[[B[I[I)[[B
+ */
+jobjectArray Java_org_rocksdb_RocksDB_multiGet__JJ_3_3B_3I_3I(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens) {
+  return multi_get_helper(
+      env, jdb, reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle),
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle), jkeys,
+      jkey_offs, jkey_lens, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JJ[[B[I[I[J)[[B
+ */
+jobjectArray Java_org_rocksdb_RocksDB_multiGet__JJ_3_3B_3I_3I_3J(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens,
+    jlongArray jcolumn_family_handles) {
+  return multi_get_helper(
+      env, jdb, reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle),
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle), jkeys,
+      jkey_offs, jkey_lens, jcolumn_family_handles);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature:
+ * (JJ[J[Ljava/nio/ByteBuffer;[I[I[Ljava/nio/ByteBuffer;[I[Lorg/rocksdb/Status;)V
+ */
+void Java_org_rocksdb_RocksDB_multiGet__JJ_3J_3Ljava_nio_ByteBuffer_2_3I_3I_3Ljava_nio_ByteBuffer_2_3I_3Lorg_rocksdb_Status_2(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jlongArray jcolumn_family_handles, jobjectArray jkeys,
+    jintArray jkey_offsets, jintArray jkey_lengths, jobjectArray jvalues,
+    jintArray jvalues_sizes, jobjectArray jstatus_objects) {
+  return multi_get_helper_direct(
+      env, jdb, reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle),
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
+      jcolumn_family_handles, jkeys, jkey_offsets, jkey_lengths, jvalues,
+      jvalues_sizes, jstatus_objects);
+}
+// private native void
+// multiGet(final long dbHandle, final long rOptHandle,
+//        final long[] columnFamilyHandles, final ByteBuffer[] keysArray,
+//        final ByteBuffer[] valuesArray);
+
+//////////////////////////////////////////////////////////////////////////////
+// ROCKSDB_NAMESPACE::DB::KeyMayExist
+bool key_may_exist_helper(JNIEnv* env, jlong jdb_handle, jlong jcf_handle,
+                          jlong jread_opts_handle, jbyteArray jkey,
+                          jint jkey_offset, jint jkey_len, bool* has_exception,
+                          std::string* value, bool* value_found) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  ROCKSDB_NAMESPACE::ReadOptions read_opts =
+      jread_opts_handle == 0
+          ? ROCKSDB_NAMESPACE::ReadOptions()
+          : *(reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(
+                jread_opts_handle));
+
+  jbyte* key = new jbyte[jkey_len];
+  env->GetByteArrayRegion(jkey, jkey_offset, jkey_len, key);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] key;
+    *has_exception = true;
+    return false;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+  const bool exists =
+      db->KeyMayExist(read_opts, cf_handle, key_slice, value, value_found);
+
+  // cleanup
+  delete[] key;
+
+  return exists;
+}
+
+bool key_may_exist_direct_helper(JNIEnv* env, jlong jdb_handle,
+                                 jlong jcf_handle, jlong jread_opts_handle,
+                                 jobject jkey, jint jkey_offset, jint jkey_len,
+                                 bool* has_exception, std::string* value,
+                                 bool* value_found) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  ROCKSDB_NAMESPACE::ReadOptions read_opts =
+      jread_opts_handle == 0
+          ? ROCKSDB_NAMESPACE::ReadOptions()
+          : *(reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(
+                jread_opts_handle));
+
+  char* key = reinterpret_cast<char*>(env->GetDirectBufferAddress(jkey));
+  if (key == nullptr) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env,
+        "Invalid key argument (argument is not a valid direct ByteBuffer)");
+    *has_exception = true;
+    return false;
+  }
+  if (env->GetDirectBufferCapacity(jkey) < (jkey_offset + jkey_len)) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env,
+        "Invalid key argument. Capacity is less than requested region (offset "
+        "+ length).");
+    *has_exception = true;
+    return false;
+  }
+
+  ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len);
+
+  const bool exists =
+      db->KeyMayExist(read_opts, cf_handle, key_slice, value, value_found);
+
+  return exists;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    keyMayExist
+ * Signature: (JJJ[BII)Z
+ */
+jboolean Java_org_rocksdb_RocksDB_keyMayExist(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlong jread_opts_handle, jbyteArray jkey, jint jkey_offset, jint jkey_len) {
+  bool has_exception = false;
+  std::string value;
+  bool value_found = false;
+
+  const bool exists = key_may_exist_helper(
+      env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset,
+      jkey_len, &has_exception, &value, &value_found);
+
+  if (has_exception) {
+    // java exception already raised
+    return false;
+  }
+
+  return static_cast<jboolean>(exists);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    keyMayExistDirect
+ * Signature: (JJJLjava/nio/ByteBuffer;II)Z
+ */
+jboolean Java_org_rocksdb_RocksDB_keyMayExistDirect(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len) {
+  bool has_exception = false;
+  std::string value;
+  bool value_found = false;
+
+  const bool exists = key_may_exist_direct_helper(
+      env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset,
+      jkey_len, &has_exception, &value, &value_found);
+  if (has_exception) {
+    // java exception already raised
+    return false;
+  }
+
+  return static_cast<jboolean>(exists);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    keyMayExistDirectFoundValue
+ * Signature:
+ * (JJJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)[J
+ */
+jintArray Java_org_rocksdb_RocksDB_keyMayExistDirectFoundValue(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len,
+    jobject jval, jint jval_offset, jint jval_len) {
+  char* val_buffer = reinterpret_cast<char*>(env->GetDirectBufferAddress(jval));
+  if (val_buffer == nullptr) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env,
+        "Invalid value argument (argument is not a valid direct ByteBuffer)");
+    return nullptr;
+  }
+
+  if (env->GetDirectBufferCapacity(jval) < (jval_offset + jval_len)) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env,
+        "Invalid value argument. Capacity is less than requested region "
+        "(offset + length).");
+    return nullptr;
+  }
+
+  bool has_exception = false;
+  std::string cvalue;
+  bool value_found = false;
+
+  const bool exists = key_may_exist_direct_helper(
+      env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset,
+      jkey_len, &has_exception, &cvalue, &value_found);
+
+  if (has_exception) {
+    // java exception already raised
+    return nullptr;
+  }
+
+  const jint cvalue_len = static_cast<jint>(cvalue.size());
+  const jint length = std::min(jval_len, cvalue_len);
+  memcpy(val_buffer + jval_offset, cvalue.c_str(), length);
+
+  // keep consistent with java KeyMayExistEnum.values()
+  const int kNotExist = 0;
+  const int kExistsWithoutValue = 1;
+  const int kExistsWithValue = 2;
+
+  // TODO fix return value/type
+  // exists/value_found/neither
+  // cvalue_len
+  jintArray jresult = env->NewIntArray(2);
+  const jint jexists =
+      exists ? (value_found ? kExistsWithValue : kExistsWithoutValue)
+             : kNotExist;
+
+  env->SetIntArrayRegion(jresult, 0, 1, &jexists);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jresult);
+    return nullptr;
+  }
+  env->SetIntArrayRegion(jresult, 1, 1, &cvalue_len);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jresult);
+    return nullptr;
+  }
+
+  return jresult;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    keyMayExistFoundValue
+ * Signature: (JJJ[BII)[[B
+ */
+jobjectArray Java_org_rocksdb_RocksDB_keyMayExistFoundValue(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlong jread_opts_handle, jbyteArray jkey, jint jkey_offset, jint jkey_len) {
+  bool has_exception = false;
+  std::string value;
+  bool value_found = false;
+
+  const bool exists = key_may_exist_helper(
+      env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset,
+      jkey_len, &has_exception, &value, &value_found);
+
+  if (has_exception) {
+    // java exception already raised
+    return nullptr;
+  }
+
+  jbyte result_flags[1];
+  if (!exists) {
+    result_flags[0] = 0;
+  } else if (!value_found) {
+    result_flags[0] = 1;
+  } else {
+    // found
+    result_flags[0] = 2;
+  }
+
+  jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray(env, 2);
+  if (jresults == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  // prepare the result flag
+  jbyteArray jresult_flags = env->NewByteArray(1);
+  if (jresult_flags == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  env->SetByteArrayRegion(jresult_flags, 0, 1, result_flags);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jresult_flags);
+    return nullptr;
+  }
+
+  env->SetObjectArrayElement(jresults, 0, jresult_flags);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jresult_flags);
+    return nullptr;
+  }
+
+  env->DeleteLocalRef(jresult_flags);
+
+  if (result_flags[0] == 2) {
+    // set the value
+    const jsize jvalue_len = static_cast<jsize>(value.size());
+    jbyteArray jresult_value = env->NewByteArray(jvalue_len);
+    if (jresult_value == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+    env->SetByteArrayRegion(
+        jresult_value, 0, jvalue_len,
+        const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value.data())));
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jresult_value);
+      return nullptr;
+    }
+    env->SetObjectArrayElement(jresults, 1, jresult_value);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jresult_value);
+      return nullptr;
+    }
+
+    env->DeleteLocalRef(jresult_value);
+  }
+
+  return jresults;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iterator
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_iterator__J(JNIEnv*, jobject, jlong db_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  return rocksdb_iterator_helper(db, ROCKSDB_NAMESPACE::ReadOptions(), nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iterator
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_iterator__JJ(JNIEnv*, jobject, jlong db_handle,
+                                            jlong jread_options_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  auto& read_options =
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle);
+  return rocksdb_iterator_helper(db, read_options, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iteratorCF
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ(JNIEnv*, jobject, jlong db_handle,
+                                              jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  return rocksdb_iterator_helper(db, ROCKSDB_NAMESPACE::ReadOptions(),
+                                 cf_handle);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iteratorCF
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ(JNIEnv*, jobject,
+                                               jlong db_handle,
+                                               jlong jcf_handle,
+                                               jlong jread_options_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto& read_options =
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle);
+  return rocksdb_iterator_helper(db, read_options, cf_handle);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iterators
+ * Signature: (J[JJ)[J
+ */
+jlongArray Java_org_rocksdb_RocksDB_iterators(JNIEnv* env, jobject,
+                                              jlong db_handle,
+                                              jlongArray jcolumn_family_handles,
+                                              jlong jread_options_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  auto& read_options =
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle);
+
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  if (jcolumn_family_handles != nullptr) {
+    const jsize len_cols = env->GetArrayLength(jcolumn_family_handles);
+    jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr);
+    if (jcfh == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    for (jsize i = 0; i < len_cols; i++) {
+      auto* cf_handle =
+          reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcfh[i]);
+      cf_handles.push_back(cf_handle);
+    }
+
+    env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT);
+  }
+
+  std::vector<ROCKSDB_NAMESPACE::Iterator*> iterators;
+  ROCKSDB_NAMESPACE::Status s =
+      db->NewIterators(read_options, cf_handles, &iterators);
+  if (s.ok()) {
+    jlongArray jLongArray =
+        env->NewLongArray(static_cast<jsize>(iterators.size()));
+    if (jLongArray == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    for (std::vector<ROCKSDB_NAMESPACE::Iterator*>::size_type i = 0;
+         i < iterators.size(); i++) {
+      env->SetLongArrayRegion(
+          jLongArray, static_cast<jsize>(i), 1,
+          const_cast<jlong*>(reinterpret_cast<const jlong*>(&iterators[i])));
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        env->DeleteLocalRef(jLongArray);
+        return nullptr;
+      }
+    }
+
+    return jLongArray;
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+}
+
+/*
+ * Method:    getSnapshot
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_getSnapshot(JNIEnv*, jobject, jlong db_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  const ROCKSDB_NAMESPACE::Snapshot* snapshot = db->GetSnapshot();
+  return GET_CPLUSPLUS_POINTER(snapshot);
+}
+
+/*
+ * Method:    releaseSnapshot
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_RocksDB_releaseSnapshot(JNIEnv*, jobject, jlong db_handle,
+                                              jlong snapshot_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  auto* snapshot =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Snapshot*>(snapshot_handle);
+  db->ReleaseSnapshot(snapshot);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getProperty
+ * Signature: (JJLjava/lang/String;I)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_RocksDB_getProperty(JNIEnv* env, jobject,
+                                             jlong jdb_handle, jlong jcf_handle,
+                                             jstring jproperty,
+                                             jint jproperty_len) {
+  const char* property = env->GetStringUTFChars(jproperty, nullptr);
+  if (property == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  ROCKSDB_NAMESPACE::Slice property_name(property, jproperty_len);
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+
+  std::string property_value;
+  bool retCode = db->GetProperty(cf_handle, property_name, &property_value);
+  env->ReleaseStringUTFChars(jproperty, property);
+
+  if (retCode) {
+    return env->NewStringUTF(property_value.c_str());
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+      env, ROCKSDB_NAMESPACE::Status::NotFound());
+  return nullptr;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getMapProperty
+ * Signature: (JJLjava/lang/String;I)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_RocksDB_getMapProperty(JNIEnv* env, jobject,
+                                                jlong jdb_handle,
+                                                jlong jcf_handle,
+                                                jstring jproperty,
+                                                jint jproperty_len) {
+  const char* property = env->GetStringUTFChars(jproperty, nullptr);
+  if (property == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  ROCKSDB_NAMESPACE::Slice property_name(property, jproperty_len);
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+
+  std::map<std::string, std::string> property_value;
+  bool retCode = db->GetMapProperty(cf_handle, property_name, &property_value);
+  env->ReleaseStringUTFChars(jproperty, property);
+
+  if (retCode) {
+    return ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(env, &property_value);
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+      env, ROCKSDB_NAMESPACE::Status::NotFound());
+  return nullptr;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getLongProperty
+ * Signature: (JJLjava/lang/String;I)J
+ */
+jlong Java_org_rocksdb_RocksDB_getLongProperty(JNIEnv* env, jobject,
+                                               jlong jdb_handle,
+                                               jlong jcf_handle,
+                                               jstring jproperty,
+                                               jint jproperty_len) {
+  const char* property = env->GetStringUTFChars(jproperty, nullptr);
+  if (property == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  ROCKSDB_NAMESPACE::Slice property_name(property, jproperty_len);
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+
+  uint64_t property_value;
+  bool retCode = db->GetIntProperty(cf_handle, property_name, &property_value);
+  env->ReleaseStringUTFChars(jproperty, property);
+
+  if (retCode) {
+    return property_value;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+      env, ROCKSDB_NAMESPACE::Status::NotFound());
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    resetStats
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_resetStats(JNIEnv*, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  db->ResetStats();
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getAggregatedLongProperty
+ * Signature: (JLjava/lang/String;I)J
+ */
+jlong Java_org_rocksdb_RocksDB_getAggregatedLongProperty(JNIEnv* env, jobject,
+                                                         jlong db_handle,
+                                                         jstring jproperty,
+                                                         jint jproperty_len) {
+  const char* property = env->GetStringUTFChars(jproperty, nullptr);
+  if (property == nullptr) {
+    return 0;
+  }
+  ROCKSDB_NAMESPACE::Slice property_name(property, jproperty_len);
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(db_handle);
+  uint64_t property_value = 0;
+  bool retCode = db->GetAggregatedIntProperty(property_name, &property_value);
+  env->ReleaseStringUTFChars(jproperty, property);
+
+  if (retCode) {
+    return property_value;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+      env, ROCKSDB_NAMESPACE::Status::NotFound());
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getApproximateSizes
+ * Signature: (JJ[JB)[J
+ */
+jlongArray Java_org_rocksdb_RocksDB_getApproximateSizes(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlongArray jrange_slice_handles, jbyte jinclude_flags) {
+  const jsize jlen = env->GetArrayLength(jrange_slice_handles);
+  const size_t range_count = jlen / 2;
+
+  jlong* jranges = env->GetLongArrayElements(jrange_slice_handles, nullptr);
+  if (jranges == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  auto ranges = std::unique_ptr<ROCKSDB_NAMESPACE::Range[]>(
+      new ROCKSDB_NAMESPACE::Range[range_count]);
+  size_t range_offset = 0;
+  for (jsize i = 0; i < jlen; ++i) {
+    auto* start = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jranges[i]);
+    auto* limit = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jranges[++i]);
+    ranges.get()[range_offset++] = ROCKSDB_NAMESPACE::Range(*start, *limit);
+  }
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+
+  auto sizes = std::unique_ptr<uint64_t[]>(new uint64_t[range_count]);
+
+  ROCKSDB_NAMESPACE::DB::SizeApproximationFlags include_flags =
+      ROCKSDB_NAMESPACE::DB::SizeApproximationFlags::NONE;
+  if (jinclude_flags & 1) {
+    include_flags =
+        ROCKSDB_NAMESPACE::DB::SizeApproximationFlags::INCLUDE_MEMTABLES;
+  }
+  if (jinclude_flags & 2) {
+    include_flags =
+        (include_flags |
+         ROCKSDB_NAMESPACE::DB::SizeApproximationFlags::INCLUDE_FILES);
+  }
+
+  db->GetApproximateSizes(cf_handle, ranges.get(),
+                          static_cast<int>(range_count), sizes.get(),
+                          include_flags);
+
+  // release LongArrayElements
+  env->ReleaseLongArrayElements(jrange_slice_handles, jranges, JNI_ABORT);
+
+  // prepare results
+  auto results = std::unique_ptr<jlong[]>(new jlong[range_count]);
+  for (size_t i = 0; i < range_count; ++i) {
+    results.get()[i] = static_cast<jlong>(sizes.get()[i]);
+  }
+
+  const jsize jrange_count = jlen / 2;
+  jlongArray jresults = env->NewLongArray(jrange_count);
+  if (jresults == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  env->SetLongArrayRegion(jresults, 0, jrange_count, results.get());
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jresults);
+    return nullptr;
+  }
+
+  return jresults;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getApproximateMemTableStats
+ * Signature: (JJJJ)[J
+ */
+jlongArray Java_org_rocksdb_RocksDB_getApproximateMemTableStats(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlong jstartHandle, jlong jlimitHandle) {
+  auto* start = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jstartHandle);
+  auto* limit = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jlimitHandle);
+  const ROCKSDB_NAMESPACE::Range range(*start, *limit);
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+
+  uint64_t count = 0;
+  uint64_t sizes = 0;
+  db->GetApproximateMemTableStats(cf_handle, range, &count, &sizes);
+
+  // prepare results
+  jlong results[2] = {static_cast<jlong>(count), static_cast<jlong>(sizes)};
+
+  jlongArray jsizes = env->NewLongArray(2);
+  if (jsizes == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  env->SetLongArrayRegion(jsizes, 0, 2, results);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jsizes);
+    return nullptr;
+  }
+
+  return jsizes;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    compactRange
+ * Signature: (J[BI[BIJJ)V
+ */
+void Java_org_rocksdb_RocksDB_compactRange(JNIEnv* env, jobject,
+                                           jlong jdb_handle, jbyteArray jbegin,
+                                           jint jbegin_len, jbyteArray jend,
+                                           jint jend_len,
+                                           jlong jcompact_range_opts_handle,
+                                           jlong jcf_handle) {
+  jboolean has_exception = JNI_FALSE;
+
+  std::string str_begin;
+  if (jbegin_len > 0) {
+    str_begin = ROCKSDB_NAMESPACE::JniUtil::byteString<std::string>(
+        env, jbegin, jbegin_len,
+        [](const char* str, const size_t len) { return std::string(str, len); },
+        &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      return;
+    }
+  }
+
+  std::string str_end;
+  if (jend_len > 0) {
+    str_end = ROCKSDB_NAMESPACE::JniUtil::byteString<std::string>(
+        env, jend, jend_len,
+        [](const char* str, const size_t len) { return std::string(str, len); },
+        &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      return;
+    }
+  }
+
+  ROCKSDB_NAMESPACE::CompactRangeOptions* compact_range_opts = nullptr;
+  if (jcompact_range_opts_handle == 0) {
+    // NOTE: we DO own the pointer!
+    compact_range_opts = new ROCKSDB_NAMESPACE::CompactRangeOptions();
+  } else {
+    // NOTE: we do NOT own the pointer!
+    compact_range_opts =
+        reinterpret_cast<ROCKSDB_NAMESPACE::CompactRangeOptions*>(
+            jcompact_range_opts_handle);
+  }
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+
+  ROCKSDB_NAMESPACE::Status s;
+  if (jbegin_len > 0 || jend_len > 0) {
+    const ROCKSDB_NAMESPACE::Slice begin(str_begin);
+    const ROCKSDB_NAMESPACE::Slice end(str_end);
+    s = db->CompactRange(*compact_range_opts, cf_handle, &begin, &end);
+  } else {
+    s = db->CompactRange(*compact_range_opts, cf_handle, nullptr, nullptr);
+  }
+
+  if (jcompact_range_opts_handle == 0) {
+    delete compact_range_opts;
+  }
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    setOptions
+ * Signature: (JJ[Ljava/lang/String;[Ljava/lang/String;)V
+ */
+void Java_org_rocksdb_RocksDB_setOptions(JNIEnv* env, jobject, jlong jdb_handle,
+                                         jlong jcf_handle, jobjectArray jkeys,
+                                         jobjectArray jvalues) {
+  const jsize len = env->GetArrayLength(jkeys);
+  assert(len == env->GetArrayLength(jvalues));
+
+  std::unordered_map<std::string, std::string> options_map;
+  for (jsize i = 0; i < len; i++) {
+    jobject jobj_key = env->GetObjectArrayElement(jkeys, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      return;
+    }
+
+    jobject jobj_value = env->GetObjectArrayElement(jvalues, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jobj_key);
+      return;
+    }
+
+    jboolean has_exception = JNI_FALSE;
+    std::string s_key = ROCKSDB_NAMESPACE::JniUtil::copyStdString(
+        env, reinterpret_cast<jstring>(jobj_key), &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      env->DeleteLocalRef(jobj_value);
+      env->DeleteLocalRef(jobj_key);
+      return;
+    }
+
+    std::string s_value = ROCKSDB_NAMESPACE::JniUtil::copyStdString(
+        env, reinterpret_cast<jstring>(jobj_value), &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      env->DeleteLocalRef(jobj_value);
+      env->DeleteLocalRef(jobj_key);
+      return;
+    }
+
+    options_map[s_key] = s_value;
+
+    env->DeleteLocalRef(jobj_key);
+    env->DeleteLocalRef(jobj_value);
+  }
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle == nullptr) {
+    cf_handle = db->DefaultColumnFamily();
+  }
+  auto s = db->SetOptions(cf_handle, options_map);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    setDBOptions
+ * Signature: (J[Ljava/lang/String;[Ljava/lang/String;)V
+ */
+void Java_org_rocksdb_RocksDB_setDBOptions(JNIEnv* env, jobject,
+                                           jlong jdb_handle, jobjectArray jkeys,
+                                           jobjectArray jvalues) {
+  const jsize len = env->GetArrayLength(jkeys);
+  assert(len == env->GetArrayLength(jvalues));
+
+  std::unordered_map<std::string, std::string> options_map;
+  for (jsize i = 0; i < len; i++) {
+    jobject jobj_key = env->GetObjectArrayElement(jkeys, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      return;
+    }
+
+    jobject jobj_value = env->GetObjectArrayElement(jvalues, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jobj_key);
+      return;
+    }
+
+    jboolean has_exception = JNI_FALSE;
+    std::string s_key = ROCKSDB_NAMESPACE::JniUtil::copyStdString(
+        env, reinterpret_cast<jstring>(jobj_key), &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      env->DeleteLocalRef(jobj_value);
+      env->DeleteLocalRef(jobj_key);
+      return;
+    }
+
+    std::string s_value = ROCKSDB_NAMESPACE::JniUtil::copyStdString(
+        env, reinterpret_cast<jstring>(jobj_value), &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      env->DeleteLocalRef(jobj_value);
+      env->DeleteLocalRef(jobj_key);
+      return;
+    }
+
+    options_map[s_key] = s_value;
+
+    env->DeleteLocalRef(jobj_key);
+    env->DeleteLocalRef(jobj_value);
+  }
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto s = db->SetDBOptions(options_map);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getOptions
+ * Signature: (JJ)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_RocksDB_getOptions(JNIEnv* env, jobject,
+                                            jlong jdb_handle,
+                                            jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+
+  auto options = db->GetOptions(cf_handle);
+  std::string options_as_string;
+  ROCKSDB_NAMESPACE::Status s =
+      GetStringFromColumnFamilyOptions(&options_as_string, options);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+  return env->NewStringUTF(options_as_string.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getDBOptions
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_RocksDB_getDBOptions(JNIEnv* env, jobject,
+                                              jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+
+  auto options = db->GetDBOptions();
+  std::string options_as_string;
+  ROCKSDB_NAMESPACE::Status s =
+      GetStringFromDBOptions(&options_as_string, options);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+  return env->NewStringUTF(options_as_string.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    compactFiles
+ * Signature: (JJJ[Ljava/lang/String;IIJ)[Ljava/lang/String;
+ */
+jobjectArray Java_org_rocksdb_RocksDB_compactFiles(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcompaction_opts_handle,
+    jlong jcf_handle, jobjectArray jinput_file_names, jint joutput_level,
+    jint joutput_path_id, jlong jcompaction_job_info_handle) {
+  jboolean has_exception = JNI_FALSE;
+  const std::vector<std::string> input_file_names =
+      ROCKSDB_NAMESPACE::JniUtil::copyStrings(env, jinput_file_names,
+                                              &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return nullptr;
+  }
+
+  auto* compaction_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptions*>(
+          jcompaction_opts_handle);
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+
+  ROCKSDB_NAMESPACE::CompactionJobInfo* compaction_job_info = nullptr;
+  if (jcompaction_job_info_handle != 0) {
+    compaction_job_info =
+        reinterpret_cast<ROCKSDB_NAMESPACE::CompactionJobInfo*>(
+            jcompaction_job_info_handle);
+  }
+
+  std::vector<std::string> output_file_names;
+  auto s = db->CompactFiles(*compaction_opts, cf_handle, input_file_names,
+                            static_cast<int>(joutput_level),
+                            static_cast<int>(joutput_path_id),
+                            &output_file_names, compaction_job_info);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaStrings(env, &output_file_names);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    cancelAllBackgroundWork
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_RocksDB_cancelAllBackgroundWork(JNIEnv*, jobject,
+                                                      jlong jdb_handle,
+                                                      jboolean jwait) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::CancelAllBackgroundWork(db, jwait);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    pauseBackgroundWork
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_pauseBackgroundWork(JNIEnv* env, jobject,
+                                                  jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto s = db->PauseBackgroundWork();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    continueBackgroundWork
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_continueBackgroundWork(JNIEnv* env, jobject,
+                                                     jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto s = db->ContinueBackgroundWork();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    enableAutoCompaction
+ * Signature: (J[J)V
+ */
+void Java_org_rocksdb_RocksDB_enableAutoCompaction(JNIEnv* env, jobject,
+                                                   jlong jdb_handle,
+                                                   jlongArray jcf_handles) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  jboolean has_exception = JNI_FALSE;
+  const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles =
+      ROCKSDB_NAMESPACE::JniUtil::fromJPointers<
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle>(env, jcf_handles,
+                                                 &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  db->EnableAutoCompaction(cf_handles);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    numberLevels
+ * Signature: (JJ)I
+ */
+jint Java_org_rocksdb_RocksDB_numberLevels(JNIEnv*, jobject, jlong jdb_handle,
+                                           jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  return static_cast<jint>(db->NumberLevels(cf_handle));
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    maxMemCompactionLevel
+ * Signature: (JJ)I
+ */
+jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel(JNIEnv*, jobject,
+                                                    jlong jdb_handle,
+                                                    jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  return static_cast<jint>(db->MaxMemCompactionLevel(cf_handle));
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    level0StopWriteTrigger
+ * Signature: (JJ)I
+ */
+jint Java_org_rocksdb_RocksDB_level0StopWriteTrigger(JNIEnv*, jobject,
+                                                     jlong jdb_handle,
+                                                     jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  return static_cast<jint>(db->Level0StopWriteTrigger(cf_handle));
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getName
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_RocksDB_getName(JNIEnv* env, jobject,
+                                         jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  std::string name = db->GetName();
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &name, false);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getEnv
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_getEnv(JNIEnv*, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  return GET_CPLUSPLUS_POINTER(db->GetEnv());
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    flush
+ * Signature: (JJ[J)V
+ */
+void Java_org_rocksdb_RocksDB_flush(JNIEnv* env, jobject, jlong jdb_handle,
+                                    jlong jflush_opts_handle,
+                                    jlongArray jcf_handles) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* flush_opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::FlushOptions*>(jflush_opts_handle);
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  if (jcf_handles == nullptr) {
+    cf_handles.push_back(db->DefaultColumnFamily());
+  } else {
+    jboolean has_exception = JNI_FALSE;
+    cf_handles = ROCKSDB_NAMESPACE::JniUtil::fromJPointers<
+        ROCKSDB_NAMESPACE::ColumnFamilyHandle>(env, jcf_handles,
+                                               &has_exception);
+    if (has_exception) {
+      // exception occurred
+      return;
+    }
+  }
+  auto s = db->Flush(*flush_opts, cf_handles);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    flushWal
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_RocksDB_flushWal(JNIEnv* env, jobject, jlong jdb_handle,
+                                       jboolean jsync) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto s = db->FlushWAL(jsync == JNI_TRUE);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    syncWal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_syncWal(JNIEnv* env, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto s = db->SyncWAL();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getLatestSequenceNumber
+ * Signature: (J)V
+ */
+jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv*, jobject,
+                                                       jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  return db->GetLatestSequenceNumber();
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    disableFileDeletions
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env, jobject,
+                                                   jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::Status s = db->DisableFileDeletions();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    enableFileDeletions
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env, jobject,
+                                                  jlong jdb_handle,
+                                                  jboolean jforce) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::Status s = db->EnableFileDeletions(jforce);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getLiveFiles
+ * Signature: (JZ)[Ljava/lang/String;
+ */
+jobjectArray Java_org_rocksdb_RocksDB_getLiveFiles(JNIEnv* env, jobject,
+                                                   jlong jdb_handle,
+                                                   jboolean jflush_memtable) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  std::vector<std::string> live_files;
+  uint64_t manifest_file_size = 0;
+  auto s = db->GetLiveFiles(live_files, &manifest_file_size,
+                            jflush_memtable == JNI_TRUE);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  // append the manifest_file_size to the vector
+  // for passing back to java
+  live_files.push_back(std::to_string(manifest_file_size));
+
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaStrings(env, &live_files);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getSortedWalFiles
+ * Signature: (J)[Lorg/rocksdb/LogFile;
+ */
+jobjectArray Java_org_rocksdb_RocksDB_getSortedWalFiles(JNIEnv* env, jobject,
+                                                        jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  std::vector<std::unique_ptr<ROCKSDB_NAMESPACE::LogFile>> sorted_wal_files;
+  auto s = db->GetSortedWalFiles(sorted_wal_files);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  // convert to Java type
+  const jsize jlen = static_cast<jsize>(sorted_wal_files.size());
+  jobjectArray jsorted_wal_files = env->NewObjectArray(
+      jlen, ROCKSDB_NAMESPACE::LogFileJni::getJClass(env), nullptr);
+  if (jsorted_wal_files == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  jsize i = 0;
+  for (auto it = sorted_wal_files.begin(); it != sorted_wal_files.end(); ++it) {
+    jobject jlog_file =
+        ROCKSDB_NAMESPACE::LogFileJni::fromCppLogFile(env, it->get());
+    if (jlog_file == nullptr) {
+      // exception occurred
+      env->DeleteLocalRef(jsorted_wal_files);
+      return nullptr;
+    }
+
+    env->SetObjectArrayElement(jsorted_wal_files, i++, jlog_file);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(jlog_file);
+      env->DeleteLocalRef(jsorted_wal_files);
+      return nullptr;
+    }
+
+    env->DeleteLocalRef(jlog_file);
+  }
+
+  return jsorted_wal_files;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getUpdatesSince
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env, jobject,
+                                               jlong jdb_handle,
+                                               jlong jsequence_number) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::SequenceNumber sequence_number =
+      static_cast<ROCKSDB_NAMESPACE::SequenceNumber>(jsequence_number);
+  std::unique_ptr<ROCKSDB_NAMESPACE::TransactionLogIterator> iter;
+  ROCKSDB_NAMESPACE::Status s = db->GetUpdatesSince(sequence_number, &iter);
+  if (s.ok()) {
+    return GET_CPLUSPLUS_POINTER(iter.release());
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    deleteFile
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_RocksDB_deleteFile(JNIEnv* env, jobject, jlong jdb_handle,
+                                         jstring jname) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  jboolean has_exception = JNI_FALSE;
+  std::string name =
+      ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jname, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  db->DeleteFile(name);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getLiveFilesMetaData
+ * Signature: (J)[Lorg/rocksdb/LiveFileMetaData;
+ */
+jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData(JNIEnv* env, jobject,
+                                                           jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  std::vector<ROCKSDB_NAMESPACE::LiveFileMetaData> live_files_meta_data;
+  db->GetLiveFilesMetaData(&live_files_meta_data);
+
+  // convert to Java type
+  const jsize jlen = static_cast<jsize>(live_files_meta_data.size());
+  jobjectArray jlive_files_meta_data = env->NewObjectArray(
+      jlen, ROCKSDB_NAMESPACE::LiveFileMetaDataJni::getJClass(env), nullptr);
+  if (jlive_files_meta_data == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  jsize i = 0;
+  for (auto it = live_files_meta_data.begin(); it != live_files_meta_data.end();
+       ++it) {
+    jobject jlive_file_meta_data =
+        ROCKSDB_NAMESPACE::LiveFileMetaDataJni::fromCppLiveFileMetaData(env,
+                                                                        &(*it));
+    if (jlive_file_meta_data == nullptr) {
+      // exception occurred
+      env->DeleteLocalRef(jlive_files_meta_data);
+      return nullptr;
+    }
+
+    env->SetObjectArrayElement(jlive_files_meta_data, i++,
+                               jlive_file_meta_data);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(jlive_file_meta_data);
+      env->DeleteLocalRef(jlive_files_meta_data);
+      return nullptr;
+    }
+
+    env->DeleteLocalRef(jlive_file_meta_data);
+  }
+
+  return jlive_files_meta_data;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getColumnFamilyMetaData
+ * Signature: (JJ)Lorg/rocksdb/ColumnFamilyMetaData;
+ */
+jobject Java_org_rocksdb_RocksDB_getColumnFamilyMetaData(JNIEnv* env, jobject,
+                                                         jlong jdb_handle,
+                                                         jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_metadata;
+  db->GetColumnFamilyMetaData(cf_handle, &cf_metadata);
+  return ROCKSDB_NAMESPACE::ColumnFamilyMetaDataJni::
+      fromCppColumnFamilyMetaData(env, &cf_metadata);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    ingestExternalFile
+ * Signature: (JJ[Ljava/lang/String;IJ)V
+ */
+void Java_org_rocksdb_RocksDB_ingestExternalFile(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jobjectArray jfile_path_list, jint jfile_path_list_len,
+    jlong jingest_external_file_options_handle) {
+  jboolean has_exception = JNI_FALSE;
+  std::vector<std::string> file_path_list =
+      ROCKSDB_NAMESPACE::JniUtil::copyStrings(
+          env, jfile_path_list, jfile_path_list_len, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* column_family =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto* ifo = reinterpret_cast<ROCKSDB_NAMESPACE::IngestExternalFileOptions*>(
+      jingest_external_file_options_handle);
+  ROCKSDB_NAMESPACE::Status s =
+      db->IngestExternalFile(column_family, file_path_list, *ifo);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    verifyChecksum
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_verifyChecksum(JNIEnv* env, jobject,
+                                             jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto s = db->VerifyChecksum();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getDefaultColumnFamily
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily(JNIEnv*, jobject,
+                                                      jlong jdb_handle) {
+  auto* db_handle = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* cf_handle = db_handle->DefaultColumnFamily();
+  return GET_CPLUSPLUS_POINTER(cf_handle);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getPropertiesOfAllTables
+ * Signature: (JJ)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_RocksDB_getPropertiesOfAllTables(JNIEnv* env, jobject,
+                                                          jlong jdb_handle,
+                                                          jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  ROCKSDB_NAMESPACE::TablePropertiesCollection table_properties_collection;
+  auto s =
+      db->GetPropertiesOfAllTables(cf_handle, &table_properties_collection);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+
+  // convert to Java type
+  jobject jhash_map = ROCKSDB_NAMESPACE::HashMapJni::construct(
+      env, static_cast<uint32_t>(table_properties_collection.size()));
+  if (jhash_map == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+      const std::string,
+      const std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties>, jobject,
+      jobject>
+      fn_map_kv =
+          [env](const std::pair<const std::string,
+                                const std::shared_ptr<
+                                    const ROCKSDB_NAMESPACE::TableProperties>>&
+                    kv) {
+            jstring jkey = ROCKSDB_NAMESPACE::JniUtil::toJavaString(
+                env, &(kv.first), false);
+            if (env->ExceptionCheck()) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+
+            jobject jtable_properties =
+                ROCKSDB_NAMESPACE::TablePropertiesJni::fromCppTableProperties(
+                    env, *(kv.second.get()));
+            if (jtable_properties == nullptr) {
+              // an error occurred
+              env->DeleteLocalRef(jkey);
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+
+            return std::unique_ptr<std::pair<jobject, jobject>>(
+                new std::pair<jobject, jobject>(
+                    static_cast<jobject>(jkey),
+                    static_cast<jobject>(jtable_properties)));
+          };
+
+  if (!ROCKSDB_NAMESPACE::HashMapJni::putAll(
+          env, jhash_map, table_properties_collection.begin(),
+          table_properties_collection.end(), fn_map_kv)) {
+    // exception occurred
+    return nullptr;
+  }
+
+  return jhash_map;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getPropertiesOfTablesInRange
+ * Signature: (JJ[J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_RocksDB_getPropertiesOfTablesInRange(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlongArray jrange_slice_handles) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  const jsize jlen = env->GetArrayLength(jrange_slice_handles);
+  jlong* jrange_slice_handle =
+      env->GetLongArrayElements(jrange_slice_handles, nullptr);
+  if (jrange_slice_handle == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const size_t ranges_len = static_cast<size_t>(jlen / 2);
+  auto ranges = std::unique_ptr<ROCKSDB_NAMESPACE::Range[]>(
+      new ROCKSDB_NAMESPACE::Range[ranges_len]);
+  for (jsize i = 0, j = 0; i < jlen; ++i) {
+    auto* start =
+        reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jrange_slice_handle[i]);
+    auto* limit =
+        reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jrange_slice_handle[++i]);
+    ranges[j++] = ROCKSDB_NAMESPACE::Range(*start, *limit);
+  }
+
+  ROCKSDB_NAMESPACE::TablePropertiesCollection table_properties_collection;
+  auto s = db->GetPropertiesOfTablesInRange(cf_handle, ranges.get(), ranges_len,
+                                            &table_properties_collection);
+  if (!s.ok()) {
+    // error occurred
+    env->ReleaseLongArrayElements(jrange_slice_handles, jrange_slice_handle,
+                                  JNI_ABORT);
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  // cleanup
+  env->ReleaseLongArrayElements(jrange_slice_handles, jrange_slice_handle,
+                                JNI_ABORT);
+
+  return jrange_slice_handles;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    suggestCompactRange
+ * Signature: (JJ)[J
+ */
+jlongArray Java_org_rocksdb_RocksDB_suggestCompactRange(JNIEnv* env, jobject,
+                                                        jlong jdb_handle,
+                                                        jlong jcf_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  auto* begin = new ROCKSDB_NAMESPACE::Slice();
+  auto* end = new ROCKSDB_NAMESPACE::Slice();
+  auto s = db->SuggestCompactRange(cf_handle, begin, end);
+  if (!s.ok()) {
+    // error occurred
+    delete begin;
+    delete end;
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+
+  jlongArray jslice_handles = env->NewLongArray(2);
+  if (jslice_handles == nullptr) {
+    // exception thrown: OutOfMemoryError
+    delete begin;
+    delete end;
+    return nullptr;
+  }
+
+  jlong slice_handles[2];
+  slice_handles[0] = GET_CPLUSPLUS_POINTER(begin);
+  slice_handles[1] = GET_CPLUSPLUS_POINTER(end);
+  env->SetLongArrayRegion(jslice_handles, 0, 2, slice_handles);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete begin;
+    delete end;
+    env->DeleteLocalRef(jslice_handles);
+    return nullptr;
+  }
+
+  return jslice_handles;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    promoteL0
+ * Signature: (JJI)V
+ */
+void Java_org_rocksdb_RocksDB_promoteL0(JNIEnv*, jobject, jlong jdb_handle,
+                                        jlong jcf_handle, jint jtarget_level) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  }
+  db->PromoteL0(cf_handle, static_cast<int>(jtarget_level));
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    startTrace
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_RocksDB_startTrace(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jmax_trace_file_size,
+    jlong jtrace_writer_jnicallback_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::TraceOptions trace_options;
+  trace_options.max_trace_file_size =
+      static_cast<uint64_t>(jmax_trace_file_size);
+  // transfer ownership of trace writer from Java to C++
+  auto trace_writer =
+      std::unique_ptr<ROCKSDB_NAMESPACE::TraceWriterJniCallback>(
+          reinterpret_cast<ROCKSDB_NAMESPACE::TraceWriterJniCallback*>(
+              jtrace_writer_jnicallback_handle));
+  auto s = db->StartTrace(trace_options, std::move(trace_writer));
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    endTrace
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_endTrace(JNIEnv* env, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto s = db->EndTrace();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    tryCatchUpWithPrimary
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_tryCatchUpWithPrimary(JNIEnv* env, jobject,
+                                                    jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto s = db->TryCatchUpWithPrimary();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    destroyDB
+ * Signature: (Ljava/lang/String;J)V
+ */
+void Java_org_rocksdb_RocksDB_destroyDB(JNIEnv* env, jclass, jstring jdb_path,
+                                        jlong joptions_handle) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(joptions_handle);
+  if (options == nullptr) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument("Invalid Options."));
+  }
+
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DestroyDB(db_path, *options);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index,
+                      std::unique_ptr<ROCKSDB_NAMESPACE::Slice>& slice,
+                      std::vector<std::unique_ptr<jbyte[]>>& ranges_to_free) {
+  jobject jArray = env->GetObjectArrayElement(ranges, index);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    return false;
+  }
+
+  if (jArray == nullptr) {
+    return true;
+  }
+
+  jbyteArray jba = reinterpret_cast<jbyteArray>(jArray);
+  jsize len_ba = env->GetArrayLength(jba);
+  ranges_to_free.push_back(std::unique_ptr<jbyte[]>(new jbyte[len_ba]));
+  env->GetByteArrayRegion(jba, 0, len_ba, ranges_to_free.back().get());
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jArray);
+    return false;
+  }
+  env->DeleteLocalRef(jArray);
+  slice.reset(new ROCKSDB_NAMESPACE::Slice(
+      reinterpret_cast<char*>(ranges_to_free.back().get()), len_ba));
+  return true;
+}
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    deleteFilesInRanges
+ * Signature: (JJLjava/util/List;Z)V
+ */
+void Java_org_rocksdb_RocksDB_deleteFilesInRanges(JNIEnv* env, jobject /*jdb*/,
+                                                  jlong jdb_handle,
+                                                  jlong jcf_handle,
+                                                  jobjectArray ranges,
+                                                  jboolean include_end) {
+  jsize length = env->GetArrayLength(ranges);
+
+  std::vector<ROCKSDB_NAMESPACE::RangePtr> rangesVector;
+  std::vector<std::unique_ptr<ROCKSDB_NAMESPACE::Slice>> slices;
+  std::vector<std::unique_ptr<jbyte[]>> ranges_to_free;
+  for (jsize i = 0; (i + 1) < length; i += 2) {
+    slices.push_back(std::unique_ptr<ROCKSDB_NAMESPACE::Slice>());
+    if (!get_slice_helper(env, ranges, i, slices.back(), ranges_to_free)) {
+      // exception thrown
+      return;
+    }
+
+    slices.push_back(std::unique_ptr<ROCKSDB_NAMESPACE::Slice>());
+    if (!get_slice_helper(env, ranges, i + 1, slices.back(), ranges_to_free)) {
+      // exception thrown
+      return;
+    }
+
+    rangesVector.push_back(ROCKSDB_NAMESPACE::RangePtr(
+        slices[slices.size() - 2].get(), slices[slices.size() - 1].get()));
+  }
+
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* column_family =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DeleteFilesInRanges(
+      db, column_family == nullptr ? db->DefaultColumnFamily() : column_family,
+      rangesVector.data(), rangesVector.size(), include_end);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    version
+ * Signature: ()I
+ */
+jint Java_org_rocksdb_RocksDB_version(JNIEnv*, jclass) {
+  uint32_t encodedVersion = (ROCKSDB_MAJOR & 0xff) << 16;
+  encodedVersion |= (ROCKSDB_MINOR & 0xff) << 8;
+  encodedVersion |= (ROCKSDB_PATCH & 0xff);
+  return static_cast<jint>(encodedVersion);
+}
diff --git a/src/rocksdb/java/rocksjni/slice.cc b/src/rocksdb/java/rocksjni/slice.cc
new file mode 100644
index 000000000..63c6b1b9f
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/slice.cc
@@ -0,0 +1,374 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Slice.
+
+#include "rocksdb/slice.h"
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+
+#include "include/org_rocksdb_AbstractSlice.h"
+#include "include/org_rocksdb_DirectSlice.h"
+#include "include/org_rocksdb_Slice.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+// <editor-fold desc="org.rocksdb.AbstractSlice>
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    createNewSliceFromString
+ * Signature: (Ljava/lang/String;)J
+ */
+jlong Java_org_rocksdb_AbstractSlice_createNewSliceFromString(JNIEnv* env,
+                                                              jclass /*jcls*/,
+                                                              jstring jstr) {
+  const auto* str = env->GetStringUTFChars(jstr, nullptr);
+  if (str == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+
+  const size_t len = strlen(str);
+
+  // NOTE: buf will be deleted in the
+  // Java_org_rocksdb_Slice_disposeInternalBuf or
+  // or Java_org_rocksdb_DirectSlice_disposeInternalBuf methods
+  char* buf = new char[len + 1];
+  memcpy(buf, str, len);
+  buf[len] = 0;
+  env->ReleaseStringUTFChars(jstr, str);
+
+  const auto* slice = new ROCKSDB_NAMESPACE::Slice(buf);
+  return GET_CPLUSPLUS_POINTER(slice);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    size0
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_AbstractSlice_size0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                          jlong handle) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  return static_cast<jint>(slice->size());
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    empty0
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_AbstractSlice_empty0(JNIEnv* /*env*/,
+                                               jobject /*jobj*/, jlong handle) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  return slice->empty();
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    toString0
+ * Signature: (JZ)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_AbstractSlice_toString0(JNIEnv* env, jobject /*jobj*/,
+                                                 jlong handle, jboolean hex) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  const std::string s = slice->ToString(hex);
+  return env->NewStringUTF(s.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    compare0
+ * Signature: (JJ)I;
+ */
+jint Java_org_rocksdb_AbstractSlice_compare0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong handle, jlong otherHandle) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  const auto* otherSlice =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(otherHandle);
+  return slice->compare(*otherSlice);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    startsWith0
+ * Signature: (JJ)Z;
+ */
+jboolean Java_org_rocksdb_AbstractSlice_startsWith0(JNIEnv* /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong handle,
+                                                    jlong otherHandle) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  const auto* otherSlice =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(otherHandle);
+  return slice->starts_with(*otherSlice);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractSlice_disposeInternal(JNIEnv* /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong handle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+}
+
+// </editor-fold>
+
+// <editor-fold desc="org.rocksdb.Slice>
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    createNewSlice0
+ * Signature: ([BI)J
+ */
+jlong Java_org_rocksdb_Slice_createNewSlice0(JNIEnv* env, jclass /*jcls*/,
+                                             jbyteArray data, jint offset) {
+  const jsize dataSize = env->GetArrayLength(data);
+  const int len = dataSize - offset;
+
+  // NOTE: buf will be deleted in the Java_org_rocksdb_Slice_disposeInternalBuf
+  // method
+  jbyte* buf = new jbyte[len];
+  env->GetByteArrayRegion(data, offset, len, buf);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    return 0;
+  }
+
+  const auto* slice = new ROCKSDB_NAMESPACE::Slice((const char*)buf, len);
+  return GET_CPLUSPLUS_POINTER(slice);
+}
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    createNewSlice1
+ * Signature: ([B)J
+ */
+jlong Java_org_rocksdb_Slice_createNewSlice1(JNIEnv* env, jclass /*jcls*/,
+                                             jbyteArray data) {
+  jbyte* ptrData = env->GetByteArrayElements(data, nullptr);
+  if (ptrData == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  const int len = env->GetArrayLength(data) + 1;
+
+  // NOTE: buf will be deleted in the Java_org_rocksdb_Slice_disposeInternalBuf
+  // method
+  char* buf = new char[len];
+  memcpy(buf, ptrData, len - 1);
+  buf[len - 1] = '\0';
+
+  const auto* slice = new ROCKSDB_NAMESPACE::Slice(buf, len - 1);
+
+  env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT);
+
+  return GET_CPLUSPLUS_POINTER(slice);
+}
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    data0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_Slice_data0(JNIEnv* env, jobject /*jobj*/,
+                                        jlong handle) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  const jsize len = static_cast<jsize>(slice->size());
+  const jbyteArray data = env->NewByteArray(len);
+  if (data == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  env->SetByteArrayRegion(
+      data, 0, len,
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(slice->data())));
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(data);
+    return nullptr;
+  }
+
+  return data;
+}
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    clear0
+ * Signature: (JZJ)V
+ */
+void Java_org_rocksdb_Slice_clear0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                   jlong handle, jboolean shouldRelease,
+                                   jlong internalBufferOffset) {
+  auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  if (shouldRelease == JNI_TRUE) {
+    const char* buf = slice->data_ - internalBufferOffset;
+    delete[] buf;
+  }
+  slice->clear();
+}
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    removePrefix0
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Slice_removePrefix0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                          jlong handle, jint length) {
+  auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  slice->remove_prefix(length);
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    setLength0
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DirectSlice_setLength0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong handle, jint length) {
+  auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  slice->size_ = length;
+}
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    disposeInternalBuf
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Slice_disposeInternalBuf(JNIEnv* /*env*/,
+                                               jobject /*jobj*/, jlong handle,
+                                               jlong internalBufferOffset) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  const char* buf = slice->data_ - internalBufferOffset;
+  delete[] buf;
+}
+
+// </editor-fold>
+
+// <editor-fold desc="org.rocksdb.DirectSlice>
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    createNewDirectSlice0
+ * Signature: (Ljava/nio/ByteBuffer;I)J
+ */
+jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice0(JNIEnv* env,
+                                                         jclass /*jcls*/,
+                                                         jobject data,
+                                                         jint length) {
+  assert(data != nullptr);
+  void* data_addr = env->GetDirectBufferAddress(data);
+  if (data_addr == nullptr) {
+    // error: memory region is undefined, given object is not a direct
+    // java.nio.Buffer, or JNI access to direct buffers is not supported by JVM
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Could not access DirectBuffer"));
+    return 0;
+  }
+
+  const auto* ptrData = reinterpret_cast<char*>(data_addr);
+  const auto* slice = new ROCKSDB_NAMESPACE::Slice(ptrData, length);
+  return GET_CPLUSPLUS_POINTER(slice);
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    createNewDirectSlice1
+ * Signature: (Ljava/nio/ByteBuffer;)J
+ */
+jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice1(JNIEnv* env,
+                                                         jclass /*jcls*/,
+                                                         jobject data) {
+  void* data_addr = env->GetDirectBufferAddress(data);
+  if (data_addr == nullptr) {
+    // error: memory region is undefined, given object is not a direct
+    // java.nio.Buffer, or JNI access to direct buffers is not supported by JVM
+    ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew(
+        env, ROCKSDB_NAMESPACE::Status::InvalidArgument(
+                 "Could not access DirectBuffer"));
+    return 0;
+  }
+
+  const auto* ptrData = reinterpret_cast<char*>(data_addr);
+  const auto* slice = new ROCKSDB_NAMESPACE::Slice(ptrData);
+  return GET_CPLUSPLUS_POINTER(slice);
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    data0
+ * Signature: (J)Ljava/lang/Object;
+ */
+jobject Java_org_rocksdb_DirectSlice_data0(JNIEnv* env, jobject /*jobj*/,
+                                           jlong handle) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  return env->NewDirectByteBuffer(const_cast<char*>(slice->data()),
+                                  slice->size());
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    get0
+ * Signature: (JI)B
+ */
+jbyte Java_org_rocksdb_DirectSlice_get0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                        jlong handle, jint offset) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  return (*slice)[offset];
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    clear0
+ * Signature: (JZJ)V
+ */
+void Java_org_rocksdb_DirectSlice_clear0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                         jlong handle, jboolean shouldRelease,
+                                         jlong internalBufferOffset) {
+  auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  if (shouldRelease == JNI_TRUE) {
+    const char* buf = slice->data_ - internalBufferOffset;
+    delete[] buf;
+  }
+  slice->clear();
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    removePrefix0
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DirectSlice_removePrefix0(JNIEnv* /*env*/,
+                                                jobject /*jobj*/, jlong handle,
+                                                jint length) {
+  auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  slice->remove_prefix(length);
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    disposeInternalBuf
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DirectSlice_disposeInternalBuf(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong handle,
+    jlong internalBufferOffset) {
+  const auto* slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(handle);
+  const char* buf = slice->data_ - internalBufferOffset;
+  delete[] buf;
+}
+
+// </editor-fold>
diff --git a/src/rocksdb/java/rocksjni/snapshot.cc b/src/rocksdb/java/rocksjni/snapshot.cc
new file mode 100644
index 000000000..2a1265a58
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/snapshot.cc
@@ -0,0 +1,27 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "include/org_rocksdb_Snapshot.h"
+#include "rocksdb/db.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_Snapshot
+ * Method:    getSequenceNumber
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Snapshot_getSequenceNumber(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jsnapshot_handle) {
+  auto* snapshot =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Snapshot*>(jsnapshot_handle);
+  return snapshot->GetSequenceNumber();
+}
diff --git a/src/rocksdb/java/rocksjni/sst_file_manager.cc b/src/rocksdb/java/rocksjni/sst_file_manager.cc
new file mode 100644
index 000000000..c51436819
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/sst_file_manager.cc
@@ -0,0 +1,250 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ ROCKSDB_NAMESPACE::SstFileManager methods
+// from Java side.
+
+#include "rocksdb/sst_file_manager.h"
+
+#include <jni.h>
+
+#include <memory>
+
+#include "include/org_rocksdb_SstFileManager.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    newSstFileManager
+ * Signature: (JJJDJ)J
+ */
+jlong Java_org_rocksdb_SstFileManager_newSstFileManager(
+    JNIEnv* jnienv, jclass /*jcls*/, jlong jenv_handle, jlong jlogger_handle,
+    jlong jrate_bytes, jdouble jmax_trash_db_ratio,
+    jlong jmax_delete_chunk_bytes) {
+  auto* env = reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(jenv_handle);
+  ROCKSDB_NAMESPACE::Status s;
+  ROCKSDB_NAMESPACE::SstFileManager* sst_file_manager = nullptr;
+
+  if (jlogger_handle != 0) {
+    auto* sptr_logger =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Logger>*>(
+            jlogger_handle);
+    sst_file_manager = ROCKSDB_NAMESPACE::NewSstFileManager(
+        env, *sptr_logger, "", jrate_bytes, true, &s, jmax_trash_db_ratio,
+        jmax_delete_chunk_bytes);
+  } else {
+    sst_file_manager = ROCKSDB_NAMESPACE::NewSstFileManager(
+        env, nullptr, "", jrate_bytes, true, &s, jmax_trash_db_ratio,
+        jmax_delete_chunk_bytes);
+  }
+
+  if (!s.ok()) {
+    if (sst_file_manager != nullptr) {
+      delete sst_file_manager;
+    }
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(jnienv, s);
+  }
+  auto* sptr_sst_file_manager =
+      new std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>(sst_file_manager);
+
+  return GET_CPLUSPLUS_POINTER(sptr_sst_file_manager);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    setMaxAllowedSpaceUsage
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_SstFileManager_setMaxAllowedSpaceUsage(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jmax_allowed_space) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  sptr_sst_file_manager->get()->SetMaxAllowedSpaceUsage(jmax_allowed_space);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    setCompactionBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_SstFileManager_setCompactionBufferSize(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jcompaction_buffer_size) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  sptr_sst_file_manager->get()->SetCompactionBufferSize(
+      jcompaction_buffer_size);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    isMaxAllowedSpaceReached
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_SstFileManager_isMaxAllowedSpaceReached(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  return sptr_sst_file_manager->get()->IsMaxAllowedSpaceReached();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    isMaxAllowedSpaceReachedIncludingCompactions
+ * Signature: (J)Z
+ */
+jboolean
+Java_org_rocksdb_SstFileManager_isMaxAllowedSpaceReachedIncludingCompactions(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  return sptr_sst_file_manager->get()
+      ->IsMaxAllowedSpaceReachedIncludingCompactions();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    getTotalSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_SstFileManager_getTotalSize(JNIEnv* /*env*/,
+                                                   jobject /*jobj*/,
+                                                   jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  return sptr_sst_file_manager->get()->GetTotalSize();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    getTrackedFiles
+ * Signature: (J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_SstFileManager_getTrackedFiles(JNIEnv* env,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  auto tracked_files = sptr_sst_file_manager->get()->GetTrackedFiles();
+
+  // TODO(AR) could refactor to share code with
+  // ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(env, tracked_files);
+
+  const jobject jtracked_files = ROCKSDB_NAMESPACE::HashMapJni::construct(
+      env, static_cast<uint32_t>(tracked_files.size()));
+  if (jtracked_files == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<const std::string,
+                                               const uint64_t, jobject, jobject>
+      fn_map_kv =
+          [env](const std::pair<const std::string, const uint64_t>& pair) {
+            const jstring jtracked_file_path =
+                env->NewStringUTF(pair.first.c_str());
+            if (jtracked_file_path == nullptr) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+            const jobject jtracked_file_size =
+                ROCKSDB_NAMESPACE::LongJni::valueOf(env, pair.second);
+            if (jtracked_file_size == nullptr) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+            return std::unique_ptr<std::pair<jobject, jobject>>(
+                new std::pair<jobject, jobject>(jtracked_file_path,
+                                                jtracked_file_size));
+          };
+
+  if (!ROCKSDB_NAMESPACE::HashMapJni::putAll(env, jtracked_files,
+                                             tracked_files.begin(),
+                                             tracked_files.end(), fn_map_kv)) {
+    // exception occcurred
+    return nullptr;
+  }
+
+  return jtracked_files;
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    getDeleteRateBytesPerSecond
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_SstFileManager_getDeleteRateBytesPerSecond(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  return sptr_sst_file_manager->get()->GetDeleteRateBytesPerSecond();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    setDeleteRateBytesPerSecond
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_SstFileManager_setDeleteRateBytesPerSecond(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jlong jdelete_rate) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  sptr_sst_file_manager->get()->SetDeleteRateBytesPerSecond(jdelete_rate);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    getMaxTrashDBRatio
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_SstFileManager_getMaxTrashDBRatio(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  return sptr_sst_file_manager->get()->GetMaxTrashDBRatio();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    setMaxTrashDBRatio
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_SstFileManager_setMaxTrashDBRatio(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle,
+                                                        jdouble jratio) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  sptr_sst_file_manager->get()->SetMaxTrashDBRatio(jratio);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileManager_disposeInternal(JNIEnv* /*env*/,
+                                                     jobject /*jobj*/,
+                                                     jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::SstFileManager>*>(
+          jhandle);
+  delete sptr_sst_file_manager;
+}
diff --git a/src/rocksdb/java/rocksjni/sst_file_reader_iterator.cc b/src/rocksdb/java/rocksjni/sst_file_reader_iterator.cc
new file mode 100644
index 000000000..68fa4c37c
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/sst_file_reader_iterator.cc
@@ -0,0 +1,373 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::Iterator methods from Java side.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "include/org_rocksdb_SstFileReaderIterator.h"
+#include "rocksdb/iterator.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_disposeInternal(JNIEnv* /*env*/,
+                                                            jobject /*jobj*/,
+                                                            jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  assert(it != nullptr);
+  delete it;
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    isValid0
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_SstFileReaderIterator_isValid0(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong handle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->Valid();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    seekToFirst0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_seekToFirst0(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->SeekToFirst();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    seekToLast0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_seekToLast0(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->SeekToLast();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    next0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_next0(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->Next();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    prev0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_prev0(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle)->Prev();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    seek0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_seek0(JNIEnv* env, jobject /*jobj*/,
+                                                  jlong handle,
+                                                  jbyteArray jtarget,
+                                                  jint jtarget_len) {
+  jbyte* target = env->GetByteArrayElements(jtarget, nullptr);
+  if (target == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::Slice target_slice(reinterpret_cast<char*>(target),
+                                        jtarget_len);
+
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  it->Seek(target_slice);
+
+  env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    seekForPrev0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_seekForPrev0(JNIEnv* env,
+                                                         jobject /*jobj*/,
+                                                         jlong handle,
+                                                         jbyteArray jtarget,
+                                                         jint jtarget_len) {
+  jbyte* target = env->GetByteArrayElements(jtarget, nullptr);
+  if (target == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::Slice target_slice(reinterpret_cast<char*>(target),
+                                        jtarget_len);
+
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  it->SeekForPrev(target_slice);
+
+  env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    status0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_status0(JNIEnv* env,
+                                                    jobject /*jobj*/,
+                                                    jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Status s = it->status();
+
+  if (s.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    key0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_SstFileReaderIterator_key0(JNIEnv* env,
+                                                       jobject /*jobj*/,
+                                                       jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice key_slice = it->key();
+
+  jbyteArray jkey = env->NewByteArray(static_cast<jsize>(key_slice.size()));
+  if (jkey == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  env->SetByteArrayRegion(
+      jkey, 0, static_cast<jsize>(key_slice.size()),
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(key_slice.data())));
+  return jkey;
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    value0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_SstFileReaderIterator_value0(JNIEnv* env,
+                                                         jobject /*jobj*/,
+                                                         jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice value_slice = it->value();
+
+  jbyteArray jkeyValue =
+      env->NewByteArray(static_cast<jsize>(value_slice.size()));
+  if (jkeyValue == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  env->SetByteArrayRegion(
+      jkeyValue, 0, static_cast<jsize>(value_slice.size()),
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value_slice.data())));
+  return jkeyValue;
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    keyDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)I
+ */
+jint Java_org_rocksdb_SstFileReaderIterator_keyDirect0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice key_slice = it->key();
+  return ROCKSDB_NAMESPACE::JniUtil::copyToDirect(env, key_slice, jtarget,
+                                                  jtarget_off, jtarget_len);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ *
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    keyByteArray0
+ * Signature: (J[BII)I
+ */
+jint Java_org_rocksdb_SstFileReaderIterator_keyByteArray0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jkey, jint jkey_off,
+    jint jkey_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice key_slice = it->key();
+  auto slice_size = key_slice.size();
+  jsize copy_size = std::min(static_cast<uint32_t>(slice_size),
+                             static_cast<uint32_t>(jkey_len));
+  env->SetByteArrayRegion(
+      jkey, jkey_off, copy_size,
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(key_slice.data())));
+
+  return static_cast<jsize>(slice_size);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    valueDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)I
+ */
+jint Java_org_rocksdb_SstFileReaderIterator_valueDirect0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice value_slice = it->value();
+  return ROCKSDB_NAMESPACE::JniUtil::copyToDirect(env, value_slice, jtarget,
+                                                  jtarget_off, jtarget_len);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ *
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    valueByteArray0
+ * Signature: (J[BII)I
+ */
+jint Java_org_rocksdb_SstFileReaderIterator_valueByteArray0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jvalue_target,
+    jint jvalue_off, jint jvalue_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Slice value_slice = it->value();
+  auto slice_size = value_slice.size();
+  jsize copy_size = std::min(static_cast<uint32_t>(slice_size),
+                             static_cast<uint32_t>(jvalue_len));
+  env->SetByteArrayRegion(
+      jvalue_target, jvalue_off, copy_size,
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value_slice.data())));
+
+  return static_cast<jsize>(slice_size);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    seekDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_seekDirect0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->Seek(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_direct(seek, env, jtarget, jtarget_off,
+                                          jtarget_len);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    seekForPrevDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_seekForPrevDirect0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  auto seekPrev = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->SeekForPrev(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_direct(seekPrev, env, jtarget, jtarget_off,
+                                          jtarget_len);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ *
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    seekByteArray0
+ * Signature: (J[BII)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_seekByteArray0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  const std::unique_ptr<char[]> target(new char[jtarget_len]);
+  if (target == nullptr) {
+    jclass oom_class = env->FindClass("/lang/java/OutOfMemoryError");
+    env->ThrowNew(oom_class,
+                  "Memory allocation failed in RocksDB JNI function");
+    return;
+  }
+  env->GetByteArrayRegion(jtarget, jtarget_off, jtarget_len,
+                          reinterpret_cast<jbyte*>(target.get()));
+
+  ROCKSDB_NAMESPACE::Slice target_slice(target.get(), jtarget_len);
+
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  it->Seek(target_slice);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ *
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    seekForPrevByteArray0
+ * Signature: (J[BII)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_seekForPrevByteArray0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  const std::unique_ptr<char[]> target(new char[jtarget_len]);
+  if (target == nullptr) {
+    jclass oom_class = env->FindClass("/lang/java/OutOfMemoryError");
+    env->ThrowNew(oom_class,
+                  "Memory allocation failed in RocksDB JNI function");
+    return;
+  }
+  env->GetByteArrayRegion(jtarget, jtarget_off, jtarget_len,
+                          reinterpret_cast<jbyte*>(target.get()));
+
+  ROCKSDB_NAMESPACE::Slice target_slice(target.get(), jtarget_len);
+
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  it->SeekForPrev(target_slice);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReaderIterator
+ * Method:    refresh0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileReaderIterator_refresh0(JNIEnv* env,
+                                                     jobject /*jobj*/,
+                                                     jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(handle);
+  ROCKSDB_NAMESPACE::Status s = it->Refresh();
+
+  if (s.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
diff --git a/src/rocksdb/java/rocksjni/sst_file_readerjni.cc b/src/rocksdb/java/rocksjni/sst_file_readerjni.cc
new file mode 100644
index 000000000..7ef711842
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/sst_file_readerjni.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ ROCKSDB_NAMESPACE::SstFileReader methods
+// from Java side.
+
+#include <jni.h>
+
+#include <string>
+
+#include "include/org_rocksdb_SstFileReader.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/sst_file_reader.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_SstFileReader
+ * Method:    newSstFileReader
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/,
+                                                      jclass /*jcls*/,
+                                                      jlong joptions) {
+  auto *options =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::Options *>(joptions);
+  ROCKSDB_NAMESPACE::SstFileReader *sst_file_reader =
+      new ROCKSDB_NAMESPACE::SstFileReader(*options);
+  return GET_CPLUSPLUS_POINTER(sst_file_reader);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReader
+ * Method:    open
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jobject /*jobj*/,
+                                         jlong jhandle, jstring jfile_path) {
+  const char *file_path = env->GetStringUTFChars(jfile_path, nullptr);
+  if (file_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle)->Open(
+          file_path);
+  env->ReleaseStringUTFChars(jfile_path, file_path);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReader
+ * Method:    newIterator
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle,
+                                                 jlong jread_options_handle) {
+  auto *sst_file_reader =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+  auto *read_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions *>(jread_options_handle);
+  return GET_CPLUSPLUS_POINTER(sst_file_reader->NewIterator(*read_options));
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReader
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileReader_disposeInternal(JNIEnv * /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong jhandle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReader
+ * Method:    verifyChecksum
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env,
+                                                   jobject /*jobj*/,
+                                                   jlong jhandle) {
+  auto *sst_file_reader =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+  auto s = sst_file_reader->VerifyChecksum();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileReader
+ * Method:    getTableProperties
+ * Signature: (J)J
+ */
+jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv *env,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle) {
+  auto *sst_file_reader =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+  std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties> tp =
+      sst_file_reader->GetTableProperties();
+  jobject jtable_properties =
+      ROCKSDB_NAMESPACE::TablePropertiesJni::fromCppTableProperties(
+          env, *(tp.get()));
+  return jtable_properties;
+}
diff --git a/src/rocksdb/java/rocksjni/sst_file_writerjni.cc b/src/rocksdb/java/rocksjni/sst_file_writerjni.cc
new file mode 100644
index 000000000..1898c3cfc
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/sst_file_writerjni.cc
@@ -0,0 +1,310 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ ROCKSDB_NAMESPACE::SstFileWriter methods
+// from Java side.
+
+#include <jni.h>
+
+#include <string>
+
+#include "include/org_rocksdb_SstFileWriter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    newSstFileWriter
+ * Signature: (JJJB)J
+ */
+jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJB(
+    JNIEnv * /*env*/, jclass /*jcls*/, jlong jenvoptions, jlong joptions,
+    jlong jcomparator_handle, jbyte jcomparator_type) {
+  ROCKSDB_NAMESPACE::Comparator *comparator = nullptr;
+  switch (jcomparator_type) {
+    // JAVA_COMPARATOR
+    case 0x0:
+      comparator = reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback *>(
+          jcomparator_handle);
+      break;
+
+    // JAVA_NATIVE_COMPARATOR_WRAPPER
+    case 0x1:
+      comparator =
+          reinterpret_cast<ROCKSDB_NAMESPACE::Comparator *>(jcomparator_handle);
+      break;
+  }
+  auto *env_options =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions *>(jenvoptions);
+  auto *options =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::Options *>(joptions);
+  ROCKSDB_NAMESPACE::SstFileWriter *sst_file_writer =
+      new ROCKSDB_NAMESPACE::SstFileWriter(*env_options, *options, comparator);
+  return GET_CPLUSPLUS_POINTER(sst_file_writer);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    newSstFileWriter
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv * /*env*/,
+                                                          jclass /*jcls*/,
+                                                          jlong jenvoptions,
+                                                          jlong joptions) {
+  auto *env_options =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions *>(jenvoptions);
+  auto *options =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::Options *>(joptions);
+  ROCKSDB_NAMESPACE::SstFileWriter *sst_file_writer =
+      new ROCKSDB_NAMESPACE::SstFileWriter(*env_options, *options);
+  return GET_CPLUSPLUS_POINTER(sst_file_writer);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    open
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jobject /*jobj*/,
+                                         jlong jhandle, jstring jfile_path) {
+  const char *file_path = env->GetStringUTFChars(jfile_path, nullptr);
+  if (file_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Open(
+          file_path);
+  env->ReleaseStringUTFChars(jfile_path, file_path);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    put
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jobject /*jobj*/,
+                                             jlong jhandle, jlong jkey_handle,
+                                             jlong jvalue_handle) {
+  auto *key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jkey_handle);
+  auto *value_slice =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jvalue_handle);
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Put(
+          *key_slice, *value_slice);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    put
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jobject /*jobj*/,
+                                                 jlong jhandle, jbyteArray jkey,
+                                                 jbyteArray jval) {
+  jbyte *key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char *>(key),
+                                     env->GetArrayLength(jkey));
+
+  jbyte *value = env->GetByteArrayElements(jval, nullptr);
+  if (value == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    return;
+  }
+  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char *>(value),
+                                       env->GetArrayLength(jval));
+
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Put(
+          key_slice, value_slice);
+
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  env->ReleaseByteArrayElements(jval, value, JNI_ABORT);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    putDirect
+ * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)V
+ */
+void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jobject /*jdb*/,
+                                              jlong jdb_handle, jobject jkey,
+                                              jint jkey_off, jint jkey_len,
+                                              jobject jval, jint jval_off,
+                                              jint jval_len) {
+  auto *writer =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jdb_handle);
+  auto put = [&env, &writer](ROCKSDB_NAMESPACE::Slice &key,
+                             ROCKSDB_NAMESPACE::Slice &value) {
+    ROCKSDB_NAMESPACE::Status s = writer->Put(key, value);
+    if (s.ok()) {
+      return;
+    }
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::kv_op_direct(put, env, jkey, jkey_off, jkey_len,
+                                           jval, jval_off, jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    fileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jobject /*jdb*/,
+                                              jlong jdb_handle) {
+  auto *writer =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jdb_handle);
+  return static_cast<jlong>(writer->FileSize());
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    merge
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jobject /*jobj*/,
+                                               jlong jhandle, jlong jkey_handle,
+                                               jlong jvalue_handle) {
+  auto *key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jkey_handle);
+  auto *value_slice =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jvalue_handle);
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Merge(
+          *key_slice, *value_slice);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    merge
+ * Signature: (J[B[B)V
+ */
+void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env,
+                                                   jobject /*jobj*/,
+                                                   jlong jhandle,
+                                                   jbyteArray jkey,
+                                                   jbyteArray jval) {
+  jbyte *key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char *>(key),
+                                     env->GetArrayLength(jkey));
+
+  jbyte *value = env->GetByteArrayElements(jval, nullptr);
+  if (value == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    return;
+  }
+  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char *>(value),
+                                       env->GetArrayLength(jval));
+
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Merge(
+          key_slice, value_slice);
+
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  env->ReleaseByteArrayElements(jval, value, JNI_ABORT);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    delete
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jobject /*jobj*/,
+                                                 jlong jhandle,
+                                                 jbyteArray jkey) {
+  jbyte *key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char *>(key),
+                                     env->GetArrayLength(jkey));
+
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Delete(
+          key_slice);
+
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    delete
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jobject /*jobj*/,
+                                               jlong jhandle,
+                                               jlong jkey_handle) {
+  auto *key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jkey_handle);
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Delete(
+          *key_slice);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    finish
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jobject /*jobj*/,
+                                           jlong jhandle) {
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Finish();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_SstFileWriter
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileWriter_disposeInternal(JNIEnv * /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong jhandle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle);
+}
diff --git a/src/rocksdb/java/rocksjni/sst_partitioner.cc b/src/rocksdb/java/rocksjni/sst_partitioner.cc
new file mode 100644
index 000000000..1cea3b0cb
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/sst_partitioner.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ ROCKSDB_NAMESPACE::SstFileManager methods
+// from Java side.
+
+#include "rocksdb/sst_partitioner.h"
+
+#include <jni.h>
+
+#include <memory>
+
+#include "include/org_rocksdb_SstPartitionerFixedPrefixFactory.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_SstPartitionerFixedPrefixFactory
+ * Method:    newSstPartitionerFixedPrefixFactory0
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_SstPartitionerFixedPrefixFactory_newSstPartitionerFixedPrefixFactory0(
+    JNIEnv*, jclass, jlong prefix_len) {
+  auto* ptr = new std::shared_ptr<ROCKSDB_NAMESPACE::SstPartitionerFactory>(
+      ROCKSDB_NAMESPACE::NewSstPartitionerFixedPrefixFactory(prefix_len));
+  return GET_CPLUSPLUS_POINTER(ptr);
+}
+
+/*
+ * Class:     org_rocksdb_SstPartitionerFixedPrefixFactory
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstPartitionerFixedPrefixFactory_disposeInternal(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* ptr = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::SstPartitionerFactory>*>(jhandle);
+  delete ptr;  // delete std::shared_ptr
+}
diff --git a/src/rocksdb/java/rocksjni/statistics.cc b/src/rocksdb/java/rocksjni/statistics.cc
new file mode 100644
index 000000000..bd405afa1
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/statistics.cc
@@ -0,0 +1,268 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::Statistics methods from Java side.
+
+#include "rocksdb/statistics.h"
+
+#include <jni.h>
+
+#include <memory>
+#include <set>
+
+#include "include/org_rocksdb_Statistics.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+#include "rocksjni/statisticsjni.h"
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    newStatistics
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_Statistics_newStatistics__(JNIEnv* env, jclass jcls) {
+  return Java_org_rocksdb_Statistics_newStatistics___3BJ(env, jcls, nullptr, 0);
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    newStatistics
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Statistics_newStatistics__J(
+    JNIEnv* env, jclass jcls, jlong jother_statistics_handle) {
+  return Java_org_rocksdb_Statistics_newStatistics___3BJ(
+      env, jcls, nullptr, jother_statistics_handle);
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    newStatistics
+ * Signature: ([B)J
+ */
+jlong Java_org_rocksdb_Statistics_newStatistics___3B(JNIEnv* env, jclass jcls,
+                                                     jbyteArray jhistograms) {
+  return Java_org_rocksdb_Statistics_newStatistics___3BJ(env, jcls, jhistograms,
+                                                         0);
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    newStatistics
+ * Signature: ([BJ)J
+ */
+jlong Java_org_rocksdb_Statistics_newStatistics___3BJ(
+    JNIEnv* env, jclass, jbyteArray jhistograms,
+    jlong jother_statistics_handle) {
+  std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>* pSptr_other_statistics =
+      nullptr;
+  if (jother_statistics_handle > 0) {
+    pSptr_other_statistics =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+            jother_statistics_handle);
+  }
+
+  std::set<uint32_t> histograms;
+  if (jhistograms != nullptr) {
+    const jsize len = env->GetArrayLength(jhistograms);
+    if (len > 0) {
+      jbyte* jhistogram = env->GetByteArrayElements(jhistograms, nullptr);
+      if (jhistogram == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return 0;
+      }
+
+      for (jsize i = 0; i < len; i++) {
+        const ROCKSDB_NAMESPACE::Histograms histogram =
+            ROCKSDB_NAMESPACE::HistogramTypeJni::toCppHistograms(jhistogram[i]);
+        histograms.emplace(histogram);
+      }
+
+      env->ReleaseByteArrayElements(jhistograms, jhistogram, JNI_ABORT);
+    }
+  }
+
+  std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> sptr_other_statistics =
+      nullptr;
+  if (pSptr_other_statistics != nullptr) {
+    sptr_other_statistics = *pSptr_other_statistics;
+  }
+
+  auto* pSptr_statistics =
+      new std::shared_ptr<ROCKSDB_NAMESPACE::StatisticsJni>(
+          new ROCKSDB_NAMESPACE::StatisticsJni(sptr_other_statistics,
+                                               histograms));
+
+  return GET_CPLUSPLUS_POINTER(pSptr_statistics);
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Statistics_disposeInternal(JNIEnv*, jobject,
+                                                 jlong jhandle) {
+  if (jhandle > 0) {
+    auto* pSptr_statistics =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+            jhandle);
+    delete pSptr_statistics;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    statsLevel
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Statistics_statsLevel(JNIEnv*, jobject, jlong jhandle) {
+  auto* pSptr_statistics =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+          jhandle);
+  assert(pSptr_statistics != nullptr);
+  return ROCKSDB_NAMESPACE::StatsLevelJni::toJavaStatsLevel(
+      pSptr_statistics->get()->get_stats_level());
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    setStatsLevel
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Statistics_setStatsLevel(JNIEnv*, jobject, jlong jhandle,
+                                               jbyte jstats_level) {
+  auto* pSptr_statistics =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+          jhandle);
+  assert(pSptr_statistics != nullptr);
+  auto stats_level =
+      ROCKSDB_NAMESPACE::StatsLevelJni::toCppStatsLevel(jstats_level);
+  pSptr_statistics->get()->set_stats_level(stats_level);
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    getTickerCount
+ * Signature: (JB)J
+ */
+jlong Java_org_rocksdb_Statistics_getTickerCount(JNIEnv*, jobject,
+                                                 jlong jhandle,
+                                                 jbyte jticker_type) {
+  auto* pSptr_statistics =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+          jhandle);
+  assert(pSptr_statistics != nullptr);
+  auto ticker = ROCKSDB_NAMESPACE::TickerTypeJni::toCppTickers(jticker_type);
+  uint64_t count = pSptr_statistics->get()->getTickerCount(ticker);
+  return static_cast<jlong>(count);
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    getAndResetTickerCount
+ * Signature: (JB)J
+ */
+jlong Java_org_rocksdb_Statistics_getAndResetTickerCount(JNIEnv*, jobject,
+                                                         jlong jhandle,
+                                                         jbyte jticker_type) {
+  auto* pSptr_statistics =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+          jhandle);
+  assert(pSptr_statistics != nullptr);
+  auto ticker = ROCKSDB_NAMESPACE::TickerTypeJni::toCppTickers(jticker_type);
+  return pSptr_statistics->get()->getAndResetTickerCount(ticker);
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    getHistogramData
+ * Signature: (JB)Lorg/rocksdb/HistogramData;
+ */
+jobject Java_org_rocksdb_Statistics_getHistogramData(JNIEnv* env, jobject,
+                                                     jlong jhandle,
+                                                     jbyte jhistogram_type) {
+  auto* pSptr_statistics =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+          jhandle);
+  assert(pSptr_statistics != nullptr);
+
+  // TODO(AR) perhaps better to construct a Java Object Wrapper that
+  //    uses ptr to C++ `new HistogramData`
+  ROCKSDB_NAMESPACE::HistogramData data;
+
+  auto histogram =
+      ROCKSDB_NAMESPACE::HistogramTypeJni::toCppHistograms(jhistogram_type);
+  pSptr_statistics->get()->histogramData(
+      static_cast<ROCKSDB_NAMESPACE::Histograms>(histogram), &data);
+
+  jclass jclazz = ROCKSDB_NAMESPACE::HistogramDataJni::getJClass(env);
+  if (jclazz == nullptr) {
+    // exception occurred accessing class
+    return nullptr;
+  }
+
+  jmethodID mid =
+      ROCKSDB_NAMESPACE::HistogramDataJni::getConstructorMethodId(env);
+  if (mid == nullptr) {
+    // exception occurred accessing method
+    return nullptr;
+  }
+
+  return env->NewObject(jclazz, mid, data.median, data.percentile95,
+                        data.percentile99, data.average,
+                        data.standard_deviation, data.max, data.count, data.sum,
+                        data.min);
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    getHistogramString
+ * Signature: (JB)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_Statistics_getHistogramString(JNIEnv* env, jobject,
+                                                       jlong jhandle,
+                                                       jbyte jhistogram_type) {
+  auto* pSptr_statistics =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+          jhandle);
+  assert(pSptr_statistics != nullptr);
+  auto histogram =
+      ROCKSDB_NAMESPACE::HistogramTypeJni::toCppHistograms(jhistogram_type);
+  auto str = pSptr_statistics->get()->getHistogramString(histogram);
+  return env->NewStringUTF(str.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    reset
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Statistics_reset(JNIEnv* env, jobject, jlong jhandle) {
+  auto* pSptr_statistics =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+          jhandle);
+  assert(pSptr_statistics != nullptr);
+  ROCKSDB_NAMESPACE::Status s = pSptr_statistics->get()->Reset();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    toString
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_Statistics_toString(JNIEnv* env, jobject,
+                                             jlong jhandle) {
+  auto* pSptr_statistics =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>*>(
+          jhandle);
+  assert(pSptr_statistics != nullptr);
+  auto str = pSptr_statistics->get()->ToString();
+  return env->NewStringUTF(str.c_str());
+}
diff --git a/src/rocksdb/java/rocksjni/statisticsjni.cc b/src/rocksdb/java/rocksjni/statisticsjni.cc
new file mode 100644
index 000000000..f46337893
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/statisticsjni.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Statistics
+
+#include "rocksjni/statisticsjni.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+StatisticsJni::StatisticsJni(std::shared_ptr<Statistics> stats)
+    : StatisticsImpl(stats), m_ignore_histograms() {}
+
+StatisticsJni::StatisticsJni(std::shared_ptr<Statistics> stats,
+                             const std::set<uint32_t> ignore_histograms)
+    : StatisticsImpl(stats), m_ignore_histograms(ignore_histograms) {}
+
+bool StatisticsJni::HistEnabledForType(uint32_t type) const {
+  if (type >= HISTOGRAM_ENUM_MAX) {
+    return false;
+  }
+
+  if (m_ignore_histograms.count(type) > 0) {
+    return false;
+  }
+
+  return true;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/statisticsjni.h b/src/rocksdb/java/rocksjni/statisticsjni.h
new file mode 100644
index 000000000..ce823f9b1
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/statisticsjni.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Statistics
+
+#ifndef JAVA_ROCKSJNI_STATISTICSJNI_H_
+#define JAVA_ROCKSJNI_STATISTICSJNI_H_
+
+#include <memory>
+#include <set>
+#include <string>
+
+#include "monitoring/statistics.h"
+#include "rocksdb/statistics.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class StatisticsJni : public StatisticsImpl {
+ public:
+  StatisticsJni(std::shared_ptr<Statistics> stats);
+  StatisticsJni(std::shared_ptr<Statistics> stats,
+                const std::set<uint32_t> ignore_histograms);
+  virtual bool HistEnabledForType(uint32_t type) const override;
+
+ private:
+  const std::set<uint32_t> m_ignore_histograms;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_STATISTICSJNI_H_
diff --git a/src/rocksdb/java/rocksjni/table.cc b/src/rocksdb/java/rocksjni/table.cc
new file mode 100644
index 000000000..0054e5c1f
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/table.cc
@@ -0,0 +1,161 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Options.
+
+#include "rocksdb/table.h"
+
+#include <jni.h>
+
+#include "include/org_rocksdb_BlockBasedTableConfig.h"
+#include "include/org_rocksdb_PlainTableConfig.h"
+#include "portal.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_PlainTableConfig
+ * Method:    newTableFactoryHandle
+ * Signature: (IIDIIBZZ)J
+ */
+jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
+    JNIEnv * /*env*/, jobject /*jobj*/, jint jkey_size,
+    jint jbloom_bits_per_key, jdouble jhash_table_ratio, jint jindex_sparseness,
+    jint jhuge_page_tlb_size, jbyte jencoding_type, jboolean jfull_scan_mode,
+    jboolean jstore_index_in_file) {
+  ROCKSDB_NAMESPACE::PlainTableOptions options =
+      ROCKSDB_NAMESPACE::PlainTableOptions();
+  options.user_key_len = jkey_size;
+  options.bloom_bits_per_key = jbloom_bits_per_key;
+  options.hash_table_ratio = jhash_table_ratio;
+  options.index_sparseness = jindex_sparseness;
+  options.huge_page_tlb_size = jhuge_page_tlb_size;
+  options.encoding_type =
+      static_cast<ROCKSDB_NAMESPACE::EncodingType>(jencoding_type);
+  options.full_scan_mode = jfull_scan_mode;
+  options.store_index_in_file = jstore_index_in_file;
+  return GET_CPLUSPLUS_POINTER(
+      ROCKSDB_NAMESPACE::NewPlainTableFactory(options));
+}
+
+/*
+ * Class:     org_rocksdb_BlockBasedTableConfig
+ * Method:    newTableFactoryHandle
+ * Signature: (ZZZZBBDBZJJJJIIIJZZZJZZIIZZBJIJI)J
+ */
+jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
+    JNIEnv *, jobject, jboolean jcache_index_and_filter_blocks,
+    jboolean jcache_index_and_filter_blocks_with_high_priority,
+    jboolean jpin_l0_filter_and_index_blocks_in_cache,
+    jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value,
+    jbyte jdata_block_index_type_value,
+    jdouble jdata_block_hash_table_util_ratio, jbyte jchecksum_type_value,
+    jboolean jno_block_cache, jlong jblock_cache_handle,
+    jlong jpersistent_cache_handle, jlong jblock_cache_compressed_handle,
+    jlong jblock_size, jint jblock_size_deviation, jint jblock_restart_interval,
+    jint jindex_block_restart_interval, jlong jmetadata_block_size,
+    jboolean jpartition_filters, jboolean joptimize_filters_for_memory,
+    jboolean juse_delta_encoding, jlong jfilter_policy_handle,
+    jboolean jwhole_key_filtering, jboolean jverify_compression,
+    jint jread_amp_bytes_per_bit, jint jformat_version,
+    jboolean jenable_index_compression, jboolean jblock_align,
+    jbyte jindex_shortening, jlong jblock_cache_size,
+    jint jblock_cache_num_shard_bits, jlong jblock_cache_compressed_size,
+    jint jblock_cache_compressed_num_shard_bits) {
+  ROCKSDB_NAMESPACE::BlockBasedTableOptions options;
+  options.cache_index_and_filter_blocks =
+      static_cast<bool>(jcache_index_and_filter_blocks);
+  options.cache_index_and_filter_blocks_with_high_priority =
+      static_cast<bool>(jcache_index_and_filter_blocks_with_high_priority);
+  options.pin_l0_filter_and_index_blocks_in_cache =
+      static_cast<bool>(jpin_l0_filter_and_index_blocks_in_cache);
+  options.pin_top_level_index_and_filter =
+      static_cast<bool>(jpin_top_level_index_and_filter);
+  options.index_type =
+      ROCKSDB_NAMESPACE::IndexTypeJni::toCppIndexType(jindex_type_value);
+  options.data_block_index_type =
+      ROCKSDB_NAMESPACE::DataBlockIndexTypeJni::toCppDataBlockIndexType(
+          jdata_block_index_type_value);
+  options.data_block_hash_table_util_ratio =
+      static_cast<double>(jdata_block_hash_table_util_ratio);
+  options.checksum = ROCKSDB_NAMESPACE::ChecksumTypeJni::toCppChecksumType(
+      jchecksum_type_value);
+  options.no_block_cache = static_cast<bool>(jno_block_cache);
+  if (options.no_block_cache) {
+    options.block_cache = nullptr;
+  } else {
+    if (jblock_cache_handle > 0) {
+      std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *pCache =
+          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+              jblock_cache_handle);
+      options.block_cache = *pCache;
+    } else if (jblock_cache_size >= 0) {
+      if (jblock_cache_num_shard_bits > 0) {
+        options.block_cache = ROCKSDB_NAMESPACE::NewLRUCache(
+            static_cast<size_t>(jblock_cache_size),
+            static_cast<int>(jblock_cache_num_shard_bits));
+      } else {
+        options.block_cache = ROCKSDB_NAMESPACE::NewLRUCache(
+            static_cast<size_t>(jblock_cache_size));
+      }
+    } else {
+      options.no_block_cache = true;
+      options.block_cache = nullptr;
+    }
+  }
+  if (jpersistent_cache_handle > 0) {
+    std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache> *pCache =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache> *>(
+            jpersistent_cache_handle);
+    options.persistent_cache = *pCache;
+  }
+  if (jblock_cache_compressed_handle > 0) {
+    std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *pCache =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+            jblock_cache_compressed_handle);
+    options.block_cache_compressed = *pCache;
+  } else if (jblock_cache_compressed_size > 0) {
+    if (jblock_cache_compressed_num_shard_bits > 0) {
+      options.block_cache_compressed = ROCKSDB_NAMESPACE::NewLRUCache(
+          static_cast<size_t>(jblock_cache_compressed_size),
+          static_cast<int>(jblock_cache_compressed_num_shard_bits));
+    } else {
+      options.block_cache_compressed = ROCKSDB_NAMESPACE::NewLRUCache(
+          static_cast<size_t>(jblock_cache_compressed_size));
+    }
+  }
+  options.block_size = static_cast<size_t>(jblock_size);
+  options.block_size_deviation = static_cast<int>(jblock_size_deviation);
+  options.block_restart_interval = static_cast<int>(jblock_restart_interval);
+  options.index_block_restart_interval =
+      static_cast<int>(jindex_block_restart_interval);
+  options.metadata_block_size = static_cast<uint64_t>(jmetadata_block_size);
+  options.partition_filters = static_cast<bool>(jpartition_filters);
+  options.optimize_filters_for_memory =
+      static_cast<bool>(joptimize_filters_for_memory);
+  options.use_delta_encoding = static_cast<bool>(juse_delta_encoding);
+  if (jfilter_policy_handle > 0) {
+    std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy> *pFilterPolicy =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy> *>(
+            jfilter_policy_handle);
+    options.filter_policy = *pFilterPolicy;
+  }
+  options.whole_key_filtering = static_cast<bool>(jwhole_key_filtering);
+  options.verify_compression = static_cast<bool>(jverify_compression);
+  options.read_amp_bytes_per_bit =
+      static_cast<uint32_t>(jread_amp_bytes_per_bit);
+  options.format_version = static_cast<uint32_t>(jformat_version);
+  options.enable_index_compression =
+      static_cast<bool>(jenable_index_compression);
+  options.block_align = static_cast<bool>(jblock_align);
+  options.index_shortening =
+      ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode(
+          jindex_shortening);
+
+  return GET_CPLUSPLUS_POINTER(
+      ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(options));
+}
diff --git a/src/rocksdb/java/rocksjni/table_filter.cc b/src/rocksdb/java/rocksjni/table_filter.cc
new file mode 100644
index 000000000..1400fa1d9
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/table_filter.cc
@@ -0,0 +1,27 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// org.rocksdb.AbstractTableFilter.
+
+#include <jni.h>
+
+#include <memory>
+
+#include "include/org_rocksdb_AbstractTableFilter.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/table_filter_jnicallback.h"
+
+/*
+ * Class:     org_rocksdb_AbstractTableFilter
+ * Method:    createNewTableFilter
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_AbstractTableFilter_createNewTableFilter(
+    JNIEnv* env, jobject jtable_filter) {
+  auto* table_filter_jnicallback =
+      new ROCKSDB_NAMESPACE::TableFilterJniCallback(env, jtable_filter);
+  return GET_CPLUSPLUS_POINTER(table_filter_jnicallback);
+}
diff --git a/src/rocksdb/java/rocksjni/table_filter_jnicallback.cc b/src/rocksdb/java/rocksjni/table_filter_jnicallback.cc
new file mode 100644
index 000000000..5350c5cee
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/table_filter_jnicallback.cc
@@ -0,0 +1,66 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::TableFilter.
+
+#include "rocksjni/table_filter_jnicallback.h"
+
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+TableFilterJniCallback::TableFilterJniCallback(JNIEnv* env,
+                                               jobject jtable_filter)
+    : JniCallback(env, jtable_filter) {
+  m_jfilter_methodid = AbstractTableFilterJni::getFilterMethod(env);
+  if (m_jfilter_methodid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  // create the function reference
+  /*
+  Note the JNI ENV must be obtained/release
+  on each call to the function itself as
+  it may be called from multiple threads
+  */
+  m_table_filter_function =
+      [this](const ROCKSDB_NAMESPACE::TableProperties& table_properties) {
+        jboolean attached_thread = JNI_FALSE;
+        JNIEnv* thread_env = getJniEnv(&attached_thread);
+        assert(thread_env != nullptr);
+
+        // create a Java TableProperties object
+        jobject jtable_properties = TablePropertiesJni::fromCppTableProperties(
+            thread_env, table_properties);
+        if (jtable_properties == nullptr) {
+          // exception thrown from fromCppTableProperties
+          thread_env->ExceptionDescribe();  // print out exception to stderr
+          releaseJniEnv(attached_thread);
+          return false;
+        }
+
+        jboolean result = thread_env->CallBooleanMethod(
+            m_jcallback_obj, m_jfilter_methodid, jtable_properties);
+        if (thread_env->ExceptionCheck()) {
+          // exception thrown from CallBooleanMethod
+          thread_env->DeleteLocalRef(jtable_properties);
+          thread_env->ExceptionDescribe();  // print out exception to stderr
+          releaseJniEnv(attached_thread);
+          return false;
+        }
+
+        // ok... cleanup and then return
+        releaseJniEnv(attached_thread);
+        return static_cast<bool>(result);
+      };
+}
+
+std::function<bool(const ROCKSDB_NAMESPACE::TableProperties&)>
+TableFilterJniCallback::GetTableFilterFunction() {
+  return m_table_filter_function;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/table_filter_jnicallback.h b/src/rocksdb/java/rocksjni/table_filter_jnicallback.h
new file mode 100644
index 000000000..0ef404ca2
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/table_filter_jnicallback.h
@@ -0,0 +1,36 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::TableFilter.
+
+#ifndef JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/table_properties.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFilterJniCallback : public JniCallback {
+ public:
+  TableFilterJniCallback(JNIEnv* env, jobject jtable_filter);
+  std::function<bool(const ROCKSDB_NAMESPACE::TableProperties&)>
+  GetTableFilterFunction();
+
+ private:
+  jmethodID m_jfilter_methodid;
+  std::function<bool(const ROCKSDB_NAMESPACE::TableProperties&)>
+      m_table_filter_function;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/testable_event_listener.cc b/src/rocksdb/java/rocksjni/testable_event_listener.cc
new file mode 100644
index 000000000..71188bc3c
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/testable_event_listener.cc
@@ -0,0 +1,219 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include <climits>
+#include <cstdint>
+#include <iostream>
+#include <utility>
+
+#include "include/org_rocksdb_test_TestableEventListener.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+
+using ROCKSDB_NAMESPACE::BackgroundErrorReason;
+using ROCKSDB_NAMESPACE::CompactionJobInfo;
+using ROCKSDB_NAMESPACE::CompactionJobStats;
+using ROCKSDB_NAMESPACE::CompactionReason;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::ExternalFileIngestionInfo;
+using ROCKSDB_NAMESPACE::FileOperationInfo;
+using ROCKSDB_NAMESPACE::FileOperationType;
+using ROCKSDB_NAMESPACE::FlushJobInfo;
+using ROCKSDB_NAMESPACE::FlushReason;
+using ROCKSDB_NAMESPACE::MemTableInfo;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::TableFileCreationBriefInfo;
+using ROCKSDB_NAMESPACE::TableFileCreationInfo;
+using ROCKSDB_NAMESPACE::TableFileCreationReason;
+using ROCKSDB_NAMESPACE::TableFileDeletionInfo;
+using ROCKSDB_NAMESPACE::TableProperties;
+using ROCKSDB_NAMESPACE::WriteStallCondition;
+using ROCKSDB_NAMESPACE::WriteStallInfo;
+
+static TableProperties newTablePropertiesForTest() {
+  TableProperties table_properties;
+  table_properties.data_size = UINT64_MAX;
+  table_properties.index_size = UINT64_MAX;
+  table_properties.index_partitions = UINT64_MAX;
+  table_properties.top_level_index_size = UINT64_MAX;
+  table_properties.index_key_is_user_key = UINT64_MAX;
+  table_properties.index_value_is_delta_encoded = UINT64_MAX;
+  table_properties.filter_size = UINT64_MAX;
+  table_properties.raw_key_size = UINT64_MAX;
+  table_properties.raw_value_size = UINT64_MAX;
+  table_properties.num_data_blocks = UINT64_MAX;
+  table_properties.num_entries = UINT64_MAX;
+  table_properties.num_deletions = UINT64_MAX;
+  table_properties.num_merge_operands = UINT64_MAX;
+  table_properties.num_range_deletions = UINT64_MAX;
+  table_properties.format_version = UINT64_MAX;
+  table_properties.fixed_key_len = UINT64_MAX;
+  table_properties.column_family_id = UINT64_MAX;
+  table_properties.creation_time = UINT64_MAX;
+  table_properties.oldest_key_time = UINT64_MAX;
+  table_properties.file_creation_time = UINT64_MAX;
+  table_properties.slow_compression_estimated_data_size = UINT64_MAX;
+  table_properties.fast_compression_estimated_data_size = UINT64_MAX;
+  table_properties.external_sst_file_global_seqno_offset = UINT64_MAX;
+  table_properties.db_id = "dbId";
+  table_properties.db_session_id = "sessionId";
+  table_properties.column_family_name = "columnFamilyName";
+  table_properties.filter_policy_name = "filterPolicyName";
+  table_properties.comparator_name = "comparatorName";
+  table_properties.merge_operator_name = "mergeOperatorName";
+  table_properties.prefix_extractor_name = "prefixExtractorName";
+  table_properties.property_collectors_names = "propertyCollectorsNames";
+  table_properties.compression_name = "compressionName";
+  table_properties.compression_options = "compressionOptions";
+  table_properties.user_collected_properties = {{"key", "value"}};
+  table_properties.readable_properties = {{"key", "value"}};
+  return table_properties;
+}
+
+/*
+ * Class:     org_rocksdb_test_TestableEventListener
+ * Method:    invokeAllCallbacks
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks(
+    JNIEnv *, jclass, jlong jhandle) {
+  const auto &el =
+      *reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener> *>(
+          jhandle);
+
+  TableProperties table_properties = newTablePropertiesForTest();
+
+  FlushJobInfo flush_job_info;
+  flush_job_info.cf_id = INT_MAX;
+  flush_job_info.cf_name = "testColumnFamily";
+  flush_job_info.file_path = "/file/path";
+  flush_job_info.file_number = UINT64_MAX;
+  flush_job_info.oldest_blob_file_number = UINT64_MAX;
+  flush_job_info.thread_id = UINT64_MAX;
+  flush_job_info.job_id = INT_MAX;
+  flush_job_info.triggered_writes_slowdown = true;
+  flush_job_info.triggered_writes_stop = true;
+  flush_job_info.smallest_seqno = UINT64_MAX;
+  flush_job_info.largest_seqno = UINT64_MAX;
+  flush_job_info.table_properties = table_properties;
+  flush_job_info.flush_reason = FlushReason::kManualFlush;
+
+  el->OnFlushCompleted(nullptr, flush_job_info);
+  el->OnFlushBegin(nullptr, flush_job_info);
+
+  Status status = Status::Incomplete(Status::SubCode::kNoSpace);
+
+  TableFileDeletionInfo file_deletion_info;
+  file_deletion_info.db_name = "dbName";
+  file_deletion_info.file_path = "/file/path";
+  file_deletion_info.job_id = INT_MAX;
+  file_deletion_info.status = status;
+
+  el->OnTableFileDeleted(file_deletion_info);
+
+  CompactionJobInfo compaction_job_info;
+  compaction_job_info.cf_id = UINT32_MAX;
+  compaction_job_info.cf_name = "compactionColumnFamily";
+  compaction_job_info.status = status;
+  compaction_job_info.thread_id = UINT64_MAX;
+  compaction_job_info.job_id = INT_MAX;
+  compaction_job_info.base_input_level = INT_MAX;
+  compaction_job_info.output_level = INT_MAX;
+  compaction_job_info.input_files = {"inputFile.sst"};
+  compaction_job_info.input_file_infos = {};
+  compaction_job_info.output_files = {"outputFile.sst"};
+  compaction_job_info.output_file_infos = {};
+  compaction_job_info.table_properties = {
+      {"tableProperties", std::shared_ptr<TableProperties>(
+                              &table_properties, [](TableProperties *) {})}};
+  compaction_job_info.compaction_reason = CompactionReason::kFlush;
+  compaction_job_info.compression = CompressionType::kSnappyCompression;
+
+  compaction_job_info.stats = CompactionJobStats();
+
+  el->OnCompactionBegin(nullptr, compaction_job_info);
+  el->OnCompactionCompleted(nullptr, compaction_job_info);
+
+  TableFileCreationInfo file_creation_info;
+  file_creation_info.file_size = UINT64_MAX;
+  file_creation_info.table_properties = table_properties;
+  file_creation_info.status = status;
+  file_creation_info.file_checksum = "fileChecksum";
+  file_creation_info.file_checksum_func_name = "fileChecksumFuncName";
+  file_creation_info.db_name = "dbName";
+  file_creation_info.cf_name = "columnFamilyName";
+  file_creation_info.file_path = "/file/path";
+  file_creation_info.job_id = INT_MAX;
+  file_creation_info.reason = TableFileCreationReason::kMisc;
+
+  el->OnTableFileCreated(file_creation_info);
+
+  TableFileCreationBriefInfo file_creation_brief_info;
+  file_creation_brief_info.db_name = "dbName";
+  file_creation_brief_info.cf_name = "columnFamilyName";
+  file_creation_brief_info.file_path = "/file/path";
+  file_creation_brief_info.job_id = INT_MAX;
+  file_creation_brief_info.reason = TableFileCreationReason::kMisc;
+
+  el->OnTableFileCreationStarted(file_creation_brief_info);
+
+  MemTableInfo mem_table_info;
+  mem_table_info.cf_name = "columnFamilyName";
+  mem_table_info.first_seqno = UINT64_MAX;
+  mem_table_info.earliest_seqno = UINT64_MAX;
+  mem_table_info.num_entries = UINT64_MAX;
+  mem_table_info.num_deletes = UINT64_MAX;
+
+  el->OnMemTableSealed(mem_table_info);
+  el->OnColumnFamilyHandleDeletionStarted(nullptr);
+
+  ExternalFileIngestionInfo file_ingestion_info;
+  file_ingestion_info.cf_name = "columnFamilyName";
+  file_ingestion_info.external_file_path = "/external/file/path";
+  file_ingestion_info.internal_file_path = "/internal/file/path";
+  file_ingestion_info.global_seqno = UINT64_MAX;
+  file_ingestion_info.table_properties = table_properties;
+  el->OnExternalFileIngested(nullptr, file_ingestion_info);
+
+  el->OnBackgroundError(BackgroundErrorReason::kFlush, &status);
+
+  WriteStallInfo write_stall_info;
+  write_stall_info.cf_name = "columnFamilyName";
+  write_stall_info.condition.cur = WriteStallCondition::kDelayed;
+  write_stall_info.condition.prev = WriteStallCondition::kStopped;
+  el->OnStallConditionsChanged(write_stall_info);
+
+  const std::string file_path = "/file/path";
+  const auto start_timestamp =
+      std::make_pair(std::chrono::time_point<std::chrono::system_clock,
+                                             std::chrono::nanoseconds>(
+                         std::chrono::nanoseconds(1600699420000000000ll)),
+                     std::chrono::time_point<std::chrono::steady_clock,
+                                             std::chrono::nanoseconds>(
+                         std::chrono::nanoseconds(1600699420000000000ll)));
+  const auto finish_timestamp =
+      std::chrono::time_point<std::chrono::steady_clock,
+                              std::chrono::nanoseconds>(
+          std::chrono::nanoseconds(1600699425000000000ll));
+  FileOperationInfo op_info =
+      FileOperationInfo(FileOperationType::kRead, file_path, start_timestamp,
+                        finish_timestamp, status);
+  op_info.offset = UINT64_MAX;
+  op_info.length = SIZE_MAX;
+
+  el->OnFileReadFinish(op_info);
+  el->OnFileWriteFinish(op_info);
+  el->OnFileFlushFinish(op_info);
+  el->OnFileSyncFinish(op_info);
+  el->OnFileRangeSyncFinish(op_info);
+  el->OnFileTruncateFinish(op_info);
+  el->OnFileCloseFinish(op_info);
+  el->ShouldBeNotifiedOnFileIO();
+
+  bool auto_recovery;
+  el->OnErrorRecoveryBegin(BackgroundErrorReason::kFlush, status,
+                           &auto_recovery);
+  el->OnErrorRecoveryCompleted(status);
+}
diff --git a/src/rocksdb/java/rocksjni/thread_status.cc b/src/rocksdb/java/rocksjni/thread_status.cc
new file mode 100644
index 000000000..c600f6cd5
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/thread_status.cc
@@ -0,0 +1,125 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::ThreadStatus methods from Java side.
+
+#include "rocksdb/thread_status.h"
+
+#include <jni.h>
+
+#include "include/org_rocksdb_ThreadStatus.h"
+#include "portal.h"
+
+/*
+ * Class:     org_rocksdb_ThreadStatus
+ * Method:    getThreadTypeName
+ * Signature: (B)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_ThreadStatus_getThreadTypeName(
+    JNIEnv* env, jclass, jbyte jthread_type_value) {
+  auto name = ROCKSDB_NAMESPACE::ThreadStatus::GetThreadTypeName(
+      ROCKSDB_NAMESPACE::ThreadTypeJni::toCppThreadType(jthread_type_value));
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &name, true);
+}
+
+/*
+ * Class:     org_rocksdb_ThreadStatus
+ * Method:    getOperationName
+ * Signature: (B)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_ThreadStatus_getOperationName(
+    JNIEnv* env, jclass, jbyte joperation_type_value) {
+  auto name = ROCKSDB_NAMESPACE::ThreadStatus::GetOperationName(
+      ROCKSDB_NAMESPACE::OperationTypeJni::toCppOperationType(
+          joperation_type_value));
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &name, true);
+}
+
+/*
+ * Class:     org_rocksdb_ThreadStatus
+ * Method:    microsToStringNative
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_ThreadStatus_microsToStringNative(JNIEnv* env, jclass,
+                                                           jlong jmicros) {
+  auto str = ROCKSDB_NAMESPACE::ThreadStatus::MicrosToString(
+      static_cast<uint64_t>(jmicros));
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &str, true);
+}
+
+/*
+ * Class:     org_rocksdb_ThreadStatus
+ * Method:    getOperationStageName
+ * Signature: (B)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_ThreadStatus_getOperationStageName(
+    JNIEnv* env, jclass, jbyte joperation_stage_value) {
+  auto name = ROCKSDB_NAMESPACE::ThreadStatus::GetOperationStageName(
+      ROCKSDB_NAMESPACE::OperationStageJni::toCppOperationStage(
+          joperation_stage_value));
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &name, true);
+}
+
+/*
+ * Class:     org_rocksdb_ThreadStatus
+ * Method:    getOperationPropertyName
+ * Signature: (BI)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_ThreadStatus_getOperationPropertyName(
+    JNIEnv* env, jclass, jbyte joperation_type_value, jint jindex) {
+  auto name = ROCKSDB_NAMESPACE::ThreadStatus::GetOperationPropertyName(
+      ROCKSDB_NAMESPACE::OperationTypeJni::toCppOperationType(
+          joperation_type_value),
+      static_cast<int>(jindex));
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &name, true);
+}
+
+/*
+ * Class:     org_rocksdb_ThreadStatus
+ * Method:    interpretOperationProperties
+ * Signature: (B[J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_ThreadStatus_interpretOperationProperties(
+    JNIEnv* env, jclass, jbyte joperation_type_value,
+    jlongArray joperation_properties) {
+  // convert joperation_properties
+  const jsize len = env->GetArrayLength(joperation_properties);
+  const std::unique_ptr<uint64_t[]> op_properties(new uint64_t[len]);
+  jlong* jop = env->GetLongArrayElements(joperation_properties, nullptr);
+  if (jop == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  for (jsize i = 0; i < len; i++) {
+    op_properties[i] = static_cast<uint64_t>(jop[i]);
+  }
+  env->ReleaseLongArrayElements(joperation_properties, jop, JNI_ABORT);
+
+  // call the function
+  auto result = ROCKSDB_NAMESPACE::ThreadStatus::InterpretOperationProperties(
+      ROCKSDB_NAMESPACE::OperationTypeJni::toCppOperationType(
+          joperation_type_value),
+      op_properties.get());
+  jobject jresult = ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(env, &result);
+  if (env->ExceptionCheck()) {
+    // exception occurred
+    return nullptr;
+  }
+
+  return jresult;
+}
+
+/*
+ * Class:     org_rocksdb_ThreadStatus
+ * Method:    getStateName
+ * Signature: (B)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_ThreadStatus_getStateName(JNIEnv* env, jclass,
+                                                   jbyte jstate_type_value) {
+  auto name = ROCKSDB_NAMESPACE::ThreadStatus::GetStateName(
+      ROCKSDB_NAMESPACE::StateTypeJni::toCppStateType(jstate_type_value));
+  return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &name, true);
+}
diff --git a/src/rocksdb/java/rocksjni/trace_writer.cc b/src/rocksdb/java/rocksjni/trace_writer.cc
new file mode 100644
index 000000000..d58276399
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/trace_writer.cc
@@ -0,0 +1,24 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::CompactionFilterFactory.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_AbstractTraceWriter.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/trace_writer_jnicallback.h"
+
+/*
+ * Class:     org_rocksdb_AbstractTraceWriter
+ * Method:    createNewTraceWriter
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_AbstractTraceWriter_createNewTraceWriter(JNIEnv* env,
+                                                                jobject jobj) {
+  auto* trace_writer = new ROCKSDB_NAMESPACE::TraceWriterJniCallback(env, jobj);
+  return GET_CPLUSPLUS_POINTER(trace_writer);
+}
diff --git a/src/rocksdb/java/rocksjni/trace_writer_jnicallback.cc b/src/rocksdb/java/rocksjni/trace_writer_jnicallback.cc
new file mode 100644
index 000000000..d1ed32038
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/trace_writer_jnicallback.cc
@@ -0,0 +1,118 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::TraceWriter.
+
+#include "rocksjni/trace_writer_jnicallback.h"
+
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+TraceWriterJniCallback::TraceWriterJniCallback(JNIEnv* env,
+                                               jobject jtrace_writer)
+    : JniCallback(env, jtrace_writer) {
+  m_jwrite_proxy_methodid = AbstractTraceWriterJni::getWriteProxyMethodId(env);
+  if (m_jwrite_proxy_methodid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  m_jclose_writer_proxy_methodid =
+      AbstractTraceWriterJni::getCloseWriterProxyMethodId(env);
+  if (m_jclose_writer_proxy_methodid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  m_jget_file_size_methodid =
+      AbstractTraceWriterJni::getGetFileSizeMethodId(env);
+  if (m_jget_file_size_methodid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+}
+
+Status TraceWriterJniCallback::Write(const Slice& data) {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  if (env == nullptr) {
+    return Status::IOError("Unable to attach JNI Environment");
+  }
+
+  jshort jstatus =
+      env->CallShortMethod(m_jcallback_obj, m_jwrite_proxy_methodid, &data);
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallShortMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return Status::IOError(
+        "Unable to call AbstractTraceWriter#writeProxy(long)");
+  }
+
+  // unpack status code and status sub-code from jstatus
+  jbyte jcode_value = (jstatus >> 8) & 0xFF;
+  jbyte jsub_code_value = jstatus & 0xFF;
+  std::unique_ptr<Status> s =
+      StatusJni::toCppStatus(jcode_value, jsub_code_value);
+
+  releaseJniEnv(attached_thread);
+
+  return Status(*s);
+}
+
+Status TraceWriterJniCallback::Close() {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  if (env == nullptr) {
+    return Status::IOError("Unable to attach JNI Environment");
+  }
+
+  jshort jstatus =
+      env->CallShortMethod(m_jcallback_obj, m_jclose_writer_proxy_methodid);
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallShortMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return Status::IOError(
+        "Unable to call AbstractTraceWriter#closeWriterProxy()");
+  }
+
+  // unpack status code and status sub-code from jstatus
+  jbyte code_value = (jstatus >> 8) & 0xFF;
+  jbyte sub_code_value = jstatus & 0xFF;
+  std::unique_ptr<Status> s =
+      StatusJni::toCppStatus(code_value, sub_code_value);
+
+  releaseJniEnv(attached_thread);
+
+  return Status(*s);
+}
+
+uint64_t TraceWriterJniCallback::GetFileSize() {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  if (env == nullptr) {
+    return 0;
+  }
+
+  jlong jfile_size =
+      env->CallLongMethod(m_jcallback_obj, m_jget_file_size_methodid);
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallLongMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return 0;
+  }
+
+  releaseJniEnv(attached_thread);
+
+  return static_cast<uint64_t>(jfile_size);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/trace_writer_jnicallback.h b/src/rocksdb/java/rocksjni/trace_writer_jnicallback.h
new file mode 100644
index 000000000..c82a3a72c
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/trace_writer_jnicallback.h
@@ -0,0 +1,36 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::TraceWriter.
+
+#ifndef JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <memory>
+
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TraceWriterJniCallback : public JniCallback, public TraceWriter {
+ public:
+  TraceWriterJniCallback(JNIEnv* env, jobject jtrace_writer);
+  virtual Status Write(const Slice& data);
+  virtual Status Close();
+  virtual uint64_t GetFileSize();
+
+ private:
+  jmethodID m_jwrite_proxy_methodid;
+  jmethodID m_jclose_writer_proxy_methodid;
+  jmethodID m_jget_file_size_methodid;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/transaction.cc b/src/rocksdb/java/rocksjni/transaction.cc
new file mode 100644
index 000000000..1a0a64fc7
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/transaction.cc
@@ -0,0 +1,1655 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for ROCKSDB_NAMESPACE::Transaction.
+
+#include "rocksdb/utilities/transaction.h"
+
+#include <jni.h>
+
+#include <functional>
+
+#include "include/org_rocksdb_Transaction.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4503)  // identifier' : decorated name length
+                                 // exceeded, name was truncated
+#endif
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    setSnapshot
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_setSnapshot(JNIEnv* /*env*/, jobject /*jobj*/,
+                                              jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  txn->SetSnapshot();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    setSnapshotOnNextOperation
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__J(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  txn->SetSnapshotOnNextOperation(nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    setSnapshotOnNextOperation
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__JJ(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jtxn_notifier_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* txn_notifier = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::TransactionNotifierJniCallback>*>(
+      jtxn_notifier_handle);
+  txn->SetSnapshotOnNextOperation(*txn_notifier);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getSnapshot
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getSnapshot(JNIEnv* /*env*/,
+                                               jobject /*jobj*/,
+                                               jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  const ROCKSDB_NAMESPACE::Snapshot* snapshot = txn->GetSnapshot();
+  return GET_CPLUSPLUS_POINTER(snapshot);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    clearSnapshot
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_clearSnapshot(JNIEnv* /*env*/,
+                                                jobject /*jobj*/,
+                                                jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  txn->ClearSnapshot();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    prepare
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_prepare(JNIEnv* env, jobject /*jobj*/,
+                                          jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  ROCKSDB_NAMESPACE::Status s = txn->Prepare();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    commit
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_commit(JNIEnv* env, jobject /*jobj*/,
+                                         jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  ROCKSDB_NAMESPACE::Status s = txn->Commit();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    rollback
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_rollback(JNIEnv* env, jobject /*jobj*/,
+                                           jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  ROCKSDB_NAMESPACE::Status s = txn->Rollback();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    setSavePoint
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_setSavePoint(JNIEnv* /*env*/,
+                                               jobject /*jobj*/,
+                                               jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  txn->SetSavePoint();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    rollbackToSavePoint
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_rollbackToSavePoint(JNIEnv* env,
+                                                      jobject /*jobj*/,
+                                                      jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  ROCKSDB_NAMESPACE::Status s = txn->RollbackToSavePoint();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+typedef std::function<ROCKSDB_NAMESPACE::Status(
+    const ROCKSDB_NAMESPACE::ReadOptions&, const ROCKSDB_NAMESPACE::Slice&,
+    std::string*)>
+    FnGet;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+jbyteArray txn_get_helper(JNIEnv* env, const FnGet& fn_get,
+                          const jlong& jread_options_handle,
+                          const jbyteArray& jkey, const jint& jkey_part_len) {
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
+                                     jkey_part_len);
+
+  auto* read_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle);
+  std::string value;
+  ROCKSDB_NAMESPACE::Status s = fn_get(*read_options, key_slice, &value);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.IsNotFound()) {
+    return nullptr;
+  }
+
+  if (s.ok()) {
+    jbyteArray jret_value = env->NewByteArray(static_cast<jsize>(value.size()));
+    if (jret_value == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+    env->SetByteArrayRegion(
+        jret_value, 0, static_cast<jsize>(value.size()),
+        const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value.c_str())));
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      return nullptr;
+    }
+    return jret_value;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  return nullptr;
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    get
+ * Signature: (JJ[BIJ)[B
+ */
+jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jbyteArray jkey, jint jkey_part_len, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnGet fn_get =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          const ROCKSDB_NAMESPACE::ReadOptions&,
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::Slice&, std::string*)>(
+          &ROCKSDB_NAMESPACE::Transaction::Get, txn, std::placeholders::_1,
+          column_family_handle, std::placeholders::_2, std::placeholders::_3);
+  return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    get
+ * Signature: (JJ[BI)[B
+ */
+jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jbyteArray jkey, jint jkey_part_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnGet fn_get =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          const ROCKSDB_NAMESPACE::ReadOptions&,
+          const ROCKSDB_NAMESPACE::Slice&, std::string*)>(
+          &ROCKSDB_NAMESPACE::Transaction::Get, txn, std::placeholders::_1,
+          std::placeholders::_2, std::placeholders::_3);
+  return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len);
+}
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+// used by txn_multi_get_helper below
+std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> txn_column_families_helper(
+    JNIEnv* env, jlongArray jcolumn_family_handles, bool* has_exception) {
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  if (jcolumn_family_handles != nullptr) {
+    const jsize len_cols = env->GetArrayLength(jcolumn_family_handles);
+    if (len_cols > 0) {
+      jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr);
+      if (jcfh == nullptr) {
+        // exception thrown: OutOfMemoryError
+        *has_exception = JNI_TRUE;
+        return std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>();
+      }
+      for (int i = 0; i < len_cols; i++) {
+        auto* cf_handle =
+            reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcfh[i]);
+        cf_handles.push_back(cf_handle);
+      }
+      env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT);
+    }
+  }
+  return cf_handles;
+}
+
+typedef std::function<std::vector<ROCKSDB_NAMESPACE::Status>(
+    const ROCKSDB_NAMESPACE::ReadOptions&,
+    const std::vector<ROCKSDB_NAMESPACE::Slice>&, std::vector<std::string>*)>
+    FnMultiGet;
+
+void free_parts(
+    JNIEnv* env,
+    std::vector<std::tuple<jbyteArray, jbyte*, jobject>>& parts_to_free) {
+  for (auto& value : parts_to_free) {
+    jobject jk;
+    jbyteArray jk_ba;
+    jbyte* jk_val;
+    std::tie(jk_ba, jk_val, jk) = value;
+    env->ReleaseByteArrayElements(jk_ba, jk_val, JNI_ABORT);
+    env->DeleteLocalRef(jk);
+  }
+}
+
+void free_key_values(std::vector<jbyte*>& keys_to_free) {
+  for (auto& key : keys_to_free) {
+    delete[] key;
+  }
+}
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+// cf multi get
+jobjectArray txn_multi_get_helper(JNIEnv* env, const FnMultiGet& fn_multi_get,
+                                  const jlong& jread_options_handle,
+                                  const jobjectArray& jkey_parts) {
+  const jsize len_key_parts = env->GetArrayLength(jkey_parts);
+
+  std::vector<ROCKSDB_NAMESPACE::Slice> key_parts;
+  std::vector<jbyte*> keys_to_free;
+  for (int i = 0; i < len_key_parts; i++) {
+    const jobject jk = env->GetObjectArrayElement(jkey_parts, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      free_key_values(keys_to_free);
+      return nullptr;
+    }
+    jbyteArray jk_ba = reinterpret_cast<jbyteArray>(jk);
+    const jsize len_key = env->GetArrayLength(jk_ba);
+    jbyte* jk_val = new jbyte[len_key];
+    if (jk_val == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jk);
+      free_key_values(keys_to_free);
+
+      jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError");
+      (env)->ThrowNew(exception_cls,
+                      "Insufficient Memory for CF handle array.");
+      return nullptr;
+    }
+    env->GetByteArrayRegion(jk_ba, 0, len_key, jk_val);
+
+    ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(jk_val),
+                                       len_key);
+    key_parts.push_back(key_slice);
+    keys_to_free.push_back(jk_val);
+    env->DeleteLocalRef(jk);
+  }
+
+  auto* read_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle);
+  std::vector<std::string> value_parts;
+  std::vector<ROCKSDB_NAMESPACE::Status> s =
+      fn_multi_get(*read_options, key_parts, &value_parts);
+
+  // free up allocated byte arrays
+  free_key_values(keys_to_free);
+
+  // prepare the results
+  const jclass jcls_ba = env->FindClass("[B");
+  jobjectArray jresults =
+      env->NewObjectArray(static_cast<jsize>(s.size()), jcls_ba, nullptr);
+  if (jresults == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  // add to the jresults
+  for (std::vector<ROCKSDB_NAMESPACE::Status>::size_type i = 0; i != s.size();
+       i++) {
+    if (s[i].ok()) {
+      jbyteArray jentry_value =
+          env->NewByteArray(static_cast<jsize>(value_parts[i].size()));
+      if (jentry_value == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return nullptr;
+      }
+
+      env->SetByteArrayRegion(
+          jentry_value, 0, static_cast<jsize>(value_parts[i].size()),
+          const_cast<jbyte*>(
+              reinterpret_cast<const jbyte*>(value_parts[i].c_str())));
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        env->DeleteLocalRef(jentry_value);
+        return nullptr;
+      }
+
+      env->SetObjectArrayElement(jresults, static_cast<jsize>(i), jentry_value);
+      env->DeleteLocalRef(jentry_value);
+    }
+  }
+
+  return jresults;
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    multiGet
+ * Signature: (JJ[[B[J)[[B
+ */
+jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B_3J(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jobjectArray jkey_parts, jlongArray jcolumn_family_handles) {
+  bool has_exception = false;
+  const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>
+      column_family_handles = txn_column_families_helper(
+          env, jcolumn_family_handles, &has_exception);
+  if (has_exception) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnMultiGet fn_multi_get = std::bind<std::vector<ROCKSDB_NAMESPACE::Status> (
+      ROCKSDB_NAMESPACE::Transaction::*)(
+      const ROCKSDB_NAMESPACE::ReadOptions&,
+      const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>&,
+      const std::vector<ROCKSDB_NAMESPACE::Slice>&, std::vector<std::string>*)>(
+      &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, std::placeholders::_1,
+      column_family_handles, std::placeholders::_2, std::placeholders::_3);
+  return txn_multi_get_helper(env, fn_multi_get, jread_options_handle,
+                              jkey_parts);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    multiGet
+ * Signature: (JJ[[B)[[B
+ */
+jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jobjectArray jkey_parts) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnMultiGet fn_multi_get = std::bind<std::vector<ROCKSDB_NAMESPACE::Status> (
+      ROCKSDB_NAMESPACE::Transaction::*)(
+      const ROCKSDB_NAMESPACE::ReadOptions&,
+      const std::vector<ROCKSDB_NAMESPACE::Slice>&, std::vector<std::string>*)>(
+      &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, std::placeholders::_1,
+      std::placeholders::_2, std::placeholders::_3);
+  return txn_multi_get_helper(env, fn_multi_get, jread_options_handle,
+                              jkey_parts);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getForUpdate
+ * Signature: (JJ[BIJZZ)[B
+ */
+jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIJZZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jbyteArray jkey, jint jkey_part_len, jlong jcolumn_family_handle,
+    jboolean jexclusive, jboolean jdo_validate) {
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnGet fn_get_for_update =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          const ROCKSDB_NAMESPACE::ReadOptions&,
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::Slice&, std::string*, bool, bool)>(
+          &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn,
+          std::placeholders::_1, column_family_handle, std::placeholders::_2,
+          std::placeholders::_3, jexclusive, jdo_validate);
+  return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey,
+                        jkey_part_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getForUpdate
+ * Signature: (JJ[BIZZ)[B
+ */
+jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIZZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jbyteArray jkey, jint jkey_part_len, jboolean jexclusive,
+    jboolean jdo_validate) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnGet fn_get_for_update =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          const ROCKSDB_NAMESPACE::ReadOptions&,
+          const ROCKSDB_NAMESPACE::Slice&, std::string*, bool, bool)>(
+          &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn,
+          std::placeholders::_1, std::placeholders::_2, std::placeholders::_3,
+          jexclusive, jdo_validate);
+  return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey,
+                        jkey_part_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    multiGetForUpdate
+ * Signature: (JJ[[B[J)[[B
+ */
+jobjectArray Java_org_rocksdb_Transaction_multiGetForUpdate__JJ_3_3B_3J(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jobjectArray jkey_parts, jlongArray jcolumn_family_handles) {
+  bool has_exception = false;
+  const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>
+      column_family_handles = txn_column_families_helper(
+          env, jcolumn_family_handles, &has_exception);
+  if (has_exception) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnMultiGet fn_multi_get_for_update = std::bind<std::vector<
+      ROCKSDB_NAMESPACE::Status> (ROCKSDB_NAMESPACE::Transaction::*)(
+      const ROCKSDB_NAMESPACE::ReadOptions&,
+      const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>&,
+      const std::vector<ROCKSDB_NAMESPACE::Slice>&, std::vector<std::string>*)>(
+      &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn,
+      std::placeholders::_1, column_family_handles, std::placeholders::_2,
+      std::placeholders::_3);
+  return txn_multi_get_helper(env, fn_multi_get_for_update,
+                              jread_options_handle, jkey_parts);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    multiGetForUpdate
+ * Signature: (JJ[[B)[[B
+ */
+jobjectArray Java_org_rocksdb_Transaction_multiGetForUpdate__JJ_3_3B(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jobjectArray jkey_parts) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnMultiGet fn_multi_get_for_update = std::bind<std::vector<
+      ROCKSDB_NAMESPACE::Status> (ROCKSDB_NAMESPACE::Transaction::*)(
+      const ROCKSDB_NAMESPACE::ReadOptions&,
+      const std::vector<ROCKSDB_NAMESPACE::Slice>&, std::vector<std::string>*)>(
+      &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn,
+      std::placeholders::_1, std::placeholders::_2, std::placeholders::_3);
+  return txn_multi_get_helper(env, fn_multi_get_for_update,
+                              jread_options_handle, jkey_parts);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getIterator
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_Transaction_getIterator__JJ(JNIEnv* /*env*/,
+                                                   jobject /*jobj*/,
+                                                   jlong jhandle,
+                                                   jlong jread_options_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* read_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle);
+  return GET_CPLUSPLUS_POINTER(txn->GetIterator(*read_options));
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getIterator
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_Transaction_getIterator__JJJ(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jread_options_handle, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* read_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  return GET_CPLUSPLUS_POINTER(
+      txn->GetIterator(*read_options, column_family_handle));
+}
+
+typedef std::function<ROCKSDB_NAMESPACE::Status(
+    const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>
+    FnWriteKV;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+void txn_write_kv_helper(JNIEnv* env, const FnWriteKV& fn_write_kv,
+                         const jbyteArray& jkey, const jint& jkey_part_len,
+                         const jbyteArray& jval, const jint& jval_len) {
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  jbyte* value = env->GetByteArrayElements(jval, nullptr);
+  if (value == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    return;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
+                                     jkey_part_len);
+  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char*>(value),
+                                       jval_len);
+
+  ROCKSDB_NAMESPACE::Status s = fn_write_kv(key_slice, value_slice);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jval, value, JNI_ABORT);
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.ok()) {
+    return;
+  }
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    put
+ * Signature: (J[BI[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_put__J_3BI_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len,
+    jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteKV fn_put =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&,
+          bool)>(&ROCKSDB_NAMESPACE::Transaction::Put, txn,
+                 column_family_handle, std::placeholders::_1,
+                 std::placeholders::_2, jassume_tracked);
+  txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    put
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_Transaction_put__J_3BI_3BI(JNIEnv* env, jobject /*jobj*/,
+                                                 jlong jhandle, jbyteArray jkey,
+                                                 jint jkey_part_len,
+                                                 jbyteArray jval,
+                                                 jint jval_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteKV fn_put =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>(
+          &ROCKSDB_NAMESPACE::Transaction::Put, txn, std::placeholders::_1,
+          std::placeholders::_2);
+  txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len);
+}
+
+typedef std::function<ROCKSDB_NAMESPACE::Status(
+    const ROCKSDB_NAMESPACE::SliceParts&, const ROCKSDB_NAMESPACE::SliceParts&)>
+    FnWriteKVParts;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+void txn_write_kv_parts_helper(JNIEnv* env,
+                               const FnWriteKVParts& fn_write_kv_parts,
+                               const jobjectArray& jkey_parts,
+                               const jint& jkey_parts_len,
+                               const jobjectArray& jvalue_parts,
+                               const jint& jvalue_parts_len) {
+#ifndef DEBUG
+  (void)jvalue_parts_len;
+#else
+  assert(jkey_parts_len == jvalue_parts_len);
+#endif
+
+  auto key_parts = std::vector<ROCKSDB_NAMESPACE::Slice>();
+  auto value_parts = std::vector<ROCKSDB_NAMESPACE::Slice>();
+  auto jparts_to_free = std::vector<std::tuple<jbyteArray, jbyte*, jobject>>();
+
+  // Since this is fundamentally a gather write at the RocksDB level,
+  // it seems wrong to refactor it by copying (gathering) keys and data here,
+  // in order to avoid the local reference limit.
+  // The user needs to be a aware that there is a limit to the number of parts
+  // which can be gathered.
+  if (env->EnsureLocalCapacity(jkey_parts_len + jvalue_parts_len) != 0) {
+    // no space for all the jobjects we store up
+    env->ExceptionClear();
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(
+        env, "Insufficient JNI local references for " +
+                 std::to_string(jkey_parts_len) + " key/value parts");
+    return;
+  }
+
+  // convert java key_parts/value_parts byte[][] to Slice(s)
+  for (jsize i = 0; i < jkey_parts_len; ++i) {
+    const jobject jobj_key_part = env->GetObjectArrayElement(jkey_parts, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      free_parts(env, jparts_to_free);
+      return;
+    }
+    const jobject jobj_value_part = env->GetObjectArrayElement(jvalue_parts, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jparts_to_free);
+      return;
+    }
+
+    const jbyteArray jba_key_part = reinterpret_cast<jbyteArray>(jobj_key_part);
+    const jsize jkey_part_len = env->GetArrayLength(jba_key_part);
+    jbyte* jkey_part = env->GetByteArrayElements(jba_key_part, nullptr);
+    if (jkey_part == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jobj_value_part);
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jparts_to_free);
+      return;
+    }
+
+    const jbyteArray jba_value_part =
+        reinterpret_cast<jbyteArray>(jobj_value_part);
+    const jsize jvalue_part_len = env->GetArrayLength(jba_value_part);
+    jbyte* jvalue_part = env->GetByteArrayElements(jba_value_part, nullptr);
+    if (jvalue_part == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jobj_value_part);
+      env->DeleteLocalRef(jobj_key_part);
+      env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT);
+      free_parts(env, jparts_to_free);
+      return;
+    }
+
+    jparts_to_free.push_back(
+        std::make_tuple(jba_key_part, jkey_part, jobj_key_part));
+    jparts_to_free.push_back(
+        std::make_tuple(jba_value_part, jvalue_part, jobj_value_part));
+
+    key_parts.push_back(ROCKSDB_NAMESPACE::Slice(
+        reinterpret_cast<char*>(jkey_part), jkey_part_len));
+    value_parts.push_back(ROCKSDB_NAMESPACE::Slice(
+        reinterpret_cast<char*>(jvalue_part), jvalue_part_len));
+  }
+
+  // call the write_multi function
+  ROCKSDB_NAMESPACE::Status s = fn_write_kv_parts(
+      ROCKSDB_NAMESPACE::SliceParts(key_parts.data(), (int)key_parts.size()),
+      ROCKSDB_NAMESPACE::SliceParts(value_parts.data(),
+                                    (int)value_parts.size()));
+
+  // cleanup temporary memory
+  free_parts(env, jparts_to_free);
+
+  // return
+  if (s.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    put
+ * Signature: (J[[BI[[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_put__J_3_3BI_3_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len,
+    jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteKVParts fn_put_parts =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::SliceParts&,
+          const ROCKSDB_NAMESPACE::SliceParts&, bool)>(
+          &ROCKSDB_NAMESPACE::Transaction::Put, txn, column_family_handle,
+          std::placeholders::_1, std::placeholders::_2, jassume_tracked);
+  txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len,
+                            jvalue_parts, jvalue_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    put
+ * Signature: (J[[BI[[BI)V
+ */
+void Java_org_rocksdb_Transaction_put__J_3_3BI_3_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteKVParts fn_put_parts = std::bind<ROCKSDB_NAMESPACE::Status (
+      ROCKSDB_NAMESPACE::Transaction::*)(const ROCKSDB_NAMESPACE::SliceParts&,
+                                         const ROCKSDB_NAMESPACE::SliceParts&)>(
+      &ROCKSDB_NAMESPACE::Transaction::Put, txn, std::placeholders::_1,
+      std::placeholders::_2);
+  txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len,
+                            jvalue_parts, jvalue_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    merge
+ * Signature: (J[BI[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_merge__J_3BI_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len,
+    jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteKV fn_merge =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&,
+          bool)>(&ROCKSDB_NAMESPACE::Transaction::Merge, txn,
+                 column_family_handle, std::placeholders::_1,
+                 std::placeholders::_2, jassume_tracked);
+  txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    merge
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_Transaction_merge__J_3BI_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteKV fn_merge =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>(
+          &ROCKSDB_NAMESPACE::Transaction::Merge, txn, std::placeholders::_1,
+          std::placeholders::_2);
+  txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len);
+}
+
+typedef std::function<ROCKSDB_NAMESPACE::Status(
+    const ROCKSDB_NAMESPACE::Slice&)>
+    FnWriteK;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+void txn_write_k_helper(JNIEnv* env, const FnWriteK& fn_write_k,
+                        const jbyteArray& jkey, const jint& jkey_part_len) {
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
+                                     jkey_part_len);
+
+  ROCKSDB_NAMESPACE::Status s = fn_write_k(key_slice);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.ok()) {
+    return;
+  }
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    delete
+ * Signature: (J[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_delete__J_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteK fn_delete =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::Slice&, bool)>(
+          &ROCKSDB_NAMESPACE::Transaction::Delete, txn, column_family_handle,
+          std::placeholders::_1, jassume_tracked);
+  txn_write_k_helper(env, fn_delete, jkey, jkey_part_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    delete
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_delete__J_3BI(JNIEnv* env, jobject /*jobj*/,
+                                                jlong jhandle, jbyteArray jkey,
+                                                jint jkey_part_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteK fn_delete = std::bind<ROCKSDB_NAMESPACE::Status (
+      ROCKSDB_NAMESPACE::Transaction::*)(const ROCKSDB_NAMESPACE::Slice&)>(
+      &ROCKSDB_NAMESPACE::Transaction::Delete, txn, std::placeholders::_1);
+  txn_write_k_helper(env, fn_delete, jkey, jkey_part_len);
+}
+
+typedef std::function<ROCKSDB_NAMESPACE::Status(
+    const ROCKSDB_NAMESPACE::SliceParts&)>
+    FnWriteKParts;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+void txn_write_k_parts_helper(JNIEnv* env,
+                              const FnWriteKParts& fn_write_k_parts,
+                              const jobjectArray& jkey_parts,
+                              const jint& jkey_parts_len) {
+  std::vector<ROCKSDB_NAMESPACE::Slice> key_parts;
+  std::vector<std::tuple<jbyteArray, jbyte*, jobject>> jkey_parts_to_free;
+
+  // convert java key_parts byte[][] to Slice(s)
+  for (jint i = 0; i < jkey_parts_len; ++i) {
+    const jobject jobj_key_part = env->GetObjectArrayElement(jkey_parts, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      free_parts(env, jkey_parts_to_free);
+      return;
+    }
+
+    const jbyteArray jba_key_part = reinterpret_cast<jbyteArray>(jobj_key_part);
+    const jsize jkey_part_len = env->GetArrayLength(jba_key_part);
+    jbyte* jkey_part = env->GetByteArrayElements(jba_key_part, nullptr);
+    if (jkey_part == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jkey_parts_to_free);
+      return;
+    }
+
+    jkey_parts_to_free.push_back(std::tuple<jbyteArray, jbyte*, jobject>(
+        jba_key_part, jkey_part, jobj_key_part));
+
+    key_parts.push_back(ROCKSDB_NAMESPACE::Slice(
+        reinterpret_cast<char*>(jkey_part), jkey_part_len));
+  }
+
+  // call the write_multi function
+  ROCKSDB_NAMESPACE::Status s = fn_write_k_parts(
+      ROCKSDB_NAMESPACE::SliceParts(key_parts.data(), (int)key_parts.size()));
+
+  // cleanup temporary memory
+  free_parts(env, jkey_parts_to_free);
+
+  // return
+  if (s.ok()) {
+    return;
+  }
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    delete
+ * Signature: (J[[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_delete__J_3_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jlong jcolumn_family_handle,
+    jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteKParts fn_delete_parts =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::SliceParts&, bool)>(
+          &ROCKSDB_NAMESPACE::Transaction::Delete, txn, column_family_handle,
+          std::placeholders::_1, jassume_tracked);
+  txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    delete
+ * Signature: (J[[BI)V
+ */
+void Java_org_rocksdb_Transaction_delete__J_3_3BI(JNIEnv* env, jobject /*jobj*/,
+                                                  jlong jhandle,
+                                                  jobjectArray jkey_parts,
+                                                  jint jkey_parts_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteKParts fn_delete_parts = std::bind<ROCKSDB_NAMESPACE::Status (
+      ROCKSDB_NAMESPACE::Transaction::*)(const ROCKSDB_NAMESPACE::SliceParts&)>(
+      &ROCKSDB_NAMESPACE::Transaction::Delete, txn, std::placeholders::_1);
+  txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    singleDelete
+ * Signature: (J[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_singleDelete__J_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteK fn_single_delete =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::Slice&, bool)>(
+          &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn,
+          column_family_handle, std::placeholders::_1, jassume_tracked);
+  txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    singleDelete
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_singleDelete__J_3BI(JNIEnv* env,
+                                                      jobject /*jobj*/,
+                                                      jlong jhandle,
+                                                      jbyteArray jkey,
+                                                      jint jkey_part_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteK fn_single_delete = std::bind<ROCKSDB_NAMESPACE::Status (
+      ROCKSDB_NAMESPACE::Transaction::*)(const ROCKSDB_NAMESPACE::Slice&)>(
+      &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn,
+      std::placeholders::_1);
+  txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    singleDelete
+ * Signature: (J[[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_singleDelete__J_3_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jlong jcolumn_family_handle,
+    jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteKParts fn_single_delete_parts =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::SliceParts&, bool)>(
+          &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn,
+          column_family_handle, std::placeholders::_1, jassume_tracked);
+  txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts,
+                           jkey_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    singleDelete
+ * Signature: (J[[BI)V
+ */
+void Java_org_rocksdb_Transaction_singleDelete__J_3_3BI(JNIEnv* env,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle,
+                                                        jobjectArray jkey_parts,
+                                                        jint jkey_parts_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteKParts fn_single_delete_parts = std::bind<ROCKSDB_NAMESPACE::Status (
+      ROCKSDB_NAMESPACE::Transaction::*)(const ROCKSDB_NAMESPACE::SliceParts&)>(
+      &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn,
+      std::placeholders::_1);
+  txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts,
+                           jkey_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    putUntracked
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len,
+    jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteKV fn_put_untracked =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>(
+          &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn,
+          column_family_handle, std::placeholders::_1, std::placeholders::_2);
+  txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval,
+                      jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    putUntracked
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteKV fn_put_untracked =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>(
+          &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn,
+          std::placeholders::_1, std::placeholders::_2);
+  txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval,
+                      jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    putUntracked
+ * Signature: (J[[BI[[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_putUntracked__J_3_3BI_3_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len,
+    jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteKVParts fn_put_parts_untracked = std::bind<ROCKSDB_NAMESPACE::Status (
+      ROCKSDB_NAMESPACE::Transaction::*)(ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+                                         const ROCKSDB_NAMESPACE::SliceParts&,
+                                         const ROCKSDB_NAMESPACE::SliceParts&)>(
+      &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, column_family_handle,
+      std::placeholders::_1, std::placeholders::_2);
+  txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts,
+                            jkey_parts_len, jvalue_parts, jvalue_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    putUntracked
+ * Signature: (J[[BI[[BI)V
+ */
+void Java_org_rocksdb_Transaction_putUntracked__J_3_3BI_3_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteKVParts fn_put_parts_untracked = std::bind<ROCKSDB_NAMESPACE::Status (
+      ROCKSDB_NAMESPACE::Transaction::*)(const ROCKSDB_NAMESPACE::SliceParts&,
+                                         const ROCKSDB_NAMESPACE::SliceParts&)>(
+      &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, std::placeholders::_1,
+      std::placeholders::_2);
+  txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts,
+                            jkey_parts_len, jvalue_parts, jvalue_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    mergeUntracked
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_mergeUntracked__J_3BI_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len,
+    jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteKV fn_merge_untracked =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>(
+          &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn,
+          column_family_handle, std::placeholders::_1, std::placeholders::_2);
+  txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval,
+                      jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    mergeUntracked
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_Transaction_mergeUntracked__J_3BI_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteKV fn_merge_untracked =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>(
+          &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn,
+          std::placeholders::_1, std::placeholders::_2);
+  txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval,
+                      jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    deleteUntracked
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_deleteUntracked__J_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteK fn_delete_untracked = std::bind<ROCKSDB_NAMESPACE::Status (
+      ROCKSDB_NAMESPACE::Transaction::*)(ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+                                         const ROCKSDB_NAMESPACE::Slice&)>(
+      &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn,
+      column_family_handle, std::placeholders::_1);
+  txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    deleteUntracked
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_deleteUntracked__J_3BI(JNIEnv* env,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle,
+                                                         jbyteArray jkey,
+                                                         jint jkey_part_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteK fn_delete_untracked = std::bind<ROCKSDB_NAMESPACE::Status (
+      ROCKSDB_NAMESPACE::Transaction::*)(const ROCKSDB_NAMESPACE::Slice&)>(
+      &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn,
+      std::placeholders::_1);
+  txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    deleteUntracked
+ * Signature: (J[[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  FnWriteKParts fn_delete_untracked_parts =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          ROCKSDB_NAMESPACE::ColumnFamilyHandle*,
+          const ROCKSDB_NAMESPACE::SliceParts&)>(
+          &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn,
+          column_family_handle, std::placeholders::_1);
+  txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts,
+                           jkey_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    deleteUntracked
+ * Signature: (J[[BI)V
+ */
+void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  FnWriteKParts fn_delete_untracked_parts =
+      std::bind<ROCKSDB_NAMESPACE::Status (ROCKSDB_NAMESPACE::Transaction::*)(
+          const ROCKSDB_NAMESPACE::SliceParts&)>(
+          &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn,
+          std::placeholders::_1);
+  txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts,
+                           jkey_parts_len);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    putLogData
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_putLogData(JNIEnv* env, jobject /*jobj*/,
+                                             jlong jhandle, jbyteArray jkey,
+                                             jint jkey_part_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
+                                     jkey_part_len);
+  txn->PutLogData(key_slice);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    disableIndexing
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_disableIndexing(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  txn->DisableIndexing();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    enableIndexing
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_enableIndexing(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  txn->EnableIndexing();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getNumKeys
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getNumKeys(JNIEnv* /*env*/, jobject /*jobj*/,
+                                              jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return txn->GetNumKeys();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getNumPuts
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getNumPuts(JNIEnv* /*env*/, jobject /*jobj*/,
+                                              jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return txn->GetNumPuts();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getNumDeletes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getNumDeletes(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return txn->GetNumDeletes();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getNumMerges
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getNumMerges(JNIEnv* /*env*/,
+                                                jobject /*jobj*/,
+                                                jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return txn->GetNumMerges();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getElapsedTime
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getElapsedTime(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return txn->GetElapsedTime();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getWriteBatch
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getWriteBatch(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return GET_CPLUSPLUS_POINTER(txn->GetWriteBatch());
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    setLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Transaction_setLockTimeout(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle,
+                                                 jlong jlock_timeout) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  txn->SetLockTimeout(jlock_timeout);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getWriteOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getWriteOptions(JNIEnv* /*env*/,
+                                                   jobject /*jobj*/,
+                                                   jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return GET_CPLUSPLUS_POINTER(txn->GetWriteOptions());
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    setWriteOptions
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Transaction_setWriteOptions(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle,
+                                                  jlong jwrite_options_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  txn->SetWriteOptions(*write_options);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    undo
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(
+          jcolumn_family_handle);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
+                                     jkey_part_len);
+  txn->UndoGetForUpdate(column_family_handle, key_slice);
+
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    undoGetForUpdate
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BI(JNIEnv* env,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle,
+                                                          jbyteArray jkey,
+                                                          jint jkey_part_len) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
+                                     jkey_part_len);
+  txn->UndoGetForUpdate(key_slice);
+
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    rebuildFromWriteBatch
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Transaction_rebuildFromWriteBatch(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jwrite_batch_handle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* write_batch =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwrite_batch_handle);
+  ROCKSDB_NAMESPACE::Status s = txn->RebuildFromWriteBatch(write_batch);
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getCommitTimeWriteBatch
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getCommitTimeWriteBatch(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return GET_CPLUSPLUS_POINTER(txn->GetCommitTimeWriteBatch());
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    setLogNumber
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Transaction_setLogNumber(JNIEnv* /*env*/,
+                                               jobject /*jobj*/, jlong jhandle,
+                                               jlong jlog_number) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  txn->SetLogNumber(jlog_number);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getLogNumber
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getLogNumber(JNIEnv* /*env*/,
+                                                jobject /*jobj*/,
+                                                jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return txn->GetLogNumber();
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    setName
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_Transaction_setName(JNIEnv* env, jobject /*jobj*/,
+                                          jlong jhandle, jstring jname) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  const char* name = env->GetStringUTFChars(jname, nullptr);
+  if (name == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::Status s = txn->SetName(name);
+
+  env->ReleaseStringUTFChars(jname, name);
+
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getName
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_Transaction_getName(JNIEnv* env, jobject /*jobj*/,
+                                             jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  ROCKSDB_NAMESPACE::TransactionName name = txn->GetName();
+  return env->NewStringUTF(name.data());
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getID
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getID(JNIEnv* /*env*/, jobject /*jobj*/,
+                                         jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  ROCKSDB_NAMESPACE::TransactionID id = txn->GetID();
+  return static_cast<jlong>(id);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    isDeadlockDetect
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Transaction_isDeadlockDetect(JNIEnv* /*env*/,
+                                                       jobject /*jobj*/,
+                                                       jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  return static_cast<jboolean>(txn->IsDeadlockDetect());
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getWaitingTxns
+ * Signature: (J)Lorg/rocksdb/Transaction/WaitingTransactions;
+ */
+jobject Java_org_rocksdb_Transaction_getWaitingTxns(JNIEnv* env,
+                                                    jobject jtransaction_obj,
+                                                    jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  uint32_t column_family_id;
+  std::string key;
+  std::vector<ROCKSDB_NAMESPACE::TransactionID> waiting_txns =
+      txn->GetWaitingTxns(&column_family_id, &key);
+  jobject jwaiting_txns =
+      ROCKSDB_NAMESPACE::TransactionJni::newWaitingTransactions(
+          env, jtransaction_obj, column_family_id, key, waiting_txns);
+  return jwaiting_txns;
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getState
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Transaction_getState(JNIEnv* /*env*/, jobject /*jobj*/,
+                                            jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  ROCKSDB_NAMESPACE::Transaction::TransactionState txn_status = txn->GetState();
+  switch (txn_status) {
+    case ROCKSDB_NAMESPACE::Transaction::TransactionState::STARTED:
+      return 0x0;
+
+    case ROCKSDB_NAMESPACE::Transaction::TransactionState::AWAITING_PREPARE:
+      return 0x1;
+
+    case ROCKSDB_NAMESPACE::Transaction::TransactionState::PREPARED:
+      return 0x2;
+
+    case ROCKSDB_NAMESPACE::Transaction::TransactionState::AWAITING_COMMIT:
+      return 0x3;
+
+    case ROCKSDB_NAMESPACE::Transaction::TransactionState::COMMITTED:
+      return 0x4;
+
+    case ROCKSDB_NAMESPACE::Transaction::TransactionState::AWAITING_ROLLBACK:
+      return 0x5;
+
+    case ROCKSDB_NAMESPACE::Transaction::TransactionState::ROLLEDBACK:
+      return 0x6;
+
+    case ROCKSDB_NAMESPACE::Transaction::TransactionState::LOCKS_STOLEN:
+      return 0x7;
+  }
+
+  assert(false);
+  return static_cast<jbyte>(-1);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    getId
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getId(JNIEnv* /*env*/, jobject /*jobj*/,
+                                         jlong jhandle) {
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  uint64_t id = txn->GetId();
+  return static_cast<jlong>(id);
+}
+
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_disposeInternal(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+}
diff --git a/src/rocksdb/java/rocksjni/transaction_db.cc b/src/rocksdb/java/rocksjni/transaction_db.cc
new file mode 100644
index 000000000..0adf85606
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/transaction_db.cc
@@ -0,0 +1,451 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for ROCKSDB_NAMESPACE::TransactionDB.
+
+#include "rocksdb/utilities/transaction_db.h"
+
+#include <jni.h>
+
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "include/org_rocksdb_TransactionDB.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    open
+ * Signature: (JJLjava/lang/String;)J
+ */
+jlong Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2(
+    JNIEnv* env, jclass, jlong joptions_handle, jlong jtxn_db_options_handle,
+    jstring jdb_path) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(joptions_handle);
+  auto* txn_db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(
+          jtxn_db_options_handle);
+  ROCKSDB_NAMESPACE::TransactionDB* tdb = nullptr;
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::TransactionDB::Open(
+      *options, *txn_db_options, db_path, &tdb);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  if (s.ok()) {
+    return GET_CPLUSPLUS_POINTER(tdb);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return 0;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    open
+ * Signature: (JJLjava/lang/String;[[B[J)[J
+ */
+jlongArray Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2_3_3B_3J(
+    JNIEnv* env, jclass, jlong jdb_options_handle, jlong jtxn_db_options_handle,
+    jstring jdb_path, jobjectArray jcolumn_names,
+    jlongArray jcolumn_options_handles) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  const jsize len_cols = env->GetArrayLength(jcolumn_names);
+  jlong* jco = env->GetLongArrayElements(jcolumn_options_handles, nullptr);
+  if (jco == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseStringUTFChars(jdb_path, db_path);
+    return nullptr;
+  }
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < len_cols; i++) {
+    const jobject jcn = env->GetObjectArrayElement(jcolumn_names, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+      env->ReleaseStringUTFChars(jdb_path, db_path);
+      return nullptr;
+    }
+    const jbyteArray jcn_ba = reinterpret_cast<jbyteArray>(jcn);
+    jbyte* jcf_name = env->GetByteArrayElements(jcn_ba, nullptr);
+    if (jcf_name == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jcn);
+      env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+      env->ReleaseStringUTFChars(jdb_path, db_path);
+      return nullptr;
+    }
+
+    const int jcf_name_len = env->GetArrayLength(jcn_ba);
+    const std::string cf_name(reinterpret_cast<char*>(jcf_name), jcf_name_len);
+    const ROCKSDB_NAMESPACE::ColumnFamilyOptions* cf_options =
+        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jco[i]);
+    column_families.push_back(
+        ROCKSDB_NAMESPACE::ColumnFamilyDescriptor(cf_name, *cf_options));
+
+    env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT);
+    env->DeleteLocalRef(jcn);
+  }
+  env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+
+  auto* db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdb_options_handle);
+  auto* txn_db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(
+          jtxn_db_options_handle);
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> handles;
+  ROCKSDB_NAMESPACE::TransactionDB* tdb = nullptr;
+  const ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::TransactionDB::Open(
+      *db_options, *txn_db_options, db_path, column_families, &handles, &tdb);
+
+  // check if open operation was successful
+  if (s.ok()) {
+    const jsize resultsLen = 1 + len_cols;  // db handle + column family handles
+    std::unique_ptr<jlong[]> results =
+        std::unique_ptr<jlong[]>(new jlong[resultsLen]);
+    results[0] = GET_CPLUSPLUS_POINTER(tdb);
+    for (int i = 1; i <= len_cols; i++) {
+      results[i] = GET_CPLUSPLUS_POINTER(handles[i - 1]);
+    }
+
+    jlongArray jresults = env->NewLongArray(resultsLen);
+    if (jresults == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+    env->SetLongArrayRegion(jresults, 0, resultsLen, results.get());
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jresults);
+      return nullptr;
+    }
+    return jresults;
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionDB_disposeInternal(JNIEnv*, jobject,
+                                                    jlong jhandle) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  assert(txn_db != nullptr);
+  delete txn_db;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    closeDatabase
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionDB_closeDatabase(JNIEnv* env, jclass,
+                                                  jlong jhandle) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  assert(txn_db != nullptr);
+  ROCKSDB_NAMESPACE::Status s = txn_db->Close();
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    beginTransaction
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  ROCKSDB_NAMESPACE::Transaction* txn =
+      txn_db->BeginTransaction(*write_options);
+  return GET_CPLUSPLUS_POINTER(txn);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    beginTransaction
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong jtxn_options_handle) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* txn_options = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(
+      jtxn_options_handle);
+  ROCKSDB_NAMESPACE::Transaction* txn =
+      txn_db->BeginTransaction(*write_options, *txn_options);
+  return GET_CPLUSPLUS_POINTER(txn);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    beginTransaction_withOld
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong jold_txn_handle) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* old_txn =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jold_txn_handle);
+  ROCKSDB_NAMESPACE::TransactionOptions txn_options;
+  ROCKSDB_NAMESPACE::Transaction* txn =
+      txn_db->BeginTransaction(*write_options, txn_options, old_txn);
+
+  // RocksJava relies on the assumption that
+  // we do not allocate a new Transaction object
+  // when providing an old_txn
+  assert(txn == old_txn);
+
+  return GET_CPLUSPLUS_POINTER(txn);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    beginTransaction_withOld
+ * Signature: (JJJJ)J
+ */
+jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong jtxn_options_handle, jlong jold_txn_handle) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteOptions*>(jwrite_options_handle);
+  auto* txn_options = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(
+      jtxn_options_handle);
+  auto* old_txn =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jold_txn_handle);
+  ROCKSDB_NAMESPACE::Transaction* txn =
+      txn_db->BeginTransaction(*write_options, *txn_options, old_txn);
+
+  // RocksJava relies on the assumption that
+  // we do not allocate a new Transaction object
+  // when providing an old_txn
+  assert(txn == old_txn);
+
+  return GET_CPLUSPLUS_POINTER(txn);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    getTransactionByName
+ * Signature: (JLjava/lang/String;)J
+ */
+jlong Java_org_rocksdb_TransactionDB_getTransactionByName(JNIEnv* env, jobject,
+                                                          jlong jhandle,
+                                                          jstring jname) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  const char* name = env->GetStringUTFChars(jname, nullptr);
+  if (name == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  ROCKSDB_NAMESPACE::Transaction* txn = txn_db->GetTransactionByName(name);
+  env->ReleaseStringUTFChars(jname, name);
+  return GET_CPLUSPLUS_POINTER(txn);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    getAllPreparedTransactions
+ * Signature: (J)[J
+ */
+jlongArray Java_org_rocksdb_TransactionDB_getAllPreparedTransactions(
+    JNIEnv* env, jobject, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  std::vector<ROCKSDB_NAMESPACE::Transaction*> txns;
+  txn_db->GetAllPreparedTransactions(&txns);
+
+  const size_t size = txns.size();
+  assert(size < UINT32_MAX);  // does it fit in a jint?
+
+  const jsize len = static_cast<jsize>(size);
+  std::vector<jlong> tmp(len);
+  for (jsize i = 0; i < len; ++i) {
+    tmp[i] = GET_CPLUSPLUS_POINTER(txns[i]);
+  }
+
+  jlongArray jtxns = env->NewLongArray(len);
+  if (jtxns == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  env->SetLongArrayRegion(jtxns, 0, len, tmp.data());
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jtxns);
+    return nullptr;
+  }
+
+  return jtxns;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    getLockStatusData
+ * Signature: (J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_TransactionDB_getLockStatusData(JNIEnv* env, jobject,
+                                                         jlong jhandle) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  const std::unordered_multimap<uint32_t, ROCKSDB_NAMESPACE::KeyLockInfo>
+      lock_status_data = txn_db->GetLockStatusData();
+  const jobject jlock_status_data = ROCKSDB_NAMESPACE::HashMapJni::construct(
+      env, static_cast<uint32_t>(lock_status_data.size()));
+  if (jlock_status_data == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const ROCKSDB_NAMESPACE::HashMapJni::FnMapKV<
+      const int32_t, const ROCKSDB_NAMESPACE::KeyLockInfo, jobject, jobject>
+      fn_map_kv =
+          [env](const std::pair<const int32_t,
+                                const ROCKSDB_NAMESPACE::KeyLockInfo>& pair) {
+            const jobject jlong_column_family_id =
+                ROCKSDB_NAMESPACE::LongJni::valueOf(env, pair.first);
+            if (jlong_column_family_id == nullptr) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+            const jobject jkey_lock_info =
+                ROCKSDB_NAMESPACE::KeyLockInfoJni::construct(env, pair.second);
+            if (jkey_lock_info == nullptr) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+            return std::unique_ptr<std::pair<jobject, jobject>>(
+                new std::pair<jobject, jobject>(jlong_column_family_id,
+                                                jkey_lock_info));
+          };
+
+  if (!ROCKSDB_NAMESPACE::HashMapJni::putAll(
+          env, jlock_status_data, lock_status_data.begin(),
+          lock_status_data.end(), fn_map_kv)) {
+    // exception occcurred
+    return nullptr;
+  }
+
+  return jlock_status_data;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    getDeadlockInfoBuffer
+ * Signature: (J)[Lorg/rocksdb/TransactionDB/DeadlockPath;
+ */
+jobjectArray Java_org_rocksdb_TransactionDB_getDeadlockInfoBuffer(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  const std::vector<ROCKSDB_NAMESPACE::DeadlockPath> deadlock_info_buffer =
+      txn_db->GetDeadlockInfoBuffer();
+
+  const jsize deadlock_info_buffer_len =
+      static_cast<jsize>(deadlock_info_buffer.size());
+  jobjectArray jdeadlock_info_buffer = env->NewObjectArray(
+      deadlock_info_buffer_len,
+      ROCKSDB_NAMESPACE::DeadlockPathJni::getJClass(env), nullptr);
+  if (jdeadlock_info_buffer == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  jsize jdeadlock_info_buffer_offset = 0;
+
+  auto buf_end = deadlock_info_buffer.end();
+  for (auto buf_it = deadlock_info_buffer.begin(); buf_it != buf_end;
+       ++buf_it) {
+    const ROCKSDB_NAMESPACE::DeadlockPath deadlock_path = *buf_it;
+    const std::vector<ROCKSDB_NAMESPACE::DeadlockInfo> deadlock_infos =
+        deadlock_path.path;
+    const jsize deadlock_infos_len =
+        static_cast<jsize>(deadlock_info_buffer.size());
+    jobjectArray jdeadlock_infos = env->NewObjectArray(
+        deadlock_infos_len, ROCKSDB_NAMESPACE::DeadlockInfoJni::getJClass(env),
+        nullptr);
+    if (jdeadlock_infos == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jdeadlock_info_buffer);
+      return nullptr;
+    }
+    jsize jdeadlock_infos_offset = 0;
+
+    auto infos_end = deadlock_infos.end();
+    for (auto infos_it = deadlock_infos.begin(); infos_it != infos_end;
+         ++infos_it) {
+      const ROCKSDB_NAMESPACE::DeadlockInfo deadlock_info = *infos_it;
+      const jobject jdeadlock_info =
+          ROCKSDB_NAMESPACE::TransactionDBJni::newDeadlockInfo(
+              env, jobj, deadlock_info.m_txn_id, deadlock_info.m_cf_id,
+              deadlock_info.m_waiting_key, deadlock_info.m_exclusive);
+      if (jdeadlock_info == nullptr) {
+        // exception occcurred
+        env->DeleteLocalRef(jdeadlock_info_buffer);
+        return nullptr;
+      }
+      env->SetObjectArrayElement(jdeadlock_infos, jdeadlock_infos_offset++,
+                                 jdeadlock_info);
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException or
+        // ArrayStoreException
+        env->DeleteLocalRef(jdeadlock_info);
+        env->DeleteLocalRef(jdeadlock_info_buffer);
+        return nullptr;
+      }
+    }
+
+    const jobject jdeadlock_path =
+        ROCKSDB_NAMESPACE::DeadlockPathJni::construct(
+            env, jdeadlock_infos, deadlock_path.limit_exceeded);
+    if (jdeadlock_path == nullptr) {
+      // exception occcurred
+      env->DeleteLocalRef(jdeadlock_info_buffer);
+      return nullptr;
+    }
+    env->SetObjectArrayElement(jdeadlock_info_buffer,
+                               jdeadlock_info_buffer_offset++, jdeadlock_path);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException or ArrayStoreException
+      env->DeleteLocalRef(jdeadlock_path);
+      env->DeleteLocalRef(jdeadlock_info_buffer);
+      return nullptr;
+    }
+  }
+
+  return jdeadlock_info_buffer;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDB
+ * Method:    setDeadlockInfoBufferSize
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_TransactionDB_setDeadlockInfoBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jint jdeadlock_info_buffer_size) {
+  auto* txn_db = reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDB*>(jhandle);
+  txn_db->SetDeadlockInfoBufferSize(jdeadlock_info_buffer_size);
+}
diff --git a/src/rocksdb/java/rocksjni/transaction_db_options.cc b/src/rocksdb/java/rocksjni/transaction_db_options.cc
new file mode 100644
index 000000000..4cf27121e
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/transaction_db_options.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for ROCKSDB_NAMESPACE::TransactionDBOptions.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_TransactionDBOptions.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    newTransactionDBOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_newTransactionDBOptions(
+    JNIEnv* /*env*/, jclass /*jcls*/) {
+  ROCKSDB_NAMESPACE::TransactionDBOptions* opts =
+      new ROCKSDB_NAMESPACE::TransactionDBOptions();
+  return GET_CPLUSPLUS_POINTER(opts);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    getMaxNumLocks
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getMaxNumLocks(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  return opts->max_num_locks;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    setMaxNumLocks
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setMaxNumLocks(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jlong jmax_num_locks) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  opts->max_num_locks = jmax_num_locks;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    getNumStripes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getNumStripes(JNIEnv* /*env*/,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  return opts->num_stripes;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    setNumStripes
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setNumStripes(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle,
+                                                         jlong jnum_stripes) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  opts->num_stripes = jnum_stripes;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    getTransactionLockTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getTransactionLockTimeout(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  return opts->transaction_lock_timeout;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    setTransactionLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setTransactionLockTimeout(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jtransaction_lock_timeout) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  opts->transaction_lock_timeout = jtransaction_lock_timeout;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    getDefaultLockTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getDefaultLockTimeout(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  return opts->default_lock_timeout;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    setDefaultLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setDefaultLockTimeout(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jdefault_lock_timeout) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  opts->default_lock_timeout = jdefault_lock_timeout;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    getWritePolicy
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_TransactionDBOptions_getWritePolicy(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  return ROCKSDB_NAMESPACE::TxnDBWritePolicyJni::toJavaTxnDBWritePolicy(
+      opts->write_policy);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    setWritePolicy
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setWritePolicy(JNIEnv* /*env*/,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle,
+                                                          jbyte jwrite_policy) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+  opts->write_policy =
+      ROCKSDB_NAMESPACE::TxnDBWritePolicyJni::toCppTxnDBWritePolicy(
+          jwrite_policy);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionDBOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_disposeInternal(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::TransactionDBOptions*>(jhandle);
+}
diff --git a/src/rocksdb/java/rocksjni/transaction_log.cc b/src/rocksdb/java/rocksjni/transaction_log.cc
new file mode 100644
index 000000000..97c3bb301
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/transaction_log.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::Iterator methods from Java side.
+
+#include "rocksdb/transaction_log.h"
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "include/org_rocksdb_TransactionLogIterator.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionLogIterator_disposeInternal(JNIEnv* /*env*/,
+                                                             jobject /*jobj*/,
+                                                             jlong handle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::TransactionLogIterator*>(handle);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    isValid
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_TransactionLogIterator_isValid(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong handle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::TransactionLogIterator*>(handle)
+      ->Valid();
+}
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    next
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionLogIterator_next(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::TransactionLogIterator*>(handle)->Next();
+}
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    status
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionLogIterator_status(JNIEnv* env,
+                                                    jobject /*jobj*/,
+                                                    jlong handle) {
+  ROCKSDB_NAMESPACE::Status s =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionLogIterator*>(handle)
+          ->status();
+  if (!s.ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    getBatch
+ * Signature: (J)Lorg/rocksdb/TransactionLogIterator$BatchResult
+ */
+jobject Java_org_rocksdb_TransactionLogIterator_getBatch(JNIEnv* env,
+                                                         jobject /*jobj*/,
+                                                         jlong handle) {
+  ROCKSDB_NAMESPACE::BatchResult batch_result =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionLogIterator*>(handle)
+          ->GetBatch();
+  return ROCKSDB_NAMESPACE::BatchResultJni::construct(env, batch_result);
+}
diff --git a/src/rocksdb/java/rocksjni/transaction_notifier.cc b/src/rocksdb/java/rocksjni/transaction_notifier.cc
new file mode 100644
index 000000000..cefeb648a
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/transaction_notifier.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for ROCKSDB_NAMESPACE::TransactionNotifier.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_AbstractTransactionNotifier.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/transaction_notifier_jnicallback.h"
+
+/*
+ * Class:     org_rocksdb_AbstractTransactionNotifier
+ * Method:    createNewTransactionNotifier
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_AbstractTransactionNotifier_createNewTransactionNotifier(
+    JNIEnv* env, jobject jobj) {
+  auto* transaction_notifier =
+      new ROCKSDB_NAMESPACE::TransactionNotifierJniCallback(env, jobj);
+  auto* sptr_transaction_notifier =
+      new std::shared_ptr<ROCKSDB_NAMESPACE::TransactionNotifierJniCallback>(
+          transaction_notifier);
+  return GET_CPLUSPLUS_POINTER(sptr_transaction_notifier);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractTransactionNotifier
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractTransactionNotifier_disposeInternal(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  // TODO(AR) refactor to use JniCallback::JniCallback
+  // when https://github.com/facebook/rocksdb/pull/1241/ is merged
+  std::shared_ptr<ROCKSDB_NAMESPACE::TransactionNotifierJniCallback>* handle =
+      reinterpret_cast<
+          std::shared_ptr<ROCKSDB_NAMESPACE::TransactionNotifierJniCallback>*>(
+          jhandle);
+  delete handle;
+}
diff --git a/src/rocksdb/java/rocksjni/transaction_notifier_jnicallback.cc b/src/rocksdb/java/rocksjni/transaction_notifier_jnicallback.cc
new file mode 100644
index 000000000..26761cabd
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/transaction_notifier_jnicallback.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::TransactionNotifier.
+
+#include "rocksjni/transaction_notifier_jnicallback.h"
+
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TransactionNotifierJniCallback::TransactionNotifierJniCallback(
+    JNIEnv* env, jobject jtransaction_notifier)
+    : JniCallback(env, jtransaction_notifier) {
+  // we cache the method id for the JNI callback
+  m_jsnapshot_created_methodID =
+      AbstractTransactionNotifierJni::getSnapshotCreatedMethodId(env);
+}
+
+void TransactionNotifierJniCallback::SnapshotCreated(
+    const Snapshot* newSnapshot) {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  env->CallVoidMethod(m_jcallback_obj, m_jsnapshot_created_methodID,
+                      GET_CPLUSPLUS_POINTER(newSnapshot));
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallVoidMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  releaseJniEnv(attached_thread);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/transaction_notifier_jnicallback.h b/src/rocksdb/java/rocksjni/transaction_notifier_jnicallback.h
new file mode 100644
index 000000000..089a5ee4a
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/transaction_notifier_jnicallback.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::TransactionNotifier.
+
+#ifndef JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include "rocksdb/utilities/transaction.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/**
+ * This class acts as a bridge between C++
+ * and Java. The methods in this class will be
+ * called back from the RocksDB TransactionDB or OptimisticTransactionDB (C++),
+ * we then callback to the appropriate Java method
+ * this enables TransactionNotifier to be implemented in Java.
+ *
+ * Unlike RocksJava's Comparator JNI Callback, we do not attempt
+ * to reduce Java object allocations by caching the Snapshot object
+ * presented to the callback. This could be revisited in future
+ * if performance is lacking.
+ */
+class TransactionNotifierJniCallback : public JniCallback,
+                                       public TransactionNotifier {
+ public:
+  TransactionNotifierJniCallback(JNIEnv* env, jobject jtransaction_notifier);
+  virtual void SnapshotCreated(const Snapshot* newSnapshot);
+
+ private:
+  jmethodID m_jsnapshot_created_methodID;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/transaction_options.cc b/src/rocksdb/java/rocksjni/transaction_options.cc
new file mode 100644
index 000000000..dcf363e14
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/transaction_options.cc
@@ -0,0 +1,191 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for ROCKSDB_NAMESPACE::TransactionOptions.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_TransactionOptions.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    newTransactionOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_TransactionOptions_newTransactionOptions(
+    JNIEnv* /*env*/, jclass /*jcls*/) {
+  auto* opts = new ROCKSDB_NAMESPACE::TransactionOptions();
+  return GET_CPLUSPLUS_POINTER(opts);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    isSetSnapshot
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_TransactionOptions_isSetSnapshot(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  return opts->set_snapshot;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setSetSnapshot
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setSetSnapshot(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean jset_snapshot) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  opts->set_snapshot = jset_snapshot;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    isDeadlockDetect
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_TransactionOptions_isDeadlockDetect(JNIEnv* /*env*/,
+                                                              jobject /*jobj*/,
+                                                              jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  return opts->deadlock_detect;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setDeadlockDetect
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setDeadlockDetect(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jboolean jdeadlock_detect) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  opts->deadlock_detect = jdeadlock_detect;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    getLockTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getLockTimeout(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  return opts->lock_timeout;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setLockTimeout(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle,
+                                                        jlong jlock_timeout) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  opts->lock_timeout = jlock_timeout;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    getExpiration
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getExpiration(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  return opts->expiration;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setExpiration
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setExpiration(JNIEnv* /*env*/,
+                                                       jobject /*jobj*/,
+                                                       jlong jhandle,
+                                                       jlong jexpiration) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  opts->expiration = jexpiration;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    getDeadlockDetectDepth
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getDeadlockDetectDepth(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  return opts->deadlock_detect_depth;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setDeadlockDetectDepth
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setDeadlockDetectDepth(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jdeadlock_detect_depth) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  opts->deadlock_detect_depth = jdeadlock_detect_depth;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    getMaxWriteBatchSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getMaxWriteBatchSize(JNIEnv* /*env*/,
+                                                               jobject /*jobj*/,
+                                                               jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  return opts->max_write_batch_size;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setMaxWriteBatchSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setMaxWriteBatchSize(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jmax_write_batch_size) {
+  auto* opts =
+      reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+  opts->max_write_batch_size = jmax_write_batch_size;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionOptions_disposeInternal(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle) {
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::TransactionOptions*>(jhandle);
+}
diff --git a/src/rocksdb/java/rocksjni/ttl.cc b/src/rocksdb/java/rocksjni/ttl.cc
new file mode 100644
index 000000000..1fe2083d9
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/ttl.cc
@@ -0,0 +1,212 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::TtlDB methods.
+// from Java side.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "include/org_rocksdb_TtlDB.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;IZ)J
+ */
+jlong Java_org_rocksdb_TtlDB_open(JNIEnv* env, jclass, jlong joptions_handle,
+                                  jstring jdb_path, jint jttl,
+                                  jboolean jread_only) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(joptions_handle);
+  ROCKSDB_NAMESPACE::DBWithTTL* db = nullptr;
+  ROCKSDB_NAMESPACE::Status s =
+      ROCKSDB_NAMESPACE::DBWithTTL::Open(*opt, db_path, &db, jttl, jread_only);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  // as TTLDB extends RocksDB on the java side, we can reuse
+  // the RocksDB portal here.
+  if (s.ok()) {
+    return GET_CPLUSPLUS_POINTER(db);
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return 0;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    openCF
+ * Signature: (JLjava/lang/String;[[B[J[IZ)[J
+ */
+jlongArray Java_org_rocksdb_TtlDB_openCF(JNIEnv* env, jclass, jlong jopt_handle,
+                                         jstring jdb_path,
+                                         jobjectArray jcolumn_names,
+                                         jlongArray jcolumn_options,
+                                         jintArray jttls, jboolean jread_only) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+
+  const jsize len_cols = env->GetArrayLength(jcolumn_names);
+  jlong* jco = env->GetLongArrayElements(jcolumn_options, nullptr);
+  if (jco == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseStringUTFChars(jdb_path, db_path);
+    return nullptr;
+  }
+
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> column_families;
+  jboolean has_exception = JNI_FALSE;
+  ROCKSDB_NAMESPACE::JniUtil::byteStrings<std::string>(
+      env, jcolumn_names,
+      [](const char* str_data, const size_t str_len) {
+        return std::string(str_data, str_len);
+      },
+      [&jco, &column_families](size_t idx, std::string cf_name) {
+        ROCKSDB_NAMESPACE::ColumnFamilyOptions* cf_options =
+            reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jco[idx]);
+        column_families.push_back(
+            ROCKSDB_NAMESPACE::ColumnFamilyDescriptor(cf_name, *cf_options));
+      },
+      &has_exception);
+
+  env->ReleaseLongArrayElements(jcolumn_options, jco, JNI_ABORT);
+
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    env->ReleaseStringUTFChars(jdb_path, db_path);
+    return nullptr;
+  }
+
+  std::vector<int32_t> ttl_values;
+  jint* jttlv = env->GetIntArrayElements(jttls, nullptr);
+  if (jttlv == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseStringUTFChars(jdb_path, db_path);
+    return nullptr;
+  }
+  const jsize len_ttls = env->GetArrayLength(jttls);
+  for (jsize i = 0; i < len_ttls; i++) {
+    ttl_values.push_back(jttlv[i]);
+  }
+  env->ReleaseIntArrayElements(jttls, jttlv, JNI_ABORT);
+
+  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jopt_handle);
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> handles;
+  ROCKSDB_NAMESPACE::DBWithTTL* db = nullptr;
+  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DBWithTTL::Open(
+      *opt, db_path, column_families, &handles, &db, ttl_values, jread_only);
+
+  // we have now finished with db_path
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  // check if open operation was successful
+  if (s.ok()) {
+    const jsize resultsLen = 1 + len_cols;  // db handle + column family handles
+    std::unique_ptr<jlong[]> results =
+        std::unique_ptr<jlong[]>(new jlong[resultsLen]);
+    results[0] = GET_CPLUSPLUS_POINTER(db);
+    for (int i = 1; i <= len_cols; i++) {
+      results[i] = GET_CPLUSPLUS_POINTER(handles[i - 1]);
+    }
+
+    jlongArray jresults = env->NewLongArray(resultsLen);
+    if (jresults == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    env->SetLongArrayRegion(jresults, 0, resultsLen, results.get());
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jresults);
+      return nullptr;
+    }
+
+    return jresults;
+  } else {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+    return NULL;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TtlDB_disposeInternal(JNIEnv*, jobject, jlong jhandle) {
+  auto* ttl_db = reinterpret_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(jhandle);
+  assert(ttl_db != nullptr);
+  delete ttl_db;
+}
+
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    closeDatabase
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TtlDB_closeDatabase(JNIEnv* /* env */, jclass,
+                                          jlong /* jhandle */) {
+  // auto* ttl_db = reinterpret_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(jhandle);
+  // assert(ttl_db != nullptr);
+  // ROCKSDB_NAMESPACE::Status s = ttl_db->Close();
+  // ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+
+  // TODO(AR) this is disabled until
+  // https://github.com/facebook/rocksdb/issues/4818 is resolved!
+}
+
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    createColumnFamilyWithTtl
+ * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;[BJI)J;
+ */
+jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl(JNIEnv* env, jobject,
+                                                       jlong jdb_handle,
+                                                       jbyteArray jcolumn_name,
+                                                       jlong jcolumn_options,
+                                                       jint jttl) {
+  jbyte* cfname = env->GetByteArrayElements(jcolumn_name, nullptr);
+  if (cfname == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  const jsize len = env->GetArrayLength(jcolumn_name);
+
+  auto* cfOptions = reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(
+      jcolumn_options);
+
+  auto* db_handle = reinterpret_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* handle;
+  ROCKSDB_NAMESPACE::Status s = db_handle->CreateColumnFamilyWithTtl(
+      *cfOptions, std::string(reinterpret_cast<char*>(cfname), len), &handle,
+      jttl);
+
+  env->ReleaseByteArrayElements(jcolumn_name, cfname, JNI_ABORT);
+
+  if (s.ok()) {
+    return GET_CPLUSPLUS_POINTER(handle);
+  }
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+  return 0;
+}
diff --git a/src/rocksdb/java/rocksjni/wal_filter.cc b/src/rocksdb/java/rocksjni/wal_filter.cc
new file mode 100644
index 000000000..24b88afed
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/wal_filter.cc
@@ -0,0 +1,24 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::WalFilter.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_AbstractWalFilter.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/wal_filter_jnicallback.h"
+
+/*
+ * Class:     org_rocksdb_AbstractWalFilter
+ * Method:    createNewWalFilter
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_AbstractWalFilter_createNewWalFilter(JNIEnv* env,
+                                                            jobject jobj) {
+  auto* wal_filter = new ROCKSDB_NAMESPACE::WalFilterJniCallback(env, jobj);
+  return GET_CPLUSPLUS_POINTER(wal_filter);
+}
diff --git a/src/rocksdb/java/rocksjni/wal_filter_jnicallback.cc b/src/rocksdb/java/rocksjni/wal_filter_jnicallback.cc
new file mode 100644
index 000000000..d2e3c9076
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/wal_filter_jnicallback.cc
@@ -0,0 +1,139 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::WalFilter.
+
+#include "rocksjni/wal_filter_jnicallback.h"
+
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+WalFilterJniCallback::WalFilterJniCallback(JNIEnv* env, jobject jwal_filter)
+    : JniCallback(env, jwal_filter) {
+  // Note: The name of a WalFilter will not change during it's lifetime,
+  // so we cache it in a global var
+  jmethodID jname_mid = AbstractWalFilterJni::getNameMethodId(env);
+  if (jname_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+  jstring jname = (jstring)env->CallObjectMethod(m_jcallback_obj, jname_mid);
+  if (env->ExceptionCheck()) {
+    // exception thrown
+    return;
+  }
+  jboolean has_exception = JNI_FALSE;
+  m_name = JniUtil::copyString(env, jname,
+                               &has_exception);  // also releases jname
+  if (has_exception == JNI_TRUE) {
+    // exception thrown
+    return;
+  }
+
+  m_column_family_log_number_map_mid =
+      AbstractWalFilterJni::getColumnFamilyLogNumberMapMethodId(env);
+  if (m_column_family_log_number_map_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  m_log_record_found_proxy_mid =
+      AbstractWalFilterJni::getLogRecordFoundProxyMethodId(env);
+  if (m_log_record_found_proxy_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+}
+
+void WalFilterJniCallback::ColumnFamilyLogNumberMap(
+    const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+    const std::map<std::string, uint32_t>& cf_name_id_map) {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  if (env == nullptr) {
+    return;
+  }
+
+  jobject jcf_lognumber_map =
+      ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(env, &cf_lognumber_map);
+  if (jcf_lognumber_map == nullptr) {
+    // exception occurred
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  jobject jcf_name_id_map =
+      ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(env, &cf_name_id_map);
+  if (jcf_name_id_map == nullptr) {
+    // exception occurred
+    env->ExceptionDescribe();  // print out exception to stderr
+    env->DeleteLocalRef(jcf_lognumber_map);
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  env->CallVoidMethod(m_jcallback_obj, m_column_family_log_number_map_mid,
+                      jcf_lognumber_map, jcf_name_id_map);
+
+  env->DeleteLocalRef(jcf_lognumber_map);
+  env->DeleteLocalRef(jcf_name_id_map);
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallVoidMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+  }
+
+  releaseJniEnv(attached_thread);
+}
+
+WalFilter::WalProcessingOption WalFilterJniCallback::LogRecordFound(
+    unsigned long long log_number, const std::string& log_file_name,
+    const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  if (env == nullptr) {
+    return WalFilter::WalProcessingOption::kCorruptedRecord;
+  }
+
+  jstring jlog_file_name = JniUtil::toJavaString(env, &log_file_name);
+  if (jlog_file_name == nullptr) {
+    // exception occcurred
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return WalFilter::WalProcessingOption::kCorruptedRecord;
+  }
+
+  jshort jlog_record_found_result = env->CallShortMethod(
+      m_jcallback_obj, m_log_record_found_proxy_mid,
+      static_cast<jlong>(log_number), jlog_file_name,
+      GET_CPLUSPLUS_POINTER(&batch), GET_CPLUSPLUS_POINTER(new_batch));
+
+  env->DeleteLocalRef(jlog_file_name);
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallShortMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return WalFilter::WalProcessingOption::kCorruptedRecord;
+  }
+
+  // unpack WalProcessingOption and batch_changed from jlog_record_found_result
+  jbyte jwal_processing_option_value = (jlog_record_found_result >> 8) & 0xFF;
+  jbyte jbatch_changed_value = jlog_record_found_result & 0xFF;
+
+  releaseJniEnv(attached_thread);
+
+  *batch_changed = jbatch_changed_value == JNI_TRUE;
+
+  return WalProcessingOptionJni::toCppWalProcessingOption(
+      jwal_processing_option_value);
+}
+
+const char* WalFilterJniCallback::Name() const { return m_name.get(); }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/wal_filter_jnicallback.h b/src/rocksdb/java/rocksjni/wal_filter_jnicallback.h
new file mode 100644
index 000000000..5cdc65978
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/wal_filter_jnicallback.h
@@ -0,0 +1,42 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::WalFilter.
+
+#ifndef JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/wal_filter.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WalFilterJniCallback : public JniCallback, public WalFilter {
+ public:
+  WalFilterJniCallback(JNIEnv* env, jobject jwal_filter);
+  virtual void ColumnFamilyLogNumberMap(
+      const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+      const std::map<std::string, uint32_t>& cf_name_id_map);
+  virtual WalFilter::WalProcessingOption LogRecordFound(
+      unsigned long long log_number, const std::string& log_file_name,
+      const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed);
+  virtual const char* Name() const;
+
+ private:
+  std::unique_ptr<const char[]> m_name;
+  jmethodID m_column_family_log_number_map_mid;
+  jmethodID m_log_record_found_proxy_mid;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/write_batch.cc b/src/rocksdb/java/rocksjni/write_batch.cc
new file mode 100644
index 000000000..6704e4a7e
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/write_batch.cc
@@ -0,0 +1,676 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::WriteBatch methods from Java side.
+#include "rocksdb/write_batch.h"
+
+#include <memory>
+
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "include/org_rocksdb_WriteBatch.h"
+#include "include/org_rocksdb_WriteBatch_Handler.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+#include "rocksjni/writebatchhandlerjnicallback.h"
+#include "table/scoped_arena_iterator.h"
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    newWriteBatch
+ * Signature: (I)J
+ */
+jlong Java_org_rocksdb_WriteBatch_newWriteBatch__I(JNIEnv* /*env*/,
+                                                   jclass /*jcls*/,
+                                                   jint jreserved_bytes) {
+  auto* wb =
+      new ROCKSDB_NAMESPACE::WriteBatch(static_cast<size_t>(jreserved_bytes));
+  return GET_CPLUSPLUS_POINTER(wb);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    newWriteBatch
+ * Signature: ([BI)J
+ */
+jlong Java_org_rocksdb_WriteBatch_newWriteBatch___3BI(JNIEnv* env,
+                                                      jclass /*jcls*/,
+                                                      jbyteArray jserialized,
+                                                      jint jserialized_length) {
+  jboolean has_exception = JNI_FALSE;
+  std::string serialized = ROCKSDB_NAMESPACE::JniUtil::byteString<std::string>(
+      env, jserialized, jserialized_length,
+      [](const char* str, const size_t len) { return std::string(str, len); },
+      &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return 0;
+  }
+
+  auto* wb = new ROCKSDB_NAMESPACE::WriteBatch(serialized);
+  return GET_CPLUSPLUS_POINTER(wb);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    count0
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                        jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return static_cast<jint>(wb->Count());
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    clear0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                        jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  wb->Clear();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    setSavePoint0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_setSavePoint0(JNIEnv* /*env*/,
+                                               jobject /*jobj*/,
+                                               jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  wb->SetSavePoint();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    rollbackToSavePoint0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_rollbackToSavePoint0(JNIEnv* env,
+                                                      jobject /*jobj*/,
+                                                      jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  auto s = wb->RollbackToSavePoint();
+
+  if (s.ok()) {
+    return;
+  }
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    popSavePoint
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_popSavePoint(JNIEnv* env, jobject /*jobj*/,
+                                              jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  auto s = wb->PopSavePoint();
+
+  if (s.ok()) {
+    return;
+  }
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    setMaxBytes
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_WriteBatch_setMaxBytes(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong jwb_handle,
+                                             jlong jmax_bytes) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  wb->SetMaxBytes(static_cast<size_t>(jmax_bytes));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    put
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_put__J_3BI_3BI(JNIEnv* env, jobject jobj,
+                                                jlong jwb_handle,
+                                                jbyteArray jkey, jint jkey_len,
+                                                jbyteArray jentry_value,
+                                                jint jentry_value_len) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto put = [&wb](ROCKSDB_NAMESPACE::Slice key,
+                   ROCKSDB_NAMESPACE::Slice value) {
+    return wb->Put(key, value);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jobj, jkey, jkey_len,
+                                        jentry_value, jentry_value_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    put
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_put__J_3BI_3BIJ(
+    JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto put = [&wb, &cf_handle](ROCKSDB_NAMESPACE::Slice key,
+                               ROCKSDB_NAMESPACE::Slice value) {
+    return wb->Put(cf_handle, key, value);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jobj, jkey, jkey_len,
+                                        jentry_value, jentry_value_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    putDirect
+ * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_putDirect(JNIEnv* env, jobject /*jobj*/,
+                                           jlong jwb_handle, jobject jkey,
+                                           jint jkey_offset, jint jkey_len,
+                                           jobject jval, jint jval_offset,
+                                           jint jval_len, jlong jcf_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto put = [&wb, &cf_handle](ROCKSDB_NAMESPACE::Slice& key,
+                               ROCKSDB_NAMESPACE::Slice& value) {
+    if (cf_handle == nullptr) {
+      wb->Put(key, value);
+    } else {
+      wb->Put(cf_handle, key, value);
+    }
+  };
+  ROCKSDB_NAMESPACE::JniUtil::kv_op_direct(
+      put, env, jkey, jkey_offset, jkey_len, jval, jval_offset, jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    merge
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BI(
+    JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto merge = [&wb](ROCKSDB_NAMESPACE::Slice key,
+                     ROCKSDB_NAMESPACE::Slice value) {
+    return wb->Merge(key, value);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len,
+                                        jentry_value, jentry_value_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    merge
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BIJ(
+    JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto merge = [&wb, &cf_handle](ROCKSDB_NAMESPACE::Slice key,
+                                 ROCKSDB_NAMESPACE::Slice value) {
+    return wb->Merge(cf_handle, key, value);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len,
+                                        jentry_value, jentry_value_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    delete
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_delete__J_3BI(JNIEnv* env, jobject jobj,
+                                               jlong jwb_handle,
+                                               jbyteArray jkey, jint jkey_len) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto remove = [&wb](ROCKSDB_NAMESPACE::Slice key) { return wb->Delete(key); };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jobj, jkey, jkey_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    delete
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_delete__J_3BIJ(JNIEnv* env, jobject jobj,
+                                                jlong jwb_handle,
+                                                jbyteArray jkey, jint jkey_len,
+                                                jlong jcf_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto remove = [&wb, &cf_handle](ROCKSDB_NAMESPACE::Slice key) {
+    return wb->Delete(cf_handle, key);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jobj, jkey, jkey_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    singleDelete
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_singleDelete__J_3BI(JNIEnv* env, jobject jobj,
+                                                     jlong jwb_handle,
+                                                     jbyteArray jkey,
+                                                     jint jkey_len) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto single_delete = [&wb](ROCKSDB_NAMESPACE::Slice key) {
+    return wb->SingleDelete(key);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jobj, jkey,
+                                       jkey_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    singleDelete
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_singleDelete__J_3BIJ(JNIEnv* env, jobject jobj,
+                                                      jlong jwb_handle,
+                                                      jbyteArray jkey,
+                                                      jint jkey_len,
+                                                      jlong jcf_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto single_delete = [&wb, &cf_handle](ROCKSDB_NAMESPACE::Slice key) {
+    return wb->SingleDelete(cf_handle, key);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jobj, jkey,
+                                       jkey_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    deleteDirect
+ * Signature: (JLjava/nio/ByteBuffer;IIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_deleteDirect(JNIEnv* env, jobject /*jobj*/,
+                                              jlong jwb_handle, jobject jkey,
+                                              jint jkey_offset, jint jkey_len,
+                                              jlong jcf_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto remove = [&wb, &cf_handle](ROCKSDB_NAMESPACE::Slice& key) {
+    if (cf_handle == nullptr) {
+      wb->Delete(key);
+    } else {
+      wb->Delete(cf_handle, key);
+    }
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_direct(remove, env, jkey, jkey_offset,
+                                          jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    deleteRange
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BI(
+    JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jbegin_key,
+    jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto deleteRange = [&wb](ROCKSDB_NAMESPACE::Slice beginKey,
+                           ROCKSDB_NAMESPACE::Slice endKey) {
+    return wb->DeleteRange(beginKey, endKey);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key,
+                                        jbegin_key_len, jend_key, jend_key_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    deleteRange
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BIJ(
+    JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jbegin_key,
+    jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len,
+    jlong jcf_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto deleteRange = [&wb, &cf_handle](ROCKSDB_NAMESPACE::Slice beginKey,
+                                       ROCKSDB_NAMESPACE::Slice endKey) {
+    return wb->DeleteRange(cf_handle, beginKey, endKey);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key,
+                                        jbegin_key_len, jend_key, jend_key_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    putLogData
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_putLogData(JNIEnv* env, jobject jobj,
+                                            jlong jwb_handle, jbyteArray jblob,
+                                            jint jblob_len) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto putLogData = [&wb](ROCKSDB_NAMESPACE::Slice blob) {
+    return wb->PutLogData(blob);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    iterate
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_WriteBatch_iterate(JNIEnv* env, jobject /*jobj*/,
+                                         jlong jwb_handle,
+                                         jlong handlerHandle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  ROCKSDB_NAMESPACE::Status s = wb->Iterate(
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchHandlerJniCallback*>(
+          handlerHandle));
+
+  if (s.ok()) {
+    return;
+  }
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    data
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_WriteBatch_data(JNIEnv* env, jobject /*jobj*/,
+                                            jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  auto data = wb->Data();
+  return ROCKSDB_NAMESPACE::JniUtil::copyBytes(env, data);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    getDataSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_WriteBatch_getDataSize(JNIEnv* /*env*/, jobject /*jobj*/,
+                                              jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  auto data_size = wb->GetDataSize();
+  return static_cast<jlong>(data_size);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    hasPut
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteBatch_hasPut(JNIEnv* /*env*/, jobject /*jobj*/,
+                                            jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return wb->HasPut();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    hasDelete
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteBatch_hasDelete(JNIEnv* /*env*/,
+                                               jobject /*jobj*/,
+                                               jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return wb->HasDelete();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    hasSingleDelete
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasSingleDelete(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return wb->HasSingleDelete();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    hasDeleteRange
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasDeleteRange(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return wb->HasDeleteRange();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    hasMerge
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasMerge(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return wb->HasMerge();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    hasBeginPrepare
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasBeginPrepare(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return wb->HasBeginPrepare();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    hasEndPrepare
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasEndPrepare(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return wb->HasEndPrepare();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    hasCommit
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasCommit(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return wb->HasCommit();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    hasRollback
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasRollback(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return wb->HasRollback();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    markWalTerminationPoint
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_markWalTerminationPoint(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  wb->MarkWalTerminationPoint();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    getWalTerminationPoint
+ * Signature: (J)Lorg/rocksdb/WriteBatch/SavePoint;
+ */
+jobject Java_org_rocksdb_WriteBatch_getWalTerminationPoint(JNIEnv* env,
+                                                           jobject /*jobj*/,
+                                                           jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  auto save_point = wb->GetWalTerminationPoint();
+  return ROCKSDB_NAMESPACE::WriteBatchSavePointJni::construct(env, save_point);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_disposeInternal(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(handle);
+  assert(wb != nullptr);
+  delete wb;
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch_Handler
+ * Method:    createNewHandler0
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_WriteBatch_00024Handler_createNewHandler0(JNIEnv* env,
+                                                                 jobject jobj) {
+  auto* wbjnic = new ROCKSDB_NAMESPACE::WriteBatchHandlerJniCallback(env, jobj);
+  return GET_CPLUSPLUS_POINTER(wbjnic);
+}
diff --git a/src/rocksdb/java/rocksjni/write_batch_test.cc b/src/rocksdb/java/rocksjni/write_batch_test.cc
new file mode 100644
index 000000000..30b9a7229
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/write_batch_test.cc
@@ -0,0 +1,199 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::WriteBatch methods testing from Java side.
+#include "rocksdb/write_batch.h"
+
+#include <memory>
+
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "include/org_rocksdb_WriteBatch.h"
+#include "include/org_rocksdb_WriteBatchTest.h"
+#include "include/org_rocksdb_WriteBatchTestInternalHelper.h"
+#include "include/org_rocksdb_WriteBatch_Handler.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "rocksjni/portal.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+
+/*
+ * Class:     org_rocksdb_WriteBatchTest
+ * Method:    getContents
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env,
+                                                       jclass /*jclazz*/,
+                                                       jlong jwb_handle) {
+  auto* b = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(b != nullptr);
+
+  // todo: Currently the following code is directly copied from
+  // db/write_bench_test.cc.  It could be implemented in java once
+  // all the necessary components can be accessed via jni api.
+
+  ROCKSDB_NAMESPACE::InternalKeyComparator cmp(
+      ROCKSDB_NAMESPACE::BytewiseComparator());
+  auto factory = std::make_shared<ROCKSDB_NAMESPACE::SkipListFactory>();
+  ROCKSDB_NAMESPACE::Options options;
+  ROCKSDB_NAMESPACE::WriteBufferManager wb(options.db_write_buffer_size);
+  options.memtable_factory = factory;
+  ROCKSDB_NAMESPACE::MemTable* mem = new ROCKSDB_NAMESPACE::MemTable(
+      cmp, ROCKSDB_NAMESPACE::ImmutableOptions(options),
+      ROCKSDB_NAMESPACE::MutableCFOptions(options), &wb,
+      ROCKSDB_NAMESPACE::kMaxSequenceNumber, 0 /* column_family_id */);
+  mem->Ref();
+  std::string state;
+  ROCKSDB_NAMESPACE::ColumnFamilyMemTablesDefault cf_mems_default(mem);
+  ROCKSDB_NAMESPACE::Status s =
+      ROCKSDB_NAMESPACE::WriteBatchInternal::InsertInto(b, &cf_mems_default,
+                                                        nullptr, nullptr);
+  unsigned int count = 0;
+  ROCKSDB_NAMESPACE::Arena arena;
+  ROCKSDB_NAMESPACE::ScopedArenaIterator iter(
+      mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(), &arena));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ROCKSDB_NAMESPACE::ParsedInternalKey ikey;
+    ikey.clear();
+    ROCKSDB_NAMESPACE::Status pik_status = ROCKSDB_NAMESPACE::ParseInternalKey(
+        iter->key(), &ikey, true /* log_err_key */);
+    pik_status.PermitUncheckedError();
+    assert(pik_status.ok());
+    switch (ikey.type) {
+      case ROCKSDB_NAMESPACE::kTypeValue:
+        state.append("Put(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case ROCKSDB_NAMESPACE::kTypeMerge:
+        state.append("Merge(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case ROCKSDB_NAMESPACE::kTypeDeletion:
+        state.append("Delete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+      case ROCKSDB_NAMESPACE::kTypeSingleDeletion:
+        state.append("SingleDelete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+      case ROCKSDB_NAMESPACE::kTypeRangeDeletion:
+        state.append("DeleteRange(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case ROCKSDB_NAMESPACE::kTypeLogData:
+        state.append("LogData(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+      default:
+        assert(false);
+        state.append("Err:Expected(");
+        state.append(std::to_string(ikey.type));
+        state.append(")");
+        count++;
+        break;
+    }
+    state.append("@");
+    state.append(std::to_string(ikey.sequence));
+  }
+  if (!s.ok()) {
+    state.append(s.ToString());
+  } else if (ROCKSDB_NAMESPACE::WriteBatchInternal::Count(b) != count) {
+    state.append("Err:CountMismatch(expected=");
+    state.append(
+        std::to_string(ROCKSDB_NAMESPACE::WriteBatchInternal::Count(b)));
+    state.append(", actual=");
+    state.append(std::to_string(count));
+    state.append(")");
+  }
+  delete mem->Unref();
+
+  jbyteArray jstate = env->NewByteArray(static_cast<jsize>(state.size()));
+  if (jstate == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  env->SetByteArrayRegion(
+      jstate, 0, static_cast<jsize>(state.size()),
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(state.c_str())));
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jstate);
+    return nullptr;
+  }
+
+  return jstate;
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchTestInternalHelper
+ * Method:    setSequence
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_WriteBatchTestInternalHelper_setSequence(
+    JNIEnv* /*env*/, jclass /*jclazz*/, jlong jwb_handle, jlong jsn) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  ROCKSDB_NAMESPACE::WriteBatchInternal::SetSequence(
+      wb, static_cast<ROCKSDB_NAMESPACE::SequenceNumber>(jsn));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchTestInternalHelper
+ * Method:    sequence
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_WriteBatchTestInternalHelper_sequence(JNIEnv* /*env*/,
+                                                             jclass /*jclazz*/,
+                                                             jlong jwb_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+
+  return static_cast<jlong>(
+      ROCKSDB_NAMESPACE::WriteBatchInternal::Sequence(wb));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchTestInternalHelper
+ * Method:    append
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_WriteBatchTestInternalHelper_append(JNIEnv* /*env*/,
+                                                          jclass /*jclazz*/,
+                                                          jlong jwb_handle_1,
+                                                          jlong jwb_handle_2) {
+  auto* wb1 = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle_1);
+  assert(wb1 != nullptr);
+  auto* wb2 = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle_2);
+  assert(wb2 != nullptr);
+
+  ROCKSDB_NAMESPACE::WriteBatchInternal::Append(wb1, wb2);
+}
diff --git a/src/rocksdb/java/rocksjni/write_batch_with_index.cc b/src/rocksdb/java/rocksjni/write_batch_with_index.cc
new file mode 100644
index 000000000..a5c3216cb
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/write_batch_with_index.cc
@@ -0,0 +1,953 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ ROCKSDB_NAMESPACE::WriteBatchWithIndex methods from Java side.
+
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+#include "include/org_rocksdb_WBWIRocksIterator.h"
+#include "include/org_rocksdb_WriteBatchWithIndex.h"
+#include "rocksdb/comparator.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    newWriteBatchWithIndex
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__(
+    JNIEnv* /*env*/, jclass /*jcls*/) {
+  auto* wbwi = new ROCKSDB_NAMESPACE::WriteBatchWithIndex();
+  return GET_CPLUSPLUS_POINTER(wbwi);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    newWriteBatchWithIndex
+ * Signature: (Z)J
+ */
+jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__Z(
+    JNIEnv* /*env*/, jclass /*jcls*/, jboolean joverwrite_key) {
+  auto* wbwi = new ROCKSDB_NAMESPACE::WriteBatchWithIndex(
+      ROCKSDB_NAMESPACE::BytewiseComparator(), 0,
+      static_cast<bool>(joverwrite_key));
+  return GET_CPLUSPLUS_POINTER(wbwi);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    newWriteBatchWithIndex
+ * Signature: (JBIZ)J
+ */
+jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JBIZ(
+    JNIEnv* /*env*/, jclass /*jcls*/, jlong jfallback_index_comparator_handle,
+    jbyte jcomparator_type, jint jreserved_bytes, jboolean joverwrite_key) {
+  ROCKSDB_NAMESPACE::Comparator* fallback_comparator = nullptr;
+  switch (jcomparator_type) {
+    // JAVA_COMPARATOR
+    case 0x0:
+      fallback_comparator =
+          reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback*>(
+              jfallback_index_comparator_handle);
+      break;
+
+    // JAVA_NATIVE_COMPARATOR_WRAPPER
+    case 0x1:
+      fallback_comparator = reinterpret_cast<ROCKSDB_NAMESPACE::Comparator*>(
+          jfallback_index_comparator_handle);
+      break;
+  }
+  auto* wbwi = new ROCKSDB_NAMESPACE::WriteBatchWithIndex(
+      fallback_comparator, static_cast<size_t>(jreserved_bytes),
+      static_cast<bool>(joverwrite_key));
+  return GET_CPLUSPLUS_POINTER(wbwi);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    count0
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_WriteBatchWithIndex_count0(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jwbwi_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+
+  return static_cast<jint>(wbwi->GetWriteBatch()->Count());
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    put
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BI(
+    JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey,
+    jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto put = [&wbwi](ROCKSDB_NAMESPACE::Slice key,
+                     ROCKSDB_NAMESPACE::Slice value) {
+    return wbwi->Put(key, value);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jobj, jkey, jkey_len,
+                                        jentry_value, jentry_value_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    put
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BIJ(
+    JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey,
+    jint jkey_len, jbyteArray jentry_value, jint jentry_value_len,
+    jlong jcf_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto put = [&wbwi, &cf_handle](ROCKSDB_NAMESPACE::Slice key,
+                                 ROCKSDB_NAMESPACE::Slice value) {
+    return wbwi->Put(cf_handle, key, value);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(put, env, jobj, jkey, jkey_len,
+                                        jentry_value, jentry_value_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    putDirect
+ * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;IIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_putDirect(
+    JNIEnv* env, jobject /*jobj*/, jlong jwb_handle, jobject jkey,
+    jint jkey_offset, jint jkey_len, jobject jval, jint jval_offset,
+    jint jval_len, jlong jcf_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto put = [&wb, &cf_handle](ROCKSDB_NAMESPACE::Slice& key,
+                               ROCKSDB_NAMESPACE::Slice& value) {
+    if (cf_handle == nullptr) {
+      wb->Put(key, value);
+    } else {
+      wb->Put(cf_handle, key, value);
+    }
+  };
+  ROCKSDB_NAMESPACE::JniUtil::kv_op_direct(
+      put, env, jkey, jkey_offset, jkey_len, jval, jval_offset, jval_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    merge
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BI(
+    JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey,
+    jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto merge = [&wbwi](ROCKSDB_NAMESPACE::Slice key,
+                       ROCKSDB_NAMESPACE::Slice value) {
+    return wbwi->Merge(key, value);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len,
+                                        jentry_value, jentry_value_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    merge
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BIJ(
+    JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey,
+    jint jkey_len, jbyteArray jentry_value, jint jentry_value_len,
+    jlong jcf_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto merge = [&wbwi, &cf_handle](ROCKSDB_NAMESPACE::Slice key,
+                                   ROCKSDB_NAMESPACE::Slice value) {
+    return wbwi->Merge(cf_handle, key, value);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len,
+                                        jentry_value, jentry_value_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    delete
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BI(JNIEnv* env,
+                                                        jobject jobj,
+                                                        jlong jwbwi_handle,
+                                                        jbyteArray jkey,
+                                                        jint jkey_len) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto remove = [&wbwi](ROCKSDB_NAMESPACE::Slice key) {
+    return wbwi->Delete(key);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jobj, jkey, jkey_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    delete
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BIJ(
+    JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey,
+    jint jkey_len, jlong jcf_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto remove = [&wbwi, &cf_handle](ROCKSDB_NAMESPACE::Slice key) {
+    return wbwi->Delete(cf_handle, key);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(remove, env, jobj, jkey, jkey_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    singleDelete
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BI(
+    JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey,
+    jint jkey_len) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto single_delete = [&wbwi](ROCKSDB_NAMESPACE::Slice key) {
+    return wbwi->SingleDelete(key);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jobj, jkey,
+                                       jkey_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    singleDelete
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BIJ(
+    JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey,
+    jint jkey_len, jlong jcf_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto single_delete = [&wbwi, &cf_handle](ROCKSDB_NAMESPACE::Slice key) {
+    return wbwi->SingleDelete(cf_handle, key);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(single_delete, env, jobj, jkey,
+                                       jkey_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    deleteDirect
+ * Signature: (JLjava/nio/ByteBuffer;IIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_deleteDirect(
+    JNIEnv* env, jobject /*jobj*/, jlong jwb_handle, jobject jkey,
+    jint jkey_offset, jint jkey_len, jlong jcf_handle) {
+  auto* wb = reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatch*>(jwb_handle);
+  assert(wb != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto remove = [&wb, &cf_handle](ROCKSDB_NAMESPACE::Slice& key) {
+    if (cf_handle == nullptr) {
+      wb->Delete(key);
+    } else {
+      wb->Delete(cf_handle, key);
+    }
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_direct(remove, env, jkey, jkey_offset,
+                                          jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    deleteRange
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BI(
+    JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jbegin_key,
+    jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto deleteRange = [&wbwi](ROCKSDB_NAMESPACE::Slice beginKey,
+                             ROCKSDB_NAMESPACE::Slice endKey) {
+    return wbwi->DeleteRange(beginKey, endKey);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key,
+                                        jbegin_key_len, jend_key, jend_key_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    deleteRange
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BIJ(
+    JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jbegin_key,
+    jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len,
+    jlong jcf_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto deleteRange = [&wbwi, &cf_handle](ROCKSDB_NAMESPACE::Slice beginKey,
+                                         ROCKSDB_NAMESPACE::Slice endKey) {
+    return wbwi->DeleteRange(cf_handle, beginKey, endKey);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key,
+                                        jbegin_key_len, jend_key, jend_key_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    putLogData
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_putLogData(JNIEnv* env, jobject jobj,
+                                                     jlong jwbwi_handle,
+                                                     jbyteArray jblob,
+                                                     jint jblob_len) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+  auto putLogData = [&wbwi](ROCKSDB_NAMESPACE::Slice blob) {
+    return wbwi->PutLogData(blob);
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+      ROCKSDB_NAMESPACE::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len);
+  if (status != nullptr && !status->ok()) {
+    ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, status);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    clear
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_clear0(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jwbwi_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+
+  wbwi->Clear();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    setSavePoint0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_setSavePoint0(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong jwbwi_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+
+  wbwi->SetSavePoint();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    rollbackToSavePoint0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_rollbackToSavePoint0(
+    JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+
+  auto s = wbwi->RollbackToSavePoint();
+
+  if (s.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    popSavePoint
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_popSavePoint(JNIEnv* env,
+                                                       jobject /*jobj*/,
+                                                       jlong jwbwi_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+
+  auto s = wbwi->PopSavePoint();
+
+  if (s.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    setMaxBytes
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_setMaxBytes(JNIEnv* /*env*/,
+                                                      jobject /*jobj*/,
+                                                      jlong jwbwi_handle,
+                                                      jlong jmax_bytes) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+
+  wbwi->SetMaxBytes(static_cast<size_t>(jmax_bytes));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    getWriteBatch
+ * Signature: (J)Lorg/rocksdb/WriteBatch;
+ */
+jobject Java_org_rocksdb_WriteBatchWithIndex_getWriteBatch(JNIEnv* env,
+                                                           jobject /*jobj*/,
+                                                           jlong jwbwi_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  assert(wbwi != nullptr);
+
+  auto* wb = wbwi->GetWriteBatch();
+
+  // TODO(AR) is the `wb` object owned by us?
+  return ROCKSDB_NAMESPACE::WriteBatchJni::construct(env, wb);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    iterator0
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0(JNIEnv* /*env*/,
+                                                     jobject /*jobj*/,
+                                                     jlong jwbwi_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  auto* wbwi_iterator = wbwi->NewIterator();
+  return GET_CPLUSPLUS_POINTER(wbwi_iterator);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    iterator1
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1(JNIEnv* /*env*/,
+                                                     jobject /*jobj*/,
+                                                     jlong jwbwi_handle,
+                                                     jlong jcf_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto* wbwi_iterator = wbwi->NewIterator(cf_handle);
+  return GET_CPLUSPLUS_POINTER(wbwi_iterator);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    iteratorWithBase
+ * Signature: (JJJJ)J
+ */
+jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase(
+    JNIEnv*, jobject, jlong jwbwi_handle, jlong jcf_handle,
+    jlong jbase_iterator_handle, jlong jread_opts_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+  auto* base_iterator =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Iterator*>(jbase_iterator_handle);
+  ROCKSDB_NAMESPACE::ReadOptions* read_opts =
+      jread_opts_handle == 0
+          ? nullptr
+          : reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(
+                jread_opts_handle);
+  auto* iterator =
+      wbwi->NewIteratorWithBase(cf_handle, base_iterator, read_opts);
+  return GET_CPLUSPLUS_POINTER(iterator);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    getFromBatch
+ * Signature: (JJ[BI)[B
+ */
+jbyteArray JNICALL Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdbopt_handle,
+    jbyteArray jkey, jint jkey_len) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  auto* dbopt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdbopt_handle);
+
+  auto getter = [&wbwi, &dbopt](const ROCKSDB_NAMESPACE::Slice& key,
+                                std::string* value) {
+    return wbwi->GetFromBatch(*dbopt, key, value);
+  };
+
+  return ROCKSDB_NAMESPACE::JniUtil::v_op(getter, env, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    getFromBatch
+ * Signature: (JJ[BIJ)[B
+ */
+jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdbopt_handle,
+    jbyteArray jkey, jint jkey_len, jlong jcf_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  auto* dbopt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdbopt_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+
+  auto getter = [&wbwi, &cf_handle, &dbopt](const ROCKSDB_NAMESPACE::Slice& key,
+                                            std::string* value) {
+    return wbwi->GetFromBatch(cf_handle, *dbopt, key, value);
+  };
+
+  return ROCKSDB_NAMESPACE::JniUtil::v_op(getter, env, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    getFromBatchAndDB
+ * Signature: (JJJ[BI)[B
+ */
+jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdb_handle,
+    jlong jreadopt_handle, jbyteArray jkey, jint jkey_len) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* readopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jreadopt_handle);
+
+  auto getter = [&wbwi, &db, &readopt](const ROCKSDB_NAMESPACE::Slice& key,
+                                       std::string* value) {
+    return wbwi->GetFromBatchAndDB(db, *readopt, key, value);
+  };
+
+  return ROCKSDB_NAMESPACE::JniUtil::v_op(getter, env, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    getFromBatchAndDB
+ * Signature: (JJJ[BIJ)[B
+ */
+jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdb_handle,
+    jlong jreadopt_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(jwbwi_handle);
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  auto* readopt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jreadopt_handle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+
+  auto getter = [&wbwi, &db, &cf_handle, &readopt](
+                    const ROCKSDB_NAMESPACE::Slice& key, std::string* value) {
+    return wbwi->GetFromBatchAndDB(db, *readopt, cf_handle, key, value);
+  };
+
+  return ROCKSDB_NAMESPACE::JniUtil::v_op(getter, env, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal(JNIEnv* /*env*/,
+                                                          jobject /*jobj*/,
+                                                          jlong handle) {
+  auto* wbwi =
+      reinterpret_cast<ROCKSDB_NAMESPACE::WriteBatchWithIndex*>(handle);
+  assert(wbwi != nullptr);
+  delete wbwi;
+}
+
+/* WBWIRocksIterator below */
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_disposeInternal(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle);
+  assert(it != nullptr);
+  delete it;
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    isValid0
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0(JNIEnv* /*env*/,
+                                                     jobject /*jobj*/,
+                                                     jlong handle) {
+  return reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle)->Valid();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seekToFirst0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seekToFirst0(JNIEnv* /*env*/,
+                                                     jobject /*jobj*/,
+                                                     jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle)->SeekToFirst();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seekToLast0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seekToLast0(JNIEnv* /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle)->SeekToLast();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    next0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_next0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                              jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle)->Next();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    prev0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_prev0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                              jlong handle) {
+  reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle)->Prev();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seek0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seek0(JNIEnv* env, jobject /*jobj*/,
+                                              jlong handle, jbyteArray jtarget,
+                                              jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle);
+  jbyte* target = new jbyte[jtarget_len];
+  env->GetByteArrayRegion(jtarget, 0, jtarget_len, target);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] target;
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::Slice target_slice(reinterpret_cast<char*>(target),
+                                        jtarget_len);
+
+  it->Seek(target_slice);
+
+  delete[] target;
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seekDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seekDirect0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle);
+  auto seek = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->Seek(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_direct(seek, env, jtarget, jtarget_off,
+                                          jtarget_len);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ *
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seekByteArray0
+ * Signature: (J[BII)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seekByteArray0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  const std::unique_ptr<char[]> target(new char[jtarget_len]);
+  if (target == nullptr) {
+    jclass oom_class = env->FindClass("/lang/java/OutOfMemoryError");
+    env->ThrowNew(oom_class,
+                  "Memory allocation failed in RocksDB JNI function");
+    return;
+  }
+  env->GetByteArrayRegion(jtarget, jtarget_off, jtarget_len,
+                          reinterpret_cast<jbyte*>(target.get()));
+
+  ROCKSDB_NAMESPACE::Slice target_slice(target.get(), jtarget_len);
+
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle);
+  it->Seek(target_slice);
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seekForPrev0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seekForPrev0(JNIEnv* env,
+                                                     jobject /*jobj*/,
+                                                     jlong handle,
+                                                     jbyteArray jtarget,
+                                                     jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle);
+  jbyte* target = new jbyte[jtarget_len];
+  env->GetByteArrayRegion(jtarget, 0, jtarget_len, target);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete[] target;
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::Slice target_slice(reinterpret_cast<char*>(target),
+                                        jtarget_len);
+
+  it->SeekForPrev(target_slice);
+
+  delete[] target;
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seekForPrevDirect0
+ * Signature: (JLjava/nio/ByteBuffer;II)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seekForPrevDirect0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jobject jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle);
+  auto seek_for_prev = [&it](ROCKSDB_NAMESPACE::Slice& target_slice) {
+    it->SeekForPrev(target_slice);
+  };
+  ROCKSDB_NAMESPACE::JniUtil::k_op_direct(seek_for_prev, env, jtarget,
+                                          jtarget_off, jtarget_len);
+}
+
+/*
+ * This method supports fetching into indirect byte buffers;
+ * the Java wrapper extracts the byte[] and passes it here.
+ *
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seekForPrevByteArray0
+ * Signature: (J[BII)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seekForPrevByteArray0(
+    JNIEnv* env, jobject /*jobj*/, jlong handle, jbyteArray jtarget,
+    jint jtarget_off, jint jtarget_len) {
+  const std::unique_ptr<char[]> target(new char[jtarget_len]);
+  if (target == nullptr) {
+    jclass oom_class = env->FindClass("/lang/java/OutOfMemoryError");
+    env->ThrowNew(oom_class,
+                  "Memory allocation failed in RocksDB JNI function");
+    return;
+  }
+  env->GetByteArrayRegion(jtarget, jtarget_off, jtarget_len,
+                          reinterpret_cast<jbyte*>(target.get()));
+
+  ROCKSDB_NAMESPACE::Slice target_slice(target.get(), jtarget_len);
+
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle);
+  it->SeekForPrev(target_slice);
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    status0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_status0(JNIEnv* env, jobject /*jobj*/,
+                                                jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle);
+  ROCKSDB_NAMESPACE::Status s = it->status();
+
+  if (s.ok()) {
+    return;
+  }
+
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    entry1
+ * Signature: (J)[J
+ */
+jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1(JNIEnv* env,
+                                                     jobject /*jobj*/,
+                                                     jlong handle) {
+  auto* it = reinterpret_cast<ROCKSDB_NAMESPACE::WBWIIterator*>(handle);
+  const ROCKSDB_NAMESPACE::WriteEntry& we = it->Entry();
+
+  jlong results[3];
+
+  // set the type of the write entry
+  results[0] = ROCKSDB_NAMESPACE::WriteTypeJni::toJavaWriteType(we.type);
+
+  // NOTE: key_slice and value_slice will be freed by
+  // org.rocksdb.DirectSlice#close
+
+  auto* key_slice = new ROCKSDB_NAMESPACE::Slice(we.key.data(), we.key.size());
+  results[1] = GET_CPLUSPLUS_POINTER(key_slice);
+  if (we.type == ROCKSDB_NAMESPACE::kDeleteRecord ||
+      we.type == ROCKSDB_NAMESPACE::kSingleDeleteRecord ||
+      we.type == ROCKSDB_NAMESPACE::kLogDataRecord) {
+    // set native handle of value slice to null if no value available
+    results[2] = 0;
+  } else {
+    auto* value_slice =
+        new ROCKSDB_NAMESPACE::Slice(we.value.data(), we.value.size());
+    results[2] = GET_CPLUSPLUS_POINTER(value_slice);
+  }
+
+  jlongArray jresults = env->NewLongArray(3);
+  if (jresults == nullptr) {
+    // exception thrown: OutOfMemoryError
+    if (results[2] != 0) {
+      auto* value_slice =
+          reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(results[2]);
+      delete value_slice;
+    }
+    delete key_slice;
+    return nullptr;
+  }
+
+  env->SetLongArrayRegion(jresults, 0, 3, results);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jresults);
+    if (results[2] != 0) {
+      auto* value_slice =
+          reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(results[2]);
+      delete value_slice;
+    }
+    delete key_slice;
+    return nullptr;
+  }
+
+  return jresults;
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    refresh0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_refresh0(JNIEnv* env) {
+  ROCKSDB_NAMESPACE::Status s =
+      ROCKSDB_NAMESPACE::Status::NotSupported("Refresh() is not supported");
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
diff --git a/src/rocksdb/java/rocksjni/write_buffer_manager.cc b/src/rocksdb/java/rocksjni/write_buffer_manager.cc
new file mode 100644
index 000000000..b5b7d193b
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/write_buffer_manager.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/write_buffer_manager.h"
+
+#include <jni.h>
+
+#include "include/org_rocksdb_WriteBufferManager.h"
+#include "rocksdb/cache.h"
+#include "rocksjni/cplusplus_to_java_convert.h"
+
+/*
+ * Class:     org_rocksdb_WriteBufferManager
+ * Method:    newWriteBufferManager
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_WriteBufferManager_newWriteBufferManager(
+    JNIEnv* /*env*/, jclass /*jclazz*/, jlong jbuffer_size, jlong jcache_handle,
+    jboolean allow_stall) {
+  auto* cache_ptr =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
+          jcache_handle);
+  auto* write_buffer_manager =
+      new std::shared_ptr<ROCKSDB_NAMESPACE::WriteBufferManager>(
+          std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(
+              jbuffer_size, *cache_ptr, allow_stall));
+  return GET_CPLUSPLUS_POINTER(write_buffer_manager);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBufferManager
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBufferManager_disposeInternal(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle) {
+  auto* write_buffer_manager =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::WriteBufferManager>*>(
+          jhandle);
+  assert(write_buffer_manager != nullptr);
+  delete write_buffer_manager;
+}
diff --git a/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc b/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc
new file mode 100644
index 000000000..66ceabe9a
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc
@@ -0,0 +1,519 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::Comparator.
+
+#include "rocksjni/writebatchhandlerjnicallback.h"
+
+#include "rocksjni/portal.h"
+
+namespace ROCKSDB_NAMESPACE {
+WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback(
+    JNIEnv* env, jobject jWriteBatchHandler)
+    : JniCallback(env, jWriteBatchHandler), m_env(env) {
+  m_jPutCfMethodId = WriteBatchHandlerJni::getPutCfMethodId(env);
+  if (m_jPutCfMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jPutMethodId = WriteBatchHandlerJni::getPutMethodId(env);
+  if (m_jPutMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jMergeCfMethodId = WriteBatchHandlerJni::getMergeCfMethodId(env);
+  if (m_jMergeCfMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jMergeMethodId = WriteBatchHandlerJni::getMergeMethodId(env);
+  if (m_jMergeMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jDeleteCfMethodId = WriteBatchHandlerJni::getDeleteCfMethodId(env);
+  if (m_jDeleteCfMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jDeleteMethodId = WriteBatchHandlerJni::getDeleteMethodId(env);
+  if (m_jDeleteMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jSingleDeleteCfMethodId =
+      WriteBatchHandlerJni::getSingleDeleteCfMethodId(env);
+  if (m_jSingleDeleteCfMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jSingleDeleteMethodId = WriteBatchHandlerJni::getSingleDeleteMethodId(env);
+  if (m_jSingleDeleteMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jDeleteRangeCfMethodId =
+      WriteBatchHandlerJni::getDeleteRangeCfMethodId(env);
+  if (m_jDeleteRangeCfMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jDeleteRangeMethodId = WriteBatchHandlerJni::getDeleteRangeMethodId(env);
+  if (m_jDeleteRangeMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jLogDataMethodId = WriteBatchHandlerJni::getLogDataMethodId(env);
+  if (m_jLogDataMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jPutBlobIndexCfMethodId =
+      WriteBatchHandlerJni::getPutBlobIndexCfMethodId(env);
+  if (m_jPutBlobIndexCfMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jMarkBeginPrepareMethodId =
+      WriteBatchHandlerJni::getMarkBeginPrepareMethodId(env);
+  if (m_jMarkBeginPrepareMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jMarkEndPrepareMethodId =
+      WriteBatchHandlerJni::getMarkEndPrepareMethodId(env);
+  if (m_jMarkEndPrepareMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jMarkNoopMethodId = WriteBatchHandlerJni::getMarkNoopMethodId(env);
+  if (m_jMarkNoopMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jMarkRollbackMethodId = WriteBatchHandlerJni::getMarkRollbackMethodId(env);
+  if (m_jMarkRollbackMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jMarkCommitMethodId = WriteBatchHandlerJni::getMarkCommitMethodId(env);
+  if (m_jMarkCommitMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jMarkCommitWithTimestampMethodId =
+      WriteBatchHandlerJni::getMarkCommitWithTimestampMethodId(env);
+  if (m_jMarkCommitWithTimestampMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+
+  m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env);
+  if (m_jContinueMethodId == nullptr) {
+    // exception thrown
+    return;
+  }
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::PutCF(
+    uint32_t column_family_id, const Slice& key, const Slice& value) {
+  auto put = [this, column_family_id](jbyteArray j_key, jbyteArray j_value) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jPutCfMethodId,
+                          static_cast<jint>(column_family_id), j_key, j_value);
+  };
+  auto status = WriteBatchHandlerJniCallback::kv_op(key, value, put);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) {
+  auto put = [this](jbyteArray j_key, jbyteArray j_value) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jPutMethodId, j_key, j_value);
+  };
+  WriteBatchHandlerJniCallback::kv_op(key, value, put);
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MergeCF(
+    uint32_t column_family_id, const Slice& key, const Slice& value) {
+  auto merge = [this, column_family_id](jbyteArray j_key, jbyteArray j_value) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jMergeCfMethodId,
+                          static_cast<jint>(column_family_id), j_key, j_value);
+  };
+  auto status = WriteBatchHandlerJniCallback::kv_op(key, value, merge);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) {
+  auto merge = [this](jbyteArray j_key, jbyteArray j_value) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jMergeMethodId, j_key, j_value);
+  };
+  WriteBatchHandlerJniCallback::kv_op(key, value, merge);
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::DeleteCF(
+    uint32_t column_family_id, const Slice& key) {
+  auto remove = [this, column_family_id](jbyteArray j_key) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jDeleteCfMethodId,
+                          static_cast<jint>(column_family_id), j_key);
+  };
+  auto status = WriteBatchHandlerJniCallback::k_op(key, remove);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+void WriteBatchHandlerJniCallback::Delete(const Slice& key) {
+  auto remove = [this](jbyteArray j_key) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jDeleteMethodId, j_key);
+  };
+  WriteBatchHandlerJniCallback::k_op(key, remove);
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::SingleDeleteCF(
+    uint32_t column_family_id, const Slice& key) {
+  auto singleDelete = [this, column_family_id](jbyteArray j_key) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jSingleDeleteCfMethodId,
+                          static_cast<jint>(column_family_id), j_key);
+  };
+  auto status = WriteBatchHandlerJniCallback::k_op(key, singleDelete);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+void WriteBatchHandlerJniCallback::SingleDelete(const Slice& key) {
+  auto singleDelete = [this](jbyteArray j_key) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jSingleDeleteMethodId, j_key);
+  };
+  WriteBatchHandlerJniCallback::k_op(key, singleDelete);
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::DeleteRangeCF(
+    uint32_t column_family_id, const Slice& beginKey, const Slice& endKey) {
+  auto deleteRange = [this, column_family_id](jbyteArray j_beginKey,
+                                              jbyteArray j_endKey) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jDeleteRangeCfMethodId,
+                          static_cast<jint>(column_family_id), j_beginKey,
+                          j_endKey);
+  };
+  auto status =
+      WriteBatchHandlerJniCallback::kv_op(beginKey, endKey, deleteRange);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+void WriteBatchHandlerJniCallback::DeleteRange(const Slice& beginKey,
+                                               const Slice& endKey) {
+  auto deleteRange = [this](jbyteArray j_beginKey, jbyteArray j_endKey) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jDeleteRangeMethodId, j_beginKey,
+                          j_endKey);
+  };
+  WriteBatchHandlerJniCallback::kv_op(beginKey, endKey, deleteRange);
+}
+
+void WriteBatchHandlerJniCallback::LogData(const Slice& blob) {
+  auto logData = [this](jbyteArray j_blob) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jLogDataMethodId, j_blob);
+  };
+  WriteBatchHandlerJniCallback::k_op(blob, logData);
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::PutBlobIndexCF(
+    uint32_t column_family_id, const Slice& key, const Slice& value) {
+  auto putBlobIndex = [this, column_family_id](jbyteArray j_key,
+                                               jbyteArray j_value) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jPutBlobIndexCfMethodId,
+                          static_cast<jint>(column_family_id), j_key, j_value);
+  };
+  auto status = WriteBatchHandlerJniCallback::kv_op(key, value, putBlobIndex);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkBeginPrepare(
+    bool unprepare) {
+#ifndef DEBUG
+  (void)unprepare;
+#else
+  assert(!unprepare);
+#endif
+  m_env->CallVoidMethod(m_jcallback_obj, m_jMarkBeginPrepareMethodId);
+
+  // check for Exception, in-particular RocksDBException
+  if (m_env->ExceptionCheck()) {
+    // exception thrown
+    jthrowable exception = m_env->ExceptionOccurred();
+    std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+        ROCKSDB_NAMESPACE::RocksDBExceptionJni::toCppStatus(m_env, exception);
+    if (status == nullptr) {
+      // unkown status or exception occurred extracting status
+      m_env->ExceptionDescribe();
+      return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) probably need a
+                                               // better error code here
+
+    } else {
+      m_env->ExceptionClear();  // clear the exception, as we have extracted the
+                                // status
+      return ROCKSDB_NAMESPACE::Status(*status);
+    }
+  }
+
+  return ROCKSDB_NAMESPACE::Status::OK();
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkEndPrepare(
+    const Slice& xid) {
+  auto markEndPrepare = [this](jbyteArray j_xid) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jMarkEndPrepareMethodId, j_xid);
+  };
+  auto status = WriteBatchHandlerJniCallback::k_op(xid, markEndPrepare);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkNoop(
+    bool empty_batch) {
+  m_env->CallVoidMethod(m_jcallback_obj, m_jMarkNoopMethodId,
+                        static_cast<jboolean>(empty_batch));
+
+  // check for Exception, in-particular RocksDBException
+  if (m_env->ExceptionCheck()) {
+    // exception thrown
+    jthrowable exception = m_env->ExceptionOccurred();
+    std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+        ROCKSDB_NAMESPACE::RocksDBExceptionJni::toCppStatus(m_env, exception);
+    if (status == nullptr) {
+      // unkown status or exception occurred extracting status
+      m_env->ExceptionDescribe();
+      return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) probably need a
+                                               // better error code here
+
+    } else {
+      m_env->ExceptionClear();  // clear the exception, as we have extracted the
+                                // status
+      return ROCKSDB_NAMESPACE::Status(*status);
+    }
+  }
+
+  return ROCKSDB_NAMESPACE::Status::OK();
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkRollback(
+    const Slice& xid) {
+  auto markRollback = [this](jbyteArray j_xid) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jMarkRollbackMethodId, j_xid);
+  };
+  auto status = WriteBatchHandlerJniCallback::k_op(xid, markRollback);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkCommit(
+    const Slice& xid) {
+  auto markCommit = [this](jbyteArray j_xid) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jMarkCommitMethodId, j_xid);
+  };
+  auto status = WriteBatchHandlerJniCallback::k_op(xid, markCommit);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkCommitWithTimestamp(
+    const Slice& xid, const Slice& ts) {
+  auto markCommitWithTimestamp = [this](jbyteArray j_xid, jbyteArray j_ts) {
+    m_env->CallVoidMethod(m_jcallback_obj, m_jMarkCommitWithTimestampMethodId,
+                          j_xid, j_ts);
+  };
+  auto status =
+      WriteBatchHandlerJniCallback::kv_op(xid, ts, markCommitWithTimestamp);
+  if (status == nullptr) {
+    return ROCKSDB_NAMESPACE::Status::OK();  // TODO(AR) what to do if there is
+                                             // an Exception but we don't know
+                                             // the ROCKSDB_NAMESPACE::Status?
+  } else {
+    return ROCKSDB_NAMESPACE::Status(*status);
+  }
+}
+
+bool WriteBatchHandlerJniCallback::Continue() {
+  jboolean jContinue =
+      m_env->CallBooleanMethod(m_jcallback_obj, m_jContinueMethodId);
+  if (m_env->ExceptionCheck()) {
+    // exception thrown
+    m_env->ExceptionDescribe();
+  }
+
+  return static_cast<bool>(jContinue == JNI_TRUE);
+}
+
+std::unique_ptr<ROCKSDB_NAMESPACE::Status> WriteBatchHandlerJniCallback::kv_op(
+    const Slice& key, const Slice& value,
+    std::function<void(jbyteArray, jbyteArray)> kvFn) {
+  const jbyteArray j_key = JniUtil::copyBytes(m_env, key);
+  if (j_key == nullptr) {
+    // exception thrown
+    if (m_env->ExceptionCheck()) {
+      m_env->ExceptionDescribe();
+    }
+    return nullptr;
+  }
+
+  const jbyteArray j_value = JniUtil::copyBytes(m_env, value);
+  if (j_value == nullptr) {
+    // exception thrown
+    if (m_env->ExceptionCheck()) {
+      m_env->ExceptionDescribe();
+    }
+    if (j_key != nullptr) {
+      m_env->DeleteLocalRef(j_key);
+    }
+    return nullptr;
+  }
+
+  kvFn(j_key, j_value);
+
+  // check for Exception, in-particular RocksDBException
+  if (m_env->ExceptionCheck()) {
+    if (j_value != nullptr) {
+      m_env->DeleteLocalRef(j_value);
+    }
+    if (j_key != nullptr) {
+      m_env->DeleteLocalRef(j_key);
+    }
+
+    // exception thrown
+    jthrowable exception = m_env->ExceptionOccurred();
+    std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+        ROCKSDB_NAMESPACE::RocksDBExceptionJni::toCppStatus(m_env, exception);
+    if (status == nullptr) {
+      // unkown status or exception occurred extracting status
+      m_env->ExceptionDescribe();
+      return nullptr;
+
+    } else {
+      m_env->ExceptionClear();  // clear the exception, as we have extracted the
+                                // status
+      return status;
+    }
+  }
+
+  if (j_value != nullptr) {
+    m_env->DeleteLocalRef(j_value);
+  }
+  if (j_key != nullptr) {
+    m_env->DeleteLocalRef(j_key);
+  }
+
+  // all OK
+  return std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+      new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::OK()));
+}
+
+std::unique_ptr<ROCKSDB_NAMESPACE::Status> WriteBatchHandlerJniCallback::k_op(
+    const Slice& key, std::function<void(jbyteArray)> kFn) {
+  const jbyteArray j_key = JniUtil::copyBytes(m_env, key);
+  if (j_key == nullptr) {
+    // exception thrown
+    if (m_env->ExceptionCheck()) {
+      m_env->ExceptionDescribe();
+    }
+    return nullptr;
+  }
+
+  kFn(j_key);
+
+  // check for Exception, in-particular RocksDBException
+  if (m_env->ExceptionCheck()) {
+    if (j_key != nullptr) {
+      m_env->DeleteLocalRef(j_key);
+    }
+
+    // exception thrown
+    jthrowable exception = m_env->ExceptionOccurred();
+    std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
+        ROCKSDB_NAMESPACE::RocksDBExceptionJni::toCppStatus(m_env, exception);
+    if (status == nullptr) {
+      // unkown status or exception occurred extracting status
+      m_env->ExceptionDescribe();
+      return nullptr;
+
+    } else {
+      m_env->ExceptionClear();  // clear the exception, as we have extracted the
+                                // status
+      return status;
+    }
+  }
+
+  if (j_key != nullptr) {
+    m_env->DeleteLocalRef(j_key);
+  }
+
+  // all OK
+  return std::unique_ptr<ROCKSDB_NAMESPACE::Status>(
+      new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::OK()));
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h b/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h
new file mode 100644
index 000000000..9629797ca
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// ROCKSDB_NAMESPACE::WriteBatch::Handler.
+
+#ifndef JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
+#define JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
+
+#include <jni.h>
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/write_batch.h"
+#include "rocksjni/jnicallback.h"
+
+namespace ROCKSDB_NAMESPACE {
+/**
+ * This class acts as a bridge between C++
+ * and Java. The methods in this class will be
+ * called back from the RocksDB storage engine (C++)
+ * which calls the appropriate Java method.
+ * This enables Write Batch Handlers to be implemented in Java.
+ */
+class WriteBatchHandlerJniCallback : public JniCallback,
+                                     public WriteBatch::Handler {
+ public:
+  WriteBatchHandlerJniCallback(JNIEnv* env, jobject jWriteBackHandler);
+  Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value);
+  void Put(const Slice& key, const Slice& value);
+  Status MergeCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value);
+  void Merge(const Slice& key, const Slice& value);
+  Status DeleteCF(uint32_t column_family_id, const Slice& key);
+  void Delete(const Slice& key);
+  Status SingleDeleteCF(uint32_t column_family_id, const Slice& key);
+  void SingleDelete(const Slice& key);
+  Status DeleteRangeCF(uint32_t column_family_id, const Slice& beginKey,
+                       const Slice& endKey);
+  void DeleteRange(const Slice& beginKey, const Slice& endKey);
+  void LogData(const Slice& blob);
+  Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
+                        const Slice& value);
+  Status MarkBeginPrepare(bool);
+  Status MarkEndPrepare(const Slice& xid);
+  Status MarkNoop(bool empty_batch);
+  Status MarkRollback(const Slice& xid);
+  Status MarkCommit(const Slice& xid);
+  Status MarkCommitWithTimestamp(const Slice& xid, const Slice& commit_ts);
+  bool Continue();
+
+ private:
+  JNIEnv* m_env;
+  jmethodID m_jPutCfMethodId;
+  jmethodID m_jPutMethodId;
+  jmethodID m_jMergeCfMethodId;
+  jmethodID m_jMergeMethodId;
+  jmethodID m_jDeleteCfMethodId;
+  jmethodID m_jDeleteMethodId;
+  jmethodID m_jSingleDeleteCfMethodId;
+  jmethodID m_jSingleDeleteMethodId;
+  jmethodID m_jDeleteRangeCfMethodId;
+  jmethodID m_jDeleteRangeMethodId;
+  jmethodID m_jLogDataMethodId;
+  jmethodID m_jPutBlobIndexCfMethodId;
+  jmethodID m_jMarkBeginPrepareMethodId;
+  jmethodID m_jMarkEndPrepareMethodId;
+  jmethodID m_jMarkNoopMethodId;
+  jmethodID m_jMarkRollbackMethodId;
+  jmethodID m_jMarkCommitMethodId;
+  jmethodID m_jMarkCommitWithTimestampMethodId;
+  jmethodID m_jContinueMethodId;
+  /**
+   * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an
+   * unexpected exception occurred
+   */
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> kv_op(
+      const Slice& key, const Slice& value,
+      std::function<void(jbyteArray, jbyteArray)> kvFn);
+  /**
+   * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an
+   * unexpected exception occurred
+   */
+  std::unique_ptr<ROCKSDB_NAMESPACE::Status> k_op(
+      const Slice& key, std::function<void(jbyteArray)> kFn);
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
diff --git a/src/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java b/src/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java
new file mode 100644
index 000000000..7e7a22e94
--- /dev/null
+++ b/src/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java
@@ -0,0 +1,184 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+import org.rocksdb.*;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Demonstrates using Transactions on an OptimisticTransactionDB with
+ * varying isolation guarantees
+ */
+public class OptimisticTransactionSample {
+  private static final String dbPath = "/tmp/rocksdb_optimistic_transaction_example";
+
+  public static final void main(final String args[]) throws RocksDBException {
+
+    try(final Options options = new Options()
+        .setCreateIfMissing(true);
+        final OptimisticTransactionDB txnDb =
+            OptimisticTransactionDB.open(options, dbPath)) {
+
+      try (final WriteOptions writeOptions = new WriteOptions();
+           final ReadOptions readOptions = new ReadOptions()) {
+
+        ////////////////////////////////////////////////////////
+        //
+        // Simple OptimisticTransaction Example ("Read Committed")
+        //
+        ////////////////////////////////////////////////////////
+        readCommitted(txnDb, writeOptions, readOptions);
+
+
+        ////////////////////////////////////////////////////////
+        //
+        // "Repeatable Read" (Snapshot Isolation) Example
+        //   -- Using a single Snapshot
+        //
+        ////////////////////////////////////////////////////////
+        repeatableRead(txnDb, writeOptions, readOptions);
+
+
+        ////////////////////////////////////////////////////////
+        //
+        // "Read Committed" (Monotonic Atomic Views) Example
+        //   --Using multiple Snapshots
+        //
+        ////////////////////////////////////////////////////////
+        readCommitted_monotonicAtomicViews(txnDb, writeOptions, readOptions);
+      }
+    }
+  }
+
+  /**
+   * Demonstrates "Read Committed" isolation
+   */
+  private static void readCommitted(final OptimisticTransactionDB txnDb,
+      final WriteOptions writeOptions, final ReadOptions readOptions)
+      throws RocksDBException {
+    final byte key1[] = "abc".getBytes(UTF_8);
+    final byte value1[] = "def".getBytes(UTF_8);
+
+    final byte key2[] = "xyz".getBytes(UTF_8);
+    final byte value2[] = "zzz".getBytes(UTF_8);
+
+    // Start a transaction
+    try(final Transaction txn = txnDb.beginTransaction(writeOptions)) {
+      // Read a key in this transaction
+      byte[] value = txn.get(readOptions, key1);
+      assert(value == null);
+
+      // Write a key in this transaction
+      txn.put(key1, value1);
+
+      // Read a key OUTSIDE this transaction. Does not affect txn.
+      value = txnDb.get(readOptions, key1);
+      assert(value == null);
+
+      // Write a key OUTSIDE of this transaction.
+      // Does not affect txn since this is an unrelated key.
+      // If we wrote key 'abc' here, the transaction would fail to commit.
+      txnDb.put(writeOptions, key2, value2);
+
+      // Commit transaction
+      txn.commit();
+    }
+  }
+
+  /**
+   * Demonstrates "Repeatable Read" (Snapshot Isolation) isolation
+   */
+  private static void repeatableRead(final OptimisticTransactionDB txnDb,
+      final WriteOptions writeOptions, final ReadOptions readOptions)
+      throws RocksDBException {
+
+    final byte key1[] = "ghi".getBytes(UTF_8);
+    final byte value1[] = "jkl".getBytes(UTF_8);
+
+    // Set a snapshot at start of transaction by setting setSnapshot(true)
+    try(final OptimisticTransactionOptions txnOptions =
+            new OptimisticTransactionOptions().setSetSnapshot(true);
+        final Transaction txn =
+            txnDb.beginTransaction(writeOptions, txnOptions)) {
+
+      final Snapshot snapshot = txn.getSnapshot();
+
+      // Write a key OUTSIDE of transaction
+      txnDb.put(writeOptions, key1, value1);
+
+      // Read a key using the snapshot.
+      readOptions.setSnapshot(snapshot);
+      final byte[] value = txn.getForUpdate(readOptions, key1, true);
+      assert (value == null);
+
+      try {
+        // Attempt to commit transaction
+        txn.commit();
+        throw new IllegalStateException();
+      } catch(final RocksDBException e) {
+        // Transaction could not commit since the write outside of the txn
+        // conflicted with the read!
+        assert(e.getStatus().getCode() == Status.Code.Busy);
+      }
+
+      txn.rollback();
+    } finally {
+      // Clear snapshot from read options since it is no longer valid
+      readOptions.setSnapshot(null);
+    }
+  }
+
+  /**
+   * Demonstrates "Read Committed" (Monotonic Atomic Views) isolation
+   *
+   * In this example, we set the snapshot multiple times.  This is probably
+   * only necessary if you have very strict isolation requirements to
+   * implement.
+   */
+  private static void readCommitted_monotonicAtomicViews(
+      final OptimisticTransactionDB txnDb, final WriteOptions writeOptions,
+      final ReadOptions readOptions) throws RocksDBException {
+
+    final byte keyX[] = "x".getBytes(UTF_8);
+    final byte valueX[] = "x".getBytes(UTF_8);
+
+    final byte keyY[] = "y".getBytes(UTF_8);
+    final byte valueY[] = "y".getBytes(UTF_8);
+
+    try (final OptimisticTransactionOptions txnOptions =
+             new OptimisticTransactionOptions().setSetSnapshot(true);
+         final Transaction txn =
+             txnDb.beginTransaction(writeOptions, txnOptions)) {
+
+      // Do some reads and writes to key "x"
+      Snapshot snapshot = txnDb.getSnapshot();
+      readOptions.setSnapshot(snapshot);
+      byte[] value = txn.get(readOptions, keyX);
+      txn.put(valueX, valueX);
+
+      // Do a write outside of the transaction to key "y"
+      txnDb.put(writeOptions, keyY, valueY);
+
+      // Set a new snapshot in the transaction
+      txn.setSnapshot();
+      snapshot = txnDb.getSnapshot();
+      readOptions.setSnapshot(snapshot);
+
+      // Do some reads and writes to key "y"
+      // Since the snapshot was advanced, the write done outside of the
+      // transaction does not conflict.
+      value = txn.getForUpdate(readOptions, keyY, true);
+      txn.put(keyY, valueY);
+
+      // Commit.  Since the snapshot was advanced, the write done outside of the
+      // transaction does not prevent this transaction from Committing.
+      txn.commit();
+
+    } finally {
+      // Clear snapshot from read options since it is no longer valid
+      readOptions.setSnapshot(null);
+    }
+  }
+}
diff --git a/src/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java b/src/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java
new file mode 100644
index 000000000..72f5731a1
--- /dev/null
+++ b/src/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java
@@ -0,0 +1,78 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+import org.rocksdb.*;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class RocksDBColumnFamilySample {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  public static void main(final String[] args) throws RocksDBException {
+    if (args.length < 1) {
+      System.out.println(
+          "usage: RocksDBColumnFamilySample db_path");
+      System.exit(-1);
+    }
+
+    final String db_path = args[0];
+
+    System.out.println("RocksDBColumnFamilySample");
+    try(final Options options = new Options().setCreateIfMissing(true);
+        final RocksDB db = RocksDB.open(options, db_path)) {
+
+      assert(db != null);
+
+      // create column family
+      try(final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor("new_cf".getBytes(),
+          new ColumnFamilyOptions()))) {
+        assert (columnFamilyHandle != null);
+      }
+    }
+
+    // open DB with two column families
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+        new ArrayList<>();
+    // have to open default column family
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+        RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions()));
+    // open the new one, too
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+        "new_cf".getBytes(), new ColumnFamilyOptions()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try(final DBOptions options = new DBOptions();
+        final RocksDB db = RocksDB.open(options, db_path,
+            columnFamilyDescriptors, columnFamilyHandles)) {
+      assert(db != null);
+
+      try {
+        // put and get from non-default column family
+        db.put(
+            columnFamilyHandles.get(1), new WriteOptions(), "key".getBytes(), "value".getBytes());
+
+        // atomic write
+        try (final WriteBatch wb = new WriteBatch()) {
+          wb.put(columnFamilyHandles.get(0), "key2".getBytes(),
+              "value2".getBytes());
+          wb.put(columnFamilyHandles.get(1), "key3".getBytes(),
+              "value3".getBytes());
+          wb.delete(columnFamilyHandles.get(1), "key".getBytes());
+          db.write(new WriteOptions(), wb);
+        }
+
+        // drop column family
+        db.dropColumnFamily(columnFamilyHandles.get(1));
+      } finally {
+        for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+          handle.close();
+        }
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/samples/src/main/java/RocksDBSample.java b/src/rocksdb/java/samples/src/main/java/RocksDBSample.java
new file mode 100644
index 000000000..ea650b141
--- /dev/null
+++ b/src/rocksdb/java/samples/src/main/java/RocksDBSample.java
@@ -0,0 +1,296 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+import java.lang.IllegalArgumentException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.ArrayList;
+
+import org.rocksdb.*;
+import org.rocksdb.util.SizeUnit;
+
+public class RocksDBSample {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  public static void main(final String[] args) {
+    if (args.length < 1) {
+      System.out.println("usage: RocksDBSample db_path");
+      System.exit(-1);
+    }
+
+    final String db_path = args[0];
+    final String db_path_not_found = db_path + "_not_found";
+
+    System.out.println("RocksDBSample");
+    try (final Options options = new Options();
+         final Filter bloomFilter = new BloomFilter(10);
+         final ReadOptions readOptions = new ReadOptions()
+             .setFillCache(false);
+         final Statistics stats = new Statistics();
+         final RateLimiter rateLimiter = new RateLimiter(10000000,10000, 10)) {
+
+      try (final RocksDB db = RocksDB.open(options, db_path_not_found)) {
+        assert (false);
+      } catch (final RocksDBException e) {
+        System.out.format("Caught the expected exception -- %s\n", e);
+      }
+
+      try {
+        options.setCreateIfMissing(true)
+            .setStatistics(stats)
+            .setWriteBufferSize(8 * SizeUnit.KB)
+            .setMaxWriteBufferNumber(3)
+            .setMaxBackgroundJobs(10)
+            .setCompressionType(CompressionType.ZLIB_COMPRESSION)
+            .setCompactionStyle(CompactionStyle.UNIVERSAL);
+      } catch (final IllegalArgumentException e) {
+        assert (false);
+      }
+
+      assert (options.createIfMissing() == true);
+      assert (options.writeBufferSize() == 8 * SizeUnit.KB);
+      assert (options.maxWriteBufferNumber() == 3);
+      assert (options.maxBackgroundJobs() == 10);
+      assert (options.compressionType() == CompressionType.ZLIB_COMPRESSION);
+      assert (options.compactionStyle() == CompactionStyle.UNIVERSAL);
+
+      assert (options.memTableFactoryName().equals("SkipListFactory"));
+      options.setMemTableConfig(
+          new HashSkipListMemTableConfig()
+              .setHeight(4)
+              .setBranchingFactor(4)
+              .setBucketCount(2000000));
+      assert (options.memTableFactoryName().equals("HashSkipListRepFactory"));
+
+      options.setMemTableConfig(
+          new HashLinkedListMemTableConfig()
+              .setBucketCount(100000));
+      assert (options.memTableFactoryName().equals("HashLinkedListRepFactory"));
+
+      options.setMemTableConfig(
+          new VectorMemTableConfig().setReservedSize(10000));
+      assert (options.memTableFactoryName().equals("VectorRepFactory"));
+
+      options.setMemTableConfig(new SkipListMemTableConfig());
+      assert (options.memTableFactoryName().equals("SkipListFactory"));
+
+      options.setTableFormatConfig(new PlainTableConfig());
+      // Plain-Table requires mmap read
+      options.setAllowMmapReads(true);
+      assert (options.tableFactoryName().equals("PlainTable"));
+
+      options.setRateLimiter(rateLimiter);
+
+      final BlockBasedTableConfig table_options = new BlockBasedTableConfig();
+      Cache cache = new LRUCache(64 * 1024, 6);
+      table_options.setBlockCache(cache)
+          .setFilterPolicy(bloomFilter)
+          .setBlockSizeDeviation(5)
+          .setBlockRestartInterval(10)
+          .setCacheIndexAndFilterBlocks(true)
+          .setBlockCacheCompressed(new LRUCache(64 * 1000, 10));
+
+      assert (table_options.blockSizeDeviation() == 5);
+      assert (table_options.blockRestartInterval() == 10);
+      assert (table_options.cacheIndexAndFilterBlocks() == true);
+
+      options.setTableFormatConfig(table_options);
+      assert (options.tableFactoryName().equals("BlockBasedTable"));
+
+      try (final RocksDB db = RocksDB.open(options, db_path)) {
+        db.put("hello".getBytes(), "world".getBytes());
+
+        final byte[] value = db.get("hello".getBytes());
+        assert ("world".equals(new String(value)));
+
+        final String str = db.getProperty("rocksdb.stats");
+        assert (str != null && !str.equals(""));
+      } catch (final RocksDBException e) {
+        System.out.format("[ERROR] caught the unexpected exception -- %s\n", e);
+        assert (false);
+      }
+
+      try (final RocksDB db = RocksDB.open(options, db_path)) {
+        db.put("hello".getBytes(), "world".getBytes());
+        byte[] value = db.get("hello".getBytes());
+        System.out.format("Get('hello') = %s\n",
+            new String(value));
+
+        for (int i = 1; i <= 9; ++i) {
+          for (int j = 1; j <= 9; ++j) {
+            db.put(String.format("%dx%d", i, j).getBytes(),
+                String.format("%d", i * j).getBytes());
+          }
+        }
+
+        for (int i = 1; i <= 9; ++i) {
+          for (int j = 1; j <= 9; ++j) {
+            System.out.format("%s ", new String(db.get(
+                String.format("%dx%d", i, j).getBytes())));
+          }
+          System.out.println("");
+        }
+
+        // write batch test
+        try (final WriteOptions writeOpt = new WriteOptions()) {
+          for (int i = 10; i <= 19; ++i) {
+            try (final WriteBatch batch = new WriteBatch()) {
+              for (int j = 10; j <= 19; ++j) {
+                batch.put(String.format("%dx%d", i, j).getBytes(),
+                    String.format("%d", i * j).getBytes());
+              }
+              db.write(writeOpt, batch);
+            }
+          }
+        }
+        for (int i = 10; i <= 19; ++i) {
+          for (int j = 10; j <= 19; ++j) {
+            assert (new String(
+                db.get(String.format("%dx%d", i, j).getBytes())).equals(
+                String.format("%d", i * j)));
+            System.out.format("%s ", new String(db.get(
+                String.format("%dx%d", i, j).getBytes())));
+          }
+          System.out.println("");
+        }
+
+        value = db.get("1x1".getBytes());
+        assert (value != null);
+        value = db.get("world".getBytes());
+        assert (value == null);
+        value = db.get(readOptions, "world".getBytes());
+        assert (value == null);
+
+        final byte[] testKey = "asdf".getBytes();
+        final byte[] testValue =
+            "asdfghjkl;'?><MNBVCXZQWERTYUIOP{+_)(*&^%$#@".getBytes();
+        db.put(testKey, testValue);
+        byte[] testResult = db.get(testKey);
+        assert (testResult != null);
+        assert (Arrays.equals(testValue, testResult));
+        assert (new String(testValue).equals(new String(testResult)));
+        testResult = db.get(readOptions, testKey);
+        assert (testResult != null);
+        assert (Arrays.equals(testValue, testResult));
+        assert (new String(testValue).equals(new String(testResult)));
+
+        final byte[] insufficientArray = new byte[10];
+        final byte[] enoughArray = new byte[50];
+        int len;
+        len = db.get(testKey, insufficientArray);
+        assert (len > insufficientArray.length);
+        len = db.get("asdfjkl;".getBytes(), enoughArray);
+        assert (len == RocksDB.NOT_FOUND);
+        len = db.get(testKey, enoughArray);
+        assert (len == testValue.length);
+
+        len = db.get(readOptions, testKey, insufficientArray);
+        assert (len > insufficientArray.length);
+        len = db.get(readOptions, "asdfjkl;".getBytes(), enoughArray);
+        assert (len == RocksDB.NOT_FOUND);
+        len = db.get(readOptions, testKey, enoughArray);
+        assert (len == testValue.length);
+
+        db.delete(testKey);
+        len = db.get(testKey, enoughArray);
+        assert (len == RocksDB.NOT_FOUND);
+
+        // repeat the test with WriteOptions
+        try (final WriteOptions writeOpts = new WriteOptions()) {
+          writeOpts.setSync(true);
+          writeOpts.setDisableWAL(false);
+          db.put(writeOpts, testKey, testValue);
+          len = db.get(testKey, enoughArray);
+          assert (len == testValue.length);
+          assert (new String(testValue).equals(
+              new String(enoughArray, 0, len)));
+        }
+
+        try {
+          for (final TickerType statsType : TickerType.values()) {
+            if (statsType != TickerType.TICKER_ENUM_MAX) {
+              stats.getTickerCount(statsType);
+            }
+          }
+          System.out.println("getTickerCount() passed.");
+        } catch (final Exception e) {
+          System.out.println("Failed in call to getTickerCount()");
+          assert (false); //Should never reach here.
+        }
+
+        try {
+          for (final HistogramType histogramType : HistogramType.values()) {
+            if (histogramType != HistogramType.HISTOGRAM_ENUM_MAX) {
+              HistogramData data = stats.getHistogramData(histogramType);
+            }
+          }
+          System.out.println("getHistogramData() passed.");
+        } catch (final Exception e) {
+          System.out.println("Failed in call to getHistogramData()");
+          assert (false); //Should never reach here.
+        }
+
+        try (final RocksIterator iterator = db.newIterator()) {
+
+          boolean seekToFirstPassed = false;
+          for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) {
+            iterator.status();
+            assert (iterator.key() != null);
+            assert (iterator.value() != null);
+            seekToFirstPassed = true;
+          }
+          if (seekToFirstPassed) {
+            System.out.println("iterator seekToFirst tests passed.");
+          }
+
+          boolean seekToLastPassed = false;
+          for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) {
+            iterator.status();
+            assert (iterator.key() != null);
+            assert (iterator.value() != null);
+            seekToLastPassed = true;
+          }
+
+          if (seekToLastPassed) {
+            System.out.println("iterator seekToLastPassed tests passed.");
+          }
+
+          iterator.seekToFirst();
+          iterator.seek(iterator.key());
+          assert (iterator.key() != null);
+          assert (iterator.value() != null);
+
+          System.out.println("iterator seek test passed.");
+
+        }
+        System.out.println("iterator tests passed.");
+
+        final List<byte[]> keys = new ArrayList<>();
+        try (final RocksIterator iterator = db.newIterator()) {
+          for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) {
+            keys.add(iterator.key());
+          }
+        }
+
+        List<byte[]> values = db.multiGetAsList(keys);
+        assert (values.size() == keys.size());
+        for (final byte[] value1 : values) {
+          assert (value1 != null);
+        }
+
+        values = db.multiGetAsList(new ReadOptions(), keys);
+        assert (values.size() == keys.size());
+        for (final byte[] value1 : values) {
+          assert (value1 != null);
+        }
+      } catch (final RocksDBException e) {
+        System.err.println(e);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/samples/src/main/java/TransactionSample.java b/src/rocksdb/java/samples/src/main/java/TransactionSample.java
new file mode 100644
index 000000000..b88a68f12
--- /dev/null
+++ b/src/rocksdb/java/samples/src/main/java/TransactionSample.java
@@ -0,0 +1,183 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+import org.rocksdb.*;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Demonstrates using Transactions on a TransactionDB with
+ * varying isolation guarantees
+ */
+public class TransactionSample {
+  private static final String dbPath = "/tmp/rocksdb_transaction_example";
+
+  public static final void main(final String args[]) throws RocksDBException {
+
+    try(final Options options = new Options()
+        .setCreateIfMissing(true);
+        final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+        final TransactionDB txnDb =
+            TransactionDB.open(options, txnDbOptions, dbPath)) {
+
+      try (final WriteOptions writeOptions = new WriteOptions();
+           final ReadOptions readOptions = new ReadOptions()) {
+
+        ////////////////////////////////////////////////////////
+        //
+        // Simple Transaction Example ("Read Committed")
+        //
+        ////////////////////////////////////////////////////////
+        readCommitted(txnDb, writeOptions, readOptions);
+
+
+        ////////////////////////////////////////////////////////
+        //
+        // "Repeatable Read" (Snapshot Isolation) Example
+        //   -- Using a single Snapshot
+        //
+        ////////////////////////////////////////////////////////
+        repeatableRead(txnDb, writeOptions, readOptions);
+
+
+        ////////////////////////////////////////////////////////
+        //
+        // "Read Committed" (Monotonic Atomic Views) Example
+        //   --Using multiple Snapshots
+        //
+        ////////////////////////////////////////////////////////
+        readCommitted_monotonicAtomicViews(txnDb, writeOptions, readOptions);
+      }
+    }
+  }
+
+  /**
+   * Demonstrates "Read Committed" isolation
+   */
+  private static void readCommitted(final TransactionDB txnDb,
+      final WriteOptions writeOptions, final ReadOptions readOptions)
+      throws RocksDBException {
+    final byte key1[] = "abc".getBytes(UTF_8);
+    final byte value1[] = "def".getBytes(UTF_8);
+
+    final byte key2[] = "xyz".getBytes(UTF_8);
+    final byte value2[] = "zzz".getBytes(UTF_8);
+
+    // Start a transaction
+    try(final Transaction txn = txnDb.beginTransaction(writeOptions)) {
+      // Read a key in this transaction
+      byte[] value = txn.get(readOptions, key1);
+      assert(value == null);
+
+      // Write a key in this transaction
+      txn.put(key1, value1);
+
+      // Read a key OUTSIDE this transaction. Does not affect txn.
+      value = txnDb.get(readOptions, key1);
+      assert(value == null);
+
+      // Write a key OUTSIDE of this transaction.
+      // Does not affect txn since this is an unrelated key.
+      // If we wrote key 'abc' here, the transaction would fail to commit.
+      txnDb.put(writeOptions, key2, value2);
+
+      // Commit transaction
+      txn.commit();
+    }
+  }
+
+  /**
+   * Demonstrates "Repeatable Read" (Snapshot Isolation) isolation
+   */
+  private static void repeatableRead(final TransactionDB txnDb,
+      final WriteOptions writeOptions, final ReadOptions readOptions)
+      throws RocksDBException {
+
+    final byte key1[] = "ghi".getBytes(UTF_8);
+    final byte value1[] = "jkl".getBytes(UTF_8);
+
+    // Set a snapshot at start of transaction by setting setSnapshot(true)
+    try(final TransactionOptions txnOptions = new TransactionOptions()
+          .setSetSnapshot(true);
+        final Transaction txn =
+            txnDb.beginTransaction(writeOptions, txnOptions)) {
+
+      final Snapshot snapshot = txn.getSnapshot();
+
+      // Write a key OUTSIDE of transaction
+      txnDb.put(writeOptions, key1, value1);
+
+      // Attempt to read a key using the snapshot.  This will fail since
+      // the previous write outside this txn conflicts with this read.
+      readOptions.setSnapshot(snapshot);
+
+      try {
+        final byte[] value = txn.getForUpdate(readOptions, key1, true);
+        throw new IllegalStateException();
+      } catch(final RocksDBException e) {
+        assert(e.getStatus().getCode() == Status.Code.Busy);
+      }
+
+      txn.rollback();
+    } finally {
+      // Clear snapshot from read options since it is no longer valid
+      readOptions.setSnapshot(null);
+    }
+  }
+
+  /**
+   * Demonstrates "Read Committed" (Monotonic Atomic Views) isolation
+   *
+   * In this example, we set the snapshot multiple times.  This is probably
+   * only necessary if you have very strict isolation requirements to
+   * implement.
+   */
+  private static void readCommitted_monotonicAtomicViews(
+      final TransactionDB txnDb, final WriteOptions writeOptions,
+      final ReadOptions readOptions) throws RocksDBException {
+
+    final byte keyX[] = "x".getBytes(UTF_8);
+    final byte valueX[] = "x".getBytes(UTF_8);
+
+    final byte keyY[] = "y".getBytes(UTF_8);
+    final byte valueY[] = "y".getBytes(UTF_8);
+
+    try (final TransactionOptions txnOptions = new TransactionOptions()
+        .setSetSnapshot(true);
+         final Transaction txn =
+             txnDb.beginTransaction(writeOptions, txnOptions)) {
+
+      // Do some reads and writes to key "x"
+      Snapshot snapshot = txnDb.getSnapshot();
+      readOptions.setSnapshot(snapshot);
+      byte[] value = txn.get(readOptions, keyX);
+      txn.put(valueX, valueX);
+
+      // Do a write outside of the transaction to key "y"
+      txnDb.put(writeOptions, keyY, valueY);
+
+      // Set a new snapshot in the transaction
+      txn.setSnapshot();
+      txn.setSavePoint();
+      snapshot = txnDb.getSnapshot();
+      readOptions.setSnapshot(snapshot);
+
+      // Do some reads and writes to key "y"
+      // Since the snapshot was advanced, the write done outside of the
+      // transaction does not conflict.
+      value = txn.getForUpdate(readOptions, keyY, true);
+      txn.put(keyY, valueY);
+
+      // Decide we want to revert the last write from this transaction.
+      txn.rollbackToSavePoint();
+
+      // Commit.
+      txn.commit();
+    } finally {
+      // Clear snapshot from read options since it is no longer valid
+      readOptions.setSnapshot(null);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java
new file mode 100644
index 000000000..2f0d4f3ca
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java
@@ -0,0 +1,59 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * A CompactionFilter allows an application to modify/delete a key-value at
+ * the time of compaction.
+ *
+ * At present we just permit an overriding Java class to wrap a C++
+ * implementation
+ */
+public abstract class AbstractCompactionFilter<T extends AbstractSlice<?>>
+    extends RocksObject {
+
+  public static class Context {
+    private final boolean fullCompaction;
+    private final boolean manualCompaction;
+
+    public Context(final boolean fullCompaction, final boolean manualCompaction) {
+      this.fullCompaction = fullCompaction;
+      this.manualCompaction = manualCompaction;
+    }
+
+    /**
+     * Does this compaction run include all data files
+     *
+     * @return true if this is a full compaction run
+     */
+    public boolean isFullCompaction() {
+      return fullCompaction;
+    }
+
+    /**
+     * Is this compaction requested by the client,
+     * or is it occurring as an automatic compaction process
+     *
+     * @return true if the compaction was initiated by the client
+     */
+    public boolean isManualCompaction() {
+      return manualCompaction;
+    }
+  }
+
+  protected AbstractCompactionFilter(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Deletes underlying C++ compaction pointer.
+   *
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the compaction filter are closed.
+   * Otherwise an undefined behavior will occur.
+   */
+  @Override
+  protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java
new file mode 100644
index 000000000..380b4461d
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java
@@ -0,0 +1,77 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Each compaction will create a new {@link AbstractCompactionFilter}
+ * allowing the application to know about different compactions
+ *
+ * @param <T> The concrete type of the compaction filter
+ */
+public abstract class AbstractCompactionFilterFactory<T extends AbstractCompactionFilter<?>>
+    extends RocksCallbackObject {
+
+  public AbstractCompactionFilterFactory() {
+    super(null);
+  }
+
+  @Override
+  protected long initializeNative(final long... nativeParameterHandles) {
+    return createNewCompactionFilterFactory0();
+  }
+
+  /**
+   * Called from JNI, see compaction_filter_factory_jnicallback.cc
+   *
+   * @param fullCompaction {@link AbstractCompactionFilter.Context#fullCompaction}
+   * @param manualCompaction {@link AbstractCompactionFilter.Context#manualCompaction}
+   *
+   * @return native handle of the CompactionFilter
+   */
+  private long createCompactionFilter(final boolean fullCompaction,
+      final boolean manualCompaction) {
+    final T filter = createCompactionFilter(
+        new AbstractCompactionFilter.Context(fullCompaction, manualCompaction));
+
+    // CompactionFilterFactory::CreateCompactionFilter returns a std::unique_ptr
+    // which therefore has ownership of the underlying native object
+    filter.disOwnNativeHandle();
+
+    return filter.nativeHandle_;
+  }
+
+  /**
+   * Create a new compaction filter
+   *
+   * @param context The context describing the need for a new compaction filter
+   *
+   * @return A new instance of {@link AbstractCompactionFilter}
+   */
+  public abstract T createCompactionFilter(
+      final AbstractCompactionFilter.Context context);
+
+  /**
+   * A name which identifies this compaction filter
+   *
+   * The name will be printed to the LOG file on start up for diagnosis
+   *
+   * @return name which identifies this compaction filter.
+   */
+  public abstract String name();
+
+  /**
+   * We override {@link RocksCallbackObject#disposeInternal()}
+   * as disposing of a rocksdb::AbstractCompactionFilterFactory requires
+   * a slightly different approach as it is a std::shared_ptr
+   */
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  private native long createNewCompactionFilterFactory0();
+  private native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java
new file mode 100644
index 000000000..c08e9127c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java
@@ -0,0 +1,124 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Comparators are used by RocksDB to determine
+ * the ordering of keys.
+ *
+ * Implementations of Comparators in Java should extend this class.
+ */
+public abstract class AbstractComparator
+    extends RocksCallbackObject {
+
+  AbstractComparator() {
+    super();
+  }
+
+  protected AbstractComparator(final ComparatorOptions copt) {
+    super(copt.nativeHandle_);
+  }
+
+  @Override
+  protected long initializeNative(final long... nativeParameterHandles) {
+      return createNewComparator(nativeParameterHandles[0]);
+  }
+
+  /**
+   * Get the type of this comparator.
+   *
+   * Used for determining the correct C++ cast in native code.
+   *
+   * @return The type of the comparator.
+   */
+  ComparatorType getComparatorType() {
+    return ComparatorType.JAVA_COMPARATOR;
+  }
+
+  /**
+   * The name of the comparator.  Used to check for comparator
+   * mismatches (i.e., a DB created with one comparator is
+   * accessed using a different comparator).
+   *
+   * A new name should be used whenever
+   * the comparator implementation changes in a way that will cause
+   * the relative ordering of any two keys to change.
+   *
+   * Names starting with "rocksdb." are reserved and should not be used.
+   *
+   * @return The name of this comparator implementation
+   */
+  public abstract String name();
+
+  /**
+   * Three-way key comparison. Implementations should provide a
+   * <a href="https://en.wikipedia.org/wiki/Total_order">total order</a>
+   * on keys that might be passed to it.
+   *
+   * The implementation may modify the {@code ByteBuffer}s passed in, though
+   * it would be unconventional to modify the "limit" or any of the
+   * underlying bytes. As a callback, RocksJava will ensure that {@code a}
+   * is a different instance from {@code b}.
+   *
+   * @param a buffer containing the first key in its "remaining" elements
+   * @param b buffer containing the second key in its "remaining" elements
+   *
+   * @return Should return either:
+   *    1) &lt; 0 if "a" &lt; "b"
+   *    2) == 0 if "a" == "b"
+   *    3) &gt; 0 if "a" &gt; "b"
+   */
+  public abstract int compare(final ByteBuffer a, final ByteBuffer b);
+
+  /**
+   * <p>Used to reduce the space requirements
+   * for internal data structures like index blocks.</p>
+   *
+   * <p>If start &lt; limit, you may modify start which is a
+   * shorter string in [start, limit).</p>
+   *
+   * If you modify start, it is expected that you set the byte buffer so that
+   * a subsequent read of start.remaining() bytes from start.position()
+   * to start.limit() will obtain the new start value.
+   *
+   * <p>Simple comparator implementations may return with start unchanged.
+   * i.e., an implementation of this method that does nothing is correct.</p>
+   *
+   * @param start the start
+   * @param limit the limit
+   */
+  public void findShortestSeparator(final ByteBuffer start,
+      final ByteBuffer limit) {
+    // no-op
+  }
+
+  /**
+   * <p>Used to reduce the space requirements
+   * for internal data structures like index blocks.</p>
+   *
+   * <p>You may change key to a shorter key (key1) where
+   * key1 &ge; key.</p>
+   *
+   * <p>Simple comparator implementations may return the key unchanged.
+   * i.e., an implementation of
+   * this method that does nothing is correct.</p>
+   *
+   * @param key the key
+   */
+  public void findShortSuccessor(final ByteBuffer key) {
+    // no-op
+  }
+
+  public final boolean usingDirectBuffers() {
+    return usingDirectBuffers(nativeHandle_);
+  }
+
+  private native boolean usingDirectBuffers(final long nativeHandle);
+
+  private native long createNewComparator(final long comparatorOptionsHandle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractComparatorJniBridge.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractComparatorJniBridge.java
new file mode 100644
index 000000000..b732d2495
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractComparatorJniBridge.java
@@ -0,0 +1,125 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * This class is intentionally private,
+ * it holds methods which are called
+ * from C++ to interact with a Comparator
+ * written in Java.
+ *
+ * Placing these bridge methods in this
+ * class keeps the API of the
+ * {@link org.rocksdb.AbstractComparator} clean.
+ */
+class AbstractComparatorJniBridge {
+
+    /**
+     * Only called from JNI.
+     *
+     * Simply a bridge to calling
+     * {@link AbstractComparator#compare(ByteBuffer, ByteBuffer)},
+     * which ensures that the byte buffer lengths are correct
+     * before and after the call.
+     *
+     * @param comparator the comparator object on which to
+     *     call {@link AbstractComparator#compare(ByteBuffer, ByteBuffer)}
+     * @param a buffer access to first key
+     * @param aLen the length of the a key,
+     *     may be smaller than the buffer {@code a}
+     * @param b buffer access to second key
+     * @param bLen the length of the b key,
+     *     may be smaller than the buffer {@code b}
+     *
+     * @return the result of the comparison
+     */
+    private static int compareInternal(
+            final AbstractComparator comparator,
+            final ByteBuffer a, final int aLen,
+            final ByteBuffer b, final int bLen) {
+        if (aLen != -1) {
+            a.mark();
+            a.limit(aLen);
+        }
+        if (bLen != -1) {
+            b.mark();
+            b.limit(bLen);
+        }
+
+        final int c = comparator.compare(a, b);
+
+        if (aLen != -1) {
+            a.reset();
+        }
+        if (bLen != -1) {
+            b.reset();
+        }
+
+        return c;
+    }
+
+    /**
+     * Only called from JNI.
+     *
+     * Simply a bridge to calling
+     * {@link AbstractComparator#findShortestSeparator(ByteBuffer, ByteBuffer)},
+     * which ensures that the byte buffer lengths are correct
+     * before the call.
+     *
+     * @param comparator the comparator object on which to
+     *     call {@link AbstractComparator#findShortestSeparator(ByteBuffer, ByteBuffer)}
+     * @param start buffer access to the start key
+     * @param startLen the length of the start key,
+     *     may be smaller than the buffer {@code start}
+     * @param limit buffer access to the limit key
+     * @param limitLen the length of the limit key,
+     *     may be smaller than the buffer {@code limit}
+     *
+     * @return either {@code startLen} if the start key is unchanged, otherwise
+     *     the new length of the start key
+     */
+    private static int findShortestSeparatorInternal(
+            final AbstractComparator comparator,
+            final ByteBuffer start, final int startLen,
+            final ByteBuffer limit, final int limitLen) {
+        if (startLen != -1) {
+            start.limit(startLen);
+        }
+        if (limitLen != -1) {
+            limit.limit(limitLen);
+        }
+        comparator.findShortestSeparator(start, limit);
+        return start.remaining();
+    }
+
+    /**
+     * Only called from JNI.
+     *
+     * Simply a bridge to calling
+     * {@link AbstractComparator#findShortestSeparator(ByteBuffer, ByteBuffer)},
+     * which ensures that the byte buffer length is correct
+     * before the call.
+     *
+     * @param comparator the comparator object on which to
+     *     call {@link AbstractComparator#findShortSuccessor(ByteBuffer)}
+     * @param key buffer access to the key
+     * @param keyLen the length of the key,
+     *     may be smaller than the buffer {@code key}
+     *
+     * @return either keyLen if the key is unchanged, otherwise the new length of the key
+     */
+    private static int findShortSuccessorInternal(
+            final AbstractComparator comparator,
+            final ByteBuffer key, final int keyLen) {
+        if (keyLen != -1) {
+            key.limit(keyLen);
+        }
+        comparator.findShortSuccessor(key);
+        return key.remaining();
+    }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java
new file mode 100644
index 000000000..6698acf88
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java
@@ -0,0 +1,334 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.rocksdb.AbstractEventListener.EnabledEventCallback.*;
+
+/**
+ * Base class for Event Listeners.
+ */
+public abstract class AbstractEventListener extends RocksCallbackObject implements EventListener {
+  public enum EnabledEventCallback {
+    ON_FLUSH_COMPLETED((byte) 0x0),
+    ON_FLUSH_BEGIN((byte) 0x1),
+    ON_TABLE_FILE_DELETED((byte) 0x2),
+    ON_COMPACTION_BEGIN((byte) 0x3),
+    ON_COMPACTION_COMPLETED((byte) 0x4),
+    ON_TABLE_FILE_CREATED((byte) 0x5),
+    ON_TABLE_FILE_CREATION_STARTED((byte) 0x6),
+    ON_MEMTABLE_SEALED((byte) 0x7),
+    ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED((byte) 0x8),
+    ON_EXTERNAL_FILE_INGESTED((byte) 0x9),
+    ON_BACKGROUND_ERROR((byte) 0xA),
+    ON_STALL_CONDITIONS_CHANGED((byte) 0xB),
+    ON_FILE_READ_FINISH((byte) 0xC),
+    ON_FILE_WRITE_FINISH((byte) 0xD),
+    ON_FILE_FLUSH_FINISH((byte) 0xE),
+    ON_FILE_SYNC_FINISH((byte) 0xF),
+    ON_FILE_RANGE_SYNC_FINISH((byte) 0x10),
+    ON_FILE_TRUNCATE_FINISH((byte) 0x11),
+    ON_FILE_CLOSE_FINISH((byte) 0x12),
+    SHOULD_BE_NOTIFIED_ON_FILE_IO((byte) 0x13),
+    ON_ERROR_RECOVERY_BEGIN((byte) 0x14),
+    ON_ERROR_RECOVERY_COMPLETED((byte) 0x15);
+
+    private final byte value;
+
+    EnabledEventCallback(final byte value) {
+      this.value = value;
+    }
+
+    /**
+     * Get the internal representation value.
+     *
+     * @return the internal representation value
+     */
+    byte getValue() {
+      return value;
+    }
+
+    /**
+     * Get the EnabledEventCallbacks from the internal representation value.
+     *
+     * @return the enabled event callback.
+     *
+     * @throws IllegalArgumentException if the value is unknown.
+     */
+    static EnabledEventCallback fromValue(final byte value) {
+      for (final EnabledEventCallback enabledEventCallback : EnabledEventCallback.values()) {
+        if (enabledEventCallback.value == value) {
+          return enabledEventCallback;
+        }
+      }
+
+      throw new IllegalArgumentException(
+          "Illegal value provided for EnabledEventCallback: " + value);
+    }
+  }
+
+  /**
+   * Creates an Event Listener that will
+   * received all callbacks from C++.
+   *
+   * If you don't need all callbacks, it is much more efficient to
+   * just register for the ones you need by calling
+   * {@link #AbstractEventListener(EnabledEventCallback...)} instead.
+   */
+  protected AbstractEventListener() {
+    this(ON_FLUSH_COMPLETED, ON_FLUSH_BEGIN, ON_TABLE_FILE_DELETED, ON_COMPACTION_BEGIN,
+        ON_COMPACTION_COMPLETED, ON_TABLE_FILE_CREATED, ON_TABLE_FILE_CREATION_STARTED,
+        ON_MEMTABLE_SEALED, ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, ON_EXTERNAL_FILE_INGESTED,
+        ON_BACKGROUND_ERROR, ON_STALL_CONDITIONS_CHANGED, ON_FILE_READ_FINISH, ON_FILE_WRITE_FINISH,
+        ON_FILE_FLUSH_FINISH, ON_FILE_SYNC_FINISH, ON_FILE_RANGE_SYNC_FINISH,
+        ON_FILE_TRUNCATE_FINISH, ON_FILE_CLOSE_FINISH, SHOULD_BE_NOTIFIED_ON_FILE_IO,
+        ON_ERROR_RECOVERY_BEGIN, ON_ERROR_RECOVERY_COMPLETED);
+  }
+
+  /**
+   * Creates an Event Listener that will
+   * receive only certain callbacks from C++.
+   *
+   * @param enabledEventCallbacks callbacks to enable in Java.
+   */
+  protected AbstractEventListener(final EnabledEventCallback... enabledEventCallbacks) {
+    super(packToLong(enabledEventCallbacks));
+  }
+
+  /**
+   * Pack EnabledEventCallbacks to a long.
+   *
+   * @param enabledEventCallbacks the flags
+   *
+   * @return a long
+   */
+  private static long packToLong(final EnabledEventCallback... enabledEventCallbacks) {
+    long l = 0;
+    for (int i = 0; i < enabledEventCallbacks.length; i++) {
+      l |= 1 << enabledEventCallbacks[i].getValue();
+    }
+    return l;
+  }
+
+  @Override
+  public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) {
+    // no-op
+  }
+
+  /**
+   * Called from JNI, proxy for
+   *     {@link #onFlushCompleted(RocksDB, FlushJobInfo)}.
+   *
+   * @param dbHandle native handle of the database
+   * @param flushJobInfo the flush job info
+   */
+  private void onFlushCompletedProxy(final long dbHandle, final FlushJobInfo flushJobInfo) {
+    final RocksDB db = new RocksDB(dbHandle);
+    db.disOwnNativeHandle(); // we don't own this!
+    onFlushCompleted(db, flushJobInfo);
+  }
+
+  @Override
+  public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) {
+    // no-op
+  }
+
+  /**
+   * Called from JNI, proxy for
+   *     {@link #onFlushBegin(RocksDB, FlushJobInfo)}.
+   *
+   * @param dbHandle native handle of the database
+   * @param flushJobInfo the flush job info
+   */
+  private void onFlushBeginProxy(final long dbHandle, final FlushJobInfo flushJobInfo) {
+    final RocksDB db = new RocksDB(dbHandle);
+    db.disOwnNativeHandle(); // we don't own this!
+    onFlushBegin(db, flushJobInfo);
+  }
+
+  @Override
+  public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) {
+    // no-op
+  }
+
+  /**
+   * Called from JNI, proxy for
+   *     {@link #onCompactionBegin(RocksDB, CompactionJobInfo)}.
+   *
+   * @param dbHandle native handle of the database
+   * @param compactionJobInfo the flush job info
+   */
+  private void onCompactionBeginProxy(
+      final long dbHandle, final CompactionJobInfo compactionJobInfo) {
+    final RocksDB db = new RocksDB(dbHandle);
+    db.disOwnNativeHandle(); // we don't own this!
+    onCompactionBegin(db, compactionJobInfo);
+  }
+
+  @Override
+  public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) {
+    // no-op
+  }
+
+  /**
+   * Called from JNI, proxy for
+   *     {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}.
+   *
+   * @param dbHandle native handle of the database
+   * @param compactionJobInfo the flush job info
+   */
+  private void onCompactionCompletedProxy(
+      final long dbHandle, final CompactionJobInfo compactionJobInfo) {
+    final RocksDB db = new RocksDB(dbHandle);
+    db.disOwnNativeHandle(); // we don't own this!
+    onCompactionCompleted(db, compactionJobInfo);
+  }
+
+  @Override
+  public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onTableFileCreationStarted(
+      final TableFileCreationBriefInfo tableFileCreationBriefInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onMemTableSealed(final MemTableInfo memTableInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) {
+    // no-op
+  }
+
+  @Override
+  public void onExternalFileIngested(
+      final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) {
+    // no-op
+  }
+
+  /**
+   * Called from JNI, proxy for
+   *     {@link #onExternalFileIngested(RocksDB, ExternalFileIngestionInfo)}.
+   *
+   * @param dbHandle native handle of the database
+   * @param externalFileIngestionInfo the flush job info
+   */
+  private void onExternalFileIngestedProxy(
+      final long dbHandle, final ExternalFileIngestionInfo externalFileIngestionInfo) {
+    final RocksDB db = new RocksDB(dbHandle);
+    db.disOwnNativeHandle(); // we don't own this!
+    onExternalFileIngested(db, externalFileIngestionInfo);
+  }
+
+  @Override
+  public void onBackgroundError(
+      final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) {
+    // no-op
+  }
+
+  /**
+   * Called from JNI, proxy for
+   *     {@link #onBackgroundError(BackgroundErrorReason, Status)}.
+   *
+   * @param reasonByte byte value representing error reason
+   * @param backgroundError status with error code
+   */
+  private void onBackgroundErrorProxy(final byte reasonByte, final Status backgroundError) {
+    onBackgroundError(BackgroundErrorReason.fromValue(reasonByte), backgroundError);
+  }
+
+  @Override
+  public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onFileReadFinish(final FileOperationInfo fileOperationInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) {
+    // no-op
+  }
+
+  @Override
+  public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) {
+    // no-op
+  }
+
+  @Override
+  public boolean shouldBeNotifiedOnFileIO() {
+    return false;
+  }
+
+  @Override
+  public boolean onErrorRecoveryBegin(
+      final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) {
+    return true;
+  }
+
+  /**
+   * Called from JNI, proxy for
+   *     {@link #onErrorRecoveryBegin(BackgroundErrorReason, Status)}.
+   *
+   * @param reasonByte byte value representing error reason
+   * @param backgroundError status with error code
+   */
+  private boolean onErrorRecoveryBeginProxy(final byte reasonByte, final Status backgroundError) {
+    return onErrorRecoveryBegin(BackgroundErrorReason.fromValue(reasonByte), backgroundError);
+  }
+
+  @Override
+  public void onErrorRecoveryCompleted(final Status oldBackgroundError) {
+    // no-op
+  }
+
+  @Override
+  protected long initializeNative(final long... nativeParameterHandles) {
+    return createNewEventListener(nativeParameterHandles[0]);
+  }
+
+  /**
+   * Deletes underlying C++ native callback object pointer
+   */
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  private native long createNewEventListener(final long enabledEventCallbackValues);
+  private native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java
new file mode 100644
index 000000000..173d63e90
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java
@@ -0,0 +1,65 @@
+// Copyright (c) 2016, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.concurrent.atomic.AtomicBoolean;
+
+/**
+ * Offers functionality for implementations of
+ * {@link AbstractNativeReference} which have an immutable reference to the
+ * underlying native C++ object
+ */
+//@ThreadSafe
+public abstract class AbstractImmutableNativeReference
+    extends AbstractNativeReference {
+
+  /**
+   * A flag indicating whether the current {@code AbstractNativeReference} is
+   * responsible to free the underlying C++ object
+   */
+  protected final AtomicBoolean owningHandle_;
+
+  protected AbstractImmutableNativeReference(final boolean owningHandle) {
+    this.owningHandle_ = new AtomicBoolean(owningHandle);
+  }
+
+  @Override
+  public boolean isOwningHandle() {
+    return owningHandle_.get();
+  }
+
+  /**
+   * Releases this {@code AbstractNativeReference} from  the responsibility of
+   * freeing the underlying native C++ object
+   * <p>
+   * This will prevent the object from attempting to delete the underlying
+   * native object in {@code close()}. This must be used when another object
+   * takes over ownership of the native object or both will attempt to delete
+   * the underlying object when closed.
+   * <p>
+   * When {@code disOwnNativeHandle()} is called, {@code close()} will
+   * subsequently take no action. As a result, incorrect use of this function
+   * may cause a memory leak.
+   * </p>
+   */
+  protected final void disOwnNativeHandle() {
+    owningHandle_.set(false);
+  }
+
+  @Override
+  public void close() {
+    if (owningHandle_.compareAndSet(true, false)) {
+      disposeInternal();
+    }
+  }
+
+  /**
+   * The helper function of {@link AbstractImmutableNativeReference#close()}
+   * which all subclasses of {@code AbstractImmutableNativeReference} must
+   * implement to release their underlying native C++ objects.
+   */
+  protected abstract void disposeInternal();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java
new file mode 100644
index 000000000..7189272b8
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java
@@ -0,0 +1,370 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+import java.util.*;
+
+public abstract class AbstractMutableOptions {
+
+  protected static final String KEY_VALUE_PAIR_SEPARATOR = ";";
+  protected static final char KEY_VALUE_SEPARATOR = '=';
+  static final String INT_ARRAY_INT_SEPARATOR = ":";
+
+  protected final String[] keys;
+  private final String[] values;
+
+  /**
+   * User must use builder pattern, or parser.
+   *
+   * @param keys the keys
+   * @param values the values
+   */
+  protected AbstractMutableOptions(final String[] keys, final String[] values) {
+    this.keys = keys;
+    this.values = values;
+  }
+
+  String[] getKeys() {
+    return keys;
+  }
+
+  String[] getValues() {
+    return values;
+  }
+
+  /**
+   * Returns a string representation of MutableOptions which
+   * is suitable for consumption by {@code #parse(String)}.
+   *
+   * @return String representation of MutableOptions
+   */
+  @Override
+  public String toString() {
+    final StringBuilder buffer = new StringBuilder();
+    for(int i = 0; i < keys.length; i++) {
+      buffer
+          .append(keys[i])
+          .append(KEY_VALUE_SEPARATOR)
+          .append(values[i]);
+
+      if(i + 1 < keys.length) {
+        buffer.append(KEY_VALUE_PAIR_SEPARATOR);
+      }
+    }
+    return buffer.toString();
+  }
+
+  public static abstract class AbstractMutableOptionsBuilder<
+      T extends AbstractMutableOptions,
+      U extends AbstractMutableOptionsBuilder<T, U, K>,
+      K extends MutableOptionKey> {
+
+    private final Map<K, MutableOptionValue<?>> options = new LinkedHashMap<>();
+    private final List<OptionString.Entry> unknown = new ArrayList<>();
+
+    protected abstract U self();
+
+    /**
+     * Get all of the possible keys
+     *
+     * @return A map of all keys, indexed by name.
+     */
+    protected abstract Map<String, K> allKeys();
+
+    /**
+     * Construct a sub-class instance of {@link AbstractMutableOptions}.
+     *
+     * @param keys the keys
+     * @param values the values
+     *
+     * @return an instance of the options.
+     */
+    protected abstract T build(final String[] keys, final String[] values);
+
+    public T build() {
+      final String[] keys = new String[options.size()];
+      final String[] values = new String[options.size()];
+
+      int i = 0;
+      for (final Map.Entry<K, MutableOptionValue<?>> option : options.entrySet()) {
+        keys[i] = option.getKey().name();
+        values[i] = option.getValue().asString();
+        i++;
+      }
+
+      return build(keys, values);
+    }
+
+    protected U setDouble(
+       final K key, final double value) {
+      if (key.getValueType() != MutableOptionKey.ValueType.DOUBLE) {
+        throw new IllegalArgumentException(
+            key + " does not accept a double value");
+      }
+      options.put(key, MutableOptionValue.fromDouble(value));
+      return self();
+    }
+
+    protected double getDouble(final K key)
+        throws NoSuchElementException, NumberFormatException {
+      final MutableOptionValue<?> value = options.get(key);
+      if(value == null) {
+        throw new NoSuchElementException(key.name() + " has not been set");
+      }
+      return value.asDouble();
+    }
+
+    protected U setLong(
+        final K key, final long value) {
+      if(key.getValueType() != MutableOptionKey.ValueType.LONG) {
+        throw new IllegalArgumentException(
+            key + " does not accept a long value");
+      }
+      options.put(key, MutableOptionValue.fromLong(value));
+      return self();
+    }
+
+    protected long getLong(final K key)
+        throws NoSuchElementException, NumberFormatException {
+      final MutableOptionValue<?> value = options.get(key);
+      if(value == null) {
+        throw new NoSuchElementException(key.name() + " has not been set");
+      }
+      return value.asLong();
+    }
+
+    protected U setInt(
+        final K key, final int value) {
+      if(key.getValueType() != MutableOptionKey.ValueType.INT) {
+        throw new IllegalArgumentException(
+            key + " does not accept an integer value");
+      }
+      options.put(key, MutableOptionValue.fromInt(value));
+      return self();
+    }
+
+    protected int getInt(final K key)
+        throws NoSuchElementException, NumberFormatException {
+      final MutableOptionValue<?> value = options.get(key);
+      if(value == null) {
+        throw new NoSuchElementException(key.name() + " has not been set");
+      }
+      return value.asInt();
+    }
+
+    protected U setBoolean(
+        final K key, final boolean value) {
+      if(key.getValueType() != MutableOptionKey.ValueType.BOOLEAN) {
+        throw new IllegalArgumentException(
+            key + " does not accept a boolean value");
+      }
+      options.put(key, MutableOptionValue.fromBoolean(value));
+      return self();
+    }
+
+    protected boolean getBoolean(final K key)
+        throws NoSuchElementException, NumberFormatException {
+      final MutableOptionValue<?> value = options.get(key);
+      if(value == null) {
+        throw new NoSuchElementException(key.name() + " has not been set");
+      }
+      return value.asBoolean();
+    }
+
+    protected U setIntArray(
+        final K key, final int[] value) {
+      if(key.getValueType() != MutableOptionKey.ValueType.INT_ARRAY) {
+        throw new IllegalArgumentException(
+            key + " does not accept an int array value");
+      }
+      options.put(key, MutableOptionValue.fromIntArray(value));
+      return self();
+    }
+
+    protected int[] getIntArray(final K key)
+        throws NoSuchElementException, NumberFormatException {
+      final MutableOptionValue<?> value = options.get(key);
+      if(value == null) {
+        throw new NoSuchElementException(key.name() + " has not been set");
+      }
+      return value.asIntArray();
+    }
+
+    protected <N extends Enum<N>> U setEnum(
+        final K key, final N value) {
+      if(key.getValueType() != MutableOptionKey.ValueType.ENUM) {
+        throw new IllegalArgumentException(
+            key + " does not accept a Enum value");
+      }
+      options.put(key, MutableOptionValue.fromEnum(value));
+      return self();
+    }
+
+    @SuppressWarnings("unchecked")
+    protected <N extends Enum<N>> N getEnum(final K key)
+        throws NoSuchElementException, NumberFormatException {
+      final MutableOptionValue<?> value = options.get(key);
+      if (value == null) {
+        throw new NoSuchElementException(key.name() + " has not been set");
+      }
+
+      if (!(value instanceof MutableOptionValue.MutableOptionEnumValue)) {
+        throw new NoSuchElementException(key.name() + " is not of Enum type");
+      }
+
+      return ((MutableOptionValue.MutableOptionEnumValue<N>) value).asObject();
+    }
+
+    /**
+     * Parse a string into a long value, accepting values expressed as a double (such as 9.00) which
+     * are meant to be a long, not a double
+     *
+     * @param value the string containing a value which represents a long
+     * @return the long value of the parsed string
+     */
+    private long parseAsLong(final String value) {
+      try {
+        return Long.parseLong(value);
+      } catch (NumberFormatException nfe) {
+        final double doubleValue = Double.parseDouble(value);
+        if (doubleValue != Math.round(doubleValue))
+          throw new IllegalArgumentException("Unable to parse or round " + value + " to long");
+        return Math.round(doubleValue);
+      }
+    }
+
+    /**
+     * Parse a string into an int value, accepting values expressed as a double (such as 9.00) which
+     * are meant to be an int, not a double
+     *
+     * @param value the string containing a value which represents an int
+     * @return the int value of the parsed string
+     */
+    private int parseAsInt(final String value) {
+      try {
+        return Integer.parseInt(value);
+      } catch (NumberFormatException nfe) {
+        final double doubleValue = Double.parseDouble(value);
+        if (doubleValue != Math.round(doubleValue))
+          throw new IllegalArgumentException("Unable to parse or round " + value + " to int");
+        return (int) Math.round(doubleValue);
+      }
+    }
+
+    /**
+     * Constructs a builder for mutable column family options from a hierarchical parsed options
+     * string representation. The {@link OptionString.Parser} class output has been used to create a
+     * (name,value)-list; each value may be either a simple string or a (name, value)-list in turn.
+     *
+     * @param options a list of parsed option string objects
+     * @param ignoreUnknown what to do if the key is not one of the keys we expect
+     *
+     * @return a builder with the values from the parsed input set
+     *
+     * @throws IllegalArgumentException if an option value is of the wrong type, or a key is empty
+     */
+    protected U fromParsed(final List<OptionString.Entry> options, final boolean ignoreUnknown) {
+      Objects.requireNonNull(options);
+
+      for (final OptionString.Entry option : options) {
+        try {
+          if (option.key.isEmpty()) {
+            throw new IllegalArgumentException("options string is invalid: " + option);
+          }
+          fromOptionString(option, ignoreUnknown);
+        } catch (NumberFormatException nfe) {
+          throw new IllegalArgumentException(
+              "" + option.key + "=" + option.value + " - not a valid value for its type", nfe);
+        }
+      }
+
+      return self();
+    }
+
+    /**
+     * Set a value in the builder from the supplied option string
+     *
+     * @param option the option key/value to add to this builder
+     * @param ignoreUnknown if this is not set, throw an exception when a key is not in the known
+     *     set
+     * @return the same object, after adding options
+     * @throws IllegalArgumentException if the key is unkown, or a value has the wrong type/form
+     */
+    private U fromOptionString(final OptionString.Entry option, final boolean ignoreUnknown)
+        throws IllegalArgumentException {
+      Objects.requireNonNull(option.key);
+      Objects.requireNonNull(option.value);
+
+      final K key = allKeys().get(option.key);
+      if (key == null && ignoreUnknown) {
+        unknown.add(option);
+        return self();
+      } else if (key == null) {
+        throw new IllegalArgumentException("Key: " + key + " is not a known option key");
+      }
+
+      if (!option.value.isList()) {
+        throw new IllegalArgumentException(
+            "Option: " + key + " is not a simple value or list, don't know how to parse it");
+      }
+
+      // Check that simple values are the single item in the array
+      if (key.getValueType() != MutableOptionKey.ValueType.INT_ARRAY) {
+        {
+          if (option.value.list.size() != 1) {
+            throw new IllegalArgumentException(
+                "Simple value does not have exactly 1 item: " + option.value.list);
+          }
+        }
+      }
+
+      final List<String> valueStrs = option.value.list;
+      final String valueStr = valueStrs.get(0);
+
+      switch (key.getValueType()) {
+        case DOUBLE:
+          return setDouble(key, Double.parseDouble(valueStr));
+
+        case LONG:
+          return setLong(key, parseAsLong(valueStr));
+
+        case INT:
+          return setInt(key, parseAsInt(valueStr));
+
+        case BOOLEAN:
+          return setBoolean(key, Boolean.parseBoolean(valueStr));
+
+        case INT_ARRAY:
+          final int[] value = new int[valueStrs.size()];
+          for (int i = 0; i < valueStrs.size(); i++) {
+            value[i] = Integer.parseInt(valueStrs.get(i));
+          }
+          return setIntArray(key, value);
+
+        case ENUM:
+          String optionName = key.name();
+          if (optionName.equals("prepopulate_blob_cache")) {
+            final PrepopulateBlobCache prepopulateBlobCache =
+                PrepopulateBlobCache.getFromInternal(valueStr);
+            return setEnum(key, prepopulateBlobCache);
+          } else if (optionName.equals("compression")
+              || optionName.equals("blob_compression_type")) {
+            final CompressionType compressionType = CompressionType.getFromInternal(valueStr);
+            return setEnum(key, compressionType);
+          } else {
+            throw new IllegalArgumentException("Unknown enum type: " + key.name());
+          }
+
+        default:
+          throw new IllegalStateException(key + " has unknown value type: " + key.getValueType());
+      }
+    }
+
+    /**
+     *
+     * @return the list of keys encountered which were not known to the type being generated
+     */
+    public List<OptionString.Entry> getUnknown() {
+      return new ArrayList<>(unknown);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java
new file mode 100644
index 000000000..88b2963b6
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java
@@ -0,0 +1,48 @@
+// Copyright (c) 2016, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * AbstractNativeReference is the base-class of all RocksDB classes that have
+ * a pointer to a native C++ {@code rocksdb} object.
+ * <p>
+ * AbstractNativeReference has the {@link AbstractNativeReference#close()}
+ * method, which frees its associated C++ object.</p>
+ * <p>
+ * This function should be called manually, or even better, called implicitly using a
+ * <a
+ * href="https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html">try-with-resources</a>
+ * statement, when you are finished with the object. It is no longer
+ * called automatically during the regular Java GC process via
+ * {@link AbstractNativeReference#finalize()}.</p>
+ * <p>
+ * Explanatory note - When or if the Garbage Collector calls {@link Object#finalize()}
+ * depends on the JVM implementation and system conditions, which the programmer
+ * cannot control. In addition, the GC cannot see through the native reference
+ * long member variable (which is the C++ pointer value to the native object),
+ * and cannot know what other resources depend on it.
+ * </p>
+ */
+public abstract class AbstractNativeReference implements AutoCloseable {
+  /**
+   * Returns true if we are responsible for freeing the underlying C++ object
+   *
+   * @return true if we are responsible to free the C++ object
+   */
+  protected abstract boolean isOwningHandle();
+
+  /**
+   * Frees the underlying C++ object
+   * <p>
+   * It is strong recommended that the developer calls this after they
+   * have finished using the object.</p>
+   * <p>
+   * Note, that once an instance of {@link AbstractNativeReference} has been
+   * closed, calling any of its functions will lead to undefined
+   * behavior.</p>
+   */
+  @Override public abstract void close();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java
new file mode 100644
index 000000000..1aade1b89
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Base class implementation for Rocks Iterators
+ * in the Java API
+ *
+ * <p>Multiple threads can invoke const methods on an RocksIterator without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same RocksIterator must use
+ * external synchronization.</p>
+ *
+ * @param <P> The type of the Parent Object from which the Rocks Iterator was
+ *          created. This is used by disposeInternal to avoid double-free
+ *          issues with the underlying C++ object.
+ * @see org.rocksdb.RocksObject
+ */
+public abstract class AbstractRocksIterator<P extends RocksObject>
+    extends RocksObject implements RocksIteratorInterface {
+  final P parent_;
+
+  protected AbstractRocksIterator(final P parent,
+      final long nativeHandle) {
+    super(nativeHandle);
+    // parent must point to a valid RocksDB instance.
+    assert (parent != null);
+    // RocksIterator must hold a reference to the related parent instance
+    // to guarantee that while a GC cycle starts RocksIterator instances
+    // are freed prior to parent instances.
+    parent_ = parent;
+  }
+
+  @Override
+  public boolean isValid() {
+    assert (isOwningHandle());
+    return isValid0(nativeHandle_);
+  }
+
+  @Override
+  public void seekToFirst() {
+    assert (isOwningHandle());
+    seekToFirst0(nativeHandle_);
+  }
+
+  @Override
+  public void seekToLast() {
+    assert (isOwningHandle());
+    seekToLast0(nativeHandle_);
+  }
+
+  @Override
+  public void seek(final byte[] target) {
+    assert (isOwningHandle());
+    seek0(nativeHandle_, target, target.length);
+  }
+
+  @Override
+  public void seekForPrev(final byte[] target) {
+    assert (isOwningHandle());
+    seekForPrev0(nativeHandle_, target, target.length);
+  }
+
+  @Override
+  public void seek(final ByteBuffer target) {
+    assert (isOwningHandle());
+    if (target.isDirect()) {
+      seekDirect0(nativeHandle_, target, target.position(), target.remaining());
+    } else {
+      seekByteArray0(nativeHandle_, target.array(), target.arrayOffset() + target.position(),
+          target.remaining());
+    }
+    target.position(target.limit());
+  }
+
+  @Override
+  public void seekForPrev(final ByteBuffer target) {
+    assert (isOwningHandle());
+    if (target.isDirect()) {
+      seekForPrevDirect0(nativeHandle_, target, target.position(), target.remaining());
+    } else {
+      seekForPrevByteArray0(nativeHandle_, target.array(), target.arrayOffset() + target.position(),
+          target.remaining());
+    }
+    target.position(target.limit());
+  }
+
+  @Override
+  public void next() {
+    assert (isOwningHandle());
+    next0(nativeHandle_);
+  }
+
+  @Override
+  public void prev() {
+    assert (isOwningHandle());
+    prev0(nativeHandle_);
+  }
+
+  @Override
+  public void refresh() throws RocksDBException {
+    assert (isOwningHandle());
+    refresh0(nativeHandle_);
+  }
+
+  @Override
+  public void status() throws RocksDBException {
+    assert (isOwningHandle());
+    status0(nativeHandle_);
+  }
+
+  /**
+   * <p>Deletes underlying C++ iterator pointer.</p>
+   *
+   * <p>Note: the underlying handle can only be safely deleted if the parent
+   * instance related to a certain RocksIterator is still valid and initialized.
+   * Therefore {@code disposeInternal()} checks if the parent is initialized
+   * before freeing the native handle.</p>
+   */
+  @Override
+  protected void disposeInternal() {
+      if (parent_.isOwningHandle()) {
+        disposeInternal(nativeHandle_);
+      }
+  }
+
+  abstract boolean isValid0(long handle);
+  abstract void seekToFirst0(long handle);
+  abstract void seekToLast0(long handle);
+  abstract void next0(long handle);
+  abstract void prev0(long handle);
+  abstract void refresh0(long handle) throws RocksDBException;
+  abstract void seek0(long handle, byte[] target, int targetLen);
+  abstract void seekForPrev0(long handle, byte[] target, int targetLen);
+  abstract void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen);
+  abstract void seekForPrevDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen);
+  abstract void seekByteArray0(long handle, byte[] target, int targetOffset, int targetLen);
+  abstract void seekForPrevByteArray0(long handle, byte[] target, int targetOffset, int targetLen);
+
+  abstract void status0(long handle) throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java
new file mode 100644
index 000000000..5a22e2956
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java
@@ -0,0 +1,191 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Slices are used by RocksDB to provide
+ * efficient access to keys and values.
+ *
+ * This class is package private, implementers
+ * should extend either of the public abstract classes:
+ *   @see org.rocksdb.Slice
+ *   @see org.rocksdb.DirectSlice
+ *
+ * Regards the lifecycle of Java Slices in RocksDB:
+ *   At present when you configure a Comparator from Java, it creates an
+ *   instance of a C++ BaseComparatorJniCallback subclass and
+ *   passes that to RocksDB as the comparator. That subclass of
+ *   BaseComparatorJniCallback creates the Java
+ *   @see org.rocksdb.AbstractSlice subclass Objects. When you dispose
+ *   the Java @see org.rocksdb.AbstractComparator subclass, it disposes the
+ *   C++ BaseComparatorJniCallback subclass, which in turn destroys the
+ *   Java @see org.rocksdb.AbstractSlice subclass Objects.
+ */
+public abstract class AbstractSlice<T> extends RocksMutableObject {
+
+  protected AbstractSlice() {
+    super();
+  }
+
+  protected AbstractSlice(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Returns the data of the slice.
+   *
+   * @return The slice data. Note, the type of access is
+   *   determined by the subclass
+   *   @see org.rocksdb.AbstractSlice#data0(long)
+   */
+  public T data() {
+    return data0(getNativeHandle());
+  }
+
+  /**
+   * Access to the data is provided by the
+   * subtype as it needs to handle the
+   * generic typing.
+   *
+   * @param handle The address of the underlying
+   *   native object.
+   *
+   * @return Java typed access to the data.
+   */
+  protected abstract T data0(long handle);
+
+  /**
+   * Drops the specified {@code n}
+   * number of bytes from the start
+   * of the backing slice
+   *
+   * @param n The number of bytes to drop
+   */
+  public abstract void removePrefix(final int n);
+
+  /**
+   * Clears the backing slice
+   */
+  public abstract void clear();
+
+  /**
+   * Return the length (in bytes) of the data.
+   *
+   * @return The length in bytes.
+   */
+  public int size() {
+    return size0(getNativeHandle());
+  }
+
+  /**
+   * Return true if the length of the
+   * data is zero.
+   *
+   * @return true if there is no data, false otherwise.
+   */
+  public boolean empty() {
+    return empty0(getNativeHandle());
+  }
+
+  /**
+   * Creates a string representation of the data
+   *
+   * @param hex When true, the representation
+   *   will be encoded in hexadecimal.
+   *
+   * @return The string representation of the data.
+   */
+  public String toString(final boolean hex) {
+    return toString0(getNativeHandle(), hex);
+  }
+
+  @Override
+  public String toString() {
+    return toString(false);
+  }
+
+  /**
+   * Three-way key comparison
+   *
+   *  @param other A slice to compare against
+   *
+   *  @return Should return either:
+   *    1) &lt; 0 if this &lt; other
+   *    2) == 0 if this == other
+   *    3) &gt; 0 if this &gt; other
+   */
+  public int compare(final AbstractSlice<?> other) {
+    assert (other != null);
+    if(!isOwningHandle()) {
+      return other.isOwningHandle() ? -1 : 0;
+    } else {
+      if(!other.isOwningHandle()) {
+        return 1;
+      } else {
+        return compare0(getNativeHandle(), other.getNativeHandle());
+      }
+    }
+  }
+
+  @Override
+  public int hashCode() {
+    return toString().hashCode();
+  }
+
+  /**
+   * If other is a slice object, then
+   * we defer to {@link #compare(AbstractSlice) compare}
+   * to check equality, otherwise we return false.
+   *
+   * @param other Object to test for equality
+   *
+   * @return true when {@code this.compare(other) == 0},
+   *   false otherwise.
+   */
+  @Override
+  public boolean equals(final Object other) {
+    if (other != null && other instanceof AbstractSlice) {
+      return compare((AbstractSlice<?>)other) == 0;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * Determines whether this slice starts with
+   * another slice
+   *
+   * @param prefix Another slice which may of may not
+   *   be a prefix of this slice.
+   *
+   * @return true when this slice starts with the
+   *   {@code prefix} slice
+   */
+  public boolean startsWith(final AbstractSlice<?> prefix) {
+    if (prefix != null) {
+      return startsWith0(getNativeHandle(), prefix.getNativeHandle());
+    } else {
+      return false;
+    }
+  }
+
+  protected native static long createNewSliceFromString(final String str);
+  private native int size0(long handle);
+  private native boolean empty0(long handle);
+  private native String toString0(long handle, boolean hex);
+  private native int compare0(long handle, long otherHandle);
+  private native boolean startsWith0(long handle, long otherHandle);
+
+  /**
+   * Deletes underlying C++ slice pointer.
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the slice are closed.
+   * Otherwise an undefined behavior will occur.
+   */
+  @Override
+  protected final native void disposeInternal(final long handle);
+
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractTableFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractTableFilter.java
new file mode 100644
index 000000000..c696c3e13
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractTableFilter.java
@@ -0,0 +1,20 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+/**
+ * Base class for Table Filters.
+ */
+public abstract class AbstractTableFilter
+    extends RocksCallbackObject implements TableFilter {
+
+  protected AbstractTableFilter() {
+    super();
+  }
+
+  @Override
+  protected long initializeNative(final long... nativeParameterHandles) {
+    return createNewTableFilter();
+  }
+
+  private native long createNewTableFilter();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractTraceWriter.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractTraceWriter.java
new file mode 100644
index 000000000..806709b1f
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractTraceWriter.java
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Base class for TraceWriters.
+ */
+public abstract class AbstractTraceWriter
+    extends RocksCallbackObject implements TraceWriter {
+
+  @Override
+  protected long initializeNative(final long... nativeParameterHandles) {
+    return createNewTraceWriter();
+  }
+
+  /**
+   * Called from JNI, proxy for {@link TraceWriter#write(Slice)}.
+   *
+   * @param sliceHandle the native handle of the slice (which we do not own)
+   *
+   * @return short (2 bytes) where the first byte is the
+   *     {@link Status.Code#getValue()} and the second byte is the
+   *     {@link Status.SubCode#getValue()}.
+   */
+  private short writeProxy(final long sliceHandle) {
+    try {
+      write(new Slice(sliceHandle));
+      return statusToShort(Status.Code.Ok, Status.SubCode.None);
+    } catch (final RocksDBException e) {
+      return statusToShort(e.getStatus());
+    }
+  }
+
+  /**
+   * Called from JNI, proxy for {@link TraceWriter#closeWriter()}.
+   *
+   * @return short (2 bytes) where the first byte is the
+   *     {@link Status.Code#getValue()} and the second byte is the
+   *     {@link Status.SubCode#getValue()}.
+   */
+  private short closeWriterProxy() {
+    try {
+      closeWriter();
+      return statusToShort(Status.Code.Ok, Status.SubCode.None);
+    } catch (final RocksDBException e) {
+      return statusToShort(e.getStatus());
+    }
+  }
+
+  private static short statusToShort(/*@Nullable*/ final Status status) {
+    final Status.Code code = status != null && status.getCode() != null
+        ? status.getCode()
+        : Status.Code.IOError;
+    final Status.SubCode subCode = status != null && status.getSubCode() != null
+        ? status.getSubCode()
+        : Status.SubCode.None;
+    return statusToShort(code, subCode);
+  }
+
+  private static short statusToShort(final Status.Code code,
+      final Status.SubCode subCode) {
+    short result = (short)(code.getValue() << 8);
+    return (short)(result | subCode.getValue());
+  }
+
+  private native long createNewTraceWriter();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java
new file mode 100644
index 000000000..cbb49836d
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Provides notification to the caller of SetSnapshotOnNextOperation when
+ * the actual snapshot gets created
+ */
+public abstract class AbstractTransactionNotifier
+    extends RocksCallbackObject {
+
+  protected AbstractTransactionNotifier() {
+    super();
+  }
+
+  /**
+   * Implement this method to receive notification when a snapshot is
+   * requested via {@link Transaction#setSnapshotOnNextOperation()}.
+   *
+   * @param newSnapshot the snapshot that has been created.
+   */
+  public abstract void snapshotCreated(final Snapshot newSnapshot);
+
+  /**
+   * This is intentionally private as it is the callback hook
+   * from JNI
+   */
+  private void snapshotCreated(final long snapshotHandle) {
+    snapshotCreated(new Snapshot(snapshotHandle));
+  }
+
+  @Override
+  protected long initializeNative(final long... nativeParameterHandles) {
+    return createNewTransactionNotifier();
+  }
+
+  private native long createNewTransactionNotifier();
+
+  /**
+   * Deletes underlying C++ TransactionNotifier pointer.
+   *
+   * Note that this function should be called only after all
+   * Transactions referencing the comparator are closed.
+   * Otherwise an undefined behavior will occur.
+   */
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+  protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractWalFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractWalFilter.java
new file mode 100644
index 000000000..d525045c6
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractWalFilter.java
@@ -0,0 +1,49 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Base class for WAL Filters.
+ */
+public abstract class AbstractWalFilter
+    extends RocksCallbackObject implements WalFilter {
+
+  @Override
+  protected long initializeNative(final long... nativeParameterHandles) {
+    return createNewWalFilter();
+  }
+
+  /**
+   * Called from JNI, proxy for
+   *     {@link WalFilter#logRecordFound(long, String, WriteBatch, WriteBatch)}.
+   *
+   * @param logNumber the log handle.
+   * @param logFileName the log file name
+   * @param batchHandle the native handle of a WriteBatch (which we do not own)
+   * @param newBatchHandle the native handle of a
+   *     new WriteBatch (which we do not own)
+   *
+   * @return short (2 bytes) where the first byte is the
+   *     {@link WalFilter.LogRecordFoundResult#walProcessingOption}
+   *     {@link WalFilter.LogRecordFoundResult#batchChanged}.
+   */
+  private short logRecordFoundProxy(final long logNumber,
+      final String logFileName, final long batchHandle,
+      final long newBatchHandle) {
+    final LogRecordFoundResult logRecordFoundResult = logRecordFound(
+        logNumber, logFileName, new WriteBatch(batchHandle),
+        new WriteBatch(newBatchHandle));
+    return logRecordFoundResultToShort(logRecordFoundResult);
+  }
+
+  private static short logRecordFoundResultToShort(
+      final LogRecordFoundResult logRecordFoundResult) {
+    short result = (short)(logRecordFoundResult.walProcessingOption.getValue() << 8);
+    return (short)(result | (logRecordFoundResult.batchChanged ? 1 : 0));
+  }
+
+  private native long createNewWalFilter();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java
new file mode 100644
index 000000000..9527a2fd9
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java
@@ -0,0 +1,204 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+public abstract class AbstractWriteBatch extends RocksObject
+    implements WriteBatchInterface {
+
+  protected AbstractWriteBatch(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  @Override
+  public int count() {
+    return count0(nativeHandle_);
+  }
+
+  @Override
+  public void put(byte[] key, byte[] value) throws RocksDBException {
+    put(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  @Override
+  public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key,
+      byte[] value) throws RocksDBException {
+    put(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  @Override
+  public void merge(byte[] key, byte[] value) throws RocksDBException {
+    merge(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  @Override
+  public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key,
+      byte[] value) throws RocksDBException {
+    merge(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  @Override
+  public void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBException {
+    assert key.isDirect() && value.isDirect();
+    putDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(),
+        value.remaining(), 0);
+    key.position(key.limit());
+    value.position(value.limit());
+  }
+
+  @Override
+  public void put(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key,
+      final ByteBuffer value) throws RocksDBException {
+    assert key.isDirect() && value.isDirect();
+    putDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(),
+        value.remaining(), columnFamilyHandle.nativeHandle_);
+    key.position(key.limit());
+    value.position(value.limit());
+  }
+
+  @Override
+  public void delete(byte[] key) throws RocksDBException {
+    delete(nativeHandle_, key, key.length);
+  }
+
+  @Override
+  public void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key)
+      throws RocksDBException {
+    delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  @Override
+  public void delete(final ByteBuffer key) throws RocksDBException {
+    deleteDirect(nativeHandle_, key, key.position(), key.remaining(), 0);
+    key.position(key.limit());
+  }
+
+  @Override
+  public void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key)
+      throws RocksDBException {
+    deleteDirect(
+        nativeHandle_, key, key.position(), key.remaining(), columnFamilyHandle.nativeHandle_);
+    key.position(key.limit());
+  }
+
+  @Override
+  public void singleDelete(byte[] key) throws RocksDBException {
+    singleDelete(nativeHandle_, key, key.length);
+  }
+
+  @Override
+  public void singleDelete(ColumnFamilyHandle columnFamilyHandle, byte[] key)
+      throws RocksDBException {
+    singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  @Override
+  public void deleteRange(byte[] beginKey, byte[] endKey)
+      throws RocksDBException {
+    deleteRange(nativeHandle_, beginKey, beginKey.length, endKey, endKey.length);
+  }
+
+  @Override
+  public void deleteRange(ColumnFamilyHandle columnFamilyHandle,
+      byte[] beginKey, byte[] endKey) throws RocksDBException {
+    deleteRange(nativeHandle_, beginKey, beginKey.length, endKey, endKey.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  @Override
+  public void putLogData(byte[] blob) throws RocksDBException {
+    putLogData(nativeHandle_, blob, blob.length);
+  }
+
+  @Override
+  public void clear() {
+    clear0(nativeHandle_);
+  }
+
+  @Override
+  public void setSavePoint() {
+    setSavePoint0(nativeHandle_);
+  }
+
+  @Override
+  public void rollbackToSavePoint() throws RocksDBException {
+    rollbackToSavePoint0(nativeHandle_);
+  }
+
+  @Override
+  public void popSavePoint() throws RocksDBException {
+    popSavePoint(nativeHandle_);
+  }
+
+  @Override
+  public void setMaxBytes(final long maxBytes) {
+    setMaxBytes(nativeHandle_, maxBytes);
+  }
+
+  @Override
+  public WriteBatch getWriteBatch() {
+    return getWriteBatch(nativeHandle_);
+  }
+
+  abstract int count0(final long handle);
+
+  abstract void put(final long handle, final byte[] key, final int keyLen,
+      final byte[] value, final int valueLen) throws RocksDBException;
+
+  abstract void put(final long handle, final byte[] key, final int keyLen,
+      final byte[] value, final int valueLen, final long cfHandle)
+      throws RocksDBException;
+
+  abstract void putDirect(final long handle, final ByteBuffer key, final int keyOffset,
+      final int keyLength, final ByteBuffer value, final int valueOffset, final int valueLength,
+      final long cfHandle) throws RocksDBException;
+
+  abstract void merge(final long handle, final byte[] key, final int keyLen,
+      final byte[] value, final int valueLen) throws RocksDBException;
+
+  abstract void merge(final long handle, final byte[] key, final int keyLen,
+      final byte[] value, final int valueLen, final long cfHandle)
+      throws RocksDBException;
+
+  abstract void delete(final long handle, final byte[] key,
+      final int keyLen) throws RocksDBException;
+
+  abstract void delete(final long handle, final byte[] key,
+      final int keyLen, final long cfHandle) throws RocksDBException;
+
+  abstract void singleDelete(final long handle, final byte[] key, final int keyLen)
+      throws RocksDBException;
+
+  abstract void singleDelete(final long handle, final byte[] key, final int keyLen,
+      final long cfHandle) throws RocksDBException;
+
+  abstract void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset,
+      final int keyLength, final long cfHandle) throws RocksDBException;
+
+  abstract void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen,
+      final byte[] endKey, final int endKeyLen) throws RocksDBException;
+
+  abstract void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen,
+      final byte[] endKey, final int endKeyLen, final long cfHandle) throws RocksDBException;
+
+  abstract void putLogData(final long handle, final byte[] blob,
+      final int blobLen) throws RocksDBException;
+
+  abstract void clear0(final long handle);
+
+  abstract void setSavePoint0(final long handle);
+
+  abstract void rollbackToSavePoint0(final long handle);
+
+  abstract void popSavePoint(final long handle) throws RocksDBException;
+
+  abstract void setMaxBytes(final long handle, long maxBytes);
+
+  abstract WriteBatch getWriteBatch(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AccessHint.java b/src/rocksdb/java/src/main/java/org/rocksdb/AccessHint.java
new file mode 100644
index 000000000..877c4ab39
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AccessHint.java
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * File access pattern once a compaction has started
+ */
+public enum AccessHint {
+  NONE((byte)0x0),
+  NORMAL((byte)0x1),
+  SEQUENTIAL((byte)0x2),
+  WILLNEED((byte)0x3);
+
+  private final byte value;
+
+  AccessHint(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * <p>Returns the byte value of the enumerations value.</p>
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * <p>Get the AccessHint enumeration value by
+   * passing the byte identifier to this method.</p>
+   *
+   * @param byteIdentifier of AccessHint.
+   *
+   * @return AccessHint instance.
+   *
+   * @throws IllegalArgumentException if the access hint for the byteIdentifier
+   *     cannot be found
+   */
+  public static AccessHint getAccessHint(final byte byteIdentifier) {
+    for (final AccessHint accessHint : AccessHint.values()) {
+      if (accessHint.getValue() == byteIdentifier) {
+        return accessHint;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for AccessHint.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
new file mode 100644
index 000000000..5338bc42d
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
@@ -0,0 +1,464 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * Advanced Column Family Options which are not
+ * mutable (i.e. present in {@link AdvancedMutableColumnFamilyOptionsInterface}
+ *
+ * Taken from include/rocksdb/advanced_options.h
+ */
+public interface AdvancedColumnFamilyOptionsInterface<
+    T extends AdvancedColumnFamilyOptionsInterface<T>> {
+  /**
+   * The minimum number of write buffers that will be merged together
+   * before writing to storage.  If set to 1, then
+   * all write buffers are flushed to L0 as individual files and this increases
+   * read amplification because a get request has to check in all of these
+   * files. Also, an in-memory merge may result in writing lesser
+   * data to storage if there are duplicate records in each of these
+   * individual write buffers.  Default: 1
+   *
+   * @param minWriteBufferNumberToMerge the minimum number of write buffers
+   *     that will be merged together.
+   * @return the reference to the current options.
+   */
+  T setMinWriteBufferNumberToMerge(
+      int minWriteBufferNumberToMerge);
+
+  /**
+   * The minimum number of write buffers that will be merged together
+   * before writing to storage.  If set to 1, then
+   * all write buffers are flushed to L0 as individual files and this increases
+   * read amplification because a get request has to check in all of these
+   * files. Also, an in-memory merge may result in writing lesser
+   * data to storage if there are duplicate records in each of these
+   * individual write buffers.  Default: 1
+   *
+   * @return the minimum number of write buffers that will be merged together.
+   */
+  int minWriteBufferNumberToMerge();
+
+  /**
+   * The total maximum number of write buffers to maintain in memory including
+   * copies of buffers that have already been flushed.  Unlike
+   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()},
+   * this parameter does not affect flushing.
+   * This controls the minimum amount of write history that will be available
+   * in memory for conflict checking when Transactions are used.
+   *
+   * When using an OptimisticTransactionDB:
+   * If this value is too low, some transactions may fail at commit time due
+   * to not being able to determine whether there were any write conflicts.
+   *
+   * When using a TransactionDB:
+   * If Transaction::SetSnapshot is used, TransactionDB will read either
+   * in-memory write buffers or SST files to do write-conflict checking.
+   * Increasing this value can reduce the number of reads to SST files
+   * done for conflict detection.
+   *
+   * Setting this value to 0 will cause write buffers to be freed immediately
+   * after they are flushed.
+   * If this value is set to -1,
+   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()}
+   * will be used.
+   *
+   * Default:
+   * If using a TransactionDB/OptimisticTransactionDB, the default value will
+   * be set to the value of
+   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()}
+   * if it is not explicitly set by the user. Otherwise, the default is 0.
+   *
+   * @param maxWriteBufferNumberToMaintain The maximum number of write
+   *     buffers to maintain
+   *
+   * @return the reference to the current options.
+   */
+  T setMaxWriteBufferNumberToMaintain(
+      int maxWriteBufferNumberToMaintain);
+
+  /**
+   * The total maximum number of write buffers to maintain in memory including
+   * copies of buffers that have already been flushed.
+   *
+   * @return maxWriteBufferNumberToMaintain The maximum number of write buffers
+   *     to maintain
+   */
+  int maxWriteBufferNumberToMaintain();
+
+  /**
+   * Allows thread-safe inplace updates.
+   * If inplace_callback function is not set,
+   *   Put(key, new_value) will update inplace the existing_value iff
+   *   * key exists in current memtable
+   *   * new sizeof(new_value) &le; sizeof(existing_value)
+   *   * existing_value for that key is a put i.e. kTypeValue
+   * If inplace_callback function is set, check doc for inplace_callback.
+   * Default: false.
+   *
+   * @param inplaceUpdateSupport true if thread-safe inplace updates
+   *     are allowed.
+   * @return the reference to the current options.
+   */
+  T setInplaceUpdateSupport(
+      boolean inplaceUpdateSupport);
+
+  /**
+   * Allows thread-safe inplace updates.
+   * If inplace_callback function is not set,
+   *   Put(key, new_value) will update inplace the existing_value iff
+   *   * key exists in current memtable
+   *   * new sizeof(new_value) &le; sizeof(existing_value)
+   *   * existing_value for that key is a put i.e. kTypeValue
+   * If inplace_callback function is set, check doc for inplace_callback.
+   * Default: false.
+   *
+   * @return true if thread-safe inplace updates are allowed.
+   */
+  boolean inplaceUpdateSupport();
+
+  /**
+   * Control locality of bloom filter probes to improve cache miss rate.
+   * This option only applies to memtable prefix bloom and plaintable
+   * prefix bloom. It essentially limits the max number of cache lines each
+   * bloom filter check can touch.
+   * This optimization is turned off when set to 0. The number should never
+   * be greater than number of probes. This option can boost performance
+   * for in-memory workload but should use with care since it can cause
+   * higher false positive rate.
+   * Default: 0
+   *
+   * @param bloomLocality the level of locality of bloom-filter probes.
+   * @return the reference to the current options.
+   */
+  T setBloomLocality(int bloomLocality);
+
+  /**
+   * Control locality of bloom filter probes to improve cache miss rate.
+   * This option only applies to memtable prefix bloom and plaintable
+   * prefix bloom. It essentially limits the max number of cache lines each
+   * bloom filter check can touch.
+   * This optimization is turned off when set to 0. The number should never
+   * be greater than number of probes. This option can boost performance
+   * for in-memory workload but should use with care since it can cause
+   * higher false positive rate.
+   * Default: 0
+   *
+   * @return the level of locality of bloom-filter probes.
+   * @see #setBloomLocality(int)
+   */
+  int bloomLocality();
+
+  /**
+   * <p>Different levels can have different compression
+   * policies. There are cases where most lower levels
+   * would like to use quick compression algorithms while
+   * the higher levels (which have more data) use
+   * compression algorithms that have better compression
+   * but could be slower. This array, if non-empty, should
+   * have an entry for each level of the database;
+   * these override the value specified in the previous
+   * field 'compression'.</p>
+   *
+   * <strong>NOTICE</strong>
+   * <p>If {@code level_compaction_dynamic_level_bytes=true},
+   * {@code compression_per_level[0]} still determines {@code L0},
+   * but other elements of the array are based on base level
+   * (the level {@code L0} files are merged to), and may not
+   * match the level users see from info log for metadata.
+   * </p>
+   * <p>If {@code L0} files are merged to {@code level - n},
+   * then, for {@code i&gt;0}, {@code compression_per_level[i]}
+   * determines compaction type for level {@code n+i-1}.</p>
+   *
+   * <strong>Example</strong>
+   * <p>For example, if we have 5 levels, and we determine to
+   * merge {@code L0} data to {@code L4} (which means {@code L1..L3}
+   * will be empty), then the new files go to {@code L4} uses
+   * compression type {@code compression_per_level[1]}.</p>
+   *
+   * <p>If now {@code L0} is merged to {@code L2}. Data goes to
+   * {@code L2} will be compressed according to
+   * {@code compression_per_level[1]}, {@code L3} using
+   * {@code compression_per_level[2]}and {@code L4} using
+   * {@code compression_per_level[3]}. Compaction for each
+   * level can change when data grows.</p>
+   *
+   * <p><strong>Default:</strong> empty</p>
+   *
+   * @param compressionLevels list of
+   *     {@link org.rocksdb.CompressionType} instances.
+   *
+   * @return the reference to the current options.
+   */
+  T setCompressionPerLevel(
+      List<CompressionType> compressionLevels);
+
+  /**
+   * <p>Return the currently set {@link org.rocksdb.CompressionType}
+   * per instances.</p>
+   *
+   * <p>See: {@link #setCompressionPerLevel(java.util.List)}</p>
+   *
+   * @return list of {@link org.rocksdb.CompressionType}
+   *     instances.
+   */
+  List<CompressionType> compressionPerLevel();
+
+  /**
+   * Set the number of levels for this database
+   * If level-styled compaction is used, then this number determines
+   * the total number of levels.
+   *
+   * @param numLevels the number of levels.
+   * @return the reference to the current options.
+   */
+  T setNumLevels(int numLevels);
+
+  /**
+   * If level-styled compaction is used, then this number determines
+   * the total number of levels.
+   *
+   * @return the number of levels.
+   */
+  int numLevels();
+
+  /**
+   * <p>If {@code true}, RocksDB will pick target size of each level
+   * dynamically. We will pick a base level b &gt;= 1. L0 will be
+   * directly merged into level b, instead of always into level 1.
+   * Level 1 to b-1 need to be empty. We try to pick b and its target
+   * size so that</p>
+   *
+   * <ol>
+   * <li>target size is in the range of
+   *   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
+   *    max_bytes_for_level_base]</li>
+   * <li>target size of the last level (level num_levels-1) equals to extra size
+   *    of the level.</li>
+   * </ol>
+   *
+   * <p>At the same time max_bytes_for_level_multiplier and
+   * max_bytes_for_level_multiplier_additional are still satisfied.</p>
+   *
+   * <p>With this option on, from an empty DB, we make last level the base
+   * level, which means merging L0 data into the last level, until it exceeds
+   * max_bytes_for_level_base. And then we make the second last level to be
+   * base level, to start to merge L0 data to second last level, with its
+   * target size to be {@code 1/max_bytes_for_level_multiplier} of the last
+   * levels extra size. After the data accumulates more so that we need to
+   * move the base level to the third last one, and so on.</p>
+   *
+   * <p><b>Example</b></p>
+   *
+   * <p>For example, assume {@code max_bytes_for_level_multiplier=10},
+   * {@code num_levels=6}, and {@code max_bytes_for_level_base=10MB}.</p>
+   *
+   * <p>Target sizes of level 1 to 5 starts with:</p>
+   * {@code [- - - - 10MB]}
+   * <p>with base level is level. Target sizes of level 1 to 4 are not applicable
+   * because they will not be used.
+   * Until the size of Level 5 grows to more than 10MB, say 11MB, we make
+   * base target to level 4 and now the targets looks like:</p>
+   * {@code [- - - 1.1MB 11MB]}
+   * <p>While data are accumulated, size targets are tuned based on actual data
+   * of level 5. When level 5 has 50MB of data, the target is like:</p>
+   * {@code [- - - 5MB 50MB]}
+   * <p>Until level 5's actual size is more than 100MB, say 101MB. Now if we
+   * keep level 4 to be the base level, its target size needs to be 10.1MB,
+   * which doesn't satisfy the target size range. So now we make level 3
+   * the target size and the target sizes of the levels look like:</p>
+   * {@code [- - 1.01MB 10.1MB 101MB]}
+   * <p>In the same way, while level 5 further grows, all levels' targets grow,
+   * like</p>
+   * {@code [- - 5MB 50MB 500MB]}
+   * <p>Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
+   * base level and make levels' target sizes like this:</p>
+   * {@code [- 1.001MB 10.01MB 100.1MB 1001MB]}
+   * <p>and go on...</p>
+   *
+   * <p>By doing it, we give {@code max_bytes_for_level_multiplier} a priority
+   * against {@code max_bytes_for_level_base}, for a more predictable LSM tree
+   * shape. It is useful to limit worse case space amplification.</p>
+   *
+   * <p>{@code max_bytes_for_level_multiplier_additional} is ignored with
+   * this flag on.</p>
+   *
+   * <p>Turning this feature on or off for an existing DB can cause unexpected
+   * LSM tree structure so it's not recommended.</p>
+   *
+   * <p><strong>Caution</strong>: this option is experimental</p>
+   *
+   * <p>Default: false</p>
+   *
+   * @param enableLevelCompactionDynamicLevelBytes boolean value indicating
+   *     if {@code LevelCompactionDynamicLevelBytes} shall be enabled.
+   * @return the reference to the current options.
+   */
+  @Experimental("Turning this feature on or off for an existing DB can cause" +
+      " unexpected LSM tree structure so it's not recommended")
+  T setLevelCompactionDynamicLevelBytes(
+      boolean enableLevelCompactionDynamicLevelBytes);
+
+  /**
+   * <p>Return if {@code LevelCompactionDynamicLevelBytes} is enabled.
+   * </p>
+   *
+   * <p>For further information see
+   * {@link #setLevelCompactionDynamicLevelBytes(boolean)}</p>
+   *
+   * @return boolean value indicating if
+   *    {@code levelCompactionDynamicLevelBytes} is enabled.
+   */
+  @Experimental("Caution: this option is experimental")
+  boolean levelCompactionDynamicLevelBytes();
+
+  /**
+   * Maximum size of each compaction (not guarantee)
+   *
+   * @param maxCompactionBytes the compaction size limit
+   * @return the reference to the current options.
+   */
+  T setMaxCompactionBytes(
+      long maxCompactionBytes);
+
+  /**
+   * Control maximum size of each compaction (not guaranteed)
+   *
+   * @return compaction size threshold
+   */
+  long maxCompactionBytes();
+
+  /**
+   * Set compaction style for DB.
+   *
+   * Default: LEVEL.
+   *
+   * @param compactionStyle Compaction style.
+   * @return the reference to the current options.
+   */
+  ColumnFamilyOptionsInterface setCompactionStyle(
+      CompactionStyle compactionStyle);
+
+  /**
+   * Compaction style for DB.
+   *
+   * @return Compaction style.
+   */
+  CompactionStyle compactionStyle();
+
+  /**
+   * If level {@link #compactionStyle()} == {@link CompactionStyle#LEVEL},
+   * for each level, which files are prioritized to be picked to compact.
+   *
+   * Default: {@link CompactionPriority#ByCompensatedSize}
+   *
+   * @param compactionPriority The compaction priority
+   *
+   * @return the reference to the current options.
+   */
+  T setCompactionPriority(
+      CompactionPriority compactionPriority);
+
+  /**
+   * Get the Compaction priority if level compaction
+   * is used for all levels
+   *
+   * @return The compaction priority
+   */
+  CompactionPriority compactionPriority();
+
+  /**
+   * Set the options needed to support Universal Style compactions
+   *
+   * @param compactionOptionsUniversal The Universal Style compaction options
+   *
+   * @return the reference to the current options.
+   */
+  T setCompactionOptionsUniversal(
+      CompactionOptionsUniversal compactionOptionsUniversal);
+
+  /**
+   * The options needed to support Universal Style compactions
+   *
+   * @return The Universal Style compaction options
+   */
+  CompactionOptionsUniversal compactionOptionsUniversal();
+
+  /**
+   * The options for FIFO compaction style
+   *
+   * @param compactionOptionsFIFO The FIFO compaction options
+   *
+   * @return the reference to the current options.
+   */
+  T setCompactionOptionsFIFO(
+      CompactionOptionsFIFO compactionOptionsFIFO);
+
+  /**
+   * The options for FIFO compaction style
+   *
+   * @return The FIFO compaction options
+   */
+  CompactionOptionsFIFO compactionOptionsFIFO();
+
+  /**
+   * <p>This flag specifies that the implementation should optimize the filters
+   * mainly for cases where keys are found rather than also optimize for keys
+   * missed. This would be used in cases where the application knows that
+   * there are very few misses or the performance in the case of misses is not
+   * important.</p>
+   *
+   * <p>For now, this flag allows us to not store filters for the last level i.e
+   * the largest level which contains data of the LSM store. For keys which
+   * are hits, the filters in this level are not useful because we will search
+   * for the data anyway.</p>
+   *
+   * <p><strong>NOTE</strong>: the filters in other levels are still useful
+   * even for key hit because they tell us whether to look in that level or go
+   * to the higher level.</p>
+   *
+   * <p>Default: false<p>
+   *
+   * @param optimizeFiltersForHits boolean value indicating if this flag is set.
+   * @return the reference to the current options.
+   */
+  T setOptimizeFiltersForHits(
+      boolean optimizeFiltersForHits);
+
+  /**
+   * <p>Returns the current state of the {@code optimize_filters_for_hits}
+   * setting.</p>
+   *
+   * @return boolean value indicating if the flag
+   *     {@code optimize_filters_for_hits} was set.
+   */
+  boolean optimizeFiltersForHits();
+
+  /**
+   * By default, RocksDB runs consistency checks on the LSM every time the LSM
+   * changes (Flush, Compaction, AddFile). Use this option if you need to
+   * disable them.
+   *
+   * Default: true
+   *
+   * @param forceConsistencyChecks false to disable consistency checks
+   *
+   * @return the reference to the current options.
+   */
+  T setForceConsistencyChecks(
+      boolean forceConsistencyChecks);
+
+  /**
+   * By default, RocksDB runs consistency checks on the LSM every time the LSM
+   * changes (Flush, Compaction, AddFile).
+   *
+   * @return true if consistency checks are enforced
+   */
+  boolean forceConsistencyChecks();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java
new file mode 100644
index 000000000..162d15d80
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java
@@ -0,0 +1,830 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Advanced Column Family Options which are mutable
+ *
+ * Taken from include/rocksdb/advanced_options.h
+ * and MutableCFOptions in util/cf_options.h
+ */
+public interface AdvancedMutableColumnFamilyOptionsInterface<
+    T extends AdvancedMutableColumnFamilyOptionsInterface<T>> {
+  /**
+   * The maximum number of write buffers that are built up in memory.
+   * The default is 2, so that when 1 write buffer is being flushed to
+   * storage, new writes can continue to the other write buffer.
+   * Default: 2
+   *
+   * @param maxWriteBufferNumber maximum number of write buffers.
+   * @return the instance of the current options.
+   */
+  T setMaxWriteBufferNumber(
+      int maxWriteBufferNumber);
+
+  /**
+   * Returns maximum number of write buffers.
+   *
+   * @return maximum number of write buffers.
+   * @see #setMaxWriteBufferNumber(int)
+   */
+  int maxWriteBufferNumber();
+
+  /**
+   * Number of locks used for inplace update
+   * Default: 10000, if inplace_update_support = true, else 0.
+   *
+   * @param inplaceUpdateNumLocks the number of locks used for
+   *     inplace updates.
+   * @return the reference to the current options.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *     while overflowing the underlying platform specific value.
+   */
+  T setInplaceUpdateNumLocks(
+      long inplaceUpdateNumLocks);
+
+  /**
+   * Number of locks used for inplace update
+   * Default: 10000, if inplace_update_support = true, else 0.
+   *
+   * @return the number of locks used for inplace update.
+   */
+  long inplaceUpdateNumLocks();
+
+  /**
+   * if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
+   * create prefix bloom for memtable with the size of
+   * write_buffer_size * memtable_prefix_bloom_size_ratio.
+   * If it is larger than 0.25, it is santinized to 0.25.
+   *
+   * Default: 0 (disabled)
+   *
+   * @param memtablePrefixBloomSizeRatio the ratio of memtable used by the
+   *     bloom filter, 0 means no bloom filter
+   * @return the reference to the current options.
+   */
+  T setMemtablePrefixBloomSizeRatio(
+      double memtablePrefixBloomSizeRatio);
+
+  /**
+   * if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
+   * create prefix bloom for memtable with the size of
+   * write_buffer_size * memtable_prefix_bloom_size_ratio.
+   * If it is larger than 0.25, it is santinized to 0.25.
+   *
+   * Default: 0 (disabled)
+   *
+   * @return the ratio of memtable used by the bloom filter
+   */
+  double memtablePrefixBloomSizeRatio();
+
+  /**
+   * Threshold used in the MemPurge (memtable garbage collection)
+   * feature. A value of 0.0 corresponds to no MemPurge,
+   * a value of 1.0 will trigger a MemPurge as often as possible.
+   *
+   * Default: 0.0 (disabled)
+   *
+   * @param experimentalMempurgeThreshold the threshold used by
+   *     the MemPurge decider.
+   * @return the reference to the current options.
+   */
+  T setExperimentalMempurgeThreshold(double experimentalMempurgeThreshold);
+
+  /**
+   * Threshold used in the MemPurge (memtable garbage collection)
+   * feature. A value of 0.0 corresponds to no MemPurge,
+   * a value of 1.0 will trigger a MemPurge as often as possible.
+   *
+   * Default: 0 (disabled)
+   *
+   * @return the threshold used by the MemPurge decider
+   */
+  double experimentalMempurgeThreshold();
+
+  /**
+   * Enable whole key bloom filter in memtable. Note this will only take effect
+   * if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering
+   * can potentially reduce CPU usage for point-look-ups.
+   *
+   * Default: false (disabled)
+   *
+   * @param memtableWholeKeyFiltering true if whole key bloom filter is enabled
+   *     in memtable
+   * @return the reference to the current options.
+   */
+  T setMemtableWholeKeyFiltering(boolean memtableWholeKeyFiltering);
+
+  /**
+   * Returns whether whole key bloom filter is enabled in memtable
+   *
+   * @return true if whole key bloom filter is enabled in memtable
+   */
+  boolean memtableWholeKeyFiltering();
+
+  /**
+   * Page size for huge page TLB for bloom in memtable. If &le; 0, not allocate
+   * from huge page TLB but from malloc.
+   * Need to reserve huge pages for it to be allocated. For example:
+   *     sysctl -w vm.nr_hugepages=20
+   * See linux doc Documentation/vm/hugetlbpage.txt
+   *
+   * @param memtableHugePageSize The page size of the huge
+   *     page tlb
+   * @return the reference to the current options.
+   */
+  T setMemtableHugePageSize(
+      long memtableHugePageSize);
+
+  /**
+   * Page size for huge page TLB for bloom in memtable. If &le; 0, not allocate
+   * from huge page TLB but from malloc.
+   * Need to reserve huge pages for it to be allocated. For example:
+   *     sysctl -w vm.nr_hugepages=20
+   * See linux doc Documentation/vm/hugetlbpage.txt
+   *
+   * @return The page size of the huge page tlb
+   */
+  long memtableHugePageSize();
+
+  /**
+   * The size of one block in arena memory allocation.
+   * If &le; 0, a proper value is automatically calculated (usually 1/10 of
+   * writer_buffer_size).
+   *
+   * There are two additional restriction of the specified size:
+   * (1) size should be in the range of [4096, 2 &lt;&lt; 30] and
+   * (2) be the multiple of the CPU word (which helps with the memory
+   * alignment).
+   *
+   * We'll automatically check and adjust the size number to make sure it
+   * conforms to the restrictions.
+   * Default: 0
+   *
+   * @param arenaBlockSize the size of an arena block
+   * @return the reference to the current options.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  T setArenaBlockSize(long arenaBlockSize);
+
+  /**
+   * The size of one block in arena memory allocation.
+   * If &le; 0, a proper value is automatically calculated (usually 1/10 of
+   * writer_buffer_size).
+   *
+   * There are two additional restriction of the specified size:
+   * (1) size should be in the range of [4096, 2 &lt;&lt; 30] and
+   * (2) be the multiple of the CPU word (which helps with the memory
+   * alignment).
+   *
+   * We'll automatically check and adjust the size number to make sure it
+   * conforms to the restrictions.
+   * Default: 0
+   *
+   * @return the size of an arena block
+   */
+  long arenaBlockSize();
+
+  /**
+   * Soft limit on number of level-0 files. We start slowing down writes at this
+   * point. A value &lt; 0 means that no writing slow down will be triggered by
+   * number of files in level-0.
+   *
+   * @param level0SlowdownWritesTrigger The soft limit on the number of
+   *   level-0 files
+   * @return the reference to the current options.
+   */
+  T setLevel0SlowdownWritesTrigger(
+      int level0SlowdownWritesTrigger);
+
+  /**
+   * Soft limit on number of level-0 files. We start slowing down writes at this
+   * point. A value &lt; 0 means that no writing slow down will be triggered by
+   * number of files in level-0.
+   *
+   * @return The soft limit on the number of
+   *   level-0 files
+   */
+  int level0SlowdownWritesTrigger();
+
+  /**
+   * Maximum number of level-0 files.  We stop writes at this point.
+   *
+   * @param level0StopWritesTrigger The maximum number of level-0 files
+   * @return the reference to the current options.
+   */
+  T setLevel0StopWritesTrigger(
+      int level0StopWritesTrigger);
+
+  /**
+   * Maximum number of level-0 files.  We stop writes at this point.
+   *
+   * @return The maximum number of level-0 files
+   */
+  int level0StopWritesTrigger();
+
+  /**
+   * The target file size for compaction.
+   * This targetFileSizeBase determines a level-1 file size.
+   * Target file size for level L can be calculated by
+   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1))
+   * For example, if targetFileSizeBase is 2MB and
+   * target_file_size_multiplier is 10, then each file on level-1 will
+   * be 2MB, and each file on level 2 will be 20MB,
+   * and each file on level-3 will be 200MB.
+   * by default targetFileSizeBase is 64MB.
+   *
+   * @param targetFileSizeBase the target size of a level-0 file.
+   * @return the reference to the current options.
+   *
+   * @see #setTargetFileSizeMultiplier(int)
+   */
+  T setTargetFileSizeBase(
+      long targetFileSizeBase);
+
+  /**
+   * The target file size for compaction.
+   * This targetFileSizeBase determines a level-1 file size.
+   * Target file size for level L can be calculated by
+   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1))
+   * For example, if targetFileSizeBase is 2MB and
+   * target_file_size_multiplier is 10, then each file on level-1 will
+   * be 2MB, and each file on level 2 will be 20MB,
+   * and each file on level-3 will be 200MB.
+   * by default targetFileSizeBase is 64MB.
+   *
+   * @return the target size of a level-0 file.
+   *
+   * @see #targetFileSizeMultiplier()
+   */
+  long targetFileSizeBase();
+
+  /**
+   * targetFileSizeMultiplier defines the size ratio between a
+   * level-L file and level-(L+1) file.
+   * By default target_file_size_multiplier is 1, meaning
+   * files in different levels have the same target.
+   *
+   * @param multiplier the size ratio between a level-(L+1) file
+   *     and level-L file.
+   * @return the reference to the current options.
+   */
+  T setTargetFileSizeMultiplier(
+      int multiplier);
+
+  /**
+   * targetFileSizeMultiplier defines the size ratio between a
+   * level-(L+1) file and level-L file.
+   * By default targetFileSizeMultiplier is 1, meaning
+   * files in different levels have the same target.
+   *
+   * @return the size ratio between a level-(L+1) file and level-L file.
+   */
+  int targetFileSizeMultiplier();
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @param multiplier the ratio between the total size of level-(L+1)
+   *     files and the total size of level-L files for all L.
+   * @return the reference to the current options.
+   *
+   * See {@link MutableColumnFamilyOptionsInterface#setMaxBytesForLevelBase(long)}
+   */
+  T setMaxBytesForLevelMultiplier(double multiplier);
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @return the ratio between the total size of level-(L+1) files and
+   *     the total size of level-L files for all L.
+   *
+   * See {@link MutableColumnFamilyOptionsInterface#maxBytesForLevelBase()}
+   */
+  double maxBytesForLevelMultiplier();
+
+  /**
+   * Different max-size multipliers for different levels.
+   * These are multiplied by max_bytes_for_level_multiplier to arrive
+   * at the max-size of each level.
+   *
+   * Default: 1
+   *
+   * @param maxBytesForLevelMultiplierAdditional The max-size multipliers
+   *   for each level
+   * @return the reference to the current options.
+   */
+  T setMaxBytesForLevelMultiplierAdditional(
+      int[] maxBytesForLevelMultiplierAdditional);
+
+  /**
+   * Different max-size multipliers for different levels.
+   * These are multiplied by max_bytes_for_level_multiplier to arrive
+   * at the max-size of each level.
+   *
+   * Default: 1
+   *
+   * @return The max-size multipliers for each level
+   */
+  int[] maxBytesForLevelMultiplierAdditional();
+
+  /**
+   * All writes will be slowed down to at least delayed_write_rate if estimated
+   * bytes needed to be compaction exceed this threshold.
+   *
+   * Default: 64GB
+   *
+   * @param softPendingCompactionBytesLimit The soft limit to impose on
+   *   compaction
+   * @return the reference to the current options.
+   */
+  T setSoftPendingCompactionBytesLimit(
+      long softPendingCompactionBytesLimit);
+
+  /**
+   * All writes will be slowed down to at least delayed_write_rate if estimated
+   * bytes needed to be compaction exceed this threshold.
+   *
+   * Default: 64GB
+   *
+   * @return The soft limit to impose on compaction
+   */
+  long softPendingCompactionBytesLimit();
+
+  /**
+   * All writes are stopped if estimated bytes needed to be compaction exceed
+   * this threshold.
+   *
+   * Default: 256GB
+   *
+   * @param hardPendingCompactionBytesLimit The hard limit to impose on
+   *   compaction
+   * @return the reference to the current options.
+   */
+  T setHardPendingCompactionBytesLimit(
+      long hardPendingCompactionBytesLimit);
+
+  /**
+   * All writes are stopped if estimated bytes needed to be compaction exceed
+   * this threshold.
+   *
+   * Default: 256GB
+   *
+   * @return The hard limit to impose on compaction
+   */
+  long hardPendingCompactionBytesLimit();
+
+  /**
+   * An iteration-&gt;Next() sequentially skips over keys with the same
+   * user-key unless this option is set. This number specifies the number
+   * of keys (with the same userkey) that will be sequentially
+   * skipped before a reseek is issued.
+   * Default: 8
+   *
+   * @param maxSequentialSkipInIterations the number of keys could
+   *     be skipped in a iteration.
+   * @return the reference to the current options.
+   */
+  T setMaxSequentialSkipInIterations(
+      long maxSequentialSkipInIterations);
+
+  /**
+   * An iteration-&gt;Next() sequentially skips over keys with the same
+   * user-key unless this option is set. This number specifies the number
+   * of keys (with the same userkey) that will be sequentially
+   * skipped before a reseek is issued.
+   * Default: 8
+   *
+   * @return the number of keys could be skipped in a iteration.
+   */
+  long maxSequentialSkipInIterations();
+
+  /**
+   * Maximum number of successive merge operations on a key in the memtable.
+   *
+   * When a merge operation is added to the memtable and the maximum number of
+   * successive merges is reached, the value of the key will be calculated and
+   * inserted into the memtable instead of the merge operation. This will
+   * ensure that there are never more than max_successive_merges merge
+   * operations in the memtable.
+   *
+   * Default: 0 (disabled)
+   *
+   * @param maxSuccessiveMerges the maximum number of successive merges.
+   * @return the reference to the current options.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  T setMaxSuccessiveMerges(
+      long maxSuccessiveMerges);
+
+  /**
+   * Maximum number of successive merge operations on a key in the memtable.
+   *
+   * When a merge operation is added to the memtable and the maximum number of
+   * successive merges is reached, the value of the key will be calculated and
+   * inserted into the memtable instead of the merge operation. This will
+   * ensure that there are never more than max_successive_merges merge
+   * operations in the memtable.
+   *
+   * Default: 0 (disabled)
+   *
+   * @return the maximum number of successive merges.
+   */
+  long maxSuccessiveMerges();
+
+  /**
+   * After writing every SST file, reopen it and read all the keys.
+   *
+   * Default: false
+   *
+   * @param paranoidFileChecks true to enable paranoid file checks
+   * @return the reference to the current options.
+   */
+  T setParanoidFileChecks(
+      boolean paranoidFileChecks);
+
+  /**
+   * After writing every SST file, reopen it and read all the keys.
+   *
+   * Default: false
+   *
+   * @return true if paranoid file checks are enabled
+   */
+  boolean paranoidFileChecks();
+
+  /**
+   * Measure IO stats in compactions and flushes, if true.
+   *
+   * Default: false
+   *
+   * @param reportBgIoStats true to enable reporting
+   * @return the reference to the current options.
+   */
+  T setReportBgIoStats(
+      boolean reportBgIoStats);
+
+  /**
+   * Determine whether IO stats in compactions and flushes are being measured
+   *
+   * @return true if reporting is enabled
+   */
+  boolean reportBgIoStats();
+
+  /**
+   * Non-bottom-level files older than TTL will go through the compaction
+   * process. This needs {@link MutableDBOptionsInterface#maxOpenFiles()} to be
+   * set to -1.
+   *
+   * Enabled only for level compaction for now.
+   *
+   * Default: 0 (disabled)
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param ttl the time-to-live.
+   *
+   * @return the reference to the current options.
+   */
+  T setTtl(final long ttl);
+
+  /**
+   * Get the TTL for Non-bottom-level files that will go through the compaction
+   * process.
+   *
+   * See {@link #setTtl(long)}.
+   *
+   * @return the time-to-live.
+   */
+  long ttl();
+
+  /**
+   * Files older than this value will be picked up for compaction, and
+   * re-written to the same level as they were before.
+   * One main use of the feature is to make sure a file goes through compaction
+   * filters periodically. Users can also use the feature to clear up SST
+   * files using old format.
+   *
+   * A file's age is computed by looking at file_creation_time or creation_time
+   * table properties in order, if they have valid non-zero values; if not, the
+   * age is based on the file's last modified time (given by the underlying
+   * Env).
+   *
+   * Supported in Level and FIFO compaction.
+   * In FIFO compaction, this option has the same meaning as TTL and whichever
+   * stricter will be used.
+   * Pre-req: max_open_file == -1.
+   * unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60
+   *
+   * Values:
+   * 0: Turn off Periodic compactions.
+   * UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature
+   *     as needed. For now, RocksDB will change this value to 30 days
+   *     (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction
+   *     process at least once every 30 days if not compacted sooner.
+   *     In FIFO compaction, since the option has the same meaning as ttl,
+   *     when this value is left default, and ttl is left to 0, 30 days will be
+   *     used. Otherwise, min(ttl, periodic_compaction_seconds) will be used.
+   *
+   * Default: 0xfffffffffffffffe (allow RocksDB to auto-tune)
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param periodicCompactionSeconds the periodic compaction in seconds.
+   *
+   * @return the reference to the current options.
+   */
+  T setPeriodicCompactionSeconds(final long periodicCompactionSeconds);
+
+  /**
+   * Get the periodicCompactionSeconds.
+   *
+   * See {@link #setPeriodicCompactionSeconds(long)}.
+   *
+   * @return the periodic compaction in seconds.
+   */
+  long periodicCompactionSeconds();
+
+  //
+  // BEGIN options for blobs (integrated BlobDB)
+  //
+
+  /**
+   * When set, large values (blobs) are written to separate blob files, and only
+   * pointers to them are stored in SST files. This can reduce write amplification
+   * for large-value use cases at the cost of introducing a level of indirection
+   * for reads. See also the options min_blob_size, blob_file_size,
+   * blob_compression_type, enable_blob_garbage_collection, and
+   * blob_garbage_collection_age_cutoff below.
+   *
+   * Default: false
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param enableBlobFiles true iff blob files should be enabled
+   *
+   * @return the reference to the current options.
+   */
+  T setEnableBlobFiles(final boolean enableBlobFiles);
+
+  /**
+   * When set, large values (blobs) are written to separate blob files, and only
+   * pointers to them are stored in SST files. This can reduce write amplification
+   * for large-value use cases at the cost of introducing a level of indirection
+   * for reads. See also the options min_blob_size, blob_file_size,
+   * blob_compression_type, enable_blob_garbage_collection, and
+   * blob_garbage_collection_age_cutoff below.
+   *
+   * Default: false
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @return true if blob files are enabled
+   */
+  boolean enableBlobFiles();
+
+  /**
+   * Set the size of the smallest value to be stored separately in a blob file. Values
+   * which have an uncompressed size smaller than this threshold are stored
+   * alongside the keys in SST files in the usual fashion. A value of zero for
+   * this option means that all values are stored in blob files. Note that
+   * enable_blob_files has to be set in order for this option to have any effect.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param minBlobSize the size of the smallest value to be stored separately in a blob file
+   * @return the reference to the current options.
+   */
+  T setMinBlobSize(final long minBlobSize);
+
+  /**
+   * Get the size of the smallest value to be stored separately in a blob file. Values
+   * which have an uncompressed size smaller than this threshold are stored
+   * alongside the keys in SST files in the usual fashion. A value of zero for
+   * this option means that all values are stored in blob files. Note that
+   * enable_blob_files has to be set in order for this option to have any effect.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @return the current minimum size of value which is stored separately in a blob
+   */
+  long minBlobSize();
+
+  /**
+   * Set the size limit for blob files. When writing blob files, a new file is opened
+   * once this limit is reached. Note that enable_blob_files has to be set in
+   * order for this option to have any effect.
+   *
+   * Default: 256 MB
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param blobFileSize the size limit for blob files
+   *
+   * @return the reference to the current options.
+   */
+  T setBlobFileSize(final long blobFileSize);
+
+  /**
+   * The size limit for blob files. When writing blob files, a new file is opened
+   * once this limit is reached.
+   *
+   * @return the current size limit for blob files
+   */
+  long blobFileSize();
+
+  /**
+   * Set the compression algorithm to use for large values stored in blob files. Note
+   * that enable_blob_files has to be set in order for this option to have any
+   * effect.
+   *
+   * Default: no compression
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param compressionType the compression algorithm to use.
+   *
+   * @return the reference to the current options.
+   */
+  T setBlobCompressionType(CompressionType compressionType);
+
+  /**
+   * Get the compression algorithm in use for large values stored in blob files.
+   * Note that enable_blob_files has to be set in order for this option to have any
+   * effect.
+   *
+   * @return the current compression algorithm
+   */
+  CompressionType blobCompressionType();
+
+  /**
+   * Enable/disable garbage collection of blobs. Blob GC is performed as part of
+   * compaction. Valid blobs residing in blob files older than a cutoff get
+   * relocated to new files as they are encountered during compaction, which makes
+   * it possible to clean up blob files once they contain nothing but
+   * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below.
+   *
+   * Default: false
+   *
+   * @param enableBlobGarbageCollection the new enabled/disabled state of blob garbage collection
+   *
+   * @return the reference to the current options.
+   */
+  T setEnableBlobGarbageCollection(final boolean enableBlobGarbageCollection);
+
+  /**
+   * Query whether garbage collection of blobs is enabled.Blob GC is performed as part of
+   * compaction. Valid blobs residing in blob files older than a cutoff get
+   * relocated to new files as they are encountered during compaction, which makes
+   * it possible to clean up blob files once they contain nothing but
+   * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below.
+   *
+   * Default: false
+   *
+   * @return true if blob garbage collection is currently enabled.
+   */
+  boolean enableBlobGarbageCollection();
+
+  /**
+   * Set cutoff in terms of blob file age for garbage collection. Blobs in the
+   * oldest N blob files will be relocated when encountered during compaction,
+   * where N = garbage_collection_cutoff * number_of_blob_files. Note that
+   * enable_blob_garbage_collection has to be set in order for this option to have
+   * any effect.
+   *
+   * Default: 0.25
+   *
+   * @param blobGarbageCollectionAgeCutoff the new age cutoff
+   *
+   * @return the reference to the current options.
+   */
+  T setBlobGarbageCollectionAgeCutoff(double blobGarbageCollectionAgeCutoff);
+  /**
+   * Get cutoff in terms of blob file age for garbage collection. Blobs in the
+   * oldest N blob files will be relocated when encountered during compaction,
+   * where N = garbage_collection_cutoff * number_of_blob_files. Note that
+   * enable_blob_garbage_collection has to be set in order for this option to have
+   * any effect.
+   *
+   * Default: 0.25
+   *
+   * @return the current age cutoff for garbage collection
+   */
+  double blobGarbageCollectionAgeCutoff();
+
+  /**
+   *  If the ratio of garbage in the oldest blob files exceeds this threshold,
+   *  targeted compactions are scheduled in order to force garbage collecting
+   *  the blob files in question, assuming they are all eligible based on the
+   *  value of {@link #blobGarbageCollectionAgeCutoff} above. This option is
+   *  currently only supported with leveled compactions.
+   *
+   *  Note that {@link #enableBlobGarbageCollection} has to be set in order for this
+   *  option to have any effect.
+   *
+   *  Default: 1.0
+   *
+   * Dynamically changeable through the SetOptions() API
+   *
+   * @param blobGarbageCollectionForceThreshold new value for the threshold
+   * @return the reference to the current options
+   */
+  T setBlobGarbageCollectionForceThreshold(double blobGarbageCollectionForceThreshold);
+
+  /**
+   * Get the current value for the {@link #blobGarbageCollectionForceThreshold}
+   * @return the current threshold at which garbage collection of blobs is forced
+   */
+  double blobGarbageCollectionForceThreshold();
+
+  /**
+   * Set compaction readahead for blob files.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param blobCompactionReadaheadSize the compaction readahead for blob files
+   *
+   * @return the reference to the current options.
+   */
+  T setBlobCompactionReadaheadSize(final long blobCompactionReadaheadSize);
+
+  /**
+   * Get compaction readahead for blob files.
+   *
+   * @return the current compaction readahead for blob files
+   */
+  long blobCompactionReadaheadSize();
+
+  /**
+   * Set a certain LSM tree level to enable blob files.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param blobFileStartingLevel the starting level to enable blob files
+   *
+   * @return the reference to the current options.
+   */
+  T setBlobFileStartingLevel(final int blobFileStartingLevel);
+
+  /**
+   * Get the starting LSM tree level to enable blob files.
+   *
+   * Default: 0
+   *
+   * @return the current LSM tree level to enable blob files.
+   */
+  int blobFileStartingLevel();
+
+  /**
+   * Set a certain prepopulate blob cache option.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param prepopulateBlobCache the prepopulate blob cache option
+   *
+   * @return the reference to the current options.
+   */
+  T setPrepopulateBlobCache(final PrepopulateBlobCache prepopulateBlobCache);
+
+  /**
+   * Get the prepopulate blob cache option.
+   *
+   * Default: 0
+   *
+   * @return the current prepopulate blob cache option.
+   */
+  PrepopulateBlobCache prepopulateBlobCache();
+
+  //
+  // END options for blobs (integrated BlobDB)
+  //
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java b/src/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java
new file mode 100644
index 000000000..eec593d35
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java
@@ -0,0 +1,46 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public enum BackgroundErrorReason {
+  FLUSH((byte) 0x0),
+  COMPACTION((byte) 0x1),
+  WRITE_CALLBACK((byte) 0x2),
+  MEMTABLE((byte) 0x3);
+
+  private final byte value;
+
+  BackgroundErrorReason(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation.
+   *
+   * @return the internal representation
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the BackgroundErrorReason from the internal representation value.
+   *
+   * @return the background error reason.
+   *
+   * @throws IllegalArgumentException if the value is unknown.
+   */
+  static BackgroundErrorReason fromValue(final byte value) {
+    for (final BackgroundErrorReason backgroundErrorReason : BackgroundErrorReason.values()) {
+      if (backgroundErrorReason.value == value) {
+        return backgroundErrorReason;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for BackgroundErrorReason: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java b/src/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java
new file mode 100644
index 000000000..515824a91
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java
@@ -0,0 +1,259 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * BackupEngine allows you to backup
+ * and restore the database
+ *
+ * Be aware, that `new BackupEngine` takes time proportional to the amount
+ * of backups. So if you have a slow filesystem to backup
+ * and you have a lot of backups then restoring can take some time.
+ * That's why we recommend to limit the number of backups.
+ * Also we recommend to keep BackupEngine alive and not to recreate it every
+ * time you need to do a backup.
+ */
+public class BackupEngine extends RocksObject implements AutoCloseable {
+
+  protected BackupEngine(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Opens a new Backup Engine
+   *
+   * @param env The environment that the backup engine should operate within
+   * @param options Any options for the backup engine
+   *
+   * @return A new BackupEngine instance
+   * @throws RocksDBException thrown if the backup engine could not be opened
+   */
+  public static BackupEngine open(final Env env, final BackupEngineOptions options)
+      throws RocksDBException {
+    return new BackupEngine(open(env.nativeHandle_, options.nativeHandle_));
+  }
+
+  /**
+   * Captures the state of the database in the latest backup
+   *
+   * Just a convenience for {@link #createNewBackup(RocksDB, boolean)} with
+   * the flushBeforeBackup parameter set to false
+   *
+   * @param db The database to backup
+   *
+   * Note - This method is not thread safe
+   *
+   * @throws RocksDBException thrown if a new backup could not be created
+   */
+  public void createNewBackup(final RocksDB db) throws RocksDBException {
+    createNewBackup(db, false);
+  }
+
+  /**
+   * Captures the state of the database in the latest backup
+   *
+   * @param db The database to backup
+   * @param flushBeforeBackup When true, the Backup Engine will first issue a
+   *                          memtable flush and only then copy the DB files to
+   *                          the backup directory. Doing so will prevent log
+   *                          files from being copied to the backup directory
+   *                          (since flush will delete them).
+   *                          When false, the Backup Engine will not issue a
+   *                          flush before starting the backup. In that case,
+   *                          the backup will also include log files
+   *                          corresponding to live memtables. If writes have
+   *                          been performed with the write ahead log disabled,
+   *                          set flushBeforeBackup to true to prevent those
+   *                          writes from being lost. Otherwise, the backup will
+   *                          always be consistent with the current state of the
+   *                          database regardless of the flushBeforeBackup
+   *                          parameter.
+   *
+   * Note - This method is not thread safe
+   *
+   * @throws RocksDBException thrown if a new backup could not be created
+   */
+  public void createNewBackup(
+      final RocksDB db, final boolean flushBeforeBackup)
+      throws RocksDBException {
+    assert (isOwningHandle());
+    createNewBackup(nativeHandle_, db.nativeHandle_, flushBeforeBackup);
+  }
+
+  /**
+   * Captures the state of the database in the latest backup along with
+   * application specific metadata.
+   *
+   * @param db The database to backup
+   * @param metadata Application metadata
+   * @param flushBeforeBackup When true, the Backup Engine will first issue a
+   *                          memtable flush and only then copy the DB files to
+   *                          the backup directory. Doing so will prevent log
+   *                          files from being copied to the backup directory
+   *                          (since flush will delete them).
+   *                          When false, the Backup Engine will not issue a
+   *                          flush before starting the backup. In that case,
+   *                          the backup will also include log files
+   *                          corresponding to live memtables. If writes have
+   *                          been performed with the write ahead log disabled,
+   *                          set flushBeforeBackup to true to prevent those
+   *                          writes from being lost. Otherwise, the backup will
+   *                          always be consistent with the current state of the
+   *                          database regardless of the flushBeforeBackup
+   *                          parameter.
+   *
+   * Note - This method is not thread safe
+   *
+   * @throws RocksDBException thrown if a new backup could not be created
+   */
+  public void createNewBackupWithMetadata(final RocksDB db, final String metadata,
+      final boolean flushBeforeBackup) throws RocksDBException {
+    assert (isOwningHandle());
+    createNewBackupWithMetadata(nativeHandle_, db.nativeHandle_, metadata, flushBeforeBackup);
+  }
+
+  /**
+   * Gets information about the available
+   * backups
+   *
+   * @return A list of information about each available backup
+   */
+  public List<BackupInfo> getBackupInfo() {
+    assert (isOwningHandle());
+    return getBackupInfo(nativeHandle_);
+  }
+
+  /**
+   * <p>Returns a list of corrupted backup ids. If there
+   * is no corrupted backup the method will return an
+   * empty list.</p>
+   *
+   * @return array of backup ids as int ids.
+   */
+  public int[] getCorruptedBackups() {
+    assert(isOwningHandle());
+    return getCorruptedBackups(nativeHandle_);
+  }
+
+  /**
+   * <p>Will delete all the files we don't need anymore. It will
+   * do the full scan of the files/ directory and delete all the
+   * files that are not referenced.</p>
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void garbageCollect() throws RocksDBException {
+    assert(isOwningHandle());
+    garbageCollect(nativeHandle_);
+  }
+
+  /**
+   * Deletes old backups, keeping just the latest numBackupsToKeep
+   *
+   * @param numBackupsToKeep The latest n backups to keep
+   *
+   * @throws RocksDBException thrown if the old backups could not be deleted
+   */
+  public void purgeOldBackups(
+      final int numBackupsToKeep) throws RocksDBException {
+    assert (isOwningHandle());
+    purgeOldBackups(nativeHandle_, numBackupsToKeep);
+  }
+
+  /**
+   * Deletes a backup
+   *
+   * @param backupId The id of the backup to delete
+   *
+   * @throws RocksDBException thrown if the backup could not be deleted
+   */
+  public void deleteBackup(final int backupId) throws RocksDBException {
+    assert (isOwningHandle());
+    deleteBackup(nativeHandle_, backupId);
+  }
+
+  /**
+   * Restore the database from a backup
+   *
+   * IMPORTANT: if options.share_table_files == true and you restore the DB
+   * from some backup that is not the latest, and you start creating new
+   * backups from the new DB, they will probably fail!
+   *
+   * Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
+   * If you add new data to the DB and try creating a new backup now, the
+   * database will diverge from backups 4 and 5 and the new backup will fail.
+   * If you want to create new backup, you will first have to delete backups 4
+   * and 5.
+   *
+   * @param backupId The id of the backup to restore
+   * @param dbDir The directory to restore the backup to, i.e. where your
+   *              database is
+   * @param walDir The location of the log files for your database,
+   *               often the same as dbDir
+   * @param restoreOptions Options for controlling the restore
+   *
+   * @throws RocksDBException thrown if the database could not be restored
+   */
+  public void restoreDbFromBackup(
+      final int backupId, final String dbDir, final String walDir,
+      final RestoreOptions restoreOptions) throws RocksDBException {
+    assert (isOwningHandle());
+    restoreDbFromBackup(nativeHandle_, backupId, dbDir, walDir,
+        restoreOptions.nativeHandle_);
+  }
+
+  /**
+   * Restore the database from the latest backup
+   *
+   * @param dbDir The directory to restore the backup to, i.e. where your
+   *              database is
+   * @param walDir The location of the log files for your database, often the
+   *               same as dbDir
+   * @param restoreOptions Options for controlling the restore
+   *
+   * @throws RocksDBException thrown if the database could not be restored
+   */
+  public void restoreDbFromLatestBackup(
+      final String dbDir, final String walDir,
+      final RestoreOptions restoreOptions) throws RocksDBException {
+    assert (isOwningHandle());
+    restoreDbFromLatestBackup(nativeHandle_, dbDir, walDir,
+        restoreOptions.nativeHandle_);
+  }
+
+  private native static long open(final long env, final long backupEngineOptions)
+      throws RocksDBException;
+
+  private native void createNewBackup(final long handle, final long dbHandle,
+      final boolean flushBeforeBackup) throws RocksDBException;
+
+  private native void createNewBackupWithMetadata(final long handle, final long dbHandle,
+      final String metadata, final boolean flushBeforeBackup) throws RocksDBException;
+
+  private native List<BackupInfo> getBackupInfo(final long handle);
+
+  private native int[] getCorruptedBackups(final long handle);
+
+  private native void garbageCollect(final long handle) throws RocksDBException;
+
+  private native void purgeOldBackups(final long handle,
+      final int numBackupsToKeep) throws RocksDBException;
+
+  private native void deleteBackup(final long handle, final int backupId)
+      throws RocksDBException;
+
+  private native void restoreDbFromBackup(final long handle, final int backupId,
+      final String dbDir, final String walDir, final long restoreOptionsHandle)
+      throws RocksDBException;
+
+  private native void restoreDbFromLatestBackup(final long handle,
+      final String dbDir, final String walDir, final long restoreOptionsHandle)
+      throws RocksDBException;
+
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BackupEngineOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/BackupEngineOptions.java
new file mode 100644
index 000000000..6e2dacc02
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BackupEngineOptions.java
@@ -0,0 +1,458 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.io.File;
+
+/**
+ * <p>BackupEngineOptions controls the behavior of a
+ * {@link org.rocksdb.BackupEngine}.
+ * </p>
+ * <p>Note that dispose() must be called before an Options instance
+ * become out-of-scope to release the allocated memory in c++.</p>
+ *
+ * @see org.rocksdb.BackupEngine
+ */
+public class BackupEngineOptions extends RocksObject {
+  private Env backupEnv = null;
+  private Logger infoLog = null;
+  private RateLimiter backupRateLimiter = null;
+  private RateLimiter restoreRateLimiter = null;
+
+  /**
+   * <p>BackupEngineOptions constructor.</p>
+   *
+   * @param path Where to keep the backup files. Has to be different than db
+   *   name. Best to set this to {@code db name_ + "/backups"}
+   * @throws java.lang.IllegalArgumentException if illegal path is used.
+   */
+  public BackupEngineOptions(final String path) {
+    super(newBackupEngineOptions(ensureWritableFile(path)));
+  }
+
+  private static String ensureWritableFile(final String path) {
+    final File backupPath = path == null ? null : new File(path);
+    if (backupPath == null || !backupPath.isDirectory() ||
+        !backupPath.canWrite()) {
+      throw new IllegalArgumentException("Illegal path provided.");
+    } else {
+      return path;
+    }
+  }
+
+  /**
+   * <p>Returns the path to the BackupEngine directory.</p>
+   *
+   * @return the path to the BackupEngine directory.
+   */
+  public String backupDir() {
+    assert(isOwningHandle());
+    return backupDir(nativeHandle_);
+  }
+
+  /**
+   * Backup Env object. It will be used for backup file I/O. If it's
+   * null, backups will be written out using DBs Env. Otherwise
+   * backup's I/O will be performed using this object.
+   *
+   * Default: null
+   *
+   * @param env The environment to use
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setBackupEnv(final Env env) {
+    assert(isOwningHandle());
+    setBackupEnv(nativeHandle_, env.nativeHandle_);
+    this.backupEnv = env;
+    return this;
+  }
+
+  /**
+   * Backup Env object. It will be used for backup file I/O. If it's
+   * null, backups will be written out using DBs Env. Otherwise
+   * backup's I/O will be performed using this object.
+   *
+   * Default: null
+   *
+   * @return The environment in use
+   */
+  public Env backupEnv() {
+    return this.backupEnv;
+  }
+
+  /**
+   * <p>Share table files between backups.</p>
+   *
+   * @param shareTableFiles If {@code share_table_files == true}, backup will
+   *   assume that table files with same name have the same contents. This
+   *   enables incremental backups and avoids unnecessary data copies. If
+   *   {@code share_table_files == false}, each backup will be on its own and
+   *   will not share any data with other backups.
+   *
+   * <p>Default: true</p>
+   *
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setShareTableFiles(final boolean shareTableFiles) {
+    assert(isOwningHandle());
+    setShareTableFiles(nativeHandle_, shareTableFiles);
+    return this;
+  }
+
+  /**
+   * <p>Share table files between backups.</p>
+   *
+   * @return boolean value indicating if SST files will be shared between
+   *     backups.
+   */
+  public boolean shareTableFiles() {
+    assert(isOwningHandle());
+    return shareTableFiles(nativeHandle_);
+  }
+
+  /**
+   * Set the logger to use for Backup info and error messages
+   *
+   * @param logger The logger to use for the backup
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setInfoLog(final Logger logger) {
+    assert(isOwningHandle());
+    setInfoLog(nativeHandle_, logger.nativeHandle_);
+    this.infoLog = logger;
+    return this;
+  }
+
+  /**
+   * Set the logger to use for Backup info and error messages
+   *
+   * Default: null
+   *
+   * @return The logger in use for the backup
+   */
+  public Logger infoLog() {
+    return this.infoLog;
+  }
+
+  /**
+   * <p>Set synchronous backups.</p>
+   *
+   * @param sync If {@code sync == true}, we can guarantee you'll get consistent
+   *   backup even on a machine crash/reboot. Backup process is slower with sync
+   *   enabled. If {@code sync == false}, we don't guarantee anything on machine
+   *   reboot. However, chances are some of the backups are consistent.
+   *
+   * <p>Default: true</p>
+   *
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setSync(final boolean sync) {
+    assert(isOwningHandle());
+    setSync(nativeHandle_, sync);
+    return this;
+  }
+
+  /**
+   * <p>Are synchronous backups activated.</p>
+   *
+   * @return boolean value if synchronous backups are configured.
+   */
+  public boolean sync() {
+    assert(isOwningHandle());
+    return sync(nativeHandle_);
+  }
+
+  /**
+   * <p>Set if old data will be destroyed.</p>
+   *
+   * @param destroyOldData If true, it will delete whatever backups there are
+   *   already.
+   *
+   * <p>Default: false</p>
+   *
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setDestroyOldData(final boolean destroyOldData) {
+    assert(isOwningHandle());
+    setDestroyOldData(nativeHandle_, destroyOldData);
+    return this;
+  }
+
+  /**
+   * <p>Returns if old data will be destroyed will performing new backups.</p>
+   *
+   * @return boolean value indicating if old data will be destroyed.
+   */
+  public boolean destroyOldData() {
+    assert(isOwningHandle());
+    return destroyOldData(nativeHandle_);
+  }
+
+  /**
+   * <p>Set if log files shall be persisted.</p>
+   *
+   * @param backupLogFiles If false, we won't backup log files. This option can
+   *   be useful for backing up in-memory databases where log file are
+   *   persisted, but table files are in memory.
+   *
+   * <p>Default: true</p>
+   *
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setBackupLogFiles(final boolean backupLogFiles) {
+    assert(isOwningHandle());
+    setBackupLogFiles(nativeHandle_, backupLogFiles);
+    return this;
+  }
+
+  /**
+   * <p>Return information if log files shall be persisted.</p>
+   *
+   * @return boolean value indicating if log files will be persisted.
+   */
+  public boolean backupLogFiles() {
+    assert(isOwningHandle());
+    return backupLogFiles(nativeHandle_);
+  }
+
+  /**
+   * <p>Set backup rate limit.</p>
+   *
+   * @param backupRateLimit Max bytes that can be transferred in a second during
+   *   backup. If 0 or negative, then go as fast as you can.
+   *
+   * <p>Default: 0</p>
+   *
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setBackupRateLimit(long backupRateLimit) {
+    assert(isOwningHandle());
+    backupRateLimit = (backupRateLimit <= 0) ? 0 : backupRateLimit;
+    setBackupRateLimit(nativeHandle_, backupRateLimit);
+    return this;
+  }
+
+  /**
+   * <p>Return backup rate limit which described the max bytes that can be
+   * transferred in a second during backup.</p>
+   *
+   * @return numerical value describing the backup transfer limit in bytes per
+   *   second.
+   */
+  public long backupRateLimit() {
+    assert(isOwningHandle());
+    return backupRateLimit(nativeHandle_);
+  }
+
+  /**
+   * Backup rate limiter. Used to control transfer speed for backup. If this is
+   * not null, {@link #backupRateLimit()} is ignored.
+   *
+   * Default: null
+   *
+   * @param backupRateLimiter The rate limiter to use for the backup
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setBackupRateLimiter(final RateLimiter backupRateLimiter) {
+    assert(isOwningHandle());
+    setBackupRateLimiter(nativeHandle_, backupRateLimiter.nativeHandle_);
+    this.backupRateLimiter = backupRateLimiter;
+    return this;
+  }
+
+  /**
+   * Backup rate limiter. Used to control transfer speed for backup. If this is
+   * not null, {@link #backupRateLimit()} is ignored.
+   *
+   * Default: null
+   *
+   * @return The rate limiter in use for the backup
+   */
+  public RateLimiter backupRateLimiter() {
+    assert(isOwningHandle());
+    return this.backupRateLimiter;
+  }
+
+  /**
+   * <p>Set restore rate limit.</p>
+   *
+   * @param restoreRateLimit Max bytes that can be transferred in a second
+   *   during restore. If 0 or negative, then go as fast as you can.
+   *
+   * <p>Default: 0</p>
+   *
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setRestoreRateLimit(long restoreRateLimit) {
+    assert(isOwningHandle());
+    restoreRateLimit = (restoreRateLimit <= 0) ? 0 : restoreRateLimit;
+    setRestoreRateLimit(nativeHandle_, restoreRateLimit);
+    return this;
+  }
+
+  /**
+   * <p>Return restore rate limit which described the max bytes that can be
+   * transferred in a second during restore.</p>
+   *
+   * @return numerical value describing the restore transfer limit in bytes per
+   *   second.
+   */
+  public long restoreRateLimit() {
+    assert(isOwningHandle());
+    return restoreRateLimit(nativeHandle_);
+  }
+
+  /**
+   * Restore rate limiter. Used to control transfer speed during restore. If
+   * this is not null, {@link #restoreRateLimit()} is ignored.
+   *
+   * Default: null
+   *
+   * @param restoreRateLimiter The rate limiter to use during restore
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setRestoreRateLimiter(final RateLimiter restoreRateLimiter) {
+    assert(isOwningHandle());
+    setRestoreRateLimiter(nativeHandle_, restoreRateLimiter.nativeHandle_);
+    this.restoreRateLimiter = restoreRateLimiter;
+    return this;
+  }
+
+  /**
+   * Restore rate limiter. Used to control transfer speed during restore. If
+   * this is not null, {@link #restoreRateLimit()} is ignored.
+   *
+   * Default: null
+   *
+   * @return The rate limiter in use during restore
+   */
+  public RateLimiter restoreRateLimiter() {
+    assert(isOwningHandle());
+    return this.restoreRateLimiter;
+  }
+
+  /**
+   * <p>Only used if share_table_files is set to true. If true, will consider
+   * that backups can come from different databases, hence a sst is not uniquely
+   * identified by its name, but by the triple (file name, crc32, file length)
+   * </p>
+   *
+   * @param shareFilesWithChecksum boolean value indicating if SST files are
+   *   stored using the triple (file name, crc32, file length) and not its name.
+   *
+   * <p>Note: this is an experimental option, and you'll need to set it manually
+   * turn it on only if you know what you're doing*</p>
+   *
+   * <p>Default: false</p>
+   *
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setShareFilesWithChecksum(final boolean shareFilesWithChecksum) {
+    assert(isOwningHandle());
+    setShareFilesWithChecksum(nativeHandle_, shareFilesWithChecksum);
+    return this;
+  }
+
+  /**
+   * <p>Return of share files with checksum is active.</p>
+   *
+   * @return boolean value indicating if share files with checksum
+   *     is active.
+   */
+  public boolean shareFilesWithChecksum() {
+    assert(isOwningHandle());
+    return shareFilesWithChecksum(nativeHandle_);
+  }
+
+  /**
+   * Up to this many background threads will copy files for
+   * {@link BackupEngine#createNewBackup(RocksDB, boolean)} and
+   * {@link BackupEngine#restoreDbFromBackup(int, String, String, RestoreOptions)}
+   *
+   * Default: 1
+   *
+   * @param maxBackgroundOperations The maximum number of background threads
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setMaxBackgroundOperations(final int maxBackgroundOperations) {
+    assert(isOwningHandle());
+    setMaxBackgroundOperations(nativeHandle_, maxBackgroundOperations);
+    return this;
+  }
+
+  /**
+   * Up to this many background threads will copy files for
+   * {@link BackupEngine#createNewBackup(RocksDB, boolean)} and
+   * {@link BackupEngine#restoreDbFromBackup(int, String, String, RestoreOptions)}
+   *
+   * Default: 1
+   *
+   * @return The maximum number of background threads
+   */
+  public int maxBackgroundOperations() {
+    assert(isOwningHandle());
+    return maxBackgroundOperations(nativeHandle_);
+  }
+
+  /**
+   * During backup user can get callback every time next
+   * {@link #callbackTriggerIntervalSize()} bytes being copied.
+   *
+   * Default: 4194304
+   *
+   * @param callbackTriggerIntervalSize The interval size for the
+   *     callback trigger
+   * @return instance of current BackupEngineOptions.
+   */
+  public BackupEngineOptions setCallbackTriggerIntervalSize(
+      final long callbackTriggerIntervalSize) {
+    assert(isOwningHandle());
+    setCallbackTriggerIntervalSize(nativeHandle_, callbackTriggerIntervalSize);
+    return this;
+  }
+
+  /**
+   * During backup user can get callback every time next
+   * {@link #callbackTriggerIntervalSize()} bytes being copied.
+   *
+   * Default: 4194304
+   *
+   * @return The interval size for the callback trigger
+   */
+  public long callbackTriggerIntervalSize() {
+    assert(isOwningHandle());
+    return callbackTriggerIntervalSize(nativeHandle_);
+  }
+
+  private native static long newBackupEngineOptions(final String path);
+  private native String backupDir(long handle);
+  private native void setBackupEnv(final long handle, final long envHandle);
+  private native void setShareTableFiles(long handle, boolean flag);
+  private native boolean shareTableFiles(long handle);
+  private native void setInfoLog(final long handle, final long infoLogHandle);
+  private native void setSync(long handle, boolean flag);
+  private native boolean sync(long handle);
+  private native void setDestroyOldData(long handle, boolean flag);
+  private native boolean destroyOldData(long handle);
+  private native void setBackupLogFiles(long handle, boolean flag);
+  private native boolean backupLogFiles(long handle);
+  private native void setBackupRateLimit(long handle, long rateLimit);
+  private native long backupRateLimit(long handle);
+  private native void setBackupRateLimiter(long handle, long rateLimiterHandle);
+  private native void setRestoreRateLimit(long handle, long rateLimit);
+  private native long restoreRateLimit(long handle);
+  private native void setRestoreRateLimiter(final long handle,
+      final long rateLimiterHandle);
+  private native void setShareFilesWithChecksum(long handle, boolean flag);
+  private native boolean shareFilesWithChecksum(long handle);
+  private native void setMaxBackgroundOperations(final long handle,
+      final int maxBackgroundOperations);
+  private native int maxBackgroundOperations(final long handle);
+  private native void setCallbackTriggerIntervalSize(final long handle,
+      long callbackTriggerIntervalSize);
+  private native long callbackTriggerIntervalSize(final long handle);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java
new file mode 100644
index 000000000..9244e4eb1
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java
@@ -0,0 +1,76 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * Instances of this class describe a Backup made by
+ * {@link org.rocksdb.BackupEngine}.
+ */
+public class BackupInfo {
+
+  /**
+   * Package private constructor used to create instances
+   * of BackupInfo by {@link org.rocksdb.BackupEngine}
+   *
+   * @param backupId id of backup
+   * @param timestamp timestamp of backup
+   * @param size size of backup
+   * @param numberFiles number of files related to this backup.
+   */
+  BackupInfo(final int backupId, final long timestamp, final long size, final int numberFiles,
+      final String app_metadata) {
+    backupId_ = backupId;
+    timestamp_ = timestamp;
+    size_ = size;
+    numberFiles_ = numberFiles;
+    app_metadata_ = app_metadata;
+  }
+
+  /**
+   *
+   * @return the backup id.
+   */
+  public int backupId() {
+    return backupId_;
+  }
+
+  /**
+   *
+   * @return the timestamp of the backup.
+   */
+  public long timestamp() {
+    return timestamp_;
+  }
+
+  /**
+   *
+   * @return the size of the backup
+   */
+  public long size() {
+    return size_;
+  }
+
+  /**
+   *
+   * @return the number of files of this backup.
+   */
+  public int numberFiles() {
+    return numberFiles_;
+  }
+
+  /**
+   *
+   * @return the associated application metadata, or null
+   */
+  public String appMetadata() {
+    return app_metadata_;
+  }
+
+  private int backupId_;
+  private long timestamp_;
+  private long size_;
+  private int numberFiles_;
+  private String app_metadata_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
new file mode 100644
index 000000000..0404fc620
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
@@ -0,0 +1,1055 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * The config for plain table sst format.
+ *
+ * BlockBasedTable is a RocksDB's default SST file format.
+ */
+//TODO(AR) should be renamed BlockBasedTableOptions
+public class BlockBasedTableConfig extends TableFormatConfig {
+
+  public BlockBasedTableConfig() {
+    //TODO(AR) flushBlockPolicyFactory
+    cacheIndexAndFilterBlocks = false;
+    cacheIndexAndFilterBlocksWithHighPriority = true;
+    pinL0FilterAndIndexBlocksInCache = false;
+    pinTopLevelIndexAndFilter = true;
+    indexType = IndexType.kBinarySearch;
+    dataBlockIndexType = DataBlockIndexType.kDataBlockBinarySearch;
+    dataBlockHashTableUtilRatio = 0.75;
+    checksumType = ChecksumType.kCRC32c;
+    noBlockCache = false;
+    blockCache = null;
+    persistentCache = null;
+    blockCacheCompressed = null;
+    blockSize = 4 * 1024;
+    blockSizeDeviation = 10;
+    blockRestartInterval = 16;
+    indexBlockRestartInterval = 1;
+    metadataBlockSize = 4096;
+    partitionFilters = false;
+    optimizeFiltersForMemory = false;
+    useDeltaEncoding = true;
+    filterPolicy = null;
+    wholeKeyFiltering = true;
+    verifyCompression = false;
+    readAmpBytesPerBit = 0;
+    formatVersion = 5;
+    enableIndexCompression = true;
+    blockAlign = false;
+    indexShortening = IndexShorteningMode.kShortenSeparators;
+
+    // NOTE: ONLY used if blockCache == null
+    blockCacheSize = 8 * 1024 * 1024;
+    blockCacheNumShardBits = 0;
+
+    // NOTE: ONLY used if blockCacheCompressed == null
+    blockCacheCompressedSize = 0;
+    blockCacheCompressedNumShardBits = 0;
+  }
+
+  /**
+   * Indicating if we'd put index/filter blocks to the block cache.
+   * If not specified, each "table reader" object will pre-load index/filter
+   * block during table initialization.
+   *
+   * @return if index and filter blocks should be put in block cache.
+   */
+  public boolean cacheIndexAndFilterBlocks() {
+    return cacheIndexAndFilterBlocks;
+  }
+
+  /**
+   * Indicating if we'd put index/filter blocks to the block cache.
+   * If not specified, each "table reader" object will pre-load index/filter
+   * block during table initialization.
+   *
+   * @param cacheIndexAndFilterBlocks and filter blocks should be put in block cache.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setCacheIndexAndFilterBlocks(
+      final boolean cacheIndexAndFilterBlocks) {
+    this.cacheIndexAndFilterBlocks = cacheIndexAndFilterBlocks;
+    return this;
+  }
+
+  /**
+   * Indicates if index and filter blocks will be treated as high-priority in the block cache.
+   * See note below about applicability. If not specified, defaults to true.
+   *
+   * @return if index and filter blocks will be treated as high-priority.
+   */
+  public boolean cacheIndexAndFilterBlocksWithHighPriority() {
+    return cacheIndexAndFilterBlocksWithHighPriority;
+  }
+
+  /**
+   * If true, cache index and filter blocks with high priority. If set to true,
+   * depending on implementation of block cache, index and filter blocks may be
+   * less likely to be evicted than data blocks.
+   *
+   * @param cacheIndexAndFilterBlocksWithHighPriority if index and filter blocks
+   *            will be treated as high-priority.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setCacheIndexAndFilterBlocksWithHighPriority(
+      final boolean cacheIndexAndFilterBlocksWithHighPriority) {
+    this.cacheIndexAndFilterBlocksWithHighPriority = cacheIndexAndFilterBlocksWithHighPriority;
+    return this;
+  }
+
+  /**
+   * Indicating if we'd like to pin L0 index/filter blocks to the block cache.
+   If not specified, defaults to false.
+   *
+   * @return if L0 index and filter blocks should be pinned to the block cache.
+   */
+  public boolean pinL0FilterAndIndexBlocksInCache() {
+    return pinL0FilterAndIndexBlocksInCache;
+  }
+
+  /**
+   * Indicating if we'd like to pin L0 index/filter blocks to the block cache.
+   If not specified, defaults to false.
+   *
+   * @param pinL0FilterAndIndexBlocksInCache pin blocks in block cache
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setPinL0FilterAndIndexBlocksInCache(
+      final boolean pinL0FilterAndIndexBlocksInCache) {
+    this.pinL0FilterAndIndexBlocksInCache = pinL0FilterAndIndexBlocksInCache;
+    return this;
+  }
+
+  /**
+   * Indicates if top-level index and filter blocks should be pinned.
+   *
+   * @return if top-level index and filter blocks should be pinned.
+   */
+  public boolean pinTopLevelIndexAndFilter() {
+    return pinTopLevelIndexAndFilter;
+  }
+
+  /**
+   * If cacheIndexAndFilterBlocks is true and the below is true, then
+   * the top-level index of partitioned filter and index blocks are stored in
+   * the cache, but a reference is held in the "table reader" object so the
+   * blocks are pinned and only evicted from cache when the table reader is
+   * freed. This is not limited to l0 in LSM tree.
+   *
+   * @param pinTopLevelIndexAndFilter if top-level index and filter blocks should be pinned.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setPinTopLevelIndexAndFilter(final boolean pinTopLevelIndexAndFilter) {
+    this.pinTopLevelIndexAndFilter = pinTopLevelIndexAndFilter;
+    return this;
+  }
+
+  /**
+   * Get the index type.
+   *
+   * @return the currently set index type
+   */
+  public IndexType indexType() {
+    return indexType;
+  }
+
+  /**
+   * Sets the index type to used with this table.
+   *
+   * @param indexType {@link org.rocksdb.IndexType} value
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setIndexType(
+      final IndexType indexType) {
+    this.indexType = indexType;
+    return this;
+  }
+
+  /**
+   * Get the data block index type.
+   *
+   * @return the currently set data block index type
+   */
+  public DataBlockIndexType dataBlockIndexType() {
+    return dataBlockIndexType;
+  }
+
+  /**
+   * Sets the data block index type to used with this table.
+   *
+   * @param dataBlockIndexType {@link org.rocksdb.DataBlockIndexType} value
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setDataBlockIndexType(
+      final DataBlockIndexType dataBlockIndexType) {
+    this.dataBlockIndexType = dataBlockIndexType;
+    return this;
+  }
+
+  /**
+   * Get the #entries/#buckets. It is valid only when {@link #dataBlockIndexType()} is
+   * {@link DataBlockIndexType#kDataBlockBinaryAndHash}.
+   *
+   * @return the #entries/#buckets.
+   */
+  public double dataBlockHashTableUtilRatio() {
+    return dataBlockHashTableUtilRatio;
+  }
+
+  /**
+   * Set the #entries/#buckets. It is valid only when {@link #dataBlockIndexType()} is
+   * {@link DataBlockIndexType#kDataBlockBinaryAndHash}.
+   *
+   * @param dataBlockHashTableUtilRatio #entries/#buckets
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setDataBlockHashTableUtilRatio(
+      final double dataBlockHashTableUtilRatio) {
+    this.dataBlockHashTableUtilRatio = dataBlockHashTableUtilRatio;
+    return this;
+  }
+
+  /**
+   * Get the checksum type to be used with this table.
+   *
+   * @return the currently set checksum type
+   */
+  public ChecksumType checksumType() {
+    return checksumType;
+  }
+
+  /**
+   * Sets
+   *
+   * @param checksumType {@link org.rocksdb.ChecksumType} value.
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setChecksumType(
+      final ChecksumType checksumType) {
+    this.checksumType = checksumType;
+    return this;
+  }
+
+  /**
+   * Determine if the block cache is disabled.
+   *
+   * @return if block cache is disabled
+   */
+  public boolean noBlockCache() {
+    return noBlockCache;
+  }
+
+  /**
+   * Disable block cache. If this is set to true,
+   * then no block cache should be used, and the {@link #setBlockCache(Cache)}
+   * should point to a {@code null} object.
+   *
+   * Default: false
+   *
+   * @param noBlockCache if use block cache
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setNoBlockCache(final boolean noBlockCache) {
+    this.noBlockCache = noBlockCache;
+    return this;
+  }
+
+  /**
+   * Use the specified cache for blocks.
+   * When not null this take precedence even if the user sets a block cache size.
+   *
+   * {@link org.rocksdb.Cache} should not be disposed before options instances
+   * using this cache is disposed.
+   *
+   * {@link org.rocksdb.Cache} instance can be re-used in multiple options
+   * instances.
+   *
+   * @param blockCache {@link org.rocksdb.Cache} Cache java instance
+   *     (e.g. LRUCache).
+   *
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockCache(final Cache blockCache) {
+    this.blockCache = blockCache;
+    return this;
+  }
+
+  /**
+   * Use the specified persistent cache.
+   *
+   * If {@code !null} use the specified cache for pages read from device,
+   * otherwise no page cache is used.
+   *
+   * @param persistentCache the persistent cache
+   *
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setPersistentCache(
+      final PersistentCache persistentCache) {
+    this.persistentCache = persistentCache;
+    return this;
+  }
+
+  /**
+   * Use the specified cache for compressed blocks.
+   *
+   * If {@code null}, RocksDB will not use a compressed block cache.
+   *
+   * Note: though it looks similar to {@link #setBlockCache(Cache)}, RocksDB
+   *     doesn't put the same type of object there.
+   *
+   * {@link org.rocksdb.Cache} should not be disposed before options instances
+   * using this cache is disposed.
+   *
+   * {@link org.rocksdb.Cache} instance can be re-used in multiple options
+   * instances.
+   *
+   * @param blockCacheCompressed {@link org.rocksdb.Cache} Cache java instance
+   *     (e.g. LRUCache).
+   *
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockCacheCompressed(
+      final Cache blockCacheCompressed) {
+    this.blockCacheCompressed = blockCacheCompressed;
+    return this;
+  }
+
+  /**
+   * Get the approximate size of user data packed per block.
+   *
+   * @return block size in bytes
+   */
+  public long blockSize() {
+    return blockSize;
+  }
+
+  /**
+   * Approximate size of user data packed per block. Note that the
+   * block size specified here corresponds to uncompressed data.  The
+   * actual size of the unit read from disk may be smaller if
+   * compression is enabled.  This parameter can be changed dynamically.
+   * Default: 4K
+   *
+   * @param blockSize block size in bytes
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockSize(final long blockSize) {
+    this.blockSize = blockSize;
+    return this;
+  }
+
+  /**
+   * @return the hash table ratio.
+   */
+  public int blockSizeDeviation() {
+    return blockSizeDeviation;
+  }
+
+  /**
+   * This is used to close a block before it reaches the configured
+   * {@link #blockSize()}. If the percentage of free space in the current block
+   * is less than this specified number and adding a new record to the block
+   * will exceed the configured block size, then this block will be closed and
+   * the new record will be written to the next block.
+   *
+   * Default is 10.
+   *
+   * @param blockSizeDeviation the deviation to block size allowed
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockSizeDeviation(
+      final int blockSizeDeviation) {
+    this.blockSizeDeviation = blockSizeDeviation;
+    return this;
+  }
+
+  /**
+   * Get the block restart interval.
+   *
+   * @return block restart interval
+   */
+  public int blockRestartInterval() {
+    return blockRestartInterval;
+  }
+
+  /**
+   * Set the block restart interval.
+   *
+   * @param restartInterval block restart interval.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockRestartInterval(
+      final int restartInterval) {
+    blockRestartInterval = restartInterval;
+    return this;
+  }
+
+  /**
+   * Get the index block restart interval.
+   *
+   * @return index block restart interval
+   */
+  public int indexBlockRestartInterval() {
+    return indexBlockRestartInterval;
+  }
+
+  /**
+   * Set the index block restart interval
+   *
+   * @param restartInterval index block restart interval.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setIndexBlockRestartInterval(
+      final int restartInterval) {
+    indexBlockRestartInterval = restartInterval;
+    return this;
+  }
+
+  /**
+   * Get the block size for partitioned metadata.
+   *
+   * @return block size for partitioned metadata.
+   */
+  public long metadataBlockSize() {
+    return metadataBlockSize;
+  }
+
+  /**
+   * Set block size for partitioned metadata.
+   *
+   * @param metadataBlockSize Partitioned metadata block size.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setMetadataBlockSize(
+      final long metadataBlockSize) {
+    this.metadataBlockSize = metadataBlockSize;
+    return this;
+  }
+
+  /**
+   * Indicates if we're using partitioned filters.
+   *
+   * @return if we're using partition filters.
+   */
+  public boolean partitionFilters() {
+    return partitionFilters;
+  }
+
+  /**
+   * Use partitioned full filters for each SST file. This option is incompatible
+   * with block-based filters.
+   *
+   * Defaults to false.
+   *
+   * @param partitionFilters use partition filters.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setPartitionFilters(final boolean partitionFilters) {
+    this.partitionFilters = partitionFilters;
+    return this;
+  }
+
+  /***
+   * Option to generate Bloom filters that minimize memory
+   * internal fragmentation.
+   *
+   * See {@link #setOptimizeFiltersForMemory(boolean)}.
+   *
+   * @return true if bloom filters are used to minimize memory internal
+   *     fragmentation
+   */
+  @Experimental("Option to generate Bloom filters that minimize memory internal fragmentation")
+  public boolean optimizeFiltersForMemory() {
+    return optimizeFiltersForMemory;
+  }
+
+  /**
+   * Option to generate Bloom filters that minimize memory
+   * internal fragmentation.
+   *
+   * When false, malloc_usable_size is not available, or format_version &lt; 5,
+   * filters are generated without regard to internal fragmentation when
+   * loaded into memory (historical behavior). When true (and
+   * malloc_usable_size is available and {@link #formatVersion()} &gt;= 5),
+   * then Bloom filters are generated to "round up" and "round down" their
+   * sizes to minimize internal fragmentation when loaded into memory, assuming
+   * the reading DB has the same memory allocation characteristics as the
+   * generating DB. This option does not break forward or backward
+   * compatibility.
+   *
+   * While individual filters will vary in bits/key and false positive rate
+   * when setting is true, the implementation attempts to maintain a weighted
+   * average FP rate for filters consistent with this option set to false.
+   *
+   * With Jemalloc for example, this setting is expected to save about 10% of
+   * the memory footprint and block cache charge of filters, while increasing
+   * disk usage of filters by about 1-2% due to encoding efficiency losses
+   * with variance in bits/key.
+   *
+   * NOTE: Because some memory counted by block cache might be unmapped pages
+   * within internal fragmentation, this option can increase observed RSS
+   * memory usage. With {@link #cacheIndexAndFilterBlocks()} == true,
+   * this option makes the block cache better at using space it is allowed.
+   *
+   * NOTE: Do not set to true if you do not trust malloc_usable_size. With
+   * this option, RocksDB might access an allocated memory object beyond its
+   * original size if malloc_usable_size says it is safe to do so. While this
+   * can be considered bad practice, it should not produce undefined behavior
+   * unless malloc_usable_size is buggy or broken.
+   *
+   * @param optimizeFiltersForMemory true to enable Bloom filters that minimize
+   *     memory internal fragmentation, or false to disable.
+   *
+   * @return the reference to the current config.
+   */
+  @Experimental("Option to generate Bloom filters that minimize memory internal fragmentation")
+  public BlockBasedTableConfig setOptimizeFiltersForMemory(final boolean optimizeFiltersForMemory) {
+    this.optimizeFiltersForMemory = optimizeFiltersForMemory;
+    return this;
+  }
+
+  /**
+   * Determine if delta encoding is being used to compress block keys.
+   *
+   * @return true if delta encoding is enabled, false otherwise.
+   */
+  public boolean useDeltaEncoding() {
+    return useDeltaEncoding;
+  }
+
+  /**
+   * Use delta encoding to compress keys in blocks.
+   *
+   * NOTE: {@link ReadOptions#pinData()} requires this option to be disabled.
+   *
+   * Default: true
+   *
+   * @param useDeltaEncoding true to enable delta encoding
+   *
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setUseDeltaEncoding(
+      final boolean useDeltaEncoding) {
+    this.useDeltaEncoding = useDeltaEncoding;
+    return this;
+  }
+
+  /**
+   * Get the filter policy.
+   *
+   * @return the current filter policy.
+   */
+  public Filter filterPolicy() {
+    return filterPolicy;
+  }
+
+  /**
+   * Use the specified filter policy to reduce disk reads.
+   *
+   * {@link org.rocksdb.Filter} should not be closed before options instances
+   * using this filter are closed.
+   *
+   * {@link org.rocksdb.Filter} instance can be re-used in multiple options
+   * instances.
+   *
+   * @param filterPolicy {@link org.rocksdb.Filter} Filter Policy java instance.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setFilterPolicy(
+      final Filter filterPolicy) {
+    this.filterPolicy = filterPolicy;
+    return this;
+  }
+
+  /**
+   * Set the filter.
+   *
+   * @param filter the filter
+   * @return the reference to the current config.
+   *
+   * @deprecated Use {@link #setFilterPolicy(Filter)}
+   */
+  @Deprecated
+  public BlockBasedTableConfig setFilter(
+      final Filter filter) {
+    return setFilterPolicy(filter);
+  }
+
+  /**
+   * Determine if whole keys as opposed to prefixes are placed in the filter.
+   *
+   * @return if whole key filtering is enabled
+   */
+  public boolean wholeKeyFiltering() {
+    return wholeKeyFiltering;
+  }
+
+  /**
+   * If true, place whole keys in the filter (not just prefixes).
+   * This must generally be true for gets to be efficient.
+   * Default: true
+   *
+   * @param wholeKeyFiltering if enable whole key filtering
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setWholeKeyFiltering(
+      final boolean wholeKeyFiltering) {
+    this.wholeKeyFiltering = wholeKeyFiltering;
+    return this;
+  }
+
+  /**
+   * Returns true when compression verification is enabled.
+   *
+   * See {@link #setVerifyCompression(boolean)}.
+   *
+   * @return true if compression verification is enabled.
+   */
+  public boolean verifyCompression() {
+    return verifyCompression;
+  }
+
+  /**
+   * Verify that decompressing the compressed block gives back the input. This
+   * is a verification mode that we use to detect bugs in compression
+   * algorithms.
+   *
+   * @param verifyCompression true to enable compression verification.
+   *
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setVerifyCompression(
+      final boolean verifyCompression) {
+    this.verifyCompression = verifyCompression;
+    return this;
+  }
+
+  /**
+   * Get the Read amplification bytes per-bit.
+   *
+   * See {@link #setReadAmpBytesPerBit(int)}.
+   *
+   * @return the bytes per-bit.
+   */
+  public int readAmpBytesPerBit() {
+    return readAmpBytesPerBit;
+  }
+
+  /**
+   * Set the Read amplification bytes per-bit.
+   *
+   * If used, For every data block we load into memory, we will create a bitmap
+   * of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap
+   * will be used to figure out the percentage we actually read of the blocks.
+   *
+   * When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
+   * Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
+   * read amplification using this formula
+   * (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+   *
+   * value  =&gt;  memory usage (percentage of loaded blocks memory)
+   * 1      =&gt;  12.50 %
+   * 2      =&gt;  06.25 %
+   * 4      =&gt;  03.12 %
+   * 8      =&gt;  01.56 %
+   * 16     =&gt;  00.78 %
+   *
+   * Note: This number must be a power of 2, if not it will be sanitized
+   * to be the next lowest power of 2, for example a value of 7 will be
+   * treated as 4, a value of 19 will be treated as 16.
+   *
+   * Default: 0 (disabled)
+   *
+   * @param readAmpBytesPerBit the bytes per-bit
+   *
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setReadAmpBytesPerBit(final int readAmpBytesPerBit) {
+    this.readAmpBytesPerBit = readAmpBytesPerBit;
+    return this;
+  }
+
+  /**
+   * Get the format version.
+   * See {@link #setFormatVersion(int)}.
+   *
+   * @return the currently configured format version.
+   */
+  public int formatVersion() {
+    return formatVersion;
+  }
+
+  /**
+   * <p>We currently have five versions:</p>
+   *
+   * <ul>
+   * <li><strong>0</strong> - This version is currently written
+   * out by all RocksDB's versions by default. Can be read by really old
+   * RocksDB's. Doesn't support changing checksum (default is CRC32).</li>
+   * <li><strong>1</strong> - Can be read by RocksDB's versions since 3.0.
+   * Supports non-default checksum, like xxHash. It is written by RocksDB when
+   * BlockBasedTableOptions::checksum is something other than kCRC32c. (version
+   * 0 is silently upconverted)</li>
+   * <li><strong>2</strong> - Can be read by RocksDB's versions since 3.10.
+   * Changes the way we encode compressed blocks with LZ4, BZip2 and Zlib
+   * compression. If you don't plan to run RocksDB before version 3.10,
+   * you should probably use this.</li>
+   * <li><strong>3</strong> - Can be read by RocksDB's versions since 5.15. Changes the way we
+   * encode the keys in index blocks. If you don't plan to run RocksDB before
+   * version 5.15, you should probably use this.
+   * This option only affects newly written tables. When reading existing
+   * tables, the information about version is read from the footer.</li>
+   * <li><strong>4</strong> - Can be read by RocksDB's versions since 5.16. Changes the way we
+   * encode the values in index blocks. If you don't plan to run RocksDB before
+   * version 5.16 and you are using index_block_restart_interval &gt; 1, you should
+   * probably use this as it would reduce the index size.
+   * This option only affects newly written tables. When reading existing
+   * tables, the information about version is read from the footer.</li>
+   * <li><strong>5</strong> - Can be read by RocksDB's versions since 6.6.0.
+   * Full and partitioned filters use a generally faster and more accurate
+   * Bloom filter implementation, with a different schema.</li>
+   * </ul>
+   *
+   * @param formatVersion integer representing the version to be used.
+   *
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setFormatVersion(
+      final int formatVersion) {
+    assert (formatVersion >= 0);
+    this.formatVersion = formatVersion;
+    return this;
+  }
+
+  /**
+   * Determine if index compression is enabled.
+   *
+   * See {@link #setEnableIndexCompression(boolean)}.
+   *
+   * @return true if index compression is enabled, false otherwise
+   */
+  public boolean enableIndexCompression() {
+    return enableIndexCompression;
+  }
+
+  /**
+   * Store index blocks on disk in compressed format.
+   *
+   * Changing this option to false  will avoid the overhead of decompression
+   * if index blocks are evicted and read back.
+   *
+   * @param enableIndexCompression true to enable index compression,
+   *     false to disable
+   *
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setEnableIndexCompression(
+      final boolean enableIndexCompression) {
+    this.enableIndexCompression = enableIndexCompression;
+    return this;
+  }
+
+  /**
+   * Determines whether data blocks are aligned on the lesser of page size
+   * and block size.
+   *
+   * @return true if data blocks are aligned on the lesser of page size
+   *     and block size.
+   */
+  public boolean blockAlign() {
+    return blockAlign;
+  }
+
+  /**
+   * Set whether data blocks should be aligned on the lesser of page size
+   * and block size.
+   *
+   * @param blockAlign true to align data blocks on the lesser of page size
+   *     and block size.
+   *
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setBlockAlign(final boolean blockAlign) {
+    this.blockAlign = blockAlign;
+    return this;
+  }
+
+  /**
+   * Get the index shortening mode.
+   *
+   * @return the index shortening mode.
+   */
+  public IndexShorteningMode indexShortening() {
+    return indexShortening;
+  }
+
+  /**
+   * Set the index shortening mode.
+   *
+   * See {@link IndexShorteningMode}.
+   *
+   * @param indexShortening the index shortening mode.
+   *
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setIndexShortening(final IndexShorteningMode indexShortening) {
+    this.indexShortening = indexShortening;
+    return this;
+  }
+
+  /**
+   * Get the size of the cache in bytes that will be used by RocksDB.
+   *
+   * @return block cache size in bytes
+   */
+  @Deprecated
+  public long blockCacheSize() {
+    return blockCacheSize;
+  }
+
+  /**
+   * Set the size of the cache in bytes that will be used by RocksDB.
+   * If cacheSize is negative, then cache will not be used.
+   * DEFAULT: 8M
+   *
+   * @param blockCacheSize block cache size in bytes
+   * @return the reference to the current config.
+   *
+   * @deprecated Use {@link #setBlockCache(Cache)}.
+   */
+  @Deprecated
+  public BlockBasedTableConfig setBlockCacheSize(final long blockCacheSize) {
+    this.blockCacheSize = blockCacheSize;
+    return this;
+  }
+
+  /**
+   * Returns the number of shard bits used in the block cache.
+   * The resulting number of shards would be 2 ^ (returned value).
+   * Any negative number means use default settings.
+   *
+   * @return the number of shard bits used in the block cache.
+   */
+  @Deprecated
+  public int cacheNumShardBits() {
+    return blockCacheNumShardBits;
+  }
+
+  /**
+   * Controls the number of shards for the block cache.
+   * This is applied only if cacheSize is set to non-negative.
+   *
+   * @param blockCacheNumShardBits the number of shard bits. The resulting
+   *     number of shards would be 2 ^ numShardBits.  Any negative
+   *     number means use default settings."
+   * @return the reference to the current option.
+   *
+   * @deprecated Use {@link #setBlockCache(Cache)}.
+   */
+  @Deprecated
+  public BlockBasedTableConfig setCacheNumShardBits(
+      final int blockCacheNumShardBits) {
+    this.blockCacheNumShardBits = blockCacheNumShardBits;
+    return this;
+  }
+
+  /**
+   * Size of compressed block cache. If 0, then block_cache_compressed is set
+   * to null.
+   *
+   * @return size of compressed block cache.
+   */
+  @Deprecated
+  public long blockCacheCompressedSize() {
+    return blockCacheCompressedSize;
+  }
+
+  /**
+   * Size of compressed block cache. If 0, then block_cache_compressed is set
+   * to null.
+   *
+   * @param blockCacheCompressedSize of compressed block cache.
+   * @return the reference to the current config.
+   *
+   * @deprecated Use {@link #setBlockCacheCompressed(Cache)}.
+   */
+  @Deprecated
+  public BlockBasedTableConfig setBlockCacheCompressedSize(
+      final long blockCacheCompressedSize) {
+    this.blockCacheCompressedSize = blockCacheCompressedSize;
+    return this;
+  }
+
+  /**
+   * Controls the number of shards for the block compressed cache.
+   * This is applied only if blockCompressedCacheSize is set to non-negative.
+   *
+   * @return numShardBits the number of shard bits.  The resulting
+   *     number of shards would be 2 ^ numShardBits.  Any negative
+   *     number means use default settings.
+   */
+  @Deprecated
+  public int blockCacheCompressedNumShardBits() {
+    return blockCacheCompressedNumShardBits;
+  }
+
+  /**
+   * Controls the number of shards for the block compressed cache.
+   * This is applied only if blockCompressedCacheSize is set to non-negative.
+   *
+   * @param blockCacheCompressedNumShardBits the number of shard bits.  The resulting
+   *     number of shards would be 2 ^ numShardBits.  Any negative
+   *     number means use default settings."
+   * @return the reference to the current option.
+   *
+   * @deprecated Use {@link #setBlockCacheCompressed(Cache)}.
+   */
+  @Deprecated
+  public BlockBasedTableConfig setBlockCacheCompressedNumShardBits(
+      final int blockCacheCompressedNumShardBits) {
+    this.blockCacheCompressedNumShardBits = blockCacheCompressedNumShardBits;
+    return this;
+  }
+
+  /**
+   * Influence the behavior when kHashSearch is used.
+   *  if false, stores a precise prefix to block range mapping
+   *  if true, does not store prefix and allows prefix hash collision
+   *  (less memory consumption)
+   *
+   * @return if hash collisions should be allowed.
+   *
+   * @deprecated This option is now deprecated. No matter what value it
+   *     is set to, it will behave as
+   *     if {@link #hashIndexAllowCollision()} == true.
+   */
+  @Deprecated
+  public boolean hashIndexAllowCollision() {
+    return true;
+  }
+
+  /**
+   * Influence the behavior when kHashSearch is used.
+   * if false, stores a precise prefix to block range mapping
+   * if true, does not store prefix and allows prefix hash collision
+   * (less memory consumption)
+   *
+   * @param hashIndexAllowCollision points out if hash collisions should be allowed.
+   *
+   * @return the reference to the current config.
+   *
+   * @deprecated This option is now deprecated. No matter what value it
+   *     is set to, it will behave as
+   *     if {@link #hashIndexAllowCollision()} == true.
+   */
+  @Deprecated
+  public BlockBasedTableConfig setHashIndexAllowCollision(
+      final boolean hashIndexAllowCollision) {
+    // no-op
+    return this;
+  }
+
+  @Override protected long newTableFactoryHandle() {
+    final long filterPolicyHandle;
+    if (filterPolicy != null) {
+      filterPolicyHandle = filterPolicy.nativeHandle_;
+    } else {
+      filterPolicyHandle = 0;
+    }
+
+    final long blockCacheHandle;
+    if (blockCache != null) {
+      blockCacheHandle = blockCache.nativeHandle_;
+    } else {
+      blockCacheHandle = 0;
+    }
+
+    final long persistentCacheHandle;
+    if (persistentCache != null) {
+      persistentCacheHandle = persistentCache.nativeHandle_;
+    } else {
+      persistentCacheHandle = 0;
+    }
+
+    final long blockCacheCompressedHandle;
+    if (blockCacheCompressed != null) {
+      blockCacheCompressedHandle = blockCacheCompressed.nativeHandle_;
+    } else {
+      blockCacheCompressedHandle = 0;
+    }
+
+    return newTableFactoryHandle(cacheIndexAndFilterBlocks,
+        cacheIndexAndFilterBlocksWithHighPriority, pinL0FilterAndIndexBlocksInCache,
+        pinTopLevelIndexAndFilter, indexType.getValue(), dataBlockIndexType.getValue(),
+        dataBlockHashTableUtilRatio, checksumType.getValue(), noBlockCache, blockCacheHandle,
+        persistentCacheHandle, blockCacheCompressedHandle, blockSize, blockSizeDeviation,
+        blockRestartInterval, indexBlockRestartInterval, metadataBlockSize, partitionFilters,
+        optimizeFiltersForMemory, useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering,
+        verifyCompression, readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign,
+        indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits,
+        blockCacheCompressedSize, blockCacheCompressedNumShardBits);
+  }
+
+  private native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks,
+      final boolean cacheIndexAndFilterBlocksWithHighPriority,
+      final boolean pinL0FilterAndIndexBlocksInCache, final boolean pinTopLevelIndexAndFilter,
+      final byte indexTypeValue, final byte dataBlockIndexTypeValue,
+      final double dataBlockHashTableUtilRatio, final byte checksumTypeValue,
+      final boolean noBlockCache, final long blockCacheHandle, final long persistentCacheHandle,
+      final long blockCacheCompressedHandle, final long blockSize, final int blockSizeDeviation,
+      final int blockRestartInterval, final int indexBlockRestartInterval,
+      final long metadataBlockSize, final boolean partitionFilters,
+      final boolean optimizeFiltersForMemory, final boolean useDeltaEncoding,
+      final long filterPolicyHandle, final boolean wholeKeyFiltering,
+      final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion,
+      final boolean enableIndexCompression, final boolean blockAlign, final byte indexShortening,
+
+      @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits,
+
+      @Deprecated final long blockCacheCompressedSize,
+      @Deprecated final int blockCacheCompressedNumShardBits);
+
+  //TODO(AR) flushBlockPolicyFactory
+  private boolean cacheIndexAndFilterBlocks;
+  private boolean cacheIndexAndFilterBlocksWithHighPriority;
+  private boolean pinL0FilterAndIndexBlocksInCache;
+  private boolean pinTopLevelIndexAndFilter;
+  private IndexType indexType;
+  private DataBlockIndexType dataBlockIndexType;
+  private double dataBlockHashTableUtilRatio;
+  private ChecksumType checksumType;
+  private boolean noBlockCache;
+  private Cache blockCache;
+  private PersistentCache persistentCache;
+  private Cache blockCacheCompressed;
+  private long blockSize;
+  private int blockSizeDeviation;
+  private int blockRestartInterval;
+  private int indexBlockRestartInterval;
+  private long metadataBlockSize;
+  private boolean partitionFilters;
+  private boolean optimizeFiltersForMemory;
+  private boolean useDeltaEncoding;
+  private Filter filterPolicy;
+  private boolean wholeKeyFiltering;
+  private boolean verifyCompression;
+  private int readAmpBytesPerBit;
+  private int formatVersion;
+  private boolean enableIndexCompression;
+  private boolean blockAlign;
+  private IndexShorteningMode indexShortening;
+
+  // NOTE: ONLY used if blockCache == null
+  @Deprecated private long blockCacheSize;
+  @Deprecated private int blockCacheNumShardBits;
+
+  // NOTE: ONLY used if blockCacheCompressed == null
+  @Deprecated private long blockCacheCompressedSize;
+  @Deprecated private int blockCacheCompressedNumShardBits;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java
new file mode 100644
index 000000000..8aff715b7
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java
@@ -0,0 +1,73 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Bloom filter policy that uses a bloom filter with approximately
+ * the specified number of bits per key.
+ *
+ * <p>
+ * Note: if you are using a custom comparator that ignores some parts
+ * of the keys being compared, you must not use this {@code BloomFilter}
+ * and must provide your own FilterPolicy that also ignores the
+ * corresponding parts of the keys. For example, if the comparator
+ * ignores trailing spaces, it would be incorrect to use a
+ * FilterPolicy (like {@code BloomFilter}) that does not ignore
+ * trailing spaces in keys.</p>
+ */
+public class BloomFilter extends Filter {
+
+  private static final double DEFAULT_BITS_PER_KEY = 10.0;
+
+  /**
+   * BloomFilter constructor
+   *
+   * <p>
+   * Callers must delete the result after any database that is using the
+   * result has been closed.</p>
+   */
+  public BloomFilter() {
+    this(DEFAULT_BITS_PER_KEY);
+  }
+
+  /**
+   * BloomFilter constructor
+   *
+   * <p>
+   * bits_per_key: bits per key in bloom filter. A good value for bits_per_key
+   * is 9.9, which yields a filter with ~ 1% false positive rate.
+   * </p>
+   * <p>
+   * Callers must delete the result after any database that is using the
+   * result has been closed.</p>
+   *
+   * @param bitsPerKey number of bits to use
+   */
+  public BloomFilter(final double bitsPerKey) {
+    super(createNewBloomFilter(bitsPerKey));
+  }
+
+  /**
+   * BloomFilter constructor
+   *
+   * <p>
+   * bits_per_key: bits per key in bloom filter. A good value for bits_per_key
+   * is 10, which yields a filter with ~ 1% false positive rate.
+   * <p><strong>default bits_per_key</strong>: 10</p>
+   *
+   * <p>
+   * Callers must delete the result after any database that is using the
+   * result has been closed.</p>
+   *
+   * @param bitsPerKey number of bits to use
+   * @param IGNORED_useBlockBasedMode obsolete, ignored parameter
+   */
+  public BloomFilter(final double bitsPerKey, final boolean IGNORED_useBlockBasedMode) {
+    this(bitsPerKey);
+  }
+
+  private native static long createNewBloomFilter(final double bitsKeyKey);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java b/src/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java
new file mode 100644
index 000000000..2c89bf218
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java
@@ -0,0 +1,20 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Builtin RocksDB comparators
+ *
+ * <ol>
+ *   <li>BYTEWISE_COMPARATOR - Sorts all keys in ascending bytewise
+ *   order.</li>
+ *   <li>REVERSE_BYTEWISE_COMPARATOR - Sorts all keys in descending bytewise
+ *   order</li>
+ * </ol>
+ */
+public enum BuiltinComparator {
+  BYTEWISE_COMPARATOR, REVERSE_BYTEWISE_COMPARATOR
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java b/src/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java
new file mode 100644
index 000000000..8eef95447
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java
@@ -0,0 +1,50 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+/**
+ * A ByteBuffer containing fetched data, together with a result for the fetch
+ * and the total size of the object fetched.
+ *
+ * Used for the individual results of
+ * {@link RocksDB#multiGetByteBuffers(List, List)}
+ * {@link RocksDB#multiGetByteBuffers(List, List, List)}
+ * {@link RocksDB#multiGetByteBuffers(ReadOptions, List, List)}
+ * {@link RocksDB#multiGetByteBuffers(ReadOptions, List, List, List)}
+ */
+public class ByteBufferGetStatus {
+  public final Status status;
+  public final int requiredSize;
+  public final ByteBuffer value;
+
+  /**
+   * Constructor used for success status, when the value is contained in the buffer
+   *
+   * @param status the status of the request to fetch into the buffer
+   * @param requiredSize the size of the data, which may be bigger than the buffer
+   * @param value the buffer containing as much of the value as fits
+   */
+  ByteBufferGetStatus(final Status status, final int requiredSize, final ByteBuffer value) {
+    this.status = status;
+    this.requiredSize = requiredSize;
+    this.value = value;
+  }
+
+  /**
+   * Constructor used for a failure status, when no value is filled in
+   *
+   * @param status the status of the request to fetch into the buffer
+   */
+  ByteBufferGetStatus(final Status status) {
+    this.status = status;
+    this.requiredSize = 0;
+    this.value = null;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Cache.java b/src/rocksdb/java/src/main/java/org/rocksdb/Cache.java
new file mode 100644
index 000000000..569a1df06
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Cache.java
@@ -0,0 +1,40 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+
+public abstract class Cache extends RocksObject {
+  protected Cache(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Returns the memory size for the entries
+   * residing in cache.
+   *
+   * @return cache usage size.
+   *
+   */
+  public long getUsage() {
+    assert (isOwningHandle());
+    return getUsage(this.nativeHandle_);
+  }
+
+  /**
+   * Returns the memory size for the entries
+   * being pinned in cache.
+   *
+   * @return cache pinned usage size.
+   *
+   */
+  public long getPinnedUsage() {
+    assert (isOwningHandle());
+    return getPinnedUsage(this.nativeHandle_);
+  }
+
+  private native static long getUsage(final long handle);
+  private native static long getPinnedUsage(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java
new file mode 100644
index 000000000..6c87cc188
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java
@@ -0,0 +1,19 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Just a Java wrapper around CassandraCompactionFilter implemented in C++
+ */
+public class CassandraCompactionFilter
+    extends AbstractCompactionFilter<Slice> {
+  public CassandraCompactionFilter(boolean purgeTtlOnExpiration, int gcGracePeriodInSeconds) {
+    super(createNewCassandraCompactionFilter0(purgeTtlOnExpiration, gcGracePeriodInSeconds));
+  }
+
+  private native static long createNewCassandraCompactionFilter0(
+      boolean purgeTtlOnExpiration, int gcGracePeriodInSeconds);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java b/src/rocksdb/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java
new file mode 100644
index 000000000..4b0c71ba5
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java
@@ -0,0 +1,25 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * CassandraValueMergeOperator is a merge operator that merges two cassandra wide column
+ * values.
+ */
+public class CassandraValueMergeOperator extends MergeOperator {
+  public CassandraValueMergeOperator(int gcGracePeriodInSeconds) {
+    super(newSharedCassandraValueMergeOperator(gcGracePeriodInSeconds, 0));
+    }
+
+    public CassandraValueMergeOperator(int gcGracePeriodInSeconds, int operandsLimit) {
+      super(newSharedCassandraValueMergeOperator(gcGracePeriodInSeconds, operandsLimit));
+    }
+
+    private native static long newSharedCassandraValueMergeOperator(
+        int gcGracePeriodInSeconds, int limit);
+
+    @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java b/src/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java
new file mode 100644
index 000000000..000969932
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Provides Checkpoint functionality. Checkpoints
+ * provide persistent snapshots of RocksDB databases.
+ */
+public class Checkpoint extends RocksObject {
+
+  /**
+   * Creates a Checkpoint object to be used for creating open-able
+   * snapshots.
+   *
+   * @param db {@link RocksDB} instance.
+   * @return a Checkpoint instance.
+   *
+   * @throws java.lang.IllegalArgumentException if {@link RocksDB}
+   *     instance is null.
+   * @throws java.lang.IllegalStateException if {@link RocksDB}
+   *     instance is not initialized.
+   */
+  public static Checkpoint create(final RocksDB db) {
+    if (db == null) {
+      throw new IllegalArgumentException(
+          "RocksDB instance shall not be null.");
+    } else if (!db.isOwningHandle()) {
+      throw new IllegalStateException(
+          "RocksDB instance must be initialized.");
+    }
+    Checkpoint checkpoint = new Checkpoint(db);
+    return checkpoint;
+  }
+
+  /**
+   * <p>Builds an open-able snapshot of RocksDB on the same disk, which
+   * accepts an output directory on the same disk, and under the directory
+   * (1) hard-linked SST files pointing to existing live SST files
+   * (2) a copied manifest files and other files</p>
+   *
+   * @param checkpointPath path to the folder where the snapshot is going
+   *     to be stored.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void createCheckpoint(final String checkpointPath)
+      throws RocksDBException {
+    createCheckpoint(nativeHandle_, checkpointPath);
+  }
+
+  private Checkpoint(final RocksDB db) {
+    super(newCheckpoint(db.nativeHandle_));
+    this.db_ = db;
+  }
+
+  private final RocksDB db_;
+
+  private static native long newCheckpoint(long dbHandle);
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native void createCheckpoint(long handle, String checkpointPath)
+      throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java b/src/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java
new file mode 100644
index 000000000..e03fa14ba
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Checksum types used in conjunction with BlockBasedTable.
+ */
+public enum ChecksumType {
+  /**
+   * Not implemented yet.
+   */
+  kNoChecksum((byte) 0),
+  /**
+   * CRC32 Checksum
+   */
+  kCRC32c((byte) 1),
+  /**
+   * XX Hash
+   */
+  kxxHash((byte) 2),
+  /**
+   * XX Hash 64
+   */
+  kxxHash64((byte) 3),
+
+  kXXH3((byte) 4);
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  private ChecksumType(final byte value) {
+    value_ = value;
+  }
+
+  private final byte value_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ClockCache.java b/src/rocksdb/java/src/main/java/org/rocksdb/ClockCache.java
new file mode 100644
index 000000000..a66dc0e8a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ClockCache.java
@@ -0,0 +1,59 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Similar to {@link LRUCache}, but based on the CLOCK algorithm with
+ * better concurrent performance in some cases
+ */
+public class ClockCache extends Cache {
+
+  /**
+   * Create a new cache with a fixed size capacity.
+   *
+   * @param capacity The fixed size capacity of the cache
+   */
+  public ClockCache(final long capacity) {
+    super(newClockCache(capacity, -1, false));
+  }
+
+  /**
+   * Create a new cache with a fixed size capacity. The cache is sharded
+   * to 2^numShardBits shards, by hash of the key. The total capacity
+   * is divided and evenly assigned to each shard.
+   * numShardBits = -1 means it is automatically determined: every shard
+   * will be at least 512KB and number of shard bits will not exceed 6.
+   *
+   * @param capacity The fixed size capacity of the cache
+   * @param numShardBits The cache is sharded to 2^numShardBits shards,
+   *     by hash of the key
+   */
+  public ClockCache(final long capacity, final int numShardBits) {
+    super(newClockCache(capacity, numShardBits, false));
+  }
+
+  /**
+   * Create a new cache with a fixed size capacity. The cache is sharded
+   * to 2^numShardBits shards, by hash of the key. The total capacity
+   * is divided and evenly assigned to each shard. If strictCapacityLimit
+   * is set, insert to the cache will fail when cache is full.
+   * numShardBits = -1 means it is automatically determined: every shard
+   * will be at least 512KB and number of shard bits will not exceed 6.
+   *
+   * @param capacity The fixed size capacity of the cache
+   * @param numShardBits The cache is sharded to 2^numShardBits shards,
+   *     by hash of the key
+   * @param strictCapacityLimit insert to the cache will fail when cache is full
+   */
+  public ClockCache(final long capacity, final int numShardBits,
+      final boolean strictCapacityLimit) {
+    super(newClockCache(capacity, numShardBits, strictCapacityLimit));
+  }
+
+  private native static long newClockCache(final long capacity,
+      final int numShardBits, final boolean strictCapacityLimit);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java
new file mode 100644
index 000000000..125a8dcf8
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java
@@ -0,0 +1,84 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Arrays;
+
+/**
+ * <p>Describes a column family with a
+ * name and respective Options.</p>
+ */
+public class ColumnFamilyDescriptor {
+
+  /**
+   * <p>Creates a new Column Family using a name and default
+   * options,</p>
+   *
+   * @param columnFamilyName name of column family.
+   * @since 3.10.0
+   */
+  public ColumnFamilyDescriptor(final byte[] columnFamilyName) {
+    this(columnFamilyName, new ColumnFamilyOptions());
+  }
+
+  /**
+   * <p>Creates a new Column Family using a name and custom
+   * options.</p>
+   *
+   * @param columnFamilyName name of column family.
+   * @param columnFamilyOptions options to be used with
+   *     column family.
+   * @since 3.10.0
+   */
+  public ColumnFamilyDescriptor(final byte[] columnFamilyName,
+                                final ColumnFamilyOptions columnFamilyOptions) {
+    columnFamilyName_ = columnFamilyName;
+    columnFamilyOptions_ = columnFamilyOptions;
+  }
+
+  /**
+   * Retrieve name of column family.
+   *
+   * @return column family name.
+   * @since 3.10.0
+   */
+  public byte[] getName() {
+    return columnFamilyName_;
+  }
+
+  /**
+   * Retrieve assigned options instance.
+   *
+   * @return Options instance assigned to this instance.
+   */
+  public ColumnFamilyOptions getOptions() {
+    return columnFamilyOptions_;
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+
+    final ColumnFamilyDescriptor that = (ColumnFamilyDescriptor) o;
+    return Arrays.equals(columnFamilyName_, that.columnFamilyName_)
+            && columnFamilyOptions_.nativeHandle_ == that.columnFamilyOptions_.nativeHandle_;
+  }
+
+  @Override
+  public int hashCode() {
+    int result = (int) (columnFamilyOptions_.nativeHandle_ ^ (columnFamilyOptions_.nativeHandle_ >>> 32));
+    result = 31 * result + Arrays.hashCode(columnFamilyName_);
+    return result;
+  }
+
+  private final byte[] columnFamilyName_;
+  private final ColumnFamilyOptions columnFamilyOptions_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java
new file mode 100644
index 000000000..1ac0a35bb
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java
@@ -0,0 +1,151 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Arrays;
+import java.util.Objects;
+
+/**
+ * ColumnFamilyHandle class to hold handles to underlying rocksdb
+ * ColumnFamily Pointers.
+ */
+public class ColumnFamilyHandle extends RocksObject {
+  /**
+   * Constructs column family Java object, which operates on underlying native object.
+   *
+   * @param rocksDB db instance associated with this column family
+   * @param nativeHandle native handle to underlying native ColumnFamily object
+   */
+  ColumnFamilyHandle(final RocksDB rocksDB,
+      final long nativeHandle) {
+    super(nativeHandle);
+    // rocksDB must point to a valid RocksDB instance;
+    assert(rocksDB != null);
+    // ColumnFamilyHandle must hold a reference to the related RocksDB instance
+    // to guarantee that while a GC cycle starts ColumnFamilyHandle instances
+    // are freed prior to RocksDB instances.
+    this.rocksDB_ = rocksDB;
+  }
+
+  /**
+   * Constructor called only from JNI.
+   *
+   * NOTE: we are producing an additional Java Object here to represent the underlying native C++
+   * ColumnFamilyHandle object. The underlying object is not owned by ourselves. The Java API user
+   * likely already had a ColumnFamilyHandle Java object which owns the underlying C++ object, as
+   * they will have been presented it when they opened the database or added a Column Family.
+   *
+   *
+   * TODO(AR) - Potentially a better design would be to cache the active Java Column Family Objects
+   * in RocksDB, and return the same Java Object instead of instantiating a new one here. This could
+   * also help us to improve the Java API semantics for Java users. See for example
+   * https://github.com/facebook/rocksdb/issues/2687.
+   *
+   * @param nativeHandle native handle to the column family.
+   */
+  ColumnFamilyHandle(final long nativeHandle) {
+    super(nativeHandle);
+    rocksDB_ = null;
+    disOwnNativeHandle();
+  }
+
+  /**
+   * Gets the name of the Column Family.
+   *
+   * @return The name of the Column Family.
+   *
+   * @throws RocksDBException if an error occurs whilst retrieving the name.
+   */
+  public byte[] getName() throws RocksDBException {
+    assert(isOwningHandle() || isDefaultColumnFamily());
+    return getName(nativeHandle_);
+  }
+
+  /**
+   * Gets the ID of the Column Family.
+   *
+   * @return the ID of the Column Family.
+   */
+  public int getID() {
+    assert(isOwningHandle() || isDefaultColumnFamily());
+    return getID(nativeHandle_);
+  }
+
+  /**
+   * Gets the up-to-date descriptor of the column family
+   * associated with this handle. Since it fills "*desc" with the up-to-date
+   * information, this call might internally lock and release DB mutex to
+   * access the up-to-date CF options. In addition, all the pointer-typed
+   * options cannot be referenced any longer than the original options exist.
+   *
+   * Note that this function is not supported in RocksDBLite.
+   *
+   * @return the up-to-date descriptor.
+   *
+   * @throws RocksDBException if an error occurs whilst retrieving the
+   *     descriptor.
+   */
+  public ColumnFamilyDescriptor getDescriptor() throws RocksDBException {
+    assert(isOwningHandle() || isDefaultColumnFamily());
+    return getDescriptor(nativeHandle_);
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+
+    final ColumnFamilyHandle that = (ColumnFamilyHandle) o;
+    try {
+      return rocksDB_.nativeHandle_ == that.rocksDB_.nativeHandle_ &&
+          getID() == that.getID() &&
+          Arrays.equals(getName(), that.getName());
+    } catch (RocksDBException e) {
+      throw new RuntimeException("Cannot compare column family handles", e);
+    }
+  }
+
+  @Override
+  public int hashCode() {
+    try {
+      int result = Objects.hash(getID(), rocksDB_.nativeHandle_);
+      result = 31 * result + Arrays.hashCode(getName());
+      return result;
+    } catch (RocksDBException e) {
+      throw new RuntimeException("Cannot calculate hash code of column family handle", e);
+    }
+  }
+
+  protected boolean isDefaultColumnFamily() {
+    return nativeHandle_ == rocksDB_.getDefaultColumnFamily().nativeHandle_;
+  }
+
+  /**
+   * <p>Deletes underlying C++ iterator pointer.</p>
+   *
+   * <p>Note: the underlying handle can only be safely deleted if the RocksDB
+   * instance related to a certain ColumnFamilyHandle is still valid and
+   * initialized. Therefore {@code disposeInternal()} checks if the RocksDB is
+   * initialized before freeing the native handle.</p>
+   */
+  @Override
+  protected void disposeInternal() {
+    if(rocksDB_.isOwningHandle()) {
+      disposeInternal(nativeHandle_);
+    }
+  }
+
+  private native byte[] getName(final long handle) throws RocksDBException;
+  private native int getID(final long handle);
+  private native ColumnFamilyDescriptor getDescriptor(final long handle) throws RocksDBException;
+  @Override protected final native void disposeInternal(final long handle);
+
+  private final RocksDB rocksDB_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java
new file mode 100644
index 000000000..191904017
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * The metadata that describes a column family.
+ */
+public class ColumnFamilyMetaData {
+  private final long size;
+  private final long fileCount;
+  private final byte[] name;
+  private final LevelMetaData[] levels;
+
+  /**
+   * Called from JNI C++
+   */
+  private ColumnFamilyMetaData(
+      final long size,
+      final long fileCount,
+      final byte[] name,
+      final LevelMetaData[] levels) {
+    this.size = size;
+    this.fileCount = fileCount;
+    this.name = name;
+    this.levels = levels;
+  }
+
+  /**
+   * The size of this column family in bytes, which is equal to the sum of
+   * the file size of its {@link #levels()}.
+   *
+   * @return the size of this column family
+   */
+  public long size() {
+    return size;
+  }
+
+  /**
+   * The number of files in this column family.
+   *
+   * @return the number of files
+   */
+  public long fileCount() {
+    return fileCount;
+  }
+
+  /**
+   * The name of the column family.
+   *
+   * @return the name
+   */
+  public byte[] name() {
+    return name;
+  }
+
+  /**
+   * The metadata of all levels in this column family.
+   *
+   * @return the levels metadata
+   */
+  public List<LevelMetaData> levels() {
+    return Arrays.asList(levels);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
new file mode 100644
index 000000000..a642cb6fa
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
@@ -0,0 +1,1540 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.file.Paths;
+import java.util.*;
+
+/**
+ * ColumnFamilyOptions to control the behavior of a database.  It will be used
+ * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()).
+ *
+ * As a descendent of {@link AbstractNativeReference}, this class is {@link AutoCloseable}
+ * and will be automatically released if opened in the preamble of a try with resources block.
+ */
+public class ColumnFamilyOptions extends RocksObject
+    implements ColumnFamilyOptionsInterface<ColumnFamilyOptions>,
+    MutableColumnFamilyOptionsInterface<ColumnFamilyOptions> {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Construct ColumnFamilyOptions.
+   *
+   * This constructor will create (by allocating a block of memory)
+   * an {@code rocksdb::ColumnFamilyOptions} in the c++ side.
+   */
+  public ColumnFamilyOptions() {
+    super(newColumnFamilyOptions());
+  }
+
+  /**
+   * Copy constructor for ColumnFamilyOptions.
+   *
+   * NOTE: This does a shallow copy, which means comparator, merge_operator, compaction_filter,
+   * compaction_filter_factory and other pointers will be cloned!
+   *
+   * @param other The ColumnFamilyOptions to copy.
+   */
+  public ColumnFamilyOptions(ColumnFamilyOptions other) {
+    super(copyColumnFamilyOptions(other.nativeHandle_));
+    this.memTableConfig_ = other.memTableConfig_;
+    this.tableFormatConfig_ = other.tableFormatConfig_;
+    this.comparator_ = other.comparator_;
+    this.compactionFilter_ = other.compactionFilter_;
+    this.compactionFilterFactory_ = other.compactionFilterFactory_;
+    this.compactionOptionsUniversal_ = other.compactionOptionsUniversal_;
+    this.compactionOptionsFIFO_ = other.compactionOptionsFIFO_;
+    this.bottommostCompressionOptions_ = other.bottommostCompressionOptions_;
+    this.compressionOptions_ = other.compressionOptions_;
+    this.compactionThreadLimiter_ = other.compactionThreadLimiter_;
+    this.sstPartitionerFactory_ = other.sstPartitionerFactory_;
+  }
+
+  /**
+   * Constructor from Options
+   *
+   * @param options The options.
+   */
+  public ColumnFamilyOptions(final Options options) {
+    super(newColumnFamilyOptionsFromOptions(options.nativeHandle_));
+  }
+
+  /**
+   * <p>Constructor to be used by
+   * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)},
+   * {@link ColumnFamilyDescriptor#getOptions()}
+   * and also called via JNI.</p>
+   *
+   * @param handle native handle to ColumnFamilyOptions instance.
+   */
+  ColumnFamilyOptions(final long handle) {
+    super(handle);
+  }
+
+  /**
+   * <p>Method to get a options instance by using pre-configured
+   * property values. If one or many values are undefined in
+   * the context of RocksDB the method will return a null
+   * value.</p>
+   *
+   * <p><strong>Note</strong>: Property keys can be derived from
+   * getter methods within the options class. Example: the method
+   * {@code writeBufferSize()} has a property key:
+   * {@code write_buffer_size}.</p>
+   *
+   * @param properties {@link java.util.Properties} instance.
+   *
+   * @return {@link org.rocksdb.ColumnFamilyOptions instance}
+   *     or null.
+   *
+   * @throws java.lang.IllegalArgumentException if null or empty
+   *     {@link Properties} instance is passed to the method call.
+   */
+  public static ColumnFamilyOptions getColumnFamilyOptionsFromProps(
+      final Properties properties) {
+    ColumnFamilyOptions columnFamilyOptions = null;
+    final long handle =
+        getColumnFamilyOptionsFromProps(Options.getOptionStringFromProps(properties));
+    if (handle != 0) {
+      columnFamilyOptions = new ColumnFamilyOptions(handle);
+    }
+    return columnFamilyOptions;
+  }
+
+  /**
+   * <p>Method to get a options instance by using pre-configured
+   * property values. If one or many values are undefined in
+   * the context of RocksDB the method will return a null
+   * value.</p>
+   *
+   * <p><strong>Note</strong>: Property keys can be derived from
+   * getter methods within the options class. Example: the method
+   * {@code writeBufferSize()} has a property key:
+   * {@code write_buffer_size}.</p>
+   *
+   * @param cfgOpts  ConfigOptions controlling how the properties are parsed.
+   * @param properties {@link java.util.Properties} instance.
+   *
+   * @return {@link org.rocksdb.ColumnFamilyOptions instance}
+   *     or null.
+   *
+   * @throws java.lang.IllegalArgumentException if null or empty
+   *     {@link Properties} instance is passed to the method call.
+   */
+  public static ColumnFamilyOptions getColumnFamilyOptionsFromProps(
+      final ConfigOptions cfgOpts, final Properties properties) {
+    ColumnFamilyOptions columnFamilyOptions = null;
+    final long handle = getColumnFamilyOptionsFromProps(
+        cfgOpts.nativeHandle_, Options.getOptionStringFromProps(properties));
+    if (handle != 0){
+      columnFamilyOptions = new ColumnFamilyOptions(handle);
+    }
+    return columnFamilyOptions;
+  }
+
+  @Override
+  public ColumnFamilyOptions oldDefaults(final int majorVersion, final int minorVersion) {
+    oldDefaults(nativeHandle_, majorVersion, minorVersion);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeForSmallDb() {
+    optimizeForSmallDb(nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeForSmallDb(final Cache cache) {
+    optimizeForSmallDb(nativeHandle_, cache.getNativeHandle());
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeForPointLookup(
+      final long blockCacheSizeMb) {
+    optimizeForPointLookup(nativeHandle_,
+        blockCacheSizeMb);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeLevelStyleCompaction() {
+    optimizeLevelStyleCompaction(nativeHandle_,
+        DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeLevelStyleCompaction(
+      final long memtableMemoryBudget) {
+    optimizeLevelStyleCompaction(nativeHandle_,
+        memtableMemoryBudget);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeUniversalStyleCompaction() {
+    optimizeUniversalStyleCompaction(nativeHandle_,
+        DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeUniversalStyleCompaction(
+      final long memtableMemoryBudget) {
+    optimizeUniversalStyleCompaction(nativeHandle_,
+        memtableMemoryBudget);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setComparator(
+      final BuiltinComparator builtinComparator) {
+    assert(isOwningHandle());
+    setComparatorHandle(nativeHandle_, builtinComparator.ordinal());
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setComparator(
+      final AbstractComparator comparator) {
+    assert (isOwningHandle());
+    setComparatorHandle(nativeHandle_, comparator.nativeHandle_,
+            comparator.getComparatorType().getValue());
+    comparator_ = comparator;
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setMergeOperatorName(final String name) {
+    assert (isOwningHandle());
+    if (name == null) {
+      throw new IllegalArgumentException(
+          "Merge operator name must not be null.");
+    }
+    setMergeOperatorName(nativeHandle_, name);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setMergeOperator(
+      final MergeOperator mergeOperator) {
+    setMergeOperator(nativeHandle_, mergeOperator.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompactionFilter(
+        final AbstractCompactionFilter<? extends AbstractSlice<?>>
+            compactionFilter) {
+    setCompactionFilterHandle(nativeHandle_, compactionFilter.nativeHandle_);
+    compactionFilter_ = compactionFilter;
+    return this;
+  }
+
+  @Override
+  public AbstractCompactionFilter<? extends AbstractSlice<?>> compactionFilter() {
+    assert (isOwningHandle());
+    return compactionFilter_;
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompactionFilterFactory(final AbstractCompactionFilterFactory<? extends AbstractCompactionFilter<?>> compactionFilterFactory) {
+    assert (isOwningHandle());
+    setCompactionFilterFactoryHandle(nativeHandle_, compactionFilterFactory.nativeHandle_);
+    compactionFilterFactory_ = compactionFilterFactory;
+    return this;
+  }
+
+  @Override
+  public AbstractCompactionFilterFactory<? extends AbstractCompactionFilter<?>> compactionFilterFactory() {
+    assert (isOwningHandle());
+    return compactionFilterFactory_;
+  }
+
+  @Override
+  public ColumnFamilyOptions setWriteBufferSize(final long writeBufferSize) {
+    assert(isOwningHandle());
+    setWriteBufferSize(nativeHandle_, writeBufferSize);
+    return this;
+  }
+
+  @Override
+  public long writeBufferSize()  {
+    assert(isOwningHandle());
+    return writeBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxWriteBufferNumber(
+      final int maxWriteBufferNumber) {
+    assert(isOwningHandle());
+    setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber);
+    return this;
+  }
+
+  @Override
+  public int maxWriteBufferNumber() {
+    assert(isOwningHandle());
+    return maxWriteBufferNumber(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMinWriteBufferNumberToMerge(
+      final int minWriteBufferNumberToMerge) {
+    setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge);
+    return this;
+  }
+
+  @Override
+  public int minWriteBufferNumberToMerge() {
+    return minWriteBufferNumberToMerge(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions useFixedLengthPrefixExtractor(final int n) {
+    assert(isOwningHandle());
+    useFixedLengthPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions useCappedPrefixExtractor(final int n) {
+    assert(isOwningHandle());
+    useCappedPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompressionType(
+      final CompressionType compressionType) {
+    setCompressionType(nativeHandle_, compressionType.getValue());
+    return this;
+  }
+
+  @Override
+  public CompressionType compressionType() {
+    return CompressionType.getCompressionType(compressionType(nativeHandle_));
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompressionPerLevel(
+      final List<CompressionType> compressionLevels) {
+    final byte[] byteCompressionTypes = new byte[
+        compressionLevels.size()];
+    for (int i = 0; i < compressionLevels.size(); i++) {
+      byteCompressionTypes[i] = compressionLevels.get(i).getValue();
+    }
+    setCompressionPerLevel(nativeHandle_, byteCompressionTypes);
+    return this;
+  }
+
+  @Override
+  public List<CompressionType> compressionPerLevel() {
+    final byte[] byteCompressionTypes =
+        compressionPerLevel(nativeHandle_);
+    final List<CompressionType> compressionLevels = new ArrayList<>();
+    for (final byte byteCompressionType : byteCompressionTypes) {
+      compressionLevels.add(CompressionType.getCompressionType(
+          byteCompressionType));
+    }
+    return compressionLevels;
+  }
+
+  @Override
+  public ColumnFamilyOptions setBottommostCompressionType(
+      final CompressionType bottommostCompressionType) {
+    setBottommostCompressionType(nativeHandle_,
+        bottommostCompressionType.getValue());
+    return this;
+  }
+
+  @Override
+  public CompressionType bottommostCompressionType() {
+    return CompressionType.getCompressionType(
+        bottommostCompressionType(nativeHandle_));
+  }
+
+  @Override
+  public ColumnFamilyOptions setBottommostCompressionOptions(
+      final CompressionOptions bottommostCompressionOptions) {
+    setBottommostCompressionOptions(nativeHandle_,
+        bottommostCompressionOptions.nativeHandle_);
+    this.bottommostCompressionOptions_ = bottommostCompressionOptions;
+    return this;
+  }
+
+  @Override
+  public CompressionOptions bottommostCompressionOptions() {
+    return this.bottommostCompressionOptions_;
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompressionOptions(
+      final CompressionOptions compressionOptions) {
+    setCompressionOptions(nativeHandle_, compressionOptions.nativeHandle_);
+    this.compressionOptions_ = compressionOptions;
+    return this;
+  }
+
+  @Override
+  public CompressionOptions compressionOptions() {
+    return this.compressionOptions_;
+  }
+
+  @Override
+  public ColumnFamilyOptions setNumLevels(final int numLevels) {
+    setNumLevels(nativeHandle_, numLevels);
+    return this;
+  }
+
+  @Override
+  public int numLevels() {
+    return numLevels(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevelZeroFileNumCompactionTrigger(
+      final int numFiles) {
+    setLevelZeroFileNumCompactionTrigger(
+        nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroFileNumCompactionTrigger() {
+    return levelZeroFileNumCompactionTrigger(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevelZeroSlowdownWritesTrigger(
+      final int numFiles) {
+    setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroSlowdownWritesTrigger() {
+    return levelZeroSlowdownWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevelZeroStopWritesTrigger(final int numFiles) {
+    setLevelZeroStopWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroStopWritesTrigger() {
+    return levelZeroStopWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setTargetFileSizeBase(
+      final long targetFileSizeBase) {
+    setTargetFileSizeBase(nativeHandle_, targetFileSizeBase);
+    return this;
+  }
+
+  @Override
+  public long targetFileSizeBase() {
+    return targetFileSizeBase(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setTargetFileSizeMultiplier(
+      final int multiplier) {
+    setTargetFileSizeMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+
+  @Override
+  public int targetFileSizeMultiplier() {
+    return targetFileSizeMultiplier(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxBytesForLevelBase(
+      final long maxBytesForLevelBase) {
+    setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase);
+    return this;
+  }
+
+  @Override
+  public long maxBytesForLevelBase() {
+    return maxBytesForLevelBase(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevelCompactionDynamicLevelBytes(
+      final boolean enableLevelCompactionDynamicLevelBytes) {
+    setLevelCompactionDynamicLevelBytes(nativeHandle_,
+        enableLevelCompactionDynamicLevelBytes);
+    return this;
+  }
+
+  @Override
+  public boolean levelCompactionDynamicLevelBytes() {
+    return levelCompactionDynamicLevelBytes(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxBytesForLevelMultiplier(final double multiplier) {
+    setMaxBytesForLevelMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+
+  @Override
+  public double maxBytesForLevelMultiplier() {
+    return maxBytesForLevelMultiplier(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxCompactionBytes(final long maxCompactionBytes) {
+    setMaxCompactionBytes(nativeHandle_, maxCompactionBytes);
+    return this;
+  }
+
+  @Override
+  public long maxCompactionBytes() {
+    return maxCompactionBytes(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setArenaBlockSize(
+      final long arenaBlockSize) {
+    setArenaBlockSize(nativeHandle_, arenaBlockSize);
+    return this;
+  }
+
+  @Override
+  public long arenaBlockSize() {
+    return arenaBlockSize(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setDisableAutoCompactions(
+      final boolean disableAutoCompactions) {
+    setDisableAutoCompactions(nativeHandle_, disableAutoCompactions);
+    return this;
+  }
+
+  @Override
+  public boolean disableAutoCompactions() {
+    return disableAutoCompactions(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompactionStyle(
+      final CompactionStyle compactionStyle) {
+    setCompactionStyle(nativeHandle_, compactionStyle.getValue());
+    return this;
+  }
+
+  @Override
+  public CompactionStyle compactionStyle() {
+    return CompactionStyle.fromValue(compactionStyle(nativeHandle_));
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxTableFilesSizeFIFO(
+      final long maxTableFilesSize) {
+    assert(maxTableFilesSize > 0); // unsigned native type
+    assert(isOwningHandle());
+    setMaxTableFilesSizeFIFO(nativeHandle_, maxTableFilesSize);
+    return this;
+  }
+
+  @Override
+  public long maxTableFilesSizeFIFO() {
+    return maxTableFilesSizeFIFO(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxSequentialSkipInIterations(
+      final long maxSequentialSkipInIterations) {
+    setMaxSequentialSkipInIterations(nativeHandle_,
+        maxSequentialSkipInIterations);
+    return this;
+  }
+
+  @Override
+  public long maxSequentialSkipInIterations() {
+    return maxSequentialSkipInIterations(nativeHandle_);
+  }
+
+  @Override
+  public MemTableConfig memTableConfig() {
+    return this.memTableConfig_;
+  }
+
+  @Override
+  public ColumnFamilyOptions setMemTableConfig(
+      final MemTableConfig memTableConfig) {
+    setMemTableFactory(
+        nativeHandle_, memTableConfig.newMemTableFactoryHandle());
+    this.memTableConfig_ = memTableConfig;
+    return this;
+  }
+
+  @Override
+  public String memTableFactoryName() {
+    assert(isOwningHandle());
+    return memTableFactoryName(nativeHandle_);
+  }
+
+  @Override
+  public TableFormatConfig tableFormatConfig() {
+    return this.tableFormatConfig_;
+  }
+
+  @Override
+  public ColumnFamilyOptions setTableFormatConfig(
+      final TableFormatConfig tableFormatConfig) {
+    setTableFactory(nativeHandle_, tableFormatConfig.newTableFactoryHandle());
+    this.tableFormatConfig_ = tableFormatConfig;
+    return this;
+  }
+
+  @Override
+  public String tableFactoryName() {
+    assert(isOwningHandle());
+    return tableFactoryName(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setCfPaths(final Collection<DbPath> cfPaths) {
+    assert (isOwningHandle());
+
+    final int len = cfPaths.size();
+    final String[] paths = new String[len];
+    final long[] targetSizes = new long[len];
+
+    int i = 0;
+    for (final DbPath dbPath : cfPaths) {
+      paths[i] = dbPath.path.toString();
+      targetSizes[i] = dbPath.targetSize;
+      i++;
+    }
+    setCfPaths(nativeHandle_, paths, targetSizes);
+    return this;
+  }
+
+  @Override
+  public List<DbPath> cfPaths() {
+    final int len = (int) cfPathsLen(nativeHandle_);
+
+    if (len == 0) {
+      return Collections.emptyList();
+    }
+
+    final String[] paths = new String[len];
+    final long[] targetSizes = new long[len];
+
+    cfPaths(nativeHandle_, paths, targetSizes);
+
+    final List<DbPath> cfPaths = new ArrayList<>();
+    for (int i = 0; i < len; i++) {
+      cfPaths.add(new DbPath(Paths.get(paths[i]), targetSizes[i]));
+    }
+
+    return cfPaths;
+  }
+
+  @Override
+  public ColumnFamilyOptions setInplaceUpdateSupport(
+      final boolean inplaceUpdateSupport) {
+    setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport);
+    return this;
+  }
+
+  @Override
+  public boolean inplaceUpdateSupport() {
+    return inplaceUpdateSupport(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setInplaceUpdateNumLocks(
+      final long inplaceUpdateNumLocks) {
+    setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks);
+    return this;
+  }
+
+  @Override
+  public long inplaceUpdateNumLocks() {
+    return inplaceUpdateNumLocks(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMemtablePrefixBloomSizeRatio(
+      final double memtablePrefixBloomSizeRatio) {
+    setMemtablePrefixBloomSizeRatio(nativeHandle_, memtablePrefixBloomSizeRatio);
+    return this;
+  }
+
+  @Override
+  public double memtablePrefixBloomSizeRatio() {
+    return memtablePrefixBloomSizeRatio(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setExperimentalMempurgeThreshold(
+      final double experimentalMempurgeThreshold) {
+    setExperimentalMempurgeThreshold(nativeHandle_, experimentalMempurgeThreshold);
+    return this;
+  }
+
+  @Override
+  public double experimentalMempurgeThreshold() {
+    return experimentalMempurgeThreshold(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMemtableWholeKeyFiltering(final boolean memtableWholeKeyFiltering) {
+    setMemtableWholeKeyFiltering(nativeHandle_, memtableWholeKeyFiltering);
+    return this;
+  }
+
+  @Override
+  public boolean memtableWholeKeyFiltering() {
+    return memtableWholeKeyFiltering(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setBloomLocality(int bloomLocality) {
+    setBloomLocality(nativeHandle_, bloomLocality);
+    return this;
+  }
+
+  @Override
+  public int bloomLocality() {
+    return bloomLocality(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxSuccessiveMerges(
+      final long maxSuccessiveMerges) {
+    setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges);
+    return this;
+  }
+
+  @Override
+  public long maxSuccessiveMerges() {
+    return maxSuccessiveMerges(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setOptimizeFiltersForHits(
+      final boolean optimizeFiltersForHits) {
+    setOptimizeFiltersForHits(nativeHandle_, optimizeFiltersForHits);
+    return this;
+  }
+
+  @Override
+  public boolean optimizeFiltersForHits() {
+    return optimizeFiltersForHits(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions
+  setMemtableHugePageSize(
+      long memtableHugePageSize) {
+    setMemtableHugePageSize(nativeHandle_,
+        memtableHugePageSize);
+    return this;
+  }
+
+  @Override
+  public long memtableHugePageSize() {
+    return memtableHugePageSize(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setSoftPendingCompactionBytesLimit(long softPendingCompactionBytesLimit) {
+    setSoftPendingCompactionBytesLimit(nativeHandle_,
+        softPendingCompactionBytesLimit);
+    return this;
+  }
+
+  @Override
+  public long softPendingCompactionBytesLimit() {
+    return softPendingCompactionBytesLimit(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setHardPendingCompactionBytesLimit(long hardPendingCompactionBytesLimit) {
+    setHardPendingCompactionBytesLimit(nativeHandle_, hardPendingCompactionBytesLimit);
+    return this;
+  }
+
+  @Override
+  public long hardPendingCompactionBytesLimit() {
+    return hardPendingCompactionBytesLimit(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevel0FileNumCompactionTrigger(int level0FileNumCompactionTrigger) {
+    setLevel0FileNumCompactionTrigger(nativeHandle_, level0FileNumCompactionTrigger);
+    return this;
+  }
+
+  @Override
+  public int level0FileNumCompactionTrigger() {
+    return level0FileNumCompactionTrigger(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevel0SlowdownWritesTrigger(int level0SlowdownWritesTrigger) {
+    setLevel0SlowdownWritesTrigger(nativeHandle_, level0SlowdownWritesTrigger);
+    return this;
+  }
+
+  @Override
+  public int level0SlowdownWritesTrigger() {
+    return level0SlowdownWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevel0StopWritesTrigger(int level0StopWritesTrigger) {
+    setLevel0StopWritesTrigger(nativeHandle_, level0StopWritesTrigger);
+    return this;
+  }
+
+  @Override
+  public int level0StopWritesTrigger() {
+    return level0StopWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxBytesForLevelMultiplierAdditional(int[] maxBytesForLevelMultiplierAdditional) {
+    setMaxBytesForLevelMultiplierAdditional(nativeHandle_, maxBytesForLevelMultiplierAdditional);
+    return this;
+  }
+
+  @Override
+  public int[] maxBytesForLevelMultiplierAdditional() {
+    return maxBytesForLevelMultiplierAdditional(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setParanoidFileChecks(boolean paranoidFileChecks) {
+    setParanoidFileChecks(nativeHandle_, paranoidFileChecks);
+    return this;
+  }
+
+  @Override
+  public boolean paranoidFileChecks() {
+    return paranoidFileChecks(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxWriteBufferNumberToMaintain(
+      final int maxWriteBufferNumberToMaintain) {
+    setMaxWriteBufferNumberToMaintain(
+        nativeHandle_, maxWriteBufferNumberToMaintain);
+    return this;
+  }
+
+  @Override
+  public int maxWriteBufferNumberToMaintain() {
+    return maxWriteBufferNumberToMaintain(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompactionPriority(
+      final CompactionPriority compactionPriority) {
+    setCompactionPriority(nativeHandle_, compactionPriority.getValue());
+    return this;
+  }
+
+  @Override
+  public CompactionPriority compactionPriority() {
+    return CompactionPriority.getCompactionPriority(
+        compactionPriority(nativeHandle_));
+  }
+
+  @Override
+  public ColumnFamilyOptions setReportBgIoStats(final boolean reportBgIoStats) {
+    setReportBgIoStats(nativeHandle_, reportBgIoStats);
+    return this;
+  }
+
+  @Override
+  public boolean reportBgIoStats() {
+    return reportBgIoStats(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setTtl(final long ttl) {
+    setTtl(nativeHandle_, ttl);
+    return this;
+  }
+
+  @Override
+  public long ttl() {
+    return ttl(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setPeriodicCompactionSeconds(final long periodicCompactionSeconds) {
+    setPeriodicCompactionSeconds(nativeHandle_, periodicCompactionSeconds);
+    return this;
+  }
+
+  @Override
+  public long periodicCompactionSeconds() {
+    return periodicCompactionSeconds(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompactionOptionsUniversal(
+      final CompactionOptionsUniversal compactionOptionsUniversal) {
+    setCompactionOptionsUniversal(nativeHandle_,
+        compactionOptionsUniversal.nativeHandle_);
+    this.compactionOptionsUniversal_ = compactionOptionsUniversal;
+    return this;
+  }
+
+  @Override
+  public CompactionOptionsUniversal compactionOptionsUniversal() {
+    return this.compactionOptionsUniversal_;
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompactionOptionsFIFO(final CompactionOptionsFIFO compactionOptionsFIFO) {
+    setCompactionOptionsFIFO(nativeHandle_,
+        compactionOptionsFIFO.nativeHandle_);
+    this.compactionOptionsFIFO_ = compactionOptionsFIFO;
+    return this;
+  }
+
+  @Override
+  public CompactionOptionsFIFO compactionOptionsFIFO() {
+    return this.compactionOptionsFIFO_;
+  }
+
+  @Override
+  public ColumnFamilyOptions setForceConsistencyChecks(final boolean forceConsistencyChecks) {
+    setForceConsistencyChecks(nativeHandle_, forceConsistencyChecks);
+    return this;
+  }
+
+  @Override
+  public boolean forceConsistencyChecks() {
+    return forceConsistencyChecks(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setSstPartitionerFactory(SstPartitionerFactory sstPartitionerFactory) {
+    setSstPartitionerFactory(nativeHandle_, sstPartitionerFactory.nativeHandle_);
+    this.sstPartitionerFactory_ = sstPartitionerFactory;
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompactionThreadLimiter(
+      final ConcurrentTaskLimiter compactionThreadLimiter) {
+    setCompactionThreadLimiter(nativeHandle_, compactionThreadLimiter.nativeHandle_);
+    this.compactionThreadLimiter_ = compactionThreadLimiter;
+    return this;
+  }
+
+  @Override
+  public ConcurrentTaskLimiter compactionThreadLimiter() {
+    assert (isOwningHandle());
+    return this.compactionThreadLimiter_;
+  }
+
+  @Override
+  public SstPartitionerFactory sstPartitionerFactory() {
+    return sstPartitionerFactory_;
+  }
+
+  //
+  // BEGIN options for blobs (integrated BlobDB)
+  //
+
+  /**
+   * When set, large values (blobs) are written to separate blob files, and only
+   * pointers to them are stored in SST files. This can reduce write amplification
+   * for large-value use cases at the cost of introducing a level of indirection
+   * for reads. See also the options min_blob_size, blob_file_size,
+   * blob_compression_type, enable_blob_garbage_collection, and
+   * blob_garbage_collection_age_cutoff below.
+   *
+   * Default: false
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param enableBlobFiles true iff blob files should be enabled
+   *
+   * @return the reference to the current options.
+   */
+  @Override
+  public ColumnFamilyOptions setEnableBlobFiles(final boolean enableBlobFiles) {
+    setEnableBlobFiles(nativeHandle_, enableBlobFiles);
+    return this;
+  }
+
+  /**
+   * When set, large values (blobs) are written to separate blob files, and only
+   * pointers to them are stored in SST files. This can reduce write amplification
+   * for large-value use cases at the cost of introducing a level of indirection
+   * for reads. See also the options min_blob_size, blob_file_size,
+   * blob_compression_type, enable_blob_garbage_collection, and
+   * blob_garbage_collection_age_cutoff below.
+   *
+   * Default: false
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @return true iff blob files are currently enabled
+   */
+  public boolean enableBlobFiles() {
+    return enableBlobFiles(nativeHandle_);
+  }
+
+  /**
+   * Set the size of the smallest value to be stored separately in a blob file. Values
+   * which have an uncompressed size smaller than this threshold are stored
+   * alongside the keys in SST files in the usual fashion. A value of zero for
+   * this option means that all values are stored in blob files. Note that
+   * enable_blob_files has to be set in order for this option to have any effect.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param minBlobSize the size of the smallest value to be stored separately in a blob file
+   * @return these options, updated with the supplied minimum blob size value
+   */
+  @Override
+  public ColumnFamilyOptions setMinBlobSize(final long minBlobSize) {
+    setMinBlobSize(nativeHandle_, minBlobSize);
+    return this;
+  }
+
+  /**
+   * Get the size of the smallest value to be stored separately in a blob file. Values
+   * which have an uncompressed size smaller than this threshold are stored
+   * alongside the keys in SST files in the usual fashion. A value of zero for
+   * this option means that all values are stored in blob files. Note that
+   * enable_blob_files has to be set in order for this option to have any effect.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @return the current minimum blob size
+   */
+  @Override
+  public long minBlobSize() {
+    return minBlobSize(nativeHandle_);
+  }
+
+  /**
+   * Set the size limit for blob files. When writing blob files, a new file is opened
+   * once this limit is reached. Note that enable_blob_files has to be set in
+   * order for this option to have any effect.
+   *
+   * Default: 256 MB
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param blobFileSize the new size limit for blob files
+   *
+   * @return the reference to the current options.
+   */
+  @Override
+  public ColumnFamilyOptions setBlobFileSize(final long blobFileSize) {
+    setBlobFileSize(nativeHandle_, blobFileSize);
+    return this;
+  }
+
+  /**
+   * Get the size limit for blob files. When writing blob files, a new file is opened
+   * once this limit is reached. Note that enable_blob_files has to be set in
+   * order for this option to have any effect.
+   *
+   * Default: 256 MB
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @return the size limit for blob files
+   */
+  @Override
+  public long blobFileSize() {
+    return blobFileSize(nativeHandle_);
+  }
+
+  /**
+   * Set the compression algorithm to use for large values stored in blob files. Note
+   * that enable_blob_files has to be set in order for this option to have any
+   * effect.
+   *
+   * Default: no compression
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param compressionType the compression algorithm to use
+   *
+   * @return the reference to the current options.
+   */
+  @Override
+  public ColumnFamilyOptions setBlobCompressionType(final CompressionType compressionType) {
+    setBlobCompressionType(nativeHandle_, compressionType.getValue());
+    return this;
+  }
+
+  /**
+   * Get the compression algorithm to use for large values stored in blob files. Note
+   * that enable_blob_files has to be set in order for this option to have any
+   * effect.
+   *
+   * Default: no compression
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @return the compression algorithm currently in use for blobs
+   */
+  @Override
+  public CompressionType blobCompressionType() {
+    return CompressionType.values()[blobCompressionType(nativeHandle_)];
+  }
+
+  /**
+   * Enable/disable garbage collection of blobs. Blob GC is performed as part of
+   * compaction. Valid blobs residing in blob files older than a cutoff get
+   * relocated to new files as they are encountered during compaction, which makes
+   * it possible to clean up blob files once they contain nothing but
+   * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below.
+   *
+   * Default: false
+   *
+   * @param enableBlobGarbageCollection true iff blob garbage collection is to be enabled
+   *
+   * @return the reference to the current options.
+   */
+  @Override
+  public ColumnFamilyOptions setEnableBlobGarbageCollection(
+      final boolean enableBlobGarbageCollection) {
+    setEnableBlobGarbageCollection(nativeHandle_, enableBlobGarbageCollection);
+    return this;
+  }
+
+  /**
+   * Get enabled/disables state for garbage collection of blobs. Blob GC is performed as part of
+   * compaction. Valid blobs residing in blob files older than a cutoff get
+   * relocated to new files as they are encountered during compaction, which makes
+   * it possible to clean up blob files once they contain nothing but
+   * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below.
+   *
+   * Default: false
+   *
+   * @return true iff blob garbage collection is currently enabled
+   */
+  @Override
+  public boolean enableBlobGarbageCollection() {
+    return enableBlobGarbageCollection(nativeHandle_);
+  }
+
+  /**
+   * Set the cutoff in terms of blob file age for garbage collection. Blobs in the
+   * oldest N blob files will be relocated when encountered during compaction,
+   * where N = garbage_collection_cutoff * number_of_blob_files. Note that
+   * enable_blob_garbage_collection has to be set in order for this option to have
+   * any effect.
+   *
+   * Default: 0.25
+   *
+   * @param blobGarbageCollectionAgeCutoff the new blob garbage collection age cutoff
+   *
+   * @return the reference to the current options.
+   */
+  @Override
+  public ColumnFamilyOptions setBlobGarbageCollectionAgeCutoff(
+      final double blobGarbageCollectionAgeCutoff) {
+    setBlobGarbageCollectionAgeCutoff(nativeHandle_, blobGarbageCollectionAgeCutoff);
+    return this;
+  }
+
+  /**
+   * Get the cutoff in terms of blob file age for garbage collection. Blobs in the
+   * oldest N blob files will be relocated when encountered during compaction,
+   * where N = garbage_collection_cutoff * number_of_blob_files. Note that
+   * enable_blob_garbage_collection has to be set in order for this option to have
+   * any effect.
+   *
+   * Default: 0.25
+   *
+   * @return the current blob garbage collection age cutoff
+   */
+  @Override
+  public double blobGarbageCollectionAgeCutoff() {
+    return blobGarbageCollectionAgeCutoff(nativeHandle_);
+  }
+
+  /**
+   *  If the ratio of garbage in the oldest blob files exceeds this threshold,
+   *  targeted compactions are scheduled in order to force garbage collecting
+   *  the blob files in question, assuming they are all eligible based on the
+   *  value of {@link #blobGarbageCollectionAgeCutoff} above. This option is
+   *  currently only supported with leveled compactions.
+   *
+   *  Note that {@link #enableBlobGarbageCollection} has to be set in order for this
+   *  option to have any effect.
+   *
+   *  Default: 1.0
+   *
+   * Dynamically changeable through the SetOptions() API
+   *
+   * @param blobGarbageCollectionForceThreshold new value for the threshold
+   * @return the reference to the current options
+   */
+  @Override
+  public ColumnFamilyOptions setBlobGarbageCollectionForceThreshold(
+      final double blobGarbageCollectionForceThreshold) {
+    setBlobGarbageCollectionForceThreshold(nativeHandle_, blobGarbageCollectionForceThreshold);
+    return this;
+  }
+
+  /**
+   * Get the current value for the {@link #blobGarbageCollectionForceThreshold}
+   * @return the current threshold at which garbage collection of blobs is forced
+   */
+  @Override
+  public double blobGarbageCollectionForceThreshold() {
+    return blobGarbageCollectionForceThreshold(nativeHandle_);
+  }
+
+  /**
+   * Set compaction readahead for blob files.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param blobCompactionReadaheadSize the compaction readahead for blob files
+   *
+   * @return the reference to the current options.
+   */
+  @Override
+  public ColumnFamilyOptions setBlobCompactionReadaheadSize(
+      final long blobCompactionReadaheadSize) {
+    setBlobCompactionReadaheadSize(nativeHandle_, blobCompactionReadaheadSize);
+    return this;
+  }
+
+  /**
+   * Get compaction readahead for blob files.
+   *
+   * @return the current compaction readahead for blob files
+   */
+  @Override
+  public long blobCompactionReadaheadSize() {
+    return blobCompactionReadaheadSize(nativeHandle_);
+  }
+
+  /**
+   * Set a certain LSM tree level to enable blob files.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param blobFileStartingLevel the starting level to enable blob files
+   *
+   * @return the reference to the current options.
+   */
+  @Override
+  public ColumnFamilyOptions setBlobFileStartingLevel(final int blobFileStartingLevel) {
+    setBlobFileStartingLevel(nativeHandle_, blobFileStartingLevel);
+    return this;
+  }
+
+  /**
+   * Get the starting LSM tree level to enable blob files.
+   *
+   * Default: 0
+   *
+   * @return the current LSM tree level to enable blob files.
+   */
+  @Override
+  public int blobFileStartingLevel() {
+    return blobFileStartingLevel(nativeHandle_);
+  }
+
+  /**
+   * Set a certain prepopulate blob cache option.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through
+   * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}.
+   *
+   * @param prepopulateBlobCache the prepopulate blob cache option
+   *
+   * @return the reference to the current options.
+   */
+  @Override
+  public ColumnFamilyOptions setPrepopulateBlobCache(
+      final PrepopulateBlobCache prepopulateBlobCache) {
+    setPrepopulateBlobCache(nativeHandle_, prepopulateBlobCache.getValue());
+    return this;
+  }
+
+  /**
+   * Get the prepopulate blob cache option.
+   *
+   * Default: 0
+   *
+   * @return the current prepopulate blob cache option.
+   */
+  @Override
+  public PrepopulateBlobCache prepopulateBlobCache() {
+    return PrepopulateBlobCache.getPrepopulateBlobCache(prepopulateBlobCache(nativeHandle_));
+  }
+
+  //
+  // END options for blobs (integrated BlobDB)
+  //
+
+  private static native long getColumnFamilyOptionsFromProps(
+      final long cfgHandle, String optString);
+  private static native long getColumnFamilyOptionsFromProps(final String optString);
+
+  private static native long newColumnFamilyOptions();
+  private static native long copyColumnFamilyOptions(final long handle);
+  private static native long newColumnFamilyOptionsFromOptions(
+      final long optionsHandle);
+  @Override protected final native void disposeInternal(final long handle);
+
+  private static native void oldDefaults(
+      final long handle, final int majorVersion, final int minorVersion);
+  private native void optimizeForSmallDb(final long handle);
+  private static native void optimizeForSmallDb(final long handle, final long cacheHandle);
+  private native void optimizeForPointLookup(long handle,
+      long blockCacheSizeMb);
+  private native void optimizeLevelStyleCompaction(long handle,
+      long memtableMemoryBudget);
+  private native void optimizeUniversalStyleCompaction(long handle,
+      long memtableMemoryBudget);
+  private native void setComparatorHandle(long handle, int builtinComparator);
+  private native void setComparatorHandle(long optHandle,
+      long comparatorHandle, byte comparatorType);
+  private native void setMergeOperatorName(long handle, String name);
+  private native void setMergeOperator(long handle, long mergeOperatorHandle);
+  private native void setCompactionFilterHandle(long handle,
+      long compactionFilterHandle);
+  private native void setCompactionFilterFactoryHandle(long handle,
+      long compactionFilterFactoryHandle);
+  private native void setWriteBufferSize(long handle, long writeBufferSize)
+      throws IllegalArgumentException;
+  private native long writeBufferSize(long handle);
+  private native void setMaxWriteBufferNumber(
+      long handle, int maxWriteBufferNumber);
+  private native int maxWriteBufferNumber(long handle);
+  private native void setMinWriteBufferNumberToMerge(
+      long handle, int minWriteBufferNumberToMerge);
+  private native int minWriteBufferNumberToMerge(long handle);
+  private native void setCompressionType(long handle, byte compressionType);
+  private native byte compressionType(long handle);
+  private native void setCompressionPerLevel(long handle,
+      byte[] compressionLevels);
+  private native byte[] compressionPerLevel(long handle);
+  private native void setBottommostCompressionType(long handle,
+      byte bottommostCompressionType);
+  private native byte bottommostCompressionType(long handle);
+  private native void setBottommostCompressionOptions(final long handle,
+      final long bottommostCompressionOptionsHandle);
+  private native void setCompressionOptions(long handle,
+      long compressionOptionsHandle);
+  private native void useFixedLengthPrefixExtractor(
+      long handle, int prefixLength);
+  private native void useCappedPrefixExtractor(
+      long handle, int prefixLength);
+  private native void setNumLevels(
+      long handle, int numLevels);
+  private native int numLevels(long handle);
+  private native void setLevelZeroFileNumCompactionTrigger(
+      long handle, int numFiles);
+  private native int levelZeroFileNumCompactionTrigger(long handle);
+  private native void setLevelZeroSlowdownWritesTrigger(
+      long handle, int numFiles);
+  private native int levelZeroSlowdownWritesTrigger(long handle);
+  private native void setLevelZeroStopWritesTrigger(
+      long handle, int numFiles);
+  private native int levelZeroStopWritesTrigger(long handle);
+  private native void setTargetFileSizeBase(
+      long handle, long targetFileSizeBase);
+  private native long targetFileSizeBase(long handle);
+  private native void setTargetFileSizeMultiplier(
+      long handle, int multiplier);
+  private native int targetFileSizeMultiplier(long handle);
+  private native void setMaxBytesForLevelBase(
+      long handle, long maxBytesForLevelBase);
+  private native long maxBytesForLevelBase(long handle);
+  private native void setLevelCompactionDynamicLevelBytes(
+      long handle, boolean enableLevelCompactionDynamicLevelBytes);
+  private native boolean levelCompactionDynamicLevelBytes(
+      long handle);
+  private native void setMaxBytesForLevelMultiplier(long handle, double multiplier);
+  private native double maxBytesForLevelMultiplier(long handle);
+  private native void setMaxCompactionBytes(long handle, long maxCompactionBytes);
+  private native long maxCompactionBytes(long handle);
+  private native void setArenaBlockSize(
+      long handle, long arenaBlockSize)
+      throws IllegalArgumentException;
+  private native long arenaBlockSize(long handle);
+  private native void setDisableAutoCompactions(
+      long handle, boolean disableAutoCompactions);
+  private native boolean disableAutoCompactions(long handle);
+  private native void setCompactionStyle(long handle, byte compactionStyle);
+  private native byte compactionStyle(long handle);
+   private native void setMaxTableFilesSizeFIFO(
+      long handle, long max_table_files_size);
+  private native long maxTableFilesSizeFIFO(long handle);
+  private native void setMaxSequentialSkipInIterations(
+      long handle, long maxSequentialSkipInIterations);
+  private native long maxSequentialSkipInIterations(long handle);
+  private native void setMemTableFactory(long handle, long factoryHandle);
+  private native String memTableFactoryName(long handle);
+  private native void setTableFactory(long handle, long factoryHandle);
+  private native String tableFactoryName(long handle);
+  private static native void setCfPaths(
+      final long handle, final String[] paths, final long[] targetSizes);
+  private static native long cfPathsLen(final long handle);
+  private static native void cfPaths(
+      final long handle, final String[] paths, final long[] targetSizes);
+  private native void setInplaceUpdateSupport(
+      long handle, boolean inplaceUpdateSupport);
+  private native boolean inplaceUpdateSupport(long handle);
+  private native void setInplaceUpdateNumLocks(
+      long handle, long inplaceUpdateNumLocks)
+      throws IllegalArgumentException;
+  private native long inplaceUpdateNumLocks(long handle);
+  private native void setMemtablePrefixBloomSizeRatio(
+      long handle, double memtablePrefixBloomSizeRatio);
+  private native double memtablePrefixBloomSizeRatio(long handle);
+  private native void setExperimentalMempurgeThreshold(
+      long handle, double experimentalMempurgeThreshold);
+  private native double experimentalMempurgeThreshold(long handle);
+  private native void setMemtableWholeKeyFiltering(long handle, boolean memtableWholeKeyFiltering);
+  private native boolean memtableWholeKeyFiltering(long handle);
+  private native void setBloomLocality(
+      long handle, int bloomLocality);
+  private native int bloomLocality(long handle);
+  private native void setMaxSuccessiveMerges(
+      long handle, long maxSuccessiveMerges)
+      throws IllegalArgumentException;
+  private native long maxSuccessiveMerges(long handle);
+  private native void setOptimizeFiltersForHits(long handle,
+      boolean optimizeFiltersForHits);
+  private native boolean optimizeFiltersForHits(long handle);
+  private native void setMemtableHugePageSize(long handle,
+      long memtableHugePageSize);
+  private native long memtableHugePageSize(long handle);
+  private native void setSoftPendingCompactionBytesLimit(long handle,
+      long softPendingCompactionBytesLimit);
+  private native long softPendingCompactionBytesLimit(long handle);
+  private native void setHardPendingCompactionBytesLimit(long handle,
+      long hardPendingCompactionBytesLimit);
+  private native long hardPendingCompactionBytesLimit(long handle);
+  private native void setLevel0FileNumCompactionTrigger(long handle,
+      int level0FileNumCompactionTrigger);
+  private native int level0FileNumCompactionTrigger(long handle);
+  private native void setLevel0SlowdownWritesTrigger(long handle,
+      int level0SlowdownWritesTrigger);
+  private native int level0SlowdownWritesTrigger(long handle);
+  private native void setLevel0StopWritesTrigger(long handle,
+      int level0StopWritesTrigger);
+  private native int level0StopWritesTrigger(long handle);
+  private native void setMaxBytesForLevelMultiplierAdditional(long handle,
+      int[] maxBytesForLevelMultiplierAdditional);
+  private native int[] maxBytesForLevelMultiplierAdditional(long handle);
+  private native void setParanoidFileChecks(long handle,
+      boolean paranoidFileChecks);
+  private native boolean paranoidFileChecks(long handle);
+  private native void setMaxWriteBufferNumberToMaintain(final long handle,
+      final int maxWriteBufferNumberToMaintain);
+  private native int maxWriteBufferNumberToMaintain(final long handle);
+  private native void setCompactionPriority(final long handle,
+      final byte compactionPriority);
+  private native byte compactionPriority(final long handle);
+  private native void setReportBgIoStats(final long handle,
+    final boolean reportBgIoStats);
+  private native boolean reportBgIoStats(final long handle);
+  private native void setTtl(final long handle, final long ttl);
+  private native long ttl(final long handle);
+  private native void setPeriodicCompactionSeconds(
+      final long handle, final long periodicCompactionSeconds);
+  private native long periodicCompactionSeconds(final long handle);
+  private native void setCompactionOptionsUniversal(final long handle,
+    final long compactionOptionsUniversalHandle);
+  private native void setCompactionOptionsFIFO(final long handle,
+    final long compactionOptionsFIFOHandle);
+  private native void setForceConsistencyChecks(final long handle,
+    final boolean forceConsistencyChecks);
+  private native boolean forceConsistencyChecks(final long handle);
+  private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle);
+  private static native void setCompactionThreadLimiter(
+      final long nativeHandle_, final long compactionThreadLimiterHandle);
+
+  private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles);
+  private native boolean enableBlobFiles(final long nativeHandle_);
+  private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize);
+  private native long minBlobSize(final long nativeHandle_);
+  private native void setBlobFileSize(final long nativeHandle_, final long blobFileSize);
+  private native long blobFileSize(final long nativeHandle_);
+  private native void setBlobCompressionType(final long nativeHandle_, final byte compressionType);
+  private native byte blobCompressionType(final long nativeHandle_);
+  private native void setEnableBlobGarbageCollection(
+      final long nativeHandle_, final boolean enableBlobGarbageCollection);
+  private native boolean enableBlobGarbageCollection(final long nativeHandle_);
+  private native void setBlobGarbageCollectionAgeCutoff(
+      final long nativeHandle_, final double blobGarbageCollectionAgeCutoff);
+  private native double blobGarbageCollectionAgeCutoff(final long nativeHandle_);
+  private native void setBlobGarbageCollectionForceThreshold(
+      final long nativeHandle_, final double blobGarbageCollectionForceThreshold);
+  private native double blobGarbageCollectionForceThreshold(final long nativeHandle_);
+  private native void setBlobCompactionReadaheadSize(
+      final long nativeHandle_, final long blobCompactionReadaheadSize);
+  private native long blobCompactionReadaheadSize(final long nativeHandle_);
+  private native void setBlobFileStartingLevel(
+      final long nativeHandle_, final int blobFileStartingLevel);
+  private native int blobFileStartingLevel(final long nativeHandle_);
+  private native void setPrepopulateBlobCache(
+      final long nativeHandle_, final byte prepopulateBlobCache);
+  private native byte prepopulateBlobCache(final long nativeHandle_);
+
+  // instance variables
+  // NOTE: If you add new member variables, please update the copy constructor above!
+  private MemTableConfig memTableConfig_;
+  private TableFormatConfig tableFormatConfig_;
+  private AbstractComparator comparator_;
+  private AbstractCompactionFilter<? extends AbstractSlice<?>> compactionFilter_;
+  private AbstractCompactionFilterFactory<? extends AbstractCompactionFilter<?>>
+      compactionFilterFactory_;
+  private CompactionOptionsUniversal compactionOptionsUniversal_;
+  private CompactionOptionsFIFO compactionOptionsFIFO_;
+  private CompressionOptions bottommostCompressionOptions_;
+  private CompressionOptions compressionOptions_;
+  private SstPartitionerFactory sstPartitionerFactory_;
+  private ConcurrentTaskLimiter compactionThreadLimiter_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
new file mode 100644
index 000000000..97357aacf
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
@@ -0,0 +1,536 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Collection;
+import java.util.List;
+
+public interface ColumnFamilyOptionsInterface<T extends ColumnFamilyOptionsInterface<T>>
+    extends AdvancedColumnFamilyOptionsInterface<T> {
+  /**
+   * The function recovers options to a previous version. Only 4.6 or later
+   * versions are supported.
+   *
+   * @param majorVersion The major version to recover default values of options
+   * @param minorVersion The minor version to recover default values of options
+   * @return the instance of the current object.
+   */
+  T oldDefaults(int majorVersion, int minorVersion);
+
+  /**
+   * Use this if your DB is very small (like under 1GB) and you don't want to
+   * spend lots of memory for memtables.
+   *
+   * @return the instance of the current object.
+   */
+  T optimizeForSmallDb();
+
+  /**
+   * Some functions that make it easier to optimize RocksDB
+   * Use this if your DB is very small (like under 1GB) and you don't want to
+   * spend lots of memory for memtables.
+   *
+   * @param cache An optional cache object is passed in to be used as the block cache
+   * @return the instance of the current object.
+   */
+  T optimizeForSmallDb(Cache cache);
+
+  /**
+   * Use this if you don't need to keep the data sorted, i.e. you'll never use
+   * an iterator, only Put() and Get() API calls
+   *
+   * @param blockCacheSizeMb Block cache size in MB
+   * @return the instance of the current object.
+   */
+  T optimizeForPointLookup(long blockCacheSizeMb);
+
+  /**
+   * <p>Default values for some parameters in ColumnFamilyOptions are not
+   * optimized for heavy workloads and big datasets, which means you might
+   * observe write stalls under some conditions. As a starting point for tuning
+   * RocksDB options, use the following for level style compaction.</p>
+   *
+   * <p>Make sure to also call IncreaseParallelism(), which will provide the
+   * biggest performance gains.</p>
+   * <p>Note: we might use more memory than memtable_memory_budget during high
+   * write rate period</p>
+   *
+   * @return the instance of the current object.
+   */
+  T optimizeLevelStyleCompaction();
+
+  /**
+   * <p>Default values for some parameters in ColumnFamilyOptions are not
+   * optimized for heavy workloads and big datasets, which means you might
+   * observe write stalls under some conditions. As a starting point for tuning
+   * RocksDB options, use the following for level style compaction.</p>
+   *
+   * <p>Make sure to also call IncreaseParallelism(), which will provide the
+   * biggest performance gains.</p>
+   * <p>Note: we might use more memory than memtable_memory_budget during high
+   * write rate period</p>
+   *
+   * @param memtableMemoryBudget memory budget in bytes
+   * @return the instance of the current object.
+   */
+  T optimizeLevelStyleCompaction(
+      long memtableMemoryBudget);
+
+  /**
+   * <p>Default values for some parameters in ColumnFamilyOptions are not
+   * optimized for heavy workloads and big datasets, which means you might
+   * observe write stalls under some conditions. As a starting point for tuning
+   * RocksDB options, use the following for universal style compaction.</p>
+   *
+   * <p>Universal style compaction is focused on reducing Write Amplification
+   * Factor for big data sets, but increases Space Amplification.</p>
+   *
+   * <p>Make sure to also call IncreaseParallelism(), which will provide the
+   * biggest performance gains.</p>
+   *
+   * <p>Note: we might use more memory than memtable_memory_budget during high
+   * write rate period</p>
+   *
+   * @return the instance of the current object.
+   */
+  T optimizeUniversalStyleCompaction();
+
+  /**
+   * <p>Default values for some parameters in ColumnFamilyOptions are not
+   * optimized for heavy workloads and big datasets, which means you might
+   * observe write stalls under some conditions. As a starting point for tuning
+   * RocksDB options, use the following for universal style compaction.</p>
+   *
+   * <p>Universal style compaction is focused on reducing Write Amplification
+   * Factor for big data sets, but increases Space Amplification.</p>
+   *
+   * <p>Make sure to also call IncreaseParallelism(), which will provide the
+   * biggest performance gains.</p>
+   *
+   * <p>Note: we might use more memory than memtable_memory_budget during high
+   * write rate period</p>
+   *
+   * @param memtableMemoryBudget memory budget in bytes
+   * @return the instance of the current object.
+   */
+  T optimizeUniversalStyleCompaction(
+      long memtableMemoryBudget);
+
+  /**
+   * Set {@link BuiltinComparator} to be used with RocksDB.
+   *
+   * Note: Comparator can be set once upon database creation.
+   *
+   * Default: BytewiseComparator.
+   * @param builtinComparator a {@link BuiltinComparator} type.
+   * @return the instance of the current object.
+   */
+  T setComparator(
+      BuiltinComparator builtinComparator);
+
+  /**
+   * Use the specified comparator for key ordering.
+   *
+   * Comparator should not be disposed before options instances using this comparator is
+   * disposed. If dispose() function is not called, then comparator object will be
+   * GC'd automatically.
+   *
+   * Comparator instance can be re-used in multiple options instances.
+   *
+   * @param comparator java instance.
+   * @return the instance of the current object.
+   */
+  T setComparator(
+      AbstractComparator comparator);
+
+  /**
+   * <p>Set the merge operator to be used for merging two merge operands
+   * of the same key. The merge function is invoked during
+   * compaction and at lookup time, if multiple key/value pairs belonging
+   * to the same key are found in the database.</p>
+   *
+   * @param name the name of the merge function, as defined by
+   * the MergeOperators factory (see utilities/MergeOperators.h)
+   * The merge function is specified by name and must be one of the
+   * standard merge operators provided by RocksDB. The available
+   * operators are "put", "uint64add", "stringappend" and "stringappendtest".
+   * @return the instance of the current object.
+   */
+  T setMergeOperatorName(String name);
+
+  /**
+   * <p>Set the merge operator to be used for merging two different key/value
+   * pairs that share the same key. The merge function is invoked during
+   * compaction and at lookup time, if multiple key/value pairs belonging
+   * to the same key are found in the database.</p>
+   *
+   * @param mergeOperator {@link MergeOperator} instance.
+   * @return the instance of the current object.
+   */
+  T setMergeOperator(MergeOperator mergeOperator);
+
+  /**
+   * A single CompactionFilter instance to call into during compaction.
+   * Allows an application to modify/delete a key-value during background
+   * compaction.
+   *
+   * If the client requires a new compaction filter to be used for different
+   * compaction runs, it can specify call
+   * {@link #setCompactionFilterFactory(AbstractCompactionFilterFactory)}
+   * instead.
+   *
+   * The client should specify only set one of the two.
+   * {@link #setCompactionFilter(AbstractCompactionFilter)} takes precedence
+   * over {@link #setCompactionFilterFactory(AbstractCompactionFilterFactory)}
+   * if the client specifies both.
+   *
+   * If multithreaded compaction is being used, the supplied CompactionFilter
+   * instance may be used from different threads concurrently and so should be thread-safe.
+   *
+   * @param compactionFilter {@link AbstractCompactionFilter} instance.
+   * @return the instance of the current object.
+   */
+  T setCompactionFilter(
+          final AbstractCompactionFilter<? extends AbstractSlice<?>> compactionFilter);
+
+  /**
+   * Accessor for the CompactionFilter instance in use.
+   *
+   * @return  Reference to the CompactionFilter, or null if one hasn't been set.
+   */
+  AbstractCompactionFilter<? extends AbstractSlice<?>> compactionFilter();
+
+  /**
+   * This is a factory that provides {@link AbstractCompactionFilter} objects
+   * which allow an application to modify/delete a key-value during background
+   * compaction.
+   *
+   * A new filter will be created on each compaction run.  If multithreaded
+   * compaction is being used, each created CompactionFilter will only be used
+   * from a single thread and so does not need to be thread-safe.
+   *
+   * @param compactionFilterFactory {@link AbstractCompactionFilterFactory} instance.
+   * @return the instance of the current object.
+   */
+  T setCompactionFilterFactory(
+          final AbstractCompactionFilterFactory<? extends AbstractCompactionFilter<?>>
+                  compactionFilterFactory);
+
+  /**
+   * Accessor for the CompactionFilterFactory instance in use.
+   *
+   * @return  Reference to the CompactionFilterFactory, or null if one hasn't been set.
+   */
+  AbstractCompactionFilterFactory<? extends AbstractCompactionFilter<?>> compactionFilterFactory();
+
+  /**
+   * This prefix-extractor uses the first n bytes of a key as its prefix.
+   *
+   * In some hash-based memtable representation such as HashLinkedList
+   * and HashSkipList, prefixes are used to partition the keys into
+   * several buckets.  Prefix extractor is used to specify how to
+   * extract the prefix given a key.
+   *
+   * @param n use the first n bytes of a key as its prefix.
+   * @return the reference to the current option.
+   */
+  T useFixedLengthPrefixExtractor(int n);
+
+  /**
+   * Same as fixed length prefix extractor, except that when slice is
+   * shorter than the fixed length, it will use the full key.
+   *
+   * @param n use the first n bytes of a key as its prefix.
+   * @return the reference to the current option.
+   */
+  T useCappedPrefixExtractor(int n);
+
+  /**
+   * Number of files to trigger level-0 compaction. A value &lt; 0 means that
+   * level-0 compaction will not be triggered by number of files at all.
+   * Default: 4
+   *
+   * @param numFiles the number of files in level-0 to trigger compaction.
+   * @return the reference to the current option.
+   */
+  T setLevelZeroFileNumCompactionTrigger(
+      int numFiles);
+
+  /**
+   * The number of files in level 0 to trigger compaction from level-0 to
+   * level-1.  A value &lt; 0 means that level-0 compaction will not be
+   * triggered by number of files at all.
+   * Default: 4
+   *
+   * @return the number of files in level 0 to trigger compaction.
+   */
+  int levelZeroFileNumCompactionTrigger();
+
+  /**
+   * Soft limit on number of level-0 files. We start slowing down writes at this
+   * point. A value &lt; 0 means that no writing slow down will be triggered by
+   * number of files in level-0.
+   *
+   * @param numFiles soft limit on number of level-0 files.
+   * @return the reference to the current option.
+   */
+  T setLevelZeroSlowdownWritesTrigger(
+      int numFiles);
+
+  /**
+   * Soft limit on the number of level-0 files. We start slowing down writes
+   * at this point. A value &lt; 0 means that no writing slow down will be
+   * triggered by number of files in level-0.
+   *
+   * @return the soft limit on the number of level-0 files.
+   */
+  int levelZeroSlowdownWritesTrigger();
+
+  /**
+   * Maximum number of level-0 files.  We stop writes at this point.
+   *
+   * @param numFiles the hard limit of the number of level-0 files.
+   * @return the reference to the current option.
+   */
+  T setLevelZeroStopWritesTrigger(int numFiles);
+
+  /**
+   * Maximum number of level-0 files.  We stop writes at this point.
+   *
+   * @return the hard limit of the number of level-0 file.
+   */
+  int levelZeroStopWritesTrigger();
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @param multiplier the ratio between the total size of level-(L+1)
+   *     files and the total size of level-L files for all L.
+   * @return the reference to the current option.
+   */
+  T setMaxBytesForLevelMultiplier(
+      double multiplier);
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @return the ratio between the total size of level-(L+1) files and
+   *     the total size of level-L files for all L.
+   */
+  double maxBytesForLevelMultiplier();
+
+  /**
+   * FIFO compaction option.
+   * The oldest table file will be deleted
+   * once the sum of table files reaches this size.
+   * The default value is 1GB (1 * 1024 * 1024 * 1024).
+   *
+   * @param maxTableFilesSize the size limit of the total sum of table files.
+   * @return the instance of the current object.
+   */
+  T setMaxTableFilesSizeFIFO(
+      long maxTableFilesSize);
+
+  /**
+   * FIFO compaction option.
+   * The oldest table file will be deleted
+   * once the sum of table files reaches this size.
+   * The default value is 1GB (1 * 1024 * 1024 * 1024).
+   *
+   * @return the size limit of the total sum of table files.
+   */
+  long maxTableFilesSizeFIFO();
+
+  /**
+   * Get the config for mem-table.
+   *
+   * @return the mem-table config.
+   */
+  MemTableConfig memTableConfig();
+
+  /**
+   * Set the config for mem-table.
+   *
+   * @param memTableConfig the mem-table config.
+   * @return the instance of the current object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  T setMemTableConfig(MemTableConfig memTableConfig);
+
+  /**
+   * Returns the name of the current mem table representation.
+   * Memtable format can be set using setTableFormatConfig.
+   *
+   * @return the name of the currently-used memtable factory.
+   * @see #setTableFormatConfig(org.rocksdb.TableFormatConfig)
+   */
+  String memTableFactoryName();
+
+  /**
+   * Get the config for table format.
+   *
+   * @return the table format config.
+   */
+  TableFormatConfig tableFormatConfig();
+
+  /**
+   * Set the config for table format.
+   *
+   * @param config the table format config.
+   * @return the reference of the current options.
+   */
+  T setTableFormatConfig(TableFormatConfig config);
+
+  /**
+   * @return the name of the currently used table factory.
+   */
+  String tableFactoryName();
+
+  /**
+   * A list of paths where SST files for this column family
+   * can be put into, with its target size. Similar to db_paths,
+   * newer data is placed into paths specified earlier in the
+   * vector while older data gradually moves to paths specified
+   * later in the vector.
+   * Note that, if a path is supplied to multiple column
+   * families, it would have files and total size from all
+   * the column families combined. User should provision for the
+   * total size(from all the column families) in such cases.
+   *
+   * If left empty, db_paths will be used.
+   * Default: empty
+   *
+   * @param paths collection of paths for SST files.
+   * @return the reference of the current options.
+   */
+  T setCfPaths(final Collection<DbPath> paths);
+
+  /**
+   * @return collection of paths for SST files.
+   */
+  List<DbPath> cfPaths();
+
+  /**
+   * Compression algorithm that will be used for the bottommost level that
+   * contain files. If level-compaction is used, this option will only affect
+   * levels after base level.
+   *
+   * Default: {@link CompressionType#DISABLE_COMPRESSION_OPTION}
+   *
+   * @param bottommostCompressionType  The compression type to use for the
+   *     bottommost level
+   *
+   * @return the reference of the current options.
+   */
+  T setBottommostCompressionType(
+      final CompressionType bottommostCompressionType);
+
+  /**
+   * Compression algorithm that will be used for the bottommost level that
+   * contain files. If level-compaction is used, this option will only affect
+   * levels after base level.
+   *
+   * Default: {@link CompressionType#DISABLE_COMPRESSION_OPTION}
+   *
+   * @return The compression type used for the bottommost level
+   */
+  CompressionType bottommostCompressionType();
+
+  /**
+   * Set the options for compression algorithms used by
+   * {@link #bottommostCompressionType()} if it is enabled.
+   *
+   * To enable it, please see the definition of
+   * {@link CompressionOptions}.
+   *
+   * @param compressionOptions the bottom most compression options.
+   *
+   * @return the reference of the current options.
+   */
+  T setBottommostCompressionOptions(
+      final CompressionOptions compressionOptions);
+
+  /**
+   * Get the bottom most compression options.
+   *
+   * See {@link #setBottommostCompressionOptions(CompressionOptions)}.
+   *
+   * @return the bottom most compression options.
+   */
+  CompressionOptions bottommostCompressionOptions();
+
+  /**
+   * Set the different options for compression algorithms
+   *
+   * @param compressionOptions The compression options
+   *
+   * @return the reference of the current options.
+   */
+  T setCompressionOptions(
+      CompressionOptions compressionOptions);
+
+  /**
+   * Get the different options for compression algorithms
+   *
+   * @return The compression options
+   */
+  CompressionOptions compressionOptions();
+
+  /**
+   * If non-nullptr, use the specified factory for a function to determine the
+   * partitioning of sst files. This helps compaction to split the files
+   * on interesting boundaries (key prefixes) to make propagation of sst
+   * files less write amplifying (covering the whole key space).
+   *
+   * Default: nullptr
+   *
+   * @param factory The factory reference
+   * @return the reference of the current options.
+   */
+  @Experimental("Caution: this option is experimental")
+  T setSstPartitionerFactory(SstPartitionerFactory factory);
+
+  /**
+   * Get SST partitioner factory
+   *
+   * @return SST partitioner factory
+   */
+  @Experimental("Caution: this option is experimental")
+  SstPartitionerFactory sstPartitionerFactory();
+
+  /**
+   * Compaction concurrent thread limiter for the column family.
+   * If non-nullptr, use given concurrent thread limiter to control
+   * the max outstanding compaction tasks. Limiter can be shared with
+   * multiple column families across db instances.
+   *
+   * @param concurrentTaskLimiter The compaction thread limiter.
+   * @return the reference of the current options.
+   */
+  T setCompactionThreadLimiter(ConcurrentTaskLimiter concurrentTaskLimiter);
+
+  /**
+   * Get compaction thread limiter
+   *
+   * @return Compaction thread limiter
+   */
+  ConcurrentTaskLimiter compactionThreadLimiter();
+
+  /**
+   * Default memtable memory budget used with the following methods:
+   *
+   * <ol>
+   *   <li>{@link #optimizeLevelStyleCompaction()}</li>
+   *   <li>{@link #optimizeUniversalStyleCompaction()}</li>
+   * </ol>
+   */
+  long DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET = 512 * 1024 * 1024;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java
new file mode 100644
index 000000000..da023d366
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java
@@ -0,0 +1,238 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * CompactRangeOptions is used by CompactRange() call. In the documentation of the methods "the compaction" refers to
+ * any compaction that is using this CompactRangeOptions.
+ */
+public class CompactRangeOptions extends RocksObject {
+
+  private final static byte VALUE_kSkip = 0;
+  private final static byte VALUE_kIfHaveCompactionFilter = 1;
+  private final static byte VALUE_kForce = 2;
+
+  // For level based compaction, we can configure if we want to skip/force bottommost level
+  // compaction. The order of this enum MUST follow the C++ layer. See BottommostLevelCompaction in
+  // db/options.h
+  public enum BottommostLevelCompaction {
+    /**
+     * Skip bottommost level compaction
+     */
+    kSkip(VALUE_kSkip),
+    /**
+     * Only compact bottommost level if there is a compaction filter. This is the default option
+     */
+    kIfHaveCompactionFilter(VALUE_kIfHaveCompactionFilter),
+    /**
+     * Always compact bottommost level
+     */
+    kForce(VALUE_kForce);
+
+    private final byte value;
+
+    BottommostLevelCompaction(final byte value) {
+      this.value = value;
+    }
+
+    /**
+     * <p>Returns the byte value of the enumerations value.</p>
+     *
+     * @return byte representation
+     */
+    public byte getValue() {
+      return value;
+    }
+
+    /**
+     * Returns the BottommostLevelCompaction for the given C++ rocks enum value.
+     * @param bottommostLevelCompaction The value of the BottommostLevelCompaction
+     * @return BottommostLevelCompaction instance, or null if none matches
+     */
+    public static BottommostLevelCompaction fromRocksId(final int bottommostLevelCompaction) {
+      switch (bottommostLevelCompaction) {
+        case VALUE_kSkip: return kSkip;
+        case VALUE_kIfHaveCompactionFilter: return kIfHaveCompactionFilter;
+        case VALUE_kForce: return kForce;
+        default: return null;
+      }
+    }
+  }
+
+  /**
+   * Construct CompactRangeOptions.
+   */
+  public CompactRangeOptions() {
+    super(newCompactRangeOptions());
+  }
+
+  /**
+   * Returns whether the compaction is exclusive or other compactions may run concurrently at the same time.
+   *
+   * @return true if exclusive, false if concurrent
+   */
+  public boolean exclusiveManualCompaction() {
+    return exclusiveManualCompaction(nativeHandle_);
+  }
+
+  /**
+   * Sets whether the compaction is exclusive or other compaction are allowed run concurrently at the same time.
+   *
+   * @param exclusiveCompaction true if compaction should be exclusive
+   * @return This CompactRangeOptions
+   */
+  public CompactRangeOptions setExclusiveManualCompaction(final boolean exclusiveCompaction) {
+    setExclusiveManualCompaction(nativeHandle_, exclusiveCompaction);
+    return this;
+  }
+
+  /**
+   * Returns whether compacted files will be moved to the minimum level capable of holding the data or given level
+   * (specified non-negative target_level).
+   * @return true, if compacted files will be moved to the minimum level
+   */
+  public boolean changeLevel() {
+    return changeLevel(nativeHandle_);
+  }
+
+  /**
+   * Whether compacted files will be moved to the minimum level capable of holding the data or given level
+   * (specified non-negative target_level).
+   *
+   * @param changeLevel If true, compacted files will be moved to the minimum level
+   * @return This CompactRangeOptions
+   */
+  public CompactRangeOptions setChangeLevel(final boolean changeLevel) {
+    setChangeLevel(nativeHandle_, changeLevel);
+    return this;
+  }
+
+  /**
+   * If change_level is true and target_level have non-negative value, compacted files will be moved to target_level.
+   * @return The target level for the compacted files
+   */
+  public int targetLevel() {
+    return targetLevel(nativeHandle_);
+  }
+
+
+  /**
+   * If change_level is true and target_level have non-negative value, compacted files will be moved to target_level.
+   *
+   * @param targetLevel target level for the compacted files
+   * @return This CompactRangeOptions
+   */
+  public CompactRangeOptions setTargetLevel(final int targetLevel) {
+    setTargetLevel(nativeHandle_, targetLevel);
+    return this;
+  }
+
+  /**
+   * target_path_id for compaction output. Compaction outputs will be placed in options.db_paths[target_path_id].
+   *
+   * @return target_path_id
+   */
+  public int targetPathId() {
+    return targetPathId(nativeHandle_);
+  }
+
+  /**
+   * Compaction outputs will be placed in options.db_paths[target_path_id]. Behavior is undefined if target_path_id is
+   * out of range.
+   *
+   * @param targetPathId target path id
+   * @return This CompactRangeOptions
+   */
+  public CompactRangeOptions setTargetPathId(final int targetPathId) {
+    setTargetPathId(nativeHandle_, targetPathId);
+    return this;
+  }
+
+  /**
+   * Returns the policy for compacting the bottommost level
+   * @return The BottommostLevelCompaction policy
+   */
+  public BottommostLevelCompaction bottommostLevelCompaction() {
+    return BottommostLevelCompaction.fromRocksId(bottommostLevelCompaction(nativeHandle_));
+  }
+
+  /**
+   * Sets the policy for compacting the bottommost level
+   *
+   * @param bottommostLevelCompaction The policy for compacting the bottommost level
+   * @return This CompactRangeOptions
+   */
+  public CompactRangeOptions setBottommostLevelCompaction(final BottommostLevelCompaction bottommostLevelCompaction) {
+    setBottommostLevelCompaction(nativeHandle_, bottommostLevelCompaction.getValue());
+    return this;
+  }
+
+  /**
+   * If true, compaction will execute immediately even if doing so would cause the DB to
+   * enter write stall mode. Otherwise, it'll sleep until load is low enough.
+   * @return true if compaction will execute immediately
+   */
+  public boolean allowWriteStall() {
+    return allowWriteStall(nativeHandle_);
+  }
+
+
+  /**
+   * If true, compaction will execute immediately even if doing so would cause the DB to
+   * enter write stall mode. Otherwise, it'll sleep until load is low enough.
+   *
+   * @return This CompactRangeOptions
+   * @param allowWriteStall true if compaction should execute immediately
+   */
+  public CompactRangeOptions setAllowWriteStall(final boolean allowWriteStall) {
+    setAllowWriteStall(nativeHandle_, allowWriteStall);
+    return this;
+  }
+
+  /**
+   * If &gt; 0, it will replace the option in the DBOptions for this compaction
+   * @return number of subcompactions
+   */
+  public int maxSubcompactions() {
+    return maxSubcompactions(nativeHandle_);
+  }
+
+  /**
+   * If &gt; 0, it will replace the option in the DBOptions for this compaction
+   *
+   * @param maxSubcompactions number of subcompactions
+   * @return This CompactRangeOptions
+   */
+  public CompactRangeOptions setMaxSubcompactions(final int maxSubcompactions) {
+    setMaxSubcompactions(nativeHandle_, maxSubcompactions);
+    return this;
+  }
+
+  private native static long newCompactRangeOptions();
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native boolean exclusiveManualCompaction(final long handle);
+  private native void setExclusiveManualCompaction(final long handle,
+      final boolean exclusive_manual_compaction);
+  private native boolean changeLevel(final long handle);
+  private native void setChangeLevel(final long handle,
+      final boolean changeLevel);
+  private native int targetLevel(final long handle);
+  private native void setTargetLevel(final long handle,
+      final int targetLevel);
+  private native int targetPathId(final long handle);
+  private native void setTargetPathId(final long handle,
+      final int targetPathId);
+  private native int bottommostLevelCompaction(final long handle);
+  private native void setBottommostLevelCompaction(final long handle,
+      final int bottommostLevelCompaction);
+  private native boolean allowWriteStall(final long handle);
+  private native void setAllowWriteStall(final long handle,
+      final boolean allowWriteStall);
+  private native void setMaxSubcompactions(final long handle,
+      final int maxSubcompactions);
+  private native int maxSubcompactions(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java
new file mode 100644
index 000000000..4e3b8d68b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java
@@ -0,0 +1,161 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+public class CompactionJobInfo extends RocksObject {
+
+  public CompactionJobInfo() {
+    super(newCompactionJobInfo());
+  }
+
+  /**
+   * Private as called from JNI C++
+   */
+  private CompactionJobInfo(final long nativeHandle) {
+    super(nativeHandle);
+    // We do not own the native object!
+    disOwnNativeHandle();
+  }
+
+  /**
+   * Get the name of the column family where the compaction happened.
+   *
+   * @return the name of the column family
+   */
+  public byte[] columnFamilyName() {
+    return columnFamilyName(nativeHandle_);
+  }
+
+  /**
+   * Get the status indicating whether the compaction was successful or not.
+   *
+   * @return the status
+   */
+  public Status status() {
+    return status(nativeHandle_);
+  }
+
+  /**
+   * Get the id of the thread that completed this compaction job.
+   *
+   * @return the id of the thread
+   */
+  public long threadId() {
+    return threadId(nativeHandle_);
+  }
+
+  /**
+   * Get the job id, which is unique in the same thread.
+   *
+   * @return the id of the thread
+   */
+  public int jobId() {
+    return jobId(nativeHandle_);
+  }
+
+  /**
+   * Get the smallest input level of the compaction.
+   *
+   * @return the input level
+   */
+  public int baseInputLevel() {
+    return baseInputLevel(nativeHandle_);
+  }
+
+  /**
+   * Get the output level of the compaction.
+   *
+   * @return the output level
+   */
+  public int outputLevel() {
+    return outputLevel(nativeHandle_);
+  }
+
+  /**
+   * Get the names of the compaction input files.
+   *
+   * @return the names of the input files.
+   */
+  public List<String> inputFiles() {
+    return Arrays.asList(inputFiles(nativeHandle_));
+  }
+
+  /**
+   * Get the names of the compaction output files.
+   *
+   * @return the names of the output files.
+   */
+  public List<String> outputFiles() {
+    return Arrays.asList(outputFiles(nativeHandle_));
+  }
+
+  /**
+   * Get the table properties for the input and output tables.
+   *
+   * The map is keyed by values from {@link #inputFiles()} and
+   *     {@link #outputFiles()}.
+   *
+   * @return the table properties
+   */
+  public Map<String, TableProperties> tableProperties() {
+    return tableProperties(nativeHandle_);
+  }
+
+  /**
+   * Get the Reason for running the compaction.
+   *
+   * @return the reason.
+   */
+  public CompactionReason compactionReason() {
+    return CompactionReason.fromValue(compactionReason(nativeHandle_));
+  }
+
+  //
+  /**
+   * Get the compression algorithm used for output files.
+   *
+   * @return the compression algorithm
+   */
+  public CompressionType compression() {
+    return CompressionType.getCompressionType(compression(nativeHandle_));
+  }
+
+  /**
+   * Get detailed information about this compaction.
+   *
+   * @return the detailed information, or null if not available.
+   */
+  public /* @Nullable */ CompactionJobStats stats() {
+    final long statsHandle = stats(nativeHandle_);
+    if (statsHandle == 0) {
+      return null;
+    }
+
+    return new CompactionJobStats(statsHandle);
+  }
+
+
+  private static native long newCompactionJobInfo();
+  @Override protected native void disposeInternal(final long handle);
+
+  private static native byte[] columnFamilyName(final long handle);
+  private static native Status status(final long handle);
+  private static native long threadId(final long handle);
+  private static native int jobId(final long handle);
+  private static native int baseInputLevel(final long handle);
+  private static native int outputLevel(final long handle);
+  private static native String[] inputFiles(final long handle);
+  private static native String[] outputFiles(final long handle);
+  private static native Map<String, TableProperties> tableProperties(
+      final long handle);
+  private static native byte compactionReason(final long handle);
+  private static native byte compression(final long handle);
+  private static native long stats(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionJobStats.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionJobStats.java
new file mode 100644
index 000000000..3d53b5565
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionJobStats.java
@@ -0,0 +1,295 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class CompactionJobStats extends RocksObject {
+
+  public CompactionJobStats() {
+    super(newCompactionJobStats());
+  }
+
+  /**
+   * Private as called from JNI C++
+   */
+  CompactionJobStats(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Reset the stats.
+   */
+  public void reset() {
+    reset(nativeHandle_);
+  }
+
+  /**
+   * Aggregate the CompactionJobStats from another instance with this one.
+   *
+   * @param compactionJobStats another instance of stats.
+   */
+  public void add(final CompactionJobStats compactionJobStats) {
+    add(nativeHandle_, compactionJobStats.nativeHandle_);
+  }
+
+  /**
+   * Get the elapsed time in micro of this compaction.
+   *
+   * @return the elapsed time in micro of this compaction.
+   */
+  public long elapsedMicros() {
+    return elapsedMicros(nativeHandle_);
+  }
+
+  /**
+   * Get the number of compaction input records.
+   *
+   * @return the number of compaction input records.
+   */
+  public long numInputRecords() {
+    return numInputRecords(nativeHandle_);
+  }
+
+  /**
+   * Get the number of compaction input files.
+   *
+   * @return the number of compaction input files.
+   */
+  public long numInputFiles() {
+    return numInputFiles(nativeHandle_);
+  }
+
+  /**
+   * Get the number of compaction input files at the output level.
+   *
+   * @return the number of compaction input files at the output level.
+   */
+  public long numInputFilesAtOutputLevel() {
+    return numInputFilesAtOutputLevel(nativeHandle_);
+  }
+
+  /**
+   * Get the number of compaction output records.
+   *
+   * @return the number of compaction output records.
+   */
+  public long numOutputRecords() {
+    return numOutputRecords(nativeHandle_);
+  }
+
+  /**
+   * Get the number of compaction output files.
+   *
+   * @return the number of compaction output files.
+   */
+  public long numOutputFiles() {
+    return numOutputFiles(nativeHandle_);
+  }
+
+  /**
+   * Determine if the compaction is a manual compaction.
+   *
+   * @return true if the compaction is a manual compaction, false otherwise.
+   */
+  public boolean isManualCompaction() {
+    return isManualCompaction(nativeHandle_);
+  }
+
+  /**
+   * Get the size of the compaction input in bytes.
+   *
+   * @return the size of the compaction input in bytes.
+   */
+  public long totalInputBytes() {
+    return totalInputBytes(nativeHandle_);
+  }
+
+  /**
+   * Get the size of the compaction output in bytes.
+   *
+   * @return the size of the compaction output in bytes.
+   */
+  public long totalOutputBytes() {
+    return totalOutputBytes(nativeHandle_);
+  }
+
+  /**
+   * Get the number of records being replaced by newer record associated
+   * with same key.
+   *
+   * This could be a new value or a deletion entry for that key so this field
+   * sums up all updated and deleted keys.
+   *
+   * @return the number of records being replaced by newer record associated
+   *     with same key.
+   */
+  public long numRecordsReplaced() {
+    return numRecordsReplaced(nativeHandle_);
+  }
+
+  /**
+   * Get the sum of the uncompressed input keys in bytes.
+   *
+   * @return the sum of the uncompressed input keys in bytes.
+   */
+  public long totalInputRawKeyBytes() {
+    return totalInputRawKeyBytes(nativeHandle_);
+  }
+
+  /**
+   * Get the sum of the uncompressed input values in bytes.
+   *
+   * @return the sum of the uncompressed input values in bytes.
+   */
+  public long totalInputRawValueBytes() {
+    return totalInputRawValueBytes(nativeHandle_);
+  }
+
+  /**
+   * Get the number of deletion entries before compaction.
+   *
+   * Deletion entries can disappear after compaction because they expired.
+   *
+   * @return the number of deletion entries before compaction.
+   */
+  public long numInputDeletionRecords() {
+    return numInputDeletionRecords(nativeHandle_);
+  }
+
+  /**
+   * Get the number of deletion records that were found obsolete and discarded
+   * because it is not possible to delete any more keys with this entry.
+   * (i.e. all possible deletions resulting from it have been completed)
+   *
+   * @return the number of deletion records that were found obsolete and
+   *     discarded.
+   */
+  public long numExpiredDeletionRecords() {
+    return numExpiredDeletionRecords(nativeHandle_);
+  }
+
+  /**
+   * Get the number of corrupt keys (ParseInternalKey returned false when
+   * applied to the key) encountered and written out.
+   *
+   * @return the number of corrupt keys.
+   */
+  public long numCorruptKeys() {
+    return numCorruptKeys(nativeHandle_);
+  }
+
+  /**
+   * Get the Time spent on file's Append() call.
+   *
+   * Only populated if {@link ColumnFamilyOptions#reportBgIoStats()} is set.
+   *
+   * @return the Time spent on file's Append() call.
+   */
+  public long fileWriteNanos() {
+    return fileWriteNanos(nativeHandle_);
+  }
+
+  /**
+   * Get the Time spent on sync file range.
+   *
+   * Only populated if {@link ColumnFamilyOptions#reportBgIoStats()} is set.
+   *
+   * @return the Time spent on sync file range.
+   */
+  public long fileRangeSyncNanos() {
+    return fileRangeSyncNanos(nativeHandle_);
+  }
+
+  /**
+   * Get the Time spent on file fsync.
+   *
+   * Only populated if {@link ColumnFamilyOptions#reportBgIoStats()} is set.
+   *
+   * @return the Time spent on file fsync.
+   */
+  public long fileFsyncNanos() {
+    return fileFsyncNanos(nativeHandle_);
+  }
+
+  /**
+   * Get the Time spent on preparing file write (falocate, etc)
+   *
+   * Only populated if {@link ColumnFamilyOptions#reportBgIoStats()} is set.
+   *
+   * @return the Time spent on preparing file write (falocate, etc).
+   */
+  public long filePrepareWriteNanos() {
+    return filePrepareWriteNanos(nativeHandle_);
+  }
+
+  /**
+   * Get the smallest output key prefix.
+   *
+   * @return the smallest output key prefix.
+   */
+  public byte[] smallestOutputKeyPrefix() {
+    return smallestOutputKeyPrefix(nativeHandle_);
+  }
+
+  /**
+   * Get the largest output key prefix.
+   *
+   * @return the smallest output key prefix.
+   */
+  public byte[] largestOutputKeyPrefix() {
+    return largestOutputKeyPrefix(nativeHandle_);
+  }
+
+  /**
+   * Get the number of single-deletes which do not meet a put.
+   *
+   * @return number of single-deletes which do not meet a put.
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public long numSingleDelFallthru() {
+    return numSingleDelFallthru(nativeHandle_);
+  }
+
+  /**
+   * Get the number of single-deletes which meet something other than a put.
+   *
+   * @return the number of single-deletes which meet something other than a put.
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public long numSingleDelMismatch() {
+    return numSingleDelMismatch(nativeHandle_);
+  }
+
+  private static native long newCompactionJobStats();
+  @Override protected native void disposeInternal(final long handle);
+
+
+  private static native void reset(final long handle);
+  private static native void add(final long handle,
+      final long compactionJobStatsHandle);
+  private static native long elapsedMicros(final long handle);
+  private static native long numInputRecords(final long handle);
+  private static native long numInputFiles(final long handle);
+  private static native long numInputFilesAtOutputLevel(final long handle);
+  private static native long numOutputRecords(final long handle);
+  private static native long numOutputFiles(final long handle);
+  private static native boolean isManualCompaction(final long handle);
+  private static native long totalInputBytes(final long handle);
+  private static native long totalOutputBytes(final long handle);
+  private static native long numRecordsReplaced(final long handle);
+  private static native long totalInputRawKeyBytes(final long handle);
+  private static native long totalInputRawValueBytes(final long handle);
+  private static native long numInputDeletionRecords(final long handle);
+  private static native long numExpiredDeletionRecords(final long handle);
+  private static native long numCorruptKeys(final long handle);
+  private static native long fileWriteNanos(final long handle);
+  private static native long fileRangeSyncNanos(final long handle);
+  private static native long fileFsyncNanos(final long handle);
+  private static native long filePrepareWriteNanos(final long handle);
+  private static native byte[] smallestOutputKeyPrefix(final long handle);
+  private static native byte[] largestOutputKeyPrefix(final long handle);
+  private static native long numSingleDelFallthru(final long handle);
+  private static native long numSingleDelMismatch(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionOptions.java
new file mode 100644
index 000000000..2c7e391fb
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionOptions.java
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * CompactionOptions are used in
+ * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, CompactionJobInfo)}
+ * calls.
+ */
+public class CompactionOptions extends RocksObject {
+
+  public CompactionOptions() {
+    super(newCompactionOptions());
+  }
+
+  /**
+   * Get the compaction output compression type.
+   *
+   * See {@link #setCompression(CompressionType)}.
+   *
+   * @return the compression type.
+   */
+  public CompressionType compression() {
+    return CompressionType.getCompressionType(
+        compression(nativeHandle_));
+  }
+
+  /**
+   * Set the compaction output compression type.
+   *
+   * Default: snappy
+   *
+   * If set to {@link CompressionType#DISABLE_COMPRESSION_OPTION},
+   * RocksDB will choose compression type according to the
+   * {@link ColumnFamilyOptions#compressionType()}, taking into account
+   * the output level if {@link ColumnFamilyOptions#compressionPerLevel()}
+   * is specified.
+   *
+   * @param compression the compression type to use for compaction output.
+   *
+   * @return the instance of the current Options.
+   */
+  public CompactionOptions setCompression(final CompressionType compression) {
+    setCompression(nativeHandle_, compression.getValue());
+    return this;
+  }
+
+  /**
+   * Get the compaction output file size limit.
+   *
+   * See {@link #setOutputFileSizeLimit(long)}.
+   *
+   * @return the file size limit.
+   */
+  public long outputFileSizeLimit() {
+    return outputFileSizeLimit(nativeHandle_);
+  }
+
+  /**
+   * Compaction will create files of size {@link #outputFileSizeLimit()}.
+   *
+   * Default: 2^64-1, which means that compaction will create a single file
+   *
+   * @param outputFileSizeLimit the size limit
+   *
+   * @return the instance of the current Options.
+   */
+  public CompactionOptions setOutputFileSizeLimit(
+      final long outputFileSizeLimit) {
+    setOutputFileSizeLimit(nativeHandle_, outputFileSizeLimit);
+    return this;
+  }
+
+  /**
+   * Get the maximum number of threads that will concurrently perform a
+   * compaction job.
+   *
+   * @return the maximum number of threads.
+   */
+  public int maxSubcompactions() {
+    return maxSubcompactions(nativeHandle_);
+  }
+
+  /**
+   * This value represents the maximum number of threads that will
+   * concurrently perform a compaction job by breaking it into multiple,
+   * smaller ones that are run simultaneously.
+   *
+   * Default: 0 (i.e. no subcompactions)
+   *
+   * If &gt; 0, it will replace the option in
+   * {@link DBOptions#maxSubcompactions()} for this compaction.
+   *
+   * @param maxSubcompactions The maximum number of threads that will
+   *     concurrently perform a compaction job
+   *
+   * @return the instance of the current Options.
+   */
+  public CompactionOptions setMaxSubcompactions(final int maxSubcompactions) {
+    setMaxSubcompactions(nativeHandle_, maxSubcompactions);
+    return this;
+  }
+
+  private static native long newCompactionOptions();
+  @Override protected final native void disposeInternal(final long handle);
+
+  private static native byte compression(final long handle);
+  private static native void setCompression(final long handle,
+      final byte compressionTypeValue);
+  private static native long outputFileSizeLimit(final long handle);
+  private static native void setOutputFileSizeLimit(final long handle,
+      final long outputFileSizeLimit);
+  private static native int maxSubcompactions(final long handle);
+  private static native void setMaxSubcompactions(final long handle,
+      final int maxSubcompactions);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java
new file mode 100644
index 000000000..4c8d6545c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java
@@ -0,0 +1,89 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Options for FIFO Compaction
+ */
+public class CompactionOptionsFIFO extends RocksObject {
+
+  public CompactionOptionsFIFO() {
+    super(newCompactionOptionsFIFO());
+  }
+
+  /**
+   * Once the total sum of table files reaches this, we will delete the oldest
+   * table file
+   *
+   * Default: 1GB
+   *
+   * @param maxTableFilesSize The maximum size of the table files
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsFIFO setMaxTableFilesSize(
+      final long maxTableFilesSize) {
+    setMaxTableFilesSize(nativeHandle_, maxTableFilesSize);
+    return this;
+  }
+
+  /**
+   * Once the total sum of table files reaches this, we will delete the oldest
+   * table file
+   *
+   * Default: 1GB
+   *
+   * @return max table file size in bytes
+   */
+  public long maxTableFilesSize() {
+    return maxTableFilesSize(nativeHandle_);
+  }
+
+  /**
+   * If true, try to do compaction to compact smaller files into larger ones.
+   * Minimum files to compact follows options.level0_file_num_compaction_trigger
+   * and compaction won't trigger if average compact bytes per del file is
+   * larger than options.write_buffer_size. This is to protect large files
+   * from being compacted again.
+   *
+   * Default: false
+   *
+   * @param allowCompaction true to allow intra-L0 compaction
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsFIFO setAllowCompaction(
+      final boolean allowCompaction) {
+    setAllowCompaction(nativeHandle_, allowCompaction);
+    return this;
+  }
+
+
+  /**
+   * Check if intra-L0 compaction is enabled.
+   * When enabled, we try to compact smaller files into larger ones.
+   *
+   * See {@link #setAllowCompaction(boolean)}.
+   *
+   * Default: false
+   *
+   * @return true if intra-L0 compaction is enabled, false otherwise.
+   */
+  public boolean allowCompaction() {
+    return allowCompaction(nativeHandle_);
+  }
+
+
+  private native static long newCompactionOptionsFIFO();
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native void setMaxTableFilesSize(final long handle,
+      final long maxTableFilesSize);
+  private native long maxTableFilesSize(final long handle);
+  private native void setAllowCompaction(final long handle,
+      final boolean allowCompaction);
+  private native boolean allowCompaction(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java
new file mode 100644
index 000000000..d2dfa4eef
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java
@@ -0,0 +1,273 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Options for Universal Compaction
+ */
+public class CompactionOptionsUniversal extends RocksObject {
+
+  public CompactionOptionsUniversal() {
+    super(newCompactionOptionsUniversal());
+  }
+
+  /**
+   * Percentage flexibility while comparing file size. If the candidate file(s)
+   * size is 1% smaller than the next file's size, then include next file into
+   * this candidate set.
+   *
+   * Default: 1
+   *
+   * @param sizeRatio The size ratio to use
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsUniversal setSizeRatio(final int sizeRatio) {
+    setSizeRatio(nativeHandle_, sizeRatio);
+    return this;
+  }
+
+  /**
+   * Percentage flexibility while comparing file size. If the candidate file(s)
+   * size is 1% smaller than the next file's size, then include next file into
+   * this candidate set.
+   *
+   * Default: 1
+   *
+   * @return The size ratio in use
+   */
+  public int sizeRatio() {
+    return sizeRatio(nativeHandle_);
+  }
+
+  /**
+   * The minimum number of files in a single compaction run.
+   *
+   * Default: 2
+   *
+   * @param minMergeWidth minimum number of files in a single compaction run
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsUniversal setMinMergeWidth(final int minMergeWidth) {
+    setMinMergeWidth(nativeHandle_, minMergeWidth);
+    return this;
+  }
+
+  /**
+   * The minimum number of files in a single compaction run.
+   *
+   * Default: 2
+   *
+   * @return minimum number of files in a single compaction run
+   */
+  public int minMergeWidth() {
+    return minMergeWidth(nativeHandle_);
+  }
+
+  /**
+   * The maximum number of files in a single compaction run.
+   *
+   * Default: {@link Long#MAX_VALUE}
+   *
+   * @param maxMergeWidth maximum number of files in a single compaction run
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsUniversal setMaxMergeWidth(final int maxMergeWidth) {
+    setMaxMergeWidth(nativeHandle_, maxMergeWidth);
+    return this;
+  }
+
+  /**
+   * The maximum number of files in a single compaction run.
+   *
+   * Default: {@link Long#MAX_VALUE}
+   *
+   * @return maximum number of files in a single compaction run
+   */
+  public int maxMergeWidth() {
+    return maxMergeWidth(nativeHandle_);
+  }
+
+  /**
+   * The size amplification is defined as the amount (in percentage) of
+   * additional storage needed to store a single byte of data in the database.
+   * For example, a size amplification of 2% means that a database that
+   * contains 100 bytes of user-data may occupy upto 102 bytes of
+   * physical storage. By this definition, a fully compacted database has
+   * a size amplification of 0%. Rocksdb uses the following heuristic
+   * to calculate size amplification: it assumes that all files excluding
+   * the earliest file contribute to the size amplification.
+   *
+   * Default: 200, which means that a 100 byte database could require upto
+   * 300 bytes of storage.
+   *
+   * @param maxSizeAmplificationPercent the amount of additional storage needed
+   *     (as a percentage) to store a single byte in the database
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsUniversal setMaxSizeAmplificationPercent(
+      final int maxSizeAmplificationPercent) {
+    setMaxSizeAmplificationPercent(nativeHandle_, maxSizeAmplificationPercent);
+    return this;
+  }
+
+  /**
+   * The size amplification is defined as the amount (in percentage) of
+   * additional storage needed to store a single byte of data in the database.
+   * For example, a size amplification of 2% means that a database that
+   * contains 100 bytes of user-data may occupy upto 102 bytes of
+   * physical storage. By this definition, a fully compacted database has
+   * a size amplification of 0%. Rocksdb uses the following heuristic
+   * to calculate size amplification: it assumes that all files excluding
+   * the earliest file contribute to the size amplification.
+   *
+   * Default: 200, which means that a 100 byte database could require upto
+   * 300 bytes of storage.
+   *
+   * @return the amount of additional storage needed (as a percentage) to store
+   *     a single byte in the database
+   */
+  public int maxSizeAmplificationPercent() {
+    return maxSizeAmplificationPercent(nativeHandle_);
+  }
+
+  /**
+   * If this option is set to be -1 (the default value), all the output files
+   * will follow compression type specified.
+   *
+   * If this option is not negative, we will try to make sure compressed
+   * size is just above this value. In normal cases, at least this percentage
+   * of data will be compressed.
+   *
+   * When we are compacting to a new file, here is the criteria whether
+   * it needs to be compressed: assuming here are the list of files sorted
+   * by generation time:
+   *    A1...An B1...Bm C1...Ct
+   * where A1 is the newest and Ct is the oldest, and we are going to compact
+   * B1...Bm, we calculate the total size of all the files as total_size, as
+   * well as  the total size of C1...Ct as total_C, the compaction output file
+   * will be compressed iff
+   *    total_C / total_size &lt; this percentage
+   *
+   * Default: -1
+   *
+   * @param compressionSizePercent percentage of size for compression
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsUniversal setCompressionSizePercent(
+      final int compressionSizePercent) {
+    setCompressionSizePercent(nativeHandle_, compressionSizePercent);
+    return this;
+  }
+
+  /**
+   * If this option is set to be -1 (the default value), all the output files
+   * will follow compression type specified.
+   *
+   * If this option is not negative, we will try to make sure compressed
+   * size is just above this value. In normal cases, at least this percentage
+   * of data will be compressed.
+   *
+   * When we are compacting to a new file, here is the criteria whether
+   * it needs to be compressed: assuming here are the list of files sorted
+   * by generation time:
+   *    A1...An B1...Bm C1...Ct
+   * where A1 is the newest and Ct is the oldest, and we are going to compact
+   * B1...Bm, we calculate the total size of all the files as total_size, as
+   * well as  the total size of C1...Ct as total_C, the compaction output file
+   * will be compressed iff
+   *    total_C / total_size &lt; this percentage
+   *
+   * Default: -1
+   *
+   * @return percentage of size for compression
+   */
+  public int compressionSizePercent() {
+    return compressionSizePercent(nativeHandle_);
+  }
+
+  /**
+   * The algorithm used to stop picking files into a single compaction run
+   *
+   * Default: {@link CompactionStopStyle#CompactionStopStyleTotalSize}
+   *
+   * @param compactionStopStyle The compaction algorithm
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsUniversal setStopStyle(
+      final CompactionStopStyle compactionStopStyle) {
+    setStopStyle(nativeHandle_, compactionStopStyle.getValue());
+    return this;
+  }
+
+  /**
+   * The algorithm used to stop picking files into a single compaction run
+   *
+   * Default: {@link CompactionStopStyle#CompactionStopStyleTotalSize}
+   *
+   * @return The compaction algorithm
+   */
+  public CompactionStopStyle stopStyle() {
+    return CompactionStopStyle.getCompactionStopStyle(stopStyle(nativeHandle_));
+  }
+
+  /**
+   * Option to optimize the universal multi level compaction by enabling
+   * trivial move for non overlapping files.
+   *
+   * Default: false
+   *
+   * @param allowTrivialMove true if trivial move is allowed
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsUniversal setAllowTrivialMove(
+      final boolean allowTrivialMove) {
+    setAllowTrivialMove(nativeHandle_, allowTrivialMove);
+    return this;
+  }
+
+  /**
+   * Option to optimize the universal multi level compaction by enabling
+   * trivial move for non overlapping files.
+   *
+   * Default: false
+   *
+   * @return true if trivial move is allowed
+   */
+  public boolean allowTrivialMove() {
+    return allowTrivialMove(nativeHandle_);
+  }
+
+  private native static long newCompactionOptionsUniversal();
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native void setSizeRatio(final long handle, final int sizeRatio);
+  private native int sizeRatio(final long handle);
+  private native void setMinMergeWidth(
+      final long handle, final int minMergeWidth);
+  private native int minMergeWidth(final long handle);
+  private native void setMaxMergeWidth(
+      final long handle, final int maxMergeWidth);
+  private native int maxMergeWidth(final long handle);
+  private native void setMaxSizeAmplificationPercent(
+      final long handle, final int maxSizeAmplificationPercent);
+  private native int maxSizeAmplificationPercent(final long handle);
+  private native void setCompressionSizePercent(
+      final long handle, final int compressionSizePercent);
+  private native int compressionSizePercent(final long handle);
+  private native void setStopStyle(
+      final long handle, final byte stopStyle);
+  private native byte stopStyle(final long handle);
+  private native void setAllowTrivialMove(
+      final long handle, final boolean allowTrivialMove);
+  private native boolean allowTrivialMove(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionPriority.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionPriority.java
new file mode 100644
index 000000000..eda05942e
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionPriority.java
@@ -0,0 +1,81 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Compaction Priorities
+ */
+public enum CompactionPriority {
+
+  /**
+   * Slightly Prioritize larger files by size compensated by #deletes
+   */
+  ByCompensatedSize((byte)0x0),
+
+  /**
+   * First compact files whose data's latest update time is oldest.
+   * Try this if you only update some hot keys in small ranges.
+   */
+  OldestLargestSeqFirst((byte)0x1),
+
+  /**
+   * First compact files whose range hasn't been compacted to the next level
+   * for the longest. If your updates are random across the key space,
+   * write amplification is slightly better with this option.
+   */
+  OldestSmallestSeqFirst((byte)0x2),
+
+  /**
+   * First compact files whose ratio between overlapping size in next level
+   * and its size is the smallest. It in many cases can optimize write
+   * amplification.
+   */
+  MinOverlappingRatio((byte)0x3),
+
+  /**
+   * Keeps a cursor(s) of the successor of the file (key range) was/were
+   * compacted before, and always picks the next files (key range) in that
+   * level. The file picking process will cycle through all the files in a
+   * round-robin manner.
+   */
+  RoundRobin((byte)0x4);
+
+
+  private final byte value;
+
+  CompactionPriority(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get CompactionPriority by byte value.
+   *
+   * @param value byte representation of CompactionPriority.
+   *
+   * @return {@link org.rocksdb.CompactionPriority} instance or null.
+   * @throws java.lang.IllegalArgumentException if an invalid
+   *     value is provided.
+   */
+  public static CompactionPriority getCompactionPriority(final byte value) {
+    for (final CompactionPriority compactionPriority :
+        CompactionPriority.values()) {
+      if (compactionPriority.getValue() == value){
+        return compactionPriority;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Illegal value provided for CompactionPriority.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java
new file mode 100644
index 000000000..24e234450
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java
@@ -0,0 +1,125 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public enum CompactionReason {
+  kUnknown((byte)0x0),
+
+  /**
+   * [Level] number of L0 files &gt; level0_file_num_compaction_trigger
+   */
+  kLevelL0FilesNum((byte)0x1),
+
+  /**
+   * [Level] total size of level &gt; MaxBytesForLevel()
+   */
+  kLevelMaxLevelSize((byte)0x2),
+
+  /**
+   * [Universal] Compacting for size amplification
+   */
+  kUniversalSizeAmplification((byte)0x3),
+
+  /**
+   * [Universal] Compacting for size ratio
+   */
+  kUniversalSizeRatio((byte)0x4),
+
+  /**
+   * [Universal] number of sorted runs &gt; level0_file_num_compaction_trigger
+   */
+  kUniversalSortedRunNum((byte)0x5),
+
+  /**
+   * [FIFO] total size &gt; max_table_files_size
+   */
+  kFIFOMaxSize((byte)0x6),
+
+  /**
+   * [FIFO] reduce number of files.
+   */
+  kFIFOReduceNumFiles((byte)0x7),
+
+  /**
+   * [FIFO] files with creation time &lt; (current_time - interval)
+   */
+  kFIFOTtl((byte)0x8),
+
+  /**
+   * Manual compaction
+   */
+  kManualCompaction((byte)0x9),
+
+  /**
+   * DB::SuggestCompactRange() marked files for compaction
+   */
+  kFilesMarkedForCompaction((byte)0x10),
+
+  /**
+   * [Level] Automatic compaction within bottommost level to cleanup duplicate
+   * versions of same user key, usually due to a released snapshot.
+   */
+  kBottommostFiles((byte)0x0A),
+
+  /**
+   * Compaction based on TTL
+   */
+  kTtl((byte)0x0B),
+
+  /**
+   * According to the comments in flush_job.cc, RocksDB treats flush as
+   * a level 0 compaction in internal stats.
+   */
+  kFlush((byte)0x0C),
+
+  /**
+   * Compaction caused by external sst file ingestion
+   */
+  kExternalSstIngestion((byte) 0x0D),
+
+  /**
+   * Compaction due to SST file being too old
+   */
+  kPeriodicCompaction((byte) 0x0E),
+
+  /**
+   * Compaction in order to move files to temperature
+   */
+  kChangeTemperature((byte) 0x0F);
+
+  private final byte value;
+
+  CompactionReason(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the CompactionReason from the internal representation value.
+   *
+   * @return the compaction reason.
+   *
+   * @throws IllegalArgumentException if the value is unknown.
+   */
+  static CompactionReason fromValue(final byte value) {
+    for (final CompactionReason compactionReason : CompactionReason.values()) {
+      if(compactionReason.value == value) {
+        return compactionReason;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for CompactionReason: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionStopStyle.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionStopStyle.java
new file mode 100644
index 000000000..f6e63209c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionStopStyle.java
@@ -0,0 +1,55 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+/**
+ * Algorithm used to make a compaction request stop picking new files
+ * into a single compaction run
+ */
+public enum CompactionStopStyle {
+
+  /**
+   * Pick files of similar size
+   */
+  CompactionStopStyleSimilarSize((byte)0x0),
+
+  /**
+   * Total size of picked files &gt; next file
+   */
+  CompactionStopStyleTotalSize((byte)0x1);
+
+
+  private final byte value;
+
+  CompactionStopStyle(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get CompactionStopStyle by byte value.
+   *
+   * @param value byte representation of CompactionStopStyle.
+   *
+   * @return {@link org.rocksdb.CompactionStopStyle} instance or null.
+   * @throws java.lang.IllegalArgumentException if an invalid
+   *     value is provided.
+   */
+  public static CompactionStopStyle getCompactionStopStyle(final byte value) {
+    for (final CompactionStopStyle compactionStopStyle :
+        CompactionStopStyle.values()) {
+      if (compactionStopStyle.getValue() == value){
+        return compactionStopStyle;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Illegal value provided for CompactionStopStyle.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java
new file mode 100644
index 000000000..b24bbf850
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java
@@ -0,0 +1,80 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * Enum CompactionStyle
+ *
+ * RocksDB supports different styles of compaction. Available
+ * compaction styles can be chosen using this enumeration.
+ *
+ * <ol>
+ *   <li><strong>LEVEL</strong> - Level based Compaction style</li>
+ *   <li><strong>UNIVERSAL</strong> - Universal Compaction Style is a
+ *   compaction style, targeting the use cases requiring lower write
+ *   amplification, trading off read amplification and space
+ *   amplification.</li>
+ *   <li><strong>FIFO</strong> - FIFO compaction style is the simplest
+ *   compaction strategy. It is suited for keeping event log data with
+ *   very low overhead (query log for example). It periodically deletes
+ *   the old data, so it's basically a TTL compaction style.</li>
+ *   <li><strong>NONE</strong> - Disable background compaction.
+ *   Compaction jobs are submitted
+ *   {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, CompactionJobInfo)} ()}.</li>
+ * </ol>
+ *
+ * @see <a
+ * href="https://github.com/facebook/rocksdb/wiki/Universal-Compaction">
+ * Universal Compaction</a>
+ * @see <a
+ * href="https://github.com/facebook/rocksdb/wiki/FIFO-compaction-style">
+ * FIFO Compaction</a>
+ */
+public enum CompactionStyle {
+  LEVEL((byte) 0x0),
+  UNIVERSAL((byte) 0x1),
+  FIFO((byte) 0x2),
+  NONE((byte) 0x3);
+
+  private final byte value;
+
+  CompactionStyle(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value.
+   */
+  //TODO(AR) should be made package-private
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the Compaction style from the internal representation value.
+   *
+   * @param value the internal representation value.
+   *
+   * @return the Compaction style
+   *
+   * @throws IllegalArgumentException if the value does not match a
+   *     CompactionStyle
+   */
+  static CompactionStyle fromValue(final byte value)
+      throws IllegalArgumentException {
+    for (final CompactionStyle compactionStyle : CompactionStyle.values()) {
+      if (compactionStyle.value == value) {
+        return compactionStyle;
+      }
+    }
+    throw new IllegalArgumentException("Unknown value for CompactionStyle: "
+        + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java
new file mode 100644
index 000000000..8c3162858
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java
@@ -0,0 +1,133 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * This class controls the behaviour
+ * of Java implementations of
+ * AbstractComparator
+ *
+ * Note that dispose() must be called before a ComparatorOptions
+ * instance becomes out-of-scope to release the allocated memory in C++.
+ */
+public class ComparatorOptions extends RocksObject {
+  public ComparatorOptions() {
+    super(newComparatorOptions());
+  }
+
+  /**
+   * Get the synchronisation type used to guard the reused buffers.
+   * Only used if {@link #maxReusedBufferSize()} &gt; 0
+   * Default: {@link ReusedSynchronisationType#ADAPTIVE_MUTEX}
+   *
+   * @return the synchronisation type
+   */
+  public ReusedSynchronisationType reusedSynchronisationType() {
+    assert(isOwningHandle());
+    return ReusedSynchronisationType.getReusedSynchronisationType(
+        reusedSynchronisationType(nativeHandle_));
+  }
+
+  /**
+   * Set the synchronisation type used to guard the reused buffers.
+   * Only used if {@link #maxReusedBufferSize()} &gt; 0
+   * Default: {@link ReusedSynchronisationType#ADAPTIVE_MUTEX}
+   *
+   * @param reusedSynchronisationType the synchronisation type
+   *
+   * @return the reference to the current comparator options.
+   */
+  public ComparatorOptions setReusedSynchronisationType(
+      final ReusedSynchronisationType reusedSynchronisationType) {
+    assert (isOwningHandle());
+    setReusedSynchronisationType(nativeHandle_,
+        reusedSynchronisationType.getValue());
+    return this;
+  }
+
+  /**
+   * Indicates if a direct byte buffer (i.e. outside of the normal
+   * garbage-collected heap) is used, as opposed to a non-direct byte buffer
+   * which is a wrapper around an on-heap byte[].
+   *
+   * Default: true
+   *
+   * @return true if a direct byte buffer will be used, false otherwise
+   */
+  public boolean useDirectBuffer() {
+    assert(isOwningHandle());
+    return useDirectBuffer(nativeHandle_);
+  }
+
+  /**
+   * Controls whether a direct byte buffer (i.e. outside of the normal
+   * garbage-collected heap) is used, as opposed to a non-direct byte buffer
+   * which is a wrapper around an on-heap byte[].
+   *
+   * Default: true
+   *
+   * @param useDirectBuffer true if a direct byte buffer should be used,
+   *     false otherwise
+   * @return the reference to the current comparator options.
+   */
+  public ComparatorOptions setUseDirectBuffer(final boolean useDirectBuffer) {
+    assert(isOwningHandle());
+    setUseDirectBuffer(nativeHandle_, useDirectBuffer);
+    return this;
+  }
+
+  /**
+   * Maximum size of a buffer (in bytes) that will be reused.
+   * Comparators will use 5 of these buffers,
+   * so the retained memory size will be 5 * max_reused_buffer_size.
+   * When a buffer is needed for transferring data to a callback,
+   * if it requires less than {@code maxReuseBufferSize}, then an
+   * existing buffer will be reused, else a new buffer will be
+   * allocated just for that callback.
+   *
+   * Default: 64 bytes
+   *
+   * @return the maximum size of a buffer which is reused,
+   *     or 0 if reuse is disabled
+   */
+  public int maxReusedBufferSize() {
+    assert(isOwningHandle());
+    return maxReusedBufferSize(nativeHandle_);
+  }
+
+  /**
+   * Sets the maximum size of a buffer (in bytes) that will be reused.
+   * Comparators will use 5 of these buffers,
+   * so the retained memory size will be 5 * max_reused_buffer_size.
+   * When a buffer is needed for transferring data to a callback,
+   * if it requires less than {@code maxReuseBufferSize}, then an
+   * existing buffer will be reused, else a new buffer will be
+   * allocated just for that callback.
+   *
+   * Default: 64 bytes
+   *
+   * @param maxReusedBufferSize the maximum size for a buffer to reuse, or 0 to
+   *     disable reuse
+   *
+   * @return the maximum size of a buffer which is reused
+   */
+  public ComparatorOptions setMaxReusedBufferSize(final int maxReusedBufferSize) {
+    assert(isOwningHandle());
+    setMaxReusedBufferSize(nativeHandle_, maxReusedBufferSize);
+    return this;
+  }
+
+  private native static long newComparatorOptions();
+  private native byte reusedSynchronisationType(final long handle);
+  private native void setReusedSynchronisationType(final long handle,
+      final byte reusedSynchronisationType);
+  private native boolean useDirectBuffer(final long handle);
+  private native void setUseDirectBuffer(final long handle,
+      final boolean useDirectBuffer);
+  private native int maxReusedBufferSize(final long handle);
+  private native void setMaxReusedBufferSize(final long handle,
+      final int maxReuseBufferSize);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ComparatorType.java b/src/rocksdb/java/src/main/java/org/rocksdb/ComparatorType.java
new file mode 100644
index 000000000..199980b6e
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ComparatorType.java
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+enum ComparatorType {
+  JAVA_COMPARATOR((byte)0x0),
+  JAVA_NATIVE_COMPARATOR_WRAPPER((byte)0x1);
+
+  private final byte value;
+
+  ComparatorType(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * <p>Returns the byte value of the enumerations value.</p>
+   *
+   * @return byte representation
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * <p>Get the ComparatorType enumeration value by
+   * passing the byte identifier to this method.</p>
+   *
+   * @param byteIdentifier of ComparatorType.
+   *
+   * @return ComparatorType instance.
+   *
+   * @throws IllegalArgumentException if the comparator type for the byteIdentifier
+   *     cannot be found
+   */
+  static ComparatorType getComparatorType(final byte byteIdentifier) {
+    for (final ComparatorType comparatorType : ComparatorType.values()) {
+      if (comparatorType.getValue() == byteIdentifier) {
+        return comparatorType;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for ComparatorType.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompressionOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompressionOptions.java
new file mode 100644
index 000000000..a9072bbb9
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompressionOptions.java
@@ -0,0 +1,151 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Options for Compression
+ */
+public class CompressionOptions extends RocksObject {
+
+  public CompressionOptions() {
+    super(newCompressionOptions());
+  }
+
+  public CompressionOptions setWindowBits(final int windowBits) {
+    setWindowBits(nativeHandle_, windowBits);
+    return this;
+  }
+
+  public int windowBits() {
+    return windowBits(nativeHandle_);
+  }
+
+  public CompressionOptions setLevel(final int level) {
+    setLevel(nativeHandle_, level);
+    return this;
+  }
+
+  public int level() {
+    return level(nativeHandle_);
+  }
+
+  public CompressionOptions setStrategy(final int strategy) {
+    setStrategy(nativeHandle_, strategy);
+    return this;
+  }
+
+  public int strategy() {
+    return strategy(nativeHandle_);
+  }
+
+  /**
+   * Maximum size of dictionary used to prime the compression library. Currently
+   * this dictionary will be constructed by sampling the first output file in a
+   * subcompaction when the target level is bottommost. This dictionary will be
+   * loaded into the compression library before compressing/uncompressing each
+   * data block of subsequent files in the subcompaction. Effectively, this
+   * improves compression ratios when there are repetitions across data blocks.
+   *
+   * A value of 0 indicates the feature is disabled.
+   *
+   * Default: 0.
+   *
+   * @param maxDictBytes Maximum bytes to use for the dictionary
+   *
+   * @return the reference to the current options
+   */
+  public CompressionOptions setMaxDictBytes(final int maxDictBytes) {
+    setMaxDictBytes(nativeHandle_, maxDictBytes);
+    return this;
+  }
+
+  /**
+   * Maximum size of dictionary used to prime the compression library.
+   *
+   * @return The maximum bytes to use for the dictionary
+   */
+  public int maxDictBytes() {
+    return maxDictBytes(nativeHandle_);
+  }
+
+  /**
+   * Maximum size of training data passed to zstd's dictionary trainer. Using
+   * zstd's dictionary trainer can achieve even better compression ratio
+   * improvements than using {@link #setMaxDictBytes(int)} alone.
+   *
+   * The training data will be used to generate a dictionary
+   * of {@link #maxDictBytes()}.
+   *
+   * Default: 0.
+   *
+   * @param zstdMaxTrainBytes Maximum bytes to use for training ZStd.
+   *
+   * @return the reference to the current options
+   */
+  public CompressionOptions setZStdMaxTrainBytes(final int zstdMaxTrainBytes) {
+    setZstdMaxTrainBytes(nativeHandle_, zstdMaxTrainBytes);
+    return this;
+  }
+
+  /**
+   * Maximum size of training data passed to zstd's dictionary trainer.
+   *
+   * @return Maximum bytes to use for training ZStd
+   */
+  public int zstdMaxTrainBytes() {
+    return zstdMaxTrainBytes(nativeHandle_);
+  }
+
+  /**
+   * When the compression options are set by the user, it will be set to "true".
+   * For bottommost_compression_opts, to enable it, user must set enabled=true.
+   * Otherwise, bottommost compression will use compression_opts as default
+   * compression options.
+   *
+   * For compression_opts, if compression_opts.enabled=false, it is still
+   * used as compression options for compression process.
+   *
+   * Default: false.
+   *
+   * @param enabled true to use these compression options
+   *     for the bottommost_compression_opts, false otherwise
+   *
+   * @return the reference to the current options
+   */
+  public CompressionOptions setEnabled(final boolean enabled) {
+    setEnabled(nativeHandle_, enabled);
+    return this;
+  }
+
+  /**
+   * Determine whether these compression options
+   * are used for the bottommost_compression_opts.
+   *
+   * @return true if these compression options are used
+   *     for the bottommost_compression_opts, false otherwise
+   */
+  public boolean enabled() {
+    return enabled(nativeHandle_);
+  }
+
+
+  private native static long newCompressionOptions();
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native void setWindowBits(final long handle, final int windowBits);
+  private native int windowBits(final long handle);
+  private native void setLevel(final long handle, final int level);
+  private native int level(final long handle);
+  private native void setStrategy(final long handle, final int strategy);
+  private native int strategy(final long handle);
+  private native void setMaxDictBytes(final long handle, final int maxDictBytes);
+  private native int maxDictBytes(final long handle);
+  private native void setZstdMaxTrainBytes(final long handle,
+      final int zstdMaxTrainBytes);
+  private native int zstdMaxTrainBytes(final long handle);
+  private native void setEnabled(final long handle, final boolean enabled);
+  private native boolean enabled(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java
new file mode 100644
index 000000000..d1d73d51a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Enum CompressionType
+ *
+ * <p>DB contents are stored in a set of blocks, each of which holds a
+ * sequence of key,value pairs. Each block may be compressed before
+ * being stored in a file. The following enum describes which
+ * compression method (if any) is used to compress a block.</p>
+ */
+public enum CompressionType {
+  NO_COMPRESSION((byte) 0x0, null, "kNoCompression"),
+  SNAPPY_COMPRESSION((byte) 0x1, "snappy", "kSnappyCompression"),
+  ZLIB_COMPRESSION((byte) 0x2, "z", "kZlibCompression"),
+  BZLIB2_COMPRESSION((byte) 0x3, "bzip2", "kBZip2Compression"),
+  LZ4_COMPRESSION((byte) 0x4, "lz4", "kLZ4Compression"),
+  LZ4HC_COMPRESSION((byte) 0x5, "lz4hc", "kLZ4HCCompression"),
+  XPRESS_COMPRESSION((byte) 0x6, "xpress", "kXpressCompression"),
+  ZSTD_COMPRESSION((byte) 0x7, "zstd", "kZSTD"),
+  DISABLE_COMPRESSION_OPTION((byte) 0x7F, null, "kDisableCompressionOption");
+
+  /**
+   * <p>Get the CompressionType enumeration value by
+   * passing the library name to this method.</p>
+   *
+   * <p>If library cannot be found the enumeration
+   * value {@code NO_COMPRESSION} will be returned.</p>
+   *
+   * @param libraryName compression library name.
+   *
+   * @return CompressionType instance.
+   */
+  public static CompressionType getCompressionType(String libraryName) {
+    if (libraryName != null) {
+      for (CompressionType compressionType : CompressionType.values()) {
+        if (compressionType.getLibraryName() != null &&
+            compressionType.getLibraryName().equals(libraryName)) {
+          return compressionType;
+        }
+      }
+    }
+    return CompressionType.NO_COMPRESSION;
+  }
+
+  /**
+   * <p>Get the CompressionType enumeration value by
+   * passing the byte identifier to this method.</p>
+   *
+   * @param byteIdentifier of CompressionType.
+   *
+   * @return CompressionType instance.
+   *
+   * @throws IllegalArgumentException If CompressionType cannot be found for the
+   *   provided byteIdentifier
+   */
+  public static CompressionType getCompressionType(byte byteIdentifier) {
+    for (final CompressionType compressionType : CompressionType.values()) {
+      if (compressionType.getValue() == byteIdentifier) {
+        return compressionType;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for CompressionType.");
+  }
+
+  /**
+   * <p>Get a CompressionType value based on the string key in the C++ options output.
+   * This gets used in support of getting options into Java from an options string,
+   * which is generated at the C++ level.
+   * </p>
+   *
+   * @param internalName the internal (C++) name by which the option is known.
+   *
+   * @return CompressionType instance (optional)
+   */
+  static CompressionType getFromInternal(final String internalName) {
+    for (final CompressionType compressionType : CompressionType.values()) {
+      if (compressionType.internalName_.equals(internalName)) {
+        return compressionType;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal internalName '" + internalName + " ' provided for CompressionType.");
+  }
+
+  /**
+   * <p>Returns the byte value of the enumerations value.</p>
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  /**
+   * <p>Returns the library name of the compression type
+   * identified by the enumeration value.</p>
+   *
+   * @return library name
+   */
+  public String getLibraryName() {
+    return libraryName_;
+  }
+
+  CompressionType(final byte value, final String libraryName, final String internalName) {
+    value_ = value;
+    libraryName_ = libraryName;
+    internalName_ = internalName;
+  }
+
+  private final byte value_;
+  private final String libraryName_;
+  private final String internalName_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java b/src/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java
new file mode 100644
index 000000000..b4e34303b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java
@@ -0,0 +1,44 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public abstract class ConcurrentTaskLimiter extends RocksObject {
+  protected ConcurrentTaskLimiter(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Returns a name that identifies this concurrent task limiter.
+   *
+   * @return Concurrent task limiter name.
+   */
+  public abstract String name();
+
+  /**
+   * Set max concurrent tasks.<br>
+   * limit = 0 means no new task allowed.<br>
+   * limit &lt; 0 means no limitation.
+   *
+   * @param maxOutstandinsTask max concurrent tasks.
+   * @return the reference to the current instance of ConcurrentTaskLimiter.
+   */
+  public abstract ConcurrentTaskLimiter setMaxOutstandingTask(final int maxOutstandinsTask);
+
+  /**
+   * Reset to unlimited max concurrent task.
+   *
+   * @return the reference to the current instance of ConcurrentTaskLimiter.
+   */
+  public abstract ConcurrentTaskLimiter resetMaxOutstandingTask();
+
+  /**
+   * Returns current outstanding task count.
+   *
+   * @return current outstanding task count.
+   */
+  public abstract int outstandingTask();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java b/src/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java
new file mode 100644
index 000000000..d28b9060a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java
@@ -0,0 +1,48 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class ConcurrentTaskLimiterImpl extends ConcurrentTaskLimiter {
+  public ConcurrentTaskLimiterImpl(final String name, final int maxOutstandingTask) {
+    super(newConcurrentTaskLimiterImpl0(name, maxOutstandingTask));
+  }
+
+  @Override
+  public String name() {
+    assert (isOwningHandle());
+    return name(nativeHandle_);
+  }
+
+  @Override
+  public ConcurrentTaskLimiter setMaxOutstandingTask(final int maxOutstandingTask) {
+    assert (isOwningHandle());
+    setMaxOutstandingTask(nativeHandle_, maxOutstandingTask);
+    return this;
+  }
+
+  @Override
+  public ConcurrentTaskLimiter resetMaxOutstandingTask() {
+    assert (isOwningHandle());
+    resetMaxOutstandingTask(nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public int outstandingTask() {
+    assert (isOwningHandle());
+    return outstandingTask(nativeHandle_);
+  }
+
+  private static native long newConcurrentTaskLimiterImpl0(
+      final String name, final int maxOutstandingTask);
+  private static native String name(final long handle);
+  private static native void setMaxOutstandingTask(final long handle, final int limit);
+  private static native void resetMaxOutstandingTask(final long handle);
+  private static native int outstandingTask(final long handle);
+
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java
new file mode 100644
index 000000000..4d93f0c99
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java
@@ -0,0 +1,53 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class ConfigOptions extends RocksObject {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Construct with default Options
+   */
+  public ConfigOptions() {
+    super(newConfigOptions());
+  }
+
+  public ConfigOptions setDelimiter(final String delimiter) {
+    setDelimiter(nativeHandle_, delimiter);
+    return this;
+  }
+  public ConfigOptions setIgnoreUnknownOptions(final boolean ignore) {
+    setIgnoreUnknownOptions(nativeHandle_, ignore);
+    return this;
+  }
+
+  public ConfigOptions setEnv(final Env env) {
+    setEnv(nativeHandle_, env.nativeHandle_);
+    return this;
+  }
+
+  public ConfigOptions setInputStringsEscaped(final boolean escaped) {
+    setInputStringsEscaped(nativeHandle_, escaped);
+    return this;
+  }
+
+  public ConfigOptions setSanityLevel(final SanityLevel level) {
+    setSanityLevel(nativeHandle_, level.getValue());
+    return this;
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native static long newConfigOptions();
+  private native static void setEnv(final long handle, final long envHandle);
+  private native static void setDelimiter(final long handle, final String delimiter);
+  private native static void setIgnoreUnknownOptions(final long handle, final boolean ignore);
+  private native static void setInputStringsEscaped(final long handle, final boolean escaped);
+  private native static void setSanityLevel(final long handle, final byte level);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java
new file mode 100644
index 000000000..543222262
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java
@@ -0,0 +1,1495 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.file.Paths;
+import java.util.*;
+
+/**
+ * DBOptions to control the behavior of a database.  It will be used
+ * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()).
+ *
+ * As a descendent of {@link AbstractNativeReference}, this class is {@link AutoCloseable}
+ * and will be automatically released if opened in the preamble of a try with resources block.
+ */
+public class DBOptions extends RocksObject
+    implements DBOptionsInterface<DBOptions>,
+    MutableDBOptionsInterface<DBOptions> {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Construct DBOptions.
+   *
+   * This constructor will create (by allocating a block of memory)
+   * an {@code rocksdb::DBOptions} in the c++ side.
+   */
+  public DBOptions() {
+    super(newDBOptions());
+    numShardBits_ = DEFAULT_NUM_SHARD_BITS;
+  }
+
+  /**
+   * Copy constructor for DBOptions.
+   *
+   * NOTE: This does a shallow copy, which means env, rate_limiter, sst_file_manager,
+   * info_log and other pointers will be cloned!
+   *
+   * @param other The DBOptions to copy.
+   */
+  public DBOptions(DBOptions other) {
+    super(copyDBOptions(other.nativeHandle_));
+    this.env_ = other.env_;
+    this.numShardBits_ = other.numShardBits_;
+    this.rateLimiter_ = other.rateLimiter_;
+    this.rowCache_ = other.rowCache_;
+    this.walFilter_ = other.walFilter_;
+    this.writeBufferManager_ = other.writeBufferManager_;
+  }
+
+  /**
+   * Constructor from Options
+   *
+   * @param options The options.
+   */
+  public DBOptions(final Options options) {
+    super(newDBOptionsFromOptions(options.nativeHandle_));
+  }
+
+  /**
+   * <p>Method to get a options instance by using pre-configured
+   * property values. If one or many values are undefined in
+   * the context of RocksDB the method will return a null
+   * value.</p>
+   *
+   * <p><strong>Note</strong>: Property keys can be derived from
+   * getter methods within the options class. Example: the method
+   * {@code allowMmapReads()} has a property key:
+   * {@code allow_mmap_reads}.</p>
+   *
+   * @param cfgOpts The ConfigOptions to control how the string is processed.
+   * @param properties {@link java.util.Properties} instance.
+   *
+   * @return {@link org.rocksdb.DBOptions instance}
+   *     or null.
+   *
+   * @throws java.lang.IllegalArgumentException if null or empty
+   *     {@link java.util.Properties} instance is passed to the method call.
+   */
+  public static DBOptions getDBOptionsFromProps(
+      final ConfigOptions cfgOpts, final Properties properties) {
+    DBOptions dbOptions = null;
+    final String optionsString = Options.getOptionStringFromProps(properties);
+    final long handle = getDBOptionsFromProps(cfgOpts.nativeHandle_, optionsString);
+    if (handle != 0) {
+      dbOptions = new DBOptions(handle);
+    }
+    return dbOptions;
+  }
+
+  /**
+   * <p>Method to get a options instance by using pre-configured
+   * property values. If one or many values are undefined in
+   * the context of RocksDB the method will return a null
+   * value.</p>
+   *
+   * <p><strong>Note</strong>: Property keys can be derived from
+   * getter methods within the options class. Example: the method
+   * {@code allowMmapReads()} has a property key:
+   * {@code allow_mmap_reads}.</p>
+   *
+   * @param properties {@link java.util.Properties} instance.
+   *
+   * @return {@link org.rocksdb.DBOptions instance}
+   *     or null.
+   *
+   * @throws java.lang.IllegalArgumentException if null or empty
+   *     {@link java.util.Properties} instance is passed to the method call.
+   */
+  public static DBOptions getDBOptionsFromProps(final Properties properties) {
+    DBOptions dbOptions = null;
+    final String optionsString = Options.getOptionStringFromProps(properties);
+    final long handle = getDBOptionsFromProps(optionsString);
+    if (handle != 0) {
+      dbOptions = new DBOptions(handle);
+    }
+    return dbOptions;
+  }
+
+  @Override
+  public DBOptions optimizeForSmallDb() {
+    optimizeForSmallDb(nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public DBOptions setIncreaseParallelism(
+      final int totalThreads) {
+    assert(isOwningHandle());
+    setIncreaseParallelism(nativeHandle_, totalThreads);
+    return this;
+  }
+
+  @Override
+  public DBOptions setCreateIfMissing(final boolean flag) {
+    assert(isOwningHandle());
+    setCreateIfMissing(nativeHandle_, flag);
+    return this;
+  }
+
+  @Override
+  public boolean createIfMissing() {
+    assert(isOwningHandle());
+    return createIfMissing(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setCreateMissingColumnFamilies(
+      final boolean flag) {
+    assert(isOwningHandle());
+    setCreateMissingColumnFamilies(nativeHandle_, flag);
+    return this;
+  }
+
+  @Override
+  public boolean createMissingColumnFamilies() {
+    assert(isOwningHandle());
+    return createMissingColumnFamilies(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setErrorIfExists(
+      final boolean errorIfExists) {
+    assert(isOwningHandle());
+    setErrorIfExists(nativeHandle_, errorIfExists);
+    return this;
+  }
+
+  @Override
+  public boolean errorIfExists() {
+    assert(isOwningHandle());
+    return errorIfExists(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setParanoidChecks(
+      final boolean paranoidChecks) {
+    assert(isOwningHandle());
+    setParanoidChecks(nativeHandle_, paranoidChecks);
+    return this;
+  }
+
+  @Override
+  public boolean paranoidChecks() {
+    assert(isOwningHandle());
+    return paranoidChecks(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setEnv(final Env env) {
+    setEnv(nativeHandle_, env.nativeHandle_);
+    this.env_ = env;
+    return this;
+  }
+
+  @Override
+  public Env getEnv() {
+    return env_;
+  }
+
+  @Override
+  public DBOptions setRateLimiter(final RateLimiter rateLimiter) {
+    assert(isOwningHandle());
+    rateLimiter_ = rateLimiter;
+    setRateLimiter(nativeHandle_, rateLimiter.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public DBOptions setSstFileManager(final SstFileManager sstFileManager) {
+    assert(isOwningHandle());
+    setSstFileManager(nativeHandle_, sstFileManager.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public DBOptions setLogger(final Logger logger) {
+    assert(isOwningHandle());
+    setLogger(nativeHandle_, logger.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public DBOptions setInfoLogLevel(
+      final InfoLogLevel infoLogLevel) {
+    assert(isOwningHandle());
+    setInfoLogLevel(nativeHandle_, infoLogLevel.getValue());
+    return this;
+  }
+
+  @Override
+  public InfoLogLevel infoLogLevel() {
+    assert(isOwningHandle());
+    return InfoLogLevel.getInfoLogLevel(
+        infoLogLevel(nativeHandle_));
+  }
+
+  @Override
+  public DBOptions setMaxOpenFiles(
+      final int maxOpenFiles) {
+    assert(isOwningHandle());
+    setMaxOpenFiles(nativeHandle_, maxOpenFiles);
+    return this;
+  }
+
+  @Override
+  public int maxOpenFiles() {
+    assert(isOwningHandle());
+    return maxOpenFiles(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxFileOpeningThreads(final int maxFileOpeningThreads) {
+    assert(isOwningHandle());
+    setMaxFileOpeningThreads(nativeHandle_, maxFileOpeningThreads);
+    return this;
+  }
+
+  @Override
+  public int maxFileOpeningThreads() {
+    assert(isOwningHandle());
+    return maxFileOpeningThreads(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxTotalWalSize(
+      final long maxTotalWalSize) {
+    assert(isOwningHandle());
+    setMaxTotalWalSize(nativeHandle_, maxTotalWalSize);
+    return this;
+  }
+
+  @Override
+  public long maxTotalWalSize() {
+    assert(isOwningHandle());
+    return maxTotalWalSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setStatistics(final Statistics statistics) {
+    assert(isOwningHandle());
+    setStatistics(nativeHandle_, statistics.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Statistics statistics() {
+    assert(isOwningHandle());
+    final long statisticsNativeHandle = statistics(nativeHandle_);
+    if(statisticsNativeHandle == 0) {
+      return null;
+    } else {
+      return new Statistics(statisticsNativeHandle);
+    }
+  }
+
+  @Override
+  public DBOptions setUseFsync(
+      final boolean useFsync) {
+    assert(isOwningHandle());
+    setUseFsync(nativeHandle_, useFsync);
+    return this;
+  }
+
+  @Override
+  public boolean useFsync() {
+    assert(isOwningHandle());
+    return useFsync(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setDbPaths(final Collection<DbPath> dbPaths) {
+    assert(isOwningHandle());
+
+    final int len = dbPaths.size();
+    final String[] paths = new String[len];
+    final long[] targetSizes = new long[len];
+
+    int i = 0;
+    for(final DbPath dbPath : dbPaths) {
+      paths[i] = dbPath.path.toString();
+      targetSizes[i] = dbPath.targetSize;
+      i++;
+    }
+    setDbPaths(nativeHandle_, paths, targetSizes);
+    return this;
+  }
+
+  @Override
+  public List<DbPath> dbPaths() {
+    final int len = (int)dbPathsLen(nativeHandle_);
+    if(len == 0) {
+      return Collections.emptyList();
+    } else {
+      final String[] paths = new String[len];
+      final long[] targetSizes = new long[len];
+
+      dbPaths(nativeHandle_, paths, targetSizes);
+
+      final List<DbPath> dbPaths = new ArrayList<>();
+      for(int i = 0; i < len; i++) {
+        dbPaths.add(new DbPath(Paths.get(paths[i]), targetSizes[i]));
+      }
+      return dbPaths;
+    }
+  }
+
+  @Override
+  public DBOptions setDbLogDir(
+      final String dbLogDir) {
+    assert(isOwningHandle());
+    setDbLogDir(nativeHandle_, dbLogDir);
+    return this;
+  }
+
+  @Override
+  public String dbLogDir() {
+    assert(isOwningHandle());
+    return dbLogDir(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWalDir(
+      final String walDir) {
+    assert(isOwningHandle());
+    setWalDir(nativeHandle_, walDir);
+    return this;
+  }
+
+  @Override
+  public String walDir() {
+    assert(isOwningHandle());
+    return walDir(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setDeleteObsoleteFilesPeriodMicros(
+      final long micros) {
+    assert(isOwningHandle());
+    setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros);
+    return this;
+  }
+
+  @Override
+  public long deleteObsoleteFilesPeriodMicros() {
+    assert(isOwningHandle());
+    return deleteObsoleteFilesPeriodMicros(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxBackgroundJobs(final int maxBackgroundJobs) {
+    assert(isOwningHandle());
+    setMaxBackgroundJobs(nativeHandle_, maxBackgroundJobs);
+    return this;
+  }
+
+  @Override
+  public int maxBackgroundJobs() {
+    assert(isOwningHandle());
+    return maxBackgroundJobs(nativeHandle_);
+  }
+
+  @Override
+  @Deprecated
+  public DBOptions setMaxBackgroundCompactions(
+      final int maxBackgroundCompactions) {
+    assert(isOwningHandle());
+    setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions);
+    return this;
+  }
+
+  @Override
+  @Deprecated
+  public int maxBackgroundCompactions() {
+    assert(isOwningHandle());
+    return maxBackgroundCompactions(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxSubcompactions(final int maxSubcompactions) {
+    assert(isOwningHandle());
+    setMaxSubcompactions(nativeHandle_, maxSubcompactions);
+    return this;
+  }
+
+  @Override
+  public int maxSubcompactions() {
+    assert(isOwningHandle());
+    return maxSubcompactions(nativeHandle_);
+  }
+
+  @Override
+  @Deprecated
+  public DBOptions setMaxBackgroundFlushes(
+      final int maxBackgroundFlushes) {
+    assert(isOwningHandle());
+    setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes);
+    return this;
+  }
+
+  @Override
+  @Deprecated
+  public int maxBackgroundFlushes() {
+    assert(isOwningHandle());
+    return maxBackgroundFlushes(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxLogFileSize(final long maxLogFileSize) {
+    assert(isOwningHandle());
+    setMaxLogFileSize(nativeHandle_, maxLogFileSize);
+    return this;
+  }
+
+  @Override
+  public long maxLogFileSize() {
+    assert(isOwningHandle());
+    return maxLogFileSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setLogFileTimeToRoll(
+      final long logFileTimeToRoll) {
+    assert(isOwningHandle());
+    setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll);
+    return this;
+  }
+
+  @Override
+  public long logFileTimeToRoll() {
+    assert(isOwningHandle());
+    return logFileTimeToRoll(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setKeepLogFileNum(
+      final long keepLogFileNum) {
+    assert(isOwningHandle());
+    setKeepLogFileNum(nativeHandle_, keepLogFileNum);
+    return this;
+  }
+
+  @Override
+  public long keepLogFileNum() {
+    assert(isOwningHandle());
+    return keepLogFileNum(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setRecycleLogFileNum(final long recycleLogFileNum) {
+    assert(isOwningHandle());
+    setRecycleLogFileNum(nativeHandle_, recycleLogFileNum);
+    return this;
+  }
+
+  @Override
+  public long recycleLogFileNum() {
+    assert(isOwningHandle());
+    return recycleLogFileNum(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxManifestFileSize(
+      final long maxManifestFileSize) {
+    assert(isOwningHandle());
+    setMaxManifestFileSize(nativeHandle_, maxManifestFileSize);
+    return this;
+  }
+
+  @Override
+  public long maxManifestFileSize() {
+    assert(isOwningHandle());
+    return maxManifestFileSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setTableCacheNumshardbits(
+      final int tableCacheNumshardbits) {
+    assert(isOwningHandle());
+    setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits);
+    return this;
+  }
+
+  @Override
+  public int tableCacheNumshardbits() {
+    assert(isOwningHandle());
+    return tableCacheNumshardbits(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWalTtlSeconds(
+      final long walTtlSeconds) {
+    assert(isOwningHandle());
+    setWalTtlSeconds(nativeHandle_, walTtlSeconds);
+    return this;
+  }
+
+  @Override
+  public long walTtlSeconds() {
+    assert(isOwningHandle());
+    return walTtlSeconds(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWalSizeLimitMB(
+      final long sizeLimitMB) {
+    assert(isOwningHandle());
+    setWalSizeLimitMB(nativeHandle_, sizeLimitMB);
+    return this;
+  }
+
+  @Override
+  public long walSizeLimitMB() {
+    assert(isOwningHandle());
+    return walSizeLimitMB(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxWriteBatchGroupSizeBytes(final long maxWriteBatchGroupSizeBytes) {
+    setMaxWriteBatchGroupSizeBytes(nativeHandle_, maxWriteBatchGroupSizeBytes);
+    return this;
+  }
+
+  @Override
+  public long maxWriteBatchGroupSizeBytes() {
+    assert (isOwningHandle());
+    return maxWriteBatchGroupSizeBytes(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setManifestPreallocationSize(
+      final long size) {
+    assert(isOwningHandle());
+    setManifestPreallocationSize(nativeHandle_, size);
+    return this;
+  }
+
+  @Override
+  public long manifestPreallocationSize() {
+    assert(isOwningHandle());
+    return manifestPreallocationSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAllowMmapReads(
+      final boolean allowMmapReads) {
+    assert(isOwningHandle());
+    setAllowMmapReads(nativeHandle_, allowMmapReads);
+    return this;
+  }
+
+  @Override
+  public boolean allowMmapReads() {
+    assert(isOwningHandle());
+    return allowMmapReads(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAllowMmapWrites(
+      final boolean allowMmapWrites) {
+    assert(isOwningHandle());
+    setAllowMmapWrites(nativeHandle_, allowMmapWrites);
+    return this;
+  }
+
+  @Override
+  public boolean allowMmapWrites() {
+    assert(isOwningHandle());
+    return allowMmapWrites(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setUseDirectReads(
+      final boolean useDirectReads) {
+    assert(isOwningHandle());
+    setUseDirectReads(nativeHandle_, useDirectReads);
+    return this;
+  }
+
+  @Override
+  public boolean useDirectReads() {
+    assert(isOwningHandle());
+    return useDirectReads(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setUseDirectIoForFlushAndCompaction(
+      final boolean useDirectIoForFlushAndCompaction) {
+    assert(isOwningHandle());
+    setUseDirectIoForFlushAndCompaction(nativeHandle_,
+        useDirectIoForFlushAndCompaction);
+    return this;
+  }
+
+  @Override
+  public boolean useDirectIoForFlushAndCompaction() {
+    assert(isOwningHandle());
+    return useDirectIoForFlushAndCompaction(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAllowFAllocate(final boolean allowFAllocate) {
+    assert(isOwningHandle());
+    setAllowFAllocate(nativeHandle_, allowFAllocate);
+    return this;
+  }
+
+  @Override
+  public boolean allowFAllocate() {
+    assert(isOwningHandle());
+    return allowFAllocate(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setIsFdCloseOnExec(
+      final boolean isFdCloseOnExec) {
+    assert(isOwningHandle());
+    setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec);
+    return this;
+  }
+
+  @Override
+  public boolean isFdCloseOnExec() {
+    assert(isOwningHandle());
+    return isFdCloseOnExec(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setStatsDumpPeriodSec(
+      final int statsDumpPeriodSec) {
+    assert(isOwningHandle());
+    setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec);
+    return this;
+  }
+
+  @Override
+  public int statsDumpPeriodSec() {
+    assert(isOwningHandle());
+    return statsDumpPeriodSec(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setStatsPersistPeriodSec(
+      final int statsPersistPeriodSec) {
+    assert(isOwningHandle());
+    setStatsPersistPeriodSec(nativeHandle_, statsPersistPeriodSec);
+    return this;
+  }
+
+  @Override
+  public int statsPersistPeriodSec() {
+    assert(isOwningHandle());
+    return statsPersistPeriodSec(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setStatsHistoryBufferSize(
+      final long statsHistoryBufferSize) {
+    assert(isOwningHandle());
+    setStatsHistoryBufferSize(nativeHandle_, statsHistoryBufferSize);
+    return this;
+  }
+
+  @Override
+  public long statsHistoryBufferSize() {
+    assert(isOwningHandle());
+    return statsHistoryBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAdviseRandomOnOpen(
+      final boolean adviseRandomOnOpen) {
+    assert(isOwningHandle());
+    setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen);
+    return this;
+  }
+
+  @Override
+  public boolean adviseRandomOnOpen() {
+    return adviseRandomOnOpen(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setDbWriteBufferSize(final long dbWriteBufferSize) {
+    assert(isOwningHandle());
+    setDbWriteBufferSize(nativeHandle_, dbWriteBufferSize);
+    return this;
+  }
+
+  @Override
+  public DBOptions setWriteBufferManager(final WriteBufferManager writeBufferManager) {
+    assert(isOwningHandle());
+    setWriteBufferManager(nativeHandle_, writeBufferManager.nativeHandle_);
+    this.writeBufferManager_ = writeBufferManager;
+    return this;
+  }
+
+  @Override
+  public WriteBufferManager writeBufferManager() {
+    assert(isOwningHandle());
+    return this.writeBufferManager_;
+  }
+
+  @Override
+  public long dbWriteBufferSize() {
+    assert(isOwningHandle());
+    return dbWriteBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAccessHintOnCompactionStart(final AccessHint accessHint) {
+    assert(isOwningHandle());
+    setAccessHintOnCompactionStart(nativeHandle_, accessHint.getValue());
+    return this;
+  }
+
+  @Override
+  public AccessHint accessHintOnCompactionStart() {
+    assert(isOwningHandle());
+    return AccessHint.getAccessHint(accessHintOnCompactionStart(nativeHandle_));
+  }
+
+  @Override
+  public DBOptions setCompactionReadaheadSize(final long compactionReadaheadSize) {
+    assert(isOwningHandle());
+    setCompactionReadaheadSize(nativeHandle_, compactionReadaheadSize);
+    return this;
+  }
+
+  @Override
+  public long compactionReadaheadSize() {
+    assert(isOwningHandle());
+    return compactionReadaheadSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setRandomAccessMaxBufferSize(final long randomAccessMaxBufferSize) {
+    assert(isOwningHandle());
+    setRandomAccessMaxBufferSize(nativeHandle_, randomAccessMaxBufferSize);
+    return this;
+  }
+
+  @Override
+  public long randomAccessMaxBufferSize() {
+    assert(isOwningHandle());
+    return randomAccessMaxBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWritableFileMaxBufferSize(final long writableFileMaxBufferSize) {
+    assert(isOwningHandle());
+    setWritableFileMaxBufferSize(nativeHandle_, writableFileMaxBufferSize);
+    return this;
+  }
+
+  @Override
+  public long writableFileMaxBufferSize() {
+    assert(isOwningHandle());
+    return writableFileMaxBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setUseAdaptiveMutex(
+      final boolean useAdaptiveMutex) {
+    assert(isOwningHandle());
+    setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex);
+    return this;
+  }
+
+  @Override
+  public boolean useAdaptiveMutex() {
+    assert(isOwningHandle());
+    return useAdaptiveMutex(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setBytesPerSync(
+      final long bytesPerSync) {
+    assert(isOwningHandle());
+    setBytesPerSync(nativeHandle_, bytesPerSync);
+    return this;
+  }
+
+  @Override
+  public long bytesPerSync() {
+    return bytesPerSync(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWalBytesPerSync(final long walBytesPerSync) {
+    assert(isOwningHandle());
+    setWalBytesPerSync(nativeHandle_, walBytesPerSync);
+    return this;
+  }
+
+  @Override
+  public long walBytesPerSync() {
+    assert(isOwningHandle());
+    return walBytesPerSync(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setStrictBytesPerSync(final boolean strictBytesPerSync) {
+    assert(isOwningHandle());
+    setStrictBytesPerSync(nativeHandle_, strictBytesPerSync);
+    return this;
+  }
+
+  @Override
+  public boolean strictBytesPerSync() {
+    assert(isOwningHandle());
+    return strictBytesPerSync(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setListeners(final List<AbstractEventListener> listeners) {
+    assert (isOwningHandle());
+    setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners));
+    return this;
+  }
+
+  @Override
+  public List<AbstractEventListener> listeners() {
+    assert (isOwningHandle());
+    return Arrays.asList(eventListeners(nativeHandle_));
+  }
+
+  @Override
+  public DBOptions setEnableThreadTracking(final boolean enableThreadTracking) {
+    assert(isOwningHandle());
+    setEnableThreadTracking(nativeHandle_, enableThreadTracking);
+    return this;
+  }
+
+  @Override
+  public boolean enableThreadTracking() {
+    assert(isOwningHandle());
+    return enableThreadTracking(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setDelayedWriteRate(final long delayedWriteRate) {
+    assert(isOwningHandle());
+    setDelayedWriteRate(nativeHandle_, delayedWriteRate);
+    return this;
+  }
+
+  @Override
+  public long delayedWriteRate(){
+    return delayedWriteRate(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setEnablePipelinedWrite(final boolean enablePipelinedWrite) {
+    assert(isOwningHandle());
+    setEnablePipelinedWrite(nativeHandle_, enablePipelinedWrite);
+    return this;
+  }
+
+  @Override
+  public boolean enablePipelinedWrite() {
+    assert(isOwningHandle());
+    return enablePipelinedWrite(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setUnorderedWrite(final boolean unorderedWrite) {
+    setUnorderedWrite(nativeHandle_, unorderedWrite);
+    return this;
+  }
+
+  @Override
+  public boolean unorderedWrite() {
+    return unorderedWrite(nativeHandle_);
+  }
+
+
+  @Override
+  public DBOptions setAllowConcurrentMemtableWrite(
+      final boolean allowConcurrentMemtableWrite) {
+    setAllowConcurrentMemtableWrite(nativeHandle_,
+        allowConcurrentMemtableWrite);
+    return this;
+  }
+
+  @Override
+  public boolean allowConcurrentMemtableWrite() {
+    return allowConcurrentMemtableWrite(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setEnableWriteThreadAdaptiveYield(
+      final boolean enableWriteThreadAdaptiveYield) {
+    setEnableWriteThreadAdaptiveYield(nativeHandle_,
+        enableWriteThreadAdaptiveYield);
+    return this;
+  }
+
+  @Override
+  public boolean enableWriteThreadAdaptiveYield() {
+    return enableWriteThreadAdaptiveYield(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWriteThreadMaxYieldUsec(final long writeThreadMaxYieldUsec) {
+    setWriteThreadMaxYieldUsec(nativeHandle_, writeThreadMaxYieldUsec);
+    return this;
+  }
+
+  @Override
+  public long writeThreadMaxYieldUsec() {
+    return writeThreadMaxYieldUsec(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWriteThreadSlowYieldUsec(final long writeThreadSlowYieldUsec) {
+    setWriteThreadSlowYieldUsec(nativeHandle_, writeThreadSlowYieldUsec);
+    return this;
+  }
+
+  @Override
+  public long writeThreadSlowYieldUsec() {
+    return writeThreadSlowYieldUsec(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setSkipStatsUpdateOnDbOpen(final boolean skipStatsUpdateOnDbOpen) {
+    assert(isOwningHandle());
+    setSkipStatsUpdateOnDbOpen(nativeHandle_, skipStatsUpdateOnDbOpen);
+    return this;
+  }
+
+  @Override
+  public boolean skipStatsUpdateOnDbOpen() {
+    assert(isOwningHandle());
+    return skipStatsUpdateOnDbOpen(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setSkipCheckingSstFileSizesOnDbOpen(
+      final boolean skipCheckingSstFileSizesOnDbOpen) {
+    setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen);
+    return this;
+  }
+
+  @Override
+  public boolean skipCheckingSstFileSizesOnDbOpen() {
+    assert (isOwningHandle());
+    return skipCheckingSstFileSizesOnDbOpen(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) {
+    assert(isOwningHandle());
+    setWalRecoveryMode(nativeHandle_, walRecoveryMode.getValue());
+    return this;
+  }
+
+  @Override
+  public WALRecoveryMode walRecoveryMode() {
+    assert(isOwningHandle());
+    return WALRecoveryMode.getWALRecoveryMode(walRecoveryMode(nativeHandle_));
+  }
+
+  @Override
+  public DBOptions setAllow2pc(final boolean allow2pc) {
+    assert(isOwningHandle());
+    setAllow2pc(nativeHandle_, allow2pc);
+    return this;
+  }
+
+  @Override
+  public boolean allow2pc() {
+    assert(isOwningHandle());
+    return allow2pc(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setRowCache(final Cache rowCache) {
+    assert(isOwningHandle());
+    setRowCache(nativeHandle_, rowCache.nativeHandle_);
+    this.rowCache_ = rowCache;
+    return this;
+  }
+
+  @Override
+  public Cache rowCache() {
+    assert(isOwningHandle());
+    return this.rowCache_;
+  }
+
+  @Override
+  public DBOptions setWalFilter(final AbstractWalFilter walFilter) {
+    assert(isOwningHandle());
+    setWalFilter(nativeHandle_, walFilter.nativeHandle_);
+    this.walFilter_ = walFilter;
+    return this;
+  }
+
+  @Override
+  public WalFilter walFilter() {
+    assert(isOwningHandle());
+    return this.walFilter_;
+  }
+
+  @Override
+  public DBOptions setFailIfOptionsFileError(final boolean failIfOptionsFileError) {
+    assert(isOwningHandle());
+    setFailIfOptionsFileError(nativeHandle_, failIfOptionsFileError);
+    return this;
+  }
+
+  @Override
+  public boolean failIfOptionsFileError() {
+    assert(isOwningHandle());
+    return failIfOptionsFileError(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setDumpMallocStats(final boolean dumpMallocStats) {
+    assert(isOwningHandle());
+    setDumpMallocStats(nativeHandle_, dumpMallocStats);
+    return this;
+  }
+
+  @Override
+  public boolean dumpMallocStats() {
+    assert(isOwningHandle());
+    return dumpMallocStats(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAvoidFlushDuringRecovery(final boolean avoidFlushDuringRecovery) {
+    assert(isOwningHandle());
+    setAvoidFlushDuringRecovery(nativeHandle_, avoidFlushDuringRecovery);
+    return this;
+  }
+
+  @Override
+  public boolean avoidFlushDuringRecovery() {
+    assert(isOwningHandle());
+    return avoidFlushDuringRecovery(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAvoidFlushDuringShutdown(final boolean avoidFlushDuringShutdown) {
+    assert(isOwningHandle());
+    setAvoidFlushDuringShutdown(nativeHandle_, avoidFlushDuringShutdown);
+    return this;
+  }
+
+  @Override
+  public boolean avoidFlushDuringShutdown() {
+    assert(isOwningHandle());
+    return avoidFlushDuringShutdown(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAllowIngestBehind(final boolean allowIngestBehind) {
+    assert(isOwningHandle());
+    setAllowIngestBehind(nativeHandle_, allowIngestBehind);
+    return this;
+  }
+
+  @Override
+  public boolean allowIngestBehind() {
+    assert(isOwningHandle());
+    return allowIngestBehind(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setTwoWriteQueues(final boolean twoWriteQueues) {
+    assert(isOwningHandle());
+    setTwoWriteQueues(nativeHandle_, twoWriteQueues);
+    return this;
+  }
+
+  @Override
+  public boolean twoWriteQueues() {
+    assert(isOwningHandle());
+    return twoWriteQueues(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setManualWalFlush(final boolean manualWalFlush) {
+    assert(isOwningHandle());
+    setManualWalFlush(nativeHandle_, manualWalFlush);
+    return this;
+  }
+
+  @Override
+  public boolean manualWalFlush() {
+    assert(isOwningHandle());
+    return manualWalFlush(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAtomicFlush(final boolean atomicFlush) {
+    setAtomicFlush(nativeHandle_, atomicFlush);
+    return this;
+  }
+
+  @Override
+  public boolean atomicFlush() {
+    return atomicFlush(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAvoidUnnecessaryBlockingIO(final boolean avoidUnnecessaryBlockingIO) {
+    setAvoidUnnecessaryBlockingIO(nativeHandle_, avoidUnnecessaryBlockingIO);
+    return this;
+  }
+
+  @Override
+  public boolean avoidUnnecessaryBlockingIO() {
+    assert (isOwningHandle());
+    return avoidUnnecessaryBlockingIO(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setPersistStatsToDisk(final boolean persistStatsToDisk) {
+    setPersistStatsToDisk(nativeHandle_, persistStatsToDisk);
+    return this;
+  }
+
+  @Override
+  public boolean persistStatsToDisk() {
+    assert (isOwningHandle());
+    return persistStatsToDisk(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWriteDbidToManifest(final boolean writeDbidToManifest) {
+    setWriteDbidToManifest(nativeHandle_, writeDbidToManifest);
+    return this;
+  }
+
+  @Override
+  public boolean writeDbidToManifest() {
+    assert (isOwningHandle());
+    return writeDbidToManifest(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setLogReadaheadSize(final long logReadaheadSize) {
+    setLogReadaheadSize(nativeHandle_, logReadaheadSize);
+    return this;
+  }
+
+  @Override
+  public long logReadaheadSize() {
+    assert (isOwningHandle());
+    return logReadaheadSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setBestEffortsRecovery(final boolean bestEffortsRecovery) {
+    setBestEffortsRecovery(nativeHandle_, bestEffortsRecovery);
+    return this;
+  }
+
+  @Override
+  public boolean bestEffortsRecovery() {
+    assert (isOwningHandle());
+    return bestEffortsRecovery(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxBgErrorResumeCount(final int maxBgerrorResumeCount) {
+    setMaxBgErrorResumeCount(nativeHandle_, maxBgerrorResumeCount);
+    return this;
+  }
+
+  @Override
+  public int maxBgerrorResumeCount() {
+    assert (isOwningHandle());
+    return maxBgerrorResumeCount(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setBgerrorResumeRetryInterval(final long bgerrorResumeRetryInterval) {
+    setBgerrorResumeRetryInterval(nativeHandle_, bgerrorResumeRetryInterval);
+    return this;
+  }
+
+  @Override
+  public long bgerrorResumeRetryInterval() {
+    assert (isOwningHandle());
+    return bgerrorResumeRetryInterval(nativeHandle_);
+  }
+
+  static final int DEFAULT_NUM_SHARD_BITS = -1;
+
+
+
+
+  /**
+   * <p>Private constructor to be used by
+   * {@link #getDBOptionsFromProps(java.util.Properties)}</p>
+   *
+   * @param nativeHandle native handle to DBOptions instance.
+   */
+  private DBOptions(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  private static native long getDBOptionsFromProps(long cfgHandle, String optString);
+  private static native long getDBOptionsFromProps(String optString);
+
+  private static native long newDBOptions();
+  private static native long copyDBOptions(final long handle);
+  private static native long newDBOptionsFromOptions(final long optionsHandle);
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native void optimizeForSmallDb(final long handle);
+  private native void setIncreaseParallelism(long handle, int totalThreads);
+  private native void setCreateIfMissing(long handle, boolean flag);
+  private native boolean createIfMissing(long handle);
+  private native void setCreateMissingColumnFamilies(
+      long handle, boolean flag);
+  private native boolean createMissingColumnFamilies(long handle);
+  private native void setEnv(long handle, long envHandle);
+  private native void setErrorIfExists(long handle, boolean errorIfExists);
+  private native boolean errorIfExists(long handle);
+  private native void setParanoidChecks(
+      long handle, boolean paranoidChecks);
+  private native boolean paranoidChecks(long handle);
+  private native void setRateLimiter(long handle,
+      long rateLimiterHandle);
+  private native void setSstFileManager(final long handle,
+      final long sstFileManagerHandle);
+  private native void setLogger(long handle,
+      long loggerHandle);
+  private native void setInfoLogLevel(long handle, byte logLevel);
+  private native byte infoLogLevel(long handle);
+  private native void setMaxOpenFiles(long handle, int maxOpenFiles);
+  private native int maxOpenFiles(long handle);
+  private native void setMaxFileOpeningThreads(final long handle,
+      final int maxFileOpeningThreads);
+  private native int maxFileOpeningThreads(final long handle);
+  private native void setMaxTotalWalSize(long handle,
+      long maxTotalWalSize);
+  private native long maxTotalWalSize(long handle);
+  private native void setStatistics(final long handle, final long statisticsHandle);
+  private native long statistics(final long handle);
+  private native boolean useFsync(long handle);
+  private native void setUseFsync(long handle, boolean useFsync);
+  private native void setDbPaths(final long handle, final String[] paths,
+      final long[] targetSizes);
+  private native long dbPathsLen(final long handle);
+  private native void dbPaths(final long handle, final String[] paths,
+                                 final long[] targetSizes);
+  private native void setDbLogDir(long handle, String dbLogDir);
+  private native String dbLogDir(long handle);
+  private native void setWalDir(long handle, String walDir);
+  private native String walDir(long handle);
+  private native void setDeleteObsoleteFilesPeriodMicros(
+      long handle, long micros);
+  private native long deleteObsoleteFilesPeriodMicros(long handle);
+  private native void setMaxBackgroundCompactions(
+      long handle, int maxBackgroundCompactions);
+  private native int maxBackgroundCompactions(long handle);
+  private native void setMaxSubcompactions(long handle, int maxSubcompactions);
+  private native int maxSubcompactions(long handle);
+  private native void setMaxBackgroundFlushes(
+      long handle, int maxBackgroundFlushes);
+  private native int maxBackgroundFlushes(long handle);
+  private native void setMaxBackgroundJobs(long handle, int maxBackgroundJobs);
+  private native int maxBackgroundJobs(long handle);
+  private native void setMaxLogFileSize(long handle, long maxLogFileSize)
+      throws IllegalArgumentException;
+  private native long maxLogFileSize(long handle);
+  private native void setLogFileTimeToRoll(
+      long handle, long logFileTimeToRoll) throws IllegalArgumentException;
+  private native long logFileTimeToRoll(long handle);
+  private native void setKeepLogFileNum(long handle, long keepLogFileNum)
+      throws IllegalArgumentException;
+  private native long keepLogFileNum(long handle);
+  private native void setRecycleLogFileNum(long handle, long recycleLogFileNum);
+  private native long recycleLogFileNum(long handle);
+  private native void setMaxManifestFileSize(
+      long handle, long maxManifestFileSize);
+  private native long maxManifestFileSize(long handle);
+  private native void setTableCacheNumshardbits(
+      long handle, int tableCacheNumshardbits);
+  private native int tableCacheNumshardbits(long handle);
+  private native void setWalTtlSeconds(long handle, long walTtlSeconds);
+  private native long walTtlSeconds(long handle);
+  private native void setWalSizeLimitMB(long handle, long sizeLimitMB);
+  private native long walSizeLimitMB(long handle);
+  private static native void setMaxWriteBatchGroupSizeBytes(
+      final long handle, final long maxWriteBatchGroupSizeBytes);
+  private static native long maxWriteBatchGroupSizeBytes(final long handle);
+  private native void setManifestPreallocationSize(
+      long handle, long size) throws IllegalArgumentException;
+  private native long manifestPreallocationSize(long handle);
+  private native void setUseDirectReads(long handle, boolean useDirectReads);
+  private native boolean useDirectReads(long handle);
+  private native void setUseDirectIoForFlushAndCompaction(
+      long handle, boolean useDirectIoForFlushAndCompaction);
+  private native boolean useDirectIoForFlushAndCompaction(long handle);
+  private native void setAllowFAllocate(final long handle,
+      final boolean allowFAllocate);
+  private native boolean allowFAllocate(final long handle);
+  private native void setAllowMmapReads(
+      long handle, boolean allowMmapReads);
+  private native boolean allowMmapReads(long handle);
+  private native void setAllowMmapWrites(
+      long handle, boolean allowMmapWrites);
+  private native boolean allowMmapWrites(long handle);
+  private native void setIsFdCloseOnExec(
+      long handle, boolean isFdCloseOnExec);
+  private native boolean isFdCloseOnExec(long handle);
+  private native void setStatsDumpPeriodSec(
+      long handle, int statsDumpPeriodSec);
+  private native int statsDumpPeriodSec(long handle);
+  private native void setStatsPersistPeriodSec(
+      final long handle, final int statsPersistPeriodSec);
+  private native int statsPersistPeriodSec(
+      final long handle);
+  private native void setStatsHistoryBufferSize(
+      final long handle, final long statsHistoryBufferSize);
+  private native long statsHistoryBufferSize(
+      final long handle);
+  private native void setAdviseRandomOnOpen(
+      long handle, boolean adviseRandomOnOpen);
+  private native boolean adviseRandomOnOpen(long handle);
+  private native void setDbWriteBufferSize(final long handle,
+      final long dbWriteBufferSize);
+  private native void setWriteBufferManager(final long dbOptionsHandle,
+      final long writeBufferManagerHandle);
+  private native long dbWriteBufferSize(final long handle);
+  private native void setAccessHintOnCompactionStart(final long handle,
+      final byte accessHintOnCompactionStart);
+  private native byte accessHintOnCompactionStart(final long handle);
+  private native void setCompactionReadaheadSize(final long handle,
+      final long compactionReadaheadSize);
+  private native long compactionReadaheadSize(final long handle);
+  private native void setRandomAccessMaxBufferSize(final long handle,
+      final long randomAccessMaxBufferSize);
+  private native long randomAccessMaxBufferSize(final long handle);
+  private native void setWritableFileMaxBufferSize(final long handle,
+      final long writableFileMaxBufferSize);
+  private native long writableFileMaxBufferSize(final long handle);
+  private native void setUseAdaptiveMutex(
+      long handle, boolean useAdaptiveMutex);
+  private native boolean useAdaptiveMutex(long handle);
+  private native void setBytesPerSync(
+      long handle, long bytesPerSync);
+  private native long bytesPerSync(long handle);
+  private native void setWalBytesPerSync(long handle, long walBytesPerSync);
+  private native long walBytesPerSync(long handle);
+  private native void setStrictBytesPerSync(
+      final long handle, final boolean strictBytesPerSync);
+  private native boolean strictBytesPerSync(
+      final long handle);
+  private static native void setEventListeners(
+      final long handle, final long[] eventListenerHandles);
+  private static native AbstractEventListener[] eventListeners(final long handle);
+  private native void setEnableThreadTracking(long handle,
+      boolean enableThreadTracking);
+  private native boolean enableThreadTracking(long handle);
+  private native void setDelayedWriteRate(long handle, long delayedWriteRate);
+  private native long delayedWriteRate(long handle);
+  private native void setEnablePipelinedWrite(final long handle,
+      final boolean enablePipelinedWrite);
+  private native boolean enablePipelinedWrite(final long handle);
+  private native void setUnorderedWrite(final long handle,
+      final boolean unorderedWrite);
+  private native boolean unorderedWrite(final long handle);
+  private native void setAllowConcurrentMemtableWrite(long handle,
+      boolean allowConcurrentMemtableWrite);
+  private native boolean allowConcurrentMemtableWrite(long handle);
+  private native void setEnableWriteThreadAdaptiveYield(long handle,
+      boolean enableWriteThreadAdaptiveYield);
+  private native boolean enableWriteThreadAdaptiveYield(long handle);
+  private native void setWriteThreadMaxYieldUsec(long handle,
+      long writeThreadMaxYieldUsec);
+  private native long writeThreadMaxYieldUsec(long handle);
+  private native void setWriteThreadSlowYieldUsec(long handle,
+      long writeThreadSlowYieldUsec);
+  private native long writeThreadSlowYieldUsec(long handle);
+  private native void setSkipStatsUpdateOnDbOpen(final long handle,
+      final boolean skipStatsUpdateOnDbOpen);
+  private native boolean skipStatsUpdateOnDbOpen(final long handle);
+  private static native void setSkipCheckingSstFileSizesOnDbOpen(
+      final long handle, final boolean skipChecking);
+  private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle);
+  private native void setWalRecoveryMode(final long handle,
+      final byte walRecoveryMode);
+  private native byte walRecoveryMode(final long handle);
+  private native void setAllow2pc(final long handle,
+      final boolean allow2pc);
+  private native boolean allow2pc(final long handle);
+  private native void setRowCache(final long handle,
+      final long rowCacheHandle);
+  private native void setWalFilter(final long handle,
+      final long walFilterHandle);
+  private native void setFailIfOptionsFileError(final long handle,
+      final boolean failIfOptionsFileError);
+  private native boolean failIfOptionsFileError(final long handle);
+  private native void setDumpMallocStats(final long handle,
+      final boolean dumpMallocStats);
+  private native boolean dumpMallocStats(final long handle);
+  private native void setAvoidFlushDuringRecovery(final long handle,
+      final boolean avoidFlushDuringRecovery);
+  private native boolean avoidFlushDuringRecovery(final long handle);
+  private native void setAvoidFlushDuringShutdown(final long handle,
+      final boolean avoidFlushDuringShutdown);
+  private native boolean avoidFlushDuringShutdown(final long handle);
+  private native void setAllowIngestBehind(final long handle,
+      final boolean allowIngestBehind);
+  private native boolean allowIngestBehind(final long handle);
+  private native void setTwoWriteQueues(final long handle,
+      final boolean twoWriteQueues);
+  private native boolean twoWriteQueues(final long handle);
+  private native void setManualWalFlush(final long handle,
+      final boolean manualWalFlush);
+  private native boolean manualWalFlush(final long handle);
+  private native void setAtomicFlush(final long handle,
+      final boolean atomicFlush);
+  private native boolean atomicFlush(final long handle);
+  private static native void setAvoidUnnecessaryBlockingIO(
+      final long handle, final boolean avoidBlockingIO);
+  private static native boolean avoidUnnecessaryBlockingIO(final long handle);
+  private static native void setPersistStatsToDisk(
+      final long handle, final boolean persistStatsToDisk);
+  private static native boolean persistStatsToDisk(final long handle);
+  private static native void setWriteDbidToManifest(
+      final long handle, final boolean writeDbidToManifest);
+  private static native boolean writeDbidToManifest(final long handle);
+  private static native void setLogReadaheadSize(final long handle, final long logReadaheadSize);
+  private static native long logReadaheadSize(final long handle);
+  private static native void setBestEffortsRecovery(
+      final long handle, final boolean bestEffortsRecovery);
+  private static native boolean bestEffortsRecovery(final long handle);
+  private static native void setMaxBgErrorResumeCount(
+      final long handle, final int maxBgerrorRecumeCount);
+  private static native int maxBgerrorResumeCount(final long handle);
+  private static native void setBgerrorResumeRetryInterval(
+      final long handle, final long bgerrorResumeRetryInterval);
+  private static native long bgerrorResumeRetryInterval(final long handle);
+
+  // instance variables
+  // NOTE: If you add new member variables, please update the copy constructor above!
+  private Env env_;
+  private int numShardBits_;
+  private RateLimiter rateLimiter_;
+  private Cache rowCache_;
+  private WalFilter walFilter_;
+  private WriteBufferManager writeBufferManager_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java
new file mode 100644
index 000000000..ef1b86bff
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java
@@ -0,0 +1,1756 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Collection;
+import java.util.List;
+
+public interface DBOptionsInterface<T extends DBOptionsInterface<T>> {
+  /**
+   * Use this if your DB is very small (like under 1GB) and you don't want to
+   * spend lots of memory for memtables.
+   *
+   * @return the instance of the current object.
+   */
+  T optimizeForSmallDb();
+
+  /**
+   * Use the specified object to interact with the environment,
+   * e.g. to read/write files, schedule background work, etc.
+   * Default: {@link Env#getDefault()}
+   *
+   * @param env {@link Env} instance.
+   * @return the instance of the current Options.
+   */
+  T setEnv(final Env env);
+
+  /**
+   * Returns the set RocksEnv instance.
+   *
+   * @return {@link RocksEnv} instance set in the options.
+   */
+  Env getEnv();
+
+  /**
+   * <p>By default, RocksDB uses only one background thread for flush and
+   * compaction. Calling this function will set it up such that total of
+   * `total_threads` is used.</p>
+   *
+   * <p>You almost definitely want to call this function if your system is
+   * bottlenecked by RocksDB.</p>
+   *
+   * @param totalThreads The total number of threads to be used by RocksDB.
+   *     A good value is the number of cores.
+   *
+   * @return the instance of the current Options
+   */
+  T setIncreaseParallelism(int totalThreads);
+
+  /**
+   * If this value is set to true, then the database will be created
+   * if it is missing during {@code RocksDB.open()}.
+   * Default: false
+   *
+   * @param flag a flag indicating whether to create a database the
+   *     specified database in {@link RocksDB#open(org.rocksdb.Options, String)} operation
+   *     is missing.
+   * @return the instance of the current Options
+   * @see RocksDB#open(org.rocksdb.Options, String)
+   */
+  T setCreateIfMissing(boolean flag);
+
+  /**
+   * Return true if the create_if_missing flag is set to true.
+   * If true, the database will be created if it is missing.
+   *
+   * @return true if the createIfMissing option is set to true.
+   * @see #setCreateIfMissing(boolean)
+   */
+  boolean createIfMissing();
+
+  /**
+   * <p>If true, missing column families will be automatically created</p>
+   *
+   * <p>Default: false</p>
+   *
+   * @param flag a flag indicating if missing column families shall be
+   *     created automatically.
+   * @return true if missing column families shall be created automatically
+   *     on open.
+   */
+  T setCreateMissingColumnFamilies(boolean flag);
+
+  /**
+   * Return true if the create_missing_column_families flag is set
+   * to true. If true column families be created if missing.
+   *
+   * @return true if the createMissingColumnFamilies is set to
+   *     true.
+   * @see #setCreateMissingColumnFamilies(boolean)
+   */
+  boolean createMissingColumnFamilies();
+
+  /**
+   * If true, an error will be thrown during RocksDB.open() if the
+   * database already exists.
+   * Default: false
+   *
+   * @param errorIfExists if true, an exception will be thrown
+   *     during {@code RocksDB.open()} if the database already exists.
+   * @return the reference to the current option.
+   * @see RocksDB#open(org.rocksdb.Options, String)
+   */
+  T setErrorIfExists(boolean errorIfExists);
+
+  /**
+   * If true, an error will be thrown during RocksDB.open() if the
+   * database already exists.
+   *
+   * @return if true, an error is raised when the specified database
+   *    already exists before open.
+   */
+  boolean errorIfExists();
+
+  /**
+   * If true, the implementation will do aggressive checking of the
+   * data it is processing and will stop early if it detects any
+   * errors.  This may have unforeseen ramifications: for example, a
+   * corruption of one DB entry may cause a large number of entries to
+   * become unreadable or for the entire DB to become unopenable.
+   * If any of the  writes to the database fails (Put, Delete, Merge, Write),
+   * the database will switch to read-only mode and fail all other
+   * Write operations.
+   * Default: true
+   *
+   * @param paranoidChecks a flag to indicate whether paranoid-check
+   *     is on.
+   * @return the reference to the current option.
+   */
+  T setParanoidChecks(boolean paranoidChecks);
+
+  /**
+   * If true, the implementation will do aggressive checking of the
+   * data it is processing and will stop early if it detects any
+   * errors.  This may have unforeseen ramifications: for example, a
+   * corruption of one DB entry may cause a large number of entries to
+   * become unreadable or for the entire DB to become unopenable.
+   * If any of the  writes to the database fails (Put, Delete, Merge, Write),
+   * the database will switch to read-only mode and fail all other
+   * Write operations.
+   *
+   * @return a boolean indicating whether paranoid-check is on.
+   */
+  boolean paranoidChecks();
+
+  /**
+   * Use to control write rate of flush and compaction. Flush has higher
+   * priority than compaction. Rate limiting is disabled if nullptr.
+   * Default: nullptr
+   *
+   * @param rateLimiter {@link org.rocksdb.RateLimiter} instance.
+   * @return the instance of the current object.
+   *
+   * @since 3.10.0
+   */
+  T setRateLimiter(RateLimiter rateLimiter);
+
+  /**
+   * Use to track SST files and control their file deletion rate.
+   *
+   * Features:
+   *  - Throttle the deletion rate of the SST files.
+   *  - Keep track the total size of all SST files.
+   *  - Set a maximum allowed space limit for SST files that when reached
+   *    the DB wont do any further flushes or compactions and will set the
+   *    background error.
+   *  - Can be shared between multiple dbs.
+   *
+   *  Limitations:
+   *  - Only track and throttle deletes of SST files in
+   *    first db_path (db_name if db_paths is empty).
+   *
+   * @param sstFileManager The SST File Manager for the db.
+   * @return the instance of the current object.
+   */
+  T setSstFileManager(SstFileManager sstFileManager);
+
+  /**
+   * <p>Any internal progress/error information generated by
+   * the db will be written to the Logger if it is non-nullptr,
+   * or to a file stored in the same directory as the DB
+   * contents if info_log is nullptr.</p>
+   *
+   * <p>Default: nullptr</p>
+   *
+   * @param logger {@link Logger} instance.
+   * @return the instance of the current object.
+   */
+  T setLogger(Logger logger);
+
+  /**
+   * <p>Sets the RocksDB log level. Default level is INFO</p>
+   *
+   * @param infoLogLevel log level to set.
+   * @return the instance of the current object.
+   */
+  T setInfoLogLevel(InfoLogLevel infoLogLevel);
+
+  /**
+   * <p>Returns currently set log level.</p>
+   * @return {@link org.rocksdb.InfoLogLevel} instance.
+   */
+  InfoLogLevel infoLogLevel();
+
+  /**
+   * If {@link MutableDBOptionsInterface#maxOpenFiles()} is -1, DB will open
+   * all files on DB::Open(). You can use this option to increase the number
+   * of threads used to open the files.
+   *
+   * Default: 16
+   *
+   * @param maxFileOpeningThreads the maximum number of threads to use to
+   *     open files
+   *
+   * @return the reference to the current options.
+   */
+  T setMaxFileOpeningThreads(int maxFileOpeningThreads);
+
+  /**
+   * If {@link MutableDBOptionsInterface#maxOpenFiles()} is -1, DB will open all
+   * files on DB::Open(). You can use this option to increase the number of
+   * threads used to open the files.
+   *
+   * Default: 16
+   *
+   * @return the maximum number of threads to use to open files
+   */
+  int maxFileOpeningThreads();
+
+  /**
+   * <p>Sets the statistics object which collects metrics about database operations.
+   * Statistics objects should not be shared between DB instances as
+   * it does not use any locks to prevent concurrent updates.</p>
+   *
+   * @param statistics The statistics to set
+   *
+   * @return the instance of the current object.
+   *
+   * @see RocksDB#open(org.rocksdb.Options, String)
+   */
+  T setStatistics(final Statistics statistics);
+
+  /**
+   * <p>Returns statistics object.</p>
+   *
+   * @return the instance of the statistics object or null if there is no
+   * statistics object.
+   *
+   * @see #setStatistics(Statistics)
+   */
+  Statistics statistics();
+
+  /**
+   * <p>If true, then every store to stable storage will issue a fsync.</p>
+   * <p>If false, then every store to stable storage will issue a fdatasync.
+   * This parameter should be set to true while storing data to
+   * filesystem like ext3 that can lose files after a reboot.</p>
+   * <p>Default: false</p>
+   *
+   * @param useFsync a boolean flag to specify whether to use fsync
+   * @return the instance of the current object.
+   */
+  T setUseFsync(boolean useFsync);
+
+  /**
+   * <p>If true, then every store to stable storage will issue a fsync.</p>
+   * <p>If false, then every store to stable storage will issue a fdatasync.
+   * This parameter should be set to true while storing data to
+   * filesystem like ext3 that can lose files after a reboot.</p>
+   *
+   * @return boolean value indicating if fsync is used.
+   */
+  boolean useFsync();
+
+  /**
+   * A list of paths where SST files can be put into, with its target size.
+   * Newer data is placed into paths specified earlier in the vector while
+   * older data gradually moves to paths specified later in the vector.
+   *
+   * For example, you have a flash device with 10GB allocated for the DB,
+   * as well as a hard drive of 2TB, you should config it to be:
+   *    [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
+   *
+   * The system will try to guarantee data under each path is close to but
+   * not larger than the target size. But current and future file sizes used
+   * by determining where to place a file are based on best-effort estimation,
+   * which means there is a chance that the actual size under the directory
+   * is slightly more than target size under some workloads. User should give
+   * some buffer room for those cases.
+   *
+   * If none of the paths has sufficient room to place a file, the file will
+   * be placed to the last path anyway, despite to the target size.
+   *
+   * Placing newer data to earlier paths is also best-efforts. User should
+   * expect user files to be placed in higher levels in some extreme cases.
+   *
+   * If left empty, only one path will be used, which is db_name passed when
+   * opening the DB.
+   *
+   * Default: empty
+   *
+   * @param dbPaths the paths and target sizes
+   *
+   * @return the reference to the current options
+   */
+  T setDbPaths(final Collection<DbPath> dbPaths);
+
+  /**
+   * A list of paths where SST files can be put into, with its target size.
+   * Newer data is placed into paths specified earlier in the vector while
+   * older data gradually moves to paths specified later in the vector.
+   *
+   * For example, you have a flash device with 10GB allocated for the DB,
+   * as well as a hard drive of 2TB, you should config it to be:
+   *    [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
+   *
+   * The system will try to guarantee data under each path is close to but
+   * not larger than the target size. But current and future file sizes used
+   * by determining where to place a file are based on best-effort estimation,
+   * which means there is a chance that the actual size under the directory
+   * is slightly more than target size under some workloads. User should give
+   * some buffer room for those cases.
+   *
+   * If none of the paths has sufficient room to place a file, the file will
+   * be placed to the last path anyway, despite to the target size.
+   *
+   * Placing newer data to earlier paths is also best-efforts. User should
+   * expect user files to be placed in higher levels in some extreme cases.
+   *
+   * If left empty, only one path will be used, which is db_name passed when
+   * opening the DB.
+   *
+   * Default: {@link java.util.Collections#emptyList()}
+   *
+   * @return dbPaths the paths and target sizes
+   */
+  List<DbPath> dbPaths();
+
+  /**
+   * This specifies the info LOG dir.
+   * If it is empty, the log files will be in the same dir as data.
+   * If it is non empty, the log files will be in the specified dir,
+   * and the db data dir's absolute path will be used as the log file
+   * name's prefix.
+   *
+   * @param dbLogDir the path to the info log directory
+   * @return the instance of the current object.
+   */
+  T setDbLogDir(String dbLogDir);
+
+  /**
+   * Returns the directory of info log.
+   *
+   * If it is empty, the log files will be in the same dir as data.
+   * If it is non empty, the log files will be in the specified dir,
+   * and the db data dir's absolute path will be used as the log file
+   * name's prefix.
+   *
+   * @return the path to the info log directory
+   */
+  String dbLogDir();
+
+  /**
+   * This specifies the absolute dir path for write-ahead logs (WAL).
+   * If it is empty, the log files will be in the same dir as data,
+   *   dbname is used as the data dir by default
+   * If it is non empty, the log files will be in kept the specified dir.
+   * When destroying the db,
+   *   all log files in wal_dir and the dir itself is deleted
+   *
+   * @param walDir the path to the write-ahead-log directory.
+   * @return the instance of the current object.
+   */
+  T setWalDir(String walDir);
+
+  /**
+   * Returns the path to the write-ahead-logs (WAL) directory.
+   *
+   * If it is empty, the log files will be in the same dir as data,
+   *   dbname is used as the data dir by default
+   * If it is non empty, the log files will be in kept the specified dir.
+   * When destroying the db,
+   *   all log files in wal_dir and the dir itself is deleted
+   *
+   * @return the path to the write-ahead-logs (WAL) directory.
+   */
+  String walDir();
+
+  /**
+   * The periodicity when obsolete files get deleted. The default
+   * value is 6 hours. The files that get out of scope by compaction
+   * process will still get automatically delete on every compaction,
+   * regardless of this setting
+   *
+   * @param micros the time interval in micros
+   * @return the instance of the current object.
+   */
+  T setDeleteObsoleteFilesPeriodMicros(long micros);
+
+  /**
+   * The periodicity when obsolete files get deleted. The default
+   * value is 6 hours. The files that get out of scope by compaction
+   * process will still get automatically delete on every compaction,
+   * regardless of this setting
+   *
+   * @return the time interval in micros when obsolete files will be deleted.
+   */
+  long deleteObsoleteFilesPeriodMicros();
+
+  /**
+   * This value represents the maximum number of threads that will
+   * concurrently perform a compaction job by breaking it into multiple,
+   * smaller ones that are run simultaneously.
+   * Default: 1 (i.e. no subcompactions)
+   *
+   * @param maxSubcompactions The maximum number of threads that will
+   *     concurrently perform a compaction job
+   *
+   * @return the instance of the current object.
+   */
+  T setMaxSubcompactions(int maxSubcompactions);
+
+  /**
+   * This value represents the maximum number of threads that will
+   * concurrently perform a compaction job by breaking it into multiple,
+   * smaller ones that are run simultaneously.
+   * Default: 1 (i.e. no subcompactions)
+   *
+   * @return The maximum number of threads that will concurrently perform a
+   *     compaction job
+   */
+  int maxSubcompactions();
+
+  /**
+   * NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+   * value of max_background_jobs. For backwards compatibility we will set
+   * `max_background_jobs = max_background_compactions + max_background_flushes`
+   * in the case where user sets at least one of `max_background_compactions` or
+   * `max_background_flushes`.
+   *
+   * Specifies the maximum number of concurrent background flush jobs.
+   * If you're increasing this, also consider increasing number of threads in
+   * HIGH priority thread pool. For more information, see
+   * Default: -1
+   *
+   * @param maxBackgroundFlushes number of max concurrent flush jobs
+   * @return the instance of the current object.
+   *
+   * @see RocksEnv#setBackgroundThreads(int)
+   * @see RocksEnv#setBackgroundThreads(int, Priority)
+   * @see MutableDBOptionsInterface#maxBackgroundCompactions()
+   *
+   * @deprecated Use {@link MutableDBOptionsInterface#setMaxBackgroundJobs(int)}
+   */
+  @Deprecated
+  T setMaxBackgroundFlushes(int maxBackgroundFlushes);
+
+  /**
+   * NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+   * value of max_background_jobs. For backwards compatibility we will set
+   * `max_background_jobs = max_background_compactions + max_background_flushes`
+   * in the case where user sets at least one of `max_background_compactions` or
+   * `max_background_flushes`.
+   *
+   * Returns the maximum number of concurrent background flush jobs.
+   * If you're increasing this, also consider increasing number of threads in
+   * HIGH priority thread pool. For more information, see
+   * Default: -1
+   *
+   * @return the maximum number of concurrent background flush jobs.
+   * @see RocksEnv#setBackgroundThreads(int)
+   * @see RocksEnv#setBackgroundThreads(int, Priority)
+   */
+  @Deprecated
+  int maxBackgroundFlushes();
+
+  /**
+   * Specifies the maximum size of a info log file. If the current log file
+   * is larger than `max_log_file_size`, a new info log file will
+   * be created.
+   * If 0, all logs will be written to one log file.
+   *
+   * @param maxLogFileSize the maximum size of a info log file.
+   * @return the instance of the current object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  T setMaxLogFileSize(long maxLogFileSize);
+
+  /**
+   * Returns the maximum size of a info log file. If the current log file
+   * is larger than this size, a new info log file will be created.
+   * If 0, all logs will be written to one log file.
+   *
+   * @return the maximum size of the info log file.
+   */
+  long maxLogFileSize();
+
+  /**
+   * Specifies the time interval for the info log file to roll (in seconds).
+   * If specified with non-zero value, log file will be rolled
+   * if it has been active longer than `log_file_time_to_roll`.
+   * Default: 0 (disabled)
+   *
+   * @param logFileTimeToRoll the time interval in seconds.
+   * @return the instance of the current object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  T setLogFileTimeToRoll(long logFileTimeToRoll);
+
+  /**
+   * Returns the time interval for the info log file to roll (in seconds).
+   * If specified with non-zero value, log file will be rolled
+   * if it has been active longer than `log_file_time_to_roll`.
+   * Default: 0 (disabled)
+   *
+   * @return the time interval in seconds.
+   */
+  long logFileTimeToRoll();
+
+  /**
+   * Specifies the maximum number of info log files to be kept.
+   * Default: 1000
+   *
+   * @param keepLogFileNum the maximum number of info log files to be kept.
+   * @return the instance of the current object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  T setKeepLogFileNum(long keepLogFileNum);
+
+  /**
+   * Returns the maximum number of info log files to be kept.
+   * Default: 1000
+   *
+   * @return the maximum number of info log files to be kept.
+   */
+  long keepLogFileNum();
+
+  /**
+   * Recycle log files.
+   *
+   * If non-zero, we will reuse previously written log files for new
+   * logs, overwriting the old data.  The value indicates how many
+   * such files we will keep around at any point in time for later
+   * use.
+   *
+   * This is more efficient because the blocks are already
+   * allocated and fdatasync does not need to update the inode after
+   * each write.
+   *
+   * Default: 0
+   *
+   * @param recycleLogFileNum the number of log files to keep for recycling
+   *
+   * @return the reference to the current options
+   */
+  T setRecycleLogFileNum(long recycleLogFileNum);
+
+  /**
+   * Recycle log files.
+   *
+   * If non-zero, we will reuse previously written log files for new
+   * logs, overwriting the old data.  The value indicates how many
+   * such files we will keep around at any point in time for later
+   * use.
+   *
+   * This is more efficient because the blocks are already
+   * allocated and fdatasync does not need to update the inode after
+   * each write.
+   *
+   * Default: 0
+   *
+   * @return the number of log files kept for recycling
+   */
+  long recycleLogFileNum();
+
+  /**
+   * Manifest file is rolled over on reaching this limit.
+   * The older manifest file be deleted.
+   * The default value is 1GB so that the manifest file can grow, but not
+   * reach the limit of storage capacity.
+   *
+   * @param maxManifestFileSize the size limit of a manifest file.
+   * @return the instance of the current object.
+   */
+  T setMaxManifestFileSize(long maxManifestFileSize);
+
+  /**
+   * Manifest file is rolled over on reaching this limit.
+   * The older manifest file be deleted.
+   * The default value is 1GB so that the manifest file can grow, but not
+   * reach the limit of storage capacity.
+   *
+   * @return the size limit of a manifest file.
+   */
+  long maxManifestFileSize();
+
+  /**
+   * Number of shards used for table cache.
+   *
+   * @param tableCacheNumshardbits the number of chards
+   * @return the instance of the current object.
+   */
+  T setTableCacheNumshardbits(int tableCacheNumshardbits);
+
+  /**
+   * Number of shards used for table cache.
+   *
+   * @return the number of shards used for table cache.
+   */
+  int tableCacheNumshardbits();
+
+  /**
+   * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs
+   * will be deleted.
+   * <ol>
+   * <li>If both set to 0, logs will be deleted asap and will not get into
+   * the archive.</li>
+   * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    then WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.</li>
+   * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.</li>
+   * <li>If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.</li>
+   * </ol>
+   *
+   * @param walTtlSeconds the ttl seconds
+   * @return the instance of the current object.
+   * @see #setWalSizeLimitMB(long)
+   */
+  T setWalTtlSeconds(long walTtlSeconds);
+
+  /**
+   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
+   * will be deleted.
+   * <ol>
+   * <li>If both set to 0, logs will be deleted asap and will not get into
+   * the archive.</li>
+   * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   * WAL files will be checked every 10 min and if total size is greater
+   * then WAL_size_limit_MB, they will be deleted starting with the
+   * earliest until size_limit is met. All empty files will be deleted.</li>
+   * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   * WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   * are older than WAL_ttl_seconds will be deleted.</li>
+   * <li>If both are not 0, WAL files will be checked every 10 min and both
+   * checks will be performed with ttl being first.</li>
+   * </ol>
+   *
+   * @return the wal-ttl seconds
+   * @see #walSizeLimitMB()
+   */
+  long walTtlSeconds();
+
+  /**
+   * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
+   * will be deleted.
+   * <ol>
+   * <li>If both set to 0, logs will be deleted asap and will not get into
+   *    the archive.</li>
+   * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    then WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.</li>
+   * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_secondsi / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.</li>
+   * <li>If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.</li>
+   * </ol>
+   *
+   * @param sizeLimitMB size limit in mega-bytes.
+   * @return the instance of the current object.
+   * @see #setWalSizeLimitMB(long)
+   */
+  T setWalSizeLimitMB(long sizeLimitMB);
+
+  /**
+   * {@link #walTtlSeconds()} and {@code #walSizeLimitMB()} affect how archived logs
+   * will be deleted.
+   * <ol>
+   * <li>If both set to 0, logs will be deleted asap and will not get into
+   *    the archive.</li>
+   * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    then WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.</li>
+   * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_seconds i / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.</li>
+   * <li>If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.</li>
+   * </ol>
+   * @return size limit in mega-bytes.
+   * @see #walSizeLimitMB()
+   */
+  long walSizeLimitMB();
+
+  /**
+   * The maximum limit of number of bytes that are written in a single batch
+   * of WAL or memtable write. It is followed when the leader write size
+   * is larger than 1/8 of this limit.
+   *
+   * Default: 1 MB
+   *
+   * @param maxWriteBatchGroupSizeBytes the maximum limit of number of bytes, see description.
+   * @return the instance of the current object.
+   */
+  T setMaxWriteBatchGroupSizeBytes(final long maxWriteBatchGroupSizeBytes);
+
+  /**
+   * The maximum limit of number of bytes that are written in a single batch
+   * of WAL or memtable write. It is followed when the leader write size
+   * is larger than 1/8 of this limit.
+   *
+   * Default: 1 MB
+   *
+   * @return the maximum limit of number of bytes, see description.
+   */
+  long maxWriteBatchGroupSizeBytes();
+
+  /**
+   * Number of bytes to preallocate (via fallocate) the manifest
+   * files.  Default is 4mb, which is reasonable to reduce random IO
+   * as well as prevent overallocation for mounts that preallocate
+   * large amounts of data (such as xfs's allocsize option).
+   *
+   * @param size the size in byte
+   * @return the instance of the current object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  T setManifestPreallocationSize(long size);
+
+  /**
+   * Number of bytes to preallocate (via fallocate) the manifest
+   * files.  Default is 4mb, which is reasonable to reduce random IO
+   * as well as prevent overallocation for mounts that preallocate
+   * large amounts of data (such as xfs's allocsize option).
+   *
+   * @return size in bytes.
+   */
+  long manifestPreallocationSize();
+
+  /**
+   * Enable the OS to use direct I/O for reading sst tables.
+   * Default: false
+   *
+   * @param useDirectReads if true, then direct read is enabled
+   * @return the instance of the current object.
+   */
+  T setUseDirectReads(boolean useDirectReads);
+
+  /**
+   * Enable the OS to use direct I/O for reading sst tables.
+   * Default: false
+   *
+   * @return if true, then direct reads are enabled
+   */
+  boolean useDirectReads();
+
+  /**
+   * Enable the OS to use direct reads and writes in flush and
+   * compaction
+   * Default: false
+   *
+   * @param useDirectIoForFlushAndCompaction if true, then direct
+   *        I/O will be enabled for background flush and compactions
+   * @return the instance of the current object.
+   */
+  T setUseDirectIoForFlushAndCompaction(boolean useDirectIoForFlushAndCompaction);
+
+  /**
+   * Enable the OS to use direct reads and writes in flush and
+   * compaction
+   *
+   * @return if true, then direct I/O is enabled for flush and
+   *         compaction
+   */
+  boolean useDirectIoForFlushAndCompaction();
+
+  /**
+   * Whether fallocate calls are allowed
+   *
+   * @param allowFAllocate false if fallocate() calls are bypassed
+   *
+   * @return the reference to the current options.
+   */
+  T setAllowFAllocate(boolean allowFAllocate);
+
+  /**
+   * Whether fallocate calls are allowed
+   *
+   * @return false if fallocate() calls are bypassed
+   */
+  boolean allowFAllocate();
+
+  /**
+   * Allow the OS to mmap file for reading sst tables.
+   * Default: false
+   *
+   * @param allowMmapReads true if mmap reads are allowed.
+   * @return the instance of the current object.
+   */
+  T setAllowMmapReads(boolean allowMmapReads);
+
+  /**
+   * Allow the OS to mmap file for reading sst tables.
+   * Default: false
+   *
+   * @return true if mmap reads are allowed.
+   */
+  boolean allowMmapReads();
+
+  /**
+   * Allow the OS to mmap file for writing. Default: false
+   *
+   * @param allowMmapWrites true if mmap writes are allowd.
+   * @return the instance of the current object.
+   */
+  T setAllowMmapWrites(boolean allowMmapWrites);
+
+  /**
+   * Allow the OS to mmap file for writing. Default: false
+   *
+   * @return true if mmap writes are allowed.
+   */
+  boolean allowMmapWrites();
+
+  /**
+   * Disable child process inherit open files. Default: true
+   *
+   * @param isFdCloseOnExec true if child process inheriting open
+   *     files is disabled.
+   * @return the instance of the current object.
+   */
+  T setIsFdCloseOnExec(boolean isFdCloseOnExec);
+
+  /**
+   * Disable child process inherit open files. Default: true
+   *
+   * @return true if child process inheriting open files is disabled.
+   */
+  boolean isFdCloseOnExec();
+
+  /**
+   * If set true, will hint the underlying file system that the file
+   * access pattern is random, when a sst file is opened.
+   * Default: true
+   *
+   * @param adviseRandomOnOpen true if hinting random access is on.
+   * @return the instance of the current object.
+   */
+  T setAdviseRandomOnOpen(boolean adviseRandomOnOpen);
+
+  /**
+   * If set true, will hint the underlying file system that the file
+   * access pattern is random, when a sst file is opened.
+   * Default: true
+   *
+   * @return true if hinting random access is on.
+   */
+  boolean adviseRandomOnOpen();
+
+  /**
+   * Amount of data to build up in memtables across all column
+   * families before writing to disk.
+   *
+   * This is distinct from {@link ColumnFamilyOptions#writeBufferSize()},
+   * which enforces a limit for a single memtable.
+   *
+   * This feature is disabled by default. Specify a non-zero value
+   * to enable it.
+   *
+   * Default: 0 (disabled)
+   *
+   * @param dbWriteBufferSize the size of the write buffer
+   *
+   * @return the reference to the current options.
+   */
+  T setDbWriteBufferSize(long dbWriteBufferSize);
+
+  /**
+   * Use passed {@link WriteBufferManager} to control memory usage across
+   * multiple column families and/or DB instances.
+   *
+   * Check <a href="https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager">
+   *     https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager</a>
+   * for more details on when to use it
+   *
+   * @param writeBufferManager The WriteBufferManager to use
+   * @return the reference of the current options.
+   */
+  T setWriteBufferManager(final WriteBufferManager writeBufferManager);
+
+  /**
+   * Reference to {@link WriteBufferManager} used by it. <br>
+   *
+   * Default: null (Disabled)
+   *
+   * @return a reference to WriteBufferManager
+   */
+  WriteBufferManager writeBufferManager();
+
+  /**
+   * Amount of data to build up in memtables across all column
+   * families before writing to disk.
+   *
+   * This is distinct from {@link ColumnFamilyOptions#writeBufferSize()},
+   * which enforces a limit for a single memtable.
+   *
+   * This feature is disabled by default. Specify a non-zero value
+   * to enable it.
+   *
+   * Default: 0 (disabled)
+   *
+   * @return the size of the write buffer
+   */
+  long dbWriteBufferSize();
+
+  /**
+   * Specify the file access pattern once a compaction is started.
+   * It will be applied to all input files of a compaction.
+   *
+   * Default: {@link AccessHint#NORMAL}
+   *
+   * @param accessHint The access hint
+   *
+   * @return the reference to the current options.
+   */
+  T setAccessHintOnCompactionStart(final AccessHint accessHint);
+
+  /**
+   * Specify the file access pattern once a compaction is started.
+   * It will be applied to all input files of a compaction.
+   *
+   * Default: {@link AccessHint#NORMAL}
+   *
+   * @return The access hint
+   */
+  AccessHint accessHintOnCompactionStart();
+
+  /**
+   * This is a maximum buffer size that is used by WinMmapReadableFile in
+   * unbuffered disk I/O mode. We need to maintain an aligned buffer for
+   * reads. We allow the buffer to grow until the specified value and then
+   * for bigger requests allocate one shot buffers. In unbuffered mode we
+   * always bypass read-ahead buffer at ReadaheadRandomAccessFile
+   * When read-ahead is required we then make use of
+   * {@link MutableDBOptionsInterface#compactionReadaheadSize()} value and
+   * always try to read ahead.
+   * With read-ahead we always pre-allocate buffer to the size instead of
+   * growing it up to a limit.
+   *
+   * This option is currently honored only on Windows
+   *
+   * Default: 1 Mb
+   *
+   * Special value: 0 - means do not maintain per instance buffer. Allocate
+   *                per request buffer and avoid locking.
+   *
+   * @param randomAccessMaxBufferSize the maximum size of the random access
+   *     buffer
+   *
+   * @return the reference to the current options.
+   */
+  T setRandomAccessMaxBufferSize(long randomAccessMaxBufferSize);
+
+  /**
+   * This is a maximum buffer size that is used by WinMmapReadableFile in
+   * unbuffered disk I/O mode. We need to maintain an aligned buffer for
+   * reads. We allow the buffer to grow until the specified value and then
+   * for bigger requests allocate one shot buffers. In unbuffered mode we
+   * always bypass read-ahead buffer at ReadaheadRandomAccessFile
+   * When read-ahead is required we then make use of
+   * {@link MutableDBOptionsInterface#compactionReadaheadSize()} value and
+   * always try to read ahead. With read-ahead we always pre-allocate buffer
+   * to the size instead of growing it up to a limit.
+   *
+   * This option is currently honored only on Windows
+   *
+   * Default: 1 Mb
+   *
+   * Special value: 0 - means do not maintain per instance buffer. Allocate
+   *                per request buffer and avoid locking.
+   *
+   * @return the maximum size of the random access buffer
+   */
+  long randomAccessMaxBufferSize();
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to kernel. This could reduce context switch when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   * Default: false
+   *
+   * @param useAdaptiveMutex true if adaptive mutex is used.
+   * @return the instance of the current object.
+   */
+  T setUseAdaptiveMutex(boolean useAdaptiveMutex);
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to kernel. This could reduce context switch when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   * Default: false
+   *
+   * @return true if adaptive mutex is used.
+   */
+  boolean useAdaptiveMutex();
+
+  /**
+   * Sets the {@link EventListener}s whose callback functions
+   * will be called when specific RocksDB event happens.
+   *
+   * Note: the RocksJava API currently only supports EventListeners implemented in Java.
+   * It could be extended in future to also support adding/removing EventListeners implemented in
+   * C++.
+   *
+   * @param listeners the listeners who should be notified on various events.
+   *
+   * @return the instance of the current object.
+   */
+  T setListeners(final List<AbstractEventListener> listeners);
+
+  /**
+   * Sets the {@link EventListener}s whose callback functions
+   * will be called when specific RocksDB event happens.
+   *
+   * Note: the RocksJava API currently only supports EventListeners implemented in Java.
+   * It could be extended in future to also support adding/removing EventListeners implemented in
+   * C++.
+   *
+   * @return the instance of the current object.
+   */
+  List<AbstractEventListener> listeners();
+
+  /**
+   * If true, then the status of the threads involved in this DB will
+   * be tracked and available via GetThreadList() API.
+   *
+   * Default: false
+   *
+   * @param enableThreadTracking true to enable tracking
+   *
+   * @return the reference to the current options.
+   */
+  T setEnableThreadTracking(boolean enableThreadTracking);
+
+  /**
+   * If true, then the status of the threads involved in this DB will
+   * be tracked and available via GetThreadList() API.
+   *
+   * Default: false
+   *
+   * @return true if tracking is enabled
+   */
+  boolean enableThreadTracking();
+
+  /**
+   * By default, a single write thread queue is maintained. The thread gets
+   * to the head of the queue becomes write batch group leader and responsible
+   * for writing to WAL and memtable for the batch group.
+   *
+   * If {@link #enablePipelinedWrite()} is true, separate write thread queue is
+   * maintained for WAL write and memtable write. A write thread first enter WAL
+   * writer queue and then memtable writer queue. Pending thread on the WAL
+   * writer queue thus only have to wait for previous writers to finish their
+   * WAL writing but not the memtable writing. Enabling the feature may improve
+   * write throughput and reduce latency of the prepare phase of two-phase
+   * commit.
+   *
+   * Default: false
+   *
+   * @param enablePipelinedWrite true to enabled pipelined writes
+   *
+   * @return the reference to the current options.
+   */
+  T setEnablePipelinedWrite(final boolean enablePipelinedWrite);
+
+  /**
+   * Returns true if pipelined writes are enabled.
+   * See {@link #setEnablePipelinedWrite(boolean)}.
+   *
+   * @return true if pipelined writes are enabled, false otherwise.
+   */
+  boolean enablePipelinedWrite();
+
+  /**
+   * Setting {@link #unorderedWrite()} to true trades higher write throughput with
+   * relaxing the immutability guarantee of snapshots. This violates the
+   * repeatability one expects from ::Get from a snapshot, as well as
+   * ::MultiGet and Iterator's consistent-point-in-time view property.
+   * If the application cannot tolerate the relaxed guarantees, it can implement
+   * its own mechanisms to work around that and yet benefit from the higher
+   * throughput. Using TransactionDB with WRITE_PREPARED write policy and
+   * {@link #twoWriteQueues()} true is one way to achieve immutable snapshots despite
+   * unordered_write.
+   *
+   * By default, i.e., when it is false, rocksdb does not advance the sequence
+   * number for new snapshots unless all the writes with lower sequence numbers
+   * are already finished. This provides the immutability that we except from
+   * snapshots. Moreover, since Iterator and MultiGet internally depend on
+   * snapshots, the snapshot immutability results into Iterator and MultiGet
+   * offering consistent-point-in-time view. If set to true, although
+   * Read-Your-Own-Write property is still provided, the snapshot immutability
+   * property is relaxed: the writes issued after the snapshot is obtained (with
+   * larger sequence numbers) will be still not visible to the reads from that
+   * snapshot, however, there still might be pending writes (with lower sequence
+   * number) that will change the state visible to the snapshot after they are
+   * landed to the memtable.
+   *
+   * @param unorderedWrite true to enabled unordered write
+   *
+   * @return the reference to the current options.
+   */
+  T setUnorderedWrite(final boolean unorderedWrite);
+
+  /**
+   * Returns true if unordered write are enabled.
+   * See {@link #setUnorderedWrite(boolean)}.
+   *
+   * @return true if unordered write are enabled, false otherwise.
+   */
+  boolean unorderedWrite();
+
+  /**
+   * If true, allow multi-writers to update mem tables in parallel.
+   * Only some memtable factorys support concurrent writes; currently it
+   * is implemented only for SkipListFactory.  Concurrent memtable writes
+   * are not compatible with inplace_update_support or filter_deletes.
+   * It is strongly recommended to set
+   * {@link #setEnableWriteThreadAdaptiveYield(boolean)} if you are going to use
+   * this feature.
+   * Default: true
+   *
+   * @param allowConcurrentMemtableWrite true to enable concurrent writes
+   *     for the memtable
+   *
+   * @return the reference to the current options.
+   */
+  T setAllowConcurrentMemtableWrite(boolean allowConcurrentMemtableWrite);
+
+  /**
+   * If true, allow multi-writers to update mem tables in parallel.
+   * Only some memtable factorys support concurrent writes; currently it
+   * is implemented only for SkipListFactory.  Concurrent memtable writes
+   * are not compatible with inplace_update_support or filter_deletes.
+   * It is strongly recommended to set
+   * {@link #setEnableWriteThreadAdaptiveYield(boolean)} if you are going to use
+   * this feature.
+   * Default: true
+   *
+   * @return true if concurrent writes are enabled for the memtable
+   */
+  boolean allowConcurrentMemtableWrite();
+
+  /**
+   * If true, threads synchronizing with the write batch group leader will
+   * wait for up to {@link #writeThreadMaxYieldUsec()} before blocking on a
+   * mutex. This can substantially improve throughput for concurrent workloads,
+   * regardless of whether {@link #allowConcurrentMemtableWrite()} is enabled.
+   * Default: true
+   *
+   * @param enableWriteThreadAdaptiveYield true to enable adaptive yield for the
+   *     write threads
+   *
+   * @return the reference to the current options.
+   */
+  T setEnableWriteThreadAdaptiveYield(
+      boolean enableWriteThreadAdaptiveYield);
+
+  /**
+   * If true, threads synchronizing with the write batch group leader will
+   * wait for up to {@link #writeThreadMaxYieldUsec()} before blocking on a
+   * mutex. This can substantially improve throughput for concurrent workloads,
+   * regardless of whether {@link #allowConcurrentMemtableWrite()} is enabled.
+   * Default: true
+   *
+   * @return true if adaptive yield is enabled
+   *    for the writing threads
+   */
+  boolean enableWriteThreadAdaptiveYield();
+
+  /**
+   * The maximum number of microseconds that a write operation will use
+   * a yielding spin loop to coordinate with other write threads before
+   * blocking on a mutex.  (Assuming {@link #writeThreadSlowYieldUsec()} is
+   * set properly) increasing this value is likely to increase RocksDB
+   * throughput at the expense of increased CPU usage.
+   * Default: 100
+   *
+   * @param writeThreadMaxYieldUsec maximum number of microseconds
+   *
+   * @return the reference to the current options.
+   */
+  T setWriteThreadMaxYieldUsec(long writeThreadMaxYieldUsec);
+
+  /**
+   * The maximum number of microseconds that a write operation will use
+   * a yielding spin loop to coordinate with other write threads before
+   * blocking on a mutex.  (Assuming {@link #writeThreadSlowYieldUsec()} is
+   * set properly) increasing this value is likely to increase RocksDB
+   * throughput at the expense of increased CPU usage.
+   * Default: 100
+   *
+   * @return the maximum number of microseconds
+   */
+  long writeThreadMaxYieldUsec();
+
+  /**
+   * The latency in microseconds after which a std::this_thread::yield
+   * call (sched_yield on Linux) is considered to be a signal that
+   * other processes or threads would like to use the current core.
+   * Increasing this makes writer threads more likely to take CPU
+   * by spinning, which will show up as an increase in the number of
+   * involuntary context switches.
+   * Default: 3
+   *
+   * @param writeThreadSlowYieldUsec the latency in microseconds
+   *
+   * @return the reference to the current options.
+   */
+  T setWriteThreadSlowYieldUsec(long writeThreadSlowYieldUsec);
+
+  /**
+   * The latency in microseconds after which a std::this_thread::yield
+   * call (sched_yield on Linux) is considered to be a signal that
+   * other processes or threads would like to use the current core.
+   * Increasing this makes writer threads more likely to take CPU
+   * by spinning, which will show up as an increase in the number of
+   * involuntary context switches.
+   * Default: 3
+   *
+   * @return writeThreadSlowYieldUsec the latency in microseconds
+   */
+  long writeThreadSlowYieldUsec();
+
+  /**
+   * If true, then DB::Open() will not update the statistics used to optimize
+   * compaction decision by loading table properties from many files.
+   * Turning off this feature will improve DBOpen time especially in
+   * disk environment.
+   *
+   * Default: false
+   *
+   * @param skipStatsUpdateOnDbOpen true if updating stats will be skipped
+   *
+   * @return the reference to the current options.
+   */
+  T setSkipStatsUpdateOnDbOpen(boolean skipStatsUpdateOnDbOpen);
+
+  /**
+   * If true, then DB::Open() will not update the statistics used to optimize
+   * compaction decision by loading table properties from many files.
+   * Turning off this feature will improve DBOpen time especially in
+   * disk environment.
+   *
+   * Default: false
+   *
+   * @return true if updating stats will be skipped
+   */
+  boolean skipStatsUpdateOnDbOpen();
+
+  /**
+   * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files.
+   * This may significantly speed up startup if there are many sst files,
+   * especially when using non-default Env with expensive GetFileSize().
+   * We'll still check that all required sst files exist.
+   * If {@code paranoid_checks} is false, this option is ignored, and sst files are
+   * not checked at all.
+   *
+   * Default: false
+   *
+   * @param skipCheckingSstFileSizesOnDbOpen if true, then SST file sizes will not be checked
+   *                                         when calling {@link RocksDB#open(String)}.
+   * @return the reference to the current options.
+   */
+  T setSkipCheckingSstFileSizesOnDbOpen(final boolean skipCheckingSstFileSizesOnDbOpen);
+
+  /**
+   * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files.
+   * This may significantly speed up startup if there are many sst files,
+   * especially when using non-default Env with expensive GetFileSize().
+   * We'll still check that all required sst files exist.
+   * If {@code paranoid_checks} is false, this option is ignored, and sst files are
+   * not checked at all.
+   *
+   * Default: false
+   *
+   * @return true, if file sizes will not be checked when calling {@link RocksDB#open(String)}.
+   */
+  boolean skipCheckingSstFileSizesOnDbOpen();
+
+  /**
+   * Recovery mode to control the consistency while replaying WAL
+   *
+   * Default: {@link WALRecoveryMode#PointInTimeRecovery}
+   *
+   * @param walRecoveryMode The WAL recover mode
+   *
+   * @return the reference to the current options.
+   */
+  T setWalRecoveryMode(WALRecoveryMode walRecoveryMode);
+
+  /**
+   * Recovery mode to control the consistency while replaying WAL
+   *
+   * Default: {@link WALRecoveryMode#PointInTimeRecovery}
+   *
+   * @return The WAL recover mode
+   */
+  WALRecoveryMode walRecoveryMode();
+
+  /**
+   * if set to false then recovery will fail when a prepared
+   * transaction is encountered in the WAL
+   *
+   * Default: false
+   *
+   * @param allow2pc true if two-phase-commit is enabled
+   *
+   * @return the reference to the current options.
+   */
+  T setAllow2pc(boolean allow2pc);
+
+  /**
+   * if set to false then recovery will fail when a prepared
+   * transaction is encountered in the WAL
+   *
+   * Default: false
+   *
+   * @return true if two-phase-commit is enabled
+   */
+  boolean allow2pc();
+
+  /**
+   * A global cache for table-level rows.
+   *
+   * Default: null (disabled)
+   *
+   * @param rowCache The global row cache
+   *
+   * @return the reference to the current options.
+   */
+  T setRowCache(final Cache rowCache);
+
+  /**
+   * A global cache for table-level rows.
+   *
+   * Default: null (disabled)
+   *
+   * @return The global row cache
+   */
+  Cache rowCache();
+
+  /**
+   * A filter object supplied to be invoked while processing write-ahead-logs
+   * (WALs) during recovery. The filter provides a way to inspect log
+   * records, ignoring a particular record or skipping replay.
+   * The filter is invoked at startup and is invoked from a single-thread
+   * currently.
+   *
+   * @param walFilter the filter for processing WALs during recovery.
+   *
+   * @return the reference to the current options.
+   */
+  T setWalFilter(final AbstractWalFilter walFilter);
+
+  /**
+   * Get's the filter for processing WALs during recovery.
+   * See {@link #setWalFilter(AbstractWalFilter)}.
+   *
+   * @return the filter used for processing WALs during recovery.
+   */
+  WalFilter walFilter();
+
+  /**
+   * If true, then DB::Open / CreateColumnFamily / DropColumnFamily
+   * / SetOptions will fail if options file is not detected or properly
+   * persisted.
+   *
+   * DEFAULT: false
+   *
+   * @param failIfOptionsFileError true if we should fail if there is an error
+   *     in the options file
+   *
+   * @return the reference to the current options.
+   */
+  T setFailIfOptionsFileError(boolean failIfOptionsFileError);
+
+  /**
+   * If true, then DB::Open / CreateColumnFamily / DropColumnFamily
+   * / SetOptions will fail if options file is not detected or properly
+   * persisted.
+   *
+   * DEFAULT: false
+   *
+   * @return true if we should fail if there is an error in the options file
+   */
+  boolean failIfOptionsFileError();
+
+  /**
+   * If true, then print malloc stats together with rocksdb.stats
+   * when printing to LOG.
+   *
+   * DEFAULT: false
+   *
+   * @param dumpMallocStats true if malloc stats should be printed to LOG
+   *
+   * @return the reference to the current options.
+   */
+  T setDumpMallocStats(boolean dumpMallocStats);
+
+  /**
+   * If true, then print malloc stats together with rocksdb.stats
+   * when printing to LOG.
+   *
+   * DEFAULT: false
+   *
+   * @return true if malloc stats should be printed to LOG
+   */
+  boolean dumpMallocStats();
+
+  /**
+   * By default RocksDB replay WAL logs and flush them on DB open, which may
+   * create very small SST files. If this option is enabled, RocksDB will try
+   * to avoid (but not guarantee not to) flush during recovery. Also, existing
+   * WAL logs will be kept, so that if crash happened before flush, we still
+   * have logs to recover from.
+   *
+   * DEFAULT: false
+   *
+   * @param avoidFlushDuringRecovery true to try to avoid (but not guarantee
+   *     not to) flush during recovery
+   *
+   * @return the reference to the current options.
+   */
+  T setAvoidFlushDuringRecovery(boolean avoidFlushDuringRecovery);
+
+  /**
+   * By default RocksDB replay WAL logs and flush them on DB open, which may
+   * create very small SST files. If this option is enabled, RocksDB will try
+   * to avoid (but not guarantee not to) flush during recovery. Also, existing
+   * WAL logs will be kept, so that if crash happened before flush, we still
+   * have logs to recover from.
+   *
+   * DEFAULT: false
+   *
+   * @return true to try to avoid (but not guarantee not to) flush during
+   *     recovery
+   */
+  boolean avoidFlushDuringRecovery();
+
+  /**
+   * Set this option to true during creation of database if you want
+   * to be able to ingest behind (call IngestExternalFile() skipping keys
+   * that already exist, rather than overwriting matching keys).
+   * Setting this option to true will affect 2 things:
+   *     1) Disable some internal optimizations around SST file compression
+   *     2) Reserve bottom-most level for ingested files only.
+   *     3) Note that num_levels should be &gt;= 3 if this option is turned on.
+   *
+   * DEFAULT: false
+   *
+   * @param allowIngestBehind true to allow ingest behind, false to disallow.
+   *
+   * @return the reference to the current options.
+   */
+  T setAllowIngestBehind(final boolean allowIngestBehind);
+
+  /**
+   * Returns true if ingest behind is allowed.
+   * See {@link #setAllowIngestBehind(boolean)}.
+   *
+   * @return true if ingest behind is allowed, false otherwise.
+   */
+  boolean allowIngestBehind();
+
+  /**
+   * If enabled it uses two queues for writes, one for the ones with
+   * disable_memtable and one for the ones that also write to memtable. This
+   * allows the memtable writes not to lag behind other writes. It can be used
+   * to optimize MySQL 2PC in which only the commits, which are serial, write to
+   * memtable.
+   *
+   * DEFAULT: false
+   *
+   * @param twoWriteQueues true to enable two write queues, false otherwise.
+   *
+   * @return the reference to the current options.
+   */
+  T setTwoWriteQueues(final boolean twoWriteQueues);
+
+  /**
+   * Returns true if two write queues are enabled.
+   *
+   * @return true if two write queues are enabled, false otherwise.
+   */
+  boolean twoWriteQueues();
+
+  /**
+   * If true WAL is not flushed automatically after each write. Instead it
+   * relies on manual invocation of FlushWAL to write the WAL buffer to its
+   * file.
+   *
+   * DEFAULT: false
+   *
+   * @param manualWalFlush true to set disable automatic WAL flushing,
+   *     false otherwise.
+   *
+   * @return the reference to the current options.
+   */
+  T setManualWalFlush(final boolean manualWalFlush);
+
+  /**
+   * Returns true if automatic WAL flushing is disabled.
+   * See {@link #setManualWalFlush(boolean)}.
+   *
+   * @return true if automatic WAL flushing is disabled, false otherwise.
+   */
+  boolean manualWalFlush();
+
+  /**
+   * If true, RocksDB supports flushing multiple column families and committing
+   * their results atomically to MANIFEST. Note that it is not
+   * necessary to set atomic_flush to true if WAL is always enabled since WAL
+   * allows the database to be restored to the last persistent state in WAL.
+   * This option is useful when there are column families with writes NOT
+   * protected by WAL.
+   * For manual flush, application has to specify which column families to
+   * flush atomically in {@link RocksDB#flush(FlushOptions, List)}.
+   * For auto-triggered flush, RocksDB atomically flushes ALL column families.
+   *
+   * Currently, any WAL-enabled writes after atomic flush may be replayed
+   * independently if the process crashes later and tries to recover.
+   *
+   * @param atomicFlush true to enable atomic flush of multiple column families.
+   *
+   * @return the reference to the current options.
+   */
+  T setAtomicFlush(final boolean atomicFlush);
+
+  /**
+   * Determine if atomic flush of multiple column families is enabled.
+   *
+   * See {@link #setAtomicFlush(boolean)}.
+   *
+   * @return true if atomic flush is enabled.
+   */
+  boolean atomicFlush();
+
+  /**
+   * If true, working thread may avoid doing unnecessary and long-latency
+   * operation (such as deleting obsolete files directly or deleting memtable)
+   * and will instead schedule a background job to do it.
+   * Use it if you're latency-sensitive.
+   * If set to true, takes precedence over
+   * {@link ReadOptions#setBackgroundPurgeOnIteratorCleanup(boolean)}.
+   *
+   * @param avoidUnnecessaryBlockingIO If true, working thread may avoid doing unnecessary
+   *     operation.
+   * @return the reference to the current options.
+   */
+  T setAvoidUnnecessaryBlockingIO(final boolean avoidUnnecessaryBlockingIO);
+
+  /**
+   * If true, working thread may avoid doing unnecessary and long-latency
+   * operation (such as deleting obsolete files directly or deleting memtable)
+   * and will instead schedule a background job to do it.
+   * Use it if you're latency-sensitive.
+   * If set to true, takes precedence over
+   * {@link ReadOptions#setBackgroundPurgeOnIteratorCleanup(boolean)}.
+   *
+   * @return true, if working thread may avoid doing unnecessary operation.
+   */
+  boolean avoidUnnecessaryBlockingIO();
+
+  /**
+   * If true, automatically persist stats to a hidden column family (column
+   * family name: ___rocksdb_stats_history___) every
+   * stats_persist_period_sec seconds; otherwise, write to an in-memory
+   * struct. User can query through `GetStatsHistory` API.
+   * If user attempts to create a column family with the same name on a DB
+   * which have previously set persist_stats_to_disk to true, the column family
+   * creation will fail, but the hidden column family will survive, as well as
+   * the previously persisted statistics.
+   * When peristing stats to disk, the stat name will be limited at 100 bytes.
+   * Default: false
+   *
+   * @param persistStatsToDisk true if stats should be persisted to hidden column family.
+   * @return the instance of the current object.
+   */
+  T setPersistStatsToDisk(final boolean persistStatsToDisk);
+
+  /**
+   * If true, automatically persist stats to a hidden column family (column
+   * family name: ___rocksdb_stats_history___) every
+   * stats_persist_period_sec seconds; otherwise, write to an in-memory
+   * struct. User can query through `GetStatsHistory` API.
+   * If user attempts to create a column family with the same name on a DB
+   * which have previously set persist_stats_to_disk to true, the column family
+   * creation will fail, but the hidden column family will survive, as well as
+   * the previously persisted statistics.
+   * When peristing stats to disk, the stat name will be limited at 100 bytes.
+   * Default: false
+   *
+   * @return true if stats should be persisted to hidden column family.
+   */
+  boolean persistStatsToDisk();
+
+  /**
+   * Historically DB ID has always been stored in Identity File in DB folder.
+   * If this flag is true, the DB ID is written to Manifest file in addition
+   * to the Identity file. By doing this 2 problems are solved
+   * 1. We don't checksum the Identity file where as Manifest file is.
+   * 2. Since the source of truth for DB is Manifest file DB ID will sit with
+   *    the source of truth. Previously the Identity file could be copied
+   *    independent of Manifest and that can result in wrong DB ID.
+   * We recommend setting this flag to true.
+   * Default: false
+   *
+   * @param writeDbidToManifest if true, then DB ID will be written to Manifest file.
+   * @return the instance of the current object.
+   */
+  T setWriteDbidToManifest(final boolean writeDbidToManifest);
+
+  /**
+   * Historically DB ID has always been stored in Identity File in DB folder.
+   * If this flag is true, the DB ID is written to Manifest file in addition
+   * to the Identity file. By doing this 2 problems are solved
+   * 1. We don't checksum the Identity file where as Manifest file is.
+   * 2. Since the source of truth for DB is Manifest file DB ID will sit with
+   *    the source of truth. Previously the Identity file could be copied
+   *    independent of Manifest and that can result in wrong DB ID.
+   * We recommend setting this flag to true.
+   * Default: false
+   *
+   * @return true, if DB ID will be written to Manifest file.
+   */
+  boolean writeDbidToManifest();
+
+  /**
+   * The number of bytes to prefetch when reading the log. This is mostly useful
+   * for reading a remotely located log, as it can save the number of
+   * round-trips. If 0, then the prefetching is disabled.
+   *
+   * Default: 0
+   *
+   * @param logReadaheadSize the number of bytes to prefetch when reading the log.
+   * @return the instance of the current object.
+   */
+  T setLogReadaheadSize(final long logReadaheadSize);
+
+  /**
+   * The number of bytes to prefetch when reading the log. This is mostly useful
+   * for reading a remotely located log, as it can save the number of
+   * round-trips. If 0, then the prefetching is disabled.
+   *
+   * Default: 0
+   *
+   * @return the number of bytes to prefetch when reading the log.
+   */
+  long logReadaheadSize();
+
+  /**
+   * By default, RocksDB recovery fails if any table file referenced in
+   * MANIFEST are missing after scanning the MANIFEST.
+   * Best-efforts recovery is another recovery mode that
+   * tries to restore the database to the most recent point in time without
+   * missing file.
+   * Currently not compatible with atomic flush. Furthermore, WAL files will
+   * not be used for recovery if best_efforts_recovery is true.
+   * Default: false
+   *
+   * @param bestEffortsRecovery if true, RocksDB will use best-efforts mode when recovering.
+   * @return the instance of the current object.
+   */
+  T setBestEffortsRecovery(final boolean bestEffortsRecovery);
+
+  /**
+   * By default, RocksDB recovery fails if any table file referenced in
+   * MANIFEST are missing after scanning the MANIFEST.
+   * Best-efforts recovery is another recovery mode that
+   * tries to restore the database to the most recent point in time without
+   * missing file.
+   * Currently not compatible with atomic flush. Furthermore, WAL files will
+   * not be used for recovery if best_efforts_recovery is true.
+   * Default: false
+   *
+   * @return true, if RocksDB uses best-efforts mode when recovering.
+   */
+  boolean bestEffortsRecovery();
+
+  /**
+   * It defines how many times db resume is called by a separate thread when
+   * background retryable IO Error happens. When background retryable IO
+   * Error happens, SetBGError is called to deal with the error. If the error
+   * can be auto-recovered (e.g., retryable IO Error during Flush or WAL write),
+   * then db resume is called in background to recover from the error. If this
+   * value is 0 or negative, db resume will not be called.
+   *
+   * Default: INT_MAX
+   *
+   * @param maxBgerrorResumeCount maximum number of times db resume should be called when IO Error
+   *     happens.
+   * @return the instance of the current object.
+   */
+  T setMaxBgErrorResumeCount(final int maxBgerrorResumeCount);
+
+  /**
+   * It defines how many times db resume is called by a separate thread when
+   * background retryable IO Error happens. When background retryable IO
+   * Error happens, SetBGError is called to deal with the error. If the error
+   * can be auto-recovered (e.g., retryable IO Error during Flush or WAL write),
+   * then db resume is called in background to recover from the error. If this
+   * value is 0 or negative, db resume will not be called.
+   *
+   * Default: INT_MAX
+   *
+   * @return maximum number of times db resume should be called when IO Error happens.
+   */
+  int maxBgerrorResumeCount();
+
+  /**
+   * If max_bgerror_resume_count is &ge; 2, db resume is called multiple times.
+   * This option decides how long to wait to retry the next resume if the
+   * previous resume fails and satisfy redo resume conditions.
+   *
+   * Default: 1000000 (microseconds).
+   *
+   * @param bgerrorResumeRetryInterval how many microseconds to wait between DB resume attempts.
+   * @return the instance of the current object.
+   */
+  T setBgerrorResumeRetryInterval(final long bgerrorResumeRetryInterval);
+
+  /**
+   * If max_bgerror_resume_count is &ge; 2, db resume is called multiple times.
+   * This option decides how long to wait to retry the next resume if the
+   * previous resume fails and satisfy redo resume conditions.
+   *
+   * Default: 1000000 (microseconds).
+   *
+   * @return the instance of the current object.
+   */
+  long bgerrorResumeRetryInterval();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/DataBlockIndexType.java b/src/rocksdb/java/src/main/java/org/rocksdb/DataBlockIndexType.java
new file mode 100644
index 000000000..513e5b429
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/DataBlockIndexType.java
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+
+/**
+ * DataBlockIndexType used in conjunction with BlockBasedTable.
+ */
+public enum DataBlockIndexType {
+  /**
+   * traditional block type
+   */
+  kDataBlockBinarySearch((byte)0x0),
+
+  /**
+   * additional hash index
+   */
+  kDataBlockBinaryAndHash((byte)0x1);
+
+  private final byte value;
+
+  DataBlockIndexType(final byte value) {
+    this.value = value;
+  }
+
+  byte getValue() {
+    return value;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/DbPath.java b/src/rocksdb/java/src/main/java/org/rocksdb/DbPath.java
new file mode 100644
index 000000000..3f0b67557
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/DbPath.java
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.file.Path;
+
+/**
+ * Tuple of database path and target size
+ */
+public class DbPath {
+  final Path path;
+  final long targetSize;
+
+  public DbPath(final Path path, final long targetSize) {
+    this.path = path;
+    this.targetSize = targetSize;
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) {
+      return true;
+    }
+
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+
+    final DbPath dbPath = (DbPath) o;
+
+    if (targetSize != dbPath.targetSize) {
+      return false;
+    }
+
+    return path != null ? path.equals(dbPath.path) : dbPath.path == null;
+  }
+
+  @Override
+  public int hashCode() {
+    int result = path != null ? path.hashCode() : 0;
+    result = 31 * result + (int) (targetSize ^ (targetSize >>> 32));
+    return result;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java b/src/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java
new file mode 100644
index 000000000..02fa3511f
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java
@@ -0,0 +1,137 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Base class for slices which will receive direct
+ * ByteBuffer based access to the underlying data.
+ *
+ * ByteBuffer backed slices typically perform better with
+ * larger keys and values. When using smaller keys and
+ * values consider using @see org.rocksdb.Slice
+ */
+public class DirectSlice extends AbstractSlice<ByteBuffer> {
+  public final static DirectSlice NONE = new DirectSlice();
+
+  /**
+   * Indicates whether we have to free the memory pointed to by the Slice
+   */
+  private final boolean internalBuffer;
+  private volatile boolean cleared = false;
+  private volatile long internalBufferOffset = 0;
+
+  /**
+   * Called from JNI to construct a new Java DirectSlice
+   * without an underlying C++ object set
+   * at creation time.
+   *
+   * Note: You should be aware that it is intentionally marked as
+   * package-private. This is so that developers cannot construct their own
+   * default DirectSlice objects (at present). As developers cannot construct
+   * their own DirectSlice objects through this, they are not creating
+   * underlying C++ DirectSlice objects, and so there is nothing to free
+   * (dispose) from Java.
+   */
+  DirectSlice() {
+    super();
+    this.internalBuffer = false;
+  }
+
+  /**
+   * Constructs a slice
+   * where the data is taken from
+   * a String.
+   *
+   * @param str The string
+   */
+  public DirectSlice(final String str) {
+    super(createNewSliceFromString(str));
+    this.internalBuffer = true;
+  }
+
+  /**
+   * Constructs a slice where the data is
+   * read from the provided
+   * ByteBuffer up to a certain length
+   *
+   * @param data The buffer containing the data
+   * @param length The length of the data to use for the slice
+   */
+  public DirectSlice(final ByteBuffer data, final int length) {
+    super(createNewDirectSlice0(ensureDirect(data), length));
+    this.internalBuffer = false;
+  }
+
+  /**
+   * Constructs a slice where the data is
+   * read from the provided
+   * ByteBuffer
+   *
+   * @param data The bugger containing the data
+   */
+  public DirectSlice(final ByteBuffer data) {
+    super(createNewDirectSlice1(ensureDirect(data)));
+    this.internalBuffer = false;
+  }
+
+  private static ByteBuffer ensureDirect(final ByteBuffer data) {
+    if(!data.isDirect()) {
+      throw new IllegalArgumentException("The ByteBuffer must be direct");
+    }
+    return data;
+  }
+
+  /**
+   * Retrieves the byte at a specific offset
+   * from the underlying data
+   *
+   * @param offset The (zero-based) offset of the byte to retrieve
+   *
+   * @return the requested byte
+   */
+  public byte get(final int offset) {
+    return get0(getNativeHandle(), offset);
+  }
+
+  @Override
+  public void clear() {
+    clear0(getNativeHandle(), !cleared && internalBuffer, internalBufferOffset);
+    cleared = true;
+  }
+
+  @Override
+  public void removePrefix(final int n) {
+    removePrefix0(getNativeHandle(), n);
+    this.internalBufferOffset += n;
+  }
+
+  public void setLength(final int n) {
+    setLength0(getNativeHandle(), n);
+  }
+
+  @Override
+  protected void disposeInternal() {
+    final long nativeHandle = getNativeHandle();
+    if(!cleared && internalBuffer) {
+      disposeInternalBuf(nativeHandle, internalBufferOffset);
+    }
+    disposeInternal(nativeHandle);
+  }
+
+  private native static long createNewDirectSlice0(final ByteBuffer data,
+      final int length);
+  private native static long createNewDirectSlice1(final ByteBuffer data);
+  @Override protected final native ByteBuffer data0(long handle);
+  private native byte get0(long handle, int offset);
+  private native void clear0(long handle, boolean internalBuffer,
+      long internalBufferOffset);
+  private native void removePrefix0(long handle, int length);
+  private native void setLength0(long handle, int length);
+  private native void disposeInternalBuf(final long handle,
+      long internalBufferOffset);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java b/src/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java
new file mode 100644
index 000000000..5ceeb54c8
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * EncodingType
+ *
+ * <p>The value will determine how to encode keys
+ * when writing to a new SST file.</p>
+ *
+ * <p>This value will be stored
+ * inside the SST file which will be used when reading from
+ * the file, which makes it possible for users to choose
+ * different encoding type when reopening a DB. Files with
+ * different encoding types can co-exist in the same DB and
+ * can be read.</p>
+ */
+public enum EncodingType {
+  /**
+   * Always write full keys without any special encoding.
+   */
+  kPlain((byte) 0),
+  /**
+   * <p>Find opportunity to write the same prefix once for multiple rows.
+   * In some cases, when a key follows a previous key with the same prefix,
+   * instead of writing out the full key, it just writes out the size of the
+   * shared prefix, as well as other bytes, to save some bytes.</p>
+   *
+   * <p>When using this option, the user is required to use the same prefix
+   * extractor to make sure the same prefix will be extracted from the same key.
+   * The Name() value of the prefix extractor will be stored in the file. When
+   * reopening the file, the name of the options.prefix_extractor given will be
+   * bitwise compared to the prefix extractors stored in the file. An error
+   * will be returned if the two don't match.</p>
+   */
+  kPrefix((byte) 1);
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  private EncodingType(byte value) {
+    value_ = value;
+  }
+
+  private final byte value_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Env.java b/src/rocksdb/java/src/main/java/org/rocksdb/Env.java
new file mode 100644
index 000000000..07b5319bb
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Env.java
@@ -0,0 +1,167 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Base class for all Env implementations in RocksDB.
+ */
+public abstract class Env extends RocksObject {
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  private static final Env DEFAULT_ENV = new RocksEnv(getDefaultEnvInternal());
+  static {
+    /**
+     * The Ownership of the Default Env belongs to C++
+     * and so we disown the native handle here so that
+     * we cannot accidentally free it from Java.
+     */
+    DEFAULT_ENV.disOwnNativeHandle();
+  }
+
+  /**
+   * <p>Returns the default environment suitable for the current operating
+   * system.</p>
+   *
+   * <p>The result of {@code getDefault()} is a singleton whose ownership
+   * belongs to rocksdb c++.  As a result, the returned RocksEnv will not
+   * have the ownership of its c++ resource, and calling its dispose()/close()
+   * will be no-op.</p>
+   *
+   * @return the default {@link org.rocksdb.RocksEnv} instance.
+   */
+  public static Env getDefault() {
+    return DEFAULT_ENV;
+  }
+
+  /**
+   * <p>Sets the number of background worker threads of the low priority
+   * pool for this environment.</p>
+   * <p>Default number: 1</p>
+   *
+   * @param number the number of threads
+   *
+   * @return current {@link RocksEnv} instance.
+   */
+  public Env setBackgroundThreads(final int number) {
+    return setBackgroundThreads(number, Priority.LOW);
+  }
+
+  /**
+   * <p>Gets the number of background worker threads of the pool
+   * for this environment.</p>
+   *
+   * @param priority the priority id of a specified thread pool.
+   *
+   * @return the number of threads.
+   */
+  public int getBackgroundThreads(final Priority priority) {
+    return getBackgroundThreads(nativeHandle_, priority.getValue());
+  }
+
+  /**
+   * <p>Sets the number of background worker threads of the specified thread
+   * pool for this environment.</p>
+   *
+   * @param number the number of threads
+   * @param priority the priority id of a specified thread pool.
+   *
+   * <p>Default number: 1</p>
+   * @return current {@link RocksEnv} instance.
+   */
+  public Env setBackgroundThreads(final int number, final Priority priority) {
+    setBackgroundThreads(nativeHandle_, number, priority.getValue());
+    return this;
+  }
+
+  /**
+   * <p>Returns the length of the queue associated with the specified
+   * thread pool.</p>
+   *
+   * @param priority the priority id of a specified thread pool.
+   *
+   * @return the thread pool queue length.
+   */
+  public int getThreadPoolQueueLen(final Priority priority) {
+    return getThreadPoolQueueLen(nativeHandle_, priority.getValue());
+  }
+
+  /**
+   * Enlarge number of background worker threads of a specific thread pool
+   * for this environment if it is smaller than specified. 'LOW' is the default
+   * pool.
+   *
+   * @param number the number of threads.
+   * @param priority the priority id of a specified thread pool.
+   *
+   * @return current {@link RocksEnv} instance.
+   */
+  public Env incBackgroundThreadsIfNeeded(final int number,
+    final Priority priority) {
+    incBackgroundThreadsIfNeeded(nativeHandle_, number, priority.getValue());
+    return this;
+  }
+
+  /**
+   * Lower IO priority for threads from the specified pool.
+   *
+   * @param priority the priority id of a specified thread pool.
+   *
+   * @return current {@link RocksEnv} instance.
+   */
+  public Env lowerThreadPoolIOPriority(final Priority priority) {
+    lowerThreadPoolIOPriority(nativeHandle_, priority.getValue());
+    return this;
+  }
+
+  /**
+   * Lower CPU priority for threads from the specified pool.
+   *
+   * @param priority the priority id of a specified thread pool.
+   *
+   * @return current {@link RocksEnv} instance.
+   */
+  public Env lowerThreadPoolCPUPriority(final Priority priority) {
+    lowerThreadPoolCPUPriority(nativeHandle_, priority.getValue());
+    return this;
+  }
+
+  /**
+   * Returns the status of all threads that belong to the current Env.
+   *
+   * @return the status of all threads belong to this env.
+   *
+   * @throws RocksDBException if the thread list cannot be acquired.
+   */
+  public List<ThreadStatus> getThreadList() throws RocksDBException {
+    return Arrays.asList(getThreadList(nativeHandle_));
+  }
+
+  Env(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  private static native long getDefaultEnvInternal();
+  private native void setBackgroundThreads(
+      final long handle, final int number, final byte priority);
+  private native int getBackgroundThreads(final long handle,
+    final byte priority);
+  private native int getThreadPoolQueueLen(final long handle,
+      final byte priority);
+  private native void incBackgroundThreadsIfNeeded(final long handle,
+      final int number, final byte priority);
+  private native void lowerThreadPoolIOPriority(final long handle,
+      final byte priority);
+  private native void lowerThreadPoolCPUPriority(final long handle,
+      final byte priority);
+  private native ThreadStatus[] getThreadList(final long handle)
+      throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/EnvOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/EnvOptions.java
new file mode 100644
index 000000000..6baddb310
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/EnvOptions.java
@@ -0,0 +1,366 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Options while opening a file to read/write
+ */
+public class EnvOptions extends RocksObject {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Construct with default Options
+   */
+  public EnvOptions() {
+    super(newEnvOptions());
+  }
+
+  /**
+   * Construct from {@link DBOptions}.
+   *
+   * @param dbOptions the database options.
+   */
+  public EnvOptions(final DBOptions dbOptions) {
+    super(newEnvOptions(dbOptions.nativeHandle_));
+  }
+
+  /**
+   * Enable/Disable memory mapped reads.
+   *
+   * Default: false
+   *
+   * @param useMmapReads true to enable memory mapped reads, false to disable.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setUseMmapReads(final boolean useMmapReads) {
+    setUseMmapReads(nativeHandle_, useMmapReads);
+    return this;
+  }
+
+  /**
+   * Determine if memory mapped reads are in-use.
+   *
+   * @return true if memory mapped reads are in-use, false otherwise.
+   */
+  public boolean useMmapReads() {
+    assert(isOwningHandle());
+    return useMmapReads(nativeHandle_);
+  }
+
+  /**
+   * Enable/Disable memory mapped Writes.
+   *
+   * Default: true
+   *
+   * @param useMmapWrites true to enable memory mapped writes, false to disable.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setUseMmapWrites(final boolean useMmapWrites) {
+    setUseMmapWrites(nativeHandle_, useMmapWrites);
+    return this;
+  }
+
+  /**
+   * Determine if memory mapped writes are in-use.
+   *
+   * @return true if memory mapped writes are in-use, false otherwise.
+   */
+  public boolean useMmapWrites() {
+    assert(isOwningHandle());
+    return useMmapWrites(nativeHandle_);
+  }
+
+  /**
+   * Enable/Disable direct reads, i.e. {@code O_DIRECT}.
+   *
+   * Default: false
+   *
+   * @param useDirectReads true to enable direct reads, false to disable.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setUseDirectReads(final boolean useDirectReads) {
+    setUseDirectReads(nativeHandle_, useDirectReads);
+    return this;
+  }
+
+  /**
+   * Determine if direct reads are in-use.
+   *
+   * @return true if direct reads are in-use, false otherwise.
+   */
+  public boolean useDirectReads() {
+    assert(isOwningHandle());
+    return useDirectReads(nativeHandle_);
+  }
+
+  /**
+   * Enable/Disable direct writes, i.e. {@code O_DIRECT}.
+   *
+   * Default: false
+   *
+   * @param useDirectWrites true to enable direct writes, false to disable.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setUseDirectWrites(final boolean useDirectWrites) {
+    setUseDirectWrites(nativeHandle_, useDirectWrites);
+    return this;
+  }
+
+  /**
+   * Determine if direct writes are in-use.
+   *
+   * @return true if direct writes are in-use, false otherwise.
+   */
+  public boolean useDirectWrites() {
+    assert(isOwningHandle());
+    return useDirectWrites(nativeHandle_);
+  }
+
+  /**
+   * Enable/Disable fallocate calls.
+   *
+   * Default: true
+   *
+   * If false, {@code fallocate()} calls are bypassed.
+   *
+   * @param allowFallocate true to enable fallocate calls, false to disable.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setAllowFallocate(final boolean allowFallocate) {
+    setAllowFallocate(nativeHandle_, allowFallocate);
+    return this;
+  }
+
+  /**
+   * Determine if fallocate calls are used.
+   *
+   * @return true if fallocate calls are used, false otherwise.
+   */
+  public boolean allowFallocate() {
+    assert(isOwningHandle());
+    return allowFallocate(nativeHandle_);
+  }
+
+  /**
+   * Enable/Disable the {@code FD_CLOEXEC} bit when opening file descriptors.
+   *
+   * Default: true
+   *
+   * @param setFdCloexec true to enable the {@code FB_CLOEXEC} bit,
+   *     false to disable.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setSetFdCloexec(final boolean setFdCloexec) {
+    setSetFdCloexec(nativeHandle_, setFdCloexec);
+    return this;
+  }
+
+  /**
+   * Determine i fthe {@code FD_CLOEXEC} bit is set when opening file
+   * descriptors.
+   *
+   * @return true if the {@code FB_CLOEXEC} bit is enabled, false otherwise.
+   */
+  public boolean setFdCloexec() {
+    assert(isOwningHandle());
+    return setFdCloexec(nativeHandle_);
+  }
+
+  /**
+   * Allows OS to incrementally sync files to disk while they are being
+   * written, in the background. Issue one request for every
+   * {@code bytesPerSync} written.
+   *
+   * Default: 0
+   *
+   * @param bytesPerSync 0 to disable, otherwise the number of bytes.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setBytesPerSync(final long bytesPerSync) {
+    setBytesPerSync(nativeHandle_, bytesPerSync);
+    return this;
+  }
+
+  /**
+   * Get the number of incremental bytes per sync written in the background.
+   *
+   * @return 0 if disabled, otherwise the number of bytes.
+   */
+  public long bytesPerSync() {
+    assert(isOwningHandle());
+    return bytesPerSync(nativeHandle_);
+  }
+
+  /**
+   * If true, we will preallocate the file with {@code FALLOC_FL_KEEP_SIZE}
+   * flag, which means that file size won't change as part of preallocation.
+   * If false, preallocation will also change the file size. This option will
+   * improve the performance in workloads where you sync the data on every
+   * write. By default, we set it to true for MANIFEST writes and false for
+   * WAL writes
+   *
+   * @param fallocateWithKeepSize true to preallocate, false otherwise.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setFallocateWithKeepSize(
+      final boolean fallocateWithKeepSize) {
+    setFallocateWithKeepSize(nativeHandle_, fallocateWithKeepSize);
+    return this;
+  }
+
+  /**
+   * Determine if file is preallocated.
+   *
+   * @return true if the file is preallocated, false otherwise.
+   */
+  public boolean fallocateWithKeepSize() {
+    assert(isOwningHandle());
+    return fallocateWithKeepSize(nativeHandle_);
+  }
+
+  /**
+   * See {@link DBOptions#setCompactionReadaheadSize(long)}.
+   *
+   * @param compactionReadaheadSize the compaction read-ahead size.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setCompactionReadaheadSize(
+      final long compactionReadaheadSize) {
+    setCompactionReadaheadSize(nativeHandle_, compactionReadaheadSize);
+    return this;
+  }
+
+  /**
+   * See {@link DBOptions#compactionReadaheadSize()}.
+   *
+   * @return the compaction read-ahead size.
+   */
+  public long compactionReadaheadSize() {
+    assert(isOwningHandle());
+    return compactionReadaheadSize(nativeHandle_);
+  }
+
+  /**
+   * See {@link DBOptions#setRandomAccessMaxBufferSize(long)}.
+   *
+   * @param randomAccessMaxBufferSize the max buffer size for random access.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setRandomAccessMaxBufferSize(
+      final long randomAccessMaxBufferSize) {
+    setRandomAccessMaxBufferSize(nativeHandle_, randomAccessMaxBufferSize);
+    return this;
+  }
+
+  /**
+   * See {@link DBOptions#randomAccessMaxBufferSize()}.
+   *
+   * @return the max buffer size for random access.
+   */
+  public long randomAccessMaxBufferSize() {
+    assert(isOwningHandle());
+    return randomAccessMaxBufferSize(nativeHandle_);
+  }
+
+  /**
+   * See {@link DBOptions#setWritableFileMaxBufferSize(long)}.
+   *
+   * @param writableFileMaxBufferSize the max buffer size.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setWritableFileMaxBufferSize(
+      final long writableFileMaxBufferSize) {
+    setWritableFileMaxBufferSize(nativeHandle_, writableFileMaxBufferSize);
+    return this;
+  }
+
+  /**
+   * See {@link DBOptions#writableFileMaxBufferSize()}.
+   *
+   * @return the max buffer size.
+   */
+  public long writableFileMaxBufferSize() {
+    assert(isOwningHandle());
+    return writableFileMaxBufferSize(nativeHandle_);
+  }
+
+  /**
+   * Set the write rate limiter for flush and compaction.
+   *
+   * @param rateLimiter the rate limiter.
+   *
+   * @return the reference to these options.
+   */
+  public EnvOptions setRateLimiter(final RateLimiter rateLimiter) {
+    this.rateLimiter = rateLimiter;
+    setRateLimiter(nativeHandle_, rateLimiter.nativeHandle_);
+    return this;
+  }
+
+  /**
+   * Get the write rate limiter for flush and compaction.
+   *
+   * @return the rate limiter.
+   */
+  public RateLimiter rateLimiter() {
+    assert(isOwningHandle());
+    return rateLimiter;
+  }
+
+  private native static long newEnvOptions();
+  private native static long newEnvOptions(final long dboptions_handle);
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native void setUseMmapReads(final long handle,
+      final boolean useMmapReads);
+  private native boolean useMmapReads(final long handle);
+  private native void setUseMmapWrites(final long handle,
+      final boolean useMmapWrites);
+  private native boolean useMmapWrites(final long handle);
+  private native void setUseDirectReads(final long handle,
+      final boolean useDirectReads);
+  private native boolean useDirectReads(final long handle);
+  private native void setUseDirectWrites(final long handle,
+      final boolean useDirectWrites);
+  private native boolean useDirectWrites(final long handle);
+  private native void setAllowFallocate(final long handle,
+      final boolean allowFallocate);
+  private native boolean allowFallocate(final long handle);
+  private native void setSetFdCloexec(final long handle,
+      final boolean setFdCloexec);
+  private native boolean setFdCloexec(final long handle);
+  private native void setBytesPerSync(final long handle,
+      final long bytesPerSync);
+  private native long bytesPerSync(final long handle);
+  private native void setFallocateWithKeepSize(
+      final long handle, final boolean fallocateWithKeepSize);
+  private native boolean fallocateWithKeepSize(final long handle);
+  private native void setCompactionReadaheadSize(
+      final long handle, final long compactionReadaheadSize);
+  private native long compactionReadaheadSize(final long handle);
+  private native void setRandomAccessMaxBufferSize(
+      final long handle, final long randomAccessMaxBufferSize);
+  private native long randomAccessMaxBufferSize(final long handle);
+  private native void setWritableFileMaxBufferSize(
+      final long handle, final long writableFileMaxBufferSize);
+  private native long writableFileMaxBufferSize(final long handle);
+  private native void setRateLimiter(final long handle,
+      final long rateLimiterHandle);
+  private RateLimiter rateLimiter;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/EventListener.java b/src/rocksdb/java/src/main/java/org/rocksdb/EventListener.java
new file mode 100644
index 000000000..a12ab92ba
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/EventListener.java
@@ -0,0 +1,335 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * EventListener class contains a set of callback functions that will
+ * be called when specific RocksDB event happens such as flush.  It can
+ * be used as a building block for developing custom features such as
+ * stats-collector or external compaction algorithm.
+ *
+ * Note that callback functions should not run for an extended period of
+ * time before the function returns, otherwise RocksDB may be blocked.
+ * For example, it is not suggested to do
+ * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int,
+ * CompactionJobInfo)} (as it may run for a long while) or issue many of
+ * {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])}
+ * (as Put may be blocked in certain cases) in the same thread in the
+ * EventListener callback.
+ *
+ * However, doing
+ * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int,
+ * CompactionJobInfo)} and {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} in
+ * another thread is considered safe.
+ *
+ * [Threading] All EventListener callback will be called using the
+ * actual thread that involves in that specific event. For example, it
+ * is the RocksDB background flush thread that does the actual flush to
+ * call {@link #onFlushCompleted(RocksDB, FlushJobInfo)}.
+ *
+ * [Locking] All EventListener callbacks are designed to be called without
+ * the current thread holding any DB mutex. This is to prevent potential
+ * deadlock and performance issue when using EventListener callback
+ * in a complex way.
+ */
+public interface EventListener {
+  /**
+   * A callback function to RocksDB which will be called before a
+   * RocksDB starts to flush memtables.
+   *
+   * Note that the this function must be implemented in a way such that
+   * it should not run for an extended period of time before the function
+   * returns. Otherwise, RocksDB may be blocked.
+   *
+   * @param db the database
+   * @param flushJobInfo the flush job info, contains data copied from
+   *     respective native structure.
+   */
+  void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo);
+
+  /**
+   * callback function to RocksDB which will be called whenever a
+   * registered RocksDB flushes a file.
+   *
+   * Note that the this function must be implemented in a way such that
+   * it should not run for an extended period of time before the function
+   * returns. Otherwise, RocksDB may be blocked.
+   *
+   * @param db the database
+   * @param flushJobInfo the flush job info, contains data copied from
+   *     respective native structure.
+   */
+  void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever
+   * a SST file is deleted. Different from
+   * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)} and
+   * {@link #onFlushCompleted(RocksDB, FlushJobInfo)},
+   * this callback is designed for external logging
+   * service and thus only provide string parameters instead
+   * of a pointer to DB.  Applications that build logic basic based
+   * on file creations and deletions is suggested to implement
+   * {@link #onFlushCompleted(RocksDB, FlushJobInfo)} and
+   * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}.
+   *
+   * Note that if applications would like to use the passed reference
+   * outside this function call, they should make copies from the
+   * returned value.
+   *
+   * @param tableFileDeletionInfo the table file deletion info,
+   *     contains data copied from respective native structure.
+   */
+  void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo);
+
+  /**
+   * A callback function to RocksDB which will be called before a
+   * RocksDB starts to compact. The default implementation is
+   * no-op.
+   *
+   * Note that the this function must be implemented in a way such that
+   * it should not run for an extended period of time before the function
+   * returns. Otherwise, RocksDB may be blocked.
+   *
+   * @param db a pointer to the rocksdb instance which just compacted
+   *     a file.
+   * @param compactionJobInfo a reference to a native CompactionJobInfo struct,
+   *     which is released after this function is returned, and must be copied
+   *     if it is needed outside of this function.
+   */
+  void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever
+   * a registered RocksDB compacts a file. The default implementation
+   * is a no-op.
+   *
+   * Note that this function must be implemented in a way such that
+   * it should not run for an extended period of time before the function
+   * returns. Otherwise, RocksDB may be blocked.
+   *
+   * @param db a pointer to the rocksdb instance which just compacted
+   *     a file.
+   * @param compactionJobInfo a reference to a native CompactionJobInfo struct,
+   *     which is released after this function is returned, and must be copied
+   *     if it is needed outside of this function.
+   */
+  void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever
+   * a SST file is created.  Different from OnCompactionCompleted and
+   * OnFlushCompleted, this callback is designed for external logging
+   * service and thus only provide string parameters instead
+   * of a pointer to DB.  Applications that build logic basic based
+   * on file creations and deletions is suggested to implement
+   * OnFlushCompleted and OnCompactionCompleted.
+   *
+   * Historically it will only be called if the file is successfully created.
+   * Now it will also be called on failure case. User can check info.status
+   * to see if it succeeded or not.
+   *
+   * Note that if applications would like to use the passed reference
+   * outside this function call, they should make copies from these
+   * returned value.
+   *
+   * @param tableFileCreationInfo the table file creation info,
+   *     contains data copied from respective native structure.
+   */
+  void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo);
+
+  /**
+   * A callback function for RocksDB which will be called before
+   * a SST file is being created. It will follow by OnTableFileCreated after
+   * the creation finishes.
+   *
+   * Note that if applications would like to use the passed reference
+   * outside this function call, they should make copies from these
+   * returned value.
+   *
+   * @param tableFileCreationBriefInfo the table file creation brief info,
+   *     contains data copied from respective native structure.
+   */
+  void onTableFileCreationStarted(final TableFileCreationBriefInfo tableFileCreationBriefInfo);
+
+  /**
+   * A callback function for RocksDB which will be called before
+   * a memtable is made immutable.
+   *
+   * Note that the this function must be implemented in a way such that
+   * it should not run for an extended period of time before the function
+   * returns.  Otherwise, RocksDB may be blocked.
+   *
+   * Note that if applications would like to use the passed reference
+   * outside this function call, they should make copies from these
+   * returned value.
+   *
+   * @param memTableInfo the mem table info, contains data
+   *     copied from respective native structure.
+   */
+  void onMemTableSealed(final MemTableInfo memTableInfo);
+
+  /**
+   * A callback function for RocksDB which will be called before
+   * a column family handle is deleted.
+   *
+   * Note that the this function must be implemented in a way such that
+   * it should not run for an extended period of time before the function
+   * returns.  Otherwise, RocksDB may be blocked.
+   *
+   * @param columnFamilyHandle is a pointer to the column family handle to be
+   *     deleted which will become a dangling pointer after the deletion.
+   */
+  void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle);
+
+  /**
+   * A callback function for RocksDB which will be called after an external
+   * file is ingested using IngestExternalFile.
+   *
+   * Note that the this function will run on the same thread as
+   * IngestExternalFile(), if this function is blocked, IngestExternalFile()
+   * will be blocked from finishing.
+   *
+   * @param db the database
+   * @param externalFileIngestionInfo the external file ingestion info,
+   *     contains data copied from respective native structure.
+   */
+  void onExternalFileIngested(
+      final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo);
+
+  /**
+   * A callback function for RocksDB which will be called before setting the
+   * background error status to a non-OK value. The new background error status
+   * is provided in `bg_error` and can be modified by the callback. E.g., a
+   * callback can suppress errors by resetting it to Status::OK(), thus
+   * preventing the database from entering read-only mode. We do not provide any
+   * guarantee when failed flushes/compactions will be rescheduled if the user
+   * suppresses an error.
+   *
+   * Note that this function can run on the same threads as flush, compaction,
+   * and user writes. So, it is extremely important not to perform heavy
+   * computations or blocking calls in this function.
+   *
+   * @param backgroundErrorReason background error reason code
+   * @param backgroundError background error codes
+   */
+  void onBackgroundError(
+      final BackgroundErrorReason backgroundErrorReason, final Status backgroundError);
+
+  /**
+   * A callback function for RocksDB which will be called whenever a change
+   * of superversion triggers a change of the stall conditions.
+   *
+   * Note that the this function must be implemented in a way such that
+   * it should not run for an extended period of time before the function
+   * returns. Otherwise, RocksDB may be blocked.
+   *
+   * @param writeStallInfo write stall info,
+   *     contains data copied from respective native structure.
+   */
+  void onStallConditionsChanged(final WriteStallInfo writeStallInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever a file read
+   * operation finishes.
+   *
+   * @param fileOperationInfo file operation info,
+   *     contains data copied from respective native structure.
+   */
+  void onFileReadFinish(final FileOperationInfo fileOperationInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever a file write
+   * operation finishes.
+   *
+   * @param fileOperationInfo file operation info,
+   *     contains data copied from respective native structure.
+   */
+  void onFileWriteFinish(final FileOperationInfo fileOperationInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever a file flush
+   * operation finishes.
+   *
+   * @param fileOperationInfo file operation info,
+   *     contains data copied from respective native structure.
+   */
+  void onFileFlushFinish(final FileOperationInfo fileOperationInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever a file sync
+   * operation finishes.
+   *
+   * @param fileOperationInfo file operation info,
+   *     contains data copied from respective native structure.
+   */
+  void onFileSyncFinish(final FileOperationInfo fileOperationInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever a file
+   * rangeSync operation finishes.
+   *
+   * @param fileOperationInfo file operation info,
+   *     contains data copied from respective native structure.
+   */
+  void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever a file
+   * truncate operation finishes.
+   *
+   * @param fileOperationInfo file operation info,
+   *     contains data copied from respective native structure.
+   */
+  void onFileTruncateFinish(final FileOperationInfo fileOperationInfo);
+
+  /**
+   * A callback function for RocksDB which will be called whenever a file close
+   * operation finishes.
+   *
+   * @param fileOperationInfo file operation info,
+   *     contains data copied from respective native structure.
+   */
+  void onFileCloseFinish(final FileOperationInfo fileOperationInfo);
+
+  /**
+   * If true, the {@link #onFileReadFinish(FileOperationInfo)}
+   * and {@link #onFileWriteFinish(FileOperationInfo)} will be called. If
+   * false, then they won't be called.
+   *
+   * Default: false
+   *
+   * @return whether to callback when file read/write is finished
+   */
+  boolean shouldBeNotifiedOnFileIO();
+
+  /**
+   * A callback function for RocksDB which will be called just before
+   * starting the automatic recovery process for recoverable background
+   * errors, such as NoSpace(). The callback can suppress the automatic
+   * recovery by setting returning false. The database will then
+   * have to be transitioned out of read-only mode by calling
+   * RocksDB#resume().
+   *
+   * @param backgroundErrorReason background error reason code
+   * @param backgroundError background error codes
+   * @return return {@code false} if the automatic recovery should be suppressed
+   */
+  boolean onErrorRecoveryBegin(
+      final BackgroundErrorReason backgroundErrorReason, final Status backgroundError);
+
+  /**
+   * A callback function for RocksDB which will be called once the database
+   * is recovered from read-only mode after an error. When this is called, it
+   * means normal writes to the database can be issued and the user can
+   * initiate any further recovery actions needed
+   *
+   * @param oldBackgroundError old background error codes
+   */
+  void onErrorRecoveryCompleted(final Status oldBackgroundError);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Experimental.java b/src/rocksdb/java/src/main/java/org/rocksdb/Experimental.java
new file mode 100644
index 000000000..64b404d6f
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Experimental.java
@@ -0,0 +1,23 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Documented;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Marks a feature as experimental, meaning that it is likely
+ * to change or even be removed/re-engineered in the future
+ */
+@Documented
+@Retention(RetentionPolicy.SOURCE)
+@Target({ElementType.TYPE, ElementType.METHOD})
+public @interface Experimental {
+  String value();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java
new file mode 100644
index 000000000..6b14a8024
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java
@@ -0,0 +1,103 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+public class ExternalFileIngestionInfo {
+  private final String columnFamilyName;
+  private final String externalFilePath;
+  private final String internalFilePath;
+  private final long globalSeqno;
+  private final TableProperties tableProperties;
+
+  /**
+   * Access is package private as this will only be constructed from
+   * C++ via JNI and for testing.
+   */
+  ExternalFileIngestionInfo(final String columnFamilyName, final String externalFilePath,
+      final String internalFilePath, final long globalSeqno,
+      final TableProperties tableProperties) {
+    this.columnFamilyName = columnFamilyName;
+    this.externalFilePath = externalFilePath;
+    this.internalFilePath = internalFilePath;
+    this.globalSeqno = globalSeqno;
+    this.tableProperties = tableProperties;
+  }
+
+  /**
+   * Get the name of the column family.
+   *
+   * @return the name of the column family.
+   */
+  public String getColumnFamilyName() {
+    return columnFamilyName;
+  }
+
+  /**
+   * Get the path of the file outside the DB.
+   *
+   * @return the path of the file outside the DB.
+   */
+  public String getExternalFilePath() {
+    return externalFilePath;
+  }
+
+  /**
+   * Get the path of the file inside the DB.
+   *
+   * @return the path of the file inside the DB.
+   */
+  public String getInternalFilePath() {
+    return internalFilePath;
+  }
+
+  /**
+   * Get the global sequence number assigned to keys in this file.
+   *
+   * @return the global sequence number.
+   */
+  public long getGlobalSeqno() {
+    return globalSeqno;
+  }
+
+  /**
+   * Get the Table properties of the table being flushed.
+   *
+   * @return the table properties.
+   */
+  public TableProperties getTableProperties() {
+    return tableProperties;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    ExternalFileIngestionInfo that = (ExternalFileIngestionInfo) o;
+    return globalSeqno == that.globalSeqno
+        && Objects.equals(columnFamilyName, that.columnFamilyName)
+        && Objects.equals(externalFilePath, that.externalFilePath)
+        && Objects.equals(internalFilePath, that.internalFilePath)
+        && Objects.equals(tableProperties, that.tableProperties);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(
+        columnFamilyName, externalFilePath, internalFilePath, globalSeqno, tableProperties);
+  }
+
+  @Override
+  public String toString() {
+    return "ExternalFileIngestionInfo{"
+        + "columnFamilyName='" + columnFamilyName + '\'' + ", externalFilePath='" + externalFilePath
+        + '\'' + ", internalFilePath='" + internalFilePath + '\'' + ", globalSeqno=" + globalSeqno
+        + ", tableProperties=" + tableProperties + '}';
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java
new file mode 100644
index 000000000..aa5743ed3
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+/**
+ * Java representation of FileOperationInfo struct from include/rocksdb/listener.h
+ */
+public class FileOperationInfo {
+  private final String path;
+  private final long offset;
+  private final long length;
+  private final long startTimestamp;
+  private final long duration;
+  private final Status status;
+
+  /**
+   * Access is private as this will only be constructed from
+   * C++ via JNI.
+   */
+  FileOperationInfo(final String path, final long offset, final long length,
+      final long startTimestamp, final long duration, final Status status) {
+    this.path = path;
+    this.offset = offset;
+    this.length = length;
+    this.startTimestamp = startTimestamp;
+    this.duration = duration;
+    this.status = status;
+  }
+
+  /**
+   * Get the file path.
+   *
+   * @return the file path.
+   */
+  public String getPath() {
+    return path;
+  }
+
+  /**
+   * Get the offset.
+   *
+   * @return the offset.
+   */
+  public long getOffset() {
+    return offset;
+  }
+
+  /**
+   * Get the length.
+   *
+   * @return the length.
+   */
+  public long getLength() {
+    return length;
+  }
+
+  /**
+   * Get the start timestamp (in nanoseconds).
+   *
+   * @return the start timestamp.
+   */
+  public long getStartTimestamp() {
+    return startTimestamp;
+  }
+
+  /**
+   * Get the operation duration (in nanoseconds).
+   *
+   * @return the operation duration.
+   */
+  public long getDuration() {
+    return duration;
+  }
+
+  /**
+   * Get the status.
+   *
+   * @return the status.
+   */
+  public Status getStatus() {
+    return status;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    FileOperationInfo that = (FileOperationInfo) o;
+    return offset == that.offset && length == that.length && startTimestamp == that.startTimestamp
+        && duration == that.duration && Objects.equals(path, that.path)
+        && Objects.equals(status, that.status);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(path, offset, length, startTimestamp, duration, status);
+  }
+
+  @Override
+  public String toString() {
+    return "FileOperationInfo{"
+        + "path='" + path + '\'' + ", offset=" + offset + ", length=" + length + ", startTimestamp="
+        + startTimestamp + ", duration=" + duration + ", status=" + status + '}';
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Filter.java b/src/rocksdb/java/src/main/java/org/rocksdb/Filter.java
new file mode 100644
index 000000000..7f490cf59
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Filter.java
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Filters are stored in rocksdb and are consulted automatically
+ * by rocksdb to decide whether or not to read some
+ * information from disk. In many cases, a filter can cut down the
+ * number of disk seeks form a handful to a single disk seek per
+ * DB::Get() call.
+ */
+//TODO(AR) should be renamed FilterPolicy
+public abstract class Filter extends RocksObject {
+
+  protected Filter(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Deletes underlying C++ filter pointer.
+   *
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the filter are closed.
+   * Otherwise an undefined behavior will occur.
+   */
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  @Override
+  protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java
new file mode 100644
index 000000000..ca9aa0523
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java
@@ -0,0 +1,186 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+public class FlushJobInfo {
+  private final long columnFamilyId;
+  private final String columnFamilyName;
+  private final String filePath;
+  private final long threadId;
+  private final int jobId;
+  private final boolean triggeredWritesSlowdown;
+  private final boolean triggeredWritesStop;
+  private final long smallestSeqno;
+  private final long largestSeqno;
+  private final TableProperties tableProperties;
+  private final FlushReason flushReason;
+
+  /**
+   * Access is package private as this will only be constructed from
+   * C++ via JNI and for testing.
+   */
+  FlushJobInfo(final long columnFamilyId, final String columnFamilyName, final String filePath,
+      final long threadId, final int jobId, final boolean triggeredWritesSlowdown,
+      final boolean triggeredWritesStop, final long smallestSeqno, final long largestSeqno,
+      final TableProperties tableProperties, final byte flushReasonValue) {
+    this.columnFamilyId = columnFamilyId;
+    this.columnFamilyName = columnFamilyName;
+    this.filePath = filePath;
+    this.threadId = threadId;
+    this.jobId = jobId;
+    this.triggeredWritesSlowdown = triggeredWritesSlowdown;
+    this.triggeredWritesStop = triggeredWritesStop;
+    this.smallestSeqno = smallestSeqno;
+    this.largestSeqno = largestSeqno;
+    this.tableProperties = tableProperties;
+    this.flushReason = FlushReason.fromValue(flushReasonValue);
+  }
+
+  /**
+   * Get the id of the column family.
+   *
+   * @return the id of the column family
+   */
+  public long getColumnFamilyId() {
+    return columnFamilyId;
+  }
+
+  /**
+   * Get the name of the column family.
+   *
+   * @return the name of the column family
+   */
+  public String getColumnFamilyName() {
+    return columnFamilyName;
+  }
+
+  /**
+   * Get the path to the newly created file.
+   *
+   * @return the path to the newly created file
+   */
+  public String getFilePath() {
+    return filePath;
+  }
+
+  /**
+   * Get the id of the thread that completed this flush job.
+   *
+   * @return the id of the thread that completed this flush job
+   */
+  public long getThreadId() {
+    return threadId;
+  }
+
+  /**
+   * Get the job id, which is unique in the same thread.
+   *
+   * @return the job id
+   */
+  public int getJobId() {
+    return jobId;
+  }
+
+  /**
+   * Determine if rocksdb is currently slowing-down all writes to prevent
+   * creating too many Level 0 files as compaction seems not able to
+   * catch up the write request speed.
+   *
+   * This indicates that there are too many files in Level 0.
+   *
+   * @return true if rocksdb is currently slowing-down all writes,
+   *     false otherwise
+   */
+  public boolean isTriggeredWritesSlowdown() {
+    return triggeredWritesSlowdown;
+  }
+
+  /**
+   * Determine if rocksdb is currently blocking any writes to prevent
+   * creating more L0 files.
+   *
+   * This indicates that there are too many files in level 0.
+   * Compactions should try to compact L0 files down to lower levels as soon
+   * as possible.
+   *
+   * @return true  if rocksdb is currently blocking any writes, false otherwise
+   */
+  public boolean isTriggeredWritesStop() {
+    return triggeredWritesStop;
+  }
+
+  /**
+   * Get the smallest sequence number in the newly created file.
+   *
+   * @return the smallest sequence number
+   */
+  public long getSmallestSeqno() {
+    return smallestSeqno;
+  }
+
+  /**
+   * Get the largest sequence number in the newly created file.
+   *
+   * @return the largest sequence number
+   */
+  public long getLargestSeqno() {
+    return largestSeqno;
+  }
+
+  /**
+   * Get the Table properties of the table being flushed.
+   *
+   * @return the Table properties of the table being flushed
+   */
+  public TableProperties getTableProperties() {
+    return tableProperties;
+  }
+
+  /**
+   * Get the reason for initiating the flush.
+   *
+   * @return the reason for initiating the flush.
+   */
+  public FlushReason getFlushReason() {
+    return flushReason;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    FlushJobInfo that = (FlushJobInfo) o;
+    return columnFamilyId == that.columnFamilyId && threadId == that.threadId && jobId == that.jobId
+        && triggeredWritesSlowdown == that.triggeredWritesSlowdown
+        && triggeredWritesStop == that.triggeredWritesStop && smallestSeqno == that.smallestSeqno
+        && largestSeqno == that.largestSeqno
+        && Objects.equals(columnFamilyName, that.columnFamilyName)
+        && Objects.equals(filePath, that.filePath)
+        && Objects.equals(tableProperties, that.tableProperties) && flushReason == that.flushReason;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(columnFamilyId, columnFamilyName, filePath, threadId, jobId,
+        triggeredWritesSlowdown, triggeredWritesStop, smallestSeqno, largestSeqno, tableProperties,
+        flushReason);
+  }
+
+  @Override
+  public String toString() {
+    return "FlushJobInfo{"
+        + "columnFamilyId=" + columnFamilyId + ", columnFamilyName='" + columnFamilyName + '\''
+        + ", filePath='" + filePath + '\'' + ", threadId=" + threadId + ", jobId=" + jobId
+        + ", triggeredWritesSlowdown=" + triggeredWritesSlowdown
+        + ", triggeredWritesStop=" + triggeredWritesStop + ", smallestSeqno=" + smallestSeqno
+        + ", largestSeqno=" + largestSeqno + ", tableProperties=" + tableProperties
+        + ", flushReason=" + flushReason + '}';
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java
new file mode 100644
index 000000000..760b515fd
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java
@@ -0,0 +1,90 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * FlushOptions to be passed to flush operations of
+ * {@link org.rocksdb.RocksDB}.
+ */
+public class FlushOptions extends RocksObject {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Construct a new instance of FlushOptions.
+   */
+  public FlushOptions(){
+    super(newFlushOptions());
+  }
+
+  /**
+   * Set if the flush operation shall block until it terminates.
+   *
+   * @param waitForFlush boolean value indicating if the flush
+   *     operations waits for termination of the flush process.
+   *
+   * @return instance of current FlushOptions.
+   */
+  public FlushOptions setWaitForFlush(final boolean waitForFlush) {
+    assert(isOwningHandle());
+    setWaitForFlush(nativeHandle_, waitForFlush);
+    return this;
+  }
+
+  /**
+   * Wait for flush to finished.
+   *
+   * @return boolean value indicating if the flush operation
+   *     waits for termination of the flush process.
+   */
+  public boolean waitForFlush() {
+    assert(isOwningHandle());
+    return waitForFlush(nativeHandle_);
+  }
+
+  /**
+   * Set to true so that flush would proceeds immediately even it it means
+   * writes will stall for the duration of the flush.
+   *
+   * Set to false so that the operation will wait until it's possible to do
+   * the flush without causing stall or until required flush is performed by
+   * someone else (foreground call or background thread).
+   *
+   * Default: false
+   *
+   * @param allowWriteStall true to allow writes to stall for flush, false
+   *     otherwise.
+   *
+   * @return instance of current FlushOptions.
+   */
+  public FlushOptions setAllowWriteStall(final boolean allowWriteStall) {
+    assert(isOwningHandle());
+    setAllowWriteStall(nativeHandle_, allowWriteStall);
+    return this;
+  }
+
+  /**
+   * Returns true if writes are allowed to stall for flushes to complete, false
+   * otherwise.
+   *
+   * @return true if writes are allowed to stall for flushes
+   */
+  public boolean allowWriteStall() {
+    assert(isOwningHandle());
+    return allowWriteStall(nativeHandle_);
+  }
+
+  private native static long newFlushOptions();
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native void setWaitForFlush(final long handle,
+      final boolean wait);
+  private native boolean waitForFlush(final long handle);
+  private native void setAllowWriteStall(final long handle,
+      final boolean allowWriteStall);
+  private native boolean allowWriteStall(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java b/src/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java
new file mode 100644
index 000000000..9d486cda1
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public enum FlushReason {
+  OTHERS((byte) 0x00),
+  GET_LIVE_FILES((byte) 0x01),
+  SHUTDOWN((byte) 0x02),
+  EXTERNAL_FILE_INGESTION((byte) 0x03),
+  MANUAL_COMPACTION((byte) 0x04),
+  WRITE_BUFFER_MANAGER((byte) 0x05),
+  WRITE_BUFFER_FULL((byte) 0x06),
+  TEST((byte) 0x07),
+  DELETE_FILES((byte) 0x08),
+  AUTO_COMPACTION((byte) 0x09),
+  MANUAL_FLUSH((byte) 0x0a),
+  ERROR_RECOVERY((byte) 0xb);
+
+  private final byte value;
+
+  FlushReason(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation.
+   *
+   * @return the internal representation
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the FlushReason from the internal representation value.
+   *
+   * @return the flush reason.
+   *
+   * @throws IllegalArgumentException if the value is unknown.
+   */
+  static FlushReason fromValue(final byte value) {
+    for (final FlushReason flushReason : FlushReason.values()) {
+      if (flushReason.value == value) {
+        return flushReason;
+      }
+    }
+
+    throw new IllegalArgumentException("Illegal value provided for FlushReason: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java
new file mode 100644
index 000000000..05cc2bb90
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java
@@ -0,0 +1,174 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+/**
+ * The config for hash linked list memtable representation
+ * Such memtable contains a fix-sized array of buckets, where
+ * each bucket points to a sorted singly-linked
+ * list (or null if the bucket is empty).
+ *
+ * Note that since this mem-table representation relies on the
+ * key prefix, it is required to invoke one of the usePrefixExtractor
+ * functions to specify how to extract key prefix given a key.
+ * If proper prefix-extractor is not set, then RocksDB will
+ * use the default memtable representation (SkipList) instead
+ * and post a warning in the LOG.
+ */
+public class HashLinkedListMemTableConfig extends MemTableConfig {
+  public static final long DEFAULT_BUCKET_COUNT = 50000;
+  public static final long DEFAULT_HUGE_PAGE_TLB_SIZE = 0;
+  public static final int DEFAULT_BUCKET_ENTRIES_LOG_THRES = 4096;
+  public static final boolean
+      DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH = true;
+  public static final int DEFAUL_THRESHOLD_USE_SKIPLIST = 256;
+
+  /**
+   * HashLinkedListMemTableConfig constructor
+   */
+  public HashLinkedListMemTableConfig() {
+    bucketCount_ = DEFAULT_BUCKET_COUNT;
+    hugePageTlbSize_ = DEFAULT_HUGE_PAGE_TLB_SIZE;
+    bucketEntriesLoggingThreshold_ = DEFAULT_BUCKET_ENTRIES_LOG_THRES;
+    ifLogBucketDistWhenFlush_ = DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH;
+    thresholdUseSkiplist_ = DEFAUL_THRESHOLD_USE_SKIPLIST;
+  }
+
+  /**
+   * Set the number of buckets in the fixed-size array used
+   * in the hash linked-list mem-table.
+   *
+   * @param count the number of hash buckets.
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig setBucketCount(
+      final long count) {
+    bucketCount_ = count;
+    return this;
+  }
+
+  /**
+   * Returns the number of buckets that will be used in the memtable
+   * created based on this config.
+   *
+   * @return the number of buckets
+   */
+  public long bucketCount() {
+    return bucketCount_;
+  }
+
+  /**
+   * <p>Set the size of huge tlb or allocate the hashtable bytes from
+   * malloc if {@code size <= 0}.</p>
+   *
+   * <p>The user needs to reserve huge pages for it to be allocated,
+   * like: {@code sysctl -w vm.nr_hugepages=20}</p>
+   *
+   * <p>See linux documentation/vm/hugetlbpage.txt</p>
+   *
+   * @param size if set to {@code <= 0} hashtable bytes from malloc
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig setHugePageTlbSize(
+      final long size) {
+    hugePageTlbSize_ = size;
+    return this;
+  }
+
+  /**
+   * Returns the size value of hugePageTlbSize.
+   *
+   * @return the hugePageTlbSize.
+   */
+  public long hugePageTlbSize() {
+    return hugePageTlbSize_;
+  }
+
+  /**
+   * If number of entries in one bucket exceeds that setting, log
+   * about it.
+   *
+   * @param threshold - number of entries in a single bucket before
+   *     logging starts.
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig
+      setBucketEntriesLoggingThreshold(final int threshold) {
+    bucketEntriesLoggingThreshold_ = threshold;
+    return this;
+  }
+
+  /**
+   * Returns the maximum number of entries in one bucket before
+   * logging starts.
+   *
+   * @return maximum number of entries in one bucket before logging
+   *     starts.
+   */
+  public int bucketEntriesLoggingThreshold() {
+    return bucketEntriesLoggingThreshold_;
+  }
+
+  /**
+   * If true the distrubition of number of entries will be logged.
+   *
+   * @param logDistribution - boolean parameter indicating if number
+   *     of entry distribution shall be logged.
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig
+      setIfLogBucketDistWhenFlush(final boolean logDistribution) {
+    ifLogBucketDistWhenFlush_ = logDistribution;
+    return this;
+  }
+
+  /**
+   * Returns information about logging the distribution of
+   *  number of entries on flush.
+   *
+   * @return if distribution of number of entries shall be logged.
+   */
+  public boolean ifLogBucketDistWhenFlush() {
+    return ifLogBucketDistWhenFlush_;
+  }
+
+  /**
+   * Set maximum number of entries in one bucket. Exceeding this val
+   * leads to a switch from LinkedList to SkipList.
+   *
+   * @param threshold maximum number of entries before SkipList is
+   *     used.
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig
+      setThresholdUseSkiplist(final int threshold) {
+    thresholdUseSkiplist_ = threshold;
+    return this;
+  }
+
+  /**
+   * Returns entries per bucket threshold before LinkedList is
+   * replaced by SkipList usage for that bucket.
+   *
+   * @return entries per bucket threshold before SkipList is used.
+   */
+  public int thresholdUseSkiplist() {
+    return thresholdUseSkiplist_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(bucketCount_, hugePageTlbSize_,
+        bucketEntriesLoggingThreshold_, ifLogBucketDistWhenFlush_,
+        thresholdUseSkiplist_);
+  }
+
+  private native long newMemTableFactoryHandle(long bucketCount,
+      long hugePageTlbSize, int bucketEntriesLoggingThreshold,
+      boolean ifLogBucketDistWhenFlush, int thresholdUseSkiplist)
+      throws IllegalArgumentException;
+
+  private long bucketCount_;
+  private long hugePageTlbSize_;
+  private int bucketEntriesLoggingThreshold_;
+  private boolean ifLogBucketDistWhenFlush_;
+  private int thresholdUseSkiplist_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java
new file mode 100644
index 000000000..efc78b14e
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java
@@ -0,0 +1,106 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+/**
+ * The config for hash skip-list mem-table representation.
+ * Such mem-table representation contains a fix-sized array of
+ * buckets, where each bucket points to a skiplist (or null if the
+ * bucket is empty).
+ *
+ * Note that since this mem-table representation relies on the
+ * key prefix, it is required to invoke one of the usePrefixExtractor
+ * functions to specify how to extract key prefix given a key.
+ * If proper prefix-extractor is not set, then RocksDB will
+ * use the default memtable representation (SkipList) instead
+ * and post a warning in the LOG.
+ */
+public class HashSkipListMemTableConfig extends MemTableConfig {
+  public static final int DEFAULT_BUCKET_COUNT = 1000000;
+  public static final int DEFAULT_BRANCHING_FACTOR = 4;
+  public static final int DEFAULT_HEIGHT = 4;
+
+  /**
+   * HashSkipListMemTableConfig constructor
+   */
+  public HashSkipListMemTableConfig() {
+    bucketCount_ = DEFAULT_BUCKET_COUNT;
+    branchingFactor_ = DEFAULT_BRANCHING_FACTOR;
+    height_ = DEFAULT_HEIGHT;
+  }
+
+  /**
+   * Set the number of hash buckets used in the hash skiplist memtable.
+   * Default = 1000000.
+   *
+   * @param count the number of hash buckets used in the hash
+   *    skiplist memtable.
+   * @return the reference to the current HashSkipListMemTableConfig.
+   */
+  public HashSkipListMemTableConfig setBucketCount(
+      final long count) {
+    bucketCount_ = count;
+    return this;
+  }
+
+  /**
+   * @return the number of hash buckets
+   */
+  public long bucketCount() {
+    return bucketCount_;
+  }
+
+  /**
+   * Set the height of the skip list.  Default = 4.
+   *
+   * @param height height to set.
+   *
+   * @return the reference to the current HashSkipListMemTableConfig.
+   */
+  public HashSkipListMemTableConfig setHeight(final int height) {
+    height_ = height;
+    return this;
+  }
+
+  /**
+   * @return the height of the skip list.
+   */
+  public int height() {
+    return height_;
+  }
+
+  /**
+   * Set the branching factor used in the hash skip-list memtable.
+   * This factor controls the probabilistic size ratio between adjacent
+   * links in the skip list.
+   *
+   * @param bf the probabilistic size ratio between adjacent link
+   *     lists in the skip list.
+   * @return the reference to the current HashSkipListMemTableConfig.
+   */
+  public HashSkipListMemTableConfig setBranchingFactor(
+      final int bf) {
+    branchingFactor_ = bf;
+    return this;
+  }
+
+  /**
+   * @return branching factor, the probabilistic size ratio between
+   *     adjacent links in the skip list.
+   */
+  public int branchingFactor() {
+    return branchingFactor_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(
+        bucketCount_, height_, branchingFactor_);
+  }
+
+  private native long newMemTableFactoryHandle(
+      long bucketCount, int height, int branchingFactor)
+      throws IllegalArgumentException;
+
+  private long bucketCount_;
+  private int branchingFactor_;
+  private int height_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java b/src/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java
new file mode 100644
index 000000000..81d890883
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class HistogramData {
+  private final double median_;
+  private final double percentile95_;
+  private final double percentile99_;
+  private final double average_;
+  private final double standardDeviation_;
+  private final double max_;
+  private final long count_;
+  private final long sum_;
+  private final double min_;
+
+  public HistogramData(final double median, final double percentile95,
+                       final double percentile99, final double average,
+                       final double standardDeviation) {
+    this(median, percentile95, percentile99, average, standardDeviation, 0.0, 0, 0, 0.0);
+  }
+
+  public HistogramData(final double median, final double percentile95,
+      final double percentile99, final double average,
+      final double standardDeviation, final double max, final long count,
+      final long sum, final double min) {
+    median_ = median;
+    percentile95_ = percentile95;
+    percentile99_ = percentile99;
+    average_ = average;
+    standardDeviation_ = standardDeviation;
+    min_ = min;
+    max_ = max;
+    count_ = count;
+    sum_ = sum;
+  }
+
+  public double getMedian() {
+    return median_;
+  }
+
+  public double getPercentile95() {
+    return percentile95_;
+  }
+
+  public double getPercentile99() {
+    return percentile99_;
+  }
+
+  public double getAverage() {
+    return average_;
+  }
+
+  public double getStandardDeviation() {
+    return standardDeviation_;
+  }
+
+  public double getMax() {
+    return max_;
+  }
+
+  public long getCount() {
+    return count_;
+  }
+
+  public long getSum() {
+    return sum_;
+  }
+
+  public double getMin() {
+    return min_;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java b/src/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java
new file mode 100644
index 000000000..d5f7da5e0
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java
@@ -0,0 +1,221 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public enum HistogramType {
+
+  DB_GET((byte) 0x0),
+
+  DB_WRITE((byte) 0x1),
+
+  COMPACTION_TIME((byte) 0x2),
+
+  SUBCOMPACTION_SETUP_TIME((byte) 0x3),
+
+  TABLE_SYNC_MICROS((byte) 0x4),
+
+  COMPACTION_OUTFILE_SYNC_MICROS((byte) 0x5),
+
+  WAL_FILE_SYNC_MICROS((byte) 0x6),
+
+  MANIFEST_FILE_SYNC_MICROS((byte) 0x7),
+
+  /**
+   * TIME SPENT IN IO DURING TABLE OPEN.
+   */
+  TABLE_OPEN_IO_MICROS((byte) 0x8),
+
+  DB_MULTIGET((byte) 0x9),
+
+  READ_BLOCK_COMPACTION_MICROS((byte) 0xA),
+
+  READ_BLOCK_GET_MICROS((byte) 0xB),
+
+  WRITE_RAW_BLOCK_MICROS((byte) 0xC),
+
+  STALL_L0_SLOWDOWN_COUNT((byte) 0xD),
+
+  STALL_MEMTABLE_COMPACTION_COUNT((byte) 0xE),
+
+  STALL_L0_NUM_FILES_COUNT((byte) 0xF),
+
+  HARD_RATE_LIMIT_DELAY_COUNT((byte) 0x10),
+
+  SOFT_RATE_LIMIT_DELAY_COUNT((byte) 0x11),
+
+  NUM_FILES_IN_SINGLE_COMPACTION((byte) 0x12),
+
+  DB_SEEK((byte) 0x13),
+
+  WRITE_STALL((byte) 0x14),
+
+  SST_READ_MICROS((byte) 0x15),
+
+  /**
+   * The number of subcompactions actually scheduled during a compaction.
+   */
+  NUM_SUBCOMPACTIONS_SCHEDULED((byte) 0x16),
+
+  /**
+   * Value size distribution in each operation.
+   */
+  BYTES_PER_READ((byte) 0x17),
+  BYTES_PER_WRITE((byte) 0x18),
+  BYTES_PER_MULTIGET((byte) 0x19),
+
+  /**
+   * number of bytes compressed.
+   */
+  BYTES_COMPRESSED((byte) 0x1A),
+
+  /**
+   * number of bytes decompressed.
+   *
+   * number of bytes is when uncompressed; i.e. before/after respectively
+   */
+  BYTES_DECOMPRESSED((byte) 0x1B),
+
+  COMPRESSION_TIMES_NANOS((byte) 0x1C),
+
+  DECOMPRESSION_TIMES_NANOS((byte) 0x1D),
+
+  READ_NUM_MERGE_OPERANDS((byte) 0x1E),
+
+  /**
+   * Time spent flushing memtable to disk.
+   */
+  FLUSH_TIME((byte) 0x20),
+
+  /**
+   * Size of keys written to BlobDB.
+   */
+  BLOB_DB_KEY_SIZE((byte) 0x21),
+
+  /**
+   * Size of values written to BlobDB.
+   */
+  BLOB_DB_VALUE_SIZE((byte) 0x22),
+
+  /**
+   * BlobDB Put/PutWithTTL/PutUntil/Write latency.
+   */
+  BLOB_DB_WRITE_MICROS((byte) 0x23),
+
+  /**
+   * BlobDB Get lagency.
+   */
+  BLOB_DB_GET_MICROS((byte) 0x24),
+
+  /**
+   * BlobDB MultiGet latency.
+   */
+  BLOB_DB_MULTIGET_MICROS((byte) 0x25),
+
+  /**
+   * BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency.
+   */
+  BLOB_DB_SEEK_MICROS((byte) 0x26),
+
+  /**
+   * BlobDB Next latency.
+   */
+  BLOB_DB_NEXT_MICROS((byte) 0x27),
+
+  /**
+   * BlobDB Prev latency.
+   */
+  BLOB_DB_PREV_MICROS((byte) 0x28),
+
+  /**
+   * Blob file write latency.
+   */
+  BLOB_DB_BLOB_FILE_WRITE_MICROS((byte) 0x29),
+
+  /**
+   * Blob file read latency.
+   */
+  BLOB_DB_BLOB_FILE_READ_MICROS((byte) 0x2A),
+
+  /**
+   * Blob file sync latency.
+   */
+  BLOB_DB_BLOB_FILE_SYNC_MICROS((byte) 0x2B),
+
+  /**
+   * BlobDB garbage collection time.
+   */
+  BLOB_DB_GC_MICROS((byte) 0x2C),
+
+  /**
+   * BlobDB compression time.
+   */
+  BLOB_DB_COMPRESSION_MICROS((byte) 0x2D),
+
+  /**
+   * BlobDB decompression time.
+   */
+  BLOB_DB_DECOMPRESSION_MICROS((byte) 0x2E),
+
+  /**
+   * Num of Index and Filter blocks read from file system per level in MultiGet
+   * request
+   */
+  NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL((byte) 0x2F),
+
+  /**
+   * Num of Data blocks read from file system per level in MultiGet request.
+   */
+  NUM_DATA_BLOCKS_READ_PER_LEVEL((byte) 0x30),
+
+  /**
+   * Num of SST files read from file system per level in MultiGet request.
+   */
+  NUM_SST_READ_PER_LEVEL((byte) 0x31),
+
+  /**
+   * The number of retry in auto resume
+   */
+  ERROR_HANDLER_AUTORESUME_RETRY_COUNT((byte) 0x32),
+
+  ASYNC_READ_BYTES((byte) 0x33),
+
+  // 0x1F for backwards compatibility on current minor version.
+  HISTOGRAM_ENUM_MAX((byte) 0x1F);
+
+  private final byte value;
+
+  HistogramType(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get Histogram type by byte value.
+   *
+   * @param value byte representation of HistogramType.
+   *
+   * @return {@link org.rocksdb.HistogramType} instance.
+   * @throws java.lang.IllegalArgumentException if an invalid
+   *     value is provided.
+   */
+  public static HistogramType getHistogramType(final byte value) {
+    for (final HistogramType histogramType : HistogramType.values()) {
+      if (histogramType.getValue() == value) {
+        return histogramType;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Illegal value provided for HistogramType.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Holder.java b/src/rocksdb/java/src/main/java/org/rocksdb/Holder.java
new file mode 100644
index 000000000..716a0bda0
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Holder.java
@@ -0,0 +1,46 @@
+// Copyright (c) 2016, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Simple instance reference wrapper.
+ */
+public class Holder<T> {
+  private /* @Nullable */ T value;
+
+  /**
+   * Constructs a new Holder with null instance.
+   */
+  public Holder() {
+  }
+
+  /**
+   * Constructs a new Holder.
+   *
+   * @param value the instance or null
+   */
+  public Holder(/* @Nullable */ final T value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the instance reference.
+   *
+   * @return value the instance reference or null
+   */
+  public /* @Nullable */ T getValue() {
+    return value;
+  }
+
+  /**
+   * Set the instance reference.
+   *
+   * @param value the instance reference or null
+   */
+  public void setValue(/* @Nullable */ final T value) {
+    this.value = value;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java b/src/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java
new file mode 100644
index 000000000..a68346c38
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java
@@ -0,0 +1,60 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * This enum allows trading off increased index size for improved iterator
+ * seek performance in some situations, particularly when block cache is
+ * disabled ({@link ReadOptions#fillCache()} == false and direct IO is
+ * enabled ({@link DBOptions#useDirectReads()} == true).
+ * The default mode is the best tradeoff for most use cases.
+ * This option only affects newly written tables.
+ *
+ * The index contains a key separating each pair of consecutive blocks.
+ * Let A be the highest key in one block, B the lowest key in the next block,
+ * and I the index entry separating these two blocks:
+ * [ ... A] I [B ...]
+ * I is allowed to be anywhere in [A, B).
+ * If an iterator is seeked to a key in (A, I], we'll unnecessarily read the
+ * first block, then immediately fall through to the second block.
+ * However, if I=A, this can't happen, and we'll read only the second block.
+ * In kNoShortening mode, we use I=A. In other modes, we use the shortest
+ * key in [A, B), which usually significantly reduces index size.
+ *
+ * There's a similar story for the last index entry, which is an upper bound
+ * of the highest key in the file. If it's shortened and therefore
+ * overestimated, iterator is likely to unnecessarily read the last data block
+ * from each file on each seek.
+ */
+public enum IndexShorteningMode {
+  /**
+   * Use full keys.
+   */
+  kNoShortening((byte) 0),
+  /**
+   * Shorten index keys between blocks, but use full key for the last index
+   * key, which is the upper bound of the whole file.
+   */
+  kShortenSeparators((byte) 1),
+  /**
+   * Shorten both keys between blocks and key after last block.
+   */
+  kShortenSeparatorsAndSuccessor((byte) 2);
+
+  private final byte value;
+
+  IndexShorteningMode(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Returns the byte value of the enumerations value.
+   *
+   * @return byte representation
+   */
+  byte getValue() {
+    return value;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/IndexType.java b/src/rocksdb/java/src/main/java/org/rocksdb/IndexType.java
new file mode 100644
index 000000000..162edad1b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/IndexType.java
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * IndexType used in conjunction with BlockBasedTable.
+ */
+public enum IndexType {
+  /**
+   * A space efficient index block that is optimized for
+   * binary-search-based index.
+   */
+  kBinarySearch((byte) 0),
+  /**
+   * The hash index, if enabled, will do the hash lookup when
+   * {@code Options.prefix_extractor} is provided.
+   */
+  kHashSearch((byte) 1),
+  /**
+   * A two-level index implementation. Both levels are binary search indexes.
+   */
+  kTwoLevelIndexSearch((byte) 2),
+  /**
+   * Like {@link #kBinarySearch}, but index also contains first key of each block.
+   * This allows iterators to defer reading the block until it's actually
+   * needed. May significantly reduce read amplification of short range scans.
+   * Without it, iterator seek usually reads one block from each level-0 file
+   * and from each level, which may be expensive.
+   * Works best in combination with:
+   *   - IndexShorteningMode::kNoShortening,
+   *   - custom FlushBlockPolicy to cut blocks at some meaningful boundaries,
+   *     e.g. when prefix changes.
+   * Makes the index significantly bigger (2x or more), especially when keys
+   * are long.
+   */
+  kBinarySearchWithFirstKey((byte) 3);
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  IndexType(byte value) {
+    value_ = value;
+  }
+
+  private final byte value_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java b/src/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java
new file mode 100644
index 000000000..b7c0f0700
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java
@@ -0,0 +1,49 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+/**
+ * RocksDB log levels.
+ */
+public enum InfoLogLevel {
+  DEBUG_LEVEL((byte)0),
+  INFO_LEVEL((byte)1),
+  WARN_LEVEL((byte)2),
+  ERROR_LEVEL((byte)3),
+  FATAL_LEVEL((byte)4),
+  HEADER_LEVEL((byte)5),
+  NUM_INFO_LOG_LEVELS((byte)6);
+
+  private final byte value_;
+
+  private InfoLogLevel(final byte value) {
+    value_ = value;
+  }
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  /**
+   * Get InfoLogLevel by byte value.
+   *
+   * @param value byte representation of InfoLogLevel.
+   *
+   * @return {@link org.rocksdb.InfoLogLevel} instance.
+   * @throws java.lang.IllegalArgumentException if an invalid
+   *     value is provided.
+   */
+  public static InfoLogLevel getInfoLogLevel(final byte value) {
+    for (final InfoLogLevel infoLogLevel : InfoLogLevel.values()) {
+      if (infoLogLevel.getValue() == value) {
+        return infoLogLevel;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Illegal value provided for InfoLogLevel.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java
new file mode 100644
index 000000000..a6a308daa
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java
@@ -0,0 +1,227 @@
+package org.rocksdb;
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+import java.util.List;
+
+/**
+ * IngestExternalFileOptions is used by
+ * {@link RocksDB#ingestExternalFile(ColumnFamilyHandle, List, IngestExternalFileOptions)}.
+ */
+public class IngestExternalFileOptions extends RocksObject {
+
+  public IngestExternalFileOptions() {
+    super(newIngestExternalFileOptions());
+  }
+
+  /**
+   * @param moveFiles {@link #setMoveFiles(boolean)}
+   * @param snapshotConsistency {@link #setSnapshotConsistency(boolean)}
+   * @param allowGlobalSeqNo {@link #setAllowGlobalSeqNo(boolean)}
+   * @param allowBlockingFlush {@link #setAllowBlockingFlush(boolean)}
+   */
+  public IngestExternalFileOptions(final boolean moveFiles,
+      final boolean snapshotConsistency, final boolean allowGlobalSeqNo,
+      final boolean allowBlockingFlush) {
+    super(newIngestExternalFileOptions(moveFiles, snapshotConsistency,
+        allowGlobalSeqNo, allowBlockingFlush));
+  }
+
+  /**
+   * Can be set to true to move the files instead of copying them.
+   *
+   * @return true if files will be moved
+   */
+  public boolean moveFiles() {
+    return moveFiles(nativeHandle_);
+  }
+
+  /**
+   * Can be set to true to move the files instead of copying them.
+   *
+   * @param moveFiles true if files should be moved instead of copied
+   *
+   * @return the reference to the current IngestExternalFileOptions.
+   */
+  public IngestExternalFileOptions setMoveFiles(final boolean moveFiles) {
+    setMoveFiles(nativeHandle_, moveFiles);
+    return this;
+  }
+
+  /**
+   * If set to false, an ingested file keys could appear in existing snapshots
+   * that where created before the file was ingested.
+   *
+   * @return true if snapshot consistency is assured
+   */
+  public boolean snapshotConsistency() {
+    return snapshotConsistency(nativeHandle_);
+  }
+
+  /**
+   * If set to false, an ingested file keys could appear in existing snapshots
+   * that where created before the file was ingested.
+   *
+   * @param snapshotConsistency true if snapshot consistency is required
+   *
+   * @return the reference to the current IngestExternalFileOptions.
+   */
+  public IngestExternalFileOptions setSnapshotConsistency(
+      final boolean snapshotConsistency) {
+    setSnapshotConsistency(nativeHandle_, snapshotConsistency);
+    return this;
+  }
+
+  /**
+   * If set to false, {@link RocksDB#ingestExternalFile(ColumnFamilyHandle, List, IngestExternalFileOptions)}
+   * will fail if the file key range overlaps with existing keys or tombstones in the DB.
+   *
+   * @return true if global seq numbers are assured
+   */
+  public boolean allowGlobalSeqNo() {
+    return allowGlobalSeqNo(nativeHandle_);
+  }
+
+  /**
+   * If set to false, {@link RocksDB#ingestExternalFile(ColumnFamilyHandle, List, IngestExternalFileOptions)}
+   * will fail if the file key range overlaps with existing keys or tombstones in the DB.
+   *
+   * @param allowGlobalSeqNo true if global seq numbers are required
+   *
+   * @return the reference to the current IngestExternalFileOptions.
+   */
+  public IngestExternalFileOptions setAllowGlobalSeqNo(
+      final boolean allowGlobalSeqNo) {
+    setAllowGlobalSeqNo(nativeHandle_, allowGlobalSeqNo);
+    return this;
+  }
+
+  /**
+   * If set to false and the file key range overlaps with the memtable key range
+   * (memtable flush required), IngestExternalFile will fail.
+   *
+   * @return true if blocking flushes may occur
+   */
+  public boolean allowBlockingFlush() {
+    return allowBlockingFlush(nativeHandle_);
+  }
+
+  /**
+   * If set to false and the file key range overlaps with the memtable key range
+   * (memtable flush required), IngestExternalFile will fail.
+   *
+   * @param allowBlockingFlush true if blocking flushes are allowed
+   *
+   * @return the reference to the current IngestExternalFileOptions.
+   */
+  public IngestExternalFileOptions setAllowBlockingFlush(
+      final boolean allowBlockingFlush) {
+    setAllowBlockingFlush(nativeHandle_, allowBlockingFlush);
+    return this;
+  }
+
+  /**
+   * Returns true if duplicate keys in the file being ingested are
+   * to be skipped rather than overwriting existing data under that key.
+   *
+   * @return true if duplicate keys in the file being ingested are to be
+   *     skipped, false otherwise.
+   */
+  public boolean ingestBehind() {
+    return ingestBehind(nativeHandle_);
+  }
+
+  /**
+   * Set to true if you would like duplicate keys in the file being ingested
+   * to be skipped rather than overwriting existing data under that key.
+   *
+   * Usecase: back-fill of some historical data in the database without
+   * over-writing existing newer version of data.
+   *
+   * This option could only be used if the DB has been running
+   * with DBOptions#allowIngestBehind() == true since the dawn of time.
+   *
+   * All files will be ingested at the bottommost level with seqno=0.
+   *
+   * Default: false
+   *
+   * @param ingestBehind true if you would like duplicate keys in the file being
+   *     ingested to be skipped.
+   *
+   * @return the reference to the current IngestExternalFileOptions.
+   */
+  public IngestExternalFileOptions setIngestBehind(final boolean ingestBehind) {
+    setIngestBehind(nativeHandle_, ingestBehind);
+    return this;
+  }
+
+  /**
+   * Returns true write if the global_seqno is written to a given offset
+   * in the external SST file for backward compatibility.
+   *
+   * See {@link #setWriteGlobalSeqno(boolean)}.
+   *
+   * @return true if the global_seqno is written to a given offset,
+   *     false otherwise.
+   */
+  public boolean writeGlobalSeqno() {
+    return writeGlobalSeqno(nativeHandle_);
+  }
+
+  /**
+   * Set to true if you would like to write the global_seqno to a given offset
+   * in the external SST file for backward compatibility.
+   *
+   * Older versions of RocksDB write the global_seqno to a given offset within
+   * the ingested SST files, and new versions of RocksDB do not.
+   *
+   * If you ingest an external SST using new version of RocksDB and would like
+   * to be able to downgrade to an older version of RocksDB, you should set
+   * {@link #writeGlobalSeqno()} to true.
+   *
+   * If your service is just starting to use the new RocksDB, we recommend that
+   * you set this option to false, which brings two benefits:
+   *    1. No extra random write for global_seqno during ingestion.
+   *    2. Without writing external SST file, it's possible to do checksum.
+   *
+   * We have a plan to set this option to false by default in the future.
+   *
+   * Default: true
+   *
+   * @param writeGlobalSeqno true to write the gloal_seqno to a given offset,
+   *     false otherwise
+   *
+   * @return the reference to the current IngestExternalFileOptions.
+   */
+  public IngestExternalFileOptions setWriteGlobalSeqno(
+      final boolean writeGlobalSeqno) {
+    setWriteGlobalSeqno(nativeHandle_, writeGlobalSeqno);
+    return this;
+  }
+
+  private native static long newIngestExternalFileOptions();
+  private native static long newIngestExternalFileOptions(
+      final boolean moveFiles, final boolean snapshotConsistency,
+      final boolean allowGlobalSeqNo, final boolean allowBlockingFlush);
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native boolean moveFiles(final long handle);
+  private native void setMoveFiles(final long handle, final boolean move_files);
+  private native boolean snapshotConsistency(final long handle);
+  private native void setSnapshotConsistency(final long handle,
+      final boolean snapshotConsistency);
+  private native boolean allowGlobalSeqNo(final long handle);
+  private native void setAllowGlobalSeqNo(final long handle,
+      final boolean allowGloablSeqNo);
+  private native boolean allowBlockingFlush(final long handle);
+  private native void setAllowBlockingFlush(final long handle,
+      final boolean allowBlockingFlush);
+  private native boolean ingestBehind(final long handle);
+  private native void setIngestBehind(final long handle,
+      final boolean ingestBehind);
+  private native boolean writeGlobalSeqno(final long handle);
+  private native void setWriteGlobalSeqno(final long handle,
+      final boolean writeGlobalSeqNo);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java b/src/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java
new file mode 100644
index 000000000..36185d8c9
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+public class KeyMayExist {
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    final KeyMayExist that = (KeyMayExist) o;
+    return (valueLength == that.valueLength && exists == that.exists);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(exists, valueLength);
+  }
+
+  public enum KeyMayExistEnum { kNotExist, kExistsWithoutValue, kExistsWithValue }
+  ;
+
+  public KeyMayExist(final KeyMayExistEnum exists, final int valueLength) {
+    this.exists = exists;
+    this.valueLength = valueLength;
+  }
+
+  public final KeyMayExistEnum exists;
+  public final int valueLength;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/LRUCache.java b/src/rocksdb/java/src/main/java/org/rocksdb/LRUCache.java
new file mode 100644
index 000000000..db90b17c5
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/LRUCache.java
@@ -0,0 +1,106 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Least Recently Used Cache
+ */
+public class LRUCache extends Cache {
+
+  /**
+   * Create a new cache with a fixed size capacity
+   *
+   * @param capacity The fixed size capacity of the cache
+   */
+  public LRUCache(final long capacity) {
+    this(capacity, -1, false, 0.0, 0.0);
+  }
+
+  /**
+   * Create a new cache with a fixed size capacity. The cache is sharded
+   * to 2^numShardBits shards, by hash of the key. The total capacity
+   * is divided and evenly assigned to each shard.
+   * numShardBits = -1 means it is automatically determined: every shard
+   * will be at least 512KB and number of shard bits will not exceed 6.
+   *
+   * @param capacity The fixed size capacity of the cache
+   * @param numShardBits The cache is sharded to 2^numShardBits shards,
+   *     by hash of the key
+   */
+  public LRUCache(final long capacity, final int numShardBits) {
+    super(newLRUCache(capacity, numShardBits, false, 0.0, 0.0));
+  }
+
+  /**
+   * Create a new cache with a fixed size capacity. The cache is sharded
+   * to 2^numShardBits shards, by hash of the key. The total capacity
+   * is divided and evenly assigned to each shard. If strictCapacityLimit
+   * is set, insert to the cache will fail when cache is full.
+   * numShardBits = -1 means it is automatically determined: every shard
+   * will be at least 512KB and number of shard bits will not exceed 6.
+   *
+   * @param capacity The fixed size capacity of the cache
+   * @param numShardBits The cache is sharded to 2^numShardBits shards,
+   *     by hash of the key
+   * @param strictCapacityLimit insert to the cache will fail when cache is full
+   */
+  public LRUCache(final long capacity, final int numShardBits,
+                  final boolean strictCapacityLimit) {
+    super(newLRUCache(capacity, numShardBits, strictCapacityLimit, 0.0, 0.0));
+  }
+
+  /**
+   * Create a new cache with a fixed size capacity. The cache is sharded
+   * to 2^numShardBits shards, by hash of the key. The total capacity
+   * is divided and evenly assigned to each shard. If strictCapacityLimit
+   * is set, insert to the cache will fail when cache is full. User can also
+   * set percentage of the cache reserves for high priority entries via
+   * highPriPoolRatio.
+   * numShardBits = -1 means it is automatically determined: every shard
+   * will be at least 512KB and number of shard bits will not exceed 6.
+   *
+   * @param capacity The fixed size capacity of the cache
+   * @param numShardBits The cache is sharded to 2^numShardBits shards,
+   *     by hash of the key
+   * @param strictCapacityLimit insert to the cache will fail when cache is full
+   * @param highPriPoolRatio percentage of the cache reserves for high priority
+   *     entries
+   */
+  public LRUCache(final long capacity, final int numShardBits, final boolean strictCapacityLimit,
+      final double highPriPoolRatio) {
+    super(newLRUCache(capacity, numShardBits, strictCapacityLimit, highPriPoolRatio, 0.0));
+  }
+
+  /**
+   * Create a new cache with a fixed size capacity. The cache is sharded
+   * to 2^numShardBits shards, by hash of the key. The total capacity
+   * is divided and evenly assigned to each shard. If strictCapacityLimit
+   * is set, insert to the cache will fail when cache is full. User can also
+   * set percentage of the cache reserves for high priority entries and low
+   * priority entries via highPriPoolRatio and lowPriPoolRatio.
+   * numShardBits = -1 means it is automatically determined: every shard
+   * will be at least 512KB and number of shard bits will not exceed 6.
+   *
+   * @param capacity The fixed size capacity of the cache
+   * @param numShardBits The cache is sharded to 2^numShardBits shards,
+   *     by hash of the key
+   * @param strictCapacityLimit insert to the cache will fail when cache is full
+   * @param highPriPoolRatio percentage of the cache reserves for high priority
+   *     entries
+   * @param lowPriPoolRatio percentage of the cache reserves for low priority
+   *     entries
+   */
+  public LRUCache(final long capacity, final int numShardBits, final boolean strictCapacityLimit,
+      final double highPriPoolRatio, final double lowPriPoolRatio) {
+    super(newLRUCache(
+        capacity, numShardBits, strictCapacityLimit, highPriPoolRatio, lowPriPoolRatio));
+  }
+
+  private native static long newLRUCache(final long capacity, final int numShardBits,
+      final boolean strictCapacityLimit, final double highPriPoolRatio,
+      final double lowPriPoolRatio);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/LevelMetaData.java b/src/rocksdb/java/src/main/java/org/rocksdb/LevelMetaData.java
new file mode 100644
index 000000000..c5685098b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/LevelMetaData.java
@@ -0,0 +1,56 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * The metadata that describes a level.
+ */
+public class LevelMetaData {
+  private final int level;
+  private final long size;
+  private final SstFileMetaData[] files;
+
+  /**
+   * Called from JNI C++
+   */
+  private LevelMetaData(final int level, final long size,
+      final SstFileMetaData[] files) {
+    this.level = level;
+    this.size = size;
+    this.files = files;
+  }
+
+  /**
+   * The level which this meta data describes.
+   *
+   * @return the level
+   */
+  public int level() {
+    return level;
+  }
+
+  /**
+   * The size of this level in bytes, which is equal to the sum of
+   * the file size of its {@link #files()}.
+   *
+   * @return the size
+   */
+  public long size() {
+    return size;
+  }
+
+  /**
+   * The metadata of all sst files in this level.
+   *
+   * @return the metadata of the files
+   */
+  public List<SstFileMetaData> files() {
+    return Arrays.asList(files);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/LiveFileMetaData.java b/src/rocksdb/java/src/main/java/org/rocksdb/LiveFileMetaData.java
new file mode 100644
index 000000000..35d883e18
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/LiveFileMetaData.java
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The full set of metadata associated with each SST file.
+ */
+public class LiveFileMetaData extends SstFileMetaData {
+  private final byte[] columnFamilyName;
+  private final int level;
+
+  /**
+   * Called from JNI C++
+   */
+  private LiveFileMetaData(
+      final byte[] columnFamilyName,
+      final int level,
+      final String fileName,
+      final String path,
+      final long size,
+      final long smallestSeqno,
+      final long largestSeqno,
+      final byte[] smallestKey,
+      final byte[] largestKey,
+      final long numReadsSampled,
+      final boolean beingCompacted,
+      final long numEntries,
+      final long numDeletions) {
+    super(fileName, path, size, smallestSeqno, largestSeqno, smallestKey,
+        largestKey, numReadsSampled, beingCompacted, numEntries, numDeletions);
+    this.columnFamilyName = columnFamilyName;
+    this.level = level;
+  }
+
+  /**
+   * Get the name of the column family.
+   *
+   * @return the name of the column family
+   */
+  public byte[] columnFamilyName() {
+    return columnFamilyName;
+  }
+
+  /**
+   * Get the level at which this file resides.
+   *
+   * @return the level at which the file resides.
+   */
+  public int level() {
+    return level;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/LogFile.java b/src/rocksdb/java/src/main/java/org/rocksdb/LogFile.java
new file mode 100644
index 000000000..ef24a6427
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/LogFile.java
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class LogFile {
+  private final String pathName;
+  private final long logNumber;
+  private final WalFileType type;
+  private final long startSequence;
+  private final long sizeFileBytes;
+
+  /**
+   * Called from JNI C++
+   */
+  private LogFile(final String pathName, final long logNumber,
+      final byte walFileTypeValue, final long startSequence,
+      final long sizeFileBytes) {
+    this.pathName = pathName;
+    this.logNumber = logNumber;
+    this.type = WalFileType.fromValue(walFileTypeValue);
+    this.startSequence = startSequence;
+    this.sizeFileBytes = sizeFileBytes;
+  }
+
+  /**
+   * Returns log file's pathname relative to the main db dir
+   * Eg. For a live-log-file = /000003.log
+   * For an archived-log-file = /archive/000003.log
+   *
+   * @return log file's pathname
+   */
+  public String pathName() {
+    return pathName;
+  }
+
+  /**
+   * Primary identifier for log file.
+   * This is directly proportional to creation time of the log file
+   *
+   * @return the log number
+   */
+  public long logNumber() {
+    return logNumber;
+  }
+
+  /**
+   * Log file can be either alive or archived.
+   *
+   * @return the type of the log file.
+   */
+  public WalFileType type() {
+    return type;
+  }
+
+  /**
+   * Starting sequence number of writebatch written in this log file.
+   *
+   * @return the stating sequence number
+   */
+  public long startSequence() {
+    return startSequence;
+  }
+
+  /**
+   * Size of log file on disk in Bytes.
+   *
+   * @return size of log file
+   */
+  public long sizeFileBytes() {
+    return sizeFileBytes;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Logger.java b/src/rocksdb/java/src/main/java/org/rocksdb/Logger.java
new file mode 100644
index 000000000..00a5d5674
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Logger.java
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * <p>This class provides a custom logger functionality
+ * in Java which wraps {@code RocksDB} logging facilities.
+ * </p>
+ *
+ * <p>Using this class RocksDB can log with common
+ * Java logging APIs like Log4j or Slf4j without keeping
+ * database logs in the filesystem.</p>
+ *
+ * <strong>Performance</strong>
+ * <p>There are certain performance penalties using a Java
+ * {@code Logger} implementation within production code.
+ * </p>
+ *
+ * <p>
+ * A log level can be set using {@link org.rocksdb.Options} or
+ * {@link Logger#setInfoLogLevel(InfoLogLevel)}. The set log level
+ * influences the underlying native code. Each log message is
+ * checked against the set log level and if the log level is more
+ * verbose as the set log level, native allocations will be made
+ * and data structures are allocated.
+ * </p>
+ *
+ * <p>Every log message which will be emitted by native code will
+ * trigger expensive native to Java transitions. So the preferred
+ * setting for production use is either
+ * {@link org.rocksdb.InfoLogLevel#ERROR_LEVEL} or
+ * {@link org.rocksdb.InfoLogLevel#FATAL_LEVEL}.
+ * </p>
+ */
+public abstract class Logger extends RocksCallbackObject {
+
+  private final static long WITH_OPTIONS = 0;
+  private final static long WITH_DBOPTIONS = 1;
+
+  /**
+   * <p>AbstractLogger constructor.</p>
+   *
+   * <p><strong>Important:</strong> the log level set within
+   * the {@link org.rocksdb.Options} instance will be used as
+   * maximum log level of RocksDB.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   */
+  public Logger(final Options options) {
+    super(options.nativeHandle_, WITH_OPTIONS);
+
+  }
+
+  /**
+   * <p>AbstractLogger constructor.</p>
+   *
+   * <p><strong>Important:</strong> the log level set within
+   * the {@link org.rocksdb.DBOptions} instance will be used
+   * as maximum log level of RocksDB.</p>
+   *
+   * @param dboptions {@link org.rocksdb.DBOptions} instance.
+   */
+  public Logger(final DBOptions dboptions) {
+    super(dboptions.nativeHandle_, WITH_DBOPTIONS);
+  }
+
+  @Override
+  protected long initializeNative(long... nativeParameterHandles) {
+    if(nativeParameterHandles[1] == WITH_OPTIONS) {
+      return createNewLoggerOptions(nativeParameterHandles[0]);
+    } else if(nativeParameterHandles[1] == WITH_DBOPTIONS) {
+      return createNewLoggerDbOptions(nativeParameterHandles[0]);
+    } else {
+      throw new IllegalArgumentException();
+    }
+  }
+
+  /**
+   * Set {@link org.rocksdb.InfoLogLevel} to AbstractLogger.
+   *
+   * @param infoLogLevel {@link org.rocksdb.InfoLogLevel} instance.
+   */
+  public void setInfoLogLevel(final InfoLogLevel infoLogLevel) {
+      setInfoLogLevel(nativeHandle_, infoLogLevel.getValue());
+  }
+
+  /**
+   * Return the loggers log level.
+   *
+   * @return {@link org.rocksdb.InfoLogLevel} instance.
+   */
+  public InfoLogLevel infoLogLevel() {
+    return InfoLogLevel.getInfoLogLevel(
+        infoLogLevel(nativeHandle_));
+  }
+
+  protected abstract void log(InfoLogLevel infoLogLevel,
+      String logMsg);
+
+  protected native long createNewLoggerOptions(
+      long options);
+  protected native long createNewLoggerDbOptions(
+      long dbOptions);
+  protected native void setInfoLogLevel(long handle,
+      byte infoLogLevel);
+  protected native byte infoLogLevel(long handle);
+
+  /**
+   * We override {@link RocksCallbackObject#disposeInternal()}
+   * as disposing of a rocksdb::LoggerJniCallback requires
+   * a slightly different approach as it is a std::shared_ptr
+   */
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java
new file mode 100644
index 000000000..83cee974a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java
@@ -0,0 +1,29 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * MemTableConfig is used to config the internal mem-table of a RocksDB.
+ * It is required for each memtable to have one such sub-class to allow
+ * Java developers to use it.
+ *
+ * To make a RocksDB to use a specific MemTable format, its associated
+ * MemTableConfig should be properly set and passed into Options
+ * via Options.setMemTableFactory() and open the db using that Options.
+ *
+ * @see Options
+ */
+public abstract class MemTableConfig {
+  /**
+   * This function should only be called by Options.setMemTableConfig(),
+   * which will create a c++ shared-pointer to the c++ MemTableRepFactory
+   * that associated with the Java MemTableConfig.
+   *
+   * @see Options#setMemTableConfig(MemTableConfig)
+   *
+   * @return native handle address to native memory table instance.
+   */
+  abstract protected long newMemTableFactoryHandle();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java
new file mode 100644
index 000000000..f4fb577c3
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java
@@ -0,0 +1,103 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+public class MemTableInfo {
+  private final String columnFamilyName;
+  private final long firstSeqno;
+  private final long earliestSeqno;
+  private final long numEntries;
+  private final long numDeletes;
+
+  /**
+   * Access is package private as this will only be constructed from
+   * C++ via JNI and for testing.
+   */
+  MemTableInfo(final String columnFamilyName, final long firstSeqno, final long earliestSeqno,
+      final long numEntries, final long numDeletes) {
+    this.columnFamilyName = columnFamilyName;
+    this.firstSeqno = firstSeqno;
+    this.earliestSeqno = earliestSeqno;
+    this.numEntries = numEntries;
+    this.numDeletes = numDeletes;
+  }
+
+  /**
+   * Get the name of the column family to which memtable belongs.
+   *
+   * @return the name of the column family.
+   */
+  public String getColumnFamilyName() {
+    return columnFamilyName;
+  }
+
+  /**
+   * Get the Sequence number of the first element that was inserted into the
+   * memtable.
+   *
+   * @return the sequence number of the first inserted element.
+   */
+  public long getFirstSeqno() {
+    return firstSeqno;
+  }
+
+  /**
+   * Get the Sequence number that is guaranteed to be smaller than or equal
+   * to the sequence number of any key that could be inserted into this
+   * memtable. It can then be assumed that any write with a larger(or equal)
+   * sequence number will be present in this memtable or a later memtable.
+   *
+   * @return the earliest sequence number.
+   */
+  public long getEarliestSeqno() {
+    return earliestSeqno;
+  }
+
+  /**
+   * Get the total number of entries in memtable.
+   *
+   * @return the total number of entries.
+   */
+  public long getNumEntries() {
+    return numEntries;
+  }
+
+  /**
+   * Get the total number of deletes in memtable.
+   *
+   * @return the total number of deletes.
+   */
+  public long getNumDeletes() {
+    return numDeletes;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    MemTableInfo that = (MemTableInfo) o;
+    return firstSeqno == that.firstSeqno && earliestSeqno == that.earliestSeqno
+        && numEntries == that.numEntries && numDeletes == that.numDeletes
+        && Objects.equals(columnFamilyName, that.columnFamilyName);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(columnFamilyName, firstSeqno, earliestSeqno, numEntries, numDeletes);
+  }
+
+  @Override
+  public String toString() {
+    return "MemTableInfo{"
+        + "columnFamilyName='" + columnFamilyName + '\'' + ", firstSeqno=" + firstSeqno
+        + ", earliestSeqno=" + earliestSeqno + ", numEntries=" + numEntries
+        + ", numDeletes=" + numDeletes + '}';
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MemoryUsageType.java b/src/rocksdb/java/src/main/java/org/rocksdb/MemoryUsageType.java
new file mode 100644
index 000000000..6010ce7af
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MemoryUsageType.java
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * MemoryUsageType
+ *
+ * <p>The value will be used as a key to indicate the type of memory usage
+ * described</p>
+ */
+public enum MemoryUsageType {
+  /**
+   * Memory usage of all the mem-tables.
+   */
+  kMemTableTotal((byte) 0),
+  /**
+   * Memory usage of those un-flushed mem-tables.
+   */
+  kMemTableUnFlushed((byte) 1),
+  /**
+   * Memory usage of all the table readers.
+   */
+  kTableReadersTotal((byte) 2),
+  /**
+   * Memory usage by Cache.
+   */
+  kCacheTotal((byte) 3),
+  /**
+   * Max usage types - copied to keep 1:1 with native.
+   */
+  kNumUsageTypes((byte) 4);
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  /**
+   * <p>Get the MemoryUsageType enumeration value by
+   * passing the byte identifier to this method.</p>
+   *
+   * @param byteIdentifier of MemoryUsageType.
+   *
+   * @return MemoryUsageType instance.
+   *
+   * @throws IllegalArgumentException if the usage type for the byteIdentifier
+   *     cannot be found
+   */
+  public static MemoryUsageType getMemoryUsageType(final byte byteIdentifier) {
+    for (final MemoryUsageType memoryUsageType : MemoryUsageType.values()) {
+      if (memoryUsageType.getValue() == byteIdentifier) {
+        return memoryUsageType;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for MemoryUsageType.");
+  }
+
+  MemoryUsageType(byte value) {
+    value_ = value;
+  }
+
+  private final byte value_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MemoryUtil.java b/src/rocksdb/java/src/main/java/org/rocksdb/MemoryUtil.java
new file mode 100644
index 000000000..52b2175e6
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MemoryUtil.java
@@ -0,0 +1,60 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.*;
+
+/**
+ * JNI passthrough for MemoryUtil.
+ */
+public class MemoryUtil {
+
+  /**
+   * <p>Returns the approximate memory usage of different types in the input
+   * list of DBs and Cache set.  For instance, in the output map the key
+   * kMemTableTotal will be associated with the memory
+   * usage of all the mem-tables from all the input rocksdb instances.</p>
+   *
+   * <p>Note that for memory usage inside Cache class, we will
+   * only report the usage of the input "cache_set" without
+   * including those Cache usage inside the input list "dbs"
+   * of DBs.</p>
+   *
+   * @param dbs List of dbs to collect memory usage for.
+   * @param caches Set of caches to collect memory usage for.
+   * @return Map from {@link MemoryUsageType} to memory usage as a {@link Long}.
+   */
+  public static Map<MemoryUsageType, Long> getApproximateMemoryUsageByType(final List<RocksDB> dbs, final Set<Cache> caches) {
+    int dbCount = (dbs == null) ? 0 : dbs.size();
+    int cacheCount = (caches == null) ? 0 : caches.size();
+    long[] dbHandles = new long[dbCount];
+    long[] cacheHandles = new long[cacheCount];
+    if (dbCount > 0) {
+      ListIterator<RocksDB> dbIter = dbs.listIterator();
+      while (dbIter.hasNext()) {
+        dbHandles[dbIter.nextIndex()] = dbIter.next().nativeHandle_;
+      }
+    }
+    if (cacheCount > 0) {
+      // NOTE: This index handling is super ugly but I couldn't get a clean way to track both the
+      // index and the iterator simultaneously within a Set.
+      int i = 0;
+      for (Cache cache : caches) {
+        cacheHandles[i] = cache.nativeHandle_;
+        i++;
+      }
+    }
+    Map<Byte, Long> byteOutput = getApproximateMemoryUsageByType(dbHandles, cacheHandles);
+    Map<MemoryUsageType, Long> output = new HashMap<>();
+    for(Map.Entry<Byte, Long> longEntry : byteOutput.entrySet()) {
+      output.put(MemoryUsageType.getMemoryUsageType(longEntry.getKey()), longEntry.getValue());
+    }
+    return output;
+  }
+
+  private native static Map<Byte, Long> getApproximateMemoryUsageByType(final long[] dbHandles,
+      final long[] cacheHandles);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MergeOperator.java b/src/rocksdb/java/src/main/java/org/rocksdb/MergeOperator.java
new file mode 100644
index 000000000..c299f6221
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MergeOperator.java
@@ -0,0 +1,18 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2014, Vlad Balan (vlad.gm@gmail.com).  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * MergeOperator holds an operator to be applied when compacting
+ * two merge operands held under the same key in order to obtain a single
+ * value.
+ */
+public abstract class MergeOperator extends RocksObject {
+    protected MergeOperator(final long nativeHandle) {
+        super(nativeHandle);
+    }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java
new file mode 100644
index 000000000..af28fa8ce
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java
@@ -0,0 +1,623 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.*;
+
+public class MutableColumnFamilyOptions
+    extends AbstractMutableOptions {
+
+  /**
+   * User must use builder pattern, or parser.
+   *
+   * @param keys the keys
+   * @param values the values
+   *
+   * See {@link #builder()} and {@link #parse(String)}.
+   */
+  private MutableColumnFamilyOptions(final String[] keys,
+      final String[] values) {
+    super(keys, values);
+  }
+
+  /**
+   * Creates a builder which allows you
+   * to set MutableColumnFamilyOptions in a fluent
+   * manner
+   *
+   * @return A builder for MutableColumnFamilyOptions
+   */
+  public static MutableColumnFamilyOptionsBuilder builder() {
+    return new MutableColumnFamilyOptionsBuilder();
+  }
+
+  /**
+   * Parses a String representation of MutableColumnFamilyOptions
+   *
+   * The format is: key1=value1;key2=value2;key3=value3 etc
+   *
+   * For int[] values, each int should be separated by a colon, e.g.
+   *
+   * key1=value1;intArrayKey1=1:2:3
+   *
+   * @param str The string representation of the mutable column family options
+   * @param ignoreUnknown what to do if the key is not one of the keys we expect
+   *
+   * @return A builder for the mutable column family options
+   */
+  public static MutableColumnFamilyOptionsBuilder parse(
+      final String str, final boolean ignoreUnknown) {
+    Objects.requireNonNull(str);
+
+    final List<OptionString.Entry> parsedOptions = OptionString.Parser.parse(str);
+    return new MutableColumnFamilyOptionsBuilder().fromParsed(parsedOptions, ignoreUnknown);
+  }
+
+  public static MutableColumnFamilyOptionsBuilder parse(final String str) {
+    return parse(str, false);
+  }
+
+  private interface MutableColumnFamilyOptionKey extends MutableOptionKey {}
+
+  public enum MemtableOption implements MutableColumnFamilyOptionKey {
+    write_buffer_size(ValueType.LONG),
+    arena_block_size(ValueType.LONG),
+    memtable_prefix_bloom_size_ratio(ValueType.DOUBLE),
+    memtable_whole_key_filtering(ValueType.BOOLEAN),
+    @Deprecated memtable_prefix_bloom_bits(ValueType.INT),
+    @Deprecated memtable_prefix_bloom_probes(ValueType.INT),
+    memtable_huge_page_size(ValueType.LONG),
+    max_successive_merges(ValueType.LONG),
+    @Deprecated filter_deletes(ValueType.BOOLEAN),
+    max_write_buffer_number(ValueType.INT),
+    inplace_update_num_locks(ValueType.LONG),
+    experimental_mempurge_threshold(ValueType.DOUBLE);
+
+    private final ValueType valueType;
+    MemtableOption(final ValueType valueType) {
+      this.valueType = valueType;
+    }
+
+    @Override
+    public ValueType getValueType() {
+      return valueType;
+    }
+  }
+
+  public enum CompactionOption implements MutableColumnFamilyOptionKey {
+    disable_auto_compactions(ValueType.BOOLEAN),
+    soft_pending_compaction_bytes_limit(ValueType.LONG),
+    hard_pending_compaction_bytes_limit(ValueType.LONG),
+    level0_file_num_compaction_trigger(ValueType.INT),
+    level0_slowdown_writes_trigger(ValueType.INT),
+    level0_stop_writes_trigger(ValueType.INT),
+    max_compaction_bytes(ValueType.LONG),
+    target_file_size_base(ValueType.LONG),
+    target_file_size_multiplier(ValueType.INT),
+    max_bytes_for_level_base(ValueType.LONG),
+    max_bytes_for_level_multiplier(ValueType.INT),
+    max_bytes_for_level_multiplier_additional(ValueType.INT_ARRAY),
+    ttl(ValueType.LONG),
+    periodic_compaction_seconds(ValueType.LONG);
+
+    private final ValueType valueType;
+    CompactionOption(final ValueType valueType) {
+      this.valueType = valueType;
+    }
+
+    @Override
+    public ValueType getValueType() {
+      return valueType;
+    }
+  }
+
+  public enum BlobOption implements MutableColumnFamilyOptionKey {
+    enable_blob_files(ValueType.BOOLEAN),
+    min_blob_size(ValueType.LONG),
+    blob_file_size(ValueType.LONG),
+    blob_compression_type(ValueType.ENUM),
+    enable_blob_garbage_collection(ValueType.BOOLEAN),
+    blob_garbage_collection_age_cutoff(ValueType.DOUBLE),
+    blob_garbage_collection_force_threshold(ValueType.DOUBLE),
+    blob_compaction_readahead_size(ValueType.LONG),
+    blob_file_starting_level(ValueType.INT),
+    prepopulate_blob_cache(ValueType.ENUM);
+
+    private final ValueType valueType;
+    BlobOption(final ValueType valueType) {
+      this.valueType = valueType;
+    }
+
+    @Override
+    public ValueType getValueType() {
+      return valueType;
+    }
+  }
+
+  public enum MiscOption implements MutableColumnFamilyOptionKey {
+    max_sequential_skip_in_iterations(ValueType.LONG),
+    paranoid_file_checks(ValueType.BOOLEAN),
+    report_bg_io_stats(ValueType.BOOLEAN),
+    compression(ValueType.ENUM);
+
+    private final ValueType valueType;
+    MiscOption(final ValueType valueType) {
+      this.valueType = valueType;
+    }
+
+    @Override
+    public ValueType getValueType() {
+      return valueType;
+    }
+  }
+
+  public static class MutableColumnFamilyOptionsBuilder
+      extends AbstractMutableOptionsBuilder<MutableColumnFamilyOptions, MutableColumnFamilyOptionsBuilder, MutableColumnFamilyOptionKey>
+      implements MutableColumnFamilyOptionsInterface<MutableColumnFamilyOptionsBuilder> {
+
+    private final static Map<String, MutableColumnFamilyOptionKey> ALL_KEYS_LOOKUP = new HashMap<>();
+    static {
+      for(final MutableColumnFamilyOptionKey key : MemtableOption.values()) {
+        ALL_KEYS_LOOKUP.put(key.name(), key);
+      }
+
+      for(final MutableColumnFamilyOptionKey key : CompactionOption.values()) {
+        ALL_KEYS_LOOKUP.put(key.name(), key);
+      }
+
+      for(final MutableColumnFamilyOptionKey key : MiscOption.values()) {
+        ALL_KEYS_LOOKUP.put(key.name(), key);
+      }
+
+      for (final MutableColumnFamilyOptionKey key : BlobOption.values()) {
+        ALL_KEYS_LOOKUP.put(key.name(), key);
+      }
+    }
+
+    private MutableColumnFamilyOptionsBuilder() {
+      super();
+    }
+
+    @Override
+    protected MutableColumnFamilyOptionsBuilder self() {
+      return this;
+    }
+
+    @Override
+    protected Map<String, MutableColumnFamilyOptionKey> allKeys() {
+      return ALL_KEYS_LOOKUP;
+    }
+
+    @Override
+    protected MutableColumnFamilyOptions build(final String[] keys,
+        final String[] values) {
+      return new MutableColumnFamilyOptions(keys, values);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setWriteBufferSize(
+        final long writeBufferSize) {
+      return setLong(MemtableOption.write_buffer_size, writeBufferSize);
+    }
+
+    @Override
+    public long writeBufferSize() {
+      return getLong(MemtableOption.write_buffer_size);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setArenaBlockSize(
+        final long arenaBlockSize) {
+      return setLong(MemtableOption.arena_block_size, arenaBlockSize);
+    }
+
+    @Override
+    public long arenaBlockSize() {
+      return getLong(MemtableOption.arena_block_size);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMemtablePrefixBloomSizeRatio(
+        final double memtablePrefixBloomSizeRatio) {
+      return setDouble(MemtableOption.memtable_prefix_bloom_size_ratio,
+          memtablePrefixBloomSizeRatio);
+    }
+
+    @Override
+    public double memtablePrefixBloomSizeRatio() {
+      return getDouble(MemtableOption.memtable_prefix_bloom_size_ratio);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMemtableWholeKeyFiltering(
+        final boolean memtableWholeKeyFiltering) {
+      return setBoolean(MemtableOption.memtable_whole_key_filtering, memtableWholeKeyFiltering);
+    }
+
+    @Override
+    public boolean memtableWholeKeyFiltering() {
+      return getBoolean(MemtableOption.memtable_whole_key_filtering);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMemtableHugePageSize(
+        final long memtableHugePageSize) {
+      return setLong(MemtableOption.memtable_huge_page_size,
+          memtableHugePageSize);
+    }
+
+    @Override
+    public long memtableHugePageSize() {
+      return getLong(MemtableOption.memtable_huge_page_size);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMaxSuccessiveMerges(
+        final long maxSuccessiveMerges) {
+      return setLong(MemtableOption.max_successive_merges, maxSuccessiveMerges);
+    }
+
+    @Override
+    public long maxSuccessiveMerges() {
+      return getLong(MemtableOption.max_successive_merges);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMaxWriteBufferNumber(
+        final int maxWriteBufferNumber) {
+      return setInt(MemtableOption.max_write_buffer_number,
+          maxWriteBufferNumber);
+    }
+
+    @Override
+    public int maxWriteBufferNumber() {
+      return getInt(MemtableOption.max_write_buffer_number);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setInplaceUpdateNumLocks(
+        final long inplaceUpdateNumLocks) {
+      return setLong(MemtableOption.inplace_update_num_locks,
+          inplaceUpdateNumLocks);
+    }
+
+    @Override
+    public long inplaceUpdateNumLocks() {
+      return getLong(MemtableOption.inplace_update_num_locks);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setExperimentalMempurgeThreshold(
+        final double experimentalMempurgeThreshold) {
+      return setDouble(
+          MemtableOption.experimental_mempurge_threshold, experimentalMempurgeThreshold);
+    }
+
+    @Override
+    public double experimentalMempurgeThreshold() {
+      return getDouble(MemtableOption.experimental_mempurge_threshold);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setDisableAutoCompactions(
+        final boolean disableAutoCompactions) {
+      return setBoolean(CompactionOption.disable_auto_compactions,
+          disableAutoCompactions);
+    }
+
+    @Override
+    public boolean disableAutoCompactions() {
+      return getBoolean(CompactionOption.disable_auto_compactions);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setSoftPendingCompactionBytesLimit(
+        final long softPendingCompactionBytesLimit) {
+      return setLong(CompactionOption.soft_pending_compaction_bytes_limit,
+          softPendingCompactionBytesLimit);
+    }
+
+    @Override
+    public long softPendingCompactionBytesLimit() {
+      return getLong(CompactionOption.soft_pending_compaction_bytes_limit);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setHardPendingCompactionBytesLimit(
+        final long hardPendingCompactionBytesLimit) {
+      return setLong(CompactionOption.hard_pending_compaction_bytes_limit,
+          hardPendingCompactionBytesLimit);
+    }
+
+    @Override
+    public long hardPendingCompactionBytesLimit() {
+      return getLong(CompactionOption.hard_pending_compaction_bytes_limit);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setLevel0FileNumCompactionTrigger(
+        final int level0FileNumCompactionTrigger) {
+      return setInt(CompactionOption.level0_file_num_compaction_trigger,
+          level0FileNumCompactionTrigger);
+    }
+
+    @Override
+    public int level0FileNumCompactionTrigger() {
+      return getInt(CompactionOption.level0_file_num_compaction_trigger);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setLevel0SlowdownWritesTrigger(
+        final int level0SlowdownWritesTrigger) {
+      return setInt(CompactionOption.level0_slowdown_writes_trigger,
+          level0SlowdownWritesTrigger);
+    }
+
+    @Override
+    public int level0SlowdownWritesTrigger() {
+      return getInt(CompactionOption.level0_slowdown_writes_trigger);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setLevel0StopWritesTrigger(
+        final int level0StopWritesTrigger) {
+      return setInt(CompactionOption.level0_stop_writes_trigger,
+          level0StopWritesTrigger);
+    }
+
+    @Override
+    public int level0StopWritesTrigger() {
+      return getInt(CompactionOption.level0_stop_writes_trigger);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMaxCompactionBytes(final long maxCompactionBytes) {
+      return setLong(CompactionOption.max_compaction_bytes, maxCompactionBytes);
+    }
+
+    @Override
+    public long maxCompactionBytes() {
+      return getLong(CompactionOption.max_compaction_bytes);
+    }
+
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setTargetFileSizeBase(
+        final long targetFileSizeBase) {
+      return setLong(CompactionOption.target_file_size_base,
+          targetFileSizeBase);
+    }
+
+    @Override
+    public long targetFileSizeBase() {
+      return getLong(CompactionOption.target_file_size_base);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setTargetFileSizeMultiplier(
+        final int targetFileSizeMultiplier) {
+      return setInt(CompactionOption.target_file_size_multiplier,
+          targetFileSizeMultiplier);
+    }
+
+    @Override
+    public int targetFileSizeMultiplier() {
+      return getInt(CompactionOption.target_file_size_multiplier);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMaxBytesForLevelBase(
+        final long maxBytesForLevelBase) {
+      return setLong(CompactionOption.max_bytes_for_level_base,
+          maxBytesForLevelBase);
+    }
+
+    @Override
+    public long maxBytesForLevelBase() {
+      return getLong(CompactionOption.max_bytes_for_level_base);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMaxBytesForLevelMultiplier(
+        final double maxBytesForLevelMultiplier) {
+      return setDouble(CompactionOption.max_bytes_for_level_multiplier, maxBytesForLevelMultiplier);
+    }
+
+    @Override
+    public double maxBytesForLevelMultiplier() {
+      return getDouble(CompactionOption.max_bytes_for_level_multiplier);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMaxBytesForLevelMultiplierAdditional(
+        final int[] maxBytesForLevelMultiplierAdditional) {
+      return setIntArray(
+          CompactionOption.max_bytes_for_level_multiplier_additional,
+          maxBytesForLevelMultiplierAdditional);
+    }
+
+    @Override
+    public int[] maxBytesForLevelMultiplierAdditional() {
+      return getIntArray(
+          CompactionOption.max_bytes_for_level_multiplier_additional);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMaxSequentialSkipInIterations(
+        final long maxSequentialSkipInIterations) {
+      return setLong(MiscOption.max_sequential_skip_in_iterations,
+          maxSequentialSkipInIterations);
+    }
+
+    @Override
+    public long maxSequentialSkipInIterations() {
+      return getLong(MiscOption.max_sequential_skip_in_iterations);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setParanoidFileChecks(
+        final boolean paranoidFileChecks) {
+      return setBoolean(MiscOption.paranoid_file_checks, paranoidFileChecks);
+    }
+
+    @Override
+    public boolean paranoidFileChecks() {
+      return getBoolean(MiscOption.paranoid_file_checks);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setCompressionType(
+        final CompressionType compressionType) {
+      return setEnum(MiscOption.compression, compressionType);
+    }
+
+    @Override
+    public CompressionType compressionType() {
+      return (CompressionType) getEnum(MiscOption.compression);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setReportBgIoStats(
+        final boolean reportBgIoStats) {
+      return setBoolean(MiscOption.report_bg_io_stats, reportBgIoStats);
+    }
+
+    @Override
+    public boolean reportBgIoStats() {
+      return getBoolean(MiscOption.report_bg_io_stats);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setTtl(final long ttl) {
+      return setLong(CompactionOption.ttl, ttl);
+    }
+
+    @Override
+    public long ttl() {
+      return getLong(CompactionOption.ttl);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setPeriodicCompactionSeconds(
+        final long periodicCompactionSeconds) {
+      return setLong(CompactionOption.periodic_compaction_seconds, periodicCompactionSeconds);
+    }
+
+    @Override
+    public long periodicCompactionSeconds() {
+      return getLong(CompactionOption.periodic_compaction_seconds);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setEnableBlobFiles(final boolean enableBlobFiles) {
+      return setBoolean(BlobOption.enable_blob_files, enableBlobFiles);
+    }
+
+    @Override
+    public boolean enableBlobFiles() {
+      return getBoolean(BlobOption.enable_blob_files);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setMinBlobSize(final long minBlobSize) {
+      return setLong(BlobOption.min_blob_size, minBlobSize);
+    }
+
+    @Override
+    public long minBlobSize() {
+      return getLong(BlobOption.min_blob_size);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setBlobFileSize(final long blobFileSize) {
+      return setLong(BlobOption.blob_file_size, blobFileSize);
+    }
+
+    @Override
+    public long blobFileSize() {
+      return getLong(BlobOption.blob_file_size);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setBlobCompressionType(
+        final CompressionType compressionType) {
+      return setEnum(BlobOption.blob_compression_type, compressionType);
+    }
+
+    @Override
+    public CompressionType blobCompressionType() {
+      return (CompressionType) getEnum(BlobOption.blob_compression_type);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setEnableBlobGarbageCollection(
+        final boolean enableBlobGarbageCollection) {
+      return setBoolean(BlobOption.enable_blob_garbage_collection, enableBlobGarbageCollection);
+    }
+
+    @Override
+    public boolean enableBlobGarbageCollection() {
+      return getBoolean(BlobOption.enable_blob_garbage_collection);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setBlobGarbageCollectionAgeCutoff(
+        final double blobGarbageCollectionAgeCutoff) {
+      return setDouble(
+          BlobOption.blob_garbage_collection_age_cutoff, blobGarbageCollectionAgeCutoff);
+    }
+
+    @Override
+    public double blobGarbageCollectionAgeCutoff() {
+      return getDouble(BlobOption.blob_garbage_collection_age_cutoff);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setBlobGarbageCollectionForceThreshold(
+        final double blobGarbageCollectionForceThreshold) {
+      return setDouble(
+          BlobOption.blob_garbage_collection_force_threshold, blobGarbageCollectionForceThreshold);
+    }
+
+    @Override
+    public double blobGarbageCollectionForceThreshold() {
+      return getDouble(BlobOption.blob_garbage_collection_force_threshold);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setBlobCompactionReadaheadSize(
+        final long blobCompactionReadaheadSize) {
+      return setLong(BlobOption.blob_compaction_readahead_size, blobCompactionReadaheadSize);
+    }
+
+    @Override
+    public long blobCompactionReadaheadSize() {
+      return getLong(BlobOption.blob_compaction_readahead_size);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setBlobFileStartingLevel(
+        final int blobFileStartingLevel) {
+      return setInt(BlobOption.blob_file_starting_level, blobFileStartingLevel);
+    }
+
+    @Override
+    public int blobFileStartingLevel() {
+      return getInt(BlobOption.blob_file_starting_level);
+    }
+
+    @Override
+    public MutableColumnFamilyOptionsBuilder setPrepopulateBlobCache(
+        final PrepopulateBlobCache prepopulateBlobCache) {
+      return setEnum(BlobOption.prepopulate_blob_cache, prepopulateBlobCache);
+    }
+
+    @Override
+    public PrepopulateBlobCache prepopulateBlobCache() {
+      return (PrepopulateBlobCache) getEnum(BlobOption.prepopulate_blob_cache);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java
new file mode 100644
index 000000000..0f5fe7d78
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java
@@ -0,0 +1,156 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public interface MutableColumnFamilyOptionsInterface<
+    T extends MutableColumnFamilyOptionsInterface<T>>
+    extends AdvancedMutableColumnFamilyOptionsInterface<T> {
+  /**
+   * Amount of data to build up in memory (backed by an unsorted log
+   * on disk) before converting to a sorted on-disk file.
+   *
+   * Larger values increase performance, especially during bulk loads.
+   * Up to {@code max_write_buffer_number} write buffers may be held in memory
+   * at the same time, so you may wish to adjust this parameter
+   * to control memory usage.
+   *
+   * Also, a larger write buffer will result in a longer recovery time
+   * the next time the database is opened.
+   *
+   * Default: 64MB
+   * @param writeBufferSize the size of write buffer.
+   * @return the instance of the current object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  T setWriteBufferSize(long writeBufferSize);
+
+  /**
+   * Return size of write buffer size.
+   *
+   * @return size of write buffer.
+   * @see #setWriteBufferSize(long)
+   */
+  long writeBufferSize();
+
+  /**
+   * Disable automatic compactions. Manual compactions can still
+   * be issued on this column family
+   *
+   * @param disableAutoCompactions true if auto-compactions are disabled.
+   * @return the reference to the current option.
+   */
+  T setDisableAutoCompactions(boolean disableAutoCompactions);
+
+  /**
+   * Disable automatic compactions. Manual compactions can still
+   * be issued on this column family
+   *
+   * @return true if auto-compactions are disabled.
+   */
+  boolean disableAutoCompactions();
+
+  /**
+   * Number of files to trigger level-0 compaction. A value &lt; 0 means that
+   * level-0 compaction will not be triggered by number of files at all.
+   *
+   * Default: 4
+   *
+   * @param level0FileNumCompactionTrigger The number of files to trigger
+   *   level-0 compaction
+   * @return the reference to the current option.
+   */
+  T setLevel0FileNumCompactionTrigger(int level0FileNumCompactionTrigger);
+
+  /**
+   * Number of files to trigger level-0 compaction. A value &lt; 0 means that
+   * level-0 compaction will not be triggered by number of files at all.
+   *
+   * Default: 4
+   *
+   * @return The number of files to trigger
+   */
+  int level0FileNumCompactionTrigger();
+
+  /**
+   * We try to limit number of bytes in one compaction to be lower than this
+   * threshold. But it's not guaranteed.
+   * Value 0 will be sanitized.
+   *
+   * @param maxCompactionBytes max bytes in a compaction
+   * @return the reference to the current option.
+   * @see #maxCompactionBytes()
+   */
+  T setMaxCompactionBytes(final long maxCompactionBytes);
+
+  /**
+   * We try to limit number of bytes in one compaction to be lower than this
+   * threshold. But it's not guaranteed.
+   * Value 0 will be sanitized.
+   *
+   * @return the maximum number of bytes in for a compaction.
+   * @see #setMaxCompactionBytes(long)
+   */
+  long maxCompactionBytes();
+
+  /**
+   * The upper-bound of the total size of level-1 files in bytes.
+   * Maximum number of bytes for level L can be calculated as
+   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1))
+   * For example, if maxBytesForLevelBase is 20MB, and if
+   * max_bytes_for_level_multiplier is 10, total data size for level-1
+   * will be 200MB, total file size for level-2 will be 2GB,
+   * and total file size for level-3 will be 20GB.
+   * by default 'maxBytesForLevelBase' is 256MB.
+   *
+   * @param maxBytesForLevelBase maximum bytes for level base.
+   *
+   * @return the reference to the current option.
+   *
+   * See {@link AdvancedMutableColumnFamilyOptionsInterface#setMaxBytesForLevelMultiplier(double)}
+   */
+  T setMaxBytesForLevelBase(
+      long maxBytesForLevelBase);
+
+  /**
+   * The upper-bound of the total size of level-1 files in bytes.
+   * Maximum number of bytes for level L can be calculated as
+   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1))
+   * For example, if maxBytesForLevelBase is 20MB, and if
+   * max_bytes_for_level_multiplier is 10, total data size for level-1
+   * will be 200MB, total file size for level-2 will be 2GB,
+   * and total file size for level-3 will be 20GB.
+   * by default 'maxBytesForLevelBase' is 256MB.
+   *
+   * @return the upper-bound of the total size of level-1 files
+   *     in bytes.
+   *
+   * See {@link AdvancedMutableColumnFamilyOptionsInterface#maxBytesForLevelMultiplier()}
+   */
+  long maxBytesForLevelBase();
+
+  /**
+   * Compress blocks using the specified compression algorithm.  This
+   * parameter can be changed dynamically.
+   *
+   * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression.
+   *
+   * @param compressionType Compression Type.
+   * @return the reference to the current option.
+   */
+  T setCompressionType(
+          CompressionType compressionType);
+
+  /**
+   * Compress blocks using the specified compression algorithm.  This
+   * parameter can be changed dynamically.
+   *
+   * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression.
+   *
+   * @return Compression type.
+   */
+  CompressionType compressionType();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java
new file mode 100644
index 000000000..bfba1dab3
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java
@@ -0,0 +1,294 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+public class MutableDBOptions extends AbstractMutableOptions {
+
+  /**
+   * User must use builder pattern, or parser.
+   *
+   * @param keys the keys
+   * @param values the values
+   *
+   * See {@link #builder()} and {@link #parse(String)}.
+   */
+  private MutableDBOptions(final String[] keys, final String[] values) {
+    super(keys, values);
+  }
+
+  /**
+   * Creates a builder which allows you
+   * to set MutableDBOptions in a fluent
+   * manner
+   *
+   * @return A builder for MutableDBOptions
+   */
+  public static MutableDBOptionsBuilder builder() {
+    return new MutableDBOptionsBuilder();
+  }
+
+  /**
+   * Parses a String representation of MutableDBOptions
+   *
+   * The format is: key1=value1;key2=value2;key3=value3 etc
+   *
+   * For int[] values, each int should be separated by a comma, e.g.
+   *
+   * key1=value1;intArrayKey1=1:2:3
+   *
+   * @param str The string representation of the mutable db options
+   * @param ignoreUnknown what to do if the key is not one of the keys we expect
+   *
+   * @return A builder for the mutable db options
+   */
+  public static MutableDBOptionsBuilder parse(final String str, boolean ignoreUnknown) {
+    Objects.requireNonNull(str);
+
+    final List<OptionString.Entry> parsedOptions = OptionString.Parser.parse(str);
+    return new MutableDBOptions.MutableDBOptionsBuilder().fromParsed(parsedOptions, ignoreUnknown);
+  }
+
+  public static MutableDBOptionsBuilder parse(final String str) {
+    return parse(str, false);
+  }
+
+  private interface MutableDBOptionKey extends MutableOptionKey {}
+
+  public enum DBOption implements MutableDBOptionKey {
+    max_background_jobs(ValueType.INT),
+    max_background_compactions(ValueType.INT),
+    avoid_flush_during_shutdown(ValueType.BOOLEAN),
+    writable_file_max_buffer_size(ValueType.LONG),
+    delayed_write_rate(ValueType.LONG),
+    max_total_wal_size(ValueType.LONG),
+    delete_obsolete_files_period_micros(ValueType.LONG),
+    stats_dump_period_sec(ValueType.INT),
+    stats_persist_period_sec(ValueType.INT),
+    stats_history_buffer_size(ValueType.LONG),
+    max_open_files(ValueType.INT),
+    bytes_per_sync(ValueType.LONG),
+    wal_bytes_per_sync(ValueType.LONG),
+    strict_bytes_per_sync(ValueType.BOOLEAN),
+    compaction_readahead_size(ValueType.LONG);
+
+    private final ValueType valueType;
+    DBOption(final ValueType valueType) {
+      this.valueType = valueType;
+    }
+
+    @Override
+    public ValueType getValueType() {
+      return valueType;
+    }
+  }
+
+  public static class MutableDBOptionsBuilder
+      extends AbstractMutableOptionsBuilder<MutableDBOptions, MutableDBOptionsBuilder, MutableDBOptionKey>
+      implements MutableDBOptionsInterface<MutableDBOptionsBuilder> {
+
+    private final static Map<String, MutableDBOptionKey> ALL_KEYS_LOOKUP = new HashMap<>();
+    static {
+      for(final MutableDBOptionKey key : DBOption.values()) {
+        ALL_KEYS_LOOKUP.put(key.name(), key);
+      }
+    }
+
+    private MutableDBOptionsBuilder() {
+      super();
+    }
+
+    @Override
+    protected MutableDBOptionsBuilder self() {
+      return this;
+    }
+
+    @Override
+    protected Map<String, MutableDBOptionKey> allKeys() {
+      return ALL_KEYS_LOOKUP;
+    }
+
+    @Override
+    protected MutableDBOptions build(final String[] keys,
+        final String[] values) {
+      return new MutableDBOptions(keys, values);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setMaxBackgroundJobs(
+        final int maxBackgroundJobs) {
+      return setInt(DBOption.max_background_jobs, maxBackgroundJobs);
+    }
+
+    @Override
+    public int maxBackgroundJobs() {
+      return getInt(DBOption.max_background_jobs);
+    }
+
+    @Override
+    @Deprecated
+    public MutableDBOptionsBuilder setMaxBackgroundCompactions(
+        final int maxBackgroundCompactions) {
+      return setInt(DBOption.max_background_compactions,
+          maxBackgroundCompactions);
+    }
+
+    @Override
+    @Deprecated
+    public int maxBackgroundCompactions() {
+      return getInt(DBOption.max_background_compactions);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setAvoidFlushDuringShutdown(
+        final boolean avoidFlushDuringShutdown) {
+      return setBoolean(DBOption.avoid_flush_during_shutdown,
+          avoidFlushDuringShutdown);
+    }
+
+    @Override
+    public boolean avoidFlushDuringShutdown() {
+      return getBoolean(DBOption.avoid_flush_during_shutdown);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setWritableFileMaxBufferSize(
+        final long writableFileMaxBufferSize) {
+      return setLong(DBOption.writable_file_max_buffer_size,
+          writableFileMaxBufferSize);
+    }
+
+    @Override
+    public long writableFileMaxBufferSize() {
+      return getLong(DBOption.writable_file_max_buffer_size);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setDelayedWriteRate(
+        final long delayedWriteRate) {
+      return setLong(DBOption.delayed_write_rate,
+          delayedWriteRate);
+    }
+
+    @Override
+    public long delayedWriteRate() {
+      return getLong(DBOption.delayed_write_rate);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setMaxTotalWalSize(
+        final long maxTotalWalSize) {
+      return setLong(DBOption.max_total_wal_size, maxTotalWalSize);
+    }
+
+    @Override
+    public long maxTotalWalSize() {
+      return getLong(DBOption.max_total_wal_size);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setDeleteObsoleteFilesPeriodMicros(
+        final long micros) {
+      return setLong(DBOption.delete_obsolete_files_period_micros, micros);
+    }
+
+    @Override
+    public long deleteObsoleteFilesPeriodMicros() {
+      return getLong(DBOption.delete_obsolete_files_period_micros);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setStatsDumpPeriodSec(
+        final int statsDumpPeriodSec) {
+      return setInt(DBOption.stats_dump_period_sec, statsDumpPeriodSec);
+    }
+
+    @Override
+    public int statsDumpPeriodSec() {
+      return getInt(DBOption.stats_dump_period_sec);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setStatsPersistPeriodSec(
+        final int statsPersistPeriodSec) {
+      return setInt(DBOption.stats_persist_period_sec, statsPersistPeriodSec);
+    }
+
+    @Override
+    public int statsPersistPeriodSec() {
+      return getInt(DBOption.stats_persist_period_sec);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setStatsHistoryBufferSize(
+        final long statsHistoryBufferSize) {
+      return setLong(DBOption.stats_history_buffer_size, statsHistoryBufferSize);
+    }
+
+    @Override
+    public long statsHistoryBufferSize() {
+      return getLong(DBOption.stats_history_buffer_size);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setMaxOpenFiles(final int maxOpenFiles) {
+      return setInt(DBOption.max_open_files, maxOpenFiles);
+    }
+
+    @Override
+    public int maxOpenFiles() {
+      return getInt(DBOption.max_open_files);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setBytesPerSync(final long bytesPerSync) {
+      return setLong(DBOption.bytes_per_sync, bytesPerSync);
+    }
+
+    @Override
+    public long bytesPerSync() {
+      return getLong(DBOption.bytes_per_sync);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setWalBytesPerSync(
+        final long walBytesPerSync) {
+      return setLong(DBOption.wal_bytes_per_sync, walBytesPerSync);
+    }
+
+    @Override
+    public long walBytesPerSync() {
+      return getLong(DBOption.wal_bytes_per_sync);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setStrictBytesPerSync(
+        final boolean strictBytesPerSync) {
+      return setBoolean(DBOption.strict_bytes_per_sync, strictBytesPerSync);
+    }
+
+    @Override
+    public boolean strictBytesPerSync() {
+      return getBoolean(DBOption.strict_bytes_per_sync);
+    }
+
+    @Override
+    public MutableDBOptionsBuilder setCompactionReadaheadSize(
+        final long compactionReadaheadSize) {
+      return setLong(DBOption.compaction_readahead_size,
+          compactionReadaheadSize);
+    }
+
+    @Override
+    public long compactionReadaheadSize() {
+      return getLong(DBOption.compaction_readahead_size);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java
new file mode 100644
index 000000000..bdf9d7bf6
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java
@@ -0,0 +1,440 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+public interface MutableDBOptionsInterface<T extends MutableDBOptionsInterface<T>> {
+  /**
+   * Specifies the maximum number of concurrent background jobs (both flushes
+   * and compactions combined).
+   * Default: 2
+   *
+   * @param maxBackgroundJobs number of max concurrent background jobs
+   * @return the instance of the current object.
+   */
+  T setMaxBackgroundJobs(int maxBackgroundJobs);
+
+  /**
+   * Returns the maximum number of concurrent background jobs (both flushes
+   * and compactions combined).
+   * Default: 2
+   *
+   * @return the maximum number of concurrent background jobs.
+   */
+  int maxBackgroundJobs();
+
+  /**
+   * NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+   * value of max_background_jobs. For backwards compatibility we will set
+   * `max_background_jobs = max_background_compactions + max_background_flushes`
+   * in the case where user sets at least one of `max_background_compactions` or
+   * `max_background_flushes` (we replace -1 by 1 in case one option is unset).
+   *
+   * Specifies the maximum number of concurrent background compaction jobs,
+   * submitted to the default LOW priority thread pool.
+   * If you're increasing this, also consider increasing number of threads in
+   * LOW priority thread pool. For more information, see
+   * Default: -1
+   *
+   * @param maxBackgroundCompactions the maximum number of background
+   *     compaction jobs.
+   * @return the instance of the current object.
+   *
+   * @see RocksEnv#setBackgroundThreads(int)
+   * @see RocksEnv#setBackgroundThreads(int, Priority)
+   * @see DBOptionsInterface#maxBackgroundFlushes()
+   * @deprecated Use {@link #setMaxBackgroundJobs(int)}
+   */
+  @Deprecated
+  T setMaxBackgroundCompactions(int maxBackgroundCompactions);
+
+  /**
+   * NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+   * value of max_background_jobs. For backwards compatibility we will set
+   * `max_background_jobs = max_background_compactions + max_background_flushes`
+   * in the case where user sets at least one of `max_background_compactions` or
+   * `max_background_flushes` (we replace -1 by 1 in case one option is unset).
+   *
+   * Returns the maximum number of concurrent background compaction jobs,
+   * submitted to the default LOW priority thread pool.
+   * When increasing this number, we may also want to consider increasing
+   * number of threads in LOW priority thread pool.
+   * Default: -1
+   *
+   * @return the maximum number of concurrent background compaction jobs.
+   * @see RocksEnv#setBackgroundThreads(int)
+   * @see RocksEnv#setBackgroundThreads(int, Priority)
+   *
+   * @deprecated Use {@link #setMaxBackgroundJobs(int)}
+   */
+  @Deprecated
+  int maxBackgroundCompactions();
+
+  /**
+   * By default RocksDB will flush all memtables on DB close if there are
+   * unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup
+   * DB close. Unpersisted data WILL BE LOST.
+   *
+   * DEFAULT: false
+   *
+   * Dynamically changeable through
+   *     {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}
+   *     API.
+   *
+   * @param avoidFlushDuringShutdown true if we should avoid flush during
+   *     shutdown
+   *
+   * @return the reference to the current options.
+   */
+  T setAvoidFlushDuringShutdown(boolean avoidFlushDuringShutdown);
+
+  /**
+   * By default RocksDB will flush all memtables on DB close if there are
+   * unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup
+   * DB close. Unpersisted data WILL BE LOST.
+   *
+   * DEFAULT: false
+   *
+   * Dynamically changeable through
+   *     {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}
+   *     API.
+   *
+   * @return true if we should avoid flush during shutdown
+   */
+  boolean avoidFlushDuringShutdown();
+
+  /**
+   * This is the maximum buffer size that is used by WritableFileWriter.
+   * On Windows, we need to maintain an aligned buffer for writes.
+   * We allow the buffer to grow until it's size hits the limit.
+   *
+   * Default: 1024 * 1024 (1 MB)
+   *
+   * @param writableFileMaxBufferSize the maximum buffer size
+   *
+   * @return the reference to the current options.
+   */
+  T setWritableFileMaxBufferSize(long writableFileMaxBufferSize);
+
+  /**
+   * This is the maximum buffer size that is used by WritableFileWriter.
+   * On Windows, we need to maintain an aligned buffer for writes.
+   * We allow the buffer to grow until it's size hits the limit.
+   *
+   * Default: 1024 * 1024 (1 MB)
+   *
+   * @return the maximum buffer size
+   */
+  long writableFileMaxBufferSize();
+
+  /**
+   * The limited write rate to DB if
+   * {@link ColumnFamilyOptions#softPendingCompactionBytesLimit()} or
+   * {@link ColumnFamilyOptions#level0SlowdownWritesTrigger()} is triggered,
+   * or we are writing to the last mem table allowed and we allow more than 3
+   * mem tables. It is calculated using size of user write requests before
+   * compression. RocksDB may decide to slow down more if the compaction still
+   * gets behind further.
+   * If the value is 0, we will infer a value from `rater_limiter` value
+   * if it is not empty, or 16MB if `rater_limiter` is empty. Note that
+   * if users change the rate in `rate_limiter` after DB is opened,
+   * `delayed_write_rate` won't be adjusted.
+   *
+   * Unit: bytes per second.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through {@link RocksDB#setDBOptions(MutableDBOptions)}.
+   *
+   * @param delayedWriteRate the rate in bytes per second
+   *
+   * @return the reference to the current options.
+   */
+  T setDelayedWriteRate(long delayedWriteRate);
+
+  /**
+   * The limited write rate to DB if
+   * {@link ColumnFamilyOptions#softPendingCompactionBytesLimit()} or
+   * {@link ColumnFamilyOptions#level0SlowdownWritesTrigger()} is triggered,
+   * or we are writing to the last mem table allowed and we allow more than 3
+   * mem tables. It is calculated using size of user write requests before
+   * compression. RocksDB may decide to slow down more if the compaction still
+   * gets behind further.
+   * If the value is 0, we will infer a value from `rater_limiter` value
+   * if it is not empty, or 16MB if `rater_limiter` is empty. Note that
+   * if users change the rate in `rate_limiter` after DB is opened,
+   * `delayed_write_rate` won't be adjusted.
+   *
+   * Unit: bytes per second.
+   *
+   * Default: 0
+   *
+   * Dynamically changeable through {@link RocksDB#setDBOptions(MutableDBOptions)}.
+   *
+   * @return the rate in bytes per second
+   */
+  long delayedWriteRate();
+
+  /**
+   * <p>Set the max total write-ahead log size. Once write-ahead logs exceed this size, we will
+   * start forcing the flush of column families whose memtables are backed by the oldest live WAL
+   * file
+   * </p>
+   * <p>The oldest WAL files are the ones that are causing all the space amplification.
+   * </p>
+   *  For example, with 15 column families, each with
+   *  <code>write_buffer_size = 128 MB</code>
+   *  <code>max_write_buffer_number = 6</code>
+   *  <code>max_total_wal_size</code> will be calculated to be <code>[15 * 128MB * 6] * 4 =
+   * 45GB</code>
+   * <p>
+   *  The RocksDB wiki has some discussion about how the WAL interacts
+   *  with memtables and flushing of column families, at
+   * <a href="https://github.com/facebook/rocksdb/wiki/Column-Families">...</a>
+   *  </p>
+   * <p>If set to 0 (default), we will dynamically choose the WAL size limit to
+   * be [sum of all write_buffer_size * max_write_buffer_number] * 4</p>
+   * <p>This option takes effect only when there are more than one column family as
+   * otherwise the wal size is dictated by the write_buffer_size.</p>
+   * <p>Default: 0</p>
+   *
+   * @param maxTotalWalSize max total wal size.
+   * @return the instance of the current object.
+   */
+  T setMaxTotalWalSize(long maxTotalWalSize);
+
+  /**
+   * <p>Returns the max total write-ahead log size. Once write-ahead logs exceed this size,
+   * we will start forcing the flush of column families whose memtables are
+   * backed by the oldest live WAL file.</p>
+   * <p>The oldest WAL files are the ones that are causing all the space amplification.
+   * </p>
+   *  For example, with 15 column families, each with
+   *  <code>write_buffer_size = 128 MB</code>
+   *  <code>max_write_buffer_number = 6</code>
+   *  <code>max_total_wal_size</code> will be calculated to be <code>[15 * 128MB * 6] * 4 =
+   * 45GB</code>
+   * <p>
+   *  The RocksDB wiki has some discussion about how the WAL interacts
+   *  with memtables and flushing of column families, at
+   * <a href="https://github.com/facebook/rocksdb/wiki/Column-Families">...</a>
+   *  </p>
+   * <p>If set to 0 (default), we will dynamically choose the WAL size limit to
+   * be [sum of all write_buffer_size * max_write_buffer_number] * 4</p>
+   * <p>This option takes effect only when there are more than one column family as
+   * otherwise the wal size is dictated by the write_buffer_size.</p>
+   * <p>Default: 0</p>
+   *
+   *
+   * <p>If set to 0 (default), we will dynamically choose the WAL size limit
+   * to be [sum of all write_buffer_size * max_write_buffer_number] * 4
+   * </p>
+   *
+   * @return max total wal size
+   */
+  long maxTotalWalSize();
+
+  /**
+   * The periodicity when obsolete files get deleted. The default
+   * value is 6 hours. The files that get out of scope by compaction
+   * process will still get automatically delete on every compaction,
+   * regardless of this setting
+   *
+   * @param micros the time interval in micros
+   * @return the instance of the current object.
+   */
+  T setDeleteObsoleteFilesPeriodMicros(long micros);
+
+  /**
+   * The periodicity when obsolete files get deleted. The default
+   * value is 6 hours. The files that get out of scope by compaction
+   * process will still get automatically delete on every compaction,
+   * regardless of this setting
+   *
+   * @return the time interval in micros when obsolete files will be deleted.
+   */
+  long deleteObsoleteFilesPeriodMicros();
+
+  /**
+   * if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+   * Default: 600 (10 minutes)
+   *
+   * @param statsDumpPeriodSec time interval in seconds.
+   * @return the instance of the current object.
+   */
+  T setStatsDumpPeriodSec(int statsDumpPeriodSec);
+
+  /**
+   * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+   * Default: 600 (10 minutes)
+   *
+   * @return time interval in seconds.
+   */
+  int statsDumpPeriodSec();
+
+  /**
+   * If not zero, dump rocksdb.stats to RocksDB every
+   * {@code statsPersistPeriodSec}
+   *
+   * Default: 600
+   *
+   * @param statsPersistPeriodSec time interval in seconds.
+   * @return the instance of the current object.
+   */
+  T setStatsPersistPeriodSec(int statsPersistPeriodSec);
+
+  /**
+   * If not zero, dump rocksdb.stats to RocksDB every
+   * {@code statsPersistPeriodSec}
+   *
+   * @return time interval in seconds.
+   */
+  int statsPersistPeriodSec();
+
+  /**
+   * If not zero, periodically take stats snapshots and store in memory, the
+   * memory size for stats snapshots is capped at {@code statsHistoryBufferSize}
+   *
+   * Default: 1MB
+   *
+   * @param statsHistoryBufferSize the size of the buffer.
+   * @return the instance of the current object.
+   */
+  T setStatsHistoryBufferSize(long statsHistoryBufferSize);
+
+  /**
+   * If not zero, periodically take stats snapshots and store in memory, the
+   * memory size for stats snapshots is capped at {@code statsHistoryBufferSize}
+   *
+   * @return the size of the buffer.
+   */
+  long statsHistoryBufferSize();
+
+  /**
+   * Number of open files that can be used by the DB.  You may need to
+   * increase this if your database has a large working set. Value -1 means
+   * files opened are always kept open. You can estimate number of files based
+   * on {@code target_file_size_base} and {@code target_file_size_multiplier}
+   * for level-based compaction. For universal-style compaction, you can usually
+   * set it to -1.
+   * Default: -1
+   *
+   * @param maxOpenFiles the maximum number of open files.
+   * @return the instance of the current object.
+   */
+  T setMaxOpenFiles(int maxOpenFiles);
+
+  /**
+   * Number of open files that can be used by the DB.  You may need to
+   * increase this if your database has a large working set. Value -1 means
+   * files opened are always kept open. You can estimate number of files based
+   * on {@code target_file_size_base} and {@code target_file_size_multiplier}
+   * for level-based compaction. For universal-style compaction, you can usually
+   * set it to -1.
+   * Default: -1
+   *
+   * @return the maximum number of open files.
+   */
+  int maxOpenFiles();
+
+  /**
+   * Allows OS to incrementally sync files to disk while they are being
+   * written, asynchronously, in the background.
+   * Issue one request for every bytes_per_sync written. 0 turns it off.
+   * Default: 0
+   *
+   * @param bytesPerSync size in bytes
+   * @return the instance of the current object.
+   */
+  T setBytesPerSync(long bytesPerSync);
+
+  /**
+   * Allows OS to incrementally sync files to disk while they are being
+   * written, asynchronously, in the background.
+   * Issue one request for every bytes_per_sync written. 0 turns it off.
+   * Default: 0
+   *
+   * @return size in bytes
+   */
+  long bytesPerSync();
+
+  /**
+   * Same as {@link #setBytesPerSync(long)} , but applies to WAL files
+   *
+   * Default: 0, turned off
+   *
+   * @param walBytesPerSync size in bytes
+   * @return the instance of the current object.
+   */
+  T setWalBytesPerSync(long walBytesPerSync);
+
+  /**
+   * Same as {@link #bytesPerSync()} , but applies to WAL files
+   *
+   * Default: 0, turned off
+   *
+   * @return size in bytes
+   */
+  long walBytesPerSync();
+
+  /**
+   * When true, guarantees WAL files have at most {@link #walBytesPerSync()}
+   * bytes submitted for writeback at any given time, and SST files have at most
+   * {@link #bytesPerSync()} bytes pending writeback at any given time. This
+   * can be used to handle cases where processing speed exceeds I/O speed
+   * during file generation, which can lead to a huge sync when the file is
+   * finished, even with {@link #bytesPerSync()} / {@link #walBytesPerSync()}
+   * properly configured.
+   *
+   * - If `sync_file_range` is supported it achieves this by waiting for any
+   *   prior `sync_file_range`s to finish before proceeding. In this way,
+   *   processing (compression, etc.) can proceed uninhibited in the gap
+   *   between `sync_file_range`s, and we block only when I/O falls
+   *   behind.
+   * - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
+   *   always blocks, thus preventing the interleaving of I/O and processing.
+   *
+   * Note: Enabling this option does not provide any additional persistence
+   * guarantees, as it may use `sync_file_range`, which does not write out
+   * metadata.
+   *
+   * Default: false
+   *
+   * @param strictBytesPerSync the bytes per sync
+   * @return the instance of the current object.
+   */
+  T setStrictBytesPerSync(boolean strictBytesPerSync);
+
+  /**
+   * Return the strict byte limit per sync.
+   *
+   * See {@link #setStrictBytesPerSync(boolean)}
+   *
+   * @return the limit in bytes.
+   */
+  boolean strictBytesPerSync();
+
+  /**
+   * If non-zero, we perform bigger reads when doing compaction. If you're
+   * running RocksDB on spinning disks, you should set this to at least 2MB.
+   *
+   * That way RocksDB's compaction is doing sequential instead of random reads.
+   *
+   * Default: 0
+   *
+   * @param compactionReadaheadSize The compaction read-ahead size
+   *
+   * @return the reference to the current options.
+   */
+  T setCompactionReadaheadSize(final long compactionReadaheadSize);
+
+  /**
+   * If non-zero, we perform bigger reads when doing compaction. If you're
+   * running RocksDB on spinning disks, you should set this to at least 2MB.
+   *
+   * That way RocksDB's compaction is doing sequential instead of random reads.
+   *
+   * Default: 0
+   *
+   * @return The compaction read-ahead size
+   */
+  long compactionReadaheadSize();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MutableOptionKey.java b/src/rocksdb/java/src/main/java/org/rocksdb/MutableOptionKey.java
new file mode 100644
index 000000000..ec1b9ff3b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MutableOptionKey.java
@@ -0,0 +1,16 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+public interface MutableOptionKey {
+  enum ValueType {
+    DOUBLE,
+    LONG,
+    INT,
+    BOOLEAN,
+    INT_ARRAY,
+    ENUM
+  }
+
+  String name();
+  ValueType getValueType();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java b/src/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java
new file mode 100644
index 000000000..7f69eeb9e
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java
@@ -0,0 +1,369 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+import static org.rocksdb.AbstractMutableOptions.INT_ARRAY_INT_SEPARATOR;
+
+public abstract class MutableOptionValue<T> {
+
+  abstract double asDouble() throws NumberFormatException;
+  abstract long asLong() throws NumberFormatException;
+  abstract int asInt() throws NumberFormatException;
+  abstract boolean asBoolean() throws IllegalStateException;
+  abstract int[] asIntArray() throws IllegalStateException;
+  abstract String asString();
+  abstract T asObject();
+
+  private static abstract class MutableOptionValueObject<T>
+      extends MutableOptionValue<T> {
+    protected final T value;
+
+    protected MutableOptionValueObject(final T value) {
+      this.value = value;
+    }
+
+    @Override T asObject() {
+      return value;
+    }
+  }
+
+  static MutableOptionValue<String> fromString(final String s) {
+    return new MutableOptionStringValue(s);
+  }
+
+  static MutableOptionValue<Double> fromDouble(final double d) {
+    return new MutableOptionDoubleValue(d);
+  }
+
+  static MutableOptionValue<Long> fromLong(final long d) {
+    return new MutableOptionLongValue(d);
+  }
+
+  static MutableOptionValue<Integer> fromInt(final int i) {
+    return new MutableOptionIntValue(i);
+  }
+
+  static MutableOptionValue<Boolean> fromBoolean(final boolean b) {
+    return new MutableOptionBooleanValue(b);
+  }
+
+  static MutableOptionValue<int[]> fromIntArray(final int[] ix) {
+    return new MutableOptionIntArrayValue(ix);
+  }
+
+  static <N extends Enum<N>> MutableOptionValue<N> fromEnum(final N value) {
+    return new MutableOptionEnumValue<>(value);
+  }
+
+  static class MutableOptionStringValue
+      extends MutableOptionValueObject<String> {
+    MutableOptionStringValue(final String value) {
+      super(value);
+    }
+
+    @Override
+    double asDouble() throws NumberFormatException {
+      return Double.parseDouble(value);
+    }
+
+    @Override
+    long asLong() throws NumberFormatException {
+      return Long.parseLong(value);
+    }
+
+    @Override
+    int asInt() throws NumberFormatException {
+      return Integer.parseInt(value);
+    }
+
+    @Override
+    boolean asBoolean() throws IllegalStateException {
+      return Boolean.parseBoolean(value);
+    }
+
+    @Override
+    int[] asIntArray() throws IllegalStateException {
+      throw new IllegalStateException("String is not applicable as int[]");
+    }
+
+    @Override
+    String asString() {
+      return value;
+    }
+  }
+
+  static class MutableOptionDoubleValue
+      extends MutableOptionValue<Double> {
+    private final double value;
+    MutableOptionDoubleValue(final double value) {
+      this.value = value;
+    }
+
+    @Override
+    double asDouble() {
+      return value;
+    }
+
+    @Override
+    long asLong() throws NumberFormatException {
+      return Double.valueOf(value).longValue();
+    }
+
+    @Override
+    int asInt() throws NumberFormatException {
+      if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) {
+        throw new NumberFormatException(
+            "double value lies outside the bounds of int");
+      }
+      return Double.valueOf(value).intValue();
+    }
+
+    @Override
+    boolean asBoolean() throws IllegalStateException {
+      throw new IllegalStateException(
+          "double is not applicable as boolean");
+    }
+
+    @Override
+    int[] asIntArray() throws IllegalStateException {
+      if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) {
+        throw new NumberFormatException(
+            "double value lies outside the bounds of int");
+      }
+      return new int[] { Double.valueOf(value).intValue() };
+    }
+
+    @Override
+    String asString() {
+      return String.valueOf(value);
+    }
+
+    @Override
+    Double asObject() {
+      return value;
+    }
+  }
+
+  static class MutableOptionLongValue
+      extends MutableOptionValue<Long> {
+    private final long value;
+
+    MutableOptionLongValue(final long value) {
+      this.value = value;
+    }
+
+    @Override
+    double asDouble() {
+      return Long.valueOf(value).doubleValue();
+    }
+
+    @Override
+    long asLong() throws NumberFormatException {
+      return value;
+    }
+
+    @Override
+    int asInt() throws NumberFormatException {
+      if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) {
+        throw new NumberFormatException(
+            "long value lies outside the bounds of int");
+      }
+      return Long.valueOf(value).intValue();
+    }
+
+    @Override
+    boolean asBoolean() throws IllegalStateException {
+      throw new IllegalStateException(
+          "long is not applicable as boolean");
+    }
+
+    @Override
+    int[] asIntArray() throws IllegalStateException {
+      if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) {
+        throw new NumberFormatException(
+            "long value lies outside the bounds of int");
+      }
+      return new int[] { Long.valueOf(value).intValue() };
+    }
+
+    @Override
+    String asString() {
+      return String.valueOf(value);
+    }
+
+    @Override
+    Long asObject() {
+      return value;
+    }
+  }
+
+  static class MutableOptionIntValue
+      extends MutableOptionValue<Integer> {
+    private final int value;
+
+    MutableOptionIntValue(final int value) {
+      this.value = value;
+    }
+
+    @Override
+    double asDouble() {
+      return Integer.valueOf(value).doubleValue();
+    }
+
+    @Override
+    long asLong() throws NumberFormatException {
+      return value;
+    }
+
+    @Override
+    int asInt() throws NumberFormatException {
+      return value;
+    }
+
+    @Override
+    boolean asBoolean() throws IllegalStateException {
+      throw new IllegalStateException("int is not applicable as boolean");
+    }
+
+    @Override
+    int[] asIntArray() throws IllegalStateException {
+      return new int[] { value };
+    }
+
+    @Override
+    String asString() {
+      return String.valueOf(value);
+    }
+
+    @Override
+    Integer asObject() {
+      return value;
+    }
+  }
+
+  static class MutableOptionBooleanValue
+      extends MutableOptionValue<Boolean> {
+    private final boolean value;
+
+    MutableOptionBooleanValue(final boolean value) {
+      this.value = value;
+    }
+
+    @Override
+    double asDouble() {
+      throw new NumberFormatException("boolean is not applicable as double");
+    }
+
+    @Override
+    long asLong() throws NumberFormatException {
+      throw new NumberFormatException("boolean is not applicable as Long");
+    }
+
+    @Override
+    int asInt() throws NumberFormatException {
+      throw new NumberFormatException("boolean is not applicable as int");
+    }
+
+    @Override
+    boolean asBoolean() {
+      return value;
+    }
+
+    @Override
+    int[] asIntArray() throws IllegalStateException {
+      throw new IllegalStateException("boolean is not applicable as int[]");
+    }
+
+    @Override
+    String asString() {
+      return String.valueOf(value);
+    }
+
+    @Override
+    Boolean asObject() {
+      return value;
+    }
+  }
+
+  static class MutableOptionIntArrayValue
+      extends MutableOptionValueObject<int[]> {
+    MutableOptionIntArrayValue(final int[] value) {
+      super(value);
+    }
+
+    @Override
+    double asDouble() {
+      throw new NumberFormatException("int[] is not applicable as double");
+    }
+
+    @Override
+    long asLong() throws NumberFormatException {
+      throw new NumberFormatException("int[] is not applicable as Long");
+    }
+
+    @Override
+    int asInt() throws NumberFormatException {
+      throw new NumberFormatException("int[] is not applicable as int");
+    }
+
+    @Override
+    boolean asBoolean() {
+      throw new NumberFormatException("int[] is not applicable as boolean");
+    }
+
+    @Override
+    int[] asIntArray() throws IllegalStateException {
+      return value;
+    }
+
+    @Override
+    String asString() {
+      final StringBuilder builder = new StringBuilder();
+      for(int i = 0; i < value.length; i++) {
+        builder.append(value[i]);
+        if(i + 1 < value.length) {
+          builder.append(INT_ARRAY_INT_SEPARATOR);
+        }
+      }
+      return builder.toString();
+    }
+  }
+
+  static class MutableOptionEnumValue<T extends Enum<T>>
+      extends MutableOptionValueObject<T> {
+
+    MutableOptionEnumValue(final T value) {
+      super(value);
+    }
+
+    @Override
+    double asDouble() throws NumberFormatException {
+      throw new NumberFormatException("Enum is not applicable as double");
+    }
+
+    @Override
+    long asLong() throws NumberFormatException {
+      throw new NumberFormatException("Enum is not applicable as long");
+    }
+
+    @Override
+    int asInt() throws NumberFormatException {
+      throw new NumberFormatException("Enum is not applicable as int");
+    }
+
+    @Override
+    boolean asBoolean() throws IllegalStateException {
+      throw new NumberFormatException("Enum is not applicable as boolean");
+    }
+
+    @Override
+    int[] asIntArray() throws IllegalStateException {
+      throw new NumberFormatException("Enum is not applicable as int[]");
+    }
+
+    @Override
+    String asString() {
+      return value.name();
+    }
+  }
+
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java b/src/rocksdb/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java
new file mode 100644
index 000000000..6acc146f7
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java
@@ -0,0 +1,59 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * A simple abstraction to allow a Java class to wrap a custom comparator
+ * implemented in C++.
+ *
+ * The native comparator must directly extend rocksdb::Comparator.
+ */
+public abstract class NativeComparatorWrapper
+    extends AbstractComparator {
+
+  @Override
+  final ComparatorType getComparatorType() {
+    return ComparatorType.JAVA_NATIVE_COMPARATOR_WRAPPER;
+  }
+
+  @Override
+  public final String name() {
+    throw new IllegalStateException("This should not be called. " +
+        "Implementation is in Native code");
+  }
+
+  @Override
+  public final int compare(final ByteBuffer s1, final ByteBuffer s2) {
+    throw new IllegalStateException("This should not be called. " +
+        "Implementation is in Native code");
+  }
+
+  @Override
+  public final void findShortestSeparator(final ByteBuffer start, final ByteBuffer limit) {
+    throw new IllegalStateException("This should not be called. " +
+        "Implementation is in Native code");
+  }
+
+  @Override
+  public final void findShortSuccessor(final ByteBuffer key) {
+    throw new IllegalStateException("This should not be called. " +
+        "Implementation is in Native code");
+  }
+
+  /**
+   * We override {@link RocksCallbackObject#disposeInternal()}
+   * as disposing of a native rocksdb::Comparator extension requires
+   * a slightly different approach as it is not really a RocksCallbackObject
+   */
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/src/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
new file mode 100644
index 000000000..b97cf28b9
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
@@ -0,0 +1,172 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+
+import org.rocksdb.util.Environment;
+
+/**
+ * This class is used to load the RocksDB shared library from within the jar.
+ * The shared library is extracted to a temp folder and loaded from there.
+ */
+public class NativeLibraryLoader {
+  //singleton
+  private static final NativeLibraryLoader instance = new NativeLibraryLoader();
+  private static boolean initialized = false;
+
+  private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb");
+  private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb");
+  private static final /* @Nullable */ String fallbackJniLibraryName =
+      Environment.getFallbackJniLibraryName("rocksdb");
+  private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb");
+  private static final /* @Nullable */ String fallbackJniLibraryFileName =
+      Environment.getFallbackJniLibraryFileName("rocksdb");
+  private static final String tempFilePrefix = "librocksdbjni";
+  private static final String tempFileSuffix = Environment.getJniLibraryExtension();
+
+  /**
+   * Get a reference to the NativeLibraryLoader
+   *
+   * @return The NativeLibraryLoader
+   */
+  public static NativeLibraryLoader getInstance() {
+    return instance;
+  }
+
+  /**
+   * Firstly attempts to load the library from <i>java.library.path</i>,
+   * if that fails then it falls back to extracting
+   * the library from the classpath
+   * {@link org.rocksdb.NativeLibraryLoader#loadLibraryFromJar(java.lang.String)}
+   *
+   * @param tmpDir A temporary directory to use
+   *   to copy the native library to when loading from the classpath.
+   *   If null, or the empty string, we rely on Java's
+   *   {@link java.io.File#createTempFile(String, String)}
+   *   function to provide a temporary location.
+   *   The temporary file will be registered for deletion
+   *   on exit.
+   *
+   * @throws java.io.IOException if a filesystem operation fails.
+   */
+  public synchronized void loadLibrary(final String tmpDir) throws IOException {
+    try {
+      // try dynamic library
+      System.loadLibrary(sharedLibraryName);
+      return;
+    } catch (final UnsatisfiedLinkError ule) {
+      // ignore - try from static library
+    }
+
+    try {
+      // try static library
+      System.loadLibrary(jniLibraryName);
+      return;
+    } catch (final UnsatisfiedLinkError ule) {
+      // ignore - then try static library fallback or from jar
+    }
+
+    if (fallbackJniLibraryName != null) {
+      try {
+        // try static library fallback
+        System.loadLibrary(fallbackJniLibraryName);
+        return;
+      } catch (final UnsatisfiedLinkError ule) {
+        // ignore - then try from jar
+      }
+    }
+
+    // try jar
+    loadLibraryFromJar(tmpDir);
+  }
+
+  /**
+   * Attempts to extract the native RocksDB library
+   * from the classpath and load it
+   *
+   * @param tmpDir A temporary directory to use
+   *   to copy the native library to. If null,
+   *   or the empty string, we rely on Java's
+   *   {@link java.io.File#createTempFile(String, String)}
+   *   function to provide a temporary location.
+   *   The temporary file will be registered for deletion
+   *   on exit.
+   *
+   * @throws java.io.IOException if a filesystem operation fails.
+   */
+  void loadLibraryFromJar(final String tmpDir)
+      throws IOException {
+    if (!initialized) {
+      System.load(loadLibraryFromJarToTemp(tmpDir).getAbsolutePath());
+      initialized = true;
+    }
+  }
+
+  File loadLibraryFromJarToTemp(final String tmpDir)
+          throws IOException {
+    InputStream is = null;
+    try {
+      // attempt to look up the static library in the jar file
+      String libraryFileName = jniLibraryFileName;
+      is = getClass().getClassLoader().getResourceAsStream(libraryFileName);
+
+      if (is == null) {
+        // is there a fallback we can try
+        if (fallbackJniLibraryFileName == null) {
+          throw new RuntimeException(libraryFileName + " was not found inside JAR.");
+        }
+
+        // attempt to look up the fallback static library in the jar file
+        libraryFileName = fallbackJniLibraryFileName;
+        is = getClass().getClassLoader().getResourceAsStream(libraryFileName);
+        if (is == null) {
+          throw new RuntimeException(libraryFileName + " was not found inside JAR.");
+        }
+      }
+
+      // create a temporary file to copy the library to
+      final File temp;
+      if (tmpDir == null || tmpDir.isEmpty()) {
+        temp = File.createTempFile(tempFilePrefix, tempFileSuffix);
+      } else {
+        final File parentDir = new File(tmpDir);
+        if (!parentDir.exists()) {
+          throw new RuntimeException(
+              "Directory: " + parentDir.getAbsolutePath() + " does not exist!");
+        }
+        temp = new File(parentDir, libraryFileName);
+        if (temp.exists() && !temp.delete()) {
+          throw new RuntimeException(
+              "File: " + temp.getAbsolutePath() + " already exists and cannot be removed.");
+        }
+        if (!temp.createNewFile()) {
+          throw new RuntimeException("File: " + temp.getAbsolutePath() + " could not be created.");
+        }
+      }
+      if (!temp.exists()) {
+        throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist.");
+      } else {
+        temp.deleteOnExit();
+      }
+
+      // copy the library from the Jar file to the temp destination
+      Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING);
+
+      // return the temporary library file
+      return temp;
+
+    } finally {
+      if (is != null) {
+        is.close();
+      }
+    }
+  }
+
+  /**
+   * Private constructor to disallow instantiation
+   */
+  private NativeLibraryLoader() {
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/OperationStage.java b/src/rocksdb/java/src/main/java/org/rocksdb/OperationStage.java
new file mode 100644
index 000000000..6ac0a15a2
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/OperationStage.java
@@ -0,0 +1,59 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The operation stage.
+ */
+public enum OperationStage {
+  STAGE_UNKNOWN((byte)0x0),
+  STAGE_FLUSH_RUN((byte)0x1),
+  STAGE_FLUSH_WRITE_L0((byte)0x2),
+  STAGE_COMPACTION_PREPARE((byte)0x3),
+  STAGE_COMPACTION_RUN((byte)0x4),
+  STAGE_COMPACTION_PROCESS_KV((byte)0x5),
+  STAGE_COMPACTION_INSTALL((byte)0x6),
+  STAGE_COMPACTION_SYNC_FILE((byte)0x7),
+  STAGE_PICK_MEMTABLES_TO_FLUSH((byte)0x8),
+  STAGE_MEMTABLE_ROLLBACK((byte)0x9),
+  STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS((byte)0xA);
+
+  private final byte value;
+
+  OperationStage(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value.
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the Operation stage from the internal representation value.
+   *
+   * @param value the internal representation value.
+   *
+   * @return the operation stage
+   *
+   * @throws IllegalArgumentException if the value does not match
+   *     an OperationStage
+   */
+  static OperationStage fromValue(final byte value)
+      throws IllegalArgumentException {
+    for (final OperationStage threadType : OperationStage.values()) {
+      if (threadType.value == value) {
+        return threadType;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Unknown value for OperationStage: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/OperationType.java b/src/rocksdb/java/src/main/java/org/rocksdb/OperationType.java
new file mode 100644
index 000000000..7cc9b65cd
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/OperationType.java
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The type used to refer to a thread operation.
+ *
+ * A thread operation describes high-level action of a thread,
+ * examples include compaction and flush.
+ */
+public enum OperationType {
+  OP_UNKNOWN((byte)0x0),
+  OP_COMPACTION((byte)0x1),
+  OP_FLUSH((byte)0x2);
+
+  private final byte value;
+
+  OperationType(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value.
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the Operation type from the internal representation value.
+   *
+   * @param value the internal representation value.
+   *
+   * @return the operation type
+   *
+   * @throws IllegalArgumentException if the value does not match
+   *     an OperationType
+   */
+  static OperationType fromValue(final byte value)
+      throws IllegalArgumentException {
+    for (final OperationType threadType : OperationType.values()) {
+      if (threadType.value == value) {
+        return threadType;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Unknown value for OperationType: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java b/src/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java
new file mode 100644
index 000000000..5a2e1f3ed
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * Database with Transaction support.
+ */
+public class OptimisticTransactionDB extends RocksDB
+    implements TransactionalDB<OptimisticTransactionOptions> {
+
+  /**
+   * Private constructor.
+   *
+   * @param nativeHandle The native handle of the C++ OptimisticTransactionDB
+   *     object
+   */
+  private OptimisticTransactionDB(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Open an OptimisticTransactionDB similar to
+   * {@link RocksDB#open(Options, String)}.
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param path the path to the rocksdb.
+   *
+   * @return a {@link OptimisticTransactionDB} instance on success, null if the
+   * specified {@link OptimisticTransactionDB} can not be opened.
+   *
+   * @throws RocksDBException if an error occurs whilst opening the database.
+   */
+  public static OptimisticTransactionDB open(final Options options,
+      final String path) throws RocksDBException {
+    final OptimisticTransactionDB otdb = new OptimisticTransactionDB(open(
+        options.nativeHandle_, path));
+
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+    otdb.storeOptionsInstance(options);
+
+    return otdb;
+  }
+
+  /**
+   * Open an OptimisticTransactionDB similar to
+   * {@link RocksDB#open(DBOptions, String, List, List)}.
+   *
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance.
+   * @param path the path to the rocksdb.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *
+   * @return a {@link OptimisticTransactionDB} instance on success, null if the
+   *     specified {@link OptimisticTransactionDB} can not be opened.
+   *
+   * @throws RocksDBException if an error occurs whilst opening the database.
+   */
+  public static OptimisticTransactionDB open(final DBOptions dbOptions,
+      final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+
+    final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][];
+    final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()];
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors
+          .get(i);
+      cfNames[i] = cfDescriptor.getName();
+      cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_;
+    }
+
+    final long[] handles = open(dbOptions.nativeHandle_, path, cfNames,
+        cfOptionHandles);
+    final OptimisticTransactionDB otdb =
+        new OptimisticTransactionDB(handles[0]);
+
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+    otdb.storeOptionsInstance(dbOptions);
+
+    for (int i = 1; i < handles.length; i++) {
+      columnFamilyHandles.add(new ColumnFamilyHandle(otdb, handles[i]));
+    }
+
+    return otdb;
+  }
+
+
+  /**
+   * This is similar to {@link #close()} except that it
+   * throws an exception if any error occurs.
+   *
+   * This will not fsync the WAL files.
+   * If syncing is required, the caller must first call {@link #syncWal()}
+   * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch
+   * with {@link WriteOptions#setSync(boolean)} set to true.
+   *
+   * See also {@link #close()}.
+   *
+   * @throws RocksDBException if an error occurs whilst closing.
+   */
+  public void closeE() throws RocksDBException {
+    if (owningHandle_.compareAndSet(true, false)) {
+      try {
+        closeDatabase(nativeHandle_);
+      } finally {
+        disposeInternal();
+      }
+    }
+  }
+
+  /**
+   * This is similar to {@link #closeE()} except that it
+   * silently ignores any errors.
+   *
+   * This will not fsync the WAL files.
+   * If syncing is required, the caller must first call {@link #syncWal()}
+   * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch
+   * with {@link WriteOptions#setSync(boolean)} set to true.
+   *
+   * See also {@link #close()}.
+   */
+  @Override
+  public void close() {
+    if (owningHandle_.compareAndSet(true, false)) {
+      try {
+        closeDatabase(nativeHandle_);
+      } catch (final RocksDBException e) {
+        // silently ignore the error report
+      } finally {
+        disposeInternal();
+      }
+    }
+  }
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions) {
+    return new Transaction(this, beginTransaction(nativeHandle_,
+        writeOptions.nativeHandle_));
+  }
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions,
+      final OptimisticTransactionOptions optimisticTransactionOptions) {
+    return new Transaction(this, beginTransaction(nativeHandle_,
+        writeOptions.nativeHandle_,
+        optimisticTransactionOptions.nativeHandle_));
+  }
+
+  // TODO(AR) consider having beingTransaction(... oldTransaction) set a
+  // reference count inside Transaction, so that we can always call
+  // Transaction#close but the object is only disposed when there are as many
+  // closes as beginTransaction. Makes the try-with-resources paradigm easier for
+  // java developers
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions,
+      final Transaction oldTransaction) {
+    final long jtxn_handle = beginTransaction_withOld(nativeHandle_,
+        writeOptions.nativeHandle_, oldTransaction.nativeHandle_);
+
+    // RocksJava relies on the assumption that
+    // we do not allocate a new Transaction object
+    // when providing an old_txn
+    assert(jtxn_handle == oldTransaction.nativeHandle_);
+
+    return oldTransaction;
+  }
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions,
+      final OptimisticTransactionOptions optimisticTransactionOptions,
+      final Transaction oldTransaction) {
+    final long jtxn_handle = beginTransaction_withOld(nativeHandle_,
+        writeOptions.nativeHandle_, optimisticTransactionOptions.nativeHandle_,
+        oldTransaction.nativeHandle_);
+
+    // RocksJava relies on the assumption that
+    // we do not allocate a new Transaction object
+    // when providing an old_txn
+    assert(jtxn_handle == oldTransaction.nativeHandle_);
+
+    return oldTransaction;
+  }
+
+  /**
+   * Get the underlying database that was opened.
+   *
+   * @return The underlying database that was opened.
+   */
+  public RocksDB getBaseDB() {
+    final RocksDB db = new RocksDB(getBaseDB(nativeHandle_));
+    db.disOwnNativeHandle();
+    return db;
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+
+  protected static native long open(final long optionsHandle,
+      final String path) throws RocksDBException;
+  protected static native long[] open(final long handle, final String path,
+      final byte[][] columnFamilyNames, final long[] columnFamilyOptions);
+  private native static void closeDatabase(final long handle)
+      throws RocksDBException;
+  private native long beginTransaction(final long handle,
+      final long writeOptionsHandle);
+  private native long beginTransaction(final long handle,
+      final long writeOptionsHandle,
+      final long optimisticTransactionOptionsHandle);
+  private native long beginTransaction_withOld(final long handle,
+      final long writeOptionsHandle, final long oldTransactionHandle);
+  private native long beginTransaction_withOld(final long handle,
+      final long writeOptionsHandle,
+      final long optimisticTransactionOptionsHandle,
+      final long oldTransactionHandle);
+  private native long getBaseDB(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java
new file mode 100644
index 000000000..250edf806
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class OptimisticTransactionOptions extends RocksObject
+    implements TransactionalOptions<OptimisticTransactionOptions> {
+
+  public OptimisticTransactionOptions() {
+    super(newOptimisticTransactionOptions());
+  }
+
+  @Override
+  public boolean isSetSnapshot() {
+    assert(isOwningHandle());
+    return isSetSnapshot(nativeHandle_);
+  }
+
+  @Override
+  public OptimisticTransactionOptions setSetSnapshot(
+      final boolean setSnapshot) {
+    assert(isOwningHandle());
+    setSetSnapshot(nativeHandle_, setSnapshot);
+    return this;
+  }
+
+  /**
+   * Should be set if the DB has a non-default comparator.
+   * See comment in
+   * {@link WriteBatchWithIndex#WriteBatchWithIndex(AbstractComparator, int, boolean)}
+   * constructor.
+   *
+   * @param comparator The comparator to use for the transaction.
+   *
+   * @return this OptimisticTransactionOptions instance
+   */
+  public OptimisticTransactionOptions setComparator(
+      final AbstractComparator comparator) {
+    assert(isOwningHandle());
+    setComparator(nativeHandle_, comparator.nativeHandle_);
+    return this;
+  }
+
+  private native static long newOptimisticTransactionOptions();
+  private native boolean isSetSnapshot(final long handle);
+  private native void setSetSnapshot(final long handle,
+      final boolean setSnapshot);
+  private native void setComparator(final long handle,
+      final long comparatorHandle);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/OptionString.java b/src/rocksdb/java/src/main/java/org/rocksdb/OptionString.java
new file mode 100644
index 000000000..7f97827cb
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/OptionString.java
@@ -0,0 +1,256 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+public class OptionString {
+  private final static char kvPairSeparator = ';';
+  private final static char kvSeparator = '=';
+  private final static char complexValueBegin = '{';
+  private final static char complexValueEnd = '}';
+  private final static char wrappedValueBegin = '{';
+  private final static char wrappedValueEnd = '}';
+  private final static char arrayValueSeparator = ':';
+
+  static class Value {
+    final List<String> list;
+    final List<Entry> complex;
+
+    public Value(final List<String> list, final List<Entry> complex) {
+      this.list = list;
+      this.complex = complex;
+    }
+
+    public boolean isList() {
+      return (this.list != null && this.complex == null);
+    }
+
+    public static Value fromList(final List<String> list) {
+      return new Value(list, null);
+    }
+
+    public static Value fromComplex(final List<Entry> complex) {
+      return new Value(null, complex);
+    }
+
+    public String toString() {
+      final StringBuilder sb = new StringBuilder();
+      if (isList()) {
+        for (final String item : list) {
+          sb.append(item).append(arrayValueSeparator);
+        }
+        // remove the final separator
+        if (sb.length() > 0)
+          sb.delete(sb.length() - 1, sb.length());
+      } else {
+        sb.append('[');
+        for (final Entry entry : complex) {
+          sb.append(entry.toString()).append(';');
+        }
+        sb.append(']');
+      }
+      return sb.toString();
+    }
+  }
+
+  static class Entry {
+    public final String key;
+    public final Value value;
+
+    private Entry(final String key, final Value value) {
+      this.key = key;
+      this.value = value;
+    }
+
+    public String toString() {
+      return "" + key + "=" + value;
+    }
+  }
+
+  static class Parser {
+    static class Exception extends RuntimeException {
+      public Exception(final String s) {
+        super(s);
+      }
+    }
+
+    final String str;
+    final StringBuilder sb;
+
+    private Parser(final String str) {
+      this.str = str;
+      this.sb = new StringBuilder(str);
+    }
+
+    private void exception(final String message) {
+      final int pos = str.length() - sb.length();
+      final int before = Math.min(pos, 64);
+      final int after = Math.min(64, str.length() - pos);
+      final String here =
+          str.substring(pos - before, pos) + "__*HERE*__" + str.substring(pos, pos + after);
+
+      throw new Parser.Exception(message + " at [" + here + "]");
+    }
+
+    private void skipWhite() {
+      while (sb.length() > 0 && Character.isWhitespace(sb.charAt(0))) {
+        sb.delete(0, 1);
+      }
+    }
+
+    private char first() {
+      if (sb.length() == 0)
+        exception("Unexpected end of input");
+      return sb.charAt(0);
+    }
+
+    private char next() {
+      if (sb.length() == 0)
+        exception("Unexpected end of input");
+      final char c = sb.charAt(0);
+      sb.delete(0, 1);
+      return c;
+    }
+
+    private boolean hasNext() {
+      return (sb.length() > 0);
+    }
+
+    private boolean is(final char c) {
+      return (sb.length() > 0 && sb.charAt(0) == c);
+    }
+
+    private boolean isKeyChar() {
+      if (!hasNext())
+        return false;
+      final char c = first();
+      return (Character.isAlphabetic(c) || Character.isDigit(c) || "_".indexOf(c) != -1);
+    }
+
+    private boolean isValueChar() {
+      if (!hasNext())
+        return false;
+      final char c = first();
+      return (Character.isAlphabetic(c) || Character.isDigit(c) || "_-+.[]".indexOf(c) != -1);
+    }
+
+    private String parseKey() {
+      final StringBuilder sbKey = new StringBuilder();
+      sbKey.append(next());
+      while (isKeyChar()) {
+        sbKey.append(next());
+      }
+
+      return sbKey.toString();
+    }
+
+    private String parseSimpleValue() {
+      if (is(wrappedValueBegin)) {
+        next();
+        final String result = parseSimpleValue();
+        if (!is(wrappedValueEnd)) {
+          exception("Expected to end a wrapped value with " + wrappedValueEnd);
+        }
+        next();
+
+        return result;
+      } else {
+        final StringBuilder sbValue = new StringBuilder();
+        while (isValueChar()) sbValue.append(next());
+
+        return sbValue.toString();
+      }
+    }
+
+    private List<String> parseList() {
+      final List<String> list = new ArrayList<>(1);
+      while (true) {
+        list.add(parseSimpleValue());
+        if (!is(arrayValueSeparator))
+          break;
+
+        next();
+      }
+
+      return list;
+    }
+
+    private Entry parseOption() {
+      skipWhite();
+      if (!isKeyChar()) {
+        exception("No valid key character(s) for key in key=value ");
+      }
+      final String key = parseKey();
+      skipWhite();
+      if (is(kvSeparator)) {
+        next();
+      } else {
+        exception("Expected = separating key and value");
+      }
+      skipWhite();
+      final Value value = parseValue();
+      return new Entry(key, value);
+    }
+
+    private Value parseValue() {
+      skipWhite();
+      if (is(complexValueBegin)) {
+        next();
+        skipWhite();
+        final Value value = Value.fromComplex(parseComplex());
+        skipWhite();
+        if (is(complexValueEnd)) {
+          next();
+          skipWhite();
+        } else {
+          exception("Expected } ending complex value");
+        }
+        return value;
+      } else if (isValueChar()) {
+        return Value.fromList(parseList());
+      }
+
+      exception("No valid value character(s) for value in key=value");
+      return null;
+    }
+
+    private List<Entry> parseComplex() {
+      final List<Entry> entries = new ArrayList<>();
+
+      skipWhite();
+      if (hasNext()) {
+        entries.add(parseOption());
+        skipWhite();
+        while (is(kvPairSeparator)) {
+          next();
+          skipWhite();
+          if (!isKeyChar()) {
+            // the separator was a terminator
+            break;
+          }
+          entries.add(parseOption());
+          skipWhite();
+        }
+      }
+      return entries;
+    }
+
+    public static List<Entry> parse(final String str) {
+      Objects.requireNonNull(str);
+
+      final Parser parser = new Parser(str);
+      final List<Entry> result = parser.parseComplex();
+      if (parser.hasNext()) {
+        parser.exception("Unexpected end of parsing ");
+      }
+
+      return result;
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Options.java b/src/rocksdb/java/src/main/java/org/rocksdb/Options.java
new file mode 100644
index 000000000..1f1e5507a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Options.java
@@ -0,0 +1,2578 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.file.Paths;
+import java.util.*;
+
+/**
+ * Options to control the behavior of a database.  It will be used
+ * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()).
+ *
+ * As a descendent of {@link AbstractNativeReference}, this class is {@link AutoCloseable}
+ * and will be automatically released if opened in the preamble of a try with resources block.
+ */
+public class Options extends RocksObject
+    implements DBOptionsInterface<Options>,
+    MutableDBOptionsInterface<Options>,
+    ColumnFamilyOptionsInterface<Options>,
+    MutableColumnFamilyOptionsInterface<Options> {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Converts the input properties into a Options-style formatted string
+   * @param properties   The set of properties to convert
+   * @return The Options-style representation of those properties.
+   */
+  public static String getOptionStringFromProps(final Properties properties) {
+    if (properties == null || properties.size() == 0) {
+      throw new IllegalArgumentException("Properties value must contain at least one value.");
+    }
+    StringBuilder stringBuilder = new StringBuilder();
+    for (final String name : properties.stringPropertyNames()) {
+      stringBuilder.append(name);
+      stringBuilder.append("=");
+      stringBuilder.append(properties.getProperty(name));
+      stringBuilder.append(";");
+    }
+    return stringBuilder.toString();
+  }
+
+  /**
+   * Construct options for opening a RocksDB.
+   *
+   * This constructor will create (by allocating a block of memory)
+   * an {@code rocksdb::Options} in the c++ side.
+   */
+  public Options() {
+    super(newOptions());
+    env_ = Env.getDefault();
+  }
+
+  /**
+   * Construct options for opening a RocksDB. Reusing database options
+   * and column family options.
+   *
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance
+   * @param columnFamilyOptions {@link org.rocksdb.ColumnFamilyOptions}
+   *     instance
+   */
+  public Options(final DBOptions dbOptions,
+      final ColumnFamilyOptions columnFamilyOptions) {
+    super(newOptions(dbOptions.nativeHandle_,
+        columnFamilyOptions.nativeHandle_));
+    env_ = Env.getDefault();
+  }
+
+  /**
+   * Copy constructor for ColumnFamilyOptions.
+   *
+   * NOTE: This does a shallow copy, which means comparator, merge_operator
+   * and other pointers will be cloned!
+   *
+   * @param other The Options to copy.
+   */
+  public Options(Options other) {
+    super(copyOptions(other.nativeHandle_));
+    this.env_ = other.env_;
+    this.memTableConfig_ = other.memTableConfig_;
+    this.tableFormatConfig_ = other.tableFormatConfig_;
+    this.rateLimiter_ = other.rateLimiter_;
+    this.comparator_ = other.comparator_;
+    this.compactionFilter_ = other.compactionFilter_;
+    this.compactionFilterFactory_ = other.compactionFilterFactory_;
+    this.compactionOptionsUniversal_ = other.compactionOptionsUniversal_;
+    this.compactionOptionsFIFO_ = other.compactionOptionsFIFO_;
+    this.compressionOptions_ = other.compressionOptions_;
+    this.rowCache_ = other.rowCache_;
+    this.writeBufferManager_ = other.writeBufferManager_;
+    this.compactionThreadLimiter_ = other.compactionThreadLimiter_;
+    this.bottommostCompressionOptions_ = other.bottommostCompressionOptions_;
+    this.walFilter_ = other.walFilter_;
+    this.sstPartitionerFactory_ = other.sstPartitionerFactory_;
+  }
+
+  @Override
+  public Options setIncreaseParallelism(final int totalThreads) {
+    assert(isOwningHandle());
+    setIncreaseParallelism(nativeHandle_, totalThreads);
+    return this;
+  }
+
+  @Override
+  public Options setCreateIfMissing(final boolean flag) {
+    assert(isOwningHandle());
+    setCreateIfMissing(nativeHandle_, flag);
+    return this;
+  }
+
+  @Override
+  public Options setCreateMissingColumnFamilies(final boolean flag) {
+    assert(isOwningHandle());
+    setCreateMissingColumnFamilies(nativeHandle_, flag);
+    return this;
+  }
+
+  @Override
+  public Options setEnv(final Env env) {
+    assert(isOwningHandle());
+    setEnv(nativeHandle_, env.nativeHandle_);
+    env_ = env;
+    return this;
+  }
+
+  @Override
+  public Env getEnv() {
+    return env_;
+  }
+
+  /**
+   * <p>Set appropriate parameters for bulk loading.
+   * The reason that this is a function that returns "this" instead of a
+   * constructor is to enable chaining of multiple similar calls in the future.
+   * </p>
+   *
+   * <p>All data will be in level 0 without any automatic compaction.
+   * It's recommended to manually call CompactRange(NULL, NULL) before reading
+   * from the database, because otherwise the read can be very slow.</p>
+   *
+   * @return the instance of the current Options.
+   */
+  public Options prepareForBulkLoad() {
+    prepareForBulkLoad(nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public boolean createIfMissing() {
+    assert(isOwningHandle());
+    return createIfMissing(nativeHandle_);
+  }
+
+  @Override
+  public boolean createMissingColumnFamilies() {
+    assert(isOwningHandle());
+    return createMissingColumnFamilies(nativeHandle_);
+  }
+
+  @Override
+  public Options oldDefaults(final int majorVersion, final int minorVersion) {
+    oldDefaults(nativeHandle_, majorVersion, minorVersion);
+    return this;
+  }
+
+  @Override
+  public Options optimizeForSmallDb() {
+    optimizeForSmallDb(nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Options optimizeForSmallDb(final Cache cache) {
+    optimizeForSmallDb(nativeHandle_, cache.getNativeHandle());
+    return this;
+  }
+
+  @Override
+  public Options optimizeForPointLookup(
+      long blockCacheSizeMb) {
+    optimizeForPointLookup(nativeHandle_,
+        blockCacheSizeMb);
+    return this;
+  }
+
+  @Override
+  public Options optimizeLevelStyleCompaction() {
+    optimizeLevelStyleCompaction(nativeHandle_,
+        DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET);
+    return this;
+  }
+
+  @Override
+  public Options optimizeLevelStyleCompaction(
+      long memtableMemoryBudget) {
+    optimizeLevelStyleCompaction(nativeHandle_,
+        memtableMemoryBudget);
+    return this;
+  }
+
+  @Override
+  public Options optimizeUniversalStyleCompaction() {
+    optimizeUniversalStyleCompaction(nativeHandle_,
+        DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET);
+    return this;
+  }
+
+  @Override
+  public Options optimizeUniversalStyleCompaction(
+      final long memtableMemoryBudget) {
+    optimizeUniversalStyleCompaction(nativeHandle_,
+        memtableMemoryBudget);
+    return this;
+  }
+
+  @Override
+  public Options setComparator(final BuiltinComparator builtinComparator) {
+    assert(isOwningHandle());
+    setComparatorHandle(nativeHandle_, builtinComparator.ordinal());
+    return this;
+  }
+
+  @Override
+  public Options setComparator(
+      final AbstractComparator comparator) {
+    assert(isOwningHandle());
+    setComparatorHandle(nativeHandle_, comparator.nativeHandle_,
+            comparator.getComparatorType().getValue());
+    comparator_ = comparator;
+    return this;
+  }
+
+  @Override
+  public Options setMergeOperatorName(final String name) {
+    assert(isOwningHandle());
+    if (name == null) {
+      throw new IllegalArgumentException(
+          "Merge operator name must not be null.");
+    }
+    setMergeOperatorName(nativeHandle_, name);
+    return this;
+  }
+
+  @Override
+  public Options setMergeOperator(final MergeOperator mergeOperator) {
+    setMergeOperator(nativeHandle_, mergeOperator.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Options setCompactionFilter(
+          final AbstractCompactionFilter<? extends AbstractSlice<?>>
+                  compactionFilter) {
+    setCompactionFilterHandle(nativeHandle_, compactionFilter.nativeHandle_);
+    compactionFilter_ = compactionFilter;
+    return this;
+  }
+
+  @Override
+  public AbstractCompactionFilter<? extends AbstractSlice<?>> compactionFilter() {
+    assert (isOwningHandle());
+    return compactionFilter_;
+  }
+
+  @Override
+  public Options setCompactionFilterFactory(final AbstractCompactionFilterFactory<? extends AbstractCompactionFilter<?>> compactionFilterFactory) {
+    assert (isOwningHandle());
+    setCompactionFilterFactoryHandle(nativeHandle_, compactionFilterFactory.nativeHandle_);
+    compactionFilterFactory_ = compactionFilterFactory;
+    return this;
+  }
+
+  @Override
+  public AbstractCompactionFilterFactory<? extends AbstractCompactionFilter<?>> compactionFilterFactory() {
+    assert (isOwningHandle());
+    return compactionFilterFactory_;
+  }
+
+  @Override
+  public Options setWriteBufferSize(final long writeBufferSize) {
+    assert(isOwningHandle());
+    setWriteBufferSize(nativeHandle_, writeBufferSize);
+    return this;
+  }
+
+  @Override
+  public long writeBufferSize()  {
+    assert(isOwningHandle());
+    return writeBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxWriteBufferNumber(final int maxWriteBufferNumber) {
+    assert(isOwningHandle());
+    setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber);
+    return this;
+  }
+
+  @Override
+  public int maxWriteBufferNumber() {
+    assert(isOwningHandle());
+    return maxWriteBufferNumber(nativeHandle_);
+  }
+
+  @Override
+  public boolean errorIfExists() {
+    assert(isOwningHandle());
+    return errorIfExists(nativeHandle_);
+  }
+
+  @Override
+  public Options setErrorIfExists(final boolean errorIfExists) {
+    assert(isOwningHandle());
+    setErrorIfExists(nativeHandle_, errorIfExists);
+    return this;
+  }
+
+  @Override
+  public boolean paranoidChecks() {
+    assert(isOwningHandle());
+    return paranoidChecks(nativeHandle_);
+  }
+
+  @Override
+  public Options setParanoidChecks(final boolean paranoidChecks) {
+    assert(isOwningHandle());
+    setParanoidChecks(nativeHandle_, paranoidChecks);
+    return this;
+  }
+
+  @Override
+  public int maxOpenFiles() {
+    assert(isOwningHandle());
+    return maxOpenFiles(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxFileOpeningThreads(final int maxFileOpeningThreads) {
+    assert(isOwningHandle());
+    setMaxFileOpeningThreads(nativeHandle_, maxFileOpeningThreads);
+    return this;
+  }
+
+  @Override
+  public int maxFileOpeningThreads() {
+    assert(isOwningHandle());
+    return maxFileOpeningThreads(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxTotalWalSize(final long maxTotalWalSize) {
+    assert(isOwningHandle());
+    setMaxTotalWalSize(nativeHandle_, maxTotalWalSize);
+    return this;
+  }
+
+  @Override
+  public long maxTotalWalSize() {
+    assert(isOwningHandle());
+    return maxTotalWalSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxOpenFiles(final int maxOpenFiles) {
+    assert(isOwningHandle());
+    setMaxOpenFiles(nativeHandle_, maxOpenFiles);
+    return this;
+  }
+
+  @Override
+  public boolean useFsync() {
+    assert(isOwningHandle());
+    return useFsync(nativeHandle_);
+  }
+
+  @Override
+  public Options setUseFsync(final boolean useFsync) {
+    assert(isOwningHandle());
+    setUseFsync(nativeHandle_, useFsync);
+    return this;
+  }
+
+  @Override
+  public Options setDbPaths(final Collection<DbPath> dbPaths) {
+    assert(isOwningHandle());
+
+    final int len = dbPaths.size();
+    final String paths[] = new String[len];
+    final long targetSizes[] = new long[len];
+
+    int i = 0;
+    for(final DbPath dbPath : dbPaths) {
+      paths[i] = dbPath.path.toString();
+      targetSizes[i] = dbPath.targetSize;
+      i++;
+    }
+    setDbPaths(nativeHandle_, paths, targetSizes);
+    return this;
+  }
+
+  @Override
+  public List<DbPath> dbPaths() {
+    final int len = (int)dbPathsLen(nativeHandle_);
+    if(len == 0) {
+      return Collections.emptyList();
+    } else {
+      final String paths[] = new String[len];
+      final long targetSizes[] = new long[len];
+
+      dbPaths(nativeHandle_, paths, targetSizes);
+
+      final List<DbPath> dbPaths = new ArrayList<>();
+      for(int i = 0; i < len; i++) {
+        dbPaths.add(new DbPath(Paths.get(paths[i]), targetSizes[i]));
+      }
+      return dbPaths;
+    }
+  }
+
+  @Override
+  public String dbLogDir() {
+    assert(isOwningHandle());
+    return dbLogDir(nativeHandle_);
+  }
+
+  @Override
+  public Options setDbLogDir(final String dbLogDir) {
+    assert(isOwningHandle());
+    setDbLogDir(nativeHandle_, dbLogDir);
+    return this;
+  }
+
+  @Override
+  public String walDir() {
+    assert(isOwningHandle());
+    return walDir(nativeHandle_);
+  }
+
+  @Override
+  public Options setWalDir(final String walDir) {
+    assert(isOwningHandle());
+    setWalDir(nativeHandle_, walDir);
+    return this;
+  }
+
+  @Override
+  public long deleteObsoleteFilesPeriodMicros() {
+    assert(isOwningHandle());
+    return deleteObsoleteFilesPeriodMicros(nativeHandle_);
+  }
+
+  @Override
+  public Options setDeleteObsoleteFilesPeriodMicros(
+      final long micros) {
+    assert(isOwningHandle());
+    setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros);
+    return this;
+  }
+
+  @Override
+  @Deprecated
+  public int maxBackgroundCompactions() {
+    assert(isOwningHandle());
+    return maxBackgroundCompactions(nativeHandle_);
+  }
+
+  @Override
+  public Options setStatistics(final Statistics statistics) {
+    assert(isOwningHandle());
+    setStatistics(nativeHandle_, statistics.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Statistics statistics() {
+    assert(isOwningHandle());
+    final long statisticsNativeHandle = statistics(nativeHandle_);
+    if(statisticsNativeHandle == 0) {
+      return null;
+    } else {
+      return new Statistics(statisticsNativeHandle);
+    }
+  }
+
+  @Override
+  @Deprecated
+  public Options setMaxBackgroundCompactions(
+      final int maxBackgroundCompactions) {
+    assert(isOwningHandle());
+    setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions);
+    return this;
+  }
+
+  @Override
+  public Options setMaxSubcompactions(final int maxSubcompactions) {
+    assert(isOwningHandle());
+    setMaxSubcompactions(nativeHandle_, maxSubcompactions);
+    return this;
+  }
+
+  @Override
+  public int maxSubcompactions() {
+    assert(isOwningHandle());
+    return maxSubcompactions(nativeHandle_);
+  }
+
+  @Override
+  @Deprecated
+  public int maxBackgroundFlushes() {
+    assert(isOwningHandle());
+    return maxBackgroundFlushes(nativeHandle_);
+  }
+
+  @Override
+  @Deprecated
+  public Options setMaxBackgroundFlushes(
+      final int maxBackgroundFlushes) {
+    assert(isOwningHandle());
+    setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes);
+    return this;
+  }
+
+  @Override
+  public int maxBackgroundJobs() {
+    assert(isOwningHandle());
+    return maxBackgroundJobs(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxBackgroundJobs(final int maxBackgroundJobs) {
+    assert(isOwningHandle());
+    setMaxBackgroundJobs(nativeHandle_, maxBackgroundJobs);
+    return this;
+  }
+
+  @Override
+  public long maxLogFileSize() {
+    assert(isOwningHandle());
+    return maxLogFileSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxLogFileSize(final long maxLogFileSize) {
+    assert(isOwningHandle());
+    setMaxLogFileSize(nativeHandle_, maxLogFileSize);
+    return this;
+  }
+
+  @Override
+  public long logFileTimeToRoll() {
+    assert(isOwningHandle());
+    return logFileTimeToRoll(nativeHandle_);
+  }
+
+  @Override
+  public Options setLogFileTimeToRoll(final long logFileTimeToRoll) {
+    assert(isOwningHandle());
+    setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll);
+    return this;
+  }
+
+  @Override
+  public long keepLogFileNum() {
+    assert(isOwningHandle());
+    return keepLogFileNum(nativeHandle_);
+  }
+
+  @Override
+  public Options setKeepLogFileNum(final long keepLogFileNum) {
+    assert(isOwningHandle());
+    setKeepLogFileNum(nativeHandle_, keepLogFileNum);
+    return this;
+  }
+
+
+  @Override
+  public Options setRecycleLogFileNum(final long recycleLogFileNum) {
+    assert(isOwningHandle());
+    setRecycleLogFileNum(nativeHandle_, recycleLogFileNum);
+    return this;
+  }
+
+  @Override
+  public long recycleLogFileNum() {
+    assert(isOwningHandle());
+    return recycleLogFileNum(nativeHandle_);
+  }
+
+  @Override
+  public long maxManifestFileSize() {
+    assert(isOwningHandle());
+    return maxManifestFileSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxManifestFileSize(
+      final long maxManifestFileSize) {
+    assert(isOwningHandle());
+    setMaxManifestFileSize(nativeHandle_, maxManifestFileSize);
+    return this;
+  }
+
+  @Override
+  public Options setMaxTableFilesSizeFIFO(
+    final long maxTableFilesSize) {
+    assert(maxTableFilesSize > 0); // unsigned native type
+    assert(isOwningHandle());
+    setMaxTableFilesSizeFIFO(nativeHandle_, maxTableFilesSize);
+    return this;
+  }
+
+  @Override
+  public long maxTableFilesSizeFIFO() {
+    return maxTableFilesSizeFIFO(nativeHandle_);
+  }
+
+  @Override
+  public int tableCacheNumshardbits() {
+    assert(isOwningHandle());
+    return tableCacheNumshardbits(nativeHandle_);
+  }
+
+  @Override
+  public Options setTableCacheNumshardbits(
+      final int tableCacheNumshardbits) {
+    assert(isOwningHandle());
+    setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits);
+    return this;
+  }
+
+  @Override
+  public long walTtlSeconds() {
+    assert(isOwningHandle());
+    return walTtlSeconds(nativeHandle_);
+  }
+
+  @Override
+  public Options setWalTtlSeconds(final long walTtlSeconds) {
+    assert(isOwningHandle());
+    setWalTtlSeconds(nativeHandle_, walTtlSeconds);
+    return this;
+  }
+
+  @Override
+  public long walSizeLimitMB() {
+    assert(isOwningHandle());
+    return walSizeLimitMB(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxWriteBatchGroupSizeBytes(long maxWriteBatchGroupSizeBytes) {
+    setMaxWriteBatchGroupSizeBytes(nativeHandle_, maxWriteBatchGroupSizeBytes);
+    return this;
+  }
+
+  @Override
+  public long maxWriteBatchGroupSizeBytes() {
+    assert (isOwningHandle());
+    return maxWriteBatchGroupSizeBytes(nativeHandle_);
+  }
+
+  @Override
+  public Options setWalSizeLimitMB(final long sizeLimitMB) {
+    assert(isOwningHandle());
+    setWalSizeLimitMB(nativeHandle_, sizeLimitMB);
+    return this;
+  }
+
+  @Override
+  public long manifestPreallocationSize() {
+    assert(isOwningHandle());
+    return manifestPreallocationSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setManifestPreallocationSize(final long size) {
+    assert(isOwningHandle());
+    setManifestPreallocationSize(nativeHandle_, size);
+    return this;
+  }
+
+  @Override
+  public Options setUseDirectReads(final boolean useDirectReads) {
+    assert(isOwningHandle());
+    setUseDirectReads(nativeHandle_, useDirectReads);
+    return this;
+  }
+
+  @Override
+  public boolean useDirectReads() {
+    assert(isOwningHandle());
+    return useDirectReads(nativeHandle_);
+  }
+
+  @Override
+  public Options setUseDirectIoForFlushAndCompaction(
+      final boolean useDirectIoForFlushAndCompaction) {
+    assert(isOwningHandle());
+    setUseDirectIoForFlushAndCompaction(nativeHandle_, useDirectIoForFlushAndCompaction);
+    return this;
+  }
+
+  @Override
+  public boolean useDirectIoForFlushAndCompaction() {
+    assert(isOwningHandle());
+    return useDirectIoForFlushAndCompaction(nativeHandle_);
+  }
+
+  @Override
+  public Options setAllowFAllocate(final boolean allowFAllocate) {
+    assert(isOwningHandle());
+    setAllowFAllocate(nativeHandle_, allowFAllocate);
+    return this;
+  }
+
+  @Override
+  public boolean allowFAllocate() {
+    assert(isOwningHandle());
+    return allowFAllocate(nativeHandle_);
+  }
+
+  @Override
+  public boolean allowMmapReads() {
+    assert(isOwningHandle());
+    return allowMmapReads(nativeHandle_);
+  }
+
+  @Override
+  public Options setAllowMmapReads(final boolean allowMmapReads) {
+    assert(isOwningHandle());
+    setAllowMmapReads(nativeHandle_, allowMmapReads);
+    return this;
+  }
+
+  @Override
+  public boolean allowMmapWrites() {
+    assert(isOwningHandle());
+    return allowMmapWrites(nativeHandle_);
+  }
+
+  @Override
+  public Options setAllowMmapWrites(final boolean allowMmapWrites) {
+    assert(isOwningHandle());
+    setAllowMmapWrites(nativeHandle_, allowMmapWrites);
+    return this;
+  }
+
+  @Override
+  public boolean isFdCloseOnExec() {
+    assert(isOwningHandle());
+    return isFdCloseOnExec(nativeHandle_);
+  }
+
+  @Override
+  public Options setIsFdCloseOnExec(final boolean isFdCloseOnExec) {
+    assert(isOwningHandle());
+    setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec);
+    return this;
+  }
+
+  @Override
+  public int statsDumpPeriodSec() {
+    assert(isOwningHandle());
+    return statsDumpPeriodSec(nativeHandle_);
+  }
+
+  @Override
+  public Options setStatsDumpPeriodSec(final int statsDumpPeriodSec) {
+    assert(isOwningHandle());
+    setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec);
+    return this;
+  }
+
+  @Override
+  public Options setStatsPersistPeriodSec(
+      final int statsPersistPeriodSec) {
+    assert(isOwningHandle());
+    setStatsPersistPeriodSec(nativeHandle_, statsPersistPeriodSec);
+    return this;
+  }
+
+  @Override
+  public int statsPersistPeriodSec() {
+    assert(isOwningHandle());
+    return statsPersistPeriodSec(nativeHandle_);
+  }
+
+  @Override
+  public Options setStatsHistoryBufferSize(
+      final long statsHistoryBufferSize) {
+    assert(isOwningHandle());
+    setStatsHistoryBufferSize(nativeHandle_, statsHistoryBufferSize);
+    return this;
+  }
+
+  @Override
+  public long statsHistoryBufferSize() {
+    assert(isOwningHandle());
+    return statsHistoryBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public boolean adviseRandomOnOpen() {
+    return adviseRandomOnOpen(nativeHandle_);
+  }
+
+  @Override
+  public Options setAdviseRandomOnOpen(final boolean adviseRandomOnOpen) {
+    assert(isOwningHandle());
+    setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen);
+    return this;
+  }
+
+  @Override
+  public Options setDbWriteBufferSize(final long dbWriteBufferSize) {
+    assert(isOwningHandle());
+    setDbWriteBufferSize(nativeHandle_, dbWriteBufferSize);
+    return this;
+  }
+
+  @Override
+  public Options setWriteBufferManager(final WriteBufferManager writeBufferManager) {
+    assert(isOwningHandle());
+    setWriteBufferManager(nativeHandle_, writeBufferManager.nativeHandle_);
+    this.writeBufferManager_ = writeBufferManager;
+    return this;
+  }
+
+  @Override
+  public WriteBufferManager writeBufferManager() {
+    assert(isOwningHandle());
+    return this.writeBufferManager_;
+  }
+
+    @Override
+  public long dbWriteBufferSize() {
+    assert(isOwningHandle());
+    return dbWriteBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setAccessHintOnCompactionStart(final AccessHint accessHint) {
+    assert(isOwningHandle());
+    setAccessHintOnCompactionStart(nativeHandle_, accessHint.getValue());
+    return this;
+  }
+
+  @Override
+  public AccessHint accessHintOnCompactionStart() {
+    assert(isOwningHandle());
+    return AccessHint.getAccessHint(accessHintOnCompactionStart(nativeHandle_));
+  }
+
+  @Override
+  public Options setCompactionReadaheadSize(final long compactionReadaheadSize) {
+    assert(isOwningHandle());
+    setCompactionReadaheadSize(nativeHandle_, compactionReadaheadSize);
+    return this;
+  }
+
+  @Override
+  public long compactionReadaheadSize() {
+    assert(isOwningHandle());
+    return compactionReadaheadSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setRandomAccessMaxBufferSize(final long randomAccessMaxBufferSize) {
+    assert(isOwningHandle());
+    setRandomAccessMaxBufferSize(nativeHandle_, randomAccessMaxBufferSize);
+    return this;
+  }
+
+  @Override
+  public long randomAccessMaxBufferSize() {
+    assert(isOwningHandle());
+    return randomAccessMaxBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setWritableFileMaxBufferSize(final long writableFileMaxBufferSize) {
+    assert(isOwningHandle());
+    setWritableFileMaxBufferSize(nativeHandle_, writableFileMaxBufferSize);
+    return this;
+  }
+
+  @Override
+  public long writableFileMaxBufferSize() {
+    assert(isOwningHandle());
+    return writableFileMaxBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public boolean useAdaptiveMutex() {
+    assert(isOwningHandle());
+    return useAdaptiveMutex(nativeHandle_);
+  }
+
+  @Override
+  public Options setUseAdaptiveMutex(final boolean useAdaptiveMutex) {
+    assert(isOwningHandle());
+    setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex);
+    return this;
+  }
+
+  @Override
+  public long bytesPerSync() {
+    return bytesPerSync(nativeHandle_);
+  }
+
+  @Override
+  public Options setBytesPerSync(final long bytesPerSync) {
+    assert(isOwningHandle());
+    setBytesPerSync(nativeHandle_, bytesPerSync);
+    return this;
+  }
+
+  @Override
+  public Options setWalBytesPerSync(final long walBytesPerSync) {
+    assert(isOwningHandle());
+    setWalBytesPerSync(nativeHandle_, walBytesPerSync);
+    return this;
+  }
+
+  @Override
+  public long walBytesPerSync() {
+    assert(isOwningHandle());
+    return walBytesPerSync(nativeHandle_);
+  }
+
+  @Override
+  public Options setStrictBytesPerSync(final boolean strictBytesPerSync) {
+    assert(isOwningHandle());
+    setStrictBytesPerSync(nativeHandle_, strictBytesPerSync);
+    return this;
+  }
+
+  @Override
+  public boolean strictBytesPerSync() {
+    assert(isOwningHandle());
+    return strictBytesPerSync(nativeHandle_);
+  }
+
+  @Override
+  public Options setListeners(final List<AbstractEventListener> listeners) {
+    assert (isOwningHandle());
+    setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners));
+    return this;
+  }
+
+  @Override
+  public List<AbstractEventListener> listeners() {
+    assert (isOwningHandle());
+    return Arrays.asList(eventListeners(nativeHandle_));
+  }
+
+  @Override
+  public Options setEnableThreadTracking(final boolean enableThreadTracking) {
+    assert(isOwningHandle());
+    setEnableThreadTracking(nativeHandle_, enableThreadTracking);
+    return this;
+  }
+
+  @Override
+  public boolean enableThreadTracking() {
+    assert(isOwningHandle());
+    return enableThreadTracking(nativeHandle_);
+  }
+
+  @Override
+  public Options setDelayedWriteRate(final long delayedWriteRate) {
+    assert(isOwningHandle());
+    setDelayedWriteRate(nativeHandle_, delayedWriteRate);
+    return this;
+  }
+
+  @Override
+  public long delayedWriteRate(){
+    return delayedWriteRate(nativeHandle_);
+  }
+
+  @Override
+  public Options setEnablePipelinedWrite(final boolean enablePipelinedWrite) {
+    setEnablePipelinedWrite(nativeHandle_, enablePipelinedWrite);
+    return this;
+  }
+
+  @Override
+  public boolean enablePipelinedWrite() {
+    return enablePipelinedWrite(nativeHandle_);
+  }
+
+  @Override
+  public Options setUnorderedWrite(final boolean unorderedWrite) {
+    setUnorderedWrite(nativeHandle_, unorderedWrite);
+    return this;
+  }
+
+  @Override
+  public boolean unorderedWrite() {
+    return unorderedWrite(nativeHandle_);
+  }
+
+  @Override
+  public Options setAllowConcurrentMemtableWrite(
+      final boolean allowConcurrentMemtableWrite) {
+    setAllowConcurrentMemtableWrite(nativeHandle_,
+        allowConcurrentMemtableWrite);
+    return this;
+  }
+
+  @Override
+  public boolean allowConcurrentMemtableWrite() {
+    return allowConcurrentMemtableWrite(nativeHandle_);
+  }
+
+  @Override
+  public Options setEnableWriteThreadAdaptiveYield(
+      final boolean enableWriteThreadAdaptiveYield) {
+    setEnableWriteThreadAdaptiveYield(nativeHandle_,
+        enableWriteThreadAdaptiveYield);
+    return this;
+  }
+
+  @Override
+  public boolean enableWriteThreadAdaptiveYield() {
+    return enableWriteThreadAdaptiveYield(nativeHandle_);
+  }
+
+  @Override
+  public Options setWriteThreadMaxYieldUsec(final long writeThreadMaxYieldUsec) {
+    setWriteThreadMaxYieldUsec(nativeHandle_, writeThreadMaxYieldUsec);
+    return this;
+  }
+
+  @Override
+  public long writeThreadMaxYieldUsec() {
+    return writeThreadMaxYieldUsec(nativeHandle_);
+  }
+
+  @Override
+  public Options setWriteThreadSlowYieldUsec(final long writeThreadSlowYieldUsec) {
+    setWriteThreadSlowYieldUsec(nativeHandle_, writeThreadSlowYieldUsec);
+    return this;
+  }
+
+  @Override
+  public long writeThreadSlowYieldUsec() {
+    return writeThreadSlowYieldUsec(nativeHandle_);
+  }
+
+  @Override
+  public Options setSkipStatsUpdateOnDbOpen(final boolean skipStatsUpdateOnDbOpen) {
+    assert(isOwningHandle());
+    setSkipStatsUpdateOnDbOpen(nativeHandle_, skipStatsUpdateOnDbOpen);
+    return this;
+  }
+
+  @Override
+  public boolean skipStatsUpdateOnDbOpen() {
+    assert(isOwningHandle());
+    return skipStatsUpdateOnDbOpen(nativeHandle_);
+  }
+
+  @Override
+  public Options setSkipCheckingSstFileSizesOnDbOpen(boolean skipCheckingSstFileSizesOnDbOpen) {
+    setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen);
+    return this;
+  }
+
+  @Override
+  public boolean skipCheckingSstFileSizesOnDbOpen() {
+    assert (isOwningHandle());
+    return skipCheckingSstFileSizesOnDbOpen(nativeHandle_);
+  }
+
+  @Override
+  public Options setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) {
+    assert(isOwningHandle());
+    setWalRecoveryMode(nativeHandle_, walRecoveryMode.getValue());
+    return this;
+  }
+
+  @Override
+  public WALRecoveryMode walRecoveryMode() {
+    assert(isOwningHandle());
+    return WALRecoveryMode.getWALRecoveryMode(walRecoveryMode(nativeHandle_));
+  }
+
+  @Override
+  public Options setAllow2pc(final boolean allow2pc) {
+    assert(isOwningHandle());
+    setAllow2pc(nativeHandle_, allow2pc);
+    return this;
+  }
+
+  @Override
+  public boolean allow2pc() {
+    assert(isOwningHandle());
+    return allow2pc(nativeHandle_);
+  }
+
+  @Override
+  public Options setRowCache(final Cache rowCache) {
+    assert(isOwningHandle());
+    setRowCache(nativeHandle_, rowCache.nativeHandle_);
+    this.rowCache_ = rowCache;
+    return this;
+  }
+
+  @Override
+  public Cache rowCache() {
+    assert(isOwningHandle());
+    return this.rowCache_;
+  }
+
+  @Override
+  public Options setWalFilter(final AbstractWalFilter walFilter) {
+    assert(isOwningHandle());
+    setWalFilter(nativeHandle_, walFilter.nativeHandle_);
+    this.walFilter_ = walFilter;
+    return this;
+  }
+
+  @Override
+  public WalFilter walFilter() {
+    assert(isOwningHandle());
+    return this.walFilter_;
+  }
+
+  @Override
+  public Options setFailIfOptionsFileError(final boolean failIfOptionsFileError) {
+    assert(isOwningHandle());
+    setFailIfOptionsFileError(nativeHandle_, failIfOptionsFileError);
+    return this;
+  }
+
+  @Override
+  public boolean failIfOptionsFileError() {
+    assert(isOwningHandle());
+    return failIfOptionsFileError(nativeHandle_);
+  }
+
+  @Override
+  public Options setDumpMallocStats(final boolean dumpMallocStats) {
+    assert(isOwningHandle());
+    setDumpMallocStats(nativeHandle_, dumpMallocStats);
+    return this;
+  }
+
+  @Override
+  public boolean dumpMallocStats() {
+    assert(isOwningHandle());
+    return dumpMallocStats(nativeHandle_);
+  }
+
+  @Override
+  public Options setAvoidFlushDuringRecovery(final boolean avoidFlushDuringRecovery) {
+    assert(isOwningHandle());
+    setAvoidFlushDuringRecovery(nativeHandle_, avoidFlushDuringRecovery);
+    return this;
+  }
+
+  @Override
+  public boolean avoidFlushDuringRecovery() {
+    assert(isOwningHandle());
+    return avoidFlushDuringRecovery(nativeHandle_);
+  }
+
+  @Override
+  public Options setAvoidFlushDuringShutdown(final boolean avoidFlushDuringShutdown) {
+    assert(isOwningHandle());
+    setAvoidFlushDuringShutdown(nativeHandle_, avoidFlushDuringShutdown);
+    return this;
+  }
+
+  @Override
+  public boolean avoidFlushDuringShutdown() {
+    assert(isOwningHandle());
+    return avoidFlushDuringShutdown(nativeHandle_);
+  }
+
+  @Override
+  public Options setAllowIngestBehind(final boolean allowIngestBehind) {
+    assert(isOwningHandle());
+    setAllowIngestBehind(nativeHandle_, allowIngestBehind);
+    return this;
+  }
+
+  @Override
+  public boolean allowIngestBehind() {
+    assert(isOwningHandle());
+    return allowIngestBehind(nativeHandle_);
+  }
+
+  @Override
+  public Options setTwoWriteQueues(final boolean twoWriteQueues) {
+    assert(isOwningHandle());
+    setTwoWriteQueues(nativeHandle_, twoWriteQueues);
+    return this;
+  }
+
+  @Override
+  public boolean twoWriteQueues() {
+    assert(isOwningHandle());
+    return twoWriteQueues(nativeHandle_);
+  }
+
+  @Override
+  public Options setManualWalFlush(final boolean manualWalFlush) {
+    assert(isOwningHandle());
+    setManualWalFlush(nativeHandle_, manualWalFlush);
+    return this;
+  }
+
+  @Override
+  public boolean manualWalFlush() {
+    assert(isOwningHandle());
+    return manualWalFlush(nativeHandle_);
+  }
+
+  @Override
+  public MemTableConfig memTableConfig() {
+    return this.memTableConfig_;
+  }
+
+  @Override
+  public Options setMemTableConfig(final MemTableConfig config) {
+    memTableConfig_ = config;
+    setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle());
+    return this;
+  }
+
+  @Override
+  public Options setRateLimiter(final RateLimiter rateLimiter) {
+    assert(isOwningHandle());
+    rateLimiter_ = rateLimiter;
+    setRateLimiter(nativeHandle_, rateLimiter.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Options setSstFileManager(final SstFileManager sstFileManager) {
+    assert(isOwningHandle());
+    setSstFileManager(nativeHandle_, sstFileManager.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Options setLogger(final Logger logger) {
+    assert(isOwningHandle());
+    setLogger(nativeHandle_, logger.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Options setInfoLogLevel(final InfoLogLevel infoLogLevel) {
+    assert(isOwningHandle());
+    setInfoLogLevel(nativeHandle_, infoLogLevel.getValue());
+    return this;
+  }
+
+  @Override
+  public InfoLogLevel infoLogLevel() {
+    assert(isOwningHandle());
+    return InfoLogLevel.getInfoLogLevel(
+        infoLogLevel(nativeHandle_));
+  }
+
+  @Override
+  public String memTableFactoryName() {
+    assert(isOwningHandle());
+    return memTableFactoryName(nativeHandle_);
+  }
+
+  @Override
+  public TableFormatConfig tableFormatConfig() {
+    return this.tableFormatConfig_;
+  }
+
+  @Override
+  public Options setTableFormatConfig(final TableFormatConfig config) {
+    tableFormatConfig_ = config;
+    setTableFactory(nativeHandle_, config.newTableFactoryHandle());
+    return this;
+  }
+
+  @Override
+  public String tableFactoryName() {
+    assert(isOwningHandle());
+    return tableFactoryName(nativeHandle_);
+  }
+
+  @Override
+  public Options setCfPaths(final Collection<DbPath> cfPaths) {
+    assert (isOwningHandle());
+
+    final int len = cfPaths.size();
+    final String[] paths = new String[len];
+    final long[] targetSizes = new long[len];
+
+    int i = 0;
+    for (final DbPath dbPath : cfPaths) {
+      paths[i] = dbPath.path.toString();
+      targetSizes[i] = dbPath.targetSize;
+      i++;
+    }
+    setCfPaths(nativeHandle_, paths, targetSizes);
+    return this;
+  }
+
+  @Override
+  public List<DbPath> cfPaths() {
+    final int len = (int) cfPathsLen(nativeHandle_);
+
+    if (len == 0) {
+      return Collections.emptyList();
+    }
+
+    final String[] paths = new String[len];
+    final long[] targetSizes = new long[len];
+
+    cfPaths(nativeHandle_, paths, targetSizes);
+
+    final List<DbPath> cfPaths = new ArrayList<>();
+    for (int i = 0; i < len; i++) {
+      cfPaths.add(new DbPath(Paths.get(paths[i]), targetSizes[i]));
+    }
+
+    return cfPaths;
+  }
+
+  @Override
+  public Options useFixedLengthPrefixExtractor(final int n) {
+    assert(isOwningHandle());
+    useFixedLengthPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+  @Override
+  public Options useCappedPrefixExtractor(final int n) {
+    assert(isOwningHandle());
+    useCappedPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+  @Override
+  public CompressionType compressionType() {
+    return CompressionType.getCompressionType(compressionType(nativeHandle_));
+  }
+
+  @Override
+  public Options setCompressionPerLevel(
+      final List<CompressionType> compressionLevels) {
+    final byte[] byteCompressionTypes = new byte[
+        compressionLevels.size()];
+    for (int i = 0; i < compressionLevels.size(); i++) {
+      byteCompressionTypes[i] = compressionLevels.get(i).getValue();
+    }
+    setCompressionPerLevel(nativeHandle_, byteCompressionTypes);
+    return this;
+  }
+
+  @Override
+  public List<CompressionType> compressionPerLevel() {
+    final byte[] byteCompressionTypes =
+        compressionPerLevel(nativeHandle_);
+    final List<CompressionType> compressionLevels = new ArrayList<>();
+    for (final byte byteCompressionType : byteCompressionTypes) {
+      compressionLevels.add(CompressionType.getCompressionType(
+          byteCompressionType));
+    }
+    return compressionLevels;
+  }
+
+  @Override
+  public Options setCompressionType(CompressionType compressionType) {
+    setCompressionType(nativeHandle_, compressionType.getValue());
+    return this;
+  }
+
+
+  @Override
+  public Options setBottommostCompressionType(
+      final CompressionType bottommostCompressionType) {
+    setBottommostCompressionType(nativeHandle_,
+        bottommostCompressionType.getValue());
+    return this;
+  }
+
+  @Override
+  public CompressionType bottommostCompressionType() {
+    return CompressionType.getCompressionType(
+        bottommostCompressionType(nativeHandle_));
+  }
+
+  @Override
+  public Options setBottommostCompressionOptions(
+      final CompressionOptions bottommostCompressionOptions) {
+    setBottommostCompressionOptions(nativeHandle_,
+        bottommostCompressionOptions.nativeHandle_);
+    this.bottommostCompressionOptions_ = bottommostCompressionOptions;
+    return this;
+  }
+
+  @Override
+  public CompressionOptions bottommostCompressionOptions() {
+    return this.bottommostCompressionOptions_;
+  }
+
+  @Override
+  public Options setCompressionOptions(
+      final CompressionOptions compressionOptions) {
+    setCompressionOptions(nativeHandle_, compressionOptions.nativeHandle_);
+    this.compressionOptions_ = compressionOptions;
+    return this;
+  }
+
+  @Override
+  public CompressionOptions compressionOptions() {
+    return this.compressionOptions_;
+  }
+
+  @Override
+  public CompactionStyle compactionStyle() {
+    return CompactionStyle.fromValue(compactionStyle(nativeHandle_));
+  }
+
+  @Override
+  public Options setCompactionStyle(
+      final CompactionStyle compactionStyle) {
+    setCompactionStyle(nativeHandle_, compactionStyle.getValue());
+    return this;
+  }
+
+  @Override
+  public int numLevels() {
+    return numLevels(nativeHandle_);
+  }
+
+  @Override
+  public Options setNumLevels(int numLevels) {
+    setNumLevels(nativeHandle_, numLevels);
+    return this;
+  }
+
+  @Override
+  public int levelZeroFileNumCompactionTrigger() {
+    return levelZeroFileNumCompactionTrigger(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevelZeroFileNumCompactionTrigger(
+      final int numFiles) {
+    setLevelZeroFileNumCompactionTrigger(
+        nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroSlowdownWritesTrigger() {
+    return levelZeroSlowdownWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevelZeroSlowdownWritesTrigger(
+      final int numFiles) {
+    setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroStopWritesTrigger() {
+    return levelZeroStopWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevelZeroStopWritesTrigger(
+      final int numFiles) {
+    setLevelZeroStopWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public long targetFileSizeBase() {
+    return targetFileSizeBase(nativeHandle_);
+  }
+
+  @Override
+  public Options setTargetFileSizeBase(long targetFileSizeBase) {
+    setTargetFileSizeBase(nativeHandle_, targetFileSizeBase);
+    return this;
+  }
+
+  @Override
+  public int targetFileSizeMultiplier() {
+    return targetFileSizeMultiplier(nativeHandle_);
+  }
+
+  @Override
+  public Options setTargetFileSizeMultiplier(int multiplier) {
+    setTargetFileSizeMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+
+  @Override
+  public Options setMaxBytesForLevelBase(final long maxBytesForLevelBase) {
+    setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase);
+    return this;
+  }
+
+  @Override
+  public long maxBytesForLevelBase() {
+    return maxBytesForLevelBase(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevelCompactionDynamicLevelBytes(
+      final boolean enableLevelCompactionDynamicLevelBytes) {
+    setLevelCompactionDynamicLevelBytes(nativeHandle_,
+        enableLevelCompactionDynamicLevelBytes);
+    return this;
+  }
+
+  @Override
+  public boolean levelCompactionDynamicLevelBytes() {
+    return levelCompactionDynamicLevelBytes(nativeHandle_);
+  }
+
+  @Override
+  public double maxBytesForLevelMultiplier() {
+    return maxBytesForLevelMultiplier(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxBytesForLevelMultiplier(final double multiplier) {
+    setMaxBytesForLevelMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+
+  @Override
+  public long maxCompactionBytes() {
+    return maxCompactionBytes(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxCompactionBytes(final long maxCompactionBytes) {
+    setMaxCompactionBytes(nativeHandle_, maxCompactionBytes);
+    return this;
+  }
+
+  @Override
+  public long arenaBlockSize() {
+    return arenaBlockSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setArenaBlockSize(final long arenaBlockSize) {
+    setArenaBlockSize(nativeHandle_, arenaBlockSize);
+    return this;
+  }
+
+  @Override
+  public boolean disableAutoCompactions() {
+    return disableAutoCompactions(nativeHandle_);
+  }
+
+  @Override
+  public Options setDisableAutoCompactions(
+      final boolean disableAutoCompactions) {
+    setDisableAutoCompactions(nativeHandle_, disableAutoCompactions);
+    return this;
+  }
+
+  @Override
+  public long maxSequentialSkipInIterations() {
+    return maxSequentialSkipInIterations(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxSequentialSkipInIterations(
+      final long maxSequentialSkipInIterations) {
+    setMaxSequentialSkipInIterations(nativeHandle_,
+        maxSequentialSkipInIterations);
+    return this;
+  }
+
+  @Override
+  public boolean inplaceUpdateSupport() {
+    return inplaceUpdateSupport(nativeHandle_);
+  }
+
+  @Override
+  public Options setInplaceUpdateSupport(
+      final boolean inplaceUpdateSupport) {
+    setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport);
+    return this;
+  }
+
+  @Override
+  public long inplaceUpdateNumLocks() {
+    return inplaceUpdateNumLocks(nativeHandle_);
+  }
+
+  @Override
+  public Options setInplaceUpdateNumLocks(
+      final long inplaceUpdateNumLocks) {
+    setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks);
+    return this;
+  }
+
+  @Override
+  public double memtablePrefixBloomSizeRatio() {
+    return memtablePrefixBloomSizeRatio(nativeHandle_);
+  }
+
+  @Override
+  public Options setMemtablePrefixBloomSizeRatio(final double memtablePrefixBloomSizeRatio) {
+    setMemtablePrefixBloomSizeRatio(nativeHandle_, memtablePrefixBloomSizeRatio);
+    return this;
+  }
+
+  @Override
+  public double experimentalMempurgeThreshold() {
+    return experimentalMempurgeThreshold(nativeHandle_);
+  }
+
+  @Override
+  public Options setExperimentalMempurgeThreshold(final double experimentalMempurgeThreshold) {
+    setExperimentalMempurgeThreshold(nativeHandle_, experimentalMempurgeThreshold);
+    return this;
+  }
+
+  @Override
+  public boolean memtableWholeKeyFiltering() {
+    return memtableWholeKeyFiltering(nativeHandle_);
+  }
+
+  @Override
+  public Options setMemtableWholeKeyFiltering(final boolean memtableWholeKeyFiltering) {
+    setMemtableWholeKeyFiltering(nativeHandle_, memtableWholeKeyFiltering);
+    return this;
+  }
+
+  @Override
+  public int bloomLocality() {
+    return bloomLocality(nativeHandle_);
+  }
+
+  @Override
+  public Options setBloomLocality(final int bloomLocality) {
+    setBloomLocality(nativeHandle_, bloomLocality);
+    return this;
+  }
+
+  @Override
+  public long maxSuccessiveMerges() {
+    return maxSuccessiveMerges(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) {
+    setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges);
+    return this;
+  }
+
+  @Override
+  public int minWriteBufferNumberToMerge() {
+    return minWriteBufferNumberToMerge(nativeHandle_);
+  }
+
+  @Override
+  public Options setMinWriteBufferNumberToMerge(
+      final int minWriteBufferNumberToMerge) {
+    setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge);
+    return this;
+  }
+
+  @Override
+  public Options setOptimizeFiltersForHits(
+      final boolean optimizeFiltersForHits) {
+    setOptimizeFiltersForHits(nativeHandle_, optimizeFiltersForHits);
+    return this;
+  }
+
+  @Override
+  public boolean optimizeFiltersForHits() {
+    return optimizeFiltersForHits(nativeHandle_);
+  }
+
+  @Override
+  public Options
+  setMemtableHugePageSize(
+      long memtableHugePageSize) {
+    setMemtableHugePageSize(nativeHandle_,
+        memtableHugePageSize);
+    return this;
+  }
+
+  @Override
+  public long memtableHugePageSize() {
+    return memtableHugePageSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setSoftPendingCompactionBytesLimit(long softPendingCompactionBytesLimit) {
+    setSoftPendingCompactionBytesLimit(nativeHandle_,
+        softPendingCompactionBytesLimit);
+    return this;
+  }
+
+  @Override
+  public long softPendingCompactionBytesLimit() {
+    return softPendingCompactionBytesLimit(nativeHandle_);
+  }
+
+  @Override
+  public Options setHardPendingCompactionBytesLimit(long hardPendingCompactionBytesLimit) {
+    setHardPendingCompactionBytesLimit(nativeHandle_, hardPendingCompactionBytesLimit);
+    return this;
+  }
+
+  @Override
+  public long hardPendingCompactionBytesLimit() {
+    return hardPendingCompactionBytesLimit(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevel0FileNumCompactionTrigger(int level0FileNumCompactionTrigger) {
+    setLevel0FileNumCompactionTrigger(nativeHandle_, level0FileNumCompactionTrigger);
+    return this;
+  }
+
+  @Override
+  public int level0FileNumCompactionTrigger() {
+    return level0FileNumCompactionTrigger(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevel0SlowdownWritesTrigger(int level0SlowdownWritesTrigger) {
+    setLevel0SlowdownWritesTrigger(nativeHandle_, level0SlowdownWritesTrigger);
+    return this;
+  }
+
+  @Override
+  public int level0SlowdownWritesTrigger() {
+    return level0SlowdownWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevel0StopWritesTrigger(int level0StopWritesTrigger) {
+    setLevel0StopWritesTrigger(nativeHandle_, level0StopWritesTrigger);
+    return this;
+  }
+
+  @Override
+  public int level0StopWritesTrigger() {
+    return level0StopWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxBytesForLevelMultiplierAdditional(int[] maxBytesForLevelMultiplierAdditional) {
+    setMaxBytesForLevelMultiplierAdditional(nativeHandle_, maxBytesForLevelMultiplierAdditional);
+    return this;
+  }
+
+  @Override
+  public int[] maxBytesForLevelMultiplierAdditional() {
+    return maxBytesForLevelMultiplierAdditional(nativeHandle_);
+  }
+
+  @Override
+  public Options setParanoidFileChecks(boolean paranoidFileChecks) {
+    setParanoidFileChecks(nativeHandle_, paranoidFileChecks);
+    return this;
+  }
+
+  @Override
+  public boolean paranoidFileChecks() {
+    return paranoidFileChecks(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxWriteBufferNumberToMaintain(
+      final int maxWriteBufferNumberToMaintain) {
+    setMaxWriteBufferNumberToMaintain(
+        nativeHandle_, maxWriteBufferNumberToMaintain);
+    return this;
+  }
+
+  @Override
+  public int maxWriteBufferNumberToMaintain() {
+    return maxWriteBufferNumberToMaintain(nativeHandle_);
+  }
+
+  @Override
+  public Options setCompactionPriority(
+      final CompactionPriority compactionPriority) {
+    setCompactionPriority(nativeHandle_, compactionPriority.getValue());
+    return this;
+  }
+
+  @Override
+  public CompactionPriority compactionPriority() {
+    return CompactionPriority.getCompactionPriority(
+        compactionPriority(nativeHandle_));
+  }
+
+  @Override
+  public Options setReportBgIoStats(final boolean reportBgIoStats) {
+    setReportBgIoStats(nativeHandle_, reportBgIoStats);
+    return this;
+  }
+
+  @Override
+  public boolean reportBgIoStats() {
+    return reportBgIoStats(nativeHandle_);
+  }
+
+  @Override
+  public Options setTtl(final long ttl) {
+    setTtl(nativeHandle_, ttl);
+    return this;
+  }
+
+  @Override
+  public long ttl() {
+    return ttl(nativeHandle_);
+  }
+
+  @Override
+  public Options setPeriodicCompactionSeconds(final long periodicCompactionSeconds) {
+    setPeriodicCompactionSeconds(nativeHandle_, periodicCompactionSeconds);
+    return this;
+  }
+
+  @Override
+  public long periodicCompactionSeconds() {
+    return periodicCompactionSeconds(nativeHandle_);
+  }
+
+  @Override
+  public Options setCompactionOptionsUniversal(
+      final CompactionOptionsUniversal compactionOptionsUniversal) {
+    setCompactionOptionsUniversal(nativeHandle_,
+        compactionOptionsUniversal.nativeHandle_);
+    this.compactionOptionsUniversal_ = compactionOptionsUniversal;
+    return this;
+  }
+
+  @Override
+  public CompactionOptionsUniversal compactionOptionsUniversal() {
+    return this.compactionOptionsUniversal_;
+  }
+
+  @Override
+  public Options setCompactionOptionsFIFO(final CompactionOptionsFIFO compactionOptionsFIFO) {
+    setCompactionOptionsFIFO(nativeHandle_,
+        compactionOptionsFIFO.nativeHandle_);
+    this.compactionOptionsFIFO_ = compactionOptionsFIFO;
+    return this;
+  }
+
+  @Override
+  public CompactionOptionsFIFO compactionOptionsFIFO() {
+    return this.compactionOptionsFIFO_;
+  }
+
+  @Override
+  public Options setForceConsistencyChecks(final boolean forceConsistencyChecks) {
+    setForceConsistencyChecks(nativeHandle_, forceConsistencyChecks);
+    return this;
+  }
+
+  @Override
+  public boolean forceConsistencyChecks() {
+    return forceConsistencyChecks(nativeHandle_);
+  }
+
+  @Override
+  public Options setAtomicFlush(final boolean atomicFlush) {
+    setAtomicFlush(nativeHandle_, atomicFlush);
+    return this;
+  }
+
+  @Override
+  public boolean atomicFlush() {
+    return atomicFlush(nativeHandle_);
+  }
+
+  @Override
+  public Options setAvoidUnnecessaryBlockingIO(boolean avoidUnnecessaryBlockingIO) {
+    setAvoidUnnecessaryBlockingIO(nativeHandle_, avoidUnnecessaryBlockingIO);
+    return this;
+  }
+
+  @Override
+  public boolean avoidUnnecessaryBlockingIO() {
+    assert (isOwningHandle());
+    return avoidUnnecessaryBlockingIO(nativeHandle_);
+  }
+
+  @Override
+  public Options setPersistStatsToDisk(boolean persistStatsToDisk) {
+    setPersistStatsToDisk(nativeHandle_, persistStatsToDisk);
+    return this;
+  }
+
+  @Override
+  public boolean persistStatsToDisk() {
+    assert (isOwningHandle());
+    return persistStatsToDisk(nativeHandle_);
+  }
+
+  @Override
+  public Options setWriteDbidToManifest(boolean writeDbidToManifest) {
+    setWriteDbidToManifest(nativeHandle_, writeDbidToManifest);
+    return this;
+  }
+
+  @Override
+  public boolean writeDbidToManifest() {
+    assert (isOwningHandle());
+    return writeDbidToManifest(nativeHandle_);
+  }
+
+  @Override
+  public Options setLogReadaheadSize(long logReadaheadSize) {
+    setLogReadaheadSize(nativeHandle_, logReadaheadSize);
+    return this;
+  }
+
+  @Override
+  public long logReadaheadSize() {
+    assert (isOwningHandle());
+    return logReadaheadSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setBestEffortsRecovery(boolean bestEffortsRecovery) {
+    setBestEffortsRecovery(nativeHandle_, bestEffortsRecovery);
+    return this;
+  }
+
+  @Override
+  public boolean bestEffortsRecovery() {
+    assert (isOwningHandle());
+    return bestEffortsRecovery(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxBgErrorResumeCount(int maxBgerrorResumeCount) {
+    setMaxBgErrorResumeCount(nativeHandle_, maxBgerrorResumeCount);
+    return this;
+  }
+
+  @Override
+  public int maxBgerrorResumeCount() {
+    assert (isOwningHandle());
+    return maxBgerrorResumeCount(nativeHandle_);
+  }
+
+  @Override
+  public Options setBgerrorResumeRetryInterval(long bgerrorResumeRetryInterval) {
+    setBgerrorResumeRetryInterval(nativeHandle_, bgerrorResumeRetryInterval);
+    return this;
+  }
+
+  @Override
+  public long bgerrorResumeRetryInterval() {
+    assert (isOwningHandle());
+    return bgerrorResumeRetryInterval(nativeHandle_);
+  }
+
+  @Override
+  public Options setSstPartitionerFactory(SstPartitionerFactory sstPartitionerFactory) {
+    setSstPartitionerFactory(nativeHandle_, sstPartitionerFactory.nativeHandle_);
+    this.sstPartitionerFactory_ = sstPartitionerFactory;
+    return this;
+  }
+
+  @Override
+  public SstPartitionerFactory sstPartitionerFactory() {
+    return sstPartitionerFactory_;
+  }
+
+  @Override
+  public Options setCompactionThreadLimiter(final ConcurrentTaskLimiter compactionThreadLimiter) {
+    setCompactionThreadLimiter(nativeHandle_, compactionThreadLimiter.nativeHandle_);
+    this.compactionThreadLimiter_ = compactionThreadLimiter;
+    return this;
+  }
+
+  @Override
+  public ConcurrentTaskLimiter compactionThreadLimiter() {
+    assert (isOwningHandle());
+    return this.compactionThreadLimiter_;
+  }
+
+  //
+  // BEGIN options for blobs (integrated BlobDB)
+  //
+
+  @Override
+  public Options setEnableBlobFiles(final boolean enableBlobFiles) {
+    setEnableBlobFiles(nativeHandle_, enableBlobFiles);
+    return this;
+  }
+
+  @Override
+  public boolean enableBlobFiles() {
+    return enableBlobFiles(nativeHandle_);
+  }
+
+  @Override
+  public Options setMinBlobSize(final long minBlobSize) {
+    setMinBlobSize(nativeHandle_, minBlobSize);
+    return this;
+  }
+
+  @Override
+  public long minBlobSize() {
+    return minBlobSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setBlobFileSize(final long blobFileSize) {
+    setBlobFileSize(nativeHandle_, blobFileSize);
+    return this;
+  }
+
+  @Override
+  public long blobFileSize() {
+    return blobFileSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setBlobCompressionType(CompressionType compressionType) {
+    setBlobCompressionType(nativeHandle_, compressionType.getValue());
+    return this;
+  }
+
+  @Override
+  public CompressionType blobCompressionType() {
+    return CompressionType.values()[blobCompressionType(nativeHandle_)];
+  }
+
+  @Override
+  public Options setEnableBlobGarbageCollection(final boolean enableBlobGarbageCollection) {
+    setEnableBlobGarbageCollection(nativeHandle_, enableBlobGarbageCollection);
+    return this;
+  }
+
+  @Override
+  public boolean enableBlobGarbageCollection() {
+    return enableBlobGarbageCollection(nativeHandle_);
+  }
+
+  @Override
+  public Options setBlobGarbageCollectionAgeCutoff(final double blobGarbageCollectionAgeCutoff) {
+    setBlobGarbageCollectionAgeCutoff(nativeHandle_, blobGarbageCollectionAgeCutoff);
+    return this;
+  }
+
+  @Override
+  public double blobGarbageCollectionAgeCutoff() {
+    return blobGarbageCollectionAgeCutoff(nativeHandle_);
+  }
+
+  @Override
+  public Options setBlobGarbageCollectionForceThreshold(
+      final double blobGarbageCollectionForceThreshold) {
+    setBlobGarbageCollectionForceThreshold(nativeHandle_, blobGarbageCollectionForceThreshold);
+    return this;
+  }
+
+  @Override
+  public double blobGarbageCollectionForceThreshold() {
+    return blobGarbageCollectionForceThreshold(nativeHandle_);
+  }
+
+  @Override
+  public Options setBlobCompactionReadaheadSize(final long blobCompactionReadaheadSize) {
+    setBlobCompactionReadaheadSize(nativeHandle_, blobCompactionReadaheadSize);
+    return this;
+  }
+
+  @Override
+  public long blobCompactionReadaheadSize() {
+    return blobCompactionReadaheadSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setBlobFileStartingLevel(final int blobFileStartingLevel) {
+    setBlobFileStartingLevel(nativeHandle_, blobFileStartingLevel);
+    return this;
+  }
+
+  @Override
+  public int blobFileStartingLevel() {
+    return blobFileStartingLevel(nativeHandle_);
+  }
+
+  @Override
+  public Options setPrepopulateBlobCache(final PrepopulateBlobCache prepopulateBlobCache) {
+    setPrepopulateBlobCache(nativeHandle_, prepopulateBlobCache.getValue());
+    return this;
+  }
+
+  @Override
+  public PrepopulateBlobCache prepopulateBlobCache() {
+    return PrepopulateBlobCache.getPrepopulateBlobCache(prepopulateBlobCache(nativeHandle_));
+  }
+
+  //
+  // END options for blobs (integrated BlobDB)
+  //
+
+  private native static long newOptions();
+  private native static long newOptions(long dbOptHandle,
+      long cfOptHandle);
+  private native static long copyOptions(long handle);
+  @Override protected final native void disposeInternal(final long handle);
+  private native void setEnv(long optHandle, long envHandle);
+  private native void prepareForBulkLoad(long handle);
+
+  // DB native handles
+  private native void setIncreaseParallelism(long handle, int totalThreads);
+  private native void setCreateIfMissing(long handle, boolean flag);
+  private native boolean createIfMissing(long handle);
+  private native void setCreateMissingColumnFamilies(
+      long handle, boolean flag);
+  private native boolean createMissingColumnFamilies(long handle);
+  private native void setErrorIfExists(long handle, boolean errorIfExists);
+  private native boolean errorIfExists(long handle);
+  private native void setParanoidChecks(
+      long handle, boolean paranoidChecks);
+  private native boolean paranoidChecks(long handle);
+  private native void setRateLimiter(long handle,
+      long rateLimiterHandle);
+  private native void setSstFileManager(final long handle,
+      final long sstFileManagerHandle);
+  private native void setLogger(long handle,
+      long loggerHandle);
+  private native void setInfoLogLevel(long handle, byte logLevel);
+  private native byte infoLogLevel(long handle);
+  private native void setMaxOpenFiles(long handle, int maxOpenFiles);
+  private native int maxOpenFiles(long handle);
+  private native void setMaxTotalWalSize(long handle,
+      long maxTotalWalSize);
+  private native void setMaxFileOpeningThreads(final long handle,
+      final int maxFileOpeningThreads);
+  private native int maxFileOpeningThreads(final long handle);
+  private native long maxTotalWalSize(long handle);
+  private native void setStatistics(final long handle, final long statisticsHandle);
+  private native long statistics(final long handle);
+  private native boolean useFsync(long handle);
+  private native void setUseFsync(long handle, boolean useFsync);
+  private native void setDbPaths(final long handle, final String[] paths,
+      final long[] targetSizes);
+  private native long dbPathsLen(final long handle);
+  private native void dbPaths(final long handle, final String[] paths,
+      final long[] targetSizes);
+  private native void setDbLogDir(long handle, String dbLogDir);
+  private native String dbLogDir(long handle);
+  private native void setWalDir(long handle, String walDir);
+  private native String walDir(long handle);
+  private native void setDeleteObsoleteFilesPeriodMicros(
+      long handle, long micros);
+  private native long deleteObsoleteFilesPeriodMicros(long handle);
+  private native void setMaxBackgroundCompactions(
+      long handle, int maxBackgroundCompactions);
+  private native int maxBackgroundCompactions(long handle);
+  private native void setMaxSubcompactions(long handle, int maxSubcompactions);
+  private native int maxSubcompactions(long handle);
+  private native void setMaxBackgroundFlushes(
+      long handle, int maxBackgroundFlushes);
+  private native int maxBackgroundFlushes(long handle);
+  private native void setMaxBackgroundJobs(long handle, int maxMaxBackgroundJobs);
+  private native int maxBackgroundJobs(long handle);
+  private native void setMaxLogFileSize(long handle, long maxLogFileSize)
+      throws IllegalArgumentException;
+  private native long maxLogFileSize(long handle);
+  private native void setLogFileTimeToRoll(
+      long handle, long logFileTimeToRoll) throws IllegalArgumentException;
+  private native long logFileTimeToRoll(long handle);
+  private native void setKeepLogFileNum(long handle, long keepLogFileNum)
+      throws IllegalArgumentException;
+  private native long keepLogFileNum(long handle);
+  private native void setRecycleLogFileNum(long handle, long recycleLogFileNum);
+  private native long recycleLogFileNum(long handle);
+  private native void setMaxManifestFileSize(
+      long handle, long maxManifestFileSize);
+  private native long maxManifestFileSize(long handle);
+  private native void setMaxTableFilesSizeFIFO(
+      long handle, long maxTableFilesSize);
+  private native long maxTableFilesSizeFIFO(long handle);
+  private native void setTableCacheNumshardbits(
+      long handle, int tableCacheNumshardbits);
+  private native int tableCacheNumshardbits(long handle);
+  private native void setWalTtlSeconds(long handle, long walTtlSeconds);
+  private native long walTtlSeconds(long handle);
+  private native void setWalSizeLimitMB(long handle, long sizeLimitMB);
+  private native long walSizeLimitMB(long handle);
+  private static native void setMaxWriteBatchGroupSizeBytes(
+      final long handle, final long maxWriteBatchGroupSizeBytes);
+  private static native long maxWriteBatchGroupSizeBytes(final long handle);
+  private native void setManifestPreallocationSize(
+      long handle, long size) throws IllegalArgumentException;
+  private native long manifestPreallocationSize(long handle);
+  private native void setUseDirectReads(long handle, boolean useDirectReads);
+  private native boolean useDirectReads(long handle);
+  private native void setUseDirectIoForFlushAndCompaction(
+      long handle, boolean useDirectIoForFlushAndCompaction);
+  private native boolean useDirectIoForFlushAndCompaction(long handle);
+  private native void setAllowFAllocate(final long handle,
+      final boolean allowFAllocate);
+  private native boolean allowFAllocate(final long handle);
+  private native void setAllowMmapReads(
+      long handle, boolean allowMmapReads);
+  private native boolean allowMmapReads(long handle);
+  private native void setAllowMmapWrites(
+      long handle, boolean allowMmapWrites);
+  private native boolean allowMmapWrites(long handle);
+  private native void setIsFdCloseOnExec(
+      long handle, boolean isFdCloseOnExec);
+  private native boolean isFdCloseOnExec(long handle);
+  private native void setStatsDumpPeriodSec(
+      long handle, int statsDumpPeriodSec);
+  private native int statsDumpPeriodSec(long handle);
+  private native void setStatsPersistPeriodSec(
+      final long handle, final int statsPersistPeriodSec);
+  private native int statsPersistPeriodSec(
+      final long handle);
+  private native void setStatsHistoryBufferSize(
+      final long handle, final long statsHistoryBufferSize);
+  private native long statsHistoryBufferSize(
+      final long handle);
+  private native void setAdviseRandomOnOpen(
+      long handle, boolean adviseRandomOnOpen);
+  private native boolean adviseRandomOnOpen(long handle);
+  private native void setDbWriteBufferSize(final long handle,
+      final long dbWriteBufferSize);
+  private native void setWriteBufferManager(final long handle,
+      final long writeBufferManagerHandle);
+  private native long dbWriteBufferSize(final long handle);
+  private native void setAccessHintOnCompactionStart(final long handle,
+      final byte accessHintOnCompactionStart);
+  private native byte accessHintOnCompactionStart(final long handle);
+  private native void setCompactionReadaheadSize(final long handle,
+      final long compactionReadaheadSize);
+  private native long compactionReadaheadSize(final long handle);
+  private native void setRandomAccessMaxBufferSize(final long handle,
+      final long randomAccessMaxBufferSize);
+  private native long randomAccessMaxBufferSize(final long handle);
+  private native void setWritableFileMaxBufferSize(final long handle,
+      final long writableFileMaxBufferSize);
+  private native long writableFileMaxBufferSize(final long handle);
+  private native void setUseAdaptiveMutex(
+      long handle, boolean useAdaptiveMutex);
+  private native boolean useAdaptiveMutex(long handle);
+  private native void setBytesPerSync(
+      long handle, long bytesPerSync);
+  private native long bytesPerSync(long handle);
+  private native void setWalBytesPerSync(long handle, long walBytesPerSync);
+  private native long walBytesPerSync(long handle);
+  private native void setStrictBytesPerSync(
+      final long handle, final boolean strictBytesPerSync);
+  private native boolean strictBytesPerSync(
+      final long handle);
+  private static native void setEventListeners(
+      final long handle, final long[] eventListenerHandles);
+  private static native AbstractEventListener[] eventListeners(final long handle);
+  private native void setEnableThreadTracking(long handle,
+      boolean enableThreadTracking);
+  private native boolean enableThreadTracking(long handle);
+  private native void setDelayedWriteRate(long handle, long delayedWriteRate);
+  private native long delayedWriteRate(long handle);
+  private native void setEnablePipelinedWrite(final long handle,
+      final boolean pipelinedWrite);
+  private native boolean enablePipelinedWrite(final long handle);
+  private native void setUnorderedWrite(final long handle,
+      final boolean unorderedWrite);
+  private native boolean unorderedWrite(final long handle);
+  private native void setAllowConcurrentMemtableWrite(long handle,
+      boolean allowConcurrentMemtableWrite);
+  private native boolean allowConcurrentMemtableWrite(long handle);
+  private native void setEnableWriteThreadAdaptiveYield(long handle,
+      boolean enableWriteThreadAdaptiveYield);
+  private native boolean enableWriteThreadAdaptiveYield(long handle);
+  private native void setWriteThreadMaxYieldUsec(long handle,
+      long writeThreadMaxYieldUsec);
+  private native long writeThreadMaxYieldUsec(long handle);
+  private native void setWriteThreadSlowYieldUsec(long handle,
+      long writeThreadSlowYieldUsec);
+  private native long writeThreadSlowYieldUsec(long handle);
+  private native void setSkipStatsUpdateOnDbOpen(final long handle,
+      final boolean skipStatsUpdateOnDbOpen);
+  private native boolean skipStatsUpdateOnDbOpen(final long handle);
+  private static native void setSkipCheckingSstFileSizesOnDbOpen(
+      final long handle, final boolean skipChecking);
+  private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle);
+  private native void setWalRecoveryMode(final long handle,
+      final byte walRecoveryMode);
+  private native byte walRecoveryMode(final long handle);
+  private native void setAllow2pc(final long handle,
+      final boolean allow2pc);
+  private native boolean allow2pc(final long handle);
+  private native void setRowCache(final long handle,
+      final long rowCacheHandle);
+  private native void setWalFilter(final long handle,
+      final long walFilterHandle);
+  private native void setFailIfOptionsFileError(final long handle,
+      final boolean failIfOptionsFileError);
+  private native boolean failIfOptionsFileError(final long handle);
+  private native void setDumpMallocStats(final long handle,
+      final boolean dumpMallocStats);
+  private native boolean dumpMallocStats(final long handle);
+  private native void setAvoidFlushDuringRecovery(final long handle,
+      final boolean avoidFlushDuringRecovery);
+  private native boolean avoidFlushDuringRecovery(final long handle);
+  private native void setAvoidFlushDuringShutdown(final long handle,
+      final boolean avoidFlushDuringShutdown);
+  private native boolean avoidFlushDuringShutdown(final long handle);
+  private native void setAllowIngestBehind(final long handle,
+      final boolean allowIngestBehind);
+  private native boolean allowIngestBehind(final long handle);
+  private native void setTwoWriteQueues(final long handle,
+      final boolean twoWriteQueues);
+  private native boolean twoWriteQueues(final long handle);
+  private native void setManualWalFlush(final long handle,
+      final boolean manualWalFlush);
+  private native boolean manualWalFlush(final long handle);
+
+
+  // CF native handles
+  private static native void oldDefaults(
+      final long handle, final int majorVersion, final int minorVersion);
+  private native void optimizeForSmallDb(final long handle);
+  private static native void optimizeForSmallDb(final long handle, final long cacheHandle);
+  private native void optimizeForPointLookup(long handle,
+      long blockCacheSizeMb);
+  private native void optimizeLevelStyleCompaction(long handle,
+      long memtableMemoryBudget);
+  private native void optimizeUniversalStyleCompaction(long handle,
+      long memtableMemoryBudget);
+  private native void setComparatorHandle(long handle, int builtinComparator);
+  private native void setComparatorHandle(long optHandle,
+      long comparatorHandle, byte comparatorType);
+  private native void setMergeOperatorName(
+      long handle, String name);
+  private native void setMergeOperator(
+      long handle, long mergeOperatorHandle);
+  private native void setCompactionFilterHandle(
+          long handle, long compactionFilterHandle);
+  private native void setCompactionFilterFactoryHandle(
+          long handle, long compactionFilterFactoryHandle);
+  private native void setWriteBufferSize(long handle, long writeBufferSize)
+      throws IllegalArgumentException;
+  private native long writeBufferSize(long handle);
+  private native void setMaxWriteBufferNumber(
+      long handle, int maxWriteBufferNumber);
+  private native int maxWriteBufferNumber(long handle);
+  private native void setMinWriteBufferNumberToMerge(
+      long handle, int minWriteBufferNumberToMerge);
+  private native int minWriteBufferNumberToMerge(long handle);
+  private native void setCompressionType(long handle, byte compressionType);
+  private native byte compressionType(long handle);
+  private native void setCompressionPerLevel(long handle,
+      byte[] compressionLevels);
+  private native byte[] compressionPerLevel(long handle);
+  private native void setBottommostCompressionType(long handle,
+      byte bottommostCompressionType);
+  private native byte bottommostCompressionType(long handle);
+  private native void setBottommostCompressionOptions(final long handle,
+      final long bottommostCompressionOptionsHandle);
+  private native void setCompressionOptions(long handle,
+      long compressionOptionsHandle);
+  private native void useFixedLengthPrefixExtractor(
+      long handle, int prefixLength);
+  private native void useCappedPrefixExtractor(
+      long handle, int prefixLength);
+  private native void setNumLevels(
+      long handle, int numLevels);
+  private native int numLevels(long handle);
+  private native void setLevelZeroFileNumCompactionTrigger(
+      long handle, int numFiles);
+  private native int levelZeroFileNumCompactionTrigger(long handle);
+  private native void setLevelZeroSlowdownWritesTrigger(
+      long handle, int numFiles);
+  private native int levelZeroSlowdownWritesTrigger(long handle);
+  private native void setLevelZeroStopWritesTrigger(
+      long handle, int numFiles);
+  private native int levelZeroStopWritesTrigger(long handle);
+  private native void setTargetFileSizeBase(
+      long handle, long targetFileSizeBase);
+  private native long targetFileSizeBase(long handle);
+  private native void setTargetFileSizeMultiplier(
+      long handle, int multiplier);
+  private native int targetFileSizeMultiplier(long handle);
+  private native void setMaxBytesForLevelBase(
+      long handle, long maxBytesForLevelBase);
+  private native long maxBytesForLevelBase(long handle);
+  private native void setLevelCompactionDynamicLevelBytes(
+      long handle, boolean enableLevelCompactionDynamicLevelBytes);
+  private native boolean levelCompactionDynamicLevelBytes(
+      long handle);
+  private native void setMaxBytesForLevelMultiplier(long handle, double multiplier);
+  private native double maxBytesForLevelMultiplier(long handle);
+  private native void setMaxCompactionBytes(long handle, long maxCompactionBytes);
+  private native long maxCompactionBytes(long handle);
+  private native void setArenaBlockSize(
+      long handle, long arenaBlockSize) throws IllegalArgumentException;
+  private native long arenaBlockSize(long handle);
+  private native void setDisableAutoCompactions(
+      long handle, boolean disableAutoCompactions);
+  private native boolean disableAutoCompactions(long handle);
+  private native void setCompactionStyle(long handle, byte compactionStyle);
+  private native byte compactionStyle(long handle);
+  private native void setMaxSequentialSkipInIterations(
+      long handle, long maxSequentialSkipInIterations);
+  private native long maxSequentialSkipInIterations(long handle);
+  private native void setMemTableFactory(long handle, long factoryHandle);
+  private native String memTableFactoryName(long handle);
+  private native void setTableFactory(long handle, long factoryHandle);
+  private native String tableFactoryName(long handle);
+  private static native void setCfPaths(
+      final long handle, final String[] paths, final long[] targetSizes);
+  private static native long cfPathsLen(final long handle);
+  private static native void cfPaths(
+      final long handle, final String[] paths, final long[] targetSizes);
+  private native void setInplaceUpdateSupport(
+      long handle, boolean inplaceUpdateSupport);
+  private native boolean inplaceUpdateSupport(long handle);
+  private native void setInplaceUpdateNumLocks(
+      long handle, long inplaceUpdateNumLocks)
+      throws IllegalArgumentException;
+  private native long inplaceUpdateNumLocks(long handle);
+  private native void setMemtablePrefixBloomSizeRatio(
+      long handle, double memtablePrefixBloomSizeRatio);
+  private native double memtablePrefixBloomSizeRatio(long handle);
+  private native void setExperimentalMempurgeThreshold(
+      long handle, double experimentalMempurgeThreshold);
+  private native double experimentalMempurgeThreshold(long handle);
+  private native void setMemtableWholeKeyFiltering(long handle, boolean memtableWholeKeyFiltering);
+  private native boolean memtableWholeKeyFiltering(long handle);
+  private native void setBloomLocality(
+      long handle, int bloomLocality);
+  private native int bloomLocality(long handle);
+  private native void setMaxSuccessiveMerges(
+      long handle, long maxSuccessiveMerges)
+      throws IllegalArgumentException;
+  private native long maxSuccessiveMerges(long handle);
+  private native void setOptimizeFiltersForHits(long handle,
+      boolean optimizeFiltersForHits);
+  private native boolean optimizeFiltersForHits(long handle);
+  private native void setMemtableHugePageSize(long handle,
+      long memtableHugePageSize);
+  private native long memtableHugePageSize(long handle);
+  private native void setSoftPendingCompactionBytesLimit(long handle,
+      long softPendingCompactionBytesLimit);
+  private native long softPendingCompactionBytesLimit(long handle);
+  private native void setHardPendingCompactionBytesLimit(long handle,
+      long hardPendingCompactionBytesLimit);
+  private native long hardPendingCompactionBytesLimit(long handle);
+  private native void setLevel0FileNumCompactionTrigger(long handle,
+      int level0FileNumCompactionTrigger);
+  private native int level0FileNumCompactionTrigger(long handle);
+  private native void setLevel0SlowdownWritesTrigger(long handle,
+      int level0SlowdownWritesTrigger);
+  private native int level0SlowdownWritesTrigger(long handle);
+  private native void setLevel0StopWritesTrigger(long handle,
+      int level0StopWritesTrigger);
+  private native int level0StopWritesTrigger(long handle);
+  private native void setMaxBytesForLevelMultiplierAdditional(long handle,
+      int[] maxBytesForLevelMultiplierAdditional);
+  private native int[] maxBytesForLevelMultiplierAdditional(long handle);
+  private native void setParanoidFileChecks(long handle,
+      boolean paranoidFileChecks);
+  private native boolean paranoidFileChecks(long handle);
+  private native void setMaxWriteBufferNumberToMaintain(final long handle,
+      final int maxWriteBufferNumberToMaintain);
+  private native int maxWriteBufferNumberToMaintain(final long handle);
+  private native void setCompactionPriority(final long handle,
+      final byte compactionPriority);
+  private native byte compactionPriority(final long handle);
+  private native void setReportBgIoStats(final long handle,
+      final boolean reportBgIoStats);
+  private native boolean reportBgIoStats(final long handle);
+  private native void setTtl(final long handle, final long ttl);
+  private native long ttl(final long handle);
+  private native void setPeriodicCompactionSeconds(
+      final long handle, final long periodicCompactionSeconds);
+  private native long periodicCompactionSeconds(final long handle);
+  private native void setCompactionOptionsUniversal(final long handle,
+      final long compactionOptionsUniversalHandle);
+  private native void setCompactionOptionsFIFO(final long handle,
+      final long compactionOptionsFIFOHandle);
+  private native void setForceConsistencyChecks(final long handle,
+      final boolean forceConsistencyChecks);
+  private native boolean forceConsistencyChecks(final long handle);
+  private native void setAtomicFlush(final long handle,
+      final boolean atomicFlush);
+  private native boolean atomicFlush(final long handle);
+  private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle);
+  private static native void setCompactionThreadLimiter(
+      final long nativeHandle_, final long newLimiterHandle);
+  private static native void setAvoidUnnecessaryBlockingIO(
+      final long handle, final boolean avoidBlockingIO);
+  private static native boolean avoidUnnecessaryBlockingIO(final long handle);
+  private static native void setPersistStatsToDisk(
+      final long handle, final boolean persistStatsToDisk);
+  private static native boolean persistStatsToDisk(final long handle);
+  private static native void setWriteDbidToManifest(
+      final long handle, final boolean writeDbidToManifest);
+  private static native boolean writeDbidToManifest(final long handle);
+  private static native void setLogReadaheadSize(final long handle, final long logReadaheadSize);
+  private static native long logReadaheadSize(final long handle);
+  private static native void setBestEffortsRecovery(
+      final long handle, final boolean bestEffortsRecovery);
+  private static native boolean bestEffortsRecovery(final long handle);
+  private static native void setMaxBgErrorResumeCount(
+      final long handle, final int maxBgerrorRecumeCount);
+  private static native int maxBgerrorResumeCount(final long handle);
+  private static native void setBgerrorResumeRetryInterval(
+      final long handle, final long bgerrorResumeRetryInterval);
+  private static native long bgerrorResumeRetryInterval(final long handle);
+
+  private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles);
+  private native boolean enableBlobFiles(final long nativeHandle_);
+  private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize);
+  private native long minBlobSize(final long nativeHandle_);
+  private native void setBlobFileSize(final long nativeHandle_, final long blobFileSize);
+  private native long blobFileSize(final long nativeHandle_);
+  private native void setBlobCompressionType(final long nativeHandle_, final byte compressionType);
+  private native byte blobCompressionType(final long nativeHandle_);
+  private native void setEnableBlobGarbageCollection(
+      final long nativeHandle_, final boolean enableBlobGarbageCollection);
+  private native boolean enableBlobGarbageCollection(final long nativeHandle_);
+  private native void setBlobGarbageCollectionAgeCutoff(
+      final long nativeHandle_, final double blobGarbageCollectionAgeCutoff);
+  private native double blobGarbageCollectionAgeCutoff(final long nativeHandle_);
+  private native void setBlobGarbageCollectionForceThreshold(
+      final long nativeHandle_, final double blobGarbageCollectionForceThreshold);
+  private native double blobGarbageCollectionForceThreshold(final long nativeHandle_);
+  private native void setBlobCompactionReadaheadSize(
+      final long nativeHandle_, final long blobCompactionReadaheadSize);
+  private native long blobCompactionReadaheadSize(final long nativeHandle_);
+  private native void setBlobFileStartingLevel(
+      final long nativeHandle_, final int blobFileStartingLevel);
+  private native int blobFileStartingLevel(final long nativeHandle_);
+  private native void setPrepopulateBlobCache(
+      final long nativeHandle_, final byte prepopulateBlobCache);
+  private native byte prepopulateBlobCache(final long nativeHandle_);
+
+  // instance variables
+  // NOTE: If you add new member variables, please update the copy constructor above!
+  private Env env_;
+  private MemTableConfig memTableConfig_;
+  private TableFormatConfig tableFormatConfig_;
+  private RateLimiter rateLimiter_;
+  private AbstractComparator comparator_;
+  private AbstractCompactionFilter<? extends AbstractSlice<?>> compactionFilter_;
+  private AbstractCompactionFilterFactory<? extends AbstractCompactionFilter<?>>
+          compactionFilterFactory_;
+  private CompactionOptionsUniversal compactionOptionsUniversal_;
+  private CompactionOptionsFIFO compactionOptionsFIFO_;
+  private CompressionOptions bottommostCompressionOptions_;
+  private CompressionOptions compressionOptions_;
+  private Cache rowCache_;
+  private WalFilter walFilter_;
+  private WriteBufferManager writeBufferManager_;
+  private SstPartitionerFactory sstPartitionerFactory_;
+  private ConcurrentTaskLimiter compactionThreadLimiter_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java b/src/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java
new file mode 100644
index 000000000..899996af9
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java
@@ -0,0 +1,184 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+
+public class OptionsUtil {
+  /**
+   * A static method to construct the DBOptions and ColumnFamilyDescriptors by
+   * loading the latest RocksDB options file stored in the specified rocksdb
+   * database.
+   *
+   * Note that the all the pointer options (except table_factory, which will
+   * be described in more details below) will be initialized with the default
+   * values.  Developers can further initialize them after this function call.
+   * Below is an example list of pointer options which will be initialized.
+   *
+   * - env
+   * - memtable_factory
+   * - compaction_filter_factory
+   * - prefix_extractor
+   * - comparator
+   * - merge_operator
+   * - compaction_filter
+   *
+   * For table_factory, this function further supports deserializing
+   * BlockBasedTableFactory and its BlockBasedTableOptions except the
+   * pointer options of BlockBasedTableOptions (flush_block_policy_factory,
+   * block_cache, and block_cache_compressed), which will be initialized with
+   * default values.  Developers can further specify these three options by
+   * casting the return value of TableFactoroy::GetOptions() to
+   * BlockBasedTableOptions and making necessary changes.
+   *
+   * @param dbPath the path to the RocksDB.
+   * @param env {@link org.rocksdb.Env} instance.
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be
+   *     filled and returned.
+   * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be
+   *    returned.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+
+  public static void loadLatestOptions(String dbPath, Env env, DBOptions dbOptions,
+      List<ColumnFamilyDescriptor> cfDescs) throws RocksDBException {
+    loadLatestOptions(dbPath, env, dbOptions, cfDescs, false);
+  }
+
+  /**
+   * @param dbPath the path to the RocksDB.
+   * @param env {@link org.rocksdb.Env} instance.
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be
+   *     filled and returned.
+   * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be
+   *     returned.
+   * @param ignoreUnknownOptions this flag can be set to true if you want to
+   *     ignore options that are from a newer version of the db, essentially for
+   *     forward compatibility.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public static void loadLatestOptions(String dbPath, Env env, DBOptions dbOptions,
+      List<ColumnFamilyDescriptor> cfDescs, boolean ignoreUnknownOptions) throws RocksDBException {
+    loadLatestOptions(
+        dbPath, env.nativeHandle_, dbOptions.nativeHandle_, cfDescs, ignoreUnknownOptions);
+  }
+
+  /**
+   * Similar to LoadLatestOptions, this function constructs the DBOptions
+   * and ColumnFamilyDescriptors based on the specified RocksDB Options file.
+   * See LoadLatestOptions above.
+   *
+   * @param dbPath the path to the RocksDB.
+   * @param configOptions {@link org.rocksdb.ConfigOptions} instance.
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be
+   *     filled and returned.
+   * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be
+   *     returned.
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public static void loadLatestOptions(ConfigOptions configOptions, String dbPath,
+      DBOptions dbOptions, List<ColumnFamilyDescriptor> cfDescs) throws RocksDBException {
+    loadLatestOptions(configOptions.nativeHandle_, dbPath, dbOptions.nativeHandle_, cfDescs);
+  }
+
+  /**
+   * Similar to LoadLatestOptions, this function constructs the DBOptions
+   * and ColumnFamilyDescriptors based on the specified RocksDB Options file.
+   * See LoadLatestOptions above.
+   *
+   * @param optionsFileName the RocksDB options file path.
+   * @param env {@link org.rocksdb.Env} instance.
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be
+   *     filled and returned.
+   * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be
+   *     returned.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public static void loadOptionsFromFile(String optionsFileName, Env env, DBOptions dbOptions,
+      List<ColumnFamilyDescriptor> cfDescs) throws RocksDBException {
+    loadOptionsFromFile(optionsFileName, env, dbOptions, cfDescs, false);
+  }
+
+  /**
+   * @param optionsFileName the RocksDB options file path.
+   * @param env {@link org.rocksdb.Env} instance.
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be
+   *     filled and returned.
+   * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be
+   *     returned.
+   * @param ignoreUnknownOptions this flag can be set to true if you want to
+   *     ignore options that are from a newer version of the db, esentially for
+   *     forward compatibility.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public static void loadOptionsFromFile(String optionsFileName, Env env, DBOptions dbOptions,
+      List<ColumnFamilyDescriptor> cfDescs, boolean ignoreUnknownOptions) throws RocksDBException {
+    loadOptionsFromFile(
+        optionsFileName, env.nativeHandle_, dbOptions.nativeHandle_, cfDescs, ignoreUnknownOptions);
+  }
+
+  /**
+   * Similar to LoadLatestOptions, this function constructs the DBOptions
+   * and ColumnFamilyDescriptors based on the specified RocksDB Options file.
+   * See LoadLatestOptions above.
+   *
+   * @param optionsFileName the RocksDB options file path.
+   * @param configOptions {@link org.rocksdb.ConfigOptions} instance.
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be
+   *     filled and returned.
+   * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be
+   *     returned.
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public static void loadOptionsFromFile(ConfigOptions configOptions, String optionsFileName,
+      DBOptions dbOptions, List<ColumnFamilyDescriptor> cfDescs) throws RocksDBException {
+    loadOptionsFromFile(
+        configOptions.nativeHandle_, optionsFileName, dbOptions.nativeHandle_, cfDescs);
+  }
+
+  /**
+   * Returns the latest options file name under the specified RocksDB path.
+   *
+   * @param dbPath the path to the RocksDB.
+   * @param env {@link org.rocksdb.Env} instance.
+   * @return the latest options file name under the db path.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public static String getLatestOptionsFileName(String dbPath, Env env) throws RocksDBException {
+    return getLatestOptionsFileName(dbPath, env.nativeHandle_);
+  }
+
+  /**
+   * Private constructor.
+   * This class has only static methods and shouldn't be instantiated.
+   */
+  private OptionsUtil() {}
+
+  // native methods
+  private native static void loadLatestOptions(String dbPath, long envHandle, long dbOptionsHandle,
+      List<ColumnFamilyDescriptor> cfDescs, boolean ignoreUnknownOptions) throws RocksDBException;
+  private native static void loadLatestOptions(long cfgHandle, String dbPath, long dbOptionsHandle,
+      List<ColumnFamilyDescriptor> cfDescs) throws RocksDBException;
+  private native static void loadOptionsFromFile(String optionsFileName, long envHandle,
+      long dbOptionsHandle, List<ColumnFamilyDescriptor> cfDescs, boolean ignoreUnknownOptions)
+      throws RocksDBException;
+  private native static void loadOptionsFromFile(long cfgHandle, String optionsFileName,
+      long dbOptionsHandle, List<ColumnFamilyDescriptor> cfDescs) throws RocksDBException;
+  private native static String getLatestOptionsFileName(String dbPath, long envHandle)
+      throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/PersistentCache.java b/src/rocksdb/java/src/main/java/org/rocksdb/PersistentCache.java
new file mode 100644
index 000000000..aed565297
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/PersistentCache.java
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Persistent cache for caching IO pages on a persistent medium. The
+ * cache is specifically designed for persistent read cache.
+ */
+public class PersistentCache extends RocksObject {
+
+  public PersistentCache(final Env env, final String path, final long size,
+      final Logger logger, final boolean optimizedForNvm)
+      throws RocksDBException {
+    super(newPersistentCache(env.nativeHandle_, path, size,
+        logger.nativeHandle_, optimizedForNvm));
+  }
+
+  private native static long newPersistentCache(final long envHandle,
+    final String path, final long size, final long loggerHandle,
+    final boolean optimizedForNvm) throws RocksDBException;
+
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java
new file mode 100644
index 000000000..c09998167
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java
@@ -0,0 +1,251 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * The config for plain table sst format.
+ *
+ * <p>PlainTable is a RocksDB's SST file format optimized for low query
+ * latency on pure-memory or really low-latency media.</p>
+ *
+ * <p>It also support prefix hash feature.</p>
+ */
+public class PlainTableConfig extends TableFormatConfig {
+  public static final int VARIABLE_LENGTH = 0;
+  public static final int DEFAULT_BLOOM_BITS_PER_KEY = 10;
+  public static final double DEFAULT_HASH_TABLE_RATIO = 0.75;
+  public static final int DEFAULT_INDEX_SPARSENESS = 16;
+  public static final int DEFAULT_HUGE_TLB_SIZE = 0;
+  public static final EncodingType DEFAULT_ENCODING_TYPE =
+      EncodingType.kPlain;
+  public static final boolean DEFAULT_FULL_SCAN_MODE = false;
+  public static final boolean DEFAULT_STORE_INDEX_IN_FILE
+      = false;
+
+  public PlainTableConfig() {
+    keySize_ = VARIABLE_LENGTH;
+    bloomBitsPerKey_ = DEFAULT_BLOOM_BITS_PER_KEY;
+    hashTableRatio_ = DEFAULT_HASH_TABLE_RATIO;
+    indexSparseness_ = DEFAULT_INDEX_SPARSENESS;
+    hugePageTlbSize_ = DEFAULT_HUGE_TLB_SIZE;
+    encodingType_ = DEFAULT_ENCODING_TYPE;
+    fullScanMode_ = DEFAULT_FULL_SCAN_MODE;
+    storeIndexInFile_ = DEFAULT_STORE_INDEX_IN_FILE;
+  }
+
+  /**
+   * <p>Set the length of the user key. If it is set to be
+   * VARIABLE_LENGTH, then it indicates the user keys are
+   * of variable length.</p>
+   *
+   * <p>Otherwise,all the keys need to have the same length
+   * in byte.</p>
+   *
+   * <p>DEFAULT: VARIABLE_LENGTH</p>
+   *
+   * @param keySize the length of the user key.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setKeySize(int keySize) {
+    keySize_ = keySize;
+    return this;
+  }
+
+  /**
+   * @return the specified size of the user key.  If VARIABLE_LENGTH,
+   *     then it indicates variable-length key.
+   */
+  public int keySize() {
+    return keySize_;
+  }
+
+  /**
+   * Set the number of bits per key used by the internal bloom filter
+   * in the plain table sst format.
+   *
+   * @param bitsPerKey the number of bits per key for bloom filer.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setBloomBitsPerKey(int bitsPerKey) {
+    bloomBitsPerKey_ = bitsPerKey;
+    return this;
+  }
+
+  /**
+   * @return the number of bits per key used for the bloom filter.
+   */
+  public int bloomBitsPerKey() {
+    return bloomBitsPerKey_;
+  }
+
+  /**
+   * hashTableRatio is the desired utilization of the hash table used
+   * for prefix hashing.  The ideal ratio would be the number of
+   * prefixes / the number of hash buckets.  If this value is set to
+   * zero, then hash table will not be used.
+   *
+   * @param ratio the hash table ratio.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setHashTableRatio(double ratio) {
+    hashTableRatio_ = ratio;
+    return this;
+  }
+
+  /**
+   * @return the hash table ratio.
+   */
+  public double hashTableRatio() {
+    return hashTableRatio_;
+  }
+
+  /**
+   * Index sparseness determines the index interval for keys inside the
+   * same prefix.  This number is equal to the maximum number of linear
+   * search required after hash and binary search.  If it's set to 0,
+   * then each key will be indexed.
+   *
+   * @param sparseness the index sparseness.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setIndexSparseness(int sparseness) {
+    indexSparseness_ = sparseness;
+    return this;
+  }
+
+  /**
+   * @return the index sparseness.
+   */
+  public long indexSparseness() {
+    return indexSparseness_;
+  }
+
+  /**
+   * <p>huge_page_tlb_size: if &le;0, allocate hash indexes and blooms
+   * from malloc otherwise from huge page TLB.</p>
+   *
+   * <p>The user needs to reserve huge pages for it to be allocated,
+   * like: {@code sysctl -w vm.nr_hugepages=20}</p>
+   *
+   * <p>See linux doc Documentation/vm/hugetlbpage.txt</p>
+   *
+   * @param hugePageTlbSize huge page tlb size
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setHugePageTlbSize(int hugePageTlbSize) {
+    this.hugePageTlbSize_ = hugePageTlbSize;
+    return this;
+  }
+
+  /**
+   * Returns the value for huge page tlb size
+   *
+   * @return hugePageTlbSize
+   */
+  public int hugePageTlbSize() {
+    return hugePageTlbSize_;
+  }
+
+  /**
+   * Sets the encoding type.
+   *
+   * <p>This setting determines how to encode
+   * the keys. See enum {@link EncodingType} for
+   * the choices.</p>
+   *
+   * <p>The value will determine how to encode keys
+   * when writing to a new SST file. This value will be stored
+   * inside the SST file which will be used when reading from
+   * the file, which makes it possible for users to choose
+   * different encoding type when reopening a DB. Files with
+   * different encoding types can co-exist in the same DB and
+   * can be read.</p>
+   *
+   * @param encodingType {@link org.rocksdb.EncodingType} value.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setEncodingType(EncodingType encodingType) {
+    this.encodingType_ = encodingType;
+    return this;
+  }
+
+  /**
+   * Returns the active EncodingType
+   *
+   * @return currently set encoding type
+   */
+  public EncodingType encodingType() {
+    return encodingType_;
+  }
+
+  /**
+   * Set full scan mode, if true the whole file will be read
+   * one record by one without using the index.
+   *
+   * @param fullScanMode boolean value indicating if full
+   *     scan mode shall be enabled.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setFullScanMode(boolean fullScanMode) {
+    this.fullScanMode_ = fullScanMode;
+    return this;
+  }
+
+  /**
+   * Return if full scan mode is active
+   * @return boolean value indicating if the full scan mode is
+   *     enabled.
+   */
+  public boolean fullScanMode() {
+    return fullScanMode_;
+  }
+
+  /**
+   * <p>If set to true: compute plain table index and bloom
+   * filter during file building and store it in file.
+   * When reading file, index will be mmaped instead
+   * of doing recomputation.</p>
+   *
+   * @param storeIndexInFile value indicating if index shall
+   *     be stored in a file
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setStoreIndexInFile(boolean storeIndexInFile) {
+    this.storeIndexInFile_ = storeIndexInFile;
+    return this;
+  }
+
+  /**
+   * Return a boolean value indicating if index shall be stored
+   * in a file.
+   *
+   * @return currently set value for store index in file.
+   */
+  public boolean storeIndexInFile() {
+    return storeIndexInFile_;
+  }
+
+  @Override protected long newTableFactoryHandle() {
+    return newTableFactoryHandle(keySize_, bloomBitsPerKey_,
+        hashTableRatio_, indexSparseness_, hugePageTlbSize_,
+        encodingType_.getValue(), fullScanMode_,
+        storeIndexInFile_);
+  }
+
+  private native long newTableFactoryHandle(
+      int keySize, int bloomBitsPerKey,
+      double hashTableRatio, int indexSparseness,
+      int hugePageTlbSize, byte encodingType,
+      boolean fullScanMode, boolean storeIndexInFile);
+
+  private int keySize_;
+  private int bloomBitsPerKey_;
+  private double hashTableRatio_;
+  private int indexSparseness_;
+  private int hugePageTlbSize_;
+  private EncodingType encodingType_;
+  private boolean fullScanMode_;
+  private boolean storeIndexInFile_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/PrepopulateBlobCache.java b/src/rocksdb/java/src/main/java/org/rocksdb/PrepopulateBlobCache.java
new file mode 100644
index 000000000..f1237aa7c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/PrepopulateBlobCache.java
@@ -0,0 +1,117 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Enum PrepopulateBlobCache
+ *
+ * <p>Prepopulate warm/hot blobs which are already in memory into blob
+ * cache at the time of flush. On a flush, the blob that is in memory
+ * (in memtables) get flushed to the device. If using Direct IO,
+ * additional IO is incurred to read this blob back into memory again,
+ * which is avoided by enabling this option. This further helps if the
+ * workload exhibits high temporal locality, where most of the reads go
+ * to recently written data. This also helps in case of the remote file
+ * system since it involves network traffic and higher latencies.</p>
+ */
+public enum PrepopulateBlobCache {
+  PREPOPULATE_BLOB_DISABLE((byte) 0x0, "prepopulate_blob_disable", "kDisable"),
+  PREPOPULATE_BLOB_FLUSH_ONLY((byte) 0x1, "prepopulate_blob_flush_only", "kFlushOnly");
+
+  /**
+   * <p>Get the PrepopulateBlobCache enumeration value by
+   * passing the library name to this method.</p>
+   *
+   * <p>If library cannot be found the enumeration
+   * value {@code PREPOPULATE_BLOB_DISABLE} will be returned.</p>
+   *
+   * @param libraryName prepopulate blob cache library name.
+   *
+   * @return PrepopulateBlobCache instance.
+   */
+  public static PrepopulateBlobCache getPrepopulateBlobCache(String libraryName) {
+    if (libraryName != null) {
+      for (PrepopulateBlobCache prepopulateBlobCache : PrepopulateBlobCache.values()) {
+        if (prepopulateBlobCache.getLibraryName() != null
+            && prepopulateBlobCache.getLibraryName().equals(libraryName)) {
+          return prepopulateBlobCache;
+        }
+      }
+    }
+    return PrepopulateBlobCache.PREPOPULATE_BLOB_DISABLE;
+  }
+
+  /**
+   * <p>Get the PrepopulateBlobCache enumeration value by
+   * passing the byte identifier to this method.</p>
+   *
+   * @param byteIdentifier of PrepopulateBlobCache.
+   *
+   * @return PrepopulateBlobCache instance.
+   *
+   * @throws IllegalArgumentException If PrepopulateBlobCache cannot be found for the
+   *   provided byteIdentifier
+   */
+  public static PrepopulateBlobCache getPrepopulateBlobCache(byte byteIdentifier) {
+    for (final PrepopulateBlobCache prepopulateBlobCache : PrepopulateBlobCache.values()) {
+      if (prepopulateBlobCache.getValue() == byteIdentifier) {
+        return prepopulateBlobCache;
+      }
+    }
+
+    throw new IllegalArgumentException("Illegal value provided for PrepopulateBlobCache.");
+  }
+
+  /**
+   * <p>Get a PrepopulateBlobCache value based on the string key in the C++ options output.
+   * This gets used in support of getting options into Java from an options string,
+   * which is generated at the C++ level.
+   * </p>
+   *
+   * @param internalName the internal (C++) name by which the option is known.
+   *
+   * @return PrepopulateBlobCache instance (optional)
+   */
+  static PrepopulateBlobCache getFromInternal(final String internalName) {
+    for (final PrepopulateBlobCache prepopulateBlobCache : PrepopulateBlobCache.values()) {
+      if (prepopulateBlobCache.internalName_.equals(internalName)) {
+        return prepopulateBlobCache;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal internalName '" + internalName + " ' provided for PrepopulateBlobCache.");
+  }
+
+  /**
+   * <p>Returns the byte value of the enumerations value.</p>
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  /**
+   * <p>Returns the library name of the prepopulate blob cache mode
+   * identified by the enumeration value.</p>
+   *
+   * @return library name
+   */
+  public String getLibraryName() {
+    return libraryName_;
+  }
+
+  PrepopulateBlobCache(final byte value, final String libraryName, final String internalName) {
+    value_ = value;
+    libraryName_ = libraryName;
+    internalName_ = internalName;
+  }
+
+  private final byte value_;
+  private final String libraryName_;
+  private final String internalName_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Priority.java b/src/rocksdb/java/src/main/java/org/rocksdb/Priority.java
new file mode 100644
index 000000000..34a56edcb
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Priority.java
@@ -0,0 +1,49 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The Thread Pool priority.
+ */
+public enum Priority {
+  BOTTOM((byte) 0x0),
+  LOW((byte) 0x1),
+  HIGH((byte)0x2),
+  TOTAL((byte)0x3);
+
+  private final byte value;
+
+  Priority(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * <p>Returns the byte value of the enumerations value.</p>
+   *
+   * @return byte representation
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get Priority by byte value.
+   *
+   * @param value byte representation of Priority.
+   *
+   * @return {@link org.rocksdb.Priority} instance.
+   * @throws java.lang.IllegalArgumentException if an invalid
+   *     value is provided.
+   */
+  static Priority getPriority(final byte value) {
+    for (final Priority priority : Priority.values()) {
+      if (priority.getValue() == value){
+        return priority;
+      }
+    }
+    throw new IllegalArgumentException("Illegal value provided for Priority.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Range.java b/src/rocksdb/java/src/main/java/org/rocksdb/Range.java
new file mode 100644
index 000000000..74c85e5f0
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Range.java
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Range from start to limit.
+ */
+public class Range {
+  final Slice start;
+  final Slice limit;
+
+  public Range(final Slice start, final Slice limit) {
+    this.start = start;
+    this.limit = limit;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RateLimiter.java b/src/rocksdb/java/src/main/java/org/rocksdb/RateLimiter.java
new file mode 100644
index 000000000..c2b8a0fd9
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RateLimiter.java
@@ -0,0 +1,227 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * RateLimiter, which is used to control write rate of flush and
+ * compaction.
+ *
+ * @since 3.10.0
+ */
+public class RateLimiter extends RocksObject {
+  public static final long DEFAULT_REFILL_PERIOD_MICROS = 100 * 1000;
+  public static final int DEFAULT_FAIRNESS = 10;
+  public static final RateLimiterMode DEFAULT_MODE =
+      RateLimiterMode.WRITES_ONLY;
+  public static final boolean DEFAULT_AUTOTUNE = false;
+
+  /**
+   * RateLimiter constructor
+   *
+   * @param rateBytesPerSecond this is the only parameter you want to set
+   *     most of the time. It controls the total write rate of compaction
+   *     and flush in bytes per second. Currently, RocksDB does not enforce
+   *     rate limit for anything other than flush and compaction, e.g. write to
+   *     WAL.
+   */
+  public RateLimiter(final long rateBytesPerSecond) {
+    this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS,
+        DEFAULT_MODE, DEFAULT_AUTOTUNE);
+  }
+
+  /**
+   * RateLimiter constructor
+   *
+   * @param rateBytesPerSecond this is the only parameter you want to set
+   *     most of the time. It controls the total write rate of compaction
+   *     and flush in bytes per second. Currently, RocksDB does not enforce
+   *     rate limit for anything other than flush and compaction, e.g. write to
+   *     WAL.
+   * @param refillPeriodMicros this controls how often tokens are refilled. For
+   *     example,
+   *     when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
+   *     100ms, then 1MB is refilled every 100ms internally. Larger value can
+   *     lead to burstier writes while smaller value introduces more CPU
+   *     overhead. The default of 100,000ms should work for most cases.
+   */
+  public RateLimiter(final long rateBytesPerSecond,
+      final long refillPeriodMicros) {
+    this(rateBytesPerSecond, refillPeriodMicros, DEFAULT_FAIRNESS, DEFAULT_MODE,
+        DEFAULT_AUTOTUNE);
+  }
+
+  /**
+   * RateLimiter constructor
+   *
+   * @param rateBytesPerSecond this is the only parameter you want to set
+   *     most of the time. It controls the total write rate of compaction
+   *     and flush in bytes per second. Currently, RocksDB does not enforce
+   *     rate limit for anything other than flush and compaction, e.g. write to
+   *     WAL.
+   * @param refillPeriodMicros this controls how often tokens are refilled. For
+   *     example,
+   *     when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
+   *     100ms, then 1MB is refilled every 100ms internally. Larger value can
+   *     lead to burstier writes while smaller value introduces more CPU
+   *     overhead. The default of 100,000ms should work for most cases.
+   * @param fairness RateLimiter accepts high-pri requests and low-pri requests.
+   *     A low-pri request is usually blocked in favor of hi-pri request.
+   *     Currently, RocksDB assigns low-pri to request from compaction and
+   *     high-pri to request from flush. Low-pri requests can get blocked if
+   *     flush requests come in continuously. This fairness parameter grants
+   *     low-pri requests permission by fairness chance even though high-pri
+   *     requests exist to avoid starvation.
+   *     You should be good by leaving it at default 10.
+   */
+  public RateLimiter(final long rateBytesPerSecond,
+      final long refillPeriodMicros, final int fairness) {
+    this(rateBytesPerSecond, refillPeriodMicros, fairness, DEFAULT_MODE,
+        DEFAULT_AUTOTUNE);
+  }
+
+  /**
+   * RateLimiter constructor
+   *
+   * @param rateBytesPerSecond this is the only parameter you want to set
+   *     most of the time. It controls the total write rate of compaction
+   *     and flush in bytes per second. Currently, RocksDB does not enforce
+   *     rate limit for anything other than flush and compaction, e.g. write to
+   *     WAL.
+   * @param refillPeriodMicros this controls how often tokens are refilled. For
+   *     example,
+   *     when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
+   *     100ms, then 1MB is refilled every 100ms internally. Larger value can
+   *     lead to burstier writes while smaller value introduces more CPU
+   *     overhead. The default of 100,000ms should work for most cases.
+   * @param fairness RateLimiter accepts high-pri requests and low-pri requests.
+   *     A low-pri request is usually blocked in favor of hi-pri request.
+   *     Currently, RocksDB assigns low-pri to request from compaction and
+   *     high-pri to request from flush. Low-pri requests can get blocked if
+   *     flush requests come in continuously. This fairness parameter grants
+   *     low-pri requests permission by fairness chance even though high-pri
+   *     requests exist to avoid starvation.
+   *     You should be good by leaving it at default 10.
+   * @param rateLimiterMode indicates which types of operations count against
+   *     the limit.
+   */
+  public RateLimiter(final long rateBytesPerSecond,
+      final long refillPeriodMicros, final int fairness,
+      final RateLimiterMode rateLimiterMode) {
+    this(rateBytesPerSecond, refillPeriodMicros, fairness, rateLimiterMode,
+        DEFAULT_AUTOTUNE);
+  }
+
+  /**
+   * RateLimiter constructor
+   *
+   * @param rateBytesPerSecond this is the only parameter you want to set
+   *     most of the time. It controls the total write rate of compaction
+   *     and flush in bytes per second. Currently, RocksDB does not enforce
+   *     rate limit for anything other than flush and compaction, e.g. write to
+   *     WAL.
+   * @param refillPeriodMicros this controls how often tokens are refilled. For
+   *     example,
+   *     when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
+   *     100ms, then 1MB is refilled every 100ms internally. Larger value can
+   *     lead to burstier writes while smaller value introduces more CPU
+   *     overhead. The default of 100,000ms should work for most cases.
+   * @param fairness RateLimiter accepts high-pri requests and low-pri requests.
+   *     A low-pri request is usually blocked in favor of hi-pri request.
+   *     Currently, RocksDB assigns low-pri to request from compaction and
+   *     high-pri to request from flush. Low-pri requests can get blocked if
+   *     flush requests come in continuously. This fairness parameter grants
+   *     low-pri requests permission by fairness chance even though high-pri
+   *     requests exist to avoid starvation.
+   *     You should be good by leaving it at default 10.
+   * @param rateLimiterMode indicates which types of operations count against
+   *     the limit.
+   * @param autoTune Enables dynamic adjustment of rate limit within the range
+   *     {@code [rate_bytes_per_sec / 20, rate_bytes_per_sec]}, according to
+   *     the recent demand for background I/O.
+   */
+  public RateLimiter(final long rateBytesPerSecond,
+      final long refillPeriodMicros, final int fairness,
+      final RateLimiterMode rateLimiterMode, final boolean autoTune) {
+    super(newRateLimiterHandle(rateBytesPerSecond,
+        refillPeriodMicros, fairness, rateLimiterMode.getValue(), autoTune));
+  }
+
+  /**
+   * <p>This API allows user to dynamically change rate limiter's bytes per second.
+   * REQUIRED: bytes_per_second &gt; 0</p>
+   *
+   * @param bytesPerSecond bytes per second.
+   */
+  public void setBytesPerSecond(final long bytesPerSecond) {
+    assert(isOwningHandle());
+    setBytesPerSecond(nativeHandle_, bytesPerSecond);
+  }
+
+  /**
+   * Returns the bytes per second.
+   *
+   * @return bytes per second.
+   */
+  public long getBytesPerSecond() {
+    assert(isOwningHandle());
+    return getBytesPerSecond(nativeHandle_);
+  }
+
+  /**
+   * <p>Request for token to write bytes. If this request can not be satisfied,
+   * the call is blocked. Caller is responsible to make sure
+   * {@code bytes &lt; GetSingleBurstBytes()}.</p>
+   *
+   * @param bytes requested bytes.
+   */
+  public void request(final long bytes) {
+    assert(isOwningHandle());
+    request(nativeHandle_, bytes);
+  }
+
+  /**
+   * <p>Max bytes can be granted in a single burst.</p>
+   *
+   * @return max bytes can be granted in a single burst.
+   */
+  public long getSingleBurstBytes() {
+    assert(isOwningHandle());
+    return getSingleBurstBytes(nativeHandle_);
+  }
+
+  /**
+   * <p>Total bytes that go through rate limiter.</p>
+   *
+   * @return total bytes that go through rate limiter.
+   */
+  public long getTotalBytesThrough() {
+    assert(isOwningHandle());
+    return getTotalBytesThrough(nativeHandle_);
+  }
+
+  /**
+   * <p>Total # of requests that go through rate limiter.</p>
+   *
+   * @return total # of requests that go through rate limiter.
+   */
+  public long getTotalRequests() {
+    assert(isOwningHandle());
+    return getTotalRequests(nativeHandle_);
+  }
+
+  private static native long newRateLimiterHandle(final long rateBytesPerSecond,
+      final long refillPeriodMicros, final int fairness,
+      final byte rateLimiterMode, final boolean autoTune);
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native void setBytesPerSecond(final long handle,
+      final long bytesPerSecond);
+  private native long getBytesPerSecond(final long handle);
+  private native void request(final long handle, final long bytes);
+  private native long getSingleBurstBytes(final long handle);
+  private native long getTotalBytesThrough(final long handle);
+  private native long getTotalRequests(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RateLimiterMode.java b/src/rocksdb/java/src/main/java/org/rocksdb/RateLimiterMode.java
new file mode 100644
index 000000000..4b029d816
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RateLimiterMode.java
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Mode for {@link RateLimiter#RateLimiter(long, long, int, RateLimiterMode)}.
+ */
+public enum RateLimiterMode {
+  READS_ONLY((byte)0x0),
+  WRITES_ONLY((byte)0x1),
+  ALL_IO((byte)0x2);
+
+  private final byte value;
+
+  RateLimiterMode(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * <p>Returns the byte value of the enumerations value.</p>
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * <p>Get the RateLimiterMode enumeration value by
+   * passing the byte identifier to this method.</p>
+   *
+   * @param byteIdentifier of RateLimiterMode.
+   *
+   * @return AccessHint instance.
+   *
+   * @throws IllegalArgumentException if the access hint for the byteIdentifier
+   *     cannot be found
+   */
+  public static RateLimiterMode getRateLimiterMode(final byte byteIdentifier) {
+    for (final RateLimiterMode rateLimiterMode : RateLimiterMode.values()) {
+      if (rateLimiterMode.getValue() == byteIdentifier) {
+        return rateLimiterMode;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for RateLimiterMode.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java
new file mode 100755
index 000000000..0836f0f18
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java
@@ -0,0 +1,831 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The class that controls the get behavior.
+ *
+ * Note that dispose() must be called before an Options instance
+ * become out-of-scope to release the allocated memory in c++.
+ */
+public class ReadOptions extends RocksObject {
+  public ReadOptions() {
+    super(newReadOptions());
+  }
+
+  /**
+   * @param verifyChecksums verification will be performed on every read
+   *     when set to true
+   * @param fillCache if true, then fill-cache behavior will be performed.
+   */
+  public ReadOptions(final boolean verifyChecksums, final boolean fillCache) {
+    super(newReadOptions(verifyChecksums, fillCache));
+  }
+
+  /**
+   * Copy constructor.
+   *
+   * NOTE: This does a shallow copy, which means snapshot, iterate_upper_bound
+   * and other pointers will be cloned!
+   *
+   * @param other The ReadOptions to copy.
+   */
+  public ReadOptions(ReadOptions other) {
+    super(copyReadOptions(other.nativeHandle_));
+    this.iterateLowerBoundSlice_ = other.iterateLowerBoundSlice_;
+    this.iterateUpperBoundSlice_ = other.iterateUpperBoundSlice_;
+    this.timestampSlice_ = other.timestampSlice_;
+    this.iterStartTs_ = other.iterStartTs_;
+  }
+
+  /**
+   * If true, all data read from underlying storage will be
+   * verified against corresponding checksums.
+   * Default: true
+   *
+   * @return true if checksum verification is on.
+   */
+  public boolean verifyChecksums() {
+    assert(isOwningHandle());
+    return verifyChecksums(nativeHandle_);
+  }
+
+  /**
+   * If true, all data read from underlying storage will be
+   * verified against corresponding checksums.
+   * Default: true
+   *
+   * @param verifyChecksums if true, then checksum verification
+   *     will be performed on every read.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setVerifyChecksums(
+      final boolean verifyChecksums) {
+    assert(isOwningHandle());
+    setVerifyChecksums(nativeHandle_, verifyChecksums);
+    return this;
+  }
+
+  // TODO(yhchiang): this option seems to be block-based table only.
+  //                 move this to a better place?
+  /**
+   * Fill the cache when loading the block-based sst formated db.
+   * Callers may wish to set this field to false for bulk scans.
+   * Default: true
+   *
+   * @return true if the fill-cache behavior is on.
+   */
+  public boolean fillCache() {
+    assert(isOwningHandle());
+    return fillCache(nativeHandle_);
+  }
+
+  /**
+   * Fill the cache when loading the block-based sst formatted db.
+   * Callers may wish to set this field to false for bulk scans.
+   * Default: true
+   *
+   * @param fillCache if true, then fill-cache behavior will be
+   *     performed.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setFillCache(final boolean fillCache) {
+    assert(isOwningHandle());
+    setFillCache(nativeHandle_, fillCache);
+    return this;
+  }
+
+  /**
+   * Returns the currently assigned Snapshot instance.
+   *
+   * @return the Snapshot assigned to this instance. If no Snapshot
+   *     is assigned null.
+   */
+  public Snapshot snapshot() {
+    assert(isOwningHandle());
+    long snapshotHandle = snapshot(nativeHandle_);
+    if (snapshotHandle != 0) {
+      return new Snapshot(snapshotHandle);
+    }
+    return null;
+  }
+
+  /**
+   * <p>If "snapshot" is non-nullptr, read as of the supplied snapshot
+   * (which must belong to the DB that is being read and which must
+   * not have been released).  If "snapshot" is nullptr, use an implicit
+   * snapshot of the state at the beginning of this read operation.</p>
+   * <p>Default: null</p>
+   *
+   * @param snapshot {@link Snapshot} instance
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setSnapshot(final Snapshot snapshot) {
+    assert(isOwningHandle());
+    if (snapshot != null) {
+      setSnapshot(nativeHandle_, snapshot.nativeHandle_);
+    } else {
+      setSnapshot(nativeHandle_, 0l);
+    }
+    return this;
+  }
+
+  /**
+   * Returns the current read tier.
+   *
+   * @return the read tier in use, by default {@link ReadTier#READ_ALL_TIER}
+   */
+  public ReadTier readTier() {
+    assert(isOwningHandle());
+    return ReadTier.getReadTier(readTier(nativeHandle_));
+  }
+
+  /**
+   * Specify if this read request should process data that ALREADY
+   * resides on a particular cache. If the required data is not
+   * found at the specified cache, then {@link RocksDBException} is thrown.
+   *
+   * @param readTier {@link ReadTier} instance
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setReadTier(final ReadTier readTier) {
+    assert(isOwningHandle());
+    setReadTier(nativeHandle_, readTier.getValue());
+    return this;
+  }
+
+  /**
+   * Specify to create a tailing iterator -- a special iterator that has a
+   * view of the complete database (i.e. it can also be used to read newly
+   * added data) and is optimized for sequential reads. It will return records
+   * that were inserted into the database after the creation of the iterator.
+   * Default: false
+   *
+   * Not supported in {@code ROCKSDB_LITE} mode!
+   *
+   * @return true if tailing iterator is enabled.
+   */
+  public boolean tailing() {
+    assert(isOwningHandle());
+    return tailing(nativeHandle_);
+  }
+
+  /**
+   * Specify to create a tailing iterator -- a special iterator that has a
+   * view of the complete database (i.e. it can also be used to read newly
+   * added data) and is optimized for sequential reads. It will return records
+   * that were inserted into the database after the creation of the iterator.
+   * Default: false
+   * Not supported in ROCKSDB_LITE mode!
+   *
+   * @param tailing if true, then tailing iterator will be enabled.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setTailing(final boolean tailing) {
+    assert(isOwningHandle());
+    setTailing(nativeHandle_, tailing);
+    return this;
+  }
+
+  /**
+   * Returns whether managed iterators will be used.
+   *
+   * @return the setting of whether managed iterators will be used,
+   *     by default false
+   *
+   * @deprecated This options is not used anymore.
+   */
+  @Deprecated
+  public boolean managed() {
+    assert(isOwningHandle());
+    return managed(nativeHandle_);
+  }
+
+  /**
+   * Specify to create a managed iterator -- a special iterator that
+   * uses less resources by having the ability to free its underlying
+   * resources on request.
+   *
+   * @param managed if true, then managed iterators will be enabled.
+   * @return the reference to the current ReadOptions.
+   *
+   * @deprecated This options is not used anymore.
+   */
+  @Deprecated
+  public ReadOptions setManaged(final boolean managed) {
+    assert(isOwningHandle());
+    setManaged(nativeHandle_, managed);
+    return this;
+  }
+
+  /**
+   * Returns whether a total seek order will be used
+   *
+   * @return the setting of whether a total seek order will be used
+   */
+  public boolean totalOrderSeek() {
+    assert(isOwningHandle());
+    return totalOrderSeek(nativeHandle_);
+  }
+
+  /**
+   * Enable a total order seek regardless of index format (e.g. hash index)
+   * used in the table. Some table format (e.g. plain table) may not support
+   * this option.
+   *
+   * @param totalOrderSeek if true, then total order seek will be enabled.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setTotalOrderSeek(final boolean totalOrderSeek) {
+    assert(isOwningHandle());
+    setTotalOrderSeek(nativeHandle_, totalOrderSeek);
+    return this;
+  }
+
+  /**
+   * Returns whether the iterator only iterates over the same prefix as the seek
+   *
+   * @return the setting of whether the iterator only iterates over the same
+   *   prefix as the seek, default is false
+   */
+  public boolean prefixSameAsStart() {
+    assert(isOwningHandle());
+    return prefixSameAsStart(nativeHandle_);
+  }
+
+  /**
+   * Enforce that the iterator only iterates over the same prefix as the seek.
+   * This option is effective only for prefix seeks, i.e. prefix_extractor is
+   * non-null for the column family and {@link #totalOrderSeek()} is false.
+   * Unlike iterate_upper_bound, {@link #setPrefixSameAsStart(boolean)} only
+   * works within a prefix but in both directions.
+   *
+   * @param prefixSameAsStart if true, then the iterator only iterates over the
+   *   same prefix as the seek
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setPrefixSameAsStart(final boolean prefixSameAsStart) {
+    assert(isOwningHandle());
+    setPrefixSameAsStart(nativeHandle_, prefixSameAsStart);
+    return this;
+  }
+
+  /**
+   * Returns whether the blocks loaded by the iterator will be pinned in memory
+   *
+   * @return the setting of whether the blocks loaded by the iterator will be
+   *   pinned in memory
+   */
+  public boolean pinData() {
+    assert(isOwningHandle());
+    return pinData(nativeHandle_);
+  }
+
+  /**
+   * Keep the blocks loaded by the iterator pinned in memory as long as the
+   * iterator is not deleted, If used when reading from tables created with
+   * BlockBasedTableOptions::use_delta_encoding = false,
+   * Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
+   * return 1.
+   *
+   * @param pinData if true, the blocks loaded by the iterator will be pinned
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setPinData(final boolean pinData) {
+    assert(isOwningHandle());
+    setPinData(nativeHandle_, pinData);
+    return this;
+  }
+
+  /**
+   * If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
+   * schedule a background job in the flush job queue and delete obsolete files
+   * in background.
+   *
+   * Default: false
+   *
+   * @return true when PurgeObsoleteFile is called in CleanupIteratorState
+   */
+  public boolean backgroundPurgeOnIteratorCleanup() {
+    assert(isOwningHandle());
+    return backgroundPurgeOnIteratorCleanup(nativeHandle_);
+  }
+
+  /**
+   * If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
+   * schedule a background job in the flush job queue and delete obsolete files
+   * in background.
+   *
+   * Default: false
+   *
+   * @param backgroundPurgeOnIteratorCleanup true when PurgeObsoleteFile is
+   *     called in CleanupIteratorState
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setBackgroundPurgeOnIteratorCleanup(
+      final boolean backgroundPurgeOnIteratorCleanup) {
+    assert(isOwningHandle());
+    setBackgroundPurgeOnIteratorCleanup(nativeHandle_,
+        backgroundPurgeOnIteratorCleanup);
+    return this;
+  }
+
+  /**
+   * If non-zero, NewIterator will create a new table reader which
+   * performs reads of the given size. Using a large size (&gt; 2MB) can
+   * improve the performance of forward iteration on spinning disks.
+   *
+   * Default: 0
+   *
+   * @return The readahead size is bytes
+   */
+  public long readaheadSize() {
+    assert(isOwningHandle());
+    return readaheadSize(nativeHandle_);
+  }
+
+  /**
+   * If non-zero, NewIterator will create a new table reader which
+   * performs reads of the given size. Using a large size (&gt; 2MB) can
+   * improve the performance of forward iteration on spinning disks.
+   *
+   * Default: 0
+   *
+   * @param readaheadSize The readahead size is bytes
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setReadaheadSize(final long readaheadSize) {
+    assert(isOwningHandle());
+    setReadaheadSize(nativeHandle_, readaheadSize);
+    return this;
+  }
+
+  /**
+   * A threshold for the number of keys that can be skipped before failing an
+   * iterator seek as incomplete.
+   *
+   * @return the number of keys that can be skipped
+   *     before failing an iterator seek as incomplete.
+   */
+  public long maxSkippableInternalKeys() {
+    assert(isOwningHandle());
+    return maxSkippableInternalKeys(nativeHandle_);
+  }
+
+  /**
+   * A threshold for the number of keys that can be skipped before failing an
+   * iterator seek as incomplete. The default value of 0 should be used to
+   * never fail a request as incomplete, even on skipping too many keys.
+   *
+   * Default: 0
+   *
+   * @param maxSkippableInternalKeys the number of keys that can be skipped
+   *     before failing an iterator seek as incomplete.
+   *
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setMaxSkippableInternalKeys(
+      final long maxSkippableInternalKeys) {
+    assert(isOwningHandle());
+    setMaxSkippableInternalKeys(nativeHandle_, maxSkippableInternalKeys);
+    return this;
+  }
+
+  /**
+   * If true, keys deleted using the DeleteRange() API will be visible to
+   * readers until they are naturally deleted during compaction. This improves
+   * read performance in DBs with many range deletions.
+   *
+   * Default: false
+   *
+   * @return true if keys deleted using the DeleteRange() API will be visible
+   */
+  public boolean ignoreRangeDeletions() {
+    assert(isOwningHandle());
+    return ignoreRangeDeletions(nativeHandle_);
+  }
+
+  /**
+   * If true, keys deleted using the DeleteRange() API will be visible to
+   * readers until they are naturally deleted during compaction. This improves
+   * read performance in DBs with many range deletions.
+   *
+   * Default: false
+   *
+   * @param ignoreRangeDeletions true if keys deleted using the DeleteRange()
+   *     API should be visible
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setIgnoreRangeDeletions(final boolean ignoreRangeDeletions) {
+    assert(isOwningHandle());
+    setIgnoreRangeDeletions(nativeHandle_, ignoreRangeDeletions);
+    return this;
+  }
+
+  /**
+   * Defines the smallest key at which the backward
+   * iterator can return an entry. Once the bound is passed,
+   * {@link RocksIterator#isValid()} will be false.
+   *
+   * The lower bound is inclusive i.e. the bound value is a valid
+   * entry.
+   *
+   * If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
+   * need to have the same prefix. This is because ordering is not guaranteed
+   * outside of prefix domain.
+   *
+   * Default: null
+   *
+   * @param iterateLowerBound Slice representing the lower bound
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setIterateLowerBound(final AbstractSlice<?> iterateLowerBound) {
+    assert(isOwningHandle());
+    setIterateLowerBound(
+        nativeHandle_, iterateLowerBound == null ? 0 : iterateLowerBound.getNativeHandle());
+    // Hold onto a reference so it doesn't get garbage collected out from under us.
+    iterateLowerBoundSlice_ = iterateLowerBound;
+    return this;
+  }
+
+  /**
+   * Returns the smallest key at which the backward
+   * iterator can return an entry.
+   *
+   * The lower bound is inclusive i.e. the bound value is a valid entry.
+   *
+   * @return the smallest key, or null if there is no lower bound defined.
+   */
+  public Slice iterateLowerBound() {
+    assert(isOwningHandle());
+    final long lowerBoundSliceHandle = iterateLowerBound(nativeHandle_);
+    if (lowerBoundSliceHandle != 0) {
+      // Disown the new slice - it's owned by the C++ side of the JNI boundary
+      // from the perspective of this method.
+      return new Slice(lowerBoundSliceHandle, false);
+    }
+    return null;
+  }
+
+  /**
+   * Defines the extent up to which the forward iterator
+   * can returns entries. Once the bound is reached,
+   * {@link RocksIterator#isValid()} will be false.
+   *
+   * The upper bound is exclusive i.e. the bound value is not a valid entry.
+   *
+   * If prefix_extractor is not null, the Seek target and iterate_upper_bound
+   * need to have the same prefix. This is because ordering is not guaranteed
+   * outside of prefix domain.
+   *
+   * Default: null
+   *
+   * @param iterateUpperBound Slice representing the upper bound
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setIterateUpperBound(final AbstractSlice<?> iterateUpperBound) {
+    assert(isOwningHandle());
+    setIterateUpperBound(
+        nativeHandle_, iterateUpperBound == null ? 0 : iterateUpperBound.getNativeHandle());
+    // Hold onto a reference so it doesn't get garbage collected out from under us.
+    iterateUpperBoundSlice_ = iterateUpperBound;
+    return this;
+  }
+
+  /**
+   * Returns the largest key at which the forward
+   * iterator can return an entry.
+   *
+   * The upper bound is exclusive i.e. the bound value is not a valid entry.
+   *
+   * @return the largest key, or null if there is no upper bound defined.
+   */
+  public Slice iterateUpperBound() {
+    assert(isOwningHandle());
+    final long upperBoundSliceHandle = iterateUpperBound(nativeHandle_);
+    if (upperBoundSliceHandle != 0) {
+      // Disown the new slice - it's owned by the C++ side of the JNI boundary
+      // from the perspective of this method.
+      return new Slice(upperBoundSliceHandle, false);
+    }
+    return null;
+  }
+
+  /**
+   * A callback to determine whether relevant keys for this scan exist in a
+   * given table based on the table's properties. The callback is passed the
+   * properties of each table during iteration. If the callback returns false,
+   * the table will not be scanned. This option only affects Iterators and has
+   * no impact on point lookups.
+   *
+   * Default: null (every table will be scanned)
+   *
+   * @param tableFilter the table filter for the callback.
+   *
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setTableFilter(final AbstractTableFilter tableFilter) {
+    assert(isOwningHandle());
+    setTableFilter(nativeHandle_, tableFilter.nativeHandle_);
+    return this;
+  }
+
+  /**
+   * When true, by default use total_order_seek = true, and RocksDB can
+   * selectively enable prefix seek mode if won't generate a different result
+   * from total_order_seek, based on seek key, and iterator upper bound.
+   * Not supported in ROCKSDB_LITE mode, in the way that even with value true
+   * prefix mode is not used.
+   * Default: false
+   *
+   * @return true if auto prefix mode is set.
+   *
+   */
+  public boolean autoPrefixMode() {
+    assert (isOwningHandle());
+    return autoPrefixMode(nativeHandle_);
+  }
+
+  /**
+   * When true, by default use total_order_seek = true, and RocksDB can
+   * selectively enable prefix seek mode if won't generate a different result
+   * from total_order_seek, based on seek key, and iterator upper bound.
+   * Not supported in ROCKSDB_LITE mode, in the way that even with value true
+   * prefix mode is not used.
+   * Default: false
+   * @param mode auto prefix mode
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setAutoPrefixMode(final boolean mode) {
+    assert (isOwningHandle());
+    setAutoPrefixMode(nativeHandle_, mode);
+    return this;
+  }
+
+  /**
+   * Timestamp of operation. Read should return the latest data visible to the
+   * specified timestamp. All timestamps of the same database must be of the
+   * same length and format. The user is responsible for providing a customized
+   * compare function via Comparator to order &gt;key, timestamp&gt; tuples.
+   * For iterator, iter_start_ts is the lower bound (older) and timestamp
+   * serves as the upper bound. Versions of the same record that fall in
+   * the timestamp range will be returned. If iter_start_ts is nullptr,
+   * only the most recent version visible to timestamp is returned.
+   * The user-specified timestamp feature is still under active development,
+   * and the API is subject to change.
+   *
+   * Default: null
+   * @see #iterStartTs()
+   * @return Reference to timestamp or null if there is no timestamp defined.
+   */
+  public Slice timestamp() {
+    assert (isOwningHandle());
+    final long timestampSliceHandle = timestamp(nativeHandle_);
+    if (timestampSliceHandle != 0) {
+      return new Slice(timestampSliceHandle);
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Timestamp of operation. Read should return the latest data visible to the
+   * specified timestamp. All timestamps of the same database must be of the
+   * same length and format. The user is responsible for providing a customized
+   * compare function via Comparator to order {@code <key, timestamp>} tuples.
+   * For iterator, {@code iter_start_ts} is the lower bound (older) and timestamp
+   * serves as the upper bound. Versions of the same record that fall in
+   * the timestamp range will be returned. If iter_start_ts is nullptr,
+   * only the most recent version visible to timestamp is returned.
+   * The user-specified timestamp feature is still under active development,
+   * and the API is subject to change.
+   *
+   * Default: null
+   * @see #setIterStartTs(AbstractSlice)
+   * @param timestamp Slice representing the timestamp
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setTimestamp(final AbstractSlice<?> timestamp) {
+    assert (isOwningHandle());
+    setTimestamp(nativeHandle_, timestamp == null ? 0 : timestamp.getNativeHandle());
+    timestampSlice_ = timestamp;
+    return this;
+  }
+
+  /**
+   * Timestamp of operation. Read should return the latest data visible to the
+   * specified timestamp. All timestamps of the same database must be of the
+   * same length and format. The user is responsible for providing a customized
+   * compare function via Comparator to order {@code <key, timestamp>} tuples.
+   * For iterator, {@code iter_start_ts} is the lower bound (older) and timestamp
+   * serves as the upper bound. Versions of the same record that fall in
+   * the timestamp range will be returned. If iter_start_ts is nullptr,
+   * only the most recent version visible to timestamp is returned.
+   * The user-specified timestamp feature is still under active development,
+   * and the API is subject to change.
+   *
+   * Default: null
+   * @return Reference to lower bound timestamp or null if there is no lower bound timestamp
+   *     defined.
+   */
+  public Slice iterStartTs() {
+    assert (isOwningHandle());
+    final long iterStartTsHandle = iterStartTs(nativeHandle_);
+    if (iterStartTsHandle != 0) {
+      return new Slice(iterStartTsHandle);
+    } else {
+      return null;
+    }
+  }
+
+  /**
+   * Timestamp of operation. Read should return the latest data visible to the
+   * specified timestamp. All timestamps of the same database must be of the
+   * same length and format. The user is responsible for providing a customized
+   * compare function via Comparator to order {@code <key, timestamp>} tuples.
+   * For iterator, {@code iter_start_ts} is the lower bound (older) and timestamp
+   * serves as the upper bound. Versions of the same record that fall in
+   * the timestamp range will be returned. If iter_start_ts is nullptr,
+   * only the most recent version visible to timestamp is returned.
+   * The user-specified timestamp feature is still under active development,
+   * and the API is subject to change.
+   *
+   * Default: null
+   *
+   * @param iterStartTs Reference to lower bound timestamp or null if there is no lower bound
+   *     timestamp defined
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setIterStartTs(final AbstractSlice<?> iterStartTs) {
+    assert (isOwningHandle());
+    setIterStartTs(nativeHandle_, iterStartTs == null ? 0 : iterStartTs.getNativeHandle());
+    iterStartTs_ = iterStartTs;
+    return this;
+  }
+
+  /**
+   * Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
+   * in microseconds.
+   * It should be set to microseconds since epoch, i.e, {@code gettimeofday} or
+   * equivalent plus allowed duration in microseconds. The best way is to use
+   * {@code env->NowMicros() + some timeout}.
+   * This is best efforts. The call may exceed the deadline if there is IO
+   * involved and the file system doesn't support deadlines, or due to
+   * checking for deadline periodically rather than for every key if
+   * processing a batch
+   *
+   * @return deadline time in microseconds
+   */
+  public long deadline() {
+    assert (isOwningHandle());
+    return deadline(nativeHandle_);
+  }
+
+  /**
+   * Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
+   * in microseconds.
+   * It should be set to microseconds since epoch, i.e, {@code gettimeofday} or
+   * equivalent plus allowed duration in microseconds. The best way is to use
+   * {@code env->NowMicros() + some timeout}.
+   * This is best efforts. The call may exceed the deadline if there is IO
+   * involved and the file system doesn't support deadlines, or due to
+   * checking for deadline periodically rather than for every key if
+   * processing a batch
+   *
+   * @param deadlineTime deadline time in microseconds.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setDeadline(final long deadlineTime) {
+    assert (isOwningHandle());
+    setDeadline(nativeHandle_, deadlineTime);
+    return this;
+  }
+
+  /**
+   * A timeout in microseconds to be passed to the underlying FileSystem for
+   * reads. As opposed to deadline, this determines the timeout for each
+   * individual file read request. If a MultiGet/Get/Seek/Next etc call
+   * results in multiple reads, each read can last up to io_timeout us.
+   * @return ioTimeout time in microseconds
+   */
+  public long ioTimeout() {
+    assert (isOwningHandle());
+    return ioTimeout(nativeHandle_);
+  }
+
+  /**
+   * A timeout in microseconds to be passed to the underlying FileSystem for
+   * reads. As opposed to deadline, this determines the timeout for each
+   * individual file read request. If a MultiGet/Get/Seek/Next etc call
+   * results in multiple reads, each read can last up to io_timeout us.
+   *
+   * @param ioTimeout time in microseconds.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setIoTimeout(final long ioTimeout) {
+    assert (isOwningHandle());
+    setIoTimeout(nativeHandle_, ioTimeout);
+    return this;
+  }
+
+  /**
+   * It limits the maximum cumulative value size of the keys in batch while
+   * reading through MultiGet. Once the cumulative value size exceeds this
+   * soft limit then all the remaining keys are returned with status Aborted.
+   *
+   * Default: {@code std::numeric_limits<uint64_t>::max()}
+   * @return actual valueSizeSofLimit
+   */
+  public long valueSizeSoftLimit() {
+    assert (isOwningHandle());
+    return valueSizeSoftLimit(nativeHandle_);
+  }
+
+  /**
+   * It limits the maximum cumulative value size of the keys in batch while
+   * reading through MultiGet. Once the cumulative value size exceeds this
+   * soft limit then all the remaining keys are returned with status Aborted.
+   *
+   * Default: {@code std::numeric_limits<uint64_t>::max()}
+   *
+   * @param valueSizeSoftLimit the maximum cumulative value size of the keys
+   * @return the reference to the current ReadOptions
+   */
+  public ReadOptions setValueSizeSoftLimit(final long valueSizeSoftLimit) {
+    assert (isOwningHandle());
+    setValueSizeSoftLimit(nativeHandle_, valueSizeSoftLimit);
+    return this;
+  }
+
+  // instance variables
+  // NOTE: If you add new member variables, please update the copy constructor above!
+  //
+  // Hold a reference to any iterate lower or upper bound that was set on this
+  // object until we're destroyed or it's overwritten. That way the caller can
+  // freely leave scope without us losing the Java Slice object, which during
+  // close() would also reap its associated rocksdb::Slice native object since
+  // it's possibly (likely) to be an owning handle.
+  private AbstractSlice<?> iterateLowerBoundSlice_;
+  private AbstractSlice<?> iterateUpperBoundSlice_;
+  private AbstractSlice<?> timestampSlice_;
+  private AbstractSlice<?> iterStartTs_;
+
+  private native static long newReadOptions();
+  private native static long newReadOptions(final boolean verifyChecksums,
+    final boolean fillCache);
+  private native static long copyReadOptions(long handle);
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native boolean verifyChecksums(long handle);
+  private native void setVerifyChecksums(long handle, boolean verifyChecksums);
+  private native boolean fillCache(long handle);
+  private native void setFillCache(long handle, boolean fillCache);
+  private native long snapshot(long handle);
+  private native void setSnapshot(long handle, long snapshotHandle);
+  private native byte readTier(long handle);
+  private native void setReadTier(long handle, byte readTierValue);
+  private native boolean tailing(long handle);
+  private native void setTailing(long handle, boolean tailing);
+  private native boolean managed(long handle);
+  private native void setManaged(long handle, boolean managed);
+  private native boolean totalOrderSeek(long handle);
+  private native void setTotalOrderSeek(long handle, boolean totalOrderSeek);
+  private native boolean prefixSameAsStart(long handle);
+  private native void setPrefixSameAsStart(long handle, boolean prefixSameAsStart);
+  private native boolean pinData(long handle);
+  private native void setPinData(long handle, boolean pinData);
+  private native boolean backgroundPurgeOnIteratorCleanup(final long handle);
+  private native void setBackgroundPurgeOnIteratorCleanup(final long handle,
+      final boolean backgroundPurgeOnIteratorCleanup);
+  private native long readaheadSize(final long handle);
+  private native void setReadaheadSize(final long handle,
+      final long readaheadSize);
+  private native long maxSkippableInternalKeys(final long handle);
+  private native void setMaxSkippableInternalKeys(final long handle,
+      final long maxSkippableInternalKeys);
+  private native boolean ignoreRangeDeletions(final long handle);
+  private native void setIgnoreRangeDeletions(final long handle,
+      final boolean ignoreRangeDeletions);
+  private native void setIterateUpperBound(final long handle,
+      final long upperBoundSliceHandle);
+  private native long iterateUpperBound(final long handle);
+  private native void setIterateLowerBound(final long handle,
+      final long lowerBoundSliceHandle);
+  private native long iterateLowerBound(final long handle);
+  private native void setTableFilter(final long handle, final long tableFilterHandle);
+  private native boolean autoPrefixMode(final long handle);
+  private native void setAutoPrefixMode(final long handle, final boolean autoPrefixMode);
+  private native long timestamp(final long handle);
+  private native void setTimestamp(final long handle, final long timestampSliceHandle);
+  private native long iterStartTs(final long handle);
+  private native void setIterStartTs(final long handle, final long iterStartTsHandle);
+  private native long deadline(final long handle);
+  private native void setDeadline(final long handle, final long deadlineTime);
+  private native long ioTimeout(final long handle);
+  private native void setIoTimeout(final long handle, final long ioTimeout);
+  private native long valueSizeSoftLimit(final long handle);
+  private native void setValueSizeSoftLimit(final long handle, final long softLimit);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java b/src/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java
new file mode 100644
index 000000000..78f83f6ad
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java
@@ -0,0 +1,49 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * RocksDB {@link ReadOptions} read tiers.
+ */
+public enum ReadTier {
+  READ_ALL_TIER((byte)0),
+  BLOCK_CACHE_TIER((byte)1),
+  PERSISTED_TIER((byte)2),
+  MEMTABLE_TIER((byte)3);
+
+  private final byte value;
+
+  ReadTier(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get ReadTier by byte value.
+   *
+   * @param value byte representation of ReadTier.
+   *
+   * @return {@link org.rocksdb.ReadTier} instance or null.
+   * @throws java.lang.IllegalArgumentException if an invalid
+   *     value is provided.
+   */
+  public static ReadTier getReadTier(final byte value) {
+    for (final ReadTier readTier : ReadTier.values()) {
+      if (readTier.getValue() == value){
+        return readTier;
+      }
+    }
+    throw new IllegalArgumentException("Illegal value provided for ReadTier.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java
new file mode 100644
index 000000000..6ee81d858
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java
@@ -0,0 +1,18 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Just a Java wrapper around EmptyValueCompactionFilter implemented in C++
+ */
+public class RemoveEmptyValueCompactionFilter
+    extends AbstractCompactionFilter<Slice> {
+  public RemoveEmptyValueCompactionFilter() {
+    super(createNewRemoveEmptyValueCompactionFilter0());
+  }
+
+  private native static long createNewRemoveEmptyValueCompactionFilter0();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java
new file mode 100644
index 000000000..54dc0e61c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * RestoreOptions to control the behavior of restore.
+ *
+ * Note that dispose() must be called before this instance become out-of-scope
+ * to release the allocated memory in c++.
+ *
+ */
+public class RestoreOptions extends RocksObject {
+  /**
+   * Constructor
+   *
+   * @param keepLogFiles If true, restore won't overwrite the existing log files
+   *   in wal_dir. It will also move all log files from archive directory to
+   *   wal_dir. Use this option in combination with
+   *   BackupEngineOptions::backup_log_files = false for persisting in-memory
+   *   databases.
+   *   Default: false
+   */
+  public RestoreOptions(final boolean keepLogFiles) {
+    super(newRestoreOptions(keepLogFiles));
+  }
+
+  private native static long newRestoreOptions(boolean keepLogFiles);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ReusedSynchronisationType.java b/src/rocksdb/java/src/main/java/org/rocksdb/ReusedSynchronisationType.java
new file mode 100644
index 000000000..2709a5d59
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ReusedSynchronisationType.java
@@ -0,0 +1,65 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * Determines the type of synchronisation primitive used
+ * in native code.
+ */
+public enum ReusedSynchronisationType {
+  /**
+   * Standard mutex.
+   */
+  MUTEX((byte)0x0),
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to kernel. This could reduce context switch when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   */
+  ADAPTIVE_MUTEX((byte)0x1),
+
+  /**
+   * There is a reused buffer per-thread.
+   */
+  THREAD_LOCAL((byte)0x2);
+
+  private final byte value;
+
+  ReusedSynchronisationType(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Returns the byte value of the enumerations value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get ReusedSynchronisationType by byte value.
+   *
+   * @param value byte representation of ReusedSynchronisationType.
+   *
+   * @return {@link org.rocksdb.ReusedSynchronisationType} instance.
+   * @throws java.lang.IllegalArgumentException if an invalid
+   *     value is provided.
+   */
+  public static ReusedSynchronisationType getReusedSynchronisationType(
+      final byte value) {
+    for (final ReusedSynchronisationType reusedSynchronisationType
+        : ReusedSynchronisationType.values()) {
+      if (reusedSynchronisationType.getValue() == value) {
+        return reusedSynchronisationType;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Illegal value provided for ReusedSynchronisationType.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java
new file mode 100644
index 000000000..8d7a867ee
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java
@@ -0,0 +1,73 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * RocksCallbackObject is similar to {@link RocksObject} but varies
+ * in its construction as it is designed for Java objects which have functions
+ * which are called from C++ via JNI.
+ *
+ * RocksCallbackObject is the base-class any RocksDB classes that acts as a
+ * callback from some underlying underlying native C++ {@code rocksdb} object.
+ *
+ * The use of {@code RocksObject} should always be preferred over
+ * {@link RocksCallbackObject} if callbacks are not required.
+ */
+public abstract class RocksCallbackObject extends
+    AbstractImmutableNativeReference {
+
+  protected final long nativeHandle_;
+
+  protected RocksCallbackObject(final long... nativeParameterHandles) {
+    super(true);
+    this.nativeHandle_ = initializeNative(nativeParameterHandles);
+  }
+
+  /**
+   * Given a list of RocksCallbackObjects, it returns a list
+   * of the native handles of the underlying objects.
+   *
+   * @param objectList the rocks callback objects
+   *
+   * @return the native handles
+   */
+  static /* @Nullable */ long[] toNativeHandleList(
+      /* @Nullable */ final List<? extends RocksCallbackObject> objectList) {
+    if (objectList == null) {
+      return null;
+    }
+    final int len = objectList.size();
+    final long[] handleList = new long[len];
+    for (int i = 0; i < len; i++) {
+      handleList[i] = objectList.get(i).nativeHandle_;
+    }
+    return handleList;
+  }
+
+  /**
+   * Construct the Native C++ object which will callback
+   * to our object methods
+   *
+   * @param nativeParameterHandles An array of native handles for any parameter
+   *     objects that are needed during construction
+   *
+   * @return The native handle of the C++ object which will callback to us
+   */
+  protected abstract long initializeNative(
+      final long... nativeParameterHandles);
+
+  /**
+   * Deletes underlying C++ native callback object pointer
+   */
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java
new file mode 100644
index 000000000..77484288f
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java
@@ -0,0 +1,4694 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicReference;
+import org.rocksdb.util.Environment;
+
+/**
+ * A RocksDB is a persistent ordered map from keys to values.  It is safe for
+ * concurrent access from multiple threads without any external synchronization.
+ * All methods of this class could potentially throw RocksDBException, which
+ * indicates sth wrong at the RocksDB library side and the call failed.
+ */
+public class RocksDB extends RocksObject {
+  public static final byte[] DEFAULT_COLUMN_FAMILY = "default".getBytes(UTF_8);
+  public static final int NOT_FOUND = -1;
+
+  private enum LibraryState {
+    NOT_LOADED,
+    LOADING,
+    LOADED
+  }
+
+  private static final AtomicReference<LibraryState> libraryLoaded =
+      new AtomicReference<>(LibraryState.NOT_LOADED);
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  private final List<ColumnFamilyHandle> ownedColumnFamilyHandles = new ArrayList<>();
+
+  /**
+   * Loads the necessary library files.
+   * Calling this method twice will have no effect.
+   * By default the method extracts the shared library for loading at
+   * java.io.tmpdir, however, you can override this temporary location by
+   * setting the environment variable ROCKSDB_SHAREDLIB_DIR.
+   */
+  public static void loadLibrary() {
+    if (libraryLoaded.get() == LibraryState.LOADED) {
+      return;
+    }
+
+    if (libraryLoaded.compareAndSet(LibraryState.NOT_LOADED,
+        LibraryState.LOADING)) {
+      final String tmpDir = System.getenv("ROCKSDB_SHAREDLIB_DIR");
+      // loading possibly necessary libraries.
+      for (final CompressionType compressionType : CompressionType.values()) {
+        try {
+          if (compressionType.getLibraryName() != null) {
+            System.loadLibrary(compressionType.getLibraryName());
+          }
+        } catch (final UnsatisfiedLinkError e) {
+          // since it may be optional, we ignore its loading failure here.
+        }
+      }
+      try {
+        NativeLibraryLoader.getInstance().loadLibrary(tmpDir);
+      } catch (final IOException e) {
+        libraryLoaded.set(LibraryState.NOT_LOADED);
+        throw new RuntimeException("Unable to load the RocksDB shared library",
+            e);
+      }
+
+      final int encodedVersion = version();
+      version = Version.fromEncodedVersion(encodedVersion);
+
+      libraryLoaded.set(LibraryState.LOADED);
+      return;
+    }
+
+    while (libraryLoaded.get() == LibraryState.LOADING) {
+      try {
+        Thread.sleep(10);
+      } catch(final InterruptedException e) {
+        //ignore
+      }
+    }
+  }
+
+  /**
+   * Tries to load the necessary library files from the given list of
+   * directories.
+   *
+   * @param paths a list of strings where each describes a directory
+   *     of a library.
+   */
+  public static void loadLibrary(final List<String> paths) {
+    if (libraryLoaded.get() == LibraryState.LOADED) {
+      return;
+    }
+
+    if (libraryLoaded.compareAndSet(LibraryState.NOT_LOADED,
+        LibraryState.LOADING)) {
+      for (final CompressionType compressionType : CompressionType.values()) {
+        if (compressionType.equals(CompressionType.NO_COMPRESSION)) {
+          continue;
+        }
+        for (final String path : paths) {
+          try {
+            System.load(path + "/" + Environment.getSharedLibraryFileName(
+                compressionType.getLibraryName()));
+            break;
+          } catch (final UnsatisfiedLinkError e) {
+            // since they are optional, we ignore loading fails.
+          }
+        }
+      }
+      boolean success = false;
+      UnsatisfiedLinkError err = null;
+      for (final String path : paths) {
+        try {
+          System.load(path + "/" +
+              Environment.getJniLibraryFileName("rocksdbjni"));
+          success = true;
+          break;
+        } catch (final UnsatisfiedLinkError e) {
+          err = e;
+        }
+      }
+      if (!success) {
+        libraryLoaded.set(LibraryState.NOT_LOADED);
+        throw err;
+      }
+
+      final int encodedVersion = version();
+      version = Version.fromEncodedVersion(encodedVersion);
+
+      libraryLoaded.set(LibraryState.LOADED);
+      return;
+    }
+
+    while (libraryLoaded.get() == LibraryState.LOADING) {
+      try {
+        Thread.sleep(10);
+      } catch(final InterruptedException e) {
+        //ignore
+      }
+    }
+  }
+
+  public static Version rocksdbVersion() {
+    return version;
+  }
+
+  /**
+   * Private constructor.
+   *
+   * @param nativeHandle The native handle of the C++ RocksDB object
+   */
+  protected RocksDB(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the default options w/ createIfMissing
+   * set to true.
+   *
+   * @param path the path to the rocksdb.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @see Options#setCreateIfMissing(boolean)
+   */
+  public static RocksDB open(final String path) throws RocksDBException {
+    final Options options = new Options();
+    options.setCreateIfMissing(true);
+    return open(options, path);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the specified options and db path and a list
+   * of column family names.
+   * <p>
+   * If opened in read write mode every existing column family name must be
+   * passed within the list to this method.</p>
+   * <p>
+   * If opened in read-only mode only a subset of existing column families must
+   * be passed to this method.</p>
+   * <p>
+   * Options instance *should* not be disposed before all DBs using this options
+   * instance have been closed. If user doesn't call options dispose explicitly,
+   * then this options instance will be GC'd automatically</p>
+   * <p>
+   * ColumnFamily handles are disposed when the RocksDB instance is disposed.
+   * </p>
+   *
+   * @param path the path to the rocksdb.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @see DBOptions#setCreateIfMissing(boolean)
+   */
+  public static RocksDB open(final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+    final DBOptions options = new DBOptions();
+    return open(options, path, columnFamilyDescriptors, columnFamilyHandles);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the specified options and db path.
+   *
+   * <p>
+   * Options instance *should* not be disposed before all DBs using this options
+   * instance have been closed. If user doesn't call options dispose explicitly,
+   * then this options instance will be GC'd automatically.</p>
+   * <p>
+   * Options instance can be re-used to open multiple DBs if DB statistics is
+   * not used. If DB statistics are required, then its recommended to open DB
+   * with new Options instance as underlying native statistics instance does not
+   * use any locks to prevent concurrent updates.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param path the path to the rocksdb.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   *
+   * @see Options#setCreateIfMissing(boolean)
+   */
+  public static RocksDB open(final Options options, final String path)
+      throws RocksDBException {
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+    final RocksDB db = new RocksDB(open(options.nativeHandle_, path));
+    db.storeOptionsInstance(options);
+    return db;
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the specified options and db path and a list
+   * of column family names.
+   * <p>
+   * If opened in read write mode every existing column family name must be
+   * passed within the list to this method.</p>
+   * <p>
+   * If opened in read-only mode only a subset of existing column families must
+   * be passed to this method.</p>
+   * <p>
+   * Options instance *should* not be disposed before all DBs using this options
+   * instance have been closed. If user doesn't call options dispose explicitly,
+   * then this options instance will be GC'd automatically.</p>
+   * <p>
+   * Options instance can be re-used to open multiple DBs if DB statistics is
+   * not used. If DB statistics are required, then its recommended to open DB
+   * with new Options instance as underlying native statistics instance does not
+   * use any locks to prevent concurrent updates.</p>
+   * <p>
+   * ColumnFamily handles are disposed when the RocksDB instance is disposed.
+   * </p>
+   *
+   * @param options {@link org.rocksdb.DBOptions} instance.
+   * @param path the path to the rocksdb.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   *
+   * @see DBOptions#setCreateIfMissing(boolean)
+   */
+  public static RocksDB open(final DBOptions options, final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+
+    final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][];
+    final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()];
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors
+          .get(i);
+      cfNames[i] = cfDescriptor.getName();
+      cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_;
+    }
+
+    final long[] handles = open(options.nativeHandle_, path, cfNames,
+        cfOptionHandles);
+    final RocksDB db = new RocksDB(handles[0]);
+    db.storeOptionsInstance(options);
+
+    for (int i = 1; i < handles.length; i++) {
+      final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]);
+      columnFamilyHandles.add(columnFamilyHandle);
+    }
+
+    db.ownedColumnFamilyHandles.addAll(columnFamilyHandles);
+
+    return db;
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the default
+   * options.
+   *
+   * @param path the path to the RocksDB.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final String path)
+      throws RocksDBException {
+    // This allows to use the rocksjni default Options instead of
+    // the c++ one.
+    final Options options = new Options();
+    return openReadOnly(options, path);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the specified
+   * options and db path.
+   *
+   * Options instance *should* not be disposed before all DBs using this options
+   * instance have been closed. If user doesn't call options dispose explicitly,
+   * then this options instance will be GC'd automatically.
+   *
+   * @param options {@link Options} instance.
+   * @param path the path to the RocksDB.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final Options options, final String path)
+      throws RocksDBException {
+    return openReadOnly(options, path, false);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the specified
+   * options and db path.
+   *
+   * Options instance *should* not be disposed before all DBs using this options
+   * instance have been closed. If user doesn't call options dispose explicitly,
+   * then this options instance will be GC'd automatically.
+   *
+   * @param options {@link Options} instance.
+   * @param path the path to the RocksDB.
+   * @param errorIfWalFileExists true to raise an error when opening the db
+   *            if a Write Ahead Log file exists, false otherwise.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final Options options, final String path,
+      final boolean errorIfWalFileExists) throws RocksDBException {
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+    final RocksDB db = new RocksDB(openROnly(options.nativeHandle_, path, errorIfWalFileExists));
+    db.storeOptionsInstance(options);
+    return db;
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the default
+   * options.
+   *
+   * @param path the path to the RocksDB.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+    // This allows to use the rocksjni default Options instead of
+    // the c++ one.
+    final DBOptions options = new DBOptions();
+    return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the specified
+   * options and db path.
+   *
+   * <p>This open method allows to open RocksDB using a subset of available
+   * column families</p>
+   * <p>Options instance *should* not be disposed before all DBs using this
+   * options instance have been closed. If user doesn't call options dispose
+   * explicitly,then this options instance will be GC'd automatically.</p>
+   *
+   * @param options {@link DBOptions} instance.
+   * @param path the path to the RocksDB.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final DBOptions options, final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles) throws RocksDBException {
+    return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the specified
+   * options and db path.
+   *
+   * <p>This open method allows to open RocksDB using a subset of available
+   * column families</p>
+   * <p>Options instance *should* not be disposed before all DBs using this
+   * options instance have been closed. If user doesn't call options dispose
+   * explicitly,then this options instance will be GC'd automatically.</p>
+   *
+   * @param options {@link DBOptions} instance.
+   * @param path the path to the RocksDB.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @param errorIfWalFileExists true to raise an error when opening the db
+   *            if a Write Ahead Log file exists, false otherwise.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final DBOptions options, final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles, final boolean errorIfWalFileExists)
+      throws RocksDBException {
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+
+    final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][];
+    final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()];
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors
+          .get(i);
+      cfNames[i] = cfDescriptor.getName();
+      cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_;
+    }
+
+    final long[] handles =
+        openROnly(options.nativeHandle_, path, cfNames, cfOptionHandles, errorIfWalFileExists);
+    final RocksDB db = new RocksDB(handles[0]);
+    db.storeOptionsInstance(options);
+
+    for (int i = 1; i < handles.length; i++) {
+      final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]);
+      columnFamilyHandles.add(columnFamilyHandle);
+    }
+
+    db.ownedColumnFamilyHandles.addAll(columnFamilyHandles);
+
+    return db;
+  }
+
+  /**
+   * Open DB as secondary instance with only the default column family.
+   *
+   * The secondary instance can dynamically tail the MANIFEST of
+   * a primary that must have already been created. User can call
+   * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up
+   * with primary (WAL tailing is NOT supported now) whenever the user feels
+   * necessary. Column families created by the primary after the secondary
+   * instance starts are currently ignored by the secondary instance.
+   * Column families opened by secondary and dropped by the primary will be
+   * dropped by secondary as well. However the user of the secondary instance
+   * can still access the data of such dropped column family as long as they
+   * do not destroy the corresponding column family handle.
+   * WAL tailing is not supported at present, but will arrive soon.
+   *
+   * @param options the options to open the secondary instance.
+   * @param path the path to the primary RocksDB instance.
+   * @param secondaryPath points to a directory where the secondary instance
+   *    stores its info log
+   *
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openAsSecondary(final Options options, final String path,
+      final String secondaryPath) throws RocksDBException {
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+    final RocksDB db = new RocksDB(openAsSecondary(options.nativeHandle_, path, secondaryPath));
+    db.storeOptionsInstance(options);
+    return db;
+  }
+
+  /**
+   * Open DB as secondary instance with column families.
+   * You can open a subset of column families in secondary mode.
+   *
+   * The secondary instance can dynamically tail the MANIFEST of
+   * a primary that must have already been created. User can call
+   * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up
+   * with primary (WAL tailing is NOT supported now) whenever the user feels
+   * necessary. Column families created by the primary after the secondary
+   * instance starts are currently ignored by the secondary instance.
+   * Column families opened by secondary and dropped by the primary will be
+   * dropped by secondary as well. However the user of the secondary instance
+   * can still access the data of such dropped column family as long as they
+   * do not destroy the corresponding column family handle.
+   * WAL tailing is not supported at present, but will arrive soon.
+   *
+   * @param options the options to open the secondary instance.
+   * @param path the path to the primary RocksDB instance.
+   * @param secondaryPath points to a directory where the secondary instance
+   *    stores its info log.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   *
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openAsSecondary(final DBOptions options, final String path,
+      final String secondaryPath, final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles) throws RocksDBException {
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+
+    final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][];
+    final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()];
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors.get(i);
+      cfNames[i] = cfDescriptor.getName();
+      cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_;
+    }
+
+    final long[] handles =
+        openAsSecondary(options.nativeHandle_, path, secondaryPath, cfNames, cfOptionHandles);
+    final RocksDB db = new RocksDB(handles[0]);
+    db.storeOptionsInstance(options);
+
+    for (int i = 1; i < handles.length; i++) {
+      final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]);
+      columnFamilyHandles.add(columnFamilyHandle);
+    }
+
+    db.ownedColumnFamilyHandles.addAll(columnFamilyHandles);
+
+    return db;
+  }
+
+  /**
+   * This is similar to {@link #close()} except that it
+   * throws an exception if any error occurs.
+   *
+   * This will not fsync the WAL files.
+   * If syncing is required, the caller must first call {@link #syncWal()}
+   * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch
+   * with {@link WriteOptions#setSync(boolean)} set to true.
+   *
+   * See also {@link #close()}.
+   *
+   * @throws RocksDBException if an error occurs whilst closing.
+   */
+  public void closeE() throws RocksDBException {
+    for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) {
+      columnFamilyHandle.close();
+    }
+    ownedColumnFamilyHandles.clear();
+
+    if (owningHandle_.compareAndSet(true, false)) {
+      try {
+        closeDatabase(nativeHandle_);
+      } finally {
+        disposeInternal();
+      }
+    }
+  }
+
+  /**
+   * This is similar to {@link #closeE()} except that it
+   * silently ignores any errors.
+   *
+   * This will not fsync the WAL files.
+   * If syncing is required, the caller must first call {@link #syncWal()}
+   * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch
+   * with {@link WriteOptions#setSync(boolean)} set to true.
+   *
+   * See also {@link #close()}.
+   */
+  @Override
+  public void close() {
+    for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) {
+      columnFamilyHandle.close();
+    }
+    ownedColumnFamilyHandles.clear();
+
+    if (owningHandle_.compareAndSet(true, false)) {
+      try {
+        closeDatabase(nativeHandle_);
+      } catch (final RocksDBException e) {
+        // silently ignore the error report
+      } finally {
+        disposeInternal();
+      }
+    }
+  }
+
+  /**
+   * Static method to determine all available column families for a
+   * rocksdb database identified by path
+   *
+   * @param options Options for opening the database
+   * @param path Absolute path to rocksdb database
+   * @return List&lt;byte[]&gt; List containing the column family names
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static List<byte[]> listColumnFamilies(final Options options,
+      final String path) throws RocksDBException {
+    return Arrays.asList(RocksDB.listColumnFamilies(options.nativeHandle_,
+        path));
+  }
+
+  /**
+   * Creates a new column family with the name columnFamilyName and
+   * allocates a ColumnFamilyHandle within an internal structure.
+   * The ColumnFamilyHandle is automatically disposed with DB disposal.
+   *
+   * @param columnFamilyDescriptor column family to be created.
+   * @return {@link org.rocksdb.ColumnFamilyHandle} instance.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public ColumnFamilyHandle createColumnFamily(
+      final ColumnFamilyDescriptor columnFamilyDescriptor)
+      throws RocksDBException {
+    final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this,
+        createColumnFamily(nativeHandle_, columnFamilyDescriptor.getName(),
+            columnFamilyDescriptor.getName().length,
+            columnFamilyDescriptor.getOptions().nativeHandle_));
+    ownedColumnFamilyHandles.add(columnFamilyHandle);
+    return columnFamilyHandle;
+  }
+
+  /**
+   * Bulk create column families with the same column family options.
+   *
+   * @param columnFamilyOptions the options for the column families.
+   * @param columnFamilyNames the names of the column families.
+   *
+   * @return the handles to the newly created column families.
+   *
+   * @throws RocksDBException if an error occurs whilst creating
+   *     the column families
+   */
+  public List<ColumnFamilyHandle> createColumnFamilies(
+      final ColumnFamilyOptions columnFamilyOptions,
+      final List<byte[]> columnFamilyNames) throws RocksDBException {
+    final byte[][] cfNames = columnFamilyNames.toArray(
+        new byte[0][]);
+    final long[] cfHandles = createColumnFamilies(nativeHandle_,
+        columnFamilyOptions.nativeHandle_, cfNames);
+    final List<ColumnFamilyHandle> columnFamilyHandles =
+        new ArrayList<>(cfHandles.length);
+    for (int i = 0; i < cfHandles.length; i++) {
+      final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]);
+      columnFamilyHandles.add(columnFamilyHandle);
+    }
+    ownedColumnFamilyHandles.addAll(columnFamilyHandles);
+    return columnFamilyHandles;
+  }
+
+  /**
+   * Bulk create column families with the same column family options.
+   *
+   * @param columnFamilyDescriptors the descriptions of the column families.
+   *
+   * @return the handles to the newly created column families.
+   *
+   * @throws RocksDBException if an error occurs whilst creating
+   *     the column families
+   */
+  public List<ColumnFamilyHandle> createColumnFamilies(
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors)
+      throws RocksDBException {
+    final long[] cfOptsHandles = new long[columnFamilyDescriptors.size()];
+    final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][];
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      final ColumnFamilyDescriptor columnFamilyDescriptor
+          = columnFamilyDescriptors.get(i);
+      cfOptsHandles[i] = columnFamilyDescriptor.getOptions().nativeHandle_;
+      cfNames[i] = columnFamilyDescriptor.getName();
+    }
+    final long[] cfHandles = createColumnFamilies(nativeHandle_,
+        cfOptsHandles, cfNames);
+    final List<ColumnFamilyHandle> columnFamilyHandles =
+        new ArrayList<>(cfHandles.length);
+    for (int i = 0; i < cfHandles.length; i++) {
+      final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]);
+      columnFamilyHandles.add(columnFamilyHandle);
+    }
+    ownedColumnFamilyHandles.addAll(columnFamilyHandles);
+    return columnFamilyHandles;
+  }
+
+  /**
+   * Drops the column family specified by {@code columnFamilyHandle}. This call
+   * only records a drop record in the manifest and prevents the column
+   * family from flushing and compacting.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void dropColumnFamily(final ColumnFamilyHandle columnFamilyHandle)
+      throws RocksDBException {
+    dropColumnFamily(nativeHandle_, columnFamilyHandle.nativeHandle_);
+  }
+
+  // Bulk drop column families. This call only records drop records in the
+  // manifest and prevents the column families from flushing and compacting.
+  // In case of error, the request may succeed partially. User may call
+  // ListColumnFamilies to check the result.
+  public void dropColumnFamilies(
+      final List<ColumnFamilyHandle> columnFamilies) throws RocksDBException {
+    final long[] cfHandles = new long[columnFamilies.size()];
+    for (int i = 0; i < columnFamilies.size(); i++) {
+      cfHandles[i] = columnFamilies.get(i).nativeHandle_;
+    }
+    dropColumnFamilies(nativeHandle_, cfHandles);
+  }
+
+  /**
+   * Deletes native column family handle of given {@link ColumnFamilyHandle} Java object
+   * and removes reference from {@link RocksDB#ownedColumnFamilyHandles}.
+   *
+   * @param columnFamilyHandle column family handle object.
+   */
+  public void destroyColumnFamilyHandle(final ColumnFamilyHandle columnFamilyHandle) {
+    for (int i = 0; i < ownedColumnFamilyHandles.size(); ++i) {
+      final ColumnFamilyHandle ownedHandle = ownedColumnFamilyHandles.get(i);
+      if (ownedHandle.equals(columnFamilyHandle)) {
+        columnFamilyHandle.close();
+        ownedColumnFamilyHandles.remove(i);
+        return;
+      }
+    }
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final byte[] key, final byte[] value)
+      throws RocksDBException {
+    put(nativeHandle_, key, 0, key.length, value, 0, value.length);
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param key The specified key to be inserted
+   * @param offset the offset of the "key" array to be used, must be
+   *    non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param value the value associated with the specified key
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and no larger than ("value".length -  offset)
+   *
+   * @throws RocksDBException thrown if errors happens in underlying native
+   *     library.
+   * @throws IndexOutOfBoundsException if an offset or length is out of bounds
+   */
+  public void put(final byte[] key, final int offset, final int len,
+      final byte[] value, final int vOffset, final int vLen)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    put(nativeHandle_, key, offset, len, value, vOffset, vLen);
+  }
+
+  /**
+   * Set the database entry for "key" to "value" in the specified
+   * column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * throws IllegalArgumentException if column family is not present
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final byte[] value) throws RocksDBException {
+    put(nativeHandle_, key, 0, key.length, value, 0, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Set the database entry for "key" to "value" in the specified
+   * column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key The specified key to be inserted
+   * @param offset the offset of the "key" array to be used, must
+   *     be non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param value the value associated with the specified key
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and no larger than ("value".length - offset)
+   *
+   * @throws RocksDBException thrown if errors happens in underlying native
+   *     library.
+   * @throws IndexOutOfBoundsException if an offset or length is out of bounds
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final int offset, final int len,
+      final byte[] value, final int vOffset, final int vLen)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    put(nativeHandle_, key, offset, len, value, vOffset, vLen,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param writeOpts {@link org.rocksdb.WriteOptions} instance.
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final WriteOptions writeOpts, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    put(nativeHandle_, writeOpts.nativeHandle_,
+        key, 0, key.length, value, 0, value.length);
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param writeOpts {@link org.rocksdb.WriteOptions} instance.
+   * @param key The specified key to be inserted
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param value the value associated with the specified key
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and no larger than ("value".length -  offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IndexOutOfBoundsException if an offset or length is out of bounds
+   */
+  public void put(final WriteOptions writeOpts,
+      final byte[] key, final int offset, final int len,
+      final byte[] value, final int vOffset, final int vLen)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    put(nativeHandle_, writeOpts.nativeHandle_,
+        key, offset, len, value, vOffset, vLen);
+  }
+
+  /**
+   * Set the database entry for "key" to "value" for the specified
+   * column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param writeOpts {@link org.rocksdb.WriteOptions} instance.
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * throws IllegalArgumentException if column family is not present
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @see IllegalArgumentException
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpts, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    put(nativeHandle_, writeOpts.nativeHandle_, key, 0, key.length, value,
+        0, value.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Set the database entry for "key" to "value" for the specified
+   * column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param writeOpts {@link org.rocksdb.WriteOptions} instance.
+   * @param key the specified key to be inserted. Position and limit is used.
+   *     Supports direct buffer only.
+   * @param value the value associated with the specified key. Position and limit is used.
+   *     Supports direct buffer only.
+   *
+   * throws IllegalArgumentException if column family is not present
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @see IllegalArgumentException
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts,
+      final ByteBuffer key, final ByteBuffer value) throws RocksDBException {
+    assert key.isDirect() && value.isDirect();
+    putDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), value,
+        value.position(), value.remaining(), columnFamilyHandle.nativeHandle_);
+    key.position(key.limit());
+    value.position(value.limit());
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param writeOpts {@link org.rocksdb.WriteOptions} instance.
+   * @param key the specified key to be inserted. Position and limit is used.
+   *     Supports direct buffer only.
+   * @param value the value associated with the specified key. Position and limit is used.
+   *     Supports direct buffer only.
+   *
+   * throws IllegalArgumentException if column family is not present
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @see IllegalArgumentException
+   */
+  public void put(final WriteOptions writeOpts, final ByteBuffer key, final ByteBuffer value)
+      throws RocksDBException {
+    assert key.isDirect() && value.isDirect();
+    putDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), value,
+        value.position(), value.remaining(), 0);
+    key.position(key.limit());
+    value.position(value.limit());
+  }
+
+  /**
+   * Set the database entry for "key" to "value" for the specified
+   * column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param writeOpts {@link org.rocksdb.WriteOptions} instance.
+   * @param key The specified key to be inserted
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param value the value associated with the specified key
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and no larger than ("value".length -  offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IndexOutOfBoundsException if an offset or length is out of bounds
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpts,
+      final byte[] key, final int offset, final int len,
+      final byte[] value, final int vOffset, final int vLen)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    put(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value,
+        vOffset, vLen, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final byte[] key) throws RocksDBException {
+    delete(nativeHandle_, key, 0, key.length);
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param key Key to delete within database
+   * @param offset the offset of the "key" array to be used, must be
+   *      non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be
+   *      non-negative and no larger than ("key".length - offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final byte[] key, final int offset, final int len)
+      throws RocksDBException {
+    delete(nativeHandle_, key, offset, len);
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key) throws RocksDBException {
+    delete(nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key Key to delete within database
+   * @param offset the offset of the "key" array to be used,
+   *     must be non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("value".length - offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final int offset, final int len)
+      throws RocksDBException {
+    delete(nativeHandle_, key, offset, len, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final WriteOptions writeOpt, final byte[] key)
+      throws RocksDBException {
+    delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length);
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param key Key to delete within database
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be
+   *     non-negative and no larger than ("key".length -  offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final WriteOptions writeOpt, final byte[] key,
+      final int offset, final int len) throws RocksDBException {
+    delete(nativeHandle_, writeOpt.nativeHandle_, key, offset, len);
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpt, final byte[] key)
+      throws RocksDBException {
+    delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param key Key to delete within database
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be
+   *     non-negative and no larger than ("key".length -  offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpt, final byte[] key, final int offset,
+      final int len)  throws RocksDBException {
+    delete(nativeHandle_, writeOpt.nativeHandle_, key, offset, len,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Get the value associated with the specified key within column family.
+   *
+   * @param opt {@link org.rocksdb.ReadOptions} instance.
+   * @param key the key to retrieve the value. It is using position and limit.
+   *     Supports direct buffer only.
+   * @param value the out-value to receive the retrieved value.
+   *     It is using position and limit. Limit is set according to value size.
+   *     Supports direct buffer only.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ReadOptions opt, final ByteBuffer key, final ByteBuffer value)
+      throws RocksDBException {
+    assert key.isDirect() && value.isDirect();
+    int result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), key.remaining(),
+        value, value.position(), value.remaining(), 0);
+    if (result != NOT_FOUND) {
+      value.limit(Math.min(value.limit(), value.position() + result));
+    }
+    key.position(key.limit());
+    return result;
+  }
+
+  /**
+   * Get the value associated with the specified key within column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param opt {@link org.rocksdb.ReadOptions} instance.
+   * @param key the key to retrieve the value. It is using position and limit.
+   *     Supports direct buffer only.
+   * @param value the out-value to receive the retrieved value.
+   *     It is using position and limit. Limit is set according to value size.
+   *     Supports direct buffer only.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt,
+      final ByteBuffer key, final ByteBuffer value) throws RocksDBException {
+    assert key.isDirect() && value.isDirect();
+    int result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), key.remaining(),
+        value, value.position(), value.remaining(), columnFamilyHandle.nativeHandle_);
+    if (result != NOT_FOUND) {
+      value.limit(Math.min(value.limit(), value.position() + result));
+    }
+    key.position(key.limit());
+    return result;
+  }
+
+  /**
+   * Remove the database entry for {@code key}. Requires that the key exists
+   * and was not overwritten. It is not an error if the key did not exist
+   * in the database.
+   *
+   * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple
+   * times), then the result of calling SingleDelete() on this key is undefined.
+   * SingleDelete() only behaves correctly if there has been only one Put()
+   * for this key since the previous call to SingleDelete() for this key.
+   *
+   * This feature is currently an experimental performance optimization
+   * for a very specific workload. It is up to the caller to ensure that
+   * SingleDelete is only used for a key that is not deleted using Delete() or
+   * written using Merge(). Mixing SingleDelete operations with Deletes and
+   * Merges can result in undefined behavior.
+   *
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final byte[] key) throws RocksDBException {
+    singleDelete(nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Remove the database entry for {@code key}. Requires that the key exists
+   * and was not overwritten. It is not an error if the key did not exist
+   * in the database.
+   *
+   * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple
+   * times), then the result of calling SingleDelete() on this key is undefined.
+   * SingleDelete() only behaves correctly if there has been only one Put()
+   * for this key since the previous call to SingleDelete() for this key.
+   *
+   * This feature is currently an experimental performance optimization
+   * for a very specific workload. It is up to the caller to ensure that
+   * SingleDelete is only used for a key that is not deleted using Delete() or
+   * written using Merge(). Mixing SingleDelete operations with Deletes and
+   * Merges can result in undefined behavior.
+   *
+   * @param columnFamilyHandle The column family to delete the key from
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key) throws RocksDBException {
+    singleDelete(nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Remove the database entry for {@code key}. Requires that the key exists
+   * and was not overwritten. It is not an error if the key did not exist
+   * in the database.
+   *
+   * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple
+   * times), then the result of calling SingleDelete() on this key is undefined.
+   * SingleDelete() only behaves correctly if there has been only one Put()
+   * for this key since the previous call to SingleDelete() for this key.
+   *
+   * This feature is currently an experimental performance optimization
+   * for a very specific workload. It is up to the caller to ensure that
+   * SingleDelete is only used for a key that is not deleted using Delete() or
+   * written using Merge(). Mixing SingleDelete operations with Deletes and
+   * Merges can result in undefined behavior.
+   *
+   * Note: consider setting {@link WriteOptions#setSync(boolean)} true.
+   *
+   * @param writeOpt Write options for the delete
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final WriteOptions writeOpt, final byte[] key)
+      throws RocksDBException {
+    singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Remove the database entry for {@code key}. Requires that the key exists
+   * and was not overwritten. It is not an error if the key did not exist
+   * in the database.
+   *
+   * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple
+   * times), then the result of calling SingleDelete() on this key is undefined.
+   * SingleDelete() only behaves correctly if there has been only one Put()
+   * for this key since the previous call to SingleDelete() for this key.
+   *
+   * This feature is currently an experimental performance optimization
+   * for a very specific workload. It is up to the caller to ensure that
+   * SingleDelete is only used for a key that is not deleted using Delete() or
+   * written using Merge(). Mixing SingleDelete operations with Deletes and
+   * Merges can result in undefined behavior.
+   *
+   * Note: consider setting {@link WriteOptions#setSync(boolean)} true.
+   *
+   * @param columnFamilyHandle The column family to delete the key from
+   * @param writeOpt Write options for the delete
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpt, final byte[] key) throws RocksDBException {
+    singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+
+  /**
+   * Removes the database entries in the range ["beginKey", "endKey"), i.e.,
+   * including "beginKey" and excluding "endKey". a non-OK status on error. It
+   * is not an error if no keys exist in the range ["beginKey", "endKey").
+   *
+   * Delete the database entry (if any) for "key". Returns OK on success, and a
+   * non-OK status on error. It is not an error if "key" did not exist in the
+   * database.
+   *
+   * @param beginKey First key to delete within database (inclusive)
+   * @param endKey Last key to delete within database (exclusive)
+   *
+   * @throws RocksDBException thrown if error happens in underlying native
+   *     library.
+   */
+  public void deleteRange(final byte[] beginKey, final byte[] endKey)
+      throws RocksDBException {
+    deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0,
+        endKey.length);
+  }
+
+  /**
+   * Removes the database entries in the range ["beginKey", "endKey"), i.e.,
+   * including "beginKey" and excluding "endKey". a non-OK status on error. It
+   * is not an error if no keys exist in the range ["beginKey", "endKey").
+   *
+   * Delete the database entry (if any) for "key". Returns OK on success, and a
+   * non-OK status on error. It is not an error if "key" did not exist in the
+   * database.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance
+   * @param beginKey First key to delete within database (inclusive)
+   * @param endKey Last key to delete within database (exclusive)
+   *
+   * @throws RocksDBException thrown if error happens in underlying native
+   *     library.
+   */
+  public void deleteRange(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] beginKey, final byte[] endKey) throws RocksDBException {
+    deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0,
+        endKey.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Removes the database entries in the range ["beginKey", "endKey"), i.e.,
+   * including "beginKey" and excluding "endKey". a non-OK status on error. It
+   * is not an error if no keys exist in the range ["beginKey", "endKey").
+   *
+   * Delete the database entry (if any) for "key". Returns OK on success, and a
+   * non-OK status on error. It is not an error if "key" did not exist in the
+   * database.
+   *
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param beginKey First key to delete within database (inclusive)
+   * @param endKey Last key to delete within database (exclusive)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public void deleteRange(final WriteOptions writeOpt, final byte[] beginKey,
+      final byte[] endKey) throws RocksDBException {
+    deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0,
+        beginKey.length, endKey, 0, endKey.length);
+  }
+
+  /**
+   * Removes the database entries in the range ["beginKey", "endKey"), i.e.,
+   * including "beginKey" and excluding "endKey". a non-OK status on error. It
+   * is not an error if no keys exist in the range ["beginKey", "endKey").
+   *
+   * Delete the database entry (if any) for "key". Returns OK on success, and a
+   * non-OK status on error. It is not an error if "key" did not exist in the
+   * database.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param beginKey First key to delete within database (included)
+   * @param endKey Last key to delete within database (excluded)
+   *
+   * @throws RocksDBException thrown if error happens in underlying native
+   *     library.
+   */
+  public void deleteRange(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpt, final byte[] beginKey, final byte[] endKey)
+      throws RocksDBException {
+    deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0,
+        beginKey.length, endKey, 0, endKey.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+
+  /**
+   * Add merge operand for key/value pair.
+   *
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for the
+   *     specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final byte[] key, final byte[] value)
+      throws RocksDBException {
+    merge(nativeHandle_, key, 0, key.length, value, 0, value.length);
+  }
+
+  /**
+   * Add merge operand for key/value pair.
+   *
+   * @param key the specified key to be merged.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param value the value to be merged with the current value for the
+   *     specified key.
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and must be non-negative and no larger than
+   *     ("value".length -  offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IndexOutOfBoundsException if an offset or length is out of bounds
+   */
+  public void merge(final byte[] key, int offset, int len, final byte[] value,
+      final int vOffset, final int vLen) throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    merge(nativeHandle_, key, offset, len, value, vOffset, vLen);
+  }
+
+  /**
+   * Add merge operand for key/value pair in a ColumnFamily.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for
+   * the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final byte[] value) throws RocksDBException {
+    merge(nativeHandle_, key, 0, key.length, value, 0, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Add merge operand for key/value pair in a ColumnFamily.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param key the specified key to be merged.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param value the value to be merged with the current value for
+   *     the specified key.
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     must be non-negative and no larger than ("value".length -  offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IndexOutOfBoundsException if an offset or length is out of bounds
+   */
+  public void merge(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final int offset, final int len, final byte[] value,
+      final int vOffset, final int vLen) throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    merge(nativeHandle_, key, offset, len, value, vOffset, vLen,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Add merge operand for key/value pair.
+   *
+   * @param writeOpts {@link WriteOptions} for this write.
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for
+   * the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final WriteOptions writeOpts, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    merge(nativeHandle_, writeOpts.nativeHandle_,
+        key, 0, key.length, value, 0, value.length);
+  }
+
+  /**
+   * Add merge operand for key/value pair.
+   *
+   * @param writeOpts {@link WriteOptions} for this write.
+   * @param key the specified key to be merged.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("value".length -  offset)
+   * @param value the value to be merged with the current value for
+   *     the specified key.
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and no larger than ("value".length -  offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IndexOutOfBoundsException if an offset or length is out of bounds
+   */
+  public void merge(final WriteOptions writeOpts,
+      final byte[] key,  final int offset, final int len,
+      final byte[] value, final int vOffset, final int vLen)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    merge(nativeHandle_, writeOpts.nativeHandle_,
+        key, offset, len, value, vOffset, vLen);
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param key Key to delete within database. It is using position and limit.
+   *     Supports direct buffer only.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final WriteOptions writeOpt, final ByteBuffer key) throws RocksDBException {
+    assert key.isDirect();
+    deleteDirect(nativeHandle_, writeOpt.nativeHandle_, key, key.position(), key.remaining(), 0);
+    key.position(key.limit());
+  }
+
+  /**
+   * Delete the database entry (if any) for "key".  Returns OK on
+   * success, and a non-OK status on error.  It is not an error if "key"
+   * did not exist in the database.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param key Key to delete within database. It is using position and limit.
+   *     Supports direct buffer only.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt,
+      final ByteBuffer key) throws RocksDBException {
+    assert key.isDirect();
+    deleteDirect(nativeHandle_, writeOpt.nativeHandle_, key, key.position(), key.remaining(),
+        columnFamilyHandle.nativeHandle_);
+    key.position(key.limit());
+  }
+
+  /**
+   * Add merge operand for key/value pair.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param writeOpts {@link WriteOptions} for this write.
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for the
+   *     specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpts, final byte[] key, final byte[] value)
+      throws RocksDBException {
+    merge(nativeHandle_, writeOpts.nativeHandle_,
+        key, 0, key.length, value, 0, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Add merge operand for key/value pair.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param writeOpts {@link WriteOptions} for this write.
+   * @param key the specified key to be merged.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param value the value to be merged with the current value for
+   *     the specified key.
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and no larger than ("value".length -  offset)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IndexOutOfBoundsException if an offset or length is out of bounds
+   */
+  public void merge(
+      final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts,
+      final byte[] key, final int offset, final int len,
+      final byte[] value, final int vOffset, final int vLen)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    merge(nativeHandle_, writeOpts.nativeHandle_,
+        key, offset, len, value, vOffset, vLen,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Apply the specified updates to the database.
+   *
+   * @param writeOpts WriteOptions instance
+   * @param updates WriteBatch instance
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void write(final WriteOptions writeOpts, final WriteBatch updates)
+      throws RocksDBException {
+    write0(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_);
+  }
+
+  /**
+   * Apply the specified updates to the database.
+   *
+   * @param writeOpts WriteOptions instance
+   * @param updates WriteBatchWithIndex instance
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void write(final WriteOptions writeOpts,
+      final WriteBatchWithIndex updates) throws RocksDBException {
+    write1(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_);
+  }
+
+  // TODO(AR) we should improve the #get() API, returning -1 (RocksDB.NOT_FOUND) is not very nice
+  // when we could communicate better status into, also the C++ code show that -2 could be returned
+
+  /**
+   * Get the value associated with the specified key within column family*
+   *
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   *
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final byte[] key, final byte[] value) throws RocksDBException {
+    return get(nativeHandle_, key, 0, key.length, value, 0, value.length);
+  }
+
+  /**
+   * Get the value associated with the specified key within column family*
+   *
+   * @param key the key to retrieve the value.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param value the out-value to receive the retrieved value.
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "value".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and and no larger than ("value".length -  offset)
+   *
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final byte[] key, final int offset, final int len,
+      final byte[] value, final int vOffset, final int vLen)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    return get(nativeHandle_, key, offset, len, value, vOffset, vLen);
+  }
+
+  /**
+   * Get the value associated with the specified key within column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+      final byte[] value) throws RocksDBException, IllegalArgumentException {
+    return get(nativeHandle_, key, 0, key.length, value, 0, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Get the value associated with the specified key within column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key to retrieve the value.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     an no larger than ("key".length -  offset)
+   * @param value the out-value to receive the retrieved value.
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and no larger than ("value".length -  offset)
+   *
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+      final int offset, final int len, final byte[] value, final int vOffset,
+      final int vLen) throws RocksDBException, IllegalArgumentException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    return get(nativeHandle_, key, offset, len, value, vOffset, vLen,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Get the value associated with the specified key.
+   *
+   * @param opt {@link org.rocksdb.ReadOptions} instance.
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ReadOptions opt, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_,
+               key, 0, key.length, value, 0, value.length);
+  }
+
+  /**
+   * Get the value associated with the specified key.
+   *
+   * @param opt {@link org.rocksdb.ReadOptions} instance.
+   * @param key the key to retrieve the value.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param value the out-value to receive the retrieved value.
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, must be
+   *     non-negative and no larger than ("value".length -  offset)
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ReadOptions opt, final byte[] key, final int offset,
+      final int len, final byte[] value, final int vOffset, final int vLen)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    return get(nativeHandle_, opt.nativeHandle_,
+        key, offset, len, value, vOffset, vLen);
+  }
+
+  /**
+   * Get the value associated with the specified key within column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param opt {@link org.rocksdb.ReadOptions} instance.
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions opt, final byte[] key, final byte[] value)
+      throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value,
+        0, value.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Get the value associated with the specified key within column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param opt {@link org.rocksdb.ReadOptions} instance.
+   * @param key the key to retrieve the value.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be
+   *     non-negative and and no larger than ("key".length -  offset)
+   * @param value the out-value to receive the retrieved value.
+   * @param vOffset the offset of the "value" array to be used, must be
+   *     non-negative and no longer than "key".length
+   * @param vLen the length of the "value" array to be used, and must be
+   *     non-negative and no larger than ("value".length -  offset)
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in byte.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value not
+   *     found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions opt, final byte[] key, final int offset, final int len,
+      final byte[] value, final int vOffset, final int vLen)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    checkBounds(vOffset, vLen, value.length);
+    return get(nativeHandle_, opt.nativeHandle_, key, offset, len, value,
+        vOffset, vLen, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key retrieve the value.
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final byte[] key) throws RocksDBException {
+    return get(nativeHandle_, key, 0, key.length);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key retrieve the value.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final byte[] key, final int offset,
+      final int len) throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    return get(nativeHandle_, key, offset, len);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key retrieve the value.
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key) throws RocksDBException {
+    return get(nativeHandle_, key, 0, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key retrieve the value.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final int offset, final int len)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    return get(nativeHandle_, key, offset, len,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key retrieve the value.
+   * @param opt Read options.
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final ReadOptions opt, final byte[] key)
+      throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key retrieve the value.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param opt Read options.
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final ReadOptions opt, final byte[] key, final int offset,
+      final int len) throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    return get(nativeHandle_, opt.nativeHandle_, key, offset, len);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key retrieve the value.
+   * @param opt Read options.
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions opt, final byte[] key) throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key retrieve the value.
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than ("key".length -  offset)
+   * @param opt Read options.
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions opt, final byte[] key, final int offset, final int len)
+      throws RocksDBException {
+    checkBounds(offset, len, key.length);
+    return get(nativeHandle_, opt.nativeHandle_, key, offset, len,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Takes a list of keys, and returns a list of values for the given list of
+   * keys. List will contain null for keys which could not be found.
+   *
+   * @param keys List of keys for which values need to be retrieved.
+   * @return List of values for the given list of keys. List will contain
+   * null for keys which could not be found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<byte[]> multiGetAsList(final List<byte[]> keys)
+      throws RocksDBException {
+    assert(keys.size() != 0);
+
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+    final int[] keyOffsets = new int[keysArray.length];
+    final int[] keyLengths = new int[keysArray.length];
+    for(int i = 0; i < keyLengths.length; i++) {
+      keyLengths[i] = keysArray[i].length;
+    }
+
+    return Arrays.asList(multiGet(nativeHandle_, keysArray, keyOffsets,
+        keyLengths));
+  }
+
+  /**
+   * Returns a list of values for the given list of keys. List will contain
+   * null for keys which could not be found.
+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>
+   *
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys List of keys for which values need to be retrieved.
+   * @return List of values for the given list of keys. List will contain
+   * null for keys which could not be found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IllegalArgumentException thrown if the size of passed keys is not
+   *    equal to the amount of passed column family handles.
+   */
+  public List<byte[]> multiGetAsList(
+      final List<ColumnFamilyHandle> columnFamilyHandleList,
+      final List<byte[]> keys) throws RocksDBException,
+      IllegalArgumentException {
+    assert(keys.size() != 0);
+    // Check if key size equals cfList size. If not a exception must be
+    // thrown. If not a Segmentation fault happens.
+    if (keys.size() != columnFamilyHandleList.size()) {
+        throw new IllegalArgumentException(
+            "For each key there must be a ColumnFamilyHandle.");
+    }
+    final long[] cfHandles = new long[columnFamilyHandleList.size()];
+    for (int i = 0; i < columnFamilyHandleList.size(); i++) {
+      cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_;
+    }
+
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+    final int[] keyOffsets = new int[keysArray.length];
+    final int[] keyLengths = new int[keysArray.length];
+    for(int i = 0; i < keyLengths.length; i++) {
+      keyLengths[i] = keysArray[i].length;
+    }
+
+    return Arrays.asList(multiGet(nativeHandle_, keysArray, keyOffsets,
+        keyLengths, cfHandles));
+  }
+
+  /**
+   * Returns a list of values for the given list of keys. List will contain
+   * null for keys which could not be found.
+   *
+   * @param opt Read options.
+   * @param keys of keys for which values need to be retrieved.
+   * @return List of values for the given list of keys. List will contain
+   * null for keys which could not be found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<byte[]> multiGetAsList(final ReadOptions opt,
+      final List<byte[]> keys) throws RocksDBException {
+    assert(keys.size() != 0);
+
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+    final int[] keyOffsets = new int[keysArray.length];
+    final int[] keyLengths = new int[keysArray.length];
+    for(int i = 0; i < keyLengths.length; i++) {
+      keyLengths[i] = keysArray[i].length;
+    }
+
+    return Arrays.asList(multiGet(nativeHandle_, opt.nativeHandle_,
+        keysArray, keyOffsets, keyLengths));
+  }
+
+  /**
+   * Returns a list of values for the given list of keys. List will contain
+   * null for keys which could not be found.
+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>
+   *
+   * @param opt Read options.
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys of keys for which values need to be retrieved.
+   * @return List of values for the given list of keys. List will contain
+   * null for keys which could not be found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IllegalArgumentException thrown if the size of passed keys is not
+   *    equal to the amount of passed column family handles.
+   */
+  public List<byte[]> multiGetAsList(final ReadOptions opt,
+      final List<ColumnFamilyHandle> columnFamilyHandleList,
+      final List<byte[]> keys) throws RocksDBException {
+    assert(keys.size() != 0);
+    // Check if key size equals cfList size. If not a exception must be
+    // thrown. If not a Segmentation fault happens.
+    if (keys.size()!=columnFamilyHandleList.size()){
+      throw new IllegalArgumentException(
+          "For each key there must be a ColumnFamilyHandle.");
+    }
+    final long[] cfHandles = new long[columnFamilyHandleList.size()];
+    for (int i = 0; i < columnFamilyHandleList.size(); i++) {
+      cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_;
+    }
+
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+    final int[] keyOffsets = new int[keysArray.length];
+    final int[] keyLengths = new int[keysArray.length];
+    for(int i = 0; i < keyLengths.length; i++) {
+      keyLengths[i] = keysArray[i].length;
+    }
+
+    return Arrays.asList(multiGet(nativeHandle_, opt.nativeHandle_,
+        keysArray, keyOffsets, keyLengths, cfHandles));
+  }
+
+  /**
+   * Fetches a list of values for the given list of keys, all from the default column family.
+   *
+   * @param keys list of keys for which values need to be retrieved.
+   * @param values list of buffers to return retrieved values in
+   * @return list of number of bytes in DB for each requested key
+   * this can be more than the size of the corresponding buffer; then the buffer will be filled
+   * with the appropriate truncation of the database value.
+   * @throws RocksDBException if error happens in underlying native library.
+   * @throws IllegalArgumentException thrown if the number of passed keys and passed values
+   * do not match.
+   */
+  public List<ByteBufferGetStatus> multiGetByteBuffers(
+      final List<ByteBuffer> keys, final List<ByteBuffer> values) throws RocksDBException {
+    final ReadOptions readOptions = new ReadOptions();
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>(1);
+    columnFamilyHandleList.add(getDefaultColumnFamily());
+    return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values);
+  }
+
+  /**
+   * Fetches a list of values for the given list of keys, all from the default column family.
+   *
+   * @param readOptions Read options
+   * @param keys list of keys for which values need to be retrieved.
+   * @param values list of buffers to return retrieved values in
+   * @throws RocksDBException if error happens in underlying native library.
+   * @throws IllegalArgumentException thrown if the number of passed keys and passed values
+   * do not match.
+   * @return the list of values for the given list of keys
+   */
+  public List<ByteBufferGetStatus> multiGetByteBuffers(final ReadOptions readOptions,
+      final List<ByteBuffer> keys, final List<ByteBuffer> values) throws RocksDBException {
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>(1);
+    columnFamilyHandleList.add(getDefaultColumnFamily());
+    return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values);
+  }
+
+  /**
+   * Fetches a list of values for the given list of keys.
+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>
+   *
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   * {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys list of keys for which values need to be retrieved.
+   * @param values list of buffers to return retrieved values in
+   * @throws RocksDBException if error happens in underlying native library.
+   * @throws IllegalArgumentException thrown if the number of passed keys, passed values and
+   * passed column family handles do not match.
+   * @return the list of values for the given list of keys
+   */
+  public List<ByteBufferGetStatus> multiGetByteBuffers(
+      final List<ColumnFamilyHandle> columnFamilyHandleList, final List<ByteBuffer> keys,
+      final List<ByteBuffer> values) throws RocksDBException {
+    final ReadOptions readOptions = new ReadOptions();
+    return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values);
+  }
+
+  /**
+   * Fetches a list of values for the given list of keys.
+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>
+   *
+   * @param readOptions Read options
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   * {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys list of keys for which values need to be retrieved.
+   * @param values list of buffers to return retrieved values in
+   * @throws RocksDBException if error happens in underlying native library.
+   * @throws IllegalArgumentException thrown if the number of passed keys, passed values and
+   * passed column family handles do not match.
+   * @return the list of values for the given list of keys
+   */
+  public List<ByteBufferGetStatus> multiGetByteBuffers(final ReadOptions readOptions,
+      final List<ColumnFamilyHandle> columnFamilyHandleList, final List<ByteBuffer> keys,
+      final List<ByteBuffer> values) throws RocksDBException {
+    assert (keys.size() != 0);
+
+    // Check if key size equals cfList size. If not a exception must be
+    // thrown. If not a Segmentation fault happens.
+    if (keys.size() != columnFamilyHandleList.size() && columnFamilyHandleList.size() > 1) {
+      throw new IllegalArgumentException(
+          "Wrong number of ColumnFamilyHandle(s) supplied. Provide 0, 1, or as many as there are key/value(s)");
+    }
+
+    // Check if key size equals cfList size. If not a exception must be
+    // thrown. If not a Segmentation fault happens.
+    if (values.size() != keys.size()) {
+      throw new IllegalArgumentException("For each key there must be a corresponding value.");
+    }
+
+    // TODO (AP) support indirect buffers
+    for (final ByteBuffer key : keys) {
+      if (!key.isDirect()) {
+        throw new IllegalArgumentException("All key buffers must be direct byte buffers");
+      }
+    }
+
+    // TODO (AP) support indirect buffers, though probably via a less efficient code path
+    for (final ByteBuffer value : values) {
+      if (!value.isDirect()) {
+        throw new IllegalArgumentException("All value buffers must be direct byte buffers");
+      }
+    }
+
+    final int numCFHandles = columnFamilyHandleList.size();
+    final long[] cfHandles = new long[numCFHandles];
+    for (int i = 0; i < numCFHandles; i++) {
+      cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_;
+    }
+
+    final int numValues = keys.size();
+
+    final ByteBuffer[] keysArray = keys.toArray(new ByteBuffer[0]);
+    final int[] keyOffsets = new int[numValues];
+    final int[] keyLengths = new int[numValues];
+    for (int i = 0; i < numValues; i++) {
+      // TODO (AP) add keysArray[i].arrayOffset() if the buffer is indirect
+      // TODO (AP) because in that case we have to pass the array directly,
+      // so that the JNI C++ code will not know to compensate for the array offset
+      keyOffsets[i] = keysArray[i].position();
+      keyLengths[i] = keysArray[i].limit();
+    }
+    final ByteBuffer[] valuesArray = values.toArray(new ByteBuffer[0]);
+    final int[] valuesSizeArray = new int[numValues];
+    final Status[] statusArray = new Status[numValues];
+
+    multiGet(nativeHandle_, readOptions.nativeHandle_, cfHandles, keysArray, keyOffsets, keyLengths,
+        valuesArray, valuesSizeArray, statusArray);
+
+    final List<ByteBufferGetStatus> results = new ArrayList<>();
+    for (int i = 0; i < numValues; i++) {
+      final Status status = statusArray[i];
+      if (status.getCode() == Status.Code.Ok) {
+        final ByteBuffer value = valuesArray[i];
+        value.position(Math.min(valuesSizeArray[i], value.capacity()));
+        value.flip(); // prepare for read out
+        results.add(new ByteBufferGetStatus(status, valuesSizeArray[i], value));
+      } else {
+        results.add(new ByteBufferGetStatus(status));
+      }
+    }
+
+    return results;
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a false negative.
+   *
+   * If the caller wants to obtain value when the key
+   * is found in memory, then {@code valueHolder} must be set.
+   *
+   * This check is potentially lighter-weight than invoking
+   * {@link #get(byte[])}. One way to make this lighter weight is to avoid
+   * doing any IOs.
+   *
+   * @param key byte array of a key to search for
+   * @param valueHolder non-null to retrieve the value if it is found, or null
+   *     if the value is not needed. If non-null, upon return of the function,
+   *     the {@code value} will be set if it could be retrieved.
+   *
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(final byte[] key,
+      /* @Nullable */ final Holder<byte[]> valueHolder) {
+    return keyMayExist(key, 0, key.length, valueHolder);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a false negative.
+   *
+   * If the caller wants to obtain value when the key
+   * is found in memory, then {@code valueHolder} must be set.
+   *
+   * This check is potentially lighter-weight than invoking
+   * {@link #get(byte[], int, int)}. One way to make this lighter weight is to
+   * avoid doing any IOs.
+   *
+   * @param key byte array of a key to search for
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than "key".length
+   * @param valueHolder non-null to retrieve the value if it is found, or null
+   *     if the value is not needed. If non-null, upon return of the function,
+   *     the {@code value} will be set if it could be retrieved.
+   *
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(final byte[] key,
+      final int offset, final int len,
+      /* @Nullable */ final Holder<byte[]> valueHolder) {
+    return keyMayExist((ColumnFamilyHandle)null, key, offset, len, valueHolder);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a false negative.
+   *
+   * If the caller wants to obtain value when the key
+   * is found in memory, then {@code valueHolder} must be set.
+   *
+   * This check is potentially lighter-weight than invoking
+   * {@link #get(ColumnFamilyHandle,byte[])}. One way to make this lighter
+   * weight is to avoid doing any IOs.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param key byte array of a key to search for
+   * @param valueHolder non-null to retrieve the value if it is found, or null
+   *     if the value is not needed. If non-null, upon return of the function,
+   *     the {@code value} will be set if it could be retrieved.
+   *
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(
+      final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+      /* @Nullable */ final Holder<byte[]> valueHolder) {
+    return keyMayExist(columnFamilyHandle, key, 0, key.length,
+        valueHolder);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a false negative.
+   *
+   * If the caller wants to obtain value when the key
+   * is found in memory, then {@code valueHolder} must be set.
+   *
+   * This check is potentially lighter-weight than invoking
+   * {@link #get(ColumnFamilyHandle, byte[], int, int)}. One way to make this
+   * lighter weight is to avoid doing any IOs.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param key byte array of a key to search for
+   * @param offset the offset of the "key" array to be used, must be
+   *    non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *    and no larger than "key".length
+   * @param valueHolder non-null to retrieve the value if it is found, or null
+   *     if the value is not needed. If non-null, upon return of the function,
+   *     the {@code value} will be set if it could be retrieved.
+   *
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(
+      final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, int offset, int len,
+      /* @Nullable */ final Holder<byte[]> valueHolder) {
+    return keyMayExist(columnFamilyHandle, null, key, offset, len,
+        valueHolder);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a true negative.
+   *
+   * If the caller wants to obtain value when the key
+   * is found in memory, then {@code valueHolder} must be set.
+   *
+   * This check is potentially lighter-weight than invoking
+   * {@link #get(ReadOptions, byte[])}. One way to make this
+   * lighter weight is to avoid doing any IOs.
+   *
+   * @param readOptions {@link ReadOptions} instance
+   * @param key byte array of a key to search for
+   * @param valueHolder non-null to retrieve the value if it is found, or null
+   *     if the value is not needed. If non-null, upon return of the function,
+   *     the {@code value} will be set if it could be retrieved.
+   *
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(
+      final ReadOptions readOptions, final byte[] key,
+      /* @Nullable */ final Holder<byte[]> valueHolder) {
+    return keyMayExist(readOptions, key, 0, key.length,
+        valueHolder);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a true negative.
+   *
+   * If the caller wants to obtain value when the key
+   * is found in memory, then {@code valueHolder} must be set.
+   *
+   * This check is potentially lighter-weight than invoking
+   * {@link #get(ReadOptions, byte[], int, int)}. One way to make this
+   * lighter weight is to avoid doing any IOs.
+   *
+   * @param readOptions {@link ReadOptions} instance
+   * @param key byte array of a key to search for
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than "key".length
+   * @param valueHolder non-null to retrieve the value if it is found, or null
+   *     if the value is not needed. If non-null, upon return of the function,
+   *     the {@code value} will be set if it could be retrieved.
+   *
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(
+      final ReadOptions readOptions,
+      final byte[] key, final int offset, final int len,
+      /* @Nullable */ final Holder<byte[]> valueHolder) {
+    return keyMayExist(null, readOptions,
+        key, offset, len, valueHolder);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a true negative.
+   *
+   * If the caller wants to obtain value when the key
+   * is found in memory, then {@code valueHolder} must be set.
+   *
+   * This check is potentially lighter-weight than invoking
+   * {@link #get(ColumnFamilyHandle, ReadOptions, byte[])}. One way to make this
+   * lighter weight is to avoid doing any IOs.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param readOptions {@link ReadOptions} instance
+   * @param key byte array of a key to search for
+   * @param valueHolder non-null to retrieve the value if it is found, or null
+   *     if the value is not needed. If non-null, upon return of the function,
+   *     the {@code value} will be set if it could be retrieved.
+   *
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(
+      final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions readOptions, final byte[] key,
+      /* @Nullable */ final Holder<byte[]> valueHolder) {
+    return keyMayExist(columnFamilyHandle, readOptions,
+        key, 0, key.length, valueHolder);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a false negative.
+   *
+   * If the caller wants to obtain value when the key
+   * is found in memory, then {@code valueHolder} must be set.
+   *
+   * This check is potentially lighter-weight than invoking
+   * {@link #get(ColumnFamilyHandle, ReadOptions, byte[], int, int)}.
+   * One way to make this lighter weight is to avoid doing any IOs.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param readOptions {@link ReadOptions} instance
+   * @param key byte array of a key to search for
+   * @param offset the offset of the "key" array to be used, must be
+   *     non-negative and no larger than "key".length
+   * @param len the length of the "key" array to be used, must be non-negative
+   *     and no larger than "key".length
+   * @param valueHolder non-null to retrieve the value if it is found, or null
+   *     if the value is not needed. If non-null, upon return of the function,
+   *     the {@code value} will be set if it could be retrieved.
+   *
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(
+      final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions readOptions,
+      final byte[] key, final int offset, final int len,
+      /* @Nullable */ final Holder<byte[]> valueHolder) {
+    checkBounds(offset, len, key.length);
+    if (valueHolder == null) {
+      return keyMayExist(nativeHandle_,
+          columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+          readOptions == null ? 0 : readOptions.nativeHandle_,
+          key, offset, len);
+    } else {
+      final byte[][] result = keyMayExistFoundValue(
+          nativeHandle_,
+          columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+          readOptions == null ? 0 : readOptions.nativeHandle_,
+          key, offset, len);
+      if (result[0][0] == 0x0) {
+        valueHolder.setValue(null);
+        return false;
+      } else if (result[0][0] == 0x1) {
+        valueHolder.setValue(null);
+        return true;
+      } else {
+        valueHolder.setValue(result[1]);
+        return true;
+      }
+    }
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a false negative.
+   *
+   * @param key bytebuffer containing the value of the key
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(final ByteBuffer key) {
+    return keyMayExist(null, (ReadOptions) null, key);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a false negative.
+   *
+   * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in
+   * @param key bytebuffer containing the value of the key
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) {
+    return keyMayExist(columnFamilyHandle, (ReadOptions) null, key);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a false negative.
+   *
+   * @param readOptions the {@link ReadOptions} to use when reading the key/value
+   * @param key bytebuffer containing the value of the key
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(final ReadOptions readOptions, final ByteBuffer key) {
+    return keyMayExist(null, readOptions, key);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist},
+   * otherwise if it can with best effort retreive the value, it returns {@link
+   * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link
+   * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might
+   * exist is at the discretion of the implementation; the only guarantee is that {@link
+   * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist.
+   *
+   * @param key bytebuffer containing the value of the key
+   * @param value bytebuffer which will receive a value if the key exists and a value is known
+   * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided
+   */
+  public KeyMayExist keyMayExist(final ByteBuffer key, final ByteBuffer value) {
+    return keyMayExist(null, null, key, value);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist},
+   * otherwise if it can with best effort retreive the value, it returns {@link
+   * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link
+   * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might
+   * exist is at the discretion of the implementation; the only guarantee is that {@link
+   * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist.
+   *
+   * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in
+   * @param key bytebuffer containing the value of the key
+   * @param value bytebuffer which will receive a value if the key exists and a value is known
+   * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided
+   */
+  public KeyMayExist keyMayExist(
+      final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, final ByteBuffer value) {
+    return keyMayExist(columnFamilyHandle, null, key, value);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist},
+   * otherwise if it can with best effort retreive the value, it returns {@link
+   * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link
+   * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might
+   * exist is at the discretion of the implementation; the only guarantee is that {@link
+   * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist.
+   *
+   * @param readOptions the {@link ReadOptions} to use when reading the key/value
+   * @param key bytebuffer containing the value of the key
+   * @param value bytebuffer which will receive a value if the key exists and a value is known
+   * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided
+   */
+  public KeyMayExist keyMayExist(
+      final ReadOptions readOptions, final ByteBuffer key, final ByteBuffer value) {
+    return keyMayExist(null, readOptions, key, value);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, otherwise it returns true if the key might exist.
+   * That is to say that this method is probabilistic and may return false
+   * positives, but never a false negative.
+   *
+   * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in
+   * @param readOptions the {@link ReadOptions} to use when reading the key/value
+   * @param key bytebuffer containing the value of the key
+   * @return false if the key definitely does not exist in the database,
+   *     otherwise true.
+   */
+  public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions readOptions, final ByteBuffer key) {
+    assert key != null : "key ByteBuffer parameter cannot be null";
+    assert key.isDirect() : "key parameter must be a direct ByteBuffer";
+    return keyMayExistDirect(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        readOptions == null ? 0 : readOptions.nativeHandle_, key, key.position(), key.limit());
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist},
+   * otherwise if it can with best effort retreive the value, it returns {@link
+   * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link
+   * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might
+   * exist is at the discretion of the implementation; the only guarantee is that {@link
+   * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist.
+   *
+   * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in
+   * @param readOptions the {@link ReadOptions} to use when reading the key/value
+   * @param key bytebuffer containing the value of the key
+   * @param value bytebuffer which will receive a value if the key exists and a value is known
+   * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided
+   */
+  public KeyMayExist keyMayExist(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions readOptions, final ByteBuffer key, final ByteBuffer value) {
+    assert key != null : "key ByteBuffer parameter cannot be null";
+    assert key.isDirect() : "key parameter must be a direct ByteBuffer";
+    assert value
+        != null
+        : "value ByteBuffer parameter cannot be null. If you do not need the value, use a different version of the method";
+    assert value.isDirect() : "value parameter must be a direct ByteBuffer";
+
+    final int[] result = keyMayExistDirectFoundValue(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        readOptions == null ? 0 : readOptions.nativeHandle_, key, key.position(), key.remaining(),
+        value, value.position(), value.remaining());
+    final int valueLength = result[1];
+    value.limit(value.position() + Math.min(valueLength, value.remaining()));
+    return new KeyMayExist(KeyMayExist.KeyMayExistEnum.values()[result[0]], valueLength);
+  }
+
+  /**
+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
+   *
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator() {
+    return new RocksIterator(this, iterator(nativeHandle_));
+  }
+
+  /**
+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
+   *
+   * @param readOptions {@link ReadOptions} instance.
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator(final ReadOptions readOptions) {
+    return new RocksIterator(this, iterator(nativeHandle_,
+        readOptions.nativeHandle_));
+  }
+
+  /**
+   * <p>Return a heap-allocated iterator over the contents of a
+   * ColumnFamily. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator(
+      final ColumnFamilyHandle columnFamilyHandle) {
+    return new RocksIterator(this, iteratorCF(nativeHandle_,
+        columnFamilyHandle.nativeHandle_));
+  }
+
+  /**
+   * <p>Return a heap-allocated iterator over the contents of a
+   * ColumnFamily. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param readOptions {@link ReadOptions} instance.
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions readOptions) {
+    return new RocksIterator(this, iteratorCF(nativeHandle_,
+        columnFamilyHandle.nativeHandle_, readOptions.nativeHandle_));
+  }
+
+  /**
+   * Returns iterators from a consistent database state across multiple
+   * column families. Iterators are heap allocated and need to be deleted
+   * before the db is deleted
+   *
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator}
+   *     instances
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<RocksIterator> newIterators(
+      final List<ColumnFamilyHandle> columnFamilyHandleList)
+      throws RocksDBException {
+    return newIterators(columnFamilyHandleList, new ReadOptions());
+  }
+
+  /**
+   * Returns iterators from a consistent database state across multiple
+   * column families. Iterators are heap allocated and need to be deleted
+   * before the db is deleted
+   *
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param readOptions {@link ReadOptions} instance.
+   * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator}
+   *     instances
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<RocksIterator> newIterators(
+      final List<ColumnFamilyHandle> columnFamilyHandleList,
+      final ReadOptions readOptions) throws RocksDBException {
+
+    final long[] columnFamilyHandles = new long[columnFamilyHandleList.size()];
+    for (int i = 0; i < columnFamilyHandleList.size(); i++) {
+      columnFamilyHandles[i] = columnFamilyHandleList.get(i).nativeHandle_;
+    }
+
+    final long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandles,
+        readOptions.nativeHandle_);
+
+    final List<RocksIterator> iterators = new ArrayList<>(
+        columnFamilyHandleList.size());
+    for (int i=0; i<columnFamilyHandleList.size(); i++){
+      iterators.add(new RocksIterator(this, iteratorRefs[i]));
+    }
+    return iterators;
+  }
+
+
+  /**
+   * <p>Return a handle to the current DB state. Iterators created with
+   * this handle will all observe a stable snapshot of the current DB
+   * state. The caller must call ReleaseSnapshot(result) when the
+   * snapshot is no longer needed.</p>
+   *
+   * <p>nullptr will be returned if the DB fails to take a snapshot or does
+   * not support snapshot.</p>
+   *
+   * @return Snapshot {@link Snapshot} instance
+   */
+  public Snapshot getSnapshot() {
+    long snapshotHandle = getSnapshot(nativeHandle_);
+    if (snapshotHandle != 0) {
+      return new Snapshot(snapshotHandle);
+    }
+    return null;
+  }
+
+  /**
+   * Release a previously acquired snapshot.
+   *
+   * The caller must not use "snapshot" after this call.
+   *
+   * @param snapshot {@link Snapshot} instance
+   */
+  public void releaseSnapshot(final Snapshot snapshot) {
+    if (snapshot != null) {
+      releaseSnapshot(nativeHandle_, snapshot.nativeHandle_);
+    }
+  }
+
+  /**
+   * DB implements can export properties about their state
+   * via this method on a per column family level.
+   *
+   * <p>If {@code property} is a valid property understood by this DB
+   * implementation, fills {@code value} with its current value and
+   * returns true. Otherwise returns false.</p>
+   *
+   * <p>Valid property names include:
+   * <ul>
+   * <li>"rocksdb.num-files-at-level&lt;N&gt;" - return the number of files at
+   * level &lt;N&gt;, where &lt;N&gt; is an ASCII representation of a level
+   * number (e.g. "0").</li>
+   * <li>"rocksdb.stats" - returns a multi-line string that describes statistics
+   *     about the internal operation of the DB.</li>
+   * <li>"rocksdb.sstables" - returns a multi-line string that describes all
+   *    of the sstables that make up the db contents.</li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance, or null for the default column family.
+   * @param property to be fetched. See above for examples
+   * @return property value
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public String getProperty(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle,
+      final String property) throws RocksDBException {
+    return getProperty(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        property, property.length());
+  }
+
+  /**
+   * DB implementations can export properties about their state
+   * via this method.  If "property" is a valid property understood by this
+   * DB implementation, fills "*value" with its current value and returns
+   * true.  Otherwise returns false.
+   *
+   * <p>Valid property names include:
+   * <ul>
+   * <li>"rocksdb.num-files-at-level&lt;N&gt;" - return the number of files at
+   * level &lt;N&gt;, where &lt;N&gt; is an ASCII representation of a level
+   * number (e.g. "0").</li>
+   * <li>"rocksdb.stats" - returns a multi-line string that describes statistics
+   *     about the internal operation of the DB.</li>
+   * <li>"rocksdb.sstables" - returns a multi-line string that describes all
+   *    of the sstables that make up the db contents.</li>
+   *</ul>
+   *
+   * @param property to be fetched. See above for examples
+   * @return property value
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public String getProperty(final String property) throws RocksDBException {
+    return getProperty(null, property);
+  }
+
+
+  /**
+   * Gets a property map.
+   *
+   * @param property to be fetched.
+   *
+   * @return the property map
+   *
+   * @throws RocksDBException if an error happens in the underlying native code.
+   */
+  public Map<String, String> getMapProperty(final String property)
+      throws RocksDBException {
+    return getMapProperty(null, property);
+  }
+
+  /**
+   * Gets a property map.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance, or null for the default column family.
+   * @param property to be fetched.
+   *
+   * @return the property map
+   *
+   * @throws RocksDBException if an error happens in the underlying native code.
+   */
+  public Map<String, String> getMapProperty(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle,
+                      final String property) throws RocksDBException {
+    return getMapProperty(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        property, property.length());
+  }
+
+  /**
+   * <p> Similar to GetProperty(), but only works for a subset of properties
+   * whose return value is a numerical value. Return the value as long.</p>
+   *
+   * <p><strong>Note</strong>: As the returned property is of type
+   * {@code uint64_t} on C++ side the returning value can be negative
+   * because Java supports in Java 7 only signed long values.</p>
+   *
+   * <p><strong>Java 7</strong>: To mitigate the problem of the non
+   * existent unsigned long tpye, values should be encapsulated using
+   * {@link java.math.BigInteger} to reflect the correct value. The correct
+   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
+   *
+   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
+   * unsigned long using provided methods of type {@link Long}.</p>
+   *
+   * @param property to be fetched.
+   *
+   * @return numerical property value.
+   *
+   * @throws RocksDBException if an error happens in the underlying native code.
+   */
+  public long getLongProperty(final String property) throws RocksDBException {
+    return getLongProperty(null, property);
+  }
+
+  /**
+   * <p> Similar to GetProperty(), but only works for a subset of properties
+   * whose return value is a numerical value. Return the value as long.</p>
+   *
+   * <p><strong>Note</strong>: As the returned property is of type
+   * {@code uint64_t} on C++ side the returning value can be negative
+   * because Java supports in Java 7 only signed long values.</p>
+   *
+   * <p><strong>Java 7</strong>: To mitigate the problem of the non
+   * existent unsigned long tpye, values should be encapsulated using
+   * {@link java.math.BigInteger} to reflect the correct value. The correct
+   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
+   *
+   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
+   * unsigned long using provided methods of type {@link Long}.</p>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance, or null for the default column family
+   * @param property to be fetched.
+   *
+   * @return numerical property value
+   *
+   * @throws RocksDBException if an error happens in the underlying native code.
+   */
+  public long getLongProperty(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle,
+      final String property) throws RocksDBException {
+    return getLongProperty(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        property, property.length());
+  }
+
+  /**
+   * Reset internal stats for DB and all column families.
+   *
+   * Note this doesn't reset {@link Options#statistics()} as it is not
+   * owned by DB.
+   *
+   * @throws RocksDBException if an error occurs whilst reseting the stats
+   */
+  public void resetStats() throws RocksDBException {
+    resetStats(nativeHandle_);
+  }
+
+  /**
+   * <p> Return sum of the getLongProperty of all the column families</p>
+   *
+   * <p><strong>Note</strong>: As the returned property is of type
+   * {@code uint64_t} on C++ side the returning value can be negative
+   * because Java supports in Java 7 only signed long values.</p>
+   *
+   * <p><strong>Java 7</strong>: To mitigate the problem of the non
+   * existent unsigned long tpye, values should be encapsulated using
+   * {@link java.math.BigInteger} to reflect the correct value. The correct
+   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
+   *
+   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
+   * unsigned long using provided methods of type {@link Long}.</p>
+   *
+   * @param property to be fetched.
+   *
+   * @return numerical property value
+   *
+   * @throws RocksDBException if an error happens in the underlying native code.
+   */
+  public long getAggregatedLongProperty(final String property)
+      throws RocksDBException {
+    return getAggregatedLongProperty(nativeHandle_, property,
+        property.length());
+  }
+
+  /**
+   * Get the approximate file system space used by keys in each range.
+   *
+   * Note that the returned sizes measure file system space usage, so
+   * if the user data compresses by a factor of ten, the returned
+   * sizes will be one-tenth the size of the corresponding user data size.
+   *
+   * If {@code sizeApproximationFlags} defines whether the returned size
+   * should include the recently written data in the mem-tables (if
+   * the mem-table type supports it), data serialized to disk, or both.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance, or null for the default column family
+   * @param ranges the ranges over which to approximate sizes
+   * @param sizeApproximationFlags flags to determine what to include in the
+   *     approximation.
+   *
+   * @return the sizes
+   */
+  public long[] getApproximateSizes(
+      /*@Nullable*/ final ColumnFamilyHandle columnFamilyHandle,
+      final List<Range> ranges,
+      final SizeApproximationFlag... sizeApproximationFlags) {
+
+    byte flags = 0x0;
+    for (final SizeApproximationFlag sizeApproximationFlag
+        : sizeApproximationFlags) {
+      flags |= sizeApproximationFlag.getValue();
+    }
+
+    return getApproximateSizes(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        toRangeSliceHandles(ranges), flags);
+  }
+
+  /**
+   * Get the approximate file system space used by keys in each range for
+   * the default column family.
+   *
+   * Note that the returned sizes measure file system space usage, so
+   * if the user data compresses by a factor of ten, the returned
+   * sizes will be one-tenth the size of the corresponding user data size.
+   *
+   * If {@code sizeApproximationFlags} defines whether the returned size
+   * should include the recently written data in the mem-tables (if
+   * the mem-table type supports it), data serialized to disk, or both.
+   *
+   * @param ranges the ranges over which to approximate sizes
+   * @param sizeApproximationFlags flags to determine what to include in the
+   *     approximation.
+   *
+   * @return the sizes.
+   */
+  public long[] getApproximateSizes(final List<Range> ranges,
+      final SizeApproximationFlag... sizeApproximationFlags) {
+    return getApproximateSizes(null, ranges, sizeApproximationFlags);
+  }
+
+  public static class CountAndSize {
+    public final long count;
+    public final long size;
+
+    public CountAndSize(final long count, final long size) {
+      this.count = count;
+      this.size = size;
+    }
+  }
+
+  /**
+   * This method is similar to
+   * {@link #getApproximateSizes(ColumnFamilyHandle, List, SizeApproximationFlag...)},
+   * except that it returns approximate number of records and size in memtables.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance, or null for the default column family
+   * @param range the ranges over which to get the memtable stats
+   *
+   * @return the count and size for the range
+   */
+  public CountAndSize getApproximateMemTableStats(
+      /*@Nullable*/ final ColumnFamilyHandle columnFamilyHandle,
+      final Range range) {
+    final long[] result = getApproximateMemTableStats(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        range.start.getNativeHandle(),
+        range.limit.getNativeHandle());
+    return new CountAndSize(result[0], result[1]);
+  }
+
+  /**
+   * This method is similar to
+   * {@link #getApproximateSizes(ColumnFamilyHandle, List, SizeApproximationFlag...)},
+   * except that it returns approximate number of records and size in memtables.
+   *
+   * @param range the ranges over which to get the memtable stats
+   *
+   * @return the count and size for the range
+   */
+  public CountAndSize getApproximateMemTableStats(
+    final Range range) {
+    return getApproximateMemTableStats(null, range);
+  }
+
+  /**
+   * <p>Range compaction of database.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange(byte[], byte[])}</li>
+   * </ul>
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange() throws RocksDBException {
+    compactRange(null);
+  }
+
+  /**
+   * <p>Range compaction of column family.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * </li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance, or null for the default column family.
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle)
+      throws RocksDBException {
+    compactRange(nativeHandle_, null, -1, null, -1, 0,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * <p>Range compaction of database.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange()}</li>
+   * </ul>
+   *
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(final byte[] begin, final byte[] end)
+      throws RocksDBException {
+    compactRange(null, begin, end);
+  }
+
+  /**
+   * <p>Range compaction of column family.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance, or null for the default column family.
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] begin, final byte[] end) throws RocksDBException {
+    compactRange(nativeHandle_,
+        begin, begin == null ? -1 : begin.length,
+        end, end == null ? -1 : end.length,
+        0, columnFamilyHandle == null ? 0: columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * <p>Range compaction of column family.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance.
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   * @param compactRangeOptions options for the compaction
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] begin, final byte[] end,
+      final CompactRangeOptions compactRangeOptions) throws RocksDBException {
+    compactRange(nativeHandle_,
+        begin, begin == null ? -1 : begin.length,
+        end, end == null ? -1 : end.length,
+        compactRangeOptions.nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Change the options for the column family handle.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance, or null for the default column family.
+   * @param mutableColumnFamilyOptions the options.
+   *
+   * @throws RocksDBException if an error occurs whilst setting the options
+   */
+  public void setOptions(
+      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle,
+      final MutableColumnFamilyOptions mutableColumnFamilyOptions)
+      throws RocksDBException {
+    setOptions(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        mutableColumnFamilyOptions.getKeys(), mutableColumnFamilyOptions.getValues());
+  }
+
+  /**
+   * Get the options for the column family handle
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance, or null for the default column family.
+   *
+   * @return the options parsed from the options string return by RocksDB
+   *
+   * @throws RocksDBException if an error occurs while getting the options string, or parsing the
+   *     resulting options string into options
+   */
+  public MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder getOptions(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException {
+    String optionsString = getOptions(
+        nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+    return MutableColumnFamilyOptions.parse(optionsString, true);
+  }
+
+  /**
+   * Default column family options
+   *
+   * @return the options parsed from the options string return by RocksDB
+   *
+   * @throws RocksDBException if an error occurs while getting the options string, or parsing the
+   *     resulting options string into options
+   */
+  public MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder getOptions()
+      throws RocksDBException {
+    return getOptions(null);
+  }
+
+  /**
+   * Get the database options
+   *
+   * @return the DB options parsed from the options string return by RocksDB
+   *
+   * @throws RocksDBException if an error occurs while getting the options string, or parsing the
+   *     resulting options string into options
+   */
+  public MutableDBOptions.MutableDBOptionsBuilder getDBOptions() throws RocksDBException {
+    String optionsString = getDBOptions(nativeHandle_);
+    return MutableDBOptions.parse(optionsString, true);
+  }
+
+  /**
+   * Change the options for the default column family handle.
+   *
+   * @param mutableColumnFamilyOptions the options.
+   *
+   * @throws RocksDBException if an error occurs whilst setting the options
+   */
+  public void setOptions(
+      final MutableColumnFamilyOptions mutableColumnFamilyOptions)
+      throws RocksDBException {
+    setOptions(null, mutableColumnFamilyOptions);
+  }
+
+  /**
+   * Set the options for the column family handle.
+   *
+   * @param mutableDBoptions the options.
+   *
+   * @throws RocksDBException if an error occurs whilst setting the options
+   */
+  public void setDBOptions(final MutableDBOptions mutableDBoptions)
+      throws RocksDBException {
+    setDBOptions(nativeHandle_,
+        mutableDBoptions.getKeys(),
+        mutableDBoptions.getValues());
+  }
+
+  /**
+   * Takes a list of files specified by file names and
+   * compacts them to the specified level.
+   *
+   * Note that the behavior is different from
+   * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * in that CompactFiles() performs the compaction job using the CURRENT
+   * thread.
+   *
+   * @param compactionOptions compaction options
+   * @param inputFileNames the name of the files to compact
+   * @param outputLevel the level to which they should be compacted
+   * @param outputPathId the id of the output path, or -1
+   * @param compactionJobInfo the compaction job info, this parameter
+   *     will be updated with the info from compacting the files,
+   *     can just be null if you don't need it.
+   *
+   * @return the list of compacted files
+   *
+   * @throws RocksDBException if an error occurs during compaction
+   */
+  public List<String> compactFiles(
+      final CompactionOptions compactionOptions,
+      final List<String> inputFileNames,
+      final int outputLevel,
+      final int outputPathId,
+      /* @Nullable */ final CompactionJobInfo compactionJobInfo)
+      throws RocksDBException {
+    return compactFiles(compactionOptions, null, inputFileNames, outputLevel,
+        outputPathId, compactionJobInfo);
+  }
+
+  /**
+   * Takes a list of files specified by file names and
+   * compacts them to the specified level.
+   *
+   * Note that the behavior is different from
+   * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * in that CompactFiles() performs the compaction job using the CURRENT
+   * thread.
+   *
+   * @param compactionOptions compaction options
+   * @param columnFamilyHandle columnFamilyHandle, or null for the
+   *     default column family
+   * @param inputFileNames the name of the files to compact
+   * @param outputLevel the level to which they should be compacted
+   * @param outputPathId the id of the output path, or -1
+   * @param compactionJobInfo the compaction job info, this parameter
+   *     will be updated with the info from compacting the files,
+   *     can just be null if you don't need it.
+   *
+   * @return the list of compacted files
+   *
+   * @throws RocksDBException if an error occurs during compaction
+   */
+  public List<String> compactFiles(
+      final CompactionOptions compactionOptions,
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle,
+      final List<String> inputFileNames,
+      final int outputLevel,
+      final int outputPathId,
+      /* @Nullable */ final CompactionJobInfo compactionJobInfo)
+      throws RocksDBException {
+    return Arrays.asList(compactFiles(nativeHandle_, compactionOptions.nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        inputFileNames.toArray(new String[0]),
+        outputLevel,
+        outputPathId,
+        compactionJobInfo == null ? 0 : compactionJobInfo.nativeHandle_));
+  }
+
+  /**
+   * This function will cancel all currently running background processes.
+   *
+   * @param wait if true, wait for all background work to be cancelled before
+   *   returning.
+   *
+   */
+  public void cancelAllBackgroundWork(boolean wait) {
+    cancelAllBackgroundWork(nativeHandle_, wait);
+  }
+
+  /**
+   * This function will wait until all currently running background processes
+   * finish. After it returns, no background process will be run until
+   * {@link #continueBackgroundWork()} is called
+   *
+   * @throws RocksDBException if an error occurs when pausing background work
+   */
+  public void pauseBackgroundWork() throws RocksDBException {
+    pauseBackgroundWork(nativeHandle_);
+  }
+
+  /**
+   * Resumes background work which was suspended by
+   * previously calling {@link #pauseBackgroundWork()}
+   *
+   * @throws RocksDBException if an error occurs when resuming background work
+   */
+  public void continueBackgroundWork() throws RocksDBException {
+    continueBackgroundWork(nativeHandle_);
+  }
+
+  /**
+   * Enable automatic compactions for the given column
+   * families if they were previously disabled.
+   *
+   * The function will first set the
+   * {@link ColumnFamilyOptions#disableAutoCompactions()} option for each
+   * column family to false, after which it will schedule a flush/compaction.
+   *
+   * NOTE: Setting disableAutoCompactions to 'false' through
+   * {@link #setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}
+   * does NOT schedule a flush/compaction afterwards, and only changes the
+   * parameter itself within the column family option.
+   *
+   * @param columnFamilyHandles the column family handles
+   *
+   * @throws RocksDBException if an error occurs whilst enabling auto-compaction
+   */
+  public void enableAutoCompaction(
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+    enableAutoCompaction(nativeHandle_,
+        toNativeHandleList(columnFamilyHandles));
+  }
+
+  /**
+   * Number of levels used for this DB.
+   *
+   * @return the number of levels
+   */
+  public int numberLevels() {
+    return numberLevels(null);
+  }
+
+  /**
+   * Number of levels used for a column family in this DB.
+   *
+   * @param columnFamilyHandle the column family handle, or null
+   *     for the default column family
+   *
+   * @return the number of levels
+   */
+  public int numberLevels(/* @Nullable */final ColumnFamilyHandle columnFamilyHandle) {
+    return numberLevels(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Maximum level to which a new compacted memtable is pushed if it
+   * does not create overlap.
+   *
+   * @return the maximum level
+   */
+  public int maxMemCompactionLevel() {
+    return maxMemCompactionLevel(null);
+  }
+
+  /**
+   * Maximum level to which a new compacted memtable is pushed if it
+   * does not create overlap.
+   *
+   * @param columnFamilyHandle the column family handle
+   *
+   * @return the maximum level
+   */
+  public int maxMemCompactionLevel(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) {
+      return maxMemCompactionLevel(nativeHandle_,
+          columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Number of files in level-0 that would stop writes.
+   *
+   * @return the number of files
+   */
+  public int level0StopWriteTrigger() {
+    return level0StopWriteTrigger(null);
+  }
+
+  /**
+   * Number of files in level-0 that would stop writes.
+   *
+   * @param columnFamilyHandle the column family handle
+   *
+   * @return the number of files
+   */
+  public int level0StopWriteTrigger(
+      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle) {
+    return level0StopWriteTrigger(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Get DB name -- the exact same name that was provided as an argument to
+   * as path to {@link #open(Options, String)}.
+   *
+   * @return the DB name
+   */
+  public String getName() {
+    return getName(nativeHandle_);
+  }
+
+  /**
+   * Get the Env object from the DB
+   *
+   * @return the env
+   */
+  public Env getEnv() {
+    final long envHandle = getEnv(nativeHandle_);
+    if (envHandle == Env.getDefault().nativeHandle_) {
+      return Env.getDefault();
+    } else {
+      final Env env = new RocksEnv(envHandle);
+      env.disOwnNativeHandle();  // we do not own the Env!
+      return env;
+    }
+  }
+
+  /**
+   * <p>Flush all memory table data.</p>
+   *
+   * <p>Note: it must be ensured that the FlushOptions instance
+   * is not GC'ed before this method finishes. If the wait parameter is
+   * set to false, flush processing is asynchronous.</p>
+   *
+   * @param flushOptions {@link org.rocksdb.FlushOptions} instance.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void flush(final FlushOptions flushOptions)
+      throws RocksDBException {
+    flush(flushOptions, (List<ColumnFamilyHandle>) null);
+  }
+
+  /**
+   * <p>Flush all memory table data.</p>
+   *
+   * <p>Note: it must be ensured that the FlushOptions instance
+   * is not GC'ed before this method finishes. If the wait parameter is
+   * set to false, flush processing is asynchronous.</p>
+   *
+   * @param flushOptions {@link org.rocksdb.FlushOptions} instance.
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void flush(final FlushOptions flushOptions,
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle)
+      throws RocksDBException {
+    flush(flushOptions,
+        columnFamilyHandle == null ? null : Arrays.asList(columnFamilyHandle));
+  }
+
+  /**
+   * Flushes multiple column families.
+   *
+   * If atomic flush is not enabled, this is equivalent to calling
+   * {@link #flush(FlushOptions, ColumnFamilyHandle)} multiple times.
+   *
+   * If atomic flush is enabled, this will flush all column families
+   * specified up to the latest sequence number at the time when flush is
+   * requested.
+   *
+   * @param flushOptions {@link org.rocksdb.FlushOptions} instance.
+   * @param columnFamilyHandles column family handles.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void flush(final FlushOptions flushOptions,
+      /* @Nullable */ final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+    flush(nativeHandle_, flushOptions.nativeHandle_,
+        toNativeHandleList(columnFamilyHandles));
+  }
+
+  /**
+   * Flush the WAL memory buffer to the file. If {@code sync} is true,
+   * it calls {@link #syncWal()} afterwards.
+   *
+   * @param sync true to also fsync to disk.
+   *
+   * @throws RocksDBException if an error occurs whilst flushing
+   */
+  public void flushWal(final boolean sync) throws RocksDBException {
+    flushWal(nativeHandle_, sync);
+  }
+
+  /**
+   * Sync the WAL.
+   *
+   * Note that {@link #write(WriteOptions, WriteBatch)} followed by
+   * {@link #syncWal()} is not exactly the same as
+   * {@link #write(WriteOptions, WriteBatch)} with
+   * {@link WriteOptions#sync()} set to true; In the latter case the changes
+   * won't be visible until the sync is done.
+   *
+   * Currently only works if {@link Options#allowMmapWrites()} is set to false.
+   *
+   * @throws RocksDBException if an error occurs whilst syncing
+   */
+  public void syncWal() throws RocksDBException {
+    syncWal(nativeHandle_);
+  }
+
+  /**
+   * <p>The sequence number of the most recent transaction.</p>
+   *
+   * @return sequence number of the most
+   *     recent transaction.
+   */
+  public long getLatestSequenceNumber() {
+    return getLatestSequenceNumber(nativeHandle_);
+  }
+
+  /**
+   * <p>Prevent file deletions. Compactions will continue to occur,
+   * but no obsolete files will be deleted. Calling this multiple
+   * times have the same effect as calling it once.</p>
+   *
+   * @throws RocksDBException thrown if operation was not performed
+   *     successfully.
+   */
+  public void disableFileDeletions() throws RocksDBException {
+    disableFileDeletions(nativeHandle_);
+  }
+
+  /**
+   * <p>Allow compactions to delete obsolete files.
+   * If force == true, the call to EnableFileDeletions()
+   * will guarantee that file deletions are enabled after
+   * the call, even if DisableFileDeletions() was called
+   * multiple times before.</p>
+   *
+   * <p>If force == false, EnableFileDeletions will only
+   * enable file deletion after it's been called at least
+   * as many times as DisableFileDeletions(), enabling
+   * the two methods to be called by two threads
+   * concurrently without synchronization
+   * -- i.e., file deletions will be enabled only after both
+   * threads call EnableFileDeletions()</p>
+   *
+   * @param force boolean value described above.
+   *
+   * @throws RocksDBException thrown if operation was not performed
+   *     successfully.
+   */
+  public void enableFileDeletions(final boolean force)
+      throws RocksDBException {
+    enableFileDeletions(nativeHandle_, force);
+  }
+
+  public static class LiveFiles {
+    /**
+     * The valid size of the manifest file. The manifest file is an ever growing
+     * file, but only the portion specified here is valid for this snapshot.
+     */
+    public final long manifestFileSize;
+
+    /**
+     * The files are relative to the {@link #getName()} and are not
+     * absolute paths. Despite being relative paths, the file names begin
+     * with "/".
+     */
+    public final List<String> files;
+
+    LiveFiles(final long manifestFileSize, final List<String> files) {
+      this.manifestFileSize = manifestFileSize;
+      this.files = files;
+    }
+  }
+
+  /**
+   * Retrieve the list of all files in the database after flushing the memtable.
+   *
+   * See {@link #getLiveFiles(boolean)}.
+   *
+   * @return the live files
+   *
+   * @throws RocksDBException if an error occurs whilst retrieving the list
+   *     of live files
+   */
+  public LiveFiles getLiveFiles() throws RocksDBException {
+    return getLiveFiles(true);
+  }
+
+  /**
+   * Retrieve the list of all files in the database.
+   *
+   * In case you have multiple column families, even if {@code flushMemtable}
+   * is true, you still need to call {@link #getSortedWalFiles()}
+   * after {@link #getLiveFiles(boolean)} to compensate for new data that
+   * arrived to already-flushed column families while other column families
+   * were flushing.
+   *
+   * NOTE: Calling {@link #getLiveFiles(boolean)} followed by
+   *     {@link #getSortedWalFiles()} can generate a lossless backup.
+   *
+   * @param flushMemtable set to true to flush before recoding the live
+   *     files. Setting to false is useful when we don't want to wait for flush
+   *     which may have to wait for compaction to complete taking an
+   *     indeterminate time.
+   *
+   * @return the live files
+   *
+   * @throws RocksDBException if an error occurs whilst retrieving the list
+   *     of live files
+   */
+  public LiveFiles getLiveFiles(final boolean flushMemtable)
+      throws RocksDBException {
+     final String[] result = getLiveFiles(nativeHandle_, flushMemtable);
+     if (result == null) {
+       return null;
+     }
+     final String[] files = Arrays.copyOf(result, result.length - 1);
+     final long manifestFileSize = Long.parseLong(result[result.length - 1]);
+
+     return new LiveFiles(manifestFileSize, Arrays.asList(files));
+  }
+
+  /**
+   * Retrieve the sorted list of all wal files with earliest file first.
+   *
+   * @return the log files
+   *
+   * @throws RocksDBException if an error occurs whilst retrieving the list
+   *     of sorted WAL files
+   */
+  public List<LogFile> getSortedWalFiles() throws RocksDBException {
+    final LogFile[] logFiles = getSortedWalFiles(nativeHandle_);
+    return Arrays.asList(logFiles);
+  }
+
+  /**
+   * <p>Returns an iterator that is positioned at a write-batch containing
+   * seq_number. If the sequence number is non existent, it returns an iterator
+   * at the first available seq_no after the requested seq_no.</p>
+   *
+   * <p>Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+   * use this api, else the WAL files will get
+   * cleared aggressively and the iterator might keep getting invalid before
+   * an update is read.</p>
+   *
+   * @param sequenceNumber sequence number offset
+   *
+   * @return {@link org.rocksdb.TransactionLogIterator} instance.
+   *
+   * @throws org.rocksdb.RocksDBException if iterator cannot be retrieved
+   *     from native-side.
+   */
+  public TransactionLogIterator getUpdatesSince(final long sequenceNumber)
+      throws RocksDBException {
+    return new TransactionLogIterator(
+        getUpdatesSince(nativeHandle_, sequenceNumber));
+  }
+
+  /**
+   * Delete the file name from the db directory and update the internal state to
+   * reflect that. Supports deletion of sst and log files only. 'name' must be
+   * path relative to the db directory. eg. 000001.sst, /archive/000003.log
+   *
+   * @param name the file name
+   *
+   * @throws RocksDBException if an error occurs whilst deleting the file
+   */
+  public void deleteFile(final String name) throws RocksDBException {
+    deleteFile(nativeHandle_, name);
+  }
+
+  /**
+   * Gets a list of all table files metadata.
+   *
+   * @return table files metadata.
+   */
+  public List<LiveFileMetaData> getLiveFilesMetaData() {
+    return Arrays.asList(getLiveFilesMetaData(nativeHandle_));
+  }
+
+  /**
+   * Obtains the meta data of the specified column family of the DB.
+   *
+   * @param columnFamilyHandle the column family
+   *
+   * @return the column family metadata
+   */
+  public ColumnFamilyMetaData getColumnFamilyMetaData(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) {
+    return getColumnFamilyMetaData(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Obtains the meta data of the default column family of the DB.
+   *
+   * @return the column family metadata
+   */
+  public ColumnFamilyMetaData getColumnFamilyMetaData() {
+    return getColumnFamilyMetaData(null);
+  }
+
+  /**
+   * ingestExternalFile will load a list of external SST files (1) into the DB
+   * We will try to find the lowest possible level that the file can fit in, and
+   * ingest the file into this level (2). A file that have a key range that
+   * overlap with the memtable key range will require us to Flush the memtable
+   * first before ingesting the file.
+   *
+   * (1) External SST files can be created using {@link SstFileWriter}
+   * (2) We will try to ingest the files to the lowest possible level
+   * even if the file compression doesn't match the level compression
+   *
+   * @param filePathList The list of files to ingest
+   * @param ingestExternalFileOptions the options for the ingestion
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public void ingestExternalFile(final List<String> filePathList,
+      final IngestExternalFileOptions ingestExternalFileOptions)
+      throws RocksDBException {
+    ingestExternalFile(nativeHandle_, getDefaultColumnFamily().nativeHandle_,
+        filePathList.toArray(new String[0]),
+        filePathList.size(), ingestExternalFileOptions.nativeHandle_);
+  }
+
+  /**
+   * ingestExternalFile will load a list of external SST files (1) into the DB
+   * We will try to find the lowest possible level that the file can fit in, and
+   * ingest the file into this level (2). A file that have a key range that
+   * overlap with the memtable key range will require us to Flush the memtable
+   * first before ingesting the file.
+   *
+   * (1) External SST files can be created using {@link SstFileWriter}
+   * (2) We will try to ingest the files to the lowest possible level
+   * even if the file compression doesn't match the level compression
+   *
+   * @param columnFamilyHandle The column family for the ingested files
+   * @param filePathList The list of files to ingest
+   * @param ingestExternalFileOptions the options for the ingestion
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public void ingestExternalFile(final ColumnFamilyHandle columnFamilyHandle,
+      final List<String> filePathList,
+      final IngestExternalFileOptions ingestExternalFileOptions)
+      throws RocksDBException {
+    ingestExternalFile(nativeHandle_, columnFamilyHandle.nativeHandle_,
+        filePathList.toArray(new String[0]),
+        filePathList.size(), ingestExternalFileOptions.nativeHandle_);
+  }
+
+  /**
+   * Verify checksum
+   *
+   * @throws RocksDBException if the checksum is not valid
+   */
+  public void verifyChecksum() throws RocksDBException {
+    verifyChecksum(nativeHandle_);
+  }
+
+  /**
+   * Gets the handle for the default column family
+   *
+   * @return The handle of the default column family
+   */
+  public ColumnFamilyHandle getDefaultColumnFamily() {
+    final ColumnFamilyHandle cfHandle = new ColumnFamilyHandle(this,
+        getDefaultColumnFamily(nativeHandle_));
+    cfHandle.disOwnNativeHandle();
+    return cfHandle;
+  }
+
+  /**
+   * Get the properties of all tables.
+   *
+   * @param columnFamilyHandle the column family handle, or null for the default
+   *     column family.
+   *
+   * @return the properties
+   *
+   * @throws RocksDBException if an error occurs whilst getting the properties
+   */
+  public Map<String, TableProperties> getPropertiesOfAllTables(
+      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle)
+      throws RocksDBException {
+    return getPropertiesOfAllTables(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Get the properties of all tables in the default column family.
+   *
+   * @return the properties
+   *
+   * @throws RocksDBException if an error occurs whilst getting the properties
+   */
+  public Map<String, TableProperties> getPropertiesOfAllTables()
+      throws RocksDBException {
+    return getPropertiesOfAllTables(null);
+  }
+
+  /**
+   * Get the properties of tables in range.
+   *
+   * @param columnFamilyHandle the column family handle, or null for the default
+   *     column family.
+   * @param ranges the ranges over which to get the table properties
+   *
+   * @return the properties
+   *
+   * @throws RocksDBException if an error occurs whilst getting the properties
+   */
+  public Map<String, TableProperties> getPropertiesOfTablesInRange(
+      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle,
+      final List<Range> ranges) throws RocksDBException {
+    return getPropertiesOfTablesInRange(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        toRangeSliceHandles(ranges));
+  }
+
+  /**
+   * Get the properties of tables in range for the default column family.
+   *
+   * @param ranges the ranges over which to get the table properties
+   *
+   * @return the properties
+   *
+   * @throws RocksDBException if an error occurs whilst getting the properties
+   */
+  public Map<String, TableProperties> getPropertiesOfTablesInRange(
+      final List<Range> ranges) throws RocksDBException {
+    return getPropertiesOfTablesInRange(null, ranges);
+  }
+
+  /**
+   * Suggest the range to compact.
+   *
+   * @param columnFamilyHandle the column family handle, or null for the default
+   *     column family.
+   *
+   * @return the suggested range.
+   *
+   * @throws RocksDBException if an error occurs whilst suggesting the range
+   */
+  public Range suggestCompactRange(
+      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle)
+      throws RocksDBException {
+    final long[] rangeSliceHandles = suggestCompactRange(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+    return new Range(new Slice(rangeSliceHandles[0]),
+        new Slice(rangeSliceHandles[1]));
+  }
+
+  /**
+   * Suggest the range to compact for the default column family.
+   *
+   * @return the suggested range.
+   *
+   * @throws RocksDBException if an error occurs whilst suggesting the range
+   */
+  public Range suggestCompactRange()
+      throws RocksDBException {
+    return suggestCompactRange(null);
+  }
+
+  /**
+   * Promote L0.
+   *
+   * @param columnFamilyHandle the column family handle,
+   *     or null for the default column family.
+   * @param targetLevel the target level for L0
+   *
+   * @throws RocksDBException if an error occurs whilst promoting L0
+   */
+  public void promoteL0(
+      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle,
+      final int targetLevel) throws RocksDBException {
+    promoteL0(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
+        targetLevel);
+  }
+
+  /**
+   * Promote L0 for the default column family.
+   *
+   * @param targetLevel the target level for L0
+   *
+   * @throws RocksDBException if an error occurs whilst promoting L0
+   */
+  public void promoteL0(final int targetLevel)
+      throws RocksDBException {
+    promoteL0(null, targetLevel);
+  }
+
+  /**
+   * Trace DB operations.
+   *
+   * Use {@link #endTrace()} to stop tracing.
+   *
+   * @param traceOptions the options
+   * @param traceWriter the trace writer
+   *
+   * @throws RocksDBException if an error occurs whilst starting the trace
+   */
+  public void startTrace(final TraceOptions traceOptions,
+      final AbstractTraceWriter traceWriter) throws RocksDBException {
+    startTrace(nativeHandle_, traceOptions.getMaxTraceFileSize(),
+        traceWriter.nativeHandle_);
+    /**
+     * NOTE: {@link #startTrace(long, long, long) transfers the ownership
+     * from Java to C++, so we must disown the native handle here.
+     */
+    traceWriter.disOwnNativeHandle();
+  }
+
+  /**
+   * Stop tracing DB operations.
+   *
+   * See {@link #startTrace(TraceOptions, AbstractTraceWriter)}
+   *
+   * @throws RocksDBException if an error occurs whilst ending the trace
+   */
+  public void endTrace() throws RocksDBException {
+    endTrace(nativeHandle_);
+  }
+
+  /**
+   * Make the secondary instance catch up with the primary by tailing and
+   * replaying the MANIFEST and WAL of the primary.
+   * Column families created by the primary after the secondary instance starts
+   * will be ignored unless the secondary instance closes and restarts with the
+   * newly created column families.
+   * Column families that exist before secondary instance starts and dropped by
+   * the primary afterwards will be marked as dropped. However, as long as the
+   * secondary instance does not delete the corresponding column family
+   * handles, the data of the column family is still accessible to the
+   * secondary.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public void tryCatchUpWithPrimary() throws RocksDBException {
+    tryCatchUpWithPrimary(nativeHandle_);
+  }
+
+  /**
+   * Delete files in multiple ranges at once.
+   * Delete files in a lot of ranges one at a time can be slow, use this API for
+   * better performance in that case.
+   *
+   * @param columnFamily - The column family for operation (null for default)
+   * @param includeEnd - Whether ranges should include end
+   * @param ranges - pairs of ranges (from1, to1, from2, to2, ...)
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public void deleteFilesInRanges(final ColumnFamilyHandle columnFamily,
+      final List<byte[]> ranges, final boolean includeEnd)
+      throws RocksDBException {
+    if (ranges.size() == 0) {
+      return;
+    }
+    if ((ranges.size() % 2) != 0) {
+      throw new IllegalArgumentException("Ranges size needs to be multiple of 2 "
+          + "(from1, to1, from2, to2, ...), but is " + ranges.size());
+    }
+
+    final byte[][] rangesArray = ranges.toArray(new byte[ranges.size()][]);
+
+    deleteFilesInRanges(nativeHandle_, columnFamily == null ? 0 : columnFamily.nativeHandle_,
+        rangesArray, includeEnd);
+  }
+
+  /**
+   * Static method to destroy the contents of the specified database.
+   * Be very careful using this method.
+   *
+   * @param path the path to the Rocksdb database.
+   * @param options {@link org.rocksdb.Options} instance.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static void destroyDB(final String path, final Options options)
+      throws RocksDBException {
+    destroyDB(path, options.nativeHandle_);
+  }
+
+  private /* @Nullable */ long[] toNativeHandleList(
+      /* @Nullable */ final List<? extends RocksObject> objectList) {
+    if (objectList == null) {
+      return null;
+    }
+    final int len = objectList.size();
+    final long[] handleList = new long[len];
+    for (int i = 0; i < len; i++) {
+      handleList[i] = objectList.get(i).nativeHandle_;
+    }
+    return handleList;
+  }
+
+  private static long[] toRangeSliceHandles(final List<Range> ranges) {
+    final long rangeSliceHandles[] = new long [ranges.size() * 2];
+    for (int i = 0, j = 0; i < ranges.size(); i++) {
+      final Range range = ranges.get(i);
+      rangeSliceHandles[j++] = range.start.getNativeHandle();
+      rangeSliceHandles[j++] = range.limit.getNativeHandle();
+    }
+    return rangeSliceHandles;
+  }
+
+  protected void storeOptionsInstance(DBOptionsInterface<?> options) {
+    options_ = options;
+  }
+
+  private static void checkBounds(int offset, int len, int size) {
+    if ((offset | len | (offset + len) | (size - (offset + len))) < 0) {
+      throw new IndexOutOfBoundsException(String.format("offset(%d), len(%d), size(%d)", offset, len, size));
+    }
+  }
+
+  private static int computeCapacityHint(final int estimatedNumberOfItems) {
+    // Default load factor for HashMap is 0.75, so N * 1.5 will be at the load
+    // limit. We add +1 for a buffer.
+    return (int)Math.ceil(estimatedNumberOfItems * 1.5 + 1.0);
+  }
+
+  // native methods
+  private native static long open(final long optionsHandle,
+      final String path) throws RocksDBException;
+
+  /**
+   * @param optionsHandle Native handle pointing to an Options object
+   * @param path The directory path for the database files
+   * @param columnFamilyNames An array of column family names
+   * @param columnFamilyOptions An array of native handles pointing to
+   *                            ColumnFamilyOptions objects
+   *
+   * @return An array of native handles, [0] is the handle of the RocksDB object
+   *   [1..1+n] are handles of the ColumnFamilyReferences
+   *
+   * @throws RocksDBException thrown if the database could not be opened
+   */
+  private native static long[] open(final long optionsHandle,
+      final String path, final byte[][] columnFamilyNames,
+      final long[] columnFamilyOptions) throws RocksDBException;
+
+  private native static long openROnly(final long optionsHandle, final String path,
+      final boolean errorIfWalFileExists) throws RocksDBException;
+
+  /**
+   * @param optionsHandle Native handle pointing to an Options object
+   * @param path The directory path for the database files
+   * @param columnFamilyNames An array of column family names
+   * @param columnFamilyOptions An array of native handles pointing to
+   *                            ColumnFamilyOptions objects
+   *
+   * @return An array of native handles, [0] is the handle of the RocksDB object
+   *   [1..1+n] are handles of the ColumnFamilyReferences
+   *
+   * @throws RocksDBException thrown if the database could not be opened
+   */
+  private native static long[] openROnly(final long optionsHandle, final String path,
+      final byte[][] columnFamilyNames, final long[] columnFamilyOptions,
+      final boolean errorIfWalFileExists) throws RocksDBException;
+
+  private native static long openAsSecondary(final long optionsHandle, final String path,
+      final String secondaryPath) throws RocksDBException;
+
+  private native static long[] openAsSecondary(final long optionsHandle, final String path,
+      final String secondaryPath, final byte[][] columnFamilyNames,
+      final long[] columnFamilyOptions) throws RocksDBException;
+
+  @Override protected native void disposeInternal(final long handle);
+
+  private native static void closeDatabase(final long handle)
+      throws RocksDBException;
+  private native static byte[][] listColumnFamilies(final long optionsHandle,
+      final String path) throws RocksDBException;
+  private native long createColumnFamily(final long handle,
+      final byte[] columnFamilyName, final int columnFamilyNamelen,
+      final long columnFamilyOptions) throws RocksDBException;
+  private native long[] createColumnFamilies(final long handle,
+      final long columnFamilyOptionsHandle, final byte[][] columnFamilyNames)
+      throws RocksDBException;
+  private native long[] createColumnFamilies(final long handle,
+      final long columnFamilyOptionsHandles[], final byte[][] columnFamilyNames)
+      throws RocksDBException;
+  private native void dropColumnFamily(
+      final long handle, final long cfHandle) throws RocksDBException;
+  private native void dropColumnFamilies(final long handle,
+      final long[] cfHandles) throws RocksDBException;
+  private native void put(final long handle, final byte[] key,
+      final int keyOffset, final int keyLength, final byte[] value,
+      final int valueOffset, int valueLength) throws RocksDBException;
+  private native void put(final long handle, final byte[] key, final int keyOffset,
+      final int keyLength, final byte[] value, final int valueOffset,
+      final int valueLength, final long cfHandle) throws RocksDBException;
+  private native void put(final long handle, final long writeOptHandle,
+      final byte[] key,  final int keyOffset, final int keyLength,
+      final byte[] value, final int valueOffset, final int valueLength)
+      throws RocksDBException;
+  private native void put(final long handle, final long writeOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength,
+      final byte[] value, final int valueOffset, final int valueLength,
+      final long cfHandle) throws RocksDBException;
+  private native void delete(final long handle, final byte[] key,
+      final int keyOffset, final int keyLength) throws RocksDBException;
+  private native void delete(final long handle, final byte[] key,
+      final int keyOffset, final int keyLength, final long cfHandle)
+      throws RocksDBException;
+  private native void delete(final long handle, final long writeOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength)
+      throws RocksDBException;
+  private native void delete(final long handle, final long writeOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength,
+      final long cfHandle) throws RocksDBException;
+  private native void singleDelete(
+      final long handle, final byte[] key, final int keyLen)
+      throws RocksDBException;
+  private native void singleDelete(
+      final long handle, final byte[] key, final int keyLen,
+      final long cfHandle) throws RocksDBException;
+  private native void singleDelete(
+      final long handle, final long writeOptHandle, final byte[] key,
+      final int keyLen) throws RocksDBException;
+  private native void singleDelete(
+      final long handle, final long writeOptHandle,
+      final byte[] key, final int keyLen, final long cfHandle)
+      throws RocksDBException;
+  private native void deleteRange(final long handle, final byte[] beginKey,
+      final int beginKeyOffset, final int beginKeyLength, final byte[] endKey,
+      final int endKeyOffset, final int endKeyLength) throws RocksDBException;
+  private native void deleteRange(final long handle, final byte[] beginKey,
+      final int beginKeyOffset, final int beginKeyLength, final byte[] endKey,
+      final int endKeyOffset, final int endKeyLength, final long cfHandle)
+      throws RocksDBException;
+  private native void deleteRange(final long handle, final long writeOptHandle,
+      final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength,
+      final byte[] endKey, final int endKeyOffset, final int endKeyLength)
+      throws RocksDBException;
+  private native void deleteRange(
+      final long handle, final long writeOptHandle, final byte[] beginKey,
+      final int beginKeyOffset, final int beginKeyLength, final byte[] endKey,
+      final int endKeyOffset, final int endKeyLength, final long cfHandle)
+      throws RocksDBException;
+  private native void merge(final long handle, final byte[] key,
+      final int keyOffset, final int keyLength, final byte[] value,
+      final int valueOffset, final int valueLength) throws RocksDBException;
+  private native void merge(final long handle, final byte[] key,
+      final int keyOffset, final int keyLength, final byte[] value,
+      final int valueOffset, final int valueLength, final long cfHandle)
+      throws RocksDBException;
+  private native void merge(final long handle, final long writeOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength,
+      final byte[] value, final int valueOffset, final int valueLength)
+      throws RocksDBException;
+  private native void merge(final long handle, final long writeOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength,
+      final byte[] value, final int valueOffset, final int valueLength,
+      final long cfHandle) throws RocksDBException;
+  private native void write0(final long handle, final long writeOptHandle,
+      final long wbHandle) throws RocksDBException;
+  private native void write1(final long handle, final long writeOptHandle,
+      final long wbwiHandle) throws RocksDBException;
+  private native int get(final long handle, final byte[] key,
+      final int keyOffset, final int keyLength, final byte[] value,
+      final int valueOffset, final int valueLength) throws RocksDBException;
+  private native int get(final long handle, final byte[] key,
+      final int keyOffset, final int keyLength, byte[] value,
+      final int valueOffset, final int valueLength, final long cfHandle)
+      throws RocksDBException;
+  private native int get(final long handle, final long readOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength,
+      final byte[] value, final int valueOffset, final int valueLength)
+      throws RocksDBException;
+  private native int get(final long handle, final long readOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength,
+      final byte[] value, final int valueOffset, final int valueLength,
+      final long cfHandle) throws RocksDBException;
+  private native byte[] get(final long handle, byte[] key, final int keyOffset,
+      final int keyLength) throws RocksDBException;
+  private native byte[] get(final long handle, final byte[] key,
+      final int keyOffset, final int keyLength, final long cfHandle)
+      throws RocksDBException;
+  private native byte[] get(final long handle, final long readOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength)
+      throws RocksDBException;
+  private native byte[] get(final long handle,
+      final long readOptHandle, final byte[] key, final int keyOffset,
+      final int keyLength, final long cfHandle) throws RocksDBException;
+  private native byte[][] multiGet(final long dbHandle, final byte[][] keys,
+      final int[] keyOffsets, final int[] keyLengths);
+  private native byte[][] multiGet(final long dbHandle, final byte[][] keys,
+      final int[] keyOffsets, final int[] keyLengths,
+      final long[] columnFamilyHandles);
+  private native byte[][] multiGet(final long dbHandle, final long rOptHandle,
+      final byte[][] keys, final int[] keyOffsets, final int[] keyLengths);
+  private native byte[][] multiGet(final long dbHandle, final long rOptHandle,
+      final byte[][] keys, final int[] keyOffsets, final int[] keyLengths,
+      final long[] columnFamilyHandles);
+
+  private native void multiGet(final long dbHandle, final long rOptHandle,
+      final long[] columnFamilyHandles, final ByteBuffer[] keysArray, final int[] keyOffsets,
+      final int[] keyLengths, final ByteBuffer[] valuesArray, final int[] valuesSizeArray,
+      final Status[] statusArray);
+
+  private native boolean keyMayExist(
+      final long handle, final long cfHandle, final long readOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength);
+  private native byte[][] keyMayExistFoundValue(
+      final long handle, final long cfHandle, final long readOptHandle,
+      final byte[] key, final int keyOffset, final int keyLength);
+  private native void putDirect(long handle, long writeOptHandle, ByteBuffer key, int keyOffset,
+      int keyLength, ByteBuffer value, int valueOffset, int valueLength, long cfHandle)
+      throws RocksDBException;
+  private native long iterator(final long handle);
+  private native long iterator(final long handle, final long readOptHandle);
+  private native long iteratorCF(final long handle, final long cfHandle);
+  private native long iteratorCF(final long handle, final long cfHandle,
+      final long readOptHandle);
+  private native long[] iterators(final long handle,
+      final long[] columnFamilyHandles, final long readOptHandle)
+      throws RocksDBException;
+  private native long getSnapshot(final long nativeHandle);
+  private native void releaseSnapshot(
+      final long nativeHandle, final long snapshotHandle);
+  private native String getProperty(final long nativeHandle,
+      final long cfHandle, final String property, final int propertyLength)
+      throws RocksDBException;
+  private native Map<String, String> getMapProperty(final long nativeHandle,
+      final long cfHandle, final String property, final int propertyLength)
+      throws RocksDBException;
+  private native int getDirect(long handle, long readOptHandle, ByteBuffer key, int keyOffset,
+      int keyLength, ByteBuffer value, int valueOffset, int valueLength, long cfHandle)
+      throws RocksDBException;
+  private native boolean keyMayExistDirect(final long handle, final long cfHhandle,
+      final long readOptHandle, final ByteBuffer key, final int keyOffset, final int keyLength);
+  private native int[] keyMayExistDirectFoundValue(final long handle, final long cfHhandle,
+      final long readOptHandle, final ByteBuffer key, final int keyOffset, final int keyLength,
+      final ByteBuffer value, final int valueOffset, final int valueLength);
+  private native void deleteDirect(long handle, long optHandle, ByteBuffer key, int keyOffset,
+      int keyLength, long cfHandle) throws RocksDBException;
+  private native long getLongProperty(final long nativeHandle,
+      final long cfHandle, final String property, final int propertyLength)
+      throws RocksDBException;
+  private native void resetStats(final long nativeHandle)
+      throws RocksDBException;
+  private native long getAggregatedLongProperty(final long nativeHandle,
+      final String property, int propertyLength) throws RocksDBException;
+  private native long[] getApproximateSizes(final long nativeHandle,
+      final long columnFamilyHandle, final long[] rangeSliceHandles,
+      final byte includeFlags);
+  private native long[] getApproximateMemTableStats(final long nativeHandle,
+      final long columnFamilyHandle, final long rangeStartSliceHandle,
+      final long rangeLimitSliceHandle);
+  private native void compactRange(final long handle,
+      /* @Nullable */ final byte[] begin, final int beginLen,
+      /* @Nullable */ final byte[] end, final int endLen,
+      final long compactRangeOptHandle, final long cfHandle)
+      throws RocksDBException;
+  private native void setOptions(final long handle, final long cfHandle,
+      final String[] keys, final String[] values) throws RocksDBException;
+  private native String getOptions(final long handle, final long cfHandle);
+  private native void setDBOptions(final long handle,
+      final String[] keys, final String[] values) throws RocksDBException;
+  private native String getDBOptions(final long handle);
+  private native String[] compactFiles(final long handle,
+      final long compactionOptionsHandle,
+      final long columnFamilyHandle,
+      final String[] inputFileNames,
+      final int outputLevel,
+      final int outputPathId,
+      final long compactionJobInfoHandle) throws RocksDBException;
+  private native void cancelAllBackgroundWork(final long handle,
+      final boolean wait);
+  private native void pauseBackgroundWork(final long handle)
+      throws RocksDBException;
+  private native void continueBackgroundWork(final long handle)
+      throws RocksDBException;
+  private native void enableAutoCompaction(final long handle,
+      final long[] columnFamilyHandles) throws RocksDBException;
+  private native int numberLevels(final long handle,
+      final long columnFamilyHandle);
+  private native int maxMemCompactionLevel(final long handle,
+      final long columnFamilyHandle);
+  private native int level0StopWriteTrigger(final long handle,
+      final long columnFamilyHandle);
+  private native String getName(final long handle);
+  private native long getEnv(final long handle);
+  private native void flush(final long handle, final long flushOptHandle,
+      /* @Nullable */ final long[] cfHandles) throws RocksDBException;
+  private native void flushWal(final long handle, final boolean sync)
+      throws RocksDBException;
+  private native void syncWal(final long handle) throws RocksDBException;
+  private native long getLatestSequenceNumber(final long handle);
+  private native void disableFileDeletions(long handle) throws RocksDBException;
+  private native void enableFileDeletions(long handle, boolean force)
+      throws RocksDBException;
+  private native String[] getLiveFiles(final long handle,
+      final boolean flushMemtable) throws RocksDBException;
+  private native LogFile[] getSortedWalFiles(final long handle)
+      throws RocksDBException;
+  private native long getUpdatesSince(final long handle,
+      final long sequenceNumber) throws RocksDBException;
+  private native void deleteFile(final long handle, final String name)
+      throws RocksDBException;
+  private native LiveFileMetaData[] getLiveFilesMetaData(final long handle);
+  private native ColumnFamilyMetaData getColumnFamilyMetaData(
+      final long handle, final long columnFamilyHandle);
+  private native void ingestExternalFile(final long handle,
+      final long columnFamilyHandle,  final String[] filePathList,
+      final int filePathListLen, final long ingestExternalFileOptionsHandle)
+      throws RocksDBException;
+  private native void verifyChecksum(final long handle) throws RocksDBException;
+  private native long getDefaultColumnFamily(final long handle);
+  private native Map<String, TableProperties> getPropertiesOfAllTables(
+      final long handle, final long columnFamilyHandle) throws RocksDBException;
+  private native Map<String, TableProperties> getPropertiesOfTablesInRange(
+      final long handle, final long columnFamilyHandle,
+      final long[] rangeSliceHandles);
+  private native long[] suggestCompactRange(final long handle,
+      final long columnFamilyHandle) throws RocksDBException;
+  private native void promoteL0(final long handle,
+      final long columnFamilyHandle, final int tragetLevel)
+      throws RocksDBException;
+  private native void startTrace(final long handle, final long maxTraceFileSize,
+      final long traceWriterHandle) throws RocksDBException;
+  private native void endTrace(final long handle) throws RocksDBException;
+  private native void tryCatchUpWithPrimary(final long handle) throws RocksDBException;
+  private native void deleteFilesInRanges(long handle, long cfHandle, final byte[][] ranges,
+      boolean include_end) throws RocksDBException;
+
+  private native static void destroyDB(final String path,
+      final long optionsHandle) throws RocksDBException;
+
+  private native static int version();
+
+  protected DBOptionsInterface<?> options_;
+  private static Version version;
+
+  public static class Version {
+    private final byte major;
+    private final byte minor;
+    private final byte patch;
+
+    public Version(final byte major, final byte minor, final byte patch) {
+      this.major = major;
+      this.minor = minor;
+      this.patch = patch;
+    }
+
+    public int getMajor() {
+      return major;
+    }
+
+    public int getMinor() {
+      return minor;
+    }
+
+    public int getPatch() {
+      return patch;
+    }
+
+    @Override
+    public String toString() {
+      return getMajor() + "." + getMinor() + "." + getPatch();
+    }
+
+    private static Version fromEncodedVersion(int encodedVersion) {
+      final byte patch = (byte) (encodedVersion & 0xff);
+      encodedVersion >>= 8;
+      final byte minor = (byte) (encodedVersion & 0xff);
+      encodedVersion >>= 8;
+      final byte major = (byte) (encodedVersion & 0xff);
+
+      return new Version(major, minor, patch);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java
new file mode 100644
index 000000000..8b035f458
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * A RocksDBException encapsulates the error of an operation.  This exception
+ * type is used to describe an internal error from the c++ rocksdb library.
+ */
+public class RocksDBException extends Exception {
+
+  /* @Nullable */ private final Status status;
+
+  /**
+   * The private construct used by a set of public static factory method.
+   *
+   * @param msg the specified error message.
+   */
+  public RocksDBException(final String msg) {
+    this(msg, null);
+  }
+
+  public RocksDBException(final String msg, final Status status) {
+    super(msg);
+    this.status = status;
+  }
+
+  public RocksDBException(final Status status) {
+    super(status.getState() != null ? status.getState()
+        : status.getCodeString());
+    this.status = status;
+  }
+
+  /**
+   * Get the status returned from RocksDB
+   *
+   * @return The status reported by RocksDB, or null if no status is available
+   */
+  public Status getStatus() {
+    return status;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java
new file mode 100644
index 000000000..b3681d77d
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * <p>A RocksEnv is an interface used by the rocksdb implementation to access
+ * operating system functionality like the filesystem etc.</p>
+ *
+ * <p>All Env implementations are safe for concurrent access from
+ * multiple threads without any external synchronization.</p>
+ */
+public class RocksEnv extends Env {
+
+  /**
+   * <p>Package-private constructor that uses the specified native handle
+   * to construct a RocksEnv.</p>
+   *
+   * <p>Note that the ownership of the input handle
+   * belongs to the caller, and the newly created RocksEnv will not take
+   * the ownership of the input handle.  As a result, calling
+   * {@code dispose()} of the created RocksEnv will be no-op.</p>
+   */
+  RocksEnv(final long handle) {
+    super(handle);
+  }
+
+  @Override
+  protected native final void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java
new file mode 100644
index 000000000..20e56d2eb
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java
@@ -0,0 +1,140 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * <p>An iterator that yields a sequence of key/value pairs from a source.
+ * Multiple implementations are provided by this library.
+ * In particular, iterators are provided
+ * to access the contents of a Table or a DB.</p>
+ *
+ * <p>Multiple threads can invoke const methods on an RocksIterator without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same RocksIterator must use
+ * external synchronization.</p>
+ *
+ * @see org.rocksdb.RocksObject
+ */
+public class RocksIterator extends AbstractRocksIterator<RocksDB> {
+  protected RocksIterator(final RocksDB rocksDB, final long nativeHandle) {
+    super(rocksDB, nativeHandle);
+  }
+
+  /**
+   * <p>Return the key for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   *
+   * @return key for the current entry.
+   */
+  public byte[] key() {
+    assert(isOwningHandle());
+    return key0(nativeHandle_);
+  }
+
+  /**
+   * <p>Return the key for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   *
+   * @param key the out-value to receive the retrieved key.
+   *     It is using position and limit. Limit is set according to key size.
+   *     Supports direct buffer only.
+   * @return The size of the actual key. If the return key is greater than the
+   *     length of {@code key}, then it indicates that the size of the
+   *     input buffer {@code key} is insufficient and partial result will
+   *     be returned.
+   */
+  public int key(final ByteBuffer key) {
+    assert isOwningHandle();
+    final int result;
+    if (key.isDirect()) {
+      result = keyDirect0(nativeHandle_, key, key.position(), key.remaining());
+    } else {
+      assert key.hasArray();
+      result = keyByteArray0(
+          nativeHandle_, key.array(), key.arrayOffset() + key.position(), key.remaining());
+    }
+    key.limit(Math.min(key.position() + result, key.limit()));
+    return result;
+  }
+
+  /**
+   * <p>Return the value for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: !AtEnd() &amp;&amp; !AtStart()</p>
+   * @return value for the current entry.
+   */
+  public byte[] value() {
+    assert(isOwningHandle());
+    return value0(nativeHandle_);
+  }
+
+  /**
+   * <p>Return the value for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   *
+   * @param value the out-value to receive the retrieved value.
+   *     It is using position and limit. Limit is set according to value size.
+   *     Supports direct buffer only.
+   * @return The size of the actual value. If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.
+   */
+  public int value(final ByteBuffer value) {
+    assert isOwningHandle();
+    final int result;
+    if (value.isDirect()) {
+      result = valueDirect0(nativeHandle_, value, value.position(), value.remaining());
+    } else {
+      assert value.hasArray();
+      result = valueByteArray0(
+          nativeHandle_, value.array(), value.arrayOffset() + value.position(), value.remaining());
+    }
+    value.limit(Math.min(value.position() + result, value.limit()));
+    return result;
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+  @Override final native boolean isValid0(long handle);
+  @Override final native void seekToFirst0(long handle);
+  @Override final native void seekToLast0(long handle);
+  @Override final native void next0(long handle);
+  @Override final native void prev0(long handle);
+  @Override final native void refresh0(long handle);
+  @Override final native void seek0(long handle, byte[] target, int targetLen);
+  @Override final native void seekForPrev0(long handle, byte[] target, int targetLen);
+  @Override
+  final native void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen);
+  @Override
+  final native void seekByteArray0(long handle, byte[] target, int targetOffset, int targetLen);
+  @Override
+  final native void seekForPrevDirect0(
+      long handle, ByteBuffer target, int targetOffset, int targetLen);
+  @Override
+  final native void seekForPrevByteArray0(
+      long handle, byte[] target, int targetOffset, int targetLen);
+  @Override final native void status0(long handle) throws RocksDBException;
+
+  private native byte[] key0(long handle);
+  private native byte[] value0(long handle);
+  private native int keyDirect0(long handle, ByteBuffer buffer, int bufferOffset, int bufferLen);
+  private native int keyByteArray0(long handle, byte[] array, int arrayOffset, int arrayLen);
+  private native int valueDirect0(long handle, ByteBuffer buffer, int bufferOffset, int bufferLen);
+  private native int valueByteArray0(long handle, byte[] array, int arrayOffset, int arrayLen);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java
new file mode 100644
index 000000000..819c21c2c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java
@@ -0,0 +1,127 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * <p>Defines the interface for an Iterator which provides
+ * access to data one entry at a time. Multiple implementations
+ * are provided by this library.  In particular, iterators are provided
+ * to access the contents of a DB and Write Batch.</p>
+ *
+ * <p>Multiple threads can invoke const methods on an RocksIterator without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same RocksIterator must use
+ * external synchronization.</p>
+ *
+ * @see org.rocksdb.RocksObject
+ */
+public interface RocksIteratorInterface {
+
+  /**
+   * <p>An iterator is either positioned at an entry, or
+   * not valid.  This method returns true if the iterator is valid.</p>
+   *
+   * @return true if iterator is valid.
+   */
+  boolean isValid();
+
+  /**
+   * <p>Position at the first entry in the source.  The iterator is Valid()
+   * after this call if the source is not empty.</p>
+   */
+  void seekToFirst();
+
+  /**
+   * <p>Position at the last entry in the source.  The iterator is
+   * valid after this call if the source is not empty.</p>
+   */
+  void seekToLast();
+
+  /**
+   * <p>Position at the first entry in the source whose key is at or
+   * past target.</p>
+   *
+   * <p>The iterator is valid after this call if the source contains
+   * a key that comes at or past target.</p>
+   *
+   * @param target byte array describing a key or a
+   *               key prefix to seek for.
+   */
+  void seek(byte[] target);
+
+  /**
+   * <p>Position at the first entry in the source whose key is that or
+   * before target.</p>
+   *
+   * <p>The iterator is valid after this call if the source contains
+   * a key that comes at or before target.</p>
+   *
+   * @param target byte array describing a key or a
+   *               key prefix to seek for.
+   */
+  void seekForPrev(byte[] target);
+
+  /**
+   * <p>Position at the first entry in the source whose key is that or
+   * past target.</p>
+   *
+   * <p>The iterator is valid after this call if the source contains
+   * a key that comes at or past target.</p>
+   *
+   * @param target byte array describing a key or a
+   *               key prefix to seek for. Supports direct buffer only.
+   */
+  void seek(ByteBuffer target);
+
+  /**
+   * <p>Position at the last key that is less than or equal to the target key.</p>
+   *
+   * <p>The iterator is valid after this call if the source contains
+   * a key that comes at or past target.</p>
+   *
+   * @param target byte array describing a key or a
+   *               key prefix to seek for. Supports direct buffer only.
+   */
+  void seekForPrev(ByteBuffer target);
+
+  /**
+   * <p>Moves to the next entry in the source.  After this call, Valid() is
+   * true if the iterator was not positioned at the last entry in the source.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   */
+  void next();
+
+  /**
+   * <p>Moves to the previous entry in the source.  After this call, Valid() is
+   * true if the iterator was not positioned at the first entry in source.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   */
+  void prev();
+
+  /**
+   * <p>If an error has occurred, return it.  Else return an ok status.
+   * If non-blocking IO is requested and this operation cannot be
+   * satisfied without doing some IO, then this returns Status::Incomplete().</p>
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *                          native library.
+   */
+  void status() throws RocksDBException;
+
+  /**
+   * <p>If supported, renew the iterator to represent the latest state. The iterator will be
+   * invalidated after the call. Not supported if {@link ReadOptions#setSnapshot(Snapshot)} was
+   * specified when creating the iterator.</p>
+   *
+   * @throws RocksDBException thrown if the operation is not supported or an error happens in the
+   *     underlying native library
+   */
+  void refresh() throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java
new file mode 100644
index 000000000..39a6f6e1c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Memory environment.
+ */
+//TODO(AR) rename to MemEnv
+public class RocksMemEnv extends Env {
+
+  /**
+   * <p>Creates a new environment that stores its data
+   * in memory and delegates all non-file-storage tasks to
+   * {@code baseEnv}.</p>
+   *
+   * <p>The caller must delete the result when it is
+   * no longer needed.</p>
+   *
+   * @param baseEnv the base environment,
+   *     must remain live while the result is in use.
+   */
+  public RocksMemEnv(final Env baseEnv) {
+    super(createMemEnv(baseEnv.nativeHandle_));
+  }
+
+  private static native long createMemEnv(final long baseEnvHandle);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksMutableObject.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksMutableObject.java
new file mode 100644
index 000000000..e92289dc0
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksMutableObject.java
@@ -0,0 +1,87 @@
+// Copyright (c) 2016, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * RocksMutableObject is an implementation of {@link AbstractNativeReference}
+ * whose reference to the underlying native C++ object can change.
+ *
+ * <p>The use of {@code RocksMutableObject} should be kept to a minimum, as it
+ * has synchronization overheads and introduces complexity. Instead it is
+ * recommended to use {@link RocksObject} where possible.</p>
+ */
+public abstract class RocksMutableObject extends AbstractNativeReference {
+
+  /**
+   * An mutable reference to the value of the C++ pointer pointing to some
+   * underlying native RocksDB C++ object.
+   */
+  private long nativeHandle_;
+  private boolean owningHandle_;
+
+  protected RocksMutableObject() {
+  }
+
+  protected RocksMutableObject(final long nativeHandle) {
+    this.nativeHandle_ = nativeHandle;
+    this.owningHandle_ = true;
+  }
+
+  /**
+   * Closes the existing handle, and changes the handle to the new handle
+   *
+   * @param newNativeHandle The C++ pointer to the new native object
+   * @param owningNativeHandle true if we own the new native object
+   */
+  public synchronized void resetNativeHandle(final long newNativeHandle,
+      final boolean owningNativeHandle) {
+    close();
+    setNativeHandle(newNativeHandle, owningNativeHandle);
+  }
+
+  /**
+   * Sets the handle (C++ pointer) of the underlying C++ native object
+   *
+   * @param nativeHandle The C++ pointer to the native object
+   * @param owningNativeHandle true if we own the native object
+   */
+  public synchronized void setNativeHandle(final long nativeHandle,
+      final boolean owningNativeHandle) {
+    this.nativeHandle_ = nativeHandle;
+    this.owningHandle_ = owningNativeHandle;
+  }
+
+  @Override
+  protected synchronized boolean isOwningHandle() {
+    return this.owningHandle_;
+  }
+
+  /**
+   * Gets the value of the C++ pointer pointing to the underlying
+   * native C++ object
+   *
+   * @return the pointer value for the native object
+   */
+  protected synchronized long getNativeHandle() {
+    assert (this.nativeHandle_ != 0);
+    return this.nativeHandle_;
+  }
+
+  @Override
+  public synchronized final void close() {
+    if (isOwningHandle()) {
+      disposeInternal();
+      this.owningHandle_ = false;
+      this.nativeHandle_ = 0;
+    }
+  }
+
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  protected abstract void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java
new file mode 100644
index 000000000..f07e1018a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java
@@ -0,0 +1,45 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * RocksObject is an implementation of {@link AbstractNativeReference} which
+ * has an immutable and therefore thread-safe reference to the underlying
+ * native C++ RocksDB object.
+ * <p>
+ * RocksObject is the base-class of almost all RocksDB classes that have a
+ * pointer to some underlying native C++ {@code rocksdb} object.</p>
+ * <p>
+ * The use of {@code RocksObject} should always be preferred over
+ * {@link RocksMutableObject}.</p>
+ */
+public abstract class RocksObject extends AbstractImmutableNativeReference {
+
+  /**
+   * An immutable reference to the value of the C++ pointer pointing to some
+   * underlying native RocksDB C++ object.
+   */
+  protected final long nativeHandle_;
+
+  protected RocksObject(final long nativeHandle) {
+    super(true);
+    this.nativeHandle_ = nativeHandle;
+  }
+
+  /**
+   * Deletes underlying C++ object pointer.
+   */
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  protected abstract void disposeInternal(final long handle);
+
+  public long getNativeHandle() {
+    return nativeHandle_;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java b/src/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java
new file mode 100644
index 000000000..30568c363
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java
@@ -0,0 +1,47 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public enum SanityLevel {
+  NONE((byte) 0x0),
+  LOOSELY_COMPATIBLE((byte) 0x1),
+  EXACT_MATCH((byte) 0xFF);
+
+  private final byte value;
+
+  SanityLevel(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value.
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the SanityLevel from the internal representation value.
+   *
+   * @param value the internal representation value.
+   *
+   * @return the SanityLevel
+   *
+   * @throws IllegalArgumentException if the value does not match a
+   *     SanityLevel
+   */
+  static SanityLevel fromValue(final byte value) throws IllegalArgumentException {
+    for (final SanityLevel level : SanityLevel.values()) {
+      if (level.value == value) {
+        return level;
+      }
+    }
+    throw new IllegalArgumentException("Unknown value for SanityLevel: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SizeApproximationFlag.java b/src/rocksdb/java/src/main/java/org/rocksdb/SizeApproximationFlag.java
new file mode 100644
index 000000000..fe3c2dd05
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SizeApproximationFlag.java
@@ -0,0 +1,31 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * Flags for
+ * {@link RocksDB#getApproximateSizes(ColumnFamilyHandle, List, SizeApproximationFlag...)}
+ * that specify whether memtable stats should be included,
+ * or file stats approximation or both.
+ */
+public enum SizeApproximationFlag {
+  NONE((byte)0x0),
+  INCLUDE_MEMTABLES((byte)0x1),
+  INCLUDE_FILES((byte)0x2);
+
+  private final byte value;
+
+  SizeApproximationFlag(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal byte representation.
+   *
+   * @return the internal representation.
+   */
+  byte getValue() {
+    return value;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java
new file mode 100644
index 000000000..e2c1b97d8
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java
@@ -0,0 +1,51 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+/**
+ * The config for skip-list memtable representation.
+ */
+public class SkipListMemTableConfig extends MemTableConfig {
+
+  public static final long DEFAULT_LOOKAHEAD = 0;
+
+  /**
+   * SkipListMemTableConfig constructor
+   */
+  public SkipListMemTableConfig() {
+    lookahead_ = DEFAULT_LOOKAHEAD;
+  }
+
+  /**
+   * Sets lookahead for SkipList
+   *
+   * @param lookahead If non-zero, each iterator's seek operation
+   *     will start the search from the previously visited record
+   *     (doing at most 'lookahead' steps). This is an
+   *     optimization for the access pattern including many
+   *     seeks with consecutive keys.
+   * @return the current instance of SkipListMemTableConfig
+   */
+  public SkipListMemTableConfig setLookahead(final long lookahead) {
+    lookahead_ = lookahead;
+    return this;
+  }
+
+  /**
+   * Returns the currently set lookahead value.
+   *
+   * @return lookahead value
+   */
+  public long lookahead() {
+    return lookahead_;
+  }
+
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle0(lookahead_);
+  }
+
+  private native long newMemTableFactoryHandle0(long lookahead)
+      throws IllegalArgumentException;
+
+  private long lookahead_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Slice.java b/src/rocksdb/java/src/main/java/org/rocksdb/Slice.java
new file mode 100644
index 000000000..50d9f7652
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Slice.java
@@ -0,0 +1,136 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * <p>Base class for slices which will receive
+ * byte[] based access to the underlying data.</p>
+ *
+ * <p>byte[] backed slices typically perform better with
+ * small keys and values. When using larger keys and
+ * values consider using {@link org.rocksdb.DirectSlice}</p>
+ */
+public class Slice extends AbstractSlice<byte[]> {
+
+  /**
+   * Indicates whether we have to free the memory pointed to by the Slice
+   */
+  private volatile boolean cleared;
+  private volatile long internalBufferOffset = 0;
+
+  /**
+   * <p>Called from JNI to construct a new Java Slice
+   * without an underlying C++ object set
+   * at creation time.</p>
+   *
+   * <p>Note: You should be aware that
+   * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally
+   * called from the default Slice constructor, and that it is marked as
+   * private. This is so that developers cannot construct their own default
+   * Slice objects (at present). As developers cannot construct their own
+   * Slice objects through this, they are not creating underlying C++ Slice
+   * objects, and so there is nothing to free (dispose) from Java.</p>
+   */
+  @SuppressWarnings("unused")
+  private Slice() {
+    super();
+  }
+
+  /**
+   * <p>Package-private Slice constructor which is used to construct
+   * Slice instances from C++ side. As the reference to this
+   * object is also managed from C++ side the handle will be disowned.</p>
+   *
+   * @param nativeHandle address of native instance.
+   */
+  Slice(final long nativeHandle) {
+    this(nativeHandle, false);
+  }
+
+  /**
+   * <p>Package-private Slice constructor which is used to construct
+   * Slice instances using a handle. </p>
+   *
+   * @param nativeHandle address of native instance.
+   * @param owningNativeHandle true if the Java side owns the memory pointed to
+   *     by this reference, false if ownership belongs to the C++ side
+   */
+  Slice(final long nativeHandle, final boolean owningNativeHandle) {
+    super();
+    setNativeHandle(nativeHandle, owningNativeHandle);
+  }
+
+  /**
+   * <p>Constructs a slice where the data is taken from
+   * a String.</p>
+   *
+   * @param str String value.
+   */
+  public Slice(final String str) {
+    super(createNewSliceFromString(str));
+  }
+
+  /**
+   * <p>Constructs a slice where the data is a copy of
+   * the byte array from a specific offset.</p>
+   *
+   * @param data byte array.
+   * @param offset offset within the byte array.
+   */
+  public Slice(final byte[] data, final int offset) {
+    super(createNewSlice0(data, offset));
+  }
+
+  /**
+   * <p>Constructs a slice where the data is a copy of
+   * the byte array.</p>
+   *
+   * @param data byte array.
+   */
+  public Slice(final byte[] data) {
+    super(createNewSlice1(data));
+  }
+
+  @Override
+  public void clear() {
+    clear0(getNativeHandle(), !cleared, internalBufferOffset);
+    cleared = true;
+  }
+
+  @Override
+  public void removePrefix(final int n) {
+    removePrefix0(getNativeHandle(), n);
+    this.internalBufferOffset += n;
+  }
+
+  /**
+   * <p>Deletes underlying C++ slice pointer
+   * and any buffered data.</p>
+   *
+   * <p>
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the slice are closed.
+   * Otherwise an undefined behavior will occur.</p>
+   */
+  @Override
+  protected void disposeInternal() {
+    final long nativeHandle = getNativeHandle();
+    if(!cleared) {
+      disposeInternalBuf(nativeHandle, internalBufferOffset);
+    }
+    super.disposeInternal(nativeHandle);
+  }
+
+  @Override protected final native byte[] data0(long handle);
+  private native static long createNewSlice0(final byte[] data,
+      final int length);
+  private native static long createNewSlice1(final byte[] data);
+  private native void clear0(long handle, boolean internalBuffer,
+      long internalBufferOffset);
+  private native void removePrefix0(long handle, int length);
+  private native void disposeInternalBuf(final long handle,
+      long internalBufferOffset);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java b/src/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java
new file mode 100644
index 000000000..39cdf0c2d
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java
@@ -0,0 +1,41 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Snapshot of database
+ */
+public class Snapshot extends RocksObject {
+  Snapshot(final long nativeHandle) {
+    super(nativeHandle);
+
+    // The pointer to the snapshot is always released
+    // by the database instance.
+    disOwnNativeHandle();
+  }
+
+  /**
+   * Return the associated sequence number;
+   *
+   * @return the associated sequence number of
+   *     this snapshot.
+   */
+  public long getSequenceNumber() {
+    return getSequenceNumber(nativeHandle_);
+  }
+
+  @Override
+  protected final void disposeInternal(final long handle) {
+    /**
+     * Nothing to release, we never own the pointer for a
+     * Snapshot. The pointer
+     * to the snapshot is released by the database
+     * instance.
+     */
+  }
+
+  private native long getSequenceNumber(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SstFileManager.java b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileManager.java
new file mode 100644
index 000000000..8805410aa
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileManager.java
@@ -0,0 +1,251 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Map;
+
+/**
+ * SstFileManager is used to track SST files in the DB and control their
+ * deletion rate.
+ *
+ * All SstFileManager public functions are thread-safe.
+ *
+ * SstFileManager is not extensible.
+ */
+//@ThreadSafe
+public final class SstFileManager extends RocksObject {
+
+  public static final long RATE_BYTES_PER_SEC_DEFAULT = 0;
+  public static final boolean DELETE_EXISTING_TRASH_DEFAULT = true;
+  public static final double MAX_TRASH_DB_RATION_DEFAULT = 0.25;
+  public static final long BYTES_MAX_DELETE_CHUNK_DEFAULT = 64 * 1024 * 1024;
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST file and control there deletion rate.
+   *
+   * @param env the environment.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env) throws RocksDBException {
+    this(env, null);
+  }
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST file and control there deletion rate.
+   *
+   * @param env the environment.
+   * @param logger if not null, the logger will be used to log errors.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env, /*@Nullable*/  final Logger logger)
+      throws RocksDBException {
+    this(env, logger, RATE_BYTES_PER_SEC_DEFAULT);
+  }
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST file and control there deletion rate.
+   *
+   * @param env the environment.
+   * @param logger if not null, the logger will be used to log errors.
+   *
+   * == Deletion rate limiting specific arguments ==
+   * @param rateBytesPerSec how many bytes should be deleted per second, If
+   *     this value is set to 1024 (1 Kb / sec) and we deleted a file of size
+   *     4 Kb in 1 second, we will wait for another 3 seconds before we delete
+   *     other files, Set to 0 to disable deletion rate limiting.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env, /*@Nullable*/  final Logger logger,
+      final long rateBytesPerSec) throws RocksDBException {
+    this(env, logger, rateBytesPerSec, MAX_TRASH_DB_RATION_DEFAULT);
+  }
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST file and control there deletion rate.
+   *
+   * @param env the environment.
+   * @param logger if not null, the logger will be used to log errors.
+   *
+   * == Deletion rate limiting specific arguments ==
+   * @param rateBytesPerSec how many bytes should be deleted per second, If
+   *     this value is set to 1024 (1 Kb / sec) and we deleted a file of size
+   *     4 Kb in 1 second, we will wait for another 3 seconds before we delete
+   *     other files, Set to 0 to disable deletion rate limiting.
+   * @param maxTrashDbRatio if the trash size constitutes for more than this
+   *     fraction of the total DB size we will start deleting new files passed
+   *     to DeleteScheduler immediately.
+   *
+   *  @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env, /*@Nullable*/ final Logger logger,
+      final long rateBytesPerSec, final double maxTrashDbRatio)
+      throws RocksDBException {
+    this(env, logger, rateBytesPerSec, maxTrashDbRatio,
+        BYTES_MAX_DELETE_CHUNK_DEFAULT);
+  }
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST file and control there deletion rate.
+   *
+   * @param env the environment.
+   * @param logger if not null, the logger will be used to log errors.
+   *
+   * == Deletion rate limiting specific arguments ==
+   * @param rateBytesPerSec how many bytes should be deleted per second, If
+   *     this value is set to 1024 (1 Kb / sec) and we deleted a file of size
+   *     4 Kb in 1 second, we will wait for another 3 seconds before we delete
+   *     other files, Set to 0 to disable deletion rate limiting.
+   * @param maxTrashDbRatio if the trash size constitutes for more than this
+   *     fraction of the total DB size we will start deleting new files passed
+   *     to DeleteScheduler immediately.
+   * @param bytesMaxDeleteChunk if a single file is larger than delete chunk,
+   *     ftruncate the file by this size each time, rather than dropping the whole
+   *     file. 0 means to always delete the whole file.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env, /*@Nullable*/final Logger logger,
+      final long rateBytesPerSec, final double maxTrashDbRatio,
+      final long bytesMaxDeleteChunk) throws RocksDBException {
+    super(newSstFileManager(env.nativeHandle_,
+        logger != null ? logger.nativeHandle_ : 0,
+        rateBytesPerSec, maxTrashDbRatio, bytesMaxDeleteChunk));
+  }
+
+
+  /**
+   * Update the maximum allowed space that should be used by RocksDB, if
+   * the total size of the SST files exceeds {@code maxAllowedSpace}, writes to
+   * RocksDB will fail.
+   *
+   * Setting {@code maxAllowedSpace} to 0 will disable this feature;
+   * maximum allowed space will be infinite (Default value).
+   *
+   * @param maxAllowedSpace the maximum allowed space that should be used by
+   *     RocksDB.
+   */
+  public void setMaxAllowedSpaceUsage(final long maxAllowedSpace) {
+    setMaxAllowedSpaceUsage(nativeHandle_, maxAllowedSpace);
+  }
+
+  /**
+   * Set the amount of buffer room each compaction should be able to leave.
+   * In other words, at its maximum disk space consumption, the compaction
+   * should still leave {@code compactionBufferSize} available on the disk so
+   * that other background functions may continue, such as logging and flushing.
+   *
+   * @param compactionBufferSize the amount of buffer room each compaction
+   *     should be able to leave.
+   */
+  public void setCompactionBufferSize(final long compactionBufferSize) {
+    setCompactionBufferSize(nativeHandle_, compactionBufferSize);
+  }
+
+  /**
+   * Determines if the total size of SST files exceeded the maximum allowed
+   * space usage.
+   *
+   * @return true when the maximum allows space usage has been exceeded.
+   */
+  public boolean isMaxAllowedSpaceReached() {
+    return isMaxAllowedSpaceReached(nativeHandle_);
+  }
+
+  /**
+   * Determines if the total size of SST files as well as estimated size
+   * of ongoing compactions exceeds the maximums allowed space usage.
+   *
+   * @return true when the total size of SST files as well as estimated size
+   * of ongoing compactions exceeds the maximums allowed space usage.
+   */
+  public boolean isMaxAllowedSpaceReachedIncludingCompactions() {
+    return isMaxAllowedSpaceReachedIncludingCompactions(nativeHandle_);
+  }
+
+  /**
+   * Get the total size of all tracked files.
+   *
+   * @return the total size of all tracked files.
+   */
+  public long getTotalSize() {
+    return getTotalSize(nativeHandle_);
+  }
+
+  /**
+   * Gets all tracked files and their corresponding sizes.
+   *
+   * @return a map containing all tracked files and there corresponding sizes.
+   */
+  public Map<String, Long> getTrackedFiles() {
+    return getTrackedFiles(nativeHandle_);
+  }
+
+  /**
+   * Gets the delete rate limit.
+   *
+   * @return the delete rate limit (in bytes per second).
+   */
+  public long getDeleteRateBytesPerSecond() {
+    return getDeleteRateBytesPerSecond(nativeHandle_);
+  }
+
+  /**
+   * Set the delete rate limit.
+   *
+   * Zero means disable delete rate limiting and delete files immediately.
+   *
+   * @param deleteRate the delete rate limit (in bytes per second).
+   */
+  public void setDeleteRateBytesPerSecond(final long deleteRate) {
+    setDeleteRateBytesPerSecond(nativeHandle_, deleteRate);
+  }
+
+  /**
+   * Get the trash/DB size ratio where new files will be deleted immediately.
+   *
+   * @return the trash/DB size ratio.
+   */
+  public double getMaxTrashDBRatio() {
+    return getMaxTrashDBRatio(nativeHandle_);
+  }
+
+  /**
+   * Set the trash/DB size ratio where new files will be deleted immediately.
+   *
+   * @param ratio the trash/DB size ratio.
+   */
+  public void setMaxTrashDBRatio(final double ratio) {
+    setMaxTrashDBRatio(nativeHandle_, ratio);
+  }
+
+  private native static long newSstFileManager(final long handle,
+      final long logger_handle, final long rateBytesPerSec,
+      final double maxTrashDbRatio, final long bytesMaxDeleteChunk)
+      throws RocksDBException;
+  private native void setMaxAllowedSpaceUsage(final long handle,
+      final long maxAllowedSpace);
+  private native void setCompactionBufferSize(final long handle,
+      final long compactionBufferSize);
+  private native boolean isMaxAllowedSpaceReached(final long handle);
+  private native boolean isMaxAllowedSpaceReachedIncludingCompactions(
+      final long handle);
+  private native long getTotalSize(final long handle);
+  private native Map<String, Long> getTrackedFiles(final long handle);
+  private native long getDeleteRateBytesPerSecond(final long handle);
+  private native void setDeleteRateBytesPerSecond(final long handle,
+        final long deleteRate);
+  private native double getMaxTrashDBRatio(final long handle);
+  private native void setMaxTrashDBRatio(final long handle, final double ratio);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SstFileMetaData.java b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileMetaData.java
new file mode 100644
index 000000000..a04d05cb5
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileMetaData.java
@@ -0,0 +1,162 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The metadata that describes a SST file.
+ */
+public class SstFileMetaData {
+  private final String fileName;
+  private final String path;
+  private final long size;
+  private final long smallestSeqno;
+  private final long largestSeqno;
+  private final byte[] smallestKey;
+  private final byte[] largestKey;
+  private final long numReadsSampled;
+  private final boolean beingCompacted;
+  private final long numEntries;
+  private final long numDeletions;
+
+  /**
+   * Called from JNI C++
+   *
+   * @param fileName the file name
+   * @param path the file path
+   * @param size the size of the file
+   * @param smallestSeqno the smallest sequence number
+   * @param largestSeqno the largest sequence number
+   * @param smallestKey the smallest key
+   * @param largestKey the largest key
+   * @param numReadsSampled the number of reads sampled
+   * @param beingCompacted true if the file is being compacted, false otherwise
+   * @param numEntries the number of entries
+   * @param numDeletions the number of deletions
+   */
+  protected SstFileMetaData(
+      final String fileName,
+      final String path,
+      final long size,
+      final long smallestSeqno,
+      final long largestSeqno,
+      final byte[] smallestKey,
+      final byte[] largestKey,
+      final long numReadsSampled,
+      final boolean beingCompacted,
+      final long numEntries,
+      final long numDeletions) {
+    this.fileName = fileName;
+    this.path = path;
+    this.size = size;
+    this.smallestSeqno = smallestSeqno;
+    this.largestSeqno = largestSeqno;
+    this.smallestKey = smallestKey;
+    this.largestKey = largestKey;
+    this.numReadsSampled = numReadsSampled;
+    this.beingCompacted = beingCompacted;
+    this.numEntries = numEntries;
+    this.numDeletions = numDeletions;
+  }
+
+  /**
+   * Get the name of the file.
+   *
+   * @return the name of the file.
+   */
+  public String fileName() {
+    return fileName;
+  }
+
+  /**
+   * Get the full path where the file locates.
+   *
+   * @return the full path
+   */
+  public String path() {
+    return path;
+  }
+
+  /**
+   * Get the file size in bytes.
+   *
+   * @return file size
+   */
+  public long size() {
+    return size;
+  }
+
+  /**
+   * Get the smallest sequence number in file.
+   *
+   * @return the smallest sequence number
+   */
+  public long smallestSeqno() {
+    return smallestSeqno;
+  }
+
+  /**
+   * Get the largest sequence number in file.
+   *
+   * @return the largest sequence number
+   */
+  public long largestSeqno() {
+    return largestSeqno;
+  }
+
+  /**
+   * Get the smallest user defined key in the file.
+   *
+   * @return the smallest user defined key
+   */
+  public byte[] smallestKey() {
+    return smallestKey;
+  }
+
+  /**
+   * Get the largest user defined key in the file.
+   *
+   * @return the largest user defined key
+   */
+  public byte[] largestKey() {
+    return largestKey;
+  }
+
+  /**
+   * Get the number of times the file has been read.
+   *
+   * @return the number of times the file has been read
+   */
+  public long numReadsSampled() {
+    return numReadsSampled;
+  }
+
+  /**
+   * Returns true if the file is currently being compacted.
+   *
+   * @return true if the file is currently being compacted, false otherwise.
+   */
+  public boolean beingCompacted() {
+    return beingCompacted;
+  }
+
+  /**
+   * Get the number of entries.
+   *
+   * @return the number of entries.
+   */
+  public long numEntries() {
+    return numEntries;
+  }
+
+  /**
+   * Get the number of deletions.
+   *
+   * @return the number of deletions.
+   */
+  public long numDeletions() {
+    return numDeletions;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SstFileReader.java b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileReader.java
new file mode 100644
index 000000000..bb1e94ee0
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileReader.java
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class SstFileReader extends RocksObject {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  public SstFileReader(final Options options) {
+    super(newSstFileReader(options.nativeHandle_));
+  }
+
+  /**
+   * Returns an iterator that will iterate on all keys in the default
+   * column family including both keys in the DB and uncommitted keys in this
+   * transaction.
+   *
+   * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read
+   * from the DB but will NOT change which keys are read from this transaction
+   * (the keys in this transaction do not yet belong to any snapshot and will be
+   * fetched regardless).
+   *
+   * Caller is responsible for deleting the returned Iterator.
+   *
+   * @param readOptions Read options.
+   *
+   * @return instance of iterator object.
+   */
+  public SstFileReaderIterator newIterator(final ReadOptions readOptions) {
+    assert (isOwningHandle());
+    long iter = newIterator(nativeHandle_, readOptions.nativeHandle_);
+    return new SstFileReaderIterator(this, iter);
+  }
+
+  /**
+   * Prepare SstFileReader to read a file.
+   *
+   * @param filePath the location of file
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void open(final String filePath) throws RocksDBException {
+    open(nativeHandle_, filePath);
+  }
+
+  /**
+   * Verify checksum
+   *
+   * @throws RocksDBException if the checksum is not valid
+   */
+  public void verifyChecksum() throws RocksDBException {
+    verifyChecksum(nativeHandle_);
+  }
+
+  /**
+   * Get the properties of the table.
+   *
+   * @return the properties
+   *
+   * @throws RocksDBException if an error occurs whilst getting the table
+   *     properties
+   */
+  public TableProperties getTableProperties() throws RocksDBException {
+    return getTableProperties(nativeHandle_);
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+  private native long newIterator(final long handle, final long readOptionsHandle);
+
+  private native void open(final long handle, final String filePath)
+      throws RocksDBException;
+
+  private native static long newSstFileReader(final long optionsHandle);
+  private native void verifyChecksum(final long handle) throws RocksDBException;
+  private native TableProperties getTableProperties(final long handle)
+      throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java
new file mode 100644
index 000000000..a4a08167b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java
@@ -0,0 +1,140 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * <p>An iterator that yields a sequence of key/value pairs from a source.
+ * Multiple implementations are provided by this library.
+ * In particular, iterators are provided
+ * to access the contents of a Table or a DB.</p>
+ *
+ * <p>Multiple threads can invoke const methods on an RocksIterator without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same RocksIterator must use
+ * external synchronization.</p>
+ *
+ * @see RocksObject
+ */
+public class SstFileReaderIterator extends AbstractRocksIterator<SstFileReader> {
+  protected SstFileReaderIterator(final SstFileReader reader, final long nativeHandle) {
+    super(reader, nativeHandle);
+  }
+
+  /**
+   * <p>Return the key for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   *
+   * @return key for the current entry.
+   */
+  public byte[] key() {
+    assert (isOwningHandle());
+    return key0(nativeHandle_);
+  }
+
+  /**
+   * <p>Return the key for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   *
+   * @param key the out-value to receive the retrieved key.
+   *     It is using position and limit. Limit is set according to key size.
+   *     Supports direct buffer only.
+   * @return The size of the actual key. If the return key is greater than the
+   *     length of {@code key}, then it indicates that the size of the
+   *     input buffer {@code key} is insufficient and partial result will
+   *     be returned.
+   */
+  public int key(final ByteBuffer key) {
+    assert (isOwningHandle());
+    final int result;
+    if (key.isDirect()) {
+      result = keyDirect0(nativeHandle_, key, key.position(), key.remaining());
+    } else {
+      result = keyByteArray0(
+          nativeHandle_, key.array(), key.arrayOffset() + key.position(), key.remaining());
+    }
+    key.limit(Math.min(key.position() + result, key.limit()));
+    return result;
+  }
+
+  /**
+   * <p>Return the value for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: !AtEnd() &amp;&amp; !AtStart()</p>
+   * @return value for the current entry.
+   */
+  public byte[] value() {
+    assert (isOwningHandle());
+    return value0(nativeHandle_);
+  }
+
+  /**
+   * <p>Return the value for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   *
+   * @param value the out-value to receive the retrieved value.
+   *     It is using position and limit. Limit is set according to value size.
+   *     Supports direct buffer only.
+   * @return The size of the actual value. If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and partial result will
+   *     be returned.
+   */
+  public int value(final ByteBuffer value) {
+    assert (isOwningHandle());
+    final int result;
+    if (value.isDirect()) {
+      result = valueDirect0(nativeHandle_, value, value.position(), value.remaining());
+    } else {
+      result = valueByteArray0(
+          nativeHandle_, value.array(), value.arrayOffset() + value.position(), value.remaining());
+    }
+    value.limit(Math.min(value.position() + result, value.limit()));
+    return result;
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+  @Override final native boolean isValid0(long handle);
+  @Override final native void seekToFirst0(long handle);
+  @Override final native void seekToLast0(long handle);
+  @Override final native void next0(long handle);
+  @Override final native void prev0(long handle);
+  @Override final native void refresh0(long handle) throws RocksDBException;
+  @Override final native void seek0(long handle, byte[] target, int targetLen);
+  @Override final native void seekForPrev0(long handle, byte[] target, int targetLen);
+  @Override final native void status0(long handle) throws RocksDBException;
+  @Override
+  final native void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen);
+  @Override
+  final native void seekForPrevDirect0(
+      long handle, ByteBuffer target, int targetOffset, int targetLen);
+  @Override
+  final native void seekByteArray0(
+      final long handle, final byte[] target, final int targetOffset, final int targetLen);
+  @Override
+  final native void seekForPrevByteArray0(
+      final long handle, final byte[] target, final int targetOffset, final int targetLen);
+
+  private native byte[] key0(long handle);
+  private native byte[] value0(long handle);
+
+  private native int keyDirect0(long handle, ByteBuffer buffer, int bufferOffset, int bufferLen);
+  private native int keyByteArray0(long handle, byte[] buffer, int bufferOffset, int bufferLen);
+  private native int valueDirect0(long handle, ByteBuffer buffer, int bufferOffset, int bufferLen);
+  private native int valueByteArray0(long handle, byte[] buffer, int bufferOffset, int bufferLen);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java
new file mode 100644
index 000000000..fe00c1a12
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java
@@ -0,0 +1,238 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * SstFileWriter is used to create sst files that can be added to the
+ * database later. All keys in files generated by SstFileWriter will have
+ * sequence number = 0.
+ */
+public class SstFileWriter extends RocksObject {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * SstFileWriter Constructor.
+   *
+   * @param envOptions {@link org.rocksdb.EnvOptions} instance.
+   * @param options {@link org.rocksdb.Options} instance.
+   */
+  public SstFileWriter(final EnvOptions envOptions, final Options options) {
+    super(newSstFileWriter(
+        envOptions.nativeHandle_, options.nativeHandle_));
+  }
+
+  /**
+   * Prepare SstFileWriter to write to a file.
+   *
+   * @param filePath the location of file
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void open(final String filePath) throws RocksDBException {
+    open(nativeHandle_, filePath);
+  }
+
+  /**
+   * Add a Put key with value to currently opened file.
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final Slice key, final Slice value) throws RocksDBException {
+    put(nativeHandle_, key.getNativeHandle(), value.getNativeHandle());
+  }
+
+  /**
+   * Add a Put key with value to currently opened file.
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final DirectSlice key, final DirectSlice value)
+      throws RocksDBException {
+    put(nativeHandle_, key.getNativeHandle(), value.getNativeHandle());
+  }
+
+  /**
+   * Add a Put key with value to currently opened file.
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBException {
+    assert key.isDirect() && value.isDirect();
+    putDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(),
+        value.remaining());
+    key.position(key.limit());
+    value.position(value.limit());
+  }
+
+ /**
+   * Add a Put key with value to currently opened file.
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final byte[] key, final byte[] value) throws RocksDBException {
+    put(nativeHandle_, key, value);
+  }
+
+  /**
+   * Add a Merge key with value to currently opened file.
+   *
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for
+   * the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final Slice key, final Slice value)
+      throws RocksDBException {
+    merge(nativeHandle_, key.getNativeHandle(), value.getNativeHandle());
+  }
+
+  /**
+   * Add a Merge key with value to currently opened file.
+   *
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for
+   * the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final byte[] key, final byte[] value)
+      throws RocksDBException {
+    merge(nativeHandle_, key, value);
+  }
+
+  /**
+   * Add a Merge key with value to currently opened file.
+   *
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for
+   * the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final DirectSlice key, final DirectSlice value)
+      throws RocksDBException {
+    merge(nativeHandle_, key.getNativeHandle(), value.getNativeHandle());
+  }
+
+  /**
+   * Add a deletion key to currently opened file.
+   *
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final Slice key) throws RocksDBException {
+    delete(nativeHandle_, key.getNativeHandle());
+  }
+
+  /**
+   * Add a deletion key to currently opened file.
+   *
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final DirectSlice key) throws RocksDBException {
+    delete(nativeHandle_, key.getNativeHandle());
+  }
+
+  /**
+   * Add a deletion key to currently opened file.
+   *
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void delete(final byte[] key) throws RocksDBException {
+    delete(nativeHandle_, key);
+  }
+
+  /**
+   * Finish the process and close the sst file.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void finish() throws RocksDBException {
+    finish(nativeHandle_);
+  }
+
+  /**
+   * Return the current file size.
+   *
+   * @return the current file size.
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public long fileSize() throws RocksDBException {
+    return fileSize(nativeHandle_);
+  }
+
+  private native static long newSstFileWriter(
+      final long envOptionsHandle, final long optionsHandle,
+      final long userComparatorHandle, final byte comparatorType);
+
+  private native static long newSstFileWriter(final long envOptionsHandle,
+      final long optionsHandle);
+
+  private native void open(final long handle, final String filePath)
+      throws RocksDBException;
+
+  private native void put(final long handle, final long keyHandle,
+      final long valueHandle) throws RocksDBException;
+      
+  private native void put(final long handle, final byte[] key,
+      final byte[] value) throws RocksDBException;
+
+  private native void putDirect(long handle, ByteBuffer key, int keyOffset, int keyLength,
+      ByteBuffer value, int valueOffset, int valueLength) throws RocksDBException;
+
+  private native long fileSize(long handle) throws RocksDBException;
+
+  private native void merge(final long handle, final long keyHandle,
+      final long valueHandle) throws RocksDBException;
+
+  private native void merge(final long handle, final byte[] key,
+      final byte[] value) throws RocksDBException;
+
+  private native void delete(final long handle, final long keyHandle)
+      throws RocksDBException;
+
+  private native void delete(final long handle, final byte[] key)
+      throws RocksDBException;
+
+  private native void finish(final long handle) throws RocksDBException;
+
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java b/src/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java
new file mode 100644
index 000000000..ea6f13565
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java
@@ -0,0 +1,15 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Handle to factory for SstPartitioner. It is used in {@link ColumnFamilyOptions}
+ */
+public abstract class SstPartitionerFactory extends RocksObject {
+  protected SstPartitionerFactory(final long nativeHandle) {
+    super(nativeHandle);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java b/src/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java
new file mode 100644
index 000000000..d513c5f15
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Fixed prefix factory. It partitions SST files using fixed prefix of the key.
+ */
+public class SstPartitionerFixedPrefixFactory extends SstPartitionerFactory {
+  public SstPartitionerFixedPrefixFactory(long prefixLength) {
+    super(newSstPartitionerFixedPrefixFactory0(prefixLength));
+  }
+
+  private native static long newSstPartitionerFixedPrefixFactory0(long prefixLength);
+
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StateType.java b/src/rocksdb/java/src/main/java/org/rocksdb/StateType.java
new file mode 100644
index 000000000..803456bb2
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StateType.java
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The type used to refer to a thread state.
+ *
+ * A state describes lower-level action of a thread
+ * such as reading / writing a file or waiting for a mutex.
+ */
+public enum StateType {
+  STATE_UNKNOWN((byte)0x0),
+  STATE_MUTEX_WAIT((byte)0x1);
+
+  private final byte value;
+
+  StateType(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value.
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the State type from the internal representation value.
+   *
+   * @param value the internal representation value.
+   *
+   * @return the state type
+   *
+   * @throws IllegalArgumentException if the value does not match
+   *     a StateType
+   */
+  static StateType fromValue(final byte value)
+      throws IllegalArgumentException {
+    for (final StateType threadType : StateType.values()) {
+      if (threadType.value == value) {
+        return threadType;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Unknown value for StateType: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Statistics.java b/src/rocksdb/java/src/main/java/org/rocksdb/Statistics.java
new file mode 100644
index 000000000..0938a6d58
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Statistics.java
@@ -0,0 +1,152 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.EnumSet;
+
+/**
+ * Statistics to analyze the performance of a db. Pointer for statistics object
+ * is managed by Options class.
+ */
+public class Statistics extends RocksObject {
+
+  public Statistics() {
+    super(newStatistics());
+  }
+
+  public Statistics(final Statistics otherStatistics) {
+    super(newStatistics(otherStatistics.nativeHandle_));
+  }
+
+  public Statistics(final EnumSet<HistogramType> ignoreHistograms) {
+    super(newStatistics(toArrayValues(ignoreHistograms)));
+  }
+
+  public Statistics(final EnumSet<HistogramType> ignoreHistograms, final Statistics otherStatistics) {
+    super(newStatistics(toArrayValues(ignoreHistograms), otherStatistics.nativeHandle_));
+  }
+
+  /**
+   * Intentionally package-private.
+   *
+   * Used from {@link DBOptions#statistics()}
+   *
+   * @param existingStatisticsHandle The C++ pointer to an existing statistics object
+   */
+  Statistics(final long existingStatisticsHandle) {
+    super(existingStatisticsHandle);
+  }
+
+  private static byte[] toArrayValues(final EnumSet<HistogramType> histogramTypes) {
+    final byte[] values = new byte[histogramTypes.size()];
+    int i = 0;
+    for(final HistogramType histogramType : histogramTypes) {
+      values[i++] = histogramType.getValue();
+    }
+    return values;
+  }
+
+  /**
+   * Gets the current stats level.
+   *
+   * @return The stats level.
+   */
+  public StatsLevel statsLevel() {
+    return StatsLevel.getStatsLevel(statsLevel(nativeHandle_));
+  }
+
+  /**
+   * Sets the stats level.
+   *
+   * @param statsLevel The stats level to set.
+   */
+  public void setStatsLevel(final StatsLevel statsLevel) {
+    setStatsLevel(nativeHandle_, statsLevel.getValue());
+  }
+
+  /**
+   * Get the count for a ticker.
+   *
+   * @param tickerType The ticker to get the count for
+   *
+   * @return The count for the ticker
+   */
+  public long getTickerCount(final TickerType tickerType) {
+    assert(isOwningHandle());
+    return getTickerCount(nativeHandle_, tickerType.getValue());
+  }
+
+  /**
+   * Get the count for a ticker and reset the tickers count.
+   *
+   * @param tickerType The ticker to get the count for
+   *
+   * @return The count for the ticker
+   */
+  public long getAndResetTickerCount(final TickerType tickerType) {
+    assert(isOwningHandle());
+    return getAndResetTickerCount(nativeHandle_, tickerType.getValue());
+  }
+
+  /**
+   * Gets the histogram data for a particular histogram.
+   *
+   * @param histogramType The histogram to retrieve the data for
+   *
+   * @return The histogram data
+   */
+  public HistogramData getHistogramData(final HistogramType histogramType) {
+    assert(isOwningHandle());
+    return getHistogramData(nativeHandle_, histogramType.getValue());
+  }
+
+  /**
+   * Gets a string representation of a particular histogram.
+   *
+   * @param histogramType The histogram to retrieve the data for
+   *
+   * @return A string representation of the histogram data
+   */
+  public String getHistogramString(final HistogramType histogramType) {
+    assert(isOwningHandle());
+    return getHistogramString(nativeHandle_, histogramType.getValue());
+  }
+
+  /**
+   * Resets all ticker and histogram stats.
+   *
+   * @throws RocksDBException if an error occurs when resetting the statistics.
+   */
+  public void reset() throws RocksDBException {
+    assert(isOwningHandle());
+    reset(nativeHandle_);
+  }
+
+  /**
+   * String representation of the statistic object.
+   */
+  @Override
+  public String toString() {
+    assert(isOwningHandle());
+    return toString(nativeHandle_);
+  }
+
+  private native static long newStatistics();
+  private native static long newStatistics(final long otherStatisticsHandle);
+  private native static long newStatistics(final byte[] ignoreHistograms);
+  private native static long newStatistics(final byte[] ignoreHistograms, final long otherStatisticsHandle);
+
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native byte statsLevel(final long handle);
+  private native void setStatsLevel(final long handle, final byte statsLevel);
+  private native long getTickerCount(final long handle, final byte tickerType);
+  private native long getAndResetTickerCount(final long handle, final byte tickerType);
+  private native HistogramData getHistogramData(final long handle, final byte histogramType);
+  private native String getHistogramString(final long handle, final byte histogramType);
+  private native void reset(final long nativeHandle) throws RocksDBException;
+  private native String toString(final long nativeHandle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java b/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java
new file mode 100644
index 000000000..fb3f57150
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java
@@ -0,0 +1,111 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * <p>Helper class to collect DB statistics periodically at a period specified in
+ * constructor. Callback function (provided in constructor) is called with
+ * every statistics collection.</p>
+ *
+ * <p>Caller should call start() to start statistics collection. Shutdown() should
+ * be called to stop stats collection and should be called before statistics (
+ * provided in constructor) reference has been disposed.</p>
+ */
+public class StatisticsCollector {
+  private final List<StatsCollectorInput> _statsCollectorInputList;
+  private final ExecutorService _executorService;
+  private final int _statsCollectionInterval;
+  private volatile boolean _isRunning = true;
+
+  /**
+   * Constructor for statistics collector.
+   *
+   * @param statsCollectorInputList List of statistics collector input.
+   * @param statsCollectionIntervalInMilliSeconds Statistics collection time
+   *        period (specified in milliseconds).
+   */
+  public StatisticsCollector(
+      final List<StatsCollectorInput> statsCollectorInputList,
+      final int statsCollectionIntervalInMilliSeconds) {
+    _statsCollectorInputList = statsCollectorInputList;
+    _statsCollectionInterval = statsCollectionIntervalInMilliSeconds;
+
+    _executorService = Executors.newSingleThreadExecutor();
+  }
+
+  public void start() {
+    _executorService.submit(collectStatistics());
+  }
+
+  /**
+   * Shuts down statistics collector.
+   *
+   * @param shutdownTimeout Time in milli-seconds to wait for shutdown before
+   *        killing the collection process.
+   * @throws java.lang.InterruptedException thrown if Threads are interrupted.
+   */
+  public void shutDown(final int shutdownTimeout) throws InterruptedException {
+    _isRunning = false;
+
+    _executorService.shutdownNow();
+    // Wait for collectStatistics runnable to finish so that disposal of
+    // statistics does not cause any exceptions to be thrown.
+    _executorService.awaitTermination(shutdownTimeout, TimeUnit.MILLISECONDS);
+  }
+
+  private Runnable collectStatistics() {
+    return new Runnable() {
+
+      @Override
+      public void run() {
+        while (_isRunning) {
+          try {
+            if(Thread.currentThread().isInterrupted()) {
+              break;
+            }
+            for(final StatsCollectorInput statsCollectorInput :
+                _statsCollectorInputList) {
+              Statistics statistics = statsCollectorInput.getStatistics();
+              StatisticsCollectorCallback statsCallback =
+                  statsCollectorInput.getCallback();
+
+              // Collect ticker data
+              for(final TickerType ticker : TickerType.values()) {
+                if(ticker != TickerType.TICKER_ENUM_MAX) {
+                  final long tickerValue = statistics.getTickerCount(ticker);
+                  statsCallback.tickerCallback(ticker, tickerValue);
+                }
+              }
+
+              // Collect histogram data
+              for(final HistogramType histogramType : HistogramType.values()) {
+                if(histogramType != HistogramType.HISTOGRAM_ENUM_MAX) {
+                  final HistogramData histogramData =
+                          statistics.getHistogramData(histogramType);
+                  statsCallback.histogramCallback(histogramType, histogramData);
+                }
+              }
+            }
+
+            Thread.sleep(_statsCollectionInterval);
+          }
+          catch (final InterruptedException e) {
+            Thread.currentThread().interrupt();
+            break;
+          }
+          catch (final Exception e) {
+            throw new RuntimeException("Error while calculating statistics", e);
+          }
+        }
+      }
+    };
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java b/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java
new file mode 100644
index 000000000..f3785b15f
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Callback interface provided to StatisticsCollector.
+ *
+ * Thread safety:
+ * StatisticsCollector doesn't make any guarantees about thread safety.
+ * If the same reference of StatisticsCollectorCallback is passed to multiple
+ * StatisticsCollector references, then its the responsibility of the
+ * user to make StatisticsCollectorCallback's implementation thread-safe.
+ *
+ */
+public interface StatisticsCollectorCallback {
+  /**
+   * Callback function to get ticker values.
+   * @param tickerType Ticker type.
+   * @param tickerCount Value of ticker type.
+  */
+  void tickerCallback(TickerType tickerType, long tickerCount);
+
+  /**
+   * Callback function to get histogram values.
+   * @param histType Histogram type.
+   * @param histData Histogram data.
+  */
+  void histogramCallback(HistogramType histType, HistogramData histData);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java b/src/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java
new file mode 100644
index 000000000..5bf43ade5
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java
@@ -0,0 +1,35 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Contains all information necessary to collect statistics from one instance
+ * of DB statistics.
+ */
+public class StatsCollectorInput {
+  private final Statistics _statistics;
+  private final StatisticsCollectorCallback _statsCallback;
+
+  /**
+   * Constructor for StatsCollectorInput.
+   *
+   * @param statistics Reference of DB statistics.
+   * @param statsCallback Reference of statistics callback interface.
+   */
+  public StatsCollectorInput(final Statistics statistics,
+      final StatisticsCollectorCallback statsCallback) {
+    _statistics = statistics;
+    _statsCallback = statsCallback;
+  }
+
+  public Statistics getStatistics() {
+    return _statistics;
+  }
+
+  public StatisticsCollectorCallback getCallback() {
+    return _statsCallback;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StatsLevel.java b/src/rocksdb/java/src/main/java/org/rocksdb/StatsLevel.java
new file mode 100644
index 000000000..58504b84a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StatsLevel.java
@@ -0,0 +1,65 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The level of Statistics to report.
+ */
+public enum StatsLevel {
+    /**
+     * Collect all stats except time inside mutex lock AND time spent on
+     * compression.
+     */
+    EXCEPT_DETAILED_TIMERS((byte) 0x0),
+
+    /**
+     * Collect all stats except the counters requiring to get time inside the
+     * mutex lock.
+     */
+    EXCEPT_TIME_FOR_MUTEX((byte) 0x1),
+
+    /**
+     * Collect all stats, including measuring duration of mutex operations.
+     *
+     * If getting time is expensive on the platform to run, it can
+     * reduce scalability to more threads, especially for writes.
+     */
+    ALL((byte) 0x2);
+
+    private final byte value;
+
+    StatsLevel(final byte value) {
+        this.value = value;
+    }
+
+    /**
+     * <p>Returns the byte value of the enumerations value.</p>
+     *
+     * @return byte representation
+     */
+    public byte getValue() {
+        return value;
+    }
+
+    /**
+     * Get StatsLevel by byte value.
+     *
+     * @param value byte representation of StatsLevel.
+     *
+     * @return {@link org.rocksdb.StatsLevel} instance.
+     * @throws java.lang.IllegalArgumentException if an invalid
+     *     value is provided.
+     */
+    public static StatsLevel getStatsLevel(final byte value) {
+        for (final StatsLevel statsLevel : StatsLevel.values()) {
+            if (statsLevel.getValue() == value){
+                return statsLevel;
+            }
+        }
+        throw new IllegalArgumentException(
+                "Illegal value provided for StatsLevel.");
+    }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Status.java b/src/rocksdb/java/src/main/java/org/rocksdb/Status.java
new file mode 100644
index 000000000..033ed3ea1
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Status.java
@@ -0,0 +1,155 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+/**
+ * Represents the status returned by a function call in RocksDB.
+ *
+ * Currently only used with {@link RocksDBException} when the
+ * status is not {@link Code#Ok}
+ */
+public class Status {
+  private final Code code;
+  /* @Nullable */ private final SubCode subCode;
+  /* @Nullable */ private final String state;
+
+  public Status(final Code code, final SubCode subCode, final String state) {
+    this.code = code;
+    this.subCode = subCode;
+    this.state = state;
+  }
+
+  /**
+   * Intentionally private as this will be called from JNI
+   */
+  private Status(final byte code, final byte subCode, final String state) {
+    this.code = Code.getCode(code);
+    this.subCode = SubCode.getSubCode(subCode);
+    this.state = state;
+  }
+
+  public Code getCode() {
+    return code;
+  }
+
+  public SubCode getSubCode() {
+    return subCode;
+  }
+
+  public String getState() {
+    return state;
+  }
+
+  public String getCodeString() {
+    final StringBuilder builder = new StringBuilder()
+        .append(code.name());
+    if(subCode != null && subCode != SubCode.None) {
+      builder.append("(")
+          .append(subCode.name())
+          .append(")");
+    }
+    return builder.toString();
+  }
+
+  // should stay in sync with /include/rocksdb/status.h:Code and /java/rocksjni/portal.h:toJavaStatusCode
+  public enum Code {
+    Ok(                 (byte)0x0),
+    NotFound(           (byte)0x1),
+    Corruption(         (byte)0x2),
+    NotSupported(       (byte)0x3),
+    InvalidArgument(    (byte)0x4),
+    IOError(            (byte)0x5),
+    MergeInProgress(    (byte)0x6),
+    Incomplete(         (byte)0x7),
+    ShutdownInProgress( (byte)0x8),
+    TimedOut(           (byte)0x9),
+    Aborted(            (byte)0xA),
+    Busy(               (byte)0xB),
+    Expired(            (byte)0xC),
+    TryAgain(           (byte)0xD),
+    Undefined(          (byte)0x7F);
+
+    private final byte value;
+
+    Code(final byte value) {
+      this.value = value;
+    }
+
+    public static Code getCode(final byte value) {
+      for (final Code code : Code.values()) {
+        if (code.value == value){
+          return code;
+        }
+      }
+      throw new IllegalArgumentException(
+          "Illegal value provided for Code (" + value + ").");
+    }
+
+    /**
+     * Returns the byte value of the enumerations value.
+     *
+     * @return byte representation
+     */
+    public byte getValue() {
+      return value;
+    }
+  }
+
+  // should stay in sync with /include/rocksdb/status.h:SubCode and /java/rocksjni/portal.h:toJavaStatusSubCode
+  public enum SubCode {
+    None(         (byte)0x0),
+    MutexTimeout( (byte)0x1),
+    LockTimeout(  (byte)0x2),
+    LockLimit(    (byte)0x3),
+    NoSpace(      (byte)0x4),
+    Deadlock(     (byte)0x5),
+    StaleFile(    (byte)0x6),
+    MemoryLimit(  (byte)0x7),
+    Undefined(    (byte)0x7F);
+
+    private final byte value;
+
+    SubCode(final byte value) {
+      this.value = value;
+    }
+
+    public static SubCode getSubCode(final byte value) {
+      for (final SubCode subCode : SubCode.values()) {
+        if (subCode.value == value){
+          return subCode;
+        }
+      }
+      throw new IllegalArgumentException(
+          "Illegal value provided for SubCode (" + value + ").");
+    }
+
+    /**
+     * Returns the byte value of the enumerations value.
+     *
+     * @return byte representation
+     */
+    public byte getValue() {
+      return value;
+    }
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    Status status = (Status) o;
+    return code == status.code && subCode == status.subCode && Objects.equals(state, status.state);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(code, subCode, state);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java b/src/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java
new file mode 100644
index 000000000..ddbccff46
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java
@@ -0,0 +1,29 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2014, Vlad Balan (vlad.gm@gmail.com).  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * StringAppendOperator is a merge operator that concatenates
+ * two strings.
+ */
+public class StringAppendOperator extends MergeOperator {
+    public StringAppendOperator() {
+        this(',');
+    }
+
+    public StringAppendOperator(char delim) {
+        super(newSharedStringAppendOperator(delim));
+    }
+
+    public StringAppendOperator(String delim) {
+      super(newSharedStringAppendOperator(delim));
+    }
+
+    private native static long newSharedStringAppendOperator(final char delim);
+    private native static long newSharedStringAppendOperator(final String delim);
+    @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java
new file mode 100644
index 000000000..5a383ade4
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java
@@ -0,0 +1,107 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+public class TableFileCreationBriefInfo {
+  private final String dbName;
+  private final String columnFamilyName;
+  private final String filePath;
+  private final int jobId;
+  private final TableFileCreationReason reason;
+
+  /**
+   * Access is private as this will only be constructed from
+   * C++ via JNI, either directly of via
+   * {@link TableFileCreationInfo#TableFileCreationInfo(long, TableProperties, Status, String,
+   * String, String, int, byte)}.
+   *
+   * @param dbName the database name
+   * @param columnFamilyName the column family name
+   * @param filePath the path to the table file
+   * @param jobId the job identifier
+   * @param tableFileCreationReasonValue the reason for creation of the table file
+   */
+  protected TableFileCreationBriefInfo(final String dbName, final String columnFamilyName,
+      final String filePath, final int jobId, final byte tableFileCreationReasonValue) {
+    this.dbName = dbName;
+    this.columnFamilyName = columnFamilyName;
+    this.filePath = filePath;
+    this.jobId = jobId;
+    this.reason = TableFileCreationReason.fromValue(tableFileCreationReasonValue);
+  }
+
+  /**
+   * Get the name of the database where the file was created.
+   *
+   * @return the name of the database.
+   */
+  public String getDbName() {
+    return dbName;
+  }
+
+  /**
+   * Get the name of the column family where the file was created.
+   *
+   * @return the name of the column family.
+   */
+  public String getColumnFamilyName() {
+    return columnFamilyName;
+  }
+
+  /**
+   * Get the path to the created file.
+   *
+   * @return the path.
+   */
+  public String getFilePath() {
+    return filePath;
+  }
+
+  /**
+   * Get the id of the job (which could be flush or compaction) that
+   * created the file.
+   *
+   * @return the id of the job.
+   */
+  public int getJobId() {
+    return jobId;
+  }
+
+  /**
+   * Get the reason for creating the table.
+   *
+   * @return the reason for creating the table.
+   */
+  public TableFileCreationReason getReason() {
+    return reason;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    TableFileCreationBriefInfo that = (TableFileCreationBriefInfo) o;
+    return jobId == that.jobId && Objects.equals(dbName, that.dbName)
+        && Objects.equals(columnFamilyName, that.columnFamilyName)
+        && Objects.equals(filePath, that.filePath) && reason == that.reason;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(dbName, columnFamilyName, filePath, jobId, reason);
+  }
+
+  @Override
+  public String toString() {
+    return "TableFileCreationBriefInfo{"
+        + "dbName='" + dbName + '\'' + ", columnFamilyName='" + columnFamilyName + '\''
+        + ", filePath='" + filePath + '\'' + ", jobId=" + jobId + ", reason=" + reason + '}';
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java
new file mode 100644
index 000000000..7742f32f1
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+public class TableFileCreationInfo extends TableFileCreationBriefInfo {
+  private final long fileSize;
+  private final TableProperties tableProperties;
+  private final Status status;
+
+  /**
+   * Access is protected as this will only be constructed from
+   * C++ via JNI.
+   *
+   * @param fileSize the size of the table file
+   * @param tableProperties the properties of the table file
+   * @param status the status of the creation operation
+   * @param dbName the database name
+   * @param columnFamilyName the column family name
+   * @param filePath the path to the table file
+   * @param jobId the job identifier
+   * @param tableFileCreationReasonValue the reason for creation of the table file
+   */
+  protected TableFileCreationInfo(final long fileSize, final TableProperties tableProperties,
+      final Status status, final String dbName, final String columnFamilyName,
+      final String filePath, final int jobId, final byte tableFileCreationReasonValue) {
+    super(dbName, columnFamilyName, filePath, jobId, tableFileCreationReasonValue);
+    this.fileSize = fileSize;
+    this.tableProperties = tableProperties;
+    this.status = status;
+  }
+
+  /**
+   * Get the size of the file.
+   *
+   * @return the size.
+   */
+  public long getFileSize() {
+    return fileSize;
+  }
+
+  /**
+   * Get the detailed properties of the created file.
+   *
+   * @return the properties.
+   */
+  public TableProperties getTableProperties() {
+    return tableProperties;
+  }
+
+  /**
+   * Get the status indicating whether the creation was successful or not.
+   *
+   * @return the status.
+   */
+  public Status getStatus() {
+    return status;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    TableFileCreationInfo that = (TableFileCreationInfo) o;
+    return fileSize == that.fileSize && Objects.equals(tableProperties, that.tableProperties)
+        && Objects.equals(status, that.status);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(fileSize, tableProperties, status);
+  }
+
+  @Override
+  public String toString() {
+    return "TableFileCreationInfo{"
+        + "fileSize=" + fileSize + ", tableProperties=" + tableProperties + ", status=" + status
+        + '}';
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java b/src/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java
new file mode 100644
index 000000000..d3984663d
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java
@@ -0,0 +1,46 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public enum TableFileCreationReason {
+  FLUSH((byte) 0x00),
+  COMPACTION((byte) 0x01),
+  RECOVERY((byte) 0x02),
+  MISC((byte) 0x03);
+
+  private final byte value;
+
+  TableFileCreationReason(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation.
+   *
+   * @return the internal representation
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the TableFileCreationReason from the internal representation value.
+   *
+   * @return the table file creation reason.
+   *
+   * @throws IllegalArgumentException if the value is unknown.
+   */
+  static TableFileCreationReason fromValue(final byte value) {
+    for (final TableFileCreationReason tableFileCreationReason : TableFileCreationReason.values()) {
+      if (tableFileCreationReason.value == value) {
+        return tableFileCreationReason;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for TableFileCreationReason: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java
new file mode 100644
index 000000000..8aad03ae8
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+public class TableFileDeletionInfo {
+  private final String dbName;
+  private final String filePath;
+  private final int jobId;
+  private final Status status;
+
+  /**
+   * Access is package private as this will only be constructed from
+   * C++ via JNI and for testing.
+   */
+  TableFileDeletionInfo(
+      final String dbName, final String filePath, final int jobId, final Status status) {
+    this.dbName = dbName;
+    this.filePath = filePath;
+    this.jobId = jobId;
+    this.status = status;
+  }
+
+  /**
+   * Get the name of the database where the file was deleted.
+   *
+   * @return the name of the database.
+   */
+  public String getDbName() {
+    return dbName;
+  }
+
+  /**
+   * Get the path to the deleted file.
+   *
+   * @return the path.
+   */
+  public String getFilePath() {
+    return filePath;
+  }
+
+  /**
+   * Get the id of the job which deleted the file.
+   *
+   * @return the id of the job.
+   */
+  public int getJobId() {
+    return jobId;
+  }
+
+  /**
+   * Get the status indicating whether the deletion was successful or not.
+   *
+   * @return the status
+   */
+  public Status getStatus() {
+    return status;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    TableFileDeletionInfo that = (TableFileDeletionInfo) o;
+    return jobId == that.jobId && Objects.equals(dbName, that.dbName)
+        && Objects.equals(filePath, that.filePath) && Objects.equals(status, that.status);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(dbName, filePath, jobId, status);
+  }
+
+  @Override
+  public String toString() {
+    return "TableFileDeletionInfo{"
+        + "dbName='" + dbName + '\'' + ", filePath='" + filePath + '\'' + ", jobId=" + jobId
+        + ", status=" + status + '}';
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TableFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/TableFilter.java
new file mode 100644
index 000000000..a39a329fb
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TableFilter.java
@@ -0,0 +1,21 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+/**
+ * Filter for iterating a table.
+ */
+public interface TableFilter {
+
+  /**
+   * A callback to determine whether relevant keys for this scan exist in a
+   * given table based on the table's properties. The callback is passed the
+   * properties of each table during iteration. If the callback returns false,
+   * the table will not be scanned. This option only affects Iterators and has
+   * no impact on point lookups.
+   *
+   * @param tableProperties the table properties.
+   *
+   * @return true if the table should be scanned, false otherwise.
+   */
+  boolean filter(final TableProperties tableProperties);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java
new file mode 100644
index 000000000..dbe524c42
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java
@@ -0,0 +1,22 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * TableFormatConfig is used to config the internal Table format of a RocksDB.
+ * To make a RocksDB to use a specific Table format, its associated
+ * TableFormatConfig should be properly set and passed into Options via
+ * Options.setTableFormatConfig() and open the db using that Options.
+ */
+public abstract class TableFormatConfig {
+  /**
+   * <p>This function should only be called by Options.setTableFormatConfig(),
+   * which will create a c++ shared-pointer to the c++ TableFactory
+   * that associated with the Java TableFormatConfig.</p>
+   *
+   * @return native handle address to native table instance.
+   */
+  abstract protected long newTableFactoryHandle();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java b/src/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java
new file mode 100644
index 000000000..096341a4c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java
@@ -0,0 +1,426 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Objects;
+
+/**
+ * TableProperties contains read-only properties of its associated
+ * table.
+ */
+public class TableProperties {
+  private final long dataSize;
+  private final long indexSize;
+  private final long indexPartitions;
+  private final long topLevelIndexSize;
+  private final long indexKeyIsUserKey;
+  private final long indexValueIsDeltaEncoded;
+  private final long filterSize;
+  private final long rawKeySize;
+  private final long rawValueSize;
+  private final long numDataBlocks;
+  private final long numEntries;
+  private final long numDeletions;
+  private final long numMergeOperands;
+  private final long numRangeDeletions;
+  private final long formatVersion;
+  private final long fixedKeyLen;
+  private final long columnFamilyId;
+  private final long creationTime;
+  private final long oldestKeyTime;
+  private final long slowCompressionEstimatedDataSize;
+  private final long fastCompressionEstimatedDataSize;
+  private final long externalSstFileGlobalSeqnoOffset;
+  private final byte[] columnFamilyName;
+  private final String filterPolicyName;
+  private final String comparatorName;
+  private final String mergeOperatorName;
+  private final String prefixExtractorName;
+  private final String propertyCollectorsNames;
+  private final String compressionName;
+  private final Map<String, String> userCollectedProperties;
+  private final Map<String, String> readableProperties;
+
+  /**
+   * Access is package private as this will only be constructed from
+   * C++ via JNI and for testing.
+   */
+  TableProperties(final long dataSize, final long indexSize, final long indexPartitions,
+      final long topLevelIndexSize, final long indexKeyIsUserKey,
+      final long indexValueIsDeltaEncoded, final long filterSize, final long rawKeySize,
+      final long rawValueSize, final long numDataBlocks, final long numEntries,
+      final long numDeletions, final long numMergeOperands, final long numRangeDeletions,
+      final long formatVersion, final long fixedKeyLen, final long columnFamilyId,
+      final long creationTime, final long oldestKeyTime,
+      final long slowCompressionEstimatedDataSize, final long fastCompressionEstimatedDataSize,
+      final long externalSstFileGlobalSeqnoOffset, final byte[] columnFamilyName,
+      final String filterPolicyName, final String comparatorName, final String mergeOperatorName,
+      final String prefixExtractorName, final String propertyCollectorsNames,
+      final String compressionName, final Map<String, String> userCollectedProperties,
+      final Map<String, String> readableProperties) {
+    this.dataSize = dataSize;
+    this.indexSize = indexSize;
+    this.indexPartitions = indexPartitions;
+    this.topLevelIndexSize = topLevelIndexSize;
+    this.indexKeyIsUserKey = indexKeyIsUserKey;
+    this.indexValueIsDeltaEncoded = indexValueIsDeltaEncoded;
+    this.filterSize = filterSize;
+    this.rawKeySize = rawKeySize;
+    this.rawValueSize = rawValueSize;
+    this.numDataBlocks = numDataBlocks;
+    this.numEntries = numEntries;
+    this.numDeletions = numDeletions;
+    this.numMergeOperands = numMergeOperands;
+    this.numRangeDeletions = numRangeDeletions;
+    this.formatVersion = formatVersion;
+    this.fixedKeyLen = fixedKeyLen;
+    this.columnFamilyId = columnFamilyId;
+    this.creationTime = creationTime;
+    this.oldestKeyTime = oldestKeyTime;
+    this.slowCompressionEstimatedDataSize = slowCompressionEstimatedDataSize;
+    this.fastCompressionEstimatedDataSize = fastCompressionEstimatedDataSize;
+    this.externalSstFileGlobalSeqnoOffset = externalSstFileGlobalSeqnoOffset;
+    this.columnFamilyName = columnFamilyName;
+    this.filterPolicyName = filterPolicyName;
+    this.comparatorName = comparatorName;
+    this.mergeOperatorName = mergeOperatorName;
+    this.prefixExtractorName = prefixExtractorName;
+    this.propertyCollectorsNames = propertyCollectorsNames;
+    this.compressionName = compressionName;
+    this.userCollectedProperties = userCollectedProperties;
+    this.readableProperties = readableProperties;
+  }
+
+  /**
+   * Get the total size of all data blocks.
+   *
+   * @return the total size of all data blocks.
+   */
+  public long getDataSize() {
+    return dataSize;
+  }
+
+  /**
+   * Get the size of index block.
+   *
+   * @return the size of index block.
+   */
+  public long getIndexSize() {
+    return indexSize;
+  }
+
+  /**
+   * Get the total number of index partitions
+   * if {@link IndexType#kTwoLevelIndexSearch} is used.
+   *
+   * @return the total number of index partitions.
+   */
+  public long getIndexPartitions() {
+    return indexPartitions;
+  }
+
+  /**
+   * Size of the top-level index
+   * if {@link IndexType#kTwoLevelIndexSearch} is used.
+   *
+   * @return the size of the top-level index.
+   */
+  public long getTopLevelIndexSize() {
+    return topLevelIndexSize;
+  }
+
+  /**
+   * Whether the index key is user key.
+   * Otherwise it includes 8 byte of sequence
+   * number added by internal key format.
+   *
+   * @return the index key
+   */
+  public long getIndexKeyIsUserKey() {
+    return indexKeyIsUserKey;
+  }
+
+  /**
+   * Whether delta encoding is used to encode the index values.
+   *
+   * @return whether delta encoding is used to encode the index values.
+   */
+  public long getIndexValueIsDeltaEncoded() {
+    return indexValueIsDeltaEncoded;
+  }
+
+  /**
+   * Get the size of filter block.
+   *
+   * @return the size of filter block.
+   */
+  public long getFilterSize() {
+    return filterSize;
+  }
+
+  /**
+   * Get the total raw key size.
+   *
+   * @return the total raw key size.
+   */
+  public long getRawKeySize() {
+    return rawKeySize;
+  }
+
+  /**
+   * Get the total raw value size.
+   *
+   * @return the total raw value size.
+   */
+  public long getRawValueSize() {
+    return rawValueSize;
+  }
+
+  /**
+   * Get the number of blocks in this table.
+   *
+   * @return the number of blocks in this table.
+   */
+  public long getNumDataBlocks() {
+    return numDataBlocks;
+  }
+
+  /**
+   * Get the number of entries in this table.
+   *
+   * @return the number of entries in this table.
+   */
+  public long getNumEntries() {
+    return numEntries;
+  }
+
+  /**
+   * Get the number of deletions in the table.
+   *
+   * @return the number of deletions in the table.
+   */
+  public long getNumDeletions() {
+    return numDeletions;
+  }
+
+  /**
+   * Get the number of merge operands in the table.
+   *
+   * @return the number of merge operands in the table.
+   */
+  public long getNumMergeOperands() {
+    return numMergeOperands;
+  }
+
+  /**
+   * Get the number of range deletions in this table.
+   *
+   * @return the number of range deletions in this table.
+   */
+  public long getNumRangeDeletions() {
+    return numRangeDeletions;
+  }
+
+  /**
+   * Get the format version, reserved for backward compatibility.
+   *
+   * @return the format version.
+   */
+  public long getFormatVersion() {
+    return formatVersion;
+  }
+
+  /**
+   * Get the length of the keys.
+   *
+   * @return 0 when the key is variable length, otherwise number of
+   *     bytes for each key.
+   */
+  public long getFixedKeyLen() {
+    return fixedKeyLen;
+  }
+
+  /**
+   * Get the ID of column family for this SST file,
+   * corresponding to the column family identified by
+   * {@link #getColumnFamilyName()}.
+   *
+   * @return the id of the column family.
+   */
+  public long getColumnFamilyId() {
+    return columnFamilyId;
+  }
+
+  /**
+   * The time when the SST file was created.
+   * Since SST files are immutable, this is equivalent
+   * to last modified time.
+   *
+   * @return the created time.
+   */
+  public long getCreationTime() {
+    return creationTime;
+  }
+
+  /**
+   * Get the timestamp of the earliest key.
+   *
+   * @return 0 means unknown, otherwise the timestamp.
+   */
+  public long getOldestKeyTime() {
+    return oldestKeyTime;
+  }
+
+  /**
+   * Get the estimated size of data blocks compressed with a relatively slower
+   * compression algorithm.
+   *
+   * @return 0 means unknown, otherwise the timestamp.
+   */
+  public long getSlowCompressionEstimatedDataSize() {
+    return slowCompressionEstimatedDataSize;
+  }
+
+  /**
+   * Get the estimated size of data blocks compressed with a relatively faster
+   * compression algorithm.
+   *
+   * @return 0 means unknown, otherwise the timestamp.
+   */
+  public long getFastCompressionEstimatedDataSize() {
+    return fastCompressionEstimatedDataSize;
+  }
+
+  /**
+   * Get the name of the column family with which this
+   * SST file is associated.
+   *
+   * @return the name of the column family, or null if the
+   *     column family is unknown.
+   */
+  /*@Nullable*/ public byte[] getColumnFamilyName() {
+    return columnFamilyName;
+  }
+
+  /**
+   * Get the name of the filter policy used in this table.
+   *
+   * @return the name of the filter policy, or null if
+   *     no filter policy is used.
+   */
+  /*@Nullable*/ public String getFilterPolicyName() {
+    return filterPolicyName;
+  }
+
+  /**
+   * Get the name of the comparator used in this table.
+   *
+   * @return the name of the comparator.
+   */
+  public String getComparatorName() {
+    return comparatorName;
+  }
+
+  /**
+   * Get the name of the merge operator used in this table.
+   *
+   * @return the name of the merge operator, or null if no merge operator
+   *      is used.
+   */
+  /*@Nullable*/ public String getMergeOperatorName() {
+    return mergeOperatorName;
+  }
+
+  /**
+   * Get the name of the prefix extractor used in this table.
+   *
+   * @return the name of the prefix extractor, or null if no prefix
+   *     extractor is used.
+   */
+  /*@Nullable*/ public String getPrefixExtractorName() {
+    return prefixExtractorName;
+  }
+
+  /**
+   * Get the names of the property collectors factories used in this table.
+   *
+   * @return the names of the property collector factories separated
+   *     by commas, e.g. {collector_name[1]},{collector_name[2]},...
+   */
+  public String getPropertyCollectorsNames() {
+    return propertyCollectorsNames;
+  }
+
+  /**
+   * Get the name of the compression algorithm used to compress the SST files.
+   *
+   * @return the name of the compression algorithm.
+   */
+  public String getCompressionName() {
+    return compressionName;
+  }
+
+  /**
+   * Get the user collected properties.
+   *
+   * @return the user collected properties.
+   */
+  public Map<String, String> getUserCollectedProperties() {
+    return userCollectedProperties;
+  }
+
+  /**
+   * Get the readable properties.
+   *
+   * @return the readable properties.
+   */
+  public Map<String, String> getReadableProperties() {
+    return readableProperties;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    TableProperties that = (TableProperties) o;
+    return dataSize == that.dataSize && indexSize == that.indexSize
+        && indexPartitions == that.indexPartitions && topLevelIndexSize == that.topLevelIndexSize
+        && indexKeyIsUserKey == that.indexKeyIsUserKey
+        && indexValueIsDeltaEncoded == that.indexValueIsDeltaEncoded
+        && filterSize == that.filterSize && rawKeySize == that.rawKeySize
+        && rawValueSize == that.rawValueSize && numDataBlocks == that.numDataBlocks
+        && numEntries == that.numEntries && numDeletions == that.numDeletions
+        && numMergeOperands == that.numMergeOperands && numRangeDeletions == that.numRangeDeletions
+        && formatVersion == that.formatVersion && fixedKeyLen == that.fixedKeyLen
+        && columnFamilyId == that.columnFamilyId && creationTime == that.creationTime
+        && oldestKeyTime == that.oldestKeyTime
+        && slowCompressionEstimatedDataSize == that.slowCompressionEstimatedDataSize
+        && fastCompressionEstimatedDataSize == that.fastCompressionEstimatedDataSize
+        && externalSstFileGlobalSeqnoOffset == that.externalSstFileGlobalSeqnoOffset
+        && Arrays.equals(columnFamilyName, that.columnFamilyName)
+        && Objects.equals(filterPolicyName, that.filterPolicyName)
+        && Objects.equals(comparatorName, that.comparatorName)
+        && Objects.equals(mergeOperatorName, that.mergeOperatorName)
+        && Objects.equals(prefixExtractorName, that.prefixExtractorName)
+        && Objects.equals(propertyCollectorsNames, that.propertyCollectorsNames)
+        && Objects.equals(compressionName, that.compressionName)
+        && Objects.equals(userCollectedProperties, that.userCollectedProperties)
+        && Objects.equals(readableProperties, that.readableProperties);
+  }
+
+  @Override
+  public int hashCode() {
+    int result = Objects.hash(dataSize, indexSize, indexPartitions, topLevelIndexSize,
+        indexKeyIsUserKey, indexValueIsDeltaEncoded, filterSize, rawKeySize, rawValueSize,
+        numDataBlocks, numEntries, numDeletions, numMergeOperands, numRangeDeletions, formatVersion,
+        fixedKeyLen, columnFamilyId, creationTime, oldestKeyTime, slowCompressionEstimatedDataSize,
+        fastCompressionEstimatedDataSize, externalSstFileGlobalSeqnoOffset, filterPolicyName,
+        comparatorName, mergeOperatorName, prefixExtractorName, propertyCollectorsNames,
+        compressionName, userCollectedProperties, readableProperties);
+    result = 31 * result + Arrays.hashCode(columnFamilyName);
+    return result;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ThreadStatus.java b/src/rocksdb/java/src/main/java/org/rocksdb/ThreadStatus.java
new file mode 100644
index 000000000..062df5889
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ThreadStatus.java
@@ -0,0 +1,224 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Map;
+
+public class ThreadStatus {
+  private final long threadId;
+  private final ThreadType threadType;
+  private final String dbName;
+  private final String cfName;
+  private final OperationType operationType;
+  private final long operationElapsedTime; // microseconds
+  private final OperationStage operationStage;
+  private final long  operationProperties[];
+  private final StateType stateType;
+
+  /**
+   * Invoked from C++ via JNI
+   */
+  private ThreadStatus(final long threadId,
+                       final byte threadTypeValue,
+                       final String dbName,
+                       final String cfName,
+                       final byte operationTypeValue,
+                       final long operationElapsedTime,
+                       final byte operationStageValue,
+                       final long[] operationProperties,
+                       final byte stateTypeValue) {
+    this.threadId = threadId;
+    this.threadType = ThreadType.fromValue(threadTypeValue);
+    this.dbName = dbName;
+    this.cfName = cfName;
+    this.operationType = OperationType.fromValue(operationTypeValue);
+    this.operationElapsedTime = operationElapsedTime;
+    this.operationStage = OperationStage.fromValue(operationStageValue);
+    this.operationProperties = operationProperties;
+    this.stateType = StateType.fromValue(stateTypeValue);
+  }
+
+  /**
+   * Get the unique ID of the thread.
+   *
+   * @return the thread id
+   */
+  public long getThreadId() {
+    return threadId;
+  }
+
+  /**
+   * Get the type of the thread.
+   *
+   * @return the type of the thread.
+   */
+  public ThreadType getThreadType() {
+    return threadType;
+  }
+
+  /**
+   * The name of the DB instance that the thread is currently
+   * involved with.
+   *
+   * @return the name of the db, or null if the thread is not involved
+   *     in any DB operation.
+   */
+  /* @Nullable */ public String getDbName() {
+    return dbName;
+  }
+
+  /**
+   * The name of the Column Family that the thread is currently
+   * involved with.
+   *
+   * @return the name of the db, or null if the thread is not involved
+   *     in any column Family operation.
+   */
+  /* @Nullable */ public String getCfName() {
+    return cfName;
+  }
+
+  /**
+   * Get the operation (high-level action) that the current thread is involved
+   * with.
+   *
+   * @return the operation
+   */
+  public OperationType getOperationType() {
+    return operationType;
+  }
+
+  /**
+   * Get the elapsed time of the current thread operation in microseconds.
+   *
+   * @return the elapsed time
+   */
+  public long getOperationElapsedTime() {
+    return operationElapsedTime;
+  }
+
+  /**
+   * Get the current stage where the thread is involved in the current
+   * operation.
+   *
+   * @return the current stage of the current operation
+   */
+  public OperationStage getOperationStage() {
+    return operationStage;
+  }
+
+  /**
+   * Get the list of properties that describe some details about the current
+   * operation.
+   *
+   * Each field in might have different meanings for different operations.
+   *
+   * @return the properties
+   */
+  public long[] getOperationProperties() {
+    return operationProperties;
+  }
+
+  /**
+   * Get the state (lower-level action) that the current thread is involved
+   * with.
+   *
+   * @return the state
+   */
+  public StateType getStateType() {
+    return stateType;
+  }
+
+  /**
+   * Get the name of the thread type.
+   *
+   * @param threadType the thread type
+   *
+   * @return the name of the thread type.
+   */
+  public static String getThreadTypeName(final ThreadType threadType) {
+    return getThreadTypeName(threadType.getValue());
+  }
+
+  /**
+   * Get the name of an operation given its type.
+   *
+   * @param operationType the type of operation.
+   *
+   * @return the name of the operation.
+   */
+  public static String getOperationName(final OperationType operationType) {
+    return getOperationName(operationType.getValue());
+  }
+
+  public static String microsToString(final long operationElapsedTime) {
+    return microsToStringNative(operationElapsedTime);
+  }
+
+  /**
+   * Obtain a human-readable string describing the specified operation stage.
+   *
+   * @param operationStage the stage of the operation.
+   *
+   * @return the description of the operation stage.
+   */
+  public static String getOperationStageName(
+      final OperationStage operationStage) {
+    return getOperationStageName(operationStage.getValue());
+  }
+
+  /**
+   * Obtain the name of the "i"th operation property of the
+   * specified operation.
+   *
+   * @param operationType the operation type.
+   * @param i the index of the operation property.
+   *
+   * @return the name of the operation property
+   */
+  public static String getOperationPropertyName(
+      final OperationType operationType, final int i) {
+    return getOperationPropertyName(operationType.getValue(), i);
+  }
+
+  /**
+   * Translate the "i"th property of the specified operation given
+   * a property value.
+   *
+   * @param operationType the operation type.
+   * @param operationProperties the operation properties.
+   *
+   * @return the property values.
+   */
+  public static Map<String, Long> interpretOperationProperties(
+      final OperationType operationType, final long[] operationProperties) {
+    return interpretOperationProperties(operationType.getValue(),
+        operationProperties);
+  }
+
+  /**
+   * Obtain the name of a state given its type.
+   *
+   * @param stateType the state type.
+   *
+   * @return the name of the state.
+   */
+  public static String getStateName(final StateType stateType) {
+    return getStateName(stateType.getValue());
+  }
+
+  private static native String getThreadTypeName(final byte threadTypeValue);
+  private static native String getOperationName(final byte operationTypeValue);
+  private static native String microsToStringNative(
+      final long operationElapsedTime);
+  private static native String getOperationStageName(
+      final byte operationStageTypeValue);
+  private static native String getOperationPropertyName(
+      final byte operationTypeValue, final int i);
+  private static native Map<String, Long>interpretOperationProperties(
+      final byte operationTypeValue, final long[] operationProperties);
+  private static native String getStateName(final byte stateTypeValue);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ThreadType.java b/src/rocksdb/java/src/main/java/org/rocksdb/ThreadType.java
new file mode 100644
index 000000000..cc329f442
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ThreadType.java
@@ -0,0 +1,65 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The type of a thread.
+ */
+public enum ThreadType {
+  /**
+   * RocksDB BG thread in high-pri thread pool.
+   */
+  HIGH_PRIORITY((byte)0x0),
+
+  /**
+   * RocksDB BG thread in low-pri thread pool.
+   */
+  LOW_PRIORITY((byte)0x1),
+
+  /**
+   * User thread (Non-RocksDB BG thread).
+   */
+  USER((byte)0x2),
+
+  /**
+   * RocksDB BG thread in bottom-pri thread pool
+   */
+  BOTTOM_PRIORITY((byte)0x3);
+
+  private final byte value;
+
+  ThreadType(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value.
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the Thread type from the internal representation value.
+   *
+   * @param value the internal representation value.
+   *
+   * @return the thread type
+   *
+   * @throws IllegalArgumentException if the value does not match a ThreadType
+   */
+  static ThreadType fromValue(final byte value)
+      throws IllegalArgumentException {
+    for (final ThreadType threadType : ThreadType.values()) {
+      if (threadType.value == value) {
+        return threadType;
+      }
+    }
+    throw new IllegalArgumentException("Unknown value for ThreadType: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TickerType.java b/src/rocksdb/java/src/main/java/org/rocksdb/TickerType.java
new file mode 100644
index 000000000..0d00add5b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TickerType.java
@@ -0,0 +1,874 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The logical mapping of tickers defined in rocksdb::Tickers.
+ *
+ * Java byte value mappings don't align 1:1 to the c++ values. c++ rocksdb::Tickers enumeration type
+ * is uint32_t and java org.rocksdb.TickerType is byte, this causes mapping issues when
+ * rocksdb::Tickers value is greater then 127 (0x7F) for jbyte jni interface as range greater is not
+ * available. Without breaking interface in minor versions, value mappings for
+ * org.rocksdb.TickerType leverage full byte range [-128 (-0x80), (0x7F)]. Newer tickers added
+ * should descend into negative values until TICKER_ENUM_MAX reaches -128 (-0x80).
+ */
+public enum TickerType {
+
+    /**
+     * total block cache misses
+     *
+     * REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+     *     BLOCK_CACHE_FILTER_MISS +
+     *     BLOCK_CACHE_DATA_MISS;
+     */
+    BLOCK_CACHE_MISS((byte) 0x0),
+
+    /**
+     * total block cache hit
+     *
+     * REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+     *     BLOCK_CACHE_FILTER_HIT +
+     *     BLOCK_CACHE_DATA_HIT;
+     */
+    BLOCK_CACHE_HIT((byte) 0x1),
+
+    BLOCK_CACHE_ADD((byte) 0x2),
+
+    /**
+     * # of failures when adding blocks to block cache.
+     */
+    BLOCK_CACHE_ADD_FAILURES((byte) 0x3),
+
+    /**
+     * # of times cache miss when accessing index block from block cache.
+     */
+    BLOCK_CACHE_INDEX_MISS((byte) 0x4),
+
+    /**
+     * # of times cache hit when accessing index block from block cache.
+     */
+    BLOCK_CACHE_INDEX_HIT((byte) 0x5),
+
+    /**
+     * # of index blocks added to block cache.
+     */
+    BLOCK_CACHE_INDEX_ADD((byte) 0x6),
+
+    /**
+     * # of bytes of index blocks inserted into cache
+     */
+    BLOCK_CACHE_INDEX_BYTES_INSERT((byte) 0x7),
+
+    /**
+     * # of bytes of index block erased from cache
+     */
+    BLOCK_CACHE_INDEX_BYTES_EVICT((byte) 0x8),
+
+    /**
+     * # of times cache miss when accessing filter block from block cache.
+     */
+    BLOCK_CACHE_FILTER_MISS((byte) 0x9),
+
+    /**
+     * # of times cache hit when accessing filter block from block cache.
+     */
+    BLOCK_CACHE_FILTER_HIT((byte) 0xA),
+
+    /**
+     * # of filter blocks added to block cache.
+     */
+    BLOCK_CACHE_FILTER_ADD((byte) 0xB),
+
+    /**
+     * # of bytes of bloom filter blocks inserted into cache
+     */
+    BLOCK_CACHE_FILTER_BYTES_INSERT((byte) 0xC),
+
+    /**
+     * # of bytes of bloom filter block erased from cache
+     */
+    BLOCK_CACHE_FILTER_BYTES_EVICT((byte) 0xD),
+
+    /**
+     * # of times cache miss when accessing data block from block cache.
+     */
+    BLOCK_CACHE_DATA_MISS((byte) 0xE),
+
+    /**
+     * # of times cache hit when accessing data block from block cache.
+     */
+    BLOCK_CACHE_DATA_HIT((byte) 0xF),
+
+    /**
+     * # of data blocks added to block cache.
+     */
+    BLOCK_CACHE_DATA_ADD((byte) 0x10),
+
+    /**
+     * # of bytes of data blocks inserted into cache
+     */
+    BLOCK_CACHE_DATA_BYTES_INSERT((byte) 0x11),
+
+    /**
+     * # of bytes read from cache.
+     */
+    BLOCK_CACHE_BYTES_READ((byte) 0x12),
+
+    /**
+     * # of bytes written into cache.
+     */
+    BLOCK_CACHE_BYTES_WRITE((byte) 0x13),
+
+    /**
+     * # of times bloom filter has avoided file reads.
+     */
+    BLOOM_FILTER_USEFUL((byte) 0x14),
+
+    /**
+     * # persistent cache hit
+     */
+    PERSISTENT_CACHE_HIT((byte) 0x15),
+
+    /**
+     * # persistent cache miss
+     */
+    PERSISTENT_CACHE_MISS((byte) 0x16),
+
+    /**
+     * # total simulation block cache hits
+     */
+    SIM_BLOCK_CACHE_HIT((byte) 0x17),
+
+    /**
+     * # total simulation block cache misses
+     */
+    SIM_BLOCK_CACHE_MISS((byte) 0x18),
+
+    /**
+     * # of memtable hits.
+     */
+    MEMTABLE_HIT((byte) 0x19),
+
+    /**
+     * # of memtable misses.
+     */
+    MEMTABLE_MISS((byte) 0x1A),
+
+    /**
+     * # of Get() queries served by L0
+     */
+    GET_HIT_L0((byte) 0x1B),
+
+    /**
+     * # of Get() queries served by L1
+     */
+    GET_HIT_L1((byte) 0x1C),
+
+    /**
+     * # of Get() queries served by L2 and up
+     */
+    GET_HIT_L2_AND_UP((byte) 0x1D),
+
+    /**
+     * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
+     * There are 4 reasons currently.
+     */
+
+    /**
+     * key was written with a newer value.
+     */
+    COMPACTION_KEY_DROP_NEWER_ENTRY((byte) 0x1E),
+
+    /**
+     * Also includes keys dropped for range del.
+     * The key is obsolete.
+     */
+    COMPACTION_KEY_DROP_OBSOLETE((byte) 0x1F),
+
+    /**
+     * key was covered by a range tombstone.
+     */
+    COMPACTION_KEY_DROP_RANGE_DEL((byte) 0x20),
+
+    /**
+     * User compaction function has dropped the key.
+     */
+    COMPACTION_KEY_DROP_USER((byte) 0x21),
+
+    /**
+     * all keys in range were deleted.
+     */
+    COMPACTION_RANGE_DEL_DROP_OBSOLETE((byte) 0x22),
+
+    /**
+     * Number of keys written to the database via the Put and Write call's.
+     */
+    NUMBER_KEYS_WRITTEN((byte) 0x23),
+
+    /**
+     * Number of Keys read.
+     */
+    NUMBER_KEYS_READ((byte) 0x24),
+
+    /**
+     * Number keys updated, if inplace update is enabled
+     */
+    NUMBER_KEYS_UPDATED((byte) 0x25),
+
+    /**
+     * The number of uncompressed bytes issued by DB::Put(), DB::Delete(),\
+     * DB::Merge(), and DB::Write().
+     */
+    BYTES_WRITTEN((byte) 0x26),
+
+    /**
+     * The number of uncompressed bytes read from DB::Get().  It could be
+     * either from memtables, cache, or table files.
+     *
+     * For the number of logical bytes read from DB::MultiGet(),
+     * please use {@link #NUMBER_MULTIGET_BYTES_READ}.
+     */
+    BYTES_READ((byte) 0x27),
+
+    /**
+     * The number of calls to seek.
+     */
+    NUMBER_DB_SEEK((byte) 0x28),
+
+    /**
+     * The number of calls to next.
+     */
+    NUMBER_DB_NEXT((byte) 0x29),
+
+    /**
+     * The number of calls to prev.
+     */
+    NUMBER_DB_PREV((byte) 0x2A),
+
+    /**
+     * The number of calls to seek that returned data.
+     */
+    NUMBER_DB_SEEK_FOUND((byte) 0x2B),
+
+    /**
+     * The number of calls to next that returned data.
+     */
+    NUMBER_DB_NEXT_FOUND((byte) 0x2C),
+
+    /**
+     * The number of calls to prev that returned data.
+     */
+    NUMBER_DB_PREV_FOUND((byte) 0x2D),
+
+    /**
+     * The number of uncompressed bytes read from an iterator.
+     * Includes size of key and value.
+     */
+    ITER_BYTES_READ((byte) 0x2E),
+
+    NO_FILE_CLOSES((byte) 0x2F),
+
+    NO_FILE_OPENS((byte) 0x30),
+
+    NO_FILE_ERRORS((byte) 0x31),
+
+    /**
+     * Time system had to wait to do LO-L1 compactions.
+     *
+     * @deprecated
+     */
+    @Deprecated
+    STALL_L0_SLOWDOWN_MICROS((byte) 0x32),
+
+    /**
+     * Time system had to wait to move memtable to L1.
+     *
+     * @deprecated
+     */
+    @Deprecated
+    STALL_MEMTABLE_COMPACTION_MICROS((byte) 0x33),
+
+    /**
+     * write throttle because of too many files in L0.
+     *
+     * @deprecated
+     */
+    @Deprecated
+    STALL_L0_NUM_FILES_MICROS((byte) 0x34),
+
+    /**
+     * Writer has to wait for compaction or flush to finish.
+     */
+    STALL_MICROS((byte) 0x35),
+
+    /**
+     * The wait time for db mutex.
+     *
+     * Disabled by default. To enable it set stats level to {@link StatsLevel#ALL}
+     */
+    DB_MUTEX_WAIT_MICROS((byte) 0x36),
+
+    RATE_LIMIT_DELAY_MILLIS((byte) 0x37),
+
+    /**
+     * Number of iterators created.
+     *
+     */
+    NO_ITERATORS((byte) 0x38),
+
+    /**
+     * Number of MultiGet calls.
+     */
+    NUMBER_MULTIGET_CALLS((byte) 0x39),
+
+    /**
+     * Number of MultiGet keys read.
+     */
+    NUMBER_MULTIGET_KEYS_READ((byte) 0x3A),
+
+    /**
+     * Number of MultiGet bytes read.
+     */
+    NUMBER_MULTIGET_BYTES_READ((byte) 0x3B),
+
+    /**
+     * Number of deletes records that were not required to be
+     * written to storage because key does not exist.
+     */
+    NUMBER_FILTERED_DELETES((byte) 0x3C),
+    NUMBER_MERGE_FAILURES((byte) 0x3D),
+
+    /**
+     * Number of times bloom was checked before creating iterator on a
+     * file, and the number of times the check was useful in avoiding
+     * iterator creation (and thus likely IOPs).
+     */
+    BLOOM_FILTER_PREFIX_CHECKED((byte) 0x3E),
+    BLOOM_FILTER_PREFIX_USEFUL((byte) 0x3F),
+
+    /**
+     * Number of times we had to reseek inside an iteration to skip
+     * over large number of keys with same userkey.
+     */
+    NUMBER_OF_RESEEKS_IN_ITERATION((byte) 0x40),
+
+    /**
+     * Record the number of calls to {@link RocksDB#getUpdatesSince(long)}. Useful to keep track of
+     * transaction log iterator refreshes.
+     */
+    GET_UPDATES_SINCE_CALLS((byte) 0x41),
+
+    /**
+     * Miss in the compressed block cache.
+     */
+    BLOCK_CACHE_COMPRESSED_MISS((byte) 0x42),
+
+    /**
+     * Hit in the compressed block cache.
+     */
+    BLOCK_CACHE_COMPRESSED_HIT((byte) 0x43),
+
+    /**
+     * Number of blocks added to compressed block cache.
+     */
+    BLOCK_CACHE_COMPRESSED_ADD((byte) 0x44),
+
+    /**
+     * Number of failures when adding blocks to compressed block cache.
+     */
+    BLOCK_CACHE_COMPRESSED_ADD_FAILURES((byte) 0x45),
+
+    /**
+     * Number of times WAL sync is done.
+     */
+    WAL_FILE_SYNCED((byte) 0x46),
+
+    /**
+     * Number of bytes written to WAL.
+     */
+    WAL_FILE_BYTES((byte) 0x47),
+
+    /**
+     * Writes can be processed by requesting thread or by the thread at the
+     * head of the writers queue.
+     */
+    WRITE_DONE_BY_SELF((byte) 0x48),
+
+    /**
+     * Equivalent to writes done for others.
+     */
+    WRITE_DONE_BY_OTHER((byte) 0x49),
+
+    /**
+     * Number of writes ending up with timed-out.
+     */
+    WRITE_TIMEDOUT((byte) 0x4A),
+
+    /**
+     * Number of Write calls that request WAL.
+     */
+    WRITE_WITH_WAL((byte) 0x4B),
+
+    /**
+     * Bytes read during compaction.
+     */
+    COMPACT_READ_BYTES((byte) 0x4C),
+
+    /**
+     * Bytes written during compaction.
+     */
+    COMPACT_WRITE_BYTES((byte) 0x4D),
+
+    /**
+     * Bytes written during flush.
+     */
+    FLUSH_WRITE_BYTES((byte) 0x4E),
+
+    /**
+     * Number of table's properties loaded directly from file, without creating
+     * table reader object.
+     */
+    NUMBER_DIRECT_LOAD_TABLE_PROPERTIES((byte) 0x4F),
+    NUMBER_SUPERVERSION_ACQUIRES((byte) 0x50),
+    NUMBER_SUPERVERSION_RELEASES((byte) 0x51),
+    NUMBER_SUPERVERSION_CLEANUPS((byte) 0x52),
+
+    /**
+     * # of compressions/decompressions executed
+     */
+    NUMBER_BLOCK_COMPRESSED((byte) 0x53),
+    NUMBER_BLOCK_DECOMPRESSED((byte) 0x54),
+
+    NUMBER_BLOCK_NOT_COMPRESSED((byte) 0x55),
+    MERGE_OPERATION_TOTAL_TIME((byte) 0x56),
+    FILTER_OPERATION_TOTAL_TIME((byte) 0x57),
+
+    /**
+     * Row cache.
+     */
+    ROW_CACHE_HIT((byte) 0x58),
+    ROW_CACHE_MISS((byte) 0x59),
+
+    /**
+     * Read amplification statistics.
+     *
+     * Read amplification can be calculated using this formula
+     * (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
+     *
+     * REQUIRES: ReadOptions::read_amp_bytes_per_bit to be enabled
+     */
+
+    /**
+     * Estimate of total bytes actually used.
+     */
+    READ_AMP_ESTIMATE_USEFUL_BYTES((byte) 0x5A),
+
+    /**
+     * Total size of loaded data blocks.
+     */
+    READ_AMP_TOTAL_READ_BYTES((byte) 0x5B),
+
+    /**
+     * Number of refill intervals where rate limiter's bytes are fully consumed.
+     */
+    NUMBER_RATE_LIMITER_DRAINS((byte) 0x5C),
+
+    /**
+     * Number of internal skipped during iteration
+     */
+    NUMBER_ITER_SKIP((byte) 0x5D),
+
+    /**
+     * Number of MultiGet keys found (vs number requested)
+     */
+    NUMBER_MULTIGET_KEYS_FOUND((byte) 0x5E),
+
+    // -0x01 to fixate the new value that incorrectly changed TICKER_ENUM_MAX
+    /**
+     * Number of iterators created.
+     */
+    NO_ITERATOR_CREATED((byte) -0x01),
+
+    /**
+     * Number of iterators deleted.
+     */
+    NO_ITERATOR_DELETED((byte) 0x60),
+
+    /**
+     * Deletions obsoleted before bottom level due to file gap optimization.
+     */
+    COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE((byte) 0x61),
+
+    /**
+     * If a compaction was cancelled in sfm to prevent ENOSPC
+     */
+    COMPACTION_CANCELLED((byte) 0x62),
+
+    /**
+     * # of times bloom FullFilter has not avoided the reads.
+     */
+    BLOOM_FILTER_FULL_POSITIVE((byte) 0x63),
+
+    /**
+     * # of times bloom FullFilter has not avoided the reads and data actually
+     * exist.
+     */
+    BLOOM_FILTER_FULL_TRUE_POSITIVE((byte) 0x64),
+
+    /**
+     * BlobDB specific stats
+     * # of Put/PutTTL/PutUntil to BlobDB.
+     */
+    BLOB_DB_NUM_PUT((byte) 0x65),
+
+    /**
+     * # of Write to BlobDB.
+     */
+    BLOB_DB_NUM_WRITE((byte) 0x66),
+
+    /**
+     * # of Get to BlobDB.
+     */
+    BLOB_DB_NUM_GET((byte) 0x67),
+
+    /**
+     * # of MultiGet to BlobDB.
+     */
+    BLOB_DB_NUM_MULTIGET((byte) 0x68),
+
+    /**
+     * # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator.
+     */
+    BLOB_DB_NUM_SEEK((byte) 0x69),
+
+    /**
+     * # of Next to BlobDB iterator.
+     */
+    BLOB_DB_NUM_NEXT((byte) 0x6A),
+
+    /**
+     * # of Prev to BlobDB iterator.
+     */
+    BLOB_DB_NUM_PREV((byte) 0x6B),
+
+    /**
+     * # of keys written to BlobDB.
+     */
+    BLOB_DB_NUM_KEYS_WRITTEN((byte) 0x6C),
+
+    /**
+     * # of keys read from BlobDB.
+     */
+    BLOB_DB_NUM_KEYS_READ((byte) 0x6D),
+
+    /**
+     * # of bytes (key + value) written to BlobDB.
+     */
+    BLOB_DB_BYTES_WRITTEN((byte) 0x6E),
+
+    /**
+     * # of bytes (keys + value) read from BlobDB.
+     */
+    BLOB_DB_BYTES_READ((byte) 0x6F),
+
+    /**
+     * # of keys written by BlobDB as non-TTL inlined value.
+     */
+    BLOB_DB_WRITE_INLINED((byte) 0x70),
+
+    /**
+     * # of keys written by BlobDB as TTL inlined value.
+     */
+    BLOB_DB_WRITE_INLINED_TTL((byte) 0x71),
+
+    /**
+     * # of keys written by BlobDB as non-TTL blob value.
+     */
+    BLOB_DB_WRITE_BLOB((byte) 0x72),
+
+    /**
+     * # of keys written by BlobDB as TTL blob value.
+     */
+    BLOB_DB_WRITE_BLOB_TTL((byte) 0x73),
+
+    /**
+     * # of bytes written to blob file.
+     */
+    BLOB_DB_BLOB_FILE_BYTES_WRITTEN((byte) 0x74),
+
+    /**
+     * # of bytes read from blob file.
+     */
+    BLOB_DB_BLOB_FILE_BYTES_READ((byte) 0x75),
+
+    /**
+     * # of times a blob files being synced.
+     */
+    BLOB_DB_BLOB_FILE_SYNCED((byte) 0x76),
+
+    /**
+     * # of blob index evicted from base DB by BlobDB compaction filter because
+     * of expiration.
+     */
+    BLOB_DB_BLOB_INDEX_EXPIRED_COUNT((byte) 0x77),
+
+    /**
+     * Size of blob index evicted from base DB by BlobDB compaction filter
+     * because of expiration.
+     */
+    BLOB_DB_BLOB_INDEX_EXPIRED_SIZE((byte) 0x78),
+
+    /**
+     * # of blob index evicted from base DB by BlobDB compaction filter because
+     * of corresponding file deleted.
+     */
+    BLOB_DB_BLOB_INDEX_EVICTED_COUNT((byte) 0x79),
+
+    /**
+     * Size of blob index evicted from base DB by BlobDB compaction filter
+     * because of corresponding file deleted.
+     */
+    BLOB_DB_BLOB_INDEX_EVICTED_SIZE((byte) 0x7A),
+
+    /**
+     * # of blob files being garbage collected.
+     */
+    BLOB_DB_GC_NUM_FILES((byte) 0x7B),
+
+    /**
+     * # of blob files generated by garbage collection.
+     */
+    BLOB_DB_GC_NUM_NEW_FILES((byte) 0x7C),
+
+    /**
+     * # of BlobDB garbage collection failures.
+     */
+    BLOB_DB_GC_FAILURES((byte) 0x7D),
+
+    /**
+     * # of keys drop by BlobDB garbage collection because they had been
+     * overwritten.
+     */
+    BLOB_DB_GC_NUM_KEYS_OVERWRITTEN((byte) 0x7E),
+
+    /**
+     * # of keys drop by BlobDB garbage collection because of expiration.
+     */
+    BLOB_DB_GC_NUM_KEYS_EXPIRED((byte) 0x7F),
+
+    /**
+     * # of keys relocated to new blob file by garbage collection.
+     */
+    BLOB_DB_GC_NUM_KEYS_RELOCATED((byte) -0x02),
+
+    /**
+     * # of bytes drop by BlobDB garbage collection because they had been
+     * overwritten.
+     */
+    BLOB_DB_GC_BYTES_OVERWRITTEN((byte) -0x03),
+
+    /**
+     * # of bytes drop by BlobDB garbage collection because of expiration.
+     */
+    BLOB_DB_GC_BYTES_EXPIRED((byte) -0x04),
+
+    /**
+     * # of bytes relocated to new blob file by garbage collection.
+     */
+    BLOB_DB_GC_BYTES_RELOCATED((byte) -0x05),
+
+    /**
+     * # of blob files evicted because of BlobDB is full.
+     */
+    BLOB_DB_FIFO_NUM_FILES_EVICTED((byte) -0x06),
+
+    /**
+     * # of keys in the blob files evicted because of BlobDB is full.
+     */
+    BLOB_DB_FIFO_NUM_KEYS_EVICTED((byte) -0x07),
+
+    /**
+     * # of bytes in the blob files evicted because of BlobDB is full.
+     */
+    BLOB_DB_FIFO_BYTES_EVICTED((byte) -0x08),
+
+    /**
+     * These counters indicate a performance issue in WritePrepared transactions.
+     * We should not seem them ticking them much.
+     * # of times prepare_mutex_ is acquired in the fast path.
+     */
+    TXN_PREPARE_MUTEX_OVERHEAD((byte) -0x09),
+
+    /**
+     * # of times old_commit_map_mutex_ is acquired in the fast path.
+     */
+    TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD((byte) -0x0A),
+
+    /**
+     * # of times we checked a batch for duplicate keys.
+     */
+    TXN_DUPLICATE_KEY_OVERHEAD((byte) -0x0B),
+
+    /**
+     * # of times snapshot_mutex_ is acquired in the fast path.
+     */
+    TXN_SNAPSHOT_MUTEX_OVERHEAD((byte) -0x0C),
+
+    /**
+     * # of times ::Get returned TryAgain due to expired snapshot seq
+     */
+    TXN_GET_TRY_AGAIN((byte) -0x0D),
+
+    /**
+     * # of files marked as trash by delete scheduler
+     */
+    FILES_MARKED_TRASH((byte) -0x0E),
+
+    /**
+     * # of files deleted immediately by delete scheduler
+     */
+    FILES_DELETED_IMMEDIATELY((byte) -0x0f),
+
+    /**
+     * Compaction read and write statistics broken down by CompactionReason
+     */
+    COMPACT_READ_BYTES_MARKED((byte) -0x10),
+    COMPACT_READ_BYTES_PERIODIC((byte) -0x11),
+    COMPACT_READ_BYTES_TTL((byte) -0x12),
+    COMPACT_WRITE_BYTES_MARKED((byte) -0x13),
+    COMPACT_WRITE_BYTES_PERIODIC((byte) -0x14),
+    COMPACT_WRITE_BYTES_TTL((byte) -0x15),
+
+    /**
+     * DB error handler statistics
+     */
+    ERROR_HANDLER_BG_ERROR_COUNT((byte) -0x16),
+    ERROR_HANDLER_BG_IO_ERROR_COUNT((byte) -0x17),
+    ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT((byte) -0x18),
+    ERROR_HANDLER_AUTORESUME_COUNT((byte) -0x19),
+    ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT((byte) -0x1A),
+    ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT((byte) -0x1B),
+
+    /**
+     * Bytes of raw data (payload) found on memtable at flush time.
+     * Contains the sum of garbage payload (bytes that are discarded
+     * at flush time) and useful payload (bytes of data that will
+     * eventually be written to SSTable).
+     */
+    MEMTABLE_PAYLOAD_BYTES_AT_FLUSH((byte) -0x1C),
+    /**
+     * Outdated bytes of data present on memtable at flush time.
+     */
+    MEMTABLE_GARBAGE_BYTES_AT_FLUSH((byte) -0x1D),
+
+    /**
+     * Number of secondary cache hits
+     */
+    SECONDARY_CACHE_HITS((byte) -0x1E),
+
+    /**
+     * Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs.
+     */
+    VERIFY_CHECKSUM_READ_BYTES((byte) -0x1F),
+
+    /**
+     * Bytes read/written while creating backups
+     */
+    BACKUP_READ_BYTES((byte) -0x20),
+    BACKUP_WRITE_BYTES((byte) -0x21),
+
+    /**
+     * Remote compaction read/write statistics
+     */
+    REMOTE_COMPACT_READ_BYTES((byte) -0x22),
+    REMOTE_COMPACT_WRITE_BYTES((byte) -0x23),
+
+    /**
+     * Tiered storage related statistics
+     */
+    HOT_FILE_READ_BYTES((byte) -0x24),
+    WARM_FILE_READ_BYTES((byte) -0x25),
+    COLD_FILE_READ_BYTES((byte) -0x26),
+    HOT_FILE_READ_COUNT((byte) -0x27),
+    WARM_FILE_READ_COUNT((byte) -0x28),
+    COLD_FILE_READ_COUNT((byte) -0x29),
+
+    /**
+     * (non-)last level read statistics
+     */
+    LAST_LEVEL_READ_BYTES((byte) -0x2A),
+    LAST_LEVEL_READ_COUNT((byte) -0x2B),
+    NON_LAST_LEVEL_READ_BYTES((byte) -0x2C),
+    NON_LAST_LEVEL_READ_COUNT((byte) -0x2D),
+
+    BLOCK_CHECKSUM_COMPUTE_COUNT((byte) -0x2E),
+
+    /**
+     * # of times cache miss when accessing blob from blob cache.
+     */
+    BLOB_DB_CACHE_MISS((byte) -0x2F),
+
+    /**
+     * # of times cache hit when accessing blob from blob cache.
+     */
+    BLOB_DB_CACHE_HIT((byte) -0x30),
+
+    /**
+     * # of data blocks added to blob cache.
+     */
+    BLOB_DB_CACHE_ADD((byte) -0x31),
+
+    /**
+     * # # of failures when adding blobs to blob cache.
+     */
+    BLOB_DB_CACHE_ADD_FAILURES((byte) -0x32),
+
+    /**
+     * # of bytes read from blob cache.
+     */
+    BLOB_DB_CACHE_BYTES_READ((byte) -0x33),
+
+    /**
+     * # of bytes written into blob cache.
+     */
+    BLOB_DB_CACHE_BYTES_WRITE((byte) -0x34),
+
+    TICKER_ENUM_MAX((byte) 0x5F);
+
+    private final byte value;
+
+    TickerType(final byte value) {
+        this.value = value;
+    }
+
+    /**
+     * Returns the byte value of the enumerations value
+     *
+     * @return byte representation
+     */
+    public byte getValue() {
+        return value;
+    }
+
+    /**
+     * Get Ticker type by byte value.
+     *
+     * @param value byte representation of TickerType.
+     *
+     * @return {@link org.rocksdb.TickerType} instance.
+     * @throws java.lang.IllegalArgumentException if an invalid
+     *     value is provided.
+     */
+    public static TickerType getTickerType(final byte value) {
+        for (final TickerType tickerType : TickerType.values()) {
+            if (tickerType.getValue() == value) {
+                return tickerType;
+            }
+        }
+        throw new IllegalArgumentException(
+            "Illegal value provided for TickerType.");
+    }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TimedEnv.java b/src/rocksdb/java/src/main/java/org/rocksdb/TimedEnv.java
new file mode 100644
index 000000000..dc8b5d6ef
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TimedEnv.java
@@ -0,0 +1,30 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Timed environment.
+ */
+public class TimedEnv extends Env {
+
+  /**
+   * <p>Creates a new environment that measures function call times for
+   * filesystem operations, reporting results to variables in PerfContext.</p>
+   *
+   *
+   * <p>The caller must delete the result when it is
+   * no longer needed.</p>
+   *
+   * @param baseEnv the base environment,
+   *     must remain live while the result is in use.
+   */
+  public TimedEnv(final Env baseEnv) {
+    super(createTimedEnv(baseEnv.nativeHandle_));
+  }
+
+  private static native long createTimedEnv(final long baseEnvHandle);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java
new file mode 100644
index 000000000..cf5f7bbe1
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * TraceOptions is used for
+ * {@link RocksDB#startTrace(TraceOptions, AbstractTraceWriter)}.
+ */
+public class TraceOptions {
+  private final long maxTraceFileSize;
+
+  public TraceOptions() {
+    this.maxTraceFileSize = 64L * 1024L * 1024L * 1024L; // 64 GB
+  }
+
+  public TraceOptions(final long maxTraceFileSize) {
+    this.maxTraceFileSize = maxTraceFileSize;
+  }
+
+  /**
+   * To avoid the trace file size grows larger than the storage space,
+   * user can set the max trace file size in Bytes. Default is 64 GB.
+   *
+   * @return the max trace size
+   */
+  public long getMaxTraceFileSize() {
+    return maxTraceFileSize;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TraceWriter.java b/src/rocksdb/java/src/main/java/org/rocksdb/TraceWriter.java
new file mode 100644
index 000000000..cb0234e9b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TraceWriter.java
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * TraceWriter allows exporting RocksDB traces to any system,
+ * one operation at a time.
+ */
+public interface TraceWriter {
+
+  /**
+   * Write the data.
+   *
+   * @param data the data
+   *
+   * @throws RocksDBException if an error occurs whilst writing.
+   */
+  void write(final Slice data) throws RocksDBException;
+
+  /**
+   * Close the writer.
+   *
+   * @throws RocksDBException if an error occurs whilst closing the writer.
+   */
+  void closeWriter() throws RocksDBException;
+
+  /**
+   * Get the size of the file that this writer is writing to.
+   *
+   * @return the file size
+   */
+  long getFileSize();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Transaction.java b/src/rocksdb/java/src/main/java/org/rocksdb/Transaction.java
new file mode 100644
index 000000000..b2cc8a932
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Transaction.java
@@ -0,0 +1,2170 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Provides BEGIN/COMMIT/ROLLBACK transactions.
+ *
+ * To use transactions, you must first create either an
+ * {@link OptimisticTransactionDB} or a {@link TransactionDB}
+ *
+ * To create a transaction, use
+ * {@link OptimisticTransactionDB#beginTransaction(org.rocksdb.WriteOptions)} or
+ * {@link TransactionDB#beginTransaction(org.rocksdb.WriteOptions)}
+ *
+ * It is up to the caller to synchronize access to this object.
+ *
+ * See samples/src/main/java/OptimisticTransactionSample.java and
+ * samples/src/main/java/TransactionSample.java for some simple
+ * examples.
+ */
+public class Transaction extends RocksObject {
+
+  private final RocksDB parent;
+
+  /**
+   * Intentionally package private
+   * as this is called from
+   * {@link OptimisticTransactionDB#beginTransaction(org.rocksdb.WriteOptions)}
+   * or {@link TransactionDB#beginTransaction(org.rocksdb.WriteOptions)}
+   *
+   * @param parent This must be either {@link TransactionDB} or
+   *     {@link OptimisticTransactionDB}
+   * @param transactionHandle The native handle to the underlying C++
+   *     transaction object
+   */
+  Transaction(final RocksDB parent, final long transactionHandle) {
+    super(transactionHandle);
+    this.parent = parent;
+  }
+
+  /**
+   * If a transaction has a snapshot set, the transaction will ensure that
+   * any keys successfully written (or fetched via {@link #getForUpdate}) have
+   * not been modified outside of this transaction since the time the snapshot
+   * was set.
+   *
+   * If a snapshot has not been set, the transaction guarantees that keys have
+   * not been modified since the time each key was first written (or fetched via
+   * {@link #getForUpdate}).
+   *
+   * Using {@link #setSnapshot()} will provide stricter isolation guarantees
+   * at the expense of potentially more transaction failures due to conflicts
+   * with other writes.
+   *
+   * Calling {@link #setSnapshot()} has no effect on keys written before this
+   * function has been called.
+   *
+   * {@link #setSnapshot()} may be called multiple times if you would like to
+   * change the snapshot used for different operations in this transaction.
+   *
+   * Calling {@link #setSnapshot()} will not affect the version of Data returned
+   * by get(...) methods. See {@link #get} for more details.
+   */
+  public void setSnapshot() {
+    assert(isOwningHandle());
+    setSnapshot(nativeHandle_);
+  }
+
+  /**
+   * Similar to {@link #setSnapshot()}, but will not change the current snapshot
+   * until put/merge/delete/getForUpdate/multiGetForUpdate is called.
+   * By calling this function, the transaction will essentially call
+   * {@link #setSnapshot()} for you right before performing the next
+   * write/getForUpdate.
+   *
+   * Calling {@link #setSnapshotOnNextOperation()} will not affect what
+   * snapshot is returned by {@link #getSnapshot} until the next
+   * write/getForUpdate is executed.
+   *
+   * When the snapshot is created the notifier's snapshotCreated method will
+   * be called so that the caller can get access to the snapshot.
+   *
+   * This is an optimization to reduce the likelihood of conflicts that
+   * could occur in between the time {@link #setSnapshot()} is called and the
+   * first write/getForUpdate operation. i.e. this prevents the following
+   * race-condition:
+   *
+   *   txn1-&gt;setSnapshot();
+   *                             txn2-&gt;put("A", ...);
+   *                             txn2-&gt;commit();
+   *   txn1-&gt;getForUpdate(opts, "A", ...);  * FAIL!
+   */
+  public void setSnapshotOnNextOperation() {
+    assert(isOwningHandle());
+    setSnapshotOnNextOperation(nativeHandle_);
+  }
+
+  /**
+   * Similar to {@link #setSnapshot()}, but will not change the current snapshot
+   * until put/merge/delete/getForUpdate/multiGetForUpdate is called.
+   * By calling this function, the transaction will essentially call
+   * {@link #setSnapshot()} for you right before performing the next
+   * write/getForUpdate.
+   *
+   * Calling {@link #setSnapshotOnNextOperation()} will not affect what
+   * snapshot is returned by {@link #getSnapshot} until the next
+   * write/getForUpdate is executed.
+   *
+   * When the snapshot is created the
+   * {@link AbstractTransactionNotifier#snapshotCreated(Snapshot)} method will
+   * be called so that the caller can get access to the snapshot.
+   *
+   * This is an optimization to reduce the likelihood of conflicts that
+   * could occur in between the time {@link #setSnapshot()} is called and the
+   * first write/getForUpdate operation. i.e. this prevents the following
+   * race-condition:
+   *
+   *   txn1-&gt;setSnapshot();
+   *                             txn2-&gt;put("A", ...);
+   *                             txn2-&gt;commit();
+   *   txn1-&gt;getForUpdate(opts, "A", ...);  * FAIL!
+   *
+   * @param transactionNotifier A handler for receiving snapshot notifications
+   *     for the transaction
+   *
+   */
+  public void setSnapshotOnNextOperation(
+      final AbstractTransactionNotifier transactionNotifier) {
+    assert(isOwningHandle());
+    setSnapshotOnNextOperation(nativeHandle_, transactionNotifier.nativeHandle_);
+  }
+
+ /**
+  * Returns the Snapshot created by the last call to {@link #setSnapshot()}.
+  *
+  * REQUIRED: The returned Snapshot is only valid up until the next time
+  * {@link #setSnapshot()}/{@link #setSnapshotOnNextOperation()} is called,
+  * {@link #clearSnapshot()} is called, or the Transaction is deleted.
+  *
+  * @return The snapshot or null if there is no snapshot
+  */
+  public Snapshot getSnapshot() {
+    assert(isOwningHandle());
+    final long snapshotNativeHandle = getSnapshot(nativeHandle_);
+    if(snapshotNativeHandle == 0) {
+      return null;
+    } else {
+      final Snapshot snapshot = new Snapshot(snapshotNativeHandle);
+      return snapshot;
+    }
+  }
+
+  /**
+   * Clears the current snapshot (i.e. no snapshot will be 'set')
+   *
+   * This removes any snapshot that currently exists or is set to be created
+   * on the next update operation ({@link #setSnapshotOnNextOperation()}).
+   *
+   * Calling {@link #clearSnapshot()} has no effect on keys written before this
+   * function has been called.
+   *
+   * If a reference to a snapshot was retrieved via {@link #getSnapshot()}, it
+   * will no longer be valid and should be discarded after a call to
+   * {@link #clearSnapshot()}.
+   */
+  public void clearSnapshot() {
+    assert(isOwningHandle());
+    clearSnapshot(nativeHandle_);
+  }
+
+  /**
+   * Prepare the current transaction for 2PC
+   */
+  public void prepare() throws RocksDBException {
+    //TODO(AR) consider a Java'ish version of this function, which returns an AutoCloseable (commit)
+    assert(isOwningHandle());
+    prepare(nativeHandle_);
+  }
+
+  /**
+   * Write all batched keys to the db atomically.
+   *
+   * Returns OK on success.
+   *
+   * May return any error status that could be returned by DB:Write().
+   *
+   * If this transaction was created by an {@link OptimisticTransactionDB}
+   * Status::Busy() may be returned if the transaction could not guarantee
+   * that there are no write conflicts. Status::TryAgain() may be returned
+   * if the memtable history size is not large enough
+   *  (See max_write_buffer_number_to_maintain).
+   *
+   * If this transaction was created by a {@link TransactionDB},
+   * Status::Expired() may be returned if this transaction has lived for
+   * longer than {@link TransactionOptions#getExpiration()}.
+   *
+   * @throws RocksDBException if an error occurs when committing the transaction
+   */
+  public void commit() throws RocksDBException {
+    assert(isOwningHandle());
+    commit(nativeHandle_);
+  }
+
+  /**
+   * Discard all batched writes in this transaction.
+   *
+   * @throws RocksDBException if an error occurs when rolling back the transaction
+   */
+  public void rollback() throws RocksDBException {
+    assert(isOwningHandle());
+    rollback(nativeHandle_);
+  }
+
+  /**
+   * Records the state of the transaction for future calls to
+   * {@link #rollbackToSavePoint()}.
+   *
+   * May be called multiple times to set multiple save points.
+   *
+   * @throws RocksDBException if an error occurs whilst setting a save point
+   */
+  public void setSavePoint() throws RocksDBException {
+    assert(isOwningHandle());
+    setSavePoint(nativeHandle_);
+  }
+
+  /**
+   * Undo all operations in this transaction (put, merge, delete, putLogData)
+   * since the most recent call to {@link #setSavePoint()} and removes the most
+   * recent {@link #setSavePoint()}.
+   *
+   * If there is no previous call to {@link #setSavePoint()},
+   * returns Status::NotFound()
+   *
+   * @throws RocksDBException if an error occurs when rolling back to a save point
+   */
+  public void rollbackToSavePoint() throws RocksDBException {
+    assert(isOwningHandle());
+    rollbackToSavePoint(nativeHandle_);
+  }
+
+  /**
+   * This function is similar to
+   * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   *
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance
+   * @param readOptions Read options.
+   * @param key the key to retrieve the value for.
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native
+   *     library.
+   */
+  public byte[] get(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions readOptions, final byte[] key) throws RocksDBException {
+    assert(isOwningHandle());
+    return get(nativeHandle_, readOptions.nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * This function is similar to
+   * {@link RocksDB#get(ReadOptions, byte[])} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   *
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param readOptions Read options.
+   * @param key the key to retrieve the value for.
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native
+   *     library.
+   */
+  public byte[] get(final ReadOptions readOptions, final byte[] key)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    return get(nativeHandle_, readOptions.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * This function is similar to
+   * {@link RocksDB#multiGetAsList} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   *
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandles {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys of keys for which values need to be retrieved.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IllegalArgumentException thrown if the size of passed keys is not
+   *    equal to the amount of passed column family handles.
+   */
+  @Deprecated
+  public byte[][] multiGet(final ReadOptions readOptions,
+      final List<ColumnFamilyHandle> columnFamilyHandles, final byte[][] keys)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    // Check if key size equals cfList size. If not a exception must be
+    // thrown. If not a Segmentation fault happens.
+    if (keys.length != columnFamilyHandles.size()) {
+      throw new IllegalArgumentException(
+          "For each key there must be a ColumnFamilyHandle.");
+    }
+    if(keys.length == 0) {
+      return new byte[0][0];
+    }
+    final long[] cfHandles = new long[columnFamilyHandles.size()];
+    for (int i = 0; i < columnFamilyHandles.size(); i++) {
+      cfHandles[i] = columnFamilyHandles.get(i).nativeHandle_;
+    }
+
+    return multiGet(nativeHandle_, readOptions.nativeHandle_,
+       keys, cfHandles);
+  }
+
+  /**
+   * This function is similar to
+   * {@link RocksDB#multiGetAsList(ReadOptions, List, List)} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   *
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandles {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys of keys for which values need to be retrieved.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IllegalArgumentException thrown if the size of passed keys is not
+   *    equal to the amount of passed column family handles.
+   */
+
+  public List<byte[]> multiGetAsList(final ReadOptions readOptions,
+      final List<ColumnFamilyHandle> columnFamilyHandles, final List<byte[]> keys)
+      throws RocksDBException {
+    assert (isOwningHandle());
+    // Check if key size equals cfList size. If not a exception must be
+    // thrown. If not a Segmentation fault happens.
+    if (keys.size() != columnFamilyHandles.size()) {
+      throw new IllegalArgumentException("For each key there must be a ColumnFamilyHandle.");
+    }
+    if (keys.size() == 0) {
+      return new ArrayList<>(0);
+    }
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+    final long[] cfHandles = new long[columnFamilyHandles.size()];
+    for (int i = 0; i < columnFamilyHandles.size(); i++) {
+      cfHandles[i] = columnFamilyHandles.get(i).nativeHandle_;
+    }
+
+    return Arrays.asList(multiGet(nativeHandle_, readOptions.nativeHandle_, keysArray, cfHandles));
+  }
+
+  /**
+   * This function is similar to
+   * {@link RocksDB#multiGetAsList} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   *
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param readOptions Read options.=
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys of keys for which values need to be retrieved.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  @Deprecated
+  public byte[][] multiGet(final ReadOptions readOptions, final byte[][] keys)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    if(keys.length == 0) {
+      return new byte[0][0];
+    }
+
+    return multiGet(nativeHandle_, readOptions.nativeHandle_,
+        keys);
+  }
+
+  /**
+   * This function is similar to
+   * {@link RocksDB#multiGetAsList} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   *
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param readOptions Read options.=
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys of keys for which values need to be retrieved.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<byte[]> multiGetAsList(final ReadOptions readOptions, final List<byte[]> keys)
+      throws RocksDBException {
+    if (keys.size() == 0) {
+      return new ArrayList<>(0);
+    }
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+
+    return Arrays.asList(multiGet(nativeHandle_, readOptions.nativeHandle_, keysArray));
+  }
+
+  /**
+   * Read this key and ensure that this transaction will only
+   * be able to be committed if this key is not written outside this
+   * transaction after it has first been read (or after the snapshot if a
+   * snapshot is set in this transaction). The transaction behavior is the
+   * same regardless of whether the key exists or not.
+   *
+   * Note: Currently, this function will return Status::MergeInProgress
+   * if the most recent write to the queried key in this batch is a Merge.
+   *
+   * The values returned by this function are similar to
+   * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])}.
+   * If value==nullptr, then this function will not read any data, but will
+   * still ensure that this key cannot be written to by outside of this
+   * transaction.
+   *
+   * If this transaction was created by an {@link OptimisticTransactionDB},
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}
+   * could cause {@link #commit()} to fail. Otherwise, it could return any error
+   * that could be returned by
+   * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])}.
+   *
+   * If this transaction was created on a {@link TransactionDB}, an
+   * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   * when:
+   *     {@link Status.Code#Busy} if there is a write conflict,
+   *     {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *     {@link Status.Code#TryAgain} if the memtable history size is not large
+   *         enough. See
+   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *     {@link Status.Code#MergeInProgress} if merge operations cannot be
+   *     resolved.
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key to retrieve the value for.
+   * @param exclusive true if the transaction should have exclusive access to
+   *     the key, otherwise false for shared access.
+   * @param doValidate true if it should validate the snapshot before doing the read
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] getForUpdate(final ReadOptions readOptions,
+      final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final boolean exclusive,
+      final boolean doValidate) throws RocksDBException {
+    assert (isOwningHandle());
+    return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_, exclusive, doValidate);
+  }
+
+  /**
+   * Same as
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean, boolean)}
+   * with doValidate=true.
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key to retrieve the value for.
+   * @param exclusive true if the transaction should have exclusive access to
+   *     the key, otherwise false for shared access.
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] getForUpdate(final ReadOptions readOptions,
+      final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+      final boolean exclusive) throws RocksDBException {
+    assert(isOwningHandle());
+    return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_, exclusive, true /*doValidate*/);
+  }
+
+  /**
+   * Read this key and ensure that this transaction will only
+   * be able to be committed if this key is not written outside this
+   * transaction after it has first been read (or after the snapshot if a
+   * snapshot is set in this transaction). The transaction behavior is the
+   * same regardless of whether the key exists or not.
+   *
+   * Note: Currently, this function will return Status::MergeInProgress
+   * if the most recent write to the queried key in this batch is a Merge.
+   *
+   * The values returned by this function are similar to
+   * {@link RocksDB#get(ReadOptions, byte[])}.
+   * If value==nullptr, then this function will not read any data, but will
+   * still ensure that this key cannot be written to by outside of this
+   * transaction.
+   *
+   * If this transaction was created on an {@link OptimisticTransactionDB},
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}
+   * could cause {@link #commit()} to fail. Otherwise, it could return any error
+   * that could be returned by
+   * {@link RocksDB#get(ReadOptions, byte[])}.
+   *
+   * If this transaction was created on a {@link TransactionDB}, an
+   * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   * when:
+   *     {@link Status.Code#Busy} if there is a write conflict,
+   *     {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *     {@link Status.Code#TryAgain} if the memtable history size is not large
+   *         enough. See
+   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *     {@link Status.Code#MergeInProgress} if merge operations cannot be
+   *     resolved.
+   *
+   * @param readOptions Read options.
+   * @param key the key to retrieve the value for.
+   * @param exclusive true if the transaction should have exclusive access to
+   *     the key, otherwise false for shared access.
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] getForUpdate(final ReadOptions readOptions, final byte[] key,
+      final boolean exclusive) throws RocksDBException {
+    assert(isOwningHandle());
+    return getForUpdate(
+        nativeHandle_, readOptions.nativeHandle_, key, key.length, exclusive, true /*doValidate*/);
+  }
+
+  /**
+   * A multi-key version of
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}.
+   *
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandles {@link org.rocksdb.ColumnFamilyHandle}
+   *     instances
+   * @param keys the keys to retrieve the values for.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  @Deprecated
+  public byte[][] multiGetForUpdate(final ReadOptions readOptions,
+      final List<ColumnFamilyHandle> columnFamilyHandles, final byte[][] keys)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    // Check if key size equals cfList size. If not a exception must be
+    // thrown. If not a Segmentation fault happens.
+    if (keys.length != columnFamilyHandles.size()){
+      throw new IllegalArgumentException(
+          "For each key there must be a ColumnFamilyHandle.");
+    }
+    if(keys.length == 0) {
+      return new byte[0][0];
+    }
+    final long[] cfHandles = new long[columnFamilyHandles.size()];
+    for (int i = 0; i < columnFamilyHandles.size(); i++) {
+      cfHandles[i] = columnFamilyHandles.get(i).nativeHandle_;
+    }
+    return multiGetForUpdate(nativeHandle_, readOptions.nativeHandle_,
+        keys, cfHandles);
+  }
+
+  /**
+   * A multi-key version of
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}.
+   *
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandles {@link org.rocksdb.ColumnFamilyHandle}
+   *     instances
+   * @param keys the keys to retrieve the values for.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<byte[]> multiGetForUpdateAsList(final ReadOptions readOptions,
+      final List<ColumnFamilyHandle> columnFamilyHandles, final List<byte[]> keys)
+      throws RocksDBException {
+    assert (isOwningHandle());
+    // Check if key size equals cfList size. If not a exception must be
+    // thrown. If not a Segmentation fault happens.
+    if (keys.size() != columnFamilyHandles.size()) {
+      throw new IllegalArgumentException("For each key there must be a ColumnFamilyHandle.");
+    }
+    if (keys.size() == 0) {
+      return new ArrayList<>();
+    }
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+
+    final long[] cfHandles = new long[columnFamilyHandles.size()];
+    for (int i = 0; i < columnFamilyHandles.size(); i++) {
+      cfHandles[i] = columnFamilyHandles.get(i).nativeHandle_;
+    }
+    return Arrays.asList(
+        multiGetForUpdate(nativeHandle_, readOptions.nativeHandle_, keysArray, cfHandles));
+  }
+
+  /**
+   * A multi-key version of {@link #getForUpdate(ReadOptions, byte[], boolean)}.
+   *
+   *
+   * @param readOptions Read options.
+   * @param keys the keys to retrieve the values for.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  @Deprecated
+  public byte[][] multiGetForUpdate(final ReadOptions readOptions, final byte[][] keys)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    if(keys.length == 0) {
+      return new byte[0][0];
+    }
+
+    return multiGetForUpdate(nativeHandle_,
+        readOptions.nativeHandle_, keys);
+  }
+
+  /**
+   * A multi-key version of {@link #getForUpdate(ReadOptions, byte[], boolean)}.
+   *
+   *
+   * @param readOptions Read options.
+   * @param keys the keys to retrieve the values for.
+   *
+   * @return List of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<byte[]> multiGetForUpdateAsList(
+      final ReadOptions readOptions, final List<byte[]> keys) throws RocksDBException {
+    assert (isOwningHandle());
+    if (keys.size() == 0) {
+      return new ArrayList<>(0);
+    }
+
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+
+    return Arrays.asList(multiGetForUpdate(nativeHandle_, readOptions.nativeHandle_, keysArray));
+  }
+
+  /**
+   * Returns an iterator that will iterate on all keys in the default
+   * column family including both keys in the DB and uncommitted keys in this
+   * transaction.
+   *
+   * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read
+   * from the DB but will NOT change which keys are read from this transaction
+   * (the keys in this transaction do not yet belong to any snapshot and will be
+   * fetched regardless).
+   *
+   * Caller is responsible for deleting the returned Iterator.
+   *
+   * The returned iterator is only valid until {@link #commit()},
+   * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called.
+   *
+   * @param readOptions Read options.
+   *
+   * @return instance of iterator object.
+   */
+  public RocksIterator getIterator(final ReadOptions readOptions) {
+    assert(isOwningHandle());
+    return new RocksIterator(parent, getIterator(nativeHandle_,
+        readOptions.nativeHandle_));
+  }
+
+  /**
+   * Returns an iterator that will iterate on all keys in the column family
+   * specified by {@code columnFamilyHandle} including both keys in the DB
+   * and uncommitted keys in this transaction.
+   *
+   * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read
+   * from the DB but will NOT change which keys are read from this transaction
+   * (the keys in this transaction do not yet belong to any snapshot and will be
+   * fetched regardless).
+   *
+   * Caller is responsible for calling {@link RocksIterator#close()} on
+   * the returned Iterator.
+   *
+   * The returned iterator is only valid until {@link #commit()},
+   * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called.
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   *
+   * @return instance of iterator object.
+   */
+  public RocksIterator getIterator(final ReadOptions readOptions,
+      final ColumnFamilyHandle columnFamilyHandle) {
+    assert(isOwningHandle());
+    return new RocksIterator(parent, getIterator(nativeHandle_,
+        readOptions.nativeHandle_, columnFamilyHandle.nativeHandle_));
+  }
+
+  /**
+   * Similar to {@link RocksDB#put(ColumnFamilyHandle, byte[], byte[])}, but
+   * will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, an
+   * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   * when:
+   *     {@link Status.Code#Busy} if there is a write conflict,
+   *     {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *     {@link Status.Code#TryAgain} if the memtable history size is not large
+   *         enough. See
+   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param columnFamilyHandle The column family to put the key/value into
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   * @param assumeTracked true when it is expected that the key is already
+   *     tracked. More specifically, it means the the key was previous tracked
+   *     in the same savepoint, with the same exclusive flag, and at a lower
+   *     sequence number. If valid then it skips ValidateSnapshot,
+   *     throws an error otherwise.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+      final byte[] value, final boolean assumeTracked) throws RocksDBException {
+    assert (isOwningHandle());
+    put(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_, assumeTracked);
+  }
+
+  /**
+   * Similar to {@link #put(ColumnFamilyHandle, byte[], byte[], boolean)}
+   * but with {@code assumeTracked = false}.
+   *
+   * Will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, an
+   * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   * when:
+   *     {@link Status.Code#Busy} if there is a write conflict,
+   *     {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *     {@link Status.Code#TryAgain} if the memtable history size is not large
+   *         enough. See
+   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param columnFamilyHandle The column family to put the key/value into
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    assert(isOwningHandle());
+    put(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_, false);
+  }
+
+  /**
+   * Similar to {@link RocksDB#put(byte[], byte[])}, but
+   * will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void put(final byte[] key, final byte[] value)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    put(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #put(ColumnFamilyHandle, byte[], byte[])} but allows
+   * you to specify the key and value in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to put the key/value into
+   * @param keyParts the specified key to be inserted.
+   * @param valueParts the value associated with the specified key.
+   * @param assumeTracked true when it is expected that the key is already
+   *     tracked. More specifically, it means the the key was previous tracked
+   *     in the same savepoint, with the same exclusive flag, and at a lower
+   *     sequence number. If valid then it skips ValidateSnapshot,
+   *     throws an error otherwise.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts, final byte[][] valueParts,
+      final boolean assumeTracked) throws RocksDBException {
+    assert (isOwningHandle());
+    put(nativeHandle_, keyParts, keyParts.length, valueParts, valueParts.length,
+        columnFamilyHandle.nativeHandle_, assumeTracked);
+  }
+
+  /**
+   * Similar to {@link #put(ColumnFamilyHandle, byte[][], byte[][], boolean)}
+   * but with with {@code assumeTracked = false}.
+   *
+   * Allows you to specify the key and value in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to put the key/value into
+   * @param keyParts the specified key to be inserted.
+   * @param valueParts the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts, final byte[][] valueParts)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    put(nativeHandle_, keyParts, keyParts.length, valueParts, valueParts.length,
+        columnFamilyHandle.nativeHandle_, false);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #put(byte[], byte[])} but allows
+   * you to specify the key and value in several parts that will be
+   * concatenated together
+   *
+   * @param keyParts the specified key to be inserted.
+   * @param valueParts the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void put(final byte[][] keyParts, final byte[][] valueParts)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    put(nativeHandle_, keyParts, keyParts.length, valueParts,
+        valueParts.length);
+  }
+
+  /**
+   * Similar to {@link RocksDB#merge(ColumnFamilyHandle, byte[], byte[])}, but
+   * will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param columnFamilyHandle The column family to merge the key/value into
+   * @param key the specified key to be merged.
+   * @param value the value associated with the specified key.
+   * @param assumeTracked true when it is expected that the key is already
+   *     tracked. More specifically, it means the the key was previous tracked
+   *     in the same savepoint, with the same exclusive flag, and at a lower
+   *     sequence number. If valid then it skips ValidateSnapshot,
+   *     throws an error otherwise.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void merge(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final byte[] value, final boolean assumeTracked)
+      throws RocksDBException {
+    assert (isOwningHandle());
+    merge(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_, assumeTracked);
+  }
+
+  /**
+   * Similar to {@link #merge(ColumnFamilyHandle, byte[], byte[], boolean)}
+   * but with {@code assumeTracked = false}.
+   *
+   * Will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param columnFamilyHandle The column family to merge the key/value into
+   * @param key the specified key to be merged.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void merge(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final byte[] value) throws RocksDBException {
+    assert(isOwningHandle());
+    merge(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_, false);
+  }
+
+  /**
+   * Similar to {@link RocksDB#merge(byte[], byte[])}, but
+   * will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param key the specified key to be merged.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void merge(final byte[] key, final byte[] value)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    merge(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Similar to {@link RocksDB#delete(ColumnFamilyHandle, byte[])}, but
+   * will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param key the specified key to be deleted.
+   * @param assumeTracked true when it is expected that the key is already
+   *     tracked. More specifically, it means the the key was previous tracked
+   *     in the same savepoint, with the same exclusive flag, and at a lower
+   *     sequence number. If valid then it skips ValidateSnapshot,
+   *     throws an error otherwise.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final boolean assumeTracked) throws RocksDBException {
+    assert (isOwningHandle());
+    delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_,
+        assumeTracked);
+  }
+
+  /**
+   * Similar to {@link #delete(ColumnFamilyHandle, byte[], boolean)}
+   * but with {@code assumeTracked = false}.
+   *
+   * Will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key) throws RocksDBException {
+    assert(isOwningHandle());
+    delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_,
+        /*assumeTracked*/ false);
+  }
+
+  /**
+   * Similar to {@link RocksDB#delete(byte[])}, but
+   * will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void delete(final byte[] key) throws RocksDBException {
+    assert(isOwningHandle());
+    delete(nativeHandle_, key, key.length);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #delete(ColumnFamilyHandle, byte[])} but allows
+   * you to specify the key in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param keyParts the specified key to be deleted.
+   * @param assumeTracked true when it is expected that the key is already
+   *     tracked. More specifically, it means the the key was previous tracked
+   *     in the same savepoint, with the same exclusive flag, and at a lower
+   *     sequence number. If valid then it skips ValidateSnapshot,
+   *     throws an error otherwise.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts, final boolean assumeTracked)
+      throws RocksDBException {
+    assert (isOwningHandle());
+    delete(nativeHandle_, keyParts, keyParts.length,
+        columnFamilyHandle.nativeHandle_, assumeTracked);
+  }
+
+  /**
+   * Similar to{@link #delete(ColumnFamilyHandle, byte[][], boolean)}
+   * but with {@code assumeTracked = false}.
+   *
+   * Allows you to specify the key in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param keyParts the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void delete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts) throws RocksDBException {
+    assert(isOwningHandle());
+    delete(nativeHandle_, keyParts, keyParts.length,
+        columnFamilyHandle.nativeHandle_, false);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #delete(byte[])} but allows
+   * you to specify key the in several parts that will be
+   * concatenated together.
+   *
+   * @param keyParts the specified key to be deleted
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void delete(final byte[][] keyParts) throws RocksDBException {
+    assert(isOwningHandle());
+    delete(nativeHandle_, keyParts, keyParts.length);
+  }
+
+  /**
+   * Similar to {@link RocksDB#singleDelete(ColumnFamilyHandle, byte[])}, but
+   * will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param key the specified key to be deleted.
+   * @param assumeTracked true when it is expected that the key is already
+   *     tracked. More specifically, it means the key was previously tracked
+   *     in the same savepoint, with the same exclusive flag, and at a lower
+   *     sequence number. If valid then it skips ValidateSnapshot,
+   *     throws an error otherwise.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final boolean assumeTracked) throws RocksDBException {
+    assert (isOwningHandle());
+    singleDelete(nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_, assumeTracked);
+  }
+
+  /**
+   * Similar to {@link #singleDelete(ColumnFamilyHandle, byte[], boolean)}
+   * but with {@code assumeTracked = false}.
+   *
+   * will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key) throws RocksDBException {
+    assert(isOwningHandle());
+    singleDelete(nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_, false);
+  }
+
+  /**
+   * Similar to {@link RocksDB#singleDelete(byte[])}, but
+   * will also perform conflict checking on the keys be written.
+   *
+   * If this Transaction was created on an {@link OptimisticTransactionDB},
+   * these functions should always succeed.
+   *
+   *  If this Transaction was created on a {@link TransactionDB}, an
+   *  {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   *  when:
+   *    {@link Status.Code#Busy} if there is a write conflict,
+   *    {@link Status.Code#TimedOut} if a lock could not be acquired,
+   *    {@link Status.Code#TryAgain} if the memtable history size is not large
+   *       enough. See
+   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final byte[] key) throws RocksDBException {
+    assert(isOwningHandle());
+    singleDelete(nativeHandle_, key, key.length);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #singleDelete(ColumnFamilyHandle, byte[])} but allows
+   * you to specify the key in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param keyParts the specified key to be deleted.
+   * @param assumeTracked true when it is expected that the key is already
+   *     tracked. More specifically, it means the key was previously tracked
+   *     in the same savepoint, with the same exclusive flag, and at a lower
+   *     sequence number. If valid then it skips ValidateSnapshot,
+   *     throws an error otherwise.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts, final boolean assumeTracked)
+      throws RocksDBException {
+    assert (isOwningHandle());
+    singleDelete(nativeHandle_, keyParts, keyParts.length,
+        columnFamilyHandle.nativeHandle_, assumeTracked);
+  }
+
+  /**
+   * Similar to{@link #singleDelete(ColumnFamilyHandle, byte[][], boolean)}
+   * but with {@code assumeTracked = false}.
+   *
+   * Allows you to specify the key in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param keyParts the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts) throws RocksDBException {
+    assert(isOwningHandle());
+    singleDelete(nativeHandle_, keyParts, keyParts.length,
+        columnFamilyHandle.nativeHandle_, false);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #singleDelete(byte[])} but allows
+   * you to specify the key in several parts that will be
+   * concatenated together.
+   *
+   * @param keyParts the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  @Experimental("Performance optimization for a very specific workload")
+  public void singleDelete(final byte[][] keyParts) throws RocksDBException {
+    assert(isOwningHandle());
+    singleDelete(nativeHandle_, keyParts, keyParts.length);
+  }
+
+  /**
+   * Similar to {@link RocksDB#put(ColumnFamilyHandle, byte[], byte[])},
+   * but operates on the transactions write batch. This write will only happen
+   * if this transaction gets committed successfully.
+   *
+   * Unlike {@link #put(ColumnFamilyHandle, byte[], byte[])} no conflict
+   * checking will be performed for this key.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, this function
+   * will still acquire locks necessary to make sure this write doesn't cause
+   * conflicts in other transactions; This may cause a {@link RocksDBException}
+   * with associated {@link Status.Code#Busy}.
+   *
+   * @param columnFamilyHandle The column family to put the key/value into
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void putUntracked(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final byte[] value) throws RocksDBException {
+    assert(isOwningHandle());
+    putUntracked(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Similar to {@link RocksDB#put(byte[], byte[])},
+   * but operates on the transactions write batch. This write will only happen
+   * if this transaction gets committed successfully.
+   *
+   * Unlike {@link #put(byte[], byte[])} no conflict
+   * checking will be performed for this key.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, this function
+   * will still acquire locks necessary to make sure this write doesn't cause
+   * conflicts in other transactions; This may cause a {@link RocksDBException}
+   * with associated {@link Status.Code#Busy}.
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void putUntracked(final byte[] key, final byte[] value)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    putUntracked(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #putUntracked(ColumnFamilyHandle, byte[], byte[])} but
+   * allows you to specify the key and value in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to put the key/value into
+   * @param keyParts the specified key to be inserted.
+   * @param valueParts the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void putUntracked(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts, final byte[][] valueParts)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    putUntracked(nativeHandle_, keyParts, keyParts.length, valueParts,
+        valueParts.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #putUntracked(byte[], byte[])} but
+   * allows you to specify the key and value in several parts that will be
+   * concatenated together.
+   *
+   * @param keyParts the specified key to be inserted.
+   * @param valueParts the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void putUntracked(final byte[][] keyParts, final byte[][] valueParts)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    putUntracked(nativeHandle_, keyParts, keyParts.length, valueParts,
+        valueParts.length);
+  }
+
+  /**
+   * Similar to {@link RocksDB#merge(ColumnFamilyHandle, byte[], byte[])},
+   * but operates on the transactions write batch. This write will only happen
+   * if this transaction gets committed successfully.
+   *
+   * Unlike {@link #merge(ColumnFamilyHandle, byte[], byte[])} no conflict
+   * checking will be performed for this key.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, this function
+   * will still acquire locks necessary to make sure this write doesn't cause
+   * conflicts in other transactions; This may cause a {@link RocksDBException}
+   * with associated {@link Status.Code#Busy}.
+   *
+   * @param columnFamilyHandle The column family to merge the key/value into
+   * @param key the specified key to be merged.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void mergeUntracked(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final byte[] value) throws RocksDBException {
+    mergeUntracked(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Similar to {@link RocksDB#merge(byte[], byte[])},
+   * but operates on the transactions write batch. This write will only happen
+   * if this transaction gets committed successfully.
+   *
+   * Unlike {@link #merge(byte[], byte[])} no conflict
+   * checking will be performed for this key.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, this function
+   * will still acquire locks necessary to make sure this write doesn't cause
+   * conflicts in other transactions; This may cause a {@link RocksDBException}
+   * with associated {@link Status.Code#Busy}.
+   *
+   * @param key the specified key to be merged.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void mergeUntracked(final byte[] key, final byte[] value)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    mergeUntracked(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Similar to {@link RocksDB#delete(ColumnFamilyHandle, byte[])},
+   * but operates on the transactions write batch. This write will only happen
+   * if this transaction gets committed successfully.
+   *
+   * Unlike {@link #delete(ColumnFamilyHandle, byte[])} no conflict
+   * checking will be performed for this key.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, this function
+   * will still acquire locks necessary to make sure this write doesn't cause
+   * conflicts in other transactions; This may cause a {@link RocksDBException}
+   * with associated {@link Status.Code#Busy}.
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void deleteUntracked(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key) throws RocksDBException {
+    assert(isOwningHandle());
+    deleteUntracked(nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Similar to {@link RocksDB#delete(byte[])},
+   * but operates on the transactions write batch. This write will only happen
+   * if this transaction gets committed successfully.
+   *
+   * Unlike {@link #delete(byte[])} no conflict
+   * checking will be performed for this key.
+   *
+   * If this Transaction was created on a {@link TransactionDB}, this function
+   * will still acquire locks necessary to make sure this write doesn't cause
+   * conflicts in other transactions; This may cause a {@link RocksDBException}
+   * with associated {@link Status.Code#Busy}.
+   *
+   * @param key the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void deleteUntracked(final byte[] key) throws RocksDBException {
+    assert(isOwningHandle());
+    deleteUntracked(nativeHandle_, key, key.length);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #deleteUntracked(ColumnFamilyHandle, byte[])} but allows
+   * you to specify the key in several parts that will be
+   * concatenated together.
+   *
+   * @param columnFamilyHandle The column family to delete the key/value from
+   * @param keyParts the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void deleteUntracked(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[][] keyParts) throws RocksDBException {
+    assert(isOwningHandle());
+    deleteUntracked(nativeHandle_, keyParts, keyParts.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+  /**
+   * Similar to {@link #deleteUntracked(byte[])} but allows
+   * you to specify the key in several parts that will be
+   * concatenated together.
+   *
+   * @param keyParts the specified key to be deleted.
+   *
+   * @throws RocksDBException when one of the TransactionalDB conditions
+   *     described above occurs, or in the case of an unexpected error
+   */
+  public void deleteUntracked(final byte[][] keyParts) throws RocksDBException {
+    assert(isOwningHandle());
+    deleteUntracked(nativeHandle_, keyParts, keyParts.length);
+  }
+
+  /**
+   * Similar to {@link WriteBatch#putLogData(byte[])}
+   *
+   * @param blob binary object to be inserted
+   */
+  public void putLogData(final byte[] blob) {
+    assert(isOwningHandle());
+    putLogData(nativeHandle_, blob, blob.length);
+  }
+
+  /**
+   * By default, all put/merge/delete operations will be indexed in the
+   * transaction so that get/getForUpdate/getIterator can search for these
+   * keys.
+   *
+   * If the caller does not want to fetch the keys about to be written,
+   * they may want to avoid indexing as a performance optimization.
+   * Calling {@link #disableIndexing()} will turn off indexing for all future
+   * put/merge/delete operations until {@link #enableIndexing()} is called.
+   *
+   * If a key is put/merge/deleted after {@link #disableIndexing()} is called
+   * and then is fetched via get/getForUpdate/getIterator, the result of the
+   * fetch is undefined.
+   */
+  public void disableIndexing() {
+    assert(isOwningHandle());
+    disableIndexing(nativeHandle_);
+  }
+
+  /**
+   * Re-enables indexing after a previous call to {@link #disableIndexing()}
+   */
+  public void enableIndexing() {
+    assert(isOwningHandle());
+    enableIndexing(nativeHandle_);
+  }
+
+  /**
+   * Returns the number of distinct Keys being tracked by this transaction.
+   * If this transaction was created by a {@link TransactionDB}, this is the
+   * number of keys that are currently locked by this transaction.
+   * If this transaction was created by an {@link OptimisticTransactionDB},
+   * this is the number of keys that need to be checked for conflicts at commit
+   * time.
+   *
+   * @return the number of distinct Keys being tracked by this transaction
+   */
+  public long getNumKeys() {
+    assert(isOwningHandle());
+    return getNumKeys(nativeHandle_);
+  }
+
+  /**
+   * Returns the number of puts that have been applied to this
+   * transaction so far.
+   *
+   * @return the number of puts that have been applied to this transaction
+   */
+  public long getNumPuts() {
+    assert(isOwningHandle());
+    return getNumPuts(nativeHandle_);
+  }
+
+  /**
+   * Returns the number of deletes that have been applied to this
+   * transaction so far.
+   *
+   * @return the number of deletes that have been applied to this transaction
+   */
+  public long getNumDeletes() {
+    assert(isOwningHandle());
+    return getNumDeletes(nativeHandle_);
+  }
+
+  /**
+   * Returns the number of merges that have been applied to this
+   * transaction so far.
+   *
+   * @return the number of merges that have been applied to this transaction
+   */
+  public long getNumMerges() {
+    assert(isOwningHandle());
+    return getNumMerges(nativeHandle_);
+  }
+
+  /**
+   * Returns the elapsed time in milliseconds since this Transaction began.
+   *
+   * @return the elapsed time in milliseconds since this transaction began.
+   */
+  public long getElapsedTime() {
+    assert(isOwningHandle());
+    return getElapsedTime(nativeHandle_);
+  }
+
+  /**
+   * Fetch the underlying write batch that contains all pending changes to be
+   * committed.
+   *
+   * Note: You should not write or delete anything from the batch directly and
+   * should only use the functions in the {@link Transaction} class to
+   * write to this transaction.
+   *
+   * @return The write batch
+   */
+  public WriteBatchWithIndex getWriteBatch() {
+    assert(isOwningHandle());
+    final WriteBatchWithIndex writeBatchWithIndex =
+        new WriteBatchWithIndex(getWriteBatch(nativeHandle_));
+    return writeBatchWithIndex;
+  }
+
+  /**
+   * Change the value of {@link TransactionOptions#getLockTimeout()}
+   * (in milliseconds) for this transaction.
+   *
+   * Has no effect on OptimisticTransactions.
+   *
+   * @param lockTimeout the timeout (in milliseconds) for locks used by this
+   *     transaction.
+   */
+  public void setLockTimeout(final long lockTimeout) {
+    assert(isOwningHandle());
+    setLockTimeout(nativeHandle_, lockTimeout);
+  }
+
+  /**
+   * Return the WriteOptions that will be used during {@link #commit()}.
+   *
+   * @return the WriteOptions that will be used
+   */
+  public WriteOptions getWriteOptions() {
+    assert(isOwningHandle());
+    final WriteOptions writeOptions =
+        new WriteOptions(getWriteOptions(nativeHandle_));
+    return writeOptions;
+  }
+
+  /**
+   * Reset the WriteOptions that will be used during {@link #commit()}.
+   *
+   * @param writeOptions The new WriteOptions
+   */
+  public void setWriteOptions(final WriteOptions writeOptions) {
+    assert(isOwningHandle());
+    setWriteOptions(nativeHandle_, writeOptions.nativeHandle_);
+  }
+
+  /**
+   * If this key was previously fetched in this transaction using
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}/
+   * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, calling
+   * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will tell
+   * the transaction that it no longer needs to do any conflict checking
+   * for this key.
+   *
+   * If a key has been fetched N times via
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}/
+   * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, then
+   * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])}  will only have an
+   * effect if it is also called N times. If this key has been written to in
+   * this transaction, {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])}
+   * will have no effect.
+   *
+   * If {@link #setSavePoint()} has been called after the
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)},
+   * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will not have any
+   * effect.
+   *
+   * If this Transaction was created by an {@link OptimisticTransactionDB},
+   * calling {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} can affect
+   * whether this key is conflict checked at commit time.
+   * If this Transaction was created by a {@link TransactionDB},
+   * calling {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} may release
+   * any held locks for this key.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key to retrieve the value for.
+   */
+  public void undoGetForUpdate(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key) {
+    assert(isOwningHandle());
+    undoGetForUpdate(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * If this key was previously fetched in this transaction using
+   * {@link #getForUpdate(ReadOptions, byte[], boolean)}/
+   * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, calling
+   * {@link #undoGetForUpdate(byte[])} will tell
+   * the transaction that it no longer needs to do any conflict checking
+   * for this key.
+   *
+   * If a key has been fetched N times via
+   * {@link #getForUpdate(ReadOptions, byte[], boolean)}/
+   * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, then
+   * {@link #undoGetForUpdate(byte[])}  will only have an
+   * effect if it is also called N times. If this key has been written to in
+   * this transaction, {@link #undoGetForUpdate(byte[])}
+   * will have no effect.
+   *
+   * If {@link #setSavePoint()} has been called after the
+   * {@link #getForUpdate(ReadOptions, byte[], boolean)},
+   * {@link #undoGetForUpdate(byte[])} will not have any
+   * effect.
+   *
+   * If this Transaction was created by an {@link OptimisticTransactionDB},
+   * calling {@link #undoGetForUpdate(byte[])} can affect
+   * whether this key is conflict checked at commit time.
+   * If this Transaction was created by a {@link TransactionDB},
+   * calling {@link #undoGetForUpdate(byte[])} may release
+   * any held locks for this key.
+   *
+   * @param key the key to retrieve the value for.
+   */
+  public void undoGetForUpdate(final byte[] key) {
+    assert(isOwningHandle());
+    undoGetForUpdate(nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Adds the keys from the WriteBatch to the transaction
+   *
+   * @param writeBatch The write batch to read from
+   *
+   * @throws RocksDBException if an error occurs whilst rebuilding from the
+   *     write batch.
+   */
+  public void rebuildFromWriteBatch(final WriteBatch writeBatch)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    rebuildFromWriteBatch(nativeHandle_, writeBatch.nativeHandle_);
+  }
+
+  /**
+   * Get the Commit time Write Batch.
+   *
+   * @return the commit time write batch.
+   */
+  public WriteBatch getCommitTimeWriteBatch() {
+    assert(isOwningHandle());
+    final WriteBatch writeBatch =
+        new WriteBatch(getCommitTimeWriteBatch(nativeHandle_));
+    return writeBatch;
+  }
+
+  /**
+   * Set the log number.
+   *
+   * @param logNumber the log number
+   */
+  public void setLogNumber(final long logNumber) {
+    assert(isOwningHandle());
+    setLogNumber(nativeHandle_, logNumber);
+  }
+
+  /**
+   * Get the log number.
+   *
+   * @return the log number
+   */
+  public long getLogNumber() {
+    assert(isOwningHandle());
+    return getLogNumber(nativeHandle_);
+  }
+
+  /**
+   * Set the name of the transaction.
+   *
+   * @param transactionName the name of the transaction
+   *
+   * @throws RocksDBException if an error occurs when setting the transaction
+   *     name.
+   */
+  public void setName(final String transactionName) throws RocksDBException {
+    assert(isOwningHandle());
+    setName(nativeHandle_, transactionName);
+  }
+
+  /**
+   * Get the name of the transaction.
+   *
+   * @return the name of the transaction
+   */
+  public String getName() {
+    assert(isOwningHandle());
+    return getName(nativeHandle_);
+  }
+
+  /**
+   * Get the ID of the transaction.
+   *
+   * @return the ID of the transaction.
+   */
+  public long getID() {
+    assert(isOwningHandle());
+    return getID(nativeHandle_);
+  }
+
+  /**
+   * Determine if a deadlock has been detected.
+   *
+   * @return true if a deadlock has been detected.
+   */
+  public boolean isDeadlockDetect() {
+    assert(isOwningHandle());
+    return isDeadlockDetect(nativeHandle_);
+  }
+
+  /**
+   * Get the list of waiting transactions.
+   *
+   * @return The list of waiting transactions.
+   */
+  public WaitingTransactions getWaitingTxns() {
+    assert(isOwningHandle());
+    return getWaitingTxns(nativeHandle_);
+  }
+
+  /**
+   * Get the execution status of the transaction.
+   *
+   * NOTE: The execution status of an Optimistic Transaction
+   * never changes. This is only useful for non-optimistic transactions!
+   *
+   * @return The execution status of the transaction
+   */
+  public TransactionState getState() {
+    assert(isOwningHandle());
+    return TransactionState.getTransactionState(
+        getState(nativeHandle_));
+  }
+
+  /**
+   * The globally unique id with which the transaction is identified. This id
+   * might or might not be set depending on the implementation. Similarly the
+   * implementation decides the point in lifetime of a transaction at which it
+   * assigns the id. Although currently it is the case, the id is not guaranteed
+   * to remain the same across restarts.
+   *
+   * @return the transaction id.
+   */
+  @Experimental("NOTE: Experimental feature")
+  public long getId() {
+    assert(isOwningHandle());
+    return getId(nativeHandle_);
+  }
+
+  public enum TransactionState {
+    STARTED((byte)0),
+    AWAITING_PREPARE((byte)1),
+    PREPARED((byte)2),
+    AWAITING_COMMIT((byte)3),
+    COMMITTED((byte)4),
+    AWAITING_ROLLBACK((byte)5),
+    ROLLEDBACK((byte)6),
+    LOCKS_STOLEN((byte)7);
+
+    /*
+     * Keep old misspelled variable as alias
+     * Tip from https://stackoverflow.com/a/37092410/454544
+     */
+    public static final TransactionState COMMITED = COMMITTED;
+
+    private final byte value;
+
+    TransactionState(final byte value) {
+      this.value = value;
+    }
+
+    /**
+     * Get TransactionState by byte value.
+     *
+     * @param value byte representation of TransactionState.
+     *
+     * @return {@link org.rocksdb.Transaction.TransactionState} instance or null.
+     * @throws java.lang.IllegalArgumentException if an invalid
+     *     value is provided.
+     */
+    public static TransactionState getTransactionState(final byte value) {
+      for (final TransactionState transactionState : TransactionState.values()) {
+        if (transactionState.value == value){
+          return transactionState;
+        }
+      }
+      throw new IllegalArgumentException(
+          "Illegal value provided for TransactionState.");
+    }
+  }
+
+  /**
+   * Called from C++ native method {@link #getWaitingTxns(long)}
+   * to construct a WaitingTransactions object.
+   *
+   * @param columnFamilyId The id of the {@link ColumnFamilyHandle}
+   * @param key The key
+   * @param transactionIds The transaction ids
+   *
+   * @return The waiting transactions
+   */
+  private WaitingTransactions newWaitingTransactions(
+      final long columnFamilyId, final String key,
+      final long[] transactionIds) {
+    return new WaitingTransactions(columnFamilyId, key, transactionIds);
+  }
+
+  public static class WaitingTransactions {
+    private final long columnFamilyId;
+    private final String key;
+    private final long[] transactionIds;
+
+    private WaitingTransactions(final long columnFamilyId, final String key,
+        final long[] transactionIds) {
+      this.columnFamilyId = columnFamilyId;
+      this.key = key;
+      this.transactionIds = transactionIds;
+    }
+
+    /**
+     * Get the Column Family ID.
+     *
+     * @return The column family ID
+     */
+    public long getColumnFamilyId() {
+      return columnFamilyId;
+    }
+
+    /**
+     * Get the key on which the transactions are waiting.
+     *
+     * @return The key
+     */
+    public String getKey() {
+      return key;
+    }
+
+    /**
+     * Get the IDs of the waiting transactions.
+     *
+     * @return The IDs of the waiting transactions
+     */
+    public long[] getTransactionIds() {
+      return transactionIds;
+    }
+  }
+
+  private native void setSnapshot(final long handle);
+  private native void setSnapshotOnNextOperation(final long handle);
+  private native void setSnapshotOnNextOperation(final long handle,
+      final long transactionNotifierHandle);
+  private native long getSnapshot(final long handle);
+  private native void clearSnapshot(final long handle);
+  private native void prepare(final long handle) throws RocksDBException;
+  private native void commit(final long handle) throws RocksDBException;
+  private native void rollback(final long handle) throws RocksDBException;
+  private native void setSavePoint(final long handle) throws RocksDBException;
+  private native void rollbackToSavePoint(final long handle)
+      throws RocksDBException;
+  private native byte[] get(final long handle, final long readOptionsHandle,
+      final byte key[], final int keyLength, final long columnFamilyHandle)
+      throws RocksDBException;
+  private native byte[] get(final long handle, final long readOptionsHandle,
+      final byte key[], final int keyLen) throws RocksDBException;
+  private native byte[][] multiGet(final long handle,
+      final long readOptionsHandle, final byte[][] keys,
+      final long[] columnFamilyHandles) throws RocksDBException;
+  private native byte[][] multiGet(final long handle,
+      final long readOptionsHandle, final byte[][] keys)
+      throws RocksDBException;
+  private native byte[] getForUpdate(final long handle, final long readOptionsHandle,
+      final byte key[], final int keyLength, final long columnFamilyHandle, final boolean exclusive,
+      final boolean doValidate) throws RocksDBException;
+  private native byte[] getForUpdate(final long handle, final long readOptionsHandle,
+      final byte key[], final int keyLen, final boolean exclusive, final boolean doValidate)
+      throws RocksDBException;
+  private native byte[][] multiGetForUpdate(final long handle,
+      final long readOptionsHandle, final byte[][] keys,
+      final long[] columnFamilyHandles) throws RocksDBException;
+  private native byte[][] multiGetForUpdate(final long handle,
+      final long readOptionsHandle, final byte[][] keys)
+      throws RocksDBException;
+  private native long getIterator(final long handle,
+      final long readOptionsHandle);
+  private native long getIterator(final long handle,
+      final long readOptionsHandle, final long columnFamilyHandle);
+  private native void put(final long handle, final byte[] key, final int keyLength,
+      final byte[] value, final int valueLength, final long columnFamilyHandle,
+      final boolean assumeTracked) throws RocksDBException;
+  private native void put(final long handle, final byte[] key,
+      final int keyLength, final byte[] value, final int valueLength)
+      throws RocksDBException;
+  private native void put(final long handle, final byte[][] keys, final int keysLength,
+      final byte[][] values, final int valuesLength, final long columnFamilyHandle,
+      final boolean assumeTracked) throws RocksDBException;
+  private native void put(final long handle, final byte[][] keys,
+      final int keysLength, final byte[][] values, final int valuesLength)
+      throws RocksDBException;
+  private native void merge(final long handle, final byte[] key, final int keyLength,
+      final byte[] value, final int valueLength, final long columnFamilyHandle,
+      final boolean assumeTracked) throws RocksDBException;
+  private native void merge(final long handle, final byte[] key,
+      final int keyLength, final byte[] value, final int valueLength)
+      throws RocksDBException;
+  private native void delete(final long handle, final byte[] key, final int keyLength,
+      final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException;
+  private native void delete(final long handle, final byte[] key,
+      final int keyLength) throws RocksDBException;
+  private native void delete(final long handle, final byte[][] keys, final int keysLength,
+      final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException;
+  private native void delete(final long handle, final byte[][] keys,
+      final int keysLength) throws RocksDBException;
+  private native void singleDelete(final long handle, final byte[] key, final int keyLength,
+      final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException;
+  private native void singleDelete(final long handle, final byte[] key,
+      final int keyLength) throws RocksDBException;
+  private native void singleDelete(final long handle, final byte[][] keys, final int keysLength,
+      final long columnFamilyHandle, final boolean assumeTracked) throws RocksDBException;
+  private native void singleDelete(final long handle, final byte[][] keys,
+      final int keysLength) throws RocksDBException;
+  private native void putUntracked(final long handle, final byte[] key,
+      final int keyLength, final byte[] value, final int valueLength,
+      final long columnFamilyHandle) throws RocksDBException;
+  private native void putUntracked(final long handle, final byte[] key,
+      final int keyLength, final byte[] value, final int valueLength)
+      throws RocksDBException;
+  private native void putUntracked(final long handle, final byte[][] keys,
+      final int keysLength, final byte[][] values, final int valuesLength,
+      final long columnFamilyHandle) throws RocksDBException;
+  private native void putUntracked(final long handle, final byte[][] keys,
+      final int keysLength, final byte[][] values, final int valuesLength)
+      throws RocksDBException;
+  private native void mergeUntracked(final long handle, final byte[] key,
+      final int keyLength, final byte[] value, final int valueLength,
+      final long columnFamilyHandle) throws RocksDBException;
+  private native void mergeUntracked(final long handle, final byte[] key,
+      final int keyLength, final byte[] value, final int valueLength)
+      throws RocksDBException;
+  private native void deleteUntracked(final long handle, final byte[] key,
+      final int keyLength, final long columnFamilyHandle)
+      throws RocksDBException;
+  private native void deleteUntracked(final long handle, final byte[] key,
+      final int keyLength) throws RocksDBException;
+  private native void deleteUntracked(final long handle, final byte[][] keys,
+      final int keysLength, final long columnFamilyHandle)
+      throws RocksDBException;
+  private native void deleteUntracked(final long handle, final byte[][] keys,
+      final int keysLength) throws RocksDBException;
+  private native void putLogData(final long handle, final byte[] blob,
+      final int blobLength);
+  private native void disableIndexing(final long handle);
+  private native void enableIndexing(final long handle);
+  private native long getNumKeys(final long handle);
+  private native long getNumPuts(final long handle);
+  private native long getNumDeletes(final long handle);
+  private native long getNumMerges(final long handle);
+  private native long getElapsedTime(final long handle);
+  private native long getWriteBatch(final long handle);
+  private native void setLockTimeout(final long handle, final long lockTimeout);
+  private native long getWriteOptions(final long handle);
+  private native void setWriteOptions(final long handle,
+      final long writeOptionsHandle);
+  private native void undoGetForUpdate(final long handle, final byte[] key,
+      final int keyLength, final long columnFamilyHandle);
+  private native void undoGetForUpdate(final long handle, final byte[] key,
+      final int keyLength);
+  private native void rebuildFromWriteBatch(final long handle,
+      final long writeBatchHandle) throws RocksDBException;
+  private native long getCommitTimeWriteBatch(final long handle);
+  private native void setLogNumber(final long handle, final long logNumber);
+  private native long getLogNumber(final long handle);
+  private native void setName(final long handle, final String name)
+      throws RocksDBException;
+  private native String getName(final long handle);
+  private native long getID(final long handle);
+  private native boolean isDeadlockDetect(final long handle);
+  private native WaitingTransactions getWaitingTxns(final long handle);
+  private native byte getState(final long handle);
+  private native long getId(final long handle);
+
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java
new file mode 100644
index 000000000..86f25fe15
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java
@@ -0,0 +1,403 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Database with Transaction support
+ */
+public class TransactionDB extends RocksDB
+    implements TransactionalDB<TransactionOptions> {
+
+  private TransactionDBOptions transactionDbOptions_;
+
+  /**
+   * Private constructor.
+   *
+   * @param nativeHandle The native handle of the C++ TransactionDB object
+   */
+  private TransactionDB(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * Open a TransactionDB, similar to {@link RocksDB#open(Options, String)}.
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param transactionDbOptions {@link org.rocksdb.TransactionDBOptions}
+   *     instance.
+   * @param path the path to the rocksdb.
+   *
+   * @return a {@link TransactionDB} instance on success, null if the specified
+   *     {@link TransactionDB} can not be opened.
+   *
+   * @throws RocksDBException if an error occurs whilst opening the database.
+   */
+  public static TransactionDB open(final Options options,
+      final TransactionDBOptions transactionDbOptions, final String path)
+      throws RocksDBException {
+    final TransactionDB tdb = new TransactionDB(open(options.nativeHandle_,
+        transactionDbOptions.nativeHandle_, path));
+
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+    tdb.storeOptionsInstance(options);
+    tdb.storeTransactionDbOptions(transactionDbOptions);
+
+    return tdb;
+  }
+
+  /**
+   * Open a TransactionDB, similar to
+   * {@link RocksDB#open(DBOptions, String, List, List)}.
+   *
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance.
+   * @param transactionDbOptions {@link org.rocksdb.TransactionDBOptions}
+   *     instance.
+   * @param path the path to the rocksdb.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *
+   * @return a {@link TransactionDB} instance on success, null if the specified
+   *     {@link TransactionDB} can not be opened.
+   *
+   * @throws RocksDBException if an error occurs whilst opening the database.
+   */
+  public static TransactionDB open(final DBOptions dbOptions,
+      final TransactionDBOptions transactionDbOptions,
+      final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+
+    final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][];
+    final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()];
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors
+          .get(i);
+      cfNames[i] = cfDescriptor.getName();
+      cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_;
+    }
+
+    final long[] handles = open(dbOptions.nativeHandle_,
+        transactionDbOptions.nativeHandle_, path, cfNames, cfOptionHandles);
+    final TransactionDB tdb = new TransactionDB(handles[0]);
+
+    // when non-default Options is used, keeping an Options reference
+    // in RocksDB can prevent Java to GC during the life-time of
+    // the currently-created RocksDB.
+    tdb.storeOptionsInstance(dbOptions);
+    tdb.storeTransactionDbOptions(transactionDbOptions);
+
+    for (int i = 1; i < handles.length; i++) {
+      columnFamilyHandles.add(new ColumnFamilyHandle(tdb, handles[i]));
+    }
+
+    return tdb;
+  }
+
+  /**
+   * This is similar to {@link #close()} except that it
+   * throws an exception if any error occurs.
+   *
+   * This will not fsync the WAL files.
+   * If syncing is required, the caller must first call {@link #syncWal()}
+   * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch
+   * with {@link WriteOptions#setSync(boolean)} set to true.
+   *
+   * See also {@link #close()}.
+   *
+   * @throws RocksDBException if an error occurs whilst closing.
+   */
+  public void closeE() throws RocksDBException {
+    if (owningHandle_.compareAndSet(true, false)) {
+      try {
+        closeDatabase(nativeHandle_);
+      } finally {
+        disposeInternal();
+      }
+    }
+  }
+
+  /**
+   * This is similar to {@link #closeE()} except that it
+   * silently ignores any errors.
+   *
+   * This will not fsync the WAL files.
+   * If syncing is required, the caller must first call {@link #syncWal()}
+   * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch
+   * with {@link WriteOptions#setSync(boolean)} set to true.
+   *
+   * See also {@link #close()}.
+   */
+  @Override
+  public void close() {
+    if (owningHandle_.compareAndSet(true, false)) {
+      try {
+        closeDatabase(nativeHandle_);
+      } catch (final RocksDBException e) {
+        // silently ignore the error report
+      } finally {
+        disposeInternal();
+      }
+    }
+  }
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions) {
+    return new Transaction(this, beginTransaction(nativeHandle_,
+        writeOptions.nativeHandle_));
+  }
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions,
+      final TransactionOptions transactionOptions) {
+    return new Transaction(this, beginTransaction(nativeHandle_,
+        writeOptions.nativeHandle_, transactionOptions.nativeHandle_));
+  }
+
+  // TODO(AR) consider having beingTransaction(... oldTransaction) set a
+  // reference count inside Transaction, so that we can always call
+  // Transaction#close but the object is only disposed when there are as many
+  // closes as beginTransaction. Makes the try-with-resources paradigm easier for
+  // java developers
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions,
+      final Transaction oldTransaction) {
+    final long jtxnHandle = beginTransaction_withOld(nativeHandle_,
+        writeOptions.nativeHandle_, oldTransaction.nativeHandle_);
+
+    // RocksJava relies on the assumption that
+    // we do not allocate a new Transaction object
+    // when providing an old_txn
+    assert(jtxnHandle == oldTransaction.nativeHandle_);
+
+    return oldTransaction;
+  }
+
+  @Override
+  public Transaction beginTransaction(final WriteOptions writeOptions,
+      final TransactionOptions transactionOptions,
+      final Transaction oldTransaction) {
+    final long jtxn_handle = beginTransaction_withOld(nativeHandle_,
+        writeOptions.nativeHandle_, transactionOptions.nativeHandle_,
+        oldTransaction.nativeHandle_);
+
+    // RocksJava relies on the assumption that
+    // we do not allocate a new Transaction object
+    // when providing an old_txn
+    assert(jtxn_handle == oldTransaction.nativeHandle_);
+
+    return oldTransaction;
+  }
+
+  public Transaction getTransactionByName(final String transactionName) {
+    final long jtxnHandle = getTransactionByName(nativeHandle_, transactionName);
+    if(jtxnHandle == 0) {
+      return null;
+    }
+
+    final Transaction txn = new Transaction(this, jtxnHandle);
+
+    // this instance doesn't own the underlying C++ object
+    txn.disOwnNativeHandle();
+
+    return txn;
+  }
+
+  public List<Transaction> getAllPreparedTransactions() {
+    final long[] jtxnHandles = getAllPreparedTransactions(nativeHandle_);
+
+    final List<Transaction> txns = new ArrayList<>();
+    for(final long jtxnHandle : jtxnHandles) {
+      final Transaction txn = new Transaction(this, jtxnHandle);
+
+      // this instance doesn't own the underlying C++ object
+      txn.disOwnNativeHandle();
+
+      txns.add(txn);
+    }
+    return txns;
+  }
+
+  public static class KeyLockInfo {
+    private final String key;
+    private final long[] transactionIDs;
+    private final boolean exclusive;
+
+    public KeyLockInfo(final String key, final long transactionIDs[],
+        final boolean exclusive) {
+      this.key = key;
+      this.transactionIDs = transactionIDs;
+      this.exclusive = exclusive;
+    }
+
+    /**
+     * Get the key.
+     *
+     * @return the key
+     */
+    public String getKey() {
+      return key;
+    }
+
+    /**
+     * Get the Transaction IDs.
+     *
+     * @return the Transaction IDs.
+     */
+    public long[] getTransactionIDs() {
+      return transactionIDs;
+    }
+
+    /**
+     * Get the Lock status.
+     *
+     * @return true if the lock is exclusive, false if the lock is shared.
+     */
+    public boolean isExclusive() {
+      return exclusive;
+    }
+  }
+
+  /**
+   * Returns map of all locks held.
+   *
+   * @return a map of all the locks held.
+   */
+  public Map<Long, KeyLockInfo> getLockStatusData() {
+    return getLockStatusData(nativeHandle_);
+  }
+
+  /**
+   * Called from C++ native method {@link #getDeadlockInfoBuffer(long)}
+   * to construct a DeadlockInfo object.
+   *
+   * @param transactionID The transaction id
+   * @param columnFamilyId The id of the {@link ColumnFamilyHandle}
+   * @param waitingKey the key that we are waiting on
+   * @param exclusive true if the lock is exclusive, false if the lock is shared
+   *
+   * @return The waiting transactions
+   */
+  private DeadlockInfo newDeadlockInfo(
+      final long transactionID, final long columnFamilyId,
+      final String waitingKey, final boolean exclusive) {
+    return new DeadlockInfo(transactionID, columnFamilyId,
+        waitingKey, exclusive);
+  }
+
+  public static class DeadlockInfo {
+    private final long transactionID;
+    private final long columnFamilyId;
+    private final String waitingKey;
+    private final boolean exclusive;
+
+    private DeadlockInfo(final long transactionID, final long columnFamilyId,
+      final String waitingKey, final boolean exclusive) {
+      this.transactionID = transactionID;
+      this.columnFamilyId = columnFamilyId;
+      this.waitingKey = waitingKey;
+      this.exclusive = exclusive;
+    }
+
+    /**
+     * Get the Transaction ID.
+     *
+     * @return the transaction ID
+     */
+    public long getTransactionID() {
+      return transactionID;
+    }
+
+    /**
+     * Get the Column Family ID.
+     *
+     * @return The column family ID
+     */
+    public long getColumnFamilyId() {
+      return columnFamilyId;
+    }
+
+    /**
+     * Get the key that we are waiting on.
+     *
+     * @return the key that we are waiting on
+     */
+    public String getWaitingKey() {
+      return waitingKey;
+    }
+
+    /**
+     * Get the Lock status.
+     *
+     * @return true if the lock is exclusive, false if the lock is shared.
+     */
+    public boolean isExclusive() {
+      return exclusive;
+    }
+  }
+
+  public static class DeadlockPath {
+    final DeadlockInfo[] path;
+    final boolean limitExceeded;
+
+    public DeadlockPath(final DeadlockInfo[] path, final boolean limitExceeded) {
+      this.path = path;
+      this.limitExceeded = limitExceeded;
+    }
+
+    public boolean isEmpty() {
+      return path.length == 0 && !limitExceeded;
+    }
+  }
+
+  public DeadlockPath[] getDeadlockInfoBuffer() {
+    return getDeadlockInfoBuffer(nativeHandle_);
+  }
+
+  public void setDeadlockInfoBufferSize(final int targetSize) {
+    setDeadlockInfoBufferSize(nativeHandle_, targetSize);
+  }
+
+  private void storeTransactionDbOptions(
+      final TransactionDBOptions transactionDbOptions) {
+    this.transactionDbOptions_ = transactionDbOptions;
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+
+  private static native long open(final long optionsHandle,
+      final long transactionDbOptionsHandle, final String path)
+      throws RocksDBException;
+  private static native long[] open(final long dbOptionsHandle,
+      final long transactionDbOptionsHandle, final String path,
+      final byte[][] columnFamilyNames, final long[] columnFamilyOptions);
+  private native static void closeDatabase(final long handle)
+      throws RocksDBException;
+  private native long beginTransaction(final long handle,
+      final long writeOptionsHandle);
+  private native long beginTransaction(final long handle,
+      final long writeOptionsHandle, final long transactionOptionsHandle);
+  private native long beginTransaction_withOld(final long handle,
+      final long writeOptionsHandle, final long oldTransactionHandle);
+  private native long beginTransaction_withOld(final long handle,
+      final long writeOptionsHandle, final long transactionOptionsHandle,
+      final long oldTransactionHandle);
+  private native long getTransactionByName(final long handle,
+      final String name);
+  private native long[] getAllPreparedTransactions(final long handle);
+  private native Map<Long, KeyLockInfo> getLockStatusData(
+      final long handle);
+  private native DeadlockPath[] getDeadlockInfoBuffer(final long handle);
+  private native void setDeadlockInfoBufferSize(final long handle,
+      final int targetSize);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TransactionDBOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionDBOptions.java
new file mode 100644
index 000000000..7f4296a7c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionDBOptions.java
@@ -0,0 +1,217 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class TransactionDBOptions extends RocksObject {
+
+  public TransactionDBOptions() {
+    super(newTransactionDBOptions());
+  }
+
+  /**
+   * Specifies the maximum number of keys that can be locked at the same time
+   * per column family.
+   *
+   * If the number of locked keys is greater than {@link #getMaxNumLocks()},
+   * transaction writes (or GetForUpdate) will return an error.
+   *
+   * @return The maximum number of keys that can be locked
+   */
+  public long getMaxNumLocks() {
+    assert(isOwningHandle());
+    return getMaxNumLocks(nativeHandle_);
+  }
+
+  /**
+   * Specifies the maximum number of keys that can be locked at the same time
+   * per column family.
+   *
+   * If the number of locked keys is greater than {@link #getMaxNumLocks()},
+   * transaction writes (or GetForUpdate) will return an error.
+   *
+   * @param maxNumLocks The maximum number of keys that can be locked;
+   *     If this value is not positive, no limit will be enforced.
+   *
+   * @return this TransactionDBOptions instance
+   */
+  public TransactionDBOptions setMaxNumLocks(final long maxNumLocks) {
+    assert(isOwningHandle());
+    setMaxNumLocks(nativeHandle_, maxNumLocks);
+    return this;
+  }
+
+  /**
+   * The number of sub-tables per lock table (per column family)
+   *
+   * @return The number of sub-tables
+   */
+  public long getNumStripes() {
+    assert(isOwningHandle());
+    return getNumStripes(nativeHandle_);
+  }
+
+  /**
+   * Increasing this value will increase the concurrency by dividing the lock
+   * table (per column family) into more sub-tables, each with their own
+   * separate mutex.
+   *
+   * Default: 16
+   *
+   * @param numStripes The number of sub-tables
+   *
+   * @return this TransactionDBOptions instance
+   */
+  public TransactionDBOptions setNumStripes(final long numStripes) {
+    assert(isOwningHandle());
+    setNumStripes(nativeHandle_, numStripes);
+    return this;
+  }
+
+  /**
+   * The default wait timeout in milliseconds when
+   * a transaction attempts to lock a key if not specified by
+   * {@link TransactionOptions#setLockTimeout(long)}
+   *
+   * If 0, no waiting is done if a lock cannot instantly be acquired.
+   * If negative, there is no timeout.
+   *
+   * @return the default wait timeout in milliseconds
+   */
+  public long getTransactionLockTimeout() {
+    assert(isOwningHandle());
+    return getTransactionLockTimeout(nativeHandle_);
+  }
+
+  /**
+   * If positive, specifies the default wait timeout in milliseconds when
+   * a transaction attempts to lock a key if not specified by
+   * {@link TransactionOptions#setLockTimeout(long)}
+   *
+   * If 0, no waiting is done if a lock cannot instantly be acquired.
+   * If negative, there is no timeout. Not using a timeout is not recommended
+   * as it can lead to deadlocks.  Currently, there is no deadlock-detection to
+   * recover from a deadlock.
+   *
+   * Default: 1000
+   *
+   * @param transactionLockTimeout the default wait timeout in milliseconds
+   *
+   * @return this TransactionDBOptions instance
+   */
+  public TransactionDBOptions setTransactionLockTimeout(
+      final long transactionLockTimeout) {
+    assert(isOwningHandle());
+    setTransactionLockTimeout(nativeHandle_, transactionLockTimeout);
+    return this;
+  }
+
+  /**
+   * The wait timeout in milliseconds when writing a key
+   * OUTSIDE of a transaction (ie by calling {@link RocksDB#put},
+   * {@link RocksDB#merge}, {@link RocksDB#delete} or {@link RocksDB#write}
+   * directly).
+   *
+   * If 0, no waiting is done if a lock cannot instantly be acquired.
+   * If negative, there is no timeout and will block indefinitely when acquiring
+   * a lock.
+   *
+   * @return the timeout in milliseconds when writing a key OUTSIDE of a
+   *     transaction
+   */
+  public long getDefaultLockTimeout() {
+    assert(isOwningHandle());
+    return getDefaultLockTimeout(nativeHandle_);
+  }
+
+  /**
+   * If positive, specifies the wait timeout in milliseconds when writing a key
+   * OUTSIDE of a transaction (ie by calling {@link RocksDB#put},
+   * {@link RocksDB#merge}, {@link RocksDB#delete} or {@link RocksDB#write}
+   * directly).
+   *
+   * If 0, no waiting is done if a lock cannot instantly be acquired.
+   * If negative, there is no timeout and will block indefinitely when acquiring
+   * a lock.
+   *
+   * Not using a timeout can lead to deadlocks. Currently, there
+   * is no deadlock-detection to recover from a deadlock.  While DB writes
+   * cannot deadlock with other DB writes, they can deadlock with a transaction.
+   * A negative timeout should only be used if all transactions have a small
+   * expiration set.
+   *
+   * Default: 1000
+   *
+   * @param defaultLockTimeout the timeout in milliseconds when writing a key
+   *     OUTSIDE of a transaction
+   * @return this TransactionDBOptions instance
+   */
+   public TransactionDBOptions setDefaultLockTimeout(
+       final long defaultLockTimeout) {
+     assert(isOwningHandle());
+     setDefaultLockTimeout(nativeHandle_, defaultLockTimeout);
+     return this;
+   }
+
+//  /**
+//   * If set, the {@link TransactionDB} will use this implementation of a mutex
+//   * and condition variable for all transaction locking instead of the default
+//   * mutex/condvar implementation.
+//   *
+//   * @param transactionDbMutexFactory the mutex factory for the transactions
+//   *
+//   * @return this TransactionDBOptions instance
+//   */
+//  public TransactionDBOptions setCustomMutexFactory(
+//      final TransactionDBMutexFactory transactionDbMutexFactory) {
+//
+//  }
+
+  /**
+   * The policy for when to write the data into the DB. The default policy is to
+   * write only the committed data {@link TxnDBWritePolicy#WRITE_COMMITTED}.
+   * The data could be written before the commit phase. The DB then needs to
+   * provide the mechanisms to tell apart committed from uncommitted data.
+   *
+   * @return The write policy.
+   */
+  public TxnDBWritePolicy getWritePolicy() {
+    assert(isOwningHandle());
+    return TxnDBWritePolicy.getTxnDBWritePolicy(getWritePolicy(nativeHandle_));
+  }
+
+  /**
+   * The policy for when to write the data into the DB. The default policy is to
+   * write only the committed data {@link TxnDBWritePolicy#WRITE_COMMITTED}.
+   * The data could be written before the commit phase. The DB then needs to
+   * provide the mechanisms to tell apart committed from uncommitted data.
+   *
+   * @param writePolicy The write policy.
+   *
+   * @return this TransactionDBOptions instance
+   */
+  public TransactionDBOptions setWritePolicy(
+      final TxnDBWritePolicy writePolicy) {
+    assert(isOwningHandle());
+    setWritePolicy(nativeHandle_, writePolicy.getValue());
+    return this;
+  }
+
+  private native static long newTransactionDBOptions();
+  private native long getMaxNumLocks(final long handle);
+  private native void setMaxNumLocks(final long handle,
+      final long maxNumLocks);
+  private native long getNumStripes(final long handle);
+  private native void setNumStripes(final long handle, final long numStripes);
+  private native long getTransactionLockTimeout(final long handle);
+  private native void setTransactionLockTimeout(final long handle,
+      final long transactionLockTimeout);
+  private native long getDefaultLockTimeout(final long handle);
+  private native void setDefaultLockTimeout(final long handle,
+      final long transactionLockTimeout);
+  private native byte getWritePolicy(final long handle);
+  private native void setWritePolicy(final long handle, final byte writePolicy);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java
new file mode 100644
index 000000000..5d9ec58d7
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java
@@ -0,0 +1,112 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+/**
+ * <p>A TransactionLogIterator is used to iterate over the transactions in a db.
+ * One run of the iterator is continuous, i.e. the iterator will stop at the
+ * beginning of any gap in sequences.</p>
+ */
+public class TransactionLogIterator extends RocksObject {
+
+  /**
+   * <p>An iterator is either positioned at a WriteBatch
+   * or not valid. This method returns true if the iterator
+   * is valid. Can read data from a valid iterator.</p>
+   *
+   * @return true if iterator position is valid.
+   */
+  public boolean isValid() {
+    return isValid(nativeHandle_);
+  }
+
+  /**
+   * <p>Moves the iterator to the next WriteBatch.
+   * <strong>REQUIRES</strong>: Valid() to be true.</p>
+   */
+  public void next() {
+    next(nativeHandle_);
+  }
+
+  /**
+   * <p>Throws RocksDBException if something went wrong.</p>
+   *
+   * @throws org.rocksdb.RocksDBException if something went
+   *     wrong in the underlying C++ code.
+   */
+  public void status() throws RocksDBException {
+    status(nativeHandle_);
+  }
+
+  /**
+   * <p>If iterator position is valid, return the current
+   * write_batch and the sequence number of the earliest
+   * transaction contained in the batch.</p>
+   *
+   * <p>ONLY use if Valid() is true and status() is OK.</p>
+   *
+   * @return {@link org.rocksdb.TransactionLogIterator.BatchResult}
+   *     instance.
+   */
+  public BatchResult getBatch() {
+    assert(isValid());
+    return getBatch(nativeHandle_);
+  }
+
+  /**
+   * <p>TransactionLogIterator constructor.</p>
+   *
+   * @param nativeHandle address to native address.
+   */
+  TransactionLogIterator(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  /**
+   * <p>BatchResult represents a data structure returned
+   * by a TransactionLogIterator containing a sequence
+   * number and a {@link WriteBatch} instance.</p>
+   */
+  public static final class BatchResult {
+    /**
+     * <p>Constructor of BatchResult class.</p>
+     *
+     * @param sequenceNumber related to this BatchResult instance.
+     * @param nativeHandle to {@link org.rocksdb.WriteBatch}
+     *     native instance.
+     */
+    public BatchResult(final long sequenceNumber,
+        final long nativeHandle) {
+      sequenceNumber_ = sequenceNumber;
+      writeBatch_ = new WriteBatch(nativeHandle, true);
+    }
+
+    /**
+     * <p>Return sequence number related to this BatchResult.</p>
+     *
+     * @return Sequence number.
+     */
+    public long sequenceNumber() {
+      return sequenceNumber_;
+    }
+
+    /**
+     * <p>Return contained {@link org.rocksdb.WriteBatch}
+     * instance</p>
+     *
+     * @return {@link org.rocksdb.WriteBatch} instance.
+     */
+    public WriteBatch writeBatch() {
+      return writeBatch_;
+    }
+
+    private final long sequenceNumber_;
+    private final WriteBatch writeBatch_;
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+  private native boolean isValid(long handle);
+  private native void next(long handle);
+  private native void status(long handle)
+      throws RocksDBException;
+  private native BatchResult getBatch(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TransactionOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionOptions.java
new file mode 100644
index 000000000..195fc85e4
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionOptions.java
@@ -0,0 +1,189 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class TransactionOptions extends RocksObject
+    implements TransactionalOptions<TransactionOptions> {
+
+  public TransactionOptions() {
+    super(newTransactionOptions());
+  }
+
+  @Override
+  public boolean isSetSnapshot() {
+    assert(isOwningHandle());
+    return isSetSnapshot(nativeHandle_);
+  }
+
+  @Override
+  public TransactionOptions setSetSnapshot(final boolean setSnapshot) {
+    assert(isOwningHandle());
+    setSetSnapshot(nativeHandle_, setSnapshot);
+    return this;
+  }
+
+  /**
+   * True means that before acquiring locks, this transaction will
+   * check if doing so will cause a deadlock. If so, it will return with
+   * {@link Status.Code#Busy}. The user should retry their transaction.
+   *
+   * @return true if a deadlock is detected.
+   */
+  public boolean isDeadlockDetect() {
+    assert(isOwningHandle());
+    return isDeadlockDetect(nativeHandle_);
+  }
+
+  /**
+   * Setting to true means that before acquiring locks, this transaction will
+   * check if doing so will cause a deadlock. If so, it will return with
+   * {@link Status.Code#Busy}. The user should retry their transaction.
+   *
+   * @param deadlockDetect true if we should detect deadlocks.
+   *
+   * @return this TransactionOptions instance
+   */
+  public TransactionOptions setDeadlockDetect(final boolean deadlockDetect) {
+    assert(isOwningHandle());
+    setDeadlockDetect(nativeHandle_, deadlockDetect);
+    return this;
+  }
+
+  /**
+   * The wait timeout in milliseconds when a transaction attempts to lock a key.
+   *
+   * If 0, no waiting is done if a lock cannot instantly be acquired.
+   * If negative, {@link TransactionDBOptions#getTransactionLockTimeout(long)}
+   * will be used
+   *
+   * @return the lock timeout in milliseconds
+   */
+  public long getLockTimeout() {
+    assert(isOwningHandle());
+    return getLockTimeout(nativeHandle_);
+  }
+
+  /**
+   * If positive, specifies the wait timeout in milliseconds when
+   * a transaction attempts to lock a key.
+   *
+   * If 0, no waiting is done if a lock cannot instantly be acquired.
+   * If negative, {@link TransactionDBOptions#getTransactionLockTimeout(long)}
+   * will be used
+   *
+   * Default: -1
+   *
+   * @param lockTimeout the lock timeout in milliseconds
+   *
+   * @return this TransactionOptions instance
+   */
+  public TransactionOptions setLockTimeout(final long lockTimeout) {
+    assert(isOwningHandle());
+    setLockTimeout(nativeHandle_, lockTimeout);
+    return this;
+  }
+
+  /**
+   * Expiration duration in milliseconds.
+   *
+   * If non-negative, transactions that last longer than this many milliseconds
+   * will fail to commit. If not set, a forgotten transaction that is never
+   * committed, rolled back, or deleted will never relinquish any locks it
+   * holds. This could prevent keys from being written by other writers.
+   *
+   * @return expiration the expiration duration in milliseconds
+   */
+  public long getExpiration() {
+    assert(isOwningHandle());
+    return getExpiration(nativeHandle_);
+  }
+
+  /**
+   * Expiration duration in milliseconds.
+   *
+   * If non-negative, transactions that last longer than this many milliseconds
+   * will fail to commit. If not set, a forgotten transaction that is never
+   * committed, rolled back, or deleted will never relinquish any locks it
+   * holds. This could prevent keys from being written by other writers.
+   *
+   * Default: -1
+   *
+   * @param expiration the expiration duration in milliseconds
+   *
+   * @return this TransactionOptions instance
+   */
+  public TransactionOptions setExpiration(final long expiration) {
+    assert(isOwningHandle());
+    setExpiration(nativeHandle_, expiration);
+    return this;
+  }
+
+  /**
+   * Gets the number of traversals to make during deadlock detection.
+   *
+   * @return the number of traversals to make during
+   *     deadlock detection
+   */
+  public long getDeadlockDetectDepth() {
+    return getDeadlockDetectDepth(nativeHandle_);
+  }
+
+  /**
+   * Sets the number of traversals to make during deadlock detection.
+   *
+   * Default: 50
+   *
+   * @param deadlockDetectDepth the number of traversals to make during
+   *     deadlock detection
+   *
+   * @return this TransactionOptions instance
+   */
+  public TransactionOptions setDeadlockDetectDepth(
+      final long deadlockDetectDepth) {
+    setDeadlockDetectDepth(nativeHandle_, deadlockDetectDepth);
+    return this;
+  }
+
+  /**
+   * Get the maximum number of bytes that may be used for the write batch.
+   *
+   * @return the maximum number of bytes, 0 means no limit.
+   */
+  public long getMaxWriteBatchSize() {
+    return getMaxWriteBatchSize(nativeHandle_);
+  }
+
+  /**
+   * Set the maximum number of bytes that may be used for the write batch.
+   *
+   * @param maxWriteBatchSize the maximum number of bytes, 0 means no limit.
+   *
+   * @return this TransactionOptions instance
+   */
+  public TransactionOptions setMaxWriteBatchSize(final long maxWriteBatchSize) {
+    setMaxWriteBatchSize(nativeHandle_, maxWriteBatchSize);
+    return this;
+  }
+
+  private native static long newTransactionOptions();
+  private native boolean isSetSnapshot(final long handle);
+  private native void setSetSnapshot(final long handle,
+      final boolean setSnapshot);
+  private native boolean isDeadlockDetect(final long handle);
+  private native void setDeadlockDetect(final long handle,
+      final boolean deadlockDetect);
+  private native long getLockTimeout(final long handle);
+  private native void setLockTimeout(final long handle, final long lockTimeout);
+  private native long getExpiration(final long handle);
+  private native void setExpiration(final long handle, final long expiration);
+  private native long getDeadlockDetectDepth(final long handle);
+  private native void setDeadlockDetectDepth(final long handle,
+      final long deadlockDetectDepth);
+  private native long getMaxWriteBatchSize(final long handle);
+  private native void setMaxWriteBatchSize(final long handle,
+      final long maxWriteBatchSize);
+  @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java
new file mode 100644
index 000000000..740181989
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java
@@ -0,0 +1,65 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+interface TransactionalDB<T extends TransactionalOptions<T>> extends AutoCloseable {
+  /**
+   * Starts a new Transaction.
+   *
+   * Caller is responsible for calling {@link #close()} on the returned
+   * transaction when it is no longer needed.
+   *
+   * @param writeOptions Any write options for the transaction
+   * @return a new transaction
+   */
+  Transaction beginTransaction(final WriteOptions writeOptions);
+
+  /**
+   * Starts a new Transaction.
+   *
+   * Caller is responsible for calling {@link #close()} on the returned
+   * transaction when it is no longer needed.
+   *
+   * @param writeOptions Any write options for the transaction
+   * @param transactionOptions Any options for the transaction
+   * @return a new transaction
+   */
+  Transaction beginTransaction(final WriteOptions writeOptions,
+      final T transactionOptions);
+
+  /**
+   * Starts a new Transaction.
+   *
+   * Caller is responsible for calling {@link #close()} on the returned
+   * transaction when it is no longer needed.
+   *
+   * @param writeOptions Any write options for the transaction
+   * @param oldTransaction this Transaction will be reused instead of allocating
+   *     a new one. This is an optimization to avoid extra allocations
+   *     when repeatedly creating transactions.
+   * @return The oldTransaction which has been reinitialized as a new
+   *     transaction
+   */
+  Transaction beginTransaction(final WriteOptions writeOptions,
+      final Transaction oldTransaction);
+
+  /**
+   * Starts a new Transaction.
+   *
+   * Caller is responsible for calling {@link #close()} on the returned
+   * transaction when it is no longer needed.
+   *
+   * @param writeOptions Any write options for the transaction
+   * @param transactionOptions Any options for the transaction
+   * @param oldTransaction this Transaction will be reused instead of allocating
+   *     a new one. This is an optimization to avoid extra allocations
+   *     when repeatedly creating transactions.
+   * @return The oldTransaction which has been reinitialized as a new
+   *     transaction
+   */
+  Transaction beginTransaction(final WriteOptions writeOptions,
+      final T transactionOptions, final Transaction oldTransaction);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TransactionalOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionalOptions.java
new file mode 100644
index 000000000..d55ee900c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionalOptions.java
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+
+interface TransactionalOptions<T extends TransactionalOptions<T>>
+    extends AutoCloseable {
+
+  /**
+   * True indicates snapshots will be set, just like if
+   * {@link Transaction#setSnapshot()} had been called
+   *
+   * @return whether a snapshot will be set
+   */
+  boolean isSetSnapshot();
+
+  /**
+   * Setting the setSnapshot to true is the same as calling
+   * {@link Transaction#setSnapshot()}.
+   *
+   * Default: false
+   *
+   * @param setSnapshot Whether to set a snapshot
+   *
+   * @return this TransactionalOptions instance
+   */
+  T setSetSnapshot(final boolean setSnapshot);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java b/src/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java
new file mode 100644
index 000000000..a7adaf4b2
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java
@@ -0,0 +1,245 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * Database with TTL support.
+ *
+ * <p><strong>Use case</strong></p>
+ * <p>This API should be used to open the db when key-values inserted are
+ * meant to be removed from the db in a non-strict 'ttl' amount of time
+ * Therefore, this guarantees that key-values inserted will remain in the
+ * db for &gt;= ttl amount of time and the db will make efforts to remove the
+ * key-values as soon as possible after ttl seconds of their insertion.
+ * </p>
+ *
+ * <p><strong>Behaviour</strong></p>
+ * <p>TTL is accepted in seconds
+ * (int32_t)Timestamp(creation) is suffixed to values in Put internally
+ * Expired TTL values deleted in compaction only:(Timestamp+ttl&lt;time_now)
+ * Get/Iterator may return expired entries(compaction not run on them yet)
+ * Different TTL may be used during different Opens
+ * </p>
+ *
+ * <p><strong>Example</strong></p>
+ * <ul>
+ * <li>Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2</li>
+ * <li>Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t&gt;=5</li>
+ * </ul>
+ *
+ * <p>
+ * read_only=true opens in the usual read-only mode. Compactions will not be
+ *  triggered(neither manual nor automatic), so no expired entries removed
+ * </p>
+ *
+ * <p><strong>Constraints</strong></p>
+ * <p>Not specifying/passing or non-positive TTL behaves
+ * like TTL = infinity</p>
+ *
+ * <p><strong>!!!WARNING!!!</strong></p>
+ * <p>Calling DB::Open directly to re-open a db created by this API will get
+ * corrupt values(timestamp suffixed) and no ttl effect will be there
+ * during the second Open, so use this API consistently to open the db
+ * Be careful when passing ttl with a small positive value because the
+ * whole database may be deleted in a small amount of time.</p>
+ */
+public class TtlDB extends RocksDB {
+
+  /**
+   * <p>Opens a TtlDB.</p>
+   *
+   * <p>Database is opened in read-write mode without default TTL.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param db_path path to database.
+   *
+   * @return TtlDB instance.
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public static TtlDB open(final Options options, final String db_path)
+      throws RocksDBException {
+    return open(options, db_path, 0, false);
+  }
+
+  /**
+   * <p>Opens a TtlDB.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param db_path path to database.
+   * @param ttl time to live for new entries.
+   * @param readOnly boolean value indicating if database if db is
+   *     opened read-only.
+   *
+   * @return TtlDB instance.
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public static TtlDB open(final Options options, final String db_path,
+      final int ttl, final boolean readOnly) throws RocksDBException {
+    return new TtlDB(open(options.nativeHandle_, db_path, ttl, readOnly));
+  }
+
+  /**
+   * <p>Opens a TtlDB.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param db_path path to database.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @param ttlValues time to live values per column family handle
+   * @param readOnly boolean value indicating if database if db is
+   *     opened read-only.
+   *
+   * @return TtlDB instance.
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   * @throws java.lang.IllegalArgumentException when there is not a ttl value
+   *     per given column family handle.
+   */
+  public static TtlDB open(final DBOptions options, final String db_path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles,
+      final List<Integer> ttlValues, final boolean readOnly)
+      throws RocksDBException {
+    if (columnFamilyDescriptors.size() != ttlValues.size()) {
+      throw new IllegalArgumentException("There must be a ttl value per column"
+          + " family handle.");
+    }
+
+    final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][];
+    final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()];
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      final ColumnFamilyDescriptor cfDescriptor =
+          columnFamilyDescriptors.get(i);
+      cfNames[i] = cfDescriptor.getName();
+      cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_;
+    }
+
+    final int ttlVals[] = new int[ttlValues.size()];
+    for(int i = 0; i < ttlValues.size(); i++) {
+      ttlVals[i] = ttlValues.get(i);
+    }
+    final long[] handles = openCF(options.nativeHandle_, db_path,
+            cfNames, cfOptionHandles, ttlVals, readOnly);
+
+    final TtlDB ttlDB = new TtlDB(handles[0]);
+    for (int i = 1; i < handles.length; i++) {
+      columnFamilyHandles.add(new ColumnFamilyHandle(ttlDB, handles[i]));
+    }
+    return ttlDB;
+  }
+
+  /**
+   * <p>Close the TtlDB instance and release resource.</p>
+   *
+   * This is similar to {@link #close()} except that it
+   * throws an exception if any error occurs.
+   *
+   * This will not fsync the WAL files.
+   * If syncing is required, the caller must first call {@link #syncWal()}
+   * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch
+   * with {@link WriteOptions#setSync(boolean)} set to true.
+   *
+   * See also {@link #close()}.
+   *
+   * @throws RocksDBException if an error occurs whilst closing.
+   */
+  public void closeE() throws RocksDBException {
+    if (owningHandle_.compareAndSet(true, false)) {
+      try {
+        closeDatabase(nativeHandle_);
+      } finally {
+        disposeInternal();
+      }
+    }
+  }
+
+  /**
+   * <p>Close the TtlDB instance and release resource.</p>
+   *
+   *
+   * This will not fsync the WAL files.
+   * If syncing is required, the caller must first call {@link #syncWal()}
+   * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch
+   * with {@link WriteOptions#setSync(boolean)} set to true.
+   *
+   * See also {@link #close()}.
+   */
+  @Override
+  public void close() {
+    if (owningHandle_.compareAndSet(true, false)) {
+      try {
+        closeDatabase(nativeHandle_);
+      } catch (final RocksDBException e) {
+        // silently ignore the error report
+      } finally {
+        disposeInternal();
+      }
+    }
+  }
+
+  /**
+   * <p>Creates a new ttl based column family with a name defined
+   * in given ColumnFamilyDescriptor and allocates a
+   * ColumnFamilyHandle within an internal structure.</p>
+   *
+   * <p>The ColumnFamilyHandle is automatically disposed with DB
+   * disposal.</p>
+   *
+   * @param columnFamilyDescriptor column family to be created.
+   * @param ttl TTL to set for this column family.
+   *
+   * @return {@link org.rocksdb.ColumnFamilyHandle} instance.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public ColumnFamilyHandle createColumnFamilyWithTtl(
+      final ColumnFamilyDescriptor columnFamilyDescriptor,
+      final int ttl) throws RocksDBException {
+    return new ColumnFamilyHandle(this,
+        createColumnFamilyWithTtl(nativeHandle_,
+            columnFamilyDescriptor.getName(),
+            columnFamilyDescriptor.getOptions().nativeHandle_, ttl));
+  }
+
+  /**
+   * <p>A protected constructor that will be used in the static
+   * factory method
+   * {@link #open(Options, String, int, boolean)}
+   * and
+   * {@link #open(DBOptions, String, java.util.List, java.util.List,
+   * java.util.List, boolean)}.
+   * </p>
+   *
+   * @param nativeHandle The native handle of the C++ TtlDB object
+   */
+  protected TtlDB(final long nativeHandle) {
+    super(nativeHandle);
+  }
+
+  @Override protected native void disposeInternal(final long handle);
+
+  private native static long open(final long optionsHandle,
+      final String db_path, final int ttl, final boolean readOnly)
+      throws RocksDBException;
+  private native static long[] openCF(final long optionsHandle,
+      final String db_path, final byte[][] columnFamilyNames,
+      final long[] columnFamilyOptions, final int[] ttlValues,
+      final boolean readOnly) throws RocksDBException;
+  private native long createColumnFamilyWithTtl(final long handle,
+      final byte[] columnFamilyName, final long columnFamilyOptions, int ttl)
+      throws RocksDBException;
+  private native static void closeDatabase(final long handle)
+      throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java b/src/rocksdb/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java
new file mode 100644
index 000000000..837ce6157
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+/**
+ * The transaction db write policy.
+ */
+public enum TxnDBWritePolicy {
+  /**
+   * Write only the committed data.
+   */
+  WRITE_COMMITTED((byte)0x00),
+
+  /**
+   * Write data after the prepare phase of 2pc.
+   */
+  WRITE_PREPARED((byte)0x1),
+
+  /**
+   * Write data before the prepare phase of 2pc.
+   */
+  WRITE_UNPREPARED((byte)0x2);
+
+  private byte value;
+
+  TxnDBWritePolicy(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * <p>Returns the byte value of the enumerations value.</p>
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * <p>Get the TxnDBWritePolicy enumeration value by
+   * passing the byte identifier to this method.</p>
+   *
+   * @param byteIdentifier of TxnDBWritePolicy.
+   *
+   * @return TxnDBWritePolicy instance.
+   *
+   * @throws IllegalArgumentException If TxnDBWritePolicy cannot be found for
+   *     the provided byteIdentifier
+   */
+  public static TxnDBWritePolicy getTxnDBWritePolicy(final byte byteIdentifier) {
+    for (final TxnDBWritePolicy txnDBWritePolicy : TxnDBWritePolicy.values()) {
+      if (txnDBWritePolicy.getValue() == byteIdentifier) {
+        return txnDBWritePolicy;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for TxnDBWritePolicy.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/UInt64AddOperator.java b/src/rocksdb/java/src/main/java/org/rocksdb/UInt64AddOperator.java
new file mode 100644
index 000000000..cce9b298d
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/UInt64AddOperator.java
@@ -0,0 +1,19 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Uint64AddOperator is a merge operator that accumlates a long
+ * integer value.
+ */
+public class UInt64AddOperator extends MergeOperator {
+    public UInt64AddOperator() {
+        super(newSharedUInt64AddOperator());
+    }
+
+    private native static long newSharedUInt64AddOperator();
+    @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/VectorMemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/VectorMemTableConfig.java
new file mode 100644
index 000000000..fb1e7a948
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/VectorMemTableConfig.java
@@ -0,0 +1,46 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+/**
+ * The config for vector memtable representation.
+ */
+public class VectorMemTableConfig extends MemTableConfig {
+  public static final int DEFAULT_RESERVED_SIZE = 0;
+
+  /**
+   * VectorMemTableConfig constructor
+   */
+  public VectorMemTableConfig() {
+    reservedSize_ = DEFAULT_RESERVED_SIZE;
+  }
+
+  /**
+   * Set the initial size of the vector that will be used
+   * by the memtable created based on this config.
+   *
+   * @param size the initial size of the vector.
+   * @return the reference to the current config.
+   */
+  public VectorMemTableConfig setReservedSize(final int size) {
+    reservedSize_ = size;
+    return this;
+  }
+
+  /**
+   * Returns the initial size of the vector used by the memtable
+   * created based on this config.
+   *
+   * @return the initial size of the vector.
+   */
+  public int reservedSize() {
+    return reservedSize_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(reservedSize_);
+  }
+
+  private native long newMemTableFactoryHandle(long reservedSize)
+      throws IllegalArgumentException;
+  private int reservedSize_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WALRecoveryMode.java b/src/rocksdb/java/src/main/java/org/rocksdb/WALRecoveryMode.java
new file mode 100644
index 000000000..d8b9eeced
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WALRecoveryMode.java
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The WAL Recover Mode
+ */
+public enum WALRecoveryMode {
+
+  /**
+   * Original levelDB recovery
+   *
+   * We tolerate incomplete record in trailing data on all logs
+   * Use case : This is legacy behavior (default)
+   */
+  TolerateCorruptedTailRecords((byte)0x00),
+
+  /**
+   * Recover from clean shutdown
+   *
+   * We don't expect to find any corruption in the WAL
+   * Use case : This is ideal for unit tests and rare applications that
+   * can require high consistency guarantee
+   */
+  AbsoluteConsistency((byte)0x01),
+
+  /**
+   * Recover to point-in-time consistency
+   * We stop the WAL playback on discovering WAL inconsistency
+   * Use case : Ideal for systems that have disk controller cache like
+   * hard disk, SSD without super capacitor that store related data
+   */
+  PointInTimeRecovery((byte)0x02),
+
+  /**
+   * Recovery after a disaster
+   * We ignore any corruption in the WAL and try to salvage as much data as
+   * possible
+   * Use case : Ideal for last ditch effort to recover data or systems that
+   * operate with low grade unrelated data
+   */
+  SkipAnyCorruptedRecords((byte)0x03);
+
+  private byte value;
+
+  WALRecoveryMode(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * <p>Returns the byte value of the enumerations value.</p>
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value;
+  }
+
+  /**
+   * <p>Get the WALRecoveryMode enumeration value by
+   * passing the byte identifier to this method.</p>
+   *
+   * @param byteIdentifier of WALRecoveryMode.
+   *
+   * @return WALRecoveryMode instance.
+   *
+   * @throws IllegalArgumentException If WALRecoveryMode cannot be found for the
+   *   provided byteIdentifier
+   */
+  public static WALRecoveryMode getWALRecoveryMode(final byte byteIdentifier) {
+    for (final WALRecoveryMode walRecoveryMode : WALRecoveryMode.values()) {
+      if (walRecoveryMode.getValue() == byteIdentifier) {
+        return walRecoveryMode;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for WALRecoveryMode.");
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java b/src/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java
new file mode 100644
index 000000000..ce146eb3f
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java
@@ -0,0 +1,203 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+public class WBWIRocksIterator
+    extends AbstractRocksIterator<WriteBatchWithIndex> {
+  private final WriteEntry entry = new WriteEntry();
+
+  protected WBWIRocksIterator(final WriteBatchWithIndex wbwi,
+      final long nativeHandle) {
+    super(wbwi, nativeHandle);
+  }
+
+  /**
+   * Get the current entry
+   *
+   * The WriteEntry is only valid
+   * until the iterator is repositioned.
+   * If you want to keep the WriteEntry across iterator
+   * movements, you must make a copy of its data!
+   *
+   * Note - This method is not thread-safe with respect to the WriteEntry
+   * as it performs a non-atomic update across the fields of the WriteEntry
+   *
+   * @return The WriteEntry of the current entry
+   */
+  public WriteEntry entry() {
+    assert(isOwningHandle());
+    final long[] ptrs = entry1(nativeHandle_);
+
+    entry.type = WriteType.fromId((byte)ptrs[0]);
+    entry.key.resetNativeHandle(ptrs[1], ptrs[1] != 0);
+    entry.value.resetNativeHandle(ptrs[2], ptrs[2] != 0);
+
+    return entry;
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+  @Override final native boolean isValid0(long handle);
+  @Override final native void seekToFirst0(long handle);
+  @Override final native void seekToLast0(long handle);
+  @Override final native void next0(long handle);
+  @Override final native void prev0(long handle);
+  @Override final native void refresh0(final long handle) throws RocksDBException;
+  @Override final native void seek0(long handle, byte[] target, int targetLen);
+  @Override final native void seekForPrev0(long handle, byte[] target, int targetLen);
+  @Override final native void status0(long handle) throws RocksDBException;
+  @Override
+  final native void seekDirect0(
+      final long handle, final ByteBuffer target, final int targetOffset, final int targetLen);
+  @Override
+  final native void seekForPrevDirect0(
+      final long handle, final ByteBuffer target, final int targetOffset, final int targetLen);
+  @Override
+  final native void seekByteArray0(
+      final long handle, final byte[] target, final int targetOffset, final int targetLen);
+  @Override
+  final native void seekForPrevByteArray0(
+      final long handle, final byte[] target, final int targetOffset, final int targetLen);
+
+  private native long[] entry1(final long handle);
+
+  /**
+   * Enumeration of the Write operation
+   * that created the record in the Write Batch
+   */
+  public enum WriteType {
+    PUT((byte)0x0),
+    MERGE((byte)0x1),
+    DELETE((byte)0x2),
+    SINGLE_DELETE((byte)0x3),
+    DELETE_RANGE((byte)0x4),
+    LOG((byte)0x5),
+    XID((byte)0x6);
+
+    final byte id;
+    WriteType(final byte id) {
+      this.id = id;
+    }
+
+    public static WriteType fromId(final byte id) {
+      for(final WriteType wt : WriteType.values()) {
+        if(id == wt.id) {
+          return wt;
+        }
+      }
+      throw new IllegalArgumentException("No WriteType with id=" + id);
+    }
+  }
+
+  @Override
+  public void close() {
+    entry.close();
+    super.close();
+  }
+
+  /**
+   * Represents an entry returned by
+   * {@link org.rocksdb.WBWIRocksIterator#entry()}
+   *
+   * It is worth noting that a WriteEntry with
+   * the type {@link org.rocksdb.WBWIRocksIterator.WriteType#DELETE}
+   * or {@link org.rocksdb.WBWIRocksIterator.WriteType#LOG}
+   * will not have a value.
+   */
+  public static class WriteEntry implements AutoCloseable {
+    WriteType type = null;
+    final DirectSlice key;
+    final DirectSlice value;
+
+    /**
+     * Intentionally private as this
+     * should only be instantiated in
+     * this manner by the outer WBWIRocksIterator
+     * class; The class members are then modified
+     * by calling {@link org.rocksdb.WBWIRocksIterator#entry()}
+     */
+    private WriteEntry() {
+      key = new DirectSlice();
+      value = new DirectSlice();
+    }
+
+    public WriteEntry(final WriteType type, final DirectSlice key,
+        final DirectSlice value) {
+      this.type = type;
+      this.key = key;
+      this.value = value;
+    }
+
+    /**
+     * Returns the type of the Write Entry
+     *
+     * @return the WriteType of the WriteEntry
+     */
+    public WriteType getType() {
+      return type;
+    }
+
+    /**
+     * Returns the key of the Write Entry
+     *
+     * @return The slice containing the key
+     * of the WriteEntry
+     */
+    public DirectSlice getKey() {
+      return key;
+    }
+
+    /**
+     * Returns the value of the Write Entry
+     *
+     * @return The slice containing the value of
+     * the WriteEntry or null if the WriteEntry has
+     * no value
+     */
+    public DirectSlice getValue() {
+      if(!value.isOwningHandle()) {
+        return null; //TODO(AR) migrate to JDK8 java.util.Optional#empty()
+      } else {
+        return value;
+      }
+    }
+
+    /**
+     * Generates a hash code for the Write Entry. NOTE: The hash code is based
+     * on the string representation of the key, so it may not work correctly
+     * with exotic custom comparators.
+     *
+     * @return The hash code for the Write Entry
+     */
+    @Override
+    public int hashCode() {
+      return (key == null) ? 0 : key.hashCode();
+    }
+
+    @Override
+    public boolean equals(final Object other) {
+      if(other == null) {
+        return false;
+      } else if (this == other) {
+        return true;
+      } else if(other instanceof WriteEntry) {
+        final WriteEntry otherWriteEntry = (WriteEntry)other;
+        return type.equals(otherWriteEntry.type)
+            && key.equals(otherWriteEntry.key)
+            && value.equals(otherWriteEntry.value);
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void close() {
+      value.close();
+      key.close();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WalFileType.java b/src/rocksdb/java/src/main/java/org/rocksdb/WalFileType.java
new file mode 100644
index 000000000..fed27ed11
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WalFileType.java
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public enum WalFileType {
+  /**
+   * Indicates that WAL file is in archive directory. WAL files are moved from
+   * the main db directory to archive directory once they are not live and stay
+   * there until cleaned up. Files are cleaned depending on archive size
+   * (Options::WAL_size_limit_MB) and time since last cleaning
+   * (Options::WAL_ttl_seconds).
+   */
+  kArchivedLogFile((byte)0x0),
+
+  /**
+   * Indicates that WAL file is live and resides in the main db directory
+   */
+  kAliveLogFile((byte)0x1);
+
+  private final byte value;
+
+  WalFileType(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the WalFileType from the internal representation value.
+   *
+   * @return the wal file type.
+   *
+   * @throws IllegalArgumentException if the value is unknown.
+   */
+  static WalFileType fromValue(final byte value) {
+    for (final WalFileType walFileType : WalFileType.values()) {
+      if(walFileType.value == value) {
+        return walFileType;
+      }
+    }
+
+    throw new IllegalArgumentException(
+        "Illegal value provided for WalFileType: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WalFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/WalFilter.java
new file mode 100644
index 000000000..37e36213a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WalFilter.java
@@ -0,0 +1,87 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Map;
+
+/**
+ * WALFilter allows an application to inspect write-ahead-log (WAL)
+ * records or modify their processing on recovery.
+ */
+public interface WalFilter {
+
+  /**
+   * Provide ColumnFamily-&gt;LogNumber map to filter
+   * so that filter can determine whether a log number applies to a given
+   * column family (i.e. that log hasn't been flushed to SST already for the
+   * column family).
+   *
+   * We also pass in name&gt;id map as only name is known during
+   * recovery (as handles are opened post-recovery).
+   * while write batch callbacks happen in terms of column family id.
+   *
+   * @param cfLognumber column_family_id to lognumber map
+   * @param cfNameId column_family_name to column_family_id map
+   */
+  void columnFamilyLogNumberMap(final Map<Integer, Long> cfLognumber,
+      final Map<String, Integer> cfNameId);
+
+  /**
+   * LogRecord is invoked for each log record encountered for all the logs
+   * during replay on logs on recovery. This method can be used to:
+   *     * inspect the record (using the batch parameter)
+   *     * ignoring current record
+   *         (by returning WalProcessingOption::kIgnoreCurrentRecord)
+   *     * reporting corrupted record
+   *         (by returning WalProcessingOption::kCorruptedRecord)
+   *     * stop log replay
+   *         (by returning kStop replay) - please note that this implies
+   *         discarding the logs from current record onwards.
+   *
+   * @param logNumber log number of the current log.
+   *     Filter might use this to determine if the log
+   *     record is applicable to a certain column family.
+   * @param logFileName log file name - only for informational purposes
+   * @param batch batch encountered in the log during recovery
+   * @param newBatch new batch to populate if filter wants to change
+   *     the batch (for example to filter some records out, or alter some
+   *     records). Please note that the new batch MUST NOT contain
+   *     more records than original, else recovery would be failed.
+   *
+   * @return Processing option for the current record.
+   */
+  LogRecordFoundResult logRecordFound(final long logNumber,
+      final String logFileName, final WriteBatch batch,
+      final WriteBatch newBatch);
+
+  class LogRecordFoundResult {
+    public static LogRecordFoundResult CONTINUE_UNCHANGED =
+        new LogRecordFoundResult(WalProcessingOption.CONTINUE_PROCESSING, false);
+
+    final WalProcessingOption walProcessingOption;
+    final boolean batchChanged;
+
+    /**
+     * @param walProcessingOption the processing option
+     * @param batchChanged Whether batch was changed by the filter.
+     *     It must be set to true if newBatch was populated,
+     *     else newBatch has no effect.
+     */
+    public LogRecordFoundResult(final WalProcessingOption walProcessingOption,
+        final boolean batchChanged) {
+      this.walProcessingOption = walProcessingOption;
+      this.batchChanged = batchChanged;
+    }
+  }
+
+  /**
+   * Returns a name that identifies this WAL filter.
+   * The name will be printed to LOG file on start up for diagnosis.
+   *
+   * @return the name
+   */
+  String name();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WalProcessingOption.java b/src/rocksdb/java/src/main/java/org/rocksdb/WalProcessingOption.java
new file mode 100644
index 000000000..889602edc
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WalProcessingOption.java
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public enum WalProcessingOption {
+  /**
+   * Continue processing as usual.
+   */
+  CONTINUE_PROCESSING((byte)0x0),
+
+  /**
+   * Ignore the current record but continue processing of log(s).
+   */
+  IGNORE_CURRENT_RECORD((byte)0x1),
+
+  /**
+   * Stop replay of logs and discard logs.
+   * Logs won't be replayed on subsequent recovery.
+   */
+  STOP_REPLAY((byte)0x2),
+
+  /**
+   * Corrupted record detected by filter.
+   */
+  CORRUPTED_RECORD((byte)0x3);
+
+  private final byte value;
+
+  WalProcessingOption(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation.
+   *
+   * @return the internal representation.
+   */
+  byte getValue() {
+    return value;
+  }
+
+  public static WalProcessingOption fromValue(final byte value) {
+    for (final WalProcessingOption walProcessingOption : WalProcessingOption.values()) {
+      if (walProcessingOption.value == value) {
+        return walProcessingOption;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Illegal value provided for WalProcessingOption: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java
new file mode 100644
index 000000000..9b46108d0
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java
@@ -0,0 +1,396 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * WriteBatch holds a collection of updates to apply atomically to a DB.
+ *
+ * The updates are applied in the order in which they are added
+ * to the WriteBatch.  For example, the value of "key" will be "v3"
+ * after the following batch is written:
+ *
+ *    batch.put("key", "v1");
+ *    batch.remove("key");
+ *    batch.put("key", "v2");
+ *    batch.put("key", "v3");
+ *
+ * Multiple threads can invoke const methods on a WriteBatch without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same WriteBatch must use
+ * external synchronization.
+ */
+public class WriteBatch extends AbstractWriteBatch {
+  /**
+   * Constructs a WriteBatch instance.
+   */
+  public WriteBatch() {
+    this(0);
+  }
+
+  /**
+   * Constructs a WriteBatch instance with a given size.
+   *
+   * @param reserved_bytes reserved size for WriteBatch
+   */
+  public WriteBatch(final int reserved_bytes) {
+    super(newWriteBatch(reserved_bytes));
+  }
+
+  /**
+   * Constructs a WriteBatch instance from a serialized representation
+   * as returned by {@link #data()}.
+   *
+   * @param serialized the serialized representation.
+   */
+  public WriteBatch(final byte[] serialized) {
+    super(newWriteBatch(serialized, serialized.length));
+  }
+
+  /**
+   * Support for iterating over the contents of a batch.
+   *
+   * @param handler A handler that is called back for each
+   *                update present in the batch
+   *
+   * @throws RocksDBException If we cannot iterate over the batch
+   */
+  public void iterate(final Handler handler) throws RocksDBException {
+    iterate(nativeHandle_, handler.nativeHandle_);
+  }
+
+  /**
+   * Retrieve the serialized version of this batch.
+   *
+   * @return the serialized representation of this write batch.
+   *
+   * @throws RocksDBException if an error occurs whilst retrieving
+   *   the serialized batch data.
+   */
+  public byte[] data() throws RocksDBException {
+    return data(nativeHandle_);
+  }
+
+  /**
+   * Retrieve data size of the batch.
+   *
+   * @return the serialized data size of the batch.
+   */
+  public long getDataSize() {
+    return getDataSize(nativeHandle_);
+  }
+
+  /**
+   * Returns true if Put will be called during Iterate.
+   *
+   * @return true if Put will be called during Iterate.
+   */
+  public boolean hasPut() {
+    return hasPut(nativeHandle_);
+  }
+
+  /**
+   * Returns true if Delete will be called during Iterate.
+   *
+   * @return true if Delete will be called during Iterate.
+   */
+  public boolean hasDelete() {
+    return hasDelete(nativeHandle_);
+  }
+
+  /**
+   * Returns true if SingleDelete will be called during Iterate.
+   *
+   * @return true if SingleDelete will be called during Iterate.
+   */
+  public boolean hasSingleDelete() {
+    return hasSingleDelete(nativeHandle_);
+  }
+
+  /**
+   * Returns true if DeleteRange will be called during Iterate.
+   *
+   * @return true if DeleteRange will be called during Iterate.
+   */
+  public boolean hasDeleteRange() {
+    return hasDeleteRange(nativeHandle_);
+  }
+
+  /**
+   * Returns true if Merge will be called during Iterate.
+   *
+   * @return true if Merge will be called during Iterate.
+   */
+  public boolean hasMerge() {
+    return hasMerge(nativeHandle_);
+  }
+
+  /**
+   * Returns true if MarkBeginPrepare will be called during Iterate.
+   *
+   * @return true if MarkBeginPrepare will be called during Iterate.
+   */
+  public boolean hasBeginPrepare() {
+    return hasBeginPrepare(nativeHandle_);
+  }
+
+  /**
+   * Returns true if MarkEndPrepare will be called during Iterate.
+   *
+   * @return true if MarkEndPrepare will be called during Iterate.
+   */
+  public boolean hasEndPrepare() {
+    return hasEndPrepare(nativeHandle_);
+  }
+
+  /**
+   * Returns true if MarkCommit will be called during Iterate.
+   *
+   * @return true if MarkCommit will be called during Iterate.
+   */
+  public boolean hasCommit() {
+    return hasCommit(nativeHandle_);
+  }
+
+  /**
+   * Returns true if MarkRollback will be called during Iterate.
+   *
+   * @return true if MarkRollback will be called during Iterate.
+   */
+  public boolean hasRollback() {
+    return hasRollback(nativeHandle_);
+  }
+
+  @Override
+  public WriteBatch getWriteBatch() {
+    return this;
+  }
+
+  /**
+   * Marks this point in the WriteBatch as the last record to
+   * be inserted into the WAL, provided the WAL is enabled.
+   */
+  public void markWalTerminationPoint() {
+    markWalTerminationPoint(nativeHandle_);
+  }
+
+  /**
+   * Gets the WAL termination point.
+   *
+   * See {@link #markWalTerminationPoint()}
+   *
+   * @return the WAL termination point
+   */
+  public SavePoint getWalTerminationPoint() {
+    return getWalTerminationPoint(nativeHandle_);
+  }
+
+  @Override
+  WriteBatch getWriteBatch(final long handle) {
+    return this;
+  }
+
+  /**
+   * <p>Private WriteBatch constructor which is used to construct
+   * WriteBatch instances from C++ side. As the reference to this
+   * object is also managed from C++ side the handle will be disowned.</p>
+   *
+   * @param nativeHandle address of native instance.
+   */
+  WriteBatch(final long nativeHandle) {
+    this(nativeHandle, false);
+  }
+
+  /**
+   * <p>Private WriteBatch constructor which is used to construct
+   * WriteBatch instances. </p>
+   *
+   * @param nativeHandle address of native instance.
+   * @param owningNativeHandle whether to own this reference from the C++ side or not
+   */
+  WriteBatch(final long nativeHandle, final boolean owningNativeHandle) {
+    super(nativeHandle);
+    if(!owningNativeHandle)
+      disOwnNativeHandle();
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+  @Override final native int count0(final long handle);
+  @Override final native void put(final long handle, final byte[] key,
+      final int keyLen, final byte[] value, final int valueLen);
+  @Override final native void put(final long handle, final byte[] key,
+      final int keyLen, final byte[] value, final int valueLen,
+      final long cfHandle);
+  @Override
+  final native void putDirect(final long handle, final ByteBuffer key, final int keyOffset,
+      final int keyLength, final ByteBuffer value, final int valueOffset, final int valueLength,
+      final long cfHandle);
+  @Override final native void merge(final long handle, final byte[] key,
+      final int keyLen, final byte[] value, final int valueLen);
+  @Override final native void merge(final long handle, final byte[] key,
+      final int keyLen, final byte[] value, final int valueLen,
+      final long cfHandle);
+  @Override final native void delete(final long handle, final byte[] key,
+      final int keyLen) throws RocksDBException;
+  @Override final native void delete(final long handle, final byte[] key,
+      final int keyLen, final long cfHandle) throws RocksDBException;
+  @Override final native void singleDelete(final long handle, final byte[] key,
+      final int keyLen) throws RocksDBException;
+  @Override final native void singleDelete(final long handle, final byte[] key,
+      final int keyLen, final long cfHandle) throws RocksDBException;
+  @Override
+  final native void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset,
+      final int keyLength, final long cfHandle) throws RocksDBException;
+  @Override
+  final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen,
+      final byte[] endKey, final int endKeyLen);
+  @Override
+  final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen,
+      final byte[] endKey, final int endKeyLen, final long cfHandle);
+  @Override final native void putLogData(final long handle,
+      final byte[] blob, final int blobLen) throws RocksDBException;
+  @Override final native void clear0(final long handle);
+  @Override final native void setSavePoint0(final long handle);
+  @Override final native void rollbackToSavePoint0(final long handle);
+  @Override final native void popSavePoint(final long handle) throws RocksDBException;
+  @Override final native void setMaxBytes(final long nativeHandle,
+    final long maxBytes);
+
+  private native static long newWriteBatch(final int reserved_bytes);
+  private native static long newWriteBatch(final byte[] serialized,
+      final int serializedLength);
+  private native void iterate(final long handle, final long handlerHandle)
+      throws RocksDBException;
+  private native byte[] data(final long nativeHandle) throws RocksDBException;
+  private native long getDataSize(final long nativeHandle);
+  private native boolean hasPut(final long nativeHandle);
+  private native boolean hasDelete(final long nativeHandle);
+  private native boolean hasSingleDelete(final long nativeHandle);
+  private native boolean hasDeleteRange(final long nativeHandle);
+  private native boolean hasMerge(final long nativeHandle);
+  private native boolean hasBeginPrepare(final long nativeHandle);
+  private native boolean hasEndPrepare(final long nativeHandle);
+  private native boolean hasCommit(final long nativeHandle);
+  private native boolean hasRollback(final long nativeHandle);
+  private native void markWalTerminationPoint(final long nativeHandle);
+  private native SavePoint getWalTerminationPoint(final long nativeHandle);
+
+  /**
+   * Handler callback for iterating over the contents of a batch.
+   */
+  public static abstract class Handler
+      extends RocksCallbackObject {
+    public Handler() {
+      super(null);
+    }
+
+    @Override
+    protected long initializeNative(final long... nativeParameterHandles) {
+      return createNewHandler0();
+    }
+
+    public abstract void put(final int columnFamilyId, final byte[] key,
+        final byte[] value) throws RocksDBException;
+    public abstract void put(final byte[] key, final byte[] value);
+    public abstract void merge(final int columnFamilyId, final byte[] key,
+        final byte[] value) throws RocksDBException;
+    public abstract void merge(final byte[] key, final byte[] value);
+    public abstract void delete(final int columnFamilyId, final byte[] key)
+        throws RocksDBException;
+    public abstract void delete(final byte[] key);
+    public abstract void singleDelete(final int columnFamilyId,
+        final byte[] key) throws RocksDBException;
+    public abstract void singleDelete(final byte[] key);
+    public abstract void deleteRange(final int columnFamilyId,
+        final byte[] beginKey, final byte[] endKey) throws RocksDBException;
+    public abstract void deleteRange(final byte[] beginKey,
+        final byte[] endKey);
+    public abstract void logData(final byte[] blob);
+    public abstract void putBlobIndex(final int columnFamilyId,
+        final byte[] key, final byte[] value) throws RocksDBException;
+    public abstract void markBeginPrepare() throws RocksDBException;
+    public abstract void markEndPrepare(final byte[] xid)
+        throws RocksDBException;
+    public abstract void markNoop(final boolean emptyBatch)
+        throws RocksDBException;
+    public abstract void markRollback(final byte[] xid)
+        throws RocksDBException;
+    public abstract void markCommit(final byte[] xid)
+        throws RocksDBException;
+    public abstract void markCommitWithTimestamp(final byte[] xid, final byte[] ts)
+        throws RocksDBException;
+
+    /**
+     * shouldContinue is called by the underlying iterator
+     * {@link WriteBatch#iterate(Handler)}. If it returns false,
+     * iteration is halted. Otherwise, it continues
+     * iterating. The default implementation always
+     * returns true.
+     *
+     * @return boolean value indicating if the
+     *     iteration is halted.
+     */
+    public boolean shouldContinue() {
+      return true;
+    }
+
+    private native long createNewHandler0();
+  }
+
+  /**
+   * A structure for describing the save point in the Write Batch.
+   */
+  public static class SavePoint {
+    private long size;
+    private long count;
+    private long contentFlags;
+
+    public SavePoint(final long size, final long count,
+        final long contentFlags) {
+      this.size = size;
+      this.count = count;
+      this.contentFlags = contentFlags;
+    }
+
+    public void clear() {
+      this.size = 0;
+      this.count = 0;
+      this.contentFlags = 0;
+    }
+
+    /**
+     * Get the size of the serialized representation.
+     *
+     * @return the size of the serialized representation.
+     */
+    public long getSize() {
+      return size;
+    }
+
+    /**
+     * Get the number of elements.
+     *
+     * @return the number of elements.
+     */
+    public long getCount() {
+      return count;
+    }
+
+    /**
+     * Get the content flags.
+     *
+     * @return the content flags.
+     */
+    public long getContentFlags() {
+      return contentFlags;
+    }
+
+    public boolean isCleared() {
+      return (size | count | contentFlags) == 0;
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java
new file mode 100644
index 000000000..92caa22b3
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java
@@ -0,0 +1,283 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * <p>Defines the interface for a Write Batch which
+ * holds a collection of updates to apply atomically to a DB.</p>
+ */
+public interface WriteBatchInterface {
+
+    /**
+     * Returns the number of updates in the batch.
+     *
+     * @return number of items in WriteBatch
+     */
+    int count();
+
+    /**
+     * <p>Store the mapping "key-&gt;value" in the database.</p>
+     *
+     * @param key the specified key to be inserted.
+     * @param value the value associated with the specified key.
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void put(byte[] key, byte[] value) throws RocksDBException;
+
+    /**
+     * <p>Store the mapping "key-&gt;value" within given column
+     * family.</p>
+     *
+     * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+     *     instance
+     * @param key the specified key to be inserted.
+     * @param value the value associated with the specified key.
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value)
+        throws RocksDBException;
+
+    /**
+     * <p>Store the mapping "key-&gt;value" within given column
+     * family.</p>
+     *
+     * @param key the specified key to be inserted. It is using position and limit.
+     *     Supports direct buffer only.
+     * @param value the value associated with the specified key. It is using position and limit.
+     *     Supports direct buffer only.
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBException;
+
+    /**
+     * <p>Store the mapping "key-&gt;value" within given column
+     * family.</p>
+     *
+     * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+     *     instance
+     * @param key the specified key to be inserted. It is using position and limit.
+     *     Supports direct buffer only.
+     * @param value the value associated with the specified key. It is using position and limit.
+     *     Supports direct buffer only.
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void put(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, final ByteBuffer value)
+        throws RocksDBException;
+
+    /**
+     * <p>Merge "value" with the existing value of "key" in the database.
+     * "key-&gt;merge(existing, value)"</p>
+     *
+     * @param key the specified key to be merged.
+     * @param value the value to be merged with the current value for
+     * the specified key.
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void merge(byte[] key, byte[] value) throws RocksDBException;
+
+    /**
+     * <p>Merge "value" with the existing value of "key" in given column family.
+     * "key-&gt;merge(existing, value)"</p>
+     *
+     * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+     * @param key the specified key to be merged.
+     * @param value the value to be merged with the current value for
+     * the specified key.
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value)
+        throws RocksDBException;
+
+    /**
+     * <p>If the database contains a mapping for "key", erase it.  Else do nothing.</p>
+     *
+     * @param key Key to delete within database
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void delete(byte[] key) throws RocksDBException;
+
+    /**
+     * <p>If column family contains a mapping for "key", erase it.  Else do nothing.</p>
+     *
+     * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+     * @param key Key to delete within database
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) throws RocksDBException;
+
+    /**
+     * <p>If column family contains a mapping for "key", erase it.  Else do nothing.</p>
+     *
+     * @param key Key to delete within database. It is using position and limit.
+     *     Supports direct buffer only.
+     *
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void delete(final ByteBuffer key) throws RocksDBException;
+
+    /**
+     * <p>If column family contains a mapping for "key", erase it.  Else do nothing.</p>
+     *
+     * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+     * @param key Key to delete within database. It is using position and limit.
+     *     Supports direct buffer only.
+     *
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key)
+        throws RocksDBException;
+
+    /**
+     * Remove the database entry for {@code key}. Requires that the key exists
+     * and was not overwritten. It is not an error if the key did not exist
+     * in the database.
+     *
+     * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple
+     * times), then the result of calling SingleDelete() on this key is undefined.
+     * SingleDelete() only behaves correctly if there has been only one Put()
+     * for this key since the previous call to SingleDelete() for this key.
+     *
+     * This feature is currently an experimental performance optimization
+     * for a very specific workload. It is up to the caller to ensure that
+     * SingleDelete is only used for a key that is not deleted using Delete() or
+     * written using Merge(). Mixing SingleDelete operations with Deletes and
+     * Merges can result in undefined behavior.
+     *
+     * @param key Key to delete within database
+     *
+     * @throws RocksDBException thrown if error happens in underlying
+     *     native library.
+     */
+    @Experimental("Performance optimization for a very specific workload")
+    void singleDelete(final byte[] key) throws RocksDBException;
+
+    /**
+     * Remove the database entry for {@code key}. Requires that the key exists
+     * and was not overwritten. It is not an error if the key did not exist
+     * in the database.
+     *
+     * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple
+     * times), then the result of calling SingleDelete() on this key is undefined.
+     * SingleDelete() only behaves correctly if there has been only one Put()
+     * for this key since the previous call to SingleDelete() for this key.
+     *
+     * This feature is currently an experimental performance optimization
+     * for a very specific workload. It is up to the caller to ensure that
+     * SingleDelete is only used for a key that is not deleted using Delete() or
+     * written using Merge(). Mixing SingleDelete operations with Deletes and
+     * Merges can result in undefined behavior.
+     *
+     * @param columnFamilyHandle The column family to delete the key from
+     * @param key Key to delete within database
+     *
+     * @throws RocksDBException thrown if error happens in underlying
+     *     native library.
+     */
+    @Experimental("Performance optimization for a very specific workload")
+    void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key)
+        throws RocksDBException;
+
+    /**
+     * Removes the database entries in the range ["beginKey", "endKey"), i.e.,
+     * including "beginKey" and excluding "endKey". a non-OK status on error. It
+     * is not an error if no keys exist in the range ["beginKey", "endKey").
+     *
+     * Delete the database entry (if any) for "key". Returns OK on success, and a
+     * non-OK status on error. It is not an error if "key" did not exist in the
+     * database.
+     *
+     * @param beginKey
+     *          First key to delete within database (included)
+     * @param endKey
+     *          Last key to delete within database (excluded)
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void deleteRange(byte[] beginKey, byte[] endKey) throws RocksDBException;
+
+    /**
+     * Removes the database entries in the range ["beginKey", "endKey"), i.e.,
+     * including "beginKey" and excluding "endKey". a non-OK status on error. It
+     * is not an error if no keys exist in the range ["beginKey", "endKey").
+     *
+     * Delete the database entry (if any) for "key". Returns OK on success, and a
+     * non-OK status on error. It is not an error if "key" did not exist in the
+     * database.
+     *
+     * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+     * @param beginKey
+     *          First key to delete within database (included)
+     * @param endKey
+     *          Last key to delete within database (excluded)
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, byte[] endKey)
+        throws RocksDBException;
+
+    /**
+     * Append a blob of arbitrary size to the records in this batch. The blob will
+     * be stored in the transaction log but not in any other file. In particular,
+     * it will not be persisted to the SST files. When iterating over this
+     * WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+     * of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+     * encountered in the same order in thich they were inserted. The blob will
+     * NOT consume sequence number(s) and will NOT increase the count of the batch
+     *
+     * Example application: add timestamps to the transaction log for use in
+     * replication.
+     *
+     * @param blob binary object to be inserted
+     * @throws RocksDBException thrown if error happens in underlying native library.
+     */
+    void putLogData(byte[] blob) throws RocksDBException;
+
+    /**
+     * Clear all updates buffered in this batch
+     */
+    void clear();
+
+    /**
+     * Records the state of the batch for future calls to RollbackToSavePoint().
+     * May be called multiple times to set multiple save points.
+     */
+    void setSavePoint();
+
+    /**
+     * Remove all entries in this batch (Put, Merge, Delete, PutLogData) since
+     * the most recent call to SetSavePoint() and removes the most recent save
+     * point.
+     *
+     * @throws RocksDBException if there is no previous call to SetSavePoint()
+     */
+    void rollbackToSavePoint() throws RocksDBException;
+
+    /**
+     * Pop the most recent save point.
+     *
+     * That is to say that it removes the last save point,
+     * which was set by {@link #setSavePoint()}.
+     *
+     * @throws RocksDBException If there is no previous call to
+     *     {@link #setSavePoint()}, an exception with
+     *     {@link Status.Code#NotFound} will be thrown.
+     */
+    void popSavePoint() throws RocksDBException;
+
+    /**
+     * Set the maximum size of the write batch.
+     *
+     * @param maxBytes the maximum size in bytes.
+     */
+    void setMaxBytes(long maxBytes);
+
+    /**
+     * Get the underlying Write Batch.
+     *
+     * @return the underlying WriteBatch.
+     */
+    WriteBatch getWriteBatch();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java
new file mode 100644
index 000000000..c73bd7dda
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java
@@ -0,0 +1,361 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Similar to {@link org.rocksdb.WriteBatch} but with a binary searchable
+ * index built for all the keys inserted.
+ *
+ * Calling put, merge, remove or putLogData calls the same function
+ * as with {@link org.rocksdb.WriteBatch} whilst also building an index.
+ *
+ * A user can call {@link org.rocksdb.WriteBatchWithIndex#newIterator()} to
+ * create an iterator over the write batch or
+ * {@link org.rocksdb.WriteBatchWithIndex#newIteratorWithBase(org.rocksdb.RocksIterator)}
+ * to get an iterator for the database with Read-Your-Own-Writes like capability
+ */
+public class WriteBatchWithIndex extends AbstractWriteBatch {
+  /**
+   * Creates a WriteBatchWithIndex where no bytes
+   * are reserved up-front, bytewise comparison is
+   * used for fallback key comparisons,
+   * and duplicate keys operations are retained
+   */
+  public WriteBatchWithIndex() {
+    super(newWriteBatchWithIndex());
+  }
+
+
+  /**
+   * Creates a WriteBatchWithIndex where no bytes
+   * are reserved up-front, bytewise comparison is
+   * used for fallback key comparisons, and duplicate key
+   * assignment is determined by the constructor argument
+   *
+   * @param overwriteKey if true, overwrite the key in the index when
+   *   inserting a duplicate key, in this way an iterator will never
+   *   show two entries with the same key.
+   */
+  public WriteBatchWithIndex(final boolean overwriteKey) {
+    super(newWriteBatchWithIndex(overwriteKey));
+  }
+
+  /**
+   * Creates a WriteBatchWithIndex
+   *
+   * @param fallbackIndexComparator We fallback to this comparator
+   *  to compare keys within a column family if we cannot determine
+   *  the column family and so look up it's comparator.
+   *
+   * @param reservedBytes reserved bytes in underlying WriteBatch
+   *
+   * @param overwriteKey if true, overwrite the key in the index when
+   *   inserting a duplicate key, in this way an iterator will never
+   *   show two entries with the same key.
+   */
+  public WriteBatchWithIndex(
+      final AbstractComparator
+          fallbackIndexComparator, final int reservedBytes,
+      final boolean overwriteKey) {
+    super(newWriteBatchWithIndex(fallbackIndexComparator.nativeHandle_,
+        fallbackIndexComparator.getComparatorType().getValue(), reservedBytes,
+        overwriteKey));
+  }
+
+  /**
+   * <p>Private WriteBatchWithIndex constructor which is used to construct
+   * WriteBatchWithIndex instances from C++ side. As the reference to this
+   * object is also managed from C++ side the handle will be disowned.</p>
+   *
+   * @param nativeHandle address of native instance.
+   */
+  WriteBatchWithIndex(final long nativeHandle) {
+    super(nativeHandle);
+    disOwnNativeHandle();
+  }
+
+  /**
+   * Create an iterator of a column family. User can call
+   * {@link org.rocksdb.RocksIteratorInterface#seek(byte[])} to
+   * search to the next entry of or after a key. Keys will be iterated in the
+   * order given by index_comparator. For multiple updates on the same key,
+   * each update will be returned as a separate entry, in the order of update
+   * time.
+   *
+   * @param columnFamilyHandle The column family to iterate over
+   * @return An iterator for the Write Batch contents, restricted to the column
+   * family
+   */
+  public WBWIRocksIterator newIterator(
+      final ColumnFamilyHandle columnFamilyHandle) {
+    return new WBWIRocksIterator(this, iterator1(nativeHandle_,
+            columnFamilyHandle.nativeHandle_));
+  }
+
+  /**
+   * Create an iterator of the default column family. User can call
+   * {@link org.rocksdb.RocksIteratorInterface#seek(byte[])} to
+   * search to the next entry of or after a key. Keys will be iterated in the
+   * order given by index_comparator. For multiple updates on the same key,
+   * each update will be returned as a separate entry, in the order of update
+   * time.
+   *
+   * @return An iterator for the Write Batch contents
+   */
+  public WBWIRocksIterator newIterator() {
+    return new WBWIRocksIterator(this, iterator0(nativeHandle_));
+  }
+
+  /**
+   * Provides Read-Your-Own-Writes like functionality by
+   * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator}
+   * as a delta and baseIterator as a base
+   *
+   * Updating write batch with the current key of the iterator is not safe.
+   * We strongly recommend users not to do it. It will invalidate the current
+   * key() and value() of the iterator. This invalidation happens even before
+   * the write batch update finishes. The state may recover after Next() is
+   * called.
+   *
+   * @param columnFamilyHandle The column family to iterate over
+   * @param baseIterator The base iterator,
+   *   e.g. {@link org.rocksdb.RocksDB#newIterator()}
+   * @return An iterator which shows a view comprised of both the database
+   * point-in-time from baseIterator and modifications made in this write batch.
+   */
+  public RocksIterator newIteratorWithBase(
+      final ColumnFamilyHandle columnFamilyHandle,
+      final RocksIterator baseIterator) {
+    return newIteratorWithBase(columnFamilyHandle, baseIterator, null);
+  }
+
+  /**
+   * Provides Read-Your-Own-Writes like functionality by
+   * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator}
+   * as a delta and baseIterator as a base
+   *
+   * Updating write batch with the current key of the iterator is not safe.
+   * We strongly recommend users not to do it. It will invalidate the current
+   * key() and value() of the iterator. This invalidation happens even before
+   * the write batch update finishes. The state may recover after Next() is
+   * called.
+   *
+   * @param columnFamilyHandle The column family to iterate over
+   * @param baseIterator The base iterator,
+   *   e.g. {@link org.rocksdb.RocksDB#newIterator()}
+   * @param readOptions the read options, or null
+   * @return An iterator which shows a view comprised of both the database
+   * point-in-time from baseIterator and modifications made in this write batch.
+   */
+  public RocksIterator newIteratorWithBase(final ColumnFamilyHandle columnFamilyHandle,
+      final RocksIterator baseIterator, /* @Nullable */ final ReadOptions readOptions) {
+    final RocksIterator iterator = new RocksIterator(baseIterator.parent_,
+        iteratorWithBase(nativeHandle_, columnFamilyHandle.nativeHandle_,
+            baseIterator.nativeHandle_, readOptions == null ? 0 : readOptions.nativeHandle_));
+
+    // when the iterator is deleted it will also delete the baseIterator
+    baseIterator.disOwnNativeHandle();
+
+    return iterator;
+  }
+
+  /**
+   * Provides Read-Your-Own-Writes like functionality by
+   * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator}
+   * as a delta and baseIterator as a base. Operates on the default column
+   * family.
+   *
+   * @param baseIterator The base iterator,
+   *   e.g. {@link org.rocksdb.RocksDB#newIterator()}
+   * @return An iterator which shows a view comprised of both the database
+   * point-in-timefrom baseIterator and modifications made in this write batch.
+   */
+  public RocksIterator newIteratorWithBase(final RocksIterator baseIterator) {
+    return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator, null);
+  }
+
+  /**
+   * Provides Read-Your-Own-Writes like functionality by
+   * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator}
+   * as a delta and baseIterator as a base. Operates on the default column
+   * family.
+   *
+   * @param baseIterator The base iterator,
+   *   e.g. {@link org.rocksdb.RocksDB#newIterator()}
+   * @param readOptions the read options, or null
+   * @return An iterator which shows a view comprised of both the database
+   * point-in-timefrom baseIterator and modifications made in this write batch.
+   */
+  public RocksIterator newIteratorWithBase(final RocksIterator baseIterator,
+      /* @Nullable */ final ReadOptions readOptions) {
+    return newIteratorWithBase(
+        baseIterator.parent_.getDefaultColumnFamily(), baseIterator, readOptions);
+  }
+
+  /**
+   * Similar to {@link RocksDB#get(ColumnFamilyHandle, byte[])} but will only
+   * read the key from this batch.
+   *
+   * @param columnFamilyHandle The column family to retrieve the value from
+   * @param options The database options to use
+   * @param key The key to read the value for
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException if the batch does not have enough data to resolve
+   * Merge operations, MergeInProgress status may be returned.
+   */
+  public byte[] getFromBatch(final ColumnFamilyHandle columnFamilyHandle,
+      final DBOptions options, final byte[] key) throws RocksDBException {
+    return getFromBatch(nativeHandle_, options.nativeHandle_,
+        key, key.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Similar to {@link RocksDB#get(byte[])} but will only
+   * read the key from this batch.
+   *
+   * @param options The database options to use
+   * @param key The key to read the value for
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException if the batch does not have enough data to resolve
+   * Merge operations, MergeInProgress status may be returned.
+   */
+  public byte[] getFromBatch(final DBOptions options, final byte[] key)
+      throws RocksDBException {
+    return getFromBatch(nativeHandle_, options.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Similar to {@link RocksDB#get(ColumnFamilyHandle, byte[])} but will also
+   * read writes from this batch.
+   *
+   * This function will query both this batch and the DB and then merge
+   * the results using the DB's merge operator (if the batch contains any
+   * merge requests).
+   *
+   * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is
+   * read from the DB but will NOT change which keys are read from the batch
+   * (the keys in this batch do not yet belong to any snapshot and will be
+   * fetched regardless).
+   *
+   * @param db The Rocks database
+   * @param columnFamilyHandle The column family to retrieve the value from
+   * @param options The read options to use
+   * @param key The key to read the value for
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException if the value for the key cannot be read
+   */
+  public byte[] getFromBatchAndDB(final RocksDB db, final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions options, final byte[] key) throws RocksDBException {
+    return getFromBatchAndDB(nativeHandle_, db.nativeHandle_,
+        options.nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Similar to {@link RocksDB#get(byte[])} but will also
+   * read writes from this batch.
+   *
+   * This function will query both this batch and the DB and then merge
+   * the results using the DB's merge operator (if the batch contains any
+   * merge requests).
+   *
+   * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is
+   * read from the DB but will NOT change which keys are read from the batch
+   * (the keys in this batch do not yet belong to any snapshot and will be
+   * fetched regardless).
+   *
+   * @param db The Rocks database
+   * @param options The read options to use
+   * @param key The key to read the value for
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException if the value for the key cannot be read
+   */
+  public byte[] getFromBatchAndDB(final RocksDB db, final ReadOptions options,
+      final byte[] key) throws RocksDBException {
+    return getFromBatchAndDB(nativeHandle_, db.nativeHandle_,
+        options.nativeHandle_, key, key.length);
+  }
+
+  @Override protected final native void disposeInternal(final long handle);
+  @Override final native int count0(final long handle);
+  @Override final native void put(final long handle, final byte[] key,
+      final int keyLen, final byte[] value, final int valueLen);
+  @Override final native void put(final long handle, final byte[] key,
+      final int keyLen, final byte[] value, final int valueLen,
+      final long cfHandle);
+  @Override
+  final native void putDirect(final long handle, final ByteBuffer key, final int keyOffset,
+      final int keyLength, final ByteBuffer value, final int valueOffset, final int valueLength,
+      final long cfHandle);
+  @Override final native void merge(final long handle, final byte[] key,
+      final int keyLen, final byte[] value, final int valueLen);
+  @Override final native void merge(final long handle, final byte[] key,
+      final int keyLen, final byte[] value, final int valueLen,
+      final long cfHandle);
+  @Override final native void delete(final long handle, final byte[] key,
+      final int keyLen) throws RocksDBException;
+  @Override final native void delete(final long handle, final byte[] key,
+      final int keyLen, final long cfHandle) throws RocksDBException;
+  @Override final native void singleDelete(final long handle, final byte[] key,
+      final int keyLen) throws RocksDBException;
+  @Override final native void singleDelete(final long handle, final byte[] key,
+      final int keyLen, final long cfHandle) throws RocksDBException;
+  @Override
+  final native void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset,
+      final int keyLength, final long cfHandle) throws RocksDBException;
+  // DO NOT USE - `WriteBatchWithIndex::deleteRange` is not yet supported
+  @Override
+  final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen,
+      final byte[] endKey, final int endKeyLen);
+  // DO NOT USE - `WriteBatchWithIndex::deleteRange` is not yet supported
+  @Override
+  final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen,
+      final byte[] endKey, final int endKeyLen, final long cfHandle);
+  @Override final native void putLogData(final long handle, final byte[] blob,
+      final int blobLen) throws RocksDBException;
+  @Override final native void clear0(final long handle);
+  @Override final native void setSavePoint0(final long handle);
+  @Override final native void rollbackToSavePoint0(final long handle);
+  @Override final native void popSavePoint(final long handle) throws RocksDBException;
+  @Override final native void setMaxBytes(final long nativeHandle,
+      final long maxBytes);
+  @Override final native WriteBatch getWriteBatch(final long handle);
+
+  private native static long newWriteBatchWithIndex();
+  private native static long newWriteBatchWithIndex(final boolean overwriteKey);
+  private native static long newWriteBatchWithIndex(
+      final long fallbackIndexComparatorHandle,
+      final byte comparatorType, final int reservedBytes,
+      final boolean overwriteKey);
+  private native long iterator0(final long handle);
+  private native long iterator1(final long handle, final long cfHandle);
+  private native long iteratorWithBase(final long handle, final long baseIteratorHandle,
+      final long cfHandle, final long readOptionsHandle);
+  private native byte[] getFromBatch(final long handle, final long optHandle,
+      final byte[] key, final int keyLen);
+  private native byte[] getFromBatch(final long handle, final long optHandle,
+      final byte[] key, final int keyLen, final long cfHandle);
+  private native byte[] getFromBatchAndDB(final long handle,
+      final long dbHandle,  final long readOptHandle, final byte[] key,
+      final int keyLen);
+  private native byte[] getFromBatchAndDB(final long handle,
+      final long dbHandle, final long readOptHandle, final byte[] key,
+      final int keyLen, final long cfHandle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java
new file mode 100644
index 000000000..8ec963958
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java
@@ -0,0 +1,50 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Java wrapper over native write_buffer_manager class
+ */
+public class WriteBufferManager extends RocksObject {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Construct a new instance of WriteBufferManager.
+   *
+   * Check <a href="https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager">
+   *     https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager</a>
+   * for more details on when to use it
+   *
+   * @param bufferSizeBytes buffer size(in bytes) to use for native write_buffer_manager
+   * @param cache cache whose memory should be bounded by this write buffer manager
+   * @param allowStall if set true, it will enable stalling of writes when memory_usage() exceeds
+   *     buffer_size.
+   *        It will wait for flush to complete and memory usage to drop down.
+   */
+  public WriteBufferManager(
+      final long bufferSizeBytes, final Cache cache, final boolean allowStall) {
+    super(newWriteBufferManager(bufferSizeBytes, cache.nativeHandle_, allowStall));
+    this.allowStall_ = allowStall;
+  }
+
+  public WriteBufferManager(final long bufferSizeBytes, final Cache cache){
+    this(bufferSizeBytes, cache, false);
+  }
+
+  public boolean allowStall() {
+    return allowStall_;
+  }
+
+  private native static long newWriteBufferManager(
+      final long bufferSizeBytes, final long cacheHandle, final boolean allowStall);
+
+  @Override
+  protected native void disposeInternal(final long handle);
+
+  private boolean allowStall_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java
new file mode 100644
index 000000000..5a3ffa6c5
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java
@@ -0,0 +1,256 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Options that control write operations.
+ *
+ * Note that developers should call WriteOptions.dispose() to release the
+ * c++ side memory before a WriteOptions instance runs out of scope.
+ */
+public class WriteOptions extends RocksObject {
+  /**
+   * Construct WriteOptions instance.
+   */
+  public WriteOptions() {
+    super(newWriteOptions());
+
+  }
+
+  // TODO(AR) consider ownership
+  WriteOptions(final long nativeHandle) {
+    super(nativeHandle);
+    disOwnNativeHandle();
+  }
+
+  /**
+   * Copy constructor for WriteOptions.
+   *
+   * NOTE: This does a shallow copy, which means comparator, merge_operator, compaction_filter,
+   * compaction_filter_factory and other pointers will be cloned!
+   *
+   * @param other The ColumnFamilyOptions to copy.
+   */
+  public WriteOptions(WriteOptions other) {
+    super(copyWriteOptions(other.nativeHandle_));
+  }
+
+
+  /**
+   * If true, the write will be flushed from the operating system
+   * buffer cache (by calling WritableFile::Sync()) before the write
+   * is considered complete.  If this flag is true, writes will be
+   * slower.
+   *
+   * If this flag is false, and the machine crashes, some recent
+   * writes may be lost.  Note that if it is just the process that
+   * crashes (i.e., the machine does not reboot), no writes will be
+   * lost even if sync==false.
+   *
+   * In other words, a DB write with sync==false has similar
+   * crash semantics as the "write()" system call.  A DB write
+   * with sync==true has similar crash semantics to a "write()"
+   * system call followed by "fdatasync()".
+   *
+   * Default: false
+   *
+   * @param flag a boolean flag to indicate whether a write
+   *     should be synchronized.
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setSync(final boolean flag) {
+    setSync(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * If true, the write will be flushed from the operating system
+   * buffer cache (by calling WritableFile::Sync()) before the write
+   * is considered complete.  If this flag is true, writes will be
+   * slower.
+   *
+   * If this flag is false, and the machine crashes, some recent
+   * writes may be lost.  Note that if it is just the process that
+   * crashes (i.e., the machine does not reboot), no writes will be
+   * lost even if sync==false.
+   *
+   * In other words, a DB write with sync==false has similar
+   * crash semantics as the "write()" system call.  A DB write
+   * with sync==true has similar crash semantics to a "write()"
+   * system call followed by "fdatasync()".
+   *
+   * @return boolean value indicating if sync is active.
+   */
+  public boolean sync() {
+    return sync(nativeHandle_);
+  }
+
+  /**
+   * If true, writes will not first go to the write ahead log,
+   * and the write may got lost after a crash. The backup engine
+   * relies on write-ahead logs to back up the memtable, so if
+   * you disable write-ahead logs, you must create backups with
+   * flush_before_backup=true to avoid losing unflushed memtable data.
+   *
+   * @param flag a boolean flag to specify whether to disable
+   *     write-ahead-log on writes.
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setDisableWAL(final boolean flag) {
+    setDisableWAL(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * If true, writes will not first go to the write ahead log,
+   * and the write may got lost after a crash. The backup engine
+   * relies on write-ahead logs to back up the memtable, so if
+   * you disable write-ahead logs, you must create backups with
+   * flush_before_backup=true to avoid losing unflushed memtable data.
+   *
+   * @return boolean value indicating if WAL is disabled.
+   */
+  public boolean disableWAL() {
+    return disableWAL(nativeHandle_);
+  }
+
+  /**
+   * If true and if user is trying to write to column families that don't exist
+   * (they were dropped), ignore the write (don't return an error). If there
+   * are multiple writes in a WriteBatch, other writes will succeed.
+   *
+   * Default: false
+   *
+   * @param ignoreMissingColumnFamilies true to ignore writes to column families
+   *     which don't exist
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setIgnoreMissingColumnFamilies(
+      final boolean ignoreMissingColumnFamilies) {
+    setIgnoreMissingColumnFamilies(nativeHandle_, ignoreMissingColumnFamilies);
+    return this;
+  }
+
+  /**
+   * If true and if user is trying to write to column families that don't exist
+   * (they were dropped), ignore the write (don't return an error). If there
+   * are multiple writes in a WriteBatch, other writes will succeed.
+   *
+   * Default: false
+   *
+   * @return true if writes to column families which don't exist are ignored
+   */
+  public boolean ignoreMissingColumnFamilies() {
+    return ignoreMissingColumnFamilies(nativeHandle_);
+  }
+
+  /**
+   * If true and we need to wait or sleep for the write request, fails
+   * immediately with {@link Status.Code#Incomplete}.
+   *
+   * @param noSlowdown true to fail write requests if we need to wait or sleep
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setNoSlowdown(final boolean noSlowdown) {
+    setNoSlowdown(nativeHandle_, noSlowdown);
+    return this;
+  }
+
+  /**
+   * If true and we need to wait or sleep for the write request, fails
+   * immediately with {@link Status.Code#Incomplete}.
+   *
+   * @return true when write requests are failed if we need to wait or sleep
+   */
+  public boolean noSlowdown() {
+    return noSlowdown(nativeHandle_);
+  }
+
+  /**
+   * If true, this write request is of lower priority if compaction is
+   * behind. In the case that, {@link #noSlowdown()} == true, the request
+   * will be cancelled immediately with {@link Status.Code#Incomplete} returned.
+   * Otherwise, it will be slowed down. The slowdown value is determined by
+   * RocksDB to guarantee it introduces minimum impacts to high priority writes.
+   *
+   * Default: false
+   *
+   * @param lowPri true if the write request should be of lower priority than
+   *     compactions which are behind.
+   *
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setLowPri(final boolean lowPri) {
+    setLowPri(nativeHandle_, lowPri);
+    return this;
+  }
+
+  /**
+   * Returns true if this write request is of lower priority if compaction is
+   * behind.
+   *
+   * See {@link #setLowPri(boolean)}.
+   *
+   * @return true if this write request is of lower priority, false otherwise.
+   */
+  public boolean lowPri() {
+    return lowPri(nativeHandle_);
+  }
+
+  /**
+   * If true, this writebatch will maintain the last insert positions of each
+   * memtable as hints in concurrent write. It can improve write performance
+   * in concurrent writes if keys in one writebatch are sequential. In
+   * non-concurrent writes (when {@code concurrent_memtable_writes} is false) this
+   * option will be ignored.
+   *
+   * Default: false
+   *
+   * @return true if writebatch will maintain the last insert positions of each memtable as hints in
+   *     concurrent write.
+   */
+  public boolean memtableInsertHintPerBatch() {
+    return memtableInsertHintPerBatch(nativeHandle_);
+  }
+
+  /**
+   * If true, this writebatch will maintain the last insert positions of each
+   * memtable as hints in concurrent write. It can improve write performance
+   * in concurrent writes if keys in one writebatch are sequential. In
+   * non-concurrent writes (when {@code concurrent_memtable_writes} is false) this
+   * option will be ignored.
+   *
+   * Default: false
+   *
+   * @param memtableInsertHintPerBatch true if writebatch should maintain the last insert positions
+   *     of each memtable as hints in concurrent write.
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setMemtableInsertHintPerBatch(final boolean memtableInsertHintPerBatch) {
+    setMemtableInsertHintPerBatch(nativeHandle_, memtableInsertHintPerBatch);
+    return this;
+  }
+
+  private native static long newWriteOptions();
+  private native static long copyWriteOptions(long handle);
+  @Override protected final native void disposeInternal(final long handle);
+
+  private native void setSync(long handle, boolean flag);
+  private native boolean sync(long handle);
+  private native void setDisableWAL(long handle, boolean flag);
+  private native boolean disableWAL(long handle);
+  private native void setIgnoreMissingColumnFamilies(final long handle,
+      final boolean ignoreMissingColumnFamilies);
+  private native boolean ignoreMissingColumnFamilies(final long handle);
+  private native void setNoSlowdown(final long handle,
+      final boolean noSlowdown);
+  private native boolean noSlowdown(final long handle);
+  private native void setLowPri(final long handle, final boolean lowPri);
+  private native boolean lowPri(final long handle);
+  private native boolean memtableInsertHintPerBatch(final long handle);
+  private native void setMemtableInsertHintPerBatch(
+      final long handle, final boolean memtableInsertHintPerBatch);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java
new file mode 100644
index 000000000..3bc9d4104
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java
@@ -0,0 +1,44 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public enum WriteStallCondition {
+  NORMAL((byte) 0x0),
+  DELAYED((byte) 0x1),
+  STOPPED((byte) 0x2);
+
+  private final byte value;
+
+  WriteStallCondition(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation.
+   *
+   * @return the internal representation
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the WriteStallCondition from the internal representation value.
+   *
+   * @return the flush reason.
+   *
+   * @throws IllegalArgumentException if the value is unknown.
+   */
+  static WriteStallCondition fromValue(final byte value) {
+    for (final WriteStallCondition writeStallCondition : WriteStallCondition.values()) {
+      if (writeStallCondition.value == value) {
+        return writeStallCondition;
+      }
+    }
+
+    throw new IllegalArgumentException("Illegal value provided for WriteStallCondition: " + value);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java
new file mode 100644
index 000000000..4aef0eda9
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Objects;
+
+public class WriteStallInfo {
+  private final String columnFamilyName;
+  private final WriteStallCondition currentCondition;
+  private final WriteStallCondition previousCondition;
+
+  /**
+   * Access is package private as this will only be constructed from
+   * C++ via JNI and for testing.
+   */
+  WriteStallInfo(final String columnFamilyName, final byte currentConditionValue,
+      final byte previousConditionValue) {
+    this.columnFamilyName = columnFamilyName;
+    this.currentCondition = WriteStallCondition.fromValue(currentConditionValue);
+    this.previousCondition = WriteStallCondition.fromValue(previousConditionValue);
+  }
+
+  /**
+   * Get the name of the column family.
+   *
+   * @return the name of the column family.
+   */
+  public String getColumnFamilyName() {
+    return columnFamilyName;
+  }
+
+  /**
+   * Get the current state of the write controller.
+   *
+   * @return the current state.
+   */
+  public WriteStallCondition getCurrentCondition() {
+    return currentCondition;
+  }
+
+  /**
+   * Get the previous state of the write controller.
+   *
+   * @return the previous state.
+   */
+  public WriteStallCondition getPreviousCondition() {
+    return previousCondition;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (o == null || getClass() != o.getClass())
+      return false;
+    WriteStallInfo that = (WriteStallInfo) o;
+    return Objects.equals(columnFamilyName, that.columnFamilyName)
+        && currentCondition == that.currentCondition && previousCondition == that.previousCondition;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(columnFamilyName, currentCondition, previousCondition);
+  }
+
+  @Override
+  public String toString() {
+    return "WriteStallInfo{"
+        + "columnFamilyName='" + columnFamilyName + '\'' + ", currentCondition=" + currentCondition
+        + ", previousCondition=" + previousCondition + '}';
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/util/ByteUtil.java b/src/rocksdb/java/src/main/java/org/rocksdb/util/ByteUtil.java
new file mode 100644
index 000000000..5d64d5dcf
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/util/ByteUtil.java
@@ -0,0 +1,52 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import java.nio.ByteBuffer;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class ByteUtil {
+
+  /**
+   * Convert a String to a UTF-8 byte array.
+   *
+   * @param str the string
+   *
+   * @return the byte array.
+   */
+  public static byte[] bytes(final String str) {
+    return str.getBytes(UTF_8);
+  }
+
+  /**
+   * Compares the first {@code count} bytes of two areas of memory.  Returns
+   * zero if they are the same, a value less than zero if {@code x} is
+   * lexically less than {@code y}, or a value greater than zero if {@code x}
+   * is lexically greater than {@code y}.  Note that lexical order is determined
+   * as if comparing unsigned char arrays.
+   *
+   * Similar to <a href="https://github.com/gcc-mirror/gcc/blob/master/libiberty/memcmp.c">memcmp.c</a>.
+   *
+   * @param x the first value to compare with
+   * @param y the second value to compare against
+   * @param count the number of bytes to compare
+   *
+   * @return the result of the comparison
+   */
+  public static int memcmp(final ByteBuffer x, final ByteBuffer y,
+                            final int count) {
+    for (int idx = 0; idx < count; idx++) {
+      final int aa = x.get(idx) & 0xff;
+      final int bb = y.get(idx) & 0xff;
+      if (aa != bb) {
+        return aa - bb;
+      }
+    }
+    return 0;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/util/BytewiseComparator.java b/src/rocksdb/java/src/main/java/org/rocksdb/util/BytewiseComparator.java
new file mode 100644
index 000000000..9561b0a31
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/util/BytewiseComparator.java
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import org.rocksdb.*;
+
+import java.nio.ByteBuffer;
+
+import static org.rocksdb.util.ByteUtil.memcmp;
+
+/**
+ * This is a Java Native implementation of the C++
+ * equivalent BytewiseComparatorImpl using {@link Slice}
+ *
+ * The performance of Comparators implemented in Java is always
+ * less than their C++ counterparts due to the bridging overhead,
+ * as such you likely don't want to use this apart from benchmarking
+ * and you most likely instead wanted
+ * {@link org.rocksdb.BuiltinComparator#BYTEWISE_COMPARATOR}
+ */
+public final class BytewiseComparator extends AbstractComparator {
+
+  public BytewiseComparator(final ComparatorOptions copt) {
+    super(copt);
+  }
+
+  @Override
+  public String name() {
+    return "rocksdb.java.BytewiseComparator";
+  }
+
+  @Override
+  public int compare(final ByteBuffer a, final ByteBuffer b) {
+    return _compare(a, b);
+  }
+
+  static int _compare(final ByteBuffer a, final ByteBuffer b) {
+    assert(a != null && b != null);
+    final int minLen = a.remaining() < b.remaining() ?
+        a.remaining() : b.remaining();
+    int r = memcmp(a, b, minLen);
+    if (r == 0) {
+      if (a.remaining() < b.remaining()) {
+        r = -1;
+      } else if (a.remaining() > b.remaining()) {
+        r = +1;
+      }
+    }
+    return r;
+  }
+
+  @Override
+  public void findShortestSeparator(final ByteBuffer start,
+      final ByteBuffer limit) {
+    // Find length of common prefix
+    final int minLength = Math.min(start.remaining(), limit.remaining());
+    int diffIndex = 0;
+    while (diffIndex < minLength &&
+        start.get(diffIndex) == limit.get(diffIndex)) {
+      diffIndex++;
+    }
+
+    if (diffIndex >= minLength) {
+      // Do not shorten if one string is a prefix of the other
+    } else {
+      final int startByte = start.get(diffIndex) & 0xff;
+      final int limitByte = limit.get(diffIndex) & 0xff;
+      if (startByte >= limitByte) {
+        // Cannot shorten since limit is smaller than start or start is
+        // already the shortest possible.
+        return;
+      }
+      assert(startByte < limitByte);
+
+      if (diffIndex < limit.remaining() - 1 || startByte + 1 < limitByte) {
+        start.put(diffIndex, (byte)((start.get(diffIndex) & 0xff) + 1));
+        start.limit(diffIndex + 1);
+      } else {
+        //     v
+        // A A 1 A A A
+        // A A 2
+        //
+        // Incrementing the current byte will make start bigger than limit, we
+        // will skip this byte, and find the first non 0xFF byte in start and
+        // increment it.
+        diffIndex++;
+
+        while (diffIndex < start.remaining()) {
+          // Keep moving until we find the first non 0xFF byte to
+          // increment it
+          if ((start.get(diffIndex) & 0xff) <
+              0xff) {
+            start.put(diffIndex, (byte)((start.get(diffIndex) & 0xff) + 1));
+            start.limit(diffIndex + 1);
+            break;
+          }
+          diffIndex++;
+        }
+      }
+      assert(compare(start.duplicate(), limit.duplicate()) < 0);
+    }
+  }
+
+  @Override
+  public void findShortSuccessor(final ByteBuffer key) {
+    // Find first character that can be incremented
+    final int n = key.remaining();
+    for (int i = 0; i < n; i++) {
+      final int byt = key.get(i) & 0xff;
+      if (byt != 0xff) {
+        key.put(i, (byte)(byt + 1));
+        key.limit(i+1);
+        return;
+      }
+    }
+    // *key is a run of 0xffs.  Leave it alone.
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java b/src/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java
new file mode 100644
index 000000000..9ad51c7c7
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java
@@ -0,0 +1,245 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb.util;
+
+import java.io.File;
+import java.io.IOException;
+
+public class Environment {
+  private static String OS = System.getProperty("os.name").toLowerCase();
+  private static String ARCH = System.getProperty("os.arch").toLowerCase();
+  private static String MUSL_ENVIRONMENT = System.getenv("ROCKSDB_MUSL_LIBC");
+
+  /**
+   * Will be lazily initialised by {@link #isMuslLibc()} instead of the previous static
+   * initialisation. The lazy initialisation prevents Windows from reporting suspicious behaviour of
+   * the JVM attempting IO on Unix paths.
+   */
+  private static Boolean MUSL_LIBC = null;
+
+  public static boolean isAarch64() {
+    return ARCH.contains("aarch64");
+  }
+
+  public static boolean isPowerPC() {
+    return ARCH.contains("ppc");
+  }
+
+  public static boolean isS390x() {
+    return ARCH.contains("s390x");
+  }
+
+  public static boolean isWindows() {
+    return (OS.contains("win"));
+  }
+
+  public static boolean isFreeBSD() {
+    return (OS.contains("freebsd"));
+  }
+
+  public static boolean isMac() {
+    return (OS.contains("mac"));
+  }
+
+  public static boolean isAix() {
+    return OS.contains("aix");
+  }
+  
+  public static boolean isUnix() {
+    return OS.contains("nix") ||
+        OS.contains("nux");
+  }
+
+  /**
+   * Determine if the environment has a musl libc.
+   *
+   * @return true if the environment has a musl libc, false otherwise.
+   */
+  public static boolean isMuslLibc() {
+    if (MUSL_LIBC == null) {
+      MUSL_LIBC = initIsMuslLibc();
+    }
+    return MUSL_LIBC;
+  }
+
+  /**
+   * Determine if the environment has a musl libc.
+   *
+   * The initialisation counterpart of {@link #isMuslLibc()}.
+   *
+   * Intentionally package-private for testing.
+   *
+   * @return true if the environment has a musl libc, false otherwise.
+   */
+  static boolean initIsMuslLibc() {
+    // consider explicit user setting from environment first
+    if ("true".equalsIgnoreCase(MUSL_ENVIRONMENT)) {
+      return true;
+    }
+    if ("false".equalsIgnoreCase(MUSL_ENVIRONMENT)) {
+      return false;
+    }
+
+    // check if ldd indicates a muslc lib
+    try {
+      final Process p =
+          new ProcessBuilder("/usr/bin/env", "sh", "-c", "ldd /usr/bin/env | grep -q musl").start();
+      if (p.waitFor() == 0) {
+        return true;
+      }
+    } catch (final IOException | InterruptedException e) {
+      // do nothing, and move on to the next check
+    }
+
+    final File lib = new File("/lib");
+    if (lib.exists() && lib.isDirectory() && lib.canRead()) {
+      // attempt the most likely musl libc name first
+      final String possibleMuslcLibName;
+      if (isPowerPC()) {
+        possibleMuslcLibName = "libc.musl-ppc64le.so.1";
+      } else if (isAarch64()) {
+        possibleMuslcLibName = "libc.musl-aarch64.so.1";
+      } else if (isS390x()) {
+        possibleMuslcLibName = "libc.musl-s390x.so.1";
+      } else {
+        possibleMuslcLibName = "libc.musl-x86_64.so.1";
+      }
+      final File possibleMuslcLib = new File(lib, possibleMuslcLibName);
+      if (possibleMuslcLib.exists() && possibleMuslcLib.canRead()) {
+        return true;
+      }
+
+      // fallback to scanning for a musl libc
+      final File[] libFiles = lib.listFiles();
+      if (libFiles == null) {
+        return false;
+      }
+      for (final File f : libFiles) {
+        if (f.getName().startsWith("libc.musl")) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+  public static boolean isSolaris() {
+    return OS.contains("sunos");
+  }
+
+  public static boolean isOpenBSD() {
+    return (OS.contains("openbsd"));
+  }
+
+  public static boolean is64Bit() {
+    if (ARCH.indexOf("sparcv9") >= 0) {
+      return true;
+    }
+    return (ARCH.indexOf("64") > 0);
+  }
+
+  public static String getSharedLibraryName(final String name) {
+    return name + "jni";
+  }
+
+  public static String getSharedLibraryFileName(final String name) {
+    return appendLibOsSuffix("lib" + getSharedLibraryName(name), true);
+  }
+
+  /**
+   * Get the name of the libc implementation
+   *
+   * @return the name of the implementation,
+   *    or null if the default for that platform (e.g. glibc on Linux).
+   */
+  public static /* @Nullable */ String getLibcName() {
+    if (isMuslLibc()) {
+      return "musl";
+    } else {
+      return null;
+    }
+  }
+
+  private static String getLibcPostfix() {
+    final String libcName = getLibcName();
+    if (libcName == null) {
+      return "";
+    }
+    return "-" + libcName;
+  }
+
+  public static String getJniLibraryName(final String name) {
+    if (isUnix()) {
+      final String arch = is64Bit() ? "64" : "32";
+      if (isPowerPC() || isAarch64()) {
+        return String.format("%sjni-linux-%s%s", name, ARCH, getLibcPostfix());
+      } else if (isS390x()) {
+        return String.format("%sjni-linux-%s", name, ARCH);
+      } else {
+        return String.format("%sjni-linux%s%s", name, arch, getLibcPostfix());
+      }
+    } else if (isMac()) {
+      if (is64Bit()) {
+        final String arch;
+        if (isAarch64()) {
+          arch = "arm64";
+        } else {
+          arch = "x86_64";
+        }
+        return String.format("%sjni-osx-%s", name, arch);
+      } else {
+        return String.format("%sjni-osx", name);
+      }
+    } else if (isFreeBSD()) {
+      return String.format("%sjni-freebsd%s", name, is64Bit() ? "64" : "32");
+    } else if (isAix() && is64Bit()) {
+      return String.format("%sjni-aix64", name);
+    } else if (isSolaris()) {
+      final String arch = is64Bit() ? "64" : "32";
+      return String.format("%sjni-solaris%s", name, arch);
+    } else if (isWindows() && is64Bit()) {
+      return String.format("%sjni-win64", name);
+    } else if (isOpenBSD()) {
+      return String.format("%sjni-openbsd%s", name, is64Bit() ? "64" : "32");
+    }
+
+    throw new UnsupportedOperationException(String.format("Cannot determine JNI library name for ARCH='%s' OS='%s' name='%s'", ARCH, OS, name));
+  }
+
+  public static /*@Nullable*/ String getFallbackJniLibraryName(final String name) {
+    if (isMac() && is64Bit()) {
+      return String.format("%sjni-osx", name);
+    }
+    return null;
+  }
+
+  public static String getJniLibraryFileName(final String name) {
+    return appendLibOsSuffix("lib" + getJniLibraryName(name), false);
+  }
+
+  public static /*@Nullable*/ String getFallbackJniLibraryFileName(final String name) {
+    final String fallbackJniLibraryName = getFallbackJniLibraryName(name);
+    if (fallbackJniLibraryName == null) {
+      return null;
+    }
+    return appendLibOsSuffix("lib" + fallbackJniLibraryName, false);
+  }
+
+  private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) {
+    if (isUnix() || isAix() || isSolaris() || isFreeBSD() || isOpenBSD()) {
+      return libraryFileName + ".so";
+    } else if (isMac()) {
+      return libraryFileName + (shared ? ".dylib" : ".jnilib");
+    } else if (isWindows()) {
+      return libraryFileName + ".dll";
+    }
+    throw new UnsupportedOperationException();
+  }
+
+  public static String getJniLibraryExtension() {
+    if (isWindows()) {
+      return ".dll";
+    }
+    return (isMac()) ? ".jnilib" : ".so";
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/util/IntComparator.java b/src/rocksdb/java/src/main/java/org/rocksdb/util/IntComparator.java
new file mode 100644
index 000000000..cc096cd14
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/util/IntComparator.java
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import org.rocksdb.AbstractComparator;
+import org.rocksdb.ComparatorOptions;
+
+import java.nio.ByteBuffer;
+
+/**
+ * This is a Java implementation of a Comparator for Java int
+ * keys.
+ *
+ * This comparator assumes keys are (at least) four bytes, so
+ * the caller must guarantee that in accessing other APIs in
+ * combination with this comparator.
+ *
+ * The performance of Comparators implemented in Java is always
+ * less than their C++ counterparts due to the bridging overhead,
+ * as such you likely don't want to use this apart from benchmarking
+ * or testing.
+ */
+public final class IntComparator extends AbstractComparator {
+
+  public IntComparator(final ComparatorOptions copt) {
+    super(copt);
+  }
+
+  @Override
+  public String name() {
+    return "rocksdb.java.IntComparator";
+  }
+
+  @Override
+  public int compare(final ByteBuffer a, final ByteBuffer b) {
+    return compareIntKeys(a, b);
+  }
+
+  /**
+   * Compares integer keys
+   * so that they are in ascending order
+   *
+   * @param a 4-bytes representing an integer key
+   * @param b 4-bytes representing an integer key
+   *
+   * @return negative if a &lt; b, 0 if a == b, positive otherwise
+   */
+  private final int compareIntKeys(final ByteBuffer a, final ByteBuffer b) {
+    final int iA = a.getInt();
+    final int iB = b.getInt();
+
+    // protect against int key calculation overflow
+    final long diff = (long)iA - iB;
+    final int result;
+    if (diff < Integer.MIN_VALUE) {
+      result = Integer.MIN_VALUE;
+    } else if(diff > Integer.MAX_VALUE) {
+      result = Integer.MAX_VALUE;
+    } else {
+      result = (int)diff;
+    }
+    return result;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java b/src/rocksdb/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java
new file mode 100644
index 000000000..4c06f80aa
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java
@@ -0,0 +1,88 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import org.rocksdb.AbstractComparator;
+import org.rocksdb.BuiltinComparator;
+import org.rocksdb.ComparatorOptions;
+import org.rocksdb.Slice;
+
+import java.nio.ByteBuffer;
+
+/**
+ * This is a Java Native implementation of the C++
+ * equivalent ReverseBytewiseComparatorImpl using {@link Slice}
+ *
+ * The performance of Comparators implemented in Java is always
+ * less than their C++ counterparts due to the bridging overhead,
+ * as such you likely don't want to use this apart from benchmarking
+ * and you most likely instead wanted
+ * {@link BuiltinComparator#REVERSE_BYTEWISE_COMPARATOR}
+ */
+public final class ReverseBytewiseComparator extends AbstractComparator {
+
+  public ReverseBytewiseComparator(final ComparatorOptions copt) {
+    super(copt);
+  }
+
+  @Override
+  public String name() {
+    return "rocksdb.java.ReverseBytewiseComparator";
+  }
+
+  @Override
+  public int compare(final ByteBuffer a, final ByteBuffer b) {
+    return -BytewiseComparator._compare(a, b);
+  }
+
+  @Override
+  public void findShortestSeparator(final ByteBuffer start,
+      final ByteBuffer limit) {
+    // Find length of common prefix
+    final int minLength = Math.min(start.remaining(), limit.remaining());
+    int diffIndex = 0;
+    while (diffIndex < minLength &&
+        start.get(diffIndex) == limit.get(diffIndex)) {
+      diffIndex++;
+    }
+
+    assert(diffIndex <= minLength);
+    if (diffIndex == minLength) {
+      // Do not shorten if one string is a prefix of the other
+      //
+      // We could handle cases like:
+      //     V
+      // A A 2 X Y
+      // A A 2
+      // in a similar way as BytewiseComparator::FindShortestSeparator().
+      // We keep it simple by not implementing it. We can come back to it
+      // later when needed.
+    } else {
+      final int startByte = start.get(diffIndex) & 0xff;
+      final int limitByte = limit.get(diffIndex) & 0xff;
+      if (startByte > limitByte && diffIndex < start.remaining() - 1) {
+        // Case like
+        //     V
+        // A A 3 A A
+        // A A 1 B B
+        //
+        // or
+        //     v
+        // A A 2 A A
+        // A A 1 B B
+        // In this case "AA2" will be good.
+//#ifndef NDEBUG
+//        std::string old_start = *start;
+//#endif
+        start.limit(diffIndex + 1);
+//#ifndef NDEBUG
+//        assert(old_start >= *start);
+//#endif
+        assert(BytewiseComparator._compare(start.duplicate(), limit.duplicate()) > 0);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java b/src/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java
new file mode 100644
index 000000000..0f717e8d4
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+public class SizeUnit {
+  public static final long KB = 1024L;
+  public static final long MB = KB * KB;
+  public static final long GB = KB * MB;
+  public static final long TB = KB * GB;
+  public static final long PB = KB * TB;
+
+  private SizeUnit() {}
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/AbstractTransactionTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/AbstractTransactionTest.java
new file mode 100644
index 000000000..46685f9fd
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/AbstractTransactionTest.java
@@ -0,0 +1,965 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.fail;
+
+/**
+ * Base class of {@link TransactionTest} and {@link OptimisticTransactionTest}
+ */
+public abstract class AbstractTransactionTest {
+
+  protected final static byte[] TXN_TEST_COLUMN_FAMILY = "txn_test_cf"
+      .getBytes();
+
+  protected static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  public abstract DBContainer startDb()
+      throws RocksDBException;
+
+  @Test
+  public void setSnapshot() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.setSnapshot();
+    }
+  }
+
+  @Test
+  public void setSnapshotOnNextOperation() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.setSnapshotOnNextOperation();
+      txn.put("key1".getBytes(), "value1".getBytes());
+    }
+  }
+
+  @Test
+  public void setSnapshotOnNextOperation_transactionNotifier() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+
+      try(final TestTransactionNotifier notifier = new TestTransactionNotifier()) {
+        txn.setSnapshotOnNextOperation(notifier);
+        txn.put("key1".getBytes(), "value1".getBytes());
+
+        txn.setSnapshotOnNextOperation(notifier);
+        txn.put("key2".getBytes(), "value2".getBytes());
+
+        assertThat(notifier.getCreatedSnapshots().size()).isEqualTo(2);
+      }
+    }
+  }
+
+  @Test
+  public void getSnapshot() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.setSnapshot();
+      final Snapshot snapshot = txn.getSnapshot();
+      assertThat(snapshot.isOwningHandle()).isFalse();
+    }
+  }
+
+  @Test
+  public void getSnapshot_null() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final Snapshot snapshot = txn.getSnapshot();
+      assertThat(snapshot).isNull();
+    }
+  }
+
+  @Test
+  public void clearSnapshot() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.setSnapshot();
+      txn.clearSnapshot();
+    }
+  }
+
+  @Test
+  public void clearSnapshot_none() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.clearSnapshot();
+    }
+  }
+
+  @Test
+  public void commit() throws RocksDBException {
+    final byte k1[] = "rollback-key1".getBytes(UTF_8);
+    final byte v1[] = "rollback-value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb()) {
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        txn.commit();
+      }
+
+      try(final ReadOptions readOptions = new ReadOptions();
+          final Transaction txn2 = dbContainer.beginTransaction()) {
+        assertThat(txn2.get(readOptions, k1)).isEqualTo(v1);
+      }
+    }
+  }
+
+  @Test
+  public void rollback() throws RocksDBException {
+    final byte k1[] = "rollback-key1".getBytes(UTF_8);
+    final byte v1[] = "rollback-value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb()) {
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        txn.rollback();
+      }
+
+      try(final ReadOptions readOptions = new ReadOptions();
+          final Transaction txn2 = dbContainer.beginTransaction()) {
+        assertThat(txn2.get(readOptions, k1)).isNull();
+      }
+    }
+  }
+
+  @Test
+  public void savePoint() throws RocksDBException {
+    final byte k1[] = "savePoint-key1".getBytes(UTF_8);
+    final byte v1[] = "savePoint-value1".getBytes(UTF_8);
+    final byte k2[] = "savePoint-key2".getBytes(UTF_8);
+    final byte v2[] = "savePoint-value2".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+
+        txn.setSavePoint();
+
+        txn.put(k2, v2);
+
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+        assertThat(txn.get(readOptions, k2)).isEqualTo(v2);
+
+        txn.rollbackToSavePoint();
+
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+        assertThat(txn.get(readOptions, k2)).isNull();
+
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        assertThat(txn2.get(readOptions, k1)).isEqualTo(v1);
+        assertThat(txn2.get(readOptions, k2)).isNull();
+      }
+    }
+  }
+
+  @Test
+  public void getPut_cf() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      assertThat(txn.get(testCf, readOptions, k1)).isNull();
+      txn.put(testCf, k1, v1);
+      assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1);
+    }
+  }
+
+  @Test
+  public void getPut() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.get(readOptions, k1)).isNull();
+      txn.put(k1, v1);
+      assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+    }
+  }
+
+  @Test
+  public void multiGetPut_cf() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf);
+
+      assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(new byte[][] { null, null });
+
+      txn.put(testCf, keys[0], values[0]);
+      txn.put(testCf, keys[1], values[1]);
+      assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values);
+    }
+  }
+
+  @Test
+  public void multiGetPutAsList_cf() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions();
+         final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf);
+
+      assertThat(txn.multiGetAsList(readOptions, cfList, Arrays.asList(keys)))
+          .containsExactly(null, null);
+
+      txn.put(testCf, keys[0], values[0]);
+      txn.put(testCf, keys[1], values[1]);
+      assertThat(txn.multiGetAsList(readOptions, cfList, Arrays.asList(keys)))
+          .containsExactly(values);
+    }
+  }
+
+  @Test
+  public void multiGetPut() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+
+      assertThat(txn.multiGet(readOptions, keys)).isEqualTo(new byte[][] { null, null });
+
+      txn.put(keys[0], values[0]);
+      txn.put(keys[1], values[1]);
+      assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values);
+    }
+  }
+
+  @Test
+  public void multiGetPutAsList() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions();
+         final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.multiGetAsList(readOptions, Arrays.asList(keys))).containsExactly(null, null);
+
+      txn.put(keys[0], values[0]);
+      txn.put(keys[1], values[1]);
+      assertThat(txn.multiGetAsList(readOptions, Arrays.asList(keys))).containsExactly(values);
+    }
+  }
+
+  @Test
+  public void getForUpdate_cf() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isNull();
+      txn.put(testCf, k1, v1);
+      assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1);
+    }
+  }
+
+  @Test
+  public void getForUpdate() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.getForUpdate(readOptions, k1, true)).isNull();
+      txn.put(k1, v1);
+      assertThat(txn.getForUpdate(readOptions, k1, true)).isEqualTo(v1);
+    }
+  }
+
+  @Test
+  public void multiGetForUpdate_cf() throws RocksDBException {
+    final byte keys[][] = new byte[][] {
+        "key1".getBytes(UTF_8),
+        "key2".getBytes(UTF_8)};
+    final byte values[][] = new byte[][] {
+        "value1".getBytes(UTF_8),
+        "value2".getBytes(UTF_8)};
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf);
+
+      assertThat(txn.multiGetForUpdate(readOptions, cfList, keys))
+          .isEqualTo(new byte[][] { null, null });
+
+      txn.put(testCf, keys[0], values[0]);
+      txn.put(testCf, keys[1], values[1]);
+      assertThat(txn.multiGetForUpdate(readOptions, cfList, keys))
+          .isEqualTo(values);
+    }
+  }
+
+  @Test
+  public void multiGetForUpdate() throws RocksDBException {
+    final byte keys[][] = new byte[][]{
+        "key1".getBytes(UTF_8),
+        "key2".getBytes(UTF_8)};
+    final byte values[][] = new byte[][]{
+        "value1".getBytes(UTF_8),
+        "value2".getBytes(UTF_8)};
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions();
+         final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.multiGetForUpdate(readOptions, keys)).isEqualTo(new byte[][]{null, null});
+
+      txn.put(keys[0], values[0]);
+      txn.put(keys[1], values[1]);
+      assertThat(txn.multiGetForUpdate(readOptions, keys)).isEqualTo(values);
+    }
+  }
+
+  @Test
+  public void getIterator() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+
+      final byte[] k1 = "key1".getBytes(UTF_8);
+      final byte[] v1 = "value1".getBytes(UTF_8);
+
+      txn.put(k1, v1);
+
+      try(final RocksIterator iterator = txn.getIterator(readOptions)) {
+        iterator.seek(k1);
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo(k1);
+        assertThat(iterator.value()).isEqualTo(v1);
+      }
+    }
+  }
+
+  @Test
+  public void getIterator_cf() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+
+      final byte[] k1 = "key1".getBytes(UTF_8);
+      final byte[] v1 = "value1".getBytes(UTF_8);
+
+      txn.put(testCf, k1, v1);
+
+      try(final RocksIterator iterator = txn.getIterator(readOptions, testCf)) {
+        iterator.seek(k1);
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo(k1);
+        assertThat(iterator.value()).isEqualTo(v1);
+      }
+    }
+  }
+
+  @Test
+  public void merge_cf() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      txn.merge(testCf, k1, v1);
+    }
+  }
+
+  @Test
+  public void merge() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.merge(k1, v1);
+    }
+  }
+
+
+  @Test
+  public void delete_cf() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      txn.put(testCf, k1, v1);
+      assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1);
+
+      txn.delete(testCf, k1);
+      assertThat(txn.get(testCf, readOptions, k1)).isNull();
+    }
+  }
+
+  @Test
+  public void delete() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.put(k1, v1);
+      assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+
+      txn.delete(k1);
+      assertThat(txn.get(readOptions, k1)).isNull();
+    }
+  }
+
+  @Test
+  public void delete_parts_cf() throws RocksDBException {
+    final byte keyParts[][] = new byte[][] {
+        "ke".getBytes(UTF_8),
+        "y1".getBytes(UTF_8)};
+    final byte valueParts[][] = new byte[][] {
+        "val".getBytes(UTF_8),
+        "ue1".getBytes(UTF_8)};
+    final byte[] key = concat(keyParts);
+    final byte[] value = concat(valueParts);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      txn.put(testCf, keyParts, valueParts);
+      assertThat(txn.get(testCf, readOptions, key)).isEqualTo(value);
+
+      txn.delete(testCf, keyParts);
+
+      assertThat(txn.get(testCf, readOptions, key))
+          .isNull();
+    }
+  }
+
+  @Test
+  public void delete_parts() throws RocksDBException {
+    final byte keyParts[][] = new byte[][] {
+        "ke".getBytes(UTF_8),
+        "y1".getBytes(UTF_8)};
+    final byte valueParts[][] = new byte[][] {
+        "val".getBytes(UTF_8),
+        "ue1".getBytes(UTF_8)};
+    final byte[] key = concat(keyParts);
+    final byte[] value = concat(valueParts);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+
+      txn.put(keyParts, valueParts);
+
+      assertThat(txn.get(readOptions, key)).isEqualTo(value);
+
+      txn.delete(keyParts);
+
+      assertThat(txn.get(readOptions, key)).isNull();
+    }
+  }
+
+  @Test
+  public void getPutUntracked_cf() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      assertThat(txn.get(testCf, readOptions, k1)).isNull();
+      txn.putUntracked(testCf, k1, v1);
+      assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1);
+    }
+  }
+
+  @Test
+  public void getPutUntracked() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.get(readOptions, k1)).isNull();
+      txn.putUntracked(k1, v1);
+      assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+    }
+  }
+
+  @Deprecated
+  @Test
+  public void multiGetPutUntracked_cf() throws RocksDBException {
+    final byte keys[][] = new byte[][] {
+        "key1".getBytes(UTF_8),
+        "key2".getBytes(UTF_8)};
+    final byte values[][] = new byte[][] {
+        "value1".getBytes(UTF_8),
+        "value2".getBytes(UTF_8)};
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+
+      final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf);
+
+      assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(new byte[][] { null, null });
+      txn.putUntracked(testCf, keys[0], values[0]);
+      txn.putUntracked(testCf, keys[1], values[1]);
+      assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values);
+    }
+  }
+
+  @Test
+  public void multiGetPutUntrackedAsList_cf() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions();
+         final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+
+      final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf);
+
+      assertThat(txn.multiGetAsList(readOptions, cfList, Arrays.asList(keys)))
+          .containsExactly(null, null);
+      txn.putUntracked(testCf, keys[0], values[0]);
+      txn.putUntracked(testCf, keys[1], values[1]);
+      assertThat(txn.multiGetAsList(readOptions, cfList, Arrays.asList(keys)))
+          .containsExactly(values);
+    }
+  }
+
+  @Deprecated
+  @Test
+  public void multiGetPutUntracked() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+
+      assertThat(txn.multiGet(readOptions, keys)).isEqualTo(new byte[][] { null, null });
+      txn.putUntracked(keys[0], values[0]);
+      txn.putUntracked(keys[1], values[1]);
+      assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values);
+    }
+  }
+
+  @Test
+  public void multiGetPutAsListUntracked() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions();
+         final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.multiGetAsList(readOptions, Arrays.asList(keys))).containsExactly(null, null);
+      txn.putUntracked(keys[0], values[0]);
+      txn.putUntracked(keys[1], values[1]);
+      assertThat(txn.multiGetAsList(readOptions, Arrays.asList(keys))).containsExactly(values);
+    }
+  }
+
+  @Test
+  public void mergeUntracked_cf() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      txn.mergeUntracked(testCf, k1, v1);
+    }
+  }
+
+  @Test
+  public void mergeUntracked() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.mergeUntracked(k1, v1);
+    }
+  }
+
+  @Test
+  public void deleteUntracked_cf() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      txn.put(testCf, k1, v1);
+      assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1);
+
+      txn.deleteUntracked(testCf, k1);
+      assertThat(txn.get(testCf, readOptions, k1)).isNull();
+    }
+  }
+
+  @Test
+  public void deleteUntracked() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.put(k1, v1);
+      assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+
+      txn.deleteUntracked(k1);
+      assertThat(txn.get(readOptions, k1)).isNull();
+    }
+  }
+
+  @Test
+  public void deleteUntracked_parts_cf() throws RocksDBException {
+    final byte keyParts[][] = new byte[][] {
+        "ke".getBytes(UTF_8),
+        "y1".getBytes(UTF_8)};
+    final byte valueParts[][] = new byte[][] {
+        "val".getBytes(UTF_8),
+        "ue1".getBytes(UTF_8)};
+    final byte[] key = concat(keyParts);
+    final byte[] value = concat(valueParts);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      txn.put(testCf, keyParts, valueParts);
+      assertThat(txn.get(testCf, readOptions, key)).isEqualTo(value);
+
+      txn.deleteUntracked(testCf, keyParts);
+      assertThat(txn.get(testCf, readOptions, key)).isNull();
+    }
+  }
+
+  @Test
+  public void deleteUntracked_parts() throws RocksDBException {
+    final byte keyParts[][] = new byte[][] {
+        "ke".getBytes(UTF_8),
+        "y1".getBytes(UTF_8)};
+    final byte valueParts[][] = new byte[][] {
+        "val".getBytes(UTF_8),
+        "ue1".getBytes(UTF_8)};
+    final byte[] key = concat(keyParts);
+    final byte[] value = concat(valueParts);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.put(keyParts, valueParts);
+      assertThat(txn.get(readOptions, key)).isEqualTo(value);
+
+      txn.deleteUntracked(keyParts);
+      assertThat(txn.get(readOptions, key)).isNull();
+    }
+  }
+
+  @Test
+  public void putLogData() throws RocksDBException {
+    final byte[] blob = "blobby".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.putLogData(blob);
+    }
+  }
+
+  @Test
+  public void enabledDisableIndexing() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.disableIndexing();
+      txn.enableIndexing();
+      txn.disableIndexing();
+      txn.enableIndexing();
+    }
+  }
+
+  @Test
+  public void numKeys() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    final byte k2[] = "key2".getBytes(UTF_8);
+    final byte v2[] = "value2".getBytes(UTF_8);
+    final byte k3[] = "key3".getBytes(UTF_8);
+    final byte v3[] = "value3".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      txn.put(k1, v1);
+      txn.put(testCf, k2, v2);
+      txn.merge(k3, v3);
+      txn.delete(testCf, k2);
+
+      assertThat(txn.getNumKeys()).isEqualTo(3);
+      assertThat(txn.getNumPuts()).isEqualTo(2);
+      assertThat(txn.getNumMerges()).isEqualTo(1);
+      assertThat(txn.getNumDeletes()).isEqualTo(1);
+    }
+  }
+
+  @Test
+  public void elapsedTime() throws RocksDBException, InterruptedException {
+    final long preStartTxnTime = System.currentTimeMillis();
+    try (final DBContainer dbContainer = startDb();
+         final Transaction txn = dbContainer.beginTransaction()) {
+      Thread.sleep(2);
+
+      final long txnElapsedTime = txn.getElapsedTime();
+      assertThat(txnElapsedTime).isLessThan(System.currentTimeMillis() - preStartTxnTime);
+      assertThat(txnElapsedTime).isGreaterThan(0);
+    }
+  }
+
+  @Test
+  public void getWriteBatch() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+
+      txn.put(k1, v1);
+
+      final WriteBatchWithIndex writeBatch = txn.getWriteBatch();
+      assertThat(writeBatch).isNotNull();
+      assertThat(writeBatch.isOwningHandle()).isFalse();
+      assertThat(writeBatch.count()).isEqualTo(1);
+    }
+  }
+
+  @Test
+  public void setLockTimeout() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      txn.setLockTimeout(1000);
+    }
+  }
+
+  @Test
+  public void writeOptions() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final WriteOptions writeOptions = new WriteOptions()
+        .setDisableWAL(true)
+        .setSync(true);
+        final Transaction txn = dbContainer.beginTransaction(writeOptions)) {
+
+      txn.put(k1, v1);
+
+      WriteOptions txnWriteOptions = txn.getWriteOptions();
+      assertThat(txnWriteOptions).isNotNull();
+      assertThat(txnWriteOptions.isOwningHandle()).isFalse();
+      assertThat(txnWriteOptions).isNotSameAs(writeOptions);
+      assertThat(txnWriteOptions.disableWAL()).isTrue();
+      assertThat(txnWriteOptions.sync()).isTrue();
+
+      txn.setWriteOptions(txnWriteOptions.setSync(false));
+      txnWriteOptions = txn.getWriteOptions();
+      assertThat(txnWriteOptions).isNotNull();
+      assertThat(txnWriteOptions.isOwningHandle()).isFalse();
+      assertThat(txnWriteOptions).isNotSameAs(writeOptions);
+      assertThat(txnWriteOptions.disableWAL()).isTrue();
+      assertThat(txnWriteOptions.sync()).isFalse();
+    }
+  }
+
+  @Test
+  public void undoGetForUpdate_cf() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isNull();
+      txn.put(testCf, k1, v1);
+      assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1);
+      txn.undoGetForUpdate(testCf, k1);
+    }
+  }
+
+  @Test
+  public void undoGetForUpdate() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.getForUpdate(readOptions, k1, true)).isNull();
+      txn.put(k1, v1);
+      assertThat(txn.getForUpdate(readOptions, k1, true)).isEqualTo(v1);
+      txn.undoGetForUpdate(k1);
+    }
+  }
+
+  @Test
+  public void rebuildFromWriteBatch() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+    final byte k2[] = "key2".getBytes(UTF_8);
+    final byte v2[] = "value2".getBytes(UTF_8);
+    final byte k3[] = "key3".getBytes(UTF_8);
+    final byte v3[] = "value3".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions();
+        final Transaction txn = dbContainer.beginTransaction()) {
+
+      txn.put(k1, v1);
+
+      assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+      assertThat(txn.getNumKeys()).isEqualTo(1);
+
+      try(final WriteBatch writeBatch = new WriteBatch()) {
+        writeBatch.put(k2, v2);
+        writeBatch.put(k3, v3);
+        txn.rebuildFromWriteBatch(writeBatch);
+
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+        assertThat(txn.get(readOptions, k2)).isEqualTo(v2);
+        assertThat(txn.get(readOptions, k3)).isEqualTo(v3);
+        assertThat(txn.getNumKeys()).isEqualTo(3);
+      }
+    }
+  }
+
+  @Test
+  public void getCommitTimeWriteBatch() throws RocksDBException {
+    final byte k1[] = "key1".getBytes(UTF_8);
+    final byte v1[] = "value1".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+
+      txn.put(k1, v1);
+      final WriteBatch writeBatch = txn.getCommitTimeWriteBatch();
+
+      assertThat(writeBatch).isNotNull();
+      assertThat(writeBatch.isOwningHandle()).isFalse();
+      assertThat(writeBatch.count()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void logNumber() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.getLogNumber()).isEqualTo(0);
+      final long logNumber = rand.nextLong();
+      txn.setLogNumber(logNumber);
+      assertThat(txn.getLogNumber()).isEqualTo(logNumber);
+    }
+  }
+
+  private static byte[] concat(final byte[][] bufs) {
+    int resultLength = 0;
+    for(final byte[] buf : bufs) {
+      resultLength += buf.length;
+    }
+
+    final byte[] result = new byte[resultLength];
+    int resultOffset = 0;
+    for(final byte[] buf : bufs) {
+      final int srcLength = buf.length;
+      System.arraycopy(buf, 0, result, resultOffset, srcLength);
+      resultOffset += srcLength;
+    }
+
+    return result;
+  }
+
+  private static class TestTransactionNotifier
+      extends AbstractTransactionNotifier {
+    private final List<Snapshot> createdSnapshots = new ArrayList<>();
+
+    @Override
+    public void snapshotCreated(final Snapshot newSnapshot) {
+      createdSnapshots.add(newSnapshot);
+    }
+
+    public List<Snapshot> getCreatedSnapshots() {
+      return createdSnapshots;
+    }
+  }
+
+  protected static abstract class DBContainer
+      implements AutoCloseable {
+    protected final WriteOptions writeOptions;
+    protected final List<ColumnFamilyHandle> columnFamilyHandles;
+    protected final ColumnFamilyOptions columnFamilyOptions;
+    protected final DBOptions options;
+
+    public DBContainer(final WriteOptions writeOptions,
+        final List<ColumnFamilyHandle> columnFamilyHandles,
+        final ColumnFamilyOptions columnFamilyOptions,
+        final DBOptions options) {
+      this.writeOptions = writeOptions;
+      this.columnFamilyHandles = columnFamilyHandles;
+      this.columnFamilyOptions = columnFamilyOptions;
+      this.options = options;
+    }
+
+    public abstract Transaction beginTransaction();
+
+    public abstract Transaction beginTransaction(
+        final WriteOptions writeOptions);
+
+    public ColumnFamilyHandle getTestColumnFamily() {
+      return columnFamilyHandles.get(1);
+    }
+
+    @Override
+    public abstract void close();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BackupEngineOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BackupEngineOptionsTest.java
new file mode 100644
index 000000000..794bf04fb
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BackupEngineOptionsTest.java
@@ -0,0 +1,300 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.Random;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+public class BackupEngineOptionsTest {
+  private final static String ARBITRARY_PATH =
+      System.getProperty("java.io.tmpdir");
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public ExpectedException exception = ExpectedException.none();
+
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void backupDir() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      assertThat(backupEngineOptions.backupDir()).isEqualTo(ARBITRARY_PATH);
+    }
+  }
+
+  @Test
+  public void env() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      assertThat(backupEngineOptions.backupEnv()).isNull();
+
+      try(final Env env = new RocksMemEnv(Env.getDefault())) {
+        backupEngineOptions.setBackupEnv(env);
+        assertThat(backupEngineOptions.backupEnv()).isEqualTo(env);
+      }
+    }
+  }
+
+  @Test
+  public void shareTableFiles() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      final boolean value = rand.nextBoolean();
+      backupEngineOptions.setShareTableFiles(value);
+      assertThat(backupEngineOptions.shareTableFiles()).isEqualTo(value);
+    }
+  }
+
+  @Test
+  public void infoLog() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      assertThat(backupEngineOptions.infoLog()).isNull();
+
+      try(final Options options = new Options();
+          final Logger logger = new Logger(options){
+            @Override
+            protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+
+            }
+          }) {
+        backupEngineOptions.setInfoLog(logger);
+        assertThat(backupEngineOptions.infoLog()).isEqualTo(logger);
+      }
+    }
+  }
+
+  @Test
+  public void sync() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      final boolean value = rand.nextBoolean();
+      backupEngineOptions.setSync(value);
+      assertThat(backupEngineOptions.sync()).isEqualTo(value);
+    }
+  }
+
+  @Test
+  public void destroyOldData() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH);) {
+      final boolean value = rand.nextBoolean();
+      backupEngineOptions.setDestroyOldData(value);
+      assertThat(backupEngineOptions.destroyOldData()).isEqualTo(value);
+    }
+  }
+
+  @Test
+  public void backupLogFiles() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      final boolean value = rand.nextBoolean();
+      backupEngineOptions.setBackupLogFiles(value);
+      assertThat(backupEngineOptions.backupLogFiles()).isEqualTo(value);
+    }
+  }
+
+  @Test
+  public void backupRateLimit() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      final long value = Math.abs(rand.nextLong());
+      backupEngineOptions.setBackupRateLimit(value);
+      assertThat(backupEngineOptions.backupRateLimit()).isEqualTo(value);
+      // negative will be mapped to 0
+      backupEngineOptions.setBackupRateLimit(-1);
+      assertThat(backupEngineOptions.backupRateLimit()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void backupRateLimiter() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      assertThat(backupEngineOptions.backupEnv()).isNull();
+
+      try(final RateLimiter backupRateLimiter =
+              new RateLimiter(999)) {
+        backupEngineOptions.setBackupRateLimiter(backupRateLimiter);
+        assertThat(backupEngineOptions.backupRateLimiter()).isEqualTo(backupRateLimiter);
+      }
+    }
+  }
+
+  @Test
+  public void restoreRateLimit() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      final long value = Math.abs(rand.nextLong());
+      backupEngineOptions.setRestoreRateLimit(value);
+      assertThat(backupEngineOptions.restoreRateLimit()).isEqualTo(value);
+      // negative will be mapped to 0
+      backupEngineOptions.setRestoreRateLimit(-1);
+      assertThat(backupEngineOptions.restoreRateLimit()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void restoreRateLimiter() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      assertThat(backupEngineOptions.backupEnv()).isNull();
+
+      try(final RateLimiter restoreRateLimiter =
+              new RateLimiter(911)) {
+        backupEngineOptions.setRestoreRateLimiter(restoreRateLimiter);
+        assertThat(backupEngineOptions.restoreRateLimiter()).isEqualTo(restoreRateLimiter);
+      }
+    }
+  }
+
+  @Test
+  public void shareFilesWithChecksum() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      boolean value = rand.nextBoolean();
+      backupEngineOptions.setShareFilesWithChecksum(value);
+      assertThat(backupEngineOptions.shareFilesWithChecksum()).isEqualTo(value);
+    }
+  }
+
+  @Test
+  public void maxBackgroundOperations() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      final int value = rand.nextInt();
+      backupEngineOptions.setMaxBackgroundOperations(value);
+      assertThat(backupEngineOptions.maxBackgroundOperations()).isEqualTo(value);
+    }
+  }
+
+  @Test
+  public void callbackTriggerIntervalSize() {
+    try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) {
+      final long value = rand.nextLong();
+      backupEngineOptions.setCallbackTriggerIntervalSize(value);
+      assertThat(backupEngineOptions.callbackTriggerIntervalSize()).isEqualTo(value);
+    }
+  }
+
+  @Test
+  public void failBackupDirIsNull() {
+    exception.expect(IllegalArgumentException.class);
+    try (final BackupEngineOptions opts = new BackupEngineOptions(null)) {
+      //no-op
+    }
+  }
+
+  @Test
+  public void failBackupDirIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.backupDir();
+    }
+  }
+
+  @Test
+  public void failSetShareTableFilesIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.setShareTableFiles(true);
+    }
+  }
+
+  @Test
+  public void failShareTableFilesIfDisposed() {
+    try (BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.shareTableFiles();
+    }
+  }
+
+  @Test
+  public void failSetSyncIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.setSync(true);
+    }
+  }
+
+  @Test
+  public void failSyncIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.sync();
+    }
+  }
+
+  @Test
+  public void failSetDestroyOldDataIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.setDestroyOldData(true);
+    }
+  }
+
+  @Test
+  public void failDestroyOldDataIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.destroyOldData();
+    }
+  }
+
+  @Test
+  public void failSetBackupLogFilesIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.setBackupLogFiles(true);
+    }
+  }
+
+  @Test
+  public void failBackupLogFilesIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.backupLogFiles();
+    }
+  }
+
+  @Test
+  public void failSetBackupRateLimitIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.setBackupRateLimit(1);
+    }
+  }
+
+  @Test
+  public void failBackupRateLimitIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.backupRateLimit();
+    }
+  }
+
+  @Test
+  public void failSetRestoreRateLimitIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.setRestoreRateLimit(1);
+    }
+  }
+
+  @Test
+  public void failRestoreRateLimitIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.restoreRateLimit();
+    }
+  }
+
+  @Test
+  public void failSetShareFilesWithChecksumIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.setShareFilesWithChecksum(true);
+    }
+  }
+
+  @Test
+  public void failShareFilesWithChecksumIfDisposed() {
+    try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) {
+      options.shareFilesWithChecksum();
+    }
+  }
+
+  private BackupEngineOptions setupUninitializedBackupEngineOptions(ExpectedException exception) {
+    final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH);
+    backupEngineOptions.close();
+    exception.expect(AssertionError.class);
+    return backupEngineOptions;
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java
new file mode 100644
index 000000000..67145f846
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java
@@ -0,0 +1,261 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.List;
+import java.util.concurrent.ThreadLocalRandom;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class BackupEngineTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Rule
+  public TemporaryFolder backupFolder = new TemporaryFolder();
+
+  @Test
+  public void backupDb() throws RocksDBException {
+    // Open empty database.
+    try(final Options opt = new Options().setCreateIfMissing(true);
+        final RocksDB db = RocksDB.open(opt,
+            dbFolder.getRoot().getAbsolutePath())) {
+
+      // Fill database with some test values
+      prepareDatabase(db);
+
+      // Create two backups
+      try (final BackupEngineOptions bopt =
+               new BackupEngineOptions(backupFolder.getRoot().getAbsolutePath());
+           final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+        be.createNewBackup(db, false);
+        be.createNewBackup(db, true);
+        verifyNumberOfValidBackups(be, 2);
+      }
+    }
+  }
+
+  @Test
+  public void deleteBackup() throws RocksDBException {
+    // Open empty database.
+    try(final Options opt = new Options().setCreateIfMissing(true);
+        final RocksDB db = RocksDB.open(opt,
+            dbFolder.getRoot().getAbsolutePath())) {
+      // Fill database with some test values
+      prepareDatabase(db);
+      // Create two backups
+      try (final BackupEngineOptions bopt =
+               new BackupEngineOptions(backupFolder.getRoot().getAbsolutePath());
+           final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+        be.createNewBackup(db, false);
+        be.createNewBackup(db, true);
+        final List<BackupInfo> backupInfo =
+            verifyNumberOfValidBackups(be, 2);
+        // Delete the first backup
+        be.deleteBackup(backupInfo.get(0).backupId());
+        final List<BackupInfo> newBackupInfo =
+            verifyNumberOfValidBackups(be, 1);
+
+        // The second backup must remain.
+        assertThat(newBackupInfo.get(0).backupId()).
+            isEqualTo(backupInfo.get(1).backupId());
+      }
+    }
+  }
+
+  @Test
+  public void purgeOldBackups() throws RocksDBException {
+    // Open empty database.
+    try(final Options opt = new Options().setCreateIfMissing(true);
+        final RocksDB db = RocksDB.open(opt,
+            dbFolder.getRoot().getAbsolutePath())) {
+      // Fill database with some test values
+      prepareDatabase(db);
+      // Create four backups
+      try (final BackupEngineOptions bopt =
+               new BackupEngineOptions(backupFolder.getRoot().getAbsolutePath());
+           final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+        be.createNewBackup(db, false);
+        be.createNewBackup(db, true);
+        be.createNewBackup(db, true);
+        be.createNewBackup(db, true);
+        final List<BackupInfo> backupInfo =
+            verifyNumberOfValidBackups(be, 4);
+        // Delete everything except the latest backup
+        be.purgeOldBackups(1);
+        final List<BackupInfo> newBackupInfo =
+            verifyNumberOfValidBackups(be, 1);
+        // The latest backup must remain.
+        assertThat(newBackupInfo.get(0).backupId()).
+            isEqualTo(backupInfo.get(3).backupId());
+      }
+    }
+  }
+
+  @Test
+  public void restoreLatestBackup() throws RocksDBException {
+    try(final Options opt = new Options().setCreateIfMissing(true)) {
+      // Open empty database.
+      RocksDB db = null;
+      try {
+        db = RocksDB.open(opt,
+            dbFolder.getRoot().getAbsolutePath());
+        // Fill database with some test values
+        prepareDatabase(db);
+
+        try (final BackupEngineOptions bopt =
+                 new BackupEngineOptions(backupFolder.getRoot().getAbsolutePath());
+             final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+          be.createNewBackup(db, true);
+          verifyNumberOfValidBackups(be, 1);
+          db.put("key1".getBytes(), "valueV2".getBytes());
+          db.put("key2".getBytes(), "valueV2".getBytes());
+          be.createNewBackup(db, true);
+          verifyNumberOfValidBackups(be, 2);
+          db.put("key1".getBytes(), "valueV3".getBytes());
+          db.put("key2".getBytes(), "valueV3".getBytes());
+          assertThat(new String(db.get("key1".getBytes()))).endsWith("V3");
+          assertThat(new String(db.get("key2".getBytes()))).endsWith("V3");
+
+          db.close();
+          db = null;
+
+          verifyNumberOfValidBackups(be, 2);
+          // restore db from latest backup
+          try(final RestoreOptions ropts = new RestoreOptions(false)) {
+            be.restoreDbFromLatestBackup(dbFolder.getRoot().getAbsolutePath(),
+                dbFolder.getRoot().getAbsolutePath(), ropts);
+          }
+
+          // Open database again.
+          db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath());
+
+          // Values must have suffix V2 because of restoring latest backup.
+          assertThat(new String(db.get("key1".getBytes()))).endsWith("V2");
+          assertThat(new String(db.get("key2".getBytes()))).endsWith("V2");
+        }
+      } finally {
+        if(db != null) {
+          db.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void restoreFromBackup()
+      throws RocksDBException {
+    try(final Options opt = new Options().setCreateIfMissing(true)) {
+      RocksDB db = null;
+      try {
+        // Open empty database.
+        db = RocksDB.open(opt,
+            dbFolder.getRoot().getAbsolutePath());
+        // Fill database with some test values
+        prepareDatabase(db);
+        try (final BackupEngineOptions bopt =
+                 new BackupEngineOptions(backupFolder.getRoot().getAbsolutePath());
+             final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+          be.createNewBackup(db, true);
+          verifyNumberOfValidBackups(be, 1);
+          db.put("key1".getBytes(), "valueV2".getBytes());
+          db.put("key2".getBytes(), "valueV2".getBytes());
+          be.createNewBackup(db, true);
+          verifyNumberOfValidBackups(be, 2);
+          db.put("key1".getBytes(), "valueV3".getBytes());
+          db.put("key2".getBytes(), "valueV3".getBytes());
+          assertThat(new String(db.get("key1".getBytes()))).endsWith("V3");
+          assertThat(new String(db.get("key2".getBytes()))).endsWith("V3");
+
+          //close the database
+          db.close();
+          db = null;
+
+          //restore the backup
+          final List<BackupInfo> backupInfo = verifyNumberOfValidBackups(be, 2);
+          // restore db from first backup
+          be.restoreDbFromBackup(backupInfo.get(0).backupId(),
+              dbFolder.getRoot().getAbsolutePath(),
+              dbFolder.getRoot().getAbsolutePath(),
+              new RestoreOptions(false));
+          // Open database again.
+          db = RocksDB.open(opt,
+              dbFolder.getRoot().getAbsolutePath());
+          // Values must have suffix V2 because of restoring latest backup.
+          assertThat(new String(db.get("key1".getBytes()))).endsWith("V1");
+          assertThat(new String(db.get("key2".getBytes()))).endsWith("V1");
+        }
+      } finally {
+        if(db != null) {
+          db.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void backupDbWithMetadata() throws RocksDBException {
+    // Open empty database.
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      // Fill database with some test values
+      prepareDatabase(db);
+
+      // Create two backups
+      try (final BackupEngineOptions bopt =
+               new BackupEngineOptions(backupFolder.getRoot().getAbsolutePath());
+           final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+        final String metadata = String.valueOf(ThreadLocalRandom.current().nextInt());
+        be.createNewBackupWithMetadata(db, metadata, true);
+        final List<BackupInfo> backupInfoList = verifyNumberOfValidBackups(be, 1);
+        assertThat(backupInfoList.get(0).appMetadata()).isEqualTo(metadata);
+      }
+    }
+  }
+
+  /**
+   * Verify backups.
+   *
+   * @param be {@link BackupEngine} instance.
+   * @param expectedNumberOfBackups numerical value
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  private List<BackupInfo> verifyNumberOfValidBackups(final BackupEngine be,
+      final int expectedNumberOfBackups) throws RocksDBException {
+    // Verify that backups exist
+    assertThat(be.getCorruptedBackups().length).
+        isEqualTo(0);
+    be.garbageCollect();
+    final List<BackupInfo> backupInfo = be.getBackupInfo();
+    assertThat(backupInfo.size()).
+        isEqualTo(expectedNumberOfBackups);
+    return backupInfo;
+  }
+
+  /**
+   * Fill database with some test values.
+   *
+   * @param db {@link RocksDB} instance.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  private void prepareDatabase(final RocksDB db)
+      throws RocksDBException {
+    db.put("key1".getBytes(), "valueV1".getBytes());
+    db.put("key2".getBytes(), "valueV1".getBytes());
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java
new file mode 100644
index 000000000..fe3d9b246
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java
@@ -0,0 +1,351 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.util.*;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class BlobOptionsTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  final int minBlobSize = 65536;
+  final int largeBlobSize = 65536 * 2;
+
+  /**
+   * Count the files in the temporary folder which end with a particular suffix
+   * Used to query the state of a test database to check if it is as the test expects
+   *
+   * @param endsWith the suffix to match
+   * @return the number of files with a matching suffix
+   */
+  @SuppressWarnings("CallToStringConcatCanBeReplacedByOperator")
+  private int countDBFiles(final String endsWith) {
+    return Objects
+        .requireNonNull(dbFolder.getRoot().list(new FilenameFilter() {
+          @Override
+          public boolean accept(File dir, String name) {
+            return name.endsWith(endsWith);
+          }
+        }))
+        .length;
+  }
+
+  @SuppressWarnings("SameParameterValue")
+  private byte[] small_key(String suffix) {
+    return ("small_key_" + suffix).getBytes(UTF_8);
+  }
+
+  @SuppressWarnings("SameParameterValue")
+  private byte[] small_value(String suffix) {
+    return ("small_value_" + suffix).getBytes(UTF_8);
+  }
+
+  private byte[] large_key(String suffix) {
+    return ("large_key_" + suffix).getBytes(UTF_8);
+  }
+
+  private byte[] large_value(String repeat) {
+    final byte[] large_value = ("" + repeat + "_" + largeBlobSize + "b").getBytes(UTF_8);
+    final byte[] large_buffer = new byte[largeBlobSize];
+    for (int pos = 0; pos < largeBlobSize; pos += large_value.length) {
+      int numBytes = Math.min(large_value.length, large_buffer.length - pos);
+      System.arraycopy(large_value, 0, large_buffer, pos, numBytes);
+    }
+    return large_buffer;
+  }
+
+  @Test
+  public void blobOptions() {
+    try (final Options options = new Options()) {
+      assertThat(options.enableBlobFiles()).isEqualTo(false);
+      assertThat(options.minBlobSize()).isEqualTo(0);
+      assertThat(options.blobCompressionType()).isEqualTo(CompressionType.NO_COMPRESSION);
+      assertThat(options.enableBlobGarbageCollection()).isEqualTo(false);
+      assertThat(options.blobFileSize()).isEqualTo(268435456L);
+      assertThat(options.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25);
+      assertThat(options.blobGarbageCollectionForceThreshold()).isEqualTo(1.0);
+      assertThat(options.blobCompactionReadaheadSize()).isEqualTo(0);
+      assertThat(options.prepopulateBlobCache())
+          .isEqualTo(PrepopulateBlobCache.PREPOPULATE_BLOB_DISABLE);
+
+      assertThat(options.setEnableBlobFiles(true)).isEqualTo(options);
+      assertThat(options.setMinBlobSize(132768L)).isEqualTo(options);
+      assertThat(options.setBlobCompressionType(CompressionType.BZLIB2_COMPRESSION))
+          .isEqualTo(options);
+      assertThat(options.setEnableBlobGarbageCollection(true)).isEqualTo(options);
+      assertThat(options.setBlobFileSize(132768L)).isEqualTo(options);
+      assertThat(options.setBlobGarbageCollectionAgeCutoff(0.89)).isEqualTo(options);
+      assertThat(options.setBlobGarbageCollectionForceThreshold(0.80)).isEqualTo(options);
+      assertThat(options.setBlobCompactionReadaheadSize(262144L)).isEqualTo(options);
+      assertThat(options.setBlobFileStartingLevel(0)).isEqualTo(options);
+      assertThat(options.setPrepopulateBlobCache(PrepopulateBlobCache.PREPOPULATE_BLOB_FLUSH_ONLY))
+          .isEqualTo(options);
+
+      assertThat(options.enableBlobFiles()).isEqualTo(true);
+      assertThat(options.minBlobSize()).isEqualTo(132768L);
+      assertThat(options.blobCompressionType()).isEqualTo(CompressionType.BZLIB2_COMPRESSION);
+      assertThat(options.enableBlobGarbageCollection()).isEqualTo(true);
+      assertThat(options.blobFileSize()).isEqualTo(132768L);
+      assertThat(options.blobGarbageCollectionAgeCutoff()).isEqualTo(0.89);
+      assertThat(options.blobGarbageCollectionForceThreshold()).isEqualTo(0.80);
+      assertThat(options.blobCompactionReadaheadSize()).isEqualTo(262144L);
+      assertThat(options.blobFileStartingLevel()).isEqualTo(0);
+      assertThat(options.prepopulateBlobCache())
+          .isEqualTo(PrepopulateBlobCache.PREPOPULATE_BLOB_FLUSH_ONLY);
+    }
+  }
+
+  @Test
+  public void blobColumnFamilyOptions() {
+    try (final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions()) {
+      assertThat(columnFamilyOptions.enableBlobFiles()).isEqualTo(false);
+      assertThat(columnFamilyOptions.minBlobSize()).isEqualTo(0);
+      assertThat(columnFamilyOptions.blobCompressionType())
+          .isEqualTo(CompressionType.NO_COMPRESSION);
+      assertThat(columnFamilyOptions.enableBlobGarbageCollection()).isEqualTo(false);
+      assertThat(columnFamilyOptions.blobFileSize()).isEqualTo(268435456L);
+      assertThat(columnFamilyOptions.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25);
+      assertThat(columnFamilyOptions.blobGarbageCollectionForceThreshold()).isEqualTo(1.0);
+      assertThat(columnFamilyOptions.blobCompactionReadaheadSize()).isEqualTo(0);
+
+      assertThat(columnFamilyOptions.setEnableBlobFiles(true)).isEqualTo(columnFamilyOptions);
+      assertThat(columnFamilyOptions.setMinBlobSize(132768L)).isEqualTo(columnFamilyOptions);
+      assertThat(columnFamilyOptions.setBlobCompressionType(CompressionType.BZLIB2_COMPRESSION))
+          .isEqualTo(columnFamilyOptions);
+      assertThat(columnFamilyOptions.setEnableBlobGarbageCollection(true))
+          .isEqualTo(columnFamilyOptions);
+      assertThat(columnFamilyOptions.setBlobFileSize(132768L)).isEqualTo(columnFamilyOptions);
+      assertThat(columnFamilyOptions.setBlobGarbageCollectionAgeCutoff(0.89))
+          .isEqualTo(columnFamilyOptions);
+      assertThat(columnFamilyOptions.setBlobGarbageCollectionForceThreshold(0.80))
+          .isEqualTo(columnFamilyOptions);
+      assertThat(columnFamilyOptions.setBlobCompactionReadaheadSize(262144L))
+          .isEqualTo(columnFamilyOptions);
+      assertThat(columnFamilyOptions.setBlobFileStartingLevel(0)).isEqualTo(columnFamilyOptions);
+      assertThat(columnFamilyOptions.setPrepopulateBlobCache(
+                     PrepopulateBlobCache.PREPOPULATE_BLOB_DISABLE))
+          .isEqualTo(columnFamilyOptions);
+
+      assertThat(columnFamilyOptions.enableBlobFiles()).isEqualTo(true);
+      assertThat(columnFamilyOptions.minBlobSize()).isEqualTo(132768L);
+      assertThat(columnFamilyOptions.blobCompressionType())
+          .isEqualTo(CompressionType.BZLIB2_COMPRESSION);
+      assertThat(columnFamilyOptions.enableBlobGarbageCollection()).isEqualTo(true);
+      assertThat(columnFamilyOptions.blobFileSize()).isEqualTo(132768L);
+      assertThat(columnFamilyOptions.blobGarbageCollectionAgeCutoff()).isEqualTo(0.89);
+      assertThat(columnFamilyOptions.blobGarbageCollectionForceThreshold()).isEqualTo(0.80);
+      assertThat(columnFamilyOptions.blobCompactionReadaheadSize()).isEqualTo(262144L);
+      assertThat(columnFamilyOptions.blobFileStartingLevel()).isEqualTo(0);
+      assertThat(columnFamilyOptions.prepopulateBlobCache())
+          .isEqualTo(PrepopulateBlobCache.PREPOPULATE_BLOB_DISABLE);
+    }
+  }
+
+  @Test
+  public void blobMutableColumnFamilyOptionsBuilder() {
+    final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder =
+        MutableColumnFamilyOptions.builder();
+    builder.setEnableBlobFiles(true)
+        .setMinBlobSize(1024)
+        .setBlobFileSize(132768)
+        .setBlobCompressionType(CompressionType.BZLIB2_COMPRESSION)
+        .setEnableBlobGarbageCollection(true)
+        .setBlobGarbageCollectionAgeCutoff(0.89)
+        .setBlobGarbageCollectionForceThreshold(0.80)
+        .setBlobCompactionReadaheadSize(262144)
+        .setBlobFileStartingLevel(1)
+        .setPrepopulateBlobCache(PrepopulateBlobCache.PREPOPULATE_BLOB_FLUSH_ONLY);
+
+    assertThat(builder.enableBlobFiles()).isEqualTo(true);
+    assertThat(builder.minBlobSize()).isEqualTo(1024);
+    assertThat(builder.blobFileSize()).isEqualTo(132768);
+    assertThat(builder.blobCompressionType()).isEqualTo(CompressionType.BZLIB2_COMPRESSION);
+    assertThat(builder.enableBlobGarbageCollection()).isEqualTo(true);
+    assertThat(builder.blobGarbageCollectionAgeCutoff()).isEqualTo(0.89);
+    assertThat(builder.blobGarbageCollectionForceThreshold()).isEqualTo(0.80);
+    assertThat(builder.blobCompactionReadaheadSize()).isEqualTo(262144);
+    assertThat(builder.blobFileStartingLevel()).isEqualTo(1);
+    assertThat(builder.prepopulateBlobCache())
+        .isEqualTo(PrepopulateBlobCache.PREPOPULATE_BLOB_FLUSH_ONLY);
+
+    builder.setEnableBlobFiles(false)
+        .setMinBlobSize(4096)
+        .setBlobFileSize(2048)
+        .setBlobCompressionType(CompressionType.LZ4_COMPRESSION)
+        .setEnableBlobGarbageCollection(false)
+        .setBlobGarbageCollectionAgeCutoff(0.91)
+        .setBlobGarbageCollectionForceThreshold(0.96)
+        .setBlobCompactionReadaheadSize(1024)
+        .setBlobFileStartingLevel(0)
+        .setPrepopulateBlobCache(PrepopulateBlobCache.PREPOPULATE_BLOB_DISABLE);
+
+    assertThat(builder.enableBlobFiles()).isEqualTo(false);
+    assertThat(builder.minBlobSize()).isEqualTo(4096);
+    assertThat(builder.blobFileSize()).isEqualTo(2048);
+    assertThat(builder.blobCompressionType()).isEqualTo(CompressionType.LZ4_COMPRESSION);
+    assertThat(builder.enableBlobGarbageCollection()).isEqualTo(false);
+    assertThat(builder.blobGarbageCollectionAgeCutoff()).isEqualTo(0.91);
+    assertThat(builder.blobGarbageCollectionForceThreshold()).isEqualTo(0.96);
+    assertThat(builder.blobCompactionReadaheadSize()).isEqualTo(1024);
+    assertThat(builder.blobFileStartingLevel()).isEqualTo(0);
+    assertThat(builder.prepopulateBlobCache())
+        .isEqualTo(PrepopulateBlobCache.PREPOPULATE_BLOB_DISABLE);
+
+    final MutableColumnFamilyOptions options = builder.build();
+    assertThat(options.getKeys())
+        .isEqualTo(new String[] {"enable_blob_files", "min_blob_size", "blob_file_size",
+            "blob_compression_type", "enable_blob_garbage_collection",
+            "blob_garbage_collection_age_cutoff", "blob_garbage_collection_force_threshold",
+            "blob_compaction_readahead_size", "blob_file_starting_level",
+            "prepopulate_blob_cache"});
+    assertThat(options.getValues())
+        .isEqualTo(new String[] {"false", "4096", "2048", "LZ4_COMPRESSION", "false", "0.91",
+            "0.96", "1024", "0", "PREPOPULATE_BLOB_DISABLE"});
+  }
+
+  /**
+   * Configure the default column family with BLOBs.
+   * Confirm that BLOBs are generated when appropriately-sized writes are flushed.
+   *
+   * @throws RocksDBException if a db access throws an exception
+   */
+  @Test
+  public void testBlobWriteAboveThreshold() throws RocksDBException {
+    try (final Options options = new Options()
+                                     .setCreateIfMissing(true)
+                                     .setMinBlobSize(minBlobSize)
+                                     .setEnableBlobFiles(true);
+
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      db.put(small_key("default"), small_value("default"));
+      db.flush(new FlushOptions().setWaitForFlush(true));
+
+      // check there are no blobs in the database
+      assertThat(countDBFiles(".sst")).isEqualTo(1);
+      assertThat(countDBFiles(".blob")).isEqualTo(0);
+
+      db.put(large_key("default"), large_value("default"));
+      db.flush(new FlushOptions().setWaitForFlush(true));
+
+      // wrote and flushed a value larger than the blobbing threshold
+      // check there is a single blob in the database
+      assertThat(countDBFiles(".sst")).isEqualTo(2);
+      assertThat(countDBFiles(".blob")).isEqualTo(1);
+
+      assertThat(db.get(small_key("default"))).isEqualTo(small_value("default"));
+      assertThat(db.get(large_key("default"))).isEqualTo(large_value("default"));
+
+      final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder fetchOptions =
+          db.getOptions(null);
+      assertThat(fetchOptions.minBlobSize()).isEqualTo(minBlobSize);
+      assertThat(fetchOptions.enableBlobFiles()).isEqualTo(true);
+      assertThat(fetchOptions.writeBufferSize()).isEqualTo(64 << 20);
+    }
+  }
+
+  /**
+   * Configure 2 column families respectively with and without BLOBs.
+   * Confirm that BLOB files are generated (once the DB is flushed) only for the appropriate column
+   * family.
+   *
+   * @throws RocksDBException if a db access throws an exception
+   */
+  @Test
+  public void testBlobWriteAboveThresholdCF() throws RocksDBException {
+    final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions();
+    final ColumnFamilyDescriptor columnFamilyDescriptor0 =
+        new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0);
+    List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+        Collections.singletonList(columnFamilyDescriptor0);
+    List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+    try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      db.put(columnFamilyHandles.get(0), small_key("default"), small_value("default"));
+      db.flush(new FlushOptions().setWaitForFlush(true));
+
+      assertThat(countDBFiles(".blob")).isEqualTo(0);
+
+      try (final ColumnFamilyOptions columnFamilyOptions1 =
+               new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(true);
+
+           final ColumnFamilyOptions columnFamilyOptions2 =
+               new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(false)) {
+        final ColumnFamilyDescriptor columnFamilyDescriptor1 =
+            new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1);
+        final ColumnFamilyDescriptor columnFamilyDescriptor2 =
+            new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2);
+
+        // Create the first column family with blob options
+        db.createColumnFamily(columnFamilyDescriptor1);
+
+        // Create the second column family with not-blob options
+        db.createColumnFamily(columnFamilyDescriptor2);
+      }
+    }
+
+    // Now re-open after auto-close - at this point the CF options we use are recognized.
+    try (final ColumnFamilyOptions columnFamilyOptions1 =
+             new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(true);
+
+         final ColumnFamilyOptions columnFamilyOptions2 =
+             new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(false)) {
+      assertThat(columnFamilyOptions1.enableBlobFiles()).isEqualTo(true);
+      assertThat(columnFamilyOptions1.minBlobSize()).isEqualTo(minBlobSize);
+      assertThat(columnFamilyOptions2.enableBlobFiles()).isEqualTo(false);
+      assertThat(columnFamilyOptions1.minBlobSize()).isEqualTo(minBlobSize);
+
+      final ColumnFamilyDescriptor columnFamilyDescriptor1 =
+          new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1);
+      final ColumnFamilyDescriptor columnFamilyDescriptor2 =
+          new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2);
+      columnFamilyDescriptors = new ArrayList<>();
+      columnFamilyDescriptors.add(columnFamilyDescriptor0);
+      columnFamilyDescriptors.add(columnFamilyDescriptor1);
+      columnFamilyDescriptors.add(columnFamilyDescriptor2);
+      columnFamilyHandles = new ArrayList<>();
+
+      assertThat(columnFamilyDescriptor1.getOptions().enableBlobFiles()).isEqualTo(true);
+      assertThat(columnFamilyDescriptor2.getOptions().enableBlobFiles()).isEqualTo(false);
+
+      try (final DBOptions dbOptions = new DBOptions();
+           final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(),
+               columnFamilyDescriptors, columnFamilyHandles)) {
+        final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 =
+            db.getOptions(columnFamilyHandles.get(1));
+        assertThat(builder1.enableBlobFiles()).isEqualTo(true);
+        assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize);
+
+        final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder2 =
+            db.getOptions(columnFamilyHandles.get(2));
+        assertThat(builder2.enableBlobFiles()).isEqualTo(false);
+        assertThat(builder2.minBlobSize()).isEqualTo(minBlobSize);
+
+        db.put(columnFamilyHandles.get(1), large_key("column_family_1_k2"),
+            large_value("column_family_1_k2"));
+        db.flush(new FlushOptions().setWaitForFlush(true), columnFamilyHandles.get(1));
+        assertThat(countDBFiles(".blob")).isEqualTo(1);
+
+        db.put(columnFamilyHandles.get(2), large_key("column_family_2_k2"),
+            large_value("column_family_2_k2"));
+        db.flush(new FlushOptions().setWaitForFlush(true), columnFamilyHandles.get(2));
+        assertThat(countDBFiles(".blob")).isEqualTo(1);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
new file mode 100644
index 000000000..330881764
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
@@ -0,0 +1,490 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.fail;
+
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.stream.Stream;
+import org.junit.ClassRule;
+import org.junit.Ignore;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class BlockBasedTableConfigTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void cacheIndexAndFilterBlocks() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setCacheIndexAndFilterBlocks(true);
+    assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocks()).
+        isTrue();
+  }
+
+  @Test
+  public void cacheIndexAndFilterBlocksWithHighPriority() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()).
+        isTrue();
+    blockBasedTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(false);
+    assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()).isFalse();
+  }
+
+  @Test
+  public void pinL0FilterAndIndexBlocksInCache() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setPinL0FilterAndIndexBlocksInCache(true);
+    assertThat(blockBasedTableConfig.pinL0FilterAndIndexBlocksInCache()).
+        isTrue();
+  }
+
+  @Test
+  public void pinTopLevelIndexAndFilter() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setPinTopLevelIndexAndFilter(false);
+    assertThat(blockBasedTableConfig.pinTopLevelIndexAndFilter()).
+        isFalse();
+  }
+
+  @Test
+  public void indexType() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    assertThat(IndexType.values().length).isEqualTo(4);
+    blockBasedTableConfig.setIndexType(IndexType.kHashSearch);
+    assertThat(blockBasedTableConfig.indexType()).isEqualTo(IndexType.kHashSearch);
+    assertThat(IndexType.valueOf("kBinarySearch")).isNotNull();
+    blockBasedTableConfig.setIndexType(IndexType.valueOf("kBinarySearch"));
+    assertThat(blockBasedTableConfig.indexType()).isEqualTo(IndexType.kBinarySearch);
+  }
+
+  @Test
+  public void dataBlockIndexType() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinaryAndHash);
+    assertThat(blockBasedTableConfig.dataBlockIndexType())
+        .isEqualTo(DataBlockIndexType.kDataBlockBinaryAndHash);
+    blockBasedTableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinarySearch);
+    assertThat(blockBasedTableConfig.dataBlockIndexType())
+        .isEqualTo(DataBlockIndexType.kDataBlockBinarySearch);
+  }
+
+  @Test
+  public void checksumType() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    assertThat(ChecksumType.values().length).isEqualTo(5);
+    assertThat(ChecksumType.valueOf("kxxHash")).
+        isEqualTo(ChecksumType.kxxHash);
+    blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum);
+    assertThat(blockBasedTableConfig.checksumType()).isEqualTo(ChecksumType.kNoChecksum);
+    blockBasedTableConfig.setChecksumType(ChecksumType.kxxHash);
+    assertThat(blockBasedTableConfig.checksumType()).isEqualTo(ChecksumType.kxxHash);
+    blockBasedTableConfig.setChecksumType(ChecksumType.kxxHash64);
+    assertThat(blockBasedTableConfig.checksumType()).isEqualTo(ChecksumType.kxxHash64);
+    blockBasedTableConfig.setChecksumType(ChecksumType.kXXH3);
+    assertThat(blockBasedTableConfig.checksumType()).isEqualTo(ChecksumType.kXXH3);
+  }
+
+  @Test
+  public void jniPortal() throws Exception {
+    // Verifies that the JNI layer is correctly translating options.
+    // Since introspecting the options requires creating a database, the checks
+    // cover multiple options at the same time.
+
+    final BlockBasedTableConfig tableConfig = new BlockBasedTableConfig();
+
+    tableConfig.setIndexType(IndexType.kBinarySearch);
+    tableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinarySearch);
+    tableConfig.setChecksumType(ChecksumType.kNoChecksum);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("index_type=kBinarySearch");
+      assertThat(opts).contains("data_block_index_type=kDataBlockBinarySearch");
+      assertThat(opts).contains("checksum=kNoChecksum");
+    }
+
+    tableConfig.setIndexType(IndexType.kHashSearch);
+    tableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinaryAndHash);
+    tableConfig.setChecksumType(ChecksumType.kCRC32c);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      options.useCappedPrefixExtractor(1); // Needed to use kHashSearch
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("index_type=kHashSearch");
+      assertThat(opts).contains("data_block_index_type=kDataBlockBinaryAndHash");
+      assertThat(opts).contains("checksum=kCRC32c");
+    }
+
+    tableConfig.setIndexType(IndexType.kTwoLevelIndexSearch);
+    tableConfig.setChecksumType(ChecksumType.kxxHash);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("index_type=kTwoLevelIndexSearch");
+      assertThat(opts).contains("checksum=kxxHash");
+    }
+
+    tableConfig.setIndexType(IndexType.kBinarySearchWithFirstKey);
+    tableConfig.setChecksumType(ChecksumType.kxxHash64);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("index_type=kBinarySearchWithFirstKey");
+      assertThat(opts).contains("checksum=kxxHash64");
+    }
+
+    tableConfig.setChecksumType(ChecksumType.kXXH3);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("checksum=kXXH3");
+    }
+  }
+
+  private String getOptionAsString(Options options) throws Exception {
+    options.setCreateIfMissing(true);
+    String dbPath = dbFolder.getRoot().getAbsolutePath();
+    String result;
+    try (final RocksDB db = RocksDB.open(options, dbPath);
+         final Stream<Path> pathStream = Files.walk(Paths.get(dbPath))) {
+      Path optionsPath =
+          pathStream
+              .filter(p -> p.getFileName().toString().startsWith("OPTIONS"))
+              .findAny()
+              .orElseThrow(() -> new AssertionError("Missing options file"));
+      byte[] optionsData = Files.readAllBytes(optionsPath);
+      result = new String(optionsData, StandardCharsets.UTF_8);
+    }
+    RocksDB.destroyDB(dbPath, options);
+    return result;
+  }
+
+  @Test
+  public void noBlockCache() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setNoBlockCache(true);
+    assertThat(blockBasedTableConfig.noBlockCache()).isTrue();
+  }
+
+  @Test
+  public void blockCache() {
+    try (
+        final Cache cache = new LRUCache(17 * 1024 * 1024);
+        final Options options = new Options().setTableFormatConfig(
+            new BlockBasedTableConfig().setBlockCache(cache))) {
+      assertThat(options.tableFactoryName()).isEqualTo("BlockBasedTable");
+    }
+  }
+
+  @Test
+  public void blockCacheIntegration() throws RocksDBException {
+    try (final Cache cache = new LRUCache(8 * 1024 * 1024);
+         final Statistics statistics = new Statistics()) {
+      for (int shard = 0; shard < 8; shard++) {
+        try (final Options options =
+                 new Options()
+                     .setCreateIfMissing(true)
+                     .setStatistics(statistics)
+                     .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache));
+             final RocksDB db =
+                 RocksDB.open(options, dbFolder.getRoot().getAbsolutePath() + "/" + shard)) {
+          final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8);
+          final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8);
+
+          db.put(key, value);
+          db.flush(new FlushOptions());
+          db.get(key);
+
+          assertThat(statistics.getTickerCount(TickerType.BLOCK_CACHE_ADD)).isEqualTo(shard + 1);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void persistentCache() throws RocksDBException {
+    try (final DBOptions dbOptions = new DBOptions().
+        setInfoLogLevel(InfoLogLevel.INFO_LEVEL).
+        setCreateIfMissing(true);
+        final Logger logger = new Logger(dbOptions) {
+      @Override
+      protected void log(final InfoLogLevel infoLogLevel, final String logMsg) {
+        System.out.println(infoLogLevel.name() + ": " + logMsg);
+      }
+    }) {
+      try (final PersistentCache persistentCache =
+               new PersistentCache(Env.getDefault(), dbFolder.getRoot().getPath(), 1024 * 1024 * 100, logger, false);
+           final Options options = new Options().setTableFormatConfig(
+               new BlockBasedTableConfig().setPersistentCache(persistentCache))) {
+        assertThat(options.tableFactoryName()).isEqualTo("BlockBasedTable");
+      }
+    }
+  }
+
+  @Test
+  public void blockCacheCompressed() {
+    try (final Cache cache = new LRUCache(17 * 1024 * 1024);
+         final Options options = new Options().setTableFormatConfig(
+        new BlockBasedTableConfig().setBlockCacheCompressed(cache))) {
+      assertThat(options.tableFactoryName()).isEqualTo("BlockBasedTable");
+    }
+  }
+
+  @Ignore("See issue: https://github.com/facebook/rocksdb/issues/4822")
+  @Test
+  public void blockCacheCompressedIntegration() throws RocksDBException {
+    final byte[] key1 = "some-key1".getBytes(StandardCharsets.UTF_8);
+    final byte[] key2 = "some-key1".getBytes(StandardCharsets.UTF_8);
+    final byte[] key3 = "some-key1".getBytes(StandardCharsets.UTF_8);
+    final byte[] key4 = "some-key1".getBytes(StandardCharsets.UTF_8);
+    final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8);
+
+    try (final Cache compressedCache = new LRUCache(8 * 1024 * 1024);
+         final Statistics statistics = new Statistics()) {
+
+      final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig()
+          .setNoBlockCache(true)
+          .setBlockCache(null)
+          .setBlockCacheCompressed(compressedCache)
+          .setFormatVersion(4);
+
+      try (final Options options = new Options()
+             .setCreateIfMissing(true)
+             .setStatistics(statistics)
+             .setTableFormatConfig(blockBasedTableConfig)) {
+
+        for (int shard = 0; shard < 8; shard++) {
+          try (final FlushOptions flushOptions = new FlushOptions();
+               final WriteOptions writeOptions = new WriteOptions();
+               final ReadOptions readOptions = new ReadOptions();
+               final RocksDB db =
+                   RocksDB.open(options, dbFolder.getRoot().getAbsolutePath() + "/" + shard)) {
+
+            db.put(writeOptions, key1, value);
+            db.put(writeOptions, key2, value);
+            db.put(writeOptions, key3, value);
+            db.put(writeOptions, key4, value);
+            db.flush(flushOptions);
+
+            db.get(readOptions, key1);
+            db.get(readOptions, key2);
+            db.get(readOptions, key3);
+            db.get(readOptions, key4);
+
+            assertThat(statistics.getTickerCount(TickerType.BLOCK_CACHE_COMPRESSED_ADD)).isEqualTo(shard + 1);
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void blockSize() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockSize(10);
+    assertThat(blockBasedTableConfig.blockSize()).isEqualTo(10);
+  }
+
+  @Test
+  public void blockSizeDeviation() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockSizeDeviation(12);
+    assertThat(blockBasedTableConfig.blockSizeDeviation()).
+        isEqualTo(12);
+  }
+
+  @Test
+  public void blockRestartInterval() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockRestartInterval(15);
+    assertThat(blockBasedTableConfig.blockRestartInterval()).
+        isEqualTo(15);
+  }
+
+  @Test
+  public void indexBlockRestartInterval() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setIndexBlockRestartInterval(15);
+    assertThat(blockBasedTableConfig.indexBlockRestartInterval()).
+        isEqualTo(15);
+  }
+
+  @Test
+  public void metadataBlockSize() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setMetadataBlockSize(1024);
+    assertThat(blockBasedTableConfig.metadataBlockSize()).
+        isEqualTo(1024);
+  }
+
+  @Test
+  public void partitionFilters() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setPartitionFilters(true);
+    assertThat(blockBasedTableConfig.partitionFilters()).
+        isTrue();
+  }
+
+  @Test
+  public void optimizeFiltersForMemory() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setOptimizeFiltersForMemory(true);
+    assertThat(blockBasedTableConfig.optimizeFiltersForMemory()).isTrue();
+  }
+
+  @Test
+  public void useDeltaEncoding() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setUseDeltaEncoding(false);
+    assertThat(blockBasedTableConfig.useDeltaEncoding()).
+        isFalse();
+  }
+
+  @Test
+  public void blockBasedTableWithFilterPolicy() {
+    try(final Options options = new Options()
+        .setTableFormatConfig(new BlockBasedTableConfig()
+            .setFilterPolicy(new BloomFilter(10)))) {
+      assertThat(options.tableFactoryName()).
+          isEqualTo("BlockBasedTable");
+    }
+  }
+
+  @Test
+  public void blockBasedTableWithoutFilterPolicy() {
+    try(final Options options = new Options().setTableFormatConfig(
+        new BlockBasedTableConfig().setFilterPolicy(null))) {
+      assertThat(options.tableFactoryName()).
+          isEqualTo("BlockBasedTable");
+    }
+  }
+
+  @Test
+  public void wholeKeyFiltering() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setWholeKeyFiltering(false);
+    assertThat(blockBasedTableConfig.wholeKeyFiltering()).
+        isFalse();
+  }
+
+  @Test
+  public void verifyCompression() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    assertThat(blockBasedTableConfig.verifyCompression()).isFalse();
+    blockBasedTableConfig.setVerifyCompression(true);
+    assertThat(blockBasedTableConfig.verifyCompression()).
+        isTrue();
+  }
+
+  @Test
+  public void readAmpBytesPerBit() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setReadAmpBytesPerBit(2);
+    assertThat(blockBasedTableConfig.readAmpBytesPerBit()).
+        isEqualTo(2);
+  }
+
+  @Test
+  public void formatVersion() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    for (int version = 0; version <= 5; version++) {
+      blockBasedTableConfig.setFormatVersion(version);
+      assertThat(blockBasedTableConfig.formatVersion()).isEqualTo(version);
+    }
+  }
+
+  @Test(expected = AssertionError.class)
+  public void formatVersionFailNegative() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setFormatVersion(-1);
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void invalidFormatVersion() throws RocksDBException {
+    final BlockBasedTableConfig blockBasedTableConfig =
+        new BlockBasedTableConfig().setFormatVersion(99999);
+
+    try (final Options options = new Options().setTableFormatConfig(blockBasedTableConfig);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      fail("Opening the database with an invalid format_version should have raised an exception");
+    }
+  }
+
+  @Test
+  public void enableIndexCompression() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setEnableIndexCompression(false);
+    assertThat(blockBasedTableConfig.enableIndexCompression()).
+        isFalse();
+  }
+
+  @Test
+  public void blockAlign() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockAlign(true);
+    assertThat(blockBasedTableConfig.blockAlign()).
+        isTrue();
+  }
+
+  @Test
+  public void indexShortening() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setIndexShortening(IndexShorteningMode.kShortenSeparatorsAndSuccessor);
+    assertThat(blockBasedTableConfig.indexShortening())
+        .isEqualTo(IndexShorteningMode.kShortenSeparatorsAndSuccessor);
+  }
+
+  @Deprecated
+  @Test
+  public void hashIndexAllowCollision() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setHashIndexAllowCollision(false);
+    assertThat(blockBasedTableConfig.hashIndexAllowCollision()).
+        isTrue();  // NOTE: setHashIndexAllowCollision should do nothing!
+  }
+
+  @Deprecated
+  @Test
+  public void blockCacheSize() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockCacheSize(8 * 1024);
+    assertThat(blockBasedTableConfig.blockCacheSize()).
+        isEqualTo(8 * 1024);
+  }
+
+  @Deprecated
+  @Test
+  public void blockCacheNumShardBits() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setCacheNumShardBits(5);
+    assertThat(blockBasedTableConfig.cacheNumShardBits()).
+        isEqualTo(5);
+  }
+
+  @Deprecated
+  @Test
+  public void blockCacheCompressedSize() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockCacheCompressedSize(40);
+    assertThat(blockBasedTableConfig.blockCacheCompressedSize()).
+        isEqualTo(40);
+  }
+
+  @Deprecated
+  @Test
+  public void blockCacheCompressedNumShardBits() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4);
+    assertThat(blockBasedTableConfig.blockCacheCompressedNumShardBits()).
+        isEqualTo(4);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BuiltinComparatorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BuiltinComparatorTest.java
new file mode 100644
index 000000000..e238ae07b
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BuiltinComparatorTest.java
@@ -0,0 +1,145 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class BuiltinComparatorTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void builtinForwardComparator()
+      throws RocksDBException {
+    try (final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setComparator(BuiltinComparator.BYTEWISE_COMPARATOR);
+         final RocksDB rocksDb = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())
+    ) {
+      rocksDb.put("abc1".getBytes(), "abc1".getBytes());
+      rocksDb.put("abc2".getBytes(), "abc2".getBytes());
+      rocksDb.put("abc3".getBytes(), "abc3".getBytes());
+
+      try(final RocksIterator rocksIterator = rocksDb.newIterator()) {
+        // Iterate over keys using a iterator
+        rocksIterator.seekToFirst();
+        assertThat(rocksIterator.isValid()).isTrue();
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc1".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc1".getBytes());
+        rocksIterator.next();
+        assertThat(rocksIterator.isValid()).isTrue();
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc2".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc2".getBytes());
+        rocksIterator.next();
+        assertThat(rocksIterator.isValid()).isTrue();
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc3".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc3".getBytes());
+        rocksIterator.next();
+        assertThat(rocksIterator.isValid()).isFalse();
+        // Get last one
+        rocksIterator.seekToLast();
+        assertThat(rocksIterator.isValid()).isTrue();
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc3".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc3".getBytes());
+        // Seek for abc
+        rocksIterator.seek("abc".getBytes());
+        assertThat(rocksIterator.isValid()).isTrue();
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc1".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc1".getBytes());
+      }
+    }
+  }
+
+  @Test
+  public void builtinReverseComparator()
+      throws RocksDBException {
+    try (final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setComparator(BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR);
+         final RocksDB rocksDb = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())
+    ) {
+
+      rocksDb.put("abc1".getBytes(), "abc1".getBytes());
+      rocksDb.put("abc2".getBytes(), "abc2".getBytes());
+      rocksDb.put("abc3".getBytes(), "abc3".getBytes());
+
+      try (final RocksIterator rocksIterator = rocksDb.newIterator()) {
+        // Iterate over keys using a iterator
+        rocksIterator.seekToFirst();
+        assertThat(rocksIterator.isValid()).isTrue();
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc3".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc3".getBytes());
+        rocksIterator.next();
+        assertThat(rocksIterator.isValid()).isTrue();
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc2".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc2".getBytes());
+        rocksIterator.next();
+        assertThat(rocksIterator.isValid()).isTrue();
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc1".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc1".getBytes());
+        rocksIterator.next();
+        assertThat(rocksIterator.isValid()).isFalse();
+        // Get last one
+        rocksIterator.seekToLast();
+        assertThat(rocksIterator.isValid()).isTrue();
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc1".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc1".getBytes());
+        // Will be invalid because abc is after abc1
+        rocksIterator.seek("abc".getBytes());
+        assertThat(rocksIterator.isValid()).isFalse();
+        // Will be abc3 because the next one after abc999
+        // is abc3
+        rocksIterator.seek("abc999".getBytes());
+        assertThat(rocksIterator.key()).isEqualTo(
+            "abc3".getBytes());
+        assertThat(rocksIterator.value()).isEqualTo(
+            "abc3".getBytes());
+      }
+    }
+  }
+
+  @Test
+  public void builtinComparatorEnum(){
+    assertThat(BuiltinComparator.BYTEWISE_COMPARATOR.ordinal())
+        .isEqualTo(0);
+    assertThat(
+        BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR.ordinal())
+        .isEqualTo(1);
+    assertThat(BuiltinComparator.values().length).isEqualTo(2);
+    assertThat(BuiltinComparator.valueOf("BYTEWISE_COMPARATOR")).
+        isEqualTo(BuiltinComparator.BYTEWISE_COMPARATOR);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java
new file mode 100644
index 000000000..fe950362b
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java
@@ -0,0 +1,126 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.junit.Assert.assertArrayEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.util.BytewiseComparator;
+
+/**
+ * This test confirms that the following issues were in fact resolved
+ * by a change made between 6.2.2 and 6.22.1,
+ * to wit {@link <a href="https://github.com/facebook/rocksdb/commit/7242dae7">...</a>}
+ * which as part of its effect, changed the Java bytewise comparators.
+ *
+ * {@link <a href="https://github.com/facebook/rocksdb/issues/5891">...</a>}
+ * {@link <a href="https://github.com/facebook/rocksdb/issues/2001">...</a>}
+ */
+public class BytewiseComparatorRegressionTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Rule public TemporaryFolder temporarySSTFolder = new TemporaryFolder();
+
+  private final static byte[][] testData = {{10, -11, 13}, {10, 11, 12}, {10, 11, 14}};
+  private final static byte[][] orderedData = {{10, 11, 12}, {10, 11, 14}, {10, -11, 13}};
+
+  /**
+   * {@link <a href="https://github.com/facebook/rocksdb/issues/5891">...</a>}
+   */
+  @Test
+  public void testJavaComparator() throws RocksDBException {
+    final BytewiseComparator comparator = new BytewiseComparator(new ComparatorOptions());
+    performTest(new Options().setCreateIfMissing(true).setComparator(comparator));
+  }
+
+  @Test
+  public void testDefaultComparator() throws RocksDBException {
+    performTest(new Options().setCreateIfMissing(true));
+  }
+
+  /**
+   * {@link <a href="https://github.com/facebook/rocksdb/issues/5891">...</a>}
+   */
+  @Test
+  public void testCppComparator() throws RocksDBException {
+    performTest(new Options().setCreateIfMissing(true).setComparator(
+        BuiltinComparator.BYTEWISE_COMPARATOR));
+  }
+
+  private void performTest(final Options options) throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      for (final byte[] item : testData) {
+        db.put(item, item);
+      }
+      try (final RocksIterator iterator = db.newIterator()) {
+        iterator.seekToFirst();
+        final ArrayList<byte[]> result = new ArrayList<>();
+        while (iterator.isValid()) {
+          result.add(iterator.key());
+          iterator.next();
+        }
+        assertArrayEquals(orderedData, result.toArray());
+      }
+    }
+  }
+
+  private byte[] hexToByte(final String hexString) {
+    final byte[] bytes = new byte[hexString.length() / 2];
+    if (bytes.length * 2 < hexString.length()) {
+      throw new RuntimeException("Hex string has odd length: " + hexString);
+    }
+
+    for (int i = 0; i < bytes.length; i++) {
+      final int firstDigit = toDigit(hexString.charAt(i + i));
+      final int secondDigit = toDigit(hexString.charAt(i + i + 1));
+      bytes[i] = (byte) ((firstDigit << 4) + secondDigit);
+    }
+
+    return bytes;
+  }
+
+  private int toDigit(final char hexChar) {
+    final int digit = Character.digit(hexChar, 16);
+    if (digit == -1) {
+      throw new IllegalArgumentException("Invalid Hexadecimal Character: " + hexChar);
+    }
+    return digit;
+  }
+
+  /**
+   * {@link <a href="https://github.com/facebook/rocksdb/issues/2001">...</a>}
+   *
+   * @throws RocksDBException if something goes wrong, or if the regression occurs
+   * @throws IOException if we can't make the temporary file
+   */
+  @Test
+  public void testSST() throws RocksDBException, IOException {
+    final File tempSSTFile = temporarySSTFolder.newFile("test_file_with_weird_keys.sst");
+
+    final EnvOptions envOpts = new EnvOptions();
+    final Options opts = new Options();
+    opts.setComparator(new BytewiseComparator(new ComparatorOptions()));
+    final SstFileWriter writer = new SstFileWriter(envOpts, opts);
+    writer.open(tempSSTFile.getAbsolutePath());
+    final byte[] gKey =
+        hexToByte("000000293030303030303030303030303030303030303032303736343730696E666F33");
+    final byte[] wKey =
+        hexToByte("0000008d3030303030303030303030303030303030303030303437363433696e666f34");
+    writer.put(new Slice(gKey), new Slice("dummyV1"));
+    writer.put(new Slice(wKey), new Slice("dummyV2"));
+    writer.finish();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java
new file mode 100644
index 000000000..c2cc6fc62
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java
@@ -0,0 +1,83 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CheckPointTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Rule
+  public TemporaryFolder checkpointFolder = new TemporaryFolder();
+
+  @Test
+  public void checkPoint() throws RocksDBException {
+    try (final Options options = new Options().
+        setCreateIfMissing(true)) {
+
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath())) {
+        db.put("key".getBytes(), "value".getBytes());
+        try (final Checkpoint checkpoint = Checkpoint.create(db)) {
+          checkpoint.createCheckpoint(checkpointFolder.
+              getRoot().getAbsolutePath() + "/snapshot1");
+          db.put("key2".getBytes(), "value2".getBytes());
+          checkpoint.createCheckpoint(checkpointFolder.
+              getRoot().getAbsolutePath() + "/snapshot2");
+        }
+      }
+
+      try (final RocksDB db = RocksDB.open(options,
+          checkpointFolder.getRoot().getAbsolutePath() +
+              "/snapshot1")) {
+        assertThat(new String(db.get("key".getBytes()))).
+            isEqualTo("value");
+        assertThat(db.get("key2".getBytes())).isNull();
+      }
+
+      try (final RocksDB db = RocksDB.open(options,
+          checkpointFolder.getRoot().getAbsolutePath() +
+              "/snapshot2")) {
+        assertThat(new String(db.get("key".getBytes()))).
+            isEqualTo("value");
+        assertThat(new String(db.get("key2".getBytes()))).
+            isEqualTo("value2");
+      }
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failIfDbIsNull() {
+    try (final Checkpoint checkpoint = Checkpoint.create(null)) {
+
+    }
+  }
+
+  @Test(expected = IllegalStateException.class)
+  public void failIfDbNotInitialized() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(
+        dbFolder.getRoot().getAbsolutePath())) {
+      db.close();
+      Checkpoint.create(db);
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failWithIllegalPath() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final Checkpoint checkpoint = Checkpoint.create(db)) {
+      checkpoint.createCheckpoint("/Z:///:\\C:\\TZ/-");
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ClockCacheTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ClockCacheTest.java
new file mode 100644
index 000000000..d1241ac75
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ClockCacheTest.java
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+public class ClockCacheTest {
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  @Test
+  public void newClockCache() {
+    final long capacity = 1000;
+    final int numShardBits = 16;
+    final boolean strictCapacityLimit = true;
+    try(final Cache clockCache = new ClockCache(capacity,
+        numShardBits, strictCapacityLimit)) {
+      //no op
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
new file mode 100644
index 000000000..7d7581048
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
@@ -0,0 +1,714 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.*;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory;
+
+public class ColumnFamilyOptionsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void copyConstructor() {
+    ColumnFamilyOptions origOpts = new ColumnFamilyOptions();
+    origOpts.setNumLevels(rand.nextInt(8));
+    origOpts.setTargetFileSizeMultiplier(rand.nextInt(100));
+    origOpts.setLevel0StopWritesTrigger(rand.nextInt(50));
+    ColumnFamilyOptions copyOpts = new ColumnFamilyOptions(origOpts);
+    assertThat(origOpts.numLevels()).isEqualTo(copyOpts.numLevels());
+    assertThat(origOpts.targetFileSizeMultiplier()).isEqualTo(copyOpts.targetFileSizeMultiplier());
+    assertThat(origOpts.level0StopWritesTrigger()).isEqualTo(copyOpts.level0StopWritesTrigger());
+  }
+
+  @Test
+  public void getColumnFamilyOptionsFromProps() {
+    Properties properties = new Properties();
+    properties.put("write_buffer_size", "112");
+    properties.put("max_write_buffer_number", "13");
+
+    try (final ColumnFamilyOptions opt = ColumnFamilyOptions.
+        getColumnFamilyOptionsFromProps(properties)) {
+      // setup sample properties
+      assertThat(opt).isNotNull();
+      assertThat(String.valueOf(opt.writeBufferSize())).
+          isEqualTo(properties.get("write_buffer_size"));
+      assertThat(String.valueOf(opt.maxWriteBufferNumber())).
+          isEqualTo(properties.get("max_write_buffer_number"));
+    }
+  }
+
+  @Test
+  public void getColumnFamilyOptionsFromPropsWithIgnoreIllegalValue() {
+    // setup sample properties
+    final Properties properties = new Properties();
+    properties.put("tomato", "1024");
+    properties.put("burger", "2");
+    properties.put("write_buffer_size", "112");
+    properties.put("max_write_buffer_number", "13");
+
+    try (final ConfigOptions cfgOpts = new ConfigOptions().setIgnoreUnknownOptions(true);
+         final ColumnFamilyOptions opt =
+             ColumnFamilyOptions.getColumnFamilyOptionsFromProps(cfgOpts, properties)) {
+      // setup sample properties
+      assertThat(opt).isNotNull();
+      assertThat(String.valueOf(opt.writeBufferSize()))
+          .isEqualTo(properties.get("write_buffer_size"));
+      assertThat(String.valueOf(opt.maxWriteBufferNumber()))
+          .isEqualTo(properties.get("max_write_buffer_number"));
+    }
+  }
+
+  @Test
+  public void failColumnFamilyOptionsFromPropsWithIllegalValue() {
+    // setup sample properties
+    final Properties properties = new Properties();
+    properties.put("tomato", "1024");
+    properties.put("burger", "2");
+
+    try (final ColumnFamilyOptions opt =
+             ColumnFamilyOptions.getColumnFamilyOptionsFromProps(properties)) {
+      assertThat(opt).isNull();
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failColumnFamilyOptionsFromPropsWithNullValue() {
+    try (final ColumnFamilyOptions opt =
+             ColumnFamilyOptions.getColumnFamilyOptionsFromProps(null)) {
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failColumnFamilyOptionsFromPropsWithEmptyProps() {
+    try (final ColumnFamilyOptions opt =
+             ColumnFamilyOptions.getColumnFamilyOptionsFromProps(
+                 new Properties())) {
+    }
+  }
+
+  @Test
+  public void writeBufferSize() throws RocksDBException {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setWriteBufferSize(longValue);
+      assertThat(opt.writeBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void maxWriteBufferNumber() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxWriteBufferNumber(intValue);
+      assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void minWriteBufferNumberToMerge() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setMinWriteBufferNumberToMerge(intValue);
+      assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void numLevels() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setNumLevels(intValue);
+      assertThat(opt.numLevels()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void levelZeroFileNumCompactionTrigger() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setLevelZeroFileNumCompactionTrigger(intValue);
+      assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void levelZeroSlowdownWritesTrigger() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setLevelZeroSlowdownWritesTrigger(intValue);
+      assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void levelZeroStopWritesTrigger() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setLevelZeroStopWritesTrigger(intValue);
+      assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void targetFileSizeBase() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setTargetFileSizeBase(longValue);
+      assertThat(opt.targetFileSizeBase()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void targetFileSizeMultiplier() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setTargetFileSizeMultiplier(intValue);
+      assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelBase() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxBytesForLevelBase(longValue);
+      assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void levelCompactionDynamicLevelBytes() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setLevelCompactionDynamicLevelBytes(boolValue);
+      assertThat(opt.levelCompactionDynamicLevelBytes())
+          .isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelMultiplier() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final double doubleValue = rand.nextDouble();
+      opt.setMaxBytesForLevelMultiplier(doubleValue);
+      assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(doubleValue);
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelMultiplierAdditional() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue1 = rand.nextInt();
+      final int intValue2 = rand.nextInt();
+      final int[] ints = new int[]{intValue1, intValue2};
+      opt.setMaxBytesForLevelMultiplierAdditional(ints);
+      assertThat(opt.maxBytesForLevelMultiplierAdditional()).isEqualTo(ints);
+    }
+  }
+
+  @Test
+  public void maxCompactionBytes() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxCompactionBytes(longValue);
+      assertThat(opt.maxCompactionBytes()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void softPendingCompactionBytesLimit() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setSoftPendingCompactionBytesLimit(longValue);
+      assertThat(opt.softPendingCompactionBytesLimit()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void hardPendingCompactionBytesLimit() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setHardPendingCompactionBytesLimit(longValue);
+      assertThat(opt.hardPendingCompactionBytesLimit()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void level0FileNumCompactionTrigger() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setLevel0FileNumCompactionTrigger(intValue);
+      assertThat(opt.level0FileNumCompactionTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void level0SlowdownWritesTrigger() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setLevel0SlowdownWritesTrigger(intValue);
+      assertThat(opt.level0SlowdownWritesTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void level0StopWritesTrigger() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setLevel0StopWritesTrigger(intValue);
+      assertThat(opt.level0StopWritesTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void arenaBlockSize() throws RocksDBException {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setArenaBlockSize(longValue);
+      assertThat(opt.arenaBlockSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void disableAutoCompactions() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setDisableAutoCompactions(boolValue);
+      assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void maxSequentialSkipInIterations() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxSequentialSkipInIterations(longValue);
+      assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void inplaceUpdateSupport() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setInplaceUpdateSupport(boolValue);
+      assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void inplaceUpdateNumLocks() throws RocksDBException {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setInplaceUpdateNumLocks(longValue);
+      assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void memtablePrefixBloomSizeRatio() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final double doubleValue = rand.nextDouble();
+      opt.setMemtablePrefixBloomSizeRatio(doubleValue);
+      assertThat(opt.memtablePrefixBloomSizeRatio()).isEqualTo(doubleValue);
+    }
+  }
+
+  @Test
+  public void experimentalMempurgeThreshold() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final double doubleValue = rand.nextDouble();
+      opt.setExperimentalMempurgeThreshold(doubleValue);
+      assertThat(opt.experimentalMempurgeThreshold()).isEqualTo(doubleValue);
+    }
+  }
+
+  @Test
+  public void memtableWholeKeyFiltering() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final boolean booleanValue = rand.nextBoolean();
+      opt.setMemtableWholeKeyFiltering(booleanValue);
+      assertThat(opt.memtableWholeKeyFiltering()).isEqualTo(booleanValue);
+    }
+  }
+
+  @Test
+  public void memtableHugePageSize() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMemtableHugePageSize(longValue);
+      assertThat(opt.memtableHugePageSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void bloomLocality() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setBloomLocality(intValue);
+      assertThat(opt.bloomLocality()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxSuccessiveMerges() throws RocksDBException {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxSuccessiveMerges(longValue);
+      assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void optimizeFiltersForHits() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final boolean aBoolean = rand.nextBoolean();
+      opt.setOptimizeFiltersForHits(aBoolean);
+      assertThat(opt.optimizeFiltersForHits()).isEqualTo(aBoolean);
+    }
+  }
+
+  @Test
+  public void memTable() throws RocksDBException {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      opt.setMemTableConfig(new HashLinkedListMemTableConfig());
+      assertThat(opt.memTableFactoryName()).
+          isEqualTo("HashLinkedListRepFactory");
+    }
+  }
+
+  @Test
+  public void comparator() throws RocksDBException {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      opt.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR);
+    }
+  }
+
+  @Test
+  public void linkageOfPrepMethods() {
+    try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) {
+      options.optimizeUniversalStyleCompaction();
+      options.optimizeUniversalStyleCompaction(4000);
+      options.optimizeLevelStyleCompaction();
+      options.optimizeLevelStyleCompaction(3000);
+      options.optimizeForPointLookup(10);
+      options.optimizeForSmallDb();
+    }
+  }
+
+  @Test
+  public void shouldSetTestPrefixExtractor() {
+    try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) {
+      options.useFixedLengthPrefixExtractor(100);
+      options.useFixedLengthPrefixExtractor(10);
+    }
+  }
+
+  @Test
+  public void shouldSetTestCappedPrefixExtractor() {
+    try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) {
+      options.useCappedPrefixExtractor(100);
+      options.useCappedPrefixExtractor(10);
+    }
+  }
+
+  @Test
+  public void compressionTypes() {
+    try (final ColumnFamilyOptions columnFamilyOptions
+             = new ColumnFamilyOptions()) {
+      for (final CompressionType compressionType :
+          CompressionType.values()) {
+        columnFamilyOptions.setCompressionType(compressionType);
+        assertThat(columnFamilyOptions.compressionType()).
+            isEqualTo(compressionType);
+        assertThat(CompressionType.valueOf("NO_COMPRESSION")).
+            isEqualTo(CompressionType.NO_COMPRESSION);
+      }
+    }
+  }
+
+  @Test
+  public void compressionPerLevel() {
+    try (final ColumnFamilyOptions columnFamilyOptions
+             = new ColumnFamilyOptions()) {
+      assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty();
+      List<CompressionType> compressionTypeList = new ArrayList<>();
+      for (int i = 0; i < columnFamilyOptions.numLevels(); i++) {
+        compressionTypeList.add(CompressionType.NO_COMPRESSION);
+      }
+      columnFamilyOptions.setCompressionPerLevel(compressionTypeList);
+      compressionTypeList = columnFamilyOptions.compressionPerLevel();
+      for (CompressionType compressionType : compressionTypeList) {
+        assertThat(compressionType).isEqualTo(
+            CompressionType.NO_COMPRESSION);
+      }
+    }
+  }
+
+  @Test
+  public void differentCompressionsPerLevel() {
+    try (final ColumnFamilyOptions columnFamilyOptions
+             = new ColumnFamilyOptions()) {
+      columnFamilyOptions.setNumLevels(3);
+
+      assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty();
+      List<CompressionType> compressionTypeList = new ArrayList<>();
+
+      compressionTypeList.add(CompressionType.BZLIB2_COMPRESSION);
+      compressionTypeList.add(CompressionType.SNAPPY_COMPRESSION);
+      compressionTypeList.add(CompressionType.LZ4_COMPRESSION);
+
+      columnFamilyOptions.setCompressionPerLevel(compressionTypeList);
+      compressionTypeList = columnFamilyOptions.compressionPerLevel();
+
+      assertThat(compressionTypeList.size()).isEqualTo(3);
+      assertThat(compressionTypeList).
+          containsExactly(
+              CompressionType.BZLIB2_COMPRESSION,
+              CompressionType.SNAPPY_COMPRESSION,
+              CompressionType.LZ4_COMPRESSION);
+
+    }
+  }
+
+  @Test
+  public void bottommostCompressionType() {
+    try (final ColumnFamilyOptions columnFamilyOptions
+             = new ColumnFamilyOptions()) {
+      assertThat(columnFamilyOptions.bottommostCompressionType())
+          .isEqualTo(CompressionType.DISABLE_COMPRESSION_OPTION);
+
+      for (final CompressionType compressionType : CompressionType.values()) {
+        columnFamilyOptions.setBottommostCompressionType(compressionType);
+        assertThat(columnFamilyOptions.bottommostCompressionType())
+            .isEqualTo(compressionType);
+      }
+    }
+  }
+
+  @Test
+  public void bottommostCompressionOptions() {
+    try (final ColumnFamilyOptions columnFamilyOptions =
+             new ColumnFamilyOptions();
+         final CompressionOptions bottommostCompressionOptions =
+             new CompressionOptions()
+                 .setMaxDictBytes(123)) {
+
+      columnFamilyOptions.setBottommostCompressionOptions(
+          bottommostCompressionOptions);
+      assertThat(columnFamilyOptions.bottommostCompressionOptions())
+          .isEqualTo(bottommostCompressionOptions);
+      assertThat(columnFamilyOptions.bottommostCompressionOptions()
+          .maxDictBytes()).isEqualTo(123);
+    }
+  }
+
+  @Test
+  public void compressionOptions() {
+    try (final ColumnFamilyOptions columnFamilyOptions
+             = new ColumnFamilyOptions();
+        final CompressionOptions compressionOptions = new CompressionOptions()
+          .setMaxDictBytes(123)) {
+
+      columnFamilyOptions.setCompressionOptions(compressionOptions);
+      assertThat(columnFamilyOptions.compressionOptions())
+          .isEqualTo(compressionOptions);
+      assertThat(columnFamilyOptions.compressionOptions().maxDictBytes())
+          .isEqualTo(123);
+    }
+  }
+
+  @Test
+  public void compactionStyles() {
+    try (final ColumnFamilyOptions columnFamilyOptions
+             = new ColumnFamilyOptions()) {
+      for (final CompactionStyle compactionStyle :
+          CompactionStyle.values()) {
+        columnFamilyOptions.setCompactionStyle(compactionStyle);
+        assertThat(columnFamilyOptions.compactionStyle()).
+            isEqualTo(compactionStyle);
+        assertThat(CompactionStyle.valueOf("FIFO")).
+            isEqualTo(CompactionStyle.FIFO);
+      }
+    }
+  }
+
+  @Test
+  public void maxTableFilesSizeFIFO() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      long longValue = rand.nextLong();
+      // Size has to be positive
+      longValue = (longValue < 0) ? -longValue : longValue;
+      longValue = (longValue == 0) ? longValue + 1 : longValue;
+      opt.setMaxTableFilesSizeFIFO(longValue);
+      assertThat(opt.maxTableFilesSizeFIFO()).
+          isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void maxWriteBufferNumberToMaintain() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      int intValue = rand.nextInt();
+      // Size has to be positive
+      intValue = (intValue < 0) ? -intValue : intValue;
+      intValue = (intValue == 0) ? intValue + 1 : intValue;
+      opt.setMaxWriteBufferNumberToMaintain(intValue);
+      assertThat(opt.maxWriteBufferNumberToMaintain()).
+          isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void compactionPriorities() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      for (final CompactionPriority compactionPriority :
+          CompactionPriority.values()) {
+        opt.setCompactionPriority(compactionPriority);
+        assertThat(opt.compactionPriority()).
+            isEqualTo(compactionPriority);
+      }
+    }
+  }
+
+  @Test
+  public void reportBgIoStats() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final boolean booleanValue = true;
+      opt.setReportBgIoStats(booleanValue);
+      assertThat(opt.reportBgIoStats()).
+          isEqualTo(booleanValue);
+    }
+  }
+
+  @Test
+  public void ttl() {
+    try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) {
+      options.setTtl(1000 * 60);
+      assertThat(options.ttl()).
+          isEqualTo(1000 * 60);
+    }
+  }
+
+  @Test
+  public void periodicCompactionSeconds() {
+    try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) {
+      options.setPeriodicCompactionSeconds(1000 * 60);
+      assertThat(options.periodicCompactionSeconds()).isEqualTo(1000 * 60);
+    }
+  }
+
+  @Test
+  public void compactionOptionsUniversal() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions();
+        final CompactionOptionsUniversal optUni = new CompactionOptionsUniversal()
+          .setCompressionSizePercent(7)) {
+      opt.setCompactionOptionsUniversal(optUni);
+      assertThat(opt.compactionOptionsUniversal()).
+          isEqualTo(optUni);
+      assertThat(opt.compactionOptionsUniversal().compressionSizePercent())
+          .isEqualTo(7);
+    }
+  }
+
+  @Test
+  public void compactionOptionsFIFO() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions();
+         final CompactionOptionsFIFO optFifo = new CompactionOptionsFIFO()
+             .setMaxTableFilesSize(2000)) {
+      opt.setCompactionOptionsFIFO(optFifo);
+      assertThat(opt.compactionOptionsFIFO()).
+          isEqualTo(optFifo);
+      assertThat(opt.compactionOptionsFIFO().maxTableFilesSize())
+          .isEqualTo(2000);
+    }
+  }
+
+  @Test
+  public void forceConsistencyChecks() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      final boolean booleanValue = true;
+      opt.setForceConsistencyChecks(booleanValue);
+      assertThat(opt.forceConsistencyChecks()).
+          isEqualTo(booleanValue);
+    }
+  }
+
+  @Test
+  public void compactionFilter() {
+    try(final ColumnFamilyOptions options = new ColumnFamilyOptions();
+        final RemoveEmptyValueCompactionFilter cf = new RemoveEmptyValueCompactionFilter()) {
+      options.setCompactionFilter(cf);
+      assertThat(options.compactionFilter()).isEqualTo(cf);
+    }
+  }
+
+  @Test
+  public void compactionFilterFactory() {
+    try(final ColumnFamilyOptions options = new ColumnFamilyOptions();
+        final RemoveEmptyValueCompactionFilterFactory cff = new RemoveEmptyValueCompactionFilterFactory()) {
+      options.setCompactionFilterFactory(cff);
+      assertThat(options.compactionFilterFactory()).isEqualTo(cff);
+    }
+  }
+
+  @Test
+  public void compactionThreadLimiter() {
+    try (final ColumnFamilyOptions options = new ColumnFamilyOptions();
+         final ConcurrentTaskLimiter compactionThreadLimiter =
+             new ConcurrentTaskLimiterImpl("name", 3)) {
+      options.setCompactionThreadLimiter(compactionThreadLimiter);
+      assertThat(options.compactionThreadLimiter()).isEqualTo(compactionThreadLimiter);
+    }
+  }
+
+  @Test
+  public void oldDefaults() {
+    try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) {
+      options.oldDefaults(4, 6);
+      assertEquals(4 << 20, options.writeBufferSize());
+      assertThat(options.compactionPriority()).isEqualTo(CompactionPriority.ByCompensatedSize);
+      assertThat(options.targetFileSizeBase()).isEqualTo(2 * 1048576);
+      assertThat(options.maxBytesForLevelBase()).isEqualTo(10 * 1048576);
+      assertThat(options.softPendingCompactionBytesLimit()).isEqualTo(0);
+      assertThat(options.hardPendingCompactionBytesLimit()).isEqualTo(0);
+      assertThat(options.level0StopWritesTrigger()).isEqualTo(24);
+    }
+  }
+
+  @Test
+  public void optimizeForSmallDbWithCache() {
+    try (final ColumnFamilyOptions options = new ColumnFamilyOptions();
+         final Cache cache = new LRUCache(1024)) {
+      assertThat(options.optimizeForSmallDb(cache)).isEqualTo(options);
+    }
+  }
+
+  @Test
+  public void cfPaths() throws IOException {
+    try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) {
+      final List<DbPath> paths = Arrays.asList(
+          new DbPath(Paths.get("test1"), 2 << 25), new DbPath(Paths.get("/test2/path"), 2 << 25));
+      assertThat(options.cfPaths()).isEqualTo(Collections.emptyList());
+      assertThat(options.setCfPaths(paths)).isEqualTo(options);
+      assertThat(options.cfPaths()).isEqualTo(paths);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java
new file mode 100644
index 000000000..e98327d93
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java
@@ -0,0 +1,582 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.util.*;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class ColumnFamilyTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void columnFamilyDescriptorName() throws RocksDBException {
+    final byte[] cfName = "some_name".getBytes(UTF_8);
+
+    try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) {
+      final ColumnFamilyDescriptor cfDescriptor =
+              new ColumnFamilyDescriptor(cfName, cfOptions);
+      assertThat(cfDescriptor.getName()).isEqualTo(cfName);
+    }
+  }
+
+  @Test
+  public void columnFamilyDescriptorOptions() throws RocksDBException {
+    final byte[] cfName = "some_name".getBytes(UTF_8);
+
+    try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()
+            .setCompressionType(CompressionType.BZLIB2_COMPRESSION)) {
+      final ColumnFamilyDescriptor cfDescriptor =
+          new ColumnFamilyDescriptor(cfName, cfOptions);
+
+        assertThat(cfDescriptor.getOptions().compressionType())
+            .isEqualTo(CompressionType.BZLIB2_COMPRESSION);
+    }
+  }
+
+  @Test
+  public void listColumnFamilies() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      // Test listColumnFamilies
+      final List<byte[]> columnFamilyNames = RocksDB.listColumnFamilies(options,
+          dbFolder.getRoot().getAbsolutePath());
+      assertThat(columnFamilyNames).isNotNull();
+      assertThat(columnFamilyNames.size()).isGreaterThan(0);
+      assertThat(columnFamilyNames.size()).isEqualTo(1);
+      assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default");
+    }
+  }
+
+  @Test
+  public void defaultColumnFamily() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      final ColumnFamilyHandle cfh = db.getDefaultColumnFamily();
+      try {
+        assertThat(cfh).isNotNull();
+
+        assertThat(cfh.getName()).isEqualTo("default".getBytes(UTF_8));
+        assertThat(cfh.getID()).isEqualTo(0);
+        assertThat(cfh.getDescriptor().getName()).isEqualTo("default".getBytes(UTF_8));
+
+        final byte[] key = "key".getBytes();
+        final byte[] value = "value".getBytes();
+
+        db.put(cfh, key, value);
+
+        final byte[] actualValue = db.get(cfh, key);
+
+        assertThat(cfh).isNotNull();
+        assertThat(actualValue).isEqualTo(value);
+      } finally {
+        cfh.close();
+      }
+    }
+  }
+
+  @Test
+  public void createColumnFamily() throws RocksDBException {
+    final byte[] cfName = "new_cf".getBytes(UTF_8);
+    final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName,
+            new ColumnFamilyOptions());
+
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+                 dbFolder.getRoot().getAbsolutePath())) {
+
+      final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(cfDescriptor);
+
+      try {
+        assertThat(columnFamilyHandle.getName()).isEqualTo(cfName);
+        assertThat(columnFamilyHandle.getID()).isEqualTo(1);
+
+        final ColumnFamilyDescriptor latestDescriptor = columnFamilyHandle.getDescriptor();
+        assertThat(latestDescriptor.getName()).isEqualTo(cfName);
+
+        final List<byte[]> columnFamilyNames = RocksDB.listColumnFamilies(
+                options, dbFolder.getRoot().getAbsolutePath());
+        assertThat(columnFamilyNames).isNotNull();
+        assertThat(columnFamilyNames.size()).isGreaterThan(0);
+        assertThat(columnFamilyNames.size()).isEqualTo(2);
+        assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default");
+        assertThat(new String(columnFamilyNames.get(1))).isEqualTo("new_cf");
+      } finally {
+        columnFamilyHandle.close();
+      }
+    }
+  }
+
+  @Test
+  public void openWithColumnFamilies() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfNames = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes())
+    );
+
+    final List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+
+    // Test open database with column family names
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(), cfNames,
+             columnFamilyHandleList)) {
+      assertThat(columnFamilyHandleList.size()).isEqualTo(2);
+      db.put("dfkey1".getBytes(), "dfvalue".getBytes());
+      db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), "dfvalue".getBytes());
+      db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), "newcfvalue".getBytes());
+
+      String retVal = new String(db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes()));
+      assertThat(retVal).isEqualTo("newcfvalue");
+      assertThat((db.get(columnFamilyHandleList.get(1), "dfkey1".getBytes()))).isNull();
+      db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes());
+      assertThat((db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes()))).isNull();
+      db.delete(columnFamilyHandleList.get(0), new WriteOptions(), "dfkey2".getBytes());
+      assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), "dfkey2".getBytes()))
+          .isNull();
+    }
+  }
+
+  @Test
+  public void getWithOutValueAndCf() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+
+    // Test open database with column family names
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+             columnFamilyHandleList)) {
+      db.put(
+          columnFamilyHandleList.get(0), new WriteOptions(), "key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      final byte[] outValue = new byte[5];
+      // not found value
+      int getResult = db.get("keyNotFound".getBytes(), outValue);
+      assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND);
+      // found value which fits in outValue
+      getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("value".getBytes());
+      // found value which fits partially
+      getResult =
+          db.get(columnFamilyHandleList.get(0), new ReadOptions(), "key2".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("12345".getBytes());
+    }
+  }
+
+  @Test
+  public void createWriteDropColumnFamily() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+             columnFamilyHandleList)) {
+      ColumnFamilyHandle tmpColumnFamilyHandle;
+      tmpColumnFamilyHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions()));
+      db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes());
+      db.dropColumnFamily(tmpColumnFamilyHandle);
+      assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue();
+    }
+  }
+
+  @Test
+  public void createWriteDropColumnFamilies() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+             columnFamilyHandleList)) {
+      ColumnFamilyHandle tmpColumnFamilyHandle = null;
+      ColumnFamilyHandle tmpColumnFamilyHandle2 = null;
+      tmpColumnFamilyHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions()));
+      tmpColumnFamilyHandle2 = db.createColumnFamily(
+          new ColumnFamilyDescriptor("tmpCF2".getBytes(), new ColumnFamilyOptions()));
+      db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes());
+      db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes());
+      db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2));
+      assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue();
+      assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue();
+    }
+  }
+
+  @Test
+  public void writeBatch() throws RocksDBException {
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator();
+         final ColumnFamilyOptions defaultCfOptions = new ColumnFamilyOptions()
+             .setMergeOperator(stringAppendOperator)) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+              defaultCfOptions),
+          new ColumnFamilyDescriptor("new_cf".getBytes()));
+      final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+      try (final DBOptions options = new DBOptions()
+          .setCreateIfMissing(true)
+          .setCreateMissingColumnFamilies(true);
+           final RocksDB db = RocksDB.open(options,
+               dbFolder.getRoot().getAbsolutePath(),
+               cfDescriptors, columnFamilyHandleList);
+           final WriteBatch writeBatch = new WriteBatch();
+           final WriteOptions writeOpt = new WriteOptions()) {
+        writeBatch.put("key".getBytes(), "value".getBytes());
+        writeBatch.put(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes());
+        writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes());
+        writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes());
+        writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes());
+        writeBatch.delete("xyz".getBytes());
+        writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes());
+        db.write(writeOpt, writeBatch);
+
+        assertThat(db.get(columnFamilyHandleList.get(1), "xyz".getBytes()) == null);
+        assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey".getBytes())))
+            .isEqualTo("value");
+        assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey2".getBytes())))
+            .isEqualTo("value2");
+        assertThat(new String(db.get("key".getBytes()))).isEqualTo("value");
+        // check if key is merged
+        assertThat(new String(db.get(db.getDefaultColumnFamily(), "mergeKey".getBytes())))
+            .isEqualTo("merge,merge");
+      }
+    }
+  }
+
+  @Test
+  public void iteratorOnColumnFamily() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(),
+             cfDescriptors, columnFamilyHandleList)) {
+      db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes());
+      db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes());
+      try (final RocksIterator rocksIterator = db.newIterator(columnFamilyHandleList.get(1))) {
+        rocksIterator.seekToFirst();
+        Map<String, String> refMap = new HashMap<>();
+        refMap.put("newcfkey", "value");
+        refMap.put("newcfkey2", "value2");
+        int i = 0;
+        while (rocksIterator.isValid()) {
+          i++;
+          assertThat(refMap.get(new String(rocksIterator.key())))
+              .isEqualTo(new String(rocksIterator.value()));
+          rocksIterator.next();
+        }
+        assertThat(i).isEqualTo(2);
+      }
+    }
+  }
+
+  @Test
+  public void multiGet() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(),
+             cfDescriptors, columnFamilyHandleList)) {
+      db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes());
+      db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes());
+
+      final List<byte[]> keys =
+          Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()});
+
+      List<byte[]> retValues = db.multiGetAsList(columnFamilyHandleList, keys);
+      assertThat(retValues.size()).isEqualTo(2);
+      assertThat(new String(retValues.get(0))).isEqualTo("value");
+      assertThat(new String(retValues.get(1))).isEqualTo("value");
+      retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys);
+      assertThat(retValues.size()).isEqualTo(2);
+      assertThat(new String(retValues.get(0))).isEqualTo("value");
+      assertThat(new String(retValues.get(1))).isEqualTo("value");
+    }
+  }
+
+  @Test
+  public void multiGetAsList() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(),
+             cfDescriptors, columnFamilyHandleList)) {
+      db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes());
+      db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes());
+
+      final List<byte[]> keys =
+          Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()});
+      List<byte[]> retValues = db.multiGetAsList(columnFamilyHandleList, keys);
+      assertThat(retValues.size()).isEqualTo(2);
+      assertThat(new String(retValues.get(0))).isEqualTo("value");
+      assertThat(new String(retValues.get(1))).isEqualTo("value");
+      retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys);
+      assertThat(retValues.size()).isEqualTo(2);
+      assertThat(new String(retValues.get(0))).isEqualTo("value");
+      assertThat(new String(retValues.get(1))).isEqualTo("value");
+    }
+  }
+
+  @Test
+  public void properties() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(),
+             cfDescriptors, columnFamilyHandleList)) {
+      assertThat(db.getProperty("rocksdb.estimate-num-keys")).isNotNull();
+      assertThat(db.getLongProperty(columnFamilyHandleList.get(0), "rocksdb.estimate-num-keys"))
+          .isGreaterThanOrEqualTo(0);
+      assertThat(db.getProperty("rocksdb.stats")).isNotNull();
+      assertThat(db.getProperty(columnFamilyHandleList.get(0), "rocksdb.sstables")).isNotNull();
+      assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.estimate-num-keys"))
+          .isNotNull();
+      assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.stats")).isNotNull();
+      assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.sstables")).isNotNull();
+      assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")).isNotNull();
+      assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys"))
+          .isGreaterThanOrEqualTo(0);
+    }
+  }
+
+
+  @Test
+  public void iterators() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+             columnFamilyHandleList)) {
+      List<RocksIterator> iterators = null;
+      try {
+        iterators = db.newIterators(columnFamilyHandleList);
+        assertThat(iterators.size()).isEqualTo(2);
+        RocksIterator iter = iterators.get(0);
+        iter.seekToFirst();
+        final Map<String, String> defRefMap = new HashMap<>();
+        defRefMap.put("dfkey1", "dfvalue");
+        defRefMap.put("key", "value");
+        while (iter.isValid()) {
+          assertThat(defRefMap.get(new String(iter.key()))).
+              isEqualTo(new String(iter.value()));
+          iter.next();
+        }
+        // iterate over new_cf key/value pairs
+        final Map<String, String> cfRefMap = new HashMap<>();
+        cfRefMap.put("newcfkey", "value");
+        cfRefMap.put("newcfkey2", "value2");
+        iter = iterators.get(1);
+        iter.seekToFirst();
+        while (iter.isValid()) {
+          assertThat(cfRefMap.get(new String(iter.key()))).
+              isEqualTo(new String(iter.value()));
+          iter.next();
+        }
+      } finally {
+        if (iterators != null) {
+          for (final RocksIterator rocksIterator : iterators) {
+            rocksIterator.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failPutDisposedCF() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(),
+             cfDescriptors, columnFamilyHandleList)) {
+      db.dropColumnFamily(columnFamilyHandleList.get(1));
+      db.put(columnFamilyHandleList.get(1), "key".getBytes(), "value".getBytes());
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failRemoveDisposedCF() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(),
+             cfDescriptors, columnFamilyHandleList)) {
+      db.dropColumnFamily(columnFamilyHandleList.get(1));
+      db.delete(columnFamilyHandleList.get(1), "key".getBytes());
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failGetDisposedCF() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+             columnFamilyHandleList)) {
+      db.dropColumnFamily(columnFamilyHandleList.get(1));
+      db.get(columnFamilyHandleList.get(1), "key".getBytes());
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failMultiGetWithoutCorrectNumberOfCF() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+             columnFamilyHandleList)) {
+      final List<byte[]> keys = new ArrayList<>();
+      keys.add("key".getBytes());
+      keys.add("newcfkey".getBytes());
+      final List<ColumnFamilyHandle> cfCustomList = new ArrayList<>();
+      db.multiGetAsList(cfCustomList, keys);
+    }
+  }
+
+  @Test
+  public void testByteCreateFolumnFamily() throws RocksDBException {
+
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())
+    ) {
+      final byte[] b0 = new byte[]{(byte) 0x00};
+      final byte[] b1 = new byte[]{(byte) 0x01};
+      final byte[] b2 = new byte[]{(byte) 0x02};
+      db.createColumnFamily(new ColumnFamilyDescriptor(b0));
+      db.createColumnFamily(new ColumnFamilyDescriptor(b1));
+      final List<byte[]> families =
+          RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath());
+      assertThat(families).contains("default".getBytes(), b0, b1);
+      db.createColumnFamily(new ColumnFamilyDescriptor(b2));
+    }
+  }
+
+  @Test
+  public void testCFNamesWithZeroBytes() throws RocksDBException {
+    ColumnFamilyHandle cf1 = null, cf2 = null;
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath());
+    ) {
+      final byte[] b0 = new byte[] {0, 0};
+      final byte[] b1 = new byte[] {0, 1};
+      cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0));
+      cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1));
+      final List<byte[]> families =
+          RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath());
+      assertThat(families).contains("default".getBytes(), b0, b1);
+    }
+  }
+
+  @Test
+  public void testCFNameSimplifiedChinese() throws RocksDBException {
+    ColumnFamilyHandle columnFamilyHandle = null;
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath());
+    ) {
+      final String simplifiedChinese = "\u7b80\u4f53\u5b57";
+      columnFamilyHandle =
+          db.createColumnFamily(new ColumnFamilyDescriptor(simplifiedChinese.getBytes()));
+
+      final List<byte[]> families =
+          RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath());
+      assertThat(families).contains("default".getBytes(), simplifiedChinese.getBytes());
+    }
+  }
+
+  @Test
+  public void testDestroyColumnFamilyHandle() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());) {
+      final byte[] name1 = "cf1".getBytes();
+      final byte[] name2 = "cf2".getBytes();
+      final ColumnFamilyDescriptor desc1 = new ColumnFamilyDescriptor(name1);
+      final ColumnFamilyDescriptor desc2 = new ColumnFamilyDescriptor(name2);
+      final ColumnFamilyHandle cf1 = db.createColumnFamily(desc1);
+      final ColumnFamilyHandle cf2 = db.createColumnFamily(desc2);
+      assertTrue(cf1.isOwningHandle());
+      assertTrue(cf2.isOwningHandle());
+      assertFalse(cf1.isDefaultColumnFamily());
+      db.destroyColumnFamilyHandle(cf1);
+      // At this point cf1 should not be used!
+      assertFalse(cf1.isOwningHandle());
+      assertTrue(cf2.isOwningHandle());
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java
new file mode 100644
index 000000000..18c187ddb
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java
@@ -0,0 +1,98 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+import org.rocksdb.CompactRangeOptions.BottommostLevelCompaction;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompactRangeOptionsTest {
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  @Test
+  public void exclusiveManualCompaction() {
+    CompactRangeOptions opt = new CompactRangeOptions();
+    boolean value = false;
+    opt.setExclusiveManualCompaction(value);
+    assertThat(opt.exclusiveManualCompaction()).isEqualTo(value);
+    value = true;
+    opt.setExclusiveManualCompaction(value);
+    assertThat(opt.exclusiveManualCompaction()).isEqualTo(value);
+  }
+
+  @Test
+  public void bottommostLevelCompaction() {
+    CompactRangeOptions opt = new CompactRangeOptions();
+    BottommostLevelCompaction value = BottommostLevelCompaction.kSkip;
+    opt.setBottommostLevelCompaction(value);
+    assertThat(opt.bottommostLevelCompaction()).isEqualTo(value);
+    value = BottommostLevelCompaction.kForce;
+    opt.setBottommostLevelCompaction(value);
+    assertThat(opt.bottommostLevelCompaction()).isEqualTo(value);
+    value = BottommostLevelCompaction.kIfHaveCompactionFilter;
+    opt.setBottommostLevelCompaction(value);
+    assertThat(opt.bottommostLevelCompaction()).isEqualTo(value);
+  }
+
+  @Test
+  public void changeLevel() {
+    CompactRangeOptions opt = new CompactRangeOptions();
+    boolean value = false;
+    opt.setChangeLevel(value);
+    assertThat(opt.changeLevel()).isEqualTo(value);
+    value = true;
+    opt.setChangeLevel(value);
+    assertThat(opt.changeLevel()).isEqualTo(value);
+  }
+
+  @Test
+  public void targetLevel() {
+    CompactRangeOptions opt = new CompactRangeOptions();
+    int value = 2;
+    opt.setTargetLevel(value);
+    assertThat(opt.targetLevel()).isEqualTo(value);
+    value = 3;
+    opt.setTargetLevel(value);
+    assertThat(opt.targetLevel()).isEqualTo(value);
+  }
+
+  @Test
+  public void targetPathId() {
+    CompactRangeOptions opt = new CompactRangeOptions();
+    int value = 2;
+    opt.setTargetPathId(value);
+    assertThat(opt.targetPathId()).isEqualTo(value);
+    value = 3;
+    opt.setTargetPathId(value);
+    assertThat(opt.targetPathId()).isEqualTo(value);
+  }
+
+  @Test
+  public void allowWriteStall() {
+    CompactRangeOptions opt = new CompactRangeOptions();
+    boolean value = false;
+    opt.setAllowWriteStall(value);
+    assertThat(opt.allowWriteStall()).isEqualTo(value);
+    value = true;
+    opt.setAllowWriteStall(value);
+    assertThat(opt.allowWriteStall()).isEqualTo(value);
+  }
+
+  @Test
+  public void maxSubcompactions() {
+    CompactRangeOptions opt = new CompactRangeOptions();
+    int value = 2;
+    opt.setMaxSubcompactions(value);
+    assertThat(opt.maxSubcompactions()).isEqualTo(value);
+    value = 3;
+    opt.setMaxSubcompactions(value);
+    assertThat(opt.maxSubcompactions()).isEqualTo(value);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java
new file mode 100644
index 000000000..35a14eb54
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java
@@ -0,0 +1,61 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompactionFilterFactoryTest {
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void columnFamilyOptions_setCompactionFilterFactory()
+      throws RocksDBException {
+    try(final DBOptions options = new DBOptions()
+            .setCreateIfMissing(true)
+            .setCreateMissingColumnFamilies(true);
+        final RemoveEmptyValueCompactionFilterFactory compactionFilterFactory
+            = new RemoveEmptyValueCompactionFilterFactory();
+        final ColumnFamilyOptions new_cf_opts
+            = new ColumnFamilyOptions()
+            .setCompactionFilterFactory(compactionFilterFactory)) {
+
+      final List<ColumnFamilyDescriptor> cfNames = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+          new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts));
+
+      final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+
+      try (final RocksDB rocksDb =
+               RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles)) {
+        final byte[] key1 = "key1".getBytes();
+        final byte[] key2 = "key2".getBytes();
+
+        final byte[] value1 = "value1".getBytes();
+        final byte[] value2 = new byte[0];
+
+        rocksDb.put(cfHandles.get(1), key1, value1);
+        rocksDb.put(cfHandles.get(1), key2, value2);
+
+        rocksDb.compactRange(cfHandles.get(1));
+
+        assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1);
+        final boolean exists = rocksDb.keyMayExist(cfHandles.get(1), key2, null);
+        assertThat(exists).isFalse();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompactionJobInfoTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionJobInfoTest.java
new file mode 100644
index 000000000..c71b0da16
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionJobInfoTest.java
@@ -0,0 +1,114 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompactionJobInfoTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void columnFamilyName() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.columnFamilyName())
+          .isEmpty();
+    }
+  }
+
+  @Test
+  public void status() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.status().getCode())
+          .isEqualTo(Status.Code.Ok);
+    }
+  }
+
+  @Test
+  public void threadId() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.threadId())
+          .isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void jobId() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.jobId())
+          .isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void baseInputLevel() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.baseInputLevel())
+          .isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void outputLevel() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.outputLevel())
+          .isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void inputFiles() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.inputFiles())
+          .isEmpty();
+    }
+  }
+
+  @Test
+  public void outputFiles() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.outputFiles())
+          .isEmpty();
+    }
+  }
+
+  @Test
+  public void tableProperties() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.tableProperties())
+          .isEmpty();
+    }
+  }
+
+  @Test
+  public void compactionReason() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.compactionReason())
+          .isEqualTo(CompactionReason.kUnknown);
+    }
+  }
+
+  @Test
+  public void compression() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.compression())
+          .isEqualTo(CompressionType.NO_COMPRESSION);
+    }
+  }
+
+  @Test
+  public void stats() {
+    try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) {
+      assertThat(compactionJobInfo.stats())
+          .isNotNull();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompactionJobStatsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionJobStatsTest.java
new file mode 100644
index 000000000..5c1eb2aab
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionJobStatsTest.java
@@ -0,0 +1,196 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompactionJobStatsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void reset() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      compactionJobStats.reset();
+      assertThat(compactionJobStats.elapsedMicros()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void add() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats();
+         final CompactionJobStats otherCompactionJobStats = new CompactionJobStats()) {
+      compactionJobStats.add(otherCompactionJobStats);
+    }
+  }
+
+  @Test
+  public void elapsedMicros() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.elapsedMicros()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void numInputRecords() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numInputRecords()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void numInputFiles() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numInputFiles()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void numInputFilesAtOutputLevel() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numInputFilesAtOutputLevel()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void numOutputRecords() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numOutputRecords()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void numOutputFiles() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numOutputFiles()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void isManualCompaction() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.isManualCompaction()).isFalse();
+    }
+  }
+
+  @Test
+  public void totalInputBytes() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.totalInputBytes()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void totalOutputBytes() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.totalOutputBytes()).isEqualTo(0);
+    }
+  }
+
+
+  @Test
+  public void numRecordsReplaced() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numRecordsReplaced()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void totalInputRawKeyBytes() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.totalInputRawKeyBytes()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void totalInputRawValueBytes() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.totalInputRawValueBytes()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void numInputDeletionRecords() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numInputDeletionRecords()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void numExpiredDeletionRecords() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numExpiredDeletionRecords()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void numCorruptKeys() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numCorruptKeys()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void fileWriteNanos() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.fileWriteNanos()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void fileRangeSyncNanos() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.fileRangeSyncNanos()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void fileFsyncNanos() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.fileFsyncNanos()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void filePrepareWriteNanos() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.filePrepareWriteNanos()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void smallestOutputKeyPrefix() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.smallestOutputKeyPrefix()).isEmpty();
+    }
+  }
+
+  @Test
+  public void largestOutputKeyPrefix() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.largestOutputKeyPrefix()).isEmpty();
+    }
+  }
+
+  @Test
+  public void numSingleDelFallthru() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numSingleDelFallthru()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void numSingleDelMismatch() {
+    try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) {
+      assertThat(compactionJobStats.numSingleDelMismatch()).isEqualTo(0);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java
new file mode 100644
index 000000000..841615e67
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java
@@ -0,0 +1,35 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompactionOptionsFIFOTest {
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  @Test
+  public void maxTableFilesSize() {
+    final long size = 500 * 1024 * 1026;
+    try (final CompactionOptionsFIFO opt = new CompactionOptionsFIFO()) {
+      opt.setMaxTableFilesSize(size);
+      assertThat(opt.maxTableFilesSize()).isEqualTo(size);
+    }
+  }
+
+  @Test
+  public void allowCompaction() {
+    final boolean allowCompaction = true;
+    try (final CompactionOptionsFIFO opt = new CompactionOptionsFIFO()) {
+      opt.setAllowCompaction(allowCompaction);
+      assertThat(opt.allowCompaction()).isEqualTo(allowCompaction);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsTest.java
new file mode 100644
index 000000000..9b7d79694
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsTest.java
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompactionOptionsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void compression() {
+    try (final CompactionOptions compactionOptions = new CompactionOptions()) {
+      assertThat(compactionOptions.compression())
+          .isEqualTo(CompressionType.SNAPPY_COMPRESSION);
+      compactionOptions.setCompression(CompressionType.NO_COMPRESSION);
+      assertThat(compactionOptions.compression())
+          .isEqualTo(CompressionType.NO_COMPRESSION);
+    }
+  }
+
+  @Test
+  public void outputFileSizeLimit() {
+    final long mb250 = 1024 * 1024 * 250;
+    try (final CompactionOptions compactionOptions = new CompactionOptions()) {
+      assertThat(compactionOptions.outputFileSizeLimit())
+          .isEqualTo(-1);
+      compactionOptions.setOutputFileSizeLimit(mb250);
+      assertThat(compactionOptions.outputFileSizeLimit())
+          .isEqualTo(mb250);
+    }
+  }
+
+  @Test
+  public void maxSubcompactions() {
+    try (final CompactionOptions compactionOptions = new CompactionOptions()) {
+      assertThat(compactionOptions.maxSubcompactions())
+          .isEqualTo(0);
+      compactionOptions.setMaxSubcompactions(9);
+      assertThat(compactionOptions.maxSubcompactions())
+          .isEqualTo(9);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsUniversalTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsUniversalTest.java
new file mode 100644
index 000000000..5e2d195b6
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsUniversalTest.java
@@ -0,0 +1,80 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompactionOptionsUniversalTest {
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  @Test
+  public void sizeRatio() {
+    final int sizeRatio = 4;
+    try(final CompactionOptionsUniversal opt = new CompactionOptionsUniversal()) {
+      opt.setSizeRatio(sizeRatio);
+      assertThat(opt.sizeRatio()).isEqualTo(sizeRatio);
+    }
+  }
+
+  @Test
+  public void minMergeWidth() {
+    final int minMergeWidth = 3;
+    try(final CompactionOptionsUniversal opt = new CompactionOptionsUniversal()) {
+      opt.setMinMergeWidth(minMergeWidth);
+      assertThat(opt.minMergeWidth()).isEqualTo(minMergeWidth);
+    }
+  }
+
+  @Test
+  public void maxMergeWidth() {
+    final int maxMergeWidth = Integer.MAX_VALUE - 1234;
+    try(final CompactionOptionsUniversal opt = new CompactionOptionsUniversal()) {
+      opt.setMaxMergeWidth(maxMergeWidth);
+      assertThat(opt.maxMergeWidth()).isEqualTo(maxMergeWidth);
+    }
+  }
+
+  @Test
+  public void maxSizeAmplificationPercent() {
+    final int maxSizeAmplificationPercent = 150;
+    try(final CompactionOptionsUniversal opt = new CompactionOptionsUniversal()) {
+      opt.setMaxSizeAmplificationPercent(maxSizeAmplificationPercent);
+      assertThat(opt.maxSizeAmplificationPercent()).isEqualTo(maxSizeAmplificationPercent);
+    }
+  }
+
+  @Test
+  public void compressionSizePercent() {
+    final int compressionSizePercent = 500;
+    try(final CompactionOptionsUniversal opt = new CompactionOptionsUniversal()) {
+      opt.setCompressionSizePercent(compressionSizePercent);
+      assertThat(opt.compressionSizePercent()).isEqualTo(compressionSizePercent);
+    }
+  }
+
+  @Test
+  public void stopStyle() {
+    final CompactionStopStyle stopStyle = CompactionStopStyle.CompactionStopStyleSimilarSize;
+    try(final CompactionOptionsUniversal opt = new CompactionOptionsUniversal()) {
+      opt.setStopStyle(stopStyle);
+      assertThat(opt.stopStyle()).isEqualTo(stopStyle);
+    }
+  }
+
+  @Test
+  public void allowTrivialMove() {
+    final boolean allowTrivialMove = true;
+    try(final CompactionOptionsUniversal opt = new CompactionOptionsUniversal()) {
+      opt.setAllowTrivialMove(allowTrivialMove);
+      assertThat(opt.allowTrivialMove()).isEqualTo(allowTrivialMove);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompactionPriorityTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionPriorityTest.java
new file mode 100644
index 000000000..b078e132f
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionPriorityTest.java
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompactionPriorityTest {
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failIfIllegalByteValueProvided() {
+    CompactionPriority.getCompactionPriority((byte) -1);
+  }
+
+  @Test
+  public void getCompactionPriority() {
+    assertThat(CompactionPriority.getCompactionPriority(
+        CompactionPriority.OldestLargestSeqFirst.getValue()))
+            .isEqualTo(CompactionPriority.OldestLargestSeqFirst);
+  }
+
+  @Test
+  public void valueOf() {
+    assertThat(CompactionPriority.valueOf("OldestSmallestSeqFirst")).
+        isEqualTo(CompactionPriority.OldestSmallestSeqFirst);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompactionStopStyleTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionStopStyleTest.java
new file mode 100644
index 000000000..4c8a20950
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompactionStopStyleTest.java
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompactionStopStyleTest {
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failIfIllegalByteValueProvided() {
+    CompactionStopStyle.getCompactionStopStyle((byte) -1);
+  }
+
+  @Test
+  public void getCompactionStopStyle() {
+    assertThat(CompactionStopStyle.getCompactionStopStyle(
+        CompactionStopStyle.CompactionStopStyleTotalSize.getValue()))
+            .isEqualTo(CompactionStopStyle.CompactionStopStyleTotalSize);
+  }
+
+  @Test
+  public void valueOf() {
+    assertThat(CompactionStopStyle.valueOf("CompactionStopStyleSimilarSize")).
+        isEqualTo(CompactionStopStyle.CompactionStopStyleSimilarSize);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java
new file mode 100644
index 000000000..3e90b9f10
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java
@@ -0,0 +1,58 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class ComparatorOptionsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void reusedSynchronisationType() {
+    try(final ComparatorOptions copt = new ComparatorOptions()) {
+
+      copt.setReusedSynchronisationType(ReusedSynchronisationType.MUTEX);
+      assertThat(copt.reusedSynchronisationType())
+          .isEqualTo(ReusedSynchronisationType.MUTEX);
+
+      copt.setReusedSynchronisationType(ReusedSynchronisationType.ADAPTIVE_MUTEX);
+      assertThat(copt.reusedSynchronisationType())
+          .isEqualTo(ReusedSynchronisationType.ADAPTIVE_MUTEX);
+
+      copt.setReusedSynchronisationType(ReusedSynchronisationType.THREAD_LOCAL);
+      assertThat(copt.reusedSynchronisationType())
+          .isEqualTo(ReusedSynchronisationType.THREAD_LOCAL);
+    }
+  }
+
+  @Test
+  public void useDirectBuffer() {
+    try(final ComparatorOptions copt = new ComparatorOptions()) {
+      copt.setUseDirectBuffer(true);
+      assertThat(copt.useDirectBuffer()).isTrue();
+
+      copt.setUseDirectBuffer(false);
+      assertThat(copt.useDirectBuffer()).isFalse();
+    }
+  }
+
+  @Test
+  public void maxReusedBufferSize() {
+    try(final ComparatorOptions copt = new ComparatorOptions()) {
+      copt.setMaxReusedBufferSize(12345);
+      assertThat(copt.maxReusedBufferSize()).isEqualTo(12345);
+
+      copt.setMaxReusedBufferSize(-1);
+      assertThat(copt.maxReusedBufferSize()).isEqualTo(-1);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java
new file mode 100644
index 000000000..116552c32
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java
@@ -0,0 +1,71 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompressionOptionsTest {
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  @Test
+  public void windowBits() {
+    final int windowBits = 7;
+    try(final CompressionOptions opt = new CompressionOptions()) {
+      opt.setWindowBits(windowBits);
+      assertThat(opt.windowBits()).isEqualTo(windowBits);
+    }
+  }
+
+  @Test
+  public void level() {
+    final int level = 6;
+    try(final CompressionOptions opt = new CompressionOptions()) {
+      opt.setLevel(level);
+      assertThat(opt.level()).isEqualTo(level);
+    }
+  }
+
+  @Test
+  public void strategy() {
+    final int strategy = 2;
+    try(final CompressionOptions opt = new CompressionOptions()) {
+      opt.setStrategy(strategy);
+      assertThat(opt.strategy()).isEqualTo(strategy);
+    }
+  }
+
+  @Test
+  public void maxDictBytes() {
+    final int maxDictBytes = 999;
+    try(final CompressionOptions opt = new CompressionOptions()) {
+      opt.setMaxDictBytes(maxDictBytes);
+      assertThat(opt.maxDictBytes()).isEqualTo(maxDictBytes);
+    }
+  }
+
+  @Test
+  public void zstdMaxTrainBytes() {
+    final int zstdMaxTrainBytes = 999;
+    try(final CompressionOptions opt = new CompressionOptions()) {
+      opt.setZStdMaxTrainBytes(zstdMaxTrainBytes);
+      assertThat(opt.zstdMaxTrainBytes()).isEqualTo(zstdMaxTrainBytes);
+    }
+  }
+
+  @Test
+  public void enabled() {
+    try(final CompressionOptions opt = new CompressionOptions()) {
+      assertThat(opt.enabled()).isFalse();
+      opt.setEnabled(true);
+      assertThat(opt.enabled()).isTrue();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompressionTypesTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompressionTypesTest.java
new file mode 100644
index 000000000..e26cc0aca
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompressionTypesTest.java
@@ -0,0 +1,20 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+
+public class CompressionTypesTest {
+  @Test
+  public void getCompressionType() {
+    for (final CompressionType compressionType : CompressionType.values()) {
+      String libraryName = compressionType.getLibraryName();
+      compressionType.equals(CompressionType.getCompressionType(
+          libraryName));
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java
new file mode 100644
index 000000000..165f4f24c
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java
@@ -0,0 +1,56 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.ClassRule;
+import org.junit.Test;
+
+public class ConcurrentTaskLimiterTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  private static final String NAME = "name";
+
+  private ConcurrentTaskLimiter concurrentTaskLimiter;
+
+  @Before
+  public void beforeTest() {
+    concurrentTaskLimiter = new ConcurrentTaskLimiterImpl(NAME, 3);
+  }
+
+  @Test
+  public void name() {
+    assertEquals(NAME, concurrentTaskLimiter.name());
+  }
+
+  @Test
+  public void outstandingTask() {
+    assertEquals(0, concurrentTaskLimiter.outstandingTask());
+  }
+
+  @Test
+  public void setMaxOutstandingTask() {
+    assertEquals(concurrentTaskLimiter, concurrentTaskLimiter.setMaxOutstandingTask(4));
+    assertEquals(0, concurrentTaskLimiter.outstandingTask());
+  }
+
+  @Test
+  public void resetMaxOutstandingTask() {
+    assertEquals(concurrentTaskLimiter, concurrentTaskLimiter.resetMaxOutstandingTask());
+    assertEquals(0, concurrentTaskLimiter.outstandingTask());
+  }
+
+  @After
+  public void afterTest() {
+    concurrentTaskLimiter.close();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java
new file mode 100644
index 000000000..d55ceebcf
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java
@@ -0,0 +1,904 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+import org.junit.ClassRule;
+import org.junit.Test;
+
+public class DBOptionsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void copyConstructor() {
+    DBOptions origOpts = new DBOptions();
+    origOpts.setCreateIfMissing(rand.nextBoolean());
+    origOpts.setAllow2pc(rand.nextBoolean());
+    origOpts.setMaxBackgroundJobs(rand.nextInt(10));
+    DBOptions copyOpts = new DBOptions(origOpts);
+    assertThat(origOpts.createIfMissing()).isEqualTo(copyOpts.createIfMissing());
+    assertThat(origOpts.allow2pc()).isEqualTo(copyOpts.allow2pc());
+  }
+
+  @Test
+  public void getDBOptionsFromProps() {
+    // setup sample properties
+    final Properties properties = new Properties();
+    properties.put("allow_mmap_reads", "true");
+    properties.put("bytes_per_sync", "13");
+    try(final DBOptions opt = DBOptions.getDBOptionsFromProps(properties)) {
+      assertThat(opt).isNotNull();
+      assertThat(String.valueOf(opt.allowMmapReads())).
+          isEqualTo(properties.get("allow_mmap_reads"));
+      assertThat(String.valueOf(opt.bytesPerSync())).
+          isEqualTo(properties.get("bytes_per_sync"));
+    }
+  }
+
+  @Test
+  public void failDBOptionsFromPropsWithIllegalValue() {
+    // setup sample properties
+    final Properties properties = new Properties();
+    properties.put("tomato", "1024");
+    properties.put("burger", "2");
+    try(final DBOptions opt = DBOptions.getDBOptionsFromProps(properties)) {
+      assertThat(opt).isNull();
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failDBOptionsFromPropsWithNullValue() {
+    try(final DBOptions opt = DBOptions.getDBOptionsFromProps(null)) {
+      //no-op
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failDBOptionsFromPropsWithEmptyProps() {
+    try(final DBOptions opt = DBOptions.getDBOptionsFromProps(
+        new Properties())) {
+      //no-op
+    }
+  }
+
+  @Test
+  public void linkageOfPrepMethods() {
+    try (final DBOptions opt = new DBOptions()) {
+      opt.optimizeForSmallDb();
+    }
+  }
+
+  @Test
+  public void env() {
+    try (final DBOptions opt = new DBOptions();
+         final Env env = Env.getDefault()) {
+      opt.setEnv(env);
+      assertThat(opt.getEnv()).isSameAs(env);
+    }
+  }
+
+  @Test
+  public void setIncreaseParallelism() {
+    try(final DBOptions opt = new DBOptions()) {
+      final int threads = Runtime.getRuntime().availableProcessors() * 2;
+      opt.setIncreaseParallelism(threads);
+    }
+  }
+
+  @Test
+  public void createIfMissing() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setCreateIfMissing(boolValue);
+      assertThat(opt.createIfMissing()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void createMissingColumnFamilies() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setCreateMissingColumnFamilies(boolValue);
+      assertThat(opt.createMissingColumnFamilies()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void errorIfExists() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setErrorIfExists(boolValue);
+      assertThat(opt.errorIfExists()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void paranoidChecks() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setParanoidChecks(boolValue);
+      assertThat(opt.paranoidChecks()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void maxTotalWalSize() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxTotalWalSize(longValue);
+      assertThat(opt.maxTotalWalSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void maxOpenFiles() {
+    try(final DBOptions opt = new DBOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxOpenFiles(intValue);
+      assertThat(opt.maxOpenFiles()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxFileOpeningThreads() {
+    try(final DBOptions opt = new DBOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxFileOpeningThreads(intValue);
+      assertThat(opt.maxFileOpeningThreads()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void useFsync() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setUseFsync(boolValue);
+      assertThat(opt.useFsync()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void dbPaths() {
+    final List<DbPath> dbPaths = new ArrayList<>();
+    dbPaths.add(new DbPath(Paths.get("/a"), 10));
+    dbPaths.add(new DbPath(Paths.get("/b"), 100));
+    dbPaths.add(new DbPath(Paths.get("/c"), 1000));
+
+    try(final DBOptions opt = new DBOptions()) {
+      assertThat(opt.dbPaths()).isEqualTo(Collections.emptyList());
+
+      opt.setDbPaths(dbPaths);
+
+      assertThat(opt.dbPaths()).isEqualTo(dbPaths);
+    }
+  }
+
+  @Test
+  public void dbLogDir() {
+    try(final DBOptions opt = new DBOptions()) {
+      final String str = "path/to/DbLogDir";
+      opt.setDbLogDir(str);
+      assertThat(opt.dbLogDir()).isEqualTo(str);
+    }
+  }
+
+  @Test
+  public void walDir() {
+    try(final DBOptions opt = new DBOptions()) {
+      final String str = "path/to/WalDir";
+      opt.setWalDir(str);
+      assertThat(opt.walDir()).isEqualTo(str);
+    }
+  }
+
+  @Test
+  public void deleteObsoleteFilesPeriodMicros() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setDeleteObsoleteFilesPeriodMicros(longValue);
+      assertThat(opt.deleteObsoleteFilesPeriodMicros()).isEqualTo(longValue);
+    }
+  }
+
+  @SuppressWarnings("deprecated")
+  @Test
+  public void maxBackgroundCompactions() {
+    try(final DBOptions opt = new DBOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxBackgroundCompactions(intValue);
+      assertThat(opt.maxBackgroundCompactions()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxSubcompactions() {
+    try (final DBOptions opt = new DBOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxSubcompactions(intValue);
+      assertThat(opt.maxSubcompactions()).
+          isEqualTo(intValue);
+    }
+  }
+
+  @SuppressWarnings("deprecated")
+  @Test
+  public void maxBackgroundFlushes() {
+    try(final DBOptions opt = new DBOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxBackgroundFlushes(intValue);
+      assertThat(opt.maxBackgroundFlushes()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxBackgroundJobs() {
+    try (final DBOptions opt = new DBOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxBackgroundJobs(intValue);
+      assertThat(opt.maxBackgroundJobs()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxLogFileSize() throws RocksDBException {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxLogFileSize(longValue);
+      assertThat(opt.maxLogFileSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void logFileTimeToRoll() throws RocksDBException {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setLogFileTimeToRoll(longValue);
+      assertThat(opt.logFileTimeToRoll()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void keepLogFileNum() throws RocksDBException {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setKeepLogFileNum(longValue);
+      assertThat(opt.keepLogFileNum()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void recycleLogFileNum() throws RocksDBException {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setRecycleLogFileNum(longValue);
+      assertThat(opt.recycleLogFileNum()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void maxManifestFileSize() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxManifestFileSize(longValue);
+      assertThat(opt.maxManifestFileSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void tableCacheNumshardbits() {
+    try(final DBOptions opt = new DBOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setTableCacheNumshardbits(intValue);
+      assertThat(opt.tableCacheNumshardbits()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void walSizeLimitMB() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setWalSizeLimitMB(longValue);
+      assertThat(opt.walSizeLimitMB()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void walTtlSeconds() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setWalTtlSeconds(longValue);
+      assertThat(opt.walTtlSeconds()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void manifestPreallocationSize() throws RocksDBException {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setManifestPreallocationSize(longValue);
+      assertThat(opt.manifestPreallocationSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void useDirectReads() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setUseDirectReads(boolValue);
+      assertThat(opt.useDirectReads()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void useDirectIoForFlushAndCompaction() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setUseDirectIoForFlushAndCompaction(boolValue);
+      assertThat(opt.useDirectIoForFlushAndCompaction()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void allowFAllocate() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllowFAllocate(boolValue);
+      assertThat(opt.allowFAllocate()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void allowMmapReads() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllowMmapReads(boolValue);
+      assertThat(opt.allowMmapReads()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void allowMmapWrites() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllowMmapWrites(boolValue);
+      assertThat(opt.allowMmapWrites()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void isFdCloseOnExec() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setIsFdCloseOnExec(boolValue);
+      assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void statsDumpPeriodSec() {
+    try(final DBOptions opt = new DBOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setStatsDumpPeriodSec(intValue);
+      assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void statsPersistPeriodSec() {
+    try (final DBOptions opt = new DBOptions()) {
+      final int intValue = rand.nextInt();
+      opt.setStatsPersistPeriodSec(intValue);
+      assertThat(opt.statsPersistPeriodSec()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void statsHistoryBufferSize() {
+    try (final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setStatsHistoryBufferSize(longValue);
+      assertThat(opt.statsHistoryBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void adviseRandomOnOpen() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAdviseRandomOnOpen(boolValue);
+      assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void dbWriteBufferSize() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setDbWriteBufferSize(longValue);
+      assertThat(opt.dbWriteBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void setWriteBufferManager() throws RocksDBException {
+    try (final DBOptions opt = new DBOptions();
+         final Cache cache = new LRUCache(1 * 1024 * 1024);
+         final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache)) {
+      opt.setWriteBufferManager(writeBufferManager);
+      assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager);
+    }
+  }
+
+  @Test
+  public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException {
+    try (final DBOptions opt = new DBOptions();
+         final Cache cache = new LRUCache(1 * 1024 * 1024);
+         final WriteBufferManager writeBufferManager = new WriteBufferManager(0l, cache)) {
+      opt.setWriteBufferManager(writeBufferManager);
+      assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager);
+    }
+  }
+
+  @Test
+  public void accessHintOnCompactionStart() {
+    try(final DBOptions opt = new DBOptions()) {
+      final AccessHint accessHint = AccessHint.SEQUENTIAL;
+      opt.setAccessHintOnCompactionStart(accessHint);
+      assertThat(opt.accessHintOnCompactionStart()).isEqualTo(accessHint);
+    }
+  }
+
+  @Test
+  public void compactionReadaheadSize() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setCompactionReadaheadSize(longValue);
+      assertThat(opt.compactionReadaheadSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void randomAccessMaxBufferSize() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setRandomAccessMaxBufferSize(longValue);
+      assertThat(opt.randomAccessMaxBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void writableFileMaxBufferSize() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setWritableFileMaxBufferSize(longValue);
+      assertThat(opt.writableFileMaxBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void useAdaptiveMutex() {
+    try(final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setUseAdaptiveMutex(boolValue);
+      assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void bytesPerSync() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setBytesPerSync(longValue);
+      assertThat(opt.bytesPerSync()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void walBytesPerSync() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setWalBytesPerSync(longValue);
+      assertThat(opt.walBytesPerSync()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void strictBytesPerSync() {
+    try (final DBOptions opt = new DBOptions()) {
+      assertThat(opt.strictBytesPerSync()).isFalse();
+      opt.setStrictBytesPerSync(true);
+      assertThat(opt.strictBytesPerSync()).isTrue();
+    }
+  }
+
+  @Test
+  public void enableThreadTracking() {
+    try (final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setEnableThreadTracking(boolValue);
+      assertThat(opt.enableThreadTracking()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void delayedWriteRate() {
+    try(final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setDelayedWriteRate(longValue);
+      assertThat(opt.delayedWriteRate()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void enablePipelinedWrite() {
+    try(final DBOptions opt = new DBOptions()) {
+      assertThat(opt.enablePipelinedWrite()).isFalse();
+      opt.setEnablePipelinedWrite(true);
+      assertThat(opt.enablePipelinedWrite()).isTrue();
+    }
+  }
+
+  @Test
+  public void unordredWrite() {
+    try(final DBOptions opt = new DBOptions()) {
+      assertThat(opt.unorderedWrite()).isFalse();
+      opt.setUnorderedWrite(true);
+      assertThat(opt.unorderedWrite()).isTrue();
+    }
+  }
+
+  @Test
+  public void allowConcurrentMemtableWrite() {
+    try (final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllowConcurrentMemtableWrite(boolValue);
+      assertThat(opt.allowConcurrentMemtableWrite()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void enableWriteThreadAdaptiveYield() {
+    try (final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setEnableWriteThreadAdaptiveYield(boolValue);
+      assertThat(opt.enableWriteThreadAdaptiveYield()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void writeThreadMaxYieldUsec() {
+    try (final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setWriteThreadMaxYieldUsec(longValue);
+      assertThat(opt.writeThreadMaxYieldUsec()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void writeThreadSlowYieldUsec() {
+    try (final DBOptions opt = new DBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setWriteThreadSlowYieldUsec(longValue);
+      assertThat(opt.writeThreadSlowYieldUsec()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void skipStatsUpdateOnDbOpen() {
+    try (final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setSkipStatsUpdateOnDbOpen(boolValue);
+      assertThat(opt.skipStatsUpdateOnDbOpen()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void walRecoveryMode() {
+    try (final DBOptions opt = new DBOptions()) {
+      for (final WALRecoveryMode walRecoveryMode : WALRecoveryMode.values()) {
+        opt.setWalRecoveryMode(walRecoveryMode);
+        assertThat(opt.walRecoveryMode()).isEqualTo(walRecoveryMode);
+      }
+    }
+  }
+
+  @Test
+  public void allow2pc() {
+    try (final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllow2pc(boolValue);
+      assertThat(opt.allow2pc()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void rowCache() {
+    try (final DBOptions opt = new DBOptions()) {
+      assertThat(opt.rowCache()).isNull();
+
+      try(final Cache lruCache = new LRUCache(1000)) {
+        opt.setRowCache(lruCache);
+        assertThat(opt.rowCache()).isEqualTo(lruCache);
+      }
+
+      try(final Cache clockCache = new ClockCache(1000)) {
+        opt.setRowCache(clockCache);
+        assertThat(opt.rowCache()).isEqualTo(clockCache);
+      }
+    }
+  }
+
+  @Test
+  public void walFilter() {
+    try (final DBOptions opt = new DBOptions()) {
+      assertThat(opt.walFilter()).isNull();
+
+      try (final AbstractWalFilter walFilter = new AbstractWalFilter() {
+        @Override
+        public void columnFamilyLogNumberMap(
+            final Map<Integer, Long> cfLognumber,
+            final Map<String, Integer> cfNameId) {
+          // no-op
+        }
+
+        @Override
+        public LogRecordFoundResult logRecordFound(final long logNumber,
+            final String logFileName, final WriteBatch batch,
+            final WriteBatch newBatch) {
+          return new LogRecordFoundResult(
+              WalProcessingOption.CONTINUE_PROCESSING, false);
+        }
+
+        @Override
+        public String name() {
+          return "test-wal-filter";
+        }
+      }) {
+        opt.setWalFilter(walFilter);
+        assertThat(opt.walFilter()).isEqualTo(walFilter);
+      }
+    }
+  }
+
+  @Test
+  public void failIfOptionsFileError() {
+    try (final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setFailIfOptionsFileError(boolValue);
+      assertThat(opt.failIfOptionsFileError()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void dumpMallocStats() {
+    try (final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setDumpMallocStats(boolValue);
+      assertThat(opt.dumpMallocStats()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void avoidFlushDuringRecovery() {
+    try (final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAvoidFlushDuringRecovery(boolValue);
+      assertThat(opt.avoidFlushDuringRecovery()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void avoidFlushDuringShutdown() {
+    try (final DBOptions opt = new DBOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAvoidFlushDuringShutdown(boolValue);
+      assertThat(opt.avoidFlushDuringShutdown()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void allowIngestBehind() {
+    try (final DBOptions opt = new DBOptions()) {
+      assertThat(opt.allowIngestBehind()).isFalse();
+      opt.setAllowIngestBehind(true);
+      assertThat(opt.allowIngestBehind()).isTrue();
+    }
+  }
+
+  @Test
+  public void twoWriteQueues() {
+    try (final DBOptions opt = new DBOptions()) {
+      assertThat(opt.twoWriteQueues()).isFalse();
+      opt.setTwoWriteQueues(true);
+      assertThat(opt.twoWriteQueues()).isTrue();
+    }
+  }
+
+  @Test
+  public void manualWalFlush() {
+    try (final DBOptions opt = new DBOptions()) {
+      assertThat(opt.manualWalFlush()).isFalse();
+      opt.setManualWalFlush(true);
+      assertThat(opt.manualWalFlush()).isTrue();
+    }
+  }
+
+  @Test
+  public void atomicFlush() {
+    try (final DBOptions opt = new DBOptions()) {
+      assertThat(opt.atomicFlush()).isFalse();
+      opt.setAtomicFlush(true);
+      assertThat(opt.atomicFlush()).isTrue();
+    }
+  }
+
+  @Test
+  public void rateLimiter() {
+    try(final DBOptions options = new DBOptions();
+        final DBOptions anotherOptions = new DBOptions();
+        final RateLimiter rateLimiter = new RateLimiter(1000, 100 * 1000, 1)) {
+      options.setRateLimiter(rateLimiter);
+      // Test with parameter initialization
+      anotherOptions.setRateLimiter(
+          new RateLimiter(1000));
+    }
+  }
+
+  @Test
+  public void sstFileManager() throws RocksDBException {
+    try (final DBOptions options = new DBOptions();
+         final SstFileManager sstFileManager =
+             new SstFileManager(Env.getDefault())) {
+      options.setSstFileManager(sstFileManager);
+    }
+  }
+
+  @Test
+  public void statistics() {
+    try(final DBOptions options = new DBOptions()) {
+      final Statistics statistics = options.statistics();
+      assertThat(statistics).isNull();
+    }
+
+    try(final Statistics statistics = new Statistics();
+        final DBOptions options = new DBOptions().setStatistics(statistics);
+        final Statistics stats = options.statistics()) {
+      assertThat(stats).isNotNull();
+    }
+  }
+
+  @Test
+  public void avoidUnnecessaryBlockingIO() {
+    try (final DBOptions options = new DBOptions()) {
+      assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(false);
+      assertThat(options.setAvoidUnnecessaryBlockingIO(true)).isEqualTo(options);
+      assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void persistStatsToDisk() {
+    try (final DBOptions options = new DBOptions()) {
+      assertThat(options.persistStatsToDisk()).isEqualTo(false);
+      assertThat(options.setPersistStatsToDisk(true)).isEqualTo(options);
+      assertThat(options.persistStatsToDisk()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void writeDbidToManifest() {
+    try (final DBOptions options = new DBOptions()) {
+      assertThat(options.writeDbidToManifest()).isEqualTo(false);
+      assertThat(options.setWriteDbidToManifest(true)).isEqualTo(options);
+      assertThat(options.writeDbidToManifest()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void logReadaheadSize() {
+    try (final DBOptions options = new DBOptions()) {
+      assertThat(options.logReadaheadSize()).isEqualTo(0);
+      final int size = 1024 * 1024 * 100;
+      assertThat(options.setLogReadaheadSize(size)).isEqualTo(options);
+      assertThat(options.logReadaheadSize()).isEqualTo(size);
+    }
+  }
+
+  @Test
+  public void bestEffortsRecovery() {
+    try (final DBOptions options = new DBOptions()) {
+      assertThat(options.bestEffortsRecovery()).isEqualTo(false);
+      assertThat(options.setBestEffortsRecovery(true)).isEqualTo(options);
+      assertThat(options.bestEffortsRecovery()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void maxBgerrorResumeCount() {
+    try (final DBOptions options = new DBOptions()) {
+      final int INT_MAX = 2147483647;
+      assertThat(options.maxBgerrorResumeCount()).isEqualTo(INT_MAX);
+      assertThat(options.setMaxBgErrorResumeCount(-1)).isEqualTo(options);
+      assertThat(options.maxBgerrorResumeCount()).isEqualTo(-1);
+    }
+  }
+
+  @Test
+  public void bgerrorResumeRetryInterval() {
+    try (final DBOptions options = new DBOptions()) {
+      assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(1000000);
+      final long newRetryInterval = 24 * 3600 * 1000000L;
+      assertThat(options.setBgerrorResumeRetryInterval(newRetryInterval)).isEqualTo(options);
+      assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(newRetryInterval);
+    }
+  }
+
+  @Test
+  public void maxWriteBatchGroupSizeBytes() {
+    try (final DBOptions options = new DBOptions()) {
+      assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(1024 * 1024);
+      final long size = 1024 * 1024 * 1024 * 10L;
+      assertThat(options.setMaxWriteBatchGroupSizeBytes(size)).isEqualTo(options);
+      assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(size);
+    }
+  }
+
+  @Test
+  public void skipCheckingSstFileSizesOnDbOpen() {
+    try (final DBOptions options = new DBOptions()) {
+      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false);
+      assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options);
+      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void eventListeners() {
+    final AtomicBoolean wasCalled1 = new AtomicBoolean();
+    final AtomicBoolean wasCalled2 = new AtomicBoolean();
+    try (final DBOptions options = new DBOptions();
+         final AbstractEventListener el1 =
+             new AbstractEventListener() {
+               @Override
+               public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) {
+                 wasCalled1.set(true);
+               }
+             };
+         final AbstractEventListener el2 =
+             new AbstractEventListener() {
+               @Override
+               public void onMemTableSealed(final MemTableInfo memTableInfo) {
+                 wasCalled2.set(true);
+               }
+             }) {
+      assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options);
+      List<AbstractEventListener> listeners = options.listeners();
+      assertEquals(el1, listeners.get(0));
+      assertEquals(el2, listeners.get(1));
+      options.setListeners(Collections.<AbstractEventListener>emptyList());
+      listeners.get(0).onTableFileDeleted(null);
+      assertTrue(wasCalled1.get());
+      listeners.get(1).onMemTableSealed(null);
+      assertTrue(wasCalled2.get());
+      List<AbstractEventListener> listeners2 = options.listeners();
+      assertNotNull(listeners2);
+      assertEquals(0, listeners2.size());
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/DefaultEnvTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/DefaultEnvTest.java
new file mode 100644
index 000000000..3fb563ecb
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/DefaultEnvTest.java
@@ -0,0 +1,113 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.Collection;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class DefaultEnvTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void backgroundThreads() {
+    try (final Env defaultEnv = RocksEnv.getDefault()) {
+      defaultEnv.setBackgroundThreads(5, Priority.BOTTOM);
+      assertThat(defaultEnv.getBackgroundThreads(Priority.BOTTOM)).isEqualTo(5);
+
+      defaultEnv.setBackgroundThreads(5);
+      assertThat(defaultEnv.getBackgroundThreads(Priority.LOW)).isEqualTo(5);
+
+      defaultEnv.setBackgroundThreads(5, Priority.LOW);
+      assertThat(defaultEnv.getBackgroundThreads(Priority.LOW)).isEqualTo(5);
+
+      defaultEnv.setBackgroundThreads(5, Priority.HIGH);
+      assertThat(defaultEnv.getBackgroundThreads(Priority.HIGH)).isEqualTo(5);
+    }
+  }
+
+  @Test
+  public void threadPoolQueueLen() {
+    try (final Env defaultEnv = RocksEnv.getDefault()) {
+      assertThat(defaultEnv.getThreadPoolQueueLen(Priority.BOTTOM)).isEqualTo(0);
+      assertThat(defaultEnv.getThreadPoolQueueLen(Priority.LOW)).isEqualTo(0);
+      assertThat(defaultEnv.getThreadPoolQueueLen(Priority.HIGH)).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void incBackgroundThreadsIfNeeded() {
+    try (final Env defaultEnv = RocksEnv.getDefault()) {
+      defaultEnv.incBackgroundThreadsIfNeeded(20, Priority.BOTTOM);
+      assertThat(defaultEnv.getBackgroundThreads(Priority.BOTTOM)).isGreaterThanOrEqualTo(20);
+
+      defaultEnv.incBackgroundThreadsIfNeeded(20, Priority.LOW);
+      assertThat(defaultEnv.getBackgroundThreads(Priority.LOW)).isGreaterThanOrEqualTo(20);
+
+      defaultEnv.incBackgroundThreadsIfNeeded(20, Priority.HIGH);
+      assertThat(defaultEnv.getBackgroundThreads(Priority.HIGH)).isGreaterThanOrEqualTo(20);
+    }
+  }
+
+  @Test
+  public void lowerThreadPoolIOPriority() {
+    try (final Env defaultEnv = RocksEnv.getDefault()) {
+      defaultEnv.lowerThreadPoolIOPriority(Priority.BOTTOM);
+
+      defaultEnv.lowerThreadPoolIOPriority(Priority.LOW);
+
+      defaultEnv.lowerThreadPoolIOPriority(Priority.HIGH);
+    }
+  }
+
+  @Test
+  public void lowerThreadPoolCPUPriority() {
+    try (final Env defaultEnv = RocksEnv.getDefault()) {
+      defaultEnv.lowerThreadPoolCPUPriority(Priority.BOTTOM);
+
+      defaultEnv.lowerThreadPoolCPUPriority(Priority.LOW);
+
+      defaultEnv.lowerThreadPoolCPUPriority(Priority.HIGH);
+    }
+  }
+
+  @Test
+  public void threadList() throws RocksDBException {
+    try (final Env defaultEnv = RocksEnv.getDefault()) {
+      final Collection<ThreadStatus> threadList = defaultEnv.getThreadList();
+      assertThat(threadList.size()).isGreaterThan(0);
+    }
+  }
+
+  @Test
+  public void threadList_integration() throws RocksDBException {
+    try (final Env env = RocksEnv.getDefault();
+        final Options opt = new Options()
+            .setCreateIfMissing(true)
+            .setCreateMissingColumnFamilies(true)
+            .setEnv(env)) {
+      // open database
+      try (final RocksDB db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath())) {
+
+        final List<ThreadStatus> threadList = env.getThreadList();
+        assertThat(threadList.size()).isGreaterThan(0);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java
new file mode 100644
index 000000000..67385345c
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java
@@ -0,0 +1,93 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class DirectSliceTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void directSlice() {
+    try(final DirectSlice directSlice = new DirectSlice("abc");
+        final DirectSlice otherSlice = new DirectSlice("abc")) {
+      assertThat(directSlice.toString()).isEqualTo("abc");
+      // clear first slice
+      directSlice.clear();
+      assertThat(directSlice.toString()).isEmpty();
+      // get first char in otherslice
+      assertThat(otherSlice.get(0)).isEqualTo("a".getBytes()[0]);
+      // remove prefix
+      otherSlice.removePrefix(1);
+      assertThat(otherSlice.toString()).isEqualTo("bc");
+    }
+  }
+
+  @Test
+  public void directSliceWithByteBuffer() {
+    final byte[] data = "Some text".getBytes();
+    final ByteBuffer buffer = ByteBuffer.allocateDirect(data.length + 1);
+    buffer.put(data);
+    buffer.put(data.length, (byte)0);
+
+    try(final DirectSlice directSlice = new DirectSlice(buffer)) {
+      assertThat(directSlice.toString()).isEqualTo("Some text");
+    }
+  }
+
+  @Test
+  public void directSliceWithByteBufferAndLength() {
+    final byte[] data = "Some text".getBytes();
+    final ByteBuffer buffer = ByteBuffer.allocateDirect(data.length);
+    buffer.put(data);
+    try(final DirectSlice directSlice = new DirectSlice(buffer, 4)) {
+      assertThat(directSlice.toString()).isEqualTo("Some");
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void directSliceInitWithoutDirectAllocation() {
+    final byte[] data = "Some text".getBytes();
+    final ByteBuffer buffer = ByteBuffer.wrap(data);
+    try(final DirectSlice directSlice = new DirectSlice(buffer)) {
+      //no-op
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void directSlicePrefixInitWithoutDirectAllocation() {
+    final byte[] data = "Some text".getBytes();
+    final ByteBuffer buffer = ByteBuffer.wrap(data);
+    try(final DirectSlice directSlice = new DirectSlice(buffer, 4)) {
+      //no-op
+    }
+  }
+
+  @Test
+  public void directSliceClear() {
+    try(final DirectSlice directSlice = new DirectSlice("abc")) {
+      assertThat(directSlice.toString()).isEqualTo("abc");
+      directSlice.clear();
+      assertThat(directSlice.toString()).isEmpty();
+      directSlice.clear();  // make sure we don't double-free
+    }
+  }
+
+  @Test
+  public void directSliceRemovePrefix() {
+    try(final DirectSlice directSlice = new DirectSlice("abc")) {
+      assertThat(directSlice.toString()).isEqualTo("abc");
+      directSlice.removePrefix(1);
+      assertThat(directSlice.toString()).isEqualTo("bc");
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/EnvOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/EnvOptionsTest.java
new file mode 100644
index 000000000..0f3d8e234
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/EnvOptionsTest.java
@@ -0,0 +1,145 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class EnvOptionsTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = new RocksNativeLibraryResource();
+
+  public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory();
+
+  @Test
+  public void dbOptionsConstructor() {
+    final long compactionReadaheadSize = 4 * 1024 * 1024;
+    try (final DBOptions dbOptions = new DBOptions()
+        .setCompactionReadaheadSize(compactionReadaheadSize)) {
+      try (final EnvOptions envOptions = new EnvOptions(dbOptions)) {
+        assertThat(envOptions.compactionReadaheadSize())
+            .isEqualTo(compactionReadaheadSize);
+      }
+    }
+  }
+
+  @Test
+  public void useMmapReads() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      envOptions.setUseMmapReads(boolValue);
+      assertThat(envOptions.useMmapReads()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void useMmapWrites() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      envOptions.setUseMmapWrites(boolValue);
+      assertThat(envOptions.useMmapWrites()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void useDirectReads() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      envOptions.setUseDirectReads(boolValue);
+      assertThat(envOptions.useDirectReads()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void useDirectWrites() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      envOptions.setUseDirectWrites(boolValue);
+      assertThat(envOptions.useDirectWrites()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void allowFallocate() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      envOptions.setAllowFallocate(boolValue);
+      assertThat(envOptions.allowFallocate()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void setFdCloexecs() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      envOptions.setSetFdCloexec(boolValue);
+      assertThat(envOptions.setFdCloexec()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void bytesPerSync() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final long longValue = rand.nextLong();
+      envOptions.setBytesPerSync(longValue);
+      assertThat(envOptions.bytesPerSync()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void fallocateWithKeepSize() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      envOptions.setFallocateWithKeepSize(boolValue);
+      assertThat(envOptions.fallocateWithKeepSize()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void compactionReadaheadSize() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final int intValue = rand.nextInt(2147483647);
+      envOptions.setCompactionReadaheadSize(intValue);
+      assertThat(envOptions.compactionReadaheadSize()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void randomAccessMaxBufferSize() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final int intValue = rand.nextInt(2147483647);
+      envOptions.setRandomAccessMaxBufferSize(intValue);
+      assertThat(envOptions.randomAccessMaxBufferSize()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void writableFileMaxBufferSize() {
+    try (final EnvOptions envOptions = new EnvOptions()) {
+      final int intValue = rand.nextInt(2147483647);
+      envOptions.setWritableFileMaxBufferSize(intValue);
+      assertThat(envOptions.writableFileMaxBufferSize()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void rateLimiter() {
+    try (final EnvOptions envOptions = new EnvOptions();
+      final RateLimiter rateLimiter1 = new RateLimiter(1000, 100 * 1000, 1)) {
+      envOptions.setRateLimiter(rateLimiter1);
+      assertThat(envOptions.rateLimiter()).isEqualTo(rateLimiter1);
+
+      try(final RateLimiter rateLimiter2 = new RateLimiter(1000)) {
+        envOptions.setRateLimiter(rateLimiter2);
+        assertThat(envOptions.rateLimiter()).isEqualTo(rateLimiter2);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java
new file mode 100644
index 000000000..aec0af617
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java
@@ -0,0 +1,725 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+import org.assertj.core.api.AbstractObjectAssert;
+import org.assertj.core.api.ObjectAssert;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.AbstractEventListener.EnabledEventCallback;
+import org.rocksdb.test.TestableEventListener;
+
+public class EventListenerTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory();
+
+  void flushDb(final AbstractEventListener el, final AtomicBoolean wasCbCalled)
+      throws RocksDBException {
+    try (final Options opt =
+             new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el));
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(db).isNotNull();
+      final byte[] value = new byte[24];
+      rand.nextBytes(value);
+      db.put("testKey".getBytes(), value);
+      db.flush(new FlushOptions());
+      assertThat(wasCbCalled.get()).isTrue();
+    }
+  }
+
+  @Test
+  public void onFlushCompleted() throws RocksDBException {
+    final AtomicBoolean wasCbCalled = new AtomicBoolean();
+    final AbstractEventListener onFlushCompletedListener = new AbstractEventListener() {
+      @Override
+      public void onFlushCompleted(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) {
+        assertThat(flushJobInfo.getColumnFamilyName()).isNotNull();
+        assertThat(flushJobInfo.getFlushReason()).isEqualTo(FlushReason.MANUAL_FLUSH);
+        wasCbCalled.set(true);
+      }
+    };
+    flushDb(onFlushCompletedListener, wasCbCalled);
+  }
+
+  @Test
+  public void onFlushBegin() throws RocksDBException {
+    final AtomicBoolean wasCbCalled = new AtomicBoolean();
+    final AbstractEventListener onFlushBeginListener = new AbstractEventListener() {
+      @Override
+      public void onFlushBegin(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) {
+        assertThat(flushJobInfo.getColumnFamilyName()).isNotNull();
+        assertThat(flushJobInfo.getFlushReason()).isEqualTo(FlushReason.MANUAL_FLUSH);
+        wasCbCalled.set(true);
+      }
+    };
+    flushDb(onFlushBeginListener, wasCbCalled);
+  }
+
+  void deleteTableFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled)
+      throws RocksDBException {
+    try (final Options opt =
+             new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el));
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(db).isNotNull();
+      final byte[] value = new byte[24];
+      rand.nextBytes(value);
+      db.put("testKey".getBytes(), value);
+      final RocksDB.LiveFiles liveFiles = db.getLiveFiles();
+      assertThat(liveFiles).isNotNull();
+      assertThat(liveFiles.files).isNotNull();
+      assertThat(liveFiles.files.isEmpty()).isFalse();
+      db.deleteFile(liveFiles.files.get(0));
+      assertThat(wasCbCalled.get()).isTrue();
+    }
+  }
+
+  @Test
+  public void onTableFileDeleted() throws RocksDBException {
+    final AtomicBoolean wasCbCalled = new AtomicBoolean();
+    final AbstractEventListener onTableFileDeletedListener = new AbstractEventListener() {
+      @Override
+      public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) {
+        assertThat(tableFileDeletionInfo.getDbName()).isNotNull();
+        wasCbCalled.set(true);
+      }
+    };
+    deleteTableFile(onTableFileDeletedListener, wasCbCalled);
+  }
+
+  void compactRange(final AbstractEventListener el, final AtomicBoolean wasCbCalled)
+      throws RocksDBException {
+    try (final Options opt =
+             new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el));
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(db).isNotNull();
+      final byte[] value = new byte[24];
+      rand.nextBytes(value);
+      db.put("testKey".getBytes(), value);
+      db.compactRange();
+      assertThat(wasCbCalled.get()).isTrue();
+    }
+  }
+
+  @Test
+  public void onCompactionBegin() throws RocksDBException {
+    final AtomicBoolean wasCbCalled = new AtomicBoolean();
+    final AbstractEventListener onCompactionBeginListener = new AbstractEventListener() {
+      @Override
+      public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) {
+        assertThat(compactionJobInfo.compactionReason())
+            .isEqualTo(CompactionReason.kManualCompaction);
+        wasCbCalled.set(true);
+      }
+    };
+    compactRange(onCompactionBeginListener, wasCbCalled);
+  }
+
+  @Test
+  public void onCompactionCompleted() throws RocksDBException {
+    final AtomicBoolean wasCbCalled = new AtomicBoolean();
+    final AbstractEventListener onCompactionCompletedListener = new AbstractEventListener() {
+      @Override
+      public void onCompactionCompleted(
+          final RocksDB db, final CompactionJobInfo compactionJobInfo) {
+        assertThat(compactionJobInfo.compactionReason())
+            .isEqualTo(CompactionReason.kManualCompaction);
+        wasCbCalled.set(true);
+      }
+    };
+    compactRange(onCompactionCompletedListener, wasCbCalled);
+  }
+
+  @Test
+  public void onTableFileCreated() throws RocksDBException {
+    final AtomicBoolean wasCbCalled = new AtomicBoolean();
+    final AbstractEventListener onTableFileCreatedListener = new AbstractEventListener() {
+      @Override
+      public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) {
+        assertThat(tableFileCreationInfo.getReason()).isEqualTo(TableFileCreationReason.FLUSH);
+        wasCbCalled.set(true);
+      }
+    };
+    flushDb(onTableFileCreatedListener, wasCbCalled);
+  }
+
+  @Test
+  public void onTableFileCreationStarted() throws RocksDBException {
+    final AtomicBoolean wasCbCalled = new AtomicBoolean();
+    final AbstractEventListener onTableFileCreationStartedListener = new AbstractEventListener() {
+      @Override
+      public void onTableFileCreationStarted(
+          final TableFileCreationBriefInfo tableFileCreationBriefInfo) {
+        assertThat(tableFileCreationBriefInfo.getReason()).isEqualTo(TableFileCreationReason.FLUSH);
+        wasCbCalled.set(true);
+      }
+    };
+    flushDb(onTableFileCreationStartedListener, wasCbCalled);
+  }
+
+  void deleteColumnFamilyHandle(final AbstractEventListener el, final AtomicBoolean wasCbCalled)
+      throws RocksDBException {
+    try (final Options opt =
+             new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el));
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(db).isNotNull();
+      final byte[] value = new byte[24];
+      rand.nextBytes(value);
+      db.put("testKey".getBytes(), value);
+      ColumnFamilyHandle columnFamilyHandle = db.getDefaultColumnFamily();
+      columnFamilyHandle.close();
+      assertThat(wasCbCalled.get()).isTrue();
+    }
+  }
+
+  @Test
+  public void onColumnFamilyHandleDeletionStarted() throws RocksDBException {
+    final AtomicBoolean wasCbCalled = new AtomicBoolean();
+    final AbstractEventListener onColumnFamilyHandleDeletionStartedListener =
+        new AbstractEventListener() {
+          @Override
+          public void onColumnFamilyHandleDeletionStarted(
+              final ColumnFamilyHandle columnFamilyHandle) {
+            assertThat(columnFamilyHandle).isNotNull();
+            wasCbCalled.set(true);
+          }
+        };
+    deleteColumnFamilyHandle(onColumnFamilyHandleDeletionStartedListener, wasCbCalled);
+  }
+
+  void ingestExternalFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled)
+      throws RocksDBException {
+    try (final Options opt =
+             new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el));
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(db).isNotNull();
+      final String uuid = UUID.randomUUID().toString();
+      final SstFileWriter sstFileWriter = new SstFileWriter(new EnvOptions(), opt);
+      final Path externalFilePath = Paths.get(db.getName(), uuid);
+      sstFileWriter.open(externalFilePath.toString());
+      sstFileWriter.put("testKey".getBytes(), uuid.getBytes());
+      sstFileWriter.finish();
+      db.ingestExternalFile(
+          Collections.singletonList(externalFilePath.toString()), new IngestExternalFileOptions());
+      assertThat(wasCbCalled.get()).isTrue();
+    }
+  }
+
+  @Test
+  public void onExternalFileIngested() throws RocksDBException {
+    final AtomicBoolean wasCbCalled = new AtomicBoolean();
+    final AbstractEventListener onExternalFileIngestedListener = new AbstractEventListener() {
+      @Override
+      public void onExternalFileIngested(
+          final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) {
+        assertThat(db).isNotNull();
+        wasCbCalled.set(true);
+      }
+    };
+    ingestExternalFile(onExternalFileIngestedListener, wasCbCalled);
+  }
+
+  @Test
+  public void testAllCallbacksInvocation() {
+    final long TEST_LONG_VAL = -1;
+    // Expected test data objects
+    final Map<String, String> userCollectedPropertiesTestData =
+        Collections.singletonMap("key", "value");
+    final Map<String, String> readablePropertiesTestData = Collections.singletonMap("key", "value");
+    final TableProperties tablePropertiesTestData = new TableProperties(TEST_LONG_VAL,
+        TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL,
+        TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL,
+        TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL,
+        TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, "columnFamilyName".getBytes(),
+        "filterPolicyName", "comparatorName", "mergeOperatorName", "prefixExtractorName",
+        "propertyCollectorsNames", "compressionName", userCollectedPropertiesTestData,
+        readablePropertiesTestData);
+    final FlushJobInfo flushJobInfoTestData = new FlushJobInfo(Integer.MAX_VALUE,
+        "testColumnFamily", "/file/path", TEST_LONG_VAL, Integer.MAX_VALUE, true, true,
+        TEST_LONG_VAL, TEST_LONG_VAL, tablePropertiesTestData, (byte) 0x0a);
+    final Status statusTestData = new Status(Status.Code.Incomplete, Status.SubCode.NoSpace, null);
+    final TableFileDeletionInfo tableFileDeletionInfoTestData =
+        new TableFileDeletionInfo("dbName", "/file/path", Integer.MAX_VALUE, statusTestData);
+    final TableFileCreationInfo tableFileCreationInfoTestData =
+        new TableFileCreationInfo(TEST_LONG_VAL, tablePropertiesTestData, statusTestData, "dbName",
+            "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03);
+    final TableFileCreationBriefInfo tableFileCreationBriefInfoTestData =
+        new TableFileCreationBriefInfo(
+            "dbName", "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03);
+    final MemTableInfo memTableInfoTestData = new MemTableInfo(
+        "columnFamilyName", TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL);
+    final FileOperationInfo fileOperationInfoTestData = new FileOperationInfo("/file/path",
+        TEST_LONG_VAL, TEST_LONG_VAL, 1_600_699_420_000_000_000L, 5_000_000_000L, statusTestData);
+    final WriteStallInfo writeStallInfoTestData =
+        new WriteStallInfo("columnFamilyName", (byte) 0x1, (byte) 0x2);
+    final ExternalFileIngestionInfo externalFileIngestionInfoTestData =
+        new ExternalFileIngestionInfo("columnFamilyName", "/external/file/path",
+            "/internal/file/path", TEST_LONG_VAL, tablePropertiesTestData);
+
+    final CapturingTestableEventListener listener = new CapturingTestableEventListener() {
+      @Override
+      public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) {
+        super.onFlushCompleted(db, flushJobInfo);
+        assertThat(flushJobInfo).isEqualTo(flushJobInfoTestData);
+      }
+
+      @Override
+      public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) {
+        super.onFlushBegin(db, flushJobInfo);
+        assertThat(flushJobInfo).isEqualTo(flushJobInfoTestData);
+      }
+
+      @Override
+      public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) {
+        super.onTableFileDeleted(tableFileDeletionInfo);
+        assertThat(tableFileDeletionInfo).isEqualTo(tableFileDeletionInfoTestData);
+      }
+
+      @Override
+      public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) {
+        super.onCompactionBegin(db, compactionJobInfo);
+        assertThat(new String(compactionJobInfo.columnFamilyName(), StandardCharsets.UTF_8))
+            .isEqualTo("compactionColumnFamily");
+        assertThat(compactionJobInfo.status()).isEqualTo(statusTestData);
+        assertThat(compactionJobInfo.threadId()).isEqualTo(TEST_LONG_VAL);
+        assertThat(compactionJobInfo.jobId()).isEqualTo(Integer.MAX_VALUE);
+        assertThat(compactionJobInfo.baseInputLevel()).isEqualTo(Integer.MAX_VALUE);
+        assertThat(compactionJobInfo.outputLevel()).isEqualTo(Integer.MAX_VALUE);
+        assertThat(compactionJobInfo.inputFiles())
+            .isEqualTo(Collections.singletonList("inputFile.sst"));
+        assertThat(compactionJobInfo.outputFiles())
+            .isEqualTo(Collections.singletonList("outputFile.sst"));
+        assertThat(compactionJobInfo.tableProperties())
+            .isEqualTo(Collections.singletonMap("tableProperties", tablePropertiesTestData));
+        assertThat(compactionJobInfo.compactionReason()).isEqualTo(CompactionReason.kFlush);
+        assertThat(compactionJobInfo.compression()).isEqualTo(CompressionType.SNAPPY_COMPRESSION);
+      }
+
+      @Override
+      public void onCompactionCompleted(
+          final RocksDB db, final CompactionJobInfo compactionJobInfo) {
+        super.onCompactionCompleted(db, compactionJobInfo);
+        assertThat(new String(compactionJobInfo.columnFamilyName()))
+            .isEqualTo("compactionColumnFamily");
+        assertThat(compactionJobInfo.status()).isEqualTo(statusTestData);
+        assertThat(compactionJobInfo.threadId()).isEqualTo(TEST_LONG_VAL);
+        assertThat(compactionJobInfo.jobId()).isEqualTo(Integer.MAX_VALUE);
+        assertThat(compactionJobInfo.baseInputLevel()).isEqualTo(Integer.MAX_VALUE);
+        assertThat(compactionJobInfo.outputLevel()).isEqualTo(Integer.MAX_VALUE);
+        assertThat(compactionJobInfo.inputFiles())
+            .isEqualTo(Collections.singletonList("inputFile.sst"));
+        assertThat(compactionJobInfo.outputFiles())
+            .isEqualTo(Collections.singletonList("outputFile.sst"));
+        assertThat(compactionJobInfo.tableProperties())
+            .isEqualTo(Collections.singletonMap("tableProperties", tablePropertiesTestData));
+        assertThat(compactionJobInfo.compactionReason()).isEqualTo(CompactionReason.kFlush);
+        assertThat(compactionJobInfo.compression()).isEqualTo(CompressionType.SNAPPY_COMPRESSION);
+      }
+
+      @Override
+      public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) {
+        super.onTableFileCreated(tableFileCreationInfo);
+        assertThat(tableFileCreationInfo).isEqualTo(tableFileCreationInfoTestData);
+      }
+
+      @Override
+      public void onTableFileCreationStarted(
+          final TableFileCreationBriefInfo tableFileCreationBriefInfo) {
+        super.onTableFileCreationStarted(tableFileCreationBriefInfo);
+        assertThat(tableFileCreationBriefInfo).isEqualTo(tableFileCreationBriefInfoTestData);
+      }
+
+      @Override
+      public void onMemTableSealed(final MemTableInfo memTableInfo) {
+        super.onMemTableSealed(memTableInfo);
+        assertThat(memTableInfo).isEqualTo(memTableInfoTestData);
+      }
+
+      @Override
+      public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) {
+        super.onColumnFamilyHandleDeletionStarted(columnFamilyHandle);
+      }
+
+      @Override
+      public void onExternalFileIngested(
+          final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) {
+        super.onExternalFileIngested(db, externalFileIngestionInfo);
+        assertThat(externalFileIngestionInfo).isEqualTo(externalFileIngestionInfoTestData);
+      }
+
+      @Override
+      public void onBackgroundError(
+          final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) {
+        super.onBackgroundError(backgroundErrorReason, backgroundError);
+      }
+
+      @Override
+      public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) {
+        super.onStallConditionsChanged(writeStallInfo);
+        assertThat(writeStallInfo).isEqualTo(writeStallInfoTestData);
+      }
+
+      @Override
+      public void onFileReadFinish(final FileOperationInfo fileOperationInfo) {
+        super.onFileReadFinish(fileOperationInfo);
+        assertThat(fileOperationInfo).isEqualTo(fileOperationInfoTestData);
+      }
+
+      @Override
+      public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) {
+        super.onFileWriteFinish(fileOperationInfo);
+        assertThat(fileOperationInfo).isEqualTo(fileOperationInfoTestData);
+      }
+
+      @Override
+      public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) {
+        super.onFileFlushFinish(fileOperationInfo);
+        assertThat(fileOperationInfo).isEqualTo(fileOperationInfoTestData);
+      }
+
+      @Override
+      public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) {
+        super.onFileSyncFinish(fileOperationInfo);
+        assertThat(fileOperationInfo).isEqualTo(fileOperationInfoTestData);
+      }
+
+      @Override
+      public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) {
+        super.onFileRangeSyncFinish(fileOperationInfo);
+        assertThat(fileOperationInfo).isEqualTo(fileOperationInfoTestData);
+      }
+
+      @Override
+      public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) {
+        super.onFileTruncateFinish(fileOperationInfo);
+        assertThat(fileOperationInfo).isEqualTo(fileOperationInfoTestData);
+      }
+
+      @Override
+      public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) {
+        super.onFileCloseFinish(fileOperationInfo);
+        assertThat(fileOperationInfo).isEqualTo(fileOperationInfoTestData);
+      }
+
+      @Override
+      public boolean shouldBeNotifiedOnFileIO() {
+        super.shouldBeNotifiedOnFileIO();
+        return false;
+      }
+
+      @Override
+      public boolean onErrorRecoveryBegin(
+          final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) {
+        super.onErrorRecoveryBegin(backgroundErrorReason, backgroundError);
+        assertThat(backgroundErrorReason).isEqualTo(BackgroundErrorReason.FLUSH);
+        assertThat(backgroundError).isEqualTo(statusTestData);
+        return true;
+      }
+
+      @Override
+      public void onErrorRecoveryCompleted(final Status oldBackgroundError) {
+        super.onErrorRecoveryCompleted(oldBackgroundError);
+        assertThat(oldBackgroundError).isEqualTo(statusTestData);
+      }
+    };
+
+    // test action
+    listener.invokeAllCallbacks();
+
+    // assert
+    assertAllEventsCalled(listener);
+
+    assertNoCallbackErrors(listener);
+  }
+
+  @Test
+  public void testEnabledCallbacks() {
+    final EnabledEventCallback[] enabledEvents = {
+        EnabledEventCallback.ON_MEMTABLE_SEALED, EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED};
+
+    final CapturingTestableEventListener listener =
+        new CapturingTestableEventListener(enabledEvents);
+
+    // test action
+    listener.invokeAllCallbacks();
+
+    // assert
+    assertEventsCalled(listener, enabledEvents);
+  }
+
+  private static void assertAllEventsCalled(
+      final CapturingTestableEventListener capturingTestableEventListener) {
+    assertEventsCalled(capturingTestableEventListener, EnumSet.allOf(EnabledEventCallback.class));
+  }
+
+  private static void assertEventsCalled(
+      final CapturingTestableEventListener capturingTestableEventListener,
+      final EnabledEventCallback[] expected) {
+    assertEventsCalled(capturingTestableEventListener, EnumSet.copyOf(Arrays.asList(expected)));
+  }
+
+  private static void assertNoCallbackErrors(
+      final CapturingTestableEventListener capturingTestableEventListener) {
+    for (AssertionError error : capturingTestableEventListener.capturedAssertionErrors) {
+      throw new Error("An assertion failed in callback", error);
+    }
+  }
+
+  private static void assertEventsCalled(
+      final CapturingTestableEventListener capturingTestableEventListener,
+      final EnumSet<EnabledEventCallback> expected) {
+    final ListenerEvents capturedEvents = capturingTestableEventListener.capturedListenerEvents;
+
+    assertThat(capturedEvents.flushCompleted)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_FLUSH_COMPLETED));
+    assertThat(capturedEvents.flushBegin)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_FLUSH_BEGIN));
+    assertThat(capturedEvents.tableFileDeleted)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_TABLE_FILE_DELETED));
+    assertThat(capturedEvents.compactionBegin)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_COMPACTION_BEGIN));
+    assertThat(capturedEvents.compactionCompleted)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_COMPACTION_COMPLETED));
+    assertThat(capturedEvents.tableFileCreated)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATED));
+    assertThat(capturedEvents.tableFileCreationStarted)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATION_STARTED));
+    assertThat(capturedEvents.memTableSealed)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_MEMTABLE_SEALED));
+    assertThat(capturedEvents.columnFamilyHandleDeletionStarted)
+        .isEqualTo(
+            expected.contains(EnabledEventCallback.ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED));
+    assertThat(capturedEvents.externalFileIngested)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_EXTERNAL_FILE_INGESTED));
+    assertThat(capturedEvents.backgroundError)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_BACKGROUND_ERROR));
+    assertThat(capturedEvents.stallConditionsChanged)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_STALL_CONDITIONS_CHANGED));
+    assertThat(capturedEvents.fileReadFinish)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_FILE_READ_FINISH));
+    assertThat(capturedEvents.fileWriteFinish)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_FILE_WRITE_FINISH));
+    assertThat(capturedEvents.fileFlushFinish)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_FILE_FLUSH_FINISH));
+    assertThat(capturedEvents.fileSyncFinish)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_FILE_SYNC_FINISH));
+    assertThat(capturedEvents.fileRangeSyncFinish)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_FILE_RANGE_SYNC_FINISH));
+    assertThat(capturedEvents.fileTruncateFinish)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_FILE_TRUNCATE_FINISH));
+    assertThat(capturedEvents.fileCloseFinish)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_FILE_CLOSE_FINISH));
+    assertThat(capturedEvents.shouldBeNotifiedOnFileIO)
+        .isEqualTo(expected.contains(EnabledEventCallback.SHOULD_BE_NOTIFIED_ON_FILE_IO));
+    assertThat(capturedEvents.errorRecoveryBegin)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_BEGIN));
+    assertThat(capturedEvents.errorRecoveryCompleted)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED));
+    assertThat(capturedEvents.errorRecoveryCompleted)
+        .isEqualTo(expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED));
+  }
+
+  /**
+   * Members are volatile as they may be written
+   * and read by different threads.
+   */
+  private static class ListenerEvents {
+    volatile boolean flushCompleted;
+    volatile boolean flushBegin;
+    volatile boolean tableFileDeleted;
+    volatile boolean compactionBegin;
+    volatile boolean compactionCompleted;
+    volatile boolean tableFileCreated;
+    volatile boolean tableFileCreationStarted;
+    volatile boolean memTableSealed;
+    volatile boolean columnFamilyHandleDeletionStarted;
+    volatile boolean externalFileIngested;
+    volatile boolean backgroundError;
+    volatile boolean stallConditionsChanged;
+    volatile boolean fileReadFinish;
+    volatile boolean fileWriteFinish;
+    volatile boolean fileFlushFinish;
+    volatile boolean fileSyncFinish;
+    volatile boolean fileRangeSyncFinish;
+    volatile boolean fileTruncateFinish;
+    volatile boolean fileCloseFinish;
+    volatile boolean shouldBeNotifiedOnFileIO;
+    volatile boolean errorRecoveryBegin;
+    volatile boolean errorRecoveryCompleted;
+  }
+
+  private static class CapturingObjectAssert<T> extends ObjectAssert<T> {
+    private final List<AssertionError> assertionErrors;
+    public CapturingObjectAssert(T t, List<AssertionError> assertionErrors) {
+      super(t);
+      this.assertionErrors = assertionErrors;
+    }
+
+    @Override
+    public ObjectAssert<T> isEqualTo(Object other) {
+      try {
+        return super.isEqualTo(other);
+      } catch (AssertionError error) {
+        assertionErrors.add(error);
+        throw error;
+      }
+    }
+
+    @Override
+    public ObjectAssert<T> isNotNull() {
+      try {
+        return super.isNotNull();
+      } catch (AssertionError error) {
+        assertionErrors.add(error);
+        throw error;
+      }
+    }
+  }
+
+  private static class CapturingTestableEventListener extends TestableEventListener {
+    final ListenerEvents capturedListenerEvents = new ListenerEvents();
+
+    final List<AssertionError> capturedAssertionErrors = new ArrayList<>();
+
+    protected <T> AbstractObjectAssert<?, T> assertThat(T actual) {
+      return new CapturingObjectAssert<T>(actual, capturedAssertionErrors);
+    }
+
+    public CapturingTestableEventListener() {}
+
+    public CapturingTestableEventListener(final EnabledEventCallback... enabledEventCallbacks) {
+      super(enabledEventCallbacks);
+    }
+
+    @Override
+    public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) {
+      capturedListenerEvents.flushCompleted = true;
+    }
+
+    @Override
+    public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) {
+      capturedListenerEvents.flushBegin = true;
+    }
+
+    @Override
+    public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) {
+      capturedListenerEvents.tableFileDeleted = true;
+    }
+
+    @Override
+    public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) {
+      capturedListenerEvents.compactionBegin = true;
+    }
+
+    @Override
+    public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) {
+      capturedListenerEvents.compactionCompleted = true;
+    }
+
+    @Override
+    public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) {
+      capturedListenerEvents.tableFileCreated = true;
+    }
+
+    @Override
+    public void onTableFileCreationStarted(
+        final TableFileCreationBriefInfo tableFileCreationBriefInfo) {
+      capturedListenerEvents.tableFileCreationStarted = true;
+    }
+
+    @Override
+    public void onMemTableSealed(final MemTableInfo memTableInfo) {
+      capturedListenerEvents.memTableSealed = true;
+    }
+
+    @Override
+    public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) {
+      capturedListenerEvents.columnFamilyHandleDeletionStarted = true;
+    }
+
+    @Override
+    public void onExternalFileIngested(
+        final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) {
+      capturedListenerEvents.externalFileIngested = true;
+    }
+
+    @Override
+    public void onBackgroundError(
+        final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) {
+      capturedListenerEvents.backgroundError = true;
+    }
+
+    @Override
+    public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) {
+      capturedListenerEvents.stallConditionsChanged = true;
+    }
+
+    @Override
+    public void onFileReadFinish(final FileOperationInfo fileOperationInfo) {
+      capturedListenerEvents.fileReadFinish = true;
+    }
+
+    @Override
+    public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) {
+      capturedListenerEvents.fileWriteFinish = true;
+    }
+
+    @Override
+    public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) {
+      capturedListenerEvents.fileFlushFinish = true;
+    }
+
+    @Override
+    public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) {
+      capturedListenerEvents.fileSyncFinish = true;
+    }
+
+    @Override
+    public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) {
+      capturedListenerEvents.fileRangeSyncFinish = true;
+    }
+
+    @Override
+    public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) {
+      capturedListenerEvents.fileTruncateFinish = true;
+    }
+
+    @Override
+    public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) {
+      capturedListenerEvents.fileCloseFinish = true;
+    }
+
+    @Override
+    public boolean shouldBeNotifiedOnFileIO() {
+      capturedListenerEvents.shouldBeNotifiedOnFileIO = true;
+      return false;
+    }
+
+    @Override
+    public boolean onErrorRecoveryBegin(
+        final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) {
+      capturedListenerEvents.errorRecoveryBegin = true;
+      return true;
+    }
+
+    @Override
+    public void onErrorRecoveryCompleted(final Status oldBackgroundError) {
+      capturedListenerEvents.errorRecoveryCompleted = true;
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java
new file mode 100644
index 000000000..dc5c19fbc
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java
@@ -0,0 +1,39 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+public class FilterTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void filter() {
+    // new Bloom filter
+    final BlockBasedTableConfig blockConfig = new BlockBasedTableConfig();
+    try(final Options options = new Options()) {
+
+      try(final Filter bloomFilter = new BloomFilter()) {
+        blockConfig.setFilterPolicy(bloomFilter);
+        options.setTableFormatConfig(blockConfig);
+      }
+
+      try(final Filter bloomFilter = new BloomFilter(10)) {
+        blockConfig.setFilterPolicy(bloomFilter);
+        options.setTableFormatConfig(blockConfig);
+      }
+
+      try(final Filter bloomFilter = new BloomFilter(10, false)) {
+        blockConfig.setFilterPolicy(bloomFilter);
+        options.setTableFormatConfig(blockConfig);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/FlushOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/FlushOptionsTest.java
new file mode 100644
index 000000000..f90ae911d
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/FlushOptionsTest.java
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class FlushOptionsTest {
+
+  @Test
+  public void waitForFlush() {
+    try (final FlushOptions flushOptions = new FlushOptions()) {
+      assertThat(flushOptions.waitForFlush()).isTrue();
+      flushOptions.setWaitForFlush(false);
+      assertThat(flushOptions.waitForFlush()).isFalse();
+    }
+  }
+
+  @Test
+  public void allowWriteStall() {
+    try (final FlushOptions flushOptions = new FlushOptions()) {
+      assertThat(flushOptions.allowWriteStall()).isFalse();
+      flushOptions.setAllowWriteStall(true);
+      assertThat(flushOptions.allowWriteStall()).isTrue();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java
new file mode 100644
index 000000000..1a354f4ce
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java
@@ -0,0 +1,49 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class FlushTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void flush() throws RocksDBException {
+    try(final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setMaxWriteBufferNumber(10)
+        .setMinWriteBufferNumberToMerge(10);
+        final WriteOptions wOpt = new WriteOptions()
+            .setDisableWAL(true);
+        final FlushOptions flushOptions = new FlushOptions()
+            .setWaitForFlush(true)) {
+      assertThat(flushOptions.waitForFlush()).isTrue();
+
+      try(final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath())) {
+        db.put(wOpt, "key1".getBytes(), "value1".getBytes());
+        db.put(wOpt, "key2".getBytes(), "value2".getBytes());
+        db.put(wOpt, "key3".getBytes(), "value3".getBytes());
+        db.put(wOpt, "key4".getBytes(), "value4".getBytes());
+        assertThat(db.getProperty("rocksdb.num-entries-active-mem-table"))
+            .isEqualTo("4");
+        db.flush(flushOptions);
+        assertThat(db.getProperty("rocksdb.num-entries-active-mem-table"))
+            .isEqualTo("0");
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java
new file mode 100644
index 000000000..12ee537d9
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java
@@ -0,0 +1,109 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.util.Environment;
+
+import java.io.IOException;
+
+import static java.nio.file.Files.readAllBytes;
+import static java.nio.file.Paths.get;
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class InfoLogLevelTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void testInfoLogLevel() throws RocksDBException,
+      IOException {
+    try (final RocksDB db =
+             RocksDB.open(dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key".getBytes(), "value".getBytes());
+      db.flush(new FlushOptions().setWaitForFlush(true));
+      assertThat(getLogContentsWithoutHeader()).isNotEmpty();
+    }
+  }
+
+  @Test
+  public void testFatalLogLevel() throws RocksDBException,
+      IOException {
+    try (final Options options = new Options().
+        setCreateIfMissing(true).
+        setInfoLogLevel(InfoLogLevel.FATAL_LEVEL);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(options.infoLogLevel()).
+          isEqualTo(InfoLogLevel.FATAL_LEVEL);
+      db.put("key".getBytes(), "value".getBytes());
+      // As InfoLogLevel is set to FATAL_LEVEL, here we expect the log
+      // content to be empty.
+      assertThat(getLogContentsWithoutHeader()).isEmpty();
+    }
+  }
+
+  @Test
+  public void testFatalLogLevelWithDBOptions()
+      throws RocksDBException, IOException {
+    try (final DBOptions dbOptions = new DBOptions().
+        setInfoLogLevel(InfoLogLevel.FATAL_LEVEL);
+         final Options options = new Options(dbOptions,
+             new ColumnFamilyOptions()).
+             setCreateIfMissing(true);
+         final RocksDB db =
+             RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(dbOptions.infoLogLevel()).
+          isEqualTo(InfoLogLevel.FATAL_LEVEL);
+      assertThat(options.infoLogLevel()).
+          isEqualTo(InfoLogLevel.FATAL_LEVEL);
+      db.put("key".getBytes(), "value".getBytes());
+      assertThat(getLogContentsWithoutHeader()).isEmpty();
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failIfIllegalByteValueProvided() {
+    InfoLogLevel.getInfoLogLevel((byte) -1);
+  }
+
+  @Test
+  public void valueOf() {
+    assertThat(InfoLogLevel.valueOf("DEBUG_LEVEL")).
+        isEqualTo(InfoLogLevel.DEBUG_LEVEL);
+  }
+
+  /**
+   * Read LOG file contents into String.
+   *
+   * @return LOG file contents as String.
+   * @throws IOException if file is not found.
+   */
+  private String getLogContentsWithoutHeader() throws IOException {
+    final String separator = Environment.isWindows() ?
+        "\n" : System.getProperty("line.separator");
+    final String[] lines = new String(readAllBytes(get(
+        dbFolder.getRoot().getAbsolutePath() + "/LOG"))).split(separator);
+
+    int first_non_header = lines.length;
+    // Identify the last line of the header
+    for (int i = lines.length - 1; i >= 0; --i) {
+      if (lines[i].indexOf("DB pointer") >= 0) {
+        first_non_header = i + 1;
+        break;
+      }
+    }
+    StringBuilder builder = new StringBuilder();
+    for (int i = first_non_header; i < lines.length; ++i) {
+      builder.append(lines[i]).append(separator);
+    }
+    return builder.toString();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java
new file mode 100644
index 000000000..ab7e21568
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java
@@ -0,0 +1,107 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class IngestExternalFileOptionsTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE
+      = new RocksNativeLibraryResource();
+
+  public static final Random rand =
+      PlatformRandomHelper.getPlatformSpecificRandomFactory();
+
+  @Test
+  public void createExternalSstFileInfoWithoutParameters() {
+    try (final IngestExternalFileOptions options =
+        new IngestExternalFileOptions()) {
+      assertThat(options).isNotNull();
+    }
+  }
+
+  @Test
+  public void createExternalSstFileInfoWithParameters() {
+    final boolean moveFiles = rand.nextBoolean();
+    final boolean snapshotConsistency = rand.nextBoolean();
+    final boolean allowGlobalSeqNo = rand.nextBoolean();
+    final boolean allowBlockingFlush = rand.nextBoolean();
+    try (final IngestExternalFileOptions options =
+        new IngestExternalFileOptions(moveFiles, snapshotConsistency,
+        allowGlobalSeqNo, allowBlockingFlush)) {
+      assertThat(options).isNotNull();
+      assertThat(options.moveFiles()).isEqualTo(moveFiles);
+      assertThat(options.snapshotConsistency()).isEqualTo(snapshotConsistency);
+      assertThat(options.allowGlobalSeqNo()).isEqualTo(allowGlobalSeqNo);
+      assertThat(options.allowBlockingFlush()).isEqualTo(allowBlockingFlush);
+    }
+  }
+
+  @Test
+  public void moveFiles() {
+    try (final IngestExternalFileOptions options =
+        new IngestExternalFileOptions()) {
+      final boolean moveFiles = rand.nextBoolean();
+      options.setMoveFiles(moveFiles);
+      assertThat(options.moveFiles()).isEqualTo(moveFiles);
+    }
+  }
+
+  @Test
+  public void snapshotConsistency() {
+    try (final IngestExternalFileOptions options =
+        new IngestExternalFileOptions()) {
+      final boolean snapshotConsistency = rand.nextBoolean();
+      options.setSnapshotConsistency(snapshotConsistency);
+      assertThat(options.snapshotConsistency()).isEqualTo(snapshotConsistency);
+    }
+  }
+
+  @Test
+  public void allowGlobalSeqNo() {
+    try (final IngestExternalFileOptions options =
+        new IngestExternalFileOptions()) {
+      final boolean allowGlobalSeqNo = rand.nextBoolean();
+      options.setAllowGlobalSeqNo(allowGlobalSeqNo);
+      assertThat(options.allowGlobalSeqNo()).isEqualTo(allowGlobalSeqNo);
+    }
+  }
+
+  @Test
+  public void allowBlockingFlush() {
+    try (final IngestExternalFileOptions options =
+        new IngestExternalFileOptions()) {
+      final boolean allowBlockingFlush = rand.nextBoolean();
+      options.setAllowBlockingFlush(allowBlockingFlush);
+      assertThat(options.allowBlockingFlush()).isEqualTo(allowBlockingFlush);
+    }
+  }
+
+  @Test
+  public void ingestBehind() {
+    try (final IngestExternalFileOptions options =
+             new IngestExternalFileOptions()) {
+      assertThat(options.ingestBehind()).isFalse();
+      options.setIngestBehind(true);
+      assertThat(options.ingestBehind()).isTrue();
+    }
+  }
+
+  @Test
+  public void writeGlobalSeqno() {
+    try (final IngestExternalFileOptions options =
+             new IngestExternalFileOptions()) {
+      assertThat(options.writeGlobalSeqno()).isTrue();
+      options.setWriteGlobalSeqno(false);
+      assertThat(options.writeGlobalSeqno()).isFalse();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java
new file mode 100644
index 000000000..3f3bec6ba
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java
@@ -0,0 +1,528 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.BufferUnderflowException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.*;
+import org.junit.rules.ExpectedException;
+import org.junit.rules.TemporaryFolder;
+
+public class KeyMayExistTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Rule public ExpectedException exceptionRule = ExpectedException.none();
+
+  List<ColumnFamilyDescriptor> cfDescriptors;
+  List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+  RocksDB db;
+
+  // Slice key
+  int offset;
+  int len;
+
+  byte[] sliceKey;
+  byte[] sliceValue;
+
+  @Before
+  public void before() throws RocksDBException {
+    cfDescriptors = Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final DBOptions options =
+        new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true);
+
+    db = RocksDB.open(
+        options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList);
+
+    // Build the slice key
+    final StringBuilder builder = new StringBuilder("prefix");
+    offset = builder.toString().length();
+    builder.append("slice key 0");
+    len = builder.toString().length() - offset;
+    builder.append("suffix");
+    sliceKey = builder.toString().getBytes(UTF_8);
+    sliceValue = "slice value 0".getBytes(UTF_8);
+  }
+
+  @After
+  public void after() {
+    for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+      columnFamilyHandle.close();
+    }
+    db.close();
+  }
+
+  @Test
+  public void keyMayExist() throws RocksDBException {
+    assertThat(columnFamilyHandleList.size()).isEqualTo(2);
+
+    // Standard key
+    db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8));
+
+    // Test without column family
+    final Holder<byte[]> holder = new Holder<>();
+    boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder);
+    assertThat(exists).isTrue();
+    assertThat(holder.getValue()).isNotNull();
+    assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value");
+
+    exists = db.keyMayExist("key".getBytes(UTF_8), null);
+    assertThat(exists).isTrue();
+  }
+
+  @Test
+  public void keyMayExistReadOptions() throws RocksDBException {
+    // Test without column family but with readOptions
+    try (final ReadOptions readOptions = new ReadOptions()) {
+      // Standard key
+      db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8));
+
+      // Slice key
+      db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length);
+
+      final Holder<byte[]> holder = new Holder<>();
+      boolean exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), holder);
+      assertThat(exists).isTrue();
+      assertThat(holder.getValue()).isNotNull();
+      assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value");
+
+      exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), null);
+      assertThat(exists).isTrue();
+
+      exists = db.keyMayExist(readOptions, sliceKey, offset, len, holder);
+      assertThat(exists).isTrue();
+      assertThat(holder.getValue()).isNotNull();
+      assertThat(holder.getValue()).isEqualTo(sliceValue);
+
+      exists = db.keyMayExist(readOptions, sliceKey, offset, len, null);
+      assertThat(exists).isTrue();
+    }
+  }
+
+  @Test
+  public void keyMayExistColumnFamily() throws RocksDBException {
+    // Standard key
+    db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8));
+
+    // Slice key
+    db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length);
+
+    // Test slice key with column family
+    final Holder<byte[]> holder = new Holder<>();
+    boolean exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, holder);
+    assertThat(exists).isTrue();
+    assertThat(holder.getValue()).isNotNull();
+    assertThat(holder.getValue()).isEqualTo(sliceValue);
+
+    exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, null);
+    assertThat(exists).isTrue();
+  }
+
+  @Test
+  public void keyMayExistColumnFamilyReadOptions() throws RocksDBException {
+    // Standard key
+    db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8));
+
+    // Slice key
+    db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length);
+
+    // Test slice key with column family and read options
+    final Holder<byte[]> holder = new Holder<>();
+    try (final ReadOptions readOptions = new ReadOptions()) {
+      boolean exists =
+          db.keyMayExist(columnFamilyHandleList.get(0), readOptions, "key".getBytes(UTF_8), holder);
+      assertThat(exists).isTrue();
+      assertThat(holder.getValue()).isNotNull();
+      assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value");
+
+      exists =
+          db.keyMayExist(columnFamilyHandleList.get(0), readOptions, "key".getBytes(UTF_8), null);
+      assertThat(exists).isTrue();
+
+      // Test slice key with column family and read options
+      exists =
+          db.keyMayExist(columnFamilyHandleList.get(0), readOptions, sliceKey, offset, len, holder);
+      assertThat(exists).isTrue();
+      assertThat(holder.getValue()).isNotNull();
+      assertThat(holder.getValue()).isEqualTo(sliceValue);
+
+      exists =
+          db.keyMayExist(columnFamilyHandleList.get(0), readOptions, sliceKey, offset, len, null);
+      assertThat(exists).isTrue();
+    }
+  }
+
+  @Test
+  public void keyMayExistSliceKey() throws RocksDBException {
+    assertThat(columnFamilyHandleList.size()).isEqualTo(2);
+
+    // Standard key
+    db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8));
+
+    // Slice key
+    db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length);
+
+    final Holder<byte[]> holder = new Holder<>();
+    boolean exists = db.keyMayExist(sliceKey, offset, len, holder);
+    assertThat(exists).isTrue();
+    assertThat(holder.getValue()).isNotNull();
+    assertThat(holder.getValue()).isEqualTo(sliceValue);
+
+    exists = db.keyMayExist(sliceKey, offset, len, null);
+    assertThat(exists).isTrue();
+
+    exists = db.keyMayExist("slice key".getBytes(UTF_8), null);
+    assertThat(exists).isFalse();
+
+    exists = db.keyMayExist("slice key 0".getBytes(UTF_8), null);
+    assertThat(exists).isTrue();
+
+    // Test with column family
+    exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), holder);
+    assertThat(exists).isTrue();
+    assertThat(holder.getValue()).isNotNull();
+    assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value");
+
+    exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), null);
+    assertThat(exists).isTrue();
+
+    // KeyMayExist in CF1 must return null value
+    exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), holder);
+    assertThat(exists).isFalse();
+    assertThat(holder.getValue()).isNull();
+    exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), null);
+    assertThat(exists).isFalse();
+
+    // slice key
+    exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, holder);
+    assertThat(exists).isFalse();
+    assertThat(holder.getValue()).isNull();
+    exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, null);
+    assertThat(exists).isFalse();
+  }
+
+  @Test
+  public void keyMayExistCF1() throws RocksDBException {
+    // Standard key
+    db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8));
+
+    // Slice key
+    db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length);
+
+    // KeyMayExist in CF1 must return null value
+    final Holder<byte[]> holder = new Holder<>();
+    boolean exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), holder);
+    assertThat(exists).isFalse();
+    assertThat(holder.getValue()).isNull();
+    exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), null);
+    assertThat(exists).isFalse();
+  }
+
+  @Test
+  public void keyMayExistCF1Slice() throws RocksDBException {
+    // Standard key
+    db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8));
+
+    // Slice key
+    db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length);
+
+    // slice key
+    final Holder<byte[]> holder = new Holder<>();
+    boolean exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, holder);
+    assertThat(exists).isFalse();
+    assertThat(holder.getValue()).isNull();
+    exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, null);
+    assertThat(exists).isFalse();
+  }
+
+  @Test
+  public void keyMayExistBB() throws RocksDBException {
+    // Standard key
+    db.put("keyBB".getBytes(UTF_8), "valueBB".getBytes(UTF_8));
+
+    final byte[] key = "keyBB".getBytes(UTF_8);
+    final byte[] value = "valueBB".getBytes(UTF_8);
+
+    final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length);
+    keyBuffer.put(key, 0, key.length);
+    keyBuffer.flip();
+
+    assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true);
+
+    final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24);
+    valueBuffer.position(12);
+    KeyMayExist keyMayExist = db.keyMayExist(keyBuffer, valueBuffer);
+    assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue);
+    assertThat(keyMayExist.valueLength).isEqualTo(value.length);
+    assertThat(valueBuffer.position()).isEqualTo(12);
+    assertThat(valueBuffer.limit()).isEqualTo(12 + value.length);
+    byte[] valueGet = new byte[value.length];
+    valueBuffer.get(valueGet);
+    assertThat(valueGet).isEqualTo(value);
+
+    valueBuffer.limit(value.length + 24);
+    valueBuffer.position(25);
+    keyMayExist = db.keyMayExist(keyBuffer, valueBuffer);
+    assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue);
+    assertThat(keyMayExist.valueLength).isEqualTo(value.length);
+    assertThat(valueBuffer.position()).isEqualTo(25);
+    assertThat(valueBuffer.limit()).isEqualTo(24 + value.length);
+    valueGet = new byte[value.length - 1];
+    valueBuffer.get(valueGet);
+    assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1));
+
+    exceptionRule.expect(BufferUnderflowException.class);
+    valueGet = new byte[value.length];
+    valueBuffer.get(valueGet);
+  }
+
+  @Test
+  public void keyMayExistBBReadOptions() throws RocksDBException {
+    // Standard key
+    db.put("keyBB".getBytes(UTF_8), "valueBB".getBytes(UTF_8));
+
+    final byte[] key = "keyBB".getBytes(UTF_8);
+    final byte[] value = "valueBB".getBytes(UTF_8);
+
+    final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length);
+    keyBuffer.put(key, 0, key.length);
+    keyBuffer.flip();
+
+    try (final ReadOptions readOptions = new ReadOptions()) {
+      assertThat(db.keyMayExist(readOptions, keyBuffer)).isEqualTo(true);
+
+      final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24);
+      valueBuffer.position(12);
+      KeyMayExist keyMayExist = db.keyMayExist(readOptions, keyBuffer, valueBuffer);
+      assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue);
+      assertThat(keyMayExist.valueLength).isEqualTo(value.length);
+      assertThat(valueBuffer.position()).isEqualTo(12);
+      assertThat(valueBuffer.limit()).isEqualTo(12 + value.length);
+      byte[] valueGet = new byte[value.length];
+      valueBuffer.get(valueGet);
+      assertThat(valueGet).isEqualTo(value);
+
+      valueBuffer.limit(value.length + 24);
+      valueBuffer.position(25);
+      keyMayExist = db.keyMayExist(readOptions, keyBuffer, valueBuffer);
+      assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue);
+      assertThat(keyMayExist.valueLength).isEqualTo(value.length);
+      assertThat(valueBuffer.position()).isEqualTo(25);
+      assertThat(valueBuffer.limit()).isEqualTo(24 + value.length);
+      valueGet = new byte[value.length - 1];
+      valueBuffer.get(valueGet);
+      assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1));
+
+      exceptionRule.expect(BufferUnderflowException.class);
+      valueGet = new byte[value.length];
+      valueBuffer.get(valueGet);
+    }
+  }
+
+  @Test
+  public void keyMayExistBBNullValue() throws RocksDBException {
+    // Standard key
+    db.put("keyBB".getBytes(UTF_8), "valueBB".getBytes(UTF_8));
+
+    final byte[] key = "keyBB".getBytes(UTF_8);
+
+    final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length);
+    keyBuffer.put(key, 0, key.length);
+    keyBuffer.flip();
+
+    exceptionRule.expect(AssertionError.class);
+    exceptionRule.expectMessage(
+        "value ByteBuffer parameter cannot be null. If you do not need the value, use a different version of the method");
+    final KeyMayExist keyMayExist = db.keyMayExist(keyBuffer, null);
+  }
+
+  @Test
+  public void keyMayExistBBCF() throws RocksDBException {
+    // Standard key
+    db.put(columnFamilyHandleList.get(0), "keyBBCF0".getBytes(UTF_8), "valueBBCF0".getBytes(UTF_8));
+    db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8));
+
+    // 0 is the default CF
+    byte[] key = "keyBBCF0".getBytes(UTF_8);
+    ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length);
+    keyBuffer.put(key, 0, key.length);
+    keyBuffer.flip();
+
+    assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true);
+    assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(false);
+    assertThat(db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer)).isEqualTo(true);
+
+    // 1 is just a CF
+    key = "keyBBCF1".getBytes(UTF_8);
+    keyBuffer = ByteBuffer.allocateDirect(key.length);
+    keyBuffer.put(key, 0, key.length);
+    keyBuffer.flip();
+
+    assertThat(db.keyMayExist(keyBuffer)).isEqualTo(false);
+    assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(true);
+    assertThat(db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer)).isEqualTo(false);
+
+    exceptionRule.expect(AssertionError.class);
+    exceptionRule.expectMessage(
+        "value ByteBuffer parameter cannot be null. If you do not need the value, use a different version of the method");
+    final KeyMayExist keyMayExist = db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer, null);
+  }
+
+  @Test
+  public void keyMayExistBBCFReadOptions() throws RocksDBException {
+    // Standard key
+    db.put(columnFamilyHandleList.get(0), "keyBBCF0".getBytes(UTF_8), "valueBBCF0".getBytes(UTF_8));
+    db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8));
+
+    // 0 is the default CF
+    byte[] key = "keyBBCF0".getBytes(UTF_8);
+    ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length);
+    keyBuffer.put(key, 0, key.length);
+    keyBuffer.flip();
+
+    try (final ReadOptions readOptions = new ReadOptions()) {
+      assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true);
+      assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer))
+          .isEqualTo(false);
+      assertThat(db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer))
+          .isEqualTo(true);
+
+      // 1 is just a CF
+      key = "keyBBCF1".getBytes(UTF_8);
+      keyBuffer = ByteBuffer.allocateDirect(key.length);
+      keyBuffer.put(key, 0, key.length);
+      keyBuffer.flip();
+
+      assertThat(db.keyMayExist(readOptions, keyBuffer)).isEqualTo(false);
+      assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer))
+          .isEqualTo(true);
+      assertThat(db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer))
+          .isEqualTo(false);
+
+      exceptionRule.expect(AssertionError.class);
+      exceptionRule.expectMessage(
+          "value ByteBuffer parameter cannot be null. If you do not need the value, use a different version of the method");
+      final KeyMayExist keyMayExist =
+          db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer, null);
+    }
+  }
+
+  @Test
+  public void keyMayExistBBCFOffset() throws RocksDBException {
+    db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8));
+
+    final byte[] key = "keyBBCF1".getBytes(UTF_8);
+    final byte[] value = "valueBBCF1".getBytes(UTF_8);
+
+    final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length);
+    keyBuffer.put(key, 0, key.length);
+    keyBuffer.flip();
+
+    assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(true);
+
+    final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24);
+    valueBuffer.position(12);
+    KeyMayExist keyMayExist = db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer, valueBuffer);
+    assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue);
+    assertThat(keyMayExist.valueLength).isEqualTo(value.length);
+    assertThat(valueBuffer.position()).isEqualTo(12);
+    assertThat(valueBuffer.limit()).isEqualTo(12 + value.length);
+    byte[] valueGet = new byte[value.length];
+    valueBuffer.get(valueGet);
+    assertThat(valueGet).isEqualTo(value);
+
+    valueBuffer.limit(value.length + 24);
+    valueBuffer.position(25);
+    keyMayExist = db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer, valueBuffer);
+    assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue);
+    assertThat(keyMayExist.valueLength).isEqualTo(value.length);
+    assertThat(valueBuffer.position()).isEqualTo(25);
+    assertThat(valueBuffer.limit()).isEqualTo(24 + value.length);
+    valueGet = new byte[value.length - 1];
+    valueBuffer.get(valueGet);
+    assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1));
+
+    exceptionRule.expect(BufferUnderflowException.class);
+    valueGet = new byte[value.length];
+    valueBuffer.get(valueGet);
+  }
+
+  @Test
+  public void keyMayExistBBCFOffsetReadOptions() throws RocksDBException {
+    db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8));
+
+    final byte[] key = "keyBBCF1".getBytes(UTF_8);
+    final byte[] value = "valueBBCF1".getBytes(UTF_8);
+
+    final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length);
+    keyBuffer.put(key, 0, key.length);
+    keyBuffer.flip();
+
+    try (final ReadOptions readOptions = new ReadOptions()) {
+      assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer))
+          .isEqualTo(true);
+
+      final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24);
+      valueBuffer.position(12);
+      KeyMayExist keyMayExist =
+          db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer, valueBuffer);
+      assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue);
+      assertThat(keyMayExist.valueLength).isEqualTo(value.length);
+      assertThat(valueBuffer.position()).isEqualTo(12);
+      assertThat(valueBuffer.limit()).isEqualTo(12 + value.length);
+      byte[] valueGet = new byte[value.length];
+      valueBuffer.get(valueGet);
+      assertThat(valueGet).isEqualTo(value);
+
+      valueBuffer.limit(value.length + 24);
+      valueBuffer.position(25);
+      keyMayExist =
+          db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer, valueBuffer);
+      assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue);
+      assertThat(keyMayExist.valueLength).isEqualTo(value.length);
+      assertThat(valueBuffer.position()).isEqualTo(25);
+      assertThat(valueBuffer.limit()).isEqualTo(24 + value.length);
+      valueGet = new byte[value.length - 1];
+      valueBuffer.get(valueGet);
+      assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1));
+
+      exceptionRule.expect(BufferUnderflowException.class);
+      valueGet = new byte[value.length];
+      valueBuffer.get(valueGet);
+    }
+  }
+
+  @Test
+  public void keyMayExistNonUnicodeString() throws RocksDBException {
+    final byte[] key = "key".getBytes(UTF_8);
+    final byte[] value = {(byte) 0x80}; // invalid unicode code-point
+    db.put(key, value);
+
+    final byte[] buf = new byte[10];
+    final int read = db.get(key, buf);
+    assertThat(read).isEqualTo(1);
+    assertThat(buf).startsWith(value);
+
+    final Holder<byte[]> holder = new Holder<>();
+    boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder);
+    assertThat(exists).isTrue();
+    assertThat(holder.getValue()).isNotNull();
+    assertThat(holder.getValue()).isEqualTo(value);
+
+    exists = db.keyMayExist("key".getBytes(UTF_8), null);
+    assertThat(exists).isTrue();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java
new file mode 100644
index 000000000..4d194e712
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+public class LRUCacheTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void newLRUCache() {
+    final long capacity = 80000000;
+    final int numShardBits = 16;
+    final boolean strictCapacityLimit = true;
+    final double highPriPoolRatio = 0.5;
+    final double lowPriPoolRatio = 0.5;
+    try (final Cache lruCache = new LRUCache(
+             capacity, numShardBits, strictCapacityLimit, highPriPoolRatio, lowPriPoolRatio)) {
+      //no op
+      assertThat(lruCache.getUsage()).isGreaterThanOrEqualTo(0);
+      assertThat(lruCache.getPinnedUsage()).isGreaterThanOrEqualTo(0);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java
new file mode 100644
index 000000000..5bc299f11
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java
@@ -0,0 +1,239 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class LoggerTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void customLogger() throws RocksDBException {
+    final AtomicInteger logMessageCounter = new AtomicInteger();
+    try (final Options options = new Options().
+        setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL).
+        setCreateIfMissing(true);
+         final Logger logger = new Logger(options) {
+           // Create new logger with max log level passed by options
+           @Override
+           protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+             assertThat(logMsg).isNotNull();
+             assertThat(logMsg.length()).isGreaterThan(0);
+             logMessageCounter.incrementAndGet();
+           }
+         }
+    ) {
+      // Set custom logger to options
+      options.setLogger(logger);
+
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath())) {
+        // there should be more than zero received log messages in
+        // debug level.
+        assertThat(logMessageCounter.get()).isGreaterThan(0);
+      }
+    }
+  }
+
+  @Test
+  public void warnLogger() throws RocksDBException {
+    final AtomicInteger logMessageCounter = new AtomicInteger();
+    try (final Options options = new Options().
+        setInfoLogLevel(InfoLogLevel.WARN_LEVEL).
+        setCreateIfMissing(true);
+
+         final Logger logger = new Logger(options) {
+           // Create new logger with max log level passed by options
+           @Override
+           protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+             assertThat(logMsg).isNotNull();
+             assertThat(logMsg.length()).isGreaterThan(0);
+             logMessageCounter.incrementAndGet();
+           }
+         }
+    ) {
+
+      // Set custom logger to options
+      options.setLogger(logger);
+
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath())) {
+        // there should be zero messages
+        // using warn level as log level.
+        assertThat(logMessageCounter.get()).isEqualTo(0);
+      }
+    }
+  }
+
+
+  @Test
+  public void fatalLogger() throws RocksDBException {
+    final AtomicInteger logMessageCounter = new AtomicInteger();
+    try (final Options options = new Options().
+        setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).
+        setCreateIfMissing(true);
+
+         final Logger logger = new Logger(options) {
+           // Create new logger with max log level passed by options
+           @Override
+           protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+             assertThat(logMsg).isNotNull();
+             assertThat(logMsg.length()).isGreaterThan(0);
+             logMessageCounter.incrementAndGet();
+           }
+         }
+    ) {
+
+      // Set custom logger to options
+      options.setLogger(logger);
+
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath())) {
+        // there should be zero messages
+        // using fatal level as log level.
+        assertThat(logMessageCounter.get()).isEqualTo(0);
+      }
+    }
+  }
+
+  @Test
+  public void dbOptionsLogger() throws RocksDBException {
+    final AtomicInteger logMessageCounter = new AtomicInteger();
+    try (final DBOptions options = new DBOptions().
+        setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).
+        setCreateIfMissing(true);
+         final Logger logger = new Logger(options) {
+           // Create new logger with max log level passed by options
+           @Override
+           protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+             assertThat(logMsg).isNotNull();
+             assertThat(logMsg.length()).isGreaterThan(0);
+             logMessageCounter.incrementAndGet();
+           }
+         }
+    ) {
+      // Set custom logger to options
+      options.setLogger(logger);
+
+      final List<ColumnFamilyDescriptor> cfDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath(),
+          cfDescriptors, cfHandles)) {
+        try {
+          // there should be zero messages
+          // using fatal level as log level.
+          assertThat(logMessageCounter.get()).isEqualTo(0);
+        } finally {
+          for (final ColumnFamilyHandle columnFamilyHandle : cfHandles) {
+            columnFamilyHandle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void setWarnLogLevel() {
+    final AtomicInteger logMessageCounter = new AtomicInteger();
+    try (final Options options = new Options().
+        setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).
+        setCreateIfMissing(true);
+         final Logger logger = new Logger(options) {
+           // Create new logger with max log level passed by options
+           @Override
+           protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+             assertThat(logMsg).isNotNull();
+             assertThat(logMsg.length()).isGreaterThan(0);
+             logMessageCounter.incrementAndGet();
+           }
+         }
+    ) {
+      assertThat(logger.infoLogLevel()).
+          isEqualTo(InfoLogLevel.FATAL_LEVEL);
+      logger.setInfoLogLevel(InfoLogLevel.WARN_LEVEL);
+      assertThat(logger.infoLogLevel()).
+          isEqualTo(InfoLogLevel.WARN_LEVEL);
+    }
+  }
+
+  @Test
+  public void setInfoLogLevel() {
+    final AtomicInteger logMessageCounter = new AtomicInteger();
+    try (final Options options = new Options().
+        setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).
+        setCreateIfMissing(true);
+         final Logger logger = new Logger(options) {
+           // Create new logger with max log level passed by options
+           @Override
+           protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+             assertThat(logMsg).isNotNull();
+             assertThat(logMsg.length()).isGreaterThan(0);
+             logMessageCounter.incrementAndGet();
+           }
+         }
+    ) {
+      assertThat(logger.infoLogLevel()).
+          isEqualTo(InfoLogLevel.FATAL_LEVEL);
+      logger.setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL);
+      assertThat(logger.infoLogLevel()).
+          isEqualTo(InfoLogLevel.DEBUG_LEVEL);
+    }
+  }
+
+  @Test
+  public void changeLogLevelAtRuntime() throws RocksDBException {
+    final AtomicInteger logMessageCounter = new AtomicInteger();
+    try (final Options options = new Options().
+        setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).
+        setCreateIfMissing(true);
+
+         // Create new logger with max log level passed by options
+         final Logger logger = new Logger(options) {
+           @Override
+           protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+             assertThat(logMsg).isNotNull();
+             assertThat(logMsg.length()).isGreaterThan(0);
+             logMessageCounter.incrementAndGet();
+           }
+         }
+    ) {
+      // Set custom logger to options
+      options.setLogger(logger);
+
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath())) {
+
+        // there should be zero messages
+        // using fatal level as log level.
+        assertThat(logMessageCounter.get()).isEqualTo(0);
+
+        // change log level to debug level
+        logger.setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL);
+
+        db.put("key".getBytes(), "value".getBytes());
+        db.flush(new FlushOptions().setWaitForFlush(true));
+
+        // messages shall be received due to previous actions.
+        assertThat(logMessageCounter.get()).isNotEqualTo(0);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java
new file mode 100644
index 000000000..73ac589a9
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java
@@ -0,0 +1,111 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class MemTableTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void hashSkipListMemTable() throws RocksDBException {
+    try(final Options options = new Options()) {
+      // Test HashSkipListMemTableConfig
+      HashSkipListMemTableConfig memTableConfig =
+          new HashSkipListMemTableConfig();
+      assertThat(memTableConfig.bucketCount()).
+          isEqualTo(1000000);
+      memTableConfig.setBucketCount(2000000);
+      assertThat(memTableConfig.bucketCount()).
+          isEqualTo(2000000);
+      assertThat(memTableConfig.height()).
+          isEqualTo(4);
+      memTableConfig.setHeight(5);
+      assertThat(memTableConfig.height()).
+          isEqualTo(5);
+      assertThat(memTableConfig.branchingFactor()).
+          isEqualTo(4);
+      memTableConfig.setBranchingFactor(6);
+      assertThat(memTableConfig.branchingFactor()).
+          isEqualTo(6);
+      options.setMemTableConfig(memTableConfig);
+    }
+  }
+
+  @Test
+  public void skipListMemTable() throws RocksDBException {
+    try(final Options options = new Options()) {
+      SkipListMemTableConfig skipMemTableConfig =
+          new SkipListMemTableConfig();
+      assertThat(skipMemTableConfig.lookahead()).
+          isEqualTo(0);
+      skipMemTableConfig.setLookahead(20);
+      assertThat(skipMemTableConfig.lookahead()).
+          isEqualTo(20);
+      options.setMemTableConfig(skipMemTableConfig);
+    }
+  }
+
+  @Test
+  public void hashLinkedListMemTable() throws RocksDBException {
+    try(final Options options = new Options()) {
+      HashLinkedListMemTableConfig hashLinkedListMemTableConfig =
+          new HashLinkedListMemTableConfig();
+      assertThat(hashLinkedListMemTableConfig.bucketCount()).
+          isEqualTo(50000);
+      hashLinkedListMemTableConfig.setBucketCount(100000);
+      assertThat(hashLinkedListMemTableConfig.bucketCount()).
+          isEqualTo(100000);
+      assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()).
+          isEqualTo(0);
+      hashLinkedListMemTableConfig.setHugePageTlbSize(1);
+      assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()).
+          isEqualTo(1);
+      assertThat(hashLinkedListMemTableConfig.
+          bucketEntriesLoggingThreshold()).
+          isEqualTo(4096);
+      hashLinkedListMemTableConfig.
+          setBucketEntriesLoggingThreshold(200);
+      assertThat(hashLinkedListMemTableConfig.
+          bucketEntriesLoggingThreshold()).
+          isEqualTo(200);
+      assertThat(hashLinkedListMemTableConfig.
+          ifLogBucketDistWhenFlush()).isTrue();
+      hashLinkedListMemTableConfig.
+          setIfLogBucketDistWhenFlush(false);
+      assertThat(hashLinkedListMemTableConfig.
+          ifLogBucketDistWhenFlush()).isFalse();
+      assertThat(hashLinkedListMemTableConfig.
+          thresholdUseSkiplist()).
+          isEqualTo(256);
+      hashLinkedListMemTableConfig.setThresholdUseSkiplist(29);
+      assertThat(hashLinkedListMemTableConfig.
+          thresholdUseSkiplist()).
+          isEqualTo(29);
+      options.setMemTableConfig(hashLinkedListMemTableConfig);
+    }
+  }
+
+  @Test
+  public void vectorMemTable() throws RocksDBException {
+    try(final Options options = new Options()) {
+      VectorMemTableConfig vectorMemTableConfig =
+          new VectorMemTableConfig();
+      assertThat(vectorMemTableConfig.reservedSize()).
+          isEqualTo(0);
+      vectorMemTableConfig.setReservedSize(123);
+      assertThat(vectorMemTableConfig.reservedSize()).
+          isEqualTo(123);
+      options.setMemTableConfig(vectorMemTableConfig);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java
new file mode 100644
index 000000000..1bea02379
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class MemoryUtilTest {
+
+  private static final String MEMTABLE_SIZE = "rocksdb.size-all-mem-tables";
+  private static final String UNFLUSHED_MEMTABLE_SIZE = "rocksdb.cur-size-all-mem-tables";
+  private static final String TABLE_READERS = "rocksdb.estimate-table-readers-mem";
+
+  private final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8);
+  private final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8);
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder1 = new TemporaryFolder();
+  @Rule public TemporaryFolder dbFolder2 = new TemporaryFolder();
+
+  /**
+   * Test MemoryUtil.getApproximateMemoryUsageByType before and after a put + get
+   */
+  @Test
+  public void getApproximateMemoryUsageByType() throws RocksDBException {
+    try (final Cache cache = new LRUCache(8 * 1024 * 1024);
+         final Options options =
+                 new Options()
+                         .setCreateIfMissing(true)
+                         .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache));
+         final FlushOptions flushOptions =
+                 new FlushOptions().setWaitForFlush(true);
+         final RocksDB db =
+                 RocksDB.open(options, dbFolder1.getRoot().getAbsolutePath())) {
+
+      List<RocksDB> dbs = new ArrayList<>(1);
+      dbs.add(db);
+      Set<Cache> caches = new HashSet<>(1);
+      caches.add(cache);
+      Map<MemoryUsageType, Long> usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches);
+
+      assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo(
+              db.getAggregatedLongProperty(MEMTABLE_SIZE));
+      assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo(
+              db.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE));
+      assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo(
+              db.getAggregatedLongProperty(TABLE_READERS));
+      // TODO(peterd): disable block cache entry stats and check for 0
+      assertThat(usage.get(MemoryUsageType.kCacheTotal)).isLessThan(1024);
+
+      db.put(key, value);
+      db.flush(flushOptions);
+      db.get(key);
+
+      usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches);
+      assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isGreaterThan(0);
+      assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo(
+              db.getAggregatedLongProperty(MEMTABLE_SIZE));
+      assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isGreaterThan(0);
+      assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo(
+              db.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE));
+      assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isGreaterThan(0);
+      assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo(
+              db.getAggregatedLongProperty(TABLE_READERS));
+      assertThat(usage.get(MemoryUsageType.kCacheTotal)).isGreaterThan(0);
+
+    }
+  }
+
+  /**
+   * Test MemoryUtil.getApproximateMemoryUsageByType with null inputs
+   */
+  @Test
+  public void getApproximateMemoryUsageByTypeNulls() throws RocksDBException {
+    Map<MemoryUsageType, Long> usage = MemoryUtil.getApproximateMemoryUsageByType(null, null);
+
+    assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo(null);
+    assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo(null);
+    assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo(null);
+    assertThat(usage.get(MemoryUsageType.kCacheTotal)).isEqualTo(null);
+  }
+
+  /**
+   * Test MemoryUtil.getApproximateMemoryUsageByType with two DBs and two caches
+   */
+  @Test
+  public void getApproximateMemoryUsageByTypeMultiple() throws RocksDBException {
+    try (final Cache cache1 = new LRUCache(1 * 1024 * 1024);
+         final Options options1 =
+                 new Options()
+                         .setCreateIfMissing(true)
+                         .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache1));
+         final RocksDB db1 =
+                 RocksDB.open(options1, dbFolder1.getRoot().getAbsolutePath());
+         final Cache cache2 = new LRUCache(1 * 1024 * 1024);
+         final Options options2 =
+                 new Options()
+                         .setCreateIfMissing(true)
+                         .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache2));
+         final RocksDB db2 =
+                 RocksDB.open(options2, dbFolder2.getRoot().getAbsolutePath());
+         final FlushOptions flushOptions =
+                 new FlushOptions().setWaitForFlush(true);
+
+    ) {
+      List<RocksDB> dbs = new ArrayList<>(1);
+      dbs.add(db1);
+      dbs.add(db2);
+      Set<Cache> caches = new HashSet<>(1);
+      caches.add(cache1);
+      caches.add(cache2);
+
+      for (RocksDB db: dbs) {
+        db.put(key, value);
+        db.flush(flushOptions);
+        db.get(key);
+      }
+
+      Map<MemoryUsageType, Long> usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches);
+      assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo(
+              db1.getAggregatedLongProperty(MEMTABLE_SIZE) + db2.getAggregatedLongProperty(MEMTABLE_SIZE));
+      assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo(
+              db1.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE) + db2.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE));
+      assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo(
+              db1.getAggregatedLongProperty(TABLE_READERS) + db2.getAggregatedLongProperty(TABLE_READERS));
+      assertThat(usage.get(MemoryUsageType.kCacheTotal)).isGreaterThan(0);
+
+    }
+  }
+
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java
new file mode 100644
index 000000000..a840eb104
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java
@@ -0,0 +1,465 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class MergeTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void stringOption()
+      throws InterruptedException, RocksDBException {
+    try (final Options opt = new Options()
+        .setCreateIfMissing(true)
+        .setMergeOperatorName("stringappend");
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+      // writing aa under key
+      db.put("key".getBytes(), "aa".getBytes());
+      // merge bb under key
+      db.merge("key".getBytes(), "bb".getBytes());
+
+      final byte[] value = db.get("key".getBytes());
+      final String strValue = new String(value);
+      assertThat(strValue).isEqualTo("aa,bb");
+    }
+  }
+
+  private byte[] longToByteArray(long l) {
+    ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN);
+    buf.putLong(l);
+    return buf.array();
+  }
+
+  private long longFromByteArray(byte[] a) {
+    ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN);
+    buf.put(a);
+    buf.flip();
+    return buf.getLong();
+  }
+
+  @Test
+  public void uint64AddOption()
+      throws InterruptedException, RocksDBException {
+    try (final Options opt = new Options()
+        .setCreateIfMissing(true)
+        .setMergeOperatorName("uint64add");
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+      // writing (long)100 under key
+      db.put("key".getBytes(), longToByteArray(100));
+      // merge (long)1 under key
+      db.merge("key".getBytes(), longToByteArray(1));
+
+      final byte[] value = db.get("key".getBytes());
+      final long longValue = longFromByteArray(value);
+      assertThat(longValue).isEqualTo(101);
+    }
+  }
+
+  @Test
+  public void cFStringOption()
+      throws InterruptedException, RocksDBException {
+
+    try (final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions()
+        .setMergeOperatorName("stringappend");
+         final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions()
+             .setMergeOperatorName("stringappend")
+    ) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1),
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt2)
+      );
+
+      final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+      try (final DBOptions opt = new DBOptions()
+          .setCreateIfMissing(true)
+          .setCreateMissingColumnFamilies(true);
+           final RocksDB db = RocksDB.open(opt,
+               dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+               columnFamilyHandleList)) {
+        try {
+          // writing aa under key
+          db.put(columnFamilyHandleList.get(1),
+              "cfkey".getBytes(), "aa".getBytes());
+          // merge bb under key
+          db.merge(columnFamilyHandleList.get(1),
+              "cfkey".getBytes(), "bb".getBytes());
+
+          byte[] value = db.get(columnFamilyHandleList.get(1),
+              "cfkey".getBytes());
+          String strValue = new String(value);
+          assertThat(strValue).isEqualTo("aa,bb");
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandleList) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void cFUInt64AddOption()
+      throws InterruptedException, RocksDBException {
+
+    try (final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions()
+        .setMergeOperatorName("uint64add");
+         final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions()
+             .setMergeOperatorName("uint64add")
+    ) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1),
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt2)
+      );
+
+      final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+      try (final DBOptions opt = new DBOptions()
+          .setCreateIfMissing(true)
+          .setCreateMissingColumnFamilies(true);
+           final RocksDB db = RocksDB.open(opt,
+               dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+               columnFamilyHandleList)) {
+        try {
+          // writing (long)100 under key
+          db.put(columnFamilyHandleList.get(1),
+              "cfkey".getBytes(), longToByteArray(100));
+          // merge (long)157 under key
+          db.merge(columnFamilyHandleList.get(1), "cfkey".getBytes(), longToByteArray(157));
+
+          byte[] value = db.get(columnFamilyHandleList.get(1),
+              "cfkey".getBytes());
+          long longValue = longFromByteArray(value);
+          assertThat(longValue).isEqualTo(257);
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandleList) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void operatorOption()
+      throws InterruptedException, RocksDBException {
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator();
+         final Options opt = new Options()
+            .setCreateIfMissing(true)
+            .setMergeOperator(stringAppendOperator);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+      // Writing aa under key
+      db.put("key".getBytes(), "aa".getBytes());
+
+      // Writing bb under key
+      db.merge("key".getBytes(), "bb".getBytes());
+
+      final byte[] value = db.get("key".getBytes());
+      final String strValue = new String(value);
+
+      assertThat(strValue).isEqualTo("aa,bb");
+    }
+  }
+
+  @Test
+  public void uint64AddOperatorOption()
+      throws InterruptedException, RocksDBException {
+    try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator();
+         final Options opt = new Options()
+            .setCreateIfMissing(true)
+            .setMergeOperator(uint64AddOperator);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+      // Writing (long)100 under key
+      db.put("key".getBytes(), longToByteArray(100));
+
+      // Writing (long)1 under key
+      db.merge("key".getBytes(), longToByteArray(1));
+
+      final byte[] value = db.get("key".getBytes());
+      final long longValue = longFromByteArray(value);
+
+      assertThat(longValue).isEqualTo(101);
+    }
+  }
+
+  @Test
+  public void cFOperatorOption()
+      throws InterruptedException, RocksDBException {
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator();
+         final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions()
+             .setMergeOperator(stringAppendOperator);
+         final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions()
+             .setMergeOperator(stringAppendOperator)
+    ) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1),
+          new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpt2)
+      );
+      final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+      try (final DBOptions opt = new DBOptions()
+          .setCreateIfMissing(true)
+          .setCreateMissingColumnFamilies(true);
+           final RocksDB db = RocksDB.open(opt,
+               dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+               columnFamilyHandleList)
+      ) {
+        try {
+          // writing aa under key
+          db.put(columnFamilyHandleList.get(1),
+              "cfkey".getBytes(), "aa".getBytes());
+          // merge bb under key
+          db.merge(columnFamilyHandleList.get(1),
+              "cfkey".getBytes(), "bb".getBytes());
+          byte[] value = db.get(columnFamilyHandleList.get(1),
+              "cfkey".getBytes());
+          String strValue = new String(value);
+
+          // Test also with createColumnFamily
+          try (final ColumnFamilyOptions cfHandleOpts =
+                   new ColumnFamilyOptions()
+                       .setMergeOperator(stringAppendOperator);
+               final ColumnFamilyHandle cfHandle =
+                   db.createColumnFamily(
+                       new ColumnFamilyDescriptor("new_cf2".getBytes(),
+                           cfHandleOpts))
+          ) {
+            // writing xx under cfkey2
+            db.put(cfHandle, "cfkey2".getBytes(), "xx".getBytes());
+            // merge yy under cfkey2
+            db.merge(cfHandle, new WriteOptions(), "cfkey2".getBytes(),
+                "yy".getBytes());
+            value = db.get(cfHandle, "cfkey2".getBytes());
+            String strValueTmpCf = new String(value);
+
+            assertThat(strValue).isEqualTo("aa,bb");
+            assertThat(strValueTmpCf).isEqualTo("xx,yy");
+          }
+        } finally {
+          for (final ColumnFamilyHandle columnFamilyHandle :
+              columnFamilyHandleList) {
+            columnFamilyHandle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void cFUInt64AddOperatorOption()
+      throws InterruptedException, RocksDBException {
+    try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator();
+         final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions()
+             .setMergeOperator(uint64AddOperator);
+         final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions()
+             .setMergeOperator(uint64AddOperator)
+    ) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1),
+          new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpt2)
+      );
+      final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+      try (final DBOptions opt = new DBOptions()
+          .setCreateIfMissing(true)
+          .setCreateMissingColumnFamilies(true);
+           final RocksDB db = RocksDB.open(opt,
+               dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+               columnFamilyHandleList)
+      ) {
+        try {
+          // writing (long)100 under key
+          db.put(columnFamilyHandleList.get(1),
+              "cfkey".getBytes(), longToByteArray(100));
+          // merge (long)1 under key
+          db.merge(columnFamilyHandleList.get(1),
+              "cfkey".getBytes(), longToByteArray(1));
+          byte[] value = db.get(columnFamilyHandleList.get(1),
+              "cfkey".getBytes());
+          long longValue = longFromByteArray(value);
+
+          // Test also with createColumnFamily
+          try (final ColumnFamilyOptions cfHandleOpts =
+                   new ColumnFamilyOptions()
+                       .setMergeOperator(uint64AddOperator);
+               final ColumnFamilyHandle cfHandle =
+                   db.createColumnFamily(
+                       new ColumnFamilyDescriptor("new_cf2".getBytes(),
+                           cfHandleOpts))
+          ) {
+            // writing (long)200 under cfkey2
+            db.put(cfHandle, "cfkey2".getBytes(), longToByteArray(200));
+            // merge (long)50 under cfkey2
+            db.merge(cfHandle, new WriteOptions(), "cfkey2".getBytes(),
+                longToByteArray(50));
+            value = db.get(cfHandle, "cfkey2".getBytes());
+            long longValueTmpCf = longFromByteArray(value);
+
+            assertThat(longValue).isEqualTo(101);
+            assertThat(longValueTmpCf).isEqualTo(250);
+          }
+        } finally {
+          for (final ColumnFamilyHandle columnFamilyHandle :
+              columnFamilyHandleList) {
+            columnFamilyHandle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void operatorGcBehaviour()
+      throws RocksDBException {
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator()) {
+      try (final Options opt = new Options()
+              .setCreateIfMissing(true)
+              .setMergeOperator(stringAppendOperator);
+           final RocksDB db = RocksDB.open(opt,
+                   dbFolder.getRoot().getAbsolutePath())) {
+        //no-op
+      }
+
+      // test reuse
+      try (final Options opt = new Options()
+              .setMergeOperator(stringAppendOperator);
+           final RocksDB db = RocksDB.open(opt,
+                   dbFolder.getRoot().getAbsolutePath())) {
+        //no-op
+      }
+
+      // test param init
+      try (final StringAppendOperator stringAppendOperator2 = new StringAppendOperator();
+           final Options opt = new Options()
+              .setMergeOperator(stringAppendOperator2);
+           final RocksDB db = RocksDB.open(opt,
+                   dbFolder.getRoot().getAbsolutePath())) {
+        //no-op
+      }
+
+      // test replace one with another merge operator instance
+      try (final Options opt = new Options()
+              .setMergeOperator(stringAppendOperator);
+           final StringAppendOperator newStringAppendOperator = new StringAppendOperator()) {
+        opt.setMergeOperator(newStringAppendOperator);
+        try (final RocksDB db = RocksDB.open(opt,
+                dbFolder.getRoot().getAbsolutePath())) {
+          //no-op
+        }
+      }
+    }
+  }
+
+  @Test
+  public void uint64AddOperatorGcBehaviour()
+      throws RocksDBException {
+    try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator()) {
+      try (final Options opt = new Options()
+              .setCreateIfMissing(true)
+              .setMergeOperator(uint64AddOperator);
+           final RocksDB db = RocksDB.open(opt,
+                   dbFolder.getRoot().getAbsolutePath())) {
+        //no-op
+      }
+
+      // test reuse
+      try (final Options opt = new Options()
+              .setMergeOperator(uint64AddOperator);
+           final RocksDB db = RocksDB.open(opt,
+                   dbFolder.getRoot().getAbsolutePath())) {
+        //no-op
+      }
+
+      // test param init
+      try (final UInt64AddOperator uint64AddOperator2 = new UInt64AddOperator();
+           final Options opt = new Options()
+              .setMergeOperator(uint64AddOperator2);
+           final RocksDB db = RocksDB.open(opt,
+                   dbFolder.getRoot().getAbsolutePath())) {
+        //no-op
+      }
+
+      // test replace one with another merge operator instance
+      try (final Options opt = new Options()
+              .setMergeOperator(uint64AddOperator);
+           final UInt64AddOperator newUInt64AddOperator = new UInt64AddOperator()) {
+        opt.setMergeOperator(newUInt64AddOperator);
+        try (final RocksDB db = RocksDB.open(opt,
+                dbFolder.getRoot().getAbsolutePath())) {
+          //no-op
+        }
+      }
+    }
+  }
+
+  @Test
+  public void emptyStringAsStringAppendDelimiter() throws RocksDBException {
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator("");
+         final Options opt =
+             new Options().setCreateIfMissing(true).setMergeOperator(stringAppendOperator);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key".getBytes(), "aa".getBytes());
+      db.merge("key".getBytes(), "bb".getBytes());
+      final byte[] value = db.get("key".getBytes());
+      assertThat(new String(value)).isEqualTo("aabb");
+    }
+  }
+
+  @Test
+  public void multiCharStringAsStringAppendDelimiter() throws RocksDBException {
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator("<>");
+         final Options opt =
+             new Options().setCreateIfMissing(true).setMergeOperator(stringAppendOperator);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key".getBytes(), "aa".getBytes());
+      db.merge("key".getBytes(), "bb".getBytes());
+      final byte[] value = db.get("key".getBytes());
+      assertThat(new String(value)).isEqualTo("aa<>bb");
+    }
+  }
+
+  @Test
+  public void emptyStringInSetMergeOperatorByName() {
+    try (final Options opt = new Options()
+        .setMergeOperatorName("");
+         final ColumnFamilyOptions cOpt = new ColumnFamilyOptions()
+             .setMergeOperatorName("")) {
+      //no-op
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void nullStringInSetMergeOperatorByNameOptions() {
+    try (final Options opt = new Options()) {
+      opt.setMergeOperatorName(null);
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void
+  nullStringInSetMergeOperatorByNameColumnFamilyOptions() {
+    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
+      opt.setMergeOperatorName(null);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java
new file mode 100644
index 000000000..10c92d49d
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class MixedOptionsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void mixedOptionsTest(){
+    // Set a table factory and check the names
+    try(final Filter bloomFilter = new BloomFilter();
+        final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()
+            .setTableFormatConfig(
+                new BlockBasedTableConfig().setFilterPolicy(bloomFilter))
+    ) {
+      assertThat(cfOptions.tableFactoryName()).isEqualTo(
+          "BlockBasedTable");
+      cfOptions.setTableFormatConfig(new PlainTableConfig());
+      assertThat(cfOptions.tableFactoryName()).isEqualTo("PlainTable");
+      // Initialize a dbOptions object from cf options and
+      // db options
+      try (final DBOptions dbOptions = new DBOptions();
+           final Options options = new Options(dbOptions, cfOptions)) {
+        assertThat(options.tableFactoryName()).isEqualTo("PlainTable");
+        // Free instances
+      }
+    }
+
+    // Test Optimize for statements
+    try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) {
+    cfOptions.optimizeUniversalStyleCompaction();
+    cfOptions.optimizeLevelStyleCompaction();
+    cfOptions.optimizeForPointLookup(1024);
+    try(final Options options = new Options()) {
+        options.optimizeLevelStyleCompaction();
+        options.optimizeLevelStyleCompaction(400);
+        options.optimizeUniversalStyleCompaction();
+        options.optimizeUniversalStyleCompaction(400);
+        options.optimizeForPointLookup(1024);
+        options.prepareForBulkLoad();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java
new file mode 100644
index 000000000..cdfd9d3a9
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java
@@ -0,0 +1,146 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+/**
+ * Test for changes made by
+ * <a link="https://github.com/facebook/rocksdb/issues/9006">transactional multiGet problem</a>
+ * the tests here were previously broken by the nonsense removed by that change.
+ */
+@RunWith(Parameterized.class)
+public class MultiColumnRegressionTest {
+  @Parameterized.Parameters
+  public static List<Params> data() {
+    return Arrays.asList(new Params(3, 100), new Params(3, 1000000));
+  }
+
+  public static class Params {
+    final int numColumns;
+    final int keySize;
+
+    public Params(final int numColumns, final int keySize) {
+      this.numColumns = numColumns;
+      this.keySize = keySize;
+    }
+  }
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private final Params params;
+
+  public MultiColumnRegressionTest(final Params params) {
+    this.params = params;
+  }
+
+  @Test
+  public void transactionDB() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    for (int i = 0; i < params.numColumns; i++) {
+      StringBuilder sb = new StringBuilder();
+      sb.append("cf" + i);
+      for (int j = 0; j < params.keySize; j++) sb.append("_cf");
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(sb.toString().getBytes()));
+    }
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      final List<ColumnFamilyHandle> columnFamilyHandles =
+          db.createColumnFamilies(columnFamilyDescriptors);
+    }
+
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try (final TransactionDB tdb = TransactionDB.open(new DBOptions().setCreateIfMissing(true),
+             new TransactionDBOptions(), dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      final WriteOptions writeOptions = new WriteOptions();
+      try (Transaction transaction = tdb.beginTransaction(writeOptions)) {
+        for (int i = 0; i < params.numColumns; i++) {
+          transaction.put(
+              columnFamilyHandles.get(i), ("key" + i).getBytes(), ("value" + (i - 7)).getBytes());
+        }
+        transaction.put("key".getBytes(), "value".getBytes());
+        transaction.commit();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+    }
+
+    final List<ColumnFamilyHandle> columnFamilyHandles2 = new ArrayList<>();
+    try (final TransactionDB tdb = TransactionDB.open(new DBOptions().setCreateIfMissing(true),
+             new TransactionDBOptions(), dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles2)) {
+      try (Transaction transaction = tdb.beginTransaction(new WriteOptions())) {
+        final ReadOptions readOptions = new ReadOptions();
+        for (int i = 0; i < params.numColumns; i++) {
+          final byte[] value =
+              transaction.get(columnFamilyHandles2.get(i), readOptions, ("key" + i).getBytes());
+          assertThat(value).isEqualTo(("value" + (i - 7)).getBytes());
+        }
+        transaction.commit();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles2) {
+        columnFamilyHandle.close();
+      }
+    }
+  }
+
+  @Test
+  public void optimisticDB() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    for (int i = 0; i < params.numColumns; i++) {
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    }
+
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try (final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(
+             new DBOptions().setCreateIfMissing(true), dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      try (Transaction transaction = otdb.beginTransaction(new WriteOptions())) {
+        for (int i = 0; i < params.numColumns; i++) {
+          transaction.put(
+              columnFamilyHandles.get(i), ("key" + i).getBytes(), ("value" + (i - 7)).getBytes());
+        }
+        transaction.put("key".getBytes(), "value".getBytes());
+        transaction.commit();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+    }
+
+    final List<ColumnFamilyHandle> columnFamilyHandles2 = new ArrayList<>();
+    try (final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(
+             new DBOptions().setCreateIfMissing(true), dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles2)) {
+      try (Transaction transaction = otdb.beginTransaction(new WriteOptions())) {
+        final ReadOptions readOptions = new ReadOptions();
+        for (int i = 0; i < params.numColumns; i++) {
+          final byte[] value =
+              transaction.get(columnFamilyHandles2.get(i), readOptions, ("key" + i).getBytes());
+          assertThat(value).isEqualTo(("value" + (i - 7)).getBytes());
+        }
+        transaction.commit();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles2) {
+        columnFamilyHandle.close();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java
new file mode 100644
index 000000000..90a13e1da
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java
@@ -0,0 +1,241 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.*;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class MultiGetManyKeysTest {
+  @Parameterized.Parameters
+  public static List<Integer> data() {
+    return Arrays.asList(2, 3, 250, 60000, 70000, 150000, 750000);
+  }
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private final int numKeys;
+
+  public MultiGetManyKeysTest(final Integer numKeys) {
+    this.numKeys = numKeys;
+  }
+
+  /**
+   * Test for <a link="https://github.com/facebook/rocksdb/issues/8039">multiGet problem</a>
+   */
+  @Test
+  public void multiGetAsListLarge() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    putKeysAndValues(keyValues);
+
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      final List<byte[]> values = db.multiGetAsList(keys);
+      assertKeysAndValues(keys, keyValues, values);
+    }
+  }
+
+  /**
+   * Test for <a link="https://github.com/facebook/rocksdb/issues/9006">transactional multiGet
+   * problem</a>
+   */
+  @Test
+  public void multiGetAsListLargeTransactional() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    putKeysAndValues(keyValues);
+
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath())) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final List<byte[]> values = transaction.multiGetAsList(new ReadOptions(), keys);
+        assertKeysAndValues(keys, keyValues, values);
+      }
+    }
+  }
+
+  /**
+   * Test for <a link="https://github.com/facebook/rocksdb/issues/9006">transactional multiGet
+   * problem</a>
+   */
+  @Test
+  public void multiGetForUpdateAsListLargeTransactional() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    putKeysAndValues(keyValues);
+
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath())) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final List<byte[]> values = transaction.multiGetForUpdateAsList(new ReadOptions(), keys);
+        assertKeysAndValues(keys, keyValues, values);
+      }
+    }
+  }
+
+  /**
+   * Test for <a link="https://github.com/facebook/rocksdb/issues/9006">transactional multiGet
+   * problem</a>
+   */
+  @Test
+  public void multiGetAsListLargeTransactionalCF() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    final ColumnFamilyDescriptor columnFamilyDescriptor =
+        new ColumnFamilyDescriptor("cfTest".getBytes());
+    putKeysAndValues(columnFamilyDescriptor, keyValues);
+
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    columnFamilyDescriptors.add(columnFamilyDescriptor);
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB = TransactionDB.open(new DBOptions(options), txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) {
+      final List<ColumnFamilyHandle> columnFamilyHandlesForMultiGet = new ArrayList<>(numKeys);
+      for (int i = 0; i < numKeys; i++)
+        columnFamilyHandlesForMultiGet.add(columnFamilyHandles.get(0));
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final List<byte[]> values =
+            transaction.multiGetAsList(new ReadOptions(), columnFamilyHandlesForMultiGet, keys);
+        assertKeysAndValues(keys, keyValues, values);
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+    }
+  }
+
+  /**
+   * Test for <a link="https://github.com/facebook/rocksdb/issues/9006">transactional multiGet
+   * problem</a>
+   */
+  @Test
+  public void multiGetForUpdateAsListLargeTransactionalCF() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    final ColumnFamilyDescriptor columnFamilyDescriptor =
+        new ColumnFamilyDescriptor("cfTest".getBytes());
+    putKeysAndValues(columnFamilyDescriptor, keyValues);
+
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    columnFamilyDescriptors.add(columnFamilyDescriptor);
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB = TransactionDB.open(new DBOptions(options), txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) {
+      final List<ColumnFamilyHandle> columnFamilyHandlesForMultiGet = new ArrayList<>(numKeys);
+      for (int i = 0; i < numKeys; i++)
+        columnFamilyHandlesForMultiGet.add(columnFamilyHandles.get(0));
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final List<byte[]> values = transaction.multiGetForUpdateAsList(
+            new ReadOptions(), columnFamilyHandlesForMultiGet, keys);
+        assertKeysAndValues(keys, keyValues, values);
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+    }
+  }
+
+  private List<byte[]> generateRandomKeys(final int numKeys) {
+    final Random rand = new Random();
+    final List<byte[]> keys = new ArrayList<>();
+    for (int i = 0; i < numKeys; i++) {
+      final byte[] key = new byte[4];
+      rand.nextBytes(key);
+      keys.add(key);
+    }
+    return keys;
+  }
+
+  private Map<Key, byte[]> generateRandomKeyValues(final List<byte[]> keys, final int percent) {
+    final Random rand = new Random();
+    final Map<Key, byte[]> keyValues = new HashMap<>();
+    for (int i = 0; i < numKeys; i++) {
+      if (rand.nextInt(100) < percent) {
+        final byte[] value = new byte[1024];
+        rand.nextBytes(value);
+        keyValues.put(new Key(keys.get(i)), value);
+      }
+    }
+    return keyValues;
+  }
+
+  private void putKeysAndValues(Map<Key, byte[]> keyValues) throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      for (Map.Entry<Key, byte[]> keyValue : keyValues.entrySet()) {
+        db.put(keyValue.getKey().get(), keyValue.getValue());
+      }
+    }
+  }
+
+  private void putKeysAndValues(ColumnFamilyDescriptor columnFamilyDescriptor,
+      Map<Key, byte[]> keyValues) throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+         final ColumnFamilyHandle columnFamilyHandle =
+             db.createColumnFamily(columnFamilyDescriptor)) {
+      for (Map.Entry<Key, byte[]> keyValue : keyValues.entrySet()) {
+        db.put(columnFamilyHandle, keyValue.getKey().get(), keyValue.getValue());
+      }
+    }
+  }
+
+  private void assertKeysAndValues(
+      final List<byte[]> keys, final Map<Key, byte[]> keyValues, final List<byte[]> values) {
+    assertThat(values.size()).isEqualTo(keys.size());
+    for (int i = 0; i < numKeys; i++) {
+      final Key key = new Key(keys.get(i));
+      final byte[] value = values.get(i);
+      if (keyValues.containsKey(key)) {
+        assertThat(value).isEqualTo(keyValues.get(key));
+      } else {
+        assertThat(value).isNull();
+      }
+    }
+  }
+
+  static private class Key {
+    private final byte[] bytes;
+    public Key(byte[] bytes) {
+      this.bytes = bytes;
+    }
+
+    public byte[] get() {
+      return this.bytes;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o)
+        return true;
+      if (o == null || getClass() != o.getClass())
+        return false;
+      Key key = (Key) o;
+      return Arrays.equals(bytes, key.bytes);
+    }
+
+    @Override
+    public int hashCode() {
+      return Arrays.hashCode(bytes);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java
new file mode 100644
index 000000000..323a6b1f4
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java
@@ -0,0 +1,525 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.fail;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.util.TestUtil;
+
+public class MultiGetTest {
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void putNThenMultiGet() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value1ForKey1".getBytes());
+      db.put("key2".getBytes(), "value2ForKey2".getBytes());
+      db.put("key3".getBytes(), "value3ForKey3".getBytes());
+      final List<byte[]> keys =
+          Arrays.asList("key1".getBytes(), "key2".getBytes(), "key3".getBytes());
+      final List<byte[]> values = db.multiGetAsList(keys);
+      assertThat(values.size()).isEqualTo(keys.size());
+      assertThat(values.get(0)).isEqualTo("value1ForKey1".getBytes());
+      assertThat(values.get(1)).isEqualTo("value2ForKey2".getBytes());
+      assertThat(values.get(2)).isEqualTo("value3ForKey3".getBytes());
+    }
+  }
+
+  @Test
+  public void putNThenMultiGetDirect() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value1ForKey1".getBytes());
+      db.put("key2".getBytes(), "value2ForKey2".getBytes());
+      db.put("key3".getBytes(), "value3ForKey3".getBytes());
+
+      final List<ByteBuffer> keys = new ArrayList<>();
+      keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes()));
+      // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\
+      for (final ByteBuffer key : keys) {
+        key.flip();
+      }
+      final List<ByteBuffer> values = new ArrayList<>();
+      for (int i = 0; i < keys.size(); i++) {
+        values.add(ByteBuffer.allocateDirect(24));
+      }
+
+      {
+        final List<ByteBufferGetStatus> results = db.multiGetByteBuffers(keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok);
+
+        assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length);
+        assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length);
+        assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(0).value))
+            .isEqualTo("value1ForKey1".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(1).value))
+            .isEqualTo("value2ForKey2".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(2).value))
+            .isEqualTo("value3ForKey3".getBytes());
+      }
+
+      {
+        final List<ByteBufferGetStatus> results =
+            db.multiGetByteBuffers(new ReadOptions(), keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok);
+
+        assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length);
+        assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length);
+        assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(0).value))
+            .isEqualTo("value1ForKey1".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(1).value))
+            .isEqualTo("value2ForKey2".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(2).value))
+            .isEqualTo("value3ForKey3".getBytes());
+      }
+    }
+  }
+
+  @Test
+  public void putNThenMultiGetDirectSliced() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value1ForKey1".getBytes());
+      db.put("key2".getBytes(), "value2ForKey2".getBytes());
+      db.put("key3".getBytes(), "value3ForKey3".getBytes());
+
+      final List<ByteBuffer> keys = new ArrayList<>();
+      keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes()));
+      keys.add(
+          ByteBuffer.allocateDirect(12).put("prefix1".getBytes()).slice().put("key1".getBytes()));
+      // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\
+      for (final ByteBuffer key : keys) {
+        key.flip();
+      }
+      final List<ByteBuffer> values = new ArrayList<>();
+      for (int i = 0; i < keys.size(); i++) {
+        values.add(ByteBuffer.allocateDirect(24));
+      }
+
+      {
+        final List<ByteBufferGetStatus> results = db.multiGetByteBuffers(keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok);
+
+        assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length);
+        assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+        assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(0).value))
+            .isEqualTo("value2ForKey2".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(1).value))
+            .isEqualTo("value3ForKey3".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(2).value))
+            .isEqualTo("value1ForKey1".getBytes());
+      }
+    }
+  }
+
+  @Test
+  public void putNThenMultiGetDirectBadValuesArray() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value1ForKey1".getBytes());
+      db.put("key2".getBytes(), "value2ForKey2".getBytes());
+      db.put("key3".getBytes(), "value3ForKey3".getBytes());
+
+      final List<ByteBuffer> keys = new ArrayList<>();
+      keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes()));
+      // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\
+      for (final ByteBuffer key : keys) {
+        key.flip();
+      }
+
+      {
+        final List<ByteBuffer> values = new ArrayList<>();
+        for (int i = 0; i < keys.size(); i++) {
+          values.add(ByteBuffer.allocateDirect(24));
+        }
+
+        values.remove(0);
+
+        try {
+          db.multiGetByteBuffers(keys, values);
+          fail("Expected exception when not enough value ByteBuffers supplied");
+        } catch (final IllegalArgumentException e) {
+          assertThat(e.getMessage()).contains("For each key there must be a corresponding value");
+        }
+      }
+
+      {
+        final List<ByteBuffer> values = new ArrayList<>();
+        for (int i = 0; i < keys.size(); i++) {
+          values.add(ByteBuffer.allocateDirect(24));
+        }
+
+        values.add(ByteBuffer.allocateDirect(24));
+
+        try {
+          db.multiGetByteBuffers(keys, values);
+          fail("Expected exception when too many value ByteBuffers supplied");
+        } catch (final IllegalArgumentException e) {
+          assertThat(e.getMessage()).contains("For each key there must be a corresponding value");
+        }
+      }
+    }
+  }
+
+  @Test
+  public void putNThenMultiGetDirectShortValueBuffers() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value1ForKey1".getBytes());
+      db.put("key2".getBytes(), "value2ForKey2".getBytes());
+      db.put("key3".getBytes(), "value3ForKey3".getBytes());
+
+      final List<ByteBuffer> keys = new ArrayList<>();
+      keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes()));
+      // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\
+      for (final ByteBuffer key : keys) {
+        key.flip();
+      }
+
+      {
+        final List<ByteBuffer> values = new ArrayList<>();
+        for (int i = 0; i < keys.size(); i++) {
+          values.add(ByteBuffer.allocateDirect(4));
+        }
+
+        final List<ByteBufferGetStatus> statii = db.multiGetByteBuffers(keys, values);
+        assertThat(statii.size()).isEqualTo(values.size());
+        for (final ByteBufferGetStatus status : statii) {
+          assertThat(status.status.getCode()).isEqualTo(Status.Code.Ok);
+          assertThat(status.requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+          final ByteBuffer expected =
+              ByteBuffer.allocateDirect(24).put(Arrays.copyOf("valueX".getBytes(), 4));
+          expected.flip();
+          assertThat(status.value).isEqualTo(expected);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void putNThenMultiGetDirectNondefaultCF() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>(0);
+      cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes()));
+      cfDescriptors.add(new ColumnFamilyDescriptor("cf1".getBytes()));
+      cfDescriptors.add(new ColumnFamilyDescriptor("cf2".getBytes()));
+
+      final List<ColumnFamilyHandle> cf = db.createColumnFamilies(cfDescriptors);
+
+      db.put(cf.get(0), "key1".getBytes(), "value1ForKey1".getBytes());
+      db.put(cf.get(0), "key2".getBytes(), "value2ForKey2".getBytes());
+      db.put(cf.get(0), "key3".getBytes(), "value3ForKey3".getBytes());
+
+      final List<ByteBuffer> keys = new ArrayList<>();
+      keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes()));
+      // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\
+      for (final ByteBuffer key : keys) {
+        key.flip();
+      }
+      final List<ByteBuffer> values = new ArrayList<>();
+      for (int i = 0; i < keys.size(); i++) {
+        values.add(ByteBuffer.allocateDirect(24));
+      }
+
+      {
+        final List<ByteBufferGetStatus> results = db.multiGetByteBuffers(keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.NotFound);
+      }
+
+      {
+        final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+        columnFamilyHandles.add(cf.get(0));
+        final List<ByteBufferGetStatus> results =
+            db.multiGetByteBuffers(columnFamilyHandles, keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok);
+
+        assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length);
+        assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length);
+        assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(0).value))
+            .isEqualTo("value1ForKey1".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(1).value))
+            .isEqualTo("value2ForKey2".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(2).value))
+            .isEqualTo("value3ForKey3".getBytes());
+      }
+
+      {
+        final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+        columnFamilyHandles.add(cf.get(0));
+        columnFamilyHandles.add(cf.get(0));
+        columnFamilyHandles.add(cf.get(0));
+        final List<ByteBufferGetStatus> results =
+            db.multiGetByteBuffers(columnFamilyHandles, keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok);
+
+        assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length);
+        assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length);
+        assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(0).value))
+            .isEqualTo("value1ForKey1".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(1).value))
+            .isEqualTo("value2ForKey2".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(2).value))
+            .isEqualTo("value3ForKey3".getBytes());
+      }
+    }
+  }
+
+  @Test
+  public void putNThenMultiGetDirectCFParams() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value1ForKey1".getBytes());
+      db.put("key2".getBytes(), "value2ForKey2".getBytes());
+      db.put("key3".getBytes(), "value3ForKey3".getBytes());
+
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+      columnFamilyHandles.add(db.getDefaultColumnFamily());
+      columnFamilyHandles.add(db.getDefaultColumnFamily());
+
+      final List<ByteBuffer> keys = new ArrayList<>();
+      keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes()));
+      // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\
+      for (final ByteBuffer key : keys) {
+        key.flip();
+      }
+      final List<ByteBuffer> values = new ArrayList<>();
+      for (int i = 0; i < keys.size(); i++) {
+        values.add(ByteBuffer.allocateDirect(24));
+      }
+      try {
+        db.multiGetByteBuffers(columnFamilyHandles, keys, values);
+        fail("Expected exception when 2 column families supplied");
+      } catch (final IllegalArgumentException e) {
+        assertThat(e.getMessage()).contains("Wrong number of ColumnFamilyHandle(s) supplied");
+      }
+
+      columnFamilyHandles.clear();
+      columnFamilyHandles.add(db.getDefaultColumnFamily());
+      final List<ByteBufferGetStatus> results =
+          db.multiGetByteBuffers(columnFamilyHandles, keys, values);
+
+      assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok);
+      assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok);
+      assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok);
+
+      assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length);
+      assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length);
+      assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+
+      assertThat(TestUtil.bufferBytes(results.get(0).value)).isEqualTo("value1ForKey1".getBytes());
+      assertThat(TestUtil.bufferBytes(results.get(1).value)).isEqualTo("value2ForKey2".getBytes());
+      assertThat(TestUtil.bufferBytes(results.get(2).value)).isEqualTo("value3ForKey3".getBytes());
+    }
+  }
+
+  @Test
+  public void putNThenMultiGetDirectMixedCF() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+      cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes()));
+      cfDescriptors.add(new ColumnFamilyDescriptor("cf1".getBytes()));
+      cfDescriptors.add(new ColumnFamilyDescriptor("cf2".getBytes()));
+      cfDescriptors.add(new ColumnFamilyDescriptor("cf3".getBytes()));
+
+      final List<ColumnFamilyHandle> cf = db.createColumnFamilies(cfDescriptors);
+
+      db.put(cf.get(1), "key1".getBytes(), "value1ForKey1".getBytes());
+      db.put("key2".getBytes(), "value2ForKey2".getBytes());
+      db.put(cf.get(3), "key3".getBytes(), "value3ForKey3".getBytes());
+
+      final List<ByteBuffer> keys = new ArrayList<>();
+      keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes()));
+      // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\
+      for (final ByteBuffer key : keys) {
+        key.flip();
+      }
+      final List<ByteBuffer> values = new ArrayList<>();
+      for (int i = 0; i < keys.size(); i++) {
+        values.add(ByteBuffer.allocateDirect(24));
+      }
+
+      {
+        final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+        columnFamilyHandles.add(db.getDefaultColumnFamily());
+
+        final List<ByteBufferGetStatus> results =
+            db.multiGetByteBuffers(columnFamilyHandles, keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.NotFound);
+
+        assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(1).value))
+            .isEqualTo("value2ForKey2".getBytes());
+      }
+
+      {
+        final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+        columnFamilyHandles.add(cf.get(1));
+
+        final List<ByteBufferGetStatus> results =
+            db.multiGetByteBuffers(columnFamilyHandles, keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.NotFound);
+
+        assertThat(results.get(0).requiredSize).isEqualTo("value2ForKey2".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(0).value))
+            .isEqualTo("value1ForKey1".getBytes());
+      }
+
+      {
+        final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+        columnFamilyHandles.add(cf.get(1));
+        columnFamilyHandles.add(db.getDefaultColumnFamily());
+        columnFamilyHandles.add(cf.get(3));
+
+        final List<ByteBufferGetStatus> results =
+            db.multiGetByteBuffers(columnFamilyHandles, keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok);
+
+        assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length);
+        assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length);
+        assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(0).value))
+            .isEqualTo("value1ForKey1".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(1).value))
+            .isEqualTo("value2ForKey2".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(2).value))
+            .isEqualTo("value3ForKey3".getBytes());
+      }
+
+      {
+        final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+        columnFamilyHandles.add(db.getDefaultColumnFamily());
+        columnFamilyHandles.add(cf.get(1));
+        columnFamilyHandles.add(cf.get(3));
+
+        final List<ByteBufferGetStatus> results =
+            db.multiGetByteBuffers(columnFamilyHandles, keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok);
+
+        assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(2).value))
+            .isEqualTo("value3ForKey3".getBytes());
+      }
+    }
+  }
+
+  @Test
+  public void putNThenMultiGetDirectTruncateCF() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+      cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes()));
+
+      final List<ColumnFamilyHandle> cf = db.createColumnFamilies(cfDescriptors);
+
+      db.put(cf.get(0), "key1".getBytes(), "value1ForKey1".getBytes());
+      db.put(cf.get(0), "key2".getBytes(), "value2ForKey2WithLotsOfTrailingGarbage".getBytes());
+      db.put(cf.get(0), "key3".getBytes(), "value3ForKey3".getBytes());
+
+      final List<ByteBuffer> keys = new ArrayList<>();
+      keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes()));
+      keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes()));
+      // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\
+      for (final ByteBuffer key : keys) {
+        key.flip();
+      }
+      final List<ByteBuffer> values = new ArrayList<>();
+      for (int i = 0; i < keys.size(); i++) {
+        values.add(ByteBuffer.allocateDirect(24));
+      }
+
+      {
+        final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+        columnFamilyHandles.add(cf.get(0));
+        final List<ByteBufferGetStatus> results =
+            db.multiGetByteBuffers(columnFamilyHandles, keys, values);
+
+        assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok);
+        assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok);
+
+        assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length);
+        assertThat(results.get(1).requiredSize)
+            .isEqualTo("value2ForKey2WithLotsOfTrailingGarbage".getBytes().length);
+        assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length);
+
+        assertThat(TestUtil.bufferBytes(results.get(0).value))
+            .isEqualTo("value1ForKey1".getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(1).value))
+            .isEqualTo("valu e2Fo rKey 2Wit hLot sOfT".replace(" ", "").getBytes());
+        assertThat(TestUtil.bufferBytes(results.get(2).value))
+            .isEqualTo("value3ForKey3".getBytes());
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java
new file mode 100644
index 000000000..b2b2599a7
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java
@@ -0,0 +1,167 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.Test;
+import org.rocksdb.MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder;
+
+import java.util.NoSuchElementException;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class MutableColumnFamilyOptionsTest {
+
+  @Test
+  public void builder() {
+    final MutableColumnFamilyOptionsBuilder builder =
+        MutableColumnFamilyOptions.builder();
+        builder
+            .setWriteBufferSize(10)
+            .setInplaceUpdateNumLocks(5)
+            .setDisableAutoCompactions(true)
+            .setParanoidFileChecks(true);
+
+    assertThat(builder.writeBufferSize()).isEqualTo(10);
+    assertThat(builder.inplaceUpdateNumLocks()).isEqualTo(5);
+    assertThat(builder.disableAutoCompactions()).isEqualTo(true);
+    assertThat(builder.paranoidFileChecks()).isEqualTo(true);
+  }
+
+  @Test(expected = NoSuchElementException.class)
+  public void builder_getWhenNotSet() {
+    final MutableColumnFamilyOptionsBuilder builder =
+        MutableColumnFamilyOptions.builder();
+
+    builder.writeBufferSize();
+  }
+
+  @Test
+  public void builder_build() {
+    final MutableColumnFamilyOptions options = MutableColumnFamilyOptions
+        .builder()
+          .setWriteBufferSize(10)
+          .setParanoidFileChecks(true)
+          .build();
+
+    assertThat(options.getKeys().length).isEqualTo(2);
+    assertThat(options.getValues().length).isEqualTo(2);
+    assertThat(options.getKeys()[0])
+        .isEqualTo(
+            MutableColumnFamilyOptions.MemtableOption.write_buffer_size.name());
+    assertThat(options.getValues()[0]).isEqualTo("10");
+    assertThat(options.getKeys()[1])
+        .isEqualTo(
+            MutableColumnFamilyOptions.MiscOption.paranoid_file_checks.name());
+    assertThat(options.getValues()[1]).isEqualTo("true");
+  }
+
+  @Test
+  public void mutableColumnFamilyOptions_toString() {
+    final String str = MutableColumnFamilyOptions.builder()
+                           .setWriteBufferSize(10)
+                           .setInplaceUpdateNumLocks(5)
+                           .setDisableAutoCompactions(true)
+                           .setParanoidFileChecks(true)
+                           .setMaxBytesForLevelMultiplierAdditional(new int[] {2, 3, 5, 7, 11, 13})
+                           .build()
+                           .toString();
+
+    assertThat(str).isEqualTo("write_buffer_size=10;inplace_update_num_locks=5;"
+        + "disable_auto_compactions=true;paranoid_file_checks=true;max_bytes_for_level_multiplier_additional=2:3:5:7:11:13");
+  }
+
+  @Test
+  public void mutableColumnFamilyOptions_parse() {
+    final String str = "write_buffer_size=10;inplace_update_num_locks=5;"
+        + "disable_auto_compactions=true;paranoid_file_checks=true;max_bytes_for_level_multiplier_additional=2:{3}:{5}:{7}:{11}:{13}";
+
+    final MutableColumnFamilyOptionsBuilder builder =
+        MutableColumnFamilyOptions.parse(str);
+
+    assertThat(builder.writeBufferSize()).isEqualTo(10);
+    assertThat(builder.inplaceUpdateNumLocks()).isEqualTo(5);
+    assertThat(builder.disableAutoCompactions()).isEqualTo(true);
+    assertThat(builder.paranoidFileChecks()).isEqualTo(true);
+    assertThat(builder.maxBytesForLevelMultiplierAdditional())
+        .isEqualTo(new int[] {2, 3, 5, 7, 11, 13});
+  }
+
+  /**
+   * Extended parsing test to deal with all the options which C++ may return.
+   * We have canned a set of options returned by {RocksDB#getOptions}
+   */
+  @Test
+  public void mutableColumnFamilyOptions_parse_getOptions_output() {
+    final String optionsString =
+        "bottommost_compression=kDisableCompressionOption;  sample_for_compression=0;  "
+        + "blob_garbage_collection_age_cutoff=0.250000;  blob_garbage_collection_force_threshold=0.800000;"
+        + "arena_block_size=1048576;  enable_blob_garbage_collection=false;  level0_stop_writes_trigger=36;  min_blob_size=65536;"
+        + "blob_compaction_readahead_size=262144;  blob_file_starting_level=5;  prepopulate_blob_cache=kDisable;"
+        + "compaction_options_universal={allow_trivial_move=false;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;"
+        + "compression_size_percent=-1;max_size_amplification_percent=200;max_merge_width=4294967295;size_ratio=1;};  "
+        + "target_file_size_base=67108864;  max_bytes_for_level_base=268435456;  memtable_whole_key_filtering=false;  "
+        + "soft_pending_compaction_bytes_limit=68719476736;  blob_compression_type=kNoCompression;  max_write_buffer_number=2;  "
+        + "ttl=2592000;  compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;};  "
+        + "check_flush_compaction_key_order=true;  max_successive_merges=0;  inplace_update_num_locks=10000;  "
+        + "bottommost_compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;"
+        + "strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;};  "
+        + "target_file_size_multiplier=1;  max_bytes_for_level_multiplier_additional=5:{7}:{9}:{11}:{13}:{15}:{17};  "
+        + "enable_blob_files=true;  level0_slowdown_writes_trigger=20;  compression=kLZ4HCCompression;  level0_file_num_compaction_trigger=4;  "
+        + "blob_file_size=268435456;  prefix_extractor=nullptr;  max_bytes_for_level_multiplier=10.000000;  write_buffer_size=67108864;  "
+        + "disable_auto_compactions=false;  max_compaction_bytes=1677721600;  memtable_huge_page_size=0;  "
+        + "compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;"
+        + "level=32767;window_bits=-14;};  "
+        + "hard_pending_compaction_bytes_limit=274877906944;  periodic_compaction_seconds=0;  paranoid_file_checks=true;  "
+        + "memtable_prefix_bloom_size_ratio=7.500000;  max_sequential_skip_in_iterations=8;  report_bg_io_stats=true;  "
+        + "compaction_pri=kMinOverlappingRatio;  compaction_style=kCompactionStyleLevel;  memtable_factory=SkipListFactory;  "
+        + "comparator=leveldb.BytewiseComparator;  bloom_locality=0;  compaction_filter_factory=nullptr;  "
+        + "min_write_buffer_number_to_merge=1;  max_write_buffer_number_to_maintain=0;  compaction_filter=nullptr;  merge_operator=nullptr;  "
+        + "num_levels=7;  optimize_filters_for_hits=false;  force_consistency_checks=true;  table_factory=BlockBasedTable;  "
+        + "max_write_buffer_size_to_maintain=0;  memtable_insert_with_hint_prefix_extractor=nullptr;  level_compaction_dynamic_level_bytes=false;  "
+        + "inplace_update_support=false;  experimental_mempurge_threshold=0.003";
+
+    MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder cf =
+        MutableColumnFamilyOptions.parse(optionsString, true);
+
+    // Check the values from the parsed string which are column family options
+    assertThat(cf.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25);
+    assertThat(cf.blobGarbageCollectionForceThreshold()).isEqualTo(0.80);
+    assertThat(cf.arenaBlockSize()).isEqualTo(1048576);
+    assertThat(cf.enableBlobGarbageCollection()).isEqualTo(false);
+    assertThat(cf.level0StopWritesTrigger()).isEqualTo(36);
+    assertThat(cf.minBlobSize()).isEqualTo(65536);
+    assertThat(cf.blobCompactionReadaheadSize()).isEqualTo(262144);
+    assertThat(cf.blobFileStartingLevel()).isEqualTo(5);
+    assertThat(cf.prepopulateBlobCache()).isEqualTo(PrepopulateBlobCache.PREPOPULATE_BLOB_DISABLE);
+    assertThat(cf.targetFileSizeBase()).isEqualTo(67108864);
+    assertThat(cf.maxBytesForLevelBase()).isEqualTo(268435456);
+    assertThat(cf.softPendingCompactionBytesLimit()).isEqualTo(68719476736L);
+    assertThat(cf.blobCompressionType()).isEqualTo(CompressionType.NO_COMPRESSION);
+    assertThat(cf.maxWriteBufferNumber()).isEqualTo(2);
+    assertThat(cf.ttl()).isEqualTo(2592000);
+    assertThat(cf.maxSuccessiveMerges()).isEqualTo(0);
+    assertThat(cf.inplaceUpdateNumLocks()).isEqualTo(10000);
+    assertThat(cf.targetFileSizeMultiplier()).isEqualTo(1);
+    assertThat(cf.maxBytesForLevelMultiplierAdditional())
+        .isEqualTo(new int[] {5, 7, 9, 11, 13, 15, 17});
+    assertThat(cf.enableBlobFiles()).isEqualTo(true);
+    assertThat(cf.level0SlowdownWritesTrigger()).isEqualTo(20);
+    assertThat(cf.compressionType()).isEqualTo(CompressionType.LZ4HC_COMPRESSION);
+    assertThat(cf.level0FileNumCompactionTrigger()).isEqualTo(4);
+    assertThat(cf.blobFileSize()).isEqualTo(268435456);
+    assertThat(cf.maxBytesForLevelMultiplier()).isEqualTo(10.0);
+    assertThat(cf.writeBufferSize()).isEqualTo(67108864);
+    assertThat(cf.disableAutoCompactions()).isEqualTo(false);
+    assertThat(cf.maxCompactionBytes()).isEqualTo(1677721600);
+    assertThat(cf.memtableHugePageSize()).isEqualTo(0);
+    assertThat(cf.hardPendingCompactionBytesLimit()).isEqualTo(274877906944L);
+    assertThat(cf.periodicCompactionSeconds()).isEqualTo(0);
+    assertThat(cf.paranoidFileChecks()).isEqualTo(true);
+    assertThat(cf.memtablePrefixBloomSizeRatio()).isEqualTo(7.5);
+    assertThat(cf.experimentalMempurgeThreshold()).isEqualTo(0.003);
+    assertThat(cf.maxSequentialSkipInIterations()).isEqualTo(8);
+    assertThat(cf.reportBgIoStats()).isEqualTo(true);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MutableDBOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MutableDBOptionsTest.java
new file mode 100644
index 000000000..063a8de38
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MutableDBOptionsTest.java
@@ -0,0 +1,85 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.Test;
+import org.rocksdb.MutableDBOptions.MutableDBOptionsBuilder;
+
+import java.util.NoSuchElementException;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class MutableDBOptionsTest {
+
+  @Test
+  public void builder() {
+    final MutableDBOptionsBuilder builder =
+        MutableDBOptions.builder();
+        builder
+            .setBytesPerSync(1024 * 1024 * 7)
+            .setMaxBackgroundJobs(5)
+            .setAvoidFlushDuringShutdown(false);
+
+    assertThat(builder.bytesPerSync()).isEqualTo(1024 * 1024 * 7);
+    assertThat(builder.maxBackgroundJobs()).isEqualTo(5);
+    assertThat(builder.avoidFlushDuringShutdown()).isEqualTo(false);
+  }
+
+  @Test(expected = NoSuchElementException.class)
+  public void builder_getWhenNotSet() {
+    final MutableDBOptionsBuilder builder =
+        MutableDBOptions.builder();
+
+    builder.bytesPerSync();
+  }
+
+  @Test
+  public void builder_build() {
+    final MutableDBOptions options = MutableDBOptions
+        .builder()
+          .setBytesPerSync(1024 * 1024 * 7)
+          .setMaxBackgroundJobs(5)
+          .build();
+
+    assertThat(options.getKeys().length).isEqualTo(2);
+    assertThat(options.getValues().length).isEqualTo(2);
+    assertThat(options.getKeys()[0])
+        .isEqualTo(
+            MutableDBOptions.DBOption.bytes_per_sync.name());
+    assertThat(options.getValues()[0]).isEqualTo("7340032");
+    assertThat(options.getKeys()[1])
+        .isEqualTo(
+            MutableDBOptions.DBOption.max_background_jobs.name());
+    assertThat(options.getValues()[1]).isEqualTo("5");
+  }
+
+  @Test
+  public void mutableDBOptions_toString() {
+    final String str = MutableDBOptions
+        .builder()
+        .setMaxOpenFiles(99)
+        .setDelayedWriteRate(789)
+        .setAvoidFlushDuringShutdown(true)
+        .setStrictBytesPerSync(true)
+        .build()
+        .toString();
+
+    assertThat(str).isEqualTo("max_open_files=99;delayed_write_rate=789;"
+        + "avoid_flush_during_shutdown=true;strict_bytes_per_sync=true");
+  }
+
+  @Test
+  public void mutableDBOptions_parse() {
+    final String str = "max_open_files=99;delayed_write_rate=789;"
+        + "avoid_flush_during_shutdown=true";
+
+    final MutableDBOptionsBuilder builder =
+        MutableDBOptions.parse(str);
+
+    assertThat(builder.maxOpenFiles()).isEqualTo(99);
+    assertThat(builder.delayedWriteRate()).isEqualTo(789);
+    assertThat(builder.avoidFlushDuringShutdown()).isEqualTo(true);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java
new file mode 100644
index 000000000..6db940619
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java
@@ -0,0 +1,429 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class MutableOptionsGetSetTest {
+  final int minBlobSize = 65536;
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  /**
+   * Validate the round-trip of  blob options into and out of the C++ core of RocksDB
+   * From CF options on CF Creation to {RocksDB#getOptions}
+   * Uses 2x column families with different values for their options.
+   * NOTE that some constraints are applied to the options in the C++ core,
+   * e.g. on {ColumnFamilyOptions#setMemtablePrefixBloomSizeRatio}
+   *
+   * @throws RocksDBException if the database throws an exception
+   */
+  @Test
+  public void testGetMutableBlobOptionsAfterCreate() throws RocksDBException {
+    final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions();
+    final ColumnFamilyDescriptor columnFamilyDescriptor0 =
+        new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0);
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+        Collections.singletonList(columnFamilyDescriptor0);
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+    try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      try (final ColumnFamilyOptions columnFamilyOptions1 =
+               new ColumnFamilyOptions()
+                   .setMinBlobSize(minBlobSize)
+                   .setEnableBlobFiles(true)
+                   .setBlobGarbageCollectionAgeCutoff(0.25)
+                   .setBlobGarbageCollectionForceThreshold(0.80)
+                   .setBlobCompactionReadaheadSize(262144)
+                   .setBlobFileStartingLevel(2)
+                   .setArenaBlockSize(42)
+                   .setMemtablePrefixBloomSizeRatio(0.17)
+                   .setExperimentalMempurgeThreshold(0.005)
+                   .setMemtableWholeKeyFiltering(false)
+                   .setMemtableHugePageSize(3)
+                   .setMaxSuccessiveMerges(4)
+                   .setMaxWriteBufferNumber(12)
+                   .setInplaceUpdateNumLocks(16)
+                   .setDisableAutoCompactions(false)
+                   .setSoftPendingCompactionBytesLimit(112)
+                   .setHardPendingCompactionBytesLimit(280)
+                   .setLevel0FileNumCompactionTrigger(200)
+                   .setLevel0SlowdownWritesTrigger(312)
+                   .setLevel0StopWritesTrigger(584)
+                   .setMaxCompactionBytes(12)
+                   .setTargetFileSizeBase(99)
+                   .setTargetFileSizeMultiplier(112)
+                   .setMaxSequentialSkipInIterations(50)
+                   .setReportBgIoStats(true);
+
+           final ColumnFamilyOptions columnFamilyOptions2 =
+               new ColumnFamilyOptions()
+                   .setMinBlobSize(minBlobSize)
+                   .setEnableBlobFiles(false)
+                   .setArenaBlockSize(42)
+                   .setMemtablePrefixBloomSizeRatio(0.236)
+                   .setExperimentalMempurgeThreshold(0.247)
+                   .setMemtableWholeKeyFiltering(true)
+                   .setMemtableHugePageSize(8)
+                   .setMaxSuccessiveMerges(12)
+                   .setMaxWriteBufferNumber(22)
+                   .setInplaceUpdateNumLocks(160)
+                   .setDisableAutoCompactions(true)
+                   .setSoftPendingCompactionBytesLimit(1124)
+                   .setHardPendingCompactionBytesLimit(2800)
+                   .setLevel0FileNumCompactionTrigger(2000)
+                   .setLevel0SlowdownWritesTrigger(5840)
+                   .setLevel0StopWritesTrigger(31200)
+                   .setMaxCompactionBytes(112)
+                   .setTargetFileSizeBase(999)
+                   .setTargetFileSizeMultiplier(1120)
+                   .setMaxSequentialSkipInIterations(24)
+                   .setReportBgIoStats(true)) {
+        final ColumnFamilyDescriptor columnFamilyDescriptor1 =
+            new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1);
+        final ColumnFamilyDescriptor columnFamilyDescriptor2 =
+            new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2);
+
+        // Create the column family with blob options
+        final ColumnFamilyHandle columnFamilyHandle1 =
+            db.createColumnFamily(columnFamilyDescriptor1);
+        final ColumnFamilyHandle columnFamilyHandle2 =
+            db.createColumnFamily(columnFamilyDescriptor2);
+
+        // Check the getOptions() brings back the creation options for CF1
+        final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 =
+            db.getOptions(columnFamilyHandle1);
+        assertThat(builder1.enableBlobFiles()).isEqualTo(true);
+        assertThat(builder1.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25);
+        assertThat(builder1.blobGarbageCollectionForceThreshold()).isEqualTo(0.80);
+        assertThat(builder1.blobCompactionReadaheadSize()).isEqualTo(262144);
+        assertThat(builder1.blobFileStartingLevel()).isEqualTo(2);
+        assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize);
+        assertThat(builder1.arenaBlockSize()).isEqualTo(42);
+        assertThat(builder1.memtablePrefixBloomSizeRatio()).isEqualTo(0.17);
+        assertThat(builder1.experimentalMempurgeThreshold()).isEqualTo(0.005);
+        assertThat(builder1.memtableWholeKeyFiltering()).isEqualTo(false);
+        assertThat(builder1.memtableHugePageSize()).isEqualTo(3);
+        assertThat(builder1.maxSuccessiveMerges()).isEqualTo(4);
+        assertThat(builder1.maxWriteBufferNumber()).isEqualTo(12);
+        assertThat(builder1.inplaceUpdateNumLocks()).isEqualTo(16);
+        assertThat(builder1.disableAutoCompactions()).isEqualTo(false);
+        assertThat(builder1.softPendingCompactionBytesLimit()).isEqualTo(112);
+        assertThat(builder1.hardPendingCompactionBytesLimit()).isEqualTo(280);
+        assertThat(builder1.level0FileNumCompactionTrigger()).isEqualTo(200);
+        assertThat(builder1.level0SlowdownWritesTrigger()).isEqualTo(312);
+        assertThat(builder1.level0StopWritesTrigger()).isEqualTo(584);
+        assertThat(builder1.maxCompactionBytes()).isEqualTo(12);
+        assertThat(builder1.targetFileSizeBase()).isEqualTo(99);
+        assertThat(builder1.targetFileSizeMultiplier()).isEqualTo(112);
+        assertThat(builder1.maxSequentialSkipInIterations()).isEqualTo(50);
+        assertThat(builder1.reportBgIoStats()).isEqualTo(true);
+
+        // Check the getOptions() brings back the creation options for CF2
+        final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder2 =
+            db.getOptions(columnFamilyHandle2);
+        assertThat(builder2.enableBlobFiles()).isEqualTo(false);
+        assertThat(builder2.minBlobSize()).isEqualTo(minBlobSize);
+        assertThat(builder2.arenaBlockSize()).isEqualTo(42);
+        assertThat(builder2.memtablePrefixBloomSizeRatio()).isEqualTo(0.236);
+        assertThat(builder2.experimentalMempurgeThreshold()).isEqualTo(0.247);
+        assertThat(builder2.memtableWholeKeyFiltering()).isEqualTo(true);
+        assertThat(builder2.memtableHugePageSize()).isEqualTo(8);
+        assertThat(builder2.maxSuccessiveMerges()).isEqualTo(12);
+        assertThat(builder2.maxWriteBufferNumber()).isEqualTo(22);
+        assertThat(builder2.inplaceUpdateNumLocks()).isEqualTo(160);
+        assertThat(builder2.disableAutoCompactions()).isEqualTo(true);
+        assertThat(builder2.softPendingCompactionBytesLimit()).isEqualTo(1124);
+        assertThat(builder2.hardPendingCompactionBytesLimit()).isEqualTo(2800);
+        assertThat(builder2.level0FileNumCompactionTrigger()).isEqualTo(2000);
+        assertThat(builder2.level0SlowdownWritesTrigger()).isEqualTo(5840);
+        assertThat(builder2.level0StopWritesTrigger()).isEqualTo(31200);
+        assertThat(builder2.maxCompactionBytes()).isEqualTo(112);
+        assertThat(builder2.targetFileSizeBase()).isEqualTo(999);
+        assertThat(builder2.targetFileSizeMultiplier()).isEqualTo(1120);
+        assertThat(builder2.maxSequentialSkipInIterations()).isEqualTo(24);
+        assertThat(builder2.reportBgIoStats()).isEqualTo(true);
+      }
+    }
+  }
+
+  /**
+   * Validate the round-trip of  blob options into and out of the C++ core of RocksDB
+   * From {RocksDB#setOptions} to {RocksDB#getOptions}
+   * Uses 2x column families with different values for their options.
+   * NOTE that some constraints are applied to the options in the C++ core,
+   * e.g. on {ColumnFamilyOptions#setMemtablePrefixBloomSizeRatio}
+   *
+   * @throws RocksDBException if a database access has an error
+   */
+  @Test
+  public void testGetMutableBlobOptionsAfterSetCF() throws RocksDBException {
+    final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions();
+    final ColumnFamilyDescriptor columnFamilyDescriptor0 =
+        new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0);
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+        Collections.singletonList(columnFamilyDescriptor0);
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+    try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      try (final ColumnFamilyOptions columnFamilyOptions1 = new ColumnFamilyOptions();
+
+           final ColumnFamilyOptions columnFamilyOptions2 = new ColumnFamilyOptions()) {
+        final ColumnFamilyDescriptor columnFamilyDescriptor1 =
+            new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1);
+        final ColumnFamilyDescriptor columnFamilyDescriptor2 =
+            new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2);
+
+        // Create the column family with blob options
+        final ColumnFamilyHandle columnFamilyHandle1 =
+            db.createColumnFamily(columnFamilyDescriptor1);
+        final ColumnFamilyHandle columnFamilyHandle2 =
+            db.createColumnFamily(columnFamilyDescriptor2);
+        db.flush(new FlushOptions().setWaitForFlush(true));
+
+        final MutableColumnFamilyOptions
+            .MutableColumnFamilyOptionsBuilder mutableColumnFamilyOptions1 =
+            MutableColumnFamilyOptions.builder()
+                .setMinBlobSize(minBlobSize)
+                .setEnableBlobFiles(true)
+                .setBlobGarbageCollectionAgeCutoff(0.25)
+                .setBlobGarbageCollectionForceThreshold(0.80)
+                .setBlobCompactionReadaheadSize(262144)
+                .setBlobFileStartingLevel(3)
+                .setArenaBlockSize(42)
+                .setMemtablePrefixBloomSizeRatio(0.17)
+                .setExperimentalMempurgeThreshold(0.005)
+                .setMemtableWholeKeyFiltering(false)
+                .setMemtableHugePageSize(3)
+                .setMaxSuccessiveMerges(4)
+                .setMaxWriteBufferNumber(12)
+                .setInplaceUpdateNumLocks(16)
+                .setDisableAutoCompactions(false)
+                .setSoftPendingCompactionBytesLimit(112)
+                .setHardPendingCompactionBytesLimit(280)
+                .setLevel0FileNumCompactionTrigger(200)
+                .setLevel0SlowdownWritesTrigger(312)
+                .setLevel0StopWritesTrigger(584)
+                .setMaxCompactionBytes(12)
+                .setTargetFileSizeBase(99)
+                .setTargetFileSizeMultiplier(112);
+        db.setOptions(columnFamilyHandle1, mutableColumnFamilyOptions1.build());
+
+        // Check the getOptions() brings back the creation options for CF1
+        final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 =
+            db.getOptions(columnFamilyHandle1);
+        assertThat(builder1.enableBlobFiles()).isEqualTo(true);
+        assertThat(builder1.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25);
+        assertThat(builder1.blobGarbageCollectionForceThreshold()).isEqualTo(0.80);
+        assertThat(builder1.blobCompactionReadaheadSize()).isEqualTo(262144);
+        assertThat(builder1.blobFileStartingLevel()).isEqualTo(3);
+        assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize);
+        assertThat(builder1.arenaBlockSize()).isEqualTo(42);
+        assertThat(builder1.memtablePrefixBloomSizeRatio()).isEqualTo(0.17);
+        assertThat(builder1.experimentalMempurgeThreshold()).isEqualTo(0.005);
+        assertThat(builder1.memtableWholeKeyFiltering()).isEqualTo(false);
+        assertThat(builder1.memtableHugePageSize()).isEqualTo(3);
+        assertThat(builder1.maxSuccessiveMerges()).isEqualTo(4);
+        assertThat(builder1.maxWriteBufferNumber()).isEqualTo(12);
+        assertThat(builder1.inplaceUpdateNumLocks()).isEqualTo(16);
+        assertThat(builder1.disableAutoCompactions()).isEqualTo(false);
+        assertThat(builder1.softPendingCompactionBytesLimit()).isEqualTo(112);
+        assertThat(builder1.hardPendingCompactionBytesLimit()).isEqualTo(280);
+        assertThat(builder1.level0FileNumCompactionTrigger()).isEqualTo(200);
+        assertThat(builder1.level0SlowdownWritesTrigger()).isEqualTo(312);
+        assertThat(builder1.level0StopWritesTrigger()).isEqualTo(584);
+        assertThat(builder1.maxCompactionBytes()).isEqualTo(12);
+        assertThat(builder1.targetFileSizeBase()).isEqualTo(99);
+        assertThat(builder1.targetFileSizeMultiplier()).isEqualTo(112);
+
+        final MutableColumnFamilyOptions
+            .MutableColumnFamilyOptionsBuilder mutableColumnFamilyOptions2 =
+            MutableColumnFamilyOptions.builder()
+                .setMinBlobSize(minBlobSize)
+                .setEnableBlobFiles(false)
+                .setArenaBlockSize(42)
+                .setMemtablePrefixBloomSizeRatio(0.236)
+                .setExperimentalMempurgeThreshold(0.247)
+                .setMemtableWholeKeyFiltering(true)
+                .setMemtableHugePageSize(8)
+                .setMaxSuccessiveMerges(12)
+                .setMaxWriteBufferNumber(22)
+                .setInplaceUpdateNumLocks(160)
+                .setDisableAutoCompactions(true)
+                .setSoftPendingCompactionBytesLimit(1124)
+                .setHardPendingCompactionBytesLimit(2800)
+                .setLevel0FileNumCompactionTrigger(2000)
+                .setLevel0SlowdownWritesTrigger(5840)
+                .setLevel0StopWritesTrigger(31200)
+                .setMaxCompactionBytes(112)
+                .setTargetFileSizeBase(999)
+                .setTargetFileSizeMultiplier(1120);
+        db.setOptions(columnFamilyHandle2, mutableColumnFamilyOptions2.build());
+
+        // Check the getOptions() brings back the creation options for CF2
+        final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder2 =
+            db.getOptions(columnFamilyHandle2);
+        assertThat(builder2.enableBlobFiles()).isEqualTo(false);
+        assertThat(builder2.minBlobSize()).isEqualTo(minBlobSize);
+        assertThat(builder2.arenaBlockSize()).isEqualTo(42);
+        assertThat(builder2.memtablePrefixBloomSizeRatio()).isEqualTo(0.236);
+        assertThat(builder2.experimentalMempurgeThreshold()).isEqualTo(0.247);
+        assertThat(builder2.memtableWholeKeyFiltering()).isEqualTo(true);
+        assertThat(builder2.memtableHugePageSize()).isEqualTo(8);
+        assertThat(builder2.maxSuccessiveMerges()).isEqualTo(12);
+        assertThat(builder2.maxWriteBufferNumber()).isEqualTo(22);
+        assertThat(builder2.inplaceUpdateNumLocks()).isEqualTo(160);
+        assertThat(builder2.disableAutoCompactions()).isEqualTo(true);
+        assertThat(builder2.softPendingCompactionBytesLimit()).isEqualTo(1124);
+        assertThat(builder2.hardPendingCompactionBytesLimit()).isEqualTo(2800);
+        assertThat(builder2.level0FileNumCompactionTrigger()).isEqualTo(2000);
+        assertThat(builder2.level0SlowdownWritesTrigger()).isEqualTo(5840);
+        assertThat(builder2.level0StopWritesTrigger()).isEqualTo(31200);
+        assertThat(builder2.maxCompactionBytes()).isEqualTo(112);
+        assertThat(builder2.targetFileSizeBase()).isEqualTo(999);
+        assertThat(builder2.targetFileSizeMultiplier()).isEqualTo(1120);
+      }
+    }
+  }
+
+  /**
+   * Validate the round-trip of  blob options into and out of the C++ core of RocksDB
+   * From {RocksDB#setOptions} to {RocksDB#getOptions}
+   * Uses 2x column families with different values for their options.
+   * NOTE that some constraints are applied to the options in the C++ core,
+   * e.g. on {ColumnFamilyOptions#setMemtablePrefixBloomSizeRatio}
+   *
+   * @throws RocksDBException if a database access has an error
+   */
+  @Test
+  public void testGetMutableBlobOptionsAfterSet() throws RocksDBException {
+    final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions();
+    final ColumnFamilyDescriptor columnFamilyDescriptor0 =
+        new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0);
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+        Collections.singletonList(columnFamilyDescriptor0);
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+    try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      final MutableColumnFamilyOptions
+          .MutableColumnFamilyOptionsBuilder mutableColumnFamilyOptions =
+          MutableColumnFamilyOptions.builder()
+              .setMinBlobSize(minBlobSize)
+              .setEnableBlobFiles(true)
+              .setBlobGarbageCollectionAgeCutoff(0.25)
+              .setBlobGarbageCollectionForceThreshold(0.80)
+              .setBlobCompactionReadaheadSize(131072)
+              .setBlobFileStartingLevel(4)
+              .setArenaBlockSize(42)
+              .setMemtablePrefixBloomSizeRatio(0.17)
+              .setExperimentalMempurgeThreshold(0.005)
+              .setMemtableWholeKeyFiltering(false)
+              .setMemtableHugePageSize(3)
+              .setMaxSuccessiveMerges(4)
+              .setMaxWriteBufferNumber(12)
+              .setInplaceUpdateNumLocks(16)
+              .setDisableAutoCompactions(false)
+              .setSoftPendingCompactionBytesLimit(112)
+              .setHardPendingCompactionBytesLimit(280)
+              .setLevel0FileNumCompactionTrigger(200)
+              .setLevel0SlowdownWritesTrigger(312)
+              .setLevel0StopWritesTrigger(584)
+              .setMaxCompactionBytes(12)
+              .setTargetFileSizeBase(99)
+              .setTargetFileSizeMultiplier(112);
+      db.setOptions(mutableColumnFamilyOptions.build());
+
+      // Check the getOptions() brings back the creation options for CF1
+      final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 = db.getOptions();
+      assertThat(builder1.enableBlobFiles()).isEqualTo(true);
+      assertThat(builder1.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25);
+      assertThat(builder1.blobGarbageCollectionForceThreshold()).isEqualTo(0.80);
+      assertThat(builder1.blobCompactionReadaheadSize()).isEqualTo(131072);
+      assertThat(builder1.blobFileStartingLevel()).isEqualTo(4);
+      assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize);
+      assertThat(builder1.arenaBlockSize()).isEqualTo(42);
+      assertThat(builder1.memtablePrefixBloomSizeRatio()).isEqualTo(0.17);
+      assertThat(builder1.experimentalMempurgeThreshold()).isEqualTo(0.005);
+      assertThat(builder1.memtableWholeKeyFiltering()).isEqualTo(false);
+      assertThat(builder1.memtableHugePageSize()).isEqualTo(3);
+      assertThat(builder1.maxSuccessiveMerges()).isEqualTo(4);
+      assertThat(builder1.maxWriteBufferNumber()).isEqualTo(12);
+      assertThat(builder1.inplaceUpdateNumLocks()).isEqualTo(16);
+      assertThat(builder1.disableAutoCompactions()).isEqualTo(false);
+      assertThat(builder1.softPendingCompactionBytesLimit()).isEqualTo(112);
+      assertThat(builder1.hardPendingCompactionBytesLimit()).isEqualTo(280);
+      assertThat(builder1.level0FileNumCompactionTrigger()).isEqualTo(200);
+      assertThat(builder1.level0SlowdownWritesTrigger()).isEqualTo(312);
+      assertThat(builder1.level0StopWritesTrigger()).isEqualTo(584);
+      assertThat(builder1.maxCompactionBytes()).isEqualTo(12);
+      assertThat(builder1.targetFileSizeBase()).isEqualTo(99);
+      assertThat(builder1.targetFileSizeMultiplier()).isEqualTo(112);
+    }
+  }
+
+  @Test
+  public void testGetMutableDBOptionsAfterSet() throws RocksDBException {
+    final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions();
+    final ColumnFamilyDescriptor columnFamilyDescriptor0 =
+        new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0);
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+        Collections.singletonList(columnFamilyDescriptor0);
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+    try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      final MutableDBOptions.MutableDBOptionsBuilder mutableDBOptions =
+          MutableDBOptions.builder()
+              .setMaxBackgroundJobs(16)
+              .setAvoidFlushDuringShutdown(true)
+              .setWritableFileMaxBufferSize(2097152)
+              .setDelayedWriteRate(67108864)
+              .setMaxTotalWalSize(16777216)
+              .setDeleteObsoleteFilesPeriodMicros(86400000000L)
+              .setStatsDumpPeriodSec(1200)
+              .setStatsPersistPeriodSec(7200)
+              .setStatsHistoryBufferSize(6291456)
+              .setMaxOpenFiles(8)
+              .setBytesPerSync(4194304)
+              .setWalBytesPerSync(1048576)
+              .setStrictBytesPerSync(true)
+              .setCompactionReadaheadSize(1024);
+
+      db.setDBOptions(mutableDBOptions.build());
+
+      final MutableDBOptions.MutableDBOptionsBuilder getBuilder = db.getDBOptions();
+      assertThat(getBuilder.maxBackgroundJobs()).isEqualTo(16); // 4
+      assertThat(getBuilder.avoidFlushDuringShutdown()).isEqualTo(true); // false
+      assertThat(getBuilder.writableFileMaxBufferSize()).isEqualTo(2097152); // 1048576
+      assertThat(getBuilder.delayedWriteRate()).isEqualTo(67108864); // 16777216
+      assertThat(getBuilder.maxTotalWalSize()).isEqualTo(16777216);
+      assertThat(getBuilder.deleteObsoleteFilesPeriodMicros())
+          .isEqualTo(86400000000L); // 21600000000
+      assertThat(getBuilder.statsDumpPeriodSec()).isEqualTo(1200); // 600
+      assertThat(getBuilder.statsPersistPeriodSec()).isEqualTo(7200); // 600
+      assertThat(getBuilder.statsHistoryBufferSize()).isEqualTo(6291456); // 1048576
+      assertThat(getBuilder.maxOpenFiles()).isEqualTo(8); //-1
+      assertThat(getBuilder.bytesPerSync()).isEqualTo(4194304); // 1048576
+      assertThat(getBuilder.walBytesPerSync()).isEqualTo(1048576); // 0
+      assertThat(getBuilder.strictBytesPerSync()).isEqualTo(true); // false
+      assertThat(getBuilder.compactionReadaheadSize()).isEqualTo(1024); // 0
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java
new file mode 100644
index 000000000..970e58c0c
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java
@@ -0,0 +1,95 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.*;
+import java.util.Comparator;
+
+import static org.junit.Assert.assertEquals;
+
+public class NativeComparatorWrapperTest {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private static final Random random = new Random();
+
+  @Test
+  public void rountrip() throws RocksDBException {
+    final String dbPath = dbFolder.getRoot().getAbsolutePath();
+    final int ITERATIONS = 1_000;
+
+    final String[] storedKeys = new String[ITERATIONS];
+    try (final NativeStringComparatorWrapper comparator = new NativeStringComparatorWrapper();
+        final Options opt = new Options()
+        .setCreateIfMissing(true)
+        .setComparator(comparator)) {
+
+      // store random integer keys
+      try (final RocksDB db = RocksDB.open(opt, dbPath)) {
+        for (int i = 0; i < ITERATIONS; i++) {
+          final String strKey = randomString();
+          final byte key[] = strKey.getBytes();
+          // does key already exist (avoid duplicates)
+          if (i > 0 && db.get(key) != null) {
+            i--; // generate a different key
+          } else {
+            db.put(key, "value".getBytes());
+            storedKeys[i] = strKey;
+          }
+        }
+      }
+
+      // sort the stored keys into ascending alpha-numeric order
+      Arrays.sort(storedKeys, new Comparator<String>() {
+        @Override
+        public int compare(final String o1, final String o2) {
+          return o1.compareTo(o2);
+        }
+      });
+
+      // re-open db and read from start to end
+      // string keys should be in ascending
+      // order
+      try (final RocksDB db = RocksDB.open(opt, dbPath);
+           final RocksIterator it = db.newIterator()) {
+        int count = 0;
+        for (it.seekToFirst(); it.isValid(); it.next()) {
+          final String strKey = new String(it.key());
+          assertEquals(storedKeys[count++], strKey);
+        }
+      }
+    }
+  }
+
+  private String randomString() {
+    final char[] chars = new char[12];
+    for(int i = 0; i < 12; i++) {
+      final int letterCode = random.nextInt(24);
+      final char letter = (char) (((int) 'a') + letterCode);
+      chars[i] = letter;
+    }
+    return String.copyValueOf(chars);
+  }
+
+  public static class NativeStringComparatorWrapper
+      extends NativeComparatorWrapper {
+
+    @Override
+    protected long initializeNative(final long... nativeParameterHandles) {
+      return newStringComparator();
+    }
+
+    private native long newStringComparator();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java
new file mode 100644
index 000000000..ab60081a0
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java
@@ -0,0 +1,41 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.util.Environment;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.*;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class NativeLibraryLoaderTest {
+
+  @Rule
+  public TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+  @Test
+  public void tempFolder() throws IOException {
+    NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp(
+        temporaryFolder.getRoot().getAbsolutePath());
+    final Path path = Paths.get(temporaryFolder.getRoot().getAbsolutePath(),
+        Environment.getJniLibraryFileName("rocksdb"));
+    assertThat(Files.exists(path)).isTrue();
+    assertThat(Files.isReadable(path)).isTrue();
+  }
+
+  @Test
+  public void overridesExistingLibrary() throws IOException {
+    File first = NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp(
+        temporaryFolder.getRoot().getAbsolutePath());
+    NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp(
+        temporaryFolder.getRoot().getAbsolutePath());
+    assertThat(first.exists()).isTrue();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java
new file mode 100644
index 000000000..519b70b1d
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java
@@ -0,0 +1,131 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class OptimisticTransactionDBTest {
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void open() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(options,
+                 dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(otdb).isNotNull();
+    }
+  }
+
+  @Test
+  public void open_columnFamilies() throws RocksDBException {
+    try(final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true)
+          .setCreateMissingColumnFamilies(true);
+        final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) {
+
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("myCf".getBytes(), myCfOpts));
+
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+      try (final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(dbOptions,
+               dbFolder.getRoot().getAbsolutePath(),
+               columnFamilyDescriptors, columnFamilyHandles)) {
+        try {
+          assertThat(otdb).isNotNull();
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void beginTransaction() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(
+             options, dbFolder.getRoot().getAbsolutePath());
+        final WriteOptions writeOptions = new WriteOptions()) {
+
+      try(final Transaction txn = otdb.beginTransaction(writeOptions)) {
+        assertThat(txn).isNotNull();
+      }
+    }
+  }
+
+  @Test
+  public void beginTransaction_transactionOptions() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(
+             options, dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions writeOptions = new WriteOptions();
+         final OptimisticTransactionOptions optimisticTxnOptions =
+             new OptimisticTransactionOptions()) {
+
+      try(final Transaction txn = otdb.beginTransaction(writeOptions,
+          optimisticTxnOptions)) {
+        assertThat(txn).isNotNull();
+      }
+    }
+  }
+
+  @Test
+  public void beginTransaction_withOld() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(
+             options, dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions writeOptions = new WriteOptions()) {
+
+      try(final Transaction txn = otdb.beginTransaction(writeOptions)) {
+        final Transaction txnReused = otdb.beginTransaction(writeOptions, txn);
+        assertThat(txnReused).isSameAs(txn);
+      }
+    }
+  }
+
+  @Test
+  public void beginTransaction_withOld_transactionOptions()
+      throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(
+             options, dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions writeOptions = new WriteOptions();
+         final OptimisticTransactionOptions optimisticTxnOptions =
+             new OptimisticTransactionOptions()) {
+
+      try(final Transaction txn = otdb.beginTransaction(writeOptions)) {
+        final Transaction txnReused = otdb.beginTransaction(writeOptions,
+            optimisticTxnOptions, txn);
+        assertThat(txnReused).isSameAs(txn);
+      }
+    }
+  }
+
+  @Test
+  public void baseDB() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(otdb).isNotNull();
+      final RocksDB db = otdb.getBaseDB();
+      assertThat(db).isNotNull();
+      assertThat(db.isOwningHandle()).isFalse();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java
new file mode 100644
index 000000000..ef656b958
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+import org.rocksdb.util.BytewiseComparator;
+
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class OptimisticTransactionOptionsTest {
+
+  private static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void setSnapshot() {
+    try (final OptimisticTransactionOptions opt = new OptimisticTransactionOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setSetSnapshot(boolValue);
+      assertThat(opt.isSetSnapshot()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void comparator() {
+    try (final OptimisticTransactionOptions opt = new OptimisticTransactionOptions();
+         final ComparatorOptions copt = new ComparatorOptions()
+             .setUseDirectBuffer(true);
+         final AbstractComparator comparator = new BytewiseComparator(copt)) {
+      opt.setComparator(comparator);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java
new file mode 100644
index 000000000..d2f92e1ff
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java
@@ -0,0 +1,446 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.*;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.Test;
+
+public class OptimisticTransactionTest extends AbstractTransactionTest {
+  @Test
+  public void prepare_commit() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        txn.commit();
+      }
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v12);
+        txn.prepare();
+
+        failBecauseExceptionWasNotThrown(RocksDBException.class);
+      } catch (final RocksDBException e) {
+        assertThat(e.getMessage())
+            .contains("Two phase commit not supported for optimistic transactions");
+      }
+    }
+  }
+
+  @Test
+  public void getForUpdate_cf_conflict() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(testCf, k1, v1);
+        assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1);
+
+          // NOTE: txn2 updates k1, during txn3
+          txn2.put(testCf, k1, v12);
+          assertThat(txn2.get(testCf, readOptions, k1)).isEqualTo(v12);
+          txn2.commit();
+
+          try {
+            txn3.commit(); // should cause an exception!
+          } catch(final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting" +
+          "transactions");
+    }
+  }
+
+  @Test
+  public void getForUpdate_conflict() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.getForUpdate(readOptions, k1, true)).isEqualTo(v1);
+
+          // NOTE: txn2 updates k1, during txn3
+          txn2.put(k1, v12);
+          assertThat(txn2.get(readOptions, k1)).isEqualTo(v12);
+          txn2.commit();
+
+          try {
+            txn3.commit(); // should cause an exception!
+          } catch(final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting" +
+          "transactions");
+    }
+  }
+
+  @Deprecated
+  @Test
+  public void multiGetForUpdate_cf_conflict() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+    final byte[] otherValue = "otherValue".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf);
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(testCf, keys[0], values[0]);
+        txn.put(testCf, keys[1], values[1]);
+        assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.multiGetForUpdate(readOptions, cfList, keys))
+              .isEqualTo(values);
+
+          // NOTE: txn2 updates k1, during txn3
+          txn2.put(testCf, keys[0], otherValue);
+          assertThat(txn2.get(testCf, readOptions, keys[0]))
+              .isEqualTo(otherValue);
+          txn2.commit();
+
+          try {
+            txn3.commit(); // should cause an exception!
+          } catch(final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting" +
+          "transactions");
+    }
+  }
+
+  @Test
+  public void multiGetAsListForUpdate_cf_conflict() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+    final byte[] otherValue = "otherValue".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf);
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(testCf, keys[0], values[0]);
+        txn.put(testCf, keys[1], values[1]);
+        assertThat(txn.multiGetAsList(readOptions, cfList, Arrays.asList(keys)))
+            .containsExactly(values);
+        txn.commit();
+      }
+
+      try (final Transaction txn2 = dbContainer.beginTransaction()) {
+        try (final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.multiGetForUpdateAsList(readOptions, cfList, Arrays.asList(keys)))
+              .containsExactly(values);
+
+          // NOTE: txn2 updates k1, during txn3
+          txn2.put(testCf, keys[0], otherValue);
+          assertThat(txn2.get(testCf, readOptions, keys[0])).isEqualTo(otherValue);
+          txn2.commit();
+
+          try {
+            txn3.commit(); // should cause an exception!
+          } catch (final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting"
+          + "transactions");
+    }
+  }
+
+  @Deprecated
+  @Test
+  public void multiGetForUpdate_conflict() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+    final byte[] otherValue = "otherValue".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(keys[0], values[0]);
+        txn.put(keys[1], values[1]);
+        assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.multiGetForUpdate(readOptions, keys))
+              .isEqualTo(values);
+
+          // NOTE: txn2 updates k1, during txn3
+          txn2.put(keys[0], otherValue);
+          assertThat(txn2.get(readOptions, keys[0]))
+              .isEqualTo(otherValue);
+          txn2.commit();
+
+          try {
+            txn3.commit(); // should cause an exception!
+          } catch(final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting" +
+          "transactions");
+    }
+  }
+
+  @Test
+  public void multiGetasListForUpdate_conflict() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+    final byte[] otherValue = "otherValue".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(keys[0], values[0]);
+        txn.put(keys[1], values[1]);
+        assertThat(txn.multiGetAsList(readOptions, Arrays.asList(keys))).containsExactly(values);
+        txn.commit();
+      }
+
+      try (final Transaction txn2 = dbContainer.beginTransaction()) {
+        try (final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.multiGetForUpdateAsList(readOptions, Arrays.asList(keys)))
+              .containsExactly(values);
+
+          // NOTE: txn2 updates k1, during txn3
+          txn2.put(keys[0], otherValue);
+          assertThat(txn2.get(readOptions, keys[0])).isEqualTo(otherValue);
+          txn2.commit();
+
+          try {
+            txn3.commit(); // should cause an exception!
+          } catch (final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting"
+          + "transactions");
+    }
+  }
+
+  @Test
+  public void undoGetForUpdate_cf_conflict() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(testCf, k1, v1);
+        assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1);
+
+          // undo the getForUpdate
+          txn3.undoGetForUpdate(testCf, k1);
+
+          // NOTE: txn2 updates k1, during txn3
+          txn2.put(testCf, k1, v12);
+          assertThat(txn2.get(testCf, readOptions, k1)).isEqualTo(v12);
+          txn2.commit();
+
+          // should not cause an exception
+          // because we undid the getForUpdate above!
+          txn3.commit();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void undoGetForUpdate_conflict() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.getForUpdate(readOptions, k1, true)).isEqualTo(v1);
+
+          // undo the getForUpdate
+          txn3.undoGetForUpdate(k1);
+
+          // NOTE: txn2 updates k1, during txn3
+          txn2.put(k1, v12);
+          assertThat(txn2.get(readOptions, k1)).isEqualTo(v12);
+          txn2.commit();
+
+          // should not cause an exception
+          // because we undid the getForUpdate above!
+          txn3.commit();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void name() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.getName()).isEmpty();
+      final String name = "my-transaction-" + rand.nextLong();
+
+      try {
+        txn.setName(name);
+        fail("Optimistic transactions cannot be named.");
+      } catch(final RocksDBException e) {
+        assertThat(e.getStatus().getCode()).isEqualTo(Status.Code.InvalidArgument);
+      }
+    }
+  }
+
+  @Override
+  public OptimisticTransactionDBContainer startDb()
+      throws RocksDBException {
+    final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+
+    final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions();
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+        Arrays.asList(
+            new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY,
+                columnFamilyOptions));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+    final OptimisticTransactionDB optimisticTxnDb;
+    try {
+      optimisticTxnDb = OptimisticTransactionDB.open(
+          options, dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors, columnFamilyHandles);
+    } catch(final RocksDBException e) {
+      columnFamilyOptions.close();
+      options.close();
+      throw e;
+    }
+
+    final WriteOptions writeOptions = new WriteOptions();
+    final OptimisticTransactionOptions optimisticTxnOptions =
+             new OptimisticTransactionOptions();
+
+    return new OptimisticTransactionDBContainer(optimisticTxnOptions,
+        writeOptions, columnFamilyHandles, optimisticTxnDb, columnFamilyOptions,
+        options);
+  }
+
+  private static class OptimisticTransactionDBContainer
+      extends DBContainer {
+
+    private final OptimisticTransactionOptions optimisticTxnOptions;
+    private final OptimisticTransactionDB optimisticTxnDb;
+
+    public OptimisticTransactionDBContainer(
+        final OptimisticTransactionOptions optimisticTxnOptions,
+        final WriteOptions writeOptions,
+        final List<ColumnFamilyHandle> columnFamilyHandles,
+        final OptimisticTransactionDB optimisticTxnDb,
+        final ColumnFamilyOptions columnFamilyOptions,
+        final DBOptions options) {
+      super(writeOptions, columnFamilyHandles, columnFamilyOptions,
+          options);
+      this.optimisticTxnOptions = optimisticTxnOptions;
+      this.optimisticTxnDb = optimisticTxnDb;
+    }
+
+    @Override
+    public Transaction beginTransaction() {
+      return optimisticTxnDb.beginTransaction(writeOptions,
+          optimisticTxnOptions);
+    }
+
+    @Override
+    public Transaction beginTransaction(final WriteOptions writeOptions) {
+      return optimisticTxnDb.beginTransaction(writeOptions,
+          optimisticTxnOptions);
+    }
+
+    @Override
+    public void close() {
+      optimisticTxnOptions.close();
+      writeOptions.close();
+      for(final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+      optimisticTxnDb.close();
+      options.close();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java
new file mode 100644
index 000000000..129f1c39a
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java
@@ -0,0 +1,1492 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory;
+
+public class OptionsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void copyConstructor() {
+    Options origOpts = new Options();
+    origOpts.setNumLevels(rand.nextInt(8));
+    origOpts.setTargetFileSizeMultiplier(rand.nextInt(100));
+    origOpts.setLevel0StopWritesTrigger(rand.nextInt(50));
+    Options copyOpts = new Options(origOpts);
+    assertThat(origOpts.numLevels()).isEqualTo(copyOpts.numLevels());
+    assertThat(origOpts.targetFileSizeMultiplier()).isEqualTo(copyOpts.targetFileSizeMultiplier());
+    assertThat(origOpts.level0StopWritesTrigger()).isEqualTo(copyOpts.level0StopWritesTrigger());
+  }
+
+  @Test
+  public void setIncreaseParallelism() {
+    try (final Options opt = new Options()) {
+      final int threads = Runtime.getRuntime().availableProcessors() * 2;
+      opt.setIncreaseParallelism(threads);
+    }
+  }
+
+  @Test
+  public void writeBufferSize() throws RocksDBException {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setWriteBufferSize(longValue);
+      assertThat(opt.writeBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void maxWriteBufferNumber() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxWriteBufferNumber(intValue);
+      assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void minWriteBufferNumberToMerge() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setMinWriteBufferNumberToMerge(intValue);
+      assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void numLevels() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setNumLevels(intValue);
+      assertThat(opt.numLevels()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void levelZeroFileNumCompactionTrigger() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setLevelZeroFileNumCompactionTrigger(intValue);
+      assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void levelZeroSlowdownWritesTrigger() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setLevelZeroSlowdownWritesTrigger(intValue);
+      assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void levelZeroStopWritesTrigger() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setLevelZeroStopWritesTrigger(intValue);
+      assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void targetFileSizeBase() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setTargetFileSizeBase(longValue);
+      assertThat(opt.targetFileSizeBase()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void targetFileSizeMultiplier() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setTargetFileSizeMultiplier(intValue);
+      assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelBase() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxBytesForLevelBase(longValue);
+      assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void levelCompactionDynamicLevelBytes() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setLevelCompactionDynamicLevelBytes(boolValue);
+      assertThat(opt.levelCompactionDynamicLevelBytes())
+          .isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelMultiplier() {
+    try (final Options opt = new Options()) {
+      final double doubleValue = rand.nextDouble();
+      opt.setMaxBytesForLevelMultiplier(doubleValue);
+      assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(doubleValue);
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelMultiplierAdditional() {
+    try (final Options opt = new Options()) {
+      final int intValue1 = rand.nextInt();
+      final int intValue2 = rand.nextInt();
+      final int[] ints = new int[]{intValue1, intValue2};
+      opt.setMaxBytesForLevelMultiplierAdditional(ints);
+      assertThat(opt.maxBytesForLevelMultiplierAdditional()).isEqualTo(ints);
+    }
+  }
+
+  @Test
+  public void maxCompactionBytes() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxCompactionBytes(longValue);
+      assertThat(opt.maxCompactionBytes()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void softPendingCompactionBytesLimit() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setSoftPendingCompactionBytesLimit(longValue);
+      assertThat(opt.softPendingCompactionBytesLimit()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void hardPendingCompactionBytesLimit() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setHardPendingCompactionBytesLimit(longValue);
+      assertThat(opt.hardPendingCompactionBytesLimit()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void level0FileNumCompactionTrigger() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setLevel0FileNumCompactionTrigger(intValue);
+      assertThat(opt.level0FileNumCompactionTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void level0SlowdownWritesTrigger() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setLevel0SlowdownWritesTrigger(intValue);
+      assertThat(opt.level0SlowdownWritesTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void level0StopWritesTrigger() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setLevel0StopWritesTrigger(intValue);
+      assertThat(opt.level0StopWritesTrigger()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void arenaBlockSize() throws RocksDBException {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setArenaBlockSize(longValue);
+      assertThat(opt.arenaBlockSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void disableAutoCompactions() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setDisableAutoCompactions(boolValue);
+      assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void maxSequentialSkipInIterations() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxSequentialSkipInIterations(longValue);
+      assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void inplaceUpdateSupport() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setInplaceUpdateSupport(boolValue);
+      assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void inplaceUpdateNumLocks() throws RocksDBException {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setInplaceUpdateNumLocks(longValue);
+      assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void memtablePrefixBloomSizeRatio() {
+    try (final Options opt = new Options()) {
+      final double doubleValue = rand.nextDouble();
+      opt.setMemtablePrefixBloomSizeRatio(doubleValue);
+      assertThat(opt.memtablePrefixBloomSizeRatio()).isEqualTo(doubleValue);
+    }
+  }
+
+  @Test
+  public void experimentalMempurgeThreshold() {
+    try (final Options opt = new Options()) {
+      final double doubleValue = rand.nextDouble();
+      opt.setExperimentalMempurgeThreshold(doubleValue);
+      assertThat(opt.experimentalMempurgeThreshold()).isEqualTo(doubleValue);
+    }
+  }
+
+  @Test
+  public void memtableWholeKeyFiltering() {
+    try (final Options opt = new Options()) {
+      final boolean booleanValue = rand.nextBoolean();
+      opt.setMemtableWholeKeyFiltering(booleanValue);
+      assertThat(opt.memtableWholeKeyFiltering()).isEqualTo(booleanValue);
+    }
+  }
+
+  @Test
+  public void memtableHugePageSize() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setMemtableHugePageSize(longValue);
+      assertThat(opt.memtableHugePageSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void bloomLocality() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setBloomLocality(intValue);
+      assertThat(opt.bloomLocality()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxSuccessiveMerges() throws RocksDBException {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxSuccessiveMerges(longValue);
+      assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void optimizeFiltersForHits() {
+    try (final Options opt = new Options()) {
+      final boolean aBoolean = rand.nextBoolean();
+      opt.setOptimizeFiltersForHits(aBoolean);
+      assertThat(opt.optimizeFiltersForHits()).isEqualTo(aBoolean);
+    }
+  }
+
+  @Test
+  public void createIfMissing() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setCreateIfMissing(boolValue);
+      assertThat(opt.createIfMissing()).
+          isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void createMissingColumnFamilies() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setCreateMissingColumnFamilies(boolValue);
+      assertThat(opt.createMissingColumnFamilies()).
+          isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void errorIfExists() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setErrorIfExists(boolValue);
+      assertThat(opt.errorIfExists()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void paranoidChecks() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setParanoidChecks(boolValue);
+      assertThat(opt.paranoidChecks()).
+          isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void maxTotalWalSize() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxTotalWalSize(longValue);
+      assertThat(opt.maxTotalWalSize()).
+          isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void maxOpenFiles() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxOpenFiles(intValue);
+      assertThat(opt.maxOpenFiles()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxFileOpeningThreads() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxFileOpeningThreads(intValue);
+      assertThat(opt.maxFileOpeningThreads()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void useFsync() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setUseFsync(boolValue);
+      assertThat(opt.useFsync()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void dbPaths() {
+    final List<DbPath> dbPaths = new ArrayList<>();
+    dbPaths.add(new DbPath(Paths.get("/a"), 10));
+    dbPaths.add(new DbPath(Paths.get("/b"), 100));
+    dbPaths.add(new DbPath(Paths.get("/c"), 1000));
+
+    try (final Options opt = new Options()) {
+      assertThat(opt.dbPaths()).isEqualTo(Collections.emptyList());
+
+      opt.setDbPaths(dbPaths);
+
+      assertThat(opt.dbPaths()).isEqualTo(dbPaths);
+    }
+  }
+
+  @Test
+  public void dbLogDir() {
+    try (final Options opt = new Options()) {
+      final String str = "path/to/DbLogDir";
+      opt.setDbLogDir(str);
+      assertThat(opt.dbLogDir()).isEqualTo(str);
+    }
+  }
+
+  @Test
+  public void walDir() {
+    try (final Options opt = new Options()) {
+      final String str = "path/to/WalDir";
+      opt.setWalDir(str);
+      assertThat(opt.walDir()).isEqualTo(str);
+    }
+  }
+
+  @Test
+  public void deleteObsoleteFilesPeriodMicros() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setDeleteObsoleteFilesPeriodMicros(longValue);
+      assertThat(opt.deleteObsoleteFilesPeriodMicros()).
+          isEqualTo(longValue);
+    }
+  }
+
+  @SuppressWarnings("deprecated")
+  @Test
+  public void maxBackgroundCompactions() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxBackgroundCompactions(intValue);
+      assertThat(opt.maxBackgroundCompactions()).
+          isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxSubcompactions() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxSubcompactions(intValue);
+      assertThat(opt.maxSubcompactions()).
+          isEqualTo(intValue);
+    }
+  }
+
+  @SuppressWarnings("deprecated")
+  @Test
+  public void maxBackgroundFlushes() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxBackgroundFlushes(intValue);
+      assertThat(opt.maxBackgroundFlushes()).
+          isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxBackgroundJobs() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setMaxBackgroundJobs(intValue);
+      assertThat(opt.maxBackgroundJobs()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void maxLogFileSize() throws RocksDBException {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxLogFileSize(longValue);
+      assertThat(opt.maxLogFileSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void logFileTimeToRoll() throws RocksDBException {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setLogFileTimeToRoll(longValue);
+      assertThat(opt.logFileTimeToRoll()).
+          isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void keepLogFileNum() throws RocksDBException {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setKeepLogFileNum(longValue);
+      assertThat(opt.keepLogFileNum()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void recycleLogFileNum() throws RocksDBException {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setRecycleLogFileNum(longValue);
+      assertThat(opt.recycleLogFileNum()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void maxManifestFileSize() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxManifestFileSize(longValue);
+      assertThat(opt.maxManifestFileSize()).
+          isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void tableCacheNumshardbits() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setTableCacheNumshardbits(intValue);
+      assertThat(opt.tableCacheNumshardbits()).
+          isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void walSizeLimitMB() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setWalSizeLimitMB(longValue);
+      assertThat(opt.walSizeLimitMB()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void walTtlSeconds() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setWalTtlSeconds(longValue);
+      assertThat(opt.walTtlSeconds()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void manifestPreallocationSize() throws RocksDBException {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setManifestPreallocationSize(longValue);
+      assertThat(opt.manifestPreallocationSize()).
+          isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void useDirectReads() {
+    try(final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setUseDirectReads(boolValue);
+      assertThat(opt.useDirectReads()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void useDirectIoForFlushAndCompaction() {
+    try(final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setUseDirectIoForFlushAndCompaction(boolValue);
+      assertThat(opt.useDirectIoForFlushAndCompaction()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void allowFAllocate() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllowFAllocate(boolValue);
+      assertThat(opt.allowFAllocate()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void allowMmapReads() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllowMmapReads(boolValue);
+      assertThat(opt.allowMmapReads()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void allowMmapWrites() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllowMmapWrites(boolValue);
+      assertThat(opt.allowMmapWrites()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void isFdCloseOnExec() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setIsFdCloseOnExec(boolValue);
+      assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void statsDumpPeriodSec() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setStatsDumpPeriodSec(intValue);
+      assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void statsPersistPeriodSec() {
+    try (final Options opt = new Options()) {
+      final int intValue = rand.nextInt();
+      opt.setStatsPersistPeriodSec(intValue);
+      assertThat(opt.statsPersistPeriodSec()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void statsHistoryBufferSize() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setStatsHistoryBufferSize(longValue);
+      assertThat(opt.statsHistoryBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void adviseRandomOnOpen() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAdviseRandomOnOpen(boolValue);
+      assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void dbWriteBufferSize() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setDbWriteBufferSize(longValue);
+      assertThat(opt.dbWriteBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void setWriteBufferManager() throws RocksDBException {
+    try (final Options opt = new Options();
+         final Cache cache = new LRUCache(1 * 1024 * 1024);
+         final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache)) {
+      opt.setWriteBufferManager(writeBufferManager);
+      assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager);
+    }
+  }
+
+  @Test
+  public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException {
+    try (final Options opt = new Options();
+         final Cache cache = new LRUCache(1 * 1024 * 1024);
+         final WriteBufferManager writeBufferManager = new WriteBufferManager(0l, cache)) {
+      opt.setWriteBufferManager(writeBufferManager);
+      assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager);
+    }
+  }
+
+  @Test
+  public void setWriteBufferManagerWithAllowStall() throws RocksDBException {
+    try (final Options opt = new Options(); final Cache cache = new LRUCache(1 * 1024 * 1024);
+         final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache, true)) {
+      opt.setWriteBufferManager(writeBufferManager);
+      assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager);
+      assertThat(opt.writeBufferManager().allowStall()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void accessHintOnCompactionStart() {
+    try (final Options opt = new Options()) {
+      final AccessHint accessHint = AccessHint.SEQUENTIAL;
+      opt.setAccessHintOnCompactionStart(accessHint);
+      assertThat(opt.accessHintOnCompactionStart()).isEqualTo(accessHint);
+    }
+  }
+
+  @Test
+  public void compactionReadaheadSize() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setCompactionReadaheadSize(longValue);
+      assertThat(opt.compactionReadaheadSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void randomAccessMaxBufferSize() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setRandomAccessMaxBufferSize(longValue);
+      assertThat(opt.randomAccessMaxBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void writableFileMaxBufferSize() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setWritableFileMaxBufferSize(longValue);
+      assertThat(opt.writableFileMaxBufferSize()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void useAdaptiveMutex() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setUseAdaptiveMutex(boolValue);
+      assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void bytesPerSync() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setBytesPerSync(longValue);
+      assertThat(opt.bytesPerSync()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void walBytesPerSync() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setWalBytesPerSync(longValue);
+      assertThat(opt.walBytesPerSync()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void strictBytesPerSync() {
+    try (final Options opt = new Options()) {
+      assertThat(opt.strictBytesPerSync()).isFalse();
+      opt.setStrictBytesPerSync(true);
+      assertThat(opt.strictBytesPerSync()).isTrue();
+    }
+  }
+
+  @Test
+  public void enableThreadTracking() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setEnableThreadTracking(boolValue);
+      assertThat(opt.enableThreadTracking()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void delayedWriteRate() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setDelayedWriteRate(longValue);
+      assertThat(opt.delayedWriteRate()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void enablePipelinedWrite() {
+    try(final Options opt = new Options()) {
+      assertThat(opt.enablePipelinedWrite()).isFalse();
+      opt.setEnablePipelinedWrite(true);
+      assertThat(opt.enablePipelinedWrite()).isTrue();
+    }
+  }
+
+  @Test
+  public void unordredWrite() {
+    try(final Options opt = new Options()) {
+      assertThat(opt.unorderedWrite()).isFalse();
+      opt.setUnorderedWrite(true);
+      assertThat(opt.unorderedWrite()).isTrue();
+    }
+  }
+
+  @Test
+  public void allowConcurrentMemtableWrite() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllowConcurrentMemtableWrite(boolValue);
+      assertThat(opt.allowConcurrentMemtableWrite()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void enableWriteThreadAdaptiveYield() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setEnableWriteThreadAdaptiveYield(boolValue);
+      assertThat(opt.enableWriteThreadAdaptiveYield()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void writeThreadMaxYieldUsec() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setWriteThreadMaxYieldUsec(longValue);
+      assertThat(opt.writeThreadMaxYieldUsec()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void writeThreadSlowYieldUsec() {
+    try (final Options opt = new Options()) {
+      final long longValue = rand.nextLong();
+      opt.setWriteThreadSlowYieldUsec(longValue);
+      assertThat(opt.writeThreadSlowYieldUsec()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void skipStatsUpdateOnDbOpen() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setSkipStatsUpdateOnDbOpen(boolValue);
+      assertThat(opt.skipStatsUpdateOnDbOpen()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void walRecoveryMode() {
+    try (final Options opt = new Options()) {
+      for (final WALRecoveryMode walRecoveryMode : WALRecoveryMode.values()) {
+        opt.setWalRecoveryMode(walRecoveryMode);
+        assertThat(opt.walRecoveryMode()).isEqualTo(walRecoveryMode);
+      }
+    }
+  }
+
+  @Test
+  public void allow2pc() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAllow2pc(boolValue);
+      assertThat(opt.allow2pc()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void rowCache() {
+    try (final Options opt = new Options()) {
+      assertThat(opt.rowCache()).isNull();
+
+      try(final Cache lruCache = new LRUCache(1000)) {
+        opt.setRowCache(lruCache);
+        assertThat(opt.rowCache()).isEqualTo(lruCache);
+      }
+
+      try(final Cache clockCache = new ClockCache(1000)) {
+        opt.setRowCache(clockCache);
+        assertThat(opt.rowCache()).isEqualTo(clockCache);
+      }
+    }
+  }
+
+  @Test
+  public void walFilter() {
+    try (final Options opt = new Options()) {
+      assertThat(opt.walFilter()).isNull();
+
+      try (final AbstractWalFilter walFilter = new AbstractWalFilter() {
+        @Override
+        public void columnFamilyLogNumberMap(
+            final Map<Integer, Long> cfLognumber,
+            final Map<String, Integer> cfNameId) {
+          // no-op
+        }
+
+        @Override
+        public LogRecordFoundResult logRecordFound(final long logNumber,
+            final String logFileName, final WriteBatch batch,
+            final WriteBatch newBatch) {
+          return new LogRecordFoundResult(
+              WalProcessingOption.CONTINUE_PROCESSING, false);
+        }
+
+        @Override
+        public String name() {
+          return "test-wal-filter";
+        }
+      }) {
+        opt.setWalFilter(walFilter);
+        assertThat(opt.walFilter()).isEqualTo(walFilter);
+      }
+    }
+  }
+
+  @Test
+  public void failIfOptionsFileError() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setFailIfOptionsFileError(boolValue);
+      assertThat(opt.failIfOptionsFileError()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void dumpMallocStats() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setDumpMallocStats(boolValue);
+      assertThat(opt.dumpMallocStats()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void avoidFlushDuringRecovery() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAvoidFlushDuringRecovery(boolValue);
+      assertThat(opt.avoidFlushDuringRecovery()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void avoidFlushDuringShutdown() {
+    try (final Options opt = new Options()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setAvoidFlushDuringShutdown(boolValue);
+      assertThat(opt.avoidFlushDuringShutdown()).isEqualTo(boolValue);
+    }
+  }
+
+
+  @Test
+  public void allowIngestBehind() {
+    try (final Options opt = new Options()) {
+      assertThat(opt.allowIngestBehind()).isFalse();
+      opt.setAllowIngestBehind(true);
+      assertThat(opt.allowIngestBehind()).isTrue();
+    }
+  }
+
+  @Test
+  public void twoWriteQueues() {
+    try (final Options opt = new Options()) {
+      assertThat(opt.twoWriteQueues()).isFalse();
+      opt.setTwoWriteQueues(true);
+      assertThat(opt.twoWriteQueues()).isTrue();
+    }
+  }
+
+  @Test
+  public void manualWalFlush() {
+    try (final Options opt = new Options()) {
+      assertThat(opt.manualWalFlush()).isFalse();
+      opt.setManualWalFlush(true);
+      assertThat(opt.manualWalFlush()).isTrue();
+    }
+  }
+
+  @Test
+  public void atomicFlush() {
+    try (final Options opt = new Options()) {
+      assertThat(opt.atomicFlush()).isFalse();
+      opt.setAtomicFlush(true);
+      assertThat(opt.atomicFlush()).isTrue();
+    }
+  }
+
+  @Test
+  public void env() {
+    try (final Options options = new Options();
+         final Env env = Env.getDefault()) {
+      options.setEnv(env);
+      assertThat(options.getEnv()).isSameAs(env);
+    }
+  }
+
+  @Test
+  public void linkageOfPrepMethods() {
+    try (final Options options = new Options()) {
+      options.optimizeUniversalStyleCompaction();
+      options.optimizeUniversalStyleCompaction(4000);
+      options.optimizeLevelStyleCompaction();
+      options.optimizeLevelStyleCompaction(3000);
+      options.optimizeForPointLookup(10);
+      options.optimizeForSmallDb();
+      options.prepareForBulkLoad();
+    }
+  }
+
+  @Test
+  public void compressionTypes() {
+    try (final Options options = new Options()) {
+      for (final CompressionType compressionType :
+          CompressionType.values()) {
+        options.setCompressionType(compressionType);
+        assertThat(options.compressionType()).
+            isEqualTo(compressionType);
+        assertThat(CompressionType.valueOf("NO_COMPRESSION")).
+            isEqualTo(CompressionType.NO_COMPRESSION);
+      }
+    }
+  }
+
+  @Test
+  public void prepopulateBlobCache() {
+    try (final Options options = new Options()) {
+      for (final PrepopulateBlobCache prepopulateBlobCache : PrepopulateBlobCache.values()) {
+        options.setPrepopulateBlobCache(prepopulateBlobCache);
+        assertThat(options.prepopulateBlobCache()).isEqualTo(prepopulateBlobCache);
+        assertThat(PrepopulateBlobCache.valueOf("PREPOPULATE_BLOB_DISABLE"))
+            .isEqualTo(PrepopulateBlobCache.PREPOPULATE_BLOB_DISABLE);
+      }
+    }
+  }
+
+  @Test
+  public void compressionPerLevel() {
+    try (final Options options = new Options()) {
+      assertThat(options.compressionPerLevel()).isEmpty();
+      List<CompressionType> compressionTypeList =
+          new ArrayList<>();
+      for (int i = 0; i < options.numLevels(); i++) {
+        compressionTypeList.add(CompressionType.NO_COMPRESSION);
+      }
+      options.setCompressionPerLevel(compressionTypeList);
+      compressionTypeList = options.compressionPerLevel();
+      for (final CompressionType compressionType : compressionTypeList) {
+        assertThat(compressionType).isEqualTo(
+            CompressionType.NO_COMPRESSION);
+      }
+    }
+  }
+
+  @Test
+  public void differentCompressionsPerLevel() {
+    try (final Options options = new Options()) {
+      options.setNumLevels(3);
+
+      assertThat(options.compressionPerLevel()).isEmpty();
+      List<CompressionType> compressionTypeList = new ArrayList<>();
+
+      compressionTypeList.add(CompressionType.BZLIB2_COMPRESSION);
+      compressionTypeList.add(CompressionType.SNAPPY_COMPRESSION);
+      compressionTypeList.add(CompressionType.LZ4_COMPRESSION);
+
+      options.setCompressionPerLevel(compressionTypeList);
+      compressionTypeList = options.compressionPerLevel();
+
+      assertThat(compressionTypeList.size()).isEqualTo(3);
+      assertThat(compressionTypeList).
+          containsExactly(
+              CompressionType.BZLIB2_COMPRESSION,
+              CompressionType.SNAPPY_COMPRESSION,
+              CompressionType.LZ4_COMPRESSION);
+
+    }
+  }
+
+  @Test
+  public void bottommostCompressionType() {
+    try (final Options options = new Options()) {
+      assertThat(options.bottommostCompressionType())
+          .isEqualTo(CompressionType.DISABLE_COMPRESSION_OPTION);
+
+      for (final CompressionType compressionType : CompressionType.values()) {
+        options.setBottommostCompressionType(compressionType);
+        assertThat(options.bottommostCompressionType())
+            .isEqualTo(compressionType);
+      }
+    }
+  }
+
+  @Test
+  public void bottommostCompressionOptions() {
+    try (final Options options = new Options();
+         final CompressionOptions bottommostCompressionOptions = new CompressionOptions()
+             .setMaxDictBytes(123)) {
+
+      options.setBottommostCompressionOptions(bottommostCompressionOptions);
+      assertThat(options.bottommostCompressionOptions())
+          .isEqualTo(bottommostCompressionOptions);
+      assertThat(options.bottommostCompressionOptions().maxDictBytes())
+          .isEqualTo(123);
+    }
+  }
+
+  @Test
+  public void compressionOptions() {
+    try (final Options options = new Options();
+         final CompressionOptions compressionOptions = new CompressionOptions()
+             .setMaxDictBytes(123)) {
+
+      options.setCompressionOptions(compressionOptions);
+      assertThat(options.compressionOptions())
+          .isEqualTo(compressionOptions);
+      assertThat(options.compressionOptions().maxDictBytes())
+          .isEqualTo(123);
+    }
+  }
+
+  @Test
+  public void compactionStyles() {
+    try (final Options options = new Options()) {
+      for (final CompactionStyle compactionStyle :
+          CompactionStyle.values()) {
+        options.setCompactionStyle(compactionStyle);
+        assertThat(options.compactionStyle()).
+            isEqualTo(compactionStyle);
+        assertThat(CompactionStyle.valueOf("FIFO")).
+            isEqualTo(CompactionStyle.FIFO);
+      }
+    }
+  }
+
+  @Test
+  public void maxTableFilesSizeFIFO() {
+    try (final Options opt = new Options()) {
+      long longValue = rand.nextLong();
+      // Size has to be positive
+      longValue = (longValue < 0) ? -longValue : longValue;
+      longValue = (longValue == 0) ? longValue + 1 : longValue;
+      opt.setMaxTableFilesSizeFIFO(longValue);
+      assertThat(opt.maxTableFilesSizeFIFO()).
+          isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void rateLimiter() {
+    try (final Options options = new Options();
+         final Options anotherOptions = new Options();
+         final RateLimiter rateLimiter =
+             new RateLimiter(1000, 100 * 1000, 1)) {
+      options.setRateLimiter(rateLimiter);
+      // Test with parameter initialization
+      anotherOptions.setRateLimiter(
+          new RateLimiter(1000));
+    }
+  }
+
+  @Test
+  public void sstFileManager() throws RocksDBException {
+    try (final Options options = new Options();
+         final SstFileManager sstFileManager =
+             new SstFileManager(Env.getDefault())) {
+      options.setSstFileManager(sstFileManager);
+    }
+  }
+
+  @Test
+  public void shouldSetTestPrefixExtractor() {
+    try (final Options options = new Options()) {
+      options.useFixedLengthPrefixExtractor(100);
+      options.useFixedLengthPrefixExtractor(10);
+    }
+  }
+
+  @Test
+  public void shouldSetTestCappedPrefixExtractor() {
+    try (final Options options = new Options()) {
+      options.useCappedPrefixExtractor(100);
+      options.useCappedPrefixExtractor(10);
+    }
+  }
+
+  @Test
+  public void shouldTestMemTableFactoryName()
+      throws RocksDBException {
+    try (final Options options = new Options()) {
+      options.setMemTableConfig(new VectorMemTableConfig());
+      assertThat(options.memTableFactoryName()).
+          isEqualTo("VectorRepFactory");
+      options.setMemTableConfig(
+          new HashLinkedListMemTableConfig());
+      assertThat(options.memTableFactoryName()).
+          isEqualTo("HashLinkedListRepFactory");
+    }
+  }
+
+  @Test
+  public void statistics() {
+    try(final Options options = new Options()) {
+      final Statistics statistics = options.statistics();
+      assertThat(statistics).isNull();
+    }
+
+    try(final Statistics statistics = new Statistics();
+        final Options options = new Options().setStatistics(statistics);
+        final Statistics stats = options.statistics()) {
+      assertThat(stats).isNotNull();
+    }
+  }
+
+  @Test
+  public void maxWriteBufferNumberToMaintain() {
+    try (final Options options = new Options()) {
+      int intValue = rand.nextInt();
+      // Size has to be positive
+      intValue = (intValue < 0) ? -intValue : intValue;
+      intValue = (intValue == 0) ? intValue + 1 : intValue;
+      options.setMaxWriteBufferNumberToMaintain(intValue);
+      assertThat(options.maxWriteBufferNumberToMaintain()).
+          isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void compactionPriorities() {
+    try (final Options options = new Options()) {
+      for (final CompactionPriority compactionPriority :
+          CompactionPriority.values()) {
+        options.setCompactionPriority(compactionPriority);
+        assertThat(options.compactionPriority()).
+            isEqualTo(compactionPriority);
+      }
+    }
+  }
+
+  @Test
+  public void reportBgIoStats() {
+    try (final Options options = new Options()) {
+      final boolean booleanValue = true;
+      options.setReportBgIoStats(booleanValue);
+      assertThat(options.reportBgIoStats()).
+          isEqualTo(booleanValue);
+    }
+  }
+
+  @Test
+  public void ttl() {
+    try (final Options options = new Options()) {
+      options.setTtl(1000 * 60);
+      assertThat(options.ttl()).
+          isEqualTo(1000 * 60);
+    }
+  }
+
+  @Test
+  public void periodicCompactionSeconds() {
+    try (final Options options = new Options()) {
+      options.setPeriodicCompactionSeconds(1000 * 60);
+      assertThat(options.periodicCompactionSeconds()).isEqualTo(1000 * 60);
+    }
+  }
+
+  @Test
+  public void compactionOptionsUniversal() {
+    try (final Options options = new Options();
+         final CompactionOptionsUniversal optUni = new CompactionOptionsUniversal()
+             .setCompressionSizePercent(7)) {
+      options.setCompactionOptionsUniversal(optUni);
+      assertThat(options.compactionOptionsUniversal()).
+          isEqualTo(optUni);
+      assertThat(options.compactionOptionsUniversal().compressionSizePercent())
+          .isEqualTo(7);
+    }
+  }
+
+  @Test
+  public void compactionOptionsFIFO() {
+    try (final Options options = new Options();
+         final CompactionOptionsFIFO optFifo = new CompactionOptionsFIFO()
+             .setMaxTableFilesSize(2000)) {
+      options.setCompactionOptionsFIFO(optFifo);
+      assertThat(options.compactionOptionsFIFO()).
+          isEqualTo(optFifo);
+      assertThat(options.compactionOptionsFIFO().maxTableFilesSize())
+          .isEqualTo(2000);
+    }
+  }
+
+  @Test
+  public void forceConsistencyChecks() {
+    try (final Options options = new Options()) {
+      final boolean booleanValue = true;
+      options.setForceConsistencyChecks(booleanValue);
+      assertThat(options.forceConsistencyChecks()).
+          isEqualTo(booleanValue);
+    }
+  }
+
+  @Test
+  public void compactionFilter() {
+    try(final Options options = new Options();
+        final RemoveEmptyValueCompactionFilter cf = new RemoveEmptyValueCompactionFilter()) {
+      options.setCompactionFilter(cf);
+      assertThat(options.compactionFilter()).isEqualTo(cf);
+    }
+  }
+
+  @Test
+  public void compactionFilterFactory() {
+    try(final Options options = new Options();
+        final RemoveEmptyValueCompactionFilterFactory cff = new RemoveEmptyValueCompactionFilterFactory()) {
+      options.setCompactionFilterFactory(cff);
+      assertThat(options.compactionFilterFactory()).isEqualTo(cff);
+    }
+  }
+
+  @Test
+  public void compactionThreadLimiter() {
+    try (final Options options = new Options();
+         final ConcurrentTaskLimiter compactionThreadLimiter =
+             new ConcurrentTaskLimiterImpl("name", 3)) {
+      options.setCompactionThreadLimiter(compactionThreadLimiter);
+      assertThat(options.compactionThreadLimiter()).isEqualTo(compactionThreadLimiter);
+    }
+  }
+
+  @Test
+  public void oldDefaults() {
+    try (final Options options = new Options()) {
+      options.oldDefaults(4, 6);
+      assertThat(options.writeBufferSize()).isEqualTo(4 << 20);
+      assertThat(options.compactionPriority()).isEqualTo(CompactionPriority.ByCompensatedSize);
+      assertThat(options.targetFileSizeBase()).isEqualTo(2 * 1048576);
+      assertThat(options.maxBytesForLevelBase()).isEqualTo(10 * 1048576);
+      assertThat(options.softPendingCompactionBytesLimit()).isEqualTo(0);
+      assertThat(options.hardPendingCompactionBytesLimit()).isEqualTo(0);
+      assertThat(options.level0StopWritesTrigger()).isEqualTo(24);
+    }
+  }
+
+  @Test
+  public void optimizeForSmallDbWithCache() {
+    try (final Options options = new Options(); final Cache cache = new LRUCache(1024)) {
+      assertThat(options.optimizeForSmallDb(cache)).isEqualTo(options);
+    }
+  }
+
+  @Test
+  public void cfPaths() {
+    try (final Options options = new Options()) {
+      final List<DbPath> paths = Arrays.asList(
+          new DbPath(Paths.get("test1"), 2 << 25), new DbPath(Paths.get("/test2/path"), 2 << 25));
+      assertThat(options.cfPaths()).isEqualTo(Collections.emptyList());
+      assertThat(options.setCfPaths(paths)).isEqualTo(options);
+      assertThat(options.cfPaths()).isEqualTo(paths);
+    }
+  }
+
+  @Test
+  public void avoidUnnecessaryBlockingIO() {
+    try (final Options options = new Options()) {
+      assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(false);
+      assertThat(options.setAvoidUnnecessaryBlockingIO(true)).isEqualTo(options);
+      assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void persistStatsToDisk() {
+    try (final Options options = new Options()) {
+      assertThat(options.persistStatsToDisk()).isEqualTo(false);
+      assertThat(options.setPersistStatsToDisk(true)).isEqualTo(options);
+      assertThat(options.persistStatsToDisk()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void writeDbidToManifest() {
+    try (final Options options = new Options()) {
+      assertThat(options.writeDbidToManifest()).isEqualTo(false);
+      assertThat(options.setWriteDbidToManifest(true)).isEqualTo(options);
+      assertThat(options.writeDbidToManifest()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void logReadaheadSize() {
+    try (final Options options = new Options()) {
+      assertThat(options.logReadaheadSize()).isEqualTo(0);
+      final int size = 1024 * 1024 * 100;
+      assertThat(options.setLogReadaheadSize(size)).isEqualTo(options);
+      assertThat(options.logReadaheadSize()).isEqualTo(size);
+    }
+  }
+
+  @Test
+  public void bestEffortsRecovery() {
+    try (final Options options = new Options()) {
+      assertThat(options.bestEffortsRecovery()).isEqualTo(false);
+      assertThat(options.setBestEffortsRecovery(true)).isEqualTo(options);
+      assertThat(options.bestEffortsRecovery()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void maxBgerrorResumeCount() {
+    try (final Options options = new Options()) {
+      final int INT_MAX = 2147483647;
+      assertThat(options.maxBgerrorResumeCount()).isEqualTo(INT_MAX);
+      assertThat(options.setMaxBgErrorResumeCount(-1)).isEqualTo(options);
+      assertThat(options.maxBgerrorResumeCount()).isEqualTo(-1);
+    }
+  }
+
+  @Test
+  public void bgerrorResumeRetryInterval() {
+    try (final Options options = new Options()) {
+      assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(1000000);
+      final long newRetryInterval = 24 * 3600 * 1000000L;
+      assertThat(options.setBgerrorResumeRetryInterval(newRetryInterval)).isEqualTo(options);
+      assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(newRetryInterval);
+    }
+  }
+
+  @Test
+  public void maxWriteBatchGroupSizeBytes() {
+    try (final Options options = new Options()) {
+      assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(1024 * 1024);
+      final long size = 1024 * 1024 * 1024 * 10L;
+      assertThat(options.setMaxWriteBatchGroupSizeBytes(size)).isEqualTo(options);
+      assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(size);
+    }
+  }
+
+  @Test
+  public void skipCheckingSstFileSizesOnDbOpen() {
+    try (final Options options = new Options()) {
+      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false);
+      assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options);
+      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true);
+    }
+  }
+
+  @Test
+  public void eventListeners() {
+    final AtomicBoolean wasCalled1 = new AtomicBoolean();
+    final AtomicBoolean wasCalled2 = new AtomicBoolean();
+    try (final Options options = new Options();
+         final AbstractEventListener el1 =
+             new AbstractEventListener() {
+               @Override
+               public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) {
+                 wasCalled1.set(true);
+               }
+             };
+         final AbstractEventListener el2 =
+             new AbstractEventListener() {
+               @Override
+               public void onMemTableSealed(final MemTableInfo memTableInfo) {
+                 wasCalled2.set(true);
+               }
+             }) {
+      assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options);
+      List<AbstractEventListener> listeners = options.listeners();
+      assertEquals(el1, listeners.get(0));
+      assertEquals(el2, listeners.get(1));
+      options.setListeners(Collections.<AbstractEventListener>emptyList());
+      listeners.get(0).onTableFileDeleted(null);
+      assertTrue(wasCalled1.get());
+      listeners.get(1).onMemTableSealed(null);
+      assertTrue(wasCalled2.get());
+      List<AbstractEventListener> listeners2 = options.listeners();
+      assertNotNull(listeners2);
+      assertEquals(0, listeners2.size());
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/OptionsUtilTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/OptionsUtilTest.java
new file mode 100644
index 000000000..b84314eec
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/OptionsUtilTest.java
@@ -0,0 +1,126 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.*;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class OptionsUtilTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  enum TestAPI { LOAD_LATEST_OPTIONS, LOAD_OPTIONS_FROM_FILE }
+
+  @Test
+  public void loadLatestOptions() throws RocksDBException {
+    verifyOptions(TestAPI.LOAD_LATEST_OPTIONS);
+  }
+
+  @Test
+  public void loadOptionsFromFile() throws RocksDBException {
+    verifyOptions(TestAPI.LOAD_OPTIONS_FROM_FILE);
+  }
+
+  @Test
+  public void getLatestOptionsFileName() throws RocksDBException {
+    final String dbPath = dbFolder.getRoot().getAbsolutePath();
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbPath)) {
+      assertThat(db).isNotNull();
+    }
+
+    String fName = OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault());
+    assertThat(fName).isNotNull();
+    assert(fName.startsWith("OPTIONS-") == true);
+    // System.out.println("latest options fileName: " + fName);
+  }
+
+  private void verifyOptions(TestAPI apiType) throws RocksDBException {
+    final String dbPath = dbFolder.getRoot().getAbsolutePath();
+    final Options options = new Options()
+                                .setCreateIfMissing(true)
+                                .setParanoidChecks(false)
+                                .setMaxOpenFiles(478)
+                                .setDelayedWriteRate(1234567L);
+    final ColumnFamilyOptions baseDefaultCFOpts = new ColumnFamilyOptions();
+    final byte[] secondCFName = "new_cf".getBytes();
+    final ColumnFamilyOptions baseSecondCFOpts =
+        new ColumnFamilyOptions()
+            .setWriteBufferSize(70 * 1024)
+            .setMaxWriteBufferNumber(7)
+            .setMaxBytesForLevelBase(53 * 1024 * 1024)
+            .setLevel0FileNumCompactionTrigger(3)
+            .setLevel0SlowdownWritesTrigger(51)
+            .setBottommostCompressionType(CompressionType.ZSTD_COMPRESSION);
+
+    // Create a database with a new column family
+    try (final RocksDB db = RocksDB.open(options, dbPath)) {
+      assertThat(db).isNotNull();
+
+      // create column family
+      try (final ColumnFamilyHandle columnFamilyHandle =
+               db.createColumnFamily(new ColumnFamilyDescriptor(secondCFName, baseSecondCFOpts))) {
+        assert(columnFamilyHandle != null);
+      }
+    }
+
+    // Read the options back and verify
+    DBOptions dbOptions = new DBOptions();
+    final List<ColumnFamilyDescriptor> cfDescs = new ArrayList<>();
+    String path = dbPath;
+    if (apiType == TestAPI.LOAD_LATEST_OPTIONS) {
+      OptionsUtil.loadLatestOptions(path, Env.getDefault(), dbOptions, cfDescs, false);
+    } else if (apiType == TestAPI.LOAD_OPTIONS_FROM_FILE) {
+      path = dbPath + "/" + OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault());
+      OptionsUtil.loadOptionsFromFile(path, Env.getDefault(), dbOptions, cfDescs, false);
+    }
+
+    assertThat(dbOptions.createIfMissing()).isEqualTo(options.createIfMissing());
+    assertThat(dbOptions.paranoidChecks()).isEqualTo(options.paranoidChecks());
+    assertThat(dbOptions.maxOpenFiles()).isEqualTo(options.maxOpenFiles());
+    assertThat(dbOptions.delayedWriteRate()).isEqualTo(options.delayedWriteRate());
+
+    assertThat(cfDescs.size()).isEqualTo(2);
+    assertThat(cfDescs.get(0)).isNotNull();
+    assertThat(cfDescs.get(1)).isNotNull();
+    assertThat(cfDescs.get(0).getName()).isEqualTo(RocksDB.DEFAULT_COLUMN_FAMILY);
+    assertThat(cfDescs.get(1).getName()).isEqualTo(secondCFName);
+
+    ColumnFamilyOptions defaultCFOpts = cfDescs.get(0).getOptions();
+    assertThat(defaultCFOpts.writeBufferSize()).isEqualTo(baseDefaultCFOpts.writeBufferSize());
+    assertThat(defaultCFOpts.maxWriteBufferNumber())
+        .isEqualTo(baseDefaultCFOpts.maxWriteBufferNumber());
+    assertThat(defaultCFOpts.maxBytesForLevelBase())
+        .isEqualTo(baseDefaultCFOpts.maxBytesForLevelBase());
+    assertThat(defaultCFOpts.level0FileNumCompactionTrigger())
+        .isEqualTo(baseDefaultCFOpts.level0FileNumCompactionTrigger());
+    assertThat(defaultCFOpts.level0SlowdownWritesTrigger())
+        .isEqualTo(baseDefaultCFOpts.level0SlowdownWritesTrigger());
+    assertThat(defaultCFOpts.bottommostCompressionType())
+        .isEqualTo(baseDefaultCFOpts.bottommostCompressionType());
+
+    ColumnFamilyOptions secondCFOpts = cfDescs.get(1).getOptions();
+    assertThat(secondCFOpts.writeBufferSize()).isEqualTo(baseSecondCFOpts.writeBufferSize());
+    assertThat(secondCFOpts.maxWriteBufferNumber())
+        .isEqualTo(baseSecondCFOpts.maxWriteBufferNumber());
+    assertThat(secondCFOpts.maxBytesForLevelBase())
+        .isEqualTo(baseSecondCFOpts.maxBytesForLevelBase());
+    assertThat(secondCFOpts.level0FileNumCompactionTrigger())
+        .isEqualTo(baseSecondCFOpts.level0FileNumCompactionTrigger());
+    assertThat(secondCFOpts.level0SlowdownWritesTrigger())
+        .isEqualTo(baseSecondCFOpts.level0SlowdownWritesTrigger());
+    assertThat(secondCFOpts.bottommostCompressionType())
+        .isEqualTo(baseSecondCFOpts.bottommostCompressionType());
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java
new file mode 100644
index 000000000..c813dbbb4
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java
@@ -0,0 +1,89 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class PlainTableConfigTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void keySize() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setKeySize(5);
+    assertThat(plainTableConfig.keySize()).
+        isEqualTo(5);
+  }
+
+  @Test
+  public void bloomBitsPerKey() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setBloomBitsPerKey(11);
+    assertThat(plainTableConfig.bloomBitsPerKey()).
+        isEqualTo(11);
+  }
+
+  @Test
+  public void hashTableRatio() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setHashTableRatio(0.95);
+    assertThat(plainTableConfig.hashTableRatio()).
+        isEqualTo(0.95);
+  }
+
+  @Test
+  public void indexSparseness() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setIndexSparseness(18);
+    assertThat(plainTableConfig.indexSparseness()).
+        isEqualTo(18);
+  }
+
+  @Test
+  public void hugePageTlbSize() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setHugePageTlbSize(1);
+    assertThat(plainTableConfig.hugePageTlbSize()).
+        isEqualTo(1);
+  }
+
+  @Test
+  public void encodingType() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setEncodingType(EncodingType.kPrefix);
+    assertThat(plainTableConfig.encodingType()).isEqualTo(
+        EncodingType.kPrefix);
+  }
+
+  @Test
+  public void fullScanMode() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setFullScanMode(true);
+    assertThat(plainTableConfig.fullScanMode()).isTrue();  }
+
+  @Test
+  public void storeIndexInFile() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setStoreIndexInFile(true);
+    assertThat(plainTableConfig.storeIndexInFile()).
+        isTrue();
+  }
+
+  @Test
+  public void plainTableConfig() {
+    try(final Options opt = new Options()) {
+      final PlainTableConfig plainTableConfig = new PlainTableConfig();
+      opt.setTableFormatConfig(plainTableConfig);
+      assertThat(opt.tableFactoryName()).isEqualTo("PlainTable");
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java b/src/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java
new file mode 100644
index 000000000..80ea4d197
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java
@@ -0,0 +1,58 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Random;
+
+/**
+ * Helper class to get the appropriate Random class instance dependent
+ * on the current platform architecture (32bit vs 64bit)
+ */
+public class PlatformRandomHelper {
+    /**
+     * Determine if OS is 32-Bit/64-Bit
+     *
+     * @return boolean value indicating if operating system is 64 Bit.
+     */
+    public static boolean isOs64Bit(){
+      final boolean is64Bit;
+      if (System.getProperty("os.name").contains("Windows")) {
+        is64Bit = (System.getenv("ProgramFiles(x86)") != null);
+      } else {
+        is64Bit = (System.getProperty("os.arch").contains("64"));
+      }
+      return is64Bit;
+    }
+
+    /**
+     * Factory to get a platform specific Random instance
+     *
+     * @return {@link java.util.Random} instance.
+     */
+    public static Random getPlatformSpecificRandomFactory(){
+      if (isOs64Bit()) {
+        return new Random();
+      }
+      return new Random32Bit();
+    }
+
+    /**
+     * Random32Bit is a class which overrides {@code nextLong} to
+     * provide random numbers which fit in size_t. This workaround
+     * is necessary because there is no unsigned_int &lt; Java 8
+     */
+    private static class Random32Bit extends Random {
+      @Override
+      public long nextLong(){
+      return this.nextInt(Integer.MAX_VALUE);
+    }
+    }
+
+    /**
+     * Utility class constructor
+     */
+    private PlatformRandomHelper() { }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java
new file mode 100644
index 000000000..471ef0728
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java
@@ -0,0 +1,164 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class PutMultiplePartsTest {
+  @Parameterized.Parameters
+  public static List<Integer> data() {
+    return Arrays.asList(2, 3, 250, 20000);
+  }
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private final int numParts;
+
+  public PutMultiplePartsTest(final Integer numParts) {
+    this.numParts = numParts;
+  }
+
+  @Test
+  public void putUntracked() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath())) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final byte[][] keys = generateItems("key", ":", numParts);
+        final byte[][] values = generateItems("value", "", numParts);
+        transaction.putUntracked(keys, values);
+        transaction.commit();
+      }
+      txnDB.syncWal();
+    }
+
+    validateResults();
+  }
+
+  @Test
+  public void put() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath())) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final byte[][] keys = generateItems("key", ":", numParts);
+        final byte[][] values = generateItems("value", "", numParts);
+        transaction.put(keys, values);
+        transaction.commit();
+      }
+      txnDB.syncWal();
+    }
+
+    validateResults();
+  }
+
+  @Test
+  public void putUntrackedCF() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath());
+         final ColumnFamilyHandle columnFamilyHandle =
+             txnDB.createColumnFamily(new ColumnFamilyDescriptor("cfTest".getBytes()))) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final byte[][] keys = generateItems("key", ":", numParts);
+        final byte[][] values = generateItems("value", "", numParts);
+        transaction.putUntracked(columnFamilyHandle, keys, values);
+        transaction.commit();
+      }
+      txnDB.syncWal();
+    }
+
+    validateResultsCF();
+  }
+  @Test
+  public void putCF() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath());
+         final ColumnFamilyHandle columnFamilyHandle =
+             txnDB.createColumnFamily(new ColumnFamilyDescriptor("cfTest".getBytes()))) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final byte[][] keys = generateItems("key", ":", numParts);
+        final byte[][] values = generateItems("value", "", numParts);
+        transaction.put(columnFamilyHandle, keys, values);
+        transaction.commit();
+      }
+      txnDB.syncWal();
+    }
+
+    validateResultsCF();
+  }
+
+  private void validateResults() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(new Options(), dbFolder.getRoot().getAbsolutePath())) {
+      final List<byte[]> keys = generateItemsAsList("key", ":", numParts);
+      final byte[][] values = generateItems("value", "", numParts);
+
+      StringBuilder singleKey = new StringBuilder();
+      for (int i = 0; i < numParts; i++) {
+        singleKey.append(new String(keys.get(i), StandardCharsets.UTF_8));
+      }
+      final byte[] result = db.get(singleKey.toString().getBytes());
+      StringBuilder singleValue = new StringBuilder();
+      for (int i = 0; i < numParts; i++) {
+        singleValue.append(new String(values[i], StandardCharsets.UTF_8));
+      }
+      assertThat(result).isEqualTo(singleValue.toString().getBytes());
+    }
+  }
+
+  private void validateResultsCF() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("cfTest".getBytes()));
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try (final RocksDB db = RocksDB.open(new DBOptions(), dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      final List<byte[]> keys = generateItemsAsList("key", ":", numParts);
+      final byte[][] values = generateItems("value", "", numParts);
+
+      StringBuilder singleKey = new StringBuilder();
+      for (int i = 0; i < numParts; i++) {
+        singleKey.append(new String(keys.get(i), StandardCharsets.UTF_8));
+      }
+      final byte[] result = db.get(columnFamilyHandles.get(0), singleKey.toString().getBytes());
+      StringBuilder singleValue = new StringBuilder();
+      for (int i = 0; i < numParts; i++) {
+        singleValue.append(new String(values[i], StandardCharsets.UTF_8));
+      }
+      assertThat(result).isEqualTo(singleValue.toString().getBytes());
+    }
+  }
+
+  private byte[][] generateItems(final String prefix, final String suffix, final int numItems) {
+    return generateItemsAsList(prefix, suffix, numItems).toArray(new byte[0][0]);
+  }
+
+  private List<byte[]> generateItemsAsList(
+      final String prefix, final String suffix, final int numItems) {
+    final List<byte[]> items = new ArrayList<>();
+    for (int i = 0; i < numItems; i++) {
+      items.add((prefix + i + suffix).getBytes());
+    }
+    return items;
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RateLimiterTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/RateLimiterTest.java
new file mode 100644
index 000000000..e7d6e6c49
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RateLimiterTest.java
@@ -0,0 +1,65 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.rocksdb.RateLimiter.*;
+
+public class RateLimiterTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void bytesPerSecond() {
+    try(final RateLimiter rateLimiter =
+            new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS,
+                DEFAULT_FAIRNESS, DEFAULT_MODE, DEFAULT_AUTOTUNE)) {
+      assertThat(rateLimiter.getBytesPerSecond()).isGreaterThan(0);
+      rateLimiter.setBytesPerSecond(2000);
+      assertThat(rateLimiter.getBytesPerSecond()).isGreaterThan(0);
+    }
+  }
+
+  @Test
+  public void getSingleBurstBytes() {
+    try(final RateLimiter rateLimiter =
+            new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS,
+                DEFAULT_FAIRNESS, DEFAULT_MODE, DEFAULT_AUTOTUNE)) {
+      assertThat(rateLimiter.getSingleBurstBytes()).isEqualTo(100);
+    }
+  }
+
+  @Test
+  public void getTotalBytesThrough() {
+    try(final RateLimiter rateLimiter =
+            new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS,
+                DEFAULT_FAIRNESS, DEFAULT_MODE, DEFAULT_AUTOTUNE)) {
+      assertThat(rateLimiter.getTotalBytesThrough()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void getTotalRequests() {
+    try(final RateLimiter rateLimiter =
+            new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS,
+                DEFAULT_FAIRNESS, DEFAULT_MODE, DEFAULT_AUTOTUNE)) {
+      assertThat(rateLimiter.getTotalRequests()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void autoTune() {
+    try(final RateLimiter rateLimiter =
+            new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS,
+                DEFAULT_FAIRNESS, DEFAULT_MODE, true)) {
+      assertThat(rateLimiter.getBytesPerSecond()).isGreaterThan(0);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java
new file mode 100644
index 000000000..5b40a5df1
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java
@@ -0,0 +1,234 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class ReadOnlyTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void readOnlyOpen() throws RocksDBException {
+    try (final Options options = new Options()
+        .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key".getBytes(), "value".getBytes());
+    }
+    try (final RocksDB db = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath())) {
+      assertThat("value").isEqualTo(new String(db.get("key".getBytes())));
+    }
+
+    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+      cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts));
+      final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+      try (final RocksDB db = RocksDB.open(
+               dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) {
+        columnFamilyHandleList.add(
+            db.createColumnFamily(new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpts)));
+        columnFamilyHandleList.add(
+            db.createColumnFamily(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts)));
+        db.put(columnFamilyHandleList.get(2), "key2".getBytes(), "value2".getBytes());
+      }
+
+      columnFamilyHandleList.clear();
+      try (final RocksDB db = RocksDB.openReadOnly(
+               dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) {
+        assertThat(db.get("key2".getBytes())).isNull();
+        assertThat(db.get(columnFamilyHandleList.get(0), "key2".getBytes())).isNull();
+      }
+
+      cfDescriptors.clear();
+      cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts));
+      cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts));
+      columnFamilyHandleList.clear();
+      try (final RocksDB db = RocksDB.openReadOnly(
+               dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) {
+        assertThat(new String(db.get(columnFamilyHandleList.get(1), "key2".getBytes())))
+            .isEqualTo("value2");
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToWriteInReadOnly() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+        // no-op
+      }
+    }
+
+    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
+      final List<ColumnFamilyDescriptor> cfDescriptors =
+          Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts));
+
+      final List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList = new ArrayList<>();
+      try (final RocksDB rDb = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath(),
+               cfDescriptors, readOnlyColumnFamilyHandleList)) {
+        // test that put fails in readonly mode
+        rDb.put("key".getBytes(), "value".getBytes());
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToCFWriteInReadOnly() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      //no-op
+    }
+
+    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)
+      );
+      final List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+          new ArrayList<>();
+      try (final RocksDB rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList)) {
+        rDb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes());
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToRemoveInReadOnly() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      //no-op
+    }
+
+    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)
+      );
+
+      final List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+          new ArrayList<>();
+
+      try (final RocksDB rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList)) {
+        rDb.delete("key".getBytes());
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToCFRemoveInReadOnly() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      //no-op
+    }
+
+    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)
+      );
+
+      final List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+          new ArrayList<>();
+      try (final RocksDB rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList)) {
+          rDb.delete(readOnlyColumnFamilyHandleList.get(0),
+              "key".getBytes());
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToWriteBatchReadOnly() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      //no-op
+    }
+
+    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)
+      );
+
+      final List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+          new ArrayList<>();
+      try (final RocksDB rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList);
+           final WriteBatch wb = new WriteBatch();
+           final WriteOptions wOpts = new WriteOptions()) {
+          wb.put("key".getBytes(), "value".getBytes());
+          rDb.write(wOpts, wb);
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToCFWriteBatchReadOnly() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      //no-op
+    }
+
+    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)
+      );
+
+      final List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+          new ArrayList<>();
+      try (final RocksDB rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList);
+           final WriteBatch wb = new WriteBatch();
+           final WriteOptions wOpts = new WriteOptions()) {
+          wb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(),
+              "value".getBytes());
+          rDb.write(wOpts, wb);
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void errorIfWalFileExists() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      // no-op
+    }
+
+    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
+      final List<ColumnFamilyDescriptor> cfDescriptors =
+          Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts));
+
+      final List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList = new ArrayList<>();
+      try (final DBOptions options = new DBOptions();
+           final RocksDB rDb = RocksDB.openReadOnly(options, dbFolder.getRoot().getAbsolutePath(),
+               cfDescriptors, readOnlyColumnFamilyHandleList, true);) {
+        // no-op... should have raised an error as errorIfWalFileExists=true
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java
new file mode 100644
index 000000000..156dd3730
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java
@@ -0,0 +1,375 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Arrays;
+import java.util.Random;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class ReadOptionsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public ExpectedException exception = ExpectedException.none();
+
+  @Test
+  public void altConstructor() {
+    try (final ReadOptions opt = new ReadOptions(true, true)) {
+      assertThat(opt.verifyChecksums()).isTrue();
+      assertThat(opt.fillCache()).isTrue();
+    }
+  }
+
+  @Test
+  public void copyConstructor() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setVerifyChecksums(false);
+      opt.setFillCache(false);
+      opt.setIterateUpperBound(buildRandomSlice());
+      opt.setIterateLowerBound(buildRandomSlice());
+      opt.setTimestamp(buildRandomSlice());
+      opt.setIterStartTs(buildRandomSlice());
+      try (final ReadOptions other = new ReadOptions(opt)) {
+        assertThat(opt.verifyChecksums()).isEqualTo(other.verifyChecksums());
+        assertThat(opt.fillCache()).isEqualTo(other.fillCache());
+        assertThat(Arrays.equals(opt.iterateUpperBound().data(), other.iterateUpperBound().data())).isTrue();
+        assertThat(Arrays.equals(opt.iterateLowerBound().data(), other.iterateLowerBound().data())).isTrue();
+        assertThat(Arrays.equals(opt.timestamp().data(), other.timestamp().data())).isTrue();
+        assertThat(Arrays.equals(opt.iterStartTs().data(), other.iterStartTs().data())).isTrue();
+      }
+    }
+  }
+
+  @Test
+  public void verifyChecksum() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      final Random rand = new Random();
+      final boolean boolValue = rand.nextBoolean();
+      opt.setVerifyChecksums(boolValue);
+      assertThat(opt.verifyChecksums()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void fillCache() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      final Random rand = new Random();
+      final boolean boolValue = rand.nextBoolean();
+      opt.setFillCache(boolValue);
+      assertThat(opt.fillCache()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void tailing() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      final Random rand = new Random();
+      final boolean boolValue = rand.nextBoolean();
+      opt.setTailing(boolValue);
+      assertThat(opt.tailing()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void snapshot() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setSnapshot(null);
+      assertThat(opt.snapshot()).isNull();
+    }
+  }
+
+  @Test
+  public void readTier() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setReadTier(ReadTier.BLOCK_CACHE_TIER);
+      assertThat(opt.readTier()).isEqualTo(ReadTier.BLOCK_CACHE_TIER);
+    }
+  }
+
+  @SuppressWarnings("deprecated")
+  @Test
+  public void managed() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setManaged(true);
+      assertThat(opt.managed()).isTrue();
+    }
+  }
+
+  @Test
+  public void totalOrderSeek() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setTotalOrderSeek(true);
+      assertThat(opt.totalOrderSeek()).isTrue();
+    }
+  }
+
+  @Test
+  public void prefixSameAsStart() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setPrefixSameAsStart(true);
+      assertThat(opt.prefixSameAsStart()).isTrue();
+    }
+  }
+
+  @Test
+  public void pinData() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setPinData(true);
+      assertThat(opt.pinData()).isTrue();
+    }
+  }
+
+  @Test
+  public void backgroundPurgeOnIteratorCleanup() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setBackgroundPurgeOnIteratorCleanup(true);
+      assertThat(opt.backgroundPurgeOnIteratorCleanup()).isTrue();
+    }
+  }
+
+  @Test
+  public void readaheadSize() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      final Random rand = new Random();
+      final int intValue = rand.nextInt(2147483647);
+      opt.setReadaheadSize(intValue);
+      assertThat(opt.readaheadSize()).isEqualTo(intValue);
+    }
+  }
+
+  @Test
+  public void ignoreRangeDeletions() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setIgnoreRangeDeletions(true);
+      assertThat(opt.ignoreRangeDeletions()).isTrue();
+    }
+  }
+
+  @Test
+  public void iterateUpperBound() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      Slice upperBound = buildRandomSlice();
+      opt.setIterateUpperBound(upperBound);
+      assertThat(Arrays.equals(upperBound.data(), opt.iterateUpperBound().data())).isTrue();
+      opt.setIterateUpperBound(null);
+      assertThat(opt.iterateUpperBound()).isNull();
+    }
+  }
+
+  @Test
+  public void iterateUpperBoundNull() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      assertThat(opt.iterateUpperBound()).isNull();
+    }
+  }
+
+  @Test
+  public void iterateLowerBound() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      Slice lowerBound = buildRandomSlice();
+      opt.setIterateLowerBound(lowerBound);
+      assertThat(Arrays.equals(lowerBound.data(), opt.iterateLowerBound().data())).isTrue();
+      opt.setIterateLowerBound(null);
+      assertThat(opt.iterateLowerBound()).isNull();
+    }
+  }
+
+  @Test
+  public void iterateLowerBoundNull() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      assertThat(opt.iterateLowerBound()).isNull();
+    }
+  }
+
+  @Test
+  public void tableFilter() {
+    try (final ReadOptions opt = new ReadOptions();
+         final AbstractTableFilter allTablesFilter = new AllTablesFilter()) {
+      opt.setTableFilter(allTablesFilter);
+    }
+  }
+
+  @Test
+  public void autoPrefixMode() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setAutoPrefixMode(true);
+      assertThat(opt.autoPrefixMode()).isTrue();
+    }
+  }
+
+  @Test
+  public void timestamp() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      Slice timestamp = buildRandomSlice();
+      opt.setTimestamp(timestamp);
+      assertThat(Arrays.equals(timestamp.data(), opt.timestamp().data())).isTrue();
+      opt.setTimestamp(null);
+      assertThat(opt.timestamp()).isNull();
+    }
+  }
+
+  @Test
+  public void iterStartTs() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      Slice itertStartTsSlice = buildRandomSlice();
+      opt.setIterStartTs(itertStartTsSlice);
+      assertThat(Arrays.equals(itertStartTsSlice.data(), opt.iterStartTs().data())).isTrue();
+      opt.setIterStartTs(null);
+      assertThat(opt.iterStartTs()).isNull();
+    }
+  }
+
+  @Test
+  public void deadline() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setDeadline(1999l);
+      assertThat(opt.deadline()).isEqualTo(1999l);
+    }
+  }
+
+  @Test
+  public void ioTimeout() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setIoTimeout(34555l);
+      assertThat(opt.ioTimeout()).isEqualTo(34555l);
+    }
+  }
+
+  @Test
+  public void valueSizeSoftLimit() {
+    try (final ReadOptions opt = new ReadOptions()) {
+      opt.setValueSizeSoftLimit(12134324l);
+      assertThat(opt.valueSizeSoftLimit()).isEqualTo(12134324l);
+    }
+  }
+
+  @Test
+  public void failSetVerifyChecksumUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.setVerifyChecksums(true);
+    }
+  }
+
+  @Test
+  public void failVerifyChecksumUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.verifyChecksums();
+    }
+  }
+
+  @Test
+  public void failSetFillCacheUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.setFillCache(true);
+    }
+  }
+
+  @Test
+  public void failFillCacheUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.fillCache();
+    }
+  }
+
+  @Test
+  public void failSetTailingUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.setTailing(true);
+    }
+  }
+
+  @Test
+  public void failTailingUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.tailing();
+    }
+  }
+
+  @Test
+  public void failSetSnapshotUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.setSnapshot(null);
+    }
+  }
+
+  @Test
+  public void failSnapshotUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.snapshot();
+    }
+  }
+
+  @Test
+  public void failSetIterateUpperBoundUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.setIterateUpperBound(null);
+    }
+  }
+
+  @Test
+  public void failIterateUpperBoundUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.iterateUpperBound();
+    }
+  }
+
+  @Test
+  public void failSetIterateLowerBoundUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.setIterateLowerBound(null);
+    }
+  }
+
+  @Test
+  public void failIterateLowerBoundUninitialized() {
+    try (final ReadOptions readOptions =
+             setupUninitializedReadOptions(exception)) {
+      readOptions.iterateLowerBound();
+    }
+  }
+
+  private ReadOptions setupUninitializedReadOptions(
+      ExpectedException exception) {
+    final ReadOptions readOptions = new ReadOptions();
+    readOptions.close();
+    exception.expect(AssertionError.class);
+    return readOptions;
+  }
+
+  private Slice buildRandomSlice() {
+    final Random rand = new Random();
+    byte[] sliceBytes = new byte[rand.nextInt(100) + 1];
+    rand.nextBytes(sliceBytes);
+    return new Slice(sliceBytes);
+  }
+
+  private static class AllTablesFilter extends AbstractTableFilter {
+    @Override
+    public boolean filter(final TableProperties tableProperties) {
+      return true;
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksDBExceptionTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksDBExceptionTest.java
new file mode 100644
index 000000000..d3bd4ece7
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksDBExceptionTest.java
@@ -0,0 +1,115 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import org.rocksdb.Status.Code;
+import org.rocksdb.Status.SubCode;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.fail;
+
+public class RocksDBExceptionTest {
+
+  @Test
+  public void exception() {
+    try {
+      raiseException();
+    } catch(final RocksDBException e) {
+      assertThat(e.getStatus()).isNull();
+      assertThat(e.getMessage()).isEqualTo("test message");
+      return;
+    }
+    fail();
+  }
+
+  @Test
+  public void exceptionWithStatusCode() {
+    try {
+      raiseExceptionWithStatusCode();
+    } catch(final RocksDBException e) {
+      assertThat(e.getStatus()).isNotNull();
+      assertThat(e.getStatus().getCode()).isEqualTo(Code.NotSupported);
+      assertThat(e.getStatus().getSubCode()).isEqualTo(SubCode.None);
+      assertThat(e.getStatus().getState()).isNull();
+      assertThat(e.getMessage()).isEqualTo("test message");
+      return;
+    }
+    fail();
+  }
+
+  @Test
+  public void exceptionNoMsgWithStatusCode() {
+    try {
+      raiseExceptionNoMsgWithStatusCode();
+    } catch(final RocksDBException e) {
+      assertThat(e.getStatus()).isNotNull();
+      assertThat(e.getStatus().getCode()).isEqualTo(Code.NotSupported);
+      assertThat(e.getStatus().getSubCode()).isEqualTo(SubCode.None);
+      assertThat(e.getStatus().getState()).isNull();
+      assertThat(e.getMessage()).isEqualTo(Code.NotSupported.name());
+      return;
+    }
+    fail();
+  }
+
+  @Test
+  public void exceptionWithStatusCodeSubCode() {
+    try {
+      raiseExceptionWithStatusCodeSubCode();
+    } catch(final RocksDBException e) {
+      assertThat(e.getStatus()).isNotNull();
+      assertThat(e.getStatus().getCode()).isEqualTo(Code.TimedOut);
+      assertThat(e.getStatus().getSubCode())
+          .isEqualTo(Status.SubCode.LockTimeout);
+      assertThat(e.getStatus().getState()).isNull();
+      assertThat(e.getMessage()).isEqualTo("test message");
+      return;
+    }
+    fail();
+  }
+
+  @Test
+  public void exceptionNoMsgWithStatusCodeSubCode() {
+    try {
+      raiseExceptionNoMsgWithStatusCodeSubCode();
+    } catch(final RocksDBException e) {
+      assertThat(e.getStatus()).isNotNull();
+      assertThat(e.getStatus().getCode()).isEqualTo(Code.TimedOut);
+      assertThat(e.getStatus().getSubCode()).isEqualTo(SubCode.LockTimeout);
+      assertThat(e.getStatus().getState()).isNull();
+      assertThat(e.getMessage()).isEqualTo(Code.TimedOut.name() +
+          "(" + SubCode.LockTimeout.name() + ")");
+      return;
+    }
+    fail();
+  }
+
+  @Test
+  public void exceptionWithStatusCodeState() {
+    try {
+      raiseExceptionWithStatusCodeState();
+    } catch(final RocksDBException e) {
+      assertThat(e.getStatus()).isNotNull();
+      assertThat(e.getStatus().getCode()).isEqualTo(Code.NotSupported);
+      assertThat(e.getStatus().getSubCode()).isEqualTo(SubCode.None);
+      assertThat(e.getStatus().getState()).isNotNull();
+      assertThat(e.getMessage()).isEqualTo("test message");
+      return;
+    }
+    fail();
+  }
+
+  private native void raiseException() throws RocksDBException;
+  private native void raiseExceptionWithStatusCode() throws RocksDBException;
+  private native void raiseExceptionNoMsgWithStatusCode() throws RocksDBException;
+  private native void raiseExceptionWithStatusCodeSubCode()
+      throws RocksDBException;
+  private native void raiseExceptionNoMsgWithStatusCodeSubCode()
+      throws RocksDBException;
+  private native void raiseExceptionWithStatusCodeState()
+      throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java
new file mode 100644
index 000000000..422bed40c
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java
@@ -0,0 +1,1695 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.*;
+import org.junit.rules.ExpectedException;
+import org.junit.rules.TemporaryFolder;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.fail;
+
+public class RocksDBTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void open() throws RocksDBException {
+    try (final RocksDB db =
+             RocksDB.open(dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(db).isNotNull();
+    }
+  }
+
+  @Test
+  public void open_opt() throws RocksDBException {
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(db).isNotNull();
+    }
+  }
+
+  @Test
+  public void openWhenOpen() throws RocksDBException {
+    final String dbPath = dbFolder.getRoot().getAbsolutePath();
+
+    try (final RocksDB db1 = RocksDB.open(dbPath)) {
+      try (final RocksDB db2 = RocksDB.open(dbPath)) {
+        fail("Should have thrown an exception when opening the same db twice");
+      } catch (final RocksDBException e) {
+        assertThat(e.getStatus().getCode()).isEqualTo(Status.Code.IOError);
+        assertThat(e.getStatus().getSubCode()).isEqualTo(Status.SubCode.None);
+        assertThat(e.getStatus().getState()).contains("lock ");
+      }
+    }
+  }
+
+  @Test
+  public void createColumnFamily() throws RocksDBException {
+      final byte[] col1Name = "col1".getBytes(UTF_8);
+
+      try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+           final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()
+      ) {
+        try (final ColumnFamilyHandle col1 =
+            db.createColumnFamily(new ColumnFamilyDescriptor(col1Name, cfOpts))) {
+          assertThat(col1).isNotNull();
+          assertThat(col1.getName()).isEqualTo(col1Name);
+        }
+      }
+
+      final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+      try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(),
+        Arrays.asList(
+            new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor(col1Name)),
+            cfHandles)) {
+        try {
+          assertThat(cfHandles.size()).isEqualTo(2);
+          assertThat(cfHandles.get(1)).isNotNull();
+          assertThat(cfHandles.get(1).getName()).isEqualTo(col1Name);
+        } finally {
+          for (final ColumnFamilyHandle cfHandle :
+              cfHandles) {
+            cfHandle.close();
+          }
+        }
+      }
+  }
+
+
+  @Test
+  public void createColumnFamilies() throws RocksDBException {
+    final byte[] col1Name = "col1".getBytes(UTF_8);
+    final byte[] col2Name = "col2".getBytes(UTF_8);
+
+    List<ColumnFamilyHandle> cfHandles;
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()
+    ) {
+      cfHandles =
+          db.createColumnFamilies(cfOpts, Arrays.asList(col1Name, col2Name));
+      try {
+        assertThat(cfHandles).isNotNull();
+        assertThat(cfHandles.size()).isEqualTo(2);
+        assertThat(cfHandles.get(0).getName()).isEqualTo(col1Name);
+        assertThat(cfHandles.get(1).getName()).isEqualTo(col2Name);
+      } finally {
+        for (final ColumnFamilyHandle cfHandle : cfHandles) {
+          cfHandle.close();
+        }
+      }
+    }
+
+    cfHandles = new ArrayList<>();
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(),
+        Arrays.asList(
+            new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor(col1Name),
+            new ColumnFamilyDescriptor(col2Name)),
+        cfHandles)) {
+      try {
+        assertThat(cfHandles.size()).isEqualTo(3);
+        assertThat(cfHandles.get(1)).isNotNull();
+        assertThat(cfHandles.get(1).getName()).isEqualTo(col1Name);
+        assertThat(cfHandles.get(2)).isNotNull();
+        assertThat(cfHandles.get(2).getName()).isEqualTo(col2Name);
+      } finally {
+        for (final ColumnFamilyHandle cfHandle : cfHandles) {
+          cfHandle.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void createColumnFamiliesfromDescriptors() throws RocksDBException {
+    final byte[] col1Name = "col1".getBytes(UTF_8);
+    final byte[] col2Name = "col2".getBytes(UTF_8);
+
+    List<ColumnFamilyHandle> cfHandles;
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()
+    ) {
+      cfHandles =
+          db.createColumnFamilies(Arrays.asList(
+              new ColumnFamilyDescriptor(col1Name, cfOpts),
+              new ColumnFamilyDescriptor(col2Name, cfOpts)));
+      try {
+        assertThat(cfHandles).isNotNull();
+        assertThat(cfHandles.size()).isEqualTo(2);
+        assertThat(cfHandles.get(0).getName()).isEqualTo(col1Name);
+        assertThat(cfHandles.get(1).getName()).isEqualTo(col2Name);
+      } finally {
+        for (final ColumnFamilyHandle cfHandle : cfHandles) {
+          cfHandle.close();
+        }
+      }
+    }
+
+    cfHandles = new ArrayList<>();
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(),
+        Arrays.asList(
+            new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor(col1Name),
+            new ColumnFamilyDescriptor(col2Name)),
+        cfHandles)) {
+      try {
+        assertThat(cfHandles.size()).isEqualTo(3);
+        assertThat(cfHandles.get(1)).isNotNull();
+        assertThat(cfHandles.get(1).getName()).isEqualTo(col1Name);
+        assertThat(cfHandles.get(2)).isNotNull();
+        assertThat(cfHandles.get(2).getName()).isEqualTo(col2Name);
+      } finally {
+        for (final ColumnFamilyHandle cfHandle : cfHandles) {
+          cfHandle.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void put() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions opt = new WriteOptions(); final ReadOptions optr = new ReadOptions()) {
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put(opt, "key2".getBytes(), "12345678".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo(
+          "12345678".getBytes());
+
+      ByteBuffer key = ByteBuffer.allocateDirect(12);
+      ByteBuffer value = ByteBuffer.allocateDirect(12);
+      key.position(4);
+      key.put("key3".getBytes());
+      key.position(4).limit(8);
+      value.position(4);
+      value.put("val3".getBytes());
+      value.position(4).limit(8);
+
+      db.put(opt, key, value);
+
+      assertThat(key.position()).isEqualTo(8);
+      assertThat(key.limit()).isEqualTo(8);
+
+      assertThat(value.position()).isEqualTo(8);
+      assertThat(value.limit()).isEqualTo(8);
+
+      key.position(4);
+
+      ByteBuffer result = ByteBuffer.allocateDirect(12);
+      assertThat(db.get(optr, key, result)).isEqualTo(4);
+      assertThat(result.position()).isEqualTo(0);
+      assertThat(result.limit()).isEqualTo(4);
+      assertThat(key.position()).isEqualTo(8);
+      assertThat(key.limit()).isEqualTo(8);
+
+      byte[] tmp = new byte[4];
+      result.get(tmp);
+      assertThat(tmp).isEqualTo("val3".getBytes());
+
+      key.position(4);
+
+      result.clear().position(9);
+      assertThat(db.get(optr, key, result)).isEqualTo(4);
+      assertThat(result.position()).isEqualTo(9);
+      assertThat(result.limit()).isEqualTo(12);
+      assertThat(key.position()).isEqualTo(8);
+      assertThat(key.limit()).isEqualTo(8);
+      byte[] tmp2 = new byte[3];
+      result.get(tmp2);
+      assertThat(tmp2).isEqualTo("val".getBytes());
+
+      // put
+      Segment key3 = sliceSegment("key3");
+      Segment key4 = sliceSegment("key4");
+      Segment value0 = sliceSegment("value 0");
+      Segment value1 = sliceSegment("value 1");
+      db.put(key3.data, key3.offset, key3.len, value0.data, value0.offset, value0.len);
+      db.put(opt, key4.data, key4.offset, key4.len, value1.data, value1.offset, value1.len);
+
+      // compare
+      Assert.assertTrue(value0.isSamePayload(db.get(key3.data, key3.offset, key3.len)));
+      Assert.assertTrue(value1.isSamePayload(db.get(key4.data, key4.offset, key4.len)));
+    }
+  }
+
+  private static Segment sliceSegment(String key) {
+    ByteBuffer rawKey = ByteBuffer.allocate(key.length() + 4);
+    rawKey.put((byte)0);
+    rawKey.put((byte)0);
+    rawKey.put(key.getBytes());
+
+    return new Segment(rawKey.array(), 2, key.length());
+  }
+
+  private static class Segment {
+    final byte[] data;
+    final int offset;
+    final int len;
+
+    public boolean isSamePayload(byte[] value) {
+      if (value == null) {
+        return false;
+      }
+      if (value.length != len) {
+        return false;
+      }
+
+      for (int i = 0; i < value.length; i++) {
+        if (data[i + offset] != value[i]) {
+          return false;
+        }
+      }
+
+      return true;
+    }
+
+    public Segment(byte[] value, int offset, int len) {
+      this.data = value;
+      this.offset = offset;
+      this.len = len;
+    }
+  }
+
+  @Test
+  public void write() throws RocksDBException {
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator();
+         final Options options = new Options()
+             .setMergeOperator(stringAppendOperator)
+             .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions opts = new WriteOptions()) {
+
+      try (final WriteBatch wb1 = new WriteBatch()) {
+        wb1.put("key1".getBytes(), "aa".getBytes());
+        wb1.merge("key1".getBytes(), "bb".getBytes());
+
+        try (final WriteBatch wb2 = new WriteBatch()) {
+          wb2.put("key2".getBytes(), "xx".getBytes());
+          wb2.merge("key2".getBytes(), "yy".getBytes());
+          db.write(opts, wb1);
+          db.write(opts, wb2);
+        }
+      }
+
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "aa,bb".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo(
+          "xx,yy".getBytes());
+    }
+  }
+
+  @Test
+  public void getWithOutValue() throws RocksDBException {
+    try (final RocksDB db =
+             RocksDB.open(dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      byte[] outValue = new byte[5];
+      // not found value
+      int getResult = db.get("keyNotFound".getBytes(), outValue);
+      assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND);
+      // found value which fits in outValue
+      getResult = db.get("key1".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("value".getBytes());
+      // found value which fits partially
+      getResult = db.get("key2".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("12345".getBytes());
+    }
+  }
+
+  @Test
+  public void getWithOutValueReadOptions() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final ReadOptions rOpt = new ReadOptions()) {
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      byte[] outValue = new byte[5];
+      // not found value
+      int getResult = db.get(rOpt, "keyNotFound".getBytes(),
+          outValue);
+      assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND);
+      // found value which fits in outValue
+      getResult = db.get(rOpt, "key1".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("value".getBytes());
+      // found value which fits partially
+      getResult = db.get(rOpt, "key2".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("12345".getBytes());
+    }
+  }
+
+  @Rule
+  public ExpectedException thrown = ExpectedException.none();
+
+  @Test
+  public void getOutOfArrayMaxSizeValue() throws RocksDBException {
+    final int numberOfValueSplits = 10;
+    final int splitSize = Integer.MAX_VALUE / numberOfValueSplits;
+
+    Runtime runtime = Runtime.getRuntime();
+    long neededMemory = ((long)(splitSize)) * (((long)numberOfValueSplits) + 3);
+    boolean isEnoughMemory = runtime.maxMemory() - runtime.totalMemory() > neededMemory;
+    Assume.assumeTrue(isEnoughMemory);
+
+    final byte[] valueSplit = new byte[splitSize];
+    final byte[] key = "key".getBytes();
+
+    thrown.expect(RocksDBException.class);
+    thrown.expectMessage("Requested array size exceeds VM limit");
+
+    // merge (numberOfValueSplits + 1) valueSplit's to get value size exceeding Integer.MAX_VALUE
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator();
+         final Options opt = new Options()
+                 .setCreateIfMissing(true)
+                 .setMergeOperator(stringAppendOperator);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      db.put(key, valueSplit);
+      for (int i = 0; i < numberOfValueSplits; i++) {
+        db.merge(key, valueSplit);
+      }
+      db.get(key);
+    }
+  }
+
+  @Test
+  public void multiGetAsList() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final ReadOptions rOpt = new ReadOptions()) {
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      List<byte[]> lookupKeys = new ArrayList<>();
+      lookupKeys.add("key1".getBytes());
+      lookupKeys.add("key2".getBytes());
+      List<byte[]> results = db.multiGetAsList(lookupKeys);
+      assertThat(results).isNotNull();
+      assertThat(results).hasSize(lookupKeys.size());
+      assertThat(results).
+          containsExactly("value".getBytes(), "12345678".getBytes());
+      // test same method with ReadOptions
+      results = db.multiGetAsList(rOpt, lookupKeys);
+      assertThat(results).isNotNull();
+      assertThat(results).
+          contains("value".getBytes(), "12345678".getBytes());
+
+      // remove existing key
+      lookupKeys.remove(1);
+      // add non existing key
+      lookupKeys.add("key3".getBytes());
+      results = db.multiGetAsList(lookupKeys);
+      assertThat(results).isNotNull();
+      assertThat(results).
+          containsExactly("value".getBytes(), null);
+      // test same call with readOptions
+      results = db.multiGetAsList(rOpt, lookupKeys);
+      assertThat(results).isNotNull();
+      assertThat(results).contains("value".getBytes());
+    }
+  }
+
+  @Test
+  public void merge() throws RocksDBException {
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator();
+         final Options opt = new Options()
+            .setCreateIfMissing(true)
+            .setMergeOperator(stringAppendOperator);
+         final WriteOptions wOpt = new WriteOptions();
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())
+    ) {
+      db.put("key1".getBytes(), "value".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value".getBytes());
+      // merge key1 with another value portion
+      db.merge("key1".getBytes(), "value2".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value,value2".getBytes());
+      // merge key1 with another value portion
+      db.merge(wOpt, "key1".getBytes(), "value3".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value,value2,value3".getBytes());
+      // merge on non existent key shall insert the value
+      db.merge(wOpt, "key2".getBytes(), "xxxx".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo(
+          "xxxx".getBytes());
+
+      Segment key3 = sliceSegment("key3");
+      Segment key4 = sliceSegment("key4");
+      Segment value0 = sliceSegment("value 0");
+      Segment value1 = sliceSegment("value 1");
+
+      db.merge(key3.data, key3.offset, key3.len, value0.data, value0.offset, value0.len);
+      db.merge(wOpt, key4.data, key4.offset, key4.len, value1.data, value1.offset, value1.len);
+
+      // compare
+      Assert.assertTrue(value0.isSamePayload(db.get(key3.data, key3.offset, key3.len)));
+      Assert.assertTrue(value1.isSamePayload(db.get(key4.data, key4.offset, key4.len)));
+    }
+  }
+
+  @Test
+  public void delete() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions wOpt = new WriteOptions()) {
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      db.put("key3".getBytes(), "33".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo(
+          "12345678".getBytes());
+      assertThat(db.get("key3".getBytes())).isEqualTo("33".getBytes());
+      db.delete("key1".getBytes());
+      db.delete(wOpt, "key2".getBytes());
+      ByteBuffer key = ByteBuffer.allocateDirect(16);
+      key.put("key3".getBytes()).flip();
+      db.delete(wOpt, key);
+      assertThat(key.position()).isEqualTo(4);
+      assertThat(key.limit()).isEqualTo(4);
+
+      assertThat(db.get("key1".getBytes())).isNull();
+      assertThat(db.get("key2".getBytes())).isNull();
+
+      Segment key3 = sliceSegment("key3");
+      Segment key4 = sliceSegment("key4");
+      db.put("key3".getBytes(), "key3 value".getBytes());
+      db.put("key4".getBytes(), "key4 value".getBytes());
+
+      db.delete(key3.data, key3.offset, key3.len);
+      db.delete(wOpt, key4.data, key4.offset, key4.len);
+
+      assertThat(db.get("key3".getBytes())).isNull();
+      assertThat(db.get("key4".getBytes())).isNull();
+    }
+  }
+
+  @Test
+  public void singleDelete() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions wOpt = new WriteOptions()) {
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo(
+          "12345678".getBytes());
+      db.singleDelete("key1".getBytes());
+      db.singleDelete(wOpt, "key2".getBytes());
+      assertThat(db.get("key1".getBytes())).isNull();
+      assertThat(db.get("key2".getBytes())).isNull();
+    }
+  }
+
+  @Test
+  public void singleDelete_nonExisting() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions wOpt = new WriteOptions()) {
+      db.singleDelete("key1".getBytes());
+      db.singleDelete(wOpt, "key2".getBytes());
+      assertThat(db.get("key1".getBytes())).isNull();
+      assertThat(db.get("key2".getBytes())).isNull();
+    }
+  }
+
+  @Test
+  public void deleteRange() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      db.put("key3".getBytes(), "abcdefg".getBytes());
+      db.put("key4".getBytes(), "xyz".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo("12345678".getBytes());
+      assertThat(db.get("key3".getBytes())).isEqualTo("abcdefg".getBytes());
+      assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes());
+      db.deleteRange("key2".getBytes(), "key4".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes());
+      assertThat(db.get("key2".getBytes())).isNull();
+      assertThat(db.get("key3".getBytes())).isNull();
+      assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes());
+    }
+  }
+
+  @Test
+  public void getIntProperty() throws RocksDBException {
+    try (
+        final Options options = new Options()
+            .setCreateIfMissing(true)
+            .setMaxWriteBufferNumber(10)
+            .setMinWriteBufferNumberToMerge(10);
+        final RocksDB db = RocksDB.open(options,
+            dbFolder.getRoot().getAbsolutePath());
+        final WriteOptions wOpt = new WriteOptions().setDisableWAL(true)
+    ) {
+      db.put(wOpt, "key1".getBytes(), "value1".getBytes());
+      db.put(wOpt, "key2".getBytes(), "value2".getBytes());
+      db.put(wOpt, "key3".getBytes(), "value3".getBytes());
+      db.put(wOpt, "key4".getBytes(), "value4".getBytes());
+      assertThat(db.getLongProperty("rocksdb.num-entries-active-mem-table"))
+          .isGreaterThan(0);
+      assertThat(db.getLongProperty("rocksdb.cur-size-active-mem-table"))
+          .isGreaterThan(0);
+    }
+  }
+
+  @Test
+  public void fullCompactRange() throws RocksDBException {
+    try (final Options opt = new Options().
+        setCreateIfMissing(true).
+        setDisableAutoCompactions(true).
+        setCompactionStyle(CompactionStyle.LEVEL).
+        setNumLevels(4).
+        setWriteBufferSize(100 << 10).
+        setLevelZeroFileNumCompactionTrigger(3).
+        setTargetFileSizeBase(200 << 10).
+        setTargetFileSizeMultiplier(1).
+        setMaxBytesForLevelBase(500 << 10).
+        setMaxBytesForLevelMultiplier(1).
+        setDisableAutoCompactions(false);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+      // fill database with key/value pairs
+      byte[] b = new byte[10000];
+      for (int i = 0; i < 200; i++) {
+        rand.nextBytes(b);
+        db.put((String.valueOf(i)).getBytes(), b);
+      }
+      db.compactRange();
+    }
+  }
+
+  @Test
+  public void fullCompactRangeColumnFamily()
+      throws RocksDBException {
+    try (
+        final DBOptions opt = new DBOptions().
+            setCreateIfMissing(true).
+            setCreateMissingColumnFamilies(true);
+        final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions().
+            setDisableAutoCompactions(true).
+            setCompactionStyle(CompactionStyle.LEVEL).
+            setNumLevels(4).
+            setWriteBufferSize(100 << 10).
+            setLevelZeroFileNumCompactionTrigger(3).
+            setTargetFileSizeBase(200 << 10).
+            setTargetFileSizeMultiplier(1).
+            setMaxBytesForLevelBase(500 << 10).
+            setMaxBytesForLevelMultiplier(1).
+            setDisableAutoCompactions(false)
+    ) {
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts));
+
+      // open database
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+      try (final RocksDB db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles)) {
+        try {
+          // fill database with key/value pairs
+          byte[] b = new byte[10000];
+          for (int i = 0; i < 200; i++) {
+            rand.nextBytes(b);
+            db.put(columnFamilyHandles.get(1),
+                String.valueOf(i).getBytes(), b);
+          }
+          db.compactRange(columnFamilyHandles.get(1));
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeWithKeys()
+      throws RocksDBException {
+    try (final Options opt = new Options().
+        setCreateIfMissing(true).
+        setDisableAutoCompactions(true).
+        setCompactionStyle(CompactionStyle.LEVEL).
+        setNumLevels(4).
+        setWriteBufferSize(100 << 10).
+        setLevelZeroFileNumCompactionTrigger(3).
+        setTargetFileSizeBase(200 << 10).
+        setTargetFileSizeMultiplier(1).
+        setMaxBytesForLevelBase(500 << 10).
+        setMaxBytesForLevelMultiplier(1).
+        setDisableAutoCompactions(false);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+      // fill database with key/value pairs
+      byte[] b = new byte[10000];
+      for (int i = 0; i < 200; i++) {
+        rand.nextBytes(b);
+        db.put((String.valueOf(i)).getBytes(), b);
+      }
+      db.compactRange("0".getBytes(), "201".getBytes());
+    }
+  }
+
+  @Test
+  public void compactRangeWithKeysReduce()
+      throws RocksDBException {
+    try (
+        final Options opt = new Options().
+            setCreateIfMissing(true).
+            setDisableAutoCompactions(true).
+            setCompactionStyle(CompactionStyle.LEVEL).
+            setNumLevels(4).
+            setWriteBufferSize(100 << 10).
+            setLevelZeroFileNumCompactionTrigger(3).
+            setTargetFileSizeBase(200 << 10).
+            setTargetFileSizeMultiplier(1).
+            setMaxBytesForLevelBase(500 << 10).
+            setMaxBytesForLevelMultiplier(1).
+            setDisableAutoCompactions(false);
+        final RocksDB db = RocksDB.open(opt,
+            dbFolder.getRoot().getAbsolutePath())) {
+      // fill database with key/value pairs
+      byte[] b = new byte[10000];
+      for (int i = 0; i < 200; i++) {
+        rand.nextBytes(b);
+        db.put((String.valueOf(i)).getBytes(), b);
+      }
+      db.flush(new FlushOptions().setWaitForFlush(true));
+      try (final CompactRangeOptions compactRangeOpts = new CompactRangeOptions()
+            .setChangeLevel(true)
+            .setTargetLevel(-1)
+            .setTargetPathId(0)) {
+        db.compactRange(null, "0".getBytes(), "201".getBytes(),
+            compactRangeOpts);
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeWithKeysColumnFamily()
+      throws RocksDBException {
+    try (final DBOptions opt = new DBOptions().
+        setCreateIfMissing(true).
+        setCreateMissingColumnFamilies(true);
+         final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions().
+             setDisableAutoCompactions(true).
+             setCompactionStyle(CompactionStyle.LEVEL).
+             setNumLevels(4).
+             setWriteBufferSize(100 << 10).
+             setLevelZeroFileNumCompactionTrigger(3).
+             setTargetFileSizeBase(200 << 10).
+             setTargetFileSizeMultiplier(1).
+             setMaxBytesForLevelBase(500 << 10).
+             setMaxBytesForLevelMultiplier(1).
+             setDisableAutoCompactions(false)
+    ) {
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts)
+          );
+
+      // open database
+      final List<ColumnFamilyHandle> columnFamilyHandles =
+          new ArrayList<>();
+      try (final RocksDB db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles)) {
+        try {
+          // fill database with key/value pairs
+          byte[] b = new byte[10000];
+          for (int i = 0; i < 200; i++) {
+            rand.nextBytes(b);
+            db.put(columnFamilyHandles.get(1),
+                String.valueOf(i).getBytes(), b);
+          }
+          db.compactRange(columnFamilyHandles.get(1),
+              "0".getBytes(), "201".getBytes());
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeWithKeysReduceColumnFamily()
+      throws RocksDBException {
+    try (final DBOptions opt = new DBOptions().
+        setCreateIfMissing(true).
+        setCreateMissingColumnFamilies(true);
+         final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions().
+             setDisableAutoCompactions(true).
+             setCompactionStyle(CompactionStyle.LEVEL).
+             setNumLevels(4).
+             setWriteBufferSize(100 << 10).
+             setLevelZeroFileNumCompactionTrigger(3).
+             setTargetFileSizeBase(200 << 10).
+             setTargetFileSizeMultiplier(1).
+             setMaxBytesForLevelBase(500 << 10).
+             setMaxBytesForLevelMultiplier(1).
+             setDisableAutoCompactions(false)
+    ) {
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts)
+          );
+
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+      // open database
+      try (final RocksDB db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles)) {
+        try (final CompactRangeOptions compactRangeOpts = new CompactRangeOptions()
+            .setChangeLevel(true)
+            .setTargetLevel(-1)
+            .setTargetPathId(0)) {
+          // fill database with key/value pairs
+          byte[] b = new byte[10000];
+          for (int i = 0; i < 200; i++) {
+            rand.nextBytes(b);
+            db.put(columnFamilyHandles.get(1),
+                String.valueOf(i).getBytes(), b);
+          }
+          db.compactRange(columnFamilyHandles.get(1), "0".getBytes(),
+              "201".getBytes(), compactRangeOpts);
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeToLevel()
+      throws RocksDBException, InterruptedException {
+    final int NUM_KEYS_PER_L0_FILE = 100;
+    final int KEY_SIZE = 20;
+    final int VALUE_SIZE = 300;
+    final int L0_FILE_SIZE =
+        NUM_KEYS_PER_L0_FILE * (KEY_SIZE + VALUE_SIZE);
+    final int NUM_L0_FILES = 10;
+    final int TEST_SCALE = 5;
+    final int KEY_INTERVAL = 100;
+    try (final Options opt = new Options().
+        setCreateIfMissing(true).
+        setCompactionStyle(CompactionStyle.LEVEL).
+        setNumLevels(5).
+        // a slightly bigger write buffer than L0 file
+        // so that we can ensure manual flush always
+        // go before background flush happens.
+            setWriteBufferSize(L0_FILE_SIZE * 2).
+        // Disable auto L0 -> L1 compaction
+            setLevelZeroFileNumCompactionTrigger(20).
+            setTargetFileSizeBase(L0_FILE_SIZE * 100).
+            setTargetFileSizeMultiplier(1).
+        // To disable auto compaction
+            setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100).
+            setMaxBytesForLevelMultiplier(2).
+            setDisableAutoCompactions(true);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())
+    ) {
+      // fill database with key/value pairs
+      byte[] value = new byte[VALUE_SIZE];
+      int int_key = 0;
+      for (int round = 0; round < 5; ++round) {
+        int initial_key = int_key;
+        for (int f = 1; f <= NUM_L0_FILES; ++f) {
+          for (int i = 0; i < NUM_KEYS_PER_L0_FILE; ++i) {
+            int_key += KEY_INTERVAL;
+            rand.nextBytes(value);
+
+            db.put(String.format("%020d", int_key).getBytes(),
+                value);
+          }
+          db.flush(new FlushOptions().setWaitForFlush(true));
+          // Make sure we do create one more L0 files.
+          assertThat(
+              db.getProperty("rocksdb.num-files-at-level0")).
+              isEqualTo("" + f);
+        }
+
+        // Compact all L0 files we just created
+        db.compactRange(
+            String.format("%020d", initial_key).getBytes(),
+            String.format("%020d", int_key - 1).getBytes());
+        // Making sure there isn't any L0 files.
+        assertThat(
+            db.getProperty("rocksdb.num-files-at-level0")).
+            isEqualTo("0");
+        // Making sure there are some L1 files.
+        // Here we only use != 0 instead of a specific number
+        // as we don't want the test make any assumption on
+        // how compaction works.
+        assertThat(
+            db.getProperty("rocksdb.num-files-at-level1")).
+            isNotEqualTo("0");
+        // Because we only compacted those keys we issued
+        // in this round, there shouldn't be any L1 -> L2
+        // compaction.  So we expect zero L2 files here.
+        assertThat(
+            db.getProperty("rocksdb.num-files-at-level2")).
+            isEqualTo("0");
+      }
+    }
+  }
+
+  @Test
+  public void deleteFilesInRange() throws RocksDBException, InterruptedException {
+    final int KEY_SIZE = 20;
+    final int VALUE_SIZE = 1000;
+    final int FILE_SIZE = 64000;
+    final int NUM_FILES = 10;
+
+    final int KEY_INTERVAL = 10000;
+    /*
+     * Intention of these options is to end up reliably with 10 files
+     * we will be deleting using deleteFilesInRange.
+     * It is writing roughly number of keys that will fit in 10 files (target size)
+     * It is writing interleaved so that files from memory on L0 will overlap
+     * Then compaction cleans everything and we should end up with 10 files
+     */
+    try (final Options opt = new Options()
+                                 .setCreateIfMissing(true)
+                                 .setCompressionType(CompressionType.NO_COMPRESSION)
+                                 .setTargetFileSizeBase(FILE_SIZE)
+                                 .setWriteBufferSize(FILE_SIZE / 2)
+                                 .setDisableAutoCompactions(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      int records = FILE_SIZE / (KEY_SIZE + VALUE_SIZE);
+
+      // fill database with key/value pairs
+      byte[] value = new byte[VALUE_SIZE];
+      int key_init = 0;
+      for (int o = 0; o < NUM_FILES; ++o) {
+        int int_key = key_init++;
+        for (int i = 0; i < records; ++i) {
+          int_key += KEY_INTERVAL;
+          rand.nextBytes(value);
+
+          db.put(String.format("%020d", int_key).getBytes(), value);
+        }
+      }
+      db.flush(new FlushOptions().setWaitForFlush(true));
+      db.compactRange();
+      // Make sure we do create one more L0 files.
+      assertThat(db.getProperty("rocksdb.num-files-at-level0")).isEqualTo("0");
+
+      // Should be 10, but we are OK with asserting +- 2
+      int files = Integer.parseInt(db.getProperty("rocksdb.num-files-at-level1"));
+      assertThat(files).isBetween(8, 12);
+
+      // Delete lower 60% (roughly). Result should be 5, but we are OK with asserting +- 2
+      // Important is that we know something was deleted (JNI call did something)
+      // Exact assertions are done in C++ unit tests
+      db.deleteFilesInRanges(null,
+          Arrays.asList(null, String.format("%020d", records * KEY_INTERVAL * 6 / 10).getBytes()),
+          false);
+      files = Integer.parseInt(db.getProperty("rocksdb.num-files-at-level1"));
+      assertThat(files).isBetween(3, 7);
+    }
+  }
+
+  @Test
+  public void compactRangeToLevelColumnFamily()
+      throws RocksDBException {
+    final int NUM_KEYS_PER_L0_FILE = 100;
+    final int KEY_SIZE = 20;
+    final int VALUE_SIZE = 300;
+    final int L0_FILE_SIZE =
+        NUM_KEYS_PER_L0_FILE * (KEY_SIZE + VALUE_SIZE);
+    final int NUM_L0_FILES = 10;
+    final int TEST_SCALE = 5;
+    final int KEY_INTERVAL = 100;
+
+    try (final DBOptions opt = new DBOptions().
+        setCreateIfMissing(true).
+        setCreateMissingColumnFamilies(true);
+         final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions().
+             setCompactionStyle(CompactionStyle.LEVEL).
+             setNumLevels(5).
+             // a slightly bigger write buffer than L0 file
+             // so that we can ensure manual flush always
+             // go before background flush happens.
+                 setWriteBufferSize(L0_FILE_SIZE * 2).
+             // Disable auto L0 -> L1 compaction
+                 setLevelZeroFileNumCompactionTrigger(20).
+                 setTargetFileSizeBase(L0_FILE_SIZE * 100).
+                 setTargetFileSizeMultiplier(1).
+             // To disable auto compaction
+                 setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100).
+                 setMaxBytesForLevelMultiplier(2).
+                 setDisableAutoCompactions(true)
+    ) {
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts)
+          );
+
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+      // open database
+      try (final RocksDB db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles)) {
+        try {
+          // fill database with key/value pairs
+          byte[] value = new byte[VALUE_SIZE];
+          int int_key = 0;
+          for (int round = 0; round < 5; ++round) {
+            int initial_key = int_key;
+            for (int f = 1; f <= NUM_L0_FILES; ++f) {
+              for (int i = 0; i < NUM_KEYS_PER_L0_FILE; ++i) {
+                int_key += KEY_INTERVAL;
+                rand.nextBytes(value);
+
+                db.put(columnFamilyHandles.get(1),
+                    String.format("%020d", int_key).getBytes(),
+                    value);
+              }
+              db.flush(new FlushOptions().setWaitForFlush(true),
+                  columnFamilyHandles.get(1));
+              // Make sure we do create one more L0 files.
+              assertThat(
+                  db.getProperty(columnFamilyHandles.get(1),
+                      "rocksdb.num-files-at-level0")).
+                  isEqualTo("" + f);
+            }
+
+            // Compact all L0 files we just created
+            db.compactRange(
+                columnFamilyHandles.get(1),
+                String.format("%020d", initial_key).getBytes(),
+                String.format("%020d", int_key - 1).getBytes());
+            // Making sure there isn't any L0 files.
+            assertThat(
+                db.getProperty(columnFamilyHandles.get(1),
+                    "rocksdb.num-files-at-level0")).
+                isEqualTo("0");
+            // Making sure there are some L1 files.
+            // Here we only use != 0 instead of a specific number
+            // as we don't want the test make any assumption on
+            // how compaction works.
+            assertThat(
+                db.getProperty(columnFamilyHandles.get(1),
+                    "rocksdb.num-files-at-level1")).
+                isNotEqualTo("0");
+            // Because we only compacted those keys we issued
+            // in this round, there shouldn't be any L1 -> L2
+            // compaction.  So we expect zero L2 files here.
+            assertThat(
+                db.getProperty(columnFamilyHandles.get(1),
+                    "rocksdb.num-files-at-level2")).
+                isEqualTo("0");
+          }
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void continueBackgroundWorkAfterCancelAllBackgroundWork() throws RocksDBException {
+    final int KEY_SIZE = 20;
+    final int VALUE_SIZE = 300;
+    try (final DBOptions opt = new DBOptions().
+        setCreateIfMissing(true).
+        setCreateMissingColumnFamilies(true);
+         final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions()
+    ) {
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts)
+          );
+
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+      // open the database
+      try (final RocksDB db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles)) {
+        try {
+          db.cancelAllBackgroundWork(true);
+          try {
+            db.put(new byte[KEY_SIZE], new byte[VALUE_SIZE]);
+            db.flush(new FlushOptions().setWaitForFlush(true));
+            fail("Expected RocksDBException to be thrown if we attempt to trigger a flush after" +
+                " all background work is cancelled.");
+          } catch (RocksDBException ignored) { }
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void cancelAllBackgroundWorkTwice() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())
+    ) {
+      // Cancel all background work synchronously
+      db.cancelAllBackgroundWork(true);
+      // Cancel all background work asynchronously
+      db.cancelAllBackgroundWork(false);
+    }
+  }
+
+  @Test
+  public void pauseContinueBackgroundWork() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())
+    ) {
+      db.pauseBackgroundWork();
+      db.continueBackgroundWork();
+      db.pauseBackgroundWork();
+      db.continueBackgroundWork();
+    }
+  }
+
+  @Test
+  public void enableDisableFileDeletions() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())
+    ) {
+      db.disableFileDeletions();
+      db.enableFileDeletions(false);
+      db.disableFileDeletions();
+      db.enableFileDeletions(true);
+    }
+  }
+
+  @Test
+  public void setOptions() throws RocksDBException {
+    try (final DBOptions options = new DBOptions()
+             .setCreateIfMissing(true)
+             .setCreateMissingColumnFamilies(true);
+         final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions()
+             .setWriteBufferSize(4096)) {
+
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts));
+
+      // open database
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) {
+        try {
+          final MutableColumnFamilyOptions mutableOptions =
+              MutableColumnFamilyOptions.builder()
+                  .setWriteBufferSize(2048)
+                  .build();
+
+          db.setOptions(columnFamilyHandles.get(1), mutableOptions);
+
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void destroyDB() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.put("key1".getBytes(), "value".getBytes());
+      }
+      assertThat(dbFolder.getRoot().exists() && dbFolder.getRoot().listFiles().length != 0)
+          .isTrue();
+      RocksDB.destroyDB(dbPath, options);
+      assertThat(dbFolder.getRoot().exists() && dbFolder.getRoot().listFiles().length != 0)
+          .isFalse();
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void destroyDBFailIfOpen() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        // Fails as the db is open and locked.
+        RocksDB.destroyDB(dbPath, options);
+      }
+    }
+  }
+
+  @Test
+  public void getApproximateSizes() throws RocksDBException {
+    final byte key1[] = "key1".getBytes(UTF_8);
+    final byte key2[] = "key2".getBytes(UTF_8);
+    final byte key3[] = "key3".getBytes(UTF_8);
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.put(key1, key1);
+        db.put(key2, key2);
+        db.put(key3, key3);
+
+        final long[] sizes = db.getApproximateSizes(
+            Arrays.asList(
+                new Range(new Slice(key1), new Slice(key1)),
+                new Range(new Slice(key2), new Slice(key3))
+            ),
+            SizeApproximationFlag.INCLUDE_FILES,
+            SizeApproximationFlag.INCLUDE_MEMTABLES);
+
+        assertThat(sizes.length).isEqualTo(2);
+        assertThat(sizes[0]).isEqualTo(0);
+        assertThat(sizes[1]).isGreaterThanOrEqualTo(1);
+      }
+    }
+  }
+
+  @Test
+  public void getApproximateMemTableStats() throws RocksDBException {
+    final byte key1[] = "key1".getBytes(UTF_8);
+    final byte key2[] = "key2".getBytes(UTF_8);
+    final byte key3[] = "key3".getBytes(UTF_8);
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.put(key1, key1);
+        db.put(key2, key2);
+        db.put(key3, key3);
+
+        final RocksDB.CountAndSize stats =
+            db.getApproximateMemTableStats(
+                new Range(new Slice(key1), new Slice(key3)));
+
+        assertThat(stats).isNotNull();
+        assertThat(stats.count).isGreaterThan(1);
+        assertThat(stats.size).isGreaterThan(1);
+      }
+    }
+  }
+
+  @Test
+  public void getApproximateMemTableStatsSingleKey() throws RocksDBException {
+    final byte key1[] = "key1".getBytes(UTF_8);
+    final byte key2[] = "key2".getBytes(UTF_8);
+    final byte key3[] = "key3".getBytes(UTF_8);
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.put(key1, key1);
+
+        final RocksDB.CountAndSize stats =
+            db.getApproximateMemTableStats(new Range(new Slice(key1), new Slice(key3)));
+
+        assertThat(stats).isNotNull();
+        assertThat(stats.count).isEqualTo(1);
+        assertThat(stats.size).isGreaterThan(1);
+      }
+    }
+  }
+
+  @Ignore("TODO(AR) re-enable when ready!")
+  @Test
+  public void compactFiles() throws RocksDBException {
+    final int kTestKeySize = 16;
+    final int kTestValueSize = 984;
+    final int kEntrySize = kTestKeySize + kTestValueSize;
+    final int kEntriesPerBuffer = 100;
+    final int writeBufferSize = kEntrySize * kEntriesPerBuffer;
+    final byte[] cfName = "pikachu".getBytes(UTF_8);
+
+    try (final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setWriteBufferSize(writeBufferSize)
+        .setCompactionStyle(CompactionStyle.LEVEL)
+        .setTargetFileSizeBase(writeBufferSize)
+        .setMaxBytesForLevelBase(writeBufferSize * 2)
+        .setLevel0StopWritesTrigger(2)
+        .setMaxBytesForLevelMultiplier(2)
+        .setCompressionType(CompressionType.NO_COMPRESSION)
+        .setMaxSubcompactions(4)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath);
+           final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(options)) {
+        db.createColumnFamily(new ColumnFamilyDescriptor(cfName,
+            cfOptions)).close();
+      }
+
+      try (final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(options)) {
+        final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+            new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOptions),
+            new ColumnFamilyDescriptor(cfName, cfOptions)
+        );
+        final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+        try (final DBOptions dbOptions = new DBOptions(options);
+            final RocksDB db = RocksDB.open(dbOptions, dbPath, cfDescriptors,
+                cfHandles);
+        ) {
+          try (final FlushOptions flushOptions = new FlushOptions()
+                .setWaitForFlush(true)
+                .setAllowWriteStall(true);
+               final CompactionOptions compactionOptions = new CompactionOptions()) {
+            final Random rnd = new Random(301);
+            for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+              final byte[] value = new byte[kTestValueSize];
+              rnd.nextBytes(value);
+              db.put(cfHandles.get(1), Integer.toString(key).getBytes(UTF_8),
+                  value);
+            }
+            db.flush(flushOptions, cfHandles);
+
+            final RocksDB.LiveFiles liveFiles = db.getLiveFiles();
+            final List<String> compactedFiles =
+                db.compactFiles(compactionOptions, cfHandles.get(1),
+                    liveFiles.files, 1, -1, null);
+            assertThat(compactedFiles).isNotEmpty();
+          } finally {
+            for (final ColumnFamilyHandle cfHandle : cfHandles) {
+              cfHandle.close();
+            }
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void enableAutoCompaction() throws RocksDBException {
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)) {
+      final List<ColumnFamilyDescriptor> cfDescs = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)
+      );
+      final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) {
+        try {
+          db.enableAutoCompaction(cfHandles);
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void numberLevels() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        assertThat(db.numberLevels()).isEqualTo(7);
+      }
+    }
+  }
+
+  @Test
+  public void maxMemCompactionLevel() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        assertThat(db.maxMemCompactionLevel()).isEqualTo(0);
+      }
+    }
+  }
+
+  @Test
+  public void level0StopWriteTrigger() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        assertThat(db.level0StopWriteTrigger()).isEqualTo(36);
+      }
+    }
+  }
+
+  @Test
+  public void getName() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        assertThat(db.getName()).isEqualTo(dbPath);
+      }
+    }
+  }
+
+  @Test
+  public void getEnv() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        assertThat(db.getEnv()).isEqualTo(Env.getDefault());
+      }
+    }
+  }
+
+  @Test
+  public void flush() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath);
+        final FlushOptions flushOptions = new FlushOptions()) {
+        db.flush(flushOptions);
+      }
+    }
+  }
+
+  @Test
+  public void flushWal() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.flushWal(true);
+      }
+    }
+  }
+
+  @Test
+  public void syncWal() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.syncWal();
+      }
+    }
+  }
+
+  @Test
+  public void getLiveFiles() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        final RocksDB.LiveFiles livefiles = db.getLiveFiles(true);
+        assertThat(livefiles).isNotNull();
+        assertThat(livefiles.manifestFileSize).isEqualTo(59);
+        assertThat(livefiles.files.size()).isEqualTo(3);
+        assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT");
+        assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000005");
+        assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000007");
+      }
+    }
+  }
+
+  @Test
+  public void getSortedWalFiles() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+        final List<LogFile> logFiles = db.getSortedWalFiles();
+        assertThat(logFiles).isNotNull();
+        assertThat(logFiles.size()).isEqualTo(1);
+        assertThat(logFiles.get(0).type())
+            .isEqualTo(WalFileType.kAliveLogFile);
+      }
+    }
+  }
+
+  @Test
+  public void deleteFile() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.deleteFile("unknown");
+      }
+    }
+  }
+
+  @Test
+  public void getLiveFilesMetaData() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+        final List<LiveFileMetaData> liveFilesMetaData
+            = db.getLiveFilesMetaData();
+        assertThat(liveFilesMetaData).isEmpty();
+      }
+    }
+  }
+
+  @Test
+  public void getColumnFamilyMetaData() throws RocksDBException {
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)) {
+      final List<ColumnFamilyDescriptor> cfDescs = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)
+      );
+      final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) {
+        db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+        try {
+          final ColumnFamilyMetaData cfMetadata =
+              db.getColumnFamilyMetaData(cfHandles.get(0));
+          assertThat(cfMetadata).isNotNull();
+          assertThat(cfMetadata.name()).isEqualTo(RocksDB.DEFAULT_COLUMN_FAMILY);
+          assertThat(cfMetadata.levels().size()).isEqualTo(7);
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void verifyChecksum() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.verifyChecksum();
+      }
+    }
+  }
+
+  @Test
+  public void getPropertiesOfAllTables() throws RocksDBException {
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)) {
+      final List<ColumnFamilyDescriptor> cfDescs = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)
+      );
+      final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) {
+        db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+        try {
+          final Map<String, TableProperties> properties =
+              db.getPropertiesOfAllTables(cfHandles.get(0));
+          assertThat(properties).isNotNull();
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void getPropertiesOfTablesInRange() throws RocksDBException {
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)) {
+      final List<ColumnFamilyDescriptor> cfDescs = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)
+      );
+      final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) {
+        db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+        db.put(cfHandles.get(0), "key2".getBytes(UTF_8), "value2".getBytes(UTF_8));
+        db.put(cfHandles.get(0), "key3".getBytes(UTF_8), "value3".getBytes(UTF_8));
+        try {
+          final Range range = new Range(
+              new Slice("key1".getBytes(UTF_8)),
+              new Slice("key3".getBytes(UTF_8)));
+          final Map<String, TableProperties> properties =
+              db.getPropertiesOfTablesInRange(
+                  cfHandles.get(0), Arrays.asList(range));
+          assertThat(properties).isNotNull();
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void suggestCompactRange() throws RocksDBException {
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)) {
+      final List<ColumnFamilyDescriptor> cfDescs = Arrays.asList(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)
+      );
+      final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) {
+        db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+        db.put(cfHandles.get(0), "key2".getBytes(UTF_8), "value2".getBytes(UTF_8));
+        db.put(cfHandles.get(0), "key3".getBytes(UTF_8), "value3".getBytes(UTF_8));
+        try {
+          final Range range =  db.suggestCompactRange(cfHandles.get(0));
+          assertThat(range).isNotNull();
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void promoteL0() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        db.promoteL0(2);
+      }
+    }
+  }
+
+  @Test
+  public void startTrace() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      final String dbPath = dbFolder.getRoot().getAbsolutePath();
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        final TraceOptions traceOptions = new TraceOptions();
+
+        try (final InMemoryTraceWriter traceWriter = new InMemoryTraceWriter()) {
+          db.startTrace(traceOptions, traceWriter);
+
+          db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+
+          db.endTrace();
+
+          final List<byte[]> writes = traceWriter.getWrites();
+          assertThat(writes.size()).isGreaterThan(0);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void setDBOptions() throws RocksDBException {
+    try (final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions()
+             .setWriteBufferSize(4096)) {
+
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts));
+
+      // open database
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) {
+        try {
+          final MutableDBOptions mutableOptions =
+              MutableDBOptions.builder()
+                  .setBytesPerSync(1024 * 1027 * 7)
+                  .setAvoidFlushDuringShutdown(false)
+                  .build();
+
+          db.setDBOptions(mutableOptions);
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void rocksdbVersion() {
+    final RocksDB.Version version = RocksDB.rocksdbVersion();
+    assertThat(version).isNotNull();
+    assertThat(version.getMajor()).isGreaterThan(1);
+  }
+
+  private static class InMemoryTraceWriter extends AbstractTraceWriter {
+    private final List<byte[]> writes = new ArrayList<>();
+    private volatile boolean closed = false;
+
+    @Override
+    public void write(final Slice slice) {
+      if (closed) {
+        return;
+      }
+      final byte[] data = slice.data();
+      final byte[] dataCopy = new byte[data.length];
+      System.arraycopy(data, 0, dataCopy, 0, data.length);
+      writes.add(dataCopy);
+    }
+
+    @Override
+    public void closeWriter() {
+      closed = true;
+    }
+
+    @Override
+    public long getFileSize() {
+      long size = 0;
+      for (int i = 0; i < writes.size(); i++) {
+        size += writes.get(i).length;
+      }
+      return size;
+    }
+
+    public List<byte[]> getWrites() {
+      return writes;
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java
new file mode 100644
index 000000000..2a13550b7
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java
@@ -0,0 +1,289 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class RocksIteratorTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private void validateByteBufferResult(
+      final int fetched, final ByteBuffer byteBuffer, final String expected) {
+    assertThat(fetched).isEqualTo(expected.length());
+    assertThat(byteBuffer.position()).isEqualTo(0);
+    assertThat(byteBuffer.limit()).isEqualTo(Math.min(byteBuffer.remaining(), expected.length()));
+    final int bufferSpace = byteBuffer.remaining();
+    final byte[] contents = new byte[bufferSpace];
+    byteBuffer.get(contents, 0, bufferSpace);
+    assertThat(contents).isEqualTo(
+        expected.substring(0, bufferSpace).getBytes(StandardCharsets.UTF_8));
+  }
+
+  private void validateKey(
+      final RocksIterator iterator, final ByteBuffer byteBuffer, final String key) {
+    validateByteBufferResult(iterator.key(byteBuffer), byteBuffer, key);
+  }
+
+  private void validateValue(
+      final RocksIterator iterator, final ByteBuffer byteBuffer, final String value) {
+    validateByteBufferResult(iterator.value(byteBuffer), byteBuffer, value);
+  }
+
+  @Test
+  public void rocksIterator() throws RocksDBException {
+    try (final Options options =
+             new Options().setCreateIfMissing(true).setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value1".getBytes());
+      db.put("key2".getBytes(), "value2".getBytes());
+
+      try (final RocksIterator iterator = db.newIterator()) {
+        iterator.seekToFirst();
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key1".getBytes());
+        assertThat(iterator.value()).isEqualTo("value1".getBytes());
+
+        validateKey(iterator, ByteBuffer.allocateDirect(2), "key1");
+        validateKey(iterator, ByteBuffer.allocateDirect(2), "key0");
+        validateKey(iterator, ByteBuffer.allocateDirect(4), "key1");
+        validateKey(iterator, ByteBuffer.allocateDirect(5), "key1");
+        validateValue(iterator, ByteBuffer.allocateDirect(2), "value2");
+        validateValue(iterator, ByteBuffer.allocateDirect(2), "vasicu");
+        validateValue(iterator, ByteBuffer.allocateDirect(8), "value1");
+
+        validateKey(iterator, ByteBuffer.allocate(2), "key1");
+        validateKey(iterator, ByteBuffer.allocate(2), "key0");
+        validateKey(iterator, ByteBuffer.allocate(4), "key1");
+        validateKey(iterator, ByteBuffer.allocate(5), "key1");
+        validateValue(iterator, ByteBuffer.allocate(2), "value1");
+        validateValue(iterator, ByteBuffer.allocate(8), "value1");
+
+        iterator.next();
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key2".getBytes());
+        assertThat(iterator.value()).isEqualTo("value2".getBytes());
+        iterator.next();
+        assertThat(iterator.isValid()).isFalse();
+        iterator.seekToLast();
+        iterator.prev();
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key1".getBytes());
+        assertThat(iterator.value()).isEqualTo("value1".getBytes());
+        iterator.seekToFirst();
+        iterator.seekToLast();
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key2".getBytes());
+        assertThat(iterator.value()).isEqualTo("value2".getBytes());
+        iterator.status();
+
+        {
+          final ByteBuffer key = ByteBuffer.allocate(12);
+          key.put("key1".getBytes()).flip();
+          iterator.seek(key);
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.value()).isEqualTo("value1".getBytes());
+          assertThat(key.position()).isEqualTo(4);
+          assertThat(key.limit()).isEqualTo(4);
+
+          validateValue(iterator, ByteBuffer.allocateDirect(12), "value1");
+          validateValue(iterator, ByteBuffer.allocateDirect(4), "valu56");
+        }
+
+        {
+          final ByteBuffer key = ByteBuffer.allocate(12);
+          key.put("key2".getBytes()).flip();
+          iterator.seekForPrev(key);
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.value()).isEqualTo("value2".getBytes());
+          assertThat(key.position()).isEqualTo(4);
+          assertThat(key.limit()).isEqualTo(4);
+        }
+
+        {
+          final ByteBuffer key = ByteBuffer.allocate(12);
+          key.put("key1".getBytes()).flip();
+          iterator.seek(key);
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.value()).isEqualTo("value1".getBytes());
+          assertThat(key.position()).isEqualTo(4);
+          assertThat(key.limit()).isEqualTo(4);
+        }
+
+        {
+          // Check offsets of slice byte buffers
+          final ByteBuffer key0 = ByteBuffer.allocate(24);
+          key0.put("key2key2".getBytes());
+          final ByteBuffer key = key0.slice();
+          key.put("key1".getBytes()).flip();
+          iterator.seek(key);
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.value()).isEqualTo("value1".getBytes());
+          assertThat(key.position()).isEqualTo(4);
+          assertThat(key.limit()).isEqualTo(4);
+        }
+
+        {
+          // Check offsets of slice byte buffers
+          final ByteBuffer key0 = ByteBuffer.allocateDirect(24);
+          key0.put("key2key2".getBytes());
+          final ByteBuffer key = key0.slice();
+          key.put("key1".getBytes()).flip();
+          iterator.seek(key);
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.value()).isEqualTo("value1".getBytes());
+          assertThat(key.position()).isEqualTo(4);
+          assertThat(key.limit()).isEqualTo(4);
+        }
+
+        {
+          final ByteBuffer key = ByteBuffer.allocate(12);
+          key.put("key2".getBytes()).flip();
+          iterator.seekForPrev(key);
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.value()).isEqualTo("value2".getBytes());
+          assertThat(key.position()).isEqualTo(4);
+          assertThat(key.limit()).isEqualTo(4);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void rocksIteratorSeekAndInsert() throws RocksDBException {
+    try (final Options options =
+             new Options().setCreateIfMissing(true).setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value1".getBytes());
+      db.put("key2".getBytes(), "value2".getBytes());
+
+      try (final RocksIterator iterator = db.newIterator()) {
+        iterator.seek("key0".getBytes());
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key1".getBytes());
+
+        iterator.seek("key1".getBytes());
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key1".getBytes());
+
+        iterator.seek("key1.5".getBytes());
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key2".getBytes());
+
+        iterator.seek("key2".getBytes());
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key2".getBytes());
+
+        iterator.seek("key3".getBytes());
+        assertThat(iterator.isValid()).isFalse();
+      }
+
+      try (final RocksIterator iterator = db.newIterator()) {
+        iterator.seekForPrev("key0".getBytes());
+        assertThat(iterator.isValid()).isFalse();
+
+        iterator.seekForPrev("key1".getBytes());
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key1".getBytes());
+
+        iterator.seekForPrev("key1.5".getBytes());
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key1".getBytes());
+
+        iterator.seekForPrev("key2".getBytes());
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key2".getBytes());
+
+        iterator.seekForPrev("key3".getBytes());
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key2".getBytes());
+      }
+
+      try (final RocksIterator iterator = db.newIterator()) {
+        iterator.seekToFirst();
+        assertThat(iterator.isValid()).isTrue();
+
+        byte[] lastKey;
+        do {
+          lastKey = iterator.key();
+          iterator.next();
+        } while (iterator.isValid());
+
+        db.put("key3".getBytes(), "value3".getBytes());
+        assertThat(iterator.isValid()).isFalse();
+        iterator.refresh();
+        iterator.seek(lastKey);
+        assertThat(iterator.isValid()).isTrue();
+
+        iterator.next();
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key3".getBytes());
+      }
+    }
+  }
+
+  @Test
+  public void rocksIteratorReleaseAfterCfClose() throws RocksDBException {
+    try (final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(options,
+             this.dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key".getBytes(), "value".getBytes());
+
+      // Test case: release iterator after default CF close
+      try (final RocksIterator iterator = db.newIterator()) {
+        // In fact, calling close() on default CF has no effect
+        db.getDefaultColumnFamily().close();
+
+        iterator.seekToFirst();
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key".getBytes());
+        assertThat(iterator.value()).isEqualTo("value".getBytes());
+      }
+
+      // Test case: release iterator after custom CF close
+      final ColumnFamilyDescriptor cfd1 = new ColumnFamilyDescriptor("cf1".getBytes());
+      final ColumnFamilyHandle cfHandle1 = db.createColumnFamily(cfd1);
+      db.put(cfHandle1, "key1".getBytes(), "value1".getBytes());
+
+      try (final RocksIterator iterator = db.newIterator(cfHandle1)) {
+        cfHandle1.close();
+
+        iterator.seekToFirst();
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key1".getBytes());
+        assertThat(iterator.value()).isEqualTo("value1".getBytes());
+      }
+
+      // Test case: release iterator after custom CF drop & close
+      final ColumnFamilyDescriptor cfd2 = new ColumnFamilyDescriptor("cf2".getBytes());
+      final ColumnFamilyHandle cfHandle2 = db.createColumnFamily(cfd2);
+      db.put(cfHandle2, "key2".getBytes(), "value2".getBytes());
+
+      try (final RocksIterator iterator = db.newIterator(cfHandle2)) {
+        db.dropColumnFamily(cfHandle2);
+        cfHandle2.close();
+
+        iterator.seekToFirst();
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo("key2".getBytes());
+        assertThat(iterator.value()).isEqualTo("value2".getBytes());
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java
new file mode 100644
index 000000000..cce0c61e0
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java
@@ -0,0 +1,141 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class RocksMemEnvTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void memEnvFillAndReopen() throws RocksDBException {
+
+    final byte[][] keys = {
+        "aaa".getBytes(),
+        "bbb".getBytes(),
+        "ccc".getBytes()
+    };
+
+    final byte[][] values = {
+        "foo".getBytes(),
+        "bar".getBytes(),
+        "baz".getBytes()
+    };
+
+    try (final Env env = new RocksMemEnv(Env.getDefault());
+         final Options options = new Options()
+             .setCreateIfMissing(true)
+             .setEnv(env);
+         final FlushOptions flushOptions = new FlushOptions()
+             .setWaitForFlush(true);
+    ) {
+      try (final RocksDB db = RocksDB.open(options, "/dir/db")) {
+        // write key/value pairs using MemEnv
+        for (int i = 0; i < keys.length; i++) {
+          db.put(keys[i], values[i]);
+        }
+
+        // read key/value pairs using MemEnv
+        for (int i = 0; i < keys.length; i++) {
+          assertThat(db.get(keys[i])).isEqualTo(values[i]);
+        }
+
+        // Check iterator access
+        try (final RocksIterator iterator = db.newIterator()) {
+          iterator.seekToFirst();
+          for (int i = 0; i < keys.length; i++) {
+            assertThat(iterator.isValid()).isTrue();
+            assertThat(iterator.key()).isEqualTo(keys[i]);
+            assertThat(iterator.value()).isEqualTo(values[i]);
+            iterator.next();
+          }
+          // reached end of database
+          assertThat(iterator.isValid()).isFalse();
+        }
+
+        // flush
+        db.flush(flushOptions);
+
+        // read key/value pairs after flush using MemEnv
+        for (int i = 0; i < keys.length; i++) {
+          assertThat(db.get(keys[i])).isEqualTo(values[i]);
+        }
+      }
+
+      options.setCreateIfMissing(false);
+
+      // After reopen the values shall still be in the mem env.
+      // as long as the env is not freed.
+      try (final RocksDB db = RocksDB.open(options, "/dir/db")) {
+        // read key/value pairs using MemEnv
+        for (int i = 0; i < keys.length; i++) {
+          assertThat(db.get(keys[i])).isEqualTo(values[i]);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void multipleDatabaseInstances() throws RocksDBException {
+    // db - keys
+    final byte[][] keys = {
+        "aaa".getBytes(),
+        "bbb".getBytes(),
+        "ccc".getBytes()
+    };
+    // otherDb - keys
+    final byte[][] otherKeys = {
+        "111".getBytes(),
+        "222".getBytes(),
+        "333".getBytes()
+    };
+    // values
+    final byte[][] values = {
+        "foo".getBytes(),
+        "bar".getBytes(),
+        "baz".getBytes()
+    };
+
+    try (final Env env = new RocksMemEnv(Env.getDefault());
+         final Options options = new Options().setCreateIfMissing(true).setEnv(env);
+         final RocksDB db = RocksDB.open(options, "/dir/db");
+         final RocksDB otherDb = RocksDB.open(options, "/dir/otherDb")) {
+      // write key/value pairs using MemEnv
+      // to db and to otherDb.
+      for (int i = 0; i < keys.length; i++) {
+        db.put(keys[i], values[i]);
+        otherDb.put(otherKeys[i], values[i]);
+      }
+
+      // verify key/value pairs after flush using MemEnv
+      for (int i = 0; i < keys.length; i++) {
+        // verify db
+        assertThat(db.get(otherKeys[i])).isNull();
+        assertThat(db.get(keys[i])).isEqualTo(values[i]);
+
+        // verify otherDb
+        assertThat(otherDb.get(keys[i])).isNull();
+        assertThat(otherDb.get(otherKeys[i])).isEqualTo(values[i]);
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void createIfMissingFalse() throws RocksDBException {
+    try (final Env env = new RocksMemEnv(Env.getDefault());
+         final Options options = new Options().setCreateIfMissing(false).setEnv(env);
+         final RocksDB db = RocksDB.open(options, "/db/dir")) {
+      // shall throw an exception because db dir does not
+      // exist.
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksNativeLibraryResource.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksNativeLibraryResource.java
new file mode 100644
index 000000000..6116f2f92
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksNativeLibraryResource.java
@@ -0,0 +1,18 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.rules.ExternalResource;
+
+/**
+ * Resource to load the RocksDB JNI library.
+ */
+public class RocksNativeLibraryResource extends ExternalResource {
+  @Override
+  protected void before() {
+    RocksDB.loadLibrary();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java
new file mode 100644
index 000000000..557d4a47d
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java
@@ -0,0 +1,135 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class SecondaryDBTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Rule public TemporaryFolder secondaryDbFolder = new TemporaryFolder();
+
+  @Test
+  public void openAsSecondary() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key1".getBytes(), "value1".getBytes());
+      db.put("key2".getBytes(), "value2".getBytes());
+      db.put("key3".getBytes(), "value3".getBytes());
+
+      // open secondary
+      try (final Options secondaryOptions = new Options();
+           final RocksDB secondaryDb =
+               RocksDB.openAsSecondary(secondaryOptions, dbFolder.getRoot().getAbsolutePath(),
+                   secondaryDbFolder.getRoot().getAbsolutePath())) {
+        assertThat(secondaryDb.get("key1".getBytes())).isEqualTo("value1".getBytes());
+        assertThat(secondaryDb.get("key2".getBytes())).isEqualTo("value2".getBytes());
+        assertThat(secondaryDb.get("key3".getBytes())).isEqualTo("value3".getBytes());
+
+        // write to primary
+        db.put("key4".getBytes(), "value4".getBytes());
+        db.put("key5".getBytes(), "value5".getBytes());
+        db.put("key6".getBytes(), "value6".getBytes());
+
+        // tell secondary to catch up
+        secondaryDb.tryCatchUpWithPrimary();
+
+        db.put("key7".getBytes(), "value7".getBytes());
+
+        // check secondary
+        assertThat(secondaryDb.get("key4".getBytes())).isEqualTo("value4".getBytes());
+        assertThat(secondaryDb.get("key5".getBytes())).isEqualTo("value5".getBytes());
+        assertThat(secondaryDb.get("key6".getBytes())).isEqualTo("value6".getBytes());
+
+        assertThat(secondaryDb.get("key7".getBytes())).isNull();
+      }
+    }
+  }
+
+  @Test
+  public void openAsSecondaryColumnFamilies() throws RocksDBException {
+    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
+      final List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+      cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts));
+      cfDescriptors.add(new ColumnFamilyDescriptor("cf1".getBytes(), cfOpts));
+
+      final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+
+      try (final DBOptions options =
+               new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true);
+           final RocksDB db = RocksDB.open(
+               options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, cfHandles)) {
+        try {
+          final ColumnFamilyHandle cf1 = cfHandles.get(1);
+
+          db.put(cf1, "key1".getBytes(), "value1".getBytes());
+          db.put(cf1, "key2".getBytes(), "value2".getBytes());
+          db.put(cf1, "key3".getBytes(), "value3".getBytes());
+
+          final List<ColumnFamilyHandle> secondaryCfHandles = new ArrayList<>();
+
+          // open secondary
+          try (final DBOptions secondaryOptions = new DBOptions();
+               final RocksDB secondaryDb =
+                   RocksDB.openAsSecondary(secondaryOptions, dbFolder.getRoot().getAbsolutePath(),
+                       secondaryDbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+                       secondaryCfHandles)) {
+            try {
+              final ColumnFamilyHandle secondaryCf1 = secondaryCfHandles.get(1);
+
+              assertThat(secondaryDb.get(secondaryCf1, "key1".getBytes()))
+                  .isEqualTo("value1".getBytes());
+              assertThat(secondaryDb.get(secondaryCf1, "key2".getBytes()))
+                  .isEqualTo("value2".getBytes());
+              assertThat(secondaryDb.get(secondaryCf1, "key3".getBytes()))
+                  .isEqualTo("value3".getBytes());
+
+              // write to primary
+              db.put(cf1, "key4".getBytes(), "value4".getBytes());
+              db.put(cf1, "key5".getBytes(), "value5".getBytes());
+              db.put(cf1, "key6".getBytes(), "value6".getBytes());
+
+              // tell secondary to catch up
+              secondaryDb.tryCatchUpWithPrimary();
+
+              db.put(cf1, "key7".getBytes(), "value7".getBytes());
+
+              // check secondary
+              assertThat(secondaryDb.get(secondaryCf1, "key4".getBytes()))
+                  .isEqualTo("value4".getBytes());
+              assertThat(secondaryDb.get(secondaryCf1, "key5".getBytes()))
+                  .isEqualTo("value5".getBytes());
+              assertThat(secondaryDb.get(secondaryCf1, "key6".getBytes()))
+                  .isEqualTo("value6".getBytes());
+
+              assertThat(secondaryDb.get(secondaryCf1, "key7".getBytes())).isNull();
+
+            } finally {
+              for (final ColumnFamilyHandle secondaryCfHandle : secondaryCfHandles) {
+                secondaryCfHandle.close();
+              }
+            }
+          }
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java
new file mode 100644
index 000000000..c65b01903
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java
@@ -0,0 +1,80 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class SliceTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void slice() {
+    try (final Slice slice = new Slice("testSlice")) {
+      assertThat(slice.empty()).isFalse();
+      assertThat(slice.size()).isEqualTo(9);
+      assertThat(slice.data()).isEqualTo("testSlice".getBytes());
+    }
+
+    try (final Slice otherSlice = new Slice("otherSlice".getBytes())) {
+      assertThat(otherSlice.data()).isEqualTo("otherSlice".getBytes());
+    }
+
+    try (final Slice thirdSlice = new Slice("otherSlice".getBytes(), 5)) {
+      assertThat(thirdSlice.data()).isEqualTo("Slice".getBytes());
+    }
+  }
+
+  @Test
+  public void sliceClear() {
+    try (final Slice slice = new Slice("abc")) {
+      assertThat(slice.toString()).isEqualTo("abc");
+      slice.clear();
+      assertThat(slice.toString()).isEmpty();
+      slice.clear();  // make sure we don't double-free
+    }
+  }
+
+  @Test
+  public void sliceRemovePrefix() {
+    try (final Slice slice = new Slice("abc")) {
+      assertThat(slice.toString()).isEqualTo("abc");
+      slice.removePrefix(1);
+      assertThat(slice.toString()).isEqualTo("bc");
+    }
+  }
+
+  @Test
+  public void sliceEquals() {
+    try (final Slice slice = new Slice("abc");
+         final Slice slice2 = new Slice("abc")) {
+      assertThat(slice.equals(slice2)).isTrue();
+      assertThat(slice.hashCode() == slice2.hashCode()).isTrue();
+    }
+  }
+
+  @Test
+  public void sliceStartWith() {
+    try (final Slice slice = new Slice("matchpoint");
+         final Slice match = new Slice("mat");
+         final Slice noMatch = new Slice("nomatch")) {
+      assertThat(slice.startsWith(match)).isTrue();
+      assertThat(slice.startsWith(noMatch)).isFalse();
+    }
+  }
+
+  @Test
+  public void sliceToString() {
+    try (final Slice slice = new Slice("stringTest")) {
+      assertThat(slice.toString()).isEqualTo("stringTest");
+      assertThat(slice.toString(true)).isNotEqualTo("");
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java
new file mode 100644
index 000000000..11f0d560a
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java
@@ -0,0 +1,169 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class SnapshotTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void snapshots() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key".getBytes(), "value".getBytes());
+      // Get new Snapshot of database
+      try (final Snapshot snapshot = db.getSnapshot()) {
+        assertThat(snapshot.getSequenceNumber()).isGreaterThan(0);
+        assertThat(snapshot.getSequenceNumber()).isEqualTo(1);
+        try (final ReadOptions readOptions = new ReadOptions()) {
+          // set snapshot in ReadOptions
+          readOptions.setSnapshot(snapshot);
+
+          // retrieve key value pair
+          assertThat(new String(db.get("key".getBytes()))).
+              isEqualTo("value");
+          // retrieve key value pair created before
+          // the snapshot was made
+          assertThat(new String(db.get(readOptions,
+              "key".getBytes()))).isEqualTo("value");
+          // add new key/value pair
+          db.put("newkey".getBytes(), "newvalue".getBytes());
+          // using no snapshot the latest db entries
+          // will be taken into account
+          assertThat(new String(db.get("newkey".getBytes()))).
+              isEqualTo("newvalue");
+          // snapshopot was created before newkey
+          assertThat(db.get(readOptions, "newkey".getBytes())).
+              isNull();
+          // Retrieve snapshot from read options
+          try (final Snapshot sameSnapshot = readOptions.snapshot()) {
+            readOptions.setSnapshot(sameSnapshot);
+            // results must be the same with new Snapshot
+            // instance using the same native pointer
+            assertThat(new String(db.get(readOptions,
+                "key".getBytes()))).isEqualTo("value");
+            // update key value pair to newvalue
+            db.put("key".getBytes(), "newvalue".getBytes());
+            // read with previously created snapshot will
+            // read previous version of key value pair
+            assertThat(new String(db.get(readOptions,
+                "key".getBytes()))).isEqualTo("value");
+            // read for newkey using the snapshot must be
+            // null
+            assertThat(db.get(readOptions, "newkey".getBytes())).
+                isNull();
+            // setting null to snapshot in ReadOptions leads
+            // to no Snapshot being used.
+            readOptions.setSnapshot(null);
+            assertThat(new String(db.get(readOptions,
+                "newkey".getBytes()))).isEqualTo("newvalue");
+            // release Snapshot
+            db.releaseSnapshot(snapshot);
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void iteratorWithSnapshot() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      db.put("key".getBytes(), "value".getBytes());
+
+      // Get new Snapshot of database
+      // set snapshot in ReadOptions
+      try (final Snapshot snapshot = db.getSnapshot();
+           final ReadOptions readOptions =
+               new ReadOptions().setSnapshot(snapshot)) {
+        db.put("key2".getBytes(), "value2".getBytes());
+
+        // iterate over current state of db
+        try (final RocksIterator iterator = db.newIterator()) {
+          iterator.seekToFirst();
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.key()).isEqualTo("key".getBytes());
+          iterator.next();
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.key()).isEqualTo("key2".getBytes());
+          iterator.next();
+          assertThat(iterator.isValid()).isFalse();
+        }
+
+        // iterate using a snapshot
+        try (final RocksIterator snapshotIterator =
+                 db.newIterator(readOptions)) {
+          snapshotIterator.seekToFirst();
+          assertThat(snapshotIterator.isValid()).isTrue();
+          assertThat(snapshotIterator.key()).isEqualTo("key".getBytes());
+          snapshotIterator.next();
+          assertThat(snapshotIterator.isValid()).isFalse();
+        }
+
+        // release Snapshot
+        db.releaseSnapshot(snapshot);
+      }
+    }
+  }
+
+  @Test
+  public void iteratorWithSnapshotOnColumnFamily() throws RocksDBException {
+    try (final Options options = new Options()
+        .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      db.put("key".getBytes(), "value".getBytes());
+
+      // Get new Snapshot of database
+      // set snapshot in ReadOptions
+      try (final Snapshot snapshot = db.getSnapshot();
+           final ReadOptions readOptions = new ReadOptions()
+               .setSnapshot(snapshot)) {
+        db.put("key2".getBytes(), "value2".getBytes());
+
+        // iterate over current state of column family
+        try (final RocksIterator iterator = db.newIterator(
+            db.getDefaultColumnFamily())) {
+          iterator.seekToFirst();
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.key()).isEqualTo("key".getBytes());
+          iterator.next();
+          assertThat(iterator.isValid()).isTrue();
+          assertThat(iterator.key()).isEqualTo("key2".getBytes());
+          iterator.next();
+          assertThat(iterator.isValid()).isFalse();
+        }
+
+        // iterate using a snapshot on default column family
+        try (final RocksIterator snapshotIterator = db.newIterator(
+            db.getDefaultColumnFamily(), readOptions)) {
+          snapshotIterator.seekToFirst();
+          assertThat(snapshotIterator.isValid()).isTrue();
+          assertThat(snapshotIterator.key()).isEqualTo("key".getBytes());
+          snapshotIterator.next();
+          assertThat(snapshotIterator.isValid()).isFalse();
+
+          // release Snapshot
+          db.releaseSnapshot(snapshot);
+        }
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/SstFileManagerTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/SstFileManagerTest.java
new file mode 100644
index 000000000..2e136e820
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/SstFileManagerTest.java
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import java.util.Collections;
+
+import static org.assertj.core.api.Assertions.*;
+
+public class SstFileManagerTest {
+
+  @Test
+  public void maxAllowedSpaceUsage() throws RocksDBException {
+    try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) {
+      sstFileManager.setMaxAllowedSpaceUsage(1024 * 1024 * 64);
+      assertThat(sstFileManager.isMaxAllowedSpaceReached()).isFalse();
+      assertThat(sstFileManager.isMaxAllowedSpaceReachedIncludingCompactions()).isFalse();
+    }
+  }
+
+  @Test
+  public void compactionBufferSize() throws RocksDBException {
+    try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) {
+      sstFileManager.setCompactionBufferSize(1024 * 1024 * 10);
+      assertThat(sstFileManager.isMaxAllowedSpaceReachedIncludingCompactions()).isFalse();
+    }
+  }
+
+  @Test
+  public void totalSize() throws RocksDBException {
+    try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) {
+      assertThat(sstFileManager.getTotalSize()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void trackedFiles() throws RocksDBException {
+    try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) {
+      assertThat(sstFileManager.getTrackedFiles()).isEqualTo(Collections.emptyMap());
+    }
+  }
+
+  @Test
+  public void deleteRateBytesPerSecond() throws RocksDBException {
+    try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) {
+      assertThat(sstFileManager.getDeleteRateBytesPerSecond()).isEqualTo(SstFileManager.RATE_BYTES_PER_SEC_DEFAULT);
+      final long ratePerSecond = 1024 * 1024 * 52;
+      sstFileManager.setDeleteRateBytesPerSecond(ratePerSecond);
+      assertThat(sstFileManager.getDeleteRateBytesPerSecond()).isEqualTo(ratePerSecond);
+    }
+  }
+
+  @Test
+  public void maxTrashDBRatio() throws RocksDBException {
+    try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) {
+      assertThat(sstFileManager.getMaxTrashDBRatio()).isEqualTo(SstFileManager.MAX_TRASH_DB_RATION_DEFAULT);
+      final double trashRatio = 0.2;
+      sstFileManager.setMaxTrashDBRatio(trashRatio);
+      assertThat(sstFileManager.getMaxTrashDBRatio()).isEqualTo(trashRatio);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/SstFileReaderTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/SstFileReaderTest.java
new file mode 100644
index 000000000..e29df99f2
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/SstFileReaderTest.java
@@ -0,0 +1,222 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.rocksdb.util.ByteBufferAllocator;
+
+@RunWith(Parameterized.class)
+public class SstFileReaderTest {
+  private static final String SST_FILE_NAME = "test.sst";
+
+  static class KeyValueWithOp {
+    KeyValueWithOp(final String key, final String value, final OpType opType) {
+      this.key = key;
+      this.value = value;
+      this.opType = opType;
+    }
+
+    String getKey() {
+      return key;
+    }
+
+    String getValue() {
+      return value;
+    }
+
+    OpType getOpType() {
+      return opType;
+    }
+
+    private final String key;
+    private final String value;
+    private final OpType opType;
+  }
+
+  @Rule public TemporaryFolder parentFolder = new TemporaryFolder();
+
+  @Parameterized.Parameters(name = "{0}")
+  public static Iterable<Object[]> parameters() {
+    return Arrays.asList(new Object[][] {
+        {"direct", ByteBufferAllocator.DIRECT}, {"indirect", ByteBufferAllocator.HEAP}});
+  }
+
+  @Parameterized.Parameter(0) public String name;
+
+  @Parameterized.Parameter(1) public ByteBufferAllocator byteBufferAllocator;
+
+  enum OpType { PUT, PUT_BYTES, MERGE, MERGE_BYTES, DELETE, DELETE_BYTES }
+
+  private File newSstFile(final List<KeyValueWithOp> keyValues)
+      throws IOException, RocksDBException {
+    final EnvOptions envOptions = new EnvOptions();
+    final StringAppendOperator stringAppendOperator = new StringAppendOperator();
+    final Options options = new Options().setMergeOperator(stringAppendOperator);
+    final SstFileWriter sstFileWriter;
+    sstFileWriter = new SstFileWriter(envOptions, options);
+
+    final File sstFile = parentFolder.newFile(SST_FILE_NAME);
+    try {
+      sstFileWriter.open(sstFile.getAbsolutePath());
+      for (final KeyValueWithOp keyValue : keyValues) {
+        final Slice keySlice = new Slice(keyValue.getKey());
+        final Slice valueSlice = new Slice(keyValue.getValue());
+        final byte[] keyBytes = keyValue.getKey().getBytes();
+        final byte[] valueBytes = keyValue.getValue().getBytes();
+        switch (keyValue.getOpType()) {
+          case PUT:
+            sstFileWriter.put(keySlice, valueSlice);
+            break;
+          case PUT_BYTES:
+            sstFileWriter.put(keyBytes, valueBytes);
+            break;
+          case MERGE:
+            sstFileWriter.merge(keySlice, valueSlice);
+            break;
+          case MERGE_BYTES:
+            sstFileWriter.merge(keyBytes, valueBytes);
+            break;
+          case DELETE:
+            sstFileWriter.delete(keySlice);
+            break;
+          case DELETE_BYTES:
+            sstFileWriter.delete(keyBytes);
+            break;
+          default:
+            fail("Unsupported op type");
+        }
+        keySlice.close();
+        valueSlice.close();
+      }
+      sstFileWriter.finish();
+    } finally {
+      assertThat(sstFileWriter).isNotNull();
+      sstFileWriter.close();
+      options.close();
+      envOptions.close();
+    }
+    return sstFile;
+  }
+
+  @Test
+  public void readSstFile() throws RocksDBException, IOException {
+    final List<KeyValueWithOp> keyValues = new ArrayList<>();
+    keyValues.add(new KeyValueWithOp("key1", "value1", OpType.PUT));
+    keyValues.add(new KeyValueWithOp("key2", "value2", OpType.PUT));
+    keyValues.add(new KeyValueWithOp("key3", "value3", OpType.PUT));
+
+    final File sstFile = newSstFile(keyValues);
+    try (final StringAppendOperator stringAppendOperator = new StringAppendOperator();
+         final Options options =
+             new Options().setCreateIfMissing(true).setMergeOperator(stringAppendOperator);
+         final SstFileReader reader = new SstFileReader(options)) {
+      // Open the sst file and iterator
+      reader.open(sstFile.getAbsolutePath());
+      final ReadOptions readOptions = new ReadOptions();
+      final SstFileReaderIterator iterator = reader.newIterator(readOptions);
+
+      // Use the iterator to read sst file
+      iterator.seekToFirst();
+
+      // Verify Checksum
+      reader.verifyChecksum();
+
+      // Verify Table Properties
+      assertEquals(reader.getTableProperties().getNumEntries(), 3);
+
+      // Check key and value
+      assertThat(iterator.key()).isEqualTo("key1".getBytes());
+      assertThat(iterator.value()).isEqualTo("value1".getBytes());
+
+      final ByteBuffer byteBuffer = byteBufferAllocator.allocate(128);
+      byteBuffer.put("key1".getBytes()).flip();
+      iterator.seek(byteBuffer);
+      assertThat(byteBuffer.position()).isEqualTo(4);
+      assertThat(byteBuffer.limit()).isEqualTo(4);
+
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key1".getBytes());
+      assertThat(iterator.value()).isEqualTo("value1".getBytes());
+
+      {
+        byteBuffer.clear();
+        assertThat(iterator.key(byteBuffer)).isEqualTo("key1".getBytes().length);
+        final byte[] dst = new byte["key1".getBytes().length];
+        byteBuffer.get(dst);
+        assertThat(new String(dst)).isEqualTo("key1");
+      }
+
+      {
+        byteBuffer.clear();
+        byteBuffer.put("PREFIX".getBytes());
+        final ByteBuffer slice = byteBuffer.slice();
+        assertThat(iterator.key(byteBuffer)).isEqualTo("key1".getBytes().length);
+        final byte[] dst = new byte["key1".getBytes().length];
+        slice.get(dst);
+        assertThat(new String(dst)).isEqualTo("key1");
+      }
+
+      {
+        byteBuffer.clear();
+        assertThat(iterator.value(byteBuffer)).isEqualTo("value1".getBytes().length);
+        final byte[] dst = new byte["value1".getBytes().length];
+        byteBuffer.get(dst);
+        assertThat(new String(dst)).isEqualTo("value1");
+      }
+
+      byteBuffer.clear();
+      byteBuffer.put("key1point5".getBytes()).flip();
+      iterator.seek(byteBuffer);
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key2".getBytes());
+      assertThat(iterator.value()).isEqualTo("value2".getBytes());
+
+      byteBuffer.clear();
+      byteBuffer.put("key1point5".getBytes()).flip();
+      iterator.seekForPrev(byteBuffer);
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key1".getBytes());
+      assertThat(iterator.value()).isEqualTo("value1".getBytes());
+
+      byteBuffer.clear();
+      byteBuffer.put("key2point5".getBytes()).flip();
+      iterator.seek(byteBuffer);
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key3".getBytes());
+      assertThat(iterator.value()).isEqualTo("value3".getBytes());
+
+      byteBuffer.clear();
+      byteBuffer.put("key2point5".getBytes()).flip();
+      iterator.seekForPrev(byteBuffer);
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key2".getBytes());
+      assertThat(iterator.value()).isEqualTo("value2".getBytes());
+
+      byteBuffer.clear();
+      byteBuffer.put("PREFIX".getBytes());
+      final ByteBuffer slice = byteBuffer.slice();
+      slice.put("key1point5".getBytes()).flip();
+      iterator.seekForPrev(slice);
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key1".getBytes());
+      assertThat(iterator.value()).isEqualTo("value1".getBytes());
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/SstFileWriterTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/SstFileWriterTest.java
new file mode 100644
index 000000000..87165bfe1
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/SstFileWriterTest.java
@@ -0,0 +1,241 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.util.BytewiseComparator;
+
+public class SstFileWriterTest {
+  private static final String SST_FILE_NAME = "test.sst";
+  private static final String DB_DIRECTORY_NAME = "test_db";
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE
+      = new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder parentFolder = new TemporaryFolder();
+
+  enum OpType { PUT, PUT_BYTES, PUT_DIRECT, MERGE, MERGE_BYTES, DELETE, DELETE_BYTES }
+
+  static class KeyValueWithOp {
+    KeyValueWithOp(String key, String value, OpType opType) {
+      this.key = key;
+      this.value = value;
+      this.opType = opType;
+    }
+
+    String getKey() {
+      return key;
+    }
+
+    String getValue() {
+      return value;
+    }
+
+    OpType getOpType() {
+      return opType;
+    }
+
+    private final String key;
+    private final String value;
+    private final OpType opType;
+  };
+
+  private File newSstFile(final List<KeyValueWithOp> keyValues,
+      boolean useJavaBytewiseComparator) throws IOException, RocksDBException {
+    final EnvOptions envOptions = new EnvOptions();
+    final StringAppendOperator stringAppendOperator = new StringAppendOperator();
+    final Options options = new Options().setMergeOperator(stringAppendOperator);
+    SstFileWriter sstFileWriter = null;
+    ComparatorOptions comparatorOptions = null;
+    BytewiseComparator comparator = null;
+    if (useJavaBytewiseComparator) {
+      comparatorOptions = new ComparatorOptions().setUseDirectBuffer(false);
+      comparator = new BytewiseComparator(comparatorOptions);
+      options.setComparator(comparator);
+      sstFileWriter = new SstFileWriter(envOptions, options);
+    } else {
+      sstFileWriter = new SstFileWriter(envOptions, options);
+    }
+
+    final File sstFile = parentFolder.newFile(SST_FILE_NAME);
+    try {
+      sstFileWriter.open(sstFile.getAbsolutePath());
+      assertThat(sstFileWriter.fileSize()).isEqualTo(0);
+      for (KeyValueWithOp keyValue : keyValues) {
+        Slice keySlice = new Slice(keyValue.getKey());
+        Slice valueSlice = new Slice(keyValue.getValue());
+        byte[] keyBytes = keyValue.getKey().getBytes();
+        byte[] valueBytes = keyValue.getValue().getBytes();
+        ByteBuffer keyDirect = ByteBuffer.allocateDirect(keyBytes.length);
+        keyDirect.put(keyBytes);
+        keyDirect.flip();
+        ByteBuffer valueDirect = ByteBuffer.allocateDirect(valueBytes.length);
+        valueDirect.put(valueBytes);
+        valueDirect.flip();
+        switch (keyValue.getOpType()) {
+          case PUT:
+            sstFileWriter.put(keySlice, valueSlice);
+            break;
+          case PUT_BYTES:
+            sstFileWriter.put(keyBytes, valueBytes);
+            break;
+          case PUT_DIRECT:
+            sstFileWriter.put(keyDirect, valueDirect);
+            assertThat(keyDirect.position()).isEqualTo(keyBytes.length);
+            assertThat(keyDirect.limit()).isEqualTo(keyBytes.length);
+            assertThat(valueDirect.position()).isEqualTo(valueBytes.length);
+            assertThat(valueDirect.limit()).isEqualTo(valueBytes.length);
+            break;
+          case MERGE:
+            sstFileWriter.merge(keySlice, valueSlice);
+            break;
+          case MERGE_BYTES:
+            sstFileWriter.merge(keyBytes, valueBytes);
+            break;
+          case DELETE:
+            sstFileWriter.delete(keySlice);
+            break;
+          case DELETE_BYTES:
+            sstFileWriter.delete(keyBytes);
+            break;
+          default:
+            fail("Unsupported op type");
+        }
+        keySlice.close();
+        valueSlice.close();
+      }
+      sstFileWriter.finish();
+      assertThat(sstFileWriter.fileSize()).isGreaterThan(100);
+    } finally {
+      assertThat(sstFileWriter).isNotNull();
+      sstFileWriter.close();
+      options.close();
+      envOptions.close();
+      if (comparatorOptions != null) {
+        comparatorOptions.close();
+      }
+      if (comparator != null) {
+        comparator.close();
+      }
+    }
+    return sstFile;
+  }
+
+  @Test
+  public void generateSstFileWithJavaComparator()
+      throws RocksDBException, IOException {
+    final List<KeyValueWithOp> keyValues = new ArrayList<>();
+    keyValues.add(new KeyValueWithOp("key1", "value1", OpType.PUT));
+    keyValues.add(new KeyValueWithOp("key2", "value2", OpType.PUT));
+    keyValues.add(new KeyValueWithOp("key3", "value3", OpType.MERGE));
+    keyValues.add(new KeyValueWithOp("key4", "value4", OpType.MERGE));
+    keyValues.add(new KeyValueWithOp("key5", "", OpType.DELETE));
+
+    newSstFile(keyValues, true);
+  }
+
+  @Test
+  public void generateSstFileWithNativeComparator()
+      throws RocksDBException, IOException {
+    final List<KeyValueWithOp> keyValues = new ArrayList<>();
+    keyValues.add(new KeyValueWithOp("key1", "value1", OpType.PUT));
+    keyValues.add(new KeyValueWithOp("key2", "value2", OpType.PUT));
+    keyValues.add(new KeyValueWithOp("key3", "value3", OpType.MERGE));
+    keyValues.add(new KeyValueWithOp("key4", "value4", OpType.MERGE));
+    keyValues.add(new KeyValueWithOp("key5", "", OpType.DELETE));
+
+    newSstFile(keyValues, false);
+  }
+
+  @Test
+  public void ingestSstFile() throws RocksDBException, IOException {
+    final List<KeyValueWithOp> keyValues = new ArrayList<>();
+    keyValues.add(new KeyValueWithOp("key1", "value1", OpType.PUT));
+    keyValues.add(new KeyValueWithOp("key2", "value2", OpType.PUT_DIRECT));
+    keyValues.add(new KeyValueWithOp("key3", "value3", OpType.PUT_BYTES));
+    keyValues.add(new KeyValueWithOp("key4", "value4", OpType.MERGE));
+    keyValues.add(new KeyValueWithOp("key5", "value5", OpType.MERGE_BYTES));
+    keyValues.add(new KeyValueWithOp("key6", "", OpType.DELETE));
+    keyValues.add(new KeyValueWithOp("key7", "", OpType.DELETE));
+
+
+    final File sstFile = newSstFile(keyValues, false);
+    final File dbFolder = parentFolder.newFolder(DB_DIRECTORY_NAME);
+    try(final StringAppendOperator stringAppendOperator =
+            new StringAppendOperator();
+        final Options options = new Options()
+            .setCreateIfMissing(true)
+            .setMergeOperator(stringAppendOperator);
+        final RocksDB db = RocksDB.open(options, dbFolder.getAbsolutePath());
+        final IngestExternalFileOptions ingestExternalFileOptions =
+            new IngestExternalFileOptions()) {
+      db.ingestExternalFile(Arrays.asList(sstFile.getAbsolutePath()),
+          ingestExternalFileOptions);
+
+      assertThat(db.get("key1".getBytes())).isEqualTo("value1".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo("value2".getBytes());
+      assertThat(db.get("key3".getBytes())).isEqualTo("value3".getBytes());
+      assertThat(db.get("key4".getBytes())).isEqualTo("value4".getBytes());
+      assertThat(db.get("key5".getBytes())).isEqualTo("value5".getBytes());
+      assertThat(db.get("key6".getBytes())).isEqualTo(null);
+      assertThat(db.get("key7".getBytes())).isEqualTo(null);
+    }
+  }
+
+  @Test
+  public void ingestSstFile_cf() throws RocksDBException, IOException {
+    final List<KeyValueWithOp> keyValues = new ArrayList<>();
+    keyValues.add(new KeyValueWithOp("key1", "value1", OpType.PUT));
+    keyValues.add(new KeyValueWithOp("key2", "value2", OpType.PUT));
+    keyValues.add(new KeyValueWithOp("key3", "value3", OpType.MERGE));
+    keyValues.add(new KeyValueWithOp("key4", "", OpType.DELETE));
+
+    final File sstFile = newSstFile(keyValues, false);
+    final File dbFolder = parentFolder.newFolder(DB_DIRECTORY_NAME);
+    try(final StringAppendOperator stringAppendOperator =
+            new StringAppendOperator();
+        final Options options = new Options()
+            .setCreateIfMissing(true)
+            .setCreateMissingColumnFamilies(true)
+            .setMergeOperator(stringAppendOperator);
+        final RocksDB db = RocksDB.open(options, dbFolder.getAbsolutePath());
+        final IngestExternalFileOptions ingestExternalFileOptions =
+            new IngestExternalFileOptions()) {
+
+      try(final ColumnFamilyOptions cf_opts = new ColumnFamilyOptions()
+              .setMergeOperator(stringAppendOperator);
+          final ColumnFamilyHandle cf_handle = db.createColumnFamily(
+              new ColumnFamilyDescriptor("new_cf".getBytes(), cf_opts))) {
+
+        db.ingestExternalFile(cf_handle,
+            Arrays.asList(sstFile.getAbsolutePath()),
+            ingestExternalFileOptions);
+
+        assertThat(db.get(cf_handle,
+            "key1".getBytes())).isEqualTo("value1".getBytes());
+        assertThat(db.get(cf_handle,
+            "key2".getBytes())).isEqualTo("value2".getBytes());
+        assertThat(db.get(cf_handle,
+            "key3".getBytes())).isEqualTo("value3".getBytes());
+        assertThat(db.get(cf_handle,
+            "key4".getBytes())).isEqualTo(null);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java
new file mode 100644
index 000000000..74816db93
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.List;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class SstPartitionerTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void sstFixedPrefix() throws RocksDBException {
+    try (SstPartitionerFixedPrefixFactory factory = new SstPartitionerFixedPrefixFactory(4);
+         final Options opt =
+             new Options().setCreateIfMissing(true).setSstPartitionerFactory(factory);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      // writing (long)100 under key
+      db.put("aaaa1".getBytes(), "A".getBytes());
+      db.put("bbbb1".getBytes(), "B".getBytes());
+      db.flush(new FlushOptions());
+
+      db.put("aaaa0".getBytes(), "A2".getBytes());
+      db.put("aaaa2".getBytes(), "A2".getBytes());
+      db.flush(new FlushOptions());
+
+      db.compactRange();
+
+      List<LiveFileMetaData> metadata = db.getLiveFilesMetaData();
+      assertThat(metadata.size()).isEqualTo(2);
+    }
+  }
+
+  @Test
+  public void sstFixedPrefixFamily() throws RocksDBException {
+    final byte[] cfName = "new_cf".getBytes(UTF_8);
+    final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName,
+        new ColumnFamilyOptions().setSstPartitionerFactory(
+            new SstPartitionerFixedPrefixFactory(4)));
+
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(cfDescriptor);
+
+      // writing (long)100 under key
+      db.put(columnFamilyHandle, "aaaa1".getBytes(), "A".getBytes());
+      db.put(columnFamilyHandle, "bbbb1".getBytes(), "B".getBytes());
+      db.flush(new FlushOptions(), columnFamilyHandle);
+
+      db.put(columnFamilyHandle, "aaaa0".getBytes(), "A2".getBytes());
+      db.put(columnFamilyHandle, "aaaa2".getBytes(), "A2".getBytes());
+      db.flush(new FlushOptions(), columnFamilyHandle);
+
+      db.compactRange(columnFamilyHandle);
+
+      List<LiveFileMetaData> metadata = db.getLiveFilesMetaData();
+      assertThat(metadata.size()).isEqualTo(2);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java
new file mode 100644
index 000000000..36721c80d
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Collections;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class StatisticsCollectorTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void statisticsCollector()
+      throws InterruptedException, RocksDBException {
+    try (final Statistics statistics = new Statistics();
+            final Options opt = new Options()
+        .setStatistics(statistics)
+        .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      try(final Statistics stats = opt.statistics()) {
+
+        final StatsCallbackMock callback = new StatsCallbackMock();
+        final StatsCollectorInput statsInput =
+                new StatsCollectorInput(stats, callback);
+
+        final StatisticsCollector statsCollector = new StatisticsCollector(
+                Collections.singletonList(statsInput), 100);
+        statsCollector.start();
+
+        Thread.sleep(1000);
+
+        assertThat(callback.tickerCallbackCount).isGreaterThan(0);
+        assertThat(callback.histCallbackCount).isGreaterThan(0);
+
+        statsCollector.shutDown(1000);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/StatisticsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/StatisticsTest.java
new file mode 100644
index 000000000..de92102ec
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/StatisticsTest.java
@@ -0,0 +1,168 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.nio.charset.StandardCharsets;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class StatisticsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void statsLevel() throws RocksDBException {
+    final Statistics statistics = new Statistics();
+    statistics.setStatsLevel(StatsLevel.ALL);
+    assertThat(statistics.statsLevel()).isEqualTo(StatsLevel.ALL);
+  }
+
+  @Test
+  public void getTickerCount() throws RocksDBException {
+    try (final Statistics statistics = new Statistics();
+         final Options opt = new Options()
+             .setStatistics(statistics)
+             .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8);
+      final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8);
+
+      db.put(key, value);
+      for(int i = 0; i < 10; i++) {
+        db.get(key);
+      }
+
+      assertThat(statistics.getTickerCount(TickerType.BYTES_READ)).isGreaterThan(0);
+    }
+  }
+
+  @Test
+  public void getAndResetTickerCount() throws RocksDBException {
+    try (final Statistics statistics = new Statistics();
+         final Options opt = new Options()
+             .setStatistics(statistics)
+             .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8);
+      final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8);
+
+      db.put(key, value);
+      for(int i = 0; i < 10; i++) {
+        db.get(key);
+      }
+
+      final long read = statistics.getAndResetTickerCount(TickerType.BYTES_READ);
+      assertThat(read).isGreaterThan(0);
+
+      final long readAfterReset = statistics.getTickerCount(TickerType.BYTES_READ);
+      assertThat(readAfterReset).isLessThan(read);
+    }
+  }
+
+  @Test
+  public void getHistogramData() throws RocksDBException {
+    try (final Statistics statistics = new Statistics();
+         final Options opt = new Options()
+             .setStatistics(statistics)
+             .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8);
+      final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8);
+
+      db.put(key, value);
+      for(int i = 0; i < 10; i++) {
+        db.get(key);
+      }
+
+      final HistogramData histogramData = statistics.getHistogramData(HistogramType.BYTES_PER_READ);
+      assertThat(histogramData).isNotNull();
+      assertThat(histogramData.getAverage()).isGreaterThan(0);
+      assertThat(histogramData.getMedian()).isGreaterThan(0);
+      assertThat(histogramData.getPercentile95()).isGreaterThan(0);
+      assertThat(histogramData.getPercentile99()).isGreaterThan(0);
+      assertThat(histogramData.getStandardDeviation()).isEqualTo(0.00);
+      assertThat(histogramData.getMax()).isGreaterThan(0);
+      assertThat(histogramData.getCount()).isGreaterThan(0);
+      assertThat(histogramData.getSum()).isGreaterThan(0);
+      assertThat(histogramData.getMin()).isGreaterThan(0);
+    }
+  }
+
+  @Test
+  public void getHistogramString() throws RocksDBException {
+    try (final Statistics statistics = new Statistics();
+         final Options opt = new Options()
+             .setStatistics(statistics)
+             .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8);
+      final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8);
+
+      for(int i = 0; i < 10; i++) {
+        db.put(key, value);
+      }
+
+      assertThat(statistics.getHistogramString(HistogramType.BYTES_PER_WRITE)).isNotNull();
+    }
+  }
+
+  @Test
+  public void reset() throws RocksDBException {
+    try (final Statistics statistics = new Statistics();
+         final Options opt = new Options()
+             .setStatistics(statistics)
+             .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8);
+      final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8);
+
+      db.put(key, value);
+      for(int i = 0; i < 10; i++) {
+        db.get(key);
+      }
+
+      final long read = statistics.getTickerCount(TickerType.BYTES_READ);
+      assertThat(read).isGreaterThan(0);
+
+      statistics.reset();
+
+      final long readAfterReset = statistics.getTickerCount(TickerType.BYTES_READ);
+      assertThat(readAfterReset).isLessThan(read);
+    }
+  }
+
+  @Test
+  public void ToString() throws RocksDBException {
+    try (final Statistics statistics = new Statistics();
+         final Options opt = new Options()
+             .setStatistics(statistics)
+             .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt,
+             dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(statistics.toString()).isNotNull();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java b/src/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java
new file mode 100644
index 000000000..af8db0caa
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java
@@ -0,0 +1,20 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+public class StatsCallbackMock implements StatisticsCollectorCallback {
+  public int tickerCallbackCount = 0;
+  public int histCallbackCount = 0;
+
+  public void tickerCallback(TickerType tickerType, long tickerCount) {
+    tickerCallbackCount++;
+  }
+
+  public void histogramCallback(HistogramType histType,
+      HistogramData histData) {
+    histCallbackCount++;
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TableFilterTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TableFilterTest.java
new file mode 100644
index 000000000..2bd3b1798
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TableFilterTest.java
@@ -0,0 +1,106 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class TableFilterTest {
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void readOptions() throws RocksDBException {
+    try (final DBOptions opt = new DBOptions().
+            setCreateIfMissing(true).
+            setCreateMissingColumnFamilies(true);
+         final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions()
+    ) {
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts)
+          );
+
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+      // open database
+      try (final RocksDB db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles)) {
+
+        try (final CfNameCollectionTableFilter cfNameCollectingTableFilter =
+                 new CfNameCollectionTableFilter();
+            final FlushOptions flushOptions =
+                new FlushOptions().setWaitForFlush(true);
+            final ReadOptions readOptions =
+                 new ReadOptions().setTableFilter(cfNameCollectingTableFilter)) {
+
+          db.put(columnFamilyHandles.get(0),
+              "key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+          db.put(columnFamilyHandles.get(0),
+              "key2".getBytes(UTF_8), "value2".getBytes(UTF_8));
+          db.put(columnFamilyHandles.get(0),
+              "key3".getBytes(UTF_8), "value3".getBytes(UTF_8));
+          db.put(columnFamilyHandles.get(1),
+              "key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+          db.put(columnFamilyHandles.get(1),
+              "key2".getBytes(UTF_8), "value2".getBytes(UTF_8));
+          db.put(columnFamilyHandles.get(1),
+              "key3".getBytes(UTF_8), "value3".getBytes(UTF_8));
+
+          db.flush(flushOptions, columnFamilyHandles);
+
+          try (final RocksIterator iterator =
+                   db.newIterator(columnFamilyHandles.get(0), readOptions)) {
+            iterator.seekToFirst();
+            while (iterator.isValid()) {
+              iterator.key();
+              iterator.value();
+              iterator.next();
+            }
+          }
+
+          try (final RocksIterator iterator =
+                   db.newIterator(columnFamilyHandles.get(1), readOptions)) {
+            iterator.seekToFirst();
+            while (iterator.isValid()) {
+              iterator.key();
+              iterator.value();
+              iterator.next();
+            }
+          }
+
+          assertThat(cfNameCollectingTableFilter.cfNames.size()).isEqualTo(2);
+          assertThat(cfNameCollectingTableFilter.cfNames.get(0))
+              .isEqualTo(RocksDB.DEFAULT_COLUMN_FAMILY);
+          assertThat(cfNameCollectingTableFilter.cfNames.get(1))
+              .isEqualTo("new_cf".getBytes(UTF_8));
+        } finally {
+          for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+            columnFamilyHandle.close();
+          }
+        }
+      }
+    }
+  }
+
+  private static class CfNameCollectionTableFilter extends AbstractTableFilter {
+    private final List<byte[]> cfNames = new ArrayList<>();
+
+    @Override
+    public boolean filter(final TableProperties tableProperties) {
+      cfNames.add(tableProperties.getColumnFamilyName());
+      return true;
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TimedEnvTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TimedEnvTest.java
new file mode 100644
index 000000000..c958f96b2
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TimedEnvTest.java
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class TimedEnvTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void construct() throws RocksDBException {
+    try (final Env env = new TimedEnv(Env.getDefault())) {
+      // no-op
+    }
+  }
+
+  @Test
+  public void construct_integration() throws RocksDBException {
+    try (final Env env = new TimedEnv(Env.getDefault());
+         final Options options = new Options()
+             .setCreateIfMissing(true)
+             .setEnv(env);
+    ) {
+      try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getPath())) {
+        db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8));
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TransactionDBOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionDBOptionsTest.java
new file mode 100644
index 000000000..7eaa6b16c
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionDBOptionsTest.java
@@ -0,0 +1,64 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class TransactionDBOptionsTest {
+
+  private static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void maxNumLocks() {
+    try (final TransactionDBOptions opt = new TransactionDBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxNumLocks(longValue);
+      assertThat(opt.getMaxNumLocks()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void maxNumStripes() {
+    try (final TransactionDBOptions opt = new TransactionDBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setNumStripes(longValue);
+      assertThat(opt.getNumStripes()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void transactionLockTimeout() {
+    try (final TransactionDBOptions opt = new TransactionDBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setTransactionLockTimeout(longValue);
+      assertThat(opt.getTransactionLockTimeout()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void defaultLockTimeout() {
+    try (final TransactionDBOptions opt = new TransactionDBOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setDefaultLockTimeout(longValue);
+      assertThat(opt.getDefaultLockTimeout()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void writePolicy() {
+    try (final TransactionDBOptions opt = new TransactionDBOptions()) {
+      final TxnDBWritePolicy writePolicy = TxnDBWritePolicy.WRITE_UNPREPARED;  // non-default
+      opt.setWritePolicy(writePolicy);
+      assertThat(opt.getWritePolicy()).isEqualTo(writePolicy);
+    }
+  }
+
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TransactionDBTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionDBTest.java
new file mode 100644
index 000000000..b0ea813ff
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionDBTest.java
@@ -0,0 +1,178 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.*;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class TransactionDBTest {
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void open() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB tdb = TransactionDB.open(options, txnDbOptions,
+                 dbFolder.getRoot().getAbsolutePath())) {
+      assertThat(tdb).isNotNull();
+    }
+  }
+
+  @Test
+  public void open_columnFamilies() throws RocksDBException {
+    try(final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true)
+          .setCreateMissingColumnFamilies(true);
+        final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) {
+
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          Arrays.asList(
+              new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+              new ColumnFamilyDescriptor("myCf".getBytes(), myCfOpts));
+
+      final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+      try (final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+           final TransactionDB tdb = TransactionDB.open(dbOptions, txnDbOptions,
+               dbFolder.getRoot().getAbsolutePath(),
+               columnFamilyDescriptors, columnFamilyHandles)) {
+        try {
+          assertThat(tdb).isNotNull();
+        } finally {
+          for (final ColumnFamilyHandle handle : columnFamilyHandles) {
+            handle.close();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void beginTransaction() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB tdb = TransactionDB.open(options, txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath());
+        final WriteOptions writeOptions = new WriteOptions()) {
+
+      try(final Transaction txn = tdb.beginTransaction(writeOptions)) {
+        assertThat(txn).isNotNull();
+      }
+    }
+  }
+
+  @Test
+  public void beginTransaction_transactionOptions() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB tdb = TransactionDB.open(options, txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions writeOptions = new WriteOptions();
+         final TransactionOptions txnOptions = new TransactionOptions()) {
+
+      try(final Transaction txn = tdb.beginTransaction(writeOptions,
+          txnOptions)) {
+        assertThat(txn).isNotNull();
+      }
+    }
+  }
+
+  @Test
+  public void beginTransaction_withOld() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB tdb = TransactionDB.open(options, txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions writeOptions = new WriteOptions()) {
+
+      try(final Transaction txn = tdb.beginTransaction(writeOptions)) {
+        final Transaction txnReused = tdb.beginTransaction(writeOptions, txn);
+        assertThat(txnReused).isSameAs(txn);
+      }
+    }
+  }
+
+  @Test
+  public void beginTransaction_withOld_transactionOptions()
+      throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB tdb = TransactionDB.open(options, txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions writeOptions = new WriteOptions();
+         final TransactionOptions txnOptions = new TransactionOptions()) {
+
+      try(final Transaction txn = tdb.beginTransaction(writeOptions)) {
+        final Transaction txnReused = tdb.beginTransaction(writeOptions,
+            txnOptions, txn);
+        assertThat(txnReused).isSameAs(txn);
+      }
+    }
+  }
+
+  @Test
+  public void lockStatusData() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB tdb = TransactionDB.open(options, txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath());
+         final WriteOptions writeOptions = new WriteOptions();
+         final ReadOptions readOptions = new ReadOptions()) {
+
+      try (final Transaction txn = tdb.beginTransaction(writeOptions)) {
+
+        final byte key[] = "key".getBytes(UTF_8);
+        final byte value[] = "value".getBytes(UTF_8);
+
+        txn.put(key, value);
+        assertThat(txn.getForUpdate(readOptions, key, true)).isEqualTo(value);
+
+        final Map<Long, TransactionDB.KeyLockInfo> lockStatus =
+            tdb.getLockStatusData();
+
+        assertThat(lockStatus.size()).isEqualTo(1);
+        final Set<Map.Entry<Long, TransactionDB.KeyLockInfo>> entrySet = lockStatus.entrySet();
+        final Map.Entry<Long, TransactionDB.KeyLockInfo> entry = entrySet.iterator().next();
+        final long columnFamilyId = entry.getKey();
+        assertThat(columnFamilyId).isEqualTo(0);
+        final TransactionDB.KeyLockInfo keyLockInfo = entry.getValue();
+        assertThat(keyLockInfo.getKey()).isEqualTo(new String(key, UTF_8));
+        assertThat(keyLockInfo.getTransactionIDs().length).isEqualTo(1);
+        assertThat(keyLockInfo.getTransactionIDs()[0]).isEqualTo(txn.getId());
+        assertThat(keyLockInfo.isExclusive()).isTrue();
+      }
+    }
+  }
+
+  @Test
+  public void deadlockInfoBuffer() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB tdb = TransactionDB.open(options, txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      // TODO(AR) can we cause a deadlock so that we can test the output here?
+      assertThat(tdb.getDeadlockInfoBuffer()).isEmpty();
+    }
+  }
+
+  @Test
+  public void setDeadlockInfoBufferSize() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB tdb = TransactionDB.open(options, txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath())) {
+      tdb.setDeadlockInfoBufferSize(123);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java
new file mode 100644
index 000000000..3c4dff7bb
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java
@@ -0,0 +1,139 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class TransactionLogIteratorTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void transactionLogIterator() throws RocksDBException {
+    try (final Options options = new Options()
+        .setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath());
+         final TransactionLogIterator transactionLogIterator =
+             db.getUpdatesSince(0)) {
+      //no-op
+    }
+  }
+
+  @Test
+  public void getBatch() throws RocksDBException {
+    final int numberOfPuts = 5;
+    try (final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setWalTtlSeconds(1000)
+        .setWalSizeLimitMB(10);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      for (int i = 0; i < numberOfPuts; i++) {
+        db.put(String.valueOf(i).getBytes(),
+            String.valueOf(i).getBytes());
+      }
+      db.flush(new FlushOptions().setWaitForFlush(true));
+
+      // the latest sequence number is 5 because 5 puts
+      // were written beforehand
+      assertThat(db.getLatestSequenceNumber()).
+          isEqualTo(numberOfPuts);
+
+      // insert 5 writes into a cf
+      try (final ColumnFamilyHandle cfHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor("new_cf".getBytes()))) {
+        for (int i = 0; i < numberOfPuts; i++) {
+          db.put(cfHandle, String.valueOf(i).getBytes(),
+              String.valueOf(i).getBytes());
+        }
+        // the latest sequence number is 10 because
+        // (5 + 5) puts were written beforehand
+        assertThat(db.getLatestSequenceNumber()).
+            isEqualTo(numberOfPuts + numberOfPuts);
+
+        // Get updates since the beginning
+        try (final TransactionLogIterator transactionLogIterator =
+                 db.getUpdatesSince(0)) {
+          assertThat(transactionLogIterator.isValid()).isTrue();
+          transactionLogIterator.status();
+
+          // The first sequence number is 1
+          final TransactionLogIterator.BatchResult batchResult =
+              transactionLogIterator.getBatch();
+          assertThat(batchResult.sequenceNumber()).isEqualTo(1);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void transactionLogIteratorStallAtLastRecord()
+      throws RocksDBException {
+    try (final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setWalTtlSeconds(1000)
+        .setWalSizeLimitMB(10);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      db.put("key1".getBytes(), "value1".getBytes());
+      // Get updates since the beginning
+      try (final TransactionLogIterator transactionLogIterator =
+               db.getUpdatesSince(0)) {
+        transactionLogIterator.status();
+        assertThat(transactionLogIterator.isValid()).isTrue();
+        transactionLogIterator.next();
+        assertThat(transactionLogIterator.isValid()).isFalse();
+        transactionLogIterator.status();
+        db.put("key2".getBytes(), "value2".getBytes());
+        transactionLogIterator.next();
+        transactionLogIterator.status();
+        assertThat(transactionLogIterator.isValid()).isTrue();
+      }
+    }
+  }
+
+  @Test
+  public void transactionLogIteratorCheckAfterRestart()
+      throws RocksDBException {
+    final int numberOfKeys = 2;
+    try (final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setWalTtlSeconds(1000)
+        .setWalSizeLimitMB(10)) {
+
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath())) {
+        db.put("key1".getBytes(), "value1".getBytes());
+        db.put("key2".getBytes(), "value2".getBytes());
+        db.flush(new FlushOptions().setWaitForFlush(true));
+
+      }
+
+      // reopen
+      try (final RocksDB db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath())) {
+        assertThat(db.getLatestSequenceNumber()).isEqualTo(numberOfKeys);
+
+        try (final TransactionLogIterator transactionLogIterator =
+                 db.getUpdatesSince(0)) {
+          for (int i = 0; i < numberOfKeys; i++) {
+            transactionLogIterator.status();
+            assertThat(transactionLogIterator.isValid()).isTrue();
+            transactionLogIterator.next();
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TransactionOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionOptionsTest.java
new file mode 100644
index 000000000..add0439e0
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionOptionsTest.java
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class TransactionOptionsTest {
+
+  private static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void snapshot() {
+    try (final TransactionOptions opt = new TransactionOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setSetSnapshot(boolValue);
+      assertThat(opt.isSetSnapshot()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void deadlockDetect() {
+    try (final TransactionOptions opt = new TransactionOptions()) {
+      final boolean boolValue = rand.nextBoolean();
+      opt.setDeadlockDetect(boolValue);
+      assertThat(opt.isDeadlockDetect()).isEqualTo(boolValue);
+    }
+  }
+
+  @Test
+  public void lockTimeout() {
+    try (final TransactionOptions opt = new TransactionOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setLockTimeout(longValue);
+      assertThat(opt.getLockTimeout()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void expiration() {
+    try (final TransactionOptions opt = new TransactionOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setExpiration(longValue);
+      assertThat(opt.getExpiration()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void deadlockDetectDepth() {
+    try (final TransactionOptions opt = new TransactionOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setDeadlockDetectDepth(longValue);
+      assertThat(opt.getDeadlockDetectDepth()).isEqualTo(longValue);
+    }
+  }
+
+  @Test
+  public void maxWriteBatchSize() {
+    try (final TransactionOptions opt = new TransactionOptions()) {
+      final long longValue = rand.nextLong();
+      opt.setMaxWriteBatchSize(longValue);
+      assertThat(opt.getMaxWriteBatchSize()).isEqualTo(longValue);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java
new file mode 100644
index 000000000..8a3067de9
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java
@@ -0,0 +1,488 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.fail;
+
+public class TransactionTest extends AbstractTransactionTest {
+
+  @Test
+  public void getForUpdate_cf_conflict() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(testCf, k1, v1);
+        assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1);
+
+          // NOTE: txn2 updates k1, during txn3
+          try {
+            txn2.put(testCf, k1, v12); // should cause an exception!
+          } catch(final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting" +
+          "transactions");
+    }
+  }
+
+  @Test
+  public void prepare_commit() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        txn.commit();
+      }
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.setName("txnPrepare1");
+        txn.put(k1, v12);
+        txn.prepare();
+        txn.commit();
+      }
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v12);
+      }
+    }
+  }
+
+  @Test
+  public void prepare_rollback() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        txn.commit();
+      }
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.setName("txnPrepare1");
+        txn.put(k1, v12);
+        txn.prepare();
+        txn.rollback();
+      }
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+      }
+    }
+  }
+
+  @Test
+  public void prepare_read_prepared_commit() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        txn.commit();
+      }
+
+      Transaction txnPrepare;
+      txnPrepare = dbContainer.beginTransaction();
+      txnPrepare.setName("txnPrepare1");
+      txnPrepare.put(k1, v12);
+      txnPrepare.prepare();
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+      }
+
+      txnPrepare.commit();
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v12);
+      }
+    }
+  }
+
+  @Test
+  public void prepare_read_prepared_rollback() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        txn.commit();
+      }
+
+      Transaction txnPrepare;
+      txnPrepare = dbContainer.beginTransaction();
+      txnPrepare.setName("txnPrepare1");
+      txnPrepare.put(k1, v12);
+      txnPrepare.prepare();
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+      }
+
+      txnPrepare.rollback();
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        assertThat(txn.get(readOptions, k1)).isEqualTo(v1);
+      }
+    }
+  }
+
+  @Test
+  public void getForUpdate_conflict() throws RocksDBException {
+    final byte[] k1 = "key1".getBytes(UTF_8);
+    final byte[] v1 = "value1".getBytes(UTF_8);
+    final byte[] v12 = "value12".getBytes(UTF_8);
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(k1, v1);
+        assertThat(txn.getForUpdate(readOptions, k1, true)).isEqualTo(v1);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.getForUpdate(readOptions, k1, true)).isEqualTo(v1);
+
+          // NOTE: txn2 updates k1, during txn3
+          try {
+            txn2.put(k1, v12); // should cause an exception!
+          } catch(final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting" +
+          "transactions");
+    }
+  }
+
+  @Test
+  public void multiGetForUpdate_cf_conflict() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+    final byte[] otherValue = "otherValue".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf);
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(testCf, keys[0], values[0]);
+        txn.put(testCf, keys[1], values[1]);
+        assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.multiGetForUpdate(readOptions, cfList, keys))
+              .isEqualTo(values);
+
+          // NOTE: txn2 updates k1, during txn3
+          try {
+            txn2.put(testCf, keys[0], otherValue); // should cause an exception!
+          } catch(final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting" +
+          "transactions");
+    }
+  }
+
+  @Test
+  public void multiGetAsListForUpdate_cf_conflict() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+    final byte[] otherValue = "otherValue".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+      final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf);
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(testCf, keys[0], values[0]);
+        txn.put(testCf, keys[1], values[1]);
+        assertThat(txn.multiGetAsList(readOptions, cfList, Arrays.asList(keys)))
+            .containsExactly(values);
+        txn.commit();
+      }
+
+      try (final Transaction txn2 = dbContainer.beginTransaction()) {
+        try (final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.multiGetForUpdateAsList(readOptions, cfList, Arrays.asList(keys)))
+              .containsExactly(values);
+
+          // NOTE: txn2 updates k1, during txn3
+          try {
+            txn2.put(testCf, keys[0], otherValue); // should cause an exception!
+          } catch (final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting"
+          + "transactions");
+    }
+  }
+
+  @Test
+  public void multiGetForUpdate_conflict() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+    final byte[] otherValue = "otherValue".getBytes(UTF_8);
+
+    try(final DBContainer dbContainer = startDb();
+        final ReadOptions readOptions = new ReadOptions()) {
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(keys[0], values[0]);
+        txn.put(keys[1], values[1]);
+        assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values);
+        txn.commit();
+      }
+
+      try(final Transaction txn2 = dbContainer.beginTransaction()) {
+        try(final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.multiGetForUpdate(readOptions, keys))
+              .isEqualTo(values);
+
+          // NOTE: txn2 updates k1, during txn3
+          try {
+            txn2.put(keys[0], otherValue); // should cause an exception!
+          } catch (final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting"
+          + "transactions");
+    }
+  }
+
+  @Test
+  public void multiGetAsListForUpdate_conflict() throws RocksDBException {
+    final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)};
+    final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)};
+    final byte[] otherValue = "otherValue".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(keys[0], values[0]);
+        txn.put(keys[1], values[1]);
+        assertThat(txn.multiGetAsList(readOptions, Arrays.asList(keys))).containsExactly(values);
+        txn.commit();
+      }
+
+      try (final Transaction txn2 = dbContainer.beginTransaction()) {
+        try (final Transaction txn3 = dbContainer.beginTransaction()) {
+          assertThat(txn3.multiGetForUpdateAsList(readOptions, Arrays.asList(keys)))
+              .containsExactly(values);
+
+          // NOTE: txn2 updates k1, during txn3
+          try {
+            txn2.put(keys[0], otherValue); // should cause an exception!
+          } catch(final RocksDBException e) {
+            assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut);
+            return;
+          }
+        }
+      }
+
+      fail("Expected an exception for put after getForUpdate from conflicting" +
+          "transactions");
+    }
+  }
+
+  @Test
+  public void name() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.getName()).isEmpty();
+      final String name = "my-transaction-" + rand.nextLong();
+      txn.setName(name);
+      assertThat(txn.getName()).isEqualTo(name);
+    }
+  }
+
+  @Test
+  public void ID() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.getID()).isGreaterThan(0);
+    }
+  }
+
+  @Test
+  public void deadlockDetect() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.isDeadlockDetect()).isFalse();
+    }
+  }
+
+  @Test
+  public void waitingTxns() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.getWaitingTxns().getTransactionIds().length).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void state() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb()) {
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        assertThat(txn.getState())
+            .isSameAs(Transaction.TransactionState.STARTED);
+        txn.commit();
+        assertThat(txn.getState())
+            .isSameAs(Transaction.TransactionState.COMMITTED);
+      }
+
+      try(final Transaction txn = dbContainer.beginTransaction()) {
+        assertThat(txn.getState())
+            .isSameAs(Transaction.TransactionState.STARTED);
+        txn.rollback();
+        assertThat(txn.getState())
+            .isSameAs(Transaction.TransactionState.STARTED);
+      }
+    }
+  }
+
+  @Test
+  public void Id() throws RocksDBException {
+    try(final DBContainer dbContainer = startDb();
+        final Transaction txn = dbContainer.beginTransaction()) {
+      assertThat(txn.getId()).isNotNull();
+    }
+  }
+
+  @Override
+  public TransactionDBContainer startDb() throws RocksDBException {
+    final DBOptions options = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true);
+    final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+    final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions();
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+        Arrays.asList(
+            new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY,
+                columnFamilyOptions));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+
+    final TransactionDB txnDb;
+    try {
+      txnDb = TransactionDB.open(options, txnDbOptions,
+          dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors,
+              columnFamilyHandles);
+    } catch(final RocksDBException e) {
+      columnFamilyOptions.close();
+      txnDbOptions.close();
+      options.close();
+      throw e;
+    }
+
+    final WriteOptions writeOptions = new WriteOptions();
+    final TransactionOptions txnOptions = new TransactionOptions();
+
+    return new TransactionDBContainer(txnOptions, writeOptions,
+        columnFamilyHandles, txnDb, txnDbOptions, columnFamilyOptions, options);
+  }
+
+  private static class TransactionDBContainer
+      extends DBContainer {
+    private final TransactionOptions txnOptions;
+    private final TransactionDB txnDb;
+    private final TransactionDBOptions txnDbOptions;
+
+    public TransactionDBContainer(
+        final TransactionOptions txnOptions, final WriteOptions writeOptions,
+        final List<ColumnFamilyHandle> columnFamilyHandles,
+        final TransactionDB txnDb, final TransactionDBOptions txnDbOptions,
+        final ColumnFamilyOptions columnFamilyOptions,
+        final DBOptions options) {
+      super(writeOptions, columnFamilyHandles, columnFamilyOptions,
+          options);
+      this.txnOptions = txnOptions;
+      this.txnDb = txnDb;
+      this.txnDbOptions = txnDbOptions;
+    }
+
+    @Override
+    public Transaction beginTransaction() {
+      return txnDb.beginTransaction(writeOptions, txnOptions);
+    }
+
+    @Override
+    public Transaction beginTransaction(final WriteOptions writeOptions) {
+      return txnDb.beginTransaction(writeOptions, txnOptions);
+    }
+
+    @Override
+    public void close() {
+      txnOptions.close();
+      writeOptions.close();
+      for(final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+      txnDb.close();
+      txnDbOptions.close();
+      options.close();
+    }
+  }
+
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java
new file mode 100644
index 000000000..ffa15e768
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class TtlDBTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void ttlDBOpen() throws RocksDBException, InterruptedException {
+    try (final Options options = new Options().setCreateIfMissing(true).setMaxCompactionBytes(0);
+         final TtlDB ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      ttlDB.put("key".getBytes(), "value".getBytes());
+      assertThat(ttlDB.get("key".getBytes())).
+          isEqualTo("value".getBytes());
+      assertThat(ttlDB.get("key".getBytes())).isNotNull();
+    }
+  }
+
+  @Test
+  public void ttlDBOpenWithTtl() throws RocksDBException, InterruptedException {
+    try (final Options options = new Options().setCreateIfMissing(true).setMaxCompactionBytes(0);
+         final TtlDB ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), 1, false);) {
+      ttlDB.put("key".getBytes(), "value".getBytes());
+      assertThat(ttlDB.get("key".getBytes())).
+          isEqualTo("value".getBytes());
+      TimeUnit.SECONDS.sleep(2);
+      ttlDB.compactRange();
+      assertThat(ttlDB.get("key".getBytes())).isNull();
+    }
+  }
+
+  @Test
+  public void ttlDbOpenWithColumnFamilies() throws RocksDBException,
+      InterruptedException {
+    final List<ColumnFamilyDescriptor> cfNames = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes())
+    );
+    final List<Integer> ttlValues = Arrays.asList(0, 1);
+
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions dbOptions = new DBOptions()
+        .setCreateMissingColumnFamilies(true)
+        .setCreateIfMissing(true);
+         final TtlDB ttlDB = TtlDB.open(dbOptions,
+             dbFolder.getRoot().getAbsolutePath(), cfNames,
+             columnFamilyHandleList, ttlValues, false)) {
+      try {
+        ttlDB.put("key".getBytes(), "value".getBytes());
+        assertThat(ttlDB.get("key".getBytes())).
+            isEqualTo("value".getBytes());
+        ttlDB.put(columnFamilyHandleList.get(1), "key".getBytes(),
+            "value".getBytes());
+        assertThat(ttlDB.get(columnFamilyHandleList.get(1),
+            "key".getBytes())).isEqualTo("value".getBytes());
+        TimeUnit.SECONDS.sleep(2);
+
+        ttlDB.compactRange();
+        ttlDB.compactRange(columnFamilyHandleList.get(1));
+
+        assertThat(ttlDB.get("key".getBytes())).isNotNull();
+        assertThat(ttlDB.get(columnFamilyHandleList.get(1),
+            "key".getBytes())).isNull();
+      } finally {
+        for (final ColumnFamilyHandle columnFamilyHandle :
+            columnFamilyHandleList) {
+          columnFamilyHandle.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void createTtlColumnFamily() throws RocksDBException,
+      InterruptedException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TtlDB ttlDB = TtlDB.open(options,
+             dbFolder.getRoot().getAbsolutePath());
+         final ColumnFamilyHandle columnFamilyHandle =
+             ttlDB.createColumnFamilyWithTtl(
+                 new ColumnFamilyDescriptor("new_cf".getBytes()), 1)) {
+      ttlDB.put(columnFamilyHandle, "key".getBytes(),
+          "value".getBytes());
+      assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).
+          isEqualTo("value".getBytes());
+      TimeUnit.SECONDS.sleep(2);
+      ttlDB.compactRange(columnFamilyHandle);
+      assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).isNull();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/Types.java b/src/rocksdb/java/src/test/java/org/rocksdb/Types.java
new file mode 100644
index 000000000..c3c1de833
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/Types.java
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * Simple type conversion methods
+ * for use in tests
+ */
+public class Types {
+
+  /**
+   * Convert first 4 bytes of a byte array to an int
+   *
+   * @param data The byte array
+   *
+   * @return An integer
+   */
+  public static int byteToInt(final byte data[]) {
+    return (data[0] & 0xff) |
+        ((data[1] & 0xff) << 8) |
+        ((data[2] & 0xff) << 16) |
+        ((data[3] & 0xff) << 24);
+  }
+
+  /**
+   * Convert an int to 4 bytes
+   *
+   * @param v The int
+   *
+   * @return A byte array containing 4 bytes
+   */
+  public static byte[] intToByte(final int v) {
+    return new byte[] {
+        (byte)((v >>> 0) & 0xff),
+        (byte)((v >>> 8) & 0xff),
+        (byte)((v >>> 16) & 0xff),
+        (byte)((v >>> 24) & 0xff)
+    };
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/VerifyChecksumsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/VerifyChecksumsTest.java
new file mode 100644
index 000000000..ddc2a456f
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/VerifyChecksumsTest.java
@@ -0,0 +1,213 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.text.MessageFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class VerifyChecksumsTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  /**
+   * Class to factor out the specific DB operations within the test
+   */
+  abstract static class Operations {
+    final int kv_count;
+    final List<String> elements = new ArrayList<>();
+    final List<String> sortedElements = new ArrayList<>();
+
+    Operations(final int kv_count) {
+      this.kv_count = kv_count;
+      for (int i = 0; i < kv_count; i++) elements.add(MessageFormat.format("{0,number,#}", i));
+      sortedElements.addAll(elements);
+      Collections.sort(sortedElements);
+    }
+
+    void fill(final RocksDB db) throws RocksDBException {
+      for (int i = 0; i < kv_count; i++) {
+        final String key = MessageFormat.format("key{0}", elements.get(i));
+        final String value = MessageFormat.format("value{0}", elements.get(i));
+        // noinspection ObjectAllocationInLoop
+        db.put(key.getBytes(), value.getBytes());
+      }
+      db.flush(new FlushOptions());
+    }
+
+    @SuppressWarnings("ObjectAllocationInLoop")
+    void get(final RocksDB db, final boolean verifyFlag) throws RocksDBException {
+      try (final ReadOptions readOptions = new ReadOptions()) {
+        readOptions.setReadaheadSize(32 * 1024);
+        readOptions.setFillCache(false);
+        readOptions.setVerifyChecksums(verifyFlag);
+
+        for (int i = 0; i < kv_count / 10; i++) {
+          @SuppressWarnings("UnsecureRandomNumberGeneration")
+          final int index = Double.valueOf(Math.random() * kv_count).intValue();
+          final String key = MessageFormat.format("key{0}", sortedElements.get(index));
+          final String expectedValue = MessageFormat.format("value{0}", sortedElements.get(index));
+
+          final byte[] value = db.get(readOptions, key.getBytes());
+          assertThat(value).isEqualTo(expectedValue.getBytes());
+        }
+      }
+    }
+
+    @SuppressWarnings("ObjectAllocationInLoop")
+    void multiGet(final RocksDB db, final boolean verifyFlag) throws RocksDBException {
+      try (final ReadOptions readOptions = new ReadOptions()) {
+        readOptions.setReadaheadSize(32 * 1024);
+        readOptions.setFillCache(false);
+        readOptions.setVerifyChecksums(verifyFlag);
+
+        final List<byte[]> keys = new ArrayList<>();
+        final List<String> expectedValues = new ArrayList<>();
+
+        for (int i = 0; i < kv_count / 10; i++) {
+          @SuppressWarnings("UnsecureRandomNumberGeneration")
+          final int index = Double.valueOf(Math.random() * kv_count).intValue();
+          keys.add(MessageFormat.format("key{0}", sortedElements.get(index)).getBytes());
+
+          expectedValues.add(MessageFormat.format("value{0}", sortedElements.get(index)));
+        }
+
+        final List<byte[]> values = db.multiGetAsList(readOptions, keys);
+        for (int i = 0; i < keys.size(); i++) {
+          assertThat(values.get(i)).isEqualTo(expectedValues.get(i).getBytes());
+        }
+      }
+    }
+
+    void iterate(final RocksDB db, final boolean verifyFlag) throws RocksDBException {
+      final ReadOptions readOptions = new ReadOptions();
+      readOptions.setReadaheadSize(32 * 1024);
+      readOptions.setFillCache(false);
+      readOptions.setVerifyChecksums(verifyFlag);
+      int i = 0;
+      try (final RocksIterator rocksIterator = db.newIterator(readOptions)) {
+        rocksIterator.seekToFirst();
+        rocksIterator.status();
+        while (rocksIterator.isValid()) {
+          final byte[] key = rocksIterator.key();
+          final byte[] value = rocksIterator.value();
+          // noinspection ObjectAllocationInLoop
+          assertThat(key).isEqualTo(
+              (MessageFormat.format("key{0}", sortedElements.get(i))).getBytes());
+          // noinspection ObjectAllocationInLoop
+          assertThat(value).isEqualTo(
+              (MessageFormat.format("value{0}", sortedElements.get(i))).getBytes());
+          rocksIterator.next();
+          rocksIterator.status();
+          i++;
+        }
+      }
+      assertThat(i).isEqualTo(kv_count);
+    }
+
+    abstract void performOperations(final RocksDB db, final boolean verifyFlag)
+        throws RocksDBException;
+  }
+
+  private static final int KV_COUNT = 10000;
+
+  /**
+   * Run some operations and count the TickerType.BLOCK_CHECKSUM_COMPUTE_COUNT before and after
+   * It should GO UP when the read options have checksum verification turned on.
+   * It shoulld REMAIN UNCHANGED when the read options have checksum verification turned off.
+   * As the read options refer only to the read operations, there are still a few checksums
+   * performed outside this (blocks are getting loaded for lots of reasons, not aways directly due
+   * to reads) but this test provides a good enough proxy for whether the flag is being noticed.
+   *
+   * @param operations the DB reading operations to perform which affect the checksum stats
+   *
+   * @throws RocksDBException
+   */
+  private void verifyChecksums(final Operations operations) throws RocksDBException {
+    final String dbPath = dbFolder.getRoot().getAbsolutePath();
+
+    // noinspection SingleStatementInBlock
+    try (final Statistics statistics = new Statistics();
+         final Options options = new Options().setCreateIfMissing(true).setStatistics(statistics)) {
+      try (final RocksDB db = RocksDB.open(options, dbPath)) {
+        // 0
+        System.out.println(MessageFormat.format(
+            "newly open {0}", statistics.getTickerCount(TickerType.BLOCK_CHECKSUM_COMPUTE_COUNT)));
+        operations.fill(db);
+        //
+        System.out.println(MessageFormat.format(
+            "flushed {0}", statistics.getTickerCount(TickerType.BLOCK_CHECKSUM_COMPUTE_COUNT)));
+      }
+
+      // 2
+      System.out.println(MessageFormat.format("closed-after-write {0}",
+          statistics.getTickerCount(TickerType.BLOCK_CHECKSUM_COMPUTE_COUNT)));
+
+      for (final boolean verifyFlag : new boolean[] {false, true, false, true}) {
+        try (final RocksDB db = RocksDB.open(options, dbPath)) {
+          final long beforeOperationsCount =
+              statistics.getTickerCount(TickerType.BLOCK_CHECKSUM_COMPUTE_COUNT);
+          System.out.println(MessageFormat.format("re-opened {0}", beforeOperationsCount));
+          operations.performOperations(db, verifyFlag);
+          final long afterOperationsCount =
+              statistics.getTickerCount(TickerType.BLOCK_CHECKSUM_COMPUTE_COUNT);
+          if (verifyFlag) {
+            // We don't need to be exact - we are checking that the checksums happen
+            // exactly how many depends on block size etc etc, so may not be entirely stable
+            System.out.println(MessageFormat.format("verify=true {0}", afterOperationsCount));
+            assertThat(afterOperationsCount).isGreaterThan(beforeOperationsCount + 20);
+          } else {
+            System.out.println(MessageFormat.format("verify=false {0}", afterOperationsCount));
+            assertThat(afterOperationsCount).isEqualTo(beforeOperationsCount);
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void verifyChecksumsInIteration() throws RocksDBException {
+    // noinspection AnonymousInnerClassMayBeStatic
+    verifyChecksums(new Operations(KV_COUNT) {
+      @Override
+      void performOperations(final RocksDB db, final boolean verifyFlag) throws RocksDBException {
+        iterate(db, verifyFlag);
+      }
+    });
+  }
+
+  @Test
+  public void verifyChecksumsGet() throws RocksDBException {
+    // noinspection AnonymousInnerClassMayBeStatic
+    verifyChecksums(new Operations(KV_COUNT) {
+      @Override
+      void performOperations(final RocksDB db, final boolean verifyFlag) throws RocksDBException {
+        get(db, verifyFlag);
+      }
+    });
+  }
+
+  @Test
+  public void verifyChecksumsMultiGet() throws RocksDBException {
+    // noinspection AnonymousInnerClassMayBeStatic
+    verifyChecksums(new Operations(KV_COUNT) {
+      @Override
+      void performOperations(final RocksDB db, final boolean verifyFlag) throws RocksDBException {
+        multiGet(db, verifyFlag);
+      }
+    });
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WALRecoveryModeTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WALRecoveryModeTest.java
new file mode 100644
index 000000000..2a0133f6b
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WALRecoveryModeTest.java
@@ -0,0 +1,22 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+
+public class WALRecoveryModeTest {
+
+  @Test
+  public void getWALRecoveryMode() {
+    for (final WALRecoveryMode walRecoveryMode : WALRecoveryMode.values()) {
+      assertThat(WALRecoveryMode.getWALRecoveryMode(walRecoveryMode.getValue()))
+          .isEqualTo(walRecoveryMode);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WalFilterTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WalFilterTest.java
new file mode 100644
index 000000000..adeb959d1
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WalFilterTest.java
@@ -0,0 +1,165 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.rocksdb.util.ByteUtil.bytes;
+import static org.rocksdb.util.TestUtil.*;
+
+public class WalFilterTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void walFilter() throws RocksDBException {
+    // Create 3 batches with two keys each
+    final byte[][][] batchKeys = {
+        new byte[][] {
+            bytes("key1"),
+            bytes("key2")
+        },
+        new byte[][] {
+            bytes("key3"),
+            bytes("key4")
+        },
+        new byte[][] {
+            bytes("key5"),
+            bytes("key6")
+        }
+
+    };
+
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor(bytes("pikachu"))
+    );
+    final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+
+    // Test with all WAL processing options
+    for (final WalProcessingOption option : WalProcessingOption.values()) {
+      try (final Options options = optionsForLogIterTest();
+           final DBOptions dbOptions = new DBOptions(options)
+               .setCreateMissingColumnFamilies(true);
+           final RocksDB db = RocksDB.open(dbOptions,
+               dbFolder.getRoot().getAbsolutePath(),
+                cfDescriptors, cfHandles)) {
+        try (final WriteOptions writeOptions = new WriteOptions()) {
+          // Write given keys in given batches
+          for (int i = 0; i < batchKeys.length; i++) {
+            final WriteBatch batch = new WriteBatch();
+            for (int j = 0; j < batchKeys[i].length; j++) {
+              batch.put(cfHandles.get(0), batchKeys[i][j], dummyString(1024));
+            }
+            db.write(writeOptions, batch);
+          }
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+          cfHandles.clear();
+        }
+      }
+
+      // Create a test filter that would apply wal_processing_option at the first
+      // record
+      final int applyOptionForRecordIndex = 1;
+      try (final TestableWalFilter walFilter =
+               new TestableWalFilter(option, applyOptionForRecordIndex)) {
+
+        try (final Options options = optionsForLogIterTest();
+             final DBOptions dbOptions = new DBOptions(options)
+                .setWalFilter(walFilter)) {
+
+          try (final RocksDB db = RocksDB.open(dbOptions,
+              dbFolder.getRoot().getAbsolutePath(),
+              cfDescriptors, cfHandles)) {
+
+            try {
+              assertThat(walFilter.logNumbers).isNotEmpty();
+              assertThat(walFilter.logFileNames).isNotEmpty();
+            } finally {
+              for (final ColumnFamilyHandle cfHandle : cfHandles) {
+                cfHandle.close();
+              }
+              cfHandles.clear();
+            }
+          } catch (final RocksDBException e) {
+            if (option != WalProcessingOption.CORRUPTED_RECORD) {
+              // exception is expected when CORRUPTED_RECORD!
+              throw e;
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  private static class TestableWalFilter extends AbstractWalFilter {
+    private final WalProcessingOption walProcessingOption;
+    private final int applyOptionForRecordIndex;
+    Map<Integer, Long> cfLognumber;
+    Map<String, Integer> cfNameId;
+    final List<Long> logNumbers = new ArrayList<>();
+    final List<String> logFileNames = new ArrayList<>();
+    private int currentRecordIndex = 0;
+
+    public TestableWalFilter(final WalProcessingOption walProcessingOption,
+        final int applyOptionForRecordIndex) {
+      super();
+      this.walProcessingOption = walProcessingOption;
+      this.applyOptionForRecordIndex = applyOptionForRecordIndex;
+    }
+
+    @Override
+    public void columnFamilyLogNumberMap(final Map<Integer, Long> cfLognumber,
+        final Map<String, Integer> cfNameId) {
+      this.cfLognumber = cfLognumber;
+      this.cfNameId = cfNameId;
+    }
+
+    @Override
+    public LogRecordFoundResult logRecordFound(
+        final long logNumber, final String logFileName, final WriteBatch batch,
+        final WriteBatch newBatch) {
+
+      logNumbers.add(logNumber);
+      logFileNames.add(logFileName);
+
+      final WalProcessingOption optionToReturn;
+      if (currentRecordIndex == applyOptionForRecordIndex) {
+        optionToReturn = walProcessingOption;
+      }
+      else {
+        optionToReturn = WalProcessingOption.CONTINUE_PROCESSING;
+      }
+
+      currentRecordIndex++;
+
+      return new LogRecordFoundResult(optionToReturn, false);
+    }
+
+    @Override
+    public String name() {
+      return "testable-wal-filter";
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java
new file mode 100644
index 000000000..2826b128f
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java
@@ -0,0 +1,76 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.rocksdb.util.CapturingWriteBatchHandler;
+import org.rocksdb.util.CapturingWriteBatchHandler.Event;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.rocksdb.util.CapturingWriteBatchHandler.Action.*;
+
+
+public class WriteBatchHandlerTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Test
+  public void writeBatchHandler() throws RocksDBException {
+    // setup test data
+    final List<Event> testEvents = Arrays.asList(
+        new Event(DELETE, "k0".getBytes(), null),
+        new Event(PUT, "k1".getBytes(), "v1".getBytes()),
+        new Event(PUT, "k2".getBytes(), "v2".getBytes()),
+        new Event(PUT, "k3".getBytes(), "v3".getBytes()),
+        new Event(LOG, null, "log1".getBytes()),
+        new Event(MERGE, "k2".getBytes(), "v22".getBytes()),
+        new Event(DELETE, "k3".getBytes(), null)
+    );
+
+    // load test data to the write batch
+    try (final WriteBatch batch = new WriteBatch()) {
+      for (final Event testEvent : testEvents) {
+        switch (testEvent.action) {
+
+          case PUT:
+            batch.put(testEvent.key, testEvent.value);
+            break;
+
+          case MERGE:
+            batch.merge(testEvent.key, testEvent.value);
+            break;
+
+          case DELETE:
+            batch.delete(testEvent.key);
+            break;
+
+          case LOG:
+            batch.putLogData(testEvent.value);
+            break;
+        }
+      }
+
+      // attempt to read test data back from the WriteBatch by iterating
+      // with a handler
+      try (final CapturingWriteBatchHandler handler =
+               new CapturingWriteBatchHandler()) {
+        batch.iterate(handler);
+
+        // compare the results to the test data
+        final List<Event> actualEvents =
+            handler.getEvents();
+        assertThat(testEvents.size()).isSameAs(actualEvents.size());
+
+        assertThat(testEvents).isEqualTo(actualEvents);
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java
new file mode 100644
index 000000000..cc3ad26eb
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java
@@ -0,0 +1,528 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+package org.rocksdb;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.rocksdb.util.CapturingWriteBatchHandler.Action.DELETE;
+import static org.rocksdb.util.CapturingWriteBatchHandler.Action.DELETE_RANGE;
+import static org.rocksdb.util.CapturingWriteBatchHandler.Action.LOG;
+import static org.rocksdb.util.CapturingWriteBatchHandler.Action.MERGE;
+import static org.rocksdb.util.CapturingWriteBatchHandler.Action.PUT;
+import static org.rocksdb.util.CapturingWriteBatchHandler.Action.SINGLE_DELETE;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.util.CapturingWriteBatchHandler;
+import org.rocksdb.util.CapturingWriteBatchHandler.Event;
+import org.rocksdb.util.WriteBatchGetter;
+
+/**
+ * This class mimics the db/write_batch_test.cc
+ * in the c++ rocksdb library.
+ */
+public class WriteBatchTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void emptyWriteBatch() {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.count()).isEqualTo(0);
+    }
+  }
+
+  @Test
+  public void multipleBatchOperations()
+      throws RocksDBException {
+
+    final byte[] foo = "foo".getBytes(UTF_8);
+    final byte[] bar = "bar".getBytes(UTF_8);
+    final byte[] box = "box".getBytes(UTF_8);
+    final byte[] baz = "baz".getBytes(UTF_8);
+    final byte[] boo = "boo".getBytes(UTF_8);
+    final byte[] hoo = "hoo".getBytes(UTF_8);
+    final byte[] hello = "hello".getBytes(UTF_8);
+
+    try (final WriteBatch batch = new WriteBatch()) {
+      batch.put(foo, bar);
+      batch.delete(box);
+      batch.put(baz, boo);
+      batch.merge(baz, hoo);
+      batch.singleDelete(foo);
+      batch.deleteRange(baz, foo);
+      batch.putLogData(hello);
+
+      try(final CapturingWriteBatchHandler handler =
+              new CapturingWriteBatchHandler()) {
+        batch.iterate(handler);
+
+        assertThat(handler.getEvents().size()).isEqualTo(7);
+
+        assertThat(handler.getEvents().get(0)).isEqualTo(new Event(PUT, foo, bar));
+        assertThat(handler.getEvents().get(1)).isEqualTo(new Event(DELETE, box, null));
+        assertThat(handler.getEvents().get(2)).isEqualTo(new Event(PUT, baz, boo));
+        assertThat(handler.getEvents().get(3)).isEqualTo(new Event(MERGE, baz, hoo));
+        assertThat(handler.getEvents().get(4)).isEqualTo(new Event(SINGLE_DELETE, foo, null));
+        assertThat(handler.getEvents().get(5)).isEqualTo(new Event(DELETE_RANGE, baz, foo));
+        assertThat(handler.getEvents().get(6)).isEqualTo(new Event(LOG, null, hello));
+      }
+    }
+  }
+
+  @Test
+  public void multipleBatchOperationsDirect()
+      throws UnsupportedEncodingException, RocksDBException {
+    try (WriteBatch batch = new WriteBatch()) {
+      ByteBuffer key = ByteBuffer.allocateDirect(16);
+      ByteBuffer value = ByteBuffer.allocateDirect(16);
+      key.put("foo".getBytes("US-ASCII")).flip();
+      value.put("bar".getBytes("US-ASCII")).flip();
+      batch.put(key, value);
+      assertThat(key.position()).isEqualTo(3);
+      assertThat(key.limit()).isEqualTo(3);
+      assertThat(value.position()).isEqualTo(3);
+      assertThat(value.limit()).isEqualTo(3);
+
+      key.clear();
+      key.put("box".getBytes("US-ASCII")).flip();
+      batch.delete(key);
+      assertThat(key.position()).isEqualTo(3);
+      assertThat(key.limit()).isEqualTo(3);
+
+      batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII"));
+
+      WriteBatchTestInternalHelper.setSequence(batch, 100);
+      assertThat(WriteBatchTestInternalHelper.sequence(batch)).isNotNull().isEqualTo(100);
+      assertThat(batch.count()).isEqualTo(3);
+      assertThat(new String(getContents(batch), "US-ASCII"))
+          .isEqualTo("Put(baz, boo)@102"
+              + "Delete(box)@101"
+              + "Put(foo, bar)@100");
+    }
+  }
+
+  @Test
+  public void testAppendOperation()
+      throws RocksDBException {
+    try (final WriteBatch b1 = new WriteBatch();
+         final WriteBatch b2 = new WriteBatch()) {
+      WriteBatchTestInternalHelper.setSequence(b1, 200);
+      WriteBatchTestInternalHelper.setSequence(b2, 300);
+      WriteBatchTestInternalHelper.append(b1, b2);
+      assertThat(getContents(b1).length).isEqualTo(0);
+      assertThat(b1.count()).isEqualTo(0);
+      b2.put("a".getBytes(UTF_8), "va".getBytes(UTF_8));
+      WriteBatchTestInternalHelper.append(b1, b2);
+      assertThat("Put(a, va)@200".equals(new String(getContents(b1),
+          UTF_8)));
+      assertThat(b1.count()).isEqualTo(1);
+      b2.clear();
+      b2.put("b".getBytes(UTF_8), "vb".getBytes(UTF_8));
+      WriteBatchTestInternalHelper.append(b1, b2);
+      assertThat(("Put(a, va)@200" +
+          "Put(b, vb)@201")
+          .equals(new String(getContents(b1), UTF_8)));
+      assertThat(b1.count()).isEqualTo(2);
+      b2.delete("foo".getBytes(UTF_8));
+      WriteBatchTestInternalHelper.append(b1, b2);
+      assertThat(("Put(a, va)@200" +
+          "Put(b, vb)@202" +
+          "Put(b, vb)@201" +
+          "Delete(foo)@203")
+          .equals(new String(getContents(b1), UTF_8)));
+      assertThat(b1.count()).isEqualTo(4);
+    }
+  }
+
+  @Test
+  public void blobOperation()
+      throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8));
+      batch.put("k2".getBytes(UTF_8), "v2".getBytes(UTF_8));
+      batch.put("k3".getBytes(UTF_8), "v3".getBytes(UTF_8));
+      batch.putLogData("blob1".getBytes(UTF_8));
+      batch.delete("k2".getBytes(UTF_8));
+      batch.putLogData("blob2".getBytes(UTF_8));
+      batch.merge("foo".getBytes(UTF_8), "bar".getBytes(UTF_8));
+      assertThat(batch.count()).isEqualTo(5);
+      assertThat(("Merge(foo, bar)@4" +
+          "Put(k1, v1)@0" +
+          "Delete(k2)@3" +
+          "Put(k2, v2)@1" +
+          "Put(k3, v3)@2")
+          .equals(new String(getContents(batch), UTF_8)));
+    }
+  }
+
+  @Test
+  public void savePoints()
+      throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8));
+      batch.put("k2".getBytes(UTF_8), "v2".getBytes(UTF_8));
+      batch.put("k3".getBytes(UTF_8), "v3".getBytes(UTF_8));
+
+      assertThat(getFromWriteBatch(batch, "k1")).isEqualTo("v1");
+      assertThat(getFromWriteBatch(batch, "k2")).isEqualTo("v2");
+      assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3");
+
+      batch.setSavePoint();
+
+      batch.delete("k2".getBytes(UTF_8));
+      batch.put("k3".getBytes(UTF_8), "v3-2".getBytes(UTF_8));
+
+      assertThat(getFromWriteBatch(batch, "k2")).isNull();
+      assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-2");
+
+
+      batch.setSavePoint();
+
+      batch.put("k3".getBytes(UTF_8), "v3-3".getBytes(UTF_8));
+      batch.put("k4".getBytes(UTF_8), "v4".getBytes(UTF_8));
+
+      assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-3");
+      assertThat(getFromWriteBatch(batch, "k4")).isEqualTo("v4");
+
+
+      batch.rollbackToSavePoint();
+
+      assertThat(getFromWriteBatch(batch, "k2")).isNull();
+      assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-2");
+      assertThat(getFromWriteBatch(batch, "k4")).isNull();
+
+
+      batch.rollbackToSavePoint();
+
+      assertThat(getFromWriteBatch(batch, "k1")).isEqualTo("v1");
+      assertThat(getFromWriteBatch(batch, "k2")).isEqualTo("v2");
+      assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3");
+      assertThat(getFromWriteBatch(batch, "k4")).isNull();
+    }
+  }
+
+  @Test
+  public void deleteRange() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final WriteBatch batch = new WriteBatch();
+         final WriteOptions wOpt = new WriteOptions()) {
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      db.put("key3".getBytes(), "abcdefg".getBytes());
+      db.put("key4".getBytes(), "xyz".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo("12345678".getBytes());
+      assertThat(db.get("key3".getBytes())).isEqualTo("abcdefg".getBytes());
+      assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes());
+
+      batch.deleteRange("key2".getBytes(), "key4".getBytes());
+      db.write(wOpt, batch);
+
+      assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes());
+      assertThat(db.get("key2".getBytes())).isNull();
+      assertThat(db.get("key3".getBytes())).isNull();
+      assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes());
+    }
+  }
+
+  @Test
+  public void restorePoints() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+
+      batch.put("k1".getBytes(), "v1".getBytes());
+      batch.put("k2".getBytes(), "v2".getBytes());
+
+      batch.setSavePoint();
+
+      batch.put("k1".getBytes(), "123456789".getBytes());
+      batch.delete("k2".getBytes());
+
+      batch.rollbackToSavePoint();
+
+      try(final CapturingWriteBatchHandler handler = new CapturingWriteBatchHandler()) {
+        batch.iterate(handler);
+
+        assertThat(handler.getEvents().size()).isEqualTo(2);
+        assertThat(handler.getEvents().get(0)).isEqualTo(new Event(PUT, "k1".getBytes(), "v1".getBytes()));
+        assertThat(handler.getEvents().get(1)).isEqualTo(new Event(PUT, "k2".getBytes(), "v2".getBytes()));
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void restorePoints_withoutSavePoints() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      batch.rollbackToSavePoint();
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void restorePoints_withoutSavePoints_nested() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+
+      batch.setSavePoint();
+      batch.rollbackToSavePoint();
+
+      // without previous corresponding setSavePoint
+      batch.rollbackToSavePoint();
+    }
+  }
+
+  @Test
+  public void popSavePoint() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+
+      batch.put("k1".getBytes(), "v1".getBytes());
+      batch.put("k2".getBytes(), "v2".getBytes());
+
+      batch.setSavePoint();
+
+      batch.put("k1".getBytes(), "123456789".getBytes());
+      batch.delete("k2".getBytes());
+
+      batch.setSavePoint();
+
+      batch.popSavePoint();
+
+      batch.rollbackToSavePoint();
+
+      try(final CapturingWriteBatchHandler handler = new CapturingWriteBatchHandler()) {
+        batch.iterate(handler);
+
+        assertThat(handler.getEvents().size()).isEqualTo(2);
+        assertThat(handler.getEvents().get(0)).isEqualTo(new Event(PUT, "k1".getBytes(), "v1".getBytes()));
+        assertThat(handler.getEvents().get(1)).isEqualTo(new Event(PUT, "k2".getBytes(), "v2".getBytes()));
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void popSavePoint_withoutSavePoints() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      batch.popSavePoint();
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void popSavePoint_withoutSavePoints_nested() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+
+      batch.setSavePoint();
+      batch.popSavePoint();
+
+      // without previous corresponding setSavePoint
+      batch.popSavePoint();
+    }
+  }
+
+  @Test
+  public void maxBytes() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      batch.setMaxBytes(19);
+
+      batch.put("k1".getBytes(), "v1".getBytes());
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void maxBytes_over() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      batch.setMaxBytes(1);
+
+      batch.put("k1".getBytes(), "v1".getBytes());
+    }
+  }
+
+  @Test
+  public void data() throws RocksDBException {
+    try (final WriteBatch batch1 = new WriteBatch()) {
+      batch1.delete("k0".getBytes());
+      batch1.put("k1".getBytes(), "v1".getBytes());
+      batch1.put("k2".getBytes(), "v2".getBytes());
+      batch1.put("k3".getBytes(), "v3".getBytes());
+      batch1.putLogData("log1".getBytes());
+      batch1.merge("k2".getBytes(), "v22".getBytes());
+      batch1.delete("k3".getBytes());
+
+      final byte[] serialized = batch1.data();
+
+      try(final WriteBatch batch2 = new WriteBatch(serialized)) {
+        assertThat(batch2.count()).isEqualTo(batch1.count());
+
+        try(final CapturingWriteBatchHandler handler1 = new CapturingWriteBatchHandler()) {
+          batch1.iterate(handler1);
+
+          try (final CapturingWriteBatchHandler handler2 = new CapturingWriteBatchHandler()) {
+            batch2.iterate(handler2);
+
+            assertThat(handler1.getEvents().equals(handler2.getEvents())).isTrue();
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  public void dataSize() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      batch.put("k1".getBytes(), "v1".getBytes());
+
+      assertThat(batch.getDataSize()).isEqualTo(19);
+    }
+  }
+
+  @Test
+  public void hasPut() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.hasPut()).isFalse();
+
+      batch.put("k1".getBytes(), "v1".getBytes());
+
+      assertThat(batch.hasPut()).isTrue();
+    }
+  }
+
+  @Test
+  public void hasDelete() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.hasDelete()).isFalse();
+
+      batch.delete("k1".getBytes());
+
+      assertThat(batch.hasDelete()).isTrue();
+    }
+  }
+
+  @Test
+  public void hasSingleDelete() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.hasSingleDelete()).isFalse();
+
+      batch.singleDelete("k1".getBytes());
+
+      assertThat(batch.hasSingleDelete()).isTrue();
+    }
+  }
+
+  @Test
+  public void hasDeleteRange() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.hasDeleteRange()).isFalse();
+
+      batch.deleteRange("k1".getBytes(), "k2".getBytes());
+
+      assertThat(batch.hasDeleteRange()).isTrue();
+    }
+  }
+
+  @Test
+  public void hasBeginPrepareRange() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.hasBeginPrepare()).isFalse();
+    }
+  }
+
+  @Test
+  public void hasEndPrepareRange() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.hasEndPrepare()).isFalse();
+    }
+  }
+
+  @Test
+  public void hasCommit() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.hasCommit()).isFalse();
+    }
+  }
+
+  @Test
+  public void hasRollback() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.hasRollback()).isFalse();
+    }
+  }
+
+  @Test
+  public void walTerminationPoint() throws RocksDBException {
+    try (final WriteBatch batch = new WriteBatch()) {
+      WriteBatch.SavePoint walTerminationPoint = batch.getWalTerminationPoint();
+      assertThat(walTerminationPoint.isCleared()).isTrue();
+
+      batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8));
+
+      batch.markWalTerminationPoint();
+
+      walTerminationPoint = batch.getWalTerminationPoint();
+      assertThat(walTerminationPoint.getSize()).isEqualTo(19);
+      assertThat(walTerminationPoint.getCount()).isEqualTo(1);
+      assertThat(walTerminationPoint.getContentFlags()).isEqualTo(2);
+    }
+  }
+
+  @Test
+  public void getWriteBatch() {
+    try (final WriteBatch batch = new WriteBatch()) {
+      assertThat(batch.getWriteBatch()).isEqualTo(batch);
+    }
+  }
+
+  static byte[] getContents(final WriteBatch wb) {
+    return getContents(wb.nativeHandle_);
+  }
+
+  static String getFromWriteBatch(final WriteBatch wb, final String key)
+      throws RocksDBException {
+    final WriteBatchGetter getter =
+        new WriteBatchGetter(key.getBytes(UTF_8));
+    wb.iterate(getter);
+    if(getter.getValue() != null) {
+      return new String(getter.getValue(), UTF_8);
+    } else {
+      return null;
+    }
+  }
+
+  private static native byte[] getContents(final long writeBatchHandle);
+}
+
+/**
+ * Package-private class which provides java api to access
+ * c++ WriteBatchInternal.
+ */
+class WriteBatchTestInternalHelper {
+  static void setSequence(final WriteBatch wb, final long sn) {
+    setSequence(wb.nativeHandle_, sn);
+  }
+
+  static long sequence(final WriteBatch wb) {
+    return sequence(wb.nativeHandle_);
+  }
+
+  static void append(final WriteBatch wb1, final WriteBatch wb2) {
+    append(wb1.nativeHandle_, wb2.nativeHandle_);
+  }
+
+  private static native void setSequence(final long writeBatchHandle,
+      final long sn);
+
+  private static native long sequence(final long writeBatchHandle);
+
+  private static native void append(final long writeBatchHandle1,
+      final long writeBatchHandle2);
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java
new file mode 100644
index 000000000..c5090dbce
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java
@@ -0,0 +1,104 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.*;
+
+@RunWith(Parameterized.class)
+public class WriteBatchThreadedTest {
+
+  @Parameters(name = "WriteBatchThreadedTest(threadCount={0})")
+  public static Iterable<Integer> data() {
+    return Arrays.asList(new Integer[]{1, 10, 50, 100});
+  }
+
+  @Parameter
+  public int threadCount;
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  RocksDB db;
+
+  @Before
+  public void setUp() throws Exception {
+    RocksDB.loadLibrary();
+    final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setIncreaseParallelism(32);
+    db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+    assert (db != null);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    if (db != null) {
+      db.close();
+    }
+  }
+
+  @Test
+  public void threadedWrites() throws InterruptedException, ExecutionException {
+    final List<Callable<Void>> callables = new ArrayList<>();
+    for (int i = 0; i < 100; i++) {
+      final int offset = i * 100;
+      callables.add(new Callable<Void>() {
+        @Override
+        public Void call() throws RocksDBException {
+          try (final WriteBatch wb = new WriteBatch();
+               final WriteOptions w_opt = new WriteOptions()) {
+            for (int i = offset; i < offset + 100; i++) {
+              wb.put(ByteBuffer.allocate(4).putInt(i).array(), "parallel rocks test".getBytes());
+            }
+            db.write(w_opt, wb);
+          }
+          return null;
+        }
+      });
+    }
+
+    //submit the callables
+    final ExecutorService executorService =
+        Executors.newFixedThreadPool(threadCount);
+    try {
+      final ExecutorCompletionService<Void> completionService =
+          new ExecutorCompletionService<>(executorService);
+      final Set<Future<Void>> futures = new HashSet<>();
+      for (final Callable<Void> callable : callables) {
+        futures.add(completionService.submit(callable));
+      }
+
+      while (futures.size() > 0) {
+        final Future<Void> future = completionService.take();
+        futures.remove(future);
+
+        try {
+          future.get();
+        } catch (final ExecutionException e) {
+          for (final Future<Void> f : futures) {
+            f.cancel(true);
+          }
+
+          throw e;
+        }
+      }
+    } finally {
+      executorService.shutdown();
+      executorService.awaitTermination(10, TimeUnit.SECONDS);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java
new file mode 100644
index 000000000..b0a0cdc0e
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java
@@ -0,0 +1,1068 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+package org.rocksdb;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.util.ByteBufferAllocator;
+
+public class WriteBatchWithIndexTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void readYourOwnWrites() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      final byte[] k1 = "key1".getBytes();
+      final byte[] v1 = "value1".getBytes();
+      final byte[] k2 = "key2".getBytes();
+      final byte[] v2 = "value2".getBytes();
+
+      db.put(k1, v1);
+      db.put(k2, v2);
+
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+           final RocksIterator base = db.newIterator();
+           final RocksIterator it = wbwi.newIteratorWithBase(base)) {
+        it.seek(k1);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k1);
+        assertThat(it.value()).isEqualTo(v1);
+
+        it.seek(k2);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k2);
+        assertThat(it.value()).isEqualTo(v2);
+
+        //put data to the write batch and make sure we can read it.
+        final byte[] k3 = "key3".getBytes();
+        final byte[] v3 = "value3".getBytes();
+        wbwi.put(k3, v3);
+        it.seek(k3);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k3);
+        assertThat(it.value()).isEqualTo(v3);
+
+        //update k2 in the write batch and check the value
+        final byte[] v2Other = "otherValue2".getBytes();
+        wbwi.put(k2, v2Other);
+        it.seek(k2);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k2);
+        assertThat(it.value()).isEqualTo(v2Other);
+
+        //delete k1 and make sure we can read back the write
+        wbwi.delete(k1);
+        it.seek(k1);
+        assertThat(it.key()).isNotEqualTo(k1);
+
+        //reinsert k1 and make sure we see the new value
+        final byte[] v1Other = "otherValue1".getBytes();
+        wbwi.put(k1, v1Other);
+        it.seek(k1);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k1);
+        assertThat(it.value()).isEqualTo(v1Other);
+
+        //single remove k3 and make sure we can read back the write
+        wbwi.singleDelete(k3);
+        it.seek(k3);
+        assertThat(it.isValid()).isEqualTo(false);
+
+        //reinsert k3 and make sure we see the new value
+        final byte[] v3Other = "otherValue3".getBytes();
+        wbwi.put(k3, v3Other);
+        it.seek(k3);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k3);
+        assertThat(it.value()).isEqualTo(v3Other);
+      }
+    }
+  }
+
+  @Test
+  public void readYourOwnWritesCf() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfNames =
+        Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+
+    // Test open database with column family names
+    try (final DBOptions options =
+             new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(
+             options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) {
+      final ColumnFamilyHandle newCf = columnFamilyHandleList.get(1);
+
+      try {
+        final byte[] k1 = "key1".getBytes();
+        final byte[] v1 = "value1".getBytes();
+        final byte[] k2 = "key2".getBytes();
+        final byte[] v2 = "value2".getBytes();
+
+        db.put(newCf, k1, v1);
+        db.put(newCf, k2, v2);
+
+        try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+             final ReadOptions readOptions = new ReadOptions();
+             final RocksIterator base = db.newIterator(newCf, readOptions);
+             final RocksIterator it = wbwi.newIteratorWithBase(newCf, base, readOptions)) {
+          it.seek(k1);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(k1);
+          assertThat(it.value()).isEqualTo(v1);
+
+          it.seek(k2);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(k2);
+          assertThat(it.value()).isEqualTo(v2);
+
+          // put data to the write batch and make sure we can read it.
+          final byte[] k3 = "key3".getBytes();
+          final byte[] v3 = "value3".getBytes();
+          wbwi.put(newCf, k3, v3);
+          it.seek(k3);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(k3);
+          assertThat(it.value()).isEqualTo(v3);
+
+          // update k2 in the write batch and check the value
+          final byte[] v2Other = "otherValue2".getBytes();
+          wbwi.put(newCf, k2, v2Other);
+          it.seek(k2);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(k2);
+          assertThat(it.value()).isEqualTo(v2Other);
+
+          // delete k1 and make sure we can read back the write
+          wbwi.delete(newCf, k1);
+          it.seek(k1);
+          assertThat(it.key()).isNotEqualTo(k1);
+
+          // reinsert k1 and make sure we see the new value
+          final byte[] v1Other = "otherValue1".getBytes();
+          wbwi.put(newCf, k1, v1Other);
+          it.seek(k1);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(k1);
+          assertThat(it.value()).isEqualTo(v1Other);
+
+          // single remove k3 and make sure we can read back the write
+          wbwi.singleDelete(newCf, k3);
+          it.seek(k3);
+          assertThat(it.isValid()).isEqualTo(false);
+
+          // reinsert k3 and make sure we see the new value
+          final byte[] v3Other = "otherValue3".getBytes();
+          wbwi.put(newCf, k3, v3Other);
+          it.seek(k3);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(k3);
+          assertThat(it.value()).isEqualTo(v3Other);
+        }
+      } finally {
+        for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+          columnFamilyHandle.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void readYourOwnWritesCfIterDirectBB() throws RocksDBException {
+    readYourOwnWritesCfIterDirect(ByteBufferAllocator.DIRECT);
+  }
+
+  @Test
+  public void readYourOwnWritesCfIterIndirectBB() throws RocksDBException {
+    readYourOwnWritesCfIterDirect(ByteBufferAllocator.HEAP);
+  }
+
+  public void readYourOwnWritesCfIterDirect(final ByteBufferAllocator byteBufferAllocator)
+      throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfNames =
+        Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+
+    // Test open database with column family names
+    try (final DBOptions options =
+             new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(
+             options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) {
+      final ColumnFamilyHandle newCf = columnFamilyHandleList.get(1);
+
+      try {
+        final byte[] kv1 = "key1".getBytes();
+        final byte[] vv1 = "value1".getBytes();
+        final ByteBuffer k1 = byteBufferAllocator.allocate(12);
+        k1.put(kv1);
+        final byte[] kv2 = "key2".getBytes();
+        final byte[] vv2 = "value2".getBytes();
+        final ByteBuffer k2 = byteBufferAllocator.allocate(12);
+        k2.put(kv2);
+
+        db.put(newCf, kv1, vv1);
+        db.put(newCf, kv2, vv2);
+
+        try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+             final ReadOptions readOptions = new ReadOptions();
+             final RocksIterator base = db.newIterator(newCf, readOptions);
+             final RocksIterator it = wbwi.newIteratorWithBase(newCf, base, readOptions)) {
+          k1.flip();
+          it.seek(k1);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv1);
+          assertThat(it.value()).isEqualTo(vv1);
+
+          k2.flip();
+          it.seek(k2);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv2);
+          assertThat(it.value()).isEqualTo(vv2);
+
+          final byte[] kv1point5 = "key1point5".getBytes();
+          final ByteBuffer k1point5 = byteBufferAllocator.allocate(12);
+          k1point5.put(kv1point5);
+
+          k1point5.flip();
+          it.seek(k1point5);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv2);
+          assertThat(it.value()).isEqualTo(vv2);
+
+          k1point5.flip();
+          it.seekForPrev(k1point5);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv1);
+          assertThat(it.value()).isEqualTo(vv1);
+
+          // put data to the write batch and make sure we can read it.
+          final byte[] kv3 = "key3".getBytes();
+          final ByteBuffer k3 = byteBufferAllocator.allocate(12);
+          k3.put(kv3);
+          final byte[] vv3 = "value3".getBytes();
+          wbwi.put(newCf, kv3, vv3);
+          k3.flip();
+          it.seek(k3);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv3);
+          assertThat(it.value()).isEqualTo(vv3);
+
+          // update k2 in the write batch and check the value
+          final byte[] v2Other = "otherValue2".getBytes();
+          wbwi.put(newCf, kv2, v2Other);
+          k2.flip();
+          it.seek(k2);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv2);
+          assertThat(it.value()).isEqualTo(v2Other);
+
+          // delete k1 and make sure we can read back the write
+          wbwi.delete(newCf, kv1);
+          k1.flip();
+          it.seek(k1);
+          assertThat(it.key()).isNotEqualTo(kv1);
+
+          // reinsert k1 and make sure we see the new value
+          final byte[] v1Other = "otherValue1".getBytes();
+          wbwi.put(newCf, kv1, v1Other);
+          k1.flip();
+          it.seek(k1);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv1);
+          assertThat(it.value()).isEqualTo(v1Other);
+
+          // single remove k3 and make sure we can read back the write
+          wbwi.singleDelete(newCf, kv3);
+          k3.flip();
+          it.seek(k3);
+          assertThat(it.isValid()).isEqualTo(false);
+
+          // reinsert k3 and make sure we see the new value
+          final byte[] v3Other = "otherValue3".getBytes();
+          wbwi.put(newCf, kv3, v3Other);
+          k3.flip();
+          it.seek(k3);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv3);
+          assertThat(it.value()).isEqualTo(v3Other);
+        }
+      } finally {
+        for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+          columnFamilyHandle.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void readYourOwnWritesCfIterIndirect() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> cfNames =
+        Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+
+    // Test open database with column family names
+    try (final DBOptions options =
+             new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(
+             options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) {
+      final ColumnFamilyHandle newCf = columnFamilyHandleList.get(1);
+
+      try {
+        final byte[] kv1 = "key1".getBytes();
+        final byte[] vv1 = "value1".getBytes();
+        final ByteBuffer k1 = ByteBuffer.allocate(12);
+        k1.put(kv1).flip();
+        final byte[] kv2 = "key2".getBytes();
+        final byte[] vv2 = "value2".getBytes();
+        final ByteBuffer k2 = ByteBuffer.allocate(12);
+        k2.put(kv2).flip();
+
+        db.put(newCf, kv1, vv1);
+        db.put(newCf, kv2, vv2);
+
+        try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+             final ReadOptions readOptions = new ReadOptions();
+             final RocksIterator base = db.newIterator(newCf, readOptions);
+             final RocksIterator it = wbwi.newIteratorWithBase(newCf, base, readOptions)) {
+          it.seek(k1);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv1);
+          assertThat(it.value()).isEqualTo(vv1);
+
+          it.seek(k2);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv2);
+          assertThat(it.value()).isEqualTo(vv2);
+
+          // put data to the write batch and make sure we can read it.
+          final byte[] kv3 = "key3".getBytes();
+          final ByteBuffer k3 = ByteBuffer.allocate(12);
+          k3.put(kv3);
+          final byte[] vv3 = "value3".getBytes();
+          wbwi.put(newCf, kv3, vv3);
+          k3.flip();
+          it.seek(k3);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv3);
+          assertThat(it.value()).isEqualTo(vv3);
+
+          // update k2 in the write batch and check the value
+          final byte[] v2Other = "otherValue2".getBytes();
+          wbwi.put(newCf, kv2, v2Other);
+          k2.flip();
+          it.seek(k2);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv2);
+          assertThat(it.value()).isEqualTo(v2Other);
+
+          // delete k1 and make sure we can read back the write
+          wbwi.delete(newCf, kv1);
+          k1.flip();
+          it.seek(k1);
+          assertThat(it.key()).isNotEqualTo(kv1);
+
+          // reinsert k1 and make sure we see the new value
+          final byte[] v1Other = "otherValue1".getBytes();
+          wbwi.put(newCf, kv1, v1Other);
+          k1.flip();
+          it.seek(k1);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv1);
+          assertThat(it.value()).isEqualTo(v1Other);
+
+          // single remove k3 and make sure we can read back the write
+          wbwi.singleDelete(newCf, kv3);
+          k3.flip();
+          it.seek(k3);
+          assertThat(it.isValid()).isEqualTo(false);
+
+          // reinsert k3 and make sure we see the new value
+          final byte[] v3Other = "otherValue3".getBytes();
+          wbwi.put(newCf, kv3, v3Other);
+          k3.flip();
+          it.seek(k3);
+          assertThat(it.isValid()).isTrue();
+          assertThat(it.key()).isEqualTo(kv3);
+          assertThat(it.value()).isEqualTo(v3Other);
+        }
+      } finally {
+        for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+          columnFamilyHandle.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  public void writeBatchWithIndex() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      final byte[] k1 = "key1".getBytes();
+      final byte[] v1 = "value1".getBytes();
+      final byte[] k2 = "key2".getBytes();
+      final byte[] v2 = "value2".getBytes();
+
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex();
+           final WriteOptions wOpt = new WriteOptions()) {
+        wbwi.put(k1, v1);
+        wbwi.put(k2, v2);
+
+        db.write(wOpt, wbwi);
+      }
+
+      assertThat(db.get(k1)).isEqualTo(v1);
+      assertThat(db.get(k2)).isEqualTo(v2);
+    }
+  }
+
+  @Test
+  public void write_writeBatchWithIndexDirect() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      final ByteBuffer k1 = ByteBuffer.allocateDirect(16);
+      final ByteBuffer v1 = ByteBuffer.allocateDirect(16);
+      final ByteBuffer k2 = ByteBuffer.allocateDirect(16);
+      final ByteBuffer v2 = ByteBuffer.allocateDirect(16);
+      k1.put("key1".getBytes()).flip();
+      v1.put("value1".getBytes()).flip();
+      k2.put("key2".getBytes()).flip();
+      v2.put("value2".getBytes()).flip();
+
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+        wbwi.put(k1, v1);
+        assertThat(k1.position()).isEqualTo(4);
+        assertThat(k1.limit()).isEqualTo(4);
+        assertThat(v1.position()).isEqualTo(6);
+        assertThat(v1.limit()).isEqualTo(6);
+
+        wbwi.put(k2, v2);
+
+        db.write(new WriteOptions(), wbwi);
+      }
+
+      assertThat(db.get("key1".getBytes())).isEqualTo("value1".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo("value2".getBytes());
+    }
+  }
+
+  @Test
+  public void iterator() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true)) {
+
+      final String k1 = "key1";
+      final String v1 = "value1";
+      final String k2 = "key2";
+      final String v2 = "value2";
+      final String k3 = "key3";
+      final String v3 = "value3";
+      final String k4 = "key4";
+      final String k5 = "key5";
+      final String v8 = "value8";
+      final byte[] k1b = k1.getBytes(UTF_8);
+      final byte[] v1b = v1.getBytes(UTF_8);
+      final byte[] k2b = k2.getBytes(UTF_8);
+      final byte[] v2b = v2.getBytes(UTF_8);
+      final byte[] k3b = k3.getBytes(UTF_8);
+      final byte[] v3b = v3.getBytes(UTF_8);
+      final byte[] k4b = k4.getBytes(UTF_8);
+      final byte[] k5b = k5.getBytes(UTF_8);
+      final byte[] v8b = v8.getBytes(UTF_8);
+
+      final String k1point5 = "key1point5";
+      final String k2point5 = "key2point5";
+
+      // add put records
+      wbwi.put(k1b, v1b);
+      wbwi.put(k2b, v2b);
+      wbwi.put(k3b, v3b);
+
+      // add a deletion record
+      wbwi.delete(k4b);
+
+      // add a single deletion record
+      wbwi.singleDelete(k5b);
+
+      // add a log record
+      wbwi.putLogData(v8b);
+
+      final WBWIRocksIterator.WriteEntry[] expected = {
+          new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT,
+              new DirectSlice(k1), new DirectSlice(v1)),
+          new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT,
+              new DirectSlice(k2), new DirectSlice(v2)),
+          new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT,
+              new DirectSlice(k3), new DirectSlice(v3)),
+          new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE,
+              new DirectSlice(k4), DirectSlice.NONE),
+          new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.SINGLE_DELETE,
+              new DirectSlice(k5), DirectSlice.NONE),
+      };
+
+      try (final WBWIRocksIterator it = wbwi.newIterator()) {
+        //direct access - seek to key offsets
+        final int[] testOffsets = {2, 0, 3, 4, 1};
+        for (final int testOffset : testOffsets) {
+          final byte[] key = toArray(expected[testOffset].getKey().data());
+
+          it.seek(key);
+          assertThat(it.isValid()).isTrue();
+
+          final WBWIRocksIterator.WriteEntry entry = it.entry();
+          assertThat(entry).isEqualTo(expected[testOffset]);
+        }
+
+        for (final int testOffset : testOffsets) {
+          final byte[] key = toArray(expected[testOffset].getKey().data());
+
+          // Direct buffer seek
+          final ByteBuffer db = expected[testOffset].getKey().data();
+          it.seek(db);
+          assertThat(db.position()).isEqualTo(key.length);
+          assertThat(it.isValid()).isTrue();
+
+          final WBWIRocksIterator.WriteEntry entry = it.entry();
+          assertThat(entry).isEqualTo(expected[testOffset]);
+        }
+
+        for (final int testOffset : testOffsets) {
+          final byte[] key = toArray(expected[testOffset].getKey().data());
+
+          // Direct buffer seek
+          final ByteBuffer db = expected[testOffset].getKey().data();
+          it.seekForPrev(db);
+          assertThat(db.position()).isEqualTo(key.length);
+          assertThat(it.isValid()).isTrue();
+
+          final WBWIRocksIterator.WriteEntry entry = it.entry();
+          assertThat(entry).isEqualTo(expected[testOffset]);
+        }
+
+        for (final int testOffset : testOffsets) {
+          final byte[] key = toArray(expected[testOffset].getKey().data());
+
+          // Indirect buffer seek
+          final ByteBuffer db = ByteBuffer.allocate(key.length);
+          System.arraycopy(key, 0, db.array(), 0, key.length);
+          it.seek(db);
+          assertThat(db.position()).isEqualTo(key.length);
+          assertThat(it.isValid()).isTrue();
+
+          final WBWIRocksIterator.WriteEntry entry = it.entry();
+          assertThat(entry).isEqualTo(expected[testOffset]);
+        }
+
+        for (final int testOffset : testOffsets) {
+          final byte[] key = toArray(expected[testOffset].getKey().data());
+
+          // Indirect buffer seek for prev
+          final ByteBuffer db = ByteBuffer.allocate(key.length);
+          System.arraycopy(key, 0, db.array(), 0, key.length);
+          it.seekForPrev(db);
+          assertThat(db.position()).isEqualTo(key.length);
+          assertThat(it.isValid()).isTrue();
+
+          final WBWIRocksIterator.WriteEntry entry = it.entry();
+          assertThat(entry).isEqualTo(expected[testOffset]);
+        }
+
+        {
+          it.seekForPrev(k2point5.getBytes());
+          assertThat(it.isValid()).isTrue();
+          final WBWIRocksIterator.WriteEntry entry = it.entry();
+          assertThat(entry).isEqualTo(expected[1]);
+        }
+
+        {
+          it.seekForPrev(k1point5.getBytes());
+          assertThat(it.isValid()).isTrue();
+          final WBWIRocksIterator.WriteEntry entry = it.entry();
+          assertThat(entry).isEqualTo(expected[0]);
+        }
+
+        {
+          final ByteBuffer db = ByteBuffer.allocate(k2point5.length());
+          db.put(k2point5.getBytes());
+          db.flip();
+          it.seekForPrev(db);
+          assertThat(it.isValid()).isTrue();
+          final WBWIRocksIterator.WriteEntry entry = it.entry();
+          assertThat(entry).isEqualTo(expected[1]);
+        }
+
+        {
+          final ByteBuffer db = ByteBuffer.allocate(k1point5.length());
+          db.put(k1point5.getBytes());
+          db.flip();
+          it.seekForPrev(db);
+          assertThat(it.isValid()).isTrue();
+          final WBWIRocksIterator.WriteEntry entry = it.entry();
+          assertThat(entry).isEqualTo(expected[0]);
+        }
+
+        //forward iterative access
+        int i = 0;
+        for (it.seekToFirst(); it.isValid(); it.next()) {
+          assertThat(it.entry()).isEqualTo(expected[i++]);
+        }
+
+        //reverse iterative access
+        i = expected.length - 1;
+        for (it.seekToLast(); it.isValid(); it.prev()) {
+          assertThat(it.entry()).isEqualTo(expected[i--]);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void zeroByteTests() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true)) {
+      final byte[] zeroByteValue = new byte[]{0, 0};
+      //add zero byte value
+      wbwi.put(zeroByteValue, zeroByteValue);
+
+      final ByteBuffer buffer = ByteBuffer.allocateDirect(zeroByteValue.length);
+      buffer.put(zeroByteValue);
+
+      final WBWIRocksIterator.WriteEntry expected =
+          new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT,
+              new DirectSlice(buffer, zeroByteValue.length),
+              new DirectSlice(buffer, zeroByteValue.length));
+
+      try (final WBWIRocksIterator it = wbwi.newIterator()) {
+        it.seekToFirst();
+        final WBWIRocksIterator.WriteEntry actual = it.entry();
+        assertThat(actual.equals(expected)).isTrue();
+        assertThat(it.entry().hashCode() == expected.hashCode()).isTrue();
+      }
+    }
+  }
+
+  @Test
+  public void savePoints() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+           final ReadOptions readOptions = new ReadOptions()) {
+        wbwi.put("k1".getBytes(), "v1".getBytes());
+        wbwi.put("k2".getBytes(), "v2".getBytes());
+        wbwi.put("k3".getBytes(), "v3".getBytes());
+
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k1"))
+            .isEqualTo("v1");
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2"))
+            .isEqualTo("v2");
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3"))
+            .isEqualTo("v3");
+
+
+        wbwi.setSavePoint();
+
+        wbwi.delete("k2".getBytes());
+        wbwi.put("k3".getBytes(), "v3-2".getBytes());
+
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2"))
+            .isNull();
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3"))
+            .isEqualTo("v3-2");
+
+
+        wbwi.setSavePoint();
+
+        wbwi.put("k3".getBytes(), "v3-3".getBytes());
+        wbwi.put("k4".getBytes(), "v4".getBytes());
+
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3"))
+            .isEqualTo("v3-3");
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k4"))
+            .isEqualTo("v4");
+
+
+        wbwi.rollbackToSavePoint();
+
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2"))
+            .isNull();
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3"))
+            .isEqualTo("v3-2");
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k4"))
+            .isNull();
+
+
+        wbwi.rollbackToSavePoint();
+
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k1"))
+            .isEqualTo("v1");
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2"))
+            .isEqualTo("v2");
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k3"))
+            .isEqualTo("v3");
+        assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k4"))
+            .isNull();
+      }
+    }
+  }
+
+  @Test
+  public void restorePoints() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+
+      wbwi.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8));
+      wbwi.put("k2".getBytes(UTF_8), "v2".getBytes(UTF_8));
+
+      wbwi.setSavePoint();
+
+      wbwi.put("k1".getBytes(UTF_8), "123456789".getBytes(UTF_8));
+      wbwi.delete("k2".getBytes(UTF_8));
+
+      wbwi.rollbackToSavePoint();
+
+      try(final DBOptions options = new DBOptions()) {
+        assertThat(wbwi.getFromBatch(options,"k1".getBytes(UTF_8))).isEqualTo("v1".getBytes());
+        assertThat(wbwi.getFromBatch(options,"k2".getBytes(UTF_8))).isEqualTo("v2".getBytes());
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void restorePoints_withoutSavePoints() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+      wbwi.rollbackToSavePoint();
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void restorePoints_withoutSavePoints_nested() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+
+      wbwi.setSavePoint();
+      wbwi.rollbackToSavePoint();
+
+      // without previous corresponding setSavePoint
+      wbwi.rollbackToSavePoint();
+    }
+  }
+
+  @Test
+  public void popSavePoint() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+
+      wbwi.put("k1".getBytes(), "v1".getBytes());
+      wbwi.put("k2".getBytes(), "v2".getBytes());
+
+      wbwi.setSavePoint();
+
+      wbwi.put("k1".getBytes(), "123456789".getBytes());
+      wbwi.delete("k2".getBytes());
+
+      wbwi.setSavePoint();
+
+      wbwi.popSavePoint();
+
+      wbwi.rollbackToSavePoint();
+
+      try(final DBOptions options = new DBOptions()) {
+        assertThat(wbwi.getFromBatch(options,"k1".getBytes(UTF_8))).isEqualTo("v1".getBytes());
+        assertThat(wbwi.getFromBatch(options,"k2".getBytes(UTF_8))).isEqualTo("v2".getBytes());
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void popSavePoint_withoutSavePoints() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+      wbwi.popSavePoint();
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void popSavePoint_withoutSavePoints_nested() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+
+      wbwi.setSavePoint();
+      wbwi.popSavePoint();
+
+      // without previous corresponding setSavePoint
+      wbwi.popSavePoint();
+    }
+  }
+
+  @Test
+  public void maxBytes() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+      wbwi.setMaxBytes(19);
+
+      wbwi.put("k1".getBytes(), "v1".getBytes());
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void maxBytes_over() throws RocksDBException {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+      wbwi.setMaxBytes(1);
+
+      wbwi.put("k1".getBytes(), "v1".getBytes());
+    }
+  }
+
+  @Test
+  public void getWriteBatch() {
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) {
+
+      final WriteBatch wb = wbwi.getWriteBatch();
+      assertThat(wb).isNotNull();
+      assertThat(wb.isOwningHandle()).isFalse();
+    }
+  }
+
+  private static String getFromWriteBatchWithIndex(final RocksDB db,
+      final ReadOptions readOptions, final WriteBatchWithIndex wbwi,
+      final String skey) {
+    final byte[] key = skey.getBytes();
+    try (final RocksIterator baseIterator = db.newIterator(readOptions);
+         final RocksIterator iterator = wbwi.newIteratorWithBase(baseIterator)) {
+      iterator.seek(key);
+
+      // Arrays.equals(key, iterator.key()) ensures an exact match in Rocks,
+      // instead of a nearest match
+      return iterator.isValid() &&
+          Arrays.equals(key, iterator.key()) ?
+          new String(iterator.value()) : null;
+    }
+  }
+
+  @Test
+  public void getFromBatch() throws RocksDBException {
+    final byte[] k1 = "k1".getBytes();
+    final byte[] k2 = "k2".getBytes();
+    final byte[] k3 = "k3".getBytes();
+    final byte[] k4 = "k4".getBytes();
+
+    final byte[] v1 = "v1".getBytes();
+    final byte[] v2 = "v2".getBytes();
+    final byte[] v3 = "v3".getBytes();
+
+    try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+         final DBOptions dbOptions = new DBOptions()) {
+      wbwi.put(k1, v1);
+      wbwi.put(k2, v2);
+      wbwi.put(k3, v3);
+
+      assertThat(wbwi.getFromBatch(dbOptions, k1)).isEqualTo(v1);
+      assertThat(wbwi.getFromBatch(dbOptions, k2)).isEqualTo(v2);
+      assertThat(wbwi.getFromBatch(dbOptions, k3)).isEqualTo(v3);
+      assertThat(wbwi.getFromBatch(dbOptions, k4)).isNull();
+
+      wbwi.delete(k2);
+
+      assertThat(wbwi.getFromBatch(dbOptions, k2)).isNull();
+    }
+  }
+
+  @Test
+  public void getFromBatchAndDB() throws RocksDBException {
+    final byte[] k1 = "k1".getBytes();
+    final byte[] k2 = "k2".getBytes();
+    final byte[] k3 = "k3".getBytes();
+    final byte[] k4 = "k4".getBytes();
+
+    final byte[] v1 = "v1".getBytes();
+    final byte[] v2 = "v2".getBytes();
+    final byte[] v3 = "v3".getBytes();
+    final byte[] v4 = "v4".getBytes();
+
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options,
+             dbFolder.getRoot().getAbsolutePath())) {
+
+      db.put(k1, v1);
+      db.put(k2, v2);
+      db.put(k4, v4);
+
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+           final DBOptions dbOptions = new DBOptions();
+           final ReadOptions readOptions = new ReadOptions()) {
+
+        assertThat(wbwi.getFromBatch(dbOptions, k1)).isNull();
+        assertThat(wbwi.getFromBatch(dbOptions, k2)).isNull();
+        assertThat(wbwi.getFromBatch(dbOptions, k4)).isNull();
+
+        wbwi.put(k3, v3);
+
+        assertThat(wbwi.getFromBatch(dbOptions, k3)).isEqualTo(v3);
+
+        assertThat(wbwi.getFromBatchAndDB(db, readOptions, k1)).isEqualTo(v1);
+        assertThat(wbwi.getFromBatchAndDB(db, readOptions, k2)).isEqualTo(v2);
+        assertThat(wbwi.getFromBatchAndDB(db, readOptions, k3)).isEqualTo(v3);
+        assertThat(wbwi.getFromBatchAndDB(db, readOptions, k4)).isEqualTo(v4);
+
+        wbwi.delete(k4);
+
+        assertThat(wbwi.getFromBatchAndDB(db, readOptions, k4)).isNull();
+      }
+    }
+  }
+  private byte[] toArray(final ByteBuffer buf) {
+    final byte[] ary = new byte[buf.remaining()];
+    buf.get(ary);
+    return ary;
+  }
+
+  @Test
+  public void deleteRange() throws RocksDBException {
+    try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+         final WriteBatch batch = new WriteBatch();
+         final WriteOptions wOpt = new WriteOptions()) {
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      db.put("key3".getBytes(), "abcdefg".getBytes());
+      db.put("key4".getBytes(), "xyz".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo("12345678".getBytes());
+      assertThat(db.get("key3".getBytes())).isEqualTo("abcdefg".getBytes());
+      assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes());
+
+      batch.deleteRange("key2".getBytes(), "key4".getBytes());
+      db.write(wOpt, batch);
+
+      assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes());
+      assertThat(db.get("key2".getBytes())).isNull();
+      assertThat(db.get("key3".getBytes())).isNull();
+      assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes());
+    }
+  }
+
+  @Test
+  public void iteratorWithBaseOverwriteTrue() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+           final RocksIterator baseIter = db.newIterator();
+           final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter)) {
+        assertThat(wbwiIter).isNotNull();
+        assertThat(wbwiIter.nativeHandle_).isGreaterThan(0);
+        wbwiIter.status();
+      }
+
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+           final RocksIterator baseIter = db.newIterator();
+           final ReadOptions readOptions = new ReadOptions();
+           final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter, readOptions)) {
+        assertThat(wbwiIter).isNotNull();
+        assertThat(wbwiIter.nativeHandle_).isGreaterThan(0);
+        wbwiIter.status();
+      }
+    }
+
+    final List<ColumnFamilyDescriptor> cfNames =
+        Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options =
+             new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(
+             options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) {
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+           final RocksIterator baseIter = db.newIterator();
+           final RocksIterator wbwiIter =
+               wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter)) {
+        assertThat(wbwiIter).isNotNull();
+        assertThat(wbwiIter.nativeHandle_).isGreaterThan(0);
+        wbwiIter.status();
+      }
+
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+           final RocksIterator baseIter = db.newIterator();
+           final ReadOptions readOptions = new ReadOptions();
+           final RocksIterator wbwiIter =
+               wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter, readOptions)) {
+        assertThat(wbwiIter).isNotNull();
+        assertThat(wbwiIter.nativeHandle_).isGreaterThan(0);
+        wbwiIter.status();
+      }
+    }
+  }
+
+  @Test
+  public void iteratorWithBaseOverwriteFalse() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false);
+           final RocksIterator baseIter = db.newIterator();
+           final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter)) {
+        assertThat(wbwiIter).isNotNull();
+        assertThat(wbwiIter.nativeHandle_).isGreaterThan(0);
+        wbwiIter.status();
+      }
+
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false);
+           final RocksIterator baseIter = db.newIterator();
+           final ReadOptions readOptions = new ReadOptions();
+           final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter, readOptions)) {
+        assertThat(wbwiIter).isNotNull();
+        assertThat(wbwiIter.nativeHandle_).isGreaterThan(0);
+        wbwiIter.status();
+      }
+    }
+
+    final List<ColumnFamilyDescriptor> cfNames =
+        Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+            new ColumnFamilyDescriptor("new_cf".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>();
+    try (final DBOptions options =
+             new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true);
+         final RocksDB db = RocksDB.open(
+             options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) {
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false);
+           final RocksIterator baseIter = db.newIterator();
+           final RocksIterator wbwiIter =
+               wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter)) {
+        assertThat(wbwiIter).isNotNull();
+        assertThat(wbwiIter.nativeHandle_).isGreaterThan(0);
+        wbwiIter.status();
+      }
+
+      try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false);
+           final RocksIterator baseIter = db.newIterator();
+           final ReadOptions readOptions = new ReadOptions();
+           final RocksIterator wbwiIter =
+               wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter, readOptions)) {
+        assertThat(wbwiIter).isNotNull();
+        assertThat(wbwiIter.nativeHandle_).isGreaterThan(0);
+        wbwiIter.status();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java
new file mode 100644
index 000000000..735677cb7
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.Random;
+import org.junit.ClassRule;
+import org.junit.Test;
+
+public class WriteOptionsTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  public static final Random rand = PlatformRandomHelper.
+          getPlatformSpecificRandomFactory();
+
+  @Test
+  public void writeOptions() {
+    try (final WriteOptions writeOptions = new WriteOptions()) {
+
+      writeOptions.setSync(true);
+      assertThat(writeOptions.sync()).isTrue();
+      writeOptions.setSync(false);
+      assertThat(writeOptions.sync()).isFalse();
+
+      writeOptions.setDisableWAL(true);
+      assertThat(writeOptions.disableWAL()).isTrue();
+      writeOptions.setDisableWAL(false);
+      assertThat(writeOptions.disableWAL()).isFalse();
+
+
+      writeOptions.setIgnoreMissingColumnFamilies(true);
+      assertThat(writeOptions.ignoreMissingColumnFamilies()).isTrue();
+      writeOptions.setIgnoreMissingColumnFamilies(false);
+      assertThat(writeOptions.ignoreMissingColumnFamilies()).isFalse();
+
+      writeOptions.setNoSlowdown(true);
+      assertThat(writeOptions.noSlowdown()).isTrue();
+      writeOptions.setNoSlowdown(false);
+      assertThat(writeOptions.noSlowdown()).isFalse();
+
+      writeOptions.setLowPri(true);
+      assertThat(writeOptions.lowPri()).isTrue();
+      writeOptions.setLowPri(false);
+      assertThat(writeOptions.lowPri()).isFalse();
+
+      writeOptions.setMemtableInsertHintPerBatch(true);
+      assertThat(writeOptions.memtableInsertHintPerBatch()).isTrue();
+      writeOptions.setMemtableInsertHintPerBatch(false);
+      assertThat(writeOptions.memtableInsertHintPerBatch()).isFalse();
+    }
+  }
+
+  @Test
+  public void copyConstructor() {
+    WriteOptions origOpts = new WriteOptions();
+    origOpts.setDisableWAL(rand.nextBoolean());
+    origOpts.setIgnoreMissingColumnFamilies(rand.nextBoolean());
+    origOpts.setSync(rand.nextBoolean());
+    origOpts.setMemtableInsertHintPerBatch(true);
+    WriteOptions copyOpts = new WriteOptions(origOpts);
+    assertThat(origOpts.disableWAL()).isEqualTo(copyOpts.disableWAL());
+    assertThat(origOpts.ignoreMissingColumnFamilies()).isEqualTo(
+            copyOpts.ignoreMissingColumnFamilies());
+    assertThat(origOpts.sync()).isEqualTo(copyOpts.sync());
+    assertThat(origOpts.memtableInsertHintPerBatch())
+        .isEqualTo(copyOpts.memtableInsertHintPerBatch());
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/test/RemoveEmptyValueCompactionFilterFactory.java b/src/rocksdb/java/src/test/java/org/rocksdb/test/RemoveEmptyValueCompactionFilterFactory.java
new file mode 100644
index 000000000..c4e4f25a0
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/test/RemoveEmptyValueCompactionFilterFactory.java
@@ -0,0 +1,21 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb.test;
+
+import org.rocksdb.AbstractCompactionFilter;
+import org.rocksdb.AbstractCompactionFilterFactory;
+import org.rocksdb.RemoveEmptyValueCompactionFilter;
+
+/**
+ * Simple CompactionFilterFactory class used in tests. Generates RemoveEmptyValueCompactionFilters.
+ */
+public class RemoveEmptyValueCompactionFilterFactory extends AbstractCompactionFilterFactory<RemoveEmptyValueCompactionFilter> {
+    @Override
+    public RemoveEmptyValueCompactionFilter createCompactionFilter(final AbstractCompactionFilter.Context context) {
+        return new RemoveEmptyValueCompactionFilter();
+    }
+
+    @Override
+    public String name() {
+        return "RemoveEmptyValueCompactionFilterFactory";
+    }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java b/src/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java
new file mode 100644
index 000000000..42d3148ef
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java
@@ -0,0 +1,174 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb.test;
+
+import org.junit.internal.JUnitSystem;
+import org.junit.internal.RealSystem;
+import org.junit.internal.TextListener;
+import org.junit.runner.Description;
+import org.junit.runner.JUnitCore;
+import org.junit.runner.Result;
+import org.junit.runner.notification.Failure;
+import org.rocksdb.RocksDB;
+
+import java.io.PrintStream;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.rocksdb.test.RocksJunitRunner.RocksJunitListener.Status.*;
+
+/**
+ * Custom Junit Runner to print also Test classes
+ * and executed methods to command prompt.
+ */
+public class RocksJunitRunner {
+
+  /**
+   * Listener which overrides default functionality
+   * to print class and method to system out.
+   */
+  static class RocksJunitListener extends TextListener {
+
+    private final static NumberFormat secsFormat =
+        new DecimalFormat("###,###.###");
+
+    private final PrintStream writer;
+
+    private String currentClassName = null;
+    private String currentMethodName = null;
+    private Status currentStatus = null;
+    private long currentTestsStartTime;
+    private int currentTestsCount = 0;
+    private int currentTestsIgnoredCount = 0;
+    private int currentTestsFailureCount = 0;
+    private int currentTestsErrorCount = 0;
+
+    enum Status {
+      IGNORED,
+      FAILURE,
+      ERROR,
+      OK
+    }
+
+    /**
+     * RocksJunitListener constructor
+     *
+     * @param system JUnitSystem
+     */
+    public RocksJunitListener(final JUnitSystem system) {
+      this(system.out());
+    }
+
+    public RocksJunitListener(final PrintStream writer) {
+      super(writer);
+      this.writer = writer;
+    }
+
+    @Override
+    public void testRunStarted(final Description description) {
+      writer.format("Starting RocksJava Tests...%n");
+
+    }
+
+    @Override
+    public void testStarted(final Description description) {
+      if(currentClassName == null
+          || !currentClassName.equals(description.getClassName())) {
+        if(currentClassName !=  null) {
+          printTestsSummary();
+        } else {
+          currentTestsStartTime = System.currentTimeMillis();
+        }
+        writer.format("%nRunning: %s%n", description.getClassName());
+        currentClassName = description.getClassName();
+      }
+      currentMethodName = description.getMethodName();
+      currentStatus = OK;
+      currentTestsCount++;
+    }
+
+    private void printTestsSummary() {
+      // print summary of last test set
+      writer.format("Tests run: %d, Failures: %d, Errors: %d, Ignored: %d, Time elapsed: %s sec%n",
+          currentTestsCount,
+          currentTestsFailureCount,
+          currentTestsErrorCount,
+          currentTestsIgnoredCount,
+          formatSecs(System.currentTimeMillis() - currentTestsStartTime));
+
+      // reset counters
+      currentTestsCount = 0;
+      currentTestsFailureCount = 0;
+      currentTestsErrorCount = 0;
+      currentTestsIgnoredCount = 0;
+      currentTestsStartTime = System.currentTimeMillis();
+    }
+
+    private static String formatSecs(final double milliseconds) {
+      final double seconds = milliseconds / 1000;
+      return secsFormat.format(seconds);
+    }
+
+    @Override
+    public void testFailure(final Failure failure) {
+      if (failure.getException() != null
+          && failure.getException() instanceof AssertionError) {
+        currentStatus = FAILURE;
+        currentTestsFailureCount++;
+      } else {
+        currentStatus = ERROR;
+        currentTestsErrorCount++;
+      }
+    }
+
+    @Override
+    public void testIgnored(final Description description) {
+      currentStatus = IGNORED;
+      currentTestsIgnoredCount++;
+    }
+
+    @Override
+    public void testFinished(final Description description) {
+      if(currentStatus == OK) {
+        writer.format("\t%s OK%n",currentMethodName);
+      } else {
+        writer.format("  [%s] %s%n", currentStatus.name(), currentMethodName);
+      }
+    }
+
+    @Override
+    public void testRunFinished(final Result result) {
+      printTestsSummary();
+      super.testRunFinished(result);
+    }
+  }
+
+  /**
+   * Main method to execute tests
+   *
+   * @param args Test classes as String names
+   */
+  public static void main(final String[] args){
+    final JUnitCore runner = new JUnitCore();
+    final JUnitSystem system = new RealSystem();
+    runner.addListener(new RocksJunitListener(system));
+    try {
+      final List<Class<?>> classes = new ArrayList<>();
+      for (final String arg : args) {
+        classes.add(Class.forName(arg));
+      }
+      final Class[] clazzes = classes.toArray(new Class[classes.size()]);
+      final Result result = runner.run(clazzes);
+      if(!result.wasSuccessful()) {
+        System.exit(-1);
+      }
+    } catch (final ClassNotFoundException e) {
+      e.printStackTrace();
+      System.exit(-2);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java b/src/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java
new file mode 100644
index 000000000..865ad5cf7
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java
@@ -0,0 +1,23 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb.test;
+
+import org.rocksdb.AbstractEventListener;
+
+public class TestableEventListener extends AbstractEventListener {
+  public TestableEventListener() {
+    super();
+  }
+
+  public TestableEventListener(final EnabledEventCallback... enabledEventCallbacks) {
+    super(enabledEventCallbacks);
+  }
+
+  public void invokeAllCallbacks() {
+    invokeAllCallbacks(nativeHandle_);
+  }
+
+  private static native void invokeAllCallbacks(final long handle);
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/ByteBufferAllocator.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/ByteBufferAllocator.java
new file mode 100644
index 000000000..8d7956cf2
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/ByteBufferAllocator.java
@@ -0,0 +1,16 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import java.nio.ByteBuffer;
+
+public interface ByteBufferAllocator {
+  ByteBuffer allocate(int capacity);
+
+  ByteBufferAllocator DIRECT = new DirectByteBufferAllocator();
+  ByteBufferAllocator HEAP = new HeapByteBufferAllocator();
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorIntTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorIntTest.java
new file mode 100644
index 000000000..fb7239c92
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorIntTest.java
@@ -0,0 +1,267 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+import org.rocksdb.*;
+
+import java.nio.ByteBuffer;
+import java.nio.file.FileSystems;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Similar to {@link IntComparatorTest}, but uses {@link BytewiseComparator}
+ * which ensures the correct ordering of positive integers.
+ */
+@RunWith(Parameterized.class)
+public class BytewiseComparatorIntTest {
+
+  // test with 500 random positive integer keys
+  private static final int TOTAL_KEYS = 500;
+  private static final byte[][] keys = new byte[TOTAL_KEYS][4];
+
+  @BeforeClass
+  public static void prepareKeys() {
+    final ByteBuffer buf = ByteBuffer.allocate(4);
+    final Random random = new Random();
+    for (int i = 0; i < TOTAL_KEYS; i++) {
+      final int ri = random.nextInt() & Integer.MAX_VALUE;  // the & ensures positive integer
+      buf.putInt(ri);
+      buf.flip();
+      final byte[] key = buf.array();
+
+      // does key already exist (avoid duplicates)
+      if (keyExists(key, i)) {
+        i--; // loop round and generate a different key
+      } else {
+        System.arraycopy(key, 0, keys[i], 0, 4);
+      }
+    }
+  }
+
+  private static boolean keyExists(final byte[] key, final int limit) {
+    for (int j = 0; j < limit; j++) {
+      if (Arrays.equals(key, keys[j])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  @Parameters(name = "{0}")
+  public static Iterable<Object[]> parameters() {
+    return Arrays.asList(new Object[][] {
+        { "non-direct_reused64_mutex", false, 64, ReusedSynchronisationType.MUTEX },
+        { "direct_reused64_mutex", true, 64, ReusedSynchronisationType.MUTEX },
+        { "non-direct_reused64_adaptive-mutex", false, 64, ReusedSynchronisationType.ADAPTIVE_MUTEX },
+        { "direct_reused64_adaptive-mutex", true, 64, ReusedSynchronisationType.ADAPTIVE_MUTEX },
+        { "non-direct_reused64_thread-local", false, 64, ReusedSynchronisationType.THREAD_LOCAL },
+        { "direct_reused64_thread-local", true, 64, ReusedSynchronisationType.THREAD_LOCAL },
+        { "non-direct_noreuse", false, -1, null },
+        { "direct_noreuse", true, -1, null }
+    });
+  }
+
+  @Parameter(0)
+  public String name;
+
+  @Parameter(1)
+  public boolean useDirectBuffer;
+
+  @Parameter(2)
+  public int maxReusedBufferSize;
+
+  @Parameter(3)
+  public ReusedSynchronisationType reusedSynchronisationType;
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+
+  @Test
+  public void javaComparatorDefaultCf() throws RocksDBException {
+    try (final ComparatorOptions options = new ComparatorOptions()
+        .setUseDirectBuffer(useDirectBuffer)
+        .setMaxReusedBufferSize(maxReusedBufferSize)
+        // if reusedSynchronisationType == null we assume that maxReusedBufferSize <= 0 and so we just set ADAPTIVE_MUTEX, even though it won't be used
+        .setReusedSynchronisationType(reusedSynchronisationType == null ? ReusedSynchronisationType.ADAPTIVE_MUTEX : reusedSynchronisationType);
+        final BytewiseComparator comparator = new BytewiseComparator(options)) {
+
+      // test the round-tripability of keys written and read with the Comparator
+      testRoundtrip(FileSystems.getDefault().getPath(
+          dbFolder.getRoot().getAbsolutePath()), comparator);
+    }
+  }
+
+  @Test
+  public void javaComparatorNamedCf() throws RocksDBException {
+    try (final ComparatorOptions options = new ComparatorOptions()
+        .setUseDirectBuffer(useDirectBuffer)
+        .setMaxReusedBufferSize(maxReusedBufferSize)
+        // if reusedSynchronisationType == null we assume that maxReusedBufferSize <= 0 and so we just set ADAPTIVE_MUTEX, even though it won't be used
+        .setReusedSynchronisationType(reusedSynchronisationType == null ? ReusedSynchronisationType.ADAPTIVE_MUTEX : reusedSynchronisationType);
+      final BytewiseComparator comparator = new BytewiseComparator(options)) {
+
+      // test the round-tripability of keys written and read with the Comparator
+      testRoundtripCf(FileSystems.getDefault().getPath(
+          dbFolder.getRoot().getAbsolutePath()), comparator);
+    }
+  }
+
+  /**
+   * Test which stores random keys into the database
+   * using an {@link IntComparator}
+   * it then checks that these keys are read back in
+   * ascending order
+   *
+   * @param db_path A path where we can store database
+   *                files temporarily
+   *
+   * @param comparator the comparator
+   *
+   * @throws RocksDBException if a database error happens.
+   */
+  private void testRoundtrip(final Path db_path,
+      final AbstractComparator comparator) throws RocksDBException {
+    try (final Options opt = new Options()
+             .setCreateIfMissing(true)
+             .setComparator(comparator)) {
+
+      // store TOTAL_KEYS into the db
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString())) {
+        for (int i = 0; i < TOTAL_KEYS; i++) {
+              db.put(keys[i], "value".getBytes(UTF_8));
+        }
+      }
+
+      // re-open db and read from start to end
+      // integer keys should be in ascending
+      // order as defined by IntComparator
+      final ByteBuffer key = ByteBuffer.allocate(4);
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString());
+           final RocksIterator it = db.newIterator()) {
+        it.seekToFirst();
+        int lastKey = Integer.MIN_VALUE;
+        int count = 0;
+        for (it.seekToFirst(); it.isValid(); it.next()) {
+          key.put(it.key());
+          key.flip();
+          final int thisKey = key.getInt();
+          key.clear();
+          assertThat(thisKey).isGreaterThan(lastKey);
+          lastKey = thisKey;
+          count++;
+        }
+        assertThat(count).isEqualTo(TOTAL_KEYS);
+      }
+    }
+  }
+
+  /**
+   * Test which stores random keys into a column family
+   * in the database
+   * using an {@link IntComparator}
+   * it then checks that these keys are read back in
+   * ascending order
+   *
+   * @param db_path A path where we can store database
+   *                files temporarily
+   *
+   * @param comparator the comparator
+   *
+   * @throws RocksDBException if a database error happens.
+   */
+  private void testRoundtripCf(final Path db_path,
+      final AbstractComparator comparator) throws RocksDBException {
+
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes(),
+            new ColumnFamilyOptions()
+                .setComparator(comparator))
+    );
+
+    final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+
+    try (final DBOptions opt = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true)) {
+
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString(),
+          cfDescriptors, cfHandles)) {
+        try {
+          assertThat(cfDescriptors.size()).isEqualTo(2);
+          assertThat(cfHandles.size()).isEqualTo(2);
+
+          for (int i = 0; i < TOTAL_KEYS; i++) {
+            db.put(cfHandles.get(1), keys[i], "value".getBytes(UTF_8));
+          }
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+          cfHandles.clear();
+        }
+      }
+
+      // re-open db and read from start to end
+      // integer keys should be in ascending
+      // order as defined by SimpleIntComparator
+      final ByteBuffer key = ByteBuffer.allocate(4);
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString(),
+          cfDescriptors, cfHandles);
+           final RocksIterator it = db.newIterator(cfHandles.get(1))) {
+        try {
+          assertThat(cfDescriptors.size()).isEqualTo(2);
+          assertThat(cfHandles.size()).isEqualTo(2);
+
+          it.seekToFirst();
+          int lastKey = Integer.MIN_VALUE;
+          int count = 0;
+          for (it.seekToFirst(); it.isValid(); it.next()) {
+            key.put(it.key());
+            key.flip();
+            final int thisKey = key.getInt();
+            key.clear();
+            assertThat(thisKey).isGreaterThan(lastKey);
+            lastKey = thisKey;
+            count++;
+          }
+
+          assertThat(count).isEqualTo(TOTAL_KEYS);
+
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+          cfHandles.clear();
+          for (final ColumnFamilyDescriptor cfDescriptor : cfDescriptors) {
+            cfDescriptor.getOptions().close();
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java
new file mode 100644
index 000000000..69f2c282b
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java
@@ -0,0 +1,531 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.*;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.*;
+import java.util.*;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.*;
+import static org.rocksdb.util.ByteUtil.bytes;
+
+/**
+ * This is a direct port of various C++
+ * tests from db/comparator_db_test.cc
+ * and some code to adapt it to RocksJava
+ */
+public class BytewiseComparatorTest {
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private List<String> source_strings = Arrays.asList("b", "d", "f", "h", "j", "l");
+  private List<String> interleaving_strings = Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m");
+
+  /**
+   * Open the database using the C++ BytewiseComparatorImpl
+   * and test the results against our Java BytewiseComparator
+   */
+  @Test
+  public void java_vs_cpp_bytewiseComparator()
+      throws IOException, RocksDBException {
+    for(int rand_seed = 301; rand_seed < 306; rand_seed++) {
+      final Path dbDir =
+          FileSystems.getDefault().getPath(dbFolder.newFolder().getAbsolutePath());
+      try(final RocksDB db = openDatabase(dbDir,
+          BuiltinComparator.BYTEWISE_COMPARATOR)) {
+
+        final Random rnd = new Random(rand_seed);
+        try(final ComparatorOptions copt2 = new ComparatorOptions()
+            .setUseDirectBuffer(false);
+            final AbstractComparator comparator2 = new BytewiseComparator(copt2)) {
+          final java.util.Comparator<String> jComparator = toJavaComparator(comparator2);
+          doRandomIterationTest(
+              db,
+              jComparator,
+              rnd,
+              8, 100, 3
+          );
+        }
+      }
+    }
+  }
+
+  /**
+   * Open the database using the Java BytewiseComparator
+   * and test the results against another Java BytewiseComparator
+   */
+  @Test
+  public void java_vs_java_bytewiseComparator()
+      throws IOException, RocksDBException {
+    for(int rand_seed = 301; rand_seed < 306; rand_seed++) {
+      final Path dbDir =
+          FileSystems.getDefault().getPath(dbFolder.newFolder().getAbsolutePath());
+      try(final ComparatorOptions copt = new ComparatorOptions()
+          .setUseDirectBuffer(false);
+          final AbstractComparator comparator = new BytewiseComparator(copt);
+          final RocksDB db = openDatabase(dbDir, comparator)) {
+
+        final Random rnd = new Random(rand_seed);
+        try(final ComparatorOptions copt2 = new ComparatorOptions()
+            .setUseDirectBuffer(false);
+            final AbstractComparator comparator2 = new BytewiseComparator(copt2)) {
+          final java.util.Comparator<String> jComparator = toJavaComparator(comparator2);
+          doRandomIterationTest(
+              db,
+              jComparator,
+              rnd,
+              8, 100, 3
+          );
+        }
+      }
+    }
+  }
+
+  /**
+   * Open the database using the C++ BytewiseComparatorImpl
+   * and test the results against our Java DirectBytewiseComparator
+   */
+  @Test
+  public void java_vs_cpp_directBytewiseComparator()
+      throws IOException, RocksDBException {
+    for(int rand_seed = 301; rand_seed < 306; rand_seed++) {
+      final Path dbDir =
+          FileSystems.getDefault().getPath(dbFolder.newFolder().getAbsolutePath());
+      try(final RocksDB db = openDatabase(dbDir,
+          BuiltinComparator.BYTEWISE_COMPARATOR)) {
+
+        final Random rnd = new Random(rand_seed);
+        try(final ComparatorOptions copt2 = new ComparatorOptions()
+              .setUseDirectBuffer(true);
+            final AbstractComparator comparator2 = new BytewiseComparator(copt2)) {
+          final java.util.Comparator<String> jComparator = toJavaComparator(comparator2);
+          doRandomIterationTest(
+              db,
+              jComparator,
+              rnd,
+              8, 100, 3
+          );
+        }
+      }
+    }
+  }
+
+  /**
+   * Open the database using the Java DirectBytewiseComparator
+   * and test the results against another Java DirectBytewiseComparator
+   */
+  @Test
+  public void java_vs_java_directBytewiseComparator()
+      throws IOException, RocksDBException {
+    for(int rand_seed = 301; rand_seed < 306; rand_seed++) {
+      final Path dbDir =
+          FileSystems.getDefault().getPath(dbFolder.newFolder().getAbsolutePath());
+      try (final ComparatorOptions copt = new ComparatorOptions()
+           .setUseDirectBuffer(true);
+          final AbstractComparator comparator = new BytewiseComparator(copt);
+          final RocksDB db = openDatabase(dbDir, comparator)) {
+
+        final Random rnd = new Random(rand_seed);
+        try(final ComparatorOptions copt2 = new ComparatorOptions()
+              .setUseDirectBuffer(true);
+            final AbstractComparator comparator2 = new BytewiseComparator(copt2)) {
+          final java.util.Comparator<String> jComparator = toJavaComparator(comparator2);
+          doRandomIterationTest(
+              db,
+              jComparator,
+              rnd,
+              8, 100, 3
+          );
+        }
+      }
+    }
+  }
+
+  /**
+   * Open the database using the C++ ReverseBytewiseComparatorImpl
+   * and test the results against our Java ReverseBytewiseComparator
+   */
+  @Test
+  public void java_vs_cpp_reverseBytewiseComparator()
+      throws IOException, RocksDBException {
+    for(int rand_seed = 301; rand_seed < 306; rand_seed++) {
+      final Path dbDir =
+          FileSystems.getDefault().getPath(dbFolder.newFolder().getAbsolutePath());
+      try(final RocksDB db = openDatabase(dbDir,
+          BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR)) {
+
+        final Random rnd = new Random(rand_seed);
+        try(final ComparatorOptions copt2 = new ComparatorOptions()
+            .setUseDirectBuffer(false);
+            final AbstractComparator comparator2 = new ReverseBytewiseComparator(copt2)) {
+          final java.util.Comparator<String> jComparator = toJavaComparator(comparator2);
+          doRandomIterationTest(
+              db,
+              jComparator,
+              rnd,
+              8, 100, 3
+          );
+        }
+      }
+    }
+  }
+
+  /**
+   * Open the database using the Java ReverseBytewiseComparator
+   * and test the results against another Java ReverseBytewiseComparator
+   */
+  @Test
+  public void java_vs_java_reverseBytewiseComparator()
+      throws IOException, RocksDBException {
+    for(int rand_seed = 301; rand_seed < 306; rand_seed++) {
+      final Path dbDir =
+          FileSystems.getDefault().getPath(dbFolder.newFolder().getAbsolutePath());
+      try (final ComparatorOptions copt = new ComparatorOptions()
+           .setUseDirectBuffer(false);
+           final AbstractComparator comparator = new ReverseBytewiseComparator(copt);
+           final RocksDB db = openDatabase(dbDir, comparator)) {
+
+        final Random rnd = new Random(rand_seed);
+        try(final ComparatorOptions copt2 = new ComparatorOptions()
+            .setUseDirectBuffer(false);
+            final AbstractComparator comparator2 = new ReverseBytewiseComparator(copt2)) {
+          final java.util.Comparator<String> jComparator = toJavaComparator(comparator2);
+          doRandomIterationTest(
+              db,
+              jComparator,
+              rnd,
+              8, 100, 3
+          );
+        }
+      }
+    }
+  }
+
+  private void doRandomIterationTest(
+      final RocksDB db, final java.util.Comparator<String> javaComparator,
+      final Random rnd,
+      final int num_writes, final int num_iter_ops,
+      final int num_trigger_flush) throws RocksDBException {
+
+    final TreeMap<String, String> map = new TreeMap<>(javaComparator);
+
+    try (final FlushOptions flushOptions = new FlushOptions();
+         final WriteOptions writeOptions = new WriteOptions()) {
+      for (int i = 0; i < num_writes; i++) {
+        if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
+          db.flush(flushOptions);
+        }
+
+        final int type = rnd.nextInt(2);
+        final int index = rnd.nextInt(source_strings.size());
+        final String key = source_strings.get(index);
+        switch (type) {
+          case 0:
+            // put
+            map.put(key, key);
+            db.put(writeOptions, bytes(key), bytes(key));
+            break;
+          case 1:
+            // delete
+            if (map.containsKey(key)) {
+              map.remove(key);
+            }
+            db.delete(writeOptions, bytes(key));
+            break;
+
+          default:
+            fail("Should not be able to generate random outside range 1..2");
+        }
+      }
+    }
+
+    try (final ReadOptions readOptions = new ReadOptions();
+         final RocksIterator iter = db.newIterator(readOptions)) {
+      final KVIter<String, String> result_iter = new KVIter<>(map);
+
+      boolean is_valid = false;
+      for (int i = 0; i < num_iter_ops; i++) {
+        // Random walk and make sure iter and result_iter returns the
+        // same key and value
+        final int type = rnd.nextInt(8);
+        iter.status();
+        switch (type) {
+          case 0:
+            // Seek to First
+            iter.seekToFirst();
+            result_iter.seekToFirst();
+            break;
+          case 1:
+            // Seek to last
+            iter.seekToLast();
+            result_iter.seekToLast();
+            break;
+          case 2: {
+            // Seek to random (existing or non-existing) key
+            final int key_idx = rnd.nextInt(interleaving_strings.size());
+            final String key = interleaving_strings.get(key_idx);
+            iter.seek(bytes(key));
+            result_iter.seek(bytes(key));
+            break;
+          }
+          case 3: {
+            // SeekForPrev to random (existing or non-existing) key
+            final int key_idx = rnd.nextInt(interleaving_strings.size());
+            final String key = interleaving_strings.get(key_idx);
+            iter.seekForPrev(bytes(key));
+            result_iter.seekForPrev(bytes(key));
+            break;
+          }
+          case 4:
+            // Next
+            if (is_valid) {
+              iter.next();
+              result_iter.next();
+            } else {
+              continue;
+            }
+            break;
+          case 5:
+            // Prev
+            if (is_valid) {
+              iter.prev();
+              result_iter.prev();
+            } else {
+              continue;
+            }
+            break;
+          case 6:
+            // Refresh
+            iter.refresh();
+            result_iter.refresh();
+            iter.seekToFirst();
+            result_iter.seekToFirst();
+            break;
+          default: {
+            assert (type == 7);
+            final int key_idx = rnd.nextInt(source_strings.size());
+            final String key = source_strings.get(key_idx);
+            final byte[] result = db.get(readOptions, bytes(key));
+            if (!map.containsKey(key)) {
+              assertNull(result);
+            } else {
+              assertArrayEquals(bytes(map.get(key)), result);
+            }
+            break;
+          }
+        }
+
+        assertEquals(result_iter.isValid(), iter.isValid());
+
+        is_valid = iter.isValid();
+
+        if (is_valid) {
+          assertArrayEquals(bytes(result_iter.key()), iter.key());
+
+          //note that calling value on a non-valid iterator from the Java API
+          //results in a SIGSEGV
+          assertArrayEquals(bytes(result_iter.value()), iter.value());
+        }
+      }
+    }
+  }
+
+  /**
+   * Open the database using a C++ Comparator
+   */
+  private RocksDB openDatabase(
+      final Path dbDir, final BuiltinComparator cppComparator)
+      throws IOException, RocksDBException {
+    final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setComparator(cppComparator);
+    return RocksDB.open(options, dbDir.toAbsolutePath().toString());
+  }
+
+  /**
+   * Open the database using a Java Comparator
+   */
+  private RocksDB openDatabase(
+      final Path dbDir,
+      final AbstractComparator javaComparator)
+      throws IOException, RocksDBException {
+    final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setComparator(javaComparator);
+    return RocksDB.open(options, dbDir.toAbsolutePath().toString());
+  }
+
+  private java.util.Comparator<String> toJavaComparator(
+      final AbstractComparator rocksComparator) {
+    return new java.util.Comparator<String>() {
+      @Override
+      public int compare(final String s1, final String s2) {
+        final ByteBuffer bufS1;
+        final ByteBuffer bufS2;
+        if (rocksComparator.usingDirectBuffers()) {
+          bufS1 = ByteBuffer.allocateDirect(s1.length());
+          bufS2 = ByteBuffer.allocateDirect(s2.length());
+        } else {
+          bufS1 = ByteBuffer.allocate(s1.length());
+          bufS2 = ByteBuffer.allocate(s2.length());
+        }
+        bufS1.put(bytes(s1));
+        bufS1.flip();
+        bufS2.put(bytes(s2));
+        bufS2.flip();
+        return rocksComparator.compare(bufS1, bufS2);
+      }
+    };
+  }
+
+  private static class KVIter<K, V> implements RocksIteratorInterface {
+
+    private final List<Map.Entry<K, V>> entries;
+    private final java.util.Comparator<? super K> comparator;
+    private int offset = -1;
+
+    private int lastPrefixMatchIdx = -1;
+    private int lastPrefixMatch = 0;
+
+    public KVIter(final TreeMap<K, V> map) {
+      this.entries = new ArrayList<>();
+      entries.addAll(map.entrySet());
+      this.comparator = map.comparator();
+    }
+
+
+    @Override
+    public boolean isValid() {
+      return offset > -1 && offset < entries.size();
+    }
+
+    @Override
+    public void seekToFirst() {
+      offset = 0;
+    }
+
+    @Override
+    public void seekToLast() {
+      offset = entries.size() - 1;
+    }
+
+    @SuppressWarnings("unchecked")
+    @Override
+    public void seek(final byte[] target) {
+      for(offset = 0; offset < entries.size(); offset++) {
+        if(comparator.compare(entries.get(offset).getKey(),
+            (K)new String(target, UTF_8)) >= 0) {
+          return;
+        }
+      }
+    }
+
+    @SuppressWarnings("unchecked")
+    @Override
+    public void seekForPrev(final byte[] target) {
+      for(offset = entries.size()-1; offset >= 0; offset--) {
+        if(comparator.compare(entries.get(offset).getKey(),
+            (K)new String(target, UTF_8)) <= 0) {
+          return;
+        }
+      }
+    }
+
+    /**
+     * Is `a` a prefix of `b`
+     *
+     * @return The length of the matching prefix, or 0 if it is not a prefix
+     */
+    private int isPrefix(final byte[] a, final byte[] b) {
+      if(b.length >= a.length) {
+        for(int i = 0; i < a.length; i++) {
+          if(a[i] != b[i]) {
+            return i;
+          }
+        }
+        return a.length;
+      } else {
+        return 0;
+      }
+    }
+
+    @Override
+    public void next() {
+      if(offset < entries.size()) {
+        offset++;
+      }
+    }
+
+    @Override
+    public void prev() {
+      if(offset >= 0) {
+        offset--;
+      }
+    }
+
+    @Override
+    public void refresh() throws RocksDBException {
+      offset = -1;
+    }
+
+    @Override
+    public void status() throws RocksDBException {
+      if(offset < 0 || offset >= entries.size()) {
+        throw new RocksDBException("Index out of bounds. Size is: " +
+            entries.size() + ", offset is: " + offset);
+      }
+    }
+
+    @SuppressWarnings("unchecked")
+    public K key() {
+      if(!isValid()) {
+        if(entries.isEmpty()) {
+          return (K)"";
+        } else if(offset == -1){
+          return entries.get(0).getKey();
+        } else if(offset == entries.size()) {
+          return entries.get(offset - 1).getKey();
+        } else {
+          return (K)"";
+        }
+      } else {
+        return entries.get(offset).getKey();
+      }
+    }
+
+    @SuppressWarnings("unchecked")
+    public V value() {
+      if(!isValid()) {
+        return (V)"";
+      } else {
+        return entries.get(offset).getValue();
+      }
+    }
+
+    @Override
+    public void seek(ByteBuffer target) {
+      throw new IllegalAccessError("Not implemented");
+    }
+
+    @Override
+    public void seekForPrev(ByteBuffer target) {
+      throw new IllegalAccessError("Not implemented");
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java
new file mode 100644
index 000000000..8ea104332
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java
@@ -0,0 +1,190 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb.util;
+
+import org.rocksdb.RocksDBException;
+import org.rocksdb.WriteBatch;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * A simple WriteBatch Handler which adds a record
+ * of each event that it receives to a list
+ */
+public class CapturingWriteBatchHandler extends WriteBatch.Handler {
+
+  private final List<Event> events = new ArrayList<>();
+
+  /**
+   * Returns a copy of the current events list
+   *
+   * @return a list of the events which have happened upto now
+   */
+  public List<Event> getEvents() {
+    return new ArrayList<>(events);
+  }
+
+  @Override
+  public void put(final int columnFamilyId, final byte[] key,
+                  final byte[] value) {
+    events.add(new Event(Action.PUT, columnFamilyId, key, value));
+  }
+
+  @Override
+  public void put(final byte[] key, final byte[] value) {
+    events.add(new Event(Action.PUT, key, value));
+  }
+
+  @Override
+  public void merge(final int columnFamilyId, final byte[] key,
+                    final byte[] value) {
+    events.add(new Event(Action.MERGE, columnFamilyId, key, value));
+  }
+
+  @Override
+  public void merge(final byte[] key, final byte[] value) {
+    events.add(new Event(Action.MERGE, key, value));
+  }
+
+  @Override
+  public void delete(final int columnFamilyId, final byte[] key) {
+    events.add(new Event(Action.DELETE, columnFamilyId, key, (byte[])null));
+  }
+
+  @Override
+  public void delete(final byte[] key) {
+    events.add(new Event(Action.DELETE, key, (byte[])null));
+  }
+
+  @Override
+  public void singleDelete(final int columnFamilyId, final byte[] key) {
+    events.add(new Event(Action.SINGLE_DELETE,
+        columnFamilyId, key, (byte[])null));
+  }
+
+  @Override
+  public void singleDelete(final byte[] key) {
+    events.add(new Event(Action.SINGLE_DELETE, key, (byte[])null));
+  }
+
+  @Override
+  public void deleteRange(final int columnFamilyId, final byte[] beginKey,
+                          final byte[] endKey) {
+    events.add(new Event(Action.DELETE_RANGE, columnFamilyId, beginKey,
+        endKey));
+  }
+
+  @Override
+  public void deleteRange(final byte[] beginKey, final byte[] endKey) {
+    events.add(new Event(Action.DELETE_RANGE, beginKey, endKey));
+  }
+
+  @Override
+  public void logData(final byte[] blob) {
+    events.add(new Event(Action.LOG, (byte[])null, blob));
+  }
+
+  @Override
+  public void putBlobIndex(final int columnFamilyId, final byte[] key,
+                           final byte[] value) {
+    events.add(new Event(Action.PUT_BLOB_INDEX, key, value));
+  }
+
+  @Override
+  public void markBeginPrepare() throws RocksDBException {
+    events.add(new Event(Action.MARK_BEGIN_PREPARE, (byte[])null,
+        (byte[])null));
+  }
+
+  @Override
+  public void markEndPrepare(final byte[] xid) throws RocksDBException {
+    events.add(new Event(Action.MARK_END_PREPARE, (byte[])null,
+        (byte[])null));
+  }
+
+  @Override
+  public void markNoop(final boolean emptyBatch) throws RocksDBException {
+    events.add(new Event(Action.MARK_NOOP, (byte[])null, (byte[])null));
+  }
+
+  @Override
+  public void markRollback(final byte[] xid) throws RocksDBException {
+    events.add(new Event(Action.MARK_ROLLBACK, (byte[])null, (byte[])null));
+  }
+
+  @Override
+  public void markCommit(final byte[] xid) throws RocksDBException {
+    events.add(new Event(Action.MARK_COMMIT, (byte[])null, (byte[])null));
+  }
+
+  @Override
+  public void markCommitWithTimestamp(final byte[] xid, final byte[] ts) throws RocksDBException {
+    events.add(new Event(Action.MARK_COMMIT_WITH_TIMESTAMP, (byte[]) null, (byte[]) null));
+  }
+
+  public static class Event {
+    public final Action action;
+    public final int columnFamilyId;
+    public final byte[] key;
+    public final byte[] value;
+
+    public Event(final Action action, final byte[] key, final byte[] value) {
+      this(action, 0, key, value);
+    }
+
+    public Event(final Action action, final int columnFamilyId, final byte[] key,
+        final byte[] value) {
+      this.action = action;
+      this.columnFamilyId = columnFamilyId;
+      this.key = key;
+      this.value = value;
+    }
+
+    @Override
+    public boolean equals(final Object o) {
+      if (this == o) {
+        return true;
+      }
+      if (o == null || getClass() != o.getClass()) {
+        return false;
+      }
+      final Event event = (Event) o;
+      return columnFamilyId == event.columnFamilyId &&
+          action == event.action &&
+          ((key == null && event.key == null)
+              || Arrays.equals(key, event.key)) &&
+          ((value == null && event.value == null)
+              || Arrays.equals(value, event.value));
+    }
+
+    @Override
+    public int hashCode() {
+      int result = Objects.hash(action, columnFamilyId);
+      result = 31 * result + Arrays.hashCode(key);
+      result = 31 * result + Arrays.hashCode(value);
+      return result;
+    }
+  }
+
+  /**
+   * Enumeration of Write Batch
+   * event actions
+   */
+  public enum Action {
+    PUT,
+    MERGE,
+    DELETE,
+    SINGLE_DELETE,
+    DELETE_RANGE,
+    LOG,
+    PUT_BLOB_INDEX,
+    MARK_BEGIN_PREPARE,
+    MARK_END_PREPARE,
+    MARK_NOOP,
+    MARK_COMMIT,
+    MARK_ROLLBACK,
+    MARK_COMMIT_WITH_TIMESTAMP
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/DirectByteBufferAllocator.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/DirectByteBufferAllocator.java
new file mode 100644
index 000000000..d26fb578b
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/DirectByteBufferAllocator.java
@@ -0,0 +1,18 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import java.nio.ByteBuffer;
+
+public final class DirectByteBufferAllocator implements ByteBufferAllocator {
+  DirectByteBufferAllocator(){};
+
+  @Override
+  public ByteBuffer allocate(final int capacity) {
+    return ByteBuffer.allocateDirect(capacity);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java
new file mode 100644
index 000000000..ae340e06d
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java
@@ -0,0 +1,304 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb.util;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.hamcrest.Matchers.is;
+
+import java.lang.reflect.Field;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class EnvironmentTest {
+  private final static String ARCH_FIELD_NAME = "ARCH";
+  private final static String OS_FIELD_NAME = "OS";
+
+  private final static String MUSL_ENVIRONMENT_FIELD_NAME = "MUSL_ENVIRONMENT";
+  private final static String MUSL_LIBC_FIELD_NAME = "MUSL_LIBC";
+
+  private static String INITIAL_OS;
+  private static String INITIAL_ARCH;
+  private static String INITIAL_MUSL_ENVIRONMENT;
+  private static Boolean INITIAL_MUSL_LIBC;
+
+  @BeforeClass
+  public static void saveState() {
+    INITIAL_ARCH = getEnvironmentClassField(ARCH_FIELD_NAME);
+    INITIAL_OS = getEnvironmentClassField(OS_FIELD_NAME);
+    INITIAL_MUSL_LIBC = getEnvironmentClassField(MUSL_LIBC_FIELD_NAME);
+    INITIAL_MUSL_ENVIRONMENT = getEnvironmentClassField(MUSL_ENVIRONMENT_FIELD_NAME);
+  }
+
+  @Test
+  public void mac32() {
+    setEnvironmentClassFields("mac", "32");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".jnilib");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-osx.jnilib");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.dylib");
+  }
+
+  @Test
+  public void mac64_x86_64() {
+    setEnvironmentClassFields("mac", "x86_64");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".jnilib");
+    assertThat(Environment.getJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-osx-x86_64.jnilib");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-osx.jnilib");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.dylib");
+  }
+
+  @Test
+  public void macAarch64() {
+    setEnvironmentClassFields("mac", "aarch64");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).isEqualTo(".jnilib");
+    assertThat(Environment.getJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-osx-arm64.jnilib");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-osx.jnilib");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.dylib");
+  }
+
+  @Test
+  public void nix32() {
+    // Linux
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
+    setEnvironmentClassFields("Linux", "32");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux32.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+    // Linux musl-libc (Alpine)
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true);
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux32-musl.so");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+    // UNIX
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
+    setEnvironmentClassFields("Unix", "32");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux32.so");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+  }
+
+  @Test(expected = UnsupportedOperationException.class)
+  public void aix32() {
+    // AIX
+    setEnvironmentClassFields("aix", "32");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).isEqualTo("blah");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+  }
+
+  @Test
+  public void nix64() {
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
+    setEnvironmentClassFields("Linux", "x64");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux64.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+    // Linux musl-libc (Alpine)
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true);
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux64-musl.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+    // UNIX
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
+    setEnvironmentClassFields("Unix", "x64");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux64.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+    // AIX
+    setEnvironmentClassFields("aix", "x64");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-aix64.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+  }
+
+  @Test
+  public void detectWindows(){
+    setEnvironmentClassFields("win", "x64");
+    assertThat(Environment.isWindows()).isTrue();
+  }
+
+  @Test
+  public void win64() {
+    setEnvironmentClassFields("win", "x64");
+    assertThat(Environment.isWindows()).isTrue();
+    assertThat(Environment.getJniLibraryExtension()).
+      isEqualTo(".dll");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+      isEqualTo("librocksdbjni-win64.dll");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+      isEqualTo("librocksdbjni.dll");
+  }
+
+  @Test
+  public void ppc64le() {
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
+    setEnvironmentClassFields("Linux", "ppc64le");
+    assertThat(Environment.isUnix()).isTrue();
+    assertThat(Environment.isPowerPC()).isTrue();
+    assertThat(Environment.is64Bit()).isTrue();
+    assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so");
+    assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni");
+    assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le");
+    assertThat(Environment.getJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-linux-ppc64le.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
+    // Linux musl-libc (Alpine)
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true);
+    setEnvironmentClassFields("Linux", "ppc64le");
+    assertThat(Environment.isUnix()).isTrue();
+    assertThat(Environment.isPowerPC()).isTrue();
+    assertThat(Environment.is64Bit()).isTrue();
+    assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so");
+    assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni");
+    assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le-musl");
+    assertThat(Environment.getJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-linux-ppc64le-musl.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
+  }
+
+  @Test
+  public void linuxArch64() {
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
+    setEnvironmentClassFields("Linux", "aarch64");
+    assertThat(Environment.isUnix()).isTrue();
+    assertThat(Environment.isAarch64()).isTrue();
+    assertThat(Environment.is64Bit()).isTrue();
+    assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so");
+    assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni");
+    assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64");
+    assertThat(Environment.getJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-linux-aarch64.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
+    // Linux musl-libc (Alpine)
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true);
+    setEnvironmentClassFields("Linux", "aarch64");
+    assertThat(Environment.isUnix()).isTrue();
+    assertThat(Environment.isAarch64()).isTrue();
+    assertThat(Environment.is64Bit()).isTrue();
+    assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so");
+    assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni");
+    assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64-musl");
+    assertThat(Environment.getJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-linux-aarch64-musl.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
+  }
+
+  @Test
+  public void resolveIsMuslLibc() {
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, null);
+    setEnvironmentClassFields("win", "anyarch");
+    assertThat(Environment.isUnix()).isFalse();
+
+    // with user input, will resolve to true if set as true. Even on OSs that appear absurd for
+    // musl. Users choice
+    assertThat(Environment.initIsMuslLibc()).isFalse();
+    setEnvironmentClassField(MUSL_ENVIRONMENT_FIELD_NAME, "true");
+    assertThat(Environment.initIsMuslLibc()).isTrue();
+    setEnvironmentClassField(MUSL_ENVIRONMENT_FIELD_NAME, "false");
+    assertThat(Environment.initIsMuslLibc()).isFalse();
+  }
+
+  private void setEnvironmentClassFields(String osName,
+      String osArch) {
+    setEnvironmentClassField(OS_FIELD_NAME, osName);
+    setEnvironmentClassField(ARCH_FIELD_NAME, osArch);
+  }
+
+  @AfterClass
+  public static void restoreState() {
+    setEnvironmentClassField(OS_FIELD_NAME, INITIAL_OS);
+    setEnvironmentClassField(ARCH_FIELD_NAME, INITIAL_ARCH);
+    setEnvironmentClassField(MUSL_ENVIRONMENT_FIELD_NAME, INITIAL_MUSL_ENVIRONMENT);
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, INITIAL_MUSL_LIBC);
+  }
+
+  @SuppressWarnings("unchecked")
+  private static <T> T getEnvironmentClassField(String fieldName) {
+    final Field field;
+    try {
+      field = Environment.class.getDeclaredField(fieldName);
+      field.setAccessible(true);
+      /* Fails in JDK 13; and not needed unless fields are final
+      final Field modifiersField = Field.class.getDeclaredField("modifiers");
+      modifiersField.setAccessible(true);
+      modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL);
+      */
+      return (T)field.get(null);
+    } catch (final NoSuchFieldException | IllegalAccessException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  private static void setEnvironmentClassField(String fieldName, Object value) {
+    final Field field;
+    try {
+      field = Environment.class.getDeclaredField(fieldName);
+      field.setAccessible(true);
+      /* Fails in JDK 13; and not needed unless fields are final
+      final Field modifiersField = Field.class.getDeclaredField("modifiers");
+      modifiersField.setAccessible(true);
+      modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL);
+      */
+      field.set(null, value);
+    } catch (final NoSuchFieldException | IllegalAccessException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/HeapByteBufferAllocator.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/HeapByteBufferAllocator.java
new file mode 100644
index 000000000..ad6b8f6f4
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/HeapByteBufferAllocator.java
@@ -0,0 +1,18 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import java.nio.ByteBuffer;
+
+public final class HeapByteBufferAllocator implements ByteBufferAllocator {
+  HeapByteBufferAllocator(){};
+
+  @Override
+  public ByteBuffer allocate(final int capacity) {
+    return ByteBuffer.allocate(capacity);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/IntComparatorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/IntComparatorTest.java
new file mode 100644
index 000000000..dd3288513
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/IntComparatorTest.java
@@ -0,0 +1,266 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+import org.rocksdb.*;
+
+import java.nio.ByteBuffer;
+import java.nio.file.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Tests for IntComparator, but more generally
+ * also for rocksdb::ComparatorJniCallback implementation.
+ */
+@RunWith(Parameterized.class)
+public class IntComparatorTest {
+
+  // test with 500 random integer keys
+  private static final int TOTAL_KEYS = 500;
+  private static final byte[][] keys = new byte[TOTAL_KEYS][4];
+
+  @BeforeClass
+  public static void prepareKeys() {
+    final ByteBuffer buf = ByteBuffer.allocate(4);
+    final Random random = new Random();
+    for (int i = 0; i < TOTAL_KEYS; i++) {
+      final int ri = random.nextInt();
+      buf.putInt(ri);
+      buf.flip();
+      final byte[] key = buf.array();
+
+      // does key already exist (avoid duplicates)
+      if (keyExists(key, i)) {
+        i--; // loop round and generate a different key
+      } else {
+        System.arraycopy(key, 0, keys[i], 0, 4);
+      }
+    }
+  }
+
+  private static boolean keyExists(final byte[] key, final int limit) {
+    for (int j = 0; j < limit; j++) {
+      if (Arrays.equals(key, keys[j])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  @Parameters(name = "{0}")
+  public static Iterable<Object[]> parameters() {
+    return Arrays.asList(new Object[][] {
+        { "non-direct_reused64_mutex", false, 64, ReusedSynchronisationType.MUTEX },
+        { "direct_reused64_mutex", true, 64, ReusedSynchronisationType.MUTEX },
+        { "non-direct_reused64_adaptive-mutex", false, 64, ReusedSynchronisationType.ADAPTIVE_MUTEX },
+        { "direct_reused64_adaptive-mutex", true, 64, ReusedSynchronisationType.ADAPTIVE_MUTEX },
+        { "non-direct_reused64_thread-local", false, 64, ReusedSynchronisationType.THREAD_LOCAL },
+        { "direct_reused64_thread-local", true, 64, ReusedSynchronisationType.THREAD_LOCAL },
+        { "non-direct_noreuse", false, -1, null },
+        { "direct_noreuse", true, -1, null }
+    });
+  }
+
+  @Parameter(0)
+  public String name;
+
+  @Parameter(1)
+  public boolean useDirectBuffer;
+
+  @Parameter(2)
+  public int maxReusedBufferSize;
+
+  @Parameter(3)
+  public ReusedSynchronisationType reusedSynchronisationType;
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+
+  @Test
+  public void javaComparatorDefaultCf() throws RocksDBException {
+    try (final ComparatorOptions options = new ComparatorOptions()
+        .setUseDirectBuffer(useDirectBuffer)
+        .setMaxReusedBufferSize(maxReusedBufferSize)
+        // if reusedSynchronisationType == null we assume that maxReusedBufferSize <= 0 and so we just set ADAPTIVE_MUTEX, even though it won't be used
+        .setReusedSynchronisationType(reusedSynchronisationType == null ? ReusedSynchronisationType.ADAPTIVE_MUTEX : reusedSynchronisationType);
+        final IntComparator comparator = new IntComparator(options)) {
+
+      // test the round-tripability of keys written and read with the Comparator
+      testRoundtrip(FileSystems.getDefault().getPath(
+          dbFolder.getRoot().getAbsolutePath()), comparator);
+    }
+  }
+
+  @Test
+  public void javaComparatorNamedCf() throws RocksDBException {
+    try (final ComparatorOptions options = new ComparatorOptions()
+        .setUseDirectBuffer(useDirectBuffer)
+        .setMaxReusedBufferSize(maxReusedBufferSize)
+        // if reusedSynchronisationType == null we assume that maxReusedBufferSize <= 0 and so we just set ADAPTIVE_MUTEX, even though it won't be used
+        .setReusedSynchronisationType(reusedSynchronisationType == null ? ReusedSynchronisationType.ADAPTIVE_MUTEX : reusedSynchronisationType);
+      final IntComparator comparator = new IntComparator(options)) {
+
+      // test the round-tripability of keys written and read with the Comparator
+      testRoundtripCf(FileSystems.getDefault().getPath(
+          dbFolder.getRoot().getAbsolutePath()), comparator);
+    }
+  }
+
+  /**
+   * Test which stores random keys into the database
+   * using an {@link IntComparator}
+   * it then checks that these keys are read back in
+   * ascending order
+   *
+   * @param db_path A path where we can store database
+   *                files temporarily
+   *
+   * @param comparator the comparator
+   *
+   * @throws RocksDBException if a database error happens.
+   */
+  private void testRoundtrip(final Path db_path,
+      final AbstractComparator comparator) throws RocksDBException {
+    try (final Options opt = new Options()
+             .setCreateIfMissing(true)
+             .setComparator(comparator)) {
+
+      // store TOTAL_KEYS into the db
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString())) {
+        for (int i = 0; i < TOTAL_KEYS; i++) {
+              db.put(keys[i], "value".getBytes(UTF_8));
+        }
+      }
+
+      // re-open db and read from start to end
+      // integer keys should be in ascending
+      // order as defined by IntComparator
+      final ByteBuffer key = ByteBuffer.allocate(4);
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString());
+           final RocksIterator it = db.newIterator()) {
+        it.seekToFirst();
+        int lastKey = Integer.MIN_VALUE;
+        int count = 0;
+        for (it.seekToFirst(); it.isValid(); it.next()) {
+          key.put(it.key());
+          key.flip();
+          final int thisKey = key.getInt();
+          key.clear();
+          assertThat(thisKey).isGreaterThan(lastKey);
+          lastKey = thisKey;
+          count++;
+        }
+        assertThat(count).isEqualTo(TOTAL_KEYS);
+      }
+    }
+  }
+
+  /**
+   * Test which stores random keys into a column family
+   * in the database
+   * using an {@link IntComparator}
+   * it then checks that these keys are read back in
+   * ascending order
+   *
+   * @param db_path A path where we can store database
+   *                files temporarily
+   *
+   * @param comparator the comparator
+   *
+   * @throws RocksDBException if a database error happens.
+   */
+  private void testRoundtripCf(final Path db_path,
+      final AbstractComparator comparator) throws RocksDBException {
+
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes(),
+            new ColumnFamilyOptions()
+                .setComparator(comparator))
+    );
+
+    final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+
+    try (final DBOptions opt = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true)) {
+
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString(),
+          cfDescriptors, cfHandles)) {
+        try {
+          assertThat(cfDescriptors.size()).isEqualTo(2);
+          assertThat(cfHandles.size()).isEqualTo(2);
+
+          for (int i = 0; i < TOTAL_KEYS; i++) {
+            db.put(cfHandles.get(1), keys[i], "value".getBytes(UTF_8));
+          }
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+          cfHandles.clear();
+        }
+      }
+
+      // re-open db and read from start to end
+      // integer keys should be in ascending
+      // order as defined by SimpleIntComparator
+      final ByteBuffer key = ByteBuffer.allocate(4);
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString(),
+          cfDescriptors, cfHandles);
+           final RocksIterator it = db.newIterator(cfHandles.get(1))) {
+        try {
+          assertThat(cfDescriptors.size()).isEqualTo(2);
+          assertThat(cfHandles.size()).isEqualTo(2);
+
+          it.seekToFirst();
+          int lastKey = Integer.MIN_VALUE;
+          int count = 0;
+          for (it.seekToFirst(); it.isValid(); it.next()) {
+            key.put(it.key());
+            key.flip();
+            final int thisKey = key.getInt();
+            key.clear();
+            assertThat(thisKey).isGreaterThan(lastKey);
+            lastKey = thisKey;
+            count++;
+          }
+
+          assertThat(count).isEqualTo(TOTAL_KEYS);
+
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+          cfHandles.clear();
+          for (final ColumnFamilyDescriptor cfDescriptor : cfDescriptors) {
+            cfDescriptor.getOptions().close();
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/JNIComparatorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/JNIComparatorTest.java
new file mode 100644
index 000000000..a962b8d78
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/JNIComparatorTest.java
@@ -0,0 +1,180 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+import org.rocksdb.*;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.*;
+import java.util.Arrays;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+@RunWith(Parameterized.class)
+public class JNIComparatorTest {
+
+  @Parameters(name = "{0}")
+  public static Iterable<Object[]> parameters() {
+    return Arrays.asList(new Object[][] {
+        { "bytewise_non-direct", BuiltinComparator.BYTEWISE_COMPARATOR, false },
+        { "bytewise_direct", BuiltinComparator.BYTEWISE_COMPARATOR, true },
+        { "reverse-bytewise_non-direct", BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR, false },
+        { "reverse-bytewise_direct", BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR, true },
+    });
+  }
+
+  @Parameter(0)
+  public String name;
+
+  @Parameter(1)
+  public BuiltinComparator builtinComparator;
+
+  @Parameter(2)
+  public boolean useDirectBuffer;
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private static final int MIN = Short.MIN_VALUE - 1;
+  private static final int MAX = Short.MAX_VALUE + 1;
+
+  @Test
+  public void java_comparator_equals_cpp_comparator() throws RocksDBException, IOException {
+    final int[] javaKeys;
+    try (final ComparatorOptions comparatorOptions = new ComparatorOptions();
+         final AbstractComparator comparator = builtinComparator == BuiltinComparator.BYTEWISE_COMPARATOR
+             ? new BytewiseComparator(comparatorOptions)
+             : new ReverseBytewiseComparator(comparatorOptions)) {
+      final Path javaDbDir =
+          FileSystems.getDefault().getPath(dbFolder.newFolder().getAbsolutePath());
+      storeWithJavaComparator(javaDbDir, comparator);
+      javaKeys = readAllWithJavaComparator(javaDbDir, comparator);
+    }
+
+    final Path cppDbDir =
+        FileSystems.getDefault().getPath(dbFolder.newFolder().getAbsolutePath());
+    storeWithCppComparator(cppDbDir, builtinComparator);
+    final int[] cppKeys =
+        readAllWithCppComparator(cppDbDir, builtinComparator);
+
+    assertThat(javaKeys).isEqualTo(cppKeys);
+  }
+
+  private void storeWithJavaComparator(final Path dir,
+      final AbstractComparator comparator) throws RocksDBException {
+    final ByteBuffer buf = ByteBuffer.allocate(4);
+    try (final Options options = new Options()
+             .setCreateIfMissing(true)
+             .setComparator(comparator);
+         final RocksDB db =
+             RocksDB.open(options, dir.toAbsolutePath().toString())) {
+      for (int i = MIN; i < MAX; i++) {
+        buf.putInt(i);
+        buf.flip();
+
+        db.put(buf.array(), buf.array());
+
+        buf.clear();
+      }
+    }
+  }
+
+  private void storeWithCppComparator(final Path dir,
+      final BuiltinComparator builtinComparator) throws RocksDBException {
+    try (final Options options = new Options()
+             .setCreateIfMissing(true)
+             .setComparator(builtinComparator);
+         final RocksDB db =
+             RocksDB.open(options, dir.toAbsolutePath().toString())) {
+
+      final ByteBuffer buf = ByteBuffer.allocate(4);
+      for (int i = MIN; i < MAX; i++) {
+        buf.putInt(i);
+        buf.flip();
+
+        db.put(buf.array(), buf.array());
+
+        buf.clear();
+      }
+    }
+  }
+
+  private int[] readAllWithJavaComparator(final Path dir,
+      final AbstractComparator comparator) throws RocksDBException {
+    try (final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setComparator(comparator);
+         final RocksDB db =
+             RocksDB.open(options, dir.toAbsolutePath().toString())) {
+
+      try (final RocksIterator it = db.newIterator()) {
+        it.seekToFirst();
+
+        final ByteBuffer buf = ByteBuffer.allocate(4);
+        final int[] keys = new int[MAX - MIN];
+        int idx = 0;
+        while (it.isValid()) {
+          buf.put(it.key());
+          buf.flip();
+
+          final int thisKey = buf.getInt();
+          keys[idx++] = thisKey;
+
+          buf.clear();
+
+          it.next();
+        }
+
+        return keys;
+      }
+    }
+  }
+
+  private int[] readAllWithCppComparator(final Path dir,
+      final BuiltinComparator comparator) throws RocksDBException {
+    try (final Options options = new Options()
+        .setCreateIfMissing(true)
+        .setComparator(comparator);
+         final RocksDB db =
+             RocksDB.open(options, dir.toAbsolutePath().toString())) {
+
+      try (final RocksIterator it = db.newIterator()) {
+        it.seekToFirst();
+
+        final ByteBuffer buf = ByteBuffer.allocate(4);
+        final int[] keys = new int[MAX - MIN];
+        int idx = 0;
+        while (it.isValid()) {
+          buf.put(it.key());
+          buf.flip();
+
+          final int thisKey = buf.getInt();
+          keys[idx++] = thisKey;
+
+          buf.clear();
+
+          it.next();
+        }
+
+        return keys;
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/ReverseBytewiseComparatorIntTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/ReverseBytewiseComparatorIntTest.java
new file mode 100644
index 000000000..ca08d9de1
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/ReverseBytewiseComparatorIntTest.java
@@ -0,0 +1,270 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+import org.rocksdb.*;
+
+import java.nio.ByteBuffer;
+import java.nio.file.FileSystems;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Similar to {@link IntComparatorTest}, but uses
+ * {@link ReverseBytewiseComparator} which ensures the correct reverse
+ * ordering of positive integers.
+ */
+@RunWith(Parameterized.class)
+public class ReverseBytewiseComparatorIntTest {
+
+  // test with 500 random positive integer keys
+  private static final int TOTAL_KEYS = 500;
+  private static final byte[][] keys = new byte[TOTAL_KEYS][4];
+
+  @BeforeClass
+  public static void prepareKeys() {
+    final ByteBuffer buf = ByteBuffer.allocate(4);
+    final Random random = new Random();
+    for (int i = 0; i < TOTAL_KEYS; i++) {
+      final int ri = random.nextInt() & Integer.MAX_VALUE;  // the & ensures positive integer
+      buf.putInt(ri);
+      buf.flip();
+      final byte[] key = buf.array();
+
+      // does key already exist (avoid duplicates)
+      if (keyExists(key, i)) {
+        i--; // loop round and generate a different key
+      } else {
+        System.arraycopy(key, 0, keys[i], 0, 4);
+      }
+    }
+  }
+
+  private static boolean keyExists(final byte[] key, final int limit) {
+    for (int j = 0; j < limit; j++) {
+      if (Arrays.equals(key, keys[j])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  @Parameters(name = "{0}")
+  public static Iterable<Object[]> parameters() {
+    return Arrays.asList(new Object[][] {
+        { "non-direct_reused64_mutex", false, 64, ReusedSynchronisationType.MUTEX },
+        { "direct_reused64_adaptive-mutex", true, 64, ReusedSynchronisationType.MUTEX },
+        { "non-direct_reused64_adaptive-mutex", false, 64, ReusedSynchronisationType.ADAPTIVE_MUTEX },
+        { "direct_reused64_adaptive-mutex", true, 64, ReusedSynchronisationType.ADAPTIVE_MUTEX },
+        { "non-direct_reused64_adaptive-mutex", false, 64, ReusedSynchronisationType.THREAD_LOCAL },
+        { "direct_reused64_adaptive-mutex", true, 64, ReusedSynchronisationType.THREAD_LOCAL },
+        { "non-direct_noreuse", false, -1, null },
+        { "direct_noreuse", true, -1, null }
+    });
+  }
+
+  @Parameter(0)
+  public String name;
+
+  @Parameter(1)
+  public boolean useDirectBuffer;
+
+  @Parameter(2)
+  public int maxReusedBufferSize;
+
+  @Parameter(3)
+  public ReusedSynchronisationType reusedSynchronisationType;
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+
+  @Test
+  public void javaComparatorDefaultCf() throws RocksDBException {
+    try (final ComparatorOptions options = new ComparatorOptions()
+        .setUseDirectBuffer(useDirectBuffer)
+        .setMaxReusedBufferSize(maxReusedBufferSize)
+        // if reusedSynchronisationType == null we assume that maxReusedBufferSize <= 0 and so we just set ADAPTIVE_MUTEX, even though it won't be used
+        .setReusedSynchronisationType(reusedSynchronisationType == null ? ReusedSynchronisationType.ADAPTIVE_MUTEX : reusedSynchronisationType);
+        final ReverseBytewiseComparator comparator =
+            new ReverseBytewiseComparator(options)) {
+
+      // test the round-tripability of keys written and read with the Comparator
+      testRoundtrip(FileSystems.getDefault().getPath(
+          dbFolder.getRoot().getAbsolutePath()), comparator);
+    }
+  }
+
+  @Test
+  public void javaComparatorNamedCf() throws RocksDBException {
+    try (final ComparatorOptions options = new ComparatorOptions()
+        .setUseDirectBuffer(useDirectBuffer)
+        .setMaxReusedBufferSize(maxReusedBufferSize)
+        // if reusedSynchronisationType == null we assume that maxReusedBufferSize <= 0 and so we just set ADAPTIVE_MUTEX, even though it won't be used
+        .setReusedSynchronisationType(reusedSynchronisationType == null ? ReusedSynchronisationType.ADAPTIVE_MUTEX : reusedSynchronisationType);
+      final ReverseBytewiseComparator comparator
+          = new ReverseBytewiseComparator(options)) {
+
+      // test the round-tripability of keys written and read with the Comparator
+      testRoundtripCf(FileSystems.getDefault().getPath(
+          dbFolder.getRoot().getAbsolutePath()), comparator);
+    }
+  }
+
+  /**
+   * Test which stores random keys into the database
+   * using an {@link IntComparator}
+   * it then checks that these keys are read back in
+   * ascending order
+   *
+   * @param db_path A path where we can store database
+   *                files temporarily
+   *
+   * @param comparator the comparator
+   *
+   * @throws RocksDBException if a database error happens.
+   */
+  private void testRoundtrip(final Path db_path,
+      final AbstractComparator comparator) throws RocksDBException {
+    try (final Options opt = new Options()
+             .setCreateIfMissing(true)
+             .setComparator(comparator)) {
+
+      // store TOTAL_KEYS into the db
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString())) {
+        for (int i = 0; i < TOTAL_KEYS; i++) {
+              db.put(keys[i], "value".getBytes(UTF_8));
+        }
+      }
+
+      // re-open db and read from start to end
+      // integer keys should be in descending
+      // order
+      final ByteBuffer key = ByteBuffer.allocate(4);
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString());
+           final RocksIterator it = db.newIterator()) {
+        it.seekToFirst();
+        int lastKey = Integer.MAX_VALUE;
+        int count = 0;
+        for (it.seekToFirst(); it.isValid(); it.next()) {
+          key.put(it.key());
+          key.flip();
+          final int thisKey = key.getInt();
+          key.clear();
+          assertThat(thisKey).isLessThan(lastKey);
+          lastKey = thisKey;
+          count++;
+        }
+        assertThat(count).isEqualTo(TOTAL_KEYS);
+      }
+    }
+  }
+
+  /**
+   * Test which stores random keys into a column family
+   * in the database
+   * using an {@link IntComparator}
+   * it then checks that these keys are read back in
+   * ascending order
+   *
+   * @param db_path A path where we can store database
+   *                files temporarily
+   *
+   * @param comparator the comparator
+   *
+   * @throws RocksDBException if a database error happens.
+   */
+  private void testRoundtripCf(final Path db_path,
+      final AbstractComparator comparator) throws RocksDBException {
+
+    final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+        new ColumnFamilyDescriptor("new_cf".getBytes(),
+            new ColumnFamilyOptions()
+                .setComparator(comparator))
+    );
+
+    final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+
+    try (final DBOptions opt = new DBOptions()
+        .setCreateIfMissing(true)
+        .setCreateMissingColumnFamilies(true)) {
+
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString(),
+          cfDescriptors, cfHandles)) {
+        try {
+          assertThat(cfDescriptors.size()).isEqualTo(2);
+          assertThat(cfHandles.size()).isEqualTo(2);
+
+          for (int i = 0; i < TOTAL_KEYS; i++) {
+            db.put(cfHandles.get(1), keys[i], "value".getBytes(UTF_8));
+          }
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+          cfHandles.clear();
+        }
+      }
+
+      // re-open db and read from start to end
+      // integer keys should be in descending
+      // order
+      final ByteBuffer key = ByteBuffer.allocate(4);
+      try (final RocksDB db = RocksDB.open(opt, db_path.toString(),
+          cfDescriptors, cfHandles);
+           final RocksIterator it = db.newIterator(cfHandles.get(1))) {
+        try {
+          assertThat(cfDescriptors.size()).isEqualTo(2);
+          assertThat(cfHandles.size()).isEqualTo(2);
+
+          it.seekToFirst();
+          int lastKey = Integer.MAX_VALUE;
+          int count = 0;
+          for (it.seekToFirst(); it.isValid(); it.next()) {
+            key.put(it.key());
+            key.flip();
+            final int thisKey = key.getInt();
+            key.clear();
+            assertThat(thisKey).isLessThan(lastKey);
+            lastKey = thisKey;
+            count++;
+          }
+
+          assertThat(count).isEqualTo(TOTAL_KEYS);
+
+        } finally {
+          for (final ColumnFamilyHandle cfHandle : cfHandles) {
+            cfHandle.close();
+          }
+          cfHandles.clear();
+          for (final ColumnFamilyDescriptor cfDescriptor : cfDescriptors) {
+            cfDescriptor.getOptions().close();
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java
new file mode 100644
index 000000000..990aa5f47
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java
@@ -0,0 +1,27 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+package org.rocksdb.util;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class SizeUnitTest {
+
+  public static final long COMPUTATION_UNIT = 1024L;
+
+  @Test
+  public void sizeUnit() {
+    assertThat(SizeUnit.KB).isEqualTo(COMPUTATION_UNIT);
+    assertThat(SizeUnit.MB).isEqualTo(
+        SizeUnit.KB * COMPUTATION_UNIT);
+    assertThat(SizeUnit.GB).isEqualTo(
+        SizeUnit.MB * COMPUTATION_UNIT);
+    assertThat(SizeUnit.TB).isEqualTo(
+        SizeUnit.GB * COMPUTATION_UNIT);
+    assertThat(SizeUnit.PB).isEqualTo(
+        SizeUnit.TB * COMPUTATION_UNIT);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java
new file mode 100644
index 000000000..e4f490c8e
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb.util;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+import org.rocksdb.CompactionPriority;
+import org.rocksdb.Options;
+import org.rocksdb.WALRecoveryMode;
+
+/**
+ * General test utilities.
+ */
+public class TestUtil {
+
+  /**
+   * Get the options for log iteration tests.
+   *
+   * @return the options
+   */
+  public static Options optionsForLogIterTest() {
+    return defaultOptions()
+        .setCreateIfMissing(true)
+        .setWalTtlSeconds(1000);
+  }
+
+  /**
+   * Get the default options.
+   *
+   * @return the options
+   */
+  public static Options defaultOptions() {
+      return new Options()
+          .setWriteBufferSize(4090 * 4096)
+          .setTargetFileSizeBase(2 * 1024 * 1024)
+          .setMaxBytesForLevelBase(10 * 1024 * 1024)
+          .setMaxOpenFiles(5000)
+          .setWalRecoveryMode(WALRecoveryMode.TolerateCorruptedTailRecords)
+          .setCompactionPriority(CompactionPriority.ByCompensatedSize);
+  }
+
+  private static final Random random = new Random();
+
+  /**
+   * Generate a random string of bytes.
+   *
+   * @param len the length of the string to generate.
+   *
+   * @return the random string of bytes
+   */
+  public static byte[] dummyString(final int len) {
+    final byte[] str = new byte[len];
+    random.nextBytes(str);
+    return str;
+  }
+
+  /**
+   * Copy a {@link ByteBuffer} into an array for shorthand ease of test coding
+   * @param byteBuffer the buffer to copy
+   * @return a {@link byte[]} containing the same bytes as the input
+   */
+  public static byte[] bufferBytes(final ByteBuffer byteBuffer) {
+    final byte[] result = new byte[byteBuffer.limit()];
+    byteBuffer.get(result);
+    return result;
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java
new file mode 100644
index 000000000..2efa16473
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java
@@ -0,0 +1,139 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+package org.rocksdb.util;
+
+import org.rocksdb.RocksDBException;
+import org.rocksdb.WriteBatch;
+
+import java.util.Arrays;
+
+public class WriteBatchGetter extends WriteBatch.Handler {
+
+  private int columnFamilyId = -1;
+  private final byte[] key;
+  private byte[] value;
+
+  public WriteBatchGetter(final byte[] key) {
+    this.key = key;
+  }
+
+  public byte[] getValue() {
+    return value;
+  }
+
+  @Override
+  public void put(final int columnFamilyId, final byte[] key,
+                  final byte[] value) {
+    if(Arrays.equals(this.key, key)) {
+      this.columnFamilyId = columnFamilyId;
+      this.value = value;
+    }
+  }
+
+  @Override
+  public void put(final byte[] key, final byte[] value) {
+    if(Arrays.equals(this.key, key)) {
+      this.value = value;
+    }
+  }
+
+  @Override
+  public void merge(final int columnFamilyId, final byte[] key,
+                    final byte[] value) {
+    if(Arrays.equals(this.key, key)) {
+      this.columnFamilyId = columnFamilyId;
+      this.value = value;
+    }
+  }
+
+  @Override
+  public void merge(final byte[] key, final byte[] value) {
+    if(Arrays.equals(this.key, key)) {
+      this.value = value;
+    }
+  }
+
+  @Override
+  public void delete(final int columnFamilyId, final byte[] key) {
+    if(Arrays.equals(this.key, key)) {
+      this.columnFamilyId = columnFamilyId;
+      this.value = null;
+    }
+  }
+
+  @Override
+  public void delete(final byte[] key) {
+    if(Arrays.equals(this.key, key)) {
+      this.value = null;
+    }
+  }
+
+  @Override
+  public void singleDelete(final int columnFamilyId, final byte[] key) {
+    if(Arrays.equals(this.key, key)) {
+      this.columnFamilyId = columnFamilyId;
+      this.value = null;
+    }
+  }
+
+  @Override
+  public void singleDelete(final byte[] key) {
+    if(Arrays.equals(this.key, key)) {
+      this.value = null;
+    }
+  }
+
+  @Override
+  public void deleteRange(final int columnFamilyId, final byte[] beginKey,
+                          final byte[] endKey) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void deleteRange(final byte[] beginKey, final byte[] endKey) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void logData(final byte[] blob) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void putBlobIndex(final int columnFamilyId, final byte[] key,
+                           final byte[] value) {
+    if(Arrays.equals(this.key, key)) {
+      this.columnFamilyId = columnFamilyId;
+      this.value = value;
+    }
+  }
+
+  @Override
+  public void markBeginPrepare() throws RocksDBException {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void markEndPrepare(final byte[] xid) throws RocksDBException {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void markNoop(final boolean emptyBatch) throws RocksDBException {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void markRollback(final byte[] xid) throws RocksDBException {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void markCommit(final byte[] xid) throws RocksDBException {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void markCommitWithTimestamp(final byte[] xid, final byte[] ts) throws RocksDBException {
+    throw new UnsupportedOperationException();
+  }
+}
diff --git a/src/rocksdb/java/understanding_options.md b/src/rocksdb/java/understanding_options.md
new file mode 100644
index 000000000..0393aff4d
--- /dev/null
+++ b/src/rocksdb/java/understanding_options.md
@@ -0,0 +1,79 @@
+# How RocksDB Options and their Java Wrappers Work
+
+Options in RocksDB come in many different flavours. This is an attempt at a taxonomy and explanation.
+
+## RocksDB Options
+
+Initially, I believe, RocksDB had only database options. I don't know if any of these were mutable. Column families came later. Read on to understand the terminology.
+
+So to begin, one sets up a collection of options and starts/creates a database with these options. That's a useful way to think about it, because from a Java point-of-view (and I didn't realise this initially and got very confused), despite making native calls to C++, the `API`s are just manipulating a native C++ configuration object. This object is just a record of configuration, and it must later be passed to the database (at create or open time) in order to apply the options.
+
+### Database versus Column Family
+
+The concept of the *column family* or `CF` is widespread within RocksDB. I think of it as a data namespace, but conveniently transactions can operate across these namespaces. The concept of a default column family exists, and when operations do not refer to a particular `CF`, it refers to the default.
+
+We raise this w.r.t. options because many options, perhaps most that users encounter, are *column family options*. That is to say they apply individually to a particular column family, or to the default column family. Crucially also, many/most/all of these same options are exposed as *database options* and then apply as the default for column families which do not have the option set explicitly. Obviously some database options are naturally database-wide; they apply to the operation of the database and don't make any sense applied to a column family.
+
+### Mutability
+
+There are 2 kinds of options
+
+- Mutable options
+- Immutable options. We name these in contrast to the mutable ones, but they are usually referred to unqualified.
+
+Mutable options are those which can be changed on a running `RocksDB` instance. Immutable options can only be configured prior to the start of a database. Of course, we can configure the immutable options at this time too; The entirety of options is a strict superset of the mutable options.
+
+Mutable options (whether column-family specific or database-wide) are manipulated at runtime with builders, so we have `MutableDBOptions.MutableDBOptionsBuilder` and `MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder` which share tooling classes/hierarchy and maintain and manipulate the relevant options as a `(key,value)` map.
+
+Mutable options are then passed using `setOptions()` and `setDBOptions()` methods on the live RocksDB, and then take effect immediately (depending on the semantics of the option) on the database.
+
+### Advanced
+
+There are 2 classes of options
+
+- Advanced options
+- Non-advanced options
+
+It's not clear to me what the conceptual distinction is between advanced and not. However, the Java code takes care to reflect it from the underlying C++.
+
+This leads to 2 separate type hierarchies within column family options, one for each `class` of options. The `kind`s are represented by where the options appear in their hierarchy.
+
+```java
+interface ColumnFamilyOptionsInterface<T extends ColumnFamilyOptionsInterface<T>>
+    extends AdvancedColumnFamilyOptionsInterface<T>
+interface MutableColumnFamilyOptionsInterface<T extends MutableColumnFamilyOptionsInterface<T>>
+    extends AdvancedMutableColumnFamilyOptionsInterface<T>
+```
+
+And then there is ultimately a single concrete implementation class for CF options:
+
+```java
+class ColumnFamilyOptions extends RocksObject
+    implements ColumnFamilyOptionsInterface<ColumnFamilyOptions>,
+    MutableColumnFamilyOptionsInterface<ColumnFamilyOptions>
+```
+
+as there is a single concrete implementation class for DB options:
+
+```java
+class DBOptions extends RocksObject
+    implements DBOptionsInterface<DBOptions>,
+    MutableDBOptionsInterface<DBOptions>
+```
+
+Interestingly `DBOptionsInterface` doesn't extend `MutableDBOptionsInterface`, if only in order to disrupt our belief in consistent basic laws of the Universe.
+
+## Startup/Creation Options
+
+```java
+class Options extends RocksObject
+    implements DBOptionsInterface<Options>,
+    MutableDBOptionsInterface<Options>,
+    ColumnFamilyOptionsInterface<Options>,
+    MutableColumnFamilyOptionsInterface<Options>
+```
+
+### Example - Blob Options
+
+The `enable_blob_files` and `min_blob_size` options are per-column-family, and are mutable. The options also appear in the unqualified database options. So by initial configuration, we can set up a RocksDB database where for every `(key,value)` with a value of size at least `min_blob_size`, the value is written (indirected) to a blob file. Blobs may share a blob file, subject to the configuration values set. Later, using the `MutableColumnFamilyOptionsInterface` of the `ColumnFamilyOptions`, we can choose to turn this off (`enable_blob_files=false`) , or alter the `min_blob_size` for the default column family, or any other column family. It seems to me that we cannot, though, mutate the column family options for all column families using the
+`setOptions()` mechanism, either for all existing column families or for all future column families; but maybe we can do the latter on a re-`open()/create()'
diff --git a/src/rocksdb/logging/auto_roll_logger.cc b/src/rocksdb/logging/auto_roll_logger.cc
new file mode 100644
index 000000000..fe0958479
--- /dev/null
+++ b/src/rocksdb/logging/auto_roll_logger.cc
@@ -0,0 +1,372 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "logging/auto_roll_logger.h"
+
+#include <algorithm>
+
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+// -- AutoRollLogger
+
+AutoRollLogger::AutoRollLogger(const std::shared_ptr<FileSystem>& fs,
+                               const std::shared_ptr<SystemClock>& clock,
+                               const std::string& dbname,
+                               const std::string& db_log_dir,
+                               size_t log_max_size,
+                               size_t log_file_time_to_roll,
+                               size_t keep_log_file_num,
+                               const InfoLogLevel log_level)
+    : Logger(log_level),
+      dbname_(dbname),
+      db_log_dir_(db_log_dir),
+      fs_(fs),
+      clock_(clock),
+      status_(Status::OK()),
+      kMaxLogFileSize(log_max_size),
+      kLogFileTimeToRoll(log_file_time_to_roll),
+      kKeepLogFileNum(keep_log_file_num),
+      cached_now(static_cast<uint64_t>(clock_->NowMicros() * 1e-6)),
+      ctime_(cached_now),
+      cached_now_access_count(0),
+      call_NowMicros_every_N_records_(100),
+      mutex_() {
+  Status s = fs->GetAbsolutePath(dbname, io_options_, &db_absolute_path_,
+                                 &io_context_);
+  if (s.IsNotSupported()) {
+    db_absolute_path_ = dbname;
+  } else {
+    status_ = s;
+  }
+  log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_);
+  if (fs_->FileExists(log_fname_, io_options_, &io_context_).ok()) {
+    RollLogFile();
+  }
+  GetExistingFiles();
+  s = ResetLogger();
+  if (s.ok() && status_.ok()) {
+    status_ = TrimOldLogFiles();
+  }
+}
+
+Status AutoRollLogger::ResetLogger() {
+  TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger");
+  status_ = fs_->NewLogger(log_fname_, io_options_, &logger_, &io_context_);
+  TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger");
+
+  if (!status_.ok()) {
+    return status_;
+  }
+  assert(logger_);
+  logger_->SetInfoLogLevel(Logger::GetInfoLogLevel());
+
+  if (logger_->GetLogFileSize() == Logger::kDoNotSupportGetLogFileSize) {
+    status_ = Status::NotSupported(
+        "The underlying logger doesn't support GetLogFileSize()");
+  }
+  if (status_.ok()) {
+    cached_now = static_cast<uint64_t>(clock_->NowMicros() * 1e-6);
+    ctime_ = cached_now;
+    cached_now_access_count = 0;
+  }
+
+  return status_;
+}
+
+void AutoRollLogger::RollLogFile() {
+  // This function is called when log is rotating. Two rotations
+  // can happen quickly (NowMicro returns same value). To not overwrite
+  // previous log file we increment by one micro second and try again.
+  uint64_t now = clock_->NowMicros();
+  std::string old_fname;
+  do {
+    old_fname =
+        OldInfoLogFileName(dbname_, now, db_absolute_path_, db_log_dir_);
+    now++;
+  } while (fs_->FileExists(old_fname, io_options_, &io_context_).ok());
+  // Wait for logger_ reference count to turn to 1 as it might be pinned by
+  // Flush. Pinned Logger can't be closed till Flush is completed on that
+  // Logger.
+  while (logger_.use_count() > 1) {
+  }
+  // Close the existing logger first to release the existing handle
+  // before renaming the file using the file system. If this call
+  // fails there is nothing much we can do and we will continue with the
+  // rename and hence ignoring the result status.
+  if (logger_) {
+    logger_->Close().PermitUncheckedError();
+  }
+  Status s = fs_->RenameFile(log_fname_, old_fname, io_options_, &io_context_);
+  if (!s.ok()) {
+    // What should we do on error?
+  }
+  old_log_files_.push(old_fname);
+}
+
+void AutoRollLogger::GetExistingFiles() {
+  {
+    // Empty the queue to avoid duplicated entries in the queue.
+    std::queue<std::string> empty;
+    std::swap(old_log_files_, empty);
+  }
+
+  std::string parent_dir;
+  std::vector<std::string> info_log_files;
+  Status s =
+      GetInfoLogFiles(fs_, db_log_dir_, dbname_, &parent_dir, &info_log_files);
+  if (status_.ok()) {
+    status_ = s;
+  }
+  // We need to sort the file before enqueing it so that when we
+  // delete file from the front, it is the oldest file.
+  std::sort(info_log_files.begin(), info_log_files.end());
+
+  for (const std::string& f : info_log_files) {
+    old_log_files_.push(parent_dir + "/" + f);
+  }
+}
+
+Status AutoRollLogger::TrimOldLogFiles() {
+  // Here we directly list info files and delete them through FileSystem.
+  // The deletion isn't going through DB, so there are shortcomes:
+  // 1. the deletion is not rate limited by SstFileManager
+  // 2. there is a chance that an I/O will be issued here
+  // Since it's going to be complicated to pass DB object down to
+  // here, we take a simple approach to keep the code easier to
+  // maintain.
+
+  // old_log_files_.empty() is helpful for the corner case that
+  // kKeepLogFileNum == 0. We can instead check kKeepLogFileNum != 0 but
+  // it's essentially the same thing, and checking empty before accessing
+  // the queue feels safer.
+  while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) {
+    Status s =
+        fs_->DeleteFile(old_log_files_.front(), io_options_, &io_context_);
+    // Remove the file from the tracking anyway. It's possible that
+    // DB cleaned up the old log file, or people cleaned it up manually.
+    old_log_files_.pop();
+    // To make the file really go away, we should sync parent directory.
+    // Since there isn't any consistency issue involved here, skipping
+    // this part to avoid one I/O here.
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+std::string AutoRollLogger::ValistToString(const char* format,
+                                           va_list args) const {
+  // Any log messages longer than 1024 will get truncated.
+  // The user is responsible for chopping longer messages into multi line log
+  static const int MAXBUFFERSIZE = 1024;
+  char buffer[MAXBUFFERSIZE];
+
+  int count = vsnprintf(buffer, MAXBUFFERSIZE, format, args);
+  (void)count;
+  assert(count >= 0);
+
+  return buffer;
+}
+
+void AutoRollLogger::LogInternal(const char* format, ...) {
+  mutex_.AssertHeld();
+
+  if (!logger_) {
+    return;
+  }
+
+  va_list args;
+  va_start(args, format);
+  logger_->Logv(format, args);
+  va_end(args);
+}
+
+void AutoRollLogger::Logv(const char* format, va_list ap) {
+  assert(GetStatus().ok());
+  if (!logger_) {
+    return;
+  }
+
+  std::shared_ptr<Logger> logger;
+  {
+    MutexLock l(&mutex_);
+    if ((kLogFileTimeToRoll > 0 && LogExpired()) ||
+        (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) {
+      RollLogFile();
+      Status s = ResetLogger();
+      Status s2 = TrimOldLogFiles();
+
+      if (!s.ok()) {
+        // can't really log the error if creating a new LOG file failed
+        return;
+      }
+
+      WriteHeaderInfo();
+
+      if (!s2.ok()) {
+        ROCKS_LOG_WARN(logger.get(), "Fail to trim old info log file: %s",
+                       s2.ToString().c_str());
+      }
+    }
+
+    // pin down the current logger_ instance before releasing the mutex.
+    logger = logger_;
+  }
+
+  // Another thread could have put a new Logger instance into logger_ by now.
+  // However, since logger is still hanging on to the previous instance
+  // (reference count is not zero), we don't have to worry about it being
+  // deleted while we are accessing it.
+  // Note that logv itself is not mutex protected to allow maximum concurrency,
+  // as thread safety should have been handled by the underlying logger.
+  logger->Logv(format, ap);
+}
+
+void AutoRollLogger::WriteHeaderInfo() {
+  mutex_.AssertHeld();
+  for (auto& header : headers_) {
+    LogInternal("%s", header.c_str());
+  }
+}
+
+void AutoRollLogger::LogHeader(const char* format, va_list args) {
+  if (!logger_) {
+    return;
+  }
+
+  // header message are to be retained in memory. Since we cannot make any
+  // assumptions about the data contained in va_list, we will retain them as
+  // strings
+  va_list tmp;
+  va_copy(tmp, args);
+  std::string data = ValistToString(format, tmp);
+  va_end(tmp);
+
+  MutexLock l(&mutex_);
+  headers_.push_back(data);
+
+  // Log the original message to the current log
+  logger_->Logv(format, args);
+}
+
+bool AutoRollLogger::LogExpired() {
+  if (cached_now_access_count >= call_NowMicros_every_N_records_) {
+    cached_now = static_cast<uint64_t>(clock_->NowMicros() * 1e-6);
+    cached_now_access_count = 0;
+  }
+
+  ++cached_now_access_count;
+  return cached_now >= ctime_ + kLogFileTimeToRoll;
+}
+#endif  // !ROCKSDB_LITE
+
+Status CreateLoggerFromOptions(const std::string& dbname,
+                               const DBOptions& options,
+                               std::shared_ptr<Logger>* logger) {
+  if (options.info_log) {
+    *logger = options.info_log;
+    return Status::OK();
+  }
+
+  Env* env = options.env;
+  std::string db_absolute_path;
+  Status s = env->GetAbsolutePath(dbname, &db_absolute_path);
+  TEST_SYNC_POINT_CALLBACK("rocksdb::CreateLoggerFromOptions:AfterGetPath", &s);
+  if (!s.ok()) {
+    return s;
+  }
+  std::string fname =
+      InfoLogFileName(dbname, db_absolute_path, options.db_log_dir);
+
+  const auto& clock = env->GetSystemClock();
+  // In case it does not exist.
+  s = env->CreateDirIfMissing(dbname);
+  if (!s.ok()) {
+    if (options.db_log_dir.empty()) {
+      return s;
+    } else {
+      // Ignore the error returned during creation of dbname because dbname and
+      // db_log_dir can be on different filesystems in which case dbname will
+      // not exist and error should be ignored. db_log_dir creation will handle
+      // the error in case there is any error in the creation of dbname on same
+      // filesystem.
+      s = Status::OK();
+    }
+  }
+  assert(s.ok());
+
+  if (!options.db_log_dir.empty()) {
+    s = env->CreateDirIfMissing(options.db_log_dir);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+#ifndef ROCKSDB_LITE
+  // Currently we only support roll by time-to-roll and log size
+  if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) {
+    AutoRollLogger* result = new AutoRollLogger(
+        env->GetFileSystem(), clock, dbname, options.db_log_dir,
+        options.max_log_file_size, options.log_file_time_to_roll,
+        options.keep_log_file_num, options.info_log_level);
+    s = result->GetStatus();
+    if (!s.ok()) {
+      delete result;
+    } else {
+      logger->reset(result);
+    }
+    return s;
+  }
+#endif  // !ROCKSDB_LITE
+  // Open a log file in the same directory as the db
+  s = env->FileExists(fname);
+  if (s.ok()) {
+    s = env->RenameFile(
+        fname, OldInfoLogFileName(dbname, clock->NowMicros(), db_absolute_path,
+                                  options.db_log_dir));
+
+    // The operation sequence of "FileExists -> Rename" is not atomic. It's
+    // possible that FileExists returns OK but file gets deleted before Rename.
+    // This can cause Rename to return IOError with subcode PathNotFound.
+    // Although it may be a rare case and applications should be discouraged
+    // to not concurrently modifying the contents of the directories accessed
+    // by the database instance, it is still helpful if we can perform some
+    // simple handling of this case. Therefore, we do the following:
+    // 1. if Rename() returns IOError with PathNotFound subcode, then we check
+    //    whether the source file, i.e. LOG, exists.
+    // 2. if LOG exists, it means Rename() failed due to something else. Then
+    //    we report error.
+    // 3. if LOG does not exist, it means it may have been removed/renamed by
+    //    someone else. Since it does not exist, we can reset Status to OK so
+    //    that this caller can try creating a new LOG file. If this succeeds,
+    //    we should still allow it.
+    if (s.IsPathNotFound()) {
+      s = env->FileExists(fname);
+      if (s.IsNotFound()) {
+        s = Status::OK();
+      }
+    }
+  } else if (s.IsNotFound()) {
+    // "LOG" is not required to exist since this could be a new DB.
+    s = Status::OK();
+  }
+  if (s.ok()) {
+    s = env->NewLogger(fname, logger);
+  }
+  if (s.ok() && logger->get() != nullptr) {
+    (*logger)->SetInfoLogLevel(options.info_log_level);
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/logging/auto_roll_logger.h b/src/rocksdb/logging/auto_roll_logger.h
new file mode 100644
index 000000000..805925e5a
--- /dev/null
+++ b/src/rocksdb/logging/auto_roll_logger.h
@@ -0,0 +1,168 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#pragma once
+#include <list>
+#include <queue>
+#include <string>
+
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/util_logger.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FileSystem;
+class SystemClock;
+
+#ifndef ROCKSDB_LITE
+// Rolls the log file by size and/or time
+class AutoRollLogger : public Logger {
+ public:
+  AutoRollLogger(const std::shared_ptr<FileSystem>& fs,
+                 const std::shared_ptr<SystemClock>& clock,
+                 const std::string& dbname, const std::string& db_log_dir,
+                 size_t log_max_size, size_t log_file_time_to_roll,
+                 size_t keep_log_file_num,
+                 const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL);
+
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override;
+
+  // Write a header entry to the log. All header information will be written
+  // again every time the log rolls over.
+  virtual void LogHeader(const char* format, va_list ap) override;
+
+  // check if the logger has encountered any problem.
+  Status GetStatus() { return status_; }
+
+  size_t GetLogFileSize() const override {
+    if (!logger_) {
+      return 0;
+    }
+
+    std::shared_ptr<Logger> logger;
+    {
+      MutexLock l(&mutex_);
+      // pin down the current logger_ instance before releasing the mutex.
+      logger = logger_;
+    }
+    return logger->GetLogFileSize();
+  }
+
+  void Flush() override {
+    std::shared_ptr<Logger> logger;
+    {
+      MutexLock l(&mutex_);
+      // pin down the current logger_ instance before releasing the mutex.
+      logger = logger_;
+    }
+    TEST_SYNC_POINT("AutoRollLogger::Flush:PinnedLogger");
+    if (logger) {
+      logger->Flush();
+    }
+  }
+
+  virtual ~AutoRollLogger() {
+    if (logger_ && !closed_) {
+      logger_->Close().PermitUncheckedError();
+    }
+    status_.PermitUncheckedError();
+  }
+
+  using Logger::GetInfoLogLevel;
+  InfoLogLevel GetInfoLogLevel() const override {
+    MutexLock l(&mutex_);
+    if (!logger_) {
+      return Logger::GetInfoLogLevel();
+    }
+    return logger_->GetInfoLogLevel();
+  }
+
+  using Logger::SetInfoLogLevel;
+  void SetInfoLogLevel(const InfoLogLevel log_level) override {
+    MutexLock lock(&mutex_);
+    Logger::SetInfoLogLevel(log_level);
+    if (logger_) {
+      logger_->SetInfoLogLevel(log_level);
+    }
+  }
+
+  void SetCallNowMicrosEveryNRecords(uint64_t call_NowMicros_every_N_records) {
+    call_NowMicros_every_N_records_ = call_NowMicros_every_N_records;
+  }
+
+  // Expose the log file path for testing purpose
+  std::string TEST_log_fname() const { return log_fname_; }
+
+  uint64_t TEST_ctime() const { return ctime_; }
+
+  Logger* TEST_inner_logger() const { return logger_.get(); }
+
+ protected:
+  // Implementation of Close()
+  virtual Status CloseImpl() override {
+    if (logger_) {
+      return logger_->Close();
+    } else {
+      return Status::OK();
+    }
+  }
+
+ private:
+  bool LogExpired();
+  Status ResetLogger();
+  void RollLogFile();
+  // Read all names of old log files into old_log_files_
+  // If there is any error, put the error code in status_
+  void GetExistingFiles();
+  // Delete old log files if it excceeds the limit.
+  Status TrimOldLogFiles();
+  // Log message to logger without rolling
+  void LogInternal(const char* format, ...);
+  // Serialize the va_list to a string
+  std::string ValistToString(const char* format, va_list args) const;
+  // Write the logs marked as headers to the new log file
+  void WriteHeaderInfo();
+  std::string log_fname_;  // Current active info log's file name.
+  std::string dbname_;
+  std::string db_log_dir_;
+  std::string db_absolute_path_;
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<SystemClock> clock_;
+  std::shared_ptr<Logger> logger_;
+  // current status of the logger
+  Status status_;
+  const size_t kMaxLogFileSize;
+  const size_t kLogFileTimeToRoll;
+  const size_t kKeepLogFileNum;
+  // header information
+  std::list<std::string> headers_;
+  // List of all existing info log files. Used for enforcing number of
+  // info log files.
+  // Full path is stored here. It consumes signifianctly more memory
+  // than only storing file name. Can optimize if it causes a problem.
+  std::queue<std::string> old_log_files_;
+  // to avoid frequent clock->NowMicros() calls, we cached the current time
+  uint64_t cached_now;
+  uint64_t ctime_;
+  uint64_t cached_now_access_count;
+  uint64_t call_NowMicros_every_N_records_;
+  IOOptions io_options_;
+  IODebugContext io_context_;
+  mutable port::Mutex mutex_;
+};
+#endif  // !ROCKSDB_LITE
+
+// Facade to craete logger automatically
+Status CreateLoggerFromOptions(const std::string& dbname,
+                               const DBOptions& options,
+                               std::shared_ptr<Logger>* logger);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/logging/auto_roll_logger_test.cc b/src/rocksdb/logging/auto_roll_logger_test.cc
new file mode 100644
index 000000000..8e94a78c8
--- /dev/null
+++ b/src/rocksdb/logging/auto_roll_logger_test.cc
@@ -0,0 +1,742 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "logging/auto_roll_logger.h"
+
+#include <sys/stat.h>
+
+#include <algorithm>
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "env/emulated_clock.h"
+#include "logging/env_logger.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// In this test we only want to Log some simple log message with
+// no format. LogMessage() provides such a simple interface and
+// avoids the [format-security] warning which occurs when you
+// call ROCKS_LOG_INFO(logger, log_message) directly.
+namespace {
+void LogMessage(Logger* logger, const char* message) {
+  ROCKS_LOG_INFO(logger, "%s", message);
+}
+
+void LogMessage(const InfoLogLevel log_level, Logger* logger,
+                const char* message) {
+  Log(log_level, logger, "%s", message);
+}
+}  // namespace
+
+class AutoRollLoggerTest : public testing::Test {
+ public:
+  static void InitTestDb() {
+    // TODO replace the `system` calls with Env/FileSystem APIs.
+#ifdef OS_WIN
+    // Replace all slashes in the path so windows CompSpec does not
+    // become confused
+    std::string testDbDir(kTestDbDir);
+    std::replace_if(
+        testDbDir.begin(), testDbDir.end(), [](char ch) { return ch == '/'; },
+        '\\');
+    std::string deleteDbDirCmd =
+        "if exist " + testDbDir + " rd /s /q " + testDbDir;
+    ASSERT_TRUE(system(deleteDbDirCmd.c_str()) == 0);
+
+    std::string testDir(kTestDir);
+    std::replace_if(
+        testDir.begin(), testDir.end(), [](char ch) { return ch == '/'; },
+        '\\');
+    std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir;
+#else
+    std::string deleteCmd = "rm -rf " + kTestDir + " " + kTestDbDir;
+#endif
+    ASSERT_TRUE(system(deleteCmd.c_str()) == 0);
+    ASSERT_OK(Env::Default()->CreateDir(kTestDir));
+    ASSERT_OK(Env::Default()->CreateDir(kTestDbDir));
+  }
+
+  void RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size,
+                             const std::string& log_message);
+  void RollLogFileByTimeTest(const std::shared_ptr<FileSystem>& fs,
+                             const std::shared_ptr<SystemClock>& sc,
+                             AutoRollLogger* logger, size_t time,
+                             const std::string& log_message);
+  // return list of files under kTestDir that contains "LOG"
+  std::vector<std::string> GetLogFiles() {
+    std::vector<std::string> ret;
+    std::vector<std::string> files;
+    Status s = default_env->GetChildren(kTestDir, &files);
+    // Should call ASSERT_OK() here but it doesn't compile. It's not
+    // worth the time figuring out why.
+    EXPECT_TRUE(s.ok());
+    for (const auto& f : files) {
+      if (f.find("LOG") != std::string::npos) {
+        ret.push_back(f);
+      }
+    }
+    return ret;
+  }
+
+  // Delete all log files under kTestDir
+  void CleanupLogFiles() {
+    for (const std::string& f : GetLogFiles()) {
+      ASSERT_OK(default_env->DeleteFile(kTestDir + "/" + f));
+    }
+  }
+
+  void RollNTimesBySize(Logger* auto_roll_logger, size_t file_num,
+                        size_t max_log_file_size) {
+    // Roll the log 4 times, and it will trim to 3 files.
+    std::string dummy_large_string;
+    dummy_large_string.assign(max_log_file_size, '=');
+    auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+    for (size_t i = 0; i < file_num + 1; i++) {
+      // Log enough bytes to trigger at least one roll.
+      LogMessage(auto_roll_logger, dummy_large_string.c_str());
+      LogMessage(auto_roll_logger, "");
+    }
+  }
+
+  static const std::string kSampleMessage;
+  static const std::string kTestDir;
+  static const std::string kTestDbDir;
+  static const std::string kLogFile;
+  static Env* default_env;
+};
+
+const std::string AutoRollLoggerTest::kSampleMessage(
+    "this is the message to be written to the log file!!");
+const std::string AutoRollLoggerTest::kTestDir(
+    test::PerThreadDBPath("db_log_test"));
+const std::string AutoRollLoggerTest::kTestDbDir(
+    test::PerThreadDBPath("db_log_test_db"));
+const std::string AutoRollLoggerTest::kLogFile(
+    test::PerThreadDBPath("db_log_test") + "/LOG");
+Env* AutoRollLoggerTest::default_env = Env::Default();
+
+void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger,
+                                               size_t log_max_size,
+                                               const std::string& log_message) {
+  logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+  ASSERT_EQ(InfoLogLevel::INFO_LEVEL, logger->GetInfoLogLevel());
+  ASSERT_EQ(InfoLogLevel::INFO_LEVEL,
+            logger->TEST_inner_logger()->GetInfoLogLevel());
+  // measure the size of each message, which is supposed
+  // to be equal or greater than log_message.size()
+  LogMessage(logger, log_message.c_str());
+  size_t message_size = logger->GetLogFileSize();
+  size_t current_log_size = message_size;
+
+  // Test the cases when the log file will not be rolled.
+  while (current_log_size + message_size < log_max_size) {
+    LogMessage(logger, log_message.c_str());
+    current_log_size += message_size;
+    ASSERT_EQ(current_log_size, logger->GetLogFileSize());
+  }
+
+  // Now the log file will be rolled
+  LogMessage(logger, log_message.c_str());
+  // Since rotation is checked before actual logging, we need to
+  // trigger the rotation by logging another message.
+  LogMessage(logger, log_message.c_str());
+
+  ASSERT_TRUE(message_size == logger->GetLogFileSize());
+}
+
+void AutoRollLoggerTest::RollLogFileByTimeTest(
+    const std::shared_ptr<FileSystem>& fs,
+    const std::shared_ptr<SystemClock>& sc, AutoRollLogger* logger, size_t time,
+    const std::string& log_message) {
+  uint64_t expected_ctime;
+  uint64_t actual_ctime;
+
+  uint64_t total_log_size;
+  EXPECT_OK(fs->GetFileSize(kLogFile, IOOptions(), &total_log_size, nullptr));
+  expected_ctime = logger->TEST_ctime();
+  logger->SetCallNowMicrosEveryNRecords(0);
+
+  // -- Write to the log for several times, which is supposed
+  // to be finished before time.
+  for (int i = 0; i < 10; ++i) {
+    sc->SleepForMicroseconds(50000);
+    LogMessage(logger, log_message.c_str());
+    EXPECT_OK(logger->GetStatus());
+    // Make sure we always write to the same log file (by
+    // checking the create time);
+
+    actual_ctime = logger->TEST_ctime();
+
+    // Also make sure the log size is increasing.
+    EXPECT_EQ(expected_ctime, actual_ctime);
+    EXPECT_GT(logger->GetLogFileSize(), total_log_size);
+    total_log_size = logger->GetLogFileSize();
+  }
+
+  // -- Make the log file expire
+  sc->SleepForMicroseconds(static_cast<int>(time * 1000000));
+  LogMessage(logger, log_message.c_str());
+
+  // At this time, the new log file should be created.
+  actual_ctime = logger->TEST_ctime();
+  EXPECT_LT(expected_ctime, actual_ctime);
+  EXPECT_LT(logger->GetLogFileSize(), total_log_size);
+}
+
+TEST_F(AutoRollLoggerTest, RollLogFileBySize) {
+  InitTestDb();
+  size_t log_max_size = 1024 * 5;
+  size_t keep_log_file_num = 10;
+
+  AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), kTestDir,
+                        "", log_max_size, 0, keep_log_file_num);
+
+  RollLogFileBySizeTest(&logger, log_max_size,
+                        kSampleMessage + ":RollLogFileBySize");
+}
+
+TEST_F(AutoRollLoggerTest, RollLogFileByTime) {
+  auto nsc =
+      std::make_shared<EmulatedSystemClock>(SystemClock::Default(), true);
+
+  size_t time = 2;
+  size_t log_size = 1024 * 5;
+  size_t keep_log_file_num = 10;
+
+  InitTestDb();
+  // -- Test the existence of file during the server restart.
+  ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile));
+  AutoRollLogger logger(default_env->GetFileSystem(), nsc, kTestDir, "",
+                        log_size, time, keep_log_file_num);
+  ASSERT_OK(default_env->FileExists(kLogFile));
+
+  RollLogFileByTimeTest(default_env->GetFileSystem(), nsc, &logger, time,
+                        kSampleMessage + ":RollLogFileByTime");
+}
+
+TEST_F(AutoRollLoggerTest, SetInfoLogLevel) {
+  InitTestDb();
+  Options options;
+  options.info_log_level = InfoLogLevel::FATAL_LEVEL;
+  options.max_log_file_size = 1024;
+  std::shared_ptr<Logger> logger;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+  auto* auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get());
+  ASSERT_NE(nullptr, auto_roll_logger);
+  ASSERT_EQ(InfoLogLevel::FATAL_LEVEL, auto_roll_logger->GetInfoLogLevel());
+  ASSERT_EQ(InfoLogLevel::FATAL_LEVEL,
+            auto_roll_logger->TEST_inner_logger()->GetInfoLogLevel());
+  auto_roll_logger->SetInfoLogLevel(InfoLogLevel::DEBUG_LEVEL);
+  ASSERT_EQ(InfoLogLevel::DEBUG_LEVEL, auto_roll_logger->GetInfoLogLevel());
+  ASSERT_EQ(InfoLogLevel::DEBUG_LEVEL, logger->GetInfoLogLevel());
+  ASSERT_EQ(InfoLogLevel::DEBUG_LEVEL,
+            auto_roll_logger->TEST_inner_logger()->GetInfoLogLevel());
+}
+
+TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) {
+  // If only 'log_max_size' options is specified, then every time
+  // when rocksdb is restarted, a new empty log file will be created.
+  InitTestDb();
+  // WORKAROUND:
+  // avoid complier's complaint of "comparison between signed
+  // and unsigned integer expressions" because literal 0 is
+  // treated as "singed".
+  size_t kZero = 0;
+  size_t log_size = 1024;
+  size_t keep_log_file_num = 10;
+
+  AutoRollLogger* logger =
+      new AutoRollLogger(FileSystem::Default(), SystemClock::Default(),
+                         kTestDir, "", log_size, 0, keep_log_file_num);
+
+  LogMessage(logger, kSampleMessage.c_str());
+  ASSERT_GT(logger->GetLogFileSize(), kZero);
+  delete logger;
+
+  // reopens the log file and an empty log file will be created.
+  logger = new AutoRollLogger(FileSystem::Default(), SystemClock::Default(),
+                              kTestDir, "", log_size, 0, 10);
+  ASSERT_EQ(logger->GetLogFileSize(), kZero);
+  delete logger;
+}
+
+TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
+  size_t time = 2, log_max_size = 1024 * 5;
+  size_t keep_log_file_num = 10;
+
+  InitTestDb();
+
+  auto nsc =
+      std::make_shared<EmulatedSystemClock>(SystemClock::Default(), true);
+  AutoRollLogger logger(FileSystem::Default(), nsc, kTestDir, "", log_max_size,
+                        time, keep_log_file_num);
+
+  // Test the ability to roll by size
+  RollLogFileBySizeTest(&logger, log_max_size,
+                        kSampleMessage + ":CompositeRollByTimeAndSizeLogger");
+
+  // Test the ability to roll by Time
+  RollLogFileByTimeTest(FileSystem::Default(), nsc, &logger, time,
+                        kSampleMessage + ":CompositeRollByTimeAndSizeLogger");
+}
+
+#ifndef OS_WIN
+// TODO: does not build for Windows because of EnvLogger use below. Need to
+// port
+TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) {
+  DBOptions options;
+  auto nsc =
+      std::make_shared<EmulatedSystemClock>(SystemClock::Default(), true);
+  std::unique_ptr<Env> nse(new CompositeEnvWrapper(Env::Default(), nsc));
+
+  std::shared_ptr<Logger> logger;
+
+  // Normal logger
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+  ASSERT_TRUE(dynamic_cast<EnvLogger*>(logger.get()));
+
+  // Only roll by size
+  InitTestDb();
+  options.max_log_file_size = 1024;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+  AutoRollLogger* auto_roll_logger =
+      dynamic_cast<AutoRollLogger*>(logger.get());
+  ASSERT_TRUE(auto_roll_logger);
+  RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size,
+                        kSampleMessage + ":CreateLoggerFromOptions - size");
+
+  // Only roll by Time
+  options.env = nse.get();
+  InitTestDb();
+  options.max_log_file_size = 0;
+  options.log_file_time_to_roll = 2;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+  auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get());
+  RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger,
+                        options.log_file_time_to_roll,
+                        kSampleMessage + ":CreateLoggerFromOptions - time");
+
+  // roll by both Time and size
+  InitTestDb();
+  options.max_log_file_size = 1024 * 5;
+  options.log_file_time_to_roll = 2;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+  auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get());
+  RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size,
+                        kSampleMessage + ":CreateLoggerFromOptions - both");
+  RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger,
+                        options.log_file_time_to_roll,
+                        kSampleMessage + ":CreateLoggerFromOptions - both");
+
+  // Set keep_log_file_num
+  {
+    const size_t kFileNum = 3;
+    InitTestDb();
+    options.max_log_file_size = 512;
+    options.log_file_time_to_roll = 2;
+    options.keep_log_file_num = kFileNum;
+    ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+    auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get());
+
+    // Roll the log 4 times, and it will trim to 3 files.
+    std::string dummy_large_string;
+    dummy_large_string.assign(options.max_log_file_size, '=');
+    auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+    for (size_t i = 0; i < kFileNum + 1; i++) {
+      // Log enough bytes to trigger at least one roll.
+      LogMessage(auto_roll_logger, dummy_large_string.c_str());
+      LogMessage(auto_roll_logger, "");
+    }
+
+    std::vector<std::string> files = GetLogFiles();
+    ASSERT_EQ(kFileNum, files.size());
+
+    CleanupLogFiles();
+  }
+
+  // Set keep_log_file_num and dbname is different from
+  // db_log_dir.
+  {
+    const size_t kFileNum = 3;
+    InitTestDb();
+    options.max_log_file_size = 512;
+    options.log_file_time_to_roll = 2;
+    options.keep_log_file_num = kFileNum;
+    options.db_log_dir = kTestDir;
+    ASSERT_OK(CreateLoggerFromOptions(kTestDbDir, options, &logger));
+    auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get());
+
+    // Roll the log 4 times, and it will trim to 3 files.
+    std::string dummy_large_string;
+    dummy_large_string.assign(options.max_log_file_size, '=');
+    auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+    for (size_t i = 0; i < kFileNum + 1; i++) {
+      // Log enough bytes to trigger at least one roll.
+      LogMessage(auto_roll_logger, dummy_large_string.c_str());
+      LogMessage(auto_roll_logger, "");
+    }
+
+    std::vector<std::string> files = GetLogFiles();
+    ASSERT_EQ(kFileNum, files.size());
+    for (const auto& f : files) {
+      ASSERT_TRUE(f.find("db_log_test_db") != std::string::npos);
+    }
+
+    // Cleaning up those files.
+    CleanupLogFiles();
+  }
+}
+
+TEST_F(AutoRollLoggerTest, AutoDeleting) {
+  for (int attempt = 0; attempt < 2; attempt++) {
+    // In the first attemp, db_log_dir is not set, while in the
+    // second it is set.
+    std::string dbname = (attempt == 0) ? kTestDir : "/test/dummy/dir";
+    std::string db_log_dir = (attempt == 0) ? "" : kTestDir;
+
+    InitTestDb();
+    const size_t kMaxFileSize = 512;
+    {
+      size_t log_num = 8;
+      AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(),
+                            dbname, db_log_dir, kMaxFileSize, 0, log_num);
+      RollNTimesBySize(&logger, log_num, kMaxFileSize);
+
+      ASSERT_EQ(log_num, GetLogFiles().size());
+    }
+    // Shrink number of files
+    {
+      size_t log_num = 5;
+      AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(),
+                            dbname, db_log_dir, kMaxFileSize, 0, log_num);
+      ASSERT_EQ(log_num, GetLogFiles().size());
+
+      RollNTimesBySize(&logger, 3, kMaxFileSize);
+      ASSERT_EQ(log_num, GetLogFiles().size());
+    }
+
+    // Increase number of files again.
+    {
+      size_t log_num = 7;
+      AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(),
+                            dbname, db_log_dir, kMaxFileSize, 0, log_num);
+      ASSERT_EQ(6, GetLogFiles().size());
+
+      RollNTimesBySize(&logger, 3, kMaxFileSize);
+      ASSERT_EQ(log_num, GetLogFiles().size());
+    }
+
+    CleanupLogFiles();
+  }
+}
+
+TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) {
+  DBOptions options;
+  std::shared_ptr<Logger> logger;
+
+  InitTestDb();
+  options.max_log_file_size = 1024 * 5;
+  ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+  AutoRollLogger* auto_roll_logger =
+      dynamic_cast<AutoRollLogger*>(logger.get());
+  ASSERT_TRUE(auto_roll_logger);
+  ROCKSDB_NAMESPACE::port::Thread flush_thread;
+
+  // Notes:
+  // (1) Need to pin the old logger before beginning the roll, as rolling grabs
+  //     the mutex, which would prevent us from accessing the old logger. This
+  //     also marks flush_thread with AutoRollLogger::Flush:PinnedLogger.
+  // (2) New logger will be cut in AutoRollLogger::RollLogFile only when flush
+  //     is completed and reference to pinned logger is released.
+  // (3) EnvLogger::Flush() happens in both threads but its SyncPoints only
+  //     are enabled in flush_thread (the one pinning the old logger).
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
+      {{"AutoRollLogger::Flush:PinnedLogger",
+        "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"}},
+      {{"AutoRollLogger::Flush:PinnedLogger", "EnvLogger::Flush:Begin1"},
+       {"AutoRollLogger::Flush:PinnedLogger", "EnvLogger::Flush:Begin2"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  flush_thread = port::Thread([&]() { auto_roll_logger->Flush(); });
+  TEST_SYNC_POINT(
+      "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit");
+  RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size,
+                        kSampleMessage + ":LogFlushWhileRolling");
+  flush_thread.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif  // OS_WIN
+
+TEST_F(AutoRollLoggerTest, InfoLogLevel) {
+  InitTestDb();
+
+  size_t log_size = 8192;
+  size_t log_lines = 0;
+  // an extra-scope to force the AutoRollLogger to flush the log file when it
+  // becomes out of scope.
+  {
+    AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(),
+                          kTestDir, "", log_size, 0, 10);
+    for (int log_level = InfoLogLevel::HEADER_LEVEL;
+         log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) {
+      logger.SetInfoLogLevel((InfoLogLevel)log_level);
+      for (int log_type = InfoLogLevel::DEBUG_LEVEL;
+           log_type <= InfoLogLevel::HEADER_LEVEL; log_type++) {
+        // log messages with log level smaller than log_level will not be
+        // logged.
+        LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str());
+      }
+      log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1;
+    }
+    for (int log_level = InfoLogLevel::HEADER_LEVEL;
+         log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) {
+      logger.SetInfoLogLevel((InfoLogLevel)log_level);
+
+      // again, messages with level smaller than log_level will not be logged.
+      ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str());
+      ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str());
+      ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str());
+      ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str());
+      ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str());
+      ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str());
+      log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1;
+    }
+  }
+  std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str());
+  size_t lines = std::count(std::istreambuf_iterator<char>(inFile),
+                            std::istreambuf_iterator<char>(), '\n');
+  ASSERT_EQ(log_lines, lines);
+  inFile.close();
+}
+
+TEST_F(AutoRollLoggerTest, Close) {
+  InitTestDb();
+
+  size_t log_size = 8192;
+  size_t log_lines = 0;
+  AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), kTestDir,
+                        "", log_size, 0, 10);
+  for (int log_level = InfoLogLevel::HEADER_LEVEL;
+       log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) {
+    logger.SetInfoLogLevel((InfoLogLevel)log_level);
+    for (int log_type = InfoLogLevel::DEBUG_LEVEL;
+         log_type <= InfoLogLevel::HEADER_LEVEL; log_type++) {
+      // log messages with log level smaller than log_level will not be
+      // logged.
+      LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str());
+    }
+    log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1;
+  }
+  for (int log_level = InfoLogLevel::HEADER_LEVEL;
+       log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) {
+    logger.SetInfoLogLevel((InfoLogLevel)log_level);
+
+    // again, messages with level smaller than log_level will not be logged.
+    ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str());
+    ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str());
+    ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str());
+    ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str());
+    ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str());
+    ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str());
+    log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1;
+  }
+  ASSERT_EQ(logger.Close(), Status::OK());
+
+  std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str());
+  size_t lines = std::count(std::istreambuf_iterator<char>(inFile),
+                            std::istreambuf_iterator<char>(), '\n');
+  ASSERT_EQ(log_lines, lines);
+  inFile.close();
+}
+
+// Test the logger Header function for roll over logs
+// We expect the new logs creates as roll over to carry the headers specified
+static std::vector<std::string> GetOldFileNames(const std::string& path) {
+  std::vector<std::string> ret;
+
+  const std::string dirname = path.substr(/*start=*/0, path.find_last_of("/"));
+  const std::string fname = path.substr(path.find_last_of("/") + 1);
+
+  std::vector<std::string> children;
+  EXPECT_OK(Env::Default()->GetChildren(dirname, &children));
+
+  // We know that the old log files are named [path]<something>
+  // Return all entities that match the pattern
+  for (auto& child : children) {
+    if (fname != child && child.find(fname) == 0) {
+      ret.push_back(dirname + "/" + child);
+    }
+  }
+
+  return ret;
+}
+
+TEST_F(AutoRollLoggerTest, LogHeaderTest) {
+  static const size_t MAX_HEADERS = 10;
+  static const size_t LOG_MAX_SIZE = 1024 * 5;
+  static const std::string HEADER_STR = "Log header line";
+
+  // test_num == 0 -> standard call to Header()
+  // test_num == 1 -> call to Log() with InfoLogLevel::HEADER_LEVEL
+  for (int test_num = 0; test_num < 2; test_num++) {
+    InitTestDb();
+
+    AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(),
+                          kTestDir, /*db_log_dir=*/"", LOG_MAX_SIZE,
+                          /*log_file_time_to_roll=*/0,
+                          /*keep_log_file_num=*/10);
+
+    if (test_num == 0) {
+      // Log some headers explicitly using Header()
+      for (size_t i = 0; i < MAX_HEADERS; i++) {
+        Header(&logger, "%s %" ROCKSDB_PRIszt, HEADER_STR.c_str(), i);
+      }
+    } else if (test_num == 1) {
+      // HEADER_LEVEL should make this behave like calling Header()
+      for (size_t i = 0; i < MAX_HEADERS; i++) {
+        ROCKS_LOG_HEADER(&logger, "%s %" ROCKSDB_PRIszt, HEADER_STR.c_str(), i);
+      }
+    }
+
+    const std::string newfname = logger.TEST_log_fname();
+
+    // Log enough data to cause a roll over
+    int i = 0;
+    for (size_t iter = 0; iter < 2; iter++) {
+      while (logger.GetLogFileSize() < LOG_MAX_SIZE) {
+        Info(&logger, (kSampleMessage + ":LogHeaderTest line %d").c_str(), i);
+        ++i;
+      }
+
+      Info(&logger, "Rollover");
+    }
+
+    // Flush the log for the latest file
+    LogFlush(&logger);
+
+    const auto oldfiles = GetOldFileNames(newfname);
+
+    ASSERT_EQ(oldfiles.size(), (size_t)2);
+
+    for (auto& oldfname : oldfiles) {
+      // verify that the files rolled over
+      ASSERT_NE(oldfname, newfname);
+      // verify that the old log contains all the header logs
+      ASSERT_EQ(test::GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS);
+    }
+  }
+}
+
+TEST_F(AutoRollLoggerTest, LogFileExistence) {
+  ROCKSDB_NAMESPACE::DB* db;
+  ROCKSDB_NAMESPACE::Options options;
+#ifdef OS_WIN
+  // Replace all slashes in the path so windows CompSpec does not
+  // become confused
+  std::string testDir(kTestDir);
+  std::replace_if(
+      testDir.begin(), testDir.end(), [](char ch) { return ch == '/'; }, '\\');
+  std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir;
+#else
+  std::string deleteCmd = "rm -rf " + kTestDir;
+#endif
+  ASSERT_EQ(system(deleteCmd.c_str()), 0);
+  options.max_log_file_size = 100 * 1024 * 1024;
+  options.create_if_missing = true;
+  ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kTestDir, &db));
+  ASSERT_OK(default_env->FileExists(kLogFile));
+  delete db;
+}
+
+TEST_F(AutoRollLoggerTest, FileCreateFailure) {
+  Options options;
+  options.max_log_file_size = 100 * 1024 * 1024;
+  options.db_log_dir = "/a/dir/does/not/exist/at/all";
+
+  std::shared_ptr<Logger> logger;
+  ASSERT_NOK(CreateLoggerFromOptions("", options, &logger));
+  ASSERT_TRUE(!logger);
+}
+
+TEST_F(AutoRollLoggerTest, RenameOnlyWhenExists) {
+  InitTestDb();
+  SpecialEnv env(Env::Default());
+  Options options;
+  options.env = &env;
+
+  // Originally no LOG exists. Should not see a rename.
+  {
+    std::shared_ptr<Logger> logger;
+    ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+    ASSERT_EQ(0, env.rename_count_);
+  }
+
+  // Now a LOG exists. Create a new one should see a rename.
+  {
+    std::shared_ptr<Logger> logger;
+    ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+    ASSERT_EQ(1, env.rename_count_);
+  }
+}
+
+TEST_F(AutoRollLoggerTest, RenameError) {
+  InitTestDb();
+  SpecialEnv env(Env::Default());
+  env.rename_error_ = true;
+  Options options;
+  options.env = &env;
+
+  // Originally no LOG exists. Should not be impacted by rename error.
+  {
+    std::shared_ptr<Logger> logger;
+    ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
+    ASSERT_TRUE(logger != nullptr);
+  }
+
+  // Now a LOG exists. Rename error should cause failure.
+  {
+    std::shared_ptr<Logger> logger;
+    ASSERT_NOK(CreateLoggerFromOptions(kTestDir, options, &logger));
+    ASSERT_TRUE(logger == nullptr);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as AutoRollLogger is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/logging/env_logger.h b/src/rocksdb/logging/env_logger.h
new file mode 100644
index 000000000..8164945cf
--- /dev/null
+++ b/src/rocksdb/logging/env_logger.h
@@ -0,0 +1,193 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that uses custom Env object for logging.
+
+#pragma once
+
+#include <time.h>
+
+#include <atomic>
+#include <memory>
+
+#include "file/writable_file_writer.h"
+#include "monitoring/iostats_context_imp.h"
+#include "port/sys_time.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/perf_level.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EnvLogger : public Logger {
+ public:
+  EnvLogger(std::unique_ptr<FSWritableFile>&& writable_file,
+            const std::string& fname, const EnvOptions& options, Env* env,
+            InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL)
+      : Logger(log_level),
+        env_(env),
+        clock_(env_->GetSystemClock().get()),
+        file_(std::move(writable_file), fname, options, clock_),
+        last_flush_micros_(0),
+        flush_pending_(false) {}
+
+  ~EnvLogger() {
+    if (!closed_) {
+      closed_ = true;
+      CloseHelper().PermitUncheckedError();
+    }
+  }
+
+ private:
+  // A guard to prepare file operations, such as mutex and skip
+  // I/O context.
+  class FileOpGuard {
+   public:
+    explicit FileOpGuard(EnvLogger& logger)
+        : logger_(logger), prev_perf_level_(GetPerfLevel()) {
+      // Preserve iostats not to pollute writes from user writes. We might
+      // need a better solution than this.
+      SetPerfLevel(PerfLevel::kDisable);
+      IOSTATS_SET_DISABLE(true);
+      logger.mutex_.Lock();
+    }
+    ~FileOpGuard() {
+      logger_.mutex_.Unlock();
+      IOSTATS_SET_DISABLE(false);
+      SetPerfLevel(prev_perf_level_);
+    }
+
+   private:
+    EnvLogger& logger_;
+    PerfLevel prev_perf_level_;
+  };
+
+  void FlushLocked() {
+    mutex_.AssertHeld();
+    if (flush_pending_) {
+      flush_pending_ = false;
+      file_.Flush().PermitUncheckedError();
+    }
+    last_flush_micros_ = clock_->NowMicros();
+  }
+
+  void Flush() override {
+    TEST_SYNC_POINT("EnvLogger::Flush:Begin1");
+    TEST_SYNC_POINT("EnvLogger::Flush:Begin2");
+
+    FileOpGuard guard(*this);
+    FlushLocked();
+  }
+
+  Status CloseImpl() override { return CloseHelper(); }
+
+  Status CloseHelper() {
+    FileOpGuard guard(*this);
+    const auto close_status = file_.Close();
+
+    if (close_status.ok()) {
+      return close_status;
+    }
+    return Status::IOError("Close of log file failed with error:" +
+                           (close_status.getState()
+                                ? std::string(close_status.getState())
+                                : std::string()));
+  }
+
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override {
+    IOSTATS_TIMER_GUARD(logger_nanos);
+
+    const uint64_t thread_id = env_->GetThreadID();
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 65536;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+
+      port::TimeVal now_tv;
+      port::GetTimeOfDay(&now_tv, nullptr);
+      const time_t seconds = now_tv.tv_sec;
+      struct tm t;
+      port::LocalTimeR(&seconds, &t);
+      p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llu ",
+                    t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
+                    t.tm_min, t.tm_sec, static_cast<int>(now_tv.tv_usec),
+                    static_cast<long long unsigned int>(thread_id));
+
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;  // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+
+      // Add newline if necessary
+      if (p == base || p[-1] != '\n') {
+        *p++ = '\n';
+      }
+
+      assert(p <= limit);
+      {
+        FileOpGuard guard(*this);
+        // We will ignore any error returned by Append().
+        file_.Append(Slice(base, p - base)).PermitUncheckedError();
+        flush_pending_ = true;
+        const uint64_t now_micros = clock_->NowMicros();
+        if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+          FlushLocked();
+        }
+      }
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+  }
+
+  size_t GetLogFileSize() const override {
+    MutexLock l(&mutex_);
+    return file_.GetFileSize();
+  }
+
+ private:
+  Env* env_;
+  SystemClock* clock_;
+  WritableFileWriter file_;
+  mutable port::Mutex mutex_;  // Mutex to protect the shared variables below.
+  const static uint64_t flush_every_seconds_ = 5;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  std::atomic<bool> flush_pending_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/logging/env_logger_test.cc b/src/rocksdb/logging/env_logger_test.cc
new file mode 100644
index 000000000..467ab064f
--- /dev/null
+++ b/src/rocksdb/logging/env_logger_test.cc
@@ -0,0 +1,163 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "logging/env_logger.h"
+
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+// In this test we only want to Log some simple log message with
+// no format.
+void LogMessage(std::shared_ptr<Logger> logger, const std::string& message) {
+  Log(logger, "%s", message.c_str());
+}
+
+// Helper method to write the message num_times in the given logger.
+void WriteLogs(std::shared_ptr<Logger> logger, const std::string& message,
+               int num_times) {
+  for (int ii = 0; ii < num_times; ++ii) {
+    LogMessage(logger, message);
+  }
+}
+
+}  // namespace
+
+class EnvLoggerTest : public testing::Test {
+ public:
+  Env* env_;
+
+  EnvLoggerTest() : env_(Env::Default()) {}
+
+  ~EnvLoggerTest() = default;
+
+  std::shared_ptr<Logger> CreateLogger() {
+    std::shared_ptr<Logger> result;
+    assert(NewEnvLogger(kLogFile, env_, &result).ok());
+    assert(result);
+    result->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL);
+    return result;
+  }
+
+  void DeleteLogFile() { ASSERT_OK(env_->DeleteFile(kLogFile)); }
+
+  static const std::string kSampleMessage;
+  static const std::string kTestDir;
+  static const std::string kLogFile;
+};
+
+const std::string EnvLoggerTest::kSampleMessage =
+    "this is the message to be written to the log file!!";
+const std::string EnvLoggerTest::kLogFile = test::PerThreadDBPath("log_file");
+
+TEST_F(EnvLoggerTest, EmptyLogFile) {
+  auto logger = CreateLogger();
+  ASSERT_EQ(logger->Close(), Status::OK());
+
+  // Check the size of the log file.
+  uint64_t file_size;
+  ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK());
+  ASSERT_EQ(file_size, 0);
+  DeleteLogFile();
+}
+
+TEST_F(EnvLoggerTest, LogMultipleLines) {
+  auto logger = CreateLogger();
+
+  // Write multiple lines.
+  const int kNumIter = 10;
+  WriteLogs(logger, kSampleMessage, kNumIter);
+
+  // Flush the logs.
+  logger->Flush();
+  ASSERT_EQ(logger->Close(), Status::OK());
+
+  // Validate whether the log file has 'kNumIter' number of lines.
+  ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter);
+  DeleteLogFile();
+}
+
+TEST_F(EnvLoggerTest, Overwrite) {
+  {
+    auto logger = CreateLogger();
+
+    // Write multiple lines.
+    const int kNumIter = 10;
+    WriteLogs(logger, kSampleMessage, kNumIter);
+
+    ASSERT_EQ(logger->Close(), Status::OK());
+
+    // Validate whether the log file has 'kNumIter' number of lines.
+    ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter);
+  }
+
+  // Now reopen the file again.
+  {
+    auto logger = CreateLogger();
+
+    // File should be empty.
+    uint64_t file_size;
+    ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK());
+    ASSERT_EQ(file_size, 0);
+    ASSERT_EQ(logger->GetLogFileSize(), 0);
+    ASSERT_EQ(logger->Close(), Status::OK());
+  }
+  DeleteLogFile();
+}
+
+TEST_F(EnvLoggerTest, Close) {
+  auto logger = CreateLogger();
+
+  // Write multiple lines.
+  const int kNumIter = 10;
+  WriteLogs(logger, kSampleMessage, kNumIter);
+
+  ASSERT_EQ(logger->Close(), Status::OK());
+
+  // Validate whether the log file has 'kNumIter' number of lines.
+  ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter);
+  DeleteLogFile();
+}
+
+TEST_F(EnvLoggerTest, ConcurrentLogging) {
+  auto logger = CreateLogger();
+
+  const int kNumIter = 20;
+  std::function<void()> cb = [&]() {
+    WriteLogs(logger, kSampleMessage, kNumIter);
+    logger->Flush();
+  };
+
+  // Write to the logs from multiple threads.
+  std::vector<port::Thread> threads;
+  const int kNumThreads = 5;
+  // Create threads.
+  for (int ii = 0; ii < kNumThreads; ++ii) {
+    threads.push_back(port::Thread(cb));
+  }
+
+  // Wait for them to complete.
+  for (auto& th : threads) {
+    th.join();
+  }
+
+  ASSERT_EQ(logger->Close(), Status::OK());
+
+  // Verfiy the log file.
+  ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage),
+            kNumIter * kNumThreads);
+  DeleteLogFile();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/logging/event_logger.cc b/src/rocksdb/logging/event_logger.cc
new file mode 100644
index 000000000..cb9eca687
--- /dev/null
+++ b/src/rocksdb/logging/event_logger.cc
@@ -0,0 +1,68 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "logging/event_logger.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <sstream>
+#include <string>
+
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+EventLoggerStream::EventLoggerStream(Logger* logger)
+    : logger_(logger),
+      log_buffer_(nullptr),
+      max_log_size_(0),
+      json_writer_(nullptr) {}
+
+EventLoggerStream::EventLoggerStream(LogBuffer* log_buffer,
+                                     const size_t max_log_size)
+    : logger_(nullptr),
+      log_buffer_(log_buffer),
+      max_log_size_(max_log_size),
+      json_writer_(nullptr) {}
+
+EventLoggerStream::~EventLoggerStream() {
+  if (json_writer_) {
+    json_writer_->EndObject();
+#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT
+    printf("%s\n", json_writer_->Get().c_str());
+#else
+    if (logger_) {
+      EventLogger::Log(logger_, *json_writer_);
+    } else if (log_buffer_) {
+      assert(max_log_size_);
+      EventLogger::LogToBuffer(log_buffer_, *json_writer_, max_log_size_);
+    }
+#endif
+    delete json_writer_;
+  }
+}
+
+void EventLogger::Log(const JSONWriter& jwriter) { Log(logger_, jwriter); }
+
+void EventLogger::Log(Logger* logger, const JSONWriter& jwriter) {
+#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT
+  printf("%s\n", jwriter.Get().c_str());
+#else
+  ROCKSDB_NAMESPACE::Log(logger, "%s %s", Prefix(), jwriter.Get().c_str());
+#endif
+}
+
+void EventLogger::LogToBuffer(LogBuffer* log_buffer, const JSONWriter& jwriter,
+                              const size_t max_log_size) {
+#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT
+  printf("%s\n", jwriter.Get().c_str());
+#else
+  assert(log_buffer);
+  ROCKSDB_NAMESPACE::LogToBuffer(log_buffer, max_log_size, "%s %s", Prefix(),
+                                 jwriter.Get().c_str());
+#endif
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/logging/event_logger.h b/src/rocksdb/logging/event_logger.h
new file mode 100644
index 000000000..9ce982f50
--- /dev/null
+++ b/src/rocksdb/logging/event_logger.h
@@ -0,0 +1,202 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "logging/log_buffer.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter {
+ public:
+  JSONWriter() : state_(kExpectKey), first_element_(true), in_array_(false) {
+    stream_ << "{";
+  }
+
+  void AddKey(const std::string& key) {
+    assert(state_ == kExpectKey);
+    if (!first_element_) {
+      stream_ << ", ";
+    }
+    stream_ << "\"" << key << "\": ";
+    state_ = kExpectValue;
+    first_element_ = false;
+  }
+
+  void AddValue(const char* value) {
+    assert(state_ == kExpectValue || state_ == kInArray);
+    if (state_ == kInArray && !first_element_) {
+      stream_ << ", ";
+    }
+    stream_ << "\"" << value << "\"";
+    if (state_ != kInArray) {
+      state_ = kExpectKey;
+    }
+    first_element_ = false;
+  }
+
+  template <typename T>
+  void AddValue(const T& value) {
+    assert(state_ == kExpectValue || state_ == kInArray);
+    if (state_ == kInArray && !first_element_) {
+      stream_ << ", ";
+    }
+    stream_ << value;
+    if (state_ != kInArray) {
+      state_ = kExpectKey;
+    }
+    first_element_ = false;
+  }
+
+  void StartArray() {
+    assert(state_ == kExpectValue);
+    state_ = kInArray;
+    in_array_ = true;
+    stream_ << "[";
+    first_element_ = true;
+  }
+
+  void EndArray() {
+    assert(state_ == kInArray);
+    state_ = kExpectKey;
+    in_array_ = false;
+    stream_ << "]";
+    first_element_ = false;
+  }
+
+  void StartObject() {
+    assert(state_ == kExpectValue);
+    state_ = kExpectKey;
+    stream_ << "{";
+    first_element_ = true;
+  }
+
+  void EndObject() {
+    assert(state_ == kExpectKey);
+    stream_ << "}";
+    first_element_ = false;
+  }
+
+  void StartArrayedObject() {
+    assert(state_ == kInArray && in_array_);
+    state_ = kExpectValue;
+    if (!first_element_) {
+      stream_ << ", ";
+    }
+    StartObject();
+  }
+
+  void EndArrayedObject() {
+    assert(in_array_);
+    EndObject();
+    state_ = kInArray;
+  }
+
+  std::string Get() const { return stream_.str(); }
+
+  JSONWriter& operator<<(const char* val) {
+    if (state_ == kExpectKey) {
+      AddKey(val);
+    } else {
+      AddValue(val);
+    }
+    return *this;
+  }
+
+  JSONWriter& operator<<(const std::string& val) {
+    return *this << val.c_str();
+  }
+
+  template <typename T>
+  JSONWriter& operator<<(const T& val) {
+    assert(state_ != kExpectKey);
+    AddValue(val);
+    return *this;
+  }
+
+ private:
+  enum JSONWriterState {
+    kExpectKey,
+    kExpectValue,
+    kInArray,
+    kInArrayedObject,
+  };
+  JSONWriterState state_;
+  bool first_element_;
+  bool in_array_;
+  std::ostringstream stream_;
+};
+
+class EventLoggerStream {
+ public:
+  template <typename T>
+  EventLoggerStream& operator<<(const T& val) {
+    MakeStream();
+    *json_writer_ << val;
+    return *this;
+  }
+
+  void StartArray() { json_writer_->StartArray(); }
+  void EndArray() { json_writer_->EndArray(); }
+  void StartObject() { json_writer_->StartObject(); }
+  void EndObject() { json_writer_->EndObject(); }
+
+  ~EventLoggerStream();
+
+ private:
+  void MakeStream() {
+    if (!json_writer_) {
+      json_writer_ = new JSONWriter();
+      *this << "time_micros"
+            << std::chrono::duration_cast<std::chrono::microseconds>(
+                   std::chrono::system_clock::now().time_since_epoch())
+                   .count();
+    }
+  }
+  friend class EventLogger;
+  explicit EventLoggerStream(Logger* logger);
+  explicit EventLoggerStream(LogBuffer* log_buffer, const size_t max_log_size);
+  // exactly one is non-nullptr
+  Logger* const logger_;
+  LogBuffer* const log_buffer_;
+  const size_t max_log_size_;  // used only for log_buffer_
+  // ownership
+  JSONWriter* json_writer_;
+};
+
+// here is an example of the output that will show up in the LOG:
+// 2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 {"time_micros":
+// 1421360005788015, "event": "table_file_creation", "file_number": 12,
+// "file_size": 1909699}
+class EventLogger {
+ public:
+  static const char* Prefix() { return "EVENT_LOG_v1"; }
+
+  explicit EventLogger(Logger* logger) : logger_(logger) {}
+  EventLoggerStream Log() { return EventLoggerStream(logger_); }
+  EventLoggerStream LogToBuffer(LogBuffer* log_buffer) {
+    return EventLoggerStream(log_buffer, LogBuffer::kDefaultMaxLogSize);
+  }
+  EventLoggerStream LogToBuffer(LogBuffer* log_buffer,
+                                const size_t max_log_size) {
+    return EventLoggerStream(log_buffer, max_log_size);
+  }
+  void Log(const JSONWriter& jwriter);
+  static void Log(Logger* logger, const JSONWriter& jwriter);
+  static void LogToBuffer(
+      LogBuffer* log_buffer, const JSONWriter& jwriter,
+      const size_t max_log_size = LogBuffer::kDefaultMaxLogSize);
+
+ private:
+  Logger* logger_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/logging/event_logger_test.cc b/src/rocksdb/logging/event_logger_test.cc
new file mode 100644
index 000000000..582f56ceb
--- /dev/null
+++ b/src/rocksdb/logging/event_logger_test.cc
@@ -0,0 +1,45 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "logging/event_logger.h"
+
+#include <string>
+
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventLoggerTest : public testing::Test {};
+
+class StringLogger : public Logger {
+ public:
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override {
+    vsnprintf(buffer_, sizeof(buffer_), format, ap);
+  }
+  char* buffer() { return buffer_; }
+
+ private:
+  char buffer_[1000];
+};
+
+TEST_F(EventLoggerTest, SimpleTest) {
+  StringLogger logger;
+  EventLogger event_logger(&logger);
+  event_logger.Log() << "id" << 5 << "event"
+                     << "just_testing";
+  std::string output(logger.buffer());
+  ASSERT_TRUE(output.find("\"event\": \"just_testing\"") != std::string::npos);
+  ASSERT_TRUE(output.find("\"id\": 5") != std::string::npos);
+  ASSERT_TRUE(output.find("\"time_micros\"") != std::string::npos);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/logging/log_buffer.cc b/src/rocksdb/logging/log_buffer.cc
new file mode 100644
index 000000000..2763e617f
--- /dev/null
+++ b/src/rocksdb/logging/log_buffer.cc
@@ -0,0 +1,91 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "logging/log_buffer.h"
+
+#include "port/port.h"
+#include "port/sys_time.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+LogBuffer::LogBuffer(const InfoLogLevel log_level, Logger* info_log)
+    : log_level_(log_level), info_log_(info_log) {}
+
+void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format,
+                               va_list ap) {
+  if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) {
+    // Skip the level because of its level.
+    return;
+  }
+
+  char* alloc_mem = arena_.AllocateAligned(max_log_size);
+  BufferedLog* buffered_log = new (alloc_mem) BufferedLog();
+  char* p = buffered_log->message;
+  char* limit = alloc_mem + max_log_size - 1;
+
+  // store the time
+  port::GetTimeOfDay(&(buffered_log->now_tv), nullptr);
+
+  // Print the message
+  if (p < limit) {
+    va_list backup_ap;
+    va_copy(backup_ap, ap);
+    auto n = vsnprintf(p, limit - p, format, backup_ap);
+#ifndef OS_WIN
+    // MS reports -1 when the buffer is too short
+    assert(n >= 0);
+#endif
+    if (n > 0) {
+      p += n;
+    } else {
+      p = limit;
+    }
+    va_end(backup_ap);
+  }
+
+  if (p > limit) {
+    p = limit;
+  }
+
+  // Add '\0' to the end
+  *p = '\0';
+
+  logs_.push_back(buffered_log);
+}
+
+void LogBuffer::FlushBufferToLog() {
+  for (BufferedLog* log : logs_) {
+    const time_t seconds = log->now_tv.tv_sec;
+    struct tm t;
+    if (port::LocalTimeR(&seconds, &t) != nullptr) {
+      Log(log_level_, info_log_,
+          "(Original Log Time %04d/%02d/%02d-%02d:%02d:%02d.%06d) %s",
+          t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min,
+          t.tm_sec, static_cast<int>(log->now_tv.tv_usec), log->message);
+    }
+  }
+  logs_.clear();
+}
+
+void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, const char* format,
+                 ...) {
+  if (log_buffer != nullptr) {
+    va_list ap;
+    va_start(ap, format);
+    log_buffer->AddLogToBuffer(max_log_size, format, ap);
+    va_end(ap);
+  }
+}
+
+void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) {
+  if (log_buffer != nullptr) {
+    va_list ap;
+    va_start(ap, format);
+    log_buffer->AddLogToBuffer(LogBuffer::kDefaultMaxLogSize, format, ap);
+    va_end(ap);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/logging/log_buffer.h b/src/rocksdb/logging/log_buffer.h
new file mode 100644
index 000000000..92d38d10d
--- /dev/null
+++ b/src/rocksdb/logging/log_buffer.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <ctime>
+
+#include "memory/arena.h"
+#include "port/sys_time.h"
+#include "rocksdb/env.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Logger;
+
+// A class to buffer info log entries and flush them in the end.
+class LogBuffer {
+ public:
+  // log_level: the log level for all the logs
+  // info_log:  logger to write the logs to
+  LogBuffer(const InfoLogLevel log_level, Logger* info_log);
+
+  // Add a log entry to the buffer. Use default max_log_size.
+  // max_log_size indicates maximize log size, including some metadata.
+  void AddLogToBuffer(size_t max_log_size, const char* format, va_list ap);
+
+  size_t IsEmpty() const { return logs_.empty(); }
+
+  // Flush all buffered log to the info log.
+  void FlushBufferToLog();
+  static const size_t kDefaultMaxLogSize = 512;
+
+ private:
+  // One log entry with its timestamp
+  struct BufferedLog {
+    port::TimeVal now_tv;  // Timestamp of the log
+    char message[1];       // Beginning of log message
+  };
+
+  const InfoLogLevel log_level_;
+  Logger* info_log_;
+  Arena arena_;
+  autovector<BufferedLog*> logs_;
+};
+
+// Add log to the LogBuffer for a delayed info logging. It can be used when
+// we want to add some logs inside a mutex.
+// max_log_size indicates maximize log size, including some metadata.
+extern void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size,
+                        const char* format, ...);
+// Same as previous function, but with default max log size.
+extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/logging/logging.h b/src/rocksdb/logging/logging.h
new file mode 100644
index 000000000..0fa882a78
--- /dev/null
+++ b/src/rocksdb/logging/logging.h
@@ -0,0 +1,62 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Must not be included from any .h files to avoid polluting the namespace
+// with macros.
+
+#pragma once
+
+// Helper macros that include information about file name and line number
+#define ROCKS_LOG_STRINGIFY(x) #x
+#define ROCKS_LOG_TOSTRING(x) ROCKS_LOG_STRINGIFY(x)
+#define ROCKS_LOG_PREPEND_FILE_LINE(FMT) \
+  ("[%s:" ROCKS_LOG_TOSTRING(__LINE__) "] " FMT)
+
+inline const char* RocksLogShorterFileName(const char* file) {
+  // 18 is the length of "logging/logging.h".
+  // If the name of this file changed, please change this number, too.
+  return file + (sizeof(__FILE__) > 18 ? sizeof(__FILE__) - 18 : 0);
+}
+
+// Don't inclide file/line info in HEADER level
+#define ROCKS_LOG_HEADER(LGR, FMT, ...) \
+  ROCKSDB_NAMESPACE::Log(InfoLogLevel::HEADER_LEVEL, LGR, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_AT_LEVEL(LGR, LVL, FMT, ...)                           \
+  ROCKSDB_NAMESPACE::Log((LVL), (LGR), ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+                         RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_DEBUG(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::DEBUG_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_INFO(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::INFO_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_WARN(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::WARN_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_ERROR(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::ERROR_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_FATAL(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::FATAL_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_BUFFER(LOG_BUF, FMT, ...)                                 \
+  ROCKSDB_NAMESPACE::LogToBuffer(LOG_BUF, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
+                                 RocksLogShorterFileName(__FILE__),         \
+                                 ##__VA_ARGS__)
+
+#define ROCKS_LOG_BUFFER_MAX_SZ(LOG_BUF, MAX_LOG_SIZE, FMT, ...) \
+  ROCKSDB_NAMESPACE::LogToBuffer(                                \
+      LOG_BUF, MAX_LOG_SIZE, ROCKS_LOG_PREPEND_FILE_LINE(FMT),   \
+      RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+
+#define ROCKS_LOG_DETAILS(LGR, FMT, ...) \
+  ;  // due to overhead by default skip such lines
+// ROCKS_LOG_DEBUG(LGR, FMT, ##__VA_ARGS__)
diff --git a/src/rocksdb/memory/allocator.h b/src/rocksdb/memory/allocator.h
new file mode 100644
index 000000000..0d7cd60a9
--- /dev/null
+++ b/src/rocksdb/memory/allocator.h
@@ -0,0 +1,58 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Abstract interface for allocating memory in blocks. This memory is freed
+// when the allocator object is destroyed. See the Arena class for more info.
+
+#pragma once
+#include <cerrno>
+#include <cstddef>
+
+#include "rocksdb/write_buffer_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Logger;
+
+class Allocator {
+ public:
+  virtual ~Allocator() {}
+
+  virtual char* Allocate(size_t bytes) = 0;
+  virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                                Logger* logger = nullptr) = 0;
+
+  virtual size_t BlockSize() const = 0;
+};
+
+class AllocTracker {
+ public:
+  explicit AllocTracker(WriteBufferManager* write_buffer_manager);
+  // No copying allowed
+  AllocTracker(const AllocTracker&) = delete;
+  void operator=(const AllocTracker&) = delete;
+
+  ~AllocTracker();
+  void Allocate(size_t bytes);
+  // Call when we're finished allocating memory so we can free it from
+  // the write buffer's limit.
+  void DoneAllocating();
+
+  void FreeMem();
+
+  bool is_freed() const { return write_buffer_manager_ == nullptr || freed_; }
+
+ private:
+  WriteBufferManager* write_buffer_manager_;
+  std::atomic<size_t> bytes_allocated_;
+  bool done_allocating_;
+  bool freed_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/arena.cc b/src/rocksdb/memory/arena.cc
new file mode 100644
index 000000000..10b8969b4
--- /dev/null
+++ b/src/rocksdb/memory/arena.cc
@@ -0,0 +1,234 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memory/arena.h"
+#ifndef OS_WIN
+#include <sys/mman.h>
+#endif
+#include <algorithm>
+
+#include "logging/logging.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// MSVC complains that it is already defined since it is static in the header.
+#ifndef _MSC_VER
+const size_t Arena::kInlineSize;
+#endif
+
+const size_t Arena::kMinBlockSize = 4096;
+const size_t Arena::kMaxBlockSize = 2u << 30;
+static const int kAlignUnit = alignof(max_align_t);
+
+size_t OptimizeBlockSize(size_t block_size) {
+  // Make sure block_size is in optimal range
+  block_size = std::max(Arena::kMinBlockSize, block_size);
+  block_size = std::min(Arena::kMaxBlockSize, block_size);
+
+  // make sure block_size is the multiple of kAlignUnit
+  if (block_size % kAlignUnit != 0) {
+    block_size = (1 + block_size / kAlignUnit) * kAlignUnit;
+  }
+
+  return block_size;
+}
+
+Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size)
+    : kBlockSize(OptimizeBlockSize(block_size)), tracker_(tracker) {
+  assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize &&
+         kBlockSize % kAlignUnit == 0);
+  TEST_SYNC_POINT_CALLBACK("Arena::Arena:0", const_cast<size_t*>(&kBlockSize));
+  alloc_bytes_remaining_ = sizeof(inline_block_);
+  blocks_memory_ += alloc_bytes_remaining_;
+  aligned_alloc_ptr_ = inline_block_;
+  unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_;
+#ifdef MAP_HUGETLB
+  hugetlb_size_ = huge_page_size;
+  if (hugetlb_size_ && kBlockSize > hugetlb_size_) {
+    hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_;
+  }
+#else
+  (void)huge_page_size;
+#endif
+  if (tracker_ != nullptr) {
+    tracker_->Allocate(kInlineSize);
+  }
+}
+
+Arena::~Arena() {
+  if (tracker_ != nullptr) {
+    assert(tracker_->is_freed());
+    tracker_->FreeMem();
+  }
+  for (const auto& block : blocks_) {
+    delete[] block;
+  }
+
+#ifdef MAP_HUGETLB
+  for (const auto& mmap_info : huge_blocks_) {
+    if (mmap_info.addr_ == nullptr) {
+      continue;
+    }
+    auto ret = munmap(mmap_info.addr_, mmap_info.length_);
+    if (ret != 0) {
+      // TODO(sdong): Better handling
+    }
+  }
+#endif
+}
+
+char* Arena::AllocateFallback(size_t bytes, bool aligned) {
+  if (bytes > kBlockSize / 4) {
+    ++irregular_block_num;
+    // Object is more than a quarter of our block size.  Allocate it separately
+    // to avoid wasting too much space in leftover bytes.
+    return AllocateNewBlock(bytes);
+  }
+
+  // We waste the remaining space in the current block.
+  size_t size = 0;
+  char* block_head = nullptr;
+#ifdef MAP_HUGETLB
+  if (hugetlb_size_) {
+    size = hugetlb_size_;
+    block_head = AllocateFromHugePage(size);
+  }
+#endif
+  if (!block_head) {
+    size = kBlockSize;
+    block_head = AllocateNewBlock(size);
+  }
+  alloc_bytes_remaining_ = size - bytes;
+
+  if (aligned) {
+    aligned_alloc_ptr_ = block_head + bytes;
+    unaligned_alloc_ptr_ = block_head + size;
+    return block_head;
+  } else {
+    aligned_alloc_ptr_ = block_head;
+    unaligned_alloc_ptr_ = block_head + size - bytes;
+    return unaligned_alloc_ptr_;
+  }
+}
+
+char* Arena::AllocateFromHugePage(size_t bytes) {
+#ifdef MAP_HUGETLB
+  if (hugetlb_size_ == 0) {
+    return nullptr;
+  }
+  // Reserve space in `huge_blocks_` before calling `mmap`.
+  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
+  // own memory and do fewer reallocations.
+  //
+  // - If `emplace_back` throws, no memory leaks because we haven't called
+  //   `mmap` yet.
+  // - If `mmap` throws, no memory leaks because the vector will be cleaned up
+  //   via RAII.
+  huge_blocks_.emplace_back(nullptr /* addr */, 0 /* length */);
+
+  void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE),
+                    (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0);
+
+  if (addr == MAP_FAILED) {
+    return nullptr;
+  }
+  huge_blocks_.back() = MmapInfo(addr, bytes);
+  blocks_memory_ += bytes;
+  if (tracker_ != nullptr) {
+    tracker_->Allocate(bytes);
+  }
+  return reinterpret_cast<char*>(addr);
+#else
+  (void)bytes;
+  return nullptr;
+#endif
+}
+
+char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
+                             Logger* logger) {
+  assert((kAlignUnit & (kAlignUnit - 1)) ==
+         0);  // Pointer size should be a power of 2
+
+#ifdef MAP_HUGETLB
+  if (huge_page_size > 0 && bytes > 0) {
+    // Allocate from a huge page TLB table.
+    size_t reserved_size =
+        ((bytes - 1U) / huge_page_size + 1U) * huge_page_size;
+    assert(reserved_size >= bytes);
+
+    char* addr = AllocateFromHugePage(reserved_size);
+    if (addr == nullptr) {
+      ROCKS_LOG_WARN(logger,
+                     "AllocateAligned fail to allocate huge TLB pages: %s",
+                     errnoStr(errno).c_str());
+      // fail back to malloc
+    } else {
+      return addr;
+    }
+  }
+#else
+  (void)huge_page_size;
+  (void)logger;
+#endif
+
+  size_t current_mod =
+      reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
+  size_t slop = (current_mod == 0 ? 0 : kAlignUnit - current_mod);
+  size_t needed = bytes + slop;
+  char* result;
+  if (needed <= alloc_bytes_remaining_) {
+    result = aligned_alloc_ptr_ + slop;
+    aligned_alloc_ptr_ += needed;
+    alloc_bytes_remaining_ -= needed;
+  } else {
+    // AllocateFallback always returns aligned memory
+    result = AllocateFallback(bytes, true /* aligned */);
+  }
+  assert((reinterpret_cast<uintptr_t>(result) & (kAlignUnit - 1)) == 0);
+  return result;
+}
+
+char* Arena::AllocateNewBlock(size_t block_bytes) {
+  // Reserve space in `blocks_` before allocating memory via new.
+  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
+  // own memory and do fewer reallocations.
+  //
+  // - If `emplace_back` throws, no memory leaks because we haven't called `new`
+  //   yet.
+  // - If `new` throws, no memory leaks because the vector will be cleaned up
+  //   via RAII.
+  blocks_.emplace_back(nullptr);
+
+  char* block = new char[block_bytes];
+  size_t allocated_size;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  allocated_size = malloc_usable_size(block);
+#ifndef NDEBUG
+  // It's hard to predict what malloc_usable_size() returns.
+  // A callback can allow users to change the costed size.
+  std::pair<size_t*, size_t*> pair(&allocated_size, &block_bytes);
+  TEST_SYNC_POINT_CALLBACK("Arena::AllocateNewBlock:0", &pair);
+#endif  // NDEBUG
+#else
+  allocated_size = block_bytes;
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  blocks_memory_ += allocated_size;
+  if (tracker_ != nullptr) {
+    tracker_->Allocate(allocated_size);
+  }
+  blocks_.back() = block;
+  return block;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/arena.h b/src/rocksdb/memory/arena.h
new file mode 100644
index 000000000..1de04c477
--- /dev/null
+++ b/src/rocksdb/memory/arena.h
@@ -0,0 +1,141 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Arena is an implementation of Allocator class. For a request of small size,
+// it allocates a block with pre-defined block size. For a request of big
+// size, it uses malloc to directly get the requested size.
+
+#pragma once
+#ifndef OS_WIN
+#include <sys/mman.h>
+#endif
+#include <assert.h>
+#include <stdint.h>
+#include <cerrno>
+#include <cstddef>
+#include <vector>
+#include "memory/allocator.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena : public Allocator {
+ public:
+  // No copying allowed
+  Arena(const Arena&) = delete;
+  void operator=(const Arena&) = delete;
+
+  static const size_t kInlineSize = 2048;
+  static const size_t kMinBlockSize;
+  static const size_t kMaxBlockSize;
+
+  // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the
+  // supported hugepage size of the system), block allocation will try huge
+  // page TLB first. If allocation fails, will fall back to normal case.
+  explicit Arena(size_t block_size = kMinBlockSize,
+                 AllocTracker* tracker = nullptr, size_t huge_page_size = 0);
+  ~Arena();
+
+  char* Allocate(size_t bytes) override;
+
+  // huge_page_size: if >0, will try to allocate from huage page TLB.
+  // The argument will be the size of the page size for huge page TLB. Bytes
+  // will be rounded up to multiple of the page size to allocate through mmap
+  // anonymous option with huge page on. The extra  space allocated will be
+  // wasted. If allocation fails, will fall back to normal case. To enable it,
+  // need to reserve huge pages for it to be allocated, like:
+  //     sysctl -w vm.nr_hugepages=20
+  // See linux doc Documentation/vm/hugetlbpage.txt for details.
+  // huge page allocation can fail. In this case it will fail back to
+  // normal cases. The messages will be logged to logger. So when calling with
+  // huge_page_tlb_size > 0, we highly recommend a logger is passed in.
+  // Otherwise, the error message will be printed out to stderr directly.
+  char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                        Logger* logger = nullptr) override;
+
+  // Returns an estimate of the total memory usage of data allocated
+  // by the arena (exclude the space allocated but not yet used for future
+  // allocations).
+  size_t ApproximateMemoryUsage() const {
+    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
+           alloc_bytes_remaining_;
+  }
+
+  size_t MemoryAllocatedBytes() const { return blocks_memory_; }
+
+  size_t AllocatedAndUnused() const { return alloc_bytes_remaining_; }
+
+  // If an allocation is too big, we'll allocate an irregular block with the
+  // same size of that allocation.
+  size_t IrregularBlockNum() const { return irregular_block_num; }
+
+  size_t BlockSize() const override { return kBlockSize; }
+
+  bool IsInInlineBlock() const {
+    return blocks_.empty() && huge_blocks_.empty();
+  }
+
+ private:
+  char inline_block_[kInlineSize] __attribute__((__aligned__(alignof(max_align_t))));
+  // Number of bytes allocated in one block
+  const size_t kBlockSize;
+  // Array of new[] allocated memory blocks
+  using Blocks = std::vector<char*>;
+  Blocks blocks_;
+
+  struct MmapInfo {
+    void* addr_;
+    size_t length_;
+
+    MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
+  };
+  std::vector<MmapInfo> huge_blocks_;
+  size_t irregular_block_num = 0;
+
+  // Stats for current active block.
+  // For each block, we allocate aligned memory chucks from one end and
+  // allocate unaligned memory chucks from the other end. Otherwise the
+  // memory waste for alignment will be higher if we allocate both types of
+  // memory from one direction.
+  char* unaligned_alloc_ptr_ = nullptr;
+  char* aligned_alloc_ptr_ = nullptr;
+  // How many bytes left in currently active block?
+  size_t alloc_bytes_remaining_ = 0;
+
+#ifdef MAP_HUGETLB
+  size_t hugetlb_size_ = 0;
+#endif  // MAP_HUGETLB
+  char* AllocateFromHugePage(size_t bytes);
+  char* AllocateFallback(size_t bytes, bool aligned);
+  char* AllocateNewBlock(size_t block_bytes);
+
+  // Bytes of memory in blocks allocated so far
+  size_t blocks_memory_ = 0;
+  AllocTracker* tracker_;
+};
+
+inline char* Arena::Allocate(size_t bytes) {
+  // The semantics of what to return are a bit messy if we allow
+  // 0-byte allocations, so we disallow them here (we don't need
+  // them for our internal use).
+  assert(bytes > 0);
+  if (bytes <= alloc_bytes_remaining_) {
+    unaligned_alloc_ptr_ -= bytes;
+    alloc_bytes_remaining_ -= bytes;
+    return unaligned_alloc_ptr_;
+  }
+  return AllocateFallback(bytes, false /* unaligned */);
+}
+
+// check and adjust the block_size so that the return value is
+//  1. in the range of [kMinBlockSize, kMaxBlockSize].
+//  2. the multiple of align unit.
+extern size_t OptimizeBlockSize(size_t block_size);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/arena_test.cc b/src/rocksdb/memory/arena_test.cc
new file mode 100644
index 000000000..96e69a932
--- /dev/null
+++ b/src/rocksdb/memory/arena_test.cc
@@ -0,0 +1,205 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memory/arena.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const size_t kHugePageSize = 2 * 1024 * 1024;
+}  // namespace
+class ArenaTest : public testing::Test {};
+
+TEST_F(ArenaTest, Empty) { Arena arena0; }
+
+namespace {
+bool CheckMemoryAllocated(size_t allocated, size_t expected) {
+  // The value returned by Arena::MemoryAllocatedBytes() may be greater than
+  // the requested memory. We choose a somewhat arbitrary upper bound of
+  // max_expected = expected * 1.1 to detect critical overallocation.
+  size_t max_expected = expected + expected / 10;
+  return allocated >= expected && allocated <= max_expected;
+}
+
+void MemoryAllocatedBytesTest(size_t huge_page_size) {
+  const int N = 17;
+  size_t req_sz;           // requested size
+  size_t bsz = 32 * 1024;  // block size
+  size_t expected_memory_allocated;
+
+  Arena arena(bsz, nullptr, huge_page_size);
+
+  // requested size > quarter of a block:
+  //   allocate requested size separately
+  req_sz = 12 * 1024;
+  for (int i = 0; i < N; i++) {
+    arena.Allocate(req_sz);
+  }
+  expected_memory_allocated = req_sz * N + Arena::kInlineSize;
+  ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+               expected_memory_allocated);
+
+  arena.Allocate(Arena::kInlineSize - 1);
+
+  // requested size < quarter of a block:
+  //   allocate a block with the default size, then try to use unused part
+  //   of the block. So one new block will be allocated for the first
+  //   Allocate(99) call. All the remaining calls won't lead to new allocation.
+  req_sz = 99;
+  for (int i = 0; i < N; i++) {
+    arena.Allocate(req_sz);
+  }
+  if (huge_page_size) {
+    ASSERT_TRUE(
+        CheckMemoryAllocated(arena.MemoryAllocatedBytes(),
+                             expected_memory_allocated + bsz) ||
+        CheckMemoryAllocated(arena.MemoryAllocatedBytes(),
+                             expected_memory_allocated + huge_page_size));
+  } else {
+    expected_memory_allocated += bsz;
+    ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+                 expected_memory_allocated);
+  }
+
+  // requested size > size of a block:
+  //   allocate requested size separately
+  expected_memory_allocated = arena.MemoryAllocatedBytes();
+  req_sz = 8 * 1024 * 1024;
+  for (int i = 0; i < N; i++) {
+    arena.Allocate(req_sz);
+  }
+  expected_memory_allocated += req_sz * N;
+  ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+               expected_memory_allocated);
+}
+
+// Make sure we didn't count the allocate but not used memory space in
+// Arena::ApproximateMemoryUsage()
+static void ApproximateMemoryUsageTest(size_t huge_page_size) {
+  const size_t kBlockSize = 4096;
+  const size_t kEntrySize = kBlockSize / 8;
+  const size_t kZero = 0;
+  Arena arena(kBlockSize, nullptr, huge_page_size);
+  ASSERT_EQ(kZero, arena.ApproximateMemoryUsage());
+
+  // allocate inline bytes
+  const size_t kAlignUnit = alignof(max_align_t);
+  EXPECT_TRUE(arena.IsInInlineBlock());
+  arena.AllocateAligned(kAlignUnit);
+  EXPECT_TRUE(arena.IsInInlineBlock());
+  arena.AllocateAligned(Arena::kInlineSize / 2 - (2 * kAlignUnit));
+  EXPECT_TRUE(arena.IsInInlineBlock());
+  arena.AllocateAligned(Arena::kInlineSize / 2);
+  EXPECT_TRUE(arena.IsInInlineBlock());
+  ASSERT_EQ(arena.ApproximateMemoryUsage(), Arena::kInlineSize - kAlignUnit);
+  ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+               Arena::kInlineSize);
+
+  auto num_blocks = kBlockSize / kEntrySize;
+
+  // first allocation
+  arena.AllocateAligned(kEntrySize);
+  EXPECT_FALSE(arena.IsInInlineBlock());
+  auto mem_usage = arena.MemoryAllocatedBytes();
+  if (huge_page_size) {
+    ASSERT_TRUE(
+        CheckMemoryAllocated(mem_usage, kBlockSize + Arena::kInlineSize) ||
+        CheckMemoryAllocated(mem_usage, huge_page_size + Arena::kInlineSize));
+  } else {
+    ASSERT_PRED2(CheckMemoryAllocated, mem_usage,
+                 kBlockSize + Arena::kInlineSize);
+  }
+  auto usage = arena.ApproximateMemoryUsage();
+  ASSERT_LT(usage, mem_usage);
+  for (size_t i = 1; i < num_blocks; ++i) {
+    arena.AllocateAligned(kEntrySize);
+    ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes());
+    ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize);
+    EXPECT_FALSE(arena.IsInInlineBlock());
+    usage = arena.ApproximateMemoryUsage();
+  }
+  if (huge_page_size) {
+    ASSERT_TRUE(usage > mem_usage ||
+                usage + huge_page_size - kBlockSize == mem_usage);
+  } else {
+    ASSERT_GT(usage, mem_usage);
+  }
+}
+
+static void SimpleTest(size_t huge_page_size) {
+  std::vector<std::pair<size_t, char*>> allocated;
+  Arena arena(Arena::kMinBlockSize, nullptr, huge_page_size);
+  const int N = 100000;
+  size_t bytes = 0;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    size_t s;
+    if (i % (N / 10) == 0) {
+      s = i;
+    } else {
+      s = rnd.OneIn(4000)
+              ? rnd.Uniform(6000)
+              : (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
+    }
+    if (s == 0) {
+      // Our arena disallows size 0 allocations.
+      s = 1;
+    }
+    char* r;
+    if (rnd.OneIn(10)) {
+      r = arena.AllocateAligned(s);
+    } else {
+      r = arena.Allocate(s);
+    }
+
+    for (unsigned int b = 0; b < s; b++) {
+      // Fill the "i"th allocation with a known bit pattern
+      r[b] = i % 256;
+    }
+    bytes += s;
+    allocated.push_back(std::make_pair(s, r));
+    ASSERT_GE(arena.ApproximateMemoryUsage(), bytes);
+    if (i > N / 10) {
+      ASSERT_LE(arena.ApproximateMemoryUsage(), bytes * 1.10);
+    }
+  }
+  for (unsigned int i = 0; i < allocated.size(); i++) {
+    size_t num_bytes = allocated[i].first;
+    const char* p = allocated[i].second;
+    for (unsigned int b = 0; b < num_bytes; b++) {
+      // Check the "i"th allocation for the known bit pattern
+      ASSERT_EQ(int(p[b]) & 0xff, (int)(i % 256));
+    }
+  }
+}
+}  // namespace
+
+TEST_F(ArenaTest, MemoryAllocatedBytes) {
+  MemoryAllocatedBytesTest(0);
+  MemoryAllocatedBytesTest(kHugePageSize);
+}
+
+TEST_F(ArenaTest, ApproximateMemoryUsage) {
+  ApproximateMemoryUsageTest(0);
+  ApproximateMemoryUsageTest(kHugePageSize);
+}
+
+TEST_F(ArenaTest, Simple) {
+  SimpleTest(0);
+  SimpleTest(kHugePageSize);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/memory/concurrent_arena.cc b/src/rocksdb/memory/concurrent_arena.cc
new file mode 100644
index 000000000..1619bd93b
--- /dev/null
+++ b/src/rocksdb/memory/concurrent_arena.cc
@@ -0,0 +1,45 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memory/concurrent_arena.h"
+
+#include <thread>
+
+#include "port/port.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+thread_local size_t ConcurrentArena::tls_cpuid = 0;
+
+namespace {
+// If the shard block size is too large, in the worst case, every core
+// allocates a block without populate it. If the shared block size is
+// 1MB, 64 cores will quickly allocate 64MB, and may quickly trigger a
+// flush. Cap the size instead.
+const size_t kMaxShardBlockSize = size_t{128 * 1024};
+}  // namespace
+
+ConcurrentArena::ConcurrentArena(size_t block_size, AllocTracker* tracker,
+                                 size_t huge_page_size)
+    : shard_block_size_(std::min(kMaxShardBlockSize, block_size / 8)),
+      shards_(),
+      arena_(block_size, tracker, huge_page_size) {
+  Fixup();
+}
+
+ConcurrentArena::Shard* ConcurrentArena::Repick() {
+  auto shard_and_index = shards_.AccessElementAndIndex();
+  // even if we are cpu 0, use a non-zero tls_cpuid so we can tell we
+  // have repicked
+  tls_cpuid = shard_and_index.second | shards_.Size();
+  return shard_and_index.first;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/concurrent_arena.h b/src/rocksdb/memory/concurrent_arena.h
new file mode 100644
index 000000000..f14507d30
--- /dev/null
+++ b/src/rocksdb/memory/concurrent_arena.h
@@ -0,0 +1,215 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+#include <utility>
+
+#include "memory/allocator.h"
+#include "memory/arena.h"
+#include "port/lang.h"
+#include "port/likely.h"
+#include "util/core_local.h"
+#include "util/mutexlock.h"
+#include "util/thread_local.h"
+
+// Only generate field unused warning for padding array, or build under
+// GCC 4.8.1 will fail.
+#ifdef __clang__
+#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__))
+#else
+#define ROCKSDB_FIELD_UNUSED
+#endif  // __clang__
+
+namespace ROCKSDB_NAMESPACE {
+
+class Logger;
+
+// ConcurrentArena wraps an Arena.  It makes it thread safe using a fast
+// inlined spinlock, and adds small per-core allocation caches to avoid
+// contention for small allocations.  To avoid any memory waste from the
+// per-core shards, they are kept small, they are lazily instantiated
+// only if ConcurrentArena actually notices concurrent use, and they
+// adjust their size so that there is no fragmentation waste when the
+// shard blocks are allocated from the underlying main arena.
+class ConcurrentArena : public Allocator {
+ public:
+  // block_size and huge_page_size are the same as for Arena (and are
+  // in fact just passed to the constructor of arena_.  The core-local
+  // shards compute their shard_block_size as a fraction of block_size
+  // that varies according to the hardware concurrency level.
+  explicit ConcurrentArena(size_t block_size = Arena::kMinBlockSize,
+                           AllocTracker* tracker = nullptr,
+                           size_t huge_page_size = 0);
+
+  char* Allocate(size_t bytes) override {
+    return AllocateImpl(bytes, false /*force_arena*/,
+                        [this, bytes]() { return arena_.Allocate(bytes); });
+  }
+
+  char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                        Logger* logger = nullptr) override {
+    size_t rounded_up = ((bytes - 1) | (sizeof(void*) - 1)) + 1;
+    assert(rounded_up >= bytes && rounded_up < bytes + sizeof(void*) &&
+           (rounded_up % sizeof(void*)) == 0);
+
+    return AllocateImpl(rounded_up, huge_page_size != 0 /*force_arena*/,
+                        [this, rounded_up, huge_page_size, logger]() {
+                          return arena_.AllocateAligned(rounded_up,
+                                                        huge_page_size, logger);
+                        });
+  }
+
+  size_t ApproximateMemoryUsage() const {
+    std::unique_lock<SpinMutex> lock(arena_mutex_, std::defer_lock);
+    lock.lock();
+    return arena_.ApproximateMemoryUsage() - ShardAllocatedAndUnused();
+  }
+
+  size_t MemoryAllocatedBytes() const {
+    return memory_allocated_bytes_.load(std::memory_order_relaxed);
+  }
+
+  size_t AllocatedAndUnused() const {
+    return arena_allocated_and_unused_.load(std::memory_order_relaxed) +
+           ShardAllocatedAndUnused();
+  }
+
+  size_t IrregularBlockNum() const {
+    return irregular_block_num_.load(std::memory_order_relaxed);
+  }
+
+  size_t BlockSize() const override { return arena_.BlockSize(); }
+
+ private:
+  struct Shard {
+    char padding[40] ROCKSDB_FIELD_UNUSED;
+    mutable SpinMutex mutex;
+    char* free_begin_;
+    std::atomic<size_t> allocated_and_unused_;
+
+    Shard() : free_begin_(nullptr), allocated_and_unused_(0) {}
+  };
+
+  static thread_local size_t tls_cpuid;
+
+  char padding0[56] ROCKSDB_FIELD_UNUSED;
+
+  size_t shard_block_size_;
+
+  CoreLocalArray<Shard> shards_;
+
+  Arena arena_;
+  mutable SpinMutex arena_mutex_;
+  std::atomic<size_t> arena_allocated_and_unused_;
+  std::atomic<size_t> memory_allocated_bytes_;
+  std::atomic<size_t> irregular_block_num_;
+
+  char padding1[56] ROCKSDB_FIELD_UNUSED;
+
+  Shard* Repick();
+
+  size_t ShardAllocatedAndUnused() const {
+    size_t total = 0;
+    for (size_t i = 0; i < shards_.Size(); ++i) {
+      total += shards_.AccessAtCore(i)->allocated_and_unused_.load(
+          std::memory_order_relaxed);
+    }
+    return total;
+  }
+
+  template <typename Func>
+  char* AllocateImpl(size_t bytes, bool force_arena, const Func& func) {
+    size_t cpu;
+
+    // Go directly to the arena if the allocation is too large, or if
+    // we've never needed to Repick() and the arena mutex is available
+    // with no waiting.  This keeps the fragmentation penalty of
+    // concurrency zero unless it might actually confer an advantage.
+    std::unique_lock<SpinMutex> arena_lock(arena_mutex_, std::defer_lock);
+    if (bytes > shard_block_size_ / 4 || force_arena ||
+        ((cpu = tls_cpuid) == 0 &&
+         !shards_.AccessAtCore(0)->allocated_and_unused_.load(
+             std::memory_order_relaxed) &&
+         arena_lock.try_lock())) {
+      if (!arena_lock.owns_lock()) {
+        arena_lock.lock();
+      }
+      auto rv = func();
+      Fixup();
+      return rv;
+    }
+
+    // pick a shard from which to allocate
+    Shard* s = shards_.AccessAtCore(cpu & (shards_.Size() - 1));
+    if (!s->mutex.try_lock()) {
+      s = Repick();
+      s->mutex.lock();
+    }
+    std::unique_lock<SpinMutex> lock(s->mutex, std::adopt_lock);
+
+    size_t avail = s->allocated_and_unused_.load(std::memory_order_relaxed);
+    if (avail < bytes) {
+      // reload
+      std::lock_guard<SpinMutex> reload_lock(arena_mutex_);
+
+      // If the arena's current block is within a factor of 2 of the right
+      // size, we adjust our request to avoid arena waste.
+      auto exact = arena_allocated_and_unused_.load(std::memory_order_relaxed);
+      assert(exact == arena_.AllocatedAndUnused());
+
+      if (exact >= bytes && arena_.IsInInlineBlock()) {
+        // If we haven't exhausted arena's inline block yet, allocate from arena
+        // directly. This ensures that we'll do the first few small allocations
+        // without allocating any blocks.
+        // In particular this prevents empty memtables from using
+        // disproportionately large amount of memory: a memtable allocates on
+        // the order of 1 KB of memory when created; we wouldn't want to
+        // allocate a full arena block (typically a few megabytes) for that,
+        // especially if there are thousands of empty memtables.
+        auto rv = func();
+        Fixup();
+        return rv;
+      }
+
+      avail = exact >= shard_block_size_ / 2 && exact < shard_block_size_ * 2
+                  ? exact
+                  : shard_block_size_;
+      s->free_begin_ = arena_.AllocateAligned(avail);
+      Fixup();
+    }
+    s->allocated_and_unused_.store(avail - bytes, std::memory_order_relaxed);
+
+    char* rv;
+    if ((bytes % sizeof(void*)) == 0) {
+      // aligned allocation from the beginning
+      rv = s->free_begin_;
+      s->free_begin_ += bytes;
+    } else {
+      // unaligned from the end
+      rv = s->free_begin_ + avail - bytes;
+    }
+    return rv;
+  }
+
+  void Fixup() {
+    arena_allocated_and_unused_.store(arena_.AllocatedAndUnused(),
+                                      std::memory_order_relaxed);
+    memory_allocated_bytes_.store(arena_.MemoryAllocatedBytes(),
+                                  std::memory_order_relaxed);
+    irregular_block_num_.store(arena_.IrregularBlockNum(),
+                               std::memory_order_relaxed);
+  }
+
+  ConcurrentArena(const ConcurrentArena&) = delete;
+  ConcurrentArena& operator=(const ConcurrentArena&) = delete;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/jemalloc_nodump_allocator.cc b/src/rocksdb/memory/jemalloc_nodump_allocator.cc
new file mode 100644
index 000000000..62ee661d2
--- /dev/null
+++ b/src/rocksdb/memory/jemalloc_nodump_allocator.cc
@@ -0,0 +1,269 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "memory/jemalloc_nodump_allocator.h"
+
+#include <string>
+#include <thread>
+
+#include "port/likely.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+std::atomic<extent_alloc_t*> JemallocNodumpAllocator::original_alloc_{nullptr};
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+static std::unordered_map<std::string, OptionTypeInfo> jemalloc_type_info = {
+#ifndef ROCKSDB_LITE
+    {"limit_tcache_size",
+     {offsetof(struct JemallocAllocatorOptions, limit_tcache_size),
+      OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"tcache_size_lower_bound",
+     {offsetof(struct JemallocAllocatorOptions, tcache_size_lower_bound),
+      OptionType::kSizeT, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"tcache_size_upper_bound",
+     {offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound),
+      OptionType::kSizeT, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+bool JemallocNodumpAllocator::IsSupported(std::string* why) {
+#ifndef ROCKSDB_JEMALLOC
+  *why = "Not compiled with ROCKSDB_JEMALLOC";
+  return false;
+#else
+  static const std::string unsupported =
+      "JemallocNodumpAllocator only available with jemalloc version >= 5 "
+      "and MADV_DONTDUMP is available.";
+  if (!HasJemalloc()) {
+    *why = unsupported;
+    return false;
+  }
+#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  *why = unsupported;
+  return false;
+#else
+  return true;
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+#endif  // ROCKSDB_MALLOC
+}
+
+JemallocNodumpAllocator::JemallocNodumpAllocator(
+    JemallocAllocatorOptions& options)
+    : options_(options),
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+      tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache),
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+      arena_index_(0) {
+  RegisterOptions(&options_, &jemalloc_type_info);
+}
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+JemallocNodumpAllocator::~JemallocNodumpAllocator() {
+  // Destroy tcache before destroying arena.
+  autovector<void*> tcache_list;
+  tcache_.Scrape(&tcache_list, nullptr);
+  for (void* tcache_index : tcache_list) {
+    DestroyThreadSpecificCache(tcache_index);
+  }
+  if (arena_index_ > 0) {
+    // Destroy arena. Silently ignore error.
+    Status s = DestroyArena(arena_index_);
+    assert(s.ok());
+    s.PermitUncheckedError();
+  }
+}
+
+size_t JemallocNodumpAllocator::UsableSize(void* p,
+                                           size_t /*allocation_size*/) const {
+  return malloc_usable_size(static_cast<void*>(p));
+}
+
+void* JemallocNodumpAllocator::Allocate(size_t size) {
+  int tcache_flag = GetThreadSpecificCache(size);
+  return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag);
+}
+
+void JemallocNodumpAllocator::Deallocate(void* p) {
+  // Obtain tcache.
+  size_t size = 0;
+  if (options_.limit_tcache_size) {
+    size = malloc_usable_size(p);
+  }
+  int tcache_flag = GetThreadSpecificCache(size);
+  // No need to pass arena index to dallocx(). Jemalloc will find arena index
+  // from its own metadata.
+  dallocx(p, tcache_flag);
+}
+
+Status JemallocNodumpAllocator::InitializeArenas() {
+  // Create arena.
+  size_t arena_index_size = sizeof(arena_index_);
+  int ret =
+      mallctl("arenas.create", &arena_index_, &arena_index_size, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to create jemalloc arena, error code: " +
+                              std::to_string(ret));
+  }
+  assert(arena_index_ != 0);
+
+  // Read existing hooks.
+  std::string key = "arena." + std::to_string(arena_index_) + ".extent_hooks";
+  extent_hooks_t* hooks;
+  size_t hooks_size = sizeof(hooks);
+  ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to read existing hooks, error code: " +
+                              std::to_string(ret));
+  }
+
+  // Store existing alloc.
+  extent_alloc_t* original_alloc = hooks->alloc;
+  extent_alloc_t* expected = nullptr;
+  bool success =
+      JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
+          expected, original_alloc);
+  if (!success && original_alloc != expected) {
+    return Status::Incomplete("Original alloc conflict.");
+  }
+
+  // Set the custom hook.
+  arena_hooks_.reset(new extent_hooks_t(*hooks));
+  arena_hooks_->alloc = &JemallocNodumpAllocator::Alloc;
+  extent_hooks_t* hooks_ptr = arena_hooks_.get();
+  ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
+  if (ret != 0) {
+    return Status::Incomplete("Failed to set custom hook, error code: " +
+                              std::to_string(ret));
+  }
+  return Status::OK();
+}
+
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+Status JemallocNodumpAllocator::PrepareOptions(
+    const ConfigOptions& config_options) {
+  std::string message;
+
+  if (!IsSupported(&message)) {
+    return Status::NotSupported(message);
+  } else if (options_.limit_tcache_size &&
+             options_.tcache_size_lower_bound >=
+                 options_.tcache_size_upper_bound) {
+    return Status::InvalidArgument(
+        "tcache_size_lower_bound larger or equal to tcache_size_upper_bound.");
+  } else if (IsMutable()) {
+    Status s = MemoryAllocator::PrepareOptions(config_options);
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+    if (s.ok()) {
+      s = InitializeArenas();
+    }
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+    return s;
+  } else {
+    // Already prepared
+    return Status::OK();
+  }
+}
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) {
+  // We always enable tcache. The only corner case is when there are a ton of
+  // threads accessing with low frequency, then it could consume a lot of
+  // memory (may reach # threads * ~1MB) without bringing too much benefit.
+  if (options_.limit_tcache_size && (size <= options_.tcache_size_lower_bound ||
+                                     size > options_.tcache_size_upper_bound)) {
+    return MALLOCX_TCACHE_NONE;
+  }
+  unsigned* tcache_index = reinterpret_cast<unsigned*>(tcache_.Get());
+  if (UNLIKELY(tcache_index == nullptr)) {
+    // Instantiate tcache.
+    tcache_index = new unsigned(0);
+    size_t tcache_index_size = sizeof(unsigned);
+    int ret =
+        mallctl("tcache.create", tcache_index, &tcache_index_size, nullptr, 0);
+    if (ret != 0) {
+      // No good way to expose the error. Silently disable tcache.
+      delete tcache_index;
+      return MALLOCX_TCACHE_NONE;
+    }
+    tcache_.Reset(static_cast<void*>(tcache_index));
+  }
+  return MALLOCX_TCACHE(*tcache_index);
+}
+void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
+                                     size_t size, size_t alignment, bool* zero,
+                                     bool* commit, unsigned arena_ind) {
+  extent_alloc_t* original_alloc =
+      original_alloc_.load(std::memory_order_relaxed);
+  assert(original_alloc != nullptr);
+  void* result = original_alloc(extent, new_addr, size, alignment, zero, commit,
+                                arena_ind);
+  if (result != nullptr) {
+    int ret = madvise(result, size, MADV_DONTDUMP);
+    if (ret != 0) {
+      fprintf(
+          stderr,
+          "JemallocNodumpAllocator failed to set MADV_DONTDUMP, error code: %d",
+          ret);
+      assert(false);
+    }
+  }
+  return result;
+}
+
+Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) {
+  assert(arena_index != 0);
+  std::string key = "arena." + std::to_string(arena_index) + ".destroy";
+  int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to destroy jemalloc arena, error code: " +
+                              std::to_string(ret));
+  }
+  return Status::OK();
+}
+
+void JemallocNodumpAllocator::DestroyThreadSpecificCache(void* ptr) {
+  assert(ptr != nullptr);
+  unsigned* tcache_index = static_cast<unsigned*>(ptr);
+  size_t tcache_index_size = sizeof(unsigned);
+  int ret __attribute__((__unused__)) =
+      mallctl("tcache.destroy", nullptr, 0, tcache_index, tcache_index_size);
+  // Silently ignore error.
+  assert(ret == 0);
+  delete tcache_index;
+}
+
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+Status NewJemallocNodumpAllocator(
+    JemallocAllocatorOptions& options,
+    std::shared_ptr<MemoryAllocator>* memory_allocator) {
+  if (memory_allocator == nullptr) {
+    return Status::InvalidArgument("memory_allocator must be non-null.");
+  }
+#ifndef ROCKSDB_JEMALLOC
+  (void)options;
+  return Status::NotSupported("Not compiled with JEMALLOC");
+#else
+  std::unique_ptr<MemoryAllocator> allocator(
+      new JemallocNodumpAllocator(options));
+  Status s = allocator->PrepareOptions(ConfigOptions());
+  if (s.ok()) {
+    memory_allocator->reset(allocator.release());
+  }
+  return s;
+#endif
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/jemalloc_nodump_allocator.h b/src/rocksdb/memory/jemalloc_nodump_allocator.h
new file mode 100644
index 000000000..a1e1547d7
--- /dev/null
+++ b/src/rocksdb/memory/jemalloc_nodump_allocator.h
@@ -0,0 +1,94 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <vector>
+
+#include "port/jemalloc_helper.h"
+#include "port/port.h"
+#include "rocksdb/memory_allocator.h"
+#include "util/thread_local.h"
+#include "utilities/memory_allocators.h"
+
+#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX)
+
+#include <sys/mman.h>
+
+#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP)
+#define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+#endif  // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP
+#endif  // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX
+
+namespace ROCKSDB_NAMESPACE {
+class JemallocNodumpAllocator : public BaseMemoryAllocator {
+ public:
+  explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options);
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  ~JemallocNodumpAllocator();
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+  static const char* kClassName() { return "JemallocNodumpAllocator"; }
+  const char* Name() const override { return kClassName(); }
+  static bool IsSupported() {
+    std::string unused;
+    return IsSupported(&unused);
+  }
+  static bool IsSupported(std::string* why);
+  bool IsMutable() const { return arena_index_ == 0; }
+
+  Status PrepareOptions(const ConfigOptions& config_options) override;
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  void* Allocate(size_t size) override;
+  void Deallocate(void* p) override;
+  size_t UsableSize(void* p, size_t allocation_size) const override;
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+ private:
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  Status InitializeArenas();
+
+  friend Status NewJemallocNodumpAllocator(
+      JemallocAllocatorOptions& options,
+      std::shared_ptr<MemoryAllocator>* memory_allocator);
+
+  // Custom alloc hook to replace jemalloc default alloc.
+  static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
+                     size_t alignment, bool* zero, bool* commit,
+                     unsigned arena_ind);
+
+  // Destroy arena on destruction of the allocator, or on failure.
+  static Status DestroyArena(unsigned arena_index);
+
+  // Destroy tcache on destruction of the allocator, or thread exit.
+  static void DestroyThreadSpecificCache(void* ptr);
+
+  // Get or create tcache. Return flag suitable to use with `mallocx`:
+  // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc).
+  int GetThreadSpecificCache(size_t size);
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  JemallocAllocatorOptions options_;
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  // A function pointer to jemalloc default alloc. Use atomic to make sure
+  // NewJemallocNodumpAllocator is thread-safe.
+  //
+  // Hack: original_alloc_ needs to be static for Alloc() to access it.
+  // alloc needs to be static to pass to jemalloc as function pointer.
+  static std::atomic<extent_alloc_t*> original_alloc_;
+
+  // Custom hooks has to outlive corresponding arena.
+  std::unique_ptr<extent_hooks_t> arena_hooks_;
+
+  // Hold thread-local tcache index.
+  ThreadLocalPtr tcache_;
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+  // Arena index.
+  unsigned arena_index_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/memkind_kmem_allocator.cc b/src/rocksdb/memory/memkind_kmem_allocator.cc
new file mode 100644
index 000000000..635c2210e
--- /dev/null
+++ b/src/rocksdb/memory/memkind_kmem_allocator.cc
@@ -0,0 +1,44 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  Copyright (c) 2019 Intel Corporation
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef MEMKIND
+#include <memkind.h>
+#endif  // MEMKIND
+
+#include "memory/memkind_kmem_allocator.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status MemkindKmemAllocator::PrepareOptions(const ConfigOptions& options) {
+  std::string message;
+  if (!IsSupported(&message)) {
+    return Status::NotSupported(message);
+  } else {
+    return MemoryAllocator::PrepareOptions(options);
+  }
+}
+
+#ifdef MEMKIND
+void* MemkindKmemAllocator::Allocate(size_t size) {
+  void* p = memkind_malloc(MEMKIND_DAX_KMEM, size);
+  if (p == NULL) {
+    throw std::bad_alloc();
+  }
+  return p;
+}
+
+void MemkindKmemAllocator::Deallocate(void* p) {
+  memkind_free(MEMKIND_DAX_KMEM, p);
+}
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+size_t MemkindKmemAllocator::UsableSize(void* p,
+                                        size_t /*allocation_size*/) const {
+  return memkind_malloc_usable_size(MEMKIND_DAX_KMEM, p);
+}
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+#endif  // MEMKIND
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/memkind_kmem_allocator.h b/src/rocksdb/memory/memkind_kmem_allocator.h
new file mode 100644
index 000000000..7176f17e3
--- /dev/null
+++ b/src/rocksdb/memory/memkind_kmem_allocator.h
@@ -0,0 +1,43 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  Copyright (c) 2019 Intel Corporation
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/memory_allocator.h"
+#include "utilities/memory_allocators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemkindKmemAllocator : public BaseMemoryAllocator {
+ public:
+  static const char* kClassName() { return "MemkindKmemAllocator"; }
+  const char* Name() const override { return kClassName(); }
+  static bool IsSupported() {
+    std::string unused;
+    return IsSupported(&unused);
+  }
+
+  static bool IsSupported(std::string* msg) {
+#ifdef MEMKIND
+    (void)msg;
+    return true;
+#else
+    *msg = "Not compiled with MemKind";
+    return false;
+#endif
+  }
+  Status PrepareOptions(const ConfigOptions& options) override;
+
+#ifdef MEMKIND
+  void* Allocate(size_t size) override;
+  void Deallocate(void* p) override;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  size_t UsableSize(void* p, size_t /*allocation_size*/) const override;
+#endif
+#endif  // MEMKIND
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/memory_allocator.cc b/src/rocksdb/memory/memory_allocator.cc
new file mode 100644
index 000000000..34dce9bb6
--- /dev/null
+++ b/src/rocksdb/memory/memory_allocator.cc
@@ -0,0 +1,91 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/memory_allocator.h"
+
+#include "memory/jemalloc_nodump_allocator.h"
+#include "memory/memkind_kmem_allocator.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/memory_allocators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo> ma_wrapper_type_info = {
+#ifndef ROCKSDB_LITE
+    {"target", OptionTypeInfo::AsCustomSharedPtr<MemoryAllocator>(
+                   0, OptionVerificationType::kByName, OptionTypeFlags::kNone)},
+#endif  // ROCKSDB_LITE
+};
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinAllocators(ObjectLibrary& library,
+                                     const std::string& /*arg*/) {
+  library.AddFactory<MemoryAllocator>(
+      DefaultMemoryAllocator::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<MemoryAllocator>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(new DefaultMemoryAllocator());
+        return guard->get();
+      });
+  library.AddFactory<MemoryAllocator>(
+      CountedMemoryAllocator::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<MemoryAllocator>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(new CountedMemoryAllocator(
+            std::make_shared<DefaultMemoryAllocator>()));
+        return guard->get();
+      });
+  library.AddFactory<MemoryAllocator>(
+      JemallocNodumpAllocator::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<MemoryAllocator>* guard,
+         std::string* errmsg) {
+        if (JemallocNodumpAllocator::IsSupported(errmsg)) {
+          JemallocAllocatorOptions options;
+          guard->reset(new JemallocNodumpAllocator(options));
+        }
+        return guard->get();
+      });
+  library.AddFactory<MemoryAllocator>(
+      MemkindKmemAllocator::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<MemoryAllocator>* guard,
+         std::string* errmsg) {
+        if (MemkindKmemAllocator::IsSupported(errmsg)) {
+          guard->reset(new MemkindKmemAllocator());
+        }
+        return guard->get();
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+}  // namespace
+
+MemoryAllocatorWrapper::MemoryAllocatorWrapper(
+    const std::shared_ptr<MemoryAllocator>& t)
+    : target_(t) {
+  RegisterOptions("", &target_, &ma_wrapper_type_info);
+}
+
+Status MemoryAllocator::CreateFromString(
+    const ConfigOptions& options, const std::string& value,
+    std::shared_ptr<MemoryAllocator>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinAllocators(*(ObjectLibrary::Default().get()), "");
+  });
+#else
+  if (value == DefaultMemoryAllocator::kClassName()) {
+    result->reset(new DefaultMemoryAllocator());
+    return Status::OK();
+  }
+#endif  // ROCKSDB_LITE
+  ConfigOptions copy = options;
+  copy.invoke_prepare_options = true;
+  return LoadManagedObject<MemoryAllocator>(copy, value, result);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/memory_allocator.h b/src/rocksdb/memory/memory_allocator.h
new file mode 100644
index 000000000..f1a548659
--- /dev/null
+++ b/src/rocksdb/memory/memory_allocator.h
@@ -0,0 +1,38 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include "rocksdb/memory_allocator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct CustomDeleter {
+  CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {}
+
+  void operator()(char* ptr) const {
+    if (allocator) {
+      allocator->Deallocate(reinterpret_cast<void*>(ptr));
+    } else {
+      delete[] ptr;
+    }
+  }
+
+  MemoryAllocator* allocator;
+};
+
+using CacheAllocationPtr = std::unique_ptr<char[], CustomDeleter>;
+
+inline CacheAllocationPtr AllocateBlock(size_t size,
+                                        MemoryAllocator* allocator) {
+  if (allocator) {
+    auto block = reinterpret_cast<char*>(allocator->Allocate(size));
+    return CacheAllocationPtr(block, allocator);
+  }
+  return CacheAllocationPtr(new char[size]);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memory/memory_allocator_test.cc b/src/rocksdb/memory/memory_allocator_test.cc
new file mode 100644
index 000000000..6afde7165
--- /dev/null
+++ b/src/rocksdb/memory/memory_allocator_test.cc
@@ -0,0 +1,240 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  Copyright (c) 2019 Intel Corporation
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdio>
+
+#include "memory/jemalloc_nodump_allocator.h"
+#include "memory/memkind_kmem_allocator.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "test_util/testharness.h"
+#include "utilities/memory_allocators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: the tests do not work in LITE mode due to relying on
+// `CreateFromString()` to create non-default memory allocators.
+#ifndef ROCKSDB_LITE
+
+class MemoryAllocatorTest
+    : public testing::Test,
+      public ::testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+  MemoryAllocatorTest() {
+    std::tie(id_, supported_) = GetParam();
+    Status s =
+        MemoryAllocator::CreateFromString(ConfigOptions(), id_, &allocator_);
+    EXPECT_EQ(supported_, s.ok());
+  }
+  bool IsSupported() { return supported_; }
+
+  std::shared_ptr<MemoryAllocator> allocator_;
+  std::string id_;
+
+ private:
+  bool supported_;
+};
+
+TEST_P(MemoryAllocatorTest, Allocate) {
+  if (!IsSupported()) {
+    return;
+  }
+  void* p = allocator_->Allocate(1024);
+  ASSERT_NE(p, nullptr);
+  size_t size = allocator_->UsableSize(p, 1024);
+  ASSERT_GE(size, 1024);
+  allocator_->Deallocate(p);
+}
+
+TEST_P(MemoryAllocatorTest, CreateAllocator) {
+  ConfigOptions config_options;
+  config_options.ignore_unknown_options = false;
+  config_options.ignore_unsupported_options = false;
+  std::shared_ptr<MemoryAllocator> orig, copy;
+  Status s = MemoryAllocator::CreateFromString(config_options, id_, &orig);
+  if (!IsSupported()) {
+    ASSERT_TRUE(s.IsNotSupported());
+  } else {
+    ASSERT_OK(s);
+    ASSERT_NE(orig, nullptr);
+#ifndef ROCKSDB_LITE
+    std::string str = orig->ToString(config_options);
+    ASSERT_OK(MemoryAllocator::CreateFromString(config_options, str, &copy));
+    ASSERT_EQ(orig, copy);
+#endif  // ROCKSDB_LITE
+  }
+}
+
+TEST_P(MemoryAllocatorTest, DatabaseBlockCache) {
+  if (!IsSupported()) {
+    // Check if a memory node is available for allocation
+  }
+
+  // Create database with block cache using the MemoryAllocator
+  Options options;
+  std::string dbname = test::PerThreadDBPath("allocator_test");
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  options.create_if_missing = true;
+  BlockBasedTableOptions table_options;
+  auto cache = NewLRUCache(1024 * 1024, 6, false, 0.0, allocator_);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DB* db = nullptr;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+  ASSERT_LE(cache->GetUsage(), 104);  // Cache will contain stats
+
+  // Write 2kB (200 values, each 10 bytes)
+  int num_keys = 200;
+  WriteOptions wo;
+  std::string val = "0123456789";
+  for (int i = 0; i < num_keys; i++) {
+    std::string key = std::to_string(i);
+    s = db->Put(wo, Slice(key), Slice(val));
+    ASSERT_OK(s);
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));  // Flush all data from memtable so that
+                                         // reads are from block cache
+
+  // Read and check block cache usage
+  ReadOptions ro;
+  std::string result;
+  for (int i = 0; i < num_keys; i++) {
+    std::string key = std::to_string(i);
+    s = db->Get(ro, key, &result);
+    ASSERT_OK(s);
+    ASSERT_EQ(result, val);
+  }
+  ASSERT_GT(cache->GetUsage(), 2000);
+
+  // Close database
+  s = db->Close();
+  ASSERT_OK(s);
+  delete db;
+  ASSERT_OK(DestroyDB(dbname, options));
+}
+
+class CreateMemoryAllocatorTest : public testing::Test {
+ public:
+  CreateMemoryAllocatorTest() {
+    config_options_.ignore_unknown_options = false;
+    config_options_.ignore_unsupported_options = false;
+  }
+  ConfigOptions config_options_;
+};
+
+TEST_F(CreateMemoryAllocatorTest, JemallocOptionsTest) {
+  std::shared_ptr<MemoryAllocator> allocator;
+  std::string id = std::string("id=") + JemallocNodumpAllocator::kClassName();
+  Status s = MemoryAllocator::CreateFromString(config_options_, id, &allocator);
+  if (!JemallocNodumpAllocator::IsSupported()) {
+    ASSERT_NOK(s);
+    ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+    return;
+  }
+  ASSERT_OK(s);
+  ASSERT_NE(allocator, nullptr);
+  JemallocAllocatorOptions jopts;
+  auto opts = allocator->GetOptions<JemallocAllocatorOptions>();
+  ASSERT_NE(opts, nullptr);
+  ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size);
+  ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound);
+  ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound);
+
+  ASSERT_NOK(MemoryAllocator::CreateFromString(
+      config_options_,
+      id + "; limit_tcache_size=true; tcache_size_lower_bound=4096; "
+           "tcache_size_upper_bound=1024",
+      &allocator));
+  ASSERT_OK(MemoryAllocator::CreateFromString(
+      config_options_,
+      id + "; limit_tcache_size=false; tcache_size_lower_bound=4096; "
+           "tcache_size_upper_bound=1024",
+      &allocator));
+  opts = allocator->GetOptions<JemallocAllocatorOptions>();
+  ASSERT_NE(opts, nullptr);
+  ASSERT_EQ(opts->limit_tcache_size, false);
+  ASSERT_EQ(opts->tcache_size_lower_bound, 4096U);
+  ASSERT_EQ(opts->tcache_size_upper_bound, 1024U);
+  ASSERT_OK(MemoryAllocator::CreateFromString(
+      config_options_,
+      id + "; limit_tcache_size=true; tcache_size_upper_bound=4096; "
+           "tcache_size_lower_bound=1024",
+      &allocator));
+  opts = allocator->GetOptions<JemallocAllocatorOptions>();
+  ASSERT_NE(opts, nullptr);
+  ASSERT_EQ(opts->limit_tcache_size, true);
+  ASSERT_EQ(opts->tcache_size_lower_bound, 1024U);
+  ASSERT_EQ(opts->tcache_size_upper_bound, 4096U);
+}
+
+TEST_F(CreateMemoryAllocatorTest, NewJemallocNodumpAllocator) {
+  JemallocAllocatorOptions jopts;
+  std::shared_ptr<MemoryAllocator> allocator;
+
+  jopts.limit_tcache_size = true;
+  jopts.tcache_size_lower_bound = 2 * 1024;
+  jopts.tcache_size_upper_bound = 1024;
+
+  ASSERT_NOK(NewJemallocNodumpAllocator(jopts, nullptr));
+  Status s = NewJemallocNodumpAllocator(jopts, &allocator);
+  std::string msg;
+  if (!JemallocNodumpAllocator::IsSupported(&msg)) {
+    ASSERT_NOK(s);
+    ROCKSDB_GTEST_BYPASS("JEMALLOC not supported");
+    return;
+  }
+  ASSERT_NOK(s);  // Invalid options
+  ASSERT_EQ(allocator, nullptr);
+
+  jopts.tcache_size_upper_bound = 4 * 1024;
+  ASSERT_OK(NewJemallocNodumpAllocator(jopts, &allocator));
+  ASSERT_NE(allocator, nullptr);
+  auto opts = allocator->GetOptions<JemallocAllocatorOptions>();
+  ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound);
+  ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound);
+  ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size);
+
+  jopts.limit_tcache_size = false;
+  ASSERT_OK(NewJemallocNodumpAllocator(jopts, &allocator));
+  ASSERT_NE(allocator, nullptr);
+  opts = allocator->GetOptions<JemallocAllocatorOptions>();
+  ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound);
+  ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound);
+  ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size);
+}
+
+INSTANTIATE_TEST_CASE_P(DefaultMemoryAllocator, MemoryAllocatorTest,
+                        ::testing::Values(std::make_tuple(
+                            DefaultMemoryAllocator::kClassName(), true)));
+#ifdef MEMKIND
+INSTANTIATE_TEST_CASE_P(
+    MemkindkMemAllocator, MemoryAllocatorTest,
+    ::testing::Values(std::make_tuple(MemkindKmemAllocator::kClassName(),
+                                      MemkindKmemAllocator::IsSupported())));
+#endif  // MEMKIND
+
+#ifdef ROCKSDB_JEMALLOC
+INSTANTIATE_TEST_CASE_P(
+    JemallocNodumpAllocator, MemoryAllocatorTest,
+    ::testing::Values(std::make_tuple(JemallocNodumpAllocator::kClassName(),
+                                      JemallocNodumpAllocator::IsSupported())));
+#endif  // ROCKSDB_JEMALLOC
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/memory/memory_usage.h b/src/rocksdb/memory/memory_usage.h
new file mode 100644
index 000000000..76b9bd130
--- /dev/null
+++ b/src/rocksdb/memory/memory_usage.h
@@ -0,0 +1,38 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <unordered_map>
+#ifdef USE_FOLLY
+#include <folly/container/F14Map.h>
+#endif
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper methods to estimate memroy usage by std containers.
+
+template <class Key, class Value, class Hash>
+size_t ApproximateMemoryUsage(
+    const std::unordered_map<Key, Value, Hash>& umap) {
+  using Map = std::unordered_map<Key, Value, Hash>;
+  return sizeof(umap) +
+         // Size of all items plus a next pointer for each item.
+         (sizeof(typename Map::value_type) + sizeof(void*)) * umap.size() +
+         // Size of hash buckets.
+         umap.bucket_count() * sizeof(void*);
+}
+
+#ifdef USE_FOLLY
+template <class Key, class Value, class Hash>
+size_t ApproximateMemoryUsage(const folly::F14FastMap<Key, Value, Hash>& umap) {
+  return sizeof(umap) + umap.getAllocatedMemorySize();
+}
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memtable/alloc_tracker.cc b/src/rocksdb/memtable/alloc_tracker.cc
new file mode 100644
index 000000000..4c6d35431
--- /dev/null
+++ b/src/rocksdb/memtable/alloc_tracker.cc
@@ -0,0 +1,63 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <assert.h>
+
+#include "memory/allocator.h"
+#include "memory/arena.h"
+#include "rocksdb/write_buffer_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+AllocTracker::AllocTracker(WriteBufferManager* write_buffer_manager)
+    : write_buffer_manager_(write_buffer_manager),
+      bytes_allocated_(0),
+      done_allocating_(false),
+      freed_(false) {}
+
+AllocTracker::~AllocTracker() { FreeMem(); }
+
+void AllocTracker::Allocate(size_t bytes) {
+  assert(write_buffer_manager_ != nullptr);
+  if (write_buffer_manager_->enabled() ||
+      write_buffer_manager_->cost_to_cache()) {
+    bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed);
+    write_buffer_manager_->ReserveMem(bytes);
+  }
+}
+
+void AllocTracker::DoneAllocating() {
+  if (write_buffer_manager_ != nullptr && !done_allocating_) {
+    if (write_buffer_manager_->enabled() ||
+        write_buffer_manager_->cost_to_cache()) {
+      write_buffer_manager_->ScheduleFreeMem(
+          bytes_allocated_.load(std::memory_order_relaxed));
+    } else {
+      assert(bytes_allocated_.load(std::memory_order_relaxed) == 0);
+    }
+    done_allocating_ = true;
+  }
+}
+
+void AllocTracker::FreeMem() {
+  if (!done_allocating_) {
+    DoneAllocating();
+  }
+  if (write_buffer_manager_ != nullptr && !freed_) {
+    if (write_buffer_manager_->enabled() ||
+        write_buffer_manager_->cost_to_cache()) {
+      write_buffer_manager_->FreeMem(
+          bytes_allocated_.load(std::memory_order_relaxed));
+    } else {
+      assert(bytes_allocated_.load(std::memory_order_relaxed) == 0);
+    }
+    freed_ = true;
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memtable/hash_linklist_rep.cc b/src/rocksdb/memtable/hash_linklist_rep.cc
new file mode 100644
index 000000000..a71768304
--- /dev/null
+++ b/src/rocksdb/memtable/hash_linklist_rep.cc
@@ -0,0 +1,926 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+
+#include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+using Key = const char*;
+using MemtableSkipList = SkipList<Key, const MemTableRep::KeyComparator&>;
+using Pointer = std::atomic<void*>;
+
+// A data structure used as the header of a link list of a hash bucket.
+struct BucketHeader {
+  Pointer next;
+  std::atomic<uint32_t> num_entries;
+
+  explicit BucketHeader(void* n, uint32_t count)
+      : next(n), num_entries(count) {}
+
+  bool IsSkipListBucket() {
+    return next.load(std::memory_order_relaxed) == this;
+  }
+
+  uint32_t GetNumEntries() const {
+    return num_entries.load(std::memory_order_relaxed);
+  }
+
+  // REQUIRES: called from single-threaded Insert()
+  void IncNumEntries() {
+    // Only one thread can do write at one time. No need to do atomic
+    // incremental. Update it with relaxed load and store.
+    num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed);
+  }
+};
+
+// A data structure used as the header of a skip list of a hash bucket.
+struct SkipListBucketHeader {
+  BucketHeader Counting_header;
+  MemtableSkipList skip_list;
+
+  explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp,
+                                Allocator* allocator, uint32_t count)
+      : Counting_header(this,  // Pointing to itself to indicate header type.
+                        count),
+        skip_list(cmp, allocator) {}
+};
+
+struct Node {
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next() {
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return next_.load(std::memory_order_acquire);
+  }
+  void SetNext(Node* x) {
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_.store(x, std::memory_order_release);
+  }
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next() { return next_.load(std::memory_order_relaxed); }
+
+  void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); }
+
+  // Needed for placement new below which is fine
+  Node() {}
+
+ private:
+  std::atomic<Node*> next_;
+
+  // Prohibit copying due to the below
+  Node(const Node&) = delete;
+  Node& operator=(const Node&) = delete;
+
+ public:
+  char key[1];
+};
+
+// Memory structure of the mem table:
+// It is a hash table, each bucket points to one entry, a linked list or a
+// skip list. In order to track total number of records in a bucket to determine
+// whether should switch to skip list, a header is added just to indicate
+// number of entries in the bucket.
+//
+//
+//          +-----> NULL    Case 1. Empty bucket
+//          |
+//          |
+//          | +---> +-------+
+//          | |     | Next  +--> NULL
+//          | |     +-------+
+//  +-----+ | |     |       |  Case 2. One Entry in bucket.
+//  |     +-+ |     | Data  |          next pointer points to
+//  +-----+   |     |       |          NULL. All other cases
+//  |     |   |     |       |          next pointer is not NULL.
+//  +-----+   |     +-------+
+//  |     +---+
+//  +-----+     +-> +-------+  +> +-------+  +-> +-------+
+//  |     |     |   | Next  +--+  | Next  +--+   | Next  +-->NULL
+//  +-----+     |   +-------+     +-------+      +-------+
+//  |     +-----+   | Count |     |       |      |       |
+//  +-----+         +-------+     | Data  |      | Data  |
+//  |     |                       |       |      |       |
+//  +-----+          Case 3.      |       |      |       |
+//  |     |          A header     +-------+      +-------+
+//  +-----+          points to
+//  |     |          a linked list. Count indicates total number
+//  +-----+          of rows in this bucket.
+//  |     |
+//  +-----+    +-> +-------+ <--+
+//  |     |    |   | Next  +----+
+//  +-----+    |   +-------+   Case 4. A header points to a skip
+//  |     +----+   | Count |           list and next pointer points to
+//  +-----+        +-------+           itself, to distinguish case 3 or 4.
+//  |     |        |       |           Count still is kept to indicates total
+//  +-----+        | Skip +-->         of entries in the bucket for debugging
+//  |     |        | List  |   Data    purpose.
+//  |     |        |      +-->
+//  +-----+        |       |
+//  |     |        +-------+
+//  +-----+
+//
+// We don't have data race when changing cases because:
+// (1) When changing from case 2->3, we create a new bucket header, put the
+//     single node there first without changing the original node, and do a
+//     release store when changing the bucket pointer. In that case, a reader
+//     who sees a stale value of the bucket pointer will read this node, while
+//     a reader sees the correct value because of the release store.
+// (2) When changing case 3->4, a new header is created with skip list points
+//     to the data, before doing an acquire store to change the bucket pointer.
+//     The old header and nodes are never changed, so any reader sees any
+//     of those existing pointers will guarantee to be able to iterate to the
+//     end of the linked list.
+// (3) Header's next pointer in case 3 might change, but they are never equal
+//     to itself, so no matter a reader sees any stale or newer value, it will
+//     be able to correctly distinguish case 3 and 4.
+//
+// The reason that we use case 2 is we want to make the format to be efficient
+// when the utilization of buckets is relatively low. If we use case 3 for
+// single entry bucket, we will need to waste 12 bytes for every entry,
+// which can be significant decrease of memory utilization.
+class HashLinkListRep : public MemTableRep {
+ public:
+  HashLinkListRep(const MemTableRep::KeyComparator& compare,
+                  Allocator* allocator, const SliceTransform* transform,
+                  size_t bucket_size, uint32_t threshold_use_skiplist,
+                  size_t huge_page_tlb_size, Logger* logger,
+                  int bucket_entries_logging_threshold,
+                  bool if_log_bucket_dist_when_flash);
+
+  KeyHandle Allocate(const size_t len, char** buf) override;
+
+  void Insert(KeyHandle handle) override;
+
+  bool Contains(const char* key) const override;
+
+  size_t ApproximateMemoryUsage() override;
+
+  void Get(const LookupKey& k, void* callback_args,
+           bool (*callback_func)(void* arg, const char* entry)) override;
+
+  ~HashLinkListRep() override;
+
+  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;
+
+  MemTableRep::Iterator* GetDynamicPrefixIterator(
+      Arena* arena = nullptr) override;
+
+ private:
+  friend class DynamicIterator;
+
+  size_t bucket_size_;
+
+  // Maps slices (which are transformed user keys) to buckets of keys sharing
+  // the same transform.
+  Pointer* buckets_;
+
+  const uint32_t threshold_use_skiplist_;
+
+  // The user-supplied transform whose domain is the user keys.
+  const SliceTransform* transform_;
+
+  const MemTableRep::KeyComparator& compare_;
+
+  Logger* logger_;
+  int bucket_entries_logging_threshold_;
+  bool if_log_bucket_dist_when_flash_;
+
+  bool LinkListContains(Node* head, const Slice& key) const;
+
+  bool IsEmptyBucket(Pointer& bucket_pointer) const {
+    return bucket_pointer.load(std::memory_order_acquire) == nullptr;
+  }
+
+  // Precondition: GetLinkListFirstNode() must have been called first and return
+  // null so that it must be a skip list bucket
+  SkipListBucketHeader* GetSkipListBucketHeader(Pointer& bucket_pointer) const;
+
+  // Returning nullptr indicates it is a skip list bucket.
+  Node* GetLinkListFirstNode(Pointer& bucket_pointer) const;
+
+  Slice GetPrefix(const Slice& internal_key) const {
+    return transform_->Transform(ExtractUserKey(internal_key));
+  }
+
+  size_t GetHash(const Slice& slice) const {
+    return GetSliceRangedNPHash(slice, bucket_size_);
+  }
+
+  Pointer& GetBucket(size_t i) const { return buckets_[i]; }
+
+  Pointer& GetBucket(const Slice& slice) const {
+    return GetBucket(GetHash(slice));
+  }
+
+  bool Equal(const Slice& a, const Key& b) const {
+    return (compare_(b, a) == 0);
+  }
+
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
+    // nullptr n is considered infinite
+    return (n != nullptr) && (compare_(n->key, internal_key) < 0);
+  }
+
+  bool KeyIsAfterNode(const Key& key, const Node* n) const {
+    // nullptr n is considered infinite
+    return (n != nullptr) && (compare_(n->key, key) < 0);
+  }
+
+  bool KeyIsAfterOrAtNode(const Slice& internal_key, const Node* n) const {
+    // nullptr n is considered infinite
+    return (n != nullptr) && (compare_(n->key, internal_key) <= 0);
+  }
+
+  bool KeyIsAfterOrAtNode(const Key& key, const Node* n) const {
+    // nullptr n is considered infinite
+    return (n != nullptr) && (compare_(n->key, key) <= 0);
+  }
+
+  Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;
+  Node* FindLessOrEqualInBucket(Node* head, const Slice& key) const;
+
+  class FullListIterator : public MemTableRep::Iterator {
+   public:
+    explicit FullListIterator(MemtableSkipList* list, Allocator* allocator)
+        : iter_(list), full_list_(list), allocator_(allocator) {}
+
+    ~FullListIterator() override {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const override { return iter_.Valid(); }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const char* key() const override {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next() override {
+      assert(Valid());
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev() override {
+      assert(Valid());
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Slice& internal_key, const char* memtable_key) override {
+      const char* encoded_key = (memtable_key != nullptr)
+                                    ? memtable_key
+                                    : EncodeKey(&tmp_, internal_key);
+      iter_.Seek(encoded_key);
+    }
+
+    // Retreat to the last entry with a key <= target
+    void SeekForPrev(const Slice& internal_key,
+                     const char* memtable_key) override {
+      const char* encoded_key = (memtable_key != nullptr)
+                                    ? memtable_key
+                                    : EncodeKey(&tmp_, internal_key);
+      iter_.SeekForPrev(encoded_key);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToFirst() override { iter_.SeekToFirst(); }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToLast() override { iter_.SeekToLast(); }
+
+   private:
+    MemtableSkipList::Iterator iter_;
+    // To destruct with the iterator.
+    std::unique_ptr<MemtableSkipList> full_list_;
+    std::unique_ptr<Allocator> allocator_;
+    std::string tmp_;  // For passing to EncodeKey
+  };
+
+  class LinkListIterator : public MemTableRep::Iterator {
+   public:
+    explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep,
+                              Node* head)
+        : hash_link_list_rep_(hash_link_list_rep),
+          head_(head),
+          node_(nullptr) {}
+
+    ~LinkListIterator() override {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const override { return node_ != nullptr; }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const char* key() const override {
+      assert(Valid());
+      return node_->key;
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next() override {
+      assert(Valid());
+      node_ = node_->Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev() override {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Slice& internal_key,
+              const char* /*memtable_key*/) override {
+      node_ =
+          hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, internal_key);
+    }
+
+    // Retreat to the last entry with a key <= target
+    void SeekForPrev(const Slice& /*internal_key*/,
+                     const char* /*memtable_key*/) override {
+      // Since we do not support Prev()
+      // We simply do not support SeekForPrev
+      Reset(nullptr);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToFirst() override {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToLast() override {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+   protected:
+    void Reset(Node* head) {
+      head_ = head;
+      node_ = nullptr;
+    }
+
+   private:
+    friend class HashLinkListRep;
+    const HashLinkListRep* const hash_link_list_rep_;
+    Node* head_;
+    Node* node_;
+
+    virtual void SeekToHead() { node_ = head_; }
+  };
+
+  class DynamicIterator : public HashLinkListRep::LinkListIterator {
+   public:
+    explicit DynamicIterator(HashLinkListRep& memtable_rep)
+        : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr),
+          memtable_rep_(memtable_rep) {}
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Slice& k, const char* memtable_key) override {
+      auto transformed = memtable_rep_.GetPrefix(k);
+      Pointer& bucket = memtable_rep_.GetBucket(transformed);
+
+      if (memtable_rep_.IsEmptyBucket(bucket)) {
+        skip_list_iter_.reset();
+        Reset(nullptr);
+      } else {
+        Node* first_linked_list_node =
+            memtable_rep_.GetLinkListFirstNode(bucket);
+        if (first_linked_list_node != nullptr) {
+          // The bucket is organized as a linked list
+          skip_list_iter_.reset();
+          Reset(first_linked_list_node);
+          HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
+
+        } else {
+          SkipListBucketHeader* skip_list_header =
+              memtable_rep_.GetSkipListBucketHeader(bucket);
+          assert(skip_list_header != nullptr);
+          // The bucket is organized as a skip list
+          if (!skip_list_iter_) {
+            skip_list_iter_.reset(
+                new MemtableSkipList::Iterator(&skip_list_header->skip_list));
+          } else {
+            skip_list_iter_->SetList(&skip_list_header->skip_list);
+          }
+          if (memtable_key != nullptr) {
+            skip_list_iter_->Seek(memtable_key);
+          } else {
+            IterKey encoded_key;
+            encoded_key.EncodeLengthPrefixedKey(k);
+            skip_list_iter_->Seek(encoded_key.GetUserKey().data());
+          }
+        }
+      }
+    }
+
+    bool Valid() const override {
+      if (skip_list_iter_) {
+        return skip_list_iter_->Valid();
+      }
+      return HashLinkListRep::LinkListIterator::Valid();
+    }
+
+    const char* key() const override {
+      if (skip_list_iter_) {
+        return skip_list_iter_->key();
+      }
+      return HashLinkListRep::LinkListIterator::key();
+    }
+
+    void Next() override {
+      if (skip_list_iter_) {
+        skip_list_iter_->Next();
+      } else {
+        HashLinkListRep::LinkListIterator::Next();
+      }
+    }
+
+   private:
+    // the underlying memtable
+    const HashLinkListRep& memtable_rep_;
+    std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_;
+  };
+
+  class EmptyIterator : public MemTableRep::Iterator {
+    // This is used when there wasn't a bucket. It is cheaper than
+    // instantiating an empty bucket over which to iterate.
+   public:
+    EmptyIterator() {}
+    bool Valid() const override { return false; }
+    const char* key() const override {
+      assert(false);
+      return nullptr;
+    }
+    void Next() override {}
+    void Prev() override {}
+    void Seek(const Slice& /*user_key*/,
+              const char* /*memtable_key*/) override {}
+    void SeekForPrev(const Slice& /*user_key*/,
+                     const char* /*memtable_key*/) override {}
+    void SeekToFirst() override {}
+    void SeekToLast() override {}
+
+   private:
+  };
+};
+
+HashLinkListRep::HashLinkListRep(
+    const MemTableRep::KeyComparator& compare, Allocator* allocator,
+    const SliceTransform* transform, size_t bucket_size,
+    uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger,
+    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash)
+    : MemTableRep(allocator),
+      bucket_size_(bucket_size),
+      // Threshold to use skip list doesn't make sense if less than 3, so we
+      // force it to be minimum of 3 to simplify implementation.
+      threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)),
+      transform_(transform),
+      compare_(compare),
+      logger_(logger),
+      bucket_entries_logging_threshold_(bucket_entries_logging_threshold),
+      if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {
+  char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size,
+                                          huge_page_tlb_size, logger);
+
+  buckets_ = new (mem) Pointer[bucket_size];
+
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    buckets_[i].store(nullptr, std::memory_order_relaxed);
+  }
+}
+
+HashLinkListRep::~HashLinkListRep() {}
+
+KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
+  char* mem = allocator_->AllocateAligned(sizeof(Node) + len);
+  Node* x = new (mem) Node();
+  *buf = x->key;
+  return static_cast<void*>(x);
+}
+
+SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
+    Pointer& bucket_pointer) const {
+  Pointer* first_next_pointer =
+      static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
+  assert(first_next_pointer != nullptr);
+  assert(first_next_pointer->load(std::memory_order_relaxed) != nullptr);
+
+  // Counting header
+  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+  assert(header->IsSkipListBucket());
+  assert(header->GetNumEntries() > threshold_use_skiplist_);
+  auto* skip_list_bucket_header =
+      reinterpret_cast<SkipListBucketHeader*>(header);
+  assert(skip_list_bucket_header->Counting_header.next.load(
+             std::memory_order_relaxed) == header);
+  return skip_list_bucket_header;
+}
+
+Node* HashLinkListRep::GetLinkListFirstNode(Pointer& bucket_pointer) const {
+  Pointer* first_next_pointer =
+      static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
+  assert(first_next_pointer != nullptr);
+  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
+    // Single entry bucket
+    return reinterpret_cast<Node*>(first_next_pointer);
+  }
+
+  // It is possible that after we fetch first_next_pointer it is modified
+  // and the next is not null anymore. In this case, the bucket should have been
+  // modified to a counting header, so we should reload the first_next_pointer
+  // to make sure we see the update.
+  first_next_pointer =
+      static_cast<Pointer*>(bucket_pointer.load(std::memory_order_acquire));
+  // Counting header
+  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+  if (!header->IsSkipListBucket()) {
+    assert(header->GetNumEntries() <= threshold_use_skiplist_);
+    return reinterpret_cast<Node*>(
+        header->next.load(std::memory_order_acquire));
+  }
+  assert(header->GetNumEntries() > threshold_use_skiplist_);
+  return nullptr;
+}
+
+void HashLinkListRep::Insert(KeyHandle handle) {
+  Node* x = static_cast<Node*>(handle);
+  assert(!Contains(x->key));
+  Slice internal_key = GetLengthPrefixedSlice(x->key);
+  auto transformed = GetPrefix(internal_key);
+  auto& bucket = buckets_[GetHash(transformed)];
+  Pointer* first_next_pointer =
+      static_cast<Pointer*>(bucket.load(std::memory_order_relaxed));
+
+  if (first_next_pointer == nullptr) {
+    // Case 1. empty bucket
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(nullptr);
+    bucket.store(x, std::memory_order_release);
+    return;
+  }
+
+  BucketHeader* header = nullptr;
+  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
+    // Case 2. only one entry in the bucket
+    // Need to convert to a Counting bucket and turn to case 4.
+    Node* first = reinterpret_cast<Node*>(first_next_pointer);
+    // Need to add a bucket header.
+    // We have to first convert it to a bucket with header before inserting
+    // the new node. Otherwise, we might need to change next pointer of first.
+    // In that case, a reader might sees the next pointer is NULL and wrongly
+    // think the node is a bucket header.
+    auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader));
+    header = new (mem) BucketHeader(first, 1);
+    bucket.store(header, std::memory_order_release);
+  } else {
+    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
+    if (header->IsSkipListBucket()) {
+      // Case 4. Bucket is already a skip list
+      assert(header->GetNumEntries() > threshold_use_skiplist_);
+      auto* skip_list_bucket_header =
+          reinterpret_cast<SkipListBucketHeader*>(header);
+      // Only one thread can execute Insert() at one time. No need to do atomic
+      // incremental.
+      skip_list_bucket_header->Counting_header.IncNumEntries();
+      skip_list_bucket_header->skip_list.Insert(x->key);
+      return;
+    }
+  }
+
+  if (bucket_entries_logging_threshold_ > 0 &&
+      header->GetNumEntries() ==
+          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
+    Info(logger_,
+         "HashLinkedList bucket %" ROCKSDB_PRIszt
+         " has more than %d "
+         "entries. Key to insert: %s",
+         GetHash(transformed), header->GetNumEntries(),
+         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
+  }
+
+  if (header->GetNumEntries() == threshold_use_skiplist_) {
+    // Case 3. number of entries reaches the threshold so need to convert to
+    // skip list.
+    LinkListIterator bucket_iter(
+        this, reinterpret_cast<Node*>(
+                  first_next_pointer->load(std::memory_order_relaxed)));
+    auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader));
+    SkipListBucketHeader* new_skip_list_header = new (mem)
+        SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1);
+    auto& skip_list = new_skip_list_header->skip_list;
+
+    // Add all current entries to the skip list
+    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
+      skip_list.Insert(bucket_iter.key());
+    }
+
+    // insert the new entry
+    skip_list.Insert(x->key);
+    // Set the bucket
+    bucket.store(new_skip_list_header, std::memory_order_release);
+  } else {
+    // Case 5. Need to insert to the sorted linked list without changing the
+    // header.
+    Node* first =
+        reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed));
+    assert(first != nullptr);
+    // Advance counter unless the bucket needs to be advanced to skip list.
+    // In that case, we need to make sure the previous count never exceeds
+    // threshold_use_skiplist_ to avoid readers to cast to wrong format.
+    header->IncNumEntries();
+
+    Node* cur = first;
+    Node* prev = nullptr;
+    while (true) {
+      if (cur == nullptr) {
+        break;
+      }
+      Node* next = cur->Next();
+      // Make sure the lists are sorted.
+      // If x points to head_ or next points nullptr, it is trivially satisfied.
+      assert((cur == first) || (next == nullptr) ||
+             KeyIsAfterNode(next->key, cur));
+      if (KeyIsAfterNode(internal_key, cur)) {
+        // Keep searching in this list
+        prev = cur;
+        cur = next;
+      } else {
+        break;
+      }
+    }
+
+    // Our data structure does not allow duplicate insertion
+    assert(cur == nullptr || !Equal(x->key, cur->key));
+
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(cur);
+
+    if (prev) {
+      prev->SetNext(x);
+    } else {
+      header->next.store(static_cast<void*>(x), std::memory_order_release);
+    }
+  }
+}
+
+bool HashLinkListRep::Contains(const char* key) const {
+  Slice internal_key = GetLengthPrefixedSlice(key);
+
+  auto transformed = GetPrefix(internal_key);
+  Pointer& bucket = GetBucket(transformed);
+  if (IsEmptyBucket(bucket)) {
+    return false;
+  }
+
+  Node* linked_list_node = GetLinkListFirstNode(bucket);
+  if (linked_list_node != nullptr) {
+    return LinkListContains(linked_list_node, internal_key);
+  }
+
+  SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
+  if (skip_list_header != nullptr) {
+    return skip_list_header->skip_list.Contains(key);
+  }
+  return false;
+}
+
+size_t HashLinkListRep::ApproximateMemoryUsage() {
+  // Memory is always allocated from the allocator.
+  return 0;
+}
+
+void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
+                          bool (*callback_func)(void* arg, const char* entry)) {
+  auto transformed = transform_->Transform(k.user_key());
+  Pointer& bucket = GetBucket(transformed);
+
+  if (IsEmptyBucket(bucket)) {
+    return;
+  }
+
+  auto* link_list_head = GetLinkListFirstNode(bucket);
+  if (link_list_head != nullptr) {
+    LinkListIterator iter(this, link_list_head);
+    for (iter.Seek(k.internal_key(), nullptr);
+         iter.Valid() && callback_func(callback_args, iter.key());
+         iter.Next()) {
+    }
+  } else {
+    auto* skip_list_header = GetSkipListBucketHeader(bucket);
+    if (skip_list_header != nullptr) {
+      // Is a skip list
+      MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
+      for (iter.Seek(k.memtable_key().data());
+           iter.Valid() && callback_func(callback_args, iter.key());
+           iter.Next()) {
+      }
+    }
+  }
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
+  // allocate a new arena of similar size to the one currently in use
+  Arena* new_arena = new Arena(allocator_->BlockSize());
+  auto list = new MemtableSkipList(compare_, new_arena);
+  HistogramImpl keys_per_bucket_hist;
+
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    int count = 0;
+    Pointer& bucket = GetBucket(i);
+    if (!IsEmptyBucket(bucket)) {
+      auto* link_list_head = GetLinkListFirstNode(bucket);
+      if (link_list_head != nullptr) {
+        LinkListIterator itr(this, link_list_head);
+        for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
+          list->Insert(itr.key());
+          count++;
+        }
+      } else {
+        auto* skip_list_header = GetSkipListBucketHeader(bucket);
+        assert(skip_list_header != nullptr);
+        // Is a skip list
+        MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
+        for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
+          list->Insert(itr.key());
+          count++;
+        }
+      }
+    }
+    if (if_log_bucket_dist_when_flash_) {
+      keys_per_bucket_hist.Add(count);
+    }
+  }
+  if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) {
+    Info(logger_, "hashLinkedList Entry distribution among buckets: %s",
+         keys_per_bucket_hist.ToString().c_str());
+  }
+
+  if (alloc_arena == nullptr) {
+    return new FullListIterator(list, new_arena);
+  } else {
+    auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator));
+    return new (mem) FullListIterator(list, new_arena);
+  }
+}
+
+MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator(
+    Arena* alloc_arena) {
+  if (alloc_arena == nullptr) {
+    return new DynamicIterator(*this);
+  } else {
+    auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator));
+    return new (mem) DynamicIterator(*this);
+  }
+}
+
+bool HashLinkListRep::LinkListContains(Node* head,
+                                       const Slice& user_key) const {
+  Node* x = FindGreaterOrEqualInBucket(head, user_key);
+  return (x != nullptr && Equal(user_key, x->key));
+}
+
+Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
+                                                  const Slice& key) const {
+  Node* x = head;
+  while (true) {
+    if (x == nullptr) {
+      return x;
+    }
+    Node* next = x->Next();
+    // Make sure the lists are sorted.
+    // If x points to head_ or next points nullptr, it is trivially satisfied.
+    assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x));
+    if (KeyIsAfterNode(key, x)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      break;
+    }
+  }
+  return x;
+}
+
+struct HashLinkListRepOptions {
+  static const char* kName() { return "HashLinkListRepFactoryOptions"; }
+  size_t bucket_count;
+  uint32_t threshold_use_skiplist;
+  size_t huge_page_tlb_size;
+  int bucket_entries_logging_threshold;
+  bool if_log_bucket_dist_when_flash;
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> hash_linklist_info = {
+    {"bucket_count",
+     {offsetof(struct HashLinkListRepOptions, bucket_count), OptionType::kSizeT,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"threshold",
+     {offsetof(struct HashLinkListRepOptions, threshold_use_skiplist),
+      OptionType::kUInt32T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"huge_page_size",
+     {offsetof(struct HashLinkListRepOptions, huge_page_tlb_size),
+      OptionType::kSizeT, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"logging_threshold",
+     {offsetof(struct HashLinkListRepOptions, bucket_entries_logging_threshold),
+      OptionType::kInt, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"log_when_flash",
+     {offsetof(struct HashLinkListRepOptions, if_log_bucket_dist_when_flash),
+      OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+};
+
+class HashLinkListRepFactory : public MemTableRepFactory {
+ public:
+  explicit HashLinkListRepFactory(size_t bucket_count,
+                                  uint32_t threshold_use_skiplist,
+                                  size_t huge_page_tlb_size,
+                                  int bucket_entries_logging_threshold,
+                                  bool if_log_bucket_dist_when_flash) {
+    options_.bucket_count = bucket_count;
+    options_.threshold_use_skiplist = threshold_use_skiplist;
+    options_.huge_page_tlb_size = huge_page_tlb_size;
+    options_.bucket_entries_logging_threshold =
+        bucket_entries_logging_threshold;
+    options_.if_log_bucket_dist_when_flash = if_log_bucket_dist_when_flash;
+    RegisterOptions(&options_, &hash_linklist_info);
+  }
+
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& compare, Allocator* allocator,
+      const SliceTransform* transform, Logger* logger) override;
+
+  static const char* kClassName() { return "HashLinkListRepFactory"; }
+  static const char* kNickName() { return "hash_linkedlist"; }
+  virtual const char* Name() const override { return kClassName(); }
+  virtual const char* NickName() const override { return kNickName(); }
+
+ private:
+  HashLinkListRepOptions options_;
+};
+
+}  // namespace
+
+MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Allocator* allocator,
+    const SliceTransform* transform, Logger* logger) {
+  return new HashLinkListRep(
+      compare, allocator, transform, options_.bucket_count,
+      options_.threshold_use_skiplist, options_.huge_page_tlb_size, logger,
+      options_.bucket_entries_logging_threshold,
+      options_.if_log_bucket_dist_when_flash);
+}
+
+MemTableRepFactory* NewHashLinkListRepFactory(
+    size_t bucket_count, size_t huge_page_tlb_size,
+    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash,
+    uint32_t threshold_use_skiplist) {
+  return new HashLinkListRepFactory(
+      bucket_count, threshold_use_skiplist, huge_page_tlb_size,
+      bucket_entries_logging_threshold, if_log_bucket_dist_when_flash);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/memtable/hash_skiplist_rep.cc b/src/rocksdb/memtable/hash_skiplist_rep.cc
new file mode 100644
index 000000000..9d093829b
--- /dev/null
+++ b/src/rocksdb/memtable/hash_skiplist_rep.cc
@@ -0,0 +1,393 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+#include <atomic>
+
+#include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "port/port.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/murmurhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+class HashSkipListRep : public MemTableRep {
+ public:
+  HashSkipListRep(const MemTableRep::KeyComparator& compare,
+                  Allocator* allocator, const SliceTransform* transform,
+                  size_t bucket_size, int32_t skiplist_height,
+                  int32_t skiplist_branching_factor);
+
+  void Insert(KeyHandle handle) override;
+
+  bool Contains(const char* key) const override;
+
+  size_t ApproximateMemoryUsage() override;
+
+  void Get(const LookupKey& k, void* callback_args,
+           bool (*callback_func)(void* arg, const char* entry)) override;
+
+  ~HashSkipListRep() override;
+
+  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;
+
+  MemTableRep::Iterator* GetDynamicPrefixIterator(
+      Arena* arena = nullptr) override;
+
+ private:
+  friend class DynamicIterator;
+  using Bucket = SkipList<const char*, const MemTableRep::KeyComparator&>;
+
+  size_t bucket_size_;
+
+  const int32_t skiplist_height_;
+  const int32_t skiplist_branching_factor_;
+
+  // Maps slices (which are transformed user keys) to buckets of keys sharing
+  // the same transform.
+  std::atomic<Bucket*>* buckets_;
+
+  // The user-supplied transform whose domain is the user keys.
+  const SliceTransform* transform_;
+
+  const MemTableRep::KeyComparator& compare_;
+  // immutable after construction
+  Allocator* const allocator_;
+
+  inline size_t GetHash(const Slice& slice) const {
+    return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0) %
+           bucket_size_;
+  }
+  inline Bucket* GetBucket(size_t i) const {
+    return buckets_[i].load(std::memory_order_acquire);
+  }
+  inline Bucket* GetBucket(const Slice& slice) const {
+    return GetBucket(GetHash(slice));
+  }
+  // Get a bucket from buckets_. If the bucket hasn't been initialized yet,
+  // initialize it before returning.
+  Bucket* GetInitializedBucket(const Slice& transformed);
+
+  class Iterator : public MemTableRep::Iterator {
+   public:
+    explicit Iterator(Bucket* list, bool own_list = true,
+                      Arena* arena = nullptr)
+        : list_(list), iter_(list), own_list_(own_list), arena_(arena) {}
+
+    ~Iterator() override {
+      // if we own the list, we should also delete it
+      if (own_list_) {
+        assert(list_ != nullptr);
+        delete list_;
+      }
+    }
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const override { return list_ != nullptr && iter_.Valid(); }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const char* key() const override {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next() override {
+      assert(Valid());
+      iter_.Next();
+    }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev() override {
+      assert(Valid());
+      iter_.Prev();
+    }
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Slice& internal_key, const char* memtable_key) override {
+      if (list_ != nullptr) {
+        const char* encoded_key = (memtable_key != nullptr)
+                                      ? memtable_key
+                                      : EncodeKey(&tmp_, internal_key);
+        iter_.Seek(encoded_key);
+      }
+    }
+
+    // Retreat to the last entry with a key <= target
+    void SeekForPrev(const Slice& /*internal_key*/,
+                     const char* /*memtable_key*/) override {
+      // not supported
+      assert(false);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToFirst() override {
+      if (list_ != nullptr) {
+        iter_.SeekToFirst();
+      }
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToLast() override {
+      if (list_ != nullptr) {
+        iter_.SeekToLast();
+      }
+    }
+
+   protected:
+    void Reset(Bucket* list) {
+      if (own_list_) {
+        assert(list_ != nullptr);
+        delete list_;
+      }
+      list_ = list;
+      iter_.SetList(list);
+      own_list_ = false;
+    }
+
+   private:
+    // if list_ is nullptr, we should NEVER call any methods on iter_
+    // if list_ is nullptr, this Iterator is not Valid()
+    Bucket* list_;
+    Bucket::Iterator iter_;
+    // here we track if we own list_. If we own it, we are also
+    // responsible for it's cleaning. This is a poor man's std::shared_ptr
+    bool own_list_;
+    std::unique_ptr<Arena> arena_;
+    std::string tmp_;  // For passing to EncodeKey
+  };
+
+  class DynamicIterator : public HashSkipListRep::Iterator {
+   public:
+    explicit DynamicIterator(const HashSkipListRep& memtable_rep)
+        : HashSkipListRep::Iterator(nullptr, false),
+          memtable_rep_(memtable_rep) {}
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Slice& k, const char* memtable_key) override {
+      auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k));
+      Reset(memtable_rep_.GetBucket(transformed));
+      HashSkipListRep::Iterator::Seek(k, memtable_key);
+    }
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToFirst() override {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToLast() override {
+      // Prefix iterator does not support total order.
+      // We simply set the iterator to invalid state
+      Reset(nullptr);
+    }
+
+   private:
+    // the underlying memtable
+    const HashSkipListRep& memtable_rep_;
+  };
+
+  class EmptyIterator : public MemTableRep::Iterator {
+    // This is used when there wasn't a bucket. It is cheaper than
+    // instantiating an empty bucket over which to iterate.
+   public:
+    EmptyIterator() {}
+    bool Valid() const override { return false; }
+    const char* key() const override {
+      assert(false);
+      return nullptr;
+    }
+    void Next() override {}
+    void Prev() override {}
+    void Seek(const Slice& /*internal_key*/,
+              const char* /*memtable_key*/) override {}
+    void SeekForPrev(const Slice& /*internal_key*/,
+                     const char* /*memtable_key*/) override {}
+    void SeekToFirst() override {}
+    void SeekToLast() override {}
+
+   private:
+  };
+};
+
+HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare,
+                                 Allocator* allocator,
+                                 const SliceTransform* transform,
+                                 size_t bucket_size, int32_t skiplist_height,
+                                 int32_t skiplist_branching_factor)
+    : MemTableRep(allocator),
+      bucket_size_(bucket_size),
+      skiplist_height_(skiplist_height),
+      skiplist_branching_factor_(skiplist_branching_factor),
+      transform_(transform),
+      compare_(compare),
+      allocator_(allocator) {
+  auto mem =
+      allocator->AllocateAligned(sizeof(std::atomic<void*>) * bucket_size);
+  buckets_ = new (mem) std::atomic<Bucket*>[bucket_size];
+
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    buckets_[i].store(nullptr, std::memory_order_relaxed);
+  }
+}
+
+HashSkipListRep::~HashSkipListRep() {}
+
+HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
+    const Slice& transformed) {
+  size_t hash = GetHash(transformed);
+  auto bucket = GetBucket(hash);
+  if (bucket == nullptr) {
+    auto addr = allocator_->AllocateAligned(sizeof(Bucket));
+    bucket = new (addr) Bucket(compare_, allocator_, skiplist_height_,
+                               skiplist_branching_factor_);
+    buckets_[hash].store(bucket, std::memory_order_release);
+  }
+  return bucket;
+}
+
+void HashSkipListRep::Insert(KeyHandle handle) {
+  auto* key = static_cast<char*>(handle);
+  assert(!Contains(key));
+  auto transformed = transform_->Transform(UserKey(key));
+  auto bucket = GetInitializedBucket(transformed);
+  bucket->Insert(key);
+}
+
+bool HashSkipListRep::Contains(const char* key) const {
+  auto transformed = transform_->Transform(UserKey(key));
+  auto bucket = GetBucket(transformed);
+  if (bucket == nullptr) {
+    return false;
+  }
+  return bucket->Contains(key);
+}
+
+size_t HashSkipListRep::ApproximateMemoryUsage() { return 0; }
+
+void HashSkipListRep::Get(const LookupKey& k, void* callback_args,
+                          bool (*callback_func)(void* arg, const char* entry)) {
+  auto transformed = transform_->Transform(k.user_key());
+  auto bucket = GetBucket(transformed);
+  if (bucket != nullptr) {
+    Bucket::Iterator iter(bucket);
+    for (iter.Seek(k.memtable_key().data());
+         iter.Valid() && callback_func(callback_args, iter.key());
+         iter.Next()) {
+    }
+  }
+}
+
+MemTableRep::Iterator* HashSkipListRep::GetIterator(Arena* arena) {
+  // allocate a new arena of similar size to the one currently in use
+  Arena* new_arena = new Arena(allocator_->BlockSize());
+  auto list = new Bucket(compare_, new_arena);
+  for (size_t i = 0; i < bucket_size_; ++i) {
+    auto bucket = GetBucket(i);
+    if (bucket != nullptr) {
+      Bucket::Iterator itr(bucket);
+      for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
+        list->Insert(itr.key());
+      }
+    }
+  }
+  if (arena == nullptr) {
+    return new Iterator(list, true, new_arena);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(Iterator));
+    return new (mem) Iterator(list, true, new_arena);
+  }
+}
+
+MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator(Arena* arena) {
+  if (arena == nullptr) {
+    return new DynamicIterator(*this);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(DynamicIterator));
+    return new (mem) DynamicIterator(*this);
+  }
+}
+
+struct HashSkipListRepOptions {
+  static const char* kName() { return "HashSkipListRepFactoryOptions"; }
+  size_t bucket_count;
+  int32_t skiplist_height;
+  int32_t skiplist_branching_factor;
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> hash_skiplist_info = {
+    {"bucket_count",
+     {offsetof(struct HashSkipListRepOptions, bucket_count), OptionType::kSizeT,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"skiplist_height",
+     {offsetof(struct HashSkipListRepOptions, skiplist_height),
+      OptionType::kInt32T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"branching_factor",
+     {offsetof(struct HashSkipListRepOptions, skiplist_branching_factor),
+      OptionType::kInt32T, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+};
+
+class HashSkipListRepFactory : public MemTableRepFactory {
+ public:
+  explicit HashSkipListRepFactory(size_t bucket_count, int32_t skiplist_height,
+                                  int32_t skiplist_branching_factor) {
+    options_.bucket_count = bucket_count;
+    options_.skiplist_height = skiplist_height;
+    options_.skiplist_branching_factor = skiplist_branching_factor;
+    RegisterOptions(&options_, &hash_skiplist_info);
+  }
+
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& compare, Allocator* allocator,
+      const SliceTransform* transform, Logger* logger) override;
+
+  static const char* kClassName() { return "HashSkipListRepFactory"; }
+  static const char* kNickName() { return "prefix_hash"; }
+
+  virtual const char* Name() const override { return kClassName(); }
+  virtual const char* NickName() const override { return kNickName(); }
+
+ private:
+  HashSkipListRepOptions options_;
+};
+
+}  // namespace
+
+MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Allocator* allocator,
+    const SliceTransform* transform, Logger* /*logger*/) {
+  return new HashSkipListRep(compare, allocator, transform,
+                             options_.bucket_count, options_.skiplist_height,
+                             options_.skiplist_branching_factor);
+}
+
+MemTableRepFactory* NewHashSkipListRepFactory(
+    size_t bucket_count, int32_t skiplist_height,
+    int32_t skiplist_branching_factor) {
+  return new HashSkipListRepFactory(bucket_count, skiplist_height,
+                                    skiplist_branching_factor);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/memtable/inlineskiplist.h b/src/rocksdb/memtable/inlineskiplist.h
new file mode 100644
index 000000000..abb3c3ddb
--- /dev/null
+++ b/src/rocksdb/memtable/inlineskiplist.h
@@ -0,0 +1,1051 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.  Use of
+// this source code is governed by a BSD-style license that can be found
+// in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// InlineSkipList is derived from SkipList (skiplist.h), but it optimizes
+// the memory layout by requiring that the key storage be allocated through
+// the skip list instance.  For the common case of SkipList<const char*,
+// Cmp> this saves 1 pointer per skip list node and gives better cache
+// locality, at the expense of wasted padding from using AllocateAligned
+// instead of Allocate for the keys.  The unused padding will be from
+// 0 to sizeof(void*)-1 bytes, and the space savings are sizeof(void*)
+// bytes, so despite the padding the space used is always less than
+// SkipList<const char*, ..>.
+//
+// Thread safety -------------
+//
+// Writes via Insert require external synchronization, most likely a mutex.
+// InsertConcurrently can be safely called concurrently with reads and
+// with other concurrent inserts.  Reads require a guarantee that the
+// InlineSkipList will not be destroyed while the read is in progress.
+// Apart from that, reads progress without any internal locking or
+// synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the InlineSkipList is
+// destroyed.  This is trivially guaranteed by the code since we never
+// delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the InlineSkipList.
+// Only Insert() modifies the list, and it is careful to initialize a
+// node and use release-stores to publish the nodes in one or more lists.
+//
+// ... prev vs. next pointer ordering ...
+//
+
+#pragma once
+#include <assert.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <atomic>
+#include <type_traits>
+
+#include "memory/allocator.h"
+#include "port/likely.h"
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class Comparator>
+class InlineSkipList {
+ private:
+  struct Node;
+  struct Splice;
+
+ public:
+  using DecodedKey =
+      typename std::remove_reference<Comparator>::type::DecodedType;
+
+  static const uint16_t kMaxPossibleHeight = 32;
+
+  // Create a new InlineSkipList object that will use "cmp" for comparing
+  // keys, and will allocate memory using "*allocator".  Objects allocated
+  // in the allocator must remain allocated for the lifetime of the
+  // skiplist object.
+  explicit InlineSkipList(Comparator cmp, Allocator* allocator,
+                          int32_t max_height = 12,
+                          int32_t branching_factor = 4);
+  // No copying allowed
+  InlineSkipList(const InlineSkipList&) = delete;
+  InlineSkipList& operator=(const InlineSkipList&) = delete;
+
+  // Allocates a key and a skip-list node, returning a pointer to the key
+  // portion of the node.  This method is thread-safe if the allocator
+  // is thread-safe.
+  char* AllocateKey(size_t key_size);
+
+  // Allocate a splice using allocator.
+  Splice* AllocateSplice();
+
+  // Allocate a splice on heap.
+  Splice* AllocateSpliceOnHeap();
+
+  // Inserts a key allocated by AllocateKey, after the actual key value
+  // has been filled in.
+  //
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  // REQUIRES: no concurrent calls to any of inserts.
+  bool Insert(const char* key);
+
+  // Inserts a key allocated by AllocateKey with a hint of last insert
+  // position in the skip-list. If hint points to nullptr, a new hint will be
+  // populated, which can be used in subsequent calls.
+  //
+  // It can be used to optimize the workload where there are multiple groups
+  // of keys, and each key is likely to insert to a location close to the last
+  // inserted key in the same group. One example is sequential inserts.
+  //
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  // REQUIRES: no concurrent calls to any of inserts.
+  bool InsertWithHint(const char* key, void** hint);
+
+  // Like InsertConcurrently, but with a hint
+  //
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  // REQUIRES: no concurrent calls that use same hint
+  bool InsertWithHintConcurrently(const char* key, void** hint);
+
+  // Like Insert, but external synchronization is not required.
+  bool InsertConcurrently(const char* key);
+
+  // Inserts a node into the skip list.  key must have been allocated by
+  // AllocateKey and then filled in by the caller.  If UseCAS is true,
+  // then external synchronization is not required, otherwise this method
+  // may not be called concurrently with any other insertions.
+  //
+  // Regardless of whether UseCAS is true, the splice must be owned
+  // exclusively by the current thread.  If allow_partial_splice_fix is
+  // true, then the cost of insertion is amortized O(log D), where D is
+  // the distance from the splice to the inserted key (measured as the
+  // number of intervening nodes).  Note that this bound is very good for
+  // sequential insertions!  If allow_partial_splice_fix is false then
+  // the existing splice will be ignored unless the current key is being
+  // inserted immediately after the splice.  allow_partial_splice_fix ==
+  // false has worse running time for the non-sequential case O(log N),
+  // but a better constant factor.
+  template <bool UseCAS>
+  bool Insert(const char* key, Splice* splice, bool allow_partial_splice_fix);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const char* key) const;
+
+  // Return estimated number of entries smaller than `key`.
+  uint64_t EstimateCount(const char* key) const;
+
+  // Validate correctness of the skip-list.
+  void TEST_Validate() const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const InlineSkipList* list);
+
+    // Change the underlying skiplist used for this iterator
+    // This enables us not changing the iterator without deallocating
+    // an old one and then allocating a new one
+    void SetList(const InlineSkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const char* key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const char* target);
+
+    // Retreat to the last entry with a key <= target
+    void SeekForPrev(const char* target);
+
+    // Advance to a random entry in the list.
+    void RandomSeek();
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const InlineSkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  const uint16_t kMaxHeight_;
+  const uint16_t kBranching_;
+  const uint32_t kScaledInverseBranching_;
+
+  Allocator* const allocator_;  // Allocator used for allocations of nodes
+  // Immutable after construction
+  Comparator const compare_;
+  Node* const head_;
+
+  // Modified only by Insert().  Read racily by readers, but stale
+  // values are ok.
+  std::atomic<int> max_height_;  // Height of the entire list
+
+  // seq_splice_ is a Splice used for insertions in the non-concurrent
+  // case.  It caches the prev and next found during the most recent
+  // non-concurrent insertion.
+  Splice* seq_splice_;
+
+  inline int GetMaxHeight() const {
+    return max_height_.load(std::memory_order_relaxed);
+  }
+
+  int RandomHeight();
+
+  Node* AllocateNode(size_t key_size, int height);
+
+  bool Equal(const char* a, const char* b) const {
+    return (compare_(a, b) == 0);
+  }
+
+  bool LessThan(const char* a, const char* b) const {
+    return (compare_(a, b) < 0);
+  }
+
+  // Return true if key is greater than the data stored in "n".  Null n
+  // is considered infinite.  n should not be head_.
+  bool KeyIsAfterNode(const char* key, Node* n) const;
+  bool KeyIsAfterNode(const DecodedKey& key, Node* n) const;
+
+  // Returns the earliest node with a key >= key.
+  // Return nullptr if there is no such node.
+  Node* FindGreaterOrEqual(const char* key) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  // Fills prev[level] with pointer to previous node at "level" for every
+  // level in [0..max_height_-1], if prev is non-null.
+  Node* FindLessThan(const char* key, Node** prev = nullptr) const;
+
+  // Return the latest node with a key < key on bottom_level. Start searching
+  // from root node on the level below top_level.
+  // Fills prev[level] with pointer to previous node at "level" for every
+  // level in [bottom_level..top_level-1], if prev is non-null.
+  Node* FindLessThan(const char* key, Node** prev, Node* root, int top_level,
+                     int bottom_level) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+
+  // Returns a random entry.
+  Node* FindRandomEntry() const;
+
+  // Traverses a single level of the list, setting *out_prev to the last
+  // node before the key and *out_next to the first node after. Assumes
+  // that the key is not present in the skip list. On entry, before should
+  // point to a node that is before the key, and after should point to
+  // a node that is after the key.  after should be nullptr if a good after
+  // node isn't conveniently available.
+  template <bool prefetch_before>
+  void FindSpliceForLevel(const DecodedKey& key, Node* before, Node* after,
+                          int level, Node** out_prev, Node** out_next);
+
+  // Recomputes Splice levels from highest_level (inclusive) down to
+  // lowest_level (inclusive).
+  void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice,
+                             int recompute_level);
+};
+
+// Implementation details follow
+
+template <class Comparator>
+struct InlineSkipList<Comparator>::Splice {
+  // The invariant of a Splice is that prev_[i+1].key <= prev_[i].key <
+  // next_[i].key <= next_[i+1].key for all i.  That means that if a
+  // key is bracketed by prev_[i] and next_[i] then it is bracketed by
+  // all higher levels.  It is _not_ required that prev_[i]->Next(i) ==
+  // next_[i] (it probably did at some point in the past, but intervening
+  // or concurrent operations might have inserted nodes in between).
+  int height_ = 0;
+  Node** prev_;
+  Node** next_;
+};
+
+// The Node data type is more of a pointer into custom-managed memory than
+// a traditional C++ struct.  The key is stored in the bytes immediately
+// after the struct, and the next_ pointers for nodes with height > 1 are
+// stored immediately _before_ the struct.  This avoids the need to include
+// any pointer or sizing data, which reduces per-node memory overheads.
+template <class Comparator>
+struct InlineSkipList<Comparator>::Node {
+  // Stores the height of the node in the memory location normally used for
+  // next_[0].  This is used for passing data from AllocateKey to Insert.
+  void StashHeight(const int height) {
+    assert(sizeof(int) <= sizeof(next_[0]));
+    memcpy(static_cast<void*>(&next_[0]), &height, sizeof(int));
+  }
+
+  // Retrieves the value passed to StashHeight.  Undefined after a call
+  // to SetNext or NoBarrier_SetNext.
+  int UnstashHeight() const {
+    int rv;
+    memcpy(&rv, &next_[0], sizeof(int));
+    return rv;
+  }
+
+  const char* Key() const { return reinterpret_cast<const char*>(&next_[1]); }
+
+  // Accessors/mutators for links.  Wrapped in methods so we can add
+  // the appropriate barriers as necessary, and perform the necessary
+  // addressing trickery for storing links below the Node in memory.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return ((&next_[0] - n)->load(std::memory_order_acquire));
+  }
+
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    (&next_[0] - n)->store(x, std::memory_order_release);
+  }
+
+  bool CASNext(int n, Node* expected, Node* x) {
+    assert(n >= 0);
+    return (&next_[0] - n)->compare_exchange_strong(expected, x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return (&next_[0] - n)->load(std::memory_order_relaxed);
+  }
+
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    (&next_[0] - n)->store(x, std::memory_order_relaxed);
+  }
+
+  // Insert node after prev on specific level.
+  void InsertAfter(Node* prev, int level) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "this" in prev.
+    NoBarrier_SetNext(level, prev->NoBarrier_Next(level));
+    prev->SetNext(level, this);
+  }
+
+ private:
+  // next_[0] is the lowest level link (level 0).  Higher levels are
+  // stored _earlier_, so level 1 is at next_[-1].
+  std::atomic<Node*> next_[1];
+};
+
+template <class Comparator>
+inline InlineSkipList<Comparator>::Iterator::Iterator(
+    const InlineSkipList* list) {
+  SetList(list);
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::SetList(
+    const InlineSkipList* list) {
+  list_ = list;
+  node_ = nullptr;
+}
+
+template <class Comparator>
+inline bool InlineSkipList<Comparator>::Iterator::Valid() const {
+  return node_ != nullptr;
+}
+
+template <class Comparator>
+inline const char* InlineSkipList<Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->Key();
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->Key());
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
+  node_ = list_->FindGreaterOrEqual(target);
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::SeekForPrev(
+    const char* target) {
+  Seek(target);
+  if (!Valid()) {
+    SeekToLast();
+  }
+  while (Valid() && list_->LessThan(target, key())) {
+    Prev();
+  }
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::RandomSeek() {
+  node_ = list_->FindRandomEntry();
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template <class Comparator>
+inline void InlineSkipList<Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template <class Comparator>
+int InlineSkipList<Comparator>::RandomHeight() {
+  auto rnd = Random::GetTLSInstance();
+
+  // Increase height with probability 1 in kBranching
+  int height = 1;
+  while (height < kMaxHeight_ && height < kMaxPossibleHeight &&
+         rnd->Next() < kScaledInverseBranching_) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight_);
+  assert(height <= kMaxPossibleHeight);
+  return height;
+}
+
+template <class Comparator>
+bool InlineSkipList<Comparator>::KeyIsAfterNode(const char* key,
+                                                Node* n) const {
+  // nullptr n is considered infinite
+  assert(n != head_);
+  return (n != nullptr) && (compare_(n->Key(), key) < 0);
+}
+
+template <class Comparator>
+bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key,
+                                                Node* n) const {
+  // nullptr n is considered infinite
+  assert(n != head_);
+  return (n != nullptr) && (compare_(n->Key(), key) < 0);
+}
+
+template <class Comparator>
+typename InlineSkipList<Comparator>::Node*
+InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
+  // Note: It looks like we could reduce duplication by implementing
+  // this function as FindLessThan(key)->Next(0), but we wouldn't be able
+  // to exit early on equality and the result wouldn't even be correct.
+  // A concurrent insert might occur after FindLessThan(key) but before
+  // we get a chance to call Next(0).
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  Node* last_bigger = nullptr;
+  const DecodedKey key_decoded = compare_.decode_key(key);
+  while (true) {
+    Node* next = x->Next(level);
+    if (next != nullptr) {
+      PREFETCH(next->Next(level), 0, 1);
+    }
+    // Make sure the lists are sorted
+    assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
+    // Make sure we haven't overshot during our search
+    assert(x == head_ || KeyIsAfterNode(key_decoded, x));
+    int cmp = (next == nullptr || next == last_bigger)
+                  ? 1
+                  : compare_(next->Key(), key_decoded);
+    if (cmp == 0 || (cmp > 0 && level == 0)) {
+      return next;
+    } else if (cmp < 0) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      // Switch to next list, reuse compare_() result
+      last_bigger = next;
+      level--;
+    }
+  }
+}
+
+template <class Comparator>
+typename InlineSkipList<Comparator>::Node*
+InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev) const {
+  return FindLessThan(key, prev, head_, GetMaxHeight(), 0);
+}
+
+template <class Comparator>
+typename InlineSkipList<Comparator>::Node*
+InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
+                                         Node* root, int top_level,
+                                         int bottom_level) const {
+  assert(top_level > bottom_level);
+  int level = top_level - 1;
+  Node* x = root;
+  // KeyIsAfter(key, last_not_after) is definitely false
+  Node* last_not_after = nullptr;
+  const DecodedKey key_decoded = compare_.decode_key(key);
+  while (true) {
+    assert(x != nullptr);
+    Node* next = x->Next(level);
+    if (next != nullptr) {
+      PREFETCH(next->Next(level), 0, 1);
+    }
+    assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
+    assert(x == head_ || KeyIsAfterNode(key_decoded, x));
+    if (next != last_not_after && KeyIsAfterNode(key_decoded, next)) {
+      // Keep searching in this list
+      assert(next != nullptr);
+      x = next;
+    } else {
+      if (prev != nullptr) {
+        prev[level] = x;
+      }
+      if (level == bottom_level) {
+        return x;
+      } else {
+        // Switch to next list, reuse KeyIsAfterNode() result
+        last_not_after = next;
+        level--;
+      }
+    }
+  }
+}
+
+template <class Comparator>
+typename InlineSkipList<Comparator>::Node*
+InlineSkipList<Comparator>::FindLast() const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == nullptr) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template <class Comparator>
+typename InlineSkipList<Comparator>::Node*
+InlineSkipList<Comparator>::FindRandomEntry() const {
+  // TODO(bjlemaire): consider adding PREFETCH calls.
+  Node *x = head_, *scan_node = nullptr, *limit_node = nullptr;
+
+  // We start at the max level.
+  // FOr each level, we look at all the nodes at the level, and
+  // we randomly pick one of them. Then decrement the level
+  // and reiterate the process.
+  // eg: assume GetMaxHeight()=5, and there are #100 elements (nodes).
+  // level 4 nodes: lvl_nodes={#1, #15, #67, #84}. Randomly pick #15.
+  // We will consider all the nodes between #15 (inclusive) and #67
+  // (exclusive). #67 is called 'limit_node' here.
+  // level 3 nodes: lvl_nodes={#15, #21, #45, #51}. Randomly choose
+  // #51. #67 remains 'limit_node'.
+  // [...]
+  // level 0 nodes: lvl_nodes={#56,#57,#58,#59}. Randomly pick $57.
+  // Return Node #57.
+  std::vector<Node*> lvl_nodes;
+  Random* rnd = Random::GetTLSInstance();
+  int level = GetMaxHeight() - 1;
+
+  while (level >= 0) {
+    lvl_nodes.clear();
+    scan_node = x;
+    while (scan_node != limit_node) {
+      lvl_nodes.push_back(scan_node);
+      scan_node = scan_node->Next(level);
+    }
+    uint32_t rnd_idx = rnd->Next() % lvl_nodes.size();
+    x = lvl_nodes[rnd_idx];
+    if (rnd_idx + 1 < lvl_nodes.size()) {
+      limit_node = lvl_nodes[rnd_idx + 1];
+    }
+    level--;
+  }
+  // There is a special case where x could still be the head_
+  // (note that the head_ contains no key).
+  return x == head_ && head_ != nullptr ? head_->Next(0) : x;
+}
+
+template <class Comparator>
+uint64_t InlineSkipList<Comparator>::EstimateCount(const char* key) const {
+  uint64_t count = 0;
+
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  const DecodedKey key_decoded = compare_.decode_key(key);
+  while (true) {
+    assert(x == head_ || compare_(x->Key(), key_decoded) < 0);
+    Node* next = x->Next(level);
+    if (next != nullptr) {
+      PREFETCH(next->Next(level), 0, 1);
+    }
+    if (next == nullptr || compare_(next->Key(), key_decoded) >= 0) {
+      if (level == 0) {
+        return count;
+      } else {
+        // Switch to next list
+        count *= kBranching_;
+        level--;
+      }
+    } else {
+      x = next;
+      count++;
+    }
+  }
+}
+
+template <class Comparator>
+InlineSkipList<Comparator>::InlineSkipList(const Comparator cmp,
+                                           Allocator* allocator,
+                                           int32_t max_height,
+                                           int32_t branching_factor)
+    : kMaxHeight_(static_cast<uint16_t>(max_height)),
+      kBranching_(static_cast<uint16_t>(branching_factor)),
+      kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_),
+      allocator_(allocator),
+      compare_(cmp),
+      head_(AllocateNode(0, max_height)),
+      max_height_(1),
+      seq_splice_(AllocateSplice()) {
+  assert(max_height > 0 && kMaxHeight_ == static_cast<uint32_t>(max_height));
+  assert(branching_factor > 1 &&
+         kBranching_ == static_cast<uint32_t>(branching_factor));
+  assert(kScaledInverseBranching_ > 0);
+
+  for (int i = 0; i < kMaxHeight_; ++i) {
+    head_->SetNext(i, nullptr);
+  }
+}
+
+template <class Comparator>
+char* InlineSkipList<Comparator>::AllocateKey(size_t key_size) {
+  return const_cast<char*>(AllocateNode(key_size, RandomHeight())->Key());
+}
+
+template <class Comparator>
+typename InlineSkipList<Comparator>::Node*
+InlineSkipList<Comparator>::AllocateNode(size_t key_size, int height) {
+  auto prefix = sizeof(std::atomic<Node*>) * (height - 1);
+
+  // prefix is space for the height - 1 pointers that we store before
+  // the Node instance (next_[-(height - 1) .. -1]).  Node starts at
+  // raw + prefix, and holds the bottom-mode (level 0) skip list pointer
+  // next_[0].  key_size is the bytes for the key, which comes just after
+  // the Node.
+  char* raw = allocator_->AllocateAligned(prefix + sizeof(Node) + key_size);
+  Node* x = reinterpret_cast<Node*>(raw + prefix);
+
+  // Once we've linked the node into the skip list we don't actually need
+  // to know its height, because we can implicitly use the fact that we
+  // traversed into a node at level h to known that h is a valid level
+  // for that node.  We need to convey the height to the Insert step,
+  // however, so that it can perform the proper links.  Since we're not
+  // using the pointers at the moment, StashHeight temporarily borrow
+  // storage from next_[0] for that purpose.
+  x->StashHeight(height);
+  return x;
+}
+
+template <class Comparator>
+typename InlineSkipList<Comparator>::Splice*
+InlineSkipList<Comparator>::AllocateSplice() {
+  // size of prev_ and next_
+  size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1);
+  char* raw = allocator_->AllocateAligned(sizeof(Splice) + array_size * 2);
+  Splice* splice = reinterpret_cast<Splice*>(raw);
+  splice->height_ = 0;
+  splice->prev_ = reinterpret_cast<Node**>(raw + sizeof(Splice));
+  splice->next_ = reinterpret_cast<Node**>(raw + sizeof(Splice) + array_size);
+  return splice;
+}
+
+template <class Comparator>
+typename InlineSkipList<Comparator>::Splice*
+InlineSkipList<Comparator>::AllocateSpliceOnHeap() {
+  size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1);
+  char* raw = new char[sizeof(Splice) + array_size * 2];
+  Splice* splice = reinterpret_cast<Splice*>(raw);
+  splice->height_ = 0;
+  splice->prev_ = reinterpret_cast<Node**>(raw + sizeof(Splice));
+  splice->next_ = reinterpret_cast<Node**>(raw + sizeof(Splice) + array_size);
+  return splice;
+}
+
+template <class Comparator>
+bool InlineSkipList<Comparator>::Insert(const char* key) {
+  return Insert<false>(key, seq_splice_, false);
+}
+
+template <class Comparator>
+bool InlineSkipList<Comparator>::InsertConcurrently(const char* key) {
+  Node* prev[kMaxPossibleHeight];
+  Node* next[kMaxPossibleHeight];
+  Splice splice;
+  splice.prev_ = prev;
+  splice.next_ = next;
+  return Insert<true>(key, &splice, false);
+}
+
+template <class Comparator>
+bool InlineSkipList<Comparator>::InsertWithHint(const char* key, void** hint) {
+  assert(hint != nullptr);
+  Splice* splice = reinterpret_cast<Splice*>(*hint);
+  if (splice == nullptr) {
+    splice = AllocateSplice();
+    *hint = reinterpret_cast<void*>(splice);
+  }
+  return Insert<false>(key, splice, true);
+}
+
+template <class Comparator>
+bool InlineSkipList<Comparator>::InsertWithHintConcurrently(const char* key,
+                                                            void** hint) {
+  assert(hint != nullptr);
+  Splice* splice = reinterpret_cast<Splice*>(*hint);
+  if (splice == nullptr) {
+    splice = AllocateSpliceOnHeap();
+    *hint = reinterpret_cast<void*>(splice);
+  }
+  return Insert<true>(key, splice, true);
+}
+
+template <class Comparator>
+template <bool prefetch_before>
+void InlineSkipList<Comparator>::FindSpliceForLevel(const DecodedKey& key,
+                                                    Node* before, Node* after,
+                                                    int level, Node** out_prev,
+                                                    Node** out_next) {
+  while (true) {
+    Node* next = before->Next(level);
+    if (next != nullptr) {
+      PREFETCH(next->Next(level), 0, 1);
+    }
+    if (prefetch_before == true) {
+      if (next != nullptr && level > 0) {
+        PREFETCH(next->Next(level - 1), 0, 1);
+      }
+    }
+    assert(before == head_ || next == nullptr ||
+           KeyIsAfterNode(next->Key(), before));
+    assert(before == head_ || KeyIsAfterNode(key, before));
+    if (next == after || !KeyIsAfterNode(key, next)) {
+      // found it
+      *out_prev = before;
+      *out_next = next;
+      return;
+    }
+    before = next;
+  }
+}
+
+template <class Comparator>
+void InlineSkipList<Comparator>::RecomputeSpliceLevels(const DecodedKey& key,
+                                                       Splice* splice,
+                                                       int recompute_level) {
+  assert(recompute_level > 0);
+  assert(recompute_level <= splice->height_);
+  for (int i = recompute_level - 1; i >= 0; --i) {
+    FindSpliceForLevel<true>(key, splice->prev_[i + 1], splice->next_[i + 1], i,
+                             &splice->prev_[i], &splice->next_[i]);
+  }
+}
+
+template <class Comparator>
+template <bool UseCAS>
+bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
+                                        bool allow_partial_splice_fix) {
+  Node* x = reinterpret_cast<Node*>(const_cast<char*>(key)) - 1;
+  const DecodedKey key_decoded = compare_.decode_key(key);
+  int height = x->UnstashHeight();
+  assert(height >= 1 && height <= kMaxHeight_);
+
+  int max_height = max_height_.load(std::memory_order_relaxed);
+  while (height > max_height) {
+    if (max_height_.compare_exchange_weak(max_height, height)) {
+      // successfully updated it
+      max_height = height;
+      break;
+    }
+    // else retry, possibly exiting the loop because somebody else
+    // increased it
+  }
+  assert(max_height <= kMaxPossibleHeight);
+
+  int recompute_height = 0;
+  if (splice->height_ < max_height) {
+    // Either splice has never been used or max_height has grown since
+    // last use.  We could potentially fix it in the latter case, but
+    // that is tricky.
+    splice->prev_[max_height] = head_;
+    splice->next_[max_height] = nullptr;
+    splice->height_ = max_height;
+    recompute_height = max_height;
+  } else {
+    // Splice is a valid proper-height splice that brackets some
+    // key, but does it bracket this one?  We need to validate it and
+    // recompute a portion of the splice (levels 0..recompute_height-1)
+    // that is a superset of all levels that don't bracket the new key.
+    // Several choices are reasonable, because we have to balance the work
+    // saved against the extra comparisons required to validate the Splice.
+    //
+    // One strategy is just to recompute all of orig_splice_height if the
+    // bottom level isn't bracketing.  This pessimistically assumes that
+    // we will either get a perfect Splice hit (increasing sequential
+    // inserts) or have no locality.
+    //
+    // Another strategy is to walk up the Splice's levels until we find
+    // a level that brackets the key.  This strategy lets the Splice
+    // hint help for other cases: it turns insertion from O(log N) into
+    // O(log D), where D is the number of nodes in between the key that
+    // produced the Splice and the current insert (insertion is aided
+    // whether the new key is before or after the splice).  If you have
+    // a way of using a prefix of the key to map directly to the closest
+    // Splice out of O(sqrt(N)) Splices and we make it so that splices
+    // can also be used as hints during read, then we end up with Oshman's
+    // and Shavit's SkipTrie, which has O(log log N) lookup and insertion
+    // (compare to O(log N) for skip list).
+    //
+    // We control the pessimistic strategy with allow_partial_splice_fix.
+    // A good strategy is probably to be pessimistic for seq_splice_,
+    // optimistic if the caller actually went to the work of providing
+    // a Splice.
+    while (recompute_height < max_height) {
+      if (splice->prev_[recompute_height]->Next(recompute_height) !=
+          splice->next_[recompute_height]) {
+        // splice isn't tight at this level, there must have been some inserts
+        // to this
+        // location that didn't update the splice.  We might only be a little
+        // stale, but if
+        // the splice is very stale it would be O(N) to fix it.  We haven't used
+        // up any of
+        // our budget of comparisons, so always move up even if we are
+        // pessimistic about
+        // our chances of success.
+        ++recompute_height;
+      } else if (splice->prev_[recompute_height] != head_ &&
+                 !KeyIsAfterNode(key_decoded,
+                                 splice->prev_[recompute_height])) {
+        // key is from before splice
+        if (allow_partial_splice_fix) {
+          // skip all levels with the same node without more comparisons
+          Node* bad = splice->prev_[recompute_height];
+          while (splice->prev_[recompute_height] == bad) {
+            ++recompute_height;
+          }
+        } else {
+          // we're pessimistic, recompute everything
+          recompute_height = max_height;
+        }
+      } else if (KeyIsAfterNode(key_decoded, splice->next_[recompute_height])) {
+        // key is from after splice
+        if (allow_partial_splice_fix) {
+          Node* bad = splice->next_[recompute_height];
+          while (splice->next_[recompute_height] == bad) {
+            ++recompute_height;
+          }
+        } else {
+          recompute_height = max_height;
+        }
+      } else {
+        // this level brackets the key, we won!
+        break;
+      }
+    }
+  }
+  assert(recompute_height <= max_height);
+  if (recompute_height > 0) {
+    RecomputeSpliceLevels(key_decoded, splice, recompute_height);
+  }
+
+  bool splice_is_valid = true;
+  if (UseCAS) {
+    for (int i = 0; i < height; ++i) {
+      while (true) {
+        // Checking for duplicate keys on the level 0 is sufficient
+        if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
+                     compare_(x->Key(), splice->next_[i]->Key()) >= 0)) {
+          // duplicate key
+          return false;
+        }
+        if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
+                     compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) {
+          // duplicate key
+          return false;
+        }
+        assert(splice->next_[i] == nullptr ||
+               compare_(x->Key(), splice->next_[i]->Key()) < 0);
+        assert(splice->prev_[i] == head_ ||
+               compare_(splice->prev_[i]->Key(), x->Key()) < 0);
+        x->NoBarrier_SetNext(i, splice->next_[i]);
+        if (splice->prev_[i]->CASNext(i, splice->next_[i], x)) {
+          // success
+          break;
+        }
+        // CAS failed, we need to recompute prev and next. It is unlikely
+        // to be helpful to try to use a different level as we redo the
+        // search, because it should be unlikely that lots of nodes have
+        // been inserted between prev[i] and next[i]. No point in using
+        // next[i] as the after hint, because we know it is stale.
+        FindSpliceForLevel<false>(key_decoded, splice->prev_[i], nullptr, i,
+                                  &splice->prev_[i], &splice->next_[i]);
+
+        // Since we've narrowed the bracket for level i, we might have
+        // violated the Splice constraint between i and i-1.  Make sure
+        // we recompute the whole thing next time.
+        if (i > 0) {
+          splice_is_valid = false;
+        }
+      }
+    }
+  } else {
+    for (int i = 0; i < height; ++i) {
+      if (i >= recompute_height &&
+          splice->prev_[i]->Next(i) != splice->next_[i]) {
+        FindSpliceForLevel<false>(key_decoded, splice->prev_[i], nullptr, i,
+                                  &splice->prev_[i], &splice->next_[i]);
+      }
+      // Checking for duplicate keys on the level 0 is sufficient
+      if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
+                   compare_(x->Key(), splice->next_[i]->Key()) >= 0)) {
+        // duplicate key
+        return false;
+      }
+      if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
+                   compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) {
+        // duplicate key
+        return false;
+      }
+      assert(splice->next_[i] == nullptr ||
+             compare_(x->Key(), splice->next_[i]->Key()) < 0);
+      assert(splice->prev_[i] == head_ ||
+             compare_(splice->prev_[i]->Key(), x->Key()) < 0);
+      assert(splice->prev_[i]->Next(i) == splice->next_[i]);
+      x->NoBarrier_SetNext(i, splice->next_[i]);
+      splice->prev_[i]->SetNext(i, x);
+    }
+  }
+  if (splice_is_valid) {
+    for (int i = 0; i < height; ++i) {
+      splice->prev_[i] = x;
+    }
+    assert(splice->prev_[splice->height_] == head_);
+    assert(splice->next_[splice->height_] == nullptr);
+    for (int i = 0; i < splice->height_; ++i) {
+      assert(splice->next_[i] == nullptr ||
+             compare_(key, splice->next_[i]->Key()) < 0);
+      assert(splice->prev_[i] == head_ ||
+             compare_(splice->prev_[i]->Key(), key) <= 0);
+      assert(splice->prev_[i + 1] == splice->prev_[i] ||
+             splice->prev_[i + 1] == head_ ||
+             compare_(splice->prev_[i + 1]->Key(), splice->prev_[i]->Key()) <
+                 0);
+      assert(splice->next_[i + 1] == splice->next_[i] ||
+             splice->next_[i + 1] == nullptr ||
+             compare_(splice->next_[i]->Key(), splice->next_[i + 1]->Key()) <
+                 0);
+    }
+  } else {
+    splice->height_ = 0;
+  }
+  return true;
+}
+
+template <class Comparator>
+bool InlineSkipList<Comparator>::Contains(const char* key) const {
+  Node* x = FindGreaterOrEqual(key);
+  if (x != nullptr && Equal(key, x->Key())) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+template <class Comparator>
+void InlineSkipList<Comparator>::TEST_Validate() const {
+  // Interate over all levels at the same time, and verify nodes appear in
+  // the right order, and nodes appear in upper level also appear in lower
+  // levels.
+  Node* nodes[kMaxPossibleHeight];
+  int max_height = GetMaxHeight();
+  assert(max_height > 0);
+  for (int i = 0; i < max_height; i++) {
+    nodes[i] = head_;
+  }
+  while (nodes[0] != nullptr) {
+    Node* l0_next = nodes[0]->Next(0);
+    if (l0_next == nullptr) {
+      break;
+    }
+    assert(nodes[0] == head_ || compare_(nodes[0]->Key(), l0_next->Key()) < 0);
+    nodes[0] = l0_next;
+
+    int i = 1;
+    while (i < max_height) {
+      Node* next = nodes[i]->Next(i);
+      if (next == nullptr) {
+        break;
+      }
+      auto cmp = compare_(nodes[0]->Key(), next->Key());
+      assert(cmp <= 0);
+      if (cmp == 0) {
+        assert(next == nodes[0]);
+        nodes[i] = next;
+      } else {
+        break;
+      }
+      i++;
+    }
+  }
+  for (int i = 1; i < max_height; i++) {
+    assert(nodes[i] != nullptr && nodes[i]->Next(i) == nullptr);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memtable/inlineskiplist_test.cc b/src/rocksdb/memtable/inlineskiplist_test.cc
new file mode 100644
index 000000000..f85644064
--- /dev/null
+++ b/src/rocksdb/memtable/inlineskiplist_test.cc
@@ -0,0 +1,664 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memtable/inlineskiplist.h"
+
+#include <set>
+#include <unordered_set>
+
+#include "memory/concurrent_arena.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "util/hash.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Our test skip list stores 8-byte unsigned integers
+using Key = uint64_t;
+
+static const char* Encode(const uint64_t* key) {
+  return reinterpret_cast<const char*>(key);
+}
+
+static Key Decode(const char* key) {
+  Key rv;
+  memcpy(&rv, key, sizeof(Key));
+  return rv;
+}
+
+struct TestComparator {
+  using DecodedType = Key;
+
+  static DecodedType decode_key(const char* b) { return Decode(b); }
+
+  int operator()(const char* a, const char* b) const {
+    if (Decode(a) < Decode(b)) {
+      return -1;
+    } else if (Decode(a) > Decode(b)) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+
+  int operator()(const char* a, const DecodedType b) const {
+    if (Decode(a) < b) {
+      return -1;
+    } else if (Decode(a) > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
+using TestInlineSkipList = InlineSkipList<TestComparator>;
+
+class InlineSkipTest : public testing::Test {
+ public:
+  void Insert(TestInlineSkipList* list, Key key) {
+    char* buf = list->AllocateKey(sizeof(Key));
+    memcpy(buf, &key, sizeof(Key));
+    list->Insert(buf);
+    keys_.insert(key);
+  }
+
+  bool InsertWithHint(TestInlineSkipList* list, Key key, void** hint) {
+    char* buf = list->AllocateKey(sizeof(Key));
+    memcpy(buf, &key, sizeof(Key));
+    bool res = list->InsertWithHint(buf, hint);
+    keys_.insert(key);
+    return res;
+  }
+
+  void Validate(TestInlineSkipList* list) {
+    // Check keys exist.
+    for (Key key : keys_) {
+      ASSERT_TRUE(list->Contains(Encode(&key)));
+    }
+    // Iterate over the list, make sure keys appears in order and no extra
+    // keys exist.
+    TestInlineSkipList::Iterator iter(list);
+    ASSERT_FALSE(iter.Valid());
+    Key zero = 0;
+    iter.Seek(Encode(&zero));
+    for (Key key : keys_) {
+      ASSERT_TRUE(iter.Valid());
+      ASSERT_EQ(key, Decode(iter.key()));
+      iter.Next();
+    }
+    ASSERT_FALSE(iter.Valid());
+    // Validate the list is well-formed.
+    list->TEST_Validate();
+  }
+
+ private:
+  std::set<Key> keys_;
+};
+
+TEST_F(InlineSkipTest, Empty) {
+  Arena arena;
+  TestComparator cmp;
+  InlineSkipList<TestComparator> list(cmp, &arena);
+  Key key = 10;
+  ASSERT_TRUE(!list.Contains(Encode(&key)));
+
+  InlineSkipList<TestComparator>::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  key = 100;
+  iter.Seek(Encode(&key));
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekForPrev(Encode(&key));
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST_F(InlineSkipTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  ConcurrentArena arena;
+  TestComparator cmp;
+  InlineSkipList<TestComparator> list(cmp, &arena);
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      char* buf = list.AllocateKey(sizeof(Key));
+      memcpy(buf, &key, sizeof(Key));
+      list.Insert(buf);
+    }
+  }
+
+  for (Key i = 0; i < R; i++) {
+    if (list.Contains(Encode(&i))) {
+      ASSERT_EQ(keys.count(i), 1U);
+    } else {
+      ASSERT_EQ(keys.count(i), 0U);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    InlineSkipList<TestComparator>::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    uint64_t zero = 0;
+    iter.Seek(Encode(&zero));
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), Decode(iter.key()));
+
+    uint64_t max_key = R - 1;
+    iter.SeekForPrev(Encode(&max_key));
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), Decode(iter.key()));
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), Decode(iter.key()));
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), Decode(iter.key()));
+  }
+
+  // Forward iteration test
+  for (Key i = 0; i < R; i++) {
+    InlineSkipList<TestComparator>::Iterator iter(&list);
+    iter.Seek(Encode(&i));
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, Decode(iter.key()));
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  for (Key i = 0; i < R; i++) {
+    InlineSkipList<TestComparator>::Iterator iter(&list);
+    iter.SeekForPrev(Encode(&i));
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.upper_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.begin()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*--model_iter, Decode(iter.key()));
+        iter.Prev();
+      }
+    }
+  }
+}
+
+TEST_F(InlineSkipTest, InsertWithHint_Sequential) {
+  const int N = 100000;
+  Arena arena;
+  TestComparator cmp;
+  TestInlineSkipList list(cmp, &arena);
+  void* hint = nullptr;
+  for (int i = 0; i < N; i++) {
+    Key key = i;
+    InsertWithHint(&list, key, &hint);
+  }
+  Validate(&list);
+}
+
+TEST_F(InlineSkipTest, InsertWithHint_MultipleHints) {
+  const int N = 100000;
+  const int S = 100;
+  Random rnd(534);
+  Arena arena;
+  TestComparator cmp;
+  TestInlineSkipList list(cmp, &arena);
+  void* hints[S];
+  Key last_key[S];
+  for (int i = 0; i < S; i++) {
+    hints[i] = nullptr;
+    last_key[i] = 0;
+  }
+  for (int i = 0; i < N; i++) {
+    Key s = rnd.Uniform(S);
+    Key key = (s << 32) + (++last_key[s]);
+    InsertWithHint(&list, key, &hints[s]);
+  }
+  Validate(&list);
+}
+
+TEST_F(InlineSkipTest, InsertWithHint_MultipleHintsRandom) {
+  const int N = 100000;
+  const int S = 100;
+  Random rnd(534);
+  Arena arena;
+  TestComparator cmp;
+  TestInlineSkipList list(cmp, &arena);
+  void* hints[S];
+  for (int i = 0; i < S; i++) {
+    hints[i] = nullptr;
+  }
+  for (int i = 0; i < N; i++) {
+    Key s = rnd.Uniform(S);
+    Key key = (s << 32) + rnd.Next();
+    InsertWithHint(&list, key, &hints[s]);
+  }
+  Validate(&list);
+}
+
+TEST_F(InlineSkipTest, InsertWithHint_CompatibleWithInsertWithoutHint) {
+  const int N = 100000;
+  const int S1 = 100;
+  const int S2 = 100;
+  Random rnd(534);
+  Arena arena;
+  TestComparator cmp;
+  TestInlineSkipList list(cmp, &arena);
+  std::unordered_set<Key> used;
+  Key with_hint[S1];
+  Key without_hint[S2];
+  void* hints[S1];
+  for (int i = 0; i < S1; i++) {
+    hints[i] = nullptr;
+    while (true) {
+      Key s = rnd.Next();
+      if (used.insert(s).second) {
+        with_hint[i] = s;
+        break;
+      }
+    }
+  }
+  for (int i = 0; i < S2; i++) {
+    while (true) {
+      Key s = rnd.Next();
+      if (used.insert(s).second) {
+        without_hint[i] = s;
+        break;
+      }
+    }
+  }
+  for (int i = 0; i < N; i++) {
+    Key s = rnd.Uniform(S1 + S2);
+    if (s < S1) {
+      Key key = (with_hint[s] << 32) + rnd.Next();
+      InsertWithHint(&list, key, &hints[s]);
+    } else {
+      Key key = (without_hint[s - S1] << 32) + rnd.Next();
+      Insert(&list, key);
+    }
+  }
+  Validate(&list);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructor.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ public:
+  static const uint32_t K = 8;
+
+ private:
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = {k, g};
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    std::atomic<int> generation[K];
+    void Set(int k, int v) {
+      generation[k].store(v, std::memory_order_release);
+    }
+    int Get(int k) { return generation[k].load(std::memory_order_acquire); }
+
+    State() {
+      for (unsigned int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  ConcurrentArena arena_;
+
+  // InlineSkipList is not protected by mu_.  We just use a single writer
+  // thread to modify it.
+  InlineSkipList<TestComparator> list_;
+
+ public:
+  ConcurrentTest() : list_(TestComparator(), &arena_) {}
+
+  // REQUIRES: No concurrent calls to WriteStep or ConcurrentWriteStep
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const int g = current_.Get(k) + 1;
+    const Key new_key = MakeKey(k, g);
+    char* buf = list_.AllocateKey(sizeof(Key));
+    memcpy(buf, &new_key, sizeof(Key));
+    list_.Insert(buf);
+    current_.Set(k, g);
+  }
+
+  // REQUIRES: No concurrent calls for the same k
+  void ConcurrentWriteStep(uint32_t k, bool use_hint = false) {
+    const int g = current_.Get(k) + 1;
+    const Key new_key = MakeKey(k, g);
+    char* buf = list_.AllocateKey(sizeof(Key));
+    memcpy(buf, &new_key, sizeof(Key));
+    if (use_hint) {
+      void* hint = nullptr;
+      list_.InsertWithHintConcurrently(buf, &hint);
+      delete[] reinterpret_cast<char*>(hint);
+    } else {
+      list_.InsertConcurrently(buf);
+    }
+    ASSERT_EQ(g, current_.Get(k) + 1);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (unsigned int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    InlineSkipList<TestComparator>::Iterator iter(&list_);
+    iter.Seek(Encode(&pos));
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = Decode(iter.key());
+        ASSERT_TRUE(IsValidKey(current)) << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0U) ||
+                    (gen(pos) > static_cast<uint64_t>(initial_state.Get(
+                                    static_cast<int>(key(pos))))))
+            << "key: " << key(pos) << "; gen: " << gen(pos)
+            << "; initgen: " << initial_state.Get(static_cast<int>(key(pos)));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(Encode(&new_target));
+        }
+      }
+    }
+  }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST_F(InlineSkipTest, ConcurrentReadWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+TEST_F(InlineSkipTest, ConcurrentInsertWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    uint32_t base = rnd.Next();
+    for (int j = 0; j < 4; ++j) {
+      test.ConcurrentWriteStep((base + j) % ConcurrentTest::K);
+    }
+  }
+}
+
+class TestState {
+ public:
+  ConcurrentTest t_;
+  bool use_hint_;
+  int seed_;
+  std::atomic<bool> quit_flag_;
+  std::atomic<uint32_t> next_writer_;
+
+  enum ReaderState { STARTING, RUNNING, DONE };
+
+  explicit TestState(int s)
+      : seed_(s),
+        quit_flag_(false),
+        state_(STARTING),
+        pending_writers_(0),
+        state_cv_(&mu_) {}
+
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    while (state_ != s) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+  void AdjustPendingWriters(int delta) {
+    mu_.Lock();
+    pending_writers_ += delta;
+    if (pending_writers_ == 0) {
+      state_cv_.Signal();
+    }
+    mu_.Unlock();
+  }
+
+  void WaitForPendingWriters() {
+    mu_.Lock();
+    while (pending_writers_ != 0) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+ private:
+  port::Mutex mu_;
+  ReaderState state_;
+  int pending_writers_;
+  port::CondVar state_cv_;
+};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.load(std::memory_order_acquire)) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void ConcurrentWriter(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  uint32_t k = state->next_writer_++ % ConcurrentTest::K;
+  state->t_.ConcurrentWriteStep(k, state->use_hint_);
+  state->AdjustPendingWriters(-1);
+}
+
+static void RunConcurrentRead(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->SetBackgroundThreads(1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int k = 0; k < kSize; ++k) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.store(true, std::memory_order_release);
+    state.Wait(TestState::DONE);
+  }
+}
+
+static void RunConcurrentInsert(int run, bool use_hint = false,
+                                int write_parallelism = 4) {
+  Env::Default()->SetBackgroundThreads(1 + write_parallelism,
+                                       Env::Priority::LOW);
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    state.use_hint_ = use_hint;
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int k = 0; k < kSize; k += write_parallelism) {
+      state.next_writer_ = rnd.Next();
+      state.AdjustPendingWriters(write_parallelism);
+      for (int p = 0; p < write_parallelism; ++p) {
+        Env::Default()->Schedule(ConcurrentWriter, &state);
+      }
+      state.WaitForPendingWriters();
+    }
+    state.quit_flag_.store(true, std::memory_order_release);
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST_F(InlineSkipTest, ConcurrentRead1) { RunConcurrentRead(1); }
+TEST_F(InlineSkipTest, ConcurrentRead2) { RunConcurrentRead(2); }
+TEST_F(InlineSkipTest, ConcurrentRead3) { RunConcurrentRead(3); }
+TEST_F(InlineSkipTest, ConcurrentRead4) { RunConcurrentRead(4); }
+TEST_F(InlineSkipTest, ConcurrentRead5) { RunConcurrentRead(5); }
+TEST_F(InlineSkipTest, ConcurrentInsert1) { RunConcurrentInsert(1); }
+TEST_F(InlineSkipTest, ConcurrentInsert2) { RunConcurrentInsert(2); }
+TEST_F(InlineSkipTest, ConcurrentInsert3) { RunConcurrentInsert(3); }
+TEST_F(InlineSkipTest, ConcurrentInsertWithHint1) {
+  RunConcurrentInsert(1, true);
+}
+TEST_F(InlineSkipTest, ConcurrentInsertWithHint2) {
+  RunConcurrentInsert(2, true);
+}
+TEST_F(InlineSkipTest, ConcurrentInsertWithHint3) {
+  RunConcurrentInsert(3, true);
+}
+
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/memtable/memtablerep_bench.cc b/src/rocksdb/memtable/memtablerep_bench.cc
new file mode 100644
index 000000000..a915abed7
--- /dev/null
+++ b/src/rocksdb/memtable/memtablerep_bench.cc
@@ -0,0 +1,689 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <atomic>
+#include <iostream>
+#include <memory>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "memory/arena.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+DEFINE_string(benchmarks, "fillrandom",
+              "Comma-separated list of benchmarks to run. Options:\n"
+              "\tfillrandom             -- write N random values\n"
+              "\tfillseq                -- write N values in sequential order\n"
+              "\treadrandom             -- read N values in random order\n"
+              "\treadseq                -- scan the DB\n"
+              "\treadwrite              -- 1 thread writes while N - 1 threads "
+              "do random\n"
+              "\t                          reads\n"
+              "\tseqreadwrite           -- 1 thread writes while N - 1 threads "
+              "do scans\n");
+
+DEFINE_string(memtablerep, "skiplist",
+              "Which implementation of memtablerep to use. See "
+              "include/memtablerep.h for\n"
+              "  more details. Options:\n"
+              "\tskiplist            -- backed by a skiplist\n"
+              "\tvector              -- backed by an std::vector\n"
+              "\thashskiplist        -- backed by a hash skip list\n"
+              "\thashlinklist        -- backed by a hash linked list\n"
+              "\tcuckoo              -- backed by a cuckoo hash table");
+
+DEFINE_int64(bucket_count, 1000000,
+             "bucket_count parameter to pass into NewHashSkiplistRepFactory or "
+             "NewHashLinkListRepFactory");
+
+DEFINE_int32(
+    hashskiplist_height, 4,
+    "skiplist_height parameter to pass into NewHashSkiplistRepFactory");
+
+DEFINE_int32(
+    hashskiplist_branching_factor, 4,
+    "branching_factor parameter to pass into NewHashSkiplistRepFactory");
+
+DEFINE_int32(
+    huge_page_tlb_size, 0,
+    "huge_page_tlb_size parameter to pass into NewHashLinkListRepFactory");
+
+DEFINE_int32(bucket_entries_logging_threshold, 4096,
+             "bucket_entries_logging_threshold parameter to pass into "
+             "NewHashLinkListRepFactory");
+
+DEFINE_bool(if_log_bucket_dist_when_flash, true,
+            "if_log_bucket_dist_when_flash parameter to pass into "
+            "NewHashLinkListRepFactory");
+
+DEFINE_int32(
+    threshold_use_skiplist, 256,
+    "threshold_use_skiplist parameter to pass into NewHashLinkListRepFactory");
+
+DEFINE_int64(write_buffer_size, 256,
+             "write_buffer_size parameter to pass into WriteBufferManager");
+
+DEFINE_int32(
+    num_threads, 1,
+    "Number of concurrent threads to run. If the benchmark includes writes,\n"
+    "then at most one thread will be a writer");
+
+DEFINE_int32(num_operations, 1000000,
+             "Number of operations to do for write and random read benchmarks");
+
+DEFINE_int32(num_scans, 10,
+             "Number of times for each thread to scan the memtablerep for "
+             "sequential read "
+             "benchmarks");
+
+DEFINE_int32(item_size, 100, "Number of bytes each item should be");
+
+DEFINE_int32(prefix_length, 8,
+             "Prefix length to pass into NewFixedPrefixTransform");
+
+/* VectorRep settings */
+DEFINE_int64(vectorrep_count, 0,
+             "Number of entries to reserve on VectorRep initialization");
+
+DEFINE_int64(seed, 0,
+             "Seed base for random number generators. "
+             "When 0 it is deterministic.");
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+struct CallbackVerifyArgs {
+  bool found;
+  LookupKey* key;
+  MemTableRep* table;
+  InternalKeyComparator* comparator;
+};
+}  // namespace
+
+// Helper for quickly generating random data.
+class RandomGenerator {
+ private:
+  std::string data_;
+  unsigned int pos_;
+
+ public:
+  RandomGenerator() {
+    Random rnd(301);
+    auto size = (unsigned)std::max(1048576, FLAGS_item_size);
+    data_ = rnd.RandomString(size);
+    pos_ = 0;
+  }
+
+  Slice Generate(unsigned int len) {
+    assert(len <= data_.size());
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
+};
+
+enum WriteMode { SEQUENTIAL, RANDOM, UNIQUE_RANDOM };
+
+class KeyGenerator {
+ public:
+  KeyGenerator(Random64* rand, WriteMode mode, uint64_t num)
+      : rand_(rand), mode_(mode), num_(num), next_(0) {
+    if (mode_ == UNIQUE_RANDOM) {
+      // NOTE: if memory consumption of this approach becomes a concern,
+      // we can either break it into pieces and only random shuffle a section
+      // each time. Alternatively, use a bit map implementation
+      // (https://reviews.facebook.net/differential/diff/54627/)
+      values_.resize(num_);
+      for (uint64_t i = 0; i < num_; ++i) {
+        values_[i] = i;
+      }
+      RandomShuffle(values_.begin(), values_.end(),
+                    static_cast<uint32_t>(FLAGS_seed));
+    }
+  }
+
+  uint64_t Next() {
+    switch (mode_) {
+      case SEQUENTIAL:
+        return next_++;
+      case RANDOM:
+        return rand_->Next() % num_;
+      case UNIQUE_RANDOM:
+        return values_[next_++];
+    }
+    assert(false);
+    return std::numeric_limits<uint64_t>::max();
+  }
+
+ private:
+  Random64* rand_;
+  WriteMode mode_;
+  const uint64_t num_;
+  uint64_t next_;
+  std::vector<uint64_t> values_;
+};
+
+class BenchmarkThread {
+ public:
+  explicit BenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                           uint64_t* bytes_written, uint64_t* bytes_read,
+                           uint64_t* sequence, uint64_t num_ops,
+                           uint64_t* read_hits)
+      : table_(table),
+        key_gen_(key_gen),
+        bytes_written_(bytes_written),
+        bytes_read_(bytes_read),
+        sequence_(sequence),
+        num_ops_(num_ops),
+        read_hits_(read_hits) {}
+
+  virtual void operator()() = 0;
+  virtual ~BenchmarkThread() {}
+
+ protected:
+  MemTableRep* table_;
+  KeyGenerator* key_gen_;
+  uint64_t* bytes_written_;
+  uint64_t* bytes_read_;
+  uint64_t* sequence_;
+  uint64_t num_ops_;
+  uint64_t* read_hits_;
+  RandomGenerator generator_;
+};
+
+class FillBenchmarkThread : public BenchmarkThread {
+ public:
+  FillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                      uint64_t* bytes_written, uint64_t* bytes_read,
+                      uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits)
+      : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                        num_ops, read_hits) {}
+
+  void FillOne() {
+    char* buf = nullptr;
+    auto internal_key_size = 16;
+    auto encoded_len =
+        FLAGS_item_size + VarintLength(internal_key_size) + internal_key_size;
+    KeyHandle handle = table_->Allocate(encoded_len, &buf);
+    assert(buf != nullptr);
+    char* p = EncodeVarint32(buf, internal_key_size);
+    auto key = key_gen_->Next();
+    EncodeFixed64(p, key);
+    p += 8;
+    EncodeFixed64(p, ++(*sequence_));
+    p += 8;
+    Slice bytes = generator_.Generate(FLAGS_item_size);
+    memcpy(p, bytes.data(), FLAGS_item_size);
+    p += FLAGS_item_size;
+    assert(p == buf + encoded_len);
+    table_->Insert(handle);
+    *bytes_written_ += encoded_len;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      FillOne();
+    }
+  }
+};
+
+class ConcurrentFillBenchmarkThread : public FillBenchmarkThread {
+ public:
+  ConcurrentFillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                                uint64_t* bytes_written, uint64_t* bytes_read,
+                                uint64_t* sequence, uint64_t num_ops,
+                                uint64_t* read_hits,
+                                std::atomic_int* threads_done)
+      : FillBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                            num_ops, read_hits) {
+    threads_done_ = threads_done;
+  }
+
+  void operator()() override {
+    // # of read threads will be total threads - write threads (always 1). Loop
+    // while all reads complete.
+    while ((*threads_done_).load() < (FLAGS_num_threads - 1)) {
+      FillOne();
+    }
+  }
+
+ private:
+  std::atomic_int* threads_done_;
+};
+
+class ReadBenchmarkThread : public BenchmarkThread {
+ public:
+  ReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                      uint64_t* bytes_written, uint64_t* bytes_read,
+                      uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits)
+      : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                        num_ops, read_hits) {}
+
+  static bool callback(void* arg, const char* entry) {
+    CallbackVerifyArgs* callback_args = static_cast<CallbackVerifyArgs*>(arg);
+    assert(callback_args != nullptr);
+    uint32_t key_length;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if ((callback_args->comparator)
+            ->user_comparator()
+            ->Equal(Slice(key_ptr, key_length - 8),
+                    callback_args->key->user_key())) {
+      callback_args->found = true;
+    }
+    return false;
+  }
+
+  void ReadOne() {
+    std::string user_key;
+    auto key = key_gen_->Next();
+    PutFixed64(&user_key, key);
+    LookupKey lookup_key(user_key, *sequence_);
+    InternalKeyComparator internal_key_comp(BytewiseComparator());
+    CallbackVerifyArgs verify_args;
+    verify_args.found = false;
+    verify_args.key = &lookup_key;
+    verify_args.table = table_;
+    verify_args.comparator = &internal_key_comp;
+    table_->Get(lookup_key, &verify_args, callback);
+    if (verify_args.found) {
+      *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size;
+      ++*read_hits_;
+    }
+  }
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      ReadOne();
+    }
+  }
+};
+
+class SeqReadBenchmarkThread : public BenchmarkThread {
+ public:
+  SeqReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                         uint64_t* bytes_written, uint64_t* bytes_read,
+                         uint64_t* sequence, uint64_t num_ops,
+                         uint64_t* read_hits)
+      : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                        num_ops, read_hits) {}
+
+  void ReadOneSeq() {
+    std::unique_ptr<MemTableRep::Iterator> iter(table_->GetIterator());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      // pretend to read the value
+      *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size;
+    }
+    ++*read_hits_;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      { ReadOneSeq(); }
+    }
+  }
+};
+
+class ConcurrentReadBenchmarkThread : public ReadBenchmarkThread {
+ public:
+  ConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                                uint64_t* bytes_written, uint64_t* bytes_read,
+                                uint64_t* sequence, uint64_t num_ops,
+                                uint64_t* read_hits,
+                                std::atomic_int* threads_done)
+      : ReadBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                            num_ops, read_hits) {
+    threads_done_ = threads_done;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      ReadOne();
+    }
+    ++*threads_done_;
+  }
+
+ private:
+  std::atomic_int* threads_done_;
+};
+
+class SeqConcurrentReadBenchmarkThread : public SeqReadBenchmarkThread {
+ public:
+  SeqConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                                   uint64_t* bytes_written,
+                                   uint64_t* bytes_read, uint64_t* sequence,
+                                   uint64_t num_ops, uint64_t* read_hits,
+                                   std::atomic_int* threads_done)
+      : SeqReadBenchmarkThread(table, key_gen, bytes_written, bytes_read,
+                               sequence, num_ops, read_hits) {
+    threads_done_ = threads_done;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      ReadOneSeq();
+    }
+    ++*threads_done_;
+  }
+
+ private:
+  std::atomic_int* threads_done_;
+};
+
+class Benchmark {
+ public:
+  explicit Benchmark(MemTableRep* table, KeyGenerator* key_gen,
+                     uint64_t* sequence, uint32_t num_threads)
+      : table_(table),
+        key_gen_(key_gen),
+        sequence_(sequence),
+        num_threads_(num_threads) {}
+
+  virtual ~Benchmark() {}
+  virtual void Run() {
+    std::cout << "Number of threads: " << num_threads_ << std::endl;
+    std::vector<port::Thread> threads;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_read = 0;
+    uint64_t read_hits = 0;
+    StopWatchNano timer(SystemClock::Default().get(), true);
+    RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits);
+    auto elapsed_time = static_cast<double>(timer.ElapsedNanos() / 1000);
+    std::cout << "Elapsed time: " << static_cast<int>(elapsed_time) << " us"
+              << std::endl;
+
+    if (bytes_written > 0) {
+      auto MiB_written = static_cast<double>(bytes_written) / (1 << 20);
+      auto write_throughput = MiB_written / (elapsed_time / 1000000);
+      std::cout << "Total bytes written: " << MiB_written << " MiB"
+                << std::endl;
+      std::cout << "Write throughput: " << write_throughput << " MiB/s"
+                << std::endl;
+      auto us_per_op = elapsed_time / num_write_ops_per_thread_;
+      std::cout << "write us/op: " << us_per_op << std::endl;
+    }
+    if (bytes_read > 0) {
+      auto MiB_read = static_cast<double>(bytes_read) / (1 << 20);
+      auto read_throughput = MiB_read / (elapsed_time / 1000000);
+      std::cout << "Total bytes read: " << MiB_read << " MiB" << std::endl;
+      std::cout << "Read throughput: " << read_throughput << " MiB/s"
+                << std::endl;
+      auto us_per_op = elapsed_time / num_read_ops_per_thread_;
+      std::cout << "read us/op: " << us_per_op << std::endl;
+    }
+  }
+
+  virtual void RunThreads(std::vector<port::Thread>* threads,
+                          uint64_t* bytes_written, uint64_t* bytes_read,
+                          bool write, uint64_t* read_hits) = 0;
+
+ protected:
+  MemTableRep* table_;
+  KeyGenerator* key_gen_;
+  uint64_t* sequence_;
+  uint64_t num_write_ops_per_thread_ = 0;
+  uint64_t num_read_ops_per_thread_ = 0;
+  const uint32_t num_threads_;
+};
+
+class FillBenchmark : public Benchmark {
+ public:
+  explicit FillBenchmark(MemTableRep* table, KeyGenerator* key_gen,
+                         uint64_t* sequence)
+      : Benchmark(table, key_gen, sequence, 1) {
+    num_write_ops_per_thread_ = FLAGS_num_operations;
+  }
+
+  void RunThreads(std::vector<port::Thread>* /*threads*/,
+                  uint64_t* bytes_written, uint64_t* bytes_read, bool /*write*/,
+                  uint64_t* read_hits) override {
+    FillBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, sequence_,
+                        num_write_ops_per_thread_, read_hits)();
+  }
+};
+
+class ReadBenchmark : public Benchmark {
+ public:
+  explicit ReadBenchmark(MemTableRep* table, KeyGenerator* key_gen,
+                         uint64_t* sequence)
+      : Benchmark(table, key_gen, sequence, FLAGS_num_threads) {
+    num_read_ops_per_thread_ = FLAGS_num_operations / FLAGS_num_threads;
+  }
+
+  void RunThreads(std::vector<port::Thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool /*write*/,
+                  uint64_t* read_hits) override {
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads->emplace_back(
+          ReadBenchmarkThread(table_, key_gen_, bytes_written, bytes_read,
+                              sequence_, num_read_ops_per_thread_, read_hits));
+    }
+    for (auto& thread : *threads) {
+      thread.join();
+    }
+    std::cout << "read hit%: "
+              << (static_cast<double>(*read_hits) / FLAGS_num_operations) * 100
+              << std::endl;
+  }
+};
+
+class SeqReadBenchmark : public Benchmark {
+ public:
+  explicit SeqReadBenchmark(MemTableRep* table, uint64_t* sequence)
+      : Benchmark(table, nullptr, sequence, FLAGS_num_threads) {
+    num_read_ops_per_thread_ = FLAGS_num_scans;
+  }
+
+  void RunThreads(std::vector<port::Thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool /*write*/,
+                  uint64_t* read_hits) override {
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads->emplace_back(SeqReadBenchmarkThread(
+          table_, key_gen_, bytes_written, bytes_read, sequence_,
+          num_read_ops_per_thread_, read_hits));
+    }
+    for (auto& thread : *threads) {
+      thread.join();
+    }
+  }
+};
+
+template <class ReadThreadType>
+class ReadWriteBenchmark : public Benchmark {
+ public:
+  explicit ReadWriteBenchmark(MemTableRep* table, KeyGenerator* key_gen,
+                              uint64_t* sequence)
+      : Benchmark(table, key_gen, sequence, FLAGS_num_threads) {
+    num_read_ops_per_thread_ =
+        FLAGS_num_threads <= 1
+            ? 0
+            : (FLAGS_num_operations / (FLAGS_num_threads - 1));
+    num_write_ops_per_thread_ = FLAGS_num_operations;
+  }
+
+  void RunThreads(std::vector<port::Thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool /*write*/,
+                  uint64_t* read_hits) override {
+    std::atomic_int threads_done;
+    threads_done.store(0);
+    threads->emplace_back(ConcurrentFillBenchmarkThread(
+        table_, key_gen_, bytes_written, bytes_read, sequence_,
+        num_write_ops_per_thread_, read_hits, &threads_done));
+    for (int i = 1; i < FLAGS_num_threads; ++i) {
+      threads->emplace_back(
+          ReadThreadType(table_, key_gen_, bytes_written, bytes_read, sequence_,
+                         num_read_ops_per_thread_, read_hits, &threads_done));
+    }
+    for (auto& thread : *threads) {
+      thread.join();
+    }
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+  fprintf(stdout,
+          "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
+#endif
+#ifndef NDEBUG
+  fprintf(stdout,
+          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+}
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  PrintWarnings();
+
+  ROCKSDB_NAMESPACE::Options options;
+
+  std::unique_ptr<ROCKSDB_NAMESPACE::MemTableRepFactory> factory;
+  if (FLAGS_memtablerep == "skiplist") {
+    factory.reset(new ROCKSDB_NAMESPACE::SkipListFactory);
+#ifndef ROCKSDB_LITE
+  } else if (FLAGS_memtablerep == "vector") {
+    factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory);
+  } else if (FLAGS_memtablerep == "hashskiplist" ||
+             FLAGS_memtablerep == "prefix_hash") {
+    factory.reset(ROCKSDB_NAMESPACE::NewHashSkipListRepFactory(
+        FLAGS_bucket_count, FLAGS_hashskiplist_height,
+        FLAGS_hashskiplist_branching_factor));
+    options.prefix_extractor.reset(
+        ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_length));
+  } else if (FLAGS_memtablerep == "hashlinklist" ||
+             FLAGS_memtablerep == "hash_linkedlist") {
+    factory.reset(ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(
+        FLAGS_bucket_count, FLAGS_huge_page_tlb_size,
+        FLAGS_bucket_entries_logging_threshold,
+        FLAGS_if_log_bucket_dist_when_flash, FLAGS_threshold_use_skiplist));
+    options.prefix_extractor.reset(
+        ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_length));
+#endif  // ROCKSDB_LITE
+  } else {
+    ROCKSDB_NAMESPACE::ConfigOptions config_options;
+    config_options.ignore_unsupported_options = false;
+
+    ROCKSDB_NAMESPACE::Status s =
+        ROCKSDB_NAMESPACE::MemTableRepFactory::CreateFromString(
+            config_options, FLAGS_memtablerep, &factory);
+    if (!s.ok()) {
+      fprintf(stdout, "Unknown memtablerep: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  ROCKSDB_NAMESPACE::InternalKeyComparator internal_key_comp(
+      ROCKSDB_NAMESPACE::BytewiseComparator());
+  ROCKSDB_NAMESPACE::MemTable::KeyComparator key_comp(internal_key_comp);
+  ROCKSDB_NAMESPACE::Arena arena;
+  ROCKSDB_NAMESPACE::WriteBufferManager wb(FLAGS_write_buffer_size);
+  uint64_t sequence;
+  auto createMemtableRep = [&] {
+    sequence = 0;
+    return factory->CreateMemTableRep(key_comp, &arena,
+                                      options.prefix_extractor.get(),
+                                      options.info_log.get());
+  };
+  std::unique_ptr<ROCKSDB_NAMESPACE::MemTableRep> memtablerep;
+  ROCKSDB_NAMESPACE::Random64 rng(FLAGS_seed);
+  const char* benchmarks = FLAGS_benchmarks.c_str();
+  while (benchmarks != nullptr) {
+    std::unique_ptr<ROCKSDB_NAMESPACE::KeyGenerator> key_gen;
+    const char* sep = strchr(benchmarks, ',');
+    ROCKSDB_NAMESPACE::Slice name;
+    if (sep == nullptr) {
+      name = benchmarks;
+      benchmarks = nullptr;
+    } else {
+      name = ROCKSDB_NAMESPACE::Slice(benchmarks, sep - benchmarks);
+      benchmarks = sep + 1;
+    }
+    std::unique_ptr<ROCKSDB_NAMESPACE::Benchmark> benchmark;
+    if (name == ROCKSDB_NAMESPACE::Slice("fillseq")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new ROCKSDB_NAMESPACE::KeyGenerator(
+          &rng, ROCKSDB_NAMESPACE::SEQUENTIAL, FLAGS_num_operations));
+      benchmark.reset(new ROCKSDB_NAMESPACE::FillBenchmark(
+          memtablerep.get(), key_gen.get(), &sequence));
+    } else if (name == ROCKSDB_NAMESPACE::Slice("fillrandom")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new ROCKSDB_NAMESPACE::KeyGenerator(
+          &rng, ROCKSDB_NAMESPACE::UNIQUE_RANDOM, FLAGS_num_operations));
+      benchmark.reset(new ROCKSDB_NAMESPACE::FillBenchmark(
+          memtablerep.get(), key_gen.get(), &sequence));
+    } else if (name == ROCKSDB_NAMESPACE::Slice("readrandom")) {
+      key_gen.reset(new ROCKSDB_NAMESPACE::KeyGenerator(
+          &rng, ROCKSDB_NAMESPACE::RANDOM, FLAGS_num_operations));
+      benchmark.reset(new ROCKSDB_NAMESPACE::ReadBenchmark(
+          memtablerep.get(), key_gen.get(), &sequence));
+    } else if (name == ROCKSDB_NAMESPACE::Slice("readseq")) {
+      key_gen.reset(new ROCKSDB_NAMESPACE::KeyGenerator(
+          &rng, ROCKSDB_NAMESPACE::SEQUENTIAL, FLAGS_num_operations));
+      benchmark.reset(new ROCKSDB_NAMESPACE::SeqReadBenchmark(memtablerep.get(),
+                                                              &sequence));
+    } else if (name == ROCKSDB_NAMESPACE::Slice("readwrite")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new ROCKSDB_NAMESPACE::KeyGenerator(
+          &rng, ROCKSDB_NAMESPACE::RANDOM, FLAGS_num_operations));
+      benchmark.reset(new ROCKSDB_NAMESPACE::ReadWriteBenchmark<
+                      ROCKSDB_NAMESPACE::ConcurrentReadBenchmarkThread>(
+          memtablerep.get(), key_gen.get(), &sequence));
+    } else if (name == ROCKSDB_NAMESPACE::Slice("seqreadwrite")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new ROCKSDB_NAMESPACE::KeyGenerator(
+          &rng, ROCKSDB_NAMESPACE::RANDOM, FLAGS_num_operations));
+      benchmark.reset(new ROCKSDB_NAMESPACE::ReadWriteBenchmark<
+                      ROCKSDB_NAMESPACE::SeqConcurrentReadBenchmarkThread>(
+          memtablerep.get(), key_gen.get(), &sequence));
+    } else {
+      std::cout << "WARNING: skipping unknown benchmark '" << name.ToString()
+                << std::endl;
+      continue;
+    }
+    std::cout << "Running " << name.ToString() << std::endl;
+    benchmark->Run();
+  }
+
+  return 0;
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/memtable/skiplist.h b/src/rocksdb/memtable/skiplist.h
new file mode 100644
index 000000000..e3cecd30c
--- /dev/null
+++ b/src/rocksdb/memtable/skiplist.h
@@ -0,0 +1,498 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress.  Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed.  This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+//
+
+#pragma once
+#include <assert.h>
+#include <stdlib.h>
+
+#include <atomic>
+
+#include "memory/allocator.h"
+#include "port/port.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename Key, class Comparator>
+class SkipList {
+ private:
+  struct Node;
+
+ public:
+  // Create a new SkipList object that will use "cmp" for comparing keys,
+  // and will allocate memory using "*allocator".  Objects allocated in the
+  // allocator must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Allocator* allocator,
+                    int32_t max_height = 12, int32_t branching_factor = 4);
+  // No copying allowed
+  SkipList(const SkipList&) = delete;
+  void operator=(const SkipList&) = delete;
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(const Key& key);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const Key& key) const;
+
+  // Return estimated number of entries smaller than `key`.
+  uint64_t EstimateCount(const Key& key) const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const SkipList* list);
+
+    // Change the underlying skiplist used for this iterator
+    // This enables us not changing the iterator without deallocating
+    // an old one and then allocating a new one
+    void SetList(const SkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const Key& key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Key& target);
+
+    // Retreat to the last entry with a key <= target
+    void SeekForPrev(const Key& target);
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const SkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  const uint16_t kMaxHeight_;
+  const uint16_t kBranching_;
+  const uint32_t kScaledInverseBranching_;
+
+  // Immutable after construction
+  Comparator const compare_;
+  Allocator* const allocator_;  // Allocator used for allocations of nodes
+
+  Node* const head_;
+
+  // Modified only by Insert().  Read racily by readers, but stale
+  // values are ok.
+  std::atomic<int> max_height_;  // Height of the entire list
+
+  // Used for optimizing sequential insert patterns.  Tricky.  prev_[i] for
+  // i up to max_height_ is the predecessor of prev_[0] and prev_height_
+  // is the height of prev_[0].  prev_[0] can only be equal to head before
+  // insertion, in which case max_height_ and prev_height_ are 1.
+  Node** prev_;
+  int32_t prev_height_;
+
+  inline int GetMaxHeight() const {
+    return max_height_.load(std::memory_order_relaxed);
+  }
+
+  Node* NewNode(const Key& key, int height);
+  int RandomHeight();
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+  bool LessThan(const Key& a, const Key& b) const {
+    return (compare_(a, b) < 0);
+  }
+
+  // Return true if key is greater than the data stored in "n"
+  bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+  // Returns the earliest node with a key >= key.
+  // Return nullptr if there is no such node.
+  Node* FindGreaterOrEqual(const Key& key) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  // Fills prev[level] with pointer to previous node at "level" for every
+  // level in [0..max_height_-1], if prev is non-null.
+  Node* FindLessThan(const Key& key, Node** prev = nullptr) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+};
+
+// Implementation details follow
+template <typename Key, class Comparator>
+struct SkipList<Key, Comparator>::Node {
+  explicit Node(const Key& k) : key(k) {}
+
+  Key const key;
+
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return (next_[n].load(std::memory_order_acquire));
+  }
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_[n].store(x, std::memory_order_release);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return next_[n].load(std::memory_order_relaxed);
+  }
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    next_[n].store(x, std::memory_order_relaxed);
+  }
+
+ private:
+  // Array of length equal to the node height.  next_[0] is lowest level link.
+  std::atomic<Node*> next_[1];
+};
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::NewNode(
+    const Key& key, int height) {
+  char* mem = allocator_->AllocateAligned(
+      sizeof(Node) + sizeof(std::atomic<Node*>) * (height - 1));
+  return new (mem) Node(key);
+}
+
+template <typename Key, class Comparator>
+inline SkipList<Key, Comparator>::Iterator::Iterator(const SkipList* list) {
+  SetList(list);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SetList(const SkipList* list) {
+  list_ = list;
+  node_ = nullptr;
+}
+
+template <typename Key, class Comparator>
+inline bool SkipList<Key, Comparator>::Iterator::Valid() const {
+  return node_ != nullptr;
+}
+
+template <typename Key, class Comparator>
+inline const Key& SkipList<Key, Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->key;
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
+  node_ = list_->FindGreaterOrEqual(target);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekForPrev(
+    const Key& target) {
+  Seek(target);
+  if (!Valid()) {
+    SeekToLast();
+  }
+  while (Valid() && list_->LessThan(target, key())) {
+    Prev();
+  }
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template <typename Key, class Comparator>
+int SkipList<Key, Comparator>::RandomHeight() {
+  auto rnd = Random::GetTLSInstance();
+
+  // Increase height with probability 1 in kBranching
+  int height = 1;
+  while (height < kMaxHeight_ && rnd->Next() < kScaledInverseBranching_) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight_);
+  return height;
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // nullptr n is considered infinite
+  return (n != nullptr) && (compare_(n->key, key) < 0);
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindGreaterOrEqual(const Key& key) const {
+  // Note: It looks like we could reduce duplication by implementing
+  // this function as FindLessThan(key)->Next(0), but we wouldn't be able
+  // to exit early on equality and the result wouldn't even be correct.
+  // A concurrent insert might occur after FindLessThan(key) but before
+  // we get a chance to call Next(0).
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  Node* last_bigger = nullptr;
+  while (true) {
+    assert(x != nullptr);
+    Node* next = x->Next(level);
+    // Make sure the lists are sorted
+    assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x));
+    // Make sure we haven't overshot during our search
+    assert(x == head_ || KeyIsAfterNode(key, x));
+    int cmp =
+        (next == nullptr || next == last_bigger) ? 1 : compare_(next->key, key);
+    if (cmp == 0 || (cmp > 0 && level == 0)) {
+      return next;
+    } else if (cmp < 0) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      // Switch to next list, reuse compare_() result
+      last_bigger = next;
+      level--;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindLessThan(const Key& key, Node** prev) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  // KeyIsAfter(key, last_not_after) is definitely false
+  Node* last_not_after = nullptr;
+  while (true) {
+    assert(x != nullptr);
+    Node* next = x->Next(level);
+    assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x));
+    assert(x == head_ || KeyIsAfterNode(key, x));
+    if (next != last_not_after && KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != nullptr) {
+        prev[level] = x;
+      }
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list, reuse KeyIUsAfterNode() result
+        last_not_after = next;
+        level--;
+      }
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == nullptr) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+uint64_t SkipList<Key, Comparator>::EstimateCount(const Key& key) const {
+  uint64_t count = 0;
+
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == nullptr || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return count;
+      } else {
+        // Switch to next list
+        count *= kBranching_;
+        level--;
+      }
+    } else {
+      x = next;
+      count++;
+    }
+  }
+}
+
+template <typename Key, class Comparator>
+SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
+                                    int32_t max_height,
+                                    int32_t branching_factor)
+    : kMaxHeight_(static_cast<uint16_t>(max_height)),
+      kBranching_(static_cast<uint16_t>(branching_factor)),
+      kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_),
+      compare_(cmp),
+      allocator_(allocator),
+      head_(NewNode(0 /* any key will do */, max_height)),
+      max_height_(1),
+      prev_height_(1) {
+  assert(max_height > 0 && kMaxHeight_ == static_cast<uint32_t>(max_height));
+  assert(branching_factor > 0 &&
+         kBranching_ == static_cast<uint32_t>(branching_factor));
+  assert(kScaledInverseBranching_ > 0);
+  // Allocate the prev_ Node* array, directly from the passed-in allocator.
+  // prev_ does not need to be freed, as its life cycle is tied up with
+  // the allocator as a whole.
+  prev_ = reinterpret_cast<Node**>(
+      allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_));
+  for (int i = 0; i < kMaxHeight_; i++) {
+    head_->SetNext(i, nullptr);
+    prev_[i] = head_;
+  }
+}
+
+template <typename Key, class Comparator>
+void SkipList<Key, Comparator>::Insert(const Key& key) {
+  // fast path for sequential insertion
+  if (!KeyIsAfterNode(key, prev_[0]->NoBarrier_Next(0)) &&
+      (prev_[0] == head_ || KeyIsAfterNode(key, prev_[0]))) {
+    assert(prev_[0] != head_ || (prev_height_ == 1 && GetMaxHeight() == 1));
+
+    // Outside of this method prev_[1..max_height_] is the predecessor
+    // of prev_[0], and prev_height_ refers to prev_[0].  Inside Insert
+    // prev_[0..max_height - 1] is the predecessor of key.  Switch from
+    // the external state to the internal
+    for (int i = 1; i < prev_height_; i++) {
+      prev_[i] = prev_[0];
+    }
+  } else {
+    // TODO(opt): we could use a NoBarrier predecessor search as an
+    // optimization for architectures where memory_order_acquire needs
+    // a synchronization instruction.  Doesn't matter on x86
+    FindLessThan(key, prev_);
+  }
+
+  // Our data structure does not allow duplicate insertion
+  assert(prev_[0]->Next(0) == nullptr || !Equal(key, prev_[0]->Next(0)->key));
+
+  int height = RandomHeight();
+  if (height > GetMaxHeight()) {
+    for (int i = GetMaxHeight(); i < height; i++) {
+      prev_[i] = head_;
+    }
+    // fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+    // It is ok to mutate max_height_ without any synchronization
+    // with concurrent readers.  A concurrent reader that observes
+    // the new value of max_height_ will see either the old value of
+    // new level pointers from head_ (nullptr), or a new value set in
+    // the loop below.  In the former case the reader will
+    // immediately drop to the next level since nullptr sorts after all
+    // keys.  In the latter case the reader will use the new node.
+    max_height_.store(height, std::memory_order_relaxed);
+  }
+
+  Node* x = NewNode(key, height);
+  for (int i = 0; i < height; i++) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
+    prev_[i]->SetNext(i, x);
+  }
+  prev_[0] = x;
+  prev_height_ = height;
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key);
+  if (x != nullptr && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memtable/skiplist_test.cc b/src/rocksdb/memtable/skiplist_test.cc
new file mode 100644
index 000000000..a07088511
--- /dev/null
+++ b/src/rocksdb/memtable/skiplist_test.cc
@@ -0,0 +1,387 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "memtable/skiplist.h"
+
+#include <set>
+
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "util/hash.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using Key = uint64_t;
+
+struct TestComparator {
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) {
+      return -1;
+    } else if (a > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
+class SkipTest : public testing::Test {};
+
+TEST_F(SkipTest, Empty) {
+  Arena arena;
+  TestComparator cmp;
+  SkipList<Key, TestComparator> list(cmp, &arena);
+  ASSERT_TRUE(!list.Contains(10));
+
+  SkipList<Key, TestComparator>::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekForPrev(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST_F(SkipTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  Arena arena;
+  TestComparator cmp;
+  SkipList<Key, TestComparator> list(cmp, &arena);
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      list.Insert(key);
+    }
+  }
+
+  for (int i = 0; i < R; i++) {
+    if (list.Contains(i)) {
+      ASSERT_EQ(keys.count(i), 1U);
+    } else {
+      ASSERT_EQ(keys.count(i), 0U);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    SkipList<Key, TestComparator>::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    iter.Seek(0);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekForPrev(R - 1);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), iter.key());
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), iter.key());
+  }
+
+  // Forward iteration test
+  for (int i = 0; i < R; i++) {
+    SkipList<Key, TestComparator>::Iterator iter(&list);
+    iter.Seek(i);
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, iter.key());
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  for (int i = 0; i < R; i++) {
+    SkipList<Key, TestComparator>::Iterator iter(&list);
+    iter.SeekForPrev(i);
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.upper_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.begin()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*--model_iter, iter.key());
+        iter.Prev();
+      }
+    }
+  }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructor.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+  static const uint32_t K = 4;
+
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = {k, g};
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    std::atomic<int> generation[K];
+    void Set(int k, int v) {
+      generation[k].store(v, std::memory_order_release);
+    }
+    int Get(int k) { return generation[k].load(std::memory_order_acquire); }
+
+    State() {
+      for (unsigned int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  Arena arena_;
+
+  // SkipList is not protected by mu_.  We just use a single writer
+  // thread to modify it.
+  SkipList<Key, TestComparator> list_;
+
+ public:
+  ConcurrentTest() : list_(TestComparator(), &arena_) {}
+
+  // REQUIRES: External synchronization
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const int g = current_.Get(k) + 1;
+    const Key new_key = MakeKey(k, g);
+    list_.Insert(new_key);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (unsigned int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    SkipList<Key, TestComparator>::Iterator iter(&list_);
+    iter.Seek(pos);
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = iter.key();
+        ASSERT_TRUE(IsValidKey(current)) << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0U) ||
+                    (gen(pos) > static_cast<uint64_t>(initial_state.Get(
+                                    static_cast<int>(key(pos))))))
+            << "key: " << key(pos) << "; gen: " << gen(pos)
+            << "; initgen: " << initial_state.Get(static_cast<int>(key(pos)));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(new_target);
+        }
+      }
+    }
+  }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST_F(SkipTest, ConcurrentWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+class TestState {
+ public:
+  ConcurrentTest t_;
+  int seed_;
+  std::atomic<bool> quit_flag_;
+
+  enum ReaderState { STARTING, RUNNING, DONE };
+
+  explicit TestState(int s)
+      : seed_(s), quit_flag_(false), state_(STARTING), state_cv_(&mu_) {}
+
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    while (state_ != s) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+ private:
+  port::Mutex mu_;
+  ReaderState state_;
+  port::CondVar state_cv_;
+};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.load(std::memory_order_acquire)) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->SetBackgroundThreads(1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int k = 0; k < kSize; k++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.store(true, std::memory_order_release);
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST_F(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST_F(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST_F(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST_F(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST_F(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/memtable/skiplistrep.cc b/src/rocksdb/memtable/skiplistrep.cc
new file mode 100644
index 000000000..40f13a2c1
--- /dev/null
+++ b/src/rocksdb/memtable/skiplistrep.cc
@@ -0,0 +1,370 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include <random>
+
+#include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/inlineskiplist.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class SkipListRep : public MemTableRep {
+  InlineSkipList<const MemTableRep::KeyComparator&> skip_list_;
+  const MemTableRep::KeyComparator& cmp_;
+  const SliceTransform* transform_;
+  const size_t lookahead_;
+
+  friend class LookaheadIterator;
+
+ public:
+  explicit SkipListRep(const MemTableRep::KeyComparator& compare,
+                       Allocator* allocator, const SliceTransform* transform,
+                       const size_t lookahead)
+      : MemTableRep(allocator),
+        skip_list_(compare, allocator),
+        cmp_(compare),
+        transform_(transform),
+        lookahead_(lookahead) {}
+
+  KeyHandle Allocate(const size_t len, char** buf) override {
+    *buf = skip_list_.AllocateKey(len);
+    return static_cast<KeyHandle>(*buf);
+  }
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(KeyHandle handle) override {
+    skip_list_.Insert(static_cast<char*>(handle));
+  }
+
+  bool InsertKey(KeyHandle handle) override {
+    return skip_list_.Insert(static_cast<char*>(handle));
+  }
+
+  void InsertWithHint(KeyHandle handle, void** hint) override {
+    skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
+  }
+
+  bool InsertKeyWithHint(KeyHandle handle, void** hint) override {
+    return skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
+  }
+
+  void InsertWithHintConcurrently(KeyHandle handle, void** hint) override {
+    skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle), hint);
+  }
+
+  bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override {
+    return skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle),
+                                                 hint);
+  }
+
+  void InsertConcurrently(KeyHandle handle) override {
+    skip_list_.InsertConcurrently(static_cast<char*>(handle));
+  }
+
+  bool InsertKeyConcurrently(KeyHandle handle) override {
+    return skip_list_.InsertConcurrently(static_cast<char*>(handle));
+  }
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const char* key) const override {
+    return skip_list_.Contains(key);
+  }
+
+  size_t ApproximateMemoryUsage() override {
+    // All memory is allocated through allocator; nothing to report here
+    return 0;
+  }
+
+  void Get(const LookupKey& k, void* callback_args,
+           bool (*callback_func)(void* arg, const char* entry)) override {
+    SkipListRep::Iterator iter(&skip_list_);
+    Slice dummy_slice;
+    for (iter.Seek(dummy_slice, k.memtable_key().data());
+         iter.Valid() && callback_func(callback_args, iter.key());
+         iter.Next()) {
+    }
+  }
+
+  uint64_t ApproximateNumEntries(const Slice& start_ikey,
+                                 const Slice& end_ikey) override {
+    std::string tmp;
+    uint64_t start_count =
+        skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey));
+    uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey));
+    return (end_count >= start_count) ? (end_count - start_count) : 0;
+  }
+
+  void UniqueRandomSample(const uint64_t num_entries,
+                          const uint64_t target_sample_size,
+                          std::unordered_set<const char*>* entries) override {
+    entries->clear();
+    // Avoid divide-by-0.
+    assert(target_sample_size > 0);
+    assert(num_entries > 0);
+    // NOTE: the size of entries is not enforced to be exactly
+    // target_sample_size at the end of this function, it might be slightly
+    // greater or smaller.
+    SkipListRep::Iterator iter(&skip_list_);
+    // There are two methods to create the subset of samples (size m)
+    // from the table containing N elements:
+    // 1-Iterate linearly through the N memtable entries. For each entry i,
+    //   add it to the sample set with a probability
+    //   (target_sample_size - entries.size() ) / (N-i).
+    //
+    // 2-Pick m random elements without repetition.
+    // We pick Option 2 when m<sqrt(N) and
+    // Option 1 when m > sqrt(N).
+    if (target_sample_size >
+        static_cast<uint64_t>(std::sqrt(1.0 * num_entries))) {
+      Random* rnd = Random::GetTLSInstance();
+      iter.SeekToFirst();
+      uint64_t counter = 0, num_samples_left = target_sample_size;
+      for (; iter.Valid() && (num_samples_left > 0); iter.Next(), counter++) {
+        // Add entry to sample set with probability
+        // num_samples_left/(num_entries - counter).
+        if (rnd->Next() % (num_entries - counter) < num_samples_left) {
+          entries->insert(iter.key());
+          num_samples_left--;
+        }
+      }
+    } else {
+      // Option 2: pick m random elements with no duplicates.
+      // If Option 2 is picked, then target_sample_size<sqrt(N)
+      // Using a set spares the need to check for duplicates.
+      for (uint64_t i = 0; i < target_sample_size; i++) {
+        // We give it 5 attempts to find a non-duplicate
+        // With 5 attempts, the chances of returning `entries` set
+        // of size target_sample_size is:
+        // PROD_{i=1}^{target_sample_size-1} [1-(i/N)^5]
+        // which is monotonically increasing with N in the worse case
+        // of target_sample_size=sqrt(N), and is always >99.9% for N>4.
+        // At worst, for the final pick , when m=sqrt(N) there is
+        // a probability of p= 1/sqrt(N) chances to find a duplicate.
+        for (uint64_t j = 0; j < 5; j++) {
+          iter.RandomSeek();
+          // unordered_set::insert returns pair<iterator, bool>.
+          // The second element is true if an insert successfully happened.
+          // If element is already in the set, this bool will be false, and
+          // true otherwise.
+          if ((entries->insert(iter.key())).second) {
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  ~SkipListRep() override {}
+
+  // Iteration over the contents of a skip list
+  class Iterator : public MemTableRep::Iterator {
+    InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
+
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(
+        const InlineSkipList<const MemTableRep::KeyComparator&>* list)
+        : iter_(list) {}
+
+    ~Iterator() override {}
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const override { return iter_.Valid(); }
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const char* key() const override { return iter_.key(); }
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next() override { iter_.Next(); }
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev() override { iter_.Prev(); }
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Slice& user_key, const char* memtable_key) override {
+      if (memtable_key != nullptr) {
+        iter_.Seek(memtable_key);
+      } else {
+        iter_.Seek(EncodeKey(&tmp_, user_key));
+      }
+    }
+
+    // Retreat to the last entry with a key <= target
+    void SeekForPrev(const Slice& user_key, const char* memtable_key) override {
+      if (memtable_key != nullptr) {
+        iter_.SeekForPrev(memtable_key);
+      } else {
+        iter_.SeekForPrev(EncodeKey(&tmp_, user_key));
+      }
+    }
+
+    void RandomSeek() override { iter_.RandomSeek(); }
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst() override { iter_.SeekToFirst(); }
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast() override { iter_.SeekToLast(); }
+
+   protected:
+    std::string tmp_;  // For passing to EncodeKey
+  };
+
+  // Iterator over the contents of a skip list which also keeps track of the
+  // previously visited node. In Seek(), it examines a few nodes after it
+  // first, falling back to O(log n) search from the head of the list only if
+  // the target key hasn't been found.
+  class LookaheadIterator : public MemTableRep::Iterator {
+   public:
+    explicit LookaheadIterator(const SkipListRep& rep)
+        : rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {}
+
+    ~LookaheadIterator() override {}
+
+    bool Valid() const override { return iter_.Valid(); }
+
+    const char* key() const override {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    void Next() override {
+      assert(Valid());
+
+      bool advance_prev = true;
+      if (prev_.Valid()) {
+        auto k1 = rep_.UserKey(prev_.key());
+        auto k2 = rep_.UserKey(iter_.key());
+
+        if (k1.compare(k2) == 0) {
+          // same user key, don't move prev_
+          advance_prev = false;
+        } else if (rep_.transform_) {
+          // only advance prev_ if it has the same prefix as iter_
+          auto t1 = rep_.transform_->Transform(k1);
+          auto t2 = rep_.transform_->Transform(k2);
+          advance_prev = t1.compare(t2) == 0;
+        }
+      }
+
+      if (advance_prev) {
+        prev_ = iter_;
+      }
+      iter_.Next();
+    }
+
+    void Prev() override {
+      assert(Valid());
+      iter_.Prev();
+      prev_ = iter_;
+    }
+
+    void Seek(const Slice& internal_key, const char* memtable_key) override {
+      const char* encoded_key = (memtable_key != nullptr)
+                                    ? memtable_key
+                                    : EncodeKey(&tmp_, internal_key);
+
+      if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) {
+        // prev_.key() is smaller or equal to our target key; do a quick
+        // linear search (at most lookahead_ steps) starting from prev_
+        iter_ = prev_;
+
+        size_t cur = 0;
+        while (cur++ <= rep_.lookahead_ && iter_.Valid()) {
+          if (rep_.cmp_(encoded_key, iter_.key()) <= 0) {
+            return;
+          }
+          Next();
+        }
+      }
+
+      iter_.Seek(encoded_key);
+      prev_ = iter_;
+    }
+
+    void SeekForPrev(const Slice& internal_key,
+                     const char* memtable_key) override {
+      const char* encoded_key = (memtable_key != nullptr)
+                                    ? memtable_key
+                                    : EncodeKey(&tmp_, internal_key);
+      iter_.SeekForPrev(encoded_key);
+      prev_ = iter_;
+    }
+
+    void SeekToFirst() override {
+      iter_.SeekToFirst();
+      prev_ = iter_;
+    }
+
+    void SeekToLast() override {
+      iter_.SeekToLast();
+      prev_ = iter_;
+    }
+
+   protected:
+    std::string tmp_;  // For passing to EncodeKey
+
+   private:
+    const SkipListRep& rep_;
+    InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
+    InlineSkipList<const MemTableRep::KeyComparator&>::Iterator prev_;
+  };
+
+  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
+    if (lookahead_ > 0) {
+      void* mem =
+          arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator))
+                :
+                operator new(sizeof(SkipListRep::LookaheadIterator));
+      return new (mem) SkipListRep::LookaheadIterator(*this);
+    } else {
+      void* mem = arena ? arena->AllocateAligned(sizeof(SkipListRep::Iterator))
+                        :
+                        operator new(sizeof(SkipListRep::Iterator));
+      return new (mem) SkipListRep::Iterator(&skip_list_);
+    }
+  }
+};
+}  // namespace
+
+static std::unordered_map<std::string, OptionTypeInfo> skiplist_factory_info = {
+#ifndef ROCKSDB_LITE
+    {"lookahead",
+     {0, OptionType::kSizeT, OptionVerificationType::kNormal,
+      OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}},
+#endif
+};
+
+SkipListFactory::SkipListFactory(size_t lookahead) : lookahead_(lookahead) {
+  RegisterOptions("SkipListFactoryOptions", &lookahead_,
+                  &skiplist_factory_info);
+}
+
+std::string SkipListFactory::GetId() const {
+  std::string id = Name();
+  if (lookahead_ > 0) {
+    id.append(":").append(std::to_string(lookahead_));
+  }
+  return id;
+}
+
+MemTableRep* SkipListFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Allocator* allocator,
+    const SliceTransform* transform, Logger* /*logger*/) {
+  return new SkipListRep(compare, allocator, transform, lookahead_);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memtable/stl_wrappers.h b/src/rocksdb/memtable/stl_wrappers.h
new file mode 100644
index 000000000..783a8088d
--- /dev/null
+++ b/src/rocksdb/memtable/stl_wrappers.h
@@ -0,0 +1,33 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace stl_wrappers {
+
+class Base {
+ protected:
+  const MemTableRep::KeyComparator& compare_;
+  explicit Base(const MemTableRep::KeyComparator& compare)
+      : compare_(compare) {}
+};
+
+struct Compare : private Base {
+  explicit Compare(const MemTableRep::KeyComparator& compare) : Base(compare) {}
+  inline bool operator()(const char* a, const char* b) const {
+    return compare_(a, b) < 0;
+  }
+};
+
+}  // namespace stl_wrappers
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memtable/vectorrep.cc b/src/rocksdb/memtable/vectorrep.cc
new file mode 100644
index 000000000..293163349
--- /dev/null
+++ b/src/rocksdb/memtable/vectorrep.cc
@@ -0,0 +1,309 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <type_traits>
+#include <unordered_set>
+
+#include "db/memtable.h"
+#include "memory/arena.h"
+#include "memtable/stl_wrappers.h"
+#include "port/port.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+class VectorRep : public MemTableRep {
+ public:
+  VectorRep(const KeyComparator& compare, Allocator* allocator, size_t count);
+
+  // Insert key into the collection. (The caller will pack key and value into a
+  // single buffer and pass that in as the parameter to Insert)
+  // REQUIRES: nothing that compares equal to key is currently in the
+  // collection.
+  void Insert(KeyHandle handle) override;
+
+  // Returns true iff an entry that compares equal to key is in the collection.
+  bool Contains(const char* key) const override;
+
+  void MarkReadOnly() override;
+
+  size_t ApproximateMemoryUsage() override;
+
+  void Get(const LookupKey& k, void* callback_args,
+           bool (*callback_func)(void* arg, const char* entry)) override;
+
+  ~VectorRep() override {}
+
+  class Iterator : public MemTableRep::Iterator {
+    class VectorRep* vrep_;
+    std::shared_ptr<std::vector<const char*>> bucket_;
+    std::vector<const char*>::const_iterator mutable cit_;
+    const KeyComparator& compare_;
+    std::string tmp_;  // For passing to EncodeKey
+    bool mutable sorted_;
+    void DoSort() const;
+
+   public:
+    explicit Iterator(class VectorRep* vrep,
+                      std::shared_ptr<std::vector<const char*>> bucket,
+                      const KeyComparator& compare);
+
+    // Initialize an iterator over the specified collection.
+    // The returned iterator is not valid.
+    // explicit Iterator(const MemTableRep* collection);
+    ~Iterator() override{};
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const override;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const char* key() const override;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next() override;
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev() override;
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Slice& user_key, const char* memtable_key) override;
+
+    // Advance to the first entry with a key <= target
+    void SeekForPrev(const Slice& user_key, const char* memtable_key) override;
+
+    // Position at the first entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToFirst() override;
+
+    // Position at the last entry in collection.
+    // Final state of iterator is Valid() iff collection is not empty.
+    void SeekToLast() override;
+  };
+
+  // Return an iterator over the keys in this representation.
+  MemTableRep::Iterator* GetIterator(Arena* arena) override;
+
+ private:
+  friend class Iterator;
+  using Bucket = std::vector<const char*>;
+  std::shared_ptr<Bucket> bucket_;
+  mutable port::RWMutex rwlock_;
+  bool immutable_;
+  bool sorted_;
+  const KeyComparator& compare_;
+};
+
+void VectorRep::Insert(KeyHandle handle) {
+  auto* key = static_cast<char*>(handle);
+  WriteLock l(&rwlock_);
+  assert(!immutable_);
+  bucket_->push_back(key);
+}
+
+// Returns true iff an entry that compares equal to key is in the collection.
+bool VectorRep::Contains(const char* key) const {
+  ReadLock l(&rwlock_);
+  return std::find(bucket_->begin(), bucket_->end(), key) != bucket_->end();
+}
+
+void VectorRep::MarkReadOnly() {
+  WriteLock l(&rwlock_);
+  immutable_ = true;
+}
+
+size_t VectorRep::ApproximateMemoryUsage() {
+  return sizeof(bucket_) + sizeof(*bucket_) +
+         bucket_->size() *
+             sizeof(
+                 std::remove_reference<decltype(*bucket_)>::type::value_type);
+}
+
+VectorRep::VectorRep(const KeyComparator& compare, Allocator* allocator,
+                     size_t count)
+    : MemTableRep(allocator),
+      bucket_(new Bucket()),
+      immutable_(false),
+      sorted_(false),
+      compare_(compare) {
+  bucket_.get()->reserve(count);
+}
+
+VectorRep::Iterator::Iterator(class VectorRep* vrep,
+                              std::shared_ptr<std::vector<const char*>> bucket,
+                              const KeyComparator& compare)
+    : vrep_(vrep),
+      bucket_(bucket),
+      cit_(bucket_->end()),
+      compare_(compare),
+      sorted_(false) {}
+
+void VectorRep::Iterator::DoSort() const {
+  // vrep is non-null means that we are working on an immutable memtable
+  if (!sorted_ && vrep_ != nullptr) {
+    WriteLock l(&vrep_->rwlock_);
+    if (!vrep_->sorted_) {
+      std::sort(bucket_->begin(), bucket_->end(),
+                stl_wrappers::Compare(compare_));
+      cit_ = bucket_->begin();
+      vrep_->sorted_ = true;
+    }
+    sorted_ = true;
+  }
+  if (!sorted_) {
+    std::sort(bucket_->begin(), bucket_->end(),
+              stl_wrappers::Compare(compare_));
+    cit_ = bucket_->begin();
+    sorted_ = true;
+  }
+  assert(sorted_);
+  assert(vrep_ == nullptr || vrep_->sorted_);
+}
+
+// Returns true iff the iterator is positioned at a valid node.
+bool VectorRep::Iterator::Valid() const {
+  DoSort();
+  return cit_ != bucket_->end();
+}
+
+// Returns the key at the current position.
+// REQUIRES: Valid()
+const char* VectorRep::Iterator::key() const {
+  assert(sorted_);
+  return *cit_;
+}
+
+// Advances to the next position.
+// REQUIRES: Valid()
+void VectorRep::Iterator::Next() {
+  assert(sorted_);
+  if (cit_ == bucket_->end()) {
+    return;
+  }
+  ++cit_;
+}
+
+// Advances to the previous position.
+// REQUIRES: Valid()
+void VectorRep::Iterator::Prev() {
+  assert(sorted_);
+  if (cit_ == bucket_->begin()) {
+    // If you try to go back from the first element, the iterator should be
+    // invalidated. So we set it to past-the-end. This means that you can
+    // treat the container circularly.
+    cit_ = bucket_->end();
+  } else {
+    --cit_;
+  }
+}
+
+// Advance to the first entry with a key >= target
+void VectorRep::Iterator::Seek(const Slice& user_key,
+                               const char* memtable_key) {
+  DoSort();
+  // Do binary search to find first value not less than the target
+  const char* encoded_key =
+      (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key);
+  cit_ = std::equal_range(bucket_->begin(), bucket_->end(), encoded_key,
+                          [this](const char* a, const char* b) {
+                            return compare_(a, b) < 0;
+                          })
+             .first;
+}
+
+// Advance to the first entry with a key <= target
+void VectorRep::Iterator::SeekForPrev(const Slice& /*user_key*/,
+                                      const char* /*memtable_key*/) {
+  assert(false);
+}
+
+// Position at the first entry in collection.
+// Final state of iterator is Valid() iff collection is not empty.
+void VectorRep::Iterator::SeekToFirst() {
+  DoSort();
+  cit_ = bucket_->begin();
+}
+
+// Position at the last entry in collection.
+// Final state of iterator is Valid() iff collection is not empty.
+void VectorRep::Iterator::SeekToLast() {
+  DoSort();
+  cit_ = bucket_->end();
+  if (bucket_->size() != 0) {
+    --cit_;
+  }
+}
+
+void VectorRep::Get(const LookupKey& k, void* callback_args,
+                    bool (*callback_func)(void* arg, const char* entry)) {
+  rwlock_.ReadLock();
+  VectorRep* vector_rep;
+  std::shared_ptr<Bucket> bucket;
+  if (immutable_) {
+    vector_rep = this;
+  } else {
+    vector_rep = nullptr;
+    bucket.reset(new Bucket(*bucket_));  // make a copy
+  }
+  VectorRep::Iterator iter(vector_rep, immutable_ ? bucket_ : bucket, compare_);
+  rwlock_.ReadUnlock();
+
+  for (iter.Seek(k.user_key(), k.memtable_key().data());
+       iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) {
+  }
+}
+
+MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) {
+  char* mem = nullptr;
+  if (arena != nullptr) {
+    mem = arena->AllocateAligned(sizeof(Iterator));
+  }
+  ReadLock l(&rwlock_);
+  // Do not sort here. The sorting would be done the first time
+  // a Seek is performed on the iterator.
+  if (immutable_) {
+    if (arena == nullptr) {
+      return new Iterator(this, bucket_, compare_);
+    } else {
+      return new (mem) Iterator(this, bucket_, compare_);
+    }
+  } else {
+    std::shared_ptr<Bucket> tmp;
+    tmp.reset(new Bucket(*bucket_));  // make a copy
+    if (arena == nullptr) {
+      return new Iterator(nullptr, tmp, compare_);
+    } else {
+      return new (mem) Iterator(nullptr, tmp, compare_);
+    }
+  }
+}
+}  // namespace
+
+static std::unordered_map<std::string, OptionTypeInfo> vector_rep_table_info = {
+    {"count",
+     {0, OptionType::kSizeT, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+};
+
+VectorRepFactory::VectorRepFactory(size_t count) : count_(count) {
+  RegisterOptions("VectorRepFactoryOptions", &count_, &vector_rep_table_info);
+}
+
+MemTableRep* VectorRepFactory::CreateMemTableRep(
+    const MemTableRep::KeyComparator& compare, Allocator* allocator,
+    const SliceTransform*, Logger* /*logger*/) {
+  return new VectorRep(compare, allocator, count_);
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/memtable/write_buffer_manager.cc b/src/rocksdb/memtable/write_buffer_manager.cc
new file mode 100644
index 000000000..8db9816be
--- /dev/null
+++ b/src/rocksdb/memtable/write_buffer_manager.cc
@@ -0,0 +1,202 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/write_buffer_manager.h"
+
+#include <memory>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/status.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+WriteBufferManager::WriteBufferManager(size_t _buffer_size,
+                                       std::shared_ptr<Cache> cache,
+                                       bool allow_stall)
+    : buffer_size_(_buffer_size),
+      mutable_limit_(buffer_size_ * 7 / 8),
+      memory_used_(0),
+      memory_active_(0),
+      cache_res_mgr_(nullptr),
+      allow_stall_(allow_stall),
+      stall_active_(false) {
+#ifndef ROCKSDB_LITE
+  if (cache) {
+    // Memtable's memory usage tends to fluctuate frequently
+    // therefore we set delayed_decrease = true to save some dummy entry
+    // insertion on memory increase right after memory decrease
+    cache_res_mgr_ = std::make_shared<
+        CacheReservationManagerImpl<CacheEntryRole::kWriteBuffer>>(
+        cache, true /* delayed_decrease */);
+  }
+#else
+  (void)cache;
+#endif  // ROCKSDB_LITE
+}
+
+WriteBufferManager::~WriteBufferManager() {
+#ifndef NDEBUG
+  std::unique_lock<std::mutex> lock(mu_);
+  assert(queue_.empty());
+#endif
+}
+
+std::size_t WriteBufferManager::dummy_entries_in_cache_usage() const {
+  if (cache_res_mgr_ != nullptr) {
+    return cache_res_mgr_->GetTotalReservedCacheSize();
+  } else {
+    return 0;
+  }
+}
+
+void WriteBufferManager::ReserveMem(size_t mem) {
+  if (cache_res_mgr_ != nullptr) {
+    ReserveMemWithCache(mem);
+  } else if (enabled()) {
+    memory_used_.fetch_add(mem, std::memory_order_relaxed);
+  }
+  if (enabled()) {
+    memory_active_.fetch_add(mem, std::memory_order_relaxed);
+  }
+}
+
+// Should only be called from write thread
+void WriteBufferManager::ReserveMemWithCache(size_t mem) {
+#ifndef ROCKSDB_LITE
+  assert(cache_res_mgr_ != nullptr);
+  // Use a mutex to protect various data structures. Can be optimized to a
+  // lock-free solution if it ends up with a performance bottleneck.
+  std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+
+  size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) + mem;
+  memory_used_.store(new_mem_used, std::memory_order_relaxed);
+  Status s = cache_res_mgr_->UpdateCacheReservation(new_mem_used);
+
+  // We absorb the error since WriteBufferManager is not able to handle
+  // this failure properly. Ideallly we should prevent this allocation
+  // from happening if this cache charging fails.
+  // [TODO] We'll need to improve it in the future and figure out what to do on
+  // error
+  s.PermitUncheckedError();
+#else
+  (void)mem;
+#endif  // ROCKSDB_LITE
+}
+
+void WriteBufferManager::ScheduleFreeMem(size_t mem) {
+  if (enabled()) {
+    memory_active_.fetch_sub(mem, std::memory_order_relaxed);
+  }
+}
+
+void WriteBufferManager::FreeMem(size_t mem) {
+  if (cache_res_mgr_ != nullptr) {
+    FreeMemWithCache(mem);
+  } else if (enabled()) {
+    memory_used_.fetch_sub(mem, std::memory_order_relaxed);
+  }
+  // Check if stall is active and can be ended.
+  MaybeEndWriteStall();
+}
+
+void WriteBufferManager::FreeMemWithCache(size_t mem) {
+#ifndef ROCKSDB_LITE
+  assert(cache_res_mgr_ != nullptr);
+  // Use a mutex to protect various data structures. Can be optimized to a
+  // lock-free solution if it ends up with a performance bottleneck.
+  std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
+  size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem;
+  memory_used_.store(new_mem_used, std::memory_order_relaxed);
+  Status s = cache_res_mgr_->UpdateCacheReservation(new_mem_used);
+
+  // We absorb the error since WriteBufferManager is not able to handle
+  // this failure properly.
+  // [TODO] We'll need to improve it in the future and figure out what to do on
+  // error
+  s.PermitUncheckedError();
+#else
+  (void)mem;
+#endif  // ROCKSDB_LITE
+}
+
+void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) {
+  assert(wbm_stall != nullptr);
+  assert(allow_stall_);
+
+  // Allocate outside of the lock.
+  std::list<StallInterface*> new_node = {wbm_stall};
+
+  {
+    std::unique_lock<std::mutex> lock(mu_);
+    // Verify if the stall conditions are stil active.
+    if (ShouldStall()) {
+      stall_active_.store(true, std::memory_order_relaxed);
+      queue_.splice(queue_.end(), std::move(new_node));
+    }
+  }
+
+  // If the node was not consumed, the stall has ended already and we can signal
+  // the caller.
+  if (!new_node.empty()) {
+    new_node.front()->Signal();
+  }
+}
+
+// Called when memory is freed in FreeMem or the buffer size has changed.
+void WriteBufferManager::MaybeEndWriteStall() {
+  // Cannot early-exit on !enabled() because SetBufferSize(0) needs to unblock
+  // the writers.
+  if (!allow_stall_) {
+    return;
+  }
+
+  if (IsStallThresholdExceeded()) {
+    return;  // Stall conditions have not resolved.
+  }
+
+  // Perform all deallocations outside of the lock.
+  std::list<StallInterface*> cleanup;
+
+  std::unique_lock<std::mutex> lock(mu_);
+  if (!stall_active_.load(std::memory_order_relaxed)) {
+    return;  // Nothing to do.
+  }
+
+  // Unblock new writers.
+  stall_active_.store(false, std::memory_order_relaxed);
+
+  // Unblock the writers in the queue.
+  for (StallInterface* wbm_stall : queue_) {
+    wbm_stall->Signal();
+  }
+  cleanup = std::move(queue_);
+}
+
+void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) {
+  assert(wbm_stall != nullptr);
+
+  // Deallocate the removed nodes outside of the lock.
+  std::list<StallInterface*> cleanup;
+
+  if (enabled() && allow_stall_) {
+    std::unique_lock<std::mutex> lock(mu_);
+    for (auto it = queue_.begin(); it != queue_.end();) {
+      auto next = std::next(it);
+      if (*it == wbm_stall) {
+        cleanup.splice(cleanup.end(), queue_, std::move(it));
+      }
+      it = next;
+    }
+  }
+  wbm_stall->Signal();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/memtable/write_buffer_manager_test.cc b/src/rocksdb/memtable/write_buffer_manager_test.cc
new file mode 100644
index 000000000..1cc4c2cc5
--- /dev/null
+++ b/src/rocksdb/memtable/write_buffer_manager_test.cc
@@ -0,0 +1,305 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/write_buffer_manager.h"
+
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+class WriteBufferManagerTest : public testing::Test {};
+
+#ifndef ROCKSDB_LITE
+const size_t kSizeDummyEntry = 256 * 1024;
+
+TEST_F(WriteBufferManagerTest, ShouldFlush) {
+  // A write buffer manager of size 10MB
+  std::unique_ptr<WriteBufferManager> wbf(
+      new WriteBufferManager(10 * 1024 * 1024));
+
+  wbf->ReserveMem(8 * 1024 * 1024);
+  ASSERT_FALSE(wbf->ShouldFlush());
+  // 90% of the hard limit will hit the condition
+  wbf->ReserveMem(1 * 1024 * 1024);
+  ASSERT_TRUE(wbf->ShouldFlush());
+  // Scheduling for freeing will release the condition
+  wbf->ScheduleFreeMem(1 * 1024 * 1024);
+  ASSERT_FALSE(wbf->ShouldFlush());
+
+  wbf->ReserveMem(2 * 1024 * 1024);
+  ASSERT_TRUE(wbf->ShouldFlush());
+
+  wbf->ScheduleFreeMem(4 * 1024 * 1024);
+  // 11MB total, 6MB mutable. hard limit still hit
+  ASSERT_TRUE(wbf->ShouldFlush());
+
+  wbf->ScheduleFreeMem(2 * 1024 * 1024);
+  // 11MB total, 4MB mutable. hard limit stills but won't flush because more
+  // than half data is already being flushed.
+  ASSERT_FALSE(wbf->ShouldFlush());
+
+  wbf->ReserveMem(4 * 1024 * 1024);
+  // 15 MB total, 8MB mutable.
+  ASSERT_TRUE(wbf->ShouldFlush());
+
+  wbf->FreeMem(7 * 1024 * 1024);
+  // 8MB total, 8MB mutable.
+  ASSERT_FALSE(wbf->ShouldFlush());
+
+  // change size: 8M limit, 7M mutable limit
+  wbf->SetBufferSize(8 * 1024 * 1024);
+  // 8MB total, 8MB mutable.
+  ASSERT_TRUE(wbf->ShouldFlush());
+
+  wbf->ScheduleFreeMem(2 * 1024 * 1024);
+  // 8MB total, 6MB mutable.
+  ASSERT_TRUE(wbf->ShouldFlush());
+
+  wbf->FreeMem(2 * 1024 * 1024);
+  // 6MB total, 6MB mutable.
+  ASSERT_FALSE(wbf->ShouldFlush());
+
+  wbf->ReserveMem(1 * 1024 * 1024);
+  // 7MB total, 7MB mutable.
+  ASSERT_FALSE(wbf->ShouldFlush());
+
+  wbf->ReserveMem(1 * 1024 * 1024);
+  // 8MB total, 8MB mutable.
+  ASSERT_TRUE(wbf->ShouldFlush());
+
+  wbf->ScheduleFreeMem(1 * 1024 * 1024);
+  wbf->FreeMem(1 * 1024 * 1024);
+  // 7MB total, 7MB mutable.
+  ASSERT_FALSE(wbf->ShouldFlush());
+}
+
+class ChargeWriteBufferTest : public testing::Test {};
+
+TEST_F(ChargeWriteBufferTest, Basic) {
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+
+  LRUCacheOptions co;
+  // 1GB cache
+  co.capacity = 1024 * 1024 * 1024;
+  co.num_shard_bits = 4;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = NewLRUCache(co);
+  // A write buffer manager of size 50MB
+  std::unique_ptr<WriteBufferManager> wbf(
+      new WriteBufferManager(50 * 1024 * 1024, cache));
+
+  // Allocate 333KB will allocate 512KB, memory_used_ = 333KB
+  wbf->ReserveMem(333 * 1024);
+  // 2 dummy entries are added for size 333 KB
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 2 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 2 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 2 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  // Allocate another 512KB, memory_used_ = 845KB
+  wbf->ReserveMem(512 * 1024);
+  // 2 more dummy entries are added for size 512 KB
+  // since ceil((memory_used_ - dummy_entries_in_cache_usage) % kSizeDummyEntry)
+  // = 2
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  // Allocate another 10MB, memory_used_ = 11085KB
+  wbf->ReserveMem(10 * 1024 * 1024);
+  // 40 more entries are added for size 10 * 1024 * 1024 KB
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  // Free 1MB, memory_used_ = 10061KB
+  // It will not cause any change in cache cost
+  // since memory_used_ > dummy_entries_in_cache_usage * (3/4)
+  wbf->FreeMem(1 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead);
+  ASSERT_FALSE(wbf->ShouldFlush());
+
+  // Allocate another 41MB, memory_used_ = 52045KB
+  wbf->ReserveMem(41 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            204 * 256 * 1024 + kMetaDataChargeOverhead);
+  ASSERT_TRUE(wbf->ShouldFlush());
+
+  ASSERT_TRUE(wbf->ShouldFlush());
+
+  // Schedule free 20MB, memory_used_ = 52045KB
+  // It will not cause any change in memory_used and cache cost
+  wbf->ScheduleFreeMem(20 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            204 * 256 * 1024 + kMetaDataChargeOverhead);
+  // Still need flush as the hard limit hits
+  ASSERT_TRUE(wbf->ShouldFlush());
+
+  // Free 20MB, memory_used_ = 31565KB
+  // It will releae 80 dummy entries from cache since
+  // since memory_used_ < dummy_entries_in_cache_usage * (3/4)
+  // and floor((dummy_entries_in_cache_usage - memory_used_) % kSizeDummyEntry)
+  // = 80
+  wbf->FreeMem(20 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 124 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 124 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            124 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  ASSERT_FALSE(wbf->ShouldFlush());
+
+  // Free 16KB, memory_used_ = 31549KB
+  // It will not release any dummy entry since memory_used_ >=
+  // dummy_entries_in_cache_usage * (3/4)
+  wbf->FreeMem(16 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 124 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 124 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            124 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  // Free 20MB, memory_used_ = 11069KB
+  // It will releae 80 dummy entries from cache
+  // since memory_used_ < dummy_entries_in_cache_usage * (3/4)
+  // and floor((dummy_entries_in_cache_usage - memory_used_) % kSizeDummyEntry)
+  // = 80
+  wbf->FreeMem(20 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  // Free 1MB, memory_used_ = 10045KB
+  // It will not cause any change in cache cost
+  // since memory_used_ > dummy_entries_in_cache_usage * (3/4)
+  wbf->FreeMem(1 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  // Reserve 512KB, memory_used_ = 10557KB
+  // It will not casue any change in cache cost
+  // since memory_used_ > dummy_entries_in_cache_usage * (3/4)
+  // which reflects the benefit of saving dummy entry insertion on memory
+  // reservation after delay decrease
+  wbf->ReserveMem(512 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  // Destroy write buffer manger should free everything
+  wbf.reset();
+  ASSERT_EQ(cache->GetPinnedUsage(), 0);
+}
+
+TEST_F(ChargeWriteBufferTest, BasicWithNoBufferSizeLimit) {
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+  // 1GB cache
+  std::shared_ptr<Cache> cache = NewLRUCache(1024 * 1024 * 1024, 4);
+  // A write buffer manager of size 256MB
+  std::unique_ptr<WriteBufferManager> wbf(new WriteBufferManager(0, cache));
+
+  // Allocate 10MB,  memory_used_ = 10240KB
+  // It will allocate 40 dummy entries
+  wbf->ReserveMem(10 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 40 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 40 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  ASSERT_FALSE(wbf->ShouldFlush());
+
+  // Free 9MB,  memory_used_ = 1024KB
+  // It will free 36 dummy entries
+  wbf->FreeMem(9 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead);
+
+  // Free 160KB gradually, memory_used_ = 864KB
+  // It will not cause any change
+  // since memory_used_ > dummy_entries_in_cache_usage * 3/4
+  for (int i = 0; i < 40; i++) {
+    wbf->FreeMem(4 * 1024);
+  }
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024);
+  ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead);
+}
+
+TEST_F(ChargeWriteBufferTest, BasicWithCacheFull) {
+  constexpr std::size_t kMetaDataChargeOverhead = 20000;
+
+  // 12MB cache size with strict capacity
+  LRUCacheOptions lo;
+  lo.capacity = 12 * 1024 * 1024;
+  lo.num_shard_bits = 0;
+  lo.strict_capacity_limit = true;
+  std::shared_ptr<Cache> cache = NewLRUCache(lo);
+  std::unique_ptr<WriteBufferManager> wbf(new WriteBufferManager(0, cache));
+
+  // Allocate 10MB, memory_used_ = 10240KB
+  wbf->ReserveMem(10 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 40 * kSizeDummyEntry);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            40 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  // Allocate 10MB, memory_used_ = 20480KB
+  // Some dummy entry insertion will fail due to full cache
+  wbf->ReserveMem(10 * 1024 * 1024);
+  ASSERT_GE(cache->GetPinnedUsage(), 40 * kSizeDummyEntry);
+  ASSERT_LE(cache->GetPinnedUsage(), 12 * 1024 * 1024);
+  ASSERT_LT(wbf->dummy_entries_in_cache_usage(), 80 * kSizeDummyEntry);
+
+  // Free 15MB after encoutering cache full, memory_used_ = 5120KB
+  wbf->FreeMem(15 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 20 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 20 * kSizeDummyEntry);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            20 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  // Reserve 15MB, creating cache full again, memory_used_ = 20480KB
+  wbf->ReserveMem(15 * 1024 * 1024);
+  ASSERT_LE(cache->GetPinnedUsage(), 12 * 1024 * 1024);
+  ASSERT_LT(wbf->dummy_entries_in_cache_usage(), 80 * kSizeDummyEntry);
+
+  // Increase capacity so next insert will fully succeed
+  cache->SetCapacity(40 * 1024 * 1024);
+
+  // Allocate 10MB, memory_used_ = 30720KB
+  wbf->ReserveMem(10 * 1024 * 1024);
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 120 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 120 * kSizeDummyEntry);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            120 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  // Gradually release 20 MB
+  // It ended up sequentially releasing 32, 24, 18 dummy entries when
+  // memory_used_ decreases to 22528KB, 16384KB, 11776KB.
+  // In total, it releases 74 dummy entries
+  for (int i = 0; i < 40; i++) {
+    wbf->FreeMem(512 * 1024);
+  }
+
+  ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 46 * kSizeDummyEntry);
+  ASSERT_GE(cache->GetPinnedUsage(), 46 * kSizeDummyEntry);
+  ASSERT_LT(cache->GetPinnedUsage(),
+            46 * kSizeDummyEntry + kMetaDataChargeOverhead);
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/microbench/CMakeLists.txt b/src/rocksdb/microbench/CMakeLists.txt
new file mode 100644
index 000000000..483e97973
--- /dev/null
+++ b/src/rocksdb/microbench/CMakeLists.txt
@@ -0,0 +1,17 @@
+find_package(benchmark REQUIRED)
+find_package(Threads REQUIRED)
+
+file(GLOB_RECURSE ALL_BENCH_CPP *.cc)
+foreach(ONE_BENCH_CPP ${ALL_BENCH_CPP})
+  get_filename_component(TARGET_NAME ${ONE_BENCH_CPP} NAME_WE)
+  add_executable(${TARGET_NAME} ${ONE_BENCH_CPP})
+  target_link_libraries(${TARGET_NAME} ${ROCKSDB_LIB} benchmark::benchmark
+          ${CMAKE_THREAD_LIBS_INIT})
+  # run benchmark like a test, if added, the benchmark tests could be run by `ctest -R Bench_`
+  # add_test(Bench_${TARGET_NAME} ${TARGET_NAME})
+  list(APPEND ALL_BENCH_TARGETS ${TARGET_NAME})
+endforeach()
+add_custom_target(microbench DEPENDS ${ALL_BENCH_TARGETS})
+add_custom_target(run_microbench
+        COMMAND for t in ${ALL_BENCH_TARGETS}\; do \.\/$$t \|\| exit 1\; done
+        DEPENDS ${ALL_BENCH_TARGETS})
diff --git a/src/rocksdb/microbench/README.md b/src/rocksdb/microbench/README.md
new file mode 100644
index 000000000..290ca58d7
--- /dev/null
+++ b/src/rocksdb/microbench/README.md
@@ -0,0 +1,60 @@
+# RocksDB Micro-Benchmark
+
+## Overview
+
+RocksDB micro-benchmark is a set of tests for benchmarking a single component or simple DB operations. The test artificially generates input data and executes the same operation with it to collect and report performance metrics. As it's focusing on testing a single, well-defined operation, the result is more precise and reproducible, which also has its limitation of not representing a real production use case. The test author needs to carefully design the microbench to represent its true purpose.
+
+The tests are based on [Google Benchmark](https://github.com/google/benchmark) library, which provides a standard framework for writing benchmarks.
+
+## How to Run
+### Prerequisite
+Install the [Google Benchmark](https://github.com/google/benchmark) version `1.6.0` or above.
+
+*Note: Google Benchmark `1.6.x` is incompatible with previous versions like `1.5.x`, please make sure you're using the newer version.*
+
+### Build and Run
+With `Makefile`:
+```bash
+$ DEBUG_LEVEL=0 make run_microbench
+```
+Or with cmake:
+```bash
+$ mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_BENCHMARK
+$ make run_microbench
+```
+
+*Note: Please run the benchmark code in release build.*
+### Run Single Test
+Example:
+```bash
+$ make db_basic_bench
+$ ./db_basic_bench --benchmark_filter=<TEST_NAME>
+```
+
+## Best Practices
+#### * Use the Same Test Directory Setting as Unittest
+Most of the Micro-benchmark tests use the same test directory setup as unittest, so it could be overridden by:
+```bash
+$ TEST_TMPDIR=/mydata/tmp/ ./db_basic_bench --benchmark_filter=<TEST_NAME>
+```
+Please also follow that when designing new tests.
+
+#### * Avoid Using Debug API
+Even though micro-benchmark is a test, avoid using internal Debug API like TEST_WaitForRun() which is designed for unittest. As benchmark tests are designed for release build, don't use any of that.
+
+#### * Pay Attention to Local Optimization
+As a micro-benchmark is focusing on a single component or area, make sure it is a key part for impacting the overall application performance.
+
+The compiler might be able to optimize the code that not the same way as the whole application, and if the test data input is simple and small, it may be able to all cached in CPU memory, which is leading to a wrong metric. Take these into consideration when designing the tests.
+
+#### * Names of user-defined counters/metrics has to be `[A-Za-z0-9_]`
+It's a restriction of the metrics collecting and reporting system RocksDB is using internally. It will also help integrate with more systems.
+
+#### * Minimize the Metrics Variation
+Try reducing the test result variation, one way to check that is running the test multiple times and check the CV (Coefficient of Variation) reported by gbenchmark.
+```bash
+$ ./db_basic_bench --benchmark_filter=<TEST_NAME> --benchmark_repetitions=10
+...
+<TEST_NAME>_cv    3.2%
+```
+RocksDB has background compaction jobs which may cause the test result to vary a lot. If the micro-benchmark is not purposely testing the operation while compaction is in progress, it should wait for the compaction to finish (`db_impl->WaitForCompact()`) or disable auto-compaction.
diff --git a/src/rocksdb/microbench/db_basic_bench.cc b/src/rocksdb/microbench/db_basic_bench.cc
new file mode 100644
index 000000000..6c70ad21d
--- /dev/null
+++ b/src/rocksdb/microbench/db_basic_bench.cc
@@ -0,0 +1,1575 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif  // ! OS_WIN
+
+#include "benchmark/benchmark.h"
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_builder.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class KeyGenerator {
+ public:
+  // Generate next key
+  // buff: the caller needs to make sure there's enough space for generated key
+  // offset: to control the group of the key, 0 means normal key, 1 means
+  // non-existing key, 2 is reserved prefix_only: only return a prefix
+  Slice Next(char* buff, int8_t offset = 0, bool prefix_only = false) {
+    assert(max_key_ < std::numeric_limits<uint32_t>::max() /
+                          MULTIPLIER);  // TODO: add large key support
+
+    uint32_t k;
+    if (is_sequential_) {
+      assert(next_sequential_key_ < max_key_);
+      k = (next_sequential_key_ % max_key_) * MULTIPLIER + offset;
+      if (next_sequential_key_ + 1 == max_key_) {
+        next_sequential_key_ = 0;
+      } else {
+        next_sequential_key_++;
+      }
+    } else {
+      k = (rnd_->Next() % max_key_) * MULTIPLIER + offset;
+    }
+    // TODO: make sure the buff is large enough
+    memset(buff, 0, key_size_);
+    if (prefix_num_ > 0) {
+      uint32_t prefix = (k % prefix_num_) * MULTIPLIER + offset;
+      Encode(buff, prefix);
+      if (prefix_only) {
+        return {buff, prefix_size_};
+      }
+    }
+    Encode(buff + prefix_size_, k);
+    return {buff, key_size_};
+  }
+
+  // use internal buffer for generated key, make sure there's only one caller in
+  // single thread
+  Slice Next() { return Next(buff_); }
+
+  // user internal buffer for generated prefix
+  Slice NextPrefix() {
+    assert(prefix_num_ > 0);
+    return Next(buff_, 0, true);
+  }
+
+  // helper function to get non exist key
+  Slice NextNonExist() { return Next(buff_, 1); }
+
+  Slice MaxKey(char* buff) const {
+    memset(buff, 0xff, key_size_);
+    return {buff, key_size_};
+  }
+
+  Slice MinKey(char* buff) const {
+    memset(buff, 0, key_size_);
+    return {buff, key_size_};
+  }
+
+  // max_key: the max key that it could generate
+  // prefix_num: the max prefix number
+  // key_size: in bytes
+  explicit KeyGenerator(Random* rnd, uint64_t max_key = 100 * 1024 * 1024,
+                        size_t prefix_num = 0, size_t key_size = 10) {
+    prefix_num_ = prefix_num;
+    key_size_ = key_size;
+    max_key_ = max_key;
+    rnd_ = rnd;
+    if (prefix_num > 0) {
+      prefix_size_ = 4;  // TODO: support different prefix_size
+    }
+  }
+
+  // generate sequential keys
+  explicit KeyGenerator(uint64_t max_key = 100 * 1024 * 1024,
+                        size_t key_size = 10) {
+    key_size_ = key_size;
+    max_key_ = max_key;
+    rnd_ = nullptr;
+    is_sequential_ = true;
+  }
+
+ private:
+  Random* rnd_;
+  size_t prefix_num_ = 0;
+  size_t prefix_size_ = 0;
+  size_t key_size_;
+  uint64_t max_key_;
+  bool is_sequential_ = false;
+  uint32_t next_sequential_key_ = 0;
+  char buff_[256] = {0};
+  const int MULTIPLIER = 3;
+
+  void static Encode(char* buf, uint32_t value) {
+    if (port::kLittleEndian) {
+      buf[0] = static_cast<char>((value >> 24) & 0xff);
+      buf[1] = static_cast<char>((value >> 16) & 0xff);
+      buf[2] = static_cast<char>((value >> 8) & 0xff);
+      buf[3] = static_cast<char>(value & 0xff);
+    } else {
+      memcpy(buf, &value, sizeof(value));
+    }
+  }
+};
+
+static void SetupDB(benchmark::State& state, Options& options,
+                    std::unique_ptr<DB>* db,
+                    const std::string& test_name = "") {
+  options.create_if_missing = true;
+  auto env = Env::Default();
+  std::string db_path;
+  Status s = env->GetTestDirectory(&db_path);
+  if (!s.ok()) {
+    state.SkipWithError(s.ToString().c_str());
+    return;
+  }
+  std::string db_name =
+      db_path + kFilePathSeparator + test_name + std::to_string(getpid());
+  DestroyDB(db_name, options);
+
+  DB* db_ptr = nullptr;
+  s = DB::Open(options, db_name, &db_ptr);
+  if (!s.ok()) {
+    state.SkipWithError(s.ToString().c_str());
+    return;
+  }
+  db->reset(db_ptr);
+}
+
+static void TeardownDB(benchmark::State& state, const std::unique_ptr<DB>& db,
+                       const Options& options, KeyGenerator& kg) {
+  char min_buff[256], max_buff[256];
+  const Range r(kg.MinKey(min_buff), kg.MaxKey(max_buff));
+  uint64_t size;
+  Status s = db->GetApproximateSizes(&r, 1, &size);
+  if (!s.ok()) {
+    state.SkipWithError(s.ToString().c_str());
+  }
+  state.counters["db_size"] = static_cast<double>(size);
+
+  std::string db_name = db->GetName();
+  s = db->Close();
+  if (!s.ok()) {
+    state.SkipWithError(s.ToString().c_str());
+  }
+  DestroyDB(db_name, options);
+}
+
+static void DBOpen(benchmark::State& state) {
+  // create DB
+  std::unique_ptr<DB> db;
+  Options options;
+  SetupDB(state, options, &db, "DBOpen");
+
+  std::string db_name = db->GetName();
+  db->Close();
+
+  options.create_if_missing = false;
+
+  auto rnd = Random(123);
+
+  for (auto _ : state) {
+    {
+      DB* db_ptr = nullptr;
+      Status s = DB::Open(options, db_name, &db_ptr);
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+      db.reset(db_ptr);
+    }
+    state.PauseTiming();
+    auto wo = WriteOptions();
+    Status s;
+    for (int i = 0; i < 2; i++) {
+      for (int j = 0; j < 100; j++) {
+        s = db->Put(wo, rnd.RandomString(10), rnd.RandomString(100));
+        if (!s.ok()) {
+          state.SkipWithError(s.ToString().c_str());
+        }
+      }
+      s = db->Flush(FlushOptions());
+    }
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+    s = db->Close();
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+    state.ResumeTiming();
+  }
+  DestroyDB(db_name, options);
+}
+
+BENCHMARK(DBOpen)->Iterations(200);  // specify iteration number as the db size
+                                     // is impacted by iteration number
+
+static void DBClose(benchmark::State& state) {
+  // create DB
+  std::unique_ptr<DB> db;
+  Options options;
+  SetupDB(state, options, &db, "DBClose");
+
+  std::string db_name = db->GetName();
+  db->Close();
+
+  options.create_if_missing = false;
+
+  auto rnd = Random(12345);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    {
+      DB* db_ptr = nullptr;
+      Status s = DB::Open(options, db_name, &db_ptr);
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+      db.reset(db_ptr);
+    }
+    auto wo = WriteOptions();
+    Status s;
+    for (int i = 0; i < 2; i++) {
+      for (int j = 0; j < 100; j++) {
+        s = db->Put(wo, rnd.RandomString(10), rnd.RandomString(100));
+        if (!s.ok()) {
+          state.SkipWithError(s.ToString().c_str());
+        }
+      }
+      s = db->Flush(FlushOptions());
+    }
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+    state.ResumeTiming();
+    s = db->Close();
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+  }
+  DestroyDB(db_name, options);
+}
+
+BENCHMARK(DBClose)->Iterations(200);  // specify iteration number as the db size
+                                      // is impacted by iteration number
+
+static void DBPut(benchmark::State& state) {
+  auto compaction_style = static_cast<CompactionStyle>(state.range(0));
+  uint64_t max_data = state.range(1);
+  uint64_t per_key_size = state.range(2);
+  bool enable_statistics = state.range(3);
+  bool enable_wal = state.range(4);
+  uint64_t key_num = max_data / per_key_size;
+
+  // setup DB
+  static std::unique_ptr<DB> db = nullptr;
+  Options options;
+  if (enable_statistics) {
+    options.statistics = CreateDBStatistics();
+  }
+  options.compaction_style = compaction_style;
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, key_num);
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "DBPut");
+  }
+
+  auto wo = WriteOptions();
+  wo.disableWAL = !enable_wal;
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    Slice key = kg.Next();
+    std::string val = rnd.RandomString(static_cast<int>(per_key_size));
+    state.ResumeTiming();
+    Status s = db->Put(wo, key, val);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+  }
+
+  if (state.thread_index() == 0) {
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    Status s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+    if (enable_statistics) {
+      HistogramData histogram_data;
+      options.statistics->histogramData(DB_WRITE, &histogram_data);
+      state.counters["put_mean"] = histogram_data.average * std::milli::den;
+      state.counters["put_p95"] = histogram_data.percentile95 * std::milli::den;
+      state.counters["put_p99"] = histogram_data.percentile99 * std::milli::den;
+    }
+
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+static void DBPutArguments(benchmark::internal::Benchmark* b) {
+  for (int comp_style : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                         kCompactionStyleFIFO}) {
+    for (int64_t max_data : {100l << 30}) {
+      for (int64_t per_key_size : {256, 1024}) {
+        for (bool enable_statistics : {false, true}) {
+          for (bool wal : {false, true}) {
+            b->Args(
+                {comp_style, max_data, per_key_size, enable_statistics, wal});
+          }
+        }
+      }
+    }
+  }
+  b->ArgNames(
+      {"comp_style", "max_data", "per_key_size", "enable_statistics", "wal"});
+}
+
+static const uint64_t DBPutNum = 409600l;
+BENCHMARK(DBPut)->Threads(1)->Iterations(DBPutNum)->Apply(DBPutArguments);
+BENCHMARK(DBPut)->Threads(8)->Iterations(DBPutNum / 8)->Apply(DBPutArguments);
+
+static void ManualCompaction(benchmark::State& state) {
+  auto compaction_style = static_cast<CompactionStyle>(state.range(0));
+  uint64_t max_data = state.range(1);
+  uint64_t per_key_size = state.range(2);
+  bool enable_statistics = state.range(3);
+  uint64_t key_num = max_data / per_key_size;
+
+  // setup DB
+  static std::unique_ptr<DB> db;
+  Options options;
+  if (enable_statistics) {
+    options.statistics = CreateDBStatistics();
+  }
+  options.compaction_style = compaction_style;
+  // No auto compaction
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = (1 << 30);
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.soft_pending_compaction_bytes_limit = 0;
+  options.hard_pending_compaction_bytes_limit = 0;
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, key_num);
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "ManualCompaction");
+  }
+
+  auto wo = WriteOptions();
+  wo.disableWAL = true;
+  uint64_t flush_mod = key_num / 4;  // at least generate 4 files for compaction
+  for (uint64_t i = 0; i < key_num; i++) {
+    Status s = db->Put(wo, kg.Next(),
+                       rnd.RandomString(static_cast<int>(per_key_size)));
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+    if (i + 1 % flush_mod == 0) {
+      s = db->Flush(FlushOptions());
+    }
+  }
+  FlushOptions fo;
+  Status s = db->Flush(fo);
+  if (!s.ok()) {
+    state.SkipWithError(s.ToString().c_str());
+  }
+  std::vector<LiveFileMetaData> files_meta;
+  db->GetLiveFilesMetaData(&files_meta);
+  std::vector<std::string> files_before_compact;
+  files_before_compact.reserve(files_meta.size());
+  for (const LiveFileMetaData& file : files_meta) {
+    files_before_compact.emplace_back(file.name);
+  }
+
+  SetPerfLevel(kEnableTime);
+  get_perf_context()->EnablePerLevelPerfContext();
+  get_perf_context()->Reset();
+  CompactionOptions co;
+  for (auto _ : state) {
+    s = db->CompactFiles(co, files_before_compact, 1);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+  }
+
+  if (state.thread_index() == 0) {
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+    if (enable_statistics) {
+      HistogramData histogram_data;
+      options.statistics->histogramData(COMPACTION_TIME, &histogram_data);
+      state.counters["comp_time"] = histogram_data.average;
+      options.statistics->histogramData(COMPACTION_CPU_TIME, &histogram_data);
+      state.counters["comp_cpu_time"] = histogram_data.average;
+      options.statistics->histogramData(COMPACTION_OUTFILE_SYNC_MICROS,
+                                        &histogram_data);
+      state.counters["comp_outfile_sync"] = histogram_data.average;
+
+      state.counters["comp_read"] = static_cast<double>(
+          options.statistics->getTickerCount(COMPACT_READ_BYTES));
+      state.counters["comp_write"] = static_cast<double>(
+          options.statistics->getTickerCount(COMPACT_WRITE_BYTES));
+
+      state.counters["user_key_comparison_count"] =
+          static_cast<double>(get_perf_context()->user_key_comparison_count);
+      state.counters["block_read_count"] =
+          static_cast<double>(get_perf_context()->block_read_count);
+      state.counters["block_read_time"] =
+          static_cast<double>(get_perf_context()->block_read_time);
+      state.counters["block_checksum_time"] =
+          static_cast<double>(get_perf_context()->block_checksum_time);
+      state.counters["new_table_block_iter_nanos"] =
+          static_cast<double>(get_perf_context()->new_table_block_iter_nanos);
+      state.counters["new_table_iterator_nanos"] =
+          static_cast<double>(get_perf_context()->new_table_iterator_nanos);
+      state.counters["find_table_nanos"] =
+          static_cast<double>(get_perf_context()->find_table_nanos);
+    }
+
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+static void ManualCompactionArguments(benchmark::internal::Benchmark* b) {
+  for (int comp_style : {kCompactionStyleLevel, kCompactionStyleUniversal}) {
+    for (int64_t max_data : {32l << 20, 128l << 20}) {
+      for (int64_t per_key_size : {256, 1024}) {
+        for (bool enable_statistics : {false, true}) {
+          b->Args({comp_style, max_data, per_key_size, enable_statistics});
+        }
+      }
+    }
+  }
+  b->ArgNames({"comp_style", "max_data", "per_key_size", "enable_statistics"});
+}
+
+BENCHMARK(ManualCompaction)->Iterations(1)->Apply(ManualCompactionArguments);
+
+static void ManualFlush(benchmark::State& state) {
+  uint64_t key_num = state.range(0);
+  uint64_t per_key_size = state.range(1);
+  bool enable_statistics = true;
+
+  // setup DB
+  static std::unique_ptr<DB> db;
+  Options options;
+  if (enable_statistics) {
+    options.statistics = CreateDBStatistics();
+  }
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = (1 << 30);
+  options.level0_slowdown_writes_trigger = (1 << 30);
+  options.level0_stop_writes_trigger = (1 << 30);
+  options.soft_pending_compaction_bytes_limit = 0;
+  options.hard_pending_compaction_bytes_limit = 0;
+  options.write_buffer_size = 2l << 30;  // 2G to avoid auto flush
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, key_num);
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "ManualFlush");
+  }
+
+  auto wo = WriteOptions();
+  for (auto _ : state) {
+    state.PauseTiming();
+    for (uint64_t i = 0; i < key_num; i++) {
+      Status s = db->Put(wo, kg.Next(),
+                         rnd.RandomString(static_cast<int>(per_key_size)));
+    }
+    FlushOptions fo;
+    state.ResumeTiming();
+    Status s = db->Flush(fo);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+  }
+
+  if (state.thread_index() == 0) {
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    Status s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+    if (enable_statistics) {
+      HistogramData histogram_data;
+      options.statistics->histogramData(FLUSH_TIME, &histogram_data);
+      state.counters["flush_time"] = histogram_data.average;
+      state.counters["flush_write_bytes"] = static_cast<double>(
+          options.statistics->getTickerCount(FLUSH_WRITE_BYTES));
+    }
+
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+static void ManualFlushArguments(benchmark::internal::Benchmark* b) {
+  for (int64_t key_num : {1l << 10, 8l << 10, 64l << 10}) {
+    for (int64_t per_key_size : {256, 1024}) {
+      b->Args({key_num, per_key_size});
+    }
+  }
+  b->ArgNames({"key_num", "per_key_size"});
+}
+
+BENCHMARK(ManualFlush)->Iterations(1)->Apply(ManualFlushArguments);
+
+static void DBGet(benchmark::State& state) {
+  auto compaction_style = static_cast<CompactionStyle>(state.range(0));
+  uint64_t max_data = state.range(1);
+  uint64_t per_key_size = state.range(2);
+  bool enable_statistics = state.range(3);
+  bool negative_query = state.range(4);
+  bool enable_filter = state.range(5);
+  bool mmap = state.range(6);
+  uint64_t key_num = max_data / per_key_size;
+
+  // setup DB
+  static std::unique_ptr<DB> db;
+  Options options;
+  if (enable_statistics) {
+    options.statistics = CreateDBStatistics();
+  }
+  if (mmap) {
+    options.allow_mmap_reads = true;
+    options.compression = kNoCompression;
+  }
+  options.compaction_style = compaction_style;
+
+  BlockBasedTableOptions table_options;
+  if (enable_filter) {
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  }
+  if (mmap) {
+    table_options.no_block_cache = true;
+    table_options.block_restart_interval = 1;
+  }
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, key_num);
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "DBGet");
+
+    // load db
+    auto wo = WriteOptions();
+    wo.disableWAL = true;
+    for (uint64_t i = 0; i < key_num; i++) {
+      Status s = db->Put(wo, kg.Next(),
+                         rnd.RandomString(static_cast<int>(per_key_size)));
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+    }
+
+    FlushOptions fo;
+    Status s = db->Flush(fo);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+  }
+
+  auto ro = ReadOptions();
+  if (mmap) {
+    ro.verify_checksums = false;
+  }
+  size_t not_found = 0;
+  if (negative_query) {
+    for (auto _ : state) {
+      std::string val;
+      Status s = db->Get(ro, kg.NextNonExist(), &val);
+      if (s.IsNotFound()) {
+        not_found++;
+      }
+    }
+  } else {
+    for (auto _ : state) {
+      std::string val;
+      Status s = db->Get(ro, kg.Next(), &val);
+      if (s.IsNotFound()) {
+        not_found++;
+      }
+    }
+  }
+
+  state.counters["neg_qu_pct"] = benchmark::Counter(
+      static_cast<double>(not_found * 100), benchmark::Counter::kAvgIterations);
+
+  if (state.thread_index() == 0) {
+    if (enable_statistics) {
+      HistogramData histogram_data;
+      options.statistics->histogramData(DB_GET, &histogram_data);
+      state.counters["get_mean"] = histogram_data.average * std::milli::den;
+      state.counters["get_p95"] = histogram_data.percentile95 * std::milli::den;
+      state.counters["get_p99"] = histogram_data.percentile99 * std::milli::den;
+    }
+
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+static void DBGetArguments(benchmark::internal::Benchmark* b) {
+  for (int comp_style : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                         kCompactionStyleFIFO}) {
+    for (int64_t max_data : {128l << 20, 512l << 20}) {
+      for (int64_t per_key_size : {256, 1024}) {
+        for (bool enable_statistics : {false, true}) {
+          for (bool negative_query : {false, true}) {
+            for (bool enable_filter : {false, true}) {
+              for (bool mmap : {false, true}) {
+                b->Args({comp_style, max_data, per_key_size, enable_statistics,
+                         negative_query, enable_filter, mmap});
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  b->ArgNames({"comp_style", "max_data", "per_key_size", "enable_statistics",
+               "negative_query", "enable_filter", "mmap"});
+}
+
+static constexpr uint64_t kDBGetNum = 1l << 20;
+BENCHMARK(DBGet)->Threads(1)->Iterations(kDBGetNum)->Apply(DBGetArguments);
+BENCHMARK(DBGet)->Threads(8)->Iterations(kDBGetNum / 8)->Apply(DBGetArguments);
+
+static void SimpleGetWithPerfContext(benchmark::State& state) {
+  // setup DB
+  static std::unique_ptr<DB> db;
+  std::string db_name;
+  Options options;
+  options.create_if_missing = true;
+  options.arena_block_size = 8 << 20;
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, 1024);
+
+  if (state.thread_index() == 0) {
+    auto env = Env::Default();
+    std::string db_path;
+    Status s = env->GetTestDirectory(&db_path);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+    db_name = db_path + "/simple_get_" + std::to_string(getpid());
+    DestroyDB(db_name, options);
+
+    {
+      DB* db_ptr = nullptr;
+      s = DB::Open(options, db_name, &db_ptr);
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+        return;
+      }
+      db.reset(db_ptr);
+    }
+    // load db
+    auto wo = WriteOptions();
+    wo.disableWAL = true;
+    for (uint64_t i = 0; i < 1024; i++) {
+      s = db->Put(wo, kg.Next(), rnd.RandomString(1024));
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+    }
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+    FlushOptions fo;
+    s = db->Flush(fo);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+  }
+
+  auto ro = ReadOptions();
+  size_t not_found = 0;
+  uint64_t user_key_comparison_count = 0;
+  uint64_t block_read_time = 0;
+  uint64_t block_checksum_time = 0;
+  uint64_t get_snapshot_time = 0;
+  uint64_t get_post_process_time = 0;
+  uint64_t get_from_output_files_time = 0;
+  uint64_t new_table_block_iter_nanos = 0;
+  uint64_t block_seek_nanos = 0;
+  uint64_t get_cpu_nanos = 0;
+  uint64_t get_from_table_nanos = 0;
+  SetPerfLevel(kEnableTime);
+  get_perf_context()->EnablePerLevelPerfContext();
+  for (auto _ : state) {
+    std::string val;
+    get_perf_context()->Reset();
+    Status s = db->Get(ro, kg.NextNonExist(), &val);
+    if (s.IsNotFound()) {
+      not_found++;
+    }
+    user_key_comparison_count += get_perf_context()->user_key_comparison_count;
+    block_read_time += get_perf_context()->block_read_time;
+    block_checksum_time += get_perf_context()->block_checksum_time;
+    get_snapshot_time += get_perf_context()->get_snapshot_time;
+    get_post_process_time += get_perf_context()->get_post_process_time;
+    get_from_output_files_time +=
+        get_perf_context()->get_from_output_files_time;
+    new_table_block_iter_nanos +=
+        get_perf_context()->new_table_block_iter_nanos;
+    block_seek_nanos += get_perf_context()->block_seek_nanos;
+    get_cpu_nanos += get_perf_context()->get_cpu_nanos;
+    get_from_table_nanos +=
+        (*(get_perf_context()->level_to_perf_context))[0].get_from_table_nanos;
+  }
+
+  state.counters["neg_qu_pct"] = benchmark::Counter(
+      static_cast<double>(not_found * 100), benchmark::Counter::kAvgIterations);
+  state.counters["user_key_comparison_count"] =
+      benchmark::Counter(static_cast<double>(user_key_comparison_count),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["block_read_time"] = benchmark::Counter(
+      static_cast<double>(block_read_time), benchmark::Counter::kAvgIterations);
+  state.counters["block_checksum_time"] =
+      benchmark::Counter(static_cast<double>(block_checksum_time),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["get_snapshot_time"] =
+      benchmark::Counter(static_cast<double>(get_snapshot_time),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["get_post_process_time"] =
+      benchmark::Counter(static_cast<double>(get_post_process_time),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["get_from_output_files_time"] =
+      benchmark::Counter(static_cast<double>(get_from_output_files_time),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["new_table_block_iter_nanos"] =
+      benchmark::Counter(static_cast<double>(new_table_block_iter_nanos),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["block_seek_nanos"] =
+      benchmark::Counter(static_cast<double>(block_seek_nanos),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["get_cpu_nanos"] = benchmark::Counter(
+      static_cast<double>(get_cpu_nanos), benchmark::Counter::kAvgIterations);
+  state.counters["get_from_table_nanos"] =
+      benchmark::Counter(static_cast<double>(get_from_table_nanos),
+                         benchmark::Counter::kAvgIterations);
+
+  if (state.thread_index() == 0) {
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+BENCHMARK(SimpleGetWithPerfContext)->Iterations(1000000);
+
+static void DBGetMergeOperandsInMemtable(benchmark::State& state) {
+  const uint64_t kDataLen = 16 << 20;  // 16MB
+  const uint64_t kValueLen = 64;
+  const uint64_t kNumEntries = kDataLen / kValueLen;
+  const uint64_t kNumEntriesPerKey = state.range(0);
+  const uint64_t kNumKeys = kNumEntries / kNumEntriesPerKey;
+
+  // setup DB
+  static std::unique_ptr<DB> db;
+
+  Options options;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  // Make memtable large enough that automatic flush will not be triggered.
+  options.write_buffer_size = 2 * kDataLen;
+
+  KeyGenerator sequential_key_gen(kNumKeys);
+  auto rnd = Random(301 + state.thread_index());
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "DBGetMergeOperandsInMemtable");
+
+    // load db
+    auto write_opts = WriteOptions();
+    write_opts.disableWAL = true;
+    for (uint64_t i = 0; i < kNumEntries; i++) {
+      Status s = db->Merge(write_opts, sequential_key_gen.Next(),
+                           rnd.RandomString(static_cast<int>(kValueLen)));
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+    }
+  }
+
+  KeyGenerator random_key_gen(kNumKeys);
+  std::vector<PinnableSlice> value_operands;
+  value_operands.resize(kNumEntriesPerKey);
+  GetMergeOperandsOptions get_merge_ops_opts;
+  get_merge_ops_opts.expected_max_number_of_operands =
+      static_cast<int>(kNumEntriesPerKey);
+  for (auto _ : state) {
+    int num_value_operands = 0;
+    Status s = db->GetMergeOperands(
+        ReadOptions(), db->DefaultColumnFamily(), random_key_gen.Next(),
+        value_operands.data(), &get_merge_ops_opts, &num_value_operands);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+    if (num_value_operands != static_cast<int>(kNumEntriesPerKey)) {
+      state.SkipWithError("Unexpected number of merge operands found for key");
+    }
+    for (auto& value_operand : value_operands) {
+      value_operand.Reset();
+    }
+  }
+
+  if (state.thread_index() == 0) {
+    TeardownDB(state, db, options, random_key_gen);
+  }
+}
+
+static void DBGetMergeOperandsInSstFile(benchmark::State& state) {
+  const uint64_t kDataLen = 16 << 20;  // 16MB
+  const uint64_t kValueLen = 64;
+  const uint64_t kNumEntries = kDataLen / kValueLen;
+  const uint64_t kNumEntriesPerKey = state.range(0);
+  const uint64_t kNumKeys = kNumEntries / kNumEntriesPerKey;
+  const bool kMmap = state.range(1);
+
+  // setup DB
+  static std::unique_ptr<DB> db;
+
+  BlockBasedTableOptions table_options;
+  if (kMmap) {
+    table_options.no_block_cache = true;
+  } else {
+    // Make block cache large enough that eviction will not be triggered.
+    table_options.block_cache = NewLRUCache(2 * kDataLen);
+  }
+
+  Options options;
+  if (kMmap) {
+    options.allow_mmap_reads = true;
+  }
+  options.compression = kNoCompression;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  // Make memtable large enough that automatic flush will not be triggered.
+  options.write_buffer_size = 2 * kDataLen;
+
+  KeyGenerator sequential_key_gen(kNumKeys);
+  auto rnd = Random(301 + state.thread_index());
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "DBGetMergeOperandsInBlockCache");
+
+    // load db
+    //
+    // Take a snapshot after each cycle of merges to ensure flush cannot
+    // merge any entries.
+    std::vector<const Snapshot*> snapshots;
+    snapshots.resize(kNumEntriesPerKey);
+    auto write_opts = WriteOptions();
+    write_opts.disableWAL = true;
+    for (uint64_t i = 0; i < kNumEntriesPerKey; i++) {
+      for (uint64_t j = 0; j < kNumKeys; j++) {
+        Status s = db->Merge(write_opts, sequential_key_gen.Next(),
+                             rnd.RandomString(static_cast<int>(kValueLen)));
+        if (!s.ok()) {
+          state.SkipWithError(s.ToString().c_str());
+        }
+      }
+      snapshots[i] = db->GetSnapshot();
+    }
+
+    // Flush to an L0 file; read back to prime the cache/mapped memory.
+    db->Flush(FlushOptions());
+    for (uint64_t i = 0; i < kNumKeys; ++i) {
+      std::string value;
+      Status s = db->Get(ReadOptions(), sequential_key_gen.Next(), &value);
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+    }
+
+    if (state.thread_index() == 0) {
+      for (uint64_t i = 0; i < kNumEntriesPerKey; ++i) {
+        db->ReleaseSnapshot(snapshots[i]);
+      }
+    }
+  }
+
+  KeyGenerator random_key_gen(kNumKeys);
+  std::vector<PinnableSlice> value_operands;
+  value_operands.resize(kNumEntriesPerKey);
+  GetMergeOperandsOptions get_merge_ops_opts;
+  get_merge_ops_opts.expected_max_number_of_operands =
+      static_cast<int>(kNumEntriesPerKey);
+  for (auto _ : state) {
+    int num_value_operands = 0;
+    ReadOptions read_opts;
+    read_opts.verify_checksums = false;
+    Status s = db->GetMergeOperands(
+        read_opts, db->DefaultColumnFamily(), random_key_gen.Next(),
+        value_operands.data(), &get_merge_ops_opts, &num_value_operands);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+    if (num_value_operands != static_cast<int>(kNumEntriesPerKey)) {
+      state.SkipWithError("Unexpected number of merge operands found for key");
+    }
+    for (auto& value_operand : value_operands) {
+      value_operand.Reset();
+    }
+  }
+
+  if (state.thread_index() == 0) {
+    TeardownDB(state, db, options, random_key_gen);
+  }
+}
+
+static void DBGetMergeOperandsInMemtableArguments(
+    benchmark::internal::Benchmark* b) {
+  for (int entries_per_key : {1, 32, 1024}) {
+    b->Args({entries_per_key});
+  }
+  b->ArgNames({"entries_per_key"});
+}
+
+static void DBGetMergeOperandsInSstFileArguments(
+    benchmark::internal::Benchmark* b) {
+  for (int entries_per_key : {1, 32, 1024}) {
+    for (bool mmap : {false, true}) {
+      b->Args({entries_per_key, mmap});
+    }
+  }
+  b->ArgNames({"entries_per_key", "mmap"});
+}
+
+BENCHMARK(DBGetMergeOperandsInMemtable)
+    ->Threads(1)
+    ->Apply(DBGetMergeOperandsInMemtableArguments);
+BENCHMARK(DBGetMergeOperandsInMemtable)
+    ->Threads(8)
+    ->Apply(DBGetMergeOperandsInMemtableArguments);
+BENCHMARK(DBGetMergeOperandsInSstFile)
+    ->Threads(1)
+    ->Apply(DBGetMergeOperandsInSstFileArguments);
+BENCHMARK(DBGetMergeOperandsInSstFile)
+    ->Threads(8)
+    ->Apply(DBGetMergeOperandsInSstFileArguments);
+
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random* rnd) {
+  char buf[50];
+  char* p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += rnd->RandomString(padding_size);
+  }
+
+  return k;
+}
+
+void GenerateRandomKVs(std::vector<std::string>* keys,
+                       std::vector<std::string>* values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefix
+  for (int i = from; i < from + len; i += step) {
+    // generating keys that share the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+      // 100 bytes values
+      values->emplace_back(rnd.RandomString(100));
+    }
+  }
+}
+
+// TODO: move it to different files, as it's testing an internal API
+static void DataBlockSeek(benchmark::State& state) {
+  Random rnd(301);
+  Options options = Options();
+
+  BlockBuilder builder(16, true, false,
+                       BlockBasedTableOptions::kDataBlockBinarySearch);
+
+  int num_records = 500;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records);
+
+  for (int i = 0; i < num_records; i++) {
+    std::string ukey(keys[i] + "1");
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), values[i]);
+  }
+
+  Slice rawblock = builder.Finish();
+
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents));
+
+  SetPerfLevel(kEnableTime);
+  uint64_t total = 0;
+  for (auto _ : state) {
+    DataBlockIter* iter = reader.NewDataIterator(options.comparator,
+                                                 kDisableGlobalSequenceNumber);
+    uint32_t index = rnd.Uniform(static_cast<int>(num_records));
+    std::string ukey(keys[index] + "1");
+    InternalKey ikey(ukey, 0, kTypeValue);
+    get_perf_context()->Reset();
+    bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+    if (!may_exist) {
+      state.SkipWithError("key not found");
+    }
+    total += get_perf_context()->block_seek_nanos;
+    delete iter;
+  }
+  state.counters["seek_ns"] = benchmark::Counter(
+      static_cast<double>(total), benchmark::Counter::kAvgIterations);
+}
+
+BENCHMARK(DataBlockSeek)->Iterations(1000000);
+
+static void IteratorSeek(benchmark::State& state) {
+  auto compaction_style = static_cast<CompactionStyle>(state.range(0));
+  uint64_t max_data = state.range(1);
+  uint64_t per_key_size = state.range(2);
+  bool enable_statistics = state.range(3);
+  bool negative_query = state.range(4);
+  bool enable_filter = state.range(5);
+  uint64_t key_num = max_data / per_key_size;
+
+  // setup DB
+  static std::unique_ptr<DB> db;
+  Options options;
+  if (enable_statistics) {
+    options.statistics = CreateDBStatistics();
+  }
+  options.compaction_style = compaction_style;
+
+  if (enable_filter) {
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, key_num);
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "IteratorSeek");
+
+    // load db
+    auto wo = WriteOptions();
+    wo.disableWAL = true;
+    for (uint64_t i = 0; i < key_num; i++) {
+      Status s = db->Put(wo, kg.Next(),
+                         rnd.RandomString(static_cast<int>(per_key_size)));
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+    }
+
+    FlushOptions fo;
+    Status s = db->Flush(fo);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+  }
+
+  for (auto _ : state) {
+    std::unique_ptr<Iterator> iter{nullptr};
+    state.PauseTiming();
+    if (!iter) {
+      iter.reset(db->NewIterator(ReadOptions()));
+    }
+    Slice key = negative_query ? kg.NextNonExist() : kg.Next();
+    if (!iter->status().ok()) {
+      state.SkipWithError(iter->status().ToString().c_str());
+      return;
+    }
+    state.ResumeTiming();
+    iter->Seek(key);
+  }
+
+  if (state.thread_index() == 0) {
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+static void IteratorSeekArguments(benchmark::internal::Benchmark* b) {
+  for (int comp_style : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                         kCompactionStyleFIFO}) {
+    for (int64_t max_data : {128l << 20, 512l << 20}) {
+      for (int64_t per_key_size : {256, 1024}) {
+        for (bool enable_statistics : {false, true}) {
+          for (bool negative_query : {false, true}) {
+            for (bool enable_filter : {false, true}) {
+              b->Args({comp_style, max_data, per_key_size, enable_statistics,
+                       negative_query, enable_filter});
+            }
+          }
+        }
+      }
+    }
+  }
+  b->ArgNames({"comp_style", "max_data", "per_key_size", "enable_statistics",
+               "negative_query", "enable_filter"});
+}
+
+static constexpr uint64_t kDBSeekNum = 10l << 10;
+BENCHMARK(IteratorSeek)
+    ->Threads(1)
+    ->Iterations(kDBSeekNum)
+    ->Apply(IteratorSeekArguments);
+BENCHMARK(IteratorSeek)
+    ->Threads(8)
+    ->Iterations(kDBSeekNum / 8)
+    ->Apply(IteratorSeekArguments);
+
+static void IteratorNext(benchmark::State& state) {
+  auto compaction_style = static_cast<CompactionStyle>(state.range(0));
+  uint64_t max_data = state.range(1);
+  uint64_t per_key_size = state.range(2);
+  uint64_t key_num = max_data / per_key_size;
+
+  // setup DB
+  static std::unique_ptr<DB> db;
+  Options options;
+  options.compaction_style = compaction_style;
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, key_num);
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "IteratorNext");
+    // load db
+    auto wo = WriteOptions();
+    wo.disableWAL = true;
+    for (uint64_t i = 0; i < key_num; i++) {
+      Status s = db->Put(wo, kg.Next(),
+                         rnd.RandomString(static_cast<int>(per_key_size)));
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+    }
+
+    FlushOptions fo;
+    Status s = db->Flush(fo);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+  }
+
+  for (auto _ : state) {
+    std::unique_ptr<Iterator> iter{nullptr};
+    state.PauseTiming();
+    if (!iter) {
+      iter.reset(db->NewIterator(ReadOptions()));
+    }
+    while (!iter->Valid()) {
+      iter->Seek(kg.Next());
+      if (!iter->status().ok()) {
+        state.SkipWithError(iter->status().ToString().c_str());
+      }
+    }
+    state.ResumeTiming();
+    iter->Next();
+  }
+
+  if (state.thread_index() == 0) {
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+static void IteratorNextArguments(benchmark::internal::Benchmark* b) {
+  for (int comp_style : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                         kCompactionStyleFIFO}) {
+    for (int64_t max_data : {128l << 20, 512l << 20}) {
+      for (int64_t per_key_size : {256, 1024}) {
+        b->Args({comp_style, max_data, per_key_size});
+      }
+    }
+  }
+  b->ArgNames({"comp_style", "max_data", "per_key_size"});
+}
+static constexpr uint64_t kIteratorNextNum = 10l << 10;
+BENCHMARK(IteratorNext)
+    ->Iterations(kIteratorNextNum)
+    ->Apply(IteratorNextArguments);
+
+static void IteratorNextWithPerfContext(benchmark::State& state) {
+  // setup DB
+  static std::unique_ptr<DB> db;
+  Options options;
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, 1024);
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "IteratorNextWithPerfContext");
+    // load db
+    auto wo = WriteOptions();
+    wo.disableWAL = true;
+    for (uint64_t i = 0; i < 1024; i++) {
+      Status s = db->Put(wo, kg.Next(), rnd.RandomString(1024));
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+    }
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    Status s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+    FlushOptions fo;
+    s = db->Flush(fo);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+  }
+
+  uint64_t user_key_comparison_count = 0;
+  uint64_t internal_key_skipped_count = 0;
+  uint64_t find_next_user_entry_time = 0;
+  uint64_t iter_next_cpu_nanos = 0;
+
+  SetPerfLevel(kEnableTime);
+  get_perf_context()->EnablePerLevelPerfContext();
+
+  for (auto _ : state) {
+    std::unique_ptr<Iterator> iter{nullptr};
+    state.PauseTiming();
+    if (!iter) {
+      iter.reset(db->NewIterator(ReadOptions()));
+    }
+    while (!iter->Valid()) {
+      iter->Seek(kg.Next());
+      if (!iter->status().ok()) {
+        state.SkipWithError(iter->status().ToString().c_str());
+      }
+    }
+    get_perf_context()->Reset();
+    state.ResumeTiming();
+
+    iter->Next();
+    user_key_comparison_count += get_perf_context()->user_key_comparison_count;
+    internal_key_skipped_count +=
+        get_perf_context()->internal_key_skipped_count;
+    find_next_user_entry_time += get_perf_context()->find_next_user_entry_time;
+    iter_next_cpu_nanos += get_perf_context()->iter_next_cpu_nanos;
+  }
+
+  state.counters["user_key_comparison_count"] =
+      benchmark::Counter(static_cast<double>(user_key_comparison_count),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["internal_key_skipped_count"] =
+      benchmark::Counter(static_cast<double>(internal_key_skipped_count),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["find_next_user_entry_time"] =
+      benchmark::Counter(static_cast<double>(find_next_user_entry_time),
+                         benchmark::Counter::kAvgIterations);
+  state.counters["iter_next_cpu_nanos"] =
+      benchmark::Counter(static_cast<double>(iter_next_cpu_nanos),
+                         benchmark::Counter::kAvgIterations);
+
+  if (state.thread_index() == 0) {
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+BENCHMARK(IteratorNextWithPerfContext)->Iterations(100000);
+
+static void IteratorPrev(benchmark::State& state) {
+  auto compaction_style = static_cast<CompactionStyle>(state.range(0));
+  uint64_t max_data = state.range(1);
+  uint64_t per_key_size = state.range(2);
+  uint64_t key_num = max_data / per_key_size;
+
+  // setup DB
+  static std::unique_ptr<DB> db;
+  std::string db_name;
+  Options options;
+  options.compaction_style = compaction_style;
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, key_num);
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "IteratorPrev");
+    // load db
+    auto wo = WriteOptions();
+    wo.disableWAL = true;
+    for (uint64_t i = 0; i < key_num; i++) {
+      Status s = db->Put(wo, kg.Next(),
+                         rnd.RandomString(static_cast<int>(per_key_size)));
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+    }
+
+    FlushOptions fo;
+    Status s = db->Flush(fo);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+  }
+
+  for (auto _ : state) {
+    std::unique_ptr<Iterator> iter{nullptr};
+    state.PauseTiming();
+    if (!iter) {
+      iter.reset(db->NewIterator(ReadOptions()));
+    }
+    while (!iter->Valid()) {
+      iter->Seek(kg.Next());
+      if (!iter->status().ok()) {
+        state.SkipWithError(iter->status().ToString().c_str());
+      }
+    }
+    state.ResumeTiming();
+    iter->Prev();
+  }
+
+  if (state.thread_index() == 0) {
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+static void IteratorPrevArguments(benchmark::internal::Benchmark* b) {
+  for (int comp_style : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                         kCompactionStyleFIFO}) {
+    for (int64_t max_data : {128l << 20, 512l << 20}) {
+      for (int64_t per_key_size : {256, 1024}) {
+        b->Args({comp_style, max_data, per_key_size});
+      }
+    }
+  }
+  b->ArgNames({"comp_style", "max_data", "per_key_size"});
+}
+
+static constexpr uint64_t kIteratorPrevNum = 10l << 10;
+BENCHMARK(IteratorPrev)
+    ->Iterations(kIteratorPrevNum)
+    ->Apply(IteratorPrevArguments);
+
+static void PrefixSeek(benchmark::State& state) {
+  auto compaction_style = static_cast<CompactionStyle>(state.range(0));
+  uint64_t max_data = state.range(1);
+  uint64_t per_key_size = state.range(2);
+  bool enable_statistics = state.range(3);
+  bool enable_filter = state.range(4);
+  uint64_t key_num = max_data / per_key_size;
+
+  // setup DB
+  static std::unique_ptr<DB> db;
+  Options options;
+  if (enable_statistics) {
+    options.statistics = CreateDBStatistics();
+  }
+  options.compaction_style = compaction_style;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+
+  if (enable_filter) {
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+
+  auto rnd = Random(301 + state.thread_index());
+  KeyGenerator kg(&rnd, key_num, key_num / 100);
+
+  if (state.thread_index() == 0) {
+    SetupDB(state, options, &db, "PrefixSeek");
+
+    // load db
+    auto wo = WriteOptions();
+    wo.disableWAL = true;
+    for (uint64_t i = 0; i < key_num; i++) {
+      Status s = db->Put(wo, kg.Next(),
+                         rnd.RandomString(static_cast<int>(per_key_size)));
+      if (!s.ok()) {
+        state.SkipWithError(s.ToString().c_str());
+      }
+    }
+
+    FlushOptions fo;
+    Status s = db->Flush(fo);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+
+    auto db_full = static_cast_with_check<DBImpl>(db.get());
+    s = db_full->WaitForCompact(true);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+      return;
+    }
+  }
+
+  for (auto _ : state) {
+    std::unique_ptr<Iterator> iter{nullptr};
+    state.PauseTiming();
+    if (!iter) {
+      iter.reset(db->NewIterator(ReadOptions()));
+    }
+    state.ResumeTiming();
+    iter->Seek(kg.NextPrefix());
+    if (!iter->status().ok()) {
+      state.SkipWithError(iter->status().ToString().c_str());
+      return;
+    }
+  }
+
+  if (state.thread_index() == 0) {
+    TeardownDB(state, db, options, kg);
+  }
+}
+
+static void PrefixSeekArguments(benchmark::internal::Benchmark* b) {
+  for (int comp_style : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                         kCompactionStyleFIFO}) {
+    for (int64_t max_data : {128l << 20, 512l << 20}) {
+      for (int64_t per_key_size : {256, 1024}) {
+        for (bool enable_statistics : {false, true}) {
+          for (bool enable_filter : {false, true}) {
+            b->Args({comp_style, max_data, per_key_size, enable_statistics,
+                     enable_filter});
+          }
+        }
+      }
+    }
+  }
+  b->ArgNames({"comp_style", "max_data", "per_key_size", "enable_statistics",
+               "enable_filter"});
+}
+
+static constexpr uint64_t kPrefixSeekNum = 10l << 10;
+BENCHMARK(PrefixSeek)->Iterations(kPrefixSeekNum)->Apply(PrefixSeekArguments);
+BENCHMARK(PrefixSeek)
+    ->Threads(8)
+    ->Iterations(kPrefixSeekNum / 8)
+    ->Apply(PrefixSeekArguments);
+
+// TODO: move it to different files, as it's testing an internal API
+static void RandomAccessFileReaderRead(benchmark::State& state) {
+  bool enable_statistics = state.range(0);
+  constexpr int kFileNum = 10;
+  auto env = Env::Default();
+  auto fs = env->GetFileSystem();
+  std::string db_path;
+  Status s = env->GetTestDirectory(&db_path);
+  if (!s.ok()) {
+    state.SkipWithError(s.ToString().c_str());
+    return;
+  }
+
+  // Setup multiple `RandomAccessFileReader`s with different parameters to be
+  // used for test
+  Random rand(301);
+  std::string fname_base =
+      db_path + kFilePathSeparator + "random-access-file-reader-read";
+  std::vector<std::unique_ptr<RandomAccessFileReader>> readers;
+  auto statistics_share = CreateDBStatistics();
+  Statistics* statistics = enable_statistics ? statistics_share.get() : nullptr;
+  for (int i = 0; i < kFileNum; i++) {
+    std::string fname = fname_base + std::to_string(i);
+    std::string content = rand.RandomString(kDefaultPageSize);
+    std::unique_ptr<WritableFile> tgt_file;
+    env->NewWritableFile(fname, &tgt_file, EnvOptions());
+    tgt_file->Append(content);
+    tgt_file->Close();
+
+    std::unique_ptr<FSRandomAccessFile> f;
+    fs->NewRandomAccessFile(fname, FileOptions(), &f, nullptr);
+    int rand_num = rand.Next() % 3;
+    auto temperature = rand_num == 0   ? Temperature::kUnknown
+                       : rand_num == 1 ? Temperature::kWarm
+                                       : Temperature::kCold;
+    readers.emplace_back(new RandomAccessFileReader(
+        std::move(f), fname, env->GetSystemClock().get(), nullptr, statistics,
+        0, nullptr, nullptr, {}, temperature, rand_num == 1));
+  }
+
+  IOOptions io_options;
+  std::unique_ptr<char[]> scratch(new char[2048]);
+  Slice result;
+  uint64_t idx = 0;
+  for (auto _ : state) {
+    s = readers[idx++ % kFileNum]->Read(io_options, 0, kDefaultPageSize / 3,
+                                        &result, scratch.get(), nullptr,
+                                        Env::IO_TOTAL);
+    if (!s.ok()) {
+      state.SkipWithError(s.ToString().c_str());
+    }
+  }
+
+  // clean up
+  for (int i = 0; i < kFileNum; i++) {
+    std::string fname = fname_base + std::to_string(i);
+    env->DeleteFile(fname);  // ignore return, okay to fail cleanup
+  }
+}
+
+BENCHMARK(RandomAccessFileReaderRead)
+    ->Iterations(1000000)
+    ->Arg(0)
+    ->Arg(1)
+    ->ArgName("enable_statistics");
+
+}  // namespace ROCKSDB_NAMESPACE
+
+BENCHMARK_MAIN();
diff --git a/src/rocksdb/microbench/ribbon_bench.cc b/src/rocksdb/microbench/ribbon_bench.cc
new file mode 100644
index 000000000..d0fb2ec9a
--- /dev/null
+++ b/src/rocksdb/microbench/ribbon_bench.cc
@@ -0,0 +1,155 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// this is a simple micro-benchmark for compare ribbon filter vs. other filter
+// for more comprehensive, please check the dedicate util/filter_bench.
+#include "benchmark/benchmark.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/mock_block_based_table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct KeyMaker {
+  explicit KeyMaker(size_t avg_size)
+      : smallest_size_(avg_size),
+        buf_size_(avg_size + 11),  // pad to vary key size and alignment
+        buf_(new char[buf_size_]) {
+    memset(buf_.get(), 0, buf_size_);
+    assert(smallest_size_ > 8);
+  }
+  size_t smallest_size_;
+  size_t buf_size_;
+  std::unique_ptr<char[]> buf_;
+
+  // Returns a unique(-ish) key based on the given parameter values. Each
+  // call returns a Slice from the same buffer so previously returned
+  // Slices should be considered invalidated.
+  Slice Get(uint32_t filter_num, uint32_t val_num) const {
+    size_t start = val_num % 4;
+    size_t len = smallest_size_;
+    // To get range [avg_size - 2, avg_size + 2]
+    // use range [smallest_size, smallest_size + 4]
+    len += FastRange32((val_num >> 5) * 1234567891, 5);
+    char *data = buf_.get() + start;
+    // Populate key data such that all data makes it into a key of at
+    // least 8 bytes. We also don't want all the within-filter key
+    // variance confined to a contiguous 32 bits, because then a 32 bit
+    // hash function can "cheat" the false positive rate by
+    // approximating a perfect hash.
+    EncodeFixed32(data, val_num);
+    EncodeFixed32(data + 4, filter_num + val_num);
+    // ensure clearing leftovers from different alignment
+    EncodeFixed32(data + 8, 0);
+    return {data, len};
+  }
+};
+
+// benchmark arguments:
+// 0. filter impl (like filter_bench -impl)
+// 1. filter config bits_per_key
+// 2. average data key length
+// 3. data entry number
+static void CustomArguments(benchmark::internal::Benchmark *b) {
+  const auto kImplCount =
+      static_cast<int>(BloomLikeFilterPolicy::GetAllFixedImpls().size());
+  for (int filter_impl = 0; filter_impl < kImplCount; ++filter_impl) {
+    for (int bits_per_key : {10, 20}) {
+      for (int key_len_avg : {10, 100}) {
+        for (int64_t entry_num : {1 << 10, 1 << 20}) {
+          b->Args({filter_impl, bits_per_key, key_len_avg, entry_num});
+        }
+      }
+    }
+  }
+  b->ArgNames({"filter_impl", "bits_per_key", "key_len_avg", "entry_num"});
+}
+
+static void FilterBuild(benchmark::State &state) {
+  // setup data
+  auto filter = BloomLikeFilterPolicy::Create(
+      BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)),
+      static_cast<double>(state.range(1)));
+  auto tester = std::make_unique<mock::MockBlockBasedTableTester>(filter);
+  KeyMaker km(state.range(2));
+  std::unique_ptr<const char[]> owner;
+  const int64_t kEntryNum = state.range(3);
+  auto rnd = Random32(12345);
+  uint32_t filter_num = rnd.Next();
+  // run the test
+  for (auto _ : state) {
+    std::unique_ptr<FilterBitsBuilder> builder(tester->GetBuilder());
+    for (uint32_t i = 0; i < kEntryNum; i++) {
+      builder->AddKey(km.Get(filter_num, i));
+    }
+    auto ret = builder->Finish(&owner);
+    state.counters["size"] = static_cast<double>(ret.size());
+  }
+}
+BENCHMARK(FilterBuild)->Apply(CustomArguments);
+
+static void FilterQueryPositive(benchmark::State &state) {
+  // setup data
+  auto filter = BloomLikeFilterPolicy::Create(
+      BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)),
+      static_cast<double>(state.range(1)));
+  auto tester = std::make_unique<mock::MockBlockBasedTableTester>(filter);
+  KeyMaker km(state.range(2));
+  std::unique_ptr<const char[]> owner;
+  const int64_t kEntryNum = state.range(3);
+  auto rnd = Random32(12345);
+  uint32_t filter_num = rnd.Next();
+  std::unique_ptr<FilterBitsBuilder> builder(tester->GetBuilder());
+  for (uint32_t i = 0; i < kEntryNum; i++) {
+    builder->AddKey(km.Get(filter_num, i));
+  }
+  auto data = builder->Finish(&owner);
+  std::unique_ptr<FilterBitsReader> reader{filter->GetFilterBitsReader(data)};
+
+  // run test
+  uint32_t i = 0;
+  for (auto _ : state) {
+    i++;
+    i = i % kEntryNum;
+    reader->MayMatch(km.Get(filter_num, i));
+  }
+}
+BENCHMARK(FilterQueryPositive)->Apply(CustomArguments);
+
+static void FilterQueryNegative(benchmark::State &state) {
+  // setup data
+  auto filter = BloomLikeFilterPolicy::Create(
+      BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)),
+      static_cast<double>(state.range(1)));
+  auto tester = std::make_unique<mock::MockBlockBasedTableTester>(filter);
+  KeyMaker km(state.range(2));
+  std::unique_ptr<const char[]> owner;
+  const int64_t kEntryNum = state.range(3);
+  auto rnd = Random32(12345);
+  uint32_t filter_num = rnd.Next();
+  std::unique_ptr<FilterBitsBuilder> builder(tester->GetBuilder());
+  for (uint32_t i = 0; i < kEntryNum; i++) {
+    builder->AddKey(km.Get(filter_num, i));
+  }
+  auto data = builder->Finish(&owner);
+  std::unique_ptr<FilterBitsReader> reader{filter->GetFilterBitsReader(data)};
+
+  // run test
+  uint32_t i = 0;
+  double fp_cnt = 0;
+  for (auto _ : state) {
+    i++;
+    auto result = reader->MayMatch(km.Get(filter_num + 1, i));
+    if (result) {
+      fp_cnt++;
+    }
+  }
+  state.counters["fp_pct"] =
+      benchmark::Counter(fp_cnt * 100, benchmark::Counter::kAvgIterations);
+}
+BENCHMARK(FilterQueryNegative)->Apply(CustomArguments);
+
+}  // namespace ROCKSDB_NAMESPACE
+
+BENCHMARK_MAIN();
diff --git a/src/rocksdb/monitoring/file_read_sample.h b/src/rocksdb/monitoring/file_read_sample.h
new file mode 100644
index 000000000..82a933e0a
--- /dev/null
+++ b/src/rocksdb/monitoring/file_read_sample.h
@@ -0,0 +1,23 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "db/version_edit.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+static const uint32_t kFileReadSampleRate = 1024;
+extern bool should_sample_file_read();
+extern void sample_file_read_inc(FileMetaData*);
+
+inline bool should_sample_file_read() {
+  return (Random::GetTLSInstance()->Next() % kFileReadSampleRate == 307);
+}
+
+inline void sample_file_read_inc(FileMetaData* meta) {
+  meta->stats.num_reads_sampled.fetch_add(kFileReadSampleRate,
+                                          std::memory_order_relaxed);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/histogram.cc b/src/rocksdb/monitoring/histogram.cc
new file mode 100644
index 000000000..61bc6c140
--- /dev/null
+++ b/src/rocksdb/monitoring/histogram.cc
@@ -0,0 +1,270 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "monitoring/histogram.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+
+#include "port/port.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+HistogramBucketMapper::HistogramBucketMapper() {
+  // If you change this, you also need to change
+  // size of array buckets_ in HistogramImpl
+  bucketValues_ = {1, 2};
+  double bucket_val = static_cast<double>(bucketValues_.back());
+  while ((bucket_val = 1.5 * bucket_val) <=
+         static_cast<double>(std::numeric_limits<uint64_t>::max())) {
+    bucketValues_.push_back(static_cast<uint64_t>(bucket_val));
+    // Extracts two most significant digits to make histogram buckets more
+    // human-readable. E.g., 172 becomes 170.
+    uint64_t pow_of_ten = 1;
+    while (bucketValues_.back() / 10 > 10) {
+      bucketValues_.back() /= 10;
+      pow_of_ten *= 10;
+    }
+    bucketValues_.back() *= pow_of_ten;
+  }
+  maxBucketValue_ = bucketValues_.back();
+  minBucketValue_ = bucketValues_.front();
+}
+
+size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const {
+  auto beg = bucketValues_.begin();
+  auto end = bucketValues_.end();
+  if (value >= maxBucketValue_)
+    return end - beg - 1;  // bucketValues_.size() - 1
+  else
+    return std::lower_bound(beg, end, value) - beg;
+}
+
+namespace {
+const HistogramBucketMapper bucketMapper;
+}
+
+HistogramStat::HistogramStat() : num_buckets_(bucketMapper.BucketCount()) {
+  assert(num_buckets_ == sizeof(buckets_) / sizeof(*buckets_));
+  Clear();
+}
+
+void HistogramStat::Clear() {
+  min_.store(bucketMapper.LastValue(), std::memory_order_relaxed);
+  max_.store(0, std::memory_order_relaxed);
+  num_.store(0, std::memory_order_relaxed);
+  sum_.store(0, std::memory_order_relaxed);
+  sum_squares_.store(0, std::memory_order_relaxed);
+  for (unsigned int b = 0; b < num_buckets_; b++) {
+    buckets_[b].store(0, std::memory_order_relaxed);
+  }
+};
+
+bool HistogramStat::Empty() const { return num() == 0; }
+
+void HistogramStat::Add(uint64_t value) {
+  // This function is designed to be lock free, as it's in the critical path
+  // of any operation. Each individual value is atomic and the order of updates
+  // by concurrent threads is tolerable.
+  const size_t index = bucketMapper.IndexForValue(value);
+  assert(index < num_buckets_);
+  buckets_[index].store(buckets_[index].load(std::memory_order_relaxed) + 1,
+                        std::memory_order_relaxed);
+
+  uint64_t old_min = min();
+  if (value < old_min) {
+    min_.store(value, std::memory_order_relaxed);
+  }
+
+  uint64_t old_max = max();
+  if (value > old_max) {
+    max_.store(value, std::memory_order_relaxed);
+  }
+
+  num_.store(num_.load(std::memory_order_relaxed) + 1,
+             std::memory_order_relaxed);
+  sum_.store(sum_.load(std::memory_order_relaxed) + value,
+             std::memory_order_relaxed);
+  sum_squares_.store(
+      sum_squares_.load(std::memory_order_relaxed) + value * value,
+      std::memory_order_relaxed);
+}
+
+void HistogramStat::Merge(const HistogramStat& other) {
+  // This function needs to be performned with the outer lock acquired
+  // However, atomic operation on every member is still need, since Add()
+  // requires no lock and value update can still happen concurrently
+  uint64_t old_min = min();
+  uint64_t other_min = other.min();
+  while (other_min < old_min &&
+         !min_.compare_exchange_weak(old_min, other_min)) {
+  }
+
+  uint64_t old_max = max();
+  uint64_t other_max = other.max();
+  while (other_max > old_max &&
+         !max_.compare_exchange_weak(old_max, other_max)) {
+  }
+
+  num_.fetch_add(other.num(), std::memory_order_relaxed);
+  sum_.fetch_add(other.sum(), std::memory_order_relaxed);
+  sum_squares_.fetch_add(other.sum_squares(), std::memory_order_relaxed);
+  for (unsigned int b = 0; b < num_buckets_; b++) {
+    buckets_[b].fetch_add(other.bucket_at(b), std::memory_order_relaxed);
+  }
+}
+
+double HistogramStat::Median() const { return Percentile(50.0); }
+
+double HistogramStat::Percentile(double p) const {
+  double threshold = num() * (p / 100.0);
+  uint64_t cumulative_sum = 0;
+  for (unsigned int b = 0; b < num_buckets_; b++) {
+    uint64_t bucket_value = bucket_at(b);
+    cumulative_sum += bucket_value;
+    if (cumulative_sum >= threshold) {
+      // Scale linearly within this bucket
+      uint64_t left_point = (b == 0) ? 0 : bucketMapper.BucketLimit(b - 1);
+      uint64_t right_point = bucketMapper.BucketLimit(b);
+      uint64_t left_sum = cumulative_sum - bucket_value;
+      uint64_t right_sum = cumulative_sum;
+      double pos = 0;
+      uint64_t right_left_diff = right_sum - left_sum;
+      if (right_left_diff != 0) {
+        pos = (threshold - left_sum) / right_left_diff;
+      }
+      double r = left_point + (right_point - left_point) * pos;
+      uint64_t cur_min = min();
+      uint64_t cur_max = max();
+      if (r < cur_min) r = static_cast<double>(cur_min);
+      if (r > cur_max) r = static_cast<double>(cur_max);
+      return r;
+    }
+  }
+  return static_cast<double>(max());
+}
+
+double HistogramStat::Average() const {
+  uint64_t cur_num = num();
+  uint64_t cur_sum = sum();
+  if (cur_num == 0) return 0;
+  return static_cast<double>(cur_sum) / static_cast<double>(cur_num);
+}
+
+double HistogramStat::StandardDeviation() const {
+  double cur_num =
+      static_cast<double>(num());  // Use double to avoid integer overflow
+  double cur_sum = static_cast<double>(sum());
+  double cur_sum_squares = static_cast<double>(sum_squares());
+  if (cur_num == 0.0) {
+    return 0.0;
+  }
+  double variance =
+      (cur_sum_squares * cur_num - cur_sum * cur_sum) / (cur_num * cur_num);
+  return std::sqrt(std::max(variance, 0.0));
+}
+
+std::string HistogramStat::ToString() const {
+  uint64_t cur_num = num();
+  std::string r;
+  char buf[1650];
+  snprintf(buf, sizeof(buf), "Count: %" PRIu64 " Average: %.4f  StdDev: %.2f\n",
+           cur_num, Average(), StandardDeviation());
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Min: %" PRIu64 "  Median: %.4f  Max: %" PRIu64 "\n",
+           (cur_num == 0 ? 0 : min()), Median(), (cur_num == 0 ? 0 : max()));
+  r.append(buf);
+  snprintf(buf, sizeof(buf),
+           "Percentiles: "
+           "P50: %.2f P75: %.2f P99: %.2f P99.9: %.2f P99.99: %.2f\n",
+           Percentile(50), Percentile(75), Percentile(99), Percentile(99.9),
+           Percentile(99.99));
+  r.append(buf);
+  r.append("------------------------------------------------------\n");
+  if (cur_num == 0) return r;  // all buckets are empty
+  const double mult = 100.0 / cur_num;
+  uint64_t cumulative_sum = 0;
+  for (unsigned int b = 0; b < num_buckets_; b++) {
+    uint64_t bucket_value = bucket_at(b);
+    if (bucket_value <= 0.0) continue;
+    cumulative_sum += bucket_value;
+    snprintf(buf, sizeof(buf),
+             "%c %7" PRIu64 ", %7" PRIu64 " ] %8" PRIu64 " %7.3f%% %7.3f%% ",
+             (b == 0) ? '[' : '(',
+             (b == 0) ? 0 : bucketMapper.BucketLimit(b - 1),  // left
+             bucketMapper.BucketLimit(b),                     // right
+             bucket_value,                                    // count
+             (mult * bucket_value),                           // percentage
+             (mult * cumulative_sum));  // cumulative percentage
+    r.append(buf);
+
+    // Add hash marks based on percentage; 20 marks for 100%.
+    size_t marks = static_cast<size_t>(mult * bucket_value / 5 + 0.5);
+    r.append(marks, '#');
+    r.push_back('\n');
+  }
+  return r;
+}
+
+void HistogramStat::Data(HistogramData* const data) const {
+  assert(data);
+  data->median = Median();
+  data->percentile95 = Percentile(95);
+  data->percentile99 = Percentile(99);
+  data->max = static_cast<double>(max());
+  data->average = Average();
+  data->standard_deviation = StandardDeviation();
+  data->count = num();
+  data->sum = sum();
+  data->min = static_cast<double>(min());
+}
+
+void HistogramImpl::Clear() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  stats_.Clear();
+}
+
+bool HistogramImpl::Empty() const { return stats_.Empty(); }
+
+void HistogramImpl::Add(uint64_t value) { stats_.Add(value); }
+
+void HistogramImpl::Merge(const Histogram& other) {
+  if (strcmp(Name(), other.Name()) == 0) {
+    Merge(*static_cast_with_check<const HistogramImpl>(&other));
+  }
+}
+
+void HistogramImpl::Merge(const HistogramImpl& other) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  stats_.Merge(other.stats_);
+}
+
+double HistogramImpl::Median() const { return stats_.Median(); }
+
+double HistogramImpl::Percentile(double p) const {
+  return stats_.Percentile(p);
+}
+
+double HistogramImpl::Average() const { return stats_.Average(); }
+
+double HistogramImpl::StandardDeviation() const {
+  return stats_.StandardDeviation();
+}
+
+std::string HistogramImpl::ToString() const { return stats_.ToString(); }
+
+void HistogramImpl::Data(HistogramData* const data) const { stats_.Data(data); }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/histogram.h b/src/rocksdb/monitoring/histogram.h
new file mode 100644
index 000000000..15fee2b4f
--- /dev/null
+++ b/src/rocksdb/monitoring/histogram.h
@@ -0,0 +1,143 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cassert>
+#include <map>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "rocksdb/statistics.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class HistogramBucketMapper {
+ public:
+  HistogramBucketMapper();
+
+  // converts a value to the bucket index.
+  size_t IndexForValue(uint64_t value) const;
+  // number of buckets required.
+
+  size_t BucketCount() const { return bucketValues_.size(); }
+
+  uint64_t LastValue() const { return maxBucketValue_; }
+
+  uint64_t FirstValue() const { return minBucketValue_; }
+
+  uint64_t BucketLimit(const size_t bucketNumber) const {
+    assert(bucketNumber < BucketCount());
+    return bucketValues_[bucketNumber];
+  }
+
+ private:
+  std::vector<uint64_t> bucketValues_;
+  uint64_t maxBucketValue_;
+  uint64_t minBucketValue_;
+};
+
+struct HistogramStat {
+  HistogramStat();
+  ~HistogramStat() {}
+
+  HistogramStat(const HistogramStat&) = delete;
+  HistogramStat& operator=(const HistogramStat&) = delete;
+
+  void Clear();
+  bool Empty() const;
+  void Add(uint64_t value);
+  void Merge(const HistogramStat& other);
+
+  inline uint64_t min() const { return min_.load(std::memory_order_relaxed); }
+  inline uint64_t max() const { return max_.load(std::memory_order_relaxed); }
+  inline uint64_t num() const { return num_.load(std::memory_order_relaxed); }
+  inline uint64_t sum() const { return sum_.load(std::memory_order_relaxed); }
+  inline uint64_t sum_squares() const {
+    return sum_squares_.load(std::memory_order_relaxed);
+  }
+  inline uint64_t bucket_at(size_t b) const {
+    return buckets_[b].load(std::memory_order_relaxed);
+  }
+
+  double Median() const;
+  double Percentile(double p) const;
+  double Average() const;
+  double StandardDeviation() const;
+  void Data(HistogramData* const data) const;
+  std::string ToString() const;
+
+  // To be able to use HistogramStat as thread local variable, it
+  // cannot have dynamic allocated member. That's why we're
+  // using manually values from BucketMapper
+  std::atomic_uint_fast64_t min_;
+  std::atomic_uint_fast64_t max_;
+  std::atomic_uint_fast64_t num_;
+  std::atomic_uint_fast64_t sum_;
+  std::atomic_uint_fast64_t sum_squares_;
+  std::atomic_uint_fast64_t buckets_[109];  // 109==BucketMapper::BucketCount()
+  const uint64_t num_buckets_;
+};
+
+class Histogram {
+ public:
+  Histogram() {}
+  virtual ~Histogram(){};
+
+  virtual void Clear() = 0;
+  virtual bool Empty() const = 0;
+  virtual void Add(uint64_t value) = 0;
+  virtual void Merge(const Histogram&) = 0;
+
+  virtual std::string ToString() const = 0;
+  virtual const char* Name() const = 0;
+  virtual uint64_t min() const = 0;
+  virtual uint64_t max() const = 0;
+  virtual uint64_t num() const = 0;
+  virtual double Median() const = 0;
+  virtual double Percentile(double p) const = 0;
+  virtual double Average() const = 0;
+  virtual double StandardDeviation() const = 0;
+  virtual void Data(HistogramData* const data) const = 0;
+};
+
+class HistogramImpl : public Histogram {
+ public:
+  HistogramImpl() { Clear(); }
+
+  HistogramImpl(const HistogramImpl&) = delete;
+  HistogramImpl& operator=(const HistogramImpl&) = delete;
+
+  virtual void Clear() override;
+  virtual bool Empty() const override;
+  virtual void Add(uint64_t value) override;
+  virtual void Merge(const Histogram& other) override;
+  void Merge(const HistogramImpl& other);
+
+  virtual std::string ToString() const override;
+  virtual const char* Name() const override { return "HistogramImpl"; }
+  virtual uint64_t min() const override { return stats_.min(); }
+  virtual uint64_t max() const override { return stats_.max(); }
+  virtual uint64_t num() const override { return stats_.num(); }
+  virtual double Median() const override;
+  virtual double Percentile(double p) const override;
+  virtual double Average() const override;
+  virtual double StandardDeviation() const override;
+  virtual void Data(HistogramData* const data) const override;
+
+  virtual ~HistogramImpl() {}
+
+  inline HistogramStat& TEST_GetStats() { return stats_; }
+
+ private:
+  HistogramStat stats_;
+  std::mutex mutex_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/histogram_test.cc b/src/rocksdb/monitoring/histogram_test.cc
new file mode 100644
index 000000000..19e9f15d0
--- /dev/null
+++ b/src/rocksdb/monitoring/histogram_test.cc
@@ -0,0 +1,254 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "monitoring/histogram.h"
+
+#include <cmath>
+
+#include "monitoring/histogram_windowing.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/mock_time_env.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class HistogramTest : public testing::Test {};
+
+namespace {
+const double kIota = 0.1;
+const HistogramBucketMapper bucketMapper;
+std::shared_ptr<MockSystemClock> clock =
+    std::make_shared<MockSystemClock>(SystemClock::Default());
+}  // namespace
+
+void PopulateHistogram(Histogram& histogram, uint64_t low, uint64_t high,
+                       uint64_t loop = 1) {
+  Random rnd(test::RandomSeed());
+  for (; loop > 0; loop--) {
+    for (uint64_t i = low; i <= high; i++) {
+      histogram.Add(i);
+      // sleep a random microseconds [0-10)
+      clock->SleepForMicroseconds(rnd.Uniform(10));
+    }
+  }
+  // make sure each data population at least take some time
+  clock->SleepForMicroseconds(1);
+}
+
+void BasicOperation(Histogram& histogram) {
+  PopulateHistogram(histogram, 1, 110, 10);  // fill up to bucket [70, 110)
+
+  HistogramData data;
+  histogram.Data(&data);
+
+  ASSERT_LE(fabs(histogram.Percentile(100.0) - 110.0), kIota);
+  ASSERT_LE(fabs(data.percentile99 - 108.9), kIota);  // 99 * 110 / 100
+  ASSERT_LE(fabs(data.percentile95 - 104.5), kIota);  // 95 * 110 / 100
+  ASSERT_LE(fabs(data.median - 55.0), kIota);         // 50 * 110 / 100
+  ASSERT_EQ(data.average, 55.5);                      // (1 + 110) / 2
+}
+
+void MergeHistogram(Histogram& histogram, Histogram& other) {
+  PopulateHistogram(histogram, 1, 100);
+  PopulateHistogram(other, 101, 250);
+  histogram.Merge(other);
+
+  HistogramData data;
+  histogram.Data(&data);
+
+  ASSERT_LE(fabs(histogram.Percentile(100.0) - 250.0), kIota);
+  ASSERT_LE(fabs(data.percentile99 - 247.5), kIota);  // 99 * 250 / 100
+  ASSERT_LE(fabs(data.percentile95 - 237.5), kIota);  // 95 * 250 / 100
+  ASSERT_LE(fabs(data.median - 125.0), kIota);        // 50 * 250 / 100
+  ASSERT_EQ(data.average, 125.5);                     // (1 + 250) / 2
+}
+
+void EmptyHistogram(Histogram& histogram) {
+  ASSERT_EQ(histogram.min(), bucketMapper.LastValue());
+  ASSERT_EQ(histogram.max(), 0);
+  ASSERT_EQ(histogram.num(), 0);
+  ASSERT_EQ(histogram.Median(), 0.0);
+  ASSERT_EQ(histogram.Percentile(85.0), 0.0);
+  ASSERT_EQ(histogram.Average(), 0.0);
+  ASSERT_EQ(histogram.StandardDeviation(), 0.0);
+}
+
+void ClearHistogram(Histogram& histogram) {
+  for (uint64_t i = 1; i <= 100; i++) {
+    histogram.Add(i);
+  }
+  histogram.Clear();
+  ASSERT_TRUE(histogram.Empty());
+  ASSERT_EQ(histogram.Median(), 0);
+  ASSERT_EQ(histogram.Percentile(85.0), 0);
+  ASSERT_EQ(histogram.Average(), 0);
+}
+
+TEST_F(HistogramTest, BasicOperation) {
+  HistogramImpl histogram;
+  BasicOperation(histogram);
+
+  HistogramWindowingImpl histogramWindowing;
+  BasicOperation(histogramWindowing);
+}
+
+TEST_F(HistogramTest, BoundaryValue) {
+  HistogramImpl histogram;
+  // - both should be in [0, 1] bucket because we place values on bucket
+  //   boundaries in the lower bucket.
+  // - all points are in [0, 1] bucket, so p50 will be 0.5
+  // - the test cannot be written with a single point since histogram won't
+  //   report percentiles lower than the min or greater than the max.
+  histogram.Add(0);
+  histogram.Add(1);
+
+  ASSERT_LE(fabs(histogram.Percentile(50.0) - 0.5), kIota);
+}
+
+TEST_F(HistogramTest, MergeHistogram) {
+  HistogramImpl histogram;
+  HistogramImpl other;
+  MergeHistogram(histogram, other);
+
+  HistogramWindowingImpl histogramWindowing;
+  HistogramWindowingImpl otherWindowing;
+  MergeHistogram(histogramWindowing, otherWindowing);
+}
+
+TEST_F(HistogramTest, EmptyHistogram) {
+  HistogramImpl histogram;
+  EmptyHistogram(histogram);
+
+  HistogramWindowingImpl histogramWindowing;
+  EmptyHistogram(histogramWindowing);
+}
+
+TEST_F(HistogramTest, ClearHistogram) {
+  HistogramImpl histogram;
+  ClearHistogram(histogram);
+
+  HistogramWindowingImpl histogramWindowing;
+  ClearHistogram(histogramWindowing);
+}
+
+TEST_F(HistogramTest, HistogramWindowingExpire) {
+  uint64_t num_windows = 3;
+  int micros_per_window = 1000000;
+  uint64_t min_num_per_window = 0;
+
+  HistogramWindowingImpl histogramWindowing(num_windows, micros_per_window,
+                                            min_num_per_window);
+  histogramWindowing.TEST_UpdateClock(clock);
+  PopulateHistogram(histogramWindowing, 1, 1, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+  ASSERT_EQ(histogramWindowing.num(), 100);
+  ASSERT_EQ(histogramWindowing.min(), 1);
+  ASSERT_EQ(histogramWindowing.max(), 1);
+  ASSERT_EQ(histogramWindowing.Average(), 1.0);
+  ASSERT_EQ(histogramWindowing.StandardDeviation(), 0.0);
+
+  PopulateHistogram(histogramWindowing, 2, 2, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+  ASSERT_EQ(histogramWindowing.num(), 200);
+  ASSERT_EQ(histogramWindowing.min(), 1);
+  ASSERT_EQ(histogramWindowing.max(), 2);
+  ASSERT_EQ(histogramWindowing.Average(), 1.5);
+  ASSERT_GT(histogramWindowing.StandardDeviation(), 0.0);
+
+  PopulateHistogram(histogramWindowing, 3, 3, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+  ASSERT_EQ(histogramWindowing.num(), 300);
+  ASSERT_EQ(histogramWindowing.min(), 1);
+  ASSERT_EQ(histogramWindowing.max(), 3);
+  ASSERT_EQ(histogramWindowing.Average(), 2.0);
+  ASSERT_GT(histogramWindowing.StandardDeviation(), 0.0);
+
+  // dropping oldest window with value 1, remaining 2 ~ 4
+  PopulateHistogram(histogramWindowing, 4, 4, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+  ASSERT_EQ(histogramWindowing.num(), 300);
+  ASSERT_EQ(histogramWindowing.min(), 2);
+  ASSERT_EQ(histogramWindowing.max(), 4);
+  ASSERT_EQ(histogramWindowing.Average(), 3.0);
+  ASSERT_GT(histogramWindowing.StandardDeviation(), 0.0);
+
+  // dropping oldest window with value 2, remaining 3 ~ 5
+  PopulateHistogram(histogramWindowing, 5, 5, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+  ASSERT_EQ(histogramWindowing.num(), 300);
+  ASSERT_EQ(histogramWindowing.min(), 3);
+  ASSERT_EQ(histogramWindowing.max(), 5);
+  ASSERT_EQ(histogramWindowing.Average(), 4.0);
+  ASSERT_GT(histogramWindowing.StandardDeviation(), 0.0);
+}
+
+TEST_F(HistogramTest, HistogramWindowingMerge) {
+  uint64_t num_windows = 3;
+  int micros_per_window = 1000000;
+  uint64_t min_num_per_window = 0;
+
+  HistogramWindowingImpl histogramWindowing(num_windows, micros_per_window,
+                                            min_num_per_window);
+  HistogramWindowingImpl otherWindowing(num_windows, micros_per_window,
+                                        min_num_per_window);
+  histogramWindowing.TEST_UpdateClock(clock);
+  otherWindowing.TEST_UpdateClock(clock);
+
+  PopulateHistogram(histogramWindowing, 1, 1, 100);
+  PopulateHistogram(otherWindowing, 1, 1, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+
+  PopulateHistogram(histogramWindowing, 2, 2, 100);
+  PopulateHistogram(otherWindowing, 2, 2, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+
+  PopulateHistogram(histogramWindowing, 3, 3, 100);
+  PopulateHistogram(otherWindowing, 3, 3, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+
+  histogramWindowing.Merge(otherWindowing);
+  ASSERT_EQ(histogramWindowing.num(), 600);
+  ASSERT_EQ(histogramWindowing.min(), 1);
+  ASSERT_EQ(histogramWindowing.max(), 3);
+  ASSERT_EQ(histogramWindowing.Average(), 2.0);
+
+  // dropping oldest window with value 1, remaining 2 ~ 4
+  PopulateHistogram(histogramWindowing, 4, 4, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+  ASSERT_EQ(histogramWindowing.num(), 500);
+  ASSERT_EQ(histogramWindowing.min(), 2);
+  ASSERT_EQ(histogramWindowing.max(), 4);
+
+  // dropping oldest window with value 2, remaining 3 ~ 5
+  PopulateHistogram(histogramWindowing, 5, 5, 100);
+  clock->SleepForMicroseconds(micros_per_window);
+  ASSERT_EQ(histogramWindowing.num(), 400);
+  ASSERT_EQ(histogramWindowing.min(), 3);
+  ASSERT_EQ(histogramWindowing.max(), 5);
+}
+
+TEST_F(HistogramTest, LargeStandardDeviation) {
+  HistogramImpl histogram;
+  PopulateHistogram(histogram, 1, 1000000);
+  ASSERT_LT(fabs(histogram.StandardDeviation() - 288675), 1);
+}
+
+TEST_F(HistogramTest, LostUpdateStandardDeviation) {
+  HistogramImpl histogram;
+  PopulateHistogram(histogram, 100, 100, 100);
+  // Simulate a possible lost update (since they are not atomic)
+  histogram.TEST_GetStats().sum_squares_ -= 10000;
+  // Ideally zero, but should never be negative or NaN
+  ASSERT_GE(histogram.StandardDeviation(), 0.0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/monitoring/histogram_windowing.cc b/src/rocksdb/monitoring/histogram_windowing.cc
new file mode 100644
index 000000000..c41ae8a03
--- /dev/null
+++ b/src/rocksdb/monitoring/histogram_windowing.cc
@@ -0,0 +1,194 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "monitoring/histogram_windowing.h"
+
+#include <algorithm>
+
+#include "monitoring/histogram.h"
+#include "rocksdb/system_clock.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+HistogramWindowingImpl::HistogramWindowingImpl() {
+  clock_ = SystemClock::Default();
+  window_stats_.reset(new HistogramStat[static_cast<size_t>(num_windows_)]);
+  Clear();
+}
+
+HistogramWindowingImpl::HistogramWindowingImpl(uint64_t num_windows,
+                                               uint64_t micros_per_window,
+                                               uint64_t min_num_per_window)
+    : num_windows_(num_windows),
+      micros_per_window_(micros_per_window),
+      min_num_per_window_(min_num_per_window) {
+  clock_ = SystemClock::Default();
+  window_stats_.reset(new HistogramStat[static_cast<size_t>(num_windows_)]);
+  Clear();
+}
+
+HistogramWindowingImpl::~HistogramWindowingImpl() {}
+
+void HistogramWindowingImpl::Clear() {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  stats_.Clear();
+  for (size_t i = 0; i < num_windows_; i++) {
+    window_stats_[i].Clear();
+  }
+  current_window_.store(0, std::memory_order_relaxed);
+  last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed);
+}
+
+bool HistogramWindowingImpl::Empty() const { return stats_.Empty(); }
+
+// This function is designed to be lock free, as it's in the critical path
+// of any operation.
+// Each individual value is atomic, it is just that some samples can go
+// in the older bucket which is tolerable.
+void HistogramWindowingImpl::Add(uint64_t value) {
+  TimerTick();
+
+  // Parent (global) member update
+  stats_.Add(value);
+
+  // Current window update
+  window_stats_[static_cast<size_t>(current_window())].Add(value);
+}
+
+void HistogramWindowingImpl::Merge(const Histogram& other) {
+  if (strcmp(Name(), other.Name()) == 0) {
+    Merge(*static_cast_with_check<const HistogramWindowingImpl>(&other));
+  }
+}
+
+void HistogramWindowingImpl::Merge(const HistogramWindowingImpl& other) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  stats_.Merge(other.stats_);
+
+  if (stats_.num_buckets_ != other.stats_.num_buckets_ ||
+      micros_per_window_ != other.micros_per_window_) {
+    return;
+  }
+
+  uint64_t cur_window = current_window();
+  uint64_t other_cur_window = other.current_window();
+  // going backwards for alignment
+  for (unsigned int i = 0; i < std::min(num_windows_, other.num_windows_);
+       i++) {
+    uint64_t window_index = (cur_window + num_windows_ - i) % num_windows_;
+    uint64_t other_window_index =
+        (other_cur_window + other.num_windows_ - i) % other.num_windows_;
+    size_t windex = static_cast<size_t>(window_index);
+    size_t other_windex = static_cast<size_t>(other_window_index);
+
+    window_stats_[windex].Merge(other.window_stats_[other_windex]);
+  }
+}
+
+std::string HistogramWindowingImpl::ToString() const {
+  return stats_.ToString();
+}
+
+double HistogramWindowingImpl::Median() const { return Percentile(50.0); }
+
+double HistogramWindowingImpl::Percentile(double p) const {
+  // Retry 3 times in total
+  for (int retry = 0; retry < 3; retry++) {
+    uint64_t start_num = stats_.num();
+    double result = stats_.Percentile(p);
+    // Detect if swap buckets or Clear() was called during calculation
+    if (stats_.num() >= start_num) {
+      return result;
+    }
+  }
+  return 0.0;
+}
+
+double HistogramWindowingImpl::Average() const { return stats_.Average(); }
+
+double HistogramWindowingImpl::StandardDeviation() const {
+  return stats_.StandardDeviation();
+}
+
+void HistogramWindowingImpl::Data(HistogramData* const data) const {
+  stats_.Data(data);
+}
+
+void HistogramWindowingImpl::TimerTick() {
+  uint64_t curr_time = clock_->NowMicros();
+  size_t curr_window_ = static_cast<size_t>(current_window());
+  if (curr_time - last_swap_time() > micros_per_window_ &&
+      window_stats_[curr_window_].num() >= min_num_per_window_) {
+    SwapHistoryBucket();
+  }
+}
+
+void HistogramWindowingImpl::SwapHistoryBucket() {
+  // Threads executing Add() would be competing for this mutex, the first one
+  // who got the metex would take care of the bucket swap, other threads
+  // can skip this.
+  // If mutex is held by Merge() or Clear(), next Add() will take care of the
+  // swap, if needed.
+  if (mutex_.try_lock()) {
+    last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed);
+
+    uint64_t curr_window = current_window();
+    uint64_t next_window =
+        (curr_window == num_windows_ - 1) ? 0 : curr_window + 1;
+
+    // subtract next buckets from totals and swap to next buckets
+    HistogramStat& stats_to_drop =
+        window_stats_[static_cast<size_t>(next_window)];
+
+    if (!stats_to_drop.Empty()) {
+      for (size_t b = 0; b < stats_.num_buckets_; b++) {
+        stats_.buckets_[b].fetch_sub(stats_to_drop.bucket_at(b),
+                                     std::memory_order_relaxed);
+      }
+
+      if (stats_.min() == stats_to_drop.min()) {
+        uint64_t new_min = std::numeric_limits<uint64_t>::max();
+        for (unsigned int i = 0; i < num_windows_; i++) {
+          if (i != next_window) {
+            uint64_t m = window_stats_[i].min();
+            if (m < new_min) new_min = m;
+          }
+        }
+        stats_.min_.store(new_min, std::memory_order_relaxed);
+      }
+
+      if (stats_.max() == stats_to_drop.max()) {
+        uint64_t new_max = 0;
+        for (unsigned int i = 0; i < num_windows_; i++) {
+          if (i != next_window) {
+            uint64_t m = window_stats_[i].max();
+            if (m > new_max) new_max = m;
+          }
+        }
+        stats_.max_.store(new_max, std::memory_order_relaxed);
+      }
+
+      stats_.num_.fetch_sub(stats_to_drop.num(), std::memory_order_relaxed);
+      stats_.sum_.fetch_sub(stats_to_drop.sum(), std::memory_order_relaxed);
+      stats_.sum_squares_.fetch_sub(stats_to_drop.sum_squares(),
+                                    std::memory_order_relaxed);
+
+      stats_to_drop.Clear();
+    }
+
+    // advance to next window bucket
+    current_window_.store(next_window, std::memory_order_relaxed);
+
+    mutex_.unlock();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/histogram_windowing.h b/src/rocksdb/monitoring/histogram_windowing.h
new file mode 100644
index 000000000..9a862671f
--- /dev/null
+++ b/src/rocksdb/monitoring/histogram_windowing.h
@@ -0,0 +1,84 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "monitoring/histogram.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+
+class HistogramWindowingImpl : public Histogram {
+ public:
+  HistogramWindowingImpl();
+  HistogramWindowingImpl(uint64_t num_windows, uint64_t micros_per_window,
+                         uint64_t min_num_per_window);
+
+  HistogramWindowingImpl(const HistogramWindowingImpl&) = delete;
+  HistogramWindowingImpl& operator=(const HistogramWindowingImpl&) = delete;
+
+  ~HistogramWindowingImpl();
+
+  virtual void Clear() override;
+  virtual bool Empty() const override;
+  virtual void Add(uint64_t value) override;
+  virtual void Merge(const Histogram& other) override;
+  void Merge(const HistogramWindowingImpl& other);
+
+  virtual std::string ToString() const override;
+  virtual const char* Name() const override { return "HistogramWindowingImpl"; }
+  virtual uint64_t min() const override { return stats_.min(); }
+  virtual uint64_t max() const override { return stats_.max(); }
+  virtual uint64_t num() const override { return stats_.num(); }
+  virtual double Median() const override;
+  virtual double Percentile(double p) const override;
+  virtual double Average() const override;
+  virtual double StandardDeviation() const override;
+  virtual void Data(HistogramData* const data) const override;
+
+#ifndef NDEBUG
+  void TEST_UpdateClock(const std::shared_ptr<SystemClock>& clock) {
+    clock_ = clock;
+  }
+#endif  // NDEBUG
+
+ private:
+  void TimerTick();
+  void SwapHistoryBucket();
+  inline uint64_t current_window() const {
+    return current_window_.load(std::memory_order_relaxed);
+  }
+  inline uint64_t last_swap_time() const {
+    return last_swap_time_.load(std::memory_order_relaxed);
+  }
+
+  std::shared_ptr<SystemClock> clock_;
+  std::mutex mutex_;
+
+  // Aggregated stats over windows_stats_, all the computation is done
+  // upon aggregated values
+  HistogramStat stats_;
+
+  // This is a circular array representing the latest N time-windows.
+  // Each entry stores a time-window of data. Expiration is done
+  // on window-based.
+  std::unique_ptr<HistogramStat[]> window_stats_;
+
+  std::atomic_uint_fast64_t current_window_;
+  std::atomic_uint_fast64_t last_swap_time_;
+
+  // Following parameters are configuable
+  uint64_t num_windows_ = 5;
+  uint64_t micros_per_window_ = 60000000;
+  // By default, don't care about the number of values in current window
+  // when decide whether to swap windows or not.
+  uint64_t min_num_per_window_ = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/in_memory_stats_history.cc b/src/rocksdb/monitoring/in_memory_stats_history.cc
new file mode 100644
index 000000000..568d8ec13
--- /dev/null
+++ b/src/rocksdb/monitoring/in_memory_stats_history.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "monitoring/in_memory_stats_history.h"
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+InMemoryStatsHistoryIterator::~InMemoryStatsHistoryIterator() {}
+
+bool InMemoryStatsHistoryIterator::Valid() const { return valid_; }
+
+Status InMemoryStatsHistoryIterator::status() const { return status_; }
+
+// Because of garbage collection, the next stats snapshot may or may not be
+// right after the current one. When reading from DBImpl::stats_history_, this
+// call will be protected by DB Mutex so it will not return partial or
+// corrupted results.
+void InMemoryStatsHistoryIterator::Next() {
+  // increment start_time by 1 to avoid infinite loop
+  AdvanceIteratorByTime(GetStatsTime() + 1, end_time_);
+}
+
+uint64_t InMemoryStatsHistoryIterator::GetStatsTime() const { return time_; }
+
+const std::map<std::string, uint64_t>&
+InMemoryStatsHistoryIterator::GetStatsMap() const {
+  return stats_map_;
+}
+
+// advance the iterator to the next time between [start_time, end_time)
+// if success, update time_ and stats_map_ with new_time and stats_map
+void InMemoryStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time,
+                                                         uint64_t end_time) {
+  // try to find next entry in stats_history_ map
+  if (db_impl_ != nullptr) {
+    valid_ =
+        db_impl_->FindStatsByTime(start_time, end_time, &time_, &stats_map_);
+  } else {
+    valid_ = false;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/in_memory_stats_history.h b/src/rocksdb/monitoring/in_memory_stats_history.h
new file mode 100644
index 000000000..3be864fe2
--- /dev/null
+++ b/src/rocksdb/monitoring/in_memory_stats_history.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/stats_history.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// InMemoryStatsHistoryIterator can be used to access stats history that was
+// stored by an in-memory two level std::map(DBImpl::stats_history_). It keeps
+// a copy of the stats snapshot (in stats_map_) that is currently being pointed
+// to, which allows the iterator to access the stats snapshot even when
+// the background garbage collecting thread purges it from the source of truth
+// (`DBImpl::stats_history_`). In that case, the iterator will continue to be
+// valid until a call to `Next()` returns no result and invalidates it. In
+// some extreme cases, the iterator may also return fragmented segments of
+// stats snapshots due to long gaps between `Next()` calls and interleaved
+// garbage collection.
+class InMemoryStatsHistoryIterator final : public StatsHistoryIterator {
+ public:
+  // Setup InMemoryStatsHistoryIterator to return stats snapshots between
+  // seconds timestamps [start_time, end_time)
+  InMemoryStatsHistoryIterator(uint64_t start_time, uint64_t end_time,
+                               DBImpl* db_impl)
+      : start_time_(start_time),
+        end_time_(end_time),
+        valid_(true),
+        db_impl_(db_impl) {
+    AdvanceIteratorByTime(start_time_, end_time_);
+  }
+  // no copying allowed
+  InMemoryStatsHistoryIterator(const InMemoryStatsHistoryIterator&) = delete;
+  void operator=(const InMemoryStatsHistoryIterator&) = delete;
+  InMemoryStatsHistoryIterator(InMemoryStatsHistoryIterator&&) = delete;
+  InMemoryStatsHistoryIterator& operator=(InMemoryStatsHistoryIterator&&) =
+      delete;
+
+  ~InMemoryStatsHistoryIterator() override;
+  bool Valid() const override;
+  Status status() const override;
+
+  // Move to the next stats snapshot currently available
+  // This function may invalidate the iterator
+  // REQUIRES: Valid()
+  void Next() override;
+
+  // REQUIRES: Valid()
+  uint64_t GetStatsTime() const override;
+
+  // This function is idempotent
+  // REQUIRES: Valid()
+  const std::map<std::string, uint64_t>& GetStatsMap() const override;
+
+ private:
+  // advance the iterator to the next stats history record with timestamp
+  // between [start_time, end_time)
+  void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time);
+
+  uint64_t time_;
+  uint64_t start_time_;
+  uint64_t end_time_;
+  std::map<std::string, uint64_t> stats_map_;
+  Status status_;
+  bool valid_;
+  DBImpl* db_impl_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/instrumented_mutex.cc b/src/rocksdb/monitoring/instrumented_mutex.cc
new file mode 100644
index 000000000..699495a34
--- /dev/null
+++ b/src/rocksdb/monitoring/instrumented_mutex.cc
@@ -0,0 +1,90 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "monitoring/instrumented_mutex.h"
+
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+#ifndef NPERF_CONTEXT
+Statistics* stats_for_report(SystemClock* clock, Statistics* stats) {
+  if (clock != nullptr && stats != nullptr &&
+      stats->get_stats_level() > kExceptTimeForMutex) {
+    return stats;
+  } else {
+    return nullptr;
+  }
+}
+#endif  // NPERF_CONTEXT
+}  // namespace
+
+void InstrumentedMutex::Lock() {
+  PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(
+      db_mutex_lock_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS,
+      stats_for_report(clock_, stats_), stats_code_);
+  LockInternal();
+}
+
+void InstrumentedMutex::LockInternal() {
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT);
+#endif
+#ifdef COERCE_CONTEXT_SWITCH
+  if (stats_code_ == DB_MUTEX_WAIT_MICROS) {
+    thread_local Random rnd(301);
+    if (rnd.OneIn(2)) {
+      if (bg_cv_) {
+        bg_cv_->SignalAll();
+      }
+      sched_yield();
+    } else {
+      uint32_t sleep_us = rnd.Uniform(11) * 1000;
+      if (bg_cv_) {
+        bg_cv_->SignalAll();
+      }
+      SystemClock::Default()->SleepForMicroseconds(sleep_us);
+    }
+  }
+#endif
+  mutex_.Lock();
+}
+
+void InstrumentedCondVar::Wait() {
+  PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(
+      db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS,
+      stats_for_report(clock_, stats_), stats_code_);
+  WaitInternal();
+}
+
+void InstrumentedCondVar::WaitInternal() {
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT);
+#endif
+  cond_.Wait();
+}
+
+bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) {
+  PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(
+      db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS,
+      stats_for_report(clock_, stats_), stats_code_);
+  return TimedWaitInternal(abs_time_us);
+}
+
+bool InstrumentedCondVar::TimedWaitInternal(uint64_t abs_time_us) {
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT);
+#endif
+
+  TEST_SYNC_POINT_CALLBACK("InstrumentedCondVar::TimedWaitInternal",
+                           &abs_time_us);
+
+  return cond_.TimedWait(abs_time_us);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/instrumented_mutex.h b/src/rocksdb/monitoring/instrumented_mutex.h
new file mode 100644
index 000000000..e5aae34df
--- /dev/null
+++ b/src/rocksdb/monitoring/instrumented_mutex.h
@@ -0,0 +1,126 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/thread_status.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+class InstrumentedCondVar;
+
+// A wrapper class for port::Mutex that provides additional layer
+// for collecting stats and instrumentation.
+class InstrumentedMutex {
+ public:
+  explicit InstrumentedMutex(bool adaptive = false)
+      : mutex_(adaptive), stats_(nullptr), clock_(nullptr), stats_code_(0) {}
+
+  explicit InstrumentedMutex(SystemClock* clock, bool adaptive = false)
+      : mutex_(adaptive), stats_(nullptr), clock_(clock), stats_code_(0) {}
+
+  InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code,
+                    bool adaptive = false)
+      : mutex_(adaptive),
+        stats_(stats),
+        clock_(clock),
+        stats_code_(stats_code) {}
+
+#ifdef COERCE_CONTEXT_SWITCH
+  InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code,
+                    InstrumentedCondVar* bg_cv, bool adaptive = false)
+      : mutex_(adaptive),
+        stats_(stats),
+        clock_(clock),
+        stats_code_(stats_code),
+        bg_cv_(bg_cv) {}
+#endif
+
+  void Lock();
+
+  void Unlock() { mutex_.Unlock(); }
+
+  void AssertHeld() { mutex_.AssertHeld(); }
+
+ private:
+  void LockInternal();
+  friend class InstrumentedCondVar;
+  port::Mutex mutex_;
+  Statistics* stats_;
+  SystemClock* clock_;
+  int stats_code_;
+#ifdef COERCE_CONTEXT_SWITCH
+  InstrumentedCondVar* bg_cv_ = nullptr;
+#endif
+};
+
+class ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedInstrumentedMutex
+    : public InstrumentedMutex {
+  using InstrumentedMutex::InstrumentedMutex;
+};
+static_assert(alignof(CacheAlignedInstrumentedMutex) != CACHE_LINE_SIZE ||
+              sizeof(CacheAlignedInstrumentedMutex) % CACHE_LINE_SIZE == 0);
+
+// RAII wrapper for InstrumentedMutex
+class InstrumentedMutexLock {
+ public:
+  explicit InstrumentedMutexLock(InstrumentedMutex* mutex) : mutex_(mutex) {
+    mutex_->Lock();
+  }
+
+  ~InstrumentedMutexLock() { mutex_->Unlock(); }
+
+ private:
+  InstrumentedMutex* const mutex_;
+  InstrumentedMutexLock(const InstrumentedMutexLock&) = delete;
+  void operator=(const InstrumentedMutexLock&) = delete;
+};
+
+// RAII wrapper for temporary releasing InstrumentedMutex inside
+// InstrumentedMutexLock
+class InstrumentedMutexUnlock {
+ public:
+  explicit InstrumentedMutexUnlock(InstrumentedMutex* mutex) : mutex_(mutex) {
+    mutex_->Unlock();
+  }
+
+  ~InstrumentedMutexUnlock() { mutex_->Lock(); }
+
+ private:
+  InstrumentedMutex* const mutex_;
+  InstrumentedMutexUnlock(const InstrumentedMutexUnlock&) = delete;
+  void operator=(const InstrumentedMutexUnlock&) = delete;
+};
+
+class InstrumentedCondVar {
+ public:
+  explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex)
+      : cond_(&(instrumented_mutex->mutex_)),
+        stats_(instrumented_mutex->stats_),
+        clock_(instrumented_mutex->clock_),
+        stats_code_(instrumented_mutex->stats_code_) {}
+
+  void Wait();
+
+  bool TimedWait(uint64_t abs_time_us);
+
+  void Signal() { cond_.Signal(); }
+
+  void SignalAll() { cond_.SignalAll(); }
+
+ private:
+  void WaitInternal();
+  bool TimedWaitInternal(uint64_t abs_time_us);
+  port::CondVar cond_;
+  Statistics* stats_;
+  SystemClock* clock_;
+  int stats_code_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/iostats_context.cc b/src/rocksdb/monitoring/iostats_context.cc
new file mode 100644
index 000000000..04e98914d
--- /dev/null
+++ b/src/rocksdb/monitoring/iostats_context.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <sstream>
+
+#include "monitoring/iostats_context_imp.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef NIOSTATS_CONTEXT
+// Should not be used because the counters are not thread-safe.
+// Put here just to make get_iostats_context() simple without ifdef.
+static IOStatsContext iostats_context;
+#else
+thread_local IOStatsContext iostats_context;
+#endif
+
+IOStatsContext* get_iostats_context() { return &iostats_context; }
+
+void IOStatsContext::Reset() {
+#ifndef NIOSTATS_CONTEXT
+  thread_pool_id = Env::Priority::TOTAL;
+  bytes_read = 0;
+  bytes_written = 0;
+  open_nanos = 0;
+  allocate_nanos = 0;
+  write_nanos = 0;
+  read_nanos = 0;
+  range_sync_nanos = 0;
+  prepare_write_nanos = 0;
+  fsync_nanos = 0;
+  logger_nanos = 0;
+  cpu_write_nanos = 0;
+  cpu_read_nanos = 0;
+  file_io_stats_by_temperature.Reset();
+#endif  //! NIOSTATS_CONTEXT
+}
+
+#define IOSTATS_CONTEXT_OUTPUT(counter)         \
+  if (!exclude_zero_counters || counter > 0) {  \
+    ss << #counter << " = " << counter << ", "; \
+  }
+
+std::string IOStatsContext::ToString(bool exclude_zero_counters) const {
+#ifdef NIOSTATS_CONTEXT
+  (void)exclude_zero_counters;
+  return "";
+#else
+  std::ostringstream ss;
+  IOSTATS_CONTEXT_OUTPUT(thread_pool_id);
+  IOSTATS_CONTEXT_OUTPUT(bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(bytes_written);
+  IOSTATS_CONTEXT_OUTPUT(open_nanos);
+  IOSTATS_CONTEXT_OUTPUT(allocate_nanos);
+  IOSTATS_CONTEXT_OUTPUT(write_nanos);
+  IOSTATS_CONTEXT_OUTPUT(read_nanos);
+  IOSTATS_CONTEXT_OUTPUT(range_sync_nanos);
+  IOSTATS_CONTEXT_OUTPUT(fsync_nanos);
+  IOSTATS_CONTEXT_OUTPUT(prepare_write_nanos);
+  IOSTATS_CONTEXT_OUTPUT(logger_nanos);
+  IOSTATS_CONTEXT_OUTPUT(cpu_write_nanos);
+  IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count);
+  std::string str = ss.str();
+  str.erase(str.find_last_not_of(", ") + 1);
+  return str;
+#endif  //! NIOSTATS_CONTEXT
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/iostats_context_imp.h b/src/rocksdb/monitoring/iostats_context_imp.h
new file mode 100644
index 000000000..a0b4292df
--- /dev/null
+++ b/src/rocksdb/monitoring/iostats_context_imp.h
@@ -0,0 +1,62 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "monitoring/perf_step_timer.h"
+#include "rocksdb/iostats_context.h"
+
+#if !defined(NIOSTATS_CONTEXT)
+namespace ROCKSDB_NAMESPACE {
+extern thread_local IOStatsContext iostats_context;
+}  // namespace ROCKSDB_NAMESPACE
+
+// increment a specific counter by the specified value
+#define IOSTATS_ADD(metric, value)        \
+  if (!iostats_context.disable_iostats) { \
+    iostats_context.metric += value;      \
+  }
+
+// reset a specific counter to zero
+#define IOSTATS_RESET(metric) (iostats_context.metric = 0)
+
+// reset all counters to zero
+#define IOSTATS_RESET_ALL() (iostats_context.Reset())
+
+#define IOSTATS_SET_THREAD_POOL_ID(value) \
+  (iostats_context.thread_pool_id = value)
+
+#define IOSTATS_THREAD_POOL_ID() (iostats_context.thread_pool_id)
+
+#define IOSTATS(metric) (iostats_context.metric)
+
+// Declare and set start time of the timer
+#define IOSTATS_TIMER_GUARD(metric)                                     \
+  PerfStepTimer iostats_step_timer_##metric(&(iostats_context.metric)); \
+  iostats_step_timer_##metric.Start();
+
+// Declare and set start time of the timer
+#define IOSTATS_CPU_TIMER_GUARD(metric, clock)         \
+  PerfStepTimer iostats_step_timer_##metric(           \
+      &(iostats_context.metric), clock, true,          \
+      PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \
+  iostats_step_timer_##metric.Start();
+
+#define IOSTATS_SET_DISABLE(disable) (iostats_context.disable_iostats = disable)
+
+#else  // !NIOSTATS_CONTEXT
+
+#define IOSTATS_ADD(metric, value)
+#define IOSTATS_ADD_IF_POSITIVE(metric, value)
+#define IOSTATS_RESET(metric)
+#define IOSTATS_RESET_ALL()
+#define IOSTATS_SET_THREAD_POOL_ID(value)
+#define IOSTATS_THREAD_POOL_ID()
+#define IOSTATS(metric) 0
+#define IOSTATS_SET_DISABLE(disable)
+
+#define IOSTATS_TIMER_GUARD(metric)
+#define IOSTATS_CPU_TIMER_GUARD(metric, clock) static_cast<void>(clock)
+
+#endif  // !NIOSTATS_CONTEXT
diff --git a/src/rocksdb/monitoring/iostats_context_test.cc b/src/rocksdb/monitoring/iostats_context_test.cc
new file mode 100644
index 000000000..5fce33406
--- /dev/null
+++ b/src/rocksdb/monitoring/iostats_context_test.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/iostats_context.h"
+
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(IOStatsContextTest, ToString) {
+  get_iostats_context()->Reset();
+  get_iostats_context()->bytes_read = 12345;
+
+  std::string zero_included = get_iostats_context()->ToString();
+  ASSERT_NE(std::string::npos, zero_included.find("= 0"));
+  ASSERT_NE(std::string::npos, zero_included.find("= 12345"));
+
+  std::string zero_excluded = get_iostats_context()->ToString(true);
+  ASSERT_EQ(std::string::npos, zero_excluded.find("= 0"));
+  ASSERT_NE(std::string::npos, zero_excluded.find("= 12345"));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/monitoring/perf_context.cc b/src/rocksdb/monitoring/perf_context.cc
new file mode 100644
index 000000000..9068ede01
--- /dev/null
+++ b/src/rocksdb/monitoring/perf_context.cc
@@ -0,0 +1,652 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include <sstream>
+
+#include "monitoring/perf_context_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#if defined(NPERF_CONTEXT)
+// Should not be used because the counters are not thread-safe.
+// Put here just to make get_perf_context() simple without ifdef.
+PerfContext perf_context;
+#else
+thread_local PerfContext perf_context;
+#endif
+
+PerfContext* get_perf_context() { return &perf_context; }
+
+PerfContext::~PerfContext() {
+#if !defined(NPERF_CONTEXT) && !defined(OS_SOLARIS)
+  ClearPerLevelPerfContext();
+#endif
+}
+
+PerfContext::PerfContext(const PerfContext& other) {
+#ifdef NPERF_CONTEXT
+  (void)other;
+#else
+  user_key_comparison_count = other.user_key_comparison_count;
+  block_cache_hit_count = other.block_cache_hit_count;
+  block_read_count = other.block_read_count;
+  block_read_byte = other.block_read_byte;
+  block_read_time = other.block_read_time;
+  block_cache_index_hit_count = other.block_cache_index_hit_count;
+  block_cache_standalone_handle_count =
+      other.block_cache_standalone_handle_count;
+  block_cache_real_handle_count = other.block_cache_real_handle_count;
+  index_block_read_count = other.index_block_read_count;
+  block_cache_filter_hit_count = other.block_cache_filter_hit_count;
+  filter_block_read_count = other.filter_block_read_count;
+  compression_dict_block_read_count = other.compression_dict_block_read_count;
+  secondary_cache_hit_count = other.secondary_cache_hit_count;
+  compressed_sec_cache_insert_real_count =
+      other.compressed_sec_cache_insert_real_count;
+  compressed_sec_cache_insert_dummy_count =
+      other.compressed_sec_cache_insert_dummy_count;
+  compressed_sec_cache_uncompressed_bytes =
+      other.compressed_sec_cache_uncompressed_bytes;
+  compressed_sec_cache_compressed_bytes =
+      other.compressed_sec_cache_compressed_bytes;
+  block_checksum_time = other.block_checksum_time;
+  block_decompress_time = other.block_decompress_time;
+  get_read_bytes = other.get_read_bytes;
+  multiget_read_bytes = other.multiget_read_bytes;
+  iter_read_bytes = other.iter_read_bytes;
+
+  blob_cache_hit_count = other.blob_cache_hit_count;
+  blob_read_count = other.blob_read_count;
+  blob_read_byte = other.blob_read_byte;
+  blob_read_time = other.blob_read_time;
+  blob_checksum_time = other.blob_checksum_time;
+  blob_decompress_time = other.blob_decompress_time;
+
+  internal_key_skipped_count = other.internal_key_skipped_count;
+  internal_delete_skipped_count = other.internal_delete_skipped_count;
+  internal_recent_skipped_count = other.internal_recent_skipped_count;
+  internal_merge_count = other.internal_merge_count;
+  internal_range_del_reseek_count = other.internal_range_del_reseek_count;
+  write_wal_time = other.write_wal_time;
+  get_snapshot_time = other.get_snapshot_time;
+  get_from_memtable_time = other.get_from_memtable_time;
+  get_from_memtable_count = other.get_from_memtable_count;
+  get_post_process_time = other.get_post_process_time;
+  get_from_output_files_time = other.get_from_output_files_time;
+  seek_on_memtable_time = other.seek_on_memtable_time;
+  seek_on_memtable_count = other.seek_on_memtable_count;
+  next_on_memtable_count = other.next_on_memtable_count;
+  prev_on_memtable_count = other.prev_on_memtable_count;
+  seek_child_seek_time = other.seek_child_seek_time;
+  seek_child_seek_count = other.seek_child_seek_count;
+  seek_min_heap_time = other.seek_min_heap_time;
+  seek_internal_seek_time = other.seek_internal_seek_time;
+  find_next_user_entry_time = other.find_next_user_entry_time;
+  write_pre_and_post_process_time = other.write_pre_and_post_process_time;
+  write_memtable_time = other.write_memtable_time;
+  write_delay_time = other.write_delay_time;
+  write_thread_wait_nanos = other.write_thread_wait_nanos;
+  write_scheduling_flushes_compactions_time =
+      other.write_scheduling_flushes_compactions_time;
+  db_mutex_lock_nanos = other.db_mutex_lock_nanos;
+  db_condition_wait_nanos = other.db_condition_wait_nanos;
+  merge_operator_time_nanos = other.merge_operator_time_nanos;
+  read_index_block_nanos = other.read_index_block_nanos;
+  read_filter_block_nanos = other.read_filter_block_nanos;
+  new_table_block_iter_nanos = other.new_table_block_iter_nanos;
+  new_table_iterator_nanos = other.new_table_iterator_nanos;
+  block_seek_nanos = other.block_seek_nanos;
+  find_table_nanos = other.find_table_nanos;
+  bloom_memtable_hit_count = other.bloom_memtable_hit_count;
+  bloom_memtable_miss_count = other.bloom_memtable_miss_count;
+  bloom_sst_hit_count = other.bloom_sst_hit_count;
+  bloom_sst_miss_count = other.bloom_sst_miss_count;
+  key_lock_wait_time = other.key_lock_wait_time;
+  key_lock_wait_count = other.key_lock_wait_count;
+
+  env_new_sequential_file_nanos = other.env_new_sequential_file_nanos;
+  env_new_random_access_file_nanos = other.env_new_random_access_file_nanos;
+  env_new_writable_file_nanos = other.env_new_writable_file_nanos;
+  env_reuse_writable_file_nanos = other.env_reuse_writable_file_nanos;
+  env_new_random_rw_file_nanos = other.env_new_random_rw_file_nanos;
+  env_new_directory_nanos = other.env_new_directory_nanos;
+  env_file_exists_nanos = other.env_file_exists_nanos;
+  env_get_children_nanos = other.env_get_children_nanos;
+  env_get_children_file_attributes_nanos =
+      other.env_get_children_file_attributes_nanos;
+  env_delete_file_nanos = other.env_delete_file_nanos;
+  env_create_dir_nanos = other.env_create_dir_nanos;
+  env_create_dir_if_missing_nanos = other.env_create_dir_if_missing_nanos;
+  env_delete_dir_nanos = other.env_delete_dir_nanos;
+  env_get_file_size_nanos = other.env_get_file_size_nanos;
+  env_get_file_modification_time_nanos =
+      other.env_get_file_modification_time_nanos;
+  env_rename_file_nanos = other.env_rename_file_nanos;
+  env_link_file_nanos = other.env_link_file_nanos;
+  env_lock_file_nanos = other.env_lock_file_nanos;
+  env_unlock_file_nanos = other.env_unlock_file_nanos;
+  env_new_logger_nanos = other.env_new_logger_nanos;
+  get_cpu_nanos = other.get_cpu_nanos;
+  iter_next_cpu_nanos = other.iter_next_cpu_nanos;
+  iter_prev_cpu_nanos = other.iter_prev_cpu_nanos;
+  iter_seek_cpu_nanos = other.iter_seek_cpu_nanos;
+  number_async_seek = other.number_async_seek;
+  if (per_level_perf_context_enabled && level_to_perf_context != nullptr) {
+    ClearPerLevelPerfContext();
+  }
+  if (other.level_to_perf_context != nullptr) {
+    level_to_perf_context = new std::map<uint32_t, PerfContextByLevel>();
+    *level_to_perf_context = *other.level_to_perf_context;
+  }
+  per_level_perf_context_enabled = other.per_level_perf_context_enabled;
+#endif
+}
+
+PerfContext::PerfContext(PerfContext&& other) noexcept {
+#ifdef NPERF_CONTEXT
+  (void)other;
+#else
+  user_key_comparison_count = other.user_key_comparison_count;
+  block_cache_hit_count = other.block_cache_hit_count;
+  block_read_count = other.block_read_count;
+  block_read_byte = other.block_read_byte;
+  block_read_time = other.block_read_time;
+  block_cache_index_hit_count = other.block_cache_index_hit_count;
+  block_cache_standalone_handle_count =
+      other.block_cache_standalone_handle_count;
+  block_cache_real_handle_count = other.block_cache_real_handle_count;
+  index_block_read_count = other.index_block_read_count;
+  block_cache_filter_hit_count = other.block_cache_filter_hit_count;
+  filter_block_read_count = other.filter_block_read_count;
+  compression_dict_block_read_count = other.compression_dict_block_read_count;
+  secondary_cache_hit_count = other.secondary_cache_hit_count;
+  compressed_sec_cache_insert_real_count =
+      other.compressed_sec_cache_insert_real_count;
+  compressed_sec_cache_insert_dummy_count =
+      other.compressed_sec_cache_insert_dummy_count;
+  compressed_sec_cache_uncompressed_bytes =
+      other.compressed_sec_cache_uncompressed_bytes;
+  compressed_sec_cache_compressed_bytes =
+      other.compressed_sec_cache_compressed_bytes;
+  block_checksum_time = other.block_checksum_time;
+  block_decompress_time = other.block_decompress_time;
+  get_read_bytes = other.get_read_bytes;
+  multiget_read_bytes = other.multiget_read_bytes;
+  iter_read_bytes = other.iter_read_bytes;
+
+  blob_cache_hit_count = other.blob_cache_hit_count;
+  blob_read_count = other.blob_read_count;
+  blob_read_byte = other.blob_read_byte;
+  blob_read_time = other.blob_read_time;
+  blob_checksum_time = other.blob_checksum_time;
+  blob_decompress_time = other.blob_decompress_time;
+
+  internal_key_skipped_count = other.internal_key_skipped_count;
+  internal_delete_skipped_count = other.internal_delete_skipped_count;
+  internal_recent_skipped_count = other.internal_recent_skipped_count;
+  internal_merge_count = other.internal_merge_count;
+  internal_range_del_reseek_count = other.internal_range_del_reseek_count;
+  write_wal_time = other.write_wal_time;
+  get_snapshot_time = other.get_snapshot_time;
+  get_from_memtable_time = other.get_from_memtable_time;
+  get_from_memtable_count = other.get_from_memtable_count;
+  get_post_process_time = other.get_post_process_time;
+  get_from_output_files_time = other.get_from_output_files_time;
+  seek_on_memtable_time = other.seek_on_memtable_time;
+  seek_on_memtable_count = other.seek_on_memtable_count;
+  next_on_memtable_count = other.next_on_memtable_count;
+  prev_on_memtable_count = other.prev_on_memtable_count;
+  seek_child_seek_time = other.seek_child_seek_time;
+  seek_child_seek_count = other.seek_child_seek_count;
+  seek_min_heap_time = other.seek_min_heap_time;
+  seek_internal_seek_time = other.seek_internal_seek_time;
+  find_next_user_entry_time = other.find_next_user_entry_time;
+  write_pre_and_post_process_time = other.write_pre_and_post_process_time;
+  write_memtable_time = other.write_memtable_time;
+  write_delay_time = other.write_delay_time;
+  write_thread_wait_nanos = other.write_thread_wait_nanos;
+  write_scheduling_flushes_compactions_time =
+      other.write_scheduling_flushes_compactions_time;
+  db_mutex_lock_nanos = other.db_mutex_lock_nanos;
+  db_condition_wait_nanos = other.db_condition_wait_nanos;
+  merge_operator_time_nanos = other.merge_operator_time_nanos;
+  read_index_block_nanos = other.read_index_block_nanos;
+  read_filter_block_nanos = other.read_filter_block_nanos;
+  new_table_block_iter_nanos = other.new_table_block_iter_nanos;
+  new_table_iterator_nanos = other.new_table_iterator_nanos;
+  block_seek_nanos = other.block_seek_nanos;
+  find_table_nanos = other.find_table_nanos;
+  bloom_memtable_hit_count = other.bloom_memtable_hit_count;
+  bloom_memtable_miss_count = other.bloom_memtable_miss_count;
+  bloom_sst_hit_count = other.bloom_sst_hit_count;
+  bloom_sst_miss_count = other.bloom_sst_miss_count;
+  key_lock_wait_time = other.key_lock_wait_time;
+  key_lock_wait_count = other.key_lock_wait_count;
+
+  env_new_sequential_file_nanos = other.env_new_sequential_file_nanos;
+  env_new_random_access_file_nanos = other.env_new_random_access_file_nanos;
+  env_new_writable_file_nanos = other.env_new_writable_file_nanos;
+  env_reuse_writable_file_nanos = other.env_reuse_writable_file_nanos;
+  env_new_random_rw_file_nanos = other.env_new_random_rw_file_nanos;
+  env_new_directory_nanos = other.env_new_directory_nanos;
+  env_file_exists_nanos = other.env_file_exists_nanos;
+  env_get_children_nanos = other.env_get_children_nanos;
+  env_get_children_file_attributes_nanos =
+      other.env_get_children_file_attributes_nanos;
+  env_delete_file_nanos = other.env_delete_file_nanos;
+  env_create_dir_nanos = other.env_create_dir_nanos;
+  env_create_dir_if_missing_nanos = other.env_create_dir_if_missing_nanos;
+  env_delete_dir_nanos = other.env_delete_dir_nanos;
+  env_get_file_size_nanos = other.env_get_file_size_nanos;
+  env_get_file_modification_time_nanos =
+      other.env_get_file_modification_time_nanos;
+  env_rename_file_nanos = other.env_rename_file_nanos;
+  env_link_file_nanos = other.env_link_file_nanos;
+  env_lock_file_nanos = other.env_lock_file_nanos;
+  env_unlock_file_nanos = other.env_unlock_file_nanos;
+  env_new_logger_nanos = other.env_new_logger_nanos;
+  get_cpu_nanos = other.get_cpu_nanos;
+  iter_next_cpu_nanos = other.iter_next_cpu_nanos;
+  iter_prev_cpu_nanos = other.iter_prev_cpu_nanos;
+  iter_seek_cpu_nanos = other.iter_seek_cpu_nanos;
+  number_async_seek = other.number_async_seek;
+  if (per_level_perf_context_enabled && level_to_perf_context != nullptr) {
+    ClearPerLevelPerfContext();
+  }
+  if (other.level_to_perf_context != nullptr) {
+    level_to_perf_context = other.level_to_perf_context;
+    other.level_to_perf_context = nullptr;
+  }
+  per_level_perf_context_enabled = other.per_level_perf_context_enabled;
+#endif
+}
+
+// TODO(Zhongyi): reduce code duplication between copy constructor and
+// assignment operator
+PerfContext& PerfContext::operator=(const PerfContext& other) {
+#ifdef NPERF_CONTEXT
+  (void)other;
+#else
+  user_key_comparison_count = other.user_key_comparison_count;
+  block_cache_hit_count = other.block_cache_hit_count;
+  block_read_count = other.block_read_count;
+  block_read_byte = other.block_read_byte;
+  block_read_time = other.block_read_time;
+  block_cache_index_hit_count = other.block_cache_index_hit_count;
+  block_cache_standalone_handle_count =
+      other.block_cache_standalone_handle_count;
+  block_cache_real_handle_count = other.block_cache_real_handle_count;
+  index_block_read_count = other.index_block_read_count;
+  block_cache_filter_hit_count = other.block_cache_filter_hit_count;
+  filter_block_read_count = other.filter_block_read_count;
+  compression_dict_block_read_count = other.compression_dict_block_read_count;
+  secondary_cache_hit_count = other.secondary_cache_hit_count;
+  compressed_sec_cache_insert_real_count =
+      other.compressed_sec_cache_insert_real_count;
+  compressed_sec_cache_insert_dummy_count =
+      other.compressed_sec_cache_insert_dummy_count;
+  compressed_sec_cache_uncompressed_bytes =
+      other.compressed_sec_cache_uncompressed_bytes;
+  compressed_sec_cache_compressed_bytes =
+      other.compressed_sec_cache_compressed_bytes;
+  block_checksum_time = other.block_checksum_time;
+  block_decompress_time = other.block_decompress_time;
+  get_read_bytes = other.get_read_bytes;
+  multiget_read_bytes = other.multiget_read_bytes;
+  iter_read_bytes = other.iter_read_bytes;
+
+  blob_cache_hit_count = other.blob_cache_hit_count;
+  blob_read_count = other.blob_read_count;
+  blob_read_byte = other.blob_read_byte;
+  blob_read_time = other.blob_read_time;
+  blob_checksum_time = other.blob_checksum_time;
+  blob_decompress_time = other.blob_decompress_time;
+
+  internal_key_skipped_count = other.internal_key_skipped_count;
+  internal_delete_skipped_count = other.internal_delete_skipped_count;
+  internal_recent_skipped_count = other.internal_recent_skipped_count;
+  internal_merge_count = other.internal_merge_count;
+  internal_range_del_reseek_count = other.internal_range_del_reseek_count;
+  write_wal_time = other.write_wal_time;
+  get_snapshot_time = other.get_snapshot_time;
+  get_from_memtable_time = other.get_from_memtable_time;
+  get_from_memtable_count = other.get_from_memtable_count;
+  get_post_process_time = other.get_post_process_time;
+  get_from_output_files_time = other.get_from_output_files_time;
+  seek_on_memtable_time = other.seek_on_memtable_time;
+  seek_on_memtable_count = other.seek_on_memtable_count;
+  next_on_memtable_count = other.next_on_memtable_count;
+  prev_on_memtable_count = other.prev_on_memtable_count;
+  seek_child_seek_time = other.seek_child_seek_time;
+  seek_child_seek_count = other.seek_child_seek_count;
+  seek_min_heap_time = other.seek_min_heap_time;
+  seek_internal_seek_time = other.seek_internal_seek_time;
+  find_next_user_entry_time = other.find_next_user_entry_time;
+  write_pre_and_post_process_time = other.write_pre_and_post_process_time;
+  write_memtable_time = other.write_memtable_time;
+  write_delay_time = other.write_delay_time;
+  write_thread_wait_nanos = other.write_thread_wait_nanos;
+  write_scheduling_flushes_compactions_time =
+      other.write_scheduling_flushes_compactions_time;
+  db_mutex_lock_nanos = other.db_mutex_lock_nanos;
+  db_condition_wait_nanos = other.db_condition_wait_nanos;
+  merge_operator_time_nanos = other.merge_operator_time_nanos;
+  read_index_block_nanos = other.read_index_block_nanos;
+  read_filter_block_nanos = other.read_filter_block_nanos;
+  new_table_block_iter_nanos = other.new_table_block_iter_nanos;
+  new_table_iterator_nanos = other.new_table_iterator_nanos;
+  block_seek_nanos = other.block_seek_nanos;
+  find_table_nanos = other.find_table_nanos;
+  bloom_memtable_hit_count = other.bloom_memtable_hit_count;
+  bloom_memtable_miss_count = other.bloom_memtable_miss_count;
+  bloom_sst_hit_count = other.bloom_sst_hit_count;
+  bloom_sst_miss_count = other.bloom_sst_miss_count;
+  key_lock_wait_time = other.key_lock_wait_time;
+  key_lock_wait_count = other.key_lock_wait_count;
+
+  env_new_sequential_file_nanos = other.env_new_sequential_file_nanos;
+  env_new_random_access_file_nanos = other.env_new_random_access_file_nanos;
+  env_new_writable_file_nanos = other.env_new_writable_file_nanos;
+  env_reuse_writable_file_nanos = other.env_reuse_writable_file_nanos;
+  env_new_random_rw_file_nanos = other.env_new_random_rw_file_nanos;
+  env_new_directory_nanos = other.env_new_directory_nanos;
+  env_file_exists_nanos = other.env_file_exists_nanos;
+  env_get_children_nanos = other.env_get_children_nanos;
+  env_get_children_file_attributes_nanos =
+      other.env_get_children_file_attributes_nanos;
+  env_delete_file_nanos = other.env_delete_file_nanos;
+  env_create_dir_nanos = other.env_create_dir_nanos;
+  env_create_dir_if_missing_nanos = other.env_create_dir_if_missing_nanos;
+  env_delete_dir_nanos = other.env_delete_dir_nanos;
+  env_get_file_size_nanos = other.env_get_file_size_nanos;
+  env_get_file_modification_time_nanos =
+      other.env_get_file_modification_time_nanos;
+  env_rename_file_nanos = other.env_rename_file_nanos;
+  env_link_file_nanos = other.env_link_file_nanos;
+  env_lock_file_nanos = other.env_lock_file_nanos;
+  env_unlock_file_nanos = other.env_unlock_file_nanos;
+  env_new_logger_nanos = other.env_new_logger_nanos;
+  get_cpu_nanos = other.get_cpu_nanos;
+  iter_next_cpu_nanos = other.iter_next_cpu_nanos;
+  iter_prev_cpu_nanos = other.iter_prev_cpu_nanos;
+  iter_seek_cpu_nanos = other.iter_seek_cpu_nanos;
+  number_async_seek = other.number_async_seek;
+  if (per_level_perf_context_enabled && level_to_perf_context != nullptr) {
+    ClearPerLevelPerfContext();
+  }
+  if (other.level_to_perf_context != nullptr) {
+    level_to_perf_context = new std::map<uint32_t, PerfContextByLevel>();
+    *level_to_perf_context = *other.level_to_perf_context;
+  }
+  per_level_perf_context_enabled = other.per_level_perf_context_enabled;
+#endif
+  return *this;
+}
+
+void PerfContext::Reset() {
+#ifndef NPERF_CONTEXT
+  user_key_comparison_count = 0;
+  block_cache_hit_count = 0;
+  block_read_count = 0;
+  block_read_byte = 0;
+  block_read_time = 0;
+  block_cache_index_hit_count = 0;
+  block_cache_standalone_handle_count = 0;
+  block_cache_real_handle_count = 0;
+  index_block_read_count = 0;
+  block_cache_filter_hit_count = 0;
+  filter_block_read_count = 0;
+  compression_dict_block_read_count = 0;
+  secondary_cache_hit_count = 0;
+  compressed_sec_cache_insert_real_count = 0;
+  compressed_sec_cache_insert_dummy_count = 0;
+  compressed_sec_cache_uncompressed_bytes = 0;
+  compressed_sec_cache_compressed_bytes = 0;
+  block_checksum_time = 0;
+  block_decompress_time = 0;
+  get_read_bytes = 0;
+  multiget_read_bytes = 0;
+  iter_read_bytes = 0;
+
+  blob_cache_hit_count = 0;
+  blob_read_count = 0;
+  blob_read_byte = 0;
+  blob_read_time = 0;
+  blob_checksum_time = 0;
+  blob_decompress_time = 0;
+
+  internal_key_skipped_count = 0;
+  internal_delete_skipped_count = 0;
+  internal_recent_skipped_count = 0;
+  internal_merge_count = 0;
+  internal_range_del_reseek_count = 0;
+  write_wal_time = 0;
+
+  get_snapshot_time = 0;
+  get_from_memtable_time = 0;
+  get_from_memtable_count = 0;
+  get_post_process_time = 0;
+  get_from_output_files_time = 0;
+  seek_on_memtable_time = 0;
+  seek_on_memtable_count = 0;
+  next_on_memtable_count = 0;
+  prev_on_memtable_count = 0;
+  seek_child_seek_time = 0;
+  seek_child_seek_count = 0;
+  seek_min_heap_time = 0;
+  seek_internal_seek_time = 0;
+  find_next_user_entry_time = 0;
+  write_pre_and_post_process_time = 0;
+  write_memtable_time = 0;
+  write_delay_time = 0;
+  write_thread_wait_nanos = 0;
+  write_scheduling_flushes_compactions_time = 0;
+  db_mutex_lock_nanos = 0;
+  db_condition_wait_nanos = 0;
+  merge_operator_time_nanos = 0;
+  read_index_block_nanos = 0;
+  read_filter_block_nanos = 0;
+  new_table_block_iter_nanos = 0;
+  new_table_iterator_nanos = 0;
+  block_seek_nanos = 0;
+  find_table_nanos = 0;
+  bloom_memtable_hit_count = 0;
+  bloom_memtable_miss_count = 0;
+  bloom_sst_hit_count = 0;
+  bloom_sst_miss_count = 0;
+  key_lock_wait_time = 0;
+  key_lock_wait_count = 0;
+
+  env_new_sequential_file_nanos = 0;
+  env_new_random_access_file_nanos = 0;
+  env_new_writable_file_nanos = 0;
+  env_reuse_writable_file_nanos = 0;
+  env_new_random_rw_file_nanos = 0;
+  env_new_directory_nanos = 0;
+  env_file_exists_nanos = 0;
+  env_get_children_nanos = 0;
+  env_get_children_file_attributes_nanos = 0;
+  env_delete_file_nanos = 0;
+  env_create_dir_nanos = 0;
+  env_create_dir_if_missing_nanos = 0;
+  env_delete_dir_nanos = 0;
+  env_get_file_size_nanos = 0;
+  env_get_file_modification_time_nanos = 0;
+  env_rename_file_nanos = 0;
+  env_link_file_nanos = 0;
+  env_lock_file_nanos = 0;
+  env_unlock_file_nanos = 0;
+  env_new_logger_nanos = 0;
+  get_cpu_nanos = 0;
+  iter_next_cpu_nanos = 0;
+  iter_prev_cpu_nanos = 0;
+  iter_seek_cpu_nanos = 0;
+  number_async_seek = 0;
+  if (per_level_perf_context_enabled && level_to_perf_context) {
+    for (auto& kv : *level_to_perf_context) {
+      kv.second.Reset();
+    }
+  }
+#endif
+}
+
+#define PERF_CONTEXT_OUTPUT(counter)             \
+  if (!exclude_zero_counters || (counter > 0)) { \
+    ss << #counter << " = " << counter << ", ";  \
+  }
+
+#define PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(counter)        \
+  if (per_level_perf_context_enabled && level_to_perf_context) { \
+    ss << #counter << " = ";                                     \
+    for (auto& kv : *level_to_perf_context) {                    \
+      if (!exclude_zero_counters || (kv.second.counter > 0)) {   \
+        ss << kv.second.counter << "@level" << kv.first << ", "; \
+      }                                                          \
+    }                                                            \
+  }
+
+void PerfContextByLevel::Reset() {
+#ifndef NPERF_CONTEXT
+  bloom_filter_useful = 0;
+  bloom_filter_full_positive = 0;
+  bloom_filter_full_true_positive = 0;
+  block_cache_hit_count = 0;
+  block_cache_miss_count = 0;
+#endif
+}
+
+std::string PerfContext::ToString(bool exclude_zero_counters) const {
+#ifdef NPERF_CONTEXT
+  (void)exclude_zero_counters;
+  return "";
+#else
+  std::ostringstream ss;
+  PERF_CONTEXT_OUTPUT(user_key_comparison_count);
+  PERF_CONTEXT_OUTPUT(block_cache_hit_count);
+  PERF_CONTEXT_OUTPUT(block_read_count);
+  PERF_CONTEXT_OUTPUT(block_read_byte);
+  PERF_CONTEXT_OUTPUT(block_read_time);
+  PERF_CONTEXT_OUTPUT(block_cache_index_hit_count);
+  PERF_CONTEXT_OUTPUT(block_cache_standalone_handle_count);
+  PERF_CONTEXT_OUTPUT(block_cache_real_handle_count);
+  PERF_CONTEXT_OUTPUT(index_block_read_count);
+  PERF_CONTEXT_OUTPUT(block_cache_filter_hit_count);
+  PERF_CONTEXT_OUTPUT(filter_block_read_count);
+  PERF_CONTEXT_OUTPUT(compression_dict_block_read_count);
+  PERF_CONTEXT_OUTPUT(secondary_cache_hit_count);
+  PERF_CONTEXT_OUTPUT(compressed_sec_cache_insert_real_count);
+  PERF_CONTEXT_OUTPUT(compressed_sec_cache_insert_dummy_count);
+  PERF_CONTEXT_OUTPUT(compressed_sec_cache_uncompressed_bytes);
+  PERF_CONTEXT_OUTPUT(compressed_sec_cache_compressed_bytes);
+  PERF_CONTEXT_OUTPUT(block_checksum_time);
+  PERF_CONTEXT_OUTPUT(block_decompress_time);
+  PERF_CONTEXT_OUTPUT(get_read_bytes);
+  PERF_CONTEXT_OUTPUT(multiget_read_bytes);
+  PERF_CONTEXT_OUTPUT(iter_read_bytes);
+  PERF_CONTEXT_OUTPUT(blob_cache_hit_count);
+  PERF_CONTEXT_OUTPUT(blob_read_count);
+  PERF_CONTEXT_OUTPUT(blob_read_byte);
+  PERF_CONTEXT_OUTPUT(blob_read_time);
+  PERF_CONTEXT_OUTPUT(blob_checksum_time);
+  PERF_CONTEXT_OUTPUT(blob_decompress_time);
+  PERF_CONTEXT_OUTPUT(internal_key_skipped_count);
+  PERF_CONTEXT_OUTPUT(internal_delete_skipped_count);
+  PERF_CONTEXT_OUTPUT(internal_recent_skipped_count);
+  PERF_CONTEXT_OUTPUT(internal_merge_count);
+  PERF_CONTEXT_OUTPUT(internal_range_del_reseek_count);
+  PERF_CONTEXT_OUTPUT(write_wal_time);
+  PERF_CONTEXT_OUTPUT(get_snapshot_time);
+  PERF_CONTEXT_OUTPUT(get_from_memtable_time);
+  PERF_CONTEXT_OUTPUT(get_from_memtable_count);
+  PERF_CONTEXT_OUTPUT(get_post_process_time);
+  PERF_CONTEXT_OUTPUT(get_from_output_files_time);
+  PERF_CONTEXT_OUTPUT(seek_on_memtable_time);
+  PERF_CONTEXT_OUTPUT(seek_on_memtable_count);
+  PERF_CONTEXT_OUTPUT(next_on_memtable_count);
+  PERF_CONTEXT_OUTPUT(prev_on_memtable_count);
+  PERF_CONTEXT_OUTPUT(seek_child_seek_time);
+  PERF_CONTEXT_OUTPUT(seek_child_seek_count);
+  PERF_CONTEXT_OUTPUT(seek_min_heap_time);
+  PERF_CONTEXT_OUTPUT(seek_internal_seek_time);
+  PERF_CONTEXT_OUTPUT(find_next_user_entry_time);
+  PERF_CONTEXT_OUTPUT(write_pre_and_post_process_time);
+  PERF_CONTEXT_OUTPUT(write_memtable_time);
+  PERF_CONTEXT_OUTPUT(write_thread_wait_nanos);
+  PERF_CONTEXT_OUTPUT(write_scheduling_flushes_compactions_time);
+  PERF_CONTEXT_OUTPUT(db_mutex_lock_nanos);
+  PERF_CONTEXT_OUTPUT(db_condition_wait_nanos);
+  PERF_CONTEXT_OUTPUT(merge_operator_time_nanos);
+  PERF_CONTEXT_OUTPUT(write_delay_time);
+  PERF_CONTEXT_OUTPUT(read_index_block_nanos);
+  PERF_CONTEXT_OUTPUT(read_filter_block_nanos);
+  PERF_CONTEXT_OUTPUT(new_table_block_iter_nanos);
+  PERF_CONTEXT_OUTPUT(new_table_iterator_nanos);
+  PERF_CONTEXT_OUTPUT(block_seek_nanos);
+  PERF_CONTEXT_OUTPUT(find_table_nanos);
+  PERF_CONTEXT_OUTPUT(bloom_memtable_hit_count);
+  PERF_CONTEXT_OUTPUT(bloom_memtable_miss_count);
+  PERF_CONTEXT_OUTPUT(bloom_sst_hit_count);
+  PERF_CONTEXT_OUTPUT(bloom_sst_miss_count);
+  PERF_CONTEXT_OUTPUT(key_lock_wait_time);
+  PERF_CONTEXT_OUTPUT(key_lock_wait_count);
+  PERF_CONTEXT_OUTPUT(env_new_sequential_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_new_random_access_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_new_writable_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_reuse_writable_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_new_random_rw_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_new_directory_nanos);
+  PERF_CONTEXT_OUTPUT(env_file_exists_nanos);
+  PERF_CONTEXT_OUTPUT(env_get_children_nanos);
+  PERF_CONTEXT_OUTPUT(env_get_children_file_attributes_nanos);
+  PERF_CONTEXT_OUTPUT(env_delete_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_create_dir_nanos);
+  PERF_CONTEXT_OUTPUT(env_create_dir_if_missing_nanos);
+  PERF_CONTEXT_OUTPUT(env_delete_dir_nanos);
+  PERF_CONTEXT_OUTPUT(env_get_file_size_nanos);
+  PERF_CONTEXT_OUTPUT(env_get_file_modification_time_nanos);
+  PERF_CONTEXT_OUTPUT(env_rename_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_link_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_lock_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_unlock_file_nanos);
+  PERF_CONTEXT_OUTPUT(env_new_logger_nanos);
+  PERF_CONTEXT_OUTPUT(get_cpu_nanos);
+  PERF_CONTEXT_OUTPUT(iter_next_cpu_nanos);
+  PERF_CONTEXT_OUTPUT(iter_prev_cpu_nanos);
+  PERF_CONTEXT_OUTPUT(iter_seek_cpu_nanos);
+  PERF_CONTEXT_OUTPUT(number_async_seek);
+  PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_useful);
+  PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_positive);
+  PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_true_positive);
+  PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_hit_count);
+  PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_miss_count);
+
+  std::string str = ss.str();
+  str.erase(str.find_last_not_of(", ") + 1);
+  return str;
+#endif
+}
+
+void PerfContext::EnablePerLevelPerfContext() {
+  if (level_to_perf_context == nullptr) {
+    level_to_perf_context = new std::map<uint32_t, PerfContextByLevel>();
+  }
+  per_level_perf_context_enabled = true;
+}
+
+void PerfContext::DisablePerLevelPerfContext() {
+  per_level_perf_context_enabled = false;
+}
+
+void PerfContext::ClearPerLevelPerfContext() {
+  if (level_to_perf_context != nullptr) {
+    level_to_perf_context->clear();
+    delete level_to_perf_context;
+    level_to_perf_context = nullptr;
+  }
+  per_level_perf_context_enabled = false;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/perf_context_imp.h b/src/rocksdb/monitoring/perf_context_imp.h
new file mode 100644
index 000000000..5b66ff2ff
--- /dev/null
+++ b/src/rocksdb/monitoring/perf_context_imp.h
@@ -0,0 +1,96 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "monitoring/perf_step_timer.h"
+#include "rocksdb/perf_context.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+#if defined(NPERF_CONTEXT)
+extern PerfContext perf_context;
+#else
+#if defined(OS_SOLARIS)
+extern thread_local PerfContext perf_context_;
+#define perf_context (*get_perf_context())
+#else
+extern thread_local PerfContext perf_context;
+#endif
+#endif
+
+#if defined(NPERF_CONTEXT)
+
+#define PERF_TIMER_STOP(metric)
+#define PERF_TIMER_START(metric)
+#define PERF_TIMER_GUARD(metric)
+#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock)
+#define PERF_CPU_TIMER_GUARD(metric, clock)
+#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \
+                                               ticker_type)
+#define PERF_TIMER_MEASURE(metric)
+#define PERF_COUNTER_ADD(metric, value)
+#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level)
+
+#else
+
+// Stop the timer and update the metric
+#define PERF_TIMER_STOP(metric) perf_step_timer_##metric.Stop();
+
+#define PERF_TIMER_START(metric) perf_step_timer_##metric.Start();
+
+// Declare and set start time of the timer
+#define PERF_TIMER_GUARD(metric)                                  \
+  PerfStepTimer perf_step_timer_##metric(&(perf_context.metric)); \
+  perf_step_timer_##metric.Start();
+
+// Declare and set start time of the timer
+#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock)                       \
+  PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), clock); \
+  perf_step_timer_##metric.Start();
+
+// Declare and set start time of the timer
+#define PERF_CPU_TIMER_GUARD(metric, clock)            \
+  PerfStepTimer perf_step_timer_##metric(              \
+      &(perf_context.metric), clock, true,             \
+      PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \
+  perf_step_timer_##metric.Start();
+
+#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats,       \
+                                               ticker_type)                    \
+  PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr,      \
+                                         false, PerfLevel::kEnableTime, stats, \
+                                         ticker_type);                         \
+  if (condition) {                                                             \
+    perf_step_timer_##metric.Start();                                          \
+  }
+
+// Update metric with time elapsed since last START. start time is reset
+// to current timestamp.
+#define PERF_TIMER_MEASURE(metric) perf_step_timer_##metric.Measure();
+
+// Increase metric value
+#define PERF_COUNTER_ADD(metric, value)        \
+  if (perf_level >= PerfLevel::kEnableCount) { \
+    perf_context.metric += value;              \
+  }
+
+// Increase metric value
+#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level)               \
+  if (perf_level >= PerfLevel::kEnableCount &&                        \
+      perf_context.per_level_perf_context_enabled &&                  \
+      perf_context.level_to_perf_context) {                           \
+    if ((*(perf_context.level_to_perf_context)).find(level) !=        \
+        (*(perf_context.level_to_perf_context)).end()) {              \
+      (*(perf_context.level_to_perf_context))[level].metric += value; \
+    } else {                                                          \
+      PerfContextByLevel empty_context;                               \
+      (*(perf_context.level_to_perf_context))[level] = empty_context; \
+      (*(perf_context.level_to_perf_context))[level].metric += value; \
+    }                                                                 \
+  }
+
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/perf_level.cc b/src/rocksdb/monitoring/perf_level.cc
new file mode 100644
index 000000000..e3507624b
--- /dev/null
+++ b/src/rocksdb/monitoring/perf_level.cc
@@ -0,0 +1,23 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include <assert.h>
+
+#include "monitoring/perf_level_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+thread_local PerfLevel perf_level = kEnableCount;
+
+void SetPerfLevel(PerfLevel level) {
+  assert(level > kUninitialized);
+  assert(level < kOutOfBounds);
+  perf_level = level;
+}
+
+PerfLevel GetPerfLevel() { return perf_level; }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/perf_level_imp.h b/src/rocksdb/monitoring/perf_level_imp.h
new file mode 100644
index 000000000..28bd185cd
--- /dev/null
+++ b/src/rocksdb/monitoring/perf_level_imp.h
@@ -0,0 +1,14 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "port/port.h"
+#include "rocksdb/perf_level.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern thread_local PerfLevel perf_level;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/perf_step_timer.h b/src/rocksdb/monitoring/perf_step_timer.h
new file mode 100644
index 000000000..8deb31252
--- /dev/null
+++ b/src/rocksdb/monitoring/perf_step_timer.h
@@ -0,0 +1,77 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "monitoring/perf_level_imp.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PerfStepTimer {
+ public:
+  explicit PerfStepTimer(
+      uint64_t* metric, SystemClock* clock = nullptr, bool use_cpu_time = false,
+      PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex,
+      Statistics* statistics = nullptr, uint32_t ticker_type = 0)
+      : perf_counter_enabled_(perf_level >= enable_level),
+        use_cpu_time_(use_cpu_time),
+        ticker_type_(ticker_type),
+        clock_((perf_counter_enabled_ || statistics != nullptr)
+                   ? (clock ? clock : SystemClock::Default().get())
+                   : nullptr),
+        start_(0),
+        metric_(metric),
+        statistics_(statistics) {}
+
+  ~PerfStepTimer() { Stop(); }
+
+  void Start() {
+    if (perf_counter_enabled_ || statistics_ != nullptr) {
+      start_ = time_now();
+    }
+  }
+
+  void Measure() {
+    if (start_) {
+      uint64_t now = time_now();
+      *metric_ += now - start_;
+      start_ = now;
+    }
+  }
+
+  void Stop() {
+    if (start_) {
+      uint64_t duration = time_now() - start_;
+      if (perf_counter_enabled_) {
+        *metric_ += duration;
+      }
+
+      if (statistics_ != nullptr) {
+        RecordTick(statistics_, ticker_type_, duration);
+      }
+      start_ = 0;
+    }
+  }
+
+ private:
+  uint64_t time_now() {
+    if (!use_cpu_time_) {
+      return clock_->NowNanos();
+    } else {
+      return clock_->CPUNanos();
+    }
+  }
+
+  const bool perf_counter_enabled_;
+  const bool use_cpu_time_;
+  uint32_t ticker_type_;
+  SystemClock* const clock_;
+  uint64_t start_;
+  uint64_t* metric_;
+  Statistics* statistics_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/persistent_stats_history.cc b/src/rocksdb/monitoring/persistent_stats_history.cc
new file mode 100644
index 000000000..f4c022148
--- /dev/null
+++ b/src/rocksdb/monitoring/persistent_stats_history.cc
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "monitoring/persistent_stats_history.h"
+
+#include <cstring>
+#include <string>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+// 10 digit seconds timestamp => [Sep 9, 2001 ~ Nov 20, 2286]
+const int kNowSecondsStringLength = 10;
+const std::string kFormatVersionKeyString =
+    "__persistent_stats_format_version__";
+const std::string kCompatibleVersionKeyString =
+    "__persistent_stats_compatible_version__";
+// Every release maintains two versions numbers for persistents stats: Current
+// format version and compatible format version. Current format version
+// designates what type of encoding will be used when writing to stats CF;
+// compatible format version designates the minimum format version that
+// can decode the stats CF encoded using the current format version.
+const uint64_t kStatsCFCurrentFormatVersion = 1;
+const uint64_t kStatsCFCompatibleFormatVersion = 1;
+
+Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type,
+                                          uint64_t* version_number) {
+  if (type >= StatsVersionKeyType::kKeyTypeMax) {
+    return Status::InvalidArgument("Invalid stats version key type provided");
+  }
+  std::string key;
+  if (type == StatsVersionKeyType::kFormatVersion) {
+    key = kFormatVersionKeyString;
+  } else if (type == StatsVersionKeyType::kCompatibleVersion) {
+    key = kCompatibleVersionKeyString;
+  }
+  ReadOptions options;
+  options.verify_checksums = true;
+  std::string result;
+  Status s = db->Get(options, db->PersistentStatsColumnFamily(), key, &result);
+  if (!s.ok() || result.empty()) {
+    return Status::NotFound("Persistent stats version key " + key +
+                            " not found.");
+  }
+
+  // read version_number but do nothing in current version
+  *version_number = ParseUint64(result);
+  return Status::OK();
+}
+
+int EncodePersistentStatsKey(uint64_t now_seconds, const std::string& key,
+                             int size, char* buf) {
+  char timestamp[kNowSecondsStringLength + 1];
+  // make time stamp string equal in length to allow sorting by time
+  snprintf(timestamp, sizeof(timestamp), "%010d",
+           static_cast<int>(now_seconds));
+  timestamp[kNowSecondsStringLength] = '\0';
+  return snprintf(buf, size, "%s#%s", timestamp, key.c_str());
+}
+
+void OptimizeForPersistentStats(ColumnFamilyOptions* cfo) {
+  cfo->write_buffer_size = 2 << 20;
+  cfo->target_file_size_base = 2 * 1048576;
+  cfo->max_bytes_for_level_base = 10 * 1048576;
+  cfo->soft_pending_compaction_bytes_limit = 256 * 1048576;
+  cfo->hard_pending_compaction_bytes_limit = 1073741824ul;
+  cfo->compression = kNoCompression;
+}
+
+PersistentStatsHistoryIterator::~PersistentStatsHistoryIterator() {}
+
+bool PersistentStatsHistoryIterator::Valid() const { return valid_; }
+
+Status PersistentStatsHistoryIterator::status() const { return status_; }
+
+void PersistentStatsHistoryIterator::Next() {
+  // increment start_time by 1 to avoid infinite loop
+  AdvanceIteratorByTime(GetStatsTime() + 1, end_time_);
+}
+
+uint64_t PersistentStatsHistoryIterator::GetStatsTime() const { return time_; }
+
+const std::map<std::string, uint64_t>&
+PersistentStatsHistoryIterator::GetStatsMap() const {
+  return stats_map_;
+}
+
+std::pair<uint64_t, std::string> parseKey(const Slice& key,
+                                          uint64_t start_time) {
+  std::pair<uint64_t, std::string> result;
+  std::string key_str = key.ToString();
+  std::string::size_type pos = key_str.find("#");
+  // TODO(Zhongyi): add counters to track parse failures?
+  if (pos == std::string::npos) {
+    result.first = std::numeric_limits<uint64_t>::max();
+    result.second.clear();
+  } else {
+    uint64_t parsed_time = ParseUint64(key_str.substr(0, pos));
+    // skip entries with timestamp smaller than start_time
+    if (parsed_time < start_time) {
+      result.first = std::numeric_limits<uint64_t>::max();
+      result.second = "";
+    } else {
+      result.first = parsed_time;
+      std::string key_resize = key_str.substr(pos + 1);
+      result.second = key_resize;
+    }
+  }
+  return result;
+}
+
+// advance the iterator to the next time between [start_time, end_time)
+// if success, update time_ and stats_map_ with new_time and stats_map
+void PersistentStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time,
+                                                           uint64_t end_time) {
+  // try to find next entry in stats_history_ map
+  if (db_impl_ != nullptr) {
+    ReadOptions ro;
+    Iterator* iter =
+        db_impl_->NewIterator(ro, db_impl_->PersistentStatsColumnFamily());
+
+    char timestamp[kNowSecondsStringLength + 1];
+    snprintf(timestamp, sizeof(timestamp), "%010d",
+             static_cast<int>(std::max(time_, start_time)));
+    timestamp[kNowSecondsStringLength] = '\0';
+
+    iter->Seek(timestamp);
+    // no more entries with timestamp >= start_time is found or version key
+    // is found to be incompatible
+    if (!iter->Valid()) {
+      valid_ = false;
+      delete iter;
+      return;
+    }
+    time_ = parseKey(iter->key(), start_time).first;
+    valid_ = true;
+    // check parsed time and invalid if it exceeds end_time
+    if (time_ > end_time) {
+      valid_ = false;
+      delete iter;
+      return;
+    }
+    // find all entries with timestamp equal to time_
+    std::map<std::string, uint64_t> new_stats_map;
+    std::pair<uint64_t, std::string> kv;
+    for (; iter->Valid(); iter->Next()) {
+      kv = parseKey(iter->key(), start_time);
+      if (kv.first != time_) {
+        break;
+      }
+      if (kv.second.compare(kFormatVersionKeyString) == 0) {
+        continue;
+      }
+      new_stats_map[kv.second] = ParseUint64(iter->value().ToString());
+    }
+    stats_map_.swap(new_stats_map);
+    delete iter;
+  } else {
+    valid_ = false;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/persistent_stats_history.h b/src/rocksdb/monitoring/persistent_stats_history.h
new file mode 100644
index 000000000..7c711fe4e
--- /dev/null
+++ b/src/rocksdb/monitoring/persistent_stats_history.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/stats_history.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const std::string kFormatVersionKeyString;
+extern const std::string kCompatibleVersionKeyString;
+extern const uint64_t kStatsCFCurrentFormatVersion;
+extern const uint64_t kStatsCFCompatibleFormatVersion;
+
+enum StatsVersionKeyType : uint32_t {
+  kFormatVersion = 1,
+  kCompatibleVersion = 2,
+  kKeyTypeMax = 3
+};
+
+// Read the version number from persitent stats cf depending on type provided
+// stores the version number in `*version_number`
+// returns Status::OK() on success, or other status code on failure
+Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type,
+                                          uint64_t* version_number);
+
+// Encode timestamp and stats key into buf
+// Format: timestamp(10 digit) + '#' + key
+// Total length of encoded key will be capped at 100 bytes
+int EncodePersistentStatsKey(uint64_t timestamp, const std::string& key,
+                             int size, char* buf);
+
+void OptimizeForPersistentStats(ColumnFamilyOptions* cfo);
+
+class PersistentStatsHistoryIterator final : public StatsHistoryIterator {
+ public:
+  PersistentStatsHistoryIterator(uint64_t start_time, uint64_t end_time,
+                                 DBImpl* db_impl)
+      : time_(0),
+        start_time_(start_time),
+        end_time_(end_time),
+        valid_(true),
+        db_impl_(db_impl) {
+    AdvanceIteratorByTime(start_time_, end_time_);
+  }
+  ~PersistentStatsHistoryIterator() override;
+  bool Valid() const override;
+  Status status() const override;
+
+  void Next() override;
+  uint64_t GetStatsTime() const override;
+
+  const std::map<std::string, uint64_t>& GetStatsMap() const override;
+
+ private:
+  // advance the iterator to the next stats history record with timestamp
+  // between [start_time, end_time)
+  void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time);
+
+  // No copying allowed
+  PersistentStatsHistoryIterator(const PersistentStatsHistoryIterator&) =
+      delete;
+  void operator=(const PersistentStatsHistoryIterator&) = delete;
+  PersistentStatsHistoryIterator(PersistentStatsHistoryIterator&&) = delete;
+  PersistentStatsHistoryIterator& operator=(PersistentStatsHistoryIterator&&) =
+      delete;
+
+  uint64_t time_;
+  uint64_t start_time_;
+  uint64_t end_time_;
+  std::map<std::string, uint64_t> stats_map_;
+  Status status_;
+  bool valid_;
+  DBImpl* db_impl_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/statistics.cc b/src/rocksdb/monitoring/statistics.cc
new file mode 100644
index 000000000..e01eed3f3
--- /dev/null
+++ b/src/rocksdb/monitoring/statistics.cc
@@ -0,0 +1,527 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "monitoring/statistics.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The order of items listed in  Tickers should be the same as
+// the order listed in TickersNameMap
+const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
+    {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
+    {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
+    {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
+    {BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"},
+    {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
+    {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
+    {BLOCK_CACHE_INDEX_ADD, "rocksdb.block.cache.index.add"},
+    {BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"},
+    {BLOCK_CACHE_INDEX_BYTES_EVICT, "rocksdb.block.cache.index.bytes.evict"},
+    {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
+    {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
+    {BLOCK_CACHE_FILTER_ADD, "rocksdb.block.cache.filter.add"},
+    {BLOCK_CACHE_FILTER_BYTES_INSERT,
+     "rocksdb.block.cache.filter.bytes.insert"},
+    {BLOCK_CACHE_FILTER_BYTES_EVICT, "rocksdb.block.cache.filter.bytes.evict"},
+    {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
+    {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
+    {BLOCK_CACHE_DATA_ADD, "rocksdb.block.cache.data.add"},
+    {BLOCK_CACHE_DATA_BYTES_INSERT, "rocksdb.block.cache.data.bytes.insert"},
+    {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"},
+    {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"},
+    {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
+    {BLOOM_FILTER_FULL_POSITIVE, "rocksdb.bloom.filter.full.positive"},
+    {BLOOM_FILTER_FULL_TRUE_POSITIVE,
+     "rocksdb.bloom.filter.full.true.positive"},
+    {BLOOM_FILTER_MICROS, "rocksdb.bloom.filter.micros"},
+    {PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"},
+    {PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"},
+    {SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"},
+    {SIM_BLOCK_CACHE_MISS, "rocksdb.sim.block.cache.miss"},
+    {MEMTABLE_HIT, "rocksdb.memtable.hit"},
+    {MEMTABLE_MISS, "rocksdb.memtable.miss"},
+    {GET_HIT_L0, "rocksdb.l0.hit"},
+    {GET_HIT_L1, "rocksdb.l1.hit"},
+    {GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"},
+    {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
+    {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
+    {COMPACTION_KEY_DROP_RANGE_DEL, "rocksdb.compaction.key.drop.range_del"},
+    {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
+    {COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+     "rocksdb.compaction.range_del.drop.obsolete"},
+    {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+     "rocksdb.compaction.optimized.del.drop.obsolete"},
+    {COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"},
+    {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
+    {NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
+    {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
+    {BYTES_WRITTEN, "rocksdb.bytes.written"},
+    {BYTES_READ, "rocksdb.bytes.read"},
+    {NUMBER_DB_SEEK, "rocksdb.number.db.seek"},
+    {NUMBER_DB_NEXT, "rocksdb.number.db.next"},
+    {NUMBER_DB_PREV, "rocksdb.number.db.prev"},
+    {NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"},
+    {NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"},
+    {NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"},
+    {ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"},
+    {NO_FILE_CLOSES, "rocksdb.no.file.closes"},
+    {NO_FILE_OPENS, "rocksdb.no.file.opens"},
+    {NO_FILE_ERRORS, "rocksdb.no.file.errors"},
+    {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"},
+    {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"},
+    {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"},
+    {STALL_MICROS, "rocksdb.stall.micros"},
+    {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"},
+    {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"},
+    {NO_ITERATORS, "rocksdb.num.iterators"},
+    {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
+    {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
+    {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
+    {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
+    {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
+    {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
+    {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
+    {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
+    {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
+    {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"},
+    {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"},
+    {BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"},
+    {BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
+     "rocksdb.block.cachecompressed.add.failures"},
+    {WAL_FILE_SYNCED, "rocksdb.wal.synced"},
+    {WAL_FILE_BYTES, "rocksdb.wal.bytes"},
+    {WRITE_DONE_BY_SELF, "rocksdb.write.self"},
+    {WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
+    {WRITE_TIMEDOUT, "rocksdb.write.timeout"},
+    {WRITE_WITH_WAL, "rocksdb.write.wal"},
+    {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
+    {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
+    {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"},
+    {COMPACT_READ_BYTES_MARKED, "rocksdb.compact.read.marked.bytes"},
+    {COMPACT_READ_BYTES_PERIODIC, "rocksdb.compact.read.periodic.bytes"},
+    {COMPACT_READ_BYTES_TTL, "rocksdb.compact.read.ttl.bytes"},
+    {COMPACT_WRITE_BYTES_MARKED, "rocksdb.compact.write.marked.bytes"},
+    {COMPACT_WRITE_BYTES_PERIODIC, "rocksdb.compact.write.periodic.bytes"},
+    {COMPACT_WRITE_BYTES_TTL, "rocksdb.compact.write.ttl.bytes"},
+    {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
+     "rocksdb.number.direct.load.table.properties"},
+    {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
+    {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
+    {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
+    {NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"},
+    {NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"},
+    {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"},
+    {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"},
+    {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"},
+    {ROW_CACHE_HIT, "rocksdb.row.cache.hit"},
+    {ROW_CACHE_MISS, "rocksdb.row.cache.miss"},
+    {READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"},
+    {READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"},
+    {NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"},
+    {NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"},
+    {BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"},
+    {BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"},
+    {BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"},
+    {BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"},
+    {BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"},
+    {BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"},
+    {BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"},
+    {BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"},
+    {BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"},
+    {BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"},
+    {BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"},
+    {BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"},
+    {BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"},
+    {BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"},
+    {BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"},
+    {BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"},
+    {BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file.bytes.read"},
+    {BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"},
+    {BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
+     "rocksdb.blobdb.blob.index.expired.count"},
+    {BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, "rocksdb.blobdb.blob.index.expired.size"},
+    {BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
+     "rocksdb.blobdb.blob.index.evicted.count"},
+    {BLOB_DB_BLOB_INDEX_EVICTED_SIZE, "rocksdb.blobdb.blob.index.evicted.size"},
+    {BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"},
+    {BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"},
+    {BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"},
+    {BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, "rocksdb.blobdb.gc.num.keys.overwritten"},
+    {BLOB_DB_GC_NUM_KEYS_EXPIRED, "rocksdb.blobdb.gc.num.keys.expired"},
+    {BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"},
+    {BLOB_DB_GC_BYTES_OVERWRITTEN, "rocksdb.blobdb.gc.bytes.overwritten"},
+    {BLOB_DB_GC_BYTES_EXPIRED, "rocksdb.blobdb.gc.bytes.expired"},
+    {BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"},
+    {BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"},
+    {BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"},
+    {BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"},
+    {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"},
+    {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+     "rocksdb.txn.overhead.mutex.old.commit.map"},
+    {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"},
+    {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"},
+    {TXN_GET_TRY_AGAIN, "rocksdb.txn.get.tryagain"},
+    {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"},
+    {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"},
+    {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"},
+    {BLOCK_CACHE_COMPRESSION_DICT_MISS,
+     "rocksdb.block.cache.compression.dict.miss"},
+    {BLOCK_CACHE_COMPRESSION_DICT_HIT,
+     "rocksdb.block.cache.compression.dict.hit"},
+    {BLOCK_CACHE_COMPRESSION_DICT_ADD,
+     "rocksdb.block.cache.compression.dict.add"},
+    {BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
+     "rocksdb.block.cache.compression.dict.bytes.insert"},
+    {BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
+     "rocksdb.block.cache.compression.dict.bytes.evict"},
+    {BLOCK_CACHE_ADD_REDUNDANT, "rocksdb.block.cache.add.redundant"},
+    {BLOCK_CACHE_INDEX_ADD_REDUNDANT,
+     "rocksdb.block.cache.index.add.redundant"},
+    {BLOCK_CACHE_FILTER_ADD_REDUNDANT,
+     "rocksdb.block.cache.filter.add.redundant"},
+    {BLOCK_CACHE_DATA_ADD_REDUNDANT, "rocksdb.block.cache.data.add.redundant"},
+    {BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT,
+     "rocksdb.block.cache.compression.dict.add.redundant"},
+    {FILES_MARKED_TRASH, "rocksdb.files.marked.trash"},
+    {FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"},
+    {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.errro.count"},
+    {ERROR_HANDLER_BG_IO_ERROR_COUNT,
+     "rocksdb.error.handler.bg.io.errro.count"},
+    {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
+     "rocksdb.error.handler.bg.retryable.io.errro.count"},
+    {ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"},
+    {ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT,
+     "rocksdb.error.handler.autoresume.retry.total.count"},
+    {ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT,
+     "rocksdb.error.handler.autoresume.success.count"},
+    {MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
+     "rocksdb.memtable.payload.bytes.at.flush"},
+    {MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+     "rocksdb.memtable.garbage.bytes.at.flush"},
+    {SECONDARY_CACHE_HITS, "rocksdb.secondary.cache.hits"},
+    {VERIFY_CHECKSUM_READ_BYTES, "rocksdb.verify_checksum.read.bytes"},
+    {BACKUP_READ_BYTES, "rocksdb.backup.read.bytes"},
+    {BACKUP_WRITE_BYTES, "rocksdb.backup.write.bytes"},
+    {REMOTE_COMPACT_READ_BYTES, "rocksdb.remote.compact.read.bytes"},
+    {REMOTE_COMPACT_WRITE_BYTES, "rocksdb.remote.compact.write.bytes"},
+    {HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"},
+    {WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"},
+    {COLD_FILE_READ_BYTES, "rocksdb.cold.file.read.bytes"},
+    {HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"},
+    {WARM_FILE_READ_COUNT, "rocksdb.warm.file.read.count"},
+    {COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"},
+    {LAST_LEVEL_READ_BYTES, "rocksdb.last.level.read.bytes"},
+    {LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"},
+    {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"},
+    {NON_LAST_LEVEL_READ_COUNT, "rocksdb.non.last.level.read.count"},
+    {BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"},
+    {MULTIGET_COROUTINE_COUNT, "rocksdb.multiget.coroutine.count"},
+    {BLOB_DB_CACHE_MISS, "rocksdb.blobdb.cache.miss"},
+    {BLOB_DB_CACHE_HIT, "rocksdb.blobdb.cache.hit"},
+    {BLOB_DB_CACHE_ADD, "rocksdb.blobdb.cache.add"},
+    {BLOB_DB_CACHE_ADD_FAILURES, "rocksdb.blobdb.cache.add.failures"},
+    {BLOB_DB_CACHE_BYTES_READ, "rocksdb.blobdb.cache.bytes.read"},
+    {BLOB_DB_CACHE_BYTES_WRITE, "rocksdb.blobdb.cache.bytes.write"},
+    {READ_ASYNC_MICROS, "rocksdb.read.async.micros"},
+    {ASYNC_READ_ERROR_COUNT, "rocksdb.async.read.error.count"}};
+
+const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
+    {DB_GET, "rocksdb.db.get.micros"},
+    {DB_WRITE, "rocksdb.db.write.micros"},
+    {COMPACTION_TIME, "rocksdb.compaction.times.micros"},
+    {COMPACTION_CPU_TIME, "rocksdb.compaction.times.cpu_micros"},
+    {SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"},
+    {TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"},
+    {COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"},
+    {WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"},
+    {MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"},
+    {TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros"},
+    {DB_MULTIGET, "rocksdb.db.multiget.micros"},
+    {READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"},
+    {READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"},
+    {WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"},
+    {STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
+    {STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
+    {STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
+    {HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
+    {SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
+    {NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"},
+    {DB_SEEK, "rocksdb.db.seek.micros"},
+    {WRITE_STALL, "rocksdb.db.write.stall"},
+    {SST_READ_MICROS, "rocksdb.sst.read.micros"},
+    {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"},
+    {BYTES_PER_READ, "rocksdb.bytes.per.read"},
+    {BYTES_PER_WRITE, "rocksdb.bytes.per.write"},
+    {BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"},
+    {BYTES_COMPRESSED, "rocksdb.bytes.compressed"},
+    {BYTES_DECOMPRESSED, "rocksdb.bytes.decompressed"},
+    {COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"},
+    {DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"},
+    {READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"},
+    {BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"},
+    {BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"},
+    {BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"},
+    {BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"},
+    {BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"},
+    {BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"},
+    {BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"},
+    {BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"},
+    {BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"},
+    {BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"},
+    {BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"},
+    {BLOB_DB_GC_MICROS, "rocksdb.blobdb.gc.micros"},
+    {BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"},
+    {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"},
+    {FLUSH_TIME, "rocksdb.db.flush.micros"},
+    {SST_BATCH_SIZE, "rocksdb.sst.batch.size"},
+    {NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+     "rocksdb.num.index.and.filter.blocks.read.per.level"},
+    {NUM_DATA_BLOCKS_READ_PER_LEVEL, "rocksdb.num.data.blocks.read.per.level"},
+    {NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"},
+    {ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+     "rocksdb.error.handler.autoresume.retry.count"},
+    {ASYNC_READ_BYTES, "rocksdb.async.read.bytes"},
+    {POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"},
+    {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
+    {MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"},
+    {NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"},
+    {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"},
+};
+
+std::shared_ptr<Statistics> CreateDBStatistics() {
+  return std::make_shared<StatisticsImpl>(nullptr);
+}
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinStatistics(ObjectLibrary& library,
+                                     const std::string& /*arg*/) {
+  library.AddFactory<Statistics>(
+      StatisticsImpl::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<Statistics>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new StatisticsImpl(nullptr));
+        return guard->get();
+      });
+  return 1;
+}
+#endif  // ROCKSDB_LITE
+
+Status Statistics::CreateFromString(const ConfigOptions& config_options,
+                                    const std::string& id,
+                                    std::shared_ptr<Statistics>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinStatistics(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  Status s;
+  if (id == "" || id == StatisticsImpl::kClassName()) {
+    result->reset(new StatisticsImpl(nullptr));
+  } else if (id == kNullptrString) {
+    result->reset();
+  } else {
+    s = LoadSharedObject<Statistics>(config_options, id, nullptr, result);
+  }
+  return s;
+}
+
+static std::unordered_map<std::string, OptionTypeInfo> stats_type_info = {
+#ifndef ROCKSDB_LITE
+    {"inner", OptionTypeInfo::AsCustomSharedPtr<Statistics>(
+                  0, OptionVerificationType::kByNameAllowFromNull,
+                  OptionTypeFlags::kCompareNever)},
+#endif  // !ROCKSDB_LITE
+};
+
+StatisticsImpl::StatisticsImpl(std::shared_ptr<Statistics> stats)
+    : stats_(std::move(stats)) {
+  RegisterOptions("StatisticsOptions", &stats_, &stats_type_info);
+}
+
+StatisticsImpl::~StatisticsImpl() {}
+
+uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const {
+  MutexLock lock(&aggregate_lock_);
+  return getTickerCountLocked(tickerType);
+}
+
+uint64_t StatisticsImpl::getTickerCountLocked(uint32_t tickerType) const {
+  assert(tickerType < TICKER_ENUM_MAX);
+  uint64_t res = 0;
+  for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
+    res += per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType];
+  }
+  return res;
+}
+
+void StatisticsImpl::histogramData(uint32_t histogramType,
+                                   HistogramData* const data) const {
+  MutexLock lock(&aggregate_lock_);
+  getHistogramImplLocked(histogramType)->Data(data);
+}
+
+std::unique_ptr<HistogramImpl> StatisticsImpl::getHistogramImplLocked(
+    uint32_t histogramType) const {
+  assert(histogramType < HISTOGRAM_ENUM_MAX);
+  std::unique_ptr<HistogramImpl> res_hist(new HistogramImpl());
+  for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
+    res_hist->Merge(
+        per_core_stats_.AccessAtCore(core_idx)->histograms_[histogramType]);
+  }
+  return res_hist;
+}
+
+std::string StatisticsImpl::getHistogramString(uint32_t histogramType) const {
+  MutexLock lock(&aggregate_lock_);
+  return getHistogramImplLocked(histogramType)->ToString();
+}
+
+void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) {
+  {
+    MutexLock lock(&aggregate_lock_);
+    setTickerCountLocked(tickerType, count);
+  }
+  if (stats_ && tickerType < TICKER_ENUM_MAX) {
+    stats_->setTickerCount(tickerType, count);
+  }
+}
+
+void StatisticsImpl::setTickerCountLocked(uint32_t tickerType, uint64_t count) {
+  assert(tickerType < TICKER_ENUM_MAX);
+  for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
+    if (core_idx == 0) {
+      per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = count;
+    } else {
+      per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = 0;
+    }
+  }
+}
+
+uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) {
+  uint64_t sum = 0;
+  {
+    MutexLock lock(&aggregate_lock_);
+    assert(tickerType < TICKER_ENUM_MAX);
+    for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
+      sum +=
+          per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType].exchange(
+              0, std::memory_order_relaxed);
+    }
+  }
+  if (stats_ && tickerType < TICKER_ENUM_MAX) {
+    stats_->setTickerCount(tickerType, 0);
+  }
+  return sum;
+}
+
+void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) {
+  if (get_stats_level() <= StatsLevel::kExceptTickers) {
+    return;
+  }
+  if (tickerType < TICKER_ENUM_MAX) {
+    per_core_stats_.Access()->tickers_[tickerType].fetch_add(
+        count, std::memory_order_relaxed);
+    if (stats_) {
+      stats_->recordTick(tickerType, count);
+    }
+  } else {
+    assert(false);
+  }
+}
+
+void StatisticsImpl::recordInHistogram(uint32_t histogramType, uint64_t value) {
+  assert(histogramType < HISTOGRAM_ENUM_MAX);
+  if (get_stats_level() <= StatsLevel::kExceptHistogramOrTimers) {
+    return;
+  }
+  per_core_stats_.Access()->histograms_[histogramType].Add(value);
+  if (stats_ && histogramType < HISTOGRAM_ENUM_MAX) {
+    stats_->recordInHistogram(histogramType, value);
+  }
+}
+
+Status StatisticsImpl::Reset() {
+  MutexLock lock(&aggregate_lock_);
+  for (uint32_t i = 0; i < TICKER_ENUM_MAX; ++i) {
+    setTickerCountLocked(i, 0);
+  }
+  for (uint32_t i = 0; i < HISTOGRAM_ENUM_MAX; ++i) {
+    for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
+      per_core_stats_.AccessAtCore(core_idx)->histograms_[i].Clear();
+    }
+  }
+  return Status::OK();
+}
+
+namespace {
+
+// a buffer size used for temp string buffers
+const int kTmpStrBufferSize = 200;
+
+}  // namespace
+
+std::string StatisticsImpl::ToString() const {
+  MutexLock lock(&aggregate_lock_);
+  std::string res;
+  res.reserve(20000);
+  for (const auto& t : TickersNameMap) {
+    assert(t.first < TICKER_ENUM_MAX);
+    char buffer[kTmpStrBufferSize];
+    snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 "\n",
+             t.second.c_str(), getTickerCountLocked(t.first));
+    res.append(buffer);
+  }
+  for (const auto& h : HistogramsNameMap) {
+    assert(h.first < HISTOGRAM_ENUM_MAX);
+    char buffer[kTmpStrBufferSize];
+    HistogramData hData;
+    getHistogramImplLocked(h.first)->Data(&hData);
+    // don't handle failures - buffer should always be big enough and arguments
+    // should be provided correctly
+    int ret =
+        snprintf(buffer, kTmpStrBufferSize,
+                 "%s P50 : %f P95 : %f P99 : %f P100 : %f COUNT : %" PRIu64
+                 " SUM : %" PRIu64 "\n",
+                 h.second.c_str(), hData.median, hData.percentile95,
+                 hData.percentile99, hData.max, hData.count, hData.sum);
+    if (ret < 0 || ret >= kTmpStrBufferSize) {
+      assert(false);
+      continue;
+    }
+    res.append(buffer);
+  }
+  res.shrink_to_fit();
+  return res;
+}
+
+bool StatisticsImpl::getTickerMap(
+    std::map<std::string, uint64_t>* stats_map) const {
+  assert(stats_map);
+  if (!stats_map) return false;
+  stats_map->clear();
+  MutexLock lock(&aggregate_lock_);
+  for (const auto& t : TickersNameMap) {
+    assert(t.first < TICKER_ENUM_MAX);
+    (*stats_map)[t.second.c_str()] = getTickerCountLocked(t.first);
+  }
+  return true;
+}
+
+bool StatisticsImpl::HistEnabledForType(uint32_t type) const {
+  return type < HISTOGRAM_ENUM_MAX;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/statistics.h b/src/rocksdb/monitoring/statistics.h
new file mode 100644
index 000000000..e0dc29d28
--- /dev/null
+++ b/src/rocksdb/monitoring/statistics.h
@@ -0,0 +1,144 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <atomic>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "port/likely.h"
+#include "port/port.h"
+#include "rocksdb/statistics.h"
+#include "util/core_local.h"
+#include "util/mutexlock.h"
+
+#ifdef __clang__
+#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__))
+#else
+#define ROCKSDB_FIELD_UNUSED
+#endif  // __clang__
+
+#ifndef STRINGIFY
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+enum TickersInternal : uint32_t {
+  INTERNAL_TICKER_ENUM_START = TICKER_ENUM_MAX,
+  INTERNAL_TICKER_ENUM_MAX
+};
+
+enum HistogramsInternal : uint32_t {
+  INTERNAL_HISTOGRAM_START = HISTOGRAM_ENUM_MAX,
+  INTERNAL_HISTOGRAM_ENUM_MAX
+};
+
+class StatisticsImpl : public Statistics {
+ public:
+  StatisticsImpl(std::shared_ptr<Statistics> stats);
+  virtual ~StatisticsImpl();
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "BasicStatistics"; }
+
+  virtual uint64_t getTickerCount(uint32_t ticker_type) const override;
+  virtual void histogramData(uint32_t histogram_type,
+                             HistogramData* const data) const override;
+  std::string getHistogramString(uint32_t histogram_type) const override;
+
+  virtual void setTickerCount(uint32_t ticker_type, uint64_t count) override;
+  virtual uint64_t getAndResetTickerCount(uint32_t ticker_type) override;
+  virtual void recordTick(uint32_t ticker_type, uint64_t count) override;
+  // The function is implemented for now for backward compatibility reason.
+  // In case a user explictly calls it, for example, they may have a wrapped
+  // Statistics object, passing the call to recordTick() into here, nothing
+  // will break.
+  void measureTime(uint32_t histogramType, uint64_t time) override {
+    recordInHistogram(histogramType, time);
+  }
+  virtual void recordInHistogram(uint32_t histogram_type,
+                                 uint64_t value) override;
+
+  virtual Status Reset() override;
+  virtual std::string ToString() const override;
+  virtual bool getTickerMap(std::map<std::string, uint64_t>*) const override;
+  virtual bool HistEnabledForType(uint32_t type) const override;
+
+  const Customizable* Inner() const override { return stats_.get(); }
+
+ private:
+  // If non-nullptr, forwards updates to the object pointed to by `stats_`.
+  std::shared_ptr<Statistics> stats_;
+  // Synchronizes anything that operates across other cores' local data,
+  // such that operations like Reset() can be performed atomically.
+  mutable port::Mutex aggregate_lock_;
+
+  // The ticker/histogram data are stored in this structure, which we will store
+  // per-core. It is cache-aligned, so tickers/histograms belonging to different
+  // cores can never share the same cache line.
+  //
+  // Alignment attributes expand to nothing depending on the platform
+  struct ALIGN_AS(CACHE_LINE_SIZE) StatisticsData {
+    std::atomic_uint_fast64_t tickers_[INTERNAL_TICKER_ENUM_MAX] = {{0}};
+    HistogramImpl histograms_[INTERNAL_HISTOGRAM_ENUM_MAX];
+#ifndef HAVE_ALIGNED_NEW
+    char
+        padding[(CACHE_LINE_SIZE -
+                 (INTERNAL_TICKER_ENUM_MAX * sizeof(std::atomic_uint_fast64_t) +
+                  INTERNAL_HISTOGRAM_ENUM_MAX * sizeof(HistogramImpl)) %
+                     CACHE_LINE_SIZE)] ROCKSDB_FIELD_UNUSED;
+#endif
+    void* operator new(size_t s) { return port::cacheline_aligned_alloc(s); }
+    void* operator new[](size_t s) { return port::cacheline_aligned_alloc(s); }
+    void operator delete(void* p) { port::cacheline_aligned_free(p); }
+    void operator delete[](void* p) { port::cacheline_aligned_free(p); }
+  };
+
+#ifndef TEST_CACHE_LINE_SIZE
+  static_assert(sizeof(StatisticsData) % CACHE_LINE_SIZE == 0,
+                "Expected " TOSTRING(CACHE_LINE_SIZE) "-byte aligned");
+#endif
+
+  CoreLocalArray<StatisticsData> per_core_stats_;
+
+  uint64_t getTickerCountLocked(uint32_t ticker_type) const;
+  std::unique_ptr<HistogramImpl> getHistogramImplLocked(
+      uint32_t histogram_type) const;
+  void setTickerCountLocked(uint32_t ticker_type, uint64_t count);
+};
+
+// Utility functions
+inline void RecordInHistogram(Statistics* statistics, uint32_t histogram_type,
+                              uint64_t value) {
+  if (statistics) {
+    statistics->recordInHistogram(histogram_type, value);
+  }
+}
+
+inline void RecordTimeToHistogram(Statistics* statistics,
+                                  uint32_t histogram_type, uint64_t value) {
+  if (statistics) {
+    statistics->reportTimeToHistogram(histogram_type, value);
+  }
+}
+
+inline void RecordTick(Statistics* statistics, uint32_t ticker_type,
+                       uint64_t count = 1) {
+  if (statistics) {
+    statistics->recordTick(ticker_type, count);
+  }
+}
+
+inline void SetTickerCount(Statistics* statistics, uint32_t ticker_type,
+                           uint64_t count) {
+  if (statistics) {
+    statistics->setTickerCount(ticker_type, count);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/statistics_test.cc b/src/rocksdb/monitoring/statistics_test.cc
new file mode 100644
index 000000000..cffa5054a
--- /dev/null
+++ b/src/rocksdb/monitoring/statistics_test.cc
@@ -0,0 +1,92 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "rocksdb/statistics.h"
+
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/options_type.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class StatisticsTest : public testing::Test {};
+
+// Sanity check to make sure that contents and order of TickersNameMap
+// match Tickers enum
+TEST_F(StatisticsTest, SanityTickers) {
+  EXPECT_EQ(static_cast<size_t>(Tickers::TICKER_ENUM_MAX),
+            TickersNameMap.size());
+
+  for (uint32_t t = 0; t < Tickers::TICKER_ENUM_MAX; t++) {
+    auto pair = TickersNameMap[static_cast<size_t>(t)];
+    ASSERT_EQ(pair.first, t) << "Miss match at " << pair.second;
+  }
+}
+
+// Sanity check to make sure that contents and order of HistogramsNameMap
+// match Tickers enum
+TEST_F(StatisticsTest, SanityHistograms) {
+  EXPECT_EQ(static_cast<size_t>(Histograms::HISTOGRAM_ENUM_MAX),
+            HistogramsNameMap.size());
+
+  for (uint32_t h = 0; h < Histograms::HISTOGRAM_ENUM_MAX; h++) {
+    auto pair = HistogramsNameMap[static_cast<size_t>(h)];
+    ASSERT_EQ(pair.first, h) << "Miss match at " << pair.second;
+  }
+}
+
+TEST_F(StatisticsTest, NoNameStats) {
+  static std::unordered_map<std::string, OptionTypeInfo> no_name_opt_info = {
+#ifndef ROCKSDB_LITE
+      {"inner",
+       OptionTypeInfo::AsCustomSharedPtr<Statistics>(
+           0, OptionVerificationType::kByName,
+           OptionTypeFlags::kAllowNull | OptionTypeFlags::kCompareNever)},
+#endif  // ROCKSDB_LITE
+  };
+
+  class DefaultNameStatistics : public Statistics {
+   public:
+    DefaultNameStatistics(const std::shared_ptr<Statistics>& stats = nullptr)
+        : inner(stats) {
+      RegisterOptions("", &inner, &no_name_opt_info);
+    }
+
+    uint64_t getTickerCount(uint32_t /*tickerType*/) const override {
+      return 0;
+    }
+    void histogramData(uint32_t /*type*/,
+                       HistogramData* const /*data*/) const override {}
+    void recordTick(uint32_t /*tickerType*/, uint64_t /*count*/) override {}
+    void setTickerCount(uint32_t /*tickerType*/, uint64_t /*count*/) override {}
+    uint64_t getAndResetTickerCount(uint32_t /*tickerType*/) override {
+      return 0;
+    }
+    std::shared_ptr<Statistics> inner;
+  };
+  ConfigOptions options;
+  options.ignore_unsupported_options = false;
+  auto stats = std::make_shared<DefaultNameStatistics>();
+  ASSERT_STREQ(stats->Name(), "");
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("", stats->ToString(
+                    options));  // A stats with no name with have no options...
+  ASSERT_OK(stats->ConfigureFromString(options, "inner="));
+  ASSERT_EQ("", stats->ToString(
+                    options));  // A stats with no name with have no options...
+  ASSERT_NE(stats->inner, nullptr);
+  ASSERT_NE("", stats->inner->ToString(options));  // ... even if it does...
+#endif                                             // ROCKSDB_LITE
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/monitoring/stats_history_test.cc b/src/rocksdb/monitoring/stats_history_test.cc
new file mode 100644
index 000000000..21ac786b4
--- /dev/null
+++ b/src/rocksdb/monitoring/stats_history_test.cc
@@ -0,0 +1,664 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "rocksdb/stats_history.h"
+
+#include <limits>
+#include <string>
+#include <unordered_map>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/periodic_task_scheduler.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/rate_limiter.h"
+#include "test_util/mock_time_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class StatsHistoryTest : public DBTestBase {
+ public:
+  StatsHistoryTest() : DBTestBase("stats_history_test", /*env_do_fsync=*/true) {
+    mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+    mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_));
+  }
+
+ protected:
+  std::shared_ptr<MockSystemClock> mock_clock_;
+  std::unique_ptr<Env> mock_env_;
+
+  void SetUp() override {
+    mock_clock_->InstallTimedWaitFixCallback();
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+          auto periodic_task_scheduler_ptr =
+              reinterpret_cast<PeriodicTaskScheduler*>(arg);
+          periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+        });
+  }
+};
+
+TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) {
+  constexpr int kPeriodSec = 5;
+  Options options;
+  options.create_if_missing = true;
+  options.stats_dump_period_sec = kPeriodSec;
+  options.env = mock_env_.get();
+  int counter = 0;
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:1",
+                                        [&](void* /*arg*/) { counter++; });
+  Reopen(options);
+  ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
+
+  // Wait for the first stats persist to finish, as the initial delay could be
+  // different.
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  ASSERT_GE(counter, 1);
+
+  // Test cancel job through SetOptions
+  ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}}));
+  int old_val = counter;
+  for (int i = 1; i < 20; ++i) {
+    mock_clock_->MockSleepForSeconds(kPeriodSec);
+  }
+  ASSERT_EQ(counter, old_val);
+  Close();
+}
+
+// Test persistent stats background thread scheduling and cancelling
+TEST_F(StatsHistoryTest, StatsPersistScheduling) {
+  constexpr int kPeriodSec = 5;
+  Options options;
+  options.create_if_missing = true;
+  options.stats_persist_period_sec = kPeriodSec;
+  options.env = mock_env_.get();
+  int counter = 0;
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:Entry",
+                                        [&](void* /*arg*/) { counter++; });
+  Reopen(options);
+  ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+  // Wait for the first stats persist to finish, as the initial delay could be
+  // different.
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  ASSERT_GE(counter, 1);
+
+  // Test cancel job through SetOptions
+  ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}}));
+  int old_val = counter;
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec * 2); });
+  ASSERT_EQ(counter, old_val);
+
+  Close();
+}
+
+// Test enabling persistent stats for the first time
+TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) {
+  constexpr unsigned int kPeriodSec = 5;
+  Options options;
+  options.create_if_missing = true;
+  options.stats_persist_period_sec = 0;
+  options.env = mock_env_.get();
+  int counter = 0;
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:Entry",
+                                        [&](void* /*arg*/) { counter++; });
+  Reopen(options);
+  ASSERT_OK(dbfull()->SetDBOptions(
+      {{"stats_persist_period_sec", std::to_string(kPeriodSec)}}));
+  ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  ASSERT_GE(counter, 1);
+  Close();
+}
+
+// TODO(Zhongyi): Move persistent stats related tests to a separate file
+TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) {
+  constexpr int kPeriodSec = 5;
+  Options options;
+  options.create_if_missing = true;
+  options.stats_persist_period_sec = kPeriodSec;
+  options.statistics = CreateDBStatistics();
+  options.env = mock_env_.get();
+  CreateColumnFamilies({"pikachu"}, options);
+  ASSERT_OK(Put("foo", "bar"));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // make sure the first stats persist to finish
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+
+  // Wait for stats persist to finish
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+
+  std::unique_ptr<StatsHistoryIterator> stats_iter;
+  ASSERT_OK(
+      db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  // disabled stats snapshots
+  ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}}));
+  size_t stats_count = 0;
+  for (; stats_iter->Valid(); stats_iter->Next()) {
+    auto stats_map = stats_iter->GetStatsMap();
+    ASSERT_EQ(stats_iter->GetStatsTime(), mock_clock_->NowSeconds());
+    stats_count += stats_map.size();
+  }
+  ASSERT_GT(stats_count, 0);
+  // Wait a bit and verify no more stats are found
+  for (int i = 0; i < 10; ++i) {
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(1); });
+  }
+  ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  size_t stats_count_new = 0;
+  for (; stats_iter->Valid(); stats_iter->Next()) {
+    stats_count_new += stats_iter->GetStatsMap().size();
+  }
+  ASSERT_EQ(stats_count_new, stats_count);
+  Close();
+}
+
+TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) {
+  constexpr int kPeriodSec = 1;
+  Options options;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+  options.stats_persist_period_sec = kPeriodSec;
+  options.env = mock_env_.get();
+
+  CreateColumnFamilies({"pikachu"}, options);
+  ASSERT_OK(Put("foo", "bar"));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  // some random operation to populate statistics
+  ASSERT_OK(Delete("foo"));
+  ASSERT_OK(Put("sol", "sol"));
+  ASSERT_OK(Put("epic", "epic"));
+  ASSERT_OK(Put("ltd", "ltd"));
+  ASSERT_EQ("sol", Get("sol"));
+  ASSERT_EQ("epic", Get("epic"));
+  ASSERT_EQ("ltd", Get("ltd"));
+  Iterator* iterator = db_->NewIterator(ReadOptions());
+  for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+    ASSERT_TRUE(iterator->key() == iterator->value());
+  }
+  delete iterator;
+  ASSERT_OK(Flush());
+  ASSERT_OK(Delete("sol"));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // second round of ops
+  ASSERT_OK(Put("saigon", "saigon"));
+  ASSERT_OK(Put("noodle talk", "noodle talk"));
+  ASSERT_OK(Put("ping bistro", "ping bistro"));
+  iterator = db_->NewIterator(ReadOptions());
+  for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+    ASSERT_TRUE(iterator->key() == iterator->value());
+  }
+  delete iterator;
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  const int kIterations = 10;
+  for (int i = 0; i < kIterations; ++i) {
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  }
+
+  std::unique_ptr<StatsHistoryIterator> stats_iter;
+  ASSERT_OK(
+      db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  size_t stats_count = 0;
+  int slice_count = 0;
+  for (; stats_iter->Valid(); stats_iter->Next()) {
+    slice_count++;
+    auto stats_map = stats_iter->GetStatsMap();
+    stats_count += stats_map.size();
+  }
+  size_t stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize();
+  ASSERT_GE(slice_count, kIterations - 1);
+  ASSERT_GE(stats_history_size, 15000);
+  // capping memory cost at 15000 bytes since one slice is around 10000~15000
+  ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "15000"}}));
+  ASSERT_EQ(15000, dbfull()->GetDBOptions().stats_history_buffer_size);
+
+  // Wait for stats persist to finish
+  for (int i = 0; i < kIterations; ++i) {
+    dbfull()->TEST_WaitForPeridicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  }
+
+  ASSERT_OK(
+      db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  size_t stats_count_reopen = 0;
+  slice_count = 0;
+  for (; stats_iter->Valid(); stats_iter->Next()) {
+    slice_count++;
+    auto stats_map = stats_iter->GetStatsMap();
+    stats_count_reopen += stats_map.size();
+  }
+  size_t stats_history_size_reopen =
+      dbfull()->TEST_EstimateInMemoryStatsHistorySize();
+  // only one slice can fit under the new stats_history_buffer_size
+  ASSERT_LT(slice_count, 2);
+  ASSERT_TRUE(stats_history_size_reopen < 15000 &&
+              stats_history_size_reopen > 0);
+  ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0);
+  Close();
+  // TODO: may also want to verify stats timestamp to make sure we are purging
+  // the correct stats snapshot
+}
+
+int countkeys(Iterator* iter) {
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    count++;
+  }
+  return count;
+}
+
+TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) {
+  constexpr int kPeriodSec = 5;
+  Options options;
+  options.create_if_missing = true;
+  options.stats_persist_period_sec = kPeriodSec;
+  options.statistics = CreateDBStatistics();
+  options.persist_stats_to_disk = true;
+  options.env = mock_env_.get();
+  CreateColumnFamilies({"pikachu"}, options);
+  ASSERT_OK(Put("foo", "bar"));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ(Get("foo"), "bar");
+
+  // Wait for the first stats persist to finish, as the initial delay could be
+  // different.
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+
+  // Wait for stats persist to finish
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+
+  auto iter =
+      db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily());
+  int key_count1 = countkeys(iter);
+  delete iter;
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  iter =
+      db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily());
+  int key_count2 = countkeys(iter);
+  delete iter;
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  iter =
+      db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily());
+  int key_count3 = countkeys(iter);
+  delete iter;
+  ASSERT_GE(key_count2, key_count1);
+  ASSERT_GE(key_count3, key_count2);
+  ASSERT_EQ(key_count3 - key_count2, key_count2 - key_count1);
+  std::unique_ptr<StatsHistoryIterator> stats_iter;
+  ASSERT_OK(
+      db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  size_t stats_count = 0;
+  int slice_count = 0;
+  int non_zero_count = 0;
+  for (int i = 2; stats_iter->Valid(); stats_iter->Next(), i++) {
+    slice_count++;
+    auto stats_map = stats_iter->GetStatsMap();
+    ASSERT_EQ(stats_iter->GetStatsTime(), kPeriodSec * i - 1);
+    for (auto& stat : stats_map) {
+      if (stat.second != 0) {
+        non_zero_count++;
+      }
+    }
+    stats_count += stats_map.size();
+  }
+  ASSERT_EQ(slice_count, 3);
+  // 2 extra keys for format version
+  ASSERT_EQ(stats_count, key_count3 - 2);
+  // verify reopen will not cause data loss
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_OK(
+      db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  size_t stats_count_reopen = 0;
+  int slice_count_reopen = 0;
+  int non_zero_count_recover = 0;
+  for (; stats_iter->Valid(); stats_iter->Next()) {
+    slice_count_reopen++;
+    auto stats_map = stats_iter->GetStatsMap();
+    for (auto& stat : stats_map) {
+      if (stat.second != 0) {
+        non_zero_count_recover++;
+      }
+    }
+    stats_count_reopen += stats_map.size();
+  }
+
+  ASSERT_EQ(non_zero_count, non_zero_count_recover);
+  ASSERT_EQ(slice_count, slice_count_reopen);
+  ASSERT_EQ(stats_count, stats_count_reopen);
+  Close();
+}
+
+// Test persisted stats matches the value found in options.statistics and
+// the stats value retains after DB reopen
+TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) {
+  constexpr int kPeriodSec = 5;
+  Options options;
+  options.create_if_missing = true;
+  options.stats_persist_period_sec = kPeriodSec;
+  options.statistics = CreateDBStatistics();
+  options.persist_stats_to_disk = true;
+  std::map<std::string, uint64_t> stats_map_before;
+  ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_before));
+  options.env = mock_env_.get();
+  CreateColumnFamilies({"pikachu"}, options);
+  ASSERT_OK(Put("foo", "bar"));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ(Get("foo"), "bar");
+
+  // Wait for the first stats persist to finish, as the initial delay could be
+  // different.
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+
+  // Wait for stats persist to finish
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  auto iter =
+      db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily());
+  countkeys(iter);
+  delete iter;
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  iter =
+      db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily());
+  countkeys(iter);
+  delete iter;
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  iter =
+      db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily());
+  countkeys(iter);
+  delete iter;
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+
+  std::map<std::string, uint64_t> stats_map_after;
+  ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_after));
+  std::unique_ptr<StatsHistoryIterator> stats_iter;
+  ASSERT_OK(
+      db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  std::string sample = "rocksdb.num.iterator.deleted";
+  uint64_t recovered_value = 0;
+  for (int i = 2; stats_iter->Valid(); stats_iter->Next(), ++i) {
+    auto stats_map = stats_iter->GetStatsMap();
+    ASSERT_EQ(stats_iter->GetStatsTime(), kPeriodSec * i - 1);
+    for (const auto& stat : stats_map) {
+      if (sample.compare(stat.first) == 0) {
+        recovered_value += stat.second;
+      }
+    }
+  }
+  ASSERT_EQ(recovered_value, stats_map_after[sample]);
+
+  // test stats value retains after recovery
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_OK(
+      db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  uint64_t new_recovered_value = 0;
+  for (int i = 2; stats_iter->Valid(); stats_iter->Next(), i++) {
+    auto stats_map = stats_iter->GetStatsMap();
+    ASSERT_EQ(stats_iter->GetStatsTime(), kPeriodSec * i - 1);
+    for (const auto& stat : stats_map) {
+      if (sample.compare(stat.first) == 0) {
+        new_recovered_value += stat.second;
+      }
+    }
+  }
+  ASSERT_EQ(recovered_value, new_recovered_value);
+
+  // TODO(Zhongyi): also add test to read raw values from disk and verify
+  // correctness
+  Close();
+}
+
+// TODO(Zhongyi): add test for different format versions
+
+TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) {
+  constexpr int kPeriodSec = 5;
+  Options options;
+  options.create_if_missing = true;
+  options.stats_persist_period_sec = kPeriodSec;
+  options.statistics = CreateDBStatistics();
+  options.persist_stats_to_disk = true;
+  options.env = mock_env_.get();
+  ASSERT_OK(TryReopen(options));
+  CreateColumnFamilies({"one", "two", "three"}, options);
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ReopenWithColumnFamilies({"default", "one", "two", "three"}, options);
+  ASSERT_EQ(Get(2, "foo"), "bar");
+  CreateColumnFamilies({"four"}, options);
+  ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options);
+  ASSERT_EQ(Get(2, "foo"), "bar");
+
+  // make sure the first stats persist to finish
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  auto iter =
+      db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily());
+  int key_count = countkeys(iter);
+  delete iter;
+  ASSERT_GE(key_count, 0);
+  uint64_t num_write_wal = 0;
+  std::string sample = "rocksdb.write.wal";
+  std::unique_ptr<StatsHistoryIterator> stats_iter;
+  ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  for (; stats_iter->Valid(); stats_iter->Next()) {
+    auto stats_map = stats_iter->GetStatsMap();
+    for (const auto& stat : stats_map) {
+      if (sample.compare(stat.first) == 0) {
+        num_write_wal += stat.second;
+      }
+    }
+  }
+  stats_iter.reset();
+  ASSERT_EQ(num_write_wal, 1);
+
+  options.persist_stats_to_disk = false;
+  ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options);
+  int cf_count = 0;
+  for (auto cfd : *dbfull()->versions_->GetColumnFamilySet()) {
+    (void)cfd;
+    cf_count++;
+  }
+  // persistent stats cf will be implicitly opened even if
+  // persist_stats_to_disk is false
+  ASSERT_EQ(cf_count, 6);
+  ASSERT_EQ(Get(2, "foo"), "bar");
+
+  // attempt to create column family using same name, should fail
+  ColumnFamilyOptions cf_opts(options);
+  ColumnFamilyHandle* handle;
+  ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName,
+                                     &handle));
+
+  options.persist_stats_to_disk = true;
+  ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options);
+  ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName,
+                                     &handle));
+  // verify stats is not affected by prior failed CF creation
+  ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter));
+  ASSERT_TRUE(stats_iter != nullptr);
+  num_write_wal = 0;
+  for (; stats_iter->Valid(); stats_iter->Next()) {
+    auto stats_map = stats_iter->GetStatsMap();
+    for (const auto& stat : stats_map) {
+      if (sample.compare(stat.first) == 0) {
+        num_write_wal += stat.second;
+      }
+    }
+  }
+  ASSERT_EQ(num_write_wal, 1);
+
+  Close();
+  Destroy(options);
+}
+
+TEST_F(StatsHistoryTest, PersistentStatsReadOnly) {
+  ASSERT_OK(Put("bar", "v2"));
+  Close();
+
+  auto options = CurrentOptions();
+  options.stats_persist_period_sec = 5;
+  options.persist_stats_to_disk = true;
+  assert(options.env == env_);
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v2", Get("bar"));
+  Close();
+
+  // Reopen and flush memtable.
+  ASSERT_OK(TryReopen(options));
+  ASSERT_OK(Flush());
+  Close();
+  // Now check keys in read only mode.
+  ASSERT_OK(ReadOnlyReopen(options));
+}
+
+TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) {
+  constexpr int kPeriodSec = 5;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = 1024 * 1024 * 10;  // 10 Mb
+  options.stats_persist_period_sec = kPeriodSec;
+  options.statistics = CreateDBStatistics();
+  options.persist_stats_to_disk = true;
+  options.env = mock_env_.get();
+  CreateColumnFamilies({"pikachu"}, options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Wait for the first stats persist to finish, as the initial delay could be
+  // different.
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+
+  ColumnFamilyData* cfd_default =
+      static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily())
+          ->cfd();
+  ColumnFamilyData* cfd_stats = static_cast<ColumnFamilyHandleImpl*>(
+                                    dbfull()->PersistentStatsColumnFamily())
+                                    ->cfd();
+  ColumnFamilyData* cfd_test =
+      static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+
+  ASSERT_OK(Put("foo", "v0"));
+  ASSERT_OK(Put("bar", "v0"));
+  ASSERT_EQ("v0", Get("bar"));
+  ASSERT_EQ("v0", Get("foo"));
+  ASSERT_OK(Put(1, "Eevee", "v0"));
+  ASSERT_EQ("v0", Get(1, "Eevee"));
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  // writing to all three cf, flush default cf
+  // LogNumbers: default: 16, stats: 10, pikachu: 5
+  // Since in recovery process, cfd_stats column is created after WAL is
+  // created, synced and MANIFEST is persisted, its log number which depends on
+  // logfile_number_ will be different. Since "pikachu" is never flushed, thus
+  // its log_number should be the smallest of the three.
+  ASSERT_OK(Flush());
+  ASSERT_LT(cfd_test->GetLogNumber(), cfd_stats->GetLogNumber());
+  ASSERT_LT(cfd_test->GetLogNumber(), cfd_default->GetLogNumber());
+
+  ASSERT_OK(Put("foo1", "v1"));
+  ASSERT_OK(Put("bar1", "v1"));
+  ASSERT_EQ("v1", Get("bar1"));
+  ASSERT_EQ("v1", Get("foo1"));
+  ASSERT_OK(Put(1, "Vaporeon", "v1"));
+  ASSERT_EQ("v1", Get(1, "Vaporeon"));
+  // writing to default and test cf, flush test cf
+  // LogNumbers: default: 14, stats: 16, pikachu: 16
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber());
+  ASSERT_GT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber());
+
+  ASSERT_OK(Put("foo2", "v2"));
+  ASSERT_OK(Put("bar2", "v2"));
+  ASSERT_EQ("v2", Get("bar2"));
+  ASSERT_EQ("v2", Get("foo2"));
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  // writing to default and stats cf, flushing default cf
+  // LogNumbers: default: 19, stats: 19, pikachu: 19
+  ASSERT_OK(Flush());
+  ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber());
+  ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber());
+
+  ASSERT_OK(Put("foo3", "v3"));
+  ASSERT_OK(Put("bar3", "v3"));
+  ASSERT_EQ("v3", Get("bar3"));
+  ASSERT_EQ("v3", Get("foo3"));
+  ASSERT_OK(Put(1, "Jolteon", "v3"));
+  ASSERT_EQ("v3", Get(1, "Jolteon"));
+
+  dbfull()->TEST_WaitForPeridicTaskRun(
+      [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+  // writing to all three cf, flushing test cf
+  // LogNumbers: default: 19, stats: 19, pikachu: 22
+  ASSERT_OK(Flush(1));
+  ASSERT_LT(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber());
+  ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber());
+  Close();
+}
+
+#endif  // !ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/monitoring/thread_status_impl.cc b/src/rocksdb/monitoring/thread_status_impl.cc
new file mode 100644
index 000000000..9619dfd81
--- /dev/null
+++ b/src/rocksdb/monitoring/thread_status_impl.cc
@@ -0,0 +1,163 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include <sstream>
+
+#include "rocksdb/env.h"
+#include "rocksdb/thread_status.h"
+#include "util/string_util.h"
+#include "util/thread_operation.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+std::string ThreadStatus::GetThreadTypeName(
+    ThreadStatus::ThreadType thread_type) {
+  switch (thread_type) {
+    case ThreadStatus::ThreadType::HIGH_PRIORITY:
+      return "High Pri";
+    case ThreadStatus::ThreadType::LOW_PRIORITY:
+      return "Low Pri";
+    case ThreadStatus::ThreadType::USER:
+      return "User";
+    case ThreadStatus::ThreadType::BOTTOM_PRIORITY:
+      return "Bottom Pri";
+    case ThreadStatus::ThreadType::NUM_THREAD_TYPES:
+      assert(false);
+  }
+  return "Unknown";
+}
+
+const std::string& ThreadStatus::GetOperationName(
+    ThreadStatus::OperationType op_type) {
+  if (op_type < 0 || op_type >= NUM_OP_TYPES) {
+    return global_operation_table[OP_UNKNOWN].name;
+  }
+  return global_operation_table[op_type].name;
+}
+
+const std::string& ThreadStatus::GetOperationStageName(
+    ThreadStatus::OperationStage stage) {
+  if (stage < 0 || stage >= NUM_OP_STAGES) {
+    return global_op_stage_table[STAGE_UNKNOWN].name;
+  }
+  return global_op_stage_table[stage].name;
+}
+
+const std::string& ThreadStatus::GetStateName(
+    ThreadStatus::StateType state_type) {
+  if (state_type < 0 || state_type >= NUM_STATE_TYPES) {
+    return global_state_table[STATE_UNKNOWN].name;
+  }
+  return global_state_table[state_type].name;
+}
+
+const std::string ThreadStatus::MicrosToString(uint64_t micros) {
+  if (micros == 0) {
+    return "";
+  }
+  const int kBufferLen = 100;
+  char buffer[kBufferLen];
+  AppendHumanMicros(micros, buffer, kBufferLen, false);
+  return std::string(buffer);
+}
+
+const std::string& ThreadStatus::GetOperationPropertyName(
+    ThreadStatus::OperationType op_type, int i) {
+  static const std::string empty_str = "";
+  switch (op_type) {
+    case ThreadStatus::OP_COMPACTION:
+      if (i >= NUM_COMPACTION_PROPERTIES) {
+        return empty_str;
+      }
+      return compaction_operation_properties[i].name;
+    case ThreadStatus::OP_FLUSH:
+      if (i >= NUM_FLUSH_PROPERTIES) {
+        return empty_str;
+      }
+      return flush_operation_properties[i].name;
+    default:
+      return empty_str;
+  }
+}
+
+std::map<std::string, uint64_t> ThreadStatus::InterpretOperationProperties(
+    ThreadStatus::OperationType op_type, const uint64_t* op_properties) {
+  int num_properties;
+  switch (op_type) {
+    case OP_COMPACTION:
+      num_properties = NUM_COMPACTION_PROPERTIES;
+      break;
+    case OP_FLUSH:
+      num_properties = NUM_FLUSH_PROPERTIES;
+      break;
+    default:
+      num_properties = 0;
+  }
+
+  std::map<std::string, uint64_t> property_map;
+  for (int i = 0; i < num_properties; ++i) {
+    if (op_type == OP_COMPACTION && i == COMPACTION_INPUT_OUTPUT_LEVEL) {
+      property_map.insert({"BaseInputLevel", op_properties[i] >> 32});
+      property_map.insert(
+          {"OutputLevel", op_properties[i] % (uint64_t(1) << 32U)});
+    } else if (op_type == OP_COMPACTION && i == COMPACTION_PROP_FLAGS) {
+      property_map.insert({"IsManual", ((op_properties[i] & 2) >> 1)});
+      property_map.insert({"IsDeletion", ((op_properties[i] & 4) >> 2)});
+      property_map.insert({"IsTrivialMove", ((op_properties[i] & 8) >> 3)});
+    } else {
+      property_map.insert(
+          {GetOperationPropertyName(op_type, i), op_properties[i]});
+    }
+  }
+  return property_map;
+}
+
+#else
+
+std::string ThreadStatus::GetThreadTypeName(
+    ThreadStatus::ThreadType /*thread_type*/) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string& ThreadStatus::GetOperationName(
+    ThreadStatus::OperationType /*op_type*/) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string& ThreadStatus::GetOperationStageName(
+    ThreadStatus::OperationStage /*stage*/) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string& ThreadStatus::GetStateName(
+    ThreadStatus::StateType /*state_type*/) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string ThreadStatus::MicrosToString(uint64_t /*op_elapsed_time*/) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+const std::string& ThreadStatus::GetOperationPropertyName(
+    ThreadStatus::OperationType /*op_type*/, int /*i*/) {
+  static std::string dummy_str = "";
+  return dummy_str;
+}
+
+std::map<std::string, uint64_t> ThreadStatus::InterpretOperationProperties(
+    ThreadStatus::OperationType /*op_type*/,
+    const uint64_t* /*op_properties*/) {
+  return std::map<std::string, uint64_t>();
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/thread_status_updater.cc b/src/rocksdb/monitoring/thread_status_updater.cc
new file mode 100644
index 000000000..9707d2265
--- /dev/null
+++ b/src/rocksdb/monitoring/thread_status_updater.cc
@@ -0,0 +1,318 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "monitoring/thread_status_updater.h"
+
+#include <memory>
+
+#include "port/likely.h"
+#include "rocksdb/env.h"
+#include "rocksdb/system_clock.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+thread_local ThreadStatusData* ThreadStatusUpdater::thread_status_data_ =
+    nullptr;
+
+void ThreadStatusUpdater::RegisterThread(ThreadStatus::ThreadType ttype,
+                                         uint64_t thread_id) {
+  if (UNLIKELY(thread_status_data_ == nullptr)) {
+    thread_status_data_ = new ThreadStatusData();
+    thread_status_data_->thread_type = ttype;
+    thread_status_data_->thread_id = thread_id;
+    std::lock_guard<std::mutex> lck(thread_list_mutex_);
+    thread_data_set_.insert(thread_status_data_);
+  }
+
+  ClearThreadOperationProperties();
+}
+
+void ThreadStatusUpdater::UnregisterThread() {
+  if (thread_status_data_ != nullptr) {
+    std::lock_guard<std::mutex> lck(thread_list_mutex_);
+    thread_data_set_.erase(thread_status_data_);
+    delete thread_status_data_;
+    thread_status_data_ = nullptr;
+  }
+}
+
+void ThreadStatusUpdater::ResetThreadStatus() {
+  ClearThreadState();
+  ClearThreadOperation();
+  SetColumnFamilyInfoKey(nullptr);
+}
+
+void ThreadStatusUpdater::SetColumnFamilyInfoKey(const void* cf_key) {
+  auto* data = Get();
+  if (data == nullptr) {
+    return;
+  }
+  // set the tracking flag based on whether cf_key is non-null or not.
+  // If enable_thread_tracking is set to false, the input cf_key
+  // would be nullptr.
+  data->enable_tracking = (cf_key != nullptr);
+  data->cf_key.store(const_cast<void*>(cf_key), std::memory_order_relaxed);
+}
+
+const void* ThreadStatusUpdater::GetColumnFamilyInfoKey() {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return nullptr;
+  }
+  return data->cf_key.load(std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::SetThreadOperation(
+    const ThreadStatus::OperationType type) {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return;
+  }
+  // NOTE: Our practice here is to set all the thread operation properties
+  //       and stage before we set thread operation, and thread operation
+  //       will be set in std::memory_order_release.  This is to ensure
+  //       whenever a thread operation is not OP_UNKNOWN, we will always
+  //       have a consistent information on its properties.
+  data->operation_type.store(type, std::memory_order_release);
+  if (type == ThreadStatus::OP_UNKNOWN) {
+    data->operation_stage.store(ThreadStatus::STAGE_UNKNOWN,
+                                std::memory_order_relaxed);
+    ClearThreadOperationProperties();
+  }
+}
+
+void ThreadStatusUpdater::SetThreadOperationProperty(int i, uint64_t value) {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return;
+  }
+  data->op_properties[i].store(value, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::IncreaseThreadOperationProperty(int i,
+                                                          uint64_t delta) {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return;
+  }
+  data->op_properties[i].fetch_add(delta, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::SetOperationStartTime(const uint64_t start_time) {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return;
+  }
+  data->op_start_time.store(start_time, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::ClearThreadOperation() {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return;
+  }
+  data->operation_stage.store(ThreadStatus::STAGE_UNKNOWN,
+                              std::memory_order_relaxed);
+  data->operation_type.store(ThreadStatus::OP_UNKNOWN,
+                             std::memory_order_relaxed);
+  ClearThreadOperationProperties();
+}
+
+void ThreadStatusUpdater::ClearThreadOperationProperties() {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return;
+  }
+  for (int i = 0; i < ThreadStatus::kNumOperationProperties; ++i) {
+    data->op_properties[i].store(0, std::memory_order_relaxed);
+  }
+}
+
+ThreadStatus::OperationStage ThreadStatusUpdater::SetThreadOperationStage(
+    ThreadStatus::OperationStage stage) {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return ThreadStatus::STAGE_UNKNOWN;
+  }
+  return data->operation_stage.exchange(stage, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::SetThreadState(const ThreadStatus::StateType type) {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return;
+  }
+  data->state_type.store(type, std::memory_order_relaxed);
+}
+
+void ThreadStatusUpdater::ClearThreadState() {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
+    return;
+  }
+  data->state_type.store(ThreadStatus::STATE_UNKNOWN,
+                         std::memory_order_relaxed);
+}
+
+Status ThreadStatusUpdater::GetThreadList(
+    std::vector<ThreadStatus>* thread_list) {
+  thread_list->clear();
+  std::vector<std::shared_ptr<ThreadStatusData>> valid_list;
+  uint64_t now_micros = SystemClock::Default()->NowMicros();
+
+  std::lock_guard<std::mutex> lck(thread_list_mutex_);
+  for (auto* thread_data : thread_data_set_) {
+    assert(thread_data);
+    auto thread_id = thread_data->thread_id.load(std::memory_order_relaxed);
+    auto thread_type = thread_data->thread_type.load(std::memory_order_relaxed);
+    // Since any change to cf_info_map requires thread_list_mutex,
+    // which is currently held by GetThreadList(), here we can safely
+    // use "memory_order_relaxed" to load the cf_key.
+    auto cf_key = thread_data->cf_key.load(std::memory_order_relaxed);
+
+    ThreadStatus::OperationType op_type = ThreadStatus::OP_UNKNOWN;
+    ThreadStatus::OperationStage op_stage = ThreadStatus::STAGE_UNKNOWN;
+    ThreadStatus::StateType state_type = ThreadStatus::STATE_UNKNOWN;
+    uint64_t op_elapsed_micros = 0;
+    uint64_t op_props[ThreadStatus::kNumOperationProperties] = {0};
+
+    auto iter = cf_info_map_.find(cf_key);
+    if (iter != cf_info_map_.end()) {
+      op_type = thread_data->operation_type.load(std::memory_order_acquire);
+      // display lower-level info only when higher-level info is available.
+      if (op_type != ThreadStatus::OP_UNKNOWN) {
+        op_elapsed_micros = now_micros - thread_data->op_start_time.load(
+                                             std::memory_order_relaxed);
+        op_stage = thread_data->operation_stage.load(std::memory_order_relaxed);
+        state_type = thread_data->state_type.load(std::memory_order_relaxed);
+        for (int i = 0; i < ThreadStatus::kNumOperationProperties; ++i) {
+          op_props[i] =
+              thread_data->op_properties[i].load(std::memory_order_relaxed);
+        }
+      }
+    }
+
+    thread_list->emplace_back(
+        thread_id, thread_type,
+        iter != cf_info_map_.end() ? iter->second.db_name : "",
+        iter != cf_info_map_.end() ? iter->second.cf_name : "", op_type,
+        op_elapsed_micros, op_stage, op_props, state_type);
+  }
+
+  return Status::OK();
+}
+
+ThreadStatusData* ThreadStatusUpdater::GetLocalThreadStatus() {
+  if (thread_status_data_ == nullptr) {
+    return nullptr;
+  }
+  if (!thread_status_data_->enable_tracking) {
+    assert(thread_status_data_->cf_key.load(std::memory_order_relaxed) ==
+           nullptr);
+    return nullptr;
+  }
+  return thread_status_data_;
+}
+
+void ThreadStatusUpdater::NewColumnFamilyInfo(const void* db_key,
+                                              const std::string& db_name,
+                                              const void* cf_key,
+                                              const std::string& cf_name) {
+  // Acquiring same lock as GetThreadList() to guarantee
+  // a consistent view of global column family table (cf_info_map).
+  std::lock_guard<std::mutex> lck(thread_list_mutex_);
+
+  cf_info_map_.emplace(std::piecewise_construct, std::make_tuple(cf_key),
+                       std::make_tuple(db_key, db_name, cf_name));
+  db_key_map_[db_key].insert(cf_key);
+}
+
+void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) {
+  // Acquiring same lock as GetThreadList() to guarantee
+  // a consistent view of global column family table (cf_info_map).
+  std::lock_guard<std::mutex> lck(thread_list_mutex_);
+
+  auto cf_pair = cf_info_map_.find(cf_key);
+  if (cf_pair != cf_info_map_.end()) {
+    // Remove its entry from db_key_map_ by the following steps:
+    // 1. Obtain the entry in db_key_map_ whose set contains cf_key
+    // 2. Remove it from the set.
+    ConstantColumnFamilyInfo& cf_info = cf_pair->second;
+    auto db_pair = db_key_map_.find(cf_info.db_key);
+    assert(db_pair != db_key_map_.end());
+    size_t result __attribute__((__unused__));
+    result = db_pair->second.erase(cf_key);
+    assert(result);
+    cf_info_map_.erase(cf_pair);
+  }
+}
+
+void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) {
+  // Acquiring same lock as GetThreadList() to guarantee
+  // a consistent view of global column family table (cf_info_map).
+  std::lock_guard<std::mutex> lck(thread_list_mutex_);
+  auto db_pair = db_key_map_.find(db_key);
+  if (UNLIKELY(db_pair == db_key_map_.end())) {
+    // In some occasional cases such as DB::Open fails, we won't
+    // register ColumnFamilyInfo for a db.
+    return;
+  }
+
+  for (auto cf_key : db_pair->second) {
+    auto cf_pair = cf_info_map_.find(cf_key);
+    if (cf_pair != cf_info_map_.end()) {
+      cf_info_map_.erase(cf_pair);
+    }
+  }
+  db_key_map_.erase(db_key);
+}
+
+#else
+
+void ThreadStatusUpdater::RegisterThread(ThreadStatus::ThreadType /*ttype*/,
+                                         uint64_t /*thread_id*/) {}
+
+void ThreadStatusUpdater::UnregisterThread() {}
+
+void ThreadStatusUpdater::ResetThreadStatus() {}
+
+void ThreadStatusUpdater::SetColumnFamilyInfoKey(const void* /*cf_key*/) {}
+
+void ThreadStatusUpdater::SetThreadOperation(
+    const ThreadStatus::OperationType /*type*/) {}
+
+void ThreadStatusUpdater::ClearThreadOperation() {}
+
+void ThreadStatusUpdater::SetThreadState(
+    const ThreadStatus::StateType /*type*/) {}
+
+void ThreadStatusUpdater::ClearThreadState() {}
+
+Status ThreadStatusUpdater::GetThreadList(
+    std::vector<ThreadStatus>* /*thread_list*/) {
+  return Status::NotSupported(
+      "GetThreadList is not supported in the current running environment.");
+}
+
+void ThreadStatusUpdater::NewColumnFamilyInfo(const void* /*db_key*/,
+                                              const std::string& /*db_name*/,
+                                              const void* /*cf_key*/,
+                                              const std::string& /*cf_name*/) {}
+
+void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* /*cf_key*/) {}
+
+void ThreadStatusUpdater::EraseDatabaseInfo(const void* /*db_key*/) {}
+
+void ThreadStatusUpdater::SetThreadOperationProperty(int /*i*/,
+                                                     uint64_t /*value*/) {}
+
+void ThreadStatusUpdater::IncreaseThreadOperationProperty(int /*i*/,
+                                                          uint64_t /*delta*/) {}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/thread_status_updater.h b/src/rocksdb/monitoring/thread_status_updater.h
new file mode 100644
index 000000000..762c73ae2
--- /dev/null
+++ b/src/rocksdb/monitoring/thread_status_updater.h
@@ -0,0 +1,223 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// The implementation of ThreadStatus.
+//
+// Note that we make get and set access to ThreadStatusData lockless.
+// As a result, ThreadStatusData as a whole is not atomic.  However,
+// we guarantee consistent ThreadStatusData all the time whenever
+// user call GetThreadList().  This consistency guarantee is done
+// by having the following constraint in the internal implementation
+// of set and get order:
+//
+// 1. When reset any information in ThreadStatusData, always start from
+//    clearing up the lower-level information first.
+// 2. When setting any information in ThreadStatusData, always start from
+//    setting the higher-level information.
+// 3. When returning ThreadStatusData to the user, fields are fetched from
+//    higher-level to lower-level.  In addition, where there's a nullptr
+//    in one field, then all fields that has lower-level than that field
+//    should be ignored.
+//
+// The high to low level information would be:
+// thread_id > thread_type > db > cf > operation > state
+//
+// This means user might not always get full information, but whenever
+// returned by the GetThreadList() is guaranteed to be consistent.
+#pragma once
+#include <atomic>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/status.h"
+#include "rocksdb/thread_status.h"
+#include "util/thread_operation.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+
+// The structure that keeps constant information about a column family.
+struct ConstantColumnFamilyInfo {
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ public:
+  ConstantColumnFamilyInfo(const void* _db_key, const std::string& _db_name,
+                           const std::string& _cf_name)
+      : db_key(_db_key), db_name(_db_name), cf_name(_cf_name) {}
+  const void* db_key;
+  const std::string db_name;
+  const std::string cf_name;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+};
+
+// the internal data-structure that is used to reflect the current
+// status of a thread using a set of atomic pointers.
+struct ThreadStatusData {
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  explicit ThreadStatusData() : enable_tracking(false) {
+    thread_id.store(0);
+    thread_type.store(ThreadStatus::USER);
+    cf_key.store(nullptr);
+    operation_type.store(ThreadStatus::OP_UNKNOWN);
+    op_start_time.store(0);
+    state_type.store(ThreadStatus::STATE_UNKNOWN);
+  }
+
+  // A flag to indicate whether the thread tracking is enabled
+  // in the current thread.  This value will be updated based on whether
+  // the associated Options::enable_thread_tracking is set to true
+  // in ThreadStatusUtil::SetColumnFamily().
+  //
+  // If set to false, then SetThreadOperation and SetThreadState
+  // will be no-op.
+  bool enable_tracking;
+
+  std::atomic<uint64_t> thread_id;
+  std::atomic<ThreadStatus::ThreadType> thread_type;
+  std::atomic<void*> cf_key;
+  std::atomic<ThreadStatus::OperationType> operation_type;
+  std::atomic<uint64_t> op_start_time;
+  std::atomic<ThreadStatus::OperationStage> operation_stage;
+  std::atomic<uint64_t> op_properties[ThreadStatus::kNumOperationProperties];
+  std::atomic<ThreadStatus::StateType> state_type;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+};
+
+// The class that stores and updates the status of the current thread
+// using a thread-local ThreadStatusData.
+//
+// In most of the case, you should use ThreadStatusUtil to update
+// the status of the current thread instead of using ThreadSatusUpdater
+// directly.
+//
+// @see ThreadStatusUtil
+class ThreadStatusUpdater {
+ public:
+  ThreadStatusUpdater() {}
+
+  // Releases all ThreadStatusData of all active threads.
+  virtual ~ThreadStatusUpdater() {}
+
+  // Unregister the current thread.
+  void UnregisterThread();
+
+  // Reset the status of the current thread.  This includes resetting
+  // ColumnFamilyInfoKey, ThreadOperation, and ThreadState.
+  void ResetThreadStatus();
+
+  // Set the id of the current thread.
+  void SetThreadID(uint64_t thread_id);
+
+  // Register the current thread for tracking.
+  void RegisterThread(ThreadStatus::ThreadType ttype, uint64_t thread_id);
+
+  // Update the column-family info of the current thread by setting
+  // its thread-local pointer of ThreadStateInfo to the correct entry.
+  void SetColumnFamilyInfoKey(const void* cf_key);
+
+  // returns the column family info key.
+  const void* GetColumnFamilyInfoKey();
+
+  // Update the thread operation of the current thread.
+  void SetThreadOperation(const ThreadStatus::OperationType type);
+
+  // The start time of the current thread operation.  It is in the format
+  // of micro-seconds since some fixed point in time.
+  void SetOperationStartTime(const uint64_t start_time);
+
+  // Set the "i"th property of the current operation.
+  //
+  // NOTE: Our practice here is to set all the thread operation properties
+  //       and stage before we set thread operation, and thread operation
+  //       will be set in std::memory_order_release.  This is to ensure
+  //       whenever a thread operation is not OP_UNKNOWN, we will always
+  //       have a consistent information on its properties.
+  void SetThreadOperationProperty(int i, uint64_t value);
+
+  // Increase the "i"th property of the current operation with
+  // the specified delta.
+  void IncreaseThreadOperationProperty(int i, uint64_t delta);
+
+  // Update the thread operation stage of the current thread.
+  ThreadStatus::OperationStage SetThreadOperationStage(
+      const ThreadStatus::OperationStage stage);
+
+  // Clear thread operation of the current thread.
+  void ClearThreadOperation();
+
+  // Reset all thread-operation-properties to 0.
+  void ClearThreadOperationProperties();
+
+  // Update the thread state of the current thread.
+  void SetThreadState(const ThreadStatus::StateType type);
+
+  // Clear the thread state of the current thread.
+  void ClearThreadState();
+
+  // Obtain the status of all active registered threads.
+  Status GetThreadList(std::vector<ThreadStatus>* thread_list);
+
+  // Create an entry in the global ColumnFamilyInfo table for the
+  // specified column family.  This function should be called only
+  // when the current thread does not hold db_mutex.
+  void NewColumnFamilyInfo(const void* db_key, const std::string& db_name,
+                           const void* cf_key, const std::string& cf_name);
+
+  // Erase all ConstantColumnFamilyInfo that is associated with the
+  // specified db instance.  This function should be called only when
+  // the current thread does not hold db_mutex.
+  void EraseDatabaseInfo(const void* db_key);
+
+  // Erase the ConstantColumnFamilyInfo that is associated with the
+  // specified ColumnFamilyData.  This function should be called only
+  // when the current thread does not hold db_mutex.
+  void EraseColumnFamilyInfo(const void* cf_key);
+
+  // Verifies whether the input ColumnFamilyHandles matches
+  // the information stored in the current cf_info_map.
+  void TEST_VerifyColumnFamilyInfoMap(
+      const std::vector<ColumnFamilyHandle*>& handles, bool check_exist);
+
+ protected:
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  // The thread-local variable for storing thread status.
+  static thread_local ThreadStatusData* thread_status_data_;
+
+  // Returns the pointer to the thread status data only when the
+  // thread status data is non-null and has enable_tracking == true.
+  ThreadStatusData* GetLocalThreadStatus();
+
+  // Directly returns the pointer to thread_status_data_ without
+  // checking whether enabling_tracking is true of not.
+  ThreadStatusData* Get() { return thread_status_data_; }
+
+  // The mutex that protects cf_info_map and db_key_map.
+  std::mutex thread_list_mutex_;
+
+  // The current status data of all active threads.
+  std::unordered_set<ThreadStatusData*> thread_data_set_;
+
+  // A global map that keeps the column family information.  It is stored
+  // globally instead of inside DB is to avoid the situation where DB is
+  // closing while GetThreadList function already get the pointer to its
+  // CopnstantColumnFamilyInfo.
+  std::unordered_map<const void*, ConstantColumnFamilyInfo> cf_info_map_;
+
+  // A db_key to cf_key map that allows erasing elements in cf_info_map
+  // associated to the same db_key faster.
+  std::unordered_map<const void*, std::unordered_set<const void*>> db_key_map_;
+
+#else
+  static ThreadStatusData* thread_status_data_;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/thread_status_updater_debug.cc b/src/rocksdb/monitoring/thread_status_updater_debug.cc
new file mode 100644
index 000000000..464c23bba
--- /dev/null
+++ b/src/rocksdb/monitoring/thread_status_updater_debug.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <mutex>
+
+#include "db/column_family.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef NDEBUG
+#ifdef ROCKSDB_USING_THREAD_STATUS
+void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap(
+    const std::vector<ColumnFamilyHandle*>& handles, bool check_exist) {
+  std::unique_lock<std::mutex> lock(thread_list_mutex_);
+  if (check_exist) {
+    assert(cf_info_map_.size() == handles.size());
+  }
+  for (auto* handle : handles) {
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(handle)->cfd();
+    auto iter __attribute__((__unused__)) = cf_info_map_.find(cfd);
+    if (check_exist) {
+      assert(iter != cf_info_map_.end());
+      assert(iter->second.cf_name == cfd->GetName());
+    } else {
+      assert(iter == cf_info_map_.end());
+    }
+  }
+}
+
+#else
+
+void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap(
+    const std::vector<ColumnFamilyHandle*>& /*handles*/, bool /*check_exist*/) {
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NDEBUG
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/thread_status_util.cc b/src/rocksdb/monitoring/thread_status_util.cc
new file mode 100644
index 000000000..c07b85fa8
--- /dev/null
+++ b/src/rocksdb/monitoring/thread_status_util.cc
@@ -0,0 +1,207 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "monitoring/thread_status_util.h"
+
+#include "monitoring/thread_status_updater.h"
+#include "rocksdb/env.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+thread_local ThreadStatusUpdater*
+    ThreadStatusUtil::thread_updater_local_cache_ = nullptr;
+thread_local bool ThreadStatusUtil::thread_updater_initialized_ = false;
+
+void ThreadStatusUtil::RegisterThread(const Env* env,
+                                      ThreadStatus::ThreadType thread_type) {
+  if (!MaybeInitThreadLocalUpdater(env)) {
+    return;
+  }
+  assert(thread_updater_local_cache_);
+  thread_updater_local_cache_->RegisterThread(thread_type, env->GetThreadID());
+}
+
+void ThreadStatusUtil::UnregisterThread() {
+  thread_updater_initialized_ = false;
+  if (thread_updater_local_cache_ != nullptr) {
+    thread_updater_local_cache_->UnregisterThread();
+    thread_updater_local_cache_ = nullptr;
+  }
+}
+
+void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd,
+                                       const Env* env,
+                                       bool enable_thread_tracking) {
+  if (!MaybeInitThreadLocalUpdater(env)) {
+    return;
+  }
+  assert(thread_updater_local_cache_);
+  if (cfd != nullptr && enable_thread_tracking) {
+    thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd);
+  } else {
+    // When cfd == nullptr or enable_thread_tracking == false, we set
+    // ColumnFamilyInfoKey to nullptr, which makes SetThreadOperation
+    // and SetThreadState become no-op.
+    thread_updater_local_cache_->SetColumnFamilyInfoKey(nullptr);
+  }
+}
+
+void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return;
+  }
+
+  if (op != ThreadStatus::OP_UNKNOWN) {
+    uint64_t current_time = SystemClock::Default()->NowMicros();
+    thread_updater_local_cache_->SetOperationStartTime(current_time);
+  } else {
+    // TDOO(yhchiang): we could report the time when we set operation to
+    // OP_UNKNOWN once the whole instrumentation has been done.
+    thread_updater_local_cache_->SetOperationStartTime(0);
+  }
+  thread_updater_local_cache_->SetThreadOperation(op);
+}
+
+ThreadStatus::OperationStage ThreadStatusUtil::SetThreadOperationStage(
+    ThreadStatus::OperationStage stage) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return ThreadStatus::STAGE_UNKNOWN;
+  }
+
+  return thread_updater_local_cache_->SetThreadOperationStage(stage);
+}
+
+void ThreadStatusUtil::SetThreadOperationProperty(int code, uint64_t value) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return;
+  }
+
+  thread_updater_local_cache_->SetThreadOperationProperty(code, value);
+}
+
+void ThreadStatusUtil::IncreaseThreadOperationProperty(int code,
+                                                       uint64_t delta) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return;
+  }
+
+  thread_updater_local_cache_->IncreaseThreadOperationProperty(code, delta);
+}
+
+void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) {
+  if (thread_updater_local_cache_ == nullptr) {
+    // thread_updater_local_cache_ must be set in SetColumnFamily
+    // or other ThreadStatusUtil functions.
+    return;
+  }
+
+  thread_updater_local_cache_->SetThreadState(state);
+}
+
+void ThreadStatusUtil::ResetThreadStatus() {
+  if (thread_updater_local_cache_ == nullptr) {
+    return;
+  }
+  thread_updater_local_cache_->ResetThreadStatus();
+}
+
+void ThreadStatusUtil::NewColumnFamilyInfo(const DB* db,
+                                           const ColumnFamilyData* cfd,
+                                           const std::string& cf_name,
+                                           const Env* env) {
+  if (!MaybeInitThreadLocalUpdater(env)) {
+    return;
+  }
+  assert(thread_updater_local_cache_);
+  if (thread_updater_local_cache_) {
+    thread_updater_local_cache_->NewColumnFamilyInfo(db, db->GetName(), cfd,
+                                                     cf_name);
+  }
+}
+
+void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* cfd) {
+  if (thread_updater_local_cache_ == nullptr) {
+    return;
+  }
+  thread_updater_local_cache_->EraseColumnFamilyInfo(cfd);
+}
+
+void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) {
+  ThreadStatusUpdater* thread_updater = db->GetEnv()->GetThreadStatusUpdater();
+  if (thread_updater == nullptr) {
+    return;
+  }
+  thread_updater->EraseDatabaseInfo(db);
+}
+
+bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) {
+  if (!thread_updater_initialized_ && env != nullptr) {
+    thread_updater_initialized_ = true;
+    thread_updater_local_cache_ = env->GetThreadStatusUpdater();
+  }
+  return (thread_updater_local_cache_ != nullptr);
+}
+
+AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater(
+    ThreadStatus::OperationStage stage) {
+  prev_stage_ = ThreadStatusUtil::SetThreadOperationStage(stage);
+}
+
+AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {
+  ThreadStatusUtil::SetThreadOperationStage(prev_stage_);
+}
+
+#else
+
+ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr;
+bool ThreadStatusUtil::thread_updater_initialized_ = false;
+
+bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) {
+  return false;
+}
+
+void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/,
+                                       const Env* /*env*/,
+                                       bool /*enable_thread_tracking*/) {}
+
+void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType /*op*/) {}
+
+void ThreadStatusUtil::SetThreadOperationProperty(int /*code*/,
+                                                  uint64_t /*value*/) {}
+
+void ThreadStatusUtil::IncreaseThreadOperationProperty(int /*code*/,
+                                                       uint64_t /*delta*/) {}
+
+void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType /*state*/) {}
+
+void ThreadStatusUtil::NewColumnFamilyInfo(const DB* /*db*/,
+                                           const ColumnFamilyData* /*cfd*/,
+                                           const std::string& /*cf_name*/,
+                                           const Env* /*env*/) {}
+
+void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* /*cfd*/) {}
+
+void ThreadStatusUtil::EraseDatabaseInfo(const DB* /*db*/) {}
+
+void ThreadStatusUtil::ResetThreadStatus() {}
+
+AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater(
+    ThreadStatus::OperationStage /*stage*/) {}
+
+AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/thread_status_util.h b/src/rocksdb/monitoring/thread_status_util.h
new file mode 100644
index 000000000..0137d2682
--- /dev/null
+++ b/src/rocksdb/monitoring/thread_status_util.h
@@ -0,0 +1,131 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+
+#include "monitoring/thread_status_updater.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/thread_status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// The static utility class for updating thread-local status.
+//
+// The thread-local status is updated via the thread-local cached
+// pointer thread_updater_local_cache_.  During each function call,
+// when ThreadStatusUtil finds thread_updater_local_cache_ is
+// left uninitialized (determined by thread_updater_initialized_),
+// it will tries to initialize it using the return value of
+// Env::GetThreadStatusUpdater().  When thread_updater_local_cache_
+// is initialized by a non-null pointer, each function call will
+// then update the status of the current thread.  Otherwise,
+// all function calls to ThreadStatusUtil will be no-op.
+class ThreadStatusUtil {
+ public:
+  // Register the current thread for tracking.
+  static void RegisterThread(const Env* env,
+                             ThreadStatus::ThreadType thread_type);
+
+  // Unregister the current thread.
+  static void UnregisterThread();
+
+  // Create an entry in the global ColumnFamilyInfo table for the
+  // specified column family.  This function should be called only
+  // when the current thread does not hold db_mutex.
+  static void NewColumnFamilyInfo(const DB* db, const ColumnFamilyData* cfd,
+                                  const std::string& cf_name, const Env* env);
+
+  // Erase the ConstantColumnFamilyInfo that is associated with the
+  // specified ColumnFamilyData.  This function should be called only
+  // when the current thread does not hold db_mutex.
+  static void EraseColumnFamilyInfo(const ColumnFamilyData* cfd);
+
+  // Erase all ConstantColumnFamilyInfo that is associated with the
+  // specified db instance.  This function should be called only when
+  // the current thread does not hold db_mutex.
+  static void EraseDatabaseInfo(const DB* db);
+
+  // Update the thread status to indicate the current thread is doing
+  // something related to the specified column family.
+  static void SetColumnFamily(const ColumnFamilyData* cfd, const Env* env,
+                              bool enable_thread_tracking);
+
+  static void SetThreadOperation(ThreadStatus::OperationType type);
+
+  static ThreadStatus::OperationStage SetThreadOperationStage(
+      ThreadStatus::OperationStage stage);
+
+  static void SetThreadOperationProperty(int code, uint64_t value);
+
+  static void IncreaseThreadOperationProperty(int code, uint64_t delta);
+
+  static void SetThreadState(ThreadStatus::StateType type);
+
+  static void ResetThreadStatus();
+
+#ifndef NDEBUG
+  static void TEST_SetStateDelay(const ThreadStatus::StateType state,
+                                 int micro);
+  static void TEST_StateDelay(const ThreadStatus::StateType state);
+#endif
+
+ protected:
+  // Initialize the thread-local ThreadStatusUpdater when it finds
+  // the cached value is nullptr.  Returns true if it has cached
+  // a non-null pointer.
+  static bool MaybeInitThreadLocalUpdater(const Env* env);
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  // A boolean flag indicating whether thread_updater_local_cache_
+  // is initialized.  It is set to true when an Env uses any
+  // ThreadStatusUtil functions using the current thread other
+  // than UnregisterThread().  It will be set to false when
+  // UnregisterThread() is called.
+  //
+  // When this variable is set to true, thread_updater_local_cache_
+  // will not be updated until this variable is again set to false
+  // in UnregisterThread().
+  static thread_local bool thread_updater_initialized_;
+
+  // The thread-local cached ThreadStatusUpdater that caches the
+  // thread_status_updater_ of the first Env that uses any ThreadStatusUtil
+  // function other than UnregisterThread().  This variable will
+  // be cleared when UnregisterThread() is called.
+  //
+  // When this variable is set to a non-null pointer, then the status
+  // of the current thread will be updated when a function of
+  // ThreadStatusUtil is called.  Otherwise, all functions of
+  // ThreadStatusUtil will be no-op.
+  //
+  // When thread_updater_initialized_ is set to true, this variable
+  // will not be updated until this thread_updater_initialized_ is
+  // again set to false in UnregisterThread().
+  static thread_local ThreadStatusUpdater* thread_updater_local_cache_;
+#else
+  static bool thread_updater_initialized_;
+  static ThreadStatusUpdater* thread_updater_local_cache_;
+#endif
+};
+
+// A helper class for updating thread state.  It will set the
+// thread state according to the input parameter in its constructor
+// and set the thread state to the previous state in its destructor.
+class AutoThreadOperationStageUpdater {
+ public:
+  explicit AutoThreadOperationStageUpdater(ThreadStatus::OperationStage stage);
+  ~AutoThreadOperationStageUpdater();
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ private:
+  ThreadStatus::OperationStage prev_stage_;
+#endif
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/monitoring/thread_status_util_debug.cc b/src/rocksdb/monitoring/thread_status_util_debug.cc
new file mode 100644
index 000000000..f7a94355d
--- /dev/null
+++ b/src/rocksdb/monitoring/thread_status_util_debug.cc
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef NDEBUG
+// the delay for debugging purpose.
+static std::atomic<int> states_delay[ThreadStatus::NUM_STATE_TYPES];
+
+void ThreadStatusUtil::TEST_SetStateDelay(const ThreadStatus::StateType state,
+                                          int micro) {
+  states_delay[state].store(micro, std::memory_order_relaxed);
+}
+
+void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) {
+  auto delay = states_delay[state].load(std::memory_order_relaxed);
+  if (delay > 0) {
+    SystemClock::Default()->SleepForMicroseconds(delay);
+  }
+}
+
+#endif  // !NDEBUG
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/cf_options.cc b/src/rocksdb/options/cf_options.cc
new file mode 100644
index 000000000..dbf0bf9b0
--- /dev/null
+++ b/src/rocksdb/options/cf_options.cc
@@ -0,0 +1,1166 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "options/cf_options.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <limits>
+#include <string>
+
+#include "logging/logging.h"
+#include "options/configurable_helper.h"
+#include "options/db_options.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/configurable.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/cast_util.h"
+
+// NOTE: in this file, many option flags that were deprecated
+// and removed from the rest of the code have to be kept here
+// and marked as kDeprecated in order to be able to read old
+// OPTIONS files.
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+static Status ParseCompressionOptions(const std::string& value,
+                                      const std::string& name,
+                                      CompressionOptions& compression_opts) {
+  const char kDelimiter = ':';
+  std::istringstream field_stream(value);
+  std::string field;
+
+  if (!std::getline(field_stream, field, kDelimiter)) {
+    return Status::InvalidArgument("unable to parse the specified CF option " +
+                                   name);
+  }
+  compression_opts.window_bits = ParseInt(field);
+
+  if (!std::getline(field_stream, field, kDelimiter)) {
+    return Status::InvalidArgument("unable to parse the specified CF option " +
+                                   name);
+  }
+  compression_opts.level = ParseInt(field);
+
+  if (!std::getline(field_stream, field, kDelimiter)) {
+    return Status::InvalidArgument("unable to parse the specified CF option " +
+                                   name);
+  }
+  compression_opts.strategy = ParseInt(field);
+
+  // max_dict_bytes is optional for backwards compatibility
+  if (!field_stream.eof()) {
+    if (!std::getline(field_stream, field, kDelimiter)) {
+      return Status::InvalidArgument(
+          "unable to parse the specified CF option " + name);
+    }
+    compression_opts.max_dict_bytes = ParseInt(field);
+  }
+
+  // zstd_max_train_bytes is optional for backwards compatibility
+  if (!field_stream.eof()) {
+    if (!std::getline(field_stream, field, kDelimiter)) {
+      return Status::InvalidArgument(
+          "unable to parse the specified CF option " + name);
+    }
+    compression_opts.zstd_max_train_bytes = ParseInt(field);
+  }
+
+  // parallel_threads is optional for backwards compatibility
+  if (!field_stream.eof()) {
+    if (!std::getline(field_stream, field, kDelimiter)) {
+      return Status::InvalidArgument(
+          "unable to parse the specified CF option " + name);
+    }
+    // Since parallel_threads comes before enabled but was added optionally
+    // later, we need to check if this is the final token (meaning it is the
+    // enabled bit), or if there are more tokens (meaning this one is
+    // parallel_threads).
+    if (!field_stream.eof()) {
+      compression_opts.parallel_threads = ParseInt(field);
+    } else {
+      // parallel_threads is not serialized with this format, but enabled is
+      compression_opts.enabled = ParseBoolean("", field);
+    }
+  }
+
+  // enabled is optional for backwards compatibility
+  if (!field_stream.eof()) {
+    if (!std::getline(field_stream, field, kDelimiter)) {
+      return Status::InvalidArgument(
+          "unable to parse the specified CF option " + name);
+    }
+    compression_opts.enabled = ParseBoolean("", field);
+  }
+
+  // max_dict_buffer_bytes is optional for backwards compatibility
+  if (!field_stream.eof()) {
+    if (!std::getline(field_stream, field, kDelimiter)) {
+      return Status::InvalidArgument(
+          "unable to parse the specified CF option " + name);
+    }
+    compression_opts.max_dict_buffer_bytes = ParseUint64(field);
+  }
+
+  // use_zstd_dict_trainer is optional for backwards compatibility
+  if (!field_stream.eof()) {
+    if (!std::getline(field_stream, field, kDelimiter)) {
+      return Status::InvalidArgument(
+          "unable to parse the specified CF option " + name);
+    }
+    compression_opts.use_zstd_dict_trainer = ParseBoolean("", field);
+  }
+
+  if (!field_stream.eof()) {
+    return Status::InvalidArgument("unable to parse the specified CF option " +
+                                   name);
+  }
+  return Status::OK();
+}
+
+const std::string kOptNameBMCompOpts = "bottommost_compression_opts";
+const std::string kOptNameCompOpts = "compression_opts";
+
+// OptionTypeInfo map for CompressionOptions
+static std::unordered_map<std::string, OptionTypeInfo>
+    compression_options_type_info = {
+        {"window_bits",
+         {offsetof(struct CompressionOptions, window_bits), OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+        {"level",
+         {offsetof(struct CompressionOptions, level), OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+        {"strategy",
+         {offsetof(struct CompressionOptions, strategy), OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+        {"max_dict_bytes",
+         {offsetof(struct CompressionOptions, max_dict_bytes), OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+        {"zstd_max_train_bytes",
+         {offsetof(struct CompressionOptions, zstd_max_train_bytes),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"parallel_threads",
+         {offsetof(struct CompressionOptions, parallel_threads),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"enabled",
+         {offsetof(struct CompressionOptions, enabled), OptionType::kBoolean,
+          OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+        {"max_dict_buffer_bytes",
+         {offsetof(struct CompressionOptions, max_dict_buffer_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"use_zstd_dict_trainer",
+         {offsetof(struct CompressionOptions, use_zstd_dict_trainer),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    fifo_compaction_options_type_info = {
+        {"max_table_files_size",
+         {offsetof(struct CompactionOptionsFIFO, max_table_files_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"age_for_warm",
+         {offsetof(struct CompactionOptionsFIFO, age_for_warm),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"ttl",
+         {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"allow_compaction",
+         {offsetof(struct CompactionOptionsFIFO, allow_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    universal_compaction_options_type_info = {
+        {"size_ratio",
+         {offsetof(class CompactionOptionsUniversal, size_ratio),
+          OptionType::kUInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"min_merge_width",
+         {offsetof(class CompactionOptionsUniversal, min_merge_width),
+          OptionType::kUInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_merge_width",
+         {offsetof(class CompactionOptionsUniversal, max_merge_width),
+          OptionType::kUInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_size_amplification_percent",
+         {offsetof(class CompactionOptionsUniversal,
+                   max_size_amplification_percent),
+          OptionType::kUInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"compression_size_percent",
+         {offsetof(class CompactionOptionsUniversal, compression_size_percent),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"stop_style",
+         {offsetof(class CompactionOptionsUniversal, stop_style),
+          OptionType::kCompactionStopStyle, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"incremental",
+         {offsetof(class CompactionOptionsUniversal, incremental),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"allow_trivial_move",
+         {offsetof(class CompactionOptionsUniversal, allow_trivial_move),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}}};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    cf_mutable_options_type_info = {
+        {"report_bg_io_stats",
+         {offsetof(struct MutableCFOptions, report_bg_io_stats),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"disable_auto_compactions",
+         {offsetof(struct MutableCFOptions, disable_auto_compactions),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"filter_deletes",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"check_flush_compaction_key_order",
+         {offsetof(struct MutableCFOptions, check_flush_compaction_key_order),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"paranoid_file_checks",
+         {offsetof(struct MutableCFOptions, paranoid_file_checks),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"verify_checksums_in_compaction",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"soft_pending_compaction_bytes_limit",
+         {offsetof(struct MutableCFOptions,
+                   soft_pending_compaction_bytes_limit),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"hard_pending_compaction_bytes_limit",
+         {offsetof(struct MutableCFOptions,
+                   hard_pending_compaction_bytes_limit),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"hard_rate_limit",
+         {0, OptionType::kDouble, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"soft_rate_limit",
+         {0, OptionType::kDouble, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"max_compaction_bytes",
+         {offsetof(struct MutableCFOptions, max_compaction_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"ignore_max_compaction_bytes_for_input",
+         {offsetof(struct MutableCFOptions,
+                   ignore_max_compaction_bytes_for_input),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"expanded_compaction_factor",
+         {0, OptionType::kInt, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"level0_file_num_compaction_trigger",
+         {offsetof(struct MutableCFOptions, level0_file_num_compaction_trigger),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"level0_slowdown_writes_trigger",
+         {offsetof(struct MutableCFOptions, level0_slowdown_writes_trigger),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"level0_stop_writes_trigger",
+         {offsetof(struct MutableCFOptions, level0_stop_writes_trigger),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_grandparent_overlap_factor",
+         {0, OptionType::kInt, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"max_write_buffer_number",
+         {offsetof(struct MutableCFOptions, max_write_buffer_number),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"source_compaction_factor",
+         {0, OptionType::kInt, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"target_file_size_multiplier",
+         {offsetof(struct MutableCFOptions, target_file_size_multiplier),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"arena_block_size",
+         {offsetof(struct MutableCFOptions, arena_block_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"inplace_update_num_locks",
+         {offsetof(struct MutableCFOptions, inplace_update_num_locks),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_successive_merges",
+         {offsetof(struct MutableCFOptions, max_successive_merges),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"memtable_huge_page_size",
+         {offsetof(struct MutableCFOptions, memtable_huge_page_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"memtable_prefix_bloom_huge_page_tlb_size",
+         {0, OptionType::kSizeT, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"write_buffer_size",
+         {offsetof(struct MutableCFOptions, write_buffer_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"memtable_prefix_bloom_bits",
+         {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"memtable_prefix_bloom_size_ratio",
+         {offsetof(struct MutableCFOptions, memtable_prefix_bloom_size_ratio),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"memtable_prefix_bloom_probes",
+         {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"memtable_whole_key_filtering",
+         {offsetof(struct MutableCFOptions, memtable_whole_key_filtering),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"min_partial_merge_operands",
+         {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"max_bytes_for_level_base",
+         {offsetof(struct MutableCFOptions, max_bytes_for_level_base),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"snap_refresh_nanos",
+         {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"max_bytes_for_level_multiplier",
+         {offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_bytes_for_level_multiplier_additional",
+         OptionTypeInfo::Vector<int>(
+             offsetof(struct MutableCFOptions,
+                      max_bytes_for_level_multiplier_additional),
+             OptionVerificationType::kNormal, OptionTypeFlags::kMutable,
+             {0, OptionType::kInt})},
+        {"max_sequential_skip_in_iterations",
+         {offsetof(struct MutableCFOptions, max_sequential_skip_in_iterations),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"target_file_size_base",
+         {offsetof(struct MutableCFOptions, target_file_size_base),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"compression",
+         {offsetof(struct MutableCFOptions, compression),
+          OptionType::kCompressionType, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"prefix_extractor",
+         OptionTypeInfo::AsCustomSharedPtr<const SliceTransform>(
+             offsetof(struct MutableCFOptions, prefix_extractor),
+             OptionVerificationType::kByNameAllowNull,
+             (OptionTypeFlags::kMutable | OptionTypeFlags::kAllowNull))},
+        {"compaction_options_fifo",
+         OptionTypeInfo::Struct(
+             "compaction_options_fifo", &fifo_compaction_options_type_info,
+             offsetof(struct MutableCFOptions, compaction_options_fifo),
+             OptionVerificationType::kNormal, OptionTypeFlags::kMutable)
+             .SetParseFunc([](const ConfigOptions& opts,
+                              const std::string& name, const std::string& value,
+                              void* addr) {
+               // This is to handle backward compatibility, where
+               // compaction_options_fifo could be assigned a single scalar
+               // value, say, like "23", which would be assigned to
+               // max_table_files_size.
+               if (name == "compaction_options_fifo" &&
+                   value.find("=") == std::string::npos) {
+                 // Old format. Parse just a single uint64_t value.
+                 auto options = static_cast<CompactionOptionsFIFO*>(addr);
+                 options->max_table_files_size = ParseUint64(value);
+                 return Status::OK();
+               } else {
+                 return OptionTypeInfo::ParseStruct(
+                     opts, "compaction_options_fifo",
+                     &fifo_compaction_options_type_info, name, value, addr);
+               }
+             })},
+        {"compaction_options_universal",
+         OptionTypeInfo::Struct(
+             "compaction_options_universal",
+             &universal_compaction_options_type_info,
+             offsetof(struct MutableCFOptions, compaction_options_universal),
+             OptionVerificationType::kNormal, OptionTypeFlags::kMutable)},
+        {"ttl",
+         {offsetof(struct MutableCFOptions, ttl), OptionType::kUInt64T,
+          OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+        {"periodic_compaction_seconds",
+         {offsetof(struct MutableCFOptions, periodic_compaction_seconds),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"bottommost_temperature",
+         {0, OptionType::kTemperature, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"last_level_temperature",
+         {offsetof(struct MutableCFOptions, last_level_temperature),
+          OptionType::kTemperature, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"enable_blob_files",
+         {offsetof(struct MutableCFOptions, enable_blob_files),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"min_blob_size",
+         {offsetof(struct MutableCFOptions, min_blob_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"blob_file_size",
+         {offsetof(struct MutableCFOptions, blob_file_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"blob_compression_type",
+         {offsetof(struct MutableCFOptions, blob_compression_type),
+          OptionType::kCompressionType, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"enable_blob_garbage_collection",
+         {offsetof(struct MutableCFOptions, enable_blob_garbage_collection),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"blob_garbage_collection_age_cutoff",
+         {offsetof(struct MutableCFOptions, blob_garbage_collection_age_cutoff),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"blob_garbage_collection_force_threshold",
+         {offsetof(struct MutableCFOptions,
+                   blob_garbage_collection_force_threshold),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"blob_compaction_readahead_size",
+         {offsetof(struct MutableCFOptions, blob_compaction_readahead_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"blob_file_starting_level",
+         {offsetof(struct MutableCFOptions, blob_file_starting_level),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"prepopulate_blob_cache",
+         OptionTypeInfo::Enum<PrepopulateBlobCache>(
+             offsetof(struct MutableCFOptions, prepopulate_blob_cache),
+             &prepopulate_blob_cache_string_map, OptionTypeFlags::kMutable)},
+        {"sample_for_compression",
+         {offsetof(struct MutableCFOptions, sample_for_compression),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"bottommost_compression",
+         {offsetof(struct MutableCFOptions, bottommost_compression),
+          OptionType::kCompressionType, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"compression_per_level",
+         OptionTypeInfo::Vector<CompressionType>(
+             offsetof(struct MutableCFOptions, compression_per_level),
+             OptionVerificationType::kNormal, OptionTypeFlags::kMutable,
+             {0, OptionType::kCompressionType})},
+        {"experimental_mempurge_threshold",
+         {offsetof(struct MutableCFOptions, experimental_mempurge_threshold),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"memtable_protection_bytes_per_key",
+         {offsetof(struct MutableCFOptions, memtable_protection_bytes_per_key),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {kOptNameCompOpts,
+         OptionTypeInfo::Struct(
+             kOptNameCompOpts, &compression_options_type_info,
+             offsetof(struct MutableCFOptions, compression_opts),
+             OptionVerificationType::kNormal,
+             (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever),
+             [](const ConfigOptions& opts, const std::string& name,
+                const std::string& value, void* addr) {
+               // This is to handle backward compatibility, where
+               // compression_options was a ":" separated list.
+               if (name == kOptNameCompOpts &&
+                   value.find("=") == std::string::npos) {
+                 auto* compression = static_cast<CompressionOptions*>(addr);
+                 return ParseCompressionOptions(value, name, *compression);
+               } else {
+                 return OptionTypeInfo::ParseStruct(
+                     opts, kOptNameCompOpts, &compression_options_type_info,
+                     name, value, addr);
+               }
+             })},
+        {kOptNameBMCompOpts,
+         OptionTypeInfo::Struct(
+             kOptNameBMCompOpts, &compression_options_type_info,
+             offsetof(struct MutableCFOptions, bottommost_compression_opts),
+             OptionVerificationType::kNormal,
+             (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever),
+             [](const ConfigOptions& opts, const std::string& name,
+                const std::string& value, void* addr) {
+               // This is to handle backward compatibility, where
+               // compression_options was a ":" separated list.
+               if (name == kOptNameBMCompOpts &&
+                   value.find("=") == std::string::npos) {
+                 auto* compression = static_cast<CompressionOptions*>(addr);
+                 return ParseCompressionOptions(value, name, *compression);
+               } else {
+                 return OptionTypeInfo::ParseStruct(
+                     opts, kOptNameBMCompOpts, &compression_options_type_info,
+                     name, value, addr);
+               }
+             })},
+        // End special case properties
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    cf_immutable_options_type_info = {
+        /* not yet supported
+        CompressionOptions compression_opts;
+        TablePropertiesCollectorFactories table_properties_collector_factories;
+        using TablePropertiesCollectorFactories =
+            std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>;
+        UpdateStatus (*inplace_callback)(char* existing_value,
+                                         uint34_t* existing_value_size,
+                                         Slice delta_value,
+                                         std::string* merged_value);
+        std::vector<DbPath> cf_paths;
+         */
+        {"compaction_measure_io_stats",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"purge_redundant_kvs_while_flush",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"inplace_update_support",
+         {offsetof(struct ImmutableCFOptions, inplace_update_support),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"level_compaction_dynamic_level_bytes",
+         {offsetof(struct ImmutableCFOptions,
+                   level_compaction_dynamic_level_bytes),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"level_compaction_dynamic_file_size",
+         {offsetof(struct ImmutableCFOptions,
+                   level_compaction_dynamic_file_size),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"optimize_filters_for_hits",
+         {offsetof(struct ImmutableCFOptions, optimize_filters_for_hits),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"force_consistency_checks",
+         {offsetof(struct ImmutableCFOptions, force_consistency_checks),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"preclude_last_level_data_seconds",
+         {offsetof(struct ImmutableCFOptions, preclude_last_level_data_seconds),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"preserve_internal_time_seconds",
+         {offsetof(struct ImmutableCFOptions, preserve_internal_time_seconds),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        // Need to keep this around to be able to read old OPTIONS files.
+        {"max_mem_compaction_level",
+         {0, OptionType::kInt, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"max_write_buffer_number_to_maintain",
+         {offsetof(struct ImmutableCFOptions,
+                   max_write_buffer_number_to_maintain),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone, 0}},
+        {"max_write_buffer_size_to_maintain",
+         {offsetof(struct ImmutableCFOptions,
+                   max_write_buffer_size_to_maintain),
+          OptionType::kInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"min_write_buffer_number_to_merge",
+         {offsetof(struct ImmutableCFOptions, min_write_buffer_number_to_merge),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone, 0}},
+        {"num_levels",
+         {offsetof(struct ImmutableCFOptions, num_levels), OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+        {"bloom_locality",
+         {offsetof(struct ImmutableCFOptions, bloom_locality),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"rate_limit_delay_max_milliseconds",
+         {0, OptionType::kUInt, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"comparator",
+         OptionTypeInfo::AsCustomRawPtr<const Comparator>(
+             offsetof(struct ImmutableCFOptions, user_comparator),
+             OptionVerificationType::kByName, OptionTypeFlags::kCompareLoose)
+             .SetSerializeFunc(
+                 // Serializes a Comparator
+                 [](const ConfigOptions& opts, const std::string&,
+                    const void* addr, std::string* value) {
+                   // it's a const pointer of const Comparator*
+                   const auto* ptr =
+                       static_cast<const Comparator* const*>(addr);
+                   // Since the user-specified comparator will be wrapped by
+                   // InternalKeyComparator, we should persist the
+                   // user-specified one instead of InternalKeyComparator.
+                   if (*ptr == nullptr) {
+                     *value = kNullptrString;
+                   } else if (opts.mutable_options_only) {
+                     *value = "";
+                   } else {
+                     const Comparator* root_comp = (*ptr)->GetRootComparator();
+                     if (root_comp == nullptr) {
+                       root_comp = (*ptr);
+                     }
+                     *value = root_comp->ToString(opts);
+                   }
+                   return Status::OK();
+                 })},
+        {"memtable_insert_with_hint_prefix_extractor",
+         OptionTypeInfo::AsCustomSharedPtr<const SliceTransform>(
+             offsetof(struct ImmutableCFOptions,
+                      memtable_insert_with_hint_prefix_extractor),
+             OptionVerificationType::kByNameAllowNull, OptionTypeFlags::kNone)},
+        {"memtable_factory",
+         {offsetof(struct ImmutableCFOptions, memtable_factory),
+          OptionType::kCustomizable, OptionVerificationType::kByName,
+          OptionTypeFlags::kShared,
+          [](const ConfigOptions& opts, const std::string&,
+             const std::string& value, void* addr) {
+            std::unique_ptr<MemTableRepFactory> factory;
+            auto* shared =
+                static_cast<std::shared_ptr<MemTableRepFactory>*>(addr);
+            Status s =
+                MemTableRepFactory::CreateFromString(opts, value, shared);
+            return s;
+          }}},
+        {"memtable",
+         {offsetof(struct ImmutableCFOptions, memtable_factory),
+          OptionType::kCustomizable, OptionVerificationType::kAlias,
+          OptionTypeFlags::kShared,
+          [](const ConfigOptions& opts, const std::string&,
+             const std::string& value, void* addr) {
+            std::unique_ptr<MemTableRepFactory> factory;
+            auto* shared =
+                static_cast<std::shared_ptr<MemTableRepFactory>*>(addr);
+            Status s =
+                MemTableRepFactory::CreateFromString(opts, value, shared);
+            return s;
+          }}},
+        {"table_factory",
+         OptionTypeInfo::AsCustomSharedPtr<TableFactory>(
+             offsetof(struct ImmutableCFOptions, table_factory),
+             OptionVerificationType::kByName,
+             (OptionTypeFlags::kCompareLoose |
+              OptionTypeFlags::kStringNameOnly |
+              OptionTypeFlags::kDontPrepare))},
+        {"block_based_table_factory",
+         {offsetof(struct ImmutableCFOptions, table_factory),
+          OptionType::kCustomizable, OptionVerificationType::kAlias,
+          OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose,
+          // Parses the input value and creates a BlockBasedTableFactory
+          [](const ConfigOptions& opts, const std::string& name,
+             const std::string& value, void* addr) {
+            BlockBasedTableOptions* old_opts = nullptr;
+            auto table_factory =
+                static_cast<std::shared_ptr<TableFactory>*>(addr);
+            if (table_factory->get() != nullptr) {
+              old_opts =
+                  table_factory->get()->GetOptions<BlockBasedTableOptions>();
+            }
+            if (name == "block_based_table_factory") {
+              std::unique_ptr<TableFactory> new_factory;
+              if (old_opts != nullptr) {
+                new_factory.reset(NewBlockBasedTableFactory(*old_opts));
+              } else {
+                new_factory.reset(NewBlockBasedTableFactory());
+              }
+              Status s = new_factory->ConfigureFromString(opts, value);
+              if (s.ok()) {
+                table_factory->reset(new_factory.release());
+              }
+              return s;
+            } else if (old_opts != nullptr) {
+              return table_factory->get()->ConfigureOption(opts, name, value);
+            } else {
+              return Status::NotFound("Mismatched table option: ", name);
+            }
+          }}},
+        {"plain_table_factory",
+         {offsetof(struct ImmutableCFOptions, table_factory),
+          OptionType::kCustomizable, OptionVerificationType::kAlias,
+          OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose,
+          // Parses the input value and creates a PlainTableFactory
+          [](const ConfigOptions& opts, const std::string& name,
+             const std::string& value, void* addr) {
+            PlainTableOptions* old_opts = nullptr;
+            auto table_factory =
+                static_cast<std::shared_ptr<TableFactory>*>(addr);
+            if (table_factory->get() != nullptr) {
+              old_opts = table_factory->get()->GetOptions<PlainTableOptions>();
+            }
+            if (name == "plain_table_factory") {
+              std::unique_ptr<TableFactory> new_factory;
+              if (old_opts != nullptr) {
+                new_factory.reset(NewPlainTableFactory(*old_opts));
+              } else {
+                new_factory.reset(NewPlainTableFactory());
+              }
+              Status s = new_factory->ConfigureFromString(opts, value);
+              if (s.ok()) {
+                table_factory->reset(new_factory.release());
+              }
+              return s;
+            } else if (old_opts != nullptr) {
+              return table_factory->get()->ConfigureOption(opts, name, value);
+            } else {
+              return Status::NotFound("Mismatched table option: ", name);
+            }
+          }}},
+        {"table_properties_collectors",
+         OptionTypeInfo::Vector<
+             std::shared_ptr<TablePropertiesCollectorFactory>>(
+             offsetof(struct ImmutableCFOptions,
+                      table_properties_collector_factories),
+             OptionVerificationType::kByName, OptionTypeFlags::kNone,
+             OptionTypeInfo::AsCustomSharedPtr<TablePropertiesCollectorFactory>(
+                 0, OptionVerificationType::kByName, OptionTypeFlags::kNone))},
+        {"compaction_filter",
+         OptionTypeInfo::AsCustomRawPtr<const CompactionFilter>(
+             offsetof(struct ImmutableCFOptions, compaction_filter),
+             OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)},
+        {"compaction_filter_factory",
+         OptionTypeInfo::AsCustomSharedPtr<CompactionFilterFactory>(
+             offsetof(struct ImmutableCFOptions, compaction_filter_factory),
+             OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)},
+        {"merge_operator",
+         OptionTypeInfo::AsCustomSharedPtr<MergeOperator>(
+             offsetof(struct ImmutableCFOptions, merge_operator),
+             OptionVerificationType::kByNameAllowFromNull,
+             OptionTypeFlags::kCompareLoose | OptionTypeFlags::kAllowNull)},
+        {"compaction_style",
+         {offsetof(struct ImmutableCFOptions, compaction_style),
+          OptionType::kCompactionStyle, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"compaction_pri",
+         {offsetof(struct ImmutableCFOptions, compaction_pri),
+          OptionType::kCompactionPri, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"sst_partitioner_factory",
+         OptionTypeInfo::AsCustomSharedPtr<SstPartitionerFactory>(
+             offsetof(struct ImmutableCFOptions, sst_partitioner_factory),
+             OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)},
+        {"blob_cache",
+         {offsetof(struct ImmutableCFOptions, blob_cache), OptionType::kUnknown,
+          OptionVerificationType::kNormal,
+          (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
+          // Parses the input value as a Cache
+          [](const ConfigOptions& opts, const std::string&,
+             const std::string& value, void* addr) {
+            auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
+            return Cache::CreateFromString(opts, value, cache);
+          }}},
+};
+
+const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions";
+
+class ConfigurableMutableCFOptions : public Configurable {
+ public:
+  explicit ConfigurableMutableCFOptions(const MutableCFOptions& mcf) {
+    mutable_ = mcf;
+    RegisterOptions(&mutable_, &cf_mutable_options_type_info);
+  }
+
+ protected:
+  MutableCFOptions mutable_;
+};
+
+class ConfigurableCFOptions : public ConfigurableMutableCFOptions {
+ public:
+  ConfigurableCFOptions(const ColumnFamilyOptions& opts,
+                        const std::unordered_map<std::string, std::string>* map)
+      : ConfigurableMutableCFOptions(MutableCFOptions(opts)),
+        immutable_(opts),
+        cf_options_(opts),
+        opt_map_(map) {
+    RegisterOptions(&immutable_, &cf_immutable_options_type_info);
+  }
+
+ protected:
+  Status ConfigureOptions(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opts_map,
+      std::unordered_map<std::string, std::string>* unused) override {
+    Status s = Configurable::ConfigureOptions(config_options, opts_map, unused);
+    if (s.ok()) {
+      UpdateColumnFamilyOptions(mutable_, &cf_options_);
+      UpdateColumnFamilyOptions(immutable_, &cf_options_);
+      s = PrepareOptions(config_options);
+    }
+    return s;
+  }
+
+  virtual const void* GetOptionsPtr(const std::string& name) const override {
+    if (name == OptionsHelper::kCFOptionsName) {
+      return &cf_options_;
+    } else {
+      return ConfigurableMutableCFOptions::GetOptionsPtr(name);
+    }
+  }
+
+  bool OptionsAreEqual(const ConfigOptions& config_options,
+                       const OptionTypeInfo& opt_info,
+                       const std::string& opt_name, const void* const this_ptr,
+                       const void* const that_ptr,
+                       std::string* mismatch) const override {
+    bool equals = opt_info.AreEqual(config_options, opt_name, this_ptr,
+                                    that_ptr, mismatch);
+    if (!equals && opt_info.IsByName()) {
+      if (opt_map_ == nullptr) {
+        equals = true;
+      } else {
+        const auto& iter = opt_map_->find(opt_name);
+        if (iter == opt_map_->end()) {
+          equals = true;
+        } else {
+          equals = opt_info.AreEqualByName(config_options, opt_name, this_ptr,
+                                           iter->second);
+        }
+      }
+      if (equals) {  // False alarm, clear mismatch
+        *mismatch = "";
+      }
+    }
+    if (equals && opt_info.IsConfigurable() && opt_map_ != nullptr) {
+      const auto* this_config = opt_info.AsRawPointer<Configurable>(this_ptr);
+      if (this_config == nullptr) {
+        const auto& iter = opt_map_->find(opt_name);
+        // If the name exists in the map and is not empty/null,
+        // then the this_config should be set.
+        if (iter != opt_map_->end() && !iter->second.empty() &&
+            iter->second != kNullptrString) {
+          *mismatch = opt_name;
+          equals = false;
+        }
+      }
+    }
+    return equals;
+  }
+
+ private:
+  ImmutableCFOptions immutable_;
+  ColumnFamilyOptions cf_options_;
+  const std::unordered_map<std::string, std::string>* opt_map_;
+};
+
+std::unique_ptr<Configurable> CFOptionsAsConfigurable(
+    const MutableCFOptions& opts) {
+  std::unique_ptr<Configurable> ptr(new ConfigurableMutableCFOptions(opts));
+  return ptr;
+}
+std::unique_ptr<Configurable> CFOptionsAsConfigurable(
+    const ColumnFamilyOptions& opts,
+    const std::unordered_map<std::string, std::string>* opt_map) {
+  std::unique_ptr<Configurable> ptr(new ConfigurableCFOptions(opts, opt_map));
+  return ptr;
+}
+#endif  // ROCKSDB_LITE
+
+ImmutableCFOptions::ImmutableCFOptions() : ImmutableCFOptions(Options()) {}
+
+ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options)
+    : compaction_style(cf_options.compaction_style),
+      compaction_pri(cf_options.compaction_pri),
+      user_comparator(cf_options.comparator),
+      internal_comparator(InternalKeyComparator(cf_options.comparator)),
+      merge_operator(cf_options.merge_operator),
+      compaction_filter(cf_options.compaction_filter),
+      compaction_filter_factory(cf_options.compaction_filter_factory),
+      min_write_buffer_number_to_merge(
+          cf_options.min_write_buffer_number_to_merge),
+      max_write_buffer_number_to_maintain(
+          cf_options.max_write_buffer_number_to_maintain),
+      max_write_buffer_size_to_maintain(
+          cf_options.max_write_buffer_size_to_maintain),
+      inplace_update_support(cf_options.inplace_update_support),
+      inplace_callback(cf_options.inplace_callback),
+      memtable_factory(cf_options.memtable_factory),
+      table_factory(cf_options.table_factory),
+      table_properties_collector_factories(
+          cf_options.table_properties_collector_factories),
+      bloom_locality(cf_options.bloom_locality),
+      level_compaction_dynamic_level_bytes(
+          cf_options.level_compaction_dynamic_level_bytes),
+      level_compaction_dynamic_file_size(
+          cf_options.level_compaction_dynamic_file_size),
+      num_levels(cf_options.num_levels),
+      optimize_filters_for_hits(cf_options.optimize_filters_for_hits),
+      force_consistency_checks(cf_options.force_consistency_checks),
+      preclude_last_level_data_seconds(
+          cf_options.preclude_last_level_data_seconds),
+      preserve_internal_time_seconds(cf_options.preserve_internal_time_seconds),
+      memtable_insert_with_hint_prefix_extractor(
+          cf_options.memtable_insert_with_hint_prefix_extractor),
+      cf_paths(cf_options.cf_paths),
+      compaction_thread_limiter(cf_options.compaction_thread_limiter),
+      sst_partitioner_factory(cf_options.sst_partitioner_factory),
+      blob_cache(cf_options.blob_cache) {}
+
+ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {}
+
+ImmutableOptions::ImmutableOptions(const Options& options)
+    : ImmutableOptions(options, options) {}
+
+ImmutableOptions::ImmutableOptions(const DBOptions& db_options,
+                                   const ColumnFamilyOptions& cf_options)
+    : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {}
+
+ImmutableOptions::ImmutableOptions(const DBOptions& db_options,
+                                   const ImmutableCFOptions& cf_options)
+    : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {}
+
+ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options,
+                                   const ColumnFamilyOptions& cf_options)
+    : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {}
+
+ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options,
+                                   const ImmutableCFOptions& cf_options)
+    : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {}
+
+// Multiple two operands. If they overflow, return op1.
+uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) {
+  if (op1 == 0 || op2 <= 0) {
+    return 0;
+  }
+  if (std::numeric_limits<uint64_t>::max() / op1 < op2) {
+    return op1;
+  }
+  return static_cast<uint64_t>(op1 * op2);
+}
+
+// when level_compaction_dynamic_level_bytes is true and leveled compaction
+// is used, the base level is not always L1, so precomupted max_file_size can
+// no longer be used. Recompute file_size_for_level from base level.
+uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options,
+    int level, CompactionStyle compaction_style, int base_level,
+    bool level_compaction_dynamic_level_bytes) {
+  if (!level_compaction_dynamic_level_bytes || level < base_level ||
+      compaction_style != kCompactionStyleLevel) {
+    assert(level >= 0);
+    assert(level < (int)cf_options.max_file_size.size());
+    return cf_options.max_file_size[level];
+  } else {
+    assert(level >= 0 && base_level >= 0);
+    assert(level - base_level < (int)cf_options.max_file_size.size());
+    return cf_options.max_file_size[level - base_level];
+  }
+}
+
+size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options) {
+  // We do not want to pin meta-blocks that almost certainly came from intra-L0
+  // or a former larger `write_buffer_size` value to avoid surprising users with
+  // pinned memory usage. We use a factor of 1.5 to account for overhead
+  // introduced during flush in most cases.
+  if (std::numeric_limits<size_t>::max() / 3 <
+      cf_options.write_buffer_size / 2) {
+    return std::numeric_limits<size_t>::max();
+  }
+  return cf_options.write_buffer_size / 2 * 3;
+}
+
+void MutableCFOptions::RefreshDerivedOptions(int num_levels,
+                                             CompactionStyle compaction_style) {
+  max_file_size.resize(num_levels);
+  for (int i = 0; i < num_levels; ++i) {
+    if (i == 0 && compaction_style == kCompactionStyleUniversal) {
+      max_file_size[i] = ULLONG_MAX;
+    } else if (i > 1) {
+      max_file_size[i] = MultiplyCheckOverflow(max_file_size[i - 1],
+                                               target_file_size_multiplier);
+    } else {
+      max_file_size[i] = target_file_size_base;
+    }
+  }
+}
+
+void MutableCFOptions::Dump(Logger* log) const {
+  // Memtable related options
+  ROCKS_LOG_INFO(log,
+                 "                        write_buffer_size: %" ROCKSDB_PRIszt,
+                 write_buffer_size);
+  ROCKS_LOG_INFO(log, "                  max_write_buffer_number: %d",
+                 max_write_buffer_number);
+  ROCKS_LOG_INFO(log,
+                 "                         arena_block_size: %" ROCKSDB_PRIszt,
+                 arena_block_size);
+  ROCKS_LOG_INFO(log, "              memtable_prefix_bloom_ratio: %f",
+                 memtable_prefix_bloom_size_ratio);
+  ROCKS_LOG_INFO(log, "              memtable_whole_key_filtering: %d",
+                 memtable_whole_key_filtering);
+  ROCKS_LOG_INFO(log,
+                 "                  memtable_huge_page_size: %" ROCKSDB_PRIszt,
+                 memtable_huge_page_size);
+  ROCKS_LOG_INFO(log,
+                 "                    max_successive_merges: %" ROCKSDB_PRIszt,
+                 max_successive_merges);
+  ROCKS_LOG_INFO(log,
+                 "                 inplace_update_num_locks: %" ROCKSDB_PRIszt,
+                 inplace_update_num_locks);
+  ROCKS_LOG_INFO(log, "                         prefix_extractor: %s",
+                 prefix_extractor == nullptr
+                     ? "nullptr"
+                     : prefix_extractor->GetId().c_str());
+  ROCKS_LOG_INFO(log, "                 disable_auto_compactions: %d",
+                 disable_auto_compactions);
+  ROCKS_LOG_INFO(log, "      soft_pending_compaction_bytes_limit: %" PRIu64,
+                 soft_pending_compaction_bytes_limit);
+  ROCKS_LOG_INFO(log, "      hard_pending_compaction_bytes_limit: %" PRIu64,
+                 hard_pending_compaction_bytes_limit);
+  ROCKS_LOG_INFO(log, "       level0_file_num_compaction_trigger: %d",
+                 level0_file_num_compaction_trigger);
+  ROCKS_LOG_INFO(log, "           level0_slowdown_writes_trigger: %d",
+                 level0_slowdown_writes_trigger);
+  ROCKS_LOG_INFO(log, "               level0_stop_writes_trigger: %d",
+                 level0_stop_writes_trigger);
+  ROCKS_LOG_INFO(log, "                     max_compaction_bytes: %" PRIu64,
+                 max_compaction_bytes);
+  ROCKS_LOG_INFO(log, "    ignore_max_compaction_bytes_for_input: %s",
+                 ignore_max_compaction_bytes_for_input ? "true" : "false");
+  ROCKS_LOG_INFO(log, "                    target_file_size_base: %" PRIu64,
+                 target_file_size_base);
+  ROCKS_LOG_INFO(log, "              target_file_size_multiplier: %d",
+                 target_file_size_multiplier);
+  ROCKS_LOG_INFO(log, "                 max_bytes_for_level_base: %" PRIu64,
+                 max_bytes_for_level_base);
+  ROCKS_LOG_INFO(log, "           max_bytes_for_level_multiplier: %f",
+                 max_bytes_for_level_multiplier);
+  ROCKS_LOG_INFO(log, "                                      ttl: %" PRIu64,
+                 ttl);
+  ROCKS_LOG_INFO(log, "              periodic_compaction_seconds: %" PRIu64,
+                 periodic_compaction_seconds);
+  std::string result;
+  char buf[10];
+  for (const auto m : max_bytes_for_level_multiplier_additional) {
+    snprintf(buf, sizeof(buf), "%d, ", m);
+    result += buf;
+  }
+  if (result.size() >= 2) {
+    result.resize(result.size() - 2);
+  } else {
+    result = "";
+  }
+
+  ROCKS_LOG_INFO(log, "max_bytes_for_level_multiplier_additional: %s",
+                 result.c_str());
+  ROCKS_LOG_INFO(log, "        max_sequential_skip_in_iterations: %" PRIu64,
+                 max_sequential_skip_in_iterations);
+  ROCKS_LOG_INFO(log, "         check_flush_compaction_key_order: %d",
+                 check_flush_compaction_key_order);
+  ROCKS_LOG_INFO(log, "                     paranoid_file_checks: %d",
+                 paranoid_file_checks);
+  ROCKS_LOG_INFO(log, "                       report_bg_io_stats: %d",
+                 report_bg_io_stats);
+  ROCKS_LOG_INFO(log, "                              compression: %d",
+                 static_cast<int>(compression));
+  ROCKS_LOG_INFO(log,
+                 "                       experimental_mempurge_threshold: %f",
+                 experimental_mempurge_threshold);
+
+  // Universal Compaction Options
+  ROCKS_LOG_INFO(log, "compaction_options_universal.size_ratio : %d",
+                 compaction_options_universal.size_ratio);
+  ROCKS_LOG_INFO(log, "compaction_options_universal.min_merge_width : %d",
+                 compaction_options_universal.min_merge_width);
+  ROCKS_LOG_INFO(log, "compaction_options_universal.max_merge_width : %d",
+                 compaction_options_universal.max_merge_width);
+  ROCKS_LOG_INFO(
+      log, "compaction_options_universal.max_size_amplification_percent : %d",
+      compaction_options_universal.max_size_amplification_percent);
+  ROCKS_LOG_INFO(log,
+                 "compaction_options_universal.compression_size_percent : %d",
+                 compaction_options_universal.compression_size_percent);
+  ROCKS_LOG_INFO(log, "compaction_options_universal.stop_style : %d",
+                 compaction_options_universal.stop_style);
+  ROCKS_LOG_INFO(
+      log, "compaction_options_universal.allow_trivial_move : %d",
+      static_cast<int>(compaction_options_universal.allow_trivial_move));
+  ROCKS_LOG_INFO(log, "compaction_options_universal.incremental        : %d",
+                 static_cast<int>(compaction_options_universal.incremental));
+
+  // FIFO Compaction Options
+  ROCKS_LOG_INFO(log, "compaction_options_fifo.max_table_files_size : %" PRIu64,
+                 compaction_options_fifo.max_table_files_size);
+  ROCKS_LOG_INFO(log, "compaction_options_fifo.allow_compaction : %d",
+                 compaction_options_fifo.allow_compaction);
+
+  // Blob file related options
+  ROCKS_LOG_INFO(log, "                        enable_blob_files: %s",
+                 enable_blob_files ? "true" : "false");
+  ROCKS_LOG_INFO(log, "                            min_blob_size: %" PRIu64,
+                 min_blob_size);
+  ROCKS_LOG_INFO(log, "                           blob_file_size: %" PRIu64,
+                 blob_file_size);
+  ROCKS_LOG_INFO(log, "                    blob_compression_type: %s",
+                 CompressionTypeToString(blob_compression_type).c_str());
+  ROCKS_LOG_INFO(log, "           enable_blob_garbage_collection: %s",
+                 enable_blob_garbage_collection ? "true" : "false");
+  ROCKS_LOG_INFO(log, "       blob_garbage_collection_age_cutoff: %f",
+                 blob_garbage_collection_age_cutoff);
+  ROCKS_LOG_INFO(log, "  blob_garbage_collection_force_threshold: %f",
+                 blob_garbage_collection_force_threshold);
+  ROCKS_LOG_INFO(log, "           blob_compaction_readahead_size: %" PRIu64,
+                 blob_compaction_readahead_size);
+  ROCKS_LOG_INFO(log, "                 blob_file_starting_level: %d",
+                 blob_file_starting_level);
+  ROCKS_LOG_INFO(log, "                   prepopulate_blob_cache: %s",
+                 prepopulate_blob_cache == PrepopulateBlobCache::kFlushOnly
+                     ? "flush only"
+                     : "disable");
+  ROCKS_LOG_INFO(log, "                   last_level_temperature: %d",
+                 static_cast<int>(last_level_temperature));
+}
+
+MutableCFOptions::MutableCFOptions(const Options& options)
+    : MutableCFOptions(ColumnFamilyOptions(options)) {}
+
+#ifndef ROCKSDB_LITE
+Status GetMutableOptionsFromStrings(
+    const MutableCFOptions& base_options,
+    const std::unordered_map<std::string, std::string>& options_map,
+    Logger* /*info_log*/, MutableCFOptions* new_options) {
+  assert(new_options);
+  *new_options = base_options;
+  ConfigOptions config_options;
+  Status s = OptionTypeInfo::ParseType(
+      config_options, options_map, cf_mutable_options_type_info, new_options);
+  if (!s.ok()) {
+    *new_options = base_options;
+  }
+  return s;
+}
+
+Status GetStringFromMutableCFOptions(const ConfigOptions& config_options,
+                                     const MutableCFOptions& mutable_opts,
+                                     std::string* opt_string) {
+  assert(opt_string);
+  opt_string->clear();
+  return OptionTypeInfo::SerializeType(
+      config_options, cf_mutable_options_type_info, &mutable_opts, opt_string);
+}
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/cf_options.h b/src/rocksdb/options/cf_options.h
new file mode 100644
index 000000000..050618eda
--- /dev/null
+++ b/src/rocksdb/options/cf_options.h
@@ -0,0 +1,344 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "options/db_options.h"
+#include "rocksdb/options.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// ImmutableCFOptions is a data struct used by RocksDB internal. It contains a
+// subset of Options that should not be changed during the entire lifetime
+// of DB. Raw pointers defined in this struct do not have ownership to the data
+// they point to. Options contains std::shared_ptr to these data.
+struct ImmutableCFOptions {
+ public:
+  static const char* kName() { return "ImmutableCFOptions"; }
+  explicit ImmutableCFOptions();
+  explicit ImmutableCFOptions(const ColumnFamilyOptions& cf_options);
+
+  CompactionStyle compaction_style;
+
+  CompactionPri compaction_pri;
+
+  const Comparator* user_comparator;
+  InternalKeyComparator internal_comparator;  // Only in Immutable
+
+  std::shared_ptr<MergeOperator> merge_operator;
+
+  const CompactionFilter* compaction_filter;
+
+  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
+
+  int min_write_buffer_number_to_merge;
+
+  int max_write_buffer_number_to_maintain;
+
+  int64_t max_write_buffer_size_to_maintain;
+
+  bool inplace_update_support;
+
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+
+  std::shared_ptr<MemTableRepFactory> memtable_factory;
+
+  std::shared_ptr<TableFactory> table_factory;
+
+  Options::TablePropertiesCollectorFactories
+      table_properties_collector_factories;
+
+  // This options is required by PlainTableReader. May need to move it
+  // to PlainTableOptions just like bloom_bits_per_key
+  uint32_t bloom_locality;
+
+  bool level_compaction_dynamic_level_bytes;
+
+  bool level_compaction_dynamic_file_size;
+
+  int num_levels;
+
+  bool optimize_filters_for_hits;
+
+  bool force_consistency_checks;
+
+  uint64_t preclude_last_level_data_seconds;
+
+  uint64_t preserve_internal_time_seconds;
+
+  std::shared_ptr<const SliceTransform>
+      memtable_insert_with_hint_prefix_extractor;
+
+  std::vector<DbPath> cf_paths;
+
+  std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter;
+
+  std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory;
+
+  std::shared_ptr<Cache> blob_cache;
+};
+
+struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions {
+  explicit ImmutableOptions();
+  explicit ImmutableOptions(const Options& options);
+
+  ImmutableOptions(const DBOptions& db_options,
+                   const ColumnFamilyOptions& cf_options);
+
+  ImmutableOptions(const ImmutableDBOptions& db_options,
+                   const ImmutableCFOptions& cf_options);
+
+  ImmutableOptions(const DBOptions& db_options,
+                   const ImmutableCFOptions& cf_options);
+
+  ImmutableOptions(const ImmutableDBOptions& db_options,
+                   const ColumnFamilyOptions& cf_options);
+};
+
+struct MutableCFOptions {
+  static const char* kName() { return "MutableCFOptions"; }
+  explicit MutableCFOptions(const ColumnFamilyOptions& options)
+      : write_buffer_size(options.write_buffer_size),
+        max_write_buffer_number(options.max_write_buffer_number),
+        arena_block_size(options.arena_block_size),
+        memtable_prefix_bloom_size_ratio(
+            options.memtable_prefix_bloom_size_ratio),
+        memtable_whole_key_filtering(options.memtable_whole_key_filtering),
+        memtable_huge_page_size(options.memtable_huge_page_size),
+        max_successive_merges(options.max_successive_merges),
+        inplace_update_num_locks(options.inplace_update_num_locks),
+        prefix_extractor(options.prefix_extractor),
+        experimental_mempurge_threshold(
+            options.experimental_mempurge_threshold),
+        disable_auto_compactions(options.disable_auto_compactions),
+        soft_pending_compaction_bytes_limit(
+            options.soft_pending_compaction_bytes_limit),
+        hard_pending_compaction_bytes_limit(
+            options.hard_pending_compaction_bytes_limit),
+        level0_file_num_compaction_trigger(
+            options.level0_file_num_compaction_trigger),
+        level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
+        level0_stop_writes_trigger(options.level0_stop_writes_trigger),
+        max_compaction_bytes(options.max_compaction_bytes),
+        ignore_max_compaction_bytes_for_input(
+            options.ignore_max_compaction_bytes_for_input),
+        target_file_size_base(options.target_file_size_base),
+        target_file_size_multiplier(options.target_file_size_multiplier),
+        max_bytes_for_level_base(options.max_bytes_for_level_base),
+        max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
+        ttl(options.ttl),
+        periodic_compaction_seconds(options.periodic_compaction_seconds),
+        max_bytes_for_level_multiplier_additional(
+            options.max_bytes_for_level_multiplier_additional),
+        compaction_options_fifo(options.compaction_options_fifo),
+        compaction_options_universal(options.compaction_options_universal),
+        enable_blob_files(options.enable_blob_files),
+        min_blob_size(options.min_blob_size),
+        blob_file_size(options.blob_file_size),
+        blob_compression_type(options.blob_compression_type),
+        enable_blob_garbage_collection(options.enable_blob_garbage_collection),
+        blob_garbage_collection_age_cutoff(
+            options.blob_garbage_collection_age_cutoff),
+        blob_garbage_collection_force_threshold(
+            options.blob_garbage_collection_force_threshold),
+        blob_compaction_readahead_size(options.blob_compaction_readahead_size),
+        blob_file_starting_level(options.blob_file_starting_level),
+        prepopulate_blob_cache(options.prepopulate_blob_cache),
+        max_sequential_skip_in_iterations(
+            options.max_sequential_skip_in_iterations),
+        check_flush_compaction_key_order(
+            options.check_flush_compaction_key_order),
+        paranoid_file_checks(options.paranoid_file_checks),
+        report_bg_io_stats(options.report_bg_io_stats),
+        compression(options.compression),
+        bottommost_compression(options.bottommost_compression),
+        compression_opts(options.compression_opts),
+        bottommost_compression_opts(options.bottommost_compression_opts),
+        last_level_temperature(options.last_level_temperature ==
+                                       Temperature::kUnknown
+                                   ? options.bottommost_temperature
+                                   : options.last_level_temperature),
+        memtable_protection_bytes_per_key(
+            options.memtable_protection_bytes_per_key),
+        sample_for_compression(
+            options.sample_for_compression),  // TODO: is 0 fine here?
+        compression_per_level(options.compression_per_level) {
+    RefreshDerivedOptions(options.num_levels, options.compaction_style);
+  }
+
+  MutableCFOptions()
+      : write_buffer_size(0),
+        max_write_buffer_number(0),
+        arena_block_size(0),
+        memtable_prefix_bloom_size_ratio(0),
+        memtable_whole_key_filtering(false),
+        memtable_huge_page_size(0),
+        max_successive_merges(0),
+        inplace_update_num_locks(0),
+        prefix_extractor(nullptr),
+        experimental_mempurge_threshold(0.0),
+        disable_auto_compactions(false),
+        soft_pending_compaction_bytes_limit(0),
+        hard_pending_compaction_bytes_limit(0),
+        level0_file_num_compaction_trigger(0),
+        level0_slowdown_writes_trigger(0),
+        level0_stop_writes_trigger(0),
+        max_compaction_bytes(0),
+        ignore_max_compaction_bytes_for_input(true),
+        target_file_size_base(0),
+        target_file_size_multiplier(0),
+        max_bytes_for_level_base(0),
+        max_bytes_for_level_multiplier(0),
+        ttl(0),
+        periodic_compaction_seconds(0),
+        compaction_options_fifo(),
+        enable_blob_files(false),
+        min_blob_size(0),
+        blob_file_size(0),
+        blob_compression_type(kNoCompression),
+        enable_blob_garbage_collection(false),
+        blob_garbage_collection_age_cutoff(0.0),
+        blob_garbage_collection_force_threshold(0.0),
+        blob_compaction_readahead_size(0),
+        blob_file_starting_level(0),
+        prepopulate_blob_cache(PrepopulateBlobCache::kDisable),
+        max_sequential_skip_in_iterations(0),
+        check_flush_compaction_key_order(true),
+        paranoid_file_checks(false),
+        report_bg_io_stats(false),
+        compression(Snappy_Supported() ? kSnappyCompression : kNoCompression),
+        bottommost_compression(kDisableCompressionOption),
+        last_level_temperature(Temperature::kUnknown),
+        memtable_protection_bytes_per_key(0),
+        sample_for_compression(0) {}
+
+  explicit MutableCFOptions(const Options& options);
+
+  // Must be called after any change to MutableCFOptions
+  void RefreshDerivedOptions(int num_levels, CompactionStyle compaction_style);
+
+  void RefreshDerivedOptions(const ImmutableCFOptions& ioptions) {
+    RefreshDerivedOptions(ioptions.num_levels, ioptions.compaction_style);
+  }
+
+  int MaxBytesMultiplerAdditional(int level) const {
+    if (level >=
+        static_cast<int>(max_bytes_for_level_multiplier_additional.size())) {
+      return 1;
+    }
+    return max_bytes_for_level_multiplier_additional[level];
+  }
+
+  void Dump(Logger* log) const;
+
+  // Memtable related options
+  size_t write_buffer_size;
+  int max_write_buffer_number;
+  size_t arena_block_size;
+  double memtable_prefix_bloom_size_ratio;
+  bool memtable_whole_key_filtering;
+  size_t memtable_huge_page_size;
+  size_t max_successive_merges;
+  size_t inplace_update_num_locks;
+  std::shared_ptr<const SliceTransform> prefix_extractor;
+  // [experimental]
+  // Used to activate or deactive the Mempurge feature (memtable garbage
+  // collection). (deactivated by default). At every flush, the total useful
+  // payload (total entries minus garbage entries) is estimated as a ratio
+  // [useful payload bytes]/[size of a memtable (in bytes)]. This ratio is then
+  // compared to this `threshold` value:
+  //     - if ratio<threshold: the flush is replaced by a mempurge operation
+  //     - else: a regular flush operation takes place.
+  // Threshold values:
+  //   0.0: mempurge deactivated (default).
+  //   1.0: recommended threshold value.
+  //   >1.0 : aggressive mempurge.
+  //   0 < threshold < 1.0: mempurge triggered only for very low useful payload
+  //   ratios.
+  // [experimental]
+  double experimental_mempurge_threshold;
+
+  // Compaction related options
+  bool disable_auto_compactions;
+  uint64_t soft_pending_compaction_bytes_limit;
+  uint64_t hard_pending_compaction_bytes_limit;
+  int level0_file_num_compaction_trigger;
+  int level0_slowdown_writes_trigger;
+  int level0_stop_writes_trigger;
+  uint64_t max_compaction_bytes;
+  bool ignore_max_compaction_bytes_for_input;
+  uint64_t target_file_size_base;
+  int target_file_size_multiplier;
+  uint64_t max_bytes_for_level_base;
+  double max_bytes_for_level_multiplier;
+  uint64_t ttl;
+  uint64_t periodic_compaction_seconds;
+  std::vector<int> max_bytes_for_level_multiplier_additional;
+  CompactionOptionsFIFO compaction_options_fifo;
+  CompactionOptionsUniversal compaction_options_universal;
+
+  // Blob file related options
+  bool enable_blob_files;
+  uint64_t min_blob_size;
+  uint64_t blob_file_size;
+  CompressionType blob_compression_type;
+  bool enable_blob_garbage_collection;
+  double blob_garbage_collection_age_cutoff;
+  double blob_garbage_collection_force_threshold;
+  uint64_t blob_compaction_readahead_size;
+  int blob_file_starting_level;
+  PrepopulateBlobCache prepopulate_blob_cache;
+
+  // Misc options
+  uint64_t max_sequential_skip_in_iterations;
+  bool check_flush_compaction_key_order;
+  bool paranoid_file_checks;
+  bool report_bg_io_stats;
+  CompressionType compression;
+  CompressionType bottommost_compression;
+  CompressionOptions compression_opts;
+  CompressionOptions bottommost_compression_opts;
+  Temperature last_level_temperature;
+  uint32_t memtable_protection_bytes_per_key;
+
+  uint64_t sample_for_compression;
+  std::vector<CompressionType> compression_per_level;
+
+  // Derived options
+  // Per-level target file size.
+  std::vector<uint64_t> max_file_size;
+};
+
+uint64_t MultiplyCheckOverflow(uint64_t op1, double op2);
+
+// Get the max file size in a given level.
+uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options,
+    int level, CompactionStyle compaction_style, int base_level = 1,
+    bool level_compaction_dynamic_level_bytes = false);
+
+// Get the max size of an L0 file for which we will pin its meta-blocks when
+// `pin_l0_filter_and_index_blocks_in_cache` is set.
+size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options);
+
+#ifndef ROCKSDB_LITE
+Status GetStringFromMutableCFOptions(const ConfigOptions& config_options,
+                                     const MutableCFOptions& mutable_opts,
+                                     std::string* opt_string);
+
+Status GetMutableOptionsFromStrings(
+    const MutableCFOptions& base_options,
+    const std::unordered_map<std::string, std::string>& options_map,
+    Logger* info_log, MutableCFOptions* new_options);
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/configurable.cc b/src/rocksdb/options/configurable.cc
new file mode 100644
index 000000000..08aff10fd
--- /dev/null
+++ b/src/rocksdb/options/configurable.cc
@@ -0,0 +1,767 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/configurable.h"
+
+#include "logging/logging.h"
+#include "options/configurable_helper.h"
+#include "options/options_helper.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void Configurable::RegisterOptions(
+    const std::string& name, void* opt_ptr,
+    const std::unordered_map<std::string, OptionTypeInfo>* type_map) {
+  RegisteredOptions opts;
+  opts.name = name;
+#ifndef ROCKSDB_LITE
+  opts.type_map = type_map;
+#else
+  (void)type_map;
+#endif  // ROCKSDB_LITE
+  opts.opt_ptr = opt_ptr;
+  options_.emplace_back(opts);
+}
+
+//*************************************************************************
+//
+//       Methods for Initializing and Validating Configurable Objects
+//
+//*************************************************************************
+
+Status Configurable::PrepareOptions(const ConfigOptions& opts) {
+  // We ignore the invoke_prepare_options here intentionally,
+  // as if you are here, you must have called PrepareOptions explicitly.
+  Status status = Status::OK();
+#ifndef ROCKSDB_LITE
+  for (auto opt_iter : options_) {
+    if (opt_iter.type_map != nullptr) {
+      for (auto map_iter : *(opt_iter.type_map)) {
+        auto& opt_info = map_iter.second;
+        if (opt_info.ShouldPrepare()) {
+          status = opt_info.Prepare(opts, map_iter.first, opt_iter.opt_ptr);
+          if (!status.ok()) {
+            return status;
+          }
+        }
+      }
+    }
+  }
+#else
+  (void)opts;
+#endif  // ROCKSDB_LITE
+  return status;
+}
+
+Status Configurable::ValidateOptions(const DBOptions& db_opts,
+                                     const ColumnFamilyOptions& cf_opts) const {
+  Status status;
+#ifndef ROCKSDB_LITE
+  for (auto opt_iter : options_) {
+    if (opt_iter.type_map != nullptr) {
+      for (auto map_iter : *(opt_iter.type_map)) {
+        auto& opt_info = map_iter.second;
+        if (opt_info.ShouldValidate()) {
+          status = opt_info.Validate(db_opts, cf_opts, map_iter.first,
+                                     opt_iter.opt_ptr);
+          if (!status.ok()) {
+            return status;
+          }
+        }
+      }
+    }
+  }
+#else
+  (void)db_opts;
+  (void)cf_opts;
+#endif  // ROCKSDB_LITE
+  return status;
+}
+
+/*********************************************************************************/
+/*                                                                               */
+/*       Methods for Retrieving Options from Configurables */
+/*                                                                               */
+/*********************************************************************************/
+
+const void* Configurable::GetOptionsPtr(const std::string& name) const {
+  for (auto o : options_) {
+    if (o.name == name) {
+      return o.opt_ptr;
+    }
+  }
+  return nullptr;
+}
+
+std::string Configurable::GetOptionName(const std::string& opt_name) const {
+  return opt_name;
+}
+
+#ifndef ROCKSDB_LITE
+const OptionTypeInfo* ConfigurableHelper::FindOption(
+    const std::vector<Configurable::RegisteredOptions>& options,
+    const std::string& short_name, std::string* opt_name, void** opt_ptr) {
+  for (auto iter : options) {
+    if (iter.type_map != nullptr) {
+      const auto opt_info =
+          OptionTypeInfo::Find(short_name, *(iter.type_map), opt_name);
+      if (opt_info != nullptr) {
+        *opt_ptr = iter.opt_ptr;
+        return opt_info;
+      }
+    }
+  }
+  return nullptr;
+}
+#endif  // ROCKSDB_LITE
+
+//*************************************************************************
+//
+//       Methods for Configuring Options from Strings/Name-Value Pairs/Maps
+//
+//*************************************************************************
+
+Status Configurable::ConfigureFromMap(
+    const ConfigOptions& config_options,
+    const std::unordered_map<std::string, std::string>& opts_map) {
+  Status s = ConfigureFromMap(config_options, opts_map, nullptr);
+  return s;
+}
+
+Status Configurable::ConfigureFromMap(
+    const ConfigOptions& config_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    std::unordered_map<std::string, std::string>* unused) {
+  return ConfigureOptions(config_options, opts_map, unused);
+}
+
+Status Configurable::ConfigureOptions(
+    const ConfigOptions& config_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    std::unordered_map<std::string, std::string>* unused) {
+  std::string curr_opts;
+  Status s;
+  if (!opts_map.empty()) {
+    // There are options in the map.
+    // Save the current configuration in curr_opts and then configure the
+    // options, but do not prepare them now.  We will do all the prepare when
+    // the configuration is complete.
+    ConfigOptions copy = config_options;
+    copy.invoke_prepare_options = false;
+#ifndef ROCKSDB_LITE
+    if (!config_options.ignore_unknown_options) {
+      // If we are not ignoring unused, get the defaults in case we need to
+      // reset
+      copy.depth = ConfigOptions::kDepthDetailed;
+      copy.delimiter = "; ";
+      GetOptionString(copy, &curr_opts).PermitUncheckedError();
+    }
+#endif  // ROCKSDB_LITE
+
+    s = ConfigurableHelper::ConfigureOptions(copy, *this, opts_map, unused);
+  }
+  if (config_options.invoke_prepare_options && s.ok()) {
+    s = PrepareOptions(config_options);
+  }
+#ifndef ROCKSDB_LITE
+  if (!s.ok() && !curr_opts.empty()) {
+    ConfigOptions reset = config_options;
+    reset.ignore_unknown_options = true;
+    reset.invoke_prepare_options = true;
+    reset.ignore_unsupported_options = true;
+    // There are some options to reset from this current error
+    ConfigureFromString(reset, curr_opts).PermitUncheckedError();
+  }
+#endif  // ROCKSDB_LITE
+  return s;
+}
+
+Status Configurable::ParseStringOptions(const ConfigOptions& /*config_options*/,
+                                        const std::string& /*opts_str*/) {
+  return Status::OK();
+}
+
+Status Configurable::ConfigureFromString(const ConfigOptions& config_options,
+                                         const std::string& opts_str) {
+  Status s;
+  if (!opts_str.empty()) {
+#ifndef ROCKSDB_LITE
+    if (opts_str.find(';') != std::string::npos ||
+        opts_str.find('=') != std::string::npos) {
+      std::unordered_map<std::string, std::string> opt_map;
+      s = StringToMap(opts_str, &opt_map);
+      if (s.ok()) {
+        s = ConfigureFromMap(config_options, opt_map, nullptr);
+      }
+    } else {
+#endif  // ROCKSDB_LITE
+      s = ParseStringOptions(config_options, opts_str);
+      if (s.ok() && config_options.invoke_prepare_options) {
+        s = PrepareOptions(config_options);
+      }
+#ifndef ROCKSDB_LITE
+    }
+#endif  // ROCKSDB_LITE
+  } else if (config_options.invoke_prepare_options) {
+    s = PrepareOptions(config_options);
+  } else {
+    s = Status::OK();
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+/**
+ * Sets the value of the named property to the input value, returning OK on
+ * succcess.
+ */
+Status Configurable::ConfigureOption(const ConfigOptions& config_options,
+                                     const std::string& name,
+                                     const std::string& value) {
+  return ConfigurableHelper::ConfigureSingleOption(config_options, *this, name,
+                                                   value);
+}
+
+/**
+ * Looks for the named option amongst the options for this type and sets
+ * the value for it to be the input value.
+ * If the name was found, found_option will be set to true and the resulting
+ * status should be returned.
+ */
+
+Status Configurable::ParseOption(const ConfigOptions& config_options,
+                                 const OptionTypeInfo& opt_info,
+                                 const std::string& opt_name,
+                                 const std::string& opt_value, void* opt_ptr) {
+  if (opt_info.IsMutable()) {
+    if (config_options.mutable_options_only) {
+      // This option is mutable. Treat all of its children as mutable as well
+      ConfigOptions copy = config_options;
+      copy.mutable_options_only = false;
+      return opt_info.Parse(copy, opt_name, opt_value, opt_ptr);
+    } else {
+      return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr);
+    }
+  } else if (config_options.mutable_options_only) {
+    return Status::InvalidArgument("Option not changeable: " + opt_name);
+  } else {
+    return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr);
+  }
+}
+
+#endif  // ROCKSDB_LITE
+
+Status ConfigurableHelper::ConfigureOptions(
+    const ConfigOptions& config_options, Configurable& configurable,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    std::unordered_map<std::string, std::string>* unused) {
+  std::unordered_map<std::string, std::string> remaining = opts_map;
+  Status s = Status::OK();
+  if (!opts_map.empty()) {
+#ifndef ROCKSDB_LITE
+    for (const auto& iter : configurable.options_) {
+      if (iter.type_map != nullptr) {
+        s = ConfigureSomeOptions(config_options, configurable, *(iter.type_map),
+                                 &remaining, iter.opt_ptr);
+        if (remaining.empty()) {  // Are there more options left?
+          break;
+        } else if (!s.ok()) {
+          break;
+        }
+      }
+    }
+#else
+    (void)configurable;
+    if (!config_options.ignore_unknown_options) {
+      s = Status::NotSupported("ConfigureFromMap not supported in LITE mode");
+    }
+#endif  // ROCKSDB_LITE
+  }
+  if (unused != nullptr && !remaining.empty()) {
+    unused->insert(remaining.begin(), remaining.end());
+  }
+  if (config_options.ignore_unknown_options) {
+    s = Status::OK();
+  } else if (s.ok() && unused == nullptr && !remaining.empty()) {
+    s = Status::NotFound("Could not find option: ", remaining.begin()->first);
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+/**
+ * Updates the object with the named-value property values, returning OK on
+ * succcess. Any properties that were found are removed from the options list;
+ * upon return only options that were not found in this opt_map remain.
+
+ * Returns:
+ * -  OK if ignore_unknown_options is set
+ * - InvalidArgument, if any option was invalid
+ * - NotSupported, if any option is unsupported and ignore_unsupported_options
+ is OFF
+ * - OK, if no option was invalid or not supported (or ignored)
+ */
+Status ConfigurableHelper::ConfigureSomeOptions(
+    const ConfigOptions& config_options, Configurable& configurable,
+    const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+    std::unordered_map<std::string, std::string>* options, void* opt_ptr) {
+  Status result = Status::OK();  // The last non-OK result (if any)
+  Status notsup = Status::OK();  // The last NotSupported result (if any)
+  std::string elem_name;
+  int found = 1;
+  std::unordered_set<std::string> unsupported;
+  // While there are unused properties and we processed at least one,
+  // go through the remaining unused properties and attempt to configure them.
+  while (found > 0 && !options->empty()) {
+    found = 0;
+    notsup = Status::OK();
+    for (auto it = options->begin(); it != options->end();) {
+      const std::string& opt_name = configurable.GetOptionName(it->first);
+      const std::string& opt_value = it->second;
+      const auto opt_info =
+          OptionTypeInfo::Find(opt_name, type_map, &elem_name);
+      if (opt_info == nullptr) {  // Did not find the option.  Skip it
+        ++it;
+      } else {
+        Status s = ConfigureOption(config_options, configurable, *opt_info,
+                                   opt_name, elem_name, opt_value, opt_ptr);
+        if (s.IsNotFound()) {
+          ++it;
+        } else if (s.IsNotSupported()) {
+          notsup = s;
+          unsupported.insert(it->first);
+          ++it;  // Skip it for now
+        } else {
+          found++;
+          it = options->erase(it);
+          if (!s.ok()) {
+            result = s;
+          }
+        }
+      }
+    }  // End for all remaining options
+  }    // End while found one or options remain
+
+  // Now that we have been through the list, remove any unsupported
+  for (auto u : unsupported) {
+    auto it = options->find(u);
+    if (it != options->end()) {
+      options->erase(it);
+    }
+  }
+  if (config_options.ignore_unknown_options) {
+    if (!result.ok()) result.PermitUncheckedError();
+    if (!notsup.ok()) notsup.PermitUncheckedError();
+    return Status::OK();
+  } else if (!result.ok()) {
+    if (!notsup.ok()) notsup.PermitUncheckedError();
+    return result;
+  } else if (config_options.ignore_unsupported_options) {
+    if (!notsup.ok()) notsup.PermitUncheckedError();
+    return Status::OK();
+  } else {
+    return notsup;
+  }
+}
+
+Status ConfigurableHelper::ConfigureSingleOption(
+    const ConfigOptions& config_options, Configurable& configurable,
+    const std::string& name, const std::string& value) {
+  const std::string& opt_name = configurable.GetOptionName(name);
+  std::string elem_name;
+  void* opt_ptr = nullptr;
+  const auto opt_info =
+      FindOption(configurable.options_, opt_name, &elem_name, &opt_ptr);
+  if (opt_info == nullptr) {
+    return Status::NotFound("Could not find option: ", name);
+  } else {
+    return ConfigureOption(config_options, configurable, *opt_info, opt_name,
+                           elem_name, value, opt_ptr);
+  }
+}
+Status ConfigurableHelper::ConfigureCustomizableOption(
+    const ConfigOptions& config_options, Configurable& configurable,
+    const OptionTypeInfo& opt_info, const std::string& opt_name,
+    const std::string& name, const std::string& value, void* opt_ptr) {
+  Customizable* custom = opt_info.AsRawPointer<Customizable>(opt_ptr);
+  ConfigOptions copy = config_options;
+  if (opt_info.IsMutable()) {
+    // This option is mutable. Pass that property on to any subsequent calls
+    copy.mutable_options_only = false;
+  }
+
+  if (opt_info.IsMutable() || !config_options.mutable_options_only) {
+    // Either the option is mutable, or we are processing all of the options
+    if (opt_name == name || name == OptionTypeInfo::kIdPropName() ||
+        EndsWith(opt_name, OptionTypeInfo::kIdPropSuffix())) {
+      return configurable.ParseOption(copy, opt_info, name, value, opt_ptr);
+    } else if (value.empty()) {
+      return Status::OK();
+    } else if (custom == nullptr || !StartsWith(name, custom->GetId() + ".")) {
+      return configurable.ParseOption(copy, opt_info, name, value, opt_ptr);
+    } else if (value.find("=") != std::string::npos) {
+      return custom->ConfigureFromString(copy, value);
+    } else {
+      return custom->ConfigureOption(copy, name, value);
+    }
+  } else {
+    // We are processing immutable options, which means that we cannot change
+    // the Customizable object itself, but could change its mutable properties.
+    // Check to make sure that nothing is trying to change the Customizable
+    if (custom == nullptr) {
+      // We do not have a Customizable to configure.  This is OK if the
+      // value is empty (nothing being configured) but an error otherwise
+      if (value.empty()) {
+        return Status::OK();
+      } else {
+        return Status::InvalidArgument("Option not changeable: " + opt_name);
+      }
+    } else if (EndsWith(opt_name, OptionTypeInfo::kIdPropSuffix()) ||
+               name == OptionTypeInfo::kIdPropName()) {
+      // We have a property of the form "id=value" or "table.id=value"
+      // This is OK if we ID/value matches the current customizable object
+      if (custom->GetId() == value) {
+        return Status::OK();
+      } else {
+        return Status::InvalidArgument("Option not changeable: " + opt_name);
+      }
+    } else if (opt_name == name) {
+      // The properties are of one of forms:
+      //    name = { id = id; prop1 = value1; ... }
+      //    name = { prop1=value1; prop2=value2; ... }
+      //    name = ID
+      // Convert the value to a map and extract the ID
+      // If the ID does not match that of the current customizable, return an
+      // error. Otherwise, update the current customizable via the properties
+      // map
+      std::unordered_map<std::string, std::string> props;
+      std::string id;
+      Status s =
+          Configurable::GetOptionsMap(value, custom->GetId(), &id, &props);
+      if (!s.ok()) {
+        return s;
+      } else if (custom->GetId() != id) {
+        return Status::InvalidArgument("Option not changeable: " + opt_name);
+      } else if (props.empty()) {
+        return Status::OK();
+      } else {
+        return custom->ConfigureFromMap(copy, props);
+      }
+    } else {
+      // Attempting to configure one of the properties of the customizable
+      // Let it through
+      return custom->ConfigureOption(copy, name, value);
+    }
+  }
+}
+
+Status ConfigurableHelper::ConfigureOption(
+    const ConfigOptions& config_options, Configurable& configurable,
+    const OptionTypeInfo& opt_info, const std::string& opt_name,
+    const std::string& name, const std::string& value, void* opt_ptr) {
+  if (opt_info.IsCustomizable()) {
+    return ConfigureCustomizableOption(config_options, configurable, opt_info,
+                                       opt_name, name, value, opt_ptr);
+  } else if (opt_name == name) {
+    return configurable.ParseOption(config_options, opt_info, opt_name, value,
+                                    opt_ptr);
+  } else if (opt_info.IsStruct() || opt_info.IsConfigurable()) {
+    return configurable.ParseOption(config_options, opt_info, name, value,
+                                    opt_ptr);
+  } else {
+    return Status::NotFound("Could not find option: ", name);
+  }
+}
+#endif  // ROCKSDB_LITE
+
+//*******************************************************************************
+//
+//       Methods for Converting Options into strings
+//
+//*******************************************************************************
+
+Status Configurable::GetOptionString(const ConfigOptions& config_options,
+                                     std::string* result) const {
+  assert(result);
+  result->clear();
+#ifndef ROCKSDB_LITE
+  return ConfigurableHelper::SerializeOptions(config_options, *this, "",
+                                              result);
+#else
+  (void)config_options;
+  return Status::NotSupported("GetOptionString not supported in LITE mode");
+#endif  // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+std::string Configurable::ToString(const ConfigOptions& config_options,
+                                   const std::string& prefix) const {
+  std::string result = SerializeOptions(config_options, prefix);
+  if (result.empty() || result.find('=') == std::string::npos) {
+    return result;
+  } else {
+    return "{" + result + "}";
+  }
+}
+
+std::string Configurable::SerializeOptions(const ConfigOptions& config_options,
+                                           const std::string& header) const {
+  std::string result;
+  Status s = ConfigurableHelper::SerializeOptions(config_options, *this, header,
+                                                  &result);
+  assert(s.ok());
+  return result;
+}
+
+Status Configurable::GetOption(const ConfigOptions& config_options,
+                               const std::string& name,
+                               std::string* value) const {
+  return ConfigurableHelper::GetOption(config_options, *this,
+                                       GetOptionName(name), value);
+}
+
+Status ConfigurableHelper::GetOption(const ConfigOptions& config_options,
+                                     const Configurable& configurable,
+                                     const std::string& short_name,
+                                     std::string* value) {
+  // Look for option directly
+  assert(value);
+  value->clear();
+
+  std::string opt_name;
+  void* opt_ptr = nullptr;
+  const auto opt_info =
+      FindOption(configurable.options_, short_name, &opt_name, &opt_ptr);
+  if (opt_info != nullptr) {
+    ConfigOptions embedded = config_options;
+    embedded.delimiter = ";";
+    if (short_name == opt_name) {
+      return opt_info->Serialize(embedded, opt_name, opt_ptr, value);
+    } else if (opt_info->IsStruct()) {
+      return opt_info->Serialize(embedded, opt_name, opt_ptr, value);
+    } else if (opt_info->IsConfigurable()) {
+      auto const* config = opt_info->AsRawPointer<Configurable>(opt_ptr);
+      if (config != nullptr) {
+        return config->GetOption(embedded, opt_name, value);
+      }
+    }
+  }
+  return Status::NotFound("Cannot find option: ", short_name);
+}
+
+Status ConfigurableHelper::SerializeOptions(const ConfigOptions& config_options,
+                                            const Configurable& configurable,
+                                            const std::string& prefix,
+                                            std::string* result) {
+  assert(result);
+  for (auto const& opt_iter : configurable.options_) {
+    if (opt_iter.type_map != nullptr) {
+      for (const auto& map_iter : *(opt_iter.type_map)) {
+        const auto& opt_name = map_iter.first;
+        const auto& opt_info = map_iter.second;
+        if (opt_info.ShouldSerialize()) {
+          std::string value;
+          Status s;
+          if (!config_options.mutable_options_only) {
+            s = opt_info.Serialize(config_options, prefix + opt_name,
+                                   opt_iter.opt_ptr, &value);
+          } else if (opt_info.IsMutable()) {
+            ConfigOptions copy = config_options;
+            copy.mutable_options_only = false;
+            s = opt_info.Serialize(copy, prefix + opt_name, opt_iter.opt_ptr,
+                                   &value);
+          } else if (opt_info.IsConfigurable()) {
+            // If it is a Configurable and we are either printing all of the
+            // details or not printing only the name, this option should be
+            // included in the list
+            if (config_options.IsDetailed() ||
+                !opt_info.IsEnabled(OptionTypeFlags::kStringNameOnly)) {
+              s = opt_info.Serialize(config_options, prefix + opt_name,
+                                     opt_iter.opt_ptr, &value);
+            }
+          }
+          if (!s.ok()) {
+            return s;
+          } else if (!value.empty()) {
+            // <prefix><opt_name>=<value><delimiter>
+            result->append(prefix + opt_name + "=" + value +
+                           config_options.delimiter);
+          }
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+#endif  // ROCKSDB_LITE
+
+//********************************************************************************
+//
+// Methods for listing the options from Configurables
+//
+//********************************************************************************
+#ifndef ROCKSDB_LITE
+Status Configurable::GetOptionNames(
+    const ConfigOptions& config_options,
+    std::unordered_set<std::string>* result) const {
+  assert(result);
+  return ConfigurableHelper::ListOptions(config_options, *this, "", result);
+}
+
+Status ConfigurableHelper::ListOptions(
+    const ConfigOptions& config_options, const Configurable& configurable,
+    const std::string& prefix, std::unordered_set<std::string>* result) {
+  Status status;
+  for (auto const& opt_iter : configurable.options_) {
+    if (opt_iter.type_map != nullptr) {
+      for (const auto& map_iter : *(opt_iter.type_map)) {
+        const auto& opt_name = map_iter.first;
+        const auto& opt_info = map_iter.second;
+        // If the option is no longer used in rocksdb and marked as deprecated,
+        // we skip it in the serialization.
+        if (!opt_info.IsDeprecated() && !opt_info.IsAlias()) {
+          if (!config_options.mutable_options_only) {
+            result->emplace(prefix + opt_name);
+          } else if (opt_info.IsMutable()) {
+            result->emplace(prefix + opt_name);
+          }
+        }
+      }
+    }
+  }
+  return status;
+}
+#endif  // ROCKSDB_LITE
+
+//*******************************************************************************
+//
+//       Methods for Comparing Configurables
+//
+//*******************************************************************************
+
+bool Configurable::AreEquivalent(const ConfigOptions& config_options,
+                                 const Configurable* other,
+                                 std::string* name) const {
+  assert(name);
+  name->clear();
+  if (this == other || config_options.IsCheckDisabled()) {
+    return true;
+  } else if (other != nullptr) {
+#ifndef ROCKSDB_LITE
+    return ConfigurableHelper::AreEquivalent(config_options, *this, *other,
+                                             name);
+#else
+    return true;
+#endif  // ROCKSDB_LITE
+  } else {
+    return false;
+  }
+}
+
+#ifndef ROCKSDB_LITE
+bool Configurable::OptionsAreEqual(const ConfigOptions& config_options,
+                                   const OptionTypeInfo& opt_info,
+                                   const std::string& opt_name,
+                                   const void* const this_ptr,
+                                   const void* const that_ptr,
+                                   std::string* mismatch) const {
+  if (opt_info.AreEqual(config_options, opt_name, this_ptr, that_ptr,
+                        mismatch)) {
+    return true;
+  } else if (opt_info.AreEqualByName(config_options, opt_name, this_ptr,
+                                     that_ptr)) {
+    *mismatch = "";
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool ConfigurableHelper::AreEquivalent(const ConfigOptions& config_options,
+                                       const Configurable& this_one,
+                                       const Configurable& that_one,
+                                       std::string* mismatch) {
+  assert(mismatch != nullptr);
+  for (auto const& o : this_one.options_) {
+    const auto this_offset = this_one.GetOptionsPtr(o.name);
+    const auto that_offset = that_one.GetOptionsPtr(o.name);
+    if (this_offset != that_offset) {
+      if (this_offset == nullptr || that_offset == nullptr) {
+        return false;
+      } else if (o.type_map != nullptr) {
+        for (const auto& map_iter : *(o.type_map)) {
+          const auto& opt_info = map_iter.second;
+          if (config_options.IsCheckEnabled(opt_info.GetSanityLevel())) {
+            if (!config_options.mutable_options_only) {
+              if (!this_one.OptionsAreEqual(config_options, opt_info,
+                                            map_iter.first, this_offset,
+                                            that_offset, mismatch)) {
+                return false;
+              }
+            } else if (opt_info.IsMutable()) {
+              ConfigOptions copy = config_options;
+              copy.mutable_options_only = false;
+              if (!this_one.OptionsAreEqual(copy, opt_info, map_iter.first,
+                                            this_offset, that_offset,
+                                            mismatch)) {
+                return false;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+#endif  // ROCKSDB_LITE
+
+Status Configurable::GetOptionsMap(
+    const std::string& value, const std::string& default_id, std::string* id,
+    std::unordered_map<std::string, std::string>* props) {
+  assert(id);
+  assert(props);
+  Status status;
+  if (value.empty() || value == kNullptrString) {
+    *id = default_id;
+  } else if (value.find('=') == std::string::npos) {
+    *id = value;
+#ifndef ROCKSDB_LITE
+  } else {
+    status = StringToMap(value, props);
+    if (!status.ok()) {       // There was an error creating the map.
+      *id = value;            // Treat the value as id
+      props->clear();         // Clear the properties
+      status = Status::OK();  // and ignore the error
+    } else {
+      auto iter = props->find(OptionTypeInfo::kIdPropName());
+      if (iter != props->end()) {
+        *id = iter->second;
+        props->erase(iter);
+        if (*id == kNullptrString) {
+          id->clear();
+        }
+      } else if (!default_id.empty()) {
+        *id = default_id;
+      } else {           // No id property and no default
+        *id = value;     // Treat the value as id
+        props->clear();  // Clear the properties
+      }
+    }
+#else
+  } else {
+    *id = value;
+    props->clear();
+#endif
+  }
+  return status;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/configurable_helper.h b/src/rocksdb/options/configurable_helper.h
new file mode 100644
index 000000000..0f5f918cb
--- /dev/null
+++ b/src/rocksdb/options/configurable_helper.h
@@ -0,0 +1,187 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "rocksdb/configurable.h"
+#include "rocksdb/convenience.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Helper class defining static methods for supporting the Configurable
+// class.  The purpose of this class is to keep the Configurable class
+// as tight as possible and provide methods for doing the actual work
+// of configuring the objects.
+class ConfigurableHelper {
+ public:
+  // Configures the input Configurable object based on the parameters.
+  // On successful completion, the Configurable is updated with the settings
+  // from the opt_map.
+  //
+  // The acceptable values of the name/value pairs are documented with the
+  // specific class/instance.
+  //
+  // @param config_options Controls how the arguments are processed.
+  // @param opt_map Name/value pairs of the options to update
+  // @param unused If specified, this value will return the name/value
+  //      pairs from opt_map that were NotFound for this object.
+  // @return OK If all values in the map were successfully updated
+  // @return NotFound If any of the names in the opt_map were not valid
+  //      for this object.  If unused is specified, it will contain the
+  //      collection of NotFound entries
+  // @return NotSupported  If any of the names are valid but the object does
+  //       not know how to convert the value.  This can happen if, for example,
+  //       there is some nested Configurable that cannot be created.
+  // @return InvalidArgument If any of the values cannot be successfully
+  //       parsed.  This can also be returned if PrepareOptions encounters an
+  //       error.
+  static Status ConfigureOptions(
+      const ConfigOptions& config_options, Configurable& configurable,
+      const std::unordered_map<std::string, std::string>& options,
+      std::unordered_map<std::string, std::string>* unused);
+
+#ifndef ROCKSDB_LITE
+  // Internal method to configure a set of options for this object.
+  // Classes may override this value to change its behavior.
+  // @param config_options Controls how the options are being configured
+  // @param type_name The name that was registered for this set of options
+  // @param type_map The map of options for this name
+  // @param opt_ptr Pointer to the object being configured for this option set.
+  // @param options The option name/values being updated.  On return, any
+  //    option that was found is removed from the list.
+  // @return OK If all of the options were successfully updated.
+  // @return InvalidArgument If an option was found but the value could not
+  //       be updated.
+  // @return NotFound If an option name was not found in type_mape
+  // @return NotSupported If the option was found but no rule for converting
+  //       the value could be found.
+  static Status ConfigureSomeOptions(
+      const ConfigOptions& config_options, Configurable& configurable,
+      const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+      std::unordered_map<std::string, std::string>* options, void* opt_ptr);
+
+  // Configures a single option in the input Configurable.
+  // This method will look through the set of option names for this
+  // Configurable searching for one with the input name.  If such an option
+  // is found, it will be configured via the input value.
+  //
+  // @param config_options Controls how the option is being configured
+  // @param configurable The object to configure
+  // @param name For options with sub-options (like Structs or
+  // Configurables),
+  //      this value may be the name of the sub-field of the option being
+  //      updated. For example, if the option is
+  //      "compaction_options_fifo.allow_compaction", then field name would be
+  //      "allow_compaction".  For most options, field_name and opt_name will be
+  //      equivalent.
+  // @param value The new value for this option.
+  // @param See ConfigureOptions for the possible return values
+  static Status ConfigureSingleOption(const ConfigOptions& config_options,
+                                      Configurable& configurable,
+                                      const std::string& name,
+                                      const std::string& value);
+
+  // Configures the option referenced by opt_info for this configurable
+  // This method configures the option based on opt_info for the input
+  // configurable.
+  // @param config_options Controls how the option is being configured
+  // @param configurable The object to configure
+  // @param opt_name The full option name
+  // @param name For options with sub-options (like Structs or
+  // Configurables),
+  //      this value may be the name of the sub-field of the option being
+  //      updated. For example, if the option is
+  //      "compaction_options_fifo.allow_compaction", then field name would be
+  //      "allow_compaction".  For most options, field_name and opt_name will be
+  //      equivalent.
+  // @param value The new value for this option.
+  // @param See ConfigureOptions for the possible return values
+  static Status ConfigureOption(const ConfigOptions& config_options,
+                                Configurable& configurable,
+                                const OptionTypeInfo& opt_info,
+                                const std::string& opt_name,
+                                const std::string& name,
+                                const std::string& value, void* opt_ptr);
+
+  // Returns the value of the option associated with the input name
+  // This method is the functional inverse of ConfigureOption
+  // @param config_options Controls how the value is returned
+  // @param configurable The object from which to get the option.
+  // @param name The name of the option to return a value for.
+  // @param value The returned value associated with the named option.
+  //              Note that value will be only the serialized version
+  //              of the option and not "name=value"
+  // @return OK If the named field was successfully updated to value.
+  // @return NotFound If the name is not valid for this object.
+  // @param InvalidArgument If the name is valid for this object but
+  //      its value cannot be serialized.
+  static Status GetOption(const ConfigOptions& config_options,
+                          const Configurable& configurable,
+                          const std::string& name, std::string* value);
+
+  // Serializes the input Configurable into the output result.
+  // This is the inverse of ConfigureOptions
+  // @param config_options Controls how serialization happens.
+  // @param configurable The object to serialize
+  // @param prefix A prefix to add to the each option as it is serialized.
+  // @param result The string representation of the configurable.
+  // @return OK If the options for this object wer successfully serialized.
+  // @return InvalidArgument If one or more of the options could not be
+  // serialized.
+  static Status SerializeOptions(const ConfigOptions& config_options,
+                                 const Configurable& configurable,
+                                 const std::string& prefix,
+                                 std::string* result);
+
+  // Internal method to list the option names for this object.
+  // Classes may override this value to change its behavior.
+  // @see ListOptions for more details
+  static Status ListOptions(const ConfigOptions& config_options,
+                            const Configurable& configurable,
+                            const std::string& prefix,
+                            std::unordered_set<std::string>* result);
+
+  // Checks to see if the two configurables are equivalent to one other.
+  // This method assumes that the two objects are of the same class.
+  // @param config_options Controls how the options are compared.
+  // @param this_one The object to compare to.
+  // @param that_one The other object being compared.
+  // @param mismatch If the objects do not match, this parameter contains
+  //      the name of the option that triggered the match failure.
+  // @param True if the objects match, false otherwise.
+  static bool AreEquivalent(const ConfigOptions& config_options,
+                            const Configurable& this_one,
+                            const Configurable& that_one,
+                            std::string* mismatch);
+
+ private:
+  // Looks for the option specified by name in the RegisteredOptions.
+  // This method traverses the types in the input options vector.  If an entry
+  // matching name is found, that entry, opt_name, and pointer are returned.
+  // @param options  The vector of options to search through
+  // @param name     The name of the option to search for in the OptionType map
+  // @param opt_name If the name was found, this value is set to the option name
+  //                 associated with the input name/type.
+  // @param opt_ptr  If the name was found, this value is set to the option
+  // pointer
+  //                 in the RegisteredOptions vector associated with this entry
+  // @return         A pointer to the OptionTypeInfo from the options if found,
+  //                 nullptr if the name was not found in the input options
+  static const OptionTypeInfo* FindOption(
+      const std::vector<Configurable::RegisteredOptions>& options,
+      const std::string& name, std::string* opt_name, void** opt_ptr);
+
+  static Status ConfigureCustomizableOption(
+      const ConfigOptions& config_options, Configurable& configurable,
+      const OptionTypeInfo& opt_info, const std::string& opt_name,
+      const std::string& name, const std::string& value, void* opt_ptr);
+#endif  // ROCKSDB_LITE
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/configurable_test.cc b/src/rocksdb/options/configurable_test.cc
new file mode 100644
index 000000000..6ec02cf3a
--- /dev/null
+++ b/src/rocksdb/options/configurable_test.cc
@@ -0,0 +1,881 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "options/configurable_test.h"
+
+#include <cctype>
+#include <cinttypes>
+#include <cstring>
+#include <unordered_map>
+
+#include "options/configurable_helper.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "rocksdb/configurable.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+#ifndef GFLAGS
+bool FLAGS_enable_print = false;
+#else
+#include "util/gflags_compat.h"
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+#endif  // GFLAGS
+
+namespace ROCKSDB_NAMESPACE {
+namespace test {
+class StringLogger : public Logger {
+ public:
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override {
+    char buffer[1000];
+    vsnprintf(buffer, sizeof(buffer), format, ap);
+    string_.append(buffer);
+  }
+  const std::string& str() const { return string_; }
+  void clear() { string_.clear(); }
+
+ private:
+  std::string string_;
+};
+static std::unordered_map<std::string, OptionTypeInfo> struct_option_info = {
+#ifndef ROCKSDB_LITE
+    {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0,
+                                      OptionVerificationType::kNormal,
+                                      OptionTypeFlags::kMutable)},
+#endif  // ROCKSDB_LITE
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> imm_struct_option_info =
+    {
+#ifndef ROCKSDB_LITE
+        {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0,
+                                          OptionVerificationType::kNormal,
+                                          OptionTypeFlags::kNone)},
+#endif  // ROCKSDB_LITE
+};
+
+class SimpleConfigurable : public TestConfigurable<Configurable> {
+ public:
+  static SimpleConfigurable* Create(
+      const std::string& name = "simple",
+      int mode = TestConfigMode::kDefaultMode,
+      const std::unordered_map<std::string, OptionTypeInfo>* map =
+          &simple_option_info) {
+    return new SimpleConfigurable(name, mode, map);
+  }
+
+  SimpleConfigurable(const std::string& name, int mode,
+                     const std::unordered_map<std::string, OptionTypeInfo>*
+                         map = &simple_option_info)
+      : TestConfigurable(name, mode, map) {
+    if ((mode & TestConfigMode::kUniqueMode) != 0) {
+      unique_.reset(SimpleConfigurable::Create("Unique" + name_));
+      RegisterOptions(name_ + "Unique", &unique_, &unique_option_info);
+    }
+    if ((mode & TestConfigMode::kSharedMode) != 0) {
+      shared_.reset(SimpleConfigurable::Create("Shared" + name_));
+      RegisterOptions(name_ + "Shared", &shared_, &shared_option_info);
+    }
+    if ((mode & TestConfigMode::kRawPtrMode) != 0) {
+      pointer_ = SimpleConfigurable::Create("Pointer" + name_);
+      RegisterOptions(name_ + "Pointer", &pointer_, &pointer_option_info);
+    }
+  }
+
+};  // End class SimpleConfigurable
+
+using ConfigTestFactoryFunc = std::function<Configurable*()>;
+
+class ConfigurableTest : public testing::Test {
+ public:
+  ConfigurableTest() { config_options_.invoke_prepare_options = false; }
+
+  ConfigOptions config_options_;
+};
+
+TEST_F(ConfigurableTest, GetOptionsPtrTest) {
+  std::string opt_str;
+  std::unique_ptr<Configurable> configurable(SimpleConfigurable::Create());
+  ASSERT_NE(configurable->GetOptions<TestOptions>("simple"), nullptr);
+  ASSERT_EQ(configurable->GetOptions<TestOptions>("bad-opt"), nullptr);
+}
+
+TEST_F(ConfigurableTest, ConfigureFromMapTest) {
+  std::unique_ptr<Configurable> configurable(SimpleConfigurable::Create());
+  auto* opts = configurable->GetOptions<TestOptions>("simple");
+  ASSERT_OK(configurable->ConfigureFromMap(config_options_, {}));
+  ASSERT_NE(opts, nullptr);
+#ifndef ROCKSDB_LITE
+  std::unordered_map<std::string, std::string> options_map = {
+      {"int", "1"}, {"bool", "true"}, {"string", "string"}};
+  ASSERT_OK(configurable->ConfigureFromMap(config_options_, options_map));
+  ASSERT_EQ(opts->i, 1);
+  ASSERT_EQ(opts->b, true);
+  ASSERT_EQ(opts->s, "string");
+#endif
+}
+
+TEST_F(ConfigurableTest, ConfigureFromStringTest) {
+  std::unique_ptr<Configurable> configurable(SimpleConfigurable::Create());
+  auto* opts = configurable->GetOptions<TestOptions>("simple");
+  ASSERT_OK(configurable->ConfigureFromString(config_options_, ""));
+  ASSERT_NE(opts, nullptr);
+#ifndef ROCKSDB_LITE  // GetOptionsFromMap is not supported in ROCKSDB_LITE
+  ASSERT_OK(configurable->ConfigureFromString(config_options_,
+                                              "int=1;bool=true;string=s"));
+  ASSERT_EQ(opts->i, 1);
+  ASSERT_EQ(opts->b, true);
+  ASSERT_EQ(opts->s, "s");
+#endif
+}
+
+#ifndef ROCKSDB_LITE  // GetOptionsFromMap is not supported in ROCKSDB_LITE
+TEST_F(ConfigurableTest, ConfigureIgnoreTest) {
+  std::unique_ptr<Configurable> configurable(SimpleConfigurable::Create());
+  std::unordered_map<std::string, std::string> options_map = {{"unused", "u"}};
+  ConfigOptions ignore = config_options_;
+  ignore.ignore_unknown_options = true;
+  ASSERT_NOK(configurable->ConfigureFromMap(config_options_, options_map));
+  ASSERT_OK(configurable->ConfigureFromMap(ignore, options_map));
+  ASSERT_NOK(configurable->ConfigureFromString(config_options_, "unused=u"));
+  ASSERT_OK(configurable->ConfigureFromString(ignore, "unused=u"));
+}
+
+TEST_F(ConfigurableTest, ConfigureNestedOptionsTest) {
+  std::unique_ptr<Configurable> base, copy;
+  std::string opt_str;
+  std::string mismatch;
+
+  base.reset(SimpleConfigurable::Create("simple", TestConfigMode::kAllOptMode));
+  copy.reset(SimpleConfigurable::Create("simple", TestConfigMode::kAllOptMode));
+  ASSERT_OK(base->ConfigureFromString(config_options_,
+                                      "shared={int=10; string=10};"
+                                      "unique={int=20; string=20};"
+                                      "pointer={int=30; string=30};"));
+  ASSERT_OK(base->GetOptionString(config_options_, &opt_str));
+  ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str));
+  ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+TEST_F(ConfigurableTest, GetOptionsTest) {
+  std::unique_ptr<Configurable> simple;
+
+  simple.reset(
+      SimpleConfigurable::Create("simple", TestConfigMode::kAllOptMode));
+  int i = 11;
+  for (auto opt : {"", "shared.", "unique.", "pointer."}) {
+    std::string value;
+    std::string expected = std::to_string(i);
+    std::string opt_name = opt;
+    ASSERT_OK(
+        simple->ConfigureOption(config_options_, opt_name + "int", expected));
+    ASSERT_OK(simple->GetOption(config_options_, opt_name + "int", &value));
+    ASSERT_EQ(expected, value);
+    ASSERT_OK(simple->ConfigureOption(config_options_, opt_name + "string",
+                                      expected));
+    ASSERT_OK(simple->GetOption(config_options_, opt_name + "string", &value));
+    ASSERT_EQ(expected, value);
+
+    ASSERT_NOK(
+        simple->ConfigureOption(config_options_, opt_name + "bad", expected));
+    ASSERT_NOK(simple->GetOption(config_options_, "bad option", &value));
+    ASSERT_TRUE(value.empty());
+    i += 11;
+  }
+}
+
+TEST_F(ConfigurableTest, ConfigureBadOptionsTest) {
+  std::unique_ptr<Configurable> configurable(SimpleConfigurable::Create());
+  auto* opts = configurable->GetOptions<TestOptions>("simple");
+  ASSERT_NE(opts, nullptr);
+  ASSERT_OK(configurable->ConfigureOption(config_options_, "int", "42"));
+  ASSERT_EQ(opts->i, 42);
+  ASSERT_NOK(configurable->ConfigureOption(config_options_, "int", "fred"));
+  ASSERT_NOK(configurable->ConfigureOption(config_options_, "bool", "fred"));
+  ASSERT_NOK(
+      configurable->ConfigureFromString(config_options_, "int=33;unused=u"));
+  ASSERT_EQ(opts->i, 42);
+}
+
+TEST_F(ConfigurableTest, InvalidOptionTest) {
+  std::unique_ptr<Configurable> configurable(SimpleConfigurable::Create());
+  std::unordered_map<std::string, std::string> options_map = {
+      {"bad-option", "bad"}};
+  ASSERT_NOK(configurable->ConfigureFromMap(config_options_, options_map));
+  ASSERT_NOK(
+      configurable->ConfigureFromString(config_options_, "bad-option=bad"));
+  ASSERT_NOK(
+      configurable->ConfigureOption(config_options_, "bad-option", "bad"));
+}
+
+static std::unordered_map<std::string, OptionTypeInfo> validated_option_info = {
+#ifndef ROCKSDB_LITE
+    {"validated",
+     {0, OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+static std::unordered_map<std::string, OptionTypeInfo> prepared_option_info = {
+#ifndef ROCKSDB_LITE
+    {"prepared",
+     {0, OptionType::kInt, OptionVerificationType::kNormal,
+      OptionTypeFlags::kMutable}},
+#endif  // ROCKSDB_LITE
+};
+static std::unordered_map<std::string, OptionTypeInfo>
+    dont_prepare_option_info = {
+#ifndef ROCKSDB_LITE
+        {"unique",
+         {0, OptionType::kConfigurable, OptionVerificationType::kNormal,
+          (OptionTypeFlags::kUnique | OptionTypeFlags::kDontPrepare)}},
+
+#endif  // ROCKSDB_LITE
+};
+
+class ValidatedConfigurable : public SimpleConfigurable {
+ public:
+  ValidatedConfigurable(const std::string& name, unsigned char mode,
+                        bool dont_prepare = false)
+      : SimpleConfigurable(name, TestConfigMode::kDefaultMode),
+        validated(false),
+        prepared(0) {
+    RegisterOptions("Validated", &validated, &validated_option_info);
+    RegisterOptions("Prepared", &prepared, &prepared_option_info);
+    if ((mode & TestConfigMode::kUniqueMode) != 0) {
+      unique_.reset(new ValidatedConfigurable(
+          "Unique" + name_, TestConfigMode::kDefaultMode, false));
+      if (dont_prepare) {
+        RegisterOptions(name_ + "Unique", &unique_, &dont_prepare_option_info);
+      } else {
+        RegisterOptions(name_ + "Unique", &unique_, &unique_option_info);
+      }
+    }
+  }
+
+  Status PrepareOptions(const ConfigOptions& config_options) override {
+    if (++prepared <= 0) {
+      return Status::InvalidArgument("Cannot prepare option");
+    } else {
+      return SimpleConfigurable::PrepareOptions(config_options);
+    }
+  }
+
+  Status ValidateOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override {
+    if (!validated) {
+      return Status::InvalidArgument("Not Validated");
+    } else {
+      return SimpleConfigurable::ValidateOptions(db_opts, cf_opts);
+    }
+  }
+
+ private:
+  bool validated;
+  int prepared;
+};
+
+TEST_F(ConfigurableTest, ValidateOptionsTest) {
+  std::unique_ptr<Configurable> configurable(
+      new ValidatedConfigurable("validated", TestConfigMode::kDefaultMode));
+  ColumnFamilyOptions cf_opts;
+  DBOptions db_opts;
+  ASSERT_OK(
+      configurable->ConfigureOption(config_options_, "validated", "false"));
+  ASSERT_NOK(configurable->ValidateOptions(db_opts, cf_opts));
+  ASSERT_OK(
+      configurable->ConfigureOption(config_options_, "validated", "true"));
+  ASSERT_OK(configurable->ValidateOptions(db_opts, cf_opts));
+}
+
+TEST_F(ConfigurableTest, PrepareOptionsTest) {
+  std::unique_ptr<Configurable> c(
+      new ValidatedConfigurable("Simple", TestConfigMode::kUniqueMode, false));
+  auto cp = c->GetOptions<int>("Prepared");
+  auto u = c->GetOptions<std::unique_ptr<Configurable>>("SimpleUnique");
+  auto up = u->get()->GetOptions<int>("Prepared");
+  config_options_.invoke_prepare_options = false;
+
+  ASSERT_NE(cp, nullptr);
+  ASSERT_NE(up, nullptr);
+  ASSERT_EQ(*cp, 0);
+  ASSERT_EQ(*up, 0);
+  ASSERT_OK(c->ConfigureFromMap(config_options_, {}));
+  ASSERT_EQ(*cp, 0);
+  ASSERT_EQ(*up, 0);
+  config_options_.invoke_prepare_options = true;
+  ASSERT_OK(c->ConfigureFromMap(config_options_, {}));
+  ASSERT_EQ(*cp, 1);
+  ASSERT_EQ(*up, 1);
+  ASSERT_OK(c->ConfigureFromString(config_options_, "prepared=0"));
+  ASSERT_EQ(*up, 2);
+  ASSERT_EQ(*cp, 1);
+
+  ASSERT_NOK(c->ConfigureFromString(config_options_, "prepared=-2"));
+
+  c.reset(
+      new ValidatedConfigurable("Simple", TestConfigMode::kUniqueMode, true));
+  cp = c->GetOptions<int>("Prepared");
+  u = c->GetOptions<std::unique_ptr<Configurable>>("SimpleUnique");
+  up = u->get()->GetOptions<int>("Prepared");
+
+  ASSERT_OK(c->ConfigureFromString(config_options_, "prepared=0"));
+  ASSERT_EQ(*cp, 1);
+  ASSERT_EQ(*up, 0);
+}
+
+TEST_F(ConfigurableTest, CopyObjectTest) {
+  class CopyConfigurable : public Configurable {
+   public:
+    CopyConfigurable() : prepared_(0), validated_(0) {}
+    Status PrepareOptions(const ConfigOptions& options) override {
+      prepared_++;
+      return Configurable::PrepareOptions(options);
+    }
+    Status ValidateOptions(const DBOptions& db_opts,
+                           const ColumnFamilyOptions& cf_opts) const override {
+      validated_++;
+      return Configurable::ValidateOptions(db_opts, cf_opts);
+    }
+    int prepared_;
+    mutable int validated_;
+  };
+
+  CopyConfigurable c1;
+  ConfigOptions config_options;
+  Options options;
+
+  ASSERT_OK(c1.PrepareOptions(config_options));
+  ASSERT_OK(c1.ValidateOptions(options, options));
+  ASSERT_EQ(c1.prepared_, 1);
+  ASSERT_EQ(c1.validated_, 1);
+  CopyConfigurable c2 = c1;
+  ASSERT_OK(c1.PrepareOptions(config_options));
+  ASSERT_OK(c1.ValidateOptions(options, options));
+  ASSERT_EQ(c2.prepared_, 1);
+  ASSERT_EQ(c2.validated_, 1);
+  ASSERT_EQ(c1.prepared_, 2);
+  ASSERT_EQ(c1.validated_, 2);
+}
+
+TEST_F(ConfigurableTest, MutableOptionsTest) {
+  static std::unordered_map<std::string, OptionTypeInfo> imm_option_info = {
+#ifndef ROCKSDB_LITE
+      {"imm", OptionTypeInfo::Struct("imm", &simple_option_info, 0,
+                                     OptionVerificationType::kNormal,
+                                     OptionTypeFlags::kNone)},
+#endif  // ROCKSDB_LITE
+  };
+
+  class MutableConfigurable : public SimpleConfigurable {
+   public:
+    MutableConfigurable()
+        : SimpleConfigurable("mutable", TestConfigMode::kDefaultMode |
+                                            TestConfigMode::kUniqueMode |
+                                            TestConfigMode::kSharedMode) {
+      RegisterOptions("struct", &options_, &struct_option_info);
+      RegisterOptions("imm", &options_, &imm_option_info);
+    }
+  };
+  MutableConfigurable mc;
+  ConfigOptions options = config_options_;
+
+  ASSERT_OK(mc.ConfigureOption(options, "bool", "true"));
+  ASSERT_OK(mc.ConfigureOption(options, "int", "42"));
+  auto* opts = mc.GetOptions<TestOptions>("mutable");
+  ASSERT_NE(opts, nullptr);
+  ASSERT_EQ(opts->i, 42);
+  ASSERT_EQ(opts->b, true);
+  ASSERT_OK(mc.ConfigureOption(options, "struct", "{bool=false;}"));
+  ASSERT_OK(mc.ConfigureOption(options, "imm", "{int=55;}"));
+
+  options.mutable_options_only = true;
+
+  // Now only mutable options should be settable.
+  ASSERT_NOK(mc.ConfigureOption(options, "bool", "true"));
+  ASSERT_OK(mc.ConfigureOption(options, "int", "24"));
+  ASSERT_EQ(opts->i, 24);
+  ASSERT_EQ(opts->b, false);
+  ASSERT_NOK(mc.ConfigureFromString(options, "bool=false;int=33;"));
+  ASSERT_EQ(opts->i, 24);
+  ASSERT_EQ(opts->b, false);
+
+  // Setting options through an immutable struct fails
+  ASSERT_NOK(mc.ConfigureOption(options, "imm", "{int=55;}"));
+  ASSERT_NOK(mc.ConfigureOption(options, "imm.int", "55"));
+  ASSERT_EQ(opts->i, 24);
+  ASSERT_EQ(opts->b, false);
+
+  // Setting options through an mutable struct succeeds
+  ASSERT_OK(mc.ConfigureOption(options, "struct", "{int=44;}"));
+  ASSERT_EQ(opts->i, 44);
+  ASSERT_OK(mc.ConfigureOption(options, "struct.int", "55"));
+  ASSERT_EQ(opts->i, 55);
+
+  // Setting nested immutable configurable options fail
+  ASSERT_NOK(mc.ConfigureOption(options, "shared", "{bool=true;}"));
+  ASSERT_NOK(mc.ConfigureOption(options, "shared.bool", "true"));
+
+  // Setting nested mutable configurable options succeeds
+  ASSERT_OK(mc.ConfigureOption(options, "unique", "{bool=true}"));
+  ASSERT_OK(mc.ConfigureOption(options, "unique.bool", "true"));
+}
+
+TEST_F(ConfigurableTest, DeprecatedOptionsTest) {
+  static std::unordered_map<std::string, OptionTypeInfo>
+      deprecated_option_info = {
+          {"deprecated",
+           {offsetof(struct TestOptions, b), OptionType::kBoolean,
+            OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}};
+  std::unique_ptr<Configurable> orig;
+  orig.reset(SimpleConfigurable::Create("simple", TestConfigMode::kDefaultMode,
+                                        &deprecated_option_info));
+  auto* opts = orig->GetOptions<TestOptions>("simple");
+  ASSERT_NE(opts, nullptr);
+  opts->d = true;
+  ASSERT_OK(orig->ConfigureOption(config_options_, "deprecated", "false"));
+  ASSERT_TRUE(opts->d);
+  ASSERT_OK(orig->ConfigureFromString(config_options_, "deprecated=false"));
+  ASSERT_TRUE(opts->d);
+}
+
+TEST_F(ConfigurableTest, AliasOptionsTest) {
+  static std::unordered_map<std::string, OptionTypeInfo> alias_option_info = {
+      {"bool",
+       {offsetof(struct TestOptions, b), OptionType::kBoolean,
+        OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+      {"alias",
+       {offsetof(struct TestOptions, b), OptionType::kBoolean,
+        OptionVerificationType::kAlias, OptionTypeFlags::kNone, 0}}};
+  std::unique_ptr<Configurable> orig;
+  orig.reset(SimpleConfigurable::Create("simple", TestConfigMode::kDefaultMode,
+                                        &alias_option_info));
+  auto* opts = orig->GetOptions<TestOptions>("simple");
+  ASSERT_NE(opts, nullptr);
+  ASSERT_OK(orig->ConfigureOption(config_options_, "bool", "false"));
+  ASSERT_FALSE(opts->b);
+  ASSERT_OK(orig->ConfigureOption(config_options_, "alias", "true"));
+  ASSERT_TRUE(opts->b);
+  std::string opts_str;
+  ASSERT_OK(orig->GetOptionString(config_options_, &opts_str));
+  ASSERT_EQ(opts_str.find("alias"), std::string::npos);
+
+  ASSERT_OK(orig->ConfigureOption(config_options_, "bool", "false"));
+  ASSERT_FALSE(opts->b);
+  ASSERT_OK(orig->GetOption(config_options_, "alias", &opts_str));
+  ASSERT_EQ(opts_str, "false");
+}
+
+TEST_F(ConfigurableTest, NestedUniqueConfigTest) {
+  std::unique_ptr<Configurable> simple;
+  simple.reset(
+      SimpleConfigurable::Create("Outer", TestConfigMode::kAllOptMode));
+  const auto outer = simple->GetOptions<TestOptions>("Outer");
+  const auto unique =
+      simple->GetOptions<std::unique_ptr<Configurable>>("OuterUnique");
+  ASSERT_NE(outer, nullptr);
+  ASSERT_NE(unique, nullptr);
+  ASSERT_OK(
+      simple->ConfigureFromString(config_options_, "int=24;string=outer"));
+  ASSERT_OK(simple->ConfigureFromString(config_options_,
+                                        "unique={int=42;string=nested}"));
+  const auto inner = unique->get()->GetOptions<TestOptions>("UniqueOuter");
+  ASSERT_NE(inner, nullptr);
+  ASSERT_EQ(outer->i, 24);
+  ASSERT_EQ(outer->s, "outer");
+  ASSERT_EQ(inner->i, 42);
+  ASSERT_EQ(inner->s, "nested");
+}
+
+TEST_F(ConfigurableTest, NestedSharedConfigTest) {
+  std::unique_ptr<Configurable> simple;
+  simple.reset(SimpleConfigurable::Create(
+      "Outer", TestConfigMode::kDefaultMode | TestConfigMode::kSharedMode));
+  ASSERT_OK(
+      simple->ConfigureFromString(config_options_, "int=24;string=outer"));
+  ASSERT_OK(simple->ConfigureFromString(config_options_,
+                                        "shared={int=42;string=nested}"));
+  const auto outer = simple->GetOptions<TestOptions>("Outer");
+  const auto shared =
+      simple->GetOptions<std::shared_ptr<Configurable>>("OuterShared");
+  ASSERT_NE(outer, nullptr);
+  ASSERT_NE(shared, nullptr);
+  const auto inner = shared->get()->GetOptions<TestOptions>("SharedOuter");
+  ASSERT_NE(inner, nullptr);
+  ASSERT_EQ(outer->i, 24);
+  ASSERT_EQ(outer->s, "outer");
+  ASSERT_EQ(inner->i, 42);
+  ASSERT_EQ(inner->s, "nested");
+}
+
+TEST_F(ConfigurableTest, NestedRawConfigTest) {
+  std::unique_ptr<Configurable> simple;
+  simple.reset(SimpleConfigurable::Create(
+      "Outer", TestConfigMode::kDefaultMode | TestConfigMode::kRawPtrMode));
+  ASSERT_OK(
+      simple->ConfigureFromString(config_options_, "int=24;string=outer"));
+  ASSERT_OK(simple->ConfigureFromString(config_options_,
+                                        "pointer={int=42;string=nested}"));
+  const auto outer = simple->GetOptions<TestOptions>("Outer");
+  const auto pointer = simple->GetOptions<Configurable*>("OuterPointer");
+  ASSERT_NE(outer, nullptr);
+  ASSERT_NE(pointer, nullptr);
+  const auto inner = (*pointer)->GetOptions<TestOptions>("PointerOuter");
+  ASSERT_NE(inner, nullptr);
+  ASSERT_EQ(outer->i, 24);
+  ASSERT_EQ(outer->s, "outer");
+  ASSERT_EQ(inner->i, 42);
+  ASSERT_EQ(inner->s, "nested");
+}
+
+TEST_F(ConfigurableTest, MatchesTest) {
+  std::string mismatch;
+  std::unique_ptr<Configurable> base, copy;
+  base.reset(SimpleConfigurable::Create(
+      "simple", TestConfigMode::kDefaultMode | TestConfigMode::kNestedMode));
+  copy.reset(SimpleConfigurable::Create(
+      "simple", TestConfigMode::kDefaultMode | TestConfigMode::kNestedMode));
+  ASSERT_OK(base->ConfigureFromString(
+      config_options_,
+      "int=11;string=outer;unique={int=22;string=u};shared={int=33;string=s}"));
+  ASSERT_OK(copy->ConfigureFromString(
+      config_options_,
+      "int=11;string=outer;unique={int=22;string=u};shared={int=33;string=s}"));
+  ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_OK(base->ConfigureOption(config_options_, "shared", "int=44"));
+  ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_EQ(mismatch, "shared.int");
+  std::string c1value, c2value;
+  ASSERT_OK(base->GetOption(config_options_, mismatch, &c1value));
+  ASSERT_OK(copy->GetOption(config_options_, mismatch, &c2value));
+  ASSERT_NE(c1value, c2value);
+}
+
+static Configurable* SimpleStructFactory() {
+  return SimpleConfigurable::Create(
+      "simple-struct", TestConfigMode::kDefaultMode, &struct_option_info);
+}
+
+TEST_F(ConfigurableTest, ConfigureStructTest) {
+  std::unique_ptr<Configurable> base(SimpleStructFactory());
+  std::unique_ptr<Configurable> copy(SimpleStructFactory());
+  std::string opt_str, value;
+  std::string mismatch;
+  std::unordered_set<std::string> names;
+
+  ASSERT_OK(
+      base->ConfigureFromString(config_options_, "struct={int=10; string=10}"));
+  ASSERT_OK(base->GetOptionString(config_options_, &opt_str));
+  ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str));
+  ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_OK(base->GetOptionNames(config_options_, &names));
+  ASSERT_EQ(names.size(), 1);
+  ASSERT_EQ(*(names.begin()), "struct");
+  ASSERT_OK(
+      base->ConfigureFromString(config_options_, "struct={int=20; string=20}"));
+  ASSERT_OK(base->GetOption(config_options_, "struct", &value));
+  ASSERT_OK(copy->ConfigureOption(config_options_, "struct", value));
+  ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+
+  ASSERT_NOK(base->ConfigureFromString(config_options_,
+                                       "struct={int=10; string=10; bad=11}"));
+  ASSERT_OK(base->ConfigureOption(config_options_, "struct.int", "42"));
+  ASSERT_NOK(base->ConfigureOption(config_options_, "struct.bad", "42"));
+  ASSERT_NOK(base->GetOption(config_options_, "struct.bad", &value));
+  ASSERT_OK(base->GetOption(config_options_, "struct.int", &value));
+  ASSERT_EQ(value, "42");
+}
+
+TEST_F(ConfigurableTest, ConfigurableEnumTest) {
+  std::unique_ptr<Configurable> base, copy;
+  base.reset(SimpleConfigurable::Create("e", TestConfigMode::kEnumMode));
+  copy.reset(SimpleConfigurable::Create("e", TestConfigMode::kEnumMode));
+
+  std::string opts_str;
+  std::string mismatch;
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "enum=B"));
+  ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_OK(base->GetOptionString(config_options_, &opts_str));
+  ASSERT_OK(copy->ConfigureFromString(config_options_, opts_str));
+  ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_NOK(base->ConfigureOption(config_options_, "enum", "bad"));
+  ASSERT_NOK(base->ConfigureOption(config_options_, "unknown", "bad"));
+}
+
+#ifndef ROCKSDB_LITE
+static std::unordered_map<std::string, OptionTypeInfo> noserialize_option_info =
+    {
+        {"int",
+         {offsetof(struct TestOptions, i), OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kDontSerialize}},
+};
+
+TEST_F(ConfigurableTest, TestNoSerialize) {
+  std::unique_ptr<Configurable> base;
+  base.reset(SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode,
+                                        &noserialize_option_info));
+  std::string opts_str, value;
+  ASSERT_OK(base->ConfigureFromString(config_options_, "int=10"));
+  ASSERT_OK(base->GetOptionString(config_options_, &opts_str));
+  ASSERT_EQ(opts_str, "");
+  ASSERT_NOK(base->GetOption(config_options_, "int", &value));
+}
+
+TEST_F(ConfigurableTest, TestNoCompare) {
+  std::unordered_map<std::string, OptionTypeInfo> nocomp_option_info = {
+      {"int",
+       {offsetof(struct TestOptions, i), OptionType::kInt,
+        OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever}},
+  };
+  std::unordered_map<std::string, OptionTypeInfo> normal_option_info = {
+      {"int",
+       {offsetof(struct TestOptions, i), OptionType::kInt,
+        OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+  };
+
+  std::unique_ptr<Configurable> base, copy;
+  base.reset(SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode,
+                                        &nocomp_option_info));
+  copy.reset(SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode,
+                                        &normal_option_info));
+  ASSERT_OK(base->ConfigureFromString(config_options_, "int=10"));
+  ASSERT_OK(copy->ConfigureFromString(config_options_, "int=20"));
+  std::string bvalue, cvalue, mismatch;
+  ASSERT_OK(base->GetOption(config_options_, "int", &bvalue));
+  ASSERT_OK(copy->GetOption(config_options_, "int", &cvalue));
+  ASSERT_EQ(bvalue, "10");
+  ASSERT_EQ(cvalue, "20");
+  ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_FALSE(copy->AreEquivalent(config_options_, base.get(), &mismatch));
+}
+
+TEST_F(ConfigurableTest, NullOptionMapTest) {
+  std::unique_ptr<Configurable> base;
+  std::unordered_set<std::string> names;
+  std::string str;
+
+  base.reset(
+      SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, nullptr));
+  ASSERT_NOK(base->ConfigureFromString(config_options_, "int=10"));
+  ASSERT_NOK(base->ConfigureFromString(config_options_, "int=20"));
+  ASSERT_NOK(base->ConfigureOption(config_options_, "int", "20"));
+  ASSERT_NOK(base->GetOption(config_options_, "int", &str));
+  ASSERT_NE(base->GetOptions<TestOptions>("c"), nullptr);
+  ASSERT_OK(base->GetOptionNames(config_options_, &names));
+  ASSERT_EQ(names.size(), 0UL);
+  ASSERT_OK(base->PrepareOptions(config_options_));
+  ASSERT_OK(base->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+  std::unique_ptr<Configurable> copy;
+  copy.reset(
+      SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, nullptr));
+  ASSERT_OK(base->GetOptionString(config_options_, &str));
+  ASSERT_OK(copy->ConfigureFromString(config_options_, str));
+  ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &str));
+}
+#endif
+
+static std::unordered_map<std::string, ConfigTestFactoryFunc> TestFactories = {
+    {"Simple", []() { return SimpleConfigurable::Create("simple"); }},
+    {"Struct", []() { return SimpleStructFactory(); }},
+    {"Unique",
+     []() {
+       return SimpleConfigurable::Create(
+           "simple", TestConfigMode::kSimpleMode | TestConfigMode::kUniqueMode);
+     }},
+    {"Shared",
+     []() {
+       return SimpleConfigurable::Create(
+           "simple", TestConfigMode::kSimpleMode | TestConfigMode::kSharedMode);
+     }},
+    {"Nested",
+     []() {
+       return SimpleConfigurable::Create(
+           "simple", TestConfigMode::kSimpleMode | TestConfigMode::kNestedMode);
+     }},
+    {"Mutable",
+     []() {
+       return SimpleConfigurable::Create("simple",
+                                         TestConfigMode::kMutableMode |
+                                             TestConfigMode::kSimpleMode |
+                                             TestConfigMode::kNestedMode);
+     }},
+    {"ThreeDeep",
+     []() {
+       Configurable* simple = SimpleConfigurable::Create(
+           "Simple",
+           TestConfigMode::kUniqueMode | TestConfigMode::kDefaultMode);
+       auto* unique =
+           simple->GetOptions<std::unique_ptr<Configurable>>("SimpleUnique");
+       unique->reset(SimpleConfigurable::Create(
+           "Child",
+           TestConfigMode::kUniqueMode | TestConfigMode::kDefaultMode));
+       unique = unique->get()->GetOptions<std::unique_ptr<Configurable>>(
+           "ChildUnique");
+       unique->reset(
+           SimpleConfigurable::Create("Child", TestConfigMode::kDefaultMode));
+       return simple;
+     }},
+    {"DBOptions",
+     []() {
+       auto config = DBOptionsAsConfigurable(DBOptions());
+       return config.release();
+     }},
+    {"CFOptions",
+     []() {
+       auto config = CFOptionsAsConfigurable(ColumnFamilyOptions());
+       return config.release();
+     }},
+    {"BlockBased", []() { return NewBlockBasedTableFactory(); }},
+};
+
+class ConfigurableParamTest : public ConfigurableTest,
+                              virtual public ::testing::WithParamInterface<
+                                  std::pair<std::string, std::string>> {
+ public:
+  ConfigurableParamTest() {
+    type_ = GetParam().first;
+    configuration_ = GetParam().second;
+    assert(TestFactories.find(type_) != TestFactories.end());
+    object_.reset(CreateConfigurable());
+  }
+
+  Configurable* CreateConfigurable() {
+    const auto& iter = TestFactories.find(type_);
+    return (iter->second)();
+  }
+
+  void TestConfigureOptions(const ConfigOptions& opts);
+  std::string type_;
+  std::string configuration_;
+  std::unique_ptr<Configurable> object_;
+};
+
+void ConfigurableParamTest::TestConfigureOptions(
+    const ConfigOptions& config_options) {
+  std::unique_ptr<Configurable> base, copy;
+  std::unordered_set<std::string> names;
+  std::string opt_str, mismatch;
+
+  base.reset(CreateConfigurable());
+  copy.reset(CreateConfigurable());
+
+  ASSERT_OK(base->ConfigureFromString(config_options, configuration_));
+  ASSERT_OK(base->GetOptionString(config_options, &opt_str));
+  ASSERT_OK(copy->ConfigureFromString(config_options, opt_str));
+  ASSERT_OK(copy->GetOptionString(config_options, &opt_str));
+  ASSERT_TRUE(base->AreEquivalent(config_options, copy.get(), &mismatch));
+
+  copy.reset(CreateConfigurable());
+  ASSERT_OK(base->GetOptionNames(config_options, &names));
+  std::unordered_map<std::string, std::string> unused;
+  bool found_one = false;
+  for (auto name : names) {
+    std::string value;
+    Status s = base->GetOption(config_options, name, &value);
+    if (s.ok()) {
+      s = copy->ConfigureOption(config_options, name, value);
+      if (s.ok() || s.IsNotSupported()) {
+        found_one = true;
+      } else {
+        unused[name] = value;
+      }
+    } else {
+      ASSERT_TRUE(s.IsNotSupported());
+    }
+  }
+  ASSERT_TRUE(found_one || names.empty());
+  while (found_one && !unused.empty()) {
+    found_one = false;
+    for (auto iter = unused.begin(); iter != unused.end();) {
+      if (copy->ConfigureOption(config_options, iter->first, iter->second)
+              .ok()) {
+        found_one = true;
+        iter = unused.erase(iter);
+      } else {
+        ++iter;
+      }
+    }
+  }
+  ASSERT_EQ(0, unused.size());
+  ASSERT_TRUE(base->AreEquivalent(config_options, copy.get(), &mismatch));
+}
+
+TEST_P(ConfigurableParamTest, GetDefaultOptionsTest) {
+  TestConfigureOptions(config_options_);
+}
+
+TEST_P(ConfigurableParamTest, ConfigureFromPropsTest) {
+  std::string opt_str, mismatch;
+  std::unordered_set<std::string> names;
+  std::unique_ptr<Configurable> copy(CreateConfigurable());
+
+  ASSERT_OK(object_->ConfigureFromString(config_options_, configuration_));
+  config_options_.delimiter = "\n";
+  ASSERT_OK(object_->GetOptionString(config_options_, &opt_str));
+  std::istringstream iss(opt_str);
+  std::unordered_map<std::string, std::string> copy_map;
+  std::string line;
+  for (int line_num = 0; std::getline(iss, line); line_num++) {
+    std::string name;
+    std::string value;
+    ASSERT_OK(
+        RocksDBOptionsParser::ParseStatement(&name, &value, line, line_num));
+    copy_map[name] = value;
+  }
+  ASSERT_OK(copy->ConfigureFromMap(config_options_, copy_map));
+  ASSERT_TRUE(object_->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    ParamTest, ConfigurableParamTest,
+    testing::Values(
+        std::pair<std::string, std::string>("Simple",
+                                            "int=42;bool=true;string=s"),
+        std::pair<std::string, std::string>(
+            "Mutable", "int=42;unique={int=33;string=unique}"),
+        std::pair<std::string, std::string>(
+            "Struct", "struct={int=33;bool=true;string=s;}"),
+        std::pair<std::string, std::string>("Shared",
+                                            "int=33;bool=true;string=outer;"
+                                            "shared={int=42;string=shared}"),
+        std::pair<std::string, std::string>("Unique",
+                                            "int=33;bool=true;string=outer;"
+                                            "unique={int=42;string=unique}"),
+        std::pair<std::string, std::string>("Nested",
+                                            "int=11;bool=true;string=outer;"
+                                            "pointer={int=22;string=pointer};"
+                                            "unique={int=33;string=unique};"
+                                            "shared={int=44;string=shared}"),
+        std::pair<std::string, std::string>("ThreeDeep",
+                                            "int=11;bool=true;string=outer;"
+                                            "unique={int=22;string=inner;"
+                                            "unique={int=33;string=unique}};"),
+        std::pair<std::string, std::string>("DBOptions",
+                                            "max_background_jobs=100;"
+                                            "max_open_files=200;"),
+        std::pair<std::string, std::string>("CFOptions",
+                                            "table_factory=BlockBasedTable;"
+                                            "disable_auto_compactions=true;"),
+        std::pair<std::string, std::string>("BlockBased",
+                                            "block_size=1024;"
+                                            "no_block_cache=true;")));
+#endif  // ROCKSDB_LITE
+
+}  // namespace test
+}  // namespace ROCKSDB_NAMESPACE
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/options/configurable_test.h b/src/rocksdb/options/configurable_test.h
new file mode 100644
index 000000000..cf9d06678
--- /dev/null
+++ b/src/rocksdb/options/configurable_test.h
@@ -0,0 +1,126 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <unordered_map>
+
+#include "options/configurable_helper.h"
+#include "rocksdb/configurable.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ColumnFamilyOptions;
+struct DBOptions;
+
+namespace test {
+enum TestEnum { kTestA, kTestB };
+
+static const std::unordered_map<std::string, int> test_enum_map = {
+    {"A", TestEnum::kTestA},
+    {"B", TestEnum::kTestB},
+};
+
+struct TestOptions {
+  int i = 0;
+  bool b = false;
+  bool d = true;
+  TestEnum e = TestEnum::kTestA;
+  std::string s = "";
+  std::string u = "";
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> simple_option_info = {
+#ifndef ROCKSDB_LITE
+    {"int",
+     {offsetof(struct TestOptions, i), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+    {"bool",
+     {offsetof(struct TestOptions, b), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"string",
+     {offsetof(struct TestOptions, s), OptionType::kString,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> enum_option_info = {
+#ifndef ROCKSDB_LITE
+    {"enum",
+     OptionTypeInfo::Enum(offsetof(struct TestOptions, e), &test_enum_map)}
+#endif
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> unique_option_info = {
+#ifndef ROCKSDB_LITE
+    {"unique",
+     {0, OptionType::kConfigurable, OptionVerificationType::kNormal,
+      (OptionTypeFlags::kUnique | OptionTypeFlags::kMutable)}},
+#endif  // ROCKSDB_LITE
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> shared_option_info = {
+#ifndef ROCKSDB_LITE
+    {"shared",
+     {0, OptionType::kConfigurable, OptionVerificationType::kNormal,
+      (OptionTypeFlags::kShared)}},
+#endif  // ROCKSDB_LITE
+};
+static std::unordered_map<std::string, OptionTypeInfo> pointer_option_info = {
+#ifndef ROCKSDB_LITE
+    {"pointer",
+     {0, OptionType::kConfigurable, OptionVerificationType::kNormal,
+      OptionTypeFlags::kRawPointer}},
+#endif  // ROCKSDB_LITE
+};
+
+enum TestConfigMode {
+  kEmptyMode = 0x0,            // Don't register anything
+  kMutableMode = 0x01,         // Configuration is mutable
+  kSimpleMode = 0x02,          // Use the simple options
+  kEnumMode = 0x04,            // Use the enum options
+  kDefaultMode = kSimpleMode,  // Use no inner nested configurations
+  kSharedMode = 0x10,          // Use shared configuration
+  kUniqueMode = 0x20,          // Use unique configuration
+  kRawPtrMode = 0x40,          // Use pointer configuration
+  kNestedMode = (kSharedMode | kUniqueMode | kRawPtrMode),
+  kAllOptMode = (kNestedMode | kEnumMode | kSimpleMode),
+};
+
+template <typename T>
+class TestConfigurable : public Configurable {
+ protected:
+  std::string name_;
+  std::string prefix_;
+  TestOptions options_;
+
+ public:
+  std::unique_ptr<T> unique_;
+  std::shared_ptr<T> shared_;
+  T* pointer_;
+
+  TestConfigurable(const std::string& name, int mode,
+                   const std::unordered_map<std::string, OptionTypeInfo>* map =
+                       &simple_option_info)
+      : name_(name), pointer_(nullptr) {
+    prefix_ = "test." + name + ".";
+    if ((mode & TestConfigMode::kSimpleMode) != 0) {
+      RegisterOptions(name_, &options_, map);
+    }
+    if ((mode & TestConfigMode::kEnumMode) != 0) {
+      RegisterOptions(name_ + "Enum", &options_, &enum_option_info);
+    }
+  }
+
+  ~TestConfigurable() override { delete pointer_; }
+};
+
+}  // namespace test
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/customizable.cc b/src/rocksdb/options/customizable.cc
new file mode 100644
index 000000000..cd39550e5
--- /dev/null
+++ b/src/rocksdb/options/customizable.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/customizable.h"
+
+#include <sstream>
+
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string Customizable::GetOptionName(const std::string& long_name) const {
+  const std::string& name = Name();
+  size_t name_len = name.size();
+  if (long_name.size() > name_len + 1 &&
+      long_name.compare(0, name_len, name) == 0 &&
+      long_name.at(name_len) == '.') {
+    return long_name.substr(name_len + 1);
+  } else {
+    return Configurable::GetOptionName(long_name);
+  }
+}
+
+std::string Customizable::GenerateIndividualId() const {
+  std::ostringstream ostr;
+  ostr << Name() << "@" << static_cast<const void*>(this) << "#"
+       << port::GetProcessID();
+  return ostr.str();
+}
+
+#ifndef ROCKSDB_LITE
+Status Customizable::GetOption(const ConfigOptions& config_options,
+                               const std::string& opt_name,
+                               std::string* value) const {
+  if (opt_name == OptionTypeInfo::kIdPropName()) {
+    *value = GetId();
+    return Status::OK();
+  } else {
+    return Configurable::GetOption(config_options, opt_name, value);
+  }
+}
+
+std::string Customizable::SerializeOptions(const ConfigOptions& config_options,
+                                           const std::string& prefix) const {
+  std::string result;
+  std::string parent;
+  std::string id = GetId();
+  if (!config_options.IsShallow() && !id.empty()) {
+    parent = Configurable::SerializeOptions(config_options, "");
+  }
+  if (parent.empty()) {
+    result = id;
+  } else {
+    result.append(prefix);
+    result.append(OptionTypeInfo::kIdPropName());
+    result.append("=");
+    result.append(id);
+    result.append(config_options.delimiter);
+    result.append(parent);
+  }
+  return result;
+}
+
+#endif  // ROCKSDB_LITE
+
+bool Customizable::AreEquivalent(const ConfigOptions& config_options,
+                                 const Configurable* other,
+                                 std::string* mismatch) const {
+  if (config_options.sanity_level > ConfigOptions::kSanityLevelNone &&
+      this != other) {
+    const Customizable* custom = reinterpret_cast<const Customizable*>(other);
+    if (custom == nullptr) {  // Cast failed
+      return false;
+    } else if (GetId() != custom->GetId()) {
+      *mismatch = OptionTypeInfo::kIdPropName();
+      return false;
+    } else if (config_options.sanity_level >
+               ConfigOptions::kSanityLevelLooselyCompatible) {
+      bool matches =
+          Configurable::AreEquivalent(config_options, other, mismatch);
+      return matches;
+    }
+  }
+  return true;
+}
+
+Status Customizable::GetOptionsMap(
+    const ConfigOptions& config_options, const Customizable* customizable,
+    const std::string& value, std::string* id,
+    std::unordered_map<std::string, std::string>* props) {
+  Status status;
+  if (value.empty() || value == kNullptrString) {
+    *id = "";
+    props->clear();
+  } else if (customizable != nullptr) {
+    status =
+        Configurable::GetOptionsMap(value, customizable->GetId(), id, props);
+#ifdef ROCKSDB_LITE
+    (void)config_options;
+#else
+    if (status.ok() && customizable->IsInstanceOf(*id)) {
+      // The new ID and the old ID match, so the objects are the same type.
+      // Try to get the existing options, ignoring any errors
+      ConfigOptions embedded = config_options;
+      embedded.delimiter = ";";
+      std::string curr_opts;
+      if (customizable->GetOptionString(embedded, &curr_opts).ok()) {
+        std::unordered_map<std::string, std::string> curr_props;
+        if (StringToMap(curr_opts, &curr_props).ok()) {
+          props->insert(curr_props.begin(), curr_props.end());
+        }
+      }
+    }
+#endif  // ROCKSDB_LITE
+  } else {
+    status = Configurable::GetOptionsMap(value, "", id, props);
+  }
+  return status;
+}
+
+Status Customizable::ConfigureNewObject(
+    const ConfigOptions& config_options, Customizable* object,
+    const std::unordered_map<std::string, std::string>& opt_map) {
+  Status status;
+  if (object != nullptr) {
+    status = object->ConfigureFromMap(config_options, opt_map);
+  } else if (!opt_map.empty()) {
+    status = Status::InvalidArgument("Cannot configure null object ");
+  }
+  return status;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/customizable_test.cc b/src/rocksdb/options/customizable_test.cc
new file mode 100644
index 000000000..9d3c86c62
--- /dev/null
+++ b/src/rocksdb/options/customizable_test.cc
@@ -0,0 +1,2255 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/customizable.h"
+
+#include <cctype>
+#include <cinttypes>
+#include <cstring>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "db/db_test_util.h"
+#include "memory/jemalloc_nodump_allocator.h"
+#include "memory/memkind_kmem_allocator.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/memory_allocator.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/sst_partitioner.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/flush_block_policy.h"
+#include "table/mock_table.h"
+#include "test_util/mock_time_env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/file_checksum_helper.h"
+#include "util/string_util.h"
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+#include "utilities/memory_allocators.h"
+#include "utilities/merge_operators/bytesxor.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+#ifndef GFLAGS
+bool FLAGS_enable_print = false;
+#else
+#include "util/gflags_compat.h"
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+#endif  // GFLAGS
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class StringLogger : public Logger {
+ public:
+  using Logger::Logv;
+  void Logv(const char* format, va_list ap) override {
+    char buffer[1000];
+    vsnprintf(buffer, sizeof(buffer), format, ap);
+    string_.append(buffer);
+  }
+  const std::string& str() const { return string_; }
+  void clear() { string_.clear(); }
+
+ private:
+  std::string string_;
+};
+
+class TestCustomizable : public Customizable {
+ public:
+  TestCustomizable(const std::string& name) : name_(name) {}
+  // Method to allow CheckedCast to work for this class
+  static const char* kClassName() {
+    return "TestCustomizable";
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+  static const char* Type() { return "test.custom"; }
+#ifndef ROCKSDB_LITE
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& value,
+                                 std::unique_ptr<TestCustomizable>* result);
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& value,
+                                 std::shared_ptr<TestCustomizable>* result);
+  static Status CreateFromString(const ConfigOptions& opts,
+                                 const std::string& value,
+                                 TestCustomizable** result);
+#endif  // ROCKSDB_LITE
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == kClassName()) {
+      return true;
+    } else {
+      return Customizable::IsInstanceOf(name);
+    }
+  }
+
+ protected:
+  const std::string name_;
+};
+
+struct AOptions {
+  static const char* kName() { return "A"; }
+  int i = 0;
+  bool b = false;
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> a_option_info = {
+#ifndef ROCKSDB_LITE
+    {"int",
+     {offsetof(struct AOptions, i), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+    {"bool",
+     {offsetof(struct AOptions, b), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+
+class ACustomizable : public TestCustomizable {
+ public:
+  explicit ACustomizable(const std::string& id)
+      : TestCustomizable("A"), id_(id) {
+    RegisterOptions(&opts_, &a_option_info);
+  }
+  std::string GetId() const override { return id_; }
+  static const char* kClassName() { return "A"; }
+
+ private:
+  AOptions opts_;
+  const std::string id_;
+};
+
+struct BOptions {
+  std::string s;
+  bool b = false;
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> b_option_info = {
+#ifndef ROCKSDB_LITE
+    {"string",
+     {offsetof(struct BOptions, s), OptionType::kString,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"bool",
+     {offsetof(struct BOptions, b), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+
+class BCustomizable : public TestCustomizable {
+ private:
+ public:
+  explicit BCustomizable(const std::string& name) : TestCustomizable(name) {
+    RegisterOptions(name, &opts_, &b_option_info);
+  }
+  static const char* kClassName() { return "B"; }
+
+ private:
+  BOptions opts_;
+};
+
+#ifndef ROCKSDB_LITE
+static bool LoadSharedB(const std::string& id,
+                        std::shared_ptr<TestCustomizable>* result) {
+  if (id == "B") {
+    result->reset(new BCustomizable(id));
+    return true;
+  } else if (id.empty()) {
+    result->reset();
+    return true;
+  } else {
+    return false;
+  }
+}
+
+static int A_count = 0;
+static int RegisterCustomTestObjects(ObjectLibrary& library,
+                                     const std::string& /*arg*/) {
+  library.AddFactory<TestCustomizable>(
+      ObjectLibrary::PatternEntry("A", true).AddSeparator("_"),
+      [](const std::string& name, std::unique_ptr<TestCustomizable>* guard,
+         std::string* /* msg */) {
+        guard->reset(new ACustomizable(name));
+        A_count++;
+        return guard->get();
+      });
+
+  library.AddFactory<TestCustomizable>(
+      "S", [](const std::string& name,
+              std::unique_ptr<TestCustomizable>* /* guard */,
+              std::string* /* msg */) { return new BCustomizable(name); });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+
+struct SimpleOptions {
+  static const char* kName() { return "simple"; }
+  bool b = true;
+  std::unique_ptr<TestCustomizable> cu;
+  std::shared_ptr<TestCustomizable> cs;
+  TestCustomizable* cp = nullptr;
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> simple_option_info = {
+#ifndef ROCKSDB_LITE
+    {"bool",
+     {offsetof(struct SimpleOptions, b), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"unique",
+     OptionTypeInfo::AsCustomUniquePtr<TestCustomizable>(
+         offsetof(struct SimpleOptions, cu), OptionVerificationType::kNormal,
+         OptionTypeFlags::kAllowNull)},
+    {"shared",
+     OptionTypeInfo::AsCustomSharedPtr<TestCustomizable>(
+         offsetof(struct SimpleOptions, cs), OptionVerificationType::kNormal,
+         OptionTypeFlags::kAllowNull)},
+    {"pointer",
+     OptionTypeInfo::AsCustomRawPtr<TestCustomizable>(
+         offsetof(struct SimpleOptions, cp), OptionVerificationType::kNormal,
+         OptionTypeFlags::kAllowNull)},
+#endif  // ROCKSDB_LITE
+};
+
+class SimpleConfigurable : public Configurable {
+ private:
+  SimpleOptions simple_;
+
+ public:
+  SimpleConfigurable() { RegisterOptions(&simple_, &simple_option_info); }
+
+  explicit SimpleConfigurable(
+      const std::unordered_map<std::string, OptionTypeInfo>* map) {
+    RegisterOptions(&simple_, map);
+  }
+};
+
+#ifndef ROCKSDB_LITE
+static void GetMapFromProperties(
+    const std::string& props,
+    std::unordered_map<std::string, std::string>* map) {
+  std::istringstream iss(props);
+  std::unordered_map<std::string, std::string> copy_map;
+  std::string line;
+  map->clear();
+  for (int line_num = 0; std::getline(iss, line); line_num++) {
+    std::string name;
+    std::string value;
+    ASSERT_OK(
+        RocksDBOptionsParser::ParseStatement(&name, &value, line, line_num));
+    (*map)[name] = value;
+  }
+}
+#endif  // ROCKSDB_LITE
+}  // namespace
+
+#ifndef ROCKSDB_LITE
+Status TestCustomizable::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<TestCustomizable>* result) {
+  return LoadSharedObject<TestCustomizable>(config_options, value, LoadSharedB,
+                                            result);
+}
+
+Status TestCustomizable::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::unique_ptr<TestCustomizable>* result) {
+  return LoadUniqueObject<TestCustomizable>(
+      config_options, value,
+      [](const std::string& id, std::unique_ptr<TestCustomizable>* u) {
+        if (id == "B") {
+          u->reset(new BCustomizable(id));
+          return true;
+        } else if (id.empty()) {
+          u->reset();
+          return true;
+        } else {
+          return false;
+        }
+      },
+      result);
+}
+
+Status TestCustomizable::CreateFromString(const ConfigOptions& config_options,
+                                          const std::string& value,
+                                          TestCustomizable** result) {
+  return LoadStaticObject<TestCustomizable>(
+      config_options, value,
+      [](const std::string& id, TestCustomizable** ptr) {
+        if (id == "B") {
+          *ptr = new BCustomizable(id);
+          return true;
+        } else if (id.empty()) {
+          *ptr = nullptr;
+          return true;
+        } else {
+          return false;
+        }
+      },
+      result);
+}
+#endif  // ROCKSDB_LITE
+
+class CustomizableTest : public testing::Test {
+ public:
+  CustomizableTest() {
+    config_options_.invoke_prepare_options = false;
+#ifndef ROCKSDB_LITE
+    // GetOptionsFromMap is not supported in ROCKSDB_LITE
+    config_options_.registry->AddLibrary("CustomizableTest",
+                                         RegisterCustomTestObjects, "");
+#endif  // ROCKSDB_LITE
+  }
+
+  ConfigOptions config_options_;
+};
+
+#ifndef ROCKSDB_LITE  // GetOptionsFromMap is not supported in ROCKSDB_LITE
+// Tests that a Customizable can be created by:
+//    - a simple name
+//    - a XXX.id option
+//    - a property with a name
+TEST_F(CustomizableTest, CreateByNameTest) {
+  ObjectLibrary::Default()->AddFactory<TestCustomizable>(
+      ObjectLibrary::PatternEntry("TEST", false).AddSeparator("_"),
+      [](const std::string& name, std::unique_ptr<TestCustomizable>* guard,
+         std::string* /* msg */) {
+        guard->reset(new TestCustomizable(name));
+        return guard->get();
+      });
+  std::unique_ptr<Configurable> configurable(new SimpleConfigurable());
+  SimpleOptions* simple = configurable->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_OK(
+      configurable->ConfigureFromString(config_options_, "unique={id=TEST_1}"));
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_EQ(simple->cu->GetId(), "TEST_1");
+  ASSERT_OK(
+      configurable->ConfigureFromString(config_options_, "unique.id=TEST_2"));
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_EQ(simple->cu->GetId(), "TEST_2");
+  ASSERT_OK(
+      configurable->ConfigureFromString(config_options_, "unique=TEST_3"));
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_EQ(simple->cu->GetId(), "TEST_3");
+}
+
+TEST_F(CustomizableTest, ToStringTest) {
+  std::unique_ptr<TestCustomizable> custom(new TestCustomizable("test"));
+  ASSERT_EQ(custom->ToString(config_options_), "test");
+}
+
+TEST_F(CustomizableTest, SimpleConfigureTest) {
+  std::unordered_map<std::string, std::string> opt_map = {
+      {"unique", "id=A;int=1;bool=true"},
+      {"shared", "id=B;string=s"},
+  };
+  std::unique_ptr<Configurable> configurable(new SimpleConfigurable());
+  ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map));
+  SimpleOptions* simple = configurable->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_EQ(simple->cu->GetId(), "A");
+  std::string opt_str;
+  std::string mismatch;
+  ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str));
+  std::unique_ptr<Configurable> copy(new SimpleConfigurable());
+  ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str));
+  ASSERT_TRUE(
+      configurable->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+TEST_F(CustomizableTest, ConfigureFromPropsTest) {
+  std::unordered_map<std::string, std::string> opt_map = {
+      {"unique.id", "A"}, {"unique.A.int", "1"},    {"unique.A.bool", "true"},
+      {"shared.id", "B"}, {"shared.B.string", "s"},
+  };
+  std::unique_ptr<Configurable> configurable(new SimpleConfigurable());
+  ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map));
+  SimpleOptions* simple = configurable->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_EQ(simple->cu->GetId(), "A");
+  std::string opt_str;
+  std::string mismatch;
+  config_options_.delimiter = "\n";
+  std::unordered_map<std::string, std::string> props;
+  ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str));
+  GetMapFromProperties(opt_str, &props);
+  std::unique_ptr<Configurable> copy(new SimpleConfigurable());
+  ASSERT_OK(copy->ConfigureFromMap(config_options_, props));
+  ASSERT_TRUE(
+      configurable->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+TEST_F(CustomizableTest, ConfigureFromShortTest) {
+  std::unordered_map<std::string, std::string> opt_map = {
+      {"unique.id", "A"}, {"unique.A.int", "1"},    {"unique.A.bool", "true"},
+      {"shared.id", "B"}, {"shared.B.string", "s"},
+  };
+  std::unique_ptr<Configurable> configurable(new SimpleConfigurable());
+  ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map));
+  SimpleOptions* simple = configurable->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_EQ(simple->cu->GetId(), "A");
+}
+
+TEST_F(CustomizableTest, AreEquivalentOptionsTest) {
+  std::unordered_map<std::string, std::string> opt_map = {
+      {"unique", "id=A;int=1;bool=true"},
+      {"shared", "id=A;int=1;bool=true"},
+  };
+  std::string mismatch;
+  ConfigOptions config_options = config_options_;
+  std::unique_ptr<Configurable> c1(new SimpleConfigurable());
+  std::unique_ptr<Configurable> c2(new SimpleConfigurable());
+  ASSERT_OK(c1->ConfigureFromMap(config_options, opt_map));
+  ASSERT_OK(c2->ConfigureFromMap(config_options, opt_map));
+  ASSERT_TRUE(c1->AreEquivalent(config_options, c2.get(), &mismatch));
+  SimpleOptions* simple = c1->GetOptions<SimpleOptions>();
+  ASSERT_TRUE(
+      simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch));
+  ASSERT_OK(simple->cu->ConfigureOption(config_options, "int", "2"));
+  ASSERT_FALSE(
+      simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch));
+  ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch));
+  ConfigOptions loosely = config_options;
+  loosely.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible;
+  ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch));
+  ASSERT_TRUE(simple->cu->AreEquivalent(loosely, simple->cs.get(), &mismatch));
+
+  ASSERT_OK(c1->ConfigureOption(config_options, "shared", "id=B;string=3"));
+  ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch));
+  ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch));
+  ASSERT_FALSE(simple->cs->AreEquivalent(loosely, simple->cu.get(), &mismatch));
+  simple->cs.reset();
+  ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch));
+  ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch));
+}
+
+// Tests that we can initialize a customizable from its options
+TEST_F(CustomizableTest, ConfigureStandaloneCustomTest) {
+  std::unique_ptr<TestCustomizable> base, copy;
+  const auto& registry = config_options_.registry;
+  ASSERT_OK(registry->NewUniqueObject<TestCustomizable>("A", &base));
+  ASSERT_OK(registry->NewUniqueObject<TestCustomizable>("A", &copy));
+  ASSERT_OK(base->ConfigureFromString(config_options_, "int=33;bool=true"));
+  std::string opt_str;
+  std::string mismatch;
+  ASSERT_OK(base->GetOptionString(config_options_, &opt_str));
+  ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str));
+  ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+// Tests that we fail appropriately if the pattern is not registered
+TEST_F(CustomizableTest, BadNameTest) {
+  config_options_.ignore_unsupported_options = false;
+  std::unique_ptr<Configurable> c1(new SimpleConfigurable());
+  ASSERT_NOK(
+      c1->ConfigureFromString(config_options_, "unique.shared.id=bad name"));
+  config_options_.ignore_unsupported_options = true;
+  ASSERT_OK(
+      c1->ConfigureFromString(config_options_, "unique.shared.id=bad name"));
+}
+
+// Tests that we fail appropriately if a bad option is passed to the underlying
+// configurable
+TEST_F(CustomizableTest, BadOptionTest) {
+  std::unique_ptr<Configurable> c1(new SimpleConfigurable());
+  ConfigOptions ignore = config_options_;
+  ignore.ignore_unknown_options = true;
+
+  ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.int=11"));
+  ASSERT_NOK(c1->ConfigureFromString(config_options_, "shared={id=B;int=1}"));
+  ASSERT_OK(c1->ConfigureFromString(ignore, "shared={id=A;string=s}"));
+  ASSERT_NOK(c1->ConfigureFromString(config_options_, "B.int=11"));
+  ASSERT_OK(c1->ConfigureFromString(ignore, "B.int=11"));
+  ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.string=s"));
+  ASSERT_OK(c1->ConfigureFromString(ignore, "A.string=s"));
+  // Test as detached
+  ASSERT_NOK(
+      c1->ConfigureFromString(config_options_, "shared.id=A;A.string=b}"));
+  ASSERT_OK(c1->ConfigureFromString(ignore, "shared.id=A;A.string=s}"));
+}
+
+TEST_F(CustomizableTest, FailingFactoryTest) {
+  std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+  std::unique_ptr<Configurable> c1(new SimpleConfigurable());
+  ConfigOptions ignore = config_options_;
+
+  Status s;
+  ignore.registry->AddLibrary("failing")->AddFactory<TestCustomizable>(
+      "failing",
+      [](const std::string& /*uri*/,
+         std::unique_ptr<TestCustomizable>* /*guard */, std::string* errmsg) {
+        *errmsg = "Bad Factory";
+        return nullptr;
+      });
+
+  // If we are ignoring unknown and unsupported options, will see
+  // different errors for failing versus missing
+  ignore.ignore_unknown_options = false;
+  ignore.ignore_unsupported_options = false;
+  s = c1->ConfigureFromString(ignore, "shared.id=failing");
+  ASSERT_TRUE(s.IsInvalidArgument());
+  s = c1->ConfigureFromString(ignore, "unique.id=failing");
+  ASSERT_TRUE(s.IsInvalidArgument());
+  s = c1->ConfigureFromString(ignore, "shared.id=missing");
+  ASSERT_TRUE(s.IsNotSupported());
+  s = c1->ConfigureFromString(ignore, "unique.id=missing");
+  ASSERT_TRUE(s.IsNotSupported());
+
+  // If we are ignoring unsupported options, will see
+  // errors for failing but not missing
+  ignore.ignore_unknown_options = false;
+  ignore.ignore_unsupported_options = true;
+  s = c1->ConfigureFromString(ignore, "shared.id=failing");
+  ASSERT_TRUE(s.IsInvalidArgument());
+  s = c1->ConfigureFromString(ignore, "unique.id=failing");
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  ASSERT_OK(c1->ConfigureFromString(ignore, "shared.id=missing"));
+  ASSERT_OK(c1->ConfigureFromString(ignore, "unique.id=missing"));
+
+  // If we are ignoring unknown options, will see no errors
+  // for failing or missing
+  ignore.ignore_unknown_options = true;
+  ignore.ignore_unsupported_options = false;
+  ASSERT_OK(c1->ConfigureFromString(ignore, "shared.id=failing"));
+  ASSERT_OK(c1->ConfigureFromString(ignore, "unique.id=failing"));
+  ASSERT_OK(c1->ConfigureFromString(ignore, "shared.id=missing"));
+  ASSERT_OK(c1->ConfigureFromString(ignore, "unique.id=missing"));
+}
+
+// Tests that different IDs lead to different objects
+TEST_F(CustomizableTest, UniqueIdTest) {
+  std::unique_ptr<Configurable> base(new SimpleConfigurable());
+  ASSERT_OK(base->ConfigureFromString(config_options_,
+                                      "unique={id=A_1;int=1;bool=true}"));
+  SimpleOptions* simple = base->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_EQ(simple->cu->GetId(), std::string("A_1"));
+  std::string opt_str;
+  std::string mismatch;
+  ASSERT_OK(base->GetOptionString(config_options_, &opt_str));
+  std::unique_ptr<Configurable> copy(new SimpleConfigurable());
+  ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str));
+  ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_OK(base->ConfigureFromString(config_options_,
+                                      "unique={id=A_2;int=1;bool=true}"));
+  ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch));
+  ASSERT_EQ(simple->cu->GetId(), std::string("A_2"));
+}
+
+TEST_F(CustomizableTest, IsInstanceOfTest) {
+  std::shared_ptr<TestCustomizable> tc = std::make_shared<ACustomizable>("A_1");
+
+  ASSERT_EQ(tc->GetId(), std::string("A_1"));
+  ASSERT_TRUE(tc->IsInstanceOf("A"));
+  ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable"));
+  ASSERT_FALSE(tc->IsInstanceOf("B"));
+  ASSERT_FALSE(tc->IsInstanceOf("A_1"));
+  ASSERT_EQ(tc->CheckedCast<ACustomizable>(), tc.get());
+  ASSERT_EQ(tc->CheckedCast<TestCustomizable>(), tc.get());
+  ASSERT_EQ(tc->CheckedCast<BCustomizable>(), nullptr);
+
+  tc.reset(new BCustomizable("B"));
+  ASSERT_TRUE(tc->IsInstanceOf("B"));
+  ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable"));
+  ASSERT_FALSE(tc->IsInstanceOf("A"));
+  ASSERT_EQ(tc->CheckedCast<BCustomizable>(), tc.get());
+  ASSERT_EQ(tc->CheckedCast<TestCustomizable>(), tc.get());
+  ASSERT_EQ(tc->CheckedCast<ACustomizable>(), nullptr);
+}
+
+TEST_F(CustomizableTest, PrepareOptionsTest) {
+  static std::unordered_map<std::string, OptionTypeInfo> p_option_info = {
+#ifndef ROCKSDB_LITE
+      {"can_prepare",
+       {0, OptionType::kBoolean, OptionVerificationType::kNormal,
+        OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+  };
+
+  class PrepareCustomizable : public TestCustomizable {
+   public:
+    bool can_prepare_ = true;
+
+    PrepareCustomizable() : TestCustomizable("P") {
+      RegisterOptions("Prepare", &can_prepare_, &p_option_info);
+    }
+
+    Status PrepareOptions(const ConfigOptions& opts) override {
+      if (!can_prepare_) {
+        return Status::InvalidArgument("Cannot Prepare");
+      } else {
+        return TestCustomizable::PrepareOptions(opts);
+      }
+    }
+  };
+
+  ObjectLibrary::Default()->AddFactory<TestCustomizable>(
+      "P",
+      [](const std::string& /*name*/, std::unique_ptr<TestCustomizable>* guard,
+         std::string* /* msg */) {
+        guard->reset(new PrepareCustomizable());
+        return guard->get();
+      });
+
+  std::unique_ptr<Configurable> base(new SimpleConfigurable());
+  ConfigOptions prepared(config_options_);
+  prepared.invoke_prepare_options = true;
+
+  ASSERT_OK(base->ConfigureFromString(
+      prepared, "unique=A_1; shared={id=B;string=s}; pointer.id=S"));
+  SimpleOptions* simple = base->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_NE(simple->cs, nullptr);
+  ASSERT_NE(simple->cp, nullptr);
+  delete simple->cp;
+  base.reset(new SimpleConfigurable());
+  ASSERT_OK(base->ConfigureFromString(
+      config_options_, "unique=A_1; shared={id=B;string=s}; pointer.id=S"));
+
+  simple = base->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_NE(simple->cs, nullptr);
+  ASSERT_NE(simple->cp, nullptr);
+
+  ASSERT_OK(base->PrepareOptions(config_options_));
+  delete simple->cp;
+  base.reset(new SimpleConfigurable());
+  simple = base->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+
+  ASSERT_NOK(
+      base->ConfigureFromString(prepared, "unique={id=P; can_prepare=false}"));
+  ASSERT_EQ(simple->cu, nullptr);
+
+  ASSERT_OK(
+      base->ConfigureFromString(prepared, "unique={id=P; can_prepare=true}"));
+  ASSERT_NE(simple->cu, nullptr);
+
+  ASSERT_OK(base->ConfigureFromString(config_options_,
+                                      "unique={id=P; can_prepare=true}"));
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_OK(simple->cu->PrepareOptions(prepared));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_,
+                                      "unique={id=P; can_prepare=false}"));
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_NOK(simple->cu->PrepareOptions(prepared));
+}
+
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo> inner_option_info = {
+#ifndef ROCKSDB_LITE
+    {"inner",
+     OptionTypeInfo::AsCustomSharedPtr<TestCustomizable>(
+         0, OptionVerificationType::kNormal, OptionTypeFlags::kStringNameOnly)}
+#endif  // ROCKSDB_LITE
+};
+
+struct InnerOptions {
+  static const char* kName() { return "InnerOptions"; }
+  std::shared_ptr<Customizable> inner;
+};
+
+class InnerCustomizable : public Customizable {
+ public:
+  explicit InnerCustomizable(const std::shared_ptr<Customizable>& w) {
+    iopts_.inner = w;
+    RegisterOptions(&iopts_, &inner_option_info);
+  }
+  static const char* kClassName() { return "Inner"; }
+  const char* Name() const override { return kClassName(); }
+
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == kClassName()) {
+      return true;
+    } else {
+      return Customizable::IsInstanceOf(name);
+    }
+  }
+
+ protected:
+  const Customizable* Inner() const override { return iopts_.inner.get(); }
+
+ private:
+  InnerOptions iopts_;
+};
+
+struct WrappedOptions1 {
+  static const char* kName() { return "WrappedOptions1"; }
+  int i = 42;
+};
+
+class WrappedCustomizable1 : public InnerCustomizable {
+ public:
+  explicit WrappedCustomizable1(const std::shared_ptr<Customizable>& w)
+      : InnerCustomizable(w) {
+    RegisterOptions(&wopts_, nullptr);
+  }
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "Wrapped1"; }
+
+ private:
+  WrappedOptions1 wopts_;
+};
+
+struct WrappedOptions2 {
+  static const char* kName() { return "WrappedOptions2"; }
+  std::string s = "42";
+};
+class WrappedCustomizable2 : public InnerCustomizable {
+ public:
+  explicit WrappedCustomizable2(const std::shared_ptr<Customizable>& w)
+      : InnerCustomizable(w) {}
+  const void* GetOptionsPtr(const std::string& name) const override {
+    if (name == WrappedOptions2::kName()) {
+      return &wopts_;
+    } else {
+      return InnerCustomizable::GetOptionsPtr(name);
+    }
+  }
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "Wrapped2"; }
+
+ private:
+  WrappedOptions2 wopts_;
+};
+}  // namespace
+
+TEST_F(CustomizableTest, WrappedInnerTest) {
+  std::shared_ptr<TestCustomizable> ac =
+      std::make_shared<TestCustomizable>("A");
+
+  ASSERT_TRUE(ac->IsInstanceOf("A"));
+  ASSERT_TRUE(ac->IsInstanceOf("TestCustomizable"));
+  ASSERT_EQ(ac->CheckedCast<TestCustomizable>(), ac.get());
+  ASSERT_EQ(ac->CheckedCast<InnerCustomizable>(), nullptr);
+  ASSERT_EQ(ac->CheckedCast<WrappedCustomizable1>(), nullptr);
+  ASSERT_EQ(ac->CheckedCast<WrappedCustomizable2>(), nullptr);
+  std::shared_ptr<Customizable> wc1 =
+      std::make_shared<WrappedCustomizable1>(ac);
+
+  ASSERT_TRUE(wc1->IsInstanceOf(WrappedCustomizable1::kClassName()));
+  ASSERT_EQ(wc1->CheckedCast<WrappedCustomizable1>(), wc1.get());
+  ASSERT_EQ(wc1->CheckedCast<WrappedCustomizable2>(), nullptr);
+  ASSERT_EQ(wc1->CheckedCast<InnerCustomizable>(), wc1.get());
+  ASSERT_EQ(wc1->CheckedCast<TestCustomizable>(), ac.get());
+
+  std::shared_ptr<Customizable> wc2 =
+      std::make_shared<WrappedCustomizable2>(wc1);
+  ASSERT_TRUE(wc2->IsInstanceOf(WrappedCustomizable2::kClassName()));
+  ASSERT_EQ(wc2->CheckedCast<WrappedCustomizable2>(), wc2.get());
+  ASSERT_EQ(wc2->CheckedCast<WrappedCustomizable1>(), wc1.get());
+  ASSERT_EQ(wc2->CheckedCast<InnerCustomizable>(), wc2.get());
+  ASSERT_EQ(wc2->CheckedCast<TestCustomizable>(), ac.get());
+}
+
+TEST_F(CustomizableTest, CustomizableInnerTest) {
+  std::shared_ptr<Customizable> c =
+      std::make_shared<InnerCustomizable>(std::make_shared<ACustomizable>("a"));
+  std::shared_ptr<Customizable> wc1 = std::make_shared<WrappedCustomizable1>(c);
+  std::shared_ptr<Customizable> wc2 = std::make_shared<WrappedCustomizable2>(c);
+  auto inner = c->GetOptions<InnerOptions>();
+  ASSERT_NE(inner, nullptr);
+
+  auto aopts = c->GetOptions<AOptions>();
+  ASSERT_NE(aopts, nullptr);
+  ASSERT_EQ(aopts, wc1->GetOptions<AOptions>());
+  ASSERT_EQ(aopts, wc2->GetOptions<AOptions>());
+  auto w1opts = wc1->GetOptions<WrappedOptions1>();
+  ASSERT_NE(w1opts, nullptr);
+  ASSERT_EQ(c->GetOptions<WrappedOptions1>(), nullptr);
+  ASSERT_EQ(wc2->GetOptions<WrappedOptions1>(), nullptr);
+
+  auto w2opts = wc2->GetOptions<WrappedOptions2>();
+  ASSERT_NE(w2opts, nullptr);
+  ASSERT_EQ(c->GetOptions<WrappedOptions2>(), nullptr);
+  ASSERT_EQ(wc1->GetOptions<WrappedOptions2>(), nullptr);
+}
+
+TEST_F(CustomizableTest, CopyObjectTest) {
+  class CopyCustomizable : public Customizable {
+   public:
+    CopyCustomizable() : prepared_(0), validated_(0) {}
+    const char* Name() const override { return "CopyCustomizable"; }
+
+    Status PrepareOptions(const ConfigOptions& options) override {
+      prepared_++;
+      return Customizable::PrepareOptions(options);
+    }
+    Status ValidateOptions(const DBOptions& db_opts,
+                           const ColumnFamilyOptions& cf_opts) const override {
+      validated_++;
+      return Customizable::ValidateOptions(db_opts, cf_opts);
+    }
+    int prepared_;
+    mutable int validated_;
+  };
+
+  CopyCustomizable c1;
+  ConfigOptions config_options;
+  Options options;
+
+  ASSERT_OK(c1.PrepareOptions(config_options));
+  ASSERT_OK(c1.ValidateOptions(options, options));
+  ASSERT_EQ(c1.prepared_, 1);
+  ASSERT_EQ(c1.validated_, 1);
+  CopyCustomizable c2 = c1;
+  ASSERT_OK(c1.PrepareOptions(config_options));
+  ASSERT_OK(c1.ValidateOptions(options, options));
+  ASSERT_EQ(c2.prepared_, 1);
+  ASSERT_EQ(c2.validated_, 1);
+  ASSERT_EQ(c1.prepared_, 2);
+  ASSERT_EQ(c1.validated_, 2);
+}
+
+TEST_F(CustomizableTest, TestStringDepth) {
+  ConfigOptions shallow = config_options_;
+  std::unique_ptr<Configurable> c(
+      new InnerCustomizable(std::make_shared<ACustomizable>("a")));
+  std::string opt_str;
+  shallow.depth = ConfigOptions::Depth::kDepthShallow;
+  ASSERT_OK(c->GetOptionString(shallow, &opt_str));
+  ASSERT_EQ(opt_str, "inner=a;");
+  shallow.depth = ConfigOptions::Depth::kDepthDetailed;
+  ASSERT_OK(c->GetOptionString(shallow, &opt_str));
+  ASSERT_NE(opt_str, "inner=a;");
+}
+
+// Tests that we only get a new customizable when it changes
+TEST_F(CustomizableTest, NewUniqueCustomizableTest) {
+  std::unique_ptr<Configurable> base(new SimpleConfigurable());
+  A_count = 0;
+  ASSERT_OK(base->ConfigureFromString(config_options_,
+                                      "unique={id=A_1;int=1;bool=true}"));
+  SimpleOptions* simple = base->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_NE(simple->cu, nullptr);
+  ASSERT_EQ(A_count, 1);  // Created one A
+  ASSERT_OK(base->ConfigureFromString(config_options_,
+                                      "unique={id=A_1;int=1;bool=false}"));
+  ASSERT_EQ(A_count, 2);  // Create another A_1
+  ASSERT_OK(base->ConfigureFromString(config_options_, "unique={id=}"));
+  ASSERT_EQ(simple->cu, nullptr);
+  ASSERT_EQ(A_count, 2);
+  ASSERT_OK(base->ConfigureFromString(config_options_,
+                                      "unique={id=A_2;int=1;bool=false}"));
+  ASSERT_EQ(A_count, 3);  // Created another A
+  ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id="));
+  ASSERT_EQ(simple->cu, nullptr);
+  ASSERT_OK(base->ConfigureFromString(config_options_, "unique=nullptr"));
+  ASSERT_EQ(simple->cu, nullptr);
+  ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=nullptr"));
+  ASSERT_EQ(simple->cu, nullptr);
+  ASSERT_EQ(A_count, 3);
+}
+
+TEST_F(CustomizableTest, NewEmptyUniqueTest) {
+  std::unique_ptr<Configurable> base(new SimpleConfigurable());
+  SimpleOptions* simple = base->GetOptions<SimpleOptions>();
+  ASSERT_EQ(simple->cu, nullptr);
+  simple->cu.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "unique={id=}"));
+  ASSERT_EQ(simple->cu, nullptr);
+  simple->cu.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "unique={id=nullptr}"));
+  ASSERT_EQ(simple->cu, nullptr);
+  simple->cu.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id="));
+  ASSERT_EQ(simple->cu, nullptr);
+  simple->cu.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "unique=nullptr"));
+  ASSERT_EQ(simple->cu, nullptr);
+  simple->cu.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=nullptr"));
+  ASSERT_EQ(simple->cu, nullptr);
+}
+
+TEST_F(CustomizableTest, NewEmptySharedTest) {
+  std::unique_ptr<Configurable> base(new SimpleConfigurable());
+
+  SimpleOptions* simple = base->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_EQ(simple->cs, nullptr);
+  simple->cs.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "shared={id=}"));
+  ASSERT_NE(simple, nullptr);
+  ASSERT_EQ(simple->cs, nullptr);
+  simple->cs.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "shared={id=nullptr}"));
+  ASSERT_EQ(simple->cs, nullptr);
+  simple->cs.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "shared.id="));
+  ASSERT_EQ(simple->cs, nullptr);
+  simple->cs.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "shared.id=nullptr"));
+  ASSERT_EQ(simple->cs, nullptr);
+  simple->cs.reset(new BCustomizable("B"));
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "shared=nullptr"));
+  ASSERT_EQ(simple->cs, nullptr);
+}
+
+TEST_F(CustomizableTest, NewEmptyStaticTest) {
+  std::unique_ptr<Configurable> base(new SimpleConfigurable());
+  ASSERT_OK(base->ConfigureFromString(config_options_, "pointer={id=}"));
+  SimpleOptions* simple = base->GetOptions<SimpleOptions>();
+  ASSERT_NE(simple, nullptr);
+  ASSERT_EQ(simple->cp, nullptr);
+  ASSERT_OK(base->ConfigureFromString(config_options_, "pointer={id=nullptr}"));
+  ASSERT_EQ(simple->cp, nullptr);
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "pointer="));
+  ASSERT_EQ(simple->cp, nullptr);
+  ASSERT_OK(base->ConfigureFromString(config_options_, "pointer=nullptr"));
+  ASSERT_EQ(simple->cp, nullptr);
+
+  ASSERT_OK(base->ConfigureFromString(config_options_, "pointer.id="));
+  ASSERT_EQ(simple->cp, nullptr);
+  ASSERT_OK(base->ConfigureFromString(config_options_, "pointer.id=nullptr"));
+  ASSERT_EQ(simple->cp, nullptr);
+}
+
+namespace {
+#ifndef ROCKSDB_LITE
+static std::unordered_map<std::string, OptionTypeInfo> vector_option_info = {
+    {"vector",
+     OptionTypeInfo::Vector<std::shared_ptr<TestCustomizable>>(
+         0, OptionVerificationType::kNormal,
+
+         OptionTypeFlags::kNone,
+
+         OptionTypeInfo::AsCustomSharedPtr<TestCustomizable>(
+             0, OptionVerificationType::kNormal, OptionTypeFlags::kNone))},
+};
+class VectorConfigurable : public SimpleConfigurable {
+ public:
+  VectorConfigurable() { RegisterOptions("vector", &cv, &vector_option_info); }
+  std::vector<std::shared_ptr<TestCustomizable>> cv;
+};
+}  // namespace
+
+TEST_F(CustomizableTest, VectorConfigTest) {
+  VectorConfigurable orig, copy;
+  std::shared_ptr<TestCustomizable> c1, c2;
+  ASSERT_OK(TestCustomizable::CreateFromString(config_options_, "A", &c1));
+  ASSERT_OK(TestCustomizable::CreateFromString(config_options_, "B", &c2));
+  orig.cv.push_back(c1);
+  orig.cv.push_back(c2);
+  ASSERT_OK(orig.ConfigureFromString(config_options_, "unique=A2"));
+  std::string opt_str, mismatch;
+  ASSERT_OK(orig.GetOptionString(config_options_, &opt_str));
+  ASSERT_OK(copy.ConfigureFromString(config_options_, opt_str));
+  ASSERT_TRUE(orig.AreEquivalent(config_options_, &copy, &mismatch));
+}
+
+TEST_F(CustomizableTest, NoNameTest) {
+  // If Customizables are created without names, they are not
+  // part of the serialization (since they cannot be recreated)
+  VectorConfigurable orig, copy;
+  auto sopts = orig.GetOptions<SimpleOptions>();
+  auto copts = copy.GetOptions<SimpleOptions>();
+  sopts->cu.reset(new ACustomizable(""));
+  orig.cv.push_back(std::make_shared<ACustomizable>(""));
+  orig.cv.push_back(std::make_shared<ACustomizable>("A_1"));
+  std::string opt_str, mismatch;
+  ASSERT_OK(orig.GetOptionString(config_options_, &opt_str));
+  ASSERT_OK(copy.ConfigureFromString(config_options_, opt_str));
+  ASSERT_EQ(copy.cv.size(), 1U);
+  ASSERT_EQ(copy.cv[0]->GetId(), "A_1");
+  ASSERT_EQ(copts->cu, nullptr);
+}
+
+#endif  // ROCKSDB_LITE
+
+TEST_F(CustomizableTest, IgnoreUnknownObjects) {
+  ConfigOptions ignore = config_options_;
+  std::shared_ptr<TestCustomizable> shared;
+  std::unique_ptr<TestCustomizable> unique;
+  TestCustomizable* pointer = nullptr;
+  ignore.ignore_unsupported_options = false;
+  ASSERT_NOK(
+      LoadSharedObject<TestCustomizable>(ignore, "Unknown", nullptr, &shared));
+  ASSERT_NOK(
+      LoadUniqueObject<TestCustomizable>(ignore, "Unknown", nullptr, &unique));
+  ASSERT_NOK(
+      LoadStaticObject<TestCustomizable>(ignore, "Unknown", nullptr, &pointer));
+  ASSERT_EQ(shared.get(), nullptr);
+  ASSERT_EQ(unique.get(), nullptr);
+  ASSERT_EQ(pointer, nullptr);
+  ignore.ignore_unsupported_options = true;
+  ASSERT_OK(
+      LoadSharedObject<TestCustomizable>(ignore, "Unknown", nullptr, &shared));
+  ASSERT_OK(
+      LoadUniqueObject<TestCustomizable>(ignore, "Unknown", nullptr, &unique));
+  ASSERT_OK(
+      LoadStaticObject<TestCustomizable>(ignore, "Unknown", nullptr, &pointer));
+  ASSERT_EQ(shared.get(), nullptr);
+  ASSERT_EQ(unique.get(), nullptr);
+  ASSERT_EQ(pointer, nullptr);
+  ASSERT_OK(LoadSharedObject<TestCustomizable>(ignore, "id=Unknown", nullptr,
+                                               &shared));
+  ASSERT_OK(LoadUniqueObject<TestCustomizable>(ignore, "id=Unknown", nullptr,
+                                               &unique));
+  ASSERT_OK(LoadStaticObject<TestCustomizable>(ignore, "id=Unknown", nullptr,
+                                               &pointer));
+  ASSERT_EQ(shared.get(), nullptr);
+  ASSERT_EQ(unique.get(), nullptr);
+  ASSERT_EQ(pointer, nullptr);
+  ASSERT_OK(LoadSharedObject<TestCustomizable>(ignore, "id=Unknown;option=bad",
+                                               nullptr, &shared));
+  ASSERT_OK(LoadUniqueObject<TestCustomizable>(ignore, "id=Unknown;option=bad",
+                                               nullptr, &unique));
+  ASSERT_OK(LoadStaticObject<TestCustomizable>(ignore, "id=Unknown;option=bad",
+                                               nullptr, &pointer));
+  ASSERT_EQ(shared.get(), nullptr);
+  ASSERT_EQ(unique.get(), nullptr);
+  ASSERT_EQ(pointer, nullptr);
+}
+
+TEST_F(CustomizableTest, FactoryFunctionTest) {
+  std::shared_ptr<TestCustomizable> shared;
+  std::unique_ptr<TestCustomizable> unique;
+  TestCustomizable* pointer = nullptr;
+  ConfigOptions ignore = config_options_;
+  ignore.ignore_unsupported_options = false;
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &shared));
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &unique));
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &pointer));
+  ASSERT_NE(shared.get(), nullptr);
+  ASSERT_NE(unique.get(), nullptr);
+  ASSERT_NE(pointer, nullptr);
+  delete pointer;
+  pointer = nullptr;
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &shared));
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &unique));
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &pointer));
+  ASSERT_EQ(shared.get(), nullptr);
+  ASSERT_EQ(unique.get(), nullptr);
+  ASSERT_EQ(pointer, nullptr);
+  ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &shared));
+  ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &unique));
+  ASSERT_NOK(
+      TestCustomizable::CreateFromString(ignore, "option=bad", &pointer));
+  ASSERT_EQ(pointer, nullptr);
+}
+
+TEST_F(CustomizableTest, URLFactoryTest) {
+  std::unique_ptr<TestCustomizable> unique;
+  config_options_.registry->AddLibrary("URL")->AddFactory<TestCustomizable>(
+      ObjectLibrary::PatternEntry("Z", false).AddSeparator(""),
+      [](const std::string& name, std::unique_ptr<TestCustomizable>* guard,
+         std::string* /* msg */) {
+        guard->reset(new TestCustomizable(name));
+        return guard->get();
+      });
+
+  ConfigOptions ignore = config_options_;
+  ignore.ignore_unsupported_options = false;
+  ignore.ignore_unsupported_options = false;
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z=1;x=y", &unique));
+  ASSERT_NE(unique, nullptr);
+  ASSERT_EQ(unique->GetId(), "Z=1;x=y");
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z;x=y", &unique));
+  ASSERT_NE(unique, nullptr);
+  ASSERT_EQ(unique->GetId(), "Z;x=y");
+  unique.reset();
+  ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z=1?x=y", &unique));
+  ASSERT_NE(unique, nullptr);
+  ASSERT_EQ(unique->GetId(), "Z=1?x=y");
+}
+
+TEST_F(CustomizableTest, MutableOptionsTest) {
+  static std::unordered_map<std::string, OptionTypeInfo> mutable_option_info = {
+      {"mutable",
+       OptionTypeInfo::AsCustomSharedPtr<TestCustomizable>(
+           0, OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}};
+  static std::unordered_map<std::string, OptionTypeInfo> immutable_option_info =
+      {{"immutable",
+        OptionTypeInfo::AsCustomSharedPtr<TestCustomizable>(
+            0, OptionVerificationType::kNormal, OptionTypeFlags::kAllowNull)}};
+
+  class MutableCustomizable : public Customizable {
+   private:
+    std::shared_ptr<TestCustomizable> mutable_;
+    std::shared_ptr<TestCustomizable> immutable_;
+
+   public:
+    MutableCustomizable() {
+      RegisterOptions("mutable", &mutable_, &mutable_option_info);
+      RegisterOptions("immutable", &immutable_, &immutable_option_info);
+    }
+    const char* Name() const override { return "MutableCustomizable"; }
+  };
+  MutableCustomizable mc, mc2;
+  std::string mismatch;
+  std::string opt_str;
+
+  ConfigOptions options = config_options_;
+  ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=B;}"));
+  options.mutable_options_only = true;
+  ASSERT_OK(mc.GetOptionString(options, &opt_str));
+  ASSERT_OK(mc2.ConfigureFromString(options, opt_str));
+  ASSERT_TRUE(mc.AreEquivalent(options, &mc2, &mismatch));
+
+  options.mutable_options_only = false;
+  ASSERT_OK(mc.ConfigureOption(options, "immutable", "{id=A; int=10}"));
+  auto* mm = mc.GetOptions<std::shared_ptr<TestCustomizable>>("mutable");
+  auto* im = mc.GetOptions<std::shared_ptr<TestCustomizable>>("immutable");
+  ASSERT_NE(mm, nullptr);
+  ASSERT_NE(mm->get(), nullptr);
+  ASSERT_NE(im, nullptr);
+  ASSERT_NE(im->get(), nullptr);
+
+  // Now only deal with mutable options
+  options.mutable_options_only = true;
+
+  // Setting nested immutable customizable options fails
+  ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{id=B;}"));
+  ASSERT_NOK(mc.ConfigureOption(options, "immutable.id", "B"));
+  ASSERT_NOK(mc.ConfigureOption(options, "immutable.bool", "true"));
+  ASSERT_NOK(mc.ConfigureOption(options, "immutable", "bool=true"));
+  ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{int=11;bool=true}"));
+  auto* im_a = im->get()->GetOptions<AOptions>("A");
+  ASSERT_NE(im_a, nullptr);
+  ASSERT_EQ(im_a->i, 10);
+  ASSERT_EQ(im_a->b, false);
+
+  // Setting nested mutable customizable options succeeds but the object did not
+  // change
+  ASSERT_OK(mc.ConfigureOption(options, "immutable.int", "11"));
+  ASSERT_EQ(im_a->i, 11);
+  ASSERT_EQ(im_a, im->get()->GetOptions<AOptions>("A"));
+
+  // The mutable configurable itself can be changed
+  ASSERT_OK(mc.ConfigureOption(options, "mutable.id", "A"));
+  ASSERT_OK(mc.ConfigureOption(options, "mutable", "A"));
+  ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=A}"));
+  ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}"));
+
+  // The Nested options in the mutable object can be changed
+  ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}"));
+  auto* mm_a = mm->get()->GetOptions<AOptions>("A");
+  ASSERT_EQ(mm_a->b, true);
+  ASSERT_OK(mc.ConfigureOption(options, "mutable", "{int=22;bool=false}"));
+  mm_a = mm->get()->GetOptions<AOptions>("A");
+  ASSERT_EQ(mm_a->i, 22);
+  ASSERT_EQ(mm_a->b, false);
+
+  // Only the mutable options should get serialized
+  options.mutable_options_only = false;
+  ASSERT_OK(mc.GetOptionString(options, &opt_str));
+  ASSERT_OK(mc.ConfigureOption(options, "immutable", "{id=B;}"));
+  options.mutable_options_only = true;
+
+  ASSERT_OK(mc.GetOptionString(options, &opt_str));
+  ASSERT_OK(mc2.ConfigureFromString(options, opt_str));
+  ASSERT_TRUE(mc.AreEquivalent(options, &mc2, &mismatch));
+  options.mutable_options_only = false;
+  ASSERT_FALSE(mc.AreEquivalent(options, &mc2, &mismatch));
+  ASSERT_EQ(mismatch, "immutable");
+}
+
+TEST_F(CustomizableTest, CustomManagedObjects) {
+  std::shared_ptr<TestCustomizable> object1, object2;
+  ASSERT_OK(LoadManagedObject<TestCustomizable>(
+      config_options_, "id=A_1;int=1;bool=true", &object1));
+  ASSERT_NE(object1, nullptr);
+  ASSERT_OK(
+      LoadManagedObject<TestCustomizable>(config_options_, "A_1", &object2));
+  ASSERT_EQ(object1, object2);
+  auto* opts = object2->GetOptions<AOptions>("A");
+  ASSERT_NE(opts, nullptr);
+  ASSERT_EQ(opts->i, 1);
+  ASSERT_EQ(opts->b, true);
+  ASSERT_OK(
+      LoadManagedObject<TestCustomizable>(config_options_, "A_2", &object2));
+  ASSERT_NE(object1, object2);
+  object1.reset();
+  ASSERT_OK(LoadManagedObject<TestCustomizable>(
+      config_options_, "id=A_1;int=2;bool=false", &object1));
+  opts = object1->GetOptions<AOptions>("A");
+  ASSERT_NE(opts, nullptr);
+  ASSERT_EQ(opts->i, 2);
+  ASSERT_EQ(opts->b, false);
+}
+
+TEST_F(CustomizableTest, CreateManagedObjects) {
+  class ManagedCustomizable : public Customizable {
+   public:
+    static const char* Type() { return "ManagedCustomizable"; }
+    static const char* kClassName() { return "Managed"; }
+    const char* Name() const override { return kClassName(); }
+    std::string GetId() const override { return id_; }
+    ManagedCustomizable() { id_ = GenerateIndividualId(); }
+    static Status CreateFromString(
+        const ConfigOptions& opts, const std::string& value,
+        std::shared_ptr<ManagedCustomizable>* result) {
+      return LoadManagedObject<ManagedCustomizable>(opts, value, result);
+    }
+
+   private:
+    std::string id_;
+  };
+
+  config_options_.registry->AddLibrary("Managed")
+      ->AddFactory<ManagedCustomizable>(
+          ObjectLibrary::PatternEntry::AsIndividualId(
+              ManagedCustomizable::kClassName()),
+          [](const std::string& /*name*/,
+             std::unique_ptr<ManagedCustomizable>* guard,
+             std::string* /* msg */) {
+            guard->reset(new ManagedCustomizable());
+            return guard->get();
+          });
+
+  std::shared_ptr<ManagedCustomizable> mc1, mc2, mc3, obj;
+  // Create a "deadbeef" customizable
+  std::string deadbeef =
+      std::string(ManagedCustomizable::kClassName()) + "@0xdeadbeef#0001";
+  ASSERT_OK(
+      ManagedCustomizable::CreateFromString(config_options_, deadbeef, &mc1));
+  // Create an object with the base/class name
+  ASSERT_OK(ManagedCustomizable::CreateFromString(
+      config_options_, ManagedCustomizable::kClassName(), &mc2));
+  // Creating another with the base name returns a different object
+  ASSERT_OK(ManagedCustomizable::CreateFromString(
+      config_options_, ManagedCustomizable::kClassName(), &mc3));
+  // At this point, there should be 4 managed objects (deadbeef, mc1, 2, and 3)
+  std::vector<std::shared_ptr<ManagedCustomizable>> objects;
+  ASSERT_OK(config_options_.registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 4U);
+  objects.clear();
+  // Three separate object, none of them equal
+  ASSERT_NE(mc1, mc2);
+  ASSERT_NE(mc1, mc3);
+  ASSERT_NE(mc2, mc3);
+
+  // Creating another object with "deadbeef" object
+  ASSERT_OK(
+      ManagedCustomizable::CreateFromString(config_options_, deadbeef, &obj));
+  ASSERT_EQ(mc1, obj);
+  // Create another with the IDs of the instances
+  ASSERT_OK(ManagedCustomizable::CreateFromString(config_options_, mc1->GetId(),
+                                                  &obj));
+  ASSERT_EQ(mc1, obj);
+  ASSERT_OK(ManagedCustomizable::CreateFromString(config_options_, mc2->GetId(),
+                                                  &obj));
+  ASSERT_EQ(mc2, obj);
+  ASSERT_OK(ManagedCustomizable::CreateFromString(config_options_, mc3->GetId(),
+                                                  &obj));
+  ASSERT_EQ(mc3, obj);
+
+  // Now get rid of deadbeef.  2 Objects left (m2+m3)
+  mc1.reset();
+  ASSERT_EQ(
+      config_options_.registry->GetManagedObject<ManagedCustomizable>(deadbeef),
+      nullptr);
+  ASSERT_OK(config_options_.registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 2U);
+  objects.clear();
+
+  // Associate deadbeef with #2
+  ASSERT_OK(config_options_.registry->SetManagedObject(deadbeef, mc2));
+  ASSERT_OK(
+      ManagedCustomizable::CreateFromString(config_options_, deadbeef, &obj));
+  ASSERT_EQ(mc2, obj);
+  obj.reset();
+
+  // Get the ID of mc2 and then reset it.  1 Object left
+  std::string mc2id = mc2->GetId();
+  mc2.reset();
+  ASSERT_EQ(
+      config_options_.registry->GetManagedObject<ManagedCustomizable>(mc2id),
+      nullptr);
+  ASSERT_OK(config_options_.registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 1U);
+  objects.clear();
+
+  // Create another object with the old mc2id.
+  ASSERT_OK(
+      ManagedCustomizable::CreateFromString(config_options_, mc2id, &mc2));
+  ASSERT_OK(
+      ManagedCustomizable::CreateFromString(config_options_, mc2id, &obj));
+  ASSERT_EQ(mc2, obj);
+
+  // For good measure, create another deadbeef object
+  ASSERT_OK(
+      ManagedCustomizable::CreateFromString(config_options_, deadbeef, &mc1));
+  ASSERT_OK(
+      ManagedCustomizable::CreateFromString(config_options_, deadbeef, &obj));
+  ASSERT_EQ(mc1, obj);
+}
+
+#endif  // !ROCKSDB_LITE
+
+namespace {
+class TestSecondaryCache : public SecondaryCache {
+ public:
+  static const char* kClassName() { return "Test"; }
+  const char* Name() const override { return kClassName(); }
+  Status Insert(const Slice& /*key*/, void* /*value*/,
+                const Cache::CacheItemHelper* /*helper*/) override {
+    return Status::NotSupported();
+  }
+  std::unique_ptr<SecondaryCacheResultHandle> Lookup(
+      const Slice& /*key*/, const Cache::CreateCallback& /*create_cb*/,
+      bool /*wait*/, bool /*advise_erase*/, bool& is_in_sec_cache) override {
+    is_in_sec_cache = true;
+    return nullptr;
+  }
+
+  bool SupportForceErase() const override { return false; }
+
+  void Erase(const Slice& /*key*/) override {}
+
+  // Wait for a collection of handles to become ready
+  void WaitAll(std::vector<SecondaryCacheResultHandle*> /*handles*/) override {}
+
+  std::string GetPrintableOptions() const override { return ""; }
+};
+
+class TestStatistics : public StatisticsImpl {
+ public:
+  TestStatistics() : StatisticsImpl(nullptr) {}
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "Test"; }
+};
+
+class TestFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  TestFlushBlockPolicyFactory() {}
+
+  static const char* kClassName() { return "TestFlushBlockPolicyFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& /*table_options*/,
+      const BlockBuilder& /*data_block_builder*/) const override {
+    return nullptr;
+  }
+};
+
+class MockSliceTransform : public SliceTransform {
+ public:
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "Mock"; }
+
+  Slice Transform(const Slice& /*key*/) const override { return Slice(); }
+
+  bool InDomain(const Slice& /*key*/) const override { return false; }
+
+  bool InRange(const Slice& /*key*/) const override { return false; }
+};
+
+class MockMemoryAllocator : public BaseMemoryAllocator {
+ public:
+  static const char* kClassName() { return "MockMemoryAllocator"; }
+  const char* Name() const override { return kClassName(); }
+};
+
+#ifndef ROCKSDB_LITE
+class MockEncryptionProvider : public EncryptionProvider {
+ public:
+  explicit MockEncryptionProvider(const std::string& id) : id_(id) {}
+  static const char* kClassName() { return "Mock"; }
+  const char* Name() const override { return kClassName(); }
+  size_t GetPrefixLength() const override { return 0; }
+  Status CreateNewPrefix(const std::string& /*fname*/, char* /*prefix*/,
+                         size_t /*prefixLength*/) const override {
+    return Status::NotSupported();
+  }
+
+  Status AddCipher(const std::string& /*descriptor*/, const char* /*cipher*/,
+                   size_t /*len*/, bool /*for_write*/) override {
+    return Status::NotSupported();
+  }
+
+  Status CreateCipherStream(
+      const std::string& /*fname*/, const EnvOptions& /*options*/,
+      Slice& /*prefix*/,
+      std::unique_ptr<BlockAccessCipherStream>* /*result*/) override {
+    return Status::NotSupported();
+  }
+  Status ValidateOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override {
+    if (EndsWith(id_, "://test")) {
+      return EncryptionProvider::ValidateOptions(db_opts, cf_opts);
+    } else {
+      return Status::InvalidArgument("MockProvider not initialized");
+    }
+  }
+
+ private:
+  std::string id_;
+};
+
+class MockCipher : public BlockCipher {
+ public:
+  const char* Name() const override { return "Mock"; }
+  size_t BlockSize() override { return 0; }
+  Status Encrypt(char* /*data*/) override { return Status::NotSupported(); }
+  Status Decrypt(char* data) override { return Encrypt(data); }
+};
+#endif  // ROCKSDB_LITE
+
+class DummyFileSystem : public FileSystemWrapper {
+ public:
+  explicit DummyFileSystem(const std::shared_ptr<FileSystem>& t)
+      : FileSystemWrapper(t) {}
+  static const char* kClassName() { return "DummyFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+};
+
+#ifndef ROCKSDB_LITE
+
+#endif  // ROCKSDB_LITE
+
+class MockTablePropertiesCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ private:
+ public:
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    return nullptr;
+  }
+  static const char* kClassName() { return "Mock"; }
+  const char* Name() const override { return kClassName(); }
+};
+
+class MockSstPartitionerFactory : public SstPartitionerFactory {
+ public:
+  static const char* kClassName() { return "Mock"; }
+  const char* Name() const override { return kClassName(); }
+  std::unique_ptr<SstPartitioner> CreatePartitioner(
+      const SstPartitioner::Context& /* context */) const override {
+    return nullptr;
+  }
+};
+
+class MockFileChecksumGenFactory : public FileChecksumGenFactory {
+ public:
+  static const char* kClassName() { return "Mock"; }
+  const char* Name() const override { return kClassName(); }
+  std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+      const FileChecksumGenContext& /*context*/) override {
+    return nullptr;
+  }
+};
+
+class MockFilterPolicy : public FilterPolicy {
+ public:
+  static const char* kClassName() { return "MockFilterPolicy"; }
+  const char* Name() const override { return kClassName(); }
+  const char* CompatibilityName() const override { return Name(); }
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const override {
+    return nullptr;
+  }
+  FilterBitsReader* GetFilterBitsReader(
+      const Slice& /*contents*/) const override {
+    return nullptr;
+  }
+};
+
+#ifndef ROCKSDB_LITE
+static int RegisterLocalObjects(ObjectLibrary& library,
+                                const std::string& /*arg*/) {
+  size_t num_types;
+  library.AddFactory<TableFactory>(
+      mock::MockTableFactory::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new mock::MockTableFactory());
+        return guard->get();
+      });
+  library.AddFactory<EventListener>(
+      OnFileDeletionListener::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<EventListener>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new OnFileDeletionListener());
+        return guard->get();
+      });
+  library.AddFactory<EventListener>(
+      FlushCounterListener::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<EventListener>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new FlushCounterListener());
+        return guard->get();
+      });
+  // Load any locally defined objects here
+  library.AddFactory<const SliceTransform>(
+      MockSliceTransform::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const SliceTransform>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockSliceTransform());
+        return guard->get();
+      });
+  library.AddFactory<Statistics>(
+      TestStatistics::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<Statistics>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TestStatistics());
+        return guard->get();
+      });
+
+  library.AddFactory<EncryptionProvider>(
+      ObjectLibrary::PatternEntry(MockEncryptionProvider::kClassName(), true)
+          .AddSuffix("://test"),
+      [](const std::string& uri, std::unique_ptr<EncryptionProvider>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockEncryptionProvider(uri));
+        return guard->get();
+      });
+  library.AddFactory<BlockCipher>(
+      "Mock",
+      [](const std::string& /*uri*/, std::unique_ptr<BlockCipher>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockCipher());
+        return guard->get();
+      });
+  library.AddFactory<MemoryAllocator>(
+      MockMemoryAllocator::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<MemoryAllocator>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockMemoryAllocator());
+        return guard->get();
+      });
+  library.AddFactory<FlushBlockPolicyFactory>(
+      TestFlushBlockPolicyFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<FlushBlockPolicyFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TestFlushBlockPolicyFactory());
+        return guard->get();
+      });
+
+  library.AddFactory<SecondaryCache>(
+      TestSecondaryCache::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<SecondaryCache>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TestSecondaryCache());
+        return guard->get();
+      });
+
+  library.AddFactory<FileSystem>(
+      DummyFileSystem::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<FileSystem>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new DummyFileSystem(nullptr));
+        return guard->get();
+      });
+
+  library.AddFactory<SstPartitionerFactory>(
+      MockSstPartitionerFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<SstPartitionerFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockSstPartitionerFactory());
+        return guard->get();
+      });
+
+  library.AddFactory<FileChecksumGenFactory>(
+      MockFileChecksumGenFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<FileChecksumGenFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockFileChecksumGenFactory());
+        return guard->get();
+      });
+
+  library.AddFactory<TablePropertiesCollectorFactory>(
+      MockTablePropertiesCollectorFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<TablePropertiesCollectorFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockTablePropertiesCollectorFactory());
+        return guard->get();
+      });
+
+  library.AddFactory<const FilterPolicy>(
+      MockFilterPolicy::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockFilterPolicy());
+        return guard->get();
+      });
+
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // !ROCKSDB_LITE
+}  // namespace
+
+class LoadCustomizableTest : public testing::Test {
+ public:
+  LoadCustomizableTest() {
+    config_options_.ignore_unsupported_options = false;
+    config_options_.invoke_prepare_options = false;
+  }
+  bool RegisterTests(const std::string& arg) {
+#ifndef ROCKSDB_LITE
+    config_options_.registry->AddLibrary("custom-tests",
+                                         test::RegisterTestObjects, arg);
+    config_options_.registry->AddLibrary("local-tests", RegisterLocalObjects,
+                                         arg);
+    return true;
+#else
+    (void)arg;
+    return false;
+#endif  // !ROCKSDB_LITE
+  }
+
+  template <typename T, typename U>
+  Status TestCreateStatic(const std::string& name, U** result,
+                          bool delete_result = false) {
+    Status s = T::CreateFromString(config_options_, name, result);
+    if (s.ok()) {
+      EXPECT_NE(*result, nullptr);
+      EXPECT_TRUE(*result != nullptr && (*result)->IsInstanceOf(name));
+    }
+    if (delete_result) {
+      delete *result;
+      *result = nullptr;
+    }
+    return s;
+  }
+
+  template <typename T, typename U>
+  std::shared_ptr<U> ExpectCreateShared(const std::string& name,
+                                        std::shared_ptr<U>* object) {
+    EXPECT_OK(T::CreateFromString(config_options_, name, object));
+    EXPECT_NE(object->get(), nullptr);
+    EXPECT_TRUE(object->get()->IsInstanceOf(name));
+    return *object;
+  }
+
+  template <typename T>
+  std::shared_ptr<T> ExpectCreateShared(const std::string& name) {
+    std::shared_ptr<T> result;
+    return ExpectCreateShared<T>(name, &result);
+  }
+
+  template <typename T, typename U>
+  Status TestExpectedBuiltins(
+      const std::string& mock, const std::unordered_set<std::string>& expected,
+      std::shared_ptr<U>* object, std::vector<std::string>* failed,
+      const std::function<std::vector<std::string>(const std::string&)>& alt =
+          nullptr) {
+    std::unordered_set<std::string> factories = expected;
+    Status s = T::CreateFromString(config_options_, mock, object);
+    EXPECT_NOK(s);
+#ifndef ROCKSDB_LITE
+    std::vector<std::string> builtins;
+    ObjectLibrary::Default()->GetFactoryNames(T::Type(), &builtins);
+    factories.insert(builtins.begin(), builtins.end());
+#endif  // ROCKSDB_LITE
+    Status result;
+    int created = 0;
+    for (const auto& name : factories) {
+      created++;
+      s = T::CreateFromString(config_options_, name, object);
+      if (!s.ok() && alt != nullptr) {
+        for (const auto& alt_name : alt(name)) {
+          s = T::CreateFromString(config_options_, alt_name, object);
+          if (s.ok()) {
+            break;
+          }
+        }
+      }
+      if (!s.ok()) {
+        result = s;
+        failed->push_back(name);
+      } else {
+        EXPECT_NE(object->get(), nullptr);
+        EXPECT_TRUE(object->get()->IsInstanceOf(name));
+      }
+    }
+#ifndef ROCKSDB_LITE
+    std::vector<std::string> plugins;
+    ObjectRegistry::Default()->GetFactoryNames(T::Type(), &plugins);
+    if (plugins.size() > builtins.size()) {
+      for (const auto& name : plugins) {
+        if (factories.find(name) == factories.end()) {
+          created++;
+          s = T::CreateFromString(config_options_, name, object);
+          if (!s.ok() && alt != nullptr) {
+            for (const auto& alt_name : alt(name)) {
+              s = T::CreateFromString(config_options_, alt_name, object);
+              if (s.ok()) {
+                break;
+              }
+            }
+          }
+          if (!s.ok()) {
+            failed->push_back(name);
+            if (result.ok()) {
+              result = s;
+            }
+            printf("%s: Failed creating plugin[%s]: %s\n", T::Type(),
+                   name.c_str(), s.ToString().c_str());
+          } else if (object->get() == nullptr ||
+                     !object->get()->IsInstanceOf(name)) {
+            failed->push_back(name);
+            printf("%s: Invalid plugin[%s]\n", T::Type(), name.c_str());
+          }
+        }
+      }
+    }
+    printf("%s: Created %d (expected+builtins+plugins %d+%d+%d) %d Failed\n",
+           T::Type(), created, (int)expected.size(),
+           (int)(factories.size() - expected.size()),
+           (int)(plugins.size() - builtins.size()), (int)failed->size());
+#else
+    printf("%s: Created %d (expected %d) %d Failed\n", T::Type(), created,
+           (int)expected.size(), (int)failed->size());
+#endif  // ROCKSDB_LITE
+    return result;
+  }
+
+  template <typename T>
+  Status TestSharedBuiltins(const std::string& mock,
+                            const std::string& expected,
+                            std::vector<std::string>* failed = nullptr) {
+    std::unordered_set<std::string> values;
+    if (!expected.empty()) {
+      values.insert(expected);
+    }
+    std::shared_ptr<T> object;
+    if (failed != nullptr) {
+      return TestExpectedBuiltins<T>(mock, values, &object, failed);
+    } else {
+      std::vector<std::string> failures;
+      Status s = TestExpectedBuiltins<T>(mock, values, &object, &failures);
+      EXPECT_EQ(0U, failures.size());
+      return s;
+    }
+  }
+
+  template <typename T, typename U>
+  Status TestStaticBuiltins(const std::string& mock, U** object,
+                            const std::unordered_set<std::string>& expected,
+                            std::vector<std::string>* failed,
+                            bool delete_objects = false) {
+    std::unordered_set<std::string> factories = expected;
+    Status s = TestCreateStatic<T>(mock, object, delete_objects);
+    EXPECT_NOK(s);
+#ifndef ROCKSDB_LITE
+    std::vector<std::string> builtins;
+    ObjectLibrary::Default()->GetFactoryNames(T::Type(), &builtins);
+    factories.insert(builtins.begin(), builtins.end());
+#endif  // ROCKSDB_LITE
+    int created = 0;
+    Status result;
+    for (const auto& name : factories) {
+      created++;
+      s = TestCreateStatic<T>(name, object, delete_objects);
+      if (!s.ok()) {
+        result = s;
+        failed->push_back(name);
+      }
+    }
+#ifndef ROCKSDB_LITE
+    std::vector<std::string> plugins;
+    ObjectRegistry::Default()->GetFactoryNames(T::Type(), &plugins);
+    if (plugins.size() > builtins.size()) {
+      for (const auto& name : plugins) {
+        if (factories.find(name) == factories.end()) {
+          created++;
+          s = T::CreateFromString(config_options_, name, object);
+          if (!s.ok() || *object == nullptr ||
+              !((*object)->IsInstanceOf(name))) {
+            failed->push_back(name);
+            if (result.ok() && !s.ok()) {
+              result = s;
+            }
+            printf("%s: Failed creating plugin[%s]: %s\n", T::Type(),
+                   name.c_str(), s.ToString().c_str());
+          }
+          if (delete_objects) {
+            delete *object;
+            *object = nullptr;
+          }
+        }
+      }
+    }
+    printf("%s: Created %d (expected+builtins+plugins %d+%d+%d) %d Failed\n",
+           T::Type(), created, (int)expected.size(),
+           (int)(factories.size() - expected.size()),
+           (int)(plugins.size() - builtins.size()), (int)failed->size());
+#else
+    printf("%s: Created %d (expected %d) %d Failed\n", T::Type(), created,
+           (int)expected.size(), (int)failed->size());
+#endif  // ROCKSDB_LITE
+    return result;
+  }
+
+ protected:
+  DBOptions db_opts_;
+  ColumnFamilyOptions cf_opts_;
+  ConfigOptions config_options_;
+};
+
+TEST_F(LoadCustomizableTest, LoadTableFactoryTest) {
+  ASSERT_OK(
+      TestSharedBuiltins<TableFactory>(mock::MockTableFactory::kClassName(),
+                                       TableFactory::kBlockBasedTableName()));
+#ifndef ROCKSDB_LITE
+  std::string opts_str = "table_factory=";
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options_, cf_opts_,
+      opts_str + TableFactory::kBlockBasedTableName(), &cf_opts_));
+  ASSERT_NE(cf_opts_.table_factory.get(), nullptr);
+  ASSERT_STREQ(cf_opts_.table_factory->Name(),
+               TableFactory::kBlockBasedTableName());
+#endif  // ROCKSDB_LITE
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<TableFactory>(mock::MockTableFactory::kClassName());
+#ifndef ROCKSDB_LITE
+    ASSERT_OK(GetColumnFamilyOptionsFromString(
+        config_options_, cf_opts_,
+        opts_str + mock::MockTableFactory::kClassName(), &cf_opts_));
+    ASSERT_NE(cf_opts_.table_factory.get(), nullptr);
+    ASSERT_STREQ(cf_opts_.table_factory->Name(),
+                 mock::MockTableFactory::kClassName());
+#endif  // ROCKSDB_LITE
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadFileSystemTest) {
+  ASSERT_OK(TestSharedBuiltins<FileSystem>(DummyFileSystem::kClassName(),
+                                           FileSystem::kDefaultName()));
+  if (RegisterTests("Test")) {
+    auto fs = ExpectCreateShared<FileSystem>(DummyFileSystem::kClassName());
+    ASSERT_FALSE(fs->IsInstanceOf(FileSystem::kDefaultName()));
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadSecondaryCacheTest) {
+  ASSERT_OK(
+      TestSharedBuiltins<SecondaryCache>(TestSecondaryCache::kClassName(), ""));
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<SecondaryCache>(TestSecondaryCache::kClassName());
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(LoadCustomizableTest, LoadSstPartitionerFactoryTest) {
+  ASSERT_OK(TestSharedBuiltins<SstPartitionerFactory>(
+      "Mock", SstPartitionerFixedPrefixFactory::kClassName()));
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<SstPartitionerFactory>("Mock");
+  }
+}
+#endif  // ROCKSDB_LITE
+
+TEST_F(LoadCustomizableTest, LoadChecksumGenFactoryTest) {
+  ASSERT_OK(TestSharedBuiltins<FileChecksumGenFactory>("Mock", ""));
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<FileChecksumGenFactory>("Mock");
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadTablePropertiesCollectorFactoryTest) {
+  ASSERT_OK(TestSharedBuiltins<TablePropertiesCollectorFactory>(
+      MockTablePropertiesCollectorFactory::kClassName(), ""));
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<TablePropertiesCollectorFactory>(
+        MockTablePropertiesCollectorFactory::kClassName());
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadComparatorTest) {
+  const Comparator* bytewise = BytewiseComparator();
+  const Comparator* reverse = ReverseBytewiseComparator();
+  const Comparator* result = nullptr;
+  std::unordered_set<std::string> expected = {bytewise->Name(),
+                                              reverse->Name()};
+  std::vector<std::string> failures;
+  ASSERT_OK(TestStaticBuiltins<Comparator>(
+      test::SimpleSuffixReverseComparator::kClassName(), &result, expected,
+      &failures));
+  if (RegisterTests("Test")) {
+    ASSERT_OK(TestCreateStatic<Comparator>(
+        test::SimpleSuffixReverseComparator::kClassName(), &result));
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadSliceTransformFactoryTest) {
+  std::shared_ptr<const SliceTransform> result;
+  std::vector<std::string> failures;
+  std::unordered_set<std::string> expected = {"rocksdb.Noop", "fixed",
+                                              "rocksdb.FixedPrefix", "capped",
+                                              "rocksdb.CappedPrefix"};
+  ASSERT_OK(TestExpectedBuiltins<SliceTransform>(
+      "Mock", expected, &result, &failures, [](const std::string& name) {
+        std::vector<std::string> names = {name + ":22", name + ".22"};
+        return names;
+      }));
+  ASSERT_OK(SliceTransform::CreateFromString(
+      config_options_, "rocksdb.FixedPrefix.22", &result));
+  ASSERT_NE(result.get(), nullptr);
+  ASSERT_TRUE(result->IsInstanceOf("fixed"));
+  ASSERT_OK(SliceTransform::CreateFromString(
+      config_options_, "rocksdb.CappedPrefix.22", &result));
+  ASSERT_NE(result.get(), nullptr);
+  ASSERT_TRUE(result->IsInstanceOf("capped"));
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<SliceTransform>("Mock", &result);
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadStatisticsTest) {
+  ASSERT_OK(TestSharedBuiltins<Statistics>(TestStatistics::kClassName(),
+                                           "BasicStatistics"));
+  // Empty will create a default BasicStatistics
+  ASSERT_OK(
+      Statistics::CreateFromString(config_options_, "", &db_opts_.statistics));
+  ASSERT_NE(db_opts_.statistics, nullptr);
+  ASSERT_STREQ(db_opts_.statistics->Name(), "BasicStatistics");
+
+#ifndef ROCKSDB_LITE
+  ASSERT_NOK(GetDBOptionsFromString(config_options_, db_opts_,
+                                    "statistics=Test", &db_opts_));
+  ASSERT_OK(GetDBOptionsFromString(config_options_, db_opts_,
+                                   "statistics=BasicStatistics", &db_opts_));
+  ASSERT_NE(db_opts_.statistics, nullptr);
+  ASSERT_STREQ(db_opts_.statistics->Name(), "BasicStatistics");
+
+  if (RegisterTests("test")) {
+    auto stats = ExpectCreateShared<Statistics>(TestStatistics::kClassName());
+
+    ASSERT_OK(GetDBOptionsFromString(config_options_, db_opts_,
+                                     "statistics=Test", &db_opts_));
+    ASSERT_NE(db_opts_.statistics, nullptr);
+    ASSERT_STREQ(db_opts_.statistics->Name(), TestStatistics::kClassName());
+
+    ASSERT_OK(GetDBOptionsFromString(
+        config_options_, db_opts_, "statistics={id=Test;inner=BasicStatistics}",
+        &db_opts_));
+    ASSERT_NE(db_opts_.statistics, nullptr);
+    ASSERT_STREQ(db_opts_.statistics->Name(), TestStatistics::kClassName());
+    auto* inner = db_opts_.statistics->GetOptions<std::shared_ptr<Statistics>>(
+        "StatisticsOptions");
+    ASSERT_NE(inner, nullptr);
+    ASSERT_NE(inner->get(), nullptr);
+    ASSERT_STREQ(inner->get()->Name(), "BasicStatistics");
+
+    ASSERT_OK(Statistics::CreateFromString(
+        config_options_, "id=BasicStatistics;inner=Test", &stats));
+    ASSERT_NE(stats, nullptr);
+    ASSERT_STREQ(stats->Name(), "BasicStatistics");
+    inner = stats->GetOptions<std::shared_ptr<Statistics>>("StatisticsOptions");
+    ASSERT_NE(inner, nullptr);
+    ASSERT_NE(inner->get(), nullptr);
+    ASSERT_STREQ(inner->get()->Name(), TestStatistics::kClassName());
+  }
+#endif
+}
+
+TEST_F(LoadCustomizableTest, LoadMemTableRepFactoryTest) {
+  std::unordered_set<std::string> expected = {
+      SkipListFactory::kClassName(),
+      SkipListFactory::kNickName(),
+  };
+
+  std::vector<std::string> failures;
+  std::shared_ptr<MemTableRepFactory> factory;
+  Status s = TestExpectedBuiltins<MemTableRepFactory>(
+      "SpecialSkipListFactory", expected, &factory, &failures);
+  // There is a "cuckoo" factory registered that we expect to fail.  Ignore the
+  // error if this is the one
+  if (s.ok() || failures.size() > 1 || failures[0] != "cuckoo") {
+    ASSERT_OK(s);
+  }
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<MemTableRepFactory>("SpecialSkipListFactory");
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadMergeOperatorTest) {
+  std::shared_ptr<MergeOperator> result;
+  std::vector<std::string> failed;
+  std::unordered_set<std::string> expected = {
+      "put", "put_v1",      "PutOperator", "uint64add", "UInt64AddOperator",
+      "max", "MaxOperator",
+  };
+#ifndef ROCKSDB_LITE
+  expected.insert({
+      StringAppendOperator::kClassName(),
+      StringAppendOperator::kNickName(),
+      StringAppendTESTOperator::kClassName(),
+      StringAppendTESTOperator::kNickName(),
+      SortList::kClassName(),
+      SortList::kNickName(),
+      BytesXOROperator::kClassName(),
+      BytesXOROperator::kNickName(),
+  });
+#endif  // ROCKSDB_LITE
+
+  ASSERT_OK(TestExpectedBuiltins<MergeOperator>("Changling", expected, &result,
+                                                &failed));
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<MergeOperator>("Changling");
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadCompactionFilterFactoryTest) {
+  ASSERT_OK(TestSharedBuiltins<CompactionFilterFactory>("Changling", ""));
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<CompactionFilterFactory>("Changling");
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadCompactionFilterTest) {
+  const CompactionFilter* result = nullptr;
+  std::vector<std::string> failures;
+  ASSERT_OK(TestStaticBuiltins<CompactionFilter>("Changling", &result, {},
+                                                 &failures, true));
+  if (RegisterTests("Test")) {
+    ASSERT_OK(TestCreateStatic<CompactionFilter>("Changling", &result, true));
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(LoadCustomizableTest, LoadEventListenerTest) {
+  ASSERT_OK(TestSharedBuiltins<EventListener>(
+      OnFileDeletionListener::kClassName(), ""));
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<EventListener>(OnFileDeletionListener::kClassName());
+    ExpectCreateShared<EventListener>(FlushCounterListener::kClassName());
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadEncryptionProviderTest) {
+  std::vector<std::string> failures;
+  std::shared_ptr<EncryptionProvider> result;
+  ASSERT_OK(
+      TestExpectedBuiltins<EncryptionProvider>("Mock", {}, &result, &failures));
+  if (!failures.empty()) {
+    ASSERT_EQ(failures[0], "1://test");
+    ASSERT_EQ(failures.size(), 1U);
+  }
+
+  result = ExpectCreateShared<EncryptionProvider>("CTR");
+  ASSERT_NOK(result->ValidateOptions(db_opts_, cf_opts_));
+  ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, "CTR://test",
+                                                 &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), "CTR");
+  ASSERT_OK(result->ValidateOptions(db_opts_, cf_opts_));
+
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<EncryptionProvider>("Mock");
+    ASSERT_OK(EncryptionProvider::CreateFromString(config_options_,
+                                                   "Mock://test", &result));
+    ASSERT_NE(result, nullptr);
+    ASSERT_STREQ(result->Name(), "Mock");
+    ASSERT_OK(result->ValidateOptions(db_opts_, cf_opts_));
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadEncryptionCipherTest) {
+  ASSERT_OK(TestSharedBuiltins<BlockCipher>("Mock", "ROT13"));
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<BlockCipher>("Mock");
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(LoadCustomizableTest, LoadSystemClockTest) {
+  ASSERT_OK(TestSharedBuiltins<SystemClock>(MockSystemClock::kClassName(),
+                                            SystemClock::kDefaultName()));
+  if (RegisterTests("Test")) {
+    auto result =
+        ExpectCreateShared<SystemClock>(MockSystemClock::kClassName());
+    ASSERT_FALSE(result->IsInstanceOf(SystemClock::kDefaultName()));
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadMemoryAllocatorTest) {
+  std::vector<std::string> failures;
+  Status s = TestSharedBuiltins<MemoryAllocator>(
+      MockMemoryAllocator::kClassName(), DefaultMemoryAllocator::kClassName(),
+      &failures);
+  if (failures.empty()) {
+    ASSERT_OK(s);
+  } else {
+    ASSERT_NOK(s);
+    for (const auto& failure : failures) {
+      if (failure == JemallocNodumpAllocator::kClassName()) {
+        ASSERT_FALSE(JemallocNodumpAllocator::IsSupported());
+      } else if (failure == MemkindKmemAllocator::kClassName()) {
+        ASSERT_FALSE(MemkindKmemAllocator::IsSupported());
+      } else {
+        printf("BYPASSED: %s -- %s\n", failure.c_str(), s.ToString().c_str());
+      }
+    }
+  }
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<MemoryAllocator>(MockMemoryAllocator::kClassName());
+  }
+}
+
+TEST_F(LoadCustomizableTest, LoadFilterPolicyTest) {
+  const std::string kAutoBloom = BloomFilterPolicy::kClassName();
+  const std::string kAutoRibbon = RibbonFilterPolicy::kClassName();
+
+  std::shared_ptr<const FilterPolicy> result;
+  std::vector<std::string> failures;
+  std::unordered_set<std::string> expected = {
+      ReadOnlyBuiltinFilterPolicy::kClassName(),
+  };
+
+#ifndef ROCKSDB_LITE
+  expected.insert({
+      kAutoBloom,
+      BloomFilterPolicy::kNickName(),
+      kAutoRibbon,
+      RibbonFilterPolicy::kNickName(),
+  });
+#endif  // ROCKSDB_LITE
+  ASSERT_OK(TestExpectedBuiltins<const FilterPolicy>(
+      "Mock", expected, &result, &failures, [](const std::string& name) {
+        std::vector<std::string> names = {name + ":1.234"};
+        return names;
+      }));
+#ifndef ROCKSDB_LITE
+  ASSERT_OK(FilterPolicy::CreateFromString(
+      config_options_, kAutoBloom + ":1.234:false", &result));
+  ASSERT_NE(result.get(), nullptr);
+  ASSERT_TRUE(result->IsInstanceOf(kAutoBloom));
+  ASSERT_OK(FilterPolicy::CreateFromString(
+      config_options_, kAutoBloom + ":1.234:false", &result));
+  ASSERT_NE(result.get(), nullptr);
+  ASSERT_TRUE(result->IsInstanceOf(kAutoBloom));
+  ASSERT_OK(FilterPolicy::CreateFromString(config_options_,
+                                           kAutoRibbon + ":1.234:-1", &result));
+  ASSERT_NE(result.get(), nullptr);
+  ASSERT_TRUE(result->IsInstanceOf(kAutoRibbon));
+  ASSERT_OK(FilterPolicy::CreateFromString(config_options_,
+                                           kAutoRibbon + ":1.234:56", &result));
+  ASSERT_NE(result.get(), nullptr);
+  ASSERT_TRUE(result->IsInstanceOf(kAutoRibbon));
+#endif  // ROCKSDB_LITE
+
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<FilterPolicy>(MockFilterPolicy::kClassName(), &result);
+  }
+
+  std::shared_ptr<TableFactory> table;
+
+#ifndef ROCKSDB_LITE
+  std::string table_opts = "id=BlockBasedTable; filter_policy=";
+  ASSERT_OK(TableFactory::CreateFromString(config_options_,
+                                           table_opts + "nullptr", &table));
+  ASSERT_NE(table.get(), nullptr);
+  auto bbto = table->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(bbto, nullptr);
+  ASSERT_EQ(bbto->filter_policy.get(), nullptr);
+  ASSERT_OK(TableFactory::CreateFromString(
+      config_options_, table_opts + ReadOnlyBuiltinFilterPolicy::kClassName(),
+      &table));
+  bbto = table->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(bbto, nullptr);
+  ASSERT_NE(bbto->filter_policy.get(), nullptr);
+  ASSERT_STREQ(bbto->filter_policy->Name(),
+               ReadOnlyBuiltinFilterPolicy::kClassName());
+  ASSERT_OK(TableFactory::CreateFromString(
+      config_options_, table_opts + MockFilterPolicy::kClassName(), &table));
+  bbto = table->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(bbto, nullptr);
+  ASSERT_NE(bbto->filter_policy.get(), nullptr);
+  ASSERT_TRUE(
+      bbto->filter_policy->IsInstanceOf(MockFilterPolicy::kClassName()));
+#endif  // ROCKSDB_LITE
+}
+
+TEST_F(LoadCustomizableTest, LoadFlushBlockPolicyFactoryTest) {
+  std::shared_ptr<FlushBlockPolicyFactory> result;
+  std::shared_ptr<TableFactory> table;
+  std::vector<std::string> failed;
+  std::unordered_set<std::string> expected = {
+      FlushBlockBySizePolicyFactory::kClassName(),
+      FlushBlockEveryKeyPolicyFactory::kClassName(),
+  };
+
+  ASSERT_OK(TestExpectedBuiltins<FlushBlockPolicyFactory>(
+      TestFlushBlockPolicyFactory::kClassName(), expected, &result, &failed));
+
+  // An empty policy name creates a BySize policy
+  ASSERT_OK(
+      FlushBlockPolicyFactory::CreateFromString(config_options_, "", &result));
+  ASSERT_NE(result, nullptr);
+  ASSERT_STREQ(result->Name(), FlushBlockBySizePolicyFactory::kClassName());
+
+#ifndef ROCKSDB_LITE
+  std::string table_opts = "id=BlockBasedTable; flush_block_policy_factory=";
+  ASSERT_OK(TableFactory::CreateFromString(
+      config_options_,
+      table_opts + FlushBlockEveryKeyPolicyFactory::kClassName(), &table));
+  auto bbto = table->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(bbto, nullptr);
+  ASSERT_NE(bbto->flush_block_policy_factory.get(), nullptr);
+  ASSERT_STREQ(bbto->flush_block_policy_factory->Name(),
+               FlushBlockEveryKeyPolicyFactory::kClassName());
+  if (RegisterTests("Test")) {
+    ExpectCreateShared<FlushBlockPolicyFactory>(
+        TestFlushBlockPolicyFactory::kClassName());
+    ASSERT_OK(TableFactory::CreateFromString(
+        config_options_, table_opts + TestFlushBlockPolicyFactory::kClassName(),
+        &table));
+    bbto = table->GetOptions<BlockBasedTableOptions>();
+    ASSERT_NE(bbto, nullptr);
+    ASSERT_NE(bbto->flush_block_policy_factory.get(), nullptr);
+    ASSERT_STREQ(bbto->flush_block_policy_factory->Name(),
+                 TestFlushBlockPolicyFactory::kClassName());
+  }
+#endif  // ROCKSDB_LITE
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+#ifdef GFLAGS
+  ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/options/db_options.cc b/src/rocksdb/options/db_options.cc
new file mode 100644
index 000000000..e0bc892fc
--- /dev/null
+++ b/src/rocksdb/options/db_options.cc
@@ -0,0 +1,1086 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "options/db_options.h"
+
+#include <cinttypes>
+
+#include "logging/logging.h"
+#include "options/configurable_helper.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/configurable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/options_type.h"
+#include "rocksdb/wal_filter.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+static std::unordered_map<std::string, WALRecoveryMode>
+    wal_recovery_mode_string_map = {
+        {"kTolerateCorruptedTailRecords",
+         WALRecoveryMode::kTolerateCorruptedTailRecords},
+        {"kAbsoluteConsistency", WALRecoveryMode::kAbsoluteConsistency},
+        {"kPointInTimeRecovery", WALRecoveryMode::kPointInTimeRecovery},
+        {"kSkipAnyCorruptedRecords",
+         WALRecoveryMode::kSkipAnyCorruptedRecords}};
+
+static std::unordered_map<std::string, DBOptions::AccessHint>
+    access_hint_string_map = {{"NONE", DBOptions::AccessHint::NONE},
+                              {"NORMAL", DBOptions::AccessHint::NORMAL},
+                              {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL},
+                              {"WILLNEED", DBOptions::AccessHint::WILLNEED}};
+
+static std::unordered_map<std::string, CacheTier> cache_tier_string_map = {
+    {"kVolatileTier", CacheTier::kVolatileTier},
+    {"kNonVolatileBlockTier", CacheTier::kNonVolatileBlockTier}};
+
+static std::unordered_map<std::string, InfoLogLevel> info_log_level_string_map =
+    {{"DEBUG_LEVEL", InfoLogLevel::DEBUG_LEVEL},
+     {"INFO_LEVEL", InfoLogLevel::INFO_LEVEL},
+     {"WARN_LEVEL", InfoLogLevel::WARN_LEVEL},
+     {"ERROR_LEVEL", InfoLogLevel::ERROR_LEVEL},
+     {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL},
+     {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    db_mutable_options_type_info = {
+        {"allow_os_buffer",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"base_background_compactions",
+         {0, OptionType::kInt, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kMutable}},
+        {"max_background_jobs",
+         {offsetof(struct MutableDBOptions, max_background_jobs),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_background_compactions",
+         {offsetof(struct MutableDBOptions, max_background_compactions),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_subcompactions",
+         {offsetof(struct MutableDBOptions, max_subcompactions),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"avoid_flush_during_shutdown",
+         {offsetof(struct MutableDBOptions, avoid_flush_during_shutdown),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"writable_file_max_buffer_size",
+         {offsetof(struct MutableDBOptions, writable_file_max_buffer_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"delayed_write_rate",
+         {offsetof(struct MutableDBOptions, delayed_write_rate),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_total_wal_size",
+         {offsetof(struct MutableDBOptions, max_total_wal_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"delete_obsolete_files_period_micros",
+         {offsetof(struct MutableDBOptions,
+                   delete_obsolete_files_period_micros),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"stats_dump_period_sec",
+         {offsetof(struct MutableDBOptions, stats_dump_period_sec),
+          OptionType::kUInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"stats_persist_period_sec",
+         {offsetof(struct MutableDBOptions, stats_persist_period_sec),
+          OptionType::kUInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"stats_history_buffer_size",
+         {offsetof(struct MutableDBOptions, stats_history_buffer_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_open_files",
+         {offsetof(struct MutableDBOptions, max_open_files), OptionType::kInt,
+          OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
+        {"bytes_per_sync",
+         {offsetof(struct MutableDBOptions, bytes_per_sync),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"wal_bytes_per_sync",
+         {offsetof(struct MutableDBOptions, wal_bytes_per_sync),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"strict_bytes_per_sync",
+         {offsetof(struct MutableDBOptions, strict_bytes_per_sync),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"compaction_readahead_size",
+         {offsetof(struct MutableDBOptions, compaction_readahead_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_background_flushes",
+         {offsetof(struct MutableDBOptions, max_background_flushes),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    db_immutable_options_type_info = {
+        /*
+         // not yet supported
+          std::shared_ptr<Cache> row_cache;
+          std::shared_ptr<DeleteScheduler> delete_scheduler;
+          std::shared_ptr<Logger> info_log;
+          std::shared_ptr<RateLimiter> rate_limiter;
+          std::shared_ptr<Statistics> statistics;
+          std::vector<DbPath> db_paths;
+          FileTypeSet checksum_handoff_file_types;
+         */
+        {"advise_random_on_open",
+         {offsetof(struct ImmutableDBOptions, advise_random_on_open),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"allow_mmap_reads",
+         {offsetof(struct ImmutableDBOptions, allow_mmap_reads),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"allow_fallocate",
+         {offsetof(struct ImmutableDBOptions, allow_fallocate),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"allow_mmap_writes",
+         {offsetof(struct ImmutableDBOptions, allow_mmap_writes),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"use_direct_reads",
+         {offsetof(struct ImmutableDBOptions, use_direct_reads),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"use_direct_writes",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"use_direct_io_for_flush_and_compaction",
+         {offsetof(struct ImmutableDBOptions,
+                   use_direct_io_for_flush_and_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"allow_2pc",
+         {offsetof(struct ImmutableDBOptions, allow_2pc), OptionType::kBoolean,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+        {"wal_filter",
+         OptionTypeInfo::AsCustomRawPtr<WalFilter>(
+             offsetof(struct ImmutableDBOptions, wal_filter),
+             OptionVerificationType::kByName,
+             (OptionTypeFlags::kAllowNull | OptionTypeFlags::kCompareNever))},
+        {"create_if_missing",
+         {offsetof(struct ImmutableDBOptions, create_if_missing),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"create_missing_column_families",
+         {offsetof(struct ImmutableDBOptions, create_missing_column_families),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"disableDataSync",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"disable_data_sync",  // for compatibility
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"enable_thread_tracking",
+         {offsetof(struct ImmutableDBOptions, enable_thread_tracking),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"error_if_exists",
+         {offsetof(struct ImmutableDBOptions, error_if_exists),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"experimental_allow_mempurge",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"experimental_mempurge_policy",
+         {0, OptionType::kString, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"experimental_mempurge_threshold",
+         {0, OptionType::kDouble, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"is_fd_close_on_exec",
+         {offsetof(struct ImmutableDBOptions, is_fd_close_on_exec),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"paranoid_checks",
+         {offsetof(struct ImmutableDBOptions, paranoid_checks),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"flush_verify_memtable_count",
+         {offsetof(struct ImmutableDBOptions, flush_verify_memtable_count),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"track_and_verify_wals_in_manifest",
+         {offsetof(struct ImmutableDBOptions,
+                   track_and_verify_wals_in_manifest),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"verify_sst_unique_id_in_manifest",
+         {offsetof(struct ImmutableDBOptions, verify_sst_unique_id_in_manifest),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"skip_log_error_on_recovery",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"skip_stats_update_on_db_open",
+         {offsetof(struct ImmutableDBOptions, skip_stats_update_on_db_open),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"skip_checking_sst_file_sizes_on_db_open",
+         {offsetof(struct ImmutableDBOptions,
+                   skip_checking_sst_file_sizes_on_db_open),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"new_table_reader_for_compaction_inputs",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"random_access_max_buffer_size",
+         {offsetof(struct ImmutableDBOptions, random_access_max_buffer_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"use_adaptive_mutex",
+         {offsetof(struct ImmutableDBOptions, use_adaptive_mutex),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"use_fsync",
+         {offsetof(struct ImmutableDBOptions, use_fsync), OptionType::kBoolean,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+        {"max_file_opening_threads",
+         {offsetof(struct ImmutableDBOptions, max_file_opening_threads),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"table_cache_numshardbits",
+         {offsetof(struct ImmutableDBOptions, table_cache_numshardbits),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"db_write_buffer_size",
+         {offsetof(struct ImmutableDBOptions, db_write_buffer_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"keep_log_file_num",
+         {offsetof(struct ImmutableDBOptions, keep_log_file_num),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"recycle_log_file_num",
+         {offsetof(struct ImmutableDBOptions, recycle_log_file_num),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"log_file_time_to_roll",
+         {offsetof(struct ImmutableDBOptions, log_file_time_to_roll),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"manifest_preallocation_size",
+         {offsetof(struct ImmutableDBOptions, manifest_preallocation_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"max_log_file_size",
+         {offsetof(struct ImmutableDBOptions, max_log_file_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"db_log_dir",
+         {offsetof(struct ImmutableDBOptions, db_log_dir), OptionType::kString,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+        {"wal_dir",
+         {offsetof(struct ImmutableDBOptions, wal_dir), OptionType::kString,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+        {"WAL_size_limit_MB",
+         {offsetof(struct ImmutableDBOptions, WAL_size_limit_MB),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"WAL_ttl_seconds",
+         {offsetof(struct ImmutableDBOptions, WAL_ttl_seconds),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"max_manifest_file_size",
+         {offsetof(struct ImmutableDBOptions, max_manifest_file_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"persist_stats_to_disk",
+         {offsetof(struct ImmutableDBOptions, persist_stats_to_disk),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"fail_if_options_file_error",
+         {offsetof(struct ImmutableDBOptions, fail_if_options_file_error),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"enable_pipelined_write",
+         {offsetof(struct ImmutableDBOptions, enable_pipelined_write),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"unordered_write",
+         {offsetof(struct ImmutableDBOptions, unordered_write),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"allow_concurrent_memtable_write",
+         {offsetof(struct ImmutableDBOptions, allow_concurrent_memtable_write),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"wal_recovery_mode",
+         OptionTypeInfo::Enum<WALRecoveryMode>(
+             offsetof(struct ImmutableDBOptions, wal_recovery_mode),
+             &wal_recovery_mode_string_map)},
+        {"enable_write_thread_adaptive_yield",
+         {offsetof(struct ImmutableDBOptions,
+                   enable_write_thread_adaptive_yield),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"write_thread_slow_yield_usec",
+         {offsetof(struct ImmutableDBOptions, write_thread_slow_yield_usec),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"max_write_batch_group_size_bytes",
+         {offsetof(struct ImmutableDBOptions, max_write_batch_group_size_bytes),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"write_thread_max_yield_usec",
+         {offsetof(struct ImmutableDBOptions, write_thread_max_yield_usec),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"access_hint_on_compaction_start",
+         OptionTypeInfo::Enum<DBOptions::AccessHint>(
+             offsetof(struct ImmutableDBOptions,
+                      access_hint_on_compaction_start),
+             &access_hint_string_map)},
+        {"info_log_level",
+         OptionTypeInfo::Enum<InfoLogLevel>(
+             offsetof(struct ImmutableDBOptions, info_log_level),
+             &info_log_level_string_map)},
+        {"dump_malloc_stats",
+         {offsetof(struct ImmutableDBOptions, dump_malloc_stats),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"avoid_flush_during_recovery",
+         {offsetof(struct ImmutableDBOptions, avoid_flush_during_recovery),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"allow_ingest_behind",
+         {offsetof(struct ImmutableDBOptions, allow_ingest_behind),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"preserve_deletes",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"concurrent_prepare",  // Deprecated by two_write_queues
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"two_write_queues",
+         {offsetof(struct ImmutableDBOptions, two_write_queues),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"manual_wal_flush",
+         {offsetof(struct ImmutableDBOptions, manual_wal_flush),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"wal_compression",
+         {offsetof(struct ImmutableDBOptions, wal_compression),
+          OptionType::kCompressionType, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"seq_per_batch",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"atomic_flush",
+         {offsetof(struct ImmutableDBOptions, atomic_flush),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"avoid_unnecessary_blocking_io",
+         {offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"write_dbid_to_manifest",
+         {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"log_readahead_size",
+         {offsetof(struct ImmutableDBOptions, log_readahead_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"best_efforts_recovery",
+         {offsetof(struct ImmutableDBOptions, best_efforts_recovery),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"max_bgerror_resume_count",
+         {offsetof(struct ImmutableDBOptions, max_bgerror_resume_count),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bgerror_resume_retry_interval",
+         {offsetof(struct ImmutableDBOptions, bgerror_resume_retry_interval),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"db_host_id",
+         {offsetof(struct ImmutableDBOptions, db_host_id), OptionType::kString,
+          OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever}},
+        // Temporarily deprecated due to race conditions (examples in PR 10375).
+        {"rate_limiter",
+         {offsetof(struct ImmutableDBOptions, rate_limiter),
+          OptionType::kUnknown, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever}},
+        // The following properties were handled as special cases in ParseOption
+        // This means that the properties could be read from the options file
+        // but never written to the file or compared to each other.
+        {"rate_limiter_bytes_per_sec",
+         {offsetof(struct ImmutableDBOptions, rate_limiter),
+          OptionType::kUnknown, OptionVerificationType::kNormal,
+          (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever),
+          // Parse the input value as a RateLimiter
+          [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+             const std::string& value, void* addr) {
+            auto limiter = static_cast<std::shared_ptr<RateLimiter>*>(addr);
+            limiter->reset(NewGenericRateLimiter(
+                static_cast<int64_t>(ParseUint64(value))));
+            return Status::OK();
+          }}},
+        {"env",  //**TODO: Should this be kCustomizable?
+         OptionTypeInfo(
+             offsetof(struct ImmutableDBOptions, env), OptionType::kUnknown,
+             OptionVerificationType::kNormal,
+             (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever))
+             .SetParseFunc([](const ConfigOptions& opts,
+                              const std::string& /*name*/,
+                              const std::string& value, void* addr) {
+               // Parse the input value as an Env
+               auto old_env = static_cast<Env**>(addr);  // Get the old value
+               Env* new_env = *old_env;                  // Set new to old
+               Status s = Env::CreateFromString(opts, value,
+                                                &new_env);  // Update new value
+               if (s.ok()) {                                // It worked
+                 *old_env = new_env;  // Update the old one
+               }
+               return s;
+             })
+             .SetPrepareFunc([](const ConfigOptions& opts,
+                                const std::string& /*name*/, void* addr) {
+               auto env = static_cast<Env**>(addr);
+               return (*env)->PrepareOptions(opts);
+             })
+             .SetValidateFunc([](const DBOptions& db_opts,
+                                 const ColumnFamilyOptions& cf_opts,
+                                 const std::string& /*name*/,
+                                 const void* addr) {
+               const auto env = static_cast<const Env* const*>(addr);
+               return (*env)->ValidateOptions(db_opts, cf_opts);
+             })},
+        {"allow_data_in_errors",
+         {offsetof(struct ImmutableDBOptions, allow_data_in_errors),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_checksum_gen_factory",
+         OptionTypeInfo::AsCustomSharedPtr<FileChecksumGenFactory>(
+             offsetof(struct ImmutableDBOptions, file_checksum_gen_factory),
+             OptionVerificationType::kByNameAllowFromNull,
+             OptionTypeFlags::kAllowNull)},
+        {"statistics",
+         OptionTypeInfo::AsCustomSharedPtr<Statistics>(
+             // Statistics should not be compared and can be null
+             // Statistics are maked "don't serialize" until they can be shared
+             // between DBs
+             offsetof(struct ImmutableDBOptions, statistics),
+             OptionVerificationType::kNormal,
+             OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize |
+                 OptionTypeFlags::kAllowNull)},
+        // Allow EventListeners that have a non-empty Name() to be read/written
+        // as options Each listener will either be
+        // - A simple name (e.g. "MyEventListener")
+        // - A name with properties (e.g. "{id=MyListener1; timeout=60}"
+        // Multiple listeners will be separated by a ":":
+        //   - "MyListener0;{id=MyListener1; timeout=60}
+        {"listeners",
+         {offsetof(struct ImmutableDBOptions, listeners), OptionType::kVector,
+          OptionVerificationType::kByNameAllowNull,
+          OptionTypeFlags::kCompareNever,
+          [](const ConfigOptions& opts, const std::string& /*name*/,
+             const std::string& value, void* addr) {
+            ConfigOptions embedded = opts;
+            embedded.ignore_unsupported_options = true;
+            std::vector<std::shared_ptr<EventListener>> listeners;
+            Status s;
+            for (size_t start = 0, end = 0;
+                 s.ok() && start < value.size() && end != std::string::npos;
+                 start = end + 1) {
+              std::string token;
+              s = OptionTypeInfo::NextToken(value, ':', start, &end, &token);
+              if (s.ok() && !token.empty()) {
+                std::shared_ptr<EventListener> listener;
+                s = EventListener::CreateFromString(embedded, token, &listener);
+                if (s.ok() && listener != nullptr) {
+                  listeners.push_back(listener);
+                }
+              }
+            }
+            if (s.ok()) {  // It worked
+              *(static_cast<std::vector<std::shared_ptr<EventListener>>*>(
+                  addr)) = listeners;
+            }
+            return s;
+          },
+          [](const ConfigOptions& opts, const std::string& /*name*/,
+             const void* addr, std::string* value) {
+            const auto listeners =
+                static_cast<const std::vector<std::shared_ptr<EventListener>>*>(
+                    addr);
+            ConfigOptions embedded = opts;
+            embedded.delimiter = ";";
+            int printed = 0;
+            for (const auto& listener : *listeners) {
+              auto id = listener->GetId();
+              if (!id.empty()) {
+                std::string elem_str = listener->ToString(embedded, "");
+                if (printed++ == 0) {
+                  value->append("{");
+                } else {
+                  value->append(":");
+                }
+                value->append(elem_str);
+              }
+            }
+            if (printed > 0) {
+              value->append("}");
+            }
+            return Status::OK();
+          },
+          nullptr}},
+        {"lowest_used_cache_tier",
+         OptionTypeInfo::Enum<CacheTier>(
+             offsetof(struct ImmutableDBOptions, lowest_used_cache_tier),
+             &cache_tier_string_map, OptionTypeFlags::kNone)},
+        {"enforce_single_del_contracts",
+         {offsetof(struct ImmutableDBOptions, enforce_single_del_contracts),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+};
+
+const std::string OptionsHelper::kDBOptionsName = "DBOptions";
+
+class MutableDBConfigurable : public Configurable {
+ public:
+  explicit MutableDBConfigurable(
+      const MutableDBOptions& mdb,
+      const std::unordered_map<std::string, std::string>* map = nullptr)
+      : mutable_(mdb), opt_map_(map) {
+    RegisterOptions(&mutable_, &db_mutable_options_type_info);
+  }
+
+  bool OptionsAreEqual(const ConfigOptions& config_options,
+                       const OptionTypeInfo& opt_info,
+                       const std::string& opt_name, const void* const this_ptr,
+                       const void* const that_ptr,
+                       std::string* mismatch) const override {
+    bool equals = opt_info.AreEqual(config_options, opt_name, this_ptr,
+                                    that_ptr, mismatch);
+    if (!equals && opt_info.IsByName()) {
+      if (opt_map_ == nullptr) {
+        equals = true;
+      } else {
+        const auto& iter = opt_map_->find(opt_name);
+        if (iter == opt_map_->end()) {
+          equals = true;
+        } else {
+          equals = opt_info.AreEqualByName(config_options, opt_name, this_ptr,
+                                           iter->second);
+        }
+      }
+      if (equals) {  // False alarm, clear mismatch
+        *mismatch = "";
+      }
+    }
+    if (equals && opt_info.IsConfigurable() && opt_map_ != nullptr) {
+      const auto* this_config = opt_info.AsRawPointer<Configurable>(this_ptr);
+      if (this_config == nullptr) {
+        const auto& iter = opt_map_->find(opt_name);
+        // If the name exists in the map and is not empty/null,
+        // then the this_config should be set.
+        if (iter != opt_map_->end() && !iter->second.empty() &&
+            iter->second != kNullptrString) {
+          *mismatch = opt_name;
+          equals = false;
+        }
+      }
+    }
+    return equals;
+  }
+
+ protected:
+  MutableDBOptions mutable_;
+  const std::unordered_map<std::string, std::string>* opt_map_;
+};
+
+class DBOptionsConfigurable : public MutableDBConfigurable {
+ public:
+  explicit DBOptionsConfigurable(
+      const DBOptions& opts,
+      const std::unordered_map<std::string, std::string>* map = nullptr)
+      : MutableDBConfigurable(MutableDBOptions(opts), map), db_options_(opts) {
+    // The ImmutableDBOptions currently requires the env to be non-null.  Make
+    // sure it is
+    if (opts.env != nullptr) {
+      immutable_ = ImmutableDBOptions(opts);
+    } else {
+      DBOptions copy = opts;
+      copy.env = Env::Default();
+      immutable_ = ImmutableDBOptions(copy);
+    }
+    RegisterOptions(&immutable_, &db_immutable_options_type_info);
+  }
+
+ protected:
+  Status ConfigureOptions(
+      const ConfigOptions& config_options,
+      const std::unordered_map<std::string, std::string>& opts_map,
+      std::unordered_map<std::string, std::string>* unused) override {
+    Status s = Configurable::ConfigureOptions(config_options, opts_map, unused);
+    if (s.ok()) {
+      db_options_ = BuildDBOptions(immutable_, mutable_);
+      s = PrepareOptions(config_options);
+    }
+    return s;
+  }
+
+  const void* GetOptionsPtr(const std::string& name) const override {
+    if (name == OptionsHelper::kDBOptionsName) {
+      return &db_options_;
+    } else {
+      return MutableDBConfigurable::GetOptionsPtr(name);
+    }
+  }
+
+ private:
+  ImmutableDBOptions immutable_;
+  DBOptions db_options_;
+};
+
+std::unique_ptr<Configurable> DBOptionsAsConfigurable(
+    const MutableDBOptions& opts) {
+  std::unique_ptr<Configurable> ptr(new MutableDBConfigurable(opts));
+  return ptr;
+}
+std::unique_ptr<Configurable> DBOptionsAsConfigurable(
+    const DBOptions& opts,
+    const std::unordered_map<std::string, std::string>* opt_map) {
+  std::unique_ptr<Configurable> ptr(new DBOptionsConfigurable(opts, opt_map));
+  return ptr;
+}
+#endif  // ROCKSDB_LITE
+
+ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(Options()) {}
+
+ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
+    : create_if_missing(options.create_if_missing),
+      create_missing_column_families(options.create_missing_column_families),
+      error_if_exists(options.error_if_exists),
+      paranoid_checks(options.paranoid_checks),
+      flush_verify_memtable_count(options.flush_verify_memtable_count),
+      track_and_verify_wals_in_manifest(
+          options.track_and_verify_wals_in_manifest),
+      verify_sst_unique_id_in_manifest(
+          options.verify_sst_unique_id_in_manifest),
+      env(options.env),
+      rate_limiter(options.rate_limiter),
+      sst_file_manager(options.sst_file_manager),
+      info_log(options.info_log),
+      info_log_level(options.info_log_level),
+      max_file_opening_threads(options.max_file_opening_threads),
+      statistics(options.statistics),
+      use_fsync(options.use_fsync),
+      db_paths(options.db_paths),
+      db_log_dir(options.db_log_dir),
+      wal_dir(options.wal_dir),
+      max_log_file_size(options.max_log_file_size),
+      log_file_time_to_roll(options.log_file_time_to_roll),
+      keep_log_file_num(options.keep_log_file_num),
+      recycle_log_file_num(options.recycle_log_file_num),
+      max_manifest_file_size(options.max_manifest_file_size),
+      table_cache_numshardbits(options.table_cache_numshardbits),
+      WAL_ttl_seconds(options.WAL_ttl_seconds),
+      WAL_size_limit_MB(options.WAL_size_limit_MB),
+      max_write_batch_group_size_bytes(
+          options.max_write_batch_group_size_bytes),
+      manifest_preallocation_size(options.manifest_preallocation_size),
+      allow_mmap_reads(options.allow_mmap_reads),
+      allow_mmap_writes(options.allow_mmap_writes),
+      use_direct_reads(options.use_direct_reads),
+      use_direct_io_for_flush_and_compaction(
+          options.use_direct_io_for_flush_and_compaction),
+      allow_fallocate(options.allow_fallocate),
+      is_fd_close_on_exec(options.is_fd_close_on_exec),
+      advise_random_on_open(options.advise_random_on_open),
+      db_write_buffer_size(options.db_write_buffer_size),
+      write_buffer_manager(options.write_buffer_manager),
+      access_hint_on_compaction_start(options.access_hint_on_compaction_start),
+      random_access_max_buffer_size(options.random_access_max_buffer_size),
+      use_adaptive_mutex(options.use_adaptive_mutex),
+      listeners(options.listeners),
+      enable_thread_tracking(options.enable_thread_tracking),
+      enable_pipelined_write(options.enable_pipelined_write),
+      unordered_write(options.unordered_write),
+      allow_concurrent_memtable_write(options.allow_concurrent_memtable_write),
+      enable_write_thread_adaptive_yield(
+          options.enable_write_thread_adaptive_yield),
+      write_thread_max_yield_usec(options.write_thread_max_yield_usec),
+      write_thread_slow_yield_usec(options.write_thread_slow_yield_usec),
+      skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
+      skip_checking_sst_file_sizes_on_db_open(
+          options.skip_checking_sst_file_sizes_on_db_open),
+      wal_recovery_mode(options.wal_recovery_mode),
+      allow_2pc(options.allow_2pc),
+      row_cache(options.row_cache),
+#ifndef ROCKSDB_LITE
+      wal_filter(options.wal_filter),
+#endif  // ROCKSDB_LITE
+      fail_if_options_file_error(options.fail_if_options_file_error),
+      dump_malloc_stats(options.dump_malloc_stats),
+      avoid_flush_during_recovery(options.avoid_flush_during_recovery),
+      allow_ingest_behind(options.allow_ingest_behind),
+      two_write_queues(options.two_write_queues),
+      manual_wal_flush(options.manual_wal_flush),
+      wal_compression(options.wal_compression),
+      atomic_flush(options.atomic_flush),
+      avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io),
+      persist_stats_to_disk(options.persist_stats_to_disk),
+      write_dbid_to_manifest(options.write_dbid_to_manifest),
+      log_readahead_size(options.log_readahead_size),
+      file_checksum_gen_factory(options.file_checksum_gen_factory),
+      best_efforts_recovery(options.best_efforts_recovery),
+      max_bgerror_resume_count(options.max_bgerror_resume_count),
+      bgerror_resume_retry_interval(options.bgerror_resume_retry_interval),
+      allow_data_in_errors(options.allow_data_in_errors),
+      db_host_id(options.db_host_id),
+      checksum_handoff_file_types(options.checksum_handoff_file_types),
+      lowest_used_cache_tier(options.lowest_used_cache_tier),
+      compaction_service(options.compaction_service),
+      enforce_single_del_contracts(options.enforce_single_del_contracts) {
+  fs = env->GetFileSystem();
+  clock = env->GetSystemClock().get();
+  logger = info_log.get();
+  stats = statistics.get();
+}
+
+void ImmutableDBOptions::Dump(Logger* log) const {
+  ROCKS_LOG_HEADER(log, "                        Options.error_if_exists: %d",
+                   error_if_exists);
+  ROCKS_LOG_HEADER(log, "                      Options.create_if_missing: %d",
+                   create_if_missing);
+  ROCKS_LOG_HEADER(log, "                        Options.paranoid_checks: %d",
+                   paranoid_checks);
+  ROCKS_LOG_HEADER(log, "            Options.flush_verify_memtable_count: %d",
+                   flush_verify_memtable_count);
+  ROCKS_LOG_HEADER(log,
+                   "                              "
+                   "Options.track_and_verify_wals_in_manifest: %d",
+                   track_and_verify_wals_in_manifest);
+  ROCKS_LOG_HEADER(log, "       Options.verify_sst_unique_id_in_manifest: %d",
+                   verify_sst_unique_id_in_manifest);
+  ROCKS_LOG_HEADER(log, "                                    Options.env: %p",
+                   env);
+  ROCKS_LOG_HEADER(log, "                                     Options.fs: %s",
+                   fs->Name());
+  ROCKS_LOG_HEADER(log, "                               Options.info_log: %p",
+                   info_log.get());
+  ROCKS_LOG_HEADER(log, "               Options.max_file_opening_threads: %d",
+                   max_file_opening_threads);
+  ROCKS_LOG_HEADER(log, "                             Options.statistics: %p",
+                   stats);
+  ROCKS_LOG_HEADER(log, "                              Options.use_fsync: %d",
+                   use_fsync);
+  ROCKS_LOG_HEADER(
+      log, "                      Options.max_log_file_size: %" ROCKSDB_PRIszt,
+      max_log_file_size);
+  ROCKS_LOG_HEADER(log,
+                   "                 Options.max_manifest_file_size: %" PRIu64,
+                   max_manifest_file_size);
+  ROCKS_LOG_HEADER(
+      log, "                  Options.log_file_time_to_roll: %" ROCKSDB_PRIszt,
+      log_file_time_to_roll);
+  ROCKS_LOG_HEADER(
+      log, "                      Options.keep_log_file_num: %" ROCKSDB_PRIszt,
+      keep_log_file_num);
+  ROCKS_LOG_HEADER(
+      log, "                   Options.recycle_log_file_num: %" ROCKSDB_PRIszt,
+      recycle_log_file_num);
+  ROCKS_LOG_HEADER(log, "                        Options.allow_fallocate: %d",
+                   allow_fallocate);
+  ROCKS_LOG_HEADER(log, "                       Options.allow_mmap_reads: %d",
+                   allow_mmap_reads);
+  ROCKS_LOG_HEADER(log, "                      Options.allow_mmap_writes: %d",
+                   allow_mmap_writes);
+  ROCKS_LOG_HEADER(log, "                       Options.use_direct_reads: %d",
+                   use_direct_reads);
+  ROCKS_LOG_HEADER(log,
+                   "                       "
+                   "Options.use_direct_io_for_flush_and_compaction: %d",
+                   use_direct_io_for_flush_and_compaction);
+  ROCKS_LOG_HEADER(log, "         Options.create_missing_column_families: %d",
+                   create_missing_column_families);
+  ROCKS_LOG_HEADER(log, "                             Options.db_log_dir: %s",
+                   db_log_dir.c_str());
+  ROCKS_LOG_HEADER(log, "                                Options.wal_dir: %s",
+                   wal_dir.c_str());
+  ROCKS_LOG_HEADER(log, "               Options.table_cache_numshardbits: %d",
+                   table_cache_numshardbits);
+  ROCKS_LOG_HEADER(log,
+                   "                        Options.WAL_ttl_seconds: %" PRIu64,
+                   WAL_ttl_seconds);
+  ROCKS_LOG_HEADER(log,
+                   "                      Options.WAL_size_limit_MB: %" PRIu64,
+                   WAL_size_limit_MB);
+  ROCKS_LOG_HEADER(log,
+                   "                       "
+                   "Options.max_write_batch_group_size_bytes: %" PRIu64,
+                   max_write_batch_group_size_bytes);
+  ROCKS_LOG_HEADER(
+      log, "            Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
+      manifest_preallocation_size);
+  ROCKS_LOG_HEADER(log, "                    Options.is_fd_close_on_exec: %d",
+                   is_fd_close_on_exec);
+  ROCKS_LOG_HEADER(log, "                  Options.advise_random_on_open: %d",
+                   advise_random_on_open);
+  ROCKS_LOG_HEADER(
+      log, "                   Options.db_write_buffer_size: %" ROCKSDB_PRIszt,
+      db_write_buffer_size);
+  ROCKS_LOG_HEADER(log, "                   Options.write_buffer_manager: %p",
+                   write_buffer_manager.get());
+  ROCKS_LOG_HEADER(log, "        Options.access_hint_on_compaction_start: %d",
+                   static_cast<int>(access_hint_on_compaction_start));
+  ROCKS_LOG_HEADER(
+      log, "          Options.random_access_max_buffer_size: %" ROCKSDB_PRIszt,
+      random_access_max_buffer_size);
+  ROCKS_LOG_HEADER(log, "                     Options.use_adaptive_mutex: %d",
+                   use_adaptive_mutex);
+  ROCKS_LOG_HEADER(log, "                           Options.rate_limiter: %p",
+                   rate_limiter.get());
+  Header(
+      log, "    Options.sst_file_manager.rate_bytes_per_sec: %" PRIi64,
+      sst_file_manager ? sst_file_manager->GetDeleteRateBytesPerSecond() : 0);
+  ROCKS_LOG_HEADER(log, "                      Options.wal_recovery_mode: %d",
+                   static_cast<int>(wal_recovery_mode));
+  ROCKS_LOG_HEADER(log, "                 Options.enable_thread_tracking: %d",
+                   enable_thread_tracking);
+  ROCKS_LOG_HEADER(log, "                 Options.enable_pipelined_write: %d",
+                   enable_pipelined_write);
+  ROCKS_LOG_HEADER(log, "                 Options.unordered_write: %d",
+                   unordered_write);
+  ROCKS_LOG_HEADER(log, "        Options.allow_concurrent_memtable_write: %d",
+                   allow_concurrent_memtable_write);
+  ROCKS_LOG_HEADER(log, "     Options.enable_write_thread_adaptive_yield: %d",
+                   enable_write_thread_adaptive_yield);
+  ROCKS_LOG_HEADER(log,
+                   "            Options.write_thread_max_yield_usec: %" PRIu64,
+                   write_thread_max_yield_usec);
+  ROCKS_LOG_HEADER(log,
+                   "           Options.write_thread_slow_yield_usec: %" PRIu64,
+                   write_thread_slow_yield_usec);
+  if (row_cache) {
+    ROCKS_LOG_HEADER(
+        log,
+        "                              Options.row_cache: %" ROCKSDB_PRIszt,
+        row_cache->GetCapacity());
+  } else {
+    ROCKS_LOG_HEADER(log,
+                     "                              Options.row_cache: None");
+  }
+#ifndef ROCKSDB_LITE
+  ROCKS_LOG_HEADER(log, "                             Options.wal_filter: %s",
+                   wal_filter ? wal_filter->Name() : "None");
+#endif  // ROCKDB_LITE
+
+  ROCKS_LOG_HEADER(log, "            Options.avoid_flush_during_recovery: %d",
+                   avoid_flush_during_recovery);
+  ROCKS_LOG_HEADER(log, "            Options.allow_ingest_behind: %d",
+                   allow_ingest_behind);
+  ROCKS_LOG_HEADER(log, "            Options.two_write_queues: %d",
+                   two_write_queues);
+  ROCKS_LOG_HEADER(log, "            Options.manual_wal_flush: %d",
+                   manual_wal_flush);
+  ROCKS_LOG_HEADER(log, "            Options.wal_compression: %d",
+                   wal_compression);
+  ROCKS_LOG_HEADER(log, "            Options.atomic_flush: %d", atomic_flush);
+  ROCKS_LOG_HEADER(log,
+                   "            Options.avoid_unnecessary_blocking_io: %d",
+                   avoid_unnecessary_blocking_io);
+  ROCKS_LOG_HEADER(log, "                Options.persist_stats_to_disk: %u",
+                   persist_stats_to_disk);
+  ROCKS_LOG_HEADER(log, "                Options.write_dbid_to_manifest: %d",
+                   write_dbid_to_manifest);
+  ROCKS_LOG_HEADER(
+      log, "                Options.log_readahead_size: %" ROCKSDB_PRIszt,
+      log_readahead_size);
+  ROCKS_LOG_HEADER(log, "                Options.file_checksum_gen_factory: %s",
+                   file_checksum_gen_factory ? file_checksum_gen_factory->Name()
+                                             : kUnknownFileChecksumFuncName);
+  ROCKS_LOG_HEADER(log, "                Options.best_efforts_recovery: %d",
+                   static_cast<int>(best_efforts_recovery));
+  ROCKS_LOG_HEADER(log, "               Options.max_bgerror_resume_count: %d",
+                   max_bgerror_resume_count);
+  ROCKS_LOG_HEADER(log,
+                   "           Options.bgerror_resume_retry_interval: %" PRIu64,
+                   bgerror_resume_retry_interval);
+  ROCKS_LOG_HEADER(log, "            Options.allow_data_in_errors: %d",
+                   allow_data_in_errors);
+  ROCKS_LOG_HEADER(log, "            Options.db_host_id: %s",
+                   db_host_id.c_str());
+  ROCKS_LOG_HEADER(log, "            Options.enforce_single_del_contracts: %s",
+                   enforce_single_del_contracts ? "true" : "false");
+}
+
+bool ImmutableDBOptions::IsWalDirSameAsDBPath() const {
+  assert(!db_paths.empty());
+  return IsWalDirSameAsDBPath(db_paths[0].path);
+}
+
+bool ImmutableDBOptions::IsWalDirSameAsDBPath(
+    const std::string& db_path) const {
+  bool same = wal_dir.empty();
+  if (!same) {
+    Status s = env->AreFilesSame(wal_dir, db_path, &same);
+    if (s.IsNotSupported()) {
+      same = wal_dir == db_path;
+    }
+  }
+  return same;
+}
+
+const std::string& ImmutableDBOptions::GetWalDir() const {
+  if (wal_dir.empty()) {
+    assert(!db_paths.empty());
+    return db_paths[0].path;
+  } else {
+    return wal_dir;
+  }
+}
+
+const std::string& ImmutableDBOptions::GetWalDir(
+    const std::string& path) const {
+  if (wal_dir.empty()) {
+    return path;
+  } else {
+    return wal_dir;
+  }
+}
+
+MutableDBOptions::MutableDBOptions()
+    : max_background_jobs(2),
+      max_background_compactions(-1),
+      max_subcompactions(0),
+      avoid_flush_during_shutdown(false),
+      writable_file_max_buffer_size(1024 * 1024),
+      delayed_write_rate(2 * 1024U * 1024U),
+      max_total_wal_size(0),
+      delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000),
+      stats_dump_period_sec(600),
+      stats_persist_period_sec(600),
+      stats_history_buffer_size(1024 * 1024),
+      max_open_files(-1),
+      bytes_per_sync(0),
+      wal_bytes_per_sync(0),
+      strict_bytes_per_sync(false),
+      compaction_readahead_size(0),
+      max_background_flushes(-1) {}
+
+MutableDBOptions::MutableDBOptions(const DBOptions& options)
+    : max_background_jobs(options.max_background_jobs),
+      max_background_compactions(options.max_background_compactions),
+      max_subcompactions(options.max_subcompactions),
+      avoid_flush_during_shutdown(options.avoid_flush_during_shutdown),
+      writable_file_max_buffer_size(options.writable_file_max_buffer_size),
+      delayed_write_rate(options.delayed_write_rate),
+      max_total_wal_size(options.max_total_wal_size),
+      delete_obsolete_files_period_micros(
+          options.delete_obsolete_files_period_micros),
+      stats_dump_period_sec(options.stats_dump_period_sec),
+      stats_persist_period_sec(options.stats_persist_period_sec),
+      stats_history_buffer_size(options.stats_history_buffer_size),
+      max_open_files(options.max_open_files),
+      bytes_per_sync(options.bytes_per_sync),
+      wal_bytes_per_sync(options.wal_bytes_per_sync),
+      strict_bytes_per_sync(options.strict_bytes_per_sync),
+      compaction_readahead_size(options.compaction_readahead_size),
+      max_background_flushes(options.max_background_flushes) {}
+
+void MutableDBOptions::Dump(Logger* log) const {
+  ROCKS_LOG_HEADER(log, "            Options.max_background_jobs: %d",
+                   max_background_jobs);
+  ROCKS_LOG_HEADER(log, "            Options.max_background_compactions: %d",
+                   max_background_compactions);
+  ROCKS_LOG_HEADER(log, "            Options.max_subcompactions: %" PRIu32,
+                   max_subcompactions);
+  ROCKS_LOG_HEADER(log, "            Options.avoid_flush_during_shutdown: %d",
+                   avoid_flush_during_shutdown);
+  ROCKS_LOG_HEADER(
+      log, "          Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt,
+      writable_file_max_buffer_size);
+  ROCKS_LOG_HEADER(log, "            Options.delayed_write_rate : %" PRIu64,
+                   delayed_write_rate);
+  ROCKS_LOG_HEADER(log, "            Options.max_total_wal_size: %" PRIu64,
+                   max_total_wal_size);
+  ROCKS_LOG_HEADER(
+      log, "            Options.delete_obsolete_files_period_micros: %" PRIu64,
+      delete_obsolete_files_period_micros);
+  ROCKS_LOG_HEADER(log, "                  Options.stats_dump_period_sec: %u",
+                   stats_dump_period_sec);
+  ROCKS_LOG_HEADER(log, "                Options.stats_persist_period_sec: %d",
+                   stats_persist_period_sec);
+  ROCKS_LOG_HEADER(
+      log,
+      "                Options.stats_history_buffer_size: %" ROCKSDB_PRIszt,
+      stats_history_buffer_size);
+  ROCKS_LOG_HEADER(log, "                         Options.max_open_files: %d",
+                   max_open_files);
+  ROCKS_LOG_HEADER(log,
+                   "                         Options.bytes_per_sync: %" PRIu64,
+                   bytes_per_sync);
+  ROCKS_LOG_HEADER(log,
+                   "                     Options.wal_bytes_per_sync: %" PRIu64,
+                   wal_bytes_per_sync);
+  ROCKS_LOG_HEADER(log,
+                   "                  Options.strict_bytes_per_sync: %d",
+                   strict_bytes_per_sync);
+  ROCKS_LOG_HEADER(log,
+                   "      Options.compaction_readahead_size: %" ROCKSDB_PRIszt,
+                   compaction_readahead_size);
+  ROCKS_LOG_HEADER(log, "                 Options.max_background_flushes: %d",
+                          max_background_flushes);
+}
+
+#ifndef ROCKSDB_LITE
+Status GetMutableDBOptionsFromStrings(
+    const MutableDBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& options_map,
+    MutableDBOptions* new_options) {
+  assert(new_options);
+  *new_options = base_options;
+  ConfigOptions config_options;
+  Status s = OptionTypeInfo::ParseType(
+      config_options, options_map, db_mutable_options_type_info, new_options);
+  if (!s.ok()) {
+    *new_options = base_options;
+  }
+  return s;
+}
+
+bool MutableDBOptionsAreEqual(const MutableDBOptions& this_options,
+                              const MutableDBOptions& that_options) {
+  ConfigOptions config_options;
+  std::string mismatch;
+  return OptionTypeInfo::StructsAreEqual(
+      config_options, "MutableDBOptions", &db_mutable_options_type_info,
+      "MutableDBOptions", &this_options, &that_options, &mismatch);
+}
+
+Status GetStringFromMutableDBOptions(const ConfigOptions& config_options,
+                                     const MutableDBOptions& mutable_opts,
+                                     std::string* opt_string) {
+  return OptionTypeInfo::SerializeType(
+      config_options, db_mutable_options_type_info, &mutable_opts, opt_string);
+}
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/db_options.h b/src/rocksdb/options/db_options.h
new file mode 100644
index 000000000..8946f60ff
--- /dev/null
+++ b/src/rocksdb/options/db_options.h
@@ -0,0 +1,156 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+
+struct ImmutableDBOptions {
+  static const char* kName() { return "ImmutableDBOptions"; }
+  ImmutableDBOptions();
+  explicit ImmutableDBOptions(const DBOptions& options);
+
+  void Dump(Logger* log) const;
+
+  bool create_if_missing;
+  bool create_missing_column_families;
+  bool error_if_exists;
+  bool paranoid_checks;
+  bool flush_verify_memtable_count;
+  bool track_and_verify_wals_in_manifest;
+  bool verify_sst_unique_id_in_manifest;
+  Env* env;
+  std::shared_ptr<RateLimiter> rate_limiter;
+  std::shared_ptr<SstFileManager> sst_file_manager;
+  std::shared_ptr<Logger> info_log;
+  InfoLogLevel info_log_level;
+  int max_file_opening_threads;
+  std::shared_ptr<Statistics> statistics;
+  bool use_fsync;
+  std::vector<DbPath> db_paths;
+  std::string db_log_dir;
+  // The wal_dir option from the file.  To determine the
+  // directory in use, the GetWalDir or IsWalDirSameAsDBPath
+  // methods should be used instead of accessing this variable directly.
+  std::string wal_dir;
+  size_t max_log_file_size;
+  size_t log_file_time_to_roll;
+  size_t keep_log_file_num;
+  size_t recycle_log_file_num;
+  uint64_t max_manifest_file_size;
+  int table_cache_numshardbits;
+  uint64_t WAL_ttl_seconds;
+  uint64_t WAL_size_limit_MB;
+  uint64_t max_write_batch_group_size_bytes;
+  size_t manifest_preallocation_size;
+  bool allow_mmap_reads;
+  bool allow_mmap_writes;
+  bool use_direct_reads;
+  bool use_direct_io_for_flush_and_compaction;
+  bool allow_fallocate;
+  bool is_fd_close_on_exec;
+  bool advise_random_on_open;
+  size_t db_write_buffer_size;
+  std::shared_ptr<WriteBufferManager> write_buffer_manager;
+  DBOptions::AccessHint access_hint_on_compaction_start;
+  size_t random_access_max_buffer_size;
+  bool use_adaptive_mutex;
+  std::vector<std::shared_ptr<EventListener>> listeners;
+  bool enable_thread_tracking;
+  bool enable_pipelined_write;
+  bool unordered_write;
+  bool allow_concurrent_memtable_write;
+  bool enable_write_thread_adaptive_yield;
+  uint64_t write_thread_max_yield_usec;
+  uint64_t write_thread_slow_yield_usec;
+  bool skip_stats_update_on_db_open;
+  bool skip_checking_sst_file_sizes_on_db_open;
+  WALRecoveryMode wal_recovery_mode;
+  bool allow_2pc;
+  std::shared_ptr<Cache> row_cache;
+#ifndef ROCKSDB_LITE
+  WalFilter* wal_filter;
+#endif  // ROCKSDB_LITE
+  bool fail_if_options_file_error;
+  bool dump_malloc_stats;
+  bool avoid_flush_during_recovery;
+  bool allow_ingest_behind;
+  bool two_write_queues;
+  bool manual_wal_flush;
+  CompressionType wal_compression;
+  bool atomic_flush;
+  bool avoid_unnecessary_blocking_io;
+  bool persist_stats_to_disk;
+  bool write_dbid_to_manifest;
+  size_t log_readahead_size;
+  std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory;
+  bool best_efforts_recovery;
+  int max_bgerror_resume_count;
+  uint64_t bgerror_resume_retry_interval;
+  bool allow_data_in_errors;
+  std::string db_host_id;
+  FileTypeSet checksum_handoff_file_types;
+  CacheTier lowest_used_cache_tier;
+  // Convenience/Helper objects that are not part of the base DBOptions
+  std::shared_ptr<FileSystem> fs;
+  SystemClock* clock;
+  Statistics* stats;
+  Logger* logger;
+  std::shared_ptr<CompactionService> compaction_service;
+  bool enforce_single_del_contracts;
+
+  bool IsWalDirSameAsDBPath() const;
+  bool IsWalDirSameAsDBPath(const std::string& path) const;
+  const std::string& GetWalDir() const;
+  const std::string& GetWalDir(const std::string& path) const;
+};
+
+struct MutableDBOptions {
+  static const char* kName() { return "MutableDBOptions"; }
+  MutableDBOptions();
+  explicit MutableDBOptions(const DBOptions& options);
+
+  void Dump(Logger* log) const;
+
+  int max_background_jobs;
+  int max_background_compactions;
+  uint32_t max_subcompactions;
+  bool avoid_flush_during_shutdown;
+  size_t writable_file_max_buffer_size;
+  uint64_t delayed_write_rate;
+  uint64_t max_total_wal_size;
+  uint64_t delete_obsolete_files_period_micros;
+  unsigned int stats_dump_period_sec;
+  unsigned int stats_persist_period_sec;
+  size_t stats_history_buffer_size;
+  int max_open_files;
+  uint64_t bytes_per_sync;
+  uint64_t wal_bytes_per_sync;
+  bool strict_bytes_per_sync;
+  size_t compaction_readahead_size;
+  int max_background_flushes;
+};
+
+#ifndef ROCKSDB_LITE
+Status GetStringFromMutableDBOptions(const ConfigOptions& config_options,
+                                     const MutableDBOptions& mutable_opts,
+                                     std::string* opt_string);
+
+Status GetMutableDBOptionsFromStrings(
+    const MutableDBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& options_map,
+    MutableDBOptions* new_options);
+
+bool MutableDBOptionsAreEqual(const MutableDBOptions& this_options,
+                              const MutableDBOptions& that_options);
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/options.cc b/src/rocksdb/options/options.cc
new file mode 100644
index 000000000..316d3550e
--- /dev/null
+++ b/src/rocksdb/options/options.cc
@@ -0,0 +1,735 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/options.h"
+
+#include <cinttypes>
+#include <limits>
+
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "options/db_options.h"
+#include "options/options_helper.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/sst_partitioner.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/wal_filter.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions() {
+  assert(memtable_factory.get() != nullptr);
+}
+
+AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
+    : max_write_buffer_number(options.max_write_buffer_number),
+      min_write_buffer_number_to_merge(
+          options.min_write_buffer_number_to_merge),
+      max_write_buffer_number_to_maintain(
+          options.max_write_buffer_number_to_maintain),
+      max_write_buffer_size_to_maintain(
+          options.max_write_buffer_size_to_maintain),
+      inplace_update_support(options.inplace_update_support),
+      inplace_update_num_locks(options.inplace_update_num_locks),
+      experimental_mempurge_threshold(options.experimental_mempurge_threshold),
+      inplace_callback(options.inplace_callback),
+      memtable_prefix_bloom_size_ratio(
+          options.memtable_prefix_bloom_size_ratio),
+      memtable_whole_key_filtering(options.memtable_whole_key_filtering),
+      memtable_huge_page_size(options.memtable_huge_page_size),
+      memtable_insert_with_hint_prefix_extractor(
+          options.memtable_insert_with_hint_prefix_extractor),
+      bloom_locality(options.bloom_locality),
+      arena_block_size(options.arena_block_size),
+      compression_per_level(options.compression_per_level),
+      num_levels(options.num_levels),
+      level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
+      level0_stop_writes_trigger(options.level0_stop_writes_trigger),
+      target_file_size_base(options.target_file_size_base),
+      target_file_size_multiplier(options.target_file_size_multiplier),
+      level_compaction_dynamic_level_bytes(
+          options.level_compaction_dynamic_level_bytes),
+      max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
+      max_bytes_for_level_multiplier_additional(
+          options.max_bytes_for_level_multiplier_additional),
+      max_compaction_bytes(options.max_compaction_bytes),
+      ignore_max_compaction_bytes_for_input(
+          options.ignore_max_compaction_bytes_for_input),
+      soft_pending_compaction_bytes_limit(
+          options.soft_pending_compaction_bytes_limit),
+      hard_pending_compaction_bytes_limit(
+          options.hard_pending_compaction_bytes_limit),
+      compaction_style(options.compaction_style),
+      compaction_pri(options.compaction_pri),
+      compaction_options_universal(options.compaction_options_universal),
+      compaction_options_fifo(options.compaction_options_fifo),
+      max_sequential_skip_in_iterations(
+          options.max_sequential_skip_in_iterations),
+      memtable_factory(options.memtable_factory),
+      table_properties_collector_factories(
+          options.table_properties_collector_factories),
+      max_successive_merges(options.max_successive_merges),
+      optimize_filters_for_hits(options.optimize_filters_for_hits),
+      paranoid_file_checks(options.paranoid_file_checks),
+      force_consistency_checks(options.force_consistency_checks),
+      report_bg_io_stats(options.report_bg_io_stats),
+      ttl(options.ttl),
+      periodic_compaction_seconds(options.periodic_compaction_seconds),
+      sample_for_compression(options.sample_for_compression),
+      preclude_last_level_data_seconds(
+          options.preclude_last_level_data_seconds),
+      preserve_internal_time_seconds(options.preserve_internal_time_seconds),
+      enable_blob_files(options.enable_blob_files),
+      min_blob_size(options.min_blob_size),
+      blob_file_size(options.blob_file_size),
+      blob_compression_type(options.blob_compression_type),
+      enable_blob_garbage_collection(options.enable_blob_garbage_collection),
+      blob_garbage_collection_age_cutoff(
+          options.blob_garbage_collection_age_cutoff),
+      blob_garbage_collection_force_threshold(
+          options.blob_garbage_collection_force_threshold),
+      blob_compaction_readahead_size(options.blob_compaction_readahead_size),
+      blob_file_starting_level(options.blob_file_starting_level),
+      blob_cache(options.blob_cache),
+      prepopulate_blob_cache(options.prepopulate_blob_cache) {
+  assert(memtable_factory.get() != nullptr);
+  if (max_bytes_for_level_multiplier_additional.size() <
+      static_cast<unsigned int>(num_levels)) {
+    max_bytes_for_level_multiplier_additional.resize(num_levels, 1);
+  }
+}
+
+ColumnFamilyOptions::ColumnFamilyOptions()
+    : compression(Snappy_Supported() ? kSnappyCompression : kNoCompression),
+      table_factory(
+          std::shared_ptr<TableFactory>(new BlockBasedTableFactory())) {}
+
+ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
+    : ColumnFamilyOptions(*static_cast<const ColumnFamilyOptions*>(&options)) {}
+
+DBOptions::DBOptions() {}
+DBOptions::DBOptions(const Options& options)
+    : DBOptions(*static_cast<const DBOptions*>(&options)) {}
+
+void DBOptions::Dump(Logger* log) const {
+    ImmutableDBOptions(*this).Dump(log);
+    MutableDBOptions(*this).Dump(log);
+}  // DBOptions::Dump
+
+void ColumnFamilyOptions::Dump(Logger* log) const {
+  ROCKS_LOG_HEADER(log, "              Options.comparator: %s",
+                   comparator->Name());
+  ROCKS_LOG_HEADER(log, "          Options.merge_operator: %s",
+                   merge_operator ? merge_operator->Name() : "None");
+  ROCKS_LOG_HEADER(log, "       Options.compaction_filter: %s",
+                   compaction_filter ? compaction_filter->Name() : "None");
+  ROCKS_LOG_HEADER(
+      log, "       Options.compaction_filter_factory: %s",
+      compaction_filter_factory ? compaction_filter_factory->Name() : "None");
+  ROCKS_LOG_HEADER(
+      log, " Options.sst_partitioner_factory: %s",
+      sst_partitioner_factory ? sst_partitioner_factory->Name() : "None");
+  ROCKS_LOG_HEADER(log, "        Options.memtable_factory: %s",
+                   memtable_factory->Name());
+  ROCKS_LOG_HEADER(log, "           Options.table_factory: %s",
+                   table_factory->Name());
+  ROCKS_LOG_HEADER(log, "           table_factory options: %s",
+                   table_factory->GetPrintableOptions().c_str());
+  ROCKS_LOG_HEADER(log, "       Options.write_buffer_size: %" ROCKSDB_PRIszt,
+                   write_buffer_size);
+  ROCKS_LOG_HEADER(log, " Options.max_write_buffer_number: %d",
+                   max_write_buffer_number);
+  if (!compression_per_level.empty()) {
+    for (unsigned int i = 0; i < compression_per_level.size(); i++) {
+      ROCKS_LOG_HEADER(
+          log, "       Options.compression[%d]: %s", i,
+          CompressionTypeToString(compression_per_level[i]).c_str());
+    }
+    } else {
+      ROCKS_LOG_HEADER(log, "         Options.compression: %s",
+                       CompressionTypeToString(compression).c_str());
+    }
+    ROCKS_LOG_HEADER(
+        log, "                 Options.bottommost_compression: %s",
+        bottommost_compression == kDisableCompressionOption
+            ? "Disabled"
+            : CompressionTypeToString(bottommost_compression).c_str());
+    ROCKS_LOG_HEADER(
+        log, "      Options.prefix_extractor: %s",
+        prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name());
+    ROCKS_LOG_HEADER(log,
+                     "  Options.memtable_insert_with_hint_prefix_extractor: %s",
+                     memtable_insert_with_hint_prefix_extractor == nullptr
+                         ? "nullptr"
+                         : memtable_insert_with_hint_prefix_extractor->Name());
+    ROCKS_LOG_HEADER(log, "            Options.num_levels: %d", num_levels);
+    ROCKS_LOG_HEADER(log, "       Options.min_write_buffer_number_to_merge: %d",
+                     min_write_buffer_number_to_merge);
+    ROCKS_LOG_HEADER(log, "    Options.max_write_buffer_number_to_maintain: %d",
+                     max_write_buffer_number_to_maintain);
+    ROCKS_LOG_HEADER(log,
+                     "    Options.max_write_buffer_size_to_maintain: %" PRIu64,
+                     max_write_buffer_size_to_maintain);
+    ROCKS_LOG_HEADER(
+        log, "           Options.bottommost_compression_opts.window_bits: %d",
+        bottommost_compression_opts.window_bits);
+    ROCKS_LOG_HEADER(
+        log, "                 Options.bottommost_compression_opts.level: %d",
+        bottommost_compression_opts.level);
+    ROCKS_LOG_HEADER(
+        log, "              Options.bottommost_compression_opts.strategy: %d",
+        bottommost_compression_opts.strategy);
+    ROCKS_LOG_HEADER(
+        log,
+        "        Options.bottommost_compression_opts.max_dict_bytes: "
+        "%" PRIu32,
+        bottommost_compression_opts.max_dict_bytes);
+    ROCKS_LOG_HEADER(
+        log,
+        "        Options.bottommost_compression_opts.zstd_max_train_bytes: "
+        "%" PRIu32,
+        bottommost_compression_opts.zstd_max_train_bytes);
+    ROCKS_LOG_HEADER(
+        log,
+        "        Options.bottommost_compression_opts.parallel_threads: "
+        "%" PRIu32,
+        bottommost_compression_opts.parallel_threads);
+    ROCKS_LOG_HEADER(
+        log, "                 Options.bottommost_compression_opts.enabled: %s",
+        bottommost_compression_opts.enabled ? "true" : "false");
+    ROCKS_LOG_HEADER(
+        log,
+        "        Options.bottommost_compression_opts.max_dict_buffer_bytes: "
+        "%" PRIu64,
+        bottommost_compression_opts.max_dict_buffer_bytes);
+    ROCKS_LOG_HEADER(
+        log,
+        "        Options.bottommost_compression_opts.use_zstd_dict_trainer: %s",
+        bottommost_compression_opts.use_zstd_dict_trainer ? "true" : "false");
+    ROCKS_LOG_HEADER(log, "           Options.compression_opts.window_bits: %d",
+                     compression_opts.window_bits);
+    ROCKS_LOG_HEADER(log, "                 Options.compression_opts.level: %d",
+                     compression_opts.level);
+    ROCKS_LOG_HEADER(log, "              Options.compression_opts.strategy: %d",
+                     compression_opts.strategy);
+    ROCKS_LOG_HEADER(
+        log,
+        "        Options.compression_opts.max_dict_bytes: %" PRIu32,
+        compression_opts.max_dict_bytes);
+    ROCKS_LOG_HEADER(log,
+                     "        Options.compression_opts.zstd_max_train_bytes: "
+                     "%" PRIu32,
+                     compression_opts.zstd_max_train_bytes);
+    ROCKS_LOG_HEADER(
+        log, "        Options.compression_opts.use_zstd_dict_trainer: %s",
+        compression_opts.use_zstd_dict_trainer ? "true" : "false");
+    ROCKS_LOG_HEADER(log,
+                     "        Options.compression_opts.parallel_threads: "
+                     "%" PRIu32,
+                     compression_opts.parallel_threads);
+    ROCKS_LOG_HEADER(log,
+                     "                 Options.compression_opts.enabled: %s",
+                     compression_opts.enabled ? "true" : "false");
+    ROCKS_LOG_HEADER(log,
+                     "        Options.compression_opts.max_dict_buffer_bytes: "
+                     "%" PRIu64,
+                     compression_opts.max_dict_buffer_bytes);
+    ROCKS_LOG_HEADER(log, "     Options.level0_file_num_compaction_trigger: %d",
+                     level0_file_num_compaction_trigger);
+    ROCKS_LOG_HEADER(log, "         Options.level0_slowdown_writes_trigger: %d",
+                     level0_slowdown_writes_trigger);
+    ROCKS_LOG_HEADER(log, "             Options.level0_stop_writes_trigger: %d",
+                     level0_stop_writes_trigger);
+    ROCKS_LOG_HEADER(
+        log, "                  Options.target_file_size_base: %" PRIu64,
+        target_file_size_base);
+    ROCKS_LOG_HEADER(log, "            Options.target_file_size_multiplier: %d",
+                     target_file_size_multiplier);
+    ROCKS_LOG_HEADER(
+        log, "               Options.max_bytes_for_level_base: %" PRIu64,
+        max_bytes_for_level_base);
+    ROCKS_LOG_HEADER(log, "Options.level_compaction_dynamic_level_bytes: %d",
+                     level_compaction_dynamic_level_bytes);
+    ROCKS_LOG_HEADER(log, "         Options.max_bytes_for_level_multiplier: %f",
+                     max_bytes_for_level_multiplier);
+    for (size_t i = 0; i < max_bytes_for_level_multiplier_additional.size();
+         i++) {
+      ROCKS_LOG_HEADER(
+          log, "Options.max_bytes_for_level_multiplier_addtl[%" ROCKSDB_PRIszt
+               "]: %d",
+          i, max_bytes_for_level_multiplier_additional[i]);
+    }
+    ROCKS_LOG_HEADER(
+        log, "      Options.max_sequential_skip_in_iterations: %" PRIu64,
+        max_sequential_skip_in_iterations);
+    ROCKS_LOG_HEADER(
+        log, "                   Options.max_compaction_bytes: %" PRIu64,
+        max_compaction_bytes);
+    ROCKS_LOG_HEADER(log, "  Options.ignore_max_compaction_bytes_for_input: %s",
+                     ignore_max_compaction_bytes_for_input ? "true" : "false");
+    ROCKS_LOG_HEADER(
+        log,
+        "                       Options.arena_block_size: %" ROCKSDB_PRIszt,
+        arena_block_size);
+    ROCKS_LOG_HEADER(log,
+                     "  Options.soft_pending_compaction_bytes_limit: %" PRIu64,
+                     soft_pending_compaction_bytes_limit);
+    ROCKS_LOG_HEADER(log,
+                     "  Options.hard_pending_compaction_bytes_limit: %" PRIu64,
+                     hard_pending_compaction_bytes_limit);
+    ROCKS_LOG_HEADER(log, "               Options.disable_auto_compactions: %d",
+                     disable_auto_compactions);
+
+    const auto& it_compaction_style =
+        compaction_style_to_string.find(compaction_style);
+    std::string str_compaction_style;
+    if (it_compaction_style == compaction_style_to_string.end()) {
+      assert(false);
+      str_compaction_style = "unknown_" + std::to_string(compaction_style);
+    } else {
+      str_compaction_style = it_compaction_style->second;
+    }
+    ROCKS_LOG_HEADER(log,
+                     "                       Options.compaction_style: %s",
+                     str_compaction_style.c_str());
+
+    const auto& it_compaction_pri =
+        compaction_pri_to_string.find(compaction_pri);
+    std::string str_compaction_pri;
+    if (it_compaction_pri == compaction_pri_to_string.end()) {
+      assert(false);
+      str_compaction_pri = "unknown_" + std::to_string(compaction_pri);
+    } else {
+      str_compaction_pri = it_compaction_pri->second;
+    }
+    ROCKS_LOG_HEADER(log,
+                     "                         Options.compaction_pri: %s",
+                     str_compaction_pri.c_str());
+    ROCKS_LOG_HEADER(log,
+                     "Options.compaction_options_universal.size_ratio: %u",
+                     compaction_options_universal.size_ratio);
+    ROCKS_LOG_HEADER(log,
+                     "Options.compaction_options_universal.min_merge_width: %u",
+                     compaction_options_universal.min_merge_width);
+    ROCKS_LOG_HEADER(log,
+                     "Options.compaction_options_universal.max_merge_width: %u",
+                     compaction_options_universal.max_merge_width);
+    ROCKS_LOG_HEADER(
+        log,
+        "Options.compaction_options_universal."
+        "max_size_amplification_percent: %u",
+        compaction_options_universal.max_size_amplification_percent);
+    ROCKS_LOG_HEADER(
+        log,
+        "Options.compaction_options_universal.compression_size_percent: %d",
+        compaction_options_universal.compression_size_percent);
+    const auto& it_compaction_stop_style = compaction_stop_style_to_string.find(
+        compaction_options_universal.stop_style);
+    std::string str_compaction_stop_style;
+    if (it_compaction_stop_style == compaction_stop_style_to_string.end()) {
+      assert(false);
+      str_compaction_stop_style =
+          "unknown_" + std::to_string(compaction_options_universal.stop_style);
+    } else {
+      str_compaction_stop_style = it_compaction_stop_style->second;
+    }
+    ROCKS_LOG_HEADER(log,
+                     "Options.compaction_options_universal.stop_style: %s",
+                     str_compaction_stop_style.c_str());
+    ROCKS_LOG_HEADER(
+        log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64,
+        compaction_options_fifo.max_table_files_size);
+    ROCKS_LOG_HEADER(log,
+                     "Options.compaction_options_fifo.allow_compaction: %d",
+                     compaction_options_fifo.allow_compaction);
+    std::ostringstream collector_info;
+    for (const auto& collector_factory : table_properties_collector_factories) {
+      collector_info << collector_factory->ToString() << ';';
+    }
+    ROCKS_LOG_HEADER(
+        log, "                  Options.table_properties_collectors: %s",
+        collector_info.str().c_str());
+    ROCKS_LOG_HEADER(log,
+                     "                  Options.inplace_update_support: %d",
+                     inplace_update_support);
+    ROCKS_LOG_HEADER(
+        log,
+        "                Options.inplace_update_num_locks: %" ROCKSDB_PRIszt,
+        inplace_update_num_locks);
+    // TODO: easier config for bloom (maybe based on avg key/value size)
+    ROCKS_LOG_HEADER(
+        log, "              Options.memtable_prefix_bloom_size_ratio: %f",
+        memtable_prefix_bloom_size_ratio);
+    ROCKS_LOG_HEADER(log,
+                     "              Options.memtable_whole_key_filtering: %d",
+                     memtable_whole_key_filtering);
+
+    ROCKS_LOG_HEADER(log, "  Options.memtable_huge_page_size: %" ROCKSDB_PRIszt,
+                     memtable_huge_page_size);
+    ROCKS_LOG_HEADER(log,
+                     "                          Options.bloom_locality: %d",
+                     bloom_locality);
+
+    ROCKS_LOG_HEADER(
+        log,
+        "                   Options.max_successive_merges: %" ROCKSDB_PRIszt,
+        max_successive_merges);
+    ROCKS_LOG_HEADER(log,
+                     "               Options.optimize_filters_for_hits: %d",
+                     optimize_filters_for_hits);
+    ROCKS_LOG_HEADER(log, "               Options.paranoid_file_checks: %d",
+                     paranoid_file_checks);
+    ROCKS_LOG_HEADER(log, "               Options.force_consistency_checks: %d",
+                     force_consistency_checks);
+    ROCKS_LOG_HEADER(log, "               Options.report_bg_io_stats: %d",
+                     report_bg_io_stats);
+    ROCKS_LOG_HEADER(log, "                              Options.ttl: %" PRIu64,
+                     ttl);
+    ROCKS_LOG_HEADER(log,
+                     "         Options.periodic_compaction_seconds: %" PRIu64,
+                     periodic_compaction_seconds);
+    ROCKS_LOG_HEADER(log, " Options.preclude_last_level_data_seconds: %" PRIu64,
+                     preclude_last_level_data_seconds);
+    ROCKS_LOG_HEADER(log, "   Options.preserve_internal_time_seconds: %" PRIu64,
+                     preserve_internal_time_seconds);
+    ROCKS_LOG_HEADER(log, "                      Options.enable_blob_files: %s",
+                     enable_blob_files ? "true" : "false");
+    ROCKS_LOG_HEADER(
+        log, "                          Options.min_blob_size: %" PRIu64,
+        min_blob_size);
+    ROCKS_LOG_HEADER(
+        log, "                         Options.blob_file_size: %" PRIu64,
+        blob_file_size);
+    ROCKS_LOG_HEADER(log, "                  Options.blob_compression_type: %s",
+                     CompressionTypeToString(blob_compression_type).c_str());
+    ROCKS_LOG_HEADER(log, "         Options.enable_blob_garbage_collection: %s",
+                     enable_blob_garbage_collection ? "true" : "false");
+    ROCKS_LOG_HEADER(log, "     Options.blob_garbage_collection_age_cutoff: %f",
+                     blob_garbage_collection_age_cutoff);
+    ROCKS_LOG_HEADER(log, "Options.blob_garbage_collection_force_threshold: %f",
+                     blob_garbage_collection_force_threshold);
+    ROCKS_LOG_HEADER(
+        log, "         Options.blob_compaction_readahead_size: %" PRIu64,
+        blob_compaction_readahead_size);
+    ROCKS_LOG_HEADER(log, "               Options.blob_file_starting_level: %d",
+                     blob_file_starting_level);
+    if (blob_cache) {
+      ROCKS_LOG_HEADER(log, "                          Options.blob_cache: %s",
+                       blob_cache->Name());
+      ROCKS_LOG_HEADER(log, "                          blob_cache options: %s",
+                       blob_cache->GetPrintableOptions().c_str());
+      ROCKS_LOG_HEADER(
+          log, "                          blob_cache prepopulated: %s",
+          prepopulate_blob_cache == PrepopulateBlobCache::kFlushOnly
+              ? "flush only"
+              : "disabled");
+    }
+    ROCKS_LOG_HEADER(log, "Options.experimental_mempurge_threshold: %f",
+                     experimental_mempurge_threshold);
+}  // ColumnFamilyOptions::Dump
+
+void Options::Dump(Logger* log) const {
+  DBOptions::Dump(log);
+  ColumnFamilyOptions::Dump(log);
+}  // Options::Dump
+
+void Options::DumpCFOptions(Logger* log) const {
+  ColumnFamilyOptions::Dump(log);
+}  // Options::DumpCFOptions
+
+//
+// The goal of this method is to create a configuration that
+// allows an application to write all files into L0 and
+// then do a single compaction to output all files into L1.
+Options*
+Options::PrepareForBulkLoad()
+{
+  // never slowdown ingest.
+  level0_file_num_compaction_trigger = (1<<30);
+  level0_slowdown_writes_trigger = (1<<30);
+  level0_stop_writes_trigger = (1<<30);
+  soft_pending_compaction_bytes_limit = 0;
+  hard_pending_compaction_bytes_limit = 0;
+
+  // no auto compactions please. The application should issue a
+  // manual compaction after all data is loaded into L0.
+  disable_auto_compactions = true;
+  // A manual compaction run should pick all files in L0 in
+  // a single compaction run.
+  max_compaction_bytes = (static_cast<uint64_t>(1) << 60);
+
+  // It is better to have only 2 levels, otherwise a manual
+  // compaction would compact at every possible level, thereby
+  // increasing the total time needed for compactions.
+  num_levels = 2;
+
+  // Need to allow more write buffers to allow more parallism
+  // of flushes.
+  max_write_buffer_number = 6;
+  min_write_buffer_number_to_merge = 1;
+
+  // When compaction is disabled, more parallel flush threads can
+  // help with write throughput.
+  max_background_flushes = 4;
+
+  // Prevent a memtable flush to automatically promote files
+  // to L1. This is helpful so that all files that are
+  // input to the manual compaction are all at L0.
+  max_background_compactions = 2;
+
+  // The compaction would create large files in L1.
+  target_file_size_base = 256 * 1024 * 1024;
+  return this;
+}
+
+Options* Options::OptimizeForSmallDb() {
+  // 16MB block cache
+  std::shared_ptr<Cache> cache = NewLRUCache(16 << 20);
+
+  ColumnFamilyOptions::OptimizeForSmallDb(&cache);
+  DBOptions::OptimizeForSmallDb(&cache);
+  return this;
+}
+
+Options* Options::DisableExtraChecks() {
+  // See https://github.com/facebook/rocksdb/issues/9354
+  force_consistency_checks = false;
+  // Considered but no clear performance impact seen:
+  // * check_flush_compaction_key_order
+  // * paranoid_checks
+  // * flush_verify_memtable_count
+  // By current API contract, not including
+  // * verify_checksums
+  // because checking storage data integrity is a more standard practice.
+  return this;
+}
+
+Options* Options::OldDefaults(int rocksdb_major_version,
+                              int rocksdb_minor_version) {
+  ColumnFamilyOptions::OldDefaults(rocksdb_major_version,
+                                   rocksdb_minor_version);
+  DBOptions::OldDefaults(rocksdb_major_version, rocksdb_minor_version);
+  return this;
+}
+
+DBOptions* DBOptions::OldDefaults(int rocksdb_major_version,
+                                  int rocksdb_minor_version) {
+  if (rocksdb_major_version < 4 ||
+      (rocksdb_major_version == 4 && rocksdb_minor_version < 7)) {
+    max_file_opening_threads = 1;
+    table_cache_numshardbits = 4;
+  }
+  if (rocksdb_major_version < 5 ||
+      (rocksdb_major_version == 5 && rocksdb_minor_version < 2)) {
+    delayed_write_rate = 2 * 1024U * 1024U;
+  } else if (rocksdb_major_version < 5 ||
+             (rocksdb_major_version == 5 && rocksdb_minor_version < 6)) {
+    delayed_write_rate = 16 * 1024U * 1024U;
+  }
+  max_open_files = 5000;
+  wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OldDefaults(
+    int rocksdb_major_version, int rocksdb_minor_version) {
+  if (rocksdb_major_version < 5 ||
+      (rocksdb_major_version == 5 && rocksdb_minor_version <= 18)) {
+    compaction_pri = CompactionPri::kByCompensatedSize;
+  }
+  if (rocksdb_major_version < 4 ||
+      (rocksdb_major_version == 4 && rocksdb_minor_version < 7)) {
+    write_buffer_size = 4 << 20;
+    target_file_size_base = 2 * 1048576;
+    max_bytes_for_level_base = 10 * 1048576;
+    soft_pending_compaction_bytes_limit = 0;
+    hard_pending_compaction_bytes_limit = 0;
+  }
+  if (rocksdb_major_version < 5) {
+    level0_stop_writes_trigger = 24;
+  } else if (rocksdb_major_version == 5 && rocksdb_minor_version < 2) {
+    level0_stop_writes_trigger = 30;
+  }
+
+  return this;
+}
+
+// Optimization functions
+DBOptions* DBOptions::OptimizeForSmallDb(std::shared_ptr<Cache>* cache) {
+  max_file_opening_threads = 1;
+  max_open_files = 5000;
+
+  // Cost memtable to block cache too.
+  std::shared_ptr<ROCKSDB_NAMESPACE::WriteBufferManager> wbm =
+      std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(
+          0, (cache != nullptr) ? *cache : std::shared_ptr<Cache>());
+  write_buffer_manager = wbm;
+
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb(
+    std::shared_ptr<Cache>* cache) {
+  write_buffer_size = 2 << 20;
+  target_file_size_base = 2 * 1048576;
+  max_bytes_for_level_base = 10 * 1048576;
+  soft_pending_compaction_bytes_limit = 256 * 1048576;
+  hard_pending_compaction_bytes_limit = 1073741824ul;
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache =
+      (cache != nullptr) ? *cache : std::shared_ptr<Cache>();
+  table_options.cache_index_and_filter_blocks = true;
+  // Two level iterator to avoid LRU cache imbalance
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  return this;
+}
+
+#ifndef ROCKSDB_LITE
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup(
+    uint64_t block_cache_size_mb) {
+  BlockBasedTableOptions block_based_options;
+  block_based_options.data_block_index_type =
+      BlockBasedTableOptions::kDataBlockBinaryAndHash;
+  block_based_options.data_block_hash_table_util_ratio = 0.75;
+  block_based_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  block_based_options.block_cache =
+      NewLRUCache(static_cast<size_t>(block_cache_size_mb * 1024 * 1024));
+  table_factory.reset(new BlockBasedTableFactory(block_based_options));
+  memtable_prefix_bloom_size_ratio = 0.02;
+  memtable_whole_key_filtering = true;
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction(
+    uint64_t memtable_memory_budget) {
+  write_buffer_size = static_cast<size_t>(memtable_memory_budget / 4);
+  // merge two memtables when flushing to L0
+  min_write_buffer_number_to_merge = 2;
+  // this means we'll use 50% extra memory in the worst case, but will reduce
+  // write stalls.
+  max_write_buffer_number = 6;
+  // start flushing L0->L1 as soon as possible. each file on level0 is
+  // (memtable_memory_budget / 2). This will flush level 0 when it's bigger than
+  // memtable_memory_budget.
+  level0_file_num_compaction_trigger = 2;
+  // doesn't really matter much, but we don't want to create too many files
+  target_file_size_base = memtable_memory_budget / 8;
+  // make Level1 size equal to Level0 size, so that L0->L1 compactions are fast
+  max_bytes_for_level_base = memtable_memory_budget;
+
+  // level style compaction
+  compaction_style = kCompactionStyleLevel;
+
+  // only compress levels >= 2
+  compression_per_level.resize(num_levels);
+  for (int i = 0; i < num_levels; ++i) {
+    if (i < 2) {
+      compression_per_level[i] = kNoCompression;
+    } else {
+      compression_per_level[i] =
+          LZ4_Supported()
+              ? kLZ4Compression
+              : (Snappy_Supported() ? kSnappyCompression : kNoCompression);
+    }
+  }
+  return this;
+}
+
+ColumnFamilyOptions* ColumnFamilyOptions::OptimizeUniversalStyleCompaction(
+    uint64_t memtable_memory_budget) {
+  write_buffer_size = static_cast<size_t>(memtable_memory_budget / 4);
+  // merge two memtables when flushing to L0
+  min_write_buffer_number_to_merge = 2;
+  // this means we'll use 50% extra memory in the worst case, but will reduce
+  // write stalls.
+  max_write_buffer_number = 6;
+  // universal style compaction
+  compaction_style = kCompactionStyleUniversal;
+  compaction_options_universal.compression_size_percent = 80;
+  return this;
+}
+
+DBOptions* DBOptions::IncreaseParallelism(int total_threads) {
+  max_background_jobs = total_threads;
+  env->SetBackgroundThreads(total_threads, Env::LOW);
+  env->SetBackgroundThreads(1, Env::HIGH);
+  return this;
+}
+
+#endif  // !ROCKSDB_LITE
+
+ReadOptions::ReadOptions()
+    : snapshot(nullptr),
+      iterate_lower_bound(nullptr),
+      iterate_upper_bound(nullptr),
+      readahead_size(0),
+      max_skippable_internal_keys(0),
+      read_tier(kReadAllTier),
+      verify_checksums(true),
+      fill_cache(true),
+      tailing(false),
+      managed(false),
+      total_order_seek(false),
+      auto_prefix_mode(false),
+      prefix_same_as_start(false),
+      pin_data(false),
+      background_purge_on_iterator_cleanup(false),
+      ignore_range_deletions(false),
+      timestamp(nullptr),
+      iter_start_ts(nullptr),
+      deadline(std::chrono::microseconds::zero()),
+      io_timeout(std::chrono::microseconds::zero()),
+      value_size_soft_limit(std::numeric_limits<uint64_t>::max()),
+      adaptive_readahead(false),
+      async_io(false),
+      optimize_multiget_for_io(true) {}
+
+ReadOptions::ReadOptions(bool cksum, bool cache)
+    : snapshot(nullptr),
+      iterate_lower_bound(nullptr),
+      iterate_upper_bound(nullptr),
+      readahead_size(0),
+      max_skippable_internal_keys(0),
+      read_tier(kReadAllTier),
+      verify_checksums(cksum),
+      fill_cache(cache),
+      tailing(false),
+      managed(false),
+      total_order_seek(false),
+      auto_prefix_mode(false),
+      prefix_same_as_start(false),
+      pin_data(false),
+      background_purge_on_iterator_cleanup(false),
+      ignore_range_deletions(false),
+      timestamp(nullptr),
+      iter_start_ts(nullptr),
+      deadline(std::chrono::microseconds::zero()),
+      io_timeout(std::chrono::microseconds::zero()),
+      value_size_soft_limit(std::numeric_limits<uint64_t>::max()),
+      adaptive_readahead(false),
+      async_io(false),
+      optimize_multiget_for_io(true) {}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/options_helper.cc b/src/rocksdb/options/options_helper.cc
new file mode 100644
index 000000000..59b01e6fb
--- /dev/null
+++ b/src/rocksdb/options/options_helper.cc
@@ -0,0 +1,1478 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include "options/options_helper.h"
+
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <set>
+#include <unordered_set>
+#include <vector>
+
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+ConfigOptions::ConfigOptions()
+#ifndef ROCKSDB_LITE
+    : registry(ObjectRegistry::NewInstance())
+#endif
+{
+  env = Env::Default();
+}
+
+ConfigOptions::ConfigOptions(const DBOptions& db_opts) : env(db_opts.env) {
+#ifndef ROCKSDB_LITE
+  registry = ObjectRegistry::NewInstance();
+#endif
+}
+
+Status ValidateOptions(const DBOptions& db_opts,
+                       const ColumnFamilyOptions& cf_opts) {
+  Status s;
+#ifndef ROCKSDB_LITE
+  auto db_cfg = DBOptionsAsConfigurable(db_opts);
+  auto cf_cfg = CFOptionsAsConfigurable(cf_opts);
+  s = db_cfg->ValidateOptions(db_opts, cf_opts);
+  if (s.ok()) s = cf_cfg->ValidateOptions(db_opts, cf_opts);
+#else
+  s = cf_opts.table_factory->ValidateOptions(db_opts, cf_opts);
+#endif
+  return s;
+}
+
+DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
+                         const MutableDBOptions& mutable_db_options) {
+  DBOptions options;
+
+  options.create_if_missing = immutable_db_options.create_if_missing;
+  options.create_missing_column_families =
+      immutable_db_options.create_missing_column_families;
+  options.error_if_exists = immutable_db_options.error_if_exists;
+  options.paranoid_checks = immutable_db_options.paranoid_checks;
+  options.flush_verify_memtable_count =
+      immutable_db_options.flush_verify_memtable_count;
+  options.track_and_verify_wals_in_manifest =
+      immutable_db_options.track_and_verify_wals_in_manifest;
+  options.verify_sst_unique_id_in_manifest =
+      immutable_db_options.verify_sst_unique_id_in_manifest;
+  options.env = immutable_db_options.env;
+  options.rate_limiter = immutable_db_options.rate_limiter;
+  options.sst_file_manager = immutable_db_options.sst_file_manager;
+  options.info_log = immutable_db_options.info_log;
+  options.info_log_level = immutable_db_options.info_log_level;
+  options.max_open_files = mutable_db_options.max_open_files;
+  options.max_file_opening_threads =
+      immutable_db_options.max_file_opening_threads;
+  options.max_total_wal_size = mutable_db_options.max_total_wal_size;
+  options.statistics = immutable_db_options.statistics;
+  options.use_fsync = immutable_db_options.use_fsync;
+  options.db_paths = immutable_db_options.db_paths;
+  options.db_log_dir = immutable_db_options.db_log_dir;
+  options.wal_dir = immutable_db_options.wal_dir;
+  options.delete_obsolete_files_period_micros =
+      mutable_db_options.delete_obsolete_files_period_micros;
+  options.max_background_jobs = mutable_db_options.max_background_jobs;
+  options.max_background_compactions =
+      mutable_db_options.max_background_compactions;
+  options.bytes_per_sync = mutable_db_options.bytes_per_sync;
+  options.wal_bytes_per_sync = mutable_db_options.wal_bytes_per_sync;
+  options.strict_bytes_per_sync = mutable_db_options.strict_bytes_per_sync;
+  options.max_subcompactions = mutable_db_options.max_subcompactions;
+  options.max_background_flushes = mutable_db_options.max_background_flushes;
+  options.max_log_file_size = immutable_db_options.max_log_file_size;
+  options.log_file_time_to_roll = immutable_db_options.log_file_time_to_roll;
+  options.keep_log_file_num = immutable_db_options.keep_log_file_num;
+  options.recycle_log_file_num = immutable_db_options.recycle_log_file_num;
+  options.max_manifest_file_size = immutable_db_options.max_manifest_file_size;
+  options.table_cache_numshardbits =
+      immutable_db_options.table_cache_numshardbits;
+  options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds;
+  options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB;
+  options.manifest_preallocation_size =
+      immutable_db_options.manifest_preallocation_size;
+  options.allow_mmap_reads = immutable_db_options.allow_mmap_reads;
+  options.allow_mmap_writes = immutable_db_options.allow_mmap_writes;
+  options.use_direct_reads = immutable_db_options.use_direct_reads;
+  options.use_direct_io_for_flush_and_compaction =
+      immutable_db_options.use_direct_io_for_flush_and_compaction;
+  options.allow_fallocate = immutable_db_options.allow_fallocate;
+  options.is_fd_close_on_exec = immutable_db_options.is_fd_close_on_exec;
+  options.stats_dump_period_sec = mutable_db_options.stats_dump_period_sec;
+  options.stats_persist_period_sec =
+      mutable_db_options.stats_persist_period_sec;
+  options.persist_stats_to_disk = immutable_db_options.persist_stats_to_disk;
+  options.stats_history_buffer_size =
+      mutable_db_options.stats_history_buffer_size;
+  options.advise_random_on_open = immutable_db_options.advise_random_on_open;
+  options.db_write_buffer_size = immutable_db_options.db_write_buffer_size;
+  options.write_buffer_manager = immutable_db_options.write_buffer_manager;
+  options.access_hint_on_compaction_start =
+      immutable_db_options.access_hint_on_compaction_start;
+  options.compaction_readahead_size =
+      mutable_db_options.compaction_readahead_size;
+  options.random_access_max_buffer_size =
+      immutable_db_options.random_access_max_buffer_size;
+  options.writable_file_max_buffer_size =
+      mutable_db_options.writable_file_max_buffer_size;
+  options.use_adaptive_mutex = immutable_db_options.use_adaptive_mutex;
+  options.listeners = immutable_db_options.listeners;
+  options.enable_thread_tracking = immutable_db_options.enable_thread_tracking;
+  options.delayed_write_rate = mutable_db_options.delayed_write_rate;
+  options.enable_pipelined_write = immutable_db_options.enable_pipelined_write;
+  options.unordered_write = immutable_db_options.unordered_write;
+  options.allow_concurrent_memtable_write =
+      immutable_db_options.allow_concurrent_memtable_write;
+  options.enable_write_thread_adaptive_yield =
+      immutable_db_options.enable_write_thread_adaptive_yield;
+  options.max_write_batch_group_size_bytes =
+      immutable_db_options.max_write_batch_group_size_bytes;
+  options.write_thread_max_yield_usec =
+      immutable_db_options.write_thread_max_yield_usec;
+  options.write_thread_slow_yield_usec =
+      immutable_db_options.write_thread_slow_yield_usec;
+  options.skip_stats_update_on_db_open =
+      immutable_db_options.skip_stats_update_on_db_open;
+  options.skip_checking_sst_file_sizes_on_db_open =
+      immutable_db_options.skip_checking_sst_file_sizes_on_db_open;
+  options.wal_recovery_mode = immutable_db_options.wal_recovery_mode;
+  options.allow_2pc = immutable_db_options.allow_2pc;
+  options.row_cache = immutable_db_options.row_cache;
+#ifndef ROCKSDB_LITE
+  options.wal_filter = immutable_db_options.wal_filter;
+#endif  // ROCKSDB_LITE
+  options.fail_if_options_file_error =
+      immutable_db_options.fail_if_options_file_error;
+  options.dump_malloc_stats = immutable_db_options.dump_malloc_stats;
+  options.avoid_flush_during_recovery =
+      immutable_db_options.avoid_flush_during_recovery;
+  options.avoid_flush_during_shutdown =
+      mutable_db_options.avoid_flush_during_shutdown;
+  options.allow_ingest_behind = immutable_db_options.allow_ingest_behind;
+  options.two_write_queues = immutable_db_options.two_write_queues;
+  options.manual_wal_flush = immutable_db_options.manual_wal_flush;
+  options.wal_compression = immutable_db_options.wal_compression;
+  options.atomic_flush = immutable_db_options.atomic_flush;
+  options.avoid_unnecessary_blocking_io =
+      immutable_db_options.avoid_unnecessary_blocking_io;
+  options.log_readahead_size = immutable_db_options.log_readahead_size;
+  options.file_checksum_gen_factory =
+      immutable_db_options.file_checksum_gen_factory;
+  options.best_efforts_recovery = immutable_db_options.best_efforts_recovery;
+  options.max_bgerror_resume_count =
+      immutable_db_options.max_bgerror_resume_count;
+  options.bgerror_resume_retry_interval =
+      immutable_db_options.bgerror_resume_retry_interval;
+  options.db_host_id = immutable_db_options.db_host_id;
+  options.allow_data_in_errors = immutable_db_options.allow_data_in_errors;
+  options.checksum_handoff_file_types =
+      immutable_db_options.checksum_handoff_file_types;
+  options.lowest_used_cache_tier = immutable_db_options.lowest_used_cache_tier;
+  options.enforce_single_del_contracts =
+      immutable_db_options.enforce_single_del_contracts;
+  return options;
+}
+
+ColumnFamilyOptions BuildColumnFamilyOptions(
+    const ColumnFamilyOptions& options,
+    const MutableCFOptions& mutable_cf_options) {
+  ColumnFamilyOptions cf_opts(options);
+  UpdateColumnFamilyOptions(mutable_cf_options, &cf_opts);
+  // TODO(yhchiang): find some way to handle the following derived options
+  // * max_file_size
+  return cf_opts;
+}
+
+void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
+                               ColumnFamilyOptions* cf_opts) {
+  // Memtable related options
+  cf_opts->write_buffer_size = moptions.write_buffer_size;
+  cf_opts->max_write_buffer_number = moptions.max_write_buffer_number;
+  cf_opts->arena_block_size = moptions.arena_block_size;
+  cf_opts->memtable_prefix_bloom_size_ratio =
+      moptions.memtable_prefix_bloom_size_ratio;
+  cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering;
+  cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size;
+  cf_opts->max_successive_merges = moptions.max_successive_merges;
+  cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks;
+  cf_opts->prefix_extractor = moptions.prefix_extractor;
+  cf_opts->experimental_mempurge_threshold =
+      moptions.experimental_mempurge_threshold;
+  cf_opts->memtable_protection_bytes_per_key =
+      moptions.memtable_protection_bytes_per_key;
+
+  // Compaction related options
+  cf_opts->disable_auto_compactions = moptions.disable_auto_compactions;
+  cf_opts->soft_pending_compaction_bytes_limit =
+      moptions.soft_pending_compaction_bytes_limit;
+  cf_opts->hard_pending_compaction_bytes_limit =
+      moptions.hard_pending_compaction_bytes_limit;
+  cf_opts->level0_file_num_compaction_trigger =
+      moptions.level0_file_num_compaction_trigger;
+  cf_opts->level0_slowdown_writes_trigger =
+      moptions.level0_slowdown_writes_trigger;
+  cf_opts->level0_stop_writes_trigger = moptions.level0_stop_writes_trigger;
+  cf_opts->max_compaction_bytes = moptions.max_compaction_bytes;
+  cf_opts->ignore_max_compaction_bytes_for_input =
+      moptions.ignore_max_compaction_bytes_for_input;
+  cf_opts->target_file_size_base = moptions.target_file_size_base;
+  cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier;
+  cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base;
+  cf_opts->max_bytes_for_level_multiplier =
+      moptions.max_bytes_for_level_multiplier;
+  cf_opts->ttl = moptions.ttl;
+  cf_opts->periodic_compaction_seconds = moptions.periodic_compaction_seconds;
+
+  cf_opts->max_bytes_for_level_multiplier_additional.clear();
+  for (auto value : moptions.max_bytes_for_level_multiplier_additional) {
+    cf_opts->max_bytes_for_level_multiplier_additional.emplace_back(value);
+  }
+
+  cf_opts->compaction_options_fifo = moptions.compaction_options_fifo;
+  cf_opts->compaction_options_universal = moptions.compaction_options_universal;
+
+  // Blob file related options
+  cf_opts->enable_blob_files = moptions.enable_blob_files;
+  cf_opts->min_blob_size = moptions.min_blob_size;
+  cf_opts->blob_file_size = moptions.blob_file_size;
+  cf_opts->blob_compression_type = moptions.blob_compression_type;
+  cf_opts->enable_blob_garbage_collection =
+      moptions.enable_blob_garbage_collection;
+  cf_opts->blob_garbage_collection_age_cutoff =
+      moptions.blob_garbage_collection_age_cutoff;
+  cf_opts->blob_garbage_collection_force_threshold =
+      moptions.blob_garbage_collection_force_threshold;
+  cf_opts->blob_compaction_readahead_size =
+      moptions.blob_compaction_readahead_size;
+  cf_opts->blob_file_starting_level = moptions.blob_file_starting_level;
+  cf_opts->prepopulate_blob_cache = moptions.prepopulate_blob_cache;
+
+  // Misc options
+  cf_opts->max_sequential_skip_in_iterations =
+      moptions.max_sequential_skip_in_iterations;
+  cf_opts->check_flush_compaction_key_order =
+      moptions.check_flush_compaction_key_order;
+  cf_opts->paranoid_file_checks = moptions.paranoid_file_checks;
+  cf_opts->report_bg_io_stats = moptions.report_bg_io_stats;
+  cf_opts->compression = moptions.compression;
+  cf_opts->compression_opts = moptions.compression_opts;
+  cf_opts->bottommost_compression = moptions.bottommost_compression;
+  cf_opts->bottommost_compression_opts = moptions.bottommost_compression_opts;
+  cf_opts->sample_for_compression = moptions.sample_for_compression;
+  cf_opts->compression_per_level = moptions.compression_per_level;
+  cf_opts->last_level_temperature = moptions.last_level_temperature;
+  cf_opts->bottommost_temperature = moptions.last_level_temperature;
+}
+
+void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
+                               ColumnFamilyOptions* cf_opts) {
+  cf_opts->compaction_style = ioptions.compaction_style;
+  cf_opts->compaction_pri = ioptions.compaction_pri;
+  cf_opts->comparator = ioptions.user_comparator;
+  cf_opts->merge_operator = ioptions.merge_operator;
+  cf_opts->compaction_filter = ioptions.compaction_filter;
+  cf_opts->compaction_filter_factory = ioptions.compaction_filter_factory;
+  cf_opts->min_write_buffer_number_to_merge =
+      ioptions.min_write_buffer_number_to_merge;
+  cf_opts->max_write_buffer_number_to_maintain =
+      ioptions.max_write_buffer_number_to_maintain;
+  cf_opts->max_write_buffer_size_to_maintain =
+      ioptions.max_write_buffer_size_to_maintain;
+  cf_opts->inplace_update_support = ioptions.inplace_update_support;
+  cf_opts->inplace_callback = ioptions.inplace_callback;
+  cf_opts->memtable_factory = ioptions.memtable_factory;
+  cf_opts->table_factory = ioptions.table_factory;
+  cf_opts->table_properties_collector_factories =
+      ioptions.table_properties_collector_factories;
+  cf_opts->bloom_locality = ioptions.bloom_locality;
+  cf_opts->level_compaction_dynamic_level_bytes =
+      ioptions.level_compaction_dynamic_level_bytes;
+  cf_opts->level_compaction_dynamic_file_size =
+      ioptions.level_compaction_dynamic_file_size;
+  cf_opts->num_levels = ioptions.num_levels;
+  cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits;
+  cf_opts->force_consistency_checks = ioptions.force_consistency_checks;
+  cf_opts->memtable_insert_with_hint_prefix_extractor =
+      ioptions.memtable_insert_with_hint_prefix_extractor;
+  cf_opts->cf_paths = ioptions.cf_paths;
+  cf_opts->compaction_thread_limiter = ioptions.compaction_thread_limiter;
+  cf_opts->sst_partitioner_factory = ioptions.sst_partitioner_factory;
+  cf_opts->blob_cache = ioptions.blob_cache;
+  cf_opts->preclude_last_level_data_seconds =
+      ioptions.preclude_last_level_data_seconds;
+  cf_opts->preserve_internal_time_seconds =
+      ioptions.preserve_internal_time_seconds;
+
+  // TODO(yhchiang): find some way to handle the following derived options
+  // * max_file_size
+}
+
+std::map<CompactionStyle, std::string>
+    OptionsHelper::compaction_style_to_string = {
+        {kCompactionStyleLevel, "kCompactionStyleLevel"},
+        {kCompactionStyleUniversal, "kCompactionStyleUniversal"},
+        {kCompactionStyleFIFO, "kCompactionStyleFIFO"},
+        {kCompactionStyleNone, "kCompactionStyleNone"}};
+
+std::map<CompactionPri, std::string> OptionsHelper::compaction_pri_to_string = {
+    {kByCompensatedSize, "kByCompensatedSize"},
+    {kOldestLargestSeqFirst, "kOldestLargestSeqFirst"},
+    {kOldestSmallestSeqFirst, "kOldestSmallestSeqFirst"},
+    {kMinOverlappingRatio, "kMinOverlappingRatio"},
+    {kRoundRobin, "kRoundRobin"}};
+
+std::map<CompactionStopStyle, std::string>
+    OptionsHelper::compaction_stop_style_to_string = {
+        {kCompactionStopStyleSimilarSize, "kCompactionStopStyleSimilarSize"},
+        {kCompactionStopStyleTotalSize, "kCompactionStopStyleTotalSize"}};
+
+std::map<Temperature, std::string> OptionsHelper::temperature_to_string = {
+    {Temperature::kUnknown, "kUnknown"},
+    {Temperature::kHot, "kHot"},
+    {Temperature::kWarm, "kWarm"},
+    {Temperature::kCold, "kCold"}};
+
+std::unordered_map<std::string, ChecksumType>
+    OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum},
+                                               {"kCRC32c", kCRC32c},
+                                               {"kxxHash", kxxHash},
+                                               {"kxxHash64", kxxHash64},
+                                               {"kXXH3", kXXH3}};
+
+std::unordered_map<std::string, CompressionType>
+    OptionsHelper::compression_type_string_map = {
+        {"kNoCompression", kNoCompression},
+        {"kSnappyCompression", kSnappyCompression},
+        {"kZlibCompression", kZlibCompression},
+        {"kBZip2Compression", kBZip2Compression},
+        {"kLZ4Compression", kLZ4Compression},
+        {"kLZ4HCCompression", kLZ4HCCompression},
+        {"kXpressCompression", kXpressCompression},
+        {"kZSTD", kZSTD},
+        {"kZSTDNotFinalCompression", kZSTDNotFinalCompression},
+        {"kDisableCompressionOption", kDisableCompressionOption}};
+
+std::vector<CompressionType> GetSupportedCompressions() {
+  // std::set internally to deduplicate potential name aliases
+  std::set<CompressionType> supported_compressions;
+  for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) {
+    CompressionType t = comp_to_name.second;
+    if (t != kDisableCompressionOption && CompressionTypeSupported(t)) {
+      supported_compressions.insert(t);
+    }
+  }
+  return std::vector<CompressionType>(supported_compressions.begin(),
+                                      supported_compressions.end());
+}
+
+std::vector<CompressionType> GetSupportedDictCompressions() {
+  std::set<CompressionType> dict_compression_types;
+  for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) {
+    CompressionType t = comp_to_name.second;
+    if (t != kDisableCompressionOption && DictCompressionTypeSupported(t)) {
+      dict_compression_types.insert(t);
+    }
+  }
+  return std::vector<CompressionType>(dict_compression_types.begin(),
+                                      dict_compression_types.end());
+}
+
+std::vector<ChecksumType> GetSupportedChecksums() {
+  std::set<ChecksumType> checksum_types;
+  for (const auto& e : OptionsHelper::checksum_type_string_map) {
+    checksum_types.insert(e.second);
+  }
+  return std::vector<ChecksumType>(checksum_types.begin(),
+                                   checksum_types.end());
+}
+
+#ifndef ROCKSDB_LITE
+static bool ParseOptionHelper(void* opt_address, const OptionType& opt_type,
+                              const std::string& value) {
+  switch (opt_type) {
+    case OptionType::kBoolean:
+      *static_cast<bool*>(opt_address) = ParseBoolean("", value);
+      break;
+    case OptionType::kInt:
+      *static_cast<int*>(opt_address) = ParseInt(value);
+      break;
+    case OptionType::kInt32T:
+      *static_cast<int32_t*>(opt_address) = ParseInt32(value);
+      break;
+    case OptionType::kInt64T:
+      PutUnaligned(static_cast<int64_t*>(opt_address), ParseInt64(value));
+      break;
+    case OptionType::kUInt:
+      *static_cast<unsigned int*>(opt_address) = ParseUint32(value);
+      break;
+    case OptionType::kUInt8T:
+      *static_cast<uint8_t*>(opt_address) = ParseUint8(value);
+      break;
+    case OptionType::kUInt32T:
+      *static_cast<uint32_t*>(opt_address) = ParseUint32(value);
+      break;
+    case OptionType::kUInt64T:
+      PutUnaligned(static_cast<uint64_t*>(opt_address), ParseUint64(value));
+      break;
+    case OptionType::kSizeT:
+      PutUnaligned(static_cast<size_t*>(opt_address), ParseSizeT(value));
+      break;
+    case OptionType::kString:
+      *static_cast<std::string*>(opt_address) = value;
+      break;
+    case OptionType::kDouble:
+      *static_cast<double*>(opt_address) = ParseDouble(value);
+      break;
+    case OptionType::kCompactionStyle:
+      return ParseEnum<CompactionStyle>(
+          compaction_style_string_map, value,
+          static_cast<CompactionStyle*>(opt_address));
+    case OptionType::kCompactionPri:
+      return ParseEnum<CompactionPri>(compaction_pri_string_map, value,
+                                      static_cast<CompactionPri*>(opt_address));
+    case OptionType::kCompressionType:
+      return ParseEnum<CompressionType>(
+          compression_type_string_map, value,
+          static_cast<CompressionType*>(opt_address));
+    case OptionType::kChecksumType:
+      return ParseEnum<ChecksumType>(checksum_type_string_map, value,
+                                     static_cast<ChecksumType*>(opt_address));
+    case OptionType::kEncodingType:
+      return ParseEnum<EncodingType>(encoding_type_string_map, value,
+                                     static_cast<EncodingType*>(opt_address));
+    case OptionType::kCompactionStopStyle:
+      return ParseEnum<CompactionStopStyle>(
+          compaction_stop_style_string_map, value,
+          static_cast<CompactionStopStyle*>(opt_address));
+    case OptionType::kEncodedString: {
+      std::string* output_addr = static_cast<std::string*>(opt_address);
+      (Slice(value)).DecodeHex(output_addr);
+      break;
+    }
+    case OptionType::kTemperature: {
+      return ParseEnum<Temperature>(temperature_string_map, value,
+                                    static_cast<Temperature*>(opt_address));
+    }
+    default:
+      return false;
+  }
+  return true;
+}
+
+bool SerializeSingleOptionHelper(const void* opt_address,
+                                 const OptionType opt_type,
+                                 std::string* value) {
+  assert(value);
+  switch (opt_type) {
+    case OptionType::kBoolean:
+      *value = *(static_cast<const bool*>(opt_address)) ? "true" : "false";
+      break;
+    case OptionType::kInt:
+      *value = std::to_string(*(static_cast<const int*>(opt_address)));
+      break;
+    case OptionType::kInt32T:
+      *value = std::to_string(*(static_cast<const int32_t*>(opt_address)));
+      break;
+    case OptionType::kInt64T:
+      {
+        int64_t v;
+        GetUnaligned(static_cast<const int64_t*>(opt_address), &v);
+        *value = std::to_string(v);
+      }
+      break;
+    case OptionType::kUInt:
+      *value = std::to_string(*(static_cast<const unsigned int*>(opt_address)));
+      break;
+    case OptionType::kUInt8T:
+      *value = std::to_string(*(static_cast<const uint8_t*>(opt_address)));
+      break;
+    case OptionType::kUInt32T:
+      *value = std::to_string(*(static_cast<const uint32_t*>(opt_address)));
+      break;
+    case OptionType::kUInt64T:
+      {
+        uint64_t v;
+        GetUnaligned(static_cast<const uint64_t*>(opt_address), &v);
+        *value = std::to_string(v);
+      }
+      break;
+    case OptionType::kSizeT:
+      {
+        size_t v;
+        GetUnaligned(static_cast<const size_t*>(opt_address), &v);
+        *value = std::to_string(v);
+      }
+      break;
+    case OptionType::kDouble:
+      *value = std::to_string(*(static_cast<const double*>(opt_address)));
+      break;
+    case OptionType::kString:
+      *value =
+          EscapeOptionString(*(static_cast<const std::string*>(opt_address)));
+      break;
+    case OptionType::kCompactionStyle:
+      return SerializeEnum<CompactionStyle>(
+          compaction_style_string_map,
+          *(static_cast<const CompactionStyle*>(opt_address)), value);
+    case OptionType::kCompactionPri:
+      return SerializeEnum<CompactionPri>(
+          compaction_pri_string_map,
+          *(static_cast<const CompactionPri*>(opt_address)), value);
+    case OptionType::kCompressionType:
+      return SerializeEnum<CompressionType>(
+          compression_type_string_map,
+          *(static_cast<const CompressionType*>(opt_address)), value);
+      break;
+    case OptionType::kChecksumType:
+      return SerializeEnum<ChecksumType>(
+          checksum_type_string_map,
+          *static_cast<const ChecksumType*>(opt_address), value);
+    case OptionType::kEncodingType:
+      return SerializeEnum<EncodingType>(
+          encoding_type_string_map,
+          *static_cast<const EncodingType*>(opt_address), value);
+    case OptionType::kCompactionStopStyle:
+      return SerializeEnum<CompactionStopStyle>(
+          compaction_stop_style_string_map,
+          *static_cast<const CompactionStopStyle*>(opt_address), value);
+    case OptionType::kEncodedString: {
+      const auto* ptr = static_cast<const std::string*>(opt_address);
+      *value = (Slice(*ptr)).ToString(true);
+      break;
+    }
+    case OptionType::kTemperature: {
+      return SerializeEnum<Temperature>(
+          temperature_string_map, *static_cast<const Temperature*>(opt_address),
+          value);
+    }
+    default:
+      return false;
+  }
+  return true;
+}
+
+template <typename T>
+Status ConfigureFromMap(
+    const ConfigOptions& config_options,
+    const std::unordered_map<std::string, std::string>& opt_map,
+    const std::string& option_name, Configurable* config, T* new_opts) {
+  Status s = config->ConfigureFromMap(config_options, opt_map);
+  if (s.ok()) {
+    *new_opts = *(config->GetOptions<T>(option_name));
+  }
+  return s;
+}
+
+
+Status StringToMap(const std::string& opts_str,
+                   std::unordered_map<std::string, std::string>* opts_map) {
+  assert(opts_map);
+  // Example:
+  //   opts_str = "write_buffer_size=1024;max_write_buffer_number=2;"
+  //              "nested_opt={opt1=1;opt2=2};max_bytes_for_level_base=100"
+  size_t pos = 0;
+  std::string opts = trim(opts_str);
+  // If the input string starts and ends with "{...}", strip off the brackets
+  while (opts.size() > 2 && opts[0] == '{' && opts[opts.size() - 1] == '}') {
+    opts = trim(opts.substr(1, opts.size() - 2));
+  }
+
+  while (pos < opts.size()) {
+    size_t eq_pos = opts.find_first_of("={};", pos);
+    if (eq_pos == std::string::npos) {
+      return Status::InvalidArgument("Mismatched key value pair, '=' expected");
+    } else if (opts[eq_pos] != '=') {
+      return Status::InvalidArgument("Unexpected char in key");
+    }
+
+    std::string key = trim(opts.substr(pos, eq_pos - pos));
+    if (key.empty()) {
+      return Status::InvalidArgument("Empty key found");
+    }
+
+    std::string value;
+    Status s = OptionTypeInfo::NextToken(opts, ';', eq_pos + 1, &pos, &value);
+    if (!s.ok()) {
+      return s;
+    } else {
+      (*opts_map)[key] = value;
+      if (pos == std::string::npos) {
+        break;
+      } else {
+        pos++;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+
+Status GetStringFromDBOptions(std::string* opt_string,
+                              const DBOptions& db_options,
+                              const std::string& delimiter) {
+  ConfigOptions config_options(db_options);
+  config_options.delimiter = delimiter;
+  return GetStringFromDBOptions(config_options, db_options, opt_string);
+}
+
+Status GetStringFromDBOptions(const ConfigOptions& config_options,
+                              const DBOptions& db_options,
+                              std::string* opt_string) {
+  assert(opt_string);
+  opt_string->clear();
+  auto config = DBOptionsAsConfigurable(db_options);
+  return config->GetOptionString(config_options, opt_string);
+}
+
+
+Status GetStringFromColumnFamilyOptions(std::string* opt_string,
+                                        const ColumnFamilyOptions& cf_options,
+                                        const std::string& delimiter) {
+  ConfigOptions config_options;
+  config_options.delimiter = delimiter;
+  return GetStringFromColumnFamilyOptions(config_options, cf_options,
+                                          opt_string);
+}
+
+Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options,
+                                        const ColumnFamilyOptions& cf_options,
+                                        std::string* opt_string) {
+  const auto config = CFOptionsAsConfigurable(cf_options);
+  return config->GetOptionString(config_options, opt_string);
+}
+
+Status GetStringFromCompressionType(std::string* compression_str,
+                                    CompressionType compression_type) {
+  bool ok = SerializeEnum<CompressionType>(compression_type_string_map,
+                                           compression_type, compression_str);
+  if (ok) {
+    return Status::OK();
+  } else {
+    return Status::InvalidArgument("Invalid compression types");
+  }
+}
+
+Status GetColumnFamilyOptionsFromMap(
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options, bool input_strings_escaped,
+    bool ignore_unknown_options) {
+  ConfigOptions config_options;
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  config_options.input_strings_escaped = input_strings_escaped;
+  return GetColumnFamilyOptionsFromMap(config_options, base_options, opts_map,
+                                       new_options);
+}
+
+Status GetColumnFamilyOptionsFromMap(
+    const ConfigOptions& config_options,
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options) {
+  assert(new_options);
+
+  *new_options = base_options;
+
+  const auto config = CFOptionsAsConfigurable(base_options);
+  Status s = ConfigureFromMap<ColumnFamilyOptions>(
+      config_options, opts_map, OptionsHelper::kCFOptionsName, config.get(),
+      new_options);
+  // Translate any errors (NotFound, NotSupported, to InvalidArgument
+  if (s.ok() || s.IsInvalidArgument()) {
+    return s;
+  } else {
+    return Status::InvalidArgument(s.getState());
+  }
+}
+
+Status GetColumnFamilyOptionsFromString(
+    const ColumnFamilyOptions& base_options,
+    const std::string& opts_str,
+    ColumnFamilyOptions* new_options) {
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+  return GetColumnFamilyOptionsFromString(config_options, base_options,
+                                          opts_str, new_options);
+}
+
+Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options,
+                                        const ColumnFamilyOptions& base_options,
+                                        const std::string& opts_str,
+                                        ColumnFamilyOptions* new_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    *new_options = base_options;
+    return s;
+  }
+  return GetColumnFamilyOptionsFromMap(config_options, base_options, opts_map,
+                                       new_options);
+}
+
+Status GetDBOptionsFromMap(
+    const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options, bool input_strings_escaped,
+    bool ignore_unknown_options) {
+  ConfigOptions config_options(base_options);
+  config_options.input_strings_escaped = input_strings_escaped;
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  return GetDBOptionsFromMap(config_options, base_options, opts_map,
+                             new_options);
+}
+
+Status GetDBOptionsFromMap(
+    const ConfigOptions& config_options, const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options) {
+  assert(new_options);
+  *new_options = base_options;
+  auto config = DBOptionsAsConfigurable(base_options);
+  Status s = ConfigureFromMap<DBOptions>(config_options, opts_map,
+                                         OptionsHelper::kDBOptionsName,
+                                         config.get(), new_options);
+  // Translate any errors (NotFound, NotSupported, to InvalidArgument
+  if (s.ok() || s.IsInvalidArgument()) {
+    return s;
+  } else {
+    return Status::InvalidArgument(s.getState());
+  }
+}
+
+Status GetDBOptionsFromString(const DBOptions& base_options,
+                              const std::string& opts_str,
+                              DBOptions* new_options) {
+  ConfigOptions config_options(base_options);
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+
+  return GetDBOptionsFromString(config_options, base_options, opts_str,
+                                new_options);
+}
+
+Status GetDBOptionsFromString(const ConfigOptions& config_options,
+                              const DBOptions& base_options,
+                              const std::string& opts_str,
+                              DBOptions* new_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    *new_options = base_options;
+    return s;
+  }
+  return GetDBOptionsFromMap(config_options, base_options, opts_map,
+                             new_options);
+}
+
+Status GetOptionsFromString(const Options& base_options,
+                            const std::string& opts_str, Options* new_options) {
+  ConfigOptions config_options(base_options);
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+
+  return GetOptionsFromString(config_options, base_options, opts_str,
+                              new_options);
+}
+
+Status GetOptionsFromString(const ConfigOptions& config_options,
+                            const Options& base_options,
+                            const std::string& opts_str, Options* new_options) {
+  ColumnFamilyOptions new_cf_options;
+  std::unordered_map<std::string, std::string> unused_opts;
+  std::unordered_map<std::string, std::string> opts_map;
+
+  assert(new_options);
+  *new_options = base_options;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    return s;
+  }
+  auto config = DBOptionsAsConfigurable(base_options);
+  s = config->ConfigureFromMap(config_options, opts_map, &unused_opts);
+
+  if (s.ok()) {
+    DBOptions* new_db_options =
+        config->GetOptions<DBOptions>(OptionsHelper::kDBOptionsName);
+    if (!unused_opts.empty()) {
+      s = GetColumnFamilyOptionsFromMap(config_options, base_options,
+                                        unused_opts, &new_cf_options);
+      if (s.ok()) {
+        *new_options = Options(*new_db_options, new_cf_options);
+      }
+    } else {
+      *new_options = Options(*new_db_options, base_options);
+    }
+  }
+  // Translate any errors (NotFound, NotSupported, to InvalidArgument
+  if (s.ok() || s.IsInvalidArgument()) {
+    return s;
+  } else {
+    return Status::InvalidArgument(s.getState());
+  }
+}
+
+std::unordered_map<std::string, EncodingType>
+    OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain},
+                                               {"kPrefix", kPrefix}};
+
+std::unordered_map<std::string, CompactionStyle>
+    OptionsHelper::compaction_style_string_map = {
+        {"kCompactionStyleLevel", kCompactionStyleLevel},
+        {"kCompactionStyleUniversal", kCompactionStyleUniversal},
+        {"kCompactionStyleFIFO", kCompactionStyleFIFO},
+        {"kCompactionStyleNone", kCompactionStyleNone}};
+
+std::unordered_map<std::string, CompactionPri>
+    OptionsHelper::compaction_pri_string_map = {
+        {"kByCompensatedSize", kByCompensatedSize},
+        {"kOldestLargestSeqFirst", kOldestLargestSeqFirst},
+        {"kOldestSmallestSeqFirst", kOldestSmallestSeqFirst},
+        {"kMinOverlappingRatio", kMinOverlappingRatio},
+        {"kRoundRobin", kRoundRobin}};
+
+std::unordered_map<std::string, CompactionStopStyle>
+    OptionsHelper::compaction_stop_style_string_map = {
+        {"kCompactionStopStyleSimilarSize", kCompactionStopStyleSimilarSize},
+        {"kCompactionStopStyleTotalSize", kCompactionStopStyleTotalSize}};
+
+std::unordered_map<std::string, Temperature>
+    OptionsHelper::temperature_string_map = {
+        {"kUnknown", Temperature::kUnknown},
+        {"kHot", Temperature::kHot},
+        {"kWarm", Temperature::kWarm},
+        {"kCold", Temperature::kCold}};
+
+std::unordered_map<std::string, PrepopulateBlobCache>
+    OptionsHelper::prepopulate_blob_cache_string_map = {
+        {"kDisable", PrepopulateBlobCache::kDisable},
+        {"kFlushOnly", PrepopulateBlobCache::kFlushOnly}};
+
+Status OptionTypeInfo::NextToken(const std::string& opts, char delimiter,
+                                 size_t pos, size_t* end, std::string* token) {
+  while (pos < opts.size() && isspace(opts[pos])) {
+    ++pos;
+  }
+  // Empty value at the end
+  if (pos >= opts.size()) {
+    *token = "";
+    *end = std::string::npos;
+    return Status::OK();
+  } else if (opts[pos] == '{') {
+    int count = 1;
+    size_t brace_pos = pos + 1;
+    while (brace_pos < opts.size()) {
+      if (opts[brace_pos] == '{') {
+        ++count;
+      } else if (opts[brace_pos] == '}') {
+        --count;
+        if (count == 0) {
+          break;
+        }
+      }
+      ++brace_pos;
+    }
+    // found the matching closing brace
+    if (count == 0) {
+      *token = trim(opts.substr(pos + 1, brace_pos - pos - 1));
+      // skip all whitespace and move to the next delimiter
+      // brace_pos points to the next position after the matching '}'
+      pos = brace_pos + 1;
+      while (pos < opts.size() && isspace(opts[pos])) {
+        ++pos;
+      }
+      if (pos < opts.size() && opts[pos] != delimiter) {
+        return Status::InvalidArgument("Unexpected chars after nested options");
+      }
+      *end = pos;
+    } else {
+      return Status::InvalidArgument(
+          "Mismatched curly braces for nested options");
+    }
+  } else {
+    *end = opts.find(delimiter, pos);
+    if (*end == std::string::npos) {
+      // It either ends with a trailing semi-colon or the last key-value pair
+      *token = trim(opts.substr(pos));
+    } else {
+      *token = trim(opts.substr(pos, *end - pos));
+    }
+  }
+  return Status::OK();
+}
+
+Status OptionTypeInfo::Parse(const ConfigOptions& config_options,
+                             const std::string& opt_name,
+                             const std::string& value, void* opt_ptr) const {
+  if (IsDeprecated()) {
+    return Status::OK();
+  }
+  try {
+    const std::string& opt_value = config_options.input_strings_escaped
+                                       ? UnescapeOptionString(value)
+                                       : value;
+
+    if (opt_ptr == nullptr) {
+      return Status::NotFound("Could not find option", opt_name);
+    } else if (parse_func_ != nullptr) {
+      ConfigOptions copy = config_options;
+      copy.invoke_prepare_options = false;
+      void* opt_addr = GetOffset(opt_ptr);
+      return parse_func_(copy, opt_name, opt_value, opt_addr);
+    } else if (ParseOptionHelper(GetOffset(opt_ptr), type_, opt_value)) {
+      return Status::OK();
+    } else if (IsConfigurable()) {
+      // The option is <config>.<name>
+      Configurable* config = AsRawPointer<Configurable>(opt_ptr);
+      if (opt_value.empty()) {
+        return Status::OK();
+      } else if (config == nullptr) {
+        return Status::NotFound("Could not find configurable: ", opt_name);
+      } else {
+        ConfigOptions copy = config_options;
+        copy.ignore_unknown_options = false;
+        copy.invoke_prepare_options = false;
+        if (opt_value.find("=") != std::string::npos) {
+          return config->ConfigureFromString(copy, opt_value);
+        } else {
+          return config->ConfigureOption(copy, opt_name, opt_value);
+        }
+      }
+    } else if (IsByName()) {
+      return Status::NotSupported("Deserializing the option " + opt_name +
+                                  " is not supported");
+    } else {
+      return Status::InvalidArgument("Error parsing:", opt_name);
+    }
+  } catch (std::exception& e) {
+    return Status::InvalidArgument("Error parsing " + opt_name + ":" +
+                                   std::string(e.what()));
+  }
+}
+
+Status OptionTypeInfo::ParseType(
+    const ConfigOptions& config_options, const std::string& opts_str,
+    const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+    void* opt_addr, std::unordered_map<std::string, std::string>* unused) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status status = StringToMap(opts_str, &opts_map);
+  if (!status.ok()) {
+    return status;
+  } else {
+    return ParseType(config_options, opts_map, type_map, opt_addr, unused);
+  }
+}
+
+Status OptionTypeInfo::ParseType(
+    const ConfigOptions& config_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+    void* opt_addr, std::unordered_map<std::string, std::string>* unused) {
+  for (const auto& opts_iter : opts_map) {
+    std::string opt_name;
+    const auto* opt_info = Find(opts_iter.first, type_map, &opt_name);
+    if (opt_info != nullptr) {
+      Status status =
+          opt_info->Parse(config_options, opt_name, opts_iter.second, opt_addr);
+      if (!status.ok()) {
+        return status;
+      }
+    } else if (unused != nullptr) {
+      (*unused)[opts_iter.first] = opts_iter.second;
+    } else if (!config_options.ignore_unknown_options) {
+      return Status::NotFound("Unrecognized option", opts_iter.first);
+    }
+  }
+  return Status::OK();
+}
+
+Status OptionTypeInfo::ParseStruct(
+    const ConfigOptions& config_options, const std::string& struct_name,
+    const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+    const std::string& opt_name, const std::string& opt_value, void* opt_addr) {
+  assert(struct_map);
+  Status status;
+  if (opt_name == struct_name || EndsWith(opt_name, "." + struct_name)) {
+    // This option represents the entire struct
+    std::unordered_map<std::string, std::string> unused;
+    status =
+        ParseType(config_options, opt_value, *struct_map, opt_addr, &unused);
+    if (status.ok() && !unused.empty()) {
+      status = Status::InvalidArgument(
+          "Unrecognized option", struct_name + "." + unused.begin()->first);
+    }
+  } else if (StartsWith(opt_name, struct_name + ".")) {
+    // This option represents a nested field in the struct (e.g, struct.field)
+    std::string elem_name;
+    const auto opt_info =
+        Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name);
+    if (opt_info != nullptr) {
+      status = opt_info->Parse(config_options, elem_name, opt_value, opt_addr);
+    } else {
+      status = Status::InvalidArgument("Unrecognized option", opt_name);
+    }
+  } else {
+    // This option represents a field in the struct (e.g. field)
+    std::string elem_name;
+    const auto opt_info = Find(opt_name, *struct_map, &elem_name);
+    if (opt_info != nullptr) {
+      status = opt_info->Parse(config_options, elem_name, opt_value, opt_addr);
+    } else {
+      status = Status::InvalidArgument("Unrecognized option",
+                                       struct_name + "." + opt_name);
+    }
+  }
+  return status;
+}
+
+Status OptionTypeInfo::Serialize(const ConfigOptions& config_options,
+                                 const std::string& opt_name,
+                                 const void* const opt_ptr,
+                                 std::string* opt_value) const {
+  // If the option is no longer used in rocksdb and marked as deprecated,
+  // we skip it in the serialization.
+  if (opt_ptr == nullptr || IsDeprecated()) {
+    return Status::OK();
+  } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) {
+    return Status::NotSupported("Cannot serialize option: ", opt_name);
+  } else if (serialize_func_ != nullptr) {
+    const void* opt_addr = GetOffset(opt_ptr);
+    return serialize_func_(config_options, opt_name, opt_addr, opt_value);
+  } else if (IsCustomizable()) {
+    const Customizable* custom = AsRawPointer<Customizable>(opt_ptr);
+    opt_value->clear();
+    if (custom == nullptr) {
+      // We do not have a custom object to serialize.
+      // If the option is not mutable and we are doing only mutable options,
+      // we return an empty string (which will cause the option not to be
+      // printed). Otherwise, we return the "nullptr" string, which will result
+      // in "option=nullptr" being printed.
+      if (IsMutable() || !config_options.mutable_options_only) {
+        *opt_value = kNullptrString;
+      } else {
+        *opt_value = "";
+      }
+    } else if (IsEnabled(OptionTypeFlags::kStringNameOnly) &&
+               !config_options.IsDetailed()) {
+      if (!config_options.mutable_options_only || IsMutable()) {
+        *opt_value = custom->GetId();
+      }
+    } else {
+      ConfigOptions embedded = config_options;
+      embedded.delimiter = ";";
+      // If this option is mutable, everything inside it should be considered
+      // mutable
+      if (IsMutable()) {
+        embedded.mutable_options_only = false;
+      }
+      std::string value = custom->ToString(embedded);
+      if (!embedded.mutable_options_only ||
+          value.find("=") != std::string::npos) {
+        *opt_value = value;
+      } else {
+        *opt_value = "";
+      }
+    }
+    return Status::OK();
+  } else if (IsConfigurable()) {
+    const Configurable* config = AsRawPointer<Configurable>(opt_ptr);
+    if (config != nullptr) {
+      ConfigOptions embedded = config_options;
+      embedded.delimiter = ";";
+      *opt_value = config->ToString(embedded);
+    }
+    return Status::OK();
+  } else if (config_options.mutable_options_only && !IsMutable()) {
+    return Status::OK();
+  } else if (SerializeSingleOptionHelper(GetOffset(opt_ptr), type_,
+                                         opt_value)) {
+    return Status::OK();
+  } else {
+    return Status::InvalidArgument("Cannot serialize option: ", opt_name);
+  }
+}
+
+Status OptionTypeInfo::SerializeType(
+    const ConfigOptions& config_options,
+    const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+    const void* opt_addr, std::string* result) {
+  Status status;
+  for (const auto& iter : type_map) {
+    std::string single;
+    const auto& opt_info = iter.second;
+    if (opt_info.ShouldSerialize()) {
+      status =
+          opt_info.Serialize(config_options, iter.first, opt_addr, &single);
+      if (!status.ok()) {
+        return status;
+      } else {
+        result->append(iter.first + "=" + single + config_options.delimiter);
+      }
+    }
+  }
+  return status;
+}
+
+Status OptionTypeInfo::SerializeStruct(
+    const ConfigOptions& config_options, const std::string& struct_name,
+    const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+    const std::string& opt_name, const void* opt_addr, std::string* value) {
+  assert(struct_map);
+  Status status;
+  if (EndsWith(opt_name, struct_name)) {
+    // We are going to write the struct as "{ prop1=value1; prop2=value2;}.
+    // Set the delimiter to ";" so that the everything will be on one line.
+    ConfigOptions embedded = config_options;
+    embedded.delimiter = ";";
+
+    // This option represents the entire struct
+    std::string result;
+    status = SerializeType(embedded, *struct_map, opt_addr, &result);
+    if (!status.ok()) {
+      return status;
+    } else {
+      *value = "{" + result + "}";
+    }
+  } else if (StartsWith(opt_name, struct_name + ".")) {
+    // This option represents a nested field in the struct (e.g, struct.field)
+    std::string elem_name;
+    const auto opt_info =
+        Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name);
+    if (opt_info != nullptr) {
+      status = opt_info->Serialize(config_options, elem_name, opt_addr, value);
+    } else {
+      status = Status::InvalidArgument("Unrecognized option", opt_name);
+    }
+  } else {
+    // This option represents a field in the struct (e.g. field)
+    std::string elem_name;
+    const auto opt_info = Find(opt_name, *struct_map, &elem_name);
+    if (opt_info == nullptr) {
+      status = Status::InvalidArgument("Unrecognized option", opt_name);
+    } else if (opt_info->ShouldSerialize()) {
+      status = opt_info->Serialize(config_options, opt_name + "." + elem_name,
+                                   opt_addr, value);
+    }
+  }
+  return status;
+}
+
+template <typename T>
+bool IsOptionEqual(const void* offset1, const void* offset2) {
+  return (*static_cast<const T*>(offset1) == *static_cast<const T*>(offset2));
+}
+
+static bool AreEqualDoubles(const double a, const double b) {
+  return (fabs(a - b) < 0.00001);
+}
+
+static bool AreOptionsEqual(OptionType type, const void* this_offset,
+                            const void* that_offset) {
+  switch (type) {
+    case OptionType::kBoolean:
+      return IsOptionEqual<bool>(this_offset, that_offset);
+    case OptionType::kInt:
+      return IsOptionEqual<int>(this_offset, that_offset);
+    case OptionType::kUInt:
+      return IsOptionEqual<unsigned int>(this_offset, that_offset);
+    case OptionType::kInt32T:
+      return IsOptionEqual<int32_t>(this_offset, that_offset);
+    case OptionType::kInt64T: {
+      int64_t v1, v2;
+      GetUnaligned(static_cast<const int64_t*>(this_offset), &v1);
+      GetUnaligned(static_cast<const int64_t*>(that_offset), &v2);
+      return (v1 == v2);
+    }
+    case OptionType::kUInt8T:
+      return IsOptionEqual<uint8_t>(this_offset, that_offset);
+    case OptionType::kUInt32T:
+      return IsOptionEqual<uint32_t>(this_offset, that_offset);
+    case OptionType::kUInt64T: {
+      uint64_t v1, v2;
+      GetUnaligned(static_cast<const uint64_t*>(this_offset), &v1);
+      GetUnaligned(static_cast<const uint64_t*>(that_offset), &v2);
+      return (v1 == v2);
+    }
+    case OptionType::kSizeT: {
+      size_t v1, v2;
+      GetUnaligned(static_cast<const size_t*>(this_offset), &v1);
+      GetUnaligned(static_cast<const size_t*>(that_offset), &v2);
+      return (v1 == v2);
+    }
+    case OptionType::kString:
+      return IsOptionEqual<std::string>(this_offset, that_offset);
+    case OptionType::kDouble:
+      return AreEqualDoubles(*static_cast<const double*>(this_offset),
+                             *static_cast<const double*>(that_offset));
+    case OptionType::kCompactionStyle:
+      return IsOptionEqual<CompactionStyle>(this_offset, that_offset);
+    case OptionType::kCompactionStopStyle:
+      return IsOptionEqual<CompactionStopStyle>(this_offset, that_offset);
+    case OptionType::kCompactionPri:
+      return IsOptionEqual<CompactionPri>(this_offset, that_offset);
+    case OptionType::kCompressionType:
+      return IsOptionEqual<CompressionType>(this_offset, that_offset);
+    case OptionType::kChecksumType:
+      return IsOptionEqual<ChecksumType>(this_offset, that_offset);
+    case OptionType::kEncodingType:
+      return IsOptionEqual<EncodingType>(this_offset, that_offset);
+    case OptionType::kEncodedString:
+      return IsOptionEqual<std::string>(this_offset, that_offset);
+    case OptionType::kTemperature:
+      return IsOptionEqual<Temperature>(this_offset, that_offset);
+    default:
+      return false;
+  }  // End switch
+}
+
+bool OptionTypeInfo::AreEqual(const ConfigOptions& config_options,
+                              const std::string& opt_name,
+                              const void* const this_ptr,
+                              const void* const that_ptr,
+                              std::string* mismatch) const {
+  auto level = GetSanityLevel();
+  if (!config_options.IsCheckEnabled(level)) {
+    return true;  // If the sanity level is not being checked, skip it
+  }
+  if (this_ptr == nullptr || that_ptr == nullptr) {
+    if (this_ptr == that_ptr) {
+      return true;
+    }
+  } else if (equals_func_ != nullptr) {
+    const void* this_addr = GetOffset(this_ptr);
+    const void* that_addr = GetOffset(that_ptr);
+    if (equals_func_(config_options, opt_name, this_addr, that_addr,
+                     mismatch)) {
+      return true;
+    }
+  } else {
+    const void* this_addr = GetOffset(this_ptr);
+    const void* that_addr = GetOffset(that_ptr);
+    if (AreOptionsEqual(type_, this_addr, that_addr)) {
+      return true;
+    } else if (IsConfigurable()) {
+      const auto* this_config = AsRawPointer<Configurable>(this_ptr);
+      const auto* that_config = AsRawPointer<Configurable>(that_ptr);
+      if (this_config == that_config) {
+        return true;
+      } else if (this_config != nullptr && that_config != nullptr) {
+        std::string bad_name;
+        bool matches;
+        if (level < config_options.sanity_level) {
+          ConfigOptions copy = config_options;
+          copy.sanity_level = level;
+          matches = this_config->AreEquivalent(copy, that_config, &bad_name);
+        } else {
+          matches = this_config->AreEquivalent(config_options, that_config,
+                                               &bad_name);
+        }
+        if (!matches) {
+          *mismatch = opt_name + "." + bad_name;
+        }
+        return matches;
+      }
+    }
+  }
+  if (mismatch->empty()) {
+    *mismatch = opt_name;
+  }
+  return false;
+}
+
+bool OptionTypeInfo::TypesAreEqual(
+    const ConfigOptions& config_options,
+    const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+    const void* this_addr, const void* that_addr, std::string* mismatch) {
+  for (const auto& iter : type_map) {
+    const auto& opt_info = iter.second;
+    if (!opt_info.AreEqual(config_options, iter.first, this_addr, that_addr,
+                           mismatch)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool OptionTypeInfo::StructsAreEqual(
+    const ConfigOptions& config_options, const std::string& struct_name,
+    const std::unordered_map<std::string, OptionTypeInfo>* struct_map,
+    const std::string& opt_name, const void* this_addr, const void* that_addr,
+    std::string* mismatch) {
+  assert(struct_map);
+  bool matches = true;
+  std::string result;
+  if (EndsWith(opt_name, struct_name)) {
+    // This option represents the entire struct
+    matches = TypesAreEqual(config_options, *struct_map, this_addr, that_addr,
+                            &result);
+    if (!matches) {
+      *mismatch = struct_name + "." + result;
+      return false;
+    }
+  } else if (StartsWith(opt_name, struct_name + ".")) {
+    // This option represents a nested field in the struct (e.g, struct.field)
+    std::string elem_name;
+    const auto opt_info =
+        Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name);
+    assert(opt_info);
+    if (opt_info == nullptr) {
+      *mismatch = opt_name;
+      matches = false;
+    } else if (!opt_info->AreEqual(config_options, elem_name, this_addr,
+                                   that_addr, &result)) {
+      matches = false;
+      *mismatch = struct_name + "." + result;
+    }
+  } else {
+    // This option represents a field in the struct (e.g. field)
+    std::string elem_name;
+    const auto opt_info = Find(opt_name, *struct_map, &elem_name);
+    assert(opt_info);
+    if (opt_info == nullptr) {
+      *mismatch = struct_name + "." + opt_name;
+      matches = false;
+    } else if (!opt_info->AreEqual(config_options, elem_name, this_addr,
+                                   that_addr, &result)) {
+      matches = false;
+      *mismatch = struct_name + "." + result;
+    }
+  }
+  return matches;
+}
+
+bool MatchesOptionsTypeFromMap(
+    const ConfigOptions& config_options,
+    const std::unordered_map<std::string, OptionTypeInfo>& type_map,
+    const void* const this_ptr, const void* const that_ptr,
+    std::string* mismatch) {
+  for (auto& pair : type_map) {
+    // We skip checking deprecated variables as they might
+    // contain random values since they might not be initialized
+    if (config_options.IsCheckEnabled(pair.second.GetSanityLevel())) {
+      if (!pair.second.AreEqual(config_options, pair.first, this_ptr, that_ptr,
+                                mismatch) &&
+          !pair.second.AreEqualByName(config_options, pair.first, this_ptr,
+                                      that_ptr)) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool OptionTypeInfo::AreEqualByName(const ConfigOptions& config_options,
+                                    const std::string& opt_name,
+                                    const void* const this_ptr,
+                                    const void* const that_ptr) const {
+  if (IsByName()) {
+    std::string that_value;
+    if (Serialize(config_options, opt_name, that_ptr, &that_value).ok()) {
+      return AreEqualByName(config_options, opt_name, this_ptr, that_value);
+    }
+  }
+  return false;
+}
+
+bool OptionTypeInfo::AreEqualByName(const ConfigOptions& config_options,
+                                    const std::string& opt_name,
+                                    const void* const opt_ptr,
+                                    const std::string& that_value) const {
+  std::string this_value;
+  if (!IsByName()) {
+    return false;
+  } else if (!Serialize(config_options, opt_name, opt_ptr, &this_value).ok()) {
+    return false;
+  } else if (IsEnabled(OptionVerificationType::kByNameAllowFromNull)) {
+    if (that_value == kNullptrString) {
+      return true;
+    }
+  } else if (IsEnabled(OptionVerificationType::kByNameAllowNull)) {
+    if (that_value == kNullptrString) {
+      return true;
+    }
+  }
+  return (this_value == that_value);
+}
+
+Status OptionTypeInfo::Prepare(const ConfigOptions& config_options,
+                               const std::string& name, void* opt_ptr) const {
+  if (ShouldPrepare()) {
+    if (prepare_func_ != nullptr) {
+      void* opt_addr = GetOffset(opt_ptr);
+      return prepare_func_(config_options, name, opt_addr);
+    } else if (IsConfigurable()) {
+      Configurable* config = AsRawPointer<Configurable>(opt_ptr);
+      if (config != nullptr) {
+        return config->PrepareOptions(config_options);
+      } else if (!CanBeNull()) {
+        return Status::NotFound("Missing configurable object", name);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status OptionTypeInfo::Validate(const DBOptions& db_opts,
+                                const ColumnFamilyOptions& cf_opts,
+                                const std::string& name,
+                                const void* opt_ptr) const {
+  if (ShouldValidate()) {
+    if (validate_func_ != nullptr) {
+      const void* opt_addr = GetOffset(opt_ptr);
+      return validate_func_(db_opts, cf_opts, name, opt_addr);
+    } else if (IsConfigurable()) {
+      const Configurable* config = AsRawPointer<Configurable>(opt_ptr);
+      if (config != nullptr) {
+        return config->ValidateOptions(db_opts, cf_opts);
+      } else if (!CanBeNull()) {
+        return Status::NotFound("Missing configurable object", name);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+const OptionTypeInfo* OptionTypeInfo::Find(
+    const std::string& opt_name,
+    const std::unordered_map<std::string, OptionTypeInfo>& opt_map,
+    std::string* elem_name) {
+  const auto iter = opt_map.find(opt_name);  // Look up the value in the map
+  if (iter != opt_map.end()) {               // Found the option in the map
+    *elem_name = opt_name;                   // Return the name
+    return &(iter->second);  // Return the contents of the iterator
+  } else {
+    auto idx = opt_name.find(".");              // Look for a separator
+    if (idx > 0 && idx != std::string::npos) {  // We found a separator
+      auto siter =
+          opt_map.find(opt_name.substr(0, idx));  // Look for the short name
+      if (siter != opt_map.end()) {               // We found the short name
+        if (siter->second.IsStruct() ||           // If the object is a struct
+            siter->second.IsConfigurable()) {     // or a Configurable
+          *elem_name = opt_name.substr(idx + 1);  // Return the rest
+          return &(siter->second);  // Return the contents of the iterator
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/options_helper.h b/src/rocksdb/options/options_helper.h
new file mode 100644
index 000000000..7c751fc25
--- /dev/null
+++ b/src/rocksdb/options/options_helper.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ColumnFamilyOptions;
+struct ConfigOptions;
+struct DBOptions;
+struct ImmutableCFOptions;
+struct ImmutableDBOptions;
+struct MutableDBOptions;
+struct MutableCFOptions;
+struct Options;
+
+std::vector<CompressionType> GetSupportedCompressions();
+
+std::vector<CompressionType> GetSupportedDictCompressions();
+
+std::vector<ChecksumType> GetSupportedChecksums();
+
+inline bool IsSupportedChecksumType(ChecksumType type) {
+  // Avoid annoying compiler warning-as-error (-Werror=type-limits)
+  auto min = kNoChecksum;
+  auto max = kXXH3;
+  return type >= min && type <= max;
+}
+
+// Checks that the combination of DBOptions and ColumnFamilyOptions are valid
+Status ValidateOptions(const DBOptions& db_opts,
+                       const ColumnFamilyOptions& cf_opts);
+
+DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
+                         const MutableDBOptions& mutable_db_options);
+
+ColumnFamilyOptions BuildColumnFamilyOptions(
+    const ColumnFamilyOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options);
+
+void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
+                               ColumnFamilyOptions* cf_opts);
+void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
+                               ColumnFamilyOptions* cf_opts);
+
+#ifndef ROCKSDB_LITE
+std::unique_ptr<Configurable> DBOptionsAsConfigurable(
+    const MutableDBOptions& opts);
+std::unique_ptr<Configurable> DBOptionsAsConfigurable(
+    const DBOptions& opts,
+    const std::unordered_map<std::string, std::string>* opt_map = nullptr);
+std::unique_ptr<Configurable> CFOptionsAsConfigurable(
+    const MutableCFOptions& opts);
+std::unique_ptr<Configurable> CFOptionsAsConfigurable(
+    const ColumnFamilyOptions& opts,
+    const std::unordered_map<std::string, std::string>* opt_map = nullptr);
+
+extern Status StringToMap(
+    const std::string& opts_str,
+    std::unordered_map<std::string, std::string>* opts_map);
+#endif  // !ROCKSDB_LITE
+
+struct OptionsHelper {
+  static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/;
+  static const std::string kDBOptionsName /*= "DBOptions" */;
+  static std::map<CompactionStyle, std::string> compaction_style_to_string;
+  static std::map<CompactionPri, std::string> compaction_pri_to_string;
+  static std::map<CompactionStopStyle, std::string>
+      compaction_stop_style_to_string;
+  static std::map<Temperature, std::string> temperature_to_string;
+  static std::unordered_map<std::string, ChecksumType> checksum_type_string_map;
+  static std::unordered_map<std::string, CompressionType>
+      compression_type_string_map;
+  static std::unordered_map<std::string, PrepopulateBlobCache>
+      prepopulate_blob_cache_string_map;
+#ifndef ROCKSDB_LITE
+  static std::unordered_map<std::string, CompactionStopStyle>
+      compaction_stop_style_string_map;
+  static std::unordered_map<std::string, EncodingType> encoding_type_string_map;
+  static std::unordered_map<std::string, CompactionStyle>
+      compaction_style_string_map;
+  static std::unordered_map<std::string, CompactionPri>
+      compaction_pri_string_map;
+  static std::unordered_map<std::string, Temperature> temperature_string_map;
+#endif  // !ROCKSDB_LITE
+};
+
+// Some aliasing
+static auto& compaction_style_to_string =
+    OptionsHelper::compaction_style_to_string;
+static auto& compaction_pri_to_string = OptionsHelper::compaction_pri_to_string;
+static auto& compaction_stop_style_to_string =
+    OptionsHelper::compaction_stop_style_to_string;
+static auto& temperature_to_string = OptionsHelper::temperature_to_string;
+static auto& checksum_type_string_map = OptionsHelper::checksum_type_string_map;
+#ifndef ROCKSDB_LITE
+static auto& compaction_stop_style_string_map =
+    OptionsHelper::compaction_stop_style_string_map;
+static auto& compression_type_string_map =
+    OptionsHelper::compression_type_string_map;
+static auto& encoding_type_string_map = OptionsHelper::encoding_type_string_map;
+static auto& compaction_style_string_map =
+    OptionsHelper::compaction_style_string_map;
+static auto& compaction_pri_string_map =
+    OptionsHelper::compaction_pri_string_map;
+static auto& temperature_string_map = OptionsHelper::temperature_string_map;
+static auto& prepopulate_blob_cache_string_map =
+    OptionsHelper::prepopulate_blob_cache_string_map;
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/options_parser.cc b/src/rocksdb/options/options_parser.cc
new file mode 100644
index 000000000..562a7b214
--- /dev/null
+++ b/src/rocksdb/options/options_parser.cc
@@ -0,0 +1,727 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "options/options_parser.h"
+
+#include <cmath>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "file/line_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/options_type.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const std::string option_file_header =
+    "# This is a RocksDB option file.\n"
+    "#\n"
+    "# For detailed file format spec, please refer to the example file\n"
+    "# in examples/rocksdb_option_file_example.ini\n"
+    "#\n"
+    "\n";
+
+Status PersistRocksDBOptions(const DBOptions& db_opt,
+                             const std::vector<std::string>& cf_names,
+                             const std::vector<ColumnFamilyOptions>& cf_opts,
+                             const std::string& file_name, FileSystem* fs) {
+  ConfigOptions
+      config_options;  // Use default for escaped(true) and check (exact)
+  config_options.delimiter = "\n  ";
+  // Do not invoke PrepareOptions when we are doing validation.
+  config_options.invoke_prepare_options = false;
+  // If a readahead size was set in the input options, use it
+  if (db_opt.log_readahead_size > 0) {
+    config_options.file_readahead_size = db_opt.log_readahead_size;
+  }
+  return PersistRocksDBOptions(config_options, db_opt, cf_names, cf_opts,
+                               file_name, fs);
+}
+
+Status PersistRocksDBOptions(const ConfigOptions& config_options_in,
+                             const DBOptions& db_opt,
+                             const std::vector<std::string>& cf_names,
+                             const std::vector<ColumnFamilyOptions>& cf_opts,
+                             const std::string& file_name, FileSystem* fs) {
+  ConfigOptions config_options = config_options_in;
+  config_options.delimiter = "\n  ";  // Override the default to nl
+
+  TEST_SYNC_POINT("PersistRocksDBOptions:start");
+  if (cf_names.size() != cf_opts.size()) {
+    return Status::InvalidArgument(
+        "cf_names.size() and cf_opts.size() must be the same");
+  }
+  std::unique_ptr<FSWritableFile> wf;
+
+  Status s =
+      fs->NewWritableFile(file_name, FileOptions(), &wf, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  std::unique_ptr<WritableFileWriter> writable;
+  writable.reset(new WritableFileWriter(std::move(wf), file_name, EnvOptions(),
+                                        nullptr /* statistics */));
+
+  std::string options_file_content;
+
+  s = writable->Append(
+      option_file_header + "[" + opt_section_titles[kOptionSectionVersion] +
+      "]\n"
+      "  rocksdb_version=" +
+      std::to_string(ROCKSDB_MAJOR) + "." + std::to_string(ROCKSDB_MINOR) +
+      "." + std::to_string(ROCKSDB_PATCH) + "\n");
+  if (s.ok()) {
+    s = writable->Append(
+        "  options_file_version=" + std::to_string(ROCKSDB_OPTION_FILE_MAJOR) +
+        "." + std::to_string(ROCKSDB_OPTION_FILE_MINOR) + "\n");
+  }
+  if (s.ok()) {
+    s = writable->Append("\n[" + opt_section_titles[kOptionSectionDBOptions] +
+                         "]\n  ");
+  }
+
+  if (s.ok()) {
+    s = GetStringFromDBOptions(config_options, db_opt, &options_file_content);
+  }
+  if (s.ok()) {
+    s = writable->Append(options_file_content + "\n");
+  }
+
+  for (size_t i = 0; s.ok() && i < cf_opts.size(); ++i) {
+    // CFOptions section
+    s = writable->Append("\n[" + opt_section_titles[kOptionSectionCFOptions] +
+                         " \"" + EscapeOptionString(cf_names[i]) + "\"]\n  ");
+    if (s.ok()) {
+      s = GetStringFromColumnFamilyOptions(config_options, cf_opts[i],
+                                           &options_file_content);
+    }
+    if (s.ok()) {
+      s = writable->Append(options_file_content + "\n");
+    }
+    // TableOptions section
+    auto* tf = cf_opts[i].table_factory.get();
+    if (tf != nullptr) {
+      if (s.ok()) {
+        s = writable->Append(
+            "[" + opt_section_titles[kOptionSectionTableOptions] + tf->Name() +
+            " \"" + EscapeOptionString(cf_names[i]) + "\"]\n  ");
+      }
+      if (s.ok()) {
+        options_file_content.clear();
+        s = tf->GetOptionString(config_options, &options_file_content);
+      }
+      if (s.ok()) {
+        s = writable->Append(options_file_content + "\n");
+      }
+    }
+  }
+  if (s.ok()) {
+    s = writable->Sync(true /* use_fsync */);
+  }
+  if (s.ok()) {
+    s = writable->Close();
+  }
+  if (s.ok()) {
+    return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+        config_options, db_opt, cf_names, cf_opts, file_name, fs);
+  }
+  return s;
+}
+
+RocksDBOptionsParser::RocksDBOptionsParser() { Reset(); }
+
+void RocksDBOptionsParser::Reset() {
+  db_opt_ = DBOptions();
+  db_opt_map_.clear();
+  cf_names_.clear();
+  cf_opts_.clear();
+  cf_opt_maps_.clear();
+  has_version_section_ = false;
+  has_db_options_ = false;
+  has_default_cf_options_ = false;
+  for (int i = 0; i < 3; ++i) {
+    db_version[i] = 0;
+    opt_file_version[i] = 0;
+  }
+}
+
+bool RocksDBOptionsParser::IsSection(const std::string& line) {
+  if (line.size() < 2) {
+    return false;
+  }
+  if (line[0] != '[' || line[line.size() - 1] != ']') {
+    return false;
+  }
+  return true;
+}
+
+Status RocksDBOptionsParser::ParseSection(OptionSection* section,
+                                          std::string* title,
+                                          std::string* argument,
+                                          const std::string& line,
+                                          const int line_num) {
+  *section = kOptionSectionUnknown;
+  // A section is of the form [<SectionName> "<SectionArg>"], where
+  // "<SectionArg>" is optional.
+  size_t arg_start_pos = line.find("\"");
+  size_t arg_end_pos = line.rfind("\"");
+  // The following if-then check tries to identify whether the input
+  // section has the optional section argument.
+  if (arg_start_pos != std::string::npos && arg_start_pos != arg_end_pos) {
+    *title = TrimAndRemoveComment(line.substr(1, arg_start_pos - 1), true);
+    *argument = UnescapeOptionString(
+        line.substr(arg_start_pos + 1, arg_end_pos - arg_start_pos - 1));
+  } else {
+    *title = TrimAndRemoveComment(line.substr(1, line.size() - 2), true);
+    *argument = "";
+  }
+  for (int i = 0; i < kOptionSectionUnknown; ++i) {
+    if (title->find(opt_section_titles[i]) == 0) {
+      if (i == kOptionSectionVersion || i == kOptionSectionDBOptions ||
+          i == kOptionSectionCFOptions) {
+        if (title->size() == opt_section_titles[i].size()) {
+          // if true, then it indicats equal
+          *section = static_cast<OptionSection>(i);
+          return CheckSection(*section, *argument, line_num);
+        }
+      } else if (i == kOptionSectionTableOptions) {
+        // This type of sections has a sufffix at the end of the
+        // section title
+        if (title->size() > opt_section_titles[i].size()) {
+          *section = static_cast<OptionSection>(i);
+          return CheckSection(*section, *argument, line_num);
+        }
+      }
+    }
+  }
+  return Status::InvalidArgument(std::string("Unknown section ") + line);
+}
+
+Status RocksDBOptionsParser::InvalidArgument(const int line_num,
+                                             const std::string& message) {
+  return Status::InvalidArgument(
+      "[RocksDBOptionsParser Error] ",
+      message + " (at line " + std::to_string(line_num) + ")");
+}
+
+Status RocksDBOptionsParser::ParseStatement(std::string* name,
+                                            std::string* value,
+                                            const std::string& line,
+                                            const int line_num) {
+  size_t eq_pos = line.find("=");
+  if (eq_pos == std::string::npos) {
+    return InvalidArgument(line_num, "A valid statement must have a '='.");
+  }
+
+  *name = TrimAndRemoveComment(line.substr(0, eq_pos), true);
+  *value =
+      TrimAndRemoveComment(line.substr(eq_pos + 1, line.size() - eq_pos - 1));
+  if (name->empty()) {
+    return InvalidArgument(line_num,
+                           "A valid statement must have a variable name.");
+  }
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::Parse(const std::string& file_name, FileSystem* fs,
+                                   bool ignore_unknown_options,
+                                   size_t file_readahead_size) {
+  ConfigOptions
+      config_options;  // Use default for escaped(true) and check (exact)
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  if (file_readahead_size > 0) {
+    config_options.file_readahead_size = file_readahead_size;
+  }
+  return Parse(config_options, file_name, fs);
+}
+
+Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in,
+                                   const std::string& file_name,
+                                   FileSystem* fs) {
+  Reset();
+  ConfigOptions config_options = config_options_in;
+
+  std::unique_ptr<FSSequentialFile> seq_file;
+  Status s = fs->NewSequentialFile(file_name, FileOptions(), &seq_file,
+                                   nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  LineFileReader lf_reader(std::move(seq_file), file_name,
+                           config_options.file_readahead_size);
+
+  OptionSection section = kOptionSectionUnknown;
+  std::string title;
+  std::string argument;
+  std::unordered_map<std::string, std::string> opt_map;
+  std::string line;
+  // we only support single-lined statement.
+  while (lf_reader.ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)) {
+    int line_num = static_cast<int>(lf_reader.GetLineNumber());
+    line = TrimAndRemoveComment(line);
+    if (line.empty()) {
+      continue;
+    }
+    if (IsSection(line)) {
+      s = EndSection(config_options, section, title, argument, opt_map);
+      opt_map.clear();
+      if (!s.ok()) {
+        return s;
+      }
+
+      // If the option file is not generated by a higher minor version,
+      // there shouldn't be any unknown option.
+      if (config_options.ignore_unknown_options &&
+          section == kOptionSectionVersion) {
+        if (db_version[0] < ROCKSDB_MAJOR || (db_version[0] == ROCKSDB_MAJOR &&
+                                              db_version[1] <= ROCKSDB_MINOR)) {
+          config_options.ignore_unknown_options = false;
+        }
+      }
+
+      s = ParseSection(&section, &title, &argument, line, line_num);
+      if (!s.ok()) {
+        return s;
+      }
+    } else {
+      std::string name;
+      std::string value;
+      s = ParseStatement(&name, &value, line, line_num);
+      if (!s.ok()) {
+        return s;
+      }
+      opt_map.insert({name, value});
+    }
+  }
+  s = lf_reader.GetStatus();
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = EndSection(config_options, section, title, argument, opt_map);
+  opt_map.clear();
+  if (!s.ok()) {
+    return s;
+  }
+  return ValidityCheck();
+}
+
+Status RocksDBOptionsParser::CheckSection(const OptionSection section,
+                                          const std::string& section_arg,
+                                          const int line_num) {
+  if (section == kOptionSectionDBOptions) {
+    if (has_db_options_) {
+      return InvalidArgument(
+          line_num,
+          "More than one DBOption section found in the option config file");
+    }
+    has_db_options_ = true;
+  } else if (section == kOptionSectionCFOptions) {
+    bool is_default_cf = (section_arg == kDefaultColumnFamilyName);
+    if (cf_opts_.size() == 0 && !is_default_cf) {
+      return InvalidArgument(
+          line_num,
+          "Default column family must be the first CFOptions section "
+          "in the option config file");
+    } else if (cf_opts_.size() != 0 && is_default_cf) {
+      return InvalidArgument(
+          line_num,
+          "Default column family must be the first CFOptions section "
+          "in the optio/n config file");
+    } else if (GetCFOptions(section_arg) != nullptr) {
+      return InvalidArgument(
+          line_num,
+          "Two identical column families found in option config file");
+    }
+    has_default_cf_options_ |= is_default_cf;
+  } else if (section == kOptionSectionTableOptions) {
+    if (GetCFOptions(section_arg) == nullptr) {
+      return InvalidArgument(
+          line_num, std::string(
+                        "Does not find a matched column family name in "
+                        "TableOptions section.  Column Family Name:") +
+                        section_arg);
+    }
+  } else if (section == kOptionSectionVersion) {
+    if (has_version_section_) {
+      return InvalidArgument(
+          line_num,
+          "More than one Version section found in the option config file.");
+    }
+    has_version_section_ = true;
+  }
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::ParseVersionNumber(const std::string& ver_name,
+                                                const std::string& ver_string,
+                                                const int max_count,
+                                                int* version) {
+  int version_index = 0;
+  int current_number = 0;
+  int current_digit_count = 0;
+  bool has_dot = false;
+  for (int i = 0; i < max_count; ++i) {
+    version[i] = 0;
+  }
+  constexpr int kBufferSize = 200;
+  char buffer[kBufferSize];
+  for (size_t i = 0; i < ver_string.size(); ++i) {
+    if (ver_string[i] == '.') {
+      if (version_index >= max_count - 1) {
+        snprintf(buffer, sizeof(buffer) - 1,
+                 "A valid %s can only contains at most %d dots.",
+                 ver_name.c_str(), max_count - 1);
+        return Status::InvalidArgument(buffer);
+      }
+      if (current_digit_count == 0) {
+        snprintf(buffer, sizeof(buffer) - 1,
+                 "A valid %s must have at least one digit before each dot.",
+                 ver_name.c_str());
+        return Status::InvalidArgument(buffer);
+      }
+      version[version_index++] = current_number;
+      current_number = 0;
+      current_digit_count = 0;
+      has_dot = true;
+    } else if (isdigit(ver_string[i])) {
+      current_number = current_number * 10 + (ver_string[i] - '0');
+      current_digit_count++;
+    } else {
+      snprintf(buffer, sizeof(buffer) - 1,
+               "A valid %s can only contains dots and numbers.",
+               ver_name.c_str());
+      return Status::InvalidArgument(buffer);
+    }
+  }
+  version[version_index] = current_number;
+  if (has_dot && current_digit_count == 0) {
+    snprintf(buffer, sizeof(buffer) - 1,
+             "A valid %s must have at least one digit after each dot.",
+             ver_name.c_str());
+    return Status::InvalidArgument(buffer);
+  }
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::EndSection(
+    const ConfigOptions& config_options, const OptionSection section,
+    const std::string& section_title, const std::string& section_arg,
+    const std::unordered_map<std::string, std::string>& opt_map) {
+  Status s;
+  if (section == kOptionSectionDBOptions) {
+    s = GetDBOptionsFromMap(config_options, DBOptions(), opt_map, &db_opt_);
+    if (!s.ok()) {
+      return s;
+    }
+    db_opt_map_ = opt_map;
+  } else if (section == kOptionSectionCFOptions) {
+    // This condition should be ensured earlier in ParseSection
+    // so we make an assertion here.
+    assert(GetCFOptions(section_arg) == nullptr);
+    cf_names_.emplace_back(section_arg);
+    cf_opts_.emplace_back();
+    s = GetColumnFamilyOptionsFromMap(config_options, ColumnFamilyOptions(),
+                                      opt_map, &cf_opts_.back());
+    if (!s.ok()) {
+      return s;
+    }
+    // keep the parsed string.
+    cf_opt_maps_.emplace_back(opt_map);
+  } else if (section == kOptionSectionTableOptions) {
+    assert(GetCFOptions(section_arg) != nullptr);
+    auto* cf_opt = GetCFOptionsImpl(section_arg);
+    if (cf_opt == nullptr) {
+      return Status::InvalidArgument(
+          "The specified column family must be defined before the "
+          "TableOptions section:",
+          section_arg);
+    }
+    // Ignore error as table factory deserialization is optional
+    cf_opt->table_factory.reset();
+    s = TableFactory::CreateFromString(
+        config_options,
+        section_title.substr(
+            opt_section_titles[kOptionSectionTableOptions].size()),
+        &(cf_opt->table_factory));
+    if (s.ok() && cf_opt->table_factory != nullptr) {
+      s = cf_opt->table_factory->ConfigureFromMap(config_options, opt_map);
+      // Translate any errors (NotFound, NotSupported, to InvalidArgument
+      if (s.ok() || s.IsInvalidArgument()) {
+        return s;
+      } else {
+        return Status::InvalidArgument(s.getState());
+      }
+    } else {
+      // Return OK for not supported table factories as TableFactory
+      // Deserialization is optional.
+      cf_opt->table_factory.reset();
+      return Status::OK();
+    }
+  } else if (section == kOptionSectionVersion) {
+    for (const auto& pair : opt_map) {
+      if (pair.first == "rocksdb_version") {
+        s = ParseVersionNumber(pair.first, pair.second, 3, db_version);
+        if (!s.ok()) {
+          return s;
+        }
+      } else if (pair.first == "options_file_version") {
+        s = ParseVersionNumber(pair.first, pair.second, 2, opt_file_version);
+        if (!s.ok()) {
+          return s;
+        }
+        if (opt_file_version[0] < 1) {
+          return Status::InvalidArgument(
+              "A valid options_file_version must be at least 1.");
+        }
+      }
+    }
+  }
+  return s;
+}
+
+Status RocksDBOptionsParser::ValidityCheck() {
+  if (!has_db_options_) {
+    return Status::Corruption(
+        "A RocksDB Option file must have a single DBOptions section");
+  }
+  if (!has_default_cf_options_) {
+    return Status::Corruption(
+        "A RocksDB Option file must have a single CFOptions:default section");
+  }
+
+  return Status::OK();
+}
+
+std::string RocksDBOptionsParser::TrimAndRemoveComment(const std::string& line,
+                                                       bool trim_only) {
+  size_t start = 0;
+  size_t end = line.size();
+
+  // we only support "#" style comment
+  if (!trim_only) {
+    size_t search_pos = 0;
+    while (search_pos < line.size()) {
+      size_t comment_pos = line.find('#', search_pos);
+      if (comment_pos == std::string::npos) {
+        break;
+      }
+      if (comment_pos == 0 || line[comment_pos - 1] != '\\') {
+        end = comment_pos;
+        break;
+      }
+      search_pos = comment_pos + 1;
+    }
+  }
+
+  while (start < end && isspace(line[start]) != 0) {
+    ++start;
+  }
+
+  // start < end implies end > 0.
+  while (start < end && isspace(line[end - 1]) != 0) {
+    --end;
+  }
+
+  if (start < end) {
+    return line.substr(start, end - start);
+  }
+
+  return "";
+}
+
+Status RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+    const ConfigOptions& config_options_in, const DBOptions& db_opt,
+    const std::vector<std::string>& cf_names,
+    const std::vector<ColumnFamilyOptions>& cf_opts,
+    const std::string& file_name, FileSystem* fs) {
+  RocksDBOptionsParser parser;
+  ConfigOptions config_options = config_options_in;
+  config_options.invoke_prepare_options =
+      false;  // No need to do a prepare for verify
+  if (config_options.sanity_level < ConfigOptions::kSanityLevelExactMatch) {
+    // If we are not doing an exact comparison, we should ignore
+    // unsupported options, as they may cause the Parse to fail
+    // (if the ObjectRegistry is not initialized)
+    config_options.ignore_unsupported_options = true;
+  }
+  Status s = parser.Parse(config_options, file_name, fs);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Verify DBOptions
+  s = VerifyDBOptions(config_options, db_opt, *parser.db_opt(),
+                      parser.db_opt_map());
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Verify ColumnFamily Name
+  if (cf_names.size() != parser.cf_names()->size()) {
+    if (config_options.sanity_level >=
+        ConfigOptions::kSanityLevelLooselyCompatible) {
+      return Status::InvalidArgument(
+          "[RocksDBOptionParser Error] The persisted options does not have "
+          "the same number of column family names as the db instance.");
+    } else if (cf_opts.size() > parser.cf_opts()->size()) {
+      return Status::InvalidArgument(
+          "[RocksDBOptionsParser Error]",
+          "The persisted options file has less number of column family "
+          "names than that of the specified one.");
+    }
+  }
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    if (cf_names[i] != parser.cf_names()->at(i)) {
+      return Status::InvalidArgument(
+          "[RocksDBOptionParser Error] The persisted options and the db"
+          "instance does not have the same name for column family ",
+          std::to_string(i));
+    }
+  }
+
+  // Verify Column Family Options
+  if (cf_opts.size() != parser.cf_opts()->size()) {
+    if (config_options.sanity_level >=
+        ConfigOptions::kSanityLevelLooselyCompatible) {
+      return Status::InvalidArgument(
+          "[RocksDBOptionsParser Error]",
+          "The persisted options does not have the same number of "
+          "column families as the db instance.");
+    } else if (cf_opts.size() > parser.cf_opts()->size()) {
+      return Status::InvalidArgument(
+          "[RocksDBOptionsParser Error]",
+          "The persisted options file has less number of column families "
+          "than that of the specified number.");
+    }
+  }
+  for (size_t i = 0; i < cf_opts.size(); ++i) {
+    s = VerifyCFOptions(config_options, cf_opts[i], parser.cf_opts()->at(i),
+                        &(parser.cf_opt_maps()->at(i)));
+    if (!s.ok()) {
+      return s;
+    }
+    s = VerifyTableFactory(config_options, cf_opts[i].table_factory.get(),
+                           parser.cf_opts()->at(i).table_factory.get());
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::VerifyDBOptions(
+    const ConfigOptions& config_options, const DBOptions& base_opt,
+    const DBOptions& file_opt,
+    const std::unordered_map<std::string, std::string>* opt_map) {
+  auto base_config = DBOptionsAsConfigurable(base_opt, opt_map);
+  auto file_config = DBOptionsAsConfigurable(file_opt, opt_map);
+  std::string mismatch;
+  if (!base_config->AreEquivalent(config_options, file_config.get(),
+                                  &mismatch)) {
+    const size_t kBufferSize = 2048;
+    char buffer[kBufferSize];
+    std::string base_value;
+    std::string file_value;
+    int offset = snprintf(buffer, sizeof(buffer),
+                          "[RocksDBOptionsParser]: "
+                          "failed the verification on DBOptions::%s -- ",
+                          mismatch.c_str());
+    Status s = base_config->GetOption(config_options, mismatch, &base_value);
+    if (s.ok()) {
+      s = file_config->GetOption(config_options, mismatch, &file_value);
+    }
+    assert(offset >= 0);
+    assert(static_cast<size_t>(offset) < sizeof(buffer));
+    if (s.ok()) {
+      snprintf(buffer + offset, sizeof(buffer) - static_cast<size_t>(offset),
+               "-- The specified one is %s while the persisted one is %s.\n",
+               base_value.c_str(), file_value.c_str());
+    } else {
+      snprintf(buffer + offset, sizeof(buffer) - static_cast<size_t>(offset),
+               "-- Unable to re-serialize an option: %s.\n",
+               s.ToString().c_str());
+    }
+    return Status::InvalidArgument(Slice(buffer, strlen(buffer)));
+  }
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::VerifyCFOptions(
+    const ConfigOptions& config_options, const ColumnFamilyOptions& base_opt,
+    const ColumnFamilyOptions& file_opt,
+    const std::unordered_map<std::string, std::string>* opt_map) {
+  auto base_config = CFOptionsAsConfigurable(base_opt, opt_map);
+  auto file_config = CFOptionsAsConfigurable(file_opt, opt_map);
+  std::string mismatch;
+  if (!base_config->AreEquivalent(config_options, file_config.get(),
+                                  &mismatch)) {
+    std::string base_value;
+    std::string file_value;
+    // The options do not match
+    const size_t kBufferSize = 2048;
+    char buffer[kBufferSize];
+    Status s = base_config->GetOption(config_options, mismatch, &base_value);
+    if (s.ok()) {
+      s = file_config->GetOption(config_options, mismatch, &file_value);
+    }
+    int offset = snprintf(buffer, sizeof(buffer),
+                          "[RocksDBOptionsParser]: "
+                          "failed the verification on ColumnFamilyOptions::%s",
+                          mismatch.c_str());
+    assert(offset >= 0);
+    assert(static_cast<size_t>(offset) < sizeof(buffer));
+    if (s.ok()) {
+      snprintf(buffer + offset, sizeof(buffer) - static_cast<size_t>(offset),
+               "--- The specified one is %s while the persisted one is %s.\n",
+               base_value.c_str(), file_value.c_str());
+    } else {
+      snprintf(buffer + offset, sizeof(buffer) - static_cast<size_t>(offset),
+               "--- Unable to re-serialize an option: %s.\n",
+               s.ToString().c_str());
+    }
+    return Status::InvalidArgument(Slice(buffer, sizeof(buffer)));
+  }  // For each option
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::VerifyTableFactory(
+    const ConfigOptions& config_options, const TableFactory* base_tf,
+    const TableFactory* file_tf) {
+  std::string mismatch;
+  if (base_tf && file_tf) {
+    if (config_options.sanity_level > ConfigOptions::kSanityLevelNone &&
+        std::string(base_tf->Name()) != std::string(file_tf->Name())) {
+      return Status::Corruption(
+          "[RocksDBOptionsParser]: "
+          "failed the verification on TableFactory->Name()");
+    } else if (!base_tf->AreEquivalent(config_options, file_tf, &mismatch)) {
+      return Status::Corruption(std::string("[RocksDBOptionsParser]:"
+                                            "failed the verification on ") +
+                                    base_tf->Name() + "::",
+                                mismatch);
+    }
+  } else {
+    // TODO(yhchiang): further support sanity check here
+  }
+  return Status::OK();
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/options/options_parser.h b/src/rocksdb/options/options_parser.h
new file mode 100644
index 000000000..20e3d772d
--- /dev/null
+++ b/src/rocksdb/options/options_parser.h
@@ -0,0 +1,151 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+struct ConfigOptions;
+class OptionTypeInfo;
+class TableFactory;
+
+#define ROCKSDB_OPTION_FILE_MAJOR 1
+#define ROCKSDB_OPTION_FILE_MINOR 1
+
+enum OptionSection : char {
+  kOptionSectionVersion = 0,
+  kOptionSectionDBOptions,
+  kOptionSectionCFOptions,
+  kOptionSectionTableOptions,
+  kOptionSectionUnknown
+};
+
+static const std::string opt_section_titles[] = {
+    "Version", "DBOptions", "CFOptions", "TableOptions/", "Unknown"};
+
+Status PersistRocksDBOptions(const DBOptions& db_opt,
+                             const std::vector<std::string>& cf_names,
+                             const std::vector<ColumnFamilyOptions>& cf_opts,
+                             const std::string& file_name, FileSystem* fs);
+Status PersistRocksDBOptions(const ConfigOptions& config_options,
+                             const DBOptions& db_opt,
+                             const std::vector<std::string>& cf_names,
+                             const std::vector<ColumnFamilyOptions>& cf_opts,
+                             const std::string& file_name, FileSystem* fs);
+
+class RocksDBOptionsParser {
+ public:
+  explicit RocksDBOptionsParser();
+  ~RocksDBOptionsParser() {}
+  void Reset();
+
+  // `file_readahead_size` is used for readahead for the option file.
+  // If 0 is given, a default value will be used.
+  Status Parse(const std::string& file_name, FileSystem* fs,
+               bool ignore_unknown_options, size_t file_readahead_size);
+
+  Status Parse(const ConfigOptions& config_options,
+               const std::string& file_name, FileSystem* fs);
+
+  static std::string TrimAndRemoveComment(const std::string& line,
+                                          const bool trim_only = false);
+
+  const DBOptions* db_opt() const { return &db_opt_; }
+  const std::unordered_map<std::string, std::string>* db_opt_map() const {
+    return &db_opt_map_;
+  }
+  const std::vector<ColumnFamilyOptions>* cf_opts() const { return &cf_opts_; }
+  const std::vector<std::string>* cf_names() const { return &cf_names_; }
+  const std::vector<std::unordered_map<std::string, std::string>>* cf_opt_maps()
+      const {
+    return &cf_opt_maps_;
+  }
+
+  const ColumnFamilyOptions* GetCFOptions(const std::string& name) {
+    return GetCFOptionsImpl(name);
+  }
+  size_t NumColumnFamilies() { return cf_opts_.size(); }
+  static Status VerifyRocksDBOptionsFromFile(
+      const ConfigOptions& config_options, const DBOptions& db_opt,
+      const std::vector<std::string>& cf_names,
+      const std::vector<ColumnFamilyOptions>& cf_opts,
+      const std::string& file_name, FileSystem* fs);
+  static Status VerifyDBOptions(
+      const ConfigOptions& config_options, const DBOptions& base_opt,
+      const DBOptions& new_opt,
+      const std::unordered_map<std::string, std::string>* new_opt_map =
+          nullptr);
+
+  static Status VerifyCFOptions(
+      const ConfigOptions& config_options, const ColumnFamilyOptions& base_opt,
+      const ColumnFamilyOptions& new_opt,
+      const std::unordered_map<std::string, std::string>* new_opt_map =
+          nullptr);
+
+  static Status VerifyTableFactory(const ConfigOptions& config_options,
+                                   const TableFactory* base_tf,
+                                   const TableFactory* file_tf);
+
+  static Status ExtraParserCheck(const RocksDBOptionsParser& input_parser);
+
+  static Status ParseStatement(std::string* name, std::string* value,
+                               const std::string& line, const int line_num);
+
+ protected:
+  bool IsSection(const std::string& line);
+  Status ParseSection(OptionSection* section, std::string* title,
+                      std::string* argument, const std::string& line,
+                      const int line_num);
+
+  Status CheckSection(const OptionSection section,
+                      const std::string& section_arg, const int line_num);
+
+  Status EndSection(
+      const ConfigOptions& config_options, const OptionSection section,
+      const std::string& title, const std::string& section_arg,
+      const std::unordered_map<std::string, std::string>& opt_map);
+
+  Status ValidityCheck();
+
+  static Status InvalidArgument(const int line_num, const std::string& message);
+
+  Status ParseVersionNumber(const std::string& ver_name,
+                            const std::string& ver_string, const int max_count,
+                            int* version);
+
+  ColumnFamilyOptions* GetCFOptionsImpl(const std::string& name) {
+    assert(cf_names_.size() == cf_opts_.size());
+    for (size_t i = 0; i < cf_names_.size(); ++i) {
+      if (cf_names_[i] == name) {
+        return &cf_opts_[i];
+      }
+    }
+    return nullptr;
+  }
+
+ private:
+  DBOptions db_opt_;
+  std::unordered_map<std::string, std::string> db_opt_map_;
+  std::vector<std::string> cf_names_;
+  std::vector<ColumnFamilyOptions> cf_opts_;
+  std::vector<std::unordered_map<std::string, std::string>> cf_opt_maps_;
+  bool has_version_section_;
+  bool has_db_options_;
+  bool has_default_cf_options_;
+  int db_version[3];
+  int opt_file_version[3];
+};
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/options/options_settable_test.cc b/src/rocksdb/options/options_settable_test.cc
new file mode 100644
index 000000000..63e9721ca
--- /dev/null
+++ b/src/rocksdb/options/options_settable_test.cc
@@ -0,0 +1,621 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "options/options_helper.h"
+#include "rocksdb/convenience.h"
+#include "test_util/testharness.h"
+
+#ifndef GFLAGS
+bool FLAGS_enable_print = false;
+#else
+#include "util/gflags_compat.h"
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+#endif  // GFLAGS
+
+namespace ROCKSDB_NAMESPACE {
+
+// Verify options are settable from options strings.
+// We take the approach that depends on compiler behavior that copy constructor
+// won't touch implicit padding bytes, so that the test is fragile.
+// As a result, we only run the tests to verify new fields in options are
+// settable through string on limited platforms as it depends on behavior of
+// compilers.
+#ifndef ROCKSDB_LITE
+#if defined OS_LINUX || defined OS_WIN
+#ifndef __clang__
+#ifndef ROCKSDB_UBSAN_RUN
+
+class OptionsSettableTest : public testing::Test {
+ public:
+  OptionsSettableTest() {}
+};
+
+const char kSpecialChar = 'z';
+using OffsetGap = std::vector<std::pair<size_t, size_t>>;
+
+void FillWithSpecialChar(char* start_ptr, size_t total_size,
+                         const OffsetGap& excluded,
+                         char special_char = kSpecialChar) {
+  size_t offset = 0;
+  // The excluded vector contains pairs of bytes, (first, second).
+  // The first bytes are all set to the special char (represented as 'c' below).
+  // The second bytes are simply skipped (padding bytes).
+  // ccccc[skipped]cccccccc[skiped]cccccccc[skipped]
+  for (auto& pair : excluded) {
+    std::memset(start_ptr + offset, special_char, pair.first - offset);
+    offset = pair.first + pair.second;
+  }
+  // The rest of the structure is filled with the special characters.
+  // ccccc[skipped]cccccccc[skiped]cccccccc[skipped]cccccccccccccccc
+  std::memset(start_ptr + offset, special_char, total_size - offset);
+}
+
+int NumUnsetBytes(char* start_ptr, size_t total_size,
+                  const OffsetGap& excluded) {
+  int total_unset_bytes_base = 0;
+  size_t offset = 0;
+  for (auto& pair : excluded) {
+    // The first part of the structure contains memory spaces that can be
+    // set (pair.first), and memory spaces that cannot be set (pair.second).
+    // Therefore total_unset_bytes_base only agregates bytes set to kSpecialChar
+    // in the pair.first bytes, but skips the pair.second bytes (padding bytes).
+    for (char* ptr = start_ptr + offset; ptr < start_ptr + pair.first; ptr++) {
+      if (*ptr == kSpecialChar) {
+        total_unset_bytes_base++;
+      }
+    }
+    offset = pair.first + pair.second;
+  }
+  // Then total_unset_bytes_base aggregates the bytes
+  // set to kSpecialChar in the rest of the structure
+  for (char* ptr = start_ptr + offset; ptr < start_ptr + total_size; ptr++) {
+    if (*ptr == kSpecialChar) {
+      total_unset_bytes_base++;
+    }
+  }
+  return total_unset_bytes_base;
+}
+
+// Return true iff two structs are the same except excluded fields.
+bool CompareBytes(char* start_ptr1, char* start_ptr2, size_t total_size,
+                  const OffsetGap& excluded) {
+  size_t offset = 0;
+  for (auto& pair : excluded) {
+    for (; offset < pair.first; offset++) {
+      if (*(start_ptr1 + offset) != *(start_ptr2 + offset)) {
+        return false;
+      }
+    }
+    offset = pair.first + pair.second;
+  }
+  for (; offset < total_size; offset++) {
+    if (*(start_ptr1 + offset) != *(start_ptr2 + offset)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// If the test fails, likely a new option is added to BlockBasedTableOptions
+// but it cannot be set through GetBlockBasedTableOptionsFromString(), or the
+// test is not updated accordingly.
+// After adding an option, we need to make sure it is settable by
+// GetBlockBasedTableOptionsFromString() and add the option to the input string
+// passed to the GetBlockBasedTableOptionsFromString() in this test.
+// If it is a complicated type, you also need to add the field to
+// kBbtoExcluded, and maybe add customized verification for it.
+TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
+  // Items in the form of <offset, size>. Need to be in ascending order
+  // and not overlapping. Need to update if new option to be excluded is added
+  // (e.g, pointer-type)
+  const OffsetGap kBbtoExcluded = {
+      {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory),
+       sizeof(std::shared_ptr<FlushBlockPolicyFactory>)},
+      {offsetof(struct BlockBasedTableOptions, block_cache),
+       sizeof(std::shared_ptr<Cache>)},
+      {offsetof(struct BlockBasedTableOptions, persistent_cache),
+       sizeof(std::shared_ptr<PersistentCache>)},
+      {offsetof(struct BlockBasedTableOptions, block_cache_compressed),
+       sizeof(std::shared_ptr<Cache>)},
+      {offsetof(struct BlockBasedTableOptions, cache_usage_options),
+       sizeof(CacheUsageOptions)},
+      {offsetof(struct BlockBasedTableOptions, filter_policy),
+       sizeof(std::shared_ptr<const FilterPolicy>)},
+  };
+
+  // In this test, we catch a new option of BlockBasedTableOptions that is not
+  // settable through GetBlockBasedTableOptionsFromString().
+  // We count padding bytes of the option struct, and assert it to be the same
+  // as unset bytes of an option struct initialized by
+  // GetBlockBasedTableOptionsFromString().
+
+  char* bbto_ptr = new char[sizeof(BlockBasedTableOptions)];
+
+  // Count padding bytes by setting all bytes in the memory to a special char,
+  // copy a well constructed struct to this memory and see how many special
+  // bytes left.
+  BlockBasedTableOptions* bbto = new (bbto_ptr) BlockBasedTableOptions();
+  FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded);
+  // It based on the behavior of compiler that padding bytes are not changed
+  // when copying the struct. It's prone to failure when compiler behavior
+  // changes. We verify there is unset bytes to detect the case.
+  *bbto = BlockBasedTableOptions();
+  int unset_bytes_base =
+      NumUnsetBytes(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded);
+  ASSERT_GT(unset_bytes_base, 0);
+  bbto->~BlockBasedTableOptions();
+
+  // Construct the base option passed into
+  // GetBlockBasedTableOptionsFromString().
+  bbto = new (bbto_ptr) BlockBasedTableOptions();
+  FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded);
+  // This option is not setable:
+  bbto->use_delta_encoding = true;
+
+  char* new_bbto_ptr = new char[sizeof(BlockBasedTableOptions)];
+  BlockBasedTableOptions* new_bbto =
+      new (new_bbto_ptr) BlockBasedTableOptions();
+  FillWithSpecialChar(new_bbto_ptr, sizeof(BlockBasedTableOptions),
+                      kBbtoExcluded);
+
+  // Need to update the option string if a new option is added.
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      *bbto,
+      "cache_index_and_filter_blocks=1;"
+      "cache_index_and_filter_blocks_with_high_priority=true;"
+      "metadata_cache_options={top_level_index_pinning=kFallback;"
+      "partition_pinning=kAll;"
+      "unpartitioned_pinning=kFlushedAndSimilar;};"
+      "pin_l0_filter_and_index_blocks_in_cache=1;"
+      "pin_top_level_index_and_filter=1;"
+      "index_type=kHashSearch;"
+      "data_block_index_type=kDataBlockBinaryAndHash;"
+      "index_shortening=kNoShortening;"
+      "data_block_hash_table_util_ratio=0.75;"
+      "checksum=kxxHash;no_block_cache=1;"
+      "block_cache=1M;block_cache_compressed=1k;block_size=1024;"
+      "block_size_deviation=8;block_restart_interval=4; "
+      "metadata_block_size=1024;"
+      "partition_filters=false;"
+      "optimize_filters_for_memory=true;"
+      "index_block_restart_interval=4;"
+      "filter_policy=bloomfilter:4:true;whole_key_filtering=1;detect_filter_"
+      "construct_corruption=false;"
+      "format_version=1;"
+      "verify_compression=true;read_amp_bytes_per_bit=0;"
+      "enable_index_compression=false;"
+      "block_align=true;"
+      "max_auto_readahead_size=0;"
+      "prepopulate_block_cache=kDisable;"
+      "initial_auto_readahead_size=0;"
+      "num_file_reads_for_auto_readahead=0",
+      new_bbto));
+
+  ASSERT_EQ(unset_bytes_base,
+            NumUnsetBytes(new_bbto_ptr, sizeof(BlockBasedTableOptions),
+                          kBbtoExcluded));
+
+  ASSERT_TRUE(new_bbto->block_cache.get() != nullptr);
+  ASSERT_TRUE(new_bbto->block_cache_compressed.get() != nullptr);
+  ASSERT_TRUE(new_bbto->filter_policy.get() != nullptr);
+
+  bbto->~BlockBasedTableOptions();
+  new_bbto->~BlockBasedTableOptions();
+
+  delete[] bbto_ptr;
+  delete[] new_bbto_ptr;
+}
+
+// If the test fails, likely a new option is added to DBOptions
+// but it cannot be set through GetDBOptionsFromString(), or the test is not
+// updated accordingly.
+// After adding an option, we need to make sure it is settable by
+// GetDBOptionsFromString() and add the option to the input string passed to
+// DBOptionsFromString()in this test.
+// If it is a complicated type, you also need to add the field to
+// kDBOptionsExcluded, and maybe add customized verification for it.
+TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
+  const OffsetGap kDBOptionsExcluded = {
+      {offsetof(struct DBOptions, env), sizeof(Env*)},
+      {offsetof(struct DBOptions, rate_limiter),
+       sizeof(std::shared_ptr<RateLimiter>)},
+      {offsetof(struct DBOptions, sst_file_manager),
+       sizeof(std::shared_ptr<SstFileManager>)},
+      {offsetof(struct DBOptions, info_log), sizeof(std::shared_ptr<Logger>)},
+      {offsetof(struct DBOptions, statistics),
+       sizeof(std::shared_ptr<Statistics>)},
+      {offsetof(struct DBOptions, db_paths), sizeof(std::vector<DbPath>)},
+      {offsetof(struct DBOptions, db_log_dir), sizeof(std::string)},
+      {offsetof(struct DBOptions, wal_dir), sizeof(std::string)},
+      {offsetof(struct DBOptions, write_buffer_manager),
+       sizeof(std::shared_ptr<WriteBufferManager>)},
+      {offsetof(struct DBOptions, listeners),
+       sizeof(std::vector<std::shared_ptr<EventListener>>)},
+      {offsetof(struct DBOptions, row_cache), sizeof(std::shared_ptr<Cache>)},
+      {offsetof(struct DBOptions, wal_filter), sizeof(const WalFilter*)},
+      {offsetof(struct DBOptions, file_checksum_gen_factory),
+       sizeof(std::shared_ptr<FileChecksumGenFactory>)},
+      {offsetof(struct DBOptions, db_host_id), sizeof(std::string)},
+      {offsetof(struct DBOptions, checksum_handoff_file_types),
+       sizeof(FileTypeSet)},
+      {offsetof(struct DBOptions, compaction_service),
+       sizeof(std::shared_ptr<CompactionService>)},
+  };
+
+  char* options_ptr = new char[sizeof(DBOptions)];
+
+  // Count padding bytes by setting all bytes in the memory to a special char,
+  // copy a well constructed struct to this memory and see how many special
+  // bytes left.
+  DBOptions* options = new (options_ptr) DBOptions();
+  FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded);
+  // It based on the behavior of compiler that padding bytes are not changed
+  // when copying the struct. It's prone to failure when compiler behavior
+  // changes. We verify there is unset bytes to detect the case.
+  *options = DBOptions();
+  int unset_bytes_base =
+      NumUnsetBytes(options_ptr, sizeof(DBOptions), kDBOptionsExcluded);
+  ASSERT_GT(unset_bytes_base, 0);
+  options->~DBOptions();
+
+  options = new (options_ptr) DBOptions();
+  FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded);
+
+  char* new_options_ptr = new char[sizeof(DBOptions)];
+  DBOptions* new_options = new (new_options_ptr) DBOptions();
+  FillWithSpecialChar(new_options_ptr, sizeof(DBOptions), kDBOptionsExcluded);
+
+  // Need to update the option string if a new option is added.
+  ASSERT_OK(
+      GetDBOptionsFromString(*options,
+                             "wal_bytes_per_sync=4295048118;"
+                             "delete_obsolete_files_period_micros=4294967758;"
+                             "WAL_ttl_seconds=4295008036;"
+                             "WAL_size_limit_MB=4295036161;"
+                             "max_write_batch_group_size_bytes=1048576;"
+                             "wal_dir=path/to/wal_dir;"
+                             "db_write_buffer_size=2587;"
+                             "max_subcompactions=64330;"
+                             "table_cache_numshardbits=28;"
+                             "max_open_files=72;"
+                             "max_file_opening_threads=35;"
+                             "max_background_jobs=8;"
+                             "max_background_compactions=33;"
+                             "use_fsync=true;"
+                             "use_adaptive_mutex=false;"
+                             "max_total_wal_size=4295005604;"
+                             "compaction_readahead_size=0;"
+                             "keep_log_file_num=4890;"
+                             "skip_stats_update_on_db_open=false;"
+                             "skip_checking_sst_file_sizes_on_db_open=false;"
+                             "max_manifest_file_size=4295009941;"
+                             "db_log_dir=path/to/db_log_dir;"
+                             "writable_file_max_buffer_size=1048576;"
+                             "paranoid_checks=true;"
+                             "flush_verify_memtable_count=true;"
+                             "track_and_verify_wals_in_manifest=true;"
+                             "verify_sst_unique_id_in_manifest=true;"
+                             "is_fd_close_on_exec=false;"
+                             "bytes_per_sync=4295013613;"
+                             "strict_bytes_per_sync=true;"
+                             "enable_thread_tracking=false;"
+                             "recycle_log_file_num=0;"
+                             "create_missing_column_families=true;"
+                             "log_file_time_to_roll=3097;"
+                             "max_background_flushes=35;"
+                             "create_if_missing=false;"
+                             "error_if_exists=true;"
+                             "delayed_write_rate=4294976214;"
+                             "manifest_preallocation_size=1222;"
+                             "allow_mmap_writes=false;"
+                             "stats_dump_period_sec=70127;"
+                             "stats_persist_period_sec=54321;"
+                             "persist_stats_to_disk=true;"
+                             "stats_history_buffer_size=14159;"
+                             "allow_fallocate=true;"
+                             "allow_mmap_reads=false;"
+                             "use_direct_reads=false;"
+                             "use_direct_io_for_flush_and_compaction=false;"
+                             "max_log_file_size=4607;"
+                             "random_access_max_buffer_size=1048576;"
+                             "advise_random_on_open=true;"
+                             "fail_if_options_file_error=false;"
+                             "enable_pipelined_write=false;"
+                             "unordered_write=false;"
+                             "allow_concurrent_memtable_write=true;"
+                             "wal_recovery_mode=kPointInTimeRecovery;"
+                             "enable_write_thread_adaptive_yield=true;"
+                             "write_thread_slow_yield_usec=5;"
+                             "write_thread_max_yield_usec=1000;"
+                             "access_hint_on_compaction_start=NONE;"
+                             "info_log_level=DEBUG_LEVEL;"
+                             "dump_malloc_stats=false;"
+                             "allow_2pc=false;"
+                             "avoid_flush_during_recovery=false;"
+                             "avoid_flush_during_shutdown=false;"
+                             "allow_ingest_behind=false;"
+                             "concurrent_prepare=false;"
+                             "two_write_queues=false;"
+                             "manual_wal_flush=false;"
+                             "wal_compression=kZSTD;"
+                             "seq_per_batch=false;"
+                             "atomic_flush=false;"
+                             "avoid_unnecessary_blocking_io=false;"
+                             "log_readahead_size=0;"
+                             "write_dbid_to_manifest=false;"
+                             "best_efforts_recovery=false;"
+                             "max_bgerror_resume_count=2;"
+                             "bgerror_resume_retry_interval=1000000;"
+                             "db_host_id=hostname;"
+                             "lowest_used_cache_tier=kNonVolatileBlockTier;"
+                             "allow_data_in_errors=false;"
+                             "enforce_single_del_contracts=false;",
+                             new_options));
+
+  ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),
+                                            kDBOptionsExcluded));
+
+  options->~DBOptions();
+  new_options->~DBOptions();
+
+  delete[] options_ptr;
+  delete[] new_options_ptr;
+}
+
+// If the test fails, likely a new option is added to ColumnFamilyOptions
+// but it cannot be set through GetColumnFamilyOptionsFromString(), or the
+// test is not updated accordingly.
+// After adding an option, we need to make sure it is settable by
+// GetColumnFamilyOptionsFromString() and add the option to the input
+// string passed to GetColumnFamilyOptionsFromString() in this test.
+// If it is a complicated type, you also need to add the field to
+// kColumnFamilyOptionsExcluded, and maybe add customized verification
+// for it.
+TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
+  // options in the excluded set need to appear in the same order as in
+  // ColumnFamilyOptions.
+  const OffsetGap kColumnFamilyOptionsExcluded = {
+      {offsetof(struct ColumnFamilyOptions, inplace_callback),
+       sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))},
+      {offsetof(struct ColumnFamilyOptions,
+                memtable_insert_with_hint_prefix_extractor),
+       sizeof(std::shared_ptr<const SliceTransform>)},
+      {offsetof(struct ColumnFamilyOptions, compression_per_level),
+       sizeof(std::vector<CompressionType>)},
+      {offsetof(struct ColumnFamilyOptions,
+                max_bytes_for_level_multiplier_additional),
+       sizeof(std::vector<int>)},
+      {offsetof(struct ColumnFamilyOptions, memtable_factory),
+       sizeof(std::shared_ptr<MemTableRepFactory>)},
+      {offsetof(struct ColumnFamilyOptions,
+                table_properties_collector_factories),
+       sizeof(ColumnFamilyOptions::TablePropertiesCollectorFactories)},
+      {offsetof(struct ColumnFamilyOptions, preclude_last_level_data_seconds),
+       sizeof(uint64_t)},
+      {offsetof(struct ColumnFamilyOptions, preserve_internal_time_seconds),
+       sizeof(uint64_t)},
+      {offsetof(struct ColumnFamilyOptions, blob_cache),
+       sizeof(std::shared_ptr<Cache>)},
+      {offsetof(struct ColumnFamilyOptions, comparator), sizeof(Comparator*)},
+      {offsetof(struct ColumnFamilyOptions, merge_operator),
+       sizeof(std::shared_ptr<MergeOperator>)},
+      {offsetof(struct ColumnFamilyOptions, compaction_filter),
+       sizeof(const CompactionFilter*)},
+      {offsetof(struct ColumnFamilyOptions, compaction_filter_factory),
+       sizeof(std::shared_ptr<CompactionFilterFactory>)},
+      {offsetof(struct ColumnFamilyOptions, prefix_extractor),
+       sizeof(std::shared_ptr<const SliceTransform>)},
+      {offsetof(struct ColumnFamilyOptions, snap_refresh_nanos),
+       sizeof(uint64_t)},
+      {offsetof(struct ColumnFamilyOptions, table_factory),
+       sizeof(std::shared_ptr<TableFactory>)},
+      {offsetof(struct ColumnFamilyOptions, cf_paths),
+       sizeof(std::vector<DbPath>)},
+      {offsetof(struct ColumnFamilyOptions, compaction_thread_limiter),
+       sizeof(std::shared_ptr<ConcurrentTaskLimiter>)},
+      {offsetof(struct ColumnFamilyOptions, sst_partitioner_factory),
+       sizeof(std::shared_ptr<SstPartitionerFactory>)},
+  };
+
+  char* options_ptr = new char[sizeof(ColumnFamilyOptions)];
+
+  // Count padding bytes by setting all bytes in the memory to a special char,
+  // copy a well constructed struct to this memory and see how many special
+  // bytes left.
+  FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions),
+                      kColumnFamilyOptionsExcluded);
+
+  // Invoke a user-defined constructor in the hope that it does not overwrite
+  // padding bytes. Note that previously we relied on the implicitly-defined
+  // copy-assignment operator (i.e., `*options = ColumnFamilyOptions();`) here,
+  // which did in fact modify padding bytes.
+  ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions();
+
+  int unset_bytes_base = NumUnsetBytes(options_ptr, sizeof(ColumnFamilyOptions),
+                                       kColumnFamilyOptionsExcluded);
+  ASSERT_GT(unset_bytes_base, 0);
+  options->~ColumnFamilyOptions();
+
+  options = new (options_ptr) ColumnFamilyOptions();
+  FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions),
+                      kColumnFamilyOptionsExcluded);
+
+  // Following options are not settable through
+  // GetColumnFamilyOptionsFromString():
+  options->compaction_options_universal = CompactionOptionsUniversal();
+  options->num_levels = 42;  // Initialize options for MutableCF
+  options->compaction_filter = nullptr;
+  options->sst_partitioner_factory = nullptr;
+
+  char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)];
+  ColumnFamilyOptions* new_options =
+      new (new_options_ptr) ColumnFamilyOptions();
+  FillWithSpecialChar(new_options_ptr, sizeof(ColumnFamilyOptions),
+                      kColumnFamilyOptionsExcluded);
+
+  // Need to update the option string if a new option is added.
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      *options,
+      "compaction_filter_factory=mpudlojcujCompactionFilterFactory;"
+      "table_factory=PlainTable;"
+      "prefix_extractor=rocksdb.CappedPrefix.13;"
+      "comparator=leveldb.BytewiseComparator;"
+      "compression_per_level=kBZip2Compression:kBZip2Compression:"
+      "kBZip2Compression:kNoCompression:kZlibCompression:kBZip2Compression:"
+      "kSnappyCompression;"
+      "max_bytes_for_level_base=986;"
+      "bloom_locality=8016;"
+      "target_file_size_base=4294976376;"
+      "memtable_huge_page_size=2557;"
+      "max_successive_merges=5497;"
+      "max_sequential_skip_in_iterations=4294971408;"
+      "arena_block_size=1893;"
+      "target_file_size_multiplier=35;"
+      "min_write_buffer_number_to_merge=9;"
+      "max_write_buffer_number=84;"
+      "write_buffer_size=1653;"
+      "max_compaction_bytes=64;"
+      "ignore_max_compaction_bytes_for_input=true;"
+      "max_bytes_for_level_multiplier=60;"
+      "memtable_factory=SkipListFactory;"
+      "compression=kNoCompression;"
+      "compression_opts=5:6:7:8:9:10:true:11:false;"
+      "bottommost_compression_opts=4:5:6:7:8:9:true:10:true;"
+      "bottommost_compression=kDisableCompressionOption;"
+      "level0_stop_writes_trigger=33;"
+      "num_levels=99;"
+      "level0_slowdown_writes_trigger=22;"
+      "level0_file_num_compaction_trigger=14;"
+      "compaction_filter=urxcqstuwnCompactionFilter;"
+      "soft_pending_compaction_bytes_limit=0;"
+      "max_write_buffer_number_to_maintain=84;"
+      "max_write_buffer_size_to_maintain=2147483648;"
+      "merge_operator=aabcxehazrMergeOperator;"
+      "memtable_prefix_bloom_size_ratio=0.4642;"
+      "memtable_whole_key_filtering=true;"
+      "memtable_insert_with_hint_prefix_extractor=rocksdb.CappedPrefix.13;"
+      "check_flush_compaction_key_order=false;"
+      "paranoid_file_checks=true;"
+      "force_consistency_checks=true;"
+      "inplace_update_num_locks=7429;"
+      "experimental_mempurge_threshold=0.0001;"
+      "optimize_filters_for_hits=false;"
+      "level_compaction_dynamic_level_bytes=false;"
+      "level_compaction_dynamic_file_size=true;"
+      "inplace_update_support=false;"
+      "compaction_style=kCompactionStyleFIFO;"
+      "compaction_pri=kMinOverlappingRatio;"
+      "hard_pending_compaction_bytes_limit=0;"
+      "disable_auto_compactions=false;"
+      "report_bg_io_stats=true;"
+      "ttl=60;"
+      "periodic_compaction_seconds=3600;"
+      "sample_for_compression=0;"
+      "enable_blob_files=true;"
+      "min_blob_size=256;"
+      "blob_file_size=1000000;"
+      "blob_compression_type=kBZip2Compression;"
+      "enable_blob_garbage_collection=true;"
+      "blob_garbage_collection_age_cutoff=0.5;"
+      "blob_garbage_collection_force_threshold=0.75;"
+      "blob_compaction_readahead_size=262144;"
+      "blob_file_starting_level=1;"
+      "prepopulate_blob_cache=kDisable;"
+      "bottommost_temperature=kWarm;"
+      "last_level_temperature=kWarm;"
+      "preclude_last_level_data_seconds=86400;"
+      "preserve_internal_time_seconds=86400;"
+      "compaction_options_fifo={max_table_files_size=3;allow_"
+      "compaction=false;age_for_warm=1;};"
+      "blob_cache=1M;"
+      "memtable_protection_bytes_per_key=2;",
+      new_options));
+
+  ASSERT_NE(new_options->blob_cache.get(), nullptr);
+
+  ASSERT_EQ(unset_bytes_base,
+            NumUnsetBytes(new_options_ptr, sizeof(ColumnFamilyOptions),
+                          kColumnFamilyOptionsExcluded));
+
+  ColumnFamilyOptions rnd_filled_options = *new_options;
+
+  options->~ColumnFamilyOptions();
+  new_options->~ColumnFamilyOptions();
+
+  delete[] options_ptr;
+  delete[] new_options_ptr;
+
+  // Test copying to mutabable and immutable options and copy back the mutable
+  // part.
+  const OffsetGap kMutableCFOptionsExcluded = {
+      {offsetof(struct MutableCFOptions, prefix_extractor),
+       sizeof(std::shared_ptr<const SliceTransform>)},
+      {offsetof(struct MutableCFOptions,
+                max_bytes_for_level_multiplier_additional),
+       sizeof(std::vector<int>)},
+      {offsetof(struct MutableCFOptions, compression_per_level),
+       sizeof(std::vector<CompressionType>)},
+      {offsetof(struct MutableCFOptions, max_file_size),
+       sizeof(std::vector<uint64_t>)},
+  };
+
+  // For all memory used for options, pre-fill every char. Otherwise, the
+  // padding bytes might be different so that byte-wise comparison doesn't
+  // general equal results even if objects are equal.
+  const char kMySpecialChar = 'x';
+  char* mcfo1_ptr = new char[sizeof(MutableCFOptions)];
+  FillWithSpecialChar(mcfo1_ptr, sizeof(MutableCFOptions),
+                      kMutableCFOptionsExcluded, kMySpecialChar);
+  char* mcfo2_ptr = new char[sizeof(MutableCFOptions)];
+  FillWithSpecialChar(mcfo2_ptr, sizeof(MutableCFOptions),
+                      kMutableCFOptionsExcluded, kMySpecialChar);
+
+  // A clean column family options is constructed after filling the same special
+  // char as the initial one. So that the padding bytes are the same.
+  char* cfo_clean_ptr = new char[sizeof(ColumnFamilyOptions)];
+  FillWithSpecialChar(cfo_clean_ptr, sizeof(ColumnFamilyOptions),
+                      kColumnFamilyOptionsExcluded);
+  rnd_filled_options.num_levels = 66;
+  ColumnFamilyOptions* cfo_clean = new (cfo_clean_ptr) ColumnFamilyOptions();
+
+  MutableCFOptions* mcfo1 =
+      new (mcfo1_ptr) MutableCFOptions(rnd_filled_options);
+  ColumnFamilyOptions cfo_back = BuildColumnFamilyOptions(*cfo_clean, *mcfo1);
+  MutableCFOptions* mcfo2 = new (mcfo2_ptr) MutableCFOptions(cfo_back);
+
+  ASSERT_TRUE(CompareBytes(mcfo1_ptr, mcfo2_ptr, sizeof(MutableCFOptions),
+                           kMutableCFOptionsExcluded));
+
+  cfo_clean->~ColumnFamilyOptions();
+  mcfo1->~MutableCFOptions();
+  mcfo2->~MutableCFOptions();
+  delete[] mcfo1_ptr;
+  delete[] mcfo2_ptr;
+  delete[] cfo_clean_ptr;
+}
+#endif  // !ROCKSDB_UBSAN_RUN
+#endif  // !__clang__
+#endif  // OS_LINUX || OS_WIN
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/options/options_test.cc b/src/rocksdb/options/options_test.cc
new file mode 100644
index 000000000..37001379a
--- /dev/null
+++ b/src/rocksdb/options/options_test.cc
@@ -0,0 +1,5014 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cctype>
+#include <cinttypes>
+#include <cstring>
+#include <unordered_map>
+
+#include "cache/lru_cache.h"
+#include "cache/sharded_cache.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/leveldb_options.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/stderr_logger.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators/bytesxor.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+#ifndef GFLAGS
+bool FLAGS_enable_print = false;
+#else
+#include "util/gflags_compat.h"
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+#endif  // GFLAGS
+
+namespace ROCKSDB_NAMESPACE {
+
+class OptionsTest : public testing::Test {};
+
+class UnregisteredTableFactory : public TableFactory {
+ public:
+  UnregisteredTableFactory() {}
+  const char* Name() const override { return "Unregistered"; }
+  using TableFactory::NewTableReader;
+  Status NewTableReader(const ReadOptions&, const TableReaderOptions&,
+                        std::unique_ptr<RandomAccessFileReader>&&, uint64_t,
+                        std::unique_ptr<TableReader>*, bool) const override {
+    return Status::NotSupported();
+  }
+  TableBuilder* NewTableBuilder(const TableBuilderOptions&,
+                                WritableFileWriter*) const override {
+    return nullptr;
+  }
+};
+
+#ifndef ROCKSDB_LITE  // GetOptionsFromMap is not supported in ROCKSDB_LITE
+TEST_F(OptionsTest, GetOptionsFromMapTest) {
+  std::unordered_map<std::string, std::string> cf_options_map = {
+      {"write_buffer_size", "1"},
+      {"max_write_buffer_number", "2"},
+      {"min_write_buffer_number_to_merge", "3"},
+      {"max_write_buffer_number_to_maintain", "99"},
+      {"max_write_buffer_size_to_maintain", "-99999"},
+      {"compression", "kSnappyCompression"},
+      {"compression_per_level",
+       "kNoCompression:"
+       "kSnappyCompression:"
+       "kZlibCompression:"
+       "kBZip2Compression:"
+       "kLZ4Compression:"
+       "kLZ4HCCompression:"
+       "kXpressCompression:"
+       "kZSTD:"
+       "kZSTDNotFinalCompression"},
+      {"bottommost_compression", "kLZ4Compression"},
+      {"bottommost_compression_opts", "5:6:7:8:10:true"},
+      {"compression_opts", "4:5:6:7:8:2:true:100:false"},
+      {"num_levels", "8"},
+      {"level0_file_num_compaction_trigger", "8"},
+      {"level0_slowdown_writes_trigger", "9"},
+      {"level0_stop_writes_trigger", "10"},
+      {"target_file_size_base", "12"},
+      {"target_file_size_multiplier", "13"},
+      {"max_bytes_for_level_base", "14"},
+      {"level_compaction_dynamic_level_bytes", "true"},
+      {"max_bytes_for_level_multiplier", "15.0"},
+      {"max_bytes_for_level_multiplier_additional", "16:17:18"},
+      {"max_compaction_bytes", "21"},
+      {"hard_pending_compaction_bytes_limit", "211"},
+      {"arena_block_size", "22"},
+      {"disable_auto_compactions", "true"},
+      {"compaction_style", "kCompactionStyleLevel"},
+      {"compaction_pri", "kOldestSmallestSeqFirst"},
+      {"verify_checksums_in_compaction", "false"},
+      {"compaction_options_fifo", "23"},
+      {"max_sequential_skip_in_iterations", "24"},
+      {"inplace_update_support", "true"},
+      {"report_bg_io_stats", "true"},
+      {"compaction_measure_io_stats", "false"},
+      {"purge_redundant_kvs_while_flush", "false"},
+      {"inplace_update_num_locks", "25"},
+      {"memtable_prefix_bloom_size_ratio", "0.26"},
+      {"memtable_whole_key_filtering", "true"},
+      {"memtable_huge_page_size", "28"},
+      {"bloom_locality", "29"},
+      {"max_successive_merges", "30"},
+      {"min_partial_merge_operands", "31"},
+      {"prefix_extractor", "fixed:31"},
+      {"experimental_mempurge_threshold", "0.003"},
+      {"optimize_filters_for_hits", "true"},
+      {"enable_blob_files", "true"},
+      {"min_blob_size", "1K"},
+      {"blob_file_size", "1G"},
+      {"blob_compression_type", "kZSTD"},
+      {"enable_blob_garbage_collection", "true"},
+      {"blob_garbage_collection_age_cutoff", "0.5"},
+      {"blob_garbage_collection_force_threshold", "0.75"},
+      {"blob_compaction_readahead_size", "256K"},
+      {"blob_file_starting_level", "1"},
+      {"prepopulate_blob_cache", "kDisable"},
+      {"last_level_temperature", "kWarm"},
+  };
+
+  std::unordered_map<std::string, std::string> db_options_map = {
+      {"create_if_missing", "false"},
+      {"create_missing_column_families", "true"},
+      {"error_if_exists", "false"},
+      {"paranoid_checks", "true"},
+      {"track_and_verify_wals_in_manifest", "true"},
+      {"verify_sst_unique_id_in_manifest", "true"},
+      {"max_open_files", "32"},
+      {"max_total_wal_size", "33"},
+      {"use_fsync", "true"},
+      {"db_log_dir", "/db_log_dir"},
+      {"wal_dir", "/wal_dir"},
+      {"delete_obsolete_files_period_micros", "34"},
+      {"max_background_compactions", "35"},
+      {"max_background_flushes", "36"},
+      {"max_log_file_size", "37"},
+      {"log_file_time_to_roll", "38"},
+      {"keep_log_file_num", "39"},
+      {"recycle_log_file_num", "5"},
+      {"max_manifest_file_size", "40"},
+      {"table_cache_numshardbits", "41"},
+      {"WAL_ttl_seconds", "43"},
+      {"WAL_size_limit_MB", "44"},
+      {"manifest_preallocation_size", "45"},
+      {"allow_mmap_reads", "true"},
+      {"allow_mmap_writes", "false"},
+      {"use_direct_reads", "false"},
+      {"use_direct_io_for_flush_and_compaction", "false"},
+      {"is_fd_close_on_exec", "true"},
+      {"skip_log_error_on_recovery", "false"},
+      {"stats_dump_period_sec", "46"},
+      {"stats_persist_period_sec", "57"},
+      {"persist_stats_to_disk", "false"},
+      {"stats_history_buffer_size", "69"},
+      {"advise_random_on_open", "true"},
+      {"use_adaptive_mutex", "false"},
+      {"compaction_readahead_size", "100"},
+      {"random_access_max_buffer_size", "3145728"},
+      {"writable_file_max_buffer_size", "314159"},
+      {"bytes_per_sync", "47"},
+      {"wal_bytes_per_sync", "48"},
+      {"strict_bytes_per_sync", "true"},
+      {"preserve_deletes", "false"},
+  };
+
+  ColumnFamilyOptions base_cf_opt;
+  ColumnFamilyOptions new_cf_opt;
+  ConfigOptions exact, loose;
+  exact.input_strings_escaped = false;
+  exact.ignore_unknown_options = false;
+  exact.sanity_level = ConfigOptions::kSanityLevelExactMatch;
+  loose.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible;
+
+  loose.input_strings_escaped = false;
+  loose.ignore_unknown_options = true;
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map,
+                                          &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 1U);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2);
+  ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999);
+  ASSERT_EQ(new_cf_opt.compression, kSnappyCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level.size(), 9U);
+  ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[1], kSnappyCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[2], kZlibCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[3], kBZip2Compression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[4], kLZ4Compression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[5], kLZ4HCCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[6], kXpressCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[7], kZSTD);
+  ASSERT_EQ(new_cf_opt.compression_per_level[8], kZSTDNotFinalCompression);
+  ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4);
+  ASSERT_EQ(new_cf_opt.compression_opts.level, 5);
+  ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6);
+  ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u);
+  ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u);
+  ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, 2u);
+  ASSERT_EQ(new_cf_opt.compression_opts.enabled, true);
+  ASSERT_EQ(new_cf_opt.compression_opts.max_dict_buffer_bytes, 100u);
+  ASSERT_EQ(new_cf_opt.compression_opts.use_zstd_dict_trainer, false);
+  ASSERT_EQ(new_cf_opt.bottommost_compression, kLZ4Compression);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 10u);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads,
+            CompressionOptions().parallel_threads);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.use_zstd_dict_trainer,
+            CompressionOptions().use_zstd_dict_trainer);
+  ASSERT_EQ(new_cf_opt.num_levels, 8);
+  ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8);
+  ASSERT_EQ(new_cf_opt.level0_slowdown_writes_trigger, 9);
+  ASSERT_EQ(new_cf_opt.level0_stop_writes_trigger, 10);
+  ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast<uint64_t>(12));
+  ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U);
+  ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[0], 16);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[1], 17);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[2], 18);
+  ASSERT_EQ(new_cf_opt.max_compaction_bytes, 21);
+  ASSERT_EQ(new_cf_opt.hard_pending_compaction_bytes_limit, 211);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 22U);
+  ASSERT_EQ(new_cf_opt.disable_auto_compactions, true);
+  ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel);
+  ASSERT_EQ(new_cf_opt.compaction_pri, kOldestSmallestSeqFirst);
+  ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size,
+            static_cast<uint64_t>(23));
+  ASSERT_EQ(new_cf_opt.max_sequential_skip_in_iterations,
+            static_cast<uint64_t>(24));
+  ASSERT_EQ(new_cf_opt.inplace_update_support, true);
+  ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U);
+  ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_size_ratio, 0.26);
+  ASSERT_EQ(new_cf_opt.memtable_whole_key_filtering, true);
+  ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
+  ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
+  ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
+  ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr);
+  ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
+  ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");
+  ASSERT_EQ(new_cf_opt.experimental_mempurge_threshold, 0.003);
+  ASSERT_EQ(new_cf_opt.enable_blob_files, true);
+  ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10);
+  ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30);
+  ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD);
+  ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true);
+  ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5);
+  ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75);
+  ASSERT_EQ(new_cf_opt.blob_compaction_readahead_size, 262144);
+  ASSERT_EQ(new_cf_opt.blob_file_starting_level, 1);
+  ASSERT_EQ(new_cf_opt.prepopulate_blob_cache, PrepopulateBlobCache::kDisable);
+  ASSERT_EQ(new_cf_opt.last_level_temperature, Temperature::kWarm);
+  ASSERT_EQ(new_cf_opt.bottommost_temperature, Temperature::kWarm);
+
+  cf_options_map["write_buffer_size"] = "hello";
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map,
+                                           &new_cf_opt));
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  cf_options_map["write_buffer_size"] = "1";
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map,
+                                          &new_cf_opt));
+
+  cf_options_map["unknown_option"] = "1";
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map,
+                                           &new_cf_opt));
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  // ignore_unknown_options=true;input_strings_escaped=false
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(loose, base_cf_opt, cf_options_map,
+                                          &new_cf_opt));
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyCFOptions(loose, base_cf_opt, new_cf_opt));
+  ASSERT_NOK(
+      RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  DBOptions base_db_opt;
+  DBOptions new_db_opt;
+  ASSERT_OK(
+      GetDBOptionsFromMap(exact, base_db_opt, db_options_map, &new_db_opt));
+  ASSERT_EQ(new_db_opt.create_if_missing, false);
+  ASSERT_EQ(new_db_opt.create_missing_column_families, true);
+  ASSERT_EQ(new_db_opt.error_if_exists, false);
+  ASSERT_EQ(new_db_opt.paranoid_checks, true);
+  ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true);
+  ASSERT_EQ(new_db_opt.verify_sst_unique_id_in_manifest, true);
+  ASSERT_EQ(new_db_opt.max_open_files, 32);
+  ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast<uint64_t>(33));
+  ASSERT_EQ(new_db_opt.use_fsync, true);
+  ASSERT_EQ(new_db_opt.db_log_dir, "/db_log_dir");
+  ASSERT_EQ(new_db_opt.wal_dir, "/wal_dir");
+  ASSERT_EQ(new_db_opt.delete_obsolete_files_period_micros,
+            static_cast<uint64_t>(34));
+  ASSERT_EQ(new_db_opt.max_background_compactions, 35);
+  ASSERT_EQ(new_db_opt.max_background_flushes, 36);
+  ASSERT_EQ(new_db_opt.max_log_file_size, 37U);
+  ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
+  ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
+  ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
+  ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
+  ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
+  ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
+  ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
+  ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U);
+  ASSERT_EQ(new_db_opt.allow_mmap_reads, true);
+  ASSERT_EQ(new_db_opt.allow_mmap_writes, false);
+  ASSERT_EQ(new_db_opt.use_direct_reads, false);
+  ASSERT_EQ(new_db_opt.use_direct_io_for_flush_and_compaction, false);
+  ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true);
+  ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U);
+  ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U);
+  ASSERT_EQ(new_db_opt.persist_stats_to_disk, false);
+  ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U);
+  ASSERT_EQ(new_db_opt.advise_random_on_open, true);
+  ASSERT_EQ(new_db_opt.use_adaptive_mutex, false);
+  ASSERT_EQ(new_db_opt.compaction_readahead_size, 100);
+  ASSERT_EQ(new_db_opt.random_access_max_buffer_size, 3145728);
+  ASSERT_EQ(new_db_opt.writable_file_max_buffer_size, 314159);
+  ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
+  ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
+  ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true);
+
+  db_options_map["max_open_files"] = "hello";
+  Status s =
+      GetDBOptionsFromMap(exact, base_db_opt, db_options_map, &new_db_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt));
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt));
+
+  // unknow options should fail parsing without ignore_unknown_options = true
+  db_options_map["unknown_db_option"] = "1";
+  s = GetDBOptionsFromMap(exact, base_db_opt, db_options_map, &new_db_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt));
+
+  ASSERT_OK(
+      GetDBOptionsFromMap(loose, base_db_opt, db_options_map, &new_db_opt));
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt));
+  ASSERT_NOK(
+      RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt));
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // GetColumnFamilyOptionsFromString is not supported in
+                      // ROCKSDB_LITE
+TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) {
+  ColumnFamilyOptions base_cf_opt;
+  ColumnFamilyOptions new_cf_opt;
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+
+  base_cf_opt.table_factory.reset();
+  ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, "",
+                                             &new_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt, "write_buffer_size=5", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 5U);
+  ASSERT_TRUE(new_cf_opt.table_factory == nullptr);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt, "write_buffer_size=6;", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 6U);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt, "  write_buffer_size =  7  ", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 7U);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt, "  write_buffer_size =  8 ; ", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 8U);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 9U);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=11; max_write_buffer_number  =  12 ;", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 11U);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12);
+  // Wrong name "max_write_buffer_number_"
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=13;max_write_buffer_number_=14;", &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  // Comparator from object registry
+  std::string kCompName = "reverse_comp";
+  ObjectLibrary::Default()->AddFactory<const Comparator>(
+      kCompName,
+      [](const std::string& /*name*/,
+         std::unique_ptr<const Comparator>* /*guard*/,
+         std::string* /* errmsg */) { return ReverseBytewiseComparator(); });
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt,
+                                             "comparator=" + kCompName + ";",
+                                             &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.comparator, ReverseBytewiseComparator());
+
+  // MergeOperator from object registry
+  std::unique_ptr<BytesXOROperator> bxo(new BytesXOROperator());
+  std::string kMoName = bxo->Name();
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt,
+                                             "merge_operator=" + kMoName + ";",
+                                             &new_cf_opt));
+  ASSERT_EQ(kMoName, std::string(new_cf_opt.merge_operator->Name()));
+
+  // Wrong key/value pair
+  Status s = GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  // Error Parsing value
+  s = GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  // Missing option name
+  s = GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt, "write_buffer_size=13; =100;", &new_cf_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  const uint64_t kilo = 1024UL;
+  const uint64_t mega = 1024 * kilo;
+  const uint64_t giga = 1024 * mega;
+  const uint64_t tera = 1024 * giga;
+
+  // Units (k)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 15 * kilo);
+  // Units (m)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "max_write_buffer_number=16m;inplace_update_num_locks=17M", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega);
+  ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17u * mega);
+  // Units (g)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=18g;prefix_extractor=capped:8;"
+      "arena_block_size=19G",
+      &new_cf_opt));
+
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 18 * giga);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 19 * giga);
+  ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr);
+  ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.CappedPrefix.8");
+
+  // Units (t)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt, "write_buffer_size=20t;arena_block_size=21T",
+      &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 20 * tera);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 21 * tera);
+
+  // Nested block based table options
+  // Empty
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={};arena_block_size=1024",
+      &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  // Non-empty
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={block_cache=1M;block_size=4;};"
+      "arena_block_size=1024",
+      &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  // Last one
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={block_cache=1M;block_size=4;}",
+      &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  // Mismatch curly braces
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={{{block_size=4;};"
+      "arena_block_size=1024",
+      &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  // Unexpected chars after closing curly brace
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={block_size=4;}};"
+      "arena_block_size=1024",
+      &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={block_size=4;}xdfa;"
+      "arena_block_size=1024",
+      &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={block_size=4;}xdfa",
+      &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  // Invalid block based table option
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={xx_block_size=4;}",
+      &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt,
+                                             "optimize_filters_for_hits=true",
+                                             &new_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt,
+                                             "optimize_filters_for_hits=false",
+                                             &new_cf_opt));
+
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt,
+                                              "optimize_filters_for_hits=junk",
+                                              &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt,
+                                                  new_cf_opt));
+
+  // Nested plain table options
+  // Empty
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "plain_table_factory={};arena_block_size=1024",
+      &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable");
+  // Non-empty
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};"
+      "arena_block_size=1024",
+      &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable");
+
+  // memtable factory
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "memtable=skip_list:10;arena_block_size=1024",
+      &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.memtable_factory != nullptr);
+  ASSERT_EQ(std::string(new_cf_opt.memtable_factory->Name()), "SkipListFactory");
+  ASSERT_TRUE(new_cf_opt.memtable_factory->IsInstanceOf("SkipListFactory"));
+
+  // blob cache
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "blob_cache={capacity=1M;num_shard_bits=4;"
+      "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};",
+      &new_cf_opt));
+  ASSERT_NE(new_cf_opt.blob_cache, nullptr);
+  ASSERT_EQ(new_cf_opt.blob_cache->GetCapacity(), 1024UL * 1024UL);
+  ASSERT_EQ(static_cast<ShardedCacheBase*>(new_cf_opt.blob_cache.get())
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_cf_opt.blob_cache->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(static_cast<LRUCache*>(new_cf_opt.blob_cache.get())
+                ->GetHighPriPoolRatio(),
+            0.5);
+}
+
+TEST_F(OptionsTest, CompressionOptionsFromString) {
+  ColumnFamilyOptions base_cf_opt;
+  ColumnFamilyOptions new_cf_opt;
+  ConfigOptions config_options;
+  std::string opts_str;
+  config_options.ignore_unknown_options = false;
+  CompressionOptions dflt;
+  // Test with some optional values removed....
+  ASSERT_OK(
+      GetColumnFamilyOptionsFromString(config_options, ColumnFamilyOptions(),
+                                       "compression_opts=3:4:5; "
+                                       "bottommost_compression_opts=4:5:6:7",
+                                       &base_cf_opt));
+  ASSERT_EQ(base_cf_opt.compression_opts.window_bits, 3);
+  ASSERT_EQ(base_cf_opt.compression_opts.level, 4);
+  ASSERT_EQ(base_cf_opt.compression_opts.strategy, 5);
+  ASSERT_EQ(base_cf_opt.compression_opts.max_dict_bytes, dflt.max_dict_bytes);
+  ASSERT_EQ(base_cf_opt.compression_opts.zstd_max_train_bytes,
+            dflt.zstd_max_train_bytes);
+  ASSERT_EQ(base_cf_opt.compression_opts.parallel_threads,
+            dflt.parallel_threads);
+  ASSERT_EQ(base_cf_opt.compression_opts.enabled, dflt.enabled);
+  ASSERT_EQ(base_cf_opt.compression_opts.use_zstd_dict_trainer,
+            dflt.use_zstd_dict_trainer);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.window_bits, 4);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.level, 5);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.strategy, 6);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.max_dict_bytes, 7u);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.zstd_max_train_bytes,
+            dflt.zstd_max_train_bytes);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.parallel_threads,
+            dflt.parallel_threads);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.enabled, dflt.enabled);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.use_zstd_dict_trainer,
+            dflt.use_zstd_dict_trainer);
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, ColumnFamilyOptions(),
+      "compression_opts=4:5:6:7:8:9:true:10:false; "
+      "bottommost_compression_opts=5:6:7:8:9:false",
+      &base_cf_opt));
+  ASSERT_EQ(base_cf_opt.compression_opts.window_bits, 4);
+  ASSERT_EQ(base_cf_opt.compression_opts.level, 5);
+  ASSERT_EQ(base_cf_opt.compression_opts.strategy, 6);
+  ASSERT_EQ(base_cf_opt.compression_opts.max_dict_bytes, 7u);
+  ASSERT_EQ(base_cf_opt.compression_opts.zstd_max_train_bytes, 8u);
+  ASSERT_EQ(base_cf_opt.compression_opts.parallel_threads, 9u);
+  ASSERT_EQ(base_cf_opt.compression_opts.enabled, true);
+  ASSERT_EQ(base_cf_opt.compression_opts.max_dict_buffer_bytes, 10u);
+  ASSERT_EQ(base_cf_opt.compression_opts.use_zstd_dict_trainer, false);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.window_bits, 5);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.level, 6);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.strategy, 7);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.parallel_threads,
+            dflt.parallel_threads);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.enabled, false);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.use_zstd_dict_trainer,
+            dflt.use_zstd_dict_trainer);
+
+  ASSERT_OK(
+      GetStringFromColumnFamilyOptions(config_options, base_cf_opt, &opts_str));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, ColumnFamilyOptions(), opts_str, &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4);
+  ASSERT_EQ(new_cf_opt.compression_opts.level, 5);
+  ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6);
+  ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u);
+  ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u);
+  ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, 9u);
+  ASSERT_EQ(new_cf_opt.compression_opts.enabled, true);
+  ASSERT_EQ(base_cf_opt.compression_opts.max_dict_buffer_bytes, 10u);
+  ASSERT_EQ(base_cf_opt.compression_opts.use_zstd_dict_trainer, false);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads,
+            dflt.parallel_threads);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, false);
+  ASSERT_EQ(base_cf_opt.bottommost_compression_opts.use_zstd_dict_trainer,
+            dflt.use_zstd_dict_trainer);
+
+  // Test as struct values
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, ColumnFamilyOptions(),
+      "compression_opts={window_bits=5; level=6; strategy=7; max_dict_bytes=8;"
+      "zstd_max_train_bytes=9;parallel_threads=10;enabled=true;use_zstd_dict_"
+      "trainer=false}; "
+      "bottommost_compression_opts={window_bits=4; level=5; strategy=6;"
+      " max_dict_bytes=7;zstd_max_train_bytes=8;parallel_threads=9;"
+      "enabled=false;use_zstd_dict_trainer=true}; ",
+      &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 5);
+  ASSERT_EQ(new_cf_opt.compression_opts.level, 6);
+  ASSERT_EQ(new_cf_opt.compression_opts.strategy, 7);
+  ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 8u);
+  ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 9u);
+  ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, 10u);
+  ASSERT_EQ(new_cf_opt.compression_opts.enabled, true);
+  ASSERT_EQ(new_cf_opt.compression_opts.use_zstd_dict_trainer, false);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 4);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 5);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 6);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 7u);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 8u);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, 9u);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, false);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.use_zstd_dict_trainer, true);
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "compression_opts={window_bits=4; strategy=5;};"
+      "bottommost_compression_opts={level=6; strategy=7;}",
+      &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4);
+  ASSERT_EQ(new_cf_opt.compression_opts.strategy, 5);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7);
+
+  ASSERT_EQ(new_cf_opt.compression_opts.level,
+            base_cf_opt.compression_opts.level);
+  ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes,
+            base_cf_opt.compression_opts.max_dict_bytes);
+  ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes,
+            base_cf_opt.compression_opts.zstd_max_train_bytes);
+  ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads,
+            base_cf_opt.compression_opts.parallel_threads);
+  ASSERT_EQ(new_cf_opt.compression_opts.enabled,
+            base_cf_opt.compression_opts.enabled);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits,
+            base_cf_opt.bottommost_compression_opts.window_bits);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes,
+            base_cf_opt.bottommost_compression_opts.max_dict_bytes);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes,
+            base_cf_opt.bottommost_compression_opts.zstd_max_train_bytes);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads,
+            base_cf_opt.bottommost_compression_opts.parallel_threads);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled,
+            base_cf_opt.bottommost_compression_opts.enabled);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.use_zstd_dict_trainer,
+            base_cf_opt.bottommost_compression_opts.use_zstd_dict_trainer);
+
+  // Test a few individual struct values
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, base_cf_opt,
+      "compression_opts.enabled=false; "
+      "bottommost_compression_opts.enabled=true; ",
+      &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.compression_opts.enabled, false);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true);
+
+  // Now test some illegal values
+  ConfigOptions ignore;
+  ignore.ignore_unknown_options = true;
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, ColumnFamilyOptions(),
+      "compression_opts=5:6:7:8:9:x:false", &base_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      ignore, ColumnFamilyOptions(), "compression_opts=5:6:7:8:9:x:false",
+      &base_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, ColumnFamilyOptions(),
+      "compression_opts=1:2:3:4:5:6:true:8", &base_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8",
+      &base_cf_opt));
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, ColumnFamilyOptions(),
+      "compression_opts=1:2:3:4:5:6:true:8:9", &base_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8:9",
+      &base_cf_opt));
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, ColumnFamilyOptions(), "compression_opts={unknown=bad;}",
+      &base_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(ignore, ColumnFamilyOptions(),
+                                             "compression_opts={unknown=bad;}",
+                                             &base_cf_opt));
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, ColumnFamilyOptions(), "compression_opts.unknown=bad",
+      &base_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(ignore, ColumnFamilyOptions(),
+                                             "compression_opts.unknown=bad",
+                                             &base_cf_opt));
+}
+
+TEST_F(OptionsTest, OldInterfaceTest) {
+  ColumnFamilyOptions base_cf_opt;
+  ColumnFamilyOptions new_cf_opt;
+  ConfigOptions exact;
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      base_cf_opt,
+      "write_buffer_size=18;prefix_extractor=capped:8;"
+      "arena_block_size=19",
+      &new_cf_opt));
+
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 18);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 19);
+  ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr);
+
+  // And with a bad option
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      base_cf_opt,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={xx_block_size=4;}",
+      &new_cf_opt));
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  std::unordered_map<std::string, std::string> cf_options_map = {
+      {"write_buffer_size", "1"},
+      {"max_write_buffer_number", "2"},
+      {"min_write_buffer_number_to_merge", "3"},
+  };
+  ASSERT_OK(
+      GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, &new_cf_opt));
+  cf_options_map["unknown_option"] = "1";
+  ASSERT_NOK(
+      GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, &new_cf_opt));
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map,
+                                          &new_cf_opt, true, true));
+
+  DBOptions base_db_opt;
+  DBOptions new_db_opt;
+  std::unordered_map<std::string, std::string> db_options_map = {
+      {"create_if_missing", "false"},
+      {"create_missing_column_families", "true"},
+      {"error_if_exists", "false"},
+      {"paranoid_checks", "true"},
+      {"track_and_verify_wals_in_manifest", "true"},
+      {"verify_sst_unique_id_in_manifest", "true"},
+      {"max_open_files", "32"},
+  };
+  ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt));
+  ASSERT_EQ(new_db_opt.create_if_missing, false);
+  ASSERT_EQ(new_db_opt.create_missing_column_families, true);
+  ASSERT_EQ(new_db_opt.error_if_exists, false);
+  ASSERT_EQ(new_db_opt.paranoid_checks, true);
+  ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true);
+  ASSERT_EQ(new_db_opt.verify_sst_unique_id_in_manifest, true);
+  ASSERT_EQ(new_db_opt.max_open_files, 32);
+  db_options_map["unknown_option"] = "1";
+  Status s = GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt));
+  ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt, true,
+                                true));
+  ASSERT_OK(GetDBOptionsFromString(
+      base_db_opt,
+      "create_if_missing=false;error_if_exists=false;max_open_files=42;",
+      &new_db_opt));
+  ASSERT_EQ(new_db_opt.create_if_missing, false);
+  ASSERT_EQ(new_db_opt.error_if_exists, false);
+  ASSERT_EQ(new_db_opt.max_open_files, 42);
+  s = GetDBOptionsFromString(
+      base_db_opt,
+      "create_if_missing=false;error_if_exists=false;max_open_files=42;"
+      "unknown_option=1;",
+      &new_db_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt));
+}
+
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // GetBlockBasedTableOptionsFromString is not supported
+TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
+  BlockBasedTableOptions table_opt;
+  BlockBasedTableOptions new_opt;
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+  config_options.ignore_unsupported_options = false;
+
+  // make sure default values are overwritten by something else
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "cache_index_and_filter_blocks=1;index_type=kHashSearch;"
+      "checksum=kxxHash;"
+      "block_cache=1M;block_cache_compressed=1k;block_size=1024;"
+      "block_size_deviation=8;block_restart_interval=4;"
+      "format_version=5;whole_key_filtering=1;"
+      "filter_policy=bloomfilter:4.567:false;detect_filter_construct_"
+      "corruption=true;"
+      // A bug caused read_amp_bytes_per_bit to be a large integer in OPTIONS
+      // file generated by 6.10 to 6.14. Though bug is fixed in these releases,
+      // we need to handle the case of loading OPTIONS file generated before the
+      // fix.
+      "read_amp_bytes_per_bit=17179869185;",
+      &new_opt));
+  ASSERT_TRUE(new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch);
+  ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash);
+  ASSERT_TRUE(new_opt.block_cache != nullptr);
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL);
+  ASSERT_EQ(new_opt.block_size, 1024UL);
+  ASSERT_EQ(new_opt.block_size_deviation, 8);
+  ASSERT_EQ(new_opt.block_restart_interval, 4);
+  ASSERT_EQ(new_opt.format_version, 5U);
+  ASSERT_EQ(new_opt.whole_key_filtering, true);
+  ASSERT_EQ(new_opt.detect_filter_construct_corruption, true);
+  ASSERT_TRUE(new_opt.filter_policy != nullptr);
+  auto bfp = new_opt.filter_policy->CheckedCast<BloomFilterPolicy>();
+  ASSERT_NE(bfp, nullptr);
+  EXPECT_EQ(bfp->GetMillibitsPerKey(), 4567);
+  EXPECT_EQ(bfp->GetWholeBitsPerKey(), 5);
+  // Verify that only the lower 32bits are stored in
+  // new_opt.read_amp_bytes_per_bit.
+  EXPECT_EQ(1U, new_opt.read_amp_bytes_per_bit);
+
+  // unknown option
+  Status s = GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "cache_index_and_filter_blocks=1;index_type=kBinarySearch;"
+      "bad_option=1",
+      &new_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_EQ(static_cast<bool>(table_opt.cache_index_and_filter_blocks),
+            new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(table_opt.index_type, new_opt.index_type);
+
+  // unrecognized index type
+  s = GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", &new_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_EQ(table_opt.cache_index_and_filter_blocks,
+            new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(table_opt.index_type, new_opt.index_type);
+
+  // unrecognized checksum type
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "cache_index_and_filter_blocks=1;checksum=kxxHashXX", &new_opt));
+  ASSERT_EQ(table_opt.cache_index_and_filter_blocks,
+            new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(table_opt.index_type, new_opt.index_type);
+
+  // unrecognized filter policy name
+  s = GetBlockBasedTableOptionsFromString(config_options, table_opt,
+                                          "filter_policy=bloomfilterxx:4:true",
+                                          &new_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // missing bits per key
+  s = GetBlockBasedTableOptionsFromString(
+      config_options, table_opt, "filter_policy=bloomfilter", &new_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Used to be rejected, now accepted
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt, "filter_policy=bloomfilter:4", &new_opt));
+  bfp = dynamic_cast<const BloomFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(bfp->GetMillibitsPerKey(), 4000);
+  EXPECT_EQ(bfp->GetWholeBitsPerKey(), 4);
+
+  // use_block_based_builder=true now ignored in public API (same as false)
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt, "filter_policy=bloomfilter:4:true", &new_opt));
+  bfp = dynamic_cast<const BloomFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(bfp->GetMillibitsPerKey(), 4000);
+  EXPECT_EQ(bfp->GetWholeBitsPerKey(), 4);
+
+  // Test configuring using other internal names
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "filter_policy=rocksdb.internal.LegacyBloomFilter:3", &new_opt));
+  auto builtin =
+      dynamic_cast<const BuiltinFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(builtin->GetId(), "rocksdb.internal.LegacyBloomFilter:3");
+
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "filter_policy=rocksdb.internal.FastLocalBloomFilter:1.234", &new_opt));
+  builtin =
+      dynamic_cast<const BuiltinFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(builtin->GetId(), "rocksdb.internal.FastLocalBloomFilter:1.234");
+
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "filter_policy=rocksdb.internal.Standard128RibbonFilter:1.234",
+      &new_opt));
+  builtin =
+      dynamic_cast<const BuiltinFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(builtin->GetId(), "rocksdb.internal.Standard128RibbonFilter:1.234");
+
+  // Ribbon filter policy (no Bloom hybrid)
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt, "filter_policy=ribbonfilter:5.678:-1;",
+      &new_opt));
+  ASSERT_TRUE(new_opt.filter_policy != nullptr);
+  auto rfp =
+      dynamic_cast<const RibbonFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(rfp->GetMillibitsPerKey(), 5678);
+  EXPECT_EQ(rfp->GetBloomBeforeLevel(), -1);
+
+  // Ribbon filter policy (default Bloom hybrid)
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt, "filter_policy=ribbonfilter:6.789;",
+      &new_opt));
+  ASSERT_TRUE(new_opt.filter_policy != nullptr);
+  rfp = dynamic_cast<const RibbonFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(rfp->GetMillibitsPerKey(), 6789);
+  EXPECT_EQ(rfp->GetBloomBeforeLevel(), 0);
+
+  // Ribbon filter policy (custom Bloom hybrid)
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt, "filter_policy=ribbonfilter:6.789:5;",
+      &new_opt));
+  ASSERT_TRUE(new_opt.filter_policy != nullptr);
+  rfp = dynamic_cast<const RibbonFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(rfp->GetMillibitsPerKey(), 6789);
+  EXPECT_EQ(rfp->GetBloomBeforeLevel(), 5);
+
+  // Check block cache options are overwritten when specified
+  // in new format as a struct.
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "block_cache={capacity=1M;num_shard_bits=4;"
+      "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};"
+      "block_cache_compressed={capacity=1M;num_shard_bits=4;"
+      "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}",
+      &new_opt));
+  ASSERT_TRUE(new_opt.block_cache != nullptr);
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
+                new_opt.block_cache)->GetHighPriPoolRatio(), 0.5);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
+                new_opt.block_cache_compressed)
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
+                new_opt.block_cache_compressed)->GetHighPriPoolRatio(),
+                0.5);
+
+  // Set only block cache capacity. Check other values are
+  // reset to default values.
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "block_cache={capacity=2M};"
+      "block_cache_compressed={capacity=2M}",
+      &new_opt));
+  ASSERT_TRUE(new_opt.block_cache != nullptr);
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL);
+  // Default values
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
+                ->GetNumShardBits(),
+            GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity()));
+  ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache)
+                ->GetHighPriPoolRatio(),
+            0.5);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL);
+  // Default values
+  ASSERT_EQ(
+      std::dynamic_pointer_cast<ShardedCacheBase>(
+          new_opt.block_cache_compressed)
+          ->GetNumShardBits(),
+      GetDefaultCacheShardBits(new_opt.block_cache_compressed->GetCapacity()));
+  ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
+                ->GetHighPriPoolRatio(),
+            0.5);
+
+  // Set couple of block cache options.
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};"
+      "block_cache_compressed={num_shard_bits=5;"
+      "high_pri_pool_ratio=0.0;}",
+      &new_opt));
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
+                ->GetNumShardBits(),
+            5);
+  ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
+                new_opt.block_cache)->GetHighPriPoolRatio(), 0.5);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
+                new_opt.block_cache_compressed)
+                ->GetNumShardBits(),
+            5);
+  ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
+                ->GetHighPriPoolRatio(),
+            0.0);
+
+  // Set couple of block cache options.
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt,
+      "block_cache={capacity=1M;num_shard_bits=4;"
+      "strict_capacity_limit=true;};"
+      "block_cache_compressed={capacity=1M;num_shard_bits=4;"
+      "strict_capacity_limit=true;}",
+      &new_opt));
+  ASSERT_TRUE(new_opt.block_cache != nullptr);
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache)
+                ->GetHighPriPoolRatio(),
+            0.5);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
+                new_opt.block_cache_compressed)
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
+                ->GetHighPriPoolRatio(),
+            0.5);
+
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt, "filter_policy=rocksdb.BloomFilter:1.234",
+      &new_opt));
+  ASSERT_TRUE(new_opt.filter_policy != nullptr);
+  ASSERT_TRUE(
+      new_opt.filter_policy->IsInstanceOf(BloomFilterPolicy::kClassName()));
+  ASSERT_TRUE(
+      new_opt.filter_policy->IsInstanceOf(BloomFilterPolicy::kNickName()));
+
+  // Ribbon filter policy alternative name
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      config_options, table_opt, "filter_policy=rocksdb.RibbonFilter:6.789:5;",
+      &new_opt));
+  ASSERT_TRUE(new_opt.filter_policy != nullptr);
+  ASSERT_TRUE(
+      new_opt.filter_policy->IsInstanceOf(RibbonFilterPolicy::kClassName()));
+  ASSERT_TRUE(
+      new_opt.filter_policy->IsInstanceOf(RibbonFilterPolicy::kNickName()));
+}
+#endif  // !ROCKSDB_LITE
+
+
+#ifndef ROCKSDB_LITE  // GetPlainTableOptionsFromString is not supported
+TEST_F(OptionsTest, GetPlainTableOptionsFromString) {
+  PlainTableOptions table_opt;
+  PlainTableOptions new_opt;
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+  // make sure default values are overwritten by something else
+  ASSERT_OK(GetPlainTableOptionsFromString(
+      config_options, table_opt,
+      "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;"
+      "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;"
+      "full_scan_mode=true;store_index_in_file=true",
+      &new_opt));
+  ASSERT_EQ(new_opt.user_key_len, 66u);
+  ASSERT_EQ(new_opt.bloom_bits_per_key, 20);
+  ASSERT_EQ(new_opt.hash_table_ratio, 0.5);
+  ASSERT_EQ(new_opt.index_sparseness, 8);
+  ASSERT_EQ(new_opt.huge_page_tlb_size, 4);
+  ASSERT_EQ(new_opt.encoding_type, EncodingType::kPrefix);
+  ASSERT_TRUE(new_opt.full_scan_mode);
+  ASSERT_TRUE(new_opt.store_index_in_file);
+
+  // unknown option
+  Status s = GetPlainTableOptionsFromString(
+      config_options, table_opt,
+      "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;"
+      "bad_option=1",
+      &new_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // unrecognized EncodingType
+  s = GetPlainTableOptionsFromString(
+      config_options, table_opt,
+      "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;"
+      "encoding_type=kPrefixXX",
+      &new_opt);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // GetMemTableRepFactoryFromString is not supported
+TEST_F(OptionsTest, GetMemTableRepFactoryFromString) {
+  std::unique_ptr<MemTableRepFactory> new_mem_factory = nullptr;
+
+  ASSERT_OK(GetMemTableRepFactoryFromString("skip_list", &new_mem_factory));
+  ASSERT_OK(GetMemTableRepFactoryFromString("skip_list:16", &new_mem_factory));
+  ASSERT_STREQ(new_mem_factory->Name(), "SkipListFactory");
+  ASSERT_NOK(GetMemTableRepFactoryFromString("skip_list:16:invalid_opt",
+                                             &new_mem_factory));
+
+  ASSERT_OK(GetMemTableRepFactoryFromString("prefix_hash", &new_mem_factory));
+  ASSERT_OK(GetMemTableRepFactoryFromString("prefix_hash:1000",
+                                            &new_mem_factory));
+  ASSERT_STREQ(new_mem_factory->Name(), "HashSkipListRepFactory");
+  ASSERT_NOK(GetMemTableRepFactoryFromString("prefix_hash:1000:invalid_opt",
+                                             &new_mem_factory));
+
+  ASSERT_OK(GetMemTableRepFactoryFromString("hash_linkedlist",
+                                            &new_mem_factory));
+  ASSERT_OK(GetMemTableRepFactoryFromString("hash_linkedlist:1000",
+                                            &new_mem_factory));
+  ASSERT_EQ(std::string(new_mem_factory->Name()), "HashLinkListRepFactory");
+  ASSERT_NOK(GetMemTableRepFactoryFromString("hash_linkedlist:1000:invalid_opt",
+                                             &new_mem_factory));
+
+  ASSERT_OK(GetMemTableRepFactoryFromString("vector", &new_mem_factory));
+  ASSERT_OK(GetMemTableRepFactoryFromString("vector:1024", &new_mem_factory));
+  ASSERT_EQ(std::string(new_mem_factory->Name()), "VectorRepFactory");
+  ASSERT_NOK(GetMemTableRepFactoryFromString("vector:1024:invalid_opt",
+                                             &new_mem_factory));
+
+  ASSERT_NOK(GetMemTableRepFactoryFromString("cuckoo", &new_mem_factory));
+  // CuckooHash memtable is already removed.
+  ASSERT_NOK(GetMemTableRepFactoryFromString("cuckoo:1024", &new_mem_factory));
+
+  ASSERT_NOK(GetMemTableRepFactoryFromString("bad_factory", &new_mem_factory));
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(OptionsTest, MemTableRepFactoryCreateFromString) {
+  std::unique_ptr<MemTableRepFactory> new_mem_factory = nullptr;
+  ConfigOptions config_options;
+  config_options.ignore_unsupported_options = false;
+  config_options.ignore_unknown_options = false;
+
+  ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "skip_list",
+                                                 &new_mem_factory));
+  ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "skip_list:16",
+                                                 &new_mem_factory));
+  ASSERT_STREQ(new_mem_factory->Name(), "SkipListFactory");
+  ASSERT_TRUE(new_mem_factory->IsInstanceOf("skip_list"));
+  ASSERT_TRUE(new_mem_factory->IsInstanceOf("SkipListFactory"));
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(
+      config_options, "skip_list:16:invalid_opt", &new_mem_factory));
+
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(
+      config_options, "invalid_opt=10", &new_mem_factory));
+
+  // Test a reset
+  ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "",
+                                                 &new_mem_factory));
+  ASSERT_EQ(new_mem_factory, nullptr);
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(
+      config_options, "invalid_opt=10", &new_mem_factory));
+
+#ifndef ROCKSDB_LITE
+  ASSERT_OK(MemTableRepFactory::CreateFromString(
+      config_options, "id=skip_list; lookahead=32", &new_mem_factory));
+  ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "prefix_hash",
+                                                 &new_mem_factory));
+  ASSERT_OK(MemTableRepFactory::CreateFromString(
+      config_options, "prefix_hash:1000", &new_mem_factory));
+  ASSERT_STREQ(new_mem_factory->Name(), "HashSkipListRepFactory");
+  ASSERT_TRUE(new_mem_factory->IsInstanceOf("prefix_hash"));
+  ASSERT_TRUE(new_mem_factory->IsInstanceOf("HashSkipListRepFactory"));
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(
+      config_options, "prefix_hash:1000:invalid_opt", &new_mem_factory));
+  ASSERT_OK(MemTableRepFactory::CreateFromString(
+      config_options,
+      "id=prefix_hash; bucket_count=32; skiplist_height=64; "
+      "branching_factor=16",
+      &new_mem_factory));
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(
+      config_options,
+      "id=prefix_hash; bucket_count=32; skiplist_height=64; "
+      "branching_factor=16; invalid=unknown",
+      &new_mem_factory));
+
+  ASSERT_OK(MemTableRepFactory::CreateFromString(
+      config_options, "hash_linkedlist", &new_mem_factory));
+  ASSERT_OK(MemTableRepFactory::CreateFromString(
+      config_options, "hash_linkedlist:1000", &new_mem_factory));
+  ASSERT_STREQ(new_mem_factory->Name(), "HashLinkListRepFactory");
+  ASSERT_TRUE(new_mem_factory->IsInstanceOf("hash_linkedlist"));
+  ASSERT_TRUE(new_mem_factory->IsInstanceOf("HashLinkListRepFactory"));
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(
+      config_options, "hash_linkedlist:1000:invalid_opt", &new_mem_factory));
+  ASSERT_OK(MemTableRepFactory::CreateFromString(
+      config_options,
+      "id=hash_linkedlist; bucket_count=32; threshold=64; huge_page_size=16; "
+      "logging_threshold=12; log_when_flash=true",
+      &new_mem_factory));
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(
+      config_options,
+      "id=hash_linkedlist; bucket_count=32; threshold=64; huge_page_size=16; "
+      "logging_threshold=12; log_when_flash=true; invalid=unknown",
+      &new_mem_factory));
+
+  ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "vector",
+                                                 &new_mem_factory));
+  ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "vector:1024",
+                                                 &new_mem_factory));
+  ASSERT_STREQ(new_mem_factory->Name(), "VectorRepFactory");
+  ASSERT_TRUE(new_mem_factory->IsInstanceOf("vector"));
+  ASSERT_TRUE(new_mem_factory->IsInstanceOf("VectorRepFactory"));
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(
+      config_options, "vector:1024:invalid_opt", &new_mem_factory));
+  ASSERT_OK(MemTableRepFactory::CreateFromString(
+      config_options, "id=vector; count=42", &new_mem_factory));
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(
+      config_options, "id=vector; invalid=unknown", &new_mem_factory));
+#endif  // ROCKSDB_LITE
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "cuckoo",
+                                                  &new_mem_factory));
+  // CuckooHash memtable is already removed.
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "cuckoo:1024",
+                                                  &new_mem_factory));
+
+  ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "bad_factory",
+                                                  &new_mem_factory));
+}
+
+#ifndef ROCKSDB_LITE  // GetOptionsFromString is not supported in RocksDB Lite
+class CustomEnv : public EnvWrapper {
+ public:
+  explicit CustomEnv(Env* _target) : EnvWrapper(_target) {}
+  static const char* kClassName() { return "CustomEnv"; }
+  const char* Name() const override { return kClassName(); }
+};
+
+TEST_F(OptionsTest, GetOptionsFromStringTest) {
+  Options base_options, new_options;
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+
+  base_options.write_buffer_size = 20;
+  base_options.min_write_buffer_number_to_merge = 15;
+  BlockBasedTableOptions block_based_table_options;
+  block_based_table_options.cache_index_and_filter_blocks = true;
+  base_options.table_factory.reset(
+      NewBlockBasedTableFactory(block_based_table_options));
+
+  // Register an Env with object registry.
+  ObjectLibrary::Default()->AddFactory<Env>(
+      CustomEnv::kClassName(),
+      [](const std::string& /*name*/, std::unique_ptr<Env>* /*env_guard*/,
+         std::string* /* errmsg */) {
+        static CustomEnv env(Env::Default());
+        return &env;
+      });
+
+  ASSERT_OK(GetOptionsFromString(
+      config_options, base_options,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={block_cache=1M;block_size=4;};"
+      "compression_opts=4:5:6;create_if_missing=true;max_open_files=1;"
+      "bottommost_compression_opts=5:6:7;create_if_missing=true;max_open_files="
+      "1;"
+      "rate_limiter_bytes_per_sec=1024;env=CustomEnv",
+      &new_options));
+
+  ASSERT_EQ(new_options.compression_opts.window_bits, 4);
+  ASSERT_EQ(new_options.compression_opts.level, 5);
+  ASSERT_EQ(new_options.compression_opts.strategy, 6);
+  ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0u);
+  ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0u);
+  ASSERT_EQ(new_options.compression_opts.parallel_threads, 1u);
+  ASSERT_EQ(new_options.compression_opts.enabled, false);
+  ASSERT_EQ(new_options.compression_opts.use_zstd_dict_trainer, true);
+  ASSERT_EQ(new_options.bottommost_compression, kDisableCompressionOption);
+  ASSERT_EQ(new_options.bottommost_compression_opts.window_bits, 5);
+  ASSERT_EQ(new_options.bottommost_compression_opts.level, 6);
+  ASSERT_EQ(new_options.bottommost_compression_opts.strategy, 7);
+  ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0u);
+  ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0u);
+  ASSERT_EQ(new_options.bottommost_compression_opts.parallel_threads, 1u);
+  ASSERT_EQ(new_options.bottommost_compression_opts.enabled, false);
+  ASSERT_EQ(new_options.bottommost_compression_opts.use_zstd_dict_trainer,
+            true);
+  ASSERT_EQ(new_options.write_buffer_size, 10U);
+  ASSERT_EQ(new_options.max_write_buffer_number, 16);
+  const auto new_bbto =
+      new_options.table_factory->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(new_bbto, nullptr);
+  ASSERT_EQ(new_bbto->block_cache->GetCapacity(), 1U << 20);
+  ASSERT_EQ(new_bbto->block_size, 4U);
+  // don't overwrite block based table options
+  ASSERT_TRUE(new_bbto->cache_index_and_filter_blocks);
+
+  ASSERT_EQ(new_options.create_if_missing, true);
+  ASSERT_EQ(new_options.max_open_files, 1);
+  ASSERT_TRUE(new_options.rate_limiter.get() != nullptr);
+  Env* newEnv = new_options.env;
+  ASSERT_OK(Env::LoadEnv(CustomEnv::kClassName(), &newEnv));
+  ASSERT_EQ(newEnv, new_options.env);
+
+  config_options.ignore_unknown_options = false;
+  // Test a bad value for a DBOption returns a failure
+  base_options.dump_malloc_stats = false;
+  base_options.write_buffer_size = 1024;
+  Options bad_options = new_options;
+  Status s = GetOptionsFromString(config_options, base_options,
+                                  "create_if_missing=XX;dump_malloc_stats=true",
+                                  &bad_options);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_EQ(bad_options.dump_malloc_stats, false);
+
+  bad_options = new_options;
+  s = GetOptionsFromString(config_options, base_options,
+                           "write_buffer_size=XX;dump_malloc_stats=true",
+                           &bad_options);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  ASSERT_EQ(bad_options.dump_malloc_stats, false);
+
+  // Test a bad value for a TableFactory Option returns a failure
+  bad_options = new_options;
+  s = GetOptionsFromString(config_options, base_options,
+                           "write_buffer_size=16;dump_malloc_stats=true"
+                           "block_based_table_factory={block_size=XX;};",
+                           &bad_options);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_EQ(bad_options.dump_malloc_stats, false);
+  ASSERT_EQ(bad_options.write_buffer_size, 1024);
+
+  config_options.ignore_unknown_options = true;
+  ASSERT_OK(GetOptionsFromString(config_options, base_options,
+                                 "create_if_missing=XX;dump_malloc_stats=true;"
+                                 "write_buffer_size=XX;"
+                                 "block_based_table_factory={block_size=XX;};",
+                                 &bad_options));
+  ASSERT_EQ(bad_options.create_if_missing, base_options.create_if_missing);
+  ASSERT_EQ(bad_options.dump_malloc_stats, true);
+  ASSERT_EQ(bad_options.write_buffer_size, base_options.write_buffer_size);
+
+  // Test the old interface
+  ASSERT_OK(GetOptionsFromString(
+      base_options,
+      "write_buffer_size=22;max_write_buffer_number=33;max_open_files=44;",
+      &new_options));
+  ASSERT_EQ(new_options.write_buffer_size, 22U);
+  ASSERT_EQ(new_options.max_write_buffer_number, 33);
+  ASSERT_EQ(new_options.max_open_files, 44);
+}
+
+TEST_F(OptionsTest, DBOptionsSerialization) {
+  Options base_options, new_options;
+  Random rnd(301);
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+
+  // Phase 1: Make big change in base_options
+  test::RandomInitDBOptions(&base_options, &rnd);
+
+  // Phase 2: obtain a string from base_option
+  std::string base_options_file_content;
+  ASSERT_OK(GetStringFromDBOptions(config_options, base_options,
+                                   &base_options_file_content));
+
+  // Phase 3: Set new_options from the derived string and expect
+  //          new_options == base_options
+  ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(),
+                                   base_options_file_content, &new_options));
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_options,
+                                                  new_options));
+}
+
+TEST_F(OptionsTest, OptionsComposeDecompose) {
+  // build an Options from DBOptions + CFOptions, then decompose it to verify
+  // we get same constituent options.
+  DBOptions base_db_opts;
+  ColumnFamilyOptions base_cf_opts;
+  ConfigOptions
+      config_options;  // Use default for ignore(false) and check (exact)
+  config_options.input_strings_escaped = false;
+
+  Random rnd(301);
+  test::RandomInitDBOptions(&base_db_opts, &rnd);
+  test::RandomInitCFOptions(&base_cf_opts, base_db_opts, &rnd);
+
+  Options base_opts(base_db_opts, base_cf_opts);
+  DBOptions new_db_opts(base_opts);
+  ColumnFamilyOptions new_cf_opts(base_opts);
+
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_db_opts,
+                                                  new_db_opts));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opts,
+                                                  new_cf_opts));
+  delete new_cf_opts.compaction_filter;
+}
+
+TEST_F(OptionsTest, DBOptionsComposeImmutable) {
+  // Build a DBOptions from an Immutable/Mutable one and verify that
+  // we get same constituent options.
+  ConfigOptions config_options;
+  Random rnd(301);
+  DBOptions base_opts, new_opts;
+  test::RandomInitDBOptions(&base_opts, &rnd);
+  MutableDBOptions m_opts(base_opts);
+  ImmutableDBOptions i_opts(base_opts);
+  new_opts = BuildDBOptions(i_opts, m_opts);
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_opts,
+                                                  new_opts));
+}
+
+TEST_F(OptionsTest, GetMutableDBOptions) {
+  Random rnd(228);
+  DBOptions base_opts;
+  std::string opts_str;
+  std::unordered_map<std::string, std::string> opts_map;
+  ConfigOptions config_options;
+
+  test::RandomInitDBOptions(&base_opts, &rnd);
+  ImmutableDBOptions i_opts(base_opts);
+  MutableDBOptions m_opts(base_opts);
+  MutableDBOptions new_opts;
+  ASSERT_OK(GetStringFromMutableDBOptions(config_options, m_opts, &opts_str));
+  ASSERT_OK(StringToMap(opts_str, &opts_map));
+  ASSERT_OK(GetMutableDBOptionsFromStrings(m_opts, opts_map, &new_opts));
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(
+      config_options, base_opts, BuildDBOptions(i_opts, new_opts)));
+}
+
+TEST_F(OptionsTest, CFOptionsComposeImmutable) {
+  // Build a DBOptions from an Immutable/Mutable one and verify that
+  // we get same constituent options.
+  ConfigOptions config_options;
+  Random rnd(301);
+  ColumnFamilyOptions base_opts, new_opts;
+  DBOptions dummy;  // Needed to create ImmutableCFOptions
+  test::RandomInitCFOptions(&base_opts, dummy, &rnd);
+  MutableCFOptions m_opts(base_opts);
+  ImmutableCFOptions i_opts(base_opts);
+  UpdateColumnFamilyOptions(i_opts, &new_opts);
+  UpdateColumnFamilyOptions(m_opts, &new_opts);
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_opts,
+                                                  new_opts));
+  delete new_opts.compaction_filter;
+}
+
+TEST_F(OptionsTest, GetMutableCFOptions) {
+  Random rnd(228);
+  ColumnFamilyOptions base, copy;
+  std::string opts_str;
+  std::unordered_map<std::string, std::string> opts_map;
+  ConfigOptions config_options;
+  DBOptions dummy;  // Needed to create ImmutableCFOptions
+
+  test::RandomInitCFOptions(&base, dummy, &rnd);
+  ColumnFamilyOptions result;
+  MutableCFOptions m_opts(base), new_opts;
+
+  ASSERT_OK(GetStringFromMutableCFOptions(config_options, m_opts, &opts_str));
+  ASSERT_OK(StringToMap(opts_str, &opts_map));
+  ASSERT_OK(GetMutableOptionsFromStrings(m_opts, opts_map, nullptr, &new_opts));
+  UpdateColumnFamilyOptions(ImmutableCFOptions(base), &copy);
+  UpdateColumnFamilyOptions(new_opts, &copy);
+
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base, copy));
+  delete copy.compaction_filter;
+}
+
+TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) {
+  Options options;
+  ColumnFamilyOptions base_opt, new_opt;
+  Random rnd(302);
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+
+  // Phase 1: randomly assign base_opt
+  // custom type options
+  test::RandomInitCFOptions(&base_opt, options, &rnd);
+
+  // Phase 2: obtain a string from base_opt
+  std::string base_options_file_content;
+  ASSERT_OK(GetStringFromColumnFamilyOptions(config_options, base_opt,
+                                             &base_options_file_content));
+
+  // Phase 3: Set new_opt from the derived string and expect
+  //          new_opt == base_opt
+  ASSERT_OK(
+      GetColumnFamilyOptionsFromString(config_options, ColumnFamilyOptions(),
+                                       base_options_file_content, &new_opt));
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyCFOptions(config_options, base_opt, new_opt));
+  if (base_opt.compaction_filter) {
+    delete base_opt.compaction_filter;
+  }
+}
+
+TEST_F(OptionsTest, CheckBlockBasedTableOptions) {
+  ColumnFamilyOptions cf_opts;
+  DBOptions db_opts;
+  ConfigOptions config_opts;
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_opts, cf_opts, "prefix_extractor=capped:8", &cf_opts));
+  ASSERT_OK(TableFactory::CreateFromString(config_opts, "BlockBasedTable",
+                                           &cf_opts.table_factory));
+  ASSERT_NE(cf_opts.table_factory.get(), nullptr);
+  ASSERT_TRUE(cf_opts.table_factory->IsInstanceOf(
+      TableFactory::kBlockBasedTableName()));
+  auto bbto = cf_opts.table_factory->GetOptions<BlockBasedTableOptions>();
+  ASSERT_OK(cf_opts.table_factory->ConfigureFromString(
+      config_opts,
+      "block_cache={capacity=1M;num_shard_bits=4;};"
+      "block_size_deviation=101;"
+      "block_restart_interval=0;"
+      "index_block_restart_interval=5;"
+      "partition_filters=true;"
+      "index_type=kHashSearch;"
+      "no_block_cache=1;"));
+  ASSERT_NE(bbto, nullptr);
+  ASSERT_EQ(bbto->block_cache.get(), nullptr);
+  ASSERT_EQ(bbto->block_size_deviation, 0);
+  ASSERT_EQ(bbto->block_restart_interval, 1);
+  ASSERT_EQ(bbto->index_block_restart_interval, 1);
+  ASSERT_FALSE(bbto->partition_filters);
+  ASSERT_OK(TableFactory::CreateFromString(config_opts, "BlockBasedTable",
+                                           &cf_opts.table_factory));
+  bbto = cf_opts.table_factory->GetOptions<BlockBasedTableOptions>();
+
+  ASSERT_OK(cf_opts.table_factory->ConfigureFromString(config_opts,
+                                                       "no_block_cache=0;"));
+  ASSERT_NE(bbto->block_cache.get(), nullptr);
+  ASSERT_OK(cf_opts.table_factory->ValidateOptions(db_opts, cf_opts));
+}
+
+TEST_F(OptionsTest, MutableTableOptions) {
+  ConfigOptions config_options;
+  std::shared_ptr<TableFactory> bbtf;
+  bbtf.reset(NewBlockBasedTableFactory());
+  auto bbto = bbtf->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(bbto, nullptr);
+  ASSERT_OK(bbtf->ConfigureOption(config_options, "block_align", "true"));
+  ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024"));
+  ASSERT_EQ(bbto->block_align, true);
+  ASSERT_EQ(bbto->block_size, 1024);
+  ASSERT_OK(bbtf->PrepareOptions(config_options));
+  config_options.mutable_options_only = true;
+  ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024"));
+  ASSERT_EQ(bbto->block_align, true);
+  ASSERT_NOK(bbtf->ConfigureOption(config_options, "block_align", "false"));
+  ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "2048"));
+  ASSERT_EQ(bbto->block_align, true);
+  ASSERT_EQ(bbto->block_size, 2048);
+
+  ColumnFamilyOptions cf_opts;
+  cf_opts.table_factory = bbtf;
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      config_options, cf_opts, "block_based_table_factory.block_align=false",
+      &cf_opts));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, cf_opts, "block_based_table_factory.block_size=8192",
+      &cf_opts));
+  ASSERT_EQ(bbto->block_align, true);
+  ASSERT_EQ(bbto->block_size, 8192);
+}
+
+TEST_F(OptionsTest, MutableCFOptions) {
+  ConfigOptions config_options;
+  ColumnFamilyOptions cf_opts;
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      config_options, cf_opts,
+      "paranoid_file_checks=true; block_based_table_factory.block_align=false; "
+      "block_based_table_factory.block_size=8192;",
+      &cf_opts));
+  ASSERT_TRUE(cf_opts.paranoid_file_checks);
+  ASSERT_NE(cf_opts.table_factory.get(), nullptr);
+  const auto bbto = cf_opts.table_factory->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(bbto, nullptr);
+  ASSERT_EQ(bbto->block_size, 8192);
+  ASSERT_EQ(bbto->block_align, false);
+  std::unordered_map<std::string, std::string> unused_opts;
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(
+      config_options, cf_opts, {{"paranoid_file_checks", "false"}}, &cf_opts));
+  ASSERT_EQ(cf_opts.paranoid_file_checks, false);
+
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(
+      config_options, cf_opts,
+      {{"block_based_table_factory.block_size", "16384"}}, &cf_opts));
+  ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions<BlockBasedTableOptions>());
+  ASSERT_EQ(bbto->block_size, 16384);
+
+  config_options.mutable_options_only = true;
+  // Force consistency checks is not mutable
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(
+      config_options, cf_opts, {{"force_consistency_checks", "true"}},
+      &cf_opts));
+
+  // Attempt to change the table.  It is not mutable, so this should fail and
+  // leave the original intact
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(
+      config_options, cf_opts, {{"table_factory", "PlainTable"}}, &cf_opts));
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(
+      config_options, cf_opts, {{"table_factory.id", "PlainTable"}}, &cf_opts));
+  ASSERT_NE(cf_opts.table_factory.get(), nullptr);
+  ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions<BlockBasedTableOptions>());
+
+  // Change the block size.  Should update the value in the current table
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(
+      config_options, cf_opts,
+      {{"block_based_table_factory.block_size", "8192"}}, &cf_opts));
+  ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions<BlockBasedTableOptions>());
+  ASSERT_EQ(bbto->block_size, 8192);
+
+  // Attempt to turn off block cache fails, as this option is not mutable
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(
+      config_options, cf_opts,
+      {{"block_based_table_factory.no_block_cache", "true"}}, &cf_opts));
+  ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions<BlockBasedTableOptions>());
+
+  // Attempt to change the block size via a config string/map.  Should update
+  // the current value
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(
+      config_options, cf_opts,
+      {{"block_based_table_factory", "{block_size=32768}"}}, &cf_opts));
+  ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions<BlockBasedTableOptions>());
+  ASSERT_EQ(bbto->block_size, 32768);
+
+  // Attempt to change the block size and no cache through the map.  Should
+  // fail, leaving the old values intact
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(
+      config_options, cf_opts,
+      {{"block_based_table_factory",
+        "{block_size=16384; no_block_cache=true}"}},
+      &cf_opts));
+  ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions<BlockBasedTableOptions>());
+  ASSERT_EQ(bbto->block_size, 32768);
+}
+
+#endif  // !ROCKSDB_LITE
+
+Status StringToMap(
+    const std::string& opts_str,
+    std::unordered_map<std::string, std::string>* opts_map);
+
+#ifndef ROCKSDB_LITE  // StringToMap is not supported in ROCKSDB_LITE
+TEST_F(OptionsTest, StringToMapTest) {
+  std::unordered_map<std::string, std::string> opts_map;
+  // Regular options
+  ASSERT_OK(StringToMap("k1=v1;k2=v2;k3=v3", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "v2");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  // Value with '='
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1==v1;k2=v2=;", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "=v1");
+  ASSERT_EQ(opts_map["k2"], "v2=");
+  // Overwrriten option
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k1=v2;k3=v3", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v2");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  // Empty value
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=;k3=v3;k4=", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_TRUE(opts_map.find("k2") != opts_map.end());
+  ASSERT_EQ(opts_map["k2"], "");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  ASSERT_TRUE(opts_map.find("k4") != opts_map.end());
+  ASSERT_EQ(opts_map["k4"], "");
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=;k3=v3;k4=   ", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_TRUE(opts_map.find("k2") != opts_map.end());
+  ASSERT_EQ(opts_map["k2"], "");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  ASSERT_TRUE(opts_map.find("k4") != opts_map.end());
+  ASSERT_EQ(opts_map["k4"], "");
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=;k3=", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_TRUE(opts_map.find("k2") != opts_map.end());
+  ASSERT_EQ(opts_map["k2"], "");
+  ASSERT_TRUE(opts_map.find("k3") != opts_map.end());
+  ASSERT_EQ(opts_map["k3"], "");
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=;k3=;", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_TRUE(opts_map.find("k2") != opts_map.end());
+  ASSERT_EQ(opts_map["k2"], "");
+  ASSERT_TRUE(opts_map.find("k3") != opts_map.end());
+  ASSERT_EQ(opts_map["k3"], "");
+  // Regular nested options
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={nk1=nv1;nk2=nv2};k3=v3", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "nk1=nv1;nk2=nv2");
+  ASSERT_EQ(opts_map["k3"], "v3");
+  // Multi-level nested options
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={nk1=nv1;nk2={nnk1=nnk2}};"
+                        "k3={nk1={nnk1={nnnk1=nnnv1;nnnk2;nnnv2}}};k4=v4",
+                        &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "nk1=nv1;nk2={nnk1=nnk2}");
+  ASSERT_EQ(opts_map["k3"], "nk1={nnk1={nnnk1=nnnv1;nnnk2;nnnv2}}");
+  ASSERT_EQ(opts_map["k4"], "v4");
+  // Garbage inside curly braces
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={dfad=};k3={=};k4=v4",
+                        &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "dfad=");
+  ASSERT_EQ(opts_map["k3"], "=");
+  ASSERT_EQ(opts_map["k4"], "v4");
+  // Empty nested options
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={};", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "");
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2={{{{}}}{}{}};", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "{{{}}}{}{}");
+  // With random spaces
+  opts_map.clear();
+  ASSERT_OK(StringToMap("  k1 =  v1 ; k2= {nk1=nv1; nk2={nnk1=nnk2}}  ; "
+                        "k3={  {   } }; k4= v4  ",
+                        &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "nk1=nv1; nk2={nnk1=nnk2}");
+  ASSERT_EQ(opts_map["k3"], "{   }");
+  ASSERT_EQ(opts_map["k4"], "v4");
+
+  // Empty key
+  ASSERT_NOK(StringToMap("k1=v1;k2=v2;=", &opts_map));
+  ASSERT_NOK(StringToMap("=v1;k2=v2", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2v2;", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2=v2;fadfa", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2=v2;;", &opts_map));
+  // Mismatch curly braces
+  ASSERT_NOK(StringToMap("k1=v1;k2={;k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{};k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={}};k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}{}}};k3=v3", &opts_map));
+  // However this is valid!
+  opts_map.clear();
+  ASSERT_OK(StringToMap("k1=v1;k2=};k3=v3", &opts_map));
+  ASSERT_EQ(opts_map["k1"], "v1");
+  ASSERT_EQ(opts_map["k2"], "}");
+  ASSERT_EQ(opts_map["k3"], "v3");
+
+  // Invalid chars after closing curly brace
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}{};k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}cfda;k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}  cfda;k3=v3", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}  cfda", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{}}{}", &opts_map));
+  ASSERT_NOK(StringToMap("k1=v1;k2={{dfdl}adfa}{}", &opts_map));
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE  // StringToMap is not supported in ROCKSDB_LITE
+TEST_F(OptionsTest, StringToMapRandomTest) {
+  std::unordered_map<std::string, std::string> opts_map;
+  // Make sure segfault is not hit by semi-random strings
+
+  std::vector<std::string> bases = {
+      "a={aa={};tt={xxx={}}};c=defff",
+      "a={aa={};tt={xxx={}}};c=defff;d={{}yxx{}3{xx}}",
+      "abc={{}{}{}{{{}}}{{}{}{}{}{}{}{}"};
+
+  for (std::string base : bases) {
+    for (int rand_seed = 301; rand_seed < 401; rand_seed++) {
+      Random rnd(rand_seed);
+      for (int attempt = 0; attempt < 10; attempt++) {
+        std::string str = base;
+        // Replace random position to space
+        size_t pos = static_cast<size_t>(
+            rnd.Uniform(static_cast<int>(base.size())));
+        str[pos] = ' ';
+        Status s = StringToMap(str, &opts_map);
+        ASSERT_TRUE(s.ok() || s.IsInvalidArgument());
+        opts_map.clear();
+      }
+    }
+  }
+
+  // Random Construct a string
+  std::vector<char> chars = {'{', '}', ' ', '=', ';', 'c'};
+  for (int rand_seed = 301; rand_seed < 1301; rand_seed++) {
+    Random rnd(rand_seed);
+    int len = rnd.Uniform(30);
+    std::string str = "";
+    for (int attempt = 0; attempt < len; attempt++) {
+      // Add a random character
+      size_t pos = static_cast<size_t>(
+          rnd.Uniform(static_cast<int>(chars.size())));
+      str.append(1, chars[pos]);
+    }
+    Status s = StringToMap(str, &opts_map);
+    ASSERT_TRUE(s.ok() || s.IsInvalidArgument());
+    s = StringToMap("name=" + str, &opts_map);
+    ASSERT_TRUE(s.ok() || s.IsInvalidArgument());
+    opts_map.clear();
+  }
+}
+
+TEST_F(OptionsTest, GetStringFromCompressionType) {
+  std::string res;
+
+  ASSERT_OK(GetStringFromCompressionType(&res, kNoCompression));
+  ASSERT_EQ(res, "kNoCompression");
+
+  ASSERT_OK(GetStringFromCompressionType(&res, kSnappyCompression));
+  ASSERT_EQ(res, "kSnappyCompression");
+
+  ASSERT_OK(GetStringFromCompressionType(&res, kDisableCompressionOption));
+  ASSERT_EQ(res, "kDisableCompressionOption");
+
+  ASSERT_OK(GetStringFromCompressionType(&res, kLZ4Compression));
+  ASSERT_EQ(res, "kLZ4Compression");
+
+  ASSERT_OK(GetStringFromCompressionType(&res, kZlibCompression));
+  ASSERT_EQ(res, "kZlibCompression");
+
+  ASSERT_NOK(
+      GetStringFromCompressionType(&res, static_cast<CompressionType>(-10)));
+}
+
+TEST_F(OptionsTest, OnlyMutableDBOptions) {
+  std::string opt_str;
+  Random rnd(302);
+  ConfigOptions cfg_opts;
+  DBOptions db_opts;
+  DBOptions mdb_opts;
+  std::unordered_set<std::string> m_names;
+  std::unordered_set<std::string> a_names;
+
+  test::RandomInitDBOptions(&db_opts, &rnd);
+  auto db_config = DBOptionsAsConfigurable(db_opts);
+
+  // Get all of the DB Option names (mutable or not)
+  ASSERT_OK(db_config->GetOptionNames(cfg_opts, &a_names));
+
+  // Get only the mutable options from db_opts and set those in mdb_opts
+  cfg_opts.mutable_options_only = true;
+
+  // Get only the Mutable DB Option names
+  ASSERT_OK(db_config->GetOptionNames(cfg_opts, &m_names));
+  ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opt_str));
+  ASSERT_OK(GetDBOptionsFromString(cfg_opts, mdb_opts, opt_str, &mdb_opts));
+  std::string mismatch;
+  // Comparing only the mutable options, the two are equivalent
+  auto mdb_config = DBOptionsAsConfigurable(mdb_opts);
+  ASSERT_TRUE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch));
+  ASSERT_TRUE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch));
+
+  ASSERT_GT(a_names.size(), m_names.size());
+  for (const auto& n : m_names) {
+    std::string m, d;
+    ASSERT_OK(mdb_config->GetOption(cfg_opts, n, &m));
+    ASSERT_OK(db_config->GetOption(cfg_opts, n, &d));
+    ASSERT_EQ(m, d);
+  }
+
+  cfg_opts.mutable_options_only = false;
+  // Comparing all of the options, the two are not equivalent
+  ASSERT_FALSE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch));
+  ASSERT_FALSE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch));
+
+  // Make sure there are only mutable options being configured
+  ASSERT_OK(GetDBOptionsFromString(cfg_opts, DBOptions(), opt_str, &db_opts));
+}
+
+TEST_F(OptionsTest, OnlyMutableCFOptions) {
+  std::string opt_str;
+  Random rnd(302);
+  ConfigOptions cfg_opts;
+  DBOptions db_opts;
+  ColumnFamilyOptions mcf_opts;
+  ColumnFamilyOptions cf_opts;
+  std::unordered_set<std::string> m_names;
+  std::unordered_set<std::string> a_names;
+
+  test::RandomInitCFOptions(&cf_opts, db_opts, &rnd);
+  cf_opts.comparator = ReverseBytewiseComparator();
+  auto cf_config = CFOptionsAsConfigurable(cf_opts);
+
+  // Get all of the CF Option names (mutable or not)
+  ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &a_names));
+
+  // Get only the mutable options from cf_opts and set those in mcf_opts
+  cfg_opts.mutable_options_only = true;
+  // Get only the Mutable CF Option names
+  ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &m_names));
+  ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, cf_opts, &opt_str));
+  ASSERT_OK(
+      GetColumnFamilyOptionsFromString(cfg_opts, mcf_opts, opt_str, &mcf_opts));
+  std::string mismatch;
+
+  auto mcf_config = CFOptionsAsConfigurable(mcf_opts);
+  // Comparing only the mutable options, the two are equivalent
+  ASSERT_TRUE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch));
+  ASSERT_TRUE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch));
+
+  ASSERT_GT(a_names.size(), m_names.size());
+  for (const auto& n : m_names) {
+    std::string m, d;
+    ASSERT_OK(mcf_config->GetOption(cfg_opts, n, &m));
+    ASSERT_OK(cf_config->GetOption(cfg_opts, n, &d));
+    ASSERT_EQ(m, d);
+  }
+
+  cfg_opts.mutable_options_only = false;
+  // Comparing all of the options, the two are not equivalent
+  ASSERT_FALSE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch));
+  ASSERT_FALSE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch));
+  delete cf_opts.compaction_filter;
+
+  // Make sure the options string contains only mutable options
+  ASSERT_OK(GetColumnFamilyOptionsFromString(cfg_opts, ColumnFamilyOptions(),
+                                             opt_str, &cf_opts));
+  delete cf_opts.compaction_filter;
+}
+
+TEST_F(OptionsTest, SstPartitionerTest) {
+  ConfigOptions cfg_opts;
+  ColumnFamilyOptions cf_opts, new_opt;
+  std::string opts_str, mismatch;
+
+  ASSERT_OK(SstPartitionerFactory::CreateFromString(
+      cfg_opts, SstPartitionerFixedPrefixFactory::kClassName(),
+      &cf_opts.sst_partitioner_factory));
+  ASSERT_NE(cf_opts.sst_partitioner_factory, nullptr);
+  ASSERT_STREQ(cf_opts.sst_partitioner_factory->Name(),
+               SstPartitionerFixedPrefixFactory::kClassName());
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(
+      cfg_opts, ColumnFamilyOptions(),
+      std::string("sst_partitioner_factory={id=") +
+          SstPartitionerFixedPrefixFactory::kClassName() + "; unknown=10;}",
+      &cf_opts));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      cfg_opts, ColumnFamilyOptions(),
+      std::string("sst_partitioner_factory={id=") +
+          SstPartitionerFixedPrefixFactory::kClassName() + "; length=10;}",
+      &cf_opts));
+  ASSERT_NE(cf_opts.sst_partitioner_factory, nullptr);
+  ASSERT_STREQ(cf_opts.sst_partitioner_factory->Name(),
+               SstPartitionerFixedPrefixFactory::kClassName());
+  ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, cf_opts, &opts_str));
+  ASSERT_OK(
+      GetColumnFamilyOptionsFromString(cfg_opts, cf_opts, opts_str, &new_opt));
+  ASSERT_NE(new_opt.sst_partitioner_factory, nullptr);
+  ASSERT_STREQ(new_opt.sst_partitioner_factory->Name(),
+               SstPartitionerFixedPrefixFactory::kClassName());
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, cf_opts, new_opt));
+  ASSERT_TRUE(cf_opts.sst_partitioner_factory->AreEquivalent(
+      cfg_opts, new_opt.sst_partitioner_factory.get(), &mismatch));
+}
+
+TEST_F(OptionsTest, FileChecksumGenFactoryTest) {
+  ConfigOptions cfg_opts;
+  DBOptions db_opts, new_opt;
+  std::string opts_str, mismatch;
+  auto factory = GetFileChecksumGenCrc32cFactory();
+
+  cfg_opts.ignore_unsupported_options = false;
+
+  ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opts_str));
+  ASSERT_OK(GetDBOptionsFromString(cfg_opts, db_opts, opts_str, &new_opt));
+
+  ASSERT_NE(factory, nullptr);
+  ASSERT_OK(FileChecksumGenFactory::CreateFromString(
+      cfg_opts, factory->Name(), &db_opts.file_checksum_gen_factory));
+  ASSERT_NE(db_opts.file_checksum_gen_factory, nullptr);
+  ASSERT_STREQ(db_opts.file_checksum_gen_factory->Name(), factory->Name());
+  ASSERT_NOK(GetDBOptionsFromString(
+      cfg_opts, DBOptions(), "file_checksum_gen_factory=unknown", &db_opts));
+  ASSERT_OK(GetDBOptionsFromString(
+      cfg_opts, DBOptions(),
+      std::string("file_checksum_gen_factory=") + factory->Name(), &db_opts));
+  ASSERT_NE(db_opts.file_checksum_gen_factory, nullptr);
+  ASSERT_STREQ(db_opts.file_checksum_gen_factory->Name(), factory->Name());
+
+  ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opts_str));
+  ASSERT_OK(GetDBOptionsFromString(cfg_opts, db_opts, opts_str, &new_opt));
+  ASSERT_NE(new_opt.file_checksum_gen_factory, nullptr);
+  ASSERT_STREQ(new_opt.file_checksum_gen_factory->Name(), factory->Name());
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(cfg_opts, db_opts, new_opt));
+  ASSERT_TRUE(factory->AreEquivalent(
+      cfg_opts, new_opt.file_checksum_gen_factory.get(), &mismatch));
+  ASSERT_TRUE(db_opts.file_checksum_gen_factory->AreEquivalent(
+      cfg_opts, new_opt.file_checksum_gen_factory.get(), &mismatch));
+}
+
+class TestTablePropertiesCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ private:
+  std::string id_;
+
+ public:
+  explicit TestTablePropertiesCollectorFactory(const std::string& id)
+      : id_(id) {}
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    return nullptr;
+  }
+  static const char* kClassName() { return "TestCollector"; }
+  const char* Name() const override { return kClassName(); }
+  std::string GetId() const override {
+    return std::string(kClassName()) + ":" + id_;
+  }
+};
+
+TEST_F(OptionsTest, OptionTablePropertiesTest) {
+  ConfigOptions cfg_opts;
+  ColumnFamilyOptions orig, copy;
+  orig.table_properties_collector_factories.push_back(
+      std::make_shared<TestTablePropertiesCollectorFactory>("1"));
+  orig.table_properties_collector_factories.push_back(
+      std::make_shared<TestTablePropertiesCollectorFactory>("2"));
+
+  // Push two TablePropertiesCollectorFactories then create a new
+  // ColumnFamilyOptions based on those settings.  The copy should
+  // have no properties but still match the original
+  std::string opts_str;
+  ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, orig, &opts_str));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(cfg_opts, orig, opts_str, &copy));
+  ASSERT_EQ(copy.table_properties_collector_factories.size(), 0);
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, orig, copy));
+
+  // Now register a TablePropertiesCollectorFactory
+  // Repeat the experiment.  The copy should have the same
+  // properties as the original
+  cfg_opts.registry->AddLibrary("collector")
+      ->AddFactory<TablePropertiesCollectorFactory>(
+          ObjectLibrary::PatternEntry(
+              TestTablePropertiesCollectorFactory::kClassName(), false)
+              .AddSeparator(":"),
+          [](const std::string& name,
+             std::unique_ptr<TablePropertiesCollectorFactory>* guard,
+             std::string* /* errmsg */) {
+            std::string id = name.substr(
+                strlen(TestTablePropertiesCollectorFactory::kClassName()) + 1);
+            guard->reset(new TestTablePropertiesCollectorFactory(id));
+            return guard->get();
+          });
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(cfg_opts, orig, opts_str, &copy));
+  ASSERT_EQ(copy.table_properties_collector_factories.size(), 2);
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, orig, copy));
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_F(OptionsTest, ConvertOptionsTest) {
+  LevelDBOptions leveldb_opt;
+  Options converted_opt = ConvertOptions(leveldb_opt);
+
+  ASSERT_EQ(converted_opt.create_if_missing, leveldb_opt.create_if_missing);
+  ASSERT_EQ(converted_opt.error_if_exists, leveldb_opt.error_if_exists);
+  ASSERT_EQ(converted_opt.paranoid_checks, leveldb_opt.paranoid_checks);
+  ASSERT_EQ(converted_opt.env, leveldb_opt.env);
+  ASSERT_EQ(converted_opt.info_log.get(), leveldb_opt.info_log);
+  ASSERT_EQ(converted_opt.write_buffer_size, leveldb_opt.write_buffer_size);
+  ASSERT_EQ(converted_opt.max_open_files, leveldb_opt.max_open_files);
+  ASSERT_EQ(converted_opt.compression, leveldb_opt.compression);
+
+  std::shared_ptr<TableFactory> table_factory = converted_opt.table_factory;
+  const auto table_opt = table_factory->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(table_opt, nullptr);
+
+  ASSERT_EQ(table_opt->block_cache->GetCapacity(), 8UL << 20);
+  ASSERT_EQ(table_opt->block_size, leveldb_opt.block_size);
+  ASSERT_EQ(table_opt->block_restart_interval,
+            leveldb_opt.block_restart_interval);
+  ASSERT_EQ(table_opt->filter_policy.get(), leveldb_opt.filter_policy);
+}
+#ifndef ROCKSDB_LITE
+class TestEventListener : public EventListener {
+ private:
+  std::string id_;
+
+ public:
+  explicit TestEventListener(const std::string& id) : id_("Test" + id) {}
+  const char* Name() const override { return id_.c_str(); }
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    test_listener_option_info = {
+        {"s",
+         {0, OptionType::kString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+
+};
+
+class TestConfigEventListener : public TestEventListener {
+ private:
+  std::string s_;
+
+ public:
+  explicit TestConfigEventListener(const std::string& id)
+      : TestEventListener("Config" + id) {
+    s_ = id;
+    RegisterOptions("Test", &s_, &test_listener_option_info);
+  }
+};
+
+static int RegisterTestEventListener(ObjectLibrary& library,
+                                     const std::string& arg) {
+  library.AddFactory<EventListener>(
+      "Test" + arg,
+      [](const std::string& name, std::unique_ptr<EventListener>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TestEventListener(name.substr(4)));
+        return guard->get();
+      });
+  library.AddFactory<EventListener>(
+      "TestConfig" + arg,
+      [](const std::string& name, std::unique_ptr<EventListener>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TestConfigEventListener(name.substr(10)));
+        return guard->get();
+      });
+  return 1;
+}
+TEST_F(OptionsTest, OptionsListenerTest) {
+  DBOptions orig, copy;
+  orig.listeners.push_back(std::make_shared<TestEventListener>("1"));
+  orig.listeners.push_back(std::make_shared<TestEventListener>("2"));
+  orig.listeners.push_back(std::make_shared<TestEventListener>(""));
+  orig.listeners.push_back(std::make_shared<TestConfigEventListener>("1"));
+  orig.listeners.push_back(std::make_shared<TestConfigEventListener>("2"));
+  orig.listeners.push_back(std::make_shared<TestConfigEventListener>(""));
+  ConfigOptions config_opts(orig);
+  config_opts.registry->AddLibrary("listener", RegisterTestEventListener, "1");
+  std::string opts_str;
+  ASSERT_OK(GetStringFromDBOptions(config_opts, orig, &opts_str));
+  ASSERT_OK(GetDBOptionsFromString(config_opts, orig, opts_str, &copy));
+  ASSERT_OK(GetStringFromDBOptions(config_opts, copy, &opts_str));
+  ASSERT_EQ(
+      copy.listeners.size(),
+      2);  // The Test{Config}1 Listeners could be loaded but not the others
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts, orig, copy));
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+const static std::string kCustomEnvName = "Custom";
+const static std::string kCustomEnvProp = "env=" + kCustomEnvName;
+
+static int RegisterCustomEnv(ObjectLibrary& library, const std::string& arg) {
+  library.AddFactory<Env>(
+      arg, [](const std::string& /*name*/, std::unique_ptr<Env>* /*env_guard*/,
+              std::string* /* errmsg */) {
+        static CustomEnv env(Env::Default());
+        return &env;
+      });
+  return 1;
+}
+
+// This test suite tests the old APIs into the Configure options methods.
+// Once those APIs are officially deprecated, this test suite can be deleted.
+class OptionsOldApiTest : public testing::Test {};
+
+TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
+  std::unordered_map<std::string, std::string> cf_options_map = {
+      {"write_buffer_size", "1"},
+      {"max_write_buffer_number", "2"},
+      {"min_write_buffer_number_to_merge", "3"},
+      {"max_write_buffer_number_to_maintain", "99"},
+      {"max_write_buffer_size_to_maintain", "-99999"},
+      {"compression", "kSnappyCompression"},
+      {"compression_per_level",
+       "kNoCompression:"
+       "kSnappyCompression:"
+       "kZlibCompression:"
+       "kBZip2Compression:"
+       "kLZ4Compression:"
+       "kLZ4HCCompression:"
+       "kXpressCompression:"
+       "kZSTD:"
+       "kZSTDNotFinalCompression"},
+      {"bottommost_compression", "kLZ4Compression"},
+      {"bottommost_compression_opts", "5:6:7:8:9:true"},
+      {"compression_opts", "4:5:6:7:8:9:true:10:false"},
+      {"num_levels", "8"},
+      {"level0_file_num_compaction_trigger", "8"},
+      {"level0_slowdown_writes_trigger", "9"},
+      {"level0_stop_writes_trigger", "10"},
+      {"target_file_size_base", "12"},
+      {"target_file_size_multiplier", "13"},
+      {"max_bytes_for_level_base", "14"},
+      {"level_compaction_dynamic_level_bytes", "true"},
+      {"level_compaction_dynamic_file_size", "true"},
+      {"max_bytes_for_level_multiplier", "15.0"},
+      {"max_bytes_for_level_multiplier_additional", "16:17:18"},
+      {"max_compaction_bytes", "21"},
+      {"soft_rate_limit", "1.1"},
+      {"hard_rate_limit", "2.1"},
+      {"rate_limit_delay_max_milliseconds", "100"},
+      {"hard_pending_compaction_bytes_limit", "211"},
+      {"arena_block_size", "22"},
+      {"disable_auto_compactions", "true"},
+      {"compaction_style", "kCompactionStyleLevel"},
+      {"compaction_pri", "kOldestSmallestSeqFirst"},
+      {"verify_checksums_in_compaction", "false"},
+      {"compaction_options_fifo", "23"},
+      {"max_sequential_skip_in_iterations", "24"},
+      {"inplace_update_support", "true"},
+      {"report_bg_io_stats", "true"},
+      {"compaction_measure_io_stats", "false"},
+      {"purge_redundant_kvs_while_flush", "false"},
+      {"inplace_update_num_locks", "25"},
+      {"memtable_prefix_bloom_size_ratio", "0.26"},
+      {"memtable_whole_key_filtering", "true"},
+      {"memtable_huge_page_size", "28"},
+      {"bloom_locality", "29"},
+      {"max_successive_merges", "30"},
+      {"min_partial_merge_operands", "31"},
+      {"prefix_extractor", "fixed:31"},
+      {"experimental_mempurge_threshold", "0.003"},
+      {"optimize_filters_for_hits", "true"},
+      {"enable_blob_files", "true"},
+      {"min_blob_size", "1K"},
+      {"blob_file_size", "1G"},
+      {"blob_compression_type", "kZSTD"},
+      {"enable_blob_garbage_collection", "true"},
+      {"blob_garbage_collection_age_cutoff", "0.5"},
+      {"blob_garbage_collection_force_threshold", "0.75"},
+      {"blob_compaction_readahead_size", "256K"},
+      {"blob_file_starting_level", "1"},
+      {"prepopulate_blob_cache", "kDisable"},
+      {"last_level_temperature", "kWarm"},
+  };
+
+  std::unordered_map<std::string, std::string> db_options_map = {
+      {"create_if_missing", "false"},
+      {"create_missing_column_families", "true"},
+      {"error_if_exists", "false"},
+      {"paranoid_checks", "true"},
+      {"track_and_verify_wals_in_manifest", "true"},
+      {"verify_sst_unique_id_in_manifest", "true"},
+      {"max_open_files", "32"},
+      {"max_total_wal_size", "33"},
+      {"use_fsync", "true"},
+      {"db_log_dir", "/db_log_dir"},
+      {"wal_dir", "/wal_dir"},
+      {"delete_obsolete_files_period_micros", "34"},
+      {"max_background_compactions", "35"},
+      {"max_background_flushes", "36"},
+      {"max_log_file_size", "37"},
+      {"log_file_time_to_roll", "38"},
+      {"keep_log_file_num", "39"},
+      {"recycle_log_file_num", "5"},
+      {"max_manifest_file_size", "40"},
+      {"table_cache_numshardbits", "41"},
+      {"WAL_ttl_seconds", "43"},
+      {"WAL_size_limit_MB", "44"},
+      {"manifest_preallocation_size", "45"},
+      {"allow_mmap_reads", "true"},
+      {"allow_mmap_writes", "false"},
+      {"use_direct_reads", "false"},
+      {"use_direct_io_for_flush_and_compaction", "false"},
+      {"is_fd_close_on_exec", "true"},
+      {"skip_log_error_on_recovery", "false"},
+      {"stats_dump_period_sec", "46"},
+      {"stats_persist_period_sec", "57"},
+      {"persist_stats_to_disk", "false"},
+      {"stats_history_buffer_size", "69"},
+      {"advise_random_on_open", "true"},
+      {"use_adaptive_mutex", "false"},
+      {"compaction_readahead_size", "100"},
+      {"random_access_max_buffer_size", "3145728"},
+      {"writable_file_max_buffer_size", "314159"},
+      {"bytes_per_sync", "47"},
+      {"wal_bytes_per_sync", "48"},
+      {"strict_bytes_per_sync", "true"},
+      {"preserve_deletes", "false"},
+  };
+
+  ColumnFamilyOptions base_cf_opt;
+  ColumnFamilyOptions new_cf_opt;
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(
+            base_cf_opt, cf_options_map, &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 1U);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2);
+  ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999);
+  ASSERT_EQ(new_cf_opt.compression, kSnappyCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level.size(), 9U);
+  ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[1], kSnappyCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[2], kZlibCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[3], kBZip2Compression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[4], kLZ4Compression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[5], kLZ4HCCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[6], kXpressCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[7], kZSTD);
+  ASSERT_EQ(new_cf_opt.compression_per_level[8], kZSTDNotFinalCompression);
+  ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4);
+  ASSERT_EQ(new_cf_opt.compression_opts.level, 5);
+  ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6);
+  ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u);
+  ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u);
+  ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, 9u);
+  ASSERT_EQ(new_cf_opt.compression_opts.enabled, true);
+  ASSERT_EQ(new_cf_opt.compression_opts.max_dict_buffer_bytes, 10u);
+  ASSERT_EQ(new_cf_opt.compression_opts.use_zstd_dict_trainer, false);
+  ASSERT_EQ(new_cf_opt.bottommost_compression, kLZ4Compression);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads,
+            CompressionOptions().parallel_threads);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_buffer_bytes,
+            CompressionOptions().max_dict_buffer_bytes);
+  ASSERT_EQ(new_cf_opt.bottommost_compression_opts.use_zstd_dict_trainer,
+            CompressionOptions().use_zstd_dict_trainer);
+  ASSERT_EQ(new_cf_opt.num_levels, 8);
+  ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8);
+  ASSERT_EQ(new_cf_opt.level0_slowdown_writes_trigger, 9);
+  ASSERT_EQ(new_cf_opt.level0_stop_writes_trigger, 10);
+  ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast<uint64_t>(12));
+  ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U);
+  ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true);
+  ASSERT_EQ(new_cf_opt.level_compaction_dynamic_file_size, true);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[0], 16);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[1], 17);
+  ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[2], 18);
+  ASSERT_EQ(new_cf_opt.max_compaction_bytes, 21);
+  ASSERT_EQ(new_cf_opt.hard_pending_compaction_bytes_limit, 211);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 22U);
+  ASSERT_EQ(new_cf_opt.disable_auto_compactions, true);
+  ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel);
+  ASSERT_EQ(new_cf_opt.compaction_pri, kOldestSmallestSeqFirst);
+  ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size,
+            static_cast<uint64_t>(23));
+  ASSERT_EQ(new_cf_opt.max_sequential_skip_in_iterations,
+            static_cast<uint64_t>(24));
+  ASSERT_EQ(new_cf_opt.inplace_update_support, true);
+  ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U);
+  ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_size_ratio, 0.26);
+  ASSERT_EQ(new_cf_opt.memtable_whole_key_filtering, true);
+  ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U);
+  ASSERT_EQ(new_cf_opt.bloom_locality, 29U);
+  ASSERT_EQ(new_cf_opt.max_successive_merges, 30U);
+  ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr);
+  ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
+  ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");
+  ASSERT_EQ(new_cf_opt.experimental_mempurge_threshold, 0.003);
+  ASSERT_EQ(new_cf_opt.enable_blob_files, true);
+  ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10);
+  ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30);
+  ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD);
+  ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true);
+  ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5);
+  ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75);
+  ASSERT_EQ(new_cf_opt.blob_compaction_readahead_size, 262144);
+  ASSERT_EQ(new_cf_opt.blob_file_starting_level, 1);
+  ASSERT_EQ(new_cf_opt.prepopulate_blob_cache, PrepopulateBlobCache::kDisable);
+  ASSERT_EQ(new_cf_opt.last_level_temperature, Temperature::kWarm);
+  ASSERT_EQ(new_cf_opt.bottommost_temperature, Temperature::kWarm);
+
+  cf_options_map["write_buffer_size"] = "hello";
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(
+             base_cf_opt, cf_options_map, &new_cf_opt));
+  ConfigOptions exact, loose;
+  exact.sanity_level = ConfigOptions::kSanityLevelExactMatch;
+  loose.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible;
+
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  cf_options_map["write_buffer_size"] = "1";
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(
+            base_cf_opt, cf_options_map, &new_cf_opt));
+
+  cf_options_map["unknown_option"] = "1";
+  ASSERT_NOK(GetColumnFamilyOptionsFromMap(
+             base_cf_opt, cf_options_map, &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map,
+                                          &new_cf_opt,
+                                          false, /* input_strings_escaped  */
+                                          true /* ignore_unknown_options */));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+      loose, base_cf_opt, new_cf_opt, nullptr /* new_opt_map */));
+  ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+      exact /* default for VerifyCFOptions */, base_cf_opt, new_cf_opt, nullptr));
+
+  DBOptions base_db_opt;
+  DBOptions new_db_opt;
+  ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt));
+  ASSERT_EQ(new_db_opt.create_if_missing, false);
+  ASSERT_EQ(new_db_opt.create_missing_column_families, true);
+  ASSERT_EQ(new_db_opt.error_if_exists, false);
+  ASSERT_EQ(new_db_opt.paranoid_checks, true);
+  ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true);
+  ASSERT_EQ(new_db_opt.max_open_files, 32);
+  ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast<uint64_t>(33));
+  ASSERT_EQ(new_db_opt.use_fsync, true);
+  ASSERT_EQ(new_db_opt.db_log_dir, "/db_log_dir");
+  ASSERT_EQ(new_db_opt.wal_dir, "/wal_dir");
+  ASSERT_EQ(new_db_opt.delete_obsolete_files_period_micros,
+            static_cast<uint64_t>(34));
+  ASSERT_EQ(new_db_opt.max_background_compactions, 35);
+  ASSERT_EQ(new_db_opt.max_background_flushes, 36);
+  ASSERT_EQ(new_db_opt.max_log_file_size, 37U);
+  ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
+  ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
+  ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
+  ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
+  ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
+  ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
+  ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
+  ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U);
+  ASSERT_EQ(new_db_opt.allow_mmap_reads, true);
+  ASSERT_EQ(new_db_opt.allow_mmap_writes, false);
+  ASSERT_EQ(new_db_opt.use_direct_reads, false);
+  ASSERT_EQ(new_db_opt.use_direct_io_for_flush_and_compaction, false);
+  ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true);
+  ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U);
+  ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U);
+  ASSERT_EQ(new_db_opt.persist_stats_to_disk, false);
+  ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U);
+  ASSERT_EQ(new_db_opt.advise_random_on_open, true);
+  ASSERT_EQ(new_db_opt.use_adaptive_mutex, false);
+  ASSERT_EQ(new_db_opt.compaction_readahead_size, 100);
+  ASSERT_EQ(new_db_opt.random_access_max_buffer_size, 3145728);
+  ASSERT_EQ(new_db_opt.writable_file_max_buffer_size, 314159);
+  ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
+  ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
+  ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true);
+
+  db_options_map["max_open_files"] = "hello";
+  ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt));
+
+  // unknow options should fail parsing without ignore_unknown_options = true
+  db_options_map["unknown_db_option"] = "1";
+  ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt));
+
+  ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt,
+                                false, /* input_strings_escaped  */
+                                true /* ignore_unknown_options */));
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt));
+  ASSERT_NOK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt));
+}
+
+TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) {
+  ColumnFamilyOptions base_cf_opt;
+  ColumnFamilyOptions new_cf_opt;
+  base_cf_opt.table_factory.reset();
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=5", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 5U);
+  ASSERT_TRUE(new_cf_opt.table_factory == nullptr);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=6;", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 6U);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "  write_buffer_size =  7  ", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 7U);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "  write_buffer_size =  8 ; ", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 8U);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 9U);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10);
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=11; max_write_buffer_number  =  12 ;",
+            &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 11U);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12);
+  // Wrong name "max_write_buffer_number_"
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=13;max_write_buffer_number_=14;",
+              &new_cf_opt));
+  ConfigOptions exact;
+  exact.sanity_level = ConfigOptions::kSanityLevelExactMatch;
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  // Comparator from object registry
+  std::string kCompName = "reverse_comp";
+  ObjectLibrary::Default()->AddFactory<const Comparator>(
+      kCompName,
+      [](const std::string& /*name*/,
+         std::unique_ptr<const Comparator>* /*guard*/,
+         std::string* /* errmsg */) { return ReverseBytewiseComparator(); });
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.comparator, ReverseBytewiseComparator());
+
+  // MergeOperator from object registry
+  std::unique_ptr<BytesXOROperator> bxo(new BytesXOROperator());
+  std::string kMoName = bxo->Name();
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      base_cf_opt, "merge_operator=" + kMoName + ";", &new_cf_opt));
+  ASSERT_EQ(kMoName, std::string(new_cf_opt.merge_operator->Name()));
+
+  // Wrong key/value pair
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  // Error Paring value
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  // Missing option name
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=13; =100;", &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  const uint64_t kilo = 1024UL;
+  const uint64_t mega = 1024 * kilo;
+  const uint64_t giga = 1024 * mega;
+  const uint64_t tera = 1024 * giga;
+
+  // Units (k)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 15 * kilo);
+  // Units (m)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "max_write_buffer_number=16m;inplace_update_num_locks=17M",
+            &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega);
+  ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17u * mega);
+  // Units (g)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      base_cf_opt,
+      "write_buffer_size=18g;prefix_extractor=capped:8;"
+      "arena_block_size=19G",
+      &new_cf_opt));
+
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 18 * giga);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 19 * giga);
+  ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr);
+  ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.CappedPrefix.8");
+
+  // Units (t)
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt));
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 20 * tera);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 21 * tera);
+
+  // Nested block based table options
+  // Empty
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=10;max_write_buffer_number=16;"
+            "block_based_table_factory={};arena_block_size=1024",
+            &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  // Non-empty
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=10;max_write_buffer_number=16;"
+            "block_based_table_factory={block_cache=1M;block_size=4;};"
+            "arena_block_size=1024",
+            &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  // Last one
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=10;max_write_buffer_number=16;"
+            "block_based_table_factory={block_cache=1M;block_size=4;}",
+            &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  // Mismatch curly braces
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={{{block_size=4;};"
+             "arena_block_size=1024",
+             &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  // Unexpected chars after closing curly brace
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={block_size=4;}};"
+             "arena_block_size=1024",
+             &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={block_size=4;}xdfa;"
+             "arena_block_size=1024",
+             &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={block_size=4;}xdfa",
+             &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  // Invalid block based table option
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+             "write_buffer_size=10;max_write_buffer_number=16;"
+             "block_based_table_factory={xx_block_size=4;}",
+             &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+           "optimize_filters_for_hits=true",
+           &new_cf_opt));
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "optimize_filters_for_hits=false",
+            &new_cf_opt));
+
+  ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
+              "optimize_filters_for_hits=junk",
+              &new_cf_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt));
+
+  // Nested plain table options
+  // Empty
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=10;max_write_buffer_number=16;"
+            "plain_table_factory={};arena_block_size=1024",
+            &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable");
+  // Non-empty
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=10;max_write_buffer_number=16;"
+            "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};"
+            "arena_block_size=1024",
+            &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.table_factory != nullptr);
+  ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable");
+
+  // memtable factory
+  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
+            "write_buffer_size=10;max_write_buffer_number=16;"
+            "memtable=skip_list:10;arena_block_size=1024",
+            &new_cf_opt));
+  ASSERT_TRUE(new_cf_opt.memtable_factory != nullptr);
+  ASSERT_TRUE(new_cf_opt.memtable_factory->IsInstanceOf("SkipListFactory"));
+
+  // blob cache
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      base_cf_opt,
+      "blob_cache={capacity=1M;num_shard_bits=4;"
+      "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};",
+      &new_cf_opt));
+  ASSERT_NE(new_cf_opt.blob_cache, nullptr);
+  ASSERT_EQ(new_cf_opt.blob_cache->GetCapacity(), 1024UL * 1024UL);
+  ASSERT_EQ(static_cast<ShardedCacheBase*>(new_cf_opt.blob_cache.get())
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_cf_opt.blob_cache->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(static_cast<LRUCache*>(new_cf_opt.blob_cache.get())
+                ->GetHighPriPoolRatio(),
+            0.5);
+}
+
+TEST_F(OptionsTest, SliceTransformCreateFromString) {
+  std::shared_ptr<const SliceTransform> transform = nullptr;
+  ConfigOptions config_options;
+  config_options.ignore_unsupported_options = false;
+  config_options.ignore_unknown_options = false;
+
+  ASSERT_OK(
+      SliceTransform::CreateFromString(config_options, "fixed:31", &transform));
+  ASSERT_NE(transform, nullptr);
+  ASSERT_FALSE(transform->IsInstanceOf("capped"));
+  ASSERT_TRUE(transform->IsInstanceOf("fixed"));
+  ASSERT_TRUE(transform->IsInstanceOf("rocksdb.FixedPrefix"));
+  ASSERT_EQ(transform->GetId(), "rocksdb.FixedPrefix.31");
+  ASSERT_OK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.FixedPrefix.42", &transform));
+  ASSERT_NE(transform, nullptr);
+  ASSERT_EQ(transform->GetId(), "rocksdb.FixedPrefix.42");
+
+  ASSERT_OK(SliceTransform::CreateFromString(config_options, "capped:16",
+                                             &transform));
+  ASSERT_NE(transform, nullptr);
+  ASSERT_FALSE(transform->IsInstanceOf("fixed"));
+  ASSERT_TRUE(transform->IsInstanceOf("capped"));
+  ASSERT_TRUE(transform->IsInstanceOf("rocksdb.CappedPrefix"));
+  ASSERT_EQ(transform->GetId(), "rocksdb.CappedPrefix.16");
+  ASSERT_OK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.CappedPrefix.42", &transform));
+  ASSERT_NE(transform, nullptr);
+  ASSERT_EQ(transform->GetId(), "rocksdb.CappedPrefix.42");
+
+  ASSERT_OK(SliceTransform::CreateFromString(config_options, "rocksdb.Noop",
+                                             &transform));
+  ASSERT_NE(transform, nullptr);
+
+  ASSERT_NOK(SliceTransform::CreateFromString(config_options,
+                                              "fixed:21:invalid", &transform));
+  ASSERT_NOK(SliceTransform::CreateFromString(config_options,
+                                              "capped:21:invalid", &transform));
+  ASSERT_NOK(
+      SliceTransform::CreateFromString(config_options, "fixed", &transform));
+  ASSERT_NOK(
+      SliceTransform::CreateFromString(config_options, "capped", &transform));
+  ASSERT_NOK(
+      SliceTransform::CreateFromString(config_options, "fixed:", &transform));
+  ASSERT_NOK(
+      SliceTransform::CreateFromString(config_options, "capped:", &transform));
+  ASSERT_NOK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.FixedPrefix:42", &transform));
+  ASSERT_NOK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.CappedPrefix:42", &transform));
+  ASSERT_NOK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.FixedPrefix", &transform));
+  ASSERT_NOK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.CappedPrefix", &transform));
+  ASSERT_NOK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.FixedPrefix.", &transform));
+  ASSERT_NOK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.CappedPrefix.", &transform));
+  ASSERT_NOK(
+      SliceTransform::CreateFromString(config_options, "invalid", &transform));
+
+#ifndef ROCKSDB_LITE
+  ASSERT_OK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.CappedPrefix.11", &transform));
+  ASSERT_NE(transform, nullptr);
+  ASSERT_EQ(transform->GetId(), "rocksdb.CappedPrefix.11");
+  ASSERT_TRUE(transform->IsInstanceOf("capped"));
+  ASSERT_TRUE(transform->IsInstanceOf("capped:11"));
+  ASSERT_TRUE(transform->IsInstanceOf("rocksdb.CappedPrefix"));
+  ASSERT_TRUE(transform->IsInstanceOf("rocksdb.CappedPrefix.11"));
+  ASSERT_FALSE(transform->IsInstanceOf("fixed"));
+  ASSERT_FALSE(transform->IsInstanceOf("fixed:11"));
+  ASSERT_FALSE(transform->IsInstanceOf("rocksdb.FixedPrefix"));
+  ASSERT_FALSE(transform->IsInstanceOf("rocksdb.FixedPrefix.11"));
+
+  ASSERT_OK(SliceTransform::CreateFromString(
+      config_options, "rocksdb.FixedPrefix.11", &transform));
+  ASSERT_TRUE(transform->IsInstanceOf("fixed"));
+  ASSERT_TRUE(transform->IsInstanceOf("fixed:11"));
+  ASSERT_TRUE(transform->IsInstanceOf("rocksdb.FixedPrefix"));
+  ASSERT_TRUE(transform->IsInstanceOf("rocksdb.FixedPrefix.11"));
+  ASSERT_FALSE(transform->IsInstanceOf("capped"));
+  ASSERT_FALSE(transform->IsInstanceOf("capped:11"));
+  ASSERT_FALSE(transform->IsInstanceOf("rocksdb.CappedPrefix"));
+  ASSERT_FALSE(transform->IsInstanceOf("rocksdb.CappedPrefix.11"));
+#endif  // ROCKSDB_LITE
+}
+
+TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) {
+  BlockBasedTableOptions table_opt;
+  BlockBasedTableOptions new_opt;
+  // make sure default values are overwritten by something else
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      table_opt,
+      "cache_index_and_filter_blocks=1;index_type=kHashSearch;"
+      "checksum=kxxHash;no_block_cache=1;"
+      "block_cache=1M;block_cache_compressed=1k;block_size=1024;"
+      "block_size_deviation=8;block_restart_interval=4;"
+      "format_version=5;whole_key_filtering=1;"
+      "filter_policy=bloomfilter:4.567:false;",
+      &new_opt));
+  ASSERT_TRUE(new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch);
+  ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash);
+  ASSERT_TRUE(new_opt.no_block_cache);
+  ASSERT_TRUE(new_opt.block_cache != nullptr);
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL);
+  ASSERT_EQ(new_opt.block_size, 1024UL);
+  ASSERT_EQ(new_opt.block_size_deviation, 8);
+  ASSERT_EQ(new_opt.block_restart_interval, 4);
+  ASSERT_EQ(new_opt.format_version, 5U);
+  ASSERT_EQ(new_opt.whole_key_filtering, true);
+  ASSERT_TRUE(new_opt.filter_policy != nullptr);
+  const BloomFilterPolicy* bfp =
+      dynamic_cast<const BloomFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(bfp->GetMillibitsPerKey(), 4567);
+  EXPECT_EQ(bfp->GetWholeBitsPerKey(), 5);
+
+  // unknown option
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
+             "cache_index_and_filter_blocks=1;index_type=kBinarySearch;"
+             "bad_option=1",
+             &new_opt));
+  ASSERT_EQ(static_cast<bool>(table_opt.cache_index_and_filter_blocks),
+            new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(table_opt.index_type, new_opt.index_type);
+
+  // unrecognized index type
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
+             "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX",
+             &new_opt));
+  ASSERT_EQ(table_opt.cache_index_and_filter_blocks,
+            new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(table_opt.index_type, new_opt.index_type);
+
+  // unrecognized checksum type
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
+             "cache_index_and_filter_blocks=1;checksum=kxxHashXX",
+             &new_opt));
+  ASSERT_EQ(table_opt.cache_index_and_filter_blocks,
+            new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(table_opt.index_type, new_opt.index_type);
+
+  // unrecognized filter policy name
+  ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
+             "cache_index_and_filter_blocks=1;"
+             "filter_policy=bloomfilterxx:4:true",
+             &new_opt));
+  ASSERT_EQ(table_opt.cache_index_and_filter_blocks,
+            new_opt.cache_index_and_filter_blocks);
+  ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy);
+
+  // Used to be rejected, now accepted
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      table_opt, "filter_policy=bloomfilter:4", &new_opt));
+  bfp = dynamic_cast<const BloomFilterPolicy*>(new_opt.filter_policy.get());
+  EXPECT_EQ(bfp->GetMillibitsPerKey(), 4000);
+  EXPECT_EQ(bfp->GetWholeBitsPerKey(), 4);
+
+  // Check block cache options are overwritten when specified
+  // in new format as a struct.
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt,
+             "block_cache={capacity=1M;num_shard_bits=4;"
+             "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};"
+             "block_cache_compressed={capacity=1M;num_shard_bits=4;"
+             "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}",
+             &new_opt));
+  ASSERT_TRUE(new_opt.block_cache != nullptr);
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
+                new_opt.block_cache)->GetHighPriPoolRatio(), 0.5);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
+                new_opt.block_cache_compressed)
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
+                new_opt.block_cache_compressed)->GetHighPriPoolRatio(),
+                0.5);
+
+  // Set only block cache capacity. Check other values are
+  // reset to default values.
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt,
+             "block_cache={capacity=2M};"
+             "block_cache_compressed={capacity=2M}",
+             &new_opt));
+  ASSERT_TRUE(new_opt.block_cache != nullptr);
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL);
+  // Default values
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
+                ->GetNumShardBits(),
+            GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity()));
+  ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache)
+                ->GetHighPriPoolRatio(),
+            0.5);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL);
+  // Default values
+  ASSERT_EQ(
+      std::dynamic_pointer_cast<ShardedCacheBase>(
+          new_opt.block_cache_compressed)
+          ->GetNumShardBits(),
+      GetDefaultCacheShardBits(new_opt.block_cache_compressed->GetCapacity()));
+  ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
+                ->GetHighPriPoolRatio(),
+            0.5);
+
+  // Set couple of block cache options.
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(
+      table_opt,
+      "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};"
+      "block_cache_compressed={num_shard_bits=5;"
+      "high_pri_pool_ratio=0.0;}",
+      &new_opt));
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
+                ->GetNumShardBits(),
+            5);
+  ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(
+                new_opt.block_cache)->GetHighPriPoolRatio(), 0.5);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
+                new_opt.block_cache_compressed)
+                ->GetNumShardBits(),
+            5);
+  ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
+                ->GetHighPriPoolRatio(),
+            0.0);
+
+  // Set couple of block cache options.
+  ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt,
+             "block_cache={capacity=1M;num_shard_bits=4;"
+             "strict_capacity_limit=true;};"
+             "block_cache_compressed={capacity=1M;num_shard_bits=4;"
+             "strict_capacity_limit=true;}",
+             &new_opt));
+  ASSERT_TRUE(new_opt.block_cache != nullptr);
+  ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(new_opt.block_cache)
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache)
+                ->GetHighPriPoolRatio(),
+            0.5);
+  ASSERT_TRUE(new_opt.block_cache_compressed != nullptr);
+  ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL);
+  ASSERT_EQ(std::dynamic_pointer_cast<ShardedCacheBase>(
+                new_opt.block_cache_compressed)
+                ->GetNumShardBits(),
+            4);
+  ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true);
+  ASSERT_EQ(std::dynamic_pointer_cast<LRUCache>(new_opt.block_cache_compressed)
+                ->GetHighPriPoolRatio(),
+            0.5);
+}
+
+TEST_F(OptionsOldApiTest, GetPlainTableOptionsFromString) {
+  PlainTableOptions table_opt;
+  PlainTableOptions new_opt;
+  // make sure default values are overwritten by something else
+  ASSERT_OK(GetPlainTableOptionsFromString(table_opt,
+            "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;"
+            "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;"
+            "full_scan_mode=true;store_index_in_file=true",
+            &new_opt));
+  ASSERT_EQ(new_opt.user_key_len, 66u);
+  ASSERT_EQ(new_opt.bloom_bits_per_key, 20);
+  ASSERT_EQ(new_opt.hash_table_ratio, 0.5);
+  ASSERT_EQ(new_opt.index_sparseness, 8);
+  ASSERT_EQ(new_opt.huge_page_tlb_size, 4);
+  ASSERT_EQ(new_opt.encoding_type, EncodingType::kPrefix);
+  ASSERT_TRUE(new_opt.full_scan_mode);
+  ASSERT_TRUE(new_opt.store_index_in_file);
+
+  std::unordered_map<std::string, std::string> opt_map;
+  ASSERT_OK(StringToMap(
+      "user_key_len=55;bloom_bits_per_key=10;huge_page_tlb_size=8;", &opt_map));
+  ASSERT_OK(GetPlainTableOptionsFromMap(table_opt, opt_map, &new_opt));
+  ASSERT_EQ(new_opt.user_key_len, 55u);
+  ASSERT_EQ(new_opt.bloom_bits_per_key, 10);
+  ASSERT_EQ(new_opt.huge_page_tlb_size, 8);
+
+  // unknown option
+  ASSERT_NOK(GetPlainTableOptionsFromString(table_opt,
+             "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;"
+             "bad_option=1",
+             &new_opt));
+
+  // unrecognized EncodingType
+  ASSERT_NOK(GetPlainTableOptionsFromString(table_opt,
+             "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;"
+             "encoding_type=kPrefixXX",
+             &new_opt));
+}
+
+TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) {
+  Options base_options, new_options;
+  base_options.write_buffer_size = 20;
+  base_options.min_write_buffer_number_to_merge = 15;
+  BlockBasedTableOptions block_based_table_options;
+  block_based_table_options.cache_index_and_filter_blocks = true;
+  base_options.table_factory.reset(
+      NewBlockBasedTableFactory(block_based_table_options));
+
+  // Register an Env with object registry.
+  ObjectLibrary::Default()->AddFactory<Env>(
+      "CustomEnvDefault",
+      [](const std::string& /*name*/, std::unique_ptr<Env>* /*env_guard*/,
+         std::string* /* errmsg */) {
+        static CustomEnv env(Env::Default());
+        return &env;
+      });
+
+  ASSERT_OK(GetOptionsFromString(
+      base_options,
+      "write_buffer_size=10;max_write_buffer_number=16;"
+      "block_based_table_factory={block_cache=1M;block_size=4;};"
+      "compression_opts=4:5:6;create_if_missing=true;max_open_files=1;"
+      "bottommost_compression_opts=5:6:7;create_if_missing=true;max_open_files="
+      "1;"
+      "rate_limiter_bytes_per_sec=1024;env=CustomEnvDefault",
+      &new_options));
+
+  ASSERT_EQ(new_options.compression_opts.window_bits, 4);
+  ASSERT_EQ(new_options.compression_opts.level, 5);
+  ASSERT_EQ(new_options.compression_opts.strategy, 6);
+  ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0u);
+  ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0u);
+  ASSERT_EQ(new_options.compression_opts.parallel_threads, 1u);
+  ASSERT_EQ(new_options.compression_opts.enabled, false);
+  ASSERT_EQ(new_options.compression_opts.use_zstd_dict_trainer, true);
+  ASSERT_EQ(new_options.bottommost_compression, kDisableCompressionOption);
+  ASSERT_EQ(new_options.bottommost_compression_opts.window_bits, 5);
+  ASSERT_EQ(new_options.bottommost_compression_opts.level, 6);
+  ASSERT_EQ(new_options.bottommost_compression_opts.strategy, 7);
+  ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0u);
+  ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0u);
+  ASSERT_EQ(new_options.bottommost_compression_opts.parallel_threads, 1u);
+  ASSERT_EQ(new_options.bottommost_compression_opts.enabled, false);
+  ASSERT_EQ(new_options.bottommost_compression_opts.use_zstd_dict_trainer,
+            true);
+  ASSERT_EQ(new_options.write_buffer_size, 10U);
+  ASSERT_EQ(new_options.max_write_buffer_number, 16);
+
+  auto new_block_based_table_options =
+      new_options.table_factory->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(new_block_based_table_options, nullptr);
+  ASSERT_EQ(new_block_based_table_options->block_cache->GetCapacity(),
+            1U << 20);
+  ASSERT_EQ(new_block_based_table_options->block_size, 4U);
+  // don't overwrite block based table options
+  ASSERT_TRUE(new_block_based_table_options->cache_index_and_filter_blocks);
+
+  ASSERT_EQ(new_options.create_if_missing, true);
+  ASSERT_EQ(new_options.max_open_files, 1);
+  ASSERT_TRUE(new_options.rate_limiter.get() != nullptr);
+  Env* newEnv = new_options.env;
+  ASSERT_OK(Env::LoadEnv("CustomEnvDefault", &newEnv));
+  ASSERT_EQ(newEnv, new_options.env);
+}
+
+TEST_F(OptionsOldApiTest, DBOptionsSerialization) {
+  Options base_options, new_options;
+  Random rnd(301);
+
+  // Phase 1: Make big change in base_options
+  test::RandomInitDBOptions(&base_options, &rnd);
+
+  // Phase 2: obtain a string from base_option
+  std::string base_options_file_content;
+  ASSERT_OK(GetStringFromDBOptions(&base_options_file_content, base_options));
+
+  // Phase 3: Set new_options from the derived string and expect
+  //          new_options == base_options
+  ASSERT_OK(GetDBOptionsFromString(DBOptions(), base_options_file_content,
+                                   &new_options));
+  ConfigOptions config_options;
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_options, new_options));
+}
+
+TEST_F(OptionsOldApiTest, ColumnFamilyOptionsSerialization) {
+  Options options;
+  ColumnFamilyOptions base_opt, new_opt;
+  Random rnd(302);
+  // Phase 1: randomly assign base_opt
+  // custom type options
+  test::RandomInitCFOptions(&base_opt, options, &rnd);
+
+  // Phase 2: obtain a string from base_opt
+  std::string base_options_file_content;
+  ASSERT_OK(
+      GetStringFromColumnFamilyOptions(&base_options_file_content, base_opt));
+
+  // Phase 3: Set new_opt from the derived string and expect
+  //          new_opt == base_opt
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      ColumnFamilyOptions(), base_options_file_content, &new_opt));
+  ConfigOptions config_options;
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_opt, new_opt));
+  if (base_opt.compaction_filter) {
+    delete base_opt.compaction_filter;
+  }
+}
+#endif  // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+class OptionsParserTest : public testing::Test {
+ public:
+  OptionsParserTest() { fs_.reset(new test::StringFS(FileSystem::Default())); }
+
+ protected:
+  std::shared_ptr<test::StringFS> fs_;
+};
+
+TEST_F(OptionsParserTest, Comment) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[ DBOptions ]\n"
+      "  # note that we don't support space around \"=\"\n"
+      "  max_open_files=12345;\n"
+      "  max_background_flushes=301  # comment after a statement is fine\n"
+      "  # max_background_flushes=1000  # this line would be ignored\n"
+      "  # max_background_compactions=2000 # so does this one\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[CFOptions   \"default\"]  # column family must be specified\n"
+      "                     # in the correct order\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content));
+  RocksDBOptionsParser parser;
+  ASSERT_OK(
+      parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */));
+
+  ConfigOptions exact;
+  exact.input_strings_escaped = false;
+  exact.sanity_level = ConfigOptions::kSanityLevelExactMatch;
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyDBOptions(exact, *parser.db_opt(), db_opt));
+  ASSERT_EQ(parser.NumColumnFamilies(), 1U);
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+      exact, *parser.GetCFOptions("default"), cf_opt));
+}
+
+TEST_F(OptionsParserTest, ExtraSpace) {
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[      Version   ]\n"
+      "  rocksdb_version     = 3.14.0      \n"
+      "  options_file_version=1   # some comment\n"
+      "[DBOptions  ]  # some comment\n"
+      "max_open_files=12345   \n"
+      "    max_background_flushes   =    301   \n"
+      " max_total_wal_size     =   1024  # keep_log_file_num=1000\n"
+      "        [CFOptions      \"default\"     ]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content));
+  RocksDBOptionsParser parser;
+  ASSERT_OK(
+      parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */));
+}
+
+TEST_F(OptionsParserTest, MissingDBOptions) {
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[CFOptions \"default\"]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content));
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(
+      parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */));
+  ;
+}
+
+TEST_F(OptionsParserTest, DoubleDBOptions) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[DBOptions]\n"
+      "  max_open_files=12345\n"
+      "  max_background_flushes=301\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[DBOptions]\n"
+      "[CFOptions \"default\"]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content));
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(
+      parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */));
+}
+
+TEST_F(OptionsParserTest, NoDefaultCFOptions) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[DBOptions]\n"
+      "  max_open_files=12345\n"
+      "  max_background_flushes=301\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[CFOptions \"something_else\"]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content));
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(
+      parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */));
+}
+
+TEST_F(OptionsParserTest, DefaultCFOptionsMustBeTheFirst) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[DBOptions]\n"
+      "  max_open_files=12345\n"
+      "  max_background_flushes=301\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[CFOptions \"something_else\"]\n"
+      "  # if a section is blank, we will use the default\n"
+      "[CFOptions \"default\"]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content));
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(
+      parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */));
+}
+
+TEST_F(OptionsParserTest, DuplicateCFOptions) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[DBOptions]\n"
+      "  max_open_files=12345\n"
+      "  max_background_flushes=301\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[CFOptions \"default\"]\n"
+      "[CFOptions \"something_else\"]\n"
+      "[CFOptions \"something_else\"]\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content));
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(
+      parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */));
+}
+
+TEST_F(OptionsParserTest, IgnoreUnknownOptions) {
+  for (int case_id = 0; case_id < 5; case_id++) {
+    DBOptions db_opt;
+    db_opt.max_open_files = 12345;
+    db_opt.max_background_flushes = 301;
+    db_opt.max_total_wal_size = 1024;
+    ColumnFamilyOptions cf_opt;
+
+    std::string version_string;
+    bool should_ignore = true;
+    if (case_id == 0) {
+      // same version
+      should_ignore = false;
+      version_string = std::to_string(ROCKSDB_MAJOR) + "." +
+                       std::to_string(ROCKSDB_MINOR) + ".0";
+    } else if (case_id == 1) {
+      // higher minor version
+      should_ignore = true;
+      version_string = std::to_string(ROCKSDB_MAJOR) + "." +
+                       std::to_string(ROCKSDB_MINOR + 1) + ".0";
+    } else if (case_id == 2) {
+      // higher major version.
+      should_ignore = true;
+      version_string = std::to_string(ROCKSDB_MAJOR + 1) + ".0.0";
+    } else if (case_id == 3) {
+      // lower minor version
+#if ROCKSDB_MINOR == 0
+      continue;
+#else
+      version_string = std::to_string(ROCKSDB_MAJOR) + "." +
+                       std::to_string(ROCKSDB_MINOR - 1) + ".0";
+      should_ignore = false;
+#endif
+    } else {
+      // lower major version
+      should_ignore = false;
+      version_string = std::to_string(ROCKSDB_MAJOR - 1) + "." +
+                       std::to_string(ROCKSDB_MINOR) + ".0";
+    }
+
+    std::string options_file_content =
+        "# This is a testing option string.\n"
+        "# Currently we only support \"#\" styled comment.\n"
+        "\n"
+        "[Version]\n"
+        "  rocksdb_version=" +
+        version_string +
+        "\n"
+        "  options_file_version=1\n"
+        "[DBOptions]\n"
+        "  max_open_files=12345\n"
+        "  max_background_flushes=301\n"
+        "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+        "  unknown_db_option1=321\n"
+        "  unknown_db_option2=false\n"
+        "[CFOptions \"default\"]\n"
+        "  unknown_cf_option1=hello\n"
+        "[CFOptions \"something_else\"]\n"
+        "  unknown_cf_option2=world\n"
+        "  # if a section is blank, we will use the default\n";
+
+    const std::string kTestFileName = "test-rocksdb-options.ini";
+    auto s = fs_->FileExists(kTestFileName, IOOptions(), nullptr);
+    ASSERT_TRUE(s.ok() || s.IsNotFound());
+    if (s.ok()) {
+      ASSERT_OK(fs_->DeleteFile(kTestFileName, IOOptions(), nullptr));
+    }
+    ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content));
+    RocksDBOptionsParser parser;
+    ASSERT_NOK(parser.Parse(kTestFileName, fs_.get(), false,
+                            4096 /* readahead_size */));
+    if (should_ignore) {
+      ASSERT_OK(parser.Parse(kTestFileName, fs_.get(),
+                             true /* ignore_unknown_options */,
+                             4096 /* readahead_size */));
+    } else {
+      ASSERT_NOK(parser.Parse(kTestFileName, fs_.get(),
+                              true /* ignore_unknown_options */,
+                              4096 /* readahead_size */));
+    }
+  }
+}
+
+TEST_F(OptionsParserTest, ParseVersion) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string file_template =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.13.1\n"
+      "  options_file_version=%s\n"
+      "[DBOptions]\n"
+      "[CFOptions \"default\"]\n";
+  const int kLength = 1000;
+  char buffer[kLength];
+  RocksDBOptionsParser parser;
+
+  const std::vector<std::string> invalid_versions = {
+      "a.b.c", "3.2.2b", "3.-12", "3. 1",  // only digits and dots are allowed
+      "1.2.3.4",
+      "1.2.3"  // can only contains at most one dot.
+      "0",     // options_file_version must be at least one
+      "3..2",
+      ".", ".1.2",             // must have at least one digit before each dot
+      "1.2.", "1.", "2.34."};  // must have at least one digit after each dot
+  for (auto iv : invalid_versions) {
+    snprintf(buffer, kLength - 1, file_template.c_str(), iv.c_str());
+
+    parser.Reset();
+    ASSERT_OK(fs_->WriteToNewFile(iv, buffer));
+    ASSERT_NOK(parser.Parse(iv, fs_.get(), false, 0 /* readahead_size */));
+  }
+
+  const std::vector<std::string> valid_versions = {
+      "1.232", "100", "3.12", "1", "12.3  ", "  1.25  "};
+  for (auto vv : valid_versions) {
+    snprintf(buffer, kLength - 1, file_template.c_str(), vv.c_str());
+    parser.Reset();
+    ASSERT_OK(fs_->WriteToNewFile(vv, buffer));
+    ASSERT_OK(parser.Parse(vv, fs_.get(), false, 0 /* readahead_size */));
+  }
+}
+
+void VerifyCFPointerTypedOptions(
+    ColumnFamilyOptions* base_cf_opt, const ColumnFamilyOptions* new_cf_opt,
+    const std::unordered_map<std::string, std::string>* new_cf_opt_map) {
+  std::string name_buffer;
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, *base_cf_opt,
+                                                  *new_cf_opt, new_cf_opt_map));
+
+  // change the name of merge operator back-and-forth
+  {
+    auto* merge_operator = base_cf_opt->merge_operator
+                               ->CheckedCast<test::ChanglingMergeOperator>();
+    if (merge_operator != nullptr) {
+      name_buffer = merge_operator->Name();
+      // change the name  and expect non-ok status
+      merge_operator->SetName("some-other-name");
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // change the name back and expect ok status
+      merge_operator->SetName(name_buffer);
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+    }
+  }
+
+  // change the name of the compaction filter factory back-and-forth
+  {
+    auto* compaction_filter_factory =
+        base_cf_opt->compaction_filter_factory
+            ->CheckedCast<test::ChanglingCompactionFilterFactory>();
+    if (compaction_filter_factory != nullptr) {
+      name_buffer = compaction_filter_factory->Name();
+      // change the name and expect non-ok status
+      compaction_filter_factory->SetName("some-other-name");
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // change the name back and expect ok status
+      compaction_filter_factory->SetName(name_buffer);
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+    }
+  }
+
+  // test by setting compaction_filter to nullptr
+  {
+    auto* tmp_compaction_filter = base_cf_opt->compaction_filter;
+    if (tmp_compaction_filter != nullptr) {
+      base_cf_opt->compaction_filter = nullptr;
+      // set compaction_filter to nullptr and expect non-ok status
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // set the value back and expect ok status
+      base_cf_opt->compaction_filter = tmp_compaction_filter;
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+    }
+  }
+
+  // test by setting table_factory to nullptr
+  {
+    auto tmp_table_factory = base_cf_opt->table_factory;
+    if (tmp_table_factory != nullptr) {
+      base_cf_opt->table_factory.reset();
+      // set table_factory to nullptr and expect non-ok status
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // set the value back and expect ok status
+      base_cf_opt->table_factory = tmp_table_factory;
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+    }
+  }
+
+  // test by setting memtable_factory to nullptr
+  {
+    auto tmp_memtable_factory = base_cf_opt->memtable_factory;
+    if (tmp_memtable_factory != nullptr) {
+      base_cf_opt->memtable_factory.reset();
+      // set memtable_factory to nullptr and expect non-ok status
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // set the value back and expect ok status
+      base_cf_opt->memtable_factory = tmp_memtable_factory;
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+          config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+    }
+  }
+}
+
+TEST_F(OptionsParserTest, Readahead) {
+  DBOptions base_db_opt;
+  std::vector<ColumnFamilyOptions> base_cf_opts;
+  base_cf_opts.emplace_back();
+  base_cf_opts.emplace_back();
+
+  std::string one_mb_string = std::string(1024 * 1024, 'x');
+  std::vector<std::string> cf_names = {"default", one_mb_string};
+  const std::string kOptionsFileName = "test-persisted-options.ini";
+
+  ASSERT_OK(PersistRocksDBOptions(base_db_opt, cf_names, base_cf_opts,
+                                  kOptionsFileName, fs_.get()));
+
+  uint64_t file_size = 0;
+  ASSERT_OK(
+      fs_->GetFileSize(kOptionsFileName, IOOptions(), &file_size, nullptr));
+  assert(file_size > 0);
+
+  RocksDBOptionsParser parser;
+
+  fs_->num_seq_file_read_ = 0;
+  size_t readahead_size = 128 * 1024;
+
+  ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size));
+  ASSERT_EQ(fs_->num_seq_file_read_.load(),
+            (file_size - 1) / readahead_size + 1);
+
+  fs_->num_seq_file_read_.store(0);
+  readahead_size = 1024 * 1024;
+  ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size));
+  ASSERT_EQ(fs_->num_seq_file_read_.load(),
+            (file_size - 1) / readahead_size + 1);
+
+  // Tiny readahead. 8 KB is read each time.
+  fs_->num_seq_file_read_.store(0);
+  ASSERT_OK(
+      parser.Parse(kOptionsFileName, fs_.get(), false, 1 /* readahead_size */));
+  ASSERT_GE(fs_->num_seq_file_read_.load(), file_size / (8 * 1024));
+  ASSERT_LT(fs_->num_seq_file_read_.load(), file_size / (8 * 1024) * 2);
+
+  // Disable readahead means 512KB readahead.
+  fs_->num_seq_file_read_.store(0);
+  ASSERT_OK(
+      parser.Parse(kOptionsFileName, fs_.get(), false, 0 /* readahead_size */));
+  ASSERT_GE(fs_->num_seq_file_read_.load(), (file_size - 1) / (512 * 1024) + 1);
+}
+
+TEST_F(OptionsParserTest, DumpAndParse) {
+  DBOptions base_db_opt;
+  std::vector<ColumnFamilyOptions> base_cf_opts;
+  std::vector<std::string> cf_names = {"default", "cf1", "cf2", "cf3",
+                                       "c:f:4:4:4"
+                                       "p\\i\\k\\a\\chu\\\\\\",
+                                       "###rocksdb#1-testcf#2###"};
+  const int num_cf = static_cast<int>(cf_names.size());
+  Random rnd(302);
+  test::RandomInitDBOptions(&base_db_opt, &rnd);
+  base_db_opt.db_log_dir += "/#odd #but #could #happen #path #/\\\\#OMG";
+
+  BlockBasedTableOptions special_bbto;
+  special_bbto.cache_index_and_filter_blocks = true;
+  special_bbto.block_size = 999999;
+
+  for (int c = 0; c < num_cf; ++c) {
+    ColumnFamilyOptions cf_opt;
+    Random cf_rnd(0xFB + c);
+    test::RandomInitCFOptions(&cf_opt, base_db_opt, &cf_rnd);
+    if (c < 4) {
+      cf_opt.prefix_extractor.reset(test::RandomSliceTransform(&rnd, c));
+    }
+    if (c < 3) {
+      cf_opt.table_factory.reset(test::RandomTableFactory(&rnd, c));
+    } else if (c == 4) {
+      cf_opt.table_factory.reset(NewBlockBasedTableFactory(special_bbto));
+    } else if (c == 5) {
+      // A table factory that doesn't support deserialization should be
+      // supported.
+      cf_opt.table_factory.reset(new UnregisteredTableFactory());
+    }
+    base_cf_opts.emplace_back(cf_opt);
+  }
+
+  const std::string kOptionsFileName = "test-persisted-options.ini";
+  // Use default for escaped(true), unknown(false) and check (exact)
+  ConfigOptions config_options;
+  ASSERT_OK(PersistRocksDBOptions(base_db_opt, cf_names, base_cf_opts,
+                                  kOptionsFileName, fs_.get()));
+
+  RocksDBOptionsParser parser;
+  ASSERT_OK(parser.Parse(config_options, kOptionsFileName, fs_.get()));
+
+  // Make sure block-based table factory options was deserialized correctly
+  std::shared_ptr<TableFactory> ttf = (*parser.cf_opts())[4].table_factory;
+  ASSERT_EQ(TableFactory::kBlockBasedTableName(), std::string(ttf->Name()));
+  const auto parsed_bbto = ttf->GetOptions<BlockBasedTableOptions>();
+  ASSERT_NE(parsed_bbto, nullptr);
+  ASSERT_EQ(special_bbto.block_size, parsed_bbto->block_size);
+  ASSERT_EQ(special_bbto.cache_index_and_filter_blocks,
+            parsed_bbto->cache_index_and_filter_blocks);
+
+  ASSERT_OK(RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+      config_options, base_db_opt, cf_names, base_cf_opts, kOptionsFileName,
+      fs_.get()));
+
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(
+      config_options, *parser.db_opt(), base_db_opt));
+  for (int c = 0; c < num_cf; ++c) {
+    const auto* cf_opt = parser.GetCFOptions(cf_names[c]);
+    ASSERT_NE(cf_opt, nullptr);
+    ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+        config_options, base_cf_opts[c], *cf_opt,
+        &(parser.cf_opt_maps()->at(c))));
+  }
+
+  // Further verify pointer-typed options
+  for (int c = 0; c < num_cf; ++c) {
+    const auto* cf_opt = parser.GetCFOptions(cf_names[c]);
+    ASSERT_NE(cf_opt, nullptr);
+    VerifyCFPointerTypedOptions(&base_cf_opts[c], cf_opt,
+                                &(parser.cf_opt_maps()->at(c)));
+  }
+
+  ASSERT_EQ(parser.GetCFOptions("does not exist"), nullptr);
+
+  base_db_opt.max_open_files++;
+  ASSERT_NOK(RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+      config_options, base_db_opt, cf_names, base_cf_opts, kOptionsFileName,
+      fs_.get()));
+
+  for (int c = 0; c < num_cf; ++c) {
+    if (base_cf_opts[c].compaction_filter) {
+      delete base_cf_opts[c].compaction_filter;
+    }
+  }
+}
+
+TEST_F(OptionsParserTest, DifferentDefault) {
+  const std::string kOptionsFileName = "test-persisted-options.ini";
+
+  ColumnFamilyOptions cf_level_opts;
+  ASSERT_EQ(CompactionPri::kMinOverlappingRatio, cf_level_opts.compaction_pri);
+  cf_level_opts.OptimizeLevelStyleCompaction();
+
+  ColumnFamilyOptions cf_univ_opts;
+  cf_univ_opts.OptimizeUniversalStyleCompaction();
+
+  ASSERT_OK(PersistRocksDBOptions(DBOptions(), {"default", "universal"},
+                                  {cf_level_opts, cf_univ_opts},
+                                  kOptionsFileName, fs_.get()));
+
+  RocksDBOptionsParser parser;
+  ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false,
+                         4096 /* readahead_size */));
+
+  {
+    Options old_default_opts;
+    old_default_opts.OldDefaults();
+    ASSERT_EQ(10 * 1048576, old_default_opts.max_bytes_for_level_base);
+    ASSERT_EQ(5000, old_default_opts.max_open_files);
+    ASSERT_EQ(2 * 1024U * 1024U, old_default_opts.delayed_write_rate);
+    ASSERT_EQ(WALRecoveryMode::kTolerateCorruptedTailRecords,
+              old_default_opts.wal_recovery_mode);
+  }
+  {
+    Options old_default_opts;
+    old_default_opts.OldDefaults(4, 6);
+    ASSERT_EQ(10 * 1048576, old_default_opts.max_bytes_for_level_base);
+    ASSERT_EQ(5000, old_default_opts.max_open_files);
+  }
+  {
+    Options old_default_opts;
+    old_default_opts.OldDefaults(4, 7);
+    ASSERT_NE(10 * 1048576, old_default_opts.max_bytes_for_level_base);
+    ASSERT_NE(4, old_default_opts.table_cache_numshardbits);
+    ASSERT_EQ(5000, old_default_opts.max_open_files);
+    ASSERT_EQ(2 * 1024U * 1024U, old_default_opts.delayed_write_rate);
+  }
+  {
+    ColumnFamilyOptions old_default_cf_opts;
+    old_default_cf_opts.OldDefaults();
+    ASSERT_EQ(2 * 1048576, old_default_cf_opts.target_file_size_base);
+    ASSERT_EQ(4 << 20, old_default_cf_opts.write_buffer_size);
+    ASSERT_EQ(2 * 1048576, old_default_cf_opts.target_file_size_base);
+    ASSERT_EQ(0, old_default_cf_opts.soft_pending_compaction_bytes_limit);
+    ASSERT_EQ(0, old_default_cf_opts.hard_pending_compaction_bytes_limit);
+    ASSERT_EQ(CompactionPri::kByCompensatedSize,
+              old_default_cf_opts.compaction_pri);
+  }
+  {
+    ColumnFamilyOptions old_default_cf_opts;
+    old_default_cf_opts.OldDefaults(4, 6);
+    ASSERT_EQ(2 * 1048576, old_default_cf_opts.target_file_size_base);
+    ASSERT_EQ(CompactionPri::kByCompensatedSize,
+              old_default_cf_opts.compaction_pri);
+  }
+  {
+    ColumnFamilyOptions old_default_cf_opts;
+    old_default_cf_opts.OldDefaults(4, 7);
+    ASSERT_NE(2 * 1048576, old_default_cf_opts.target_file_size_base);
+    ASSERT_EQ(CompactionPri::kByCompensatedSize,
+              old_default_cf_opts.compaction_pri);
+  }
+  {
+    Options old_default_opts;
+    old_default_opts.OldDefaults(5, 1);
+    ASSERT_EQ(2 * 1024U * 1024U, old_default_opts.delayed_write_rate);
+  }
+  {
+    Options old_default_opts;
+    old_default_opts.OldDefaults(5, 2);
+    ASSERT_EQ(16 * 1024U * 1024U, old_default_opts.delayed_write_rate);
+    ASSERT_TRUE(old_default_opts.compaction_pri ==
+                CompactionPri::kByCompensatedSize);
+  }
+  {
+    Options old_default_opts;
+    old_default_opts.OldDefaults(5, 18);
+    ASSERT_TRUE(old_default_opts.compaction_pri ==
+                CompactionPri::kByCompensatedSize);
+  }
+
+  Options small_opts;
+  small_opts.OptimizeForSmallDb();
+  ASSERT_EQ(2 << 20, small_opts.write_buffer_size);
+  ASSERT_EQ(5000, small_opts.max_open_files);
+}
+
+class OptionsSanityCheckTest : public OptionsParserTest,
+                               public ::testing::WithParamInterface<bool> {
+ protected:
+  ConfigOptions config_options_;
+
+ public:
+  OptionsSanityCheckTest() {
+    config_options_.ignore_unknown_options = false;
+    config_options_.ignore_unsupported_options = GetParam();
+    config_options_.input_strings_escaped = true;
+  }
+
+ protected:
+  Status SanityCheckOptions(const DBOptions& db_opts,
+                            const ColumnFamilyOptions& cf_opts,
+                            ConfigOptions::SanityLevel level) {
+    config_options_.sanity_level = level;
+    return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+        config_options_, db_opts, {"default"}, {cf_opts}, kOptionsFileName,
+        fs_.get());
+  }
+
+  Status SanityCheckCFOptions(const ColumnFamilyOptions& cf_opts,
+                              ConfigOptions::SanityLevel level) {
+    return SanityCheckOptions(DBOptions(), cf_opts, level);
+  }
+
+  void SanityCheckCFOptions(const ColumnFamilyOptions& opts, bool exact) {
+    ASSERT_OK(SanityCheckCFOptions(
+        opts, ConfigOptions::kSanityLevelLooselyCompatible));
+    ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone));
+    if (exact) {
+      ASSERT_OK(
+          SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+    } else {
+      ASSERT_NOK(
+          SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+    }
+  }
+
+  Status SanityCheckDBOptions(const DBOptions& db_opts,
+                              ConfigOptions::SanityLevel level) {
+    return SanityCheckOptions(db_opts, ColumnFamilyOptions(), level);
+  }
+
+  void SanityCheckDBOptions(const DBOptions& opts, bool exact) {
+    ASSERT_OK(SanityCheckDBOptions(
+        opts, ConfigOptions::kSanityLevelLooselyCompatible));
+    ASSERT_OK(SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelNone));
+    if (exact) {
+      ASSERT_OK(
+          SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+    } else {
+      ASSERT_NOK(
+          SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+    }
+  }
+
+  Status PersistOptions(const DBOptions& db_opts,
+                        const ColumnFamilyOptions& cf_opts) {
+    Status s = fs_->DeleteFile(kOptionsFileName, IOOptions(), nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+    return PersistRocksDBOptions(db_opts, {"default"}, {cf_opts},
+                                 kOptionsFileName, fs_.get());
+  }
+
+  Status PersistCFOptions(const ColumnFamilyOptions& cf_opts) {
+    return PersistOptions(DBOptions(), cf_opts);
+  }
+
+  Status PersistDBOptions(const DBOptions& db_opts) {
+    return PersistOptions(db_opts, ColumnFamilyOptions());
+  }
+
+  const std::string kOptionsFileName = "OPTIONS";
+};
+
+TEST_P(OptionsSanityCheckTest, CFOptionsSanityCheck) {
+  ColumnFamilyOptions opts;
+  Random rnd(301);
+
+  // default ColumnFamilyOptions
+  {
+    ASSERT_OK(PersistCFOptions(opts));
+    ASSERT_OK(
+        SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+  }
+
+  // prefix_extractor
+  {
+    // Okay to change prefix_extractor form nullptr to non-nullptr
+    ASSERT_EQ(opts.prefix_extractor.get(), nullptr);
+    opts.prefix_extractor.reset(NewCappedPrefixTransform(10));
+    ASSERT_OK(SanityCheckCFOptions(
+        opts, ConfigOptions::kSanityLevelLooselyCompatible));
+    ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone));
+
+    // persist the change
+    ASSERT_OK(PersistCFOptions(opts));
+    ASSERT_OK(
+        SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+
+    // use same prefix extractor but with different parameter
+    opts.prefix_extractor.reset(NewCappedPrefixTransform(15));
+    // expect pass only in
+    // ConfigOptions::kSanityLevelLooselyCompatible
+    ASSERT_NOK(
+        SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+    ASSERT_OK(SanityCheckCFOptions(
+        opts, ConfigOptions::kSanityLevelLooselyCompatible));
+    ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone));
+
+    // repeat the test with FixedPrefixTransform
+    opts.prefix_extractor.reset(NewFixedPrefixTransform(10));
+    ASSERT_NOK(
+        SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+    ASSERT_OK(SanityCheckCFOptions(
+        opts, ConfigOptions::kSanityLevelLooselyCompatible));
+    ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone));
+
+    // persist the change of prefix_extractor
+    ASSERT_OK(PersistCFOptions(opts));
+    ASSERT_OK(
+        SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+
+    // use same prefix extractor but with different parameter
+    opts.prefix_extractor.reset(NewFixedPrefixTransform(15));
+    // expect pass only in
+    // ConfigOptions::kSanityLevelLooselyCompatible
+    SanityCheckCFOptions(opts, false);
+
+    // Change prefix extractor from non-nullptr to nullptr
+    opts.prefix_extractor.reset();
+    // expect pass as it's safe to change prefix_extractor
+    // from non-null to null
+    ASSERT_OK(SanityCheckCFOptions(
+        opts, ConfigOptions::kSanityLevelLooselyCompatible));
+    ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone));
+  }
+  // persist the change
+  ASSERT_OK(PersistCFOptions(opts));
+  ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+
+  // table_factory
+  {
+    for (int tb = 0; tb <= 2; ++tb) {
+      // change the table factory
+      opts.table_factory.reset(test::RandomTableFactory(&rnd, tb));
+      ASSERT_NOK(SanityCheckCFOptions(
+          opts, ConfigOptions::kSanityLevelLooselyCompatible));
+      ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone));
+
+      // persist the change
+      ASSERT_OK(PersistCFOptions(opts));
+      ASSERT_OK(
+          SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+    }
+  }
+
+  // merge_operator
+  {
+    // Test when going from nullptr -> merge operator
+    opts.merge_operator.reset(test::RandomMergeOperator(&rnd));
+    ASSERT_OK(SanityCheckCFOptions(
+        opts, ConfigOptions::kSanityLevelLooselyCompatible));
+    ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone));
+
+    // persist the change
+    ASSERT_OK(PersistCFOptions(opts));
+    SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options);
+
+    for (int test = 0; test < 5; ++test) {
+      // change the merge operator
+      opts.merge_operator.reset(test::RandomMergeOperator(&rnd));
+      ASSERT_NOK(SanityCheckCFOptions(
+          opts, ConfigOptions::kSanityLevelLooselyCompatible));
+      ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone));
+
+      // persist the change
+      ASSERT_OK(PersistCFOptions(opts));
+      SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options);
+    }
+
+    // Test when going from merge operator -> nullptr
+    opts.merge_operator = nullptr;
+    ASSERT_NOK(SanityCheckCFOptions(
+        opts, ConfigOptions::kSanityLevelLooselyCompatible));
+    ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone));
+
+    // persist the change
+    ASSERT_OK(PersistCFOptions(opts));
+    SanityCheckCFOptions(opts, true);
+  }
+
+  // compaction_filter
+  {
+    for (int test = 0; test < 5; ++test) {
+      // change the compaction filter
+      opts.compaction_filter = test::RandomCompactionFilter(&rnd);
+      SanityCheckCFOptions(opts, false);
+
+      // persist the change
+      ASSERT_OK(PersistCFOptions(opts));
+      SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options);
+      delete opts.compaction_filter;
+      opts.compaction_filter = nullptr;
+    }
+  }
+
+  // compaction_filter_factory
+  {
+    for (int test = 0; test < 5; ++test) {
+      // change the compaction filter factory
+      opts.compaction_filter_factory.reset(
+          test::RandomCompactionFilterFactory(&rnd));
+      SanityCheckCFOptions(opts, false);
+
+      // persist the change
+      ASSERT_OK(PersistCFOptions(opts));
+      SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options);
+    }
+  }
+}
+
+TEST_P(OptionsSanityCheckTest, DBOptionsSanityCheck) {
+  DBOptions opts;
+  Random rnd(301);
+
+  // default DBOptions
+  {
+    ASSERT_OK(PersistDBOptions(opts));
+    ASSERT_OK(
+        SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+  }
+
+  // File checksum generator
+  {
+    class MockFileChecksumGenFactory : public FileChecksumGenFactory {
+     public:
+      static const char* kClassName() { return "Mock"; }
+      const char* Name() const override { return kClassName(); }
+      std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+          const FileChecksumGenContext& /*context*/) override {
+        return nullptr;
+      }
+    };
+
+    // Okay to change file_checksum_gen_factory form nullptr to non-nullptr
+    ASSERT_EQ(opts.file_checksum_gen_factory.get(), nullptr);
+    opts.file_checksum_gen_factory.reset(new MockFileChecksumGenFactory());
+
+    // persist the change
+    ASSERT_OK(PersistDBOptions(opts));
+    SanityCheckDBOptions(opts, config_options_.ignore_unsupported_options);
+
+    // Change file_checksum_gen_factory from non-nullptr to nullptr
+    opts.file_checksum_gen_factory.reset();
+    // expect pass as it's safe to change file_checksum_gen_factory
+    // from non-null to null
+    SanityCheckDBOptions(opts, false);
+  }
+  // persist the change
+  ASSERT_OK(PersistDBOptions(opts));
+  ASSERT_OK(SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch));
+}
+
+namespace {
+bool IsEscapedString(const std::string& str) {
+  for (size_t i = 0; i < str.size(); ++i) {
+    if (str[i] == '\\') {
+      // since we already handle those two consecutive '\'s in
+      // the next if-then branch, any '\' appear at the end
+      // of an escaped string in such case is not valid.
+      if (i == str.size() - 1) {
+        return false;
+      }
+      if (str[i + 1] == '\\') {
+        // if there're two consecutive '\'s, skip the second one.
+        i++;
+        continue;
+      }
+      switch (str[i + 1]) {
+        case ':':
+        case '\\':
+        case '#':
+          continue;
+        default:
+          // if true, '\' together with str[i + 1] is not a valid escape.
+          if (UnescapeChar(str[i + 1]) == str[i + 1]) {
+            return false;
+          }
+      }
+    } else if (isSpecialChar(str[i]) && (i == 0 || str[i - 1] != '\\')) {
+      return false;
+    }
+  }
+  return true;
+}
+}  // namespace
+
+TEST_F(OptionsParserTest, IntegerParsing) {
+  ASSERT_EQ(ParseUint64("18446744073709551615"), 18446744073709551615U);
+  ASSERT_EQ(ParseUint32("4294967295"), 4294967295U);
+  ASSERT_EQ(ParseSizeT("18446744073709551615"), 18446744073709551615U);
+  ASSERT_EQ(ParseInt64("9223372036854775807"), 9223372036854775807);
+  ASSERT_EQ(ParseInt64("-9223372036854775808"),
+            std::numeric_limits<int64_t>::min());
+  ASSERT_EQ(ParseInt32("2147483647"), 2147483647);
+  ASSERT_EQ(ParseInt32("-2147483648"), std::numeric_limits<int32_t>::min());
+  ASSERT_EQ(ParseInt("-32767"), -32767);
+  ASSERT_EQ(ParseDouble("-1.234567"), -1.234567);
+}
+
+TEST_F(OptionsParserTest, EscapeOptionString) {
+  ASSERT_EQ(UnescapeOptionString(
+                "This is a test string with \\# \\: and \\\\ escape chars."),
+            "This is a test string with # : and \\ escape chars.");
+
+  ASSERT_EQ(
+      EscapeOptionString("This is a test string with # : and \\ escape chars."),
+      "This is a test string with \\# \\: and \\\\ escape chars.");
+
+  std::string readible_chars =
+      "A String like this \"1234567890-=_)(*&^%$#@!ertyuiop[]{POIU"
+      "YTREWQasdfghjkl;':LKJHGFDSAzxcvbnm,.?>"
+      "<MNBVCXZ\\\" should be okay to \\#\\\\\\:\\#\\#\\#\\ "
+      "be serialized and deserialized";
+
+  std::string escaped_string = EscapeOptionString(readible_chars);
+  ASSERT_TRUE(IsEscapedString(escaped_string));
+  // This two transformations should be canceled and should output
+  // the original input.
+  ASSERT_EQ(UnescapeOptionString(escaped_string), readible_chars);
+
+  std::string all_chars;
+  for (unsigned char c = 0;; ++c) {
+    all_chars += c;
+    if (c == 255) {
+      break;
+    }
+  }
+  escaped_string = EscapeOptionString(all_chars);
+  ASSERT_TRUE(IsEscapedString(escaped_string));
+  ASSERT_EQ(UnescapeOptionString(escaped_string), all_chars);
+
+  ASSERT_EQ(RocksDBOptionsParser::TrimAndRemoveComment(
+                "     A simple statement with a comment.  # like this :)"),
+            "A simple statement with a comment.");
+
+  ASSERT_EQ(RocksDBOptionsParser::TrimAndRemoveComment(
+                "Escape \\# and # comment together   ."),
+            "Escape \\# and");
+}
+
+static void TestAndCompareOption(const ConfigOptions& config_options,
+                                 const OptionTypeInfo& opt_info,
+                                 const std::string& opt_name, void* base_ptr,
+                                 void* comp_ptr, bool strip = false) {
+  std::string result, mismatch;
+  ASSERT_OK(opt_info.Serialize(config_options, opt_name, base_ptr, &result));
+  if (strip) {
+    ASSERT_EQ(result.at(0), '{');
+    ASSERT_EQ(result.at(result.size() - 1), '}');
+    result = result.substr(1, result.size() - 2);
+  }
+  ASSERT_OK(opt_info.Parse(config_options, opt_name, result, comp_ptr));
+  ASSERT_TRUE(opt_info.AreEqual(config_options, opt_name, base_ptr, comp_ptr,
+                                &mismatch));
+}
+
+static void TestParseAndCompareOption(const ConfigOptions& config_options,
+                                      const OptionTypeInfo& opt_info,
+                                      const std::string& opt_name,
+                                      const std::string& opt_value,
+                                      void* base_ptr, void* comp_ptr,
+                                      bool strip = false) {
+  ASSERT_OK(opt_info.Parse(config_options, opt_name, opt_value, base_ptr));
+  TestAndCompareOption(config_options, opt_info, opt_name, base_ptr, comp_ptr,
+                       strip);
+}
+
+template <typename T>
+void TestOptInfo(const ConfigOptions& config_options, OptionType opt_type,
+                 T* base, T* comp) {
+  std::string result;
+  OptionTypeInfo opt_info(0, opt_type);
+  ASSERT_FALSE(opt_info.AreEqual(config_options, "base", base, comp, &result));
+  ASSERT_EQ(result, "base");
+  ASSERT_NE(*base, *comp);
+  TestAndCompareOption(config_options, opt_info, "base", base, comp);
+  ASSERT_EQ(*base, *comp);
+}
+
+class OptionTypeInfoTest : public testing::Test {};
+
+TEST_F(OptionTypeInfoTest, BasicTypes) {
+  ConfigOptions config_options;
+  {
+    bool a = true, b = false;
+    TestOptInfo(config_options, OptionType::kBoolean, &a, &b);
+  }
+  {
+    int a = 100, b = 200;
+    TestOptInfo(config_options, OptionType::kInt, &a, &b);
+  }
+  {
+    int32_t a = 100, b = 200;
+    TestOptInfo(config_options, OptionType::kInt32T, &a, &b);
+  }
+  {
+    int64_t a = 100, b = 200;
+    TestOptInfo(config_options, OptionType::kInt64T, &a, &b);
+  }
+  {
+    unsigned int a = 100, b = 200;
+    TestOptInfo(config_options, OptionType::kUInt, &a, &b);
+  }
+  {
+    uint32_t a = 100, b = 200;
+    TestOptInfo(config_options, OptionType::kUInt32T, &a, &b);
+  }
+  {
+    uint64_t a = 100, b = 200;
+    TestOptInfo(config_options, OptionType::kUInt64T, &a, &b);
+  }
+  {
+    size_t a = 100, b = 200;
+    TestOptInfo(config_options, OptionType::kSizeT, &a, &b);
+  }
+  {
+    std::string a = "100", b = "200";
+    TestOptInfo(config_options, OptionType::kString, &a, &b);
+  }
+  {
+    double a = 1.0, b = 2.0;
+    TestOptInfo(config_options, OptionType::kDouble, &a, &b);
+  }
+}
+
+TEST_F(OptionTypeInfoTest, TestInvalidArgs) {
+  ConfigOptions config_options;
+  bool b;
+  int i;
+  int32_t i32;
+  int64_t i64;
+  unsigned int u;
+  int32_t u32;
+  int64_t u64;
+  size_t sz;
+  double d;
+
+  ASSERT_NOK(OptionTypeInfo(0, OptionType::kBoolean)
+                 .Parse(config_options, "b", "x", &b));
+  ASSERT_NOK(
+      OptionTypeInfo(0, OptionType::kInt).Parse(config_options, "b", "x", &i));
+  ASSERT_NOK(OptionTypeInfo(0, OptionType::kInt32T)
+                 .Parse(config_options, "b", "x", &i32));
+  ASSERT_NOK(OptionTypeInfo(0, OptionType::kInt64T)
+                 .Parse(config_options, "b", "x", &i64));
+  ASSERT_NOK(
+      OptionTypeInfo(0, OptionType::kUInt).Parse(config_options, "b", "x", &u));
+  ASSERT_NOK(OptionTypeInfo(0, OptionType::kUInt32T)
+                 .Parse(config_options, "b", "x", &u32));
+  ASSERT_NOK(OptionTypeInfo(0, OptionType::kUInt64T)
+                 .Parse(config_options, "b", "x", &u64));
+  ASSERT_NOK(OptionTypeInfo(0, OptionType::kSizeT)
+                 .Parse(config_options, "b", "x", &sz));
+  ASSERT_NOK(OptionTypeInfo(0, OptionType::kDouble)
+                 .Parse(config_options, "b", "x", &d));
+
+  // Don't know how to convert Unknowns to anything else
+  ASSERT_NOK(OptionTypeInfo(0, OptionType::kUnknown)
+                 .Parse(config_options, "b", "x", &d));
+
+  // Verify that if the parse function throws an exception, it is also trapped
+  OptionTypeInfo func_info(0, OptionType::kUnknown,
+                           OptionVerificationType::kNormal,
+                           OptionTypeFlags::kNone,
+                           [](const ConfigOptions&, const std::string&,
+                              const std::string& value, void* addr) {
+                             auto ptr = static_cast<int*>(addr);
+                             *ptr = ParseInt(value);
+                             return Status::OK();
+                           });
+  ASSERT_OK(func_info.Parse(config_options, "b", "1", &i));
+  ASSERT_NOK(func_info.Parse(config_options, "b", "x", &i));
+}
+
+TEST_F(OptionTypeInfoTest, TestParseFunc) {
+  OptionTypeInfo opt_info(0, OptionType::kUnknown,
+                          OptionVerificationType::kNormal,
+                          OptionTypeFlags::kNone);
+  opt_info.SetParseFunc([](const ConfigOptions& /*opts*/,
+                           const std::string& name, const std::string& value,
+                           void* addr) {
+    auto ptr = static_cast<std::string*>(addr);
+    if (name == "Oops") {
+      return Status::InvalidArgument(value);
+    } else {
+      *ptr = value + " " + name;
+      return Status::OK();
+    }
+  });
+  ConfigOptions config_options;
+  std::string base;
+  ASSERT_OK(opt_info.Parse(config_options, "World", "Hello", &base));
+  ASSERT_EQ(base, "Hello World");
+  ASSERT_NOK(opt_info.Parse(config_options, "Oops", "Hello", &base));
+}
+
+TEST_F(OptionTypeInfoTest, TestSerializeFunc) {
+  OptionTypeInfo opt_info(0, OptionType::kString,
+                          OptionVerificationType::kNormal,
+                          OptionTypeFlags::kNone);
+  opt_info.SetSerializeFunc([](const ConfigOptions& /*opts*/,
+                               const std::string& name, const void* /*addr*/,
+                               std::string* value) {
+    if (name == "Oops") {
+      return Status::InvalidArgument(name);
+    } else {
+      *value = name;
+      return Status::OK();
+    }
+  });
+  ConfigOptions config_options;
+  std::string base;
+  std::string value;
+  ASSERT_OK(opt_info.Serialize(config_options, "Hello", &base, &value));
+  ASSERT_EQ(value, "Hello");
+  ASSERT_NOK(opt_info.Serialize(config_options, "Oops", &base, &value));
+}
+
+TEST_F(OptionTypeInfoTest, TestEqualsFunc) {
+  OptionTypeInfo opt_info(0, OptionType::kInt, OptionVerificationType::kNormal,
+                          OptionTypeFlags::kNone);
+  opt_info.SetEqualsFunc([](const ConfigOptions& /*opts*/,
+                            const std::string& name, const void* addr1,
+                            const void* addr2, std::string* mismatch) {
+    auto i1 = *(static_cast<const int*>(addr1));
+    auto i2 = *(static_cast<const int*>(addr2));
+    if (name == "LT") {
+      return i1 < i2;
+    } else if (name == "GT") {
+      return i1 > i2;
+    } else if (name == "EQ") {
+      return i1 == i2;
+    } else {
+      *mismatch = name + "???";
+      return false;
+    }
+  });
+
+  ConfigOptions config_options;
+  int int1 = 100;
+  int int2 = 200;
+  std::string mismatch;
+  ASSERT_TRUE(opt_info.AreEqual(config_options, "LT", &int1, &int2, &mismatch));
+  ASSERT_EQ(mismatch, "");
+  ASSERT_FALSE(
+      opt_info.AreEqual(config_options, "GT", &int1, &int2, &mismatch));
+  ASSERT_EQ(mismatch, "GT");
+  ASSERT_FALSE(
+      opt_info.AreEqual(config_options, "NO", &int1, &int2, &mismatch));
+  ASSERT_EQ(mismatch, "NO???");
+}
+
+TEST_F(OptionTypeInfoTest, TestPrepareFunc) {
+  OptionTypeInfo opt_info(0, OptionType::kInt, OptionVerificationType::kNormal,
+                          OptionTypeFlags::kNone);
+  opt_info.SetPrepareFunc(
+      [](const ConfigOptions& /*opts*/, const std::string& name, void* addr) {
+        auto i1 = static_cast<int*>(addr);
+        if (name == "x2") {
+          *i1 *= 2;
+        } else if (name == "/2") {
+          *i1 /= 2;
+        } else {
+          return Status::InvalidArgument("Bad Argument", name);
+        }
+        return Status::OK();
+      });
+  ConfigOptions config_options;
+  int int1 = 100;
+  ASSERT_OK(opt_info.Prepare(config_options, "x2", &int1));
+  ASSERT_EQ(int1, 200);
+  ASSERT_OK(opt_info.Prepare(config_options, "/2", &int1));
+  ASSERT_EQ(int1, 100);
+  ASSERT_NOK(opt_info.Prepare(config_options, "??", &int1));
+  ASSERT_EQ(int1, 100);
+}
+TEST_F(OptionTypeInfoTest, TestValidateFunc) {
+  OptionTypeInfo opt_info(0, OptionType::kSizeT,
+                          OptionVerificationType::kNormal,
+                          OptionTypeFlags::kNone);
+  opt_info.SetValidateFunc([](const DBOptions& db_opts,
+                              const ColumnFamilyOptions& cf_opts,
+                              const std::string& name, const void* addr) {
+    const auto sz = static_cast<const size_t*>(addr);
+    bool is_valid = false;
+    if (name == "keep_log_file_num") {
+      is_valid = (*sz == db_opts.keep_log_file_num);
+    } else if (name == "write_buffer_size") {
+      is_valid = (*sz == cf_opts.write_buffer_size);
+    }
+    if (is_valid) {
+      return Status::OK();
+    } else {
+      return Status::InvalidArgument("Mismatched value", name);
+    }
+  });
+  ConfigOptions config_options;
+  DBOptions db_options;
+  ColumnFamilyOptions cf_options;
+
+  ASSERT_OK(opt_info.Validate(db_options, cf_options, "keep_log_file_num",
+                              &db_options.keep_log_file_num));
+  ASSERT_OK(opt_info.Validate(db_options, cf_options, "write_buffer_size",
+                              &cf_options.write_buffer_size));
+  ASSERT_NOK(opt_info.Validate(db_options, cf_options, "keep_log_file_num",
+                               &cf_options.write_buffer_size));
+  ASSERT_NOK(opt_info.Validate(db_options, cf_options, "write_buffer_size",
+                               &db_options.keep_log_file_num));
+}
+
+TEST_F(OptionTypeInfoTest, TestOptionFlags) {
+  OptionTypeInfo opt_none(0, OptionType::kString,
+                          OptionVerificationType::kNormal,
+                          OptionTypeFlags::kDontSerialize);
+  OptionTypeInfo opt_never(0, OptionType::kString,
+                           OptionVerificationType::kNormal,
+                           OptionTypeFlags::kCompareNever);
+  OptionTypeInfo opt_alias(0, OptionType::kString,
+                           OptionVerificationType::kAlias,
+                           OptionTypeFlags::kNone);
+  OptionTypeInfo opt_deprecated(0, OptionType::kString,
+                                OptionVerificationType::kDeprecated,
+                                OptionTypeFlags::kNone);
+  ConfigOptions config_options;
+  std::string opts_str;
+  std::string base = "base";
+  std::string comp = "comp";
+
+  // If marked string none, the serialization returns not supported
+  ASSERT_NOK(opt_none.Serialize(config_options, "None", &base, &opts_str));
+  // If marked never compare, they match even when they do not
+  ASSERT_TRUE(opt_never.AreEqual(config_options, "Never", &base, &comp, &base));
+  ASSERT_FALSE(opt_none.AreEqual(config_options, "Never", &base, &comp, &base));
+
+  // An alias can change the value via parse, but does nothing on serialize on
+  // match
+  std::string result;
+  ASSERT_OK(opt_alias.Parse(config_options, "Alias", "Alias", &base));
+  ASSERT_OK(opt_alias.Serialize(config_options, "Alias", &base, &result));
+  ASSERT_TRUE(
+      opt_alias.AreEqual(config_options, "Alias", &base, &comp, &result));
+  ASSERT_EQ(base, "Alias");
+  ASSERT_NE(base, comp);
+
+  // Deprecated options do nothing on any of the commands
+  ASSERT_OK(opt_deprecated.Parse(config_options, "Alias", "Deprecated", &base));
+  ASSERT_OK(opt_deprecated.Serialize(config_options, "Alias", &base, &result));
+  ASSERT_TRUE(
+      opt_deprecated.AreEqual(config_options, "Alias", &base, &comp, &result));
+  ASSERT_EQ(base, "Alias");
+  ASSERT_NE(base, comp);
+}
+
+TEST_F(OptionTypeInfoTest, TestCustomEnum) {
+  enum TestEnum { kA, kB, kC };
+  std::unordered_map<std::string, TestEnum> enum_map = {
+      {"A", TestEnum::kA},
+      {"B", TestEnum::kB},
+      {"C", TestEnum::kC},
+  };
+  OptionTypeInfo opt_info = OptionTypeInfo::Enum<TestEnum>(0, &enum_map);
+  TestEnum e1, e2;
+  ConfigOptions config_options;
+  std::string result, mismatch;
+
+  e2 = TestEnum::kA;
+
+  ASSERT_OK(opt_info.Parse(config_options, "", "B", &e1));
+  ASSERT_OK(opt_info.Serialize(config_options, "", &e1, &result));
+  ASSERT_EQ(e1, TestEnum::kB);
+  ASSERT_EQ(result, "B");
+
+  ASSERT_FALSE(opt_info.AreEqual(config_options, "Enum", &e1, &e2, &mismatch));
+  ASSERT_EQ(mismatch, "Enum");
+
+  TestParseAndCompareOption(config_options, opt_info, "", "C", &e1, &e2);
+  ASSERT_EQ(e2, TestEnum::kC);
+
+  ASSERT_NOK(opt_info.Parse(config_options, "", "D", &e1));
+  ASSERT_EQ(e1, TestEnum::kC);
+}
+
+TEST_F(OptionTypeInfoTest, TestBuiltinEnum) {
+  ConfigOptions config_options;
+  for (auto iter : OptionsHelper::compaction_style_string_map) {
+    CompactionStyle e1, e2;
+    TestParseAndCompareOption(config_options,
+                              OptionTypeInfo(0, OptionType::kCompactionStyle),
+                              "CompactionStyle", iter.first, &e1, &e2);
+    ASSERT_EQ(e1, iter.second);
+  }
+  for (auto iter : OptionsHelper::compaction_pri_string_map) {
+    CompactionPri e1, e2;
+    TestParseAndCompareOption(config_options,
+                              OptionTypeInfo(0, OptionType::kCompactionPri),
+                              "CompactionPri", iter.first, &e1, &e2);
+    ASSERT_EQ(e1, iter.second);
+  }
+  for (auto iter : OptionsHelper::compression_type_string_map) {
+    CompressionType e1, e2;
+    TestParseAndCompareOption(config_options,
+                              OptionTypeInfo(0, OptionType::kCompressionType),
+                              "CompressionType", iter.first, &e1, &e2);
+    ASSERT_EQ(e1, iter.second);
+  }
+  for (auto iter : OptionsHelper::compaction_stop_style_string_map) {
+    CompactionStopStyle e1, e2;
+    TestParseAndCompareOption(
+        config_options, OptionTypeInfo(0, OptionType::kCompactionStopStyle),
+        "CompactionStopStyle", iter.first, &e1, &e2);
+    ASSERT_EQ(e1, iter.second);
+  }
+  for (auto iter : OptionsHelper::checksum_type_string_map) {
+    ChecksumType e1, e2;
+    TestParseAndCompareOption(config_options,
+                              OptionTypeInfo(0, OptionType::kChecksumType),
+                              "CheckSumType", iter.first, &e1, &e2);
+    ASSERT_EQ(e1, iter.second);
+  }
+  for (auto iter : OptionsHelper::encoding_type_string_map) {
+    EncodingType e1, e2;
+    TestParseAndCompareOption(config_options,
+                              OptionTypeInfo(0, OptionType::kEncodingType),
+                              "EncodingType", iter.first, &e1, &e2);
+    ASSERT_EQ(e1, iter.second);
+  }
+}
+
+TEST_F(OptionTypeInfoTest, TestStruct) {
+  struct Basic {
+    int i = 42;
+    std::string s = "Hello";
+  };
+
+  struct Extended {
+    int j = 11;
+    Basic b;
+  };
+
+  std::unordered_map<std::string, OptionTypeInfo> basic_type_map = {
+      {"i", {offsetof(struct Basic, i), OptionType::kInt}},
+      {"s", {offsetof(struct Basic, s), OptionType::kString}},
+  };
+  OptionTypeInfo basic_info = OptionTypeInfo::Struct(
+      "b", &basic_type_map, 0, OptionVerificationType::kNormal,
+      OptionTypeFlags::kMutable);
+
+  std::unordered_map<std::string, OptionTypeInfo> extended_type_map = {
+      {"j", {offsetof(struct Extended, j), OptionType::kInt}},
+      {"b", OptionTypeInfo::Struct(
+                "b", &basic_type_map, offsetof(struct Extended, b),
+                OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+      {"m", OptionTypeInfo::Struct(
+                "m", &basic_type_map, offsetof(struct Extended, b),
+                OptionVerificationType::kNormal, OptionTypeFlags::kMutable)},
+  };
+  OptionTypeInfo extended_info = OptionTypeInfo::Struct(
+      "e", &extended_type_map, 0, OptionVerificationType::kNormal,
+      OptionTypeFlags::kMutable);
+  Extended e1, e2;
+  ConfigOptions config_options;
+  std::string mismatch;
+  TestParseAndCompareOption(config_options, basic_info, "b", "{i=33;s=33}",
+                            &e1.b, &e2.b);
+  ASSERT_EQ(e1.b.i, 33);
+  ASSERT_EQ(e1.b.s, "33");
+
+  TestParseAndCompareOption(config_options, basic_info, "b.i", "44", &e1.b,
+                            &e2.b);
+  ASSERT_EQ(e1.b.i, 44);
+
+  TestParseAndCompareOption(config_options, basic_info, "i", "55", &e1.b,
+                            &e2.b);
+  ASSERT_EQ(e1.b.i, 55);
+
+  e1.b.i = 0;
+
+  ASSERT_FALSE(
+      basic_info.AreEqual(config_options, "b", &e1.b, &e2.b, &mismatch));
+  ASSERT_EQ(mismatch, "b.i");
+  mismatch.clear();
+  ASSERT_FALSE(
+      basic_info.AreEqual(config_options, "b.i", &e1.b, &e2.b, &mismatch));
+  ASSERT_EQ(mismatch, "b.i");
+  mismatch.clear();
+  ASSERT_FALSE(
+      basic_info.AreEqual(config_options, "i", &e1.b, &e2.b, &mismatch));
+  ASSERT_EQ(mismatch, "b.i");
+  mismatch.clear();
+
+  e1 = e2;
+  ASSERT_NOK(basic_info.Parse(config_options, "b", "{i=33;s=33;j=44}", &e1.b));
+  ASSERT_NOK(basic_info.Parse(config_options, "b.j", "44", &e1.b));
+  ASSERT_NOK(basic_info.Parse(config_options, "j", "44", &e1.b));
+
+  TestParseAndCompareOption(config_options, extended_info, "e",
+                            "b={i=55;s=55}; j=22;", &e1, &e2);
+  ASSERT_EQ(e1.b.i, 55);
+  ASSERT_EQ(e1.j, 22);
+  ASSERT_EQ(e1.b.s, "55");
+  TestParseAndCompareOption(config_options, extended_info, "e.b",
+                            "{i=66;s=66;}", &e1, &e2);
+  ASSERT_EQ(e1.b.i, 66);
+  ASSERT_EQ(e1.j, 22);
+  ASSERT_EQ(e1.b.s, "66");
+  TestParseAndCompareOption(config_options, extended_info, "e.b.i", "77", &e1,
+                            &e2);
+  ASSERT_EQ(e1.b.i, 77);
+  ASSERT_EQ(e1.j, 22);
+  ASSERT_EQ(e1.b.s, "66");
+}
+
+TEST_F(OptionTypeInfoTest, TestArrayType) {
+  OptionTypeInfo array_info = OptionTypeInfo::Array<std::string, 4>(
+      0, OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+      {0, OptionType::kString});
+  std::array<std::string, 4> array1, array2;
+  std::string mismatch;
+
+  ConfigOptions config_options;
+  TestParseAndCompareOption(config_options, array_info, "v", "a:b:c:d", &array1,
+                            &array2);
+
+  ASSERT_EQ(array1.size(), 4);
+  ASSERT_EQ(array1[0], "a");
+  ASSERT_EQ(array1[1], "b");
+  ASSERT_EQ(array1[2], "c");
+  ASSERT_EQ(array1[3], "d");
+  array1[3] = "e";
+  ASSERT_FALSE(
+      array_info.AreEqual(config_options, "v", &array1, &array2, &mismatch));
+  ASSERT_EQ(mismatch, "v");
+
+  // Test vectors with inner brackets
+  TestParseAndCompareOption(config_options, array_info, "v", "a:{b}:c:d",
+                            &array1, &array2);
+  ASSERT_EQ(array1.size(), 4);
+  ASSERT_EQ(array1[0], "a");
+  ASSERT_EQ(array1[1], "b");
+  ASSERT_EQ(array1[2], "c");
+  ASSERT_EQ(array1[3], "d");
+
+  std::array<std::string, 3> array3, array4;
+  OptionTypeInfo bar_info = OptionTypeInfo::Array<std::string, 3>(
+      0, OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+      {0, OptionType::kString}, '|');
+  TestParseAndCompareOption(config_options, bar_info, "v", "x|y|z", &array3,
+                            &array4);
+
+  // Test arrays with inner array
+  TestParseAndCompareOption(config_options, bar_info, "v",
+                            "a|{b1|b2}|{c1|c2|{d1|d2}}", &array3, &array4,
+                            false);
+  ASSERT_EQ(array3.size(), 3);
+  ASSERT_EQ(array3[0], "a");
+  ASSERT_EQ(array3[1], "b1|b2");
+  ASSERT_EQ(array3[2], "c1|c2|{d1|d2}");
+
+  TestParseAndCompareOption(config_options, bar_info, "v",
+                            "{a1|a2}|{b1|{c1|c2}}|d1", &array3, &array4, true);
+  ASSERT_EQ(array3.size(), 3);
+  ASSERT_EQ(array3[0], "a1|a2");
+  ASSERT_EQ(array3[1], "b1|{c1|c2}");
+  ASSERT_EQ(array3[2], "d1");
+
+  // Test invalid input: less element than requested
+  auto s = bar_info.Parse(config_options, "opt_name1", "a1|a2", &array3);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Test invalid input: more element than requested
+  s = bar_info.Parse(config_options, "opt_name2", "a1|b|c1|d3", &array3);
+  ASSERT_TRUE(s.IsInvalidArgument());
+}
+
+TEST_F(OptionTypeInfoTest, TestVectorType) {
+  OptionTypeInfo vec_info = OptionTypeInfo::Vector<std::string>(
+      0, OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+      {0, OptionType::kString});
+  std::vector<std::string> vec1, vec2;
+  std::string mismatch;
+
+  ConfigOptions config_options;
+  TestParseAndCompareOption(config_options, vec_info, "v", "a:b:c:d", &vec1,
+                            &vec2);
+  ASSERT_EQ(vec1.size(), 4);
+  ASSERT_EQ(vec1[0], "a");
+  ASSERT_EQ(vec1[1], "b");
+  ASSERT_EQ(vec1[2], "c");
+  ASSERT_EQ(vec1[3], "d");
+  vec1[3] = "e";
+  ASSERT_FALSE(vec_info.AreEqual(config_options, "v", &vec1, &vec2, &mismatch));
+  ASSERT_EQ(mismatch, "v");
+
+  // Test vectors with inner brackets
+  TestParseAndCompareOption(config_options, vec_info, "v", "a:{b}:c:d", &vec1,
+                            &vec2);
+  ASSERT_EQ(vec1.size(), 4);
+  ASSERT_EQ(vec1[0], "a");
+  ASSERT_EQ(vec1[1], "b");
+  ASSERT_EQ(vec1[2], "c");
+  ASSERT_EQ(vec1[3], "d");
+
+  OptionTypeInfo bar_info = OptionTypeInfo::Vector<std::string>(
+      0, OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+      {0, OptionType::kString}, '|');
+  TestParseAndCompareOption(config_options, vec_info, "v", "x|y|z", &vec1,
+                            &vec2);
+  // Test vectors with inner vector
+  TestParseAndCompareOption(config_options, bar_info, "v",
+                            "a|{b1|b2}|{c1|c2|{d1|d2}}", &vec1, &vec2, false);
+  ASSERT_EQ(vec1.size(), 3);
+  ASSERT_EQ(vec1[0], "a");
+  ASSERT_EQ(vec1[1], "b1|b2");
+  ASSERT_EQ(vec1[2], "c1|c2|{d1|d2}");
+
+  TestParseAndCompareOption(config_options, bar_info, "v",
+                            "{a1|a2}|{b1|{c1|c2}}|d1", &vec1, &vec2, true);
+  ASSERT_EQ(vec1.size(), 3);
+  ASSERT_EQ(vec1[0], "a1|a2");
+  ASSERT_EQ(vec1[1], "b1|{c1|c2}");
+  ASSERT_EQ(vec1[2], "d1");
+
+  TestParseAndCompareOption(config_options, bar_info, "v", "{a1}", &vec1, &vec2,
+                            false);
+  ASSERT_EQ(vec1.size(), 1);
+  ASSERT_EQ(vec1[0], "a1");
+
+  TestParseAndCompareOption(config_options, bar_info, "v", "{a1|a2}|{b1|b2}",
+                            &vec1, &vec2, true);
+  ASSERT_EQ(vec1.size(), 2);
+  ASSERT_EQ(vec1[0], "a1|a2");
+  ASSERT_EQ(vec1[1], "b1|b2");
+}
+
+TEST_F(OptionTypeInfoTest, TestStaticType) {
+  struct SimpleOptions {
+    size_t size = 0;
+    bool verify = true;
+  };
+
+  static std::unordered_map<std::string, OptionTypeInfo> type_map = {
+      {"size", {offsetof(struct SimpleOptions, size), OptionType::kSizeT}},
+      {"verify",
+       {offsetof(struct SimpleOptions, verify), OptionType::kBoolean}},
+  };
+
+  ConfigOptions config_options;
+  SimpleOptions opts, copy;
+  opts.size = 12345;
+  opts.verify = false;
+  std::string str, mismatch;
+
+  ASSERT_OK(
+      OptionTypeInfo::SerializeType(config_options, type_map, &opts, &str));
+  ASSERT_FALSE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts,
+                                             &copy, &mismatch));
+  ASSERT_OK(OptionTypeInfo::ParseType(config_options, str, type_map, &copy));
+  ASSERT_TRUE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts,
+                                            &copy, &mismatch));
+}
+
+class ConfigOptionsTest : public testing::Test {};
+
+TEST_F(ConfigOptionsTest, EnvFromConfigOptions) {
+  ConfigOptions config_options;
+  DBOptions db_opts;
+  Options opts;
+  Env* mem_env = NewMemEnv(Env::Default());
+  config_options.registry->AddLibrary("custom-env", RegisterCustomEnv,
+                                      kCustomEnvName);
+
+  config_options.env = mem_env;
+  // First test that we can get the env as expected
+  ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(), kCustomEnvProp,
+                                   &db_opts));
+  ASSERT_OK(
+      GetOptionsFromString(config_options, Options(), kCustomEnvProp, &opts));
+  ASSERT_NE(config_options.env, db_opts.env);
+  ASSERT_EQ(opts.env, db_opts.env);
+  Env* custom_env = db_opts.env;
+
+  // Now try a "bad" env" and check that nothing changed
+  config_options.ignore_unsupported_options = true;
+  ASSERT_OK(
+      GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts));
+  ASSERT_OK(GetOptionsFromString(config_options, opts, "env=unknown", &opts));
+  ASSERT_EQ(config_options.env, mem_env);
+  ASSERT_EQ(db_opts.env, custom_env);
+  ASSERT_EQ(opts.env, db_opts.env);
+
+  // Now try a "bad" env" ignoring unknown objects
+  config_options.ignore_unsupported_options = false;
+  ASSERT_NOK(
+      GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts));
+  ASSERT_EQ(config_options.env, mem_env);
+  ASSERT_EQ(db_opts.env, custom_env);
+  ASSERT_EQ(opts.env, db_opts.env);
+
+  delete mem_env;
+}
+TEST_F(ConfigOptionsTest, MergeOperatorFromString) {
+  ConfigOptions config_options;
+  std::shared_ptr<MergeOperator> merge_op;
+
+  ASSERT_OK(MergeOperator::CreateFromString(config_options, "put", &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("put"));
+  ASSERT_STREQ(merge_op->Name(), "PutOperator");
+
+  ASSERT_OK(
+      MergeOperator::CreateFromString(config_options, "put_v1", &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("PutOperator"));
+
+  ASSERT_OK(
+      MergeOperator::CreateFromString(config_options, "uint64add", &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("uint64add"));
+  ASSERT_STREQ(merge_op->Name(), "UInt64AddOperator");
+
+  ASSERT_OK(MergeOperator::CreateFromString(config_options, "max", &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("max"));
+  ASSERT_STREQ(merge_op->Name(), "MaxOperator");
+
+  ASSERT_OK(
+      MergeOperator::CreateFromString(config_options, "bytesxor", &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("bytesxor"));
+  ASSERT_STREQ(merge_op->Name(), BytesXOROperator::kClassName());
+
+  ASSERT_OK(
+      MergeOperator::CreateFromString(config_options, "sortlist", &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("sortlist"));
+  ASSERT_STREQ(merge_op->Name(), SortList::kClassName());
+
+  ASSERT_OK(MergeOperator::CreateFromString(config_options, "stringappend",
+                                            &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("stringappend"));
+  ASSERT_STREQ(merge_op->Name(), StringAppendOperator::kClassName());
+  auto delimiter = merge_op->GetOptions<std::string>("Delimiter");
+  ASSERT_NE(delimiter, nullptr);
+  ASSERT_EQ(*delimiter, ",");
+
+  ASSERT_OK(MergeOperator::CreateFromString(config_options, "stringappendtest",
+                                            &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("stringappendtest"));
+  ASSERT_STREQ(merge_op->Name(), StringAppendTESTOperator::kClassName());
+  delimiter = merge_op->GetOptions<std::string>("Delimiter");
+  ASSERT_NE(delimiter, nullptr);
+  ASSERT_EQ(*delimiter, ",");
+
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options, "id=stringappend; delimiter=||", &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("stringappend"));
+  ASSERT_STREQ(merge_op->Name(), StringAppendOperator::kClassName());
+  delimiter = merge_op->GetOptions<std::string>("Delimiter");
+  ASSERT_NE(delimiter, nullptr);
+  ASSERT_EQ(*delimiter, "||");
+
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options, "id=stringappendtest; delimiter=&&", &merge_op));
+  ASSERT_NE(merge_op, nullptr);
+  ASSERT_TRUE(merge_op->IsInstanceOf("stringappendtest"));
+  ASSERT_STREQ(merge_op->Name(), StringAppendTESTOperator::kClassName());
+  delimiter = merge_op->GetOptions<std::string>("Delimiter");
+  ASSERT_NE(delimiter, nullptr);
+  ASSERT_EQ(*delimiter, "&&");
+
+  std::shared_ptr<MergeOperator> copy;
+  std::string mismatch;
+  std::string opts_str = merge_op->ToString(config_options);
+
+  ASSERT_OK(MergeOperator::CreateFromString(config_options, opts_str, &copy));
+  ASSERT_TRUE(merge_op->AreEquivalent(config_options, copy.get(), &mismatch));
+  ASSERT_NE(copy, nullptr);
+  delimiter = copy->GetOptions<std::string>("Delimiter");
+  ASSERT_NE(delimiter, nullptr);
+  ASSERT_EQ(*delimiter, "&&");
+}
+
+TEST_F(ConfigOptionsTest, ConfiguringOptionsDoesNotRevertRateLimiterBandwidth) {
+  // Regression test for bug where rate limiter's dynamically set bandwidth
+  // could be silently reverted when configuring an options structure with an
+  // existing `rate_limiter`.
+  Options base_options;
+  base_options.rate_limiter.reset(
+      NewGenericRateLimiter(1 << 20 /* rate_bytes_per_sec */));
+  Options copy_options(base_options);
+
+  base_options.rate_limiter->SetBytesPerSecond(2 << 20);
+  ASSERT_EQ(2 << 20, base_options.rate_limiter->GetBytesPerSecond());
+
+  ASSERT_OK(GetOptionsFromString(base_options, "", &copy_options));
+  ASSERT_EQ(2 << 20, base_options.rate_limiter->GetBytesPerSecond());
+}
+
+INSTANTIATE_TEST_CASE_P(OptionsSanityCheckTest, OptionsSanityCheckTest,
+                        ::testing::Bool());
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/plugin/README.md b/src/rocksdb/plugin/README.md
new file mode 100644
index 000000000..5cd899468
--- /dev/null
+++ b/src/rocksdb/plugin/README.md
@@ -0,0 +1,43 @@
+## Building external plugins together with RocksDB
+
+RocksDB offers several plugin interfaces for developers to customize its behavior. One difficulty developers face is how to make their plugin available to end users. The approach discussed here involves building the external code together with the RocksDB code into a single binary. Note another approach we plan to support involves loading plugins dynamically from shared libraries.
+
+### Discovery
+
+We hope developers will mention their work in "PLUGINS.md" so users can easily discover and reuse solutions for customizing RocksDB.
+
+### Directory organization
+
+External plugins will be linked according to their name into a subdirectory of "plugin/". For example, a plugin called "dedupfs" would be linked into "plugin/dedupfs/".
+
+### Build standard
+
+Currently the only supported build system are make and cmake.
+
+For make, files in the plugin directory ending in the .mk extension can define the following variables.
+
+* `$(PLUGIN_NAME)_SOURCES`: these files will be compiled and linked with RocksDB. They can access RocksDB public header files.
+* `$(PLUGIN_NAME)_HEADERS`: these files will be installed in the RocksDB header directory. Their paths will be prefixed by "rocksdb/plugin/$(PLUGIN_NAME)/".
+* `$(PLUGIN_NAME)_LDFLAGS`: these flags will be passed to the final link step. For example, library dependencies can be propagated here, or symbols can be forcibly included, e.g., for static registration.
+* `$(PLUGIN_NAME)_CXXFLAGS`: these flags will be passed to the compiler. For example, they can specify locations of header files in non-standard locations.
+
+Users will run the usual make commands from the RocksDB directory, specifying the plugins to include in a space-separated list in the variable `ROCKSDB_PLUGINS`.
+
+For CMake, the CMakeLists.txt file in the plugin directory can define the following variables.
+
+* `${PLUGIN_NAME}_SOURCES`: these files will be compiled and linked with RocksDB. They can access RocksDB public header files.
+* `${PLUGIN_NAME}_COMPILE_FLAGS`: these flags will be passed to the compiler. For example, they can specify locations of header files in non-standard locations.
+* `${PLUGIN_NAME}_INCLUDE_PATHS`: paths to directories to search for plugin-specific header files during compilation.
+* `${PLUGIN_NAME}_LIBS`: list of library names required to build the plugin, e.g. `dl`, `java`, `jvm`, `rados`, etc. CMake will generate proper flags for linking.
+* `${PLUGIN_NAME}_LINK_PATHS`: list of paths for the linker to search for required libraries in additional to standard locations.
+* `${PLUGIN_NAME}_CMAKE_SHARED_LINKER_FLAGS` additional linker flags used to generate shared libraries. For example, symbols can be forcibly included, e.g., for static registration.
+* `${PLUGIN_NAME}_CMAKE_EXE_LINKER_FLAGS`: additional linker flags used to generate executables. For example, symbols can be forcibly included, e.g., for static registration.
+
+Users will run the usual cmake commands, specifying the plugins to include in a space-separated list in the command line variable `ROCKSDB_PLUGINS` when invoking cmake.
+```
+cmake .. -DROCKSDB_PLUGINS="dedupfs hdfs rados"
+```
+
+### Example
+
+For a working example, see [Dedupfs](https://github.com/ajkr/dedupfs).
diff --git a/src/rocksdb/port/README b/src/rocksdb/port/README
new file mode 100644
index 000000000..422563e25
--- /dev/null
+++ b/src/rocksdb/port/README
@@ -0,0 +1,10 @@
+This directory contains interfaces and implementations that isolate the
+rest of the package from platform details.
+
+Code in the rest of the package includes "port.h" from this directory.
+"port.h" in turn includes a platform specific "port_<platform>.h" file
+that provides the platform specific implementation.
+
+See port_posix.h for an example of what must be provided in a platform
+specific header file.
+
diff --git a/src/rocksdb/port/jemalloc_helper.h b/src/rocksdb/port/jemalloc_helper.h
new file mode 100644
index 000000000..f085f6226
--- /dev/null
+++ b/src/rocksdb/port/jemalloc_helper.h
@@ -0,0 +1,107 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#if defined(__clang__) && defined(__GLIBC__)
+// glibc's `posix_memalign()` declaration specifies `throw()` while clang's
+// declaration does not. There is a hack in clang to make its re-declaration
+// compatible with glibc's if they are declared consecutively. That hack breaks
+// if yet another `posix_memalign()` declaration comes between glibc's and
+// clang's declarations. Include "mm_malloc.h" here ensures glibc's and clang's
+// declarations both come before "jemalloc.h"'s `posix_memalign()` declaration.
+//
+// This problem could also be avoided if "jemalloc.h"'s `posix_memalign()`
+// declaration did not specify `throw()` when built with clang.
+#include <mm_malloc.h>
+#endif
+
+#ifdef ROCKSDB_JEMALLOC
+#ifdef __FreeBSD__
+#include <malloc_np.h>
+#define JEMALLOC_USABLE_SIZE_CONST const
+#else
+#define JEMALLOC_MANGLE
+#include <jemalloc/jemalloc.h>
+#endif
+
+#ifndef JEMALLOC_CXX_THROW
+#define JEMALLOC_CXX_THROW
+#endif
+
+#if defined(OS_WIN) && defined(_MSC_VER)
+
+// MSVC does not have weak symbol support. As long as ROCKSDB_JEMALLOC is
+// defined, Jemalloc memory allocator is used.
+static inline bool HasJemalloc() { return true; }
+
+#else
+
+// definitions for compatibility with older versions of jemalloc
+#if !defined(JEMALLOC_ALLOCATOR)
+#define JEMALLOC_ALLOCATOR
+#endif
+#if !defined(JEMALLOC_RESTRICT_RETURN)
+#define JEMALLOC_RESTRICT_RETURN
+#endif
+#if !defined(JEMALLOC_NOTHROW)
+#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow)
+#endif
+#if !defined(JEMALLOC_ALLOC_SIZE)
+#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE
+#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s))
+#else
+#define JEMALLOC_ALLOC_SIZE(s)
+#endif
+#endif
+
+// Declare non-standard jemalloc APIs as weak symbols. We can null-check these
+// symbols to detect whether jemalloc is linked with the binary.
+extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *
+mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
+    __attribute__((__weak__));
+extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *
+rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int)
+    __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int)
+    JEMALLOC_ATTR(pure) __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int)
+    __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure)
+    __attribute__((__weak__));
+extern "C" int JEMALLOC_NOTHROW mallctl(const char *, void *, size_t *, void *,
+                                        size_t) __attribute__((__weak__));
+extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *,
+                                                 size_t *)
+    __attribute__((__weak__));
+extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *,
+                                             size_t *, void *, size_t)
+    __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW
+malloc_stats_print(void (*)(void *, const char *), void *, const char *)
+    __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW
+malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW
+    __attribute__((__weak__));
+
+// Check if Jemalloc is linked with the binary. Note the main program might be
+// using a different memory allocator even this method return true.
+// It is loosely based on folly::usingJEMalloc(), minus the check that actually
+// allocate memory and see if it is through jemalloc, to handle the dlopen()
+// case:
+// https://github.com/facebook/folly/blob/76cf8b5841fb33137cfbf8b224f0226437c855bc/folly/memory/Malloc.h#L147
+static inline bool HasJemalloc() {
+  return mallocx != nullptr && rallocx != nullptr && xallocx != nullptr &&
+         sallocx != nullptr && dallocx != nullptr && sdallocx != nullptr &&
+         nallocx != nullptr && mallctl != nullptr &&
+         mallctlnametomib != nullptr && mallctlbymib != nullptr &&
+         malloc_stats_print != nullptr && malloc_usable_size != nullptr;
+}
+
+#endif
+
+#endif  // ROCKSDB_JEMALLOC
diff --git a/src/rocksdb/port/lang.h b/src/rocksdb/port/lang.h
new file mode 100644
index 000000000..52c597acd
--- /dev/null
+++ b/src/rocksdb/port/lang.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef FALLTHROUGH_INTENDED
+#if defined(__clang__)
+#define FALLTHROUGH_INTENDED [[clang::fallthrough]]
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define FALLTHROUGH_INTENDED [[gnu::fallthrough]]
+#else
+#define FALLTHROUGH_INTENDED \
+  do {                       \
+  } while (0)
+#endif
+#endif
+
+#define DECLARE_DEFAULT_MOVES(Name) \
+  Name(Name&&) noexcept = default;  \
+  Name& operator=(Name&&) = default
+
+// ASAN (Address sanitizer)
+
+#if defined(__clang__)
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define MUST_FREE_HEAP_ALLOCATIONS 1
+#endif  // __has_feature(address_sanitizer)
+#endif  // defined(__has_feature)
+#else   // __clang__
+#ifdef __SANITIZE_ADDRESS__
+#define MUST_FREE_HEAP_ALLOCATIONS 1
+#endif  // __SANITIZE_ADDRESS__
+#endif  // __clang__
+
+#ifdef ROCKSDB_VALGRIND_RUN
+#define MUST_FREE_HEAP_ALLOCATIONS 1
+#endif  // ROCKSDB_VALGRIND_RUN
+
+// Coding guidelines say to avoid static objects with non-trivial destructors,
+// because it's easy to cause trouble (UB) in static destruction. This
+// macro makes it easier to define static objects that are normally never
+// destructed, except are destructed when running under ASAN. This should
+// avoid unexpected, unnecessary destruction behavior in production.
+// Note that constructor arguments can be provided as in
+//   STATIC_AVOID_DESTRUCTION(Foo, foo)(arg1, arg2);
+#ifdef MUST_FREE_HEAP_ALLOCATIONS
+#define STATIC_AVOID_DESTRUCTION(Type, name) static Type name
+constexpr bool kMustFreeHeapAllocations = true;
+#else
+#define STATIC_AVOID_DESTRUCTION(Type, name) static Type& name = *new Type
+constexpr bool kMustFreeHeapAllocations = false;
+#endif
+
+// TSAN (Thread sanitizer)
+
+// For simplicity, standardize on the GCC define
+#if defined(__clang__)
+#if defined(__has_feature) && __has_feature(thread_sanitizer)
+#define __SANITIZE_THREAD__ 1
+#endif  // __has_feature(thread_sanitizer)
+#endif  // __clang__
+
+#ifdef __SANITIZE_THREAD__
+#define TSAN_SUPPRESSION __attribute__((no_sanitize("thread")))
+#else
+#define TSAN_SUPPRESSION
+#endif  // TSAN_SUPPRESSION
diff --git a/src/rocksdb/port/likely.h b/src/rocksdb/port/likely.h
new file mode 100644
index 000000000..0bd90d701
--- /dev/null
+++ b/src/rocksdb/port/likely.h
@@ -0,0 +1,18 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIKELY(x) (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
diff --git a/src/rocksdb/port/malloc.h b/src/rocksdb/port/malloc.h
new file mode 100644
index 000000000..f973263e2
--- /dev/null
+++ b/src/rocksdb/port/malloc.h
@@ -0,0 +1,17 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#ifdef OS_FREEBSD
+#include <malloc_np.h>
+#else
+#include <malloc.h>
+#endif  // OS_FREEBSD
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
diff --git a/src/rocksdb/port/port.h b/src/rocksdb/port/port.h
new file mode 100644
index 000000000..13aa56d47
--- /dev/null
+++ b/src/rocksdb/port/port.h
@@ -0,0 +1,21 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+
+// Include the appropriate platform specific file below.  If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#include "port/port_posix.h"
+#elif defined(OS_WIN)
+#include "port/win/port_win.h"
+#endif
diff --git a/src/rocksdb/port/port_dirent.h b/src/rocksdb/port/port_dirent.h
new file mode 100644
index 000000000..2b23e2f07
--- /dev/null
+++ b/src/rocksdb/port/port_dirent.h
@@ -0,0 +1,44 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#pragma once
+
+#ifdef ROCKSDB_PLATFORM_POSIX
+#include <dirent.h>
+#include <sys/types.h>
+#elif defined(OS_WIN)
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+struct dirent {
+  char d_name[_MAX_PATH]; /* filename */
+};
+
+struct DIR;
+
+DIR* opendir(const char* name);
+
+dirent* readdir(DIR* dirp);
+
+int closedir(DIR* dirp);
+
+}  // namespace port
+
+using port::closedir;
+using port::DIR;
+using port::dirent;
+using port::opendir;
+using port::readdir;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // OS_WIN
diff --git a/src/rocksdb/port/port_example.h b/src/rocksdb/port/port_example.h
new file mode 100644
index 000000000..794149a69
--- /dev/null
+++ b/src/rocksdb/port/port_example.h
@@ -0,0 +1,101 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This file contains the specification, but not the implementations,
+// of the types/operations/etc. that should be defined by a platform
+// specific port_<platform>.h file.  Use this file as a reference for
+// how to port this package to a new platform.
+
+#pragma once
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// TODO(jorlow): Many of these belong more in the environment class rather than
+//               here. We should try moving them and see if it affects perf.
+
+// The following boolean constant must be true on a little-endian machine
+// and false otherwise.
+static const bool kLittleEndian = true /* or some other expression */;
+
+// ------------------ Threading -------------------
+
+// A Mutex represents an exclusive lock.
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+
+  // Lock the mutex.  Waits until other lockers have exited.
+  // Will deadlock if the mutex is already locked by this thread.
+  void Lock();
+
+  // Unlock the mutex.
+  // REQUIRES: This mutex was locked by this thread.
+  void Unlock();
+
+  // Optionally crash if this thread does not hold this mutex.
+  // The implementation must be fast, especially if NDEBUG is
+  // defined.  The implementation is allowed to skip all checks.
+  void AssertHeld();
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+
+  // Atomically release *mu and block on this condition variable until
+  // either a call to SignalAll(), or a call to Signal() that picks
+  // this thread to wakeup.
+  // REQUIRES: this thread holds *mu
+  void Wait();
+
+  // If there are some threads waiting, wake up at least one of them.
+  void Signal();
+
+  // Wake up all waiting threads.
+  void SignallAll();
+};
+
+// Thread-safe initialization.
+// Used as follows:
+//      static port::OnceType init_control = LEVELDB_ONCE_INIT;
+//      static void Initializer() { ... do something ...; }
+//      ...
+//      port::InitOnce(&init_control, &Initializer);
+using OnceType = intptr_t;
+#define LEVELDB_ONCE_INIT 0
+extern void InitOnce(port::OnceType*, void (*initializer)());
+
+// ------------------ Compression -------------------
+
+// Store the snappy compression of "input[0,input_length-1]" in *output.
+// Returns false if snappy is not supported by this port.
+extern bool Snappy_Compress(const char* input, size_t input_length,
+                            std::string* output);
+
+// If input[0,input_length-1] looks like a valid snappy compressed
+// buffer, store the size of the uncompressed data in *result and
+// return true.  Else return false.
+extern bool Snappy_GetUncompressedLength(const char* input, size_t length,
+                                         size_t* result);
+
+// Attempt to snappy uncompress input[0,input_length-1] into *output.
+// Returns true if successful, false if the input is invalid lightweight
+// compressed data.
+//
+// REQUIRES: at least the first "n" bytes of output[] must be writable
+// where "n" is the result of a successful call to
+// Snappy_GetUncompressedLength.
+extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
+                              char* output);
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/port_posix.cc b/src/rocksdb/port/port_posix.cc
new file mode 100644
index 000000000..3872293b8
--- /dev/null
+++ b/src/rocksdb/port/port_posix.cc
@@ -0,0 +1,300 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if !defined(OS_WIN)
+
+#include "port/port_posix.h"
+
+#include <assert.h>
+#if defined(__i386__) || defined(__x86_64__)
+#include <cpuid.h>
+#endif
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// We want to give users opportunity to default all the mutexes to adaptive if
+// not specified otherwise. This enables a quick way to conduct various
+// performance related experiements.
+//
+// NB! Support for adaptive mutexes is turned on by definining
+// ROCKSDB_PTHREAD_ADAPTIVE_MUTEX during the compilation. If you use RocksDB
+// build environment then this happens automatically; otherwise it's up to the
+// consumer to define the identifier.
+#ifdef ROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX
+extern const bool kDefaultToAdaptiveMutex = true;
+#else
+extern const bool kDefaultToAdaptiveMutex = false;
+#endif
+
+namespace port {
+
+static int PthreadCall(const char* label, int result) {
+  if (result != 0 && result != ETIMEDOUT && result != EBUSY) {
+    fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str());
+    abort();
+  }
+  return result;
+}
+
+Mutex::Mutex(bool adaptive) {
+  (void)adaptive;
+#ifdef ROCKSDB_PTHREAD_ADAPTIVE_MUTEX
+  if (!adaptive) {
+    PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr));
+  } else {
+    pthread_mutexattr_t mutex_attr;
+    PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr));
+    PthreadCall("set mutex attr", pthread_mutexattr_settype(
+                                      &mutex_attr, PTHREAD_MUTEX_ADAPTIVE_NP));
+    PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr));
+    PthreadCall("destroy mutex attr", pthread_mutexattr_destroy(&mutex_attr));
+  }
+#else
+  PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr));
+#endif  // ROCKSDB_PTHREAD_ADAPTIVE_MUTEX
+}
+
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+
+void Mutex::Lock() {
+  PthreadCall("lock", pthread_mutex_lock(&mu_));
+#ifndef NDEBUG
+  locked_ = true;
+#endif
+}
+
+void Mutex::Unlock() {
+#ifndef NDEBUG
+  locked_ = false;
+#endif
+  PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+bool Mutex::TryLock() {
+  bool ret = PthreadCall("trylock", pthread_mutex_trylock(&mu_)) == 0;
+#ifndef NDEBUG
+  if (ret) {
+    locked_ = true;
+  }
+#endif
+  return ret;
+}
+
+void Mutex::AssertHeld() {
+#ifndef NDEBUG
+  assert(locked_);
+#endif
+}
+
+CondVar::CondVar(Mutex* mu) : mu_(mu) {
+  PthreadCall("init cv", pthread_cond_init(&cv_, nullptr));
+}
+
+CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }
+
+void CondVar::Wait() {
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+}
+
+bool CondVar::TimedWait(uint64_t abs_time_us) {
+  struct timespec ts;
+  ts.tv_sec = static_cast<time_t>(abs_time_us / 1000000);
+  ts.tv_nsec = static_cast<suseconds_t>((abs_time_us % 1000000) * 1000);
+
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  int err = pthread_cond_timedwait(&cv_, &mu_->mu_, &ts);
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+  if (err == ETIMEDOUT) {
+    return true;
+  }
+  if (err != 0) {
+    PthreadCall("timedwait", err);
+  }
+  return false;
+}
+
+void CondVar::Signal() { PthreadCall("signal", pthread_cond_signal(&cv_)); }
+
+void CondVar::SignalAll() {
+  PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+RWMutex::RWMutex() {
+  PthreadCall("init mutex", pthread_rwlock_init(&mu_, nullptr));
+}
+
+RWMutex::~RWMutex() {
+  PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_));
+}
+
+void RWMutex::ReadLock() {
+  PthreadCall("read lock", pthread_rwlock_rdlock(&mu_));
+}
+
+void RWMutex::WriteLock() {
+  PthreadCall("write lock", pthread_rwlock_wrlock(&mu_));
+}
+
+void RWMutex::ReadUnlock() {
+  PthreadCall("read unlock", pthread_rwlock_unlock(&mu_));
+}
+
+void RWMutex::WriteUnlock() {
+  PthreadCall("write unlock", pthread_rwlock_unlock(&mu_));
+}
+
+int PhysicalCoreID() {
+#if defined(ROCKSDB_SCHED_GETCPU_PRESENT) && defined(__x86_64__) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 22))
+  // sched_getcpu uses VDSO getcpu() syscall since 2.22. I believe Linux offers
+  // VDSO support only on x86_64. This is the fastest/preferred method if
+  // available.
+  int cpuno = sched_getcpu();
+  if (cpuno < 0) {
+    return -1;
+  }
+  return cpuno;
+#elif defined(__x86_64__) || defined(__i386__)
+  // clang/gcc both provide cpuid.h, which defines __get_cpuid(), for x86_64 and
+  // i386.
+  unsigned eax, ebx = 0, ecx, edx;
+  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+    return -1;
+  }
+  return ebx >> 24;
+#else
+  // give up, the caller can generate a random number or something.
+  return -1;
+#endif
+}
+
+void InitOnce(OnceType* once, void (*initializer)()) {
+  PthreadCall("once", pthread_once(once, initializer));
+}
+
+void Crash(const std::string& srcfile, int srcline) {
+  fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
+  fflush(stdout);
+  kill(getpid(), SIGTERM);
+}
+
+int GetMaxOpenFiles() {
+#if defined(RLIMIT_NOFILE)
+  struct rlimit no_files_limit;
+  if (getrlimit(RLIMIT_NOFILE, &no_files_limit) != 0) {
+    return -1;
+  }
+  // protect against overflow
+  if (static_cast<uintmax_t>(no_files_limit.rlim_cur) >=
+      static_cast<uintmax_t>(std::numeric_limits<int>::max())) {
+    return std::numeric_limits<int>::max();
+  }
+  return static_cast<int>(no_files_limit.rlim_cur);
+#endif
+  return -1;
+}
+
+void* cacheline_aligned_alloc(size_t size) {
+#if __GNUC__ < 5 && defined(__SANITIZE_ADDRESS__)
+  return malloc(size);
+#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__))
+  void* m;
+  errno = posix_memalign(&m, CACHE_LINE_SIZE, size);
+  return errno ? nullptr : m;
+#else
+  return malloc(size);
+#endif
+}
+
+void cacheline_aligned_free(void* memblock) { free(memblock); }
+
+static size_t GetPageSize() {
+#if defined(OS_LINUX) || defined(_SC_PAGESIZE)
+  long v = sysconf(_SC_PAGESIZE);
+  if (v >= 1024) {
+    return static_cast<size_t>(v);
+  }
+#endif
+  // Default assume 4KB
+  return 4U * 1024U;
+}
+
+const size_t kPageSize = GetPageSize();
+
+void SetCpuPriority(ThreadId id, CpuPriority priority) {
+#ifdef OS_LINUX
+  sched_param param;
+  param.sched_priority = 0;
+  switch (priority) {
+    case CpuPriority::kHigh:
+      sched_setscheduler(id, SCHED_OTHER, &param);
+      setpriority(PRIO_PROCESS, id, -20);
+      break;
+    case CpuPriority::kNormal:
+      sched_setscheduler(id, SCHED_OTHER, &param);
+      setpriority(PRIO_PROCESS, id, 0);
+      break;
+    case CpuPriority::kLow:
+      sched_setscheduler(id, SCHED_OTHER, &param);
+      setpriority(PRIO_PROCESS, id, 19);
+      break;
+    case CpuPriority::kIdle:
+      sched_setscheduler(id, SCHED_IDLE, &param);
+      break;
+    default:
+      assert(false);
+  }
+#else
+  (void)id;
+  (void)priority;
+#endif
+}
+
+int64_t GetProcessID() { return getpid(); }
+
+bool GenerateRfcUuid(std::string* output) {
+  output->clear();
+  std::ifstream f("/proc/sys/kernel/random/uuid");
+  std::getline(f, /*&*/ *output);
+  if (output->size() == 36) {
+    return true;
+  } else {
+    output->clear();
+    return false;
+  }
+}
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/port_posix.h b/src/rocksdb/port/port_posix.h
new file mode 100644
index 000000000..ec6aa281d
--- /dev/null
+++ b/src/rocksdb/port/port_posix.h
@@ -0,0 +1,241 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#pragma once
+
+#include <thread>
+
+#include "rocksdb/options.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+// size_t printf formatting named in the manner of C99 standard formatting
+// strings such as PRIu64
+// in fact, we could use that one
+#define ROCKSDB_PRIszt "zu"
+
+#define __declspec(S)
+
+#undef PLATFORM_IS_LITTLE_ENDIAN
+#if defined(OS_MACOSX)
+#include <machine/endian.h>
+#if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER)
+#define PLATFORM_IS_LITTLE_ENDIAN \
+  (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN)
+#endif
+#elif defined(OS_SOLARIS)
+#include <sys/isa_defs.h>
+#ifdef _LITTLE_ENDIAN
+#define PLATFORM_IS_LITTLE_ENDIAN true
+#else
+#define PLATFORM_IS_LITTLE_ENDIAN false
+#endif
+#include <alloca.h>
+#elif defined(OS_AIX)
+#include <arpa/nameser_compat.h>
+#include <sys/types.h>
+#define PLATFORM_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN)
+#include <alloca.h>
+#elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || \
+    defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID)
+#include <sys/endian.h>
+#include <sys/types.h>
+#define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN)
+#else
+#include <endian.h>
+#endif
+#include <pthread.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <limits>
+#include <string>
+
+#ifndef PLATFORM_IS_LITTLE_ENDIAN
+#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN)
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||      \
+    defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) || \
+    defined(OS_ANDROID) || defined(CYGWIN) || defined(OS_AIX)
+// Use fread/fwrite/fflush on platforms without _unlocked variants
+#define fread_unlocked fread
+#define fwrite_unlocked fwrite
+#define fflush_unlocked fflush
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || \
+    defined(OS_DRAGONFLYBSD)
+// Use fsync() on platforms without fdatasync()
+#define fdatasync fsync
+#endif
+
+#if defined(OS_ANDROID) && __ANDROID_API__ < 9
+// fdatasync() was only introduced in API level 9 on Android. Use fsync()
+// when targeting older platforms.
+#define fdatasync fsync
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const bool kDefaultToAdaptiveMutex;
+
+namespace port {
+constexpr bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN;
+#undef PLATFORM_IS_LITTLE_ENDIAN
+
+class CondVar;
+
+class Mutex {
+ public:
+  static const char* kName() { return "pthread_mutex_t"; }
+
+  explicit Mutex(bool adaptive = kDefaultToAdaptiveMutex);
+  // No copying
+  Mutex(const Mutex&) = delete;
+  void operator=(const Mutex&) = delete;
+
+  ~Mutex();
+
+  void Lock();
+  void Unlock();
+
+  bool TryLock();
+
+  // this will assert if the mutex is not locked
+  // it does NOT verify that mutex is held by a calling thread
+  void AssertHeld();
+
+  // Also implement std Lockable
+  inline void lock() { Lock(); }
+  inline void unlock() { Unlock(); }
+  inline bool try_lock() { return TryLock(); }
+
+ private:
+  friend class CondVar;
+  pthread_mutex_t mu_;
+#ifndef NDEBUG
+  bool locked_ = false;
+#endif
+};
+
+class RWMutex {
+ public:
+  RWMutex();
+  // No copying allowed
+  RWMutex(const RWMutex&) = delete;
+  void operator=(const RWMutex&) = delete;
+
+  ~RWMutex();
+
+  void ReadLock();
+  void WriteLock();
+  void ReadUnlock();
+  void WriteUnlock();
+  void AssertHeld() {}
+
+ private:
+  pthread_rwlock_t mu_;  // the underlying platform mutex
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  // Timed condition wait.  Returns true if timeout occurred.
+  bool TimedWait(uint64_t abs_time_us);
+  void Signal();
+  void SignalAll();
+
+ private:
+  pthread_cond_t cv_;
+  Mutex* mu_;
+};
+
+using Thread = std::thread;
+
+static inline void AsmVolatilePause() {
+#if defined(__i386__) || defined(__x86_64__)
+  asm volatile("pause");
+#elif defined(__aarch64__)
+  asm volatile("isb");
+#elif defined(__powerpc64__)
+  asm volatile("or 27,27,27");
+#endif
+  // it's okay for other platforms to be no-ops
+}
+
+// Returns -1 if not available on this platform
+extern int PhysicalCoreID();
+
+using OnceType = pthread_once_t;
+#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT
+extern void InitOnce(OnceType* once, void (*initializer)());
+
+#ifndef CACHE_LINE_SIZE
+// To test behavior with non-native cache line size, e.g. for
+// Bloom filters, set TEST_CACHE_LINE_SIZE to the desired test size.
+// This disables ALIGN_AS to keep it from failing compilation.
+#ifdef TEST_CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE TEST_CACHE_LINE_SIZE
+#define ALIGN_AS(n) /*empty*/
+#else
+#if defined(__s390__)
+#if defined(__GNUC__) && __GNUC__ < 7
+#define CACHE_LINE_SIZE 64U
+#else
+#define CACHE_LINE_SIZE 256U
+#endif
+#elif defined(__powerpc__) || defined(__aarch64__)
+#define CACHE_LINE_SIZE 128U
+#else
+#define CACHE_LINE_SIZE 64U
+#endif
+#define ALIGN_AS(n) alignas(n)
+#endif
+#endif
+
+static_assert((CACHE_LINE_SIZE & (CACHE_LINE_SIZE - 1)) == 0,
+              "Cache line size must be a power of 2 number of bytes");
+
+extern void* cacheline_aligned_alloc(size_t size);
+
+extern void cacheline_aligned_free(void* memblock);
+
+#if defined(__aarch64__)
+//  __builtin_prefetch(..., 1) turns into a prefetch into prfm pldl3keep. On
+// arm64 we want this as close to the core as possible to turn it into a
+// L1 prefetech unless locality == 0 in which case it will be turned into a
+// non-temporal prefetch
+#define PREFETCH(addr, rw, locality) \
+  __builtin_prefetch(addr, rw, locality >= 1 ? 3 : locality)
+#else
+#define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
+#endif
+
+extern void Crash(const std::string& srcfile, int srcline);
+
+extern int GetMaxOpenFiles();
+
+extern const size_t kPageSize;
+
+using ThreadId = pid_t;
+
+extern void SetCpuPriority(ThreadId id, CpuPriority priority);
+
+int64_t GetProcessID();
+
+// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns
+// true on success or false on failure.
+bool GenerateRfcUuid(std::string* output);
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/stack_trace.cc b/src/rocksdb/port/stack_trace.cc
new file mode 100644
index 000000000..ef7144947
--- /dev/null
+++ b/src/rocksdb/port/stack_trace.cc
@@ -0,0 +1,202 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "port/stack_trace.h"
+
+#if defined(ROCKSDB_LITE) ||                                                  \
+    !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \
+    defined(OS_SOLARIS) || defined(OS_WIN)
+
+// noop
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+void InstallStackTraceHandler() {}
+void PrintStack(int /*first_frames_to_skip*/) {}
+void PrintAndFreeStack(void* /*callstack*/, int /*num_frames*/) {}
+void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) {
+  return nullptr;
+}
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+#else
+
+#include <cxxabi.h>
+#include <execinfo.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#if defined(OS_FREEBSD)
+#include <sys/sysctl.h>
+#endif
+#ifdef OS_LINUX
+#include <sys/prctl.h>
+#endif
+
+#include "port/lang.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+namespace {
+
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD)
+const char* GetExecutableName() {
+  static char name[1024];
+
+#if !defined(OS_FREEBSD)
+  char link[1024];
+  snprintf(link, sizeof(link), "/proc/%d/exe", getpid());
+  auto read = readlink(link, name, sizeof(name) - 1);
+  if (-1 == read) {
+    return nullptr;
+  } else {
+    name[read] = 0;
+    return name;
+  }
+#else
+  int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
+  size_t namesz = sizeof(name);
+
+  auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0);
+  if (-1 == ret) {
+    return nullptr;
+  } else {
+    return name;
+  }
+#endif
+}
+
+void PrintStackTraceLine(const char* symbol, void* frame) {
+  static const char* executable = GetExecutableName();
+  if (symbol) {
+    fprintf(stderr, "%s ", symbol);
+  }
+  if (executable) {
+    // out source to addr2line, for the address translation
+    const int kLineMax = 256;
+    char cmd[kLineMax];
+    snprintf(cmd, kLineMax, "addr2line %p -e %s -f -C 2>&1", frame, executable);
+    auto f = popen(cmd, "r");
+    if (f) {
+      char line[kLineMax];
+      while (fgets(line, sizeof(line), f)) {
+        line[strlen(line) - 1] = 0;  // remove newline
+        fprintf(stderr, "%s\t", line);
+      }
+      pclose(f);
+    }
+  } else {
+    fprintf(stderr, " %p", frame);
+  }
+
+  fprintf(stderr, "\n");
+}
+#elif defined(OS_MACOSX)
+
+void PrintStackTraceLine(const char* symbol, void* frame) {
+  static int pid = getpid();
+  // out source to atos, for the address translation
+  const int kLineMax = 256;
+  char cmd[kLineMax];
+  snprintf(cmd, kLineMax, "xcrun atos %p -p %d  2>&1", frame, pid);
+  auto f = popen(cmd, "r");
+  if (f) {
+    char line[kLineMax];
+    while (fgets(line, sizeof(line), f)) {
+      line[strlen(line) - 1] = 0;  // remove newline
+      fprintf(stderr, "%s\t", line);
+    }
+    pclose(f);
+  } else if (symbol) {
+    fprintf(stderr, "%s ", symbol);
+  }
+
+  fprintf(stderr, "\n");
+}
+
+#endif
+
+}  // namespace
+
+void PrintStack(void* frames[], int num_frames) {
+  auto symbols = backtrace_symbols(frames, num_frames);
+
+  for (int i = 0; i < num_frames; ++i) {
+    fprintf(stderr, "#%-2d  ", i);
+    PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]);
+  }
+  free(symbols);
+}
+
+void PrintStack(int first_frames_to_skip) {
+  const int kMaxFrames = 100;
+  void* frames[kMaxFrames];
+
+  auto num_frames = backtrace(frames, kMaxFrames);
+  PrintStack(&frames[first_frames_to_skip], num_frames - first_frames_to_skip);
+}
+
+void PrintAndFreeStack(void* callstack, int num_frames) {
+  PrintStack(static_cast<void**>(callstack), num_frames);
+  free(callstack);
+}
+
+void* SaveStack(int* num_frames, int first_frames_to_skip) {
+  const int kMaxFrames = 100;
+  void* frames[kMaxFrames];
+
+  auto count = backtrace(frames, kMaxFrames);
+  *num_frames = count - first_frames_to_skip;
+  void* callstack = malloc(sizeof(void*) * *num_frames);
+  memcpy(callstack, &frames[first_frames_to_skip], sizeof(void*) * *num_frames);
+  return callstack;
+}
+
+static void StackTraceHandler(int sig) {
+  // reset to default handler
+  signal(sig, SIG_DFL);
+  fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig));
+  // skip the top three signal handler related frames
+  PrintStack(3);
+
+  // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of
+  // a signal" have failed, so just warn the user about them.
+#ifdef __SANITIZE_THREAD__
+  fprintf(stderr,
+          "==> NOTE: any above warnings about \"signal-unsafe call\" are\n"
+          "==> ignorable, as they are expected when generating a stack\n"
+          "==> trace because of a signal under TSAN. Consider why the\n"
+          "==> signal was generated to begin with, and the stack trace\n"
+          "==> in the TSAN warning can be useful for that. (The stack\n"
+          "==> trace printed by the signal handler is likely obscured\n"
+          "==> by TSAN output.)\n");
+#endif
+
+  // re-signal to default handler (so we still get core dump if needed...)
+  raise(sig);
+}
+
+void InstallStackTraceHandler() {
+  // just use the plain old signal as it's simple and sufficient
+  // for this use case
+  signal(SIGILL, StackTraceHandler);
+  signal(SIGSEGV, StackTraceHandler);
+  signal(SIGBUS, StackTraceHandler);
+  signal(SIGABRT, StackTraceHandler);
+  // Allow ouside debugger to attach, even with Yama security restrictions
+#ifdef PR_SET_PTRACER_ANY
+  (void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
+#endif
+}
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/stack_trace.h b/src/rocksdb/port/stack_trace.h
new file mode 100644
index 000000000..5b3bf9320
--- /dev/null
+++ b/src/rocksdb/port/stack_trace.h
@@ -0,0 +1,31 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// Install a signal handler to print callstack on the following signals:
+// SIGILL SIGSEGV SIGBUS SIGABRT
+// And also (Linux ony for now) overrides security settings to allow outside
+// processes to attach to this one as a debugger. ONLY USE FOR NON-SECURITY
+// CRITICAL PROCESSES such as unit tests or benchmarking tools.
+// Currently supports only some POSIX implementations. No-op otherwise.
+void InstallStackTraceHandler();
+
+// Prints stack, skips skip_first_frames frames
+void PrintStack(int first_frames_to_skip = 0);
+
+// Prints the given callstack
+void PrintAndFreeStack(void* callstack, int num_frames);
+
+// Save the current callstack
+void* SaveStack(int* num_frame, int first_frames_to_skip = 0);
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/sys_time.h b/src/rocksdb/port/sys_time.h
new file mode 100644
index 000000000..f2137526b
--- /dev/null
+++ b/src/rocksdb/port/sys_time.h
@@ -0,0 +1,63 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This file is a portable substitute for sys/time.h which does not exist on
+// Windows
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#if defined(OS_WIN) && (defined(_MSC_VER) || defined(__MINGW32__))
+
+#include <time.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace port {
+
+struct TimeVal {
+  long tv_sec;
+  long tv_usec;
+};
+
+void GetTimeOfDay(TimeVal* tv, struct timezone* tz);
+
+inline struct tm* LocalTimeR(const time_t* timep, struct tm* result) {
+  errno_t ret = localtime_s(result, timep);
+  return (ret == 0) ? result : NULL;
+}
+
+}  // namespace port
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#else
+#include <sys/time.h>
+#include <time.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace port {
+
+using TimeVal = struct timeval;
+
+inline void GetTimeOfDay(TimeVal* tv, struct timezone* tz) {
+  gettimeofday(tv, tz);
+}
+
+inline struct tm* LocalTimeR(const time_t* timep, struct tm* result) {
+  return localtime_r(timep, result);
+}
+
+}  // namespace port
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/util_logger.h b/src/rocksdb/port/util_logger.h
new file mode 100644
index 000000000..ce7e3a941
--- /dev/null
+++ b/src/rocksdb/port/util_logger.h
@@ -0,0 +1,18 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+// Include the appropriate platform specific file below.  If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+
+#if defined(OS_WIN)
+#include "port/win/win_logger.h"
+#endif
diff --git a/src/rocksdb/port/win/env_default.cc b/src/rocksdb/port/win/env_default.cc
new file mode 100644
index 000000000..48853f26e
--- /dev/null
+++ b/src/rocksdb/port/win/env_default.cc
@@ -0,0 +1,45 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include <mutex>
+
+#include "port/win/env_win.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/compression_context_cache.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// We choose not to destroy the env because joining the threads from the
+// system loader
+//    which destroys the statics (same as from DLLMain) creates a system loader
+//    dead-lock.
+//    in this manner any remaining threads are terminated OK.
+namespace {
+std::once_flag winenv_once_flag;
+Env* envptr;
+};  // namespace
+}  // namespace port
+
+Env* Env::Default() {
+  ThreadLocalPtr::InitSingletons();
+  CompressionContextCache::InitSingleton();
+  INIT_SYNC_POINT_SINGLETONS();
+  std::call_once(port::winenv_once_flag,
+                 []() { port::envptr = new port::WinEnv(); });
+  return port::envptr;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/env_win.cc b/src/rocksdb/port/win/env_win.cc
new file mode 100644
index 000000000..2262eb59c
--- /dev/null
+++ b/src/rocksdb/port/win/env_win.cc
@@ -0,0 +1,1437 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/env_win.h"
+
+#include <direct.h>  // _rmdir, _mkdir, _getcwd
+#include <errno.h>
+#include <io.h>   // _access
+#include <rpc.h>  // for uuid generation
+#include <shlwapi.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <windows.h>
+#include <winioctl.h>
+
+#include <algorithm>
+#include <ctime>
+#include <thread>
+
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "port/port_dirent.h"
+#include "port/win/io_win.h"
+#include "port/win/win_logger.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "strsafe.h"
+#include "util/string_util.h"
+
+// Undefine the functions  windows might use (again)...
+#undef GetCurrentTime
+#undef DeleteFile
+#undef LoadLibrary
+
+namespace ROCKSDB_NAMESPACE {
+
+ThreadStatusUpdater* CreateThreadStatusUpdater() {
+  return new ThreadStatusUpdater();
+}
+
+namespace {
+
+// Sector size used when physical sector size cannot be obtained from device.
+static const size_t kSectorSize = 512;
+
+// RAII helpers for HANDLEs
+const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); };
+using UniqueCloseHandlePtr = std::unique_ptr<void, decltype(CloseHandleFunc)>;
+
+const auto FindCloseFunc = [](HANDLE h) { ::FindClose(h); };
+using UniqueFindClosePtr = std::unique_ptr<void, decltype(FindCloseFunc)>;
+
+void WinthreadCall(const char* label, std::error_code result) {
+  if (0 != result.value()) {
+    fprintf(stderr, "Winthread %s: %s\n", label,
+            errnoStr(result.value()).c_str());
+    abort();
+  }
+}
+
+}  // namespace
+
+namespace port {
+WinClock::WinClock()
+    : perf_counter_frequency_(0),
+      nano_seconds_per_period_(0),
+      GetSystemTimePreciseAsFileTime_(NULL) {
+  {
+    LARGE_INTEGER qpf;
+    BOOL ret __attribute__((__unused__));
+    ret = QueryPerformanceFrequency(&qpf);
+    assert(ret == TRUE);
+    perf_counter_frequency_ = qpf.QuadPart;
+
+    if (std::nano::den % perf_counter_frequency_ == 0) {
+      nano_seconds_per_period_ = std::nano::den / perf_counter_frequency_;
+    }
+  }
+
+  HMODULE module = GetModuleHandle("kernel32.dll");
+  if (module != NULL) {
+    GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)(
+        void*)GetProcAddress(module, "GetSystemTimePreciseAsFileTime");
+  }
+}
+
+void WinClock::SleepForMicroseconds(int micros) {
+  std::this_thread::sleep_for(std::chrono::microseconds(micros));
+}
+
+std::string WinClock::TimeToString(uint64_t secondsSince1970) {
+  std::string result;
+
+  const time_t seconds = secondsSince1970;
+  const int maxsize = 64;
+
+  struct tm t;
+  errno_t ret = localtime_s(&t, &seconds);
+
+  if (ret) {
+    result = std::to_string(seconds);
+  } else {
+    result.resize(maxsize);
+    char* p = &result[0];
+
+    int len =
+        snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900,
+                 t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
+    assert(len > 0);
+
+    result.resize(len);
+  }
+
+  return result;
+}
+
+uint64_t WinClock::NowMicros() {
+  if (GetSystemTimePreciseAsFileTime_ != NULL) {
+    // all std::chrono clocks on windows proved to return
+    // values that may repeat that is not good enough for some uses.
+    const int64_t c_UnixEpochStartTicks = 116444736000000000LL;
+    const int64_t c_FtToMicroSec = 10;
+
+    // This interface needs to return system time and not
+    // just any microseconds because it is often used as an argument
+    // to TimedWait() on condition variable
+    FILETIME ftSystemTime;
+    GetSystemTimePreciseAsFileTime_(&ftSystemTime);
+
+    LARGE_INTEGER li;
+    li.LowPart = ftSystemTime.dwLowDateTime;
+    li.HighPart = ftSystemTime.dwHighDateTime;
+    // Subtract unix epoch start
+    li.QuadPart -= c_UnixEpochStartTicks;
+    // Convert to microsecs
+    li.QuadPart /= c_FtToMicroSec;
+    return li.QuadPart;
+  }
+  return std::chrono::duration_cast<std::chrono::microseconds>(
+             std::chrono::system_clock::now().time_since_epoch())
+      .count();
+}
+
+uint64_t WinClock::NowNanos() {
+  if (nano_seconds_per_period_ != 0) {
+    // all std::chrono clocks on windows have the same resolution that is only
+    // good enough for microseconds but not nanoseconds
+    // On Windows 8 and Windows 2012 Server
+    // GetSystemTimePreciseAsFileTime(&current_time) can be used
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    // Convert performance counter to nanoseconds by precomputed ratio.
+    // Directly multiply nano::den with li.QuadPart causes overflow.
+    // Only do this when nano::den is divisible by perf_counter_frequency_,
+    // which most likely is the case in reality. If it's not, fall back to
+    // high_resolution_clock, which may be less precise under old compilers.
+    li.QuadPart *= nano_seconds_per_period_;
+    return li.QuadPart;
+  }
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             std::chrono::high_resolution_clock::now().time_since_epoch())
+      .count();
+}
+
+Status WinClock::GetCurrentTime(int64_t* unix_time) {
+  time_t time = std::time(nullptr);
+  if (time == (time_t)(-1)) {
+    return Status::NotSupported("Failed to get time");
+  }
+
+  *unix_time = time;
+  return Status::OK();
+}
+
+WinFileSystem::WinFileSystem(const std::shared_ptr<SystemClock>& clock)
+    : clock_(clock), page_size_(4 * 1024), allocation_granularity_(page_size_) {
+  SYSTEM_INFO sinfo;
+  GetSystemInfo(&sinfo);
+
+  page_size_ = sinfo.dwPageSize;
+  allocation_granularity_ = sinfo.dwAllocationGranularity;
+}
+
+const std::shared_ptr<WinFileSystem>& WinFileSystem::Default() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<WinFileSystem>, fs)
+  (std::make_shared<WinFileSystem>(WinClock::Default()));
+  return fs;
+}
+
+WinEnvIO::WinEnvIO(Env* hosted_env) : hosted_env_(hosted_env) {}
+
+WinEnvIO::~WinEnvIO() {}
+
+IOStatus WinFileSystem::DeleteFile(const std::string& fname,
+                                   const IOOptions& /*options*/,
+                                   IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  BOOL ret = RX_DeleteFile(RX_FN(fname).c_str());
+
+  if (!ret) {
+    auto lastError = GetLastError();
+    result = IOErrorFromWindowsError("Failed to delete: " + fname, lastError);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::Truncate(const std::string& fname, size_t size,
+                                 const IOOptions& /*options*/,
+                                 IODebugContext* /*dbg*/) {
+  IOStatus s;
+  int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size);
+  if (result != 0) {
+    s = IOError("Failed to truncate: " + fname, errno);
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::NewSequentialFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSSequentialFile>* result, IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  result->reset();
+
+  // Corruption test needs to rename and delete files of these kind
+  // while they are still open with another handle. For that reason we
+  // allow share_write and delete(allows rename).
+  HANDLE hFile = INVALID_HANDLE_VALUE;
+
+  DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+  if (options.use_direct_reads && !options.use_mmap_reads) {
+    fileFlags |= FILE_FLAG_NO_BUFFERING;
+  }
+
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(
+        RX_FN(fname).c_str(), GENERIC_READ,
+        FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
+        OPEN_EXISTING,  // Original fopen mode is "rb"
+        fileFlags, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname,
+                                lastError);
+  } else {
+    result->reset(new WinSequentialFile(fname, hFile, options));
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+  result->reset();
+  IOStatus s;
+
+  // Open the file for read-only random access
+  // Random access is to disable read-ahead as the system reads too much data
+  DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+  if (options.use_direct_reads && !options.use_mmap_reads) {
+    fileFlags |= FILE_FLAG_NO_BUFFERING;
+  } else {
+    fileFlags |= FILE_FLAG_RANDOM_ACCESS;
+  }
+
+  /// Shared access is necessary for corruption test to pass
+  // almost all tests would work with a possible exception of fault_injection
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile =
+        RX_CreateFile(RX_FN(fname).c_str(), GENERIC_READ,
+                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                      NULL, OPEN_EXISTING, fileFlags, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "NewRandomAccessFile failed to Create/Open: " + fname, lastError);
+  }
+
+  UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+
+  // CAUTION! This will map the entire file into the process address space.
+  // Not recommended for 32-bit platforms.
+  if (options.use_mmap_reads) {
+    uint64_t fileSize;
+
+    s = GetFileSize(fname, IOOptions(), &fileSize, dbg);
+
+    if (s.ok()) {
+      // Will not map empty files
+      if (fileSize == 0) {
+        return IOError("NewRandomAccessFile failed to map empty file: " + fname,
+                       EINVAL);
+      }
+
+      HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY,
+                                         0,  // At its present length
+                                         0,
+                                         NULL);  // Mapping name
+
+      if (!hMap) {
+        auto lastError = GetLastError();
+        return IOErrorFromWindowsError(
+            "Failed to create file mapping for NewRandomAccessFile: " + fname,
+            lastError);
+      }
+
+      UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc);
+
+      const void* mapped_region =
+          MapViewOfFileEx(hMap, FILE_MAP_READ,
+                          0,  // High DWORD of access start
+                          0,  // Low DWORD
+                          static_cast<SIZE_T>(fileSize),
+                          NULL);  // Let the OS choose the mapping
+
+      if (!mapped_region) {
+        auto lastError = GetLastError();
+        return IOErrorFromWindowsError(
+            "Failed to MapViewOfFile for NewRandomAccessFile: " + fname,
+            lastError);
+      }
+
+      result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region,
+                                            static_cast<size_t>(fileSize)));
+
+      mapGuard.release();
+      fileGuard.release();
+    }
+  } else {
+    result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options));
+    fileGuard.release();
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::OpenWritableFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSWritableFile>* result, bool reopen) {
+  const size_t c_BufferCapacity = 64 * 1024;
+
+  EnvOptions local_options(options);
+
+  result->reset();
+  IOStatus s;
+
+  DWORD fileFlags = FILE_ATTRIBUTE_NORMAL;
+
+  if (local_options.use_direct_writes && !local_options.use_mmap_writes) {
+    fileFlags = FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+  }
+
+  // Desired access. We are want to write only here but if we want to memory
+  // map
+  // the file then there is no write only mode so we have to create it
+  // Read/Write
+  // However, MapViewOfFile specifies only Write only
+  DWORD desired_access = GENERIC_WRITE;
+  DWORD shared_mode = FILE_SHARE_READ;
+
+  if (local_options.use_mmap_writes) {
+    desired_access |= GENERIC_READ;
+  } else {
+    // Adding this solely for tests to pass (fault_injection_test,
+    // wal_manager_test).
+    shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE);
+  }
+
+  // This will always truncate the file
+  DWORD creation_disposition = CREATE_ALWAYS;
+  if (reopen) {
+    creation_disposition = OPEN_ALWAYS;
+  }
+
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(
+        RX_FN(fname).c_str(),
+        desired_access,  // Access desired
+        shared_mode,
+        NULL,  // Security attributes
+        // Posix env says (reopen) ? (O_CREATE | O_APPEND) : O_CREAT | O_TRUNC
+        creation_disposition,
+        fileFlags,  // Flags
+        NULL);      // Template File
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "Failed to create a NewWritableFile: " + fname, lastError);
+  }
+
+  // We will start writing at the end, appending
+  if (reopen) {
+    LARGE_INTEGER zero_move;
+    zero_move.QuadPart = 0;
+    BOOL ret = SetFilePointerEx(hFile, zero_move, NULL, FILE_END);
+    if (!ret) {
+      auto lastError = GetLastError();
+      return IOErrorFromWindowsError(
+          "Failed to create a ReopenWritableFile move to the end: " + fname,
+          lastError);
+    }
+  }
+
+  if (options.use_mmap_writes) {
+    // We usually do not use mmmapping on SSD and thus we pass memory
+    // page_size
+    result->reset(new WinMmapFile(fname, hFile, page_size_,
+                                  allocation_granularity_, local_options));
+  } else {
+    // Here we want the buffer allocation to be aligned by the SSD page size
+    // and to be a multiple of it
+    result->reset(new WinWritableFile(fname, hFile, GetPageSize(),
+                                      c_BufferCapacity, local_options));
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::NewWritableFile(const std::string& fname,
+                                        const FileOptions& options,
+                                        std::unique_ptr<FSWritableFile>* result,
+                                        IODebugContext* /*dbg*/) {
+  return OpenWritableFile(fname, options, result, false);
+}
+
+IOStatus WinFileSystem::ReopenWritableFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* /*dbg*/) {
+  return OpenWritableFile(fname, options, result, true);
+}
+
+IOStatus WinFileSystem::NewRandomRWFile(const std::string& fname,
+                                        const FileOptions& options,
+                                        std::unique_ptr<FSRandomRWFile>* result,
+                                        IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  // Open the file for read-only random access
+  // Random access is to disable read-ahead as the system reads too much data
+  DWORD desired_access = GENERIC_READ | GENERIC_WRITE;
+  DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
+  DWORD creation_disposition = OPEN_EXISTING;  // Fail if file does not exist
+  DWORD file_flags = FILE_FLAG_RANDOM_ACCESS;
+
+  if (options.use_direct_reads && options.use_direct_writes) {
+    file_flags |= FILE_FLAG_NO_BUFFERING;
+  }
+
+  /// Shared access is necessary for corruption test to pass
+  // almost all tests would work with a possible exception of fault_injection
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(RX_FN(fname).c_str(), desired_access, shared_mode,
+                          NULL,  // Security attributes
+                          creation_disposition, file_flags, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "NewRandomRWFile failed to Create/Open: " + fname, lastError);
+  }
+
+  UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+  result->reset(new WinRandomRWFile(fname, hFile, GetPageSize(), options));
+  fileGuard.release();
+
+  return s;
+}
+
+IOStatus WinFileSystem::NewMemoryMappedFileBuffer(
+    const std::string& fname, std::unique_ptr<MemoryMappedFileBuffer>* result) {
+  IOStatus s;
+  result->reset();
+
+  DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+  HANDLE hFile = INVALID_HANDLE_VALUE;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(
+        RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE,
+        FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
+        OPEN_EXISTING,  // Open only if it exists
+        fileFlags, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError(
+        "Failed to open NewMemoryMappedFileBuffer: " + fname, lastError);
+    return s;
+  }
+  UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+
+  uint64_t fileSize = 0;
+  s = GetFileSize(fname, IOOptions(), &fileSize, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  // Will not map empty files
+  if (fileSize == 0) {
+    return IOStatus::NotSupported(
+        "NewMemoryMappedFileBuffer can not map zero length files: " + fname);
+  }
+
+  // size_t is 32-bit with 32-bit builds
+  if (fileSize > std::numeric_limits<size_t>::max()) {
+    return IOStatus::NotSupported(
+        "The specified file size does not fit into 32-bit memory addressing: " +
+        fname);
+  }
+
+  HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE,
+                                     0,  // Whole file at its present length
+                                     0,
+                                     NULL);  // Mapping name
+
+  if (!hMap) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "Failed to create file mapping for: " + fname, lastError);
+  }
+  UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc);
+
+  void* base = MapViewOfFileEx(hMap, FILE_MAP_WRITE,
+                               0,  // High DWORD of access start
+                               0,  // Low DWORD
+                               static_cast<SIZE_T>(fileSize),
+                               NULL);  // Let the OS choose the mapping
+
+  if (!base) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError(
+        "Failed to MapViewOfFile for NewMemoryMappedFileBuffer: " + fname,
+        lastError);
+  }
+
+  result->reset(new WinMemoryMappedBuffer(hFile, hMap, base,
+                                          static_cast<size_t>(fileSize)));
+
+  mapGuard.release();
+  fileGuard.release();
+
+  return s;
+}
+
+IOStatus WinFileSystem::NewDirectory(const std::string& name,
+                                     const IOOptions& /*options*/,
+                                     std::unique_ptr<FSDirectory>* result,
+                                     IODebugContext* /*dbg*/) {
+  IOStatus s;
+  // Must be nullptr on failure
+  result->reset();
+
+  if (!DirExists(name)) {
+    s = IOErrorFromWindowsError("open folder: " + name, ERROR_DIRECTORY);
+    return s;
+  }
+
+  HANDLE handle = INVALID_HANDLE_VALUE;
+  // 0 - for access means read metadata
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    handle = RX_CreateFile(
+        RX_FN(name).c_str(), 0,
+        FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+        OPEN_EXISTING,
+        FILE_FLAG_BACKUP_SEMANTICS,  // make opening folders possible
+        NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == handle) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("open folder: " + name, lastError);
+    return s;
+  }
+
+  result->reset(new WinDirectory(name, handle));
+
+  return s;
+}
+
+IOStatus WinFileSystem::FileExists(const std::string& fname,
+                                   const IOOptions& /*opts*/,
+                                   IODebugContext* /*dbg*/) {
+  IOStatus s;
+  // TODO: This does not follow symbolic links at this point
+  // which is consistent with _access() impl on windows
+  // but can be added
+  WIN32_FILE_ATTRIBUTE_DATA attrs;
+  if (FALSE == RX_GetFileAttributesEx(RX_FN(fname).c_str(),
+                                      GetFileExInfoStandard, &attrs)) {
+    auto lastError = GetLastError();
+    switch (lastError) {
+      case ERROR_ACCESS_DENIED:
+      case ERROR_NOT_FOUND:
+      case ERROR_FILE_NOT_FOUND:
+      case ERROR_PATH_NOT_FOUND:
+        s = IOStatus::NotFound();
+        break;
+      default:
+        s = IOErrorFromWindowsError("Unexpected error for: " + fname,
+                                    lastError);
+        break;
+    }
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::GetChildren(const std::string& dir,
+                                    const IOOptions& /*opts*/,
+                                    std::vector<std::string>* result,
+                                    IODebugContext* /*dbg*/) {
+  IOStatus status;
+  result->clear();
+
+  RX_WIN32_FIND_DATA data;
+  memset(&data, 0, sizeof(data));
+  std::string pattern(dir);
+  pattern.append("\\").append("*");
+
+  HANDLE handle =
+      RX_FindFirstFileEx(RX_FN(pattern).c_str(),
+                         // Do not want alternative name
+                         FindExInfoBasic, &data, FindExSearchNameMatch,
+                         NULL,  // lpSearchFilter
+                         0);
+
+  if (handle == INVALID_HANDLE_VALUE) {
+    auto lastError = GetLastError();
+    switch (lastError) {
+      case ERROR_NOT_FOUND:
+      case ERROR_ACCESS_DENIED:
+      case ERROR_FILE_NOT_FOUND:
+      case ERROR_PATH_NOT_FOUND:
+        status = IOStatus::NotFound();
+        break;
+      default:
+        status = IOErrorFromWindowsError("Failed to GetChhildren for: " + dir,
+                                         lastError);
+    }
+    return status;
+  }
+
+  UniqueFindClosePtr fc(handle, FindCloseFunc);
+
+  // For safety
+  data.cFileName[MAX_PATH - 1] = 0;
+
+  while (true) {
+    // filter out '.' and '..' directory entries
+    // which appear only on some platforms
+    const bool ignore =
+        ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) &&
+        (RX_FNCMP(data.cFileName, ".") == 0 ||
+         RX_FNCMP(data.cFileName, "..") == 0);
+    if (!ignore) {
+      auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName));
+      result->push_back(FN_TO_RX(x));
+    }
+
+    BOOL ret = -RX_FindNextFile(handle, &data);
+    // If the function fails the return value is zero
+    // and non-zero otherwise. Not TRUE or FALSE.
+    if (ret == FALSE) {
+      // Posix does not care why we stopped
+      break;
+    }
+    data.cFileName[MAX_PATH - 1] = 0;
+  }
+  return status;
+}
+
+IOStatus WinFileSystem::CreateDir(const std::string& name,
+                                  const IOOptions& /*opts*/,
+                                  IODebugContext* /*dbg*/) {
+  IOStatus result;
+  BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL);
+  if (!ret) {
+    auto lastError = GetLastError();
+    result = IOErrorFromWindowsError("Failed to create a directory: " + name,
+                                     lastError);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::CreateDirIfMissing(const std::string& name,
+                                           const IOOptions& /*opts*/,
+                                           IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  if (DirExists(name)) {
+    return result;
+  }
+
+  BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL);
+  if (!ret) {
+    auto lastError = GetLastError();
+    if (lastError != ERROR_ALREADY_EXISTS) {
+      result = IOErrorFromWindowsError("Failed to create a directory: " + name,
+                                       lastError);
+    } else {
+      result = IOStatus::IOError(name + ": exists but is not a directory");
+    }
+  }
+  return result;
+}
+
+IOStatus WinFileSystem::DeleteDir(const std::string& name,
+                                  const IOOptions& /*options*/,
+                                  IODebugContext* /*dbg*/) {
+  IOStatus result;
+  BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str());
+  if (!ret) {
+    auto lastError = GetLastError();
+    result =
+        IOErrorFromWindowsError("Failed to remove dir: " + name, lastError);
+  }
+  return result;
+}
+
+IOStatus WinFileSystem::GetFileSize(const std::string& fname,
+                                    const IOOptions& /*opts*/, uint64_t* size,
+                                    IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  WIN32_FILE_ATTRIBUTE_DATA attrs;
+  if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard,
+                             &attrs)) {
+    ULARGE_INTEGER file_size;
+    file_size.HighPart = attrs.nFileSizeHigh;
+    file_size.LowPart = attrs.nFileSizeLow;
+    *size = file_size.QuadPart;
+  } else {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("Can not get size for: " + fname, lastError);
+  }
+  return s;
+}
+
+uint64_t WinFileSystem::FileTimeToUnixTime(const FILETIME& ftTime) {
+  const uint64_t c_FileTimePerSecond = 10000000U;
+  // UNIX epoch starts on 1970-01-01T00:00:00Z
+  // Windows FILETIME starts on 1601-01-01T00:00:00Z
+  // Therefore, we need to subtract the below number of seconds from
+  // the seconds that we obtain from FILETIME with an obvious loss of
+  // precision
+  const uint64_t c_SecondBeforeUnixEpoch = 11644473600U;
+
+  ULARGE_INTEGER li;
+  li.HighPart = ftTime.dwHighDateTime;
+  li.LowPart = ftTime.dwLowDateTime;
+
+  uint64_t result =
+      (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch;
+  return result;
+}
+
+IOStatus WinFileSystem::GetFileModificationTime(const std::string& fname,
+                                                const IOOptions& /*opts*/,
+                                                uint64_t* file_mtime,
+                                                IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  WIN32_FILE_ATTRIBUTE_DATA attrs;
+  if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard,
+                             &attrs)) {
+    *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime);
+  } else {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError(
+        "Can not get file modification time for: " + fname, lastError);
+    *file_mtime = 0;
+  }
+
+  return s;
+}
+
+IOStatus WinFileSystem::RenameFile(const std::string& src,
+                                   const std::string& target,
+                                   const IOOptions& /*opts*/,
+                                   IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  // rename() is not capable of replacing the existing file as on Linux
+  // so use OS API directly
+  if (!RX_MoveFileEx(RX_FN(src).c_str(), RX_FN(target).c_str(),
+                     MOVEFILE_REPLACE_EXISTING)) {
+    DWORD lastError = GetLastError();
+
+    std::string text("Failed to rename: ");
+    text.append(src).append(" to: ").append(target);
+
+    result = IOErrorFromWindowsError(text, lastError);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::LinkFile(const std::string& src,
+                                 const std::string& target,
+                                 const IOOptions& /*opts*/,
+                                 IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) {
+    DWORD lastError = GetLastError();
+    if (lastError == ERROR_NOT_SAME_DEVICE) {
+      return IOStatus::NotSupported("No cross FS links allowed");
+    }
+
+    std::string text("Failed to link: ");
+    text.append(src).append(" to: ").append(target);
+
+    result = IOErrorFromWindowsError(text, lastError);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::NumFileLinks(const std::string& fname,
+                                     const IOOptions& /*opts*/, uint64_t* count,
+                                     IODebugContext* /*dbg*/) {
+  IOStatus s;
+  HANDLE handle =
+      RX_CreateFile(RX_FN(fname).c_str(), 0,
+                    FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
+                    NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+
+  if (INVALID_HANDLE_VALUE == handle) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("NumFileLinks: " + fname, lastError);
+    return s;
+  }
+  UniqueCloseHandlePtr handle_guard(handle, CloseHandleFunc);
+  FILE_STANDARD_INFO standard_info;
+  if (0 != GetFileInformationByHandleEx(handle, FileStandardInfo,
+                                        &standard_info,
+                                        sizeof(standard_info))) {
+    *count = standard_info.NumberOfLinks;
+  } else {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("GetFileInformationByHandleEx: " + fname,
+                                lastError);
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::AreFilesSame(const std::string& first,
+                                     const std::string& second,
+                                     const IOOptions& /*opts*/, bool* res,
+                                     IODebugContext* /*dbg*/) {
+// For MinGW builds
+#if (_WIN32_WINNT == _WIN32_WINNT_VISTA)
+  IOStatus s = IOStatus::NotSupported();
+#else
+  assert(res != nullptr);
+  IOStatus s;
+  if (res == nullptr) {
+    s = IOStatus::InvalidArgument("res");
+    return s;
+  }
+
+  // 0 - for access means read metadata
+  HANDLE file_1 = RX_CreateFile(
+      RX_FN(first).c_str(), 0,
+      FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+      OPEN_EXISTING,
+      FILE_FLAG_BACKUP_SEMANTICS,  // make opening folders possible
+      NULL);
+
+  if (INVALID_HANDLE_VALUE == file_1) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("open file: " + first, lastError);
+    return s;
+  }
+  UniqueCloseHandlePtr g_1(file_1, CloseHandleFunc);
+
+  HANDLE file_2 = RX_CreateFile(
+      RX_FN(second).c_str(), 0,
+      FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+      OPEN_EXISTING,
+      FILE_FLAG_BACKUP_SEMANTICS,  // make opening folders possible
+      NULL);
+
+  if (INVALID_HANDLE_VALUE == file_2) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("open file: " + second, lastError);
+    return s;
+  }
+  UniqueCloseHandlePtr g_2(file_2, CloseHandleFunc);
+
+  FILE_ID_INFO FileInfo_1;
+  BOOL result = GetFileInformationByHandleEx(file_1, FileIdInfo, &FileInfo_1,
+                                             sizeof(FileInfo_1));
+
+  if (!result) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("stat file: " + first, lastError);
+    return s;
+  }
+
+  FILE_ID_INFO FileInfo_2;
+  result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2,
+                                        sizeof(FileInfo_2));
+
+  if (!result) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("stat file: " + second, lastError);
+    return s;
+  }
+
+  if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) {
+    *res =
+        (0 == memcmp(FileInfo_1.FileId.Identifier, FileInfo_2.FileId.Identifier,
+                     sizeof(FileInfo_1.FileId.Identifier)));
+  } else {
+    *res = false;
+  }
+#endif
+  return s;
+}
+
+IOStatus WinFileSystem::LockFile(const std::string& lockFname,
+                                 const IOOptions& /*opts*/, FileLock** lock,
+                                 IODebugContext* /*dbg*/) {
+  assert(lock != nullptr);
+
+  *lock = NULL;
+  IOStatus result;
+
+  // No-sharing, this is a LOCK file
+  const DWORD ExclusiveAccessON = 0;
+
+  // Obtain exclusive access to the LOCK file
+  // Previously, instead of NORMAL attr we set DELETE on close and that worked
+  // well except with fault_injection test that insists on deleting it.
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(RX_FN(lockFname).c_str(),
+                          (GENERIC_READ | GENERIC_WRITE), ExclusiveAccessON,
+                          NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    result = IOErrorFromWindowsError("Failed to create lock file: " + lockFname,
+                                     lastError);
+  } else {
+    *lock = new WinFileLock(hFile);
+  }
+
+  return result;
+}
+
+IOStatus WinFileSystem::UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
+                                   IODebugContext* /*dbg*/) {
+  IOStatus result;
+
+  assert(lock != nullptr);
+
+  delete lock;
+
+  return result;
+}
+
+IOStatus WinFileSystem::GetTestDirectory(const IOOptions& opts,
+                                         std::string* result,
+                                         IODebugContext* dbg) {
+  std::string output;
+
+  const char* env = getenv("TEST_TMPDIR");
+  if (env && env[0] != '\0') {
+    output = env;
+  } else {
+    env = getenv("TMP");
+
+    if (env && env[0] != '\0') {
+      output = env;
+    } else {
+      output = "c:\\tmp";
+    }
+  }
+  CreateDir(output, opts, dbg);
+
+  output.append("\\testrocksdb-");
+  output.append(std::to_string(GetCurrentProcessId()));
+
+  CreateDir(output, opts, dbg);
+
+  output.swap(*result);
+
+  return IOStatus::OK();
+}
+
+IOStatus WinFileSystem::NewLogger(const std::string& fname,
+                                  const IOOptions& /*opts*/,
+                                  std::shared_ptr<Logger>* result,
+                                  IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  result->reset();
+
+  HANDLE hFile = 0;
+  {
+    IOSTATS_TIMER_GUARD(open_nanos);
+    hFile = RX_CreateFile(
+        RX_FN(fname).c_str(), GENERIC_WRITE,
+        FILE_SHARE_READ | FILE_SHARE_DELETE,  // In RocksDb log files are
+        // renamed and deleted before
+        // they are closed. This enables
+        // doing so.
+        NULL,
+        CREATE_ALWAYS,  // Original fopen mode is "w"
+        FILE_ATTRIBUTE_NORMAL, NULL);
+  }
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("Failed to open LogFile" + fname, lastError);
+  } else {
+    {
+      // With log files we want to set the true creation time as of now
+      // because the system
+      // for some reason caches the attributes of the previous file that just
+      // been renamed from
+      // this name so auto_roll_logger_test fails
+      FILETIME ft;
+      GetSystemTimeAsFileTime(&ft);
+      // Set creation, last access and last write time to the same value
+      SetFileTime(hFile, &ft, &ft, &ft);
+    }
+    result->reset(new WinLogger(&WinEnvThreads::gettid, clock_.get(), hFile));
+  }
+  return s;
+}
+
+IOStatus WinFileSystem::IsDirectory(const std::string& path,
+                                    const IOOptions& /*opts*/, bool* is_dir,
+                                    IODebugContext* /*dbg*/) {
+  BOOL ret = RX_PathIsDirectory(RX_FN(path).c_str());
+  if (is_dir) {
+    *is_dir = ret ? true : false;
+  }
+  return IOStatus::OK();
+}
+
+Status WinEnvIO::GetHostName(char* name, uint64_t len) {
+  Status s;
+  DWORD nSize = static_cast<DWORD>(
+      std::min<uint64_t>(len, std::numeric_limits<DWORD>::max()));
+
+  if (!::GetComputerNameA(name, &nSize)) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("GetHostName", lastError);
+  } else {
+    name[nSize] = 0;
+  }
+
+  return s;
+}
+
+IOStatus WinFileSystem::GetAbsolutePath(const std::string& db_path,
+                                        const IOOptions& /*options*/,
+                                        std::string* output_path,
+                                        IODebugContext* dbg) {
+  // Check if we already have an absolute path
+  // For test compatibility we will consider starting slash as an
+  // absolute path
+  if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) ||
+      !RX_PathIsRelative(RX_FN(db_path).c_str())) {
+    *output_path = db_path;
+    return IOStatus::OK();
+  }
+
+  RX_FILESTRING result;
+  result.resize(MAX_PATH);
+
+  // Hopefully no changes the current directory while we do this
+  // however _getcwd also suffers from the same limitation
+  DWORD len = RX_GetCurrentDirectory(MAX_PATH, &result[0]);
+  if (len == 0) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError("Failed to get current working directory",
+                                   lastError);
+  }
+
+  result.resize(len);
+  std::string res = FN_TO_RX(result);
+
+  res.swap(*output_path);
+  return IOStatus::OK();
+}
+
+IOStatus WinFileSystem::GetFreeSpace(const std::string& path,
+                                     const IOOptions& /*options*/,
+                                     uint64_t* diskfree,
+                                     IODebugContext* /*dbg*/) {
+  assert(diskfree != nullptr);
+  ULARGE_INTEGER freeBytes;
+  BOOL f = RX_GetDiskFreeSpaceEx(RX_FN(path).c_str(), &freeBytes, NULL, NULL);
+  if (f) {
+    *diskfree = freeBytes.QuadPart;
+    return IOStatus::OK();
+  } else {
+    DWORD lastError = GetLastError();
+    return IOErrorFromWindowsError("Failed to get free space: " + path,
+                                   lastError);
+  }
+}
+
+FileOptions WinFileSystem::OptimizeForLogWrite(
+    const FileOptions& file_options, const DBOptions& db_options) const {
+  FileOptions optimized(file_options);
+  // These two the same as default optimizations
+  optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
+  optimized.writable_file_max_buffer_size =
+      db_options.writable_file_max_buffer_size;
+
+  // This adversely affects %999 on windows
+  optimized.use_mmap_writes = false;
+  // Direct writes will produce a huge perf impact on
+  // Windows. Pre-allocate space for WAL.
+  optimized.use_direct_writes = false;
+  return optimized;
+}
+
+FileOptions WinFileSystem::OptimizeForManifestWrite(
+    const FileOptions& options) const {
+  FileOptions optimized(options);
+  optimized.use_mmap_writes = false;
+  optimized.use_direct_reads = false;
+  return optimized;
+}
+
+FileOptions WinFileSystem::OptimizeForManifestRead(
+    const FileOptions& file_options) const {
+  FileOptions optimized(file_options);
+  optimized.use_mmap_writes = false;
+  optimized.use_direct_reads = false;
+  return optimized;
+}
+
+// Returns true iff the named directory exists and is a directory.
+bool WinFileSystem::DirExists(const std::string& dname) {
+  WIN32_FILE_ATTRIBUTE_DATA attrs;
+  if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), GetFileExInfoStandard,
+                             &attrs)) {
+    return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY);
+  }
+  return false;
+}
+
+size_t WinFileSystem::GetSectorSize(const std::string& fname) {
+  size_t sector_size = kSectorSize;
+
+  // obtain device handle
+  char devicename[7] = "\\\\.\\";
+  int erresult = 0;
+  if (RX_PathIsRelative(RX_FN(fname).c_str())) {
+    RX_FILESTRING rx_current_dir;
+    rx_current_dir.resize(MAX_PATH);
+    DWORD len = RX_GetCurrentDirectory(MAX_PATH, &rx_current_dir[0]);
+    if (len == 0) {
+      return sector_size;
+    }
+    rx_current_dir.resize(len);
+    std::string current_dir = FN_TO_RX(rx_current_dir);
+    erresult =
+        strncat_s(devicename, sizeof(devicename), current_dir.c_str(), 2);
+  } else {
+    erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2);
+  }
+
+  if (erresult) {
+    assert(false);
+    return sector_size;
+  }
+
+  HANDLE hDevice = CreateFile(devicename, 0, 0, nullptr, OPEN_EXISTING,
+                              FILE_ATTRIBUTE_NORMAL, nullptr);
+
+  if (hDevice == INVALID_HANDLE_VALUE) {
+    return sector_size;
+  }
+
+  STORAGE_PROPERTY_QUERY spropertyquery;
+  spropertyquery.PropertyId = StorageAccessAlignmentProperty;
+  spropertyquery.QueryType = PropertyStandardQuery;
+
+  BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)];
+  DWORD output_bytes = 0;
+
+  BOOL ret = DeviceIoControl(
+      hDevice, IOCTL_STORAGE_QUERY_PROPERTY, &spropertyquery,
+      sizeof(spropertyquery), output_buffer,
+      sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), &output_bytes, nullptr);
+
+  if (ret) {
+    sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR*)output_buffer)
+                      ->BytesPerLogicalSector;
+  } else {
+    // many devices do not support StorageProcessAlignmentProperty. Any failure
+    // here and we fall back to logical alignment
+
+    DISK_GEOMETRY_EX geometry = {0};
+    ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, nullptr, 0,
+                          &geometry, sizeof(geometry), &output_bytes, nullptr);
+    if (ret) {
+      sector_size = geometry.Geometry.BytesPerSector;
+    }
+  }
+
+  if (hDevice != INVALID_HANDLE_VALUE) {
+    CloseHandle(hDevice);
+  }
+
+  return sector_size;
+}
+
+////////////////////////////////////////////////////////////////////////
+// WinEnvThreads
+
+WinEnvThreads::WinEnvThreads(Env* hosted_env)
+    : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) {
+  for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+    thread_pools_[pool_id].SetThreadPriority(
+        static_cast<Env::Priority>(pool_id));
+    // This allows later initializing the thread-local-env of each thread.
+    thread_pools_[pool_id].SetHostEnv(hosted_env);
+  }
+}
+
+WinEnvThreads::~WinEnvThreads() {
+  WaitForJoin();
+
+  for (auto& thpool : thread_pools_) {
+    thpool.JoinAllThreads();
+  }
+}
+
+void WinEnvThreads::Schedule(void (*function)(void*), void* arg,
+                             Env::Priority pri, void* tag,
+                             void (*unschedFunction)(void* arg)) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  thread_pools_[pri].Schedule(function, arg, tag, unschedFunction);
+}
+
+int WinEnvThreads::UnSchedule(void* arg, Env::Priority pri) {
+  return thread_pools_[pri].UnSchedule(arg);
+}
+
+namespace {
+
+struct StartThreadState {
+  void (*user_function)(void*);
+  void* arg;
+};
+
+void* StartThreadWrapper(void* arg) {
+  std::unique_ptr<StartThreadState> state(
+      reinterpret_cast<StartThreadState*>(arg));
+  state->user_function(state->arg);
+  return nullptr;
+}
+
+}  // namespace
+
+void WinEnvThreads::StartThread(void (*function)(void* arg), void* arg) {
+  std::unique_ptr<StartThreadState> state(new StartThreadState);
+  state->user_function = function;
+  state->arg = arg;
+  try {
+    Thread th(&StartThreadWrapper, state.get());
+    state.release();
+
+    std::lock_guard<std::mutex> lg(mu_);
+    threads_to_join_.push_back(std::move(th));
+
+  } catch (const std::system_error& ex) {
+    WinthreadCall("start thread", ex.code());
+  }
+}
+
+void WinEnvThreads::WaitForJoin() {
+  for (auto& th : threads_to_join_) {
+    th.join();
+  }
+  threads_to_join_.clear();
+}
+
+unsigned int WinEnvThreads::GetThreadPoolQueueLen(Env::Priority pri) const {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  return thread_pools_[pri].GetQueueLen();
+}
+
+int WinEnvThreads::ReserveThreads(int threads_to_reserved, Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  return thread_pools_[pri].ReserveThreads(threads_to_reserved);
+}
+
+int WinEnvThreads::ReleaseThreads(int threads_to_released, Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  return thread_pools_[pri].ReleaseThreads(threads_to_released);
+}
+
+uint64_t WinEnvThreads::gettid() {
+  uint64_t thread_id = GetCurrentThreadId();
+  return thread_id;
+}
+
+uint64_t WinEnvThreads::GetThreadID() const { return gettid(); }
+
+void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  thread_pools_[pri].SetBackgroundThreads(num);
+}
+
+int WinEnvThreads::GetBackgroundThreads(Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  return thread_pools_[pri].GetBackgroundThreads();
+}
+
+void WinEnvThreads::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) {
+  assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH);
+  thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
+}
+
+/////////////////////////////////////////////////////////////////////////
+// WinEnv
+
+WinEnv::WinEnv()
+    : CompositeEnv(WinFileSystem::Default(), WinClock::Default()),
+      winenv_io_(this),
+      winenv_threads_(this) {
+  // Protected member of the base class
+  thread_status_updater_ = CreateThreadStatusUpdater();
+}
+
+WinEnv::~WinEnv() {
+  // All threads must be joined before the deletion of
+  // thread_status_updater_.
+  delete thread_status_updater_;
+}
+
+Status WinEnv::GetThreadList(std::vector<ThreadStatus>* thread_list) {
+  assert(thread_status_updater_);
+  return thread_status_updater_->GetThreadList(thread_list);
+}
+
+Status WinEnv::GetHostName(char* name, uint64_t len) {
+  return winenv_io_.GetHostName(name, len);
+}
+
+void WinEnv::Schedule(void (*function)(void*), void* arg, Env::Priority pri,
+                      void* tag, void (*unschedFunction)(void* arg)) {
+  return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction);
+}
+
+int WinEnv::UnSchedule(void* arg, Env::Priority pri) {
+  return winenv_threads_.UnSchedule(arg, pri);
+}
+
+void WinEnv::StartThread(void (*function)(void* arg), void* arg) {
+  return winenv_threads_.StartThread(function, arg);
+}
+
+void WinEnv::WaitForJoin() { return winenv_threads_.WaitForJoin(); }
+
+unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const {
+  return winenv_threads_.GetThreadPoolQueueLen(pri);
+}
+int WinEnv::ReserveThreads(int threads_to_reserved, Env::Priority pri) {
+  return winenv_threads_.ReserveThreads(threads_to_reserved, pri);
+}
+
+int WinEnv::ReleaseThreads(int threads_to_released, Env::Priority pri) {
+  return winenv_threads_.ReleaseThreads(threads_to_released, pri);
+}
+
+uint64_t WinEnv::GetThreadID() const { return winenv_threads_.GetThreadID(); }
+
+// Allow increasing the number of worker threads.
+void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) {
+  return winenv_threads_.SetBackgroundThreads(num, pri);
+}
+
+int WinEnv::GetBackgroundThreads(Env::Priority pri) {
+  return winenv_threads_.GetBackgroundThreads(pri);
+}
+
+void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) {
+  return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri);
+}
+
+}  // namespace port
+
+std::shared_ptr<FileSystem> FileSystem::Default() {
+  return port::WinFileSystem::Default();
+}
+
+const std::shared_ptr<SystemClock>& SystemClock::Default() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<SystemClock>, clock)
+  (std::make_shared<port::WinClock>());
+  return clock;
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/env_win.h b/src/rocksdb/port/win/env_win.h
new file mode 100644
index 000000000..8fbfb8246
--- /dev/null
+++ b/src/rocksdb/port/win/env_win.h
@@ -0,0 +1,304 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine gain control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#pragma once
+#include <stdint.h>
+#include <windows.h>
+
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/system_clock.h"
+#include "util/threadpool_imp.h"
+
+#undef GetCurrentTime
+#undef DeleteFile
+#undef LoadLibrary
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// Currently not designed for inheritance but rather a replacement
+class WinEnvThreads {
+ public:
+  explicit WinEnvThreads(Env* hosted_env);
+
+  ~WinEnvThreads();
+
+  WinEnvThreads(const WinEnvThreads&) = delete;
+  WinEnvThreads& operator=(const WinEnvThreads&) = delete;
+
+  void Schedule(void (*function)(void*), void* arg, Env::Priority pri,
+                void* tag, void (*unschedFunction)(void* arg));
+
+  int UnSchedule(void* arg, Env::Priority pri);
+
+  void StartThread(void (*function)(void* arg), void* arg);
+
+  void WaitForJoin();
+
+  unsigned int GetThreadPoolQueueLen(Env::Priority pri) const;
+
+  int ReserveThreads(int threads_to_be_reserved, Env::Priority pri);
+
+  int ReleaseThreads(int threads_to_be_released, Env::Priority pri);
+
+  static uint64_t gettid();
+
+  uint64_t GetThreadID() const;
+
+  // Allow increasing the number of worker threads.
+  void SetBackgroundThreads(int num, Env::Priority pri);
+  int GetBackgroundThreads(Env::Priority pri);
+
+  void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri);
+
+ private:
+  Env* hosted_env_;
+  mutable std::mutex mu_;
+  std::vector<ThreadPoolImpl> thread_pools_;
+  std::vector<Thread> threads_to_join_;
+};
+
+class WinClock : public SystemClock {
+ public:
+  WinClock();
+  virtual ~WinClock() {}
+
+  static const char* kClassName() { return "WindowsClock"; }
+  const char* Name() const override { return kDefaultName(); }
+  const char* NickName() const override { return kClassName(); }
+
+  uint64_t NowMicros() override;
+
+  uint64_t NowNanos() override;
+
+  // 0 indicates not supported
+  uint64_t CPUMicros() override { return 0; }
+  void SleepForMicroseconds(int micros) override;
+
+  Status GetCurrentTime(int64_t* unix_time) override;
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time);
+
+  uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; }
+
+ private:
+  using FnGetSystemTimePreciseAsFileTime = VOID(WINAPI*)(LPFILETIME);
+
+  uint64_t perf_counter_frequency_;
+  uint64_t nano_seconds_per_period_;
+  FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
+};
+
+class WinFileSystem : public FileSystem {
+ public:
+  static const std::shared_ptr<WinFileSystem>& Default();
+  WinFileSystem(const std::shared_ptr<SystemClock>& clock);
+  ~WinFileSystem() {}
+  static const char* kClassName() { return "WinFS"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const { return kDefaultName(); }
+
+  static size_t GetSectorSize(const std::string& fname);
+  size_t GetPageSize() const { return page_size_; }
+  size_t GetAllocationGranularity() const { return allocation_granularity_; }
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  // Truncate the named file to the specified size.
+  IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override;
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& options,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* /*dbg*/) override;
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* dbg) override;
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override;
+
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+  IOStatus NewMemoryMappedFileBuffer(
+      const std::string& fname,
+      std::unique_ptr<MemoryMappedFileBuffer>* result) override;
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+  IOStatus FileExists(const std::string& f, const IOOptions& io_opts,
+                      IODebugContext* dbg) override;
+  IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+                       std::vector<std::string>* r,
+                       IODebugContext* dbg) override;
+  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  // Creates directory if missing. Return Ok if it exists, or successful in
+  // Creating.
+  IOStatus CreateDirIfMissing(const std::string& dirname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override;
+
+  // Delete the specified directory.
+  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+  // Store the size of fname in *file_size.
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override;
+  // Store the last modification time of fname in *file_mtime.
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& options,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* dbg) override;
+  // Rename file src to target.
+  IOStatus RenameFile(const std::string& src, const std::string& target,
+                      const IOOptions& options, IODebugContext* dbg) override;
+
+  // Hard Link file src to target.
+  IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override;
+  IOStatus NumFileLinks(const std::string& /*fname*/,
+                        const IOOptions& /*options*/, uint64_t* /*count*/,
+                        IODebugContext* /*dbg*/) override;
+  IOStatus AreFilesSame(const std::string& /*first*/,
+                        const std::string& /*second*/,
+                        const IOOptions& /*options*/, bool* /*res*/,
+                        IODebugContext* /*dbg*/) override;
+  IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                    FileLock** lock, IODebugContext* dbg) override;
+  IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+                      IODebugContext* dbg) override;
+  IOStatus GetTestDirectory(const IOOptions& options, std::string* path,
+                            IODebugContext* dbg) override;
+
+  // Create and returns a default logger (an instance of EnvLogger) for storing
+  // informational messages. Derived classes can override to provide custom
+  // logger.
+  IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* dbg) override;
+  // Get full directory name for this db.
+  IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options,
+                           std::string* output_path,
+                           IODebugContext* dbg) override;
+  IOStatus IsDirectory(const std::string& /*path*/, const IOOptions& options,
+                       bool* is_dir, IODebugContext* /*dgb*/) override;
+  // This seems to clash with a macro on Windows, so #undef it here
+#undef GetFreeSpace
+  IOStatus GetFreeSpace(const std::string& /*path*/,
+                        const IOOptions& /*options*/, uint64_t* /*diskfree*/,
+                        IODebugContext* /*dbg*/) override;
+  FileOptions OptimizeForLogWrite(const FileOptions& file_options,
+                                  const DBOptions& db_options) const override;
+  FileOptions OptimizeForManifestRead(
+      const FileOptions& file_options) const override;
+  FileOptions OptimizeForManifestWrite(
+      const FileOptions& file_options) const override;
+
+ protected:
+  static uint64_t FileTimeToUnixTime(const FILETIME& ftTime);
+  // Returns true iff the named directory exists and is a directory.
+
+  virtual bool DirExists(const std::string& dname);
+  // Helper for NewWritable and ReopenWritableFile
+  virtual IOStatus OpenWritableFile(const std::string& fname,
+                                    const FileOptions& options,
+                                    std::unique_ptr<FSWritableFile>* result,
+                                    bool reopen);
+
+ private:
+  std::shared_ptr<SystemClock> clock_;
+  size_t page_size_;
+  size_t allocation_granularity_;
+};
+
+// Designed for inheritance so can be re-used
+// but certain parts replaced
+class WinEnvIO {
+ public:
+  explicit WinEnvIO(Env* hosted_env);
+
+  virtual ~WinEnvIO();
+
+  virtual Status GetHostName(char* name, uint64_t len);
+
+ private:
+  Env* hosted_env_;
+};
+
+class WinEnv : public CompositeEnv {
+ public:
+  WinEnv();
+
+  ~WinEnv();
+  static const char* kClassName() { return "WinEnv"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kDefaultName(); }
+
+  Status GetHostName(char* name, uint64_t len) override;
+
+  Status GetThreadList(std::vector<ThreadStatus>* thread_list) override;
+
+  void Schedule(void (*function)(void*), void* arg, Env::Priority pri,
+                void* tag, void (*unschedFunction)(void* arg)) override;
+
+  int UnSchedule(void* arg, Env::Priority pri) override;
+
+  void StartThread(void (*function)(void* arg), void* arg) override;
+
+  void WaitForJoin() override;
+
+  unsigned int GetThreadPoolQueueLen(Env::Priority pri) const override;
+
+  int ReserveThreads(int threads_to_be_reserved, Env::Priority pri) override;
+
+  int ReleaseThreads(int threads_to_be_released, Env::Priority pri) override;
+
+  uint64_t GetThreadID() const override;
+
+  // Allow increasing the number of worker threads.
+  void SetBackgroundThreads(int num, Env::Priority pri) override;
+  int GetBackgroundThreads(Env::Priority pri) override;
+
+  void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override;
+
+ private:
+  WinEnvIO winenv_io_;
+  WinEnvThreads winenv_threads_;
+};
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/io_win.cc b/src/rocksdb/port/win/io_win.cc
new file mode 100644
index 000000000..4fa735518
--- /dev/null
+++ b/src/rocksdb/port/win/io_win.cc
@@ -0,0 +1,1101 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/io_win.h"
+
+#include "env_win.h"
+#include "monitoring/iostats_context_imp.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+/*
+ * DirectIOHelper
+ */
+namespace {
+
+const size_t kSectorSize = 512;
+
+inline bool IsPowerOfTwo(const size_t alignment) {
+  return ((alignment) & (alignment - 1)) == 0;
+}
+
+inline bool IsAligned(size_t alignment, const void* ptr) {
+  return ((uintptr_t(ptr)) & (alignment - 1)) == 0;
+}
+}  // namespace
+
+std::string GetWindowsErrSz(DWORD err) {
+  std::string Err;
+  LPSTR lpMsgBuf = nullptr;
+  FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+                     FORMAT_MESSAGE_IGNORE_INSERTS,
+                 NULL, err,
+                 0,  // Default language
+                 reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
+
+  if (lpMsgBuf) {
+    Err = lpMsgBuf;
+    LocalFree(lpMsgBuf);
+  }
+  return Err;
+}
+
+// We preserve the original name of this interface to denote the original idea
+// behind it.
+// All reads happen by a specified offset and pwrite interface does not change
+// the position of the file pointer. Judging from the man page and errno it does
+// execute
+// lseek atomically to return the position of the file back where it was.
+// WriteFile() does not
+// have this capability. Therefore, for both pread and pwrite the pointer is
+// advanced to the next position
+// which is fine for writes because they are (should be) sequential.
+// Because all the reads/writes happen by the specified offset, the caller in
+// theory should not
+// rely on the current file offset.
+IOStatus pwrite(const WinFileData* file_data, const Slice& data,
+                uint64_t offset, size_t& bytes_written) {
+  IOStatus s;
+  bytes_written = 0;
+
+  size_t num_bytes = data.size();
+  if (num_bytes > std::numeric_limits<DWORD>::max()) {
+    // May happen in 64-bit builds where size_t is 64-bits but
+    // long is still 32-bit, but that's the API here at the moment
+    return IOStatus::InvalidArgument(
+        "num_bytes is too large for a single write: " + file_data->GetName());
+  }
+
+  OVERLAPPED overlapped = {0};
+  ULARGE_INTEGER offsetUnion;
+  offsetUnion.QuadPart = offset;
+
+  overlapped.Offset = offsetUnion.LowPart;
+  overlapped.OffsetHigh = offsetUnion.HighPart;
+
+  DWORD bytesWritten = 0;
+
+  if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(),
+                         static_cast<DWORD>(num_bytes), &bytesWritten,
+                         &overlapped)) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
+                                lastError);
+  } else {
+    bytes_written = bytesWritten;
+  }
+
+  return s;
+}
+
+// See comments for pwrite above
+IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
+               uint64_t offset, size_t& bytes_read) {
+  IOStatus s;
+  bytes_read = 0;
+
+  if (num_bytes > std::numeric_limits<DWORD>::max()) {
+    return IOStatus::InvalidArgument(
+        "num_bytes is too large for a single read: " + file_data->GetName());
+  }
+
+  OVERLAPPED overlapped = {0};
+  ULARGE_INTEGER offsetUnion;
+  offsetUnion.QuadPart = offset;
+
+  overlapped.Offset = offsetUnion.LowPart;
+  overlapped.OffsetHigh = offsetUnion.HighPart;
+
+  DWORD bytesRead = 0;
+
+  if (FALSE == ReadFile(file_data->GetFileHandle(), src,
+                        static_cast<DWORD>(num_bytes), &bytesRead,
+                        &overlapped)) {
+    auto lastError = GetLastError();
+    // EOF is OK with zero bytes read
+    if (lastError != ERROR_HANDLE_EOF) {
+      s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
+                                  lastError);
+    }
+  } else {
+    bytes_read = bytesRead;
+  }
+
+  return s;
+}
+
+// SetFileInformationByHandle() is capable of fast pre-allocates.
+// However, this does not change the file end position unless the file is
+// truncated and the pre-allocated space is not considered filled with zeros.
+IOStatus fallocate(const std::string& filename, HANDLE hFile,
+                   uint64_t to_size) {
+  IOStatus status;
+
+  FILE_ALLOCATION_INFO alloc_info;
+  alloc_info.AllocationSize.QuadPart = to_size;
+
+  if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
+                                  sizeof(FILE_ALLOCATION_INFO))) {
+    auto lastError = GetLastError();
+    status = IOErrorFromWindowsError(
+        "Failed to pre-allocate space: " + filename, lastError);
+  }
+
+  return status;
+}
+
+IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) {
+  IOStatus status;
+
+  FILE_END_OF_FILE_INFO end_of_file;
+  end_of_file.EndOfFile.QuadPart = toSize;
+
+  if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
+                                  sizeof(FILE_END_OF_FILE_INFO))) {
+    auto lastError = GetLastError();
+    status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
+                                     lastError);
+  }
+
+  return status;
+}
+
+size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
+                           size_t /*max_size*/) {
+  // Returning 0 is safe as it causes the table reader to generate a unique ID.
+  // This is suboptimal for performance as it prevents multiple table readers
+  // for the same file from sharing cached blocks. For example, if users have
+  // a low value for `max_open_files`, there can be many table readers opened
+  // for the same file.
+  //
+  // TODO: this is a temporarily solution as it is safe but not optimal for
+  // performance. For more details see discussion in
+  // https://github.com/facebook/rocksdb/pull/5844.
+  return 0;
+}
+
+WinFileData::WinFileData(const std::string& filename, HANDLE hFile,
+                         bool direct_io)
+    : filename_(filename),
+      hFile_(hFile),
+      use_direct_io_(direct_io),
+      sector_size_(WinFileSystem::GetSectorSize(filename)) {}
+
+bool WinFileData::IsSectorAligned(const size_t off) const {
+  return (off & (sector_size_ - 1)) == 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// WinMmapReadableFile
+
+WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
+                                         HANDLE hFile, HANDLE hMap,
+                                         const void* mapped_region,
+                                         size_t length)
+    : WinFileData(fileName, hFile, false /* use_direct_io */),
+      hMap_(hMap),
+      mapped_region_(mapped_region),
+      length_(length) {}
+
+WinMmapReadableFile::~WinMmapReadableFile() {
+  BOOL ret __attribute__((__unused__));
+  ret = ::UnmapViewOfFile(mapped_region_);
+  assert(ret);
+
+  ret = ::CloseHandle(hMap_);
+  assert(ret);
+}
+
+IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n,
+                                   const IOOptions& /*options*/, Slice* result,
+                                   char* scratch,
+                                   IODebugContext* /*dbg*/) const {
+  IOStatus s;
+
+  if (offset > length_) {
+    *result = Slice();
+    return IOError(filename_, EINVAL);
+  } else if (offset + n > length_) {
+    n = length_ - static_cast<size_t>(offset);
+  }
+  *result = Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
+  return s;
+}
+
+IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
+  return IOStatus::OK();
+}
+
+size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
+  return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// WinMmapFile
+
+// Can only truncate or reserve to a sector size aligned if
+// used on files that are opened with Unbuffered I/O
+IOStatus WinMmapFile::TruncateFile(uint64_t toSize) {
+  return ftruncate(filename_, hFile_, toSize);
+}
+
+IOStatus WinMmapFile::UnmapCurrentRegion() {
+  IOStatus status;
+
+  if (mapped_begin_ != nullptr) {
+    if (!::UnmapViewOfFile(mapped_begin_)) {
+      status = IOErrorFromWindowsError(
+          "Failed to unmap file view: " + filename_, GetLastError());
+    }
+
+    // Move on to the next portion of the file
+    file_offset_ += view_size_;
+
+    // UnmapView automatically sends data to disk but not the metadata
+    // which is good and provides some equivalent of fdatasync() on Linux
+    // therefore, we donot need separate flag for metadata
+    mapped_begin_ = nullptr;
+    mapped_end_ = nullptr;
+    dst_ = nullptr;
+
+    last_sync_ = nullptr;
+    pending_sync_ = false;
+  }
+
+  return status;
+}
+
+IOStatus WinMmapFile::MapNewRegion(const IOOptions& options,
+                                   IODebugContext* dbg) {
+  IOStatus status;
+
+  assert(mapped_begin_ == nullptr);
+
+  size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
+
+  if (minDiskSize > reserved_size_) {
+    status = Allocate(file_offset_, view_size_, options, dbg);
+    if (!status.ok()) {
+      return status;
+    }
+  }
+
+  // Need to remap
+  if (hMap_ == NULL || reserved_size_ > mapping_size_) {
+    if (hMap_ != NULL) {
+      // Unmap the previous one
+      BOOL ret __attribute__((__unused__));
+      ret = ::CloseHandle(hMap_);
+      assert(ret);
+      hMap_ = NULL;
+    }
+
+    ULARGE_INTEGER mappingSize;
+    mappingSize.QuadPart = reserved_size_;
+
+    hMap_ = CreateFileMappingA(
+        hFile_,
+        NULL,                  // Security attributes
+        PAGE_READWRITE,        // There is not a write only mode for mapping
+        mappingSize.HighPart,  // Enable mapping the whole file but the actual
+        // amount mapped is determined by MapViewOfFile
+        mappingSize.LowPart,
+        NULL);  // Mapping name
+
+    if (NULL == hMap_) {
+      return IOErrorFromWindowsError(
+          "WindowsMmapFile failed to create file mapping for: " + filename_,
+          GetLastError());
+    }
+
+    mapping_size_ = reserved_size_;
+  }
+
+  ULARGE_INTEGER offset;
+  offset.QuadPart = file_offset_;
+
+  // View must begin at the granularity aligned offset
+  mapped_begin_ = reinterpret_cast<char*>(
+      MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
+                      view_size_, NULL));
+
+  if (!mapped_begin_) {
+    status = IOErrorFromWindowsError(
+        "WindowsMmapFile failed to map file view: " + filename_,
+        GetLastError());
+  } else {
+    mapped_end_ = mapped_begin_ + view_size_;
+    dst_ = mapped_begin_;
+    last_sync_ = mapped_begin_;
+    pending_sync_ = false;
+  }
+  return status;
+}
+
+IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
+  return fallocate(filename_, hFile_, spaceToReserve);
+}
+
+WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
+                         size_t page_size, size_t allocation_granularity,
+                         const FileOptions& options)
+    : WinFileData(fname, hFile, false),
+      FSWritableFile(options),
+      hMap_(NULL),
+      page_size_(page_size),
+      allocation_granularity_(allocation_granularity),
+      reserved_size_(0),
+      mapping_size_(0),
+      view_size_(0),
+      mapped_begin_(nullptr),
+      mapped_end_(nullptr),
+      dst_(nullptr),
+      last_sync_(nullptr),
+      file_offset_(0),
+      pending_sync_(false) {
+  // Allocation granularity must be obtained from GetSystemInfo() and must be
+  // a power of two.
+  assert(allocation_granularity > 0);
+  assert((allocation_granularity & (allocation_granularity - 1)) == 0);
+
+  assert(page_size > 0);
+  assert((page_size & (page_size - 1)) == 0);
+
+  // Only for memory mapped writes
+  assert(options.use_mmap_writes);
+
+  // View size must be both the multiple of allocation_granularity AND the
+  // page size and the granularity is usually a multiple of a page size.
+  const size_t viewSize =
+      32 * 1024;  // 32Kb similar to the Windows File Cache in buffered mode
+  view_size_ = Roundup(viewSize, allocation_granularity_);
+}
+
+WinMmapFile::~WinMmapFile() {
+  if (hFile_) {
+    this->Close(IOOptions(), nullptr);
+  }
+}
+
+IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options,
+                             IODebugContext* dbg) {
+  const char* src = data.data();
+  size_t left = data.size();
+
+  while (left > 0) {
+    assert(mapped_begin_ <= dst_);
+    size_t avail = mapped_end_ - dst_;
+
+    if (avail == 0) {
+      IOStatus s = UnmapCurrentRegion();
+      if (s.ok()) {
+        s = MapNewRegion(options, dbg);
+      }
+
+      if (!s.ok()) {
+        return s;
+      }
+    } else {
+      size_t n = std::min(left, avail);
+      memcpy(dst_, src, n);
+      dst_ += n;
+      src += n;
+      left -= n;
+      pending_sync_ = true;
+    }
+  }
+
+  // Now make sure that the last partial page is padded with zeros if needed
+  size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
+  if (bytesToPad > 0) {
+    memset(dst_, 0, bytesToPad);
+  }
+
+  return IOStatus::OK();
+}
+
+// Means Close() will properly take care of truncate
+// and it does not need any additional information
+IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/,
+                               IODebugContext* /*dbg*/) {
+  return IOStatus::OK();
+}
+
+IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) {
+  IOStatus s;
+
+  assert(NULL != hFile_);
+
+  // We truncate to the precise size so no
+  // uninitialized data at the end. SetEndOfFile
+  // which we use does not write zeros and it is good.
+  uint64_t targetSize = GetFileSize(options, dbg);
+
+  if (mapped_begin_ != nullptr) {
+    // Sync before unmapping to make sure everything
+    // is on disk and there is not a lazy writing
+    // so we are deterministic with the tests
+    Sync(options, dbg);
+    s = UnmapCurrentRegion();
+  }
+
+  if (NULL != hMap_) {
+    BOOL ret = ::CloseHandle(hMap_);
+    if (!ret && s.ok()) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+          "Failed to Close mapping for file: " + filename_, lastError);
+    }
+
+    hMap_ = NULL;
+  }
+
+  if (hFile_ != NULL) {
+    TruncateFile(targetSize);
+
+    BOOL ret = ::CloseHandle(hFile_);
+    hFile_ = NULL;
+
+    if (!ret && s.ok()) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+          "Failed to close file map handle: " + filename_, lastError);
+    }
+  }
+
+  return s;
+}
+
+IOStatus WinMmapFile::Flush(const IOOptions& /*options*/,
+                            IODebugContext* /*dbg*/) {
+  return IOStatus::OK();
+}
+
+// Flush only data
+IOStatus WinMmapFile::Sync(const IOOptions& /*options*/,
+                           IODebugContext* /*dbg*/) {
+  IOStatus s;
+
+  // Some writes occurred since last sync
+  if (dst_ > last_sync_) {
+    assert(mapped_begin_);
+    assert(dst_);
+    assert(dst_ > mapped_begin_);
+    assert(dst_ < mapped_end_);
+
+    size_t page_begin =
+        TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
+    size_t page_end =
+        TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
+
+    // Flush only the amount of that is a multiple of pages
+    if (!::FlushViewOfFile(mapped_begin_ + page_begin,
+                           (page_end - page_begin) + page_size_)) {
+      s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
+                                  GetLastError());
+    } else {
+      last_sync_ = dst_;
+    }
+  }
+
+  return s;
+}
+
+/**
+ * Flush data as well as metadata to stable storage.
+ */
+IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
+  IOStatus s = Sync(options, dbg);
+
+  // Flush metadata
+  if (s.ok() && pending_sync_) {
+    if (!::FlushFileBuffers(hFile_)) {
+      s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
+                                  GetLastError());
+    }
+    pending_sync_ = false;
+  }
+
+  return s;
+}
+
+/**
+ * Get the size of valid data in the file. This will not match the
+ * size that is returned from the filesystem because we use mmap
+ * to extend file by map_size every time.
+ */
+uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/,
+                                  IODebugContext* /*dbg*/) {
+  size_t used = dst_ - mapped_begin_;
+  return file_offset_ + used;
+}
+
+IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) {
+  return IOStatus::OK();
+}
+
+IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len,
+                               const IOOptions& /*options*/,
+                               IODebugContext* /*dbg*/) {
+  IOStatus status;
+  TEST_KILL_RANDOM("WinMmapFile::Allocate");
+
+  // Make sure that we reserve an aligned amount of space
+  // since the reservation block size is driven outside so we want
+  // to check if we are ok with reservation here
+  size_t spaceToReserve =
+      Roundup(static_cast<size_t>(offset + len), view_size_);
+  // Nothing to do
+  if (spaceToReserve <= reserved_size_) {
+    return status;
+  }
+
+  IOSTATS_TIMER_GUARD(allocate_nanos);
+  status = PreallocateInternal(spaceToReserve);
+  if (status.ok()) {
+    reserved_size_ = spaceToReserve;
+  }
+  return status;
+}
+
+size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
+  return GetUniqueIdFromFile(hFile_, id, max_size);
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+// WinSequentialFile
+
+WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
+                                     const FileOptions& options)
+    : WinFileData(fname, f, options.use_direct_reads) {}
+
+WinSequentialFile::~WinSequentialFile() {
+  assert(hFile_ != INVALID_HANDLE_VALUE);
+}
+
+IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
+                                 Slice* result, char* scratch,
+                                 IODebugContext* /*dbg*/) {
+  IOStatus s;
+  size_t r = 0;
+
+  assert(result != nullptr);
+  if (WinFileData::use_direct_io()) {
+    return IOStatus::NotSupported("Read() does not support direct_io");
+  }
+
+  // Windows ReadFile API accepts a DWORD.
+  // While it is possible to read in a loop if n is too big
+  // it is an unlikely case.
+  if (n > std::numeric_limits<DWORD>::max()) {
+    return IOStatus::InvalidArgument("n is too big for a single ReadFile: " +
+                                     filename_);
+  }
+
+  DWORD bytesToRead =
+      static_cast<DWORD>(n);  // cast is safe due to the check above
+  DWORD bytesRead = 0;
+  BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
+  if (ret != FALSE) {
+    r = bytesRead;
+  } else {
+    auto lastError = GetLastError();
+    if (lastError != ERROR_HANDLE_EOF) {
+      s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError);
+    }
+  }
+
+  *result = Slice(scratch, r);
+  return s;
+}
+
+IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
+                                                   uint64_t offset,
+                                                   size_t& bytes_read) const {
+  return pread(this, src, numBytes, offset, bytes_read);
+}
+
+IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n,
+                                           const IOOptions& /*opts*/,
+                                           Slice* result, char* scratch,
+                                           IODebugContext* /*dbg*/) {
+  if (!WinFileData::use_direct_io()) {
+    return IOStatus::NotSupported("This function is only used for direct_io");
+  }
+
+  assert(IsSectorAligned(static_cast<size_t>(offset)));
+  assert(IsSectorAligned(static_cast<size_t>(n)));
+
+  size_t bytes_read = 0;  // out param
+  IOStatus s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset,
+                                      bytes_read);
+  *result = Slice(scratch, bytes_read);
+  return s;
+}
+
+IOStatus WinSequentialFile::Skip(uint64_t n) {
+  // Can't handle more than signed max as SetFilePointerEx accepts a signed
+  // 64-bit integer. As such it is a highly unlikley case to have n so large.
+  if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
+    return IOStatus::InvalidArgument(
+        "n is too large for a single SetFilePointerEx() call" + filename_);
+  }
+
+  LARGE_INTEGER li;
+  li.QuadPart = static_cast<LONGLONG>(n);  // cast is safe due to the check
+                                           // above
+  BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
+  if (ret == FALSE) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
+                                   lastError);
+  }
+  return IOStatus::OK();
+}
+
+IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
+  return IOStatus::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+/// WinRandomAccessBase
+
+inline IOStatus WinRandomAccessImpl::PositionedReadInternal(
+    char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const {
+  return pread(file_base_, src, numBytes, offset, bytes_read);
+}
+
+inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
+                                                size_t alignment,
+                                                const FileOptions& options)
+    : file_base_(file_base),
+      alignment_(std::max(alignment, file_base->GetSectorSize())) {
+  assert(!options.use_mmap_reads);
+}
+
+inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n,
+                                              Slice* result,
+                                              char* scratch) const {
+  // Check buffer alignment
+  if (file_base_->use_direct_io()) {
+    assert(file_base_->IsSectorAligned(static_cast<size_t>(offset)));
+    assert(IsAligned(alignment_, scratch));
+  }
+
+  if (n == 0) {
+    *result = Slice(scratch, 0);
+    return IOStatus::OK();
+  }
+
+  size_t bytes_read = 0;
+  IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read);
+  *result = Slice(scratch, bytes_read);
+  return s;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/// WinRandomAccessFile
+
+WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
+                                         size_t alignment,
+                                         const FileOptions& options)
+    : WinFileData(fname, hFile, options.use_direct_reads),
+      WinRandomAccessImpl(this, alignment, options) {}
+
+WinRandomAccessFile::~WinRandomAccessFile() {}
+
+IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n,
+                                   const IOOptions& /*options*/, Slice* result,
+                                   char* scratch,
+                                   IODebugContext* /*dbg*/) const {
+  return ReadImpl(offset, n, result, scratch);
+}
+
+IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
+  return IOStatus::OK();
+}
+
+size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+  return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
+}
+
+size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
+  return GetAlignment();
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// WinWritableImpl
+//
+
+inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
+  return fallocate(file_data_->GetName(), file_data_->GetFileHandle(),
+                   spaceToReserve);
+}
+
+inline WinWritableImpl::WinWritableImpl(WinFileData* file_data,
+                                        size_t alignment)
+    : file_data_(file_data),
+      alignment_(std::max(alignment, file_data->GetSectorSize())),
+      next_write_offset_(0),
+      reservedsize_(0) {
+  // Query current position in case ReopenWritableFile is called
+  // This position is only important for buffered writes
+  // for unbuffered writes we explicitely specify the position.
+  LARGE_INTEGER zero_move;
+  zero_move.QuadPart = 0;  // Do not move
+  LARGE_INTEGER pos;
+  pos.QuadPart = 0;
+  BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
+                              FILE_CURRENT);
+  // Querying no supped to fail
+  if (ret != 0) {
+    next_write_offset_ = pos.QuadPart;
+  } else {
+    assert(false);
+  }
+}
+
+inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) {
+  IOStatus s;
+
+  if (data.size() > std::numeric_limits<DWORD>::max()) {
+    return IOStatus::InvalidArgument("data is too long for a single write" +
+                                     file_data_->GetName());
+  }
+
+  size_t bytes_written = 0;  // out param
+
+  if (file_data_->use_direct_io()) {
+    // With no offset specified we are appending
+    // to the end of the file
+    assert(file_data_->IsSectorAligned(next_write_offset_));
+    assert(file_data_->IsSectorAligned(data.size()));
+    assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
+    s = pwrite(file_data_, data, next_write_offset_, bytes_written);
+  } else {
+    DWORD bytesWritten = 0;
+    if (!WriteFile(file_data_->GetFileHandle(), data.data(),
+                   static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+          "Failed to WriteFile: " + file_data_->GetName(), lastError);
+    } else {
+      bytes_written = bytesWritten;
+    }
+  }
+
+  if (s.ok()) {
+    if (bytes_written == data.size()) {
+      // This matters for direct_io cases where
+      // we rely on the fact that next_write_offset_
+      // is sector aligned
+      next_write_offset_ += bytes_written;
+    } else {
+      s = IOStatus::IOError("Failed to write all bytes: " +
+                            file_data_->GetName());
+    }
+  }
+
+  return s;
+}
+
+inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data,
+                                                      uint64_t offset) {
+  if (file_data_->use_direct_io()) {
+    assert(file_data_->IsSectorAligned(static_cast<size_t>(offset)));
+    assert(file_data_->IsSectorAligned(data.size()));
+    assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
+  }
+
+  size_t bytes_written = 0;
+  IOStatus s = pwrite(file_data_, data, offset, bytes_written);
+
+  if (s.ok()) {
+    if (bytes_written == data.size()) {
+      // For sequential write this would be simple
+      // size extension by data.size()
+      uint64_t write_end = offset + bytes_written;
+      if (write_end >= next_write_offset_) {
+        next_write_offset_ = write_end;
+      }
+    } else {
+      s = IOStatus::IOError("Failed to write all of the requested data: " +
+                            file_data_->GetName());
+    }
+  }
+  return s;
+}
+
+inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) {
+  // It is tempting to check for the size for sector alignment
+  // but truncation may come at the end and there is not a requirement
+  // for this to be sector aligned so long as we do not attempt to write
+  // after that. The interface docs state that the behavior is undefined
+  // in that case.
+  IOStatus s =
+      ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size);
+
+  if (s.ok()) {
+    next_write_offset_ = size;
+  }
+  return s;
+}
+
+inline IOStatus WinWritableImpl::CloseImpl() {
+  IOStatus s;
+
+  auto hFile = file_data_->GetFileHandle();
+  assert(INVALID_HANDLE_VALUE != hFile);
+
+  if (!::FlushFileBuffers(hFile)) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError(
+        "FlushFileBuffers failed at Close() for: " + file_data_->GetName(),
+        lastError);
+  }
+
+  if (!file_data_->CloseFile() && s.ok()) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError(
+        "CloseHandle failed for: " + file_data_->GetName(), lastError);
+  }
+  return s;
+}
+
+inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/,
+                                          IODebugContext* /*dbg*/) {
+  IOStatus s;
+  if (!::FlushFileBuffers(file_data_->GetFileHandle())) {
+    auto lastError = GetLastError();
+    s = IOErrorFromWindowsError(
+        "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(),
+        lastError);
+  }
+  return s;
+}
+
+inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
+  IOStatus status;
+  TEST_KILL_RANDOM("WinWritableFile::Allocate");
+
+  // Make sure that we reserve an aligned amount of space
+  // since the reservation block size is driven outside so we want
+  // to check if we are ok with reservation here
+  size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len),
+                                  static_cast<size_t>(alignment_));
+  // Nothing to do
+  if (spaceToReserve <= reservedsize_) {
+    return status;
+  }
+
+  IOSTATS_TIMER_GUARD(allocate_nanos);
+  status = PreallocateInternal(spaceToReserve);
+  if (status.ok()) {
+    reservedsize_ = spaceToReserve;
+  }
+  return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// WinWritableFile
+
+WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
+                                 size_t alignment, size_t /* capacity */,
+                                 const FileOptions& options)
+    : WinFileData(fname, hFile, options.use_direct_writes),
+      WinWritableImpl(this, alignment),
+      FSWritableFile(options) {
+  assert(!options.use_mmap_writes);
+}
+
+WinWritableFile::~WinWritableFile() {}
+
+// Indicates if the class makes use of direct I/O
+bool WinWritableFile::use_direct_io() const {
+  return WinFileData::use_direct_io();
+}
+
+size_t WinWritableFile::GetRequiredBufferAlignment() const {
+  return static_cast<size_t>(GetAlignment());
+}
+
+IOStatus WinWritableFile::Append(const Slice& data,
+                                 const IOOptions& /*options*/,
+                                 IODebugContext* /*dbg*/) {
+  return AppendImpl(data);
+}
+
+IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
+                                           const IOOptions& /*options*/,
+                                           IODebugContext* /*dbg*/) {
+  return PositionedAppendImpl(data, offset);
+}
+
+// Need to implement this so the file is truncated correctly
+// when buffered and unbuffered mode
+IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/,
+                                   IODebugContext* /*dbg*/) {
+  return TruncateImpl(size);
+}
+
+IOStatus WinWritableFile::Close(const IOOptions& /*options*/,
+                                IODebugContext* /*dbg*/) {
+  return CloseImpl();
+}
+
+// write out the cached data to the OS cache
+// This is now taken care of the WritableFileWriter
+IOStatus WinWritableFile::Flush(const IOOptions& /*options*/,
+                                IODebugContext* /*dbg*/) {
+  return IOStatus::OK();
+}
+
+IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) {
+  return SyncImpl(options, dbg);
+}
+
+IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
+  return SyncImpl(options, dbg);
+}
+
+bool WinWritableFile::IsSyncThreadSafe() const { return true; }
+
+uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/,
+                                      IODebugContext* /*dbg*/) {
+  return GetFileNextWriteOffset();
+}
+
+IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len,
+                                   const IOOptions& /*options*/,
+                                   IODebugContext* /*dbg*/) {
+  return AllocateImpl(offset, len);
+}
+
+size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
+  return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
+}
+
+/////////////////////////////////////////////////////////////////////////
+/// WinRandomRWFile
+
+WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
+                                 size_t alignment, const FileOptions& options)
+    : WinFileData(fname, hFile,
+                  options.use_direct_reads && options.use_direct_writes),
+      WinRandomAccessImpl(this, alignment, options),
+      WinWritableImpl(this, alignment) {}
+
+bool WinRandomRWFile::use_direct_io() const {
+  return WinFileData::use_direct_io();
+}
+
+size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
+  assert(WinRandomAccessImpl::GetAlignment() ==
+         WinWritableImpl::GetAlignment());
+  return static_cast<size_t>(WinRandomAccessImpl::GetAlignment());
+}
+
+IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data,
+                                const IOOptions& /*options*/,
+                                IODebugContext* /*dbg*/) {
+  return PositionedAppendImpl(data, offset);
+}
+
+IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n,
+                               const IOOptions& /*options*/, Slice* result,
+                               char* scratch, IODebugContext* /*dbg*/) const {
+  return ReadImpl(offset, n, result, scratch);
+}
+
+IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/,
+                                IODebugContext* /*dbg*/) {
+  return IOStatus::OK();
+}
+
+IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) {
+  return SyncImpl(options, dbg);
+}
+
+IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/,
+                                IODebugContext* /*dbg*/) {
+  return CloseImpl();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// WinMemoryMappedBufer
+WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
+  BOOL ret
+#if defined(_MSC_VER)
+      = FALSE;
+#else
+      __attribute__((__unused__));
+#endif
+  if (base_ != nullptr) {
+    ret = ::UnmapViewOfFile(base_);
+    assert(ret);
+    base_ = nullptr;
+  }
+  if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
+    ret = ::CloseHandle(map_handle_);
+    assert(ret);
+    map_handle_ = NULL;
+  }
+  if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
+    ret = ::CloseHandle(file_handle_);
+    assert(ret);
+    file_handle_ = NULL;
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// WinDirectory
+
+IOStatus WinDirectory::Fsync(const IOOptions& /*options*/,
+                             IODebugContext* /*dbg*/) {
+  return IOStatus::OK();
+}
+
+IOStatus WinDirectory::Close(const IOOptions& /*options*/,
+                             IODebugContext* /*dbg*/) {
+  IOStatus s = IOStatus::OK();
+  BOOL ret __attribute__((__unused__));
+  if (handle_ != INVALID_HANDLE_VALUE) {
+    ret = ::CloseHandle(handle_);
+    if (!ret) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("Directory closes failed for : " + GetName(),
+                                  lastError);
+    }
+    handle_ = NULL;
+  }
+  return s;
+}
+
+size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
+  return GetUniqueIdFromFile(handle_, id, max_size);
+}
+//////////////////////////////////////////////////////////////////////////
+/// WinFileLock
+
+WinFileLock::~WinFileLock() {
+  BOOL ret __attribute__((__unused__));
+  ret = ::CloseHandle(hFile_);
+  assert(ret);
+}
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/io_win.h b/src/rocksdb/port/win/io_win.h
new file mode 100644
index 000000000..a4fee8346
--- /dev/null
+++ b/src/rocksdb/port/win/io_win.h
@@ -0,0 +1,508 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <stdint.h>
+#include <windows.h>
+
+#include <mutex>
+#include <string>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+#include "util/aligned_buffer.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+std::string GetWindowsErrSz(DWORD err);
+
+inline IOStatus IOErrorFromWindowsError(const std::string& context, DWORD err) {
+  return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL))
+             ? IOStatus::NoSpace(context, GetWindowsErrSz(err))
+         : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND))
+             ? IOStatus::PathNotFound(context, GetWindowsErrSz(err))
+             : IOStatus::IOError(context, GetWindowsErrSz(err));
+}
+
+inline IOStatus IOErrorFromLastWindowsError(const std::string& context) {
+  return IOErrorFromWindowsError(context, GetLastError());
+}
+
+inline IOStatus IOError(const std::string& context, int err_number) {
+  return (err_number == ENOSPC)
+             ? IOStatus::NoSpace(context, errnoStr(err_number).c_str())
+         : (err_number == ENOENT)
+             ? IOStatus::PathNotFound(context, errnoStr(err_number).c_str())
+             : IOStatus::IOError(context, errnoStr(err_number).c_str());
+}
+
+class WinFileData;
+
+IOStatus pwrite(const WinFileData* file_data, const Slice& data,
+                uint64_t offset, size_t& bytes_written);
+
+IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
+               uint64_t offset, size_t& bytes_read);
+
+IOStatus fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size);
+
+IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize);
+
+size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size);
+
+class WinFileData {
+ protected:
+  const std::string filename_;
+  HANDLE hFile_;
+  // If true, the I/O issued would be direct I/O which the buffer
+  // will need to be aligned (not sure there is a guarantee that the buffer
+  // passed in is aligned).
+  const bool use_direct_io_;
+  const size_t sector_size_;
+
+ public:
+  // We want this class be usable both for inheritance (prive
+  // or protected) and for containment so __ctor and __dtor public
+  WinFileData(const std::string& filename, HANDLE hFile, bool direct_io);
+
+  virtual ~WinFileData() { this->CloseFile(); }
+
+  bool CloseFile() {
+    bool result = true;
+
+    if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
+      result = ::CloseHandle(hFile_);
+      assert(result);
+      hFile_ = NULL;
+    }
+    return result;
+  }
+
+  const std::string& GetName() const { return filename_; }
+
+  HANDLE GetFileHandle() const { return hFile_; }
+
+  bool use_direct_io() const { return use_direct_io_; }
+
+  size_t GetSectorSize() const { return sector_size_; }
+
+  bool IsSectorAligned(const size_t off) const;
+
+  WinFileData(const WinFileData&) = delete;
+  WinFileData& operator=(const WinFileData&) = delete;
+};
+
+class WinSequentialFile : protected WinFileData, public FSSequentialFile {
+  // Override for behavior change when creating a custom env
+  virtual IOStatus PositionedReadInternal(char* src, size_t numBytes,
+                                          uint64_t offset,
+                                          size_t& bytes_read) const;
+
+ public:
+  WinSequentialFile(const std::string& fname, HANDLE f,
+                    const FileOptions& options);
+
+  ~WinSequentialFile();
+
+  WinSequentialFile(const WinSequentialFile&) = delete;
+  WinSequentialFile& operator=(const WinSequentialFile&) = delete;
+
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override;
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override;
+
+  IOStatus Skip(uint64_t n) override;
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  virtual bool use_direct_io() const override {
+    return WinFileData::use_direct_io();
+  }
+};
+
+// mmap() based random-access
+class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile {
+  HANDLE hMap_;
+
+  const void* mapped_region_;
+  const size_t length_;
+
+ public:
+  // mapped_region_[0,length-1] contains the mmapped contents of the file.
+  WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap,
+                      const void* mapped_region, size_t length);
+
+  ~WinMmapReadableFile();
+
+  WinMmapReadableFile(const WinMmapReadableFile&) = delete;
+  WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete;
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  virtual IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+};
+
+// We preallocate and use memcpy to append new
+// data to the file.  This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
+class WinMmapFile : private WinFileData, public FSWritableFile {
+ private:
+  HANDLE hMap_;
+
+  const size_t page_size_;  // We flush the mapping view in page_size
+  // increments. We may decide if this is a memory
+  // page size or SSD page size
+  const size_t
+      allocation_granularity_;  // View must start at such a granularity
+
+  size_t reserved_size_;  // Preallocated size
+
+  size_t mapping_size_;  // The max size of the mapping object
+  // we want to guess the final file size to minimize the remapping
+  size_t view_size_;  // How much memory to map into a view at a time
+
+  char* mapped_begin_;  // Must begin at the file offset that is aligned with
+  // allocation_granularity_
+  char* mapped_end_;
+  char* dst_;  // Where to write next  (in range [mapped_begin_,mapped_end_])
+  char* last_sync_;  // Where have we synced up to
+
+  uint64_t file_offset_;  // Offset of mapped_begin_ in file
+
+  // Do we have unsynced writes?
+  bool pending_sync_;
+
+  // Can only truncate or reserve to a sector size aligned if
+  // used on files that are opened with Unbuffered I/O
+  IOStatus TruncateFile(uint64_t toSize);
+
+  IOStatus UnmapCurrentRegion();
+
+  IOStatus MapNewRegion(const IOOptions& options, IODebugContext* dbg);
+
+  virtual IOStatus PreallocateInternal(uint64_t spaceToReserve);
+
+ public:
+  WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
+              size_t allocation_granularity, const FileOptions& options);
+
+  ~WinMmapFile();
+
+  WinMmapFile(const WinMmapFile&) = delete;
+  WinMmapFile& operator=(const WinMmapFile&) = delete;
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override;
+  IOStatus Append(const Slice& data, const IOOptions& opts,
+                  const DataVerificationInfo& /* verification_info */,
+                  IODebugContext* dbg) override {
+    return Append(data, opts, dbg);
+  }
+
+  // Means Close() will properly take care of truncate
+  // and it does not need any additional information
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  // Flush only data
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  /**
+   * Flush data as well as metadata to stable storage.
+   */
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+  /**
+   * Get the size of valid data in the file. This will not match the
+   * size that is returned from the filesystem because we use mmap
+   * to extend file by map_size every time.
+   */
+  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+};
+
+class WinRandomAccessImpl {
+ protected:
+  WinFileData* file_base_;
+  size_t alignment_;
+
+  // Override for behavior change when creating a custom env
+  virtual IOStatus PositionedReadInternal(char* src, size_t numBytes,
+                                          uint64_t offset,
+                                          size_t& bytes_read) const;
+
+  WinRandomAccessImpl(WinFileData* file_base, size_t alignment,
+                      const FileOptions& options);
+
+  virtual ~WinRandomAccessImpl() {}
+
+  IOStatus ReadImpl(uint64_t offset, size_t n, Slice* result,
+                    char* scratch) const;
+
+  size_t GetAlignment() const { return alignment_; }
+
+ public:
+  WinRandomAccessImpl(const WinRandomAccessImpl&) = delete;
+  WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete;
+};
+
+// pread() based random-access
+class WinRandomAccessFile
+    : private WinFileData,
+      protected WinRandomAccessImpl,  // Want to be able to override
+                                      // PositionedReadInternal
+      public FSRandomAccessFile {
+ public:
+  WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
+                      const FileOptions& options);
+
+  ~WinRandomAccessFile();
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+
+  virtual bool use_direct_io() const override {
+    return WinFileData::use_direct_io();
+  }
+
+  IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  virtual size_t GetRequiredBufferAlignment() const override;
+};
+
+// This is a sequential write class. It has been mimicked (as others) after
+// the original Posix class. We add support for unbuffered I/O on windows as
+// well
+// we utilize the original buffer as an alignment buffer to write directly to
+// file with no buffering.
+// No buffering requires that the provided buffer is aligned to the physical
+// sector size (SSD page size) and
+// that all SetFilePointer() operations to occur with such an alignment.
+// We thus always write in sector/page size increments to the drive and leave
+// the tail for the next write OR for Close() at which point we pad with zeros.
+// No padding is required for
+// buffered access.
+class WinWritableImpl {
+ protected:
+  WinFileData* file_data_;
+  const uint64_t alignment_;
+  uint64_t
+      next_write_offset_;  // Needed because Windows does not support O_APPEND
+  uint64_t reservedsize_;  // how far we have reserved space
+
+  virtual IOStatus PreallocateInternal(uint64_t spaceToReserve);
+
+  WinWritableImpl(WinFileData* file_data, size_t alignment);
+
+  ~WinWritableImpl() {}
+
+  uint64_t GetAlignment() const { return alignment_; }
+
+  IOStatus AppendImpl(const Slice& data);
+
+  // Requires that the data is aligned as specified by
+  // GetRequiredBufferAlignment()
+  IOStatus PositionedAppendImpl(const Slice& data, uint64_t offset);
+
+  IOStatus TruncateImpl(uint64_t size);
+
+  IOStatus CloseImpl();
+
+  IOStatus SyncImpl(const IOOptions& options, IODebugContext* dbg);
+
+  uint64_t GetFileNextWriteOffset() {
+    // Double accounting now here with WritableFileWriter
+    // and this size will be wrong when unbuffered access is used
+    // but tests implement their own writable files and do not use
+    // WritableFileWrapper
+    // so we need to squeeze a square peg through
+    // a round hole here.
+    return next_write_offset_;
+  }
+
+  IOStatus AllocateImpl(uint64_t offset, uint64_t len);
+
+ public:
+  WinWritableImpl(const WinWritableImpl&) = delete;
+  WinWritableImpl& operator=(const WinWritableImpl&) = delete;
+};
+
+class WinWritableFile : private WinFileData,
+                        protected WinWritableImpl,
+                        public FSWritableFile {
+ public:
+  WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
+                  size_t capacity, const FileOptions& options);
+
+  ~WinWritableFile();
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override;
+  IOStatus Append(const Slice& data, const IOOptions& opts,
+                  const DataVerificationInfo& /* verification_info */,
+                  IODebugContext* dbg) override {
+    return Append(data, opts, dbg);
+  }
+
+  // Requires that the data is aligned as specified by
+  // GetRequiredBufferAlignment()
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& opts,
+                            const DataVerificationInfo& /* verification_info */,
+                            IODebugContext* dbg) override {
+    return PositionedAppend(data, offset, opts, dbg);
+  }
+
+  // Need to implement this so the file is truncated correctly
+  // when buffered and unbuffered mode
+  IOStatus Truncate(uint64_t size, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+  // write out the cached data to the OS cache
+  // This is now taken care of the WritableFileWriter
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+
+  virtual bool IsSyncThreadSafe() const override;
+
+  // Indicates if the class makes use of direct I/O
+  // Use PositionedAppend
+  virtual bool use_direct_io() const override;
+
+  virtual size_t GetRequiredBufferAlignment() const override;
+
+  uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
+};
+
+class WinRandomRWFile : private WinFileData,
+                        protected WinRandomAccessImpl,
+                        protected WinWritableImpl,
+                        public FSRandomRWFile {
+ public:
+  WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment,
+                  const FileOptions& options);
+
+  ~WinRandomRWFile() {}
+
+  // Indicates if the class makes use of direct I/O
+  // If false you must pass aligned buffer to Write()
+  virtual bool use_direct_io() const override;
+
+  // Use the returned alignment value to allocate aligned
+  // buffer for Write() when use_direct_io() returns true
+  virtual size_t GetRequiredBufferAlignment() const override;
+
+  // Write bytes in `data` at  offset `offset`, Returns Status::OK() on success.
+  // Pass aligned buffer when use_direct_io() returns true.
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override;
+
+  // Read up to `n` bytes starting from offset `offset` and store them in
+  // result, provided `scratch` size should be at least `n`.
+  // Returns Status::OK() on success.
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    return Sync(options, dbg);
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+};
+
+class WinMemoryMappedBuffer : public MemoryMappedFileBuffer {
+ private:
+  HANDLE file_handle_;
+  HANDLE map_handle_;
+
+ public:
+  WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base,
+                        size_t size)
+      : MemoryMappedFileBuffer(base, size),
+        file_handle_(file_handle),
+        map_handle_(map_handle) {}
+  ~WinMemoryMappedBuffer() override;
+};
+
+class WinDirectory : public FSDirectory {
+  const std::string filename_;
+  HANDLE handle_;
+
+ public:
+  explicit WinDirectory(const std::string& filename, HANDLE h) noexcept
+      : filename_(filename), handle_(h) {
+    assert(handle_ != INVALID_HANDLE_VALUE);
+  }
+  ~WinDirectory() {
+    if (handle_ != NULL) {
+      IOStatus s = WinDirectory::Close(IOOptions(), nullptr);
+      s.PermitUncheckedError();
+    }
+  }
+  const std::string& GetName() const { return filename_; }
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+
+  size_t GetUniqueId(char* id, size_t max_size) const override;
+};
+
+class WinFileLock : public FileLock {
+ public:
+  explicit WinFileLock(HANDLE hFile) : hFile_(hFile) {
+    assert(hFile != NULL);
+    assert(hFile != INVALID_HANDLE_VALUE);
+  }
+
+  ~WinFileLock();
+
+ private:
+  HANDLE hFile_;
+};
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/port_win.cc b/src/rocksdb/port/win/port_win.cc
new file mode 100644
index 000000000..37e8f655c
--- /dev/null
+++ b/src/rocksdb/port/win/port_win.cc
@@ -0,0 +1,303 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/port_win.h"
+
+#include <assert.h>
+#include <io.h>
+#include <rpc.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <chrono>
+#include <cstdlib>
+#include <exception>
+#include <memory>
+
+#include "port/port_dirent.h"
+#include "port/sys_time.h"
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+// utf8 <-> utf16
+#include <codecvt>
+#include <locale>
+#include <string>
+#endif
+
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const bool kDefaultToAdaptiveMutex = false;
+
+namespace port {
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+std::string utf16_to_utf8(const std::wstring& utf16) {
+  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> convert;
+  return convert.to_bytes(utf16);
+}
+
+std::wstring utf8_to_utf16(const std::string& utf8) {
+  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+  return converter.from_bytes(utf8);
+}
+#endif
+
+void GetTimeOfDay(TimeVal* tv, struct timezone* /* tz */) {
+  std::chrono::microseconds usNow(
+      std::chrono::duration_cast<std::chrono::microseconds>(
+          std::chrono::system_clock::now().time_since_epoch()));
+
+  std::chrono::seconds secNow(
+      std::chrono::duration_cast<std::chrono::seconds>(usNow));
+
+  tv->tv_sec = static_cast<long>(secNow.count());
+  tv->tv_usec = static_cast<long>(
+      usNow.count() -
+      std::chrono::duration_cast<std::chrono::microseconds>(secNow).count());
+}
+
+Mutex::~Mutex() {}
+
+CondVar::~CondVar() {}
+
+void CondVar::Wait() {
+  // Caller must ensure that mutex is held prior to calling this method
+  std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  cv_.wait(lk);
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+  // Release ownership of the lock as we don't want it to be unlocked when
+  // it goes out of scope (as we adopted the lock and didn't lock it ourselves)
+  lk.release();
+}
+
+bool CondVar::TimedWait(uint64_t abs_time_us) {
+  // MSVC++ library implements wait_until in terms of wait_for so
+  // we need to convert absolute wait into relative wait.
+  std::chrono::microseconds usAbsTime(abs_time_us);
+
+  std::chrono::microseconds usNow(
+      std::chrono::duration_cast<std::chrono::microseconds>(
+          std::chrono::system_clock::now().time_since_epoch()));
+  std::chrono::microseconds relTimeUs = (usAbsTime > usNow)
+                                            ? (usAbsTime - usNow)
+                                            : std::chrono::microseconds::zero();
+
+  // Caller must ensure that mutex is held prior to calling this method
+  std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
+
+  // Work around https://github.com/microsoft/STL/issues/369
+#if defined(_MSC_VER) && \
+    (!defined(_MSVC_STL_UPDATE) || _MSVC_STL_UPDATE < 202008L)
+  if (relTimeUs == std::chrono::microseconds::zero()) {
+    lk.unlock();
+    lk.lock();
+  }
+#endif
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  std::cv_status cvStatus = cv_.wait_for(lk, relTimeUs);
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+  // Release ownership of the lock as we don't want it to be unlocked when
+  // it goes out of scope (as we adopted the lock and didn't lock it ourselves)
+  lk.release();
+
+  if (cvStatus == std::cv_status::timeout) {
+    return true;
+  }
+
+  return false;
+}
+
+void CondVar::Signal() { cv_.notify_one(); }
+
+void CondVar::SignalAll() { cv_.notify_all(); }
+
+int PhysicalCoreID() { return GetCurrentProcessorNumber(); }
+
+void InitOnce(OnceType* once, void (*initializer)()) {
+  std::call_once(once->flag_, initializer);
+}
+
+// Private structure, exposed only by pointer
+struct DIR {
+  HANDLE handle_;
+  bool firstread_;
+  RX_WIN32_FIND_DATA data_;
+  dirent entry_;
+
+  DIR() : handle_(INVALID_HANDLE_VALUE), firstread_(true) {}
+
+  DIR(const DIR&) = delete;
+  DIR& operator=(const DIR&) = delete;
+
+  ~DIR() {
+    if (INVALID_HANDLE_VALUE != handle_) {
+      ::FindClose(handle_);
+    }
+  }
+};
+
+DIR* opendir(const char* name) {
+  if (!name || *name == 0) {
+    errno = ENOENT;
+    return nullptr;
+  }
+
+  std::string pattern(name);
+  pattern.append("\\").append("*");
+
+  std::unique_ptr<DIR> dir(new DIR);
+
+  dir->handle_ =
+      RX_FindFirstFileEx(RX_FN(pattern).c_str(),
+                         FindExInfoBasic,  // Do not want alternative name
+                         &dir->data_, FindExSearchNameMatch,
+                         NULL,  // lpSearchFilter
+                         0);
+
+  if (dir->handle_ == INVALID_HANDLE_VALUE) {
+    return nullptr;
+  }
+
+  RX_FILESTRING x(dir->data_.cFileName, RX_FNLEN(dir->data_.cFileName));
+  strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), FN_TO_RX(x).c_str());
+
+  return dir.release();
+}
+
+struct dirent* readdir(DIR* dirp) {
+  if (!dirp || dirp->handle_ == INVALID_HANDLE_VALUE) {
+    errno = EBADF;
+    return nullptr;
+  }
+
+  if (dirp->firstread_) {
+    dirp->firstread_ = false;
+    return &dirp->entry_;
+  }
+
+  auto ret = RX_FindNextFile(dirp->handle_, &dirp->data_);
+
+  if (ret == 0) {
+    return nullptr;
+  }
+
+  RX_FILESTRING x(dirp->data_.cFileName, RX_FNLEN(dirp->data_.cFileName));
+  strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name),
+           FN_TO_RX(x).c_str());
+
+  return &dirp->entry_;
+}
+
+int closedir(DIR* dirp) {
+  delete dirp;
+  return 0;
+}
+
+int truncate(const char* path, int64_t length) {
+  if (path == nullptr) {
+    errno = EFAULT;
+    return -1;
+  }
+  return ROCKSDB_NAMESPACE::port::Truncate(path, length);
+}
+
+int Truncate(std::string path, int64_t len) {
+  if (len < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  HANDLE hFile =
+      RX_CreateFile(RX_FN(path).c_str(), GENERIC_READ | GENERIC_WRITE,
+                    FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                    NULL,           // Security attrs
+                    OPEN_EXISTING,  // Truncate existing file only
+                    FILE_ATTRIBUTE_NORMAL, NULL);
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    if (lastError == ERROR_FILE_NOT_FOUND) {
+      errno = ENOENT;
+    } else if (lastError == ERROR_ACCESS_DENIED) {
+      errno = EACCES;
+    } else {
+      errno = EIO;
+    }
+    return -1;
+  }
+
+  int result = 0;
+  FILE_END_OF_FILE_INFO end_of_file;
+  end_of_file.EndOfFile.QuadPart = len;
+
+  if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
+                                  sizeof(FILE_END_OF_FILE_INFO))) {
+    errno = EIO;
+    result = -1;
+  }
+
+  CloseHandle(hFile);
+  return result;
+}
+
+void Crash(const std::string& srcfile, int srcline) {
+  fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
+  fflush(stdout);
+  abort();
+}
+
+int GetMaxOpenFiles() { return -1; }
+
+// Assume 4KB page size
+const size_t kPageSize = 4U * 1024U;
+
+void SetCpuPriority(ThreadId id, CpuPriority priority) {
+  // Not supported
+  (void)id;
+  (void)priority;
+}
+
+int64_t GetProcessID() { return GetCurrentProcessId(); }
+
+bool GenerateRfcUuid(std::string* output) {
+  UUID uuid;
+  UuidCreateSequential(&uuid);
+
+  RPC_CSTR rpc_str;
+  auto status = UuidToStringA(&uuid, &rpc_str);
+  if (status != RPC_S_OK) {
+    return false;
+  }
+
+  // rpc_str is nul-terminated
+  *output = reinterpret_cast<char*>(rpc_str);
+
+  status = RpcStringFreeA(&rpc_str);
+  assert(status == RPC_S_OK);
+
+  return true;
+}
+
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/port_win.h b/src/rocksdb/port/win/port_win.h
new file mode 100644
index 000000000..989b5620b
--- /dev/null
+++ b/src/rocksdb/port/win/port_win.h
@@ -0,0 +1,378 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#pragma once
+
+// Always want minimum headers
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+
+#include <windows.h>
+#include <string>
+#include <thread>
+#include <string.h>
+#include <mutex>
+#include <limits>
+#include <condition_variable>
+#include <malloc.h>
+#include <intrin.h>
+#include <process.h>
+
+#include <stdint.h>
+
+#include "port/win/win_thread.h"
+
+#include "rocksdb/options.h"
+
+#undef min
+#undef max
+#undef DeleteFile
+#undef GetCurrentTime
+
+#ifndef strcasecmp
+#define strcasecmp _stricmp
+#endif
+
+#undef GetCurrentTime
+#undef DeleteFile
+
+#ifndef _SSIZE_T_DEFINED
+using ssize_t = SSIZE_T;
+#endif
+
+// size_t printf formatting named in the manner of C99 standard formatting
+// strings such as PRIu64
+// in fact, we could use that one
+#ifndef ROCKSDB_PRIszt
+#define ROCKSDB_PRIszt "Iu"
+#endif
+
+#ifdef _MSC_VER
+#define __attribute__(A)
+
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+#define PREFETCH(addr, rw, locality)
+
+extern const bool kDefaultToAdaptiveMutex;
+
+namespace port {
+
+// "Windows is designed to run on little-endian computer architectures."
+// https://docs.microsoft.com/en-us/windows/win32/sysinfo/registry-value-types
+constexpr bool kLittleEndian = true;
+#undef PLATFORM_IS_LITTLE_ENDIAN
+
+class CondVar;
+
+class Mutex {
+ public:
+  static const char* kName() { return "std::mutex"; }
+
+  explicit Mutex(bool IGNORED_adaptive = kDefaultToAdaptiveMutex)
+#ifndef NDEBUG
+      : locked_(false)
+#endif
+  {
+    (void)IGNORED_adaptive;
+  }
+
+  ~Mutex();
+
+  void Lock() {
+    mutex_.lock();
+#ifndef NDEBUG
+    locked_ = true;
+#endif
+  }
+
+  void Unlock() {
+#ifndef NDEBUG
+    locked_ = false;
+#endif
+    mutex_.unlock();
+  }
+
+  bool TryLock() {
+    bool ret = mutex_.try_lock();
+#ifndef NDEBUG
+    if (ret) {
+      locked_ = true;
+    }
+#endif
+    return ret;
+  }
+
+  // this will assert if the mutex is not locked
+  // it does NOT verify that mutex is held by a calling thread
+  void AssertHeld() {
+#ifndef NDEBUG
+    assert(locked_);
+#endif
+  }
+
+  // Also implement std Lockable
+  inline void lock() { Lock(); }
+  inline void unlock() { Unlock(); }
+  inline bool try_lock() { return TryLock(); }
+
+  // Mutex is move only with lock ownership transfer
+  Mutex(const Mutex&) = delete;
+  void operator=(const Mutex&) = delete;
+
+ private:
+  friend class CondVar;
+
+  std::mutex& getLock() { return mutex_; }
+
+  std::mutex mutex_;
+#ifndef NDEBUG
+  bool locked_;
+#endif
+};
+
+class RWMutex {
+ public:
+  RWMutex() { InitializeSRWLock(&srwLock_); }
+  // No copying allowed
+  RWMutex(const RWMutex&) = delete;
+  void operator=(const RWMutex&) = delete;
+
+  void ReadLock() { AcquireSRWLockShared(&srwLock_); }
+
+  void WriteLock() { AcquireSRWLockExclusive(&srwLock_); }
+
+  void ReadUnlock() { ReleaseSRWLockShared(&srwLock_); }
+
+  void WriteUnlock() { ReleaseSRWLockExclusive(&srwLock_); }
+
+  // Empty as in POSIX
+  void AssertHeld() {}
+
+ private:
+  SRWLOCK srwLock_;
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu) : mu_(mu) {}
+
+  ~CondVar();
+  void Wait();
+  bool TimedWait(uint64_t expiration_time);
+  void Signal();
+  void SignalAll();
+
+  // Condition var is not copy/move constructible
+  CondVar(const CondVar&) = delete;
+  CondVar& operator=(const CondVar&) = delete;
+
+  CondVar(CondVar&&) = delete;
+  CondVar& operator=(CondVar&&) = delete;
+
+ private:
+  std::condition_variable cv_;
+  Mutex* mu_;
+};
+
+#ifdef _POSIX_THREADS
+using Thread = std::thread;
+#else
+// Wrapper around the platform efficient
+// or otherwise preferrable implementation
+using Thread = WindowsThread;
+#endif
+
+// OnceInit type helps emulate
+// Posix semantics with initialization
+// adopted in the project
+struct OnceType {
+  struct Init {};
+
+  OnceType() {}
+  OnceType(const Init&) {}
+  OnceType(const OnceType&) = delete;
+  OnceType& operator=(const OnceType&) = delete;
+
+  std::once_flag flag_;
+};
+
+#define LEVELDB_ONCE_INIT port::OnceType::Init()
+extern void InitOnce(OnceType* once, void (*initializer)());
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 64U
+#endif
+
+#ifdef ROCKSDB_JEMALLOC
+// Separate inlines so they can be replaced if needed
+void* jemalloc_aligned_alloc(size_t size, size_t alignment) noexcept;
+void jemalloc_aligned_free(void* p) noexcept;
+#endif
+
+inline void* cacheline_aligned_alloc(size_t size) {
+#ifdef ROCKSDB_JEMALLOC
+  return jemalloc_aligned_alloc(size, CACHE_LINE_SIZE);
+#else
+  return _aligned_malloc(size, CACHE_LINE_SIZE);
+#endif
+}
+
+inline void cacheline_aligned_free(void* memblock) {
+#ifdef ROCKSDB_JEMALLOC
+  jemalloc_aligned_free(memblock);
+#else
+  _aligned_free(memblock);
+#endif
+}
+
+extern const size_t kPageSize;
+
+// Part of C++11
+#define ALIGN_AS(n) alignas(n)
+
+static inline void AsmVolatilePause() {
+#if defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM)
+  YieldProcessor();
+#endif
+  // it would be nice to get "wfe" on ARM here
+}
+
+extern int PhysicalCoreID();
+
+// For Thread Local Storage abstraction
+using pthread_key_t = DWORD;
+
+inline int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) {
+  // Not used
+  (void)destructor;
+
+  pthread_key_t k = TlsAlloc();
+  if (TLS_OUT_OF_INDEXES == k) {
+    return ENOMEM;
+  }
+
+  *key = k;
+  return 0;
+}
+
+inline int pthread_key_delete(pthread_key_t key) {
+  if (!TlsFree(key)) {
+    return EINVAL;
+  }
+  return 0;
+}
+
+inline int pthread_setspecific(pthread_key_t key, const void* value) {
+  if (!TlsSetValue(key, const_cast<void*>(value))) {
+    return ENOMEM;
+  }
+  return 0;
+}
+
+inline void* pthread_getspecific(pthread_key_t key) {
+  void* result = TlsGetValue(key);
+  if (!result) {
+    if (GetLastError() != ERROR_SUCCESS) {
+      errno = EINVAL;
+    } else {
+      errno = NOERROR;
+    }
+  }
+  return result;
+}
+
+// UNIX equiv although errno numbers will be off
+// using C-runtime to implement. Note, this does not
+// feel space with zeros in case the file is extended.
+int truncate(const char* path, int64_t length);
+int Truncate(std::string path, int64_t length);
+void Crash(const std::string& srcfile, int srcline);
+extern int GetMaxOpenFiles();
+std::string utf16_to_utf8(const std::wstring& utf16);
+std::wstring utf8_to_utf16(const std::string& utf8);
+
+using ThreadId = int;
+
+extern void SetCpuPriority(ThreadId id, CpuPriority priority);
+
+int64_t GetProcessID();
+
+// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns
+// true on success or false on failure.
+bool GenerateRfcUuid(std::string* output);
+
+}  // namespace port
+
+#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
+
+#define RX_FILESTRING std::wstring
+#define RX_FN(a) ROCKSDB_NAMESPACE::port::utf8_to_utf16(a)
+#define FN_TO_RX(a) ROCKSDB_NAMESPACE::port::utf16_to_utf8(a)
+#define RX_FNCMP(a, b) ::wcscmp(a, RX_FN(b).c_str())
+#define RX_FNLEN(a) ::wcslen(a)
+
+#define RX_DeleteFile DeleteFileW
+#define RX_CreateFile CreateFileW
+#define RX_CreateFileMapping CreateFileMappingW
+#define RX_GetFileAttributesEx GetFileAttributesExW
+#define RX_FindFirstFileEx FindFirstFileExW
+#define RX_FindNextFile FindNextFileW
+#define RX_WIN32_FIND_DATA WIN32_FIND_DATAW
+#define RX_CreateDirectory CreateDirectoryW
+#define RX_RemoveDirectory RemoveDirectoryW
+#define RX_GetFileAttributesEx GetFileAttributesExW
+#define RX_MoveFileEx MoveFileExW
+#define RX_CreateHardLink CreateHardLinkW
+#define RX_PathIsRelative PathIsRelativeW
+#define RX_GetCurrentDirectory GetCurrentDirectoryW
+#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExW
+#define RX_PathIsDirectory PathIsDirectoryW
+
+#else
+
+#define RX_FILESTRING std::string
+#define RX_FN(a) a
+#define FN_TO_RX(a) a
+#define RX_FNCMP(a, b) strcmp(a, b)
+#define RX_FNLEN(a) strlen(a)
+
+#define RX_DeleteFile DeleteFileA
+#define RX_CreateFile CreateFileA
+#define RX_CreateFileMapping CreateFileMappingA
+#define RX_GetFileAttributesEx GetFileAttributesExA
+#define RX_FindFirstFileEx FindFirstFileExA
+#define RX_CreateDirectory CreateDirectoryA
+#define RX_FindNextFile FindNextFileA
+#define RX_WIN32_FIND_DATA WIN32_FIND_DATAA
+#define RX_CreateDirectory CreateDirectoryA
+#define RX_RemoveDirectory RemoveDirectoryA
+#define RX_GetFileAttributesEx GetFileAttributesExA
+#define RX_MoveFileEx MoveFileExA
+#define RX_CreateHardLink CreateHardLinkA
+#define RX_PathIsRelative PathIsRelativeA
+#define RX_GetCurrentDirectory GetCurrentDirectoryA
+#define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExA
+#define RX_PathIsDirectory PathIsDirectoryA
+
+#endif
+
+using port::pthread_getspecific;
+using port::pthread_key_create;
+using port::pthread_key_delete;
+using port::pthread_key_t;
+using port::pthread_setspecific;
+using port::truncate;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/win_jemalloc.cc b/src/rocksdb/port/win/win_jemalloc.cc
new file mode 100644
index 000000000..cf38f55b7
--- /dev/null
+++ b/src/rocksdb/port/win/win_jemalloc.cc
@@ -0,0 +1,80 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#ifndef ROCKSDB_JEMALLOC
+#error This file can only be part of jemalloc aware build
+#endif
+
+#include <stdexcept>
+
+#include "jemalloc/jemalloc.h"
+#include "port/win/port_win.h"
+
+#if defined(ZSTD) && defined(ZSTD_STATIC_LINKING_ONLY)
+#include <zstd.h>
+#if (ZSTD_VERSION_NUMBER >= 500)
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+void* JemallocAllocateForZSTD(void* /* opaque */, size_t size) {
+  return je_malloc(size);
+}
+void JemallocDeallocateForZSTD(void* /* opaque */, void* address) {
+  je_free(address);
+}
+ZSTD_customMem GetJeZstdAllocationOverrides() {
+  return {JemallocAllocateForZSTD, JemallocDeallocateForZSTD, nullptr};
+}
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // (ZSTD_VERSION_NUMBER >= 500)
+#endif  // defined(ZSTD) defined(ZSTD_STATIC_LINKING_ONLY)
+
+// Global operators to be replaced by a linker when this file is
+// a part of the build
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+void* jemalloc_aligned_alloc(size_t size, size_t alignment) noexcept {
+  return je_aligned_alloc(alignment, size);
+}
+void jemalloc_aligned_free(void* p) noexcept { je_free(p); }
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+void* operator new(size_t size) {
+  void* p = je_malloc(size);
+  if (!p) {
+    throw std::bad_alloc();
+  }
+  return p;
+}
+
+void* operator new[](size_t size) {
+  void* p = je_malloc(size);
+  if (!p) {
+    throw std::bad_alloc();
+  }
+  return p;
+}
+
+void operator delete(void* p) {
+  if (p) {
+    je_free(p);
+  }
+}
+
+void operator delete[](void* p) {
+  if (p) {
+    je_free(p);
+  }
+}
+
+#endif
diff --git a/src/rocksdb/port/win/win_logger.cc b/src/rocksdb/port/win/win_logger.cc
new file mode 100644
index 000000000..072ea419a
--- /dev/null
+++ b/src/rocksdb/port/win/win_logger.cc
@@ -0,0 +1,192 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#if defined(OS_WIN)
+
+#include "port/win/win_logger.h"
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <algorithm>
+#include <atomic>
+
+#include "monitoring/iostats_context_imp.h"
+#include "port/sys_time.h"
+#include "port/win/env_win.h"
+#include "port/win/io_win.h"
+#include "rocksdb/env.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace port {
+
+WinLogger::WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file,
+                     const InfoLogLevel log_level)
+    : Logger(log_level),
+      file_(file),
+      gettid_(gettid),
+      log_size_(0),
+      last_flush_micros_(0),
+      clock_(clock),
+      flush_pending_(false) {
+  assert(file_ != NULL);
+  assert(file_ != INVALID_HANDLE_VALUE);
+}
+
+void WinLogger::DebugWriter(const char* str, int len) {
+  assert(file_ != INVALID_HANDLE_VALUE);
+  DWORD bytesWritten = 0;
+  BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL);
+  if (ret == FALSE) {
+    std::string errSz = GetWindowsErrSz(GetLastError());
+    fprintf(stderr, "%s", errSz.c_str());
+  }
+}
+
+WinLogger::~WinLogger() { CloseInternal().PermitUncheckedError(); }
+
+Status WinLogger::CloseImpl() { return CloseInternal(); }
+
+Status WinLogger::CloseInternal() {
+  Status s;
+  if (INVALID_HANDLE_VALUE != file_) {
+    BOOL ret = FlushFileBuffers(file_);
+    if (ret == 0) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError);
+    }
+    ret = CloseHandle(file_);
+    // On error the return value is zero
+    if (ret == 0 && s.ok()) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError);
+    }
+    file_ = INVALID_HANDLE_VALUE;
+    closed_ = true;
+  }
+  return s;
+}
+
+void WinLogger::Flush() {
+  assert(file_ != INVALID_HANDLE_VALUE);
+  if (flush_pending_) {
+    flush_pending_ = false;
+    // With Windows API writes go to OS buffers directly so no fflush needed
+    // unlike with C runtime API. We don't flush all the way to disk
+    // for perf reasons.
+  }
+
+  last_flush_micros_ = clock_->NowMicros();
+}
+
+void WinLogger::Logv(const char* format, va_list ap) {
+  IOSTATS_TIMER_GUARD(logger_nanos);
+  assert(file_ != INVALID_HANDLE_VALUE);
+
+  const uint64_t thread_id = (*gettid_)();
+
+  // We try twice: the first time with a fixed-size stack allocated buffer,
+  // and the second time with a much larger dynamically allocated buffer.
+  char buffer[500];
+  std::unique_ptr<char[]> largeBuffer;
+  for (int iter = 0; iter < 2; ++iter) {
+    char* base;
+    int bufsize;
+    if (iter == 0) {
+      bufsize = sizeof(buffer);
+      base = buffer;
+    } else {
+      bufsize = 30000;
+      largeBuffer.reset(new char[bufsize]);
+      base = largeBuffer.get();
+    }
+
+    char* p = base;
+    char* limit = base + bufsize;
+
+    port::TimeVal now_tv;
+    port::GetTimeOfDay(&now_tv, nullptr);
+    const time_t seconds = now_tv.tv_sec;
+    struct tm t;
+    localtime_s(&t, &seconds);
+    p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                  t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
+                  t.tm_min, t.tm_sec, static_cast<int>(now_tv.tv_usec),
+                  static_cast<long long unsigned int>(thread_id));
+
+    // Print the message
+    if (p < limit) {
+      va_list backup_ap;
+      va_copy(backup_ap, ap);
+      int done = vsnprintf(p, limit - p, format, backup_ap);
+      if (done > 0) {
+        p += done;
+      } else {
+        continue;
+      }
+      va_end(backup_ap);
+    }
+
+    // Truncate to available space if necessary
+    if (p >= limit) {
+      if (iter == 0) {
+        continue;  // Try again with larger buffer
+      } else {
+        p = limit - 1;
+      }
+    }
+
+    // Add newline if necessary
+    if (p == base || p[-1] != '\n') {
+      *p++ = '\n';
+    }
+
+    assert(p <= limit);
+    const size_t write_size = p - base;
+
+    DWORD bytesWritten = 0;
+    BOOL ret = WriteFile(file_, base, static_cast<DWORD>(write_size),
+                         &bytesWritten, NULL);
+    if (ret == FALSE) {
+      std::string errSz = GetWindowsErrSz(GetLastError());
+      fprintf(stderr, "%s", errSz.c_str());
+    }
+
+    flush_pending_ = true;
+    assert((bytesWritten == write_size) || (ret == FALSE));
+    if (bytesWritten > 0) {
+      log_size_ += write_size;
+    }
+
+    uint64_t now_micros =
+        static_cast<uint64_t>(now_tv.tv_sec) * 1000000 + now_tv.tv_usec;
+    if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+      flush_pending_ = false;
+      // With Windows API writes go to OS buffers directly so no fflush needed
+      // unlike with C runtime API. We don't flush all the way to disk
+      // for perf reasons.
+      last_flush_micros_ = now_micros;
+    }
+    break;
+  }
+}
+
+size_t WinLogger::GetLogFileSize() const { return log_size_; }
+
+}  // namespace port
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/port/win/win_logger.h b/src/rocksdb/port/win/win_logger.h
new file mode 100644
index 000000000..1ca4610e9
--- /dev/null
+++ b/src/rocksdb/port/win/win_logger.h
@@ -0,0 +1,64 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#pragma once
+
+#include <stdint.h>
+#include <windows.h>
+
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+
+namespace port {
+class WinLogger : public ROCKSDB_NAMESPACE::Logger {
+ public:
+  WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file,
+            const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL);
+
+  virtual ~WinLogger();
+
+  WinLogger(const WinLogger&) = delete;
+
+  WinLogger& operator=(const WinLogger&) = delete;
+
+  void Flush() override;
+
+  using ROCKSDB_NAMESPACE::Logger::Logv;
+  void Logv(const char* format, va_list ap) override;
+
+  size_t GetLogFileSize() const override;
+
+  void DebugWriter(const char* str, int len);
+
+ protected:
+  Status CloseImpl() override;
+
+ private:
+  HANDLE file_;
+  uint64_t (*gettid_)();  // Return the thread id for the current thread
+  std::atomic_size_t log_size_;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  SystemClock* clock_;
+  bool flush_pending_;
+
+  Status CloseInternal();
+
+  const static uint64_t flush_every_seconds_ = 5;
+};
+}  // namespace port
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/win/win_thread.cc b/src/rocksdb/port/win/win_thread.cc
new file mode 100644
index 000000000..3c82e736e
--- /dev/null
+++ b/src/rocksdb/port/win/win_thread.cc
@@ -0,0 +1,170 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+// Most Mingw builds support std::thread only when using posix threads.
+// In that case, some of these functions will be unavailable.
+// Note that we're using either WindowsThread or std::thread, depending on
+// which one is available.
+#ifndef _POSIX_THREADS
+
+#include "port/win/win_thread.h"
+
+#include <assert.h>
+#include <process.h>  // __beginthreadex
+#include <windows.h>
+
+#include <stdexcept>
+#include <system_error>
+#include <thread>
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+struct WindowsThread::Data {
+  std::function<void()> func_;
+  uintptr_t handle_;
+
+  Data(std::function<void()>&& func) : func_(std::move(func)), handle_(0) {}
+
+  Data(const Data&) = delete;
+  Data& operator=(const Data&) = delete;
+
+  static unsigned int __stdcall ThreadProc(void* arg);
+};
+
+void WindowsThread::Init(std::function<void()>&& func) {
+  data_ = std::make_shared<Data>(std::move(func));
+  // We create another instance of std::shared_ptr to get an additional ref
+  // since we may detach and destroy this instance before the threadproc
+  // may start to run. We choose to allocate this additional ref on the heap
+  // so we do not need to synchronize and allow this thread to proceed
+  std::unique_ptr<std::shared_ptr<Data>> th_data(
+      new std::shared_ptr<Data>(data_));
+
+  data_->handle_ = _beginthreadex(NULL,
+                                  0,  // stack size
+                                  &Data::ThreadProc, th_data.get(),
+                                  0,  // init flag
+                                  &th_id_);
+
+  if (data_->handle_ == 0) {
+    throw std::system_error(
+        std::make_error_code(std::errc::resource_unavailable_try_again),
+        "Unable to create a thread");
+  }
+  th_data.release();
+}
+
+WindowsThread::WindowsThread() : data_(nullptr), th_id_(0) {}
+
+WindowsThread::~WindowsThread() {
+  // Must be joined or detached
+  // before destruction.
+  // This is the same as std::thread
+  if (data_) {
+    if (joinable()) {
+      assert(false);
+      std::terminate();
+    }
+    data_.reset();
+  }
+}
+
+WindowsThread::WindowsThread(WindowsThread&& o) noexcept : WindowsThread() {
+  *this = std::move(o);
+}
+
+WindowsThread& WindowsThread::operator=(WindowsThread&& o) noexcept {
+  if (joinable()) {
+    assert(false);
+    std::terminate();
+  }
+
+  data_ = std::move(o.data_);
+
+  // Per spec both instances will have the same id
+  th_id_ = o.th_id_;
+
+  return *this;
+}
+
+bool WindowsThread::joinable() const { return (data_ && data_->handle_ != 0); }
+
+WindowsThread::native_handle_type WindowsThread::native_handle() const {
+  return reinterpret_cast<native_handle_type>(data_->handle_);
+}
+
+unsigned WindowsThread::hardware_concurrency() {
+  return std::thread::hardware_concurrency();
+}
+
+void WindowsThread::join() {
+  if (!joinable()) {
+    assert(false);
+    throw std::system_error(std::make_error_code(std::errc::invalid_argument),
+                            "Thread is no longer joinable");
+  }
+
+  if (GetThreadId(GetCurrentThread()) == th_id_) {
+    assert(false);
+    throw std::system_error(
+        std::make_error_code(std::errc::resource_deadlock_would_occur),
+        "Can not join itself");
+  }
+
+  auto ret =
+      WaitForSingleObject(reinterpret_cast<HANDLE>(data_->handle_), INFINITE);
+  if (ret != WAIT_OBJECT_0) {
+    auto lastError = GetLastError();
+    assert(false);
+    throw std::system_error(static_cast<int>(lastError), std::system_category(),
+                            "WaitForSingleObjectFailed: thread join");
+  }
+
+  BOOL rc
+#if defined(_MSC_VER)
+      = FALSE;
+#else
+      __attribute__((__unused__));
+#endif
+  rc = CloseHandle(reinterpret_cast<HANDLE>(data_->handle_));
+  assert(rc != 0);
+  data_->handle_ = 0;
+}
+
+bool WindowsThread::detach() {
+  if (!joinable()) {
+    assert(false);
+    throw std::system_error(std::make_error_code(std::errc::invalid_argument),
+                            "Thread is no longer available");
+  }
+
+  BOOL ret = CloseHandle(reinterpret_cast<HANDLE>(data_->handle_));
+  data_->handle_ = 0;
+
+  return (ret != 0);
+}
+
+void WindowsThread::swap(WindowsThread& o) {
+  data_.swap(o.data_);
+  std::swap(th_id_, o.th_id_);
+}
+
+unsigned int __stdcall WindowsThread::Data::ThreadProc(void* arg) {
+  auto ptr = reinterpret_cast<std::shared_ptr<Data>*>(arg);
+  std::unique_ptr<std::shared_ptr<Data>> data(ptr);
+  (*data)->func_();
+  return 0;
+}
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !_POSIX_THREADS
+#endif  // OS_WIN
diff --git a/src/rocksdb/port/win/win_thread.h b/src/rocksdb/port/win/win_thread.h
new file mode 100644
index 000000000..916033b77
--- /dev/null
+++ b/src/rocksdb/port/win/win_thread.h
@@ -0,0 +1,117 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef _POSIX_THREADS
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+
+// This class is a replacement for std::thread
+// 2 reasons we do not like std::thread:
+//  -- is that it dynamically allocates its internals that are automatically
+//     freed when  the thread terminates and not on the destruction of the
+//     object. This makes it difficult to control the source of memory
+//     allocation
+//  -  This implements Pimpl so we can easily replace the guts of the
+//      object in our private version if necessary.
+class WindowsThread {
+  struct Data;
+
+  std::shared_ptr<Data> data_;
+  unsigned int th_id_;
+
+  void Init(std::function<void()>&&);
+
+ public:
+  using native_handle_type = void*;
+
+  // Construct with no thread
+  WindowsThread();
+
+  // Template constructor
+  //
+  // This templated constructor accomplishes several things
+  //
+  // - Allows the class as whole to be not a template
+  //
+  // - take "universal" references to support both _lvalues and _rvalues
+  //
+  // -  because this constructor is a catchall case in many respects it
+  //    may prevent us from using both the default __ctor, the move __ctor.
+  //    Also it may circumvent copy __ctor deletion. To work around this
+  //    we make sure this one has at least one argument and eliminate
+  //    it from the overload  selection when WindowsThread is the first
+  //    argument.
+  //
+  // - construct with Fx(Ax...) with a variable number of types/arguments.
+  //
+  // - Gathers together the callable object with its arguments and constructs
+  //   a single callable entity
+  //
+  // - Makes use of std::function to convert it to a specification-template
+  //   dependent type that both checks the signature conformance to ensure
+  //   that all of the necessary arguments are provided and allows pimpl
+  //   implementation.
+  template <class Fn, class... Args,
+            class = typename std::enable_if<!std::is_same<
+                typename std::decay<Fn>::type, WindowsThread>::value>::type>
+  explicit WindowsThread(Fn&& fx, Args&&... ax) : WindowsThread() {
+    // Use binder to create a single callable entity
+    auto binder = std::bind(std::forward<Fn>(fx), std::forward<Args>(ax)...);
+    // Use std::function to take advantage of the type erasure
+    // so we can still hide implementation within pimpl
+    // This also makes sure that the binder signature is compliant
+    std::function<void()> target = binder;
+
+    Init(std::move(target));
+  }
+
+  ~WindowsThread();
+
+  WindowsThread(const WindowsThread&) = delete;
+
+  WindowsThread& operator=(const WindowsThread&) = delete;
+
+  WindowsThread(WindowsThread&&) noexcept;
+
+  WindowsThread& operator=(WindowsThread&&) noexcept;
+
+  bool joinable() const;
+
+  unsigned int get_id() const { return th_id_; }
+
+  native_handle_type native_handle() const;
+
+  static unsigned hardware_concurrency();
+
+  void join();
+
+  bool detach();
+
+  void swap(WindowsThread&);
+};
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+namespace std {
+inline void swap(ROCKSDB_NAMESPACE::port::WindowsThread& th1,
+                 ROCKSDB_NAMESPACE::port::WindowsThread& th2) {
+  th1.swap(th2);
+}
+}  // namespace std
+
+#endif  // !_POSIX_THREADS
diff --git a/src/rocksdb/port/win/xpress_win.cc b/src/rocksdb/port/win/xpress_win.cc
new file mode 100644
index 000000000..21904d502
--- /dev/null
+++ b/src/rocksdb/port/win/xpress_win.cc
@@ -0,0 +1,210 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(OS_WIN)
+
+#include "port/win/xpress_win.h"
+
+#include <windows.h>
+
+#include <cassert>
+#include <iostream>
+#include <limits>
+#include <memory>
+
+#ifdef XPRESS
+
+// Put this under ifdef so windows systems w/o this
+// can still build
+#include <compressapi.h>
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+namespace xpress {
+
+// Helpers
+namespace {
+
+auto CloseCompressorFun = [](void* h) {
+  if (NULL != h) {
+    ::CloseCompressor(reinterpret_cast<COMPRESSOR_HANDLE>(h));
+  }
+};
+
+auto CloseDecompressorFun = [](void* h) {
+  if (NULL != h) {
+    ::CloseDecompressor(reinterpret_cast<DECOMPRESSOR_HANDLE>(h));
+  }
+};
+}  // namespace
+
+bool Compress(const char* input, size_t length, std::string* output) {
+  assert(input != nullptr);
+  assert(output != nullptr);
+
+  if (length == 0) {
+    output->clear();
+    return true;
+  }
+
+  COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
+  COMPRESSOR_HANDLE compressor = NULL;
+
+  BOOL success =
+      CreateCompressor(COMPRESS_ALGORITHM_XPRESS,  //  Compression Algorithm
+                       allocRoutinesPtr,  //  Optional allocation routine
+                       &compressor);      //  Handle
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to create Compressor LastError: "
+              << GetLastError() << std::endl;
+#endif
+    return false;
+  }
+
+  std::unique_ptr<void, decltype(CloseCompressorFun)> compressorGuard(
+      compressor, CloseCompressorFun);
+
+  SIZE_T compressedBufferSize = 0;
+
+  //  Query compressed buffer size.
+  success = ::Compress(compressor,                //  Compressor Handle
+                       const_cast<char*>(input),  //  Input buffer
+                       length,                    //  Uncompressed data size
+                       NULL,                      //  Compressed Buffer
+                       0,                         //  Compressed Buffer size
+                       &compressedBufferSize);    //  Compressed Data size
+
+  if (!success) {
+    auto lastError = GetLastError();
+
+    if (lastError != ERROR_INSUFFICIENT_BUFFER) {
+#ifdef _DEBUG
+      std::cerr
+          << "XPRESS: Failed to estimate compressed buffer size LastError "
+          << lastError << std::endl;
+#endif
+      return false;
+    }
+  }
+
+  assert(compressedBufferSize > 0);
+
+  std::string result;
+  result.resize(compressedBufferSize);
+
+  SIZE_T compressedDataSize = 0;
+
+  //  Compress
+  success = ::Compress(compressor,                //  Compressor Handle
+                       const_cast<char*>(input),  //  Input buffer
+                       length,                    //  Uncompressed data size
+                       &result[0],                //  Compressed Buffer
+                       compressedBufferSize,      //  Compressed Buffer size
+                       &compressedDataSize);      //  Compressed Data size
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to compress LastError " << GetLastError()
+              << std::endl;
+#endif
+    return false;
+  }
+
+  result.resize(compressedDataSize);
+  output->swap(result);
+
+  return true;
+}
+
+char* Decompress(const char* input_data, size_t input_length,
+                 size_t* uncompressed_size) {
+  assert(input_data != nullptr);
+  assert(uncompressed_size != nullptr);
+
+  if (input_length == 0) {
+    return nullptr;
+  }
+
+  COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
+  DECOMPRESSOR_HANDLE decompressor = NULL;
+
+  BOOL success =
+      CreateDecompressor(COMPRESS_ALGORITHM_XPRESS,  //  Compression Algorithm
+                         allocRoutinesPtr,  //  Optional allocation routine
+                         &decompressor);    //  Handle
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to create Decompressor LastError "
+              << GetLastError() << std::endl;
+#endif
+    return nullptr;
+  }
+
+  std::unique_ptr<void, decltype(CloseDecompressorFun)> compressorGuard(
+      decompressor, CloseDecompressorFun);
+
+  SIZE_T decompressedBufferSize = 0;
+
+  success = ::Decompress(decompressor,                   //  Compressor Handle
+                         const_cast<char*>(input_data),  //  Compressed data
+                         input_length,              //  Compressed data size
+                         NULL,                      //  Buffer set to NULL
+                         0,                         //  Buffer size set to 0
+                         &decompressedBufferSize);  //  Decompressed Data size
+
+  if (!success) {
+    auto lastError = GetLastError();
+
+    if (lastError != ERROR_INSUFFICIENT_BUFFER) {
+#ifdef _DEBUG
+      std::cerr
+          << "XPRESS: Failed to estimate decompressed buffer size LastError "
+          << lastError << std::endl;
+#endif
+      return nullptr;
+    }
+  }
+
+  assert(decompressedBufferSize > 0);
+
+  // The callers are deallocating using delete[]
+  // thus we must allocate with new[]
+  std::unique_ptr<char[]> outputBuffer(new char[decompressedBufferSize]);
+
+  SIZE_T decompressedDataSize = 0;
+
+  success = ::Decompress(decompressor, const_cast<char*>(input_data),
+                         input_length, outputBuffer.get(),
+                         decompressedBufferSize, &decompressedDataSize);
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to decompress LastError " << GetLastError()
+              << std::endl;
+#endif
+    return nullptr;
+  }
+
+  *uncompressed_size = decompressedDataSize;
+
+  // Return the raw buffer to the caller supporting the tradition
+  return outputBuffer.release();
+}
+}  // namespace xpress
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
+
+#endif
diff --git a/src/rocksdb/port/win/xpress_win.h b/src/rocksdb/port/win/xpress_win.h
new file mode 100644
index 000000000..187adffa6
--- /dev/null
+++ b/src/rocksdb/port/win/xpress_win.h
@@ -0,0 +1,26 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace port {
+namespace xpress {
+
+bool Compress(const char* input, size_t length, std::string* output);
+
+char* Decompress(const char* input_data, size_t input_length,
+                 size_t* uncompressed_size);
+}  // namespace xpress
+}  // namespace port
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/port/xpress.h b/src/rocksdb/port/xpress.h
new file mode 100644
index 000000000..457025f66
--- /dev/null
+++ b/src/rocksdb/port/xpress.h
@@ -0,0 +1,17 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+// Xpress on Windows is implemeted using Win API
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#error "Xpress compression not implemented"
+#elif defined(OS_WIN)
+#include "port/win/xpress_win.h"
+#endif
diff --git a/src/rocksdb/rocksdb.pc.in b/src/rocksdb/rocksdb.pc.in
new file mode 100644
index 000000000..5217a4518
--- /dev/null
+++ b/src/rocksdb/rocksdb.pc.in
@@ -0,0 +1,10 @@
+prefix="@CMAKE_INSTALL_PREFIX@"
+includedir="${prefix}/@CMAKE_INSTALL_INCLUDEDIR@"
+libdir="${prefix}/@CMAKE_INSTALL_LIBDIR@"
+
+Name: @PROJECT_NAME@
+Description: @PROJECT_DESCRIPTION@
+URL: @PROJECT_HOMEPAGE_URL@
+Version: @PROJECT_VERSION@
+Cflags: -I"${includedir}"
+Libs: -L"${libdir}" -lrocksdb
diff --git a/src/rocksdb/src.mk b/src/rocksdb/src.mk
new file mode 100644
index 000000000..f955efc60
--- /dev/null
+++ b/src/rocksdb/src.mk
@@ -0,0 +1,703 @@
+# These are the sources from which librocksdb.a is built:
+LIB_SOURCES =                                                   \
+  cache/cache.cc                                                \
+  cache/cache_entry_roles.cc                                    \
+  cache/cache_key.cc                                            \
+  cache/cache_reservation_manager.cc                            \
+  cache/charged_cache.cc                                        \
+  cache/clock_cache.cc                                          \
+  cache/lru_cache.cc                                            \
+  cache/compressed_secondary_cache.cc                           \
+  cache/secondary_cache.cc                                      \
+  cache/sharded_cache.cc                                        \
+  db/arena_wrapped_db_iter.cc                                   \
+  db/blob/blob_contents.cc                                      \
+  db/blob/blob_fetcher.cc                                       \
+  db/blob/blob_file_addition.cc                                 \
+  db/blob/blob_file_builder.cc                                  \
+  db/blob/blob_file_cache.cc                                    \
+  db/blob/blob_file_garbage.cc                                  \
+  db/blob/blob_file_meta.cc                                     \
+  db/blob/blob_file_reader.cc                                   \
+  db/blob/blob_garbage_meter.cc                                 \
+  db/blob/blob_log_format.cc                                    \
+  db/blob/blob_log_sequential_reader.cc                         \
+  db/blob/blob_log_writer.cc                                    \
+  db/blob/blob_source.cc                                        \
+  db/blob/prefetch_buffer_collection.cc                         \
+  db/builder.cc                                                 \
+  db/c.cc                                                       \
+  db/column_family.cc                                           \
+  db/compaction/compaction.cc                                   \
+  db/compaction/compaction_iterator.cc                          \
+  db/compaction/compaction_job.cc                               \
+  db/compaction/compaction_picker.cc                            \
+  db/compaction/compaction_picker_fifo.cc                       \
+  db/compaction/compaction_picker_level.cc                      \
+  db/compaction/compaction_picker_universal.cc                  \
+  db/compaction/compaction_service_job.cc                       \
+  db/compaction/compaction_state.cc                             \
+  db/compaction/compaction_outputs.cc                           \
+  db/compaction/sst_partitioner.cc                              \
+  db/compaction/subcompaction_state.cc                          \
+  db/convenience.cc                                             \
+  db/db_filesnapshot.cc                                         \
+  db/db_impl/compacted_db_impl.cc                               \
+  db/db_impl/db_impl.cc                                         \
+  db/db_impl/db_impl_compaction_flush.cc                        \
+  db/db_impl/db_impl_debug.cc                                   \
+  db/db_impl/db_impl_experimental.cc                            \
+  db/db_impl/db_impl_files.cc                                   \
+  db/db_impl/db_impl_open.cc                                    \
+  db/db_impl/db_impl_readonly.cc                                \
+  db/db_impl/db_impl_secondary.cc                               \
+  db/db_impl/db_impl_write.cc                                   \
+  db/db_info_dumper.cc                                          \
+  db/db_iter.cc                                                 \
+  db/dbformat.cc                                                \
+  db/error_handler.cc                                           \
+  db/event_helpers.cc                                           \
+  db/experimental.cc                                            \
+  db/external_sst_file_ingestion_job.cc                         \
+  db/file_indexer.cc                                            \
+  db/flush_job.cc                                               \
+  db/flush_scheduler.cc                                         \
+  db/forward_iterator.cc                                        \
+  db/import_column_family_job.cc                                \
+  db/internal_stats.cc                                          \
+  db/logs_with_prep_tracker.cc                                  \
+  db/log_reader.cc                                              \
+  db/log_writer.cc                                              \
+  db/malloc_stats.cc                                            \
+  db/memtable.cc                                                \
+  db/memtable_list.cc                                           \
+  db/merge_helper.cc                                            \
+  db/merge_operator.cc                                          \
+  db/output_validator.cc                                        \
+  db/periodic_task_scheduler.cc                                 \
+  db/range_del_aggregator.cc                                    \
+  db/range_tombstone_fragmenter.cc                              \
+  db/repair.cc                                                  \
+  db/seqno_to_time_mapping.cc                                   \
+  db/snapshot_impl.cc                                           \
+  db/table_cache.cc                                             \
+  db/table_properties_collector.cc                              \
+  db/transaction_log_impl.cc                                    \
+  db/trim_history_scheduler.cc                                  \
+  db/version_builder.cc                                         \
+  db/version_edit.cc                                            \
+  db/version_edit_handler.cc                                    \
+  db/version_set.cc                                             \
+  db/wal_edit.cc                                                \
+  db/wal_manager.cc                                             \
+  db/wide/wide_column_serialization.cc                          \
+  db/wide/wide_columns.cc                                       \
+  db/write_batch.cc                                             \
+  db/write_batch_base.cc                                        \
+  db/write_controller.cc                                        \
+  db/write_thread.cc                                            \
+  env/composite_env.cc                                          \
+  env/env.cc                                                    \
+  env/env_chroot.cc                                             \
+  env/env_encryption.cc                                         \
+  env/env_posix.cc                                              \
+  env/file_system.cc                                            \
+  env/fs_posix.cc                                               \
+  env/fs_remap.cc                                               \
+  env/file_system_tracer.cc                                     \
+  env/io_posix.cc                                               \
+  env/mock_env.cc                                               \
+  env/unique_id_gen.cc                                          \
+  file/delete_scheduler.cc                                      \
+  file/file_prefetch_buffer.cc                                  \
+  file/file_util.cc                                             \
+  file/filename.cc                                              \
+  file/line_file_reader.cc                                      \
+  file/random_access_file_reader.cc                             \
+  file/read_write_util.cc                                       \
+  file/readahead_raf.cc                                         \
+  file/sequence_file_reader.cc                                  \
+  file/sst_file_manager_impl.cc                                 \
+  file/writable_file_writer.cc                                  \
+  logging/auto_roll_logger.cc                                   \
+  logging/event_logger.cc                                       \
+  logging/log_buffer.cc                                         \
+  memory/arena.cc                                               \
+  memory/concurrent_arena.cc                                    \
+  memory/jemalloc_nodump_allocator.cc                           \
+  memory/memkind_kmem_allocator.cc                              \
+  memory/memory_allocator.cc                                    \
+  memtable/alloc_tracker.cc                                     \
+  memtable/hash_linklist_rep.cc                                 \
+  memtable/hash_skiplist_rep.cc                                 \
+  memtable/skiplistrep.cc                                       \
+  memtable/vectorrep.cc                                         \
+  memtable/write_buffer_manager.cc                              \
+  monitoring/histogram.cc                                       \
+  monitoring/histogram_windowing.cc                             \
+  monitoring/in_memory_stats_history.cc                         \
+  monitoring/instrumented_mutex.cc                              \
+  monitoring/iostats_context.cc                                 \
+  monitoring/perf_context.cc                                    \
+  monitoring/perf_level.cc                                      \
+  monitoring/persistent_stats_history.cc                        \
+  monitoring/statistics.cc                                      \
+  monitoring/thread_status_impl.cc                              \
+  monitoring/thread_status_updater.cc                           \
+  monitoring/thread_status_updater_debug.cc                     \
+  monitoring/thread_status_util.cc                              \
+  monitoring/thread_status_util_debug.cc                        \
+  options/cf_options.cc                                         \
+  options/configurable.cc                                       \
+  options/customizable.cc                                       \
+  options/db_options.cc                                         \
+  options/options.cc                                            \
+  options/options_helper.cc                                     \
+  options/options_parser.cc                                     \
+  port/port_posix.cc                                            \
+  port/win/env_default.cc                                       \
+  port/win/env_win.cc                                           \
+  port/win/io_win.cc                                            \
+  port/win/port_win.cc                                          \
+  port/win/win_logger.cc                                        \
+  port/win/win_thread.cc                                        \
+  port/stack_trace.cc                                           \
+  table/adaptive/adaptive_table_factory.cc                      \
+  table/block_based/binary_search_index_reader.cc               \
+  table/block_based/block.cc                                    \
+  table/block_based/block_based_table_builder.cc                \
+  table/block_based/block_based_table_factory.cc                \
+  table/block_based/block_based_table_iterator.cc               \
+  table/block_based/block_based_table_reader.cc                 \
+  table/block_based/block_builder.cc                            \
+  table/block_based/block_prefetcher.cc                         \
+  table/block_based/block_prefix_index.cc                       \
+  table/block_based/data_block_hash_index.cc                    \
+  table/block_based/data_block_footer.cc                        \
+  table/block_based/filter_block_reader_common.cc               \
+  table/block_based/filter_policy.cc                            \
+  table/block_based/flush_block_policy.cc                       \
+  table/block_based/full_filter_block.cc                        \
+  table/block_based/hash_index_reader.cc                        \
+  table/block_based/index_builder.cc                            \
+  table/block_based/index_reader_common.cc                      \
+  table/block_based/parsed_full_filter_block.cc                 \
+  table/block_based/partitioned_filter_block.cc                 \
+  table/block_based/partitioned_index_iterator.cc               \
+  table/block_based/partitioned_index_reader.cc                 \
+  table/block_based/reader_common.cc                            \
+  table/block_based/uncompression_dict_reader.cc                \
+  table/block_fetcher.cc                                        \
+  table/cuckoo/cuckoo_table_builder.cc                          \
+  table/cuckoo/cuckoo_table_factory.cc                          \
+  table/cuckoo/cuckoo_table_reader.cc                           \
+  table/format.cc                                               \
+  table/get_context.cc                                          \
+  table/iterator.cc                                             \
+  table/merging_iterator.cc                                     \
+  table/meta_blocks.cc                                          \
+  table/persistent_cache_helper.cc                              \
+  table/plain/plain_table_bloom.cc                              \
+  table/plain/plain_table_builder.cc                            \
+  table/plain/plain_table_factory.cc                            \
+  table/plain/plain_table_index.cc                              \
+  table/plain/plain_table_key_coding.cc                         \
+  table/plain/plain_table_reader.cc                             \
+  table/sst_file_dumper.cc                                      \
+  table/sst_file_reader.cc                                      \
+  table/sst_file_writer.cc                                      \
+  table/table_factory.cc                                        \
+  table/table_properties.cc                                     \
+  table/two_level_iterator.cc                                   \
+  table/unique_id.cc                                            \
+  test_util/sync_point.cc                                       \
+  test_util/sync_point_impl.cc                                  \
+  test_util/transaction_test_util.cc                            \
+  tools/dump/db_dump_tool.cc                                    \
+  trace_replay/trace_record_handler.cc                          \
+  trace_replay/trace_record_result.cc                           \
+  trace_replay/trace_record.cc                                  \
+  trace_replay/trace_replay.cc                                  \
+  trace_replay/block_cache_tracer.cc                            \
+  trace_replay/io_tracer.cc                                     \
+  util/async_file_reader.cc					\
+  util/build_version.cc                                         \
+  util/cleanable.cc                                             \
+  util/coding.cc                                                \
+  util/compaction_job_stats_impl.cc                             \
+  util/comparator.cc                                            \
+  util/compression.cc                                           \
+  util/compression_context_cache.cc                             \
+  util/concurrent_task_limiter_impl.cc                          \
+  util/crc32c.cc                                                \
+  util/crc32c_arm64.cc                                          \
+  util/dynamic_bloom.cc                                         \
+  util/hash.cc                                                  \
+  util/murmurhash.cc                                            \
+  util/random.cc                                                \
+  util/rate_limiter.cc                                          \
+  util/ribbon_config.cc                                         \
+  util/slice.cc                                                 \
+  util/file_checksum_helper.cc                                  \
+  util/status.cc                                                \
+  util/stderr_logger.cc                                         \
+  util/string_util.cc                                           \
+  util/thread_local.cc                                          \
+  util/threadpool_imp.cc                                        \
+  util/xxhash.cc                                                \
+  utilities/agg_merge/agg_merge.cc                              \
+  utilities/backup/backup_engine.cc                             \
+  utilities/blob_db/blob_compaction_filter.cc                   \
+  utilities/blob_db/blob_db.cc                                  \
+  utilities/blob_db/blob_db_impl.cc                             \
+  utilities/blob_db/blob_db_impl_filesnapshot.cc                \
+  utilities/blob_db/blob_file.cc                                \
+  utilities/cache_dump_load.cc                                  \
+  utilities/cache_dump_load_impl.cc                             \
+  utilities/cassandra/cassandra_compaction_filter.cc            \
+  utilities/cassandra/format.cc                                 \
+  utilities/cassandra/merge_operator.cc                         \
+  utilities/checkpoint/checkpoint_impl.cc                       \
+  utilities/compaction_filters.cc                               \
+  utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc    \
+  utilities/convenience/info_log_finder.cc                      \
+  utilities/counted_fs.cc                                       \
+  utilities/debug.cc                                            \
+  utilities/env_mirror.cc                                       \
+  utilities/env_timed.cc                                        \
+  utilities/fault_injection_env.cc                              \
+  utilities/fault_injection_fs.cc                               \
+  utilities/fault_injection_secondary_cache.cc                  \
+  utilities/leveldb_options/leveldb_options.cc                  \
+  utilities/memory/memory_util.cc                               \
+  utilities/merge_operators.cc                                  \
+  utilities/merge_operators/max.cc                              \
+  utilities/merge_operators/put.cc                              \
+  utilities/merge_operators/sortlist.cc                         \
+  utilities/merge_operators/string_append/stringappend.cc       \
+  utilities/merge_operators/string_append/stringappend2.cc      \
+  utilities/merge_operators/uint64add.cc                        \
+  utilities/merge_operators/bytesxor.cc                         \
+  utilities/object_registry.cc                                  \
+  utilities/option_change_migration/option_change_migration.cc  \
+  utilities/options/options_util.cc                             \
+  utilities/persistent_cache/block_cache_tier.cc                \
+  utilities/persistent_cache/block_cache_tier_file.cc           \
+  utilities/persistent_cache/block_cache_tier_metadata.cc       \
+  utilities/persistent_cache/persistent_cache_tier.cc           \
+  utilities/persistent_cache/volatile_tier_impl.cc              \
+  utilities/simulator_cache/cache_simulator.cc                  \
+  utilities/simulator_cache/sim_cache.cc                        \
+  utilities/table_properties_collectors/compact_on_deletion_collector.cc \
+  utilities/trace/file_trace_reader_writer.cc                   \
+  utilities/trace/replayer_impl.cc                              \
+  utilities/transactions/lock/lock_manager.cc                   \
+  utilities/transactions/lock/point/point_lock_tracker.cc       \
+  utilities/transactions/lock/point/point_lock_manager.cc       \
+  utilities/transactions/optimistic_transaction.cc              \
+  utilities/transactions/optimistic_transaction_db_impl.cc      \
+  utilities/transactions/pessimistic_transaction.cc             \
+  utilities/transactions/pessimistic_transaction_db.cc          \
+  utilities/transactions/snapshot_checker.cc                    \
+  utilities/transactions/transaction_base.cc                    \
+  utilities/transactions/transaction_db_mutex_impl.cc           \
+  utilities/transactions/transaction_util.cc                    \
+  utilities/transactions/write_prepared_txn.cc                  \
+  utilities/transactions/write_prepared_txn_db.cc               \
+  utilities/transactions/write_unprepared_txn.cc                \
+  utilities/transactions/write_unprepared_txn_db.cc             \
+  utilities/ttl/db_ttl_impl.cc                                  \
+  utilities/wal_filter.cc                                       \
+  utilities/write_batch_with_index/write_batch_with_index.cc    \
+  utilities/write_batch_with_index/write_batch_with_index_internal.cc    \
+
+ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1))
+LIB_SOURCES_ASM =\
+  util/crc32c_ppc_asm.S
+LIB_SOURCES_C = \
+  util/crc32c_ppc.c
+else
+LIB_SOURCES_ASM =
+LIB_SOURCES_C =
+endif
+
+RANGE_TREE_SOURCES =\
+  utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc \
+  utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc        \
+  utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc    \
+  utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc        \
+  utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc         \
+  utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc    \
+  utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc        \
+  utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc       \
+  utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc             \
+  utilities/transactions/lock/range/range_tree/lib/standalone_port.cc          \
+  utilities/transactions/lock/range/range_tree/lib/util/dbt.cc                 \
+  utilities/transactions/lock/range/range_tree/lib/util/memarena.cc            \
+  utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc      \
+  utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
+
+TOOL_LIB_SOURCES =                                              \
+  tools/io_tracer_parser_tool.cc                                \
+  tools/ldb_cmd.cc                                              \
+  tools/ldb_tool.cc                                             \
+  tools/sst_dump_tool.cc                                        \
+  utilities/blob_db/blob_dump_tool.cc                           \
+
+ANALYZER_LIB_SOURCES =                                          \
+  tools/block_cache_analyzer/block_cache_trace_analyzer.cc      \
+  tools/trace_analyzer_tool.cc                                  \
+
+MOCK_LIB_SOURCES =                                              \
+  table/mock_table.cc                                           \
+
+BENCH_LIB_SOURCES =                                             \
+  tools/db_bench_tool.cc                                        \
+  tools/simulated_hybrid_file_system.cc                         \
+
+CACHE_BENCH_LIB_SOURCES =					\
+  cache/cache_bench_tool.cc                                     \
+
+STRESS_LIB_SOURCES =                                            \
+  db_stress_tool/batched_ops_stress.cc                         \
+  db_stress_tool/cf_consistency_stress.cc                      \
+  db_stress_tool/db_stress_common.cc                           \
+  db_stress_tool/db_stress_driver.cc                           \
+  db_stress_tool/db_stress_gflags.cc                           \
+  db_stress_tool/db_stress_listener.cc                         \
+  db_stress_tool/db_stress_shared_state.cc                     \
+  db_stress_tool/db_stress_stat.cc                             \
+  db_stress_tool/db_stress_test_base.cc                        \
+  db_stress_tool/db_stress_tool.cc                             \
+  db_stress_tool/expected_state.cc                             \
+  db_stress_tool/no_batched_ops_stress.cc                      \
+  db_stress_tool/multi_ops_txns_stress.cc                      \
+
+TEST_LIB_SOURCES =                                              \
+  db/db_test_util.cc                                            \
+  db/db_with_timestamp_test_util.cc                             \
+  test_util/mock_time_env.cc                                    \
+  test_util/testharness.cc                                      \
+  test_util/testutil.cc                                         \
+  utilities/agg_merge/test_agg_merge.cc                                 \
+  utilities/cassandra/test_utils.cc                             \
+
+FOLLY_SOURCES =                                                 \
+  $(FOLLY_DIR)/folly/container/detail/F14Table.cpp              \
+  $(FOLLY_DIR)/folly/detail/Futex.cpp                           \
+  $(FOLLY_DIR)/folly/lang/SafeAssert.cpp                        \
+  $(FOLLY_DIR)/folly/lang/ToAscii.cpp                           \
+  $(FOLLY_DIR)/folly/ScopeGuard.cpp                             \
+  $(FOLLY_DIR)/folly/synchronization/AtomicNotification.cpp     \
+  $(FOLLY_DIR)/folly/synchronization/DistributedMutex.cpp       \
+  $(FOLLY_DIR)/folly/synchronization/ParkingLot.cpp             \
+
+TOOLS_MAIN_SOURCES =                                                    \
+  db_stress_tool/db_stress.cc                                           \
+  tools/blob_dump.cc                                                    \
+  tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc         \
+  tools/db_repl_stress.cc                                               \
+  tools/db_sanity_test.cc                                               \
+  tools/ldb.cc                                                          \
+  tools/io_tracer_parser.cc                                             \
+  tools/sst_dump.cc                                                     \
+  tools/write_stress.cc                                                 \
+  tools/dump/rocksdb_dump.cc                                            \
+  tools/dump/rocksdb_undump.cc                                          \
+  tools/trace_analyzer.cc                                               \
+  tools/io_tracer_parser_tool.cc                                        \
+
+BENCH_MAIN_SOURCES =                                                    \
+  cache/cache_bench.cc                                                  \
+  db/range_del_aggregator_bench.cc                                      \
+  memtable/memtablerep_bench.cc                                         \
+  table/table_reader_bench.cc                                           \
+  tools/db_bench.cc                                                     \
+  util/filter_bench.cc                                                  \
+  utilities/persistent_cache/persistent_cache_bench.cc                  \
+  #util/log_write_bench.cc                                               \
+
+TEST_MAIN_SOURCES =                                                     \
+  cache/cache_test.cc                                                   \
+  cache/cache_reservation_manager_test.cc                               \
+  cache/lru_cache_test.cc                                               \
+  cache/compressed_secondary_cache_test.cc                              \
+  db/blob/blob_counting_iterator_test.cc                                \
+  db/blob/blob_file_addition_test.cc                                    \
+  db/blob/blob_file_builder_test.cc                                     \
+  db/blob/blob_file_cache_test.cc                                       \
+  db/blob/blob_file_garbage_test.cc                                     \
+  db/blob/blob_file_reader_test.cc                                      \
+  db/blob/blob_garbage_meter_test.cc                                    \
+  db/blob/blob_source_test.cc                                           \
+  db/blob/db_blob_basic_test.cc                                         \
+  db/blob/db_blob_compaction_test.cc                                    \
+  db/blob/db_blob_corruption_test.cc                                    \
+  db/blob/db_blob_index_test.cc                                         \
+  db/column_family_test.cc                                              \
+  db/compact_files_test.cc                                              \
+  db/compaction/clipping_iterator_test.cc                               \
+  db/compaction/compaction_iterator_test.cc                             \
+  db/compaction/compaction_job_test.cc                                  \
+  db/compaction/compaction_job_stats_test.cc                            \
+  db/compaction/compaction_picker_test.cc                               \
+  db/compaction/compaction_service_test.cc                              \
+  db/compaction/tiered_compaction_test.cc                               \
+  db/comparator_db_test.cc                                              \
+  db/corruption_test.cc                                                 \
+  db/cuckoo_table_db_test.cc                                            \
+  db/db_basic_test.cc                                                   \
+  db/db_block_cache_test.cc                                             \
+  db/db_bloom_filter_test.cc                                            \
+  db/db_compaction_filter_test.cc                                       \
+  db/db_compaction_test.cc                                              \
+  db/db_dynamic_level_test.cc                                           \
+  db/db_encryption_test.cc                                              \
+  db/db_flush_test.cc                                                   \
+  db/db_readonly_with_timestamp_test.cc                                 \
+  db/db_with_timestamp_basic_test.cc                                    \
+  db/import_column_family_test.cc                                       \
+  db/db_inplace_update_test.cc                                          \
+  db/db_io_failure_test.cc                                              \
+  db/db_iter_test.cc                                                    \
+  db/db_iter_stress_test.cc                                             \
+  db/db_iterator_test.cc                                                \
+  db/db_kv_checksum_test.cc                                             \
+  db/db_log_iter_test.cc                                                \
+  db/db_memtable_test.cc                                                \
+  db/db_merge_operator_test.cc                                          \
+  db/db_merge_operand_test.cc                                           \
+  db/db_options_test.cc                                                 \
+  db/db_properties_test.cc                                              \
+  db/db_range_del_test.cc                                               \
+  db/db_rate_limiter_test.cc                                            \
+  db/db_secondary_test.cc                                               \
+  db/db_sst_test.cc                                                     \
+  db/db_statistics_test.cc                                              \
+  db/db_table_properties_test.cc                                        \
+  db/db_tailing_iter_test.cc                                            \
+  db/db_test.cc                                                         \
+  db/db_test2.cc                                                        \
+  db/db_logical_block_size_cache_test.cc                                \
+  db/db_universal_compaction_test.cc                                    \
+  db/db_wal_test.cc                                                     \
+  db/db_with_timestamp_compaction_test.cc                               \
+  db/db_write_buffer_manager_test.cc                                    \
+  db/db_write_test.cc                                                   \
+  db/dbformat_test.cc                                                   \
+  db/deletefile_test.cc                                                 \
+  db/error_handler_fs_test.cc                                           \
+  db/external_sst_file_basic_test.cc                                    \
+  db/external_sst_file_test.cc                                          \
+  db/fault_injection_test.cc                                            \
+  db/file_indexer_test.cc                                               \
+  db/filename_test.cc                                                   \
+  db/flush_job_test.cc                                                  \
+  db/listener_test.cc                                                   \
+  db/log_test.cc                                                        \
+  db/manual_compaction_test.cc                                          \
+  db/memtable_list_test.cc                                              \
+  db/merge_helper_test.cc                                               \
+  db/merge_test.cc                                                      \
+  db/obsolete_files_test.cc                                             \
+  db/options_file_test.cc                                               \
+  db/perf_context_test.cc                                               \
+  db/periodic_task_scheduler_test.cc                                    \
+  db/plain_table_db_test.cc                                             \
+  db/prefix_test.cc                                                     \
+  db/repair_test.cc                                                     \
+  db/range_del_aggregator_test.cc                                       \
+  db/range_tombstone_fragmenter_test.cc                                 \
+  db/seqno_time_test.cc                                                 \
+  db/table_properties_collector_test.cc                                 \
+  db/version_builder_test.cc                                            \
+  db/version_edit_test.cc                                               \
+  db/version_set_test.cc                                                \
+  db/wal_manager_test.cc                                                \
+  db/wide/db_wide_basic_test.cc                                         \
+  db/wide/wide_column_serialization_test.cc                             \
+  db/write_batch_test.cc                                                \
+  db/write_callback_test.cc                                             \
+  db/write_controller_test.cc                                           \
+  env/env_basic_test.cc                                                 \
+  env/env_test.cc                                                       \
+  env/io_posix_test.cc                                                  \
+  env/mock_env_test.cc                                                  \
+  file/delete_scheduler_test.cc                                         \
+  file/prefetch_test.cc                                                 \
+  file/random_access_file_reader_test.cc                                \
+  logging/auto_roll_logger_test.cc                                      \
+  logging/env_logger_test.cc                                            \
+  logging/event_logger_test.cc                                          \
+  memory/arena_test.cc                                                  \
+  memory/memory_allocator_test.cc                                       \
+  memtable/inlineskiplist_test.cc                                       \
+  memtable/skiplist_test.cc                                             \
+  memtable/write_buffer_manager_test.cc                                 \
+  monitoring/histogram_test.cc                                          \
+  monitoring/iostats_context_test.cc                                    \
+  monitoring/statistics_test.cc                                         \
+  monitoring/stats_history_test.cc                                      \
+  options/configurable_test.cc                                          \
+  options/customizable_test.cc                                          \
+  options/options_settable_test.cc                                      \
+  options/options_test.cc                                               \
+  table/block_based/block_based_table_reader_test.cc                    \
+  table/block_based/block_test.cc                                       \
+  table/block_based/data_block_hash_index_test.cc                       \
+  table/block_based/full_filter_block_test.cc                           \
+  table/block_based/partitioned_filter_block_test.cc                    \
+  table/cleanable_test.cc                                               \
+  table/cuckoo/cuckoo_table_builder_test.cc                             \
+  table/cuckoo/cuckoo_table_reader_test.cc                              \
+  table/merger_test.cc                                                  \
+  table/sst_file_reader_test.cc                                         \
+  table/table_test.cc                                                   \
+  table/block_fetcher_test.cc                                           \
+  test_util/testutil_test.cc                                            \
+  tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc         \
+  tools/io_tracer_parser_test.cc                                        \
+  tools/ldb_cmd_test.cc                                                 \
+  tools/reduce_levels_test.cc                                           \
+  tools/sst_dump_test.cc                                                \
+  tools/trace_analyzer_test.cc                                          \
+  trace_replay/block_cache_tracer_test.cc                               \
+  trace_replay/io_tracer_test.cc                                        \
+  util/autovector_test.cc                                               \
+  util/bloom_test.cc                                                    \
+  util/coding_test.cc                                                   \
+  util/crc32c_test.cc                                                   \
+  util/defer_test.cc                                                    \
+  util/dynamic_bloom_test.cc                                            \
+  util/filelock_test.cc                                                 \
+  util/file_reader_writer_test.cc                                       \
+  util/hash_test.cc                                                     \
+  util/heap_test.cc                                                     \
+  util/random_test.cc                                                   \
+  util/rate_limiter_test.cc                                             \
+  util/repeatable_thread_test.cc                                        \
+  util/ribbon_test.cc                                                   \
+  util/slice_test.cc                                                    \
+  util/slice_transform_test.cc                                          \
+  util/timer_queue_test.cc                                              \
+  util/timer_test.cc                                                    \
+  util/thread_list_test.cc                                              \
+  util/thread_local_test.cc                                             \
+  util/work_queue_test.cc                                               \
+  utilities/agg_merge/agg_merge_test.cc                                 \
+  utilities/backup/backup_engine_test.cc                                \
+  utilities/blob_db/blob_db_test.cc                                     \
+  utilities/cassandra/cassandra_format_test.cc                          \
+  utilities/cassandra/cassandra_functional_test.cc                      \
+  utilities/cassandra/cassandra_row_merge_test.cc                       \
+  utilities/cassandra/cassandra_serialize_test.cc                       \
+  utilities/checkpoint/checkpoint_test.cc                               \
+  utilities/env_timed_test.cc                                           \
+  utilities/memory/memory_test.cc                                       \
+  utilities/merge_operators/string_append/stringappend_test.cc          \
+  utilities/object_registry_test.cc                                     \
+  utilities/option_change_migration/option_change_migration_test.cc     \
+  utilities/options/options_util_test.cc                                \
+  utilities/persistent_cache/hash_table_test.cc                         \
+  utilities/persistent_cache/persistent_cache_test.cc                   \
+  utilities/simulator_cache/cache_simulator_test.cc                     \
+  utilities/simulator_cache/sim_cache_test.cc                           \
+  utilities/table_properties_collectors/compact_on_deletion_collector_test.cc  \
+  utilities/transactions/optimistic_transaction_test.cc                 \
+  utilities/transactions/lock/range/range_locking_test.cc               \
+  utilities/transactions/transaction_test.cc                            \
+  utilities/transactions/lock/point/point_lock_manager_test.cc          \
+  utilities/transactions/write_prepared_transaction_test.cc             \
+  utilities/transactions/write_unprepared_transaction_test.cc           \
+  utilities/transactions/write_committed_transaction_ts_test.cc         \
+  utilities/transactions/timestamped_snapshot_test.cc                   \
+  utilities/ttl/ttl_test.cc                                             \
+  utilities/util_merge_operators_test.cc                                \
+  utilities/write_batch_with_index/write_batch_with_index_test.cc       \
+
+TEST_MAIN_SOURCES_C = \
+  db/c_test.c                                                           \
+
+MICROBENCH_SOURCES =                                          \
+  microbench/ribbon_bench.cc                                  \
+  microbench/db_basic_bench.cc                                  \
+
+JNI_NATIVE_SOURCES =                                          \
+  java/rocksjni/backupenginejni.cc                            \
+  java/rocksjni/backup_engine_options.cc                      \
+  java/rocksjni/checkpoint.cc                                 \
+  java/rocksjni/clock_cache.cc                                \
+  java/rocksjni/cache.cc                                      \
+  java/rocksjni/columnfamilyhandle.cc                         \
+  java/rocksjni/compact_range_options.cc                      \
+  java/rocksjni/compaction_filter.cc                          \
+  java/rocksjni/compaction_filter_factory.cc                  \
+  java/rocksjni/compaction_filter_factory_jnicallback.cc      \
+  java/rocksjni/compaction_job_info.cc                        \
+  java/rocksjni/compaction_job_stats.cc                       \
+  java/rocksjni/compaction_options.cc                         \
+  java/rocksjni/compaction_options_fifo.cc                    \
+  java/rocksjni/compaction_options_universal.cc               \
+  java/rocksjni/comparator.cc                                 \
+  java/rocksjni/comparatorjnicallback.cc                      \
+  java/rocksjni/compression_options.cc                        \
+  java/rocksjni/concurrent_task_limiter.cc                    \
+  java/rocksjni/config_options.cc                             \
+  java/rocksjni/env.cc                                        \
+  java/rocksjni/env_options.cc                                \
+  java/rocksjni/event_listener.cc                             \
+  java/rocksjni/event_listener_jnicallback.cc                 \
+  java/rocksjni/ingest_external_file_options.cc               \
+  java/rocksjni/filter.cc                                     \
+  java/rocksjni/iterator.cc                                   \
+  java/rocksjni/jnicallback.cc                                \
+  java/rocksjni/loggerjnicallback.cc                          \
+  java/rocksjni/lru_cache.cc                                  \
+  java/rocksjni/memtablejni.cc                                \
+  java/rocksjni/memory_util.cc                                \
+  java/rocksjni/merge_operator.cc                             \
+  java/rocksjni/native_comparator_wrapper_test.cc             \
+  java/rocksjni/optimistic_transaction_db.cc                  \
+  java/rocksjni/optimistic_transaction_options.cc             \
+  java/rocksjni/options.cc                                    \
+  java/rocksjni/options_util.cc                               \
+  java/rocksjni/persistent_cache.cc                           \
+  java/rocksjni/ratelimiterjni.cc                             \
+  java/rocksjni/remove_emptyvalue_compactionfilterjni.cc      \
+  java/rocksjni/cassandra_compactionfilterjni.cc              \
+  java/rocksjni/cassandra_value_operator.cc                   \
+  java/rocksjni/restorejni.cc                                 \
+  java/rocksjni/rocks_callback_object.cc                      \
+  java/rocksjni/rocksjni.cc                                   \
+  java/rocksjni/rocksdb_exception_test.cc                     \
+  java/rocksjni/slice.cc                                      \
+  java/rocksjni/snapshot.cc                                   \
+  java/rocksjni/sst_file_manager.cc                           \
+  java/rocksjni/sst_file_writerjni.cc                         \
+  java/rocksjni/sst_file_readerjni.cc                         \
+  java/rocksjni/sst_file_reader_iterator.cc                   \
+  java/rocksjni/sst_partitioner.cc                            \
+  java/rocksjni/statistics.cc                                 \
+  java/rocksjni/statisticsjni.cc                              \
+  java/rocksjni/table.cc                                      \
+  java/rocksjni/table_filter.cc                               \
+  java/rocksjni/table_filter_jnicallback.cc                   \
+  java/rocksjni/thread_status.cc                              \
+  java/rocksjni/trace_writer.cc                               \
+  java/rocksjni/trace_writer_jnicallback.cc                   \
+  java/rocksjni/transaction.cc                                \
+  java/rocksjni/transaction_db.cc                             \
+  java/rocksjni/transaction_options.cc                        \
+  java/rocksjni/transaction_db_options.cc                     \
+  java/rocksjni/transaction_log.cc                            \
+  java/rocksjni/transaction_notifier.cc                       \
+  java/rocksjni/transaction_notifier_jnicallback.cc           \
+  java/rocksjni/ttl.cc                                        \
+  java/rocksjni/testable_event_listener.cc                    \
+  java/rocksjni/wal_filter.cc                                 \
+  java/rocksjni/wal_filter_jnicallback.cc                     \
+  java/rocksjni/write_batch.cc                                \
+  java/rocksjni/writebatchhandlerjnicallback.cc               \
+  java/rocksjni/write_batch_test.cc                           \
+  java/rocksjni/write_batch_with_index.cc                     \
+  java/rocksjni/write_buffer_manager.cc
diff --git a/src/rocksdb/table/adaptive/adaptive_table_factory.cc b/src/rocksdb/table/adaptive/adaptive_table_factory.cc
new file mode 100644
index 000000000..5b9fe3dbd
--- /dev/null
+++ b/src/rocksdb/table/adaptive/adaptive_table_factory.cc
@@ -0,0 +1,126 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/adaptive/adaptive_table_factory.h"
+
+#include "port/port.h"
+#include "table/format.h"
+#include "table/table_builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+AdaptiveTableFactory::AdaptiveTableFactory(
+    std::shared_ptr<TableFactory> table_factory_to_write,
+    std::shared_ptr<TableFactory> block_based_table_factory,
+    std::shared_ptr<TableFactory> plain_table_factory,
+    std::shared_ptr<TableFactory> cuckoo_table_factory)
+    : table_factory_to_write_(table_factory_to_write),
+      block_based_table_factory_(block_based_table_factory),
+      plain_table_factory_(plain_table_factory),
+      cuckoo_table_factory_(cuckoo_table_factory) {
+  if (!plain_table_factory_) {
+    plain_table_factory_.reset(NewPlainTableFactory());
+  }
+  if (!block_based_table_factory_) {
+    block_based_table_factory_.reset(NewBlockBasedTableFactory());
+  }
+  if (!cuckoo_table_factory_) {
+    cuckoo_table_factory_.reset(NewCuckooTableFactory());
+  }
+  if (!table_factory_to_write_) {
+    table_factory_to_write_ = block_based_table_factory_;
+  }
+}
+
+extern const uint64_t kPlainTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kCuckooTableMagicNumber;
+
+Status AdaptiveTableFactory::NewTableReader(
+    const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table,
+    bool prefetch_index_and_filter_in_cache) const {
+  Footer footer;
+  IOOptions opts;
+  auto s = ReadFooterFromFile(opts, file.get(), nullptr /* prefetch_buffer */,
+                              file_size, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+  if (footer.table_magic_number() == kPlainTableMagicNumber ||
+      footer.table_magic_number() == kLegacyPlainTableMagicNumber) {
+    return plain_table_factory_->NewTableReader(
+        table_reader_options, std::move(file), file_size, table);
+  } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
+             footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
+    return block_based_table_factory_->NewTableReader(
+        ro, table_reader_options, std::move(file), file_size, table,
+        prefetch_index_and_filter_in_cache);
+  } else if (footer.table_magic_number() == kCuckooTableMagicNumber) {
+    return cuckoo_table_factory_->NewTableReader(
+        table_reader_options, std::move(file), file_size, table);
+  } else {
+    return Status::NotSupported("Unidentified table format");
+  }
+}
+
+TableBuilder* AdaptiveTableFactory::NewTableBuilder(
+    const TableBuilderOptions& table_builder_options,
+    WritableFileWriter* file) const {
+  return table_factory_to_write_->NewTableBuilder(table_builder_options, file);
+}
+
+std::string AdaptiveTableFactory::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  if (table_factory_to_write_) {
+    snprintf(buffer, kBufferSize, "  write factory (%s) options:\n%s\n",
+             (table_factory_to_write_->Name() ? table_factory_to_write_->Name()
+                                              : ""),
+             table_factory_to_write_->GetPrintableOptions().c_str());
+    ret.append(buffer);
+  }
+  if (plain_table_factory_) {
+    snprintf(buffer, kBufferSize, "  %s options:\n%s\n",
+             plain_table_factory_->Name() ? plain_table_factory_->Name() : "",
+             plain_table_factory_->GetPrintableOptions().c_str());
+    ret.append(buffer);
+  }
+  if (block_based_table_factory_) {
+    snprintf(
+        buffer, kBufferSize, "  %s options:\n%s\n",
+        (block_based_table_factory_->Name() ? block_based_table_factory_->Name()
+                                            : ""),
+        block_based_table_factory_->GetPrintableOptions().c_str());
+    ret.append(buffer);
+  }
+  if (cuckoo_table_factory_) {
+    snprintf(buffer, kBufferSize, "  %s options:\n%s\n",
+             cuckoo_table_factory_->Name() ? cuckoo_table_factory_->Name() : "",
+             cuckoo_table_factory_->GetPrintableOptions().c_str());
+    ret.append(buffer);
+  }
+  return ret;
+}
+
+extern TableFactory* NewAdaptiveTableFactory(
+    std::shared_ptr<TableFactory> table_factory_to_write,
+    std::shared_ptr<TableFactory> block_based_table_factory,
+    std::shared_ptr<TableFactory> plain_table_factory,
+    std::shared_ptr<TableFactory> cuckoo_table_factory) {
+  return new AdaptiveTableFactory(table_factory_to_write,
+                                  block_based_table_factory,
+                                  plain_table_factory, cuckoo_table_factory);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/adaptive/adaptive_table_factory.h b/src/rocksdb/table/adaptive/adaptive_table_factory.h
new file mode 100644
index 000000000..3b631942d
--- /dev/null
+++ b/src/rocksdb/table/adaptive/adaptive_table_factory.h
@@ -0,0 +1,58 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct EnvOptions;
+
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+class AdaptiveTableFactory : public TableFactory {
+ public:
+  ~AdaptiveTableFactory() {}
+
+  explicit AdaptiveTableFactory(
+      std::shared_ptr<TableFactory> table_factory_to_write,
+      std::shared_ptr<TableFactory> block_based_table_factory,
+      std::shared_ptr<TableFactory> plain_table_factory,
+      std::shared_ptr<TableFactory> cuckoo_table_factory);
+
+  const char* Name() const override { return "AdaptiveTableFactory"; }
+
+  using TableFactory::NewTableReader;
+  Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table,
+      bool prefetch_index_and_filter_in_cache = true) const override;
+
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const override;
+
+  std::string GetPrintableOptions() const override;
+
+ private:
+  std::shared_ptr<TableFactory> table_factory_to_write_;
+  std::shared_ptr<TableFactory> block_based_table_factory_;
+  std::shared_ptr<TableFactory> plain_table_factory_;
+  std::shared_ptr<TableFactory> cuckoo_table_factory_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/block_based/binary_search_index_reader.cc b/src/rocksdb/table/block_based/binary_search_index_reader.cc
new file mode 100644
index 000000000..21787cc1a
--- /dev/null
+++ b/src/rocksdb/table/block_based/binary_search_index_reader.cc
@@ -0,0 +1,74 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/binary_search_index_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status BinarySearchIndexReader::Create(
+    const BlockBasedTable* table, const ReadOptions& ro,
+    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+    bool pin, BlockCacheLookupContext* lookup_context,
+    std::unique_ptr<IndexReader>* index_reader) {
+  assert(table != nullptr);
+  assert(table->get_rep());
+  assert(!pin || prefetch);
+  assert(index_reader != nullptr);
+
+  CachableEntry<Block> index_block;
+  if (prefetch || !use_cache) {
+    const Status s =
+        ReadIndexBlock(table, prefetch_buffer, ro, use_cache,
+                       /*get_context=*/nullptr, lookup_context, &index_block);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (use_cache && !pin) {
+      index_block.Reset();
+    }
+  }
+
+  index_reader->reset(
+      new BinarySearchIndexReader(table, std::move(index_block)));
+
+  return Status::OK();
+}
+
+InternalIteratorBase<IndexValue>* BinarySearchIndexReader::NewIterator(
+    const ReadOptions& read_options, bool /* disable_prefix_seek */,
+    IndexBlockIter* iter, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context) {
+  const BlockBasedTable::Rep* rep = table()->get_rep();
+  const bool no_io = (read_options.read_tier == kBlockCacheTier);
+  CachableEntry<Block> index_block;
+  const Status s =
+      GetOrReadIndexBlock(no_io, read_options.rate_limiter_priority,
+                          get_context, lookup_context, &index_block);
+  if (!s.ok()) {
+    if (iter != nullptr) {
+      iter->Invalidate(s);
+      return iter;
+    }
+
+    return NewErrorInternalIterator<IndexValue>(s);
+  }
+
+  Statistics* kNullStats = nullptr;
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  auto it = index_block.GetValue()->NewIndexIterator(
+      internal_comparator()->user_comparator(),
+      rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true,
+      index_has_first_key(), index_key_includes_seq(), index_value_is_full());
+
+  assert(it != nullptr);
+  index_block.TransferTo(it);
+
+  return it;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/binary_search_index_reader.h b/src/rocksdb/table/block_based/binary_search_index_reader.h
new file mode 100644
index 000000000..d4a611ecc
--- /dev/null
+++ b/src/rocksdb/table/block_based/binary_search_index_reader.h
@@ -0,0 +1,48 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "table/block_based/index_reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Index that allows binary search lookup for the first key of each block.
+// This class can be viewed as a thin wrapper for `Block` class which already
+// supports binary search.
+class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
+ public:
+  // Read index from the file and create an intance for
+  // `BinarySearchIndexReader`.
+  // On success, index_reader will be populated; otherwise it will remain
+  // unmodified.
+  static Status Create(const BlockBasedTable* table, const ReadOptions& ro,
+                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
+                       bool prefetch, bool pin,
+                       BlockCacheLookupContext* lookup_context,
+                       std::unique_ptr<IndexReader>* index_reader);
+
+  InternalIteratorBase<IndexValue>* NewIterator(
+      const ReadOptions& read_options, bool /* disable_prefix_seek */,
+      IndexBlockIter* iter, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context) override;
+
+  size_t ApproximateMemoryUsage() const override {
+    size_t usage = ApproximateIndexBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this));
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return usage;
+  }
+
+ private:
+  BinarySearchIndexReader(const BlockBasedTable* t,
+                          CachableEntry<Block>&& index_block)
+      : IndexReaderCommon(t, std::move(index_block)) {}
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block.cc b/src/rocksdb/table/block_based/block.cc
new file mode 100644
index 000000000..7eb0b010f
--- /dev/null
+++ b/src/rocksdb/table/block_based/block.cc
@@ -0,0 +1,1131 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Decodes the blocks generated by block_builder.cc.
+
+#include "table/block_based/block.h"
+
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "table/block_based/block_prefix_index.h"
+#include "table/block_based/data_block_footer.h"
+#include "table/format.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper routine: decode the next block entry starting at "p",
+// storing the number of shared key bytes, non_shared key bytes,
+// and the length of the value in "*shared", "*non_shared", and
+// "*value_length", respectively.  Will not derefence past "limit".
+//
+// If any errors are detected, returns nullptr.  Otherwise, returns a
+// pointer to the key delta (just past the three decoded values).
+struct DecodeEntry {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared,
+                                uint32_t* value_length) {
+    // We need 2 bytes for shared and non_shared size. We also need one more
+    // byte either for value size or the actual value in case of value delta
+    // encoding.
+    assert(limit - p >= 3);
+    *shared = reinterpret_cast<const unsigned char*>(p)[0];
+    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+    if ((*shared | *non_shared | *value_length) < 128) {
+      // Fast path: all three values are encoded in one byte each
+      p += 3;
+    } else {
+      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
+        return nullptr;
+      }
+    }
+
+    // Using an assert in place of "return null" since we should not pay the
+    // cost of checking for corruption on every single key decoding
+    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
+    return p;
+  }
+};
+
+// Helper routine: similar to DecodeEntry but does not have assertions.
+// Instead, returns nullptr so that caller can detect and report failure.
+struct CheckAndDecodeEntry {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared,
+                                uint32_t* value_length) {
+    // We need 2 bytes for shared and non_shared size. We also need one more
+    // byte either for value size or the actual value in case of value delta
+    // encoding.
+    if (limit - p < 3) {
+      return nullptr;
+    }
+    *shared = reinterpret_cast<const unsigned char*>(p)[0];
+    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+    if ((*shared | *non_shared | *value_length) < 128) {
+      // Fast path: all three values are encoded in one byte each
+      p += 3;
+    } else {
+      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
+        return nullptr;
+      }
+    }
+
+    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
+      return nullptr;
+    }
+    return p;
+  }
+};
+
+struct DecodeKey {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared) {
+    uint32_t value_length;
+    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
+  }
+};
+
+// In format_version 4, which is used by index blocks, the value size is not
+// encoded before the entry, as the value is known to be the handle with the
+// known size.
+struct DecodeKeyV4 {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared) {
+    // We need 2 bytes for shared and non_shared size. We also need one more
+    // byte either for value size or the actual value in case of value delta
+    // encoding.
+    if (limit - p < 3) return nullptr;
+    *shared = reinterpret_cast<const unsigned char*>(p)[0];
+    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+    if ((*shared | *non_shared) < 128) {
+      // Fast path: all three values are encoded in one byte each
+      p += 2;
+    } else {
+      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+    }
+    return p;
+  }
+};
+
+struct DecodeEntryV4 {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared,
+                                uint32_t* value_length) {
+    assert(value_length);
+
+    *value_length = 0;
+    return DecodeKeyV4()(p, limit, shared, non_shared);
+  }
+};
+void DataBlockIter::NextImpl() {
+  bool is_shared = false;
+  ParseNextDataKey(&is_shared);
+}
+
+void MetaBlockIter::NextImpl() {
+  bool is_shared = false;
+  ParseNextKey<CheckAndDecodeEntry>(&is_shared);
+}
+
+void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }
+
+void IndexBlockIter::PrevImpl() {
+  assert(Valid());
+  // Scan backwards to a restart point before current_
+  const uint32_t original = current_;
+  while (GetRestartPoint(restart_index_) >= original) {
+    if (restart_index_ == 0) {
+      // No more entries
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return;
+    }
+    restart_index_--;
+  }
+  SeekToRestartPoint(restart_index_);
+  // Loop until end of current entry hits the start of original entry
+  while (ParseNextIndexKey() && NextEntryOffset() < original) {
+  }
+}
+
+void MetaBlockIter::PrevImpl() {
+  assert(Valid());
+  // Scan backwards to a restart point before current_
+  const uint32_t original = current_;
+  while (GetRestartPoint(restart_index_) >= original) {
+    if (restart_index_ == 0) {
+      // No more entries
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return;
+    }
+    restart_index_--;
+  }
+  SeekToRestartPoint(restart_index_);
+  bool is_shared = false;
+  // Loop until end of current entry hits the start of original entry
+  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
+         NextEntryOffset() < original) {
+  }
+}
+
+// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
+void DataBlockIter::PrevImpl() {
+  assert(Valid());
+
+  assert(prev_entries_idx_ == -1 ||
+         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
+  // Check if we can use cached prev_entries_
+  if (prev_entries_idx_ > 0 &&
+      prev_entries_[prev_entries_idx_].offset == current_) {
+    // Read cached CachedPrevEntry
+    prev_entries_idx_--;
+    const CachedPrevEntry& current_prev_entry =
+        prev_entries_[prev_entries_idx_];
+
+    const char* key_ptr = nullptr;
+    bool raw_key_cached;
+    if (current_prev_entry.key_ptr != nullptr) {
+      // The key is not delta encoded and stored in the data block
+      key_ptr = current_prev_entry.key_ptr;
+      raw_key_cached = false;
+    } else {
+      // The key is delta encoded and stored in prev_entries_keys_buff_
+      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
+      raw_key_cached = true;
+    }
+    const Slice current_key(key_ptr, current_prev_entry.key_size);
+
+    current_ = current_prev_entry.offset;
+    // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
+    // not necessity. It is convenient since this class treats keys as pinned
+    // when `raw_key_` points to an outside buffer. So we cannot allow
+    // `raw_key_` point into Prev cache as it is a transient outside buffer
+    // (i.e., keys in it are not actually pinned).
+    raw_key_.SetKey(current_key, raw_key_cached /* copy */);
+    value_ = current_prev_entry.value;
+
+    return;
+  }
+
+  // Clear prev entries cache
+  prev_entries_idx_ = -1;
+  prev_entries_.clear();
+  prev_entries_keys_buff_.clear();
+
+  // Scan backwards to a restart point before current_
+  const uint32_t original = current_;
+  while (GetRestartPoint(restart_index_) >= original) {
+    if (restart_index_ == 0) {
+      // No more entries
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return;
+    }
+    restart_index_--;
+  }
+
+  SeekToRestartPoint(restart_index_);
+
+  do {
+    bool is_shared = false;
+    if (!ParseNextDataKey(&is_shared)) {
+      break;
+    }
+    Slice current_key = raw_key_.GetKey();
+
+    if (raw_key_.IsKeyPinned()) {
+      // The key is not delta encoded
+      prev_entries_.emplace_back(current_, current_key.data(), 0,
+                                 current_key.size(), value());
+    } else {
+      // The key is delta encoded, cache decoded key in buffer
+      size_t new_key_offset = prev_entries_keys_buff_.size();
+      prev_entries_keys_buff_.append(current_key.data(), current_key.size());
+
+      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
+                                 current_key.size(), value());
+    }
+    // Loop until end of current entry hits the start of original entry
+  } while (NextEntryOffset() < original);
+  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
+}
+
+void DataBlockIter::SeekImpl(const Slice& target) {
+  Slice seek_key = target;
+  PERF_TIMER_GUARD(block_seek_nanos);
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  uint32_t index = 0;
+  bool skip_linear_scan = false;
+  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+
+  if (!ok) {
+    return;
+  }
+  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
+}
+
+void MetaBlockIter::SeekImpl(const Slice& target) {
+  Slice seek_key = target;
+  PERF_TIMER_GUARD(block_seek_nanos);
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  uint32_t index = 0;
+  bool skip_linear_scan = false;
+  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+
+  if (!ok) {
+    return;
+  }
+  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
+}
+
+// Optimized Seek for point lookup for an internal key `target`
+// target = "seek_user_key @ type | seqno".
+//
+// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// kTypeBlobIndex, or kTypeWideColumnEntity, this function behaves identically
+// to Seek().
+//
+// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// kTypeBlobIndex, or kTypeWideColumnEntity:
+//
+// If the return value is FALSE, iter location is undefined, and it means:
+// 1) there is no key in this block falling into the range:
+//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
+//    inclusive; AND
+// 2) the last key of this block has a greater user_key from seek_user_key
+//
+// If the return value is TRUE, iter location has two possibilies:
+// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
+//    this case, it points to the first key with a larger user_key or a matching
+//    user_key with a seqno no greater than the seeking seqno.
+// 2) If the iter is invalid, it means that either all the user_key is less
+//    than the seek_user_key, or the block ends with a matching user_key but
+//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
+//    but larger type).
+bool DataBlockIter::SeekForGetImpl(const Slice& target) {
+  Slice target_user_key = ExtractUserKey(target);
+  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
+  uint8_t entry =
+      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);
+
+  if (entry == kCollision) {
+    // HashSeek not effective, falling back
+    SeekImpl(target);
+    return true;
+  }
+
+  if (entry == kNoEntry) {
+    // Even if we cannot find the user_key in this block, the result may
+    // exist in the next block. Consider this example:
+    //
+    // Block N:    [aab@100, ... , app@120]
+    // boundary key: axy@50 (we make minimal assumption about a boundary key)
+    // Block N+1:  [axy@10, ...   ]
+    //
+    // If seek_key = axy@60, the search will starts from Block N.
+    // Even if the user_key is not found in the hash map, the caller still
+    // have to continue searching the next block.
+    //
+    // In this case, we pretend the key is the the last restart interval.
+    // The while-loop below will search the last restart interval for the
+    // key. It will stop at the first key that is larger than the seek_key,
+    // or to the end of the block if no one is larger.
+    entry = static_cast<uint8_t>(num_restarts_ - 1);
+  }
+
+  uint32_t restart_index = entry;
+
+  // check if the key is in the restart_interval
+  assert(restart_index < num_restarts_);
+  SeekToRestartPoint(restart_index);
+  current_ = GetRestartPoint(restart_index);
+
+  uint32_t limit = restarts_;
+  if (restart_index + 1 < num_restarts_) {
+    limit = GetRestartPoint(restart_index + 1);
+  }
+  while (current_ < limit) {
+    bool shared;
+    // Here we only linear seek the target key inside the restart interval.
+    // If a key does not exist inside a restart interval, we avoid
+    // further searching the block content across restart interval boundary.
+    //
+    // TODO(fwu): check the left and right boundary of the restart interval
+    // to avoid linear seek a target key that is out of range.
+    if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) {
+      // we stop at the first potential matching user key.
+      break;
+    }
+  }
+
+  if (current_ == restarts_) {
+    // Search reaches to the end of the block. There are three possibilites:
+    // 1) there is only one user_key match in the block (otherwise collsion).
+    //    the matching user_key resides in the last restart interval, and it
+    //    is the last key of the restart interval and of the block as well.
+    //    ParseNextKey() skiped it as its [ type | seqno ] is smaller.
+    //
+    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
+    //    AND all existing user_keys in the restart interval are smaller than
+    //    seek_user_key.
+    //
+    // 3) The seek_key is a false positive and happens to be hashed to the
+    //    last restart interval, AND all existing user_keys in the restart
+    //    interval are smaller than seek_user_key.
+    //
+    // The result may exist in the next block each case, so we return true.
+    return true;
+  }
+
+  if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(),
+                                        target_user_key) != 0) {
+    // the key is not in this block and cannot be at the next block either.
+    return false;
+  }
+
+  // Here we are conservative and only support a limited set of cases
+  ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
+  if (value_type != ValueType::kTypeValue &&
+      value_type != ValueType::kTypeDeletion &&
+      value_type != ValueType::kTypeSingleDeletion &&
+      value_type != ValueType::kTypeBlobIndex &&
+      value_type != ValueType::kTypeWideColumnEntity) {
+    SeekImpl(target);
+    return true;
+  }
+
+  // Result found, and the iter is correctly set.
+  return true;
+}
+
+void IndexBlockIter::SeekImpl(const Slice& target) {
+  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
+  PERF_TIMER_GUARD(block_seek_nanos);
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  Slice seek_key = target;
+  if (raw_key_.IsUserKey()) {
+    seek_key = ExtractUserKey(target);
+  }
+  status_ = Status::OK();
+  uint32_t index = 0;
+  bool skip_linear_scan = false;
+  bool ok = false;
+  if (prefix_index_) {
+    bool prefix_may_exist = true;
+    ok = PrefixSeek(target, &index, &prefix_may_exist);
+    if (!prefix_may_exist) {
+      // This is to let the caller to distinguish between non-existing prefix,
+      // and when key is larger than the last key, which both set Valid() to
+      // false.
+      current_ = restarts_;
+      status_ = Status::NotFound();
+    }
+    // restart interval must be one when hash search is enabled so the binary
+    // search simply lands at the right place.
+    skip_linear_scan = true;
+  } else if (value_delta_encoded_) {
+    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
+  } else {
+    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+  }
+
+  if (!ok) {
+    return;
+  }
+  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
+}
+
+void DataBlockIter::SeekForPrevImpl(const Slice& target) {
+  PERF_TIMER_GUARD(block_seek_nanos);
+  Slice seek_key = target;
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  uint32_t index = 0;
+  bool skip_linear_scan = false;
+  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+
+  if (!ok) {
+    return;
+  }
+  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
+
+  if (!Valid()) {
+    SeekToLastImpl();
+  } else {
+    while (Valid() && CompareCurrentKey(seek_key) > 0) {
+      PrevImpl();
+    }
+  }
+}
+
+void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
+  PERF_TIMER_GUARD(block_seek_nanos);
+  Slice seek_key = target;
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  uint32_t index = 0;
+  bool skip_linear_scan = false;
+  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+
+  if (!ok) {
+    return;
+  }
+  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
+
+  if (!Valid()) {
+    SeekToLastImpl();
+  } else {
+    while (Valid() && CompareCurrentKey(seek_key) > 0) {
+      PrevImpl();
+    }
+  }
+}
+
+void DataBlockIter::SeekToFirstImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(0);
+  bool is_shared = false;
+  ParseNextDataKey(&is_shared);
+}
+
+void MetaBlockIter::SeekToFirstImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(0);
+  bool is_shared = false;
+  ParseNextKey<CheckAndDecodeEntry>(&is_shared);
+}
+
+void IndexBlockIter::SeekToFirstImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  status_ = Status::OK();
+  SeekToRestartPoint(0);
+  ParseNextIndexKey();
+}
+
+void DataBlockIter::SeekToLastImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(num_restarts_ - 1);
+  bool is_shared = false;
+  while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
+    // Keep skipping
+  }
+}
+
+void MetaBlockIter::SeekToLastImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(num_restarts_ - 1);
+  bool is_shared = false;
+  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
+         NextEntryOffset() < restarts_) {
+    // Keep skipping
+  }
+}
+
+void IndexBlockIter::SeekToLastImpl() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  status_ = Status::OK();
+  SeekToRestartPoint(num_restarts_ - 1);
+  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
+    // Keep skipping
+  }
+}
+
+template <class TValue>
+void BlockIter<TValue>::CorruptionError() {
+  current_ = restarts_;
+  restart_index_ = num_restarts_;
+  status_ = Status::Corruption("bad entry in block");
+  raw_key_.Clear();
+  value_.clear();
+}
+
+template <class TValue>
+template <typename DecodeEntryFunc>
+bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
+  current_ = NextEntryOffset();
+  const char* p = data_ + current_;
+  const char* limit = data_ + restarts_;  // Restarts come right after data
+
+  if (p >= limit) {
+    // No more entries to return.  Mark as invalid.
+    current_ = restarts_;
+    restart_index_ = num_restarts_;
+    return false;
+  }
+  // Decode next entry
+  uint32_t shared, non_shared, value_length;
+  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
+  if (p == nullptr || raw_key_.Size() < shared) {
+    CorruptionError();
+    return false;
+  } else {
+    if (shared == 0) {
+      *is_shared = false;
+      // If this key doesn't share any bytes with prev key then we don't need
+      // to decode it and can use its address in the block directly.
+      raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
+    } else {
+      // This key share `shared` bytes with prev key, we need to decode it
+      *is_shared = true;
+      raw_key_.TrimAppend(shared, p, non_shared);
+    }
+    value_ = Slice(p + non_shared, value_length);
+    if (shared == 0) {
+      while (restart_index_ + 1 < num_restarts_ &&
+             GetRestartPoint(restart_index_ + 1) < current_) {
+        ++restart_index_;
+      }
+    }
+    // else we are in the middle of a restart interval and the restart_index_
+    // thus has not changed
+    return true;
+  }
+}
+
+bool DataBlockIter::ParseNextDataKey(bool* is_shared) {
+  if (ParseNextKey<DecodeEntry>(is_shared)) {
+#ifndef NDEBUG
+    if (global_seqno_ != kDisableGlobalSequenceNumber) {
+      // If we are reading a file with a global sequence number we should
+      // expect that all encoded sequence numbers are zeros and any value
+      // type is kTypeValue, kTypeMerge, kTypeDeletion,
+      // kTypeDeletionWithTimestamp, or kTypeRangeDeletion.
+      uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
+      SequenceNumber seqno;
+      ValueType value_type;
+      UnPackSequenceAndType(packed, &seqno, &value_type);
+      assert(value_type == ValueType::kTypeValue ||
+             value_type == ValueType::kTypeMerge ||
+             value_type == ValueType::kTypeDeletion ||
+             value_type == ValueType::kTypeDeletionWithTimestamp ||
+             value_type == ValueType::kTypeRangeDeletion);
+      assert(seqno == 0);
+    }
+#endif  // NDEBUG
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool IndexBlockIter::ParseNextIndexKey() {
+  bool is_shared = false;
+  bool ok = (value_delta_encoded_) ? ParseNextKey<DecodeEntryV4>(&is_shared)
+                                   : ParseNextKey<DecodeEntry>(&is_shared);
+  if (ok) {
+    if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
+      DecodeCurrentValue(is_shared);
+    }
+  }
+  return ok;
+}
+
+// The format:
+// restart_point   0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// restart_point   1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// ...
+// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// where, k is key, v is value, and its encoding is in parenthesis.
+// The format of each key is (shared_size, non_shared_size, shared, non_shared)
+// The format of each value, i.e., block handle, is (offset, size) whenever the
+// is_shared is false, which included the first entry in each restart point.
+// Otherwise the format is delta-size = block handle size - size of last block
+// handle.
+void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
+  Slice v(value_.data(), data_ + restarts_ - value_.data());
+  // Delta encoding is used if `shared` != 0.
+  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
+      &v, have_first_key_,
+      (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr);
+  assert(decode_s.ok());
+  value_ = Slice(value_.data(), v.data() - value_.data());
+
+  if (global_seqno_state_ != nullptr) {
+    // Overwrite sequence number the same way as in DataBlockIter.
+
+    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
+    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
+                                      /* copy */ true);
+
+    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);
+
+    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
+    assert(value_type == ValueType::kTypeValue ||
+           value_type == ValueType::kTypeMerge ||
+           value_type == ValueType::kTypeDeletion ||
+           value_type == ValueType::kTypeRangeDeletion);
+
+    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
+                                         value_type);
+    decoded_value_.first_internal_key = first_internal_key.GetKey();
+  }
+}
+
+template <class TValue>
+void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
+                                               uint32_t index,
+                                               bool skip_linear_scan) {
+  // SeekToRestartPoint() only does the lookup in the restart block. We need
+  // to follow it up with NextImpl() to position the iterator at the restart
+  // key.
+  SeekToRestartPoint(index);
+  NextImpl();
+
+  if (!skip_linear_scan) {
+    // Linear search (within restart block) for first key >= target
+    uint32_t max_offset;
+    if (index + 1 < num_restarts_) {
+      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
+      // the next restart key is strictly greater than `target`, we can
+      // terminate upon reaching it without any additional key comparison.
+      max_offset = GetRestartPoint(index + 1);
+    } else {
+      // We are in the last restart interval. The while-loop will terminate by
+      // `Valid()` returning false upon advancing past the block's last key.
+      max_offset = std::numeric_limits<uint32_t>::max();
+    }
+    while (true) {
+      NextImpl();
+      if (!Valid()) {
+        break;
+      }
+      if (current_ == max_offset) {
+        assert(CompareCurrentKey(target) > 0);
+        break;
+      } else if (CompareCurrentKey(target) >= 0) {
+        break;
+      }
+    }
+  }
+}
+
+// Binary searches in restart array to find the starting restart point for the
+// linear scan, and stores it in `*index`. Assumes restart array does not
+// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
+// is strictly greater than `target` or does not exist (this can be used to
+// elide a comparison when linear scan reaches all the way to the next restart
+// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
+// `*index`th restart key is the final result so that key does not need to be
+// compared again later.
+template <class TValue>
+template <typename DecodeKeyFunc>
+bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
+                                   bool* skip_linear_scan) {
+  if (restarts_ == 0) {
+    // SST files dedicated to range tombstones are written with index blocks
+    // that have no keys while also having `num_restarts_ == 1`. This would
+    // cause a problem for `BinarySeek()` as it'd try to access the first key
+    // which does not exist. We identify such blocks by the offset at which
+    // their restarts are stored, and return false to prevent any attempted
+    // key accesses.
+    return false;
+  }
+
+  *skip_linear_scan = false;
+  // Loop invariants:
+  // - Restart key at index `left` is less than or equal to the target key. The
+  //   sentinel index `-1` is considered to have a key that is less than all
+  //   keys.
+  // - Any restart keys after index `right` are strictly greater than the target
+  //   key.
+  int64_t left = -1, right = num_restarts_ - 1;
+  while (left != right) {
+    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
+    int64_t mid = left + (right - left + 1) / 2;
+    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
+    uint32_t shared, non_shared;
+    const char* key_ptr = DecodeKeyFunc()(
+        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
+    if (key_ptr == nullptr || (shared != 0)) {
+      CorruptionError();
+      return false;
+    }
+    Slice mid_key(key_ptr, non_shared);
+    raw_key_.SetKey(mid_key, false /* copy */);
+    int cmp = CompareCurrentKey(target);
+    if (cmp < 0) {
+      // Key at "mid" is smaller than "target". Therefore all
+      // blocks before "mid" are uninteresting.
+      left = mid;
+    } else if (cmp > 0) {
+      // Key at "mid" is >= "target". Therefore all blocks at or
+      // after "mid" are uninteresting.
+      right = mid - 1;
+    } else {
+      *skip_linear_scan = true;
+      left = right = mid;
+    }
+  }
+
+  if (left == -1) {
+    // All keys in the block were strictly greater than `target`. So the very
+    // first key in the block is the final seek result.
+    *skip_linear_scan = true;
+    *index = 0;
+  } else {
+    *index = static_cast<uint32_t>(left);
+  }
+  return true;
+}
+
+// Compare target key and the block key of the block of `block_index`.
+// Return -1 if error.
+int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
+  uint32_t region_offset = GetRestartPoint(block_index);
+  uint32_t shared, non_shared;
+  const char* key_ptr =
+      value_delta_encoded_
+          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
+                          &non_shared)
+          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
+                        &non_shared);
+  if (key_ptr == nullptr || (shared != 0)) {
+    CorruptionError();
+    return 1;  // Return target is smaller
+  }
+  Slice block_key(key_ptr, non_shared);
+  raw_key_.SetKey(block_key, false /* copy */);
+  return CompareCurrentKey(target);
+}
+
+// Binary search in block_ids to find the first block
+// with a key >= target
+bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
+                                          uint32_t* block_ids, uint32_t left,
+                                          uint32_t right, uint32_t* index,
+                                          bool* prefix_may_exist) {
+  assert(left <= right);
+  assert(index);
+  assert(prefix_may_exist);
+  *prefix_may_exist = true;
+  uint32_t left_bound = left;
+
+  while (left <= right) {
+    uint32_t mid = (right + left) / 2;
+
+    int cmp = CompareBlockKey(block_ids[mid], target);
+    if (!status_.ok()) {
+      return false;
+    }
+    if (cmp < 0) {
+      // Key at "target" is larger than "mid". Therefore all
+      // blocks before or at "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "target" is <= "mid". Therefore all blocks
+      // after "mid" are uninteresting.
+      // If there is only one block left, we found it.
+      if (left == right) break;
+      right = mid;
+    }
+  }
+
+  if (left == right) {
+    // In one of the two following cases:
+    // (1) left is the first one of block_ids
+    // (2) there is a gap of blocks between block of `left` and `left-1`.
+    // we can further distinguish the case of key in the block or key not
+    // existing, by comparing the target key and the key of the previous
+    // block to the left of the block found.
+    if (block_ids[left] > 0 &&
+        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
+        CompareBlockKey(block_ids[left] - 1, target) > 0) {
+      current_ = restarts_;
+      *prefix_may_exist = false;
+      return false;
+    }
+
+    *index = block_ids[left];
+    return true;
+  } else {
+    assert(left > right);
+
+    // If the next block key is larger than seek key, it is possible that
+    // no key shares the prefix with `target`, or all keys with the same
+    // prefix as `target` are smaller than prefix. In the latter case,
+    // we are mandated to set the position the same as the total order.
+    // In the latter case, either:
+    // (1) `target` falls into the range of the next block. In this case,
+    //     we can place the iterator to the next block, or
+    // (2) `target` is larger than all block keys. In this case we can
+    //     keep the iterator invalidate without setting `prefix_may_exist`
+    //     to false.
+    // We might sometimes end up with setting the total order position
+    // while there is no key sharing the prefix as `target`, but it
+    // still follows the contract.
+    uint32_t right_index = block_ids[right];
+    assert(right_index + 1 <= num_restarts_);
+    if (right_index + 1 < num_restarts_) {
+      if (CompareBlockKey(right_index + 1, target) >= 0) {
+        *index = right_index + 1;
+        return true;
+      } else {
+        // We have to set the flag here because we are not positioning
+        // the iterator to the total order position.
+        *prefix_may_exist = false;
+      }
+    }
+
+    // Mark iterator invalid
+    current_ = restarts_;
+    return false;
+  }
+}
+
+bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
+                                bool* prefix_may_exist) {
+  assert(index);
+  assert(prefix_may_exist);
+  assert(prefix_index_);
+  *prefix_may_exist = true;
+  Slice seek_key = target;
+  if (raw_key_.IsUserKey()) {
+    seek_key = ExtractUserKey(target);
+  }
+  uint32_t* block_ids = nullptr;
+  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
+
+  if (num_blocks == 0) {
+    current_ = restarts_;
+    *prefix_may_exist = false;
+    return false;
+  } else {
+    assert(block_ids);
+    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
+                                prefix_may_exist);
+  }
+}
+
+uint32_t Block::NumRestarts() const {
+  assert(size_ >= 2 * sizeof(uint32_t));
+  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  uint32_t num_restarts = block_footer;
+  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+    // In BlockBuilder, we have ensured a block with HashIndex is less than
+    // kMaxBlockSizeSupportedByHashIndex (64KiB).
+    //
+    // Therefore, if we encounter a block with a size > 64KiB, the block
+    // cannot have HashIndex. So the footer will directly interpreted as
+    // num_restarts.
+    //
+    // Such check is for backward compatibility. We can ensure legacy block
+    // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
+    // correctly as no HashIndex even if the MSB of num_restarts is set.
+    return num_restarts;
+  }
+  BlockBasedTableOptions::DataBlockIndexType index_type;
+  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+  return num_restarts;
+}
+
+BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
+  assert(size_ >= 2 * sizeof(uint32_t));
+  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+    // The check is for the same reason as that in NumRestarts()
+    return BlockBasedTableOptions::kDataBlockBinarySearch;
+  }
+  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  uint32_t num_restarts = block_footer;
+  BlockBasedTableOptions::DataBlockIndexType index_type;
+  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+  return index_type;
+}
+
+Block::~Block() {
+  // This sync point can be re-enabled if RocksDB can control the
+  // initialization order of any/all static options created by the user.
+  // TEST_SYNC_POINT("Block::~Block");
+}
+
+Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
+             Statistics* statistics)
+    : contents_(std::move(contents)),
+      data_(contents_.data.data()),
+      size_(contents_.data.size()),
+      restart_offset_(0),
+      num_restarts_(0) {
+  TEST_SYNC_POINT("Block::Block:0");
+  if (size_ < sizeof(uint32_t)) {
+    size_ = 0;  // Error marker
+  } else {
+    // Should only decode restart points for uncompressed blocks
+    num_restarts_ = NumRestarts();
+    switch (IndexType()) {
+      case BlockBasedTableOptions::kDataBlockBinarySearch:
+        restart_offset_ = static_cast<uint32_t>(size_) -
+                          (1 + num_restarts_) * sizeof(uint32_t);
+        if (restart_offset_ > size_ - sizeof(uint32_t)) {
+          // The size is too small for NumRestarts() and therefore
+          // restart_offset_ wrapped around.
+          size_ = 0;
+        }
+        break;
+      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+        if (size_ < sizeof(uint32_t) /* block footer */ +
+                        sizeof(uint16_t) /* NUM_BUCK */) {
+          size_ = 0;
+          break;
+        }
+
+        uint16_t map_offset;
+        data_block_hash_index_.Initialize(
+            contents.data.data(),
+            static_cast<uint16_t>(contents.data.size() -
+                                  sizeof(uint32_t)), /*chop off
+                                                 NUM_RESTARTS*/
+            &map_offset);
+
+        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
+
+        if (restart_offset_ > map_offset) {
+          // map_offset is too small for NumRestarts() and
+          // therefore restart_offset_ wrapped around.
+          size_ = 0;
+          break;
+        }
+        break;
+      default:
+        size_ = 0;  // Error marker
+    }
+  }
+  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
+    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
+        restart_offset_, read_amp_bytes_per_bit, statistics));
+  }
+}
+
+MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
+  MetaBlockIter* iter = new MetaBlockIter();
+  if (size_ < 2 * sizeof(uint32_t)) {
+    iter->Invalidate(Status::Corruption("bad block contents"));
+    return iter;
+  } else if (num_restarts_ == 0) {
+    // Empty block.
+    iter->Invalidate(Status::OK());
+  } else {
+    iter->Initialize(data_, restart_offset_, num_restarts_,
+                     block_contents_pinned);
+  }
+  return iter;
+}
+
+DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
+                                      SequenceNumber global_seqno,
+                                      DataBlockIter* iter, Statistics* stats,
+                                      bool block_contents_pinned) {
+  DataBlockIter* ret_iter;
+  if (iter != nullptr) {
+    ret_iter = iter;
+  } else {
+    ret_iter = new DataBlockIter;
+  }
+  if (size_ < 2 * sizeof(uint32_t)) {
+    ret_iter->Invalidate(Status::Corruption("bad block contents"));
+    return ret_iter;
+  }
+  if (num_restarts_ == 0) {
+    // Empty block.
+    ret_iter->Invalidate(Status::OK());
+    return ret_iter;
+  } else {
+    ret_iter->Initialize(
+        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
+        read_amp_bitmap_.get(), block_contents_pinned,
+        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
+    if (read_amp_bitmap_) {
+      if (read_amp_bitmap_->GetStatistics() != stats) {
+        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
+        read_amp_bitmap_->SetStatistics(stats);
+      }
+    }
+  }
+
+  return ret_iter;
+}
+
+IndexBlockIter* Block::NewIndexIterator(
+    const Comparator* raw_ucmp, SequenceNumber global_seqno,
+    IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
+    bool have_first_key, bool key_includes_seq, bool value_is_full,
+    bool block_contents_pinned, BlockPrefixIndex* prefix_index) {
+  IndexBlockIter* ret_iter;
+  if (iter != nullptr) {
+    ret_iter = iter;
+  } else {
+    ret_iter = new IndexBlockIter;
+  }
+  if (size_ < 2 * sizeof(uint32_t)) {
+    ret_iter->Invalidate(Status::Corruption("bad block contents"));
+    return ret_iter;
+  }
+  if (num_restarts_ == 0) {
+    // Empty block.
+    ret_iter->Invalidate(Status::OK());
+    return ret_iter;
+  } else {
+    BlockPrefixIndex* prefix_index_ptr =
+        total_order_seek ? nullptr : prefix_index;
+    ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_,
+                         global_seqno, prefix_index_ptr, have_first_key,
+                         key_includes_seq, value_is_full,
+                         block_contents_pinned);
+  }
+
+  return ret_iter;
+}
+
+size_t Block::ApproximateMemoryUsage() const {
+  size_t usage = usable_size();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size((void*)this);
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  if (read_amp_bitmap_) {
+    usage += read_amp_bitmap_->ApproximateMemoryUsage();
+  }
+  return usage;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block.h b/src/rocksdb/table/block_based/block.h
new file mode 100644
index 000000000..5d73f72f6
--- /dev/null
+++ b/src/rocksdb/table/block_based/block.h
@@ -0,0 +1,744 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "db/pinned_iterators_manager.h"
+#include "port/malloc.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_prefix_index.h"
+#include "table/block_based/data_block_hash_index.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct BlockContents;
+class Comparator;
+template <class TValue>
+class BlockIter;
+class DataBlockIter;
+class IndexBlockIter;
+class MetaBlockIter;
+class BlockPrefixIndex;
+
+// BlockReadAmpBitmap is a bitmap that map the ROCKSDB_NAMESPACE::Block data
+// bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of
+// bytes in the Block we update the bitmap and increment
+// READ_AMP_ESTIMATE_USEFUL_BYTES.
+class BlockReadAmpBitmap {
+ public:
+  explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
+                              Statistics* statistics)
+      : bitmap_(nullptr),
+        bytes_per_bit_pow_(0),
+        statistics_(statistics),
+        rnd_(Random::GetTLSInstance()->Uniform(
+            static_cast<int>(bytes_per_bit))) {
+    TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
+    assert(block_size > 0 && bytes_per_bit > 0);
+
+    // convert bytes_per_bit to be a power of 2
+    while (bytes_per_bit >>= 1) {
+      bytes_per_bit_pow_++;
+    }
+
+    // num_bits_needed = ceil(block_size / bytes_per_bit)
+    size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
+    assert(num_bits_needed > 0);
+
+    // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
+    size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;
+
+    // Create bitmap and set all the bits to 0
+    bitmap_ = new std::atomic<uint32_t>[bitmap_size]();
+
+    RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
+  }
+
+  ~BlockReadAmpBitmap() { delete[] bitmap_; }
+
+  void Mark(uint32_t start_offset, uint32_t end_offset) {
+    assert(end_offset >= start_offset);
+    // Index of first bit in mask
+    uint32_t start_bit =
+        (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
+        bytes_per_bit_pow_;
+    // Index of last bit in mask + 1
+    uint32_t exclusive_end_bit =
+        (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
+    if (start_bit >= exclusive_end_bit) {
+      return;
+    }
+    assert(exclusive_end_bit > 0);
+
+    if (GetAndSet(start_bit) == 0) {
+      uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
+                                  << bytes_per_bit_pow_;
+      RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
+                 new_useful_bytes);
+    }
+  }
+
+  Statistics* GetStatistics() {
+    return statistics_.load(std::memory_order_relaxed);
+  }
+
+  void SetStatistics(Statistics* stats) { statistics_.store(stats); }
+
+  uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }
+
+  size_t ApproximateMemoryUsage() const {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    return malloc_usable_size((void*)this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return sizeof(*this);
+  }
+
+ private:
+  // Get the current value of bit at `bit_idx` and set it to 1
+  inline bool GetAndSet(uint32_t bit_idx) {
+    const uint32_t byte_idx = bit_idx / kBitsPerEntry;
+    const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);
+
+    return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
+           bit_mask;
+  }
+
+  const uint32_t kBytesPersEntry = sizeof(uint32_t);   // 4 bytes
+  const uint32_t kBitsPerEntry = kBytesPersEntry * 8;  // 32 bits
+
+  // Bitmap used to record the bytes that we read, use atomic to protect
+  // against multiple threads updating the same bit
+  std::atomic<uint32_t>* bitmap_;
+  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize
+  // muliplication and division
+  uint8_t bytes_per_bit_pow_;
+  // Pointer to DB Statistics object, Since this bitmap may outlive the DB
+  // this pointer maybe invalid, but the DB will update it to a valid pointer
+  // by using SetStatistics() before calling Mark()
+  std::atomic<Statistics*> statistics_;
+  uint32_t rnd_;
+};
+
+// class Block is the uncompressed and "parsed" form for blocks containing
+// key-value pairs. (See BlockContents comments for more on terminology.)
+// This includes the in-memory representation of data blocks, index blocks
+// (including partitions), range deletion blocks, properties blocks, metaindex
+// blocks, as well as the top level of the partitioned filter structure (which
+// is actually an index of the filter partitions). It is NOT suitable for
+// compressed blocks in general, filter blocks/partitions, or compression
+// dictionaries.
+//
+// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
+// for details of the format and the various block types.
+//
+// TODO: Rename to ParsedKvBlock?
+class Block {
+ public:
+  // Initialize the block with the specified contents.
+  explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0,
+                 Statistics* statistics = nullptr);
+  // No copying allowed
+  Block(const Block&) = delete;
+  void operator=(const Block&) = delete;
+
+  ~Block();
+
+  size_t size() const { return size_; }
+  const char* data() const { return data_; }
+  // The additional memory space taken by the block data.
+  size_t usable_size() const { return contents_.usable_size(); }
+  uint32_t NumRestarts() const;
+  bool own_bytes() const { return contents_.own_bytes(); }
+
+  BlockBasedTableOptions::DataBlockIndexType IndexType() const;
+
+  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
+  // comparator.
+  //
+  // If iter is null, return new Iterator
+  // If iter is not null, update this one and return it as Iterator*
+  //
+  // Updates read_amp_bitmap_ if it is not nullptr.
+  //
+  // If `block_contents_pinned` is true, the caller will guarantee that when
+  // the cleanup functions are transferred from the iterator to other
+  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
+  // valid. Either the iterator holds cache handle or ownership of some resource
+  // and release them in a release function, or caller is sure that the data
+  // will not go away (for example, it's from mmapped file which will not be
+  // closed).
+  //
+  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
+  // the iterator will simply be set as "invalid", rather than returning
+  // the key that is just pass the target key.
+  DataBlockIter* NewDataIterator(const Comparator* raw_ucmp,
+                                 SequenceNumber global_seqno,
+                                 DataBlockIter* iter = nullptr,
+                                 Statistics* stats = nullptr,
+                                 bool block_contents_pinned = false);
+
+  // Returns an MetaBlockIter for iterating over blocks containing metadata
+  // (like Properties blocks).  Unlike data blocks, the keys for these blocks
+  // do not contain sequence numbers, do not use a user-define comparator, and
+  // do not track read amplification/statistics.  Additionally, MetaBlocks will
+  // not assert if the block is formatted improperly.
+  //
+  // If `block_contents_pinned` is true, the caller will guarantee that when
+  // the cleanup functions are transferred from the iterator to other
+  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
+  // valid. Either the iterator holds cache handle or ownership of some resource
+  // and release them in a release function, or caller is sure that the data
+  // will not go away (for example, it's from mmapped file which will not be
+  // closed).
+  MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false);
+
+  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
+  // comparator.
+  //
+  // key_includes_seq, default true, means that the keys are in internal key
+  // format.
+  // value_is_full, default true, means that no delta encoding is
+  // applied to values.
+  //
+  // If `prefix_index` is not nullptr this block will do hash lookup for the key
+  // prefix. If total_order_seek is true, prefix_index_ is ignored.
+  //
+  // `have_first_key` controls whether IndexValue will contain
+  // first_internal_key. It affects data serialization format, so the same value
+  // have_first_key must be used when writing and reading index.
+  // It is determined by IndexType property of the table.
+  IndexBlockIter* NewIndexIterator(const Comparator* raw_ucmp,
+                                   SequenceNumber global_seqno,
+                                   IndexBlockIter* iter, Statistics* stats,
+                                   bool total_order_seek, bool have_first_key,
+                                   bool key_includes_seq, bool value_is_full,
+                                   bool block_contents_pinned = false,
+                                   BlockPrefixIndex* prefix_index = nullptr);
+
+  // Report an approximation of how much memory has been used.
+  size_t ApproximateMemoryUsage() const;
+
+ private:
+  BlockContents contents_;
+  const char* data_;         // contents_.data.data()
+  size_t size_;              // contents_.data.size()
+  uint32_t restart_offset_;  // Offset in data_ of restart array
+  uint32_t num_restarts_;
+  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
+  DataBlockHashIndex data_block_hash_index_;
+};
+
+// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The
+// format of this data buffer is an uncompressed, sorted sequence of key-value
+// pairs (see `Block` API for more details).
+//
+// Notably, the keys may either be in internal key format or user key format.
+// Subclasses are responsible for configuring the key format.
+//
+// `BlockIter` intends to provide final overrides for all of
+// `InternalIteratorBase` functions that can move the iterator. It does
+// this to guarantee `UpdateKey()` is called exactly once after each key
+// movement potentially visible to users. In this step, the key is prepared
+// (e.g., serialized if global seqno is in effect) so it can be returned
+// immediately when the user asks for it via calling `key() const`.
+//
+// For its subclasses, it provides protected variants of the above-mentioned
+// final-overridden methods. They are named with the "Impl" suffix, e.g.,
+// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These
+// "Impl" functions are responsible for positioning `raw_key_` but not
+// invoking `UpdateKey()`.
+template <class TValue>
+class BlockIter : public InternalIteratorBase<TValue> {
+ public:
+  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
+  // nothing. Calls cleanup functions.
+  virtual void Invalidate(const Status& s) {
+    // Assert that the BlockIter is never deleted while Pinning is Enabled.
+    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
+
+    data_ = nullptr;
+    current_ = restarts_;
+    status_ = s;
+
+    // Call cleanup callbacks.
+    Cleanable::Reset();
+  }
+
+  bool Valid() const override { return current_ < restarts_; }
+
+  virtual void SeekToFirst() override final {
+    SeekToFirstImpl();
+    UpdateKey();
+  }
+
+  virtual void SeekToLast() override final {
+    SeekToLastImpl();
+    UpdateKey();
+  }
+
+  virtual void Seek(const Slice& target) override final {
+    SeekImpl(target);
+    UpdateKey();
+  }
+
+  virtual void SeekForPrev(const Slice& target) override final {
+    SeekForPrevImpl(target);
+    UpdateKey();
+  }
+
+  virtual void Next() override final {
+    NextImpl();
+    UpdateKey();
+  }
+
+  virtual bool NextAndGetResult(IterateResult* result) override final {
+    // This does not need to call `UpdateKey()` as the parent class only has
+    // access to the `UpdateKey()`-invoking functions.
+    return InternalIteratorBase<TValue>::NextAndGetResult(result);
+  }
+
+  virtual void Prev() override final {
+    PrevImpl();
+    UpdateKey();
+  }
+
+  Status status() const override { return status_; }
+  Slice key() const override {
+    assert(Valid());
+    return key_;
+  }
+
+#ifndef NDEBUG
+  ~BlockIter() override {
+    // Assert that the BlockIter is never deleted while Pinning is Enabled.
+    assert(!pinned_iters_mgr_ ||
+           (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled()));
+    status_.PermitUncheckedError();
+  }
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+  }
+  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
+#endif
+
+  bool IsKeyPinned() const override {
+    return block_contents_pinned_ && key_pinned_;
+  }
+
+  bool IsValuePinned() const override { return block_contents_pinned_; }
+
+  size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }
+
+  uint32_t ValueOffset() const {
+    return static_cast<uint32_t>(value_.data() - data_);
+  }
+
+  void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }
+
+  Cache::Handle* cache_handle() { return cache_handle_; }
+
+ protected:
+  std::unique_ptr<InternalKeyComparator> icmp_;
+  const char* data_;       // underlying block contents
+  uint32_t num_restarts_;  // Number of uint32_t entries in restart array
+
+  // Index of restart block in which current_ or current_-1 falls
+  uint32_t restart_index_;
+  uint32_t restarts_;  // Offset of restart array (list of fixed32)
+  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
+  uint32_t current_;
+  // Raw key from block.
+  IterKey raw_key_;
+  // Buffer for key data when global seqno assignment is enabled.
+  IterKey key_buf_;
+  Slice value_;
+  Status status_;
+  // Key to be exposed to users.
+  Slice key_;
+  bool key_pinned_;
+  // Whether the block data is guaranteed to outlive this iterator, and
+  // as long as the cleanup functions are transferred to another class,
+  // e.g. PinnableSlice, the pointer to the bytes will still be valid.
+  bool block_contents_pinned_;
+  SequenceNumber global_seqno_;
+
+  virtual void SeekToFirstImpl() = 0;
+  virtual void SeekToLastImpl() = 0;
+  virtual void SeekImpl(const Slice& target) = 0;
+  virtual void SeekForPrevImpl(const Slice& target) = 0;
+  virtual void NextImpl() = 0;
+
+  virtual void PrevImpl() = 0;
+
+  template <typename DecodeEntryFunc>
+  inline bool ParseNextKey(bool* is_shared);
+
+  void InitializeBase(const Comparator* raw_ucmp, const char* data,
+                      uint32_t restarts, uint32_t num_restarts,
+                      SequenceNumber global_seqno, bool block_contents_pinned) {
+    assert(data_ == nullptr);  // Ensure it is called only once
+    assert(num_restarts > 0);  // Ensure the param is valid
+
+    icmp_ = std::make_unique<InternalKeyComparator>(raw_ucmp);
+    data_ = data;
+    restarts_ = restarts;
+    num_restarts_ = num_restarts;
+    current_ = restarts_;
+    restart_index_ = num_restarts_;
+    global_seqno_ = global_seqno;
+    block_contents_pinned_ = block_contents_pinned;
+    cache_handle_ = nullptr;
+  }
+
+  // Must be called every time a key is found that needs to be returned to user,
+  // and may be called when no key is found (as a no-op). Updates `key_`,
+  // `key_buf_`, and `key_pinned_` with info about the found key.
+  void UpdateKey() {
+    key_buf_.Clear();
+    if (!Valid()) {
+      return;
+    }
+    if (raw_key_.IsUserKey()) {
+      assert(global_seqno_ == kDisableGlobalSequenceNumber);
+      key_ = raw_key_.GetUserKey();
+      key_pinned_ = raw_key_.IsKeyPinned();
+    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
+      key_ = raw_key_.GetInternalKey();
+      key_pinned_ = raw_key_.IsKeyPinned();
+    } else {
+      key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_,
+                              ExtractValueType(raw_key_.GetInternalKey()));
+      key_ = key_buf_.GetInternalKey();
+      key_pinned_ = false;
+    }
+  }
+
+  // Returns the result of `Comparator::Compare()`, where the appropriate
+  // comparator is used for the block contents, the LHS argument is the current
+  // key with global seqno applied, and the RHS argument is `other`.
+  int CompareCurrentKey(const Slice& other) {
+    if (raw_key_.IsUserKey()) {
+      assert(global_seqno_ == kDisableGlobalSequenceNumber);
+      return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other);
+    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
+      return icmp_->Compare(raw_key_.GetInternalKey(), other);
+    }
+    return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other,
+                          kDisableGlobalSequenceNumber);
+  }
+
+ private:
+  // Store the cache handle, if the block is cached. We need this since the
+  // only other place the handle is stored is as an argument to the Cleanable
+  // function callback, which is hard to retrieve. When multiple value
+  // PinnableSlices reference the block, they need the cache handle in order
+  // to bump up the ref count
+  Cache::Handle* cache_handle_;
+
+ public:
+  // Return the offset in data_ just past the end of the current entry.
+  inline uint32_t NextEntryOffset() const {
+    // NOTE: We don't support blocks bigger than 2GB
+    return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
+  }
+
+  uint32_t GetRestartPoint(uint32_t index) {
+    assert(index < num_restarts_);
+    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+  }
+
+  void SeekToRestartPoint(uint32_t index) {
+    raw_key_.Clear();
+    restart_index_ = index;
+    // current_ will be fixed by ParseNextKey();
+
+    // ParseNextKey() starts at the end of value_, so set value_ accordingly
+    uint32_t offset = GetRestartPoint(index);
+    value_ = Slice(data_ + offset, 0);
+  }
+
+  void CorruptionError();
+
+ protected:
+  template <typename DecodeKeyFunc>
+  inline bool BinarySeek(const Slice& target, uint32_t* index,
+                         bool* is_index_key_result);
+
+  void FindKeyAfterBinarySeek(const Slice& target, uint32_t index,
+                              bool is_index_key_result);
+};
+
+class DataBlockIter final : public BlockIter<Slice> {
+ public:
+  DataBlockIter()
+      : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
+  DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts,
+                uint32_t num_restarts, SequenceNumber global_seqno,
+                BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
+                DataBlockHashIndex* data_block_hash_index)
+      : DataBlockIter() {
+    Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno,
+               read_amp_bitmap, block_contents_pinned, data_block_hash_index);
+  }
+  void Initialize(const Comparator* raw_ucmp, const char* data,
+                  uint32_t restarts, uint32_t num_restarts,
+                  SequenceNumber global_seqno,
+                  BlockReadAmpBitmap* read_amp_bitmap,
+                  bool block_contents_pinned,
+                  DataBlockHashIndex* data_block_hash_index) {
+    InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno,
+                   block_contents_pinned);
+    raw_key_.SetIsUserKey(false);
+    read_amp_bitmap_ = read_amp_bitmap;
+    last_bitmap_offset_ = current_ + 1;
+    data_block_hash_index_ = data_block_hash_index;
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    if (read_amp_bitmap_ && current_ < restarts_ &&
+        current_ != last_bitmap_offset_) {
+      read_amp_bitmap_->Mark(current_ /* current entry offset */,
+                             NextEntryOffset() - 1);
+      last_bitmap_offset_ = current_;
+    }
+    return value_;
+  }
+
+  inline bool SeekForGet(const Slice& target) {
+    if (!data_block_hash_index_) {
+      SeekImpl(target);
+      UpdateKey();
+      return true;
+    }
+    bool res = SeekForGetImpl(target);
+    UpdateKey();
+    return res;
+  }
+
+  void Invalidate(const Status& s) override {
+    BlockIter::Invalidate(s);
+    // Clear prev entries cache.
+    prev_entries_keys_buff_.clear();
+    prev_entries_.clear();
+    prev_entries_idx_ = -1;
+  }
+
+ protected:
+  friend Block;
+  inline bool ParseNextDataKey(bool* is_shared);
+  void SeekToFirstImpl() override;
+  void SeekToLastImpl() override;
+  void SeekImpl(const Slice& target) override;
+  void SeekForPrevImpl(const Slice& target) override;
+  void NextImpl() override;
+  void PrevImpl() override;
+
+ private:
+  // read-amp bitmap
+  BlockReadAmpBitmap* read_amp_bitmap_;
+  // last `current_` value we report to read-amp bitmp
+  mutable uint32_t last_bitmap_offset_;
+  struct CachedPrevEntry {
+    explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
+                             size_t _key_offset, size_t _key_size, Slice _value)
+        : offset(_offset),
+          key_ptr(_key_ptr),
+          key_offset(_key_offset),
+          key_size(_key_size),
+          value(_value) {}
+
+    // offset of entry in block
+    uint32_t offset;
+    // Pointer to key data in block (nullptr if key is delta-encoded)
+    const char* key_ptr;
+    // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
+    size_t key_offset;
+    // size of key
+    size_t key_size;
+    // value slice pointing to data in block
+    Slice value;
+  };
+  std::string prev_entries_keys_buff_;
+  std::vector<CachedPrevEntry> prev_entries_;
+  int32_t prev_entries_idx_ = -1;
+
+  DataBlockHashIndex* data_block_hash_index_;
+
+  bool SeekForGetImpl(const Slice& target);
+};
+
+// Iterator over MetaBlocks.  MetaBlocks are similar to Data Blocks and
+// are used to store Properties associated with table.
+// Meta blocks always store user keys (no sequence number) and always
+// use the BytewiseComparator.  Additionally, MetaBlock accesses are
+// not recorded in the Statistics or for Read-Amplification.
+class MetaBlockIter final : public BlockIter<Slice> {
+ public:
+  MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
+  void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
+                  bool block_contents_pinned) {
+    // Initializes the iterator with a BytewiseComparator and
+    // the raw key being a user key.
+    InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
+                   kDisableGlobalSequenceNumber, block_contents_pinned);
+    raw_key_.SetIsUserKey(true);
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    return value_;
+  }
+
+ protected:
+  void SeekToFirstImpl() override;
+  void SeekToLastImpl() override;
+  void SeekImpl(const Slice& target) override;
+  void SeekForPrevImpl(const Slice& target) override;
+  void NextImpl() override;
+  void PrevImpl() override;
+};
+
+class IndexBlockIter final : public BlockIter<IndexValue> {
+ public:
+  IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}
+
+  // key_includes_seq, default true, means that the keys are in internal key
+  // format.
+  // value_is_full, default true, means that no delta encoding is
+  // applied to values.
+  void Initialize(const Comparator* raw_ucmp, const char* data,
+                  uint32_t restarts, uint32_t num_restarts,
+                  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
+                  bool have_first_key, bool key_includes_seq,
+                  bool value_is_full, bool block_contents_pinned) {
+    InitializeBase(raw_ucmp, data, restarts, num_restarts,
+                   kDisableGlobalSequenceNumber, block_contents_pinned);
+    raw_key_.SetIsUserKey(!key_includes_seq);
+    prefix_index_ = prefix_index;
+    value_delta_encoded_ = !value_is_full;
+    have_first_key_ = have_first_key;
+    if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
+      global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
+    } else {
+      global_seqno_state_.reset();
+    }
+  }
+
+  Slice user_key() const override {
+    assert(Valid());
+    return raw_key_.GetUserKey();
+  }
+
+  IndexValue value() const override {
+    assert(Valid());
+    if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
+      return decoded_value_;
+    } else {
+      IndexValue entry;
+      Slice v = value_;
+      Status decode_s __attribute__((__unused__)) =
+          entry.DecodeFrom(&v, have_first_key_, nullptr);
+      assert(decode_s.ok());
+      return entry;
+    }
+  }
+
+  bool IsValuePinned() const override {
+    return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
+  }
+
+ protected:
+  // IndexBlockIter follows a different contract for prefix iterator
+  // from data iterators.
+  // If prefix of the seek key `target` exists in the file, it must
+  // return the same result as total order seek.
+  // If the prefix of `target` doesn't exist in the file, it can either
+  // return the result of total order seek, or set both of Valid() = false
+  // and status() = NotFound().
+  void SeekImpl(const Slice& target) override;
+
+  void SeekForPrevImpl(const Slice&) override {
+    assert(false);
+    current_ = restarts_;
+    restart_index_ = num_restarts_;
+    status_ = Status::InvalidArgument(
+        "RocksDB internal error: should never call SeekForPrev() on index "
+        "blocks");
+    raw_key_.Clear();
+    value_.clear();
+  }
+
+  void PrevImpl() override;
+
+  void NextImpl() override;
+
+  void SeekToFirstImpl() override;
+
+  void SeekToLastImpl() override;
+
+ private:
+  bool value_delta_encoded_;
+  bool have_first_key_;  // value includes first_internal_key
+  BlockPrefixIndex* prefix_index_;
+  // Whether the value is delta encoded. In that case the value is assumed to be
+  // BlockHandle. The first value in each restart interval is the full encoded
+  // BlockHandle; the restart of encoded size part of the BlockHandle. The
+  // offset of delta encoded BlockHandles is computed by adding the size of
+  // previous delta encoded values in the same restart interval to the offset of
+  // the first value in that restart interval.
+  IndexValue decoded_value_;
+
+  // When sequence number overwriting is enabled, this struct contains the seqno
+  // to overwrite with, and current first_internal_key with overwritten seqno.
+  // This is rarely used, so we put it behind a pointer and only allocate when
+  // needed.
+  struct GlobalSeqnoState {
+    // First internal key according to current index entry, but with sequence
+    // number overwritten to global_seqno.
+    IterKey first_internal_key;
+    SequenceNumber global_seqno;
+
+    explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
+  };
+
+  std::unique_ptr<GlobalSeqnoState> global_seqno_state_;
+
+  // Set *prefix_may_exist to false if no key possibly share the same prefix
+  // as `target`. If not set, the result position should be the same as total
+  // order Seek.
+  bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist);
+  // Set *prefix_may_exist to false if no key can possibly share the same
+  // prefix as `target`. If not set, the result position should be the same
+  // as total order seek.
+  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
+                            uint32_t left, uint32_t right, uint32_t* index,
+                            bool* prefix_may_exist);
+  inline int CompareBlockKey(uint32_t block_index, const Slice& target);
+
+  inline bool ParseNextIndexKey();
+
+  // When value_delta_encoded_ is enabled it decodes the value which is assumed
+  // to be BlockHandle and put it to decoded_value_
+  inline void DecodeCurrentValue(bool is_shared);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_builder.cc b/src/rocksdb/table/block_based/block_based_table_builder.cc
new file mode 100644
index 000000000..fed69af07
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_builder.cc
@@ -0,0 +1,2096 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/block_based_table_builder.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <atomic>
+#include <list>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_helpers.h"
+#include "cache/cache_key.h"
+#include "cache/cache_reservation_manager.h"
+#include "db/dbformat.h"
+#include "index_builder.h"
+#include "logging/logging.h"
+#include "memory/memory_allocator.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/table.h"
+#include "rocksdb/types.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/block_like_traits.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/work_queue.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+
+// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
+namespace {
+
+constexpr size_t kBlockTrailerSize = BlockBasedTable::kBlockTrailerSize;
+
+// Create a filter block builder based on its type.
+FilterBlockBuilder* CreateFilterBlockBuilder(
+    const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt,
+    const FilterBuildingContext& context,
+    const bool use_delta_encoding_for_index_values,
+    PartitionedIndexBuilder* const p_index_builder) {
+  const BlockBasedTableOptions& table_opt = context.table_options;
+  assert(table_opt.filter_policy);  // precondition
+
+  FilterBitsBuilder* filter_bits_builder =
+      BloomFilterPolicy::GetBuilderFromContext(context);
+  if (filter_bits_builder == nullptr) {
+    return nullptr;
+  } else {
+    if (table_opt.partition_filters) {
+      assert(p_index_builder != nullptr);
+      // Since after partition cut request from filter builder it takes time
+      // until index builder actully cuts the partition, until the end of a
+      // data block potentially with many keys, we take the lower bound as
+      // partition size.
+      assert(table_opt.block_size_deviation <= 100);
+      auto partition_size =
+          static_cast<uint32_t>(((table_opt.metadata_block_size *
+                                  (100 - table_opt.block_size_deviation)) +
+                                 99) /
+                                100);
+      partition_size = std::max(partition_size, static_cast<uint32_t>(1));
+      return new PartitionedFilterBlockBuilder(
+          mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
+          filter_bits_builder, table_opt.index_block_restart_interval,
+          use_delta_encoding_for_index_values, p_index_builder, partition_size);
+    } else {
+      return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
+                                        table_opt.whole_key_filtering,
+                                        filter_bits_builder);
+    }
+  }
+}
+
+bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size) {
+  // Check to see if compressed less than 12.5%
+  return compressed_size < uncomp_size - (uncomp_size / 8u);
+}
+
+}  // namespace
+
+// format_version is the block format as defined in include/rocksdb/table.h
+Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
+                    CompressionType* type, uint32_t format_version,
+                    bool do_sample, std::string* compressed_output,
+                    std::string* sampled_output_fast,
+                    std::string* sampled_output_slow) {
+  assert(type);
+  assert(compressed_output);
+  assert(compressed_output->empty());
+
+  // If requested, we sample one in every N block with a
+  // fast and slow compression algorithm and report the stats.
+  // The users can use these stats to decide if it is worthwhile
+  // enabling compression and they also get a hint about which
+  // compression algorithm wil be beneficial.
+  if (do_sample && info.SampleForCompression() &&
+      Random::GetTLSInstance()->OneIn(
+          static_cast<int>(info.SampleForCompression()))) {
+    // Sampling with a fast compression algorithm
+    if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) {
+      CompressionType c =
+          LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
+      CompressionContext context(c);
+      CompressionOptions options;
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c,
+                               info.SampleForCompression());
+
+      CompressData(uncompressed_data, info_tmp,
+                   GetCompressFormatForVersion(format_version),
+                   sampled_output_fast);
+    }
+
+    // Sampling with a slow but high-compression algorithm
+    if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) {
+      CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression;
+      CompressionContext context(c);
+      CompressionOptions options;
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c,
+                               info.SampleForCompression());
+
+      CompressData(uncompressed_data, info_tmp,
+                   GetCompressFormatForVersion(format_version),
+                   sampled_output_slow);
+    }
+  }
+
+  if (info.type() == kNoCompression) {
+    *type = kNoCompression;
+    return uncompressed_data;
+  }
+
+  // Actually compress the data; if the compression method is not supported,
+  // or the compression fails etc., just fall back to uncompressed
+  if (!CompressData(uncompressed_data, info,
+                    GetCompressFormatForVersion(format_version),
+                    compressed_output)) {
+    *type = kNoCompression;
+    return uncompressed_data;
+  }
+
+  // Check the compression ratio; if it's not good enough, just fall back to
+  // uncompressed
+  if (!GoodCompressionRatio(compressed_output->size(),
+                            uncompressed_data.size())) {
+    *type = kNoCompression;
+    return uncompressed_data;
+  }
+
+  *type = info.type();
+  return *compressed_output;
+}
+
+// kBlockBasedTableMagicNumber was picked by running
+//    echo rocksdb.table.block_based | sha1sum
+// and taking the leading 64 bits.
+// Please note that kBlockBasedTableMagicNumber may also be accessed by other
+// .cc files
+// for that reason we declare it extern in the header but to get the space
+// allocated
+// it must be not extern in one place.
+const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
+// We also support reading and writing legacy block based table format (for
+// backwards compatibility)
+const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
+
+// A collector that collects properties of interest to block-based table.
+// For now this class looks heavy-weight since we only write one additional
+// property.
+// But in the foreseeable future, we will add more and more properties that are
+// specific to block-based table.
+class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
+    : public IntTblPropCollector {
+ public:
+  explicit BlockBasedTablePropertiesCollector(
+      BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
+      bool prefix_filtering)
+      : index_type_(index_type),
+        whole_key_filtering_(whole_key_filtering),
+        prefix_filtering_(prefix_filtering) {}
+
+  Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
+                     uint64_t /*file_size*/) override {
+    // Intentionally left blank. Have no interest in collecting stats for
+    // individual key/value pairs.
+    return Status::OK();
+  }
+
+  virtual void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                        uint64_t /* block_compressed_bytes_fast */,
+                        uint64_t /* block_compressed_bytes_slow */) override {
+    // Intentionally left blank. No interest in collecting stats for
+    // blocks.
+    return;
+  }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string val;
+    PutFixed32(&val, static_cast<uint32_t>(index_type_));
+    properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
+    properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering,
+                        whole_key_filtering_ ? kPropTrue : kPropFalse});
+    properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
+                        prefix_filtering_ ? kPropTrue : kPropFalse});
+    return Status::OK();
+  }
+
+  // The name of the properties collector can be used for debugging purpose.
+  const char* Name() const override {
+    return "BlockBasedTablePropertiesCollector";
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    // Intentionally left blank.
+    return UserCollectedProperties();
+  }
+
+ private:
+  BlockBasedTableOptions::IndexType index_type_;
+  bool whole_key_filtering_;
+  bool prefix_filtering_;
+};
+
+struct BlockBasedTableBuilder::Rep {
+  const ImmutableOptions ioptions;
+  const MutableCFOptions moptions;
+  const BlockBasedTableOptions table_options;
+  const InternalKeyComparator& internal_comparator;
+  WritableFileWriter* file;
+  std::atomic<uint64_t> offset;
+  size_t alignment;
+  BlockBuilder data_block;
+  // Buffers uncompressed data blocks to replay later. Needed when
+  // compression dictionary is enabled so we can finalize the dictionary before
+  // compressing any data blocks.
+  std::vector<std::string> data_block_buffers;
+  BlockBuilder range_del_block;
+
+  InternalKeySliceTransform internal_prefix_transform;
+  std::unique_ptr<IndexBuilder> index_builder;
+  PartitionedIndexBuilder* p_index_builder_ = nullptr;
+
+  std::string last_key;
+  const Slice* first_key_in_next_block = nullptr;
+  CompressionType compression_type;
+  uint64_t sample_for_compression;
+  std::atomic<uint64_t> compressible_input_data_bytes;
+  std::atomic<uint64_t> uncompressible_input_data_bytes;
+  std::atomic<uint64_t> sampled_input_data_bytes;
+  std::atomic<uint64_t> sampled_output_slow_data_bytes;
+  std::atomic<uint64_t> sampled_output_fast_data_bytes;
+  CompressionOptions compression_opts;
+  std::unique_ptr<CompressionDict> compression_dict;
+  std::vector<std::unique_ptr<CompressionContext>> compression_ctxs;
+  std::vector<std::unique_ptr<UncompressionContext>> verify_ctxs;
+  std::unique_ptr<UncompressionDict> verify_dict;
+
+  size_t data_begin_offset = 0;
+
+  TableProperties props;
+
+  // States of the builder.
+  //
+  // - `kBuffered`: This is the initial state where zero or more data blocks are
+  //   accumulated uncompressed in-memory. From this state, call
+  //   `EnterUnbuffered()` to finalize the compression dictionary if enabled,
+  //   compress/write out any buffered blocks, and proceed to the `kUnbuffered`
+  //   state.
+  //
+  // - `kUnbuffered`: This is the state when compression dictionary is finalized
+  //   either because it wasn't enabled in the first place or it's been created
+  //   from sampling previously buffered data. In this state, blocks are simply
+  //   compressed/written out as they fill up. From this state, call `Finish()`
+  //   to complete the file (write meta-blocks, etc.), or `Abandon()` to delete
+  //   the partially created file.
+  //
+  // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been
+  //   called, so the table builder is no longer usable. We must be in this
+  //   state by the time the destructor runs.
+  enum class State {
+    kBuffered,
+    kUnbuffered,
+    kClosed,
+  };
+  State state;
+  // `kBuffered` state is allowed only as long as the buffering of uncompressed
+  // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`.
+  uint64_t buffer_limit;
+  std::shared_ptr<CacheReservationManager>
+      compression_dict_buffer_cache_res_mgr;
+  const bool use_delta_encoding_for_index_values;
+  std::unique_ptr<FilterBlockBuilder> filter_builder;
+  OffsetableCacheKey base_cache_key;
+  const TableFileCreationReason reason;
+
+  BlockHandle pending_handle;  // Handle to add to index block
+
+  std::string compressed_output;
+  std::unique_ptr<FlushBlockPolicy> flush_block_policy;
+
+  std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors;
+
+  std::unique_ptr<ParallelCompressionRep> pc_rep;
+
+  uint64_t get_offset() { return offset.load(std::memory_order_relaxed); }
+  void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }
+
+  bool IsParallelCompressionEnabled() const {
+    return compression_opts.parallel_threads > 1;
+  }
+
+  Status GetStatus() {
+    // We need to make modifications of status visible when status_ok is set
+    // to false, and this is ensured by status_mutex, so no special memory
+    // order for status_ok is required.
+    if (status_ok.load(std::memory_order_relaxed)) {
+      return Status::OK();
+    } else {
+      return CopyStatus();
+    }
+  }
+
+  Status CopyStatus() {
+    std::lock_guard<std::mutex> lock(status_mutex);
+    return status;
+  }
+
+  IOStatus GetIOStatus() {
+    // We need to make modifications of io_status visible when status_ok is set
+    // to false, and this is ensured by io_status_mutex, so no special memory
+    // order for io_status_ok is required.
+    if (io_status_ok.load(std::memory_order_relaxed)) {
+      return IOStatus::OK();
+    } else {
+      return CopyIOStatus();
+    }
+  }
+
+  IOStatus CopyIOStatus() {
+    std::lock_guard<std::mutex> lock(io_status_mutex);
+    return io_status;
+  }
+
+  // Never erase an existing status that is not OK.
+  void SetStatus(Status s) {
+    if (!s.ok() && status_ok.load(std::memory_order_relaxed)) {
+      // Locking is an overkill for non compression_opts.parallel_threads
+      // case but since it's unlikely that s is not OK, we take this cost
+      // to be simplicity.
+      std::lock_guard<std::mutex> lock(status_mutex);
+      status = s;
+      status_ok.store(false, std::memory_order_relaxed);
+    }
+  }
+
+  // Never erase an existing I/O status that is not OK.
+  // Calling this will also SetStatus(ios)
+  void SetIOStatus(IOStatus ios) {
+    if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) {
+      // Locking is an overkill for non compression_opts.parallel_threads
+      // case but since it's unlikely that s is not OK, we take this cost
+      // to be simplicity.
+      std::lock_guard<std::mutex> lock(io_status_mutex);
+      io_status = ios;
+      io_status_ok.store(false, std::memory_order_relaxed);
+    }
+    SetStatus(ios);
+  }
+
+  Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo,
+      WritableFileWriter* f)
+      : ioptions(tbo.ioptions),
+        moptions(tbo.moptions),
+        table_options(table_opt),
+        internal_comparator(tbo.internal_comparator),
+        file(f),
+        offset(0),
+        alignment(table_options.block_align
+                      ? std::min(static_cast<size_t>(table_options.block_size),
+                                 kDefaultPageSize)
+                      : 0),
+        data_block(table_options.block_restart_interval,
+                   table_options.use_delta_encoding,
+                   false /* use_value_delta_encoding */,
+                   tbo.internal_comparator.user_comparator()
+                           ->CanKeysWithDifferentByteContentsBeEqual()
+                       ? BlockBasedTableOptions::kDataBlockBinarySearch
+                       : table_options.data_block_index_type,
+                   table_options.data_block_hash_table_util_ratio),
+        range_del_block(1 /* block_restart_interval */),
+        internal_prefix_transform(tbo.moptions.prefix_extractor.get()),
+        compression_type(tbo.compression_type),
+        sample_for_compression(tbo.moptions.sample_for_compression),
+        compressible_input_data_bytes(0),
+        uncompressible_input_data_bytes(0),
+        sampled_input_data_bytes(0),
+        sampled_output_slow_data_bytes(0),
+        sampled_output_fast_data_bytes(0),
+        compression_opts(tbo.compression_opts),
+        compression_dict(),
+        compression_ctxs(tbo.compression_opts.parallel_threads),
+        verify_ctxs(tbo.compression_opts.parallel_threads),
+        verify_dict(),
+        state((tbo.compression_opts.max_dict_bytes > 0) ? State::kBuffered
+                                                        : State::kUnbuffered),
+        use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
+                                            !table_opt.block_align),
+        reason(tbo.reason),
+        flush_block_policy(
+            table_options.flush_block_policy_factory->NewFlushBlockPolicy(
+                table_options, data_block)),
+        status_ok(true),
+        io_status_ok(true) {
+    if (tbo.target_file_size == 0) {
+      buffer_limit = compression_opts.max_dict_buffer_bytes;
+    } else if (compression_opts.max_dict_buffer_bytes == 0) {
+      buffer_limit = tbo.target_file_size;
+    } else {
+      buffer_limit = std::min(tbo.target_file_size,
+                              compression_opts.max_dict_buffer_bytes);
+    }
+
+    const auto compress_dict_build_buffer_charged =
+        table_options.cache_usage_options.options_overrides
+            .at(CacheEntryRole::kCompressionDictionaryBuildingBuffer)
+            .charged;
+    if (table_options.block_cache &&
+        (compress_dict_build_buffer_charged ==
+             CacheEntryRoleOptions::Decision::kEnabled ||
+         compress_dict_build_buffer_charged ==
+             CacheEntryRoleOptions::Decision::kFallback)) {
+      compression_dict_buffer_cache_res_mgr =
+          std::make_shared<CacheReservationManagerImpl<
+              CacheEntryRole::kCompressionDictionaryBuildingBuffer>>(
+              table_options.block_cache);
+    } else {
+      compression_dict_buffer_cache_res_mgr = nullptr;
+    }
+
+    for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) {
+      compression_ctxs[i].reset(new CompressionContext(compression_type));
+    }
+    if (table_options.index_type ==
+        BlockBasedTableOptions::kTwoLevelIndexSearch) {
+      p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder(
+          &internal_comparator, use_delta_encoding_for_index_values,
+          table_options);
+      index_builder.reset(p_index_builder_);
+    } else {
+      index_builder.reset(IndexBuilder::CreateIndexBuilder(
+          table_options.index_type, &internal_comparator,
+          &this->internal_prefix_transform, use_delta_encoding_for_index_values,
+          table_options));
+    }
+    if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) {
+      // Apply optimize_filters_for_hits setting here when applicable by
+      // skipping filter generation
+      filter_builder.reset();
+    } else if (tbo.skip_filters) {
+      // For SstFileWriter skip_filters
+      filter_builder.reset();
+    } else if (!table_options.filter_policy) {
+      // Null filter_policy -> no filter
+      filter_builder.reset();
+    } else {
+      FilterBuildingContext filter_context(table_options);
+
+      filter_context.info_log = ioptions.logger;
+      filter_context.column_family_name = tbo.column_family_name;
+      filter_context.reason = reason;
+
+      // Only populate other fields if known to be in LSM rather than
+      // generating external SST file
+      if (reason != TableFileCreationReason::kMisc) {
+        filter_context.compaction_style = ioptions.compaction_style;
+        filter_context.num_levels = ioptions.num_levels;
+        filter_context.level_at_creation = tbo.level_at_creation;
+        filter_context.is_bottommost = tbo.is_bottommost;
+        assert(filter_context.level_at_creation < filter_context.num_levels);
+      }
+
+      filter_builder.reset(CreateFilterBlockBuilder(
+          ioptions, moptions, filter_context,
+          use_delta_encoding_for_index_values, p_index_builder_));
+    }
+
+    assert(tbo.int_tbl_prop_collector_factories);
+    for (auto& factory : *tbo.int_tbl_prop_collector_factories) {
+      assert(factory);
+
+      table_properties_collectors.emplace_back(
+          factory->CreateIntTblPropCollector(tbo.column_family_id,
+                                             tbo.level_at_creation));
+    }
+    table_properties_collectors.emplace_back(
+        new BlockBasedTablePropertiesCollector(
+            table_options.index_type, table_options.whole_key_filtering,
+            moptions.prefix_extractor != nullptr));
+    const Comparator* ucmp = tbo.internal_comparator.user_comparator();
+    assert(ucmp);
+    if (ucmp->timestamp_size() > 0) {
+      table_properties_collectors.emplace_back(
+          new TimestampTablePropertiesCollector(ucmp));
+    }
+    if (table_options.verify_compression) {
+      for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) {
+        verify_ctxs[i].reset(new UncompressionContext(compression_type));
+      }
+    }
+
+    // These are only needed for populating table properties
+    props.column_family_id = tbo.column_family_id;
+    props.column_family_name = tbo.column_family_name;
+    props.oldest_key_time = tbo.oldest_key_time;
+    props.file_creation_time = tbo.file_creation_time;
+    props.orig_file_number = tbo.cur_file_num;
+    props.db_id = tbo.db_id;
+    props.db_session_id = tbo.db_session_id;
+    props.db_host_id = ioptions.db_host_id;
+    if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) {
+      ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set");
+    }
+  }
+
+  Rep(const Rep&) = delete;
+  Rep& operator=(const Rep&) = delete;
+
+ private:
+  // Synchronize status & io_status accesses across threads from main thread,
+  // compression thread and write thread in parallel compression.
+  std::mutex status_mutex;
+  std::atomic<bool> status_ok;
+  Status status;
+  std::mutex io_status_mutex;
+  std::atomic<bool> io_status_ok;
+  IOStatus io_status;
+};
+
+struct BlockBasedTableBuilder::ParallelCompressionRep {
+  // Keys is a wrapper of vector of strings avoiding
+  // releasing string memories during vector clear()
+  // in order to save memory allocation overhead
+  class Keys {
+   public:
+    Keys() : keys_(kKeysInitSize), size_(0) {}
+    void PushBack(const Slice& key) {
+      if (size_ == keys_.size()) {
+        keys_.emplace_back(key.data(), key.size());
+      } else {
+        keys_[size_].assign(key.data(), key.size());
+      }
+      size_++;
+    }
+    void SwapAssign(std::vector<std::string>& keys) {
+      size_ = keys.size();
+      std::swap(keys_, keys);
+    }
+    void Clear() { size_ = 0; }
+    size_t Size() { return size_; }
+    std::string& Back() { return keys_[size_ - 1]; }
+    std::string& operator[](size_t idx) {
+      assert(idx < size_);
+      return keys_[idx];
+    }
+
+   private:
+    const size_t kKeysInitSize = 32;
+    std::vector<std::string> keys_;
+    size_t size_;
+  };
+  std::unique_ptr<Keys> curr_block_keys;
+
+  class BlockRepSlot;
+
+  // BlockRep instances are fetched from and recycled to
+  // block_rep_pool during parallel compression.
+  struct BlockRep {
+    Slice contents;
+    Slice compressed_contents;
+    std::unique_ptr<std::string> data;
+    std::unique_ptr<std::string> compressed_data;
+    CompressionType compression_type;
+    std::unique_ptr<std::string> first_key_in_next_block;
+    std::unique_ptr<Keys> keys;
+    std::unique_ptr<BlockRepSlot> slot;
+    Status status;
+  };
+  // Use a vector of BlockRep as a buffer for a determined number
+  // of BlockRep structures. All data referenced by pointers in
+  // BlockRep will be freed when this vector is destructed.
+  using BlockRepBuffer = std::vector<BlockRep>;
+  BlockRepBuffer block_rep_buf;
+  // Use a thread-safe queue for concurrent access from block
+  // building thread and writer thread.
+  using BlockRepPool = WorkQueue<BlockRep*>;
+  BlockRepPool block_rep_pool;
+
+  // Use BlockRepSlot to keep block order in write thread.
+  // slot_ will pass references to BlockRep
+  class BlockRepSlot {
+   public:
+    BlockRepSlot() : slot_(1) {}
+    template <typename T>
+    void Fill(T&& rep) {
+      slot_.push(std::forward<T>(rep));
+    };
+    void Take(BlockRep*& rep) { slot_.pop(rep); }
+
+   private:
+    // slot_ will pass references to BlockRep in block_rep_buf,
+    // and those references are always valid before the destruction of
+    // block_rep_buf.
+    WorkQueue<BlockRep*> slot_;
+  };
+
+  // Compression queue will pass references to BlockRep in block_rep_buf,
+  // and those references are always valid before the destruction of
+  // block_rep_buf.
+  using CompressQueue = WorkQueue<BlockRep*>;
+  CompressQueue compress_queue;
+  std::vector<port::Thread> compress_thread_pool;
+
+  // Write queue will pass references to BlockRep::slot in block_rep_buf,
+  // and those references are always valid before the corresponding
+  // BlockRep::slot is destructed, which is before the destruction of
+  // block_rep_buf.
+  using WriteQueue = WorkQueue<BlockRepSlot*>;
+  WriteQueue write_queue;
+  std::unique_ptr<port::Thread> write_thread;
+
+  // Estimate output file size when parallel compression is enabled. This is
+  // necessary because compression & flush are no longer synchronized,
+  // and BlockBasedTableBuilder::FileSize() is no longer accurate.
+  // memory_order_relaxed suffices because accurate statistics is not required.
+  class FileSizeEstimator {
+   public:
+    explicit FileSizeEstimator()
+        : uncomp_bytes_compressed(0),
+          uncomp_bytes_curr_block(0),
+          uncomp_bytes_curr_block_set(false),
+          uncomp_bytes_inflight(0),
+          blocks_inflight(0),
+          curr_compression_ratio(0),
+          estimated_file_size(0) {}
+
+    // Estimate file size when a block is about to be emitted to
+    // compression thread
+    void EmitBlock(uint64_t uncomp_block_size, uint64_t curr_file_size) {
+      uint64_t new_uncomp_bytes_inflight =
+          uncomp_bytes_inflight.fetch_add(uncomp_block_size,
+                                          std::memory_order_relaxed) +
+          uncomp_block_size;
+
+      uint64_t new_blocks_inflight =
+          blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1;
+
+      estimated_file_size.store(
+          curr_file_size +
+              static_cast<uint64_t>(
+                  static_cast<double>(new_uncomp_bytes_inflight) *
+                  curr_compression_ratio.load(std::memory_order_relaxed)) +
+              new_blocks_inflight * kBlockTrailerSize,
+          std::memory_order_relaxed);
+    }
+
+    // Estimate file size when a block is already reaped from
+    // compression thread
+    void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) {
+      assert(uncomp_bytes_curr_block_set);
+
+      uint64_t new_uncomp_bytes_compressed =
+          uncomp_bytes_compressed + uncomp_bytes_curr_block;
+      assert(new_uncomp_bytes_compressed > 0);
+
+      curr_compression_ratio.store(
+          (curr_compression_ratio.load(std::memory_order_relaxed) *
+               uncomp_bytes_compressed +
+           compressed_block_size) /
+              static_cast<double>(new_uncomp_bytes_compressed),
+          std::memory_order_relaxed);
+      uncomp_bytes_compressed = new_uncomp_bytes_compressed;
+
+      uint64_t new_uncomp_bytes_inflight =
+          uncomp_bytes_inflight.fetch_sub(uncomp_bytes_curr_block,
+                                          std::memory_order_relaxed) -
+          uncomp_bytes_curr_block;
+
+      uint64_t new_blocks_inflight =
+          blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1;
+
+      estimated_file_size.store(
+          curr_file_size +
+              static_cast<uint64_t>(
+                  static_cast<double>(new_uncomp_bytes_inflight) *
+                  curr_compression_ratio.load(std::memory_order_relaxed)) +
+              new_blocks_inflight * kBlockTrailerSize,
+          std::memory_order_relaxed);
+
+      uncomp_bytes_curr_block_set = false;
+    }
+
+    void SetEstimatedFileSize(uint64_t size) {
+      estimated_file_size.store(size, std::memory_order_relaxed);
+    }
+
+    uint64_t GetEstimatedFileSize() {
+      return estimated_file_size.load(std::memory_order_relaxed);
+    }
+
+    void SetCurrBlockUncompSize(uint64_t size) {
+      uncomp_bytes_curr_block = size;
+      uncomp_bytes_curr_block_set = true;
+    }
+
+   private:
+    // Input bytes compressed so far.
+    uint64_t uncomp_bytes_compressed;
+    // Size of current block being appended.
+    uint64_t uncomp_bytes_curr_block;
+    // Whether uncomp_bytes_curr_block has been set for next
+    // ReapBlock call.
+    bool uncomp_bytes_curr_block_set;
+    // Input bytes under compression and not appended yet.
+    std::atomic<uint64_t> uncomp_bytes_inflight;
+    // Number of blocks under compression and not appended yet.
+    std::atomic<uint64_t> blocks_inflight;
+    // Current compression ratio, maintained by BGWorkWriteMaybeCompressedBlock.
+    std::atomic<double> curr_compression_ratio;
+    // Estimated SST file size.
+    std::atomic<uint64_t> estimated_file_size;
+  };
+  FileSizeEstimator file_size_estimator;
+
+  // Facilities used for waiting first block completion. Need to Wait for
+  // the completion of first block compression and flush to get a non-zero
+  // compression ratio.
+  std::atomic<bool> first_block_processed;
+  std::condition_variable first_block_cond;
+  std::mutex first_block_mutex;
+
+  explicit ParallelCompressionRep(uint32_t parallel_threads)
+      : curr_block_keys(new Keys()),
+        block_rep_buf(parallel_threads),
+        block_rep_pool(parallel_threads),
+        compress_queue(parallel_threads),
+        write_queue(parallel_threads),
+        first_block_processed(false) {
+    for (uint32_t i = 0; i < parallel_threads; i++) {
+      block_rep_buf[i].contents = Slice();
+      block_rep_buf[i].compressed_contents = Slice();
+      block_rep_buf[i].data.reset(new std::string());
+      block_rep_buf[i].compressed_data.reset(new std::string());
+      block_rep_buf[i].compression_type = CompressionType();
+      block_rep_buf[i].first_key_in_next_block.reset(new std::string());
+      block_rep_buf[i].keys.reset(new Keys());
+      block_rep_buf[i].slot.reset(new BlockRepSlot());
+      block_rep_buf[i].status = Status::OK();
+      block_rep_pool.push(&block_rep_buf[i]);
+    }
+  }
+
+  ~ParallelCompressionRep() { block_rep_pool.finish(); }
+
+  // Make a block prepared to be emitted to compression thread
+  // Used in non-buffered mode
+  BlockRep* PrepareBlock(CompressionType compression_type,
+                         const Slice* first_key_in_next_block,
+                         BlockBuilder* data_block) {
+    BlockRep* block_rep =
+        PrepareBlockInternal(compression_type, first_key_in_next_block);
+    assert(block_rep != nullptr);
+    data_block->SwapAndReset(*(block_rep->data));
+    block_rep->contents = *(block_rep->data);
+    std::swap(block_rep->keys, curr_block_keys);
+    curr_block_keys->Clear();
+    return block_rep;
+  }
+
+  // Used in EnterUnbuffered
+  BlockRep* PrepareBlock(CompressionType compression_type,
+                         const Slice* first_key_in_next_block,
+                         std::string* data_block,
+                         std::vector<std::string>* keys) {
+    BlockRep* block_rep =
+        PrepareBlockInternal(compression_type, first_key_in_next_block);
+    assert(block_rep != nullptr);
+    std::swap(*(block_rep->data), *data_block);
+    block_rep->contents = *(block_rep->data);
+    block_rep->keys->SwapAssign(*keys);
+    return block_rep;
+  }
+
+  // Emit a block to compression thread
+  void EmitBlock(BlockRep* block_rep) {
+    assert(block_rep != nullptr);
+    assert(block_rep->status.ok());
+    if (!write_queue.push(block_rep->slot.get())) {
+      return;
+    }
+    if (!compress_queue.push(block_rep)) {
+      return;
+    }
+
+    if (!first_block_processed.load(std::memory_order_relaxed)) {
+      std::unique_lock<std::mutex> lock(first_block_mutex);
+      first_block_cond.wait(lock, [this] {
+        return first_block_processed.load(std::memory_order_relaxed);
+      });
+    }
+  }
+
+  // Reap a block from compression thread
+  void ReapBlock(BlockRep* block_rep) {
+    assert(block_rep != nullptr);
+    block_rep->compressed_data->clear();
+    block_rep_pool.push(block_rep);
+
+    if (!first_block_processed.load(std::memory_order_relaxed)) {
+      std::lock_guard<std::mutex> lock(first_block_mutex);
+      first_block_processed.store(true, std::memory_order_relaxed);
+      first_block_cond.notify_one();
+    }
+  }
+
+ private:
+  BlockRep* PrepareBlockInternal(CompressionType compression_type,
+                                 const Slice* first_key_in_next_block) {
+    BlockRep* block_rep = nullptr;
+    block_rep_pool.pop(block_rep);
+    assert(block_rep != nullptr);
+
+    assert(block_rep->data);
+
+    block_rep->compression_type = compression_type;
+
+    if (first_key_in_next_block == nullptr) {
+      block_rep->first_key_in_next_block.reset(nullptr);
+    } else {
+      block_rep->first_key_in_next_block->assign(
+          first_key_in_next_block->data(), first_key_in_next_block->size());
+    }
+
+    return block_rep;
+  }
+};
+
+BlockBasedTableBuilder::BlockBasedTableBuilder(
+    const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo,
+    WritableFileWriter* file) {
+  BlockBasedTableOptions sanitized_table_options(table_options);
+  if (sanitized_table_options.format_version == 0 &&
+      sanitized_table_options.checksum != kCRC32c) {
+    ROCKS_LOG_WARN(
+        tbo.ioptions.logger,
+        "Silently converting format_version to 1 because checksum is "
+        "non-default");
+    // silently convert format_version to 1 to keep consistent with current
+    // behavior
+    sanitized_table_options.format_version = 1;
+  }
+
+  rep_ = new Rep(sanitized_table_options, tbo, file);
+
+  TEST_SYNC_POINT_CALLBACK(
+      "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey",
+      const_cast<TableProperties*>(&rep_->props));
+
+  BlockBasedTable::SetupBaseCacheKey(&rep_->props, tbo.db_session_id,
+                                     tbo.cur_file_num, &rep_->base_cache_key);
+
+  if (rep_->IsParallelCompressionEnabled()) {
+    StartParallelCompression();
+  }
+}
+
+BlockBasedTableBuilder::~BlockBasedTableBuilder() {
+  // Catch errors where caller forgot to call Finish()
+  assert(rep_->state == Rep::State::kClosed);
+  delete rep_;
+}
+
+void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
+  Rep* r = rep_;
+  assert(rep_->state != Rep::State::kClosed);
+  if (!ok()) return;
+  ValueType value_type = ExtractValueType(key);
+  if (IsValueType(value_type)) {
+#ifndef NDEBUG
+    if (r->props.num_entries > r->props.num_range_deletions) {
+      assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
+    }
+#endif  // !NDEBUG
+
+    auto should_flush = r->flush_block_policy->Update(key, value);
+    if (should_flush) {
+      assert(!r->data_block.empty());
+      r->first_key_in_next_block = &key;
+      Flush();
+      if (r->state == Rep::State::kBuffered) {
+        bool exceeds_buffer_limit =
+            (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit);
+        bool exceeds_global_block_cache_limit = false;
+
+        // Increase cache charging for the last buffered data block
+        // only if the block is not going to be unbuffered immediately
+        // and there exists a cache reservation manager
+        if (!exceeds_buffer_limit &&
+            r->compression_dict_buffer_cache_res_mgr != nullptr) {
+          Status s =
+              r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation(
+                  r->data_begin_offset);
+          exceeds_global_block_cache_limit = s.IsMemoryLimit();
+        }
+
+        if (exceeds_buffer_limit || exceeds_global_block_cache_limit) {
+          EnterUnbuffered();
+        }
+      }
+
+      // Add item to index block.
+      // We do not emit the index entry for a block until we have seen the
+      // first key for the next data block.  This allows us to use shorter
+      // keys in the index block.  For example, consider a block boundary
+      // between the keys "the quick brown fox" and "the who".  We can use
+      // "the r" as the key for the index block entry since it is >= all
+      // entries in the first block and < all entries in subsequent
+      // blocks.
+      if (ok() && r->state == Rep::State::kUnbuffered) {
+        if (r->IsParallelCompressionEnabled()) {
+          r->pc_rep->curr_block_keys->Clear();
+        } else {
+          r->index_builder->AddIndexEntry(&r->last_key, &key,
+                                          r->pending_handle);
+        }
+      }
+    }
+
+    // Note: PartitionedFilterBlockBuilder requires key being added to filter
+    // builder after being added to index builder.
+    if (r->state == Rep::State::kUnbuffered) {
+      if (r->IsParallelCompressionEnabled()) {
+        r->pc_rep->curr_block_keys->PushBack(key);
+      } else {
+        if (r->filter_builder != nullptr) {
+          size_t ts_sz =
+              r->internal_comparator.user_comparator()->timestamp_size();
+          r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
+        }
+      }
+    }
+
+    r->data_block.AddWithLastKey(key, value, r->last_key);
+    r->last_key.assign(key.data(), key.size());
+    if (r->state == Rep::State::kBuffered) {
+      // Buffered keys will be replayed from data_block_buffers during
+      // `Finish()` once compression dictionary has been finalized.
+    } else {
+      if (!r->IsParallelCompressionEnabled()) {
+        r->index_builder->OnKeyAdded(key);
+      }
+    }
+    // TODO offset passed in is not accurate for parallel compression case
+    NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
+                                      r->table_properties_collectors,
+                                      r->ioptions.logger);
+
+  } else if (value_type == kTypeRangeDeletion) {
+    r->range_del_block.Add(key, value);
+    // TODO offset passed in is not accurate for parallel compression case
+    NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
+                                      r->table_properties_collectors,
+                                      r->ioptions.logger);
+  } else {
+    assert(false);
+  }
+
+  r->props.num_entries++;
+  r->props.raw_key_size += key.size();
+  r->props.raw_value_size += value.size();
+  if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion ||
+      value_type == kTypeDeletionWithTimestamp) {
+    r->props.num_deletions++;
+  } else if (value_type == kTypeRangeDeletion) {
+    r->props.num_deletions++;
+    r->props.num_range_deletions++;
+  } else if (value_type == kTypeMerge) {
+    r->props.num_merge_operands++;
+  }
+}
+
+void BlockBasedTableBuilder::Flush() {
+  Rep* r = rep_;
+  assert(rep_->state != Rep::State::kClosed);
+  if (!ok()) return;
+  if (r->data_block.empty()) return;
+  if (r->IsParallelCompressionEnabled() &&
+      r->state == Rep::State::kUnbuffered) {
+    r->data_block.Finish();
+    ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
+        r->compression_type, r->first_key_in_next_block, &(r->data_block));
+    assert(block_rep != nullptr);
+    r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(),
+                                             r->get_offset());
+    r->pc_rep->EmitBlock(block_rep);
+  } else {
+    WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData);
+  }
+}
+
+void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
+                                        BlockHandle* handle,
+                                        BlockType block_type) {
+  block->Finish();
+  std::string uncompressed_block_data;
+  uncompressed_block_data.reserve(rep_->table_options.block_size);
+  block->SwapAndReset(uncompressed_block_data);
+  if (rep_->state == Rep::State::kBuffered) {
+    assert(block_type == BlockType::kData);
+    rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_data));
+    rep_->data_begin_offset += rep_->data_block_buffers.back().size();
+    return;
+  }
+  WriteBlock(uncompressed_block_data, handle, block_type);
+}
+
+void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
+                                        BlockHandle* handle,
+                                        BlockType block_type) {
+  Rep* r = rep_;
+  assert(r->state == Rep::State::kUnbuffered);
+  Slice block_contents;
+  CompressionType type;
+  Status compress_status;
+  bool is_data_block = block_type == BlockType::kData;
+  CompressAndVerifyBlock(uncompressed_block_data, is_data_block,
+                         *(r->compression_ctxs[0]), r->verify_ctxs[0].get(),
+                         &(r->compressed_output), &(block_contents), &type,
+                         &compress_status);
+  r->SetStatus(compress_status);
+  if (!ok()) {
+    return;
+  }
+
+  WriteMaybeCompressedBlock(block_contents, type, handle, block_type,
+                            &uncompressed_block_data);
+  r->compressed_output.clear();
+  if (is_data_block) {
+    r->props.data_size = r->get_offset();
+    ++r->props.num_data_blocks;
+  }
+}
+
+void BlockBasedTableBuilder::BGWorkCompression(
+    const CompressionContext& compression_ctx,
+    UncompressionContext* verify_ctx) {
+  ParallelCompressionRep::BlockRep* block_rep = nullptr;
+  while (rep_->pc_rep->compress_queue.pop(block_rep)) {
+    assert(block_rep != nullptr);
+    CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/
+                           compression_ctx, verify_ctx,
+                           block_rep->compressed_data.get(),
+                           &block_rep->compressed_contents,
+                           &(block_rep->compression_type), &block_rep->status);
+    block_rep->slot->Fill(block_rep);
+  }
+}
+
+void BlockBasedTableBuilder::CompressAndVerifyBlock(
+    const Slice& uncompressed_block_data, bool is_data_block,
+    const CompressionContext& compression_ctx, UncompressionContext* verify_ctx,
+    std::string* compressed_output, Slice* block_contents,
+    CompressionType* type, Status* out_status) {
+  // File format contains a sequence of blocks where each block has:
+  //    block_data: uint8[n]
+  //    type: uint8
+  //    crc: uint32
+  Rep* r = rep_;
+  bool is_status_ok = ok();
+  if (!r->IsParallelCompressionEnabled()) {
+    assert(is_status_ok);
+  }
+
+  *type = r->compression_type;
+  uint64_t sample_for_compression = r->sample_for_compression;
+  bool abort_compression = false;
+
+  StopWatchNano timer(
+      r->ioptions.clock,
+      ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
+
+  if (is_status_ok && uncompressed_block_data.size() < kCompressionSizeLimit) {
+    if (is_data_block) {
+      r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(),
+                                                 std::memory_order_relaxed);
+    }
+    const CompressionDict* compression_dict;
+    if (!is_data_block || r->compression_dict == nullptr) {
+      compression_dict = &CompressionDict::GetEmptyDict();
+    } else {
+      compression_dict = r->compression_dict.get();
+    }
+    assert(compression_dict != nullptr);
+    CompressionInfo compression_info(r->compression_opts, compression_ctx,
+                                     *compression_dict, *type,
+                                     sample_for_compression);
+
+    std::string sampled_output_fast;
+    std::string sampled_output_slow;
+    *block_contents = CompressBlock(
+        uncompressed_block_data, compression_info, type,
+        r->table_options.format_version, is_data_block /* do_sample */,
+        compressed_output, &sampled_output_fast, &sampled_output_slow);
+
+    if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
+      // Currently compression sampling is only enabled for data block.
+      assert(is_data_block);
+      r->sampled_input_data_bytes.fetch_add(uncompressed_block_data.size(),
+                                            std::memory_order_relaxed);
+      r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(),
+                                                  std::memory_order_relaxed);
+      r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(),
+                                                  std::memory_order_relaxed);
+    }
+    // notify collectors on block add
+    NotifyCollectTableCollectorsOnBlockAdd(
+        r->table_properties_collectors, uncompressed_block_data.size(),
+        sampled_output_fast.size(), sampled_output_slow.size());
+
+    // Some of the compression algorithms are known to be unreliable. If
+    // the verify_compression flag is set then try to de-compress the
+    // compressed data and compare to the input.
+    if (*type != kNoCompression && r->table_options.verify_compression) {
+      // Retrieve the uncompressed contents into a new buffer
+      const UncompressionDict* verify_dict;
+      if (!is_data_block || r->verify_dict == nullptr) {
+        verify_dict = &UncompressionDict::GetEmptyDict();
+      } else {
+        verify_dict = r->verify_dict.get();
+      }
+      assert(verify_dict != nullptr);
+      BlockContents contents;
+      UncompressionInfo uncompression_info(*verify_ctx, *verify_dict,
+                                           r->compression_type);
+      Status stat = UncompressBlockData(
+          uncompression_info, block_contents->data(), block_contents->size(),
+          &contents, r->table_options.format_version, r->ioptions);
+
+      if (stat.ok()) {
+        bool compressed_ok =
+            contents.data.compare(uncompressed_block_data) == 0;
+        if (!compressed_ok) {
+          // The result of the compression was invalid. abort.
+          abort_compression = true;
+          const char* const msg =
+              "Decompressed block did not match pre-compression block";
+          ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg);
+          *out_status = Status::Corruption(msg);
+        }
+      } else {
+        // Decompression reported an error. abort.
+        *out_status = Status::Corruption(std::string("Could not decompress: ") +
+                                         stat.getState());
+        abort_compression = true;
+      }
+    }
+  } else {
+    // Block is too big to be compressed.
+    if (is_data_block) {
+      r->uncompressible_input_data_bytes.fetch_add(
+          uncompressed_block_data.size(), std::memory_order_relaxed);
+    }
+    abort_compression = true;
+  }
+  if (is_data_block) {
+    r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize,
+                                                 std::memory_order_relaxed);
+  }
+
+  // Abort compression if the block is too big, or did not pass
+  // verification.
+  if (abort_compression) {
+    RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED);
+    *type = kNoCompression;
+    *block_contents = uncompressed_block_data;
+  } else if (*type != kNoCompression) {
+    if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)) {
+      RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS,
+                            timer.ElapsedNanos());
+    }
+    RecordInHistogram(r->ioptions.stats, BYTES_COMPRESSED,
+                      uncompressed_block_data.size());
+    RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED);
+  } else if (*type != r->compression_type) {
+    RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED);
+  }
+}
+
+void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
+    const Slice& block_contents, CompressionType type, BlockHandle* handle,
+    BlockType block_type, const Slice* uncompressed_block_data) {
+  Rep* r = rep_;
+  bool is_data_block = block_type == BlockType::kData;
+  // Old, misleading name of this function: WriteRawBlock
+  StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS);
+  handle->set_offset(r->get_offset());
+  handle->set_size(block_contents.size());
+  assert(status().ok());
+  assert(io_status().ok());
+
+  {
+    IOStatus io_s = r->file->Append(block_contents);
+    if (!io_s.ok()) {
+      r->SetIOStatus(io_s);
+      return;
+    }
+  }
+
+  std::array<char, kBlockTrailerSize> trailer;
+  trailer[0] = type;
+  uint32_t checksum = ComputeBuiltinChecksumWithLastByte(
+      r->table_options.checksum, block_contents.data(), block_contents.size(),
+      /*last_byte*/ type);
+
+  if (block_type == BlockType::kFilter) {
+    Status s = r->filter_builder->MaybePostVerifyFilter(block_contents);
+    if (!s.ok()) {
+      r->SetStatus(s);
+      return;
+    }
+  }
+
+  EncodeFixed32(trailer.data() + 1, checksum);
+  TEST_SYNC_POINT_CALLBACK(
+      "BlockBasedTableBuilder::WriteMaybeCompressedBlock:TamperWithChecksum",
+      trailer.data());
+  {
+    IOStatus io_s = r->file->Append(Slice(trailer.data(), trailer.size()));
+    if (!io_s.ok()) {
+      r->SetIOStatus(io_s);
+      return;
+    }
+  }
+
+  {
+    Status s = Status::OK();
+    bool warm_cache;
+    switch (r->table_options.prepopulate_block_cache) {
+      case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly:
+        warm_cache = (r->reason == TableFileCreationReason::kFlush);
+        break;
+      case BlockBasedTableOptions::PrepopulateBlockCache::kDisable:
+        warm_cache = false;
+        break;
+      default:
+        // missing case
+        assert(false);
+        warm_cache = false;
+    }
+    if (warm_cache) {
+      if (type == kNoCompression) {
+        s = InsertBlockInCacheHelper(block_contents, handle, block_type);
+      } else if (uncompressed_block_data != nullptr) {
+        s = InsertBlockInCacheHelper(*uncompressed_block_data, handle,
+                                     block_type);
+      }
+      if (!s.ok()) {
+        r->SetStatus(s);
+        return;
+      }
+    }
+    s = InsertBlockInCompressedCache(block_contents, type, handle);
+    if (!s.ok()) {
+      r->SetStatus(s);
+      return;
+    }
+  }
+
+  r->set_offset(r->get_offset() + block_contents.size() + kBlockTrailerSize);
+  if (r->table_options.block_align && is_data_block) {
+    size_t pad_bytes =
+        (r->alignment -
+         ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) &
+        (r->alignment - 1);
+    IOStatus io_s = r->file->Pad(pad_bytes);
+    if (io_s.ok()) {
+      r->set_offset(r->get_offset() + pad_bytes);
+    } else {
+      r->SetIOStatus(io_s);
+      return;
+    }
+  }
+
+  if (r->IsParallelCompressionEnabled()) {
+    if (is_data_block) {
+      r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(),
+                                               r->get_offset());
+    } else {
+      r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset());
+    }
+  }
+}
+
+void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
+  Rep* r = rep_;
+  ParallelCompressionRep::BlockRepSlot* slot = nullptr;
+  ParallelCompressionRep::BlockRep* block_rep = nullptr;
+  while (r->pc_rep->write_queue.pop(slot)) {
+    assert(slot != nullptr);
+    slot->Take(block_rep);
+    assert(block_rep != nullptr);
+    if (!block_rep->status.ok()) {
+      r->SetStatus(block_rep->status);
+      // Reap block so that blocked Flush() can finish
+      // if there is one, and Flush() will notice !ok() next time.
+      block_rep->status = Status::OK();
+      r->pc_rep->ReapBlock(block_rep);
+      continue;
+    }
+
+    for (size_t i = 0; i < block_rep->keys->Size(); i++) {
+      auto& key = (*block_rep->keys)[i];
+      if (r->filter_builder != nullptr) {
+        size_t ts_sz =
+            r->internal_comparator.user_comparator()->timestamp_size();
+        r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
+      }
+      r->index_builder->OnKeyAdded(key);
+    }
+
+    r->pc_rep->file_size_estimator.SetCurrBlockUncompSize(
+        block_rep->data->size());
+    WriteMaybeCompressedBlock(block_rep->compressed_contents,
+                              block_rep->compression_type, &r->pending_handle,
+                              BlockType::kData, &block_rep->contents);
+    if (!ok()) {
+      break;
+    }
+
+    r->props.data_size = r->get_offset();
+    ++r->props.num_data_blocks;
+
+    if (block_rep->first_key_in_next_block == nullptr) {
+      r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), nullptr,
+                                      r->pending_handle);
+    } else {
+      Slice first_key_in_next_block =
+          Slice(*block_rep->first_key_in_next_block);
+      r->index_builder->AddIndexEntry(&(block_rep->keys->Back()),
+                                      &first_key_in_next_block,
+                                      r->pending_handle);
+    }
+
+    r->pc_rep->ReapBlock(block_rep);
+  }
+}
+
+void BlockBasedTableBuilder::StartParallelCompression() {
+  rep_->pc_rep.reset(
+      new ParallelCompressionRep(rep_->compression_opts.parallel_threads));
+  rep_->pc_rep->compress_thread_pool.reserve(
+      rep_->compression_opts.parallel_threads);
+  for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) {
+    rep_->pc_rep->compress_thread_pool.emplace_back([this, i] {
+      BGWorkCompression(*(rep_->compression_ctxs[i]),
+                        rep_->verify_ctxs[i].get());
+    });
+  }
+  rep_->pc_rep->write_thread.reset(
+      new port::Thread([this] { BGWorkWriteMaybeCompressedBlock(); }));
+}
+
+void BlockBasedTableBuilder::StopParallelCompression() {
+  rep_->pc_rep->compress_queue.finish();
+  for (auto& thread : rep_->pc_rep->compress_thread_pool) {
+    thread.join();
+  }
+  rep_->pc_rep->write_queue.finish();
+  rep_->pc_rep->write_thread->join();
+}
+
+Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); }
+
+IOStatus BlockBasedTableBuilder::io_status() const {
+  return rep_->GetIOStatus();
+}
+
+//
+// Make a copy of the block contents and insert into compressed block cache
+//
+Status BlockBasedTableBuilder::InsertBlockInCompressedCache(
+    const Slice& block_contents, const CompressionType type,
+    const BlockHandle* handle) {
+  Rep* r = rep_;
+  Cache* block_cache_compressed = r->table_options.block_cache_compressed.get();
+  Status s;
+  if (type != kNoCompression && block_cache_compressed != nullptr) {
+    size_t size = block_contents.size();
+
+    auto ubuf =
+        AllocateBlock(size + 1, block_cache_compressed->memory_allocator());
+    memcpy(ubuf.get(), block_contents.data(), size);
+    ubuf[size] = type;
+
+    BlockContents* block_contents_to_cache =
+        new BlockContents(std::move(ubuf), size);
+#ifndef NDEBUG
+    block_contents_to_cache->has_trailer = true;
+#endif  // NDEBUG
+
+    CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle);
+
+    s = block_cache_compressed->Insert(
+        key.AsSlice(), block_contents_to_cache,
+        block_contents_to_cache->ApproximateMemoryUsage(),
+        &DeleteCacheEntry<BlockContents>);
+    if (s.ok()) {
+      RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD);
+    } else {
+      RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+    }
+    // Invalidate OS cache.
+    r->file->InvalidateCache(static_cast<size_t>(r->get_offset()), size)
+        .PermitUncheckedError();
+  }
+  return s;
+}
+
+Status BlockBasedTableBuilder::InsertBlockInCacheHelper(
+    const Slice& block_contents, const BlockHandle* handle,
+    BlockType block_type) {
+  Status s;
+  switch (block_type) {
+    case BlockType::kData:
+    case BlockType::kIndex:
+    case BlockType::kFilterPartitionIndex:
+      s = InsertBlockInCache<Block>(block_contents, handle, block_type);
+      break;
+    case BlockType::kFilter:
+      s = InsertBlockInCache<ParsedFullFilterBlock>(block_contents, handle,
+                                                    block_type);
+      break;
+    case BlockType::kCompressionDictionary:
+      s = InsertBlockInCache<UncompressionDict>(block_contents, handle,
+                                                block_type);
+      break;
+    default:
+      // no-op / not cached
+      break;
+  }
+  return s;
+}
+
+template <typename TBlocklike>
+Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
+                                                  const BlockHandle* handle,
+                                                  BlockType block_type) {
+  // Uncompressed regular block cache
+  Cache* block_cache = rep_->table_options.block_cache.get();
+  Status s;
+  if (block_cache != nullptr) {
+    size_t size = block_contents.size();
+    auto buf = AllocateBlock(size, block_cache->memory_allocator());
+    memcpy(buf.get(), block_contents.data(), size);
+    BlockContents results(std::move(buf), size);
+
+    CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle);
+
+    const size_t read_amp_bytes_per_bit =
+        rep_->table_options.read_amp_bytes_per_bit;
+
+    // TODO akanksha:: Dedup below code by calling
+    // BlockBasedTable::PutDataBlockToCache.
+    std::unique_ptr<TBlocklike> block_holder(
+        BlocklikeTraits<TBlocklike>::Create(
+            std::move(results), read_amp_bytes_per_bit,
+            rep_->ioptions.statistics.get(),
+            false /*rep_->blocks_definitely_zstd_compressed*/,
+            rep_->table_options.filter_policy.get()));
+
+    assert(block_holder->own_bytes());
+    size_t charge = block_holder->ApproximateMemoryUsage();
+    s = block_cache->Insert(
+        key.AsSlice(), block_holder.get(),
+        BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type), charge,
+        nullptr, Cache::Priority::LOW);
+
+    if (s.ok()) {
+      // Release ownership of block_holder.
+      block_holder.release();
+      BlockBasedTable::UpdateCacheInsertionMetrics(
+          block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(),
+          rep_->ioptions.stats);
+    } else {
+      RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES);
+    }
+  }
+  return s;
+}
+
+void BlockBasedTableBuilder::WriteFilterBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  if (rep_->filter_builder == nullptr || rep_->filter_builder->IsEmpty()) {
+    // No filter block needed
+    return;
+  }
+  BlockHandle filter_block_handle;
+  bool is_partitioned_filter = rep_->table_options.partition_filters;
+  if (ok()) {
+    rep_->props.num_filter_entries +=
+        rep_->filter_builder->EstimateEntriesAdded();
+    Status s = Status::Incomplete();
+    while (ok() && s.IsIncomplete()) {
+      // filter_data is used to store the transferred filter data payload from
+      // FilterBlockBuilder and deallocate the payload by going out of scope.
+      // Otherwise, the payload will unnecessarily remain until
+      // BlockBasedTableBuilder is deallocated.
+      //
+      // See FilterBlockBuilder::Finish() for more on the difference in
+      // transferred filter data payload among different FilterBlockBuilder
+      // subtypes.
+      std::unique_ptr<const char[]> filter_data;
+      Slice filter_content =
+          rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data);
+
+      assert(s.ok() || s.IsIncomplete() || s.IsCorruption());
+      if (s.IsCorruption()) {
+        rep_->SetStatus(s);
+        break;
+      }
+
+      rep_->props.filter_size += filter_content.size();
+
+      BlockType btype = is_partitioned_filter && /* last */ s.ok()
+                            ? BlockType::kFilterPartitionIndex
+                            : BlockType::kFilter;
+      WriteMaybeCompressedBlock(filter_content, kNoCompression,
+                                &filter_block_handle, btype);
+    }
+    rep_->filter_builder->ResetFilterBitsBuilder();
+  }
+  if (ok()) {
+    // Add mapping from "<filter_block_prefix>.Name" to location
+    // of filter data.
+    std::string key;
+    key = is_partitioned_filter ? BlockBasedTable::kPartitionedFilterBlockPrefix
+                                : BlockBasedTable::kFullFilterBlockPrefix;
+    key.append(rep_->table_options.filter_policy->CompatibilityName());
+    meta_index_builder->Add(key, filter_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteIndexBlock(
+    MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
+  if (!ok()) {
+    return;
+  }
+  IndexBuilder::IndexBlocks index_blocks;
+  auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
+  if (index_builder_status.IsIncomplete()) {
+    // We we have more than one index partition then meta_blocks are not
+    // supported for the index. Currently meta_blocks are used only by
+    // HashIndexBuilder which is not multi-partition.
+    assert(index_blocks.meta_blocks.empty());
+  } else if (ok() && !index_builder_status.ok()) {
+    rep_->SetStatus(index_builder_status);
+  }
+  if (ok()) {
+    for (const auto& item : index_blocks.meta_blocks) {
+      BlockHandle block_handle;
+      WriteBlock(item.second, &block_handle, BlockType::kIndex);
+      if (!ok()) {
+        break;
+      }
+      meta_index_builder->Add(item.first, block_handle);
+    }
+  }
+  if (ok()) {
+    if (rep_->table_options.enable_index_compression) {
+      WriteBlock(index_blocks.index_block_contents, index_block_handle,
+                 BlockType::kIndex);
+    } else {
+      WriteMaybeCompressedBlock(index_blocks.index_block_contents,
+                                kNoCompression, index_block_handle,
+                                BlockType::kIndex);
+    }
+  }
+  // If there are more index partitions, finish them and write them out
+  if (index_builder_status.IsIncomplete()) {
+    bool index_building_finished = false;
+    while (ok() && !index_building_finished) {
+      Status s =
+          rep_->index_builder->Finish(&index_blocks, *index_block_handle);
+      if (s.ok()) {
+        index_building_finished = true;
+      } else if (s.IsIncomplete()) {
+        // More partitioned index after this one
+        assert(!index_building_finished);
+      } else {
+        // Error
+        rep_->SetStatus(s);
+        return;
+      }
+
+      if (rep_->table_options.enable_index_compression) {
+        WriteBlock(index_blocks.index_block_contents, index_block_handle,
+                   BlockType::kIndex);
+      } else {
+        WriteMaybeCompressedBlock(index_blocks.index_block_contents,
+                                  kNoCompression, index_block_handle,
+                                  BlockType::kIndex);
+      }
+      // The last index_block_handle will be for the partition index block
+    }
+  }
+}
+
+void BlockBasedTableBuilder::WritePropertiesBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  BlockHandle properties_block_handle;
+  if (ok()) {
+    PropertyBlockBuilder property_block_builder;
+    rep_->props.filter_policy_name =
+        rep_->table_options.filter_policy != nullptr
+            ? rep_->table_options.filter_policy->Name()
+            : "";
+    rep_->props.index_size =
+        rep_->index_builder->IndexSize() + kBlockTrailerSize;
+    rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr
+                                      ? rep_->ioptions.user_comparator->Name()
+                                      : "nullptr";
+    rep_->props.merge_operator_name =
+        rep_->ioptions.merge_operator != nullptr
+            ? rep_->ioptions.merge_operator->Name()
+            : "nullptr";
+    rep_->props.compression_name =
+        CompressionTypeToString(rep_->compression_type);
+    rep_->props.compression_options =
+        CompressionOptionsToString(rep_->compression_opts);
+    rep_->props.prefix_extractor_name =
+        rep_->moptions.prefix_extractor != nullptr
+            ? rep_->moptions.prefix_extractor->AsString()
+            : "nullptr";
+    std::string property_collectors_names = "[";
+    for (size_t i = 0;
+         i < rep_->ioptions.table_properties_collector_factories.size(); ++i) {
+      if (i != 0) {
+        property_collectors_names += ",";
+      }
+      property_collectors_names +=
+          rep_->ioptions.table_properties_collector_factories[i]->Name();
+    }
+    property_collectors_names += "]";
+    rep_->props.property_collectors_names = property_collectors_names;
+    if (rep_->table_options.index_type ==
+        BlockBasedTableOptions::kTwoLevelIndexSearch) {
+      assert(rep_->p_index_builder_ != nullptr);
+      rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions();
+      rep_->props.top_level_index_size =
+          rep_->p_index_builder_->TopLevelIndexSize(rep_->offset);
+    }
+    rep_->props.index_key_is_user_key =
+        !rep_->index_builder->seperator_is_key_plus_seq();
+    rep_->props.index_value_is_delta_encoded =
+        rep_->use_delta_encoding_for_index_values;
+    if (rep_->sampled_input_data_bytes > 0) {
+      rep_->props.slow_compression_estimated_data_size = static_cast<uint64_t>(
+          static_cast<double>(rep_->sampled_output_slow_data_bytes) /
+              rep_->sampled_input_data_bytes *
+              rep_->compressible_input_data_bytes +
+          rep_->uncompressible_input_data_bytes + 0.5);
+      rep_->props.fast_compression_estimated_data_size = static_cast<uint64_t>(
+          static_cast<double>(rep_->sampled_output_fast_data_bytes) /
+              rep_->sampled_input_data_bytes *
+              rep_->compressible_input_data_bytes +
+          rep_->uncompressible_input_data_bytes + 0.5);
+    } else if (rep_->sample_for_compression > 0) {
+      // We tried to sample but none were found. Assume worst-case (compression
+      // ratio 1.0) so data is complete and aggregatable.
+      rep_->props.slow_compression_estimated_data_size =
+          rep_->compressible_input_data_bytes +
+          rep_->uncompressible_input_data_bytes;
+      rep_->props.fast_compression_estimated_data_size =
+          rep_->compressible_input_data_bytes +
+          rep_->uncompressible_input_data_bytes;
+    }
+
+    // Add basic properties
+    property_block_builder.AddTableProperty(rep_->props);
+
+    // Add use collected properties
+    NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors,
+                                         rep_->ioptions.logger,
+                                         &property_block_builder);
+
+    Slice block_data = property_block_builder.Finish();
+    TEST_SYNC_POINT_CALLBACK(
+        "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", &block_data);
+    WriteMaybeCompressedBlock(block_data, kNoCompression,
+                              &properties_block_handle, BlockType::kProperties);
+  }
+  if (ok()) {
+#ifndef NDEBUG
+    {
+      uint64_t props_block_offset = properties_block_handle.offset();
+      uint64_t props_block_size = properties_block_handle.size();
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+          &props_block_offset);
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+          &props_block_size);
+    }
+#endif  // !NDEBUG
+
+    const std::string* properties_block_meta = &kPropertiesBlockName;
+    TEST_SYNC_POINT_CALLBACK(
+        "BlockBasedTableBuilder::WritePropertiesBlock:Meta",
+        &properties_block_meta);
+    meta_index_builder->Add(*properties_block_meta, properties_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteCompressionDictBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  if (rep_->compression_dict != nullptr &&
+      rep_->compression_dict->GetRawDict().size()) {
+    BlockHandle compression_dict_block_handle;
+    if (ok()) {
+      WriteMaybeCompressedBlock(rep_->compression_dict->GetRawDict(),
+                                kNoCompression, &compression_dict_block_handle,
+                                BlockType::kCompressionDictionary);
+#ifndef NDEBUG
+      Slice compression_dict = rep_->compression_dict->GetRawDict();
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+          &compression_dict);
+#endif  // NDEBUG
+    }
+    if (ok()) {
+      meta_index_builder->Add(kCompressionDictBlockName,
+                              compression_dict_block_handle);
+    }
+  }
+}
+
+void BlockBasedTableBuilder::WriteRangeDelBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  if (ok() && !rep_->range_del_block.empty()) {
+    BlockHandle range_del_block_handle;
+    WriteMaybeCompressedBlock(rep_->range_del_block.Finish(), kNoCompression,
+                              &range_del_block_handle,
+                              BlockType::kRangeDeletion);
+    meta_index_builder->Add(kRangeDelBlockName, range_del_block_handle);
+  }
+}
+
+void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
+                                         BlockHandle& index_block_handle) {
+  Rep* r = rep_;
+  // this is guaranteed by BlockBasedTableBuilder's constructor
+  assert(r->table_options.checksum == kCRC32c ||
+         r->table_options.format_version != 0);
+  assert(ok());
+
+  FooterBuilder footer;
+  footer.Build(kBlockBasedTableMagicNumber, r->table_options.format_version,
+               r->get_offset(), r->table_options.checksum,
+               metaindex_block_handle, index_block_handle);
+  IOStatus ios = r->file->Append(footer.GetSlice());
+  if (ios.ok()) {
+    r->set_offset(r->get_offset() + footer.GetSlice().size());
+  } else {
+    r->SetIOStatus(ios);
+  }
+}
+
+void BlockBasedTableBuilder::EnterUnbuffered() {
+  Rep* r = rep_;
+  assert(r->state == Rep::State::kBuffered);
+  r->state = Rep::State::kUnbuffered;
+  const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
+                                  ? r->compression_opts.zstd_max_train_bytes
+                                  : r->compression_opts.max_dict_bytes;
+  const size_t kNumBlocksBuffered = r->data_block_buffers.size();
+  if (kNumBlocksBuffered == 0) {
+    // The below code is neither safe nor necessary for handling zero data
+    // blocks.
+    return;
+  }
+
+  // Abstract algebra teaches us that a finite cyclic group (such as the
+  // additive group of integers modulo N) can be generated by a number that is
+  // coprime with N. Since N is variable (number of buffered data blocks), we
+  // must then pick a prime number in order to guarantee coprimeness with any N.
+  //
+  // One downside of this approach is the spread will be poor when
+  // `kPrimeGeneratorRemainder` is close to zero or close to
+  // `kNumBlocksBuffered`.
+  //
+  // Picked a random number between one and one trillion and then chose the
+  // next prime number greater than or equal to it.
+  const uint64_t kPrimeGenerator = 545055921143ull;
+  // Can avoid repeated division by just adding the remainder repeatedly.
+  const size_t kPrimeGeneratorRemainder = static_cast<size_t>(
+      kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
+  const size_t kInitSampleIdx = kNumBlocksBuffered / 2;
+
+  std::string compression_dict_samples;
+  std::vector<size_t> compression_dict_sample_lens;
+  size_t buffer_idx = kInitSampleIdx;
+  for (size_t i = 0;
+       i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes;
+       ++i) {
+    size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(),
+                               r->data_block_buffers[buffer_idx].size());
+    compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0,
+                                    copy_len);
+    compression_dict_sample_lens.emplace_back(copy_len);
+
+    buffer_idx += kPrimeGeneratorRemainder;
+    if (buffer_idx >= kNumBlocksBuffered) {
+      buffer_idx -= kNumBlocksBuffered;
+    }
+  }
+
+  // final data block flushed, now we can generate dictionary from the samples.
+  // OK if compression_dict_samples is empty, we'll just get empty dictionary.
+  std::string dict;
+  if (r->compression_opts.zstd_max_train_bytes > 0) {
+    if (r->compression_opts.use_zstd_dict_trainer) {
+      dict = ZSTD_TrainDictionary(compression_dict_samples,
+                                  compression_dict_sample_lens,
+                                  r->compression_opts.max_dict_bytes);
+    } else {
+      dict = ZSTD_FinalizeDictionary(
+          compression_dict_samples, compression_dict_sample_lens,
+          r->compression_opts.max_dict_bytes, r->compression_opts.level);
+    }
+  } else {
+    dict = std::move(compression_dict_samples);
+  }
+  r->compression_dict.reset(new CompressionDict(dict, r->compression_type,
+                                                r->compression_opts.level));
+  r->verify_dict.reset(new UncompressionDict(
+      dict, r->compression_type == kZSTD ||
+                r->compression_type == kZSTDNotFinalCompression));
+
+  auto get_iterator_for_block = [&r](size_t i) {
+    auto& data_block = r->data_block_buffers[i];
+    assert(!data_block.empty());
+
+    Block reader{BlockContents{data_block}};
+    DataBlockIter* iter = reader.NewDataIterator(
+        r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber);
+
+    iter->SeekToFirst();
+    assert(iter->Valid());
+    return std::unique_ptr<DataBlockIter>(iter);
+  };
+
+  std::unique_ptr<DataBlockIter> iter = nullptr, next_block_iter = nullptr;
+
+  for (size_t i = 0; ok() && i < r->data_block_buffers.size(); ++i) {
+    if (iter == nullptr) {
+      iter = get_iterator_for_block(i);
+      assert(iter != nullptr);
+    };
+
+    if (i + 1 < r->data_block_buffers.size()) {
+      next_block_iter = get_iterator_for_block(i + 1);
+    }
+
+    auto& data_block = r->data_block_buffers[i];
+    if (r->IsParallelCompressionEnabled()) {
+      Slice first_key_in_next_block;
+      const Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
+      if (i + 1 < r->data_block_buffers.size()) {
+        assert(next_block_iter != nullptr);
+        first_key_in_next_block = next_block_iter->key();
+      } else {
+        first_key_in_next_block_ptr = r->first_key_in_next_block;
+      }
+
+      std::vector<std::string> keys;
+      for (; iter->Valid(); iter->Next()) {
+        keys.emplace_back(iter->key().ToString());
+      }
+
+      ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
+          r->compression_type, first_key_in_next_block_ptr, &data_block, &keys);
+
+      assert(block_rep != nullptr);
+      r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(),
+                                               r->get_offset());
+      r->pc_rep->EmitBlock(block_rep);
+    } else {
+      for (; iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (r->filter_builder != nullptr) {
+          size_t ts_sz =
+              r->internal_comparator.user_comparator()->timestamp_size();
+          r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
+        }
+        r->index_builder->OnKeyAdded(key);
+      }
+      WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData);
+      if (ok() && i + 1 < r->data_block_buffers.size()) {
+        assert(next_block_iter != nullptr);
+        Slice first_key_in_next_block = next_block_iter->key();
+
+        Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
+
+        iter->SeekToLast();
+        std::string last_key = iter->key().ToString();
+        r->index_builder->AddIndexEntry(&last_key, first_key_in_next_block_ptr,
+                                        r->pending_handle);
+      }
+    }
+    std::swap(iter, next_block_iter);
+  }
+  r->data_block_buffers.clear();
+  r->data_begin_offset = 0;
+  // Release all reserved cache for data block buffers
+  if (r->compression_dict_buffer_cache_res_mgr != nullptr) {
+    Status s = r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation(
+        r->data_begin_offset);
+    s.PermitUncheckedError();
+  }
+}
+
+Status BlockBasedTableBuilder::Finish() {
+  Rep* r = rep_;
+  assert(r->state != Rep::State::kClosed);
+  bool empty_data_block = r->data_block.empty();
+  r->first_key_in_next_block = nullptr;
+  Flush();
+  if (r->state == Rep::State::kBuffered) {
+    EnterUnbuffered();
+  }
+  if (r->IsParallelCompressionEnabled()) {
+    StopParallelCompression();
+#ifndef NDEBUG
+    for (const auto& br : r->pc_rep->block_rep_buf) {
+      assert(br.status.ok());
+    }
+#endif  // !NDEBUG
+  } else {
+    // To make sure properties block is able to keep the accurate size of index
+    // block, we will finish writing all index entries first.
+    if (ok() && !empty_data_block) {
+      r->index_builder->AddIndexEntry(
+          &r->last_key, nullptr /* no next data block */, r->pending_handle);
+    }
+  }
+
+  // Write meta blocks, metaindex block and footer in the following order.
+  //    1. [meta block: filter]
+  //    2. [meta block: index]
+  //    3. [meta block: compression dictionary]
+  //    4. [meta block: range deletion tombstone]
+  //    5. [meta block: properties]
+  //    6. [metaindex block]
+  //    7. Footer
+  BlockHandle metaindex_block_handle, index_block_handle;
+  MetaIndexBuilder meta_index_builder;
+  WriteFilterBlock(&meta_index_builder);
+  WriteIndexBlock(&meta_index_builder, &index_block_handle);
+  WriteCompressionDictBlock(&meta_index_builder);
+  WriteRangeDelBlock(&meta_index_builder);
+  WritePropertiesBlock(&meta_index_builder);
+  if (ok()) {
+    // flush the meta index block
+    WriteMaybeCompressedBlock(meta_index_builder.Finish(), kNoCompression,
+                              &metaindex_block_handle, BlockType::kMetaIndex);
+  }
+  if (ok()) {
+    WriteFooter(metaindex_block_handle, index_block_handle);
+  }
+  r->state = Rep::State::kClosed;
+  r->SetStatus(r->CopyIOStatus());
+  Status ret_status = r->CopyStatus();
+  assert(!ret_status.ok() || io_status().ok());
+  return ret_status;
+}
+
+void BlockBasedTableBuilder::Abandon() {
+  assert(rep_->state != Rep::State::kClosed);
+  if (rep_->IsParallelCompressionEnabled()) {
+    StopParallelCompression();
+  }
+  rep_->state = Rep::State::kClosed;
+  rep_->CopyStatus().PermitUncheckedError();
+  rep_->CopyIOStatus().PermitUncheckedError();
+}
+
+uint64_t BlockBasedTableBuilder::NumEntries() const {
+  return rep_->props.num_entries;
+}
+
+bool BlockBasedTableBuilder::IsEmpty() const {
+  return rep_->props.num_entries == 0 && rep_->props.num_range_deletions == 0;
+}
+
+uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; }
+
+uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
+  if (rep_->IsParallelCompressionEnabled()) {
+    // Use compression ratio so far and inflight uncompressed bytes to estimate
+    // final SST size.
+    return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize();
+  } else {
+    return FileSize();
+  }
+}
+
+bool BlockBasedTableBuilder::NeedCompact() const {
+  for (const auto& collector : rep_->table_properties_collectors) {
+    if (collector->NeedCompact()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+TableProperties BlockBasedTableBuilder::GetTableProperties() const {
+  TableProperties ret = rep_->props;
+  for (const auto& collector : rep_->table_properties_collectors) {
+    for (const auto& prop : collector->GetReadableProperties()) {
+      ret.readable_properties.insert(prop);
+    }
+    collector->Finish(&ret.user_collected_properties).PermitUncheckedError();
+  }
+  return ret;
+}
+
+std::string BlockBasedTableBuilder::GetFileChecksum() const {
+  if (rep_->file != nullptr) {
+    return rep_->file->GetFileChecksum();
+  } else {
+    return kUnknownFileChecksum;
+  }
+}
+
+const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const {
+  if (rep_->file != nullptr) {
+    return rep_->file->GetFileChecksumFuncName();
+  } else {
+    return kUnknownFileChecksumFuncName;
+  }
+}
+void BlockBasedTableBuilder::SetSeqnoTimeTableProperties(
+    const std::string& encoded_seqno_to_time_mapping,
+    uint64_t oldest_ancestor_time) {
+  rep_->props.seqno_to_time_mapping = encoded_seqno_to_time_mapping;
+  rep_->props.creation_time = oldest_ancestor_time;
+}
+
+const std::string BlockBasedTable::kObsoleteFilterBlockPrefix = "filter.";
+const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter.";
+const std::string BlockBasedTable::kPartitionedFilterBlockPrefix =
+    "partitionedfilter.";
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_builder.h b/src/rocksdb/table/block_based/block_based_table_builder.h
new file mode 100644
index 000000000..ecc13d0f7
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_builder.h
@@ -0,0 +1,203 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <array>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+struct BlockBasedTableOptions;
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+
+class BlockBasedTableBuilder : public TableBuilder {
+ public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish().
+  BlockBasedTableBuilder(const BlockBasedTableOptions& table_options,
+                         const TableBuilderOptions& table_builder_options,
+                         WritableFileWriter* file);
+
+  // No copying allowed
+  BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete;
+  BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete;
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~BlockBasedTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override;
+
+  // Return non-ok iff some error happens during IO.
+  IOStatus io_status() const override;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  bool IsEmpty() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+  // Estimated size of the file generated so far. This is used when
+  // FileSize() cannot estimate final SST size, e.g. parallel compression
+  // is enabled.
+  uint64_t EstimatedFileSize() const override;
+
+  bool NeedCompact() const override;
+
+  // Get table properties
+  TableProperties GetTableProperties() const override;
+
+  // Get file checksum
+  std::string GetFileChecksum() const override;
+
+  // Get file checksum function name
+  const char* GetFileChecksumFuncName() const override;
+
+  void SetSeqnoTimeTableProperties(
+      const std::string& encoded_seqno_to_time_mapping,
+      uint64_t oldest_ancestor_time) override;
+
+ private:
+  bool ok() const { return status().ok(); }
+
+  // Transition state from buffered to unbuffered. See `Rep::State` API comment
+  // for details of the states.
+  // REQUIRES: `rep_->state == kBuffered`
+  void EnterUnbuffered();
+
+  // Call block's Finish() method and then
+  // - in buffered mode, buffer the uncompressed block contents.
+  // - in unbuffered mode, write the compressed block contents to file.
+  void WriteBlock(BlockBuilder* block, BlockHandle* handle,
+                  BlockType blocktype);
+
+  // Compress and write block content to the file.
+  void WriteBlock(const Slice& block_contents, BlockHandle* handle,
+                  BlockType block_type);
+  // Directly write data to the file.
+  void WriteMaybeCompressedBlock(const Slice& data, CompressionType,
+                                 BlockHandle* handle, BlockType block_type,
+                                 const Slice* raw_data = nullptr);
+
+  void SetupCacheKeyPrefix(const TableBuilderOptions& tbo);
+
+  template <typename TBlocklike>
+  Status InsertBlockInCache(const Slice& block_contents,
+                            const BlockHandle* handle, BlockType block_type);
+
+  Status InsertBlockInCacheHelper(const Slice& block_contents,
+                                  const BlockHandle* handle,
+                                  BlockType block_type);
+
+  Status InsertBlockInCompressedCache(const Slice& block_contents,
+                                      const CompressionType type,
+                                      const BlockHandle* handle);
+
+  void WriteFilterBlock(MetaIndexBuilder* meta_index_builder);
+  void WriteIndexBlock(MetaIndexBuilder* meta_index_builder,
+                       BlockHandle* index_block_handle);
+  void WritePropertiesBlock(MetaIndexBuilder* meta_index_builder);
+  void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder);
+  void WriteRangeDelBlock(MetaIndexBuilder* meta_index_builder);
+  void WriteFooter(BlockHandle& metaindex_block_handle,
+                   BlockHandle& index_block_handle);
+
+  struct Rep;
+  class BlockBasedTablePropertiesCollectorFactory;
+  class BlockBasedTablePropertiesCollector;
+  Rep* rep_;
+
+  struct ParallelCompressionRep;
+
+  // Advanced operation: flush any buffered key/value pairs to file.
+  // Can be used to ensure that two adjacent entries never live in
+  // the same data block.  Most clients should not need to use this method.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Flush();
+
+  // Some compression libraries fail when the uncompressed size is bigger than
+  // int. If uncompressed size is bigger than kCompressionSizeLimit, don't
+  // compress it
+  const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max();
+
+  // Get blocks from mem-table walking thread, compress them and
+  // pass them to the write thread. Used in parallel compression mode only
+  void BGWorkCompression(const CompressionContext& compression_ctx,
+                         UncompressionContext* verify_ctx);
+
+  // Given uncompressed block content, try to compress it and return result and
+  // compression type
+  void CompressAndVerifyBlock(const Slice& uncompressed_block_data,
+                              bool is_data_block,
+                              const CompressionContext& compression_ctx,
+                              UncompressionContext* verify_ctx,
+                              std::string* compressed_output,
+                              Slice* result_block_contents,
+                              CompressionType* result_compression_type,
+                              Status* out_status);
+
+  // Get compressed blocks from BGWorkCompression and write them into SST
+  void BGWorkWriteMaybeCompressedBlock();
+
+  // Initialize parallel compression context and
+  // start BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads
+  void StartParallelCompression();
+
+  // Stop BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads
+  void StopParallelCompression();
+};
+
+Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
+                    CompressionType* type, uint32_t format_version,
+                    bool do_sample, std::string* compressed_output,
+                    std::string* sampled_output_fast,
+                    std::string* sampled_output_slow);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_factory.cc b/src/rocksdb/table/block_based/block_based_table_factory.cc
new file mode 100644
index 000000000..09c1d2f62
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_factory.cc
@@ -0,0 +1,1058 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/block_based_table_factory.h"
+
+#include <stdint.h>
+
+#include <cinttypes>
+#include <memory>
+#include <string>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
+#include "logging/logging.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/format.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TailPrefetchStats::RecordEffectiveSize(size_t len) {
+  MutexLock l(&mutex_);
+  if (num_records_ < kNumTracked) {
+    num_records_++;
+  }
+  records_[next_++] = len;
+  if (next_ == kNumTracked) {
+    next_ = 0;
+  }
+}
+
+size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
+  std::vector<size_t> sorted;
+  {
+    MutexLock l(&mutex_);
+
+    if (num_records_ == 0) {
+      return 0;
+    }
+    sorted.assign(records_, records_ + num_records_);
+  }
+
+  // Of the historic size, we find the maximum one that satisifis the condtiion
+  // that if prefetching all, less than 1/8 will be wasted.
+  std::sort(sorted.begin(), sorted.end());
+
+  // Assuming we have 5 data points, and after sorting it looks like this:
+  //
+  //                                     +---+
+  //                             +---+   |   |
+  //                             |   |   |   |
+  //                             |   |   |   |
+  //                             |   |   |   |
+  //                             |   |   |   |
+  //                    +---+    |   |   |   |
+  //                    |   |    |   |   |   |
+  //           +---+    |   |    |   |   |   |
+  //           |   |    |   |    |   |   |   |
+  //  +---+    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  +---+    +---+    +---+    +---+   +---+
+  //
+  // and we use every of the value as a candidate, and estimate how much we
+  // wasted, compared to read. For example, when we use the 3rd record
+  // as candiate. This area is what we read:
+  //                                     +---+
+  //                             +---+   |   |
+  //                             |   |   |   |
+  //                             |   |   |   |
+  //                             |   |   |   |
+  //                             |   |   |   |
+  //  ***  ***  ***  ***+ ***  ***  *** *** **
+  //  *                 |   |    |   |   |   |
+  //           +---+    |   |    |   |   |   *
+  //  *        |   |    |   |    |   |   |   |
+  //  +---+    |   |    |   |    |   |   |   *
+  //  *   |    |   |    | X |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   *
+  //  *   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   *
+  //  *   |    |   |    |   |    |   |   |   |
+  //  *** *** ***-***  ***--*** ***--*** +****
+  // which is (size of the record) X (number of records).
+  //
+  // While wasted is this area:
+  //                                     +---+
+  //                             +---+   |   |
+  //                             |   |   |   |
+  //                             |   |   |   |
+  //                             |   |   |   |
+  //                             |   |   |   |
+  //  ***  ***  ***  ****---+    |   |   |   |
+  //  *                 *   |    |   |   |   |
+  //  *        *-***  ***   |    |   |   |   |
+  //  *        *   |    |   |    |   |   |   |
+  //  *--**  ***   |    |   |    |   |   |   |
+  //  |   |    |   |    | X |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  +---+    +---+    +---+    +---+   +---+
+  //
+  // Which can be calculated iteratively.
+  // The difference between wasted using 4st and 3rd record, will
+  // be following area:
+  //                                     +---+
+  //  +--+  +-+   ++  +-+  +-+   +---+   |   |
+  //  + xxxxxxxxxxxxxxxxxxxxxxxx |   |   |   |
+  //    xxxxxxxxxxxxxxxxxxxxxxxx |   |   |   |
+  //  + xxxxxxxxxxxxxxxxxxxxxxxx |   |   |   |
+  //  | xxxxxxxxxxxxxxxxxxxxxxxx |   |   |   |
+  //  +-+ +-+  +-+  ++  +---+ +--+   |   |   |
+  //  |                 |   |    |   |   |   |
+  //           +---+ ++ |   |    |   |   |   |
+  //  |        |   |    |   |    | X |   |   |
+  //  +---+ ++ |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  |   |    |   |    |   |    |   |   |   |
+  //  +---+    +---+    +---+    +---+   +---+
+  //
+  // which will be the size difference between 4st and 3rd record,
+  // times 3, which is number of records before the 4st.
+  // Here we assume that all data within the prefetch range will be useful. In
+  // reality, it may not be the case when a partial block is inside the range,
+  // or there are data in the middle that is not read. We ignore those cases
+  // for simplicity.
+  assert(!sorted.empty());
+  size_t prev_size = sorted[0];
+  size_t max_qualified_size = sorted[0];
+  size_t wasted = 0;
+  for (size_t i = 1; i < sorted.size(); i++) {
+    size_t read = sorted[i] * sorted.size();
+    wasted += (sorted[i] - prev_size) * i;
+    if (wasted <= read / 8) {
+      max_qualified_size = sorted[i];
+    }
+    prev_size = sorted[i];
+  }
+  const size_t kMaxPrefetchSize = 512 * 1024;  // Never exceed 512KB
+  return std::min(kMaxPrefetchSize, max_qualified_size);
+}
+
+#ifndef ROCKSDB_LITE
+
+const std::string kOptNameMetadataCacheOpts = "metadata_cache_options";
+
+static std::unordered_map<std::string, PinningTier>
+    pinning_tier_type_string_map = {
+        {"kFallback", PinningTier::kFallback},
+        {"kNone", PinningTier::kNone},
+        {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar},
+        {"kAll", PinningTier::kAll}};
+
+static std::unordered_map<std::string, BlockBasedTableOptions::IndexType>
+    block_base_table_index_type_string_map = {
+        {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch},
+        {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch},
+        {"kTwoLevelIndexSearch",
+         BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch},
+        {"kBinarySearchWithFirstKey",
+         BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}};
+
+static std::unordered_map<std::string,
+                          BlockBasedTableOptions::DataBlockIndexType>
+    block_base_table_data_block_index_type_string_map = {
+        {"kDataBlockBinarySearch",
+         BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch},
+        {"kDataBlockBinaryAndHash",
+         BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}};
+
+static std::unordered_map<std::string,
+                          BlockBasedTableOptions::IndexShorteningMode>
+    block_base_table_index_shortening_mode_string_map = {
+        {"kNoShortening",
+         BlockBasedTableOptions::IndexShorteningMode::kNoShortening},
+        {"kShortenSeparators",
+         BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators},
+        {"kShortenSeparatorsAndSuccessor",
+         BlockBasedTableOptions::IndexShorteningMode::
+             kShortenSeparatorsAndSuccessor}};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    metadata_cache_options_type_info = {
+        {"top_level_index_pinning",
+         OptionTypeInfo::Enum<PinningTier>(
+             offsetof(struct MetadataCacheOptions, top_level_index_pinning),
+             &pinning_tier_type_string_map)},
+        {"partition_pinning",
+         OptionTypeInfo::Enum<PinningTier>(
+             offsetof(struct MetadataCacheOptions, partition_pinning),
+             &pinning_tier_type_string_map)},
+        {"unpartitioned_pinning",
+         OptionTypeInfo::Enum<PinningTier>(
+             offsetof(struct MetadataCacheOptions, unpartitioned_pinning),
+             &pinning_tier_type_string_map)}};
+
+static std::unordered_map<std::string,
+                          BlockBasedTableOptions::PrepopulateBlockCache>
+    block_base_table_prepopulate_block_cache_string_map = {
+        {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable},
+        {"kFlushOnly",
+         BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}};
+
+#endif  // ROCKSDB_LITE
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    block_based_table_type_info = {
+#ifndef ROCKSDB_LITE
+        /* currently not supported
+          std::shared_ptr<Cache> block_cache = nullptr;
+          std::shared_ptr<Cache> block_cache_compressed = nullptr;
+          CacheUsageOptions cache_usage_options;
+         */
+        {"flush_block_policy_factory",
+         OptionTypeInfo::AsCustomSharedPtr<FlushBlockPolicyFactory>(
+             offsetof(struct BlockBasedTableOptions,
+                      flush_block_policy_factory),
+             OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)},
+        {"cache_index_and_filter_blocks",
+         {offsetof(struct BlockBasedTableOptions,
+                   cache_index_and_filter_blocks),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"cache_index_and_filter_blocks_with_high_priority",
+         {offsetof(struct BlockBasedTableOptions,
+                   cache_index_and_filter_blocks_with_high_priority),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"pin_l0_filter_and_index_blocks_in_cache",
+         {offsetof(struct BlockBasedTableOptions,
+                   pin_l0_filter_and_index_blocks_in_cache),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"index_type", OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>(
+                           offsetof(struct BlockBasedTableOptions, index_type),
+                           &block_base_table_index_type_string_map)},
+        {"hash_index_allow_collision",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"data_block_index_type",
+         OptionTypeInfo::Enum<BlockBasedTableOptions::DataBlockIndexType>(
+             offsetof(struct BlockBasedTableOptions, data_block_index_type),
+             &block_base_table_data_block_index_type_string_map)},
+        {"index_shortening",
+         OptionTypeInfo::Enum<BlockBasedTableOptions::IndexShorteningMode>(
+             offsetof(struct BlockBasedTableOptions, index_shortening),
+             &block_base_table_index_shortening_mode_string_map)},
+        {"data_block_hash_table_util_ratio",
+         {offsetof(struct BlockBasedTableOptions,
+                   data_block_hash_table_util_ratio),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"checksum",
+         {offsetof(struct BlockBasedTableOptions, checksum),
+          OptionType::kChecksumType, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"no_block_cache",
+         {offsetof(struct BlockBasedTableOptions, no_block_cache),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"block_size",
+         {offsetof(struct BlockBasedTableOptions, block_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"block_size_deviation",
+         {offsetof(struct BlockBasedTableOptions, block_size_deviation),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"block_restart_interval",
+         {offsetof(struct BlockBasedTableOptions, block_restart_interval),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"index_block_restart_interval",
+         {offsetof(struct BlockBasedTableOptions, index_block_restart_interval),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"index_per_partition",
+         {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"metadata_block_size",
+         {offsetof(struct BlockBasedTableOptions, metadata_block_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"partition_filters",
+         {offsetof(struct BlockBasedTableOptions, partition_filters),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"optimize_filters_for_memory",
+         {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"filter_policy",
+         OptionTypeInfo::AsCustomSharedPtr<const FilterPolicy>(
+             offsetof(struct BlockBasedTableOptions, filter_policy),
+             OptionVerificationType::kByNameAllowFromNull,
+             OptionTypeFlags::kNone)},
+        {"whole_key_filtering",
+         {offsetof(struct BlockBasedTableOptions, whole_key_filtering),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"detect_filter_construct_corruption",
+         {offsetof(struct BlockBasedTableOptions,
+                   detect_filter_construct_corruption),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"reserve_table_builder_memory",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"reserve_table_reader_memory",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"skip_table_builder_flush",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
+          OptionTypeFlags::kNone}},
+        {"format_version",
+         {offsetof(struct BlockBasedTableOptions, format_version),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"verify_compression",
+         {offsetof(struct BlockBasedTableOptions, verify_compression),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"read_amp_bytes_per_bit",
+         {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone,
+          [](const ConfigOptions& /*opts*/, const std::string& /*name*/,
+             const std::string& value, void* addr) {
+            // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13
+            // and 6.14. The bug will write out 8 bytes to OPTIONS file from the
+            // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit
+            // which is actually a uint32. Consequently, the value of
+            // read_amp_bytes_per_bit written in the OPTIONS file is wrong.
+            // From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit
+            // from OPTIONS file as a uint32. To be able to load OPTIONS file
+            // generated by affected releases before the fix, we need to
+            // manually parse read_amp_bytes_per_bit with this special hack.
+            uint64_t read_amp_bytes_per_bit = ParseUint64(value);
+            *(static_cast<uint32_t*>(addr)) =
+                static_cast<uint32_t>(read_amp_bytes_per_bit);
+            return Status::OK();
+          }}},
+        {"enable_index_compression",
+         {offsetof(struct BlockBasedTableOptions, enable_index_compression),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"block_align",
+         {offsetof(struct BlockBasedTableOptions, block_align),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"pin_top_level_index_and_filter",
+         {offsetof(struct BlockBasedTableOptions,
+                   pin_top_level_index_and_filter),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {kOptNameMetadataCacheOpts,
+         OptionTypeInfo::Struct(
+             kOptNameMetadataCacheOpts, &metadata_cache_options_type_info,
+             offsetof(struct BlockBasedTableOptions, metadata_cache_options),
+             OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+        {"block_cache",
+         {offsetof(struct BlockBasedTableOptions, block_cache),
+          OptionType::kUnknown, OptionVerificationType::kNormal,
+          (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
+          // Parses the input value as a Cache
+          [](const ConfigOptions& opts, const std::string&,
+             const std::string& value, void* addr) {
+            auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
+            return Cache::CreateFromString(opts, value, cache);
+          }}},
+        {"block_cache_compressed",
+         {offsetof(struct BlockBasedTableOptions, block_cache_compressed),
+          OptionType::kUnknown, OptionVerificationType::kNormal,
+          (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize),
+          // Parses the input value as a Cache
+          [](const ConfigOptions& opts, const std::string&,
+             const std::string& value, void* addr) {
+            auto* cache = static_cast<std::shared_ptr<Cache>*>(addr);
+            return Cache::CreateFromString(opts, value, cache);
+          }}},
+        {"max_auto_readahead_size",
+         {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"prepopulate_block_cache",
+         OptionTypeInfo::Enum<BlockBasedTableOptions::PrepopulateBlockCache>(
+             offsetof(struct BlockBasedTableOptions, prepopulate_block_cache),
+             &block_base_table_prepopulate_block_cache_string_map,
+             OptionTypeFlags::kMutable)},
+        {"initial_auto_readahead_size",
+         {offsetof(struct BlockBasedTableOptions, initial_auto_readahead_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"num_file_reads_for_auto_readahead",
+         {offsetof(struct BlockBasedTableOptions,
+                   num_file_reads_for_auto_readahead),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+
+#endif  // ROCKSDB_LITE
+};
+
+// TODO(myabandeh): We should return an error instead of silently changing the
+// options
+BlockBasedTableFactory::BlockBasedTableFactory(
+    const BlockBasedTableOptions& _table_options)
+    : table_options_(_table_options) {
+  InitializeOptions();
+  RegisterOptions(&table_options_, &block_based_table_type_info);
+
+  const auto table_reader_charged =
+      table_options_.cache_usage_options.options_overrides
+          .at(CacheEntryRole::kBlockBasedTableReader)
+          .charged;
+  if (table_options_.block_cache &&
+      table_reader_charged == CacheEntryRoleOptions::Decision::kEnabled) {
+    table_reader_cache_res_mgr_.reset(new ConcurrentCacheReservationManager(
+        std::make_shared<CacheReservationManagerImpl<
+            CacheEntryRole::kBlockBasedTableReader>>(
+            table_options_.block_cache)));
+  }
+}
+
+void BlockBasedTableFactory::InitializeOptions() {
+  if (table_options_.flush_block_policy_factory == nullptr) {
+    table_options_.flush_block_policy_factory.reset(
+        new FlushBlockBySizePolicyFactory());
+  }
+  if (table_options_.no_block_cache) {
+    table_options_.block_cache.reset();
+  } else if (table_options_.block_cache == nullptr) {
+    LRUCacheOptions co;
+    co.capacity = 8 << 20;
+    // It makes little sense to pay overhead for mid-point insertion while the
+    // block size is only 8MB.
+    co.high_pri_pool_ratio = 0.0;
+    co.low_pri_pool_ratio = 0.0;
+    table_options_.block_cache = NewLRUCache(co);
+  }
+  if (table_options_.block_size_deviation < 0 ||
+      table_options_.block_size_deviation > 100) {
+    table_options_.block_size_deviation = 0;
+  }
+  if (table_options_.block_restart_interval < 1) {
+    table_options_.block_restart_interval = 1;
+  }
+  if (table_options_.index_block_restart_interval < 1) {
+    table_options_.index_block_restart_interval = 1;
+  }
+  if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
+      table_options_.index_block_restart_interval != 1) {
+    // Currently kHashSearch is incompatible with
+    // index_block_restart_interval > 1
+    table_options_.index_block_restart_interval = 1;
+  }
+  if (table_options_.partition_filters &&
+      table_options_.index_type !=
+          BlockBasedTableOptions::kTwoLevelIndexSearch) {
+    // We do not support partitioned filters without partitioning indexes
+    table_options_.partition_filters = false;
+  }
+  auto& options_overrides =
+      table_options_.cache_usage_options.options_overrides;
+  const auto options = table_options_.cache_usage_options.options;
+  for (std::uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
+    CacheEntryRole role = static_cast<CacheEntryRole>(i);
+    auto options_overrides_iter = options_overrides.find(role);
+    if (options_overrides_iter == options_overrides.end()) {
+      options_overrides.insert({role, options});
+    } else if (options_overrides_iter->second.charged ==
+               CacheEntryRoleOptions::Decision::kFallback) {
+      options_overrides_iter->second.charged = options.charged;
+    }
+  }
+}
+
+Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) {
+  InitializeOptions();
+  return TableFactory::PrepareOptions(opts);
+}
+
+namespace {
+// Different cache kinds use the same keys for physically different values, so
+// they must not share an underlying key space with each other.
+Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) {
+  int cache_count = (bbto.block_cache != nullptr) +
+                    (bbto.block_cache_compressed != nullptr) +
+                    (bbto.persistent_cache != nullptr);
+  if (cache_count <= 1) {
+    // Nothing to share / overlap
+    return Status::OK();
+  }
+
+  // Simple pointer equality
+  if (bbto.block_cache == bbto.block_cache_compressed) {
+    return Status::InvalidArgument(
+        "block_cache same as block_cache_compressed not currently supported, "
+        "and would be bad for performance anyway");
+  }
+
+  // More complex test of shared key space, in case the instances are wrappers
+  // for some shared underlying cache.
+  CacheKey sentinel_key = CacheKey::CreateUniqueForProcessLifetime();
+  static char kRegularBlockCacheMarker = 'b';
+  static char kCompressedBlockCacheMarker = 'c';
+  static char kPersistentCacheMarker = 'p';
+  if (bbto.block_cache) {
+    bbto.block_cache
+        ->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, 1,
+                 GetNoopDeleterForRole<CacheEntryRole::kMisc>())
+        .PermitUncheckedError();
+  }
+  if (bbto.block_cache_compressed) {
+    bbto.block_cache_compressed
+        ->Insert(sentinel_key.AsSlice(), &kCompressedBlockCacheMarker, 1,
+                 GetNoopDeleterForRole<CacheEntryRole::kMisc>())
+        .PermitUncheckedError();
+  }
+  if (bbto.persistent_cache) {
+    // Note: persistent cache copies the data, not keeping the pointer
+    bbto.persistent_cache
+        ->Insert(sentinel_key.AsSlice(), &kPersistentCacheMarker, 1)
+        .PermitUncheckedError();
+  }
+  // If we get something different from what we inserted, that indicates
+  // dangerously overlapping key spaces.
+  if (bbto.block_cache) {
+    auto handle = bbto.block_cache->Lookup(sentinel_key.AsSlice());
+    if (handle) {
+      auto v = static_cast<char*>(bbto.block_cache->Value(handle));
+      char c = *v;
+      bbto.block_cache->Release(handle);
+      if (v == &kCompressedBlockCacheMarker) {
+        return Status::InvalidArgument(
+            "block_cache and block_cache_compressed share the same key space, "
+            "which is not supported");
+      } else if (c == kPersistentCacheMarker) {
+        return Status::InvalidArgument(
+            "block_cache and persistent_cache share the same key space, "
+            "which is not supported");
+      } else if (v != &kRegularBlockCacheMarker) {
+        return Status::Corruption("Unexpected mutation to block_cache");
+      }
+    }
+  }
+  if (bbto.block_cache_compressed) {
+    auto handle = bbto.block_cache_compressed->Lookup(sentinel_key.AsSlice());
+    if (handle) {
+      auto v = static_cast<char*>(bbto.block_cache_compressed->Value(handle));
+      char c = *v;
+      bbto.block_cache_compressed->Release(handle);
+      if (v == &kRegularBlockCacheMarker) {
+        return Status::InvalidArgument(
+            "block_cache_compressed and block_cache share the same key space, "
+            "which is not supported");
+      } else if (c == kPersistentCacheMarker) {
+        return Status::InvalidArgument(
+            "block_cache_compressed and persistent_cache share the same key "
+            "space, "
+            "which is not supported");
+      } else if (v != &kCompressedBlockCacheMarker) {
+        return Status::Corruption(
+            "Unexpected mutation to block_cache_compressed");
+      }
+    }
+  }
+  if (bbto.persistent_cache) {
+    std::unique_ptr<char[]> data;
+    size_t size = 0;
+    bbto.persistent_cache->Lookup(sentinel_key.AsSlice(), &data, &size)
+        .PermitUncheckedError();
+    if (data && size > 0) {
+      if (data[0] == kRegularBlockCacheMarker) {
+        return Status::InvalidArgument(
+            "persistent_cache and block_cache share the same key space, "
+            "which is not supported");
+      } else if (data[0] == kCompressedBlockCacheMarker) {
+        return Status::InvalidArgument(
+            "persistent_cache and block_cache_compressed share the same key "
+            "space, "
+            "which is not supported");
+      } else if (data[0] != kPersistentCacheMarker) {
+        return Status::Corruption("Unexpected mutation to persistent_cache");
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status BlockBasedTableFactory::NewTableReader(
+    const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table_reader,
+    bool prefetch_index_and_filter_in_cache) const {
+  return BlockBasedTable::Open(
+      ro, table_reader_options.ioptions, table_reader_options.env_options,
+      table_options_, table_reader_options.internal_comparator, std::move(file),
+      file_size, table_reader, table_reader_cache_res_mgr_,
+      table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
+      table_reader_options.skip_filters, table_reader_options.level,
+      table_reader_options.immortal, table_reader_options.largest_seqno,
+      table_reader_options.force_direct_prefetch, &tail_prefetch_stats_,
+      table_reader_options.block_cache_tracer,
+      table_reader_options.max_file_size_for_l0_meta_pin,
+      table_reader_options.cur_db_session_id, table_reader_options.cur_file_num,
+      table_reader_options.unique_id);
+}
+
+TableBuilder* BlockBasedTableFactory::NewTableBuilder(
+    const TableBuilderOptions& table_builder_options,
+    WritableFileWriter* file) const {
+  return new BlockBasedTableBuilder(table_options_, table_builder_options,
+                                    file);
+}
+
+Status BlockBasedTableFactory::ValidateOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (table_options_.index_type == BlockBasedTableOptions::kHashSearch &&
+      cf_opts.prefix_extractor == nullptr) {
+    return Status::InvalidArgument(
+        "Hash index is specified for block-based "
+        "table, but prefix_extractor is not given");
+  }
+  if (table_options_.cache_index_and_filter_blocks &&
+      table_options_.no_block_cache) {
+    return Status::InvalidArgument(
+        "Enable cache_index_and_filter_blocks, "
+        ", but block cache is disabled");
+  }
+  if (table_options_.pin_l0_filter_and_index_blocks_in_cache &&
+      table_options_.no_block_cache) {
+    return Status::InvalidArgument(
+        "Enable pin_l0_filter_and_index_blocks_in_cache, "
+        ", but block cache is disabled");
+  }
+  if (!IsSupportedFormatVersion(table_options_.format_version)) {
+    return Status::InvalidArgument(
+        "Unsupported BlockBasedTable format_version. Please check "
+        "include/rocksdb/table.h for more info");
+  }
+  if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
+    return Status::InvalidArgument(
+        "Enable block_align, but compression "
+        "enabled");
+  }
+  if (table_options_.block_align &&
+      (table_options_.block_size & (table_options_.block_size - 1))) {
+    return Status::InvalidArgument(
+        "Block alignment requested but block size is not a power of 2");
+  }
+  if (table_options_.block_size > std::numeric_limits<uint32_t>::max()) {
+    return Status::InvalidArgument(
+        "block size exceeds maximum number (4GiB) allowed");
+  }
+  if (table_options_.data_block_index_type ==
+          BlockBasedTableOptions::kDataBlockBinaryAndHash &&
+      table_options_.data_block_hash_table_util_ratio <= 0) {
+    return Status::InvalidArgument(
+        "data_block_hash_table_util_ratio should be greater than 0 when "
+        "data_block_index_type is set to kDataBlockBinaryAndHash");
+  }
+  if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
+    // TODO(myabandeh): support it
+    return Status::InvalidArgument(
+        "max_successive_merges larger than 0 is currently inconsistent with "
+        "unordered_write");
+  }
+  const auto& options_overrides =
+      table_options_.cache_usage_options.options_overrides;
+  for (auto options_overrides_iter = options_overrides.cbegin();
+       options_overrides_iter != options_overrides.cend();
+       ++options_overrides_iter) {
+    const CacheEntryRole role = options_overrides_iter->first;
+    const CacheEntryRoleOptions options = options_overrides_iter->second;
+    static const std::set<CacheEntryRole> kMemoryChargingSupported = {
+        CacheEntryRole::kCompressionDictionaryBuildingBuffer,
+        CacheEntryRole::kFilterConstruction,
+        CacheEntryRole::kBlockBasedTableReader, CacheEntryRole::kFileMetadata,
+        CacheEntryRole::kBlobCache};
+    if (options.charged != CacheEntryRoleOptions::Decision::kFallback &&
+        kMemoryChargingSupported.count(role) == 0) {
+      return Status::NotSupported(
+          "Enable/Disable CacheEntryRoleOptions::charged"
+          " for CacheEntryRole " +
+          kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
+          " is not supported");
+    }
+    if (table_options_.no_block_cache &&
+        options.charged == CacheEntryRoleOptions::Decision::kEnabled) {
+      return Status::InvalidArgument(
+          "Enable CacheEntryRoleOptions::charged"
+          " for CacheEntryRole " +
+          kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
+          " but block cache is disabled");
+    }
+    if (role == CacheEntryRole::kBlobCache &&
+        options.charged == CacheEntryRoleOptions::Decision::kEnabled) {
+      if (cf_opts.blob_cache == nullptr) {
+        return Status::InvalidArgument(
+            "Enable CacheEntryRoleOptions::charged"
+            " for CacheEntryRole " +
+            kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
+            " but blob cache is not configured");
+      }
+      if (table_options_.no_block_cache) {
+        return Status::InvalidArgument(
+            "Enable CacheEntryRoleOptions::charged"
+            " for CacheEntryRole " +
+            kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
+            " but block cache is disabled");
+      }
+      if (table_options_.block_cache == cf_opts.blob_cache) {
+        return Status::InvalidArgument(
+            "Enable CacheEntryRoleOptions::charged"
+            " for CacheEntryRole " +
+            kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
+            " but blob cache is the same as block cache");
+      }
+      if (cf_opts.blob_cache->GetCapacity() >
+          table_options_.block_cache->GetCapacity()) {
+        return Status::InvalidArgument(
+            "Enable CacheEntryRoleOptions::charged"
+            " for CacheEntryRole " +
+            kCacheEntryRoleToCamelString[static_cast<uint32_t>(role)] +
+            " but blob cache capacity is larger than block cache capacity");
+      }
+    }
+  }
+  {
+    Status s = CheckCacheOptionCompatibility(table_options_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  std::string garbage;
+  if (!SerializeEnum<ChecksumType>(checksum_type_string_map,
+                                   table_options_.checksum, &garbage)) {
+    return Status::InvalidArgument(
+        "Unrecognized ChecksumType for checksum: " +
+        std::to_string(static_cast<uint32_t>(table_options_.checksum)));
+  }
+  return TableFactory::ValidateOptions(db_opts, cf_opts);
+}
+
+std::string BlockBasedTableFactory::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "  flush_block_policy_factory: %s (%p)\n",
+           table_options_.flush_block_policy_factory->Name(),
+           static_cast<void*>(table_options_.flush_block_policy_factory.get()));
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  cache_index_and_filter_blocks: %d\n",
+           table_options_.cache_index_and_filter_blocks);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  cache_index_and_filter_blocks_with_high_priority: %d\n",
+           table_options_.cache_index_and_filter_blocks_with_high_priority);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  pin_l0_filter_and_index_blocks_in_cache: %d\n",
+           table_options_.pin_l0_filter_and_index_blocks_in_cache);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  pin_top_level_index_and_filter: %d\n",
+           table_options_.pin_top_level_index_and_filter);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  index_type: %d\n",
+           table_options_.index_type);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  data_block_index_type: %d\n",
+           table_options_.data_block_index_type);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  index_shortening: %d\n",
+           static_cast<int>(table_options_.index_shortening));
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  data_block_hash_table_util_ratio: %lf\n",
+           table_options_.data_block_hash_table_util_ratio);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  checksum: %d\n", table_options_.checksum);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  no_block_cache: %d\n",
+           table_options_.no_block_cache);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  block_cache: %p\n",
+           static_cast<void*>(table_options_.block_cache.get()));
+  ret.append(buffer);
+  if (table_options_.block_cache) {
+    const char* block_cache_name = table_options_.block_cache->Name();
+    if (block_cache_name != nullptr) {
+      snprintf(buffer, kBufferSize, "  block_cache_name: %s\n",
+               block_cache_name);
+      ret.append(buffer);
+    }
+    ret.append("  block_cache_options:\n");
+    ret.append(table_options_.block_cache->GetPrintableOptions());
+  }
+  snprintf(buffer, kBufferSize, "  block_cache_compressed: %p\n",
+           static_cast<void*>(table_options_.block_cache_compressed.get()));
+  ret.append(buffer);
+  if (table_options_.block_cache_compressed) {
+    const char* block_cache_compressed_name =
+        table_options_.block_cache_compressed->Name();
+    if (block_cache_compressed_name != nullptr) {
+      snprintf(buffer, kBufferSize, "  block_cache_name: %s\n",
+               block_cache_compressed_name);
+      ret.append(buffer);
+    }
+    ret.append("  block_cache_compressed_options:\n");
+    ret.append(table_options_.block_cache_compressed->GetPrintableOptions());
+  }
+  snprintf(buffer, kBufferSize, "  persistent_cache: %p\n",
+           static_cast<void*>(table_options_.persistent_cache.get()));
+  ret.append(buffer);
+  if (table_options_.persistent_cache) {
+    snprintf(buffer, kBufferSize, "  persistent_cache_options:\n");
+    ret.append(buffer);
+    ret.append(table_options_.persistent_cache->GetPrintableOptions());
+  }
+  snprintf(buffer, kBufferSize, "  block_size: %" PRIu64 "\n",
+           table_options_.block_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  block_size_deviation: %d\n",
+           table_options_.block_size_deviation);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  block_restart_interval: %d\n",
+           table_options_.block_restart_interval);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  index_block_restart_interval: %d\n",
+           table_options_.index_block_restart_interval);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  metadata_block_size: %" PRIu64 "\n",
+           table_options_.metadata_block_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  partition_filters: %d\n",
+           table_options_.partition_filters);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  use_delta_encoding: %d\n",
+           table_options_.use_delta_encoding);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  filter_policy: %s\n",
+           table_options_.filter_policy == nullptr
+               ? "nullptr"
+               : table_options_.filter_policy->Name());
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  whole_key_filtering: %d\n",
+           table_options_.whole_key_filtering);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  verify_compression: %d\n",
+           table_options_.verify_compression);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  read_amp_bytes_per_bit: %d\n",
+           table_options_.read_amp_bytes_per_bit);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  format_version: %d\n",
+           table_options_.format_version);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  enable_index_compression: %d\n",
+           table_options_.enable_index_compression);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  block_align: %d\n",
+           table_options_.block_align);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.max_auto_readahead_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  prepopulate_block_cache: %d\n",
+           static_cast<int>(table_options_.prepopulate_block_cache));
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  initial_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.initial_auto_readahead_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  num_file_reads_for_auto_readahead: %" PRIu64 "\n",
+           table_options_.num_file_reads_for_auto_readahead);
+  ret.append(buffer);
+  return ret;
+}
+
+const void* BlockBasedTableFactory::GetOptionsPtr(
+    const std::string& name) const {
+  if (name == kBlockCacheOpts()) {
+    if (table_options_.no_block_cache) {
+      return nullptr;
+    } else {
+      return table_options_.block_cache.get();
+    }
+  } else {
+    return TableFactory::GetOptionsPtr(name);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+// Take a default BlockBasedTableOptions "table_options" in addition to a
+// map "opts_map" of option name to option value to construct the new
+// BlockBasedTableOptions "new_table_options".
+//
+// Below are the instructions of how to config some non-primitive-typed
+// options in BlockBasedTableOptions:
+//
+// * filter_policy:
+//   We currently only support the following FilterPolicy in the convenience
+//   functions:
+//   - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]"
+//     to specify BloomFilter.  The above string is equivalent to calling
+//     NewBloomFilterPolicy(bits_per_key, use_block_based_builder).
+//     [Example]:
+//     - Pass {"filter_policy", "bloomfilter:4:true"} in
+//       GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits
+//       per key and use_block_based_builder enabled.
+//
+// * block_cache / block_cache_compressed:
+//   We currently only support LRU cache in the GetOptions API.  The LRU
+//   cache can be set by directly specifying its size.
+//   [Example]:
+//   - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is
+//     equivalent to setting block_cache using NewLRUCache(1024 * 1024).
+//
+// @param table_options the default options of the output "new_table_options".
+// @param opts_map an option name to value map for specifying how
+//     "new_table_options" should be set.
+// @param new_table_options the resulting options based on "table_options"
+//     with the change specified in "opts_map".
+// @param input_strings_escaped when set to true, each escaped characters
+//     prefixed by '\' in the values of the opts_map will be further converted
+//     back to the raw string before assigning to the associated options.
+// @param ignore_unknown_options when set to true, unknown options are ignored
+//     instead of resulting in an unknown-option error.
+// @return Status::OK() on success.  Otherwise, a non-ok status indicating
+//     error will be returned, and "new_table_options" will be set to
+//     "table_options".
+Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options,
+                                           const OptionTypeInfo& opt_info,
+                                           const std::string& opt_name,
+                                           const std::string& opt_value,
+                                           void* opt_ptr) {
+  Status status = TableFactory::ParseOption(config_options, opt_info, opt_name,
+                                            opt_value, opt_ptr);
+  if (config_options.input_strings_escaped && !status.ok()) {  // Got an error
+    // !input_strings_escaped indicates the old API, where everything is
+    // parsable.
+    if (opt_info.IsByName()) {
+      status = Status::OK();
+    }
+  }
+  return status;
+}
+
+Status GetBlockBasedTableOptionsFromString(
+    const BlockBasedTableOptions& table_options, const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options) {
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+  config_options.invoke_prepare_options = false;
+  config_options.ignore_unsupported_options = false;
+
+  return GetBlockBasedTableOptionsFromString(config_options, table_options,
+                                             opts_str, new_table_options);
+}
+Status GetBlockBasedTableOptionsFromString(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options, const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    return s;
+  }
+  s = GetBlockBasedTableOptionsFromMap(config_options, table_options, opts_map,
+                                       new_table_options);
+  // Translate any errors (NotFound, NotSupported, to InvalidArgument
+  if (s.ok() || s.IsInvalidArgument()) {
+    return s;
+  } else {
+    return Status::InvalidArgument(s.getState());
+  }
+}
+
+Status GetBlockBasedTableOptionsFromMap(
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options, bool input_strings_escaped,
+    bool ignore_unknown_options) {
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = input_strings_escaped;
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  config_options.invoke_prepare_options = false;
+
+  return GetBlockBasedTableOptionsFromMap(config_options, table_options,
+                                          opts_map, new_table_options);
+}
+
+Status GetBlockBasedTableOptionsFromMap(
+    const ConfigOptions& config_options,
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options) {
+  assert(new_table_options);
+  BlockBasedTableFactory bbtf(table_options);
+  Status s = bbtf.ConfigureFromMap(config_options, opts_map);
+  if (s.ok()) {
+    *new_table_options = *(bbtf.GetOptions<BlockBasedTableOptions>());
+  } else {
+    *new_table_options = table_options;
+  }
+  return s;
+}
+#endif  // !ROCKSDB_LITE
+
+TableFactory* NewBlockBasedTableFactory(
+    const BlockBasedTableOptions& _table_options) {
+  return new BlockBasedTableFactory(_table_options);
+}
+
+const std::string BlockBasedTablePropertyNames::kIndexType =
+    "rocksdb.block.based.table.index.type";
+const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
+    "rocksdb.block.based.table.whole.key.filtering";
+const std::string BlockBasedTablePropertyNames::kPrefixFiltering =
+    "rocksdb.block.based.table.prefix.filtering";
+const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
+const std::string kHashIndexPrefixesMetadataBlock =
+    "rocksdb.hashindex.metadata";
+const std::string kPropTrue = "1";
+const std::string kPropFalse = "0";
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_factory.h b/src/rocksdb/table/block_based/block_based_table_factory.h
new file mode 100644
index 000000000..3166cd3cc
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_factory.h
@@ -0,0 +1,101 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "cache/cache_reservation_manager.h"
+#include "port/port.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct ColumnFamilyOptions;
+struct ConfigOptions;
+struct DBOptions;
+struct EnvOptions;
+
+class BlockBasedTableBuilder;
+class RandomAccessFileReader;
+class WritableFileWriter;
+
+// A class used to track actual bytes written from the tail in the recent SST
+// file opens, and provide a suggestion for following open.
+class TailPrefetchStats {
+ public:
+  void RecordEffectiveSize(size_t len);
+  // 0 indicates no information to determine.
+  size_t GetSuggestedPrefetchSize();
+
+ private:
+  const static size_t kNumTracked = 32;
+  size_t records_[kNumTracked];
+  port::Mutex mutex_;
+  size_t next_ = 0;
+  size_t num_records_ = 0;
+};
+
+class BlockBasedTableFactory : public TableFactory {
+ public:
+  explicit BlockBasedTableFactory(
+      const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
+
+  ~BlockBasedTableFactory() {}
+
+  // Method to allow CheckedCast to work for this class
+  static const char* kClassName() { return kBlockBasedTableName(); }
+
+  const char* Name() const override { return kBlockBasedTableName(); }
+
+  using TableFactory::NewTableReader;
+  Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      bool prefetch_index_and_filter_in_cache = true) const override;
+
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const override;
+
+  // Valdates the specified DB Options.
+  Status ValidateOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override;
+  Status PrepareOptions(const ConfigOptions& opts) override;
+
+  std::string GetPrintableOptions() const override;
+
+  bool IsDeleteRangeSupported() const override { return true; }
+
+  TailPrefetchStats* tail_prefetch_stats() { return &tail_prefetch_stats_; }
+
+ protected:
+  const void* GetOptionsPtr(const std::string& name) const override;
+#ifndef ROCKSDB_LITE
+  Status ParseOption(const ConfigOptions& config_options,
+                     const OptionTypeInfo& opt_info,
+                     const std::string& opt_name, const std::string& opt_value,
+                     void* opt_ptr) override;
+#endif
+  void InitializeOptions();
+
+ private:
+  BlockBasedTableOptions table_options_;
+  std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr_;
+  mutable TailPrefetchStats tail_prefetch_stats_;
+};
+
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+extern const std::string kPropTrue;
+extern const std::string kPropFalse;
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_iterator.cc b/src/rocksdb/table/block_based/block_based_table_iterator.cc
new file mode 100644
index 000000000..d2605670f
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_iterator.cc
@@ -0,0 +1,459 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/block_based_table_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr, false); }
+
+void BlockBasedTableIterator::Seek(const Slice& target) {
+  SeekImpl(&target, true);
+}
+
+void BlockBasedTableIterator::SeekImpl(const Slice* target,
+                                       bool async_prefetch) {
+  bool is_first_pass = true;
+  if (async_read_in_progress_) {
+    AsyncInitDataBlock(false);
+    is_first_pass = false;
+  }
+
+  is_out_of_bound_ = false;
+  is_at_first_key_from_index_ = false;
+  if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
+    ResetDataIter();
+    return;
+  }
+
+  bool need_seek_index = true;
+  if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
+    // Reseek.
+    prev_block_offset_ = index_iter_->value().handle.offset();
+
+    if (target) {
+      // We can avoid an index seek if:
+      // 1. The new seek key is larger than the current key
+      // 2. The new seek key is within the upper bound of the block
+      // Since we don't necessarily know the internal key for either
+      // the current key or the upper bound, we check user keys and
+      // exclude the equality case. Considering internal keys can
+      // improve for the boundary cases, but it would complicate the
+      // code.
+      if (user_comparator_.Compare(ExtractUserKey(*target),
+                                   block_iter_.user_key()) > 0 &&
+          user_comparator_.Compare(ExtractUserKey(*target),
+                                   index_iter_->user_key()) < 0) {
+        need_seek_index = false;
+      }
+    }
+  }
+
+  if (need_seek_index) {
+    if (target) {
+      index_iter_->Seek(*target);
+    } else {
+      index_iter_->SeekToFirst();
+    }
+
+    if (!index_iter_->Valid()) {
+      ResetDataIter();
+      return;
+    }
+  }
+
+  IndexValue v = index_iter_->value();
+  const bool same_block = block_iter_points_to_real_block_ &&
+                          v.handle.offset() == prev_block_offset_;
+
+  if (!v.first_internal_key.empty() && !same_block &&
+      (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
+      allow_unprepared_value_) {
+    // Index contains the first key of the block, and it's >= target.
+    // We can defer reading the block.
+    is_at_first_key_from_index_ = true;
+    // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
+    // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
+    // as that will be done later when the data block is actually read.
+    ResetDataIter();
+  } else {
+    // Need to use the data block.
+    if (!same_block) {
+      if (read_options_.async_io && async_prefetch) {
+        if (is_first_pass) {
+          AsyncInitDataBlock(is_first_pass);
+        }
+        if (async_read_in_progress_) {
+          // Status::TryAgain indicates asynchronous request for retrieval of
+          // data blocks has been submitted. So it should return at this point
+          // and Seek should be called again to retrieve the requested block and
+          // execute the remaining code.
+          return;
+        }
+      } else {
+        InitDataBlock();
+      }
+    } else {
+      // When the user does a reseek, the iterate_upper_bound might have
+      // changed. CheckDataBlockWithinUpperBound() needs to be called
+      // explicitly if the reseek ends up in the same data block.
+      // If the reseek ends up in a different block, InitDataBlock() will do
+      // the iterator upper bound check.
+      CheckDataBlockWithinUpperBound();
+    }
+
+    if (target) {
+      block_iter_.Seek(*target);
+    } else {
+      block_iter_.SeekToFirst();
+    }
+    FindKeyForward();
+  }
+
+  CheckOutOfBound();
+
+  if (target) {
+    assert(!Valid() || icomp_.Compare(*target, key()) <= 0);
+  }
+}
+
+void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
+  is_out_of_bound_ = false;
+  is_at_first_key_from_index_ = false;
+  // For now totally disable prefix seek in auto prefix mode because we don't
+  // have logic
+  if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
+    ResetDataIter();
+    return;
+  }
+
+  SavePrevIndexValue();
+
+  // Call Seek() rather than SeekForPrev() in the index block, because the
+  // target data block will likely to contain the position for `target`, the
+  // same as Seek(), rather than than before.
+  // For example, if we have three data blocks, each containing two keys:
+  //   [2, 4]  [6, 8] [10, 12]
+  //  (the keys in the index block would be [4, 8, 12])
+  // and the user calls SeekForPrev(7), we need to go to the second block,
+  // just like if they call Seek(7).
+  // The only case where the block is difference is when they seek to a position
+  // in the boundary. For example, if they SeekForPrev(5), we should go to the
+  // first block, rather than the second. However, we don't have the information
+  // to distinguish the two unless we read the second block. In this case, we'll
+  // end up with reading two blocks.
+  index_iter_->Seek(target);
+
+  if (!index_iter_->Valid()) {
+    auto seek_status = index_iter_->status();
+    // Check for IO error
+    if (!seek_status.IsNotFound() && !seek_status.ok()) {
+      ResetDataIter();
+      return;
+    }
+
+    // With prefix index, Seek() returns NotFound if the prefix doesn't exist
+    if (seek_status.IsNotFound()) {
+      // Any key less than the target is fine for prefix seek
+      ResetDataIter();
+      return;
+    } else {
+      index_iter_->SeekToLast();
+    }
+    // Check for IO error
+    if (!index_iter_->Valid()) {
+      ResetDataIter();
+      return;
+    }
+  }
+
+  InitDataBlock();
+
+  block_iter_.SeekForPrev(target);
+
+  FindKeyBackward();
+  CheckDataBlockWithinUpperBound();
+  assert(!block_iter_.Valid() ||
+         icomp_.Compare(target, block_iter_.key()) >= 0);
+}
+
+void BlockBasedTableIterator::SeekToLast() {
+  is_out_of_bound_ = false;
+  is_at_first_key_from_index_ = false;
+  SavePrevIndexValue();
+  index_iter_->SeekToLast();
+  if (!index_iter_->Valid()) {
+    ResetDataIter();
+    return;
+  }
+  InitDataBlock();
+  block_iter_.SeekToLast();
+  FindKeyBackward();
+  CheckDataBlockWithinUpperBound();
+}
+
+void BlockBasedTableIterator::Next() {
+  if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
+    return;
+  }
+  assert(block_iter_points_to_real_block_);
+  block_iter_.Next();
+  FindKeyForward();
+  CheckOutOfBound();
+}
+
+bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) {
+  Next();
+  bool is_valid = Valid();
+  if (is_valid) {
+    result->key = key();
+    result->bound_check_result = UpperBoundCheckResult();
+    result->value_prepared = !is_at_first_key_from_index_;
+  }
+  return is_valid;
+}
+
+void BlockBasedTableIterator::Prev() {
+  if (is_at_first_key_from_index_) {
+    is_at_first_key_from_index_ = false;
+
+    index_iter_->Prev();
+    if (!index_iter_->Valid()) {
+      return;
+    }
+
+    InitDataBlock();
+    block_iter_.SeekToLast();
+  } else {
+    assert(block_iter_points_to_real_block_);
+    block_iter_.Prev();
+  }
+
+  FindKeyBackward();
+}
+
+void BlockBasedTableIterator::InitDataBlock() {
+  BlockHandle data_block_handle = index_iter_->value().handle;
+  if (!block_iter_points_to_real_block_ ||
+      data_block_handle.offset() != prev_block_offset_ ||
+      // if previous attempt of reading the block missed cache, try again
+      block_iter_.status().IsIncomplete()) {
+    if (block_iter_points_to_real_block_) {
+      ResetDataIter();
+    }
+    auto* rep = table_->get_rep();
+
+    bool is_for_compaction =
+        lookup_context_.caller == TableReaderCaller::kCompaction;
+    // Prefetch additional data for range scans (iterators).
+    // Implicit auto readahead:
+    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
+    // Explicit user requested readahead:
+    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
+    block_prefetcher_.PrefetchIfNeeded(
+        rep, data_block_handle, read_options_.readahead_size, is_for_compaction,
+        /*no_sequential_checking=*/false, read_options_.rate_limiter_priority);
+    Status s;
+    table_->NewDataBlockIterator<DataBlockIter>(
+        read_options_, data_block_handle, &block_iter_, BlockType::kData,
+        /*get_context=*/nullptr, &lookup_context_,
+        block_prefetcher_.prefetch_buffer(),
+        /*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
+    block_iter_points_to_real_block_ = true;
+    CheckDataBlockWithinUpperBound();
+  }
+}
+
+void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) {
+  BlockHandle data_block_handle = index_iter_->value().handle;
+  bool is_for_compaction =
+      lookup_context_.caller == TableReaderCaller::kCompaction;
+  if (is_first_pass) {
+    if (!block_iter_points_to_real_block_ ||
+        data_block_handle.offset() != prev_block_offset_ ||
+        // if previous attempt of reading the block missed cache, try again
+        block_iter_.status().IsIncomplete()) {
+      if (block_iter_points_to_real_block_) {
+        ResetDataIter();
+      }
+      auto* rep = table_->get_rep();
+      // Prefetch additional data for range scans (iterators).
+      // Implicit auto readahead:
+      //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
+      // Explicit user requested readahead:
+      //   Enabled from the very first IO when ReadOptions.readahead_size is
+      //   set.
+      // In case of async_io with Implicit readahead, block_prefetcher_ will
+      // always the create the prefetch buffer by setting no_sequential_checking
+      // = true.
+      block_prefetcher_.PrefetchIfNeeded(
+          rep, data_block_handle, read_options_.readahead_size,
+          is_for_compaction, /*no_sequential_checking=*/read_options_.async_io,
+          read_options_.rate_limiter_priority);
+
+      Status s;
+      table_->NewDataBlockIterator<DataBlockIter>(
+          read_options_, data_block_handle, &block_iter_, BlockType::kData,
+          /*get_context=*/nullptr, &lookup_context_,
+          block_prefetcher_.prefetch_buffer(),
+          /*for_compaction=*/is_for_compaction, /*async_read=*/true, s);
+
+      if (s.IsTryAgain()) {
+        async_read_in_progress_ = true;
+        return;
+      }
+    }
+  } else {
+    // Second pass will call the Poll to get the data block which has been
+    // requested asynchronously.
+    Status s;
+    table_->NewDataBlockIterator<DataBlockIter>(
+        read_options_, data_block_handle, &block_iter_, BlockType::kData,
+        /*get_context=*/nullptr, &lookup_context_,
+        block_prefetcher_.prefetch_buffer(),
+        /*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
+  }
+  block_iter_points_to_real_block_ = true;
+  CheckDataBlockWithinUpperBound();
+  async_read_in_progress_ = false;
+}
+
+bool BlockBasedTableIterator::MaterializeCurrentBlock() {
+  assert(is_at_first_key_from_index_);
+  assert(!block_iter_points_to_real_block_);
+  assert(index_iter_->Valid());
+
+  is_at_first_key_from_index_ = false;
+  InitDataBlock();
+  assert(block_iter_points_to_real_block_);
+
+  if (!block_iter_.status().ok()) {
+    return false;
+  }
+
+  block_iter_.SeekToFirst();
+
+  if (!block_iter_.Valid() ||
+      icomp_.Compare(block_iter_.key(),
+                     index_iter_->value().first_internal_key) != 0) {
+    block_iter_.Invalidate(Status::Corruption(
+        "first key in index doesn't match first key in block"));
+    return false;
+  }
+
+  return true;
+}
+
+void BlockBasedTableIterator::FindKeyForward() {
+  // This method's code is kept short to make it likely to be inlined.
+
+  assert(!is_out_of_bound_);
+  assert(block_iter_points_to_real_block_);
+
+  if (!block_iter_.Valid()) {
+    // This is the only call site of FindBlockForward(), but it's extracted into
+    // a separate method to keep FindKeyForward() short and likely to be
+    // inlined. When transitioning to a different block, we call
+    // FindBlockForward(), which is much longer and is probably not inlined.
+    FindBlockForward();
+  } else {
+    // This is the fast path that avoids a function call.
+  }
+}
+
+void BlockBasedTableIterator::FindBlockForward() {
+  // TODO the while loop inherits from two-level-iterator. We don't know
+  // whether a block can be empty so it can be replaced by an "if".
+  do {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+    // Whether next data block is out of upper bound, if there is one.
+    const bool next_block_is_out_of_bound =
+        read_options_.iterate_upper_bound != nullptr &&
+        block_iter_points_to_real_block_ &&
+        block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock;
+    assert(!next_block_is_out_of_bound ||
+           user_comparator_.CompareWithoutTimestamp(
+               *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
+               index_iter_->user_key(), /*b_has_ts=*/true) <= 0);
+    ResetDataIter();
+    index_iter_->Next();
+    if (next_block_is_out_of_bound) {
+      // The next block is out of bound. No need to read it.
+      TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
+      // We need to make sure this is not the last data block before setting
+      // is_out_of_bound_, since the index key for the last data block can be
+      // larger than smallest key of the next file on the same level.
+      if (index_iter_->Valid()) {
+        is_out_of_bound_ = true;
+      }
+      return;
+    }
+
+    if (!index_iter_->Valid()) {
+      return;
+    }
+
+    IndexValue v = index_iter_->value();
+
+    if (!v.first_internal_key.empty() && allow_unprepared_value_) {
+      // Index contains the first key of the block. Defer reading the block.
+      is_at_first_key_from_index_ = true;
+      return;
+    }
+
+    InitDataBlock();
+    block_iter_.SeekToFirst();
+  } while (!block_iter_.Valid());
+}
+
+void BlockBasedTableIterator::FindKeyBackward() {
+  while (!block_iter_.Valid()) {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+
+    ResetDataIter();
+    index_iter_->Prev();
+
+    if (index_iter_->Valid()) {
+      InitDataBlock();
+      block_iter_.SeekToLast();
+    } else {
+      return;
+    }
+  }
+
+  // We could have check lower bound here too, but we opt not to do it for
+  // code simplicity.
+}
+
+void BlockBasedTableIterator::CheckOutOfBound() {
+  if (read_options_.iterate_upper_bound != nullptr &&
+      block_upper_bound_check_ != BlockUpperBound::kUpperBoundBeyondCurBlock &&
+      Valid()) {
+    is_out_of_bound_ =
+        user_comparator_.CompareWithoutTimestamp(
+            *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(),
+            /*b_has_ts=*/true) <= 0;
+  }
+}
+
+void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() {
+  if (read_options_.iterate_upper_bound != nullptr &&
+      block_iter_points_to_real_block_) {
+    block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp(
+                                    *read_options_.iterate_upper_bound,
+                                    /*a_has_ts=*/false, index_iter_->user_key(),
+                                    /*b_has_ts=*/true) > 0)
+                                   ? BlockUpperBound::kUpperBoundBeyondCurBlock
+                                   : BlockUpperBound::kUpperBoundInCurBlock;
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_iterator.h b/src/rocksdb/table/block_based/block_based_table_iterator.h
new file mode 100644
index 000000000..a2918b248
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_iterator.h
@@ -0,0 +1,280 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_based_table_reader_impl.h"
+#include "table/block_based/block_prefetcher.h"
+#include "table/block_based/reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Iterates over the contents of BlockBasedTable.
+class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
+  // @param read_options Must outlive this iterator.
+ public:
+  BlockBasedTableIterator(
+      const BlockBasedTable* table, const ReadOptions& read_options,
+      const InternalKeyComparator& icomp,
+      std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter,
+      bool check_filter, bool need_upper_bound_check,
+      const SliceTransform* prefix_extractor, TableReaderCaller caller,
+      size_t compaction_readahead_size = 0, bool allow_unprepared_value = false)
+      : index_iter_(std::move(index_iter)),
+        table_(table),
+        read_options_(read_options),
+        icomp_(icomp),
+        user_comparator_(icomp.user_comparator()),
+        pinned_iters_mgr_(nullptr),
+        prefix_extractor_(prefix_extractor),
+        lookup_context_(caller),
+        block_prefetcher_(
+            compaction_readahead_size,
+            table_->get_rep()->table_options.initial_auto_readahead_size),
+        allow_unprepared_value_(allow_unprepared_value),
+        block_iter_points_to_real_block_(false),
+        check_filter_(check_filter),
+        need_upper_bound_check_(need_upper_bound_check),
+        async_read_in_progress_(false) {}
+
+  ~BlockBasedTableIterator() {}
+
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice& target) override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() final override;
+  bool NextAndGetResult(IterateResult* result) override;
+  void Prev() override;
+  bool Valid() const override {
+    return !is_out_of_bound_ &&
+           (is_at_first_key_from_index_ ||
+            (block_iter_points_to_real_block_ && block_iter_.Valid()));
+  }
+  Slice key() const override {
+    assert(Valid());
+    if (is_at_first_key_from_index_) {
+      return index_iter_->value().first_internal_key;
+    } else {
+      return block_iter_.key();
+    }
+  }
+  Slice user_key() const override {
+    assert(Valid());
+    if (is_at_first_key_from_index_) {
+      return ExtractUserKey(index_iter_->value().first_internal_key);
+    } else {
+      return block_iter_.user_key();
+    }
+  }
+  bool PrepareValue() override {
+    assert(Valid());
+
+    if (!is_at_first_key_from_index_) {
+      return true;
+    }
+
+    return const_cast<BlockBasedTableIterator*>(this)
+        ->MaterializeCurrentBlock();
+  }
+  Slice value() const override {
+    // PrepareValue() must have been called.
+    assert(!is_at_first_key_from_index_);
+    assert(Valid());
+
+    return block_iter_.value();
+  }
+  Status status() const override {
+    // Prefix index set status to NotFound when the prefix does not exist
+    if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
+      return index_iter_->status();
+    } else if (block_iter_points_to_real_block_) {
+      return block_iter_.status();
+    } else if (async_read_in_progress_) {
+      return Status::TryAgain();
+    } else {
+      return Status::OK();
+    }
+  }
+
+  inline IterBoundCheck UpperBoundCheckResult() override {
+    if (is_out_of_bound_) {
+      return IterBoundCheck::kOutOfBound;
+    } else if (block_upper_bound_check_ ==
+               BlockUpperBound::kUpperBoundBeyondCurBlock) {
+      assert(!is_out_of_bound_);
+      return IterBoundCheck::kInbound;
+    } else {
+      return IterBoundCheck::kUnknown;
+    }
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+  }
+  bool IsKeyPinned() const override {
+    // Our key comes either from block_iter_'s current key
+    // or index_iter_'s current *value*.
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) ||
+            (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned()));
+  }
+  bool IsValuePinned() const override {
+    assert(!is_at_first_key_from_index_);
+    assert(Valid());
+
+    // BlockIter::IsValuePinned() is always true. No need to check
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           block_iter_points_to_real_block_;
+  }
+
+  void ResetDataIter() {
+    if (block_iter_points_to_real_block_) {
+      if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
+        block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
+      }
+      block_iter_.Invalidate(Status::OK());
+      block_iter_points_to_real_block_ = false;
+    }
+    block_upper_bound_check_ = BlockUpperBound::kUnknown;
+  }
+
+  void SavePrevIndexValue() {
+    if (block_iter_points_to_real_block_) {
+      // Reseek. If they end up with the same data block, we shouldn't re-fetch
+      // the same data block.
+      prev_block_offset_ = index_iter_->value().handle.offset();
+    }
+  }
+
+  void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
+    if (block_prefetcher_.prefetch_buffer() != nullptr &&
+        read_options_.adaptive_readahead) {
+      block_prefetcher_.prefetch_buffer()->GetReadaheadState(
+          &(readahead_file_info->data_block_readahead_info));
+      if (index_iter_) {
+        index_iter_->GetReadaheadState(readahead_file_info);
+      }
+    }
+  }
+
+  void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
+    if (read_options_.adaptive_readahead) {
+      block_prefetcher_.SetReadaheadState(
+          &(readahead_file_info->data_block_readahead_info));
+      if (index_iter_) {
+        index_iter_->SetReadaheadState(readahead_file_info);
+      }
+    }
+  }
+
+  std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
+
+ private:
+  enum class IterDirection {
+    kForward,
+    kBackward,
+  };
+  // This enum indicates whether the upper bound falls into current block
+  // or beyond.
+  //   +-------------+
+  //   |  cur block  |       <-- (1)
+  //   +-------------+
+  //                         <-- (2)
+  //  --- <boundary key> ---
+  //                         <-- (3)
+  //   +-------------+
+  //   |  next block |       <-- (4)
+  //        ......
+  //
+  // When the block is smaller than <boundary key>, kUpperBoundInCurBlock
+  // is the value to use. The examples are (1) or (2) in the graph. It means
+  // all keys in the next block or beyond will be out of bound. Keys within
+  // the current block may or may not be out of bound.
+  // When the block is larger or equal to <boundary key>,
+  // kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4)
+  // in the graph. It means that all keys in the current block is within the
+  // upper bound and keys in the next block may or may not be within the uppder
+  // bound.
+  // If the boundary key hasn't been checked against the upper bound,
+  // kUnknown can be used.
+  enum class BlockUpperBound {
+    kUpperBoundInCurBlock,
+    kUpperBoundBeyondCurBlock,
+    kUnknown,
+  };
+
+  const BlockBasedTable* table_;
+  const ReadOptions& read_options_;
+  const InternalKeyComparator& icomp_;
+  UserComparatorWrapper user_comparator_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  DataBlockIter block_iter_;
+  const SliceTransform* prefix_extractor_;
+  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
+  BlockCacheLookupContext lookup_context_;
+
+  BlockPrefetcher block_prefetcher_;
+
+  const bool allow_unprepared_value_;
+  // True if block_iter_ is initialized and points to the same block
+  // as index iterator.
+  bool block_iter_points_to_real_block_;
+  // See InternalIteratorBase::IsOutOfBound().
+  bool is_out_of_bound_ = false;
+  // How current data block's boundary key with the next block is compared with
+  // iterate upper bound.
+  BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown;
+  // True if we're standing at the first key of a block, and we haven't loaded
+  // that block yet. A call to PrepareValue() will trigger loading the block.
+  bool is_at_first_key_from_index_ = false;
+  bool check_filter_;
+  // TODO(Zhongyi): pick a better name
+  bool need_upper_bound_check_;
+
+  bool async_read_in_progress_;
+
+  // If `target` is null, seek to first.
+  void SeekImpl(const Slice* target, bool async_prefetch);
+
+  void InitDataBlock();
+  void AsyncInitDataBlock(bool is_first_pass);
+  bool MaterializeCurrentBlock();
+  void FindKeyForward();
+  void FindBlockForward();
+  void FindKeyBackward();
+  void CheckOutOfBound();
+
+  // Check if data block is fully within iterate_upper_bound.
+  //
+  // Note MyRocks may update iterate bounds between seek. To workaround it,
+  // we need to check and update data_block_within_upper_bound_ accordingly.
+  void CheckDataBlockWithinUpperBound();
+
+  bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
+    if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
+      // Upper bound check isn't sufficient for backward direction to
+      // guarantee the same result as total order, so disable prefix
+      // check.
+      return true;
+    }
+    if (check_filter_ && !table_->PrefixRangeMayMatch(
+                             ikey, read_options_, prefix_extractor_,
+                             need_upper_bound_check_, &lookup_context_)) {
+      // TODO remember the iterator is invalidated because of prefix
+      // match. This can avoid the upper level file iterator to falsely
+      // believe the position is the end of the SST file and move to
+      // the first key of the next file.
+      ResetDataIter();
+      return false;
+    }
+    return true;
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_reader.cc b/src/rocksdb/table/block_based/block_based_table_reader.cc
new file mode 100644
index 000000000..43962ba1d
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_reader.cc
@@ -0,0 +1,3092 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/block_based_table_reader.h"
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "file/file_prefetch_buffer.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/lang.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/trace_record.h"
+#include "table/block_based/binary_search_index_reader.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_iterator.h"
+#include "table/block_based/block_like_traits.h"
+#include "table/block_based/block_prefix_index.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/hash_index_reader.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/block_based/partitioned_index_reader.h"
+#include "table/block_fetcher.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/multiget_context.h"
+#include "table/persistent_cache_helper.h"
+#include "table/persistent_cache_options.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
+  CacheAllocationPtr heap_buf;
+  heap_buf = AllocateBlock(buf.size(), allocator);
+  memcpy(heap_buf.get(), buf.data(), buf.size());
+  return heap_buf;
+}
+}  // namespace
+}  // namespace ROCKSDB_NAMESPACE
+
+// Generate the regular and coroutine versions of some methods by
+// including block_based_table_reader_sync_and_async.h twice
+// Macros in the header will expand differently based on whether
+// WITH_COROUTINES or WITHOUT_COROUTINES is defined
+// clang-format off
+#define WITHOUT_COROUTINES
+#include "table/block_based/block_based_table_reader_sync_and_async.h"
+#undef WITHOUT_COROUTINES
+#define WITH_COROUTINES
+#include "table/block_based/block_based_table_reader_sync_and_async.h"
+#undef WITH_COROUTINES
+// clang-format on
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
+
+BlockBasedTable::~BlockBasedTable() { delete rep_; }
+
+namespace {
+// Read the block identified by "handle" from "file".
+// The only relevant option is options.verify_checksums for now.
+// On failure return non-OK.
+// On success fill *result and return OK - caller owns *result
+// @param uncompression_dict Data for presetting the compression library's
+//    dictionary.
+template <typename TBlocklike>
+Status ReadBlockFromFile(
+    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
+    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
+    std::unique_ptr<TBlocklike>* result, const ImmutableOptions& ioptions,
+    bool do_uncompress, bool maybe_compressed, BlockType block_type,
+    const UncompressionDict& uncompression_dict,
+    const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit,
+    MemoryAllocator* memory_allocator, bool for_compaction, bool using_zstd,
+    const FilterPolicy* filter_policy, bool async_read) {
+  assert(result);
+
+  BlockContents contents;
+  BlockFetcher block_fetcher(
+      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
+      do_uncompress, maybe_compressed, block_type, uncompression_dict,
+      cache_options, memory_allocator, nullptr, for_compaction);
+  Status s;
+  // If prefetch_buffer is not allocated, it will fallback to synchronous
+  // reading of block contents.
+  if (async_read && prefetch_buffer != nullptr) {
+    s = block_fetcher.ReadAsyncBlockContents();
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    s = block_fetcher.ReadBlockContents();
+  }
+  if (s.ok()) {
+    result->reset(BlocklikeTraits<TBlocklike>::Create(
+        std::move(contents), read_amp_bytes_per_bit, ioptions.stats, using_zstd,
+        filter_policy));
+  }
+
+  return s;
+}
+
+// For hash based index, return false if table_properties->prefix_extractor_name
+// and prefix_extractor both exist and match, otherwise true.
+inline bool PrefixExtractorChangedHelper(
+    const TableProperties* table_properties,
+    const SliceTransform* prefix_extractor) {
+  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
+  // Turn off hash index in prefix_extractor is not set; if  prefix_extractor
+  // is set but prefix_extractor_block is not set, also disable hash index
+  if (prefix_extractor == nullptr || table_properties == nullptr ||
+      table_properties->prefix_extractor_name.empty()) {
+    return true;
+  }
+
+  // prefix_extractor and prefix_extractor_block are both non-empty
+  if (table_properties->prefix_extractor_name != prefix_extractor->AsString()) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace
+
+void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
+                                            GetContext* get_context,
+                                            size_t usage) const {
+  Statistics* const statistics = rep_->ioptions.stats;
+
+  PERF_COUNTER_ADD(block_cache_hit_count, 1);
+  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
+                            static_cast<uint32_t>(rep_->level));
+
+  if (get_context) {
+    ++get_context->get_context_stats_.num_cache_hit;
+    get_context->get_context_stats_.num_cache_bytes_read += usage;
+  } else {
+    RecordTick(statistics, BLOCK_CACHE_HIT);
+    RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
+  }
+
+  switch (block_type) {
+    case BlockType::kFilter:
+    case BlockType::kFilterPartitionIndex:
+      PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);
+
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_filter_hit;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
+      }
+      break;
+
+    case BlockType::kCompressionDictionary:
+      // TODO: introduce perf counter for compression dictionary hit count
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_compression_dict_hit;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+      }
+      break;
+
+    case BlockType::kIndex:
+      PERF_COUNTER_ADD(block_cache_index_hit_count, 1);
+
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_index_hit;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
+      }
+      break;
+
+    default:
+      // TODO: introduce dedicated tickers/statistics/counters
+      // for range tombstones
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_data_hit;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
+      }
+      break;
+  }
+}
+
+void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
+                                             GetContext* get_context) const {
+  Statistics* const statistics = rep_->ioptions.stats;
+
+  // TODO: introduce aggregate (not per-level) block cache miss count
+  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
+                            static_cast<uint32_t>(rep_->level));
+
+  if (get_context) {
+    ++get_context->get_context_stats_.num_cache_miss;
+  } else {
+    RecordTick(statistics, BLOCK_CACHE_MISS);
+  }
+
+  // TODO: introduce perf counters for misses per block type
+  switch (block_type) {
+    case BlockType::kFilter:
+    case BlockType::kFilterPartitionIndex:
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_filter_miss;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
+      }
+      break;
+
+    case BlockType::kCompressionDictionary:
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_compression_dict_miss;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+      }
+      break;
+
+    case BlockType::kIndex:
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_index_miss;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
+      }
+      break;
+
+    default:
+      // TODO: introduce dedicated tickers/statistics/counters
+      // for range tombstones
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_data_miss;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
+      }
+      break;
+  }
+}
+
+void BlockBasedTable::UpdateCacheInsertionMetrics(
+    BlockType block_type, GetContext* get_context, size_t usage, bool redundant,
+    Statistics* const statistics) {
+  // TODO: introduce perf counters for block cache insertions
+  if (get_context) {
+    ++get_context->get_context_stats_.num_cache_add;
+    if (redundant) {
+      ++get_context->get_context_stats_.num_cache_add_redundant;
+    }
+    get_context->get_context_stats_.num_cache_bytes_write += usage;
+  } else {
+    RecordTick(statistics, BLOCK_CACHE_ADD);
+    if (redundant) {
+      RecordTick(statistics, BLOCK_CACHE_ADD_REDUNDANT);
+    }
+    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
+  }
+
+  switch (block_type) {
+    case BlockType::kFilter:
+    case BlockType::kFilterPartitionIndex:
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_filter_add;
+        if (redundant) {
+          ++get_context->get_context_stats_.num_cache_filter_add_redundant;
+        }
+        get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
+        if (redundant) {
+          RecordTick(statistics, BLOCK_CACHE_FILTER_ADD_REDUNDANT);
+        }
+        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
+      }
+      break;
+
+    case BlockType::kCompressionDictionary:
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_compression_dict_add;
+        if (redundant) {
+          ++get_context->get_context_stats_
+                .num_cache_compression_dict_add_redundant;
+        }
+        get_context->get_context_stats_
+            .num_cache_compression_dict_bytes_insert += usage;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+        if (redundant) {
+          RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT);
+        }
+        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
+                   usage);
+      }
+      break;
+
+    case BlockType::kIndex:
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_index_add;
+        if (redundant) {
+          ++get_context->get_context_stats_.num_cache_index_add_redundant;
+        }
+        get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
+        if (redundant) {
+          RecordTick(statistics, BLOCK_CACHE_INDEX_ADD_REDUNDANT);
+        }
+        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
+      }
+      break;
+
+    default:
+      // TODO: introduce dedicated tickers/statistics/counters
+      // for range tombstones
+      if (get_context) {
+        ++get_context->get_context_stats_.num_cache_data_add;
+        if (redundant) {
+          ++get_context->get_context_stats_.num_cache_data_add_redundant;
+        }
+        get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
+        if (redundant) {
+          RecordTick(statistics, BLOCK_CACHE_DATA_ADD_REDUNDANT);
+        }
+        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
+      }
+      break;
+  }
+}
+
+Cache::Handle* BlockBasedTable::GetEntryFromCache(
+    const CacheTier& cache_tier, Cache* block_cache, const Slice& key,
+    BlockType block_type, const bool wait, GetContext* get_context,
+    const Cache::CacheItemHelper* cache_helper,
+    const Cache::CreateCallback& create_cb, Cache::Priority priority) const {
+  Cache::Handle* cache_handle = nullptr;
+  if (cache_tier == CacheTier::kNonVolatileBlockTier) {
+    cache_handle = block_cache->Lookup(key, cache_helper, create_cb, priority,
+                                       wait, rep_->ioptions.statistics.get());
+  } else {
+    cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics.get());
+  }
+
+  // Avoid updating metrics here if the handle is not complete yet. This
+  // happens with MultiGet and secondary cache. So update the metrics only
+  // if its a miss, or a hit and value is ready
+  if (!cache_handle || block_cache->Value(cache_handle)) {
+    if (cache_handle != nullptr) {
+      UpdateCacheHitMetrics(block_type, get_context,
+                            block_cache->GetUsage(cache_handle));
+    } else {
+      UpdateCacheMissMetrics(block_type, get_context);
+    }
+  }
+
+  return cache_handle;
+}
+
+template <typename TBlocklike>
+Status BlockBasedTable::InsertEntryToCache(
+    const CacheTier& cache_tier, Cache* block_cache, const Slice& key,
+    const Cache::CacheItemHelper* cache_helper,
+    std::unique_ptr<TBlocklike>&& block_holder, size_t charge,
+    Cache::Handle** cache_handle, Cache::Priority priority) const {
+  Status s = Status::OK();
+  if (cache_tier == CacheTier::kNonVolatileBlockTier) {
+    s = block_cache->Insert(key, block_holder.get(), cache_helper, charge,
+                            cache_handle, priority);
+  } else {
+    s = block_cache->Insert(key, block_holder.get(), charge,
+                            cache_helper->del_cb, cache_handle, priority);
+  }
+  if (s.ok()) {
+    // Cache took ownership
+    block_holder.release();
+  }
+  s.MustCheck();
+  return s;
+}
+
+namespace {
+// Return True if table_properties has `user_prop_name` has a `true` value
+// or it doesn't contain this property (for backward compatible).
+bool IsFeatureSupported(const TableProperties& table_properties,
+                        const std::string& user_prop_name, Logger* info_log) {
+  auto& props = table_properties.user_collected_properties;
+  auto pos = props.find(user_prop_name);
+  // Older version doesn't have this value set. Skip this check.
+  if (pos != props.end()) {
+    if (pos->second == kPropFalse) {
+      return false;
+    } else if (pos->second != kPropTrue) {
+      ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s",
+                     user_prop_name.c_str(), pos->second.c_str());
+    }
+  }
+  return true;
+}
+
+// Caller has to ensure seqno is not nullptr.
+Status GetGlobalSequenceNumber(const TableProperties& table_properties,
+                               SequenceNumber largest_seqno,
+                               SequenceNumber* seqno) {
+  const auto& props = table_properties.user_collected_properties;
+  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
+  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+
+  *seqno = kDisableGlobalSequenceNumber;
+  if (version_pos == props.end()) {
+    if (seqno_pos != props.end()) {
+      std::array<char, 200> msg_buf;
+      // This is not an external sst file, global_seqno is not supported.
+      snprintf(
+          msg_buf.data(), msg_buf.max_size(),
+          "A non-external sst file have global seqno property with value %s",
+          seqno_pos->second.c_str());
+      return Status::Corruption(msg_buf.data());
+    }
+    return Status::OK();
+  }
+
+  uint32_t version = DecodeFixed32(version_pos->second.c_str());
+  if (version < 2) {
+    if (seqno_pos != props.end() || version != 1) {
+      std::array<char, 200> msg_buf;
+      // This is a v1 external sst file, global_seqno is not supported.
+      snprintf(msg_buf.data(), msg_buf.max_size(),
+               "An external sst file with version %u have global seqno "
+               "property with value %s",
+               version, seqno_pos->second.c_str());
+      return Status::Corruption(msg_buf.data());
+    }
+    return Status::OK();
+  }
+
+  // Since we have a plan to deprecate global_seqno, we do not return failure
+  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
+  // SST is external.
+  SequenceNumber global_seqno(0);
+  if (seqno_pos != props.end()) {
+    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
+  }
+  // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno
+  // to denote it is unknown.
+  if (largest_seqno < kMaxSequenceNumber) {
+    if (global_seqno == 0) {
+      global_seqno = largest_seqno;
+    }
+    if (global_seqno != largest_seqno) {
+      std::array<char, 200> msg_buf;
+      snprintf(
+          msg_buf.data(), msg_buf.max_size(),
+          "An external sst file with version %u have global seqno property "
+          "with value %s, while largest seqno in the file is %llu",
+          version, seqno_pos->second.c_str(),
+          static_cast<unsigned long long>(largest_seqno));
+      return Status::Corruption(msg_buf.data());
+    }
+  }
+  *seqno = global_seqno;
+
+  if (global_seqno > kMaxSequenceNumber) {
+    std::array<char, 200> msg_buf;
+    snprintf(msg_buf.data(), msg_buf.max_size(),
+             "An external sst file with version %u have global seqno property "
+             "with value %llu, which is greater than kMaxSequenceNumber",
+             version, static_cast<unsigned long long>(global_seqno));
+    return Status::Corruption(msg_buf.data());
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties,
+                                        const std::string& cur_db_session_id,
+                                        uint64_t cur_file_number,
+                                        OffsetableCacheKey* out_base_cache_key,
+                                        bool* out_is_stable) {
+  // Use a stable cache key if sufficient data is in table properties
+  std::string db_session_id;
+  uint64_t file_num;
+  std::string db_id;
+  if (properties && !properties->db_session_id.empty() &&
+      properties->orig_file_number > 0) {
+    // (Newer SST file case)
+    // We must have both properties to get a stable unique id because
+    // CreateColumnFamilyWithImport or IngestExternalFiles can change the
+    // file numbers on a file.
+    db_session_id = properties->db_session_id;
+    file_num = properties->orig_file_number;
+    // Less critical, populated in earlier release than above
+    db_id = properties->db_id;
+    if (out_is_stable) {
+      *out_is_stable = true;
+    }
+  } else {
+    // (Old SST file case)
+    // We use (unique) cache keys based on current identifiers. These are at
+    // least stable across table file close and re-open, but not across
+    // different DBs nor DB close and re-open.
+    db_session_id = cur_db_session_id;
+    file_num = cur_file_number;
+    // Plumbing through the DB ID to here would be annoying, and of limited
+    // value because of the case of VersionSet::Recover opening some table
+    // files and later setting the DB ID. So we just rely on uniqueness
+    // level provided by session ID.
+    db_id = "unknown";
+    if (out_is_stable) {
+      *out_is_stable = false;
+    }
+  }
+
+  // Too many tests to update to get these working
+  // assert(file_num > 0);
+  // assert(!db_session_id.empty());
+  // assert(!db_id.empty());
+
+  // Minimum block size is 5 bytes; therefore we can trim off two lower bits
+  // from offsets. See GetCacheKey.
+  *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num);
+}
+
+CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key,
+                                      const BlockHandle& handle) {
+  // Minimum block size is 5 bytes; therefore we can trim off two lower bits
+  // from offet.
+  return base_cache_key.WithOffset(handle.offset() >> 2);
+}
+
+Status BlockBasedTable::Open(
+    const ReadOptions& read_options, const ImmutableOptions& ioptions,
+    const EnvOptions& env_options, const BlockBasedTableOptions& table_options,
+    const InternalKeyComparator& internal_comparator,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table_reader,
+    std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr,
+    const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
+    const int level, const bool immortal_table,
+    const SequenceNumber largest_seqno, const bool force_direct_prefetch,
+    TailPrefetchStats* tail_prefetch_stats,
+    BlockCacheTracer* const block_cache_tracer,
+    size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id,
+    uint64_t cur_file_num, UniqueId64x2 expected_unique_id) {
+  table_reader->reset();
+
+  Status s;
+  Footer footer;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
+
+  // From read_options, retain deadline, io_timeout, and rate_limiter_priority.
+  // In future, we may retain more
+  // options. Specifically, we ignore verify_checksums and default to
+  // checksum verification anyway when creating the index and filter
+  // readers.
+  ReadOptions ro;
+  ro.deadline = read_options.deadline;
+  ro.io_timeout = read_options.io_timeout;
+  ro.rate_limiter_priority = read_options.rate_limiter_priority;
+
+  // prefetch both index and filters, down to all partitions
+  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
+  const bool preload_all = !table_options.cache_index_and_filter_blocks;
+
+  if (!ioptions.allow_mmap_reads) {
+    s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch,
+                     tail_prefetch_stats, prefetch_all, preload_all,
+                     &prefetch_buffer);
+    // Return error in prefetch path to users.
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    // Should not prefetch for mmap mode.
+    prefetch_buffer.reset(new FilePrefetchBuffer(
+        0 /* readahead_size */, 0 /* max_readahead_size */, false /* enable */,
+        true /* track_min_offset */));
+  }
+
+  // Read in the following order:
+  //    1. Footer
+  //    2. [metaindex block]
+  //    3. [meta block: properties]
+  //    4. [meta block: range deletion tombstone]
+  //    5. [meta block: compression dictionary]
+  //    6. [meta block: index]
+  //    7. [meta block: filter]
+  IOOptions opts;
+  s = file->PrepareIOOptions(ro, opts);
+  if (s.ok()) {
+    s = ReadFooterFromFile(opts, file.get(), prefetch_buffer.get(), file_size,
+                           &footer, kBlockBasedTableMagicNumber);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  if (!IsSupportedFormatVersion(footer.format_version())) {
+    return Status::Corruption(
+        "Unknown Footer version. Maybe this file was created with newer "
+        "version of RocksDB?");
+  }
+
+  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
+  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
+                                      internal_comparator, skip_filters,
+                                      file_size, level, immortal_table);
+  rep->file = std::move(file);
+  rep->footer = footer;
+
+  // For fully portable/stable cache keys, we need to read the properties
+  // block before setting up cache keys. TODO: consider setting up a bootstrap
+  // cache key for PersistentCache to use for metaindex and properties blocks.
+  rep->persistent_cache_options = PersistentCacheOptions();
+
+  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
+  // handle to null, otherwise it may be seen as uninitialized during the below
+  // meta-block reads.
+  rep->compression_dict_handle = BlockHandle::NullBlockHandle();
+
+  // Read metaindex
+  std::unique_ptr<BlockBasedTable> new_table(
+      new BlockBasedTable(rep, block_cache_tracer));
+  std::unique_ptr<Block> metaindex;
+  std::unique_ptr<InternalIterator> metaindex_iter;
+  s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex,
+                                    &metaindex_iter);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Populates table_properties and some fields that depend on it,
+  // such as index_type.
+  s = new_table->ReadPropertiesBlock(ro, prefetch_buffer.get(),
+                                     metaindex_iter.get(), largest_seqno);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Check expected unique id if provided
+  if (expected_unique_id != kNullUniqueId64x2) {
+    auto props = rep->table_properties;
+    if (!props) {
+      return Status::Corruption("Missing table properties on file " +
+                                std::to_string(cur_file_num) +
+                                " with known unique ID");
+    }
+    UniqueId64x2 actual_unique_id{};
+    s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
+                               props->orig_file_number, &actual_unique_id,
+                               /*force*/ true);
+    assert(s.ok());  // because force=true
+    if (expected_unique_id != actual_unique_id) {
+      return Status::Corruption(
+          "Mismatch in unique ID on table file " +
+          std::to_string(cur_file_num) +
+          ". Expected: " + InternalUniqueIdToHumanString(&expected_unique_id) +
+          " Actual: " + InternalUniqueIdToHumanString(&actual_unique_id));
+    }
+    TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::PassedVerifyUniqueId",
+                             &actual_unique_id);
+  } else {
+    TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::SkippedVerifyUniqueId",
+                             nullptr);
+    if (ioptions.verify_sst_unique_id_in_manifest && ioptions.logger) {
+      // A crude but isolated way of reporting unverified files. This should not
+      // be an ongoing concern so doesn't deserve a place in Statistics IMHO.
+      static std::atomic<uint64_t> unverified_count{0};
+      auto prev_count =
+          unverified_count.fetch_add(1, std::memory_order_relaxed);
+      if (prev_count == 0) {
+        ROCKS_LOG_WARN(
+            ioptions.logger,
+            "At least one SST file opened without unique ID to verify: %" PRIu64
+            ".sst",
+            cur_file_num);
+      } else if (prev_count % 1000 == 0) {
+        ROCKS_LOG_WARN(
+            ioptions.logger,
+            "Another ~1000 SST files opened without unique ID to verify");
+      }
+    }
+  }
+
+  // Set up prefix extracto as needed
+  bool force_null_table_prefix_extractor = false;
+  TEST_SYNC_POINT_CALLBACK(
+      "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+      &force_null_table_prefix_extractor);
+  if (force_null_table_prefix_extractor) {
+    assert(!rep->table_prefix_extractor);
+  } else if (!PrefixExtractorChangedHelper(rep->table_properties.get(),
+                                           prefix_extractor.get())) {
+    // Establish fast path for unchanged prefix_extractor
+    rep->table_prefix_extractor = prefix_extractor;
+  } else {
+    // Current prefix_extractor doesn't match table
+#ifndef ROCKSDB_LITE
+    if (rep->table_properties) {
+      //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions
+      // will need to use it
+      ConfigOptions config_options;
+      Status st = SliceTransform::CreateFromString(
+          config_options, rep->table_properties->prefix_extractor_name,
+          &(rep->table_prefix_extractor));
+      if (!st.ok()) {
+        //**TODO: Should this be error be returned or swallowed?
+        ROCKS_LOG_ERROR(rep->ioptions.logger,
+                        "Failed to create prefix extractor[%s]: %s",
+                        rep->table_properties->prefix_extractor_name.c_str(),
+                        st.ToString().c_str());
+      }
+    }
+#endif  // ROCKSDB_LITE
+  }
+
+  // With properties loaded, we can set up portable/stable cache keys
+  SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id,
+                    cur_file_num, &rep->base_cache_key);
+
+  rep->persistent_cache_options =
+      PersistentCacheOptions(rep->table_options.persistent_cache,
+                             rep->base_cache_key, rep->ioptions.stats);
+
+  s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(),
+                                   metaindex_iter.get(), internal_comparator,
+                                   &lookup_context);
+  if (!s.ok()) {
+    return s;
+  }
+  s = new_table->PrefetchIndexAndFilterBlocks(
+      ro, prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
+      prefetch_all, table_options, level, file_size,
+      max_file_size_for_l0_meta_pin, &lookup_context);
+
+  if (s.ok()) {
+    // Update tail prefetch stats
+    assert(prefetch_buffer.get() != nullptr);
+    if (tail_prefetch_stats != nullptr) {
+      assert(prefetch_buffer->min_offset_read() < file_size);
+      tail_prefetch_stats->RecordEffectiveSize(
+          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
+    }
+  }
+
+  if (s.ok() && table_reader_cache_res_mgr) {
+    std::size_t mem_usage = new_table->ApproximateMemoryUsage();
+    s = table_reader_cache_res_mgr->MakeCacheReservation(
+        mem_usage, &(rep->table_reader_cache_res_handle));
+    if (s.IsMemoryLimit()) {
+      s = Status::MemoryLimit(
+          "Can't allocate " +
+          kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+              CacheEntryRole::kBlockBasedTableReader)] +
+          " due to memory limit based on "
+          "cache capacity for memory allocation");
+    }
+  }
+
+  if (s.ok()) {
+    *table_reader = std::move(new_table);
+  }
+  return s;
+}
+
+Status BlockBasedTable::PrefetchTail(
+    const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
+    bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
+    const bool prefetch_all, const bool preload_all,
+    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
+  size_t tail_prefetch_size = 0;
+  if (tail_prefetch_stats != nullptr) {
+    // Multiple threads may get a 0 (no history) when running in parallel,
+    // but it will get cleared after the first of them finishes.
+    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
+  }
+  if (tail_prefetch_size == 0) {
+    // Before read footer, readahead backwards to prefetch data. Do more
+    // readahead if we're going to read index/filter.
+    // TODO: This may incorrectly select small readahead in case partitioned
+    // index/filter is enabled and top-level partition pinning is enabled.
+    // That's because we need to issue readahead before we read the properties,
+    // at which point we don't yet know the index type.
+    tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
+  }
+  size_t prefetch_off;
+  size_t prefetch_len;
+  if (file_size < tail_prefetch_size) {
+    prefetch_off = 0;
+    prefetch_len = static_cast<size_t>(file_size);
+  } else {
+    prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
+    prefetch_len = tail_prefetch_size;
+  }
+  TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
+                           &tail_prefetch_size);
+
+  // Try file system prefetch
+  if (!file->use_direct_io() && !force_direct_prefetch) {
+    if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
+             .IsNotSupported()) {
+      prefetch_buffer->reset(new FilePrefetchBuffer(
+          0 /* readahead_size */, 0 /* max_readahead_size */,
+          false /* enable */, true /* track_min_offset */));
+      return Status::OK();
+    }
+  }
+
+  // Use `FilePrefetchBuffer`
+  prefetch_buffer->reset(
+      new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */,
+                             true /* enable */, true /* track_min_offset */));
+
+  IOOptions opts;
+  Status s = file->PrepareIOOptions(ro, opts);
+  if (s.ok()) {
+    s = (*prefetch_buffer)
+            ->Prefetch(opts, file, prefetch_off, prefetch_len,
+                       ro.rate_limiter_priority);
+  }
+  return s;
+}
+
+Status BlockBasedTable::ReadPropertiesBlock(
+    const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
+    InternalIterator* meta_iter, const SequenceNumber largest_seqno) {
+  Status s;
+  BlockHandle handle;
+  s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle);
+
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(rep_->ioptions.logger,
+                   "Error when seeking to properties block from file: %s",
+                   s.ToString().c_str());
+  } else if (!handle.IsNull()) {
+    s = meta_iter->status();
+    std::unique_ptr<TableProperties> table_properties;
+    if (s.ok()) {
+      s = ReadTablePropertiesHelper(
+          ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer,
+          rep_->ioptions, &table_properties, nullptr /* memory_allocator */);
+    }
+    IGNORE_STATUS_IF_ERROR(s);
+
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(rep_->ioptions.logger,
+                     "Encountered error while reading data from properties "
+                     "block %s",
+                     s.ToString().c_str());
+    } else {
+      assert(table_properties != nullptr);
+      rep_->table_properties = std::move(table_properties);
+      rep_->blocks_maybe_compressed =
+          rep_->table_properties->compression_name !=
+          CompressionTypeToString(kNoCompression);
+      rep_->blocks_definitely_zstd_compressed =
+          (rep_->table_properties->compression_name ==
+               CompressionTypeToString(kZSTD) ||
+           rep_->table_properties->compression_name ==
+               CompressionTypeToString(kZSTDNotFinalCompression));
+    }
+  } else {
+    ROCKS_LOG_ERROR(rep_->ioptions.logger,
+                    "Cannot find Properties block from file.");
+  }
+
+  // Read the table properties, if provided.
+  if (rep_->table_properties) {
+    rep_->whole_key_filtering &=
+        IsFeatureSupported(*(rep_->table_properties),
+                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
+                           rep_->ioptions.logger);
+    rep_->prefix_filtering &= IsFeatureSupported(
+        *(rep_->table_properties),
+        BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger);
+
+    rep_->index_key_includes_seq =
+        rep_->table_properties->index_key_is_user_key == 0;
+    rep_->index_value_is_full =
+        rep_->table_properties->index_value_is_delta_encoded == 0;
+
+    // Update index_type with the true type.
+    // If table properties don't contain index type, we assume that the table
+    // is in very old format and has kBinarySearch index type.
+    auto& props = rep_->table_properties->user_collected_properties;
+    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+    if (pos != props.end()) {
+      rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
+          DecodeFixed32(pos->second.c_str()));
+    }
+
+    rep_->index_has_first_key =
+        rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
+
+    s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
+                                &(rep_->global_seqno));
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str());
+    }
+  }
+  return s;
+}
+
+Status BlockBasedTable::ReadRangeDelBlock(
+    const ReadOptions& read_options, FilePrefetchBuffer* prefetch_buffer,
+    InternalIterator* meta_iter,
+    const InternalKeyComparator& internal_comparator,
+    BlockCacheLookupContext* lookup_context) {
+  Status s;
+  BlockHandle range_del_handle;
+  s = FindOptionalMetaBlock(meta_iter, kRangeDelBlockName, &range_del_handle);
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(
+        rep_->ioptions.logger,
+        "Error when seeking to range delete tombstones block from file: %s",
+        s.ToString().c_str());
+  } else if (!range_del_handle.IsNull()) {
+    Status tmp_status;
+    std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
+        read_options, range_del_handle,
+        /*input_iter=*/nullptr, BlockType::kRangeDeletion,
+        /*get_context=*/nullptr, lookup_context, prefetch_buffer,
+        /*for_compaction= */ false, /*async_read= */ false, tmp_status));
+    assert(iter != nullptr);
+    s = iter->status();
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(
+          rep_->ioptions.logger,
+          "Encountered error while reading data from range del block %s",
+          s.ToString().c_str());
+      IGNORE_STATUS_IF_ERROR(s);
+    } else {
+      rep_->fragmented_range_dels =
+          std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
+                                                         internal_comparator);
+    }
+  }
+  return s;
+}
+
+Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
+    const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
+    InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all,
+    const BlockBasedTableOptions& table_options, const int level,
+    size_t file_size, size_t max_file_size_for_l0_meta_pin,
+    BlockCacheLookupContext* lookup_context) {
+  // Find filter handle and filter type
+  if (rep_->filter_policy) {
+    auto name = rep_->filter_policy->CompatibilityName();
+    bool builtin_compatible =
+        strcmp(name, BuiltinFilterPolicy::kCompatibilityName()) == 0;
+
+    for (const auto& [filter_type, prefix] :
+         {std::make_pair(Rep::FilterType::kFullFilter, kFullFilterBlockPrefix),
+          std::make_pair(Rep::FilterType::kPartitionedFilter,
+                         kPartitionedFilterBlockPrefix),
+          std::make_pair(Rep::FilterType::kNoFilter,
+                         kObsoleteFilterBlockPrefix)}) {
+      if (builtin_compatible) {
+        // This code is only here to deal with a hiccup in early 7.0.x where
+        // there was an unintentional name change in the SST files metadata.
+        // It should be OK to remove this in the future (late 2022) and just
+        // have the 'else' code.
+        // NOTE: the test:: names below are likely not needed but included
+        // out of caution
+        static const std::unordered_set<std::string> kBuiltinNameAndAliases = {
+            BuiltinFilterPolicy::kCompatibilityName(),
+            test::LegacyBloomFilterPolicy::kClassName(),
+            test::FastLocalBloomFilterPolicy::kClassName(),
+            test::Standard128RibbonFilterPolicy::kClassName(),
+            "rocksdb.internal.DeprecatedBlockBasedBloomFilter",
+            BloomFilterPolicy::kClassName(),
+            RibbonFilterPolicy::kClassName(),
+        };
+
+        // For efficiency, do a prefix seek and see if the first match is
+        // good.
+        meta_iter->Seek(prefix);
+        if (meta_iter->status().ok() && meta_iter->Valid()) {
+          Slice key = meta_iter->key();
+          if (key.starts_with(prefix)) {
+            key.remove_prefix(prefix.size());
+            if (kBuiltinNameAndAliases.find(key.ToString()) !=
+                kBuiltinNameAndAliases.end()) {
+              Slice v = meta_iter->value();
+              Status s = rep_->filter_handle.DecodeFrom(&v);
+              if (s.ok()) {
+                rep_->filter_type = filter_type;
+                if (filter_type == Rep::FilterType::kNoFilter) {
+                  ROCKS_LOG_WARN(rep_->ioptions.logger,
+                                 "Detected obsolete filter type in %s. Read "
+                                 "performance might suffer until DB is fully "
+                                 "re-compacted.",
+                                 rep_->file->file_name().c_str());
+                }
+                break;
+              }
+            }
+          }
+        }
+      } else {
+        std::string filter_block_key = prefix + name;
+        if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
+                .ok()) {
+          rep_->filter_type = filter_type;
+          if (filter_type == Rep::FilterType::kNoFilter) {
+            ROCKS_LOG_WARN(
+                rep_->ioptions.logger,
+                "Detected obsolete filter type in %s. Read performance might "
+                "suffer until DB is fully re-compacted.",
+                rep_->file->file_name().c_str());
+          }
+          break;
+        }
+      }
+    }
+  }
+  // Partition filters cannot be enabled without partition indexes
+  assert(rep_->filter_type != Rep::FilterType::kPartitionedFilter ||
+         rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+
+  // Find compression dictionary handle
+  Status s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName,
+                                   &rep_->compression_dict_handle);
+  if (!s.ok()) {
+    return s;
+  }
+
+  BlockBasedTableOptions::IndexType index_type = rep_->index_type;
+
+  const bool use_cache = table_options.cache_index_and_filter_blocks;
+
+  const bool maybe_flushed =
+      level == 0 && file_size <= max_file_size_for_l0_meta_pin;
+  std::function<bool(PinningTier, PinningTier)> is_pinned =
+      [maybe_flushed, &is_pinned](PinningTier pinning_tier,
+                                  PinningTier fallback_pinning_tier) {
+        // Fallback to fallback would lead to infinite recursion. Disallow it.
+        assert(fallback_pinning_tier != PinningTier::kFallback);
+
+        switch (pinning_tier) {
+          case PinningTier::kFallback:
+            return is_pinned(fallback_pinning_tier,
+                             PinningTier::kNone /* fallback_pinning_tier */);
+          case PinningTier::kNone:
+            return false;
+          case PinningTier::kFlushedAndSimilar:
+            return maybe_flushed;
+          case PinningTier::kAll:
+            return true;
+        };
+
+        // In GCC, this is needed to suppress `control reaches end of non-void
+        // function [-Werror=return-type]`.
+        assert(false);
+        return false;
+      };
+  const bool pin_top_level_index = is_pinned(
+      table_options.metadata_cache_options.top_level_index_pinning,
+      table_options.pin_top_level_index_and_filter ? PinningTier::kAll
+                                                   : PinningTier::kNone);
+  const bool pin_partition =
+      is_pinned(table_options.metadata_cache_options.partition_pinning,
+                table_options.pin_l0_filter_and_index_blocks_in_cache
+                    ? PinningTier::kFlushedAndSimilar
+                    : PinningTier::kNone);
+  const bool pin_unpartitioned =
+      is_pinned(table_options.metadata_cache_options.unpartitioned_pinning,
+                table_options.pin_l0_filter_and_index_blocks_in_cache
+                    ? PinningTier::kFlushedAndSimilar
+                    : PinningTier::kNone);
+
+  // pin the first level of index
+  const bool pin_index =
+      index_type == BlockBasedTableOptions::kTwoLevelIndexSearch
+          ? pin_top_level_index
+          : pin_unpartitioned;
+  // prefetch the first level of index
+  // WART: this might be redundant (unnecessary cache hit) if !pin_index,
+  // depending on prepopulate_block_cache option
+  const bool prefetch_index = prefetch_all || pin_index;
+
+  std::unique_ptr<IndexReader> index_reader;
+  s = new_table->CreateIndexReader(ro, prefetch_buffer, meta_iter, use_cache,
+                                   prefetch_index, pin_index, lookup_context,
+                                   &index_reader);
+  if (!s.ok()) {
+    return s;
+  }
+
+  rep_->index_reader = std::move(index_reader);
+
+  // The partitions of partitioned index are always stored in cache. They
+  // are hence follow the configuration for pin and prefetch regardless of
+  // the value of cache_index_and_filter_blocks
+  if (prefetch_all || pin_partition) {
+    s = rep_->index_reader->CacheDependencies(ro, pin_partition);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  // pin the first level of filter
+  const bool pin_filter =
+      rep_->filter_type == Rep::FilterType::kPartitionedFilter
+          ? pin_top_level_index
+          : pin_unpartitioned;
+  // prefetch the first level of filter
+  // WART: this might be redundant (unnecessary cache hit) if !pin_filter,
+  // depending on prepopulate_block_cache option
+  const bool prefetch_filter = prefetch_all || pin_filter;
+
+  if (rep_->filter_policy) {
+    auto filter = new_table->CreateFilterBlockReader(
+        ro, prefetch_buffer, use_cache, prefetch_filter, pin_filter,
+        lookup_context);
+
+    if (filter) {
+      // Refer to the comment above about paritioned indexes always being cached
+      if (prefetch_all || pin_partition) {
+        s = filter->CacheDependencies(ro, pin_partition);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+      rep_->filter = std::move(filter);
+    }
+  }
+
+  if (!rep_->compression_dict_handle.IsNull()) {
+    std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
+    s = UncompressionDictReader::Create(
+        this, ro, prefetch_buffer, use_cache, prefetch_all || pin_unpartitioned,
+        pin_unpartitioned, lookup_context, &uncompression_dict_reader);
+    if (!s.ok()) {
+      return s;
+    }
+
+    rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
+  }
+
+  assert(s.ok());
+  return s;
+}
+
+void BlockBasedTable::SetupForCompaction() {
+  switch (rep_->ioptions.access_hint_on_compaction_start) {
+    case Options::NONE:
+      break;
+    case Options::NORMAL:
+      rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
+      break;
+    case Options::SEQUENTIAL:
+      rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
+      break;
+    case Options::WILLNEED:
+      rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
+      break;
+    default:
+      assert(false);
+  }
+}
+
+std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
+    const {
+  return rep_->table_properties;
+}
+
+size_t BlockBasedTable::ApproximateMemoryUsage() const {
+  size_t usage = 0;
+  if (rep_) {
+    usage += rep_->ApproximateMemoryUsage();
+  } else {
+    return usage;
+  }
+  if (rep_->filter) {
+    usage += rep_->filter->ApproximateMemoryUsage();
+  }
+  if (rep_->index_reader) {
+    usage += rep_->index_reader->ApproximateMemoryUsage();
+  }
+  if (rep_->uncompression_dict_reader) {
+    usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
+  }
+  if (rep_->table_properties) {
+    usage += rep_->table_properties->ApproximateMemoryUsage();
+  }
+  return usage;
+}
+
+// Load the meta-index-block from the file. On success, return the loaded
+// metaindex
+// block and its iterator.
+Status BlockBasedTable::ReadMetaIndexBlock(
+    const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
+    std::unique_ptr<Block>* metaindex_block,
+    std::unique_ptr<InternalIterator>* iter) {
+  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
+  // it is an empty block.
+  std::unique_ptr<Block> metaindex;
+  Status s = ReadBlockFromFile(
+      rep_->file.get(), prefetch_buffer, rep_->footer, ro,
+      rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
+      true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
+      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
+      0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options),
+      false /* for_compaction */, rep_->blocks_definitely_zstd_compressed,
+      nullptr /* filter_policy */, false /* async_read */);
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(rep_->ioptions.logger,
+                    "Encountered error while reading data from properties"
+                    " block %s",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  *metaindex_block = std::move(metaindex);
+  // meta block uses bytewise comparator.
+  iter->reset(metaindex_block->get()->NewMetaIterator());
+  return Status::OK();
+}
+
+template <typename TBlocklike>
+Status BlockBasedTable::GetDataBlockFromCache(
+    const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed,
+    const ReadOptions& read_options,
+    CachableEntry<TBlocklike>* out_parsed_block,
+    const UncompressionDict& uncompression_dict, BlockType block_type,
+    const bool wait, GetContext* get_context) const {
+  const size_t read_amp_bytes_per_bit =
+      block_type == BlockType::kData
+          ? rep_->table_options.read_amp_bytes_per_bit
+          : 0;
+  assert(out_parsed_block);
+  assert(out_parsed_block->IsEmpty());
+  // Here we treat the legacy name "...index_and_filter_blocks..." to mean all
+  // metadata blocks that might go into block cache, EXCEPT only those needed
+  // for the read path (Get, etc.). TableProperties should not be needed on the
+  // read path (prefix extractor setting is an O(1) size special case that we
+  // are working not to require from TableProperties), so it is not given
+  // high-priority treatment if it should go into BlockCache.
+  const Cache::Priority priority =
+      rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
+              block_type != BlockType::kData &&
+              block_type != BlockType::kProperties
+          ? Cache::Priority::HIGH
+          : Cache::Priority::LOW;
+
+  Status s;
+  BlockContents* compressed_block = nullptr;
+  Cache::Handle* block_cache_compressed_handle = nullptr;
+  Statistics* statistics = rep_->ioptions.statistics.get();
+  bool using_zstd = rep_->blocks_definitely_zstd_compressed;
+  const FilterPolicy* filter_policy = rep_->filter_policy;
+  Cache::CreateCallback create_cb = GetCreateCallback<TBlocklike>(
+      read_amp_bytes_per_bit, statistics, using_zstd, filter_policy);
+
+  // Lookup uncompressed cache first
+  if (block_cache != nullptr) {
+    assert(!cache_key.empty());
+    Cache::Handle* cache_handle = nullptr;
+    cache_handle = GetEntryFromCache(
+        rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key,
+        block_type, wait, get_context,
+        BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type), create_cb,
+        priority);
+    if (cache_handle != nullptr) {
+      out_parsed_block->SetCachedValue(
+          reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
+          block_cache, cache_handle);
+      return s;
+    }
+  }
+
+  // If not found, search from the compressed block cache.
+  assert(out_parsed_block->IsEmpty());
+
+  if (block_cache_compressed == nullptr) {
+    return s;
+  }
+
+  assert(!cache_key.empty());
+  BlockContents contents;
+  block_cache_compressed_handle =
+      block_cache_compressed->Lookup(cache_key, statistics);
+
+  // if we found in the compressed cache, then uncompress and insert into
+  // uncompressed cache
+  if (block_cache_compressed_handle == nullptr) {
+    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
+    return s;
+  }
+
+  // found compressed block
+  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
+  compressed_block = reinterpret_cast<BlockContents*>(
+      block_cache_compressed->Value(block_cache_compressed_handle));
+  CompressionType compression_type = GetBlockCompressionType(*compressed_block);
+  assert(compression_type != kNoCompression);
+
+  // Retrieve the uncompressed contents into a new buffer
+  UncompressionContext context(compression_type);
+  UncompressionInfo info(context, uncompression_dict, compression_type);
+  s = UncompressSerializedBlock(
+      info, compressed_block->data.data(), compressed_block->data.size(),
+      &contents, rep_->table_options.format_version, rep_->ioptions,
+      GetMemoryAllocator(rep_->table_options));
+
+  // Insert parsed block into block cache, the priority is based on the
+  // data block type.
+  if (s.ok()) {
+    std::unique_ptr<TBlocklike> block_holder(
+        BlocklikeTraits<TBlocklike>::Create(
+            std::move(contents), read_amp_bytes_per_bit, statistics,
+            rep_->blocks_definitely_zstd_compressed,
+            rep_->table_options.filter_policy.get()));
+
+    if (block_cache != nullptr && block_holder->own_bytes() &&
+        read_options.fill_cache) {
+      size_t charge = block_holder->ApproximateMemoryUsage();
+      Cache::Handle* cache_handle = nullptr;
+      auto block_holder_raw_ptr = block_holder.get();
+      s = InsertEntryToCache(
+          rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key,
+          BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type),
+          std::move(block_holder), charge, &cache_handle, priority);
+      if (s.ok()) {
+        assert(cache_handle != nullptr);
+        out_parsed_block->SetCachedValue(block_holder_raw_ptr, block_cache,
+                                         cache_handle);
+
+        UpdateCacheInsertionMetrics(block_type, get_context, charge,
+                                    s.IsOkOverwritten(), rep_->ioptions.stats);
+      } else {
+        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
+      }
+    } else {
+      out_parsed_block->SetOwnedValue(std::move(block_holder));
+    }
+  }
+
+  // Release hold on compressed cache entry
+  block_cache_compressed->Release(block_cache_compressed_handle);
+  return s;
+}
+
+template <typename TBlocklike>
+Status BlockBasedTable::PutDataBlockToCache(
+    const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed,
+    CachableEntry<TBlocklike>* out_parsed_block, BlockContents&& block_contents,
+    CompressionType block_comp_type,
+    const UncompressionDict& uncompression_dict,
+    MemoryAllocator* memory_allocator, BlockType block_type,
+    GetContext* get_context) const {
+  const ImmutableOptions& ioptions = rep_->ioptions;
+  const uint32_t format_version = rep_->table_options.format_version;
+  const size_t read_amp_bytes_per_bit =
+      block_type == BlockType::kData
+          ? rep_->table_options.read_amp_bytes_per_bit
+          : 0;
+  const Cache::Priority priority =
+      rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
+              block_type != BlockType::kData
+          ? Cache::Priority::HIGH
+          : Cache::Priority::LOW;
+  assert(out_parsed_block);
+  assert(out_parsed_block->IsEmpty());
+
+  Status s;
+  Statistics* statistics = ioptions.stats;
+
+  std::unique_ptr<TBlocklike> block_holder;
+  if (block_comp_type != kNoCompression) {
+    // Retrieve the uncompressed contents into a new buffer
+    BlockContents uncompressed_block_contents;
+    UncompressionContext context(block_comp_type);
+    UncompressionInfo info(context, uncompression_dict, block_comp_type);
+    s = UncompressBlockData(info, block_contents.data.data(),
+                            block_contents.data.size(),
+                            &uncompressed_block_contents, format_version,
+                            ioptions, memory_allocator);
+    if (!s.ok()) {
+      return s;
+    }
+
+    block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
+        std::move(uncompressed_block_contents), read_amp_bytes_per_bit,
+        statistics, rep_->blocks_definitely_zstd_compressed,
+        rep_->table_options.filter_policy.get()));
+  } else {
+    block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
+        std::move(block_contents), read_amp_bytes_per_bit, statistics,
+        rep_->blocks_definitely_zstd_compressed,
+        rep_->table_options.filter_policy.get()));
+  }
+
+  // Insert compressed block into compressed block cache.
+  // Release the hold on the compressed cache entry immediately.
+  if (block_cache_compressed != nullptr && block_comp_type != kNoCompression &&
+      block_contents.own_bytes()) {
+    assert(block_contents.has_trailer);
+    assert(!cache_key.empty());
+
+    // We cannot directly put block_contents because this could point to
+    // an object in the stack.
+    auto block_cont_for_comp_cache =
+        std::make_unique<BlockContents>(std::move(block_contents));
+    size_t charge = block_cont_for_comp_cache->ApproximateMemoryUsage();
+
+    s = block_cache_compressed->Insert(
+        cache_key, block_cont_for_comp_cache.get(), charge,
+        &DeleteCacheEntry<BlockContents>, nullptr /*handle*/,
+        Cache::Priority::LOW);
+
+    if (s.ok()) {
+      // Cache took ownership
+      block_cont_for_comp_cache.release();
+      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
+    } else {
+      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+    }
+  }
+
+  // insert into uncompressed block cache
+  if (block_cache != nullptr && block_holder->own_bytes()) {
+    size_t charge = block_holder->ApproximateMemoryUsage();
+    auto block_holder_raw_ptr = block_holder.get();
+    Cache::Handle* cache_handle = nullptr;
+    s = InsertEntryToCache(
+        rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key,
+        BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type),
+        std::move(block_holder), charge, &cache_handle, priority);
+    if (s.ok()) {
+      assert(cache_handle != nullptr);
+      out_parsed_block->SetCachedValue(block_holder_raw_ptr, block_cache,
+                                       cache_handle);
+
+      UpdateCacheInsertionMetrics(block_type, get_context, charge,
+                                  s.IsOkOverwritten(), rep_->ioptions.stats);
+    } else {
+      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
+    }
+  } else {
+    out_parsed_block->SetOwnedValue(std::move(block_holder));
+  }
+
+  return s;
+}
+
+std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
+    const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, bool use_cache,
+    bool prefetch, bool pin, BlockCacheLookupContext* lookup_context) {
+  auto& rep = rep_;
+  auto filter_type = rep->filter_type;
+  if (filter_type == Rep::FilterType::kNoFilter) {
+    return std::unique_ptr<FilterBlockReader>();
+  }
+
+  assert(rep->filter_policy);
+
+  switch (filter_type) {
+    case Rep::FilterType::kPartitionedFilter:
+      return PartitionedFilterBlockReader::Create(
+          this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
+
+    case Rep::FilterType::kFullFilter:
+      return FullFilterBlockReader::Create(this, ro, prefetch_buffer, use_cache,
+                                           prefetch, pin, lookup_context);
+
+    default:
+      // filter_type is either kNoFilter (exited the function at the first if),
+      // or it must be covered in this switch block
+      assert(false);
+      return std::unique_ptr<FilterBlockReader>();
+  }
+}
+
+// disable_prefix_seek should be set to true when prefix_extractor found in SST
+// differs from the one in mutable_cf_options and index type is HashBasedIndex
+InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
+    const ReadOptions& read_options, bool disable_prefix_seek,
+    IndexBlockIter* input_iter, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context) const {
+  assert(rep_ != nullptr);
+  assert(rep_->index_reader != nullptr);
+
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
+                                         input_iter, get_context,
+                                         lookup_context);
+}
+
+template <>
+DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
+    const Rep* rep, Block* block, BlockType block_type,
+    DataBlockIter* input_iter, bool block_contents_pinned) {
+  return block->NewDataIterator(rep->internal_comparator.user_comparator(),
+                                rep->get_global_seqno(block_type), input_iter,
+                                rep->ioptions.stats, block_contents_pinned);
+}
+
+template <>
+IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
+    const Rep* rep, Block* block, BlockType block_type,
+    IndexBlockIter* input_iter, bool block_contents_pinned) {
+  return block->NewIndexIterator(
+      rep->internal_comparator.user_comparator(),
+      rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats,
+      /* total_order_seek */ true, rep->index_has_first_key,
+      rep->index_key_includes_seq, rep->index_value_is_full,
+      block_contents_pinned);
+}
+
+// If contents is nullptr, this function looks up the block caches for the
+// data block referenced by handle, and read the block from disk if necessary.
+// If contents is non-null, it skips the cache lookup and disk read, since
+// the caller has already read it. In both cases, if ro.fill_cache is true,
+// it inserts the block into the block cache.
+template <typename TBlocklike>
+Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
+    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+    const bool wait, const bool for_compaction,
+    CachableEntry<TBlocklike>* out_parsed_block, BlockType block_type,
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    BlockContents* contents, bool async_read) const {
+  assert(out_parsed_block != nullptr);
+  const bool no_io = (ro.read_tier == kBlockCacheTier);
+  Cache* block_cache = rep_->table_options.block_cache.get();
+  Cache* block_cache_compressed =
+      rep_->table_options.block_cache_compressed.get();
+
+  // First, try to get the block from the cache
+  //
+  // If either block cache is enabled, we'll try to read from it.
+  Status s;
+  CacheKey key_data;
+  Slice key;
+  bool is_cache_hit = false;
+  if (block_cache != nullptr || block_cache_compressed != nullptr) {
+    // create key for block cache
+    key_data = GetCacheKey(rep_->base_cache_key, handle);
+    key = key_data.AsSlice();
+
+    if (!contents) {
+      s = GetDataBlockFromCache(key, block_cache, block_cache_compressed, ro,
+                                out_parsed_block, uncompression_dict,
+                                block_type, wait, get_context);
+      // Value could still be null at this point, so check the cache handle
+      // and update the read pattern for prefetching
+      if (out_parsed_block->GetValue() || out_parsed_block->GetCacheHandle()) {
+        // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
+        // compressed block cache.
+        is_cache_hit = true;
+        if (prefetch_buffer) {
+          // Update the block details so that PrefetchBuffer can use the read
+          // pattern to determine if reads are sequential or not for
+          // prefetching. It should also take in account blocks read from cache.
+          prefetch_buffer->UpdateReadPattern(
+              handle.offset(), BlockSizeWithTrailer(handle),
+              ro.adaptive_readahead /*decrease_readahead_size*/);
+        }
+      }
+    }
+
+    // Can't find the block from the cache. If I/O is allowed, read from the
+    // file.
+    if (out_parsed_block->GetValue() == nullptr &&
+        out_parsed_block->GetCacheHandle() == nullptr && !no_io &&
+        ro.fill_cache) {
+      Statistics* statistics = rep_->ioptions.stats;
+      const bool maybe_compressed =
+          block_type != BlockType::kFilter &&
+          block_type != BlockType::kCompressionDictionary &&
+          rep_->blocks_maybe_compressed;
+      const bool do_uncompress = maybe_compressed && !block_cache_compressed;
+      CompressionType contents_comp_type;
+      // Maybe serialized or uncompressed
+      BlockContents tmp_contents;
+      if (!contents) {
+        Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS
+                                              : READ_BLOCK_GET_MICROS;
+        StopWatch sw(rep_->ioptions.clock, statistics, histogram);
+        BlockFetcher block_fetcher(
+            rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
+            &tmp_contents, rep_->ioptions, do_uncompress, maybe_compressed,
+            block_type, uncompression_dict, rep_->persistent_cache_options,
+            GetMemoryAllocator(rep_->table_options),
+            GetMemoryAllocatorForCompressedBlock(rep_->table_options));
+
+        // If prefetch_buffer is not allocated, it will fallback to synchronous
+        // reading of block contents.
+        if (async_read && prefetch_buffer != nullptr) {
+          s = block_fetcher.ReadAsyncBlockContents();
+          if (!s.ok()) {
+            return s;
+          }
+        } else {
+          s = block_fetcher.ReadBlockContents();
+        }
+
+        contents_comp_type = block_fetcher.get_compression_type();
+        contents = &tmp_contents;
+        if (get_context) {
+          switch (block_type) {
+            case BlockType::kIndex:
+              ++get_context->get_context_stats_.num_index_read;
+              break;
+            case BlockType::kFilter:
+            case BlockType::kFilterPartitionIndex:
+              ++get_context->get_context_stats_.num_filter_read;
+              break;
+            default:
+              break;
+          }
+        }
+      } else {
+        contents_comp_type = GetBlockCompressionType(*contents);
+      }
+
+      if (s.ok()) {
+        // If filling cache is allowed and a cache is configured, try to put the
+        // block to the cache.
+        s = PutDataBlockToCache(
+            key, block_cache, block_cache_compressed, out_parsed_block,
+            std::move(*contents), contents_comp_type, uncompression_dict,
+            GetMemoryAllocator(rep_->table_options), block_type, get_context);
+      }
+    }
+  }
+
+  // Fill lookup_context.
+  if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
+      lookup_context) {
+    size_t usage = 0;
+    uint64_t nkeys = 0;
+    if (out_parsed_block->GetValue()) {
+      // Approximate the number of keys in the block using restarts.
+      nkeys = rep_->table_options.block_restart_interval *
+              BlocklikeTraits<TBlocklike>::GetNumRestarts(
+                  *out_parsed_block->GetValue());
+      usage = out_parsed_block->GetValue()->ApproximateMemoryUsage();
+    }
+    TraceType trace_block_type = TraceType::kTraceMax;
+    switch (block_type) {
+      case BlockType::kData:
+        trace_block_type = TraceType::kBlockTraceDataBlock;
+        break;
+      case BlockType::kFilter:
+      case BlockType::kFilterPartitionIndex:
+        trace_block_type = TraceType::kBlockTraceFilterBlock;
+        break;
+      case BlockType::kCompressionDictionary:
+        trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
+        break;
+      case BlockType::kRangeDeletion:
+        trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
+        break;
+      case BlockType::kIndex:
+        trace_block_type = TraceType::kBlockTraceIndexBlock;
+        break;
+      default:
+        // This cannot happen.
+        assert(false);
+        break;
+    }
+    bool no_insert = no_io || !ro.fill_cache;
+    if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
+            trace_block_type, lookup_context->caller)) {
+      // Defer logging the access to Get() and MultiGet() to trace additional
+      // information, e.g., referenced_key_exist_in_block.
+
+      // Make a copy of the block key here since it will be logged later.
+      lookup_context->FillLookupContext(
+          is_cache_hit, no_insert, trace_block_type,
+          /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
+    } else {
+      // Avoid making copy of block_key and cf_name when constructing the access
+      // record.
+      BlockCacheTraceRecord access_record(
+          rep_->ioptions.clock->NowMicros(),
+          /*block_key=*/"", trace_block_type,
+          /*block_size=*/usage, rep_->cf_id_for_tracing(),
+          /*cf_name=*/"", rep_->level_for_tracing(),
+          rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
+          no_insert, lookup_context->get_id,
+          lookup_context->get_from_user_specified_snapshot,
+          /*referenced_key=*/"");
+      // TODO: Should handle this error?
+      block_cache_tracer_
+          ->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(),
+                             lookup_context->referenced_key)
+          .PermitUncheckedError();
+    }
+  }
+
+  assert(s.ok() || out_parsed_block->GetValue() == nullptr);
+  return s;
+}
+
+template <typename TBlocklike>
+Status BlockBasedTable::RetrieveBlock(
+    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+    CachableEntry<TBlocklike>* out_parsed_block, BlockType block_type,
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    bool for_compaction, bool use_cache, bool wait_for_cache,
+    bool async_read) const {
+  assert(out_parsed_block);
+  assert(out_parsed_block->IsEmpty());
+
+  Status s;
+  if (use_cache) {
+    s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
+                                     uncompression_dict, wait_for_cache,
+                                     for_compaction, out_parsed_block,
+                                     block_type, get_context, lookup_context,
+                                     /*contents=*/nullptr, async_read);
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (out_parsed_block->GetValue() != nullptr ||
+        out_parsed_block->GetCacheHandle() != nullptr) {
+      assert(s.ok());
+      return s;
+    }
+  }
+
+  assert(out_parsed_block->IsEmpty());
+
+  const bool no_io = ro.read_tier == kBlockCacheTier;
+  if (no_io) {
+    return Status::Incomplete("no blocking io");
+  }
+
+  const bool maybe_compressed =
+      block_type != BlockType::kFilter &&
+      block_type != BlockType::kCompressionDictionary &&
+      rep_->blocks_maybe_compressed;
+  const bool do_uncompress = maybe_compressed;
+  std::unique_ptr<TBlocklike> block;
+
+  {
+    Histograms histogram =
+        for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS;
+    StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram);
+    s = ReadBlockFromFile(
+        rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
+        rep_->ioptions, do_uncompress, maybe_compressed, block_type,
+        uncompression_dict, rep_->persistent_cache_options,
+        block_type == BlockType::kData
+            ? rep_->table_options.read_amp_bytes_per_bit
+            : 0,
+        GetMemoryAllocator(rep_->table_options), for_compaction,
+        rep_->blocks_definitely_zstd_compressed,
+        rep_->table_options.filter_policy.get(), async_read);
+
+    if (get_context) {
+      switch (block_type) {
+        case BlockType::kIndex:
+          ++(get_context->get_context_stats_.num_index_read);
+          break;
+        case BlockType::kFilter:
+        case BlockType::kFilterPartitionIndex:
+          ++(get_context->get_context_stats_.num_filter_read);
+          break;
+        default:
+          break;
+      }
+    }
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  out_parsed_block->SetOwnedValue(std::move(block));
+
+  assert(s.ok());
+  return s;
+}
+
+// Explicitly instantiate templates for each "blocklike" type we use.
+// This makes it possible to keep the template definitions in the .cc file.
+template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
+    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+    CachableEntry<ParsedFullFilterBlock>* out_parsed_block,
+    BlockType block_type, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context, bool for_compaction,
+    bool use_cache, bool wait_for_cache, bool async_read) const;
+
+template Status BlockBasedTable::RetrieveBlock<Block>(
+    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+    CachableEntry<Block>* out_parsed_block, BlockType block_type,
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    bool for_compaction, bool use_cache, bool wait_for_cache,
+    bool async_read) const;
+
+template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
+    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+    CachableEntry<UncompressionDict>* out_parsed_block, BlockType block_type,
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    bool for_compaction, bool use_cache, bool wait_for_cache,
+    bool async_read) const;
+
+BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
+    const BlockBasedTable* table,
+    UnorderedMap<uint64_t, CachableEntry<Block>>* block_map)
+    : table_(table), block_map_(block_map) {}
+
+InternalIteratorBase<IndexValue>*
+BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
+    const BlockHandle& handle) {
+  // Return a block iterator on the index partition
+  auto block = block_map_->find(handle.offset());
+  // block_map_ must be exhaustive
+  if (block == block_map_->end()) {
+    assert(false);
+    // Signal problem to caller
+    return nullptr;
+  }
+  const Rep* rep = table_->get_rep();
+  assert(rep);
+
+  Statistics* kNullStats = nullptr;
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  return block->second.GetValue()->NewIndexIterator(
+      rep->internal_comparator.user_comparator(),
+      rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
+      rep->index_has_first_key, rep->index_key_includes_seq,
+      rep->index_value_is_full);
+}
+
+// This will be broken if the user specifies an unusual implementation
+// of Options.comparator, or if the user specifies an unusual
+// definition of prefixes in BlockBasedTableOptions.filter_policy.
+// In particular, we require the following three properties:
+//
+// 1) key.starts_with(prefix(key))
+// 2) Compare(prefix(key), key) <= 0.
+// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
+//
+// If read_options.read_tier == kBlockCacheTier, this method will do no I/O and
+// will return true if the filter block is not in memory and not found in block
+// cache.
+//
+// REQUIRES: this method shouldn't be called while the DB lock is held.
+bool BlockBasedTable::PrefixRangeMayMatch(
+    const Slice& internal_key, const ReadOptions& read_options,
+    const SliceTransform* options_prefix_extractor,
+    const bool need_upper_bound_check,
+    BlockCacheLookupContext* lookup_context) const {
+  if (!rep_->filter_policy) {
+    return true;
+  }
+
+  const SliceTransform* prefix_extractor;
+
+  if (rep_->table_prefix_extractor == nullptr) {
+    if (need_upper_bound_check) {
+      return true;
+    }
+    prefix_extractor = options_prefix_extractor;
+  } else {
+    prefix_extractor = rep_->table_prefix_extractor.get();
+  }
+  auto ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size();
+  auto user_key_without_ts =
+      ExtractUserKeyAndStripTimestamp(internal_key, ts_sz);
+  if (!prefix_extractor->InDomain(user_key_without_ts)) {
+    return true;
+  }
+
+  bool may_match = true;
+
+  FilterBlockReader* const filter = rep_->filter.get();
+  bool filter_checked = false;
+  if (filter != nullptr) {
+    const bool no_io = read_options.read_tier == kBlockCacheTier;
+
+    const Slice* const const_ikey_ptr = &internal_key;
+    may_match = filter->RangeMayExist(
+        read_options.iterate_upper_bound, user_key_without_ts, prefix_extractor,
+        rep_->internal_comparator.user_comparator(), const_ikey_ptr,
+        &filter_checked, need_upper_bound_check, no_io, lookup_context,
+        read_options.rate_limiter_priority);
+  }
+
+  if (filter_checked) {
+    Statistics* statistics = rep_->ioptions.stats;
+    RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
+    if (!may_match) {
+      RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
+    }
+  }
+
+  return may_match;
+}
+
+bool BlockBasedTable::PrefixExtractorChanged(
+    const SliceTransform* prefix_extractor) const {
+  if (prefix_extractor == nullptr) {
+    return true;
+  } else if (prefix_extractor == rep_->table_prefix_extractor.get()) {
+    return false;
+  } else {
+    return PrefixExtractorChangedHelper(rep_->table_properties.get(),
+                                        prefix_extractor);
+  }
+}
+
+InternalIterator* BlockBasedTable::NewIterator(
+    const ReadOptions& read_options, const SliceTransform* prefix_extractor,
+    Arena* arena, bool skip_filters, TableReaderCaller caller,
+    size_t compaction_readahead_size, bool allow_unprepared_value) {
+  BlockCacheLookupContext lookup_context{caller};
+  bool need_upper_bound_check =
+      read_options.auto_prefix_mode || PrefixExtractorChanged(prefix_extractor);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(NewIndexIterator(
+      read_options,
+      /*disable_prefix_seek=*/need_upper_bound_check &&
+          rep_->index_type == BlockBasedTableOptions::kHashSearch,
+      /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context));
+  if (arena == nullptr) {
+    return new BlockBasedTableIterator(
+        this, read_options, rep_->internal_comparator, std::move(index_iter),
+        !skip_filters && !read_options.total_order_seek &&
+            prefix_extractor != nullptr,
+        need_upper_bound_check, prefix_extractor, caller,
+        compaction_readahead_size, allow_unprepared_value);
+  } else {
+    auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
+    return new (mem) BlockBasedTableIterator(
+        this, read_options, rep_->internal_comparator, std::move(index_iter),
+        !skip_filters && !read_options.total_order_seek &&
+            prefix_extractor != nullptr,
+        need_upper_bound_check, prefix_extractor, caller,
+        compaction_readahead_size, allow_unprepared_value);
+  }
+}
+
+FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
+    const ReadOptions& read_options) {
+  if (rep_->fragmented_range_dels == nullptr) {
+    return nullptr;
+  }
+  SequenceNumber snapshot = kMaxSequenceNumber;
+  if (read_options.snapshot != nullptr) {
+    snapshot = read_options.snapshot->GetSequenceNumber();
+  }
+  return new FragmentedRangeTombstoneIterator(rep_->fragmented_range_dels,
+                                              rep_->internal_comparator,
+                                              snapshot, read_options.timestamp);
+}
+
+bool BlockBasedTable::FullFilterKeyMayMatch(
+    FilterBlockReader* filter, const Slice& internal_key, const bool no_io,
+    const SliceTransform* prefix_extractor, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) const {
+  if (filter == nullptr) {
+    return true;
+  }
+  Slice user_key = ExtractUserKey(internal_key);
+  const Slice* const const_ikey_ptr = &internal_key;
+  bool may_match = true;
+  size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size();
+  Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
+  if (rep_->whole_key_filtering) {
+    may_match =
+        filter->KeyMayMatch(user_key_without_ts, no_io, const_ikey_ptr,
+                            get_context, lookup_context, rate_limiter_priority);
+  } else if (!PrefixExtractorChanged(prefix_extractor) &&
+             prefix_extractor->InDomain(user_key_without_ts) &&
+             !filter->PrefixMayMatch(
+                 prefix_extractor->Transform(user_key_without_ts), no_io,
+                 const_ikey_ptr, get_context, lookup_context,
+                 rate_limiter_priority)) {
+    // FIXME ^^^: there should be no reason for Get() to depend on current
+    // prefix_extractor at all. It should always use table_prefix_extractor.
+    may_match = false;
+  }
+  if (may_match) {
+    RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE);
+    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
+  }
+  return may_match;
+}
+
+void BlockBasedTable::FullFilterKeysMayMatch(
+    FilterBlockReader* filter, MultiGetRange* range, const bool no_io,
+    const SliceTransform* prefix_extractor,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) const {
+  if (filter == nullptr) {
+    return;
+  }
+  uint64_t before_keys = range->KeysLeft();
+  assert(before_keys > 0);  // Caller should ensure
+  if (rep_->whole_key_filtering) {
+    filter->KeysMayMatch(range, no_io, lookup_context, rate_limiter_priority);
+    uint64_t after_keys = range->KeysLeft();
+    if (after_keys) {
+      RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys);
+      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys,
+                                rep_->level);
+    }
+    uint64_t filtered_keys = before_keys - after_keys;
+    if (filtered_keys) {
+      RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL, filtered_keys);
+      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys,
+                                rep_->level);
+    }
+  } else if (!PrefixExtractorChanged(prefix_extractor)) {
+    // FIXME ^^^: there should be no reason for MultiGet() to depend on current
+    // prefix_extractor at all. It should always use table_prefix_extractor.
+    filter->PrefixesMayMatch(range, prefix_extractor, false, lookup_context,
+                             rate_limiter_priority);
+    RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys);
+    uint64_t after_keys = range->KeysLeft();
+    uint64_t filtered_keys = before_keys - after_keys;
+    if (filtered_keys) {
+      RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL,
+                 filtered_keys);
+    }
+  }
+}
+
+Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options,
+                                              std::vector<Anchor>& anchors) {
+  // We iterator the whole index block here. More efficient implementation
+  // is possible if we push this operation into IndexReader. For example, we
+  // can directly sample from restart block entries in the index block and
+  // only read keys needed. Here we take a simple solution. Performance is
+  // likely not to be a problem. We are compacting the whole file, so all
+  // keys will be read out anyway. An extra read to index block might be
+  // a small share of the overhead. We can try to optimize if needed.
+  IndexBlockIter iiter_on_stack;
+  auto iiter = NewIndexIterator(
+      read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
+      /*get_context=*/nullptr, /*lookup_context=*/nullptr);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+  if (iiter != &iiter_on_stack) {
+    iiter_unique_ptr.reset(iiter);
+  }
+
+  // If needed the threshold could be more adaptive. For example, it can be
+  // based on size, so that a larger will be sampled to more partitions than a
+  // smaller file. The size might also need to be passed in by the caller based
+  // on total compaction size.
+  const uint64_t kMaxNumAnchors = uint64_t{128};
+  uint64_t num_blocks = this->GetTableProperties()->num_data_blocks;
+  uint64_t num_blocks_per_anchor = num_blocks / kMaxNumAnchors;
+  if (num_blocks_per_anchor == 0) {
+    num_blocks_per_anchor = 1;
+  }
+
+  uint64_t count = 0;
+  std::string last_key;
+  uint64_t range_size = 0;
+  uint64_t prev_offset = 0;
+  for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) {
+    const BlockHandle& bh = iiter->value().handle;
+    range_size += bh.offset() + bh.size() - prev_offset;
+    prev_offset = bh.offset() + bh.size();
+    if (++count % num_blocks_per_anchor == 0) {
+      count = 0;
+      anchors.emplace_back(iiter->user_key(), range_size);
+      range_size = 0;
+    } else {
+      last_key = iiter->user_key().ToString();
+    }
+  }
+  if (count != 0) {
+    anchors.emplace_back(last_key, range_size);
+  }
+  return Status::OK();
+}
+
+Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
+                            GetContext* get_context,
+                            const SliceTransform* prefix_extractor,
+                            bool skip_filters) {
+  assert(key.size() >= 8);  // key must be internal key
+  assert(get_context != nullptr);
+  Status s;
+  const bool no_io = read_options.read_tier == kBlockCacheTier;
+
+  FilterBlockReader* const filter =
+      !skip_filters ? rep_->filter.get() : nullptr;
+
+  // First check the full filter
+  // If full filter not useful, Then go into each block
+  uint64_t tracing_get_id = get_context->get_tracing_get_id();
+  BlockCacheLookupContext lookup_context{
+      TableReaderCaller::kUserGet, tracing_get_id,
+      /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
+  if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
+    // Trace the key since it contains both user key and sequence number.
+    lookup_context.referenced_key = key.ToString();
+    lookup_context.get_from_user_specified_snapshot =
+        read_options.snapshot != nullptr;
+  }
+  TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch");
+  const bool may_match = FullFilterKeyMayMatch(
+      filter, key, no_io, prefix_extractor, get_context, &lookup_context,
+      read_options.rate_limiter_priority);
+  TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch");
+  if (!may_match) {
+    RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL);
+    PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
+  } else {
+    IndexBlockIter iiter_on_stack;
+    // if prefix_extractor found in block differs from options, disable
+    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
+    bool need_upper_bound_check = false;
+    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
+      need_upper_bound_check = PrefixExtractorChanged(prefix_extractor);
+    }
+    auto iiter =
+        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+                         get_context, &lookup_context);
+    std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+    if (iiter != &iiter_on_stack) {
+      iiter_unique_ptr.reset(iiter);
+    }
+
+    size_t ts_sz =
+        rep_->internal_comparator.user_comparator()->timestamp_size();
+    bool matched = false;  // if such user key matched a key in SST
+    bool done = false;
+    for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
+      IndexValue v = iiter->value();
+
+      if (!v.first_internal_key.empty() && !skip_filters &&
+          UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+                  .CompareWithoutTimestamp(
+                      ExtractUserKey(key),
+                      ExtractUserKey(v.first_internal_key)) < 0) {
+        // The requested key falls between highest key in previous block and
+        // lowest key in current block.
+        break;
+      }
+
+      BlockCacheLookupContext lookup_data_block_context{
+          TableReaderCaller::kUserGet, tracing_get_id,
+          /*get_from_user_specified_snapshot=*/read_options.snapshot !=
+              nullptr};
+      bool does_referenced_key_exist = false;
+      DataBlockIter biter;
+      uint64_t referenced_data_size = 0;
+      Status tmp_status;
+      NewDataBlockIterator<DataBlockIter>(
+          read_options, v.handle, &biter, BlockType::kData, get_context,
+          &lookup_data_block_context, /*prefetch_buffer=*/nullptr,
+          /*for_compaction=*/false, /*async_read=*/false, tmp_status);
+
+      if (no_io && biter.status().IsIncomplete()) {
+        // couldn't get block from block_cache
+        // Update Saver.state to Found because we are only looking for
+        // whether we can guarantee the key is not there when "no_io" is set
+        get_context->MarkKeyMayExist();
+        s = biter.status();
+        break;
+      }
+      if (!biter.status().ok()) {
+        s = biter.status();
+        break;
+      }
+
+      bool may_exist = biter.SeekForGet(key);
+      // If user-specified timestamp is supported, we cannot end the search
+      // just because hash index lookup indicates the key+ts does not exist.
+      if (!may_exist && ts_sz == 0) {
+        // HashSeek cannot find the key this block and the the iter is not
+        // the end of the block, i.e. cannot be in the following blocks
+        // either. In this case, the seek_key cannot be found, so we break
+        // from the top level for-loop.
+        done = true;
+      } else {
+        // Call the *saver function on each entry/block until it returns false
+        for (; biter.Valid(); biter.Next()) {
+          ParsedInternalKey parsed_key;
+          Status pik_status = ParseInternalKey(
+              biter.key(), &parsed_key, false /* log_err_key */);  // TODO
+          if (!pik_status.ok()) {
+            s = pik_status;
+          }
+
+          if (!get_context->SaveValue(
+                  parsed_key, biter.value(), &matched,
+                  biter.IsValuePinned() ? &biter : nullptr)) {
+            if (get_context->State() == GetContext::GetState::kFound) {
+              does_referenced_key_exist = true;
+              referenced_data_size = biter.key().size() + biter.value().size();
+            }
+            done = true;
+            break;
+          }
+        }
+        s = biter.status();
+      }
+      // Write the block cache access record.
+      if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
+        // Avoid making copy of block_key, cf_name, and referenced_key when
+        // constructing the access record.
+        Slice referenced_key;
+        if (does_referenced_key_exist) {
+          referenced_key = biter.key();
+        } else {
+          referenced_key = key;
+        }
+        BlockCacheTraceRecord access_record(
+            rep_->ioptions.clock->NowMicros(),
+            /*block_key=*/"", lookup_data_block_context.block_type,
+            lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
+            /*cf_name=*/"", rep_->level_for_tracing(),
+            rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
+            lookup_data_block_context.is_cache_hit,
+            lookup_data_block_context.no_insert,
+            lookup_data_block_context.get_id,
+            lookup_data_block_context.get_from_user_specified_snapshot,
+            /*referenced_key=*/"", referenced_data_size,
+            lookup_data_block_context.num_keys_in_block,
+            does_referenced_key_exist);
+        // TODO: Should handle status here?
+        block_cache_tracer_
+            ->WriteBlockAccess(access_record,
+                               lookup_data_block_context.block_key,
+                               rep_->cf_name_for_tracing(), referenced_key)
+            .PermitUncheckedError();
+      }
+
+      if (done) {
+        // Avoid the extra Next which is expensive in two-level indexes
+        break;
+      }
+    }
+    if (matched && filter != nullptr) {
+      RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE);
+      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
+                                rep_->level);
+    }
+    if (s.ok() && !iiter->status().IsNotFound()) {
+      s = iiter->status();
+    }
+  }
+
+  return s;
+}
+
+Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options,
+                                       const SliceTransform* prefix_extractor,
+                                       MultiGetRange* mget_range) {
+  if (mget_range->empty()) {
+    // Caller should ensure non-empty (performance bug)
+    assert(false);
+    return Status::OK();  // Nothing to do
+  }
+
+  FilterBlockReader* const filter = rep_->filter.get();
+  if (!filter) {
+    return Status::OK();
+  }
+
+  // First check the full filter
+  // If full filter not useful, Then go into each block
+  const bool no_io = read_options.read_tier == kBlockCacheTier;
+  uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+  if (mget_range->begin()->get_context) {
+    tracing_mget_id = mget_range->begin()->get_context->get_tracing_get_id();
+  }
+  BlockCacheLookupContext lookup_context{
+      TableReaderCaller::kUserMultiGet, tracing_mget_id,
+      /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
+  FullFilterKeysMayMatch(filter, mget_range, no_io, prefix_extractor,
+                         &lookup_context, read_options.rate_limiter_priority);
+
+  return Status::OK();
+}
+
+Status BlockBasedTable::Prefetch(const Slice* const begin,
+                                 const Slice* const end) {
+  auto& comparator = rep_->internal_comparator;
+  UserComparatorWrapper user_comparator(comparator.user_comparator());
+  // pre-condition
+  if (begin && end && comparator.Compare(*begin, *end) > 0) {
+    return Status::InvalidArgument(*begin, *end);
+  }
+  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
+  IndexBlockIter iiter_on_stack;
+  auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                                &iiter_on_stack, /*get_context=*/nullptr,
+                                &lookup_context);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+  if (iiter != &iiter_on_stack) {
+    iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
+  }
+
+  if (!iiter->status().ok()) {
+    // error opening index iterator
+    return iiter->status();
+  }
+
+  // indicates if we are on the last page that need to be pre-fetched
+  bool prefetching_boundary_page = false;
+
+  for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
+       iiter->Next()) {
+    BlockHandle block_handle = iiter->value().handle;
+    const bool is_user_key = !rep_->index_key_includes_seq;
+    if (end &&
+        ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
+         (is_user_key &&
+          user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
+      if (prefetching_boundary_page) {
+        break;
+      }
+
+      // The index entry represents the last key in the data block.
+      // We should load this page into memory as well, but no more
+      prefetching_boundary_page = true;
+    }
+
+    // Load the block specified by the block_handle into the block cache
+    DataBlockIter biter;
+    Status tmp_status;
+    NewDataBlockIterator<DataBlockIter>(
+        ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
+        /*get_context=*/nullptr, &lookup_context,
+        /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
+        /*async_read=*/false, tmp_status);
+
+    if (!biter.status().ok()) {
+      // there was an unexpected error while pre-fetching
+      return biter.status();
+    }
+  }
+
+  return Status::OK();
+}
+
+Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
+                                       TableReaderCaller caller) {
+  Status s;
+  // Check Meta blocks
+  std::unique_ptr<Block> metaindex;
+  std::unique_ptr<InternalIterator> metaindex_iter;
+  ReadOptions ro;
+  s = ReadMetaIndexBlock(ro, nullptr /* prefetch buffer */, &metaindex,
+                         &metaindex_iter);
+  if (s.ok()) {
+    s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    return s;
+  }
+  // Check Data blocks
+  IndexBlockIter iiter_on_stack;
+  BlockCacheLookupContext context{caller};
+  InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
+      read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
+      /*get_context=*/nullptr, &context);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+  if (iiter != &iiter_on_stack) {
+    iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
+  }
+  if (!iiter->status().ok()) {
+    // error opening index iterator
+    return iiter->status();
+  }
+  s = VerifyChecksumInBlocks(read_options, iiter);
+  return s;
+}
+
+Status BlockBasedTable::VerifyChecksumInBlocks(
+    const ReadOptions& read_options,
+    InternalIteratorBase<IndexValue>* index_iter) {
+  Status s;
+  // We are scanning the whole file, so no need to do exponential
+  // increasing of the buffer size.
+  size_t readahead_size = (read_options.readahead_size != 0)
+                              ? read_options.readahead_size
+                              : rep_->table_options.max_auto_readahead_size;
+  // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
+  // needed there.
+  FilePrefetchBuffer prefetch_buffer(
+      readahead_size /* readahead_size */,
+      readahead_size /* max_readahead_size */,
+      !rep_->ioptions.allow_mmap_reads /* enable */);
+
+  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
+    s = index_iter->status();
+    if (!s.ok()) {
+      break;
+    }
+    BlockHandle handle = index_iter->value().handle;
+    BlockContents contents;
+    BlockFetcher block_fetcher(
+        rep_->file.get(), &prefetch_buffer, rep_->footer, read_options, handle,
+        &contents, rep_->ioptions, false /* decompress */,
+        false /*maybe_compressed*/, BlockType::kData,
+        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
+    s = block_fetcher.ReadBlockContents();
+    if (!s.ok()) {
+      break;
+    }
+  }
+  if (s.ok()) {
+    // In the case of two level indexes, we would have exited the above loop
+    // by checking index_iter->Valid(), but Valid() might have returned false
+    // due to an IO error. So check the index_iter status
+    s = index_iter->status();
+  }
+  return s;
+}
+
+BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
+    const Slice& meta_block_name) {
+  if (meta_block_name.starts_with(kFullFilterBlockPrefix)) {
+    return BlockType::kFilter;
+  }
+
+  if (meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
+    return BlockType::kFilterPartitionIndex;
+  }
+
+  if (meta_block_name == kPropertiesBlockName) {
+    return BlockType::kProperties;
+  }
+
+  if (meta_block_name == kCompressionDictBlockName) {
+    return BlockType::kCompressionDictionary;
+  }
+
+  if (meta_block_name == kRangeDelBlockName) {
+    return BlockType::kRangeDeletion;
+  }
+
+  if (meta_block_name == kHashIndexPrefixesBlock) {
+    return BlockType::kHashIndexPrefixes;
+  }
+
+  if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
+    return BlockType::kHashIndexMetadata;
+  }
+
+  if (meta_block_name.starts_with(kObsoleteFilterBlockPrefix)) {
+    // Obsolete but possible in old files
+    return BlockType::kInvalid;
+  }
+
+  assert(false);
+  return BlockType::kInvalid;
+}
+
+Status BlockBasedTable::VerifyChecksumInMetaBlocks(
+    InternalIteratorBase<Slice>* index_iter) {
+  Status s;
+  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
+    s = index_iter->status();
+    if (!s.ok()) {
+      break;
+    }
+    BlockHandle handle;
+    Slice input = index_iter->value();
+    s = handle.DecodeFrom(&input);
+    BlockContents contents;
+    const Slice meta_block_name = index_iter->key();
+    if (meta_block_name == kPropertiesBlockName) {
+      // Unfortunate special handling for properties block checksum w/
+      // global seqno
+      std::unique_ptr<TableProperties> table_properties;
+      s = ReadTablePropertiesHelper(ReadOptions(), handle, rep_->file.get(),
+                                    nullptr /* prefetch_buffer */, rep_->footer,
+                                    rep_->ioptions, &table_properties,
+                                    nullptr /* memory_allocator */);
+    } else {
+      s = BlockFetcher(
+              rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
+              ReadOptions(), handle, &contents, rep_->ioptions,
+              false /* decompress */, false /*maybe_compressed*/,
+              GetBlockTypeForMetaBlockByName(meta_block_name),
+              UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options)
+              .ReadBlockContents();
+    }
+    if (!s.ok()) {
+      break;
+    }
+  }
+  return s;
+}
+
+bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
+  assert(rep_ != nullptr);
+
+  Cache* const cache = rep_->table_options.block_cache.get();
+  if (cache == nullptr) {
+    return false;
+  }
+
+  CacheKey key = GetCacheKey(rep_->base_cache_key, handle);
+
+  Cache::Handle* const cache_handle = cache->Lookup(key.AsSlice());
+  if (cache_handle == nullptr) {
+    return false;
+  }
+
+  cache->Release(cache_handle);
+
+  return true;
+}
+
+bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
+                                      const Slice& key) {
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
+      options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+      /*get_context=*/nullptr, /*lookup_context=*/nullptr));
+  iiter->Seek(key);
+  assert(iiter->Valid());
+
+  return TEST_BlockInCache(iiter->value().handle);
+}
+
+// REQUIRES: The following fields of rep_ should have already been populated:
+//  1. file
+//  2. index_handle,
+//  3. options
+//  4. internal_comparator
+//  5. index_type
+Status BlockBasedTable::CreateIndexReader(
+    const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
+    InternalIterator* meta_iter, bool use_cache, bool prefetch, bool pin,
+    BlockCacheLookupContext* lookup_context,
+    std::unique_ptr<IndexReader>* index_reader) {
+  switch (rep_->index_type) {
+    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
+      return PartitionIndexReader::Create(this, ro, prefetch_buffer, use_cache,
+                                          prefetch, pin, lookup_context,
+                                          index_reader);
+    }
+    case BlockBasedTableOptions::kBinarySearch:
+      FALLTHROUGH_INTENDED;
+    case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
+      return BinarySearchIndexReader::Create(this, ro, prefetch_buffer,
+                                             use_cache, prefetch, pin,
+                                             lookup_context, index_reader);
+    }
+    case BlockBasedTableOptions::kHashSearch: {
+      if (!rep_->table_prefix_extractor) {
+        ROCKS_LOG_WARN(rep_->ioptions.logger,
+                       "Missing prefix extractor for hash index. Fall back to"
+                       " binary search index.");
+        return BinarySearchIndexReader::Create(this, ro, prefetch_buffer,
+                                               use_cache, prefetch, pin,
+                                               lookup_context, index_reader);
+      } else {
+        return HashIndexReader::Create(this, ro, prefetch_buffer, meta_iter,
+                                       use_cache, prefetch, pin, lookup_context,
+                                       index_reader);
+      }
+    }
+    default: {
+      std::string error_message =
+          "Unrecognized index type: " + std::to_string(rep_->index_type);
+      return Status::InvalidArgument(error_message.c_str());
+    }
+  }
+}
+
+uint64_t BlockBasedTable::ApproximateDataOffsetOf(
+    const InternalIteratorBase<IndexValue>& index_iter,
+    uint64_t data_size) const {
+  assert(index_iter.status().ok());
+  if (index_iter.Valid()) {
+    BlockHandle handle = index_iter.value().handle;
+    return handle.offset();
+  } else {
+    // The iterator is past the last key in the file.
+    return data_size;
+  }
+}
+
+uint64_t BlockBasedTable::GetApproximateDataSize() {
+  // Should be in table properties unless super old version
+  if (rep_->table_properties) {
+    return rep_->table_properties->data_size;
+  }
+  // Fall back to rough estimate from footer
+  return rep_->footer.metaindex_handle().offset();
+}
+
+uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
+                                              TableReaderCaller caller) {
+  uint64_t data_size = GetApproximateDataSize();
+  if (UNLIKELY(data_size == 0)) {
+    // Hmm. Let's just split in half to avoid skewing one way or another,
+    // since we don't know whether we're operating on lower bound or
+    // upper bound.
+    return rep_->file_size / 2;
+  }
+
+  BlockCacheLookupContext context(caller);
+  IndexBlockIter iiter_on_stack;
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  auto index_iter =
+      NewIndexIterator(ro, /*disable_prefix_seek=*/true,
+                       /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
+                       /*lookup_context=*/&context);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+  if (index_iter != &iiter_on_stack) {
+    iiter_unique_ptr.reset(index_iter);
+  }
+
+  index_iter->Seek(key);
+  uint64_t offset;
+  if (index_iter->status().ok()) {
+    offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  } else {
+    // Split in half to avoid skewing one way or another,
+    // since we don't know whether we're operating on lower bound or
+    // upper bound.
+    return rep_->file_size / 2;
+  }
+
+  // Pro-rate file metadata (incl filters) size-proportionally across data
+  // blocks.
+  double size_ratio =
+      static_cast<double>(offset) / static_cast<double>(data_size);
+  return static_cast<uint64_t>(size_ratio *
+                               static_cast<double>(rep_->file_size));
+}
+
+uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
+                                          TableReaderCaller caller) {
+  assert(rep_->internal_comparator.Compare(start, end) <= 0);
+
+  uint64_t data_size = GetApproximateDataSize();
+  if (UNLIKELY(data_size == 0)) {
+    // Hmm. Assume whole file is involved, since we have lower and upper
+    // bound. This likely skews the estimate if we consider that this function
+    // is typically called with `[start, end]` fully contained in the file's
+    // key-range.
+    return rep_->file_size;
+  }
+
+  BlockCacheLookupContext context(caller);
+  IndexBlockIter iiter_on_stack;
+  ReadOptions ro;
+  ro.total_order_seek = true;
+  auto index_iter =
+      NewIndexIterator(ro, /*disable_prefix_seek=*/true,
+                       /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
+                       /*lookup_context=*/&context);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+  if (index_iter != &iiter_on_stack) {
+    iiter_unique_ptr.reset(index_iter);
+  }
+
+  index_iter->Seek(start);
+  uint64_t start_offset;
+  if (index_iter->status().ok()) {
+    start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  } else {
+    // Assume file is involved from the start. This likely skews the estimate
+    // but is consistent with the above error handling.
+    start_offset = 0;
+  }
+
+  index_iter->Seek(end);
+  uint64_t end_offset;
+  if (index_iter->status().ok()) {
+    end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  } else {
+    // Assume file is involved until the end. This likely skews the estimate
+    // but is consistent with the above error handling.
+    end_offset = data_size;
+  }
+
+  assert(end_offset >= start_offset);
+  // Pro-rate file metadata (incl filters) size-proportionally across data
+  // blocks.
+  double size_ratio = static_cast<double>(end_offset - start_offset) /
+                      static_cast<double>(data_size);
+  return static_cast<uint64_t>(size_ratio *
+                               static_cast<double>(rep_->file_size));
+}
+
+bool BlockBasedTable::TEST_FilterBlockInCache() const {
+  assert(rep_ != nullptr);
+  return rep_->filter_type != Rep::FilterType::kNoFilter &&
+         TEST_BlockInCache(rep_->filter_handle);
+}
+
+bool BlockBasedTable::TEST_IndexBlockInCache() const {
+  assert(rep_ != nullptr);
+
+  return TEST_BlockInCache(rep_->footer.index_handle());
+}
+
+Status BlockBasedTable::GetKVPairsFromDataBlocks(
+    std::vector<KVPairBlock>* kv_pair_blocks) {
+  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_contex=*/nullptr));
+
+  Status s = blockhandles_iter->status();
+  if (!s.ok()) {
+    // Cannot read Index Block
+    return s;
+  }
+
+  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
+       blockhandles_iter->Next()) {
+    s = blockhandles_iter->status();
+
+    if (!s.ok()) {
+      break;
+    }
+
+    std::unique_ptr<InternalIterator> datablock_iter;
+    Status tmp_status;
+    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
+        ReadOptions(), blockhandles_iter->value().handle,
+        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
+        /*get_context=*/nullptr, /*lookup_context=*/nullptr,
+        /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
+        /*async_read=*/false, tmp_status));
+    s = datablock_iter->status();
+
+    if (!s.ok()) {
+      // Error reading the block - Skipped
+      continue;
+    }
+
+    KVPairBlock kv_pair_block;
+    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
+         datablock_iter->Next()) {
+      s = datablock_iter->status();
+      if (!s.ok()) {
+        // Error reading the block - Skipped
+        break;
+      }
+      const Slice& key = datablock_iter->key();
+      const Slice& value = datablock_iter->value();
+      std::string key_copy = std::string(key.data(), key.size());
+      std::string value_copy = std::string(value.data(), value.size());
+
+      kv_pair_block.push_back(
+          std::make_pair(std::move(key_copy), std::move(value_copy)));
+    }
+    kv_pair_blocks->push_back(std::move(kv_pair_block));
+  }
+  return Status::OK();
+}
+
+Status BlockBasedTable::DumpTable(WritableFile* out_file) {
+  WritableFileStringStreamAdapter out_file_wrapper(out_file);
+  std::ostream out_stream(&out_file_wrapper);
+  // Output Footer
+  out_stream << "Footer Details:\n"
+                "--------------------------------------\n";
+  out_stream << "  " << rep_->footer.ToString() << "\n";
+
+  // Output MetaIndex
+  out_stream << "Metaindex Details:\n"
+                "--------------------------------------\n";
+  std::unique_ptr<Block> metaindex;
+  std::unique_ptr<InternalIterator> metaindex_iter;
+  ReadOptions ro;
+  Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex,
+                                &metaindex_iter);
+  if (s.ok()) {
+    for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
+         metaindex_iter->Next()) {
+      s = metaindex_iter->status();
+      if (!s.ok()) {
+        return s;
+      }
+      if (metaindex_iter->key() == kPropertiesBlockName) {
+        out_stream << "  Properties block handle: "
+                   << metaindex_iter->value().ToString(true) << "\n";
+      } else if (metaindex_iter->key() == kCompressionDictBlockName) {
+        out_stream << "  Compression dictionary block handle: "
+                   << metaindex_iter->value().ToString(true) << "\n";
+      } else if (strstr(metaindex_iter->key().ToString().c_str(),
+                        "filter.rocksdb.") != nullptr) {
+        out_stream << "  Filter block handle: "
+                   << metaindex_iter->value().ToString(true) << "\n";
+      } else if (metaindex_iter->key() == kRangeDelBlockName) {
+        out_stream << "  Range deletion block handle: "
+                   << metaindex_iter->value().ToString(true) << "\n";
+      }
+    }
+    out_stream << "\n";
+  } else {
+    return s;
+  }
+
+  // Output TableProperties
+  const ROCKSDB_NAMESPACE::TableProperties* table_properties;
+  table_properties = rep_->table_properties.get();
+
+  if (table_properties != nullptr) {
+    out_stream << "Table Properties:\n"
+                  "--------------------------------------\n";
+    out_stream << "  " << table_properties->ToString("\n  ", ": ") << "\n";
+  }
+
+  if (rep_->filter) {
+    out_stream << "Filter Details:\n"
+                  "--------------------------------------\n";
+    out_stream << "  " << rep_->filter->ToString() << "\n";
+  }
+
+  // Output Index block
+  s = DumpIndexBlock(out_stream);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Output compression dictionary
+  if (rep_->uncompression_dict_reader) {
+    CachableEntry<UncompressionDict> uncompression_dict;
+    s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+        nullptr /* prefetch_buffer */, false /* no_io */,
+        false, /* verify_checksums */
+        nullptr /* get_context */, nullptr /* lookup_context */,
+        &uncompression_dict);
+    if (!s.ok()) {
+      return s;
+    }
+
+    assert(uncompression_dict.GetValue());
+
+    const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
+    out_stream << "Compression Dictionary:\n"
+                  "--------------------------------------\n";
+    out_stream << "  size (bytes): " << raw_dict.size() << "\n\n";
+    out_stream << "  HEX    " << raw_dict.ToString(true) << "\n\n";
+  }
+
+  // Output range deletions block
+  auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
+  if (range_del_iter != nullptr) {
+    range_del_iter->SeekToFirst();
+    if (range_del_iter->Valid()) {
+      out_stream << "Range deletions:\n"
+                    "--------------------------------------\n";
+      for (; range_del_iter->Valid(); range_del_iter->Next()) {
+        DumpKeyValue(range_del_iter->key(), range_del_iter->value(),
+                     out_stream);
+      }
+      out_stream << "\n";
+    }
+    delete range_del_iter;
+  }
+  // Output Data blocks
+  s = DumpDataBlocks(out_stream);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!out_stream.good()) {
+    return Status::IOError("Failed to write to output file");
+  }
+  return Status::OK();
+}
+
+Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) {
+  out_stream << "Index Details:\n"
+                "--------------------------------------\n";
+  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_contex=*/nullptr));
+  Status s = blockhandles_iter->status();
+  if (!s.ok()) {
+    out_stream << "Can not read Index Block \n\n";
+    return s;
+  }
+
+  out_stream << "  Block key hex dump: Data block handle\n";
+  out_stream << "  Block key ascii\n\n";
+  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
+       blockhandles_iter->Next()) {
+    s = blockhandles_iter->status();
+    if (!s.ok()) {
+      break;
+    }
+    Slice key = blockhandles_iter->key();
+    Slice user_key;
+    InternalKey ikey;
+    if (!rep_->index_key_includes_seq) {
+      user_key = key;
+    } else {
+      ikey.DecodeFrom(key);
+      user_key = ikey.user_key();
+    }
+
+    out_stream << "  HEX    " << user_key.ToString(true) << ": "
+               << blockhandles_iter->value().ToString(true,
+                                                      rep_->index_has_first_key)
+               << " offset " << blockhandles_iter->value().handle.offset()
+               << " size " << blockhandles_iter->value().handle.size() << "\n";
+
+    std::string str_key = user_key.ToString();
+    std::string res_key("");
+    char cspace = ' ';
+    for (size_t i = 0; i < str_key.size(); i++) {
+      res_key.append(&str_key[i], 1);
+      res_key.append(1, cspace);
+    }
+    out_stream << "  ASCII  " << res_key << "\n";
+    out_stream << "  ------\n";
+  }
+  out_stream << "\n";
+  return Status::OK();
+}
+
+Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
+  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
+      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
+                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
+                       /*lookup_contex=*/nullptr));
+  Status s = blockhandles_iter->status();
+  if (!s.ok()) {
+    out_stream << "Can not read Index Block \n\n";
+    return s;
+  }
+
+  uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
+  uint64_t datablock_size_max = 0;
+  uint64_t datablock_size_sum = 0;
+
+  size_t block_id = 1;
+  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
+       block_id++, blockhandles_iter->Next()) {
+    s = blockhandles_iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    BlockHandle bh = blockhandles_iter->value().handle;
+    uint64_t datablock_size = bh.size();
+    datablock_size_min = std::min(datablock_size_min, datablock_size);
+    datablock_size_max = std::max(datablock_size_max, datablock_size);
+    datablock_size_sum += datablock_size;
+
+    out_stream << "Data Block # " << block_id << " @ "
+               << blockhandles_iter->value().handle.ToString(true) << "\n";
+    out_stream << "--------------------------------------\n";
+
+    std::unique_ptr<InternalIterator> datablock_iter;
+    Status tmp_status;
+    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
+        ReadOptions(), blockhandles_iter->value().handle,
+        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
+        /*get_context=*/nullptr, /*lookup_context=*/nullptr,
+        /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
+        /*async_read=*/false, tmp_status));
+    s = datablock_iter->status();
+
+    if (!s.ok()) {
+      out_stream << "Error reading the block - Skipped \n\n";
+      continue;
+    }
+
+    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
+         datablock_iter->Next()) {
+      s = datablock_iter->status();
+      if (!s.ok()) {
+        out_stream << "Error reading the block - Skipped \n";
+        break;
+      }
+      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream);
+    }
+    out_stream << "\n";
+  }
+
+  uint64_t num_datablocks = block_id - 1;
+  if (num_datablocks) {
+    double datablock_size_avg =
+        static_cast<double>(datablock_size_sum) / num_datablocks;
+    out_stream << "Data Block Summary:\n";
+    out_stream << "--------------------------------------\n";
+    out_stream << "  # data blocks: " << num_datablocks << "\n";
+    out_stream << "  min data block size: " << datablock_size_min << "\n";
+    out_stream << "  max data block size: " << datablock_size_max << "\n";
+    out_stream << "  avg data block size: "
+               << std::to_string(datablock_size_avg) << "\n";
+  }
+
+  return Status::OK();
+}
+
+void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
+                                   std::ostream& out_stream) {
+  InternalKey ikey;
+  ikey.DecodeFrom(key);
+
+  out_stream << "  HEX    " << ikey.user_key().ToString(true) << ": "
+             << value.ToString(true) << "\n";
+
+  std::string str_key = ikey.user_key().ToString();
+  std::string str_value = value.ToString();
+  std::string res_key(""), res_value("");
+  char cspace = ' ';
+  for (size_t i = 0; i < str_key.size(); i++) {
+    if (str_key[i] == '\0') {
+      res_key.append("\\0", 2);
+    } else {
+      res_key.append(&str_key[i], 1);
+    }
+    res_key.append(1, cspace);
+  }
+  for (size_t i = 0; i < str_value.size(); i++) {
+    if (str_value[i] == '\0') {
+      res_value.append("\\0", 2);
+    } else {
+      res_value.append(&str_value[i], 1);
+    }
+    res_value.append(1, cspace);
+  }
+
+  out_stream << "  ASCII  " << res_key << ": " << res_value << "\n";
+  out_stream << "  ------\n";
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_reader.h b/src/rocksdb/table/block_based/block_based_table_reader.h
new file mode 100644
index 000000000..89de891c9
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_reader.h
@@ -0,0 +1,739 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
+#include "cache/cache_reservation_manager.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "file/filename.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/filter_block.h"
+#include "table/block_based/uncompression_dict_reader.h"
+#include "table/format.h"
+#include "table/persistent_cache_options.h"
+#include "table/table_properties_internal.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coro_utils.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+class FilterBlockReader;
+class FullFilterBlockReader;
+class Footer;
+class InternalKeyComparator;
+class Iterator;
+class FSRandomAccessFile;
+class TableCache;
+class TableReader;
+class WritableFile;
+struct BlockBasedTableOptions;
+struct EnvOptions;
+struct ReadOptions;
+class GetContext;
+
+using KVPairBlock = std::vector<std::pair<std::string, std::string>>;
+
+// Reader class for BlockBasedTable format.
+// For the format of BlockBasedTable refer to
+// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
+// This is the default table type. Data is chucked into fixed size blocks and
+// each block in-turn stores entries. When storing data, we can compress and/or
+// encode data efficiently within a block, which often results in a much smaller
+// data size compared with the raw data size. As for the record retrieval, we'll
+// first locate the block where target record may reside, then read the block to
+// memory, and finally search that record within the block. Of course, to avoid
+// frequent reads of the same block, we introduced the block cache to keep the
+// loaded blocks in the memory.
+class BlockBasedTable : public TableReader {
+ public:
+  static const std::string kObsoleteFilterBlockPrefix;
+  static const std::string kFullFilterBlockPrefix;
+  static const std::string kPartitionedFilterBlockPrefix;
+
+  // 1-byte compression type + 32-bit checksum
+  static constexpr size_t kBlockTrailerSize = 5;
+
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table_reader" to the newly opened
+  // table.  The client should delete "*table_reader" when no longer needed.
+  // If there was an error while initializing the table, sets "*table_reader"
+  // to nullptr and returns a non-ok status.
+  //
+  // @param file must remain live while this Table is in use.
+  // @param prefetch_index_and_filter_in_cache can be used to disable
+  // prefetching of
+  //    index and filter blocks into block cache at startup
+  // @param skip_filters Disables loading/accessing the filter block. Overrides
+  //    prefetch_index_and_filter_in_cache, so filter will be skipped if both
+  //    are set.
+  // @param force_direct_prefetch if true, always prefetching to RocksDB
+  //    buffer, rather than calling RandomAccessFile::Prefetch().
+  static Status Open(
+      const ReadOptions& ro, const ImmutableOptions& ioptions,
+      const EnvOptions& env_options,
+      const BlockBasedTableOptions& table_options,
+      const InternalKeyComparator& internal_key_comparator,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
+          nullptr,
+      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false,
+      int level = -1, const bool immortal_table = false,
+      const SequenceNumber largest_seqno = 0,
+      bool force_direct_prefetch = false,
+      TailPrefetchStats* tail_prefetch_stats = nullptr,
+      BlockCacheTracer* const block_cache_tracer = nullptr,
+      size_t max_file_size_for_l0_meta_pin = 0,
+      const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0,
+      UniqueId64x2 expected_unique_id = {});
+
+  bool PrefixRangeMayMatch(const Slice& internal_key,
+                           const ReadOptions& read_options,
+                           const SliceTransform* options_prefix_extractor,
+                           const bool need_upper_bound_check,
+                           BlockCacheLookupContext* lookup_context) const;
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  // @param read_options Must outlive the returned iterator.
+  // @param skip_filters Disables loading/accessing the filter block
+  // compaction_readahead_size: its value will only be used if caller =
+  // kCompaction.
+  InternalIterator* NewIterator(const ReadOptions&,
+                                const SliceTransform* prefix_extractor,
+                                Arena* arena, bool skip_filters,
+                                TableReaderCaller caller,
+                                size_t compaction_readahead_size = 0,
+                                bool allow_unprepared_value = false) override;
+
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+      const ReadOptions& read_options) override;
+
+  // @param skip_filters Disables loading/accessing the filter block
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             GetContext* get_context, const SliceTransform* prefix_extractor,
+             bool skip_filters = false) override;
+
+  Status MultiGetFilter(const ReadOptions& read_options,
+                        const SliceTransform* prefix_extractor,
+                        MultiGetRange* mget_range) override;
+
+  DECLARE_SYNC_AND_ASYNC_OVERRIDE(void, MultiGet,
+                                  const ReadOptions& readOptions,
+                                  const MultiGetContext::Range* mget_range,
+                                  const SliceTransform* prefix_extractor,
+                                  bool skip_filters = false);
+
+  // Pre-fetch the disk blocks that correspond to the key range specified by
+  // (kbegin, kend). The call will return error status in the event of
+  // IO or iteration error.
+  Status Prefetch(const Slice* begin, const Slice* end) override;
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file). The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  uint64_t ApproximateOffsetOf(const Slice& key,
+                               TableReaderCaller caller) override;
+
+  // Given start and end keys, return the approximate data size in the file
+  // between the keys. The returned value is in terms of file bytes, and so
+  // includes effects like compression of the underlying data.
+  // The start key must not be greater than the end key.
+  uint64_t ApproximateSize(const Slice& start, const Slice& end,
+                           TableReaderCaller caller) override;
+
+  Status ApproximateKeyAnchors(const ReadOptions& read_options,
+                               std::vector<Anchor>& anchors) override;
+
+  bool TEST_BlockInCache(const BlockHandle& handle) const;
+
+  // Returns true if the block for the specified key is in cache.
+  // REQUIRES: key is in this table && block cache enabled
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+  size_t ApproximateMemoryUsage() const override;
+
+  // convert SST file to a human readable form
+  Status DumpTable(WritableFile* out_file) override;
+
+  Status VerifyChecksum(const ReadOptions& readOptions,
+                        TableReaderCaller caller) override;
+
+  ~BlockBasedTable();
+
+  bool TEST_FilterBlockInCache() const;
+  bool TEST_IndexBlockInCache() const;
+
+  // IndexReader is the interface that provides the functionality for index
+  // access.
+  class IndexReader {
+   public:
+    virtual ~IndexReader() = default;
+
+    // Create an iterator for index access. If iter is null, then a new object
+    // is created on the heap, and the callee will have the ownership.
+    // If a non-null iter is passed in, it will be used, and the returned value
+    // is either the same as iter or a new on-heap object that
+    // wraps the passed iter. In the latter case the return value points
+    // to a different object then iter, and the callee has the ownership of the
+    // returned object.
+    virtual InternalIteratorBase<IndexValue>* NewIterator(
+        const ReadOptions& read_options, bool disable_prefix_seek,
+        IndexBlockIter* iter, GetContext* get_context,
+        BlockCacheLookupContext* lookup_context) = 0;
+
+    // Report an approximation of how much memory has been used other than
+    // memory that was allocated in block cache.
+    virtual size_t ApproximateMemoryUsage() const = 0;
+    // Cache the dependencies of the index reader (e.g. the partitions
+    // of a partitioned index).
+    virtual Status CacheDependencies(const ReadOptions& /*ro*/,
+                                     bool /* pin */) {
+      return Status::OK();
+    }
+  };
+
+  class IndexReaderCommon;
+
+  static void SetupBaseCacheKey(const TableProperties* properties,
+                                const std::string& cur_db_session_id,
+                                uint64_t cur_file_number,
+                                OffsetableCacheKey* out_base_cache_key,
+                                bool* out_is_stable = nullptr);
+
+  static CacheKey GetCacheKey(const OffsetableCacheKey& base_cache_key,
+                              const BlockHandle& handle);
+
+  static void UpdateCacheInsertionMetrics(BlockType block_type,
+                                          GetContext* get_context, size_t usage,
+                                          bool redundant,
+                                          Statistics* const statistics);
+
+  // Get the size to read from storage for a BlockHandle. size_t because we
+  // are about to load into memory.
+  static inline size_t BlockSizeWithTrailer(const BlockHandle& handle) {
+    return static_cast<size_t>(handle.size() + kBlockTrailerSize);
+  }
+
+  // It is the caller's responsibility to make sure that this is called with
+  // block-based table serialized block contents, which contains the compression
+  // byte in the trailer after `block_size`.
+  static inline CompressionType GetBlockCompressionType(const char* block_data,
+                                                        size_t block_size) {
+    return static_cast<CompressionType>(block_data[block_size]);
+  }
+  static inline CompressionType GetBlockCompressionType(
+      const BlockContents& contents) {
+    assert(contents.has_trailer);
+    return GetBlockCompressionType(contents.data.data(), contents.data.size());
+  }
+
+  // Retrieve all key value pairs from data blocks in the table.
+  // The key retrieved are internal keys.
+  Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);
+
+  struct Rep;
+
+  Rep* get_rep() { return rep_; }
+  const Rep* get_rep() const { return rep_; }
+
+  // input_iter: if it is not null, update this one and return it as Iterator
+  template <typename TBlockIter>
+  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
+                                   const BlockHandle& block_handle,
+                                   TBlockIter* input_iter, BlockType block_type,
+                                   GetContext* get_context,
+                                   BlockCacheLookupContext* lookup_context,
+                                   FilePrefetchBuffer* prefetch_buffer,
+                                   bool for_compaction, bool async_read,
+                                   Status& s) const;
+
+  // input_iter: if it is not null, update this one and return it as Iterator
+  template <typename TBlockIter>
+  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
+                                   CachableEntry<Block>& block,
+                                   TBlockIter* input_iter, Status s) const;
+
+  class PartitionedIndexIteratorState;
+
+  template <typename TBlocklike>
+  friend class FilterBlockReaderCommon;
+
+  friend class PartitionIndexReader;
+
+  friend class UncompressionDictReader;
+
+ protected:
+  Rep* rep_;
+  explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer)
+      : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
+  // No copying allowed
+  explicit BlockBasedTable(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+
+ private:
+  friend class MockedBlockBasedTable;
+  friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test;
+  BlockCacheTracer* const block_cache_tracer_;
+
+  void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
+                             size_t usage) const;
+  void UpdateCacheMissMetrics(BlockType block_type,
+                              GetContext* get_context) const;
+
+  Cache::Handle* GetEntryFromCache(const CacheTier& cache_tier,
+                                   Cache* block_cache, const Slice& key,
+                                   BlockType block_type, const bool wait,
+                                   GetContext* get_context,
+                                   const Cache::CacheItemHelper* cache_helper,
+                                   const Cache::CreateCallback& create_cb,
+                                   Cache::Priority priority) const;
+
+  template <typename TBlocklike>
+  Status InsertEntryToCache(const CacheTier& cache_tier, Cache* block_cache,
+                            const Slice& key,
+                            const Cache::CacheItemHelper* cache_helper,
+                            std::unique_ptr<TBlocklike>&& block_holder,
+                            size_t charge, Cache::Handle** cache_handle,
+                            Cache::Priority priority) const;
+
+  // Either Block::NewDataIterator() or Block::NewIndexIterator().
+  template <typename TBlockIter>
+  static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
+                                       BlockType block_type,
+                                       TBlockIter* input_iter,
+                                       bool block_contents_pinned);
+
+  // If block cache enabled (compressed or uncompressed), looks for the block
+  // identified by handle in (1) uncompressed cache, (2) compressed cache, and
+  // then (3) file. If found, inserts into the cache(s) that were searched
+  // unsuccessfully (e.g., if found in file, will add to both uncompressed and
+  // compressed caches if they're enabled).
+  //
+  // @param block_entry value is set to the uncompressed block if found. If
+  //    in uncompressed block cache, also sets cache_handle to reference that
+  //    block.
+  template <typename TBlocklike>
+  Status MaybeReadBlockAndLoadToCache(
+      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
+      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+      const bool wait, const bool for_compaction,
+      CachableEntry<TBlocklike>* block_entry, BlockType block_type,
+      GetContext* get_context, BlockCacheLookupContext* lookup_context,
+      BlockContents* contents, bool async_read) const;
+
+  // Similar to the above, with one crucial difference: it will retrieve the
+  // block from the file even if there are no caches configured (assuming the
+  // read options allow I/O).
+  template <typename TBlocklike>
+  Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
+                       const ReadOptions& ro, const BlockHandle& handle,
+                       const UncompressionDict& uncompression_dict,
+                       CachableEntry<TBlocklike>* block_entry,
+                       BlockType block_type, GetContext* get_context,
+                       BlockCacheLookupContext* lookup_context,
+                       bool for_compaction, bool use_cache, bool wait_for_cache,
+                       bool async_read) const;
+
+  DECLARE_SYNC_AND_ASYNC_CONST(
+      void, RetrieveMultipleBlocks, const ReadOptions& options,
+      const MultiGetRange* batch,
+      const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
+      autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
+      autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
+          results,
+      char* scratch, const UncompressionDict& uncompression_dict);
+
+  // Get the iterator from the index reader.
+  //
+  // If input_iter is not set, return a new Iterator.
+  // If input_iter is set, try to update it and return it as Iterator.
+  // However note that in some cases the returned iterator may be different
+  // from input_iter. In such case the returned iterator should be freed.
+  //
+  // Note: ErrorIterator with Status::Incomplete shall be returned if all the
+  // following conditions are met:
+  //  1. We enabled table_options.cache_index_and_filter_blocks.
+  //  2. index is not present in block cache.
+  //  3. We disallowed any io to be performed, that is, read_options ==
+  //     kBlockCacheTier
+  InternalIteratorBase<IndexValue>* NewIndexIterator(
+      const ReadOptions& read_options, bool need_upper_bound_check,
+      IndexBlockIter* input_iter, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context) const;
+
+  // Read block cache from block caches (if set): block_cache and
+  // block_cache_compressed.
+  // On success, Status::OK with be returned and @block will be populated with
+  // pointer to the block as well as its block handle.
+  // @param uncompression_dict Data for presetting the compression library's
+  //    dictionary.
+  template <typename TBlocklike>
+  Status GetDataBlockFromCache(const Slice& cache_key, Cache* block_cache,
+                               Cache* block_cache_compressed,
+                               const ReadOptions& read_options,
+                               CachableEntry<TBlocklike>* block,
+                               const UncompressionDict& uncompression_dict,
+                               BlockType block_type, const bool wait,
+                               GetContext* get_context) const;
+
+  // Put a maybe compressed block to the corresponding block caches.
+  // This method will perform decompression against block_contents if needed
+  // and then populate the block caches.
+  // On success, Status::OK will be returned; also @block will be populated with
+  // uncompressed block and its cache handle.
+  //
+  // Allocated memory managed by block_contents will be transferred to
+  // PutDataBlockToCache(). After the call, the object will be invalid.
+  // @param uncompression_dict Data for presetting the compression library's
+  //    dictionary.
+  template <typename TBlocklike>
+  Status PutDataBlockToCache(const Slice& cache_key, Cache* block_cache,
+                             Cache* block_cache_compressed,
+                             CachableEntry<TBlocklike>* cached_block,
+                             BlockContents&& block_contents,
+                             CompressionType block_comp_type,
+                             const UncompressionDict& uncompression_dict,
+                             MemoryAllocator* memory_allocator,
+                             BlockType block_type,
+                             GetContext* get_context) const;
+
+  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
+  // after a call to Seek(key), until handle_result returns false.
+  // May not make such a call if filter policy says that key is not present.
+  friend class TableCache;
+  friend class BlockBasedTableBuilder;
+
+  // Create a index reader based on the index type stored in the table.
+  // Optionally, user can pass a preloaded meta_index_iter for the index that
+  // need to access extra meta blocks for index construction. This parameter
+  // helps avoid re-reading meta index block if caller already created one.
+  Status CreateIndexReader(const ReadOptions& ro,
+                           FilePrefetchBuffer* prefetch_buffer,
+                           InternalIterator* preloaded_meta_index_iter,
+                           bool use_cache, bool prefetch, bool pin,
+                           BlockCacheLookupContext* lookup_context,
+                           std::unique_ptr<IndexReader>* index_reader);
+
+  bool FullFilterKeyMayMatch(FilterBlockReader* filter, const Slice& user_key,
+                             const bool no_io,
+                             const SliceTransform* prefix_extractor,
+                             GetContext* get_context,
+                             BlockCacheLookupContext* lookup_context,
+                             Env::IOPriority rate_limiter_priority) const;
+
+  void FullFilterKeysMayMatch(FilterBlockReader* filter, MultiGetRange* range,
+                              const bool no_io,
+                              const SliceTransform* prefix_extractor,
+                              BlockCacheLookupContext* lookup_context,
+                              Env::IOPriority rate_limiter_priority) const;
+
+  // If force_direct_prefetch is true, always prefetching to RocksDB
+  //    buffer, rather than calling RandomAccessFile::Prefetch().
+  static Status PrefetchTail(
+      const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
+      bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
+      const bool prefetch_all, const bool preload_all,
+      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
+  Status ReadMetaIndexBlock(const ReadOptions& ro,
+                            FilePrefetchBuffer* prefetch_buffer,
+                            std::unique_ptr<Block>* metaindex_block,
+                            std::unique_ptr<InternalIterator>* iter);
+  Status ReadPropertiesBlock(const ReadOptions& ro,
+                             FilePrefetchBuffer* prefetch_buffer,
+                             InternalIterator* meta_iter,
+                             const SequenceNumber largest_seqno);
+  Status ReadRangeDelBlock(const ReadOptions& ro,
+                           FilePrefetchBuffer* prefetch_buffer,
+                           InternalIterator* meta_iter,
+                           const InternalKeyComparator& internal_comparator,
+                           BlockCacheLookupContext* lookup_context);
+  Status PrefetchIndexAndFilterBlocks(
+      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
+      InternalIterator* meta_iter, BlockBasedTable* new_table,
+      bool prefetch_all, const BlockBasedTableOptions& table_options,
+      const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin,
+      BlockCacheLookupContext* lookup_context);
+
+  static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);
+
+  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
+  Status VerifyChecksumInBlocks(const ReadOptions& read_options,
+                                InternalIteratorBase<IndexValue>* index_iter);
+
+  // Create the filter from the filter block.
+  std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
+      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
+      bool use_cache, bool prefetch, bool pin,
+      BlockCacheLookupContext* lookup_context);
+
+  // Size of all data blocks, maybe approximate
+  uint64_t GetApproximateDataSize();
+
+  // Given an iterator return its offset in data block section of file.
+  uint64_t ApproximateDataOffsetOf(
+      const InternalIteratorBase<IndexValue>& index_iter,
+      uint64_t data_size) const;
+
+  // Helper functions for DumpTable()
+  Status DumpIndexBlock(std::ostream& out_stream);
+  Status DumpDataBlocks(std::ostream& out_stream);
+  void DumpKeyValue(const Slice& key, const Slice& value,
+                    std::ostream& out_stream);
+
+  // Returns false if prefix_extractor exists and is compatible with that used
+  // in building the table file, otherwise true.
+  bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const;
+
+  // A cumulative data block file read in MultiGet lower than this size will
+  // use a stack buffer
+  static constexpr size_t kMultiGetReadStackBufSize = 8192;
+
+  friend class PartitionedFilterBlockReader;
+  friend class PartitionedFilterBlockTest;
+  friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
+};
+
+// Maintaining state of a two-level iteration on a partitioned index structure.
+class BlockBasedTable::PartitionedIndexIteratorState
+    : public TwoLevelIteratorState {
+ public:
+  PartitionedIndexIteratorState(
+      const BlockBasedTable* table,
+      UnorderedMap<uint64_t, CachableEntry<Block>>* block_map);
+  InternalIteratorBase<IndexValue>* NewSecondaryIterator(
+      const BlockHandle& index_value) override;
+
+ private:
+  // Don't own table_
+  const BlockBasedTable* table_;
+  UnorderedMap<uint64_t, CachableEntry<Block>>* block_map_;
+};
+
+// Stores all the properties associated with a BlockBasedTable.
+// These are immutable.
+struct BlockBasedTable::Rep {
+  Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options,
+      const BlockBasedTableOptions& _table_opt,
+      const InternalKeyComparator& _internal_comparator, bool skip_filters,
+      uint64_t _file_size, int _level, const bool _immortal_table)
+      : ioptions(_ioptions),
+        env_options(_env_options),
+        table_options(_table_opt),
+        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
+        internal_comparator(_internal_comparator),
+        filter_type(FilterType::kNoFilter),
+        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
+        whole_key_filtering(_table_opt.whole_key_filtering),
+        prefix_filtering(true),
+        global_seqno(kDisableGlobalSequenceNumber),
+        file_size(_file_size),
+        level(_level),
+        immortal_table(_immortal_table) {}
+  ~Rep() { status.PermitUncheckedError(); }
+  const ImmutableOptions& ioptions;
+  const EnvOptions& env_options;
+  const BlockBasedTableOptions table_options;
+  const FilterPolicy* const filter_policy;
+  const InternalKeyComparator& internal_comparator;
+  Status status;
+  std::unique_ptr<RandomAccessFileReader> file;
+  OffsetableCacheKey base_cache_key;
+  PersistentCacheOptions persistent_cache_options;
+
+  // Footer contains the fixed table information
+  Footer footer;
+
+  std::unique_ptr<IndexReader> index_reader;
+  std::unique_ptr<FilterBlockReader> filter;
+  std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
+
+  enum class FilterType {
+    kNoFilter,
+    kFullFilter,
+    kPartitionedFilter,
+  };
+  FilterType filter_type;
+  BlockHandle filter_handle;
+  BlockHandle compression_dict_handle;
+
+  std::shared_ptr<const TableProperties> table_properties;
+  BlockBasedTableOptions::IndexType index_type;
+  bool whole_key_filtering;
+  bool prefix_filtering;
+  std::shared_ptr<const SliceTransform> table_prefix_extractor;
+
+  std::shared_ptr<FragmentedRangeTombstoneList> fragmented_range_dels;
+
+  // If global_seqno is used, all Keys in this file will have the same
+  // seqno with value `global_seqno`.
+  //
+  // A value of kDisableGlobalSequenceNumber means that this feature is disabled
+  // and every key have it's own seqno.
+  SequenceNumber global_seqno;
+
+  // Size of the table file on disk
+  uint64_t file_size;
+
+  // the level when the table is opened, could potentially change when trivial
+  // move is involved
+  int level;
+
+  // If false, blocks in this file are definitely all uncompressed. Knowing this
+  // before reading individual blocks enables certain optimizations.
+  bool blocks_maybe_compressed = true;
+
+  // If true, data blocks in this file are definitely ZSTD compressed. If false
+  // they might not be. When false we skip creating a ZSTD digested
+  // uncompression dictionary. Even if we get a false negative, things should
+  // still work, just not as quickly.
+  bool blocks_definitely_zstd_compressed = false;
+
+  // These describe how index is encoded.
+  bool index_has_first_key = false;
+  bool index_key_includes_seq = true;
+  bool index_value_is_full = true;
+
+  const bool immortal_table;
+
+  std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+      table_reader_cache_res_handle = nullptr;
+
+  SequenceNumber get_global_seqno(BlockType block_type) const {
+    return (block_type == BlockType::kFilterPartitionIndex ||
+            block_type == BlockType::kCompressionDictionary)
+               ? kDisableGlobalSequenceNumber
+               : global_seqno;
+  }
+
+  uint64_t cf_id_for_tracing() const {
+    return table_properties
+               ? table_properties->column_family_id
+               : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
+                     kUnknownColumnFamily;
+  }
+
+  Slice cf_name_for_tracing() const {
+    return table_properties ? table_properties->column_family_name
+                            : BlockCacheTraceHelper::kUnknownColumnFamilyName;
+  }
+
+  uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; }
+
+  uint64_t sst_number_for_tracing() const {
+    return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
+  }
+  void CreateFilePrefetchBuffer(
+      size_t readahead_size, size_t max_readahead_size,
+      std::unique_ptr<FilePrefetchBuffer>* fpb, bool implicit_auto_readahead,
+      uint64_t num_file_reads,
+      uint64_t num_file_reads_for_auto_readahead) const {
+    fpb->reset(new FilePrefetchBuffer(
+        readahead_size, max_readahead_size,
+        !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */,
+        implicit_auto_readahead, num_file_reads,
+        num_file_reads_for_auto_readahead, ioptions.fs.get(), ioptions.clock,
+        ioptions.stats));
+  }
+
+  void CreateFilePrefetchBufferIfNotExists(
+      size_t readahead_size, size_t max_readahead_size,
+      std::unique_ptr<FilePrefetchBuffer>* fpb, bool implicit_auto_readahead,
+      uint64_t num_file_reads,
+      uint64_t num_file_reads_for_auto_readahead) const {
+    if (!(*fpb)) {
+      CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb,
+                               implicit_auto_readahead, num_file_reads,
+                               num_file_reads_for_auto_readahead);
+    }
+  }
+
+  std::size_t ApproximateMemoryUsage() const {
+    std::size_t usage = 0;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size(const_cast<BlockBasedTable::Rep*>(this));
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return usage;
+  }
+};
+
+// This is an adapter class for `WritableFile` to be used for `std::ostream`.
+// The adapter wraps a `WritableFile`, which can be passed to a `std::ostream`
+// constructor for storing streaming data.
+// Note:
+//  * This adapter doesn't provide any buffering, each write is forwarded to
+//    `WritableFile->Append()` directly.
+//  * For a failed write, the user needs to check the status by `ostream.good()`
+class WritableFileStringStreamAdapter : public std::stringbuf {
+ public:
+  explicit WritableFileStringStreamAdapter(WritableFile* writable_file)
+      : file_(writable_file) {}
+
+  // Override overflow() to handle `sputc()`. There are cases that will not go
+  // through `xsputn()` e.g. `std::endl` or an unsigned long long is written by
+  // `os.put()` directly and will call `sputc()` By internal implementation:
+  //    int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) {  // put a character
+  //        return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) :
+  //        overflow(_Traits::to_int_type(_Ch));
+  //    }
+  // As we explicitly disabled buffering (_Pnavail() is always 0), every write,
+  // not captured by xsputn(), becomes an overflow here.
+  int overflow(int ch = EOF) override {
+    if (ch != EOF) {
+      Status s = file_->Append(Slice((char*)&ch, 1));
+      if (s.ok()) {
+        return ch;
+      }
+    }
+    return EOF;
+  }
+
+  std::streamsize xsputn(char const* p, std::streamsize n) override {
+    Status s = file_->Append(Slice(p, n));
+    if (!s.ok()) {
+      return 0;
+    }
+    return n;
+  }
+
+ private:
+  WritableFile* file_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_reader_impl.h b/src/rocksdb/table/block_based/block_based_table_reader_impl.h
new file mode 100644
index 000000000..1f6f5f223
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_reader_impl.h
@@ -0,0 +1,171 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/reader_common.h"
+
+// The file contains some member functions of BlockBasedTable that
+// cannot be implemented in block_based_table_reader.cc because
+// it's called by other files (e.g. block_based_iterator.h) and
+// are templates.
+
+namespace ROCKSDB_NAMESPACE {
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+// If input_iter is null, new a iterator
+// If input_iter is not null, update this iter and return it
+template <typename TBlockIter>
+TBlockIter* BlockBasedTable::NewDataBlockIterator(
+    const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
+    BlockType block_type, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    FilePrefetchBuffer* prefetch_buffer, bool for_compaction, bool async_read,
+    Status& s) const {
+  PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
+  TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
+  if (!s.ok()) {
+    iter->Invalidate(s);
+    return iter;
+  }
+
+  CachableEntry<Block> block;
+  if (rep_->uncompression_dict_reader && block_type == BlockType::kData) {
+    CachableEntry<UncompressionDict> uncompression_dict;
+    const bool no_io = (ro.read_tier == kBlockCacheTier);
+    s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+        prefetch_buffer, no_io, ro.verify_checksums, get_context,
+        lookup_context, &uncompression_dict);
+    if (!s.ok()) {
+      iter->Invalidate(s);
+      return iter;
+    }
+    const UncompressionDict& dict = uncompression_dict.GetValue()
+                                        ? *uncompression_dict.GetValue()
+                                        : UncompressionDict::GetEmptyDict();
+    s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type,
+                      get_context, lookup_context, for_compaction,
+                      /* use_cache */ true, /* wait_for_cache */ true,
+                      async_read);
+  } else {
+    s = RetrieveBlock(
+        prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(), &block,
+        block_type, get_context, lookup_context, for_compaction,
+        /* use_cache */ true, /* wait_for_cache */ true, async_read);
+  }
+
+  if (s.IsTryAgain() && async_read) {
+    return iter;
+  }
+
+  if (!s.ok()) {
+    assert(block.IsEmpty());
+    iter->Invalidate(s);
+    return iter;
+  }
+
+  assert(block.GetValue() != nullptr);
+
+  // Block contents are pinned and it is still pinned after the iterator
+  // is destroyed as long as cleanup functions are moved to another object,
+  // when:
+  // 1. block cache handle is set to be released in cleanup function, or
+  // 2. it's pointing to immortal source. If own_bytes is true then we are
+  //    not reading data from the original source, whether immortal or not.
+  //    Otherwise, the block is pinned iff the source is immortal.
+  const bool block_contents_pinned =
+      block.IsCached() ||
+      (!block.GetValue()->own_bytes() && rep_->immortal_table);
+  iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), block_type, iter,
+                                       block_contents_pinned);
+
+  if (!block.IsCached()) {
+    if (!ro.fill_cache) {
+      Cache* const block_cache = rep_->table_options.block_cache.get();
+      if (block_cache) {
+        // insert a dummy record to block cache to track the memory usage
+        Cache::Handle* cache_handle = nullptr;
+        CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache);
+        s = block_cache->Insert(key.AsSlice(), nullptr,
+                                block.GetValue()->ApproximateMemoryUsage(),
+                                nullptr, &cache_handle);
+
+        if (s.ok()) {
+          assert(cache_handle != nullptr);
+          iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
+                                cache_handle);
+        }
+      }
+    }
+  } else {
+    iter->SetCacheHandle(block.GetCacheHandle());
+  }
+
+  block.TransferTo(iter);
+
+  return iter;
+}
+
+// Convert an uncompressed data block (i.e CachableEntry<Block>)
+// into an iterator over the contents of the corresponding block.
+// If input_iter is null, new a iterator
+// If input_iter is not null, update this iter and return it
+template <typename TBlockIter>
+TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
+                                                  CachableEntry<Block>& block,
+                                                  TBlockIter* input_iter,
+                                                  Status s) const {
+  PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
+  TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
+  if (!s.ok()) {
+    iter->Invalidate(s);
+    return iter;
+  }
+
+  assert(block.GetValue() != nullptr);
+  // Block contents are pinned and it is still pinned after the iterator
+  // is destroyed as long as cleanup functions are moved to another object,
+  // when:
+  // 1. block cache handle is set to be released in cleanup function, or
+  // 2. it's pointing to immortal source. If own_bytes is true then we are
+  //    not reading data from the original source, whether immortal or not.
+  //    Otherwise, the block is pinned iff the source is immortal.
+  const bool block_contents_pinned =
+      block.IsCached() ||
+      (!block.GetValue()->own_bytes() && rep_->immortal_table);
+  iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), BlockType::kData,
+                                       iter, block_contents_pinned);
+
+  if (!block.IsCached()) {
+    if (!ro.fill_cache) {
+      Cache* const block_cache = rep_->table_options.block_cache.get();
+      if (block_cache) {
+        // insert a dummy record to block cache to track the memory usage
+        Cache::Handle* cache_handle = nullptr;
+        CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache);
+        s = block_cache->Insert(key.AsSlice(), nullptr,
+                                block.GetValue()->ApproximateMemoryUsage(),
+                                nullptr, &cache_handle);
+
+        if (s.ok()) {
+          assert(cache_handle != nullptr);
+          iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
+                                cache_handle);
+        }
+      }
+    }
+  } else {
+    iter->SetCacheHandle(block.GetCacheHandle());
+  }
+
+  block.TransferTo(iter);
+  return iter;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h b/src/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h
new file mode 100644
index 000000000..8c7547a2a
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h
@@ -0,0 +1,760 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/async_file_reader.h"
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+    (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+
+namespace ROCKSDB_NAMESPACE {
+
+// This function reads multiple data blocks from disk using Env::MultiRead()
+// and optionally inserts them into the block cache. It uses the scratch
+// buffer provided by the caller, which is contiguous. If scratch is a nullptr
+// it allocates a separate buffer for each block. Typically, if the blocks
+// need to be uncompressed and there is no compressed block cache, callers
+// can allocate a temporary scratch buffer in order to minimize memory
+// allocations.
+// If options.fill_cache is true, it inserts the blocks into cache. If its
+// false and scratch is non-null and the blocks are uncompressed, it copies
+// the buffers to heap. In any case, the CachableEntry<Block> returned will
+// own the data bytes.
+// If compression is enabled and also there is no compressed block cache,
+// the adjacent blocks are read out in one IO (combined read)
+// batch - A MultiGetRange with only those keys with unique data blocks not
+//         found in cache
+// handles - A vector of block handles. Some of them me be NULL handles
+// scratch - An optional contiguous buffer to read compressed blocks into
+DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
+(const ReadOptions& options, const MultiGetRange* batch,
+ const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
+ autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
+ autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results,
+ char* scratch, const UncompressionDict& uncompression_dict) const {
+  RandomAccessFileReader* file = rep_->file.get();
+  const Footer& footer = rep_->footer;
+  const ImmutableOptions& ioptions = rep_->ioptions;
+  size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
+  MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
+
+  if (ioptions.allow_mmap_reads) {
+    size_t idx_in_batch = 0;
+    for (auto mget_iter = batch->begin(); mget_iter != batch->end();
+         ++mget_iter, ++idx_in_batch) {
+      BlockCacheLookupContext lookup_data_block_context(
+          TableReaderCaller::kUserMultiGet);
+      const BlockHandle& handle = (*handles)[idx_in_batch];
+      if (handle.IsNull()) {
+        continue;
+      }
+
+      (*statuses)[idx_in_batch] =
+          RetrieveBlock(nullptr, options, handle, uncompression_dict,
+                        &(*results)[idx_in_batch], BlockType::kData,
+                        mget_iter->get_context, &lookup_data_block_context,
+                        /* for_compaction */ false, /* use_cache */ true,
+                        /* wait_for_cache */ true, /* async_read */ false);
+    }
+    CO_RETURN;
+  }
+
+  // In direct IO mode, blocks share the direct io buffer.
+  // Otherwise, blocks share the scratch buffer.
+  const bool use_shared_buffer = file->use_direct_io() || scratch != nullptr;
+
+  autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
+  size_t buf_offset = 0;
+  size_t idx_in_batch = 0;
+
+  uint64_t prev_offset = 0;
+  size_t prev_len = 0;
+  autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
+  autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
+  for (auto mget_iter = batch->begin(); mget_iter != batch->end();
+       ++mget_iter, ++idx_in_batch) {
+    const BlockHandle& handle = (*handles)[idx_in_batch];
+    if (handle.IsNull()) {
+      continue;
+    }
+
+    size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
+
+    // If current block is adjacent to the previous one, at the same time,
+    // compression is enabled and there is no compressed cache, we combine
+    // the two block read as one.
+    // We don't combine block reads here in direct IO mode, because when doing
+    // direct IO read, the block requests will be realigned and merged when
+    // necessary.
+    if (use_shared_buffer && !file->use_direct_io() &&
+        prev_end == handle.offset()) {
+      req_offset_for_block.emplace_back(prev_len);
+      prev_len += BlockSizeWithTrailer(handle);
+    } else {
+      // No compression or current block and previous one is not adjacent:
+      // Step 1, create a new request for previous blocks
+      if (prev_len != 0) {
+        FSReadRequest req;
+        req.offset = prev_offset;
+        req.len = prev_len;
+        if (file->use_direct_io()) {
+          req.scratch = nullptr;
+        } else if (use_shared_buffer) {
+          req.scratch = scratch + buf_offset;
+          buf_offset += req.len;
+        } else {
+          req.scratch = new char[req.len];
+        }
+        read_reqs.emplace_back(req);
+      }
+
+      // Step 2, remeber the previous block info
+      prev_offset = handle.offset();
+      prev_len = BlockSizeWithTrailer(handle);
+      req_offset_for_block.emplace_back(0);
+    }
+    req_idx_for_block.emplace_back(read_reqs.size());
+
+    PERF_COUNTER_ADD(block_read_count, 1);
+    PERF_COUNTER_ADD(block_read_byte, BlockSizeWithTrailer(handle));
+  }
+  // Handle the last block and process the pending last request
+  if (prev_len != 0) {
+    FSReadRequest req;
+    req.offset = prev_offset;
+    req.len = prev_len;
+    if (file->use_direct_io()) {
+      req.scratch = nullptr;
+    } else if (use_shared_buffer) {
+      req.scratch = scratch + buf_offset;
+    } else {
+      req.scratch = new char[req.len];
+    }
+    read_reqs.emplace_back(req);
+  }
+
+  AlignedBuf direct_io_buf;
+  {
+    IOOptions opts;
+    IOStatus s = file->PrepareIOOptions(options, opts);
+    if (s.ok()) {
+#if defined(WITH_COROUTINES)
+      if (file->use_direct_io()) {
+#endif  // WITH_COROUTINES
+        s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(),
+                            &direct_io_buf, options.rate_limiter_priority);
+#if defined(WITH_COROUTINES)
+      } else {
+        co_await batch->context()->reader().MultiReadAsync(
+            file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf);
+      }
+#endif  // WITH_COROUTINES
+    }
+    if (!s.ok()) {
+      // Discard all the results in this batch if there is any time out
+      // or overall MultiRead error
+      for (FSReadRequest& req : read_reqs) {
+        req.status = s;
+      }
+    }
+  }
+
+  idx_in_batch = 0;
+  size_t valid_batch_idx = 0;
+  for (auto mget_iter = batch->begin(); mget_iter != batch->end();
+       ++mget_iter, ++idx_in_batch) {
+    const BlockHandle& handle = (*handles)[idx_in_batch];
+
+    if (handle.IsNull()) {
+      continue;
+    }
+
+    assert(valid_batch_idx < req_idx_for_block.size());
+    assert(valid_batch_idx < req_offset_for_block.size());
+    assert(req_idx_for_block[valid_batch_idx] < read_reqs.size());
+    size_t& req_idx = req_idx_for_block[valid_batch_idx];
+    size_t& req_offset = req_offset_for_block[valid_batch_idx];
+    valid_batch_idx++;
+    FSReadRequest& req = read_reqs[req_idx];
+    Status s = req.status;
+    if (s.ok()) {
+      if ((req.result.size() != req.len) ||
+          (req_offset + BlockSizeWithTrailer(handle) > req.result.size())) {
+        s = Status::Corruption("truncated block read from " +
+                               rep_->file->file_name() + " offset " +
+                               std::to_string(handle.offset()) + ", expected " +
+                               std::to_string(req.len) + " bytes, got " +
+                               std::to_string(req.result.size()));
+      }
+    }
+
+    BlockContents serialized_block;
+    if (s.ok()) {
+      if (!use_shared_buffer) {
+        // We allocated a buffer for this block. Give ownership of it to
+        // BlockContents so it can free the memory
+        assert(req.result.data() == req.scratch);
+        assert(req.result.size() == BlockSizeWithTrailer(handle));
+        assert(req_offset == 0);
+        serialized_block =
+            BlockContents(std::unique_ptr<char[]>(req.scratch), handle.size());
+      } else {
+        // We used the scratch buffer or direct io buffer
+        // which are shared by the blocks.
+        // serialized_block does not have the ownership.
+        serialized_block =
+            BlockContents(Slice(req.result.data() + req_offset, handle.size()));
+      }
+#ifndef NDEBUG
+      serialized_block.has_trailer = true;
+#endif
+
+      if (options.verify_checksums) {
+        PERF_TIMER_GUARD(block_checksum_time);
+        const char* data = req.result.data();
+        // Since the scratch might be shared, the offset of the data block in
+        // the buffer might not be 0. req.result.data() only point to the
+        // begin address of each read request, we need to add the offset
+        // in each read request. Checksum is stored in the block trailer,
+        // beyond the payload size.
+        s = VerifyBlockChecksum(footer.checksum_type(), data + req_offset,
+                                handle.size(), rep_->file->file_name(),
+                                handle.offset());
+        TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
+      }
+    } else if (!use_shared_buffer) {
+      // Free the allocated scratch buffer.
+      delete[] req.scratch;
+    }
+
+    if (s.ok()) {
+      // When the blocks share the same underlying buffer (scratch or direct io
+      // buffer), we may need to manually copy the block into heap if the
+      // serialized block has to be inserted into a cache. That falls into the
+      // following cases -
+      // 1. serialized block is not compressed, it needs to be inserted into
+      //    the uncompressed block cache if there is one
+      // 2. If the serialized block is compressed, it needs to be inserted
+      //    into the compressed block cache if there is one
+      //
+      // In all other cases, the serialized block is either uncompressed into a
+      // heap buffer or there is no cache at all.
+      CompressionType compression_type =
+          GetBlockCompressionType(serialized_block);
+      if (use_shared_buffer && (compression_type == kNoCompression ||
+                                (compression_type != kNoCompression &&
+                                 rep_->table_options.block_cache_compressed))) {
+        Slice serialized =
+            Slice(req.result.data() + req_offset, BlockSizeWithTrailer(handle));
+        serialized_block = BlockContents(
+            CopyBufferToHeap(GetMemoryAllocator(rep_->table_options),
+                             serialized),
+            handle.size());
+#ifndef NDEBUG
+        serialized_block.has_trailer = true;
+#endif
+      }
+    }
+
+    if (s.ok()) {
+      if (options.fill_cache) {
+        BlockCacheLookupContext lookup_data_block_context(
+            TableReaderCaller::kUserMultiGet);
+        CachableEntry<Block>* block_entry = &(*results)[idx_in_batch];
+        // MaybeReadBlockAndLoadToCache will insert into the block caches if
+        // necessary. Since we're passing the serialized block contents, it
+        // will avoid looking up the block cache
+        s = MaybeReadBlockAndLoadToCache(
+            nullptr, options, handle, uncompression_dict, /*wait=*/true,
+            /*for_compaction=*/false, block_entry, BlockType::kData,
+            mget_iter->get_context, &lookup_data_block_context,
+            &serialized_block, /*async_read=*/false);
+
+        // block_entry value could be null if no block cache is present, i.e
+        // BlockBasedTableOptions::no_block_cache is true and no compressed
+        // block cache is configured. In that case, fall
+        // through and set up the block explicitly
+        if (block_entry->GetValue() != nullptr) {
+          s.PermitUncheckedError();
+          continue;
+        }
+      }
+
+      CompressionType compression_type =
+          GetBlockCompressionType(serialized_block);
+      BlockContents contents;
+      if (compression_type != kNoCompression) {
+        UncompressionContext context(compression_type);
+        UncompressionInfo info(context, uncompression_dict, compression_type);
+        s = UncompressSerializedBlock(
+            info, req.result.data() + req_offset, handle.size(), &contents,
+            footer.format_version(), rep_->ioptions, memory_allocator);
+      } else {
+        // There are two cases here:
+        // 1) caller uses the shared buffer (scratch or direct io buffer);
+        // 2) we use the requst buffer.
+        // If scratch buffer or direct io buffer is used, we ensure that
+        // all serialized blocks are copyed to the heap as single blocks. If
+        // scratch buffer is not used, we also have no combined read, so the
+        // serialized block can be used directly.
+        contents = std::move(serialized_block);
+      }
+      if (s.ok()) {
+        (*results)[idx_in_batch].SetOwnedValue(std::make_unique<Block>(
+            std::move(contents), read_amp_bytes_per_bit, ioptions.stats));
+      }
+    }
+    (*statuses)[idx_in_batch] = s;
+  }
+}
+
+using MultiGetRange = MultiGetContext::Range;
+DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
+(const ReadOptions& read_options, const MultiGetRange* mget_range,
+ const SliceTransform* prefix_extractor, bool skip_filters) {
+  if (mget_range->empty()) {
+    // Caller should ensure non-empty (performance bug)
+    assert(false);
+    CO_RETURN;  // Nothing to do
+  }
+
+  FilterBlockReader* const filter =
+      !skip_filters ? rep_->filter.get() : nullptr;
+  MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
+                               mget_range->end());
+
+  // First check the full filter
+  // If full filter not useful, Then go into each block
+  const bool no_io = read_options.read_tier == kBlockCacheTier;
+  uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+  if (sst_file_range.begin()->get_context) {
+    tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
+  }
+  BlockCacheLookupContext lookup_context{
+      TableReaderCaller::kUserMultiGet, tracing_mget_id,
+      /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
+  FullFilterKeysMayMatch(filter, &sst_file_range, no_io, prefix_extractor,
+                         &lookup_context, read_options.rate_limiter_priority);
+
+  if (!sst_file_range.empty()) {
+    IndexBlockIter iiter_on_stack;
+    // if prefix_extractor found in block differs from options, disable
+    // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
+    bool need_upper_bound_check = false;
+    if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
+      need_upper_bound_check = PrefixExtractorChanged(prefix_extractor);
+    }
+    auto iiter =
+        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+                         sst_file_range.begin()->get_context, &lookup_context);
+    std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+    if (iiter != &iiter_on_stack) {
+      iiter_unique_ptr.reset(iiter);
+    }
+
+    uint64_t prev_offset = std::numeric_limits<uint64_t>::max();
+    autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
+    autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results;
+    autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
+    MultiGetContext::Mask reused_mask = 0;
+    char stack_buf[kMultiGetReadStackBufSize];
+    std::unique_ptr<char[]> block_buf;
+    {
+      MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
+                                     sst_file_range.end());
+      std::vector<Cache::Handle*> cache_handles;
+      bool wait_for_cache_results = false;
+
+      CachableEntry<UncompressionDict> uncompression_dict;
+      Status uncompression_dict_status;
+      uncompression_dict_status.PermitUncheckedError();
+      bool uncompression_dict_inited = false;
+      size_t total_len = 0;
+      ReadOptions ro = read_options;
+      ro.read_tier = kBlockCacheTier;
+
+      for (auto miter = data_block_range.begin();
+           miter != data_block_range.end(); ++miter) {
+        const Slice& key = miter->ikey;
+        iiter->Seek(miter->ikey);
+
+        IndexValue v;
+        if (iiter->Valid()) {
+          v = iiter->value();
+        }
+        if (!iiter->Valid() ||
+            (!v.first_internal_key.empty() && !skip_filters &&
+             UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+                     .CompareWithoutTimestamp(
+                         ExtractUserKey(key),
+                         ExtractUserKey(v.first_internal_key)) < 0)) {
+          // The requested key falls between highest key in previous block and
+          // lowest key in current block.
+          if (!iiter->status().IsNotFound()) {
+            *(miter->s) = iiter->status();
+          }
+          data_block_range.SkipKey(miter);
+          sst_file_range.SkipKey(miter);
+          continue;
+        }
+
+        if (!uncompression_dict_inited && rep_->uncompression_dict_reader) {
+          uncompression_dict_status =
+              rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+                  nullptr /* prefetch_buffer */, no_io,
+                  read_options.verify_checksums,
+                  sst_file_range.begin()->get_context, &lookup_context,
+                  &uncompression_dict);
+          uncompression_dict_inited = true;
+        }
+
+        if (!uncompression_dict_status.ok()) {
+          assert(!uncompression_dict_status.IsNotFound());
+          *(miter->s) = uncompression_dict_status;
+          data_block_range.SkipKey(miter);
+          sst_file_range.SkipKey(miter);
+          continue;
+        }
+
+        statuses.emplace_back();
+        results.emplace_back();
+        if (v.handle.offset() == prev_offset) {
+          // This key can reuse the previous block (later on).
+          // Mark previous as "reused"
+          reused_mask |= MultiGetContext::Mask{1} << (block_handles.size() - 1);
+          // Use null handle to indicate this one reuses same block as
+          // previous.
+          block_handles.emplace_back(BlockHandle::NullBlockHandle());
+          continue;
+        }
+        // Lookup the cache for the given data block referenced by an index
+        // iterator value (i.e BlockHandle). If it exists in the cache,
+        // initialize block to the contents of the data block.
+        prev_offset = v.handle.offset();
+        BlockHandle handle = v.handle;
+        BlockCacheLookupContext lookup_data_block_context(
+            TableReaderCaller::kUserMultiGet);
+        const UncompressionDict& dict = uncompression_dict.GetValue()
+                                            ? *uncompression_dict.GetValue()
+                                            : UncompressionDict::GetEmptyDict();
+        Status s = RetrieveBlock(
+            nullptr, ro, handle, dict, &(results.back()), BlockType::kData,
+            miter->get_context, &lookup_data_block_context,
+            /* for_compaction */ false, /* use_cache */ true,
+            /* wait_for_cache */ false, /* async_read */ false);
+        if (s.IsIncomplete()) {
+          s = Status::OK();
+        }
+        if (s.ok() && !results.back().IsEmpty()) {
+          // Since we have a valid handle, check the value. If its nullptr,
+          // it means the cache is waiting for the final result and we're
+          // supposed to call WaitAll() to wait for the result.
+          if (results.back().GetValue() != nullptr) {
+            // Found it in the cache. Add NULL handle to indicate there is
+            // nothing to read from disk.
+            if (results.back().GetCacheHandle()) {
+              results.back().UpdateCachedValue();
+            }
+            block_handles.emplace_back(BlockHandle::NullBlockHandle());
+          } else {
+            // We have to wait for the cache lookup to finish in the
+            // background, and then we may have to read the block from disk
+            // anyway
+            assert(results.back().GetCacheHandle());
+            wait_for_cache_results = true;
+            block_handles.emplace_back(handle);
+            cache_handles.emplace_back(results.back().GetCacheHandle());
+          }
+        } else {
+          block_handles.emplace_back(handle);
+          total_len += BlockSizeWithTrailer(handle);
+        }
+      }
+
+      if (wait_for_cache_results) {
+        Cache* block_cache = rep_->table_options.block_cache.get();
+        block_cache->WaitAll(cache_handles);
+        for (size_t i = 0; i < block_handles.size(); ++i) {
+          // If this block was a success or failure or not needed because
+          // the corresponding key is in the same block as a prior key, skip
+          if (block_handles[i] == BlockHandle::NullBlockHandle() ||
+              results[i].IsEmpty()) {
+            continue;
+          }
+          results[i].UpdateCachedValue();
+          void* val = results[i].GetValue();
+          Cache::Handle* handle = results[i].GetCacheHandle();
+          // GetContext for any key will do, as the stats will be aggregated
+          // anyway
+          GetContext* get_context = sst_file_range.begin()->get_context;
+          if (!val) {
+            // The async cache lookup failed - could be due to an error
+            // or a false positive. We need to read the data block from
+            // the SST file
+            results[i].Reset();
+            total_len += BlockSizeWithTrailer(block_handles[i]);
+            UpdateCacheMissMetrics(BlockType::kData, get_context);
+          } else {
+            block_handles[i] = BlockHandle::NullBlockHandle();
+            UpdateCacheHitMetrics(BlockType::kData, get_context,
+                                  block_cache->GetUsage(handle));
+          }
+        }
+      }
+
+      if (total_len) {
+        char* scratch = nullptr;
+        const UncompressionDict& dict = uncompression_dict.GetValue()
+                                            ? *uncompression_dict.GetValue()
+                                            : UncompressionDict::GetEmptyDict();
+        assert(uncompression_dict_inited || !rep_->uncompression_dict_reader);
+        assert(uncompression_dict_status.ok());
+        // If using direct IO, then scratch is not used, so keep it nullptr.
+        // If the blocks need to be uncompressed and we don't need the
+        // compressed blocks, then we can use a contiguous block of
+        // memory to read in all the blocks as it will be temporary
+        // storage
+        // 1. If blocks are compressed and compressed block cache is there,
+        //    alloc heap bufs
+        // 2. If blocks are uncompressed, alloc heap bufs
+        // 3. If blocks are compressed and no compressed block cache, use
+        //    stack buf
+        if (!rep_->file->use_direct_io() &&
+            rep_->table_options.block_cache_compressed == nullptr &&
+            rep_->blocks_maybe_compressed) {
+          if (total_len <= kMultiGetReadStackBufSize) {
+            scratch = stack_buf;
+          } else {
+            scratch = new char[total_len];
+            block_buf.reset(scratch);
+          }
+        }
+        CO_AWAIT(RetrieveMultipleBlocks)
+        (read_options, &data_block_range, &block_handles, &statuses, &results,
+         scratch, dict);
+        if (sst_file_range.begin()->get_context) {
+          ++(sst_file_range.begin()
+                 ->get_context->get_context_stats_.num_sst_read);
+        }
+      }
+    }
+
+    DataBlockIter first_biter;
+    DataBlockIter next_biter;
+    size_t idx_in_batch = 0;
+    SharedCleanablePtr shared_cleanable;
+    for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
+         ++miter) {
+      Status s;
+      GetContext* get_context = miter->get_context;
+      const Slice& key = miter->ikey;
+      bool matched = false;  // if such user key matched a key in SST
+      bool done = false;
+      bool first_block = true;
+      do {
+        DataBlockIter* biter = nullptr;
+        bool reusing_prev_block;
+        bool later_reused;
+        uint64_t referenced_data_size = 0;
+        bool does_referenced_key_exist = false;
+        BlockCacheLookupContext lookup_data_block_context(
+            TableReaderCaller::kUserMultiGet, tracing_mget_id,
+            /*_get_from_user_specified_snapshot=*/read_options.snapshot !=
+                nullptr);
+        if (first_block) {
+          if (!block_handles[idx_in_batch].IsNull() ||
+              !results[idx_in_batch].IsEmpty()) {
+            first_biter.Invalidate(Status::OK());
+            NewDataBlockIterator<DataBlockIter>(
+                read_options, results[idx_in_batch], &first_biter,
+                statuses[idx_in_batch]);
+            reusing_prev_block = false;
+          } else {
+            // If handler is null and result is empty, then the status is never
+            // set, which should be the initial value: ok().
+            assert(statuses[idx_in_batch].ok());
+            reusing_prev_block = true;
+          }
+          biter = &first_biter;
+          later_reused =
+              (reused_mask & (MultiGetContext::Mask{1} << idx_in_batch)) != 0;
+          idx_in_batch++;
+        } else {
+          IndexValue v = iiter->value();
+          if (!v.first_internal_key.empty() && !skip_filters &&
+              UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+                      .CompareWithoutTimestamp(
+                          ExtractUserKey(key),
+                          ExtractUserKey(v.first_internal_key)) < 0) {
+            // The requested key falls between highest key in previous block and
+            // lowest key in current block.
+            break;
+          }
+
+          next_biter.Invalidate(Status::OK());
+          Status tmp_s;
+          NewDataBlockIterator<DataBlockIter>(
+              read_options, iiter->value().handle, &next_biter,
+              BlockType::kData, get_context, &lookup_data_block_context,
+              /* prefetch_buffer= */ nullptr, /* for_compaction = */ false,
+              /*async_read = */ false, tmp_s);
+          biter = &next_biter;
+          reusing_prev_block = false;
+          later_reused = false;
+        }
+
+        if (read_options.read_tier == kBlockCacheTier &&
+            biter->status().IsIncomplete()) {
+          // couldn't get block from block_cache
+          // Update Saver.state to Found because we are only looking for
+          // whether we can guarantee the key is not there when "no_io" is set
+          get_context->MarkKeyMayExist();
+          break;
+        }
+        if (!biter->status().ok()) {
+          s = biter->status();
+          break;
+        }
+
+        // Reusing blocks complicates pinning/Cleanable, because the cache
+        // entry referenced by biter can only be released once all returned
+        // pinned values are released. This code previously did an extra
+        // block_cache Ref for each reuse, but that unnecessarily increases
+        // block cache contention. Instead we can use a variant of shared_ptr
+        // to release in block cache only once.
+        //
+        // Although the biter loop below might SaveValue multiple times for
+        // merges, just one value_pinner suffices, as MultiGet will merge
+        // the operands before returning to the API user.
+        Cleanable* value_pinner;
+        if (biter->IsValuePinned()) {
+          if (reusing_prev_block) {
+            // Note that we don't yet know if the MultiGet results will need
+            // to pin this block, so we might wrap a block for sharing and
+            // still end up with 1 (or 0) pinning ref. Not ideal but OK.
+            //
+            // Here we avoid adding redundant cleanups if we didn't end up
+            // delegating the cleanup from last time around.
+            if (!biter->HasCleanups()) {
+              assert(shared_cleanable.get());
+              if (later_reused) {
+                shared_cleanable.RegisterCopyWith(biter);
+              } else {
+                shared_cleanable.MoveAsCleanupTo(biter);
+              }
+            }
+          } else if (later_reused) {
+            assert(biter->HasCleanups());
+            // Make the existing cleanups on `biter` sharable:
+            shared_cleanable.Allocate();
+            // Move existing `biter` cleanup(s) to `shared_cleanable`
+            biter->DelegateCleanupsTo(&*shared_cleanable);
+            // Reference `shared_cleanable` as new cleanup for `biter`
+            shared_cleanable.RegisterCopyWith(biter);
+          }
+          assert(biter->HasCleanups());
+          value_pinner = biter;
+        } else {
+          value_pinner = nullptr;
+        }
+
+        bool may_exist = biter->SeekForGet(key);
+        if (!may_exist) {
+          // HashSeek cannot find the key this block and the the iter is not
+          // the end of the block, i.e. cannot be in the following blocks
+          // either. In this case, the seek_key cannot be found, so we break
+          // from the top level for-loop.
+          break;
+        }
+
+        // Call the *saver function on each entry/block until it returns false
+        for (; biter->Valid(); biter->Next()) {
+          ParsedInternalKey parsed_key;
+          Status pik_status = ParseInternalKey(
+              biter->key(), &parsed_key, false /* log_err_key */);  // TODO
+          if (!pik_status.ok()) {
+            s = pik_status;
+          }
+          if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
+                                      value_pinner)) {
+            if (get_context->State() == GetContext::GetState::kFound) {
+              does_referenced_key_exist = true;
+              referenced_data_size =
+                  biter->key().size() + biter->value().size();
+            }
+            done = true;
+            break;
+          }
+          s = biter->status();
+        }
+        // Write the block cache access.
+        // XXX: There appear to be 'break' statements above that bypass this
+        // writing of the block cache trace record
+        if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
+            !reusing_prev_block) {
+          // Avoid making copy of block_key, cf_name, and referenced_key when
+          // constructing the access record.
+          Slice referenced_key;
+          if (does_referenced_key_exist) {
+            referenced_key = biter->key();
+          } else {
+            referenced_key = key;
+          }
+          BlockCacheTraceRecord access_record(
+              rep_->ioptions.clock->NowMicros(),
+              /*_block_key=*/"", lookup_data_block_context.block_type,
+              lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
+              /*_cf_name=*/"", rep_->level_for_tracing(),
+              rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
+              lookup_data_block_context.is_cache_hit,
+              lookup_data_block_context.no_insert,
+              lookup_data_block_context.get_id,
+              lookup_data_block_context.get_from_user_specified_snapshot,
+              /*_referenced_key=*/"", referenced_data_size,
+              lookup_data_block_context.num_keys_in_block,
+              does_referenced_key_exist);
+          // TODO: Should handle status here?
+          block_cache_tracer_
+              ->WriteBlockAccess(access_record,
+                                 lookup_data_block_context.block_key,
+                                 rep_->cf_name_for_tracing(), referenced_key)
+              .PermitUncheckedError();
+        }
+        s = biter->status();
+        if (done) {
+          // Avoid the extra Next which is expensive in two-level indexes
+          break;
+        }
+        if (first_block) {
+          iiter->Seek(key);
+          if (!iiter->Valid()) {
+            break;
+          }
+        }
+        first_block = false;
+        iiter->Next();
+      } while (iiter->Valid());
+
+      if (matched && filter != nullptr) {
+        RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE);
+        PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
+                                  rep_->level);
+      }
+      if (s.ok() && !iiter->status().IsNotFound()) {
+        s = iiter->status();
+      }
+      *(miter->s) = s;
+    }
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    // Not sure why we need to do it. Should investigate more.
+    for (auto& st : statuses) {
+      st.PermitUncheckedError();
+    }
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/table/block_based/block_based_table_reader_test.cc b/src/rocksdb/table/block_based/block_based_table_reader_test.cc
new file mode 100644
index 000000000..c5a615dfc
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_based_table_reader_test.cc
@@ -0,0 +1,572 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/block_based_table_reader.h"
+
+#include <cmath>
+#include <memory>
+#include <string>
+
+#include "cache/cache_reservation_manager.h"
+#include "db/db_test_util.h"
+#include "db/table_properties_collector.h"
+#include "file/file_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/partitioned_index_iterator.h"
+#include "table/format.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBasedTableReaderBaseTest : public testing::Test {
+ protected:
+  // Prepare key-value pairs to occupy multiple blocks.
+  // Each value is 256B, every 16 pairs constitute 1 block.
+  // If mixed_with_human_readable_string_value == true,
+  // then adjacent blocks contain values with different compression
+  // complexity: human readable strings are easier to compress than random
+  // strings.
+  static std::map<std::string, std::string> GenerateKVMap(
+      int num_block = 100,
+      bool mixed_with_human_readable_string_value = false) {
+    std::map<std::string, std::string> kv;
+
+    Random rnd(101);
+    uint32_t key = 0;
+    for (int block = 0; block < num_block; block++) {
+      for (int i = 0; i < 16; i++) {
+        char k[9] = {0};
+        // Internal key is constructed directly from this key,
+        // and internal key size is required to be >= 8 bytes,
+        // so use %08u as the format string.
+        sprintf(k, "%08u", key);
+        std::string v;
+        if (mixed_with_human_readable_string_value) {
+          v = (block % 2) ? rnd.HumanReadableString(256)
+                          : rnd.RandomString(256);
+        } else {
+          v = rnd.RandomString(256);
+        }
+        kv[std::string(k)] = v;
+        key++;
+      }
+    }
+    return kv;
+  }
+
+  void SetUp() override {
+    SetupSyncPointsToMockDirectIO();
+    test_dir_ = test::PerThreadDBPath("block_based_table_reader_test");
+    env_ = Env::Default();
+    fs_ = FileSystem::Default();
+    ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+    ConfigureTableFactory();
+  }
+
+  virtual void ConfigureTableFactory() = 0;
+
+  void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+
+  // Creates a table with the specificied key value pairs (kv).
+  void CreateTable(const std::string& table_name,
+                   const CompressionType& compression_type,
+                   const std::map<std::string, std::string>& kv) {
+    std::unique_ptr<WritableFileWriter> writer;
+    NewFileWriter(table_name, &writer);
+
+    // Create table builder.
+    ImmutableOptions ioptions(options_);
+    InternalKeyComparator comparator(options_.comparator);
+    ColumnFamilyOptions cf_options;
+    MutableCFOptions moptions(cf_options);
+    IntTblPropCollectorFactories factories;
+    std::unique_ptr<TableBuilder> table_builder(
+        options_.table_factory->NewTableBuilder(
+            TableBuilderOptions(ioptions, moptions, comparator, &factories,
+                                compression_type, CompressionOptions(),
+                                0 /* column_family_id */,
+                                kDefaultColumnFamilyName, -1 /* level */),
+            writer.get()));
+
+    // Build table.
+    for (auto it = kv.begin(); it != kv.end(); it++) {
+      std::string k = ToInternalKey(it->first);
+      std::string v = it->second;
+      table_builder->Add(k, v);
+    }
+    ASSERT_OK(table_builder->Finish());
+  }
+
+  void NewBlockBasedTableReader(const FileOptions& foptions,
+                                const ImmutableOptions& ioptions,
+                                const InternalKeyComparator& comparator,
+                                const std::string& table_name,
+                                std::unique_ptr<BlockBasedTable>* table,
+                                bool prefetch_index_and_filter_in_cache = true,
+                                Status* status = nullptr) {
+    const MutableCFOptions moptions(options_);
+    TableReaderOptions table_reader_options = TableReaderOptions(
+        ioptions, moptions.prefix_extractor, EnvOptions(), comparator);
+
+    std::unique_ptr<RandomAccessFileReader> file;
+    NewFileReader(table_name, foptions, &file);
+
+    uint64_t file_size = 0;
+    ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size));
+
+    std::unique_ptr<TableReader> general_table;
+    Status s = options_.table_factory->NewTableReader(
+        ReadOptions(), table_reader_options, std::move(file), file_size,
+        &general_table, prefetch_index_and_filter_in_cache);
+
+    if (s.ok()) {
+      table->reset(reinterpret_cast<BlockBasedTable*>(general_table.release()));
+    }
+
+    if (status) {
+      *status = s;
+    }
+  }
+
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+
+  std::string test_dir_;
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  Options options_;
+
+ private:
+  void WriteToFile(const std::string& content, const std::string& filename) {
+    std::unique_ptr<FSWritableFile> f;
+    ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr));
+    ASSERT_OK(f->Append(content, IOOptions(), nullptr));
+    ASSERT_OK(f->Close(IOOptions(), nullptr));
+  }
+
+  void NewFileWriter(const std::string& filename,
+                     std::unique_ptr<WritableFileWriter>* writer) {
+    std::string path = Path(filename);
+    EnvOptions env_options;
+    FileOptions foptions;
+    std::unique_ptr<FSWritableFile> file;
+    ASSERT_OK(fs_->NewWritableFile(path, foptions, &file, nullptr));
+    writer->reset(new WritableFileWriter(std::move(file), path, env_options));
+  }
+
+  void NewFileReader(const std::string& filename, const FileOptions& opt,
+                     std::unique_ptr<RandomAccessFileReader>* reader) {
+    std::string path = Path(filename);
+    std::unique_ptr<FSRandomAccessFile> f;
+    ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr));
+    reader->reset(new RandomAccessFileReader(std::move(f), path,
+                                             env_->GetSystemClock().get()));
+  }
+
+  std::string ToInternalKey(const std::string& key) {
+    InternalKey internal_key(key, 0, ValueType::kTypeValue);
+    return internal_key.Encode().ToString();
+  }
+};
+
+class BlockBasedTableReaderTest
+    : public BlockBasedTableReaderBaseTest,
+      public testing::WithParamInterface<std::tuple<
+          CompressionType, bool, BlockBasedTableOptions::IndexType, bool>> {
+ protected:
+  void SetUp() override {
+    compression_type_ = std::get<0>(GetParam());
+    use_direct_reads_ = std::get<1>(GetParam());
+    BlockBasedTableReaderBaseTest::SetUp();
+  }
+
+  void ConfigureTableFactory() override {
+    BlockBasedTableOptions opts;
+    opts.index_type = std::get<2>(GetParam());
+    opts.no_block_cache = std::get<3>(GetParam());
+    options_.table_factory.reset(
+        static_cast<BlockBasedTableFactory*>(NewBlockBasedTableFactory(opts)));
+  }
+
+  CompressionType compression_type_;
+  bool use_direct_reads_;
+};
+
+// Tests MultiGet in both direct IO and non-direct IO mode.
+// The keys should be in cache after MultiGet.
+TEST_P(BlockBasedTableReaderTest, MultiGet) {
+  std::map<std::string, std::string> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(
+          100 /* num_block */,
+          true /* mixed_with_human_readable_string_value */);
+
+  // Prepare keys, values, and statuses for MultiGet.
+  autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> keys;
+  autovector<PinnableSlice, MultiGetContext::MAX_BATCH_SIZE> values;
+  autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
+  {
+    const int step =
+        static_cast<int>(kv.size()) / MultiGetContext::MAX_BATCH_SIZE;
+    auto it = kv.begin();
+    for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE; i++) {
+      keys.emplace_back(it->first);
+      values.emplace_back();
+      statuses.emplace_back();
+      std::advance(it, step);
+    }
+  }
+
+  std::string table_name =
+      "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_);
+  CreateTable(table_name, compression_type_, kv);
+
+  std::unique_ptr<BlockBasedTable> table;
+  Options options;
+  ImmutableOptions ioptions(options);
+  FileOptions foptions;
+  foptions.use_direct_reads = use_direct_reads_;
+  InternalKeyComparator comparator(options.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table);
+
+  // Ensure that keys are not in cache before MultiGet.
+  for (auto& key : keys) {
+    ASSERT_FALSE(table->TEST_KeyInCache(ReadOptions(), key));
+  }
+
+  // Prepare MultiGetContext.
+  autovector<GetContext, MultiGetContext::MAX_BATCH_SIZE> get_context;
+  autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+  for (size_t i = 0; i < keys.size(); ++i) {
+    get_context.emplace_back(BytewiseComparator(), nullptr, nullptr, nullptr,
+                             GetContext::kNotFound, keys[i], &values[i],
+                             nullptr, nullptr, nullptr, nullptr,
+                             true /* do_merge */, nullptr, nullptr, nullptr,
+                             nullptr, nullptr, nullptr);
+    key_context.emplace_back(nullptr, keys[i], &values[i], nullptr,
+                             &statuses.back());
+    key_context.back().get_context = &get_context.back();
+  }
+  for (auto& key_ctx : key_context) {
+    sorted_keys.emplace_back(&key_ctx);
+  }
+  MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, ReadOptions(),
+                      fs_.get(), nullptr);
+
+  // Execute MultiGet.
+  MultiGetContext::Range range = ctx.GetMultiGetRange();
+  PerfContext* perf_ctx = get_perf_context();
+  perf_ctx->Reset();
+  table->MultiGet(ReadOptions(), &range, nullptr);
+
+  ASSERT_GE(perf_ctx->block_read_count - perf_ctx->index_block_read_count -
+                perf_ctx->filter_block_read_count -
+                perf_ctx->compression_dict_block_read_count,
+            1);
+  ASSERT_GE(perf_ctx->block_read_byte, 1);
+
+  for (const Status& status : statuses) {
+    ASSERT_OK(status);
+  }
+  // Check that keys are in cache after MultiGet.
+  for (size_t i = 0; i < keys.size(); i++) {
+    ASSERT_TRUE(table->TEST_KeyInCache(ReadOptions(), keys[i]));
+    ASSERT_EQ(values[i].ToString(), kv[keys[i].ToString()]);
+  }
+}
+
+class ChargeTableReaderTest
+    : public BlockBasedTableReaderBaseTest,
+      public testing::WithParamInterface<
+          CacheEntryRoleOptions::Decision /* charge_table_reader_mem */> {
+ protected:
+  static std::size_t CalculateMaxTableReaderNumBeforeCacheFull(
+      std::size_t cache_capacity, std::size_t approx_table_reader_mem) {
+    // To make calculation easier for testing
+    assert(cache_capacity % CacheReservationManagerImpl<
+                                CacheEntryRole::kBlockBasedTableReader>::
+                                GetDummyEntrySize() ==
+               0 &&
+           cache_capacity >= 2 * CacheReservationManagerImpl<
+                                     CacheEntryRole::kBlockBasedTableReader>::
+                                     GetDummyEntrySize());
+
+    // We need to subtract 1 for max_num_dummy_entry to account for dummy
+    // entries' overhead, assumed the overhead is no greater than 1 dummy entry
+    // size
+    std::size_t max_num_dummy_entry =
+        (size_t)std::floor((
+            1.0 * cache_capacity /
+            CacheReservationManagerImpl<
+                CacheEntryRole::kBlockBasedTableReader>::GetDummyEntrySize())) -
+        1;
+    std::size_t cache_capacity_rounded_to_dummy_entry_multiples =
+        max_num_dummy_entry *
+        CacheReservationManagerImpl<
+            CacheEntryRole::kBlockBasedTableReader>::GetDummyEntrySize();
+    std::size_t max_table_reader_num_capped = static_cast<std::size_t>(
+        std::floor(1.0 * cache_capacity_rounded_to_dummy_entry_multiples /
+                   approx_table_reader_mem));
+
+    return max_table_reader_num_capped;
+  }
+
+  void SetUp() override {
+    // To cache and re-use the same kv map and compression type in the test
+    // suite for elimiating variance caused by these two factors
+    kv_ = BlockBasedTableReaderBaseTest::GenerateKVMap();
+    compression_type_ = CompressionType::kNoCompression;
+
+    table_reader_charge_tracking_cache_ = std::make_shared<
+        TargetCacheChargeTrackingCache<
+            CacheEntryRole::kBlockBasedTableReader>>((NewLRUCache(
+        4 * CacheReservationManagerImpl<
+                CacheEntryRole::kBlockBasedTableReader>::GetDummyEntrySize(),
+        0 /* num_shard_bits */, true /* strict_capacity_limit */)));
+
+    // To ApproximateTableReaderMem() without being affected by
+    // the feature of charging its memory, we turn off the feature
+    charge_table_reader_ = CacheEntryRoleOptions::Decision::kDisabled;
+    BlockBasedTableReaderBaseTest::SetUp();
+    approx_table_reader_mem_ = ApproximateTableReaderMem();
+
+    // Now we condtionally turn on the feature to test
+    charge_table_reader_ = GetParam();
+    ConfigureTableFactory();
+  }
+
+  void ConfigureTableFactory() override {
+    BlockBasedTableOptions table_options;
+    table_options.cache_usage_options.options_overrides.insert(
+        {CacheEntryRole::kBlockBasedTableReader,
+         {/*.charged = */ charge_table_reader_}});
+    table_options.block_cache = table_reader_charge_tracking_cache_;
+
+    table_options.cache_index_and_filter_blocks = false;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+    table_options.partition_filters = true;
+    table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+
+  CacheEntryRoleOptions::Decision charge_table_reader_;
+  std::shared_ptr<
+      TargetCacheChargeTrackingCache<CacheEntryRole::kBlockBasedTableReader>>
+      table_reader_charge_tracking_cache_;
+  std::size_t approx_table_reader_mem_;
+  std::map<std::string, std::string> kv_;
+  CompressionType compression_type_;
+
+ private:
+  std::size_t ApproximateTableReaderMem() {
+    std::size_t approx_table_reader_mem = 0;
+
+    std::string table_name = "table_for_approx_table_reader_mem";
+    CreateTable(table_name, compression_type_, kv_);
+
+    std::unique_ptr<BlockBasedTable> table;
+    Status s;
+    NewBlockBasedTableReader(
+        FileOptions(), ImmutableOptions(options_),
+        InternalKeyComparator(options_.comparator), table_name, &table,
+        false /* prefetch_index_and_filter_in_cache */, &s);
+    assert(s.ok());
+
+    approx_table_reader_mem = table->ApproximateMemoryUsage();
+    assert(approx_table_reader_mem > 0);
+    return approx_table_reader_mem;
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    ChargeTableReaderTest, ChargeTableReaderTest,
+    ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled,
+                      CacheEntryRoleOptions::Decision::kDisabled));
+
+TEST_P(ChargeTableReaderTest, Basic) {
+  const std::size_t max_table_reader_num_capped =
+      ChargeTableReaderTest::CalculateMaxTableReaderNumBeforeCacheFull(
+          table_reader_charge_tracking_cache_->GetCapacity(),
+          approx_table_reader_mem_);
+
+  // Acceptable estimtation errors coming from
+  // 1. overstimate max_table_reader_num_capped due to # dummy entries is high
+  // and results in metadata charge overhead greater than 1 dummy entry size
+  // (violating our assumption in calculating max_table_reader_num_capped)
+  // 2. overestimate/underestimate max_table_reader_num_capped due to the gap
+  // between ApproximateTableReaderMem() and actual table reader mem
+  std::size_t max_table_reader_num_capped_upper_bound =
+      (std::size_t)(max_table_reader_num_capped * 1.05);
+  std::size_t max_table_reader_num_capped_lower_bound =
+      (std::size_t)(max_table_reader_num_capped * 0.95);
+  std::size_t max_table_reader_num_uncapped =
+      (std::size_t)(max_table_reader_num_capped * 1.1);
+  ASSERT_GT(max_table_reader_num_uncapped,
+            max_table_reader_num_capped_upper_bound)
+      << "We need `max_table_reader_num_uncapped` > "
+         "`max_table_reader_num_capped_upper_bound` to differentiate cases "
+         "between "
+         "charge_table_reader_ == kDisabled and == kEnabled)";
+
+  Status s = Status::OK();
+  std::size_t opened_table_reader_num = 0;
+  std::string table_name;
+  std::vector<std::unique_ptr<BlockBasedTable>> tables;
+  // Keep creating BlockBasedTableReader till hiting the memory limit based on
+  // cache capacity and creation fails (when charge_table_reader_ ==
+  // kEnabled) or reaching a specfied big number of table readers (when
+  // charge_table_reader_ == kDisabled)
+  while (s.ok() && opened_table_reader_num < max_table_reader_num_uncapped) {
+    table_name = "table_" + std::to_string(opened_table_reader_num);
+    CreateTable(table_name, compression_type_, kv_);
+    tables.push_back(std::unique_ptr<BlockBasedTable>());
+    NewBlockBasedTableReader(
+        FileOptions(), ImmutableOptions(options_),
+        InternalKeyComparator(options_.comparator), table_name, &tables.back(),
+        false /* prefetch_index_and_filter_in_cache */, &s);
+    if (s.ok()) {
+      ++opened_table_reader_num;
+    }
+  }
+
+  if (charge_table_reader_ == CacheEntryRoleOptions::Decision::kEnabled) {
+    EXPECT_TRUE(s.IsMemoryLimit()) << "s: " << s.ToString();
+    EXPECT_TRUE(s.ToString().find(
+                    kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+                        CacheEntryRole::kBlockBasedTableReader)]) !=
+                std::string::npos);
+    EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
+                std::string::npos);
+
+    EXPECT_GE(opened_table_reader_num, max_table_reader_num_capped_lower_bound);
+    EXPECT_LE(opened_table_reader_num, max_table_reader_num_capped_upper_bound);
+
+    std::size_t updated_max_table_reader_num_capped =
+        ChargeTableReaderTest::CalculateMaxTableReaderNumBeforeCacheFull(
+            table_reader_charge_tracking_cache_->GetCapacity() / 2,
+            approx_table_reader_mem_);
+
+    // Keep deleting BlockBasedTableReader to lower down memory usage from the
+    // memory limit to make the next creation succeeds
+    while (opened_table_reader_num >= updated_max_table_reader_num_capped) {
+      tables.pop_back();
+      --opened_table_reader_num;
+    }
+    table_name = "table_for_successful_table_reader_open";
+    CreateTable(table_name, compression_type_, kv_);
+    tables.push_back(std::unique_ptr<BlockBasedTable>());
+    NewBlockBasedTableReader(
+        FileOptions(), ImmutableOptions(options_),
+        InternalKeyComparator(options_.comparator), table_name, &tables.back(),
+        false /* prefetch_index_and_filter_in_cache */, &s);
+    EXPECT_TRUE(s.ok()) << s.ToString();
+
+    tables.clear();
+    EXPECT_EQ(table_reader_charge_tracking_cache_->GetCacheCharge(), 0);
+  } else {
+    EXPECT_TRUE(s.ok() &&
+                opened_table_reader_num == max_table_reader_num_uncapped)
+        << "s: " << s.ToString() << " opened_table_reader_num: "
+        << std::to_string(opened_table_reader_num);
+    EXPECT_EQ(table_reader_charge_tracking_cache_->GetCacheCharge(), 0);
+  }
+}
+
+class BlockBasedTableReaderTestVerifyChecksum
+    : public BlockBasedTableReaderTest {
+ public:
+  BlockBasedTableReaderTestVerifyChecksum() : BlockBasedTableReaderTest() {}
+};
+
+TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) {
+  std::map<std::string, std::string> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(800 /* num_block */);
+
+  std::string table_name =
+      "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_);
+  CreateTable(table_name, compression_type_, kv);
+
+  std::unique_ptr<BlockBasedTable> table;
+  Options options;
+  ImmutableOptions ioptions(options);
+  FileOptions foptions;
+  foptions.use_direct_reads = use_direct_reads_;
+  InternalKeyComparator comparator(options.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table);
+
+  // Use the top level iterator to find the offset/size of the first
+  // 2nd level index block and corrupt the block
+  IndexBlockIter iiter_on_stack;
+  BlockCacheLookupContext context{TableReaderCaller::kUserVerifyChecksum};
+  InternalIteratorBase<IndexValue>* iiter = table->NewIndexIterator(
+      ReadOptions(), /*disable_prefix_seek=*/false, &iiter_on_stack,
+      /*get_context=*/nullptr, &context);
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+  if (iiter != &iiter_on_stack) {
+    iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
+  }
+  ASSERT_OK(iiter->status());
+  iiter->SeekToFirst();
+  BlockHandle handle = static_cast<PartitionedIndexIterator*>(iiter)
+                           ->index_iter_->value()
+                           .handle;
+  table.reset();
+
+  // Corrupt the block pointed to by handle
+  ASSERT_OK(test::CorruptFile(options.env, Path(table_name),
+                              static_cast<int>(handle.offset()), 128));
+
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table);
+  Status s = table->VerifyChecksum(ReadOptions(),
+                                   TableReaderCaller::kUserVerifyChecksum);
+  ASSERT_EQ(s.code(), Status::kCorruption);
+}
+
+// Param 1: compression type
+// Param 2: whether to use direct reads
+// Param 3: Block Based Table Index type
+// Param 4: BBTO no_block_cache option
+#ifdef ROCKSDB_LITE
+// Skip direct I/O tests in lite mode since direct I/O is unsupported.
+INSTANTIATE_TEST_CASE_P(
+    MultiGet, BlockBasedTableReaderTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(GetSupportedCompressions()),
+        ::testing::Values(false),
+        ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch),
+        ::testing::Values(false)));
+#else   // ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(
+    MultiGet, BlockBasedTableReaderTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(),
+        ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch),
+        ::testing::Values(false)));
+#endif  // ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(
+    VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum,
+    ::testing::Combine(
+        ::testing::ValuesIn(GetSupportedCompressions()),
+        ::testing::Values(false),
+        ::testing::Values(
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch),
+        ::testing::Values(true)));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/block_builder.cc b/src/rocksdb/table/block_based/block_builder.cc
new file mode 100644
index 000000000..92702b17d
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_builder.cc
@@ -0,0 +1,234 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// BlockBuilder generates blocks where keys are prefix-compressed:
+//
+// When we store a key, we drop the prefix shared with the previous
+// string.  This helps reduce the space requirement significantly.
+// Furthermore, once every K keys, we do not apply the prefix
+// compression and store the entire key.  We call this a "restart
+// point".  The tail end of the block stores the offsets of all of the
+// restart points, and can be used to do a binary search when looking
+// for a particular key.  Values are stored as-is (without compression)
+// immediately following the corresponding key.
+//
+// An entry for a particular key-value pair has the form:
+//     shared_bytes: varint32
+//     unshared_bytes: varint32
+//     value_length: varint32
+//     key_delta: char[unshared_bytes]
+//     value: char[value_length]
+// shared_bytes == 0 for restart points.
+//
+// The trailer of the block has the form:
+//     restarts: uint32[num_restarts]
+//     num_restarts: uint32
+// restarts[i] contains the offset within the block of the ith restart point.
+
+#include "table/block_based/block_builder.h"
+
+#include <assert.h>
+
+#include <algorithm>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "table/block_based/data_block_footer.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlockBuilder::BlockBuilder(
+    int block_restart_interval, bool use_delta_encoding,
+    bool use_value_delta_encoding,
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    double data_block_hash_table_util_ratio)
+    : block_restart_interval_(block_restart_interval),
+      use_delta_encoding_(use_delta_encoding),
+      use_value_delta_encoding_(use_value_delta_encoding),
+      restarts_(1, 0),  // First restart point is at offset 0
+      counter_(0),
+      finished_(false) {
+  switch (index_type) {
+    case BlockBasedTableOptions::kDataBlockBinarySearch:
+      break;
+    case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+      data_block_hash_index_builder_.Initialize(
+          data_block_hash_table_util_ratio);
+      break;
+    default:
+      assert(0);
+  }
+  assert(block_restart_interval_ >= 1);
+  estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+}
+
+void BlockBuilder::Reset() {
+  buffer_.clear();
+  restarts_.resize(1);  // First restart point is at offset 0
+  assert(restarts_[0] == 0);
+  estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
+  counter_ = 0;
+  finished_ = false;
+  last_key_.clear();
+  if (data_block_hash_index_builder_.Valid()) {
+    data_block_hash_index_builder_.Reset();
+  }
+#ifndef NDEBUG
+  add_with_last_key_called_ = false;
+#endif
+}
+
+void BlockBuilder::SwapAndReset(std::string& buffer) {
+  std::swap(buffer_, buffer);
+  Reset();
+}
+
+size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key,
+                                         const Slice& value) const {
+  size_t estimate = CurrentSizeEstimate();
+  // Note: this is an imprecise estimate as it accounts for the whole key size
+  // instead of non-shared key size.
+  estimate += key.size();
+  // In value delta encoding we estimate the value delta size as half the full
+  // value size since only the size field of block handle is encoded.
+  estimate +=
+      !use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
+          ? value.size()
+          : value.size() / 2;
+
+  if (counter_ >= block_restart_interval_) {
+    estimate += sizeof(uint32_t);  // a new restart entry.
+  }
+
+  estimate += sizeof(int32_t);  // varint for shared prefix length.
+  // Note: this is an imprecise estimate as we will have to encoded size, one
+  // for shared key and one for non-shared key.
+  estimate += VarintLength(key.size());  // varint for key length.
+  if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
+    estimate += VarintLength(value.size());  // varint for value length.
+  }
+
+  return estimate;
+}
+
+Slice BlockBuilder::Finish() {
+  // Append restart array
+  for (size_t i = 0; i < restarts_.size(); i++) {
+    PutFixed32(&buffer_, restarts_[i]);
+  }
+
+  uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
+  BlockBasedTableOptions::DataBlockIndexType index_type =
+      BlockBasedTableOptions::kDataBlockBinarySearch;
+  if (data_block_hash_index_builder_.Valid() &&
+      CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
+    data_block_hash_index_builder_.Finish(buffer_);
+    index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+  }
+
+  // footer is a packed format of data_block_index_type and num_restarts
+  uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);
+
+  PutFixed32(&buffer_, block_footer);
+  finished_ = true;
+  return Slice(buffer_);
+}
+
+void BlockBuilder::Add(const Slice& key, const Slice& value,
+                       const Slice* const delta_value) {
+  // Ensure no unsafe mixing of Add and AddWithLastKey
+  assert(!add_with_last_key_called_);
+
+  AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size());
+  if (use_delta_encoding_) {
+    // Update state
+    // We used to just copy the changed data, but it appears to be
+    // faster to just copy the whole thing.
+    last_key_.assign(key.data(), key.size());
+  }
+}
+
+void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value,
+                                  const Slice& last_key_param,
+                                  const Slice* const delta_value) {
+  // Ensure no unsafe mixing of Add and AddWithLastKey
+  assert(last_key_.empty());
+#ifndef NDEBUG
+  add_with_last_key_called_ = false;
+#endif
+
+  // Here we make sure to use an empty `last_key` on first call after creation
+  // or Reset. This is more convenient for the caller and we can be more
+  // clever inside BlockBuilder. On this hot code path, we want to avoid
+  // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a
+  // fast min operation instead, with an assertion to be sure our logic is
+  // sound.
+  size_t buffer_size = buffer_.size();
+  size_t last_key_size = last_key_param.size();
+  assert(buffer_size == 0 || buffer_size >= last_key_size);
+
+  Slice last_key(last_key_param.data(), std::min(buffer_size, last_key_size));
+
+  AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size);
+}
+
+inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
+                                             const Slice& value,
+                                             const Slice& last_key,
+                                             const Slice* const delta_value,
+                                             size_t buffer_size) {
+  assert(!finished_);
+  assert(counter_ <= block_restart_interval_);
+  assert(!use_value_delta_encoding_ || delta_value);
+  size_t shared = 0;  // number of bytes shared with prev key
+  if (counter_ >= block_restart_interval_) {
+    // Restart compression
+    restarts_.push_back(static_cast<uint32_t>(buffer_size));
+    estimate_ += sizeof(uint32_t);
+    counter_ = 0;
+  } else if (use_delta_encoding_) {
+    // See how much sharing to do with previous string
+    shared = key.difference_offset(last_key);
+  }
+
+  const size_t non_shared = key.size() - shared;
+
+  if (use_value_delta_encoding_) {
+    // Add "<shared><non_shared>" to buffer_
+    PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared),
+                        static_cast<uint32_t>(non_shared));
+  } else {
+    // Add "<shared><non_shared><value_size>" to buffer_
+    PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
+                                static_cast<uint32_t>(non_shared),
+                                static_cast<uint32_t>(value.size()));
+  }
+
+  // Add string delta to buffer_ followed by value
+  buffer_.append(key.data() + shared, non_shared);
+  // Use value delta encoding only when the key has shared bytes. This would
+  // simplify the decoding, where it can figure which decoding to use simply by
+  // looking at the shared bytes size.
+  if (shared != 0 && use_value_delta_encoding_) {
+    buffer_.append(delta_value->data(), delta_value->size());
+  } else {
+    buffer_.append(value.data(), value.size());
+  }
+
+  if (data_block_hash_index_builder_.Valid()) {
+    data_block_hash_index_builder_.Add(ExtractUserKey(key),
+                                       restarts_.size() - 1);
+  }
+
+  counter_++;
+  estimate_ += buffer_.size() - buffer_size;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_builder.h b/src/rocksdb/table/block_based/block_builder.h
new file mode 100644
index 000000000..5f68b449b
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_builder.h
@@ -0,0 +1,102 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/table.h"
+#include "table/block_based/data_block_hash_index.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder {
+ public:
+  BlockBuilder(const BlockBuilder&) = delete;
+  void operator=(const BlockBuilder&) = delete;
+
+  explicit BlockBuilder(int block_restart_interval,
+                        bool use_delta_encoding = true,
+                        bool use_value_delta_encoding = false,
+                        BlockBasedTableOptions::DataBlockIndexType index_type =
+                            BlockBasedTableOptions::kDataBlockBinarySearch,
+                        double data_block_hash_table_util_ratio = 0.75);
+
+  // Reset the contents as if the BlockBuilder was just constructed.
+  void Reset();
+
+  // Swap the contents in BlockBuilder with buffer, then reset the BlockBuilder.
+  void SwapAndReset(std::string& buffer);
+
+  // REQUIRES: Finish() has not been called since the last call to Reset().
+  // REQUIRES: key is larger than any previously added key
+  // DO NOT mix with AddWithLastKey() between Resets. For efficiency, use
+  // AddWithLastKey() in contexts where previous added key is already known
+  // and delta encoding might be used.
+  void Add(const Slice& key, const Slice& value,
+           const Slice* const delta_value = nullptr);
+
+  // A faster version of Add() if the previous key is already known for all
+  // Add()s.
+  // REQUIRES: Finish() has not been called since the last call to Reset().
+  // REQUIRES: key is larger than any previously added key
+  // REQUIRES: if AddWithLastKey has been called since last Reset(), last_key
+  // is the key from most recent AddWithLastKey. (For convenience, last_key
+  // is ignored on first call after creation or Reset().)
+  // DO NOT mix with Add() between Resets.
+  void AddWithLastKey(const Slice& key, const Slice& value,
+                      const Slice& last_key,
+                      const Slice* const delta_value = nullptr);
+
+  // Finish building the block and return a slice that refers to the
+  // block contents.  The returned slice will remain valid for the
+  // lifetime of this builder or until Reset() is called.
+  Slice Finish();
+
+  // Returns an estimate of the current (uncompressed) size of the block
+  // we are building.
+  inline size_t CurrentSizeEstimate() const {
+    return estimate_ + (data_block_hash_index_builder_.Valid()
+                            ? data_block_hash_index_builder_.EstimateSize()
+                            : 0);
+  }
+
+  // Returns an estimated block size after appending key and value.
+  size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const;
+
+  // Return true iff no entries have been added since the last Reset()
+  bool empty() const { return buffer_.empty(); }
+
+ private:
+  inline void AddWithLastKeyImpl(const Slice& key, const Slice& value,
+                                 const Slice& last_key,
+                                 const Slice* const delta_value,
+                                 size_t buffer_size);
+
+  const int block_restart_interval_;
+  // TODO(myabandeh): put it into a separate IndexBlockBuilder
+  const bool use_delta_encoding_;
+  // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values
+  const bool use_value_delta_encoding_;
+
+  std::string buffer_;              // Destination buffer
+  std::vector<uint32_t> restarts_;  // Restart points
+  size_t estimate_;
+  int counter_;    // Number of entries emitted since restart
+  bool finished_;  // Has Finish() been called?
+  std::string last_key_;
+  DataBlockHashIndexBuilder data_block_hash_index_builder_;
+#ifndef NDEBUG
+  bool add_with_last_key_called_ = false;
+#endif
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_like_traits.h b/src/rocksdb/table/block_based/block_like_traits.h
new file mode 100644
index 000000000..d406dbb5d
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_like_traits.h
@@ -0,0 +1,182 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "cache/cache_entry_roles.h"
+#include "port/lang.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename TBlocklike>
+class BlocklikeTraits;
+
+template <typename T, CacheEntryRole R>
+Cache::CacheItemHelper* GetCacheItemHelperForRole();
+
+template <typename TBlocklike>
+Cache::CreateCallback GetCreateCallback(size_t read_amp_bytes_per_bit,
+                                        Statistics* statistics, bool using_zstd,
+                                        const FilterPolicy* filter_policy) {
+  return [read_amp_bytes_per_bit, statistics, using_zstd, filter_policy](
+             const void* buf, size_t size, void** out_obj,
+             size_t* charge) -> Status {
+    assert(buf != nullptr);
+    std::unique_ptr<char[]> buf_data(new char[size]());
+    memcpy(buf_data.get(), buf, size);
+    BlockContents bc = BlockContents(std::move(buf_data), size);
+    TBlocklike* ucd_ptr = BlocklikeTraits<TBlocklike>::Create(
+        std::move(bc), read_amp_bytes_per_bit, statistics, using_zstd,
+        filter_policy);
+    *out_obj = reinterpret_cast<void*>(ucd_ptr);
+    *charge = size;
+    return Status::OK();
+  };
+}
+
+template <>
+class BlocklikeTraits<ParsedFullFilterBlock> {
+ public:
+  static ParsedFullFilterBlock* Create(BlockContents&& contents,
+                                       size_t /* read_amp_bytes_per_bit */,
+                                       Statistics* /* statistics */,
+                                       bool /* using_zstd */,
+                                       const FilterPolicy* filter_policy) {
+    return new ParsedFullFilterBlock(filter_policy, std::move(contents));
+  }
+
+  static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) {
+    return 0;
+  }
+
+  static size_t SizeCallback(void* obj) {
+    assert(obj != nullptr);
+    ParsedFullFilterBlock* ptr = static_cast<ParsedFullFilterBlock*>(obj);
+    return ptr->GetBlockContentsData().size();
+  }
+
+  static Status SaveToCallback(void* from_obj, size_t from_offset,
+                               size_t length, void* out) {
+    assert(from_obj != nullptr);
+    ParsedFullFilterBlock* ptr = static_cast<ParsedFullFilterBlock*>(from_obj);
+    const char* buf = ptr->GetBlockContentsData().data();
+    assert(length == ptr->GetBlockContentsData().size());
+    (void)from_offset;
+    memcpy(out, buf, length);
+    return Status::OK();
+  }
+
+  static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) {
+    (void)block_type;
+    assert(block_type == BlockType::kFilter);
+    return GetCacheItemHelperForRole<ParsedFullFilterBlock,
+                                     CacheEntryRole::kFilterBlock>();
+  }
+};
+
+template <>
+class BlocklikeTraits<Block> {
+ public:
+  static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit,
+                       Statistics* statistics, bool /* using_zstd */,
+                       const FilterPolicy* /* filter_policy */) {
+    return new Block(std::move(contents), read_amp_bytes_per_bit, statistics);
+  }
+
+  static uint32_t GetNumRestarts(const Block& block) {
+    return block.NumRestarts();
+  }
+
+  static size_t SizeCallback(void* obj) {
+    assert(obj != nullptr);
+    Block* ptr = static_cast<Block*>(obj);
+    return ptr->size();
+  }
+
+  static Status SaveToCallback(void* from_obj, size_t from_offset,
+                               size_t length, void* out) {
+    assert(from_obj != nullptr);
+    Block* ptr = static_cast<Block*>(from_obj);
+    const char* buf = ptr->data();
+    assert(length == ptr->size());
+    (void)from_offset;
+    memcpy(out, buf, length);
+    return Status::OK();
+  }
+
+  static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) {
+    switch (block_type) {
+      case BlockType::kData:
+        return GetCacheItemHelperForRole<Block, CacheEntryRole::kDataBlock>();
+      case BlockType::kIndex:
+        return GetCacheItemHelperForRole<Block, CacheEntryRole::kIndexBlock>();
+      case BlockType::kFilterPartitionIndex:
+        return GetCacheItemHelperForRole<Block,
+                                         CacheEntryRole::kFilterMetaBlock>();
+      default:
+        // Not a recognized combination
+        assert(false);
+        FALLTHROUGH_INTENDED;
+      case BlockType::kRangeDeletion:
+        return GetCacheItemHelperForRole<Block, CacheEntryRole::kOtherBlock>();
+    }
+  }
+};
+
+template <>
+class BlocklikeTraits<UncompressionDict> {
+ public:
+  static UncompressionDict* Create(BlockContents&& contents,
+                                   size_t /* read_amp_bytes_per_bit */,
+                                   Statistics* /* statistics */,
+                                   bool using_zstd,
+                                   const FilterPolicy* /* filter_policy */) {
+    return new UncompressionDict(contents.data, std::move(contents.allocation),
+                                 using_zstd);
+  }
+
+  static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) {
+    return 0;
+  }
+
+  static size_t SizeCallback(void* obj) {
+    assert(obj != nullptr);
+    UncompressionDict* ptr = static_cast<UncompressionDict*>(obj);
+    return ptr->slice_.size();
+  }
+
+  static Status SaveToCallback(void* from_obj, size_t from_offset,
+                               size_t length, void* out) {
+    assert(from_obj != nullptr);
+    UncompressionDict* ptr = static_cast<UncompressionDict*>(from_obj);
+    const char* buf = ptr->slice_.data();
+    assert(length == ptr->slice_.size());
+    (void)from_offset;
+    memcpy(out, buf, length);
+    return Status::OK();
+  }
+
+  static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) {
+    (void)block_type;
+    assert(block_type == BlockType::kCompressionDictionary);
+    return GetCacheItemHelperForRole<UncompressionDict,
+                                     CacheEntryRole::kOtherBlock>();
+  }
+};
+
+// Get an CacheItemHelper pointer for value type T and role R.
+template <typename T, CacheEntryRole R>
+Cache::CacheItemHelper* GetCacheItemHelperForRole() {
+  static Cache::CacheItemHelper cache_helper(
+      BlocklikeTraits<T>::SizeCallback, BlocklikeTraits<T>::SaveToCallback,
+      GetCacheEntryDeleterForRole<T, R>());
+  return &cache_helper;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_prefetcher.cc b/src/rocksdb/table/block_based/block_prefetcher.cc
new file mode 100644
index 000000000..83ec2cb06
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_prefetcher.cc
@@ -0,0 +1,120 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/block_prefetcher.h"
+
+#include "rocksdb/file_system.h"
+#include "table/block_based/block_based_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+void BlockPrefetcher::PrefetchIfNeeded(
+    const BlockBasedTable::Rep* rep, const BlockHandle& handle,
+    const size_t readahead_size, bool is_for_compaction,
+    const bool no_sequential_checking,
+    const Env::IOPriority rate_limiter_priority) {
+  // num_file_reads is used  by FilePrefetchBuffer only when
+  // implicit_auto_readahead is set.
+  if (is_for_compaction) {
+    rep->CreateFilePrefetchBufferIfNotExists(
+        compaction_readahead_size_, compaction_readahead_size_,
+        &prefetch_buffer_, /*implicit_auto_readahead=*/false,
+        /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0);
+    return;
+  }
+
+  // Explicit user requested readahead.
+  if (readahead_size > 0) {
+    rep->CreateFilePrefetchBufferIfNotExists(
+        readahead_size, readahead_size, &prefetch_buffer_,
+        /*implicit_auto_readahead=*/false, /*num_file_reads=*/0,
+        /*num_file_reads_for_auto_readahead=*/0);
+    return;
+  }
+
+  // Implicit readahead.
+
+  // If max_auto_readahead_size is set to be 0 by user, no data will be
+  // prefetched.
+  size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size;
+  if (max_auto_readahead_size == 0 || initial_auto_readahead_size_ == 0) {
+    return;
+  }
+
+  if (initial_auto_readahead_size_ > max_auto_readahead_size) {
+    initial_auto_readahead_size_ = max_auto_readahead_size;
+  }
+
+  // In case of no_sequential_checking, it will skip the num_file_reads_ and
+  // will always creates the FilePrefetchBuffer.
+  if (no_sequential_checking) {
+    rep->CreateFilePrefetchBufferIfNotExists(
+        initial_auto_readahead_size_, max_auto_readahead_size,
+        &prefetch_buffer_, /*implicit_auto_readahead=*/true,
+        /*num_file_reads=*/0,
+        rep->table_options.num_file_reads_for_auto_readahead);
+    return;
+  }
+
+  size_t len = BlockBasedTable::BlockSizeWithTrailer(handle);
+  size_t offset = handle.offset();
+
+  // If FS supports prefetching (readahead_limit_ will be non zero in that case)
+  // and current block exists in prefetch buffer then return.
+  if (offset + len <= readahead_limit_) {
+    UpdateReadPattern(offset, len);
+    return;
+  }
+
+  if (!IsBlockSequential(offset)) {
+    UpdateReadPattern(offset, len);
+    ResetValues(rep->table_options.initial_auto_readahead_size);
+    return;
+  }
+  UpdateReadPattern(offset, len);
+
+  // Implicit auto readahead, which will be enabled if the number of reads
+  // reached `table_options.num_file_reads_for_auto_readahead` (default: 2)  and
+  // scans are sequential.
+  num_file_reads_++;
+  if (num_file_reads_ <= rep->table_options.num_file_reads_for_auto_readahead) {
+    return;
+  }
+
+  if (rep->file->use_direct_io()) {
+    rep->CreateFilePrefetchBufferIfNotExists(
+        initial_auto_readahead_size_, max_auto_readahead_size,
+        &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_,
+        rep->table_options.num_file_reads_for_auto_readahead);
+    return;
+  }
+
+  if (readahead_size_ > max_auto_readahead_size) {
+    readahead_size_ = max_auto_readahead_size;
+  }
+
+  // If prefetch is not supported, fall back to use internal prefetch buffer.
+  // Discarding other return status of Prefetch calls intentionally, as
+  // we can fallback to reading from disk if Prefetch fails.
+  Status s = rep->file->Prefetch(
+      handle.offset(),
+      BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_,
+      rate_limiter_priority);
+  if (s.IsNotSupported()) {
+    rep->CreateFilePrefetchBufferIfNotExists(
+        initial_auto_readahead_size_, max_auto_readahead_size,
+        &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_,
+        rep->table_options.num_file_reads_for_auto_readahead);
+    return;
+  }
+
+  readahead_limit_ = offset + len + readahead_size_;
+  // Keep exponentially increasing readahead size until
+  // max_auto_readahead_size.
+  readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_prefetcher.h b/src/rocksdb/table/block_based/block_prefetcher.h
new file mode 100644
index 000000000..518868a30
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_prefetcher.h
@@ -0,0 +1,72 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "table/block_based/block_based_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+class BlockPrefetcher {
+ public:
+  explicit BlockPrefetcher(size_t compaction_readahead_size,
+                           size_t initial_auto_readahead_size)
+      : compaction_readahead_size_(compaction_readahead_size),
+        readahead_size_(initial_auto_readahead_size),
+        initial_auto_readahead_size_(initial_auto_readahead_size) {}
+
+  void PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
+                        const BlockHandle& handle, size_t readahead_size,
+                        bool is_for_compaction,
+                        const bool no_sequential_checking,
+                        Env::IOPriority rate_limiter_priority);
+  FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); }
+
+  void UpdateReadPattern(const uint64_t& offset, const size_t& len) {
+    prev_offset_ = offset;
+    prev_len_ = len;
+  }
+
+  bool IsBlockSequential(const uint64_t& offset) {
+    return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset));
+  }
+
+  void ResetValues(size_t initial_auto_readahead_size) {
+    num_file_reads_ = 1;
+    // Since initial_auto_readahead_size_ can be different from
+    // the value passed to BlockBasedTableOptions.initial_auto_readahead_size in
+    // case of adaptive_readahead, so fallback the readahead_size_ to that value
+    // in case of reset.
+    initial_auto_readahead_size_ = initial_auto_readahead_size;
+    readahead_size_ = initial_auto_readahead_size_;
+    readahead_limit_ = 0;
+    return;
+  }
+
+  void SetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) {
+    num_file_reads_ = readahead_info->num_file_reads;
+    initial_auto_readahead_size_ = readahead_info->readahead_size;
+    TEST_SYNC_POINT_CALLBACK("BlockPrefetcher::SetReadaheadState",
+                             &initial_auto_readahead_size_);
+  }
+
+ private:
+  // Readahead size used in compaction, its value is used only if
+  // lookup_context_.caller = kCompaction.
+  size_t compaction_readahead_size_;
+
+  // readahead_size_ is used if underlying FS supports prefetching.
+  size_t readahead_size_;
+  size_t readahead_limit_ = 0;
+  // initial_auto_readahead_size_ is used if RocksDB uses internal prefetch
+  // buffer.
+  uint64_t initial_auto_readahead_size_;
+  uint64_t num_file_reads_ = 0;
+  uint64_t prev_offset_ = 0;
+  size_t prev_len_ = 0;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_prefix_index.cc b/src/rocksdb/table/block_based/block_prefix_index.cc
new file mode 100644
index 000000000..c83701d69
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_prefix_index.cc
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/block_prefix_index.h"
+
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline uint32_t Hash(const Slice& s) {
+  return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
+}
+
+inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
+  return Hash(prefix) % num_buckets;
+}
+
+// The prefix block index is simply a bucket array, with each entry pointing to
+// the blocks that span the prefixes hashed to this bucket.
+//
+// To reduce memory footprint, if there is only one block per bucket, the entry
+// stores the block id directly. If there are more than one blocks per bucket,
+// because of hash collision or a single prefix spanning multiple blocks,
+// the entry points to an array of block ids. The block array is an array of
+// uint32_t's. The first uint32_t indicates the total number of blocks, followed
+// by the block ids.
+//
+// To differentiate the two cases, the high order bit of the entry indicates
+// whether it is a 'pointer' into a separate block array.
+// 0x7FFFFFFF is reserved for empty bucket.
+
+const uint32_t kNoneBlock = 0x7FFFFFFF;
+const uint32_t kBlockArrayMask = 0x80000000;
+
+inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
+
+inline bool IsBlockId(uint32_t block_id) {
+  return (block_id & kBlockArrayMask) == 0;
+}
+
+inline uint32_t DecodeIndex(uint32_t block_id) {
+  uint32_t index = block_id ^ kBlockArrayMask;
+  assert(index < kBlockArrayMask);
+  return index;
+}
+
+inline uint32_t EncodeIndex(uint32_t index) {
+  assert(index < kBlockArrayMask);
+  return index | kBlockArrayMask;
+}
+
+// temporary storage for prefix information during index building
+struct PrefixRecord {
+  Slice prefix;
+  uint32_t start_block;
+  uint32_t end_block;
+  uint32_t num_blocks;
+  PrefixRecord* next;
+};
+
+class BlockPrefixIndex::Builder {
+ public:
+  void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
+    PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
+        arena_.AllocateAligned(sizeof(PrefixRecord)));
+    record->prefix = key_prefix;
+    record->start_block = start_block;
+    record->end_block = start_block + num_blocks - 1;
+    record->num_blocks = num_blocks;
+    prefixes_.push_back(record);
+  }
+
+  BlockPrefixIndex* Finish(const SliceTransform* prefix_extractor) {
+    // For now, use roughly 1:1 prefix to bucket ratio.
+    uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
+
+    // Collect prefix records that hash to the same bucket, into a single
+    // linklist.
+    std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
+    std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
+    for (PrefixRecord* current : prefixes_) {
+      uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
+      // merge the prefix block span if the first block of this prefix is
+      // connected to the last block of the previous prefix.
+      PrefixRecord* prev = prefixes_per_bucket[bucket];
+      if (prev) {
+        assert(current->start_block >= prev->end_block);
+        auto distance = current->start_block - prev->end_block;
+        if (distance <= 1) {
+          prev->end_block = current->end_block;
+          prev->num_blocks = prev->end_block - prev->start_block + 1;
+          num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
+          continue;
+        }
+      }
+      current->next = prev;
+      prefixes_per_bucket[bucket] = current;
+      num_blocks_per_bucket[bucket] += current->num_blocks;
+    }
+
+    // Calculate the block array buffer size
+    uint32_t total_block_array_entries = 0;
+    for (uint32_t i = 0; i < num_buckets; i++) {
+      uint32_t num_blocks = num_blocks_per_bucket[i];
+      if (num_blocks > 1) {
+        total_block_array_entries += (num_blocks + 1);
+      }
+    }
+
+    // Populate the final prefix block index
+    uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
+    uint32_t* buckets = new uint32_t[num_buckets];
+    uint32_t offset = 0;
+    for (uint32_t i = 0; i < num_buckets; i++) {
+      uint32_t num_blocks = num_blocks_per_bucket[i];
+      if (num_blocks == 0) {
+        assert(prefixes_per_bucket[i] == nullptr);
+        buckets[i] = kNoneBlock;
+      } else if (num_blocks == 1) {
+        assert(prefixes_per_bucket[i] != nullptr);
+        assert(prefixes_per_bucket[i]->next == nullptr);
+        buckets[i] = prefixes_per_bucket[i]->start_block;
+      } else {
+        assert(total_block_array_entries > 0);
+        assert(prefixes_per_bucket[i] != nullptr);
+        buckets[i] = EncodeIndex(offset);
+        block_array_buffer[offset] = num_blocks;
+        uint32_t* last_block = &block_array_buffer[offset + num_blocks];
+        auto current = prefixes_per_bucket[i];
+        // populate block ids from largest to smallest
+        while (current != nullptr) {
+          for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
+            *last_block = current->end_block - iter;
+            last_block--;
+          }
+          current = current->next;
+        }
+        assert(last_block == &block_array_buffer[offset]);
+        offset += (num_blocks + 1);
+      }
+    }
+
+    assert(offset == total_block_array_entries);
+
+    return new BlockPrefixIndex(prefix_extractor, num_buckets, buckets,
+                                total_block_array_entries, block_array_buffer);
+  }
+
+ private:
+  std::vector<PrefixRecord*> prefixes_;
+  Arena arena_;
+};
+
+Status BlockPrefixIndex::Create(const SliceTransform* prefix_extractor,
+                                const Slice& prefixes, const Slice& prefix_meta,
+                                BlockPrefixIndex** prefix_index) {
+  uint64_t pos = 0;
+  auto meta_pos = prefix_meta;
+  Status s;
+  Builder builder;
+
+  while (!meta_pos.empty()) {
+    uint32_t prefix_size = 0;
+    uint32_t entry_index = 0;
+    uint32_t num_blocks = 0;
+    if (!GetVarint32(&meta_pos, &prefix_size) ||
+        !GetVarint32(&meta_pos, &entry_index) ||
+        !GetVarint32(&meta_pos, &num_blocks)) {
+      s = Status::Corruption(
+          "Corrupted prefix meta block: unable to read from it.");
+      break;
+    }
+    if (pos + prefix_size > prefixes.size()) {
+      s = Status::Corruption(
+          "Corrupted prefix meta block: size inconsistency.");
+      break;
+    }
+    Slice prefix(prefixes.data() + pos, prefix_size);
+    builder.Add(prefix, entry_index, num_blocks);
+
+    pos += prefix_size;
+  }
+
+  if (s.ok() && pos != prefixes.size()) {
+    s = Status::Corruption("Corrupted prefix meta block");
+  }
+
+  if (s.ok()) {
+    *prefix_index = builder.Finish(prefix_extractor);
+  }
+
+  return s;
+}
+
+uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) {
+  Slice prefix = internal_prefix_extractor_.Transform(key);
+
+  uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
+  uint32_t block_id = buckets_[bucket];
+
+  if (IsNone(block_id)) {
+    return 0;
+  } else if (IsBlockId(block_id)) {
+    *blocks = &buckets_[bucket];
+    return 1;
+  } else {
+    uint32_t index = DecodeIndex(block_id);
+    assert(index < num_block_array_buffer_entries_);
+    *blocks = &block_array_buffer_[index + 1];
+    uint32_t num_blocks = block_array_buffer_[index];
+    assert(num_blocks > 1);
+    assert(index + num_blocks < num_block_array_buffer_entries_);
+    return num_blocks;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_prefix_index.h b/src/rocksdb/table/block_based/block_prefix_index.h
new file mode 100644
index 000000000..4db8e2c65
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_prefix_index.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Slice;
+class SliceTransform;
+
+// Build a hash-based index to speed up the lookup for "index block".
+// BlockHashIndex accepts a key and, if found, returns its restart index within
+// that index block.
+class BlockPrefixIndex {
+ public:
+  // Maps a key to a list of data blocks that could potentially contain
+  // the key, based on the prefix.
+  // Returns the total number of relevant blocks, 0 means the key does
+  // not exist.
+  uint32_t GetBlocks(const Slice& key, uint32_t** blocks);
+
+  size_t ApproximateMemoryUsage() const {
+    return sizeof(BlockPrefixIndex) +
+           (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t);
+  }
+
+  // Create hash index by reading from the metadata blocks.
+  // Note: table reader (caller) is responsible for keeping shared_ptr to
+  // underlying prefix extractor
+  // @params prefixes: a sequence of prefixes.
+  // @params prefix_meta: contains the "metadata" to of the prefixes.
+  static Status Create(const SliceTransform* hash_key_extractor,
+                       const Slice& prefixes, const Slice& prefix_meta,
+                       BlockPrefixIndex** prefix_index);
+
+  ~BlockPrefixIndex() {
+    delete[] buckets_;
+    delete[] block_array_buffer_;
+  }
+
+ private:
+  class Builder;
+  friend Builder;
+
+  BlockPrefixIndex(const SliceTransform* prefix_extractor, uint32_t num_buckets,
+                   uint32_t* buckets, uint32_t num_block_array_buffer_entries,
+                   uint32_t* block_array_buffer)
+      : internal_prefix_extractor_(prefix_extractor),
+        num_buckets_(num_buckets),
+        num_block_array_buffer_entries_(num_block_array_buffer_entries),
+        buckets_(buckets),
+        block_array_buffer_(block_array_buffer) {}
+
+  InternalKeySliceTransform internal_prefix_extractor_;
+
+  uint32_t num_buckets_;
+  uint32_t num_block_array_buffer_entries_;
+  uint32_t* buckets_;
+  uint32_t* block_array_buffer_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/block_test.cc b/src/rocksdb/table/block_based/block_test.cc
new file mode 100644
index 000000000..83b87fe79
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_test.cc
@@ -0,0 +1,627 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "table/block_based/block.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string GenerateInternalKey(int primary_key, int secondary_key,
+                                int padding_size, Random *rnd) {
+  char buf[50];
+  char *p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += rnd->RandomString(padding_size);
+  }
+  AppendInternalKeyFooter(&k, 0 /* seqno */, kTypeValue);
+
+  return k;
+}
+
+// Generate random key value pairs.
+// The generated key will be sorted. You can tune the parameters to generated
+// different kinds of test key/value pairs for different scenario.
+void GenerateRandomKVs(std::vector<std::string> *keys,
+                       std::vector<std::string> *values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefix
+  for (int i = from; i < from + len; i += step) {
+    // generating keys that shares the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      // `DataBlockIter` assumes it reads only internal keys.
+      keys->emplace_back(GenerateInternalKey(i, j, padding_size, &rnd));
+
+      // 100 bytes values
+      values->emplace_back(rnd.RandomString(100));
+    }
+  }
+}
+
+class BlockTest : public testing::Test {};
+
+// block test
+TEST_F(BlockTest, SimpleTest) {
+  Random rnd(301);
+  Options options = Options();
+
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  BlockBuilder builder(16);
+  int num_records = 100000;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records);
+  // add a bunch of records to a block
+  for (int i = 0; i < num_records; i++) {
+    builder.Add(keys[i], values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents));
+
+  // read contents of block sequentially
+  int count = 0;
+  InternalIterator *iter =
+      reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber);
+  for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) {
+    // read kv from block
+    Slice k = iter->key();
+    Slice v = iter->value();
+
+    // compare with lookaside array
+    ASSERT_EQ(k.ToString().compare(keys[count]), 0);
+    ASSERT_EQ(v.ToString().compare(values[count]), 0);
+  }
+  delete iter;
+
+  // read block contents randomly
+  iter =
+      reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber);
+  for (int i = 0; i < num_records; i++) {
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    Slice k(keys[index]);
+
+    // search in block for this key
+    iter->Seek(k);
+    ASSERT_TRUE(iter->Valid());
+    Slice v = iter->value();
+    ASSERT_EQ(v.ToString().compare(values[index]), 0);
+  }
+  delete iter;
+}
+
+// return the block contents
+BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
+                               const std::vector<std::string> &keys,
+                               const std::vector<std::string> &values,
+                               const int /*prefix_group_size*/ = 1) {
+  builder->reset(new BlockBuilder(1 /* restart interval */));
+
+  // Add only half of the keys
+  for (size_t i = 0; i < keys.size(); ++i) {
+    (*builder)->Add(keys[i], values[i]);
+  }
+  Slice rawblock = (*builder)->Finish();
+
+  BlockContents contents;
+  contents.data = rawblock;
+
+  return contents;
+}
+
+void CheckBlockContents(BlockContents contents, const int max_key,
+                        const std::vector<std::string> &keys,
+                        const std::vector<std::string> &values) {
+  const size_t prefix_size = 6;
+  // create block reader
+  BlockContents contents_ref(contents.data);
+  Block reader1(std::move(contents));
+  Block reader2(std::move(contents_ref));
+
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(prefix_size));
+
+  std::unique_ptr<InternalIterator> regular_iter(reader2.NewDataIterator(
+      BytewiseComparator(), kDisableGlobalSequenceNumber));
+
+  // Seek existent keys
+  for (size_t i = 0; i < keys.size(); i++) {
+    regular_iter->Seek(keys[i]);
+    ASSERT_OK(regular_iter->status());
+    ASSERT_TRUE(regular_iter->Valid());
+
+    Slice v = regular_iter->value();
+    ASSERT_EQ(v.ToString().compare(values[i]), 0);
+  }
+
+  // Seek non-existent keys.
+  // For hash index, if no key with a given prefix is not found, iterator will
+  // simply be set as invalid; whereas the binary search based iterator will
+  // return the one that is closest.
+  for (int i = 1; i < max_key - 1; i += 2) {
+    // `DataBlockIter` assumes its APIs receive only internal keys.
+    auto key = GenerateInternalKey(i, 0, 0, nullptr);
+    regular_iter->Seek(key);
+    ASSERT_TRUE(regular_iter->Valid());
+  }
+}
+
+// In this test case, no two key share same prefix.
+TEST_F(BlockTest, SimpleIndexHash) {
+  const int kMaxKey = 100000;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  GenerateRandomKVs(&keys, &values, 0 /* first key id */,
+                    kMaxKey /* last key id */, 2 /* step */,
+                    8 /* padding size (8 bytes randomly generated suffix) */);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values);
+
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+TEST_F(BlockTest, IndexHashWithSharedPrefix) {
+  const int kMaxKey = 100000;
+  // for each prefix, there will be 5 keys starts with it.
+  const int kPrefixGroup = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  // Generate keys with same prefix.
+  GenerateRandomKVs(&keys, &values, 0,  // first key id
+                    kMaxKey,            // last key id
+                    2,                  // step
+                    10,                 // padding size,
+                    kPrefixGroup);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+  CheckBlockContents(std::move(contents), kMaxKey, keys, values);
+}
+
+// A slow and accurate version of BlockReadAmpBitmap that simply store
+// all the marked ranges in a set.
+class BlockReadAmpBitmapSlowAndAccurate {
+ public:
+  void Mark(size_t start_offset, size_t end_offset) {
+    assert(end_offset >= start_offset);
+    marked_ranges_.emplace(end_offset, start_offset);
+  }
+
+  void ResetCheckSequence() { iter_valid_ = false; }
+
+  // Return true if any byte in this range was Marked
+  // This does linear search from the previous position. When calling
+  // multiple times, `offset` needs to be incremental to get correct results.
+  // Call ResetCheckSequence() to reset it.
+  bool IsPinMarked(size_t offset) {
+    if (iter_valid_) {
+      // Has existing iterator, try linear search from
+      // the iterator.
+      for (int i = 0; i < 64; i++) {
+        if (offset < iter_->second) {
+          return false;
+        }
+        if (offset <= iter_->first) {
+          return true;
+        }
+
+        iter_++;
+        if (iter_ == marked_ranges_.end()) {
+          iter_valid_ = false;
+          return false;
+        }
+      }
+    }
+    // Initial call or have linear searched too many times.
+    // Do binary search.
+    iter_ = marked_ranges_.lower_bound(
+        std::make_pair(offset, static_cast<size_t>(0)));
+    if (iter_ == marked_ranges_.end()) {
+      iter_valid_ = false;
+      return false;
+    }
+    iter_valid_ = true;
+    return offset <= iter_->first && offset >= iter_->second;
+  }
+
+ private:
+  std::set<std::pair<size_t, size_t>> marked_ranges_;
+  std::set<std::pair<size_t, size_t>>::iterator iter_;
+  bool iter_valid_ = false;
+};
+
+TEST_F(BlockTest, BlockReadAmpBitmap) {
+  uint32_t pin_offset = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) {
+        pin_offset = *(static_cast<uint32_t *>(arg));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  std::vector<size_t> block_sizes = {
+      1,                // 1 byte
+      32,               // 32 bytes
+      61,               // 61 bytes
+      64,               // 64 bytes
+      512,              // 0.5 KB
+      1024,             // 1 KB
+      1024 * 4,         // 4 KB
+      1024 * 10,        // 10 KB
+      1024 * 50,        // 50 KB
+      1024 * 1024 * 4,  // 5 MB
+      777,
+      124653,
+  };
+  const size_t kBytesPerBit = 64;
+
+  Random rnd(301);
+  for (size_t block_size : block_sizes) {
+    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    BlockReadAmpBitmap read_amp_bitmap(block_size, kBytesPerBit, stats.get());
+    BlockReadAmpBitmapSlowAndAccurate read_amp_slow_and_accurate;
+
+    size_t needed_bits = (block_size / kBytesPerBit);
+    if (block_size % kBytesPerBit != 0) {
+      needed_bits++;
+    }
+
+    ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES), block_size);
+
+    // Generate some random entries
+    std::vector<size_t> random_entry_offsets;
+    for (int i = 0; i < 1000; i++) {
+      random_entry_offsets.push_back(rnd.Next() % block_size);
+    }
+    std::sort(random_entry_offsets.begin(), random_entry_offsets.end());
+    auto it =
+        std::unique(random_entry_offsets.begin(), random_entry_offsets.end());
+    random_entry_offsets.resize(
+        std::distance(random_entry_offsets.begin(), it));
+
+    std::vector<std::pair<size_t, size_t>> random_entries;
+    for (size_t i = 0; i < random_entry_offsets.size(); i++) {
+      size_t entry_start = random_entry_offsets[i];
+      size_t entry_end;
+      if (i + 1 < random_entry_offsets.size()) {
+        entry_end = random_entry_offsets[i + 1] - 1;
+      } else {
+        entry_end = block_size - 1;
+      }
+      random_entries.emplace_back(entry_start, entry_end);
+    }
+
+    for (size_t i = 0; i < random_entries.size(); i++) {
+      read_amp_slow_and_accurate.ResetCheckSequence();
+      auto &current_entry = random_entries[rnd.Next() % random_entries.size()];
+
+      read_amp_bitmap.Mark(static_cast<uint32_t>(current_entry.first),
+                           static_cast<uint32_t>(current_entry.second));
+      read_amp_slow_and_accurate.Mark(current_entry.first,
+                                      current_entry.second);
+
+      size_t total_bits = 0;
+      for (size_t bit_idx = 0; bit_idx < needed_bits; bit_idx++) {
+        total_bits += read_amp_slow_and_accurate.IsPinMarked(
+            bit_idx * kBytesPerBit + pin_offset);
+      }
+      size_t expected_estimate_useful = total_bits * kBytesPerBit;
+      size_t got_estimate_useful =
+          stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+      ASSERT_EQ(expected_estimate_useful, got_estimate_useful);
+    }
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlockTest, BlockWithReadAmpBitmap) {
+  Random rnd(301);
+  Options options = Options();
+
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  BlockBuilder builder(16);
+  int num_records = 10000;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records, 1);
+  // add a bunch of records to a block
+  for (int i = 0; i < num_records; i++) {
+    builder.Add(keys[i], values[i]);
+  }
+
+  Slice rawblock = builder.Finish();
+  const size_t kBytesPerBit = 8;
+
+  // Read the block sequentially using Next()
+  {
+    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kBytesPerBit, stats.get());
+
+    // read contents of block sequentially
+    size_t read_bytes = 0;
+    DataBlockIter *iter = reader.NewDataIterator(
+        options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      iter->value();
+      read_bytes += iter->TEST_CurrentEntrySize();
+
+      double semi_acc_read_amp =
+          static_cast<double>(read_bytes) / rawblock.size();
+      double read_amp = static_cast<double>(stats->getTickerCount(
+                            READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+                        stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+      // Error in read amplification will be less than 1% if we are reading
+      // sequentially
+      double error_pct = fabs(semi_acc_read_amp - read_amp) * 100;
+      EXPECT_LT(error_pct, 1);
+    }
+
+    delete iter;
+  }
+
+  // Read the block sequentially using Seek()
+  {
+    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kBytesPerBit, stats.get());
+
+    size_t read_bytes = 0;
+    DataBlockIter *iter = reader.NewDataIterator(
+        options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get());
+    for (int i = 0; i < num_records; i++) {
+      Slice k(keys[i]);
+
+      // search in block for this key
+      iter->Seek(k);
+      iter->value();
+      read_bytes += iter->TEST_CurrentEntrySize();
+
+      double semi_acc_read_amp =
+          static_cast<double>(read_bytes) / rawblock.size();
+      double read_amp = static_cast<double>(stats->getTickerCount(
+                            READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+                        stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+      // Error in read amplification will be less than 1% if we are reading
+      // sequentially
+      double error_pct = fabs(semi_acc_read_amp - read_amp) * 100;
+      EXPECT_LT(error_pct, 1);
+    }
+    delete iter;
+  }
+
+  // Read the block randomly
+  {
+    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kBytesPerBit, stats.get());
+
+    size_t read_bytes = 0;
+    DataBlockIter *iter = reader.NewDataIterator(
+        options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get());
+    std::unordered_set<int> read_keys;
+    for (int i = 0; i < num_records; i++) {
+      int index = rnd.Uniform(num_records);
+      Slice k(keys[index]);
+
+      iter->Seek(k);
+      iter->value();
+      if (read_keys.find(index) == read_keys.end()) {
+        read_keys.insert(index);
+        read_bytes += iter->TEST_CurrentEntrySize();
+      }
+
+      double semi_acc_read_amp =
+          static_cast<double>(read_bytes) / rawblock.size();
+      double read_amp = static_cast<double>(stats->getTickerCount(
+                            READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+                        stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+      double error_pct = fabs(semi_acc_read_amp - read_amp) * 100;
+      // Error in read amplification will be less than 2% if we are reading
+      // randomly
+      EXPECT_LT(error_pct, 2);
+    }
+    delete iter;
+  }
+}
+
+TEST_F(BlockTest, ReadAmpBitmapPow2) {
+  std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32u);
+
+  ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32u);
+  ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u);
+}
+
+class IndexBlockTest
+    : public testing::Test,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  IndexBlockTest() = default;
+
+  bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); }
+  bool includeFirstKey() const { return std::get<1>(GetParam()); }
+};
+
+// Similar to GenerateRandomKVs but for index block contents.
+void GenerateRandomIndexEntries(std::vector<std::string> *separators,
+                                std::vector<BlockHandle> *block_handles,
+                                std::vector<std::string> *first_keys,
+                                const int len) {
+  Random rnd(42);
+
+  // For each of `len` blocks, we need to generate a first and last key.
+  // Let's generate n*2 random keys, sort them, group into consecutive pairs.
+  std::set<std::string> keys;
+  while ((int)keys.size() < len * 2) {
+    // Keys need to be at least 8 bytes long to look like internal keys.
+    keys.insert(test::RandomKey(&rnd, 12));
+  }
+
+  uint64_t offset = 0;
+  for (auto it = keys.begin(); it != keys.end();) {
+    first_keys->emplace_back(*it++);
+    separators->emplace_back(*it++);
+    uint64_t size = rnd.Uniform(1024 * 16);
+    BlockHandle handle(offset, size);
+    offset += size + BlockBasedTable::kBlockTrailerSize;
+    block_handles->emplace_back(handle);
+  }
+}
+
+TEST_P(IndexBlockTest, IndexValueEncodingTest) {
+  Random rnd(301);
+  Options options = Options();
+
+  std::vector<std::string> separators;
+  std::vector<BlockHandle> block_handles;
+  std::vector<std::string> first_keys;
+  const bool kUseDeltaEncoding = true;
+  BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding());
+  int num_records = 100;
+
+  GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
+                             num_records);
+  BlockHandle last_encoded_handle;
+  for (int i = 0; i < num_records; i++) {
+    IndexValue entry(block_handles[i], first_keys[i]);
+    std::string encoded_entry;
+    std::string delta_encoded_entry;
+    entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr);
+    if (useValueDeltaEncoding() && i > 0) {
+      entry.EncodeTo(&delta_encoded_entry, includeFirstKey(),
+                     &last_encoded_handle);
+    }
+    last_encoded_handle = entry.handle;
+    const Slice delta_encoded_entry_slice(delta_encoded_entry);
+    builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents));
+
+  const bool kTotalOrderSeek = true;
+  const bool kIncludesSeq = true;
+  const bool kValueIsFull = !useValueDeltaEncoding();
+  IndexBlockIter *kNullIter = nullptr;
+  Statistics *kNullStats = nullptr;
+  // read contents of block sequentially
+  InternalIteratorBase<IndexValue> *iter = reader.NewIndexIterator(
+      options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats,
+      kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull);
+  iter->SeekToFirst();
+  for (int index = 0; index < num_records; ++index) {
+    ASSERT_TRUE(iter->Valid());
+
+    Slice k = iter->key();
+    IndexValue v = iter->value();
+
+    EXPECT_EQ(separators[index], k.ToString());
+    EXPECT_EQ(block_handles[index].offset(), v.handle.offset());
+    EXPECT_EQ(block_handles[index].size(), v.handle.size());
+    EXPECT_EQ(includeFirstKey() ? first_keys[index] : "",
+              v.first_internal_key.ToString());
+
+    iter->Next();
+  }
+  delete iter;
+
+  // read block contents randomly
+  iter = reader.NewIndexIterator(
+      options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats,
+      kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull);
+  for (int i = 0; i < num_records * 2; i++) {
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    Slice k(separators[index]);
+
+    // search in block for this key
+    iter->Seek(k);
+    ASSERT_TRUE(iter->Valid());
+    IndexValue v = iter->value();
+    EXPECT_EQ(separators[index], iter->key().ToString());
+    EXPECT_EQ(block_handles[index].offset(), v.handle.offset());
+    EXPECT_EQ(block_handles[index].size(), v.handle.size());
+    EXPECT_EQ(includeFirstKey() ? first_keys[index] : "",
+              v.first_internal_key.ToString());
+  }
+  delete iter;
+}
+
+INSTANTIATE_TEST_CASE_P(P, IndexBlockTest,
+                        ::testing::Values(std::make_tuple(false, false),
+                                          std::make_tuple(false, true),
+                                          std::make_tuple(true, false),
+                                          std::make_tuple(true, true)));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char **argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/block_type.h b/src/rocksdb/table/block_based/block_type.h
new file mode 100644
index 000000000..a9d6a1a77
--- /dev/null
+++ b/src/rocksdb/table/block_based/block_type.h
@@ -0,0 +1,34 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Represents the types of blocks used in the block based table format.
+// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
+// for details.
+// For code sanity, BlockType should imply a specific TBlocklike for
+// BlocklikeTraits.
+enum class BlockType : uint8_t {
+  kData,
+  kFilter,  // for second level partitioned filters and full filters
+  kFilterPartitionIndex,  // for top-level index of filter partitions
+  kProperties,
+  kCompressionDictionary,
+  kRangeDeletion,
+  kHashIndexPrefixes,
+  kHashIndexMetadata,
+  kMetaIndex,
+  kIndex,
+  // Note: keep kInvalid the last value when adding new enum values.
+  kInvalid
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/cachable_entry.h b/src/rocksdb/table/block_based/cachable_entry.h
new file mode 100644
index 000000000..ad8acb18d
--- /dev/null
+++ b/src/rocksdb/table/block_based/cachable_entry.h
@@ -0,0 +1,232 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <cassert>
+
+#include "port/likely.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/cleanable.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// CachableEntry is a handle to an object that may or may not be in the block
+// cache. It is used in a variety of ways:
+//
+// 1) It may refer to an object in the block cache. In this case, cache_ and
+// cache_handle_ are not nullptr, and the cache handle has to be released when
+// the CachableEntry is destroyed (the lifecycle of the cached object, on the
+// other hand, is managed by the cache itself).
+// 2) It may uniquely own the (non-cached) object it refers to (examples include
+// a block read directly from file, or uncompressed blocks when there is a
+// compressed block cache but no uncompressed block cache). In such cases, the
+// object has to be destroyed when the CachableEntry is destroyed.
+// 3) It may point to an object (cached or not) without owning it. In this case,
+// no action is needed when the CachableEntry is destroyed.
+// 4) Sometimes, management of a cached or owned object (see #1 and #2 above)
+// is transferred to some other object. This is used for instance with iterators
+// (where cleanup is performed using a chain of cleanup functions,
+// see Cleanable).
+//
+// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not
+// allowed); hence, this is a move-only type, where a move transfers the
+// management responsibilities, and leaves the source object in an empty state.
+
+template <class T>
+class CachableEntry {
+ public:
+  CachableEntry() = default;
+
+  CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle,
+                bool own_value)
+      : value_(value),
+        cache_(cache),
+        cache_handle_(cache_handle),
+        own_value_(own_value) {
+    assert(value_ != nullptr ||
+           (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+    assert(!!cache_ == !!cache_handle_);
+    assert(!cache_handle_ || !own_value_);
+  }
+
+  CachableEntry(const CachableEntry&) = delete;
+  CachableEntry& operator=(const CachableEntry&) = delete;
+
+  CachableEntry(CachableEntry&& rhs) noexcept
+      : value_(rhs.value_),
+        cache_(rhs.cache_),
+        cache_handle_(rhs.cache_handle_),
+        own_value_(rhs.own_value_) {
+    assert(value_ != nullptr ||
+           (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+    assert(!!cache_ == !!cache_handle_);
+    assert(!cache_handle_ || !own_value_);
+
+    rhs.ResetFields();
+  }
+
+  CachableEntry& operator=(CachableEntry&& rhs) noexcept {
+    if (UNLIKELY(this == &rhs)) {
+      return *this;
+    }
+
+    ReleaseResource();
+
+    value_ = rhs.value_;
+    cache_ = rhs.cache_;
+    cache_handle_ = rhs.cache_handle_;
+    own_value_ = rhs.own_value_;
+
+    assert(value_ != nullptr ||
+           (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+    assert(!!cache_ == !!cache_handle_);
+    assert(!cache_handle_ || !own_value_);
+
+    rhs.ResetFields();
+
+    return *this;
+  }
+
+  ~CachableEntry() { ReleaseResource(); }
+
+  bool IsEmpty() const {
+    return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr &&
+           !own_value_;
+  }
+
+  bool IsCached() const {
+    assert(!!cache_ == !!cache_handle_);
+
+    return cache_handle_ != nullptr;
+  }
+
+  T* GetValue() const { return value_; }
+  Cache* GetCache() const { return cache_; }
+  Cache::Handle* GetCacheHandle() const { return cache_handle_; }
+  bool GetOwnValue() const { return own_value_; }
+
+  void Reset() {
+    ReleaseResource();
+    ResetFields();
+  }
+
+  void TransferTo(Cleanable* cleanable) {
+    if (cleanable) {
+      if (cache_handle_ != nullptr) {
+        assert(cache_ != nullptr);
+        cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_);
+      } else if (own_value_) {
+        cleanable->RegisterCleanup(&DeleteValue, value_, nullptr);
+      }
+    }
+
+    ResetFields();
+  }
+
+  void SetOwnedValue(std::unique_ptr<T>&& value) {
+    assert(value.get() != nullptr);
+
+    if (UNLIKELY(value_ == value.get() && own_value_)) {
+      assert(cache_ == nullptr && cache_handle_ == nullptr);
+      return;
+    }
+
+    Reset();
+
+    value_ = value.release();
+    own_value_ = true;
+  }
+
+  void SetUnownedValue(T* value) {
+    assert(value != nullptr);
+
+    if (UNLIKELY(value_ == value && cache_ == nullptr &&
+                 cache_handle_ == nullptr && !own_value_)) {
+      return;
+    }
+
+    Reset();
+
+    value_ = value;
+    assert(!own_value_);
+  }
+
+  void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) {
+    assert(cache != nullptr);
+    assert(cache_handle != nullptr);
+
+    if (UNLIKELY(value_ == value && cache_ == cache &&
+                 cache_handle_ == cache_handle && !own_value_)) {
+      return;
+    }
+
+    Reset();
+
+    value_ = value;
+    cache_ = cache;
+    cache_handle_ = cache_handle;
+    assert(!own_value_);
+  }
+
+  void UpdateCachedValue() {
+    assert(cache_ != nullptr);
+    assert(cache_handle_ != nullptr);
+
+    value_ = static_cast<T*>(cache_->Value(cache_handle_));
+  }
+
+  bool IsReady() {
+    if (!own_value_) {
+      assert(cache_ != nullptr);
+      assert(cache_handle_ != nullptr);
+      return cache_->IsReady(cache_handle_);
+    }
+    return true;
+  }
+
+ private:
+  void ReleaseResource() noexcept {
+    if (LIKELY(cache_handle_ != nullptr)) {
+      assert(cache_ != nullptr);
+      cache_->Release(cache_handle_);
+    } else if (own_value_) {
+      delete value_;
+    }
+  }
+
+  void ResetFields() noexcept {
+    value_ = nullptr;
+    cache_ = nullptr;
+    cache_handle_ = nullptr;
+    own_value_ = false;
+  }
+
+  static void ReleaseCacheHandle(void* arg1, void* arg2) {
+    Cache* const cache = static_cast<Cache*>(arg1);
+    assert(cache);
+
+    Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
+    assert(cache_handle);
+
+    cache->Release(cache_handle);
+  }
+
+  static void DeleteValue(void* arg1, void* /* arg2 */) {
+    delete static_cast<T*>(arg1);
+  }
+
+ private:
+  T* value_ = nullptr;
+  Cache* cache_ = nullptr;
+  Cache::Handle* cache_handle_ = nullptr;
+  bool own_value_ = false;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_footer.cc b/src/rocksdb/table/block_based/data_block_footer.cc
new file mode 100644
index 000000000..5d5d8ed55
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_footer.cc
@@ -0,0 +1,59 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/data_block_footer.h"
+
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const int kDataBlockIndexTypeBitShift = 31;
+
+// 0x7FFFFFFF
+const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u;
+
+// 0x7FFFFFFF
+const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u;
+
+uint32_t PackIndexTypeAndNumRestarts(
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    uint32_t num_restarts) {
+  if (num_restarts > kMaxNumRestarts) {
+    assert(0);  // mute travis "unused" warning
+  }
+
+  uint32_t block_footer = num_restarts;
+  if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) {
+    block_footer |= 1u << kDataBlockIndexTypeBitShift;
+  } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) {
+    assert(0);
+  }
+
+  return block_footer;
+}
+
+void UnPackIndexTypeAndNumRestarts(
+    uint32_t block_footer,
+    BlockBasedTableOptions::DataBlockIndexType* index_type,
+    uint32_t* num_restarts) {
+  if (index_type) {
+    if (block_footer & 1u << kDataBlockIndexTypeBitShift) {
+      *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+    } else {
+      *index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
+    }
+  }
+
+  if (num_restarts) {
+    *num_restarts = block_footer & kNumRestartsMask;
+    assert(*num_restarts <= kMaxNumRestarts);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_footer.h b/src/rocksdb/table/block_based/data_block_footer.h
new file mode 100644
index 000000000..c1cfd4730
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_footer.h
@@ -0,0 +1,25 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint32_t PackIndexTypeAndNumRestarts(
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    uint32_t num_restarts);
+
+void UnPackIndexTypeAndNumRestarts(
+    uint32_t block_footer,
+    BlockBasedTableOptions::DataBlockIndexType* index_type,
+    uint32_t* num_restarts);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index.cc b/src/rocksdb/table/block_based/data_block_hash_index.cc
new file mode 100644
index 000000000..c579dcc43
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include "table/block_based/data_block_hash_index.h"
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DataBlockHashIndexBuilder::Add(const Slice& key,
+                                    const size_t restart_index) {
+  assert(Valid());
+  if (restart_index > kMaxRestartSupportedByHashIndex) {
+    valid_ = false;
+    return;
+  }
+
+  uint32_t hash_value = GetSliceHash(key);
+  hash_and_restart_pairs_.emplace_back(hash_value,
+                                       static_cast<uint8_t>(restart_index));
+  estimated_num_buckets_ += bucket_per_key_;
+}
+
+void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
+  assert(Valid());
+  uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_);
+
+  if (num_buckets == 0) {
+    num_buckets = 1;  // sanity check
+  }
+
+  // The build-in hash cannot well distribute strings when into different
+  // buckets when num_buckets is power of two, resulting in high hash
+  // collision.
+  // We made the num_buckets to be odd to avoid this issue.
+  num_buckets |= 1;
+
+  std::vector<uint8_t> buckets(num_buckets, kNoEntry);
+  // write the restart_index array
+  for (auto& entry : hash_and_restart_pairs_) {
+    uint32_t hash_value = entry.first;
+    uint8_t restart_index = entry.second;
+    uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets);
+    if (buckets[buck_idx] == kNoEntry) {
+      buckets[buck_idx] = restart_index;
+    } else if (buckets[buck_idx] != restart_index) {
+      // same bucket cannot store two different restart_index, mark collision
+      buckets[buck_idx] = kCollision;
+    }
+  }
+
+  for (uint8_t restart_index : buckets) {
+    buffer.append(
+        const_cast<const char*>(reinterpret_cast<char*>(&restart_index)),
+        sizeof(restart_index));
+  }
+
+  // write NUM_BUCK
+  PutFixed16(&buffer, num_buckets);
+
+  assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex);
+}
+
+void DataBlockHashIndexBuilder::Reset() {
+  estimated_num_buckets_ = 0;
+  valid_ = true;
+  hash_and_restart_pairs_.clear();
+}
+
+void DataBlockHashIndex::Initialize(const char* data, uint16_t size,
+                                    uint16_t* map_offset) {
+  assert(size >= sizeof(uint16_t));  // NUM_BUCKETS
+  num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t));
+  assert(num_buckets_ > 0);
+  assert(size > num_buckets_ * sizeof(uint8_t));
+  *map_offset = static_cast<uint16_t>(size - sizeof(uint16_t) -
+                                      num_buckets_ * sizeof(uint8_t));
+}
+
+uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset,
+                                   const Slice& key) const {
+  uint32_t hash_value = GetSliceHash(key);
+  uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_);
+  const char* bucket_table = data + map_offset;
+  return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t)));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index.h b/src/rocksdb/table/block_based/data_block_hash_index.h
new file mode 100644
index 000000000..321522175
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index.h
@@ -0,0 +1,137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+// This is an experimental feature aiming to reduce the CPU utilization of
+// point-lookup within a data-block. It is only used in data blocks, and not
+// in meta-data blocks or per-table index blocks.
+//
+// It only used to support BlockBasedTable::Get().
+//
+// A serialized hash index is appended to the data-block. The new block data
+// format is as follows:
+//
+// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
+//
+// RI:       Restart Interval (the same as the default data-block format)
+// RI_IDX:   Restart Interval index (the same as the default data-block format)
+// HASH_IDX: The new data-block hash index feature.
+// FOOTER:   A 32bit block footer, which is the NUM_RESTARTS with the MSB as
+//           the flag indicating if this hash index is in use. Note that
+//           given a data block < 32KB, the MSB is never used. So we can
+//           borrow the MSB as the hash index flag. Therefore, this format is
+//           compatible with the legacy data-blocks with num_restarts < 32768,
+//           as the MSB is 0.
+//
+// The format of the data-block hash index is as follows:
+//
+// HASH_IDX: [B B B ... B NUM_BUCK]
+//
+// B:         bucket, an array of restart index. Each buckets is uint8_t.
+// NUM_BUCK:  Number of buckets, which is the length of the bucket array.
+//
+// We reserve two special flag:
+//    kNoEntry=255,
+//    kCollision=254.
+//
+// Therefore, the max number of restarts this hash index can supoport is 253.
+//
+// Buckets are initialized to be kNoEntry.
+//
+// When storing a key in the hash index, the key is first hashed to a bucket.
+// If there the bucket is empty (kNoEntry), the restart index is stored in
+// the bucket. If there is already a restart index there, we will update the
+// existing restart index to a collision marker (kCollision). If the
+// the bucket is already marked as collision, we do not store the restart
+// index either.
+//
+// During query process, a key is first hashed to a bucket. Then we examine if
+// the buckets store nothing (kNoEntry) or the bucket had a collision
+// (kCollision). If either of those happens, we get the restart index of
+// the key and will directly go to the restart interval to search the key.
+//
+// Note that we only support blocks with #restart_interval < 254. If a block
+// has more restart interval than that, hash index will not be create for it.
+
+const uint8_t kNoEntry = 255;
+const uint8_t kCollision = 254;
+const uint8_t kMaxRestartSupportedByHashIndex = 253;
+
+// Because we use uint16_t address, we only support block no more than 64KB
+const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
+const double kDefaultUtilRatio = 0.75;
+
+class DataBlockHashIndexBuilder {
+ public:
+  DataBlockHashIndexBuilder()
+      : bucket_per_key_(-1 /*uninitialized marker*/),
+        estimated_num_buckets_(0),
+        valid_(false) {}
+
+  void Initialize(double util_ratio) {
+    if (util_ratio <= 0) {
+      util_ratio = kDefaultUtilRatio;  // sanity check
+    }
+    bucket_per_key_ = 1 / util_ratio;
+    valid_ = true;
+  }
+
+  inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
+  void Add(const Slice& key, const size_t restart_index);
+  void Finish(std::string& buffer);
+  void Reset();
+  inline size_t EstimateSize() const {
+    uint16_t estimated_num_buckets =
+        static_cast<uint16_t>(estimated_num_buckets_);
+
+    // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish.
+    estimated_num_buckets |= 1;
+
+    return sizeof(uint16_t) +
+           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
+  }
+
+ private:
+  double bucket_per_key_;  // is the multiplicative inverse of util_ratio_
+  double estimated_num_buckets_;
+
+  // Now the only usage for `valid_` is to mark false when the inserted
+  // restart_index is larger than supported. In this case HashIndex is not
+  // appended to the block content.
+  bool valid_;
+
+  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
+  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
+};
+
+class DataBlockHashIndex {
+ public:
+  DataBlockHashIndex() : num_buckets_(0) {}
+
+  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
+
+  uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;
+
+  inline bool Valid() { return num_buckets_ != 0; }
+
+ private:
+  // To make the serialized hash index compact and to save the space overhead,
+  // here all the data fields persisted in the block are in uint16 format.
+  // We find that a uint16 is large enough to index every offset of a 64KiB
+  // block.
+  // So in other words, DataBlockHashIndex does not support block size equal
+  // or greater then 64KiB.
+  uint16_t num_buckets_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/data_block_hash_index_test.cc b/src/rocksdb/table/block_based/data_block_hash_index_test.cc
new file mode 100644
index 000000000..cd2e30833
--- /dev/null
+++ b/src/rocksdb/table/block_based/data_block_hash_index_test.cc
@@ -0,0 +1,717 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/data_block_hash_index.h"
+
+#include <cstdlib>
+#include <string>
+#include <unordered_map>
+
+#include "db/table_properties_collector.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/get_context.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool SearchForOffset(DataBlockHashIndex& index, const char* data,
+                     uint16_t map_offset, const Slice& key,
+                     uint8_t& restart_point) {
+  uint8_t entry = index.Lookup(data, map_offset, key);
+  if (entry == kCollision) {
+    return true;
+  }
+
+  if (entry == kNoEntry) {
+    return false;
+  }
+
+  return entry == restart_point;
+}
+
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random* rnd) {
+  char buf[50];
+  char* p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += rnd->RandomString(padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key value pairs.
+// The generated key will be sorted. You can tune the parameters to generated
+// different kinds of test key/value pairs for different scenario.
+void GenerateRandomKVs(std::vector<std::string>* keys,
+                       std::vector<std::string>* values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefix
+  for (int i = from; i < from + len; i += step) {
+    // generating keys that shares the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+      // 100 bytes values
+      values->emplace_back(rnd.RandomString(100));
+    }
+  }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTestSmall) {
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+  for (int j = 0; j < 5; j++) {
+    for (uint8_t i = 0; i < 2 + j; i++) {
+      std::string key("key" + std::to_string(i));
+      uint8_t restart_point = i;
+      builder.Add(key, restart_point);
+    }
+
+    size_t estimated_size = builder.EstimateSize();
+
+    std::string buffer("fake"), buffer2;
+    size_t original_size = buffer.size();
+    estimated_size += original_size;
+    builder.Finish(buffer);
+
+    ASSERT_EQ(buffer.size(), estimated_size);
+
+    buffer2 = buffer;  // test for the correctness of relative offset
+
+    Slice s(buffer2);
+    DataBlockHashIndex index;
+    uint16_t map_offset;
+    index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+    // the additional hash map should start at the end of the buffer
+    ASSERT_EQ(original_size, map_offset);
+    for (uint8_t i = 0; i < 2; i++) {
+      std::string key("key" + std::to_string(i));
+      uint8_t restart_point = i;
+      ASSERT_TRUE(
+          SearchForOffset(index, s.data(), map_offset, key, restart_point));
+    }
+    builder.Reset();
+  }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTest) {
+  // bucket_num = 200, #keys = 100. 50% utilization
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key("key" + std::to_string(i));
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+
+  size_t estimated_size = builder.EstimateSize();
+
+  std::string buffer("fake content"), buffer2;
+  size_t original_size = buffer.size();
+  estimated_size += original_size;
+  builder.Finish(buffer);
+
+  ASSERT_EQ(buffer.size(), estimated_size);
+
+  buffer2 = buffer;  // test for the correctness of relative offset
+
+  Slice s(buffer2);
+  DataBlockHashIndex index;
+  uint16_t map_offset;
+  index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+  // the additional hash map should start at the end of the buffer
+  ASSERT_EQ(original_size, map_offset);
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key("key" + std::to_string(i));
+    uint8_t restart_point = i;
+    ASSERT_TRUE(
+        SearchForOffset(index, s.data(), map_offset, key, restart_point));
+  }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTestCollision) {
+  // bucket_num = 2. There will be intense hash collisions
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key("key" + std::to_string(i));
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+
+  size_t estimated_size = builder.EstimateSize();
+
+  std::string buffer("some other fake content to take up space"), buffer2;
+  size_t original_size = buffer.size();
+  estimated_size += original_size;
+  builder.Finish(buffer);
+
+  ASSERT_EQ(buffer.size(), estimated_size);
+
+  buffer2 = buffer;  // test for the correctness of relative offset
+
+  Slice s(buffer2);
+  DataBlockHashIndex index;
+  uint16_t map_offset;
+  index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+  // the additional hash map should start at the end of the buffer
+  ASSERT_EQ(original_size, map_offset);
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key("key" + std::to_string(i));
+    uint8_t restart_point = i;
+    ASSERT_TRUE(
+        SearchForOffset(index, s.data(), map_offset, key, restart_point));
+  }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTestLarge) {
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+  std::unordered_map<std::string, uint8_t> m;
+
+  for (uint8_t i = 0; i < 100; i++) {
+    if (i % 2) {
+      continue;  // leave half of the keys out
+    }
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+    m[key] = restart_point;
+  }
+
+  size_t estimated_size = builder.EstimateSize();
+
+  std::string buffer("filling stuff"), buffer2;
+  size_t original_size = buffer.size();
+  estimated_size += original_size;
+  builder.Finish(buffer);
+
+  ASSERT_EQ(buffer.size(), estimated_size);
+
+  buffer2 = buffer;  // test for the correctness of relative offset
+
+  Slice s(buffer2);
+  DataBlockHashIndex index;
+  uint16_t map_offset;
+  index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+  // the additional hash map should start at the end of the buffer
+  ASSERT_EQ(original_size, map_offset);
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    if (m.count(key)) {
+      ASSERT_TRUE(m[key] == restart_point);
+      ASSERT_TRUE(
+          SearchForOffset(index, s.data(), map_offset, key, restart_point));
+    } else {
+      // we allow false positve, so don't test the nonexisting keys.
+      // when false positive happens, the search will continue to the
+      // restart intervals to see if the key really exist.
+    }
+  }
+}
+
+TEST(DataBlockHashIndex, RestartIndexExceedMax) {
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+  std::unordered_map<std::string, uint8_t> m;
+
+  for (uint8_t i = 0; i <= 253; i++) {
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+  ASSERT_TRUE(builder.Valid());
+
+  builder.Reset();
+
+  for (uint8_t i = 0; i <= 254; i++) {
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+
+  ASSERT_FALSE(builder.Valid());
+
+  builder.Reset();
+  ASSERT_TRUE(builder.Valid());
+}
+
+TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) {
+  Options options = Options();
+
+  BlockBuilder builder(1 /* block_restart_interval */,
+                       true /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  // #restarts <= 253. HashIndex is valid
+  for (int i = 0; i <= 253; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents));
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+  builder.Reset();
+
+  // #restarts > 253. HashIndex is not used
+  for (int i = 0; i <= 254; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents));
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinarySearch);
+  }
+}
+
+TEST(DataBlockHashIndex, BlockSizeExceedMax) {
+  Options options = Options();
+  std::string ukey(10, 'k');
+  InternalKey ikey(ukey, 0, kTypeValue);
+
+  BlockBuilder builder(1 /* block_restart_interval */,
+                       false /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  {
+    // insert a large value. The block size plus HashIndex is 65536.
+    std::string value(65502, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents));
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+  builder.Reset();
+
+  {
+    // insert a large value. The block size plus HashIndex would be 65537.
+    // This excceed the max block size supported by HashIndex (65536).
+    // So when build finishes HashIndex will not be created for the block.
+    std::string value(65503, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents));
+
+    // the index type have fallen back to binary when build finish.
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinarySearch);
+  }
+}
+
+TEST(DataBlockHashIndex, BlockTestSingleKey) {
+  Options options = Options();
+
+  BlockBuilder builder(16 /* block_restart_interval */,
+                       true /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  std::string ukey("gopher");
+  std::string value("gold");
+  InternalKey ikey(ukey, 10, kTypeValue);
+  builder.Add(ikey.Encode().ToString(), value /*value*/);
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents));
+
+  const InternalKeyComparator icmp(BytewiseComparator());
+  auto iter = reader.NewDataIterator(icmp.user_comparator(),
+                                     kDisableGlobalSequenceNumber);
+  bool may_exist;
+  // search in block for the key just inserted
+  {
+    InternalKey seek_ikey(ukey, 10, kValueTypeForSeek);
+    may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(
+        options.comparator->Compare(iter->key(), ikey.Encode().ToString()), 0);
+    ASSERT_EQ(iter->value(), value);
+  }
+
+  // search in block for the existing ukey, but with higher seqno
+  {
+    InternalKey seek_ikey(ukey, 20, kValueTypeForSeek);
+
+    // HashIndex should be able to set the iter correctly
+    may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+
+    // user key should match
+    ASSERT_EQ(options.comparator->Compare(ExtractUserKey(iter->key()), ukey),
+              0);
+
+    // seek_key seqno number should be greater than that of iter result
+    ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()),
+              GetInternalKeySeqno(iter->key()));
+
+    ASSERT_EQ(iter->value(), value);
+  }
+
+  // Search in block for the existing ukey, but with lower seqno
+  // in this case, hash can find the only occurrence of the user_key, but
+  // ParseNextDataKey() will skip it as it does not have a older seqno.
+  // In this case, GetForSeek() is effective to locate the user_key, and
+  // iter->Valid() == false indicates that we've reached to the end of
+  // the block and the caller should continue searching the next block.
+  {
+    InternalKey seek_ikey(ukey, 5, kValueTypeForSeek);
+    may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_FALSE(iter->Valid());  // should have reached to the end of block
+  }
+
+  delete iter;
+}
+
+TEST(DataBlockHashIndex, BlockTestLarge) {
+  Random rnd(1019);
+  Options options = Options();
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  BlockBuilder builder(16 /* block_restart_interval */,
+                       true /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  int num_records = 500;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records);
+
+  // Generate keys. Adding a trailing "1" to indicate existent keys.
+  // Later will Seeking for keys with a trailing "0" to test seeking
+  // non-existent keys.
+  for (int i = 0; i < num_records; i++) {
+    std::string ukey(keys[i] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents));
+  const InternalKeyComparator icmp(BytewiseComparator());
+
+  // random seek existent keys
+  for (int i = 0; i < num_records; i++) {
+    auto iter = reader.NewDataIterator(icmp.user_comparator(),
+                                       kDisableGlobalSequenceNumber);
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    std::string ukey(keys[index] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+
+    // search in block for this key
+    bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(values[index], iter->value());
+
+    delete iter;
+  }
+
+  // random seek non-existent user keys
+  // In this case A), the user_key cannot be found in HashIndex. The key may
+  // exist in the next block. So the iter is set invalidated to tell the
+  // caller to search the next block. This test case belongs to this case A).
+  //
+  // Note that for non-existent keys, there is possibility of false positive,
+  // i.e. the key is still hashed into some restart interval.
+  // Two additional possible outcome:
+  // B) linear seek the restart interval and not found, the iter stops at the
+  //    starting of the next restart interval. The key does not exist
+  //    anywhere.
+  // C) linear seek the restart interval and not found, the iter stops at the
+  //    the end of the block, i.e. restarts_. The key may exist in the next
+  //    block.
+  // So these combinations are possible when searching non-existent user_key:
+  //
+  // case#    may_exist  iter->Valid()
+  //     A         true          false
+  //     B        false           true
+  //     C         true          false
+
+  for (int i = 0; i < num_records; i++) {
+    auto iter = reader.NewDataIterator(icmp.user_comparator(),
+                                       kDisableGlobalSequenceNumber);
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    std::string ukey(keys[index] + "0" /* non-existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+
+    // search in block for this key
+    bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+    if (!may_exist) {
+      ASSERT_TRUE(iter->Valid());
+    }
+    if (!iter->Valid()) {
+      ASSERT_TRUE(may_exist);
+    }
+
+    delete iter;
+  }
+}
+
+// helper routine for DataBlockHashIndex.BlockBoundary
+void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2,
+                  std::string& v2, InternalKey& seek_ikey,
+                  GetContext& get_context, Options& options) {
+  std::unique_ptr<WritableFileWriter> file_writer;
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  std::unique_ptr<TableReader> table_reader;
+  int level_ = -1;
+
+  std::vector<std::string> keys;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  const InternalKeyComparator internal_comparator(options.comparator);
+
+  EnvOptions soptions;
+
+  soptions.use_mmap_reads = ioptions.allow_mmap_reads;
+  test::StringSink* sink = new test::StringSink();
+  std::unique_ptr<FSWritableFile> f(sink);
+  file_writer.reset(
+      new WritableFileWriter(std::move(f), "" /* don't care */, FileOptions()));
+  std::unique_ptr<TableBuilder> builder;
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  std::string column_family_name;
+  builder.reset(ioptions.table_factory->NewTableBuilder(
+      TableBuilderOptions(
+          ioptions, moptions, internal_comparator,
+          &int_tbl_prop_collector_factories, options.compression,
+          CompressionOptions(),
+          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+          column_family_name, level_),
+      file_writer.get()));
+
+  builder->Add(ik1.Encode().ToString(), v1);
+  builder->Add(ik2.Encode().ToString(), v2);
+  EXPECT_TRUE(builder->status().ok());
+
+  Status s = builder->Finish();
+  ASSERT_OK(file_writer->Flush());
+  EXPECT_TRUE(s.ok()) << s.ToString();
+
+  EXPECT_EQ(sink->contents().size(), builder->FileSize());
+
+  // Open the table
+  test::StringSource* source = new test::StringSource(
+      sink->contents(), 0 /*uniq_id*/, ioptions.allow_mmap_reads);
+  std::unique_ptr<FSRandomAccessFile> file(source);
+  file_reader.reset(new RandomAccessFileReader(std::move(file), "test"));
+  const bool kSkipFilters = true;
+  const bool kImmortal = true;
+  ASSERT_OK(ioptions.table_factory->NewTableReader(
+      TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
+                         internal_comparator, !kSkipFilters, !kImmortal,
+                         level_),
+      std::move(file_reader), sink->contents().size(), &table_reader));
+  // Search using Get()
+  ReadOptions ro;
+
+  ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context,
+                              moptions.prefix_extractor.get()));
+}
+
+TEST(DataBlockHashIndex, BlockBoundary) {
+  BlockBasedTableOptions table_options;
+  table_options.data_block_index_type =
+      BlockBasedTableOptions::kDataBlockBinaryAndHash;
+  table_options.block_restart_interval = 1;
+  table_options.block_size = 4096;
+
+  Options options;
+  options.comparator = BytewiseComparator();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  // insert two large k/v pair. Given that the block_size is 4096, one k/v
+  // pair will take up one block.
+  // [    k1/v1   ][    k2/v2  ]
+  // [   Block N  ][ Block N+1 ]
+
+  {
+    // [ "aab"@100 ][ "axy"@10  ]
+    // | Block  N  ][ Block N+1 ]
+    // seek for "axy"@60
+    std::string uk1("aab");
+    InternalKey ik1(uk1, 100, kTypeValue);
+    std::string v1(4100, '1');  // large value
+
+    std::string uk2("axy");
+    InternalKey ik2(uk2, 10, kTypeValue);
+    std::string v2(4100, '2');  // large value
+
+    PinnableSlice value;
+    std::string seek_ukey("axy");
+    InternalKey seek_ikey(seek_ukey, 60, kTypeValue);
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, seek_ukey, &value, nullptr,
+                           nullptr, nullptr, true, nullptr, nullptr);
+
+    TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
+    ASSERT_EQ(get_context.State(), GetContext::kFound);
+    ASSERT_EQ(value, v2);
+    value.Reset();
+  }
+
+  {
+    // [ "axy"@100 ][ "axy"@10  ]
+    // | Block  N  ][ Block N+1 ]
+    // seek for "axy"@60
+    std::string uk1("axy");
+    InternalKey ik1(uk1, 100, kTypeValue);
+    std::string v1(4100, '1');  // large value
+
+    std::string uk2("axy");
+    InternalKey ik2(uk2, 10, kTypeValue);
+    std::string v2(4100, '2');  // large value
+
+    PinnableSlice value;
+    std::string seek_ukey("axy");
+    InternalKey seek_ikey(seek_ukey, 60, kTypeValue);
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, seek_ukey, &value, nullptr,
+                           nullptr, nullptr, true, nullptr, nullptr);
+
+    TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
+    ASSERT_EQ(get_context.State(), GetContext::kFound);
+    ASSERT_EQ(value, v2);
+    value.Reset();
+  }
+
+  {
+    // [ "axy"@100 ][ "axy"@10  ]
+    // | Block  N  ][ Block N+1 ]
+    // seek for "axy"@120
+    std::string uk1("axy");
+    InternalKey ik1(uk1, 100, kTypeValue);
+    std::string v1(4100, '1');  // large value
+
+    std::string uk2("axy");
+    InternalKey ik2(uk2, 10, kTypeValue);
+    std::string v2(4100, '2');  // large value
+
+    PinnableSlice value;
+    std::string seek_ukey("axy");
+    InternalKey seek_ikey(seek_ukey, 120, kTypeValue);
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, seek_ukey, &value, nullptr,
+                           nullptr, nullptr, true, nullptr, nullptr);
+
+    TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
+    ASSERT_EQ(get_context.State(), GetContext::kFound);
+    ASSERT_EQ(value, v1);
+    value.Reset();
+  }
+
+  {
+    // [ "axy"@100 ][ "axy"@10  ]
+    // | Block  N  ][ Block N+1 ]
+    // seek for "axy"@5
+    std::string uk1("axy");
+    InternalKey ik1(uk1, 100, kTypeValue);
+    std::string v1(4100, '1');  // large value
+
+    std::string uk2("axy");
+    InternalKey ik2(uk2, 10, kTypeValue);
+    std::string v2(4100, '2');  // large value
+
+    PinnableSlice value;
+    std::string seek_ukey("axy");
+    InternalKey seek_ikey(seek_ukey, 5, kTypeValue);
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, seek_ukey, &value, nullptr,
+                           nullptr, nullptr, true, nullptr, nullptr);
+
+    TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
+    ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+    value.Reset();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/filter_block.h b/src/rocksdb/table/block_based/filter_block.h
new file mode 100644
index 000000000..e1e206990
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_block.h
@@ -0,0 +1,182 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file.  It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/format.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kNotValid = ULLONG_MAX;
+class FilterPolicy;
+
+class GetContext;
+using MultiGetRange = MultiGetContext::Range;
+
+// A FilterBlockBuilder is used to construct all of the filters for a
+// particular Table.  It generates a single string which is stored as
+// a special block in the Table, or partitioned into smaller filters.
+//
+// The sequence of calls to FilterBlockBuilder must match the regexp:
+//      Add* Finish
+class FilterBlockBuilder {
+ public:
+  explicit FilterBlockBuilder() {}
+  // No copying allowed
+  FilterBlockBuilder(const FilterBlockBuilder&) = delete;
+  void operator=(const FilterBlockBuilder&) = delete;
+
+  virtual ~FilterBlockBuilder() {}
+
+  virtual void Add(
+      const Slice& key_without_ts) = 0;  // Add a key to current filter
+  virtual bool IsEmpty() const = 0;      // Empty == none added
+  // For reporting stats on how many entries the builder considered unique
+  virtual size_t EstimateEntriesAdded() = 0;
+  Slice Finish() {  // Generate Filter
+    const BlockHandle empty_handle;
+    Status dont_care_status;
+    auto ret = Finish(empty_handle, &dont_care_status);
+    assert(dont_care_status.ok());
+    return ret;
+  }
+  // If filter_data is not nullptr, Finish() may transfer ownership of
+  // underlying filter data to the caller,  so that it can be freed as soon as
+  // possible. BlockBasedFilterBlock will ignore this parameter.
+  //
+  virtual Slice Finish(
+      const BlockHandle& tmp /* only used in PartitionedFilterBlock as
+                                last_partition_block_handle */
+      ,
+      Status* status, std::unique_ptr<const char[]>* filter_data = nullptr) = 0;
+
+  // This is called when finishes using the FilterBitsBuilder
+  // in order to release memory usage and cache charge
+  // associated with it timely
+  virtual void ResetFilterBitsBuilder() {}
+
+  // To optionally post-verify the filter returned from
+  // FilterBlockBuilder::Finish.
+  // Return Status::OK() if skipped.
+  virtual Status MaybePostVerifyFilter(const Slice& /* filter_content */) {
+    return Status::OK();
+  }
+};
+
+// A FilterBlockReader is used to parse filter from SST table.
+// KeyMayMatch and PrefixMayMatch would trigger filter checking
+//
+// BlockBased/Full FilterBlock would be called in the same way.
+class FilterBlockReader {
+ public:
+  FilterBlockReader() = default;
+  virtual ~FilterBlockReader() = default;
+
+  FilterBlockReader(const FilterBlockReader&) = delete;
+  FilterBlockReader& operator=(const FilterBlockReader&) = delete;
+
+  /**
+   * If no_io is set, then it returns true if it cannot answer the query without
+   * reading data from disk. This is used in PartitionedFilterBlockReader to
+   * avoid reading partitions that are not in block cache already
+   *
+   * Normally filters are built on only the user keys and the InternalKey is not
+   * needed for a query. The index in PartitionedFilterBlockReader however is
+   * built upon InternalKey and must be provided via const_ikey_ptr when running
+   * queries.
+   */
+  virtual bool KeyMayMatch(const Slice& key, const bool no_io,
+                           const Slice* const const_ikey_ptr,
+                           GetContext* get_context,
+                           BlockCacheLookupContext* lookup_context,
+                           Env::IOPriority rate_limiter_priority) = 0;
+
+  virtual void KeysMayMatch(MultiGetRange* range, const bool no_io,
+                            BlockCacheLookupContext* lookup_context,
+                            Env::IOPriority rate_limiter_priority) {
+    for (auto iter = range->begin(); iter != range->end(); ++iter) {
+      const Slice ukey_without_ts = iter->ukey_without_ts;
+      const Slice ikey = iter->ikey;
+      GetContext* const get_context = iter->get_context;
+      if (!KeyMayMatch(ukey_without_ts, no_io, &ikey, get_context,
+                       lookup_context, rate_limiter_priority)) {
+        range->SkipKey(iter);
+      }
+    }
+  }
+
+  /**
+   * no_io and const_ikey_ptr here means the same as in KeyMayMatch
+   */
+  virtual bool PrefixMayMatch(const Slice& prefix, const bool no_io,
+                              const Slice* const const_ikey_ptr,
+                              GetContext* get_context,
+                              BlockCacheLookupContext* lookup_context,
+                              Env::IOPriority rate_limiter_priority) = 0;
+
+  virtual void PrefixesMayMatch(MultiGetRange* range,
+                                const SliceTransform* prefix_extractor,
+                                const bool no_io,
+                                BlockCacheLookupContext* lookup_context,
+                                Env::IOPriority rate_limiter_priority) {
+    for (auto iter = range->begin(); iter != range->end(); ++iter) {
+      const Slice ukey_without_ts = iter->ukey_without_ts;
+      const Slice ikey = iter->ikey;
+      GetContext* const get_context = iter->get_context;
+      if (prefix_extractor->InDomain(ukey_without_ts) &&
+          !PrefixMayMatch(prefix_extractor->Transform(ukey_without_ts), no_io,
+                          &ikey, get_context, lookup_context,
+                          rate_limiter_priority)) {
+        range->SkipKey(iter);
+      }
+    }
+  }
+
+  virtual size_t ApproximateMemoryUsage() const = 0;
+
+  // convert this object to a human readable form
+  virtual std::string ToString() const {
+    std::string error_msg("Unsupported filter \n");
+    return error_msg;
+  }
+
+  virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) {
+    return Status::OK();
+  }
+
+  virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/,
+                             const Slice& user_key_without_ts,
+                             const SliceTransform* prefix_extractor,
+                             const Comparator* /*comparator*/,
+                             const Slice* const const_ikey_ptr,
+                             bool* filter_checked, bool need_upper_bound_check,
+                             bool no_io,
+                             BlockCacheLookupContext* lookup_context,
+                             Env::IOPriority rate_limiter_priority) = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.cc b/src/rocksdb/table/block_based/filter_block_reader_common.cc
new file mode 100644
index 000000000..7dc49e83e
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_block_reader_common.cc
@@ -0,0 +1,164 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "table/block_based/filter_block_reader_common.h"
+
+#include "monitoring/perf_context_imp.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/parsed_full_filter_block.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename TBlocklike>
+Status FilterBlockReaderCommon<TBlocklike>::ReadFilterBlock(
+    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<TBlocklike>* filter_block, BlockType block_type) {
+  PERF_TIMER_GUARD(read_filter_block_nanos);
+
+  assert(table);
+  assert(filter_block);
+  assert(filter_block->IsEmpty());
+
+  const BlockBasedTable::Rep* const rep = table->get_rep();
+  assert(rep);
+
+  const Status s =
+      table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle,
+                           UncompressionDict::GetEmptyDict(), filter_block,
+                           block_type, get_context, lookup_context,
+                           /* for_compaction */ false, use_cache,
+                           /* wait_for_cache */ true, /* async_read */ false);
+
+  return s;
+}
+
+template <typename TBlocklike>
+const SliceTransform*
+FilterBlockReaderCommon<TBlocklike>::table_prefix_extractor() const {
+  assert(table_);
+
+  const BlockBasedTable::Rep* const rep = table_->get_rep();
+  assert(rep);
+
+  return rep->prefix_filtering ? rep->table_prefix_extractor.get() : nullptr;
+}
+
+template <typename TBlocklike>
+bool FilterBlockReaderCommon<TBlocklike>::whole_key_filtering() const {
+  assert(table_);
+  assert(table_->get_rep());
+
+  return table_->get_rep()->whole_key_filtering;
+}
+
+template <typename TBlocklike>
+bool FilterBlockReaderCommon<TBlocklike>::cache_filter_blocks() const {
+  assert(table_);
+  assert(table_->get_rep());
+
+  return table_->get_rep()->table_options.cache_index_and_filter_blocks;
+}
+
+template <typename TBlocklike>
+Status FilterBlockReaderCommon<TBlocklike>::GetOrReadFilterBlock(
+    bool no_io, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<TBlocklike>* filter_block, BlockType block_type,
+    Env::IOPriority rate_limiter_priority) const {
+  assert(filter_block);
+
+  if (!filter_block_.IsEmpty()) {
+    filter_block->SetUnownedValue(filter_block_.GetValue());
+    return Status::OK();
+  }
+
+  ReadOptions read_options;
+  read_options.rate_limiter_priority = rate_limiter_priority;
+  if (no_io) {
+    read_options.read_tier = kBlockCacheTier;
+  }
+
+  return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options,
+                         cache_filter_blocks(), get_context, lookup_context,
+                         filter_block, block_type);
+}
+
+template <typename TBlocklike>
+size_t FilterBlockReaderCommon<TBlocklike>::ApproximateFilterBlockMemoryUsage()
+    const {
+  assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr);
+  return filter_block_.GetOwnValue()
+             ? filter_block_.GetValue()->ApproximateMemoryUsage()
+             : 0;
+}
+
+template <typename TBlocklike>
+bool FilterBlockReaderCommon<TBlocklike>::RangeMayExist(
+    const Slice* iterate_upper_bound, const Slice& user_key_without_ts,
+    const SliceTransform* prefix_extractor, const Comparator* comparator,
+    const Slice* const const_ikey_ptr, bool* filter_checked,
+    bool need_upper_bound_check, bool no_io,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) {
+  if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) {
+    *filter_checked = false;
+    return true;
+  }
+  Slice prefix = prefix_extractor->Transform(user_key_without_ts);
+  if (need_upper_bound_check &&
+      !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) {
+    *filter_checked = false;
+    return true;
+  } else {
+    *filter_checked = true;
+    return PrefixMayMatch(prefix, no_io, const_ikey_ptr,
+                          /* get_context */ nullptr, lookup_context,
+                          rate_limiter_priority);
+  }
+}
+
+template <typename TBlocklike>
+bool FilterBlockReaderCommon<TBlocklike>::IsFilterCompatible(
+    const Slice* iterate_upper_bound, const Slice& prefix,
+    const Comparator* comparator) const {
+  // Try to reuse the bloom filter in the SST table if prefix_extractor in
+  // mutable_cf_options has changed. If range [user_key, upper_bound) all
+  // share the same prefix then we may still be able to use the bloom filter.
+  const SliceTransform* const prefix_extractor = table_prefix_extractor();
+  if (iterate_upper_bound != nullptr && prefix_extractor) {
+    if (!prefix_extractor->InDomain(*iterate_upper_bound)) {
+      return false;
+    }
+    Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound);
+    // first check if user_key and upper_bound all share the same prefix
+    if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform,
+                                            false) != 0) {
+      // second check if user_key's prefix is the immediate predecessor of
+      // upper_bound and have the same length. If so, we know for sure all
+      // keys in the range [user_key, upper_bound) share the same prefix.
+      // Also need to make sure upper_bound are full length to ensure
+      // correctness
+      if (!full_length_enabled_ ||
+          iterate_upper_bound->size() != prefix_extractor_full_length_ ||
+          !comparator->IsSameLengthImmediateSuccessor(prefix,
+                                                      *iterate_upper_bound)) {
+        return false;
+      }
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Explicitly instantiate templates for both "blocklike" types we use.
+// This makes it possible to keep the template definitions in the .cc file.
+template class FilterBlockReaderCommon<Block>;
+template class FilterBlockReaderCommon<ParsedFullFilterBlock>;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/filter_block_reader_common.h b/src/rocksdb/table/block_based/filter_block_reader_common.h
new file mode 100644
index 000000000..ca07f5050
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_block_reader_common.h
@@ -0,0 +1,79 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <cassert>
+
+#include "block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/filter_block.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBasedTable;
+class FilePrefetchBuffer;
+
+// Encapsulates common functionality for the various filter block reader
+// implementations. Provides access to the filter block regardless of whether
+// it is owned by the reader or stored in the cache, or whether it is pinned
+// in the cache or not.
+template <typename TBlocklike>
+class FilterBlockReaderCommon : public FilterBlockReader {
+ public:
+  FilterBlockReaderCommon(const BlockBasedTable* t,
+                          CachableEntry<TBlocklike>&& filter_block)
+      : table_(t), filter_block_(std::move(filter_block)) {
+    assert(table_);
+    const SliceTransform* const prefix_extractor = table_prefix_extractor();
+    if (prefix_extractor) {
+      full_length_enabled_ =
+          prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_);
+    }
+  }
+
+  bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key,
+                     const SliceTransform* prefix_extractor,
+                     const Comparator* comparator,
+                     const Slice* const const_ikey_ptr, bool* filter_checked,
+                     bool need_upper_bound_check, bool no_io,
+                     BlockCacheLookupContext* lookup_context,
+                     Env::IOPriority rate_limiter_priority) override;
+
+ protected:
+  static Status ReadFilterBlock(const BlockBasedTable* table,
+                                FilePrefetchBuffer* prefetch_buffer,
+                                const ReadOptions& read_options, bool use_cache,
+                                GetContext* get_context,
+                                BlockCacheLookupContext* lookup_context,
+                                CachableEntry<TBlocklike>* filter_block,
+                                BlockType block_type);
+
+  const BlockBasedTable* table() const { return table_; }
+  const SliceTransform* table_prefix_extractor() const;
+  bool whole_key_filtering() const;
+  bool cache_filter_blocks() const;
+
+  Status GetOrReadFilterBlock(bool no_io, GetContext* get_context,
+                              BlockCacheLookupContext* lookup_context,
+                              CachableEntry<TBlocklike>* filter_block,
+                              BlockType block_type,
+                              Env::IOPriority rate_limiter_priority) const;
+
+  size_t ApproximateFilterBlockMemoryUsage() const;
+
+ private:
+  bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix,
+                          const Comparator* comparator) const;
+
+ private:
+  const BlockBasedTable* table_;
+  CachableEntry<TBlocklike> filter_block_;
+  size_t prefix_extractor_full_length_ = 0;
+  bool full_length_enabled_ = false;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/filter_policy.cc b/src/rocksdb/table/block_based/filter_policy.cc
new file mode 100644
index 000000000..f84f804dd
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_policy.cc
@@ -0,0 +1,1973 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/filter_policy.h"
+
+#include <array>
+#include <climits>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <memory>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
+#include "logging/logging.h"
+#include "port/lang.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/full_filter_block.h"
+#include "util/bloom_impl.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/ribbon_config.h"
+#include "util/ribbon_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Metadata trailer size for built-in filters. (This is separate from
+// block-based table block trailer.)
+//
+// Originally this was 1 byte for num_probes and 4 bytes for number of
+// cache lines in the Bloom filter, but now the first trailer byte is
+// usually an implementation marker and remaining 4 bytes have various
+// meanings.
+static constexpr uint32_t kMetadataLen = 5;
+
+Slice FinishAlwaysFalse(std::unique_ptr<const char[]>* /*buf*/) {
+  // Missing metadata, treated as zero entries
+  return Slice(nullptr, 0);
+}
+
+Slice FinishAlwaysTrue(std::unique_ptr<const char[]>* /*buf*/) {
+  return Slice("\0\0\0\0\0\0", 6);
+}
+
+// Base class for filter builders using the XXH3 preview hash,
+// also known as Hash64 or GetSliceHash64.
+class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
+ public:
+  explicit XXPH3FilterBitsBuilder(
+      std::atomic<int64_t>* aggregate_rounding_balance,
+      std::shared_ptr<CacheReservationManager> cache_res_mgr,
+      bool detect_filter_construct_corruption)
+      : aggregate_rounding_balance_(aggregate_rounding_balance),
+        cache_res_mgr_(cache_res_mgr),
+        detect_filter_construct_corruption_(
+            detect_filter_construct_corruption) {}
+
+  ~XXPH3FilterBitsBuilder() override {}
+
+  virtual void AddKey(const Slice& key) override {
+    uint64_t hash = GetSliceHash64(key);
+    // Especially with prefixes, it is common to have repetition,
+    // though only adjacent repetition, which we want to immediately
+    // recognize and collapse for estimating true filter space
+    // requirements.
+    if (hash_entries_info_.entries.empty() ||
+        hash != hash_entries_info_.entries.back()) {
+      if (detect_filter_construct_corruption_) {
+        hash_entries_info_.xor_checksum ^= hash;
+      }
+      hash_entries_info_.entries.push_back(hash);
+      if (cache_res_mgr_ &&
+          // Traditional rounding to whole bucket size
+          ((hash_entries_info_.entries.size() %
+            kUint64tHashEntryCacheResBucketSize) ==
+           kUint64tHashEntryCacheResBucketSize / 2)) {
+        hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr);
+        Status s = cache_res_mgr_->MakeCacheReservation(
+            kUint64tHashEntryCacheResBucketSize * sizeof(hash),
+            &hash_entries_info_.cache_res_bucket_handles.back());
+        s.PermitUncheckedError();
+      }
+    }
+  }
+
+  virtual size_t EstimateEntriesAdded() override {
+    return hash_entries_info_.entries.size();
+  }
+
+  virtual Status MaybePostVerify(const Slice& filter_content) override;
+
+ protected:
+  static constexpr uint32_t kMetadataLen = 5;
+
+  // Number of hash entries to accumulate before charging their memory usage to
+  // the cache when cache charging is available
+  static const std::size_t kUint64tHashEntryCacheResBucketSize =
+      CacheReservationManagerImpl<
+          CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+      sizeof(uint64_t);
+
+  // For delegating between XXPH3FilterBitsBuilders
+  void SwapEntriesWith(XXPH3FilterBitsBuilder* other) {
+    assert(other != nullptr);
+    hash_entries_info_.Swap(&(other->hash_entries_info_));
+  }
+
+  void ResetEntries() { hash_entries_info_.Reset(); }
+
+  virtual size_t RoundDownUsableSpace(size_t available_size) = 0;
+
+  // To choose size using malloc_usable_size, we have to actually allocate.
+  size_t AllocateMaybeRounding(size_t target_len_with_metadata,
+                               size_t num_entries,
+                               std::unique_ptr<char[]>* buf) {
+    // Return value set to a default; overwritten in some cases
+    size_t rv = target_len_with_metadata;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    if (aggregate_rounding_balance_ != nullptr) {
+      // Do optimize_filters_for_memory, using malloc_usable_size.
+      // Approach: try to keep FP rate balance better than or on
+      // target (negative aggregate_rounding_balance_). We can then select a
+      // lower bound filter size (within reasonable limits) that gets us as
+      // close to on target as possible. We request allocation for that filter
+      // size and use malloc_usable_size to "round up" to the actual
+      // allocation size.
+
+      // Although it can be considered bad practice to use malloc_usable_size
+      // to access an object beyond its original size, this approach should be
+      // quite general: working for all allocators that properly support
+      // malloc_usable_size.
+
+      // Race condition on balance is OK because it can only cause temporary
+      // skew in rounding up vs. rounding down, as long as updates are atomic
+      // and relative.
+      int64_t balance = aggregate_rounding_balance_->load();
+
+      double target_fp_rate =
+          EstimatedFpRate(num_entries, target_len_with_metadata);
+      double rv_fp_rate = target_fp_rate;
+
+      if (balance < 0) {
+        // See formula for BloomFilterPolicy::aggregate_rounding_balance_
+        double for_balance_fp_rate =
+            -balance / double{0x100000000} + target_fp_rate;
+
+        // To simplify, we just try a few modified smaller sizes. This also
+        // caps how much we vary filter size vs. target, to avoid outlier
+        // behavior from excessive variance.
+        size_t target_len = target_len_with_metadata - kMetadataLen;
+        assert(target_len < target_len_with_metadata);  // check underflow
+        for (uint64_t maybe_len_rough :
+             {uint64_t{3} * target_len / 4, uint64_t{13} * target_len / 16,
+              uint64_t{7} * target_len / 8, uint64_t{15} * target_len / 16}) {
+          size_t maybe_len_with_metadata =
+              RoundDownUsableSpace(maybe_len_rough + kMetadataLen);
+          double maybe_fp_rate =
+              EstimatedFpRate(num_entries, maybe_len_with_metadata);
+          if (maybe_fp_rate <= for_balance_fp_rate) {
+            rv = maybe_len_with_metadata;
+            rv_fp_rate = maybe_fp_rate;
+            break;
+          }
+        }
+      }
+
+      // Filter blocks are loaded into block cache with their block trailer.
+      // We need to make sure that's accounted for in choosing a
+      // fragmentation-friendly size.
+      const size_t kExtraPadding = BlockBasedTable::kBlockTrailerSize;
+      size_t requested = rv + kExtraPadding;
+
+      // Allocate and get usable size
+      buf->reset(new char[requested]);
+      size_t usable = malloc_usable_size(buf->get());
+
+      if (usable - usable / 4 > requested) {
+        // Ratio greater than 4/3 is too much for utilizing, if it's
+        // not a buggy or mislinked malloc_usable_size implementation.
+        // Non-linearity of FP rates with bits/key means rapidly
+        // diminishing returns in overall accuracy for additional
+        // storage on disk.
+        // Nothing to do, except assert that the result is accurate about
+        // the usable size. (Assignment never used.)
+        assert(((*buf)[usable - 1] = 'x'));
+      } else if (usable > requested) {
+        rv = RoundDownUsableSpace(usable - kExtraPadding);
+        assert(rv <= usable - kExtraPadding);
+        rv_fp_rate = EstimatedFpRate(num_entries, rv);
+      } else {
+        // Too small means bad malloc_usable_size
+        assert(usable == requested);
+      }
+      memset(buf->get(), 0, rv);
+
+      // Update balance
+      int64_t diff = static_cast<int64_t>((rv_fp_rate - target_fp_rate) *
+                                          double{0x100000000});
+      *aggregate_rounding_balance_ += diff;
+    } else {
+      buf->reset(new char[rv]());
+    }
+#else
+    (void)num_entries;
+    buf->reset(new char[rv]());
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return rv;
+  }
+
+  // TODO: Ideally we want to verify the hash entry
+  // as it is added to the filter and eliminate this function
+  // for speeding up and leaving fewer spaces for undetected memory/CPU
+  // corruption. For Ribbon Filter, it's bit harder.
+  // Possible solution:
+  // pass a custom iterator that tracks the xor checksum as
+  // it iterates to ResetAndFindSeedToSolve
+  Status MaybeVerifyHashEntriesChecksum() {
+    if (!detect_filter_construct_corruption_) {
+      return Status::OK();
+    }
+
+    uint64_t actual_hash_entries_xor_checksum = 0;
+    for (uint64_t h : hash_entries_info_.entries) {
+      actual_hash_entries_xor_checksum ^= h;
+    }
+
+    if (actual_hash_entries_xor_checksum == hash_entries_info_.xor_checksum) {
+      return Status::OK();
+    } else {
+      // Since these hash entries are corrupted and they will not be used
+      // anymore, we can reset them and release memory.
+      ResetEntries();
+      return Status::Corruption("Filter's hash entries checksum mismatched");
+    }
+  }
+
+  // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr,
+  // always "round up" like historic behavior.
+  std::atomic<int64_t>* aggregate_rounding_balance_;
+
+  // For reserving memory used in (new) Bloom and Ribbon Filter construction
+  std::shared_ptr<CacheReservationManager> cache_res_mgr_;
+
+  // For managing cache charge for final filter in (new) Bloom and Ribbon
+  // Filter construction
+  std::deque<std::unique_ptr<CacheReservationManager::CacheReservationHandle>>
+      final_filter_cache_res_handles_;
+
+  bool detect_filter_construct_corruption_;
+
+  struct HashEntriesInfo {
+    // A deque avoids unnecessary copying of already-saved values
+    // and has near-minimal peak memory use.
+    std::deque<uint64_t> entries;
+
+    // If cache_res_mgr_ != nullptr,
+    // it manages cache charge for buckets of hash entries in (new) Bloom
+    // or Ribbon Filter construction.
+    // Otherwise, it is empty.
+    std::deque<std::unique_ptr<CacheReservationManager::CacheReservationHandle>>
+        cache_res_bucket_handles;
+
+    // If detect_filter_construct_corruption_ == true,
+    // it records the xor checksum of hash entries.
+    // Otherwise, it is 0.
+    uint64_t xor_checksum = 0;
+
+    void Swap(HashEntriesInfo* other) {
+      assert(other != nullptr);
+      std::swap(entries, other->entries);
+      std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles);
+      std::swap(xor_checksum, other->xor_checksum);
+    }
+
+    void Reset() {
+      entries.clear();
+      cache_res_bucket_handles.clear();
+      xor_checksum = 0;
+    }
+  };
+
+  HashEntriesInfo hash_entries_info_;
+};
+
+// #################### FastLocalBloom implementation ################## //
+// ############## also known as format_version=5 Bloom filter ########## //
+
+// See description in FastLocalBloomImpl
+class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder {
+ public:
+  // Non-null aggregate_rounding_balance implies optimize_filters_for_memory
+  explicit FastLocalBloomBitsBuilder(
+      const int millibits_per_key,
+      std::atomic<int64_t>* aggregate_rounding_balance,
+      std::shared_ptr<CacheReservationManager> cache_res_mgr,
+      bool detect_filter_construct_corruption)
+      : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr,
+                               detect_filter_construct_corruption),
+        millibits_per_key_(millibits_per_key) {
+    assert(millibits_per_key >= 1000);
+  }
+
+  // No Copy allowed
+  FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete;
+  void operator=(const FastLocalBloomBitsBuilder&) = delete;
+
+  ~FastLocalBloomBitsBuilder() override {}
+
+  using FilterBitsBuilder::Finish;
+
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
+    return Finish(buf, nullptr);
+  }
+
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf,
+                       Status* status) override {
+    size_t num_entries = hash_entries_info_.entries.size();
+    size_t len_with_metadata = CalculateSpace(num_entries);
+
+    std::unique_ptr<char[]> mutable_buf;
+    std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+        final_filter_cache_res_handle;
+    len_with_metadata =
+        AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf);
+    // Cache charging for mutable_buf
+    if (cache_res_mgr_) {
+      Status s = cache_res_mgr_->MakeCacheReservation(
+          len_with_metadata * sizeof(char), &final_filter_cache_res_handle);
+      s.PermitUncheckedError();
+    }
+
+    assert(mutable_buf);
+    assert(len_with_metadata >= kMetadataLen);
+
+    // Max size supported by implementation
+    assert(len_with_metadata <= 0xffffffffU);
+
+    // Compute num_probes after any rounding / adjustments
+    int num_probes = GetNumProbes(num_entries, len_with_metadata);
+
+    uint32_t len = static_cast<uint32_t>(len_with_metadata - kMetadataLen);
+    if (len > 0) {
+      TEST_SYNC_POINT_CALLBACK(
+          "XXPH3FilterBitsBuilder::Finish::"
+          "TamperHashEntries",
+          &hash_entries_info_.entries);
+      AddAllEntries(mutable_buf.get(), len, num_probes);
+      Status verify_hash_entries_checksum_status =
+          MaybeVerifyHashEntriesChecksum();
+      if (!verify_hash_entries_checksum_status.ok()) {
+        if (status) {
+          *status = verify_hash_entries_checksum_status;
+        }
+        return FinishAlwaysTrue(buf);
+      }
+    }
+
+    bool keep_entries_for_postverify = detect_filter_construct_corruption_;
+    if (!keep_entries_for_postverify) {
+      ResetEntries();
+    }
+
+    // See BloomFilterPolicy::GetBloomBitsReader re: metadata
+    // -1 = Marker for newer Bloom implementations
+    mutable_buf[len] = static_cast<char>(-1);
+    // 0 = Marker for this sub-implementation
+    mutable_buf[len + 1] = static_cast<char>(0);
+    // num_probes (and 0 in upper bits for 64-byte block size)
+    mutable_buf[len + 2] = static_cast<char>(num_probes);
+    // rest of metadata stays zero
+
+    auto TEST_arg_pair __attribute__((__unused__)) =
+        std::make_pair(&mutable_buf, len_with_metadata);
+    TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter",
+                             &TEST_arg_pair);
+
+    Slice rv(mutable_buf.get(), len_with_metadata);
+    *buf = std::move(mutable_buf);
+    final_filter_cache_res_handles_.push_back(
+        std::move(final_filter_cache_res_handle));
+    if (status) {
+      *status = Status::OK();
+    }
+    return rv;
+  }
+
+  size_t ApproximateNumEntries(size_t bytes) override {
+    size_t bytes_no_meta =
+        bytes >= kMetadataLen ? RoundDownUsableSpace(bytes) - kMetadataLen : 0;
+    return static_cast<size_t>(uint64_t{8000} * bytes_no_meta /
+                               millibits_per_key_);
+  }
+
+  size_t CalculateSpace(size_t num_entries) override {
+    // If not for cache line blocks in the filter, what would the target
+    // length in bytes be?
+    size_t raw_target_len = static_cast<size_t>(
+        (uint64_t{num_entries} * millibits_per_key_ + 7999) / 8000);
+
+    if (raw_target_len >= size_t{0xffffffc0}) {
+      // Max supported for this data structure implementation
+      raw_target_len = size_t{0xffffffc0};
+    }
+
+    // Round up to nearest multiple of 64 (block size). This adjustment is
+    // used for target FP rate only so that we don't receive complaints about
+    // lower FP rate vs. historic Bloom filter behavior.
+    return ((raw_target_len + 63) & ~size_t{63}) + kMetadataLen;
+  }
+
+  double EstimatedFpRate(size_t keys, size_t len_with_metadata) override {
+    int num_probes = GetNumProbes(keys, len_with_metadata);
+    return FastLocalBloomImpl::EstimatedFpRate(
+        keys, len_with_metadata - kMetadataLen, num_probes, /*hash bits*/ 64);
+  }
+
+ protected:
+  size_t RoundDownUsableSpace(size_t available_size) override {
+    size_t rv = available_size - kMetadataLen;
+
+    if (rv >= size_t{0xffffffc0}) {
+      // Max supported for this data structure implementation
+      rv = size_t{0xffffffc0};
+    }
+
+    // round down to multiple of 64 (block size)
+    rv &= ~size_t{63};
+
+    return rv + kMetadataLen;
+  }
+
+ private:
+  // Compute num_probes after any rounding / adjustments
+  int GetNumProbes(size_t keys, size_t len_with_metadata) {
+    uint64_t millibits = uint64_t{len_with_metadata - kMetadataLen} * 8000;
+    int actual_millibits_per_key =
+        static_cast<int>(millibits / std::max(keys, size_t{1}));
+    // BEGIN XXX/TODO(peterd): preserving old/default behavior for now to
+    // minimize unit test churn. Remove this some time.
+    if (!aggregate_rounding_balance_) {
+      actual_millibits_per_key = millibits_per_key_;
+    }
+    // END XXX/TODO
+    return FastLocalBloomImpl::ChooseNumProbes(actual_millibits_per_key);
+  }
+
+  void AddAllEntries(char* data, uint32_t len, int num_probes) {
+    // Simple version without prefetching:
+    //
+    // for (auto h : hash_entries_info_.entries) {
+    //   FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len,
+    //                               num_probes, data);
+    // }
+
+    const size_t num_entries = hash_entries_info_.entries.size();
+    constexpr size_t kBufferMask = 7;
+    static_assert(((kBufferMask + 1) & kBufferMask) == 0,
+                  "Must be power of 2 minus 1");
+
+    std::array<uint32_t, kBufferMask + 1> hashes;
+    std::array<uint32_t, kBufferMask + 1> byte_offsets;
+
+    // Prime the buffer
+    size_t i = 0;
+    std::deque<uint64_t>::iterator hash_entries_it =
+        hash_entries_info_.entries.begin();
+    for (; i <= kBufferMask && i < num_entries; ++i) {
+      uint64_t h = *hash_entries_it;
+      FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data,
+                                      /*out*/ &byte_offsets[i]);
+      hashes[i] = Upper32of64(h);
+      ++hash_entries_it;
+    }
+
+    // Process and buffer
+    for (; i < num_entries; ++i) {
+      uint32_t& hash_ref = hashes[i & kBufferMask];
+      uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask];
+      // Process (add)
+      FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes,
+                                          data + byte_offset_ref);
+      // And buffer
+      uint64_t h = *hash_entries_it;
+      FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data,
+                                      /*out*/ &byte_offset_ref);
+      hash_ref = Upper32of64(h);
+      ++hash_entries_it;
+    }
+
+    // Finish processing
+    for (i = 0; i <= kBufferMask && i < num_entries; ++i) {
+      FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes,
+                                          data + byte_offsets[i]);
+    }
+  }
+
+  // Target allocation per added key, in thousandths of a bit.
+  int millibits_per_key_;
+};
+
+// See description in FastLocalBloomImpl
+class FastLocalBloomBitsReader : public BuiltinFilterBitsReader {
+ public:
+  FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes)
+      : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {}
+
+  // No Copy allowed
+  FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete;
+  void operator=(const FastLocalBloomBitsReader&) = delete;
+
+  ~FastLocalBloomBitsReader() override {}
+
+  bool MayMatch(const Slice& key) override {
+    uint64_t h = GetSliceHash64(key);
+    uint32_t byte_offset;
+    FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_,
+                                    /*out*/ &byte_offset);
+    return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_,
+                                                    data_ + byte_offset);
+  }
+
+  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
+    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
+    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
+    for (int i = 0; i < num_keys; ++i) {
+      uint64_t h = GetSliceHash64(*keys[i]);
+      FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_,
+                                      /*out*/ &byte_offsets[i]);
+      hashes[i] = Upper32of64(h);
+    }
+    for (int i = 0; i < num_keys; ++i) {
+      may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared(
+          hashes[i], num_probes_, data_ + byte_offsets[i]);
+    }
+  }
+
+  bool HashMayMatch(const uint64_t h) override {
+    return FastLocalBloomImpl::HashMayMatch(Lower32of64(h), Upper32of64(h),
+                                            len_bytes_, num_probes_, data_);
+  }
+
+ private:
+  const char* data_;
+  const int num_probes_;
+  const uint32_t len_bytes_;
+};
+
+// ##################### Ribbon filter implementation ################### //
+
+// Implements concept RehasherTypesAndSettings in ribbon_impl.h
+struct Standard128RibbonRehasherTypesAndSettings {
+  // These are schema-critical. Any change almost certainly changes
+  // underlying data.
+  static constexpr bool kIsFilter = true;
+  static constexpr bool kHomogeneous = false;
+  static constexpr bool kFirstCoeffAlwaysOne = true;
+  static constexpr bool kUseSmash = false;
+  using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
+  using Hash = uint64_t;
+  using Seed = uint32_t;
+  // Changing these doesn't necessarily change underlying data,
+  // but might affect supported scalability of those dimensions.
+  using Index = uint32_t;
+  using ResultRow = uint32_t;
+  // Save a conditional in Ribbon queries
+  static constexpr bool kAllowZeroStarts = false;
+};
+
+using Standard128RibbonTypesAndSettings =
+    ribbon::StandardRehasherAdapter<Standard128RibbonRehasherTypesAndSettings>;
+
+class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder {
+ public:
+  explicit Standard128RibbonBitsBuilder(
+      double desired_one_in_fp_rate, int bloom_millibits_per_key,
+      std::atomic<int64_t>* aggregate_rounding_balance,
+      std::shared_ptr<CacheReservationManager> cache_res_mgr,
+      bool detect_filter_construct_corruption, Logger* info_log)
+      : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr,
+                               detect_filter_construct_corruption),
+        desired_one_in_fp_rate_(desired_one_in_fp_rate),
+        info_log_(info_log),
+        bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance,
+                        cache_res_mgr, detect_filter_construct_corruption) {
+    assert(desired_one_in_fp_rate >= 1.0);
+  }
+
+  // No Copy allowed
+  Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete;
+  void operator=(const Standard128RibbonBitsBuilder&) = delete;
+
+  ~Standard128RibbonBitsBuilder() override {}
+
+  using FilterBitsBuilder::Finish;
+
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf) override {
+    return Finish(buf, nullptr);
+  }
+
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf,
+                       Status* status) override {
+    if (hash_entries_info_.entries.size() > kMaxRibbonEntries) {
+      ROCKS_LOG_WARN(
+          info_log_, "Too many keys for Ribbon filter: %llu",
+          static_cast<unsigned long long>(hash_entries_info_.entries.size()));
+      SwapEntriesWith(&bloom_fallback_);
+      assert(hash_entries_info_.entries.empty());
+      return bloom_fallback_.Finish(buf, status);
+    }
+    if (hash_entries_info_.entries.size() == 0) {
+      // Save a conditional in Ribbon queries by using alternate reader
+      // for zero entries added.
+      if (status) {
+        *status = Status::OK();
+      }
+      return FinishAlwaysFalse(buf);
+    }
+    uint32_t num_entries =
+        static_cast<uint32_t>(hash_entries_info_.entries.size());
+    uint32_t num_slots;
+    size_t len_with_metadata;
+
+    CalculateSpaceAndSlots(num_entries, &len_with_metadata, &num_slots);
+
+    // Bloom fall-back indicator
+    if (num_slots == 0) {
+      SwapEntriesWith(&bloom_fallback_);
+      assert(hash_entries_info_.entries.empty());
+      return bloom_fallback_.Finish(buf, status);
+    }
+
+    uint32_t entropy = 0;
+    if (!hash_entries_info_.entries.empty()) {
+      entropy = Lower32of64(hash_entries_info_.entries.front());
+    }
+
+    BandingType banding;
+    std::size_t bytes_banding = ribbon::StandardBanding<
+        Standard128RibbonTypesAndSettings>::EstimateMemoryUsage(num_slots);
+    Status status_banding_cache_res = Status::OK();
+
+    // Cache charging for banding
+    std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+        banding_res_handle;
+    if (cache_res_mgr_) {
+      status_banding_cache_res = cache_res_mgr_->MakeCacheReservation(
+          bytes_banding, &banding_res_handle);
+    }
+
+    if (status_banding_cache_res.IsMemoryLimit()) {
+      ROCKS_LOG_WARN(info_log_,
+                     "Cache charging for Ribbon filter banding failed due "
+                     "to cache full");
+      SwapEntriesWith(&bloom_fallback_);
+      assert(hash_entries_info_.entries.empty());
+      // Release cache for banding since the banding won't be allocated
+      banding_res_handle.reset();
+      return bloom_fallback_.Finish(buf, status);
+    }
+
+    TEST_SYNC_POINT_CALLBACK(
+        "XXPH3FilterBitsBuilder::Finish::"
+        "TamperHashEntries",
+        &hash_entries_info_.entries);
+
+    bool success = banding.ResetAndFindSeedToSolve(
+        num_slots, hash_entries_info_.entries.begin(),
+        hash_entries_info_.entries.end(),
+        /*starting seed*/ entropy & 255, /*seed mask*/ 255);
+    if (!success) {
+      ROCKS_LOG_WARN(
+          info_log_, "Too many re-seeds (256) for Ribbon filter, %llu / %llu",
+          static_cast<unsigned long long>(hash_entries_info_.entries.size()),
+          static_cast<unsigned long long>(num_slots));
+      SwapEntriesWith(&bloom_fallback_);
+      assert(hash_entries_info_.entries.empty());
+      return bloom_fallback_.Finish(buf, status);
+    }
+
+    Status verify_hash_entries_checksum_status =
+        MaybeVerifyHashEntriesChecksum();
+    if (!verify_hash_entries_checksum_status.ok()) {
+      ROCKS_LOG_WARN(info_log_, "Verify hash entries checksum error: %s",
+                     verify_hash_entries_checksum_status.getState());
+      if (status) {
+        *status = verify_hash_entries_checksum_status;
+      }
+      return FinishAlwaysTrue(buf);
+    }
+
+    bool keep_entries_for_postverify = detect_filter_construct_corruption_;
+    if (!keep_entries_for_postverify) {
+      ResetEntries();
+    }
+
+    uint32_t seed = banding.GetOrdinalSeed();
+    assert(seed < 256);
+
+    std::unique_ptr<char[]> mutable_buf;
+    std::unique_ptr<CacheReservationManager::CacheReservationHandle>
+        final_filter_cache_res_handle;
+    len_with_metadata =
+        AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf);
+    // Cache charging for mutable_buf
+    if (cache_res_mgr_) {
+      Status s = cache_res_mgr_->MakeCacheReservation(
+          len_with_metadata * sizeof(char), &final_filter_cache_res_handle);
+      s.PermitUncheckedError();
+    }
+
+    SolnType soln(mutable_buf.get(), len_with_metadata);
+    soln.BackSubstFrom(banding);
+    uint32_t num_blocks = soln.GetNumBlocks();
+    // This should be guaranteed:
+    // num_entries < 2^30
+    // => (overhead_factor < 2.0)
+    // num_entries * overhead_factor == num_slots < 2^31
+    // => (num_blocks = num_slots / 128)
+    // num_blocks < 2^24
+    assert(num_blocks < 0x1000000U);
+
+    // See BloomFilterPolicy::GetBloomBitsReader re: metadata
+    // -2 = Marker for Standard128 Ribbon
+    mutable_buf[len_with_metadata - 5] = static_cast<char>(-2);
+    // Hash seed
+    mutable_buf[len_with_metadata - 4] = static_cast<char>(seed);
+    // Number of blocks, in 24 bits
+    // (Along with bytes, we can derive other settings)
+    mutable_buf[len_with_metadata - 3] = static_cast<char>(num_blocks & 255);
+    mutable_buf[len_with_metadata - 2] =
+        static_cast<char>((num_blocks >> 8) & 255);
+    mutable_buf[len_with_metadata - 1] =
+        static_cast<char>((num_blocks >> 16) & 255);
+
+    auto TEST_arg_pair __attribute__((__unused__)) =
+        std::make_pair(&mutable_buf, len_with_metadata);
+    TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter",
+                             &TEST_arg_pair);
+
+    Slice rv(mutable_buf.get(), len_with_metadata);
+    *buf = std::move(mutable_buf);
+    final_filter_cache_res_handles_.push_back(
+        std::move(final_filter_cache_res_handle));
+    if (status) {
+      *status = Status::OK();
+    }
+    return rv;
+  }
+
+  // Setting num_slots to 0 means "fall back on Bloom filter."
+  // And note this implementation does not support num_entries or num_slots
+  // beyond uint32_t; see kMaxRibbonEntries.
+  void CalculateSpaceAndSlots(size_t num_entries,
+                              size_t* target_len_with_metadata,
+                              uint32_t* num_slots) {
+    if (num_entries > kMaxRibbonEntries) {
+      // More entries than supported by this Ribbon
+      *num_slots = 0;  // use Bloom
+      *target_len_with_metadata = bloom_fallback_.CalculateSpace(num_entries);
+      return;
+    }
+    uint32_t entropy = 0;
+    if (!hash_entries_info_.entries.empty()) {
+      entropy = Upper32of64(hash_entries_info_.entries.front());
+    }
+
+    *num_slots = NumEntriesToNumSlots(static_cast<uint32_t>(num_entries));
+    *target_len_with_metadata =
+        SolnType::GetBytesForOneInFpRate(*num_slots, desired_one_in_fp_rate_,
+                                         /*rounding*/ entropy) +
+        kMetadataLen;
+
+    // Consider possible Bloom fallback for small filters
+    if (*num_slots < 1024) {
+      size_t bloom = bloom_fallback_.CalculateSpace(num_entries);
+      if (bloom < *target_len_with_metadata) {
+        *num_slots = 0;  // use Bloom
+        *target_len_with_metadata = bloom;
+        return;
+      }
+    }
+  }
+
+  size_t CalculateSpace(size_t num_entries) override {
+    if (num_entries == 0) {
+      // See FinishAlwaysFalse
+      return 0;
+    }
+    size_t target_len_with_metadata;
+    uint32_t num_slots;
+    CalculateSpaceAndSlots(num_entries, &target_len_with_metadata, &num_slots);
+    (void)num_slots;
+    return target_len_with_metadata;
+  }
+
+  // This is a somewhat ugly but reasonably fast and reasonably accurate
+  // reversal of CalculateSpace.
+  size_t ApproximateNumEntries(size_t bytes) override {
+    size_t len_no_metadata =
+        RoundDownUsableSpace(std::max(bytes, size_t{kMetadataLen})) -
+        kMetadataLen;
+
+    if (!(desired_one_in_fp_rate_ > 1.0)) {
+      // Effectively asking for 100% FP rate, or NaN etc.
+      // Note that NaN is neither < 1.0 nor > 1.0
+      return kMaxRibbonEntries;
+    }
+
+    // Find a slight under-estimate for actual average bits per slot
+    double min_real_bits_per_slot;
+    if (desired_one_in_fp_rate_ >= 1.0 + std::numeric_limits<uint32_t>::max()) {
+      // Max of 32 solution columns (result bits)
+      min_real_bits_per_slot = 32.0;
+    } else {
+      // Account for mix of b and b+1 solution columns being slightly
+      // suboptimal vs. ideal log2(1/fp_rate) bits.
+      uint32_t rounded = static_cast<uint32_t>(desired_one_in_fp_rate_);
+      int upper_bits_per_key = 1 + FloorLog2(rounded);
+      double fp_rate_for_upper = std::pow(2.0, -upper_bits_per_key);
+      double portion_lower =
+          (1.0 / desired_one_in_fp_rate_ - fp_rate_for_upper) /
+          fp_rate_for_upper;
+      min_real_bits_per_slot = upper_bits_per_key - portion_lower;
+      assert(min_real_bits_per_slot > 0.0);
+      assert(min_real_bits_per_slot <= 32.0);
+    }
+
+    // An overestimate, but this should only be O(1) slots away from truth.
+    double max_slots = len_no_metadata * 8.0 / min_real_bits_per_slot;
+
+    // Let's not bother accounting for overflow to Bloom filter
+    // (Includes NaN case)
+    if (!(max_slots < ConfigHelper::GetNumSlots(kMaxRibbonEntries))) {
+      return kMaxRibbonEntries;
+    }
+
+    // Set up for short iteration
+    uint32_t slots = static_cast<uint32_t>(max_slots);
+    slots = SolnType::RoundUpNumSlots(slots);
+
+    // Assert that we have a valid upper bound on slots
+    assert(SolnType::GetBytesForOneInFpRate(
+               SolnType::RoundUpNumSlots(slots + 1), desired_one_in_fp_rate_,
+               /*rounding*/ 0) > len_no_metadata);
+
+    // Iterate up to a few times to rather precisely account for small effects
+    for (int i = 0; slots > 0; ++i) {
+      size_t reqd_bytes =
+          SolnType::GetBytesForOneInFpRate(slots, desired_one_in_fp_rate_,
+                                           /*rounding*/ 0);
+      if (reqd_bytes <= len_no_metadata) {
+        break;  // done
+      }
+      if (i >= 2) {
+        // should have been enough iterations
+        assert(false);
+        break;
+      }
+      slots = SolnType::RoundDownNumSlots(slots - 1);
+    }
+
+    uint32_t num_entries = ConfigHelper::GetNumToAdd(slots);
+
+    // Consider possible Bloom fallback for small filters
+    if (slots < 1024) {
+      size_t bloom = bloom_fallback_.ApproximateNumEntries(bytes);
+      if (bloom > num_entries) {
+        return bloom;
+      } else {
+        return num_entries;
+      }
+    } else {
+      return std::min(num_entries, kMaxRibbonEntries);
+    }
+  }
+
+  double EstimatedFpRate(size_t num_entries,
+                         size_t len_with_metadata) override {
+    if (num_entries > kMaxRibbonEntries) {
+      // More entries than supported by this Ribbon
+      return bloom_fallback_.EstimatedFpRate(num_entries, len_with_metadata);
+    }
+    uint32_t num_slots =
+        NumEntriesToNumSlots(static_cast<uint32_t>(num_entries));
+    SolnType fake_soln(nullptr, len_with_metadata);
+    fake_soln.ConfigureForNumSlots(num_slots);
+    return fake_soln.ExpectedFpRate();
+  }
+
+  Status MaybePostVerify(const Slice& filter_content) override {
+    bool fall_back = (bloom_fallback_.EstimateEntriesAdded() > 0);
+    return fall_back ? bloom_fallback_.MaybePostVerify(filter_content)
+                     : XXPH3FilterBitsBuilder::MaybePostVerify(filter_content);
+  }
+
+ protected:
+  size_t RoundDownUsableSpace(size_t available_size) override {
+    size_t rv = available_size - kMetadataLen;
+
+    // round down to multiple of 16 (segment size)
+    rv &= ~size_t{15};
+
+    return rv + kMetadataLen;
+  }
+
+ private:
+  using TS = Standard128RibbonTypesAndSettings;
+  using SolnType = ribbon::SerializableInterleavedSolution<TS>;
+  using BandingType = ribbon::StandardBanding<TS>;
+  using ConfigHelper = ribbon::BandingConfigHelper1TS<ribbon::kOneIn20, TS>;
+
+  static uint32_t NumEntriesToNumSlots(uint32_t num_entries) {
+    uint32_t num_slots1 = ConfigHelper::GetNumSlots(num_entries);
+    return SolnType::RoundUpNumSlots(num_slots1);
+  }
+
+  // Approximate num_entries to ensure number of bytes fits in 32 bits, which
+  // is not an inherent limitation but does ensure somewhat graceful Bloom
+  // fallback for crazy high number of entries, since the Bloom implementation
+  // does not support number of bytes bigger than fits in 32 bits. This is
+  // within an order of magnitude of implementation limit on num_slots
+  // fitting in 32 bits, and even closer for num_blocks fitting in 24 bits
+  // (for filter metadata).
+  static constexpr uint32_t kMaxRibbonEntries = 950000000;  // ~ 1 billion
+
+  // A desired value for 1/fp_rate. For example, 100 -> 1% fp rate.
+  double desired_one_in_fp_rate_;
+
+  // For warnings, or can be nullptr
+  Logger* info_log_;
+
+  // For falling back on Bloom filter in some exceptional cases and
+  // very small filter cases
+  FastLocalBloomBitsBuilder bloom_fallback_;
+};
+
+// for the linker, at least with DEBUG_LEVEL=2
+constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries;
+
+class Standard128RibbonBitsReader : public BuiltinFilterBitsReader {
+ public:
+  Standard128RibbonBitsReader(const char* data, size_t len_bytes,
+                              uint32_t num_blocks, uint32_t seed)
+      : soln_(const_cast<char*>(data), len_bytes) {
+    soln_.ConfigureForNumBlocks(num_blocks);
+    hasher_.SetOrdinalSeed(seed);
+  }
+
+  // No Copy allowed
+  Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete;
+  void operator=(const Standard128RibbonBitsReader&) = delete;
+
+  ~Standard128RibbonBitsReader() override {}
+
+  bool MayMatch(const Slice& key) override {
+    uint64_t h = GetSliceHash64(key);
+    return soln_.FilterQuery(h, hasher_);
+  }
+
+  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
+    struct SavedData {
+      uint64_t seeded_hash;
+      uint32_t segment_num;
+      uint32_t num_columns;
+      uint32_t start_bits;
+    };
+    std::array<SavedData, MultiGetContext::MAX_BATCH_SIZE> saved;
+    for (int i = 0; i < num_keys; ++i) {
+      ribbon::InterleavedPrepareQuery(
+          GetSliceHash64(*keys[i]), hasher_, soln_, &saved[i].seeded_hash,
+          &saved[i].segment_num, &saved[i].num_columns, &saved[i].start_bits);
+    }
+    for (int i = 0; i < num_keys; ++i) {
+      may_match[i] = ribbon::InterleavedFilterQuery(
+          saved[i].seeded_hash, saved[i].segment_num, saved[i].num_columns,
+          saved[i].start_bits, hasher_, soln_);
+    }
+  }
+
+  bool HashMayMatch(const uint64_t h) override {
+    return soln_.FilterQuery(h, hasher_);
+  }
+
+ private:
+  using TS = Standard128RibbonTypesAndSettings;
+  ribbon::SerializableInterleavedSolution<TS> soln_;
+  ribbon::StandardHasher<TS> hasher_;
+};
+
+// ##################### Legacy Bloom implementation ################### //
+
+using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>;
+
+class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
+ public:
+  explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log);
+
+  // No Copy allowed
+  LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete;
+  void operator=(const LegacyBloomBitsBuilder&) = delete;
+
+  ~LegacyBloomBitsBuilder() override;
+
+  void AddKey(const Slice& key) override;
+
+  virtual size_t EstimateEntriesAdded() override {
+    return hash_entries_.size();
+  }
+
+  using FilterBitsBuilder::Finish;
+
+  Slice Finish(std::unique_ptr<const char[]>* buf) override;
+
+  size_t CalculateSpace(size_t num_entries) override {
+    uint32_t dont_care1;
+    uint32_t dont_care2;
+    return CalculateSpace(num_entries, &dont_care1, &dont_care2);
+  }
+
+  double EstimatedFpRate(size_t keys, size_t bytes) override {
+    return LegacyBloomImpl::EstimatedFpRate(keys, bytes - kMetadataLen,
+                                            num_probes_);
+  }
+
+  size_t ApproximateNumEntries(size_t bytes) override;
+
+ private:
+  int bits_per_key_;
+  int num_probes_;
+  std::vector<uint32_t> hash_entries_;
+  Logger* info_log_;
+
+  // Get totalbits that optimized for cpu cache line
+  uint32_t GetTotalBitsForLocality(uint32_t total_bits);
+
+  // Reserve space for new filter
+  char* ReserveSpace(size_t num_entries, uint32_t* total_bits,
+                     uint32_t* num_lines);
+
+  // Implementation-specific variant of public CalculateSpace
+  uint32_t CalculateSpace(size_t num_entries, uint32_t* total_bits,
+                          uint32_t* num_lines);
+
+  // Assuming single threaded access to this function.
+  void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits);
+};
+
+LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key,
+                                               Logger* info_log)
+    : bits_per_key_(bits_per_key),
+      num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)),
+      info_log_(info_log) {
+  assert(bits_per_key_);
+}
+
+LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {}
+
+void LegacyBloomBitsBuilder::AddKey(const Slice& key) {
+  uint32_t hash = BloomHash(key);
+  if (hash_entries_.size() == 0 || hash != hash_entries_.back()) {
+    hash_entries_.push_back(hash);
+  }
+}
+
+Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr<const char[]>* buf) {
+  uint32_t total_bits, num_lines;
+  size_t num_entries = hash_entries_.size();
+  char* data =
+      ReserveSpace(static_cast<int>(num_entries), &total_bits, &num_lines);
+  assert(data);
+
+  if (total_bits != 0 && num_lines != 0) {
+    for (auto h : hash_entries_) {
+      AddHash(h, data, num_lines, total_bits);
+    }
+
+    // Check for excessive entries for 32-bit hash function
+    if (num_entries >= /* minimum of 3 million */ 3000000U) {
+      // More specifically, we can detect that the 32-bit hash function
+      // is causing significant increase in FP rate by comparing current
+      // estimated FP rate to what we would get with a normal number of
+      // keys at same memory ratio.
+      double est_fp_rate = LegacyBloomImpl::EstimatedFpRate(
+          num_entries, total_bits / 8, num_probes_);
+      double vs_fp_rate = LegacyBloomImpl::EstimatedFpRate(
+          1U << 16, (1U << 16) * bits_per_key_ / 8, num_probes_);
+
+      if (est_fp_rate >= 1.50 * vs_fp_rate) {
+        // For more details, see
+        // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
+        ROCKS_LOG_WARN(
+            info_log_,
+            "Using legacy SST/BBT Bloom filter with excessive key count "
+            "(%.1fM @ %dbpk), causing estimated %.1fx higher filter FP rate. "
+            "Consider using new Bloom with format_version>=5, smaller SST "
+            "file size, or partitioned filters.",
+            num_entries / 1000000.0, bits_per_key_, est_fp_rate / vs_fp_rate);
+      }
+    }
+  }
+  // See BloomFilterPolicy::GetFilterBitsReader for metadata
+  data[total_bits / 8] = static_cast<char>(num_probes_);
+  EncodeFixed32(data + total_bits / 8 + 1, static_cast<uint32_t>(num_lines));
+
+  const char* const_data = data;
+  buf->reset(const_data);
+  hash_entries_.clear();
+
+  return Slice(data, total_bits / 8 + kMetadataLen);
+}
+
+size_t LegacyBloomBitsBuilder::ApproximateNumEntries(size_t bytes) {
+  assert(bits_per_key_);
+  assert(bytes > 0);
+
+  uint64_t total_bits_tmp = bytes * 8;
+  // total bits, including temporary computations, cannot exceed 2^32
+  // for compatibility
+  total_bits_tmp = std::min(total_bits_tmp, uint64_t{0xffff0000});
+
+  uint32_t high = static_cast<uint32_t>(total_bits_tmp) /
+                      static_cast<uint32_t>(bits_per_key_) +
+                  1;
+  uint32_t low = 1;
+  uint32_t n = high;
+  for (; n >= low; n--) {
+    if (CalculateSpace(n) <= bytes) {
+      break;
+    }
+  }
+  return n;
+}
+
+uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) {
+  uint32_t num_lines =
+      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+  // Make num_lines an odd number to make sure more bits are involved
+  // when determining which block.
+  if (num_lines % 2 == 0) {
+    num_lines++;
+  }
+  return num_lines * (CACHE_LINE_SIZE * 8);
+}
+
+uint32_t LegacyBloomBitsBuilder::CalculateSpace(size_t num_entries,
+                                                uint32_t* total_bits,
+                                                uint32_t* num_lines) {
+  assert(bits_per_key_);
+  if (num_entries != 0) {
+    size_t total_bits_tmp = num_entries * bits_per_key_;
+    // total bits, including temporary computations, cannot exceed 2^32
+    // for compatibility
+    total_bits_tmp = std::min(total_bits_tmp, size_t{0xffff0000});
+
+    *total_bits =
+        GetTotalBitsForLocality(static_cast<uint32_t>(total_bits_tmp));
+    *num_lines = *total_bits / (CACHE_LINE_SIZE * 8);
+    assert(*total_bits > 0 && *total_bits % 8 == 0);
+  } else {
+    // filter is empty, just leave space for metadata
+    *total_bits = 0;
+    *num_lines = 0;
+  }
+
+  // Reserve space for Filter
+  uint32_t sz = *total_bits / 8;
+  sz += kMetadataLen;  // 4 bytes for num_lines, 1 byte for num_probes
+  return sz;
+}
+
+char* LegacyBloomBitsBuilder::ReserveSpace(size_t num_entries,
+                                           uint32_t* total_bits,
+                                           uint32_t* num_lines) {
+  uint32_t sz = CalculateSpace(num_entries, total_bits, num_lines);
+  char* data = new char[sz];
+  memset(data, 0, sz);
+  return data;
+}
+
+inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data,
+                                            uint32_t num_lines,
+                                            uint32_t total_bits) {
+#ifdef NDEBUG
+  static_cast<void>(total_bits);
+#endif
+  assert(num_lines > 0 && total_bits > 0);
+
+  LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data,
+                           ConstexprFloorLog2(CACHE_LINE_SIZE));
+}
+
+class LegacyBloomBitsReader : public BuiltinFilterBitsReader {
+ public:
+  LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines,
+                        uint32_t log2_cache_line_size)
+      : data_(data),
+        num_probes_(num_probes),
+        num_lines_(num_lines),
+        log2_cache_line_size_(log2_cache_line_size) {}
+
+  // No Copy allowed
+  LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete;
+  void operator=(const LegacyBloomBitsReader&) = delete;
+
+  ~LegacyBloomBitsReader() override {}
+
+  // "contents" contains the data built by a preceding call to
+  // FilterBitsBuilder::Finish. MayMatch must return true if the key was
+  // passed to FilterBitsBuilder::AddKey. This method may return true or false
+  // if the key was not on the list, but it should aim to return false with a
+  // high probability.
+  bool MayMatch(const Slice& key) override {
+    uint32_t hash = BloomHash(key);
+    uint32_t byte_offset;
+    LegacyBloomImpl::PrepareHashMayMatch(
+        hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_);
+    return LegacyBloomImpl::HashMayMatchPrepared(
+        hash, num_probes_, data_ + byte_offset, log2_cache_line_size_);
+  }
+
+  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override {
+    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
+    std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
+    for (int i = 0; i < num_keys; ++i) {
+      hashes[i] = BloomHash(*keys[i]);
+      LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_,
+                                           /*out*/ &byte_offsets[i],
+                                           log2_cache_line_size_);
+    }
+    for (int i = 0; i < num_keys; ++i) {
+      may_match[i] = LegacyBloomImpl::HashMayMatchPrepared(
+          hashes[i], num_probes_, data_ + byte_offsets[i],
+          log2_cache_line_size_);
+    }
+  }
+
+  bool HashMayMatch(const uint64_t /* h */) override { return false; }
+
+ private:
+  const char* data_;
+  const int num_probes_;
+  const uint32_t num_lines_;
+  const uint32_t log2_cache_line_size_;
+};
+
+class AlwaysTrueFilter : public BuiltinFilterBitsReader {
+ public:
+  bool MayMatch(const Slice&) override { return true; }
+  using FilterBitsReader::MayMatch;  // inherit overload
+  bool HashMayMatch(const uint64_t) override { return true; }
+  using BuiltinFilterBitsReader::HashMayMatch;  // inherit overload
+};
+
+class AlwaysFalseFilter : public BuiltinFilterBitsReader {
+ public:
+  bool MayMatch(const Slice&) override { return false; }
+  using FilterBitsReader::MayMatch;  // inherit overload
+  bool HashMayMatch(const uint64_t) override { return false; }
+  using BuiltinFilterBitsReader::HashMayMatch;  // inherit overload
+};
+
+Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) {
+  Status s = Status::OK();
+
+  if (!detect_filter_construct_corruption_) {
+    return s;
+  }
+
+  std::unique_ptr<BuiltinFilterBitsReader> bits_reader(
+      BuiltinFilterPolicy::GetBuiltinFilterBitsReader(filter_content));
+
+  for (uint64_t h : hash_entries_info_.entries) {
+    // The current approach will not detect corruption from XXPH3Filter to
+    // AlwaysTrueFilter, which can lead to performance cost later due to
+    // AlwaysTrueFilter not filtering anything. But this cost is acceptable
+    // given the extra implementation complixity to detect such case.
+    bool may_match = bits_reader->HashMayMatch(h);
+    if (!may_match) {
+      s = Status::Corruption("Corrupted filter content");
+      break;
+    }
+  }
+
+  ResetEntries();
+  return s;
+}
+}  // namespace
+
+const char* BuiltinFilterPolicy::kClassName() {
+  return "rocksdb.internal.BuiltinFilter";
+}
+
+bool BuiltinFilterPolicy::IsInstanceOf(const std::string& name) const {
+  if (name == kClassName()) {
+    return true;
+  } else {
+    return FilterPolicy::IsInstanceOf(name);
+  }
+}
+
+static const char* kBuiltinFilterMetadataName = "rocksdb.BuiltinBloomFilter";
+
+const char* BuiltinFilterPolicy::kCompatibilityName() {
+  return kBuiltinFilterMetadataName;
+}
+
+const char* BuiltinFilterPolicy::CompatibilityName() const {
+  return kBuiltinFilterMetadataName;
+}
+
+BloomLikeFilterPolicy::BloomLikeFilterPolicy(double bits_per_key)
+    : warned_(false), aggregate_rounding_balance_(0) {
+  // Sanitize bits_per_key
+  if (bits_per_key < 0.5) {
+    // Round down to no filter
+    bits_per_key = 0;
+  } else if (bits_per_key < 1.0) {
+    // Minimum 1 bit per key (equiv) when creating filter
+    bits_per_key = 1.0;
+  } else if (!(bits_per_key < 100.0)) {  // including NaN
+    bits_per_key = 100.0;
+  }
+
+  // Includes a nudge toward rounding up, to ensure on all platforms
+  // that doubles specified with three decimal digits after the decimal
+  // point are interpreted accurately.
+  millibits_per_key_ = static_cast<int>(bits_per_key * 1000.0 + 0.500001);
+
+  // For now configure Ribbon filter to match Bloom FP rate and save
+  // memory. (Ribbon bits per key will be ~30% less than Bloom bits per key
+  // for same FP rate.)
+  desired_one_in_fp_rate_ =
+      1.0 / BloomMath::CacheLocalFpRate(
+                bits_per_key,
+                FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_),
+                /*cache_line_bits*/ 512);
+
+  // For better or worse, this is a rounding up of a nudged rounding up,
+  // e.g. 7.4999999999999 will round up to 8, but that provides more
+  // predictability against small arithmetic errors in floating point.
+  whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000;
+}
+
+BloomLikeFilterPolicy::~BloomLikeFilterPolicy() {}
+const char* BloomLikeFilterPolicy::kClassName() {
+  return "rocksdb.internal.BloomLikeFilter";
+}
+
+bool BloomLikeFilterPolicy::IsInstanceOf(const std::string& name) const {
+  if (name == kClassName()) {
+    return true;
+  } else {
+    return BuiltinFilterPolicy::IsInstanceOf(name);
+  }
+}
+
+const char* ReadOnlyBuiltinFilterPolicy::kClassName() {
+  return kBuiltinFilterMetadataName;
+}
+
+std::string BloomLikeFilterPolicy::GetId() const {
+  return Name() + GetBitsPerKeySuffix();
+}
+
+BloomFilterPolicy::BloomFilterPolicy(double bits_per_key)
+    : BloomLikeFilterPolicy(bits_per_key) {}
+
+FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext(
+    const FilterBuildingContext& context) const {
+  if (GetMillibitsPerKey() == 0) {
+    // "No filter" special case
+    return nullptr;
+  } else if (context.table_options.format_version < 5) {
+    return GetLegacyBloomBuilderWithContext(context);
+  } else {
+    return GetFastLocalBloomBuilderWithContext(context);
+  }
+}
+
+const char* BloomFilterPolicy::kClassName() { return "bloomfilter"; }
+const char* BloomFilterPolicy::kNickName() { return "rocksdb.BloomFilter"; }
+
+std::string BloomFilterPolicy::GetId() const {
+  // Including ":false" for better forward-compatibility with 6.29 and earlier
+  // which required a boolean `use_block_based_builder` parameter
+  return BloomLikeFilterPolicy::GetId() + ":false";
+}
+
+FilterBitsBuilder* BloomLikeFilterPolicy::GetFastLocalBloomBuilderWithContext(
+    const FilterBuildingContext& context) const {
+  bool offm = context.table_options.optimize_filters_for_memory;
+  const auto options_overrides_iter =
+      context.table_options.cache_usage_options.options_overrides.find(
+          CacheEntryRole::kFilterConstruction);
+  const auto filter_construction_charged =
+      options_overrides_iter !=
+              context.table_options.cache_usage_options.options_overrides.end()
+          ? options_overrides_iter->second.charged
+          : context.table_options.cache_usage_options.options.charged;
+
+  std::shared_ptr<CacheReservationManager> cache_res_mgr;
+  if (context.table_options.block_cache &&
+      filter_construction_charged ==
+          CacheEntryRoleOptions::Decision::kEnabled) {
+    cache_res_mgr = std::make_shared<
+        CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>>(
+        context.table_options.block_cache);
+  }
+  return new FastLocalBloomBitsBuilder(
+      millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr,
+      cache_res_mgr, context.table_options.detect_filter_construct_corruption);
+}
+
+FilterBitsBuilder* BloomLikeFilterPolicy::GetLegacyBloomBuilderWithContext(
+    const FilterBuildingContext& context) const {
+  if (whole_bits_per_key_ >= 14 && context.info_log &&
+      !warned_.load(std::memory_order_relaxed)) {
+    warned_ = true;
+    const char* adjective;
+    if (whole_bits_per_key_ >= 20) {
+      adjective = "Dramatic";
+    } else {
+      adjective = "Significant";
+    }
+    // For more details, see
+    // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter
+    ROCKS_LOG_WARN(context.info_log,
+                   "Using legacy Bloom filter with high (%d) bits/key. "
+                   "%s filter space and/or accuracy improvement is available "
+                   "with format_version>=5.",
+                   whole_bits_per_key_, adjective);
+  }
+  return new LegacyBloomBitsBuilder(whole_bits_per_key_, context.info_log);
+}
+
+FilterBitsBuilder*
+BloomLikeFilterPolicy::GetStandard128RibbonBuilderWithContext(
+    const FilterBuildingContext& context) const {
+  // FIXME: code duplication with GetFastLocalBloomBuilderWithContext
+  bool offm = context.table_options.optimize_filters_for_memory;
+  const auto options_overrides_iter =
+      context.table_options.cache_usage_options.options_overrides.find(
+          CacheEntryRole::kFilterConstruction);
+  const auto filter_construction_charged =
+      options_overrides_iter !=
+              context.table_options.cache_usage_options.options_overrides.end()
+          ? options_overrides_iter->second.charged
+          : context.table_options.cache_usage_options.options.charged;
+
+  std::shared_ptr<CacheReservationManager> cache_res_mgr;
+  if (context.table_options.block_cache &&
+      filter_construction_charged ==
+          CacheEntryRoleOptions::Decision::kEnabled) {
+    cache_res_mgr = std::make_shared<
+        CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>>(
+        context.table_options.block_cache);
+  }
+  return new Standard128RibbonBitsBuilder(
+      desired_one_in_fp_rate_, millibits_per_key_,
+      offm ? &aggregate_rounding_balance_ : nullptr, cache_res_mgr,
+      context.table_options.detect_filter_construct_corruption,
+      context.info_log);
+}
+
+std::string BloomLikeFilterPolicy::GetBitsPerKeySuffix() const {
+  std::string rv = ":" + std::to_string(millibits_per_key_ / 1000);
+  int frac = millibits_per_key_ % 1000;
+  if (frac > 0) {
+    rv.push_back('.');
+    rv.push_back(static_cast<char>('0' + (frac / 100)));
+    frac %= 100;
+    if (frac > 0) {
+      rv.push_back(static_cast<char>('0' + (frac / 10)));
+      frac %= 10;
+      if (frac > 0) {
+        rv.push_back(static_cast<char>('0' + frac));
+      }
+    }
+  }
+  return rv;
+}
+
+FilterBitsBuilder* BuiltinFilterPolicy::GetBuilderFromContext(
+    const FilterBuildingContext& context) {
+  if (context.table_options.filter_policy) {
+    return context.table_options.filter_policy->GetBuilderWithContext(context);
+  } else {
+    return nullptr;
+  }
+}
+
+// For testing only, but always constructable with internal names
+namespace test {
+
+const char* LegacyBloomFilterPolicy::kClassName() {
+  return "rocksdb.internal.LegacyBloomFilter";
+}
+
+FilterBitsBuilder* LegacyBloomFilterPolicy::GetBuilderWithContext(
+    const FilterBuildingContext& context) const {
+  if (GetMillibitsPerKey() == 0) {
+    // "No filter" special case
+    return nullptr;
+  }
+  return GetLegacyBloomBuilderWithContext(context);
+}
+
+const char* FastLocalBloomFilterPolicy::kClassName() {
+  return "rocksdb.internal.FastLocalBloomFilter";
+}
+
+FilterBitsBuilder* FastLocalBloomFilterPolicy::GetBuilderWithContext(
+    const FilterBuildingContext& context) const {
+  if (GetMillibitsPerKey() == 0) {
+    // "No filter" special case
+    return nullptr;
+  }
+  return GetFastLocalBloomBuilderWithContext(context);
+}
+
+const char* Standard128RibbonFilterPolicy::kClassName() {
+  return "rocksdb.internal.Standard128RibbonFilter";
+}
+
+FilterBitsBuilder* Standard128RibbonFilterPolicy::GetBuilderWithContext(
+    const FilterBuildingContext& context) const {
+  if (GetMillibitsPerKey() == 0) {
+    // "No filter" special case
+    return nullptr;
+  }
+  return GetStandard128RibbonBuilderWithContext(context);
+}
+
+}  // namespace test
+
+BuiltinFilterBitsReader* BuiltinFilterPolicy::GetBuiltinFilterBitsReader(
+    const Slice& contents) {
+  uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
+  if (len_with_meta <= kMetadataLen) {
+    // filter is empty or broken. Treat like zero keys added.
+    return new AlwaysFalseFilter();
+  }
+
+  // Legacy Bloom filter data:
+  //             0 +-----------------------------------+
+  //               | Raw Bloom filter data             |
+  //               | ...                               |
+  //           len +-----------------------------------+
+  //               | byte for num_probes or            |
+  //               |   marker for new implementations  |
+  //         len+1 +-----------------------------------+
+  //               | four bytes for number of cache    |
+  //               |   lines                           |
+  // len_with_meta +-----------------------------------+
+
+  int8_t raw_num_probes =
+      static_cast<int8_t>(contents.data()[len_with_meta - kMetadataLen]);
+  // NB: *num_probes > 30 and < 128 probably have not been used, because of
+  // BloomFilterPolicy::initialize, unless directly calling
+  // LegacyBloomBitsBuilder as an API, but we are leaving those cases in
+  // limbo with LegacyBloomBitsReader for now.
+
+  if (raw_num_probes < 1) {
+    // Note: < 0 (or unsigned > 127) indicate special new implementations
+    // (or reserved for future use)
+    switch (raw_num_probes) {
+      case 0:
+        // Treat as zero probes (always FP)
+        return new AlwaysTrueFilter();
+      case -1:
+        // Marker for newer Bloom implementations
+        return GetBloomBitsReader(contents);
+      case -2:
+        // Marker for Ribbon implementations
+        return GetRibbonBitsReader(contents);
+      default:
+        // Reserved (treat as zero probes, always FP, for now)
+        return new AlwaysTrueFilter();
+    }
+  }
+  // else attempt decode for LegacyBloomBitsReader
+
+  int num_probes = raw_num_probes;
+  assert(num_probes >= 1);
+  assert(num_probes <= 127);
+
+  uint32_t len = len_with_meta - kMetadataLen;
+  assert(len > 0);
+
+  uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4);
+  uint32_t log2_cache_line_size;
+
+  if (num_lines * CACHE_LINE_SIZE == len) {
+    // Common case
+    log2_cache_line_size = ConstexprFloorLog2(CACHE_LINE_SIZE);
+  } else if (num_lines == 0 || len % num_lines != 0) {
+    // Invalid (no solution to num_lines * x == len)
+    // Treat as zero probes (always FP) for now.
+    return new AlwaysTrueFilter();
+  } else {
+    // Determine the non-native cache line size (from another system)
+    log2_cache_line_size = 0;
+    while ((num_lines << log2_cache_line_size) < len) {
+      ++log2_cache_line_size;
+    }
+    if ((num_lines << log2_cache_line_size) != len) {
+      // Invalid (block size not a power of two)
+      // Treat as zero probes (always FP) for now.
+      return new AlwaysTrueFilter();
+    }
+  }
+  // if not early return
+  return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines,
+                                   log2_cache_line_size);
+}
+
+// Read metadata to determine what kind of FilterBitsReader is needed
+// and return a new one.
+FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader(
+    const Slice& contents) const {
+  return BuiltinFilterPolicy::GetBuiltinFilterBitsReader(contents);
+}
+
+BuiltinFilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader(
+    const Slice& contents) {
+  uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
+  uint32_t len = len_with_meta - kMetadataLen;
+
+  assert(len > 0);  // precondition
+
+  uint32_t seed = static_cast<uint8_t>(contents.data()[len + 1]);
+  uint32_t num_blocks = static_cast<uint8_t>(contents.data()[len + 2]);
+  num_blocks |= static_cast<uint8_t>(contents.data()[len + 3]) << 8;
+  num_blocks |= static_cast<uint8_t>(contents.data()[len + 4]) << 16;
+  if (num_blocks < 2) {
+    // Not supported
+    // num_blocks == 1 is not used because num_starts == 1 is problematic
+    // for the hashing scheme. num_blocks == 0 is unused because there's
+    // already a concise encoding of an "always false" filter.
+    // Return something safe:
+    return new AlwaysTrueFilter();
+  }
+  return new Standard128RibbonBitsReader(contents.data(), len, num_blocks,
+                                         seed);
+}
+
+// For newer Bloom filter implementations
+BuiltinFilterBitsReader* BuiltinFilterPolicy::GetBloomBitsReader(
+    const Slice& contents) {
+  uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
+  uint32_t len = len_with_meta - kMetadataLen;
+
+  assert(len > 0);  // precondition
+
+  // New Bloom filter data:
+  //             0 +-----------------------------------+
+  //               | Raw Bloom filter data             |
+  //               | ...                               |
+  //           len +-----------------------------------+
+  //               | char{-1} byte -> new Bloom filter |
+  //         len+1 +-----------------------------------+
+  //               | byte for subimplementation        |
+  //               |   0: FastLocalBloom               |
+  //               |   other: reserved                 |
+  //         len+2 +-----------------------------------+
+  //               | byte for block_and_probes         |
+  //               |   0 in top 3 bits -> 6 -> 64-byte |
+  //               |   reserved:                       |
+  //               |   1 in top 3 bits -> 7 -> 128-byte|
+  //               |   2 in top 3 bits -> 8 -> 256-byte|
+  //               |   ...                             |
+  //               |   num_probes in bottom 5 bits,    |
+  //               |     except 0 and 31 reserved      |
+  //         len+3 +-----------------------------------+
+  //               | two bytes reserved                |
+  //               |   possibly for hash seed          |
+  // len_with_meta +-----------------------------------+
+
+  // Read more metadata (see above)
+  char sub_impl_val = contents.data()[len_with_meta - 4];
+  char block_and_probes = contents.data()[len_with_meta - 3];
+  int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6;
+
+  int num_probes = (block_and_probes & 31);
+  if (num_probes < 1 || num_probes > 30) {
+    // Reserved / future safe
+    return new AlwaysTrueFilter();
+  }
+
+  uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2);
+  if (rest != 0) {
+    // Reserved, possibly for hash seed
+    // Future safe
+    return new AlwaysTrueFilter();
+  }
+
+  if (sub_impl_val == 0) {        // FastLocalBloom
+    if (log2_block_bytes == 6) {  // Only block size supported for now
+      return new FastLocalBloomBitsReader(contents.data(), num_probes, len);
+    }
+  }
+  // otherwise
+  // Reserved / future safe
+  return new AlwaysTrueFilter();
+}
+
+const FilterPolicy* NewBloomFilterPolicy(double bits_per_key,
+                                         bool /*use_block_based_builder*/) {
+  // NOTE: use_block_based_builder now ignored so block-based filter is no
+  // longer accessible in public API.
+  return new BloomFilterPolicy(bits_per_key);
+}
+
+RibbonFilterPolicy::RibbonFilterPolicy(double bloom_equivalent_bits_per_key,
+                                       int bloom_before_level)
+    : BloomLikeFilterPolicy(bloom_equivalent_bits_per_key),
+      bloom_before_level_(bloom_before_level) {}
+
+FilterBitsBuilder* RibbonFilterPolicy::GetBuilderWithContext(
+    const FilterBuildingContext& context) const {
+  if (GetMillibitsPerKey() == 0) {
+    // "No filter" special case
+    return nullptr;
+  }
+  // Treat unknown same as bottommost
+  int levelish = INT_MAX;
+
+  switch (context.compaction_style) {
+    case kCompactionStyleLevel:
+    case kCompactionStyleUniversal: {
+      if (context.reason == TableFileCreationReason::kFlush) {
+        // Treat flush as level -1
+        assert(context.level_at_creation == 0);
+        levelish = -1;
+      } else if (context.level_at_creation == -1) {
+        // Unknown level
+        assert(levelish == INT_MAX);
+      } else {
+        levelish = context.level_at_creation;
+      }
+      break;
+    }
+    case kCompactionStyleFIFO:
+    case kCompactionStyleNone:
+      // Treat as bottommost
+      assert(levelish == INT_MAX);
+      break;
+  }
+  if (levelish < bloom_before_level_) {
+    return GetFastLocalBloomBuilderWithContext(context);
+  } else {
+    return GetStandard128RibbonBuilderWithContext(context);
+  }
+}
+
+const char* RibbonFilterPolicy::kClassName() { return "ribbonfilter"; }
+const char* RibbonFilterPolicy::kNickName() { return "rocksdb.RibbonFilter"; }
+
+std::string RibbonFilterPolicy::GetId() const {
+  return BloomLikeFilterPolicy::GetId() + ":" +
+         std::to_string(bloom_before_level_);
+}
+
+const FilterPolicy* NewRibbonFilterPolicy(double bloom_equivalent_bits_per_key,
+                                          int bloom_before_level) {
+  return new RibbonFilterPolicy(bloom_equivalent_bits_per_key,
+                                bloom_before_level);
+}
+
+FilterBuildingContext::FilterBuildingContext(
+    const BlockBasedTableOptions& _table_options)
+    : table_options(_table_options) {}
+
+FilterPolicy::~FilterPolicy() {}
+
+std::shared_ptr<const FilterPolicy> BloomLikeFilterPolicy::Create(
+    const std::string& name, double bits_per_key) {
+  if (name == test::LegacyBloomFilterPolicy::kClassName()) {
+    return std::make_shared<test::LegacyBloomFilterPolicy>(bits_per_key);
+  } else if (name == test::FastLocalBloomFilterPolicy::kClassName()) {
+    return std::make_shared<test::FastLocalBloomFilterPolicy>(bits_per_key);
+  } else if (name == test::Standard128RibbonFilterPolicy::kClassName()) {
+    return std::make_shared<test::Standard128RibbonFilterPolicy>(bits_per_key);
+  } else if (name == BloomFilterPolicy::kClassName()) {
+    // For testing
+    return std::make_shared<BloomFilterPolicy>(bits_per_key);
+  } else if (name == RibbonFilterPolicy::kClassName()) {
+    // For testing
+    return std::make_shared<RibbonFilterPolicy>(bits_per_key,
+                                                /*bloom_before_level*/ 0);
+  } else {
+    return nullptr;
+  }
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+static ObjectLibrary::PatternEntry FilterPatternEntryWithBits(
+    const char* name) {
+  return ObjectLibrary::PatternEntry(name, false).AddNumber(":", false);
+}
+
+template <typename T>
+T* NewBuiltinFilterPolicyWithBits(const std::string& uri) {
+  const std::vector<std::string> vals = StringSplit(uri, ':');
+  double bits_per_key = ParseDouble(vals[1]);
+  return new T(bits_per_key);
+}
+static int RegisterBuiltinFilterPolicies(ObjectLibrary& library,
+                                         const std::string& /*arg*/) {
+  library.AddFactory<const FilterPolicy>(
+      ReadOnlyBuiltinFilterPolicy::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new ReadOnlyBuiltinFilterPolicy());
+        return guard->get();
+      });
+
+  library.AddFactory<const FilterPolicy>(
+      FilterPatternEntryWithBits(BloomFilterPolicy::kClassName())
+          .AnotherName(BloomFilterPolicy::kNickName()),
+      [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(NewBuiltinFilterPolicyWithBits<BloomFilterPolicy>(uri));
+        return guard->get();
+      });
+  library.AddFactory<const FilterPolicy>(
+      FilterPatternEntryWithBits(BloomFilterPolicy::kClassName())
+          .AnotherName(BloomFilterPolicy::kNickName())
+          .AddSuffix(":false"),
+      [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(NewBuiltinFilterPolicyWithBits<BloomFilterPolicy>(uri));
+        return guard->get();
+      });
+  library.AddFactory<const FilterPolicy>(
+      FilterPatternEntryWithBits(BloomFilterPolicy::kClassName())
+          .AnotherName(BloomFilterPolicy::kNickName())
+          .AddSuffix(":true"),
+      [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        const std::vector<std::string> vals = StringSplit(uri, ':');
+        double bits_per_key = ParseDouble(vals[1]);
+        // NOTE: This case previously configured the deprecated block-based
+        // filter, but old ways of configuring that now map to full filter. We
+        // defer to the corresponding API to ensure consistency in case that
+        // change is reverted.
+        guard->reset(NewBloomFilterPolicy(bits_per_key, true));
+        return guard->get();
+      });
+  library.AddFactory<const FilterPolicy>(
+      FilterPatternEntryWithBits(RibbonFilterPolicy::kClassName())
+          .AnotherName(RibbonFilterPolicy::kNickName()),
+      [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        const std::vector<std::string> vals = StringSplit(uri, ':');
+        double bits_per_key = ParseDouble(vals[1]);
+        guard->reset(NewRibbonFilterPolicy(bits_per_key));
+        return guard->get();
+      });
+  library.AddFactory<const FilterPolicy>(
+      FilterPatternEntryWithBits(RibbonFilterPolicy::kClassName())
+          .AnotherName(RibbonFilterPolicy::kNickName())
+          .AddNumber(":", true),
+      [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        const std::vector<std::string> vals = StringSplit(uri, ':');
+        double bits_per_key = ParseDouble(vals[1]);
+        int bloom_before_level = ParseInt(vals[2]);
+        guard->reset(NewRibbonFilterPolicy(bits_per_key, bloom_before_level));
+        return guard->get();
+      });
+  library.AddFactory<const FilterPolicy>(
+      FilterPatternEntryWithBits(test::LegacyBloomFilterPolicy::kClassName()),
+      [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(
+            NewBuiltinFilterPolicyWithBits<test::LegacyBloomFilterPolicy>(uri));
+        return guard->get();
+      });
+  library.AddFactory<const FilterPolicy>(
+      FilterPatternEntryWithBits(
+          test::FastLocalBloomFilterPolicy::kClassName()),
+      [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(
+            NewBuiltinFilterPolicyWithBits<test::FastLocalBloomFilterPolicy>(
+                uri));
+        return guard->get();
+      });
+  library.AddFactory<const FilterPolicy>(
+      FilterPatternEntryWithBits(
+          test::Standard128RibbonFilterPolicy::kClassName()),
+      [](const std::string& uri, std::unique_ptr<const FilterPolicy>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(
+            NewBuiltinFilterPolicyWithBits<test::Standard128RibbonFilterPolicy>(
+                uri));
+        return guard->get();
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+}  // namespace
+#endif  // ROCKSDB_LITE
+
+Status FilterPolicy::CreateFromString(
+    const ConfigOptions& options, const std::string& value,
+    std::shared_ptr<const FilterPolicy>* policy) {
+  if (value == kNullptrString || value.empty()) {
+    policy->reset();
+    return Status::OK();
+  } else if (value == ReadOnlyBuiltinFilterPolicy::kClassName()) {
+    *policy = std::make_shared<ReadOnlyBuiltinFilterPolicy>();
+    return Status::OK();
+  }
+
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status =
+      Customizable::GetOptionsMap(options, policy->get(), value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (id.empty()) {  // We have no Id but have options.  Not good
+    return Status::NotSupported("Cannot reset object ", id);
+  } else {
+#ifndef ROCKSDB_LITE
+    static std::once_flag loaded;
+    std::call_once(loaded, [&]() {
+      RegisterBuiltinFilterPolicies(*(ObjectLibrary::Default().get()), "");
+    });
+    status = options.registry->NewSharedObject(id, policy);
+#else
+    status =
+        Status::NotSupported("Cannot load filter policy in LITE mode ", value);
+#endif  // ROCKSDB_LITE
+  }
+  if (options.ignore_unsupported_options && status.IsNotSupported()) {
+    return Status::OK();
+  } else if (status.ok()) {
+    status = Customizable::ConfigureNewObject(
+        options, const_cast<FilterPolicy*>(policy->get()), opt_map);
+  }
+  return status;
+}
+
+const std::vector<std::string>& BloomLikeFilterPolicy::GetAllFixedImpls() {
+  STATIC_AVOID_DESTRUCTION(std::vector<std::string>, impls){
+      // Match filter_bench -impl=x ordering
+      test::LegacyBloomFilterPolicy::kClassName(),
+      test::FastLocalBloomFilterPolicy::kClassName(),
+      test::Standard128RibbonFilterPolicy::kClassName(),
+  };
+  return impls;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/filter_policy_internal.h b/src/rocksdb/table/block_based/filter_policy_internal.h
new file mode 100644
index 000000000..9bc3a2482
--- /dev/null
+++ b/src/rocksdb/table/block_based/filter_policy_internal.h
@@ -0,0 +1,340 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class that takes a bunch of keys, then generates filter
+class FilterBitsBuilder {
+ public:
+  virtual ~FilterBitsBuilder() {}
+
+  // Add a key (or prefix) to the filter. Typically, a builder will keep
+  // a set of 64-bit key hashes and only build the filter in Finish
+  // when the final number of keys is known. Keys are added in sorted order
+  // and duplicated keys are possible, so typically, the builder will
+  // only add this key if its hash is different from the most recently
+  // added.
+  virtual void AddKey(const Slice& key) = 0;
+
+  // Called by RocksDB before Finish to populate
+  // TableProperties::num_filter_entries, so should represent the
+  // number of unique keys (and/or prefixes) added, but does not have
+  // to be exact. `return 0;` may be used to conspicuously indicate "unknown".
+  virtual size_t EstimateEntriesAdded() = 0;
+
+  // Generate the filter using the keys that are added
+  // The return value of this function would be the filter bits,
+  // The ownership of actual data is set to buf
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0;
+
+  // Similar to Finish(std::unique_ptr<const char[]>* buf), except that
+  // for a non-null status pointer argument, it will point to
+  // Status::Corruption() when there is any corruption during filter
+  // construction or Status::OK() otherwise.
+  //
+  // WARNING: do not use a filter resulted from a corrupted construction
+  // TODO: refactor this to have a better signature, consolidate
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf,
+                       Status* /* status */) {
+    return Finish(buf);
+  }
+
+  // Verify the filter returned from calling FilterBitsBuilder::Finish.
+  // The function returns Status::Corruption() if there is any corruption in the
+  // constructed filter or Status::OK() otherwise.
+  //
+  // Implementations should normally consult
+  // FilterBuildingContext::table_options.detect_filter_construct_corruption
+  // to determine whether to perform verification or to skip by returning
+  // Status::OK(). The decision is left to the FilterBitsBuilder so that
+  // verification prerequisites before PostVerify can be skipped when not
+  // configured.
+  //
+  // RocksDB internal will always call MaybePostVerify() on the filter after
+  // it is returned from calling FilterBitsBuilder::Finish
+  // except for FilterBitsBuilder::Finish resulting a corruption
+  // status, which indicates the filter is already in a corrupted state and
+  // there is no need to post-verify
+  virtual Status MaybePostVerify(const Slice& /* filter_content */) {
+    return Status::OK();
+  }
+
+  // Approximate the number of keys that can be added and generate a filter
+  // <= the specified number of bytes. Callers (including RocksDB) should
+  // only use this result for optimizing performance and not as a guarantee.
+  virtual size_t ApproximateNumEntries(size_t bytes) = 0;
+};
+
+// A class that checks if a key can be in filter
+// It should be initialized by Slice generated by BitsBuilder
+class FilterBitsReader {
+ public:
+  virtual ~FilterBitsReader() {}
+
+  // Check if the entry match the bits in filter
+  virtual bool MayMatch(const Slice& entry) = 0;
+
+  // Check if an array of entries match the bits in filter
+  virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) {
+    for (int i = 0; i < num_keys; ++i) {
+      may_match[i] = MayMatch(*keys[i]);
+    }
+  }
+};
+
+// Exposes any extra information needed for testing built-in
+// FilterBitsBuilders
+class BuiltinFilterBitsBuilder : public FilterBitsBuilder {
+ public:
+  // Calculate number of bytes needed for a new filter, including
+  // metadata. Passing the result to ApproximateNumEntries should
+  // (ideally, usually) return >= the num_entry passed in.
+  // When optimize_filters_for_memory is enabled, this function
+  // is not authoritative but represents a target size that should
+  // be close to the average size.
+  virtual size_t CalculateSpace(size_t num_entries) = 0;
+
+  // Returns an estimate of the FP rate of the returned filter if
+  // `num_entries` keys are added and the filter returned by Finish
+  // is `bytes` bytes.
+  virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0;
+};
+
+// Base class for RocksDB built-in filter reader with
+// extra useful functionalities for inernal.
+class BuiltinFilterBitsReader : public FilterBitsReader {
+ public:
+  // Check if the hash of the entry match the bits in filter
+  virtual bool HashMayMatch(const uint64_t /* h */) { return true; }
+};
+
+// Base class for RocksDB built-in filter policies. This provides the
+// ability to read all kinds of built-in filters (so that old filters can
+// be used even when you change between built-in policies).
+class BuiltinFilterPolicy : public FilterPolicy {
+ public:  // overrides
+  // Read metadata to determine what kind of FilterBitsReader is needed
+  // and return a new one. This must successfully process any filter data
+  // generated by a built-in FilterBitsBuilder, regardless of the impl
+  // chosen for this BloomFilterPolicy.
+  FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override;
+  static const char* kClassName();
+  bool IsInstanceOf(const std::string& id) const override;
+  // All variants of BuiltinFilterPolicy can read each others filters.
+  const char* CompatibilityName() const override;
+  static const char* kCompatibilityName();
+
+ public:  // new
+  // An internal function for the implementation of
+  // BuiltinFilterBitsReader::GetFilterBitsReader without requiring an instance
+  // or working around potential virtual overrides.
+  static BuiltinFilterBitsReader* GetBuiltinFilterBitsReader(
+      const Slice& contents);
+
+  // Returns a new FilterBitsBuilder from the filter_policy in
+  // table_options of a context, or nullptr if not applicable.
+  // (An internal convenience function to save boilerplate.)
+  static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&);
+
+ private:
+  // For Bloom filter implementation(s)
+  static BuiltinFilterBitsReader* GetBloomBitsReader(const Slice& contents);
+
+  // For Ribbon filter implementation(s)
+  static BuiltinFilterBitsReader* GetRibbonBitsReader(const Slice& contents);
+};
+
+// A "read only" filter policy used for backward compatibility with old
+// OPTIONS files, which did not specifying a Bloom configuration, just
+// "rocksdb.BuiltinBloomFilter". Although this can read existing filters,
+// this policy does not build new filters, so new SST files generated
+// under the policy will get no filters (like nullptr FilterPolicy).
+// This class is considered internal API and subject to change.
+class ReadOnlyBuiltinFilterPolicy : public BuiltinFilterPolicy {
+ public:
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName();
+
+  // Does not write filters.
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const override {
+    return nullptr;
+  }
+};
+
+// RocksDB built-in filter policy for Bloom or Bloom-like filters including
+// Ribbon filters.
+// This class is considered internal API and subject to change.
+// See NewBloomFilterPolicy and NewRibbonFilterPolicy.
+class BloomLikeFilterPolicy : public BuiltinFilterPolicy {
+ public:
+  explicit BloomLikeFilterPolicy(double bits_per_key);
+
+  ~BloomLikeFilterPolicy() override;
+  static const char* kClassName();
+  bool IsInstanceOf(const std::string& id) const override;
+
+  std::string GetId() const override;
+
+  // Essentially for testing only: configured millibits/key
+  int GetMillibitsPerKey() const { return millibits_per_key_; }
+  // Essentially for testing only: legacy whole bits/key
+  int GetWholeBitsPerKey() const { return whole_bits_per_key_; }
+
+  // All the different underlying implementations that a BloomLikeFilterPolicy
+  // might use, as a configuration string name for a testing mode for
+  // "always use this implementation." Only appropriate for unit tests.
+  static const std::vector<std::string>& GetAllFixedImpls();
+
+  // Convenience function for creating by name for fixed impls
+  static std::shared_ptr<const FilterPolicy> Create(const std::string& name,
+                                                    double bits_per_key);
+
+ protected:
+  // Some implementations used by aggregating policies
+  FilterBitsBuilder* GetLegacyBloomBuilderWithContext(
+      const FilterBuildingContext& context) const;
+  FilterBitsBuilder* GetFastLocalBloomBuilderWithContext(
+      const FilterBuildingContext& context) const;
+  FilterBitsBuilder* GetStandard128RibbonBuilderWithContext(
+      const FilterBuildingContext& context) const;
+
+  std::string GetBitsPerKeySuffix() const;
+
+ private:
+  // Bits per key settings are for configuring Bloom filters.
+
+  // Newer filters support fractional bits per key. For predictable behavior
+  // of 0.001-precision values across floating point implementations, we
+  // round to thousandths of a bit (on average) per key.
+  int millibits_per_key_;
+
+  // Older filters round to whole number bits per key. (There *should* be no
+  // compatibility issue with fractional bits per key, but preserving old
+  // behavior with format_version < 5 just in case.)
+  int whole_bits_per_key_;
+
+  // For configuring Ribbon filter: a desired value for 1/fp_rate. For
+  // example, 100 -> 1% fp rate.
+  double desired_one_in_fp_rate_;
+
+  // Whether relevant warnings have been logged already. (Remember so we
+  // only report once per BloomFilterPolicy instance, to keep the noise down.)
+  mutable std::atomic<bool> warned_;
+
+  // State for implementing optimize_filters_for_memory. Essentially, this
+  // tracks a surplus or deficit in total FP rate of filters generated by
+  // builders under this policy vs. what would have been generated without
+  // optimize_filters_for_memory.
+  //
+  // To avoid floating point weirdness, the actual value is
+  //  Sum over all generated filters f:
+  //   (predicted_fp_rate(f) - predicted_fp_rate(f|o_f_f_m=false)) * 2^32
+  mutable std::atomic<int64_t> aggregate_rounding_balance_;
+};
+
+// For NewBloomFilterPolicy
+//
+// This is a user-facing policy that automatically choose between
+// LegacyBloom and FastLocalBloom based on context at build time,
+// including compatibility with format_version.
+class BloomFilterPolicy : public BloomLikeFilterPolicy {
+ public:
+  explicit BloomFilterPolicy(double bits_per_key);
+
+  // To use this function, call BuiltinFilterPolicy::GetBuilderFromContext().
+  //
+  // Neither the context nor any objects therein should be saved beyond
+  // the call to this function, unless it's shared_ptr.
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const override;
+
+  static const char* kClassName();
+  const char* Name() const override { return kClassName(); }
+  static const char* kNickName();
+  const char* NickName() const override { return kNickName(); }
+  std::string GetId() const override;
+};
+
+// For NewRibbonFilterPolicy
+//
+// This is a user-facing policy that chooses between Standard128Ribbon
+// and FastLocalBloom based on context at build time (LSM level and other
+// factors in extreme cases).
+class RibbonFilterPolicy : public BloomLikeFilterPolicy {
+ public:
+  explicit RibbonFilterPolicy(double bloom_equivalent_bits_per_key,
+                              int bloom_before_level);
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const override;
+
+  int GetBloomBeforeLevel() const { return bloom_before_level_; }
+
+  static const char* kClassName();
+  const char* Name() const override { return kClassName(); }
+  static const char* kNickName();
+  const char* NickName() const override { return kNickName(); }
+  std::string GetId() const override;
+
+ private:
+  const int bloom_before_level_;
+};
+
+// For testing only, but always constructable with internal names
+namespace test {
+
+class LegacyBloomFilterPolicy : public BloomLikeFilterPolicy {
+ public:
+  explicit LegacyBloomFilterPolicy(double bits_per_key)
+      : BloomLikeFilterPolicy(bits_per_key) {}
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext& context) const override;
+
+  static const char* kClassName();
+  const char* Name() const override { return kClassName(); }
+};
+
+class FastLocalBloomFilterPolicy : public BloomLikeFilterPolicy {
+ public:
+  explicit FastLocalBloomFilterPolicy(double bits_per_key)
+      : BloomLikeFilterPolicy(bits_per_key) {}
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext& context) const override;
+
+  static const char* kClassName();
+  const char* Name() const override { return kClassName(); }
+};
+
+class Standard128RibbonFilterPolicy : public BloomLikeFilterPolicy {
+ public:
+  explicit Standard128RibbonFilterPolicy(double bloom_equiv_bits_per_key)
+      : BloomLikeFilterPolicy(bloom_equiv_bits_per_key) {}
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext& context) const override;
+
+  static const char* kClassName();
+  const char* Name() const override { return kClassName(); }
+};
+
+}  // namespace test
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/flush_block_policy.cc b/src/rocksdb/table/block_based/flush_block_policy.cc
new file mode 100644
index 000000000..9bb1f334b
--- /dev/null
+++ b/src/rocksdb/table/block_based/flush_block_policy.cc
@@ -0,0 +1,146 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/flush_block_policy.h"
+
+#include <cassert>
+#include <mutex>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/flush_block_policy.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Flush block by size
+class FlushBlockBySizePolicy : public FlushBlockPolicy {
+ public:
+  // @params block_size:           Approximate size of user data packed per
+  //                               block.
+  // @params block_size_deviation: This is used to close a block before it
+  //                               reaches the configured
+  FlushBlockBySizePolicy(const uint64_t block_size,
+                         const uint64_t block_size_deviation, const bool align,
+                         const BlockBuilder& data_block_builder)
+      : block_size_(block_size),
+        block_size_deviation_limit_(
+            ((block_size * (100 - block_size_deviation)) + 99) / 100),
+        align_(align),
+        data_block_builder_(data_block_builder) {}
+
+  bool Update(const Slice& key, const Slice& value) override {
+    // it makes no sense to flush when the data block is empty
+    if (data_block_builder_.empty()) {
+      return false;
+    }
+
+    auto curr_size = data_block_builder_.CurrentSizeEstimate();
+
+    // Do flush if one of the below two conditions is true:
+    // 1) if the current estimated size already exceeds the block size,
+    // 2) block_size_deviation is set and the estimated size after appending
+    // the kv will exceed the block size and the current size is under the
+    // the deviation.
+    return curr_size >= block_size_ || BlockAlmostFull(key, value);
+  }
+
+ private:
+  bool BlockAlmostFull(const Slice& key, const Slice& value) const {
+    if (block_size_deviation_limit_ == 0) {
+      return false;
+    }
+
+    const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+    auto estimated_size_after =
+        data_block_builder_.EstimateSizeAfterKV(key, value);
+
+    if (align_) {
+      estimated_size_after += BlockBasedTable::kBlockTrailerSize;
+      return estimated_size_after > block_size_;
+    }
+
+    return estimated_size_after > block_size_ &&
+           curr_size > block_size_deviation_limit_;
+  }
+
+  const uint64_t block_size_;
+  const uint64_t block_size_deviation_limit_;
+  const bool align_;
+  const BlockBuilder& data_block_builder_;
+};
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+    const BlockBasedTableOptions& table_options,
+    const BlockBuilder& data_block_builder) const {
+  return new FlushBlockBySizePolicy(
+      table_options.block_size, table_options.block_size_deviation,
+      table_options.block_align, data_block_builder);
+}
+
+FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+    const uint64_t size, const int deviation,
+    const BlockBuilder& data_block_builder) {
+  return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder);
+}
+
+#ifndef ROCKSDB_LITE
+static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library,
+                                             const std::string& /*arg*/) {
+  library.AddFactory<FlushBlockPolicyFactory>(
+      FlushBlockBySizePolicyFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<FlushBlockPolicyFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new FlushBlockBySizePolicyFactory());
+        return guard->get();
+      });
+  library.AddFactory<FlushBlockPolicyFactory>(
+      FlushBlockEveryKeyPolicyFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<FlushBlockPolicyFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new FlushBlockEveryKeyPolicyFactory());
+        return guard->get();
+      });
+  return 2;
+}
+#endif  // ROCKSDB_LITE
+
+static bool LoadFlushPolicyFactory(
+    const std::string& id, std::shared_ptr<FlushBlockPolicyFactory>* result) {
+  if (id.empty()) {
+    result->reset(new FlushBlockBySizePolicyFactory());
+#ifdef ROCKSDB_LITE
+  } else if (id == FlushBlockBySizePolicyFactory::kClassName()) {
+    result->reset(new FlushBlockBySizePolicyFactory());
+  } else if (id == FlushBlockEveryKeyPolicyFactory::kClassName()) {
+    result->reset(new FlushBlockEveryKeyPolicyFactory());
+#endif  // ROCKSDB_LITE
+  } else {
+    return false;
+  }
+  return true;
+}
+
+FlushBlockBySizePolicyFactory::FlushBlockBySizePolicyFactory()
+    : FlushBlockPolicyFactory() {}
+
+Status FlushBlockPolicyFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<FlushBlockPolicyFactory>* factory) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterFlushBlockPolicyFactories(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  return LoadSharedObject<FlushBlockPolicyFactory>(
+      config_options, value, LoadFlushPolicyFactory, factory);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/flush_block_policy.h b/src/rocksdb/table/block_based/flush_block_policy.h
new file mode 100644
index 000000000..4f79682bc
--- /dev/null
+++ b/src/rocksdb/table/block_based/flush_block_policy.h
@@ -0,0 +1,40 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/flush_block_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// FlushBlockEveryKeyPolicy currently used only in tests.
+
+class FlushBlockEveryKeyPolicy : public FlushBlockPolicy {
+ public:
+  bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+    if (!start_) {
+      start_ = true;
+      return false;
+    }
+    return true;
+  }
+
+ private:
+  bool start_ = false;
+};
+
+class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  explicit FlushBlockEveryKeyPolicyFactory() {}
+
+  static const char* kClassName() { return "FlushBlockEveryKeyPolicyFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& /*table_options*/,
+      const BlockBuilder& /*data_block_builder*/) const override {
+    return new FlushBlockEveryKeyPolicy;
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block.cc b/src/rocksdb/table/block_based/full_filter_block.cc
new file mode 100644
index 000000000..62b7a9eca
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block.cc
@@ -0,0 +1,296 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/full_filter_block.h"
+
+#include <array>
+
+#include "block_type.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FullFilterBlockBuilder::FullFilterBlockBuilder(
+    const SliceTransform* _prefix_extractor, bool whole_key_filtering,
+    FilterBitsBuilder* filter_bits_builder)
+    : prefix_extractor_(_prefix_extractor),
+      whole_key_filtering_(whole_key_filtering),
+      last_whole_key_recorded_(false),
+      last_prefix_recorded_(false),
+      last_key_in_domain_(false),
+      any_added_(false) {
+  assert(filter_bits_builder != nullptr);
+  filter_bits_builder_.reset(filter_bits_builder);
+}
+
+size_t FullFilterBlockBuilder::EstimateEntriesAdded() {
+  return filter_bits_builder_->EstimateEntriesAdded();
+}
+
+void FullFilterBlockBuilder::Add(const Slice& key_without_ts) {
+  const bool add_prefix =
+      prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts);
+
+  if (!last_prefix_recorded_ && last_key_in_domain_) {
+    // We can reach here when a new filter partition starts in partitioned
+    // filter. The last prefix in the previous partition should be added if
+    // necessary regardless of key_without_ts, to support prefix SeekForPrev.
+    AddKey(last_prefix_str_);
+    last_prefix_recorded_ = true;
+  }
+
+  if (whole_key_filtering_) {
+    if (!add_prefix) {
+      AddKey(key_without_ts);
+    } else {
+      // if both whole_key and prefix are added to bloom then we will have whole
+      // key_without_ts and prefix addition being interleaved and thus cannot
+      // rely on the bits builder to properly detect the duplicates by comparing
+      // with the last item.
+      Slice last_whole_key = Slice(last_whole_key_str_);
+      if (!last_whole_key_recorded_ ||
+          last_whole_key.compare(key_without_ts) != 0) {
+        AddKey(key_without_ts);
+        last_whole_key_recorded_ = true;
+        last_whole_key_str_.assign(key_without_ts.data(),
+                                   key_without_ts.size());
+      }
+    }
+  }
+  if (add_prefix) {
+    last_key_in_domain_ = true;
+    AddPrefix(key_without_ts);
+  } else {
+    last_key_in_domain_ = false;
+  }
+}
+
+// Add key to filter if needed
+inline void FullFilterBlockBuilder::AddKey(const Slice& key) {
+  filter_bits_builder_->AddKey(key);
+  any_added_ = true;
+}
+
+// Add prefix to filter if needed
+void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
+  assert(prefix_extractor_ && prefix_extractor_->InDomain(key));
+  Slice prefix = prefix_extractor_->Transform(key);
+  if (whole_key_filtering_) {
+    // if both whole_key and prefix are added to bloom then we will have whole
+    // key and prefix addition being interleaved and thus cannot rely on the
+    // bits builder to properly detect the duplicates by comparing with the last
+    // item.
+    Slice last_prefix = Slice(last_prefix_str_);
+    if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) {
+      AddKey(prefix);
+      last_prefix_recorded_ = true;
+      last_prefix_str_.assign(prefix.data(), prefix.size());
+    }
+  } else {
+    AddKey(prefix);
+  }
+}
+
+void FullFilterBlockBuilder::Reset() {
+  last_whole_key_recorded_ = false;
+  last_prefix_recorded_ = false;
+}
+
+Slice FullFilterBlockBuilder::Finish(
+    const BlockHandle& /*tmp*/, Status* status,
+    std::unique_ptr<const char[]>* filter_data) {
+  Reset();
+  // In this impl we ignore BlockHandle
+  *status = Status::OK();
+  if (any_added_) {
+    any_added_ = false;
+    Slice filter_content = filter_bits_builder_->Finish(
+        filter_data ? filter_data : &filter_data_, status);
+    return filter_content;
+  }
+  return Slice();
+}
+
+FullFilterBlockReader::FullFilterBlockReader(
+    const BlockBasedTable* t,
+    CachableEntry<ParsedFullFilterBlock>&& filter_block)
+    : FilterBlockReaderCommon(t, std::move(filter_block)) {}
+
+bool FullFilterBlockReader::KeyMayMatch(const Slice& key, const bool no_io,
+                                        const Slice* const /*const_ikey_ptr*/,
+                                        GetContext* get_context,
+                                        BlockCacheLookupContext* lookup_context,
+                                        Env::IOPriority rate_limiter_priority) {
+  if (!whole_key_filtering()) {
+    return true;
+  }
+  return MayMatch(key, no_io, get_context, lookup_context,
+                  rate_limiter_priority);
+}
+
+std::unique_ptr<FilterBlockReader> FullFilterBlockReader::Create(
+    const BlockBasedTable* table, const ReadOptions& ro,
+    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+    bool pin, BlockCacheLookupContext* lookup_context) {
+  assert(table);
+  assert(table->get_rep());
+  assert(!pin || prefetch);
+
+  CachableEntry<ParsedFullFilterBlock> filter_block;
+  if (prefetch || !use_cache) {
+    const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache,
+                                     nullptr /* get_context */, lookup_context,
+                                     &filter_block, BlockType::kFilter);
+    if (!s.ok()) {
+      IGNORE_STATUS_IF_ERROR(s);
+      return std::unique_ptr<FilterBlockReader>();
+    }
+
+    if (use_cache && !pin) {
+      filter_block.Reset();
+    }
+  }
+
+  return std::unique_ptr<FilterBlockReader>(
+      new FullFilterBlockReader(table, std::move(filter_block)));
+}
+
+bool FullFilterBlockReader::PrefixMayMatch(
+    const Slice& prefix, const bool no_io,
+    const Slice* const /*const_ikey_ptr*/, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) {
+  return MayMatch(prefix, no_io, get_context, lookup_context,
+                  rate_limiter_priority);
+}
+
+bool FullFilterBlockReader::MayMatch(
+    const Slice& entry, bool no_io, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) const {
+  CachableEntry<ParsedFullFilterBlock> filter_block;
+
+  const Status s =
+      GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block,
+                           BlockType::kFilter, rate_limiter_priority);
+  if (!s.ok()) {
+    IGNORE_STATUS_IF_ERROR(s);
+    return true;
+  }
+
+  assert(filter_block.GetValue());
+
+  FilterBitsReader* const filter_bits_reader =
+      filter_block.GetValue()->filter_bits_reader();
+
+  if (filter_bits_reader) {
+    if (filter_bits_reader->MayMatch(entry)) {
+      PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+      return true;
+    } else {
+      PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+      return false;
+    }
+  }
+  return true;
+}
+
+void FullFilterBlockReader::KeysMayMatch(
+    MultiGetRange* range, const bool no_io,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) {
+  if (!whole_key_filtering()) {
+    // Simply return. Don't skip any key - consider all keys as likely to be
+    // present
+    return;
+  }
+  MayMatch(range, no_io, nullptr, lookup_context, rate_limiter_priority);
+}
+
+void FullFilterBlockReader::PrefixesMayMatch(
+    MultiGetRange* range, const SliceTransform* prefix_extractor,
+    const bool no_io, BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) {
+  MayMatch(range, no_io, prefix_extractor, lookup_context,
+           rate_limiter_priority);
+}
+
+void FullFilterBlockReader::MayMatch(
+    MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) const {
+  CachableEntry<ParsedFullFilterBlock> filter_block;
+
+  const Status s = GetOrReadFilterBlock(
+      no_io, range->begin()->get_context, lookup_context, &filter_block,
+      BlockType::kFilter, rate_limiter_priority);
+  if (!s.ok()) {
+    IGNORE_STATUS_IF_ERROR(s);
+    return;
+  }
+
+  assert(filter_block.GetValue());
+
+  FilterBitsReader* const filter_bits_reader =
+      filter_block.GetValue()->filter_bits_reader();
+
+  if (!filter_bits_reader) {
+    return;
+  }
+
+  // We need to use an array instead of autovector for may_match since
+  // &may_match[0] doesn't work for autovector<bool> (compiler error). So
+  // declare both keys and may_match as arrays, which is also slightly less
+  // expensive compared to autovector
+  std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
+  std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
+  autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
+  int num_keys = 0;
+  MultiGetRange filter_range(*range, range->begin(), range->end());
+  for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) {
+    if (!prefix_extractor) {
+      keys[num_keys++] = &iter->ukey_without_ts;
+    } else if (prefix_extractor->InDomain(iter->ukey_without_ts)) {
+      prefixes.emplace_back(prefix_extractor->Transform(iter->ukey_without_ts));
+      keys[num_keys++] = &prefixes.back();
+    } else {
+      filter_range.SkipKey(iter);
+    }
+  }
+
+  filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]);
+
+  int i = 0;
+  for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) {
+    if (!may_match[i]) {
+      // Update original MultiGet range to skip this key. The filter_range
+      // was temporarily used just to skip keys not in prefix_extractor domain
+      range->SkipKey(iter);
+      PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+    } else {
+      // PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+      PerfContext* perf_ctx = get_perf_context();
+      perf_ctx->bloom_sst_hit_count++;
+    }
+    ++i;
+  }
+}
+
+size_t FullFilterBlockReader::ApproximateMemoryUsage() const {
+  size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size(const_cast<FullFilterBlockReader*>(this));
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  return usage;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block.h b/src/rocksdb/table/block_based/full_filter_block.h
new file mode 100644
index 000000000..cd1771a38
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block.h
@@ -0,0 +1,147 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FilterPolicy;
+class FilterBitsBuilder;
+class FilterBitsReader;
+
+// A FullFilterBlockBuilder is used to construct a full filter for a
+// particular Table.  It generates a single string which is stored as
+// a special block in the Table.
+// The format of full filter block is:
+// +----------------------------------------------------------------+
+// |              full filter for all keys in sst file              |
+// +----------------------------------------------------------------+
+// The full filter can be very large. At the end of it, we put
+// num_probes: how many hash functions are used in bloom filter
+//
+class FullFilterBlockBuilder : public FilterBlockBuilder {
+ public:
+  explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor,
+                                  bool whole_key_filtering,
+                                  FilterBitsBuilder* filter_bits_builder);
+  // No copying allowed
+  FullFilterBlockBuilder(const FullFilterBlockBuilder&) = delete;
+  void operator=(const FullFilterBlockBuilder&) = delete;
+
+  // bits_builder is created in filter_policy, it should be passed in here
+  // directly. and be deleted here
+  ~FullFilterBlockBuilder() {}
+
+  virtual void Add(const Slice& key_without_ts) override;
+  virtual bool IsEmpty() const override { return !any_added_; }
+  virtual size_t EstimateEntriesAdded() override;
+  virtual Slice Finish(
+      const BlockHandle& tmp, Status* status,
+      std::unique_ptr<const char[]>* filter_data = nullptr) override;
+  using FilterBlockBuilder::Finish;
+
+  virtual void ResetFilterBitsBuilder() override {
+    filter_bits_builder_.reset();
+  }
+
+  virtual Status MaybePostVerifyFilter(const Slice& filter_content) override {
+    return filter_bits_builder_->MaybePostVerify(filter_content);
+  }
+
+ protected:
+  virtual void AddKey(const Slice& key);
+  std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
+  virtual void Reset();
+  void AddPrefix(const Slice& key);
+  const SliceTransform* prefix_extractor() { return prefix_extractor_; }
+  const std::string& last_prefix_str() const { return last_prefix_str_; }
+
+ private:
+  // important: all of these might point to invalid addresses
+  // at the time of destruction of this filter block. destructor
+  // should NOT dereference them.
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  bool last_whole_key_recorded_;
+  std::string last_whole_key_str_;
+  bool last_prefix_recorded_;
+  std::string last_prefix_str_;
+  // Whether prefix_extractor_->InDomain(last_whole_key_) is true.
+  // Used in partitioned filters so that the last prefix from the previous
+  // filter partition will be added to the current partition if
+  // last_key_in_domain_ is true, regardless of the current key.
+  bool last_key_in_domain_;
+  bool any_added_;
+  std::unique_ptr<const char[]> filter_data_;
+};
+
+// A FilterBlockReader is used to parse filter from SST table.
+// KeyMayMatch and PrefixMayMatch would trigger filter checking
+class FullFilterBlockReader
+    : public FilterBlockReaderCommon<ParsedFullFilterBlock> {
+ public:
+  FullFilterBlockReader(const BlockBasedTable* t,
+                        CachableEntry<ParsedFullFilterBlock>&& filter_block);
+
+  static std::unique_ptr<FilterBlockReader> Create(
+      const BlockBasedTable* table, const ReadOptions& ro,
+      FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+      bool pin, BlockCacheLookupContext* lookup_context);
+
+  bool KeyMayMatch(const Slice& key, const bool no_io,
+                   const Slice* const const_ikey_ptr, GetContext* get_context,
+                   BlockCacheLookupContext* lookup_context,
+                   Env::IOPriority rate_limiter_priority) override;
+
+  bool PrefixMayMatch(const Slice& prefix, const bool no_io,
+                      const Slice* const const_ikey_ptr,
+                      GetContext* get_context,
+                      BlockCacheLookupContext* lookup_context,
+                      Env::IOPriority rate_limiter_priority) override;
+
+  void KeysMayMatch(MultiGetRange* range, const bool no_io,
+                    BlockCacheLookupContext* lookup_context,
+                    Env::IOPriority rate_limiter_priority) override;
+  // Used in partitioned filter code
+  void KeysMayMatch2(MultiGetRange* range,
+                     const SliceTransform* /*prefix_extractor*/,
+                     const bool no_io, BlockCacheLookupContext* lookup_context,
+                     Env::IOPriority rate_limiter_priority) {
+    KeysMayMatch(range, no_io, lookup_context, rate_limiter_priority);
+  }
+
+  void PrefixesMayMatch(MultiGetRange* range,
+                        const SliceTransform* prefix_extractor,
+                        const bool no_io,
+                        BlockCacheLookupContext* lookup_context,
+                        Env::IOPriority rate_limiter_priority) override;
+  size_t ApproximateMemoryUsage() const override;
+
+ private:
+  bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context,
+                BlockCacheLookupContext* lookup_context,
+                Env::IOPriority rate_limiter_priority) const;
+  void MayMatch(MultiGetRange* range, bool no_io,
+                const SliceTransform* prefix_extractor,
+                BlockCacheLookupContext* lookup_context,
+                Env::IOPriority rate_limiter_priority) const;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/full_filter_block_test.cc b/src/rocksdb/table/block_based/full_filter_block_test.cc
new file mode 100644
index 000000000..bd98638e5
--- /dev/null
+++ b/src/rocksdb/table/block_based/full_filter_block_test.cc
@@ -0,0 +1,339 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/full_filter_block.h"
+
+#include <set>
+
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/status.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/mock_block_based_table.h"
+#include "table/format.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFilterBitsBuilder : public FilterBitsBuilder {
+ public:
+  explicit TestFilterBitsBuilder() {}
+
+  // Add Key to filter
+  void AddKey(const Slice& key) override {
+    hash_entries_.push_back(Hash(key.data(), key.size(), 1));
+  }
+
+  using FilterBitsBuilder::Finish;
+
+  // Generate the filter using the keys that are added
+  Slice Finish(std::unique_ptr<const char[]>* buf) override {
+    uint32_t len = static_cast<uint32_t>(hash_entries_.size()) * 4;
+    char* data = new char[len];
+    for (size_t i = 0; i < hash_entries_.size(); i++) {
+      EncodeFixed32(data + i * 4, hash_entries_[i]);
+    }
+    const char* const_data = data;
+    buf->reset(const_data);
+    return Slice(data, len);
+  }
+
+  size_t EstimateEntriesAdded() override { return hash_entries_.size(); }
+
+  size_t ApproximateNumEntries(size_t bytes) override { return bytes / 4; }
+
+ private:
+  std::vector<uint32_t> hash_entries_;
+};
+
+class MockBlockBasedTable : public BlockBasedTable {
+ public:
+  explicit MockBlockBasedTable(Rep* rep)
+      : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {}
+};
+
+class TestFilterBitsReader : public FilterBitsReader {
+ public:
+  explicit TestFilterBitsReader(const Slice& contents)
+      : data_(contents.data()), len_(static_cast<uint32_t>(contents.size())) {}
+
+  // Silence compiler warning about overloaded virtual
+  using FilterBitsReader::MayMatch;
+  bool MayMatch(const Slice& entry) override {
+    uint32_t h = Hash(entry.data(), entry.size(), 1);
+    for (size_t i = 0; i + 4 <= len_; i += 4) {
+      if (h == DecodeFixed32(data_ + i)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+ private:
+  const char* data_;
+  uint32_t len_;
+};
+
+class TestHashFilter : public FilterPolicy {
+ public:
+  const char* Name() const override { return "TestHashFilter"; }
+  const char* CompatibilityName() const override { return Name(); }
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext&) const override {
+    return new TestFilterBitsBuilder();
+  }
+
+  FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+    return new TestFilterBitsReader(contents);
+  }
+};
+
+class PluginFullFilterBlockTest : public mock::MockBlockBasedTableTester,
+                                  public testing::Test {
+ public:
+  PluginFullFilterBlockTest()
+      : mock::MockBlockBasedTableTester(new TestHashFilter) {}
+};
+
+TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) {
+  FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+  Slice slice = builder.Finish();
+  ASSERT_EQ("", EscapeString(slice));
+
+  CachableEntry<ParsedFullFilterBlock> block(
+      new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+                                BlockContents(slice)),
+      nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+  FullFilterBlockReader reader(table_.get(), std::move(block));
+  // Remain same symantic with blockbased filter
+  ASSERT_TRUE(reader.KeyMayMatch("foo",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr, Env::IO_TOTAL));
+}
+
+TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) {
+  FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+  builder.Add("foo");
+  builder.Add("bar");
+  builder.Add("box");
+  builder.Add("box");
+  builder.Add("hello");
+  Slice slice = builder.Finish();
+
+  CachableEntry<ParsedFullFilterBlock> block(
+      new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+                                BlockContents(slice)),
+      nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+  FullFilterBlockReader reader(table_.get(), std::move(block));
+  Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+  ASSERT_TRUE(reader.KeyMayMatch("foo",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(reader.KeyMayMatch("bar",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(reader.KeyMayMatch("box",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(reader.KeyMayMatch("hello",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(reader.KeyMayMatch("foo",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(!reader.KeyMayMatch("missing",
+                                  /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                  /*get_context=*/nullptr,
+                                  /*lookup_context=*/nullptr,
+                                  rate_limiter_priority));
+  ASSERT_TRUE(!reader.KeyMayMatch("other",
+                                  /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                  /*get_context=*/nullptr,
+                                  /*lookup_context=*/nullptr,
+                                  rate_limiter_priority));
+}
+
+class FullFilterBlockTest : public mock::MockBlockBasedTableTester,
+                            public testing::Test {
+ public:
+  FullFilterBlockTest()
+      : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, false)) {}
+};
+
+TEST_F(FullFilterBlockTest, EmptyBuilder) {
+  FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+  Slice slice = builder.Finish();
+  ASSERT_EQ("", EscapeString(slice));
+
+  CachableEntry<ParsedFullFilterBlock> block(
+      new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+                                BlockContents(slice)),
+      nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+  FullFilterBlockReader reader(table_.get(), std::move(block));
+  // Remain same symantic with blockbased filter
+  ASSERT_TRUE(reader.KeyMayMatch("foo",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr, Env::IO_TOTAL));
+}
+
+class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder {
+  std::unique_ptr<FilterBitsBuilder> b_;
+  std::set<std::string> uniq_;
+
+ public:
+  explicit CountUniqueFilterBitsBuilderWrapper(FilterBitsBuilder* b) : b_(b) {}
+
+  ~CountUniqueFilterBitsBuilderWrapper() override {}
+
+  void AddKey(const Slice& key) override {
+    b_->AddKey(key);
+    uniq_.insert(key.ToString());
+  }
+
+  using FilterBitsBuilder::Finish;
+
+  Slice Finish(std::unique_ptr<const char[]>* buf) override {
+    Slice rv = b_->Finish(buf);
+    Status s_dont_care = b_->MaybePostVerify(rv);
+    s_dont_care.PermitUncheckedError();
+    uniq_.clear();
+    return rv;
+  }
+
+  size_t EstimateEntriesAdded() override { return b_->EstimateEntriesAdded(); }
+
+  size_t ApproximateNumEntries(size_t bytes) override {
+    return b_->ApproximateNumEntries(bytes);
+  }
+
+  size_t CountUnique() { return uniq_.size(); }
+};
+
+TEST_F(FullFilterBlockTest, DuplicateEntries) {
+  {  // empty prefixes
+    std::unique_ptr<const SliceTransform> prefix_extractor(
+        NewFixedPrefixTransform(0));
+    auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder());
+    const bool WHOLE_KEY = true;
+    FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY,
+                                   bits_builder);
+    ASSERT_EQ(0, bits_builder->CountUnique());
+    // adds key and empty prefix; both abstractions count them
+    builder.Add("key1");
+    ASSERT_EQ(2, bits_builder->CountUnique());
+    // Add different key (unique) and also empty prefix (not unique).
+    // From here in this test, it's immaterial whether the block builder
+    // can count unique keys.
+    builder.Add("key2");
+    ASSERT_EQ(3, bits_builder->CountUnique());
+    // Empty key -> nothing unique
+    builder.Add("");
+    ASSERT_EQ(3, bits_builder->CountUnique());
+  }
+
+  // mix of empty and non-empty
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(7));
+  auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder());
+  const bool WHOLE_KEY = true;
+  FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY,
+                                 bits_builder);
+  builder.Add("");  // test with empty key too
+  builder.Add("prefix1key1");
+  builder.Add("prefix1key1");
+  builder.Add("prefix1key2");
+  builder.Add("prefix1key3");
+  builder.Add("prefix2key4");
+  // 1 empty, 2 non-empty prefixes, and 4 non-empty keys
+  ASSERT_EQ(1 + 2 + 4, bits_builder->CountUnique());
+}
+
+TEST_F(FullFilterBlockTest, SingleChunk) {
+  FullFilterBlockBuilder builder(nullptr, true, GetBuilder());
+  ASSERT_TRUE(builder.IsEmpty());
+  builder.Add("foo");
+  ASSERT_FALSE(builder.IsEmpty());
+  builder.Add("bar");
+  builder.Add("box");
+  builder.Add("box");
+  builder.Add("hello");
+  // "box" only counts once
+  ASSERT_EQ(4, builder.EstimateEntriesAdded());
+  ASSERT_FALSE(builder.IsEmpty());
+  Status s;
+  Slice slice = builder.Finish(BlockHandle(), &s);
+  ASSERT_OK(s);
+
+  CachableEntry<ParsedFullFilterBlock> block(
+      new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+                                BlockContents(slice)),
+      nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+
+  FullFilterBlockReader reader(table_.get(), std::move(block));
+  Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+  ASSERT_TRUE(reader.KeyMayMatch("foo",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(reader.KeyMayMatch("bar",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(reader.KeyMayMatch("box",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(reader.KeyMayMatch("hello",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(reader.KeyMayMatch("foo",
+                                 /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                 /*get_context=*/nullptr,
+                                 /*lookup_context=*/nullptr,
+                                 rate_limiter_priority));
+  ASSERT_TRUE(!reader.KeyMayMatch("missing",
+                                  /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                  /*get_context=*/nullptr,
+                                  /*lookup_context=*/nullptr,
+                                  rate_limiter_priority));
+  ASSERT_TRUE(!reader.KeyMayMatch("other",
+                                  /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                                  /*get_context=*/nullptr,
+                                  /*lookup_context=*/nullptr,
+                                  rate_limiter_priority));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/hash_index_reader.cc b/src/rocksdb/table/block_based/hash_index_reader.cc
new file mode 100644
index 000000000..bcaba17a2
--- /dev/null
+++ b/src/rocksdb/table/block_based/hash_index_reader.cc
@@ -0,0 +1,148 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/hash_index_reader.h"
+
+#include "table/block_fetcher.h"
+#include "table/meta_blocks.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status HashIndexReader::Create(const BlockBasedTable* table,
+                               const ReadOptions& ro,
+                               FilePrefetchBuffer* prefetch_buffer,
+                               InternalIterator* meta_index_iter,
+                               bool use_cache, bool prefetch, bool pin,
+                               BlockCacheLookupContext* lookup_context,
+                               std::unique_ptr<IndexReader>* index_reader) {
+  assert(table != nullptr);
+  assert(index_reader != nullptr);
+  assert(!pin || prefetch);
+
+  const BlockBasedTable::Rep* rep = table->get_rep();
+  assert(rep != nullptr);
+
+  CachableEntry<Block> index_block;
+  if (prefetch || !use_cache) {
+    const Status s =
+        ReadIndexBlock(table, prefetch_buffer, ro, use_cache,
+                       /*get_context=*/nullptr, lookup_context, &index_block);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (use_cache && !pin) {
+      index_block.Reset();
+    }
+  }
+
+  // Note, failure to create prefix hash index does not need to be a
+  // hard error. We can still fall back to the original binary search index.
+  // So, Create will succeed regardless, from this point on.
+
+  index_reader->reset(new HashIndexReader(table, std::move(index_block)));
+
+  // Get prefixes block
+  BlockHandle prefixes_handle;
+  Status s =
+      FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, &prefixes_handle);
+  if (!s.ok()) {
+    // TODO: log error
+    return Status::OK();
+  }
+
+  // Get index metadata block
+  BlockHandle prefixes_meta_handle;
+  s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
+                    &prefixes_meta_handle);
+  if (!s.ok()) {
+    // TODO: log error
+    return Status::OK();
+  }
+
+  RandomAccessFileReader* const file = rep->file.get();
+  const Footer& footer = rep->footer;
+  const ImmutableOptions& ioptions = rep->ioptions;
+  const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
+  MemoryAllocator* const memory_allocator =
+      GetMemoryAllocator(rep->table_options);
+
+  // Read contents for the blocks
+  BlockContents prefixes_contents;
+  BlockFetcher prefixes_block_fetcher(
+      file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
+      &prefixes_contents, ioptions, true /*decompress*/,
+      true /*maybe_compressed*/, BlockType::kHashIndexPrefixes,
+      UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
+  s = prefixes_block_fetcher.ReadBlockContents();
+  if (!s.ok()) {
+    return s;
+  }
+  BlockContents prefixes_meta_contents;
+  BlockFetcher prefixes_meta_block_fetcher(
+      file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
+      &prefixes_meta_contents, ioptions, true /*decompress*/,
+      true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
+      UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
+  s = prefixes_meta_block_fetcher.ReadBlockContents();
+  if (!s.ok()) {
+    // TODO: log error
+    return Status::OK();
+  }
+
+  BlockPrefixIndex* prefix_index = nullptr;
+  assert(rep->table_prefix_extractor);
+  s = BlockPrefixIndex::Create(rep->table_prefix_extractor.get(),
+                               prefixes_contents.data,
+                               prefixes_meta_contents.data, &prefix_index);
+  // TODO: log error
+  if (s.ok()) {
+    HashIndexReader* const hash_index_reader =
+        static_cast<HashIndexReader*>(index_reader->get());
+    hash_index_reader->prefix_index_.reset(prefix_index);
+  }
+
+  return Status::OK();
+}
+
+InternalIteratorBase<IndexValue>* HashIndexReader::NewIterator(
+    const ReadOptions& read_options, bool disable_prefix_seek,
+    IndexBlockIter* iter, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context) {
+  const BlockBasedTable::Rep* rep = table()->get_rep();
+  const bool no_io = (read_options.read_tier == kBlockCacheTier);
+  CachableEntry<Block> index_block;
+  const Status s =
+      GetOrReadIndexBlock(no_io, read_options.rate_limiter_priority,
+                          get_context, lookup_context, &index_block);
+  if (!s.ok()) {
+    if (iter != nullptr) {
+      iter->Invalidate(s);
+      return iter;
+    }
+
+    return NewErrorInternalIterator<IndexValue>(s);
+  }
+
+  Statistics* kNullStats = nullptr;
+  const bool total_order_seek =
+      read_options.total_order_seek || disable_prefix_seek;
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  auto it = index_block.GetValue()->NewIndexIterator(
+      internal_comparator()->user_comparator(),
+      rep->get_global_seqno(BlockType::kIndex), iter, kNullStats,
+      total_order_seek, index_has_first_key(), index_key_includes_seq(),
+      index_value_is_full(), false /* block_contents_pinned */,
+      prefix_index_.get());
+
+  assert(it != nullptr);
+  index_block.TransferTo(it);
+
+  return it;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/hash_index_reader.h b/src/rocksdb/table/block_based/hash_index_reader.h
new file mode 100644
index 000000000..9037efc87
--- /dev/null
+++ b/src/rocksdb/table/block_based/hash_index_reader.h
@@ -0,0 +1,49 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "table/block_based/index_reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Index that leverages an internal hash table to quicken the lookup for a given
+// key.
+class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
+ public:
+  static Status Create(const BlockBasedTable* table, const ReadOptions& ro,
+                       FilePrefetchBuffer* prefetch_buffer,
+                       InternalIterator* meta_index_iter, bool use_cache,
+                       bool prefetch, bool pin,
+                       BlockCacheLookupContext* lookup_context,
+                       std::unique_ptr<IndexReader>* index_reader);
+
+  InternalIteratorBase<IndexValue>* NewIterator(
+      const ReadOptions& read_options, bool disable_prefix_seek,
+      IndexBlockIter* iter, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context) override;
+
+  size_t ApproximateMemoryUsage() const override {
+    size_t usage = ApproximateIndexBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size(const_cast<HashIndexReader*>(this));
+#else
+    if (prefix_index_) {
+      usage += prefix_index_->ApproximateMemoryUsage();
+    }
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return usage;
+  }
+
+ private:
+  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
+      : IndexReaderCommon(t, std::move(index_block)) {}
+
+  std::unique_ptr<BlockPrefixIndex> prefix_index_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/index_builder.cc b/src/rocksdb/table/block_based/index_builder.cc
new file mode 100644
index 000000000..024730178
--- /dev/null
+++ b/src/rocksdb/table/block_based/index_builder.cc
@@ -0,0 +1,282 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based/index_builder.h"
+
+#include <assert.h>
+
+#include <cinttypes>
+#include <list>
+#include <string>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based/partitioned_filter_block.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Create a index builder based on its type.
+IndexBuilder* IndexBuilder::CreateIndexBuilder(
+    BlockBasedTableOptions::IndexType index_type,
+    const InternalKeyComparator* comparator,
+    const InternalKeySliceTransform* int_key_slice_transform,
+    const bool use_value_delta_encoding,
+    const BlockBasedTableOptions& table_opt) {
+  IndexBuilder* result = nullptr;
+  switch (index_type) {
+    case BlockBasedTableOptions::kBinarySearch: {
+      result = new ShortenedIndexBuilder(
+          comparator, table_opt.index_block_restart_interval,
+          table_opt.format_version, use_value_delta_encoding,
+          table_opt.index_shortening, /* include_first_key */ false);
+      break;
+    }
+    case BlockBasedTableOptions::kHashSearch: {
+      // Currently kHashSearch is incompatible with index_block_restart_interval
+      // > 1
+      assert(table_opt.index_block_restart_interval == 1);
+      result = new HashIndexBuilder(
+          comparator, int_key_slice_transform,
+          table_opt.index_block_restart_interval, table_opt.format_version,
+          use_value_delta_encoding, table_opt.index_shortening);
+      break;
+    }
+    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
+      result = PartitionedIndexBuilder::CreateIndexBuilder(
+          comparator, use_value_delta_encoding, table_opt);
+      break;
+    }
+    case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
+      result = new ShortenedIndexBuilder(
+          comparator, table_opt.index_block_restart_interval,
+          table_opt.format_version, use_value_delta_encoding,
+          table_opt.index_shortening, /* include_first_key */ true);
+      break;
+    }
+    default: {
+      assert(!"Do not recognize the index type ");
+      break;
+    }
+  }
+  return result;
+}
+
+void ShortenedIndexBuilder::FindShortestInternalKeySeparator(
+    const Comparator& comparator, std::string* start, const Slice& limit) {
+  // Attempt to shorten the user portion of the key
+  Slice user_start = ExtractUserKey(*start);
+  Slice user_limit = ExtractUserKey(limit);
+  std::string tmp(user_start.data(), user_start.size());
+  comparator.FindShortestSeparator(&tmp, user_limit);
+  if (tmp.size() <= user_start.size() &&
+      comparator.Compare(user_start, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp,
+               PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+    assert(InternalKeyComparator(&comparator).Compare(*start, tmp) < 0);
+    assert(InternalKeyComparator(&comparator).Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+void ShortenedIndexBuilder::FindShortInternalKeySuccessor(
+    const Comparator& comparator, std::string* key) {
+  Slice user_key = ExtractUserKey(*key);
+  std::string tmp(user_key.data(), user_key.size());
+  comparator.FindShortSuccessor(&tmp);
+  if (tmp.size() <= user_key.size() && comparator.Compare(user_key, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp,
+               PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+    assert(InternalKeyComparator(&comparator).Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
+
+PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
+    const InternalKeyComparator* comparator,
+    const bool use_value_delta_encoding,
+    const BlockBasedTableOptions& table_opt) {
+  return new PartitionedIndexBuilder(comparator, table_opt,
+                                     use_value_delta_encoding);
+}
+
+PartitionedIndexBuilder::PartitionedIndexBuilder(
+    const InternalKeyComparator* comparator,
+    const BlockBasedTableOptions& table_opt,
+    const bool use_value_delta_encoding)
+    : IndexBuilder(comparator),
+      index_block_builder_(table_opt.index_block_restart_interval,
+                           true /*use_delta_encoding*/,
+                           use_value_delta_encoding),
+      index_block_builder_without_seq_(table_opt.index_block_restart_interval,
+                                       true /*use_delta_encoding*/,
+                                       use_value_delta_encoding),
+      sub_index_builder_(nullptr),
+      table_opt_(table_opt),
+      // We start by false. After each partition we revise the value based on
+      // what the sub_index_builder has decided. If the feature is disabled
+      // entirely, this will be set to true after switching the first
+      // sub_index_builder. Otherwise, it could be set to true even one of the
+      // sub_index_builders could not safely exclude seq from the keys, then it
+      // wil be enforced on all sub_index_builders on ::Finish.
+      seperator_is_key_plus_seq_(false),
+      use_value_delta_encoding_(use_value_delta_encoding) {}
+
+PartitionedIndexBuilder::~PartitionedIndexBuilder() {
+  delete sub_index_builder_;
+}
+
+void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
+  assert(sub_index_builder_ == nullptr);
+  sub_index_builder_ = new ShortenedIndexBuilder(
+      comparator_, table_opt_.index_block_restart_interval,
+      table_opt_.format_version, use_value_delta_encoding_,
+      table_opt_.index_shortening, /* include_first_key */ false);
+
+  // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if
+  // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by
+  // default on Creation) so that flush policy can point to
+  // sub_index_builder_->index_block_builder_
+  if (seperator_is_key_plus_seq_) {
+    sub_index_builder_->seperator_is_key_plus_seq_ = true;
+  }
+
+  flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+      table_opt_.metadata_block_size, table_opt_.block_size_deviation,
+      // Note: this is sub-optimal since sub_index_builder_ could later reset
+      // seperator_is_key_plus_seq_ but the probability of that is low.
+      sub_index_builder_->seperator_is_key_plus_seq_
+          ? sub_index_builder_->index_block_builder_
+          : sub_index_builder_->index_block_builder_without_seq_));
+  partition_cut_requested_ = false;
+}
+
+void PartitionedIndexBuilder::RequestPartitionCut() {
+  partition_cut_requested_ = true;
+}
+
+void PartitionedIndexBuilder::AddIndexEntry(
+    std::string* last_key_in_current_block,
+    const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
+  // Note: to avoid two consecuitive flush in the same method call, we do not
+  // check flush policy when adding the last key
+  if (UNLIKELY(first_key_in_next_block == nullptr)) {  // no more keys
+    if (sub_index_builder_ == nullptr) {
+      MakeNewSubIndexBuilder();
+    }
+    sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+                                      first_key_in_next_block, block_handle);
+    if (!seperator_is_key_plus_seq_ &&
+        sub_index_builder_->seperator_is_key_plus_seq_) {
+      // then we need to apply it to all sub-index builders and reset
+      // flush_policy to point to Block Builder of sub_index_builder_ that store
+      // internal keys.
+      seperator_is_key_plus_seq_ = true;
+      flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+          table_opt_.metadata_block_size, table_opt_.block_size_deviation,
+          sub_index_builder_->index_block_builder_));
+    }
+    sub_index_last_key_ = std::string(*last_key_in_current_block);
+    entries_.push_back(
+        {sub_index_last_key_,
+         std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+    sub_index_builder_ = nullptr;
+    cut_filter_block = true;
+  } else {
+    // apply flush policy only to non-empty sub_index_builder_
+    if (sub_index_builder_ != nullptr) {
+      std::string handle_encoding;
+      block_handle.EncodeTo(&handle_encoding);
+      bool do_flush =
+          partition_cut_requested_ ||
+          flush_policy_->Update(*last_key_in_current_block, handle_encoding);
+      if (do_flush) {
+        entries_.push_back(
+            {sub_index_last_key_,
+             std::unique_ptr<ShortenedIndexBuilder>(sub_index_builder_)});
+        cut_filter_block = true;
+        sub_index_builder_ = nullptr;
+      }
+    }
+    if (sub_index_builder_ == nullptr) {
+      MakeNewSubIndexBuilder();
+    }
+    sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+                                      first_key_in_next_block, block_handle);
+    sub_index_last_key_ = std::string(*last_key_in_current_block);
+    if (!seperator_is_key_plus_seq_ &&
+        sub_index_builder_->seperator_is_key_plus_seq_) {
+      // then we need to apply it to all sub-index builders and reset
+      // flush_policy to point to Block Builder of sub_index_builder_ that store
+      // internal keys.
+      seperator_is_key_plus_seq_ = true;
+      flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
+          table_opt_.metadata_block_size, table_opt_.block_size_deviation,
+          sub_index_builder_->index_block_builder_));
+    }
+  }
+}
+
+Status PartitionedIndexBuilder::Finish(
+    IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
+  if (partition_cnt_ == 0) {
+    partition_cnt_ = entries_.size();
+  }
+  // It must be set to null after last key is added
+  assert(sub_index_builder_ == nullptr);
+  if (finishing_indexes == true) {
+    Entry& last_entry = entries_.front();
+    std::string handle_encoding;
+    last_partition_block_handle.EncodeTo(&handle_encoding);
+    std::string handle_delta_encoding;
+    PutVarsignedint64(
+        &handle_delta_encoding,
+        last_partition_block_handle.size() - last_encoded_handle_.size());
+    last_encoded_handle_ = last_partition_block_handle;
+    const Slice handle_delta_encoding_slice(handle_delta_encoding);
+    index_block_builder_.Add(last_entry.key, handle_encoding,
+                             &handle_delta_encoding_slice);
+    if (!seperator_is_key_plus_seq_) {
+      index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
+                                           handle_encoding,
+                                           &handle_delta_encoding_slice);
+    }
+    entries_.pop_front();
+  }
+  // If there is no sub_index left, then return the 2nd level index.
+  if (UNLIKELY(entries_.empty())) {
+    if (seperator_is_key_plus_seq_) {
+      index_blocks->index_block_contents = index_block_builder_.Finish();
+    } else {
+      index_blocks->index_block_contents =
+          index_block_builder_without_seq_.Finish();
+    }
+    top_level_index_size_ = index_blocks->index_block_contents.size();
+    index_size_ += top_level_index_size_;
+    return Status::OK();
+  } else {
+    // Finish the next partition index in line and Incomplete() to indicate we
+    // expect more calls to Finish
+    Entry& entry = entries_.front();
+    // Apply the policy to all sub-indexes
+    entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
+    auto s = entry.value->Finish(index_blocks);
+    index_size_ += index_blocks->index_block_contents.size();
+    finishing_indexes = true;
+    return s.ok() ? Status::Incomplete() : s;
+  }
+}
+
+size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; }
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/index_builder.h b/src/rocksdb/table/block_based/index_builder.h
new file mode 100644
index 000000000..dd3be0331
--- /dev/null
+++ b/src/rocksdb/table/block_based/index_builder.h
@@ -0,0 +1,455 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <assert.h>
+
+#include <cinttypes>
+#include <list>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/comparator.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The interface for building index.
+// Instruction for adding a new concrete IndexBuilder:
+//  1. Create a subclass instantiated from IndexBuilder.
+//  2. Add a new entry associated with that subclass in TableOptions::IndexType.
+//  3. Add a create function for the new subclass in CreateIndexBuilder.
+// Note: we can devise more advanced design to simplify the process for adding
+// new subclass, which will, on the other hand, increase the code complexity and
+// catch unwanted attention from readers. Given that we won't add/change
+// indexes frequently, it makes sense to just embrace a more straightforward
+// design that just works.
+class IndexBuilder {
+ public:
+  static IndexBuilder* CreateIndexBuilder(
+      BlockBasedTableOptions::IndexType index_type,
+      const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
+      const InternalKeySliceTransform* int_key_slice_transform,
+      const bool use_value_delta_encoding,
+      const BlockBasedTableOptions& table_opt);
+
+  // Index builder will construct a set of blocks which contain:
+  //  1. One primary index block.
+  //  2. (Optional) a set of metablocks that contains the metadata of the
+  //     primary index.
+  struct IndexBlocks {
+    Slice index_block_contents;
+    std::unordered_map<std::string, Slice> meta_blocks;
+  };
+  explicit IndexBuilder(const InternalKeyComparator* comparator)
+      : comparator_(comparator) {}
+
+  virtual ~IndexBuilder() {}
+
+  // Add a new index entry to index block.
+  // To allow further optimization, we provide `last_key_in_current_block` and
+  // `first_key_in_next_block`, based on which the specific implementation can
+  // determine the best index key to be used for the index block.
+  // Called before the OnKeyAdded() call for first_key_in_next_block.
+  // @last_key_in_current_block: this parameter maybe overridden with the value
+  //                             "substitute key".
+  // @first_key_in_next_block: it will be nullptr if the entry being added is
+  //                           the last one in the table
+  //
+  // REQUIRES: Finish() has not yet been called.
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) = 0;
+
+  // This method will be called whenever a key is added. The subclasses may
+  // override OnKeyAdded() if they need to collect additional information.
+  virtual void OnKeyAdded(const Slice& /*key*/) {}
+
+  // Inform the index builder that all entries has been written. Block builder
+  // may therefore perform any operation required for block finalization.
+  //
+  // REQUIRES: Finish() has not yet been called.
+  inline Status Finish(IndexBlocks* index_blocks) {
+    // Throw away the changes to last_partition_block_handle. It has no effect
+    // on the first call to Finish anyway.
+    BlockHandle last_partition_block_handle;
+    return Finish(index_blocks, last_partition_block_handle);
+  }
+
+  // This override of Finish can be utilized to build the 2nd level index in
+  // PartitionIndexBuilder.
+  //
+  // index_blocks will be filled with the resulting index data. If the return
+  // value is Status::InComplete() then it means that the index is partitioned
+  // and the callee should keep calling Finish until Status::OK() is returned.
+  // In that case, last_partition_block_handle is pointer to the block written
+  // with the result of the last call to Finish. This can be utilized to build
+  // the second level index pointing to each block of partitioned indexes. The
+  // last call to Finish() that returns Status::OK() populates index_blocks with
+  // the 2nd level index content.
+  virtual Status Finish(IndexBlocks* index_blocks,
+                        const BlockHandle& last_partition_block_handle) = 0;
+
+  // Get the size for index block. Must be called after ::Finish.
+  virtual size_t IndexSize() const = 0;
+
+  virtual bool seperator_is_key_plus_seq() { return true; }
+
+ protected:
+  const InternalKeyComparator* comparator_;
+  // Set after ::Finish is called
+  size_t index_size_ = 0;
+};
+
+// This index builder builds space-efficient index block.
+//
+// Optimizations:
+//  1. Made block's `block_restart_interval` to be 1, which will avoid linear
+//     search when doing index lookup (can be disabled by setting
+//     index_block_restart_interval).
+//  2. Shorten the key length for index block. Other than honestly using the
+//     last key in the data block as the index key, we instead find a shortest
+//     substitute key that serves the same function.
+class ShortenedIndexBuilder : public IndexBuilder {
+ public:
+  explicit ShortenedIndexBuilder(
+      const InternalKeyComparator* comparator,
+      const int index_block_restart_interval, const uint32_t format_version,
+      const bool use_value_delta_encoding,
+      BlockBasedTableOptions::IndexShorteningMode shortening_mode,
+      bool include_first_key)
+      : IndexBuilder(comparator),
+        index_block_builder_(index_block_restart_interval,
+                             true /*use_delta_encoding*/,
+                             use_value_delta_encoding),
+        index_block_builder_without_seq_(index_block_restart_interval,
+                                         true /*use_delta_encoding*/,
+                                         use_value_delta_encoding),
+        use_value_delta_encoding_(use_value_delta_encoding),
+        include_first_key_(include_first_key),
+        shortening_mode_(shortening_mode) {
+    // Making the default true will disable the feature for old versions
+    seperator_is_key_plus_seq_ = (format_version <= 2);
+  }
+
+  virtual void OnKeyAdded(const Slice& key) override {
+    if (include_first_key_ && current_block_first_internal_key_.empty()) {
+      current_block_first_internal_key_.assign(key.data(), key.size());
+    }
+  }
+
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) override {
+    if (first_key_in_next_block != nullptr) {
+      if (shortening_mode_ !=
+          BlockBasedTableOptions::IndexShorteningMode::kNoShortening) {
+        FindShortestInternalKeySeparator(*comparator_->user_comparator(),
+                                         last_key_in_current_block,
+                                         *first_key_in_next_block);
+      }
+      if (!seperator_is_key_plus_seq_ &&
+          comparator_->user_comparator()->Compare(
+              ExtractUserKey(*last_key_in_current_block),
+              ExtractUserKey(*first_key_in_next_block)) == 0) {
+        seperator_is_key_plus_seq_ = true;
+      }
+    } else {
+      if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
+                                  kShortenSeparatorsAndSuccessor) {
+        FindShortInternalKeySuccessor(*comparator_->user_comparator(),
+                                      last_key_in_current_block);
+      }
+    }
+    auto sep = Slice(*last_key_in_current_block);
+
+    assert(!include_first_key_ || !current_block_first_internal_key_.empty());
+    IndexValue entry(block_handle, current_block_first_internal_key_);
+    std::string encoded_entry;
+    std::string delta_encoded_entry;
+    entry.EncodeTo(&encoded_entry, include_first_key_, nullptr);
+    if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) {
+      entry.EncodeTo(&delta_encoded_entry, include_first_key_,
+                     &last_encoded_handle_);
+    } else {
+      // If it's the first block, or delta encoding is disabled,
+      // BlockBuilder::Add() below won't use delta-encoded slice.
+    }
+    last_encoded_handle_ = block_handle;
+    const Slice delta_encoded_entry_slice(delta_encoded_entry);
+    index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice);
+    if (!seperator_is_key_plus_seq_) {
+      index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry,
+                                           &delta_encoded_entry_slice);
+    }
+
+    current_block_first_internal_key_.clear();
+  }
+
+  using IndexBuilder::Finish;
+  virtual Status Finish(
+      IndexBlocks* index_blocks,
+      const BlockHandle& /*last_partition_block_handle*/) override {
+    if (seperator_is_key_plus_seq_) {
+      index_blocks->index_block_contents = index_block_builder_.Finish();
+    } else {
+      index_blocks->index_block_contents =
+          index_block_builder_without_seq_.Finish();
+    }
+    index_size_ = index_blocks->index_block_contents.size();
+    return Status::OK();
+  }
+
+  virtual size_t IndexSize() const override { return index_size_; }
+
+  virtual bool seperator_is_key_plus_seq() override {
+    return seperator_is_key_plus_seq_;
+  }
+
+  // Changes *key to a short string >= *key.
+  //
+  static void FindShortestInternalKeySeparator(const Comparator& comparator,
+                                               std::string* start,
+                                               const Slice& limit);
+
+  static void FindShortInternalKeySuccessor(const Comparator& comparator,
+                                            std::string* key);
+
+  friend class PartitionedIndexBuilder;
+
+ private:
+  BlockBuilder index_block_builder_;
+  BlockBuilder index_block_builder_without_seq_;
+  const bool use_value_delta_encoding_;
+  bool seperator_is_key_plus_seq_;
+  const bool include_first_key_;
+  BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
+  BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
+  std::string current_block_first_internal_key_;
+};
+
+// HashIndexBuilder contains a binary-searchable primary index and the
+// metadata for secondary hash index construction.
+// The metadata for hash index consists two parts:
+//  - a metablock that compactly contains a sequence of prefixes. All prefixes
+//    are stored consectively without any metadata (like, prefix sizes) being
+//    stored, which is kept in the other metablock.
+//  - a metablock contains the metadata of the prefixes, including prefix size,
+//    restart index and number of block it spans. The format looks like:
+//
+// +-----------------+---------------------------+---------------------+
+// <=prefix 1
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+
+// <=prefix 2
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+
+// |                                                                   |
+// | ....                                                              |
+// |                                                                   |
+// +-----------------+---------------------------+---------------------+
+// <=prefix n
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+
+//
+// The reason of separating these two metablocks is to enable the efficiently
+// reuse the first metablock during hash index construction without unnecessary
+// data copy or small heap allocations for prefixes.
+class HashIndexBuilder : public IndexBuilder {
+ public:
+  explicit HashIndexBuilder(
+      const InternalKeyComparator* comparator,
+      const SliceTransform* hash_key_extractor,
+      int index_block_restart_interval, int format_version,
+      bool use_value_delta_encoding,
+      BlockBasedTableOptions::IndexShorteningMode shortening_mode)
+      : IndexBuilder(comparator),
+        primary_index_builder_(comparator, index_block_restart_interval,
+                               format_version, use_value_delta_encoding,
+                               shortening_mode, /* include_first_key */ false),
+        hash_key_extractor_(hash_key_extractor) {}
+
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) override {
+    ++current_restart_index_;
+    primary_index_builder_.AddIndexEntry(last_key_in_current_block,
+                                         first_key_in_next_block, block_handle);
+  }
+
+  virtual void OnKeyAdded(const Slice& key) override {
+    auto key_prefix = hash_key_extractor_->Transform(key);
+    bool is_first_entry = pending_block_num_ == 0;
+
+    // Keys may share the prefix
+    if (is_first_entry || pending_entry_prefix_ != key_prefix) {
+      if (!is_first_entry) {
+        FlushPendingPrefix();
+      }
+
+      // need a hard copy otherwise the underlying data changes all the time.
+      // TODO(kailiu) std::to_string() is expensive. We may speed up can avoid
+      // data copy.
+      pending_entry_prefix_ = key_prefix.ToString();
+      pending_block_num_ = 1;
+      pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
+    } else {
+      // entry number increments when keys share the prefix reside in
+      // different data blocks.
+      auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
+      assert(last_restart_index <= current_restart_index_);
+      if (last_restart_index != current_restart_index_) {
+        ++pending_block_num_;
+      }
+    }
+  }
+
+  virtual Status Finish(
+      IndexBlocks* index_blocks,
+      const BlockHandle& last_partition_block_handle) override {
+    if (pending_block_num_ != 0) {
+      FlushPendingPrefix();
+    }
+    Status s = primary_index_builder_.Finish(index_blocks,
+                                             last_partition_block_handle);
+    index_blocks->meta_blocks.insert(
+        {kHashIndexPrefixesBlock.c_str(), prefix_block_});
+    index_blocks->meta_blocks.insert(
+        {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
+    return s;
+  }
+
+  virtual size_t IndexSize() const override {
+    return primary_index_builder_.IndexSize() + prefix_block_.size() +
+           prefix_meta_block_.size();
+  }
+
+  virtual bool seperator_is_key_plus_seq() override {
+    return primary_index_builder_.seperator_is_key_plus_seq();
+  }
+
+ private:
+  void FlushPendingPrefix() {
+    prefix_block_.append(pending_entry_prefix_.data(),
+                         pending_entry_prefix_.size());
+    PutVarint32Varint32Varint32(
+        &prefix_meta_block_,
+        static_cast<uint32_t>(pending_entry_prefix_.size()),
+        pending_entry_index_, pending_block_num_);
+  }
+
+  ShortenedIndexBuilder primary_index_builder_;
+  const SliceTransform* hash_key_extractor_;
+
+  // stores a sequence of prefixes
+  std::string prefix_block_;
+  // stores the metadata of prefixes
+  std::string prefix_meta_block_;
+
+  // The following 3 variables keeps unflushed prefix and its metadata.
+  // The details of block_num and entry_index can be found in
+  // "block_hash_index.{h,cc}"
+  uint32_t pending_block_num_ = 0;
+  uint32_t pending_entry_index_ = 0;
+  std::string pending_entry_prefix_;
+
+  uint64_t current_restart_index_ = 0;
+};
+
+/**
+ * IndexBuilder for two-level indexing. Internally it creates a new index for
+ * each partition and Finish then in order when Finish is called on it
+ * continiously until Status::OK() is returned.
+ *
+ * The format on the disk would be I I I I I I IP where I is block containing a
+ * partition of indexes built using ShortenedIndexBuilder and IP is a block
+ * containing a secondary index on the partitions, built using
+ * ShortenedIndexBuilder.
+ */
+class PartitionedIndexBuilder : public IndexBuilder {
+ public:
+  static PartitionedIndexBuilder* CreateIndexBuilder(
+      const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
+      const bool use_value_delta_encoding,
+      const BlockBasedTableOptions& table_opt);
+
+  explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator,
+                                   const BlockBasedTableOptions& table_opt,
+                                   const bool use_value_delta_encoding);
+
+  virtual ~PartitionedIndexBuilder();
+
+  virtual void AddIndexEntry(std::string* last_key_in_current_block,
+                             const Slice* first_key_in_next_block,
+                             const BlockHandle& block_handle) override;
+
+  virtual Status Finish(
+      IndexBlocks* index_blocks,
+      const BlockHandle& last_partition_block_handle) override;
+
+  virtual size_t IndexSize() const override { return index_size_; }
+  size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; }
+  size_t NumPartitions() const;
+
+  inline bool ShouldCutFilterBlock() {
+    // Current policy is to align the partitions of index and filters
+    if (cut_filter_block) {
+      cut_filter_block = false;
+      return true;
+    }
+    return false;
+  }
+
+  std::string& GetPartitionKey() { return sub_index_last_key_; }
+
+  // Called when an external entity (such as filter partition builder) request
+  // cutting the next partition
+  void RequestPartitionCut();
+
+  virtual bool seperator_is_key_plus_seq() override {
+    return seperator_is_key_plus_seq_;
+  }
+
+  bool get_use_value_delta_encoding() { return use_value_delta_encoding_; }
+
+ private:
+  // Set after ::Finish is called
+  size_t top_level_index_size_ = 0;
+  // Set after ::Finish is called
+  size_t partition_cnt_ = 0;
+
+  void MakeNewSubIndexBuilder();
+
+  struct Entry {
+    std::string key;
+    std::unique_ptr<ShortenedIndexBuilder> value;
+  };
+  std::list<Entry> entries_;  // list of partitioned indexes and their keys
+  BlockBuilder index_block_builder_;              // top-level index builder
+  BlockBuilder index_block_builder_without_seq_;  // same for user keys
+  // the active partition index builder
+  ShortenedIndexBuilder* sub_index_builder_;
+  // the last key in the active partition index builder
+  std::string sub_index_last_key_;
+  std::unique_ptr<FlushBlockPolicy> flush_policy_;
+  // true if Finish is called once but not complete yet.
+  bool finishing_indexes = false;
+  const BlockBasedTableOptions& table_opt_;
+  bool seperator_is_key_plus_seq_;
+  bool use_value_delta_encoding_;
+  // true if an external entity (such as filter partition builder) request
+  // cutting the next partition
+  bool partition_cut_requested_ = true;
+  // true if it should cut the next filter partition block
+  bool cut_filter_block = false;
+  BlockHandle last_encoded_handle_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/index_reader_common.cc b/src/rocksdb/table/block_based/index_reader_common.cc
new file mode 100644
index 000000000..6584586c9
--- /dev/null
+++ b/src/rocksdb/table/block_based/index_reader_common.cc
@@ -0,0 +1,56 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/index_reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
+    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<Block>* index_block) {
+  PERF_TIMER_GUARD(read_index_block_nanos);
+
+  assert(table != nullptr);
+  assert(index_block != nullptr);
+  assert(index_block->IsEmpty());
+
+  const Rep* const rep = table->get_rep();
+  assert(rep != nullptr);
+
+  const Status s = table->RetrieveBlock(
+      prefetch_buffer, read_options, rep->footer.index_handle(),
+      UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
+      get_context, lookup_context, /* for_compaction */ false, use_cache,
+      /* wait_for_cache */ true, /* async_read */ false);
+
+  return s;
+}
+
+Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
+    bool no_io, Env::IOPriority rate_limiter_priority, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<Block>* index_block) const {
+  assert(index_block != nullptr);
+
+  if (!index_block_.IsEmpty()) {
+    index_block->SetUnownedValue(index_block_.GetValue());
+    return Status::OK();
+  }
+
+  ReadOptions read_options;
+  read_options.rate_limiter_priority = rate_limiter_priority;
+  if (no_io) {
+    read_options.read_tier = kBlockCacheTier;
+  }
+
+  return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
+                        cache_index_blocks(), get_context, lookup_context,
+                        index_block);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/index_reader_common.h b/src/rocksdb/table/block_based/index_reader_common.h
new file mode 100644
index 000000000..5627b0eeb
--- /dev/null
+++ b/src/rocksdb/table/block_based/index_reader_common.h
@@ -0,0 +1,85 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Encapsulates common functionality for the various index reader
+// implementations. Provides access to the index block regardless of whether
+// it is owned by the reader or stored in the cache, or whether it is pinned
+// in the cache or not.
+class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
+ public:
+  IndexReaderCommon(const BlockBasedTable* t,
+                    CachableEntry<Block>&& index_block)
+      : table_(t), index_block_(std::move(index_block)) {
+    assert(table_ != nullptr);
+  }
+
+ protected:
+  static Status ReadIndexBlock(const BlockBasedTable* table,
+                               FilePrefetchBuffer* prefetch_buffer,
+                               const ReadOptions& read_options, bool use_cache,
+                               GetContext* get_context,
+                               BlockCacheLookupContext* lookup_context,
+                               CachableEntry<Block>* index_block);
+
+  const BlockBasedTable* table() const { return table_; }
+
+  const InternalKeyComparator* internal_comparator() const {
+    assert(table_ != nullptr);
+    assert(table_->get_rep() != nullptr);
+
+    return &table_->get_rep()->internal_comparator;
+  }
+
+  bool index_has_first_key() const {
+    assert(table_ != nullptr);
+    assert(table_->get_rep() != nullptr);
+    return table_->get_rep()->index_has_first_key;
+  }
+
+  bool index_key_includes_seq() const {
+    assert(table_ != nullptr);
+    assert(table_->get_rep() != nullptr);
+    return table_->get_rep()->index_key_includes_seq;
+  }
+
+  bool index_value_is_full() const {
+    assert(table_ != nullptr);
+    assert(table_->get_rep() != nullptr);
+    return table_->get_rep()->index_value_is_full;
+  }
+
+  bool cache_index_blocks() const {
+    assert(table_ != nullptr);
+    assert(table_->get_rep() != nullptr);
+    return table_->get_rep()->table_options.cache_index_and_filter_blocks;
+  }
+
+  Status GetOrReadIndexBlock(bool no_io, Env::IOPriority rate_limiter_priority,
+                             GetContext* get_context,
+                             BlockCacheLookupContext* lookup_context,
+                             CachableEntry<Block>* index_block) const;
+
+  size_t ApproximateIndexBlockMemoryUsage() const {
+    assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
+    return index_block_.GetOwnValue()
+               ? index_block_.GetValue()->ApproximateMemoryUsage()
+               : 0;
+  }
+
+ private:
+  const BlockBasedTable* table_;
+  CachableEntry<Block> index_block_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/mock_block_based_table.h b/src/rocksdb/table/block_based/mock_block_based_table.h
new file mode 100644
index 000000000..13f3dfaee
--- /dev/null
+++ b/src/rocksdb/table/block_based/mock_block_based_table.h
@@ -0,0 +1,62 @@
+//  Copyright (c) 2019-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_policy_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace mock {
+
+class MockBlockBasedTable : public BlockBasedTable {
+ public:
+  explicit MockBlockBasedTable(Rep* rep)
+      : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {}
+};
+
+class MockBlockBasedTableTester {
+  static constexpr int kMockLevel = 0;
+
+ public:
+  Options options_;
+  ImmutableOptions ioptions_;
+  EnvOptions env_options_;
+  BlockBasedTableOptions table_options_;
+  InternalKeyComparator icomp_;
+  std::unique_ptr<BlockBasedTable> table_;
+
+  explicit MockBlockBasedTableTester(const FilterPolicy* filter_policy)
+      : MockBlockBasedTableTester(
+            std::shared_ptr<const FilterPolicy>(filter_policy)){};
+
+  explicit MockBlockBasedTableTester(
+      std::shared_ptr<const FilterPolicy> filter_policy)
+      : ioptions_(options_),
+        env_options_(options_),
+        icomp_(options_.comparator) {
+    table_options_.filter_policy = std::move(filter_policy);
+
+    constexpr bool skip_filters = false;
+    constexpr bool immortal_table = false;
+    table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep(
+        ioptions_, env_options_, table_options_, icomp_, skip_filters,
+        12345 /*file_size*/, kMockLevel, immortal_table)));
+  }
+
+  FilterBitsBuilder* GetBuilder() const {
+    FilterBuildingContext context(table_options_);
+    context.column_family_name = "mock_cf";
+    context.compaction_style = ioptions_.compaction_style;
+    context.level_at_creation = kMockLevel;
+    context.info_log = ioptions_.logger;
+    return BloomFilterPolicy::GetBuilderFromContext(context);
+  }
+};
+
+}  // namespace mock
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.cc b/src/rocksdb/table/block_based/parsed_full_filter_block.cc
new file mode 100644
index 000000000..9184a48d2
--- /dev/null
+++ b/src/rocksdb/table/block_based/parsed_full_filter_block.cc
@@ -0,0 +1,23 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "table/block_based/parsed_full_filter_block.h"
+
+#include "table/block_based/filter_policy_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ParsedFullFilterBlock::ParsedFullFilterBlock(const FilterPolicy* filter_policy,
+                                             BlockContents&& contents)
+    : block_contents_(std::move(contents)),
+      filter_bits_reader_(
+          !block_contents_.data.empty()
+              ? filter_policy->GetFilterBitsReader(block_contents_.data)
+              : nullptr) {}
+
+ParsedFullFilterBlock::~ParsedFullFilterBlock() = default;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/parsed_full_filter_block.h b/src/rocksdb/table/block_based/parsed_full_filter_block.h
new file mode 100644
index 000000000..95d7b5208
--- /dev/null
+++ b/src/rocksdb/table/block_based/parsed_full_filter_block.h
@@ -0,0 +1,42 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FilterBitsReader;
+class FilterPolicy;
+
+// The sharable/cachable part of the full filter.
+class ParsedFullFilterBlock {
+ public:
+  ParsedFullFilterBlock(const FilterPolicy* filter_policy,
+                        BlockContents&& contents);
+  ~ParsedFullFilterBlock();
+
+  FilterBitsReader* filter_bits_reader() const {
+    return filter_bits_reader_.get();
+  }
+
+  // TODO: consider memory usage of the FilterBitsReader
+  size_t ApproximateMemoryUsage() const {
+    return block_contents_.ApproximateMemoryUsage();
+  }
+
+  bool own_bytes() const { return block_contents_.own_bytes(); }
+
+  const Slice GetBlockContentsData() const { return block_contents_.data; }
+
+ private:
+  BlockContents block_contents_;
+  std::unique_ptr<FilterBitsReader> filter_bits_reader_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.cc b/src/rocksdb/table/block_based/partitioned_filter_block.cc
new file mode 100644
index 000000000..af30925b7
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_filter_block.cc
@@ -0,0 +1,561 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/partitioned_filter_block.h"
+
+#include <utility>
+
+#include "block_type.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder(
+    const SliceTransform* _prefix_extractor, bool whole_key_filtering,
+    FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
+    const bool use_value_delta_encoding,
+    PartitionedIndexBuilder* const p_index_builder,
+    const uint32_t partition_size)
+    : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering,
+                             filter_bits_builder),
+      index_on_filter_block_builder_(index_block_restart_interval,
+                                     true /*use_delta_encoding*/,
+                                     use_value_delta_encoding),
+      index_on_filter_block_builder_without_seq_(index_block_restart_interval,
+                                                 true /*use_delta_encoding*/,
+                                                 use_value_delta_encoding),
+      p_index_builder_(p_index_builder),
+      keys_added_to_partition_(0),
+      total_added_in_built_(0) {
+  keys_per_partition_ = static_cast<uint32_t>(
+      filter_bits_builder_->ApproximateNumEntries(partition_size));
+  if (keys_per_partition_ < 1) {
+    // partition_size (minus buffer, ~10%) might be smaller than minimum
+    // filter size, sometimes based on cache line size. Try to find that
+    // minimum size without CalculateSpace (not necessarily available).
+    uint32_t larger = std::max(partition_size + 4, uint32_t{16});
+    for (;;) {
+      keys_per_partition_ = static_cast<uint32_t>(
+          filter_bits_builder_->ApproximateNumEntries(larger));
+      if (keys_per_partition_ >= 1) {
+        break;
+      }
+      larger += larger / 4;
+      if (larger > 100000) {
+        // might be a broken implementation. substitute something reasonable:
+        // 1 key / byte.
+        keys_per_partition_ = partition_size;
+        break;
+      }
+    }
+  }
+}
+
+PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {
+  partitioned_filters_construction_status_.PermitUncheckedError();
+}
+
+void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
+    const Slice* next_key) {
+  // Use == to send the request only once
+  if (keys_added_to_partition_ == keys_per_partition_) {
+    // Currently only index builder is in charge of cutting a partition. We keep
+    // requesting until it is granted.
+    p_index_builder_->RequestPartitionCut();
+  }
+  if (!p_index_builder_->ShouldCutFilterBlock()) {
+    return;
+  }
+
+  // Add the prefix of the next key before finishing the partition without
+  // updating last_prefix_str_. This hack, fixes a bug with format_verison=3
+  // where seeking for the prefix would lead us to the previous partition.
+  const bool maybe_add_prefix =
+      next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);
+  if (maybe_add_prefix) {
+    const Slice next_key_prefix = prefix_extractor()->Transform(*next_key);
+    if (next_key_prefix.compare(last_prefix_str()) != 0) {
+      AddKey(next_key_prefix);
+    }
+  }
+
+  total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded();
+  std::unique_ptr<const char[]> filter_data;
+  Status filter_construction_status = Status::OK();
+  Slice filter =
+      filter_bits_builder_->Finish(&filter_data, &filter_construction_status);
+  if (filter_construction_status.ok()) {
+    filter_construction_status = filter_bits_builder_->MaybePostVerify(filter);
+  }
+  std::string& index_key = p_index_builder_->GetPartitionKey();
+  filters.push_back({index_key, std::move(filter_data), filter});
+  if (!filter_construction_status.ok() &&
+      partitioned_filters_construction_status_.ok()) {
+    partitioned_filters_construction_status_ = filter_construction_status;
+  }
+  keys_added_to_partition_ = 0;
+  Reset();
+}
+
+void PartitionedFilterBlockBuilder::Add(const Slice& key) {
+  MaybeCutAFilterBlock(&key);
+  FullFilterBlockBuilder::Add(key);
+}
+
+void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
+  FullFilterBlockBuilder::AddKey(key);
+  keys_added_to_partition_++;
+}
+
+size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() {
+  return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded();
+}
+
+Slice PartitionedFilterBlockBuilder::Finish(
+    const BlockHandle& last_partition_block_handle, Status* status,
+    std::unique_ptr<const char[]>* filter_data) {
+  if (finishing_filters == true) {
+    // Record the handle of the last written filter block in the index
+    std::string handle_encoding;
+    last_partition_block_handle.EncodeTo(&handle_encoding);
+    std::string handle_delta_encoding;
+    PutVarsignedint64(
+        &handle_delta_encoding,
+        last_partition_block_handle.size() - last_encoded_handle_.size());
+    last_encoded_handle_ = last_partition_block_handle;
+    const Slice handle_delta_encoding_slice(handle_delta_encoding);
+    index_on_filter_block_builder_.Add(last_filter_entry_key, handle_encoding,
+                                       &handle_delta_encoding_slice);
+    if (!p_index_builder_->seperator_is_key_plus_seq()) {
+      index_on_filter_block_builder_without_seq_.Add(
+          ExtractUserKey(last_filter_entry_key), handle_encoding,
+          &handle_delta_encoding_slice);
+    }
+  } else {
+    MaybeCutAFilterBlock(nullptr);
+  }
+
+  if (!partitioned_filters_construction_status_.ok()) {
+    *status = partitioned_filters_construction_status_;
+    return Slice();
+  }
+
+  // If there is no filter partition left, then return the index on filter
+  // partitions
+  if (UNLIKELY(filters.empty())) {
+    *status = Status::OK();
+    last_filter_data.reset();
+    if (finishing_filters) {
+      // Simplest to just add them all at the end
+      total_added_in_built_ = 0;
+      if (p_index_builder_->seperator_is_key_plus_seq()) {
+        return index_on_filter_block_builder_.Finish();
+      } else {
+        return index_on_filter_block_builder_without_seq_.Finish();
+      }
+    } else {
+      // This is the rare case where no key was added to the filter
+      return Slice();
+    }
+  } else {
+    // Return the next filter partition in line and set Incomplete() status to
+    // indicate we expect more calls to Finish
+    *status = Status::Incomplete();
+    finishing_filters = true;
+
+    last_filter_entry_key = filters.front().key;
+    Slice filter = filters.front().filter;
+    last_filter_data = std::move(filters.front().filter_data);
+    if (filter_data != nullptr) {
+      *filter_data = std::move(last_filter_data);
+    }
+    filters.pop_front();
+    return filter;
+  }
+}
+
+PartitionedFilterBlockReader::PartitionedFilterBlockReader(
+    const BlockBasedTable* t, CachableEntry<Block>&& filter_block)
+    : FilterBlockReaderCommon(t, std::move(filter_block)) {}
+
+std::unique_ptr<FilterBlockReader> PartitionedFilterBlockReader::Create(
+    const BlockBasedTable* table, const ReadOptions& ro,
+    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+    bool pin, BlockCacheLookupContext* lookup_context) {
+  assert(table);
+  assert(table->get_rep());
+  assert(!pin || prefetch);
+
+  CachableEntry<Block> filter_block;
+  if (prefetch || !use_cache) {
+    const Status s = ReadFilterBlock(
+        table, prefetch_buffer, ro, use_cache, nullptr /* get_context */,
+        lookup_context, &filter_block, BlockType::kFilterPartitionIndex);
+    if (!s.ok()) {
+      IGNORE_STATUS_IF_ERROR(s);
+      return std::unique_ptr<FilterBlockReader>();
+    }
+
+    if (use_cache && !pin) {
+      filter_block.Reset();
+    }
+  }
+
+  return std::unique_ptr<FilterBlockReader>(
+      new PartitionedFilterBlockReader(table, std::move(filter_block)));
+}
+
+bool PartitionedFilterBlockReader::KeyMayMatch(
+    const Slice& key, const bool no_io, const Slice* const const_ikey_ptr,
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) {
+  assert(const_ikey_ptr != nullptr);
+  if (!whole_key_filtering()) {
+    return true;
+  }
+
+  return MayMatch(key, no_io, const_ikey_ptr, get_context, lookup_context,
+                  rate_limiter_priority, &FullFilterBlockReader::KeyMayMatch);
+}
+
+void PartitionedFilterBlockReader::KeysMayMatch(
+    MultiGetRange* range, const bool no_io,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) {
+  if (!whole_key_filtering()) {
+    return;  // Any/all may match
+  }
+
+  MayMatch(range, nullptr, no_io, lookup_context, rate_limiter_priority,
+           &FullFilterBlockReader::KeysMayMatch2);
+}
+
+bool PartitionedFilterBlockReader::PrefixMayMatch(
+    const Slice& prefix, const bool no_io, const Slice* const const_ikey_ptr,
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) {
+  assert(const_ikey_ptr != nullptr);
+  return MayMatch(prefix, no_io, const_ikey_ptr, get_context, lookup_context,
+                  rate_limiter_priority,
+                  &FullFilterBlockReader::PrefixMayMatch);
+}
+
+void PartitionedFilterBlockReader::PrefixesMayMatch(
+    MultiGetRange* range, const SliceTransform* prefix_extractor,
+    const bool no_io, BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority) {
+  assert(prefix_extractor);
+  MayMatch(range, prefix_extractor, no_io, lookup_context,
+           rate_limiter_priority, &FullFilterBlockReader::PrefixesMayMatch);
+}
+
+BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle(
+    const CachableEntry<Block>& filter_block, const Slice& entry) const {
+  IndexBlockIter iter;
+  const InternalKeyComparator* const comparator = internal_comparator();
+  Statistics* kNullStats = nullptr;
+  filter_block.GetValue()->NewIndexIterator(
+      comparator->user_comparator(),
+      table()->get_rep()->get_global_seqno(BlockType::kFilterPartitionIndex),
+      &iter, kNullStats, true /* total_order_seek */,
+      false /* have_first_key */, index_key_includes_seq(),
+      index_value_is_full());
+  iter.Seek(entry);
+  if (UNLIKELY(!iter.Valid())) {
+    // entry is larger than all the keys. However its prefix might still be
+    // present in the last partition. If this is called by PrefixMayMatch this
+    // is necessary for correct behavior. Otherwise it is unnecessary but safe.
+    // Assuming this is an unlikely case for full key search, the performance
+    // overhead should be negligible.
+    iter.SeekToLast();
+  }
+  assert(iter.Valid());
+  BlockHandle fltr_blk_handle = iter.value().handle;
+  return fltr_blk_handle;
+}
+
+Status PartitionedFilterBlockReader::GetFilterPartitionBlock(
+    FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle,
+    bool no_io, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority,
+    CachableEntry<ParsedFullFilterBlock>* filter_block) const {
+  assert(table());
+  assert(filter_block);
+  assert(filter_block->IsEmpty());
+
+  if (!filter_map_.empty()) {
+    auto iter = filter_map_.find(fltr_blk_handle.offset());
+    // This is a possible scenario since block cache might not have had space
+    // for the partition
+    if (iter != filter_map_.end()) {
+      filter_block->SetUnownedValue(iter->second.GetValue());
+      return Status::OK();
+    }
+  }
+
+  ReadOptions read_options;
+  read_options.rate_limiter_priority = rate_limiter_priority;
+  if (no_io) {
+    read_options.read_tier = kBlockCacheTier;
+  }
+
+  const Status s =
+      table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle,
+                             UncompressionDict::GetEmptyDict(), filter_block,
+                             BlockType::kFilter, get_context, lookup_context,
+                             /* for_compaction */ false, /* use_cache */ true,
+                             /* wait_for_cache */ true, /* async_read */ false);
+
+  return s;
+}
+
+bool PartitionedFilterBlockReader::MayMatch(
+    const Slice& slice, bool no_io, const Slice* const_ikey_ptr,
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority,
+    FilterFunction filter_function) const {
+  CachableEntry<Block> filter_block;
+  Status s = GetOrReadFilterBlock(
+      no_io, get_context, lookup_context, &filter_block,
+      BlockType::kFilterPartitionIndex, rate_limiter_priority);
+  if (UNLIKELY(!s.ok())) {
+    IGNORE_STATUS_IF_ERROR(s);
+    return true;
+  }
+
+  if (UNLIKELY(filter_block.GetValue()->size() == 0)) {
+    return true;
+  }
+
+  auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr);
+  if (UNLIKELY(filter_handle.size() == 0)) {  // key is out of range
+    return false;
+  }
+
+  CachableEntry<ParsedFullFilterBlock> filter_partition_block;
+  s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle,
+                              no_io, get_context, lookup_context,
+                              rate_limiter_priority, &filter_partition_block);
+  if (UNLIKELY(!s.ok())) {
+    IGNORE_STATUS_IF_ERROR(s);
+    return true;
+  }
+
+  FullFilterBlockReader filter_partition(table(),
+                                         std::move(filter_partition_block));
+  return (filter_partition.*filter_function)(slice, no_io, const_ikey_ptr,
+                                             get_context, lookup_context,
+                                             rate_limiter_priority);
+}
+
+void PartitionedFilterBlockReader::MayMatch(
+    MultiGetRange* range, const SliceTransform* prefix_extractor, bool no_io,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority,
+    FilterManyFunction filter_function) const {
+  CachableEntry<Block> filter_block;
+  Status s = GetOrReadFilterBlock(
+      no_io, range->begin()->get_context, lookup_context, &filter_block,
+      BlockType::kFilterPartitionIndex, rate_limiter_priority);
+  if (UNLIKELY(!s.ok())) {
+    IGNORE_STATUS_IF_ERROR(s);
+    return;  // Any/all may match
+  }
+
+  if (UNLIKELY(filter_block.GetValue()->size() == 0)) {
+    return;  // Any/all may match
+  }
+
+  auto start_iter_same_handle = range->begin();
+  BlockHandle prev_filter_handle = BlockHandle::NullBlockHandle();
+
+  // For all keys mapping to same partition (must be adjacent in sorted order)
+  // share block cache lookup and use full filter multiget on the partition
+  // filter.
+  for (auto iter = start_iter_same_handle; iter != range->end(); ++iter) {
+    // TODO: re-use one top-level index iterator
+    BlockHandle this_filter_handle =
+        GetFilterPartitionHandle(filter_block, iter->ikey);
+    if (!prev_filter_handle.IsNull() &&
+        this_filter_handle != prev_filter_handle) {
+      MultiGetRange subrange(*range, start_iter_same_handle, iter);
+      MayMatchPartition(&subrange, prefix_extractor, prev_filter_handle, no_io,
+                        lookup_context, rate_limiter_priority, filter_function);
+      range->AddSkipsFrom(subrange);
+      start_iter_same_handle = iter;
+    }
+    if (UNLIKELY(this_filter_handle.size() == 0)) {  // key is out of range
+      // Not reachable with current behavior of GetFilterPartitionHandle
+      assert(false);
+      range->SkipKey(iter);
+      prev_filter_handle = BlockHandle::NullBlockHandle();
+    } else {
+      prev_filter_handle = this_filter_handle;
+    }
+  }
+  if (!prev_filter_handle.IsNull()) {
+    MultiGetRange subrange(*range, start_iter_same_handle, range->end());
+    MayMatchPartition(&subrange, prefix_extractor, prev_filter_handle, no_io,
+                      lookup_context, rate_limiter_priority, filter_function);
+    range->AddSkipsFrom(subrange);
+  }
+}
+
+void PartitionedFilterBlockReader::MayMatchPartition(
+    MultiGetRange* range, const SliceTransform* prefix_extractor,
+    BlockHandle filter_handle, bool no_io,
+    BlockCacheLookupContext* lookup_context,
+    Env::IOPriority rate_limiter_priority,
+    FilterManyFunction filter_function) const {
+  CachableEntry<ParsedFullFilterBlock> filter_partition_block;
+  Status s = GetFilterPartitionBlock(
+      nullptr /* prefetch_buffer */, filter_handle, no_io,
+      range->begin()->get_context, lookup_context, rate_limiter_priority,
+      &filter_partition_block);
+  if (UNLIKELY(!s.ok())) {
+    IGNORE_STATUS_IF_ERROR(s);
+    return;  // Any/all may match
+  }
+
+  FullFilterBlockReader filter_partition(table(),
+                                         std::move(filter_partition_block));
+  (filter_partition.*filter_function)(range, prefix_extractor, no_io,
+                                      lookup_context, rate_limiter_priority);
+}
+
+size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const {
+  size_t usage = ApproximateFilterBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size(const_cast<PartitionedFilterBlockReader*>(this));
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  return usage;
+  // TODO(myabandeh): better estimation for filter_map_ size
+}
+
+// TODO(myabandeh): merge this with the same function in IndexReader
+Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
+                                                       bool pin) {
+  assert(table());
+
+  const BlockBasedTable::Rep* const rep = table()->get_rep();
+  assert(rep);
+
+  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
+
+  CachableEntry<Block> filter_block;
+
+  Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */,
+                                  &lookup_context, &filter_block,
+                                  BlockType::kFilterPartitionIndex,
+                                  ro.rate_limiter_priority);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(rep->ioptions.logger,
+                    "Error retrieving top-level filter block while trying to "
+                    "cache filter partitions: %s",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  // Before read partitions, prefetch them to avoid lots of IOs
+  assert(filter_block.GetValue());
+
+  IndexBlockIter biter;
+  const InternalKeyComparator* const comparator = internal_comparator();
+  Statistics* kNullStats = nullptr;
+  filter_block.GetValue()->NewIndexIterator(
+      comparator->user_comparator(),
+      rep->get_global_seqno(BlockType::kFilterPartitionIndex), &biter,
+      kNullStats, true /* total_order_seek */, false /* have_first_key */,
+      index_key_includes_seq(), index_value_is_full());
+  // Index partitions are assumed to be consecuitive. Prefetch them all.
+  // Read the first block offset
+  biter.SeekToFirst();
+  BlockHandle handle = biter.value().handle;
+  uint64_t prefetch_off = handle.offset();
+
+  // Read the last block's offset
+  biter.SeekToLast();
+  handle = biter.value().handle;
+  uint64_t last_off =
+      handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize;
+  uint64_t prefetch_len = last_off - prefetch_off;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
+  rep->CreateFilePrefetchBuffer(
+      0, 0, &prefetch_buffer, false /* Implicit autoreadahead */,
+      0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/);
+
+  IOOptions opts;
+  s = rep->file->PrepareIOOptions(ro, opts);
+  if (s.ok()) {
+    s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
+                                  static_cast<size_t>(prefetch_len),
+                                  ro.rate_limiter_priority);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  // After prefetch, read the partitions one by one
+  for (biter.SeekToFirst(); biter.Valid(); biter.Next()) {
+    handle = biter.value().handle;
+
+    CachableEntry<ParsedFullFilterBlock> block;
+    // TODO: Support counter batch update for partitioned index and
+    // filter blocks
+    s = table()->MaybeReadBlockAndLoadToCache(
+        prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
+        /* wait */ true, /* for_compaction */ false, &block, BlockType::kFilter,
+        nullptr /* get_context */, &lookup_context, nullptr /* contents */,
+        false);
+    if (!s.ok()) {
+      return s;
+    }
+    assert(s.ok() || block.GetValue() == nullptr);
+
+    if (block.GetValue() != nullptr) {
+      if (block.IsCached()) {
+        if (pin) {
+          filter_map_[handle.offset()] = std::move(block);
+        }
+      }
+    }
+  }
+  return biter.status();
+}
+
+const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator()
+    const {
+  assert(table());
+  assert(table()->get_rep());
+
+  return &table()->get_rep()->internal_comparator;
+}
+
+bool PartitionedFilterBlockReader::index_key_includes_seq() const {
+  assert(table());
+  assert(table()->get_rep());
+
+  return table()->get_rep()->index_key_includes_seq;
+}
+
+bool PartitionedFilterBlockReader::index_value_is_full() const {
+  assert(table());
+  assert(table()->get_rep());
+
+  return table()->get_rep()->index_value_is_full;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/partitioned_filter_block.h b/src/rocksdb/table/block_based/partitioned_filter_block.h
new file mode 100644
index 000000000..955b50739
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_filter_block.h
@@ -0,0 +1,178 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <deque>
+#include <list>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/block_based/block.h"
+#include "table/block_based/filter_block_reader_common.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/index_builder.h"
+#include "util/autovector.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+class InternalKeyComparator;
+
+class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
+ public:
+  explicit PartitionedFilterBlockBuilder(
+      const SliceTransform* prefix_extractor, bool whole_key_filtering,
+      FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval,
+      const bool use_value_delta_encoding,
+      PartitionedIndexBuilder* const p_index_builder,
+      const uint32_t partition_size);
+
+  virtual ~PartitionedFilterBlockBuilder();
+
+  void AddKey(const Slice& key) override;
+  void Add(const Slice& key) override;
+  size_t EstimateEntriesAdded() override;
+
+  virtual Slice Finish(
+      const BlockHandle& last_partition_block_handle, Status* status,
+      std::unique_ptr<const char[]>* filter_data = nullptr) override;
+
+  virtual void ResetFilterBitsBuilder() override {
+    // Previously constructed partitioned filters by
+    // this to-be-reset FiterBitsBuilder can also be
+    // cleared
+    filters.clear();
+    FullFilterBlockBuilder::ResetFilterBitsBuilder();
+  }
+
+  // For PartitionFilter, optional post-verifing the filter is done
+  // as part of PartitionFilterBlockBuilder::Finish
+  // to avoid implementation complexity of doing it elsewhere.
+  // Therefore we are skipping it in here.
+  virtual Status MaybePostVerifyFilter(
+      const Slice& /* filter_content */) override {
+    return Status::OK();
+  }
+
+ private:
+  // Filter data
+  BlockBuilder index_on_filter_block_builder_;  // top-level index builder
+  BlockBuilder
+      index_on_filter_block_builder_without_seq_;  // same for user keys
+  struct FilterEntry {
+    std::string key;
+    std::unique_ptr<const char[]> filter_data;
+    Slice filter;
+  };
+  std::deque<FilterEntry> filters;  // list of partitioned filters and keys used
+                                    // in building the index
+
+  // Set to the first non-okay status if any of the filter
+  // partitions experiences construction error.
+  // If partitioned_filters_construction_status_ is non-okay,
+  // then the whole partitioned filters should not be used.
+  Status partitioned_filters_construction_status_;
+  std::string last_filter_entry_key;
+  std::unique_ptr<const char[]> last_filter_data;
+  std::unique_ptr<IndexBuilder> value;
+  bool finishing_filters =
+      false;  // true if Finish is called once but not complete yet.
+  // The policy of when cut a filter block and Finish it
+  void MaybeCutAFilterBlock(const Slice* next_key);
+  // Currently we keep the same number of partitions for filters and indexes.
+  // This would allow for some potentioal optimizations in future. If such
+  // optimizations did not realize we can use different number of partitions and
+  // eliminate p_index_builder_
+  PartitionedIndexBuilder* const p_index_builder_;
+  // The desired number of keys per partition
+  uint32_t keys_per_partition_;
+  // The number of keys added to the last partition so far
+  uint32_t keys_added_to_partition_;
+  // According to the bits builders, how many keys/prefixes added
+  // in all the filters we have fully built
+  uint64_t total_added_in_built_;
+  BlockHandle last_encoded_handle_;
+};
+
+class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> {
+ public:
+  PartitionedFilterBlockReader(const BlockBasedTable* t,
+                               CachableEntry<Block>&& filter_block);
+
+  static std::unique_ptr<FilterBlockReader> Create(
+      const BlockBasedTable* table, const ReadOptions& ro,
+      FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+      bool pin, BlockCacheLookupContext* lookup_context);
+
+  bool KeyMayMatch(const Slice& key, const bool no_io,
+                   const Slice* const const_ikey_ptr, GetContext* get_context,
+                   BlockCacheLookupContext* lookup_context,
+                   Env::IOPriority rate_limiter_priority) override;
+  void KeysMayMatch(MultiGetRange* range, const bool no_io,
+                    BlockCacheLookupContext* lookup_context,
+                    Env::IOPriority rate_limiter_priority) override;
+
+  bool PrefixMayMatch(const Slice& prefix, const bool no_io,
+                      const Slice* const const_ikey_ptr,
+                      GetContext* get_context,
+                      BlockCacheLookupContext* lookup_context,
+                      Env::IOPriority rate_limiter_priority) override;
+  void PrefixesMayMatch(MultiGetRange* range,
+                        const SliceTransform* prefix_extractor,
+                        const bool no_io,
+                        BlockCacheLookupContext* lookup_context,
+                        Env::IOPriority rate_limiter_priority) override;
+
+  size_t ApproximateMemoryUsage() const override;
+
+ private:
+  BlockHandle GetFilterPartitionHandle(const CachableEntry<Block>& filter_block,
+                                       const Slice& entry) const;
+  Status GetFilterPartitionBlock(
+      FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle,
+      bool no_io, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context,
+      Env::IOPriority rate_limiter_priority,
+      CachableEntry<ParsedFullFilterBlock>* filter_block) const;
+
+  using FilterFunction = bool (FullFilterBlockReader::*)(
+      const Slice& slice, const bool no_io, const Slice* const const_ikey_ptr,
+      GetContext* get_context, BlockCacheLookupContext* lookup_context,
+      Env::IOPriority rate_limiter_priority);
+  bool MayMatch(const Slice& slice, bool no_io, const Slice* const_ikey_ptr,
+                GetContext* get_context,
+                BlockCacheLookupContext* lookup_context,
+                Env::IOPriority rate_limiter_priority,
+                FilterFunction filter_function) const;
+  using FilterManyFunction = void (FullFilterBlockReader::*)(
+      MultiGetRange* range, const SliceTransform* prefix_extractor,
+      const bool no_io, BlockCacheLookupContext* lookup_context,
+      Env::IOPriority rate_limiter_priority);
+  void MayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor,
+                bool no_io, BlockCacheLookupContext* lookup_context,
+                Env::IOPriority rate_limiter_priority,
+                FilterManyFunction filter_function) const;
+  void MayMatchPartition(MultiGetRange* range,
+                         const SliceTransform* prefix_extractor,
+                         BlockHandle filter_handle, bool no_io,
+                         BlockCacheLookupContext* lookup_context,
+                         Env::IOPriority rate_limiter_priority,
+                         FilterManyFunction filter_function) const;
+  Status CacheDependencies(const ReadOptions& ro, bool pin) override;
+
+  const InternalKeyComparator* internal_comparator() const;
+  bool index_key_includes_seq() const;
+  bool index_value_is_full() const;
+
+ protected:
+  // For partition blocks pinned in cache. Can be a subset of blocks
+  // in case some fail insertion on attempt to pin.
+  UnorderedMap<uint64_t, CachableEntry<ParsedFullFilterBlock>> filter_map_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/partitioned_filter_block_test.cc b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc
new file mode 100644
index 000000000..0ce50d2bc
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_filter_block_test.cc
@@ -0,0 +1,436 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/partitioned_filter_block.h"
+
+#include <map>
+
+#include "index_builder.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/format.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::map<uint64_t, std::string> blooms;
+
+class MockedBlockBasedTable : public BlockBasedTable {
+ public:
+  MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib)
+      : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) {
+    // Initialize what Open normally does as much as necessary for the test
+    rep->index_key_includes_seq = pib->seperator_is_key_plus_seq();
+    rep->index_value_is_full = !pib->get_use_value_delta_encoding();
+  }
+};
+
+class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader {
+ public:
+  MyPartitionedFilterBlockReader(BlockBasedTable* t,
+                                 CachableEntry<Block>&& filter_block)
+      : PartitionedFilterBlockReader(t, std::move(filter_block)) {
+    for (const auto& pair : blooms) {
+      const uint64_t offset = pair.first;
+      const std::string& bloom = pair.second;
+
+      assert(t);
+      assert(t->get_rep());
+      CachableEntry<ParsedFullFilterBlock> block(
+          new ParsedFullFilterBlock(
+              t->get_rep()->table_options.filter_policy.get(),
+              BlockContents(Slice(bloom))),
+          nullptr /* cache */, nullptr /* cache_handle */,
+          true /* own_value */);
+      filter_map_[offset] = std::move(block);
+    }
+  }
+};
+
+class PartitionedFilterBlockTest
+    : public testing::Test,
+      virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+  Options options_;
+  ImmutableOptions ioptions_;
+  EnvOptions env_options_;
+  BlockBasedTableOptions table_options_;
+  InternalKeyComparator icomp_;
+  std::unique_ptr<BlockBasedTable> table_;
+  std::shared_ptr<Cache> cache_;
+  int bits_per_key_;
+
+  PartitionedFilterBlockTest()
+      : ioptions_(options_),
+        env_options_(options_),
+        icomp_(options_.comparator),
+        bits_per_key_(10) {
+    table_options_.filter_policy.reset(
+        NewBloomFilterPolicy(bits_per_key_, false));
+    table_options_.format_version = GetParam();
+    table_options_.index_block_restart_interval = 3;
+  }
+
+  ~PartitionedFilterBlockTest() override {}
+
+  const std::string keys[4] = {"afoo", "bar", "box", "hello"};
+  const std::string missing_keys[2] = {"missing", "other"};
+
+  uint64_t MaxIndexSize() {
+    int num_keys = sizeof(keys) / sizeof(*keys);
+    uint64_t max_key_size = 0;
+    for (int i = 1; i < num_keys; i++) {
+      max_key_size =
+          std::max(max_key_size, static_cast<uint64_t>(keys[i].size()));
+    }
+    uint64_t max_index_size = num_keys * (max_key_size + 8 /*handle*/);
+    return max_index_size;
+  }
+
+  uint64_t MaxFilterSize() {
+    int num_keys = sizeof(keys) / sizeof(*keys);
+    // General, rough over-approximation
+    return num_keys * bits_per_key_ + (CACHE_LINE_SIZE * 8 + /*metadata*/ 5);
+  }
+
+  uint64_t last_offset = 10;
+  BlockHandle Write(const Slice& slice) {
+    BlockHandle bh(last_offset + 1, slice.size());
+    blooms[bh.offset()] = slice.ToString();
+    last_offset += bh.size();
+    return bh;
+  }
+
+  PartitionedIndexBuilder* NewIndexBuilder() {
+    const bool kValueDeltaEncoded = true;
+    return PartitionedIndexBuilder::CreateIndexBuilder(
+        &icomp_, !kValueDeltaEncoded, table_options_);
+  }
+
+  PartitionedFilterBlockBuilder* NewBuilder(
+      PartitionedIndexBuilder* const p_index_builder,
+      const SliceTransform* prefix_extractor = nullptr) {
+    assert(table_options_.block_size_deviation <= 100);
+    auto partition_size =
+        static_cast<uint32_t>(((table_options_.metadata_block_size *
+                                (100 - table_options_.block_size_deviation)) +
+                               99) /
+                              100);
+    partition_size = std::max(partition_size, static_cast<uint32_t>(1));
+    const bool kValueDeltaEncoded = true;
+    return new PartitionedFilterBlockBuilder(
+        prefix_extractor, table_options_.whole_key_filtering,
+        BloomFilterPolicy::GetBuilderFromContext(
+            FilterBuildingContext(table_options_)),
+        table_options_.index_block_restart_interval, !kValueDeltaEncoded,
+        p_index_builder, partition_size);
+  }
+
+  PartitionedFilterBlockReader* NewReader(
+      PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) {
+    BlockHandle bh;
+    Status status;
+    Slice slice;
+    std::unique_ptr<const char[]> filter_data;
+    do {
+      slice = builder->Finish(bh, &status, &filter_data);
+      bh = Write(slice);
+    } while (status.IsIncomplete());
+
+    constexpr bool skip_filters = false;
+    constexpr uint64_t file_size = 12345;
+    constexpr int level = 0;
+    constexpr bool immortal_table = false;
+    table_.reset(new MockedBlockBasedTable(
+        new BlockBasedTable::Rep(ioptions_, env_options_, table_options_,
+                                 icomp_, skip_filters, file_size, level,
+                                 immortal_table),
+        pib));
+    BlockContents contents(slice);
+    CachableEntry<Block> block(
+        new Block(std::move(contents), 0 /* read_amp_bytes_per_bit */, nullptr),
+        nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */);
+    auto reader =
+        new MyPartitionedFilterBlockReader(table_.get(), std::move(block));
+    return reader;
+  }
+
+  void VerifyReader(PartitionedFilterBlockBuilder* builder,
+                    PartitionedIndexBuilder* pib, bool empty = false) {
+    std::unique_ptr<PartitionedFilterBlockReader> reader(
+        NewReader(builder, pib));
+    Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+    // Querying added keys
+    const bool no_io = true;
+    for (auto key : keys) {
+      auto ikey = InternalKey(key, 0, ValueType::kTypeValue);
+      const Slice ikey_slice = Slice(*ikey.rep());
+      ASSERT_TRUE(reader->KeyMayMatch(key, !no_io, &ikey_slice,
+                                      /*get_context=*/nullptr,
+                                      /*lookup_context=*/nullptr,
+                                      rate_limiter_priority));
+    }
+    {
+      // querying a key twice
+      auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue);
+      const Slice ikey_slice = Slice(*ikey.rep());
+      ASSERT_TRUE(reader->KeyMayMatch(keys[0], !no_io, &ikey_slice,
+                                      /*get_context=*/nullptr,
+                                      /*lookup_context=*/nullptr,
+                                      rate_limiter_priority));
+    }
+    // querying missing keys
+    for (auto key : missing_keys) {
+      auto ikey = InternalKey(key, 0, ValueType::kTypeValue);
+      const Slice ikey_slice = Slice(*ikey.rep());
+      if (empty) {
+        ASSERT_TRUE(reader->KeyMayMatch(key, !no_io, &ikey_slice,
+                                        /*get_context=*/nullptr,
+                                        /*lookup_context=*/nullptr,
+                                        rate_limiter_priority));
+      } else {
+        // assuming a good hash function
+        ASSERT_FALSE(reader->KeyMayMatch(key, !no_io, &ikey_slice,
+                                         /*get_context=*/nullptr,
+                                         /*lookup_context=*/nullptr,
+                                         rate_limiter_priority));
+      }
+    }
+  }
+
+  int TestBlockPerKey() {
+    std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+    std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+        NewBuilder(pib.get()));
+    int i = 0;
+    builder->Add(keys[i]);
+    CutABlock(pib.get(), keys[i], keys[i + 1]);
+    i++;
+    builder->Add(keys[i]);
+    CutABlock(pib.get(), keys[i], keys[i + 1]);
+    i++;
+    builder->Add(keys[i]);
+    builder->Add(keys[i]);
+    CutABlock(pib.get(), keys[i], keys[i + 1]);
+    i++;
+    builder->Add(keys[i]);
+    CutABlock(pib.get(), keys[i]);
+
+    VerifyReader(builder.get(), pib.get());
+    return CountNumOfIndexPartitions(pib.get());
+  }
+
+  void TestBlockPerTwoKeys(const SliceTransform* prefix_extractor = nullptr) {
+    std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+    std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+        NewBuilder(pib.get(), prefix_extractor));
+    int i = 0;
+    builder->Add(keys[i]);
+    i++;
+    builder->Add(keys[i]);
+    CutABlock(pib.get(), keys[i], keys[i + 1]);
+    i++;
+    builder->Add(keys[i]);
+    builder->Add(keys[i]);
+    i++;
+    builder->Add(keys[i]);
+    CutABlock(pib.get(), keys[i]);
+
+    VerifyReader(builder.get(), pib.get(), prefix_extractor);
+  }
+
+  void TestBlockPerAllKeys() {
+    std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+    std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+        NewBuilder(pib.get()));
+    int i = 0;
+    builder->Add(keys[i]);
+    i++;
+    builder->Add(keys[i]);
+    i++;
+    builder->Add(keys[i]);
+    builder->Add(keys[i]);
+    i++;
+    builder->Add(keys[i]);
+    CutABlock(pib.get(), keys[i]);
+
+    VerifyReader(builder.get(), pib.get());
+  }
+
+  void CutABlock(PartitionedIndexBuilder* builder,
+                 const std::string& user_key) {
+    // Assuming a block is cut, add an entry to the index
+    std::string key =
+        std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep());
+    BlockHandle dont_care_block_handle(1, 1);
+    builder->AddIndexEntry(&key, nullptr, dont_care_block_handle);
+  }
+
+  void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key,
+                 const std::string& next_user_key) {
+    // Assuming a block is cut, add an entry to the index
+    std::string key =
+        std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep());
+    std::string next_key = std::string(
+        *InternalKey(next_user_key, 0, ValueType::kTypeValue).rep());
+    BlockHandle dont_care_block_handle(1, 1);
+    Slice slice = Slice(next_key.data(), next_key.size());
+    builder->AddIndexEntry(&key, &slice, dont_care_block_handle);
+  }
+
+  int CountNumOfIndexPartitions(PartitionedIndexBuilder* builder) {
+    IndexBuilder::IndexBlocks dont_care_ib;
+    BlockHandle dont_care_bh(10, 10);
+    Status s;
+    int cnt = 0;
+    do {
+      s = builder->Finish(&dont_care_ib, dont_care_bh);
+      cnt++;
+    } while (s.IsIncomplete());
+    return cnt - 1;  // 1 is 2nd level index
+  }
+};
+
+// Format versions potentially intersting to partitioning
+INSTANTIATE_TEST_CASE_P(FormatVersions, PartitionedFilterBlockTest,
+                        testing::ValuesIn(std::set<uint32_t>{
+                            2, 3, 4, test::kDefaultFormatVersion,
+                            kLatestFormatVersion}));
+
+TEST_P(PartitionedFilterBlockTest, EmptyBuilder) {
+  std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+  std::unique_ptr<PartitionedFilterBlockBuilder> builder(NewBuilder(pib.get()));
+  const bool empty = true;
+  VerifyReader(builder.get(), pib.get(), empty);
+}
+
+TEST_P(PartitionedFilterBlockTest, OneBlock) {
+  uint64_t max_index_size = MaxIndexSize();
+  for (uint64_t i = 1; i < max_index_size + 1; i++) {
+    table_options_.metadata_block_size = i;
+    TestBlockPerAllKeys();
+  }
+}
+
+TEST_P(PartitionedFilterBlockTest, TwoBlocksPerKey) {
+  uint64_t max_index_size = MaxIndexSize();
+  for (uint64_t i = 1; i < max_index_size + 1; i++) {
+    table_options_.metadata_block_size = i;
+    TestBlockPerTwoKeys();
+  }
+}
+
+// This reproduces the bug that a prefix is the same among multiple consecutive
+// blocks but the bug would add it only to the first block.
+TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) {
+  // some small number to cause partition cuts
+  table_options_.metadata_block_size = 1;
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(1));
+  std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+  std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+      NewBuilder(pib.get(), prefix_extractor.get()));
+  const std::string pkeys[3] = {"p-key10", "p-key20", "p-key30"};
+  builder->Add(pkeys[0]);
+  CutABlock(pib.get(), pkeys[0], pkeys[1]);
+  builder->Add(pkeys[1]);
+  CutABlock(pib.get(), pkeys[1], pkeys[2]);
+  builder->Add(pkeys[2]);
+  CutABlock(pib.get(), pkeys[2]);
+  std::unique_ptr<PartitionedFilterBlockReader> reader(
+      NewReader(builder.get(), pib.get()));
+  for (auto key : pkeys) {
+    auto ikey = InternalKey(key, 0, ValueType::kTypeValue);
+    const Slice ikey_slice = Slice(*ikey.rep());
+    ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key),
+                                       /*no_io=*/false, &ikey_slice,
+                                       /*get_context=*/nullptr,
+                                       /*lookup_context=*/nullptr,
+                                       Env::IO_TOTAL));
+  }
+  // Non-existent keys but with the same prefix
+  const std::string pnonkeys[4] = {"p-key9", "p-key11", "p-key21", "p-key31"};
+  for (auto key : pnonkeys) {
+    auto ikey = InternalKey(key, 0, ValueType::kTypeValue);
+    const Slice ikey_slice = Slice(*ikey.rep());
+    ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key),
+                                       /*no_io=*/false, &ikey_slice,
+                                       /*get_context=*/nullptr,
+                                       /*lookup_context=*/nullptr,
+                                       Env::IO_TOTAL));
+  }
+}
+
+// This reproduces the bug in format_version=3 that the seeking the prefix will
+// lead us to the partition before the one that has filter for the prefix.
+TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) {
+  // some small number to cause partition cuts
+  table_options_.metadata_block_size = 1;
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(2));
+  std::unique_ptr<PartitionedIndexBuilder> pib(NewIndexBuilder());
+  std::unique_ptr<PartitionedFilterBlockBuilder> builder(
+      NewBuilder(pib.get(), prefix_extractor.get()));
+  // In the bug, searching for prefix "p3" on an index with format version 3,
+  // will give the key "p3" and the partition of the keys that are <= p3, i.e.,
+  // p2-keys, where the filter for prefix "p3" does not exist.
+  const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3",
+                               "p5-key3"};
+  builder->Add(pkeys[0]);
+  CutABlock(pib.get(), pkeys[0], pkeys[1]);
+  builder->Add(pkeys[1]);
+  CutABlock(pib.get(), pkeys[1], pkeys[2]);
+  builder->Add(pkeys[2]);
+  CutABlock(pib.get(), pkeys[2], pkeys[3]);
+  builder->Add(pkeys[3]);
+  CutABlock(pib.get(), pkeys[3], pkeys[4]);
+  builder->Add(pkeys[4]);
+  CutABlock(pib.get(), pkeys[4]);
+  std::unique_ptr<PartitionedFilterBlockReader> reader(
+      NewReader(builder.get(), pib.get()));
+  Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+  for (auto key : pkeys) {
+    auto prefix = prefix_extractor->Transform(key);
+    auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue);
+    const Slice ikey_slice = Slice(*ikey.rep());
+    ASSERT_TRUE(reader->PrefixMayMatch(prefix,
+                                       /*no_io=*/false, &ikey_slice,
+                                       /*get_context=*/nullptr,
+                                       /*lookup_context=*/nullptr,
+                                       rate_limiter_priority));
+  }
+}
+
+TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) {
+  uint64_t max_index_size = MaxIndexSize();
+  for (uint64_t i = 1; i < max_index_size + 1; i++) {
+    table_options_.metadata_block_size = i;
+    TestBlockPerKey();
+  }
+}
+
+TEST_P(PartitionedFilterBlockTest, PartitionCount) {
+  int num_keys = sizeof(keys) / sizeof(*keys);
+  table_options_.metadata_block_size =
+      std::max(MaxIndexSize(), MaxFilterSize());
+  int partitions = TestBlockPerKey();
+  ASSERT_EQ(partitions, 1);
+  // A low number ensures cutting a block after each key
+  table_options_.metadata_block_size = 1;
+  partitions = TestBlockPerKey();
+  ASSERT_EQ(partitions, num_keys - 1 /* last two keys make one flush */);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/block_based/partitioned_index_iterator.cc b/src/rocksdb/table/block_based/partitioned_index_iterator.cc
new file mode 100644
index 000000000..b9bc2155a
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_index_iterator.cc
@@ -0,0 +1,163 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/partitioned_index_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+void PartitionedIndexIterator::Seek(const Slice& target) { SeekImpl(&target); }
+
+void PartitionedIndexIterator::SeekToFirst() { SeekImpl(nullptr); }
+
+void PartitionedIndexIterator::SeekImpl(const Slice* target) {
+  SavePrevIndexValue();
+
+  if (target) {
+    index_iter_->Seek(*target);
+  } else {
+    index_iter_->SeekToFirst();
+  }
+
+  if (!index_iter_->Valid()) {
+    ResetPartitionedIndexIter();
+    return;
+  }
+
+  InitPartitionedIndexBlock();
+
+  if (target) {
+    block_iter_.Seek(*target);
+  } else {
+    block_iter_.SeekToFirst();
+  }
+  FindKeyForward();
+
+  // We could check upper bound here, but that would be too complicated
+  // and checking index upper bound is less useful than for data blocks.
+
+  if (target) {
+    assert(!Valid() || (table_->get_rep()->index_key_includes_seq
+                            ? (icomp_.Compare(*target, key()) <= 0)
+                            : (user_comparator_.Compare(ExtractUserKey(*target),
+                                                        key()) <= 0)));
+  }
+}
+
+void PartitionedIndexIterator::SeekToLast() {
+  SavePrevIndexValue();
+  index_iter_->SeekToLast();
+  if (!index_iter_->Valid()) {
+    ResetPartitionedIndexIter();
+    return;
+  }
+  InitPartitionedIndexBlock();
+  block_iter_.SeekToLast();
+  FindKeyBackward();
+}
+
+void PartitionedIndexIterator::Next() {
+  assert(block_iter_points_to_real_block_);
+  block_iter_.Next();
+  FindKeyForward();
+}
+
+void PartitionedIndexIterator::Prev() {
+  assert(block_iter_points_to_real_block_);
+  block_iter_.Prev();
+
+  FindKeyBackward();
+}
+
+void PartitionedIndexIterator::InitPartitionedIndexBlock() {
+  BlockHandle partitioned_index_handle = index_iter_->value().handle;
+  if (!block_iter_points_to_real_block_ ||
+      partitioned_index_handle.offset() != prev_block_offset_ ||
+      // if previous attempt of reading the block missed cache, try again
+      block_iter_.status().IsIncomplete()) {
+    if (block_iter_points_to_real_block_) {
+      ResetPartitionedIndexIter();
+    }
+    auto* rep = table_->get_rep();
+    bool is_for_compaction =
+        lookup_context_.caller == TableReaderCaller::kCompaction;
+    // Prefetch additional data for range scans (iterators).
+    // Implicit auto readahead:
+    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
+    // Explicit user requested readahead:
+    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
+    block_prefetcher_.PrefetchIfNeeded(
+        rep, partitioned_index_handle, read_options_.readahead_size,
+        is_for_compaction, /*no_sequential_checking=*/false,
+        read_options_.rate_limiter_priority);
+    Status s;
+    table_->NewDataBlockIterator<IndexBlockIter>(
+        read_options_, partitioned_index_handle, &block_iter_,
+        BlockType::kIndex,
+        /*get_context=*/nullptr, &lookup_context_,
+        block_prefetcher_.prefetch_buffer(),
+        /*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
+    block_iter_points_to_real_block_ = true;
+    // We could check upper bound here but it is complicated to reason about
+    // upper bound in index iterator. On the other than, in large scans, index
+    // iterators are moved much less frequently compared to data blocks. So
+    // the upper bound check is skipped for simplicity.
+  }
+}
+
+void PartitionedIndexIterator::FindKeyForward() {
+  // This method's code is kept short to make it likely to be inlined.
+
+  assert(block_iter_points_to_real_block_);
+
+  if (!block_iter_.Valid()) {
+    // This is the only call site of FindBlockForward(), but it's extracted into
+    // a separate method to keep FindKeyForward() short and likely to be
+    // inlined. When transitioning to a different block, we call
+    // FindBlockForward(), which is much longer and is probably not inlined.
+    FindBlockForward();
+  } else {
+    // This is the fast path that avoids a function call.
+  }
+}
+
+void PartitionedIndexIterator::FindBlockForward() {
+  // TODO the while loop inherits from two-level-iterator. We don't know
+  // whether a block can be empty so it can be replaced by an "if".
+  do {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+    ResetPartitionedIndexIter();
+    index_iter_->Next();
+
+    if (!index_iter_->Valid()) {
+      return;
+    }
+
+    InitPartitionedIndexBlock();
+    block_iter_.SeekToFirst();
+  } while (!block_iter_.Valid());
+}
+
+void PartitionedIndexIterator::FindKeyBackward() {
+  while (!block_iter_.Valid()) {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+
+    ResetPartitionedIndexIter();
+    index_iter_->Prev();
+
+    if (index_iter_->Valid()) {
+      InitPartitionedIndexBlock();
+      block_iter_.SeekToLast();
+    } else {
+      return;
+    }
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/partitioned_index_iterator.h b/src/rocksdb/table/block_based/partitioned_index_iterator.h
new file mode 100644
index 000000000..6412fe239
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_index_iterator.h
@@ -0,0 +1,160 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_based_table_reader_impl.h"
+#include "table/block_based/block_prefetcher.h"
+#include "table/block_based/reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Iterator that iterates over partitioned index.
+// Some upper and lower bound tricks played in block based table iterators
+// could be played here, but it's too complicated to reason about index
+// keys with upper or lower bound, so we skip it for simplicity.
+class PartitionedIndexIterator : public InternalIteratorBase<IndexValue> {
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
+ public:
+  PartitionedIndexIterator(
+      const BlockBasedTable* table, const ReadOptions& read_options,
+      const InternalKeyComparator& icomp,
+      std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter,
+      TableReaderCaller caller, size_t compaction_readahead_size = 0)
+      : index_iter_(std::move(index_iter)),
+        table_(table),
+        read_options_(read_options),
+#ifndef NDEBUG
+        icomp_(icomp),
+#endif
+        user_comparator_(icomp.user_comparator()),
+        block_iter_points_to_real_block_(false),
+        lookup_context_(caller),
+        block_prefetcher_(
+            compaction_readahead_size,
+            table_->get_rep()->table_options.initial_auto_readahead_size) {
+  }
+
+  ~PartitionedIndexIterator() override {}
+
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice&) override {
+    // Shouldn't be called.
+    assert(false);
+  }
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() final override;
+  bool NextAndGetResult(IterateResult*) override {
+    assert(false);
+    return false;
+  }
+  void Prev() override;
+  bool Valid() const override {
+    return block_iter_points_to_real_block_ && block_iter_.Valid();
+  }
+  Slice key() const override {
+    assert(Valid());
+    return block_iter_.key();
+  }
+  Slice user_key() const override {
+    assert(Valid());
+    return block_iter_.user_key();
+  }
+  IndexValue value() const override {
+    assert(Valid());
+    return block_iter_.value();
+  }
+  Status status() const override {
+    // Prefix index set status to NotFound when the prefix does not exist
+    if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
+      return index_iter_->status();
+    } else if (block_iter_points_to_real_block_) {
+      return block_iter_.status();
+    } else {
+      return Status::OK();
+    }
+  }
+  inline IterBoundCheck UpperBoundCheckResult() override {
+    // Shouldn't be called.
+    assert(false);
+    return IterBoundCheck::kUnknown;
+  }
+  void SetPinnedItersMgr(PinnedIteratorsManager*) override {
+    // Shouldn't be called.
+    assert(false);
+  }
+  bool IsKeyPinned() const override {
+    // Shouldn't be called.
+    assert(false);
+    return false;
+  }
+  bool IsValuePinned() const override {
+    // Shouldn't be called.
+    assert(false);
+    return false;
+  }
+
+  void ResetPartitionedIndexIter() {
+    if (block_iter_points_to_real_block_) {
+      block_iter_.Invalidate(Status::OK());
+      block_iter_points_to_real_block_ = false;
+    }
+  }
+
+  void SavePrevIndexValue() {
+    if (block_iter_points_to_real_block_) {
+      // Reseek. If they end up with the same data block, we shouldn't re-fetch
+      // the same data block.
+      prev_block_offset_ = index_iter_->value().handle.offset();
+    }
+  }
+
+  void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
+    if (block_prefetcher_.prefetch_buffer() != nullptr &&
+        read_options_.adaptive_readahead) {
+      block_prefetcher_.prefetch_buffer()->GetReadaheadState(
+          &(readahead_file_info->index_block_readahead_info));
+    }
+  }
+
+  void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
+    if (read_options_.adaptive_readahead) {
+      block_prefetcher_.SetReadaheadState(
+          &(readahead_file_info->index_block_readahead_info));
+    }
+  }
+
+  std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
+
+ private:
+  friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test;
+  const BlockBasedTable* table_;
+  const ReadOptions read_options_;
+#ifndef NDEBUG
+  const InternalKeyComparator& icomp_;
+#endif
+  UserComparatorWrapper user_comparator_;
+  IndexBlockIter block_iter_;
+
+  // True if block_iter_ is initialized and points to the same block
+  // as index iterator.
+  bool block_iter_points_to_real_block_;
+  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
+  BlockCacheLookupContext lookup_context_;
+  BlockPrefetcher block_prefetcher_;
+
+  // If `target` is null, seek to first.
+  void SeekImpl(const Slice* target);
+
+  void InitPartitionedIndexBlock();
+  void FindKeyForward();
+  void FindBlockForward();
+  void FindKeyBackward();
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/partitioned_index_reader.cc b/src/rocksdb/table/block_based/partitioned_index_reader.cc
new file mode 100644
index 000000000..017ea4a3a
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_index_reader.cc
@@ -0,0 +1,215 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/partitioned_index_reader.h"
+
+#include "file/random_access_file_reader.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/partitioned_index_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status PartitionIndexReader::Create(
+    const BlockBasedTable* table, const ReadOptions& ro,
+    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+    bool pin, BlockCacheLookupContext* lookup_context,
+    std::unique_ptr<IndexReader>* index_reader) {
+  assert(table != nullptr);
+  assert(table->get_rep());
+  assert(!pin || prefetch);
+  assert(index_reader != nullptr);
+
+  CachableEntry<Block> index_block;
+  if (prefetch || !use_cache) {
+    const Status s =
+        ReadIndexBlock(table, prefetch_buffer, ro, use_cache,
+                       /*get_context=*/nullptr, lookup_context, &index_block);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (use_cache && !pin) {
+      index_block.Reset();
+    }
+  }
+
+  index_reader->reset(new PartitionIndexReader(table, std::move(index_block)));
+
+  return Status::OK();
+}
+
+InternalIteratorBase<IndexValue>* PartitionIndexReader::NewIterator(
+    const ReadOptions& read_options, bool /* disable_prefix_seek */,
+    IndexBlockIter* iter, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context) {
+  const bool no_io = (read_options.read_tier == kBlockCacheTier);
+  CachableEntry<Block> index_block;
+  const Status s =
+      GetOrReadIndexBlock(no_io, read_options.rate_limiter_priority,
+                          get_context, lookup_context, &index_block);
+  if (!s.ok()) {
+    if (iter != nullptr) {
+      iter->Invalidate(s);
+      return iter;
+    }
+
+    return NewErrorInternalIterator<IndexValue>(s);
+  }
+
+  const BlockBasedTable::Rep* rep = table()->rep_;
+  InternalIteratorBase<IndexValue>* it = nullptr;
+
+  Statistics* kNullStats = nullptr;
+  // Filters are already checked before seeking the index
+  if (!partition_map_.empty()) {
+    // We don't return pinned data from index blocks, so no need
+    // to set `block_contents_pinned`.
+    it = NewTwoLevelIterator(
+        new BlockBasedTable::PartitionedIndexIteratorState(table(),
+                                                           &partition_map_),
+        index_block.GetValue()->NewIndexIterator(
+            internal_comparator()->user_comparator(),
+            rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
+            index_has_first_key(), index_key_includes_seq(),
+            index_value_is_full()));
+  } else {
+    ReadOptions ro;
+    ro.fill_cache = read_options.fill_cache;
+    ro.deadline = read_options.deadline;
+    ro.io_timeout = read_options.io_timeout;
+    ro.adaptive_readahead = read_options.adaptive_readahead;
+    ro.async_io = read_options.async_io;
+    ro.rate_limiter_priority = read_options.rate_limiter_priority;
+
+    // We don't return pinned data from index blocks, so no need
+    // to set `block_contents_pinned`.
+    std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(
+        index_block.GetValue()->NewIndexIterator(
+            internal_comparator()->user_comparator(),
+            rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
+            index_has_first_key(), index_key_includes_seq(),
+            index_value_is_full()));
+
+    it = new PartitionedIndexIterator(
+        table(), ro, *internal_comparator(), std::move(index_iter),
+        lookup_context ? lookup_context->caller
+                       : TableReaderCaller::kUncategorized);
+  }
+
+  assert(it != nullptr);
+  index_block.TransferTo(it);
+
+  return it;
+
+  // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
+  // on-stack BlockIter while the state is on heap. Currentlly it assumes
+  // the first level iter is always on heap and will attempt to delete it
+  // in its destructor.
+}
+Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
+                                               bool pin) {
+  // Before read partitions, prefetch them to avoid lots of IOs
+  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
+  const BlockBasedTable::Rep* rep = table()->rep_;
+  IndexBlockIter biter;
+  BlockHandle handle;
+  Statistics* kNullStats = nullptr;
+
+  CachableEntry<Block> index_block;
+  {
+    Status s = GetOrReadIndexBlock(false /* no_io */, ro.rate_limiter_priority,
+                                   nullptr /* get_context */, &lookup_context,
+                                   &index_block);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  index_block.GetValue()->NewIndexIterator(
+      internal_comparator()->user_comparator(),
+      rep->get_global_seqno(BlockType::kIndex), &biter, kNullStats, true,
+      index_has_first_key(), index_key_includes_seq(), index_value_is_full());
+  // Index partitions are assumed to be consecuitive. Prefetch them all.
+  // Read the first block offset
+  biter.SeekToFirst();
+  if (!biter.Valid()) {
+    // Empty index.
+    return biter.status();
+  }
+  handle = biter.value().handle;
+  uint64_t prefetch_off = handle.offset();
+
+  // Read the last block's offset
+  biter.SeekToLast();
+  if (!biter.Valid()) {
+    // Empty index.
+    return biter.status();
+  }
+  handle = biter.value().handle;
+  uint64_t last_off =
+      handle.offset() + BlockBasedTable::BlockSizeWithTrailer(handle);
+  uint64_t prefetch_len = last_off - prefetch_off;
+  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
+  rep->CreateFilePrefetchBuffer(
+      0, 0, &prefetch_buffer, false /*Implicit auto readahead*/,
+      0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/);
+  IOOptions opts;
+  {
+    Status s = rep->file->PrepareIOOptions(ro, opts);
+    if (s.ok()) {
+      s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
+                                    static_cast<size_t>(prefetch_len),
+                                    ro.rate_limiter_priority);
+    }
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // For saving "all or nothing" to partition_map_
+  UnorderedMap<uint64_t, CachableEntry<Block>> map_in_progress;
+
+  // After prefetch, read the partitions one by one
+  biter.SeekToFirst();
+  size_t partition_count = 0;
+  for (; biter.Valid(); biter.Next()) {
+    handle = biter.value().handle;
+    CachableEntry<Block> block;
+    ++partition_count;
+    // TODO: Support counter batch update for partitioned index and
+    // filter blocks
+    Status s = table()->MaybeReadBlockAndLoadToCache(
+        prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
+        /*wait=*/true, /*for_compaction=*/false, &block, BlockType::kIndex,
+        /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr,
+        /*async_read=*/false);
+
+    if (!s.ok()) {
+      return s;
+    }
+    if (block.GetValue() != nullptr) {
+      // Might need to "pin" some mmap-read blocks (GetOwnValue) if some
+      // partitions are successfully compressed (cached) and some are not
+      // compressed (mmap eligible)
+      if (block.IsCached() || block.GetOwnValue()) {
+        if (pin) {
+          map_in_progress[handle.offset()] = std::move(block);
+        }
+      }
+    }
+  }
+  Status s = biter.status();
+  // Save (pin) them only if everything checks out
+  if (map_in_progress.size() == partition_count && s.ok()) {
+    std::swap(partition_map_, map_in_progress);
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/partitioned_index_reader.h b/src/rocksdb/table/block_based/partitioned_index_reader.h
new file mode 100644
index 000000000..58a7877ab
--- /dev/null
+++ b/src/rocksdb/table/block_based/partitioned_index_reader.h
@@ -0,0 +1,55 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "table/block_based/index_reader_common.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Index that allows binary search lookup in a two-level index structure.
+class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
+ public:
+  // Read the partition index from the file and create an instance for
+  // `PartitionIndexReader`.
+  // On success, index_reader will be populated; otherwise it will remain
+  // unmodified.
+  static Status Create(const BlockBasedTable* table, const ReadOptions& ro,
+                       FilePrefetchBuffer* prefetch_buffer, bool use_cache,
+                       bool prefetch, bool pin,
+                       BlockCacheLookupContext* lookup_context,
+                       std::unique_ptr<IndexReader>* index_reader);
+
+  // return a two-level iterator: first level is on the partition index
+  InternalIteratorBase<IndexValue>* NewIterator(
+      const ReadOptions& read_options, bool /* disable_prefix_seek */,
+      IndexBlockIter* iter, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context) override;
+
+  Status CacheDependencies(const ReadOptions& ro, bool pin) override;
+  size_t ApproximateMemoryUsage() const override {
+    size_t usage = ApproximateIndexBlockMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this));
+#else
+    usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
+    return usage;
+  }
+
+ private:
+  PartitionIndexReader(const BlockBasedTable* t,
+                       CachableEntry<Block>&& index_block)
+      : IndexReaderCommon(t, std::move(index_block)) {}
+
+  // For partition blocks pinned in cache. This is expected to be "all or
+  // none" so that !partition_map_.empty() can use an iterator expecting
+  // all partitions to be saved here.
+  UnorderedMap<uint64_t, CachableEntry<Block>> partition_map_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/reader_common.cc b/src/rocksdb/table/block_based/reader_common.cc
new file mode 100644
index 000000000..0ff43e9b4
--- /dev/null
+++ b/src/rocksdb/table/block_based/reader_common.cc
@@ -0,0 +1,52 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "table/block_based/reader_common.h"
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/table.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+void ForceReleaseCachedEntry(void* arg, void* h) {
+  Cache* cache = reinterpret_cast<Cache*>(arg);
+  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+  cache->Release(handle, true /* erase_if_last_ref */);
+}
+
+// WART: this is specific to block-based table
+Status VerifyBlockChecksum(ChecksumType type, const char* data,
+                           size_t block_size, const std::string& file_name,
+                           uint64_t offset) {
+  PERF_TIMER_GUARD(block_checksum_time);
+  // After block_size bytes is compression type (1 byte), which is part of
+  // the checksummed section.
+  size_t len = block_size + 1;
+  // And then the stored checksum value (4 bytes).
+  uint32_t stored = DecodeFixed32(data + len);
+
+  uint32_t computed = ComputeBuiltinChecksum(type, data, len);
+  if (stored == computed) {
+    return Status::OK();
+  } else {
+    // Unmask for people who might look for reference crc value
+    if (type == kCRC32c) {
+      stored = crc32c::Unmask(stored);
+      computed = crc32c::Unmask(computed);
+    }
+    return Status::Corruption(
+        "block checksum mismatch: stored = " + std::to_string(stored) +
+        ", computed = " + std::to_string(computed) +
+        ", type = " + std::to_string(type) + "  in " + file_name + " offset " +
+        std::to_string(offset) + " size " + std::to_string(block_size));
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/reader_common.h b/src/rocksdb/table/block_based/reader_common.h
new file mode 100644
index 000000000..5bb199f28
--- /dev/null
+++ b/src/rocksdb/table/block_based/reader_common.h
@@ -0,0 +1,38 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "rocksdb/cache.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Release the cached entry and decrement its ref count.
+extern void ForceReleaseCachedEntry(void* arg, void* h);
+
+inline MemoryAllocator* GetMemoryAllocator(
+    const BlockBasedTableOptions& table_options) {
+  return table_options.block_cache.get()
+             ? table_options.block_cache->memory_allocator()
+             : nullptr;
+}
+
+inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
+    const BlockBasedTableOptions& table_options) {
+  return table_options.block_cache_compressed.get()
+             ? table_options.block_cache_compressed->memory_allocator()
+             : nullptr;
+}
+
+// Assumes block has a trailer as in format.h. file_name and offset provided
+// for generating a diagnostic message in returned status.
+extern Status VerifyBlockChecksum(ChecksumType type, const char* data,
+                                  size_t block_size,
+                                  const std::string& file_name,
+                                  uint64_t offset);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.cc b/src/rocksdb/table/block_based/uncompression_dict_reader.cc
new file mode 100644
index 000000000..dc9a47ec7
--- /dev/null
+++ b/src/rocksdb/table/block_based/uncompression_dict_reader.cc
@@ -0,0 +1,124 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "table/block_based/uncompression_dict_reader.h"
+
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status UncompressionDictReader::Create(
+    const BlockBasedTable* table, const ReadOptions& ro,
+    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+    bool pin, BlockCacheLookupContext* lookup_context,
+    std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader) {
+  assert(table);
+  assert(table->get_rep());
+  assert(!pin || prefetch);
+  assert(uncompression_dict_reader);
+
+  CachableEntry<UncompressionDict> uncompression_dict;
+  if (prefetch || !use_cache) {
+    const Status s = ReadUncompressionDictionary(
+        table, prefetch_buffer, ro, use_cache, nullptr /* get_context */,
+        lookup_context, &uncompression_dict);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (use_cache && !pin) {
+      uncompression_dict.Reset();
+    }
+  }
+
+  uncompression_dict_reader->reset(
+      new UncompressionDictReader(table, std::move(uncompression_dict)));
+
+  return Status::OK();
+}
+
+Status UncompressionDictReader::ReadUncompressionDictionary(
+    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context,
+    CachableEntry<UncompressionDict>* uncompression_dict) {
+  // TODO: add perf counter for compression dictionary read time
+
+  assert(table);
+  assert(uncompression_dict);
+  assert(uncompression_dict->IsEmpty());
+
+  const BlockBasedTable::Rep* const rep = table->get_rep();
+  assert(rep);
+  assert(!rep->compression_dict_handle.IsNull());
+
+  const Status s = table->RetrieveBlock(
+      prefetch_buffer, read_options, rep->compression_dict_handle,
+      UncompressionDict::GetEmptyDict(), uncompression_dict,
+      BlockType::kCompressionDictionary, get_context, lookup_context,
+      /* for_compaction */ false, use_cache, /* wait_for_cache */ true,
+      /* async_read */ false);
+
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(
+        rep->ioptions.logger,
+        "Encountered error while reading data from compression dictionary "
+        "block %s",
+        s.ToString().c_str());
+  }
+
+  return s;
+}
+
+Status UncompressionDictReader::GetOrReadUncompressionDictionary(
+    FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums,
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    CachableEntry<UncompressionDict>* uncompression_dict) const {
+  assert(uncompression_dict);
+
+  if (!uncompression_dict_.IsEmpty()) {
+    uncompression_dict->SetUnownedValue(uncompression_dict_.GetValue());
+    return Status::OK();
+  }
+
+  ReadOptions read_options;
+  if (no_io) {
+    read_options.read_tier = kBlockCacheTier;
+  }
+  read_options.verify_checksums = verify_checksums;
+
+  return ReadUncompressionDictionary(table_, prefetch_buffer, read_options,
+                                     cache_dictionary_blocks(), get_context,
+                                     lookup_context, uncompression_dict);
+}
+
+size_t UncompressionDictReader::ApproximateMemoryUsage() const {
+  assert(!uncompression_dict_.GetOwnValue() ||
+         uncompression_dict_.GetValue() != nullptr);
+  size_t usage = uncompression_dict_.GetOwnValue()
+                     ? uncompression_dict_.GetValue()->ApproximateMemoryUsage()
+                     : 0;
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size(const_cast<UncompressionDictReader*>(this));
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+
+  return usage;
+}
+
+bool UncompressionDictReader::cache_dictionary_blocks() const {
+  assert(table_);
+  assert(table_->get_rep());
+
+  return table_->get_rep()->table_options.cache_index_and_filter_blocks;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_based/uncompression_dict_reader.h b/src/rocksdb/table/block_based/uncompression_dict_reader.h
new file mode 100644
index 000000000..416d25e2d
--- /dev/null
+++ b/src/rocksdb/table/block_based/uncompression_dict_reader.h
@@ -0,0 +1,60 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <cassert>
+
+#include "table/block_based/cachable_entry.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBasedTable;
+struct BlockCacheLookupContext;
+class FilePrefetchBuffer;
+class GetContext;
+struct ReadOptions;
+struct UncompressionDict;
+
+// Provides access to the uncompression dictionary regardless of whether
+// it is owned by the reader or stored in the cache, or whether it is pinned
+// in the cache or not.
+class UncompressionDictReader {
+ public:
+  static Status Create(
+      const BlockBasedTable* table, const ReadOptions& ro,
+      FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+      bool pin, BlockCacheLookupContext* lookup_context,
+      std::unique_ptr<UncompressionDictReader>* uncompression_dict_reader);
+
+  Status GetOrReadUncompressionDictionary(
+      FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums,
+      GetContext* get_context, BlockCacheLookupContext* lookup_context,
+      CachableEntry<UncompressionDict>* uncompression_dict) const;
+
+  size_t ApproximateMemoryUsage() const;
+
+ private:
+  UncompressionDictReader(const BlockBasedTable* t,
+                          CachableEntry<UncompressionDict>&& uncompression_dict)
+      : table_(t), uncompression_dict_(std::move(uncompression_dict)) {
+    assert(table_);
+  }
+
+  bool cache_dictionary_blocks() const;
+
+  static Status ReadUncompressionDictionary(
+      const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
+      const ReadOptions& read_options, bool use_cache, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context,
+      CachableEntry<UncompressionDict>* uncompression_dict);
+
+  const BlockBasedTable* table_;
+  CachableEntry<UncompressionDict> uncompression_dict_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_fetcher.cc b/src/rocksdb/table/block_fetcher.cc
new file mode 100644
index 000000000..8df0850b3
--- /dev/null
+++ b/src/rocksdb/table/block_fetcher.cc
@@ -0,0 +1,399 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_fetcher.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <string>
+
+#include "logging/logging.h"
+#include "memory/memory_allocator.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/reader_common.h"
+#include "table/format.h"
+#include "table/persistent_cache_helper.h"
+#include "util/compression.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+inline void BlockFetcher::ProcessTrailerIfPresent() {
+  if (footer_.GetBlockTrailerSize() > 0) {
+    assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize);
+    if (read_options_.verify_checksums) {
+      io_status_ = status_to_io_status(VerifyBlockChecksum(
+          footer_.checksum_type(), slice_.data(), block_size_,
+          file_->file_name(), handle_.offset()));
+      RecordTick(ioptions_.stats, BLOCK_CHECKSUM_COMPUTE_COUNT);
+    }
+    compression_type_ =
+        BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_);
+  } else {
+    // E.g. plain table or cuckoo table
+    compression_type_ = kNoCompression;
+  }
+}
+
+inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() {
+  if (cache_options_.persistent_cache &&
+      !cache_options_.persistent_cache->IsCompressed()) {
+    Status status = PersistentCacheHelper::LookupUncompressed(
+        cache_options_, handle_, contents_);
+    if (status.ok()) {
+      // uncompressed page is found for the block handle
+      return true;
+    } else {
+      // uncompressed page is not found
+      if (ioptions_.logger && !status.IsNotFound()) {
+        assert(!status.ok());
+        ROCKS_LOG_INFO(ioptions_.logger,
+                       "Error reading from persistent cache. %s",
+                       status.ToString().c_str());
+      }
+    }
+  }
+  return false;
+}
+
+inline bool BlockFetcher::TryGetFromPrefetchBuffer() {
+  if (prefetch_buffer_ != nullptr) {
+    IOOptions opts;
+    IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
+    if (io_s.ok()) {
+      bool read_from_prefetch_buffer = false;
+      if (read_options_.async_io && !for_compaction_) {
+        read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCacheAsync(
+            opts, file_, handle_.offset(), block_size_with_trailer_, &slice_,
+            &io_s, read_options_.rate_limiter_priority);
+      } else {
+        read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache(
+            opts, file_, handle_.offset(), block_size_with_trailer_, &slice_,
+            &io_s, read_options_.rate_limiter_priority, for_compaction_);
+      }
+      if (read_from_prefetch_buffer) {
+        ProcessTrailerIfPresent();
+        if (!io_status_.ok()) {
+          return true;
+        }
+        got_from_prefetch_buffer_ = true;
+        used_buf_ = const_cast<char*>(slice_.data());
+      }
+    }
+    if (!io_s.ok()) {
+      io_status_ = io_s;
+      return true;
+    }
+  }
+  return got_from_prefetch_buffer_;
+}
+
+inline bool BlockFetcher::TryGetSerializedBlockFromPersistentCache() {
+  if (cache_options_.persistent_cache &&
+      cache_options_.persistent_cache->IsCompressed()) {
+    std::unique_ptr<char[]> buf;
+    io_status_ = status_to_io_status(PersistentCacheHelper::LookupSerialized(
+        cache_options_, handle_, &buf, block_size_with_trailer_));
+    if (io_status_.ok()) {
+      heap_buf_ = CacheAllocationPtr(buf.release());
+      used_buf_ = heap_buf_.get();
+      slice_ = Slice(heap_buf_.get(), block_size_);
+      ProcessTrailerIfPresent();
+      return true;
+    } else if (!io_status_.IsNotFound() && ioptions_.logger) {
+      assert(!io_status_.ok());
+      ROCKS_LOG_INFO(ioptions_.logger,
+                     "Error reading from persistent cache. %s",
+                     io_status_.ToString().c_str());
+    }
+  }
+  return false;
+}
+
+inline void BlockFetcher::PrepareBufferForBlockFromFile() {
+  // cache miss read from device
+  if ((do_uncompress_ || ioptions_.allow_mmap_reads) &&
+      block_size_with_trailer_ < kDefaultStackBufferSize) {
+    // If we've got a small enough chunk of data, read it in to the
+    // trivially allocated stack buffer instead of needing a full malloc()
+    //
+    // `GetBlockContents()` cannot return this data as its lifetime is tied to
+    // this `BlockFetcher`'s lifetime. That is fine because this is only used
+    // in cases where we do not expect the `GetBlockContents()` result to be the
+    // same buffer we are assigning here. If we guess incorrectly, there will be
+    // a heap allocation and memcpy in `GetBlockContents()` to obtain the final
+    // result. Considering we are eliding a heap allocation here by using the
+    // stack buffer, the cost of guessing incorrectly here is one extra memcpy.
+    //
+    // When `do_uncompress_` is true, we expect the uncompression step will
+    // allocate heap memory for the final result. However this expectation will
+    // be wrong if the block turns out to already be uncompressed, which we
+    // won't know for sure until after reading it.
+    //
+    // When `ioptions_.allow_mmap_reads` is true, we do not expect the file
+    // reader to use the scratch buffer at all, but instead return a pointer
+    // into the mapped memory. This expectation will be wrong when using a
+    // file reader that does not implement mmap reads properly.
+    used_buf_ = &stack_buf_[0];
+  } else if (maybe_compressed_ && !do_uncompress_) {
+    compressed_buf_ =
+        AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_);
+    used_buf_ = compressed_buf_.get();
+  } else {
+    heap_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_);
+    used_buf_ = heap_buf_.get();
+  }
+}
+
+inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() {
+  if (io_status_.ok() && read_options_.fill_cache &&
+      cache_options_.persistent_cache &&
+      cache_options_.persistent_cache->IsCompressed()) {
+    PersistentCacheHelper::InsertSerialized(cache_options_, handle_, used_buf_,
+                                            block_size_with_trailer_);
+  }
+}
+
+inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() {
+  if (io_status_.ok() && !got_from_prefetch_buffer_ &&
+      read_options_.fill_cache && cache_options_.persistent_cache &&
+      !cache_options_.persistent_cache->IsCompressed()) {
+    // insert to uncompressed cache
+    PersistentCacheHelper::InsertUncompressed(cache_options_, handle_,
+                                              *contents_);
+  }
+}
+
+inline void BlockFetcher::CopyBufferToHeapBuf() {
+  assert(used_buf_ != heap_buf_.get());
+  heap_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_);
+  memcpy(heap_buf_.get(), used_buf_, block_size_with_trailer_);
+#ifndef NDEBUG
+  num_heap_buf_memcpy_++;
+#endif
+}
+
+inline void BlockFetcher::CopyBufferToCompressedBuf() {
+  assert(used_buf_ != compressed_buf_.get());
+  compressed_buf_ =
+      AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_);
+  memcpy(compressed_buf_.get(), used_buf_, block_size_with_trailer_);
+#ifndef NDEBUG
+  num_compressed_buf_memcpy_++;
+#endif
+}
+
+// Entering this method means the block is not compressed or do not need to be
+// uncompressed. The block can be in one of the following buffers:
+// 1. prefetch buffer if prefetch is enabled and the block is prefetched before
+// 2. stack_buf_ if block size is smaller than the stack_buf_ size and block
+//    is not compressed
+// 3. heap_buf_ if the block is not compressed
+// 4. compressed_buf_ if the block is compressed
+// 5. direct_io_buf_ if direct IO is enabled
+// After this method, if the block is compressed, it should be in
+// compressed_buf_, otherwise should be in heap_buf_.
+inline void BlockFetcher::GetBlockContents() {
+  if (slice_.data() != used_buf_) {
+    // the slice content is not the buffer provided
+    *contents_ = BlockContents(Slice(slice_.data(), block_size_));
+  } else {
+    // page can be either uncompressed or compressed, the buffer either stack
+    // or heap provided. Refer to https://github.com/facebook/rocksdb/pull/4096
+    if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) {
+      CopyBufferToHeapBuf();
+    } else if (used_buf_ == compressed_buf_.get()) {
+      if (compression_type_ == kNoCompression &&
+          memory_allocator_ != memory_allocator_compressed_) {
+        CopyBufferToHeapBuf();
+      } else {
+        heap_buf_ = std::move(compressed_buf_);
+      }
+    } else if (direct_io_buf_.get() != nullptr) {
+      if (compression_type_ == kNoCompression) {
+        CopyBufferToHeapBuf();
+      } else {
+        CopyBufferToCompressedBuf();
+        heap_buf_ = std::move(compressed_buf_);
+      }
+    }
+    *contents_ = BlockContents(std::move(heap_buf_), block_size_);
+  }
+#ifndef NDEBUG
+  contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
+#endif
+}
+
+IOStatus BlockFetcher::ReadBlockContents() {
+  if (TryGetUncompressBlockFromPersistentCache()) {
+    compression_type_ = kNoCompression;
+#ifndef NDEBUG
+    contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
+#endif  // NDEBUG
+    return IOStatus::OK();
+  }
+  if (TryGetFromPrefetchBuffer()) {
+    if (!io_status_.ok()) {
+      return io_status_;
+    }
+  } else if (!TryGetSerializedBlockFromPersistentCache()) {
+    IOOptions opts;
+    io_status_ = file_->PrepareIOOptions(read_options_, opts);
+    // Actual file read
+    if (io_status_.ok()) {
+      if (file_->use_direct_io()) {
+        PERF_TIMER_GUARD(block_read_time);
+        io_status_ = file_->Read(
+            opts, handle_.offset(), block_size_with_trailer_, &slice_, nullptr,
+            &direct_io_buf_, read_options_.rate_limiter_priority);
+        PERF_COUNTER_ADD(block_read_count, 1);
+        used_buf_ = const_cast<char*>(slice_.data());
+      } else {
+        PrepareBufferForBlockFromFile();
+        PERF_TIMER_GUARD(block_read_time);
+        io_status_ = file_->Read(opts, handle_.offset(),
+                                 block_size_with_trailer_, &slice_, used_buf_,
+                                 nullptr, read_options_.rate_limiter_priority);
+        PERF_COUNTER_ADD(block_read_count, 1);
+#ifndef NDEBUG
+        if (slice_.data() == &stack_buf_[0]) {
+          num_stack_buf_memcpy_++;
+        } else if (slice_.data() == heap_buf_.get()) {
+          num_heap_buf_memcpy_++;
+        } else if (slice_.data() == compressed_buf_.get()) {
+          num_compressed_buf_memcpy_++;
+        }
+#endif
+      }
+    }
+
+    // TODO: introduce dedicated perf counter for range tombstones
+    switch (block_type_) {
+      case BlockType::kFilter:
+      case BlockType::kFilterPartitionIndex:
+        PERF_COUNTER_ADD(filter_block_read_count, 1);
+        break;
+
+      case BlockType::kCompressionDictionary:
+        PERF_COUNTER_ADD(compression_dict_block_read_count, 1);
+        break;
+
+      case BlockType::kIndex:
+        PERF_COUNTER_ADD(index_block_read_count, 1);
+        break;
+
+      // Nothing to do here as we don't have counters for the other types.
+      default:
+        break;
+    }
+
+    PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_);
+    if (!io_status_.ok()) {
+      return io_status_;
+    }
+
+    if (slice_.size() != block_size_with_trailer_) {
+      return IOStatus::Corruption(
+          "truncated block read from " + file_->file_name() + " offset " +
+          std::to_string(handle_.offset()) + ", expected " +
+          std::to_string(block_size_with_trailer_) + " bytes, got " +
+          std::to_string(slice_.size()));
+    }
+
+    ProcessTrailerIfPresent();
+    if (io_status_.ok()) {
+      InsertCompressedBlockToPersistentCacheIfNeeded();
+    } else {
+      return io_status_;
+    }
+  }
+
+  if (do_uncompress_ && compression_type_ != kNoCompression) {
+    PERF_TIMER_GUARD(block_decompress_time);
+    // compressed page, uncompress, update cache
+    UncompressionContext context(compression_type_);
+    UncompressionInfo info(context, uncompression_dict_, compression_type_);
+    io_status_ = status_to_io_status(UncompressSerializedBlock(
+        info, slice_.data(), block_size_, contents_, footer_.format_version(),
+        ioptions_, memory_allocator_));
+#ifndef NDEBUG
+    num_heap_buf_memcpy_++;
+#endif
+    compression_type_ = kNoCompression;
+  } else {
+    GetBlockContents();
+  }
+
+  InsertUncompressedBlockToPersistentCacheIfNeeded();
+
+  return io_status_;
+}
+
+IOStatus BlockFetcher::ReadAsyncBlockContents() {
+  if (TryGetUncompressBlockFromPersistentCache()) {
+    compression_type_ = kNoCompression;
+#ifndef NDEBUG
+    contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
+#endif  // NDEBUG
+    return IOStatus::OK();
+  } else if (!TryGetSerializedBlockFromPersistentCache()) {
+    assert(prefetch_buffer_ != nullptr);
+    if (!for_compaction_) {
+      IOOptions opts;
+      IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      io_s = status_to_io_status(prefetch_buffer_->PrefetchAsync(
+          opts, file_, handle_.offset(), block_size_with_trailer_, &slice_));
+      if (io_s.IsTryAgain()) {
+        return io_s;
+      }
+      if (io_s.ok()) {
+        // Data Block is already in prefetch.
+        got_from_prefetch_buffer_ = true;
+        ProcessTrailerIfPresent();
+        if (!io_status_.ok()) {
+          return io_status_;
+        }
+        used_buf_ = const_cast<char*>(slice_.data());
+
+        if (do_uncompress_ && compression_type_ != kNoCompression) {
+          PERF_TIMER_GUARD(block_decompress_time);
+          // compressed page, uncompress, update cache
+          UncompressionContext context(compression_type_);
+          UncompressionInfo info(context, uncompression_dict_,
+                                 compression_type_);
+          io_status_ = status_to_io_status(UncompressSerializedBlock(
+              info, slice_.data(), block_size_, contents_,
+              footer_.format_version(), ioptions_, memory_allocator_));
+#ifndef NDEBUG
+          num_heap_buf_memcpy_++;
+#endif
+          compression_type_ = kNoCompression;
+        } else {
+          GetBlockContents();
+        }
+        InsertUncompressedBlockToPersistentCacheIfNeeded();
+        return io_status_;
+      }
+    }
+    // Fallback to sequential reading of data blocks in case of io_s returns
+    // error or for_compaction_is true.
+    return ReadBlockContents();
+  }
+  return io_status_;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_fetcher.h b/src/rocksdb/table/block_fetcher.h
new file mode 100644
index 000000000..72adced30
--- /dev/null
+++ b/src/rocksdb/table/block_fetcher.h
@@ -0,0 +1,142 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "memory/memory_allocator.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_type.h"
+#include "table/format.h"
+#include "table/persistent_cache_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Retrieves a single block of a given file. Utilizes the prefetch buffer and/or
+// persistent cache provided (if any) to try to avoid reading from the file
+// directly. Note that both the prefetch buffer and the persistent cache are
+// optional; also, note that the persistent cache may be configured to store
+// either compressed or uncompressed blocks.
+//
+// If the retrieved block is compressed and the do_uncompress flag is set,
+// BlockFetcher uncompresses the block (using the uncompression dictionary,
+// if provided, to prime the compression algorithm), and returns the resulting
+// uncompressed block data. Otherwise, it returns the original block.
+//
+// Two read options affect the behavior of BlockFetcher: if verify_checksums is
+// true, the checksum of the (original) block is checked; if fill_cache is true,
+// the block is added to the persistent cache if needed.
+//
+// Memory for uncompressed and compressed blocks is allocated as needed
+// using memory_allocator and memory_allocator_compressed, respectively
+// (if provided; otherwise, the default allocator is used).
+
+class BlockFetcher {
+ public:
+  BlockFetcher(RandomAccessFileReader* file,
+               FilePrefetchBuffer* prefetch_buffer,
+               const Footer& footer /* ref retained */,
+               const ReadOptions& read_options,
+               const BlockHandle& handle /* ref retained */,
+               BlockContents* contents,
+               const ImmutableOptions& ioptions /* ref retained */,
+               bool do_uncompress, bool maybe_compressed, BlockType block_type,
+               const UncompressionDict& uncompression_dict /* ref retained */,
+               const PersistentCacheOptions& cache_options /* ref retained */,
+               MemoryAllocator* memory_allocator = nullptr,
+               MemoryAllocator* memory_allocator_compressed = nullptr,
+               bool for_compaction = false)
+      : file_(file),
+        prefetch_buffer_(prefetch_buffer),
+        footer_(footer),
+        read_options_(read_options),
+        handle_(handle),
+        contents_(contents),
+        ioptions_(ioptions),
+        do_uncompress_(do_uncompress),
+        maybe_compressed_(maybe_compressed),
+        block_type_(block_type),
+        block_size_(static_cast<size_t>(handle_.size())),
+        block_size_with_trailer_(block_size_ + footer.GetBlockTrailerSize()),
+        uncompression_dict_(uncompression_dict),
+        cache_options_(cache_options),
+        memory_allocator_(memory_allocator),
+        memory_allocator_compressed_(memory_allocator_compressed),
+        for_compaction_(for_compaction) {
+    io_status_.PermitUncheckedError();  // TODO(AR) can we improve on this?
+  }
+
+  IOStatus ReadBlockContents();
+  IOStatus ReadAsyncBlockContents();
+
+  inline CompressionType get_compression_type() const {
+    return compression_type_;
+  }
+  inline size_t GetBlockSizeWithTrailer() const {
+    return block_size_with_trailer_;
+  }
+
+#ifndef NDEBUG
+  int TEST_GetNumStackBufMemcpy() const { return num_stack_buf_memcpy_; }
+  int TEST_GetNumHeapBufMemcpy() const { return num_heap_buf_memcpy_; }
+  int TEST_GetNumCompressedBufMemcpy() const {
+    return num_compressed_buf_memcpy_;
+  }
+
+#endif
+ private:
+#ifndef NDEBUG
+  int num_stack_buf_memcpy_ = 0;
+  int num_heap_buf_memcpy_ = 0;
+  int num_compressed_buf_memcpy_ = 0;
+
+#endif
+  static const uint32_t kDefaultStackBufferSize = 5000;
+
+  RandomAccessFileReader* file_;
+  FilePrefetchBuffer* prefetch_buffer_;
+  const Footer& footer_;
+  const ReadOptions read_options_;
+  const BlockHandle& handle_;
+  BlockContents* contents_;
+  const ImmutableOptions& ioptions_;
+  const bool do_uncompress_;
+  const bool maybe_compressed_;
+  const BlockType block_type_;
+  const size_t block_size_;
+  const size_t block_size_with_trailer_;
+  const UncompressionDict& uncompression_dict_;
+  const PersistentCacheOptions& cache_options_;
+  MemoryAllocator* memory_allocator_;
+  MemoryAllocator* memory_allocator_compressed_;
+  IOStatus io_status_;
+  Slice slice_;
+  char* used_buf_ = nullptr;
+  AlignedBuf direct_io_buf_;
+  CacheAllocationPtr heap_buf_;
+  CacheAllocationPtr compressed_buf_;
+  char stack_buf_[kDefaultStackBufferSize];
+  bool got_from_prefetch_buffer_ = false;
+  CompressionType compression_type_;
+  bool for_compaction_ = false;
+
+  // return true if found
+  bool TryGetUncompressBlockFromPersistentCache();
+  // return true if found
+  bool TryGetFromPrefetchBuffer();
+  bool TryGetSerializedBlockFromPersistentCache();
+  void PrepareBufferForBlockFromFile();
+  // Copy content from used_buf_ to new heap_buf_.
+  void CopyBufferToHeapBuf();
+  // Copy content from used_buf_ to new compressed_buf_.
+  void CopyBufferToCompressedBuf();
+  void GetBlockContents();
+  void InsertCompressedBlockToPersistentCacheIfNeeded();
+  void InsertUncompressedBlockToPersistentCacheIfNeeded();
+  void ProcessTrailerIfPresent();
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/block_fetcher_test.cc b/src/rocksdb/table/block_fetcher_test.cc
new file mode 100644
index 000000000..82caee282
--- /dev/null
+++ b/src/rocksdb/table/block_fetcher_test.cc
@@ -0,0 +1,521 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_fetcher.h"
+
+#include "db/table_properties_collector.h"
+#include "file/file_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "table/block_based/binary_search_index_reader.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/format.h"
+#include "test_util/testharness.h"
+#include "utilities/memory_allocators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+struct MemcpyStats {
+  int num_stack_buf_memcpy;
+  int num_heap_buf_memcpy;
+  int num_compressed_buf_memcpy;
+};
+
+struct BufAllocationStats {
+  int num_heap_buf_allocations;
+  int num_compressed_buf_allocations;
+};
+
+struct TestStats {
+  MemcpyStats memcpy_stats;
+  BufAllocationStats buf_allocation_stats;
+};
+
+class BlockFetcherTest : public testing::Test {
+ public:
+  enum class Mode {
+    kBufferedRead = 0,
+    kBufferedMmap,
+    kDirectRead,
+    kNumModes,
+  };
+  // use NumModes as array size to avoid "size of array '...' has non-integral
+  // type" errors.
+  const static int NumModes = static_cast<int>(Mode::kNumModes);
+
+ protected:
+  void SetUp() override {
+    SetupSyncPointsToMockDirectIO();
+    test_dir_ = test::PerThreadDBPath("block_fetcher_test");
+    env_ = Env::Default();
+    fs_ = FileSystem::Default();
+    ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+  }
+
+  void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+
+  void AssertSameBlock(const std::string& block1, const std::string& block2) {
+    ASSERT_EQ(block1, block2);
+  }
+
+  // Creates a table with kv pairs (i, i) where i ranges from 0 to 9, inclusive.
+  void CreateTable(const std::string& table_name,
+                   const CompressionType& compression_type) {
+    std::unique_ptr<WritableFileWriter> writer;
+    NewFileWriter(table_name, &writer);
+
+    // Create table builder.
+    ImmutableOptions ioptions(options_);
+    InternalKeyComparator comparator(options_.comparator);
+    ColumnFamilyOptions cf_options(options_);
+    MutableCFOptions moptions(cf_options);
+    IntTblPropCollectorFactories factories;
+    std::unique_ptr<TableBuilder> table_builder(table_factory_.NewTableBuilder(
+        TableBuilderOptions(ioptions, moptions, comparator, &factories,
+                            compression_type, CompressionOptions(),
+                            0 /* column_family_id */, kDefaultColumnFamilyName,
+                            -1 /* level */),
+        writer.get()));
+
+    // Build table.
+    for (int i = 0; i < 9; i++) {
+      std::string key = ToInternalKey(std::to_string(i));
+      // Append "00000000" to string value to enhance compression ratio
+      std::string value = "00000000" + std::to_string(i);
+      table_builder->Add(key, value);
+    }
+    ASSERT_OK(table_builder->Finish());
+  }
+
+  void FetchIndexBlock(const std::string& table_name,
+                       CountedMemoryAllocator* heap_buf_allocator,
+                       CountedMemoryAllocator* compressed_buf_allocator,
+                       MemcpyStats* memcpy_stats, BlockContents* index_block,
+                       std::string* result) {
+    FileOptions fopt(options_);
+    std::unique_ptr<RandomAccessFileReader> file;
+    NewFileReader(table_name, fopt, &file);
+
+    // Get handle of the index block.
+    Footer footer;
+    ReadFooter(file.get(), &footer);
+    const BlockHandle& index_handle = footer.index_handle();
+
+    CompressionType compression_type;
+    FetchBlock(file.get(), index_handle, BlockType::kIndex,
+               false /* compressed */, false /* do_uncompress */,
+               heap_buf_allocator, compressed_buf_allocator, index_block,
+               memcpy_stats, &compression_type);
+    ASSERT_EQ(compression_type, CompressionType::kNoCompression);
+    result->assign(index_block->data.ToString());
+  }
+
+  // Fetches the first data block in both direct IO and non-direct IO mode.
+  //
+  // compressed: whether the data blocks are compressed;
+  // do_uncompress: whether the data blocks should be uncompressed on fetching.
+  // compression_type: the expected compression type.
+  //
+  // Expects:
+  // Block contents are the same.
+  // Bufferr allocation and memory copy statistics are expected.
+  void TestFetchDataBlock(
+      const std::string& table_name_prefix, bool compressed, bool do_uncompress,
+      std::array<TestStats, NumModes> expected_stats_by_mode) {
+    for (CompressionType compression_type : GetSupportedCompressions()) {
+      bool do_compress = compression_type != kNoCompression;
+      if (compressed != do_compress) continue;
+      std::string compression_type_str =
+          CompressionTypeToString(compression_type);
+
+      std::string table_name = table_name_prefix + compression_type_str;
+      CreateTable(table_name, compression_type);
+
+      CompressionType expected_compression_type_after_fetch =
+          (compressed && !do_uncompress) ? compression_type : kNoCompression;
+
+      BlockContents blocks[NumModes];
+      std::string block_datas[NumModes];
+      MemcpyStats memcpy_stats[NumModes];
+      CountedMemoryAllocator heap_buf_allocators[NumModes];
+      CountedMemoryAllocator compressed_buf_allocators[NumModes];
+      for (int i = 0; i < NumModes; ++i) {
+        SetMode(static_cast<Mode>(i));
+        FetchFirstDataBlock(table_name, compressed, do_uncompress,
+                            expected_compression_type_after_fetch,
+                            &heap_buf_allocators[i],
+                            &compressed_buf_allocators[i], &blocks[i],
+                            &block_datas[i], &memcpy_stats[i]);
+      }
+
+      for (int i = 0; i < NumModes - 1; ++i) {
+        AssertSameBlock(block_datas[i], block_datas[i + 1]);
+      }
+
+      // Check memcpy and buffer allocation statistics.
+      for (int i = 0; i < NumModes; ++i) {
+        const TestStats& expected_stats = expected_stats_by_mode[i];
+
+        ASSERT_EQ(memcpy_stats[i].num_stack_buf_memcpy,
+                  expected_stats.memcpy_stats.num_stack_buf_memcpy);
+        ASSERT_EQ(memcpy_stats[i].num_heap_buf_memcpy,
+                  expected_stats.memcpy_stats.num_heap_buf_memcpy);
+        ASSERT_EQ(memcpy_stats[i].num_compressed_buf_memcpy,
+                  expected_stats.memcpy_stats.num_compressed_buf_memcpy);
+
+        if (kXpressCompression == compression_type) {
+          // XPRESS allocates memory internally, thus does not support for
+          // custom allocator verification
+          continue;
+        } else {
+          ASSERT_EQ(
+              heap_buf_allocators[i].GetNumAllocations(),
+              expected_stats.buf_allocation_stats.num_heap_buf_allocations);
+          ASSERT_EQ(compressed_buf_allocators[i].GetNumAllocations(),
+                    expected_stats.buf_allocation_stats
+                        .num_compressed_buf_allocations);
+
+          // The allocated buffers are not deallocated until
+          // the block content is deleted.
+          ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0);
+          ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0);
+          blocks[i].allocation.reset();
+          ASSERT_EQ(
+              heap_buf_allocators[i].GetNumDeallocations(),
+              expected_stats.buf_allocation_stats.num_heap_buf_allocations);
+          ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(),
+                    expected_stats.buf_allocation_stats
+                        .num_compressed_buf_allocations);
+        }
+      }
+    }
+  }
+
+  void SetMode(Mode mode) {
+    switch (mode) {
+      case Mode::kBufferedRead:
+        options_.use_direct_reads = false;
+        options_.allow_mmap_reads = false;
+        break;
+      case Mode::kBufferedMmap:
+        options_.use_direct_reads = false;
+        options_.allow_mmap_reads = true;
+        break;
+      case Mode::kDirectRead:
+        options_.use_direct_reads = true;
+        options_.allow_mmap_reads = false;
+        break;
+      case Mode::kNumModes:
+        assert(false);
+    }
+  }
+
+ private:
+  std::string test_dir_;
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  BlockBasedTableFactory table_factory_;
+  Options options_;
+
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+
+  void WriteToFile(const std::string& content, const std::string& filename) {
+    std::unique_ptr<FSWritableFile> f;
+    ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr));
+    ASSERT_OK(f->Append(content, IOOptions(), nullptr));
+    ASSERT_OK(f->Close(IOOptions(), nullptr));
+  }
+
+  void NewFileWriter(const std::string& filename,
+                     std::unique_ptr<WritableFileWriter>* writer) {
+    std::string path = Path(filename);
+    FileOptions file_options;
+    ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), path,
+                                         file_options, writer, nullptr));
+  }
+
+  void NewFileReader(const std::string& filename, const FileOptions& opt,
+                     std::unique_ptr<RandomAccessFileReader>* reader) {
+    std::string path = Path(filename);
+    std::unique_ptr<FSRandomAccessFile> f;
+    ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr));
+    reader->reset(new RandomAccessFileReader(std::move(f), path,
+                                             env_->GetSystemClock().get()));
+  }
+
+  void NewTableReader(const ImmutableOptions& ioptions,
+                      const FileOptions& foptions,
+                      const InternalKeyComparator& comparator,
+                      const std::string& table_name,
+                      std::unique_ptr<BlockBasedTable>* table) {
+    std::unique_ptr<RandomAccessFileReader> file;
+    NewFileReader(table_name, foptions, &file);
+
+    uint64_t file_size = 0;
+    ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size));
+
+    std::unique_ptr<TableReader> table_reader;
+    ReadOptions ro;
+    const auto* table_options =
+        table_factory_.GetOptions<BlockBasedTableOptions>();
+    ASSERT_NE(table_options, nullptr);
+    ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options,
+                                    comparator, std::move(file), file_size,
+                                    &table_reader));
+
+    table->reset(reinterpret_cast<BlockBasedTable*>(table_reader.release()));
+  }
+
+  std::string ToInternalKey(const std::string& key) {
+    InternalKey internal_key(key, 0, ValueType::kTypeValue);
+    return internal_key.Encode().ToString();
+  }
+
+  void ReadFooter(RandomAccessFileReader* file, Footer* footer) {
+    uint64_t file_size = 0;
+    ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size));
+    IOOptions opts;
+    ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* prefetch_buffer */,
+                                 file_size, footer,
+                                 kBlockBasedTableMagicNumber));
+  }
+
+  // NOTE: compression_type returns the compression type of the fetched block
+  // contents, so if the block is fetched and uncompressed, then it's
+  // kNoCompression.
+  void FetchBlock(RandomAccessFileReader* file, const BlockHandle& block,
+                  BlockType block_type, bool compressed, bool do_uncompress,
+                  MemoryAllocator* heap_buf_allocator,
+                  MemoryAllocator* compressed_buf_allocator,
+                  BlockContents* contents, MemcpyStats* stats,
+                  CompressionType* compresstion_type) {
+    ImmutableOptions ioptions(options_);
+    ReadOptions roptions;
+    PersistentCacheOptions persistent_cache_options;
+    Footer footer;
+    ReadFooter(file, &footer);
+    std::unique_ptr<BlockFetcher> fetcher(new BlockFetcher(
+        file, nullptr /* prefetch_buffer */, footer, roptions, block, contents,
+        ioptions, do_uncompress, compressed, block_type,
+        UncompressionDict::GetEmptyDict(), persistent_cache_options,
+        heap_buf_allocator, compressed_buf_allocator));
+
+    ASSERT_OK(fetcher->ReadBlockContents());
+
+    stats->num_stack_buf_memcpy = fetcher->TEST_GetNumStackBufMemcpy();
+    stats->num_heap_buf_memcpy = fetcher->TEST_GetNumHeapBufMemcpy();
+    stats->num_compressed_buf_memcpy =
+        fetcher->TEST_GetNumCompressedBufMemcpy();
+
+    *compresstion_type = fetcher->get_compression_type();
+  }
+
+  // NOTE: expected_compression_type is the expected compression
+  // type of the fetched block content, if the block is uncompressed,
+  // then the expected compression type is kNoCompression.
+  void FetchFirstDataBlock(const std::string& table_name, bool compressed,
+                           bool do_uncompress,
+                           CompressionType expected_compression_type,
+                           MemoryAllocator* heap_buf_allocator,
+                           MemoryAllocator* compressed_buf_allocator,
+                           BlockContents* block, std::string* result,
+                           MemcpyStats* memcpy_stats) {
+    ImmutableOptions ioptions(options_);
+    InternalKeyComparator comparator(options_.comparator);
+    FileOptions foptions(options_);
+
+    // Get block handle for the first data block.
+    std::unique_ptr<BlockBasedTable> table;
+    NewTableReader(ioptions, foptions, comparator, table_name, &table);
+
+    std::unique_ptr<BlockBasedTable::IndexReader> index_reader;
+    ReadOptions ro;
+    ASSERT_OK(BinarySearchIndexReader::Create(
+        table.get(), ro, nullptr /* prefetch_buffer */, false /* use_cache */,
+        false /* prefetch */, false /* pin */, nullptr /* lookup_context */,
+        &index_reader));
+
+    std::unique_ptr<InternalIteratorBase<IndexValue>> iter(
+        index_reader->NewIterator(
+            ReadOptions(), false /* disable_prefix_seek */, nullptr /* iter */,
+            nullptr /* get_context */, nullptr /* lookup_context */));
+    ASSERT_OK(iter->status());
+    iter->SeekToFirst();
+    BlockHandle first_block_handle = iter->value().handle;
+
+    // Fetch first data block.
+    std::unique_ptr<RandomAccessFileReader> file;
+    NewFileReader(table_name, foptions, &file);
+    CompressionType compression_type;
+    FetchBlock(file.get(), first_block_handle, BlockType::kData, compressed,
+               do_uncompress, heap_buf_allocator, compressed_buf_allocator,
+               block, memcpy_stats, &compression_type);
+    ASSERT_EQ(compression_type, expected_compression_type);
+    result->assign(block->data.ToString());
+  }
+};
+
+// Skip the following tests in lite mode since direct I/O is unsupported.
+#ifndef ROCKSDB_LITE
+
+// Fetch index block under both direct IO and non-direct IO.
+// Expects:
+// the index block contents are the same for both read modes.
+TEST_F(BlockFetcherTest, FetchIndexBlock) {
+  for (CompressionType compression : GetSupportedCompressions()) {
+    std::string table_name =
+        "FetchIndexBlock" + CompressionTypeToString(compression);
+    CreateTable(table_name, compression);
+
+    CountedMemoryAllocator allocator;
+    MemcpyStats memcpy_stats;
+    BlockContents indexes[NumModes];
+    std::string index_datas[NumModes];
+    for (int i = 0; i < NumModes; ++i) {
+      SetMode(static_cast<Mode>(i));
+      FetchIndexBlock(table_name, &allocator, &allocator, &memcpy_stats,
+                      &indexes[i], &index_datas[i]);
+    }
+    for (int i = 0; i < NumModes - 1; ++i) {
+      AssertSameBlock(index_datas[i], index_datas[i + 1]);
+    }
+  }
+}
+
+// Data blocks are not compressed,
+// fetch data block under direct IO, mmap IO,and non-direct IO.
+// Expects:
+// 1. in non-direct IO mode, allocate a heap buffer and memcpy the block
+//    into the buffer;
+// 2. in direct IO mode, allocate a heap buffer and memcpy from the
+//    direct IO buffer to the heap buffer.
+TEST_F(BlockFetcherTest, FetchUncompressedDataBlock) {
+  TestStats expected_non_mmap_stats = {
+      {
+          0 /* num_stack_buf_memcpy */,
+          1 /* num_heap_buf_memcpy */,
+          0 /* num_compressed_buf_memcpy */,
+      },
+      {
+          1 /* num_heap_buf_allocations */,
+          0 /* num_compressed_buf_allocations */,
+      }};
+  TestStats expected_mmap_stats = {{
+                                       0 /* num_stack_buf_memcpy */,
+                                       0 /* num_heap_buf_memcpy */,
+                                       0 /* num_compressed_buf_memcpy */,
+                                   },
+                                   {
+                                       0 /* num_heap_buf_allocations */,
+                                       0 /* num_compressed_buf_allocations */,
+                                   }};
+  std::array<TestStats, NumModes> expected_stats_by_mode{{
+      expected_non_mmap_stats /* kBufferedRead */,
+      expected_mmap_stats /* kBufferedMmap */,
+      expected_non_mmap_stats /* kDirectRead */,
+  }};
+  TestFetchDataBlock("FetchUncompressedDataBlock", false, false,
+                     expected_stats_by_mode);
+}
+
+// Data blocks are compressed,
+// fetch data block under both direct IO and non-direct IO,
+// but do not uncompress.
+// Expects:
+// 1. in non-direct IO mode, allocate a compressed buffer and memcpy the block
+//    into the buffer;
+// 2. in direct IO mode, allocate a compressed buffer and memcpy from the
+//    direct IO buffer to the compressed buffer.
+TEST_F(BlockFetcherTest, FetchCompressedDataBlock) {
+  TestStats expected_non_mmap_stats = {
+      {
+          0 /* num_stack_buf_memcpy */,
+          0 /* num_heap_buf_memcpy */,
+          1 /* num_compressed_buf_memcpy */,
+      },
+      {
+          0 /* num_heap_buf_allocations */,
+          1 /* num_compressed_buf_allocations */,
+      }};
+  TestStats expected_mmap_stats = {{
+                                       0 /* num_stack_buf_memcpy */,
+                                       0 /* num_heap_buf_memcpy */,
+                                       0 /* num_compressed_buf_memcpy */,
+                                   },
+                                   {
+                                       0 /* num_heap_buf_allocations */,
+                                       0 /* num_compressed_buf_allocations */,
+                                   }};
+  std::array<TestStats, NumModes> expected_stats_by_mode{{
+      expected_non_mmap_stats /* kBufferedRead */,
+      expected_mmap_stats /* kBufferedMmap */,
+      expected_non_mmap_stats /* kDirectRead */,
+  }};
+  TestFetchDataBlock("FetchCompressedDataBlock", true, false,
+                     expected_stats_by_mode);
+}
+
+// Data blocks are compressed,
+// fetch and uncompress data block under both direct IO and non-direct IO.
+// Expects:
+// 1. in non-direct IO mode, since the block is small, so it's first memcpyed
+//    to the stack buffer, then a heap buffer is allocated and the block is
+//    uncompressed into the heap.
+// 2. in direct IO mode mode, allocate a heap buffer, then directly uncompress
+//    and memcpy from the direct IO buffer to the heap buffer.
+TEST_F(BlockFetcherTest, FetchAndUncompressCompressedDataBlock) {
+  TestStats expected_buffered_read_stats = {
+      {
+          1 /* num_stack_buf_memcpy */,
+          1 /* num_heap_buf_memcpy */,
+          0 /* num_compressed_buf_memcpy */,
+      },
+      {
+          1 /* num_heap_buf_allocations */,
+          0 /* num_compressed_buf_allocations */,
+      }};
+  TestStats expected_mmap_stats = {{
+                                       0 /* num_stack_buf_memcpy */,
+                                       1 /* num_heap_buf_memcpy */,
+                                       0 /* num_compressed_buf_memcpy */,
+                                   },
+                                   {
+                                       1 /* num_heap_buf_allocations */,
+                                       0 /* num_compressed_buf_allocations */,
+                                   }};
+  TestStats expected_direct_read_stats = {
+      {
+          0 /* num_stack_buf_memcpy */,
+          1 /* num_heap_buf_memcpy */,
+          0 /* num_compressed_buf_memcpy */,
+      },
+      {
+          1 /* num_heap_buf_allocations */,
+          0 /* num_compressed_buf_allocations */,
+      }};
+  std::array<TestStats, NumModes> expected_stats_by_mode{{
+      expected_buffered_read_stats,
+      expected_mmap_stats,
+      expected_direct_read_stats,
+  }};
+  TestFetchDataBlock("FetchAndUncompressCompressedDataBlock", true, true,
+                     expected_stats_by_mode);
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/cleanable_test.cc b/src/rocksdb/table/cleanable_test.cc
new file mode 100644
index 000000000..b58eb7dc6
--- /dev/null
+++ b/src/rocksdb/table/cleanable_test.cc
@@ -0,0 +1,390 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/cleanable.h"
+
+#include <gtest/gtest.h>
+
+#include <functional>
+
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/perf_context.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CleanableTest : public testing::Test {};
+
+// Use this to keep track of the cleanups that were actually performed
+void Multiplier(void* arg1, void* arg2) {
+  int* res = reinterpret_cast<int*>(arg1);
+  int* num = reinterpret_cast<int*>(arg2);
+  *res *= *num;
+}
+
+// the first Cleanup is on stack and the rest on heap, so test with both cases
+TEST_F(CleanableTest, Register) {
+  int n2 = 2, n3 = 3;
+  int res = 1;
+  { Cleanable c1; }
+  // ~Cleanable
+  ASSERT_EQ(1, res);
+
+  res = 1;
+  {
+    Cleanable c1;
+    c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+  }
+  // ~Cleanable
+  ASSERT_EQ(2, res);
+
+  res = 1;
+  {
+    Cleanable c1;
+    c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+    c1.RegisterCleanup(Multiplier, &res, &n3);  // res = 2 * 3;
+  }
+  // ~Cleanable
+  ASSERT_EQ(6, res);
+
+  // Test the Reset does cleanup
+  res = 1;
+  {
+    Cleanable c1;
+    c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+    c1.RegisterCleanup(Multiplier, &res, &n3);  // res = 2 * 3;
+    c1.Reset();
+    ASSERT_EQ(6, res);
+  }
+  // ~Cleanable
+  ASSERT_EQ(6, res);
+
+  // Test Clenable is usable after Reset
+  res = 1;
+  {
+    Cleanable c1;
+    c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+    c1.Reset();
+    ASSERT_EQ(2, res);
+    c1.RegisterCleanup(Multiplier, &res, &n3);  // res = 2 * 3;
+  }
+  // ~Cleanable
+  ASSERT_EQ(6, res);
+}
+
+// the first Cleanup is on stack and the rest on heap,
+// so test all the combinations of them
+TEST_F(CleanableTest, Delegation) {
+  int n2 = 2, n3 = 3, n5 = 5, n7 = 7;
+  int res = 1;
+  {
+    Cleanable c2;
+    {
+      Cleanable c1;
+      c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+      c1.DelegateCleanupsTo(&c2);
+    }
+    // ~Cleanable
+    ASSERT_EQ(1, res);
+  }
+  // ~Cleanable
+  ASSERT_EQ(2, res);
+
+  res = 1;
+  {
+    Cleanable c2;
+    {
+      Cleanable c1;
+      c1.DelegateCleanupsTo(&c2);
+    }
+    // ~Cleanable
+    ASSERT_EQ(1, res);
+  }
+  // ~Cleanable
+  ASSERT_EQ(1, res);
+
+  res = 1;
+  {
+    Cleanable c2;
+    {
+      Cleanable c1;
+      c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+      c1.RegisterCleanup(Multiplier, &res, &n3);  // res = 2 * 3;
+      c1.DelegateCleanupsTo(&c2);
+    }
+    // ~Cleanable
+    ASSERT_EQ(1, res);
+  }
+  // ~Cleanable
+  ASSERT_EQ(6, res);
+
+  res = 1;
+  {
+    Cleanable c2;
+    c2.RegisterCleanup(Multiplier, &res, &n5);  // res = 5;
+    {
+      Cleanable c1;
+      c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+      c1.RegisterCleanup(Multiplier, &res, &n3);  // res = 2 * 3;
+      c1.DelegateCleanupsTo(&c2);                 // res = 2 * 3 * 5;
+    }
+    // ~Cleanable
+    ASSERT_EQ(1, res);
+  }
+  // ~Cleanable
+  ASSERT_EQ(30, res);
+
+  res = 1;
+  {
+    Cleanable c2;
+    c2.RegisterCleanup(Multiplier, &res, &n5);  // res = 5;
+    c2.RegisterCleanup(Multiplier, &res, &n7);  // res = 5 * 7;
+    {
+      Cleanable c1;
+      c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+      c1.RegisterCleanup(Multiplier, &res, &n3);  // res = 2 * 3;
+      c1.DelegateCleanupsTo(&c2);                 // res = 2 * 3 * 5 * 7;
+    }
+    // ~Cleanable
+    ASSERT_EQ(1, res);
+  }
+  // ~Cleanable
+  ASSERT_EQ(210, res);
+
+  res = 1;
+  {
+    Cleanable c2;
+    c2.RegisterCleanup(Multiplier, &res, &n5);  // res = 5;
+    c2.RegisterCleanup(Multiplier, &res, &n7);  // res = 5 * 7;
+    {
+      Cleanable c1;
+      c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+      c1.DelegateCleanupsTo(&c2);                 // res = 2 * 5 * 7;
+    }
+    // ~Cleanable
+    ASSERT_EQ(1, res);
+  }
+  // ~Cleanable
+  ASSERT_EQ(70, res);
+
+  res = 1;
+  {
+    Cleanable c2;
+    c2.RegisterCleanup(Multiplier, &res, &n5);  // res = 5;
+    c2.RegisterCleanup(Multiplier, &res, &n7);  // res = 5 * 7;
+    {
+      Cleanable c1;
+      c1.DelegateCleanupsTo(&c2);  // res = 5 * 7;
+    }
+    // ~Cleanable
+    ASSERT_EQ(1, res);
+  }
+  // ~Cleanable
+  ASSERT_EQ(35, res);
+
+  res = 1;
+  {
+    Cleanable c2;
+    c2.RegisterCleanup(Multiplier, &res, &n5);  // res = 5;
+    {
+      Cleanable c1;
+      c1.DelegateCleanupsTo(&c2);  // res = 5;
+    }
+    // ~Cleanable
+    ASSERT_EQ(1, res);
+  }
+  // ~Cleanable
+  ASSERT_EQ(5, res);
+}
+
+static void ReleaseStringHeap(void* s, void*) {
+  delete reinterpret_cast<const std::string*>(s);
+}
+
+class PinnableSlice4Test : public PinnableSlice {
+ public:
+  void TestStringIsRegistered(std::string* s) {
+    ASSERT_TRUE(cleanup_.function == ReleaseStringHeap);
+    ASSERT_EQ(cleanup_.arg1, s);
+    ASSERT_EQ(cleanup_.arg2, nullptr);
+    ASSERT_EQ(cleanup_.next, nullptr);
+  }
+};
+
+// Putting the PinnableSlice tests here due to similarity to Cleanable tests
+TEST_F(CleanableTest, PinnableSlice) {
+  int n2 = 2;
+  int res = 1;
+  const std::string const_str = "123";
+
+  {
+    res = 1;
+    PinnableSlice4Test value;
+    Slice slice(const_str);
+    value.PinSlice(slice, Multiplier, &res, &n2);
+    std::string str;
+    str.assign(value.data(), value.size());
+    ASSERT_EQ(const_str, str);
+  }
+  // ~Cleanable
+  ASSERT_EQ(2, res);
+
+  {
+    res = 1;
+    PinnableSlice4Test value;
+    Slice slice(const_str);
+    {
+      Cleanable c1;
+      c1.RegisterCleanup(Multiplier, &res, &n2);  // res = 2;
+      value.PinSlice(slice, &c1);
+    }
+    // ~Cleanable
+    ASSERT_EQ(1, res);  // cleanups must have be delegated to value
+    std::string str;
+    str.assign(value.data(), value.size());
+    ASSERT_EQ(const_str, str);
+  }
+  // ~Cleanable
+  ASSERT_EQ(2, res);
+
+  {
+    PinnableSlice4Test value;
+    Slice slice(const_str);
+    value.PinSelf(slice);
+    std::string str;
+    str.assign(value.data(), value.size());
+    ASSERT_EQ(const_str, str);
+  }
+
+  {
+    PinnableSlice4Test value;
+    std::string* self_str_ptr = value.GetSelf();
+    self_str_ptr->assign(const_str);
+    value.PinSelf();
+    std::string str;
+    str.assign(value.data(), value.size());
+    ASSERT_EQ(const_str, str);
+  }
+}
+
+static void Decrement(void* intptr, void*) { --*static_cast<int*>(intptr); }
+
+// Allow unit testing moved-from data
+template <class T>
+void MarkInitializedForClangAnalyze(T& t) {
+  // No net effect, but confuse analyzer. (Published advice doesn't work.)
+  char* p = reinterpret_cast<char*>(&t);
+  std::swap(*p, *p);
+}
+
+TEST_F(CleanableTest, SharedWrapCleanables) {
+  int val = 5;
+  Cleanable c1, c2;
+  c1.RegisterCleanup(&Decrement, &val, nullptr);
+  c1.RegisterCleanup(&Decrement, &val, nullptr);
+  ASSERT_TRUE(c1.HasCleanups());
+  ASSERT_FALSE(c2.HasCleanups());
+
+  SharedCleanablePtr scp1;
+  ASSERT_EQ(scp1.get(), nullptr);
+
+  // No-ops
+  scp1.RegisterCopyWith(&c2);
+  scp1.MoveAsCleanupTo(&c2);
+
+  ASSERT_FALSE(c2.HasCleanups());
+  c2.RegisterCleanup(&Decrement, &val, nullptr);
+  c2.RegisterCleanup(&Decrement, &val, nullptr);
+  c2.RegisterCleanup(&Decrement, &val, nullptr);
+
+  scp1.Allocate();
+  ASSERT_NE(scp1.get(), nullptr);
+  ASSERT_FALSE(scp1->HasCleanups());
+
+  // Copy ctor (alias scp2 = scp1)
+  SharedCleanablePtr scp2{scp1};
+  ASSERT_EQ(scp1.get(), scp2.get());
+
+  c1.DelegateCleanupsTo(&*scp1);
+  ASSERT_TRUE(scp1->HasCleanups());
+  ASSERT_TRUE(scp2->HasCleanups());
+  ASSERT_FALSE(c1.HasCleanups());
+
+  SharedCleanablePtr scp3;
+  ASSERT_EQ(scp3.get(), nullptr);
+
+  // Copy operator (alias scp3 = scp2 = scp1)
+  scp3 = scp2;
+
+  // Make scp2 point elsewhere
+  scp2.Allocate();
+  c2.DelegateCleanupsTo(&*scp2);
+
+  ASSERT_EQ(val, 5);
+  // Move operator, invoke old c2 cleanups
+  scp2 = std::move(scp1);
+  ASSERT_EQ(val, 2);
+  MarkInitializedForClangAnalyze(scp1);
+  ASSERT_EQ(scp1.get(), nullptr);
+
+  // Move ctor
+  {
+    SharedCleanablePtr scp4{std::move(scp3)};
+    MarkInitializedForClangAnalyze(scp3);
+    ASSERT_EQ(scp3.get(), nullptr);
+    ASSERT_EQ(scp4.get(), scp2.get());
+
+    scp2.Reset();
+    ASSERT_EQ(val, 2);
+    // invoke old c1 cleanups
+  }
+  ASSERT_EQ(val, 0);
+}
+
+TEST_F(CleanableTest, CleanableWrapShared) {
+  int val = 5;
+  SharedCleanablePtr scp1, scp2;
+  scp1.Allocate();
+  scp1->RegisterCleanup(&Decrement, &val, nullptr);
+  scp1->RegisterCleanup(&Decrement, &val, nullptr);
+
+  scp2.Allocate();
+  scp2->RegisterCleanup(&Decrement, &val, nullptr);
+  scp2->RegisterCleanup(&Decrement, &val, nullptr);
+  scp2->RegisterCleanup(&Decrement, &val, nullptr);
+
+  {
+    Cleanable c1;
+    {
+      Cleanable c2, c3;
+      scp1.RegisterCopyWith(&c1);
+      scp1.MoveAsCleanupTo(&c2);
+      ASSERT_TRUE(c1.HasCleanups());
+      ASSERT_TRUE(c2.HasCleanups());
+      ASSERT_EQ(scp1.get(), nullptr);
+      scp2.MoveAsCleanupTo(&c3);
+      ASSERT_TRUE(c3.HasCleanups());
+      ASSERT_EQ(scp2.get(), nullptr);
+      c2.Reset();
+      ASSERT_FALSE(c2.HasCleanups());
+      ASSERT_EQ(val, 5);
+      // invoke cleanups from scp2
+    }
+    ASSERT_EQ(val, 2);
+    // invoke cleanups from scp1
+  }
+  ASSERT_EQ(val, 0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc b/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc
new file mode 100644
index 000000000..296825d94
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder.cc
@@ -0,0 +1,553 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo/cuckoo_table_builder.h"
+
+#include <assert.h>
+
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_builder.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "util/autovector.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+const std::string CuckooTablePropertyNames::kEmptyKey =
+    "rocksdb.cuckoo.bucket.empty.key";
+const std::string CuckooTablePropertyNames::kNumHashFunc =
+    "rocksdb.cuckoo.hash.num";
+const std::string CuckooTablePropertyNames::kHashTableSize =
+    "rocksdb.cuckoo.hash.size";
+const std::string CuckooTablePropertyNames::kValueLength =
+    "rocksdb.cuckoo.value.length";
+const std::string CuckooTablePropertyNames::kIsLastLevel =
+    "rocksdb.cuckoo.file.islastlevel";
+const std::string CuckooTablePropertyNames::kCuckooBlockSize =
+    "rocksdb.cuckoo.hash.cuckooblocksize";
+const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
+    "rocksdb.cuckoo.hash.identityfirst";
+const std::string CuckooTablePropertyNames::kUseModuleHash =
+    "rocksdb.cuckoo.hash.usemodule";
+const std::string CuckooTablePropertyNames::kUserKeyLength =
+    "rocksdb.cuckoo.hash.userkeylength";
+
+// Obtained by running echo rocksdb.table.cuckoo | sha1sum
+extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
+
+CuckooTableBuilder::CuckooTableBuilder(
+    WritableFileWriter* file, double max_hash_table_ratio,
+    uint32_t max_num_hash_table, uint32_t max_search_depth,
+    const Comparator* user_comparator, uint32_t cuckoo_block_size,
+    bool use_module_hash, bool identity_as_first_hash,
+    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
+    uint32_t column_family_id, const std::string& column_family_name,
+    const std::string& db_id, const std::string& db_session_id,
+    uint64_t file_number)
+    : num_hash_func_(2),
+      file_(file),
+      max_hash_table_ratio_(max_hash_table_ratio),
+      max_num_hash_func_(max_num_hash_table),
+      max_search_depth_(max_search_depth),
+      cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
+      hash_table_size_(use_module_hash ? 0 : 2),
+      is_last_level_file_(false),
+      has_seen_first_key_(false),
+      has_seen_first_value_(false),
+      key_size_(0),
+      value_size_(0),
+      num_entries_(0),
+      num_values_(0),
+      ucomp_(user_comparator),
+      use_module_hash_(use_module_hash),
+      identity_as_first_hash_(identity_as_first_hash),
+      get_slice_hash_(get_slice_hash),
+      closed_(false) {
+  // Data is in a huge block.
+  properties_.num_data_blocks = 1;
+  properties_.index_size = 0;
+  properties_.filter_size = 0;
+  properties_.column_family_id = column_family_id;
+  properties_.column_family_name = column_family_name;
+  properties_.db_id = db_id;
+  properties_.db_session_id = db_session_id;
+  properties_.orig_file_number = file_number;
+  status_.PermitUncheckedError();
+  io_status_.PermitUncheckedError();
+}
+
+void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
+  if (num_entries_ >= kMaxVectorIdx - 1) {
+    status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
+    return;
+  }
+  ParsedInternalKey ikey;
+  Status pik_status =
+      ParseInternalKey(key, &ikey, false /* log_err_key */);  // TODO
+  if (!pik_status.ok()) {
+    status_ = Status::Corruption("Unable to parse key into internal key. ",
+                                 pik_status.getState());
+    return;
+  }
+  if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
+    status_ = Status::NotSupported("Unsupported key type " +
+                                   std::to_string(ikey.type));
+    return;
+  }
+
+  // Determine if we can ignore the sequence number and value type from
+  // internal keys by looking at sequence number from first key. We assume
+  // that if first key has a zero sequence number, then all the remaining
+  // keys will have zero seq. no.
+  if (!has_seen_first_key_) {
+    is_last_level_file_ = ikey.sequence == 0;
+    has_seen_first_key_ = true;
+    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+    key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
+  }
+  if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
+    status_ = Status::NotSupported("all keys have to be the same size");
+    return;
+  }
+
+  if (ikey.type == kTypeValue) {
+    if (!has_seen_first_value_) {
+      has_seen_first_value_ = true;
+      value_size_ = value.size();
+    }
+    if (value_size_ != value.size()) {
+      status_ = Status::NotSupported("all values have to be the same size");
+      return;
+    }
+
+    if (is_last_level_file_) {
+      kvs_.append(ikey.user_key.data(), ikey.user_key.size());
+    } else {
+      kvs_.append(key.data(), key.size());
+    }
+    kvs_.append(value.data(), value.size());
+    ++num_values_;
+  } else {
+    if (is_last_level_file_) {
+      deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
+    } else {
+      deleted_keys_.append(key.data(), key.size());
+    }
+  }
+  ++num_entries_;
+
+  // In order to fill the empty buckets in the hash table, we identify a
+  // key which is not used so far (unused_user_key). We determine this by
+  // maintaining smallest and largest keys inserted so far in bytewise order
+  // and use them to find a key outside this range in Finish() operation.
+  // Note that this strategy is independent of user comparator used here.
+  if (ikey.user_key.compare(smallest_user_key_) < 0) {
+    smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+  } else if (ikey.user_key.compare(largest_user_key_) > 0) {
+    largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+  }
+  if (!use_module_hash_) {
+    if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
+      hash_table_size_ *= 2;
+    }
+  }
+}
+
+bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
+  assert(closed_);
+  return idx >= num_values_;
+}
+
+Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
+  assert(closed_);
+  if (IsDeletedKey(idx)) {
+    return Slice(
+        &deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)],
+        static_cast<size_t>(key_size_));
+  }
+  return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))],
+               static_cast<size_t>(key_size_));
+}
+
+Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
+  assert(closed_);
+  return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
+}
+
+Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
+  assert(closed_);
+  if (IsDeletedKey(idx)) {
+    static std::string empty_value(static_cast<unsigned int>(value_size_), 'a');
+    return Slice(empty_value);
+  }
+  return Slice(
+      &kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)],
+      static_cast<size_t>(value_size_));
+}
+
+Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
+  buckets->resize(
+      static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
+  uint32_t make_space_for_key_call_id = 0;
+  for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
+    uint64_t bucket_id = 0;
+    bool bucket_found = false;
+    autovector<uint64_t> hash_vals;
+    Slice user_key = GetUserKey(vector_idx);
+    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
+         ++hash_cnt) {
+      uint64_t hash_val =
+          CuckooHash(user_key, hash_cnt, use_module_hash_, hash_table_size_,
+                     identity_as_first_hash_, get_slice_hash_);
+      // If there is a collision, check next cuckoo_block_size_ locations for
+      // empty locations. While checking, if we reach end of the hash table,
+      // stop searching and proceed for next hash function.
+      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+           ++block_idx, ++hash_val) {
+        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
+            kMaxVectorIdx) {
+          bucket_id = hash_val;
+          bucket_found = true;
+          break;
+        } else {
+          if (ucomp_->Compare(
+                  user_key, GetUserKey((*buckets)[static_cast<size_t>(hash_val)]
+                                           .vector_idx)) == 0) {
+            return Status::NotSupported("Same key is being inserted again.");
+          }
+          hash_vals.push_back(hash_val);
+        }
+      }
+    }
+    while (!bucket_found &&
+           !MakeSpaceForKey(hash_vals, ++make_space_for_key_call_id, buckets,
+                            &bucket_id)) {
+      // Rehash by increashing number of hash tables.
+      if (num_hash_func_ >= max_num_hash_func_) {
+        return Status::NotSupported("Too many collisions. Unable to hash.");
+      }
+      // We don't really need to rehash the entire table because old hashes are
+      // still valid and we only increased the number of hash functions.
+      uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_,
+                                     hash_table_size_, identity_as_first_hash_,
+                                     get_slice_hash_);
+      ++num_hash_func_;
+      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+           ++block_idx, ++hash_val) {
+        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
+            kMaxVectorIdx) {
+          bucket_found = true;
+          bucket_id = hash_val;
+          break;
+        } else {
+          hash_vals.push_back(hash_val);
+        }
+      }
+    }
+    (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
+  }
+  return Status::OK();
+}
+
+Status CuckooTableBuilder::Finish() {
+  assert(!closed_);
+  closed_ = true;
+  std::vector<CuckooBucket> buckets;
+  std::string unused_bucket;
+  if (num_entries_ > 0) {
+    // Calculate the real hash size if module hash is enabled.
+    if (use_module_hash_) {
+      hash_table_size_ =
+          static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
+    }
+    status_ = MakeHashTable(&buckets);
+    if (!status_.ok()) {
+      return status_;
+    }
+    // Determine unused_user_key to fill empty buckets.
+    std::string unused_user_key = smallest_user_key_;
+    int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+    while (curr_pos >= 0) {
+      --unused_user_key[curr_pos];
+      if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
+        break;
+      }
+      --curr_pos;
+    }
+    if (curr_pos < 0) {
+      // Try using the largest key to identify an unused key.
+      unused_user_key = largest_user_key_;
+      curr_pos = static_cast<int>(unused_user_key.size()) - 1;
+      while (curr_pos >= 0) {
+        ++unused_user_key[curr_pos];
+        if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
+          break;
+        }
+        --curr_pos;
+      }
+    }
+    if (curr_pos < 0) {
+      return Status::Corruption("Unable to find unused key");
+    }
+    if (is_last_level_file_) {
+      unused_bucket = unused_user_key;
+    } else {
+      ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
+      AppendInternalKey(&unused_bucket, ikey);
+    }
+  }
+  properties_.num_entries = num_entries_;
+  properties_.num_deletions = num_entries_ - num_values_;
+  properties_.fixed_key_len = key_size_;
+  properties_.user_collected_properties[CuckooTablePropertyNames::kValueLength]
+      .assign(reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
+
+  uint64_t bucket_size = key_size_ + value_size_;
+  unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
+  // Write the table.
+  uint32_t num_added = 0;
+  for (auto& bucket : buckets) {
+    if (bucket.vector_idx == kMaxVectorIdx) {
+      io_status_ = file_->Append(Slice(unused_bucket));
+    } else {
+      ++num_added;
+      io_status_ = file_->Append(GetKey(bucket.vector_idx));
+      if (io_status_.ok()) {
+        if (value_size_ > 0) {
+          io_status_ = file_->Append(GetValue(bucket.vector_idx));
+        }
+      }
+    }
+    if (!io_status_.ok()) {
+      status_ = io_status_;
+      return status_;
+    }
+  }
+  assert(num_added == NumEntries());
+  properties_.raw_key_size = num_added * properties_.fixed_key_len;
+  properties_.raw_value_size = num_added * value_size_;
+
+  uint64_t offset = buckets.size() * bucket_size;
+  properties_.data_size = offset;
+  unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
+  properties_.user_collected_properties[CuckooTablePropertyNames::kEmptyKey] =
+      unused_bucket;
+  properties_.user_collected_properties[CuckooTablePropertyNames::kNumHashFunc]
+      .assign(reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));
+
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kHashTableSize]
+      .assign(reinterpret_cast<const char*>(&hash_table_size_),
+              sizeof(hash_table_size_));
+  properties_.user_collected_properties[CuckooTablePropertyNames::kIsLastLevel]
+      .assign(reinterpret_cast<const char*>(&is_last_level_file_),
+              sizeof(is_last_level_file_));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kCuckooBlockSize]
+      .assign(reinterpret_cast<const char*>(&cuckoo_block_size_),
+              sizeof(cuckoo_block_size_));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kIdentityAsFirstHash]
+      .assign(reinterpret_cast<const char*>(&identity_as_first_hash_),
+              sizeof(identity_as_first_hash_));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kUseModuleHash]
+      .assign(reinterpret_cast<const char*>(&use_module_hash_),
+              sizeof(use_module_hash_));
+  uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kUserKeyLength]
+      .assign(reinterpret_cast<const char*>(&user_key_len),
+              sizeof(user_key_len));
+
+  // Write meta blocks.
+  MetaIndexBuilder meta_index_builder;
+  PropertyBlockBuilder property_block_builder;
+
+  property_block_builder.AddTableProperty(properties_);
+  property_block_builder.Add(properties_.user_collected_properties);
+  Slice property_block = property_block_builder.Finish();
+  BlockHandle property_block_handle;
+  property_block_handle.set_offset(offset);
+  property_block_handle.set_size(property_block.size());
+  io_status_ = file_->Append(property_block);
+  offset += property_block.size();
+  if (!io_status_.ok()) {
+    status_ = io_status_;
+    return status_;
+  }
+
+  meta_index_builder.Add(kPropertiesBlockName, property_block_handle);
+  Slice meta_index_block = meta_index_builder.Finish();
+
+  BlockHandle meta_index_block_handle;
+  meta_index_block_handle.set_offset(offset);
+  meta_index_block_handle.set_size(meta_index_block.size());
+  io_status_ = file_->Append(meta_index_block);
+  if (!io_status_.ok()) {
+    status_ = io_status_;
+    return status_;
+  }
+
+  FooterBuilder footer;
+  footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset,
+               kNoChecksum, meta_index_block_handle);
+  io_status_ = file_->Append(footer.GetSlice());
+  status_ = io_status_;
+  return status_;
+}
+
+void CuckooTableBuilder::Abandon() {
+  assert(!closed_);
+  closed_ = true;
+}
+
+uint64_t CuckooTableBuilder::NumEntries() const { return num_entries_; }
+
+uint64_t CuckooTableBuilder::FileSize() const {
+  if (closed_) {
+    return file_->GetFileSize();
+  } else if (num_entries_ == 0) {
+    return 0;
+  }
+
+  if (use_module_hash_) {
+    return static_cast<uint64_t>((key_size_ + value_size_) * num_entries_ /
+                                 max_hash_table_ratio_);
+  } else {
+    // Account for buckets being a power of two.
+    // As elements are added, file size remains constant for a while and
+    // doubles its size. Since compaction algorithm stops adding elements
+    // only after it exceeds the file limit, we account for the extra element
+    // being added here.
+    uint64_t expected_hash_table_size = hash_table_size_;
+    if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
+      expected_hash_table_size *= 2;
+    }
+    return (key_size_ + value_size_) * expected_hash_table_size - 1;
+  }
+}
+
+// This method is invoked when there is no place to insert the target key.
+// It searches for a set of elements that can be moved to accommodate target
+// key. The search is a BFS graph traversal with first level (hash_vals)
+// being all the buckets target key could go to.
+// Then, from each node (curr_node), we find all the buckets that curr_node
+// could go to. They form the children of curr_node in the tree.
+// We continue the traversal until we find an empty bucket, in which case, we
+// move all elements along the path from first level to this empty bucket, to
+// make space for target key which is inserted at first level (*bucket_id).
+// If tree depth exceedes max depth, we return false indicating failure.
+bool CuckooTableBuilder::MakeSpaceForKey(
+    const autovector<uint64_t>& hash_vals,
+    const uint32_t make_space_for_key_call_id,
+    std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
+  struct CuckooNode {
+    uint64_t bucket_id;
+    uint32_t depth;
+    uint32_t parent_pos;
+    CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
+        : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
+  };
+  // This is BFS search tree that is stored simply as a vector.
+  // Each node stores the index of parent node in the vector.
+  std::vector<CuckooNode> tree;
+  // We want to identify already visited buckets in the current method call so
+  // that we don't add same buckets again for exploration in the tree.
+  // We do this by maintaining a count of current method call in
+  // make_space_for_key_call_id, which acts as a unique id for this invocation
+  // of the method. We store this number into the nodes that we explore in
+  // current method call.
+  // It is unlikely for the increment operation to overflow because the maximum
+  // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
+  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+    uint64_t bid = hash_vals[hash_cnt];
+    (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id =
+        make_space_for_key_call_id;
+    tree.push_back(CuckooNode(bid, 0, 0));
+  }
+  bool null_found = false;
+  uint32_t curr_pos = 0;
+  while (!null_found && curr_pos < tree.size()) {
+    CuckooNode& curr_node = tree[curr_pos];
+    uint32_t curr_depth = curr_node.depth;
+    if (curr_depth >= max_search_depth_) {
+      break;
+    }
+    CuckooBucket& curr_bucket =
+        (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
+    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found;
+         ++hash_cnt) {
+      uint64_t child_bucket_id = CuckooHash(
+          GetUserKey(curr_bucket.vector_idx), hash_cnt, use_module_hash_,
+          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
+      // Iterate inside Cuckoo Block.
+      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+           ++block_idx, ++child_bucket_id) {
+        if ((*buckets)[static_cast<size_t>(child_bucket_id)]
+                .make_space_for_key_call_id == make_space_for_key_call_id) {
+          continue;
+        }
+        (*buckets)[static_cast<size_t>(child_bucket_id)]
+            .make_space_for_key_call_id = make_space_for_key_call_id;
+        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos));
+        if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx ==
+            kMaxVectorIdx) {
+          null_found = true;
+          break;
+        }
+      }
+    }
+    ++curr_pos;
+  }
+
+  if (null_found) {
+    // There is an empty node in tree.back(). Now, traverse the path from this
+    // empty node to top of the tree and at every node in the path, replace
+    // child with the parent. Stop when first level is reached in the tree
+    // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return
+    // this location in first level for target key to be inserted.
+    uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
+    while (bucket_to_replace_pos >= num_hash_func_) {
+      CuckooNode& curr_node = tree[bucket_to_replace_pos];
+      (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
+          (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
+      bucket_to_replace_pos = curr_node.parent_pos;
+    }
+    *bucket_id = tree[bucket_to_replace_pos].bucket_id;
+  }
+  return null_found;
+}
+
+std::string CuckooTableBuilder::GetFileChecksum() const {
+  if (file_ != nullptr) {
+    return file_->GetFileChecksum();
+  } else {
+    return kUnknownFileChecksum;
+  }
+}
+
+const char* CuckooTableBuilder::GetFileChecksumFuncName() const {
+  if (file_ != nullptr) {
+    return file_->GetFileChecksumFuncName();
+  } else {
+    return kUnknownFileChecksumFuncName;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder.h b/src/rocksdb/table/cuckoo/cuckoo_table_builder.h
new file mode 100644
index 000000000..a125e1f4c
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder.h
@@ -0,0 +1,138 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/table_builder.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableBuilder : public TableBuilder {
+ public:
+  CuckooTableBuilder(
+      WritableFileWriter* file, double max_hash_table_ratio,
+      uint32_t max_num_hash_func, uint32_t max_search_depth,
+      const Comparator* user_comparator, uint32_t cuckoo_block_size,
+      bool use_module_hash, bool identity_as_first_hash,
+      uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
+      uint32_t column_family_id, const std::string& column_family_name,
+      const std::string& db_id = "", const std::string& db_session_id = "",
+      uint64_t file_number = 0);
+  // No copying allowed
+  CuckooTableBuilder(const CuckooTableBuilder&) = delete;
+  void operator=(const CuckooTableBuilder&) = delete;
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~CuckooTableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override { return status_; }
+
+  // Return non-ok iff some error happens during IO.
+  IOStatus io_status() const override { return io_status_; }
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+  TableProperties GetTableProperties() const override { return properties_; }
+
+  // Get file checksum
+  std::string GetFileChecksum() const override;
+
+  // Get file checksum function name
+  const char* GetFileChecksumFuncName() const override;
+
+ private:
+  struct CuckooBucket {
+    CuckooBucket() : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {}
+    uint32_t vector_idx;
+    // This number will not exceed kvs_.size() + max_num_hash_func_.
+    // We assume number of items is <= 2^32.
+    uint32_t make_space_for_key_call_id;
+  };
+  static const uint32_t kMaxVectorIdx = std::numeric_limits<int32_t>::max();
+
+  bool MakeSpaceForKey(const autovector<uint64_t>& hash_vals,
+                       const uint32_t call_id,
+                       std::vector<CuckooBucket>* buckets, uint64_t* bucket_id);
+  Status MakeHashTable(std::vector<CuckooBucket>* buckets);
+
+  inline bool IsDeletedKey(uint64_t idx) const;
+  inline Slice GetKey(uint64_t idx) const;
+  inline Slice GetUserKey(uint64_t idx) const;
+  inline Slice GetValue(uint64_t idx) const;
+
+  uint32_t num_hash_func_;
+  WritableFileWriter* file_;
+  const double max_hash_table_ratio_;
+  const uint32_t max_num_hash_func_;
+  const uint32_t max_search_depth_;
+  const uint32_t cuckoo_block_size_;
+  uint64_t hash_table_size_;
+  bool is_last_level_file_;
+  bool has_seen_first_key_;
+  bool has_seen_first_value_;
+  uint64_t key_size_;
+  uint64_t value_size_;
+  // A list of fixed-size key-value pairs concatenating into a string.
+  // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific
+  // key / value given an index
+  std::string kvs_;
+  std::string deleted_keys_;
+  // Number of key-value pairs stored in kvs_ + number of deleted keys
+  uint64_t num_entries_;
+  // Number of keys that contain value (non-deletion op)
+  uint64_t num_values_;
+  Status status_;
+  IOStatus io_status_;
+  TableProperties properties_;
+  const Comparator* ucomp_;
+  bool use_module_hash_;
+  bool identity_as_first_hash_;
+  uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
+                              uint64_t max_num_buckets);
+  std::string largest_user_key_ = "";
+  std::string smallest_user_key_ = "";
+
+  bool closed_;  // Either Finish() or Abandon() has been called.
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc b/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc
new file mode 100644
index 000000000..be1c62117
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc
@@ -0,0 +1,640 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "table/cuckoo/cuckoo_table_builder.h"
+
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+extern const uint64_t kCuckooTableMagicNumber;
+
+namespace {
+std::unordered_map<std::string, std::vector<uint64_t>> hash_map;
+
+uint64_t GetSliceHash(const Slice& s, uint32_t index,
+                      uint64_t /*max_num_buckets*/) {
+  return hash_map[s.ToString()][index];
+}
+}  // namespace
+
+class CuckooBuilderTest : public testing::Test {
+ public:
+  CuckooBuilderTest() {
+    env_ = Env::Default();
+    Options options;
+    options.allow_mmap_reads = true;
+    file_options_ = FileOptions(options);
+  }
+
+  void CheckFileContents(const std::vector<std::string>& keys,
+                         const std::vector<std::string>& values,
+                         const std::vector<uint64_t>& expected_locations,
+                         std::string expected_unused_bucket,
+                         uint64_t expected_table_size,
+                         uint32_t expected_num_hash_func,
+                         bool expected_is_last_level,
+                         uint32_t expected_cuckoo_block_size = 1) {
+    uint64_t num_deletions = 0;
+    for (const auto& key : keys) {
+      ParsedInternalKey parsed;
+      Status pik_status =
+          ParseInternalKey(key, &parsed, true /* log_err_key */);
+      if (pik_status.ok() && parsed.type == kTypeDeletion) {
+        num_deletions++;
+      }
+    }
+    // Read file
+    uint64_t read_file_size;
+    ASSERT_OK(env_->GetFileSize(fname, &read_file_size));
+    std::unique_ptr<RandomAccessFileReader> file_reader;
+    ASSERT_OK(RandomAccessFileReader::Create(
+        env_->GetFileSystem(), fname, file_options_, &file_reader, nullptr));
+
+    Options options;
+    options.allow_mmap_reads = true;
+    ImmutableOptions ioptions(options);
+
+    // Assert Table Properties.
+    std::unique_ptr<TableProperties> props;
+    ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size,
+                                  kCuckooTableMagicNumber, ioptions, &props));
+    // Check unused bucket.
+    std::string unused_key =
+        props->user_collected_properties[CuckooTablePropertyNames::kEmptyKey];
+    ASSERT_EQ(expected_unused_bucket.substr(0, props->fixed_key_len),
+              unused_key);
+
+    uint64_t value_len_found = *reinterpret_cast<const uint64_t*>(
+        props->user_collected_properties[CuckooTablePropertyNames::kValueLength]
+            .data());
+    ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found);
+    ASSERT_EQ(props->raw_value_size, values.size() * value_len_found);
+    const uint64_t table_size = *reinterpret_cast<const uint64_t*>(
+        props
+            ->user_collected_properties
+                [CuckooTablePropertyNames::kHashTableSize]
+            .data());
+    ASSERT_EQ(expected_table_size, table_size);
+    const uint32_t num_hash_func_found = *reinterpret_cast<const uint32_t*>(
+        props->user_collected_properties[CuckooTablePropertyNames::kNumHashFunc]
+            .data());
+    ASSERT_EQ(expected_num_hash_func, num_hash_func_found);
+    const uint32_t cuckoo_block_size = *reinterpret_cast<const uint32_t*>(
+        props
+            ->user_collected_properties
+                [CuckooTablePropertyNames::kCuckooBlockSize]
+            .data());
+    ASSERT_EQ(expected_cuckoo_block_size, cuckoo_block_size);
+    const bool is_last_level_found = *reinterpret_cast<const bool*>(
+        props->user_collected_properties[CuckooTablePropertyNames::kIsLastLevel]
+            .data());
+    ASSERT_EQ(expected_is_last_level, is_last_level_found);
+
+    ASSERT_EQ(props->num_entries, keys.size());
+    ASSERT_EQ(props->num_deletions, num_deletions);
+    ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size());
+    ASSERT_EQ(props->data_size,
+              expected_unused_bucket.size() *
+                  (expected_table_size + expected_cuckoo_block_size - 1));
+    ASSERT_EQ(props->raw_key_size, keys.size() * props->fixed_key_len);
+    ASSERT_EQ(props->column_family_id, 0);
+    ASSERT_EQ(props->column_family_name, kDefaultColumnFamilyName);
+
+    // Check contents of the bucket.
+    std::vector<bool> keys_found(keys.size(), false);
+    size_t bucket_size = expected_unused_bucket.size();
+    for (uint32_t i = 0; i + 1 < table_size + cuckoo_block_size; ++i) {
+      Slice read_slice;
+      ASSERT_OK(file_reader->Read(IOOptions(), i * bucket_size, bucket_size,
+                                  &read_slice, nullptr, nullptr,
+                                  Env::IO_TOTAL /* rate_limiter_priority */));
+      size_t key_idx =
+          std::find(expected_locations.begin(), expected_locations.end(), i) -
+          expected_locations.begin();
+      if (key_idx == keys.size()) {
+        // i is not one of the expected locations. Empty bucket.
+        if (read_slice.data() == nullptr) {
+          ASSERT_EQ(0, expected_unused_bucket.size());
+        } else {
+          ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0);
+        }
+      } else {
+        keys_found[key_idx] = true;
+        ASSERT_EQ(read_slice.compare(keys[key_idx] + values[key_idx]), 0);
+      }
+    }
+    for (auto key_found : keys_found) {
+      // Check that all keys wereReader found.
+      ASSERT_TRUE(key_found);
+    }
+  }
+
+  std::string GetInternalKey(Slice user_key, bool zero_seqno,
+                             ValueType type = kTypeValue) {
+    IterKey ikey;
+    ikey.SetInternalKey(user_key, zero_seqno ? 0 : 1000, type);
+    return ikey.GetInternalKey().ToString();
+  }
+
+  uint64_t NextPowOf2(uint64_t num) {
+    uint64_t n = 2;
+    while (n <= num) {
+      n *= 2;
+    }
+    return n;
+  }
+
+  uint64_t GetExpectedTableSize(uint64_t num) {
+    return NextPowOf2(static_cast<uint64_t>(num / kHashTableRatio));
+  }
+
+  Env* env_;
+  FileOptions file_options_;
+  std::string fname;
+  const double kHashTableRatio = 0.9;
+};
+
+TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) {
+  std::unique_ptr<WritableFile> writable_file;
+  fname = test::PerThreadDBPath("EmptyFile");
+  std::unique_ptr<WritableFileWriter> file_writer;
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100,
+                             BytewiseComparator(), 1, false, false,
+                             GetSliceHash, 0 /* column_family_id */,
+                             kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  ASSERT_EQ(0UL, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(file_writer->Close());
+  CheckFileContents({}, {}, {}, "", 2, 2, false);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) {
+  for (auto type : {kTypeValue, kTypeDeletion}) {
+    uint32_t num_hash_fun = 4;
+    std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+    std::vector<std::string> values;
+    if (type == kTypeValue) {
+      values = {"v01", "v02", "v03", "v04"};
+    } else {
+      values = {"", "", "", ""};
+    }
+    // Need to have a temporary variable here as VS compiler does not currently
+    // support operator= with initializer_list as a parameter
+    std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+        {user_keys[0], {0, 1, 2, 3}},
+        {user_keys[1], {1, 2, 3, 4}},
+        {user_keys[2], {2, 3, 4, 5}},
+        {user_keys[3], {3, 4, 5, 6}}};
+    hash_map = std::move(hm);
+
+    std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+    std::vector<std::string> keys;
+    for (auto& user_key : user_keys) {
+      keys.push_back(GetInternalKey(user_key, false, type));
+    }
+    uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+    fname = test::PerThreadDBPath("NoCollisionFullKey");
+    std::unique_ptr<WritableFileWriter> file_writer;
+    ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                         file_options_, &file_writer, nullptr));
+    CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                               100, BytewiseComparator(), 1, false, false,
+                               GetSliceHash, 0 /* column_family_id */,
+                               kDefaultColumnFamilyName);
+    ASSERT_OK(builder.status());
+    for (uint32_t i = 0; i < user_keys.size(); i++) {
+      builder.Add(Slice(keys[i]), Slice(values[i]));
+      ASSERT_EQ(builder.NumEntries(), i + 1);
+      ASSERT_OK(builder.status());
+    }
+    size_t bucket_size = keys[0].size() + values[0].size();
+    ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+    ASSERT_OK(builder.Finish());
+    ASSERT_OK(file_writer->Close());
+    ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+    std::string expected_unused_bucket = GetInternalKey("key00", true);
+    expected_unused_bucket += std::string(values[0].size(), 'a');
+    CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+                      expected_table_size, 2, false);
+  }
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
+  uint32_t num_hash_fun = 4;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1, 2, 3}},
+      {user_keys[1], {0, 1, 2, 3}},
+      {user_keys[2], {0, 1, 2, 3}},
+      {user_keys[3], {0, 1, 2, 3}},
+  };
+  hash_map = std::move(hm);
+
+  std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+  std::vector<std::string> keys;
+  for (auto& user_key : user_keys) {
+    keys.push_back(GetInternalKey(user_key, false));
+  }
+  uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+  fname = test::PerThreadDBPath("WithCollisionFullKey");
+  std::unique_ptr<WritableFileWriter> file_writer;
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash, 0 /* column_family_id */,
+                             kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(keys[i]), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(file_writer->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = GetInternalKey("key00", true);
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+                    expected_table_size, 4, false);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
+  uint32_t num_hash_fun = 4;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1, 2, 3}},
+      {user_keys[1], {0, 1, 2, 3}},
+      {user_keys[2], {0, 1, 2, 3}},
+      {user_keys[3], {0, 1, 2, 3}},
+  };
+  hash_map = std::move(hm);
+
+  std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+  std::vector<std::string> keys;
+  for (auto& user_key : user_keys) {
+    keys.push_back(GetInternalKey(user_key, false));
+  }
+  uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  uint32_t cuckoo_block_size = 2;
+  fname = test::PerThreadDBPath("WithCollisionFullKey2");
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+  CuckooTableBuilder builder(
+      file_writer.get(), kHashTableRatio, num_hash_fun, 100,
+      BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash,
+      0 /* column_family_id */, kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(keys[i]), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(file_writer->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = GetInternalKey("key00", true);
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+                    expected_table_size, 3, false, cuckoo_block_size);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
+  // Have two hash functions. Insert elements with overlapping hashes.
+  // Finally insert an element with hash value somewhere in the middle
+  // so that it displaces all the elements after that.
+  uint32_t num_hash_fun = 2;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+                                        "key05"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}},
+      {user_keys[3], {3, 4}}, {user_keys[4], {0, 2}},
+  };
+  hash_map = std::move(hm);
+
+  std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
+  std::vector<std::string> keys;
+  for (auto& user_key : user_keys) {
+    keys.push_back(GetInternalKey(user_key, false));
+  }
+  uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  fname = test::PerThreadDBPath("WithCollisionPathFullKey");
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash, 0 /* column_family_id */,
+                             kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(keys[i]), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(file_writer->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = GetInternalKey("key00", true);
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+                    expected_table_size, 2, false);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
+  uint32_t num_hash_fun = 2;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+                                        "key05"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {3, 4}},
+      {user_keys[3], {4, 5}}, {user_keys[4], {0, 3}},
+  };
+  hash_map = std::move(hm);
+
+  std::vector<uint64_t> expected_locations = {2, 1, 3, 4, 0};
+  std::vector<std::string> keys;
+  for (auto& user_key : user_keys) {
+    keys.push_back(GetInternalKey(user_key, false));
+  }
+  uint64_t expected_table_size = GetExpectedTableSize(keys.size());
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock");
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 2, false, false,
+                             GetSliceHash, 0 /* column_family_id */,
+                             kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(keys[i]), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(file_writer->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = GetInternalKey("key00", true);
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+                    expected_table_size, 2, false, 2);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
+  uint32_t num_hash_fun = 4;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1, 2, 3}},
+      {user_keys[1], {1, 2, 3, 4}},
+      {user_keys[2], {2, 3, 4, 5}},
+      {user_keys[3], {3, 4, 5, 6}}};
+  hash_map = std::move(hm);
+
+  std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+  uint64_t expected_table_size = GetExpectedTableSize(user_keys.size());
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  fname = test::PerThreadDBPath("NoCollisionUserKey");
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash, 0 /* column_family_id */,
+                             kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = user_keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(file_writer->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = "key00";
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(user_keys, values, expected_locations,
+                    expected_unused_bucket, expected_table_size, 2, true);
+}
+
+TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
+  uint32_t num_hash_fun = 4;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1, 2, 3}},
+      {user_keys[1], {0, 1, 2, 3}},
+      {user_keys[2], {0, 1, 2, 3}},
+      {user_keys[3], {0, 1, 2, 3}},
+  };
+  hash_map = std::move(hm);
+
+  std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
+  uint64_t expected_table_size = GetExpectedTableSize(user_keys.size());
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  fname = test::PerThreadDBPath("WithCollisionUserKey");
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash, 0 /* column_family_id */,
+                             kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = user_keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(file_writer->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = "key00";
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(user_keys, values, expected_locations,
+                    expected_unused_bucket, expected_table_size, 4, true);
+}
+
+TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) {
+  uint32_t num_hash_fun = 2;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+                                        "key05"};
+  std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}},
+      {user_keys[3], {3, 4}}, {user_keys[4], {0, 2}},
+  };
+  hash_map = std::move(hm);
+
+  std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
+  uint64_t expected_table_size = GetExpectedTableSize(user_keys.size());
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  fname = test::PerThreadDBPath("WithCollisionPathUserKey");
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             2, BytewiseComparator(), 1, false, false,
+                             GetSliceHash, 0 /* column_family_id */,
+                             kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  size_t bucket_size = user_keys[0].size() + values[0].size();
+  ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
+  ASSERT_OK(builder.Finish());
+  ASSERT_OK(file_writer->Close());
+  ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
+
+  std::string expected_unused_bucket = "key00";
+  expected_unused_bucket += std::string(values[0].size(), 'a');
+  CheckFileContents(user_keys, values, expected_locations,
+                    expected_unused_bucket, expected_table_size, 2, true);
+}
+
+TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
+  // Have two hash functions. Insert elements with overlapping hashes.
+  // Finally try inserting an element with hash value somewhere in the middle
+  // and it should fail because the no. of elements to displace is too high.
+  uint32_t num_hash_fun = 2;
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+                                        "key05"};
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}},
+      {user_keys[3], {3, 4}}, {user_keys[4], {0, 1}},
+  };
+  hash_map = std::move(hm);
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  fname = test::PerThreadDBPath("WithCollisionPathUserKey");
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             2, BytewiseComparator(), 1, false, false,
+                             GetSliceHash, 0 /* column_family_id */,
+                             kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  for (uint32_t i = 0; i < user_keys.size(); i++) {
+    builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value"));
+    ASSERT_EQ(builder.NumEntries(), i + 1);
+    ASSERT_OK(builder.status());
+  }
+  ASSERT_TRUE(builder.Finish().IsNotSupported());
+  ASSERT_OK(file_writer->Close());
+}
+
+TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) {
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {"repeatedkey", {0, 1, 2, 3}}};
+  hash_map = std::move(hm);
+  uint32_t num_hash_fun = 4;
+  std::string user_key = "repeatedkey";
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  fname = test::PerThreadDBPath("FailWhenSameKeyInserted");
+  ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname,
+                                       file_options_, &file_writer, nullptr));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash, 0 /* column_family_id */,
+                             kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+
+  builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1"));
+  ASSERT_EQ(builder.NumEntries(), 1u);
+  ASSERT_OK(builder.status());
+  builder.Add(Slice(GetInternalKey(user_key, true)), Slice("value2"));
+  ASSERT_EQ(builder.NumEntries(), 2u);
+  ASSERT_OK(builder.status());
+
+  ASSERT_TRUE(builder.Finish().IsNotSupported());
+  ASSERT_OK(file_writer->Close());
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc b/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc
new file mode 100644
index 000000000..1253c92dd
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_factory.cc
@@ -0,0 +1,104 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo/cuckoo_table_factory.h"
+
+#include "db/dbformat.h"
+#include "options/configurable_helper.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/cuckoo/cuckoo_table_builder.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status CuckooTableFactory::NewTableReader(
+    const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table,
+    bool /*prefetch_index_and_filter_in_cache*/) const {
+  std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(
+      table_reader_options.ioptions, std::move(file), file_size,
+      table_reader_options.internal_comparator.user_comparator(), nullptr));
+  Status s = new_reader->status();
+  if (s.ok()) {
+    *table = std::move(new_reader);
+  }
+  return s;
+}
+
+TableBuilder* CuckooTableFactory::NewTableBuilder(
+    const TableBuilderOptions& table_builder_options,
+    WritableFileWriter* file) const {
+  // TODO: change builder to take the option struct
+  return new CuckooTableBuilder(
+      file, table_options_.hash_table_ratio, 64,
+      table_options_.max_search_depth,
+      table_builder_options.internal_comparator.user_comparator(),
+      table_options_.cuckoo_block_size, table_options_.use_module_hash,
+      table_options_.identity_as_first_hash, nullptr /* get_slice_hash */,
+      table_builder_options.column_family_id,
+      table_builder_options.column_family_name, table_builder_options.db_id,
+      table_builder_options.db_session_id, table_builder_options.cur_file_num);
+}
+
+std::string CuckooTableFactory::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(2000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "  hash_table_ratio: %lf\n",
+           table_options_.hash_table_ratio);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  max_search_depth: %u\n",
+           table_options_.max_search_depth);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  cuckoo_block_size: %u\n",
+           table_options_.cuckoo_block_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  identity_as_first_hash: %d\n",
+           table_options_.identity_as_first_hash);
+  ret.append(buffer);
+  return ret;
+}
+
+static std::unordered_map<std::string, OptionTypeInfo> cuckoo_table_type_info =
+    {
+#ifndef ROCKSDB_LITE
+        {"hash_table_ratio",
+         {offsetof(struct CuckooTableOptions, hash_table_ratio),
+          OptionType::kDouble, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"max_search_depth",
+         {offsetof(struct CuckooTableOptions, max_search_depth),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"cuckoo_block_size",
+         {offsetof(struct CuckooTableOptions, cuckoo_block_size),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"identity_as_first_hash",
+         {offsetof(struct CuckooTableOptions, identity_as_first_hash),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"use_module_hash",
+         {offsetof(struct CuckooTableOptions, use_module_hash),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+
+CuckooTableFactory::CuckooTableFactory(const CuckooTableOptions& table_options)
+    : table_options_(table_options) {
+  RegisterOptions(&table_options_, &cuckoo_table_type_info);
+}
+
+TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) {
+  return new CuckooTableFactory(table_options);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_factory.h b/src/rocksdb/table/cuckoo/cuckoo_table_factory.h
new file mode 100644
index 000000000..9937c28dd
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_factory.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "util/murmurhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint32_t kCuckooMurmurSeedMultiplier = 816922183;
+static inline uint64_t CuckooHash(
+    const Slice& user_key, uint32_t hash_cnt, bool use_module_hash,
+    uint64_t table_size_, bool identity_as_first_hash,
+    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) {
+#if !defined NDEBUG || defined OS_WIN
+  // This part is used only in unit tests but we have to keep it for Windows
+  // build as we run test in both debug and release modes under Windows.
+  if (get_slice_hash != nullptr) {
+    return get_slice_hash(user_key, hash_cnt, table_size_);
+  }
+#else
+  (void)get_slice_hash;
+#endif
+
+  uint64_t value = 0;
+  if (hash_cnt == 0 && identity_as_first_hash) {
+    value = (*reinterpret_cast<const int64_t*>(user_key.data()));
+  } else {
+    value = MurmurHash(user_key.data(), static_cast<int>(user_key.size()),
+                       kCuckooMurmurSeedMultiplier * hash_cnt);
+  }
+  if (use_module_hash) {
+    return value % table_size_;
+  } else {
+    return value & (table_size_ - 1);
+  }
+}
+
+// Cuckoo Table is designed for applications that require fast point lookups
+// but not fast range scans.
+//
+// Some assumptions:
+// - Key length and Value length are fixed.
+// - Does not support Snapshot.
+// - Does not support Merge operations.
+// - Does not support prefix bloom filters.
+class CuckooTableFactory : public TableFactory {
+ public:
+  explicit CuckooTableFactory(
+      const CuckooTableOptions& table_option = CuckooTableOptions());
+  ~CuckooTableFactory() {}
+
+  // Method to allow CheckedCast to work for this class
+  static const char* kClassName() { return kCuckooTableName(); }
+  const char* Name() const override { return kCuckooTableName(); }
+
+  using TableFactory::NewTableReader;
+  Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table,
+      bool prefetch_index_and_filter_in_cache = true) const override;
+
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const override;
+
+  std::string GetPrintableOptions() const override;
+
+ private:
+  CuckooTableOptions table_options_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc b/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc
new file mode 100644
index 000000000..1d70909a6
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader.cc
@@ -0,0 +1,411 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#include "table/cuckoo/cuckoo_table_reader.h"
+
+#include <algorithm>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/table.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
+const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
+}  // namespace
+
+extern const uint64_t kCuckooTableMagicNumber;
+
+CuckooTableReader::CuckooTableReader(
+    const ImmutableOptions& ioptions,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    const Comparator* comparator,
+    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
+    : file_(std::move(file)),
+      is_last_level_(false),
+      identity_as_first_hash_(false),
+      use_module_hash_(false),
+      num_hash_func_(0),
+      unused_key_(""),
+      key_length_(0),
+      user_key_length_(0),
+      value_length_(0),
+      bucket_length_(0),
+      cuckoo_block_size_(0),
+      cuckoo_block_bytes_minus_one_(0),
+      table_size_(0),
+      ucomp_(comparator),
+      get_slice_hash_(get_slice_hash) {
+  if (!ioptions.allow_mmap_reads) {
+    status_ = Status::InvalidArgument("File is not mmaped");
+    return;
+  }
+  {
+    std::unique_ptr<TableProperties> props;
+    status_ = ReadTableProperties(file_.get(), file_size,
+                                  kCuckooTableMagicNumber, ioptions, &props);
+    if (!status_.ok()) {
+      return;
+    }
+    table_props_ = std::move(props);
+  }
+  auto& user_props = table_props_->user_collected_properties;
+  auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
+  if (hash_funs == user_props.end()) {
+    status_ = Status::Corruption("Number of hash functions not found");
+    return;
+  }
+  num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
+  auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
+  if (unused_key == user_props.end()) {
+    status_ = Status::Corruption("Empty bucket value not found");
+    return;
+  }
+  unused_key_ = unused_key->second;
+
+  key_length_ = static_cast<uint32_t>(table_props_->fixed_key_len);
+  auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
+  if (user_key_len == user_props.end()) {
+    status_ = Status::Corruption("User key length not found");
+    return;
+  }
+  user_key_length_ =
+      *reinterpret_cast<const uint32_t*>(user_key_len->second.data());
+
+  auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
+  if (value_length == user_props.end()) {
+    status_ = Status::Corruption("Value length not found");
+    return;
+  }
+  value_length_ =
+      *reinterpret_cast<const uint32_t*>(value_length->second.data());
+  bucket_length_ = key_length_ + value_length_;
+
+  auto hash_table_size =
+      user_props.find(CuckooTablePropertyNames::kHashTableSize);
+  if (hash_table_size == user_props.end()) {
+    status_ = Status::Corruption("Hash table size not found");
+    return;
+  }
+  table_size_ =
+      *reinterpret_cast<const uint64_t*>(hash_table_size->second.data());
+
+  auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
+  if (is_last_level == user_props.end()) {
+    status_ = Status::Corruption("Is last level not found");
+    return;
+  }
+  is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
+
+  auto identity_as_first_hash =
+      user_props.find(CuckooTablePropertyNames::kIdentityAsFirstHash);
+  if (identity_as_first_hash == user_props.end()) {
+    status_ = Status::Corruption("identity as first hash not found");
+    return;
+  }
+  identity_as_first_hash_ =
+      *reinterpret_cast<const bool*>(identity_as_first_hash->second.data());
+
+  auto use_module_hash =
+      user_props.find(CuckooTablePropertyNames::kUseModuleHash);
+  if (use_module_hash == user_props.end()) {
+    status_ = Status::Corruption("hash type is not found");
+    return;
+  }
+  use_module_hash_ =
+      *reinterpret_cast<const bool*>(use_module_hash->second.data());
+  auto cuckoo_block_size =
+      user_props.find(CuckooTablePropertyNames::kCuckooBlockSize);
+  if (cuckoo_block_size == user_props.end()) {
+    status_ = Status::Corruption("Cuckoo block size not found");
+    return;
+  }
+  cuckoo_block_size_ =
+      *reinterpret_cast<const uint32_t*>(cuckoo_block_size->second.data());
+  cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
+  // TODO: rate limit reads of whole cuckoo tables.
+  status_ =
+      file_->Read(IOOptions(), 0, static_cast<size_t>(file_size), &file_data_,
+                  nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+}
+
+Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
+                              const Slice& key, GetContext* get_context,
+                              const SliceTransform* /* prefix_extractor */,
+                              bool /*skip_filters*/) {
+  assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
+  Slice user_key = ExtractUserKey(key);
+  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
+    uint64_t offset =
+        bucket_length_ * CuckooHash(user_key, hash_cnt, use_module_hash_,
+                                    table_size_, identity_as_first_hash_,
+                                    get_slice_hash_);
+    const char* bucket = &file_data_.data()[offset];
+    for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
+         ++block_idx, bucket += bucket_length_) {
+      if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
+                        Slice(bucket, user_key.size()))) {
+        return Status::OK();
+      }
+      // Here, we compare only the user key part as we support only one entry
+      // per user key and we don't support snapshot.
+      if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
+        Slice value(bucket + key_length_, value_length_);
+        if (is_last_level_) {
+          // Sequence number is not stored at the last level, so we will use
+          // kMaxSequenceNumber since it is unknown.  This could cause some
+          // transactions to fail to lock a key due to known sequence number.
+          // However, it is expected for anyone to use a CuckooTable in a
+          // TransactionDB.
+          get_context->SaveValue(value, kMaxSequenceNumber);
+        } else {
+          Slice full_key(bucket, key_length_);
+          ParsedInternalKey found_ikey;
+          Status s = ParseInternalKey(full_key, &found_ikey,
+                                      false /* log_err_key */);  // TODO
+          if (!s.ok()) return s;
+          bool dont_care __attribute__((__unused__));
+          get_context->SaveValue(found_ikey, value, &dont_care);
+        }
+        // We don't support merge operations. So, we return here.
+        return Status::OK();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void CuckooTableReader::Prepare(const Slice& key) {
+  // Prefetch the first Cuckoo Block.
+  Slice user_key = ExtractUserKey(key);
+  uint64_t addr =
+      reinterpret_cast<uint64_t>(file_data_.data()) +
+      bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
+                                  identity_as_first_hash_, nullptr);
+  uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
+  for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
+    PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
+  }
+}
+
+class CuckooTableIterator : public InternalIterator {
+ public:
+  explicit CuckooTableIterator(CuckooTableReader* reader);
+  // No copying allowed
+  CuckooTableIterator(const CuckooTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+  ~CuckooTableIterator() override {}
+  bool Valid() const override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice& target) override;
+  void Next() override;
+  void Prev() override;
+  Slice key() const override;
+  Slice value() const override;
+  Status status() const override { return Status::OK(); }
+  void InitIfNeeded();
+
+ private:
+  struct BucketComparator {
+    BucketComparator(const Slice& file_data, const Comparator* ucomp,
+                     uint32_t bucket_len, uint32_t user_key_len,
+                     const Slice& target = Slice())
+        : file_data_(file_data),
+          ucomp_(ucomp),
+          bucket_len_(bucket_len),
+          user_key_len_(user_key_len),
+          target_(target) {}
+    bool operator()(const uint32_t first, const uint32_t second) const {
+      const char* first_bucket = (first == kInvalidIndex)
+                                     ? target_.data()
+                                     : &file_data_.data()[first * bucket_len_];
+      const char* second_bucket =
+          (second == kInvalidIndex) ? target_.data()
+                                    : &file_data_.data()[second * bucket_len_];
+      return ucomp_->Compare(Slice(first_bucket, user_key_len_),
+                             Slice(second_bucket, user_key_len_)) < 0;
+    }
+
+   private:
+    const Slice file_data_;
+    const Comparator* ucomp_;
+    const uint32_t bucket_len_;
+    const uint32_t user_key_len_;
+    const Slice target_;
+  };
+
+  const BucketComparator bucket_comparator_;
+  void PrepareKVAtCurrIdx();
+  CuckooTableReader* reader_;
+  bool initialized_;
+  // Contains a map of keys to bucket_id sorted in key order.
+  std::vector<uint32_t> sorted_bucket_ids_;
+  // We assume that the number of items can be stored in uint32 (4 Billion).
+  uint32_t curr_key_idx_;
+  Slice curr_value_;
+  IterKey curr_key_;
+};
+
+CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
+    : bucket_comparator_(reader->file_data_, reader->ucomp_,
+                         reader->bucket_length_, reader->user_key_length_),
+      reader_(reader),
+      initialized_(false),
+      curr_key_idx_(kInvalidIndex) {
+  sorted_bucket_ids_.clear();
+  curr_value_.clear();
+  curr_key_.Clear();
+}
+
+void CuckooTableIterator::InitIfNeeded() {
+  if (initialized_) {
+    return;
+  }
+  sorted_bucket_ids_.reserve(
+      static_cast<size_t>(reader_->GetTableProperties()->num_entries));
+  uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
+  assert(num_buckets < kInvalidIndex);
+  const char* bucket = reader_->file_data_.data();
+  for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
+    if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
+      sorted_bucket_ids_.push_back(bucket_id);
+    }
+    bucket += reader_->bucket_length_;
+  }
+  assert(sorted_bucket_ids_.size() ==
+         reader_->GetTableProperties()->num_entries);
+  std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
+            bucket_comparator_);
+  curr_key_idx_ = kInvalidIndex;
+  initialized_ = true;
+}
+
+void CuckooTableIterator::SeekToFirst() {
+  InitIfNeeded();
+  curr_key_idx_ = 0;
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::SeekToLast() {
+  InitIfNeeded();
+  curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::Seek(const Slice& target) {
+  InitIfNeeded();
+  const BucketComparator seek_comparator(
+      reader_->file_data_, reader_->ucomp_, reader_->bucket_length_,
+      reader_->user_key_length_, ExtractUserKey(target));
+  auto seek_it =
+      std::lower_bound(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
+                       kInvalidIndex, seek_comparator);
+  curr_key_idx_ =
+      static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
+  // Not supported
+  assert(false);
+}
+
+bool CuckooTableIterator::Valid() const {
+  return curr_key_idx_ < sorted_bucket_ids_.size();
+}
+
+void CuckooTableIterator::PrepareKVAtCurrIdx() {
+  if (!Valid()) {
+    curr_value_.clear();
+    curr_key_.Clear();
+    return;
+  }
+  uint32_t id = sorted_bucket_ids_[curr_key_idx_];
+  const char* offset =
+      reader_->file_data_.data() + id * reader_->bucket_length_;
+  if (reader_->is_last_level_) {
+    // Always return internal key.
+    curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), 0,
+                             kTypeValue);
+  } else {
+    curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
+  }
+  curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
+}
+
+void CuckooTableIterator::Next() {
+  if (!Valid()) {
+    curr_value_.clear();
+    curr_key_.Clear();
+    return;
+  }
+  ++curr_key_idx_;
+  PrepareKVAtCurrIdx();
+}
+
+void CuckooTableIterator::Prev() {
+  if (curr_key_idx_ == 0) {
+    curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
+  }
+  if (!Valid()) {
+    curr_value_.clear();
+    curr_key_.Clear();
+    return;
+  }
+  --curr_key_idx_;
+  PrepareKVAtCurrIdx();
+}
+
+Slice CuckooTableIterator::key() const {
+  assert(Valid());
+  return curr_key_.GetInternalKey();
+}
+
+Slice CuckooTableIterator::value() const {
+  assert(Valid());
+  return curr_value_;
+}
+
+InternalIterator* CuckooTableReader::NewIterator(
+    const ReadOptions& /*read_options*/,
+    const SliceTransform* /* prefix_extractor */, Arena* arena,
+    bool /*skip_filters*/, TableReaderCaller /*caller*/,
+    size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
+  if (!status().ok()) {
+    return NewErrorInternalIterator<Slice>(
+        Status::Corruption("CuckooTableReader status is not okay."), arena);
+  }
+  CuckooTableIterator* iter;
+  if (arena == nullptr) {
+    iter = new CuckooTableIterator(this);
+  } else {
+    auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
+    iter = new (iter_mem) CuckooTableIterator(this);
+  }
+  return iter;
+}
+
+size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader.h b/src/rocksdb/table/cuckoo/cuckoo_table_reader.h
new file mode 100644
index 000000000..f6c599ae8
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader.h
@@ -0,0 +1,100 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class TableReader;
+struct ImmutableOptions;
+
+class CuckooTableReader : public TableReader {
+ public:
+  CuckooTableReader(const ImmutableOptions& ioptions,
+                    std::unique_ptr<RandomAccessFileReader>&& file,
+                    uint64_t file_size, const Comparator* user_comparator,
+                    uint64_t (*get_slice_hash)(const Slice&, uint32_t,
+                                               uint64_t));
+  ~CuckooTableReader() {}
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override {
+    return table_props_;
+  }
+
+  Status status() const { return status_; }
+
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             GetContext* get_context, const SliceTransform* prefix_extractor,
+             bool skip_filters = false) override;
+
+  // Returns a new iterator over table contents
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
+  InternalIterator* NewIterator(const ReadOptions&,
+                                const SliceTransform* prefix_extractor,
+                                Arena* arena, bool skip_filters,
+                                TableReaderCaller caller,
+                                size_t compaction_readahead_size = 0,
+                                bool allow_unprepared_value = false) override;
+  void Prepare(const Slice& target) override;
+
+  // Report an approximation of how much memory has been used.
+  size_t ApproximateMemoryUsage() const override;
+
+  // Following methods are not implemented for Cuckoo Table Reader
+  uint64_t ApproximateOffsetOf(const Slice& /*key*/,
+                               TableReaderCaller /*caller*/) override {
+    return 0;
+  }
+
+  uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
+                           TableReaderCaller /*caller*/) override {
+    return 0;
+  }
+
+  void SetupForCompaction() override {}
+  // End of methods not implemented.
+
+ private:
+  friend class CuckooTableIterator;
+  void LoadAllKeys(std::vector<std::pair<Slice, uint32_t>>* key_to_bucket_id);
+  std::unique_ptr<RandomAccessFileReader> file_;
+  Slice file_data_;
+  bool is_last_level_;
+  bool identity_as_first_hash_;
+  bool use_module_hash_;
+  std::shared_ptr<const TableProperties> table_props_;
+  Status status_;
+  uint32_t num_hash_func_;
+  std::string unused_key_;
+  uint32_t key_length_;
+  uint32_t user_key_length_;
+  uint32_t value_length_;
+  uint32_t bucket_length_;
+  uint32_t cuckoo_block_size_;
+  uint32_t cuckoo_block_bytes_minus_one_;
+  uint64_t table_size_;
+  const Comparator* ucomp_;
+  uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
+                              uint64_t max_num_buckets);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc b/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc
new file mode 100644
index 000000000..d3d1490c6
--- /dev/null
+++ b/src/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc
@@ -0,0 +1,584 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+  return 0;
+}
+#else
+
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "table/cuckoo/cuckoo_table_builder.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+#include "table/get_context.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_string(file_dir, "",
+              "Directory where the files will be created"
+              " for benchmark. Added for using tmpfs.");
+DEFINE_bool(enable_perf, false, "Run Benchmark Tests too.");
+DEFINE_bool(write, false,
+            "Should write new values to file in performance tests?");
+DEFINE_bool(identity_as_first_hash, true, "use identity as first hash");
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const uint32_t kNumHashFunc = 10;
+// Methods, variables related to Hash functions.
+std::unordered_map<std::string, std::vector<uint64_t> > hash_map;
+
+void AddHashLookups(const std::string& s, uint64_t bucket_id,
+                    uint32_t num_hash_fun) {
+  std::vector<uint64_t> v;
+  for (uint32_t i = 0; i < num_hash_fun; i++) {
+    v.push_back(bucket_id + i);
+  }
+  hash_map[s] = v;
+}
+
+uint64_t GetSliceHash(const Slice& s, uint32_t index,
+                      uint64_t /*max_num_buckets*/) {
+  return hash_map[s.ToString()][index];
+}
+}  // namespace
+
+class CuckooReaderTest : public testing::Test {
+ public:
+  using testing::Test::SetUp;
+
+  CuckooReaderTest() {
+    options.allow_mmap_reads = true;
+    env = options.env;
+    file_options = FileOptions(options);
+  }
+
+  void SetUp(int num) {
+    num_items = num;
+    hash_map.clear();
+    keys.clear();
+    keys.resize(num_items);
+    user_keys.clear();
+    user_keys.resize(num_items);
+    values.clear();
+    values.resize(num_items);
+  }
+
+  std::string NumToStr(int64_t i) {
+    return std::string(reinterpret_cast<char*>(&i), sizeof(i));
+  }
+
+  void CreateCuckooFileAndCheckReader(
+      const Comparator* ucomp = BytewiseComparator()) {
+    std::unique_ptr<WritableFileWriter> file_writer;
+    ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), fname,
+                                         file_options, &file_writer, nullptr));
+    CuckooTableBuilder builder(
+        file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false,
+        GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName);
+    ASSERT_OK(builder.status());
+    for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) {
+      builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
+      ASSERT_OK(builder.status());
+      ASSERT_EQ(builder.NumEntries(), key_idx + 1);
+    }
+    ASSERT_OK(builder.Finish());
+    ASSERT_EQ(num_items, builder.NumEntries());
+    file_size = builder.FileSize();
+    ASSERT_OK(file_writer->Close());
+
+    // Check reader now.
+    std::unique_ptr<RandomAccessFileReader> file_reader;
+    ASSERT_OK(RandomAccessFileReader::Create(
+        env->GetFileSystem(), fname, file_options, &file_reader, nullptr));
+    const ImmutableOptions ioptions(options);
+    CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp,
+                             GetSliceHash);
+    ASSERT_OK(reader.status());
+    // Assume no merge/deletion
+    for (uint32_t i = 0; i < num_items; ++i) {
+      PinnableSlice value;
+      GetContext get_context(ucomp, nullptr, nullptr, nullptr,
+                             GetContext::kNotFound, Slice(user_keys[i]), &value,
+                             nullptr, nullptr, nullptr, nullptr, true, nullptr,
+                             nullptr);
+      ASSERT_OK(
+          reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr));
+      ASSERT_STREQ(values[i].c_str(), value.data());
+    }
+  }
+  void UpdateKeys(bool with_zero_seqno) {
+    for (uint32_t i = 0; i < num_items; i++) {
+      ParsedInternalKey ikey(user_keys[i], with_zero_seqno ? 0 : i + 1000,
+                             kTypeValue);
+      keys[i].clear();
+      AppendInternalKey(&keys[i], ikey);
+    }
+  }
+
+  void CheckIterator(const Comparator* ucomp = BytewiseComparator()) {
+    std::unique_ptr<RandomAccessFileReader> file_reader;
+    ASSERT_OK(RandomAccessFileReader::Create(
+        env->GetFileSystem(), fname, file_options, &file_reader, nullptr));
+    const ImmutableOptions ioptions(options);
+    CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp,
+                             GetSliceHash);
+    ASSERT_OK(reader.status());
+    InternalIterator* it = reader.NewIterator(
+        ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized);
+    ASSERT_OK(it->status());
+    ASSERT_TRUE(!it->Valid());
+    it->SeekToFirst();
+    int cnt = 0;
+    while (it->Valid()) {
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+      ASSERT_TRUE(Slice(values[cnt]) == it->value());
+      ++cnt;
+      it->Next();
+    }
+    ASSERT_EQ(static_cast<uint32_t>(cnt), num_items);
+
+    it->SeekToLast();
+    cnt = static_cast<int>(num_items) - 1;
+    ASSERT_TRUE(it->Valid());
+    while (it->Valid()) {
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+      ASSERT_TRUE(Slice(values[cnt]) == it->value());
+      --cnt;
+      it->Prev();
+    }
+    ASSERT_EQ(cnt, -1);
+
+    cnt = static_cast<int>(num_items) / 2;
+    it->Seek(keys[cnt]);
+    while (it->Valid()) {
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(Slice(keys[cnt]) == it->key());
+      ASSERT_TRUE(Slice(values[cnt]) == it->value());
+      ++cnt;
+      it->Next();
+    }
+    ASSERT_EQ(static_cast<uint32_t>(cnt), num_items);
+    delete it;
+
+    Arena arena;
+    it = reader.NewIterator(ReadOptions(), /*prefix_extractor=*/nullptr, &arena,
+                            /*skip_filters=*/false,
+                            TableReaderCaller::kUncategorized);
+    ASSERT_OK(it->status());
+    ASSERT_TRUE(!it->Valid());
+    it->Seek(keys[num_items / 2]);
+    ASSERT_TRUE(it->Valid());
+    ASSERT_OK(it->status());
+    ASSERT_TRUE(keys[num_items / 2] == it->key());
+    ASSERT_TRUE(values[num_items / 2] == it->value());
+    ASSERT_OK(it->status());
+    it->~InternalIterator();
+  }
+
+  std::vector<std::string> keys;
+  std::vector<std::string> user_keys;
+  std::vector<std::string> values;
+  uint64_t num_items;
+  std::string fname;
+  uint64_t file_size;
+  Options options;
+  Env* env;
+  FileOptions file_options;
+};
+
+TEST_F(CuckooReaderTest, FileNotMmaped) {
+  options.allow_mmap_reads = false;
+  ImmutableOptions ioptions(options);
+  CuckooTableReader reader(ioptions, nullptr, 0, nullptr, nullptr);
+  ASSERT_TRUE(reader.status().IsInvalidArgument());
+  ASSERT_STREQ("File is not mmaped", reader.status().getState());
+}
+
+TEST_F(CuckooReaderTest, WhenKeyExists) {
+  SetUp(kNumHashFunc);
+  fname = test::PerThreadDBPath("CuckooReader_WhenKeyExists");
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i] = "key" + NumToStr(i);
+    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Give disjoint hash values.
+    AddHashLookups(user_keys[i], i, kNumHashFunc);
+  }
+  CreateCuckooFileAndCheckReader();
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader();
+  // Test with collision. Make all hash values collide.
+  hash_map.clear();
+  for (uint32_t i = 0; i < num_items; i++) {
+    AddHashLookups(user_keys[i], 0, kNumHashFunc);
+  }
+  UpdateKeys(false);
+  CreateCuckooFileAndCheckReader();
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader();
+}
+
+TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) {
+  SetUp(kNumHashFunc);
+  fname = test::PerThreadDBPath("CuckooReaderUint64_WhenKeyExists");
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i].resize(8);
+    memcpy(&user_keys[i][0], static_cast<void*>(&i), 8);
+    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Give disjoint hash values.
+    AddHashLookups(user_keys[i], i, kNumHashFunc);
+  }
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  // Test with collision. Make all hash values collide.
+  hash_map.clear();
+  for (uint32_t i = 0; i < num_items; i++) {
+    AddHashLookups(user_keys[i], 0, kNumHashFunc);
+  }
+  UpdateKeys(false);
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+}
+
+TEST_F(CuckooReaderTest, CheckIterator) {
+  SetUp(2 * kNumHashFunc);
+  fname = test::PerThreadDBPath("CuckooReader_CheckIterator");
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i] = "key" + NumToStr(i);
+    ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Give disjoint hash values, in reverse order.
+    AddHashLookups(user_keys[i], num_items - i - 1, kNumHashFunc);
+  }
+  CreateCuckooFileAndCheckReader();
+  CheckIterator();
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader();
+  CheckIterator();
+}
+
+TEST_F(CuckooReaderTest, CheckIteratorUint64) {
+  SetUp(2 * kNumHashFunc);
+  fname = test::PerThreadDBPath("CuckooReader_CheckIterator");
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i].resize(8);
+    memcpy(&user_keys[i][0], static_cast<void*>(&i), 8);
+    ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Give disjoint hash values, in reverse order.
+    AddHashLookups(user_keys[i], num_items - i - 1, kNumHashFunc);
+  }
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  CheckIterator(test::Uint64Comparator());
+  // Last level file.
+  UpdateKeys(true);
+  CreateCuckooFileAndCheckReader(test::Uint64Comparator());
+  CheckIterator(test::Uint64Comparator());
+}
+
+TEST_F(CuckooReaderTest, WhenKeyNotFound) {
+  // Add keys with colliding hash values.
+  SetUp(kNumHashFunc);
+  fname = test::PerThreadDBPath("CuckooReader_WhenKeyNotFound");
+  for (uint64_t i = 0; i < num_items; i++) {
+    user_keys[i] = "key" + NumToStr(i);
+    ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue);
+    AppendInternalKey(&keys[i], ikey);
+    values[i] = "value" + NumToStr(i);
+    // Make all hash values collide.
+    AddHashLookups(user_keys[i], 0, kNumHashFunc);
+  }
+  auto* ucmp = BytewiseComparator();
+  CreateCuckooFileAndCheckReader();
+
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  ASSERT_OK(RandomAccessFileReader::Create(
+      env->GetFileSystem(), fname, file_options, &file_reader, nullptr));
+
+  const ImmutableOptions ioptions(options);
+  CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp,
+                           GetSliceHash);
+  ASSERT_OK(reader.status());
+  // Search for a key with colliding hash values.
+  std::string not_found_user_key = "key" + NumToStr(num_items);
+  std::string not_found_key;
+  AddHashLookups(not_found_user_key, 0, kNumHashFunc);
+  ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue);
+  AppendInternalKey(&not_found_key, ikey);
+  PinnableSlice value;
+  GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound,
+                         Slice(not_found_key), &value, nullptr, nullptr,
+                         nullptr, nullptr, true, nullptr, nullptr);
+  ASSERT_OK(
+      reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr));
+  ASSERT_TRUE(value.empty());
+  ASSERT_OK(reader.status());
+  // Search for a key with an independent hash value.
+  std::string not_found_user_key2 = "key" + NumToStr(num_items + 1);
+  AddHashLookups(not_found_user_key2, kNumHashFunc, kNumHashFunc);
+  ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue);
+  std::string not_found_key2;
+  AppendInternalKey(&not_found_key2, ikey2);
+  value.Reset();
+  GetContext get_context2(ucmp, nullptr, nullptr, nullptr,
+                          GetContext::kNotFound, Slice(not_found_key2), &value,
+                          nullptr, nullptr, nullptr, nullptr, true, nullptr,
+                          nullptr);
+  ASSERT_OK(
+      reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr));
+  ASSERT_TRUE(value.empty());
+  ASSERT_OK(reader.status());
+
+  // Test read when key is unused key.
+  std::string unused_key =
+      reader.GetTableProperties()->user_collected_properties.at(
+          CuckooTablePropertyNames::kEmptyKey);
+  // Add hash values that map to empty buckets.
+  AddHashLookups(ExtractUserKey(unused_key).ToString(), kNumHashFunc,
+                 kNumHashFunc);
+  value.Reset();
+  GetContext get_context3(
+      ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key),
+      &value, nullptr, nullptr, nullptr, nullptr, true, nullptr, nullptr);
+  ASSERT_OK(
+      reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr));
+  ASSERT_TRUE(value.empty());
+  ASSERT_OK(reader.status());
+}
+
+// Performance tests
+namespace {
+void GetKeys(uint64_t num, std::vector<std::string>* keys) {
+  keys->clear();
+  IterKey k;
+  k.SetInternalKey("", 0, kTypeValue);
+  std::string internal_key_suffix = k.GetInternalKey().ToString();
+  ASSERT_EQ(static_cast<size_t>(8), internal_key_suffix.size());
+  for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
+    uint64_t value = 2 * key_idx;
+    std::string new_key(reinterpret_cast<char*>(&value), sizeof(value));
+    new_key += internal_key_suffix;
+    keys->push_back(new_key);
+  }
+}
+
+std::string GetFileName(uint64_t num) {
+  if (FLAGS_file_dir.empty()) {
+    FLAGS_file_dir = test::TmpDir();
+  }
+  return test::PerThreadDBPath(FLAGS_file_dir, "cuckoo_read_benchmark") +
+         std::to_string(num / 1000000) + "Mkeys";
+}
+
+// Create last level file as we are interested in measuring performance of
+// last level file only.
+void WriteFile(const std::vector<std::string>& keys, const uint64_t num,
+               double hash_ratio) {
+  Options options;
+  options.allow_mmap_reads = true;
+  const auto& fs = options.env->GetFileSystem();
+  FileOptions file_options(options);
+  std::string fname = GetFileName(num);
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  ASSERT_OK(WritableFileWriter::Create(fs, fname, file_options, &file_writer,
+                                       nullptr));
+  CuckooTableBuilder builder(
+      file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5,
+      false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */,
+      kDefaultColumnFamilyName);
+  ASSERT_OK(builder.status());
+  for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
+    // Value is just a part of key.
+    builder.Add(Slice(keys[key_idx]), Slice(&keys[key_idx][0], 4));
+    ASSERT_EQ(builder.NumEntries(), key_idx + 1);
+    ASSERT_OK(builder.status());
+  }
+  ASSERT_OK(builder.Finish());
+  ASSERT_EQ(num, builder.NumEntries());
+  ASSERT_OK(file_writer->Close());
+
+  uint64_t file_size;
+  ASSERT_OK(
+      fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr));
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options,
+                                           &file_reader, nullptr));
+
+  const ImmutableOptions ioptions(options);
+  CuckooTableReader reader(ioptions, std::move(file_reader), file_size,
+                           test::Uint64Comparator(), nullptr);
+  ASSERT_OK(reader.status());
+  ReadOptions r_options;
+  PinnableSlice value;
+  // Assume only the fast path is triggered
+  GetContext get_context(nullptr, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, Slice(), &value, nullptr,
+                         nullptr, nullptr, true, nullptr, nullptr);
+  for (uint64_t i = 0; i < num; ++i) {
+    value.Reset();
+    value.clear();
+    ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context, nullptr));
+    ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4));
+  }
+}
+
+void ReadKeys(uint64_t num, uint32_t batch_size) {
+  Options options;
+  options.allow_mmap_reads = true;
+  Env* env = options.env;
+  const auto& fs = options.env->GetFileSystem();
+  FileOptions file_options(options);
+  std::string fname = GetFileName(num);
+
+  uint64_t file_size;
+  ASSERT_OK(
+      fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr));
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options,
+                                           &file_reader, nullptr));
+
+  const ImmutableOptions ioptions(options);
+  CuckooTableReader reader(ioptions, std::move(file_reader), file_size,
+                           test::Uint64Comparator(), nullptr);
+  ASSERT_OK(reader.status());
+  const UserCollectedProperties user_props =
+      reader.GetTableProperties()->user_collected_properties;
+  const uint32_t num_hash_fun = *reinterpret_cast<const uint32_t*>(
+      user_props.at(CuckooTablePropertyNames::kNumHashFunc).data());
+  const uint64_t table_size = *reinterpret_cast<const uint64_t*>(
+      user_props.at(CuckooTablePropertyNames::kHashTableSize).data());
+  fprintf(stderr,
+          "With %" PRIu64
+          " items, utilization is %.2f%%, number of"
+          " hash functions: %u.\n",
+          num, num * 100.0 / (table_size), num_hash_fun);
+  ReadOptions r_options;
+
+  std::vector<uint64_t> keys;
+  keys.reserve(num);
+  for (uint64_t i = 0; i < num; ++i) {
+    keys.push_back(2 * i);
+  }
+  RandomShuffle(keys.begin(), keys.end());
+
+  PinnableSlice value;
+  // Assume only the fast path is triggered
+  GetContext get_context(nullptr, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, Slice(), &value, nullptr,
+                         nullptr, nullptr, true, nullptr, nullptr);
+  uint64_t start_time = env->NowMicros();
+  if (batch_size > 0) {
+    for (uint64_t i = 0; i < num; i += batch_size) {
+      for (uint64_t j = i; j < i + batch_size && j < num; ++j) {
+        reader.Prepare(Slice(reinterpret_cast<char*>(&keys[j]), 16));
+      }
+      for (uint64_t j = i; j < i + batch_size && j < num; ++j) {
+        reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[j]), 16),
+                   &get_context, nullptr);
+      }
+    }
+  } else {
+    for (uint64_t i = 0; i < num; i++) {
+      reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[i]), 16),
+                 &get_context, nullptr);
+    }
+  }
+  float time_per_op = (env->NowMicros() - start_time) * 1.0f / num;
+  fprintf(stderr,
+          "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n",
+          time_per_op, 1.0 / time_per_op, batch_size);
+}
+}  // namespace.
+
+TEST_F(CuckooReaderTest, TestReadPerformance) {
+  if (!FLAGS_enable_perf) {
+    return;
+  }
+  double hash_ratio = 0.95;
+  // These numbers are chosen to have a hash utilization % close to
+  // 0.9, 0.75, 0.6 and 0.5 respectively.
+  // They all create 128 M buckets.
+  std::vector<uint64_t> nums = {120 * 1024 * 1024, 100 * 1024 * 1024,
+                                80 * 1024 * 1024, 70 * 1024 * 1024};
+#ifndef NDEBUG
+  fprintf(
+      stdout,
+      "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n");
+#endif
+  for (uint64_t num : nums) {
+    if (FLAGS_write ||
+        Env::Default()->FileExists(GetFileName(num)).IsNotFound()) {
+      std::vector<std::string> all_keys;
+      GetKeys(num, &all_keys);
+      WriteFile(all_keys, num, hash_ratio);
+    }
+    ReadKeys(num, 0);
+    ReadKeys(num, 10);
+    ReadKeys(num, 25);
+    ReadKeys(num, 50);
+    ReadKeys(num, 100);
+    fprintf(stderr, "\n");
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  if (ROCKSDB_NAMESPACE::port::kLittleEndian) {
+    ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+    ::testing::InitGoogleTest(&argc, argv);
+    ParseCommandLineFlags(&argc, &argv, true);
+    return RUN_ALL_TESTS();
+  } else {
+    fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n");
+    return 0;
+  }
+}
+
+#endif  // GFLAGS.
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/format.cc b/src/rocksdb/table/format.cc
new file mode 100644
index 000000000..efde5e169
--- /dev/null
+++ b/src/rocksdb/table/format.cc
@@ -0,0 +1,575 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/format.h"
+
+#include <cinttypes>
+#include <string>
+
+#include "block_fetcher.h"
+#include "file/random_access_file_reader.h"
+#include "memory/memory_allocator.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "options/options_helper.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/persistent_cache_helper.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/hash.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+
+#ifndef ROCKSDB_LITE
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+#else
+// ROCKSDB_LITE doesn't have plain table
+const uint64_t kLegacyPlainTableMagicNumber = 0;
+const uint64_t kPlainTableMagicNumber = 0;
+#endif
+const char* kHostnameForDbHostId = "__hostname__";
+
+bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
+  return env != nullptr && stats != nullptr &&
+         stats->get_stats_level() > kExceptDetailedTimers;
+}
+
+void BlockHandle::EncodeTo(std::string* dst) const {
+  // Sanity check that all fields have been set
+  assert(offset_ != ~uint64_t{0});
+  assert(size_ != ~uint64_t{0});
+  PutVarint64Varint64(dst, offset_, size_);
+}
+
+char* BlockHandle::EncodeTo(char* dst) const {
+  // Sanity check that all fields have been set
+  assert(offset_ != ~uint64_t{0});
+  assert(size_ != ~uint64_t{0});
+  char* cur = EncodeVarint64(dst, offset_);
+  cur = EncodeVarint64(cur, size_);
+  return cur;
+}
+
+Status BlockHandle::DecodeFrom(Slice* input) {
+  if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
+    return Status::OK();
+  } else {
+    // reset in case failure after partially decoding
+    offset_ = 0;
+    size_ = 0;
+    return Status::Corruption("bad block handle");
+  }
+}
+
+Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
+  if (GetVarint64(input, &size_)) {
+    offset_ = _offset;
+    return Status::OK();
+  } else {
+    // reset in case failure after partially decoding
+    offset_ = 0;
+    size_ = 0;
+    return Status::Corruption("bad block handle");
+  }
+}
+
+// Return a string that contains the copy of handle.
+std::string BlockHandle::ToString(bool hex) const {
+  std::string handle_str;
+  EncodeTo(&handle_str);
+  if (hex) {
+    return Slice(handle_str).ToString(true);
+  } else {
+    return handle_str;
+  }
+}
+
+const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
+
+void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
+                          const BlockHandle* previous_handle) const {
+  if (previous_handle) {
+    // WART: this is specific to Block-based table
+    assert(handle.offset() == previous_handle->offset() +
+                                  previous_handle->size() +
+                                  BlockBasedTable::kBlockTrailerSize);
+    PutVarsignedint64(dst, handle.size() - previous_handle->size());
+  } else {
+    handle.EncodeTo(dst);
+  }
+  assert(dst->size() != 0);
+
+  if (have_first_key) {
+    PutLengthPrefixedSlice(dst, first_internal_key);
+  }
+}
+
+Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
+                              const BlockHandle* previous_handle) {
+  if (previous_handle) {
+    int64_t delta;
+    if (!GetVarsignedint64(input, &delta)) {
+      return Status::Corruption("bad delta-encoded index value");
+    }
+    // WART: this is specific to Block-based table
+    handle = BlockHandle(previous_handle->offset() + previous_handle->size() +
+                             BlockBasedTable::kBlockTrailerSize,
+                         previous_handle->size() + delta);
+  } else {
+    Status s = handle.DecodeFrom(input);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (!have_first_key) {
+    first_internal_key = Slice();
+  } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
+    return Status::Corruption("bad first key in block info");
+  }
+
+  return Status::OK();
+}
+
+std::string IndexValue::ToString(bool hex, bool have_first_key) const {
+  std::string s;
+  EncodeTo(&s, have_first_key, nullptr);
+  if (hex) {
+    return Slice(s).ToString(true);
+  } else {
+    return s;
+  }
+}
+
+namespace {
+inline bool IsLegacyFooterFormat(uint64_t magic_number) {
+  return magic_number == kLegacyBlockBasedTableMagicNumber ||
+         magic_number == kLegacyPlainTableMagicNumber;
+}
+inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
+  if (magic_number == kLegacyBlockBasedTableMagicNumber) {
+    return kBlockBasedTableMagicNumber;
+  }
+  if (magic_number == kLegacyPlainTableMagicNumber) {
+    return kPlainTableMagicNumber;
+  }
+  assert(false);
+  return magic_number;
+}
+inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) {
+  if (magic_number == kBlockBasedTableMagicNumber) {
+    return kLegacyBlockBasedTableMagicNumber;
+  }
+  if (magic_number == kPlainTableMagicNumber) {
+    return kLegacyPlainTableMagicNumber;
+  }
+  assert(false);
+  return magic_number;
+}
+inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) {
+  if (magic_number == kBlockBasedTableMagicNumber ||
+      magic_number == kLegacyBlockBasedTableMagicNumber) {
+    return static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize);
+  } else {
+    return 0;
+  }
+}
+
+// Footer format, in three parts:
+// * Part1
+//   -> format_version == 0 (inferred from legacy magic number)
+//      <empty> (0 bytes)
+//   -> format_version >= 1
+//      checksum type (char, 1 byte)
+// * Part2
+//      metaindex handle (varint64 offset, varint64 size)
+//      index handle     (varint64 offset, varint64 size)
+//      <zero padding> for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40
+// * Part3
+//   -> format_version == 0 (inferred from legacy magic number)
+//      legacy magic number (8 bytes)
+//   -> format_version >= 1 (inferred from NOT legacy magic number)
+//      format_version (uint32LE, 4 bytes), also called "footer version"
+//      newer magic number (8 bytes)
+
+constexpr size_t kFooterPart2Size = 2 * BlockHandle::kMaxEncodedLength;
+}  // namespace
+
+void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
+                          uint64_t footer_offset, ChecksumType checksum_type,
+                          const BlockHandle& metaindex_handle,
+                          const BlockHandle& index_handle) {
+  (void)footer_offset;  // Future use
+
+  assert(magic_number != Footer::kNullTableMagicNumber);
+  assert(IsSupportedFormatVersion(format_version));
+
+  char* part2;
+  char* part3;
+  if (format_version > 0) {
+    slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength);
+    // Generate parts 1 and 3
+    char* cur = data_.data();
+    // Part 1
+    *(cur++) = checksum_type;
+    // Part 2
+    part2 = cur;
+    // Skip over part 2 for now
+    cur += kFooterPart2Size;
+    // Part 3
+    part3 = cur;
+    EncodeFixed32(cur, format_version);
+    cur += 4;
+    EncodeFixed64(cur, magic_number);
+    assert(cur + 8 == slice_.data() + slice_.size());
+  } else {
+    slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength);
+    // Legacy SST files use kCRC32c checksum but it's not stored in footer.
+    assert(checksum_type == kNoChecksum || checksum_type == kCRC32c);
+    // Generate part 3 (part 1 empty, skip part 2 for now)
+    part2 = data_.data();
+    part3 = part2 + kFooterPart2Size;
+    char* cur = part3;
+    // Use legacy magic numbers to indicate format_version=0, for
+    // compatibility. No other cases should use format_version=0.
+    EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number));
+    assert(cur + 8 == slice_.data() + slice_.size());
+  }
+
+  {
+    char* cur = part2;
+    cur = metaindex_handle.EncodeTo(cur);
+    cur = index_handle.EncodeTo(cur);
+    // Zero pad remainder
+    std::fill(cur, part3, char{0});
+  }
+}
+
+Status Footer::DecodeFrom(Slice input, uint64_t input_offset) {
+  (void)input_offset;  // Future use
+
+  // Only decode to unused Footer
+  assert(table_magic_number_ == kNullTableMagicNumber);
+  assert(input != nullptr);
+  assert(input.size() >= kMinEncodedLength);
+
+  const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte;
+  uint64_t magic = DecodeFixed64(magic_ptr);
+
+  // We check for legacy formats here and silently upconvert them
+  bool legacy = IsLegacyFooterFormat(magic);
+  if (legacy) {
+    magic = UpconvertLegacyFooterFormat(magic);
+  }
+  table_magic_number_ = magic;
+  block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic);
+
+  // Parse Part3
+  if (legacy) {
+    // The size is already asserted to be at least kMinEncodedLength
+    // at the beginning of the function
+    input.remove_prefix(input.size() - kVersion0EncodedLength);
+    format_version_ = 0 /* legacy */;
+    checksum_type_ = kCRC32c;
+  } else {
+    const char* part3_ptr = magic_ptr - 4;
+    format_version_ = DecodeFixed32(part3_ptr);
+    if (!IsSupportedFormatVersion(format_version_)) {
+      return Status::Corruption("Corrupt or unsupported format_version: " +
+                                std::to_string(format_version_));
+    }
+    // All known format versions >= 1 occupy exactly this many bytes.
+    if (input.size() < kNewVersionsEncodedLength) {
+      return Status::Corruption("Input is too short to be an SST file");
+    }
+    uint64_t adjustment = input.size() - kNewVersionsEncodedLength;
+    input.remove_prefix(adjustment);
+
+    // Parse Part1
+    char chksum = input.data()[0];
+    checksum_type_ = lossless_cast<ChecksumType>(chksum);
+    if (!IsSupportedChecksumType(checksum_type())) {
+      return Status::Corruption("Corrupt or unsupported checksum type: " +
+                                std::to_string(lossless_cast<uint8_t>(chksum)));
+    }
+    // Consume checksum type field
+    input.remove_prefix(1);
+  }
+
+  // Parse Part2
+  Status result = metaindex_handle_.DecodeFrom(&input);
+  if (result.ok()) {
+    result = index_handle_.DecodeFrom(&input);
+  }
+  return result;
+  // Padding in part2 is ignored
+}
+
+std::string Footer::ToString() const {
+  std::string result;
+  result.reserve(1024);
+
+  bool legacy = IsLegacyFooterFormat(table_magic_number_);
+  if (legacy) {
+    result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n  ");
+    result.append("index handle: " + index_handle_.ToString() + "\n  ");
+    result.append("table_magic_number: " + std::to_string(table_magic_number_) +
+                  "\n  ");
+  } else {
+    result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n  ");
+    result.append("index handle: " + index_handle_.ToString() + "\n  ");
+    result.append("table_magic_number: " + std::to_string(table_magic_number_) +
+                  "\n  ");
+    result.append("format version: " + std::to_string(format_version_) +
+                  "\n  ");
+  }
+  return result;
+}
+
+Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
+                          FilePrefetchBuffer* prefetch_buffer,
+                          uint64_t file_size, Footer* footer,
+                          uint64_t enforce_table_magic_number) {
+  if (file_size < Footer::kMinEncodedLength) {
+    return Status::Corruption("file is too short (" +
+                              std::to_string(file_size) +
+                              " bytes) to be an "
+                              "sstable: " +
+                              file->file_name());
+  }
+
+  std::string footer_buf;
+  AlignedBuf internal_buf;
+  Slice footer_input;
+  uint64_t read_offset = (file_size > Footer::kMaxEncodedLength)
+                             ? file_size - Footer::kMaxEncodedLength
+                             : 0;
+  Status s;
+  // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now,
+  // there is no readahead for point lookups, so TryReadFromCache will fail if
+  // the required data is not in the prefetch buffer. Once deadline is enabled
+  // for iterator, TryReadFromCache might do a readahead. Revisit to see if we
+  // need to pass a timeout at that point
+  // TODO: rate limit footer reads.
+  if (prefetch_buffer == nullptr ||
+      !prefetch_buffer->TryReadFromCache(
+          opts, file, read_offset, Footer::kMaxEncodedLength, &footer_input,
+          nullptr, opts.rate_limiter_priority)) {
+    if (file->use_direct_io()) {
+      s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
+                     &footer_input, nullptr, &internal_buf,
+                     opts.rate_limiter_priority);
+    } else {
+      footer_buf.reserve(Footer::kMaxEncodedLength);
+      s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
+                     &footer_input, &footer_buf[0], nullptr,
+                     opts.rate_limiter_priority);
+    }
+    if (!s.ok()) return s;
+  }
+
+  // Check that we actually read the whole footer from the file. It may be
+  // that size isn't correct.
+  if (footer_input.size() < Footer::kMinEncodedLength) {
+    // FIXME: this error message is bad. We should be checking whether the
+    // provided file_size matches what's on disk, at least in this case.
+    // Unfortunately FileSystem/Env does not provide a way to get the size
+    // of an open file, so getting file size requires a full path seek.
+    return Status::Corruption("file is too short (" +
+                              std::to_string(file_size) +
+                              " bytes) to be an "
+                              "sstable" +
+                              file->file_name());
+  }
+
+  s = footer->DecodeFrom(footer_input, read_offset);
+  if (!s.ok()) {
+    return s;
+  }
+  if (enforce_table_magic_number != 0 &&
+      enforce_table_magic_number != footer->table_magic_number()) {
+    return Status::Corruption("Bad table magic number: expected " +
+                              std::to_string(enforce_table_magic_number) +
+                              ", found " +
+                              std::to_string(footer->table_magic_number()) +
+                              " in " + file->file_name());
+  }
+  return Status::OK();
+}
+
+namespace {
+// Custom handling for the last byte of a block, to avoid invoking streaming
+// API to get an effective block checksum. This function is its own inverse
+// because it uses xor.
+inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) {
+  // This strategy bears some resemblance to extending a CRC checksum by one
+  // more byte, except we don't need to re-mix the input checksum as long as
+  // we do this step only once (per checksum).
+  const uint32_t kRandomPrime = 0x6b9083d9;
+  return checksum ^ lossless_cast<uint8_t>(last_byte) * kRandomPrime;
+}
+}  // namespace
+
+uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
+                                size_t data_size) {
+  switch (type) {
+    case kCRC32c:
+      return crc32c::Mask(crc32c::Value(data, data_size));
+    case kxxHash:
+      return XXH32(data, data_size, /*seed*/ 0);
+    case kxxHash64:
+      return Lower32of64(XXH64(data, data_size, /*seed*/ 0));
+    case kXXH3: {
+      if (data_size == 0) {
+        // Special case because of special handling for last byte, not
+        // present in this case. Can be any value different from other
+        // small input size checksums.
+        return 0;
+      } else {
+        // See corresponding code in ComputeBuiltinChecksumWithLastByte
+        uint32_t v = Lower32of64(XXH3_64bits(data, data_size - 1));
+        return ModifyChecksumForLastByte(v, data[data_size - 1]);
+      }
+    }
+    default:  // including kNoChecksum
+      return 0;
+  }
+}
+
+uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
+                                            size_t data_size, char last_byte) {
+  switch (type) {
+    case kCRC32c: {
+      uint32_t crc = crc32c::Value(data, data_size);
+      // Extend to cover last byte (compression type)
+      crc = crc32c::Extend(crc, &last_byte, 1);
+      return crc32c::Mask(crc);
+    }
+    case kxxHash: {
+      XXH32_state_t* const state = XXH32_createState();
+      XXH32_reset(state, 0);
+      XXH32_update(state, data, data_size);
+      // Extend to cover last byte (compression type)
+      XXH32_update(state, &last_byte, 1);
+      uint32_t v = XXH32_digest(state);
+      XXH32_freeState(state);
+      return v;
+    }
+    case kxxHash64: {
+      XXH64_state_t* const state = XXH64_createState();
+      XXH64_reset(state, 0);
+      XXH64_update(state, data, data_size);
+      // Extend to cover last byte (compression type)
+      XXH64_update(state, &last_byte, 1);
+      uint32_t v = Lower32of64(XXH64_digest(state));
+      XXH64_freeState(state);
+      return v;
+    }
+    case kXXH3: {
+      // XXH3 is a complicated hash function that is extremely fast on
+      // contiguous input, but that makes its streaming support rather
+      // complex. It is worth custom handling of the last byte (`type`)
+      // in order to avoid allocating a large state object and bringing
+      // that code complexity into CPU working set.
+      uint32_t v = Lower32of64(XXH3_64bits(data, data_size));
+      return ModifyChecksumForLastByte(v, last_byte);
+    }
+    default:  // including kNoChecksum
+      return 0;
+  }
+}
+
+Status UncompressBlockData(const UncompressionInfo& uncompression_info,
+                           const char* data, size_t size,
+                           BlockContents* out_contents, uint32_t format_version,
+                           const ImmutableOptions& ioptions,
+                           MemoryAllocator* allocator) {
+  Status ret = Status::OK();
+
+  assert(uncompression_info.type() != kNoCompression &&
+         "Invalid compression type");
+
+  StopWatchNano timer(ioptions.clock,
+                      ShouldReportDetailedTime(ioptions.env, ioptions.stats));
+  size_t uncompressed_size = 0;
+  CacheAllocationPtr ubuf =
+      UncompressData(uncompression_info, data, size, &uncompressed_size,
+                     GetCompressFormatForVersion(format_version), allocator);
+  if (!ubuf) {
+    if (!CompressionTypeSupported(uncompression_info.type())) {
+      return Status::NotSupported(
+          "Unsupported compression method for this build",
+          CompressionTypeToString(uncompression_info.type()));
+    } else {
+      return Status::Corruption(
+          "Corrupted compressed block contents",
+          CompressionTypeToString(uncompression_info.type()));
+    }
+  }
+
+  *out_contents = BlockContents(std::move(ubuf), uncompressed_size);
+
+  if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) {
+    RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS,
+                          timer.ElapsedNanos());
+  }
+  RecordTimeToHistogram(ioptions.stats, BYTES_DECOMPRESSED,
+                        out_contents->data.size());
+  RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED);
+
+  TEST_SYNC_POINT_CALLBACK("UncompressBlockData:TamperWithReturnValue",
+                           static_cast<void*>(&ret));
+  TEST_SYNC_POINT_CALLBACK(
+      "UncompressBlockData:"
+      "TamperWithDecompressionOutput",
+      static_cast<void*>(out_contents));
+
+  return ret;
+}
+
+Status UncompressSerializedBlock(const UncompressionInfo& uncompression_info,
+                                 const char* data, size_t size,
+                                 BlockContents* out_contents,
+                                 uint32_t format_version,
+                                 const ImmutableOptions& ioptions,
+                                 MemoryAllocator* allocator) {
+  assert(data[size] != kNoCompression);
+  assert(data[size] == static_cast<char>(uncompression_info.type()));
+  return UncompressBlockData(uncompression_info, data, size, out_contents,
+                             format_version, ioptions, allocator);
+}
+
+// Replace the contents of db_host_id with the actual hostname, if db_host_id
+// matches the keyword kHostnameForDbHostId
+Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) {
+  assert(db_host_id);
+  if (*db_host_id == kHostnameForDbHostId) {
+    Status s = env->GetHostNameString(db_host_id);
+    if (!s.ok()) {
+      db_host_id->clear();
+    }
+    return s;
+  }
+
+  return Status::OK();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/format.h b/src/rocksdb/table/format.h
new file mode 100644
index 000000000..ffb9fb0ca
--- /dev/null
+++ b/src/rocksdb/table/format.h
@@ -0,0 +1,375 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <string>
+
+#include "file/file_prefetch_buffer.h"
+#include "file/random_access_file_reader.h"
+#include "memory/memory_allocator.h"
+#include "options/cf_options.h"
+#include "port/malloc.h"
+#include "port/port.h"  // noexcept
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFile;
+struct ReadOptions;
+
+bool ShouldReportDetailedTime(Env* env, Statistics* stats);
+
+// the length of the magic number in bytes.
+constexpr uint32_t kMagicNumberLengthByte = 8;
+
+// BlockHandle is a pointer to the extent of a file that stores a data
+// block or a meta block.
+class BlockHandle {
+ public:
+  // Creates a block handle with special values indicating "uninitialized,"
+  // distinct from the "null" block handle.
+  BlockHandle();
+  BlockHandle(uint64_t offset, uint64_t size);
+
+  // The offset of the block in the file.
+  uint64_t offset() const { return offset_; }
+  void set_offset(uint64_t _offset) { offset_ = _offset; }
+
+  // The size of the stored block
+  uint64_t size() const { return size_; }
+  void set_size(uint64_t _size) { size_ = _size; }
+
+  void EncodeTo(std::string* dst) const;
+  char* EncodeTo(char* dst) const;
+  Status DecodeFrom(Slice* input);
+  Status DecodeSizeFrom(uint64_t offset, Slice* input);
+
+  // Return a string that contains the copy of handle.
+  std::string ToString(bool hex = true) const;
+
+  // if the block handle's offset and size are both "0", we will view it
+  // as a null block handle that points to no where.
+  bool IsNull() const { return offset_ == 0 && size_ == 0; }
+
+  static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
+
+  // Maximum encoding length of a BlockHandle
+  static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length;
+
+  inline bool operator==(const BlockHandle& rhs) const {
+    return offset_ == rhs.offset_ && size_ == rhs.size_;
+  }
+  inline bool operator!=(const BlockHandle& rhs) const {
+    return !(*this == rhs);
+  }
+
+ private:
+  uint64_t offset_;
+  uint64_t size_;
+
+  static const BlockHandle kNullBlockHandle;
+};
+
+// Value in block-based table file index.
+//
+// The index entry for block n is: y -> h, [x],
+// where: y is some key between the last key of block n (inclusive) and the
+// first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
+// x, if present, is the first key of block n (unshortened).
+// This struct represents the "h, [x]" part.
+struct IndexValue {
+  BlockHandle handle;
+  // Empty means unknown.
+  Slice first_internal_key;
+
+  IndexValue() = default;
+  IndexValue(BlockHandle _handle, Slice _first_internal_key)
+      : handle(_handle), first_internal_key(_first_internal_key) {}
+
+  // have_first_key indicates whether the `first_internal_key` is used.
+  // If previous_handle is not null, delta encoding is used;
+  // in this case, the two handles must point to consecutive blocks:
+  // handle.offset() ==
+  //     previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
+  void EncodeTo(std::string* dst, bool have_first_key,
+                const BlockHandle* previous_handle) const;
+  Status DecodeFrom(Slice* input, bool have_first_key,
+                    const BlockHandle* previous_handle);
+
+  std::string ToString(bool hex, bool have_first_key) const;
+};
+
+inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
+  // As of format_version 2, we encode compressed block with
+  // compress_format_version == 2. Before that, the version is 1.
+  // DO NOT CHANGE THIS FUNCTION, it affects disk format
+  return format_version >= 2 ? 2 : 1;
+}
+
+constexpr uint32_t kLatestFormatVersion = 5;
+
+inline bool IsSupportedFormatVersion(uint32_t version) {
+  return version <= kLatestFormatVersion;
+}
+
+// Footer encapsulates the fixed information stored at the tail end of every
+// SST file. In general, it should only include things that cannot go
+// elsewhere under the metaindex block. For example, checksum_type is
+// required for verifying metaindex block checksum (when applicable), but
+// index block handle can easily go in metaindex block (possible future).
+// See also FooterBuilder below.
+class Footer {
+ public:
+  // Create empty. Populate using DecodeFrom.
+  Footer() {}
+
+  // Deserialize a footer (populate fields) from `input` and check for various
+  // corruptions. `input_offset` is the offset within the target file of
+  // `input` buffer (future use).
+  Status DecodeFrom(Slice input, uint64_t input_offset);
+
+  // Table magic number identifies file as RocksDB SST file and which kind of
+  // SST format is use.
+  uint64_t table_magic_number() const { return table_magic_number_; }
+
+  // A version (footer and more) within a kind of SST. (It would add more
+  // unnecessary complexity to separate footer versions and
+  // BBTO::format_version.)
+  uint32_t format_version() const { return format_version_; }
+
+  // Block handle for metaindex block.
+  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
+
+  // Block handle for (top-level) index block.
+  const BlockHandle& index_handle() const { return index_handle_; }
+
+  // Checksum type used in the file.
+  ChecksumType checksum_type() const {
+    return static_cast<ChecksumType>(checksum_type_);
+  }
+
+  // Block trailer size used by file with this footer (e.g. 5 for block-based
+  // table and 0 for plain table). This is inferred from magic number so
+  // not in the serialized form.
+  inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }
+
+  // Convert this object to a human readable form
+  std::string ToString() const;
+
+  // Encoded lengths of Footers. Bytes for serialized Footer will always be
+  // >= kMinEncodedLength and <= kMaxEncodedLength.
+  //
+  // Footer version 0 (legacy) will always occupy exactly this many bytes.
+  // It consists of two block handles, padding, and a magic number.
+  static constexpr uint32_t kVersion0EncodedLength =
+      2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte;
+  static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength;
+
+  // Footer of versions 1 and higher will always occupy exactly this many
+  // bytes. It originally consisted of the checksum type, two block handles,
+  // padding (to maximum handle encoding size), a format version number, and a
+  // magic number.
+  static constexpr uint32_t kNewVersionsEncodedLength =
+      1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte;
+  static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength;
+
+  static constexpr uint64_t kNullTableMagicNumber = 0;
+
+  static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU;
+
+ private:
+  static constexpr int kInvalidChecksumType =
+      (1 << (sizeof(ChecksumType) * 8)) | kNoChecksum;
+
+  uint64_t table_magic_number_ = kNullTableMagicNumber;
+  uint32_t format_version_ = kInvalidFormatVersion;
+  BlockHandle metaindex_handle_;
+  BlockHandle index_handle_;
+  int checksum_type_ = kInvalidChecksumType;
+  uint8_t block_trailer_size_ = 0;
+};
+
+// Builder for Footer
+class FooterBuilder {
+ public:
+  // Run builder in inputs. This is a single step with lots of parameters for
+  // efficiency (based on perf testing).
+  // * table_magic_number identifies file as RocksDB SST file and which kind of
+  // SST format is use.
+  // * format_version is a version for the footer and can also apply to other
+  // aspects of the SST file (see BlockBasedTableOptions::format_version).
+  // NOTE: To save complexity in the caller, when format_version == 0 and
+  // there is a corresponding legacy magic number to the one specified, the
+  // legacy magic number will be written for forward compatibility.
+  // * footer_offset is the file offset where the footer will be written
+  // (for future use).
+  // * checksum_type is for formats using block checksums.
+  // * index_handle is optional for some kinds of SST files.
+  void Build(uint64_t table_magic_number, uint32_t format_version,
+             uint64_t footer_offset, ChecksumType checksum_type,
+             const BlockHandle& metaindex_handle,
+             const BlockHandle& index_handle = BlockHandle::NullBlockHandle());
+
+  // After Builder, get a Slice for the serialized Footer, backed by this
+  // FooterBuilder.
+  const Slice& GetSlice() const {
+    assert(slice_.size());
+    return slice_;
+  }
+
+ private:
+  Slice slice_;
+  std::array<char, Footer::kMaxEncodedLength> data_;
+};
+
+// Read the footer from file
+// If enforce_table_magic_number != 0, ReadFooterFromFile() will return
+// corruption if table_magic number is not equal to enforce_table_magic_number
+Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
+                          FilePrefetchBuffer* prefetch_buffer,
+                          uint64_t file_size, Footer* footer,
+                          uint64_t enforce_table_magic_number = 0);
+
+// Computes a checksum using the given ChecksumType. Sometimes we need to
+// include one more input byte logically at the end but not part of the main
+// data buffer. If data_size >= 1, then
+// ComputeBuiltinChecksum(type, data, size)
+// ==
+// ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1])
+uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
+                                size_t size);
+uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
+                                            size_t size, char last_byte);
+
+// Represents the contents of a block read from an SST file. Depending on how
+// it's created, it may or may not own the actual block bytes. As an example,
+// BlockContents objects representing data read from mmapped files only point
+// into the mmapped region. Depending on context, it might be a serialized
+// (potentially compressed) block, including a trailer beyond `size`, or an
+// uncompressed block.
+//
+// Please try to use this terminology when dealing with blocks:
+// * "Serialized block" - bytes that go into storage. For block-based table
+// (usually the case) this includes the block trailer. Here the `size` does
+// not include the trailer, but other places in code might include the trailer
+// in the size.
+// * "Maybe compressed block" - like a serialized block, but without the
+// trailer (or no promise of including a trailer). Must be accompanied by a
+// CompressionType in some other variable or field.
+// * "Uncompressed block" - "payload" bytes that are either stored with no
+// compression, used as input to compression function, or result of
+// decompression function.
+// * "Parsed block" - an in-memory form of a block in block cache, as it is
+// used by the table reader. Different C++ types are used depending on the
+// block type (see block_like_traits.h). Only trivially parsable block types
+// use BlockContents as the parsed form.
+//
+struct BlockContents {
+  // Points to block payload (without trailer)
+  Slice data;
+  CacheAllocationPtr allocation;
+
+#ifndef NDEBUG
+  // Whether there is a known trailer after what is pointed to by `data`.
+  // See BlockBasedTable::GetCompressionType.
+  bool has_trailer = false;
+#endif  // NDEBUG
+
+  BlockContents() {}
+
+  // Does not take ownership of the underlying data bytes.
+  BlockContents(const Slice& _data) : data(_data) {}
+
+  // Takes ownership of the underlying data bytes.
+  BlockContents(CacheAllocationPtr&& _data, size_t _size)
+      : data(_data.get(), _size), allocation(std::move(_data)) {}
+
+  // Takes ownership of the underlying data bytes.
+  BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
+      : data(_data.get(), _size) {
+    allocation.reset(_data.release());
+  }
+
+  // Returns whether the object has ownership of the underlying data bytes.
+  bool own_bytes() const { return allocation.get() != nullptr; }
+
+  // The additional memory space taken by the block data.
+  size_t usable_size() const {
+    if (allocation.get() != nullptr) {
+      auto allocator = allocation.get_deleter().allocator;
+      if (allocator) {
+        return allocator->UsableSize(allocation.get(), data.size());
+      }
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      return malloc_usable_size(allocation.get());
+#else
+      return data.size();
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    } else {
+      return 0;  // no extra memory is occupied by the data
+    }
+  }
+
+  size_t ApproximateMemoryUsage() const {
+    return usable_size() + sizeof(*this);
+  }
+
+  BlockContents(BlockContents&& other) noexcept { *this = std::move(other); }
+
+  BlockContents& operator=(BlockContents&& other) {
+    data = std::move(other.data);
+    allocation = std::move(other.allocation);
+#ifndef NDEBUG
+    has_trailer = other.has_trailer;
+#endif  // NDEBUG
+    return *this;
+  }
+};
+
+// The `data` points to serialized block contents read in from file, which
+// must be compressed and include a trailer beyond `size`. A new buffer is
+// allocated with the given allocator (or default) and the uncompressed
+// contents are returned in `out_contents`.
+// format_version is as defined in include/rocksdb/table.h, which is
+// used to determine compression format version.
+Status UncompressSerializedBlock(const UncompressionInfo& info,
+                                 const char* data, size_t size,
+                                 BlockContents* out_contents,
+                                 uint32_t format_version,
+                                 const ImmutableOptions& ioptions,
+                                 MemoryAllocator* allocator = nullptr);
+
+// This is a variant of UncompressSerializedBlock that does not expect a
+// block trailer beyond `size`. (CompressionType is taken from `info`.)
+Status UncompressBlockData(const UncompressionInfo& info, const char* data,
+                           size_t size, BlockContents* out_contents,
+                           uint32_t format_version,
+                           const ImmutableOptions& ioptions,
+                           MemoryAllocator* allocator = nullptr);
+
+// Replace db_host_id contents with the real hostname if necessary
+Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id);
+
+// Implementation details follow.  Clients should ignore,
+
+// TODO(andrewkr): we should prefer one way of representing a null/uninitialized
+// BlockHandle. Currently we use zeros for null and use negation-of-zeros for
+// uninitialized.
+inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {}
+
+inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
+    : offset_(_offset), size_(_size) {}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/get_context.cc b/src/rocksdb/table/get_context.cc
new file mode 100644
index 000000000..69e752714
--- /dev/null
+++ b/src/rocksdb/table/get_context.cc
@@ -0,0 +1,604 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/get_context.h"
+
+#include "db/blob//blob_fetcher.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/read_callback.h"
+#include "db/wide/wide_column_serialization.h"
+#include "monitoring/file_read_sample.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) {
+#ifndef ROCKSDB_LITE
+  if (replay_log) {
+    if (replay_log->empty()) {
+      // Optimization: in the common case of only one operation in the
+      // log, we allocate the exact amount of space needed.
+      replay_log->reserve(1 + VarintLength(value.size()) + value.size());
+    }
+    replay_log->push_back(type);
+    PutLengthPrefixedSlice(replay_log, value);
+  }
+#else
+  (void)replay_log;
+  (void)type;
+  (void)value;
+#endif  // ROCKSDB_LITE
+}
+
+}  // namespace
+
+GetContext::GetContext(
+    const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger,
+    Statistics* statistics, GetState init_state, const Slice& user_key,
+    PinnableSlice* pinnable_val, PinnableWideColumns* columns,
+    std::string* timestamp, bool* value_found, MergeContext* merge_context,
+    bool do_merge, SequenceNumber* _max_covering_tombstone_seq,
+    SystemClock* clock, SequenceNumber* seq,
+    PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback,
+    bool* is_blob_index, uint64_t tracing_get_id, BlobFetcher* blob_fetcher)
+    : ucmp_(ucmp),
+      merge_operator_(merge_operator),
+      logger_(logger),
+      statistics_(statistics),
+      state_(init_state),
+      user_key_(user_key),
+      pinnable_val_(pinnable_val),
+      columns_(columns),
+      timestamp_(timestamp),
+      value_found_(value_found),
+      merge_context_(merge_context),
+      max_covering_tombstone_seq_(_max_covering_tombstone_seq),
+      clock_(clock),
+      seq_(seq),
+      replay_log_(nullptr),
+      pinned_iters_mgr_(_pinned_iters_mgr),
+      callback_(callback),
+      do_merge_(do_merge),
+      is_blob_index_(is_blob_index),
+      tracing_get_id_(tracing_get_id),
+      blob_fetcher_(blob_fetcher) {
+  if (seq_) {
+    *seq_ = kMaxSequenceNumber;
+  }
+  sample_ = should_sample_file_read();
+}
+
+GetContext::GetContext(const Comparator* ucmp,
+                       const MergeOperator* merge_operator, Logger* logger,
+                       Statistics* statistics, GetState init_state,
+                       const Slice& user_key, PinnableSlice* pinnable_val,
+                       PinnableWideColumns* columns, bool* value_found,
+                       MergeContext* merge_context, bool do_merge,
+                       SequenceNumber* _max_covering_tombstone_seq,
+                       SystemClock* clock, SequenceNumber* seq,
+                       PinnedIteratorsManager* _pinned_iters_mgr,
+                       ReadCallback* callback, bool* is_blob_index,
+                       uint64_t tracing_get_id, BlobFetcher* blob_fetcher)
+    : GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key,
+                 pinnable_val, columns, /*timestamp=*/nullptr, value_found,
+                 merge_context, do_merge, _max_covering_tombstone_seq, clock,
+                 seq, _pinned_iters_mgr, callback, is_blob_index,
+                 tracing_get_id, blob_fetcher) {}
+
+// Called from TableCache::Get and Table::Get when file/block in which
+// key may exist are not there in TableCache/BlockCache respectively. In this
+// case we can't guarantee that key does not exist and are not permitted to do
+// IO to be certain.Set the status=kFound and value_found=false to let the
+// caller know that key may exist but is not there in memory
+void GetContext::MarkKeyMayExist() {
+  state_ = kFound;
+  if (value_found_ != nullptr) {
+    *value_found_ = false;
+  }
+}
+
+void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) {
+  assert(state_ == kNotFound);
+  appendToReplayLog(replay_log_, kTypeValue, value);
+
+  state_ = kFound;
+  if (LIKELY(pinnable_val_ != nullptr)) {
+    pinnable_val_->PinSelf(value);
+  }
+}
+
+void GetContext::ReportCounters() {
+  if (get_context_stats_.num_cache_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit);
+  }
+  if (get_context_stats_.num_cache_index_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT,
+               get_context_stats_.num_cache_index_hit);
+  }
+  if (get_context_stats_.num_cache_data_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_DATA_HIT,
+               get_context_stats_.num_cache_data_hit);
+  }
+  if (get_context_stats_.num_cache_filter_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT,
+               get_context_stats_.num_cache_filter_hit);
+  }
+  if (get_context_stats_.num_cache_compression_dict_hit > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT,
+               get_context_stats_.num_cache_compression_dict_hit);
+  }
+  if (get_context_stats_.num_cache_index_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS,
+               get_context_stats_.num_cache_index_miss);
+  }
+  if (get_context_stats_.num_cache_filter_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS,
+               get_context_stats_.num_cache_filter_miss);
+  }
+  if (get_context_stats_.num_cache_data_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_DATA_MISS,
+               get_context_stats_.num_cache_data_miss);
+  }
+  if (get_context_stats_.num_cache_compression_dict_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS,
+               get_context_stats_.num_cache_compression_dict_miss);
+  }
+  if (get_context_stats_.num_cache_bytes_read > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_BYTES_READ,
+               get_context_stats_.num_cache_bytes_read);
+  }
+  if (get_context_stats_.num_cache_miss > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_MISS,
+               get_context_stats_.num_cache_miss);
+  }
+  if (get_context_stats_.num_cache_add > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add);
+  }
+  if (get_context_stats_.num_cache_add_redundant > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_ADD_REDUNDANT,
+               get_context_stats_.num_cache_add_redundant);
+  }
+  if (get_context_stats_.num_cache_bytes_write > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE,
+               get_context_stats_.num_cache_bytes_write);
+  }
+  if (get_context_stats_.num_cache_index_add > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD,
+               get_context_stats_.num_cache_index_add);
+  }
+  if (get_context_stats_.num_cache_index_add_redundant > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD_REDUNDANT,
+               get_context_stats_.num_cache_index_add_redundant);
+  }
+  if (get_context_stats_.num_cache_index_bytes_insert > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT,
+               get_context_stats_.num_cache_index_bytes_insert);
+  }
+  if (get_context_stats_.num_cache_data_add > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_DATA_ADD,
+               get_context_stats_.num_cache_data_add);
+  }
+  if (get_context_stats_.num_cache_data_add_redundant > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_DATA_ADD_REDUNDANT,
+               get_context_stats_.num_cache_data_add_redundant);
+  }
+  if (get_context_stats_.num_cache_data_bytes_insert > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT,
+               get_context_stats_.num_cache_data_bytes_insert);
+  }
+  if (get_context_stats_.num_cache_filter_add > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD,
+               get_context_stats_.num_cache_filter_add);
+  }
+  if (get_context_stats_.num_cache_filter_add_redundant > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD_REDUNDANT,
+               get_context_stats_.num_cache_filter_add_redundant);
+  }
+  if (get_context_stats_.num_cache_filter_bytes_insert > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT,
+               get_context_stats_.num_cache_filter_bytes_insert);
+  }
+  if (get_context_stats_.num_cache_compression_dict_add > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD,
+               get_context_stats_.num_cache_compression_dict_add);
+  }
+  if (get_context_stats_.num_cache_compression_dict_add_redundant > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT,
+               get_context_stats_.num_cache_compression_dict_add_redundant);
+  }
+  if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) {
+    RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
+               get_context_stats_.num_cache_compression_dict_bytes_insert);
+  }
+}
+
+bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
+                           const Slice& value, bool* matched,
+                           Cleanable* value_pinner) {
+  assert(matched);
+  assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
+         merge_context_ != nullptr);
+  if (ucmp_->EqualWithoutTimestamp(parsed_key.user_key, user_key_)) {
+    *matched = true;
+    // If the value is not in the snapshot, skip it
+    if (!CheckCallback(parsed_key.sequence)) {
+      return true;  // to continue to the next seq
+    }
+
+    appendToReplayLog(replay_log_, parsed_key.type, value);
+
+    if (seq_ != nullptr) {
+      // Set the sequence number if it is uninitialized
+      if (*seq_ == kMaxSequenceNumber) {
+        *seq_ = parsed_key.sequence;
+      }
+      if (max_covering_tombstone_seq_) {
+        *seq_ = std::max(*seq_, *max_covering_tombstone_seq_);
+      }
+    }
+
+    size_t ts_sz = ucmp_->timestamp_size();
+    if (ts_sz > 0 && timestamp_ != nullptr) {
+      if (!timestamp_->empty()) {
+        assert(ts_sz == timestamp_->size());
+        // `timestamp` can be set before `SaveValue` is ever called
+        // when max_covering_tombstone_seq_ was set.
+        // If this key has a higher sequence number than range tombstone,
+        // then timestamp should be updated. `ts_from_rangetombstone_` is
+        // set to false afterwards so that only the key with highest seqno
+        // updates the timestamp.
+        if (ts_from_rangetombstone_) {
+          assert(max_covering_tombstone_seq_);
+          if (parsed_key.sequence > *max_covering_tombstone_seq_) {
+            Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz);
+            timestamp_->assign(ts.data(), ts.size());
+            ts_from_rangetombstone_ = false;
+          }
+        }
+      }
+      // TODO optimize for small size ts
+      const std::string kMaxTs(ts_sz, '\xff');
+      if (timestamp_->empty() ||
+          ucmp_->CompareTimestamp(*timestamp_, kMaxTs) == 0) {
+        Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz);
+        timestamp_->assign(ts.data(), ts.size());
+      }
+    }
+
+    auto type = parsed_key.type;
+    // Key matches. Process it
+    if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex ||
+         type == kTypeWideColumnEntity || type == kTypeDeletion ||
+         type == kTypeDeletionWithTimestamp || type == kTypeSingleDeletion) &&
+        max_covering_tombstone_seq_ != nullptr &&
+        *max_covering_tombstone_seq_ > parsed_key.sequence) {
+      // Note that deletion types are also considered, this is for the case
+      // when we need to return timestamp to user. If a range tombstone has a
+      // higher seqno than point tombstone, its timestamp should be returned.
+      type = kTypeRangeDeletion;
+    }
+    switch (type) {
+      case kTypeValue:
+      case kTypeBlobIndex:
+      case kTypeWideColumnEntity:
+        assert(state_ == kNotFound || state_ == kMerge);
+        if (type == kTypeBlobIndex) {
+          if (is_blob_index_ == nullptr) {
+            // Blob value not supported. Stop.
+            state_ = kUnexpectedBlobIndex;
+            return false;
+          }
+        }
+
+        if (is_blob_index_ != nullptr) {
+          *is_blob_index_ = (type == kTypeBlobIndex);
+        }
+
+        if (kNotFound == state_) {
+          state_ = kFound;
+          if (do_merge_) {
+            if (LIKELY(pinnable_val_ != nullptr)) {
+              Slice value_to_use = value;
+
+              if (type == kTypeWideColumnEntity) {
+                Slice value_copy = value;
+
+                if (!WideColumnSerialization::GetValueOfDefaultColumn(
+                         value_copy, value_to_use)
+                         .ok()) {
+                  state_ = kCorrupt;
+                  return false;
+                }
+              }
+
+              if (LIKELY(value_pinner != nullptr)) {
+                // If the backing resources for the value are provided, pin them
+                pinnable_val_->PinSlice(value_to_use, value_pinner);
+              } else {
+                TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf",
+                                         this);
+                // Otherwise copy the value
+                pinnable_val_->PinSelf(value_to_use);
+              }
+            } else if (columns_ != nullptr) {
+              if (type == kTypeWideColumnEntity) {
+                if (!columns_->SetWideColumnValue(value, value_pinner).ok()) {
+                  state_ = kCorrupt;
+                  return false;
+                }
+              } else {
+                columns_->SetPlainValue(value, value_pinner);
+              }
+            }
+          } else {
+            // It means this function is called as part of DB GetMergeOperands
+            // API and the current value should be part of
+            // merge_context_->operand_list
+            if (type == kTypeBlobIndex) {
+              PinnableSlice pin_val;
+              if (GetBlobValue(value, &pin_val) == false) {
+                return false;
+              }
+              Slice blob_value(pin_val);
+              push_operand(blob_value, nullptr);
+            } else if (type == kTypeWideColumnEntity) {
+              Slice value_copy = value;
+              Slice value_of_default;
+
+              if (!WideColumnSerialization::GetValueOfDefaultColumn(
+                       value_copy, value_of_default)
+                       .ok()) {
+                state_ = kCorrupt;
+                return false;
+              }
+
+              push_operand(value_of_default, value_pinner);
+            } else {
+              assert(type == kTypeValue);
+              push_operand(value, value_pinner);
+            }
+          }
+        } else if (kMerge == state_) {
+          assert(merge_operator_ != nullptr);
+          if (type == kTypeBlobIndex) {
+            PinnableSlice pin_val;
+            if (GetBlobValue(value, &pin_val) == false) {
+              return false;
+            }
+            Slice blob_value(pin_val);
+            state_ = kFound;
+            if (do_merge_) {
+              Merge(&blob_value);
+            } else {
+              // It means this function is called as part of DB GetMergeOperands
+              // API and the current value should be part of
+              // merge_context_->operand_list
+              push_operand(blob_value, nullptr);
+            }
+          } else if (type == kTypeWideColumnEntity) {
+            state_ = kFound;
+
+            if (do_merge_) {
+              MergeWithEntity(value);
+            } else {
+              // It means this function is called as part of DB GetMergeOperands
+              // API and the current value should be part of
+              // merge_context_->operand_list
+              Slice value_copy = value;
+              Slice value_of_default;
+
+              if (!WideColumnSerialization::GetValueOfDefaultColumn(
+                       value_copy, value_of_default)
+                       .ok()) {
+                state_ = kCorrupt;
+                return false;
+              }
+
+              push_operand(value_of_default, value_pinner);
+            }
+          } else {
+            assert(type == kTypeValue);
+
+            state_ = kFound;
+            if (do_merge_) {
+              Merge(&value);
+            } else {
+              // It means this function is called as part of DB GetMergeOperands
+              // API and the current value should be part of
+              // merge_context_->operand_list
+              push_operand(value, value_pinner);
+            }
+          }
+        }
+        return false;
+
+      case kTypeDeletion:
+      case kTypeDeletionWithTimestamp:
+      case kTypeSingleDeletion:
+      case kTypeRangeDeletion:
+        // TODO(noetzli): Verify correctness once merge of single-deletes
+        // is supported
+        assert(state_ == kNotFound || state_ == kMerge);
+        if (kNotFound == state_) {
+          state_ = kDeleted;
+        } else if (kMerge == state_) {
+          state_ = kFound;
+          if (do_merge_) {
+            Merge(nullptr);
+          }
+          // If do_merge_ = false then the current value shouldn't be part of
+          // merge_context_->operand_list
+        }
+        return false;
+
+      case kTypeMerge:
+        assert(state_ == kNotFound || state_ == kMerge);
+        state_ = kMerge;
+        // value_pinner is not set from plain_table_reader.cc for example.
+        push_operand(value, value_pinner);
+        if (do_merge_ && merge_operator_ != nullptr &&
+            merge_operator_->ShouldMerge(
+                merge_context_->GetOperandsDirectionBackward())) {
+          state_ = kFound;
+          Merge(nullptr);
+          return false;
+        }
+        return true;
+
+      default:
+        assert(false);
+        break;
+    }
+  }
+
+  // state_ could be Corrupt, merge or notfound
+  return false;
+}
+
+void GetContext::Merge(const Slice* value) {
+  assert(do_merge_);
+  assert(!pinnable_val_ || !columns_);
+
+  std::string result;
+  const Status s = MergeHelper::TimedFullMerge(
+      merge_operator_, user_key_, value, merge_context_->GetOperands(), &result,
+      logger_, statistics_, clock_, /* result_operand */ nullptr,
+      /* update_num_ops_stats */ true);
+  if (!s.ok()) {
+    state_ = kCorrupt;
+    return;
+  }
+
+  if (LIKELY(pinnable_val_ != nullptr)) {
+    *(pinnable_val_->GetSelf()) = std::move(result);
+    pinnable_val_->PinSelf();
+    return;
+  }
+
+  assert(columns_);
+  columns_->SetPlainValue(result);
+}
+
+void GetContext::MergeWithEntity(Slice entity) {
+  assert(do_merge_);
+  assert(!pinnable_val_ || !columns_);
+
+  if (LIKELY(pinnable_val_ != nullptr)) {
+    Slice value_of_default;
+
+    {
+      const Status s = WideColumnSerialization::GetValueOfDefaultColumn(
+          entity, value_of_default);
+      if (!s.ok()) {
+        state_ = kCorrupt;
+        return;
+      }
+    }
+
+    {
+      const Status s = MergeHelper::TimedFullMerge(
+          merge_operator_, user_key_, &value_of_default,
+          merge_context_->GetOperands(), pinnable_val_->GetSelf(), logger_,
+          statistics_, clock_, /* result_operand */ nullptr,
+          /* update_num_ops_stats */ true);
+      if (!s.ok()) {
+        state_ = kCorrupt;
+        return;
+      }
+    }
+
+    pinnable_val_->PinSelf();
+    return;
+  }
+
+  std::string result;
+
+  {
+    const Status s = MergeHelper::TimedFullMergeWithEntity(
+        merge_operator_, user_key_, entity, merge_context_->GetOperands(),
+        &result, logger_, statistics_, clock_, /* update_num_ops_stats */ true);
+    if (!s.ok()) {
+      state_ = kCorrupt;
+      return;
+    }
+  }
+
+  {
+    assert(columns_);
+    const Status s = columns_->SetWideColumnValue(result);
+    if (!s.ok()) {
+      state_ = kCorrupt;
+      return;
+    }
+  }
+}
+
+bool GetContext::GetBlobValue(const Slice& blob_index,
+                              PinnableSlice* blob_value) {
+  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+  constexpr uint64_t* bytes_read = nullptr;
+
+  Status status = blob_fetcher_->FetchBlob(
+      user_key_, blob_index, prefetch_buffer, blob_value, bytes_read);
+  if (!status.ok()) {
+    if (status.IsIncomplete()) {
+      // FIXME: this code is not covered by unit tests
+      MarkKeyMayExist();
+      return false;
+    }
+    state_ = kCorrupt;
+    return false;
+  }
+  *is_blob_index_ = false;
+  return true;
+}
+
+void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) {
+  // TODO(yanqin) preserve timestamps information in merge_context
+  if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() &&
+      value_pinner != nullptr) {
+    value_pinner->DelegateCleanupsTo(pinned_iters_mgr());
+    merge_context_->PushOperand(value, true /*value_pinned*/);
+  } else {
+    merge_context_->PushOperand(value, false);
+  }
+}
+
+void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
+                         GetContext* get_context, Cleanable* value_pinner) {
+#ifndef ROCKSDB_LITE
+  Slice s = replay_log;
+  while (s.size()) {
+    auto type = static_cast<ValueType>(*s.data());
+    s.remove_prefix(1);
+    Slice value;
+    bool ret = GetLengthPrefixedSlice(&s, &value);
+    assert(ret);
+    (void)ret;
+
+    bool dont_care __attribute__((__unused__));
+    // Since SequenceNumber is not stored and unknown, we will use
+    // kMaxSequenceNumber.
+    get_context->SaveValue(
+        ParsedInternalKey(user_key, kMaxSequenceNumber, type), value,
+        &dont_care, value_pinner);
+  }
+#else   // ROCKSDB_LITE
+  (void)replay_log;
+  (void)user_key;
+  (void)get_context;
+  (void)value_pinner;
+  assert(false);
+#endif  // ROCKSDB_LITE
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/get_context.h b/src/rocksdb/table/get_context.h
new file mode 100644
index 000000000..dcc7ab8d6
--- /dev/null
+++ b/src/rocksdb/table/get_context.h
@@ -0,0 +1,231 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+
+#include "db/read_callback.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+class BlobFetcher;
+class Comparator;
+class Logger;
+class MergeContext;
+class MergeOperator;
+class PinnableWideColumns;
+class PinnedIteratorsManager;
+class Statistics;
+class SystemClock;
+struct ParsedInternalKey;
+
+// Data structure for accumulating statistics during a point lookup. At the
+// end of the point lookup, the corresponding ticker stats are updated. This
+// avoids the overhead of frequent ticker stats updates
+struct GetContextStats {
+  uint64_t num_cache_hit = 0;
+  uint64_t num_cache_index_hit = 0;
+  uint64_t num_cache_data_hit = 0;
+  uint64_t num_cache_filter_hit = 0;
+  uint64_t num_cache_compression_dict_hit = 0;
+  uint64_t num_cache_index_miss = 0;
+  uint64_t num_cache_filter_miss = 0;
+  uint64_t num_cache_data_miss = 0;
+  uint64_t num_cache_compression_dict_miss = 0;
+  uint64_t num_cache_bytes_read = 0;
+  uint64_t num_cache_miss = 0;
+  uint64_t num_cache_add = 0;
+  uint64_t num_cache_add_redundant = 0;
+  uint64_t num_cache_bytes_write = 0;
+  uint64_t num_cache_index_add = 0;
+  uint64_t num_cache_index_add_redundant = 0;
+  uint64_t num_cache_index_bytes_insert = 0;
+  uint64_t num_cache_data_add = 0;
+  uint64_t num_cache_data_add_redundant = 0;
+  uint64_t num_cache_data_bytes_insert = 0;
+  uint64_t num_cache_filter_add = 0;
+  uint64_t num_cache_filter_add_redundant = 0;
+  uint64_t num_cache_filter_bytes_insert = 0;
+  uint64_t num_cache_compression_dict_add = 0;
+  uint64_t num_cache_compression_dict_add_redundant = 0;
+  uint64_t num_cache_compression_dict_bytes_insert = 0;
+  // MultiGet stats.
+  uint64_t num_filter_read = 0;
+  uint64_t num_index_read = 0;
+  uint64_t num_sst_read = 0;
+};
+
+// A class to hold context about a point lookup, such as pointer to value
+// slice, key, merge context etc, as well as the current state of the
+// lookup. Any user using GetContext to track the lookup result must call
+// SaveValue() whenever the internal key is found. This can happen
+// repeatedly in case of merge operands. In case the key may exist with
+// high probability, but IO is required to confirm and the user doesn't allow
+// it, MarkKeyMayExist() must be called instead of SaveValue().
+class GetContext {
+ public:
+  // Current state of the point lookup. All except kNotFound and kMerge are
+  // terminal states
+  enum GetState {
+    kNotFound,
+    kFound,
+    kDeleted,
+    kCorrupt,
+    kMerge,  // saver contains the current merge result (the operands)
+    kUnexpectedBlobIndex,
+  };
+  GetContextStats get_context_stats_;
+
+  // Constructor
+  // @param value Holds the value corresponding to user_key. If its nullptr
+  //              then return all merge operands corresponding to user_key
+  //              via merge_context
+  // @param value_found If non-nullptr, set to false if key may be present
+  //                    but we can't be certain because we cannot do IO
+  // @param max_covering_tombstone_seq Pointer to highest sequence number of
+  //                    range deletion covering the key. When an internal key
+  //                    is found with smaller sequence number, the lookup
+  //                    terminates
+  // @param seq If non-nullptr, the sequence number of the found key will be
+  //            saved here
+  // @param callback Pointer to ReadCallback to perform additional checks
+  //                 for visibility of a key
+  // @param is_blob_index If non-nullptr, will be used to indicate if a found
+  //                      key is of type blob index
+  // @param do_merge True if value associated with user_key has to be returned
+  // and false if all the merge operands associated with user_key has to be
+  // returned. Id do_merge=false then all the merge operands are stored in
+  // merge_context and they are never merged. The value pointer is untouched.
+  GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
+             Logger* logger, Statistics* statistics, GetState init_state,
+             const Slice& user_key, PinnableSlice* value,
+             PinnableWideColumns* columns, bool* value_found,
+             MergeContext* merge_context, bool do_merge,
+             SequenceNumber* max_covering_tombstone_seq, SystemClock* clock,
+             SequenceNumber* seq = nullptr,
+             PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
+             ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+             uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr);
+  GetContext(const Comparator* ucmp, const MergeOperator* merge_operator,
+             Logger* logger, Statistics* statistics, GetState init_state,
+             const Slice& user_key, PinnableSlice* value,
+             PinnableWideColumns* columns, std::string* timestamp,
+             bool* value_found, MergeContext* merge_context, bool do_merge,
+             SequenceNumber* max_covering_tombstone_seq, SystemClock* clock,
+             SequenceNumber* seq = nullptr,
+             PinnedIteratorsManager* _pinned_iters_mgr = nullptr,
+             ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+             uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr);
+
+  GetContext() = delete;
+
+  // This can be called to indicate that a key may be present, but cannot be
+  // confirmed due to IO not allowed
+  void MarkKeyMayExist();
+
+  // Records this key, value, and any meta-data (such as sequence number and
+  // state) into this GetContext.
+  //
+  // If the parsed_key matches the user key that we are looking for, sets
+  // matched to true.
+  //
+  // Returns True if more keys need to be read (due to merges) or
+  //         False if the complete value has been found.
+  bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value,
+                 bool* matched, Cleanable* value_pinner = nullptr);
+
+  // Simplified version of the previous function. Should only be used when we
+  // know that the operation is a Put.
+  void SaveValue(const Slice& value, SequenceNumber seq);
+
+  GetState State() const { return state_; }
+
+  SequenceNumber* max_covering_tombstone_seq() {
+    return max_covering_tombstone_seq_;
+  }
+
+  bool NeedTimestamp() { return timestamp_ != nullptr; }
+
+  void SetTimestampFromRangeTombstone(const Slice& timestamp) {
+    assert(timestamp_);
+    timestamp_->assign(timestamp.data(), timestamp.size());
+    ts_from_rangetombstone_ = true;
+  }
+
+  PinnedIteratorsManager* pinned_iters_mgr() { return pinned_iters_mgr_; }
+
+  // If a non-null string is passed, all the SaveValue calls will be
+  // logged into the string. The operations can then be replayed on
+  // another GetContext with replayGetContextLog.
+  void SetReplayLog(std::string* replay_log) { replay_log_ = replay_log; }
+
+  // Do we need to fetch the SequenceNumber for this key?
+  bool NeedToReadSequence() const { return (seq_ != nullptr); }
+
+  bool sample() const { return sample_; }
+
+  bool CheckCallback(SequenceNumber seq) {
+    if (callback_) {
+      return callback_->IsVisible(seq);
+    }
+    return true;
+  }
+
+  void ReportCounters();
+
+  bool has_callback() const { return callback_ != nullptr; }
+
+  uint64_t get_tracing_get_id() const { return tracing_get_id_; }
+
+  void push_operand(const Slice& value, Cleanable* value_pinner);
+
+ private:
+  void Merge(const Slice* value);
+  void MergeWithEntity(Slice entity);
+  bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value);
+
+  const Comparator* ucmp_;
+  const MergeOperator* merge_operator_;
+  // the merge operations encountered;
+  Logger* logger_;
+  Statistics* statistics_;
+
+  GetState state_;
+  Slice user_key_;
+  PinnableSlice* pinnable_val_;
+  PinnableWideColumns* columns_;
+  std::string* timestamp_;
+  bool ts_from_rangetombstone_{false};
+  bool* value_found_;  // Is value set correctly? Used by KeyMayExist
+  MergeContext* merge_context_;
+  SequenceNumber* max_covering_tombstone_seq_;
+  SystemClock* clock_;
+  // If a key is found, seq_ will be set to the SequenceNumber of most recent
+  // write to the key or kMaxSequenceNumber if unknown
+  SequenceNumber* seq_;
+  std::string* replay_log_;
+  // Used to temporarily pin blocks when state_ == GetContext::kMerge
+  PinnedIteratorsManager* pinned_iters_mgr_;
+  ReadCallback* callback_;
+  bool sample_;
+  // Value is true if it's called as part of DB Get API and false if it's
+  // called as part of DB GetMergeOperands API. When it's false merge operators
+  // are never merged.
+  bool do_merge_;
+  bool* is_blob_index_;
+  // Used for block cache tracing only. A tracing get id uniquely identifies a
+  // Get or a MultiGet.
+  const uint64_t tracing_get_id_;
+  BlobFetcher* blob_fetcher_;
+};
+
+// Call this to replay a log and bring the get_context up to date. The replay
+// log must have been created by another GetContext object, whose replay log
+// must have been set by calling GetContext::SetReplayLog().
+void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
+                         GetContext* get_context,
+                         Cleanable* value_pinner = nullptr);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/internal_iterator.h b/src/rocksdb/table/internal_iterator.h
new file mode 100644
index 000000000..945dec806
--- /dev/null
+++ b/src/rocksdb/table/internal_iterator.h
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/readahead_file_info.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/status.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PinnedIteratorsManager;
+
+enum class IterBoundCheck : char {
+  kUnknown = 0,
+  kOutOfBound,
+  kInbound,
+};
+
+struct IterateResult {
+  Slice key;
+  IterBoundCheck bound_check_result = IterBoundCheck::kUnknown;
+  // If false, PrepareValue() needs to be called before value().
+  bool value_prepared = true;
+};
+
+template <class TValue>
+class InternalIteratorBase : public Cleanable {
+ public:
+  InternalIteratorBase() {}
+
+  // No copying allowed
+  InternalIteratorBase(const InternalIteratorBase&) = delete;
+  InternalIteratorBase& operator=(const InternalIteratorBase&) = delete;
+
+  virtual ~InternalIteratorBase() {}
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  // Always returns false if !status().ok().
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that at or past target
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  // All Seek*() methods clear any error status() that the iterator had prior to
+  // the call; after the seek, status() indicates only the error (if any) that
+  // happened during the seek, not any past errors.
+  // 'target' contains user timestamp if timestamp is enabled.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Position at the first key in the source that at or before target
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or before target.
+  virtual void SeekForPrev(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the next entry in the source, and return result. Iterator
+  // implementation should override this method to help methods inline better,
+  // or when UpperBoundCheckResult() is non-trivial.
+  // REQUIRES: Valid()
+  virtual bool NextAndGetResult(IterateResult* result) {
+    Next();
+    bool is_valid = Valid();
+    if (is_valid) {
+      result->key = key();
+      // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual
+      // call. If an implementation has non-trivial UpperBoundCheckResult(),
+      // it should also override NextAndGetResult().
+      result->bound_check_result = IterBoundCheck::kUnknown;
+      result->value_prepared = false;
+      assert(UpperBoundCheckResult() != IterBoundCheck::kOutOfBound);
+    }
+    return is_valid;
+  }
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return user key for the current entry.
+  // REQUIRES: Valid()
+  virtual Slice user_key() const { return ExtractUserKey(key()); }
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  // REQUIRES: PrepareValue() has been called if needed (see PrepareValue()).
+  virtual TValue value() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // For some types of iterators, sometimes Seek()/Next()/SeekForPrev()/etc may
+  // load key but not value (to avoid the IO cost of reading the value from disk
+  // if it won't be not needed). This method loads the value in such situation.
+  //
+  // Needs to be called before value() at least once after each iterator
+  // movement (except if IterateResult::value_prepared = true), for iterators
+  // created with allow_unprepared_value = true.
+  //
+  // Returns false if an error occurred; in this case Valid() is also changed
+  // to false, and status() is changed to non-ok.
+  // REQUIRES: Valid()
+  virtual bool PrepareValue() { return true; }
+
+  // Keys return from this iterator can be smaller than iterate_lower_bound.
+  virtual bool MayBeOutOfLowerBound() { return true; }
+
+  // If the iterator has checked the key against iterate_upper_bound, returns
+  // the result here. The function can be used by user of the iterator to skip
+  // their own checks. If Valid() = true, IterBoundCheck::kUnknown is always
+  // a valid value. If Valid() = false, IterBoundCheck::kOutOfBound indicates
+  // that the iterator is filtered out by upper bound checks.
+  virtual IterBoundCheck UpperBoundCheckResult() {
+    return IterBoundCheck::kUnknown;
+  }
+
+  // Pass the PinnedIteratorsManager to the Iterator, most Iterators don't
+  // communicate with PinnedIteratorsManager so default implementation is no-op
+  // but for Iterators that need to communicate with PinnedIteratorsManager
+  // they will implement this function and use the passed pointer to communicate
+  // with PinnedIteratorsManager.
+  virtual void SetPinnedItersMgr(PinnedIteratorsManager* /*pinned_iters_mgr*/) {
+  }
+
+  // If true, this means that the Slice returned by key() is valid as long as
+  // PinnedIteratorsManager::ReleasePinnedData is not called and the
+  // Iterator is not deleted.
+  //
+  // IsKeyPinned() is guaranteed to always return true if
+  //  - Iterator is created with ReadOptions::pin_data = true
+  //  - DB tables were created with BlockBasedTableOptions::use_delta_encoding
+  //    set to false.
+  virtual bool IsKeyPinned() const { return false; }
+
+  // If true, this means that the Slice returned by value() is valid as long as
+  // PinnedIteratorsManager::ReleasePinnedData is not called and the
+  // Iterator is not deleted.
+  // REQUIRES: Same as for value().
+  virtual bool IsValuePinned() const { return false; }
+
+  virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) {
+    return Status::NotSupported("");
+  }
+
+  // When iterator moves from one file to another file at same level, new file's
+  // readahead state (details of last block read) is updated with previous
+  // file's readahead state. This way internal readahead_size of Prefetch Buffer
+  // doesn't start from scratch and can fall back to 8KB with no prefetch if
+  // reads are not sequential.
+  //
+  // Default implementation is no-op and its implemented by iterators.
+  virtual void GetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {}
+
+  // Default implementation is no-op and its implemented by iterators.
+  virtual void SetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {}
+
+  // When used under merging iterator, LevelIterator treats file boundaries
+  // as sentinel keys to prevent it from moving to next SST file before range
+  // tombstones in the current SST file are no longer needed. This method makes
+  // it cheap to check if the current key is a sentinel key. This should only be
+  // used by MergingIterator and LevelIterator for now.
+  virtual bool IsDeleteRangeSentinelKey() const { return false; }
+
+ protected:
+  void SeekForPrevImpl(const Slice& target, const CompareInterface* cmp) {
+    Seek(target);
+    if (!Valid()) {
+      SeekToLast();
+    }
+    while (Valid() && cmp->Compare(target, key()) < 0) {
+      Prev();
+    }
+  }
+
+  bool is_mutable_;
+};
+
+using InternalIterator = InternalIteratorBase<Slice>;
+
+// Return an empty iterator (yields nothing).
+template <class TValue = Slice>
+extern InternalIteratorBase<TValue>* NewEmptyInternalIterator();
+
+// Return an empty iterator with the specified status.
+template <class TValue = Slice>
+extern InternalIteratorBase<TValue>* NewErrorInternalIterator(
+    const Status& status);
+
+// Return an empty iterator with the specified status, allocated arena.
+template <class TValue = Slice>
+extern InternalIteratorBase<TValue>* NewErrorInternalIterator(
+    const Status& status, Arena* arena);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/iter_heap.h b/src/rocksdb/table/iter_heap.h
new file mode 100644
index 000000000..6ad94be9b
--- /dev/null
+++ b/src/rocksdb/table/iter_heap.h
@@ -0,0 +1,44 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include "db/dbformat.h"
+#include "table/iterator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// When used with std::priority_queue, this comparison functor puts the
+// iterator with the max/largest key on top.
+class MaxIteratorComparator {
+ public:
+  MaxIteratorComparator(const InternalKeyComparator* comparator)
+      : comparator_(comparator) {}
+
+  bool operator()(IteratorWrapper* a, IteratorWrapper* b) const {
+    return comparator_->Compare(a->key(), b->key()) < 0;
+  }
+
+ private:
+  const InternalKeyComparator* comparator_;
+};
+
+// When used with std::priority_queue, this comparison functor puts the
+// iterator with the min/smallest key on top.
+class MinIteratorComparator {
+ public:
+  MinIteratorComparator(const InternalKeyComparator* comparator)
+      : comparator_(comparator) {}
+
+  bool operator()(IteratorWrapper* a, IteratorWrapper* b) const {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+
+ private:
+  const InternalKeyComparator* comparator_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/iterator.cc b/src/rocksdb/table/iterator.cc
new file mode 100644
index 000000000..14e280a07
--- /dev/null
+++ b/src/rocksdb/table/iterator.cc
@@ -0,0 +1,130 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/iterator.h"
+
+#include "memory/arena.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status Iterator::GetProperty(std::string prop_name, std::string* prop) {
+  if (prop == nullptr) {
+    return Status::InvalidArgument("prop is nullptr");
+  }
+  if (prop_name == "rocksdb.iterator.is-key-pinned") {
+    *prop = "0";
+    return Status::OK();
+  }
+  return Status::InvalidArgument("Unidentified property.");
+}
+
+namespace {
+class EmptyIterator : public Iterator {
+ public:
+  explicit EmptyIterator(const Status& s) : status_(s) {}
+  bool Valid() const override { return false; }
+  void Seek(const Slice& /*target*/) override {}
+  void SeekForPrev(const Slice& /*target*/) override {}
+  void SeekToFirst() override {}
+  void SeekToLast() override {}
+  void Next() override { assert(false); }
+  void Prev() override { assert(false); }
+  Slice key() const override {
+    assert(false);
+    return Slice();
+  }
+  Slice value() const override {
+    assert(false);
+    return Slice();
+  }
+  Status status() const override { return status_; }
+
+ private:
+  Status status_;
+};
+
+template <class TValue = Slice>
+class EmptyInternalIterator : public InternalIteratorBase<TValue> {
+ public:
+  explicit EmptyInternalIterator(const Status& s) : status_(s) {}
+  bool Valid() const override { return false; }
+  void Seek(const Slice& /*target*/) override {}
+  void SeekForPrev(const Slice& /*target*/) override {}
+  void SeekToFirst() override {}
+  void SeekToLast() override {}
+  void Next() override { assert(false); }
+  void Prev() override { assert(false); }
+  Slice key() const override {
+    assert(false);
+    return Slice();
+  }
+  TValue value() const override {
+    assert(false);
+    return TValue();
+  }
+  Status status() const override { return status_; }
+
+ private:
+  Status status_;
+};
+}  // namespace
+
+Iterator* NewEmptyIterator() { return new EmptyIterator(Status::OK()); }
+
+Iterator* NewErrorIterator(const Status& status) {
+  return new EmptyIterator(status);
+}
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status) {
+  return new EmptyInternalIterator<TValue>(status);
+}
+template InternalIteratorBase<IndexValue>* NewErrorInternalIterator(
+    const Status& status);
+template InternalIteratorBase<Slice>* NewErrorInternalIterator(
+    const Status& status);
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewErrorInternalIterator(const Status& status,
+                                                       Arena* arena) {
+  if (arena == nullptr) {
+    return NewErrorInternalIterator<TValue>(status);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>));
+    return new (mem) EmptyInternalIterator<TValue>(status);
+  }
+}
+template InternalIteratorBase<IndexValue>* NewErrorInternalIterator(
+    const Status& status, Arena* arena);
+template InternalIteratorBase<Slice>* NewErrorInternalIterator(
+    const Status& status, Arena* arena);
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewEmptyInternalIterator() {
+  return new EmptyInternalIterator<TValue>(Status::OK());
+}
+template InternalIteratorBase<IndexValue>* NewEmptyInternalIterator();
+template InternalIteratorBase<Slice>* NewEmptyInternalIterator();
+
+template <class TValue>
+InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena) {
+  if (arena == nullptr) {
+    return NewEmptyInternalIterator<TValue>();
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator<TValue>));
+    return new (mem) EmptyInternalIterator<TValue>(Status::OK());
+  }
+}
+template InternalIteratorBase<IndexValue>* NewEmptyInternalIterator(
+    Arena* arena);
+template InternalIteratorBase<Slice>* NewEmptyInternalIterator(Arena* arena);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/iterator_wrapper.h b/src/rocksdb/table/iterator_wrapper.h
new file mode 100644
index 000000000..17abef4ac
--- /dev/null
+++ b/src/rocksdb/table/iterator_wrapper.h
@@ -0,0 +1,190 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <set>
+
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A internal wrapper class with an interface similar to Iterator that caches
+// the valid() and key() results for an underlying iterator.
+// This can help avoid virtual function calls and also gives better
+// cache locality.
+template <class TValue = Slice>
+class IteratorWrapperBase {
+ public:
+  IteratorWrapperBase() : iter_(nullptr), valid_(false) {}
+  explicit IteratorWrapperBase(InternalIteratorBase<TValue>* _iter)
+      : iter_(nullptr) {
+    Set(_iter);
+  }
+  ~IteratorWrapperBase() {}
+  InternalIteratorBase<TValue>* iter() const { return iter_; }
+
+  // Set the underlying Iterator to _iter and return
+  // previous underlying Iterator.
+  InternalIteratorBase<TValue>* Set(InternalIteratorBase<TValue>* _iter) {
+    InternalIteratorBase<TValue>* old_iter = iter_;
+
+    iter_ = _iter;
+    if (iter_ == nullptr) {
+      valid_ = false;
+    } else {
+      Update();
+    }
+    return old_iter;
+  }
+
+  void DeleteIter(bool is_arena_mode) {
+    if (iter_) {
+      if (!is_arena_mode) {
+        delete iter_;
+      } else {
+        iter_->~InternalIteratorBase<TValue>();
+      }
+    }
+  }
+
+  // Iterator interface methods
+  bool Valid() const { return valid_; }
+  Slice key() const {
+    assert(Valid());
+    return result_.key;
+  }
+  TValue value() const {
+    assert(Valid());
+    return iter_->value();
+  }
+  // Methods below require iter() != nullptr
+  Status status() const {
+    assert(iter_);
+    return iter_->status();
+  }
+  bool PrepareValue() {
+    assert(Valid());
+    if (result_.value_prepared) {
+      return true;
+    }
+    if (iter_->PrepareValue()) {
+      result_.value_prepared = true;
+      return true;
+    }
+
+    assert(!iter_->Valid());
+    valid_ = false;
+    return false;
+  }
+  void Next() {
+    assert(iter_);
+    valid_ = iter_->NextAndGetResult(&result_);
+    assert(!valid_ || iter_->status().ok());
+  }
+  bool NextAndGetResult(IterateResult* result) {
+    assert(iter_);
+    valid_ = iter_->NextAndGetResult(&result_);
+    *result = result_;
+    assert(!valid_ || iter_->status().ok());
+    return valid_;
+  }
+  void Prev() {
+    assert(iter_);
+    iter_->Prev();
+    Update();
+  }
+  void Seek(const Slice& k) {
+    assert(iter_);
+    iter_->Seek(k);
+    Update();
+  }
+  void SeekForPrev(const Slice& k) {
+    assert(iter_);
+    iter_->SeekForPrev(k);
+    Update();
+  }
+  void SeekToFirst() {
+    assert(iter_);
+    iter_->SeekToFirst();
+    Update();
+  }
+  void SeekToLast() {
+    assert(iter_);
+    iter_->SeekToLast();
+    Update();
+  }
+
+  bool MayBeOutOfLowerBound() {
+    assert(Valid());
+    return iter_->MayBeOutOfLowerBound();
+  }
+
+  IterBoundCheck UpperBoundCheckResult() {
+    assert(Valid());
+    return result_.bound_check_result;
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) {
+    assert(iter_);
+    iter_->SetPinnedItersMgr(pinned_iters_mgr);
+  }
+  bool IsKeyPinned() const {
+    assert(Valid());
+    return iter_->IsKeyPinned();
+  }
+  bool IsValuePinned() const {
+    assert(Valid());
+    return iter_->IsValuePinned();
+  }
+
+  bool IsValuePrepared() const { return result_.value_prepared; }
+
+  Slice user_key() const {
+    assert(Valid());
+    return iter_->user_key();
+  }
+
+  void UpdateReadaheadState(InternalIteratorBase<TValue>* old_iter) {
+    if (old_iter && iter_) {
+      ReadaheadFileInfo readahead_file_info;
+      old_iter->GetReadaheadState(&readahead_file_info);
+      iter_->SetReadaheadState(&readahead_file_info);
+    }
+  }
+
+  bool IsDeleteRangeSentinelKey() const {
+    return iter_->IsDeleteRangeSentinelKey();
+  }
+
+ private:
+  void Update() {
+    valid_ = iter_->Valid();
+    if (valid_) {
+      assert(iter_->status().ok());
+      result_.key = iter_->key();
+      result_.bound_check_result = IterBoundCheck::kUnknown;
+      result_.value_prepared = false;
+    }
+  }
+
+  InternalIteratorBase<TValue>* iter_;
+  IterateResult result_;
+  bool valid_;
+};
+
+using IteratorWrapper = IteratorWrapperBase<Slice>;
+
+class Arena;
+// Return an empty iterator (yields nothing) allocated from arena.
+template <class TValue = Slice>
+extern InternalIteratorBase<TValue>* NewEmptyInternalIterator(Arena* arena);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/merger_test.cc b/src/rocksdb/table/merger_test.cc
new file mode 100644
index 000000000..71dc798e5
--- /dev/null
+++ b/src/rocksdb/table/merger_test.cc
@@ -0,0 +1,182 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+#include <vector>
+
+#include "table/merging_iterator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergerTest : public testing::Test {
+ public:
+  MergerTest()
+      : icomp_(BytewiseComparator()),
+        rnd_(3),
+        merging_iterator_(nullptr),
+        single_iterator_(nullptr) {}
+  ~MergerTest() override = default;
+  std::vector<std::string> GenerateStrings(size_t len, int string_len) {
+    std::vector<std::string> ret;
+
+    for (size_t i = 0; i < len; ++i) {
+      InternalKey ik(rnd_.HumanReadableString(string_len), 0,
+                     ValueType::kTypeValue);
+      ret.push_back(ik.Encode().ToString(false));
+    }
+    return ret;
+  }
+
+  void AssertEquivalence() {
+    auto a = merging_iterator_.get();
+    auto b = single_iterator_.get();
+    if (!a->Valid()) {
+      ASSERT_TRUE(!b->Valid());
+    } else {
+      ASSERT_TRUE(b->Valid());
+      ASSERT_EQ(b->key().ToString(), a->key().ToString());
+      ASSERT_EQ(b->value().ToString(), a->value().ToString());
+    }
+  }
+
+  void SeekToRandom() {
+    InternalKey ik(rnd_.HumanReadableString(5), 0, ValueType::kTypeValue);
+    Seek(ik.Encode().ToString(false));
+  }
+
+  void Seek(std::string target) {
+    merging_iterator_->Seek(target);
+    single_iterator_->Seek(target);
+  }
+
+  void SeekToFirst() {
+    merging_iterator_->SeekToFirst();
+    single_iterator_->SeekToFirst();
+  }
+
+  void SeekToLast() {
+    merging_iterator_->SeekToLast();
+    single_iterator_->SeekToLast();
+  }
+
+  void Next(int times) {
+    for (int i = 0; i < times && merging_iterator_->Valid(); ++i) {
+      AssertEquivalence();
+      merging_iterator_->Next();
+      single_iterator_->Next();
+    }
+    AssertEquivalence();
+  }
+
+  void Prev(int times) {
+    for (int i = 0; i < times && merging_iterator_->Valid(); ++i) {
+      AssertEquivalence();
+      merging_iterator_->Prev();
+      single_iterator_->Prev();
+    }
+    AssertEquivalence();
+  }
+
+  void NextAndPrev(int times) {
+    for (int i = 0; i < times && merging_iterator_->Valid(); ++i) {
+      AssertEquivalence();
+      if (rnd_.OneIn(2)) {
+        merging_iterator_->Prev();
+        single_iterator_->Prev();
+      } else {
+        merging_iterator_->Next();
+        single_iterator_->Next();
+      }
+    }
+    AssertEquivalence();
+  }
+
+  void Generate(size_t num_iterators, size_t strings_per_iterator,
+                int letters_per_string) {
+    std::vector<InternalIterator*> small_iterators;
+    for (size_t i = 0; i < num_iterators; ++i) {
+      auto strings = GenerateStrings(strings_per_iterator, letters_per_string);
+      small_iterators.push_back(new VectorIterator(strings, strings, &icomp_));
+      all_keys_.insert(all_keys_.end(), strings.begin(), strings.end());
+    }
+
+    merging_iterator_.reset(
+        NewMergingIterator(&icomp_, &small_iterators[0],
+                           static_cast<int>(small_iterators.size())));
+    single_iterator_.reset(new VectorIterator(all_keys_, all_keys_, &icomp_));
+  }
+
+  InternalKeyComparator icomp_;
+  Random rnd_;
+  std::unique_ptr<InternalIterator> merging_iterator_;
+  std::unique_ptr<InternalIterator> single_iterator_;
+  std::vector<std::string> all_keys_;
+};
+
+TEST_F(MergerTest, SeekToRandomNextTest) {
+  Generate(1000, 50, 50);
+  for (int i = 0; i < 10; ++i) {
+    SeekToRandom();
+    AssertEquivalence();
+    Next(50000);
+  }
+}
+
+TEST_F(MergerTest, SeekToRandomNextSmallStringsTest) {
+  Generate(1000, 50, 2);
+  for (int i = 0; i < 10; ++i) {
+    SeekToRandom();
+    AssertEquivalence();
+    Next(50000);
+  }
+}
+
+TEST_F(MergerTest, SeekToRandomPrevTest) {
+  Generate(1000, 50, 50);
+  for (int i = 0; i < 10; ++i) {
+    SeekToRandom();
+    AssertEquivalence();
+    Prev(50000);
+  }
+}
+
+TEST_F(MergerTest, SeekToRandomRandomTest) {
+  Generate(200, 50, 50);
+  for (int i = 0; i < 3; ++i) {
+    SeekToRandom();
+    AssertEquivalence();
+    NextAndPrev(5000);
+  }
+}
+
+TEST_F(MergerTest, SeekToFirstTest) {
+  Generate(1000, 50, 50);
+  for (int i = 0; i < 10; ++i) {
+    SeekToFirst();
+    AssertEquivalence();
+    Next(50000);
+  }
+}
+
+TEST_F(MergerTest, SeekToLastTest) {
+  Generate(1000, 50, 50);
+  for (int i = 0; i < 10; ++i) {
+    SeekToLast();
+    AssertEquivalence();
+    Prev(50000);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/merging_iterator.cc b/src/rocksdb/table/merging_iterator.cc
new file mode 100644
index 000000000..beb35ea9a
--- /dev/null
+++ b/src/rocksdb/table/merging_iterator.cc
@@ -0,0 +1,1403 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/merging_iterator.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "memory/arena.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iter_heap.h"
+#include "table/iterator_wrapper.h"
+#include "test_util/sync_point.h"
+#include "util/autovector.h"
+#include "util/heap.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+// For merging iterator to process range tombstones, we treat the start and end
+// keys of a range tombstone as point keys and put them into the minHeap/maxHeap
+// used in merging iterator. Take minHeap for example, we are able to keep track
+// of currently "active" range tombstones (the ones whose start keys are popped
+// but end keys are still in the heap) in `active_`. This `active_` set of range
+// tombstones is then used to quickly determine whether the point key at heap
+// top is deleted (by heap property, the point key at heap top must be within
+// internal key range of active range tombstones).
+//
+// The HeapItem struct represents 3 types of elements in the minHeap/maxHeap:
+// point key and the start and end keys of a range tombstone.
+struct HeapItem {
+  HeapItem() = default;
+
+  enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
+  IteratorWrapper iter;
+  size_t level = 0;
+  std::string pinned_key;
+  // Will be overwritten before use, initialize here so compiler does not
+  // complain.
+  Type type = ITERATOR;
+
+  explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
+      : level(_level), type(Type::ITERATOR) {
+    iter.Set(_iter);
+  }
+
+  void SetTombstoneKey(ParsedInternalKey&& pik) {
+    pinned_key.clear();
+    // Range tombstone end key is exclusive. If a point internal key has the
+    // same user key and sequence number as the start or end key of a range
+    // tombstone, the order will be start < end key < internal key with the
+    // following op_type change. This is helpful to ensure keys popped from
+    // heap are in expected order since range tombstone start/end keys will
+    // be distinct from point internal keys. Strictly speaking, this is only
+    // needed for tombstone end points that are truncated in
+    // TruncatedRangeDelIterator since untruncated tombstone end points always
+    // have kMaxSequenceNumber and kTypeRangeDeletion (see
+    // TruncatedRangeDelIterator::start_key()/end_key()).
+    ParsedInternalKey p(pik.user_key, pik.sequence, kTypeMaxValid);
+    AppendInternalKey(&pinned_key, p);
+  }
+
+  Slice key() const {
+    if (type == Type::ITERATOR) {
+      return iter.key();
+    }
+    return pinned_key;
+  }
+
+  bool IsDeleteRangeSentinelKey() const {
+    if (type == Type::ITERATOR) {
+      return iter.IsDeleteRangeSentinelKey();
+    }
+    return false;
+  }
+};
+
+class MinHeapItemComparator {
+ public:
+  MinHeapItemComparator(const InternalKeyComparator* comparator)
+      : comparator_(comparator) {}
+  bool operator()(HeapItem* a, HeapItem* b) const {
+    return comparator_->Compare(a->key(), b->key()) > 0;
+  }
+
+ private:
+  const InternalKeyComparator* comparator_;
+};
+
+class MaxHeapItemComparator {
+ public:
+  MaxHeapItemComparator(const InternalKeyComparator* comparator)
+      : comparator_(comparator) {}
+  bool operator()(HeapItem* a, HeapItem* b) const {
+    return comparator_->Compare(a->key(), b->key()) < 0;
+  }
+
+ private:
+  const InternalKeyComparator* comparator_;
+};
+// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
+namespace {
+using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
+using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
+}  // namespace
+
+class MergingIterator : public InternalIterator {
+ public:
+  MergingIterator(const InternalKeyComparator* comparator,
+                  InternalIterator** children, int n, bool is_arena_mode,
+                  bool prefix_seek_mode,
+                  const Slice* iterate_upper_bound = nullptr)
+      : is_arena_mode_(is_arena_mode),
+        prefix_seek_mode_(prefix_seek_mode),
+        direction_(kForward),
+        comparator_(comparator),
+        current_(nullptr),
+        minHeap_(comparator_),
+        pinned_iters_mgr_(nullptr),
+        iterate_upper_bound_(iterate_upper_bound) {
+    children_.resize(n);
+    for (int i = 0; i < n; i++) {
+      children_[i].level = i;
+      children_[i].iter.Set(children[i]);
+    }
+  }
+
+  void considerStatus(Status s) {
+    if (!s.ok() && status_.ok()) {
+      status_ = s;
+    }
+  }
+
+  virtual void AddIterator(InternalIterator* iter) {
+    children_.emplace_back(children_.size(), iter);
+    if (pinned_iters_mgr_) {
+      iter->SetPinnedItersMgr(pinned_iters_mgr_);
+    }
+    // Invalidate to ensure `Seek*()` is called to construct the heaps before
+    // use.
+    current_ = nullptr;
+  }
+
+  // Merging iterator can optionally process range tombstones: if a key is
+  // covered by a range tombstone, the merging iterator will not output it but
+  // skip it.
+  //
+  // Add the next range tombstone iterator to this merging iterator.
+  // There must be either no range tombstone iterator, or same number of
+  // range tombstone iterators as point iterators after all range tombstone
+  // iters are added. The i-th added range tombstone iterator and the i-th point
+  // iterator must point to the same sorted run.
+  // Merging iterator takes ownership of the range tombstone iterator and
+  // is responsible for freeing it. Note that during Iterator::Refresh()
+  // and when a level iterator moves to a different SST file, the range
+  // tombstone iterator could be updated. In that case, the merging iterator
+  // is only responsible to freeing the new range tombstone iterator
+  // that it has pointers to in range_tombstone_iters_.
+  void AddRangeTombstoneIterator(TruncatedRangeDelIterator* iter) {
+    range_tombstone_iters_.emplace_back(iter);
+  }
+
+  // Called by MergingIteratorBuilder when all point iterators and range
+  // tombstone iterators are added. Initializes HeapItems for range tombstone
+  // iterators so that no further allocation is needed for HeapItem.
+  void Finish() {
+    if (!range_tombstone_iters_.empty()) {
+      pinned_heap_item_.resize(range_tombstone_iters_.size());
+      for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
+        pinned_heap_item_[i].level = i;
+      }
+    }
+  }
+
+  ~MergingIterator() override {
+    for (auto child : range_tombstone_iters_) {
+      delete child;
+    }
+
+    for (auto& child : children_) {
+      child.iter.DeleteIter(is_arena_mode_);
+    }
+    status_.PermitUncheckedError();
+  }
+
+  bool Valid() const override { return current_ != nullptr && status_.ok(); }
+
+  Status status() const override { return status_; }
+
+  // Add range_tombstone_iters_[level] into min heap.
+  // Updates active_ if the end key of a range tombstone is inserted.
+  // @param start_key specifies which end point of the range tombstone to add.
+  void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true,
+                                     bool replace_top = false) {
+    assert(!range_tombstone_iters_.empty() &&
+           range_tombstone_iters_[level]->Valid());
+    if (start_key) {
+      ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
+      // iterate_upper_bound does not have timestamp
+      if (iterate_upper_bound_ &&
+          comparator_->user_comparator()->CompareWithoutTimestamp(
+              pik.user_key, true /* a_has_ts */, *iterate_upper_bound_,
+              false /* b_has_ts */) >= 0) {
+        if (replace_top) {
+          // replace_top implies this range tombstone iterator is still in
+          // minHeap_ and at the top.
+          minHeap_.pop();
+        }
+        return;
+      }
+      pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
+      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
+      assert(active_.count(level) == 0);
+    } else {
+      // allow end key to go over upper bound (if present) since start key is
+      // before upper bound and the range tombstone could still cover a
+      // range before upper bound.
+      pinned_heap_item_[level].SetTombstoneKey(
+          range_tombstone_iters_[level]->end_key());
+      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
+      active_.insert(level);
+    }
+    if (replace_top) {
+      minHeap_.replace_top(&pinned_heap_item_[level]);
+    } else {
+      minHeap_.push(&pinned_heap_item_[level]);
+    }
+  }
+
+  // Add range_tombstone_iters_[level] into max heap.
+  // Updates active_ if the start key of a range tombstone is inserted.
+  // @param end_key specifies which end point of the range tombstone to add.
+  void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true,
+                                     bool replace_top = false) {
+    assert(!range_tombstone_iters_.empty() &&
+           range_tombstone_iters_[level]->Valid());
+    if (end_key) {
+      pinned_heap_item_[level].SetTombstoneKey(
+          range_tombstone_iters_[level]->end_key());
+      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
+      assert(active_.count(level) == 0);
+    } else {
+      pinned_heap_item_[level].SetTombstoneKey(
+          range_tombstone_iters_[level]->start_key());
+      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
+      active_.insert(level);
+    }
+    if (replace_top) {
+      maxHeap_->replace_top(&pinned_heap_item_[level]);
+    } else {
+      maxHeap_->push(&pinned_heap_item_[level]);
+    }
+  }
+
+  // Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START
+  // until minHeap_ is empty or the top of the minHeap_ is not of type
+  // DELETE_RANGE_START. Each such item means a range tombstone becomes active,
+  // so `active_` is updated accordingly.
+  void PopDeleteRangeStart() {
+    while (!minHeap_.empty() &&
+           minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
+      TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
+      // insert end key of this range tombstone and updates active_
+      InsertRangeTombstoneToMinHeap(
+          minHeap_.top()->level, false /* start_key */, true /* replace_top */);
+    }
+  }
+
+  // Remove HeapItems from top of maxHeap_ that are of type DELETE_RANGE_END
+  // until maxHeap_ is empty or the top of the maxHeap_ is not of type
+  // DELETE_RANGE_END. Each such item means a range tombstone becomes active,
+  // so `active_` is updated accordingly.
+  void PopDeleteRangeEnd() {
+    while (!maxHeap_->empty() &&
+           maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) {
+      // insert start key of this range tombstone and updates active_
+      InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */,
+                                    true /* replace_top */);
+    }
+  }
+
+  void SeekToFirst() override {
+    ClearHeaps();
+    status_ = Status::OK();
+    for (auto& child : children_) {
+      child.iter.SeekToFirst();
+      AddToMinHeapOrCheckStatus(&child);
+    }
+
+    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
+      if (range_tombstone_iters_[i]) {
+        range_tombstone_iters_[i]->SeekToFirst();
+        if (range_tombstone_iters_[i]->Valid()) {
+          // It is possible to be invalid due to snapshots.
+          InsertRangeTombstoneToMinHeap(i);
+        }
+      }
+    }
+    FindNextVisibleKey();
+    direction_ = kForward;
+    current_ = CurrentForward();
+  }
+
+  void SeekToLast() override {
+    ClearHeaps();
+    InitMaxHeap();
+    status_ = Status::OK();
+    for (auto& child : children_) {
+      child.iter.SeekToLast();
+      AddToMaxHeapOrCheckStatus(&child);
+    }
+
+    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
+      if (range_tombstone_iters_[i]) {
+        range_tombstone_iters_[i]->SeekToLast();
+        if (range_tombstone_iters_[i]->Valid()) {
+          // It is possible to be invalid due to snapshots.
+          InsertRangeTombstoneToMaxHeap(i);
+        }
+      }
+    }
+    FindPrevVisibleKey();
+    direction_ = kReverse;
+    current_ = CurrentReverse();
+  }
+
+  // Position this merging iterator at the first key >= target (internal key).
+  // If range tombstones are present, keys covered by range tombstones are
+  // skipped, and this merging iter points to the first non-range-deleted key >=
+  // target after Seek(). If !Valid() and status().ok() then end of the iterator
+  // is reached.
+  //
+  // Internally, this involves positioning all child iterators at the first key
+  // >= target. If range tombstones are present, we apply a similar
+  // optimization, cascading seek, as in Pebble
+  // (https://github.com/cockroachdb/pebble). Specifically, if there is a range
+  // tombstone [start, end) that covers the target user key at level L, then
+  // this range tombstone must cover the range [target key, end) in all levels >
+  // L. So for all levels > L, we can pretend the target key is `end`. This
+  // optimization is applied at each level and hence the name "cascading seek".
+  // After a round of (cascading) seeks, the top of the heap is checked to see
+  // if it is covered by a range tombstone (see FindNextVisibleKey() for more
+  // detail), and advanced if so. The process is repeated until a
+  // non-range-deleted key is at the top of the heap, or heap becomes empty.
+  //
+  // As mentioned in comments above HeapItem, to make the checking of whether
+  // top of the heap is covered by some range tombstone efficient, we treat each
+  // range deletion [start, end) as two point keys and insert them into the same
+  // min/maxHeap_ where point iterators are. The set `active_` tracks the levels
+  // that have active range tombstones. If level L is in `active_`, and the
+  // point key at top of the heap is from level >= L, then the point key is
+  // within the internal key range of the range tombstone that
+  // range_tombstone_iters_[L] currently points to. For correctness reasoning,
+  // one invariant that Seek() (and every other public APIs Seek*(),
+  // Next/Prev()) guarantees is as follows. After Seek(), suppose `k` is the
+  // current key of level L's point iterator. Then for each range tombstone
+  // iterator at level <= L, it is at or before the first range tombstone with
+  // end key > `k`. This ensures that when level L's point iterator reaches top
+  // of the heap, `active_` is calculated correctly (it contains the covering
+  // range tombstone's level if there is one), since no range tombstone iterator
+  // was skipped beyond that point iterator's current key during Seek().
+  // Next()/Prev() maintains a stronger version of this invariant where all
+  // range tombstone iterators from level <= L are *at* the first range
+  // tombstone with end key > `k`.
+  void Seek(const Slice& target) override {
+    assert(range_tombstone_iters_.empty() ||
+           range_tombstone_iters_.size() == children_.size());
+    SeekImpl(target);
+    FindNextVisibleKey();
+
+    direction_ = kForward;
+    {
+      PERF_TIMER_GUARD(seek_min_heap_time);
+      current_ = CurrentForward();
+    }
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    assert(range_tombstone_iters_.empty() ||
+           range_tombstone_iters_.size() == children_.size());
+    SeekForPrevImpl(target);
+    FindPrevVisibleKey();
+
+    direction_ = kReverse;
+    {
+      PERF_TIMER_GUARD(seek_max_heap_time);
+      current_ = CurrentReverse();
+    }
+  }
+
+  void Next() override {
+    assert(Valid());
+    // Ensure that all children are positioned after key().
+    // If we are moving in the forward direction, it is already
+    // true for all of the non-current children since current_ is
+    // the smallest child and key() == current_->key().
+    if (direction_ != kForward) {
+      // The loop advanced all non-current children to be > key() so current_
+      // should still be strictly the smallest key.
+      SwitchToForward();
+    }
+
+    // For the heap modifications below to be correct, current_ must be the
+    // current top of the heap.
+    assert(current_ == CurrentForward());
+    // as the current points to the current record. move the iterator forward.
+    current_->Next();
+    if (current_->Valid()) {
+      // current is still valid after the Next() call above.  Call
+      // replace_top() to restore the heap property.  When the same child
+      // iterator yields a sequence of keys, this is cheap.
+      assert(current_->status().ok());
+      minHeap_.replace_top(minHeap_.top());
+    } else {
+      // current stopped being valid, remove it from the heap.
+      considerStatus(current_->status());
+      minHeap_.pop();
+    }
+    FindNextVisibleKey();
+    current_ = CurrentForward();
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    Next();
+    bool is_valid = Valid();
+    if (is_valid) {
+      result->key = key();
+      result->bound_check_result = UpperBoundCheckResult();
+      result->value_prepared = current_->IsValuePrepared();
+    }
+    return is_valid;
+  }
+
+  void Prev() override {
+    assert(Valid());
+    // Ensure that all children are positioned before key().
+    // If we are moving in the reverse direction, it is already
+    // true for all of the non-current children since current_ is
+    // the largest child and key() == current_->key().
+    if (direction_ != kReverse) {
+      // Otherwise, retreat the non-current children.  We retreat current_
+      // just after the if-block.
+      SwitchToBackward();
+    }
+
+    // For the heap modifications below to be correct, current_ must be the
+    // current top of the heap.
+    assert(current_ == CurrentReverse());
+    current_->Prev();
+    if (current_->Valid()) {
+      // current is still valid after the Prev() call above.  Call
+      // replace_top() to restore the heap property.  When the same child
+      // iterator yields a sequence of keys, this is cheap.
+      assert(current_->status().ok());
+      maxHeap_->replace_top(maxHeap_->top());
+    } else {
+      // current stopped being valid, remove it from the heap.
+      considerStatus(current_->status());
+      maxHeap_->pop();
+    }
+    FindPrevVisibleKey();
+    current_ = CurrentReverse();
+  }
+
+  Slice key() const override {
+    assert(Valid());
+    return current_->key();
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    return current_->value();
+  }
+
+  bool PrepareValue() override {
+    assert(Valid());
+    if (current_->PrepareValue()) {
+      return true;
+    }
+
+    considerStatus(current_->status());
+    assert(!status_.ok());
+    return false;
+  }
+
+  // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result
+  // from current child iterator. Potentially as long as one of child iterator
+  // report out of bound is not possible, we know current key is within bound.
+
+  bool MayBeOutOfLowerBound() override {
+    assert(Valid());
+    return current_->MayBeOutOfLowerBound();
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    assert(Valid());
+    return current_->UpperBoundCheckResult();
+  }
+
+  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+    pinned_iters_mgr_ = pinned_iters_mgr;
+    for (auto& child : children_) {
+      child.iter.SetPinnedItersMgr(pinned_iters_mgr);
+    }
+  }
+
+  bool IsKeyPinned() const override {
+    assert(Valid());
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           current_->IsKeyPinned();
+  }
+
+  bool IsValuePinned() const override {
+    assert(Valid());
+    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+           current_->IsValuePinned();
+  }
+
+ private:
+  friend class MergeIteratorBuilder;
+  // Clears heaps for both directions, used when changing direction or seeking
+  void ClearHeaps(bool clear_active = true);
+  // Ensures that maxHeap_ is initialized when starting to go in the reverse
+  // direction
+  void InitMaxHeap();
+
+  // Advance this merging iterator until the current key (top of min heap) is
+  // not covered by any range tombstone or that there is no more keys (heap is
+  // empty). After this call, if Valid(), current_ points to the next key that
+  // is not covered by any range tombstone.
+  void FindNextVisibleKey();
+  void FindPrevVisibleKey();
+
+  void SeekImpl(const Slice& target, size_t starting_level = 0,
+                bool range_tombstone_reseek = false);
+
+  // Seek to fist key <= target key (internal key) for
+  // children_[starting_level:].
+  void SeekForPrevImpl(const Slice& target, size_t starting_level = 0,
+                       bool range_tombstone_reseek = false);
+
+  bool is_arena_mode_;
+  bool prefix_seek_mode_;
+  // Which direction is the iterator moving?
+  enum Direction : uint8_t { kForward, kReverse };
+  Direction direction_;
+  const InternalKeyComparator* comparator_;
+  // We could also use an autovector with a larger reserved size.
+  // HeapItem for all child point iterators.
+  std::vector<HeapItem> children_;
+  // HeapItem for range tombstone start and end keys. Each range tombstone
+  // iterator will have at most one side (start key or end key) in a heap
+  // at the same time, so this vector will be of size children_.size();
+  // pinned_heap_item_[i] corresponds to the start key and end key HeapItem
+  // for range_tombstone_iters_[i].
+  std::vector<HeapItem> pinned_heap_item_;
+  // range_tombstone_iters_[i] contains range tombstones in the sorted run that
+  // corresponds to children_[i]. range_tombstone_iters_.empty() means not
+  // handling range tombstones in merging iterator. range_tombstone_iters_[i] ==
+  // nullptr means the sorted run of children_[i] does not have range
+  // tombstones.
+  std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;
+
+  // Levels (indices into range_tombstone_iters_/children_ ) that currently have
+  // "active" range tombstones. See comments above Seek() for meaning of
+  // "active".
+  std::set<size_t> active_;
+
+  bool SkipNextDeleted();
+  bool SkipPrevDeleted();
+
+  // Cached pointer to child iterator with the current key, or nullptr if no
+  // child iterators are valid.  This is the top of minHeap_ or maxHeap_
+  // depending on the direction.
+  IteratorWrapper* current_;
+  // If any of the children have non-ok status, this is one of them.
+  Status status_;
+  MergerMinIterHeap minHeap_;
+
+  // Max heap is used for reverse iteration, which is way less common than
+  // forward.  Lazily initialize it to save memory.
+  std::unique_ptr<MergerMaxIterHeap> maxHeap_;
+  PinnedIteratorsManager* pinned_iters_mgr_;
+
+  // Used to bound range tombstones. For point keys, DBIter and SSTable iterator
+  // take care of boundary checking.
+  const Slice* iterate_upper_bound_;
+
+  // In forward direction, process a child that is not in the min heap.
+  // If valid, add to the min heap. Otherwise, check status.
+  void AddToMinHeapOrCheckStatus(HeapItem*);
+
+  // In backward direction, process a child that is not in the max heap.
+  // If valid, add to the min heap. Otherwise, check status.
+  void AddToMaxHeapOrCheckStatus(HeapItem*);
+
+  void SwitchToForward();
+
+  // Switch the direction from forward to backward without changing the
+  // position. Iterator should still be valid.
+  void SwitchToBackward();
+
+  IteratorWrapper* CurrentForward() const {
+    assert(direction_ == kForward);
+    assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::ITERATOR);
+    return !minHeap_.empty() ? &minHeap_.top()->iter : nullptr;
+  }
+
+  IteratorWrapper* CurrentReverse() const {
+    assert(direction_ == kReverse);
+    assert(maxHeap_);
+    assert(maxHeap_->empty() || maxHeap_->top()->type == HeapItem::ITERATOR);
+    return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr;
+  }
+};
+
+// Seek to fist key >= target key (internal key) for children_[starting_level:].
+// Cascading seek optimizations are applied if range tombstones are present (see
+// comment above Seek() for more).
+//
+// @param range_tombstone_reseek Whether target is some range tombstone
+// end, i.e., whether this SeekImpl() call is a part of a "cascading seek". This
+// is used only for recoding relevant perf_context.
+void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
+                               bool range_tombstone_reseek) {
+  // active range tombstones before `starting_level` remain active
+  ClearHeaps(false /* clear_active */);
+  ParsedInternalKey pik;
+  if (!range_tombstone_iters_.empty()) {
+    // pik is only used in InsertRangeTombstoneToMinHeap().
+    ParseInternalKey(target, &pik, false).PermitUncheckedError();
+  }
+
+  // TODO: perhaps we could save some upheap cost by add all child iters first
+  //  and then do a single heapify.
+  for (size_t level = 0; level < starting_level; ++level) {
+    PERF_TIMER_GUARD(seek_min_heap_time);
+    AddToMinHeapOrCheckStatus(&children_[level]);
+  }
+  if (!range_tombstone_iters_.empty()) {
+    // Add range tombstones from levels < starting_level. We can insert from
+    // pinned_heap_item_ for the following reasons:
+    // - pinned_heap_item_[level] is in minHeap_ iff
+    // range_tombstone_iters[level]->Valid().
+    // - If `level` is in active_, then range_tombstone_iters_[level]->Valid()
+    // and pinned_heap_item_[level] is of type RANGE_DELETION_END.
+    for (size_t level = 0; level < starting_level; ++level) {
+      if (range_tombstone_iters_[level] &&
+          range_tombstone_iters_[level]->Valid()) {
+        // use an iterator on active_ if performance becomes an issue here
+        if (active_.count(level) > 0) {
+          assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END);
+          // if it was active, then start key must be within upper_bound,
+          // so we can add to minHeap_ directly.
+          minHeap_.push(&pinned_heap_item_[level]);
+        } else {
+          // this takes care of checking iterate_upper_bound, but with an extra
+          // key comparison if range_tombstone_iters_[level] was already out of
+          // bound. Consider using a new HeapItem type or some flag to remember
+          // boundary checking result.
+          InsertRangeTombstoneToMinHeap(level);
+        }
+      } else {
+        assert(!active_.count(level));
+      }
+    }
+    // levels >= starting_level will be reseeked below, so clearing their active
+    // state here.
+    active_.erase(active_.lower_bound(starting_level), active_.end());
+  }
+
+  status_ = Status::OK();
+  IterKey current_search_key;
+  current_search_key.SetInternalKey(target, false /* copy */);
+  // Seek target might change to some range tombstone end key, so
+  // we need to remember them for async requests.
+  // (level, target) pairs
+  autovector<std::pair<size_t, std::string>> prefetched_target;
+  for (auto level = starting_level; level < children_.size(); ++level) {
+    {
+      PERF_TIMER_GUARD(seek_child_seek_time);
+      children_[level].iter.Seek(current_search_key.GetInternalKey());
+    }
+
+    PERF_COUNTER_ADD(seek_child_seek_count, 1);
+
+    if (!range_tombstone_iters_.empty()) {
+      if (range_tombstone_reseek) {
+        // This seek is to some range tombstone end key.
+        // Should only happen when there are range tombstones.
+        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
+      }
+      if (children_[level].iter.status().IsTryAgain()) {
+        prefetched_target.emplace_back(
+            level, current_search_key.GetInternalKey().ToString());
+      }
+      auto range_tombstone_iter = range_tombstone_iters_[level];
+      if (range_tombstone_iter) {
+        range_tombstone_iter->Seek(current_search_key.GetUserKey());
+        if (range_tombstone_iter->Valid()) {
+          // insert the range tombstone end that is closer to and >=
+          // current_search_key. Strictly speaking, since the Seek() call above
+          // is on user key, it is possible that range_tombstone_iter->end_key()
+          // < current_search_key. This can happen when range_tombstone_iter is
+          // truncated and range_tombstone_iter.largest_ has the same user key
+          // as current_search_key.GetUserKey() but with a larger sequence
+          // number than current_search_key. Correctness is not affected as this
+          // tombstone end key will be popped during FindNextVisibleKey().
+          InsertRangeTombstoneToMinHeap(
+              level, comparator_->Compare(range_tombstone_iter->start_key(),
+                                          pik) > 0 /* start_key */);
+          // current_search_key < end_key guaranteed by the Seek() and Valid()
+          // calls above. Only interested in user key coverage since older
+          // sorted runs must have smaller sequence numbers than this range
+          // tombstone.
+          //
+          // TODO: range_tombstone_iter->Seek() finds the max covering
+          //  sequence number, can make it cheaper by not looking for max.
+          if (comparator_->user_comparator()->Compare(
+                  range_tombstone_iter->start_key().user_key,
+                  current_search_key.GetUserKey()) <= 0) {
+            // Since range_tombstone_iter->Valid(), seqno should be valid, so
+            // there is no need to check it.
+            range_tombstone_reseek = true;
+            // Current target user key is covered by this range tombstone.
+            // All older sorted runs will seek to range tombstone end key.
+            // Note that for prefix seek case, it is possible that the prefix
+            // is not the same as the original target, it should not affect
+            // correctness. Besides, in most cases, range tombstone start and
+            // end key should have the same prefix?
+            // If range_tombstone_iter->end_key() is truncated to its largest_
+            // boundary, the timestamp in user_key will not be max timestamp,
+            // but the timestamp of `range_tombstone_iter.largest_`. This should
+            // be fine here as current_search_key is used to Seek into lower
+            // levels.
+            current_search_key.SetInternalKey(
+                range_tombstone_iter->end_key().user_key, kMaxSequenceNumber);
+          }
+        }
+      }
+    }
+    // child.iter.status() is set to Status::TryAgain indicating asynchronous
+    // request for retrieval of data blocks has been submitted. So it should
+    // return at this point and Seek should be called again to retrieve the
+    // requested block and add the child to min heap.
+    if (children_[level].iter.status().IsTryAgain()) {
+      continue;
+    }
+    {
+      // Strictly, we timed slightly more than min heap operation,
+      // but these operations are very cheap.
+      PERF_TIMER_GUARD(seek_min_heap_time);
+      AddToMinHeapOrCheckStatus(&children_[level]);
+    }
+  }
+
+  if (range_tombstone_iters_.empty()) {
+    for (auto& child : children_) {
+      if (child.iter.status().IsTryAgain()) {
+        child.iter.Seek(target);
+        {
+          PERF_TIMER_GUARD(seek_min_heap_time);
+          AddToMinHeapOrCheckStatus(&child);
+        }
+        PERF_COUNTER_ADD(number_async_seek, 1);
+      }
+    }
+  } else {
+    for (auto& prefetch : prefetched_target) {
+      // (level, target) pairs
+      children_[prefetch.first].iter.Seek(prefetch.second);
+      {
+        PERF_TIMER_GUARD(seek_min_heap_time);
+        AddToMinHeapOrCheckStatus(&children_[prefetch.first]);
+      }
+      PERF_COUNTER_ADD(number_async_seek, 1);
+    }
+  }
+}
+
+// Returns true iff the current key (min heap top) should not be returned
+// to user (of the merging iterator). This can be because the current key
+// is deleted by some range tombstone, the current key is some fake file
+// boundary sentinel key, or the current key is an end point of a range
+// tombstone. Advance the iterator at heap top if needed. Heap order is restored
+// and `active_` is updated accordingly.
+// See FindNextVisibleKey() for more detail on internal implementation
+// of advancing child iters.
+//
+// REQUIRES:
+// - min heap is currently not empty, and iter is in kForward direction.
+// - minHeap_ top is not DELETE_RANGE_START (so that `active_` is current).
+bool MergingIterator::SkipNextDeleted() {
+  // 3 types of keys:
+  // - point key
+  // - file boundary sentinel keys
+  // - range deletion end key
+  auto current = minHeap_.top();
+  if (current->type == HeapItem::DELETE_RANGE_END) {
+    active_.erase(current->level);
+    assert(range_tombstone_iters_[current->level] &&
+           range_tombstone_iters_[current->level]->Valid());
+    range_tombstone_iters_[current->level]->Next();
+    if (range_tombstone_iters_[current->level]->Valid()) {
+      InsertRangeTombstoneToMinHeap(current->level, true /* start_key */,
+                                    true /* replace_top */);
+    } else {
+      minHeap_.pop();
+    }
+    return true /* current key deleted */;
+  }
+  if (current->iter.IsDeleteRangeSentinelKey()) {
+    // If the file boundary is defined by a range deletion, the range
+    // tombstone's end key must come before this sentinel key (see op_type in
+    // SetTombstoneKey()).
+    assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion ||
+           active_.count(current->level) == 0);
+    // LevelIterator enters a new SST file
+    current->iter.Next();
+    if (current->iter.Valid()) {
+      assert(current->iter.status().ok());
+      minHeap_.replace_top(current);
+    } else {
+      minHeap_.pop();
+    }
+    // Remove last SST file's range tombstone end key if there is one.
+    // This means file boundary is before range tombstone end key,
+    // which could happen when a range tombstone and a user key
+    // straddle two SST files. Note that in TruncatedRangeDelIterator
+    // constructor, parsed_largest.sequence is decremented 1 in this case.
+    if (!minHeap_.empty() && minHeap_.top()->level == current->level &&
+        minHeap_.top()->type == HeapItem::DELETE_RANGE_END) {
+      minHeap_.pop();
+      active_.erase(current->level);
+    }
+    if (range_tombstone_iters_[current->level] &&
+        range_tombstone_iters_[current->level]->Valid()) {
+      InsertRangeTombstoneToMinHeap(current->level);
+    }
+    return true /* current key deleted */;
+  }
+  assert(current->type == HeapItem::ITERATOR);
+  // Point key case: check active_ for range tombstone coverage.
+  ParsedInternalKey pik;
+  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
+  if (!active_.empty()) {
+    auto i = *active_.begin();
+    if (i < current->level) {
+      // range tombstone is from a newer level, definitely covers
+      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
+                                  pik) <= 0);
+      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
+             0);
+      std::string target;
+      AppendInternalKey(&target, range_tombstone_iters_[i]->end_key());
+      SeekImpl(target, current->level, true);
+      return true /* current key deleted */;
+    } else if (i == current->level) {
+      // range tombstone is from the same level as current, check sequence
+      // number. By `active_` we know current key is between start key and end
+      // key.
+      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
+                                  pik) <= 0);
+      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
+             0);
+      if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
+        // covered by range tombstone
+        current->iter.Next();
+        if (current->iter.Valid()) {
+          minHeap_.replace_top(current);
+        } else {
+          minHeap_.pop();
+        }
+        return true /* current key deleted */;
+      } else {
+        return false /* current key not deleted */;
+      }
+    } else {
+      return false /* current key not deleted */;
+      // range tombstone from an older sorted run with current key < end key.
+      // current key is not deleted and the older sorted run will have its range
+      // tombstone updated when the range tombstone's end key are popped from
+      // minHeap_.
+    }
+  }
+  // we can reach here only if active_ is empty
+  assert(active_.empty());
+  assert(minHeap_.top()->type == HeapItem::ITERATOR);
+  return false /* current key not deleted */;
+}
+
+void MergingIterator::SeekForPrevImpl(const Slice& target,
+                                      size_t starting_level,
+                                      bool range_tombstone_reseek) {
+  // active range tombstones before `starting_level` remain active
+  ClearHeaps(false /* clear_active */);
+  InitMaxHeap();
+  ParsedInternalKey pik;
+  if (!range_tombstone_iters_.empty()) {
+    ParseInternalKey(target, &pik, false).PermitUncheckedError();
+  }
+  for (size_t level = 0; level < starting_level; ++level) {
+    PERF_TIMER_GUARD(seek_max_heap_time);
+    AddToMaxHeapOrCheckStatus(&children_[level]);
+  }
+  if (!range_tombstone_iters_.empty()) {
+    // Add range tombstones before starting_level.
+    for (size_t level = 0; level < starting_level; ++level) {
+      if (range_tombstone_iters_[level] &&
+          range_tombstone_iters_[level]->Valid()) {
+        assert(static_cast<bool>(active_.count(level)) ==
+               (pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_START));
+        maxHeap_->push(&pinned_heap_item_[level]);
+      } else {
+        assert(!active_.count(level));
+      }
+    }
+    // levels >= starting_level will be reseeked below,
+    active_.erase(active_.lower_bound(starting_level), active_.end());
+  }
+
+  status_ = Status::OK();
+  IterKey current_search_key;
+  current_search_key.SetInternalKey(target, false /* copy */);
+  // Seek target might change to some range tombstone end key, so
+  // we need to remember them for async requests.
+  // (level, target) pairs
+  autovector<std::pair<size_t, std::string>> prefetched_target;
+  for (auto level = starting_level; level < children_.size(); ++level) {
+    {
+      PERF_TIMER_GUARD(seek_child_seek_time);
+      children_[level].iter.SeekForPrev(current_search_key.GetInternalKey());
+    }
+
+    PERF_COUNTER_ADD(seek_child_seek_count, 1);
+
+    if (!range_tombstone_iters_.empty()) {
+      if (range_tombstone_reseek) {
+        // This seek is to some range tombstone end key.
+        // Should only happen when there are range tombstones.
+        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
+      }
+      if (children_[level].iter.status().IsTryAgain()) {
+        prefetched_target.emplace_back(
+            level, current_search_key.GetInternalKey().ToString());
+      }
+      auto range_tombstone_iter = range_tombstone_iters_[level];
+      if (range_tombstone_iter) {
+        range_tombstone_iter->SeekForPrev(current_search_key.GetUserKey());
+        if (range_tombstone_iter->Valid()) {
+          InsertRangeTombstoneToMaxHeap(
+              level, comparator_->Compare(range_tombstone_iter->end_key(),
+                                          pik) <= 0 /* end_key */);
+          // start key <= current_search_key guaranteed by the Seek() call above
+          // Only interested in user key coverage since older sorted runs must
+          // have smaller sequence numbers than this tombstone.
+          if (comparator_->user_comparator()->Compare(
+                  current_search_key.GetUserKey(),
+                  range_tombstone_iter->end_key().user_key) < 0) {
+            range_tombstone_reseek = true;
+            current_search_key.SetInternalKey(
+                range_tombstone_iter->start_key().user_key, kMaxSequenceNumber,
+                kValueTypeForSeekForPrev);
+          }
+        }
+      }
+    }
+    // child.iter.status() is set to Status::TryAgain indicating asynchronous
+    // request for retrieval of data blocks has been submitted. So it should
+    // return at this point and Seek should be called again to retrieve the
+    // requested block and add the child to min heap.
+    if (children_[level].iter.status().IsTryAgain()) {
+      continue;
+    }
+    {
+      // Strictly, we timed slightly more than min heap operation,
+      // but these operations are very cheap.
+      PERF_TIMER_GUARD(seek_max_heap_time);
+      AddToMaxHeapOrCheckStatus(&children_[level]);
+    }
+  }
+
+  if (range_tombstone_iters_.empty()) {
+    for (auto& child : children_) {
+      if (child.iter.status().IsTryAgain()) {
+        child.iter.SeekForPrev(target);
+        {
+          PERF_TIMER_GUARD(seek_min_heap_time);
+          AddToMaxHeapOrCheckStatus(&child);
+        }
+        PERF_COUNTER_ADD(number_async_seek, 1);
+      }
+    }
+  } else {
+    for (auto& prefetch : prefetched_target) {
+      // (level, target) pairs
+      children_[prefetch.first].iter.SeekForPrev(prefetch.second);
+      {
+        PERF_TIMER_GUARD(seek_max_heap_time);
+        AddToMaxHeapOrCheckStatus(&children_[prefetch.first]);
+      }
+      PERF_COUNTER_ADD(number_async_seek, 1);
+    }
+  }
+}
+
+// See more in comments above SkipNextDeleted().
+// REQUIRES:
+// - max heap is currently not empty, and iter is in kReverse direction.
+// - maxHeap_ top is not DELETE_RANGE_END (so that `active_` is current).
+bool MergingIterator::SkipPrevDeleted() {
+  // 3 types of keys:
+  // - point key
+  // - file boundary sentinel keys
+  // - range deletion start key
+  auto current = maxHeap_->top();
+  if (current->type == HeapItem::DELETE_RANGE_START) {
+    active_.erase(current->level);
+    assert(range_tombstone_iters_[current->level] &&
+           range_tombstone_iters_[current->level]->Valid());
+    range_tombstone_iters_[current->level]->Prev();
+    if (range_tombstone_iters_[current->level]->Valid()) {
+      InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */,
+                                    true /* replace_top */);
+    } else {
+      maxHeap_->pop();
+    }
+    return true /* current key deleted */;
+  }
+  if (current->iter.IsDeleteRangeSentinelKey()) {
+    // LevelIterator enters a new SST file
+    current->iter.Prev();
+    if (current->iter.Valid()) {
+      assert(current->iter.status().ok());
+      maxHeap_->replace_top(current);
+    } else {
+      maxHeap_->pop();
+    }
+    if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
+        maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) {
+      maxHeap_->pop();
+      active_.erase(current->level);
+    }
+    if (range_tombstone_iters_[current->level] &&
+        range_tombstone_iters_[current->level]->Valid()) {
+      InsertRangeTombstoneToMaxHeap(current->level);
+    }
+    return true /* current key deleted */;
+  }
+  assert(current->type == HeapItem::ITERATOR);
+  // Point key case: check active_ for range tombstone coverage.
+  ParsedInternalKey pik;
+  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
+  if (!active_.empty()) {
+    auto i = *active_.begin();
+    if (i < current->level) {
+      // range tombstone is from a newer level, definitely covers
+      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
+                                  pik) <= 0);
+      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
+             0);
+      std::string target;
+      AppendInternalKey(&target, range_tombstone_iters_[i]->start_key());
+      // This is different from SkipNextDeleted() which does reseek at sorted
+      // runs >= level (instead of i+1 here). With min heap, if level L is at
+      // top of the heap, then levels <L all have internal keys > level L's
+      // current internal key, which means levels <L are already at a different
+      // user key. With max heap, if level L is at top of the heap, then levels
+      // <L all have internal keys smaller than level L's current internal key,
+      // which might still be the same user key.
+      SeekForPrevImpl(target, i + 1, true);
+      return true /* current key deleted */;
+    } else if (i == current->level) {
+      // By `active_` we know current key is between start key and end key.
+      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
+                                  pik) <= 0);
+      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
+             0);
+      if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
+        current->iter.Prev();
+        if (current->iter.Valid()) {
+          maxHeap_->replace_top(current);
+        } else {
+          maxHeap_->pop();
+        }
+        return true /* current key deleted */;
+      } else {
+        return false /* current key not deleted */;
+      }
+    } else {
+      return false /* current key not deleted */;
+    }
+  }
+
+  assert(active_.empty());
+  assert(maxHeap_->top()->type == HeapItem::ITERATOR);
+  return false /* current key not deleted */;
+}
+
+void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
+  if (child->iter.Valid()) {
+    assert(child->iter.status().ok());
+    minHeap_.push(child);
+  } else {
+    considerStatus(child->iter.status());
+  }
+}
+
+void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) {
+  if (child->iter.Valid()) {
+    assert(child->iter.status().ok());
+    maxHeap_->push(child);
+  } else {
+    considerStatus(child->iter.status());
+  }
+}
+
+// Advance all non current_ child to > current_.key().
+// We advance current_ after the this function call as it does not require
+// Seek().
+// Advance all range tombstones iters, including the one corresponding to
+// current_, to the first tombstone with end_key > current_.key().
+// TODO: potentially do cascading seek here too
+void MergingIterator::SwitchToForward() {
+  ClearHeaps();
+  Slice target = key();
+  for (auto& child : children_) {
+    if (&child.iter != current_) {
+      child.iter.Seek(target);
+      // child.iter.status() is set to Status::TryAgain indicating asynchronous
+      // request for retrieval of data blocks has been submitted. So it should
+      // return at this point and Seek should be called again to retrieve the
+      // requested block and add the child to min heap.
+      if (child.iter.status() == Status::TryAgain()) {
+        continue;
+      }
+      if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
+        assert(child.iter.status().ok());
+        child.iter.Next();
+      }
+    }
+    AddToMinHeapOrCheckStatus(&child);
+  }
+
+  for (auto& child : children_) {
+    if (child.iter.status() == Status::TryAgain()) {
+      child.iter.Seek(target);
+      if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
+        assert(child.iter.status().ok());
+        child.iter.Next();
+      }
+      AddToMinHeapOrCheckStatus(&child);
+    }
+  }
+
+  // Current range tombstone iter also needs to seek for the following case:
+  // Previous direction is backward, so range tombstone iter may point to a
+  // tombstone before current_. If there is no such tombstone, then the range
+  // tombstone iter is !Valid(). Need to reseek here to make it valid again.
+  if (!range_tombstone_iters_.empty()) {
+    ParsedInternalKey pik;
+    ParseInternalKey(target, &pik, false /* log_err_key */)
+        .PermitUncheckedError();
+    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
+      auto iter = range_tombstone_iters_[i];
+      if (iter) {
+        iter->Seek(pik.user_key);
+        // The while loop is needed as the Seek() call above is only for user
+        // key. We could have a range tombstone with end_key covering user_key,
+        // but still is smaller than target. This happens when the range
+        // tombstone is truncated at iter.largest_.
+        while (iter->Valid() &&
+               comparator_->Compare(iter->end_key(), pik) <= 0) {
+          iter->Next();
+        }
+        if (range_tombstone_iters_[i]->Valid()) {
+          InsertRangeTombstoneToMinHeap(
+              i, comparator_->Compare(range_tombstone_iters_[i]->start_key(),
+                                      pik) > 0 /* start_key */);
+        }
+      }
+    }
+  }
+
+  direction_ = kForward;
+  assert(current_ == CurrentForward());
+}
+
+// Advance all range tombstones iters, including the one corresponding to
+// current_, to the first tombstone with start_key <= current_.key().
+void MergingIterator::SwitchToBackward() {
+  ClearHeaps();
+  InitMaxHeap();
+  Slice target = key();
+  for (auto& child : children_) {
+    if (&child.iter != current_) {
+      child.iter.SeekForPrev(target);
+      TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
+      if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
+        assert(child.iter.status().ok());
+        child.iter.Prev();
+      }
+    }
+    AddToMaxHeapOrCheckStatus(&child);
+  }
+
+  ParsedInternalKey pik;
+  ParseInternalKey(target, &pik, false /* log_err_key */)
+      .PermitUncheckedError();
+  for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
+    auto iter = range_tombstone_iters_[i];
+    if (iter) {
+      iter->SeekForPrev(pik.user_key);
+      // Since the SeekForPrev() call above is only for user key,
+      // we may end up with some range tombstone with start key having the
+      // same user key at current_, but with a smaller sequence number. This
+      // makes current_ not at maxHeap_ top for the CurrentReverse() call
+      // below. If there is a range tombstone start key with the same user
+      // key and the same sequence number as current_.key(), it will be fine as
+      // in InsertRangeTombstoneToMaxHeap() we change op_type to be the smallest
+      // op_type.
+      while (iter->Valid() &&
+             comparator_->Compare(iter->start_key(), pik) > 0) {
+        iter->Prev();
+      }
+      if (iter->Valid()) {
+        InsertRangeTombstoneToMaxHeap(
+            i, comparator_->Compare(range_tombstone_iters_[i]->end_key(),
+                                    pik) <= 0 /* end_key */);
+      }
+    }
+  }
+
+  direction_ = kReverse;
+  if (!prefix_seek_mode_) {
+    // Note that we don't do assert(current_ == CurrentReverse()) here
+    // because it is possible to have some keys larger than the seek-key
+    // inserted between Seek() and SeekToLast(), which makes current_ not
+    // equal to CurrentReverse().
+    current_ = CurrentReverse();
+  }
+  assert(current_ == CurrentReverse());
+}
+
+void MergingIterator::ClearHeaps(bool clear_active) {
+  minHeap_.clear();
+  if (maxHeap_) {
+    maxHeap_->clear();
+  }
+  if (clear_active) {
+    active_.clear();
+  }
+}
+
+void MergingIterator::InitMaxHeap() {
+  if (!maxHeap_) {
+    maxHeap_ = std::make_unique<MergerMaxIterHeap>(comparator_);
+  }
+}
+
+// Repeatedly check and remove heap top key if it is not a point key
+// that is not covered by range tombstones. SeekImpl() is called to seek to end
+// of a range tombstone if the heap top is a point key covered by some range
+// tombstone from a newer sorted run. If the covering tombstone is from current
+// key's level, then the current child iterator is simply advanced to its next
+// key without reseeking.
+inline void MergingIterator::FindNextVisibleKey() {
+  // When active_ is empty, we know heap top cannot be a range tombstone end
+  // key. It cannot be a range tombstone start key per PopDeleteRangeStart().
+  PopDeleteRangeStart();
+  while (!minHeap_.empty() &&
+         (!active_.empty() || minHeap_.top()->IsDeleteRangeSentinelKey()) &&
+         SkipNextDeleted()) {
+    PopDeleteRangeStart();
+  }
+}
+
+inline void MergingIterator::FindPrevVisibleKey() {
+  PopDeleteRangeEnd();
+  while (!maxHeap_->empty() &&
+         (!active_.empty() || maxHeap_->top()->IsDeleteRangeSentinelKey()) &&
+         SkipPrevDeleted()) {
+    PopDeleteRangeEnd();
+  }
+}
+
+InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
+                                     InternalIterator** list, int n,
+                                     Arena* arena, bool prefix_seek_mode) {
+  assert(n >= 0);
+  if (n == 0) {
+    return NewEmptyInternalIterator<Slice>(arena);
+  } else if (n == 1) {
+    return list[0];
+  } else {
+    if (arena == nullptr) {
+      return new MergingIterator(cmp, list, n, false, prefix_seek_mode);
+    } else {
+      auto mem = arena->AllocateAligned(sizeof(MergingIterator));
+      return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode);
+    }
+  }
+}
+
+MergeIteratorBuilder::MergeIteratorBuilder(
+    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode,
+    const Slice* iterate_upper_bound)
+    : first_iter(nullptr), use_merging_iter(false), arena(a) {
+  auto mem = arena->AllocateAligned(sizeof(MergingIterator));
+  merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true,
+                                         prefix_seek_mode, iterate_upper_bound);
+}
+
+MergeIteratorBuilder::~MergeIteratorBuilder() {
+  if (first_iter != nullptr) {
+    first_iter->~InternalIterator();
+  }
+  if (merge_iter != nullptr) {
+    merge_iter->~MergingIterator();
+  }
+}
+
+void MergeIteratorBuilder::AddIterator(InternalIterator* iter) {
+  if (!use_merging_iter && first_iter != nullptr) {
+    merge_iter->AddIterator(first_iter);
+    use_merging_iter = true;
+    first_iter = nullptr;
+  }
+  if (use_merging_iter) {
+    merge_iter->AddIterator(iter);
+  } else {
+    first_iter = iter;
+  }
+}
+
+void MergeIteratorBuilder::AddPointAndTombstoneIterator(
+    InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter,
+    TruncatedRangeDelIterator*** tombstone_iter_ptr) {
+  // tombstone_iter_ptr != nullptr means point_iter is a LevelIterator.
+  bool add_range_tombstone = tombstone_iter ||
+                             !merge_iter->range_tombstone_iters_.empty() ||
+                             tombstone_iter_ptr;
+  if (!use_merging_iter && (add_range_tombstone || first_iter)) {
+    use_merging_iter = true;
+    if (first_iter) {
+      merge_iter->AddIterator(first_iter);
+      first_iter = nullptr;
+    }
+  }
+  if (use_merging_iter) {
+    merge_iter->AddIterator(point_iter);
+    if (add_range_tombstone) {
+      // If there was a gap, fill in nullptr as empty range tombstone iterators.
+      while (merge_iter->range_tombstone_iters_.size() <
+             merge_iter->children_.size() - 1) {
+        merge_iter->AddRangeTombstoneIterator(nullptr);
+      }
+      merge_iter->AddRangeTombstoneIterator(tombstone_iter);
+    }
+
+    if (tombstone_iter_ptr) {
+      // This is needed instead of setting to &range_tombstone_iters_[i]
+      // directly here since the memory address of range_tombstone_iters_[i]
+      // might change during vector resizing.
+      range_del_iter_ptrs_.emplace_back(
+          merge_iter->range_tombstone_iters_.size() - 1, tombstone_iter_ptr);
+    }
+  } else {
+    first_iter = point_iter;
+  }
+}
+
+InternalIterator* MergeIteratorBuilder::Finish(ArenaWrappedDBIter* db_iter) {
+  InternalIterator* ret = nullptr;
+  if (!use_merging_iter) {
+    ret = first_iter;
+    first_iter = nullptr;
+  } else {
+    for (auto& p : range_del_iter_ptrs_) {
+      *(p.second) = &(merge_iter->range_tombstone_iters_[p.first]);
+    }
+    if (db_iter && !merge_iter->range_tombstone_iters_.empty()) {
+      // memtable is always the first level
+      db_iter->SetMemtableRangetombstoneIter(
+          &merge_iter->range_tombstone_iters_.front());
+    }
+    merge_iter->Finish();
+    ret = merge_iter;
+    merge_iter = nullptr;
+  }
+  return ret;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/merging_iterator.h b/src/rocksdb/table/merging_iterator.h
new file mode 100644
index 000000000..16fc0877e
--- /dev/null
+++ b/src/rocksdb/table/merging_iterator.h
@@ -0,0 +1,92 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/range_del_aggregator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InternalKeyComparator;
+
+template <class TValue>
+class InternalIteratorBase;
+using InternalIterator = InternalIteratorBase<Slice>;
+
+// Return an iterator that provided the union of the data in
+// children[0,n-1].  Takes ownership of the child iterators and
+// will delete them when the result iterator is deleted.
+//
+// The result does no duplicate suppression.  I.e., if a particular
+// key is present in K child iterators, it will be yielded K times.
+//
+// REQUIRES: n >= 0
+extern InternalIterator* NewMergingIterator(
+    const InternalKeyComparator* comparator, InternalIterator** children, int n,
+    Arena* arena = nullptr, bool prefix_seek_mode = false);
+
+class MergingIterator;
+
+// A builder class to build a merging iterator by adding iterators one by one.
+// User should call only one of AddIterator() or AddPointAndTombstoneIterator()
+// exclusively for the same builder.
+class MergeIteratorBuilder {
+ public:
+  // comparator: the comparator used in merging comparator
+  // arena: where the merging iterator needs to be allocated from.
+  explicit MergeIteratorBuilder(const InternalKeyComparator* comparator,
+                                Arena* arena, bool prefix_seek_mode = false,
+                                const Slice* iterate_upper_bound = nullptr);
+  ~MergeIteratorBuilder();
+
+  // Add iter to the merging iterator.
+  void AddIterator(InternalIterator* iter);
+
+  // Add a point key iterator and a range tombstone iterator.
+  // `tombstone_iter_ptr` should and only be set by LevelIterator.
+  // *tombstone_iter_ptr will be set to where the merging iterator stores
+  // `tombstone_iter` when MergeIteratorBuilder::Finish() is called. This is
+  // used by LevelIterator to update range tombstone iters when switching to a
+  // different SST file. If a single point iterator with a nullptr range
+  // tombstone iterator is provided, and the point iterator is not a level
+  // iterator, then this builder will return the point iterator directly,
+  // instead of creating a merging iterator on top of it. Internally, if all
+  // point iterators are not LevelIterator, then range tombstone iterator is
+  // only added to the merging iter if there is a non-null `tombstone_iter`.
+  void AddPointAndTombstoneIterator(
+      InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter,
+      TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr);
+
+  // Get arena used to build the merging iterator. It is called one a child
+  // iterator needs to be allocated.
+  Arena* GetArena() { return arena; }
+
+  // Return the result merging iterator.
+  // If db_iter is not nullptr, then db_iter->SetMemtableRangetombstoneIter()
+  // will be called with pointer to where the merging iterator
+  // stores the memtable range tombstone iterator.
+  // This is used for DB iterator to refresh memtable range tombstones.
+  InternalIterator* Finish(ArenaWrappedDBIter* db_iter = nullptr);
+
+ private:
+  MergingIterator* merge_iter;
+  InternalIterator* first_iter;
+  bool use_merging_iter;
+  Arena* arena;
+  // Used to set LevelIterator.range_tombstone_iter_.
+  // See AddRangeTombstoneIterator() implementation for more detail.
+  std::vector<std::pair<size_t, TruncatedRangeDelIterator***>>
+      range_del_iter_ptrs_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/meta_blocks.cc b/src/rocksdb/table/meta_blocks.cc
new file mode 100644
index 000000000..78a62359d
--- /dev/null
+++ b/src/rocksdb/table/meta_blocks.cc
@@ -0,0 +1,553 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include "table/meta_blocks.h"
+
+#include <map>
+#include <string>
+
+#include "block_fetcher.h"
+#include "db/table_properties_collector.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block.h"
+#include "table/block_based/reader_common.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "table/persistent_cache_helper.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_properties_internal.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kPropertiesBlockName = "rocksdb.properties";
+// Old property block name for backward compatibility
+const std::string kPropertiesBlockOldName = "rocksdb.stats";
+const std::string kCompressionDictBlockName = "rocksdb.compression_dict";
+const std::string kRangeDelBlockName = "rocksdb.range_del";
+
+MetaIndexBuilder::MetaIndexBuilder()
+    : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {}
+
+void MetaIndexBuilder::Add(const std::string& key, const BlockHandle& handle) {
+  std::string handle_encoding;
+  handle.EncodeTo(&handle_encoding);
+  meta_block_handles_.insert({key, handle_encoding});
+}
+
+Slice MetaIndexBuilder::Finish() {
+  for (const auto& metablock : meta_block_handles_) {
+    meta_index_block_->Add(metablock.first, metablock.second);
+  }
+  return meta_index_block_->Finish();
+}
+
+// Property block will be read sequentially and cached in a heap located
+// object, so there's no need for restart points. Thus we set the restart
+// interval to infinity to save space.
+PropertyBlockBuilder::PropertyBlockBuilder()
+    : properties_block_(new BlockBuilder(
+          std::numeric_limits<int32_t>::max() /* restart interval */)) {}
+
+void PropertyBlockBuilder::Add(const std::string& name,
+                               const std::string& val) {
+  props_.insert({name, val});
+}
+
+void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) {
+  assert(props_.find(name) == props_.end());
+
+  std::string dst;
+  PutVarint64(&dst, val);
+
+  Add(name, dst);
+}
+
+void PropertyBlockBuilder::Add(
+    const UserCollectedProperties& user_collected_properties) {
+  for (const auto& prop : user_collected_properties) {
+    Add(prop.first, prop.second);
+  }
+}
+
+void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
+  TEST_SYNC_POINT_CALLBACK("PropertyBlockBuilder::AddTableProperty:Start",
+                           const_cast<TableProperties*>(&props));
+
+  Add(TablePropertiesNames::kOriginalFileNumber, props.orig_file_number);
+  Add(TablePropertiesNames::kRawKeySize, props.raw_key_size);
+  Add(TablePropertiesNames::kRawValueSize, props.raw_value_size);
+  Add(TablePropertiesNames::kDataSize, props.data_size);
+  Add(TablePropertiesNames::kIndexSize, props.index_size);
+  if (props.index_partitions != 0) {
+    Add(TablePropertiesNames::kIndexPartitions, props.index_partitions);
+    Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size);
+  }
+  Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key);
+  Add(TablePropertiesNames::kIndexValueIsDeltaEncoded,
+      props.index_value_is_delta_encoded);
+  Add(TablePropertiesNames::kNumEntries, props.num_entries);
+  Add(TablePropertiesNames::kNumFilterEntries, props.num_filter_entries);
+  Add(TablePropertiesNames::kDeletedKeys, props.num_deletions);
+  Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands);
+  Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions);
+  Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
+  Add(TablePropertiesNames::kFilterSize, props.filter_size);
+  Add(TablePropertiesNames::kFormatVersion, props.format_version);
+  Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);
+  Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id);
+  Add(TablePropertiesNames::kCreationTime, props.creation_time);
+  Add(TablePropertiesNames::kOldestKeyTime, props.oldest_key_time);
+  if (props.file_creation_time > 0) {
+    Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time);
+  }
+  if (props.slow_compression_estimated_data_size > 0) {
+    Add(TablePropertiesNames::kSlowCompressionEstimatedDataSize,
+        props.slow_compression_estimated_data_size);
+  }
+  if (props.fast_compression_estimated_data_size > 0) {
+    Add(TablePropertiesNames::kFastCompressionEstimatedDataSize,
+        props.fast_compression_estimated_data_size);
+  }
+  if (!props.db_id.empty()) {
+    Add(TablePropertiesNames::kDbId, props.db_id);
+  }
+  if (!props.db_session_id.empty()) {
+    Add(TablePropertiesNames::kDbSessionId, props.db_session_id);
+  }
+  if (!props.db_host_id.empty()) {
+    Add(TablePropertiesNames::kDbHostId, props.db_host_id);
+  }
+
+  if (!props.filter_policy_name.empty()) {
+    Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name);
+  }
+  if (!props.comparator_name.empty()) {
+    Add(TablePropertiesNames::kComparator, props.comparator_name);
+  }
+
+  if (!props.merge_operator_name.empty()) {
+    Add(TablePropertiesNames::kMergeOperator, props.merge_operator_name);
+  }
+  if (!props.prefix_extractor_name.empty()) {
+    Add(TablePropertiesNames::kPrefixExtractorName,
+        props.prefix_extractor_name);
+  }
+  if (!props.property_collectors_names.empty()) {
+    Add(TablePropertiesNames::kPropertyCollectors,
+        props.property_collectors_names);
+  }
+  if (!props.column_family_name.empty()) {
+    Add(TablePropertiesNames::kColumnFamilyName, props.column_family_name);
+  }
+
+  if (!props.compression_name.empty()) {
+    Add(TablePropertiesNames::kCompression, props.compression_name);
+  }
+  if (!props.compression_options.empty()) {
+    Add(TablePropertiesNames::kCompressionOptions, props.compression_options);
+  }
+  if (!props.seqno_to_time_mapping.empty()) {
+    Add(TablePropertiesNames::kSequenceNumberTimeMapping,
+        props.seqno_to_time_mapping);
+  }
+}
+
+Slice PropertyBlockBuilder::Finish() {
+  for (const auto& prop : props_) {
+    properties_block_->Add(prop.first, prop.second);
+  }
+
+  return properties_block_->Finish();
+}
+
+void LogPropertiesCollectionError(Logger* info_log, const std::string& method,
+                                  const std::string& name) {
+  assert(method == "Add" || method == "Finish");
+
+  std::string msg =
+      "Encountered error when calling TablePropertiesCollector::" + method +
+      "() with collector name: " + name;
+  ROCKS_LOG_ERROR(info_log, "%s", msg.c_str());
+}
+
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key, const Slice& value, uint64_t file_size,
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log) {
+  bool all_succeeded = true;
+  for (auto& collector : collectors) {
+    Status s = collector->InternalAdd(key, value, file_size);
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(info_log, "Add" /* method */,
+                                   collector->Name());
+    }
+  }
+  return all_succeeded;
+}
+
+void NotifyCollectTableCollectorsOnBlockAdd(
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    const uint64_t block_uncomp_bytes,
+    const uint64_t block_compressed_bytes_fast,
+    const uint64_t block_compressed_bytes_slow) {
+  for (auto& collector : collectors) {
+    collector->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast,
+                        block_compressed_bytes_slow);
+  }
+}
+
+bool NotifyCollectTableCollectorsOnFinish(
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log, PropertyBlockBuilder* builder) {
+  bool all_succeeded = true;
+  for (auto& collector : collectors) {
+    UserCollectedProperties user_collected_properties;
+    Status s = collector->Finish(&user_collected_properties);
+
+    all_succeeded = all_succeeded && s.ok();
+    if (!s.ok()) {
+      LogPropertiesCollectionError(info_log, "Finish" /* method */,
+                                   collector->Name());
+    } else {
+      builder->Add(user_collected_properties);
+    }
+  }
+
+  return all_succeeded;
+}
+
+// FIXME: should be a parameter for reading table properties to use persistent
+// cache?
+Status ReadTablePropertiesHelper(
+    const ReadOptions& ro, const BlockHandle& handle,
+    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
+    const Footer& footer, const ImmutableOptions& ioptions,
+    std::unique_ptr<TableProperties>* table_properties,
+    MemoryAllocator* memory_allocator) {
+  assert(table_properties);
+
+  // If this is an external SST file ingested with write_global_seqno set to
+  // true, then we expect the checksum mismatch because checksum was written
+  // by SstFileWriter, but its global seqno in the properties block may have
+  // been changed during ingestion. For this reason, we initially read
+  // and process without checksum verification, then later try checksum
+  // verification so that if it fails, we can copy to a temporary buffer with
+  // global seqno set to its original value, i.e. 0, and attempt checksum
+  // verification again.
+  ReadOptions modified_ro = ro;
+  modified_ro.verify_checksums = false;
+  BlockContents block_contents;
+  BlockFetcher block_fetcher(file, prefetch_buffer, footer, modified_ro, handle,
+                             &block_contents, ioptions, false /* decompress */,
+                             false /*maybe_compressed*/, BlockType::kProperties,
+                             UncompressionDict::GetEmptyDict(),
+                             PersistentCacheOptions::kEmpty, memory_allocator);
+  Status s = block_fetcher.ReadBlockContents();
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Unfortunately, Block::size() might not equal block_contents.data.size(),
+  // and Block hides block_contents
+  uint64_t block_size = block_contents.data.size();
+  Block properties_block(std::move(block_contents));
+  std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
+
+  std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
+  // All pre-defined properties of type uint64_t
+  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+      {TablePropertiesNames::kOriginalFileNumber,
+       &new_table_properties->orig_file_number},
+      {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
+      {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
+      {TablePropertiesNames::kIndexPartitions,
+       &new_table_properties->index_partitions},
+      {TablePropertiesNames::kTopLevelIndexSize,
+       &new_table_properties->top_level_index_size},
+      {TablePropertiesNames::kIndexKeyIsUserKey,
+       &new_table_properties->index_key_is_user_key},
+      {TablePropertiesNames::kIndexValueIsDeltaEncoded,
+       &new_table_properties->index_value_is_delta_encoded},
+      {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
+      {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
+      {TablePropertiesNames::kRawValueSize,
+       &new_table_properties->raw_value_size},
+      {TablePropertiesNames::kNumDataBlocks,
+       &new_table_properties->num_data_blocks},
+      {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
+      {TablePropertiesNames::kNumFilterEntries,
+       &new_table_properties->num_filter_entries},
+      {TablePropertiesNames::kDeletedKeys,
+       &new_table_properties->num_deletions},
+      {TablePropertiesNames::kMergeOperands,
+       &new_table_properties->num_merge_operands},
+      {TablePropertiesNames::kNumRangeDeletions,
+       &new_table_properties->num_range_deletions},
+      {TablePropertiesNames::kFormatVersion,
+       &new_table_properties->format_version},
+      {TablePropertiesNames::kFixedKeyLen,
+       &new_table_properties->fixed_key_len},
+      {TablePropertiesNames::kColumnFamilyId,
+       &new_table_properties->column_family_id},
+      {TablePropertiesNames::kCreationTime,
+       &new_table_properties->creation_time},
+      {TablePropertiesNames::kOldestKeyTime,
+       &new_table_properties->oldest_key_time},
+      {TablePropertiesNames::kFileCreationTime,
+       &new_table_properties->file_creation_time},
+      {TablePropertiesNames::kSlowCompressionEstimatedDataSize,
+       &new_table_properties->slow_compression_estimated_data_size},
+      {TablePropertiesNames::kFastCompressionEstimatedDataSize,
+       &new_table_properties->fast_compression_estimated_data_size},
+  };
+
+  std::string last_key;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    s = iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    auto key = iter->key().ToString();
+    // properties block should be strictly sorted with no duplicate key.
+    if (!last_key.empty() &&
+        BytewiseComparator()->Compare(key, last_key) <= 0) {
+      s = Status::Corruption("properties unsorted");
+      break;
+    }
+    last_key = key;
+
+    auto raw_val = iter->value();
+    auto pos = predefined_uint64_properties.find(key);
+
+    if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
+      new_table_properties->external_sst_file_global_seqno_offset =
+          handle.offset() + iter->ValueOffset();
+    }
+
+    if (pos != predefined_uint64_properties.end()) {
+      if (key == TablePropertiesNames::kDeletedKeys ||
+          key == TablePropertiesNames::kMergeOperands) {
+        // Insert in user-collected properties for API backwards compatibility
+        new_table_properties->user_collected_properties.insert(
+            {key, raw_val.ToString()});
+      }
+      // handle predefined rocksdb properties
+      uint64_t val;
+      if (!GetVarint64(&raw_val, &val)) {
+        // skip malformed value
+        auto error_msg =
+            "Detect malformed value in properties meta-block:"
+            "\tkey: " +
+            key + "\tval: " + raw_val.ToString();
+        ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
+        continue;
+      }
+      *(pos->second) = val;
+    } else if (key == TablePropertiesNames::kDbId) {
+      new_table_properties->db_id = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kDbSessionId) {
+      new_table_properties->db_session_id = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kDbHostId) {
+      new_table_properties->db_host_id = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kFilterPolicy) {
+      new_table_properties->filter_policy_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kColumnFamilyName) {
+      new_table_properties->column_family_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kComparator) {
+      new_table_properties->comparator_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kMergeOperator) {
+      new_table_properties->merge_operator_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kPrefixExtractorName) {
+      new_table_properties->prefix_extractor_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kPropertyCollectors) {
+      new_table_properties->property_collectors_names = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kCompression) {
+      new_table_properties->compression_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kCompressionOptions) {
+      new_table_properties->compression_options = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
+      new_table_properties->seqno_to_time_mapping = raw_val.ToString();
+    } else {
+      // handle user-collected properties
+      new_table_properties->user_collected_properties.insert(
+          {key, raw_val.ToString()});
+    }
+  }
+
+  // Modified version of BlockFetcher checksum verification
+  // (See write_global_seqno comment above)
+  if (s.ok() && footer.GetBlockTrailerSize() > 0) {
+    s = VerifyBlockChecksum(footer.checksum_type(), properties_block.data(),
+                            block_size, file->file_name(), handle.offset());
+    if (s.IsCorruption()) {
+      if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
+        std::string tmp_buf(properties_block.data(),
+                            block_fetcher.GetBlockSizeWithTrailer());
+        uint64_t global_seqno_offset =
+            new_table_properties->external_sst_file_global_seqno_offset -
+            handle.offset();
+        EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
+        s = VerifyBlockChecksum(footer.checksum_type(), tmp_buf.data(),
+                                block_size, file->file_name(), handle.offset());
+      }
+    }
+  }
+
+  if (s.ok()) {
+    *table_properties = std::move(new_table_properties);
+  }
+
+  return s;
+}
+
+Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
+                           uint64_t table_magic_number,
+                           const ImmutableOptions& ioptions,
+                           std::unique_ptr<TableProperties>* properties,
+                           MemoryAllocator* memory_allocator,
+                           FilePrefetchBuffer* prefetch_buffer) {
+  BlockHandle block_handle;
+  Footer footer;
+  Status s = FindMetaBlockInFile(file, file_size, table_magic_number, ioptions,
+                                 kPropertiesBlockName, &block_handle,
+                                 memory_allocator, prefetch_buffer, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!block_handle.IsNull()) {
+    s = ReadTablePropertiesHelper(ReadOptions(), block_handle, file,
+                                  prefetch_buffer, footer, ioptions, properties,
+                                  memory_allocator);
+  } else {
+    s = Status::NotFound();
+  }
+  return s;
+}
+
+Status FindOptionalMetaBlock(InternalIterator* meta_index_iter,
+                             const std::string& meta_block_name,
+                             BlockHandle* block_handle) {
+  assert(block_handle != nullptr);
+  meta_index_iter->Seek(meta_block_name);
+  if (meta_index_iter->status().ok()) {
+    if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) {
+      Slice v = meta_index_iter->value();
+      return block_handle->DecodeFrom(&v);
+    } else if (meta_block_name == kPropertiesBlockName) {
+      // Have to try old name for compatibility
+      meta_index_iter->Seek(kPropertiesBlockOldName);
+      if (meta_index_iter->status().ok() && meta_index_iter->Valid() &&
+          meta_index_iter->key() == kPropertiesBlockOldName) {
+        Slice v = meta_index_iter->value();
+        return block_handle->DecodeFrom(&v);
+      }
+    }
+  }
+  // else
+  *block_handle = BlockHandle::NullBlockHandle();
+  return meta_index_iter->status();
+}
+
+Status FindMetaBlock(InternalIterator* meta_index_iter,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle) {
+  Status s =
+      FindOptionalMetaBlock(meta_index_iter, meta_block_name, block_handle);
+  if (s.ok() && block_handle->IsNull()) {
+    return Status::Corruption("Cannot find the meta block", meta_block_name);
+  } else {
+    return s;
+  }
+}
+
+Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
+                                uint64_t file_size, uint64_t table_magic_number,
+                                const ImmutableOptions& ioptions,
+                                BlockContents* metaindex_contents,
+                                MemoryAllocator* memory_allocator,
+                                FilePrefetchBuffer* prefetch_buffer,
+                                Footer* footer_out) {
+  Footer footer;
+  IOOptions opts;
+  auto s = ReadFooterFromFile(opts, file, prefetch_buffer, file_size, &footer,
+                              table_magic_number);
+  if (!s.ok()) {
+    return s;
+  }
+  if (footer_out) {
+    *footer_out = footer;
+  }
+
+  auto metaindex_handle = footer.metaindex_handle();
+  return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(),
+                      metaindex_handle, metaindex_contents, ioptions,
+                      false /* do decompression */, false /*maybe_compressed*/,
+                      BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(),
+                      PersistentCacheOptions::kEmpty, memory_allocator)
+      .ReadBlockContents();
+}
+
+Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size,
+                           uint64_t table_magic_number,
+                           const ImmutableOptions& ioptions,
+                           const std::string& meta_block_name,
+                           BlockHandle* block_handle,
+                           MemoryAllocator* memory_allocator,
+                           FilePrefetchBuffer* prefetch_buffer,
+                           Footer* footer_out) {
+  BlockContents metaindex_contents;
+  auto s = ReadMetaIndexBlockInFile(
+      file, file_size, table_magic_number, ioptions, &metaindex_contents,
+      memory_allocator, prefetch_buffer, footer_out);
+  if (!s.ok()) {
+    return s;
+  }
+  // meta blocks are never compressed. Need to add uncompress logic if we are to
+  // compress it.
+  Block metaindex_block(std::move(metaindex_contents));
+
+  std::unique_ptr<InternalIterator> meta_iter;
+  meta_iter.reset(metaindex_block.NewMetaIterator());
+
+  return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
+}
+
+Status ReadMetaBlock(RandomAccessFileReader* file,
+                     FilePrefetchBuffer* prefetch_buffer, uint64_t file_size,
+                     uint64_t table_magic_number,
+                     const ImmutableOptions& ioptions,
+                     const std::string& meta_block_name, BlockType block_type,
+                     BlockContents* contents,
+                     MemoryAllocator* memory_allocator) {
+  // TableProperties requires special handling because of checksum issues.
+  // Call ReadTableProperties instead for that case.
+  assert(block_type != BlockType::kProperties);
+
+  BlockHandle block_handle;
+  Footer footer;
+  Status status = FindMetaBlockInFile(
+      file, file_size, table_magic_number, ioptions, meta_block_name,
+      &block_handle, memory_allocator, prefetch_buffer, &footer);
+  if (!status.ok()) {
+    return status;
+  }
+
+  return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(),
+                      block_handle, contents, ioptions, false /* decompress */,
+                      false /*maybe_compressed*/, block_type,
+                      UncompressionDict::GetEmptyDict(),
+                      PersistentCacheOptions::kEmpty, memory_allocator)
+      .ReadBlockContents();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/meta_blocks.h b/src/rocksdb/table/meta_blocks.h
new file mode 100644
index 000000000..b867dd01d
--- /dev/null
+++ b/src/rocksdb/table/meta_blocks.h
@@ -0,0 +1,168 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/table_properties_collector.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/memory_allocator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/block_type.h"
+#include "table/format.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class Env;
+class Footer;
+class Logger;
+class RandomAccessFile;
+struct TableProperties;
+
+// Meta block names for metaindex
+extern const std::string kPropertiesBlockName;
+extern const std::string kPropertiesBlockOldName;
+extern const std::string kCompressionDictBlockName;
+extern const std::string kRangeDelBlockName;
+
+class MetaIndexBuilder {
+ public:
+  MetaIndexBuilder(const MetaIndexBuilder&) = delete;
+  MetaIndexBuilder& operator=(const MetaIndexBuilder&) = delete;
+
+  MetaIndexBuilder();
+  void Add(const std::string& key, const BlockHandle& handle);
+
+  // Write all the added key/value pairs to the block and return the contents
+  // of the block.
+  Slice Finish();
+
+ private:
+  // store the sorted key/handle of the metablocks.
+  stl_wrappers::KVMap meta_block_handles_;
+  std::unique_ptr<BlockBuilder> meta_index_block_;
+};
+
+class PropertyBlockBuilder {
+ public:
+  PropertyBlockBuilder(const PropertyBlockBuilder&) = delete;
+  PropertyBlockBuilder& operator=(const PropertyBlockBuilder&) = delete;
+
+  PropertyBlockBuilder();
+
+  void AddTableProperty(const TableProperties& props);
+  void Add(const std::string& key, uint64_t value);
+  void Add(const std::string& key, const std::string& value);
+  void Add(const UserCollectedProperties& user_collected_properties);
+
+  // Write all the added entries to the block and return the block contents
+  Slice Finish();
+
+ private:
+  std::unique_ptr<BlockBuilder> properties_block_;
+  stl_wrappers::KVMap props_;
+};
+
+// Were we encounter any error occurs during user-defined statistics collection,
+// we'll write the warning message to info log.
+void LogPropertiesCollectionError(Logger* info_log, const std::string& method,
+                                  const std::string& name);
+
+// Utility functions help table builder to trigger batch events for user
+// defined property collectors.
+// Return value indicates if there is any error occurred; if error occurred,
+// the warning message will be logged.
+// NotifyCollectTableCollectorsOnAdd() triggers the `Add` event for all
+// property collectors.
+bool NotifyCollectTableCollectorsOnAdd(
+    const Slice& key, const Slice& value, uint64_t file_size,
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log);
+
+void NotifyCollectTableCollectorsOnBlockAdd(
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    uint64_t block_uncomp_bytes, uint64_t block_compressed_bytes_fast,
+    uint64_t block_compressed_bytes_slow);
+
+// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all
+// property collectors. The collected properties will be added to `builder`.
+bool NotifyCollectTableCollectorsOnFinish(
+    const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors,
+    Logger* info_log, PropertyBlockBuilder* builder);
+
+// Read table properties from a file using known BlockHandle.
+// @returns a status to indicate if the operation succeeded. On success,
+//          *table_properties will point to a heap-allocated TableProperties
+//          object, otherwise value of `table_properties` will not be modified.
+Status ReadTablePropertiesHelper(
+    const ReadOptions& ro, const BlockHandle& handle,
+    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
+    const Footer& footer, const ImmutableOptions& ioptions,
+    std::unique_ptr<TableProperties>* table_properties,
+    MemoryAllocator* memory_allocator = nullptr);
+
+// Read table properties from the properties block of a plain table.
+// @returns a status to indicate if the operation succeeded. On success,
+//          *table_properties will point to a heap-allocated TableProperties
+//          object, otherwise value of `table_properties` will not be modified.
+Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
+                           uint64_t table_magic_number,
+                           const ImmutableOptions& ioptions,
+                           std::unique_ptr<TableProperties>* properties,
+                           MemoryAllocator* memory_allocator = nullptr,
+                           FilePrefetchBuffer* prefetch_buffer = nullptr);
+
+// Find the meta block from the meta index block. Returns OK and
+// block_handle->IsNull() if not found.
+Status FindOptionalMetaBlock(InternalIterator* meta_index_iter,
+                             const std::string& meta_block_name,
+                             BlockHandle* block_handle);
+
+// Find the meta block from the meta index block. Returns Corruption if not
+// found.
+Status FindMetaBlock(InternalIterator* meta_index_iter,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle);
+
+// Find the meta block
+Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size,
+                           uint64_t table_magic_number,
+                           const ImmutableOptions& ioptions,
+                           const std::string& meta_block_name,
+                           BlockHandle* block_handle,
+                           MemoryAllocator* memory_allocator = nullptr,
+                           FilePrefetchBuffer* prefetch_buffer = nullptr,
+                           Footer* footer_out = nullptr);
+
+// Read meta block contents
+Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
+                                uint64_t file_size, uint64_t table_magic_number,
+                                const ImmutableOptions& ioptions,
+                                BlockContents* block_contents,
+                                MemoryAllocator* memory_allocator = nullptr,
+                                FilePrefetchBuffer* prefetch_buffer = nullptr,
+                                Footer* footer_out = nullptr);
+
+// Read the specified meta block with name meta_block_name
+// from `file` and initialize `contents` with contents of this block.
+// Return Status::OK in case of success.
+Status ReadMetaBlock(RandomAccessFileReader* file,
+                     FilePrefetchBuffer* prefetch_buffer, uint64_t file_size,
+                     uint64_t table_magic_number,
+                     const ImmutableOptions& ioptions,
+                     const std::string& meta_block_name, BlockType block_type,
+                     BlockContents* contents,
+                     MemoryAllocator* memory_allocator = nullptr);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/mock_table.cc b/src/rocksdb/table/mock_table.cc
new file mode 100644
index 000000000..130889eaa
--- /dev/null
+++ b/src/rocksdb/table/mock_table.cc
@@ -0,0 +1,344 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/mock_table.h"
+
+#include "db/dbformat.h"
+#include "env/composite_env_wrapper.h"
+#include "file/random_access_file_reader.h"
+#include "port/port.h"
+#include "rocksdb/table_properties.h"
+#include "table/get_context.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace mock {
+
+KVVector MakeMockFile(std::initializer_list<KVPair> l) { return KVVector(l); }
+
+void SortKVVector(KVVector* kv_vector, const Comparator* ucmp) {
+  InternalKeyComparator icmp(ucmp);
+  std::sort(kv_vector->begin(), kv_vector->end(),
+            [icmp](KVPair a, KVPair b) -> bool {
+              return icmp.Compare(a.first, b.first) < 0;
+            });
+}
+
+class MockTableReader : public TableReader {
+ public:
+  explicit MockTableReader(const KVVector& table) : table_(table) {}
+
+  InternalIterator* NewIterator(const ReadOptions&,
+                                const SliceTransform* prefix_extractor,
+                                Arena* arena, bool skip_filters,
+                                TableReaderCaller caller,
+                                size_t compaction_readahead_size = 0,
+                                bool allow_unprepared_value = false) override;
+
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             GetContext* get_context, const SliceTransform* prefix_extractor,
+             bool skip_filters = false) override;
+
+  uint64_t ApproximateOffsetOf(const Slice& /*key*/,
+                               TableReaderCaller /*caller*/) override {
+    return 0;
+  }
+
+  uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
+                           TableReaderCaller /*caller*/) override {
+    return 0;
+  }
+
+  size_t ApproximateMemoryUsage() const override { return 0; }
+
+  void SetupForCompaction() override {}
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override;
+
+  ~MockTableReader() {}
+
+ private:
+  const KVVector& table_;
+};
+
+class MockTableIterator : public InternalIterator {
+ public:
+  explicit MockTableIterator(const KVVector& table) : table_(table) {
+    itr_ = table_.end();
+  }
+
+  bool Valid() const override { return itr_ != table_.end(); }
+
+  void SeekToFirst() override { itr_ = table_.begin(); }
+
+  void SeekToLast() override {
+    itr_ = table_.end();
+    --itr_;
+  }
+
+  void Seek(const Slice& target) override {
+    KVPair target_pair(target.ToString(), "");
+    InternalKeyComparator icmp(BytewiseComparator());
+    itr_ = std::lower_bound(table_.begin(), table_.end(), target_pair,
+                            [icmp](KVPair a, KVPair b) -> bool {
+                              return icmp.Compare(a.first, b.first) < 0;
+                            });
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    KVPair target_pair(target.ToString(), "");
+    InternalKeyComparator icmp(BytewiseComparator());
+    itr_ = std::upper_bound(table_.begin(), table_.end(), target_pair,
+                            [icmp](KVPair a, KVPair b) -> bool {
+                              return icmp.Compare(a.first, b.first) < 0;
+                            });
+    Prev();
+  }
+
+  void Next() override { ++itr_; }
+
+  void Prev() override {
+    if (itr_ == table_.begin()) {
+      itr_ = table_.end();
+    } else {
+      --itr_;
+    }
+  }
+
+  Slice key() const override { return Slice(itr_->first); }
+
+  Slice value() const override { return Slice(itr_->second); }
+
+  Status status() const override { return Status::OK(); }
+
+ private:
+  const KVVector& table_;
+  KVVector::const_iterator itr_;
+};
+
+class MockTableBuilder : public TableBuilder {
+ public:
+  MockTableBuilder(uint32_t id, MockTableFileSystem* file_system,
+                   MockTableFactory::MockCorruptionMode corrupt_mode =
+                       MockTableFactory::kCorruptNone,
+                   size_t key_value_size = 1)
+      : id_(id),
+        file_system_(file_system),
+        corrupt_mode_(corrupt_mode),
+        key_value_size_(key_value_size) {
+    table_ = MakeMockFile({});
+  }
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~MockTableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override {
+    if (corrupt_mode_ == MockTableFactory::kCorruptValue) {
+      // Corrupt the value
+      table_.push_back({key.ToString(), value.ToString() + " "});
+      corrupt_mode_ = MockTableFactory::kCorruptNone;
+    } else if (corrupt_mode_ == MockTableFactory::kCorruptKey) {
+      table_.push_back({key.ToString() + " ", value.ToString()});
+      corrupt_mode_ = MockTableFactory::kCorruptNone;
+    } else if (corrupt_mode_ == MockTableFactory::kCorruptReorderKey) {
+      if (prev_key_.empty()) {
+        prev_key_ = key.ToString();
+        prev_value_ = value.ToString();
+      } else {
+        table_.push_back({key.ToString(), value.ToString()});
+        table_.push_back({prev_key_, prev_value_});
+        corrupt_mode_ = MockTableFactory::kCorruptNone;
+      }
+    } else {
+      table_.push_back({key.ToString(), value.ToString()});
+    }
+  }
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override { return Status::OK(); }
+
+  // Return non-ok iff some error happens during IO.
+  IOStatus io_status() const override { return IOStatus::OK(); }
+
+  Status Finish() override {
+    MutexLock lock_guard(&file_system_->mutex);
+    file_system_->files.insert({id_, table_});
+    return Status::OK();
+  }
+
+  void Abandon() override {}
+
+  uint64_t NumEntries() const override { return table_.size(); }
+
+  uint64_t FileSize() const override { return table_.size() * key_value_size_; }
+
+  TableProperties GetTableProperties() const override {
+    return TableProperties();
+  }
+
+  // Get file checksum
+  std::string GetFileChecksum() const override { return kUnknownFileChecksum; }
+  // Get file checksum function name
+  const char* GetFileChecksumFuncName() const override {
+    return kUnknownFileChecksumFuncName;
+  }
+
+ private:
+  uint32_t id_;
+  std::string prev_key_;
+  std::string prev_value_;
+  MockTableFileSystem* file_system_;
+  int corrupt_mode_;
+  KVVector table_;
+  size_t key_value_size_;
+};
+
+InternalIterator* MockTableReader::NewIterator(
+    const ReadOptions&, const SliceTransform* /* prefix_extractor */,
+    Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/,
+    size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
+  return new MockTableIterator(table_);
+}
+
+Status MockTableReader::Get(const ReadOptions&, const Slice& key,
+                            GetContext* get_context,
+                            const SliceTransform* /*prefix_extractor*/,
+                            bool /*skip_filters*/) {
+  std::unique_ptr<MockTableIterator> iter(new MockTableIterator(table_));
+  for (iter->Seek(key); iter->Valid(); iter->Next()) {
+    ParsedInternalKey parsed_key;
+    Status pik_status =
+        ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */);
+    if (!pik_status.ok()) {
+      return pik_status;
+    }
+
+    bool dont_care __attribute__((__unused__));
+    if (!get_context->SaveValue(parsed_key, iter->value(), &dont_care)) {
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties()
+    const {
+  return std::shared_ptr<const TableProperties>(new TableProperties());
+}
+
+MockTableFactory::MockTableFactory()
+    : next_id_(1), corrupt_mode_(MockTableFactory::kCorruptNone) {}
+
+Status MockTableFactory::NewTableReader(
+    const ReadOptions& /*ro*/,
+    const TableReaderOptions& /*table_reader_options*/,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t /*file_size*/,
+    std::unique_ptr<TableReader>* table_reader,
+    bool /*prefetch_index_and_filter_in_cache*/) const {
+  uint32_t id;
+  Status s = GetIDFromFile(file.get(), &id);
+  if (!s.ok()) {
+    return s;
+  }
+
+  MutexLock lock_guard(&file_system_.mutex);
+
+  auto it = file_system_.files.find(id);
+  if (it == file_system_.files.end()) {
+    return Status::IOError("Mock file not found");
+  }
+
+  table_reader->reset(new MockTableReader(it->second));
+
+  return Status::OK();
+}
+
+TableBuilder* MockTableFactory::NewTableBuilder(
+    const TableBuilderOptions& /*table_builder_options*/,
+    WritableFileWriter* file) const {
+  uint32_t id;
+  Status s = GetAndWriteNextID(file, &id);
+  assert(s.ok());
+
+  return new MockTableBuilder(id, &file_system_, corrupt_mode_,
+                              key_value_size_);
+}
+
+Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname,
+                                         KVVector file_contents) {
+  std::unique_ptr<WritableFileWriter> file_writer;
+  Status s = WritableFileWriter::Create(env->GetFileSystem(), fname,
+                                        FileOptions(), &file_writer, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  uint32_t id;
+  s = GetAndWriteNextID(file_writer.get(), &id);
+  if (s.ok()) {
+    file_system_.files.insert({id, std::move(file_contents)});
+  }
+  return s;
+}
+
+Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file,
+                                           uint32_t* next_id) const {
+  *next_id = next_id_.fetch_add(1);
+  char buf[4];
+  EncodeFixed32(buf, *next_id);
+  return file->Append(Slice(buf, 4));
+}
+
+Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file,
+                                       uint32_t* id) const {
+  char buf[4];
+  Slice result;
+  Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr,
+                        Env::IO_TOTAL /* rate_limiter_priority */);
+  assert(result.size() == 4);
+  *id = DecodeFixed32(buf);
+  return s;
+}
+
+void MockTableFactory::AssertSingleFile(const KVVector& file_contents) {
+  ASSERT_EQ(file_system_.files.size(), 1U);
+  ASSERT_EQ(file_contents, file_system_.files.begin()->second);
+}
+
+void MockTableFactory::AssertLatestFiles(
+    const std::vector<KVVector>& files_contents) {
+  ASSERT_GE(file_system_.files.size(), files_contents.size());
+  auto it = file_system_.files.rbegin();
+  for (auto expect = files_contents.rbegin(); expect != files_contents.rend();
+       expect++, it++) {
+    ASSERT_TRUE(it != file_system_.files.rend());
+    if (*expect != it->second) {
+      std::cout << "Wrong content! Content of file, expect:" << std::endl;
+      for (const auto& kv : *expect) {
+        ParsedInternalKey ikey;
+        std::string key, value;
+        std::tie(key, value) = kv;
+        ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */));
+        std::cout << ikey.DebugString(true, false) << " -> " << value
+                  << std::endl;
+      }
+      std::cout << "actual:" << std::endl;
+      for (const auto& kv : it->second) {
+        ParsedInternalKey ikey;
+        std::string key, value;
+        std::tie(key, value) = kv;
+        ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */));
+        std::cout << ikey.DebugString(true, false) << " -> " << value
+                  << std::endl;
+      }
+      FAIL();
+    }
+  }
+}
+
+}  // namespace mock
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/mock_table.h b/src/rocksdb/table/mock_table.h
new file mode 100644
index 000000000..e4850d060
--- /dev/null
+++ b/src/rocksdb/table/mock_table.h
@@ -0,0 +1,94 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/table.h"
+#include "table/internal_iterator.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/kv_map.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace mock {
+using KVPair = std::pair<std::string, std::string>;
+using KVVector = std::vector<KVPair>;
+
+KVVector MakeMockFile(std::initializer_list<KVPair> l = {});
+void SortKVVector(KVVector* kv_vector,
+                  const Comparator* ucmp = BytewiseComparator());
+
+struct MockTableFileSystem {
+  port::Mutex mutex;
+  std::map<uint32_t, KVVector> files;
+};
+
+class MockTableFactory : public TableFactory {
+ public:
+  enum MockCorruptionMode {
+    kCorruptNone,
+    kCorruptKey,
+    kCorruptValue,
+    kCorruptReorderKey,
+  };
+
+  MockTableFactory();
+  static const char* kClassName() { return "MockTable"; }
+  const char* Name() const override { return kClassName(); }
+  using TableFactory::NewTableReader;
+  Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table_reader,
+      bool prefetch_index_and_filter_in_cache = true) const override;
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const override;
+
+  // This function will directly create mock table instead of going through
+  // MockTableBuilder. file_contents has to have a format of <internal_key,
+  // value>. Those key-value pairs will then be inserted into the mock table.
+  Status CreateMockTable(Env* env, const std::string& fname,
+                         KVVector file_contents);
+
+  virtual std::string GetPrintableOptions() const override {
+    return std::string();
+  }
+
+  void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; }
+
+  void SetKeyValueSize(size_t size) { key_value_size_ = size; }
+  // This function will assert that only a single file exists and that the
+  // contents are equal to file_contents
+  void AssertSingleFile(const KVVector& file_contents);
+  void AssertLatestFiles(const std::vector<KVVector>& files_contents);
+
+ private:
+  Status GetAndWriteNextID(WritableFileWriter* file, uint32_t* id) const;
+  Status GetIDFromFile(RandomAccessFileReader* file, uint32_t* id) const;
+
+  mutable MockTableFileSystem file_system_;
+  mutable std::atomic<uint32_t> next_id_;
+  MockCorruptionMode corrupt_mode_;
+
+  size_t key_value_size_ = 1;
+};
+
+}  // namespace mock
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/multiget_context.h b/src/rocksdb/table/multiget_context.h
new file mode 100644
index 000000000..76027a952
--- /dev/null
+++ b/src/rocksdb/table/multiget_context.h
@@ -0,0 +1,402 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <algorithm>
+#include <array>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/lookup_key.h"
+#include "db/merge_context.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/types.h"
+#include "util/async_file_reader.h"
+#include "util/autovector.h"
+#include "util/math.h"
+#include "util/single_thread_executor.h"
+
+namespace ROCKSDB_NAMESPACE {
+class GetContext;
+
+struct KeyContext {
+  const Slice* key;
+  LookupKey* lkey;
+  Slice ukey_with_ts;
+  Slice ukey_without_ts;
+  Slice ikey;
+  ColumnFamilyHandle* column_family;
+  Status* s;
+  MergeContext merge_context;
+  SequenceNumber max_covering_tombstone_seq;
+  bool key_exists;
+  bool is_blob_index;
+  void* cb_arg;
+  PinnableSlice* value;
+  std::string* timestamp;
+  GetContext* get_context;
+
+  KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key,
+             PinnableSlice* val, std::string* ts, Status* stat)
+      : key(&user_key),
+        lkey(nullptr),
+        column_family(col_family),
+        s(stat),
+        max_covering_tombstone_seq(0),
+        key_exists(false),
+        is_blob_index(false),
+        cb_arg(nullptr),
+        value(val),
+        timestamp(ts),
+        get_context(nullptr) {}
+
+  KeyContext() = default;
+};
+
+// The MultiGetContext class is a container for the sorted list of keys that
+// we need to lookup in a batch. Its main purpose is to make batch execution
+// easier by allowing various stages of the MultiGet lookups to operate on
+// subsets of keys, potentially non-contiguous. In order to accomplish this,
+// it defines the following classes -
+//
+// MultiGetContext::Range
+// MultiGetContext::Range::Iterator
+// MultiGetContext::Range::IteratorWrapper
+//
+// Here is an example of how this can be used -
+//
+// {
+//    MultiGetContext ctx(...);
+//    MultiGetContext::Range range = ctx.GetMultiGetRange();
+//
+//    // Iterate to determine some subset of the keys
+//    MultiGetContext::Range::Iterator start = range.begin();
+//    MultiGetContext::Range::Iterator end = ...;
+//
+//    // Make a new range with a subset of keys
+//    MultiGetContext::Range subrange(range, start, end);
+//
+//    // Define an auxillary vector, if needed, to hold additional data for
+//    // each key
+//    std::array<Foo, MultiGetContext::MAX_BATCH_SIZE> aux;
+//
+//    // Iterate over the subrange and the auxillary vector simultaneously
+//    MultiGetContext::Range::Iterator iter = subrange.begin();
+//    for (; iter != subrange.end(); ++iter) {
+//      KeyContext& key = *iter;
+//      Foo& aux_key = aux_iter[iter.index()];
+//      ...
+//    }
+//  }
+class MultiGetContext {
+ public:
+  // Limit the number of keys in a batch to this number. Benchmarks show that
+  // there is negligible benefit for batches exceeding this. Keeping this < 32
+  // simplifies iteration, as well as reduces the amount of stack allocations
+  // that need to be performed
+  static const int MAX_BATCH_SIZE = 32;
+
+  // A bitmask of at least MAX_BATCH_SIZE - 1 bits, so that
+  // Mask{1} << MAX_BATCH_SIZE is well defined
+  using Mask = uint64_t;
+  static_assert(MAX_BATCH_SIZE < sizeof(Mask) * 8);
+
+  MultiGetContext(autovector<KeyContext*, MAX_BATCH_SIZE>* sorted_keys,
+                  size_t begin, size_t num_keys, SequenceNumber snapshot,
+                  const ReadOptions& read_opts, FileSystem* fs,
+                  Statistics* stats)
+      : num_keys_(num_keys),
+        value_mask_(0),
+        value_size_(0),
+        lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf))
+#if USE_COROUTINES
+        ,
+        reader_(fs, stats),
+        executor_(reader_)
+#endif  // USE_COROUTINES
+  {
+    (void)fs;
+    (void)stats;
+    assert(num_keys <= MAX_BATCH_SIZE);
+    if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) {
+      lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]);
+      lookup_key_ptr_ = reinterpret_cast<LookupKey*>(lookup_key_heap_buf.get());
+    }
+
+    for (size_t iter = 0; iter != num_keys_; ++iter) {
+      // autovector may not be contiguous storage, so make a copy
+      sorted_keys_[iter] = (*sorted_keys)[begin + iter];
+      sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter])
+          LookupKey(*sorted_keys_[iter]->key, snapshot, read_opts.timestamp);
+      sorted_keys_[iter]->ukey_with_ts = sorted_keys_[iter]->lkey->user_key();
+      sorted_keys_[iter]->ukey_without_ts = StripTimestampFromUserKey(
+          sorted_keys_[iter]->lkey->user_key(),
+          read_opts.timestamp == nullptr ? 0 : read_opts.timestamp->size());
+      sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key();
+      sorted_keys_[iter]->timestamp = (*sorted_keys)[begin + iter]->timestamp;
+      sorted_keys_[iter]->get_context =
+          (*sorted_keys)[begin + iter]->get_context;
+    }
+  }
+
+  ~MultiGetContext() {
+    for (size_t i = 0; i < num_keys_; ++i) {
+      lookup_key_ptr_[i].~LookupKey();
+    }
+  }
+
+#if USE_COROUTINES
+  SingleThreadExecutor& executor() { return executor_; }
+
+  AsyncFileReader& reader() { return reader_; }
+#endif  // USE_COROUTINES
+
+ private:
+  static const int MAX_LOOKUP_KEYS_ON_STACK = 16;
+  alignas(
+      alignof(LookupKey)) char lookup_key_stack_buf[sizeof(LookupKey) *
+                                                    MAX_LOOKUP_KEYS_ON_STACK];
+  std::array<KeyContext*, MAX_BATCH_SIZE> sorted_keys_;
+  size_t num_keys_;
+  Mask value_mask_;
+  uint64_t value_size_;
+  std::unique_ptr<char[]> lookup_key_heap_buf;
+  LookupKey* lookup_key_ptr_;
+#if USE_COROUTINES
+  AsyncFileReader reader_;
+  SingleThreadExecutor executor_;
+#endif  // USE_COROUTINES
+
+ public:
+  // MultiGetContext::Range - Specifies a range of keys, by start and end index,
+  // from the parent MultiGetContext. Each range contains a bit vector that
+  // indicates whether the corresponding keys need to be processed or skipped.
+  // A Range object can be copy constructed, and the new object inherits the
+  // original Range's bit vector. This is useful for progressively skipping
+  // keys as the lookup goes through various stages. For example, when looking
+  // up keys in the same SST file, a Range is created excluding keys not
+  // belonging to that file. A new Range is then copy constructed and individual
+  // keys are skipped based on bloom filter lookup.
+  class Range {
+   public:
+    // MultiGetContext::Range::Iterator - A forward iterator that iterates over
+    // non-skippable keys in a Range, as well as keys whose final value has been
+    // found. The latter is tracked by MultiGetContext::value_mask_
+    class Iterator {
+     public:
+      // -- iterator traits
+      using self_type = Iterator;
+      using value_type = KeyContext;
+      using reference = KeyContext&;
+      using pointer = KeyContext*;
+      using difference_type = int;
+      using iterator_category = std::forward_iterator_tag;
+
+      Iterator(const Range* range, size_t idx)
+          : range_(range), ctx_(range->ctx_), index_(idx) {
+        while (index_ < range_->end_ &&
+               (Mask{1} << index_) &
+                   (range_->ctx_->value_mask_ | range_->skip_mask_ |
+                    range_->invalid_mask_))
+          index_++;
+      }
+
+      Iterator(const Iterator&) = default;
+
+      Iterator(const Iterator& other, const Range* range)
+          : range_(range), ctx_(other.ctx_), index_(other.index_) {
+        assert(range->ctx_ == other.ctx_);
+      }
+      Iterator& operator=(const Iterator&) = default;
+
+      Iterator& operator++() {
+        while (++index_ < range_->end_ &&
+               (Mask{1} << index_) &
+                   (range_->ctx_->value_mask_ | range_->skip_mask_ |
+                    range_->invalid_mask_))
+          ;
+        return *this;
+      }
+
+      bool operator==(Iterator other) const {
+        assert(range_->ctx_ == other.range_->ctx_);
+        return index_ == other.index_;
+      }
+
+      bool operator!=(Iterator other) const {
+        assert(range_->ctx_ == other.range_->ctx_);
+        return index_ != other.index_;
+      }
+
+      KeyContext& operator*() {
+        assert(index_ < range_->end_ && index_ >= range_->start_);
+        return *(ctx_->sorted_keys_[index_]);
+      }
+
+      KeyContext* operator->() {
+        assert(index_ < range_->end_ && index_ >= range_->start_);
+        return ctx_->sorted_keys_[index_];
+      }
+
+      size_t index() { return index_; }
+
+     private:
+      friend Range;
+      const Range* range_;
+      const MultiGetContext* ctx_;
+      size_t index_;
+    };
+
+    Range(const Range& mget_range, const Iterator& first,
+          const Iterator& last) {
+      ctx_ = mget_range.ctx_;
+      if (first == last) {
+        // This means create an empty range based on mget_range. So just
+        // set start_ and and_ to the same value
+        start_ = mget_range.start_;
+        end_ = start_;
+      } else {
+        start_ = first.index_;
+        end_ = last.index_;
+      }
+      skip_mask_ = mget_range.skip_mask_;
+      invalid_mask_ = mget_range.invalid_mask_;
+      assert(start_ < 64);
+      assert(end_ < 64);
+    }
+
+    Range() = default;
+
+    Iterator begin() const { return Iterator(this, start_); }
+
+    Iterator end() const { return Iterator(this, end_); }
+
+    bool empty() const { return RemainingMask() == 0; }
+
+    void SkipIndex(size_t index) { skip_mask_ |= Mask{1} << index; }
+
+    void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); }
+
+    bool IsKeySkipped(const Iterator& iter) const {
+      return skip_mask_ & (Mask{1} << iter.index_);
+    }
+
+    // Update the value_mask_ in MultiGetContext so its
+    // immediately reflected in all the Range Iterators
+    void MarkKeyDone(Iterator& iter) {
+      ctx_->value_mask_ |= (Mask{1} << iter.index_);
+    }
+
+    bool CheckKeyDone(Iterator& iter) const {
+      return ctx_->value_mask_ & (Mask{1} << iter.index_);
+    }
+
+    uint64_t KeysLeft() const { return BitsSetToOne(RemainingMask()); }
+
+    void AddSkipsFrom(const Range& other) {
+      assert(ctx_ == other.ctx_);
+      skip_mask_ |= other.skip_mask_;
+    }
+
+    uint64_t GetValueSize() { return ctx_->value_size_; }
+
+    void AddValueSize(uint64_t value_size) { ctx_->value_size_ += value_size; }
+
+    MultiGetContext* context() const { return ctx_; }
+
+    Range Suffix(const Range& other) const {
+      size_t other_last = other.FindLastRemaining();
+      size_t my_last = FindLastRemaining();
+
+      if (my_last > other_last) {
+        return Range(*this, Iterator(this, other_last),
+                     Iterator(this, my_last));
+      } else {
+        return Range(*this, begin(), begin());
+      }
+    }
+
+    // The += operator expands the number of keys in this range. The expansion
+    // is always to the right, i.e start of the additional range >= end of
+    // current range. There should be no overlap. Any skipped keys in rhs are
+    // marked as invalid in the invalid_mask_.
+    Range& operator+=(const Range& rhs) {
+      assert(rhs.start_ >= end_);
+      // Check for non-overlapping ranges and adjust invalid_mask_ accordingly
+      if (end_ < rhs.start_) {
+        invalid_mask_ |= RangeMask(end_, rhs.start_);
+        skip_mask_ |= RangeMask(end_, rhs.start_);
+      }
+      start_ = std::min<size_t>(start_, rhs.start_);
+      end_ = std::max<size_t>(end_, rhs.end_);
+      skip_mask_ |= rhs.skip_mask_ & RangeMask(rhs.start_, rhs.end_);
+      invalid_mask_ |= (rhs.invalid_mask_ | rhs.skip_mask_) &
+                       RangeMask(rhs.start_, rhs.end_);
+      assert(start_ < 64);
+      assert(end_ < 64);
+      return *this;
+    }
+
+    // The -= operator removes keys from this range. The removed keys should
+    // come from a range completely overlapping the current range. The removed
+    // keys are marked invalid in the invalid_mask_.
+    Range& operator-=(const Range& rhs) {
+      assert(start_ <= rhs.start_ && end_ >= rhs.end_);
+      skip_mask_ |= (~rhs.skip_mask_ | rhs.invalid_mask_) &
+                    RangeMask(rhs.start_, rhs.end_);
+      invalid_mask_ |= (~rhs.skip_mask_ | rhs.invalid_mask_) &
+                       RangeMask(rhs.start_, rhs.end_);
+      return *this;
+    }
+
+    // Return a complement of the current range
+    Range operator~() {
+      Range res = *this;
+      res.skip_mask_ = ~skip_mask_ & RangeMask(start_, end_);
+      return res;
+    }
+
+   private:
+    friend MultiGetContext;
+    MultiGetContext* ctx_;
+    size_t start_;
+    size_t end_;
+    Mask skip_mask_;
+    Mask invalid_mask_;
+
+    Range(MultiGetContext* ctx, size_t num_keys)
+        : ctx_(ctx),
+          start_(0),
+          end_(num_keys),
+          skip_mask_(0),
+          invalid_mask_(0) {
+      assert(num_keys < 64);
+    }
+
+    static Mask RangeMask(size_t start, size_t end) {
+      return (((Mask{1} << (end - start)) - 1) << start);
+    }
+
+    Mask RemainingMask() const {
+      return (((Mask{1} << end_) - 1) & ~((Mask{1} << start_) - 1) &
+              ~(ctx_->value_mask_ | skip_mask_));
+    }
+
+    size_t FindLastRemaining() const {
+      Mask mask = RemainingMask();
+      size_t index = (mask >>= start_) ? start_ : 0;
+      while (mask >>= 1) {
+        index++;
+      }
+      return index;
+    }
+  };
+
+  // Return the initial range that encompasses all the keys in the batch
+  Range GetMultiGetRange() { return Range(this, num_keys_); }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/persistent_cache_helper.cc b/src/rocksdb/table/persistent_cache_helper.cc
new file mode 100644
index 000000000..eece8100e
--- /dev/null
+++ b/src/rocksdb/table/persistent_cache_helper.cc
@@ -0,0 +1,111 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/persistent_cache_helper.h"
+
+#include "table/block_based/block_based_table_reader.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const PersistentCacheOptions PersistentCacheOptions::kEmpty;
+
+void PersistentCacheHelper::InsertSerialized(
+    const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+    const char* data, const size_t size) {
+  assert(cache_options.persistent_cache);
+  assert(cache_options.persistent_cache->IsCompressed());
+
+  CacheKey key =
+      BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle);
+
+  cache_options.persistent_cache->Insert(key.AsSlice(), data, size)
+      .PermitUncheckedError();
+}
+
+void PersistentCacheHelper::InsertUncompressed(
+    const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+    const BlockContents& contents) {
+  assert(cache_options.persistent_cache);
+  assert(!cache_options.persistent_cache->IsCompressed());
+  // Precondition:
+  // (1) content is cacheable
+  // (2) content is not compressed
+
+  CacheKey key =
+      BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle);
+
+  cache_options.persistent_cache
+      ->Insert(key.AsSlice(), contents.data.data(), contents.data.size())
+      .PermitUncheckedError();
+  ;
+}
+
+Status PersistentCacheHelper::LookupSerialized(
+    const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+    std::unique_ptr<char[]>* out_data, const size_t expected_data_size) {
+#ifdef NDEBUG
+  (void)expected_data_size;
+#endif
+  assert(cache_options.persistent_cache);
+  assert(cache_options.persistent_cache->IsCompressed());
+
+  CacheKey key =
+      BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle);
+
+  size_t size;
+  Status s =
+      cache_options.persistent_cache->Lookup(key.AsSlice(), out_data, &size);
+  if (!s.ok()) {
+    // cache miss
+    RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS);
+    return s;
+  }
+
+  // cache hit
+  // Block-based table is assumed
+  assert(expected_data_size ==
+         handle.size() + BlockBasedTable::kBlockTrailerSize);
+  assert(size == expected_data_size);
+  RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT);
+  return Status::OK();
+}
+
+Status PersistentCacheHelper::LookupUncompressed(
+    const PersistentCacheOptions& cache_options, const BlockHandle& handle,
+    BlockContents* contents) {
+  assert(cache_options.persistent_cache);
+  assert(!cache_options.persistent_cache->IsCompressed());
+  if (!contents) {
+    // We shouldn't lookup in the cache. Either
+    // (1) Nowhere to store
+    return Status::NotFound();
+  }
+
+  CacheKey key =
+      BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle);
+
+  std::unique_ptr<char[]> data;
+  size_t size;
+  Status s =
+      cache_options.persistent_cache->Lookup(key.AsSlice(), &data, &size);
+  if (!s.ok()) {
+    // cache miss
+    RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS);
+    return s;
+  }
+
+  // please note we are potentially comparing compressed data size with
+  // uncompressed data size
+  assert(handle.size() <= size);
+
+  // update stats
+  RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT);
+  // construct result and return
+  *contents = BlockContents(std::move(data), size);
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/persistent_cache_helper.h b/src/rocksdb/table/persistent_cache_helper.h
new file mode 100644
index 000000000..ece339aee
--- /dev/null
+++ b/src/rocksdb/table/persistent_cache_helper.h
@@ -0,0 +1,46 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "monitoring/statistics.h"
+#include "table/format.h"
+#include "table/persistent_cache_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct BlockContents;
+
+// PersistentCacheHelper
+//
+// Encapsulates  some of the helper logic for read and writing from the cache
+class PersistentCacheHelper {
+ public:
+  // Insert block into cache of serialized blocks. Size includes block trailer
+  // (if applicable).
+  static void InsertSerialized(const PersistentCacheOptions& cache_options,
+                               const BlockHandle& handle, const char* data,
+                               const size_t size);
+
+  // Insert block into cache of uncompressed blocks. No block trailer.
+  static void InsertUncompressed(const PersistentCacheOptions& cache_options,
+                                 const BlockHandle& handle,
+                                 const BlockContents& contents);
+
+  // Lookup block from cache of serialized blocks. Size includes block trailer
+  // (if applicable).
+  static Status LookupSerialized(const PersistentCacheOptions& cache_options,
+                                 const BlockHandle& handle,
+                                 std::unique_ptr<char[]>* out_data,
+                                 const size_t expected_data_size);
+
+  // Lookup block from uncompressed cache. No block trailer.
+  static Status LookupUncompressed(const PersistentCacheOptions& cache_options,
+                                   const BlockHandle& handle,
+                                   BlockContents* contents);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/persistent_cache_options.h b/src/rocksdb/table/persistent_cache_options.h
new file mode 100644
index 000000000..b543ab3a3
--- /dev/null
+++ b/src/rocksdb/table/persistent_cache_options.h
@@ -0,0 +1,34 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "cache/cache_key.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/persistent_cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PersistentCacheOptions
+//
+// This describe the caching behavior for page cache
+// This is used to pass the context for caching and the cache handle
+struct PersistentCacheOptions {
+  PersistentCacheOptions() {}
+  explicit PersistentCacheOptions(
+      const std::shared_ptr<PersistentCache>& _persistent_cache,
+      const OffsetableCacheKey& _base_cache_key, Statistics* const _statistics)
+      : persistent_cache(_persistent_cache),
+        base_cache_key(_base_cache_key),
+        statistics(_statistics) {}
+  std::shared_ptr<PersistentCache> persistent_cache;
+  OffsetableCacheKey base_cache_key;
+  Statistics* statistics = nullptr;
+
+  static const PersistentCacheOptions kEmpty;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_bloom.cc b/src/rocksdb/table/plain/plain_table_bloom.cc
new file mode 100644
index 000000000..21441f616
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_bloom.cc
@@ -0,0 +1,78 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/plain/plain_table_bloom.h"
+
+#include <algorithm>
+#include <string>
+
+#include "memory/allocator.h"
+#include "util/dynamic_bloom.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
+  uint32_t num_blocks =
+      (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
+
+  // Make num_blocks an odd number to make sure more bits are involved
+  // when determining which block.
+  if (num_blocks % 2 == 0) {
+    num_blocks++;
+  }
+
+  return num_blocks * (CACHE_LINE_SIZE * 8);
+}
+}  // namespace
+
+PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes)
+    : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {}
+
+void PlainTableBloomV1::SetRawData(char* raw_data, uint32_t total_bits,
+                                   uint32_t num_blocks) {
+  data_ = raw_data;
+  kTotalBits = total_bits;
+  kNumBlocks = num_blocks;
+}
+
+void PlainTableBloomV1::SetTotalBits(Allocator* allocator, uint32_t total_bits,
+                                     uint32_t locality,
+                                     size_t huge_page_tlb_size,
+                                     Logger* logger) {
+  kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
+                              : (total_bits + 7) / 8 * 8;
+  kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
+
+  assert(kNumBlocks > 0 || kTotalBits > 0);
+  assert(kNumProbes > 0);
+
+  uint32_t sz = kTotalBits / 8;
+  if (kNumBlocks > 0) {
+    sz += CACHE_LINE_SIZE - 1;
+  }
+  assert(allocator);
+
+  char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
+  memset(raw, 0, sz);
+  auto cache_line_offset = reinterpret_cast<uintptr_t>(raw) % CACHE_LINE_SIZE;
+  if (kNumBlocks > 0 && cache_line_offset > 0) {
+    raw += CACHE_LINE_SIZE - cache_line_offset;
+  }
+  data_ = raw;
+}
+
+void BloomBlockBuilder::AddKeysHashes(
+    const std::vector<uint32_t>& keys_hashes) {
+  for (auto hash : keys_hashes) {
+    bloom_.AddHash(hash);
+  }
+}
+
+Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); }
+
+const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_bloom.h b/src/rocksdb/table/plain/plain_table_bloom.h
new file mode 100644
index 000000000..460e7ec39
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_bloom.h
@@ -0,0 +1,132 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/bloom_impl.h"
+#include "util/hash.h"
+#include "util/math.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Slice;
+class Allocator;
+class Logger;
+
+// A legacy Bloom filter implementation used by Plain Table db format, for
+// schema backward compatibility. Not for use in new filter applications.
+class PlainTableBloomV1 {
+ public:
+  // allocator: pass allocator to bloom filter, hence trace the usage of memory
+  // total_bits: fixed total bits for the bloom
+  // num_probes: number of hash probes for a single key
+  // locality:  If positive, optimize for cache line locality, 0 otherwise.
+  // hash_func:  customized hash function
+  // huge_page_tlb_size:  if >0, try to allocate bloom bytes from huge page TLB
+  //                      within this page size. Need to reserve huge pages for
+  //                      it to be allocated, like:
+  //                         sysctl -w vm.nr_hugepages=20
+  //                     See linux doc Documentation/vm/hugetlbpage.txt
+  explicit PlainTableBloomV1(uint32_t num_probes = 6);
+  void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+                    uint32_t locality, size_t huge_page_tlb_size,
+                    Logger* logger);
+
+  ~PlainTableBloomV1() {}
+
+  // Assuming single threaded access to this function.
+  void AddHash(uint32_t hash);
+
+  // Multithreaded access to this function is OK
+  bool MayContainHash(uint32_t hash) const;
+
+  void Prefetch(uint32_t hash);
+
+  uint32_t GetNumBlocks() const { return kNumBlocks; }
+
+  Slice GetRawData() const { return Slice(data_, GetTotalBits() / 8); }
+
+  void SetRawData(char* raw_data, uint32_t total_bits, uint32_t num_blocks = 0);
+
+  uint32_t GetTotalBits() const { return kTotalBits; }
+
+  bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
+
+ private:
+  uint32_t kTotalBits;
+  uint32_t kNumBlocks;
+  const uint32_t kNumProbes;
+
+  char* data_;
+
+  static constexpr int LOG2_CACHE_LINE_SIZE =
+      ConstexprFloorLog2(CACHE_LINE_SIZE);
+};
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// local variable is initialized but not referenced
+#pragma warning(disable : 4189)
+#endif
+inline void PlainTableBloomV1::Prefetch(uint32_t h) {
+  if (kNumBlocks != 0) {
+    uint32_t ignored;
+    LegacyLocalityBloomImpl</*ExtraRotates*/ true>::PrepareHashMayMatch(
+        h, kNumBlocks, data_, &ignored, LOG2_CACHE_LINE_SIZE);
+  }
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const {
+  assert(IsInitialized());
+  if (kNumBlocks != 0) {
+    return LegacyLocalityBloomImpl<true>::HashMayMatch(
+        h, kNumBlocks, kNumProbes, data_, LOG2_CACHE_LINE_SIZE);
+  } else {
+    return LegacyNoLocalityBloomImpl::HashMayMatch(h, kTotalBits, kNumProbes,
+                                                   data_);
+  }
+}
+
+inline void PlainTableBloomV1::AddHash(uint32_t h) {
+  assert(IsInitialized());
+  if (kNumBlocks != 0) {
+    LegacyLocalityBloomImpl<true>::AddHash(h, kNumBlocks, kNumProbes, data_,
+                                           LOG2_CACHE_LINE_SIZE);
+  } else {
+    LegacyNoLocalityBloomImpl::AddHash(h, kTotalBits, kNumProbes, data_);
+  }
+}
+
+class BloomBlockBuilder {
+ public:
+  static const std::string kBloomBlock;
+
+  explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {}
+
+  void SetTotalBits(Allocator* allocator, uint32_t total_bits,
+                    uint32_t locality, size_t huge_page_tlb_size,
+                    Logger* logger) {
+    bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size,
+                        logger);
+  }
+
+  uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }
+
+  void AddKeysHashes(const std::vector<uint32_t>& keys_hashes);
+
+  Slice Finish();
+
+ private:
+  PlainTableBloomV1 bloom_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_builder.cc b/src/rocksdb/table/plain/plain_table_builder.cc
new file mode 100644
index 000000000..04723955c
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_builder.cc
@@ -0,0 +1,337 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_builder.h"
+
+#include <assert.h>
+
+#include <limits>
+#include <map>
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_index.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// a utility that helps writing block content to the file
+//   @offset will advance if @block_contents was successfully written.
+//   @block_handle the block handle this particular block.
+IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file,
+                    uint64_t* offset, BlockHandle* block_handle) {
+  block_handle->set_offset(*offset);
+  block_handle->set_size(block_contents.size());
+  IOStatus io_s = file->Append(block_contents);
+
+  if (io_s.ok()) {
+    *offset += block_contents.size();
+  }
+  return io_s;
+}
+
+}  // namespace
+
+// kPlainTableMagicNumber was picked by running
+//    echo rocksdb.table.plain | sha1sum
+// and taking the leading 64 bits.
+extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
+extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
+
+PlainTableBuilder::PlainTableBuilder(
+    const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+    const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+    uint32_t column_family_id, int level_at_creation, WritableFileWriter* file,
+    uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness,
+    uint32_t bloom_bits_per_key, const std::string& column_family_name,
+    uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio,
+    bool store_index_in_file, const std::string& db_id,
+    const std::string& db_session_id, uint64_t file_number)
+    : ioptions_(ioptions),
+      moptions_(moptions),
+      bloom_block_(num_probes),
+      file_(file),
+      bloom_bits_per_key_(bloom_bits_per_key),
+      huge_page_tlb_size_(huge_page_tlb_size),
+      encoder_(encoding_type, user_key_len, moptions.prefix_extractor.get(),
+               index_sparseness),
+      store_index_in_file_(store_index_in_file),
+      prefix_extractor_(moptions.prefix_extractor.get()) {
+  // Build index block and save it in the file if hash_table_ratio > 0
+  if (store_index_in_file_) {
+    assert(hash_table_ratio > 0 || IsTotalOrderMode());
+    index_builder_.reset(new PlainTableIndexBuilder(
+        &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness,
+        hash_table_ratio, huge_page_tlb_size_));
+    properties_
+        .user_collected_properties[PlainTablePropertyNames::kBloomVersion] =
+        "1";  // For future use
+  }
+
+  properties_.fixed_key_len = user_key_len;
+
+  // for plain table, we put all the data in a big chuck.
+  properties_.num_data_blocks = 1;
+  // Fill it later if store_index_in_file_ == true
+  properties_.index_size = 0;
+  properties_.filter_size = 0;
+  // To support roll-back to previous version, now still use version 0 for
+  // plain encoding.
+  properties_.format_version = (encoding_type == kPlain) ? 0 : 1;
+  properties_.column_family_id = column_family_id;
+  properties_.column_family_name = column_family_name;
+  properties_.db_id = db_id;
+  properties_.db_session_id = db_session_id;
+  properties_.db_host_id = ioptions.db_host_id;
+  if (!ReifyDbHostIdProperty(ioptions_.env, &properties_.db_host_id).ok()) {
+    ROCKS_LOG_INFO(ioptions_.logger, "db_host_id property will not be set");
+  }
+  properties_.orig_file_number = file_number;
+  properties_.prefix_extractor_name =
+      moptions_.prefix_extractor != nullptr
+          ? moptions_.prefix_extractor->AsString()
+          : "nullptr";
+
+  std::string val;
+  PutFixed32(&val, static_cast<uint32_t>(encoder_.GetEncodingType()));
+  properties_
+      .user_collected_properties[PlainTablePropertyNames::kEncodingType] = val;
+
+  assert(int_tbl_prop_collector_factories);
+  for (auto& factory : *int_tbl_prop_collector_factories) {
+    assert(factory);
+
+    table_properties_collectors_.emplace_back(
+        factory->CreateIntTblPropCollector(column_family_id,
+                                           level_at_creation));
+  }
+}
+
+PlainTableBuilder::~PlainTableBuilder() {
+  // They are supposed to have been passed to users through Finish()
+  // if the file succeeds.
+  status_.PermitUncheckedError();
+  io_status_.PermitUncheckedError();
+}
+
+void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
+  // temp buffer for metadata bytes between key and value.
+  char meta_bytes_buf[6];
+  size_t meta_bytes_buf_size = 0;
+
+  ParsedInternalKey internal_key;
+  if (!ParseInternalKey(key, &internal_key, false /* log_err_key */)
+           .ok()) {  // TODO
+    assert(false);
+    return;
+  }
+  if (internal_key.type == kTypeRangeDeletion) {
+    status_ = Status::NotSupported("Range deletion unsupported");
+    return;
+  }
+
+  // Store key hash
+  if (store_index_in_file_) {
+    if (moptions_.prefix_extractor == nullptr) {
+      keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key));
+    } else {
+      Slice prefix =
+          moptions_.prefix_extractor->Transform(internal_key.user_key);
+      keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix));
+    }
+  }
+
+  // Write value
+  assert(offset_ <= std::numeric_limits<uint32_t>::max());
+  auto prev_offset = static_cast<uint32_t>(offset_);
+  // Write out the key
+  io_status_ = encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf,
+                                  &meta_bytes_buf_size);
+  if (SaveIndexInFile()) {
+    index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset);
+  }
+
+  // Write value length
+  uint32_t value_size = static_cast<uint32_t>(value.size());
+  if (io_status_.ok()) {
+    char* end_ptr =
+        EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size);
+    assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf));
+    meta_bytes_buf_size = end_ptr - meta_bytes_buf;
+    io_status_ = file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size));
+  }
+
+  // Write value
+  if (io_status_.ok()) {
+    io_status_ = file_->Append(value);
+    offset_ += value_size + meta_bytes_buf_size;
+  }
+
+  if (io_status_.ok()) {
+    properties_.num_entries++;
+    properties_.raw_key_size += key.size();
+    properties_.raw_value_size += value.size();
+    if (internal_key.type == kTypeDeletion ||
+        internal_key.type == kTypeSingleDeletion) {
+      properties_.num_deletions++;
+    } else if (internal_key.type == kTypeMerge) {
+      properties_.num_merge_operands++;
+    }
+  }
+
+  // notify property collectors
+  NotifyCollectTableCollectorsOnAdd(
+      key, value, offset_, table_properties_collectors_, ioptions_.logger);
+  status_ = io_status_;
+}
+
+Status PlainTableBuilder::Finish() {
+  assert(!closed_);
+  closed_ = true;
+
+  properties_.data_size = offset_;
+
+  //  Write the following blocks
+  //  1. [meta block: bloom] - optional
+  //  2. [meta block: index] - optional
+  //  3. [meta block: properties]
+  //  4. [metaindex block]
+  //  5. [footer]
+
+  MetaIndexBuilder meta_index_builer;
+
+  if (store_index_in_file_ && (properties_.num_entries > 0)) {
+    assert(properties_.num_entries <= std::numeric_limits<uint32_t>::max());
+    BlockHandle bloom_block_handle;
+    if (bloom_bits_per_key_ > 0) {
+      bloom_block_.SetTotalBits(
+          &arena_,
+          static_cast<uint32_t>(properties_.num_entries) * bloom_bits_per_key_,
+          ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.logger);
+
+      PutVarint32(&properties_.user_collected_properties
+                       [PlainTablePropertyNames::kNumBloomBlocks],
+                  bloom_block_.GetNumBlocks());
+
+      bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
+
+      Slice bloom_finish_result = bloom_block_.Finish();
+
+      properties_.filter_size = bloom_finish_result.size();
+      io_status_ =
+          WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle);
+
+      if (!io_status_.ok()) {
+        status_ = io_status_;
+        return status_;
+      }
+      meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle);
+    }
+    BlockHandle index_block_handle;
+    Slice index_finish_result = index_builder_->Finish();
+
+    properties_.index_size = index_finish_result.size();
+    io_status_ =
+        WriteBlock(index_finish_result, file_, &offset_, &index_block_handle);
+
+    if (!io_status_.ok()) {
+      status_ = io_status_;
+      return status_;
+    }
+
+    meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
+                          index_block_handle);
+  }
+
+  // Calculate bloom block size and index block size
+  PropertyBlockBuilder property_block_builder;
+  // -- Add basic properties
+  property_block_builder.AddTableProperty(properties_);
+
+  property_block_builder.Add(properties_.user_collected_properties);
+
+  // -- Add user collected properties
+  NotifyCollectTableCollectorsOnFinish(
+      table_properties_collectors_, ioptions_.logger, &property_block_builder);
+
+  // -- Write property block
+  BlockHandle property_block_handle;
+  IOStatus s = WriteBlock(property_block_builder.Finish(), file_, &offset_,
+                          &property_block_handle);
+  if (!s.ok()) {
+    return static_cast<Status>(s);
+  }
+  meta_index_builer.Add(kPropertiesBlockName, property_block_handle);
+
+  // -- write metaindex block
+  BlockHandle metaindex_block_handle;
+  io_status_ = WriteBlock(meta_index_builer.Finish(), file_, &offset_,
+                          &metaindex_block_handle);
+  if (!io_status_.ok()) {
+    status_ = io_status_;
+    return status_;
+  }
+
+  // Write Footer
+  // no need to write out new footer if we're using default checksum
+  FooterBuilder footer;
+  footer.Build(kPlainTableMagicNumber, /* format_version */ 0, offset_,
+               kNoChecksum, metaindex_block_handle);
+  io_status_ = file_->Append(footer.GetSlice());
+  if (io_status_.ok()) {
+    offset_ += footer.GetSlice().size();
+  }
+  status_ = io_status_;
+  return status_;
+}
+
+void PlainTableBuilder::Abandon() { closed_ = true; }
+
+uint64_t PlainTableBuilder::NumEntries() const {
+  return properties_.num_entries;
+}
+
+uint64_t PlainTableBuilder::FileSize() const { return offset_; }
+
+std::string PlainTableBuilder::GetFileChecksum() const {
+  if (file_ != nullptr) {
+    return file_->GetFileChecksum();
+  } else {
+    return kUnknownFileChecksum;
+  }
+}
+
+const char* PlainTableBuilder::GetFileChecksumFuncName() const {
+  if (file_ != nullptr) {
+    return file_->GetFileChecksumFuncName();
+  } else {
+    return kUnknownFileChecksumFuncName;
+  }
+}
+void PlainTableBuilder::SetSeqnoTimeTableProperties(const std::string& string,
+                                                    uint64_t uint_64) {
+  // TODO: storing seqno to time mapping is not yet support for plain table.
+  TableBuilder::SetSeqnoTimeTableProperties(string, uint_64);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_builder.h b/src/rocksdb/table/plain/plain_table_builder.h
new file mode 100644
index 000000000..445491c2a
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_builder.h
@@ -0,0 +1,154 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_index.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/table_builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+class TableBuilder;
+
+// The builder class of PlainTable. For description of PlainTable format
+// See comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableBuilder : public TableBuilder {
+ public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish(). The output file
+  // will be part of level specified by 'level'.  A value of -1 means
+  // that the caller does not know which level the output file will reside.
+  PlainTableBuilder(
+      const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+      const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+      uint32_t column_family_id, int level_at_creation,
+      WritableFileWriter* file, uint32_t user_key_size,
+      EncodingType encoding_type, size_t index_sparseness,
+      uint32_t bloom_bits_per_key, const std::string& column_family_name,
+      uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
+      double hash_table_ratio = 0, bool store_index_in_file = false,
+      const std::string& db_id = "", const std::string& db_session_id = "",
+      uint64_t file_number = 0);
+  // No copying allowed
+  PlainTableBuilder(const PlainTableBuilder&) = delete;
+  void operator=(const PlainTableBuilder&) = delete;
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~PlainTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override { return status_; }
+
+  // Return non-ok iff some error happens during IO.
+  IOStatus io_status() const override { return io_status_; }
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+  TableProperties GetTableProperties() const override { return properties_; }
+
+  bool SaveIndexInFile() const { return store_index_in_file_; }
+
+  // Get file checksum
+  std::string GetFileChecksum() const override;
+
+  // Get file checksum function name
+  const char* GetFileChecksumFuncName() const override;
+
+  void SetSeqnoTimeTableProperties(const std::string& string,
+                                   uint64_t uint_64) override;
+
+ private:
+  Arena arena_;
+  const ImmutableOptions& ioptions_;
+  const MutableCFOptions& moptions_;
+  std::vector<std::unique_ptr<IntTblPropCollector>>
+      table_properties_collectors_;
+
+  BloomBlockBuilder bloom_block_;
+  std::unique_ptr<PlainTableIndexBuilder> index_builder_;
+
+  WritableFileWriter* file_;
+  uint64_t offset_ = 0;
+  uint32_t bloom_bits_per_key_;
+  size_t huge_page_tlb_size_;
+  Status status_;
+  IOStatus io_status_;
+  TableProperties properties_;
+  PlainTableKeyEncoder encoder_;
+
+  bool store_index_in_file_;
+
+  std::vector<uint32_t> keys_or_prefixes_hashes_;
+  bool closed_ = false;  // Either Finish() or Abandon() has been called.
+
+  const SliceTransform* prefix_extractor_;
+
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(ExtractUserKey(target));
+  }
+
+  Slice GetPrefix(const ParsedInternalKey& target) const {
+    return GetPrefixFromUserKey(target.user_key);
+  }
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return prefix_extractor_->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set.
+      // In that case,
+      // it falls back to pure binary search and
+      // total iterator seek is supported.
+      return Slice();
+    }
+  }
+
+  bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_factory.cc b/src/rocksdb/table/plain/plain_table_factory.cc
new file mode 100644
index 000000000..dfe5241a5
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_factory.cc
@@ -0,0 +1,350 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/plain/plain_table_factory.h"
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/plain/plain_table_builder.h"
+#include "table/plain/plain_table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
+    {"user_key_len",
+     {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"bloom_bits_per_key",
+     {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"hash_table_ratio",
+     {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"index_sparseness",
+     {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"huge_page_tlb_size",
+     {offsetof(struct PlainTableOptions, huge_page_tlb_size),
+      OptionType::kSizeT, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"encoding_type",
+     {offsetof(struct PlainTableOptions, encoding_type),
+      OptionType::kEncodingType, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+    {"full_scan_mode",
+     {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean,
+      OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+    {"store_index_in_file",
+     {offsetof(struct PlainTableOptions, store_index_in_file),
+      OptionType::kBoolean, OptionVerificationType::kNormal,
+      OptionTypeFlags::kNone}},
+};
+
+PlainTableFactory::PlainTableFactory(const PlainTableOptions& options)
+    : table_options_(options) {
+  RegisterOptions(&table_options_, &plain_table_type_info);
+}
+
+Status PlainTableFactory::NewTableReader(
+    const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table,
+    bool /*prefetch_index_and_filter_in_cache*/) const {
+  return PlainTableReader::Open(
+      table_reader_options.ioptions, table_reader_options.env_options,
+      table_reader_options.internal_comparator, std::move(file), file_size,
+      table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio,
+      table_options_.index_sparseness, table_options_.huge_page_tlb_size,
+      table_options_.full_scan_mode, table_reader_options.immortal,
+      table_reader_options.prefix_extractor.get());
+}
+
+TableBuilder* PlainTableFactory::NewTableBuilder(
+    const TableBuilderOptions& table_builder_options,
+    WritableFileWriter* file) const {
+  // Ignore the skip_filters flag. PlainTable format is optimized for small
+  // in-memory dbs. The skip_filters optimization is not useful for plain
+  // tables
+  //
+  return new PlainTableBuilder(
+      table_builder_options.ioptions, table_builder_options.moptions,
+      table_builder_options.int_tbl_prop_collector_factories,
+      table_builder_options.column_family_id,
+      table_builder_options.level_at_creation, file,
+      table_options_.user_key_len, table_options_.encoding_type,
+      table_options_.index_sparseness, table_options_.bloom_bits_per_key,
+      table_builder_options.column_family_name, 6,
+      table_options_.huge_page_tlb_size, table_options_.hash_table_ratio,
+      table_options_.store_index_in_file, table_builder_options.db_id,
+      table_builder_options.db_session_id, table_builder_options.cur_file_num);
+}
+
+std::string PlainTableFactory::GetPrintableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "  user_key_len: %u\n",
+           table_options_.user_key_len);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  bloom_bits_per_key: %d\n",
+           table_options_.bloom_bits_per_key);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  hash_table_ratio: %lf\n",
+           table_options_.hash_table_ratio);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  index_sparseness: %" ROCKSDB_PRIszt "\n",
+           table_options_.index_sparseness);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  huge_page_tlb_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.huge_page_tlb_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  encoding_type: %d\n",
+           table_options_.encoding_type);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  full_scan_mode: %d\n",
+           table_options_.full_scan_mode);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  store_index_in_file: %d\n",
+           table_options_.store_index_in_file);
+  ret.append(buffer);
+  return ret;
+}
+
+Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options,
+                                      const std::string& opts_str,
+                                      PlainTableOptions* new_table_options) {
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = false;
+  config_options.ignore_unknown_options = false;
+  config_options.invoke_prepare_options = false;
+  return GetPlainTableOptionsFromString(config_options, table_options, opts_str,
+                                        new_table_options);
+}
+
+Status GetPlainTableOptionsFromString(const ConfigOptions& config_options,
+                                      const PlainTableOptions& table_options,
+                                      const std::string& opts_str,
+                                      PlainTableOptions* new_table_options) {
+  std::unordered_map<std::string, std::string> opts_map;
+  Status s = StringToMap(opts_str, &opts_map);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = GetPlainTableOptionsFromMap(config_options, table_options, opts_map,
+                                  new_table_options);
+  // Translate any errors (NotFound, NotSupported, to InvalidArgument
+  if (s.ok() || s.IsInvalidArgument()) {
+    return s;
+  } else {
+    return Status::InvalidArgument(s.getState());
+  }
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library,
+                                             const std::string& /*arg*/) {
+  // The MemTableRepFactory built-in classes will be either a class
+  // (VectorRepFactory) or a nickname (vector), followed optionally by ":#",
+  // where # is the "size" of the factory.
+  auto AsPattern = [](const std::string& name, const std::string& alt) {
+    auto pattern = ObjectLibrary::PatternEntry(name, true);
+    pattern.AnotherName(alt);
+    pattern.AddNumber(":");
+    return pattern;
+  };
+  library.AddFactory<MemTableRepFactory>(
+      AsPattern(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()),
+      [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+         std::string* /*errmsg*/) {
+        auto colon = uri.find(":");
+        if (colon != std::string::npos) {
+          size_t count = ParseSizeT(uri.substr(colon + 1));
+          guard->reset(new VectorRepFactory(count));
+        } else {
+          guard->reset(new VectorRepFactory());
+        }
+        return guard->get();
+      });
+  library.AddFactory<MemTableRepFactory>(
+      AsPattern(SkipListFactory::kClassName(), SkipListFactory::kNickName()),
+      [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+         std::string* /*errmsg*/) {
+        auto colon = uri.find(":");
+        if (colon != std::string::npos) {
+          size_t lookahead = ParseSizeT(uri.substr(colon + 1));
+          guard->reset(new SkipListFactory(lookahead));
+        } else {
+          guard->reset(new SkipListFactory());
+        }
+        return guard->get();
+      });
+  library.AddFactory<MemTableRepFactory>(
+      AsPattern("HashLinkListRepFactory", "hash_linkedlist"),
+      [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+         std::string* /*errmsg*/) {
+        // Expecting format: hash_linkedlist:<hash_bucket_count>
+        auto colon = uri.find(":");
+        if (colon != std::string::npos) {
+          size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1));
+          guard->reset(NewHashLinkListRepFactory(hash_bucket_count));
+        } else {
+          guard->reset(NewHashLinkListRepFactory());
+        }
+        return guard->get();
+      });
+  library.AddFactory<MemTableRepFactory>(
+      AsPattern("HashSkipListRepFactory", "prefix_hash"),
+      [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+         std::string* /*errmsg*/) {
+        // Expecting format: prefix_hash:<hash_bucket_count>
+        auto colon = uri.find(":");
+        if (colon != std::string::npos) {
+          size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1));
+          guard->reset(NewHashSkipListRepFactory(hash_bucket_count));
+        } else {
+          guard->reset(NewHashSkipListRepFactory());
+        }
+        return guard->get();
+      });
+  library.AddFactory<MemTableRepFactory>(
+      "cuckoo",
+      [](const std::string& /*uri*/,
+         std::unique_ptr<MemTableRepFactory>* /*guard*/, std::string* errmsg) {
+        *errmsg = "cuckoo hash memtable is not supported anymore.";
+        return nullptr;
+      });
+
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+
+Status GetMemTableRepFactoryFromString(
+    const std::string& opts_str, std::unique_ptr<MemTableRepFactory>* result) {
+  ConfigOptions config_options;
+  config_options.ignore_unsupported_options = false;
+  config_options.ignore_unknown_options = false;
+  return MemTableRepFactory::CreateFromString(config_options, opts_str, result);
+}
+
+Status MemTableRepFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::unique_ptr<MemTableRepFactory>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinMemTableRepFactory(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (value.empty()) {
+    // No Id and no options.  Clear the object
+    result->reset();
+    return Status::OK();
+  } else if (id.empty()) {  // We have no Id but have options.  Not good
+    return Status::NotSupported("Cannot reset object ", id);
+  } else {
+#ifndef ROCKSDB_LITE
+    status = NewUniqueObject<MemTableRepFactory>(config_options, id, opt_map,
+                                                 result);
+#else
+    // To make it possible to configure the memtables in LITE mode, the ID
+    // is of the form <name>:<size>, where name is the name of the class and
+    // <size> is the length of the object (e.g. skip_list:10).
+    std::vector<std::string> opts_list = StringSplit(id, ':');
+    if (opts_list.empty() || opts_list.size() > 2 || !opt_map.empty()) {
+      status = Status::InvalidArgument("Can't parse memtable_factory option ",
+                                       value);
+    } else if (opts_list[0] == SkipListFactory::kNickName() ||
+               opts_list[0] == SkipListFactory::kClassName()) {
+      // Expecting format
+      // skip_list:<lookahead>
+      if (opts_list.size() == 2) {
+        size_t lookahead = ParseSizeT(opts_list[1]);
+        result->reset(new SkipListFactory(lookahead));
+      } else {
+        result->reset(new SkipListFactory());
+      }
+    } else if (!config_options.ignore_unsupported_options) {
+      status = Status::NotSupported("Cannot load object in LITE mode ", id);
+    }
+#endif  // ROCKSDB_LITE
+  }
+  return status;
+}
+
+Status MemTableRepFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<MemTableRepFactory>* result) {
+  std::unique_ptr<MemTableRepFactory> factory;
+  Status s = CreateFromString(config_options, value, &factory);
+  if (factory && s.ok()) {
+    result->reset(factory.release());
+  }
+  return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status GetPlainTableOptionsFromMap(
+    const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options, bool input_strings_escaped,
+    bool ignore_unknown_options) {
+  ConfigOptions config_options;
+  config_options.input_strings_escaped = input_strings_escaped;
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  return GetPlainTableOptionsFromMap(config_options, table_options, opts_map,
+                                     new_table_options);
+}
+
+Status GetPlainTableOptionsFromMap(
+    const ConfigOptions& config_options, const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options) {
+  assert(new_table_options);
+  PlainTableFactory ptf(table_options);
+  Status s = ptf.ConfigureFromMap(config_options, opts_map);
+  if (s.ok()) {
+    *new_table_options = *(ptf.GetOptions<PlainTableOptions>());
+  } else {
+    // Restore "new_options" to the default "base_options".
+    *new_table_options = table_options;
+  }
+  return s;
+}
+
+extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) {
+  return new PlainTableFactory(options);
+}
+
+const std::string PlainTablePropertyNames::kEncodingType =
+    "rocksdb.plain.table.encoding.type";
+
+const std::string PlainTablePropertyNames::kBloomVersion =
+    "rocksdb.plain.table.bloom.version";
+
+const std::string PlainTablePropertyNames::kNumBloomBlocks =
+    "rocksdb.plain.table.bloom.numblocks";
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/plain/plain_table_factory.h b/src/rocksdb/table/plain/plain_table_factory.h
new file mode 100644
index 000000000..ce60b9d19
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_factory.h
@@ -0,0 +1,182 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct EnvOptions;
+
+class Status;
+class RandomAccessFile;
+class WritableFile;
+class Table;
+class TableBuilder;
+
+// PlainTableFactory is the entrance function to the PlainTable format of
+// SST files. It returns instances PlainTableBuilder as the builder
+// class and PlainTableReader as the reader class, where the format is
+// actually implemented.
+//
+// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
+// Data is not organized in blocks, which allows fast access. Because of
+// following downsides
+// 1. Data compression is not supported.
+// 2. Data is not checksumed.
+// it is not recommended to use this format on other type of file systems.
+//
+// PlainTable requires fixed length key, configured as a constructor
+// parameter of the factory class. Output file format:
+// +-------------+-----------------+
+// | version     | user_key_length |
+// +------------++------------+-----------------+  <= key1 offset
+// |  encoded key1            | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value1                                     |
+// |                                            |
+// +--------------------------+-------------+---+  <= key2 offset
+// | encoded key2             | value_size  |   |
+// +------------+-------------+-------------+   |
+// | value2                                     |
+// |                                            |
+// |        ......                              |
+// +-----------------+--------------------------+
+//
+// When the key encoding type is kPlain. Key part is encoded as:
+// +------------+--------------------+
+// | [key_size] |  internal key      |
+// +------------+--------------------+
+// for the case of user_key_len = kPlainTableVariableLength case,
+// and simply:
+// +----------------------+
+// |  internal key        |
+// +----------------------+
+// for user_key_len != kPlainTableVariableLength case.
+//
+// If key encoding type is kPrefix. Keys are encoding in this format.
+// There are three ways to encode a key:
+// (1) Full Key
+// +---------------+---------------+-------------------+
+// | Full Key Flag | Full Key Size | Full Internal Key |
+// +---------------+---------------+-------------------+
+// which simply encodes a full key
+//
+// (2) A key shared the same prefix as the previous key, which is encoded as
+//     format of (1).
+// +-------------+-------------+-------------+-------------+------------+
+// | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
+// +-------------+-------------+-------------+-------------+------------+
+// where key is the suffix part of the key, including the internal bytes.
+// the actual key will be constructed by concatenating prefix part of the
+// previous key, with the suffix part of the key here, with sizes given here.
+//
+// (3) A key shared the same prefix as the previous key, which is encoded as
+//     the format of (2).
+// +-----------------+-----------------+------------------------+
+// | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
+// +-----------------+-----------------+------------------------+
+// The key will be constructed by concatenating previous key's prefix (which is
+// also a prefix which the last key encoded in the format of (1)) and the
+// key given here.
+//
+// For example, we for following keys (prefix and suffix are separated by
+// spaces):
+//   0000 0001
+//   0000 00021
+//   0000 0002
+//   00011 00
+//   0002 0001
+// Will be encoded like this:
+//   FK 8 00000001
+//   PF 4 SF 5 00021
+//   SF 4 0002
+//   FK 7 0001100
+//   FK 8 00020001
+// (where FK means full key flag, PF means prefix flag and SF means suffix flag)
+//
+// All those "key flag + key size" shown above are in this format:
+// The 8 bits of the first byte:
+// +----+----+----+----+----+----+----+----+
+// |  Type   |            Size             |
+// +----+----+----+----+----+----+----+----+
+// Type indicates: full key, prefix, or suffix.
+// The last 6 bits are for size. If the size bits are not all 1, it means the
+// size of the key. Otherwise, varint32 is read after this byte. This varint
+// value + 0x3F (the value of all 1) will be the key size.
+//
+// For example, full key with length 16 will be encoded as (binary):
+//     00 010000
+// (00 means full key)
+// and a prefix with 100 bytes will be encoded as:
+//     01 111111    00100101
+//         (63)       (37)
+// (01 means key suffix)
+//
+// All the internal keys above (including kPlain and kPrefix) are encoded in
+// this format:
+// There are two types:
+// (1) normal internal key format
+// +----------- ...... -------------+----+---+---+---+---+---+---+---+
+// |       user key                 |type|      sequence ID          |
+// +----------- ..... --------------+----+---+---+---+---+---+---+---+
+// (2) Special case for keys whose sequence ID is 0 and is value type
+// +----------- ...... -------------+----+
+// |       user key                 |0x80|
+// +----------- ..... --------------+----+
+// To save 7 bytes for the special case where sequence ID = 0.
+//
+//
+class PlainTableFactory : public TableFactory {
+ public:
+  ~PlainTableFactory() {}
+  // user_key_len is the length of the user key. If it is set to be
+  // kPlainTableVariableLength, then it means variable length. Otherwise, all
+  // the keys need to have the fix length of this value. bloom_bits_per_key is
+  // number of bits used for bloom filer per key. hash_table_ratio is
+  // the desired utilization of the hash table used for prefix hashing.
+  // hash_table_ratio = number of prefixes / #buckets in the hash table
+  // hash_table_ratio = 0 means skip hash table but only replying on binary
+  // search.
+  // index_sparseness determines index interval for keys
+  // inside the same prefix. It will be the maximum number of linear search
+  // required after hash and binary search.
+  // index_sparseness = 0 means index for every key.
+  // huge_page_tlb_size determines whether to allocate hash indexes from huge
+  // page TLB and the page size if allocating from there. See comments of
+  // Arena::AllocateAligned() for details.
+  explicit PlainTableFactory(
+      const PlainTableOptions& _table_options = PlainTableOptions());
+
+  // Method to allow CheckedCast to work for this class
+  static const char* kClassName() { return kPlainTableName(); }
+  const char* Name() const override { return kPlainTableName(); }
+  using TableFactory::NewTableReader;
+  Status NewTableReader(const ReadOptions& ro,
+                        const TableReaderOptions& table_reader_options,
+                        std::unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size, std::unique_ptr<TableReader>* table,
+                        bool prefetch_index_and_filter_in_cache) const override;
+
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& table_builder_options,
+      WritableFileWriter* file) const override;
+
+  std::string GetPrintableOptions() const override;
+  static const char kValueTypeSeqId0 = char(~0);
+
+ private:
+  PlainTableOptions table_options_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_index.cc b/src/rocksdb/table/plain/plain_table_index.cc
new file mode 100644
index 000000000..b7e07cfb2
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_index.cc
@@ -0,0 +1,213 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_index.h"
+
+#include <cinttypes>
+
+#include "logging/logging.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
+  assert(num_buckets > 0);
+  return hash % num_buckets;
+}
+}  // namespace
+
+Status PlainTableIndex::InitFromRawData(Slice data) {
+  if (!GetVarint32(&data, &index_size_)) {
+    return Status::Corruption("Couldn't read the index size!");
+  }
+  assert(index_size_ > 0);
+  if (!GetVarint32(&data, &num_prefixes_)) {
+    return Status::Corruption("Couldn't read the index size!");
+  }
+  sub_index_size_ =
+      static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
+
+  char* index_data_begin = const_cast<char*>(data.data());
+  index_ = reinterpret_cast<uint32_t*>(index_data_begin);
+  sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
+  return Status::OK();
+}
+
+PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
+    uint32_t prefix_hash, uint32_t* bucket_value) const {
+  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
+  GetUnaligned(index_ + bucket, bucket_value);
+  if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
+    *bucket_value ^= kSubIndexMask;
+    return kSubindex;
+  }
+  if (*bucket_value >= kMaxFileSize) {
+    return kNoPrefixForBucket;
+  } else {
+    // point directly to the file
+    return kDirectToFile;
+  }
+}
+
+void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
+                                                        uint32_t offset) {
+  if (num_records_in_current_group_ == kNumRecordsPerGroup) {
+    current_group_ = AllocateNewGroup();
+    num_records_in_current_group_ = 0;
+  }
+  auto& new_record = current_group_[num_records_in_current_group_++];
+  new_record.hash = hash;
+  new_record.offset = offset;
+  new_record.next = nullptr;
+}
+
+void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
+                                          uint32_t key_offset) {
+  if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
+    ++num_prefixes_;
+    if (!is_first_record_) {
+      keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+    }
+    num_keys_per_prefix_ = 0;
+    prev_key_prefix_ = key_prefix_slice.ToString();
+    prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
+    due_index_ = true;
+  }
+
+  if (due_index_) {
+    // Add an index key for every kIndexIntervalForSamePrefixKeys keys
+    record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
+    due_index_ = false;
+  }
+
+  num_keys_per_prefix_++;
+  if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
+    due_index_ = true;
+  }
+  is_first_record_ = false;
+}
+
+Slice PlainTableIndexBuilder::Finish() {
+  AllocateIndex();
+  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
+  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
+  BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
+
+  keys_per_prefix_hist_.Add(num_keys_per_prefix_);
+  ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s",
+                 keys_per_prefix_hist_.ToString().c_str());
+
+  // From the temp data structure, populate indexes.
+  return FillIndexes(hash_to_offsets, entries_per_bucket);
+}
+
+void PlainTableIndexBuilder::AllocateIndex() {
+  if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
+    // Fall back to pure binary search if the user fails to specify a prefix
+    // extractor.
+    index_size_ = 1;
+  } else {
+    double hash_table_size_multipier = 1.0 / hash_table_ratio_;
+    index_size_ =
+        static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
+    assert(index_size_ > 0);
+  }
+}
+
+void PlainTableIndexBuilder::BucketizeIndexes(
+    std::vector<IndexRecord*>* hash_to_offsets,
+    std::vector<uint32_t>* entries_per_bucket) {
+  bool first = true;
+  uint32_t prev_hash = 0;
+  size_t num_records = record_list_.GetNumRecords();
+  for (size_t i = 0; i < num_records; i++) {
+    IndexRecord* index_record = record_list_.At(i);
+    uint32_t cur_hash = index_record->hash;
+    if (first || prev_hash != cur_hash) {
+      prev_hash = cur_hash;
+      first = false;
+    }
+    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
+    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
+    index_record->next = prev_bucket_head;
+    (*hash_to_offsets)[bucket] = index_record;
+    (*entries_per_bucket)[bucket]++;
+  }
+
+  sub_index_size_ = 0;
+  for (auto entry_count : *entries_per_bucket) {
+    if (entry_count <= 1) {
+      continue;
+    }
+    // Only buckets with more than 1 entry will have subindex.
+    sub_index_size_ += VarintLength(entry_count);
+    // total bytes needed to store these entries' in-file offsets.
+    sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
+  }
+}
+
+Slice PlainTableIndexBuilder::FillIndexes(
+    const std::vector<IndexRecord*>& hash_to_offsets,
+    const std::vector<uint32_t>& entries_per_bucket) {
+  ROCKS_LOG_DEBUG(ioptions_.logger,
+                  "Reserving %" PRIu32 " bytes for plain table's sub_index",
+                  sub_index_size_);
+  auto total_allocate_size = GetTotalSize();
+  char* allocated = arena_->AllocateAligned(
+      total_allocate_size, huge_page_tlb_size_, ioptions_.logger);
+
+  auto temp_ptr = EncodeVarint32(allocated, index_size_);
+  uint32_t* index =
+      reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
+  char* sub_index = reinterpret_cast<char*>(index + index_size_);
+
+  uint32_t sub_index_offset = 0;
+  for (uint32_t i = 0; i < index_size_; i++) {
+    uint32_t num_keys_for_bucket = entries_per_bucket[i];
+    switch (num_keys_for_bucket) {
+      case 0:
+        // No key for bucket
+        PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
+        break;
+      case 1:
+        // point directly to the file offset
+        PutUnaligned(index + i, hash_to_offsets[i]->offset);
+        break;
+      default:
+        // point to second level indexes.
+        PutUnaligned(index + i,
+                     sub_index_offset | PlainTableIndex::kSubIndexMask);
+        char* prev_ptr = &sub_index[sub_index_offset];
+        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
+        sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
+        char* sub_index_pos = &sub_index[sub_index_offset];
+        IndexRecord* record = hash_to_offsets[i];
+        int j;
+        for (j = num_keys_for_bucket - 1; j >= 0 && record;
+             j--, record = record->next) {
+          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
+        }
+        assert(j == -1 && record == nullptr);
+        sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
+        assert(sub_index_offset <= sub_index_size_);
+        break;
+    }
+  }
+  assert(sub_index_offset == sub_index_size_);
+
+  ROCKS_LOG_DEBUG(ioptions_.logger,
+                  "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
+                  index_size_, sub_index_size_);
+  return Slice(allocated, GetTotalSize());
+}
+
+const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
+    "PlainTableIndexBlock";
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_index.h b/src/rocksdb/table/plain/plain_table_index.h
new file mode 100644
index 000000000..9f5f0eeff
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_index.h
@@ -0,0 +1,248 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// The file contains two classes PlainTableIndex and PlainTableIndexBuilder
+// The two classes implement the index format of PlainTable.
+// For description of PlainTable format, see comments of class
+// PlainTableFactory
+//
+//
+// PlainTableIndex contains buckets size of index_size_, each is a
+// 32-bit integer. The lower 31 bits contain an offset value (explained below)
+// and the first bit of the integer indicates type of the offset.
+//
+// +--------------+------------------------------------------------------+
+// | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
+// +--------------+------------------------------------------------------+
+//
+// Explanation for the "flag bit":
+//
+// 0 indicates that the bucket contains only one prefix (no conflict when
+//   hashing this prefix), whose first row starts from this offset of the
+// file.
+// 1 indicates that the bucket contains more than one prefixes, or there
+//   are too many rows for one prefix so we need a binary search for it. In
+//   this case, the offset indicates the offset of sub_index_ holding the
+//   binary search indexes of keys for those rows. Those binary search indexes
+//   are organized in this way:
+//
+// The first 4 bytes, indicate how many indexes (N) are stored after it. After
+// it, there are N 32-bit integers, each points of an offset of the file,
+// which
+// points to starting of a row. Those offsets need to be guaranteed to be in
+// ascending order so the keys they are pointing to are also in ascending
+// order
+// to make sure we can use them to do binary searches. Below is visual
+// presentation of a bucket.
+//
+// <begin>
+//   number_of_records:  varint32
+//   record 1 file offset:  fixedint32
+//   record 2 file offset:  fixedint32
+//    ....
+//   record N file offset:  fixedint32
+// <end>
+
+// The class loads the index block from a PlainTable SST file, and executes
+// the index lookup.
+// The class is used by PlainTableReader class.
+class PlainTableIndex {
+ public:
+  enum IndexSearchResult {
+    kNoPrefixForBucket = 0,
+    kDirectToFile = 1,
+    kSubindex = 2
+  };
+
+  explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
+
+  PlainTableIndex()
+      : index_size_(0),
+        sub_index_size_(0),
+        num_prefixes_(0),
+        index_(nullptr),
+        sub_index_(nullptr) {}
+
+  // The function that executes the lookup the hash table.
+  // The hash key is `prefix_hash`. The function fills the hash bucket
+  // content in `bucket_value`, which is up to the caller to interpret.
+  IndexSearchResult GetOffset(uint32_t prefix_hash,
+                              uint32_t* bucket_value) const;
+
+  // Initialize data from `index_data`, which points to raw data for
+  // index stored in the SST file.
+  Status InitFromRawData(Slice index_data);
+
+  // Decode the sub index for specific hash bucket.
+  // The `offset` is the value returned as `bucket_value` by GetOffset()
+  // and is only valid when the return value is `kSubindex`.
+  // The return value is the pointer to the starting address of the
+  // sub-index. `upper_bound` is filled with the value indicating how many
+  // entries the sub-index has.
+  const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
+                                              uint32_t* upper_bound) const {
+    const char* index_ptr = &sub_index_[offset];
+    return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
+  }
+
+  uint32_t GetIndexSize() const { return index_size_; }
+
+  uint32_t GetSubIndexSize() const { return sub_index_size_; }
+
+  uint32_t GetNumPrefixes() const { return num_prefixes_; }
+
+  static const uint64_t kMaxFileSize = (1u << 31) - 1;
+  static const uint32_t kSubIndexMask = 0x80000000;
+  static const size_t kOffsetLen = sizeof(uint32_t);
+
+ private:
+  uint32_t index_size_;
+  uint32_t sub_index_size_;
+  uint32_t num_prefixes_;
+
+  uint32_t* index_;
+  char* sub_index_;
+};
+
+// PlainTableIndexBuilder is used to create plain table index.
+// After calling Finish(), it returns Slice, which is usually
+// used either to initialize PlainTableIndex or
+// to save index to sst file.
+// For more details about the index, please refer to:
+// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
+// #wiki-in-memory-index-format
+// The class is used by PlainTableBuilder class.
+class PlainTableIndexBuilder {
+ public:
+  PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions,
+                         const SliceTransform* prefix_extractor,
+                         size_t index_sparseness, double hash_table_ratio,
+                         size_t huge_page_tlb_size)
+      : arena_(arena),
+        ioptions_(ioptions),
+        record_list_(kRecordsPerGroup),
+        is_first_record_(true),
+        due_index_(false),
+        num_prefixes_(0),
+        num_keys_per_prefix_(0),
+        prev_key_prefix_hash_(0),
+        index_sparseness_(index_sparseness),
+        index_size_(0),
+        sub_index_size_(0),
+        prefix_extractor_(prefix_extractor),
+        hash_table_ratio_(hash_table_ratio),
+        huge_page_tlb_size_(huge_page_tlb_size) {}
+
+  void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
+
+  Slice Finish();
+
+  uint32_t GetTotalSize() const {
+    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
+           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
+  }
+
+  static const std::string kPlainTableIndexBlock;
+
+ private:
+  struct IndexRecord {
+    uint32_t hash;    // hash of the prefix
+    uint32_t offset;  // offset of a row
+    IndexRecord* next;
+  };
+
+  // Helper class to track all the index records
+  class IndexRecordList {
+   public:
+    explicit IndexRecordList(size_t num_records_per_group)
+        : kNumRecordsPerGroup(num_records_per_group),
+          current_group_(nullptr),
+          num_records_in_current_group_(num_records_per_group) {}
+
+    ~IndexRecordList() {
+      for (size_t i = 0; i < groups_.size(); i++) {
+        delete[] groups_[i];
+      }
+    }
+
+    void AddRecord(uint32_t hash, uint32_t offset);
+
+    size_t GetNumRecords() const {
+      return (groups_.size() - 1) * kNumRecordsPerGroup +
+             num_records_in_current_group_;
+    }
+    IndexRecord* At(size_t index) {
+      return &(
+          groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
+    }
+
+   private:
+    IndexRecord* AllocateNewGroup() {
+      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
+      groups_.push_back(result);
+      return result;
+    }
+
+    // Each group in `groups_` contains fix-sized records (determined by
+    // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
+    // occurs.
+    const size_t kNumRecordsPerGroup;
+    IndexRecord* current_group_;
+    // List of arrays allocated
+    std::vector<IndexRecord*> groups_;
+    size_t num_records_in_current_group_;
+  };
+
+  void AllocateIndex();
+
+  // Internal helper function to bucket index record list to hash buckets.
+  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
+                        std::vector<uint32_t>* entries_per_bucket);
+
+  // Internal helper class to fill the indexes and bloom filters to internal
+  // data structures.
+  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
+                    const std::vector<uint32_t>& entries_per_bucket);
+
+  Arena* arena_;
+  const ImmutableOptions ioptions_;
+  HistogramImpl keys_per_prefix_hist_;
+  IndexRecordList record_list_;
+  bool is_first_record_;
+  bool due_index_;
+  uint32_t num_prefixes_;
+  uint32_t num_keys_per_prefix_;
+
+  uint32_t prev_key_prefix_hash_;
+  size_t index_sparseness_;
+  uint32_t index_size_;
+  uint32_t sub_index_size_;
+
+  const SliceTransform* prefix_extractor_;
+  double hash_table_ratio_;
+  size_t huge_page_tlb_size_;
+
+  std::string prev_key_prefix_;
+
+  static const size_t kRecordsPerGroup = 256;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_key_coding.cc b/src/rocksdb/table/plain/plain_table_key_coding.cc
new file mode 100644
index 000000000..800d8d76f
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_key_coding.cc
@@ -0,0 +1,509 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "table/plain/plain_table_key_coding.h"
+
+#include <algorithm>
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum PlainTableEntryType : unsigned char {
+  kFullKey = 0,
+  kPrefixFromPreviousKey = 1,
+  kKeySuffix = 2,
+};
+
+namespace {
+
+// Control byte:
+// First two bits indicate type of entry
+// Other bytes are inlined sizes. If all bits are 1 (0x03F), overflow bytes
+// are used. key_size-0x3F will be encoded as a variint32 after this bytes.
+
+const unsigned char kSizeInlineLimit = 0x3F;
+
+// Return 0 for error
+size_t EncodeSize(PlainTableEntryType type, uint32_t key_size,
+                  char* out_buffer) {
+  out_buffer[0] = type << 6;
+
+  if (key_size < static_cast<uint32_t>(kSizeInlineLimit)) {
+    // size inlined
+    out_buffer[0] |= static_cast<char>(key_size);
+    return 1;
+  } else {
+    out_buffer[0] |= kSizeInlineLimit;
+    char* ptr = EncodeVarint32(out_buffer + 1, key_size - kSizeInlineLimit);
+    return ptr - out_buffer;
+  }
+}
+}  // namespace
+
+// Fill bytes_read with number of bytes read.
+inline Status PlainTableKeyDecoder::DecodeSize(uint32_t start_offset,
+                                               PlainTableEntryType* entry_type,
+                                               uint32_t* key_size,
+                                               uint32_t* bytes_read) {
+  Slice next_byte_slice;
+  bool success = file_reader_.Read(start_offset, 1, &next_byte_slice);
+  if (!success) {
+    return file_reader_.status();
+  }
+  *entry_type = static_cast<PlainTableEntryType>(
+      (static_cast<unsigned char>(next_byte_slice[0]) & ~kSizeInlineLimit) >>
+      6);
+  char inline_key_size = next_byte_slice[0] & kSizeInlineLimit;
+  if (inline_key_size < kSizeInlineLimit) {
+    *key_size = inline_key_size;
+    *bytes_read = 1;
+    return Status::OK();
+  } else {
+    uint32_t extra_size;
+    uint32_t tmp_bytes_read;
+    success = file_reader_.ReadVarint32(start_offset + 1, &extra_size,
+                                        &tmp_bytes_read);
+    if (!success) {
+      return file_reader_.status();
+    }
+    assert(tmp_bytes_read > 0);
+    *key_size = kSizeInlineLimit + extra_size;
+    *bytes_read = tmp_bytes_read + 1;
+    return Status::OK();
+  }
+}
+
+IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key,
+                                         WritableFileWriter* file,
+                                         uint64_t* offset, char* meta_bytes_buf,
+                                         size_t* meta_bytes_buf_size) {
+  ParsedInternalKey parsed_key;
+  Status pik_status =
+      ParseInternalKey(key, &parsed_key, false /* log_err_key */);  // TODO
+  if (!pik_status.ok()) {
+    return IOStatus::Corruption(pik_status.getState());
+  }
+
+  Slice key_to_write = key;  // Portion of internal key to write out.
+
+  uint32_t user_key_size = static_cast<uint32_t>(key.size() - 8);
+  if (encoding_type_ == kPlain) {
+    if (fixed_user_key_len_ == kPlainTableVariableLength) {
+      // Write key length
+      char key_size_buf[5];  // tmp buffer for key size as varint32
+      char* ptr = EncodeVarint32(key_size_buf, user_key_size);
+      assert(ptr <= key_size_buf + sizeof(key_size_buf));
+      auto len = ptr - key_size_buf;
+      IOStatus io_s = file->Append(Slice(key_size_buf, len));
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      *offset += len;
+    }
+  } else {
+    assert(encoding_type_ == kPrefix);
+    char size_bytes[12];
+    size_t size_bytes_pos = 0;
+
+    Slice prefix =
+        prefix_extractor_->Transform(Slice(key.data(), user_key_size));
+    if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetUserKey() ||
+        key_count_for_prefix_ % index_sparseness_ == 0) {
+      key_count_for_prefix_ = 1;
+      pre_prefix_.SetUserKey(prefix);
+      size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes);
+      IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos));
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      *offset += size_bytes_pos;
+    } else {
+      key_count_for_prefix_++;
+      if (key_count_for_prefix_ == 2) {
+        // For second key within a prefix, need to encode prefix length
+        size_bytes_pos +=
+            EncodeSize(kPrefixFromPreviousKey,
+                       static_cast<uint32_t>(pre_prefix_.GetUserKey().size()),
+                       size_bytes + size_bytes_pos);
+      }
+      uint32_t prefix_len =
+          static_cast<uint32_t>(pre_prefix_.GetUserKey().size());
+      size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len,
+                                   size_bytes + size_bytes_pos);
+      IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos));
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      *offset += size_bytes_pos;
+      key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len);
+    }
+  }
+
+  // Encode full key
+  // For value size as varint32 (up to 5 bytes).
+  // If the row is of value type with seqId 0, flush the special flag together
+  // in this buffer to safe one file append call, which takes 1 byte.
+  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
+    IOStatus io_s =
+        file->Append(Slice(key_to_write.data(), key_to_write.size() - 8));
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    *offset += key_to_write.size() - 8;
+    meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0;
+    *meta_bytes_buf_size += 1;
+  } else {
+    IOStatus io_s = file->Append(key_to_write);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    *offset += key_to_write.size();
+  }
+
+  return IOStatus::OK();
+}
+
+Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset,
+                                          uint32_t len) {
+  assert(file_offset + len <= file_info_->data_end_offset);
+  return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset),
+               len);
+}
+
+bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len,
+                                       Slice* out) {
+  const uint32_t kPrefetchSize = 256u;
+
+  // Try to read from buffers.
+  for (uint32_t i = 0; i < num_buf_; i++) {
+    Buffer* buffer = buffers_[num_buf_ - 1 - i].get();
+    if (file_offset >= buffer->buf_start_offset &&
+        file_offset + len <= buffer->buf_start_offset + buffer->buf_len) {
+      *out = GetFromBuffer(buffer, file_offset, len);
+      return true;
+    }
+  }
+
+  Buffer* new_buffer;
+  // Data needed is not in any of the buffer. Allocate a new buffer.
+  if (num_buf_ < buffers_.size()) {
+    // Add a new buffer
+    new_buffer = new Buffer();
+    buffers_[num_buf_++].reset(new_buffer);
+  } else {
+    // Now simply replace the last buffer. Can improve the placement policy
+    // if needed.
+    new_buffer = buffers_[num_buf_ - 1].get();
+  }
+
+  assert(file_offset + len <= file_info_->data_end_offset);
+  uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset,
+                                   std::max(kPrefetchSize, len));
+  if (size_to_read > new_buffer->buf_capacity) {
+    new_buffer->buf.reset(new char[size_to_read]);
+    new_buffer->buf_capacity = size_to_read;
+    new_buffer->buf_len = 0;
+  }
+  Slice read_result;
+  // TODO: rate limit plain table reads.
+  Status s =
+      file_info_->file->Read(IOOptions(), file_offset, size_to_read,
+                             &read_result, new_buffer->buf.get(), nullptr,
+                             Env::IO_TOTAL /* rate_limiter_priority */);
+  if (!s.ok()) {
+    status_ = s;
+    return false;
+  }
+  new_buffer->buf_start_offset = file_offset;
+  new_buffer->buf_len = size_to_read;
+  *out = GetFromBuffer(new_buffer, file_offset, len);
+  return true;
+}
+
+inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out,
+                                               uint32_t* bytes_read) {
+  if (file_info_->is_mmap_mode) {
+    const char* start = file_info_->file_data.data() + offset;
+    const char* limit =
+        file_info_->file_data.data() + file_info_->data_end_offset;
+    const char* key_ptr = GetVarint32Ptr(start, limit, out);
+    assert(key_ptr != nullptr);
+    *bytes_read = static_cast<uint32_t>(key_ptr - start);
+    return true;
+  } else {
+    return ReadVarint32NonMmap(offset, out, bytes_read);
+  }
+}
+
+bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out,
+                                               uint32_t* bytes_read) {
+  const char* start;
+  const char* limit;
+  const uint32_t kMaxVarInt32Size = 6u;
+  uint32_t bytes_to_read =
+      std::min(file_info_->data_end_offset - offset, kMaxVarInt32Size);
+  Slice bytes;
+  if (!Read(offset, bytes_to_read, &bytes)) {
+    return false;
+  }
+  start = bytes.data();
+  limit = bytes.data() + bytes.size();
+
+  const char* key_ptr = GetVarint32Ptr(start, limit, out);
+  *bytes_read =
+      (key_ptr != nullptr) ? static_cast<uint32_t>(key_ptr - start) : 0;
+  return true;
+}
+
+Status PlainTableKeyDecoder::ReadInternalKey(
+    uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key,
+    uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key) {
+  Slice tmp_slice;
+  bool success = file_reader_.Read(file_offset, user_key_size + 1, &tmp_slice);
+  if (!success) {
+    return file_reader_.status();
+  }
+  if (tmp_slice[user_key_size] == PlainTableFactory::kValueTypeSeqId0) {
+    // Special encoding for the row with seqID=0
+    parsed_key->user_key = Slice(tmp_slice.data(), user_key_size);
+    parsed_key->sequence = 0;
+    parsed_key->type = kTypeValue;
+    *bytes_read += user_key_size + 1;
+    *internal_key_valid = false;
+  } else {
+    success = file_reader_.Read(file_offset, user_key_size + 8, internal_key);
+    if (!success) {
+      return file_reader_.status();
+    }
+    *internal_key_valid = true;
+    Status pik_status = ParseInternalKey(*internal_key, parsed_key,
+                                         false /* log_err_key */);  // TODO
+    if (!pik_status.ok()) {
+      return Status::Corruption(
+          Slice("Corrupted key found during next key read. "),
+          pik_status.getState());
+    }
+    *bytes_read += user_key_size + 8;
+  }
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset,
+                                                  ParsedInternalKey* parsed_key,
+                                                  Slice* internal_key,
+                                                  uint32_t* bytes_read,
+                                                  bool* /*seekable*/) {
+  uint32_t user_key_size = 0;
+  Status s;
+  if (fixed_user_key_len_ != kPlainTableVariableLength) {
+    user_key_size = fixed_user_key_len_;
+  } else {
+    uint32_t tmp_size = 0;
+    uint32_t tmp_read;
+    bool success =
+        file_reader_.ReadVarint32(start_offset, &tmp_size, &tmp_read);
+    if (!success) {
+      return file_reader_.status();
+    }
+    assert(tmp_read > 0);
+    user_key_size = tmp_size;
+    *bytes_read = tmp_read;
+  }
+  // dummy initial value to avoid compiler complain
+  bool decoded_internal_key_valid = true;
+  Slice decoded_internal_key;
+  s = ReadInternalKey(start_offset + *bytes_read, user_key_size, parsed_key,
+                      bytes_read, &decoded_internal_key_valid,
+                      &decoded_internal_key);
+  if (!s.ok()) {
+    return s;
+  }
+  if (!file_reader_.file_info()->is_mmap_mode) {
+    cur_key_.SetInternalKey(*parsed_key);
+    parsed_key->user_key =
+        Slice(cur_key_.GetInternalKey().data(), user_key_size);
+    if (internal_key != nullptr) {
+      *internal_key = cur_key_.GetInternalKey();
+    }
+  } else if (internal_key != nullptr) {
+    if (decoded_internal_key_valid) {
+      *internal_key = decoded_internal_key;
+    } else {
+      // Need to copy out the internal key
+      cur_key_.SetInternalKey(*parsed_key);
+      *internal_key = cur_key_.GetInternalKey();
+    }
+  }
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextPrefixEncodingKey(
+    uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key,
+    uint32_t* bytes_read, bool* seekable) {
+  PlainTableEntryType entry_type;
+
+  bool expect_suffix = false;
+  Status s;
+  do {
+    uint32_t size = 0;
+    // dummy initial value to avoid compiler complain
+    bool decoded_internal_key_valid = true;
+    uint32_t my_bytes_read = 0;
+    s = DecodeSize(start_offset + *bytes_read, &entry_type, &size,
+                   &my_bytes_read);
+    if (!s.ok()) {
+      return s;
+    }
+    if (my_bytes_read == 0) {
+      return Status::Corruption("Unexpected EOF when reading size of the key");
+    }
+    *bytes_read += my_bytes_read;
+
+    switch (entry_type) {
+      case kFullKey: {
+        expect_suffix = false;
+        Slice decoded_internal_key;
+        s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+                            bytes_read, &decoded_internal_key_valid,
+                            &decoded_internal_key);
+        if (!s.ok()) {
+          return s;
+        }
+        if (!file_reader_.file_info()->is_mmap_mode ||
+            (internal_key != nullptr && !decoded_internal_key_valid)) {
+          // In non-mmap mode, always need to make a copy of keys returned to
+          // users, because after reading value for the key, the key might
+          // be invalid.
+          cur_key_.SetInternalKey(*parsed_key);
+          saved_user_key_ = cur_key_.GetUserKey();
+          if (!file_reader_.file_info()->is_mmap_mode) {
+            parsed_key->user_key =
+                Slice(cur_key_.GetInternalKey().data(), size);
+          }
+          if (internal_key != nullptr) {
+            *internal_key = cur_key_.GetInternalKey();
+          }
+        } else {
+          if (internal_key != nullptr) {
+            *internal_key = decoded_internal_key;
+          }
+          saved_user_key_ = parsed_key->user_key;
+        }
+        break;
+      }
+      case kPrefixFromPreviousKey: {
+        if (seekable != nullptr) {
+          *seekable = false;
+        }
+        prefix_len_ = size;
+        assert(prefix_extractor_ == nullptr ||
+               prefix_extractor_->Transform(saved_user_key_).size() ==
+                   prefix_len_);
+        // Need read another size flag for suffix
+        expect_suffix = true;
+        break;
+      }
+      case kKeySuffix: {
+        expect_suffix = false;
+        if (seekable != nullptr) {
+          *seekable = false;
+        }
+
+        Slice tmp_slice;
+        s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+                            bytes_read, &decoded_internal_key_valid,
+                            &tmp_slice);
+        if (!s.ok()) {
+          return s;
+        }
+        if (!file_reader_.file_info()->is_mmap_mode) {
+          // In non-mmap mode, we need to make a copy of keys returned to
+          // users, because after reading value for the key, the key might
+          // be invalid.
+          // saved_user_key_ points to cur_key_. We are making a copy of
+          // the prefix part to another string, and construct the current
+          // key from the prefix part and the suffix part back to cur_key_.
+          std::string tmp =
+              Slice(saved_user_key_.data(), prefix_len_).ToString();
+          cur_key_.Reserve(prefix_len_ + size);
+          cur_key_.SetInternalKey(tmp, *parsed_key);
+          parsed_key->user_key =
+              Slice(cur_key_.GetInternalKey().data(), prefix_len_ + size);
+          saved_user_key_ = cur_key_.GetUserKey();
+        } else {
+          cur_key_.Reserve(prefix_len_ + size);
+          cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_),
+                                  *parsed_key);
+        }
+        parsed_key->user_key = cur_key_.GetUserKey();
+        if (internal_key != nullptr) {
+          *internal_key = cur_key_.GetInternalKey();
+        }
+        break;
+      }
+      default:
+        return Status::Corruption("Un-identified size flag.");
+    }
+  } while (expect_suffix);  // Another round if suffix is expected.
+  return Status::OK();
+}
+
+Status PlainTableKeyDecoder::NextKey(uint32_t start_offset,
+                                     ParsedInternalKey* parsed_key,
+                                     Slice* internal_key, Slice* value,
+                                     uint32_t* bytes_read, bool* seekable) {
+  assert(value != nullptr);
+  Status s = NextKeyNoValue(start_offset, parsed_key, internal_key, bytes_read,
+                            seekable);
+  if (s.ok()) {
+    assert(bytes_read != nullptr);
+    uint32_t value_size;
+    uint32_t value_size_bytes;
+    bool success = file_reader_.ReadVarint32(start_offset + *bytes_read,
+                                             &value_size, &value_size_bytes);
+    if (!success) {
+      return file_reader_.status();
+    }
+    if (value_size_bytes == 0) {
+      return Status::Corruption(
+          "Unexpected EOF when reading the next value's size.");
+    }
+    *bytes_read += value_size_bytes;
+    success = file_reader_.Read(start_offset + *bytes_read, value_size, value);
+    if (!success) {
+      return file_reader_.status();
+    }
+    *bytes_read += value_size;
+  }
+  return s;
+}
+
+Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset,
+                                            ParsedInternalKey* parsed_key,
+                                            Slice* internal_key,
+                                            uint32_t* bytes_read,
+                                            bool* seekable) {
+  *bytes_read = 0;
+  if (seekable != nullptr) {
+    *seekable = true;
+  }
+  if (encoding_type_ == kPlain) {
+    return NextPlainEncodingKey(start_offset, parsed_key, internal_key,
+                                bytes_read, seekable);
+  } else {
+    assert(encoding_type_ == kPrefix);
+    return NextPrefixEncodingKey(start_offset, parsed_key, internal_key,
+                                 bytes_read, seekable);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LIT
diff --git a/src/rocksdb/table/plain/plain_table_key_coding.h b/src/rocksdb/table/plain/plain_table_key_coding.h
new file mode 100644
index 000000000..9cda7df32
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_key_coding.h
@@ -0,0 +1,201 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <array>
+
+#include "rocksdb/slice.h"
+#include "table/plain/plain_table_reader.h"
+
+// The file contains three helper classes of PlainTable format,
+// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
+// These classes issue the lowest level of operations of PlainTable.
+// Actual data format of the key is documented in comments of class
+// PlainTableFactory.
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFile;
+struct ParsedInternalKey;
+struct PlainTableReaderFileInfo;
+enum PlainTableEntryType : unsigned char;
+
+// Helper class for PlainTable format to write out a key to an output file
+// The class is used in PlainTableBuilder.
+class PlainTableKeyEncoder {
+ public:
+  explicit PlainTableKeyEncoder(EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor,
+                                size_t index_sparseness)
+      : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor),
+        index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
+        key_count_for_prefix_(0) {}
+  // key: the key to write out, in the format of internal key.
+  // file: the output file to write out
+  // offset: offset in the file. Needs to be updated after appending bytes
+  //         for the key
+  // meta_bytes_buf: buffer for extra meta bytes
+  // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
+  //                      if meta_bytes_buf is updated.
+  IOStatus AppendKey(const Slice& key, WritableFileWriter* file,
+                     uint64_t* offset, char* meta_bytes_buf,
+                     size_t* meta_bytes_buf_size);
+
+  // Return actual encoding type to be picked
+  EncodingType GetEncodingType() { return encoding_type_; }
+
+ private:
+  EncodingType encoding_type_;
+  uint32_t fixed_user_key_len_;
+  const SliceTransform* prefix_extractor_;
+  const size_t index_sparseness_;
+  size_t key_count_for_prefix_;
+  IterKey pre_prefix_;
+};
+
+// The class does raw file reads for PlainTableReader.
+// It hides whether it is a mmap-read, or a non-mmap read.
+// The class is implemented in a way to favor the performance of mmap case.
+// The class is used by PlainTableReader.
+class PlainTableFileReader {
+ public:
+  explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
+      : file_info_(_file_info), num_buf_(0) {}
+
+  ~PlainTableFileReader() {
+    // Should fix.
+    status_.PermitUncheckedError();
+  }
+
+  // In mmaped mode, the results point to mmaped area of the file, which
+  // means it is always valid before closing the file.
+  // In non-mmap mode, the results point to an internal buffer. If the caller
+  // makes another read call, the results may not be valid. So callers should
+  // make a copy when needed.
+  // In order to save read calls to files, we keep two internal buffers:
+  // the first read and the most recent read. This is efficient because it
+  // columns these two common use cases:
+  // (1) hash index only identify one location, we read the key to verify
+  //     the location, and read key and value if it is the right location.
+  // (2) after hash index checking, we identify two locations (because of
+  //     hash bucket conflicts), we binary search the two location to see
+  //     which one is what we need and start to read from the location.
+  // These two most common use cases will be covered by the two buffers
+  // so that we don't need to re-read the same location.
+  // Currently we keep a fixed size buffer. If a read doesn't exactly fit
+  // the buffer, we replace the second buffer with the location user reads.
+  //
+  // If return false, status code is stored in status_.
+  bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
+    if (file_info_->is_mmap_mode) {
+      assert(file_offset + len <= file_info_->data_end_offset);
+      *out = Slice(file_info_->file_data.data() + file_offset, len);
+      return true;
+    } else {
+      return ReadNonMmap(file_offset, len, out);
+    }
+  }
+
+  // If return false, status code is stored in status_.
+  bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
+
+  // *bytes_read = 0 means eof. false means failure and status is saved
+  // in status_. Not directly returning Status to save copying status
+  // object to map previous performance of mmap mode.
+  inline bool ReadVarint32(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
+                           uint32_t* bytes_read);
+
+  Status status() const { return status_; }
+
+  const PlainTableReaderFileInfo* file_info() { return file_info_; }
+
+ private:
+  const PlainTableReaderFileInfo* file_info_;
+
+  struct Buffer {
+    Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
+    std::unique_ptr<char[]> buf;
+    uint32_t buf_start_offset;
+    uint32_t buf_len;
+    uint32_t buf_capacity;
+  };
+
+  // Keep buffers for two recent reads.
+  std::array<std::unique_ptr<Buffer>, 2> buffers_;
+  uint32_t num_buf_;
+  Status status_;
+
+  Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
+};
+
+// A helper class to decode keys from input buffer
+// The class is used by PlainTableBuilder.
+class PlainTableKeyDecoder {
+ public:
+  explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
+                                EncodingType encoding_type,
+                                uint32_t user_key_len,
+                                const SliceTransform* prefix_extractor)
+      : file_reader_(file_info),
+        encoding_type_(encoding_type),
+        prefix_len_(0),
+        fixed_user_key_len_(user_key_len),
+        prefix_extractor_(prefix_extractor),
+        in_prefix_(false) {}
+
+  // Find the next key.
+  // start: char array where the key starts.
+  // limit: boundary of the char array
+  // parsed_key: the output of the result key
+  // internal_key: if not null, fill with the output of the result key in
+  //               un-parsed format
+  // bytes_read: how many bytes read from start. Output
+  // seekable: whether key can be read from this place. Used when building
+  //           indexes. Output.
+  Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                 Slice* internal_key, Slice* value, uint32_t* bytes_read,
+                 bool* seekable = nullptr);
+
+  Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                        Slice* internal_key, uint32_t* bytes_read,
+                        bool* seekable = nullptr);
+
+  PlainTableFileReader file_reader_;
+  EncodingType encoding_type_;
+  uint32_t prefix_len_;
+  uint32_t fixed_user_key_len_;
+  Slice saved_user_key_;
+  IterKey cur_key_;
+  const SliceTransform* prefix_extractor_;
+  bool in_prefix_;
+
+ private:
+  Status NextPlainEncodingKey(uint32_t start_offset,
+                              ParsedInternalKey* parsed_key,
+                              Slice* internal_key, uint32_t* bytes_read,
+                              bool* seekable = nullptr);
+  Status NextPrefixEncodingKey(uint32_t start_offset,
+                               ParsedInternalKey* parsed_key,
+                               Slice* internal_key, uint32_t* bytes_read,
+                               bool* seekable = nullptr);
+  Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
+                         ParsedInternalKey* parsed_key, uint32_t* bytes_read,
+                         bool* internal_key_valid, Slice* internal_key);
+  inline Status DecodeSize(uint32_t start_offset,
+                           PlainTableEntryType* entry_type, uint32_t* key_size,
+                           uint32_t* bytes_read);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_reader.cc b/src/rocksdb/table/plain/plain_table_reader.cc
new file mode 100644
index 000000000..6ce3d0ab9
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_reader.cc
@@ -0,0 +1,765 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "table/plain/plain_table_reader.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "memory/arena.h"
+#include "monitoring/histogram.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block.h"
+#include "table/block_based/filter_block.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Safely getting a uint32_t element from a char array, where, starting from
+// `base`, every 4 bytes are considered as an fixed 32 bit integer.
+inline uint32_t GetFixed32Element(const char* base, size_t offset) {
+  return DecodeFixed32(base + offset * sizeof(uint32_t));
+}
+}  // namespace
+
+// Iterator to iterate IndexedTable
+class PlainTableIterator : public InternalIterator {
+ public:
+  explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
+  // No copying allowed
+  PlainTableIterator(const PlainTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+
+  ~PlainTableIterator() override;
+
+  bool Valid() const override;
+
+  void SeekToFirst() override;
+
+  void SeekToLast() override;
+
+  void Seek(const Slice& target) override;
+
+  void SeekForPrev(const Slice& target) override;
+
+  void Next() override;
+
+  void Prev() override;
+
+  Slice key() const override;
+
+  Slice value() const override;
+
+  Status status() const override;
+
+ private:
+  PlainTableReader* table_;
+  PlainTableKeyDecoder decoder_;
+  bool use_prefix_seek_;
+  uint32_t offset_;
+  uint32_t next_offset_;
+  Slice key_;
+  Slice value_;
+  Status status_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+PlainTableReader::PlainTableReader(
+    const ImmutableOptions& ioptions,
+    std::unique_ptr<RandomAccessFileReader>&& file,
+    const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
+    EncodingType encoding_type, uint64_t file_size,
+    const TableProperties* table_properties,
+    const SliceTransform* prefix_extractor)
+    : internal_comparator_(icomparator),
+      encoding_type_(encoding_type),
+      full_scan_mode_(false),
+      user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
+      prefix_extractor_(prefix_extractor),
+      enable_bloom_(false),
+      bloom_(6),
+      file_info_(std::move(file), storage_options,
+                 static_cast<uint32_t>(table_properties->data_size)),
+      ioptions_(ioptions),
+      file_size_(file_size),
+      table_properties_(nullptr) {}
+
+PlainTableReader::~PlainTableReader() {
+  // Should fix?
+  status_.PermitUncheckedError();
+}
+
+Status PlainTableReader::Open(
+    const ImmutableOptions& ioptions, const EnvOptions& env_options,
+    const InternalKeyComparator& internal_comparator,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
+    double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
+    bool full_scan_mode, const bool immortal_table,
+    const SliceTransform* prefix_extractor) {
+  if (file_size > PlainTableIndex::kMaxFileSize) {
+    return Status::NotSupported("File is too large for PlainTableReader!");
+  }
+
+  std::unique_ptr<TableProperties> props;
+  auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                               ioptions, &props);
+  if (!s.ok()) {
+    return s;
+  }
+
+  assert(hash_table_ratio >= 0.0);
+  auto& user_props = props->user_collected_properties;
+  auto prefix_extractor_in_file = props->prefix_extractor_name;
+
+  if (!full_scan_mode &&
+      !prefix_extractor_in_file.empty() /* old version sst file*/
+      && prefix_extractor_in_file != "nullptr") {
+    if (!prefix_extractor) {
+      return Status::InvalidArgument(
+          "Prefix extractor is missing when opening a PlainTable built "
+          "using a prefix extractor");
+    } else if (prefix_extractor_in_file != prefix_extractor->AsString()) {
+      return Status::InvalidArgument(
+          "Prefix extractor given doesn't match the one used to build "
+          "PlainTable");
+    }
+  }
+
+  EncodingType encoding_type = kPlain;
+  auto encoding_type_prop =
+      user_props.find(PlainTablePropertyNames::kEncodingType);
+  if (encoding_type_prop != user_props.end()) {
+    encoding_type = static_cast<EncodingType>(
+        DecodeFixed32(encoding_type_prop->second.c_str()));
+  }
+
+  std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
+      ioptions, std::move(file), env_options, internal_comparator,
+      encoding_type, file_size, props.get(), prefix_extractor));
+
+  s = new_reader->MmapDataIfNeeded();
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!full_scan_mode) {
+    s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
+                                  hash_table_ratio, index_sparseness,
+                                  huge_page_tlb_size);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    // Flag to indicate it is a full scan mode so that none of the indexes
+    // can be used.
+    new_reader->full_scan_mode_ = true;
+  }
+  // PopulateIndex can add to the props, so don't store them until now
+  new_reader->table_properties_ = std::move(props);
+
+  if (immortal_table && new_reader->file_info_.is_mmap_mode) {
+    new_reader->dummy_cleanable_.reset(new Cleanable());
+  }
+
+  *table_reader = std::move(new_reader);
+  return s;
+}
+
+void PlainTableReader::SetupForCompaction() {}
+
+InternalIterator* PlainTableReader::NewIterator(
+    const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
+    Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
+    size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
+  // Not necessarily used here, but make sure this has been initialized
+  assert(table_properties_);
+
+  // Auto prefix mode is not implemented in PlainTable.
+  bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
+                         !options.auto_prefix_mode;
+  if (arena == nullptr) {
+    return new PlainTableIterator(this, use_prefix_seek);
+  } else {
+    auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
+    return new (mem) PlainTableIterator(this, use_prefix_seek);
+  }
+}
+
+Status PlainTableReader::PopulateIndexRecordList(
+    PlainTableIndexBuilder* index_builder,
+    std::vector<uint32_t>* prefix_hashes) {
+  Slice prev_key_prefix_slice;
+  std::string prev_key_prefix_buf;
+  uint32_t pos = data_start_offset_;
+
+  bool is_first_record = true;
+  Slice key_prefix_slice;
+  PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+                               prefix_extractor_);
+  while (pos < file_info_.data_end_offset) {
+    uint32_t key_offset = pos;
+    ParsedInternalKey key;
+    Slice value_slice;
+    bool seekable = false;
+    Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
+    if (!s.ok()) {
+      return s;
+    }
+
+    key_prefix_slice = GetPrefix(key);
+    if (enable_bloom_) {
+      bloom_.AddHash(GetSliceHash(key.user_key));
+    } else {
+      if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
+        if (!is_first_record) {
+          prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
+        }
+        if (file_info_.is_mmap_mode) {
+          prev_key_prefix_slice = key_prefix_slice;
+        } else {
+          prev_key_prefix_buf = key_prefix_slice.ToString();
+          prev_key_prefix_slice = prev_key_prefix_buf;
+        }
+      }
+    }
+
+    index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
+
+    if (!seekable && is_first_record) {
+      return Status::Corruption("Key for a prefix is not seekable");
+    }
+
+    is_first_record = false;
+  }
+
+  prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
+  auto s = index_.InitFromRawData(index_builder->Finish());
+  return s;
+}
+
+void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
+                                     size_t huge_page_tlb_size) {
+  uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
+  if (bloom_total_bits > 0) {
+    enable_bloom_ = true;
+    bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
+                        huge_page_tlb_size, ioptions_.logger);
+  }
+}
+
+void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
+  assert(bloom_.IsInitialized());
+  for (const auto prefix_hash : prefix_hashes) {
+    bloom_.AddHash(prefix_hash);
+  }
+}
+
+Status PlainTableReader::MmapDataIfNeeded() {
+  if (file_info_.is_mmap_mode) {
+    // Get mmapped memory.
+    return file_info_.file->Read(
+        IOOptions(), 0, static_cast<size_t>(file_size_), &file_info_.file_data,
+        nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+  }
+  return Status::OK();
+}
+
+Status PlainTableReader::PopulateIndex(TableProperties* props,
+                                       int bloom_bits_per_key,
+                                       double hash_table_ratio,
+                                       size_t index_sparseness,
+                                       size_t huge_page_tlb_size) {
+  assert(props != nullptr);
+
+  BlockContents index_block_contents;
+  Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+                           file_size_, kPlainTableMagicNumber, ioptions_,
+                           PlainTableIndexBuilder::kPlainTableIndexBlock,
+                           BlockType::kIndex, &index_block_contents);
+
+  bool index_in_file = s.ok();
+
+  BlockContents bloom_block_contents;
+  bool bloom_in_file = false;
+  // We only need to read the bloom block if index block is in file.
+  if (index_in_file) {
+    s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
+                      file_size_, kPlainTableMagicNumber, ioptions_,
+                      BloomBlockBuilder::kBloomBlock, BlockType::kFilter,
+                      &bloom_block_contents);
+    bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
+  }
+
+  Slice* bloom_block;
+  if (bloom_in_file) {
+    // If bloom_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the alloated memory for the bloom block.
+    // It needs to be kept alive to keep `bloom_block` valid.
+    bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
+    bloom_block = &bloom_block_contents.data;
+  } else {
+    bloom_block = nullptr;
+  }
+
+  Slice* index_block;
+  if (index_in_file) {
+    // If index_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the alloated memory for the index block.
+    // It needs to be kept alive to keep `index_block` valid.
+    index_block_alloc_ = std::move(index_block_contents.allocation);
+    index_block = &index_block_contents.data;
+  } else {
+    index_block = nullptr;
+  }
+
+  if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
+    // moptions.prefix_extractor is requried for a hash-based look-up.
+    return Status::NotSupported(
+        "PlainTable requires a prefix extractor enable prefix hash mode.");
+  }
+
+  // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
+  // for a prefix (starting from the first one), generate a record of (hash,
+  // offset) and append it to IndexRecordList, which is a data structure created
+  // to store them.
+
+  if (!index_in_file) {
+    // Allocate bloom filter here for total order mode.
+    if (IsTotalOrderMode()) {
+      AllocateBloom(bloom_bits_per_key,
+                    static_cast<uint32_t>(props->num_entries),
+                    huge_page_tlb_size);
+    }
+  } else if (bloom_in_file) {
+    enable_bloom_ = true;
+    auto num_blocks_property = props->user_collected_properties.find(
+        PlainTablePropertyNames::kNumBloomBlocks);
+
+    uint32_t num_blocks = 0;
+    if (num_blocks_property != props->user_collected_properties.end()) {
+      Slice temp_slice(num_blocks_property->second);
+      if (!GetVarint32(&temp_slice, &num_blocks)) {
+        num_blocks = 0;
+      }
+    }
+    // cast away const qualifier, because bloom_ won't be changed
+    bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
+                      static_cast<uint32_t>(bloom_block->size()) * 8,
+                      num_blocks);
+  } else {
+    // Index in file but no bloom in file. Disable bloom filter in this case.
+    enable_bloom_ = false;
+    bloom_bits_per_key = 0;
+  }
+
+  PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
+                                       index_sparseness, hash_table_ratio,
+                                       huge_page_tlb_size);
+
+  std::vector<uint32_t> prefix_hashes;
+  if (!index_in_file) {
+    // Populates _bloom if enabled (total order mode)
+    s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    s = index_.InitFromRawData(*index_block);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (!index_in_file) {
+    if (!IsTotalOrderMode()) {
+      // Calculated bloom filter size and allocate memory for
+      // bloom filter based on the number of prefixes, then fill it.
+      AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
+                    huge_page_tlb_size);
+      if (enable_bloom_) {
+        FillBloom(prefix_hashes);
+      }
+    }
+  }
+
+  // Fill two table properties.
+  if (!index_in_file) {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(index_.GetSubIndexSize());
+  } else {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(0);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(0);
+  }
+
+  return Status::OK();
+}
+
+Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
+                                   const Slice& target, const Slice& prefix,
+                                   uint32_t prefix_hash, bool& prefix_matched,
+                                   uint32_t* offset) const {
+  prefix_matched = false;
+  uint32_t prefix_index_offset;
+  auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
+  if (res == PlainTableIndex::kNoPrefixForBucket) {
+    *offset = file_info_.data_end_offset;
+    return Status::OK();
+  } else if (res == PlainTableIndex::kDirectToFile) {
+    *offset = prefix_index_offset;
+    return Status::OK();
+  }
+
+  // point to sub-index, need to do a binary search
+  uint32_t upper_bound = 0;
+  const char* base_ptr =
+      index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
+  uint32_t low = 0;
+  uint32_t high = upper_bound;
+  ParsedInternalKey mid_key;
+  ParsedInternalKey parsed_target;
+  Status s = ParseInternalKey(target, &parsed_target,
+                              false /* log_err_key */);  // TODO
+  if (!s.ok()) return s;
+
+  // The key is between [low, high). Do a binary search between it.
+  while (high - low > 1) {
+    uint32_t mid = (high + low) / 2;
+    uint32_t file_offset = GetFixed32Element(base_ptr, mid);
+    uint32_t tmp;
+    s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
+    if (!s.ok()) {
+      return s;
+    }
+    int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
+    if (cmp_result < 0) {
+      low = mid;
+    } else {
+      if (cmp_result == 0) {
+        // Happen to have found the exact key or target is smaller than the
+        // first key after base_offset.
+        prefix_matched = true;
+        *offset = file_offset;
+        return Status::OK();
+      } else {
+        high = mid;
+      }
+    }
+  }
+  // Both of the key at the position low or low+1 could share the same
+  // prefix as target. We need to rule out one of them to avoid to go
+  // to the wrong prefix.
+  ParsedInternalKey low_key;
+  uint32_t tmp;
+  uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
+  s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (GetPrefix(low_key) == prefix) {
+    prefix_matched = true;
+    *offset = low_key_offset;
+  } else if (low + 1 < upper_bound) {
+    // There is possible a next prefix, return it
+    prefix_matched = false;
+    *offset = GetFixed32Element(base_ptr, low + 1);
+  } else {
+    // target is larger than a key of the last prefix in this bucket
+    // but with a different prefix. Key does not exist.
+    *offset = file_info_.data_end_offset;
+  }
+  return Status::OK();
+}
+
+bool PlainTableReader::MatchBloom(uint32_t hash) const {
+  if (!enable_bloom_) {
+    return true;
+  }
+
+  if (bloom_.MayContainHash(hash)) {
+    PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+    return true;
+  } else {
+    PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+    return false;
+  }
+}
+
+Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+                              ParsedInternalKey* parsed_key,
+                              Slice* internal_key, Slice* value,
+                              bool* seekable) const {
+  if (*offset == file_info_.data_end_offset) {
+    *offset = file_info_.data_end_offset;
+    return Status::OK();
+  }
+
+  if (*offset > file_info_.data_end_offset) {
+    return Status::Corruption("Offset is out of file size");
+  }
+
+  uint32_t bytes_read;
+  Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
+                              &bytes_read, seekable);
+  if (!s.ok()) {
+    return s;
+  }
+  *offset = *offset + bytes_read;
+  return Status::OK();
+}
+
+void PlainTableReader::Prepare(const Slice& target) {
+  if (enable_bloom_) {
+    uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
+    bloom_.Prefetch(prefix_hash);
+  }
+}
+
+Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
+                             GetContext* get_context,
+                             const SliceTransform* /* prefix_extractor */,
+                             bool /*skip_filters*/) {
+  // Check bloom filter first.
+  Slice prefix_slice;
+  uint32_t prefix_hash;
+  if (IsTotalOrderMode()) {
+    if (full_scan_mode_) {
+      status_ =
+          Status::InvalidArgument("Get() is not allowed in full scan mode.");
+    }
+    // Match whole user key for bloom filter check.
+    if (!MatchBloom(GetSliceHash(ExtractUserKey(target)))) {
+      return Status::OK();
+    }
+    // in total order mode, there is only one bucket 0, and we always use empty
+    // prefix.
+    prefix_slice = Slice();
+    prefix_hash = 0;
+  } else {
+    prefix_slice = GetPrefix(target);
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!MatchBloom(prefix_hash)) {
+      return Status::OK();
+    }
+  }
+  uint32_t offset;
+  bool prefix_match;
+  PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+                               prefix_extractor_);
+  Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
+                       prefix_match, &offset);
+
+  if (!s.ok()) {
+    return s;
+  }
+  ParsedInternalKey found_key;
+  ParsedInternalKey parsed_target;
+  s = ParseInternalKey(target, &parsed_target,
+                       false /* log_err_key */);  // TODO
+  if (!s.ok()) return s;
+
+  Slice found_value;
+  while (offset < file_info_.data_end_offset) {
+    s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
+    if (!s.ok()) {
+      return s;
+    }
+    if (!prefix_match) {
+      // Need to verify prefix for the first key found if it is not yet
+      // checked.
+      if (GetPrefix(found_key) != prefix_slice) {
+        return Status::OK();
+      }
+      prefix_match = true;
+    }
+    // TODO(ljin): since we know the key comparison result here,
+    // can we enable the fast path?
+    if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
+      bool dont_care __attribute__((__unused__));
+      if (!get_context->SaveValue(found_key, found_value, &dont_care,
+                                  dummy_cleanable_.get())) {
+        break;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
+                                               TableReaderCaller /*caller*/) {
+  return 0;
+}
+
+uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
+                                           const Slice& /*end*/,
+                                           TableReaderCaller /*caller*/) {
+  return 0;
+}
+
+PlainTableIterator::PlainTableIterator(PlainTableReader* table,
+                                       bool use_prefix_seek)
+    : table_(table),
+      decoder_(&table_->file_info_, table_->encoding_type_,
+               table_->user_key_len_, table_->prefix_extractor_),
+      use_prefix_seek_(use_prefix_seek) {
+  next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+PlainTableIterator::~PlainTableIterator() {}
+
+bool PlainTableIterator::Valid() const {
+  return offset_ < table_->file_info_.data_end_offset &&
+         offset_ >= table_->data_start_offset_;
+}
+
+void PlainTableIterator::SeekToFirst() {
+  status_ = Status::OK();
+  next_offset_ = table_->data_start_offset_;
+  if (next_offset_ >= table_->file_info_.data_end_offset) {
+    next_offset_ = offset_ = table_->file_info_.data_end_offset;
+  } else {
+    Next();
+  }
+}
+
+void PlainTableIterator::SeekToLast() {
+  assert(false);
+  status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
+  next_offset_ = offset_ = table_->file_info_.data_end_offset;
+}
+
+void PlainTableIterator::Seek(const Slice& target) {
+  if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
+    // This check is done here instead of NewIterator() to permit creating an
+    // iterator with total_order_seek = true even if we won't be able to Seek()
+    // it. This is needed for compaction: it creates iterator with
+    // total_order_seek = true but usually never does Seek() on it,
+    // only SeekToFirst().
+    status_ = Status::InvalidArgument(
+        "total_order_seek not implemented for PlainTable.");
+    offset_ = next_offset_ = table_->file_info_.data_end_offset;
+    return;
+  }
+
+  // If the user doesn't set prefix seek option and we are not able to do a
+  // total Seek(). assert failure.
+  if (table_->IsTotalOrderMode()) {
+    if (table_->full_scan_mode_) {
+      status_ =
+          Status::InvalidArgument("Seek() is not allowed in full scan mode.");
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+      return;
+    } else if (table_->GetIndexSize() > 1) {
+      assert(false);
+      status_ = Status::NotSupported(
+          "PlainTable cannot issue non-prefix seek unless in total order "
+          "mode.");
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+      return;
+    }
+  }
+
+  Slice prefix_slice = table_->GetPrefix(target);
+  uint32_t prefix_hash = 0;
+  // Bloom filter is ignored in total-order mode.
+  if (!table_->IsTotalOrderMode()) {
+    prefix_hash = GetSliceHash(prefix_slice);
+    if (!table_->MatchBloom(prefix_hash)) {
+      status_ = Status::OK();
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+      return;
+    }
+  }
+  bool prefix_match;
+  status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
+                              prefix_match, &next_offset_);
+  if (!status_.ok()) {
+    offset_ = next_offset_ = table_->file_info_.data_end_offset;
+    return;
+  }
+
+  if (next_offset_ < table_->file_info_.data_end_offset) {
+    for (Next(); status_.ok() && Valid(); Next()) {
+      if (!prefix_match) {
+        // Need to verify the first key's prefix
+        if (table_->GetPrefix(key()) != prefix_slice) {
+          offset_ = next_offset_ = table_->file_info_.data_end_offset;
+          break;
+        }
+        prefix_match = true;
+      }
+      if (table_->internal_comparator_.Compare(key(), target) >= 0) {
+        break;
+      }
+    }
+  } else {
+    offset_ = table_->file_info_.data_end_offset;
+  }
+}
+
+void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
+  assert(false);
+  status_ =
+      Status::NotSupported("SeekForPrev() is not supported in PlainTable");
+  offset_ = next_offset_ = table_->file_info_.data_end_offset;
+}
+
+void PlainTableIterator::Next() {
+  offset_ = next_offset_;
+  if (offset_ < table_->file_info_.data_end_offset) {
+    Slice tmp_slice;
+    ParsedInternalKey parsed_key;
+    status_ =
+        table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
+    if (!status_.ok()) {
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
+    }
+  }
+}
+
+void PlainTableIterator::Prev() { assert(false); }
+
+Slice PlainTableIterator::key() const {
+  assert(Valid());
+  return key_;
+}
+
+Slice PlainTableIterator::value() const {
+  assert(Valid());
+  return value_;
+}
+
+Status PlainTableIterator::status() const { return status_; }
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/plain/plain_table_reader.h b/src/rocksdb/table/plain/plain_table_reader.h
new file mode 100644
index 000000000..62bda693a
--- /dev/null
+++ b/src/rocksdb/table/plain/plain_table_reader.h
@@ -0,0 +1,244 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <stdint.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_index.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Block;
+struct BlockContents;
+class BlockHandle;
+class Footer;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+class TableCache;
+class TableReader;
+class InternalKeyComparator;
+class PlainTableKeyDecoder;
+class GetContext;
+
+extern const uint32_t kPlainTableVariableLength;
+
+struct PlainTableReaderFileInfo {
+  bool is_mmap_mode;
+  Slice file_data;
+  uint32_t data_end_offset;
+  std::unique_ptr<RandomAccessFileReader> file;
+
+  PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
+                           const EnvOptions& storage_options,
+                           uint32_t _data_size_offset)
+      : is_mmap_mode(storage_options.use_mmap_reads),
+        data_end_offset(_data_size_offset),
+        file(std::move(_file)) {}
+};
+
+// The reader class of PlainTable. For description of PlainTable format
+// See comments of class PlainTableFactory, where instances of
+// PlainTableReader are created.
+class PlainTableReader : public TableReader {
+ public:
+  // Based on following output file format shown in plain_table_factory.h
+  // When opening the output file, PlainTableReader creates a hash table
+  // from key prefixes to offset of the output file. PlainTable will decide
+  // whether it points to the data offset of the first key with the key prefix
+  // or the offset of it. If there are too many keys share this prefix, it will
+  // create a binary search-able index from the suffix to offset on disk.
+  static Status Open(const ImmutableOptions& ioptions,
+                     const EnvOptions& env_options,
+                     const InternalKeyComparator& internal_comparator,
+                     std::unique_ptr<RandomAccessFileReader>&& file,
+                     uint64_t file_size, std::unique_ptr<TableReader>* table,
+                     const int bloom_bits_per_key, double hash_table_ratio,
+                     size_t index_sparseness, size_t huge_page_tlb_size,
+                     bool full_scan_mode, const bool immortal_table = false,
+                     const SliceTransform* prefix_extractor = nullptr);
+
+  // Returns new iterator over table contents
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
+  InternalIterator* NewIterator(const ReadOptions&,
+                                const SliceTransform* prefix_extractor,
+                                Arena* arena, bool skip_filters,
+                                TableReaderCaller caller,
+                                size_t compaction_readahead_size = 0,
+                                bool allow_unprepared_value = false) override;
+
+  void Prepare(const Slice& target) override;
+
+  Status Get(const ReadOptions& readOptions, const Slice& key,
+             GetContext* get_context, const SliceTransform* prefix_extractor,
+             bool skip_filters = false) override;
+
+  uint64_t ApproximateOffsetOf(const Slice& key,
+                               TableReaderCaller caller) override;
+
+  uint64_t ApproximateSize(const Slice& start, const Slice& end,
+                           TableReaderCaller caller) override;
+
+  uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
+  void SetupForCompaction() override;
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override {
+    return table_properties_;
+  }
+
+  virtual size_t ApproximateMemoryUsage() const override {
+    return arena_.MemoryAllocatedBytes();
+  }
+
+  PlainTableReader(const ImmutableOptions& ioptions,
+                   std::unique_ptr<RandomAccessFileReader>&& file,
+                   const EnvOptions& env_options,
+                   const InternalKeyComparator& internal_comparator,
+                   EncodingType encoding_type, uint64_t file_size,
+                   const TableProperties* table_properties,
+                   const SliceTransform* prefix_extractor);
+  virtual ~PlainTableReader();
+
+ protected:
+  // Check bloom filter to see whether it might contain this prefix.
+  // The hash of the prefix is given, since it can be reused for index lookup
+  // too.
+  virtual bool MatchBloom(uint32_t hash) const;
+
+  // PopulateIndex() builds index of keys. It must be called before any query
+  // to the table.
+  //
+  // props: the table properties object that need to be stored. Ownership of
+  //        the object will be passed.
+  //
+
+  Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
+                       double hash_table_ratio, size_t index_sparseness,
+                       size_t huge_page_tlb_size);
+
+  Status MmapDataIfNeeded();
+
+ private:
+  const InternalKeyComparator internal_comparator_;
+  EncodingType encoding_type_;
+  // represents plain table's current status.
+  Status status_;
+
+  PlainTableIndex index_;
+  bool full_scan_mode_;
+
+  // data_start_offset_ and data_end_offset_ defines the range of the
+  // sst file that stores data.
+  const uint32_t data_start_offset_ = 0;
+  const uint32_t user_key_len_;
+  const SliceTransform* prefix_extractor_;
+
+  static const size_t kNumInternalBytes = 8;
+
+  // Bloom filter is used to rule out non-existent key
+  bool enable_bloom_;
+  PlainTableBloomV1 bloom_;
+  PlainTableReaderFileInfo file_info_;
+  Arena arena_;
+  CacheAllocationPtr index_block_alloc_;
+  CacheAllocationPtr bloom_block_alloc_;
+
+  const ImmutableOptions& ioptions_;
+  std::unique_ptr<Cleanable> dummy_cleanable_;
+  uint64_t file_size_;
+
+ protected:  // for testing
+  std::shared_ptr<const TableProperties> table_properties_;
+
+ private:
+  bool IsFixedLength() const {
+    return user_key_len_ != kPlainTableVariableLength;
+  }
+
+  size_t GetFixedInternalKeyLength() const {
+    return user_key_len_ + kNumInternalBytes;
+  }
+
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(ExtractUserKey(target));
+  }
+
+  Slice GetPrefix(const ParsedInternalKey& target) const {
+    return GetPrefixFromUserKey(target.user_key);
+  }
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return prefix_extractor_->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set.
+      // In that case,
+      // it falls back to pure binary search and
+      // total iterator seek is supported.
+      return Slice();
+    }
+  }
+
+  friend class TableCache;
+  friend class PlainTableIterator;
+
+  // Internal helper function to generate an IndexRecordList object from all
+  // the rows, which contains index records as a list.
+  // If bloom_ is not null, all the keys' full-key hash will be added to the
+  // bloom filter.
+  Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
+                                 std::vector<uint32_t>* prefix_hashes);
+
+  // Internal helper function to allocate memory for bloom filter
+  void AllocateBloom(int bloom_bits_per_key, int num_prefixes,
+                     size_t huge_page_tlb_size);
+
+  void FillBloom(const std::vector<uint32_t>& prefix_hashes);
+
+  // Read the key and value at `offset` to parameters for keys, the and
+  // `seekable`.
+  // On success, `offset` will be updated as the offset for the next key.
+  // `parsed_key` will be key in parsed format.
+  // if `internal_key` is not empty, it will be filled with key with slice
+  // format.
+  // if `seekable` is not null, it will return whether we can directly read
+  // data using this offset.
+  Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
+              ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
+              bool* seekable = nullptr) const;
+  // Get file offset for key target.
+  // return value prefix_matched is set to true if the offset is confirmed
+  // for a key with the same prefix as target.
+  Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
+                   const Slice& prefix, uint32_t prefix_hash,
+                   bool& prefix_matched, uint32_t* offset) const;
+
+  bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
+
+  // No copying allowed
+  explicit PlainTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/scoped_arena_iterator.h b/src/rocksdb/table/scoped_arena_iterator.h
new file mode 100644
index 000000000..2b8824d95
--- /dev/null
+++ b/src/rocksdb/table/scoped_arena_iterator.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "port/port.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ScopedArenaIterator {
+  void reset(InternalIterator* iter) noexcept {
+    if (iter_ != nullptr) {
+      iter_->~InternalIterator();
+    }
+    iter_ = iter;
+  }
+
+ public:
+  explicit ScopedArenaIterator(InternalIterator* iter = nullptr)
+      : iter_(iter) {}
+
+  ScopedArenaIterator(const ScopedArenaIterator&) = delete;
+  ScopedArenaIterator& operator=(const ScopedArenaIterator&) = delete;
+
+  ScopedArenaIterator(ScopedArenaIterator&& o) noexcept {
+    iter_ = o.iter_;
+    o.iter_ = nullptr;
+  }
+
+  ScopedArenaIterator& operator=(ScopedArenaIterator&& o) noexcept {
+    reset(o.iter_);
+    o.iter_ = nullptr;
+    return *this;
+  }
+
+  InternalIterator* operator->() { return iter_; }
+  InternalIterator* get() { return iter_; }
+
+  void set(InternalIterator* iter) { reset(iter); }
+
+  InternalIterator* release() {
+    assert(iter_ != nullptr);
+    auto* res = iter_;
+    iter_ = nullptr;
+    return res;
+  }
+
+  ~ScopedArenaIterator() { reset(nullptr); }
+
+ private:
+  InternalIterator* iter_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/sst_file_dumper.cc b/src/rocksdb/table/sst_file_dumper.cc
new file mode 100644
index 000000000..122f0995a
--- /dev/null
+++ b/src/rocksdb/table/sst_file_dumper.cc
@@ -0,0 +1,519 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "table/sst_file_dumper.h"
+
+#include <chrono>
+#include <cinttypes>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <vector>
+
+#include "db/blob/blob_index.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_builder.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "util/compression.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+SstFileDumper::SstFileDumper(const Options& options,
+                             const std::string& file_path,
+                             Temperature file_temp, size_t readahead_size,
+                             bool verify_checksum, bool output_hex,
+                             bool decode_blob_index, const EnvOptions& soptions,
+                             bool silent)
+    : file_name_(file_path),
+      read_num_(0),
+      file_temp_(file_temp),
+      output_hex_(output_hex),
+      decode_blob_index_(decode_blob_index),
+      soptions_(soptions),
+      silent_(silent),
+      options_(options),
+      ioptions_(options_),
+      moptions_(ColumnFamilyOptions(options_)),
+      read_options_(verify_checksum, false),
+      internal_comparator_(BytewiseComparator()) {
+  read_options_.readahead_size = readahead_size;
+  if (!silent_) {
+    fprintf(stdout, "Process %s\n", file_path.c_str());
+  }
+  init_result_ = GetTableReader(file_name_);
+}
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+
+const char* testFileName = "test_file_name";
+
+Status SstFileDumper::GetTableReader(const std::string& file_path) {
+  // Warning about 'magic_number' being uninitialized shows up only in UBsan
+  // builds. Though access is guarded by 's.ok()' checks, fix the issue to
+  // avoid any warnings.
+  uint64_t magic_number = Footer::kNullTableMagicNumber;
+
+  // read table magic number
+  Footer footer;
+
+  const auto& fs = options_.env->GetFileSystem();
+  std::unique_ptr<FSRandomAccessFile> file;
+  uint64_t file_size = 0;
+  FileOptions fopts = soptions_;
+  fopts.temperature = file_temp_;
+  Status s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
+  if (s.ok()) {
+    s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+  }
+
+  // check empty file
+  // if true, skip further processing of this file
+  if (file_size == 0) {
+    return Status::Aborted(file_path, "Empty file");
+  }
+
+  file_.reset(new RandomAccessFileReader(std::move(file), file_path));
+
+  FilePrefetchBuffer prefetch_buffer(
+      0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */,
+      false /* track_min_offset */);
+  if (s.ok()) {
+    const uint64_t kSstDumpTailPrefetchSize = 512 * 1024;
+    uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize)
+                                 ? kSstDumpTailPrefetchSize
+                                 : file_size;
+    uint64_t prefetch_off = file_size - prefetch_size;
+    IOOptions opts;
+    s = prefetch_buffer.Prefetch(opts, file_.get(), prefetch_off,
+                                 static_cast<size_t>(prefetch_size),
+                                 Env::IO_TOTAL /* rate_limiter_priority */);
+
+    s = ReadFooterFromFile(opts, file_.get(), &prefetch_buffer, file_size,
+                           &footer);
+  }
+  if (s.ok()) {
+    magic_number = footer.table_magic_number();
+  }
+
+  if (s.ok()) {
+    if (magic_number == kPlainTableMagicNumber ||
+        magic_number == kLegacyPlainTableMagicNumber) {
+      soptions_.use_mmap_reads = true;
+
+      fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
+      file_.reset(new RandomAccessFileReader(std::move(file), file_path));
+    }
+
+    // For old sst format, ReadTableProperties might fail but file can be read
+    if (ReadTableProperties(magic_number, file_.get(), file_size,
+                            (magic_number == kBlockBasedTableMagicNumber)
+                                ? &prefetch_buffer
+                                : nullptr)
+            .ok()) {
+      s = SetTableOptionsByMagicNumber(magic_number);
+      if (s.ok()) {
+        if (table_properties_ && !table_properties_->comparator_name.empty()) {
+          ConfigOptions config_options;
+          const Comparator* user_comparator = nullptr;
+          s = Comparator::CreateFromString(config_options,
+                                           table_properties_->comparator_name,
+                                           &user_comparator);
+          if (s.ok()) {
+            assert(user_comparator);
+            internal_comparator_ = InternalKeyComparator(user_comparator);
+          }
+        }
+      }
+    } else {
+      s = SetOldTableOptions();
+    }
+    options_.comparator = internal_comparator_.user_comparator();
+  }
+
+  if (s.ok()) {
+    s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size,
+                       &table_reader_);
+  }
+  return s;
+}
+
+Status SstFileDumper::NewTableReader(
+    const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/,
+    const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
+    std::unique_ptr<TableReader>* /*table_reader*/) {
+  auto t_opt =
+      TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_,
+                         internal_comparator_, false /* skip_filters */,
+                         false /* imortal */, true /* force_direct_prefetch */);
+  // Allow open file with global sequence number for backward compatibility.
+  t_opt.largest_seqno = kMaxSequenceNumber;
+
+  // We need to turn off pre-fetching of index and filter nodes for
+  // BlockBasedTable
+  if (options_.table_factory->IsInstanceOf(
+          TableFactory::kBlockBasedTableName())) {
+    return options_.table_factory->NewTableReader(t_opt, std::move(file_),
+                                                  file_size, &table_reader_,
+                                                  /*enable_prefetch=*/false);
+  }
+
+  // For all other factory implementation
+  return options_.table_factory->NewTableReader(t_opt, std::move(file_),
+                                                file_size, &table_reader_);
+}
+
+Status SstFileDumper::VerifyChecksum() {
+  // We could pass specific readahead setting into read options if needed.
+  return table_reader_->VerifyChecksum(read_options_,
+                                       TableReaderCaller::kSSTDumpTool);
+}
+
+Status SstFileDumper::DumpTable(const std::string& out_filename) {
+  std::unique_ptr<WritableFile> out_file;
+  Env* env = options_.env;
+  Status s = env->NewWritableFile(out_filename, &out_file, soptions_);
+  if (s.ok()) {
+    s = table_reader_->DumpTable(out_file.get());
+  }
+  if (!s.ok()) {
+    // close the file before return error, ignore the close error if there's any
+    out_file->Close().PermitUncheckedError();
+    return s;
+  }
+  return out_file->Close();
+}
+
+Status SstFileDumper::CalculateCompressedTableSize(
+    const TableBuilderOptions& tb_options, size_t block_size,
+    uint64_t* num_data_blocks, uint64_t* compressed_table_size) {
+  std::unique_ptr<Env> env(NewMemEnv(options_.env));
+  std::unique_ptr<WritableFileWriter> dest_writer;
+  Status s =
+      WritableFileWriter::Create(env->GetFileSystem(), testFileName,
+                                 FileOptions(soptions_), &dest_writer, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.block_size = block_size;
+  BlockBasedTableFactory block_based_tf(table_options);
+  std::unique_ptr<TableBuilder> table_builder;
+  table_builder.reset(
+      block_based_tf.NewTableBuilder(tb_options, dest_writer.get()));
+  std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator(
+      read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    table_builder->Add(iter->key(), iter->value());
+  }
+  s = iter->status();
+  if (!s.ok()) {
+    return s;
+  }
+  s = table_builder->Finish();
+  if (!s.ok()) {
+    return s;
+  }
+  *compressed_table_size = table_builder->FileSize();
+  assert(num_data_blocks != nullptr);
+  *num_data_blocks = table_builder->GetTableProperties().num_data_blocks;
+  return env->DeleteFile(testFileName);
+}
+
+Status SstFileDumper::ShowAllCompressionSizes(
+    size_t block_size,
+    const std::vector<std::pair<CompressionType, const char*>>&
+        compression_types,
+    int32_t compress_level_from, int32_t compress_level_to,
+    uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
+    uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer) {
+  fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
+  for (auto& i : compression_types) {
+    if (CompressionTypeSupported(i.first)) {
+      fprintf(stdout, "Compression: %-24s\n", i.second);
+      CompressionOptions compress_opt;
+      compress_opt.max_dict_bytes = max_dict_bytes;
+      compress_opt.zstd_max_train_bytes = zstd_max_train_bytes;
+      compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes;
+      compress_opt.use_zstd_dict_trainer = use_zstd_dict_trainer;
+      for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
+        fprintf(stdout, "Compression level: %d", j);
+        compress_opt.level = j;
+        Status s = ShowCompressionSize(block_size, i.first, compress_opt);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+    } else {
+      fprintf(stdout, "Unsupported compression type: %s.\n", i.second);
+    }
+  }
+  return Status::OK();
+}
+
+Status SstFileDumper::ShowCompressionSize(
+    size_t block_size, CompressionType compress_type,
+    const CompressionOptions& compress_opt) {
+  Options opts;
+  opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  opts.statistics->set_stats_level(StatsLevel::kAll);
+  const ImmutableOptions imoptions(opts);
+  const ColumnFamilyOptions cfo(opts);
+  const MutableCFOptions moptions(cfo);
+  ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
+  IntTblPropCollectorFactories block_based_table_factories;
+
+  std::string column_family_name;
+  int unknown_level = -1;
+  TableBuilderOptions tb_opts(
+      imoptions, moptions, ikc, &block_based_table_factories, compress_type,
+      compress_opt,
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+      column_family_name, unknown_level);
+  uint64_t num_data_blocks = 0;
+  std::chrono::steady_clock::time_point start =
+      std::chrono::steady_clock::now();
+  uint64_t file_size;
+  Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks,
+                                          &file_size);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
+  fprintf(stdout, " Size: %10" PRIu64, file_size);
+  fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks);
+  fprintf(stdout, " Time Taken: %10s microsecs",
+          std::to_string(
+              std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+                  .count())
+              .c_str());
+  const uint64_t compressed_blocks =
+      opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED);
+  const uint64_t not_compressed_blocks =
+      opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_NOT_COMPRESSED);
+  // When the option enable_index_compression is true,
+  // NUMBER_BLOCK_COMPRESSED is incremented for index block(s).
+  if ((compressed_blocks + not_compressed_blocks) > num_data_blocks) {
+    num_data_blocks = compressed_blocks + not_compressed_blocks;
+  }
+
+  const uint64_t ratio_not_compressed_blocks =
+      (num_data_blocks - compressed_blocks) - not_compressed_blocks;
+  const double compressed_pcnt =
+      (0 == num_data_blocks) ? 0.0
+                             : ((static_cast<double>(compressed_blocks) /
+                                 static_cast<double>(num_data_blocks)) *
+                                100.0);
+  const double ratio_not_compressed_pcnt =
+      (0 == num_data_blocks)
+          ? 0.0
+          : ((static_cast<double>(ratio_not_compressed_blocks) /
+              static_cast<double>(num_data_blocks)) *
+             100.0);
+  const double not_compressed_pcnt =
+      (0 == num_data_blocks) ? 0.0
+                             : ((static_cast<double>(not_compressed_blocks) /
+                                 static_cast<double>(num_data_blocks)) *
+                                100.0);
+  fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
+          compressed_pcnt);
+  fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)",
+          ratio_not_compressed_blocks, ratio_not_compressed_pcnt);
+  fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n",
+          not_compressed_blocks, not_compressed_pcnt);
+  return Status::OK();
+}
+
+// Reads TableProperties prior to opening table reader in order to set up
+// options.
+Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number,
+                                          RandomAccessFileReader* file,
+                                          uint64_t file_size,
+                                          FilePrefetchBuffer* prefetch_buffer) {
+  Status s = ROCKSDB_NAMESPACE::ReadTableProperties(
+      file, file_size, table_magic_number, ioptions_, &table_properties_,
+      /* memory_allocator= */ nullptr, prefetch_buffer);
+  if (!s.ok()) {
+    if (!silent_) {
+      fprintf(stdout, "Not able to read table properties\n");
+    }
+  }
+  return s;
+}
+
+Status SstFileDumper::SetTableOptionsByMagicNumber(
+    uint64_t table_magic_number) {
+  assert(table_properties_);
+  if (table_magic_number == kBlockBasedTableMagicNumber ||
+      table_magic_number == kLegacyBlockBasedTableMagicNumber) {
+    BlockBasedTableFactory* bbtf = new BlockBasedTableFactory();
+    // To force tail prefetching, we fake reporting two useful reads of 512KB
+    // from the tail.
+    // It needs at least two data points to warm up the stats.
+    bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
+    bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
+
+    options_.table_factory.reset(bbtf);
+    if (!silent_) {
+      fprintf(stdout, "Sst file format: block-based\n");
+    }
+
+    auto& props = table_properties_->user_collected_properties;
+    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+    if (pos != props.end()) {
+      auto index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
+          DecodeFixed32(pos->second.c_str()));
+      if (index_type_on_file ==
+          BlockBasedTableOptions::IndexType::kHashSearch) {
+        options_.prefix_extractor.reset(NewNoopTransform());
+      }
+    }
+  } else if (table_magic_number == kPlainTableMagicNumber ||
+             table_magic_number == kLegacyPlainTableMagicNumber) {
+    options_.allow_mmap_reads = true;
+
+    PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = kPlainTableVariableLength;
+    plain_table_options.bloom_bits_per_key = 0;
+    plain_table_options.hash_table_ratio = 0;
+    plain_table_options.index_sparseness = 1;
+    plain_table_options.huge_page_tlb_size = 0;
+    plain_table_options.encoding_type = kPlain;
+    plain_table_options.full_scan_mode = true;
+
+    options_.table_factory.reset(NewPlainTableFactory(plain_table_options));
+    if (!silent_) {
+      fprintf(stdout, "Sst file format: plain table\n");
+    }
+  } else {
+    char error_msg_buffer[80];
+    snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1,
+             "Unsupported table magic number --- %lx",
+             (long)table_magic_number);
+    return Status::InvalidArgument(error_msg_buffer);
+  }
+
+  return Status::OK();
+}
+
+Status SstFileDumper::SetOldTableOptions() {
+  assert(table_properties_ == nullptr);
+  options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+  if (!silent_) {
+    fprintf(stdout, "Sst file format: block-based(old version)\n");
+  }
+
+  return Status::OK();
+}
+
+Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num,
+                                     bool has_from, const std::string& from_key,
+                                     bool has_to, const std::string& to_key,
+                                     bool use_from_as_prefix) {
+  if (!table_reader_) {
+    return init_result_;
+  }
+
+  InternalIterator* iter = table_reader_->NewIterator(
+      read_options_, moptions_.prefix_extractor.get(),
+      /*arena=*/nullptr, /*skip_filters=*/false,
+      TableReaderCaller::kSSTDumpTool);
+  uint64_t i = 0;
+  if (has_from) {
+    InternalKey ikey;
+    ikey.SetMinPossibleForUserKey(from_key);
+    iter->Seek(ikey.Encode());
+  } else {
+    iter->SeekToFirst();
+  }
+  for (; iter->Valid(); iter->Next()) {
+    Slice key = iter->key();
+    Slice value = iter->value();
+    ++i;
+    if (read_num > 0 && i > read_num) break;
+
+    ParsedInternalKey ikey;
+    Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */);
+    if (!pik_status.ok()) {
+      std::cerr << pik_status.getState() << "\n";
+      continue;
+    }
+
+    // the key returned is not prefixed with out 'from' key
+    if (use_from_as_prefix && !ikey.user_key.starts_with(from_key)) {
+      break;
+    }
+
+    // If end marker was specified, we stop before it
+    if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) {
+      break;
+    }
+
+    if (print_kv) {
+      if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) {
+        fprintf(stdout, "%s => %s\n",
+                ikey.DebugString(true, output_hex_).c_str(),
+                value.ToString(output_hex_).c_str());
+      } else {
+        BlobIndex blob_index;
+
+        const Status s = blob_index.DecodeFrom(value);
+        if (!s.ok()) {
+          fprintf(stderr, "%s => error decoding blob index\n",
+                  ikey.DebugString(true, output_hex_).c_str());
+          continue;
+        }
+
+        fprintf(stdout, "%s => %s\n",
+                ikey.DebugString(true, output_hex_).c_str(),
+                blob_index.DebugString(output_hex_).c_str());
+      }
+    }
+  }
+
+  read_num_ += i;
+
+  Status ret = iter->status();
+  delete iter;
+  return ret;
+}
+
+// Provides TableProperties to API user
+Status SstFileDumper::ReadTableProperties(
+    std::shared_ptr<const TableProperties>* table_properties) {
+  if (!table_reader_) {
+    return init_result_;
+  }
+
+  *table_properties = table_reader_->GetTableProperties();
+  return init_result_;
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/sst_file_dumper.h b/src/rocksdb/table/sst_file_dumper.h
new file mode 100644
index 000000000..7be876390
--- /dev/null
+++ b/src/rocksdb/table/sst_file_dumper.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/advanced_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SstFileDumper {
+ public:
+  explicit SstFileDumper(const Options& options, const std::string& file_name,
+                         Temperature file_temp, size_t readahead_size,
+                         bool verify_checksum, bool output_hex,
+                         bool decode_blob_index,
+                         const EnvOptions& soptions = EnvOptions(),
+                         bool silent = false);
+
+  Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from,
+                        const std::string& from_key, bool has_to,
+                        const std::string& to_key,
+                        bool use_from_as_prefix = false);
+
+  Status ReadTableProperties(
+      std::shared_ptr<const TableProperties>* table_properties);
+  uint64_t GetReadNumber() { return read_num_; }
+  TableProperties* GetInitTableProperties() { return table_properties_.get(); }
+
+  Status VerifyChecksum();
+  Status DumpTable(const std::string& out_filename);
+  Status getStatus() { return init_result_; }
+
+  Status ShowAllCompressionSizes(
+      size_t block_size,
+      const std::vector<std::pair<CompressionType, const char*>>&
+          compression_types,
+      int32_t compress_level_from, int32_t compress_level_to,
+      uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
+      uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer);
+
+  Status ShowCompressionSize(size_t block_size, CompressionType compress_type,
+                             const CompressionOptions& compress_opt);
+
+ private:
+  // Get the TableReader implementation for the sst file
+  Status GetTableReader(const std::string& file_path);
+  Status ReadTableProperties(uint64_t table_magic_number,
+                             RandomAccessFileReader* file, uint64_t file_size,
+                             FilePrefetchBuffer* prefetch_buffer);
+
+  Status CalculateCompressedTableSize(const TableBuilderOptions& tb_options,
+                                      size_t block_size,
+                                      uint64_t* num_data_blocks,
+                                      uint64_t* compressed_table_size);
+
+  Status SetTableOptionsByMagicNumber(uint64_t table_magic_number);
+  Status SetOldTableOptions();
+
+  // Helper function to call the factory with settings specific to the
+  // factory implementation
+  Status NewTableReader(const ImmutableOptions& ioptions,
+                        const EnvOptions& soptions,
+                        const InternalKeyComparator& internal_comparator,
+                        uint64_t file_size,
+                        std::unique_ptr<TableReader>* table_reader);
+
+  std::string file_name_;
+  uint64_t read_num_;
+  Temperature file_temp_;
+  bool output_hex_;
+  bool decode_blob_index_;
+  EnvOptions soptions_;
+  // less verbose in stdout/stderr
+  bool silent_;
+
+  // options_ and internal_comparator_ will also be used in
+  // ReadSequential internally (specifically, seek-related operations)
+  Options options_;
+
+  Status init_result_;
+  std::unique_ptr<TableReader> table_reader_;
+  std::unique_ptr<RandomAccessFileReader> file_;
+
+  const ImmutableOptions ioptions_;
+  const MutableCFOptions moptions_;
+  ReadOptions read_options_;
+  InternalKeyComparator internal_comparator_;
+  std::unique_ptr<TableProperties> table_properties_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/sst_file_reader.cc b/src/rocksdb/table/sst_file_reader.cc
new file mode 100644
index 000000000..48f1be0be
--- /dev/null
+++ b/src/rocksdb/table/sst_file_reader.cc
@@ -0,0 +1,101 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/sst_file_reader.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "table/get_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct SstFileReader::Rep {
+  Options options;
+  EnvOptions soptions;
+  ImmutableOptions ioptions;
+  MutableCFOptions moptions;
+
+  std::unique_ptr<TableReader> table_reader;
+
+  Rep(const Options& opts)
+      : options(opts),
+        soptions(options),
+        ioptions(options),
+        moptions(ColumnFamilyOptions(options)) {}
+};
+
+SstFileReader::SstFileReader(const Options& options) : rep_(new Rep(options)) {}
+
+SstFileReader::~SstFileReader() {}
+
+Status SstFileReader::Open(const std::string& file_path) {
+  auto r = rep_.get();
+  Status s;
+  uint64_t file_size = 0;
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  FileOptions fopts(r->soptions);
+  const auto& fs = r->options.env->GetFileSystem();
+
+  s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr);
+  if (s.ok()) {
+    s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
+  }
+  if (s.ok()) {
+    file_reader.reset(new RandomAccessFileReader(std::move(file), file_path));
+  }
+  if (s.ok()) {
+    TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor,
+                             r->soptions, r->ioptions.internal_comparator);
+    // Allow open file with global sequence number for backward compatibility.
+    t_opt.largest_seqno = kMaxSequenceNumber;
+    s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader),
+                                                 file_size, &r->table_reader);
+  }
+  return s;
+}
+
+Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) {
+  auto r = rep_.get();
+  auto sequence = roptions.snapshot != nullptr
+                      ? roptions.snapshot->GetSequenceNumber()
+                      : kMaxSequenceNumber;
+  ArenaWrappedDBIter* res = new ArenaWrappedDBIter();
+  res->Init(r->options.env, roptions, r->ioptions, r->moptions,
+            nullptr /* version */, sequence,
+            r->moptions.max_sequential_skip_in_iterations,
+            0 /* version_number */, nullptr /* read_callback */,
+            nullptr /* db_impl */, nullptr /* cfd */,
+            true /* expose_blob_index */, false /* allow_refresh */);
+  auto internal_iter = r->table_reader->NewIterator(
+      res->GetReadOptions(), r->moptions.prefix_extractor.get(),
+      res->GetArena(), false /* skip_filters */,
+      TableReaderCaller::kSSTFileReader);
+  res->SetIterUnderDBIter(internal_iter);
+  return res;
+}
+
+std::shared_ptr<const TableProperties> SstFileReader::GetTableProperties()
+    const {
+  return rep_->table_reader->GetTableProperties();
+}
+
+Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) {
+  return rep_->table_reader->VerifyChecksum(read_options,
+                                            TableReaderCaller::kSSTFileReader);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/table/sst_file_reader_test.cc b/src/rocksdb/table/sst_file_reader_test.cc
new file mode 100644
index 000000000..4837d223b
--- /dev/null
+++ b/src/rocksdb/table/sst_file_reader_test.cc
@@ -0,0 +1,434 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/sst_file_reader.h"
+
+#include <cinttypes>
+
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/sst_file_writer.h"
+#include "table/sst_file_writer_collectors.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string EncodeAsString(uint64_t v) {
+  char buf[16];
+  snprintf(buf, sizeof(buf), "%08" PRIu64, v);
+  return std::string(buf);
+}
+
+std::string EncodeAsUint64(uint64_t v) {
+  std::string dst;
+  PutFixed64(&dst, v);
+  return dst;
+}
+
+class SstFileReaderTest : public testing::Test {
+ public:
+  SstFileReaderTest() {
+    options_.merge_operator = MergeOperators::CreateUInt64AddOperator();
+    sst_name_ = test::PerThreadDBPath("sst_file");
+
+    Env* base_env = Env::Default();
+    EXPECT_OK(
+        test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+    EXPECT_NE(nullptr, base_env);
+    env_ = base_env;
+    options_.env = env_;
+  }
+
+  ~SstFileReaderTest() {
+    Status s = env_->DeleteFile(sst_name_);
+    EXPECT_OK(s);
+  }
+
+  void CreateFile(const std::string& file_name,
+                  const std::vector<std::string>& keys) {
+    SstFileWriter writer(soptions_, options_);
+    ASSERT_OK(writer.Open(file_name));
+    for (size_t i = 0; i + 2 < keys.size(); i += 3) {
+      ASSERT_OK(writer.Put(keys[i], keys[i]));
+      ASSERT_OK(writer.Merge(keys[i + 1], EncodeAsUint64(i + 1)));
+      ASSERT_OK(writer.Delete(keys[i + 2]));
+    }
+    ASSERT_OK(writer.Finish());
+  }
+
+  void CheckFile(const std::string& file_name,
+                 const std::vector<std::string>& keys,
+                 bool check_global_seqno = false) {
+    ReadOptions ropts;
+    SstFileReader reader(options_);
+    ASSERT_OK(reader.Open(file_name));
+    ASSERT_OK(reader.VerifyChecksum());
+    std::unique_ptr<Iterator> iter(reader.NewIterator(ropts));
+    iter->SeekToFirst();
+    for (size_t i = 0; i + 2 < keys.size(); i += 3) {
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ(iter->key().compare(keys[i]), 0);
+      ASSERT_EQ(iter->value().compare(keys[i]), 0);
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ(iter->key().compare(keys[i + 1]), 0);
+      ASSERT_EQ(iter->value().compare(EncodeAsUint64(i + 1)), 0);
+      iter->Next();
+    }
+    ASSERT_FALSE(iter->Valid());
+    if (check_global_seqno) {
+      auto properties = reader.GetTableProperties();
+      ASSERT_TRUE(properties);
+      std::string hostname;
+      ASSERT_OK(env_->GetHostNameString(&hostname));
+      ASSERT_EQ(properties->db_host_id, hostname);
+      auto& user_properties = properties->user_collected_properties;
+      ASSERT_TRUE(
+          user_properties.count(ExternalSstFilePropertyNames::kGlobalSeqno));
+    }
+  }
+
+  void CreateFileAndCheck(const std::vector<std::string>& keys) {
+    CreateFile(sst_name_, keys);
+    CheckFile(sst_name_, keys);
+  }
+
+ protected:
+  Options options_;
+  EnvOptions soptions_;
+  std::string sst_name_;
+  std::shared_ptr<Env> env_guard_;
+  Env* env_;
+};
+
+const uint64_t kNumKeys = 100;
+
+TEST_F(SstFileReaderTest, Basic) {
+  std::vector<std::string> keys;
+  for (uint64_t i = 0; i < kNumKeys; i++) {
+    keys.emplace_back(EncodeAsString(i));
+  }
+  CreateFileAndCheck(keys);
+}
+
+TEST_F(SstFileReaderTest, Uint64Comparator) {
+  options_.comparator = test::Uint64Comparator();
+  std::vector<std::string> keys;
+  for (uint64_t i = 0; i < kNumKeys; i++) {
+    keys.emplace_back(EncodeAsUint64(i));
+  }
+  CreateFileAndCheck(keys);
+}
+
+TEST_F(SstFileReaderTest, ReadOptionsOutOfScope) {
+  // Repro a bug where the SstFileReader depended on its configured ReadOptions
+  // outliving it.
+  options_.comparator = test::Uint64Comparator();
+  std::vector<std::string> keys;
+  for (uint64_t i = 0; i < kNumKeys; i++) {
+    keys.emplace_back(EncodeAsUint64(i));
+  }
+  CreateFile(sst_name_, keys);
+
+  SstFileReader reader(options_);
+  ASSERT_OK(reader.Open(sst_name_));
+  std::unique_ptr<Iterator> iter;
+  {
+    // Make sure ReadOptions go out of scope ASAP so we know the iterator
+    // operations do not depend on it.
+    ReadOptions ropts;
+    iter.reset(reader.NewIterator(ropts));
+  }
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    iter->Next();
+  }
+}
+
+TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) {
+  std::vector<std::string> keys;
+  for (uint64_t i = 0; i < kNumKeys; i++) {
+    keys.emplace_back(EncodeAsString(i));
+  }
+  // Generate a SST file.
+  CreateFile(sst_name_, keys);
+
+  // Ingest the file into a db, to assign it a global sequence number.
+  Options options;
+  options.create_if_missing = true;
+  std::string db_name = test::PerThreadDBPath("test_db");
+  DB* db;
+  ASSERT_OK(DB::Open(options, db_name, &db));
+  // Bump sequence number.
+  ASSERT_OK(db->Put(WriteOptions(), keys[0], "foo"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+  // Ingest the file.
+  IngestExternalFileOptions ingest_options;
+  ingest_options.write_global_seqno = true;
+  ASSERT_OK(db->IngestExternalFile({sst_name_}, ingest_options));
+  std::vector<std::string> live_files;
+  uint64_t manifest_file_size = 0;
+  ASSERT_OK(db->GetLiveFiles(live_files, &manifest_file_size));
+  // Get the ingested file.
+  std::string ingested_file;
+  for (auto& live_file : live_files) {
+    if (live_file.substr(live_file.size() - 4, std::string::npos) == ".sst") {
+      if (ingested_file.empty() || ingested_file < live_file) {
+        ingested_file = live_file;
+      }
+    }
+  }
+  ASSERT_FALSE(ingested_file.empty());
+  delete db;
+
+  // Verify the file can be open and read by SstFileReader.
+  CheckFile(db_name + ingested_file, keys, true /* check_global_seqno */);
+
+  // Cleanup.
+  ASSERT_OK(DestroyDB(db_name, options));
+}
+
+TEST_F(SstFileReaderTest, TimestampSizeMismatch) {
+  SstFileWriter writer(soptions_, options_);
+
+  ASSERT_OK(writer.Open(sst_name_));
+
+  // Comparator is not timestamp-aware; calls to APIs taking timestamps should
+  // fail.
+  ASSERT_NOK(writer.Put("key", EncodeAsUint64(100), "value"));
+  ASSERT_NOK(writer.Delete("another_key", EncodeAsUint64(200)));
+}
+
+class SstFileReaderTimestampTest : public testing::Test {
+ public:
+  SstFileReaderTimestampTest() {
+    Env* env = Env::Default();
+    EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard_));
+    EXPECT_NE(nullptr, env);
+
+    options_.env = env;
+
+    options_.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+    sst_name_ = test::PerThreadDBPath("sst_file_ts");
+  }
+
+  ~SstFileReaderTimestampTest() {
+    EXPECT_OK(options_.env->DeleteFile(sst_name_));
+  }
+
+  struct KeyValueDesc {
+    KeyValueDesc(std::string k, std::string ts, std::string v)
+        : key(std::move(k)), timestamp(std::move(ts)), value(std::move(v)) {}
+
+    std::string key;
+    std::string timestamp;
+    std::string value;
+  };
+
+  struct InputKeyValueDesc : public KeyValueDesc {
+    InputKeyValueDesc(std::string k, std::string ts, std::string v, bool is_del,
+                      bool use_contig_buf)
+        : KeyValueDesc(std::move(k), std::move(ts), std::move(v)),
+          is_delete(is_del),
+          use_contiguous_buffer(use_contig_buf) {}
+
+    bool is_delete = false;
+    bool use_contiguous_buffer = false;
+  };
+
+  struct OutputKeyValueDesc : public KeyValueDesc {
+    OutputKeyValueDesc(std::string k, std::string ts, std::string v)
+        : KeyValueDesc(std::move(k), std::string(ts), std::string(v)) {}
+  };
+
+  void CreateFile(const std::vector<InputKeyValueDesc>& descs) {
+    SstFileWriter writer(soptions_, options_);
+
+    ASSERT_OK(writer.Open(sst_name_));
+
+    for (const auto& desc : descs) {
+      if (desc.is_delete) {
+        if (desc.use_contiguous_buffer) {
+          std::string key_with_ts(desc.key + desc.timestamp);
+          ASSERT_OK(writer.Delete(Slice(key_with_ts.data(), desc.key.size()),
+                                  Slice(key_with_ts.data() + desc.key.size(),
+                                        desc.timestamp.size())));
+        } else {
+          ASSERT_OK(writer.Delete(desc.key, desc.timestamp));
+        }
+      } else {
+        if (desc.use_contiguous_buffer) {
+          std::string key_with_ts(desc.key + desc.timestamp);
+          ASSERT_OK(writer.Put(Slice(key_with_ts.data(), desc.key.size()),
+                               Slice(key_with_ts.data() + desc.key.size(),
+                                     desc.timestamp.size()),
+                               desc.value));
+        } else {
+          ASSERT_OK(writer.Put(desc.key, desc.timestamp, desc.value));
+        }
+      }
+    }
+
+    ASSERT_OK(writer.Finish());
+  }
+
+  void CheckFile(const std::string& timestamp,
+                 const std::vector<OutputKeyValueDesc>& descs) {
+    SstFileReader reader(options_);
+
+    ASSERT_OK(reader.Open(sst_name_));
+    ASSERT_OK(reader.VerifyChecksum());
+
+    Slice ts_slice(timestamp);
+
+    ReadOptions read_options;
+    read_options.timestamp = &ts_slice;
+
+    std::unique_ptr<Iterator> iter(reader.NewIterator(read_options));
+    iter->SeekToFirst();
+
+    for (const auto& desc : descs) {
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ(iter->key(), desc.key);
+      ASSERT_EQ(iter->timestamp(), desc.timestamp);
+      ASSERT_EQ(iter->value(), desc.value);
+
+      iter->Next();
+    }
+
+    ASSERT_FALSE(iter->Valid());
+  }
+
+ protected:
+  std::shared_ptr<Env> env_guard_;
+  Options options_;
+  EnvOptions soptions_;
+  std::string sst_name_;
+};
+
+TEST_F(SstFileReaderTimestampTest, Basic) {
+  std::vector<InputKeyValueDesc> input_descs;
+
+  for (uint64_t k = 0; k < kNumKeys; k += 4) {
+    // A Put with key k, timestamp k that gets overwritten by a subsequent Put
+    // with timestamp (k + 1). Note that the comparator uses descending order
+    // for the timestamp part, so we add the later Put first.
+    input_descs.emplace_back(
+        /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(k + 1),
+        /* value */ EncodeAsString(k * 2), /* is_delete */ false,
+        /* use_contiguous_buffer */ false);
+    input_descs.emplace_back(
+        /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(k),
+        /* value */ EncodeAsString(k * 3), /* is_delete */ false,
+        /* use_contiguous_buffer */ true);
+
+    // A Put with key (k + 2), timestamp (k + 2) that gets cancelled out by a
+    // Delete with timestamp (k + 3).  Note that the comparator uses descending
+    // order for the timestamp part, so we add the Delete first.
+    input_descs.emplace_back(/* key */ EncodeAsString(k + 2),
+                             /* timestamp */ EncodeAsUint64(k + 3),
+                             /* value */ std::string(), /* is_delete */ true,
+                             /* use_contiguous_buffer */ (k % 8) == 0);
+    input_descs.emplace_back(
+        /* key */ EncodeAsString(k + 2), /* timestamp */ EncodeAsUint64(k + 2),
+        /* value */ EncodeAsString(k * 5), /* is_delete */ false,
+        /* use_contiguous_buffer */ (k % 8) != 0);
+  }
+
+  CreateFile(input_descs);
+
+  // Note: below, we check the results as of each timestamp in the range,
+  // updating the expected result as needed.
+  std::vector<OutputKeyValueDesc> output_descs;
+
+  for (uint64_t ts = 0; ts < kNumKeys; ++ts) {
+    const uint64_t k = ts - (ts % 4);
+
+    switch (ts % 4) {
+      case 0:  // Initial Put for key k
+        output_descs.emplace_back(/* key */ EncodeAsString(k),
+                                  /* timestamp */ EncodeAsUint64(ts),
+                                  /* value */ EncodeAsString(k * 3));
+        break;
+
+      case 1:  // Second Put for key k
+        assert(output_descs.back().key == EncodeAsString(k));
+        assert(output_descs.back().timestamp == EncodeAsUint64(ts - 1));
+        assert(output_descs.back().value == EncodeAsString(k * 3));
+        output_descs.back().timestamp = EncodeAsUint64(ts);
+        output_descs.back().value = EncodeAsString(k * 2);
+        break;
+
+      case 2:  // Put for key (k + 2)
+        output_descs.emplace_back(/* key */ EncodeAsString(k + 2),
+                                  /* timestamp */ EncodeAsUint64(ts),
+                                  /* value */ EncodeAsString(k * 5));
+        break;
+
+      case 3:  // Delete for key (k + 2)
+        assert(output_descs.back().key == EncodeAsString(k + 2));
+        assert(output_descs.back().timestamp == EncodeAsUint64(ts - 1));
+        assert(output_descs.back().value == EncodeAsString(k * 5));
+        output_descs.pop_back();
+        break;
+    }
+
+    CheckFile(EncodeAsUint64(ts), output_descs);
+  }
+}
+
+TEST_F(SstFileReaderTimestampTest, TimestampsOutOfOrder) {
+  SstFileWriter writer(soptions_, options_);
+
+  ASSERT_OK(writer.Open(sst_name_));
+
+  // Note: KVs that have the same user key disregarding timestamps should be in
+  // descending order of timestamps.
+  ASSERT_OK(writer.Put("key", EncodeAsUint64(1), "value1"));
+  ASSERT_NOK(writer.Put("key", EncodeAsUint64(2), "value2"));
+}
+
+TEST_F(SstFileReaderTimestampTest, TimestampSizeMismatch) {
+  SstFileWriter writer(soptions_, options_);
+
+  ASSERT_OK(writer.Open(sst_name_));
+
+  // Comparator expects 64-bit timestamps; timestamps with other sizes as well
+  // as calls to the timestamp-less APIs should be rejected.
+  ASSERT_NOK(writer.Put("key", "not_an_actual_64_bit_timestamp", "value"));
+  ASSERT_NOK(writer.Delete("another_key", "timestamp_of_unexpected_size"));
+
+  ASSERT_NOK(writer.Put("key_without_timestamp", "value"));
+  ASSERT_NOK(writer.Merge("another_key_missing_a_timestamp", "merge_operand"));
+  ASSERT_NOK(writer.Delete("yet_another_key_still_no_timestamp"));
+  ASSERT_NOK(writer.DeleteRange("begin_key_timestamp_absent",
+                                "end_key_with_a_complete_lack_of_timestamps"));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as SstFileReader is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/sst_file_writer.cc b/src/rocksdb/table/sst_file_writer.cc
new file mode 100644
index 000000000..16d11efd3
--- /dev/null
+++ b/src/rocksdb/table/sst_file_writer.cc
@@ -0,0 +1,427 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/sst_file_writer.h"
+
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/sst_file_writer_collectors.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string ExternalSstFilePropertyNames::kVersion =
+    "rocksdb.external_sst_file.version";
+const std::string ExternalSstFilePropertyNames::kGlobalSeqno =
+    "rocksdb.external_sst_file.global_seqno";
+
+#ifndef ROCKSDB_LITE
+
+const size_t kFadviseTrigger = 1024 * 1024;  // 1MB
+
+struct SstFileWriter::Rep {
+  Rep(const EnvOptions& _env_options, const Options& options,
+      Env::IOPriority _io_priority, const Comparator* _user_comparator,
+      ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters,
+      std::string _db_session_id)
+      : env_options(_env_options),
+        ioptions(options),
+        mutable_cf_options(options),
+        io_priority(_io_priority),
+        internal_comparator(_user_comparator),
+        cfh(_cfh),
+        invalidate_page_cache(_invalidate_page_cache),
+        skip_filters(_skip_filters),
+        db_session_id(_db_session_id) {}
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  std::unique_ptr<TableBuilder> builder;
+  EnvOptions env_options;
+  ImmutableOptions ioptions;
+  MutableCFOptions mutable_cf_options;
+  Env::IOPriority io_priority;
+  InternalKeyComparator internal_comparator;
+  ExternalSstFileInfo file_info;
+  InternalKey ikey;
+  std::string column_family_name;
+  ColumnFamilyHandle* cfh;
+  // If true, We will give the OS a hint that this file pages is not needed
+  // every time we write 1MB to the file.
+  bool invalidate_page_cache;
+  // The size of the file during the last time we called Fadvise to remove
+  // cached pages from page cache.
+  uint64_t last_fadvise_size = 0;
+  bool skip_filters;
+  std::string db_session_id;
+  uint64_t next_file_number = 1;
+
+  Status AddImpl(const Slice& user_key, const Slice& value,
+                 ValueType value_type) {
+    if (!builder) {
+      return Status::InvalidArgument("File is not opened");
+    }
+
+    if (file_info.num_entries == 0) {
+      file_info.smallest_key.assign(user_key.data(), user_key.size());
+    } else {
+      if (internal_comparator.user_comparator()->Compare(
+              user_key, file_info.largest_key) <= 0) {
+        // Make sure that keys are added in order
+        return Status::InvalidArgument(
+            "Keys must be added in strict ascending order.");
+      }
+    }
+
+    assert(value_type == kTypeValue || value_type == kTypeMerge ||
+           value_type == kTypeDeletion ||
+           value_type == kTypeDeletionWithTimestamp);
+
+    constexpr SequenceNumber sequence_number = 0;
+
+    ikey.Set(user_key, sequence_number, value_type);
+
+    builder->Add(ikey.Encode(), value);
+
+    // update file info
+    file_info.num_entries++;
+    file_info.largest_key.assign(user_key.data(), user_key.size());
+    file_info.file_size = builder->FileSize();
+
+    InvalidatePageCache(false /* closing */).PermitUncheckedError();
+    return Status::OK();
+  }
+
+  Status Add(const Slice& user_key, const Slice& value, ValueType value_type) {
+    if (internal_comparator.user_comparator()->timestamp_size() != 0) {
+      return Status::InvalidArgument("Timestamp size mismatch");
+    }
+
+    return AddImpl(user_key, value, value_type);
+  }
+
+  Status Add(const Slice& user_key, const Slice& timestamp, const Slice& value,
+             ValueType value_type) {
+    const size_t timestamp_size = timestamp.size();
+
+    if (internal_comparator.user_comparator()->timestamp_size() !=
+        timestamp_size) {
+      return Status::InvalidArgument("Timestamp size mismatch");
+    }
+
+    const size_t user_key_size = user_key.size();
+
+    if (user_key.data() + user_key_size == timestamp.data()) {
+      Slice user_key_with_ts(user_key.data(), user_key_size + timestamp_size);
+      return AddImpl(user_key_with_ts, value, value_type);
+    }
+
+    std::string user_key_with_ts;
+    user_key_with_ts.reserve(user_key_size + timestamp_size);
+    user_key_with_ts.append(user_key.data(), user_key_size);
+    user_key_with_ts.append(timestamp.data(), timestamp_size);
+
+    return AddImpl(user_key_with_ts, value, value_type);
+  }
+
+  Status DeleteRangeImpl(const Slice& begin_key, const Slice& end_key) {
+    if (!builder) {
+      return Status::InvalidArgument("File is not opened");
+    }
+    RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */);
+    if (file_info.num_range_del_entries == 0) {
+      file_info.smallest_range_del_key.assign(tombstone.start_key_.data(),
+                                              tombstone.start_key_.size());
+      file_info.largest_range_del_key.assign(tombstone.end_key_.data(),
+                                             tombstone.end_key_.size());
+    } else {
+      if (internal_comparator.user_comparator()->Compare(
+              tombstone.start_key_, file_info.smallest_range_del_key) < 0) {
+        file_info.smallest_range_del_key.assign(tombstone.start_key_.data(),
+                                                tombstone.start_key_.size());
+      }
+      if (internal_comparator.user_comparator()->Compare(
+              tombstone.end_key_, file_info.largest_range_del_key) > 0) {
+        file_info.largest_range_del_key.assign(tombstone.end_key_.data(),
+                                               tombstone.end_key_.size());
+      }
+    }
+
+    auto ikey_and_end_key = tombstone.Serialize();
+    builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
+
+    // update file info
+    file_info.num_range_del_entries++;
+    file_info.file_size = builder->FileSize();
+
+    InvalidatePageCache(false /* closing */).PermitUncheckedError();
+    return Status::OK();
+  }
+
+  Status DeleteRange(const Slice& begin_key, const Slice& end_key) {
+    if (internal_comparator.user_comparator()->timestamp_size() != 0) {
+      return Status::InvalidArgument("Timestamp size mismatch");
+    }
+    return DeleteRangeImpl(begin_key, end_key);
+  }
+
+  // begin_key and end_key should be users keys without timestamp.
+  Status DeleteRange(const Slice& begin_key, const Slice& end_key,
+                     const Slice& timestamp) {
+    const size_t timestamp_size = timestamp.size();
+
+    if (internal_comparator.user_comparator()->timestamp_size() !=
+        timestamp_size) {
+      return Status::InvalidArgument("Timestamp size mismatch");
+    }
+
+    const size_t begin_key_size = begin_key.size();
+    const size_t end_key_size = end_key.size();
+    if (begin_key.data() + begin_key_size == timestamp.data() ||
+        end_key.data() + begin_key_size == timestamp.data()) {
+      assert(memcmp(begin_key.data() + begin_key_size,
+                    end_key.data() + end_key_size, timestamp_size) == 0);
+      Slice begin_key_with_ts(begin_key.data(),
+                              begin_key_size + timestamp_size);
+      Slice end_key_with_ts(end_key.data(), end_key.size() + timestamp_size);
+      return DeleteRangeImpl(begin_key_with_ts, end_key_with_ts);
+    }
+    std::string begin_key_with_ts;
+    begin_key_with_ts.reserve(begin_key_size + timestamp_size);
+    begin_key_with_ts.append(begin_key.data(), begin_key_size);
+    begin_key_with_ts.append(timestamp.data(), timestamp_size);
+    std::string end_key_with_ts;
+    end_key_with_ts.reserve(end_key_size + timestamp_size);
+    end_key_with_ts.append(end_key.data(), end_key_size);
+    end_key_with_ts.append(timestamp.data(), timestamp_size);
+    return DeleteRangeImpl(begin_key_with_ts, end_key_with_ts);
+  }
+
+  Status InvalidatePageCache(bool closing) {
+    Status s = Status::OK();
+    if (invalidate_page_cache == false) {
+      // Fadvise disabled
+      return s;
+    }
+    uint64_t bytes_since_last_fadvise = builder->FileSize() - last_fadvise_size;
+    if (bytes_since_last_fadvise > kFadviseTrigger || closing) {
+      TEST_SYNC_POINT_CALLBACK("SstFileWriter::Rep::InvalidatePageCache",
+                               &(bytes_since_last_fadvise));
+      // Tell the OS that we don't need this file in page cache
+      s = file_writer->InvalidateCache(0, 0);
+      if (s.IsNotSupported()) {
+        // NotSupported is fine as it could be a file type that doesn't use page
+        // cache.
+        s = Status::OK();
+      }
+      last_fadvise_size = builder->FileSize();
+    }
+    return s;
+  }
+};
+
+SstFileWriter::SstFileWriter(const EnvOptions& env_options,
+                             const Options& options,
+                             const Comparator* user_comparator,
+                             ColumnFamilyHandle* column_family,
+                             bool invalidate_page_cache,
+                             Env::IOPriority io_priority, bool skip_filters)
+    : rep_(new Rep(env_options, options, io_priority, user_comparator,
+                   column_family, invalidate_page_cache, skip_filters,
+                   DBImpl::GenerateDbSessionId(options.env))) {
+  // SstFileWriter is used to create sst files that can be added to database
+  // later. Therefore, no real db_id and db_session_id are associated with it.
+  // Here we mimic the way db_session_id behaves by getting a db_session_id
+  // for each SstFileWriter, and (later below) assign unique file numbers
+  // in the table properties. The db_id is set to be "SST Writer" for clarity.
+
+  rep_->file_info.file_size = 0;
+}
+
+SstFileWriter::~SstFileWriter() {
+  if (rep_->builder) {
+    // User did not call Finish() or Finish() failed, we need to
+    // abandon the builder.
+    rep_->builder->Abandon();
+  }
+}
+
+Status SstFileWriter::Open(const std::string& file_path) {
+  Rep* r = rep_.get();
+  Status s;
+  std::unique_ptr<FSWritableFile> sst_file;
+  FileOptions cur_file_opts(r->env_options);
+  s = r->ioptions.env->GetFileSystem()->NewWritableFile(
+      file_path, cur_file_opts, &sst_file, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+
+  sst_file->SetIOPriority(r->io_priority);
+
+  CompressionType compression_type;
+  CompressionOptions compression_opts;
+  if (r->mutable_cf_options.bottommost_compression !=
+      kDisableCompressionOption) {
+    compression_type = r->mutable_cf_options.bottommost_compression;
+    if (r->mutable_cf_options.bottommost_compression_opts.enabled) {
+      compression_opts = r->mutable_cf_options.bottommost_compression_opts;
+    } else {
+      compression_opts = r->mutable_cf_options.compression_opts;
+    }
+  } else if (!r->mutable_cf_options.compression_per_level.empty()) {
+    // Use the compression of the last level if we have per level compression
+    compression_type = *(r->mutable_cf_options.compression_per_level.rbegin());
+    compression_opts = r->mutable_cf_options.compression_opts;
+  } else {
+    compression_type = r->mutable_cf_options.compression;
+    compression_opts = r->mutable_cf_options.compression_opts;
+  }
+
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+  // SstFileWriter properties collector to add SstFileWriter version.
+  int_tbl_prop_collector_factories.emplace_back(
+      new SstFileWriterPropertiesCollectorFactory(2 /* version */,
+                                                  0 /* global_seqno*/));
+
+  // User collector factories
+  auto user_collector_factories =
+      r->ioptions.table_properties_collector_factories;
+  for (size_t i = 0; i < user_collector_factories.size(); i++) {
+    int_tbl_prop_collector_factories.emplace_back(
+        new UserKeyTablePropertiesCollectorFactory(
+            user_collector_factories[i]));
+  }
+  int unknown_level = -1;
+  uint32_t cf_id;
+
+  if (r->cfh != nullptr) {
+    // user explicitly specified that this file will be ingested into cfh,
+    // we can persist this information in the file.
+    cf_id = r->cfh->GetID();
+    r->column_family_name = r->cfh->GetName();
+  } else {
+    r->column_family_name = "";
+    cf_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+  }
+
+  // TODO: it would be better to set oldest_key_time to be used for getting the
+  //  approximate time of ingested keys.
+  TableBuilderOptions table_builder_options(
+      r->ioptions, r->mutable_cf_options, r->internal_comparator,
+      &int_tbl_prop_collector_factories, compression_type, compression_opts,
+      cf_id, r->column_family_name, unknown_level, false /* is_bottommost */,
+      TableFileCreationReason::kMisc, 0 /* oldest_key_time */,
+      0 /* file_creation_time */, "SST Writer" /* db_id */, r->db_session_id,
+      0 /* target_file_size */, r->next_file_number);
+  // External SST files used to each get a unique session id. Now for
+  // slightly better uniqueness probability in constructing cache keys, we
+  // assign fake file numbers to each file (into table properties) and keep
+  // the same session id for the life of the SstFileWriter.
+  r->next_file_number++;
+  // XXX: when we can remove skip_filters from the SstFileWriter public API
+  // we can remove it from TableBuilderOptions.
+  table_builder_options.skip_filters = r->skip_filters;
+  FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types;
+  r->file_writer.reset(new WritableFileWriter(
+      std::move(sst_file), file_path, r->env_options, r->ioptions.clock,
+      nullptr /* io_tracer */, nullptr /* stats */, r->ioptions.listeners,
+      r->ioptions.file_checksum_gen_factory.get(),
+      tmp_set.Contains(FileType::kTableFile), false));
+
+  // TODO(tec) : If table_factory is using compressed block cache, we will
+  // be adding the external sst file blocks into it, which is wasteful.
+  r->builder.reset(r->ioptions.table_factory->NewTableBuilder(
+      table_builder_options, r->file_writer.get()));
+
+  r->file_info = ExternalSstFileInfo();
+  r->file_info.file_path = file_path;
+  r->file_info.version = 2;
+  return s;
+}
+
+Status SstFileWriter::Add(const Slice& user_key, const Slice& value) {
+  return rep_->Add(user_key, value, ValueType::kTypeValue);
+}
+
+Status SstFileWriter::Put(const Slice& user_key, const Slice& value) {
+  return rep_->Add(user_key, value, ValueType::kTypeValue);
+}
+
+Status SstFileWriter::Put(const Slice& user_key, const Slice& timestamp,
+                          const Slice& value) {
+  return rep_->Add(user_key, timestamp, value, ValueType::kTypeValue);
+}
+
+Status SstFileWriter::Merge(const Slice& user_key, const Slice& value) {
+  return rep_->Add(user_key, value, ValueType::kTypeMerge);
+}
+
+Status SstFileWriter::Delete(const Slice& user_key) {
+  return rep_->Add(user_key, Slice(), ValueType::kTypeDeletion);
+}
+
+Status SstFileWriter::Delete(const Slice& user_key, const Slice& timestamp) {
+  return rep_->Add(user_key, timestamp, Slice(),
+                   ValueType::kTypeDeletionWithTimestamp);
+}
+
+Status SstFileWriter::DeleteRange(const Slice& begin_key,
+                                  const Slice& end_key) {
+  return rep_->DeleteRange(begin_key, end_key);
+}
+
+Status SstFileWriter::DeleteRange(const Slice& begin_key, const Slice& end_key,
+                                  const Slice& timestamp) {
+  return rep_->DeleteRange(begin_key, end_key, timestamp);
+}
+
+Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
+  Rep* r = rep_.get();
+  if (!r->builder) {
+    return Status::InvalidArgument("File is not opened");
+  }
+  if (r->file_info.num_entries == 0 &&
+      r->file_info.num_range_del_entries == 0) {
+    return Status::InvalidArgument("Cannot create sst file with no entries");
+  }
+
+  Status s = r->builder->Finish();
+  r->file_info.file_size = r->builder->FileSize();
+
+  if (s.ok()) {
+    s = r->file_writer->Sync(r->ioptions.use_fsync);
+    r->InvalidatePageCache(true /* closing */).PermitUncheckedError();
+    if (s.ok()) {
+      s = r->file_writer->Close();
+    }
+  }
+  if (s.ok()) {
+    r->file_info.file_checksum = r->file_writer->GetFileChecksum();
+    r->file_info.file_checksum_func_name =
+        r->file_writer->GetFileChecksumFuncName();
+  }
+  if (!s.ok()) {
+    r->ioptions.env->DeleteFile(r->file_info.file_path);
+  }
+
+  if (file_info != nullptr) {
+    *file_info = r->file_info;
+  }
+
+  r->builder.reset();
+  return s;
+}
+
+uint64_t SstFileWriter::FileSize() { return rep_->file_info.file_size; }
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/sst_file_writer_collectors.h b/src/rocksdb/table/sst_file_writer_collectors.h
new file mode 100644
index 000000000..486315fb5
--- /dev/null
+++ b/src/rocksdb/table/sst_file_writer_collectors.h
@@ -0,0 +1,95 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+
+#include "db/table_properties_collector.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Table Properties that are specific to tables created by SstFileWriter.
+struct ExternalSstFilePropertyNames {
+  // value of this property is a fixed uint32 number.
+  static const std::string kVersion;
+  // value of this property is a fixed uint64 number.
+  static const std::string kGlobalSeqno;
+};
+
+// PropertiesCollector used to add properties specific to tables
+// generated by SstFileWriter
+class SstFileWriterPropertiesCollector : public IntTblPropCollector {
+ public:
+  explicit SstFileWriterPropertiesCollector(int32_t version,
+                                            SequenceNumber global_seqno)
+      : version_(version), global_seqno_(global_seqno) {}
+
+  virtual Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
+                             uint64_t /*file_size*/) override {
+    // Intentionally left blank. Have no interest in collecting stats for
+    // individual key/value pairs.
+    return Status::OK();
+  }
+
+  virtual void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                        uint64_t /* block_compressed_bytes_fast */,
+                        uint64_t /* block_compressed_bytes_slow */) override {
+    // Intentionally left blank. No interest in collecting stats for
+    // blocks.
+    return;
+  }
+
+  virtual Status Finish(UserCollectedProperties* properties) override {
+    // File version
+    std::string version_val;
+    PutFixed32(&version_val, static_cast<uint32_t>(version_));
+    properties->insert({ExternalSstFilePropertyNames::kVersion, version_val});
+
+    // Global Sequence number
+    std::string seqno_val;
+    PutFixed64(&seqno_val, static_cast<uint64_t>(global_seqno_));
+    properties->insert({ExternalSstFilePropertyNames::kGlobalSeqno, seqno_val});
+
+    return Status::OK();
+  }
+
+  virtual const char* Name() const override {
+    return "SstFileWriterPropertiesCollector";
+  }
+
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return {{ExternalSstFilePropertyNames::kVersion, std::to_string(version_)}};
+  }
+
+ private:
+  int32_t version_;
+  SequenceNumber global_seqno_;
+};
+
+class SstFileWriterPropertiesCollectorFactory
+    : public IntTblPropCollectorFactory {
+ public:
+  explicit SstFileWriterPropertiesCollectorFactory(int32_t version,
+                                                   SequenceNumber global_seqno)
+      : version_(version), global_seqno_(global_seqno) {}
+
+  virtual IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t /*column_family_id*/, int /* level_at_creation */) override {
+    return new SstFileWriterPropertiesCollector(version_, global_seqno_);
+  }
+
+  virtual const char* Name() const override {
+    return "SstFileWriterPropertiesCollector";
+  }
+
+ private:
+  int32_t version_;
+  SequenceNumber global_seqno_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_builder.h b/src/rocksdb/table/table_builder.h
new file mode 100644
index 000000000..1790f33b1
--- /dev/null
+++ b/src/rocksdb/table/table_builder.h
@@ -0,0 +1,219 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/table_properties_collector.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "table/unique_id_impl.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Status;
+
+struct TableReaderOptions {
+  // @param skip_filters Disables loading/accessing the filter block
+  TableReaderOptions(
+      const ImmutableOptions& _ioptions,
+      const std::shared_ptr<const SliceTransform>& _prefix_extractor,
+      const EnvOptions& _env_options,
+      const InternalKeyComparator& _internal_comparator,
+      bool _skip_filters = false, bool _immortal = false,
+      bool _force_direct_prefetch = false, int _level = -1,
+      BlockCacheTracer* const _block_cache_tracer = nullptr,
+      size_t _max_file_size_for_l0_meta_pin = 0,
+      const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0,
+      UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0)
+      : ioptions(_ioptions),
+        prefix_extractor(_prefix_extractor),
+        env_options(_env_options),
+        internal_comparator(_internal_comparator),
+        skip_filters(_skip_filters),
+        immortal(_immortal),
+        force_direct_prefetch(_force_direct_prefetch),
+        level(_level),
+        largest_seqno(_largest_seqno),
+        block_cache_tracer(_block_cache_tracer),
+        max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin),
+        cur_db_session_id(_cur_db_session_id),
+        cur_file_num(_cur_file_num),
+        unique_id(_unique_id) {}
+
+  const ImmutableOptions& ioptions;
+  const std::shared_ptr<const SliceTransform>& prefix_extractor;
+  const EnvOptions& env_options;
+  const InternalKeyComparator& internal_comparator;
+  // This is only used for BlockBasedTable (reader)
+  bool skip_filters;
+  // Whether the table will be valid as long as the DB is open
+  bool immortal;
+  // When data prefetching is needed, even if direct I/O is off, read data to
+  // fetch into RocksDB's buffer, rather than relying
+  // RandomAccessFile::Prefetch().
+  bool force_direct_prefetch;
+  // What level this table/file is on, -1 for "not set, don't know." Used
+  // for level-specific statistics.
+  int level;
+  // largest seqno in the table (or 0 means unknown???)
+  SequenceNumber largest_seqno;
+  BlockCacheTracer* const block_cache_tracer;
+  // Largest L0 file size whose meta-blocks may be pinned (can be zero when
+  // unknown).
+  const size_t max_file_size_for_l0_meta_pin;
+
+  std::string cur_db_session_id;
+
+  uint64_t cur_file_num;
+
+  // Known unique_id or {}, kNullUniqueId64x2 means unknown
+  UniqueId64x2 unique_id;
+};
+
+struct TableBuilderOptions {
+  TableBuilderOptions(
+      const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions,
+      const InternalKeyComparator& _internal_comparator,
+      const IntTblPropCollectorFactories* _int_tbl_prop_collector_factories,
+      CompressionType _compression_type,
+      const CompressionOptions& _compression_opts, uint32_t _column_family_id,
+      const std::string& _column_family_name, int _level,
+      bool _is_bottommost = false,
+      TableFileCreationReason _reason = TableFileCreationReason::kMisc,
+      const int64_t _oldest_key_time = 0,
+      const uint64_t _file_creation_time = 0, const std::string& _db_id = "",
+      const std::string& _db_session_id = "",
+      const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0)
+      : ioptions(_ioptions),
+        moptions(_moptions),
+        internal_comparator(_internal_comparator),
+        int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories),
+        compression_type(_compression_type),
+        compression_opts(_compression_opts),
+        column_family_id(_column_family_id),
+        column_family_name(_column_family_name),
+        oldest_key_time(_oldest_key_time),
+        target_file_size(_target_file_size),
+        file_creation_time(_file_creation_time),
+        db_id(_db_id),
+        db_session_id(_db_session_id),
+        level_at_creation(_level),
+        is_bottommost(_is_bottommost),
+        reason(_reason),
+        cur_file_num(_cur_file_num) {}
+
+  const ImmutableOptions& ioptions;
+  const MutableCFOptions& moptions;
+  const InternalKeyComparator& internal_comparator;
+  const IntTblPropCollectorFactories* int_tbl_prop_collector_factories;
+  const CompressionType compression_type;
+  const CompressionOptions& compression_opts;
+  const uint32_t column_family_id;
+  const std::string& column_family_name;
+  const int64_t oldest_key_time;
+  const uint64_t target_file_size;
+  const uint64_t file_creation_time;
+  const std::string db_id;
+  const std::string db_session_id;
+  // BEGIN for FilterBuildingContext
+  const int level_at_creation;
+  const bool is_bottommost;
+  const TableFileCreationReason reason;
+  // END for FilterBuildingContext
+
+  // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you
+  // want to skip filters, that should be (for example) null filter_policy
+  // in the table options of the ioptions.table_factory
+  bool skip_filters = false;
+  const uint64_t cur_file_num;
+};
+
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
+//
+// Multiple threads can invoke const methods on a TableBuilder without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same TableBuilder must use
+// external synchronization.
+class TableBuilder {
+ public:
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  virtual ~TableBuilder() {}
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Add(const Slice& key, const Slice& value) = 0;
+
+  // Return non-ok iff some error has been detected.
+  virtual Status status() const = 0;
+
+  // Return non-ok iff some error happens during IO.
+  virtual IOStatus io_status() const = 0;
+
+  // Finish building the table.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual Status Finish() = 0;
+
+  // Indicate that the contents of this builder should be abandoned.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  virtual void Abandon() = 0;
+
+  // Number of calls to Add() so far.
+  virtual uint64_t NumEntries() const = 0;
+
+  // Whether the output file is completely empty. It has neither entries
+  // or tombstones.
+  virtual bool IsEmpty() const {
+    return NumEntries() == 0 && GetTableProperties().num_range_deletions == 0;
+  }
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  virtual uint64_t FileSize() const = 0;
+
+  // Estimated size of the file generated so far. This is used when
+  // FileSize() cannot estimate final SST size, e.g. parallel compression
+  // is enabled.
+  virtual uint64_t EstimatedFileSize() const { return FileSize(); }
+
+  // If the user defined table properties collector suggest the file to
+  // be further compacted.
+  virtual bool NeedCompact() const { return false; }
+
+  // Returns table properties
+  virtual TableProperties GetTableProperties() const = 0;
+
+  // Return file checksum
+  virtual std::string GetFileChecksum() const = 0;
+
+  // Return file checksum function name
+  virtual const char* GetFileChecksumFuncName() const = 0;
+
+  // Set the sequence number to time mapping
+  virtual void SetSeqnoTimeTableProperties(
+      const std::string& /*encoded_seqno_to_time_mapping*/,
+      uint64_t /*oldest_ancestor_time*/){};
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_factory.cc b/src/rocksdb/table/table_factory.cc
new file mode 100644
index 000000000..fc5c5ccde
--- /dev/null
+++ b/src/rocksdb/table/table_factory.cc
@@ -0,0 +1,65 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <mutex>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/plain/plain_table_factory.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static void RegisterTableFactories(const std::string& /*arg*/) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag loaded;
+  std::call_once(loaded, []() {
+    auto library = ObjectLibrary::Default();
+    library->AddFactory<TableFactory>(
+        TableFactory::kBlockBasedTableName(),
+        [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard,
+           std::string* /* errmsg */) {
+          guard->reset(new BlockBasedTableFactory());
+          return guard->get();
+        });
+    library->AddFactory<TableFactory>(
+        TableFactory::kPlainTableName(),
+        [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard,
+           std::string* /* errmsg */) {
+          guard->reset(new PlainTableFactory());
+          return guard->get();
+        });
+    library->AddFactory<TableFactory>(
+        TableFactory::kCuckooTableName(),
+        [](const std::string& /*uri*/, std::unique_ptr<TableFactory>* guard,
+           std::string* /* errmsg */) {
+          guard->reset(new CuckooTableFactory());
+          return guard->get();
+        });
+  });
+#endif  // ROCKSDB_LITE
+}
+
+static bool LoadFactory(const std::string& name,
+                        std::shared_ptr<TableFactory>* factory) {
+  if (name == TableFactory::kBlockBasedTableName()) {
+    factory->reset(new BlockBasedTableFactory());
+    return true;
+  } else {
+    return false;
+  }
+}
+
+Status TableFactory::CreateFromString(const ConfigOptions& config_options,
+                                      const std::string& value,
+                                      std::shared_ptr<TableFactory>* factory) {
+  RegisterTableFactories("");
+  return LoadSharedObject<TableFactory>(config_options, value, LoadFactory,
+                                        factory);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_properties.cc b/src/rocksdb/table/table_properties.cc
new file mode 100644
index 000000000..b382281f8
--- /dev/null
+++ b/src/rocksdb/table/table_properties.cc
@@ -0,0 +1,349 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/table_properties.h"
+
+#include "db/seqno_to_time_mapping.h"
+#include "port/malloc.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/unique_id.h"
+#include "table/table_properties_internal.h"
+#include "table/unique_id_impl.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint32_t TablePropertiesCollectorFactory::Context::kUnknownColumnFamily =
+    std::numeric_limits<int32_t>::max();
+
+namespace {
+void AppendProperty(std::string& props, const std::string& key,
+                    const std::string& value, const std::string& prop_delim,
+                    const std::string& kv_delim) {
+  props.append(key);
+  props.append(kv_delim);
+  props.append(value);
+  props.append(prop_delim);
+}
+
+template <class TValue>
+void AppendProperty(std::string& props, const std::string& key,
+                    const TValue& value, const std::string& prop_delim,
+                    const std::string& kv_delim) {
+  AppendProperty(props, key, std::to_string(value), prop_delim, kv_delim);
+}
+}  // namespace
+
+std::string TableProperties::ToString(const std::string& prop_delim,
+                                      const std::string& kv_delim) const {
+  std::string result;
+  result.reserve(1024);
+
+  // Basic Info
+  AppendProperty(result, "# data blocks", num_data_blocks, prop_delim,
+                 kv_delim);
+  AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);
+  AppendProperty(result, "# deletions", num_deletions, prop_delim, kv_delim);
+  AppendProperty(result, "# merge operands", num_merge_operands, prop_delim,
+                 kv_delim);
+  AppendProperty(result, "# range deletions", num_range_deletions, prop_delim,
+                 kv_delim);
+
+  AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
+  AppendProperty(result, "raw average key size",
+                 num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0,
+                 prop_delim, kv_delim);
+  AppendProperty(result, "raw value size", raw_value_size, prop_delim,
+                 kv_delim);
+  AppendProperty(result, "raw average value size",
+                 num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0,
+                 prop_delim, kv_delim);
+
+  AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
+  char index_block_size_str[80];
+  snprintf(index_block_size_str, sizeof(index_block_size_str),
+           "index block size (user-key? %d, delta-value? %d)",
+           static_cast<int>(index_key_is_user_key),
+           static_cast<int>(index_value_is_delta_encoded));
+  AppendProperty(result, index_block_size_str, index_size, prop_delim,
+                 kv_delim);
+  if (index_partitions != 0) {
+    AppendProperty(result, "# index partitions", index_partitions, prop_delim,
+                   kv_delim);
+    AppendProperty(result, "top-level index size", top_level_index_size,
+                   prop_delim, kv_delim);
+  }
+  AppendProperty(result, "filter block size", filter_size, prop_delim,
+                 kv_delim);
+  AppendProperty(result, "# entries for filter", num_filter_entries, prop_delim,
+                 kv_delim);
+  AppendProperty(result, "(estimated) table size",
+                 data_size + index_size + filter_size, prop_delim, kv_delim);
+
+  AppendProperty(
+      result, "filter policy name",
+      filter_policy_name.empty() ? std::string("N/A") : filter_policy_name,
+      prop_delim, kv_delim);
+
+  AppendProperty(result, "prefix extractor name",
+                 prefix_extractor_name.empty() ? std::string("N/A")
+                                               : prefix_extractor_name,
+                 prop_delim, kv_delim);
+
+  AppendProperty(result, "column family ID",
+                 column_family_id ==
+                         ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::
+                             Context::kUnknownColumnFamily
+                     ? std::string("N/A")
+                     : std::to_string(column_family_id),
+                 prop_delim, kv_delim);
+  AppendProperty(
+      result, "column family name",
+      column_family_name.empty() ? std::string("N/A") : column_family_name,
+      prop_delim, kv_delim);
+
+  AppendProperty(result, "comparator name",
+                 comparator_name.empty() ? std::string("N/A") : comparator_name,
+                 prop_delim, kv_delim);
+
+  AppendProperty(
+      result, "merge operator name",
+      merge_operator_name.empty() ? std::string("N/A") : merge_operator_name,
+      prop_delim, kv_delim);
+
+  AppendProperty(result, "property collectors names",
+                 property_collectors_names.empty() ? std::string("N/A")
+                                                   : property_collectors_names,
+                 prop_delim, kv_delim);
+
+  AppendProperty(
+      result, "SST file compression algo",
+      compression_name.empty() ? std::string("N/A") : compression_name,
+      prop_delim, kv_delim);
+
+  AppendProperty(
+      result, "SST file compression options",
+      compression_options.empty() ? std::string("N/A") : compression_options,
+      prop_delim, kv_delim);
+
+  AppendProperty(result, "creation time", creation_time, prop_delim, kv_delim);
+
+  AppendProperty(result, "time stamp of earliest key", oldest_key_time,
+                 prop_delim, kv_delim);
+
+  AppendProperty(result, "file creation time", file_creation_time, prop_delim,
+                 kv_delim);
+
+  AppendProperty(result, "slow compression estimated data size",
+                 slow_compression_estimated_data_size, prop_delim, kv_delim);
+  AppendProperty(result, "fast compression estimated data size",
+                 fast_compression_estimated_data_size, prop_delim, kv_delim);
+
+  // DB identity and DB session ID
+  AppendProperty(result, "DB identity", db_id, prop_delim, kv_delim);
+  AppendProperty(result, "DB session identity", db_session_id, prop_delim,
+                 kv_delim);
+  AppendProperty(result, "DB host id", db_host_id, prop_delim, kv_delim);
+  AppendProperty(result, "original file number", orig_file_number, prop_delim,
+                 kv_delim);
+
+  // Unique ID, when available
+  std::string id;
+  Status s = GetUniqueIdFromTableProperties(*this, &id);
+  AppendProperty(result, "unique ID",
+                 s.ok() ? UniqueIdToHumanString(id) : "N/A", prop_delim,
+                 kv_delim);
+
+  SeqnoToTimeMapping seq_time_mapping;
+  s = seq_time_mapping.Add(seqno_to_time_mapping);
+  AppendProperty(result, "Sequence number to time mapping",
+                 s.ok() ? seq_time_mapping.ToHumanString() : "N/A", prop_delim,
+                 kv_delim);
+
+  return result;
+}
+
+void TableProperties::Add(const TableProperties& tp) {
+  data_size += tp.data_size;
+  index_size += tp.index_size;
+  index_partitions += tp.index_partitions;
+  top_level_index_size += tp.top_level_index_size;
+  index_key_is_user_key += tp.index_key_is_user_key;
+  index_value_is_delta_encoded += tp.index_value_is_delta_encoded;
+  filter_size += tp.filter_size;
+  raw_key_size += tp.raw_key_size;
+  raw_value_size += tp.raw_value_size;
+  num_data_blocks += tp.num_data_blocks;
+  num_entries += tp.num_entries;
+  num_filter_entries += tp.num_filter_entries;
+  num_deletions += tp.num_deletions;
+  num_merge_operands += tp.num_merge_operands;
+  num_range_deletions += tp.num_range_deletions;
+  slow_compression_estimated_data_size +=
+      tp.slow_compression_estimated_data_size;
+  fast_compression_estimated_data_size +=
+      tp.fast_compression_estimated_data_size;
+}
+
+std::map<std::string, uint64_t>
+TableProperties::GetAggregatablePropertiesAsMap() const {
+  std::map<std::string, uint64_t> rv;
+  rv["data_size"] = data_size;
+  rv["index_size"] = index_size;
+  rv["index_partitions"] = index_partitions;
+  rv["top_level_index_size"] = top_level_index_size;
+  rv["filter_size"] = filter_size;
+  rv["raw_key_size"] = raw_key_size;
+  rv["raw_value_size"] = raw_value_size;
+  rv["num_data_blocks"] = num_data_blocks;
+  rv["num_entries"] = num_entries;
+  rv["num_filter_entries"] = num_filter_entries;
+  rv["num_deletions"] = num_deletions;
+  rv["num_merge_operands"] = num_merge_operands;
+  rv["num_range_deletions"] = num_range_deletions;
+  rv["slow_compression_estimated_data_size"] =
+      slow_compression_estimated_data_size;
+  rv["fast_compression_estimated_data_size"] =
+      fast_compression_estimated_data_size;
+  return rv;
+}
+
+// WARNING: manual update to this function is needed
+// whenever a new string property is added to TableProperties
+// to reduce approximation error.
+//
+// TODO: eliminate the need of manually updating this function
+// for new string properties
+std::size_t TableProperties::ApproximateMemoryUsage() const {
+  std::size_t usage = 0;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size((void*)this);
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+
+  std::size_t string_props_mem_usage =
+      db_id.size() + db_session_id.size() + db_host_id.size() +
+      column_family_name.size() + filter_policy_name.size() +
+      comparator_name.size() + merge_operator_name.size() +
+      prefix_extractor_name.size() + property_collectors_names.size() +
+      compression_name.size() + compression_options.size();
+  usage += string_props_mem_usage;
+
+  for (auto iter = user_collected_properties.begin();
+       iter != user_collected_properties.end(); ++iter) {
+    usage += (iter->first.size() + iter->second.size());
+  }
+
+  return usage;
+}
+
+const std::string TablePropertiesNames::kDbId = "rocksdb.creating.db.identity";
+const std::string TablePropertiesNames::kDbSessionId =
+    "rocksdb.creating.session.identity";
+const std::string TablePropertiesNames::kDbHostId =
+    "rocksdb.creating.host.identity";
+const std::string TablePropertiesNames::kOriginalFileNumber =
+    "rocksdb.original.file.number";
+const std::string TablePropertiesNames::kDataSize = "rocksdb.data.size";
+const std::string TablePropertiesNames::kIndexSize = "rocksdb.index.size";
+const std::string TablePropertiesNames::kIndexPartitions =
+    "rocksdb.index.partitions";
+const std::string TablePropertiesNames::kTopLevelIndexSize =
+    "rocksdb.top-level.index.size";
+const std::string TablePropertiesNames::kIndexKeyIsUserKey =
+    "rocksdb.index.key.is.user.key";
+const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded =
+    "rocksdb.index.value.is.delta.encoded";
+const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size";
+const std::string TablePropertiesNames::kRawKeySize = "rocksdb.raw.key.size";
+const std::string TablePropertiesNames::kRawValueSize =
+    "rocksdb.raw.value.size";
+const std::string TablePropertiesNames::kNumDataBlocks =
+    "rocksdb.num.data.blocks";
+const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries";
+const std::string TablePropertiesNames::kNumFilterEntries =
+    "rocksdb.num.filter_entries";
+const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys";
+const std::string TablePropertiesNames::kMergeOperands =
+    "rocksdb.merge.operands";
+const std::string TablePropertiesNames::kNumRangeDeletions =
+    "rocksdb.num.range-deletions";
+const std::string TablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy";
+const std::string TablePropertiesNames::kFormatVersion =
+    "rocksdb.format.version";
+const std::string TablePropertiesNames::kFixedKeyLen =
+    "rocksdb.fixed.key.length";
+const std::string TablePropertiesNames::kColumnFamilyId =
+    "rocksdb.column.family.id";
+const std::string TablePropertiesNames::kColumnFamilyName =
+    "rocksdb.column.family.name";
+const std::string TablePropertiesNames::kComparator = "rocksdb.comparator";
+const std::string TablePropertiesNames::kMergeOperator =
+    "rocksdb.merge.operator";
+const std::string TablePropertiesNames::kPrefixExtractorName =
+    "rocksdb.prefix.extractor.name";
+const std::string TablePropertiesNames::kPropertyCollectors =
+    "rocksdb.property.collectors";
+const std::string TablePropertiesNames::kCompression = "rocksdb.compression";
+const std::string TablePropertiesNames::kCompressionOptions =
+    "rocksdb.compression_options";
+const std::string TablePropertiesNames::kCreationTime = "rocksdb.creation.time";
+const std::string TablePropertiesNames::kOldestKeyTime =
+    "rocksdb.oldest.key.time";
+const std::string TablePropertiesNames::kFileCreationTime =
+    "rocksdb.file.creation.time";
+const std::string TablePropertiesNames::kSlowCompressionEstimatedDataSize =
+    "rocksdb.sample_for_compression.slow.data.size";
+const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize =
+    "rocksdb.sample_for_compression.fast.data.size";
+const std::string TablePropertiesNames::kSequenceNumberTimeMapping =
+    "rocksdb.seqno.time.map";
+
+#ifndef NDEBUG
+// WARNING: TEST_SetRandomTableProperties assumes the following layout of
+// TableProperties
+//
+// struct TableProperties {
+//    int64_t orig_file_number = 0;
+//    ...
+//    ... int64_t properties only
+//    ...
+//    std::string db_id;
+//    ...
+//    ... std::string properties only
+//    ...
+//    std::string compression_options;
+//    UserCollectedProperties user_collected_properties;
+//    ...
+//    ... Other extra properties: non-int64_t/non-std::string properties only
+//    ...
+// }
+void TEST_SetRandomTableProperties(TableProperties* props) {
+  Random* r = Random::GetTLSInstance();
+  uint64_t* pu = &props->orig_file_number;
+  assert(static_cast<void*>(pu) == static_cast<void*>(props));
+  std::string* ps = &props->db_id;
+  const uint64_t* const pu_end = reinterpret_cast<const uint64_t*>(ps);
+  // Use the last string property's address instead of
+  // the first extra property (e.g `user_collected_properties`)'s address
+  // in the for-loop to avoid advancing pointer to pointing to
+  // potential non-zero padding bytes between these two addresses due to
+  // user_collected_properties's alignment requirement
+  const std::string* const ps_end_inclusive = &props->compression_options;
+
+  for (; pu < pu_end; ++pu) {
+    *pu = r->Next64();
+  }
+  assert(static_cast<void*>(pu) == static_cast<void*>(ps));
+  for (; ps <= ps_end_inclusive; ++ps) {
+    *ps = r->RandomBinaryString(13);
+  }
+}
+#endif
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_properties_internal.h b/src/rocksdb/table/table_properties_internal.h
new file mode 100644
index 000000000..5c2a0cb9a
--- /dev/null
+++ b/src/rocksdb/table/table_properties_internal.h
@@ -0,0 +1,14 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef NDEBUG
+void TEST_SetRandomTableProperties(TableProperties* props);
+#endif
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_reader.h b/src/rocksdb/table/table_reader.h
new file mode 100644
index 000000000..391072eec
--- /dev/null
+++ b/src/rocksdb/table/table_reader.h
@@ -0,0 +1,184 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+
+#include "db/range_tombstone_fragmenter.h"
+#if USE_COROUTINES
+#include "folly/experimental/coro/Coroutine.h"
+#include "folly/experimental/coro/Task.h"
+#endif
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table_reader_caller.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/multiget_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Iterator;
+struct ParsedInternalKey;
+class Slice;
+class Arena;
+struct ReadOptions;
+struct TableProperties;
+class GetContext;
+class MultiGetContext;
+
+// A Table (also referred to as SST) is a sorted map from strings to strings.
+// Tables are immutable and persistent.  A Table may be safely accessed from
+// multiple threads without external synchronization. Table readers are used
+// for reading various types of table formats supported by rocksdb including
+// BlockBasedTable, PlainTable and CuckooTable format.
+class TableReader {
+ public:
+  virtual ~TableReader() {}
+
+  // Returns a new iterator over the table contents.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // read_options: Must outlive the returned iterator.
+  // arena: If not null, the arena needs to be used to allocate the Iterator.
+  //        When destroying the iterator, the caller will not call "delete"
+  //        but Iterator::~Iterator() directly. The destructor needs to destroy
+  //        all the states but those allocated in arena.
+  // skip_filters: disables checking the bloom filters even if they exist. This
+  //               option is effective only for block-based table format.
+  // compaction_readahead_size: its value will only be used if caller =
+  // kCompaction
+  virtual InternalIterator* NewIterator(
+      const ReadOptions& read_options, const SliceTransform* prefix_extractor,
+      Arena* arena, bool skip_filters, TableReaderCaller caller,
+      size_t compaction_readahead_size = 0,
+      bool allow_unprepared_value = false) = 0;
+
+  virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+      const ReadOptions& /*read_options*/) {
+    return nullptr;
+  }
+
+  // Given a key, return an approximate byte offset in the file where
+  // the data for that key begins (or would begin if the key were
+  // present in the file).  The returned value is in terms of file
+  // bytes, and so includes effects like compression of the underlying data.
+  // E.g., the approximate offset of the last key in the table will
+  // be close to the file length.
+  // TODO(peterd): Since this function is only used for approximate size
+  // from beginning of file, reduce code duplication by removing this
+  // function and letting ApproximateSize take optional start and end, so
+  // that absolute start and end can be specified and optimized without
+  // key / index work.
+  virtual uint64_t ApproximateOffsetOf(const Slice& key,
+                                       TableReaderCaller caller) = 0;
+
+  // Given start and end keys, return the approximate data size in the file
+  // between the keys. The returned value is in terms of file bytes, and so
+  // includes effects like compression of the underlying data and applicable
+  // portions of metadata including filters and indexes. Nullptr for start or
+  // end (or both) indicates absolute start or end of the table.
+  virtual uint64_t ApproximateSize(const Slice& start, const Slice& end,
+                                   TableReaderCaller caller) = 0;
+
+  struct Anchor {
+    Anchor(const Slice& _user_key, size_t _range_size)
+        : user_key(_user_key.ToStringView()), range_size(_range_size) {}
+    std::string user_key;
+    size_t range_size;
+  };
+
+  // Now try to return approximately 128 anchor keys.
+  // The last one tends to be the largest key.
+  virtual Status ApproximateKeyAnchors(const ReadOptions& /*read_options*/,
+                                       std::vector<Anchor>& /*anchors*/) {
+    return Status::NotSupported("ApproximateKeyAnchors() not supported.");
+  }
+
+  // Set up the table for Compaction. Might change some parameters with
+  // posix_fadvise
+  virtual void SetupForCompaction() = 0;
+
+  virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
+
+  // Prepare work that can be done before the real Get()
+  virtual void Prepare(const Slice& /*target*/) {}
+
+  // Report an approximation of how much memory has been used.
+  virtual size_t ApproximateMemoryUsage() const = 0;
+
+  // Calls get_context->SaveValue() repeatedly, starting with
+  // the entry found after a call to Seek(key), until it returns false.
+  // May not make such a call if filter policy says that key is not present.
+  //
+  // get_context->MarkKeyMayExist needs to be called when it is configured to be
+  // memory only and the key is not found in the block cache.
+  //
+  // readOptions is the options for the read
+  // key is the key to search for
+  // skip_filters: disables checking the bloom filters even if they exist. This
+  //               option is effective only for block-based table format.
+  virtual Status Get(const ReadOptions& readOptions, const Slice& key,
+                     GetContext* get_context,
+                     const SliceTransform* prefix_extractor,
+                     bool skip_filters = false) = 0;
+
+  // Use bloom filters in the table file, if present, to filter out keys. The
+  // mget_range will be updated to skip keys that get a negative result from
+  // the filter lookup.
+  virtual Status MultiGetFilter(const ReadOptions& /*readOptions*/,
+                                const SliceTransform* /*prefix_extractor*/,
+                                MultiGetContext::Range* /*mget_range*/) {
+    return Status::NotSupported();
+  }
+
+  virtual void MultiGet(const ReadOptions& readOptions,
+                        const MultiGetContext::Range* mget_range,
+                        const SliceTransform* prefix_extractor,
+                        bool skip_filters = false) {
+    for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) {
+      *iter->s = Get(readOptions, iter->ikey, iter->get_context,
+                     prefix_extractor, skip_filters);
+    }
+  }
+
+#if USE_COROUTINES
+  virtual folly::coro::Task<void> MultiGetCoroutine(
+      const ReadOptions& readOptions, const MultiGetContext::Range* mget_range,
+      const SliceTransform* prefix_extractor, bool skip_filters = false) {
+    MultiGet(readOptions, mget_range, prefix_extractor, skip_filters);
+    co_return;
+  }
+#endif  // USE_COROUTINES
+
+  // Prefetch data corresponding to a give range of keys
+  // Typically this functionality is required for table implementations that
+  // persists the data on a non volatile storage medium like disk/SSD
+  virtual Status Prefetch(const Slice* begin = nullptr,
+                          const Slice* end = nullptr) {
+    (void)begin;
+    (void)end;
+    // Default implementation is NOOP.
+    // The child class should implement functionality when applicable
+    return Status::OK();
+  }
+
+  // convert db file to a human readable form
+  virtual Status DumpTable(WritableFile* /*out_file*/) {
+    return Status::NotSupported("DumpTable() not supported");
+  }
+
+  // check whether there is corruption in this db file
+  virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
+                                TableReaderCaller /*caller*/) {
+    return Status::NotSupported("VerifyChecksum() not supported");
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/table_reader_bench.cc b/src/rocksdb/table/table_reader_bench.cc
new file mode 100644
index 000000000..b13caf68d
--- /dev/null
+++ b/src/rocksdb/table/table_reader_bench.cc
@@ -0,0 +1,349 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "file/random_access_file_reader.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+// Make a key that i determines the first 4 characters and j determines the
+// last 4 characters.
+static std::string MakeKey(int i, int j, bool through_db) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "%04d__key___%04d", i, j);
+  if (through_db) {
+    return std::string(buf);
+  }
+  // If we directly query table, which operates on internal keys
+  // instead of user keys, we need to add 8 bytes of internal
+  // information (row type etc) to user key to make an internal
+  // key.
+  InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+  return key.Encode().ToString();
+}
+
+uint64_t Now(SystemClock* clock, bool measured_by_nanosecond) {
+  return measured_by_nanosecond ? clock->NowNanos() : clock->NowMicros();
+}
+}  // namespace
+
+// A very simple benchmark that.
+// Create a table with roughly numKey1 * numKey2 keys,
+// where there are numKey1 prefixes of the key, each has numKey2 number of
+// distinguished key, differing in the suffix part.
+// If if_query_empty_keys = false, query the existing keys numKey1 * numKey2
+// times randomly.
+// If if_query_empty_keys = true, query numKey1 * numKey2 random empty keys.
+// Print out the total time.
+// If through_db=true, a full DB will be created and queries will be against
+// it. Otherwise, operations will be directly through table level.
+//
+// If for_terator=true, instead of just query one key each time, it queries
+// a range sharing the same prefix.
+namespace {
+void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
+                          ReadOptions& read_options, int num_keys1,
+                          int num_keys2, int num_iter, int /*prefix_len*/,
+                          bool if_query_empty_keys, bool for_iterator,
+                          bool through_db, bool measured_by_nanosecond) {
+  ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
+
+  std::string file_name =
+      test::PerThreadDBPath("rocksdb_table_reader_benchmark");
+  std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db");
+  WriteOptions wo;
+  Env* env = Env::Default();
+  auto* clock = env->GetSystemClock().get();
+  TableBuilder* tb = nullptr;
+  DB* db = nullptr;
+  Status s;
+  const ImmutableOptions ioptions(opts);
+  const ColumnFamilyOptions cfo(opts);
+  const MutableCFOptions moptions(cfo);
+  std::unique_ptr<WritableFileWriter> file_writer;
+  if (!through_db) {
+    ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), file_name,
+                                         FileOptions(env_options), &file_writer,
+                                         nullptr));
+
+    IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+    int unknown_level = -1;
+    tb = opts.table_factory->NewTableBuilder(
+        TableBuilderOptions(
+            ioptions, moptions, ikc, &int_tbl_prop_collector_factories,
+            CompressionType::kNoCompression, CompressionOptions(),
+            0 /* column_family_id */, kDefaultColumnFamilyName, unknown_level),
+        file_writer.get());
+  } else {
+    s = DB::Open(opts, dbname, &db);
+    ASSERT_OK(s);
+    ASSERT_TRUE(db != nullptr);
+  }
+  // Populate slightly more than 1M keys
+  for (int i = 0; i < num_keys1; i++) {
+    for (int j = 0; j < num_keys2; j++) {
+      std::string key = MakeKey(i * 2, j, through_db);
+      if (!through_db) {
+        tb->Add(key, key);
+      } else {
+        db->Put(wo, key, key);
+      }
+    }
+  }
+  if (!through_db) {
+    tb->Finish();
+    file_writer->Close();
+  } else {
+    db->Flush(FlushOptions());
+  }
+
+  std::unique_ptr<TableReader> table_reader;
+  if (!through_db) {
+    const auto& fs = env->GetFileSystem();
+    FileOptions fopts(env_options);
+
+    std::unique_ptr<FSRandomAccessFile> raf;
+    s = fs->NewRandomAccessFile(file_name, fopts, &raf, nullptr);
+    if (!s.ok()) {
+      fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+    uint64_t file_size;
+    fs->GetFileSize(file_name, fopts.io_options, &file_size, nullptr);
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(raf), file_name));
+    s = opts.table_factory->NewTableReader(
+        TableReaderOptions(ioptions, moptions.prefix_extractor, env_options,
+                           ikc),
+        std::move(file_reader), file_size, &table_reader);
+    if (!s.ok()) {
+      fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  Random rnd(301);
+  std::string result;
+  HistogramImpl hist;
+
+  for (int it = 0; it < num_iter; it++) {
+    for (int i = 0; i < num_keys1; i++) {
+      for (int j = 0; j < num_keys2; j++) {
+        int r1 = rnd.Uniform(num_keys1) * 2;
+        int r2 = rnd.Uniform(num_keys2);
+        if (if_query_empty_keys) {
+          r1++;
+          r2 = num_keys2 * 2 - r2;
+        }
+
+        if (!for_iterator) {
+          // Query one existing key;
+          std::string key = MakeKey(r1, r2, through_db);
+          uint64_t start_time = Now(clock, measured_by_nanosecond);
+          if (!through_db) {
+            PinnableSlice value;
+            MergeContext merge_context;
+            SequenceNumber max_covering_tombstone_seq = 0;
+            GetContext get_context(
+                ioptions.user_comparator, ioptions.merge_operator.get(),
+                ioptions.logger, ioptions.stats, GetContext::kNotFound,
+                Slice(key), &value, /*columns=*/nullptr, /*timestamp=*/nullptr,
+                &merge_context, true, &max_covering_tombstone_seq, clock);
+            s = table_reader->Get(read_options, key, &get_context, nullptr);
+          } else {
+            s = db->Get(read_options, key, &result);
+          }
+          hist.Add(Now(clock, measured_by_nanosecond) - start_time);
+        } else {
+          int r2_len;
+          if (if_query_empty_keys) {
+            r2_len = 0;
+          } else {
+            r2_len = rnd.Uniform(num_keys2) + 1;
+            if (r2_len + r2 > num_keys2) {
+              r2_len = num_keys2 - r2;
+            }
+          }
+          std::string start_key = MakeKey(r1, r2, through_db);
+          std::string end_key = MakeKey(r1, r2 + r2_len, through_db);
+          uint64_t total_time = 0;
+          uint64_t start_time = Now(clock, measured_by_nanosecond);
+          Iterator* iter = nullptr;
+          InternalIterator* iiter = nullptr;
+          if (!through_db) {
+            iiter = table_reader->NewIterator(
+                read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+                /*skip_filters=*/false, TableReaderCaller::kUncategorized);
+          } else {
+            iter = db->NewIterator(read_options);
+          }
+          int count = 0;
+          for (through_db ? iter->Seek(start_key) : iiter->Seek(start_key);
+               through_db ? iter->Valid() : iiter->Valid();
+               through_db ? iter->Next() : iiter->Next()) {
+            if (if_query_empty_keys) {
+              break;
+            }
+            // verify key;
+            total_time += Now(clock, measured_by_nanosecond) - start_time;
+            assert(Slice(MakeKey(r1, r2 + count, through_db)) ==
+                   (through_db ? iter->key() : iiter->key()));
+            start_time = Now(clock, measured_by_nanosecond);
+            if (++count >= r2_len) {
+              break;
+            }
+          }
+          if (count != r2_len) {
+            fprintf(stderr,
+                    "Iterator cannot iterate expected number of entries. "
+                    "Expected %d but got %d\n",
+                    r2_len, count);
+            assert(false);
+          }
+          delete iter;
+          total_time += Now(clock, measured_by_nanosecond) - start_time;
+          hist.Add(total_time);
+        }
+      }
+    }
+  }
+
+  fprintf(
+      stderr,
+      "==================================================="
+      "====================================================\n"
+      "InMemoryTableSimpleBenchmark: %20s   num_key1:  %5d   "
+      "num_key2: %5d  %10s\n"
+      "==================================================="
+      "===================================================="
+      "\nHistogram (unit: %s): \n%s",
+      opts.table_factory->Name(), num_keys1, num_keys2,
+      for_iterator ? "iterator" : (if_query_empty_keys ? "empty" : "non_empty"),
+      measured_by_nanosecond ? "nanosecond" : "microsecond",
+      hist.ToString().c_str());
+  if (!through_db) {
+    env->DeleteFile(file_name);
+  } else {
+    delete db;
+    db = nullptr;
+    DestroyDB(dbname, opts);
+  }
+}
+}  // namespace
+}  // namespace ROCKSDB_NAMESPACE
+
+DEFINE_bool(query_empty, false,
+            "query non-existing keys instead of existing ones.");
+DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys");
+DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix");
+DEFINE_int32(iter, 3, "query non-existing keys instead of existing ones");
+DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes");
+DEFINE_bool(iterator, false, "For test iterator");
+DEFINE_bool(through_db, false,
+            "If enable, a DB instance will be created and the query will be "
+            "against DB. Otherwise, will be directly against a table reader.");
+DEFINE_bool(mmap_read, true, "Whether use mmap read");
+DEFINE_string(table_factory, "block_based",
+              "Table factory to use: `block_based` (default), `plain_table` or "
+              "`cuckoo_hash`.");
+DEFINE_string(time_unit, "microsecond",
+              "The time unit used for measuring performance. User can specify "
+              "`microsecond` (default) or `nanosecond`");
+
+int main(int argc, char** argv) {
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  std::shared_ptr<ROCKSDB_NAMESPACE::TableFactory> tf;
+  ROCKSDB_NAMESPACE::Options options;
+  if (FLAGS_prefix_len < 16) {
+    options.prefix_extractor.reset(
+        ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_len));
+  }
+  ROCKSDB_NAMESPACE::ReadOptions ro;
+  ROCKSDB_NAMESPACE::EnvOptions env_options;
+  options.create_if_missing = true;
+  options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
+
+  if (FLAGS_table_factory == "cuckoo_hash") {
+#ifndef ROCKSDB_LITE
+    options.allow_mmap_reads = FLAGS_mmap_read;
+    env_options.use_mmap_reads = FLAGS_mmap_read;
+    ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
+    table_options.hash_table_ratio = 0.75;
+    tf.reset(ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options));
+#else
+    fprintf(stderr, "Plain table is not supported in lite mode\n");
+    exit(1);
+#endif  // ROCKSDB_LITE
+  } else if (FLAGS_table_factory == "plain_table") {
+#ifndef ROCKSDB_LITE
+    options.allow_mmap_reads = FLAGS_mmap_read;
+    env_options.use_mmap_reads = FLAGS_mmap_read;
+
+    ROCKSDB_NAMESPACE::PlainTableOptions plain_table_options;
+    plain_table_options.user_key_len = 16;
+    plain_table_options.bloom_bits_per_key = (FLAGS_prefix_len == 16) ? 0 : 8;
+    plain_table_options.hash_table_ratio = 0.75;
+
+    tf.reset(new ROCKSDB_NAMESPACE::PlainTableFactory(plain_table_options));
+    options.prefix_extractor.reset(
+        ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_len));
+#else
+    fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
+    exit(1);
+#endif  // ROCKSDB_LITE
+  } else if (FLAGS_table_factory == "block_based") {
+    tf.reset(new ROCKSDB_NAMESPACE::BlockBasedTableFactory());
+  } else {
+    fprintf(stderr, "Invalid table type %s\n", FLAGS_table_factory.c_str());
+  }
+
+  if (tf) {
+    // if user provides invalid options, just fall back to microsecond.
+    bool measured_by_nanosecond = FLAGS_time_unit == "nanosecond";
+
+    options.table_factory = tf;
+    ROCKSDB_NAMESPACE::TableReaderBenchmark(
+        options, env_options, ro, FLAGS_num_keys1, FLAGS_num_keys2, FLAGS_iter,
+        FLAGS_prefix_len, FLAGS_query_empty, FLAGS_iterator, FLAGS_through_db,
+        measured_by_nanosecond);
+  } else {
+    return 1;
+  }
+
+  return 0;
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/table/table_test.cc b/src/rocksdb/table/table_test.cc
new file mode 100644
index 000000000..af9c177e8
--- /dev/null
+++ b/src/rocksdb/table/table_test.cc
@@ -0,0 +1,5596 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/table.h"
+
+#include <gtest/gtest.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "cache/lru_cache.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "memtable/stl_wrappers.h"
+#include "monitoring/statistics.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/unique_id.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/flush_block_policy.h"
+#include "table/block_fetcher.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding_lean.h"
+#include "util/compression.h"
+#include "util/file_checksum_helper.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/memory_allocators.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyPlainTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+
+namespace {
+
+const std::string kDummyValue(10000, 'o');
+
+// DummyPropertiesCollector used to test BlockBasedTableProperties
+class DummyPropertiesCollector : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "DummyPropertiesCollector"; }
+
+  Status Finish(UserCollectedProperties* /*properties*/) override {
+    return Status::OK();
+  }
+
+  Status Add(const Slice& /*user_key*/, const Slice& /*value*/) override {
+    return Status::OK();
+  }
+
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
+};
+
+class DummyPropertiesCollectorFactory1
+    : public TablePropertiesCollectorFactory {
+ public:
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    return new DummyPropertiesCollector();
+  }
+  const char* Name() const override {
+    return "DummyPropertiesCollectorFactory1";
+  }
+};
+
+class DummyPropertiesCollectorFactory2
+    : public TablePropertiesCollectorFactory {
+ public:
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
+    return new DummyPropertiesCollector();
+  }
+  const char* Name() const override {
+    return "DummyPropertiesCollectorFactory2";
+  }
+};
+
+// Return reverse of "key".
+// Used to test non-lexicographic comparators.
+std::string Reverse(const Slice& key) {
+  auto rev = key.ToString();
+  std::reverse(rev.begin(), rev.end());
+  return rev;
+}
+
+class ReverseKeyComparator : public Comparator {
+ public:
+  const char* Name() const override {
+    return "rocksdb.ReverseBytewiseComparator";
+  }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
+  }
+
+  void FindShortestSeparator(std::string* start,
+                             const Slice& limit) const override {
+    std::string s = Reverse(*start);
+    std::string l = Reverse(limit);
+    BytewiseComparator()->FindShortestSeparator(&s, l);
+    *start = Reverse(s);
+  }
+
+  void FindShortSuccessor(std::string* key) const override {
+    std::string s = Reverse(*key);
+    BytewiseComparator()->FindShortSuccessor(&s);
+    *key = Reverse(s);
+  }
+};
+
+ReverseKeyComparator reverse_key_comparator;
+
+void Increment(const Comparator* cmp, std::string* key) {
+  if (cmp == BytewiseComparator()) {
+    key->push_back('\0');
+  } else {
+    assert(cmp == &reverse_key_comparator);
+    std::string rev = Reverse(*key);
+    rev.push_back('\0');
+    *key = Reverse(rev);
+  }
+}
+
+const auto kUnknownColumnFamily =
+    TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+
+}  // namespace
+
+// Helper class for tests to unify the interface between
+// BlockBuilder/TableBuilder and Block/Table.
+class Constructor {
+ public:
+  explicit Constructor(const Comparator* cmp)
+      : data_(stl_wrappers::LessOfComparator(cmp)) {}
+  virtual ~Constructor() {}
+
+  void Add(const std::string& key, const Slice& value) {
+    data_[key] = value.ToString();
+  }
+
+  // Finish constructing the data structure with all the keys that have
+  // been added so far.  Returns the keys in sorted order in "*keys"
+  // and stores the key/value pairs in "*kvmap"
+  void Finish(const Options& options, const ImmutableOptions& ioptions,
+              const MutableCFOptions& moptions,
+              const BlockBasedTableOptions& table_options,
+              const InternalKeyComparator& internal_comparator,
+              std::vector<std::string>* keys, stl_wrappers::KVMap* kvmap) {
+    last_internal_comparator_ = &internal_comparator;
+    *kvmap = data_;
+    keys->clear();
+    for (const auto& kv : data_) {
+      keys->push_back(kv.first);
+    }
+    data_.clear();
+    Status s = FinishImpl(options, ioptions, moptions, table_options,
+                          internal_comparator, *kvmap);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  // Construct the data structure from the data in "data"
+  virtual Status FinishImpl(const Options& options,
+                            const ImmutableOptions& ioptions,
+                            const MutableCFOptions& moptions,
+                            const BlockBasedTableOptions& table_options,
+                            const InternalKeyComparator& internal_comparator,
+                            const stl_wrappers::KVMap& data) = 0;
+
+  virtual InternalIterator* NewIterator(
+      const SliceTransform* prefix_extractor = nullptr) const = 0;
+
+  virtual const stl_wrappers::KVMap& data() { return data_; }
+
+  virtual bool IsArenaMode() const { return false; }
+
+  virtual DB* db() const { return nullptr; }  // Overridden in DBConstructor
+
+  virtual bool AnywayDeleteIterator() const { return false; }
+
+ protected:
+  const InternalKeyComparator* last_internal_comparator_;
+
+ private:
+  stl_wrappers::KVMap data_;
+};
+
+// A helper class that converts internal format keys into user keys
+class KeyConvertingIterator : public InternalIterator {
+ public:
+  explicit KeyConvertingIterator(InternalIterator* iter,
+                                 bool arena_mode = false)
+      : iter_(iter), arena_mode_(arena_mode) {}
+  ~KeyConvertingIterator() override {
+    if (arena_mode_) {
+      iter_->~InternalIterator();
+    } else {
+      delete iter_;
+    }
+  }
+  bool Valid() const override { return iter_->Valid() && status_.ok(); }
+  void Seek(const Slice& target) override {
+    ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
+    std::string encoded;
+    AppendInternalKey(&encoded, ikey);
+    iter_->Seek(encoded);
+  }
+  void SeekForPrev(const Slice& target) override {
+    ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
+    std::string encoded;
+    AppendInternalKey(&encoded, ikey);
+    iter_->SeekForPrev(encoded);
+  }
+  void SeekToFirst() override { iter_->SeekToFirst(); }
+  void SeekToLast() override { iter_->SeekToLast(); }
+  void Next() override { iter_->Next(); }
+  void Prev() override { iter_->Prev(); }
+  IterBoundCheck UpperBoundCheckResult() override {
+    return iter_->UpperBoundCheckResult();
+  }
+
+  Slice key() const override {
+    assert(Valid());
+    ParsedInternalKey parsed_key;
+    Status pik_status =
+        ParseInternalKey(iter_->key(), &parsed_key, true /* log_err_key */);
+    if (!pik_status.ok()) {
+      status_ = pik_status;
+      return Slice(status_.getState());
+    }
+    return parsed_key.user_key;
+  }
+
+  Slice value() const override { return iter_->value(); }
+  Status status() const override {
+    return status_.ok() ? iter_->status() : status_;
+  }
+
+ private:
+  mutable Status status_;
+  InternalIterator* iter_;
+  bool arena_mode_;
+
+  // No copying allowed
+  KeyConvertingIterator(const KeyConvertingIterator&);
+  void operator=(const KeyConvertingIterator&);
+};
+
+// `BlockConstructor` APIs always accept/return user keys.
+class BlockConstructor : public Constructor {
+ public:
+  explicit BlockConstructor(const Comparator* cmp)
+      : Constructor(cmp), comparator_(cmp), block_(nullptr) {}
+  ~BlockConstructor() override { delete block_; }
+  Status FinishImpl(const Options& /*options*/,
+                    const ImmutableOptions& /*ioptions*/,
+                    const MutableCFOptions& /*moptions*/,
+                    const BlockBasedTableOptions& table_options,
+                    const InternalKeyComparator& /*internal_comparator*/,
+                    const stl_wrappers::KVMap& kv_map) override {
+    delete block_;
+    block_ = nullptr;
+    BlockBuilder builder(table_options.block_restart_interval);
+
+    for (const auto& kv : kv_map) {
+      // `DataBlockIter` assumes it reads only internal keys. `BlockConstructor`
+      // clients provide user keys, so we need to convert to internal key format
+      // before writing the data block.
+      ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
+      std::string encoded;
+      AppendInternalKey(&encoded, ikey);
+      builder.Add(encoded, kv.second);
+    }
+    // Open the block
+    data_ = builder.Finish().ToString();
+    BlockContents contents;
+    contents.data = data_;
+    block_ = new Block(std::move(contents));
+    return Status::OK();
+  }
+  InternalIterator* NewIterator(
+      const SliceTransform* /*prefix_extractor*/) const override {
+    // `DataBlockIter` returns the internal keys it reads.
+    // `KeyConvertingIterator` converts them to user keys before they are
+    // exposed to the `BlockConstructor` clients.
+    return new KeyConvertingIterator(
+        block_->NewDataIterator(comparator_, kDisableGlobalSequenceNumber));
+  }
+
+ private:
+  const Comparator* comparator_;
+  std::string data_;
+  Block* block_;
+
+  BlockConstructor();
+};
+
+class TableConstructor : public Constructor {
+ public:
+  explicit TableConstructor(const Comparator* cmp,
+                            bool convert_to_internal_key = false,
+                            int level = -1, SequenceNumber largest_seqno = 0)
+      : Constructor(cmp),
+        largest_seqno_(largest_seqno),
+        convert_to_internal_key_(convert_to_internal_key),
+        level_(level) {
+    env_ = ROCKSDB_NAMESPACE::Env::Default();
+  }
+  ~TableConstructor() override { Reset(); }
+
+  Status FinishImpl(const Options& options, const ImmutableOptions& ioptions,
+                    const MutableCFOptions& moptions,
+                    const BlockBasedTableOptions& /*table_options*/,
+                    const InternalKeyComparator& internal_comparator,
+                    const stl_wrappers::KVMap& kv_map) override {
+    Reset();
+    soptions.use_mmap_reads = ioptions.allow_mmap_reads;
+    std::unique_ptr<FSWritableFile> sink(new test::StringSink());
+    file_writer_.reset(new WritableFileWriter(
+        std::move(sink), "" /* don't care */, FileOptions()));
+    std::unique_ptr<TableBuilder> builder;
+    IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+    if (largest_seqno_ != 0) {
+      // Pretend that it's an external file written by SstFileWriter.
+      int_tbl_prop_collector_factories.emplace_back(
+          new SstFileWriterPropertiesCollectorFactory(2 /* version */,
+                                                      0 /* global_seqno*/));
+    }
+
+    std::string column_family_name;
+    builder.reset(ioptions.table_factory->NewTableBuilder(
+        TableBuilderOptions(ioptions, moptions, internal_comparator,
+                            &int_tbl_prop_collector_factories,
+                            options.compression, options.compression_opts,
+                            kUnknownColumnFamily, column_family_name, level_),
+        file_writer_.get()));
+
+    for (const auto& kv : kv_map) {
+      if (convert_to_internal_key_) {
+        ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
+        std::string encoded;
+        AppendInternalKey(&encoded, ikey);
+        builder->Add(encoded, kv.second);
+      } else {
+        builder->Add(kv.first, kv.second);
+      }
+      EXPECT_OK(builder->status());
+    }
+    Status s = builder->Finish();
+    EXPECT_OK(file_writer_->Flush());
+    EXPECT_TRUE(s.ok()) << s.ToString();
+
+    EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize());
+
+    // Open the table
+    file_num_ = cur_file_num_++;
+
+    return Reopen(ioptions, moptions);
+  }
+
+  InternalIterator* NewIterator(
+      const SliceTransform* prefix_extractor) const override {
+    InternalIterator* iter = table_reader_->NewIterator(
+        read_options_, prefix_extractor, /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized);
+    if (convert_to_internal_key_) {
+      return new KeyConvertingIterator(iter);
+    } else {
+      return iter;
+    }
+  }
+
+  uint64_t ApproximateOffsetOf(const Slice& key) const {
+    if (convert_to_internal_key_) {
+      InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+      const Slice skey = ikey.Encode();
+      return table_reader_->ApproximateOffsetOf(
+          skey, TableReaderCaller::kUncategorized);
+    }
+    return table_reader_->ApproximateOffsetOf(
+        key, TableReaderCaller::kUncategorized);
+  }
+
+  virtual Status Reopen(const ImmutableOptions& ioptions,
+                        const MutableCFOptions& moptions) {
+    std::unique_ptr<FSRandomAccessFile> source(new test::StringSource(
+        TEST_GetSink()->contents(), file_num_, ioptions.allow_mmap_reads));
+
+    file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
+    return ioptions.table_factory->NewTableReader(
+        TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
+                           *last_internal_comparator_, /*skip_filters*/ false,
+                           /*immortal*/ false, false, level_,
+                           &block_cache_tracer_, moptions.write_buffer_size, "",
+                           file_num_, kNullUniqueId64x2, largest_seqno_),
+        std::move(file_reader_), TEST_GetSink()->contents().size(),
+        &table_reader_);
+  }
+
+  virtual TableReader* GetTableReader() { return table_reader_.get(); }
+
+  bool AnywayDeleteIterator() const override {
+    return convert_to_internal_key_;
+  }
+
+  void ResetTableReader() { table_reader_.reset(); }
+
+  bool ConvertToInternalKey() { return convert_to_internal_key_; }
+
+  test::StringSink* TEST_GetSink() {
+    return static_cast<test::StringSink*>(file_writer_->writable_file());
+  }
+
+  BlockCacheTracer block_cache_tracer_;
+
+ private:
+  void Reset() {
+    file_num_ = 0;
+    table_reader_.reset();
+    file_writer_.reset();
+    file_reader_.reset();
+  }
+
+  const ReadOptions read_options_;
+  uint64_t file_num_;
+  std::unique_ptr<WritableFileWriter> file_writer_;
+  std::unique_ptr<RandomAccessFileReader> file_reader_;
+  std::unique_ptr<TableReader> table_reader_;
+  SequenceNumber largest_seqno_;
+  bool convert_to_internal_key_;
+  int level_;
+
+  TableConstructor();
+
+  static uint64_t cur_file_num_;
+  EnvOptions soptions;
+  Env* env_;
+};
+uint64_t TableConstructor::cur_file_num_ = 1;
+
+class MemTableConstructor : public Constructor {
+ public:
+  explicit MemTableConstructor(const Comparator* cmp, WriteBufferManager* wb)
+      : Constructor(cmp),
+        internal_comparator_(cmp),
+        write_buffer_manager_(wb),
+        table_factory_(new SkipListFactory) {
+    options_.memtable_factory = table_factory_;
+    ImmutableOptions ioptions(options_);
+    memtable_ =
+        new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_),
+                     wb, kMaxSequenceNumber, 0 /* column_family_id */);
+    memtable_->Ref();
+  }
+  ~MemTableConstructor() override { delete memtable_->Unref(); }
+  Status FinishImpl(const Options&, const ImmutableOptions& ioptions,
+                    const MutableCFOptions& /*moptions*/,
+                    const BlockBasedTableOptions& /*table_options*/,
+                    const InternalKeyComparator& /*internal_comparator*/,
+                    const stl_wrappers::KVMap& kv_map) override {
+    delete memtable_->Unref();
+    ImmutableOptions mem_ioptions(ioptions);
+    memtable_ = new MemTable(internal_comparator_, mem_ioptions,
+                             MutableCFOptions(options_), write_buffer_manager_,
+                             kMaxSequenceNumber, 0 /* column_family_id */);
+    memtable_->Ref();
+    int seq = 1;
+    for (const auto& kv : kv_map) {
+      Status s = memtable_->Add(seq, kTypeValue, kv.first, kv.second,
+                                nullptr /* kv_prot_info */);
+      if (!s.ok()) {
+        return s;
+      }
+      seq++;
+    }
+    return Status::OK();
+  }
+  InternalIterator* NewIterator(
+      const SliceTransform* /*prefix_extractor*/) const override {
+    return new KeyConvertingIterator(
+        memtable_->NewIterator(ReadOptions(), &arena_), true);
+  }
+
+  bool AnywayDeleteIterator() const override { return true; }
+
+  bool IsArenaMode() const override { return true; }
+
+ private:
+  mutable Arena arena_;
+  InternalKeyComparator internal_comparator_;
+  Options options_;
+  WriteBufferManager* write_buffer_manager_;
+  MemTable* memtable_;
+  std::shared_ptr<SkipListFactory> table_factory_;
+};
+
+class InternalIteratorFromIterator : public InternalIterator {
+ public:
+  explicit InternalIteratorFromIterator(Iterator* it) : it_(it) {}
+  bool Valid() const override { return it_->Valid(); }
+  void Seek(const Slice& target) override { it_->Seek(target); }
+  void SeekForPrev(const Slice& target) override { it_->SeekForPrev(target); }
+  void SeekToFirst() override { it_->SeekToFirst(); }
+  void SeekToLast() override { it_->SeekToLast(); }
+  void Next() override { it_->Next(); }
+  void Prev() override { it_->Prev(); }
+  Slice key() const override { return it_->key(); }
+  Slice value() const override { return it_->value(); }
+  Status status() const override { return it_->status(); }
+
+ private:
+  std::unique_ptr<Iterator> it_;
+};
+
+class DBConstructor : public Constructor {
+ public:
+  explicit DBConstructor(const Comparator* cmp)
+      : Constructor(cmp), comparator_(cmp) {
+    db_ = nullptr;
+    NewDB();
+  }
+  ~DBConstructor() override { delete db_; }
+  Status FinishImpl(const Options& /*options*/,
+                    const ImmutableOptions& /*ioptions*/,
+                    const MutableCFOptions& /*moptions*/,
+                    const BlockBasedTableOptions& /*table_options*/,
+                    const InternalKeyComparator& /*internal_comparator*/,
+                    const stl_wrappers::KVMap& kv_map) override {
+    delete db_;
+    db_ = nullptr;
+    NewDB();
+    for (const auto& kv : kv_map) {
+      WriteBatch batch;
+      EXPECT_OK(batch.Put(kv.first, kv.second));
+      EXPECT_TRUE(db_->Write(WriteOptions(), &batch).ok());
+    }
+    return Status::OK();
+  }
+
+  InternalIterator* NewIterator(
+      const SliceTransform* /*prefix_extractor*/) const override {
+    return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions()));
+  }
+
+  DB* db() const override { return db_; }
+
+ private:
+  void NewDB() {
+    std::string name = test::PerThreadDBPath("table_testdb");
+
+    Options options;
+    options.comparator = comparator_;
+    Status status = DestroyDB(name, options);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+
+    options.create_if_missing = true;
+    options.error_if_exists = true;
+    options.write_buffer_size = 10000;  // Something small to force merging
+    status = DB::Open(options, name, &db_);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+  }
+
+  const Comparator* comparator_;
+  DB* db_;
+};
+
+enum TestType {
+  BLOCK_BASED_TABLE_TEST,
+#ifndef ROCKSDB_LITE
+  PLAIN_TABLE_SEMI_FIXED_PREFIX,
+  PLAIN_TABLE_FULL_STR_PREFIX,
+  PLAIN_TABLE_TOTAL_ORDER,
+#endif  // !ROCKSDB_LITE
+  BLOCK_TEST,
+  MEMTABLE_TEST,
+  DB_TEST
+};
+
+struct TestArgs {
+  TestType type;
+  bool reverse_compare;
+  int restart_interval;
+  CompressionType compression;
+  uint32_t compression_parallel_threads;
+  uint32_t format_version;
+  bool use_mmap;
+};
+
+std::ostream& operator<<(std::ostream& os, const TestArgs& args) {
+  os << "type: " << args.type << " reverse_compare: " << args.reverse_compare
+     << " restart_interval: " << args.restart_interval
+     << " compression: " << args.compression
+     << " compression_parallel_threads: " << args.compression_parallel_threads
+     << " format_version: " << args.format_version
+     << " use_mmap: " << args.use_mmap;
+
+  return os;
+}
+
+static std::vector<TestArgs> GenerateArgList() {
+  std::vector<TestArgs> test_args;
+  std::vector<TestType> test_types = {BLOCK_BASED_TABLE_TEST,
+#ifndef ROCKSDB_LITE
+                                      PLAIN_TABLE_SEMI_FIXED_PREFIX,
+                                      PLAIN_TABLE_FULL_STR_PREFIX,
+                                      PLAIN_TABLE_TOTAL_ORDER,
+#endif  // !ROCKSDB_LITE
+                                      BLOCK_TEST,
+                                      MEMTABLE_TEST,
+                                      DB_TEST};
+  std::vector<bool> reverse_compare_types = {false, true};
+  std::vector<int> restart_intervals = {16, 1, 1024};
+  std::vector<uint32_t> compression_parallel_threads = {1, 4};
+
+  // Only add compression if it is supported
+  std::vector<std::pair<CompressionType, bool>> compression_types;
+  compression_types.emplace_back(kNoCompression, false);
+  if (Snappy_Supported()) {
+    compression_types.emplace_back(kSnappyCompression, false);
+  }
+  if (Zlib_Supported()) {
+    compression_types.emplace_back(kZlibCompression, false);
+    compression_types.emplace_back(kZlibCompression, true);
+  }
+  if (BZip2_Supported()) {
+    compression_types.emplace_back(kBZip2Compression, false);
+    compression_types.emplace_back(kBZip2Compression, true);
+  }
+  if (LZ4_Supported()) {
+    compression_types.emplace_back(kLZ4Compression, false);
+    compression_types.emplace_back(kLZ4Compression, true);
+    compression_types.emplace_back(kLZ4HCCompression, false);
+    compression_types.emplace_back(kLZ4HCCompression, true);
+  }
+  if (XPRESS_Supported()) {
+    compression_types.emplace_back(kXpressCompression, false);
+    compression_types.emplace_back(kXpressCompression, true);
+  }
+  if (ZSTD_Supported()) {
+    compression_types.emplace_back(kZSTD, false);
+    compression_types.emplace_back(kZSTD, true);
+  }
+
+  for (auto test_type : test_types) {
+    for (auto reverse_compare : reverse_compare_types) {
+#ifndef ROCKSDB_LITE
+      if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX ||
+          test_type == PLAIN_TABLE_FULL_STR_PREFIX ||
+          test_type == PLAIN_TABLE_TOTAL_ORDER) {
+        // Plain table doesn't use restart index or compression.
+        TestArgs one_arg;
+        one_arg.type = test_type;
+        one_arg.reverse_compare = reverse_compare;
+        one_arg.restart_interval = restart_intervals[0];
+        one_arg.compression = compression_types[0].first;
+        one_arg.compression_parallel_threads = 1;
+        one_arg.format_version = 0;
+        one_arg.use_mmap = true;
+        test_args.push_back(one_arg);
+        one_arg.use_mmap = false;
+        test_args.push_back(one_arg);
+        continue;
+      }
+#endif  // !ROCKSDB_LITE
+
+      for (auto restart_interval : restart_intervals) {
+        for (auto compression_type : compression_types) {
+          for (auto num_threads : compression_parallel_threads) {
+            TestArgs one_arg;
+            one_arg.type = test_type;
+            one_arg.reverse_compare = reverse_compare;
+            one_arg.restart_interval = restart_interval;
+            one_arg.compression = compression_type.first;
+            one_arg.compression_parallel_threads = num_threads;
+            one_arg.format_version = compression_type.second ? 2 : 1;
+            one_arg.use_mmap = false;
+            test_args.push_back(one_arg);
+          }
+        }
+      }
+    }
+  }
+  return test_args;
+}
+
+// In order to make all tests run for plain table format, including
+// those operating on empty keys, create a new prefix transformer which
+// return fixed prefix if the slice is not shorter than the prefix length,
+// and the full slice if it is shorter.
+class FixedOrLessPrefixTransform : public SliceTransform {
+ private:
+  const size_t prefix_len_;
+
+ public:
+  explicit FixedOrLessPrefixTransform(size_t prefix_len)
+      : prefix_len_(prefix_len) {}
+
+  const char* Name() const override { return "rocksdb.FixedPrefix"; }
+
+  Slice Transform(const Slice& src) const override {
+    assert(InDomain(src));
+    if (src.size() < prefix_len_) {
+      return src;
+    }
+    return Slice(src.data(), prefix_len_);
+  }
+
+  bool InDomain(const Slice& /*src*/) const override { return true; }
+
+  bool InRange(const Slice& dst) const override {
+    return (dst.size() <= prefix_len_);
+  }
+  bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
+};
+
+class HarnessTest : public testing::Test {
+ public:
+  explicit HarnessTest(const TestArgs& args)
+      : args_(args),
+        ioptions_(options_),
+        moptions_(options_),
+        write_buffer_(options_.db_write_buffer_size),
+        support_prev_(true),
+        only_support_prefix_seek_(false) {
+    options_.compression = args_.compression;
+    options_.compression_opts.parallel_threads =
+        args_.compression_parallel_threads;
+    // Use shorter block size for tests to exercise block boundary
+    // conditions more.
+    if (args_.reverse_compare) {
+      options_.comparator = &reverse_key_comparator;
+    }
+
+    internal_comparator_.reset(
+        new test::PlainInternalKeyComparator(options_.comparator));
+
+    options_.allow_mmap_reads = args_.use_mmap;
+    switch (args_.type) {
+      case BLOCK_BASED_TABLE_TEST:
+        table_options_.flush_block_policy_factory.reset(
+            new FlushBlockBySizePolicyFactory());
+        table_options_.block_size = 256;
+        table_options_.block_restart_interval = args_.restart_interval;
+        table_options_.index_block_restart_interval = args_.restart_interval;
+        table_options_.format_version = args_.format_version;
+        options_.table_factory.reset(
+            new BlockBasedTableFactory(table_options_));
+        constructor_.reset(new TableConstructor(
+            options_.comparator, true /* convert_to_internal_key_ */));
+        internal_comparator_.reset(
+            new InternalKeyComparator(options_.comparator));
+        break;
+// Plain table is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+      case PLAIN_TABLE_SEMI_FIXED_PREFIX:
+        support_prev_ = false;
+        only_support_prefix_seek_ = true;
+        options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2));
+        options_.table_factory.reset(NewPlainTableFactory());
+        constructor_.reset(new TableConstructor(
+            options_.comparator, true /* convert_to_internal_key_ */));
+        internal_comparator_.reset(
+            new InternalKeyComparator(options_.comparator));
+        break;
+      case PLAIN_TABLE_FULL_STR_PREFIX:
+        support_prev_ = false;
+        only_support_prefix_seek_ = true;
+        options_.prefix_extractor.reset(NewNoopTransform());
+        options_.table_factory.reset(NewPlainTableFactory());
+        constructor_.reset(new TableConstructor(
+            options_.comparator, true /* convert_to_internal_key_ */));
+        internal_comparator_.reset(
+            new InternalKeyComparator(options_.comparator));
+        break;
+      case PLAIN_TABLE_TOTAL_ORDER:
+        support_prev_ = false;
+        only_support_prefix_seek_ = false;
+        options_.prefix_extractor = nullptr;
+
+        {
+          PlainTableOptions plain_table_options;
+          plain_table_options.user_key_len = kPlainTableVariableLength;
+          plain_table_options.bloom_bits_per_key = 0;
+          plain_table_options.hash_table_ratio = 0;
+
+          options_.table_factory.reset(
+              NewPlainTableFactory(plain_table_options));
+        }
+        constructor_.reset(new TableConstructor(
+            options_.comparator, true /* convert_to_internal_key_ */));
+        internal_comparator_.reset(
+            new InternalKeyComparator(options_.comparator));
+        break;
+#endif  // !ROCKSDB_LITE
+      case BLOCK_TEST:
+        table_options_.block_size = 256;
+        options_.table_factory.reset(
+            new BlockBasedTableFactory(table_options_));
+        constructor_.reset(new BlockConstructor(options_.comparator));
+        break;
+      case MEMTABLE_TEST:
+        table_options_.block_size = 256;
+        options_.table_factory.reset(
+            new BlockBasedTableFactory(table_options_));
+        constructor_.reset(
+            new MemTableConstructor(options_.comparator, &write_buffer_));
+        break;
+      case DB_TEST:
+        table_options_.block_size = 256;
+        options_.table_factory.reset(
+            new BlockBasedTableFactory(table_options_));
+        constructor_.reset(new DBConstructor(options_.comparator));
+        break;
+    }
+    ioptions_ = ImmutableOptions(options_);
+    moptions_ = MutableCFOptions(options_);
+  }
+
+  void Add(const std::string& key, const std::string& value) {
+    constructor_->Add(key, value);
+  }
+
+  void Test(Random* rnd) {
+    std::vector<std::string> keys;
+    stl_wrappers::KVMap data;
+    constructor_->Finish(options_, ioptions_, moptions_, table_options_,
+                         *internal_comparator_, &keys, &data);
+
+    TestForwardScan(keys, data);
+    if (support_prev_) {
+      TestBackwardScan(keys, data);
+    }
+    TestRandomAccess(rnd, keys, data);
+  }
+
+  void TestForwardScan(const std::vector<std::string>& /*keys*/,
+                       const stl_wrappers::KVMap& data) {
+    InternalIterator* iter = constructor_->NewIterator();
+    ASSERT_TRUE(!iter->Valid());
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    for (stl_wrappers::KVMap::const_iterator model_iter = data.begin();
+         model_iter != data.end(); ++model_iter) {
+      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+      iter->Next();
+      ASSERT_OK(iter->status());
+    }
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
+      iter->~InternalIterator();
+    } else {
+      delete iter;
+    }
+  }
+
+  void TestBackwardScan(const std::vector<std::string>& /*keys*/,
+                        const stl_wrappers::KVMap& data) {
+    InternalIterator* iter = constructor_->NewIterator();
+    ASSERT_TRUE(!iter->Valid());
+    iter->SeekToLast();
+    ASSERT_OK(iter->status());
+    for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin();
+         model_iter != data.rend(); ++model_iter) {
+      ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+      iter->Prev();
+      ASSERT_OK(iter->status());
+    }
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
+      iter->~InternalIterator();
+    } else {
+      delete iter;
+    }
+  }
+
+  void TestRandomAccess(Random* rnd, const std::vector<std::string>& keys,
+                        const stl_wrappers::KVMap& data) {
+    static const bool kVerbose = false;
+    InternalIterator* iter = constructor_->NewIterator();
+    ASSERT_TRUE(!iter->Valid());
+    stl_wrappers::KVMap::const_iterator model_iter = data.begin();
+    if (kVerbose) fprintf(stderr, "---\n");
+    for (int i = 0; i < 200; i++) {
+      const int toss = rnd->Uniform(support_prev_ ? 5 : 3);
+      switch (toss) {
+        case 0: {
+          if (iter->Valid()) {
+            if (kVerbose) fprintf(stderr, "Next\n");
+            iter->Next();
+            ASSERT_OK(iter->status());
+            ++model_iter;
+            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          }
+          break;
+        }
+
+        case 1: {
+          if (kVerbose) fprintf(stderr, "SeekToFirst\n");
+          iter->SeekToFirst();
+          ASSERT_OK(iter->status());
+          model_iter = data.begin();
+          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          break;
+        }
+
+        case 2: {
+          std::string key = PickRandomKey(rnd, keys);
+          model_iter = data.lower_bound(key);
+          if (kVerbose)
+            fprintf(stderr, "Seek '%s'\n", EscapeString(key).c_str());
+          iter->Seek(Slice(key));
+          ASSERT_OK(iter->status());
+          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          break;
+        }
+
+        case 3: {
+          if (iter->Valid()) {
+            if (kVerbose) fprintf(stderr, "Prev\n");
+            iter->Prev();
+            ASSERT_OK(iter->status());
+            if (model_iter == data.begin()) {
+              model_iter = data.end();  // Wrap around to invalid value
+            } else {
+              --model_iter;
+            }
+            ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          }
+          break;
+        }
+
+        case 4: {
+          if (kVerbose) fprintf(stderr, "SeekToLast\n");
+          iter->SeekToLast();
+          ASSERT_OK(iter->status());
+          if (keys.empty()) {
+            model_iter = data.end();
+          } else {
+            std::string last = data.rbegin()->first;
+            model_iter = data.lower_bound(last);
+          }
+          ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+          break;
+        }
+      }
+    }
+    if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
+      iter->~InternalIterator();
+    } else {
+      delete iter;
+    }
+  }
+
+  std::string ToString(const stl_wrappers::KVMap& data,
+                       const stl_wrappers::KVMap::const_iterator& it) {
+    if (it == data.end()) {
+      return "END";
+    } else {
+      return "'" + it->first + "->" + it->second + "'";
+    }
+  }
+
+  std::string ToString(const stl_wrappers::KVMap& data,
+                       const stl_wrappers::KVMap::const_reverse_iterator& it) {
+    if (it == data.rend()) {
+      return "END";
+    } else {
+      return "'" + it->first + "->" + it->second + "'";
+    }
+  }
+
+  std::string ToString(const InternalIterator* it) {
+    if (!it->Valid()) {
+      return "END";
+    } else {
+      return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
+    }
+  }
+
+  std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
+    if (keys.empty()) {
+      return "foo";
+    } else {
+      const int index = rnd->Uniform(static_cast<int>(keys.size()));
+      std::string result = keys[index];
+      switch (rnd->Uniform(support_prev_ ? 3 : 1)) {
+        case 0:
+          // Return an existing key
+          break;
+        case 1: {
+          // Attempt to return something smaller than an existing key
+          if (result.size() > 0 && result[result.size() - 1] > '\0' &&
+              (!only_support_prefix_seek_ ||
+               options_.prefix_extractor->Transform(result).size() <
+                   result.size())) {
+            result[result.size() - 1]--;
+          }
+          break;
+        }
+        case 2: {
+          // Return something larger than an existing key
+          Increment(options_.comparator, &result);
+          break;
+        }
+      }
+      return result;
+    }
+  }
+
+  // Returns nullptr if not running against a DB
+  DB* db() const { return constructor_->db(); }
+
+ private:
+  TestArgs args_;
+  Options options_;
+  ImmutableOptions ioptions_;
+  MutableCFOptions moptions_;
+  BlockBasedTableOptions table_options_;
+  std::unique_ptr<Constructor> constructor_;
+  WriteBufferManager write_buffer_;
+  bool support_prev_;
+  bool only_support_prefix_seek_;
+  std::shared_ptr<InternalKeyComparator> internal_comparator_;
+};
+
+class ParameterizedHarnessTest : public HarnessTest,
+                                 public testing::WithParamInterface<TestArgs> {
+ public:
+  ParameterizedHarnessTest() : HarnessTest(GetParam()) {}
+};
+
+INSTANTIATE_TEST_CASE_P(TableTest, ParameterizedHarnessTest,
+                        ::testing::ValuesIn(GenerateArgList()));
+
+class DBHarnessTest : public HarnessTest {
+ public:
+  DBHarnessTest()
+      : HarnessTest(TestArgs{DB_TEST, /* reverse_compare */ false,
+                             /* restart_interval */ 16, kNoCompression,
+                             /* compression_parallel_threads */ 1,
+                             /* format_version */ 0, /* use_mmap */ false}) {}
+};
+
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  bool result = (val >= low) && (val <= high);
+  if (!result) {
+    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+            (unsigned long long)(val), (unsigned long long)(low),
+            (unsigned long long)(high));
+  }
+  return result;
+}
+
+// Tests against all kinds of tables
+class TableTest : public testing::Test {
+ public:
+  const InternalKeyComparator& GetPlainInternalComparator(
+      const Comparator* comp) {
+    if (!plain_internal_comparator) {
+      plain_internal_comparator.reset(
+          new test::PlainInternalKeyComparator(comp));
+    }
+    return *plain_internal_comparator;
+  }
+  void IndexTest(BlockBasedTableOptions table_options);
+
+ private:
+  std::unique_ptr<InternalKeyComparator> plain_internal_comparator;
+};
+
+class GeneralTableTest : public TableTest {};
+class BlockBasedTableTestBase : public TableTest {};
+class BlockBasedTableTest
+    : public BlockBasedTableTestBase,
+      virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+  BlockBasedTableTest() : format_(GetParam()) {
+    env_ = ROCKSDB_NAMESPACE::Env::Default();
+  }
+
+  BlockBasedTableOptions GetBlockBasedTableOptions() {
+    BlockBasedTableOptions options;
+    options.format_version = format_;
+    return options;
+  }
+
+  void SetupTracingTest(TableConstructor* c) {
+    test_path_ = test::PerThreadDBPath("block_based_table_tracing_test");
+    EXPECT_OK(env_->CreateDir(test_path_));
+    trace_file_path_ = test_path_ + "/block_cache_trace_file";
+
+    BlockCacheTraceWriterOptions trace_writer_opt;
+    BlockCacheTraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_,
+                                 &trace_writer));
+    std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+        NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+                                 std::move(trace_writer));
+    ASSERT_NE(block_cache_trace_writer, nullptr);
+    // Always return Status::OK().
+    assert(c->block_cache_tracer_
+               .StartTrace(trace_opt, std::move(block_cache_trace_writer))
+               .ok());
+
+    {
+      std::string user_key = "k01";
+      InternalKey internal_key(user_key, 0, kTypeValue);
+      std::string encoded_key = internal_key.Encode().ToString();
+      c->Add(encoded_key, kDummyValue);
+    }
+    {
+      std::string user_key = "k02";
+      InternalKey internal_key(user_key, 0, kTypeValue);
+      std::string encoded_key = internal_key.Encode().ToString();
+      c->Add(encoded_key, kDummyValue);
+    }
+  }
+
+  void VerifyBlockAccessTrace(
+      TableConstructor* c,
+      const std::vector<BlockCacheTraceRecord>& expected_records) {
+    c->block_cache_tracer_.EndTrace();
+
+    {
+      std::unique_ptr<TraceReader> trace_reader;
+      Status s = NewFileTraceReader(env_, EnvOptions(), trace_file_path_,
+                                    &trace_reader);
+      EXPECT_OK(s);
+      BlockCacheTraceReader reader(std::move(trace_reader));
+      BlockCacheTraceHeader header;
+      EXPECT_OK(reader.ReadHeader(&header));
+      uint32_t index = 0;
+      while (s.ok()) {
+        BlockCacheTraceRecord access;
+        s = reader.ReadAccess(&access);
+        if (!s.ok()) {
+          break;
+        }
+        ASSERT_LT(index, expected_records.size());
+        EXPECT_NE("", access.block_key);
+        EXPECT_EQ(access.block_type, expected_records[index].block_type);
+        EXPECT_GT(access.block_size, 0);
+        EXPECT_EQ(access.caller, expected_records[index].caller);
+        EXPECT_EQ(access.no_insert, expected_records[index].no_insert);
+        EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit);
+        // Get
+        if (access.caller == TableReaderCaller::kUserGet) {
+          EXPECT_EQ(access.referenced_key,
+                    expected_records[index].referenced_key);
+          EXPECT_EQ(access.get_id, expected_records[index].get_id);
+          EXPECT_EQ(access.get_from_user_specified_snapshot,
+                    expected_records[index].get_from_user_specified_snapshot);
+          if (access.block_type == TraceType::kBlockTraceDataBlock) {
+            EXPECT_GT(access.referenced_data_size, 0);
+            EXPECT_GT(access.num_keys_in_block, 0);
+            EXPECT_EQ(access.referenced_key_exist_in_block,
+                      expected_records[index].referenced_key_exist_in_block);
+          }
+        } else {
+          EXPECT_EQ(access.referenced_key, "");
+          EXPECT_EQ(access.get_id, 0);
+          EXPECT_FALSE(access.get_from_user_specified_snapshot);
+          EXPECT_EQ(access.referenced_data_size, 0);
+          EXPECT_EQ(access.num_keys_in_block, 0);
+          EXPECT_FALSE(access.referenced_key_exist_in_block);
+        }
+        index++;
+      }
+      EXPECT_EQ(index, expected_records.size());
+    }
+    EXPECT_OK(env_->DeleteFile(trace_file_path_));
+    EXPECT_OK(env_->DeleteDir(test_path_));
+  }
+
+ protected:
+  uint64_t IndexUncompressedHelper(bool indexCompress);
+
+ private:
+  uint32_t format_;
+  Env* env_;
+  std::string trace_file_path_;
+  std::string test_path_;
+};
+class PlainTableTest : public TableTest {};
+class TablePropertyTest : public testing::Test {};
+class BBTTailPrefetchTest : public TableTest {};
+
+// The helper class to test the file checksum
+class FileChecksumTestHelper {
+ public:
+  FileChecksumTestHelper(bool convert_to_internal_key = false)
+      : convert_to_internal_key_(convert_to_internal_key) {}
+  ~FileChecksumTestHelper() {}
+
+  void CreateWritableFile() {
+    sink_ = new test::StringSink();
+    std::unique_ptr<FSWritableFile> holder(sink_);
+    file_writer_.reset(new WritableFileWriter(
+        std::move(holder), "" /* don't care */, FileOptions()));
+  }
+
+  void SetFileChecksumGenerator(FileChecksumGenerator* checksum_generator) {
+    if (file_writer_ != nullptr) {
+      file_writer_->TEST_SetFileChecksumGenerator(checksum_generator);
+    } else {
+      delete checksum_generator;
+    }
+  }
+
+  WritableFileWriter* GetFileWriter() { return file_writer_.get(); }
+
+  Status ResetTableBuilder(std::unique_ptr<TableBuilder>&& builder) {
+    assert(builder != nullptr);
+    table_builder_ = std::move(builder);
+    return Status::OK();
+  }
+
+  void AddKVtoKVMap(int num_entries) {
+    Random rnd(test::RandomSeed());
+    for (int i = 0; i < num_entries; i++) {
+      std::string v = rnd.RandomString(100);
+      kv_map_[test::RandomKey(&rnd, 20)] = v;
+    }
+  }
+
+  Status WriteKVAndFlushTable() {
+    for (const auto& kv : kv_map_) {
+      if (convert_to_internal_key_) {
+        ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
+        std::string encoded;
+        AppendInternalKey(&encoded, ikey);
+        table_builder_->Add(encoded, kv.second);
+      } else {
+        table_builder_->Add(kv.first, kv.second);
+      }
+      EXPECT_TRUE(table_builder_->status().ok());
+    }
+    Status s = table_builder_->Finish();
+    EXPECT_OK(file_writer_->Flush());
+    EXPECT_OK(s);
+
+    EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize());
+    return s;
+  }
+
+  std::string GetFileChecksum() {
+    EXPECT_OK(file_writer_->Close());
+    return table_builder_->GetFileChecksum();
+  }
+
+  const char* GetFileChecksumFuncName() {
+    return table_builder_->GetFileChecksumFuncName();
+  }
+
+  Status CalculateFileChecksum(FileChecksumGenerator* file_checksum_generator,
+                               std::string* checksum) {
+    assert(file_checksum_generator != nullptr);
+    cur_file_num_ = checksum_file_num_++;
+    test::StringSink* ss_rw =
+        static_cast<test::StringSink*>(file_writer_->writable_file());
+    std::unique_ptr<FSRandomAccessFile> source(
+        new test::StringSource(ss_rw->contents()));
+    file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
+
+    std::unique_ptr<char[]> scratch(new char[2048]);
+    Slice result;
+    uint64_t offset = 0;
+    Status s;
+    s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(),
+                           nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+    if (!s.ok()) {
+      return s;
+    }
+    while (result.size() != 0) {
+      file_checksum_generator->Update(scratch.get(), result.size());
+      offset += static_cast<uint64_t>(result.size());
+      s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(),
+                             nullptr,
+                             Env::IO_TOTAL /* rate_limiter_priority */);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    EXPECT_EQ(offset, static_cast<uint64_t>(table_builder_->FileSize()));
+    file_checksum_generator->Finalize();
+    *checksum = file_checksum_generator->GetChecksum();
+    return Status::OK();
+  }
+
+ private:
+  bool convert_to_internal_key_;
+  uint64_t cur_file_num_;
+  std::unique_ptr<WritableFileWriter> file_writer_;
+  std::unique_ptr<RandomAccessFileReader> file_reader_;
+  std::unique_ptr<TableBuilder> table_builder_;
+  stl_wrappers::KVMap kv_map_;
+  test::StringSink* sink_ = nullptr;
+
+  static uint64_t checksum_file_num_;
+};
+
+uint64_t FileChecksumTestHelper::checksum_file_num_ = 1;
+
+INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest,
+                        testing::ValuesIn(test::kFooterFormatVersionsToTest));
+
+// This test serves as the living tutorial for the prefix scan of user collected
+// properties.
+TEST_F(TablePropertyTest, PrefixScanTest) {
+  UserCollectedProperties props{
+      {"num.111.1", "1"}, {"num.111.2", "2"}, {"num.111.3", "3"},
+      {"num.333.1", "1"}, {"num.333.2", "2"}, {"num.333.3", "3"},
+      {"num.555.1", "1"}, {"num.555.2", "2"}, {"num.555.3", "3"},
+  };
+
+  // prefixes that exist
+  for (const std::string prefix : {"num.111", "num.333", "num.555"}) {
+    int num = 0;
+    for (auto pos = props.lower_bound(prefix);
+         pos != props.end() &&
+         pos->first.compare(0, prefix.size(), prefix) == 0;
+         ++pos) {
+      ++num;
+      auto key = prefix + "." + std::to_string(num);
+      ASSERT_EQ(key, pos->first);
+      ASSERT_EQ(std::to_string(num), pos->second);
+    }
+    ASSERT_EQ(3, num);
+  }
+
+  // prefixes that don't exist
+  for (const std::string prefix :
+       {"num.000", "num.222", "num.444", "num.666"}) {
+    auto pos = props.lower_bound(prefix);
+    ASSERT_TRUE(pos == props.end() ||
+                pos->first.compare(0, prefix.size(), prefix) != 0);
+  }
+}
+
+namespace {
+struct TestIds {
+  UniqueId64x3 internal_id;
+  UniqueId64x3 external_id;
+};
+
+inline bool operator==(const TestIds& lhs, const TestIds& rhs) {
+  return lhs.internal_id == rhs.internal_id &&
+         lhs.external_id == rhs.external_id;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestIds& ids) {
+  return os << std::hex << "{{{ 0x" << ids.internal_id[0] << "U, 0x"
+            << ids.internal_id[1] << "U, 0x" << ids.internal_id[2]
+            << "U }}, {{ 0x" << ids.external_id[0] << "U, 0x"
+            << ids.external_id[1] << "U, 0x" << ids.external_id[2] << "U }}}";
+}
+
+TestIds GetUniqueId(TableProperties* tp, std::unordered_set<uint64_t>* seen,
+                    const std::string& db_id, const std::string& db_session_id,
+                    uint64_t file_number) {
+  // First test session id logic
+  if (db_session_id.size() == 20) {
+    uint64_t upper;
+    uint64_t lower;
+    EXPECT_OK(DecodeSessionId(db_session_id, &upper, &lower));
+    EXPECT_EQ(EncodeSessionId(upper, lower), db_session_id);
+  }
+
+  // Get external using public API
+  tp->db_id = db_id;
+  tp->db_session_id = db_session_id;
+  tp->orig_file_number = file_number;
+  TestIds t;
+  {
+    std::string euid;
+    EXPECT_OK(GetExtendedUniqueIdFromTableProperties(*tp, &euid));
+    EXPECT_EQ(euid.size(), 24U);
+    t.external_id[0] = DecodeFixed64(&euid[0]);
+    t.external_id[1] = DecodeFixed64(&euid[8]);
+    t.external_id[2] = DecodeFixed64(&euid[16]);
+
+    std::string uid;
+    EXPECT_OK(GetUniqueIdFromTableProperties(*tp, &uid));
+    EXPECT_EQ(uid.size(), 16U);
+    EXPECT_EQ(uid, euid.substr(0, 16));
+    EXPECT_EQ(t.external_id[0], DecodeFixed64(&uid[0]));
+    EXPECT_EQ(t.external_id[1], DecodeFixed64(&uid[8]));
+  }
+  // All these should be effectively random
+  EXPECT_TRUE(seen->insert(t.external_id[0]).second);
+  EXPECT_TRUE(seen->insert(t.external_id[1]).second);
+  EXPECT_TRUE(seen->insert(t.external_id[2]).second);
+
+  // Get internal with internal API
+  EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number,
+                                   &t.internal_id));
+  EXPECT_NE(t.internal_id, kNullUniqueId64x3);
+
+  // Verify relationship
+  UniqueId64x3 tmp = t.internal_id;
+  InternalUniqueIdToExternal(&tmp);
+  EXPECT_EQ(tmp, t.external_id);
+  ExternalUniqueIdToInternal(&tmp);
+  EXPECT_EQ(tmp, t.internal_id);
+
+  // And 128-bit internal version
+  UniqueId64x2 tmp2{};
+  EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number, &tmp2));
+  EXPECT_NE(tmp2, kNullUniqueId64x2);
+
+  EXPECT_EQ(tmp2[0], t.internal_id[0]);
+  EXPECT_EQ(tmp2[1], t.internal_id[1]);
+  InternalUniqueIdToExternal(&tmp2);
+  EXPECT_EQ(tmp2[0], t.external_id[0]);
+  EXPECT_EQ(tmp2[1], t.external_id[1]);
+  ExternalUniqueIdToInternal(&tmp2);
+  EXPECT_EQ(tmp2[0], t.internal_id[0]);
+  EXPECT_EQ(tmp2[1], t.internal_id[1]);
+
+  return t;
+}
+}  // namespace
+
+TEST_F(TablePropertyTest, UniqueIdsSchemaAndQuality) {
+  // To ensure the computation only depends on the expected entries, we set
+  // the rest randomly
+  TableProperties tp;
+  TEST_SetRandomTableProperties(&tp);
+
+  // DB id is normally RFC-4122
+  const std::string db_id1 = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d";
+  // Allow other forms of DB id
+  const std::string db_id2 = "1728000184588763620";
+  const std::string db_id3 = "x";
+
+  // DB session id is normally 20 chars in base-36, but 13 to 24 chars
+  // is ok, roughly 64 to 128 bits.
+  const std::string ses_id1 = "ABCDEFGHIJ0123456789";
+  // Same trailing 13 digits
+  const std::string ses_id2 = "HIJ0123456789";
+  const std::string ses_id3 = "0123ABCDEFGHIJ0123456789";
+  // Different trailing 12 digits
+  const std::string ses_id4 = "ABCDEFGH888888888888";
+  // And change length
+  const std::string ses_id5 = "ABCDEFGHIJ012";
+  const std::string ses_id6 = "ABCDEFGHIJ0123456789ABCD";
+
+  using T = TestIds;
+  std::unordered_set<uint64_t> seen;
+  // Establish a stable schema for the unique IDs. These values must not
+  // change for existing table files.
+  // (Note: parens needed for macro parsing, extra braces needed for some
+  // compilers.)
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id1, ses_id1, 1),
+      T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}},
+         {{0xf0bd230365df7464U, 0xca089303f3648eb4U, 0x4b44f7e7324b2817U}}}));
+  // Only change internal_id[1] with file number
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id1, ses_id1, 2),
+      T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757feU, 0x907f41dfd90724ffU}},
+         {{0xf13fdf7adcfebb6dU, 0x97cd2226cc033ea2U, 0x198c438182091f0eU}}}));
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id1, ses_id1, 123456789),
+      T({{{0x61d7dcf415d9cf19U, 0x160d77aaee5c9ae9U, 0x907f41dfd90724ffU}},
+         {{0x81fbcebe1ac6c4f0U, 0x6b14a64cfdc0f1c4U, 0x7d8fb6eaf18edbb3U}}}));
+  // Change internal_id[1] and internal_id[2] with db_id
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id2, ses_id1, 1),
+      T({{{0x61d7dcf415d9cf19U, 0xf89c471f572f0d25U, 0x1f0f2a5eb0e6257eU}},
+         {{0x7f1d01d453616991U, 0x32ddf2afec804ab2U, 0xd10a1ee2f0c7d9c1U}}}));
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id3, ses_id1, 1),
+      T({{{0x61d7dcf415d9cf19U, 0xfed297a8154a57d0U, 0x8b931b9cdebd9e8U}},
+         {{0x62b2f43183f6894bU, 0x897ff2b460eefad1U, 0xf4ec189fb2d15e04U}}}));
+  // Keeping same last 13 digits of ses_id keeps same internal_id[0]
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id1, ses_id2, 1),
+      T({{{0x61d7dcf415d9cf19U, 0x5f6cc4fa2d528c8U, 0x7b70845d5bfb5446U}},
+         {{0x96d1c83ffcc94266U, 0x82663eac0ec6e14aU, 0x94a88b49678b77f6U}}}));
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id1, ses_id3, 1),
+      T({{{0x61d7dcf415d9cf19U, 0xfc7232879db37ea2U, 0xc0378d74ea4c89cdU}},
+         {{0xdf2ef57e98776905U, 0xda5b31c987da833bU, 0x79c1b4bd0a9e760dU}}}));
+  // Changing last 12 digits of ses_id only changes internal_id[0]
+  // (vs. db_id1, ses_id1, 1)
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id1, ses_id4, 1),
+      T({{{0x4f07cc0d003a83a8U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}},
+         {{0xbcf85336a9f71f04U, 0x4f2949e2f3adb60dU, 0x9ca0def976abfa10U}}}));
+  // ses_id can change everything.
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id1, ses_id5, 1),
+      T({{{0x94b8768e43f87ce6U, 0xc2559653ac4e7c93U, 0xde6dff6bbb1223U}},
+         {{0x5a9537af681817fbU, 0x1afcd1fecaead5eaU, 0x767077ad9ebe0008U}}}));
+  EXPECT_EQ(
+      GetUniqueId(&tp, &seen, db_id1, ses_id6, 1),
+      T({{{0x43cfb0ffa3b710edU, 0x263c580426406a1bU, 0xfacc91379a80d29dU}},
+         {{0xfa90547d84cb1cdbU, 0x2afe99c641992d4aU, 0x205b7f7b60e51cc2U}}}));
+
+  // Now verify more thoroughly that any small change in inputs completely
+  // changes external unique id.
+  // (Relying on 'seen' checks etc. in GetUniqueId)
+  std::string db_id = "00000000-0000-0000-0000-000000000000";
+  std::string ses_id = "000000000000000000000000";
+  uint64_t file_num = 1;
+  // change db_id
+  for (size_t i = 0; i < db_id.size(); ++i) {
+    if (db_id[i] == '-') {
+      continue;
+    }
+    for (char alt : std::string("123456789abcdef")) {
+      db_id[i] = alt;
+      GetUniqueId(&tp, &seen, db_id, ses_id, file_num);
+    }
+    db_id[i] = '0';
+  }
+  // change ses_id
+  for (size_t i = 0; i < ses_id.size(); ++i) {
+    for (char alt : std::string("123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")) {
+      ses_id[i] = alt;
+      GetUniqueId(&tp, &seen, db_id, ses_id, file_num);
+    }
+    ses_id[i] = '0';
+  }
+  // change file_num
+  for (int i = 1; i < 64; ++i) {
+    GetUniqueId(&tp, &seen, db_id, ses_id, file_num << i);
+  }
+
+  // Verify that "all zeros" in first 128 bits is equivalent for internal and
+  // external IDs. This way, as long as we avoid "all zeros" in internal IDs,
+  // we avoid it in external IDs.
+  {
+    UniqueId64x3 id1{{0, 0, Random::GetTLSInstance()->Next64()}};
+    UniqueId64x3 id2 = id1;
+    InternalUniqueIdToExternal(&id1);
+    EXPECT_EQ(id1, id2);
+    ExternalUniqueIdToInternal(&id2);
+    EXPECT_EQ(id1, id2);
+  }
+}
+
+namespace {
+void SetGoodTableProperties(TableProperties* tp) {
+  // To ensure the computation only depends on the expected entries, we set
+  // the rest randomly
+  TEST_SetRandomTableProperties(tp);
+  tp->db_id = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d";
+  tp->db_session_id = "ABCDEFGHIJ0123456789";
+  tp->orig_file_number = 1;
+}
+}  // namespace
+
+TEST_F(TablePropertyTest, UniqueIdHumanStrings) {
+  TableProperties tp;
+  SetGoodTableProperties(&tp);
+
+  std::string tmp;
+  EXPECT_OK(GetExtendedUniqueIdFromTableProperties(tp, &tmp));
+  EXPECT_EQ(tmp,
+            (std::string{{'\x64', '\x74', '\xdf', '\x65', '\x03', '\x23',
+                          '\xbd', '\xf0', '\xb4', '\x8e', '\x64', '\xf3',
+                          '\x03', '\x93', '\x08', '\xca', '\x17', '\x28',
+                          '\x4b', '\x32', '\xe7', '\xf7', '\x44', '\x4b'}}));
+  EXPECT_EQ(UniqueIdToHumanString(tmp),
+            "6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B");
+
+  EXPECT_OK(GetUniqueIdFromTableProperties(tp, &tmp));
+  EXPECT_EQ(UniqueIdToHumanString(tmp), "6474DF650323BDF0-B48E64F3039308CA");
+
+  // including zero padding
+  tmp = std::string(24U, '\0');
+  tmp[15] = '\x12';
+  tmp[23] = '\xAB';
+  EXPECT_EQ(UniqueIdToHumanString(tmp),
+            "0000000000000000-0000000000000012-00000000000000AB");
+
+  // And shortened
+  tmp = std::string(20U, '\0');
+  tmp[5] = '\x12';
+  tmp[10] = '\xAB';
+  tmp[17] = '\xEF';
+  EXPECT_EQ(UniqueIdToHumanString(tmp),
+            "0000000000120000-0000AB0000000000-00EF0000");
+
+  tmp.resize(16);
+  EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB0000000000");
+
+  tmp.resize(11);
+  EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB");
+
+  tmp.resize(6);
+  EXPECT_EQ(UniqueIdToHumanString(tmp), "000000000012");
+
+  // Also internal IDs to human string
+  UniqueId64x3 euid = {12345, 678, 9};
+  EXPECT_EQ(InternalUniqueIdToHumanString(&euid), "{12345,678,9}");
+
+  UniqueId64x2 uid = {1234, 567890};
+  EXPECT_EQ(InternalUniqueIdToHumanString(&uid), "{1234,567890}");
+}
+
+TEST_F(TablePropertyTest, UniqueIdsFailure) {
+  TableProperties tp;
+  std::string tmp;
+
+  // Missing DB id
+  SetGoodTableProperties(&tp);
+  tp.db_id = "";
+  EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
+  EXPECT_TRUE(
+      GetExtendedUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
+
+  // Missing session id
+  SetGoodTableProperties(&tp);
+  tp.db_session_id = "";
+  EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
+  EXPECT_TRUE(
+      GetExtendedUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
+
+  // Missing file number
+  SetGoodTableProperties(&tp);
+  tp.orig_file_number = 0;
+  EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
+  EXPECT_TRUE(
+      GetExtendedUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
+}
+
+// This test include all the basic checks except those for index size and block
+// size, which will be conducted in separated unit tests.
+TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) {
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+
+  c.Add("a1", "val1");
+  c.Add("b2", "val2");
+  c.Add("c3", "val3");
+  c.Add("d4", "val4");
+  c.Add("e5", "val5");
+  c.Add("f6", "val6");
+  c.Add("g7", "val7");
+  c.Add("h8", "val8");
+  c.Add("j9", "val9");
+  uint64_t diff_internal_user_bytes = 9 * 8;  // 8 is seq size, 9 k-v totally
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  options.compression = kNoCompression;
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_restart_interval = 1;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0);
+
+  auto& props = *c.GetTableReader()->GetTableProperties();
+  ASSERT_EQ(kvmap.size(), props.num_entries);
+
+  auto raw_key_size = kvmap.size() * 2ul;
+  auto raw_value_size = kvmap.size() * 4ul;
+
+  ASSERT_EQ(raw_key_size + diff_internal_user_bytes, props.raw_key_size);
+  ASSERT_EQ(raw_value_size, props.raw_value_size);
+  ASSERT_EQ(1ul, props.num_data_blocks);
+  ASSERT_EQ("", props.filter_policy_name);  // no filter policy is used
+
+  // Verify data size.
+  BlockBuilder block_builder(1);
+  for (const auto& item : kvmap) {
+    block_builder.Add(item.first, item.second);
+  }
+  Slice content = block_builder.Finish();
+  ASSERT_EQ(content.size() + BlockBasedTable::kBlockTrailerSize +
+                diff_internal_user_bytes,
+            props.data_size);
+  c.ResetTableReader();
+}
+
+#ifdef SNAPPY
+uint64_t BlockBasedTableTest::IndexUncompressedHelper(bool compressed) {
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  constexpr size_t kNumKeys = 10000;
+
+  for (size_t k = 0; k < kNumKeys; ++k) {
+    c.Add("key" + std::to_string(k), "val" + std::to_string(k));
+  }
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  options.compression = kSnappyCompression;
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_restart_interval = 1;
+  table_options.enable_index_compression = compressed;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  c.ResetTableReader();
+  return options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+}
+TEST_P(BlockBasedTableTest, IndexUncompressed) {
+  uint64_t tbl1_compressed_cnt = IndexUncompressedHelper(true);
+  uint64_t tbl2_compressed_cnt = IndexUncompressedHelper(false);
+  // tbl1_compressed_cnt should include 1 index block
+  EXPECT_EQ(tbl2_compressed_cnt + 1, tbl1_compressed_cnt);
+}
+#endif  // SNAPPY
+
+TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) {
+  TableConstructor c(&reverse_key_comparator);
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+
+  {
+    Options options;
+    options.compression = CompressionType::kNoCompression;
+    BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    const ImmutableOptions ioptions(options);
+    const MutableCFOptions moptions(options);
+    c.Finish(options, ioptions, moptions, table_options,
+             GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+    auto& props = *c.GetTableReader()->GetTableProperties();
+
+    // Default comparator
+    ASSERT_EQ("leveldb.BytewiseComparator", props.comparator_name);
+    // No merge operator
+    ASSERT_EQ("nullptr", props.merge_operator_name);
+    // No prefix extractor
+    ASSERT_EQ("nullptr", props.prefix_extractor_name);
+    // No property collectors
+    ASSERT_EQ("[]", props.property_collectors_names);
+    // No filter policy is used
+    ASSERT_EQ("", props.filter_policy_name);
+    // Compression type == that set:
+    ASSERT_EQ("NoCompression", props.compression_name);
+    c.ResetTableReader();
+  }
+
+  {
+    Options options;
+    BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.comparator = &reverse_key_comparator;
+    options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+    options.prefix_extractor.reset(NewNoopTransform());
+    options.table_properties_collector_factories.emplace_back(
+        new DummyPropertiesCollectorFactory1());
+    options.table_properties_collector_factories.emplace_back(
+        new DummyPropertiesCollectorFactory2());
+
+    const ImmutableOptions ioptions(options);
+    const MutableCFOptions moptions(options);
+    c.Finish(options, ioptions, moptions, table_options,
+             GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+    auto& props = *c.GetTableReader()->GetTableProperties();
+
+    ASSERT_EQ("rocksdb.ReverseBytewiseComparator", props.comparator_name);
+    ASSERT_EQ("UInt64AddOperator", props.merge_operator_name);
+    ASSERT_EQ("rocksdb.Noop", props.prefix_extractor_name);
+    ASSERT_EQ(
+        "[DummyPropertiesCollectorFactory1,DummyPropertiesCollectorFactory2]",
+        props.property_collectors_names);
+    ASSERT_EQ("", props.filter_policy_name);  // no filter policy is used
+    c.ResetTableReader();
+  }
+}
+
+TEST_P(BlockBasedTableTest, RangeDelBlock) {
+  TableConstructor c(BytewiseComparator());
+  std::vector<std::string> keys = {"1pika", "2chu"};
+  std::vector<std::string> vals = {"p", "c"};
+
+  std::vector<RangeTombstone> expected_tombstones = {
+      {"1pika", "2chu", 0},
+      {"2chu", "c", 1},
+      {"2chu", "c", 0},
+      {"c", "p", 0},
+  };
+
+  for (int i = 0; i < 2; i++) {
+    RangeTombstone t(keys[i], vals[i], i);
+    std::pair<InternalKey, Slice> p = t.Serialize();
+    c.Add(p.first.Encode().ToString(), p.second);
+  }
+
+  std::vector<std::string> sorted_keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  options.compression = kNoCompression;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_restart_interval = 1;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  std::unique_ptr<InternalKeyComparator> internal_cmp(
+      new InternalKeyComparator(options.comparator));
+  c.Finish(options, ioptions, moptions, table_options, *internal_cmp,
+           &sorted_keys, &kvmap);
+
+  for (int j = 0; j < 2; ++j) {
+    std::unique_ptr<InternalIterator> iter(
+        c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions()));
+    if (j > 0) {
+      // For second iteration, delete the table reader object and verify the
+      // iterator can still access its metablock's range tombstones.
+      c.ResetTableReader();
+    }
+    ASSERT_FALSE(iter->Valid());
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    for (size_t i = 0; i < expected_tombstones.size(); i++) {
+      ASSERT_TRUE(iter->Valid());
+      ParsedInternalKey parsed_key;
+      ASSERT_OK(
+          ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */));
+      RangeTombstone t(parsed_key, iter->value());
+      const auto& expected_t = expected_tombstones[i];
+      ASSERT_EQ(t.start_key_, expected_t.start_key_);
+      ASSERT_EQ(t.end_key_, expected_t.end_key_);
+      ASSERT_EQ(t.seq_, expected_t.seq_);
+      iter->Next();
+    }
+    ASSERT_TRUE(!iter->Valid());
+  }
+}
+
+TEST_P(BlockBasedTableTest, FilterPolicyNameProperties) {
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("a1", "val1");
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  Options options;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  auto& props = *c.GetTableReader()->GetTableProperties();
+  ASSERT_EQ(table_options.filter_policy->Name(), props.filter_policy_name);
+  c.ResetTableReader();
+}
+
+//
+// BlockBasedTableTest::PrefetchTest
+//
+void AssertKeysInCache(BlockBasedTable* table_reader,
+                       const std::vector<std::string>& keys_in_cache,
+                       const std::vector<std::string>& keys_not_in_cache,
+                       bool convert = false) {
+  if (convert) {
+    for (auto key : keys_in_cache) {
+      InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+      ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
+    }
+    for (auto key : keys_not_in_cache) {
+      InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+      ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
+    }
+  } else {
+    for (auto key : keys_in_cache) {
+      ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key));
+    }
+    for (auto key : keys_not_in_cache) {
+      ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key));
+    }
+  }
+}
+
+void PrefetchRange(TableConstructor* c, Options* opt,
+                   BlockBasedTableOptions* table_options, const char* key_begin,
+                   const char* key_end,
+                   const std::vector<std::string>& keys_in_cache,
+                   const std::vector<std::string>& keys_not_in_cache,
+                   const Status expected_status = Status::OK()) {
+  // reset the cache and reopen the table
+  table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4);
+  opt->table_factory.reset(NewBlockBasedTableFactory(*table_options));
+  const ImmutableOptions ioptions2(*opt);
+  const MutableCFOptions moptions(*opt);
+  ASSERT_OK(c->Reopen(ioptions2, moptions));
+
+  // prefetch
+  auto* table_reader = dynamic_cast<BlockBasedTable*>(c->GetTableReader());
+  Status s;
+  std::unique_ptr<Slice> begin, end;
+  std::unique_ptr<InternalKey> i_begin, i_end;
+  if (key_begin != nullptr) {
+    if (c->ConvertToInternalKey()) {
+      i_begin.reset(new InternalKey(key_begin, kMaxSequenceNumber, kTypeValue));
+      begin.reset(new Slice(i_begin->Encode()));
+    } else {
+      begin.reset(new Slice(key_begin));
+    }
+  }
+  if (key_end != nullptr) {
+    if (c->ConvertToInternalKey()) {
+      i_end.reset(new InternalKey(key_end, kMaxSequenceNumber, kTypeValue));
+      end.reset(new Slice(i_end->Encode()));
+    } else {
+      end.reset(new Slice(key_end));
+    }
+  }
+  s = table_reader->Prefetch(begin.get(), end.get());
+
+  ASSERT_TRUE(s.code() == expected_status.code());
+
+  // assert our expectation in cache warmup
+  AssertKeysInCache(table_reader, keys_in_cache, keys_not_in_cache,
+                    c->ConvertToInternalKey());
+  c->ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, PrefetchTest) {
+  // The purpose of this test is to test the prefetching operation built into
+  // BlockBasedTable.
+  Options opt;
+  std::unique_ptr<InternalKeyComparator> ikc;
+  ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
+  opt.compression = kNoCompression;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_size = 1024;
+  // big enough so we don't ever lose cached values.
+  table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
+  opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  const ImmutableOptions ioptions(opt);
+  const MutableCFOptions moptions(opt);
+  c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
+  c.ResetTableReader();
+
+  // We get the following data spread :
+  //
+  // Data block         Index
+  // ========================
+  // [ k01 k02 k03 ]    k03
+  // [ k04         ]    k04
+  // [ k05         ]    k05
+  // [ k06 k07     ]    k07
+
+  // Simple
+  PrefetchRange(&c, &opt, &table_options,
+                /*key_range=*/"k01", "k05",
+                /*keys_in_cache=*/{"k01", "k02", "k03", "k04", "k05"},
+                /*keys_not_in_cache=*/{"k06", "k07"});
+  PrefetchRange(&c, &opt, &table_options, "k01", "k01", {"k01", "k02", "k03"},
+                {"k04", "k05", "k06", "k07"});
+  // odd
+  PrefetchRange(&c, &opt, &table_options, "a", "z",
+                {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
+  PrefetchRange(&c, &opt, &table_options, "k00", "k00", {"k01", "k02", "k03"},
+                {"k04", "k05", "k06", "k07"});
+  // Edge cases
+  PrefetchRange(&c, &opt, &table_options, "k00", "k06",
+                {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
+  PrefetchRange(&c, &opt, &table_options, "k00", "zzz",
+                {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
+  // null keys
+  PrefetchRange(&c, &opt, &table_options, nullptr, nullptr,
+                {"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
+  PrefetchRange(&c, &opt, &table_options, "k04", nullptr,
+                {"k04", "k05", "k06", "k07"}, {"k01", "k02", "k03"});
+  PrefetchRange(&c, &opt, &table_options, nullptr, "k05",
+                {"k01", "k02", "k03", "k04", "k05"}, {"k06", "k07"});
+  // invalid
+  PrefetchRange(&c, &opt, &table_options, "k06", "k00", {}, {},
+                Status::InvalidArgument(Slice("k06 "), Slice("k07")));
+  c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  for (int i = 0; i <= 4; ++i) {
+    Options options;
+    // Make each key/value an individual block
+    table_options.block_size = 64;
+    switch (i) {
+      case 0:
+        // Binary search index
+        table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+        options.table_factory.reset(new BlockBasedTableFactory(table_options));
+        break;
+      case 1:
+        // Hash search index
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(new BlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+        break;
+      case 2:
+        // Hash search index with filter policy
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+        options.table_factory.reset(new BlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+        break;
+      case 3:
+        // Two-level index
+        table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+        options.table_factory.reset(new BlockBasedTableFactory(table_options));
+        break;
+      case 4:
+        // Binary search with first key
+        table_options.index_type =
+            BlockBasedTableOptions::kBinarySearchWithFirstKey;
+        options.table_factory.reset(new BlockBasedTableFactory(table_options));
+        break;
+    }
+
+    TableConstructor c(BytewiseComparator(),
+                       true /* convert_to_internal_key_ */);
+    c.Add("aaaa1", std::string('a', 56));
+    c.Add("bbaa1", std::string('a', 56));
+    c.Add("cccc1", std::string('a', 56));
+    c.Add("bbbb1", std::string('a', 56));
+    c.Add("baaa1", std::string('a', 56));
+    c.Add("abbb1", std::string('a', 56));
+    c.Add("cccc2", std::string('a', 56));
+    std::vector<std::string> keys;
+    stl_wrappers::KVMap kvmap;
+    const ImmutableOptions ioptions(options);
+    const MutableCFOptions moptions(options);
+    c.Finish(options, ioptions, moptions, table_options,
+             GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+    auto props = c.GetTableReader()->GetTableProperties();
+    ASSERT_EQ(7u, props->num_data_blocks);
+    auto* reader = c.GetTableReader();
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+        ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    iter->Seek(InternalKey("b", 0, kTypeValue).Encode());
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("baaa1", ExtractUserKey(iter->key()).ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
+
+    iter->Seek(InternalKey("bb", 0, kTypeValue).Encode());
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
+
+    iter->Seek(InternalKey("bbb", 0, kTypeValue).Encode());
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("cccc1", ExtractUserKey(iter->key()).ToString());
+  }
+}
+
+TEST_P(BlockBasedTableTest, NoopTransformSeek) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+
+  Options options;
+  options.comparator = BytewiseComparator();
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewNoopTransform());
+
+  TableConstructor c(options.comparator);
+  // To tickle the PrefixMayMatch bug it is important that the
+  // user-key is a single byte so that the index key exactly matches
+  // the user-key.
+  InternalKey key("a", 1, kTypeValue);
+  c.Add(key.Encode().ToString(), "b");
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  const InternalKeyComparator internal_comparator(options.comparator);
+  c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+           &keys, &kvmap);
+
+  auto* reader = c.GetTableReader();
+  for (int i = 0; i < 2; ++i) {
+    ReadOptions ro;
+    ro.total_order_seek = (i == 0);
+    std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+        ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    iter->Seek(key.Encode());
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a", ExtractUserKey(iter->key()).ToString());
+  }
+}
+
+TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) {
+  // if DB is opened with a prefix extractor of a different name,
+  // prefix bloom is skipped when read the file
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.filter_policy.reset(NewBloomFilterPolicy(2));
+  table_options.whole_key_filtering = false;
+
+  Options options;
+  options.comparator = BytewiseComparator();
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  TableConstructor c(options.comparator);
+  InternalKey key("abcdefghijk", 1, kTypeValue);
+  c.Add(key.Encode().ToString(), "test");
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  const InternalKeyComparator internal_comparator(options.comparator);
+  c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+           &keys, &kvmap);
+  // TODO(Zhongyi): update test to use MutableCFOptions
+  options.prefix_extractor.reset(NewFixedPrefixTransform(9));
+  const ImmutableOptions new_ioptions(options);
+  const MutableCFOptions new_moptions(options);
+  ASSERT_OK(c.Reopen(new_ioptions, new_moptions));
+  auto reader = c.GetTableReader();
+  ReadOptions read_options;
+  std::unique_ptr<InternalIterator> db_iter(reader->NewIterator(
+      read_options, new_moptions.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  // Test point lookup
+  // only one kv
+  for (auto& kv : kvmap) {
+    db_iter->Seek(kv.first);
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+    ASSERT_EQ(db_iter->key(), kv.first);
+    ASSERT_EQ(db_iter->value(), kv.second);
+  }
+}
+
+TEST_P(BlockBasedTableTest, BadChecksumType) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+
+  Options options;
+  options.comparator = BytewiseComparator();
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  TableConstructor c(options.comparator);
+  InternalKey key("abc", 1, kTypeValue);
+  c.Add(key.Encode().ToString(), "test");
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  const InternalKeyComparator internal_comparator(options.comparator);
+  c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+           &keys, &kvmap);
+
+  // Corrupt checksum type (123 is invalid)
+  auto& sink = *c.TEST_GetSink();
+  size_t len = sink.contents_.size();
+  ASSERT_EQ(sink.contents_[len - Footer::kNewVersionsEncodedLength],
+            table_options.checksum);
+  sink.contents_[len - Footer::kNewVersionsEncodedLength] = char{123};
+
+  // (Re-)Open table file with bad checksum type
+  const ImmutableOptions new_ioptions(options);
+  const MutableCFOptions new_moptions(options);
+  Status s = c.Reopen(new_ioptions, new_moptions);
+  ASSERT_NOK(s);
+  ASSERT_EQ(s.ToString(),
+            "Corruption: Corrupt or unsupported checksum type: 123");
+}
+
+namespace {
+std::string ChecksumAsString(const std::string& data,
+                             ChecksumType checksum_type) {
+  uint32_t v = ComputeBuiltinChecksum(checksum_type, data.data(), data.size());
+
+  // Verify consistency with other function
+  if (data.size() >= 1) {
+    EXPECT_EQ(v, ComputeBuiltinChecksumWithLastByte(
+                     checksum_type, data.data(), data.size() - 1, data.back()));
+  }
+  // Little endian as in file
+  std::array<char, 4> raw_bytes;
+  EncodeFixed32(raw_bytes.data(), v);
+  return Slice(raw_bytes.data(), raw_bytes.size()).ToString(/*hex*/ true);
+}
+
+std::string ChecksumAsString(std::string* data, char new_last_byte,
+                             ChecksumType checksum_type) {
+  data->back() = new_last_byte;
+  return ChecksumAsString(*data, checksum_type);
+}
+}  // namespace
+
+// Make sure that checksum values don't change in later versions, even if
+// consistent within current version.
+TEST_P(BlockBasedTableTest, ChecksumSchemas) {
+  std::string b0 = "x";
+  std::string b1 = "This is a short block!x";
+  std::string b2;
+  for (int i = 0; i < 100; ++i) {
+    b2.append("This is a long block!");
+  }
+  b2.append("x");
+  // Trailing 'x' will be replaced by compression type
+
+  std::string empty;
+
+  char ct1 = kNoCompression;
+  char ct2 = kSnappyCompression;
+  char ct3 = kZSTD;
+
+  // Note: first byte of trailer is compression type, last 4 are checksum
+
+  for (ChecksumType t : GetSupportedChecksums()) {
+    switch (t) {
+      case kNoChecksum:
+        EXPECT_EQ(ChecksumAsString(empty, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00000000");
+        break;
+      case kCRC32c:
+        EXPECT_EQ(ChecksumAsString(empty, t), "D8EA82A2");
+        EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "D28F2549");
+        EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "052B2843");
+        EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "46F8F711");
+        EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "583F0355");
+        EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "2F9B0A57");
+        EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "ECE7DA1D");
+        EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "943EF0AB");
+        EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "43A2EDB1");
+        EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00E53D63");
+        break;
+      case kxxHash:
+        EXPECT_EQ(ChecksumAsString(empty, t), "055DCC02");
+        EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "3EB065CF");
+        EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "31F79238");
+        EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "320D2E00");
+        EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "4A2E5FB0");
+        EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "0BD9F652");
+        EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "B4107E50");
+        EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "20F4D4BA");
+        EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "8F1A1F99");
+        EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "A191A338");
+        break;
+      case kxxHash64:
+        EXPECT_EQ(ChecksumAsString(empty, t), "99E9D851");
+        EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "682705DB");
+        EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "30E7211B");
+        EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "B7BB58E8");
+        EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B74655EF");
+        EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "B6C8BBBE");
+        EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "AED9E3B4");
+        EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "0D4999FE");
+        EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "F5932423");
+        EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "6B31BAB1");
+        break;
+      case kXXH3:
+        EXPECT_EQ(ChecksumAsString(empty, t), "00000000");
+        EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "C294D338");
+        EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "1B174353");
+        EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "2D0E20C8");
+        EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B37FB5E6");
+        EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "6AFC258D");
+        EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "5CE54616");
+        EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "FA2D482E");
+        EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "23AED845");
+        EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "15B7BBDE");
+        break;
+      default:
+        // Force this test to be updated on new ChecksumTypes
+        assert(false);
+        break;
+    }
+  }
+}
+
+void AddInternalKey(TableConstructor* c, const std::string& prefix,
+                    std::string value = "v", int /*suffix_len*/ = 800) {
+  static Random rnd(1023);
+  InternalKey k(prefix + rnd.RandomString(800), 0, kTypeValue);
+  c->Add(k.Encode().ToString(), value);
+}
+
+void TableTest::IndexTest(BlockBasedTableOptions table_options) {
+  TableConstructor c(BytewiseComparator());
+
+  // keys with prefix length 3, make sure the key/value is big enough to fill
+  // one block
+  AddInternalKey(&c, "0015");
+  AddInternalKey(&c, "0035");
+
+  AddInternalKey(&c, "0054");
+  AddInternalKey(&c, "0055");
+
+  AddInternalKey(&c, "0056");
+  AddInternalKey(&c, "0057");
+
+  AddInternalKey(&c, "0058");
+  AddInternalKey(&c, "0075");
+
+  AddInternalKey(&c, "0076");
+  AddInternalKey(&c, "0095");
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  table_options.block_size = 1700;
+  table_options.block_cache = NewLRUCache(1024, 4);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+           &kvmap);
+  auto reader = c.GetTableReader();
+
+  auto props = reader->GetTableProperties();
+  ASSERT_EQ(5u, props->num_data_blocks);
+
+  // TODO(Zhongyi): update test to use MutableCFOptions
+  ReadOptions read_options;
+  std::unique_ptr<InternalIterator> index_iter(reader->NewIterator(
+      read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  // -- Find keys do not exist, but have common prefix.
+  std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"};
+  std::vector<std::string> lower_bound = {
+      keys[0], keys[1], keys[2], keys[7], keys[9],
+  };
+
+  // find the lower bound of the prefix
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    index_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode());
+    ASSERT_OK(index_iter->status());
+    ASSERT_TRUE(index_iter->Valid());
+
+    // seek the first element in the block
+    ASSERT_EQ(lower_bound[i], index_iter->key().ToString());
+    ASSERT_EQ("v", index_iter->value().ToString());
+  }
+
+  // find the upper bound of prefixes
+  std::vector<std::string> upper_bound = {
+      keys[1],
+      keys[2],
+      keys[7],
+      keys[9],
+  };
+
+  // find existing keys
+  for (const auto& item : kvmap) {
+    auto ukey = ExtractUserKey(item.first).ToString();
+    index_iter->Seek(ukey);
+
+    // ASSERT_OK(regular_iter->status());
+    ASSERT_OK(index_iter->status());
+
+    // ASSERT_TRUE(regular_iter->Valid());
+    ASSERT_TRUE(index_iter->Valid());
+
+    ASSERT_EQ(item.first, index_iter->key().ToString());
+    ASSERT_EQ(item.second, index_iter->value().ToString());
+  }
+
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    // the key is greater than any existing keys.
+    auto key = prefixes[i] + "9";
+    index_iter->Seek(InternalKey(key, 0, kTypeValue).Encode());
+
+    ASSERT_TRUE(index_iter->status().ok() || index_iter->status().IsNotFound());
+    ASSERT_TRUE(!index_iter->status().IsNotFound() || !index_iter->Valid());
+    if (i == prefixes.size() - 1) {
+      // last key
+      ASSERT_TRUE(!index_iter->Valid());
+    } else {
+      ASSERT_TRUE(index_iter->Valid());
+      // seek the first element in the block
+      ASSERT_EQ(upper_bound[i], index_iter->key().ToString());
+      ASSERT_EQ("v", index_iter->value().ToString());
+    }
+  }
+
+  // find keys with prefix that don't match any of the existing prefixes.
+  std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"};
+  for (const auto& prefix : non_exist_prefixes) {
+    index_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode());
+    // regular_iter->Seek(prefix);
+
+    ASSERT_OK(index_iter->status());
+    // Seek to non-existing prefixes should yield either invalid, or a
+    // key with prefix greater than the target.
+    if (index_iter->Valid()) {
+      Slice ukey = ExtractUserKey(index_iter->key());
+      Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
+      ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0);
+    }
+  }
+  for (const auto& prefix : non_exist_prefixes) {
+    index_iter->SeekForPrev(InternalKey(prefix, 0, kTypeValue).Encode());
+    // regular_iter->Seek(prefix);
+
+    ASSERT_OK(index_iter->status());
+    // Seek to non-existing prefixes should yield either invalid, or a
+    // key with prefix greater than the target.
+    if (index_iter->Valid()) {
+      Slice ukey = ExtractUserKey(index_iter->key());
+      Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
+      ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) > 0);
+    }
+  }
+
+  {
+    // Test reseek case. It should impact partitioned index more.
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    std::unique_ptr<InternalIterator> index_iter2(reader->NewIterator(
+        ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    // Things to cover in partitioned index:
+    // 1. Both of Seek() and SeekToLast() has optimization to prevent
+    //    rereek leaf index block if it remains to the same one, and
+    //    they reuse the same variable.
+    // 2. When Next() or Prev() is called, the block moves, so the
+    //    optimization should kick in only with the current one.
+    index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode());
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->SeekToLast();
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode());
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->SeekToLast();
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
+    index_iter2->Prev();
+    ASSERT_TRUE(index_iter2->Valid());
+    index_iter2->Prev();
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode());
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
+    index_iter2->Prev();
+    ASSERT_TRUE(index_iter2->Valid());
+    index_iter2->Prev();
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->SeekToLast();
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode());
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->Prev();
+    ASSERT_TRUE(index_iter2->Valid());
+    index_iter2->Prev();
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->Seek(InternalKey("0075", 0, kTypeValue).Encode());
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->Next();
+    ASSERT_TRUE(index_iter2->Valid());
+    index_iter2->Next();
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
+
+    index_iter2->SeekToLast();
+    ASSERT_TRUE(index_iter2->Valid());
+    ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
+  }
+
+  c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, BinaryIndexTest) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  IndexTest(table_options);
+}
+
+TEST_P(BlockBasedTableTest, HashIndexTest) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  IndexTest(table_options);
+}
+
+TEST_P(BlockBasedTableTest, PartitionIndexTest) {
+  const int max_index_keys = 5;
+  const int est_max_index_key_value_size = 32;
+  const int est_max_index_size = max_index_keys * est_max_index_key_value_size;
+  for (int i = 1; i <= est_max_index_size + 1; i++) {
+    BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+    table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+    table_options.metadata_block_size = i;
+    IndexTest(table_options);
+  }
+}
+
+TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) {
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  Options options;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+
+  TableConstructor c(BytewiseComparator());
+  AddInternalKey(&c, "pika");
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+           &kvmap);
+  ASSERT_EQ(1, keys.size());
+
+  auto reader = c.GetTableReader();
+  ReadOptions ropt;
+  ropt.read_tier = ReadTier::kBlockCacheTier;
+  std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+      ropt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  auto ikey = [](Slice user_key) {
+    return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
+  };
+
+  iter->Seek(ikey("pika"));
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsIncomplete());
+
+  // This used to crash at some point.
+  iter->Seek(ikey("pika"));
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->status().IsIncomplete());
+}
+
+TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey;
+  IndexTest(table_options);
+}
+
+class CustomFlushBlockPolicy : public FlushBlockPolicyFactory,
+                               public FlushBlockPolicy {
+ public:
+  explicit CustomFlushBlockPolicy(std::vector<int> keys_per_block)
+      : keys_per_block_(keys_per_block) {}
+
+  const char* Name() const override { return "CustomFlushBlockPolicy"; }
+
+  FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&,
+                                        const BlockBuilder&) const override {
+    return new CustomFlushBlockPolicy(keys_per_block_);
+  }
+
+  bool Update(const Slice&, const Slice&) override {
+    if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) {
+      ++current_block_idx_;
+      keys_in_current_block_ = 1;
+      return true;
+    }
+
+    ++keys_in_current_block_;
+    return false;
+  }
+
+  std::vector<int> keys_per_block_;
+
+  int current_block_idx_ = 0;
+  int keys_in_current_block_ = 0;
+};
+
+TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) {
+  for (int use_first_key = 0; use_first_key < 2; ++use_first_key) {
+    SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key));
+    BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+    table_options.index_type =
+        use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey
+                      : BlockBasedTableOptions::kBinarySearch;
+    table_options.block_cache = NewLRUCache(10000);  // fits all blocks
+    table_options.index_shortening =
+        BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+    table_options.flush_block_policy_factory =
+        std::make_shared<CustomFlushBlockPolicy>(std::vector<int>{2, 1, 3, 2});
+    Options options;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    options.statistics = CreateDBStatistics();
+    Statistics* stats = options.statistics.get();
+    std::unique_ptr<InternalKeyComparator> comparator(
+        new InternalKeyComparator(BytewiseComparator()));
+    const ImmutableOptions ioptions(options);
+    const MutableCFOptions moptions(options);
+
+    TableConstructor c(BytewiseComparator());
+
+    // Block 0.
+    AddInternalKey(&c, "aaaa", "v0");
+    AddInternalKey(&c, "aaac", "v1");
+
+    // Block 1.
+    AddInternalKey(&c, "aaca", "v2");
+
+    // Block 2.
+    AddInternalKey(&c, "caaa", "v3");
+    AddInternalKey(&c, "caac", "v4");
+    AddInternalKey(&c, "caae", "v5");
+
+    // Block 3.
+    AddInternalKey(&c, "ccaa", "v6");
+    AddInternalKey(&c, "ccac", "v7");
+
+    // Write the file.
+    std::vector<std::string> keys;
+    stl_wrappers::KVMap kvmap;
+    c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+             &kvmap);
+    ASSERT_EQ(8, keys.size());
+
+    auto reader = c.GetTableReader();
+    auto props = reader->GetTableProperties();
+    ASSERT_EQ(4u, props->num_data_blocks);
+    ReadOptions read_options;
+    std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+        read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized,
+        /*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true));
+
+    // Shouldn't have read data blocks before iterator is seeked.
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    auto ikey = [](Slice user_key) {
+      return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
+    };
+
+    // Seek to a key between blocks. If index contains first key, we shouldn't
+    // read any data blocks until value is requested.
+    iter->Seek(ikey("aaba"));
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[2], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 0 : 1,
+              stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_TRUE(iter->PrepareValue());
+    EXPECT_EQ("v2", iter->value().ToString());
+    EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // Seek to the middle of a block. The block should be read right away.
+    iter->Seek(ikey("caab"));
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[4], iter->key().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    ASSERT_TRUE(iter->PrepareValue());
+    EXPECT_EQ("v4", iter->value().ToString());
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // Seek to just before the same block and don't access value.
+    // The iterator should keep pinning the block contents.
+    iter->Seek(ikey("baaa"));
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[3], iter->key().ToString());
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // Seek to the same block again to check that the block is still pinned.
+    iter->Seek(ikey("caae"));
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[5], iter->key().ToString());
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    ASSERT_TRUE(iter->PrepareValue());
+    EXPECT_EQ("v5", iter->value().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // Step forward and fall through to the next block. Don't access value.
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[6], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 2 : 3,
+              stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // Step forward again. Block should be read.
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[7], iter->key().ToString());
+    EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_TRUE(iter->PrepareValue());
+    EXPECT_EQ("v7", iter->value().ToString());
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // Step forward and reach the end.
+    iter->Next();
+    EXPECT_FALSE(iter->Valid());
+    EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // Seek to a single-key block and step forward without accessing value.
+    iter->Seek(ikey("aaca"));
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[2], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 0 : 1,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[3], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 1 : 2,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    ASSERT_TRUE(iter->PrepareValue());
+    EXPECT_EQ("v3", iter->value().ToString());
+    EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    // Seek between blocks and step back without accessing value.
+    iter->Seek(ikey("aaca"));
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[2], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 2 : 3,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    iter->Prev();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[1], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 2 : 3,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    // All blocks are in cache now, there'll be no more misses ever.
+    EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+    ASSERT_TRUE(iter->PrepareValue());
+    EXPECT_EQ("v1", iter->value().ToString());
+
+    // Next into the next block again.
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[2], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 2 : 4,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // Seek to first and step back without accessing value.
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[0], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 2 : 5,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    iter->Prev();
+    EXPECT_FALSE(iter->Valid());
+    EXPECT_EQ(use_first_key ? 2 : 5,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    // Do some SeekForPrev() and SeekToLast() just to cover all methods.
+    iter->SeekForPrev(ikey("caad"));
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[4], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 3 : 6,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    ASSERT_TRUE(iter->PrepareValue());
+    EXPECT_EQ("v4", iter->value().ToString());
+    EXPECT_EQ(use_first_key ? 3 : 6,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(keys[7], iter->key().ToString());
+    EXPECT_EQ(use_first_key ? 4 : 7,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+    ASSERT_TRUE(iter->PrepareValue());
+    EXPECT_EQ("v7", iter->value().ToString());
+    EXPECT_EQ(use_first_key ? 4 : 7,
+              stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+    EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+    c.ResetTableReader();
+  }
+}
+
+TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey;
+  table_options.block_cache = NewLRUCache(10000);
+  Options options;
+  options.statistics = CreateDBStatistics();
+  Statistics* stats = options.statistics.get();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+
+  TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false,
+                     /* level */ -1, /* largest_seqno */ 42);
+
+  c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x");
+  c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y");
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+           &kvmap);
+  ASSERT_EQ(2, keys.size());
+
+  auto reader = c.GetTableReader();
+  auto props = reader->GetTableProperties();
+  ASSERT_EQ(1u, props->num_data_blocks);
+  ReadOptions read_options;
+  std::unique_ptr<InternalIterator> iter(reader->NewIterator(
+      read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized,
+      /*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true));
+
+  iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString());
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(),
+            iter->key().ToString());
+  EXPECT_NE(keys[0], iter->key().ToString());
+  // Key should have been served from index, without reading data blocks.
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+  ASSERT_TRUE(iter->PrepareValue());
+  EXPECT_EQ("x", iter->value().ToString());
+  EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+  EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+  EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(),
+            iter->key().ToString());
+
+  c.ResetTableReader();
+}
+
+// It's very hard to figure out the index block size of a block accurately.
+// To make sure we get the index size, we just make sure as key number
+// grows, the filter block size also grows.
+TEST_P(BlockBasedTableTest, IndexSizeStat) {
+  uint64_t last_index_size = 0;
+
+  // we need to use random keys since the pure human readable texts
+  // may be well compressed, resulting insignifcant change of index
+  // block size.
+  Random rnd(test::RandomSeed());
+  std::vector<std::string> keys;
+
+  for (int i = 0; i < 100; ++i) {
+    keys.push_back(rnd.RandomString(10000));
+  }
+
+  // Each time we load one more key to the table. the table index block
+  // size is expected to be larger than last time's.
+  for (size_t i = 1; i < keys.size(); ++i) {
+    TableConstructor c(BytewiseComparator(),
+                       true /* convert_to_internal_key_ */);
+    for (size_t j = 0; j < i; ++j) {
+      c.Add(keys[j], "val");
+    }
+
+    std::vector<std::string> ks;
+    stl_wrappers::KVMap kvmap;
+    Options options;
+    options.compression = kNoCompression;
+    BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+    table_options.block_restart_interval = 1;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    const ImmutableOptions ioptions(options);
+    const MutableCFOptions moptions(options);
+    c.Finish(options, ioptions, moptions, table_options,
+             GetPlainInternalComparator(options.comparator), &ks, &kvmap);
+    auto index_size = c.GetTableReader()->GetTableProperties()->index_size;
+    ASSERT_GT(index_size, last_index_size);
+    last_index_size = index_size;
+    c.ResetTableReader();
+  }
+}
+
+TEST_P(BlockBasedTableTest, NumBlockStat) {
+  Random rnd(test::RandomSeed());
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  Options options;
+  options.compression = kNoCompression;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_restart_interval = 1;
+  table_options.block_size = 1000;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  for (int i = 0; i < 10; ++i) {
+    // the key/val are slightly smaller than block size, so that each block
+    // holds roughly one key/value pair.
+    c.Add(rnd.RandomString(900), "val");
+  }
+
+  std::vector<std::string> ks;
+  stl_wrappers::KVMap kvmap;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &ks, &kvmap);
+  ASSERT_EQ(kvmap.size(),
+            c.GetTableReader()->GetTableProperties()->num_data_blocks);
+  c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, TracingGetTest) {
+  TableConstructor c(BytewiseComparator());
+  Options options;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  options.create_if_missing = true;
+  table_options.block_cache = NewLRUCache(1024 * 1024, 0);
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  SetupTracingTest(&c);
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  std::string user_key = "k01";
+  InternalKey internal_key(user_key, 0, kTypeValue);
+  std::string encoded_key = internal_key.Encode().ToString();
+  for (uint32_t i = 1; i <= 2; i++) {
+    PinnableSlice value;
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, user_key, &value, nullptr,
+                           nullptr, nullptr, true, nullptr, nullptr, nullptr,
+                           nullptr, nullptr, nullptr, /*tracing_get_id=*/i);
+    get_perf_context()->Reset();
+    ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context,
+                                      moptions.prefix_extractor.get()));
+    ASSERT_EQ(get_context.State(), GetContext::kFound);
+    ASSERT_EQ(value.ToString(), kDummyValue);
+  }
+
+  // Verify traces.
+  std::vector<BlockCacheTraceRecord> expected_records;
+  // The first two records should be prefetching index and filter blocks.
+  BlockCacheTraceRecord record;
+  record.block_type = TraceType::kBlockTraceIndexBlock;
+  record.caller = TableReaderCaller::kPrefetch;
+  record.is_cache_hit = false;
+  record.no_insert = false;
+  expected_records.push_back(record);
+  record.block_type = TraceType::kBlockTraceFilterBlock;
+  expected_records.push_back(record);
+  // Then we should have three records for one index, one filter, and one data
+  // block access.
+  record.get_id = 1;
+  record.block_type = TraceType::kBlockTraceFilterBlock;
+  record.caller = TableReaderCaller::kUserGet;
+  record.get_from_user_specified_snapshot = false;
+  record.referenced_key = encoded_key;
+  record.referenced_key_exist_in_block = true;
+  record.is_cache_hit = true;
+  expected_records.push_back(record);
+  record.block_type = TraceType::kBlockTraceIndexBlock;
+  expected_records.push_back(record);
+  record.is_cache_hit = false;
+  record.block_type = TraceType::kBlockTraceDataBlock;
+  expected_records.push_back(record);
+  // The second get should all observe cache hits.
+  record.is_cache_hit = true;
+  record.get_id = 2;
+  record.block_type = TraceType::kBlockTraceFilterBlock;
+  record.caller = TableReaderCaller::kUserGet;
+  record.get_from_user_specified_snapshot = false;
+  record.referenced_key = encoded_key;
+  expected_records.push_back(record);
+  record.block_type = TraceType::kBlockTraceIndexBlock;
+  expected_records.push_back(record);
+  record.block_type = TraceType::kBlockTraceDataBlock;
+  expected_records.push_back(record);
+  VerifyBlockAccessTrace(&c, expected_records);
+  c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) {
+  TableConstructor c(BytewiseComparator());
+  Options options;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  options.create_if_missing = true;
+  table_options.block_cache = NewLRUCache(1024 * 1024, 0);
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  SetupTracingTest(&c);
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  for (uint32_t i = 1; i <= 2; i++) {
+    std::string user_key = "k01";
+    InternalKey internal_key(user_key, 0, kTypeValue);
+    std::string encoded_key = internal_key.Encode().ToString();
+    c.GetTableReader()->ApproximateOffsetOf(
+        encoded_key, TableReaderCaller::kUserApproximateSize);
+  }
+  // Verify traces.
+  std::vector<BlockCacheTraceRecord> expected_records;
+  // The first two records should be prefetching index and filter blocks.
+  BlockCacheTraceRecord record;
+  record.block_type = TraceType::kBlockTraceIndexBlock;
+  record.caller = TableReaderCaller::kPrefetch;
+  record.is_cache_hit = false;
+  record.no_insert = false;
+  expected_records.push_back(record);
+  record.block_type = TraceType::kBlockTraceFilterBlock;
+  expected_records.push_back(record);
+  // Then we should have two records for only index blocks.
+  record.block_type = TraceType::kBlockTraceIndexBlock;
+  record.caller = TableReaderCaller::kUserApproximateSize;
+  record.is_cache_hit = true;
+  expected_records.push_back(record);
+  expected_records.push_back(record);
+  VerifyBlockAccessTrace(&c, expected_records);
+  c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, TracingIterator) {
+  TableConstructor c(BytewiseComparator());
+  Options options;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  options.create_if_missing = true;
+  table_options.block_cache = NewLRUCache(1024 * 1024, 0);
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  SetupTracingTest(&c);
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+  for (uint32_t i = 1; i <= 2; i++) {
+    ReadOptions read_options;
+    std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator(
+        read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUserIterator));
+    iter->SeekToFirst();
+    while (iter->Valid()) {
+      iter->key();
+      iter->value();
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+    iter.reset();
+  }
+
+  // Verify traces.
+  std::vector<BlockCacheTraceRecord> expected_records;
+  // The first two records should be prefetching index and filter blocks.
+  BlockCacheTraceRecord record;
+  record.block_type = TraceType::kBlockTraceIndexBlock;
+  record.caller = TableReaderCaller::kPrefetch;
+  record.is_cache_hit = false;
+  record.no_insert = false;
+  expected_records.push_back(record);
+  record.block_type = TraceType::kBlockTraceFilterBlock;
+  expected_records.push_back(record);
+  // Then we should have three records for index and two data block access.
+  record.block_type = TraceType::kBlockTraceIndexBlock;
+  record.caller = TableReaderCaller::kUserIterator;
+  record.is_cache_hit = true;
+  expected_records.push_back(record);
+  record.block_type = TraceType::kBlockTraceDataBlock;
+  record.is_cache_hit = false;
+  expected_records.push_back(record);
+  expected_records.push_back(record);
+  // When we iterate this file for the second time, we should observe all cache
+  // hits.
+  record.block_type = TraceType::kBlockTraceIndexBlock;
+  record.is_cache_hit = true;
+  expected_records.push_back(record);
+  record.block_type = TraceType::kBlockTraceDataBlock;
+  expected_records.push_back(record);
+  expected_records.push_back(record);
+  VerifyBlockAccessTrace(&c, expected_records);
+  c.ResetTableReader();
+}
+
+// A simple tool that takes the snapshot of block cache statistics.
+class BlockCachePropertiesSnapshot {
+ public:
+  explicit BlockCachePropertiesSnapshot(Statistics* statistics) {
+    block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS);
+    block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT);
+    index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS);
+    index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT);
+    data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS);
+    data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT);
+    filter_block_cache_miss =
+        statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS);
+    filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT);
+    block_cache_bytes_read = statistics->getTickerCount(BLOCK_CACHE_BYTES_READ);
+    block_cache_bytes_write =
+        statistics->getTickerCount(BLOCK_CACHE_BYTES_WRITE);
+  }
+
+  void AssertIndexBlockStat(int64_t expected_index_block_cache_miss,
+                            int64_t expected_index_block_cache_hit) {
+    ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
+    ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
+  }
+
+  void AssertFilterBlockStat(int64_t expected_filter_block_cache_miss,
+                             int64_t expected_filter_block_cache_hit) {
+    ASSERT_EQ(expected_filter_block_cache_miss, filter_block_cache_miss);
+    ASSERT_EQ(expected_filter_block_cache_hit, filter_block_cache_hit);
+  }
+
+  // Check if the fetched props matches the expected ones.
+  // TODO(kailiu) Use this only when you disabled filter policy!
+  void AssertEqual(int64_t expected_index_block_cache_miss,
+                   int64_t expected_index_block_cache_hit,
+                   int64_t expected_data_block_cache_miss,
+                   int64_t expected_data_block_cache_hit) const {
+    ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
+    ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
+    ASSERT_EQ(expected_data_block_cache_miss, data_block_cache_miss);
+    ASSERT_EQ(expected_data_block_cache_hit, data_block_cache_hit);
+    ASSERT_EQ(expected_index_block_cache_miss + expected_data_block_cache_miss,
+              block_cache_miss);
+    ASSERT_EQ(expected_index_block_cache_hit + expected_data_block_cache_hit,
+              block_cache_hit);
+  }
+
+  int64_t GetCacheBytesRead() { return block_cache_bytes_read; }
+
+  int64_t GetCacheBytesWrite() { return block_cache_bytes_write; }
+
+ private:
+  int64_t block_cache_miss = 0;
+  int64_t block_cache_hit = 0;
+  int64_t index_block_cache_miss = 0;
+  int64_t index_block_cache_hit = 0;
+  int64_t data_block_cache_miss = 0;
+  int64_t data_block_cache_hit = 0;
+  int64_t filter_block_cache_miss = 0;
+  int64_t filter_block_cache_hit = 0;
+  int64_t block_cache_bytes_read = 0;
+  int64_t block_cache_bytes_write = 0;
+};
+
+// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't
+// use block cache to store them).
+TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) {
+  Options options;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_cache = NewLRUCache(1024, 4);
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("key", "value");
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+  // preloading filter/index blocks is enabled.
+  auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+  ASSERT_FALSE(reader->TEST_FilterBlockInCache());
+  ASSERT_FALSE(reader->TEST_IndexBlockInCache());
+
+  {
+    // nothing happens in the beginning
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertIndexBlockStat(0, 0);
+    props.AssertFilterBlockStat(0, 0);
+  }
+
+  {
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, Slice(), nullptr, nullptr,
+                           nullptr, nullptr, true, nullptr, nullptr);
+    // a hack that just to trigger BlockBasedTable::GetFilter.
+    ASSERT_OK(reader->Get(ReadOptions(), "non-exist-key", &get_context,
+                          moptions.prefix_extractor.get()));
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertIndexBlockStat(0, 0);
+    props.AssertFilterBlockStat(0, 0);
+  }
+}
+
+// Due to the difficulities of the intersaction between statistics, this test
+// only tests the case when "index block is put to block cache"
+TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
+  // -- Table construction
+  Options options;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+
+  // Enable the cache for index/filter blocks
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  LRUCacheOptions co;
+  co.capacity = 2048;
+  co.num_shard_bits = 2;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  table_options.block_cache = NewLRUCache(co);
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("key", "value");
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  // preloading filter/index blocks is prohibited.
+  auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+  ASSERT_FALSE(reader->TEST_FilterBlockInCache());
+  ASSERT_TRUE(reader->TEST_IndexBlockInCache());
+
+  // -- PART 1: Open with regular block cache.
+  // Since block_cache is disabled, no cache activities will be involved.
+  std::unique_ptr<InternalIterator> iter;
+
+  int64_t last_cache_bytes_read = 0;
+  // At first, no block will be accessed.
+  {
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    // index will be added to block cache.
+    props.AssertEqual(1,  // index block miss
+                      0, 0, 0);
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              static_cast<int64_t>(table_options.block_cache->GetUsage()));
+    last_cache_bytes_read = props.GetCacheBytesRead();
+  }
+
+  // Only index block will be accessed
+  {
+    iter.reset(c.NewIterator(moptions.prefix_extractor.get()));
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    // NOTE: to help better highlight the "detla" of each ticker, I use
+    // <last_value> + <added_value> to indicate the increment of changed
+    // value; other numbers remain the same.
+    props.AssertEqual(1, 0 + 1,  // index block hit
+                      0, 0);
+    // Cache hit, bytes read from cache should increase
+    ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              static_cast<int64_t>(table_options.block_cache->GetUsage()));
+    last_cache_bytes_read = props.GetCacheBytesRead();
+  }
+
+  // Only data block will be accessed
+  {
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1, 1, 0 + 1,  // data block miss
+                      0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              static_cast<int64_t>(table_options.block_cache->GetUsage()));
+    last_cache_bytes_read = props.GetCacheBytesRead();
+  }
+
+  // Data block will be in cache
+  {
+    iter.reset(c.NewIterator(moptions.prefix_extractor.get()));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1, 1 + 1, /* index block hit */
+                      1, 0 + 1 /* data block hit */);
+    // Cache hit, bytes read from cache should increase
+    ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              static_cast<int64_t>(table_options.block_cache->GetUsage()));
+  }
+  // release the iterator so that the block cache can reset correctly.
+  iter.reset();
+
+  c.ResetTableReader();
+
+  // -- PART 2: Open with very small block cache
+  // In this test, no block will ever get hit since the block cache is
+  // too small to fit even one entry.
+  table_options.block_cache = NewLRUCache(1, 4);
+  options.statistics = CreateDBStatistics();
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  const ImmutableOptions ioptions2(options);
+  const MutableCFOptions moptions2(options);
+  ASSERT_OK(c.Reopen(ioptions2, moptions2));
+  {
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1,  // index block miss
+                      0, 0, 0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
+  }
+
+  {
+    // Both index and data block get accessed.
+    // It first cache index block then data block. But since the cache size
+    // is only 1, index block will be purged after data block is inserted.
+    iter.reset(c.NewIterator(moptions2.prefix_extractor.get()));
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(1 + 1,  // index block miss
+                      0, 0,   // data block miss
+                      0);
+    // Cache hit, bytes read from cache should increase
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
+  }
+
+  {
+    // SeekToFirst() accesses data block. With similar reason, we expect data
+    // block's cache miss.
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    BlockCachePropertiesSnapshot props(options.statistics.get());
+    props.AssertEqual(2, 0, 0 + 1,  // data block miss
+                      0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
+  }
+  iter.reset();
+  c.ResetTableReader();
+
+  // -- PART 3: Open table with bloom filter enabled but not in SST file
+  table_options.block_cache = NewLRUCache(4096, 4);
+  table_options.cache_index_and_filter_blocks = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  TableConstructor c3(BytewiseComparator());
+  std::string user_key = "k01";
+  InternalKey internal_key(user_key, 0, kTypeValue);
+  c3.Add(internal_key.Encode().ToString(), "hello");
+  ImmutableOptions ioptions3(options);
+  MutableCFOptions moptions3(options);
+  // Generate table without filter policy
+  c3.Finish(options, ioptions3, moptions3, table_options,
+            GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+  c3.ResetTableReader();
+
+  // Open table with filter policy
+  table_options.filter_policy.reset(NewBloomFilterPolicy(1));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  options.statistics = CreateDBStatistics();
+  ImmutableOptions ioptions4(options);
+  MutableCFOptions moptions4(options);
+  ASSERT_OK(c3.Reopen(ioptions4, moptions4));
+  reader = dynamic_cast<BlockBasedTable*>(c3.GetTableReader());
+  ASSERT_FALSE(reader->TEST_FilterBlockInCache());
+  PinnableSlice value;
+  GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, user_key, &value, nullptr,
+                         nullptr, nullptr, true, nullptr, nullptr);
+  ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context,
+                        moptions4.prefix_extractor.get()));
+  ASSERT_STREQ(value.data(), "hello");
+  BlockCachePropertiesSnapshot props(options.statistics.get());
+  props.AssertFilterBlockStat(0, 0);
+  c3.ResetTableReader();
+}
+
+void ValidateBlockSizeDeviation(int value, int expected) {
+  BlockBasedTableOptions table_options;
+  table_options.block_size_deviation = value;
+  BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options);
+
+  const BlockBasedTableOptions* normalized_table_options =
+      factory->GetOptions<BlockBasedTableOptions>();
+  ASSERT_EQ(normalized_table_options->block_size_deviation, expected);
+
+  delete factory;
+}
+
+void ValidateBlockRestartInterval(int value, int expected) {
+  BlockBasedTableOptions table_options;
+  table_options.block_restart_interval = value;
+  BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options);
+
+  const BlockBasedTableOptions* normalized_table_options =
+      factory->GetOptions<BlockBasedTableOptions>();
+  ASSERT_EQ(normalized_table_options->block_restart_interval, expected);
+
+  delete factory;
+}
+
+TEST_P(BlockBasedTableTest, InvalidOptions) {
+  // invalid values for block_size_deviation (<0 or >100) are silently set to 0
+  ValidateBlockSizeDeviation(-10, 0);
+  ValidateBlockSizeDeviation(-1, 0);
+  ValidateBlockSizeDeviation(0, 0);
+  ValidateBlockSizeDeviation(1, 1);
+  ValidateBlockSizeDeviation(99, 99);
+  ValidateBlockSizeDeviation(100, 100);
+  ValidateBlockSizeDeviation(101, 0);
+  ValidateBlockSizeDeviation(1000, 0);
+
+  // invalid values for block_restart_interval (<1) are silently set to 1
+  ValidateBlockRestartInterval(-10, 1);
+  ValidateBlockRestartInterval(-1, 1);
+  ValidateBlockRestartInterval(0, 1);
+  ValidateBlockRestartInterval(1, 1);
+  ValidateBlockRestartInterval(2, 2);
+  ValidateBlockRestartInterval(1000, 1000);
+}
+
+TEST_P(BlockBasedTableTest, BlockReadCountTest) {
+  // bloom_filter_type = 1 -- full filter using use_block_based_builder=false
+  // bloom_filter_type = 2 -- full filter using use_block_based_builder=true
+  //                          because of API change to hide block-based filter
+  for (int bloom_filter_type = 1; bloom_filter_type <= 2; ++bloom_filter_type) {
+    for (int index_and_filter_in_cache = 0; index_and_filter_in_cache < 2;
+         ++index_and_filter_in_cache) {
+      Options options;
+      options.create_if_missing = true;
+
+      BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+      table_options.block_cache = NewLRUCache(1, 0);
+      table_options.cache_index_and_filter_blocks = index_and_filter_in_cache;
+      table_options.filter_policy.reset(
+          NewBloomFilterPolicy(10, bloom_filter_type == 2));
+      options.table_factory.reset(new BlockBasedTableFactory(table_options));
+      std::vector<std::string> keys;
+      stl_wrappers::KVMap kvmap;
+
+      TableConstructor c(BytewiseComparator());
+      std::string user_key = "k04";
+      InternalKey internal_key(user_key, 0, kTypeValue);
+      std::string encoded_key = internal_key.Encode().ToString();
+      c.Add(encoded_key, "hello");
+      ImmutableOptions ioptions(options);
+      MutableCFOptions moptions(options);
+      // Generate table with filter policy
+      c.Finish(options, ioptions, moptions, table_options,
+               GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+      auto reader = c.GetTableReader();
+      PinnableSlice value;
+      {
+        GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                               GetContext::kNotFound, user_key, &value, nullptr,
+                               nullptr, nullptr, true, nullptr, nullptr);
+        get_perf_context()->Reset();
+        ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
+                              moptions.prefix_extractor.get()));
+        if (index_and_filter_in_cache) {
+          // data, index and filter block
+          ASSERT_EQ(get_perf_context()->block_read_count, 3);
+          ASSERT_EQ(get_perf_context()->index_block_read_count, 1);
+          ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
+        } else {
+          // just the data block
+          ASSERT_EQ(get_perf_context()->block_read_count, 1);
+        }
+        ASSERT_EQ(get_context.State(), GetContext::kFound);
+        ASSERT_STREQ(value.data(), "hello");
+      }
+
+      // Get non-existing key
+      user_key = "does-not-exist";
+      internal_key = InternalKey(user_key, 0, kTypeValue);
+      encoded_key = internal_key.Encode().ToString();
+
+      value.Reset();
+      {
+        GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                               GetContext::kNotFound, user_key, &value, nullptr,
+                               nullptr, nullptr, true, nullptr, nullptr);
+        get_perf_context()->Reset();
+        ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
+                              moptions.prefix_extractor.get()));
+        ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+      }
+
+      if (index_and_filter_in_cache) {
+        if (bloom_filter_type == 0) {
+          // with block-based, we read index and then the filter
+          ASSERT_EQ(get_perf_context()->block_read_count, 2);
+          ASSERT_EQ(get_perf_context()->index_block_read_count, 1);
+          ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
+        } else {
+          // with full-filter, we read filter first and then we stop
+          ASSERT_EQ(get_perf_context()->block_read_count, 1);
+          ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
+        }
+      } else {
+        // filter is already in memory and it figures out that the key doesn't
+        // exist
+        ASSERT_EQ(get_perf_context()->block_read_count, 0);
+      }
+    }
+  }
+}
+
+TEST_P(BlockBasedTableTest, BlockCacheLeak) {
+  // Check that when we reopen a table we don't lose access to blocks already
+  // in the cache. This test checks whether the Table actually makes use of the
+  // unique ID from the file.
+
+  Options opt;
+  std::unique_ptr<InternalKeyComparator> ikc;
+  ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
+  opt.compression = kNoCompression;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.block_size = 1024;
+  // big enough so we don't ever lose cached values.
+  table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
+  opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  const ImmutableOptions ioptions(opt);
+  const MutableCFOptions moptions(opt);
+  c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
+
+  std::unique_ptr<InternalIterator> iter(
+      c.NewIterator(moptions.prefix_extractor.get()));
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    iter->key();
+    iter->value();
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  const ImmutableOptions ioptions1(opt);
+  const MutableCFOptions moptions1(opt);
+  ASSERT_OK(c.Reopen(ioptions1, moptions1));
+  auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+  for (const std::string& key : keys) {
+    InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+    ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
+  }
+  c.ResetTableReader();
+
+  // rerun with different block cache
+  table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
+  opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  const ImmutableOptions ioptions2(opt);
+  const MutableCFOptions moptions2(opt);
+  ASSERT_OK(c.Reopen(ioptions2, moptions2));
+  table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+  for (const std::string& key : keys) {
+    InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
+    ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
+  }
+  c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, MemoryAllocator) {
+  auto default_memory_allocator = std::make_shared<DefaultMemoryAllocator>();
+  auto custom_memory_allocator =
+      std::make_shared<CountedMemoryAllocator>(default_memory_allocator);
+  {
+    Options opt;
+    std::unique_ptr<InternalKeyComparator> ikc;
+    ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
+    opt.compression = kNoCompression;
+    BlockBasedTableOptions table_options;
+    table_options.block_size = 1024;
+    LRUCacheOptions lruOptions;
+    lruOptions.memory_allocator = custom_memory_allocator;
+    lruOptions.capacity = 16 * 1024 * 1024;
+    lruOptions.num_shard_bits = 4;
+    table_options.block_cache = NewLRUCache(std::move(lruOptions));
+    opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    TableConstructor c(BytewiseComparator(),
+                       true /* convert_to_internal_key_ */);
+    c.Add("k01", "hello");
+    c.Add("k02", "hello2");
+    c.Add("k03", std::string(10000, 'x'));
+    c.Add("k04", std::string(200000, 'x'));
+    c.Add("k05", std::string(300000, 'x'));
+    c.Add("k06", "hello3");
+    c.Add("k07", std::string(100000, 'x'));
+    std::vector<std::string> keys;
+    stl_wrappers::KVMap kvmap;
+    const ImmutableOptions ioptions(opt);
+    const MutableCFOptions moptions(opt);
+    c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
+
+    std::unique_ptr<InternalIterator> iter(
+        c.NewIterator(moptions.prefix_extractor.get()));
+    iter->SeekToFirst();
+    while (iter->Valid()) {
+      iter->key();
+      iter->value();
+      iter->Next();
+    }
+    ASSERT_OK(iter->status());
+  }
+
+  // out of scope, block cache should have been deleted, all allocations
+  // deallocated
+  EXPECT_EQ(custom_memory_allocator->GetNumAllocations(),
+            custom_memory_allocator->GetNumDeallocations());
+  // make sure that allocations actually happened through the cache allocator
+  EXPECT_GT(custom_memory_allocator->GetNumAllocations(), 0);
+}
+
+// Test the file checksum of block based table
+TEST_P(BlockBasedTableTest, NoFileChecksum) {
+  Options options;
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  int level = 0;
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  std::string column_family_name;
+
+  FileChecksumTestHelper f(true);
+  f.CreateWritableFile();
+  std::unique_ptr<TableBuilder> builder;
+  builder.reset(ioptions.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, *comparator,
+                          &int_tbl_prop_collector_factories,
+                          options.compression, options.compression_opts,
+                          kUnknownColumnFamily, column_family_name, level),
+      f.GetFileWriter()));
+  ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
+  f.AddKVtoKVMap(1000);
+  ASSERT_OK(f.WriteKVAndFlushTable());
+  ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName);
+  ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum);
+}
+
+TEST_P(BlockBasedTableTest, Crc32cFileChecksum) {
+  FileChecksumGenCrc32cFactory* file_checksum_gen_factory =
+      new FileChecksumGenCrc32cFactory();
+  Options options;
+  options.file_checksum_gen_factory.reset(file_checksum_gen_factory);
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  int level = 0;
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  std::string column_family_name;
+
+  FileChecksumGenContext gen_context;
+  gen_context.file_name = "db/tmp";
+  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
+      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
+          gen_context);
+  FileChecksumTestHelper f(true);
+  f.CreateWritableFile();
+  f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());
+  std::unique_ptr<TableBuilder> builder;
+  builder.reset(ioptions.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, *comparator,
+                          &int_tbl_prop_collector_factories,
+                          options.compression, options.compression_opts,
+                          kUnknownColumnFamily, column_family_name, level),
+      f.GetFileWriter()));
+  ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
+  f.AddKVtoKVMap(1000);
+  ASSERT_OK(f.WriteKVAndFlushTable());
+  ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
+
+  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
+      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
+          gen_context);
+  std::string checksum;
+  ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
+  ASSERT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
+
+  // Unit test the generator itself for schema stability
+  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen3 =
+      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
+          gen_context);
+  const char data[] = "here is some data";
+  checksum_crc32c_gen3->Update(data, sizeof(data));
+  checksum_crc32c_gen3->Finalize();
+  checksum = checksum_crc32c_gen3->GetChecksum();
+  ASSERT_STREQ(checksum.c_str(), "\345\245\277\110");
+}
+
+// Plain table is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(PlainTableTest, BasicPlainTableProperties) {
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 8;
+  plain_table_options.bloom_bits_per_key = 8;
+  plain_table_options.hash_table_ratio = 0;
+
+  PlainTableFactory factory(plain_table_options);
+  std::unique_ptr<FSWritableFile> sink(new test::StringSink());
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(sink), "" /* don't care */, FileOptions()));
+  Options options;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  std::string column_family_name;
+  int unknown_level = -1;
+  std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kNoCompression,
+                          CompressionOptions(), kUnknownColumnFamily,
+                          column_family_name, unknown_level),
+      file_writer.get()));
+
+  for (char c = 'a'; c <= 'z'; ++c) {
+    std::string key(8, c);
+    key.append("\1       ");  // PlainTable expects internal key structure
+    std::string value(28, c + 42);
+    builder->Add(key, value);
+  }
+  ASSERT_OK(builder->Finish());
+  ASSERT_OK(file_writer->Flush());
+
+  test::StringSink* ss =
+      static_cast<test::StringSink*>(file_writer->writable_file());
+  std::unique_ptr<FSRandomAccessFile> source(
+      new test::StringSource(ss->contents(), 72242, true));
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(std::move(source), "test"));
+
+  std::unique_ptr<TableProperties> props;
+  auto s = ReadTableProperties(file_reader.get(), ss->contents().size(),
+                               kPlainTableMagicNumber, ioptions, &props);
+  ASSERT_OK(s);
+
+  ASSERT_EQ(0ul, props->index_size);
+  ASSERT_EQ(0ul, props->filter_size);
+  ASSERT_EQ(16ul * 26, props->raw_key_size);
+  ASSERT_EQ(28ul * 26, props->raw_value_size);
+  ASSERT_EQ(26ul, props->num_entries);
+  ASSERT_EQ(1ul, props->num_data_blocks);
+}
+
+TEST_F(PlainTableTest, NoFileChecksum) {
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 20;
+  plain_table_options.bloom_bits_per_key = 8;
+  plain_table_options.hash_table_ratio = 0;
+  PlainTableFactory factory(plain_table_options);
+
+  Options options;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  std::string column_family_name;
+  int unknown_level = -1;
+  FileChecksumTestHelper f(true);
+  f.CreateWritableFile();
+
+  std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kNoCompression,
+                          CompressionOptions(), kUnknownColumnFamily,
+                          column_family_name, unknown_level),
+      f.GetFileWriter()));
+  ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
+  f.AddKVtoKVMap(1000);
+  ASSERT_OK(f.WriteKVAndFlushTable());
+  ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName);
+  EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum);
+}
+
+TEST_F(PlainTableTest, Crc32cFileChecksum) {
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 20;
+  plain_table_options.bloom_bits_per_key = 8;
+  plain_table_options.hash_table_ratio = 0;
+  PlainTableFactory factory(plain_table_options);
+
+  FileChecksumGenCrc32cFactory* file_checksum_gen_factory =
+      new FileChecksumGenCrc32cFactory();
+  Options options;
+  options.file_checksum_gen_factory.reset(file_checksum_gen_factory);
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  std::string column_family_name;
+  int unknown_level = -1;
+
+  FileChecksumGenContext gen_context;
+  gen_context.file_name = "db/tmp";
+  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
+      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
+          gen_context);
+  FileChecksumTestHelper f(true);
+  f.CreateWritableFile();
+  f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());
+
+  std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kNoCompression,
+                          CompressionOptions(), kUnknownColumnFamily,
+                          column_family_name, unknown_level),
+      f.GetFileWriter()));
+  ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
+  f.AddKVtoKVMap(1000);
+  ASSERT_OK(f.WriteKVAndFlushTable());
+  ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
+
+  std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
+      options.file_checksum_gen_factory->CreateFileChecksumGenerator(
+          gen_context);
+  std::string checksum;
+  ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
+  EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
+}
+
+#endif  // !ROCKSDB_LITE
+
+TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) {
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("k01", "hello");
+  c.Add("k02", "hello2");
+  c.Add("k03", std::string(10000, 'x'));
+  c.Add("k04", std::string(200000, 'x'));
+  c.Add("k05", std::string(300000, 'x'));
+  c.Add("k06", "hello3");
+  c.Add("k07", std::string(100000, 'x'));
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  options.db_host_id = "";
+  test::PlainInternalKeyComparator internal_comparator(options.comparator);
+  options.compression = kNoCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+           &keys, &kvmap);
+
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000));
+  // k04 and k05 will be in two consecutive blocks, the index is
+  // an arbitrary slice between k04 and k05, either before or after k04a
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 10000, 211000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000));
+  c.ResetTableReader();
+}
+
+static void DoCompressionTest(CompressionType comp) {
+  Random rnd(301);
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  std::string tmp;
+  c.Add("k01", "hello");
+  c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+  c.Add("k03", "hello3");
+  c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  test::PlainInternalKeyComparator ikc(options.comparator);
+  options.compression = comp;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);
+
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3525));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3525));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7075));
+  c.ResetTableReader();
+}
+
+TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) {
+  std::vector<CompressionType> compression_state;
+  if (!Snappy_Supported()) {
+    fprintf(stderr, "skipping snappy compression tests\n");
+  } else {
+    compression_state.push_back(kSnappyCompression);
+  }
+
+  if (!Zlib_Supported()) {
+    fprintf(stderr, "skipping zlib compression tests\n");
+  } else {
+    compression_state.push_back(kZlibCompression);
+  }
+
+  // TODO(kailiu) DoCompressionTest() doesn't work with BZip2.
+  /*
+  if (!BZip2_Supported()) {
+    fprintf(stderr, "skipping bzip2 compression tests\n");
+  } else {
+    compression_state.push_back(kBZip2Compression);
+  }
+  */
+
+  if (!LZ4_Supported()) {
+    fprintf(stderr, "skipping lz4 and lz4hc compression tests\n");
+  } else {
+    compression_state.push_back(kLZ4Compression);
+    compression_state.push_back(kLZ4HCCompression);
+  }
+
+  if (!XPRESS_Supported()) {
+    fprintf(stderr, "skipping xpress and xpress compression tests\n");
+  } else {
+    compression_state.push_back(kXpressCompression);
+  }
+
+  for (auto state : compression_state) {
+    DoCompressionTest(state);
+  }
+}
+
+TEST_F(GeneralTableTest, ApproximateKeyAnchors) {
+  Random rnd(301);
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  std::string tmp;
+  for (int i = 1000; i < 9000; i++) {
+    c.Add(std::to_string(i), rnd.RandomString(2000));
+  }
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  InternalKeyComparator ikc(options.comparator);
+  options.compression = kNoCompression;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 4096;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);
+
+  std::vector<TableReader::Anchor> anchors;
+  ASSERT_OK(c.GetTableReader()->ApproximateKeyAnchors(ReadOptions(), anchors));
+  // The target is 128 anchors. But in reality it can be slightly more or fewer.
+  ASSERT_GT(anchors.size(), 120);
+  ASSERT_LT(anchors.size(), 140);
+
+  // We have around 8000 keys. With 128 anchors, in average 62.5 keys per
+  // anchor. Here we take a rough range and estimate the distance between
+  // anchors is between 50 and 100.
+  // Total data size is about 18,000,000, so each anchor range is about
+  // 140,625. We also take a rough range.
+  int prev_num = 1000;
+  // Non-last anchor
+  for (size_t i = 0; i + 1 < anchors.size(); i++) {
+    auto& anchor = anchors[i];
+    ASSERT_GT(anchor.range_size, 100000);
+    ASSERT_LT(anchor.range_size, 200000);
+
+    // Key might be shortened, so fill 0 in the end if it is the case.
+    std::string key_cpy = anchor.user_key;
+    key_cpy.append(4 - key_cpy.size(), '0');
+    int num = std::stoi(key_cpy);
+    ASSERT_GT(num - prev_num, 50);
+    ASSERT_LT(num - prev_num, 100);
+    prev_num = num;
+  }
+
+  ASSERT_EQ("8999", anchors.back().user_key);
+  ASSERT_LT(anchors.back().range_size, 200000);
+
+  c.ResetTableReader();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(ParameterizedHarnessTest, RandomizedHarnessTest) {
+  Random rnd(test::RandomSeed() + 5);
+  for (int num_entries = 0; num_entries < 2000;
+       num_entries += (num_entries < 50 ? 1 : 200)) {
+    for (int e = 0; e < num_entries; e++) {
+      Add(test::RandomKey(&rnd, rnd.Skewed(4)),
+          rnd.RandomString(rnd.Skewed(5)));
+    }
+    Test(&rnd);
+  }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBHarnessTest, RandomizedLongDB) {
+  Random rnd(test::RandomSeed());
+  int num_entries = 100000;
+  for (int e = 0; e < num_entries; e++) {
+    std::string v;
+    Add(test::RandomKey(&rnd, rnd.Skewed(4)), rnd.RandomString(rnd.Skewed(5)));
+  }
+  Test(&rnd);
+
+  // We must have created enough data to force merging
+  int files = 0;
+  for (int level = 0; level < db()->NumberLevels(); level++) {
+    std::string value;
+    char name[100];
+    snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level);
+    ASSERT_TRUE(db()->GetProperty(name, &value));
+    files += atoi(value.c_str());
+  }
+  ASSERT_GT(files, 0);
+}
+#endif  // ROCKSDB_LITE
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class MemTableTest : public testing::Test {
+ public:
+  MemTableTest() {
+    InternalKeyComparator cmp(BytewiseComparator());
+    auto table_factory = std::make_shared<SkipListFactory>();
+    options_.memtable_factory = table_factory;
+    ImmutableOptions ioptions(options_);
+    wb_ = new WriteBufferManager(options_.db_write_buffer_size);
+    memtable_ = new MemTable(cmp, ioptions, MutableCFOptions(options_), wb_,
+                             kMaxSequenceNumber, 0 /* column_family_id */);
+    memtable_->Ref();
+  }
+
+  ~MemTableTest() {
+    delete memtable_->Unref();
+    delete wb_;
+  }
+
+  MemTable* GetMemTable() { return memtable_; }
+
+ private:
+  MemTable* memtable_;
+  Options options_;
+  WriteBufferManager* wb_;
+};
+
+TEST_F(MemTableTest, Simple) {
+  WriteBatch batch;
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_OK(batch.Put(std::string("k1"), std::string("v1")));
+  ASSERT_OK(batch.Put(std::string("k2"), std::string("v2")));
+  ASSERT_OK(batch.Put(std::string("k3"), std::string("v3")));
+  ASSERT_OK(batch.Put(std::string("largekey"), std::string("vlarge")));
+  ASSERT_OK(batch.DeleteRange(std::string("chi"), std::string("xigua")));
+  ASSERT_OK(batch.DeleteRange(std::string("begin"), std::string("end")));
+  ColumnFamilyMemTablesDefault cf_mems_default(GetMemTable());
+  ASSERT_TRUE(
+      WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr)
+          .ok());
+
+  for (int i = 0; i < 2; ++i) {
+    Arena arena;
+    ScopedArenaIterator arena_iter_guard;
+    std::unique_ptr<InternalIterator> iter_guard;
+    InternalIterator* iter;
+    if (i == 0) {
+      iter = GetMemTable()->NewIterator(ReadOptions(), &arena);
+      arena_iter_guard.set(iter);
+    } else {
+      iter = GetMemTable()->NewRangeTombstoneIterator(
+          ReadOptions(), kMaxSequenceNumber /* read_seq */,
+          false /* immutable_memtable */);
+      iter_guard.reset(iter);
+    }
+    if (iter == nullptr) {
+      continue;
+    }
+    iter->SeekToFirst();
+    while (iter->Valid()) {
+      fprintf(stderr, "key: '%s' -> '%s'\n", iter->key().ToString().c_str(),
+              iter->value().ToString().c_str());
+      iter->Next();
+    }
+  }
+}
+
+// Test the empty key
+TEST_P(ParameterizedHarnessTest, SimpleEmptyKey) {
+  Random rnd(test::RandomSeed() + 1);
+  Add("", "v");
+  Test(&rnd);
+}
+
+TEST_P(ParameterizedHarnessTest, SimpleSingle) {
+  Random rnd(test::RandomSeed() + 2);
+  Add("abc", "v");
+  Test(&rnd);
+}
+
+TEST_P(ParameterizedHarnessTest, SimpleMulti) {
+  Random rnd(test::RandomSeed() + 3);
+  Add("abc", "v");
+  Add("abcd", "v");
+  Add("ac", "v2");
+  Test(&rnd);
+}
+
+TEST_P(ParameterizedHarnessTest, SimpleSpecialKey) {
+  Random rnd(test::RandomSeed() + 4);
+  Add("\xff\xff", "v3");
+  Test(&rnd);
+}
+
+TEST(TableTest, FooterTests) {
+  Random* r = Random::GetTLSInstance();
+  uint64_t data_size = (uint64_t{1} << r->Uniform(40)) + r->Uniform(100);
+  uint64_t index_size = r->Uniform(1000000000);
+  uint64_t metaindex_size = r->Uniform(1000000);
+  // 5 == block trailer size
+  BlockHandle index(data_size + 5, index_size);
+  BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size);
+  uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5;
+  {
+    // legacy block based
+    FooterBuilder footer;
+    footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0,
+                 footer_offset, kCRC32c, meta_index, index);
+    Footer decoded_footer;
+    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
+    ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+    ASSERT_EQ(decoded_footer.format_version(), 0U);
+    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
+    // Ensure serialized with legacy magic
+    ASSERT_EQ(
+        DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
+        kLegacyBlockBasedTableMagicNumber);
+  }
+  // block based, various checksums, various versions
+  for (auto t : GetSupportedChecksums()) {
+    for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) {
+      FooterBuilder footer;
+      footer.Build(kBlockBasedTableMagicNumber, fv, footer_offset, t,
+                   meta_index, index);
+      Footer decoded_footer;
+      ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
+      ASSERT_EQ(decoded_footer.table_magic_number(),
+                kBlockBasedTableMagicNumber);
+      ASSERT_EQ(decoded_footer.checksum_type(), t);
+      ASSERT_EQ(decoded_footer.metaindex_handle().offset(),
+                meta_index.offset());
+      ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+      ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
+      ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
+      ASSERT_EQ(decoded_footer.format_version(), fv);
+      ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
+    }
+  }
+// Plain table is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+  {
+    // legacy plain table
+    FooterBuilder footer;
+    footer.Build(kPlainTableMagicNumber, /* format_version */ 0, footer_offset,
+                 kNoChecksum, meta_index);
+    Footer decoded_footer;
+    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
+    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
+    ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
+    ASSERT_EQ(decoded_footer.format_version(), 0U);
+    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
+    // Ensure serialized with legacy magic
+    ASSERT_EQ(
+        DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
+        kLegacyPlainTableMagicNumber);
+  }
+  {
+    // xxhash plain table (not currently used)
+    FooterBuilder footer;
+    footer.Build(kPlainTableMagicNumber, /* format_version */ 1, footer_offset,
+                 kxxHash, meta_index);
+    Footer decoded_footer;
+    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
+    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+    ASSERT_EQ(decoded_footer.checksum_type(), kxxHash);
+    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
+    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
+    ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
+    ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
+    ASSERT_EQ(decoded_footer.format_version(), 1U);
+    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
+  }
+#endif  // !ROCKSDB_LITE
+}
+
+class IndexBlockRestartIntervalTest
+    : public TableTest,
+      public ::testing::WithParamInterface<std::pair<int, bool>> {
+ public:
+  static std::vector<std::pair<int, bool>> GetRestartValues() {
+    return {{-1, false}, {0, false},  {1, false}, {8, false},
+            {16, false}, {32, false}, {-1, true}, {0, true},
+            {1, true},   {8, true},   {16, true}, {32, true}};
+  }
+};
+
+INSTANTIATE_TEST_CASE_P(
+    IndexBlockRestartIntervalTest, IndexBlockRestartIntervalTest,
+    ::testing::ValuesIn(IndexBlockRestartIntervalTest::GetRestartValues()));
+
+TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) {
+  const int kKeysInTable = 10000;
+  const int kKeySize = 100;
+  const int kValSize = 500;
+
+  const int index_block_restart_interval = std::get<0>(GetParam());
+  const bool value_delta_encoding = std::get<1>(GetParam());
+
+  Options options;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 64;  // small block size to get big index block
+  table_options.index_block_restart_interval = index_block_restart_interval;
+  if (value_delta_encoding) {
+    table_options.format_version = 4;
+  } else {
+    table_options.format_version = 3;
+  }
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  TableConstructor c(BytewiseComparator());
+  static Random rnd(301);
+  for (int i = 0; i < kKeysInTable; i++) {
+    InternalKey k(rnd.RandomString(kKeySize), 0, kTypeValue);
+    c.Add(k.Encode().ToString(), rnd.RandomString(kValSize));
+  }
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
+           &kvmap);
+  auto reader = c.GetTableReader();
+
+  ReadOptions read_options;
+  std::unique_ptr<InternalIterator> db_iter(reader->NewIterator(
+      read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  // Test point lookup
+  for (auto& kv : kvmap) {
+    db_iter->Seek(kv.first);
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_OK(db_iter->status());
+    ASSERT_EQ(db_iter->key(), kv.first);
+    ASSERT_EQ(db_iter->value(), kv.second);
+  }
+
+  // Test iterating
+  auto kv_iter = kvmap.begin();
+  for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+    ASSERT_EQ(db_iter->key(), kv_iter->first);
+    ASSERT_EQ(db_iter->value(), kv_iter->second);
+    kv_iter++;
+  }
+  ASSERT_EQ(kv_iter, kvmap.end());
+  c.ResetTableReader();
+}
+
+class PrefixTest : public testing::Test {
+ public:
+  PrefixTest() : testing::Test() {}
+  ~PrefixTest() override {}
+};
+
+namespace {
+// A simple PrefixExtractor that only works for test PrefixAndWholeKeyTest
+class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform {
+ public:
+  ~TestPrefixExtractor() override{};
+  const char* Name() const override { return "TestPrefixExtractor"; }
+
+  ROCKSDB_NAMESPACE::Slice Transform(
+      const ROCKSDB_NAMESPACE::Slice& src) const override {
+    assert(IsValid(src));
+    return ROCKSDB_NAMESPACE::Slice(src.data(), 3);
+  }
+
+  bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override {
+    return IsValid(src);
+  }
+
+  bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override {
+    return true;
+  }
+
+  bool IsValid(const ROCKSDB_NAMESPACE::Slice& src) const {
+    if (src.size() != 4) {
+      return false;
+    }
+    if (src[0] != '[') {
+      return false;
+    }
+    if (src[1] < '0' || src[1] > '9') {
+      return false;
+    }
+    if (src[2] != ']') {
+      return false;
+    }
+    if (src[3] < '0' || src[3] > '9') {
+      return false;
+    }
+    return true;
+  }
+};
+}  // namespace
+
+TEST_F(PrefixTest, PrefixAndWholeKeyTest) {
+  ROCKSDB_NAMESPACE::Options options;
+  options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleUniversal;
+  options.num_levels = 20;
+  options.create_if_missing = true;
+  options.optimize_filters_for_hits = false;
+  options.target_file_size_base = 268435456;
+  options.prefix_extractor = std::make_shared<TestPrefixExtractor>();
+  ROCKSDB_NAMESPACE::BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+  bbto.block_size = 262144;
+  bbto.whole_key_filtering = true;
+
+  const std::string kDBPath = test::PerThreadDBPath("table_prefix_test");
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ASSERT_OK(DestroyDB(kDBPath, options));
+  ROCKSDB_NAMESPACE::DB* db;
+  ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
+
+  // Create a bunch of keys with 10 filters.
+  for (int i = 0; i < 10; i++) {
+    std::string prefix = "[" + std::to_string(i) + "]";
+    for (int j = 0; j < 10; j++) {
+      std::string key = prefix + std::to_string(j);
+      ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, "1"));
+    }
+  }
+
+  // Trigger compaction.
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  delete db;
+  // In the second round, turn whole_key_filtering off and expect
+  // rocksdb still works.
+}
+
+/*
+ * Disable TableWithGlobalSeqno since RocksDB does not store global_seqno in
+ * the SST file any more. Instead, RocksDB deduces global_seqno from the
+ * MANIFEST while reading from an SST. Therefore, it's not possible to test the
+ * functionality of global_seqno in a single, isolated unit test without the
+ * involvement of Version, VersionSet, etc.
+ */
+TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) {
+  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
+  test::StringSink* sink = new test::StringSink();
+  std::unique_ptr<FSWritableFile> holder(sink);
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(holder), "" /* don't care */, FileOptions()));
+  Options options;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  int_tbl_prop_collector_factories.emplace_back(
+      new SstFileWriterPropertiesCollectorFactory(2 /* version */,
+                                                  0 /* global_seqno*/));
+  std::string column_family_name;
+  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kNoCompression,
+                          CompressionOptions(), kUnknownColumnFamily,
+                          column_family_name, -1),
+      file_writer.get()));
+
+  for (char c = 'a'; c <= 'z'; ++c) {
+    std::string key(8, c);
+    std::string value = key;
+    InternalKey ik(key, 0, kTypeValue);
+
+    builder->Add(ik.Encode(), value);
+  }
+  ASSERT_OK(builder->Finish());
+  ASSERT_OK(file_writer->Flush());
+
+  test::RandomRWStringSink ss_rw(sink);
+  uint32_t version;
+  uint64_t global_seqno;
+  uint64_t global_seqno_offset;
+
+  // Helper function to get version, global_seqno, global_seqno_offset
+  std::function<void()> GetVersionAndGlobalSeqno = [&]() {
+    std::unique_ptr<FSRandomAccessFile> source(
+        new test::StringSource(ss_rw.contents(), 73342, true));
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(source), ""));
+
+    std::unique_ptr<TableProperties> props;
+    ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(),
+                                  kBlockBasedTableMagicNumber, ioptions,
+                                  &props));
+
+    UserCollectedProperties user_props = props->user_collected_properties;
+    version = DecodeFixed32(
+        user_props[ExternalSstFilePropertyNames::kVersion].c_str());
+    global_seqno = DecodeFixed64(
+        user_props[ExternalSstFilePropertyNames::kGlobalSeqno].c_str());
+    global_seqno_offset = props->external_sst_file_global_seqno_offset;
+  };
+
+  // Helper function to update the value of the global seqno in the file
+  std::function<void(uint64_t)> SetGlobalSeqno = [&](uint64_t val) {
+    std::string new_global_seqno;
+    PutFixed64(&new_global_seqno, val);
+
+    ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno, IOOptions(),
+                          nullptr));
+  };
+
+  // Helper function to get the contents of the table InternalIterator
+  std::unique_ptr<TableReader> table_reader;
+  const ReadOptions read_options;
+  std::function<InternalIterator*()> GetTableInternalIter = [&]() {
+    std::unique_ptr<FSRandomAccessFile> source(
+        new test::StringSource(ss_rw.contents(), 73342, true));
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(source), ""));
+
+    options.table_factory->NewTableReader(
+        TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(),
+                           ikc),
+        std::move(file_reader), ss_rw.contents().size(), &table_reader);
+
+    return table_reader->NewIterator(
+        read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized);
+  };
+
+  GetVersionAndGlobalSeqno();
+  ASSERT_EQ(2u, version);
+  ASSERT_EQ(0u, global_seqno);
+
+  InternalIterator* iter = GetTableInternalIter();
+  char current_c = 'a';
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey pik;
+    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
+
+    ASSERT_EQ(pik.type, ValueType::kTypeValue);
+    ASSERT_EQ(pik.sequence, 0);
+    ASSERT_EQ(pik.user_key, iter->value());
+    ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
+    current_c++;
+  }
+  ASSERT_EQ(current_c, 'z' + 1);
+  delete iter;
+
+  // Update global sequence number to 10
+  SetGlobalSeqno(10);
+  GetVersionAndGlobalSeqno();
+  ASSERT_EQ(2u, version);
+  ASSERT_EQ(10u, global_seqno);
+
+  iter = GetTableInternalIter();
+  current_c = 'a';
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey pik;
+    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
+
+    ASSERT_EQ(pik.type, ValueType::kTypeValue);
+    ASSERT_EQ(pik.sequence, 10);
+    ASSERT_EQ(pik.user_key, iter->value());
+    ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
+    current_c++;
+  }
+  ASSERT_EQ(current_c, 'z' + 1);
+
+  // Verify Seek
+  for (char c = 'a'; c <= 'z'; c++) {
+    std::string k = std::string(8, c);
+    InternalKey ik(k, 10, kValueTypeForSeek);
+    iter->Seek(ik.Encode());
+    ASSERT_TRUE(iter->Valid());
+
+    ParsedInternalKey pik;
+    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
+
+    ASSERT_EQ(pik.type, ValueType::kTypeValue);
+    ASSERT_EQ(pik.sequence, 10);
+    ASSERT_EQ(pik.user_key.ToString(), k);
+    ASSERT_EQ(iter->value().ToString(), k);
+  }
+  delete iter;
+
+  // Update global sequence number to 3
+  SetGlobalSeqno(3);
+  GetVersionAndGlobalSeqno();
+  ASSERT_EQ(2u, version);
+  ASSERT_EQ(3u, global_seqno);
+
+  iter = GetTableInternalIter();
+  current_c = 'a';
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey pik;
+    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
+
+    ASSERT_EQ(pik.type, ValueType::kTypeValue);
+    ASSERT_EQ(pik.sequence, 3);
+    ASSERT_EQ(pik.user_key, iter->value());
+    ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
+    current_c++;
+  }
+  ASSERT_EQ(current_c, 'z' + 1);
+
+  // Verify Seek
+  for (char c = 'a'; c <= 'z'; c++) {
+    std::string k = std::string(8, c);
+    // seqno=4 is less than 3 so we still should get our key
+    InternalKey ik(k, 4, kValueTypeForSeek);
+    iter->Seek(ik.Encode());
+    ASSERT_TRUE(iter->Valid());
+
+    ParsedInternalKey pik;
+    ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
+
+    ASSERT_EQ(pik.type, ValueType::kTypeValue);
+    ASSERT_EQ(pik.sequence, 3);
+    ASSERT_EQ(pik.user_key.ToString(), k);
+    ASSERT_EQ(iter->value().ToString(), k);
+  }
+
+  delete iter;
+}
+
+TEST_P(BlockBasedTableTest, BlockAlignTest) {
+  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
+  bbto.block_align = true;
+  test::StringSink* sink = new test::StringSink();
+  std::unique_ptr<FSWritableFile> holder(sink);
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(holder), "" /* don't care */, FileOptions()));
+  Options options;
+  options.compression = kNoCompression;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  std::string column_family_name;
+  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kNoCompression,
+                          CompressionOptions(), kUnknownColumnFamily,
+                          column_family_name, -1),
+      file_writer.get()));
+
+  for (int i = 1; i <= 10000; ++i) {
+    std::ostringstream ostr;
+    ostr << std::setfill('0') << std::setw(5) << i;
+    std::string key = ostr.str();
+    std::string value = "val";
+    InternalKey ik(key, 0, kTypeValue);
+
+    builder->Add(ik.Encode(), value);
+  }
+  ASSERT_OK(builder->Finish());
+  ASSERT_OK(file_writer->Flush());
+
+  std::unique_ptr<FSRandomAccessFile> source(
+      new test::StringSource(sink->contents(), 73342, false));
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(std::move(source), "test"));
+  // Helper function to get version, global_seqno, global_seqno_offset
+  std::function<void()> VerifyBlockAlignment = [&]() {
+    std::unique_ptr<TableProperties> props;
+    ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(),
+                                  kBlockBasedTableMagicNumber, ioptions,
+                                  &props));
+
+    uint64_t data_block_size = props->data_size / props->num_data_blocks;
+    ASSERT_EQ(data_block_size, 4096);
+    ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks);
+  };
+
+  VerifyBlockAlignment();
+
+  // The below block of code verifies that we can read back the keys. Set
+  // block_align to false when creating the reader to ensure we can flip between
+  // the two modes without any issues
+  std::unique_ptr<TableReader> table_reader;
+  bbto.block_align = false;
+  Options options2;
+  options2.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ImmutableOptions ioptions2(options2);
+  const MutableCFOptions moptions2(options2);
+
+  ASSERT_OK(ioptions.table_factory->NewTableReader(
+      TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(),
+                         GetPlainInternalComparator(options2.comparator)),
+      std::move(file_reader), sink->contents().size(), &table_reader));
+
+  ReadOptions read_options;
+  std::unique_ptr<InternalIterator> db_iter(table_reader->NewIterator(
+      read_options, moptions2.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  int expected_key = 1;
+  for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+    std::ostringstream ostr;
+    ostr << std::setfill('0') << std::setw(5) << expected_key++;
+    std::string key = ostr.str();
+    std::string value = "val";
+
+    ASSERT_OK(db_iter->status());
+    ASSERT_EQ(ExtractUserKey(db_iter->key()).ToString(), key);
+    ASSERT_EQ(db_iter->value().ToString(), value);
+  }
+  expected_key--;
+  ASSERT_EQ(expected_key, 10000);
+  table_reader.reset();
+}
+
+TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
+  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
+  bbto.block_align = true;
+  test::StringSink* sink = new test::StringSink();
+  std::unique_ptr<FSWritableFile> holder(sink);
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(holder), "" /* don't care */, FileOptions()));
+
+  Options options;
+  options.compression = kNoCompression;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+  std::string column_family_name;
+
+  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kNoCompression,
+                          CompressionOptions(), kUnknownColumnFamily,
+                          column_family_name, -1),
+      file_writer.get()));
+
+  for (int i = 1; i <= 10000; ++i) {
+    std::ostringstream ostr;
+    ostr << std::setfill('0') << std::setw(5) << i;
+    std::string key = ostr.str();
+    std::string value = "val";
+    InternalKey ik(key, 0, kTypeValue);
+
+    builder->Add(ik.Encode(), value);
+  }
+  ASSERT_OK(builder->Finish());
+  ASSERT_OK(file_writer->Flush());
+
+  std::unique_ptr<FSRandomAccessFile> source(
+      new test::StringSource(sink->contents(), 73342, true));
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(std::move(source), "test"));
+
+  {
+    RandomAccessFileReader* file = file_reader.get();
+    uint64_t file_size = sink->contents().size();
+
+    Footer footer;
+    IOOptions opts;
+    ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* prefetch_buffer */,
+                                 file_size, &footer,
+                                 kBlockBasedTableMagicNumber));
+
+    auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type,
+                                BlockContents* contents) {
+      ReadOptions read_options;
+      read_options.verify_checksums = false;
+      PersistentCacheOptions cache_options;
+
+      BlockFetcher block_fetcher(
+          file, nullptr /* prefetch_buffer */, footer, read_options, handle,
+          contents, ioptions, false /* decompress */,
+          false /*maybe_compressed*/, block_type,
+          UncompressionDict::GetEmptyDict(), cache_options);
+
+      ASSERT_OK(block_fetcher.ReadBlockContents());
+    };
+
+    // -- Read metaindex block
+    auto metaindex_handle = footer.metaindex_handle();
+    BlockContents metaindex_contents;
+
+    BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex,
+                     &metaindex_contents);
+    Block metaindex_block(std::move(metaindex_contents));
+
+    std::unique_ptr<InternalIterator> meta_iter(metaindex_block.NewDataIterator(
+        BytewiseComparator(), kDisableGlobalSequenceNumber));
+
+    // -- Read properties block
+    BlockHandle properties_handle;
+    ASSERT_OK(FindOptionalMetaBlock(meta_iter.get(), kPropertiesBlockName,
+                                    &properties_handle));
+    ASSERT_FALSE(properties_handle.IsNull());
+    BlockContents properties_contents;
+    BlockFetchHelper(properties_handle, BlockType::kProperties,
+                     &properties_contents);
+    Block properties_block(std::move(properties_contents));
+
+    ASSERT_EQ(properties_block.NumRestarts(), 1u);
+  }
+}
+
+TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
+  // The properties meta-block should come at the end since we always need to
+  // read it when opening a file, unlike index/filter/other meta-blocks, which
+  // are sometimes read depending on the user's configuration. This ordering
+  // allows us to do a small readahead on the end of the file to read properties
+  // and meta-index blocks with one I/O.
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("a1", "val1");
+  c.Add("b2", "val2");
+  c.Add("c3", "val3");
+  c.Add("d4", "val4");
+  c.Add("e5", "val5");
+  c.Add("f6", "val6");
+  c.Add("g7", "val7");
+  c.Add("h8", "val8");
+  c.Add("j9", "val9");
+
+  // write an SST file
+  Options options;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.filter_policy.reset(NewBloomFilterPolicy(
+      8 /* bits_per_key */, false /* use_block_based_filter */));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+  // get file reader
+  test::StringSink* table_sink = c.TEST_GetSink();
+  std::unique_ptr<FSRandomAccessFile> source(new test::StringSource(
+      table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */));
+
+  std::unique_ptr<RandomAccessFileReader> table_reader(
+      new RandomAccessFileReader(std::move(source), "test"));
+  size_t table_size = table_sink->contents().size();
+
+  // read footer
+  Footer footer;
+  IOOptions opts;
+  ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(),
+                               nullptr /* prefetch_buffer */, table_size,
+                               &footer, kBlockBasedTableMagicNumber));
+
+  // read metaindex
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  PersistentCacheOptions pcache_opts;
+  BlockFetcher block_fetcher(
+      table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
+      metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
+      false /*maybe_compressed*/, BlockType::kMetaIndex,
+      UncompressionDict::GetEmptyDict(), pcache_opts,
+      nullptr /*memory_allocator*/);
+  ASSERT_OK(block_fetcher.ReadBlockContents());
+  Block metaindex_block(std::move(metaindex_contents));
+
+  // verify properties block comes last
+  std::unique_ptr<InternalIterator> metaindex_iter{
+      metaindex_block.NewMetaIterator()};
+  uint64_t max_offset = 0;
+  std::string key_at_max_offset;
+  for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
+       metaindex_iter->Next()) {
+    BlockHandle handle;
+    Slice value = metaindex_iter->value();
+    ASSERT_OK(handle.DecodeFrom(&value));
+    if (handle.offset() > max_offset) {
+      max_offset = handle.offset();
+      key_at_max_offset = metaindex_iter->key().ToString();
+    }
+  }
+  ASSERT_EQ(kPropertiesBlockName, key_at_max_offset);
+  // index handle is stored in footer rather than metaindex block, so need
+  // separate logic to verify it comes before properties block.
+  ASSERT_GT(max_offset, footer.index_handle().offset());
+  c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, SeekMetaBlocks) {
+  TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
+  c.Add("foo_a1", "val1");
+  c.Add("foo_b2", "val2");
+  c.Add("foo_c3", "val3");
+  c.Add("foo_d4", "val4");
+  c.Add("foo_e5", "val5");
+  c.Add("foo_f6", "val6");
+  c.Add("foo_g7", "val7");
+  c.Add("foo_h8", "val8");
+  c.Add("foo_j9", "val9");
+
+  // write an SST file
+  Options options;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(
+      8 /* bits_per_key */, false /* use_block_based_filter */));
+  options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  c.Finish(options, ioptions, moptions, table_options,
+           GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+
+  // get file reader
+  test::StringSink* table_sink = c.TEST_GetSink();
+  std::unique_ptr<FSRandomAccessFile> source(new test::StringSource(
+      table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */));
+
+  std::unique_ptr<RandomAccessFileReader> table_reader(
+      new RandomAccessFileReader(std::move(source), "test"));
+  size_t table_size = table_sink->contents().size();
+
+  // read footer
+  Footer footer;
+  IOOptions opts;
+  ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(),
+                               nullptr /* prefetch_buffer */, table_size,
+                               &footer, kBlockBasedTableMagicNumber));
+
+  // read metaindex
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  PersistentCacheOptions pcache_opts;
+  BlockFetcher block_fetcher(
+      table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
+      metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
+      false /*maybe_compressed*/, BlockType::kMetaIndex,
+      UncompressionDict::GetEmptyDict(), pcache_opts,
+      nullptr /*memory_allocator*/);
+  ASSERT_OK(block_fetcher.ReadBlockContents());
+  Block metaindex_block(std::move(metaindex_contents));
+
+  // verify properties block comes last
+  std::unique_ptr<MetaBlockIter> metaindex_iter(
+      metaindex_block.NewMetaIterator());
+  bool has_hash_prefixes = false;
+  bool has_hash_metadata = false;
+  for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
+       metaindex_iter->Next()) {
+    if (metaindex_iter->key().ToString() == kHashIndexPrefixesBlock) {
+      has_hash_prefixes = true;
+    } else if (metaindex_iter->key().ToString() ==
+               kHashIndexPrefixesMetadataBlock) {
+      has_hash_metadata = true;
+    }
+  }
+  if (has_hash_metadata) {
+    metaindex_iter->Seek(kHashIndexPrefixesMetadataBlock);
+    ASSERT_TRUE(metaindex_iter->Valid());
+    ASSERT_EQ(kHashIndexPrefixesMetadataBlock,
+              metaindex_iter->key().ToString());
+  }
+  if (has_hash_prefixes) {
+    metaindex_iter->Seek(kHashIndexPrefixesBlock);
+    ASSERT_TRUE(metaindex_iter->Valid());
+    ASSERT_EQ(kHashIndexPrefixesBlock, metaindex_iter->key().ToString());
+  }
+  c.ResetTableReader();
+}
+
+TEST_P(BlockBasedTableTest, BadOptions) {
+  ROCKSDB_NAMESPACE::Options options;
+  options.compression = kNoCompression;
+  BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
+  bbto.block_size = 4000;
+  bbto.block_align = true;
+
+  const std::string kDBPath =
+      test::PerThreadDBPath("block_based_table_bad_options_test");
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ASSERT_OK(DestroyDB(kDBPath, options));
+  ROCKSDB_NAMESPACE::DB* db;
+  ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
+
+  bbto.block_size = 4096;
+  options.compression = kSnappyCompression;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
+}
+
+TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) {
+  TailPrefetchStats tpstats;
+  ASSERT_EQ(0, tpstats.GetSuggestedPrefetchSize());
+  tpstats.RecordEffectiveSize(size_t{1000});
+  tpstats.RecordEffectiveSize(size_t{1005});
+  tpstats.RecordEffectiveSize(size_t{1002});
+  ASSERT_EQ(1005, tpstats.GetSuggestedPrefetchSize());
+
+  // One single super large value shouldn't influence much
+  tpstats.RecordEffectiveSize(size_t{1002000});
+  tpstats.RecordEffectiveSize(size_t{999});
+  ASSERT_LE(1005, tpstats.GetSuggestedPrefetchSize());
+  ASSERT_GT(1200, tpstats.GetSuggestedPrefetchSize());
+
+  // Only history of 32 is kept
+  for (int i = 0; i < 32; i++) {
+    tpstats.RecordEffectiveSize(size_t{100});
+  }
+  ASSERT_EQ(100, tpstats.GetSuggestedPrefetchSize());
+
+  // 16 large values and 16 small values. The result should be closer
+  // to the small value as the algorithm.
+  for (int i = 0; i < 16; i++) {
+    tpstats.RecordEffectiveSize(size_t{1000});
+  }
+  tpstats.RecordEffectiveSize(size_t{10});
+  tpstats.RecordEffectiveSize(size_t{20});
+  for (int i = 0; i < 6; i++) {
+    tpstats.RecordEffectiveSize(size_t{100});
+  }
+  ASSERT_LE(80, tpstats.GetSuggestedPrefetchSize());
+  ASSERT_GT(200, tpstats.GetSuggestedPrefetchSize());
+}
+
+TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) {
+  TailPrefetchStats tpstats;
+  FilePrefetchBuffer buffer(0 /* readahead_size */, 0 /* max_readahead_size */,
+                            false /* enable */, true /* track_min_offset */);
+  IOOptions opts;
+  buffer.TryReadFromCache(opts, nullptr /* reader */, 500 /* offset */,
+                          10 /* n */, nullptr /* result */,
+                          nullptr /* status */,
+                          Env::IO_TOTAL /* rate_limiter_priority */);
+  buffer.TryReadFromCache(opts, nullptr /* reader */, 480 /* offset */,
+                          10 /* n */, nullptr /* result */,
+                          nullptr /* status */,
+                          Env::IO_TOTAL /* rate_limiter_priority */);
+  buffer.TryReadFromCache(opts, nullptr /* reader */, 490 /* offset */,
+                          10 /* n */, nullptr /* result */,
+                          nullptr /* status */,
+                          Env::IO_TOTAL /* rate_limiter_priority */);
+  ASSERT_EQ(480, buffer.min_offset_read());
+}
+
+TEST_P(BlockBasedTableTest, DataBlockHashIndex) {
+  const int kNumKeys = 500;
+  const int kKeySize = 8;
+  const int kValSize = 40;
+
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.data_block_index_type =
+      BlockBasedTableOptions::kDataBlockBinaryAndHash;
+
+  Options options;
+  options.comparator = BytewiseComparator();
+
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  TableConstructor c(options.comparator);
+
+  static Random rnd(1048);
+  for (int i = 0; i < kNumKeys; i++) {
+    // padding one "0" to mark existent keys.
+    std::string random_key(rnd.RandomString(kKeySize - 1) + "1");
+    InternalKey k(random_key, 0, kTypeValue);
+    c.Add(k.Encode().ToString(), rnd.RandomString(kValSize));
+  }
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  const InternalKeyComparator internal_comparator(options.comparator);
+  c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+           &keys, &kvmap);
+
+  auto reader = c.GetTableReader();
+
+  std::unique_ptr<InternalIterator> seek_iter;
+  ReadOptions read_options;
+  seek_iter.reset(reader->NewIterator(
+      read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  for (int i = 0; i < 2; ++i) {
+    ReadOptions ro;
+    // for every kv, we seek using two method: Get() and Seek()
+    // Get() will use the SuffixIndexHash in Block. For non-existent key it
+    //      will invalidate the iterator
+    // Seek() will use the default BinarySeek() in Block. So for non-existent
+    //      key it will land at the closest key that is large than target.
+
+    // Search for existent keys
+    for (auto& kv : kvmap) {
+      if (i == 0) {
+        // Search using Seek()
+        seek_iter->Seek(kv.first);
+        ASSERT_OK(seek_iter->status());
+        ASSERT_TRUE(seek_iter->Valid());
+        ASSERT_EQ(seek_iter->key(), kv.first);
+        ASSERT_EQ(seek_iter->value(), kv.second);
+      } else {
+        // Search using Get()
+        PinnableSlice value;
+        std::string user_key = ExtractUserKey(kv.first).ToString();
+        GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                               GetContext::kNotFound, user_key, &value, nullptr,
+                               nullptr, nullptr, true, nullptr, nullptr);
+        ASSERT_OK(reader->Get(ro, kv.first, &get_context,
+                              moptions.prefix_extractor.get()));
+        ASSERT_EQ(get_context.State(), GetContext::kFound);
+        ASSERT_EQ(value, Slice(kv.second));
+        value.Reset();
+      }
+    }
+
+    // Search for non-existent keys
+    for (auto& kv : kvmap) {
+      std::string user_key = ExtractUserKey(kv.first).ToString();
+      user_key.back() = '0';  // make it non-existent key
+      InternalKey internal_key(user_key, 0, kTypeValue);
+      std::string encoded_key = internal_key.Encode().ToString();
+      if (i == 0) {  // Search using Seek()
+        seek_iter->Seek(encoded_key);
+        ASSERT_OK(seek_iter->status());
+        if (seek_iter->Valid()) {
+          ASSERT_TRUE(BytewiseComparator()->Compare(
+                          user_key, ExtractUserKey(seek_iter->key())) < 0);
+        }
+      } else {  // Search using Get()
+        PinnableSlice value;
+        GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                               GetContext::kNotFound, user_key, &value, nullptr,
+                               nullptr, nullptr, true, nullptr, nullptr);
+        ASSERT_OK(reader->Get(ro, encoded_key, &get_context,
+                              moptions.prefix_extractor.get()));
+        ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+        value.Reset();
+      }
+    }
+  }
+}
+
+// BlockBasedTableIterator should invalidate itself and return
+// OutOfBound()=true immediately after Seek(), to allow LevelIterator
+// filter out corresponding level.
+TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) {
+  TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
+  c.Add("foo", "v1");
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
+  options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_opt,
+           GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
+  auto* reader = c.GetTableReader();
+  ReadOptions read_opt;
+  std::string upper_bound = "bar";
+  Slice upper_bound_slice(upper_bound);
+  read_opt.iterate_upper_bound = &upper_bound_slice;
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(new KeyConvertingIterator(reader->NewIterator(
+      read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+  iter->SeekToFirst();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
+  iter.reset(new KeyConvertingIterator(reader->NewIterator(
+      read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+  iter->Seek("foo");
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
+}
+
+// BlockBasedTableIterator should invalidate itself and return
+// OutOfBound()=true after Next(), if it finds current index key is no smaller
+// than upper bound, unless it is pointing to the last data block.
+TEST_P(BlockBasedTableTest, OutOfBoundOnNext) {
+  TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
+  c.Add("bar", "v");
+  c.Add("foo", "v");
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  Options options;
+  BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
+  table_opt.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
+  const ImmutableOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  c.Finish(options, ioptions, moptions, table_opt,
+           GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
+  auto* reader = c.GetTableReader();
+  ReadOptions read_opt;
+  std::string ub1 = "bar_after";
+  Slice ub_slice1(ub1);
+  read_opt.iterate_upper_bound = &ub_slice1;
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(new KeyConvertingIterator(reader->NewIterator(
+      read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+  iter->Seek("bar");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bar", iter->key());
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
+  std::string ub2 = "foo_after";
+  Slice ub_slice2(ub2);
+  read_opt.iterate_upper_bound = &ub_slice2;
+  iter.reset(new KeyConvertingIterator(reader->NewIterator(
+      read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized)));
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key());
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_FALSE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
+}
+
+class ChargeCompressionDictionaryBuildingBufferTest
+    : public BlockBasedTableTestBase {};
+TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
+  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+  constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
+  constexpr std::size_t kMaxDictBytes = 1024;
+  constexpr std::size_t kMaxDictBufferBytes = 1024;
+
+  for (CacheEntryRoleOptions::Decision
+           charge_compression_dictionary_building_buffer :
+       {CacheEntryRoleOptions::Decision::kEnabled,
+        CacheEntryRoleOptions::Decision::kDisabled}) {
+    BlockBasedTableOptions table_options;
+    LRUCacheOptions lo;
+    lo.capacity = kCacheCapacity;
+    lo.num_shard_bits = 0;  // 2^0 shard
+    lo.strict_capacity_limit = true;
+    std::shared_ptr<Cache> cache(NewLRUCache(lo));
+    table_options.block_cache = cache;
+    table_options.flush_block_policy_factory =
+        std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+    table_options.cache_usage_options.options_overrides.insert(
+        {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
+         {/*.charged = */ charge_compression_dictionary_building_buffer}});
+    Options options;
+    options.compression = kSnappyCompression;
+    options.compression_opts.max_dict_bytes = kMaxDictBytes;
+    options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    test::StringSink* sink = new test::StringSink();
+    std::unique_ptr<FSWritableFile> holder(sink);
+    std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+        std::move(holder), "test_file_name", FileOptions()));
+
+    ImmutableOptions ioptions(options);
+    MutableCFOptions moptions(options);
+    InternalKeyComparator ikc(options.comparator);
+    IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+    std::unique_ptr<TableBuilder> builder(
+        options.table_factory->NewTableBuilder(
+            TableBuilderOptions(
+                ioptions, moptions, ikc, &int_tbl_prop_collector_factories,
+                kSnappyCompression, options.compression_opts,
+                kUnknownColumnFamily, "test_cf", -1 /* level */),
+            file_writer.get()));
+
+    std::string key1 = "key1";
+    std::string value1 = "val1";
+    InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue);
+    // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy
+    // therefore won't trigger any data block's buffering
+    builder->Add(ik1.Encode(), value1);
+    ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+    std::string key2 = "key2";
+    std::string value2 = "val2";
+    InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue);
+    // Adding the second key will trigger a flush of the last data block (the
+    // one containing key1 and value1) by FlushBlockEveryKeyPolicy and hence
+    // trigger buffering of that data block.
+    builder->Add(ik2.Encode(), value2);
+    // Cache charging will increase for last buffered data block (the one
+    // containing key1 and value1) since the buffer limit is not exceeded after
+    // that buffering and the cache will not be full after this reservation
+    if (charge_compression_dictionary_building_buffer ==
+        CacheEntryRoleOptions::Decision::kEnabled) {
+      EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
+      EXPECT_LT(cache->GetPinnedUsage(),
+                1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+    } else {
+      EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+    }
+
+    ASSERT_OK(builder->Finish());
+    EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+  }
+}
+
+TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
+       BasicWithBufferLimitExceed) {
+  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+  constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
+  constexpr std::size_t kMaxDictBytes = 1024;
+  constexpr std::size_t kMaxDictBufferBytes = 2 * kSizeDummyEntry;
+
+  // `CacheEntryRoleOptions::charged` is enabled by default for
+  // CacheEntryRole::kCompressionDictionaryBuildingBuffer
+  BlockBasedTableOptions table_options;
+  LRUCacheOptions lo;
+  lo.capacity = kCacheCapacity;
+  lo.num_shard_bits = 0;  // 2^0 shard
+  lo.strict_capacity_limit = true;
+  std::shared_ptr<Cache> cache(NewLRUCache(lo));
+  table_options.block_cache = cache;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+
+  Options options;
+  options.compression = kSnappyCompression;
+  options.compression_opts.max_dict_bytes = kMaxDictBytes;
+  options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  test::StringSink* sink = new test::StringSink();
+  std::unique_ptr<FSWritableFile> holder(sink);
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(holder), "test_file_name", FileOptions()));
+
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kSnappyCompression,
+                          options.compression_opts, kUnknownColumnFamily,
+                          "test_cf", -1 /* level */),
+      file_writer.get()));
+
+  std::string key1 = "key1";
+  std::string value1(kSizeDummyEntry, '0');
+  InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue);
+  // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy
+  // therefore won't trigger any data block's buffering
+  builder->Add(ik1.Encode(), value1);
+  ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+  std::string key2 = "key2";
+  std::string value2(kSizeDummyEntry, '0');
+  InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue);
+  // Adding the second key will trigger a flush of the last data block (the one
+  // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger
+  // buffering of the last data block.
+  builder->Add(ik2.Encode(), value2);
+  // Cache charging will increase for last buffered data block (the one
+  // containing key1 and value1) since the buffer limit is not exceeded after
+  // the buffering and the cache will not be full after this reservation
+  EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
+  EXPECT_LT(cache->GetPinnedUsage(),
+            2 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  std::string key3 = "key3";
+  std::string value3 = "val3";
+  InternalKey ik3(key3, 2 /* sequnce number */, kTypeValue);
+  // Adding the third key will trigger a flush of the last data block (the one
+  // containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger
+  // buffering of the last data block.
+  builder->Add(ik3.Encode(), value3);
+  // Cache charging will decrease since the buffer limit is now exceeded
+  // after the last buffering and EnterUnbuffered() is triggered
+  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+  ASSERT_OK(builder->Finish());
+  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+}
+
+TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) {
+  constexpr std::size_t kSizeDummyEntry = 256 * 1024;
+  constexpr std::size_t kMetaDataChargeOverhead = 10000;
+  // A small kCacheCapacity is chosen so that increase cache charging for
+  // buffering two data blocks, each containing key1/value1, key2/a big
+  // value2, will cause cache full
+  constexpr std::size_t kCacheCapacity =
+      1 * kSizeDummyEntry + kSizeDummyEntry / 2;
+  constexpr std::size_t kMaxDictBytes = 1024;
+  // A big kMaxDictBufferBytes is chosen so that adding a big key value pair
+  // (key2, value2) won't exceed the buffer limit
+  constexpr std::size_t kMaxDictBufferBytes = 1024 * 1024 * 1024;
+
+  // `CacheEntryRoleOptions::charged` is enabled by default for
+  // CacheEntryRole::kCompressionDictionaryBuildingBuffer
+  BlockBasedTableOptions table_options;
+  LRUCacheOptions lo;
+  lo.capacity = kCacheCapacity;
+  lo.num_shard_bits = 0;  // 2^0 shard
+  lo.strict_capacity_limit = true;
+  std::shared_ptr<Cache> cache(NewLRUCache(lo));
+  table_options.block_cache = cache;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+
+  Options options;
+  options.compression = kSnappyCompression;
+  options.compression_opts.max_dict_bytes = kMaxDictBytes;
+  options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  test::StringSink* sink = new test::StringSink();
+  std::unique_ptr<FSWritableFile> holder(sink);
+  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+      std::move(holder), "test_file_name", FileOptions()));
+
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions(options);
+  InternalKeyComparator ikc(options.comparator);
+  IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+  std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, ikc,
+                          &int_tbl_prop_collector_factories, kSnappyCompression,
+                          options.compression_opts, kUnknownColumnFamily,
+                          "test_cf", -1 /* level */),
+      file_writer.get()));
+
+  std::string key1 = "key1";
+  std::string value1 = "val1";
+  InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue);
+  // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy
+  // therefore won't trigger any data block's buffering
+  builder->Add(ik1.Encode(), value1);
+  ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+  std::string key2 = "key2";
+  std::string value2(kSizeDummyEntry, '0');
+  InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue);
+  // Adding the second key will trigger a flush of the last data block (the one
+  // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger
+  // buffering of the last data block.
+  builder->Add(ik2.Encode(), value2);
+  // Cache charging will increase for the last buffered data block (the one
+  // containing key1 and value1) since the buffer limit is not exceeded after
+  // the buffering and the cache will not be full after this reservation
+  EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
+  EXPECT_LT(cache->GetPinnedUsage(),
+            1 * kSizeDummyEntry + kMetaDataChargeOverhead);
+
+  std::string key3 = "key3";
+  std::string value3 = "value3";
+  InternalKey ik3(key3, 2 /* sequnce number */, kTypeValue);
+  // Adding the third key will trigger a flush of the last data block (the one
+  // containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger
+  // buffering of the last data block.
+  builder->Add(ik3.Encode(), value3);
+  // Cache charging will decrease since the cache is now full after
+  // increasing reservation for the last buffered block and EnterUnbuffered() is
+  // triggered
+  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+
+  ASSERT_OK(builder->Finish());
+  EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
+}
+
+class CacheUsageOptionsOverridesTest : public DBTestBase {
+ public:
+  CacheUsageOptionsOverridesTest()
+      : DBTestBase("cache_usage_options_overrides_test",
+                   /*env_do_fsync=*/false) {}
+};
+
+TEST_F(CacheUsageOptionsOverridesTest, SanitizeAndValidateOptions) {
+  // To test `cache_usage_options.options_overrides` is sanitized
+  // where `cache_usage_options.options` is used when there is no entry in
+  // `cache_usage_options.options_overrides`
+  Options options;
+  options.create_if_missing = true;
+  BlockBasedTableOptions table_options = BlockBasedTableOptions();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Destroy(options);
+  Status s = TryReopen(options);
+  EXPECT_TRUE(s.ok());
+  const auto* sanitized_table_options =
+      options.table_factory->GetOptions<BlockBasedTableOptions>();
+  const auto sanitized_options_overrides =
+      sanitized_table_options->cache_usage_options.options_overrides;
+  EXPECT_EQ(sanitized_options_overrides.size(), kNumCacheEntryRoles);
+  for (auto options_overrides_iter = sanitized_options_overrides.cbegin();
+       options_overrides_iter != sanitized_options_overrides.cend();
+       ++options_overrides_iter) {
+    CacheEntryRoleOptions role_options = options_overrides_iter->second;
+    CacheEntryRoleOptions default_options =
+        sanitized_table_options->cache_usage_options.options;
+    EXPECT_TRUE(role_options == default_options);
+  }
+  Destroy(options);
+
+  // To test option validation on unsupported CacheEntryRole
+  table_options = BlockBasedTableOptions();
+  table_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kDataBlock,
+       {/*.charged = */ CacheEntryRoleOptions::Decision::kDisabled}});
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Destroy(options);
+  s = TryReopen(options);
+  EXPECT_TRUE(s.IsNotSupported());
+  EXPECT_TRUE(
+      s.ToString().find("Enable/Disable CacheEntryRoleOptions::charged") !=
+      std::string::npos);
+  EXPECT_TRUE(
+      s.ToString().find(kCacheEntryRoleToCamelString[static_cast<uint32_t>(
+          CacheEntryRole::kDataBlock)]) != std::string::npos);
+  Destroy(options);
+
+  // To test option validation on existence of block cache
+  table_options = BlockBasedTableOptions();
+  table_options.no_block_cache = true;
+  table_options.cache_usage_options.options_overrides.insert(
+      {CacheEntryRole::kFilterConstruction,
+       {/*.charged = */ CacheEntryRoleOptions::Decision::kEnabled}});
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Destroy(options);
+  s = TryReopen(options);
+  EXPECT_TRUE(s.IsInvalidArgument());
+  EXPECT_TRUE(s.ToString().find("Enable CacheEntryRoleOptions::charged") !=
+              std::string::npos);
+  EXPECT_TRUE(
+      s.ToString().find(kCacheEntryRoleToCamelString[static_cast<std::size_t>(
+          CacheEntryRole::kFilterConstruction)]) != std::string::npos);
+  EXPECT_TRUE(s.ToString().find("block cache is disabled") !=
+              std::string::npos);
+  Destroy(options);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/table/two_level_iterator.cc b/src/rocksdb/table/two_level_iterator.cc
new file mode 100644
index 000000000..4b6634e5c
--- /dev/null
+++ b/src/rocksdb/table/two_level_iterator.cc
@@ -0,0 +1,220 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/two_level_iterator.h"
+
+#include "db/pinned_iterators_manager.h"
+#include "memory/arena.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class TwoLevelIndexIterator : public InternalIteratorBase<IndexValue> {
+ public:
+  explicit TwoLevelIndexIterator(
+      TwoLevelIteratorState* state,
+      InternalIteratorBase<IndexValue>* first_level_iter);
+
+  ~TwoLevelIndexIterator() override {
+    first_level_iter_.DeleteIter(false /* is_arena_mode */);
+    second_level_iter_.DeleteIter(false /* is_arena_mode */);
+    delete state_;
+  }
+
+  void Seek(const Slice& target) override;
+  void SeekForPrev(const Slice& target) override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Next() override;
+  void Prev() override;
+
+  bool Valid() const override { return second_level_iter_.Valid(); }
+  Slice key() const override {
+    assert(Valid());
+    return second_level_iter_.key();
+  }
+  Slice user_key() const override {
+    assert(Valid());
+    return second_level_iter_.user_key();
+  }
+  IndexValue value() const override {
+    assert(Valid());
+    return second_level_iter_.value();
+  }
+  Status status() const override {
+    if (!first_level_iter_.status().ok()) {
+      assert(second_level_iter_.iter() == nullptr);
+      return first_level_iter_.status();
+    } else if (second_level_iter_.iter() != nullptr &&
+               !second_level_iter_.status().ok()) {
+      return second_level_iter_.status();
+    } else {
+      return status_;
+    }
+  }
+  void SetPinnedItersMgr(
+      PinnedIteratorsManager* /*pinned_iters_mgr*/) override {}
+  bool IsKeyPinned() const override { return false; }
+  bool IsValuePinned() const override { return false; }
+
+ private:
+  void SaveError(const Status& s) {
+    if (status_.ok() && !s.ok()) status_ = s;
+  }
+  void SkipEmptyDataBlocksForward();
+  void SkipEmptyDataBlocksBackward();
+  void SetSecondLevelIterator(InternalIteratorBase<IndexValue>* iter);
+  void InitDataBlock();
+
+  TwoLevelIteratorState* state_;
+  IteratorWrapperBase<IndexValue> first_level_iter_;
+  IteratorWrapperBase<IndexValue> second_level_iter_;  // May be nullptr
+  Status status_;
+  // If second_level_iter is non-nullptr, then "data_block_handle_" holds the
+  // "index_value" passed to block_function_ to create the second_level_iter.
+  BlockHandle data_block_handle_;
+};
+
+TwoLevelIndexIterator::TwoLevelIndexIterator(
+    TwoLevelIteratorState* state,
+    InternalIteratorBase<IndexValue>* first_level_iter)
+    : state_(state), first_level_iter_(first_level_iter) {}
+
+void TwoLevelIndexIterator::Seek(const Slice& target) {
+  first_level_iter_.Seek(target);
+
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.Seek(target);
+  }
+  SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIndexIterator::SeekForPrev(const Slice& target) {
+  first_level_iter_.Seek(target);
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.SeekForPrev(target);
+  }
+  if (!Valid()) {
+    if (!first_level_iter_.Valid() && first_level_iter_.status().ok()) {
+      first_level_iter_.SeekToLast();
+      InitDataBlock();
+      if (second_level_iter_.iter() != nullptr) {
+        second_level_iter_.SeekForPrev(target);
+      }
+    }
+    SkipEmptyDataBlocksBackward();
+  }
+}
+
+void TwoLevelIndexIterator::SeekToFirst() {
+  first_level_iter_.SeekToFirst();
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.SeekToFirst();
+  }
+  SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIndexIterator::SeekToLast() {
+  first_level_iter_.SeekToLast();
+  InitDataBlock();
+  if (second_level_iter_.iter() != nullptr) {
+    second_level_iter_.SeekToLast();
+  }
+  SkipEmptyDataBlocksBackward();
+}
+
+void TwoLevelIndexIterator::Next() {
+  assert(Valid());
+  second_level_iter_.Next();
+  SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIndexIterator::Prev() {
+  assert(Valid());
+  second_level_iter_.Prev();
+  SkipEmptyDataBlocksBackward();
+}
+
+void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() {
+  while (second_level_iter_.iter() == nullptr ||
+         (!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
+    // Move to next block
+    if (!first_level_iter_.Valid()) {
+      SetSecondLevelIterator(nullptr);
+      return;
+    }
+    first_level_iter_.Next();
+    InitDataBlock();
+    if (second_level_iter_.iter() != nullptr) {
+      second_level_iter_.SeekToFirst();
+    }
+  }
+}
+
+void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() {
+  while (second_level_iter_.iter() == nullptr ||
+         (!second_level_iter_.Valid() && second_level_iter_.status().ok())) {
+    // Move to next block
+    if (!first_level_iter_.Valid()) {
+      SetSecondLevelIterator(nullptr);
+      return;
+    }
+    first_level_iter_.Prev();
+    InitDataBlock();
+    if (second_level_iter_.iter() != nullptr) {
+      second_level_iter_.SeekToLast();
+    }
+  }
+}
+
+void TwoLevelIndexIterator::SetSecondLevelIterator(
+    InternalIteratorBase<IndexValue>* iter) {
+  InternalIteratorBase<IndexValue>* old_iter = second_level_iter_.Set(iter);
+  delete old_iter;
+}
+
+void TwoLevelIndexIterator::InitDataBlock() {
+  if (!first_level_iter_.Valid()) {
+    SetSecondLevelIterator(nullptr);
+  } else {
+    BlockHandle handle = first_level_iter_.value().handle;
+    if (second_level_iter_.iter() != nullptr &&
+        !second_level_iter_.status().IsIncomplete() &&
+        handle.offset() == data_block_handle_.offset()) {
+      // second_level_iter is already constructed with this iterator, so
+      // no need to change anything
+    } else {
+      InternalIteratorBase<IndexValue>* iter =
+          state_->NewSecondaryIterator(handle);
+      data_block_handle_ = handle;
+      SetSecondLevelIterator(iter);
+      if (iter == nullptr) {
+        status_ = Status::Corruption("Missing block for partition " +
+                                     handle.ToString());
+      }
+    }
+  }
+}
+
+}  // namespace
+
+InternalIteratorBase<IndexValue>* NewTwoLevelIterator(
+    TwoLevelIteratorState* state,
+    InternalIteratorBase<IndexValue>* first_level_iter) {
+  return new TwoLevelIndexIterator(state, first_level_iter);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/two_level_iterator.h b/src/rocksdb/table/two_level_iterator.h
new file mode 100644
index 000000000..1fed93417
--- /dev/null
+++ b/src/rocksdb/table/two_level_iterator.h
@@ -0,0 +1,43 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/iterator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ReadOptions;
+class InternalKeyComparator;
+
+// TwoLevelIteratorState expects iterators are not created using the arena
+struct TwoLevelIteratorState {
+  TwoLevelIteratorState() {}
+
+  virtual ~TwoLevelIteratorState() {}
+  virtual InternalIteratorBase<IndexValue>* NewSecondaryIterator(
+      const BlockHandle& handle) = 0;
+};
+
+// Return a new two level iterator.  A two-level iterator contains an
+// index iterator whose values point to a sequence of blocks where
+// each block is itself a sequence of key,value pairs.  The returned
+// two-level iterator yields the concatenation of all key/value pairs
+// in the sequence of blocks.  Takes ownership of "index_iter" and
+// will delete it when no longer needed.
+//
+// Uses a supplied function to convert an index_iter value into
+// an iterator over the contents of the corresponding block.
+// Note: this function expects first_level_iter was not created using the arena
+extern InternalIteratorBase<IndexValue>* NewTwoLevelIterator(
+    TwoLevelIteratorState* state,
+    InternalIteratorBase<IndexValue>* first_level_iter);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/unique_id.cc b/src/rocksdb/table/unique_id.cc
new file mode 100644
index 000000000..fcdd75650
--- /dev/null
+++ b/src/rocksdb/table/unique_id.cc
@@ -0,0 +1,223 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdint>
+
+#include "table/unique_id_impl.h"
+#include "util/coding_lean.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string EncodeSessionId(uint64_t upper, uint64_t lower) {
+  std::string db_session_id(20U, '\0');
+  char *buf = &db_session_id[0];
+  // Preserving `lower` is slightly tricky. 36^12 is slightly more than
+  // 62 bits, so we use 12 chars plus the bottom two bits of one more.
+  // (A tiny fraction of 20 digit strings go unused.)
+  uint64_t a = (upper << 2) | (lower >> 62);
+  uint64_t b = lower & (UINT64_MAX >> 2);
+  PutBaseChars<36>(&buf, 8, a, /*uppercase*/ true);
+  PutBaseChars<36>(&buf, 12, b, /*uppercase*/ true);
+  assert(buf == &db_session_id.back() + 1);
+  return db_session_id;
+}
+
+Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
+                       uint64_t *lower) {
+  const size_t len = db_session_id.size();
+  if (len == 0) {
+    return Status::NotSupported("Missing db_session_id");
+  }
+  // Anything from 13 to 24 chars is reasonable. We don't have to limit to
+  // exactly 20.
+  if (len < 13) {
+    return Status::NotSupported("Too short db_session_id");
+  }
+  if (len > 24) {
+    return Status::NotSupported("Too long db_session_id");
+  }
+  uint64_t a = 0, b = 0;
+  const char *buf = &db_session_id.front();
+  bool success = ParseBaseChars<36>(&buf, len - 12U, &a);
+  if (!success) {
+    return Status::NotSupported("Bad digit in db_session_id");
+  }
+  success = ParseBaseChars<36>(&buf, 12U, &b);
+  if (!success) {
+    return Status::NotSupported("Bad digit in db_session_id");
+  }
+  assert(buf == &db_session_id.back() + 1);
+  *upper = a >> 2;
+  *lower = (b & (UINT64_MAX >> 2)) | (a << 62);
+  return Status::OK();
+}
+
+Status GetSstInternalUniqueId(const std::string &db_id,
+                              const std::string &db_session_id,
+                              uint64_t file_number, UniqueIdPtr out,
+                              bool force) {
+  if (!force) {
+    if (db_id.empty()) {
+      return Status::NotSupported("Missing db_id");
+    }
+    if (file_number == 0) {
+      return Status::NotSupported("Missing or bad file number");
+    }
+    if (db_session_id.empty()) {
+      return Status::NotSupported("Missing db_session_id");
+    }
+  }
+  uint64_t session_upper = 0;  // Assignment to appease clang-analyze
+  uint64_t session_lower = 0;  // Assignment to appease clang-analyze
+  {
+    Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower);
+    if (!s.ok()) {
+      if (!force) {
+        return s;
+      } else {
+        // A reasonable fallback in case malformed
+        Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper,
+                 &session_lower);
+        if (session_lower == 0) {
+          session_lower = session_upper | 1;
+        }
+      }
+    }
+  }
+
+  // Exactly preserve session lower to ensure that session ids generated
+  // during the same process lifetime are guaranteed unique.
+  // DBImpl also guarantees (in recent versions) that this is not zero,
+  // so that we can guarantee unique ID is never all zeros. (Can't assert
+  // that here because of testing and old versions.)
+  // We put this first in anticipation of matching a small-ish set of cache
+  // key prefixes to cover entries relevant to any DB.
+  out.ptr[0] = session_lower;
+
+  // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy)
+  // for very high global uniqueness entropy.
+  // (It is possible that many DBs descended from one common DB id are copied
+  // around and proliferate, in which case session id is critical, but it is
+  // more common for different DBs to have different DB ids.)
+  uint64_t db_a, db_b;
+  Hash2x64(db_id.data(), db_id.size(), session_upper, &db_a, &db_b);
+
+  // Xor in file number for guaranteed uniqueness by file number for a given
+  // session and DB id. (Xor slightly better than + here. See
+  // https://github.com/pdillinger/unique_id )
+  out.ptr[1] = db_a ^ file_number;
+
+  // Extra (optional) global uniqueness
+  if (out.extended) {
+    out.ptr[2] = db_b;
+  }
+
+  return Status::OK();
+}
+
+namespace {
+// For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all
+// zeros in first 128 bits to map to itself, so that excluding zero in
+// internal IDs (session_lower != 0 above) does the same for external IDs.
+// These values are meaningless except for making that work.
+constexpr uint64_t kHiOffsetForZero = 17391078804906429400U;
+constexpr uint64_t kLoOffsetForZero = 6417269962128484497U;
+}  // namespace
+
+void InternalUniqueIdToExternal(UniqueIdPtr in_out) {
+  uint64_t hi, lo;
+  BijectiveHash2x64(in_out.ptr[1] + kHiOffsetForZero,
+                    in_out.ptr[0] + kLoOffsetForZero, &hi, &lo);
+  in_out.ptr[0] = lo;
+  in_out.ptr[1] = hi;
+  if (in_out.extended) {
+    in_out.ptr[2] += lo + hi;
+  }
+}
+
+void ExternalUniqueIdToInternal(UniqueIdPtr in_out) {
+  uint64_t lo = in_out.ptr[0];
+  uint64_t hi = in_out.ptr[1];
+  if (in_out.extended) {
+    in_out.ptr[2] -= lo + hi;
+  }
+  BijectiveUnhash2x64(hi, lo, &hi, &lo);
+  in_out.ptr[0] = lo - kLoOffsetForZero;
+  in_out.ptr[1] = hi - kHiOffsetForZero;
+}
+
+std::string EncodeUniqueIdBytes(UniqueIdPtr in) {
+  std::string ret(in.extended ? 24U : 16U, '\0');
+  EncodeFixed64(&ret[0], in.ptr[0]);
+  EncodeFixed64(&ret[8], in.ptr[1]);
+  if (in.extended) {
+    EncodeFixed64(&ret[16], in.ptr[2]);
+  }
+  return ret;
+}
+
+Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out) {
+  if (unique_id.size() != (out.extended ? 24 : 16)) {
+    return Status::NotSupported("Not a valid unique_id");
+  }
+  const char *buf = &unique_id.front();
+  out.ptr[0] = DecodeFixed64(&buf[0]);
+  out.ptr[1] = DecodeFixed64(&buf[8]);
+  if (out.extended) {
+    out.ptr[2] = DecodeFixed64(&buf[16]);
+  }
+  return Status::OK();
+}
+
+template <typename ID>
+Status GetUniqueIdFromTablePropertiesHelper(const TableProperties &props,
+                                            std::string *out_id) {
+  ID tmp{};
+  Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id,
+                                    props.orig_file_number, &tmp);
+  if (s.ok()) {
+    InternalUniqueIdToExternal(&tmp);
+    *out_id = EncodeUniqueIdBytes(&tmp);
+  } else {
+    out_id->clear();
+  }
+  return s;
+}
+
+Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props,
+                                              std::string *out_id) {
+  return GetUniqueIdFromTablePropertiesHelper<UniqueId64x3>(props, out_id);
+}
+
+Status GetUniqueIdFromTableProperties(const TableProperties &props,
+                                      std::string *out_id) {
+  return GetUniqueIdFromTablePropertiesHelper<UniqueId64x2>(props, out_id);
+}
+
+std::string UniqueIdToHumanString(const std::string &id) {
+  // Not so efficient, but that's OK
+  std::string str = Slice(id).ToString(/*hex*/ true);
+  for (size_t i = 16; i < str.size(); i += 17) {
+    str.insert(i, "-");
+  }
+  return str;
+}
+
+std::string InternalUniqueIdToHumanString(UniqueIdPtr in) {
+  std::string str = "{";
+  str += std::to_string(in.ptr[0]);
+  str += ",";
+  str += std::to_string(in.ptr[1]);
+  if (in.extended) {
+    str += ",";
+    str += std::to_string(in.ptr[2]);
+  }
+  str += "}";
+  return str;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/table/unique_id_impl.h b/src/rocksdb/table/unique_id_impl.h
new file mode 100644
index 000000000..6e3dc62c7
--- /dev/null
+++ b/src/rocksdb/table/unique_id_impl.h
@@ -0,0 +1,93 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+
+#include "rocksdb/unique_id.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Standard size unique ID, good enough for almost all practical purposes
+using UniqueId64x2 = std::array<uint64_t, 2>;
+
+// Value never used as an actual unique ID so can be used for "null"
+constexpr UniqueId64x2 kNullUniqueId64x2 = {};
+
+// Extended size unique ID, for extra certainty of uniqueness among SST files
+// spanning many hosts over a long time (rarely if ever needed)
+using UniqueId64x3 = std::array<uint64_t, 3>;
+
+// Value never used as an actual unique ID so can be used for "null"
+constexpr UniqueId64x3 kNullUniqueId64x3 = {};
+
+// Dynamic pointer wrapper for one of the two above
+struct UniqueIdPtr {
+  uint64_t *ptr = nullptr;
+  bool extended = false;
+
+  /*implicit*/ UniqueIdPtr(UniqueId64x2 *id) {
+    ptr = (*id).data();
+    extended = false;
+  }
+  /*implicit*/ UniqueIdPtr(UniqueId64x3 *id) {
+    ptr = (*id).data();
+    extended = true;
+  }
+};
+
+// Helper for GetUniqueIdFromTableProperties. This function can also be used
+// for temporary ids for files without sufficient information in table
+// properties. The internal unique id is more structured than the public
+// unique id, so can be manipulated in more ways but very carefully.
+// These must be long term stable to ensure GetUniqueIdFromTableProperties
+// is long term stable.
+Status GetSstInternalUniqueId(const std::string &db_id,
+                              const std::string &db_session_id,
+                              uint64_t file_number, UniqueIdPtr out,
+                              bool force = false);
+
+// Helper for GetUniqueIdFromTableProperties. External unique ids go through
+// this extra hashing layer so that prefixes of the unique id have predictable
+// "full" entropy. This hashing layer is 1-to-1 on the first 128 bits and on
+// the full 192 bits.
+// This transformation must be long term stable to ensure
+// GetUniqueIdFromTableProperties is long term stable.
+void InternalUniqueIdToExternal(UniqueIdPtr in_out);
+
+// Reverse of InternalUniqueIdToExternal mostly for testing purposes
+// (demonstrably 1-to-1 on the first 128 bits and on the full 192 bits).
+void ExternalUniqueIdToInternal(UniqueIdPtr in_out);
+
+// Convert numerical format to byte format for public API
+std::string EncodeUniqueIdBytes(UniqueIdPtr in);
+
+// Reverse of EncodeUniqueIdBytes.
+Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out);
+
+// For presenting internal IDs for debugging purposes. Visually distinct from
+// UniqueIdToHumanString for external IDs.
+std::string InternalUniqueIdToHumanString(UniqueIdPtr in);
+
+// Reformat a random value down to our "DB session id" format,
+// which is intended to be compact and friendly for use in file names.
+// `lower` is fully preserved and data is lost from `upper`.
+//
+// Detail: Encoded into 20 chars in base-36 ([0-9A-Z]), which is ~103 bits of
+// entropy, which is enough to expect no collisions across a billion servers
+// each opening DBs a million times (~2^50). Benefits vs. RFC-4122 unique id:
+// * Save ~ dozen bytes per SST file
+// * Shorter shared backup file names (some platforms have low limits)
+// * Visually distinct from DB id format (usually RFC-4122)
+std::string EncodeSessionId(uint64_t upper, uint64_t lower);
+
+// Reverse of EncodeSessionId. Returns NotSupported on error rather than
+// Corruption because non-standard session IDs should be allowed with degraded
+// functionality.
+Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
+                       uint64_t *lower);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/test_util/mock_time_env.cc b/src/rocksdb/test_util/mock_time_env.cc
new file mode 100644
index 000000000..23888e69e
--- /dev/null
+++ b/src/rocksdb/test_util/mock_time_env.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/mock_time_env.h"
+
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: this is a workaround for the different behavior on different platform
+// for timedwait timeout. Ideally timedwait API should be moved to env.
+// details: PR #7101.
+void MockSystemClock::InstallTimedWaitFixCallback() {
+#ifndef NDEBUG
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+#ifdef OS_MACOSX
+  // This is an alternate way (vs. SpecialEnv) of dealing with the fact
+  // that on some platforms, pthread_cond_timedwait does not appear to
+  // release the lock for other threads to operate if the deadline time
+  // is already passed. (TimedWait calls are currently a bad abstraction
+  // because the deadline parameter is usually computed from Env time,
+  // but is interpreted in real clock time.)
+  SyncPoint::GetInstance()->SetCallBack(
+      "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+        uint64_t time_us = *reinterpret_cast<uint64_t*>(arg);
+        if (time_us < this->RealNowMicros()) {
+          *reinterpret_cast<uint64_t*>(arg) = this->RealNowMicros() + 1000;
+        }
+      });
+#endif  // OS_MACOSX
+  SyncPoint::GetInstance()->EnableProcessing();
+#endif  // !NDEBUG
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/test_util/mock_time_env.h b/src/rocksdb/test_util/mock_time_env.h
new file mode 100644
index 000000000..7834368e0
--- /dev/null
+++ b/src/rocksdb/test_util/mock_time_env.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <limits>
+
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// NOTE: SpecialEnv offers most of this functionality, along with hooks
+// for safe DB behavior under a mock time environment, so should be used
+// instead of MockSystemClock for DB tests.
+class MockSystemClock : public SystemClockWrapper {
+ public:
+  explicit MockSystemClock(const std::shared_ptr<SystemClock>& base)
+      : SystemClockWrapper(base) {}
+
+  static const char* kClassName() { return "MockSystemClock"; }
+  const char* Name() const override { return kClassName(); }
+  virtual Status GetCurrentTime(int64_t* time_sec) override {
+    assert(time_sec != nullptr);
+    *time_sec = static_cast<int64_t>(current_time_us_ / kMicrosInSecond);
+    return Status::OK();
+  }
+
+  virtual uint64_t NowSeconds() { return current_time_us_ / kMicrosInSecond; }
+
+  virtual uint64_t NowMicros() override { return current_time_us_; }
+
+  virtual uint64_t NowNanos() override {
+    assert(current_time_us_ <= std::numeric_limits<uint64_t>::max() / 1000);
+    return current_time_us_ * 1000;
+  }
+
+  uint64_t RealNowMicros() { return target_->NowMicros(); }
+
+  void SetCurrentTime(uint64_t time_sec) {
+    assert(time_sec < std::numeric_limits<uint64_t>::max() / kMicrosInSecond);
+    assert(time_sec * kMicrosInSecond >= current_time_us_);
+    current_time_us_ = time_sec * kMicrosInSecond;
+  }
+
+  // It's a fake sleep that just updates the Env current time, which is similar
+  // to `NoSleepEnv.SleepForMicroseconds()` and
+  // `SpecialEnv.MockSleepForMicroseconds()`.
+  // It's also similar to `set_current_time()`, which takes an absolute time in
+  // seconds, vs. this one takes the sleep in microseconds.
+  // Note: Not thread safe.
+  void SleepForMicroseconds(int micros) override {
+    assert(micros >= 0);
+    assert(current_time_us_ + static_cast<uint64_t>(micros) >=
+           current_time_us_);
+    current_time_us_.fetch_add(micros);
+  }
+
+  void MockSleepForSeconds(int seconds) {
+    assert(seconds >= 0);
+    uint64_t micros = static_cast<uint64_t>(seconds) * kMicrosInSecond;
+    assert(current_time_us_ + micros >= current_time_us_);
+    current_time_us_.fetch_add(micros);
+  }
+
+  // TODO: this is a workaround for the different behavior on different platform
+  // for timedwait timeout. Ideally timedwait API should be moved to env.
+  // details: PR #7101.
+  void InstallTimedWaitFixCallback();
+
+ private:
+  std::atomic<uint64_t> current_time_us_{0};
+  static constexpr uint64_t kMicrosInSecond = 1000U * 1000U;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/test_util/sync_point.cc b/src/rocksdb/test_util/sync_point.cc
new file mode 100644
index 000000000..bec02d4f6
--- /dev/null
+++ b/src/rocksdb/test_util/sync_point.cc
@@ -0,0 +1,82 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/sync_point.h"
+
+#include <fcntl.h>
+
+#include "test_util/sync_point_impl.h"
+
+std::vector<std::string> rocksdb_kill_exclude_prefixes;
+
+#ifndef NDEBUG
+namespace ROCKSDB_NAMESPACE {
+
+SyncPoint* SyncPoint::GetInstance() {
+  static SyncPoint sync_point;
+  return &sync_point;
+}
+
+SyncPoint::SyncPoint() : impl_(new Data) {}
+
+SyncPoint::~SyncPoint() { delete impl_; }
+
+void SyncPoint::LoadDependency(const std::vector<SyncPointPair>& dependencies) {
+  impl_->LoadDependency(dependencies);
+}
+
+void SyncPoint::LoadDependencyAndMarkers(
+    const std::vector<SyncPointPair>& dependencies,
+    const std::vector<SyncPointPair>& markers) {
+  impl_->LoadDependencyAndMarkers(dependencies, markers);
+}
+
+void SyncPoint::SetCallBack(const std::string& point,
+                            const std::function<void(void*)>& callback) {
+  impl_->SetCallBack(point, callback);
+}
+
+void SyncPoint::ClearCallBack(const std::string& point) {
+  impl_->ClearCallBack(point);
+}
+
+void SyncPoint::ClearAllCallBacks() { impl_->ClearAllCallBacks(); }
+
+void SyncPoint::EnableProcessing() { impl_->EnableProcessing(); }
+
+void SyncPoint::DisableProcessing() { impl_->DisableProcessing(); }
+
+void SyncPoint::ClearTrace() { impl_->ClearTrace(); }
+
+void SyncPoint::Process(const Slice& point, void* cb_arg) {
+  impl_->Process(point, cb_arg);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // NDEBUG
+
+namespace ROCKSDB_NAMESPACE {
+void SetupSyncPointsToMockDirectIO() {
+#if !defined(NDEBUG) && !defined(OS_MACOSX) && !defined(OS_WIN) && \
+    !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD)
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewWritableFile:O_DIRECT", [&](void* arg) {
+        int* val = static_cast<int*>(arg);
+        *val &= ~O_DIRECT;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewRandomAccessFile:O_DIRECT", [&](void* arg) {
+        int* val = static_cast<int*>(arg);
+        *val &= ~O_DIRECT;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewSequentialFile:O_DIRECT", [&](void* arg) {
+        int* val = static_cast<int*>(arg);
+        *val &= ~O_DIRECT;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+#endif
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/test_util/sync_point.h b/src/rocksdb/test_util/sync_point.h
new file mode 100644
index 000000000..65f1239ec
--- /dev/null
+++ b/src/rocksdb/test_util/sync_point.h
@@ -0,0 +1,180 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <assert.h>
+
+#include <functional>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+
+#ifdef NDEBUG
+// empty in release build
+#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight)
+#define TEST_KILL_RANDOM(kill_point)
+#else
+
+namespace ROCKSDB_NAMESPACE {
+
+// To avoid crashing always at some frequently executed codepaths (during
+// kill random test), use this factor to reduce odds
+#define REDUCE_ODDS 2
+#define REDUCE_ODDS2 4
+
+// A class used to pass when a kill point is reached.
+struct KillPoint {
+ public:
+  // This is only set from db_stress.cc and for testing only.
+  // If non-zero, kill at various points in source code with probability 1/this
+  int rocksdb_kill_odds = 0;
+  // If kill point has a prefix on this list, will skip killing.
+  std::vector<std::string> rocksdb_kill_exclude_prefixes;
+  // Kill the process with probability 1/odds for testing.
+  void TestKillRandom(std::string kill_point, int odds,
+                      const std::string& srcfile, int srcline);
+
+  static KillPoint* GetInstance();
+};
+
+#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) \
+  {                                                                        \
+    KillPoint::GetInstance()->TestKillRandom(                              \
+        kill_point, rocksdb_kill_odds_weight, __FILE__, __LINE__);         \
+  }
+#define TEST_KILL_RANDOM(kill_point) TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, 1)
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
+
+#ifdef NDEBUG
+#define TEST_SYNC_POINT(x)
+#define TEST_IDX_SYNC_POINT(x, index)
+#define TEST_SYNC_POINT_CALLBACK(x, y)
+#define INIT_SYNC_POINT_SINGLETONS()
+#else
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class provides facility to reproduce race conditions deterministically
+// in unit tests.
+// Developer could specify sync points in the codebase via TEST_SYNC_POINT.
+// Each sync point represents a position in the execution stream of a thread.
+// In the unit test, 'Happens After' relationship among sync points could be
+// setup via SyncPoint::LoadDependency, to reproduce a desired interleave of
+// threads execution.
+// Refer to (DBTest,TransactionLogIteratorRace), for an example use case.
+
+class SyncPoint {
+ public:
+  static SyncPoint* GetInstance();
+
+  SyncPoint(const SyncPoint&) = delete;
+  SyncPoint& operator=(const SyncPoint&) = delete;
+  ~SyncPoint();
+
+  struct SyncPointPair {
+    std::string predecessor;
+    std::string successor;
+  };
+
+  // call once at the beginning of a test to setup the dependency between
+  // sync points
+  void LoadDependency(const std::vector<SyncPointPair>& dependencies);
+
+  // call once at the beginning of a test to setup the dependency between
+  // sync points and setup markers indicating the successor is only enabled
+  // when it is processed on the same thread as the predecessor.
+  // When adding a marker, it implicitly adds a dependency for the marker pair.
+  void LoadDependencyAndMarkers(const std::vector<SyncPointPair>& dependencies,
+                                const std::vector<SyncPointPair>& markers);
+
+  // The argument to the callback is passed through from
+  // TEST_SYNC_POINT_CALLBACK(); nullptr if TEST_SYNC_POINT or
+  // TEST_IDX_SYNC_POINT was used.
+  void SetCallBack(const std::string& point,
+                   const std::function<void(void*)>& callback);
+
+  // Clear callback function by point
+  void ClearCallBack(const std::string& point);
+
+  // Clear all call back functions.
+  void ClearAllCallBacks();
+
+  // enable sync point processing (disabled on startup)
+  void EnableProcessing();
+
+  // disable sync point processing
+  void DisableProcessing();
+
+  // remove the execution trace of all sync points
+  void ClearTrace();
+
+  // triggered by TEST_SYNC_POINT, blocking execution until all predecessors
+  // are executed.
+  // And/or call registered callback function, with argument `cb_arg`
+  void Process(const Slice& point, void* cb_arg = nullptr);
+
+  // template gets length of const string at compile time,
+  //  avoiding strlen() at runtime
+  template <size_t kLen>
+  void Process(const char (&point)[kLen], void* cb_arg = nullptr) {
+    static_assert(kLen > 0, "Must not be empty");
+    assert(point[kLen - 1] == '\0');
+    Process(Slice(point, kLen - 1), cb_arg);
+  }
+
+  // TODO: it might be useful to provide a function that blocks until all
+  // sync points are cleared.
+
+  // We want this to be public so we can
+  // subclass the implementation
+  struct Data;
+
+ private:
+  // Singleton
+  SyncPoint();
+  Data* impl_;
+};
+
+// Sets up sync points to mock direct IO instead of actually issuing direct IO
+// to the file system.
+void SetupSyncPointsToMockDirectIO();
+}  // namespace ROCKSDB_NAMESPACE
+
+// Use TEST_SYNC_POINT to specify sync points inside code base.
+// Sync points can have happens-after dependency on other sync points,
+// configured at runtime via SyncPoint::LoadDependency. This could be
+// utilized to re-produce race conditions between threads.
+// See TransactionLogIteratorRace in db_test.cc for an example use case.
+// TEST_SYNC_POINT is no op in release build.
+#define TEST_SYNC_POINT(x) \
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->Process(x)
+#define TEST_IDX_SYNC_POINT(x, index)                      \
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->Process(x + \
+                                                       std::to_string(index))
+#define TEST_SYNC_POINT_CALLBACK(x, y) \
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->Process(x, y)
+#define INIT_SYNC_POINT_SINGLETONS() \
+  (void)ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
+#endif  // NDEBUG
+
+// Callback sync point for any read IO errors that should be ignored by
+// the fault injection framework
+// Disable in release mode
+#ifdef NDEBUG
+#define IGNORE_STATUS_IF_ERROR(_status_)
+#else
+#define IGNORE_STATUS_IF_ERROR(_status_)            \
+  {                                                 \
+    if (!_status_.ok()) {                           \
+      TEST_SYNC_POINT("FaultInjectionIgnoreError"); \
+    }                                               \
+  }
+#endif  // NDEBUG
diff --git a/src/rocksdb/test_util/sync_point_impl.cc b/src/rocksdb/test_util/sync_point_impl.cc
new file mode 100644
index 000000000..2a4bd3ccd
--- /dev/null
+++ b/src/rocksdb/test_util/sync_point_impl.cc
@@ -0,0 +1,152 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/sync_point_impl.h"
+
+#ifndef NDEBUG
+namespace ROCKSDB_NAMESPACE {
+KillPoint* KillPoint::GetInstance() {
+  static KillPoint kp;
+  return &kp;
+}
+
+void KillPoint::TestKillRandom(std::string kill_point, int odds_weight,
+                               const std::string& srcfile, int srcline) {
+  if (rocksdb_kill_odds <= 0) {
+    return;
+  }
+  int odds = rocksdb_kill_odds * odds_weight;
+  for (auto& p : rocksdb_kill_exclude_prefixes) {
+    if (kill_point.substr(0, p.length()) == p) {
+      return;
+    }
+  }
+
+  assert(odds > 0);
+  if (odds % 7 == 0) {
+    // class Random uses multiplier 16807, which is 7^5. If odds are
+    // multiplier of 7, there might be limited values generated.
+    odds++;
+  }
+  auto* r = Random::GetTLSInstance();
+  bool crash = r->OneIn(odds);
+  if (crash) {
+    port::Crash(srcfile, srcline);
+  }
+}
+
+void SyncPoint::Data::LoadDependency(
+    const std::vector<SyncPointPair>& dependencies) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  successors_.clear();
+  predecessors_.clear();
+  cleared_points_.clear();
+  for (const auto& dependency : dependencies) {
+    successors_[dependency.predecessor].push_back(dependency.successor);
+    predecessors_[dependency.successor].push_back(dependency.predecessor);
+    point_filter_.Add(dependency.successor);
+    point_filter_.Add(dependency.predecessor);
+  }
+  cv_.notify_all();
+}
+
+void SyncPoint::Data::LoadDependencyAndMarkers(
+    const std::vector<SyncPointPair>& dependencies,
+    const std::vector<SyncPointPair>& markers) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  successors_.clear();
+  predecessors_.clear();
+  cleared_points_.clear();
+  markers_.clear();
+  marked_thread_id_.clear();
+  for (const auto& dependency : dependencies) {
+    successors_[dependency.predecessor].push_back(dependency.successor);
+    predecessors_[dependency.successor].push_back(dependency.predecessor);
+    point_filter_.Add(dependency.successor);
+    point_filter_.Add(dependency.predecessor);
+  }
+  for (const auto& marker : markers) {
+    successors_[marker.predecessor].push_back(marker.successor);
+    predecessors_[marker.successor].push_back(marker.predecessor);
+    markers_[marker.predecessor].push_back(marker.successor);
+    point_filter_.Add(marker.predecessor);
+    point_filter_.Add(marker.successor);
+  }
+  cv_.notify_all();
+}
+
+bool SyncPoint::Data::PredecessorsAllCleared(const std::string& point) {
+  for (const auto& pred : predecessors_[point]) {
+    if (cleared_points_.count(pred) == 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void SyncPoint::Data::ClearCallBack(const std::string& point) {
+  std::unique_lock<std::mutex> lock(mutex_);
+  while (num_callbacks_running_ > 0) {
+    cv_.wait(lock);
+  }
+  callbacks_.erase(point);
+}
+
+void SyncPoint::Data::ClearAllCallBacks() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  while (num_callbacks_running_ > 0) {
+    cv_.wait(lock);
+  }
+  callbacks_.clear();
+}
+
+void SyncPoint::Data::Process(const Slice& point, void* cb_arg) {
+  if (!enabled_) {
+    return;
+  }
+
+  // Use a filter to prevent mutex lock if possible.
+  if (!point_filter_.MayContain(point)) {
+    return;
+  }
+
+  // Must convert to std::string for remaining work.  Take
+  //  heap hit.
+  std::string point_string(point.ToString());
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto thread_id = std::this_thread::get_id();
+
+  auto marker_iter = markers_.find(point_string);
+  if (marker_iter != markers_.end()) {
+    for (auto& marked_point : marker_iter->second) {
+      marked_thread_id_.emplace(marked_point, thread_id);
+      point_filter_.Add(marked_point);
+    }
+  }
+
+  if (DisabledByMarker(point_string, thread_id)) {
+    return;
+  }
+
+  while (!PredecessorsAllCleared(point_string)) {
+    cv_.wait(lock);
+    if (DisabledByMarker(point_string, thread_id)) {
+      return;
+    }
+  }
+
+  auto callback_pair = callbacks_.find(point_string);
+  if (callback_pair != callbacks_.end()) {
+    num_callbacks_running_++;
+    mutex_.unlock();
+    callback_pair->second(cb_arg);
+    mutex_.lock();
+    num_callbacks_running_--;
+  }
+  cleared_points_.insert(point_string);
+  cv_.notify_all();
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/test_util/sync_point_impl.h b/src/rocksdb/test_util/sync_point_impl.h
new file mode 100644
index 000000000..64cc0445e
--- /dev/null
+++ b/src/rocksdb/test_util/sync_point_impl.h
@@ -0,0 +1,96 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <assert.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "memory/concurrent_arena.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/dynamic_bloom.h"
+#include "util/random.h"
+
+#pragma once
+
+#ifndef NDEBUG
+namespace ROCKSDB_NAMESPACE {
+// A hacky allocator for single use.
+// Arena depends on SyncPoint and create circular dependency.
+class SingleAllocator : public Allocator {
+ public:
+  char* Allocate(size_t) override {
+    assert(false);
+    return nullptr;
+  }
+  char* AllocateAligned(size_t bytes, size_t, Logger*) override {
+    buf_.resize(bytes);
+    return const_cast<char*>(buf_.data());
+  }
+  size_t BlockSize() const override {
+    assert(false);
+    return 0;
+  }
+
+ private:
+  std::string buf_;
+};
+
+struct SyncPoint::Data {
+  Data() : point_filter_(&alloc_, /*total_bits=*/8192), enabled_(false) {}
+  // Enable proper deletion by subclasses
+  virtual ~Data() {}
+  // successor/predecessor map loaded from LoadDependency
+  std::unordered_map<std::string, std::vector<std::string>> successors_;
+  std::unordered_map<std::string, std::vector<std::string>> predecessors_;
+  std::unordered_map<std::string, std::function<void(void*)>> callbacks_;
+  std::unordered_map<std::string, std::vector<std::string>> markers_;
+  std::unordered_map<std::string, std::thread::id> marked_thread_id_;
+
+  std::mutex mutex_;
+  std::condition_variable cv_;
+  // sync points that have been passed through
+  std::unordered_set<std::string> cleared_points_;
+  SingleAllocator alloc_;
+  // A filter before holding mutex to speed up process.
+  DynamicBloom point_filter_;
+  std::atomic<bool> enabled_;
+  int num_callbacks_running_ = 0;
+
+  void LoadDependency(const std::vector<SyncPointPair>& dependencies);
+  void LoadDependencyAndMarkers(const std::vector<SyncPointPair>& dependencies,
+                                const std::vector<SyncPointPair>& markers);
+  bool PredecessorsAllCleared(const std::string& point);
+  void SetCallBack(const std::string& point,
+                   const std::function<void(void*)>& callback) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    callbacks_[point] = callback;
+    point_filter_.Add(point);
+  }
+
+  void ClearCallBack(const std::string& point);
+  void ClearAllCallBacks();
+  void EnableProcessing() { enabled_ = true; }
+  void DisableProcessing() { enabled_ = false; }
+  void ClearTrace() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    cleared_points_.clear();
+  }
+  bool DisabledByMarker(const std::string& point, std::thread::id thread_id) {
+    auto marked_point_iter = marked_thread_id_.find(point);
+    return marked_point_iter != marked_thread_id_.end() &&
+           thread_id != marked_point_iter->second;
+  }
+  void Process(const Slice& point, void* cb_arg);
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // NDEBUG
diff --git a/src/rocksdb/test_util/testharness.cc b/src/rocksdb/test_util/testharness.cc
new file mode 100644
index 000000000..3c7b835d2
--- /dev/null
+++ b/src/rocksdb/test_util/testharness.cc
@@ -0,0 +1,107 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "test_util/testharness.h"
+
+#include <regex>
+#include <string>
+#include <thread>
+
+namespace ROCKSDB_NAMESPACE {
+namespace test {
+
+#ifdef OS_WIN
+#include <windows.h>
+
+std::string GetPidStr() { return std::to_string(GetCurrentProcessId()); }
+#else
+std::string GetPidStr() { return std::to_string(getpid()); }
+#endif
+
+::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s) {
+  if (s.ok()) {
+    return ::testing::AssertionSuccess();
+  } else {
+    return ::testing::AssertionFailure() << s_expr << std::endl << s.ToString();
+  }
+}
+
+std::string TmpDir(Env* env) {
+  std::string dir;
+  Status s = env->GetTestDirectory(&dir);
+  EXPECT_OK(s);
+  return dir;
+}
+
+std::string PerThreadDBPath(std::string dir, std::string name) {
+  size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
+  return dir + "/" + name + "_" + GetPidStr() + "_" + std::to_string(tid);
+}
+
+std::string PerThreadDBPath(std::string name) {
+  return PerThreadDBPath(test::TmpDir(), name);
+}
+
+std::string PerThreadDBPath(Env* env, std::string name) {
+  return PerThreadDBPath(test::TmpDir(env), name);
+}
+
+int RandomSeed() {
+  const char* env = getenv("TEST_RANDOM_SEED");
+  int result = (env != nullptr ? atoi(env) : 301);
+  if (result <= 0) {
+    result = 301;
+  }
+  return result;
+}
+
+TestRegex::TestRegex(const std::string& pattern)
+    : impl_(std::make_shared<Impl>(pattern)), pattern_(pattern) {}
+TestRegex::TestRegex(const char* pattern)
+    : impl_(std::make_shared<Impl>(pattern)), pattern_(pattern) {}
+
+const std::string& TestRegex::GetPattern() const { return pattern_; }
+
+class TestRegex::Impl : public std::regex {
+ public:
+  using std::regex::basic_regex;
+};
+
+bool TestRegex::Matches(const std::string& str) const {
+  if (impl_) {
+    return std::regex_match(str, *impl_);
+  } else {
+    // Should not call Matches on unset Regex
+    assert(false);
+    return false;
+  }
+}
+
+::testing::AssertionResult AssertMatchesRegex(const char* str_expr,
+                                              const char* pattern_expr,
+                                              const std::string& str,
+                                              const TestRegex& pattern) {
+  if (pattern.Matches(str)) {
+    return ::testing::AssertionSuccess();
+  } else if (TestRegex("\".*\"").Matches(pattern_expr)) {
+    // constant regex string
+    return ::testing::AssertionFailure()
+           << str << " (" << str_expr << ")" << std::endl
+           << "does not match regex " << pattern.GetPattern();
+  } else {
+    // runtime regex string
+    return ::testing::AssertionFailure()
+           << str << " (" << str_expr << ")" << std::endl
+           << "does not match regex" << std::endl
+           << pattern.GetPattern() << " (" << pattern_expr << ")";
+  }
+}
+
+}  // namespace test
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/test_util/testharness.h b/src/rocksdb/test_util/testharness.h
new file mode 100644
index 000000000..69018629a
--- /dev/null
+++ b/src/rocksdb/test_util/testharness.h
@@ -0,0 +1,119 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifdef OS_AIX
+#include "gtest/gtest.h"
+#else
+#include <gtest/gtest.h>
+#endif
+
+// A "skipped" test has a specific meaning in Facebook infrastructure: the
+// test is in good shape and should be run, but something about the
+// compilation or execution environment means the test cannot be run.
+// Specifically, there is a hole in intended testing if any
+// parameterization of a test (e.g. Foo/FooTest.Bar/42) is skipped for all
+// tested build configurations/platforms/etc.
+//
+// If GTEST_SKIP is available, use it. Otherwise, define skip as success.
+//
+// The GTEST macros do not seem to print the message, even with -verbose,
+// so these print to stderr. Note that these do not exit the test themselves;
+// calling code should 'return' or similar from the test.
+#ifdef GTEST_SKIP_
+#define ROCKSDB_GTEST_SKIP(m)          \
+  do {                                 \
+    fputs("SKIPPED: " m "\n", stderr); \
+    GTEST_SKIP_(m);                    \
+  } while (false) /* user ; */
+#else
+#define ROCKSDB_GTEST_SKIP(m)          \
+  do {                                 \
+    fputs("SKIPPED: " m "\n", stderr); \
+    GTEST_SUCCESS_("SKIPPED: " m);     \
+  } while (false) /* user ; */
+#endif
+
+// We add "bypass" as an alternative to ROCKSDB_GTEST_SKIP that is allowed to
+// be a permanent condition, e.g. for intentionally omitting or disabling some
+// parameterizations for some tests. (Use _DISABLED at the end of the test
+// name to disable an entire test.)
+#define ROCKSDB_GTEST_BYPASS(m)         \
+  do {                                  \
+    fputs("BYPASSED: " m "\n", stderr); \
+    GTEST_SUCCESS_("BYPASSED: " m);     \
+  } while (false) /* user ; */
+
+#include <string>
+
+#include "port/stack_trace.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace test {
+
+// Return the directory to use for temporary storage.
+std::string TmpDir(Env* env = Env::Default());
+
+// A path unique within the thread
+std::string PerThreadDBPath(std::string name);
+std::string PerThreadDBPath(Env* env, std::string name);
+std::string PerThreadDBPath(std::string dir, std::string name);
+
+// Return a randomization seed for this run.  Typically returns the
+// same number on repeated invocations of this binary, but automated
+// runs may be able to vary the seed.
+int RandomSeed();
+
+::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s);
+
+#define ASSERT_OK(s) \
+  ASSERT_PRED_FORMAT1(ROCKSDB_NAMESPACE::test::AssertStatus, s)
+#define ASSERT_NOK(s) ASSERT_FALSE((s).ok())
+#define EXPECT_OK(s) \
+  EXPECT_PRED_FORMAT1(ROCKSDB_NAMESPACE::test::AssertStatus, s)
+#define EXPECT_NOK(s) EXPECT_FALSE((s).ok())
+
+// Useful for testing
+// * No need to deal with Status like in Regex public API
+// * No triggering lint reports on use of std::regex in tests
+// * Available in LITE (unlike public API)
+class TestRegex {
+ public:
+  // These throw on bad pattern
+  /*implicit*/ TestRegex(const std::string& pattern);
+  /*implicit*/ TestRegex(const char* pattern);
+
+  // Checks that the whole of str is matched by this regex
+  bool Matches(const std::string& str) const;
+
+  const std::string& GetPattern() const;
+
+ private:
+  class Impl;
+  std::shared_ptr<Impl> impl_;  // shared_ptr for simple implementation
+  std::string pattern_;
+};
+
+::testing::AssertionResult AssertMatchesRegex(const char* str_expr,
+                                              const char* pattern_expr,
+                                              const std::string& str,
+                                              const TestRegex& pattern);
+
+#define ASSERT_MATCHES_REGEX(str, pattern) \
+  ASSERT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern)
+#define EXPECT_MATCHES_REGEX(str, pattern) \
+  EXPECT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern)
+
+}  // namespace test
+
+using test::TestRegex;
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/test_util/testutil.cc b/src/rocksdb/test_util/testutil.cc
new file mode 100644
index 000000000..5e1b909f9
--- /dev/null
+++ b/src/rocksdb/test_util/testutil.cc
@@ -0,0 +1,738 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "test_util/testutil.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include <array>
+#include <cctype>
+#include <fstream>
+#include <sstream>
+
+#include "db/memtable_list.h"
+#include "env/composite_env_wrapper.h"
+#include "file/random_access_file_reader.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/mock_time_env.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+#ifndef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+namespace test {
+
+const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version;
+const std::set<uint32_t> kFooterFormatVersionsToTest{
+    5U,
+    // In case any interesting future changes
+    kDefaultFormatVersion,
+    kLatestFormatVersion,
+};
+
+std::string RandomKey(Random* rnd, int len, RandomKeyType type) {
+  // Make sure to generate a wide variety of characters so we
+  // test the boundary conditions for short-key optimizations.
+  static const char kTestChars[] = {'\0', '\1', 'a',    'b',    'c',
+                                    'd',  'e',  '\xfd', '\xfe', '\xff'};
+  std::string result;
+  for (int i = 0; i < len; i++) {
+    std::size_t indx = 0;
+    switch (type) {
+      case RandomKeyType::RANDOM:
+        indx = rnd->Uniform(sizeof(kTestChars));
+        break;
+      case RandomKeyType::LARGEST:
+        indx = sizeof(kTestChars) - 1;
+        break;
+      case RandomKeyType::MIDDLE:
+        indx = sizeof(kTestChars) / 2;
+        break;
+      case RandomKeyType::SMALLEST:
+        indx = 0;
+        break;
+    }
+    result += kTestChars[indx];
+  }
+  return result;
+}
+
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst) {
+  int raw = static_cast<int>(len * compressed_fraction);
+  if (raw < 1) raw = 1;
+  std::string raw_data = rnd->RandomString(raw);
+
+  // Duplicate the random data until we have filled "len" bytes
+  dst->clear();
+  while (dst->size() < (unsigned int)len) {
+    dst->append(raw_data);
+  }
+  dst->resize(len);
+  return Slice(*dst);
+}
+
+namespace {
+class Uint64ComparatorImpl : public Comparator {
+ public:
+  Uint64ComparatorImpl() {}
+
+  const char* Name() const override { return "rocksdb.Uint64Comparator"; }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    assert(a.size() == sizeof(uint64_t) && b.size() == sizeof(uint64_t));
+    const uint64_t* left = reinterpret_cast<const uint64_t*>(a.data());
+    const uint64_t* right = reinterpret_cast<const uint64_t*>(b.data());
+    uint64_t leftValue;
+    uint64_t rightValue;
+    GetUnaligned(left, &leftValue);
+    GetUnaligned(right, &rightValue);
+    if (leftValue == rightValue) {
+      return 0;
+    } else if (leftValue < rightValue) {
+      return -1;
+    } else {
+      return 1;
+    }
+  }
+
+  void FindShortestSeparator(std::string* /*start*/,
+                             const Slice& /*limit*/) const override {
+    return;
+  }
+
+  void FindShortSuccessor(std::string* /*key*/) const override { return; }
+};
+}  // namespace
+
+const Comparator* Uint64Comparator() {
+  static Uint64ComparatorImpl uint64comp;
+  return &uint64comp;
+}
+
+const Comparator* BytewiseComparatorWithU64TsWrapper() {
+  ConfigOptions config_options;
+  const Comparator* user_comparator = nullptr;
+  Status s = Comparator::CreateFromString(
+      config_options, "leveldb.BytewiseComparator.u64ts", &user_comparator);
+  s.PermitUncheckedError();
+  return user_comparator;
+}
+
+void CorruptKeyType(InternalKey* ikey) {
+  std::string keystr = ikey->Encode().ToString();
+  keystr[keystr.size() - 8] = kTypeLogData;
+  ikey->DecodeFrom(Slice(keystr.data(), keystr.size()));
+}
+
+std::string KeyStr(const std::string& user_key, const SequenceNumber& seq,
+                   const ValueType& t, bool corrupt) {
+  InternalKey k(user_key, seq, t);
+  if (corrupt) {
+    CorruptKeyType(&k);
+  }
+  return k.Encode().ToString();
+}
+
+std::string KeyStr(uint64_t ts, const std::string& user_key,
+                   const SequenceNumber& seq, const ValueType& t,
+                   bool corrupt) {
+  std::string user_key_with_ts(user_key);
+  std::string ts_str;
+  PutFixed64(&ts_str, ts);
+  user_key_with_ts.append(ts_str);
+  return KeyStr(user_key_with_ts, seq, t, corrupt);
+}
+
+bool SleepingBackgroundTask::TimedWaitUntilSleeping(uint64_t wait_time) {
+  auto abs_time = SystemClock::Default()->NowMicros() + wait_time;
+  MutexLock l(&mutex_);
+  while (!sleeping_ || !should_sleep_) {
+    if (bg_cv_.TimedWait(abs_time)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool SleepingBackgroundTask::TimedWaitUntilDone(uint64_t wait_time) {
+  auto abs_time = SystemClock::Default()->NowMicros() + wait_time;
+  MutexLock l(&mutex_);
+  while (!done_with_sleep_) {
+    if (bg_cv_.TimedWait(abs_time)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+std::string RandomName(Random* rnd, const size_t len) {
+  std::stringstream ss;
+  for (size_t i = 0; i < len; ++i) {
+    ss << static_cast<char>(rnd->Uniform(26) + 'a');
+  }
+  return ss.str();
+}
+
+CompressionType RandomCompressionType(Random* rnd) {
+  auto ret = static_cast<CompressionType>(rnd->Uniform(6));
+  while (!CompressionTypeSupported(ret)) {
+    ret = static_cast<CompressionType>((static_cast<int>(ret) + 1) % 6);
+  }
+  return ret;
+}
+
+void RandomCompressionTypeVector(const size_t count,
+                                 std::vector<CompressionType>* types,
+                                 Random* rnd) {
+  types->clear();
+  for (size_t i = 0; i < count; ++i) {
+    types->emplace_back(RandomCompressionType(rnd));
+  }
+}
+
+const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined) {
+  int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(4);
+  switch (random_num) {
+    case 0:
+      return NewFixedPrefixTransform(rnd->Uniform(20) + 1);
+    case 1:
+      return NewCappedPrefixTransform(rnd->Uniform(20) + 1);
+    case 2:
+      return NewNoopTransform();
+    default:
+      return nullptr;
+  }
+}
+
+BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) {
+  BlockBasedTableOptions opt;
+  opt.cache_index_and_filter_blocks = rnd->Uniform(2);
+  opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2);
+  opt.pin_top_level_index_and_filter = rnd->Uniform(2);
+  using IndexType = BlockBasedTableOptions::IndexType;
+  const std::array<IndexType, 4> index_types = {
+      {IndexType::kBinarySearch, IndexType::kHashSearch,
+       IndexType::kTwoLevelIndexSearch, IndexType::kBinarySearchWithFirstKey}};
+  opt.index_type =
+      index_types[rnd->Uniform(static_cast<int>(index_types.size()))];
+  opt.checksum = static_cast<ChecksumType>(rnd->Uniform(3));
+  opt.block_size = rnd->Uniform(10000000);
+  opt.block_size_deviation = rnd->Uniform(100);
+  opt.block_restart_interval = rnd->Uniform(100);
+  opt.index_block_restart_interval = rnd->Uniform(100);
+  opt.whole_key_filtering = rnd->Uniform(2);
+
+  return opt;
+}
+
+TableFactory* RandomTableFactory(Random* rnd, int pre_defined) {
+#ifndef ROCKSDB_LITE
+  int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(4);
+  switch (random_num) {
+    case 0:
+      return NewPlainTableFactory();
+    case 1:
+      return NewCuckooTableFactory();
+    default:
+      return NewBlockBasedTableFactory();
+  }
+#else
+  (void)rnd;
+  (void)pre_defined;
+  return NewBlockBasedTableFactory();
+#endif  // !ROCKSDB_LITE
+}
+
+MergeOperator* RandomMergeOperator(Random* rnd) {
+  return new ChanglingMergeOperator(RandomName(rnd, 10));
+}
+
+CompactionFilter* RandomCompactionFilter(Random* rnd) {
+  return new ChanglingCompactionFilter(RandomName(rnd, 10));
+}
+
+CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd) {
+  return new ChanglingCompactionFilterFactory(RandomName(rnd, 10));
+}
+
+void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
+  // boolean options
+  db_opt->advise_random_on_open = rnd->Uniform(2);
+  db_opt->allow_mmap_reads = rnd->Uniform(2);
+  db_opt->allow_mmap_writes = rnd->Uniform(2);
+  db_opt->use_direct_reads = rnd->Uniform(2);
+  db_opt->use_direct_io_for_flush_and_compaction = rnd->Uniform(2);
+  db_opt->create_if_missing = rnd->Uniform(2);
+  db_opt->create_missing_column_families = rnd->Uniform(2);
+  db_opt->enable_thread_tracking = rnd->Uniform(2);
+  db_opt->error_if_exists = rnd->Uniform(2);
+  db_opt->is_fd_close_on_exec = rnd->Uniform(2);
+  db_opt->paranoid_checks = rnd->Uniform(2);
+  db_opt->track_and_verify_wals_in_manifest = rnd->Uniform(2);
+  db_opt->verify_sst_unique_id_in_manifest = rnd->Uniform(2);
+  db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
+  db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2);
+  db_opt->use_adaptive_mutex = rnd->Uniform(2);
+  db_opt->use_fsync = rnd->Uniform(2);
+  db_opt->recycle_log_file_num = rnd->Uniform(2);
+  db_opt->avoid_flush_during_recovery = rnd->Uniform(2);
+  db_opt->avoid_flush_during_shutdown = rnd->Uniform(2);
+  db_opt->enforce_single_del_contracts = rnd->Uniform(2);
+
+  // int options
+  db_opt->max_background_compactions = rnd->Uniform(100);
+  db_opt->max_background_flushes = rnd->Uniform(100);
+  db_opt->max_file_opening_threads = rnd->Uniform(100);
+  db_opt->max_open_files = rnd->Uniform(100);
+  db_opt->table_cache_numshardbits = rnd->Uniform(100);
+
+  // size_t options
+  db_opt->db_write_buffer_size = rnd->Uniform(10000);
+  db_opt->keep_log_file_num = rnd->Uniform(10000);
+  db_opt->log_file_time_to_roll = rnd->Uniform(10000);
+  db_opt->manifest_preallocation_size = rnd->Uniform(10000);
+  db_opt->max_log_file_size = rnd->Uniform(10000);
+
+  // std::string options
+  db_opt->db_log_dir = "path/to/db_log_dir";
+  db_opt->wal_dir = "path/to/wal_dir";
+
+  // uint32_t options
+  db_opt->max_subcompactions = rnd->Uniform(100000);
+
+  // uint64_t options
+  static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX);
+  db_opt->WAL_size_limit_MB = uint_max + rnd->Uniform(100000);
+  db_opt->WAL_ttl_seconds = uint_max + rnd->Uniform(100000);
+  db_opt->bytes_per_sync = uint_max + rnd->Uniform(100000);
+  db_opt->delayed_write_rate = uint_max + rnd->Uniform(100000);
+  db_opt->delete_obsolete_files_period_micros = uint_max + rnd->Uniform(100000);
+  db_opt->max_manifest_file_size = uint_max + rnd->Uniform(100000);
+  db_opt->max_total_wal_size = uint_max + rnd->Uniform(100000);
+  db_opt->wal_bytes_per_sync = uint_max + rnd->Uniform(100000);
+
+  // unsigned int options
+  db_opt->stats_dump_period_sec = rnd->Uniform(100000);
+}
+
+void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options,
+                         Random* rnd) {
+  cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4));
+
+  // boolean options
+  cf_opt->report_bg_io_stats = rnd->Uniform(2);
+  cf_opt->disable_auto_compactions = rnd->Uniform(2);
+  cf_opt->inplace_update_support = rnd->Uniform(2);
+  cf_opt->level_compaction_dynamic_level_bytes = rnd->Uniform(2);
+  cf_opt->optimize_filters_for_hits = rnd->Uniform(2);
+  cf_opt->paranoid_file_checks = rnd->Uniform(2);
+  cf_opt->force_consistency_checks = rnd->Uniform(2);
+  cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2);
+  cf_opt->memtable_whole_key_filtering = rnd->Uniform(2);
+  cf_opt->enable_blob_files = rnd->Uniform(2);
+  cf_opt->enable_blob_garbage_collection = rnd->Uniform(2);
+
+  // double options
+  cf_opt->memtable_prefix_bloom_size_ratio =
+      static_cast<double>(rnd->Uniform(10000)) / 20000.0;
+  cf_opt->blob_garbage_collection_age_cutoff = rnd->Uniform(10000) / 10000.0;
+  cf_opt->blob_garbage_collection_force_threshold =
+      rnd->Uniform(10000) / 10000.0;
+
+  // int options
+  cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100);
+  cf_opt->level0_slowdown_writes_trigger = rnd->Uniform(100);
+  cf_opt->level0_stop_writes_trigger = rnd->Uniform(100);
+  cf_opt->max_bytes_for_level_multiplier = rnd->Uniform(100);
+  cf_opt->max_write_buffer_number = rnd->Uniform(100);
+  cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100);
+  cf_opt->max_write_buffer_size_to_maintain = rnd->Uniform(10000);
+  cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100);
+  cf_opt->num_levels = rnd->Uniform(100);
+  cf_opt->target_file_size_multiplier = rnd->Uniform(100);
+
+  // vector int options
+  cf_opt->max_bytes_for_level_multiplier_additional.resize(cf_opt->num_levels);
+  for (int i = 0; i < cf_opt->num_levels; i++) {
+    cf_opt->max_bytes_for_level_multiplier_additional[i] = rnd->Uniform(100);
+  }
+
+  // size_t options
+  cf_opt->arena_block_size = rnd->Uniform(10000);
+  cf_opt->inplace_update_num_locks = rnd->Uniform(10000);
+  cf_opt->max_successive_merges = rnd->Uniform(10000);
+  cf_opt->memtable_huge_page_size = rnd->Uniform(10000);
+  cf_opt->write_buffer_size = rnd->Uniform(10000);
+
+  // uint32_t options
+  cf_opt->bloom_locality = rnd->Uniform(10000);
+  cf_opt->max_bytes_for_level_base = rnd->Uniform(10000);
+
+  // uint64_t options
+  static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX);
+  cf_opt->ttl =
+      db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0;
+  cf_opt->periodic_compaction_seconds =
+      db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0;
+  cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000);
+  cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000);
+  cf_opt->max_compaction_bytes =
+      cf_opt->target_file_size_base * rnd->Uniform(100);
+  cf_opt->compaction_options_fifo.max_table_files_size =
+      uint_max + rnd->Uniform(10000);
+  cf_opt->min_blob_size = uint_max + rnd->Uniform(10000);
+  cf_opt->blob_file_size = uint_max + rnd->Uniform(10000);
+  cf_opt->blob_compaction_readahead_size = uint_max + rnd->Uniform(10000);
+
+  // pointer typed options
+  cf_opt->prefix_extractor.reset(RandomSliceTransform(rnd));
+  cf_opt->table_factory.reset(RandomTableFactory(rnd));
+  cf_opt->merge_operator.reset(RandomMergeOperator(rnd));
+  if (cf_opt->compaction_filter) {
+    delete cf_opt->compaction_filter;
+  }
+  cf_opt->compaction_filter = RandomCompactionFilter(rnd);
+  cf_opt->compaction_filter_factory.reset(RandomCompactionFilterFactory(rnd));
+
+  // custom typed options
+  cf_opt->compression = RandomCompressionType(rnd);
+  RandomCompressionTypeVector(cf_opt->num_levels,
+                              &cf_opt->compression_per_level, rnd);
+  cf_opt->blob_compression_type = RandomCompressionType(rnd);
+}
+
+bool IsDirectIOSupported(Env* env, const std::string& dir) {
+  EnvOptions env_options;
+  env_options.use_mmap_writes = false;
+  env_options.use_direct_writes = true;
+  std::string tmp = TempFileName(dir, 999);
+  Status s;
+  {
+    std::unique_ptr<WritableFile> file;
+    s = env->NewWritableFile(tmp, &file, env_options);
+  }
+  if (s.ok()) {
+    s = env->DeleteFile(tmp);
+  }
+  return s.ok();
+}
+
+bool IsPrefetchSupported(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& dir) {
+  bool supported = false;
+  std::string tmp = TempFileName(dir, 999);
+  Random rnd(301);
+  std::string test_string = rnd.RandomString(4096);
+  Slice data(test_string);
+  Status s = WriteStringToFile(fs.get(), data, tmp, true);
+  if (s.ok()) {
+    std::unique_ptr<FSRandomAccessFile> file;
+    auto io_s = fs->NewRandomAccessFile(tmp, FileOptions(), &file, nullptr);
+    if (io_s.ok()) {
+      supported = !(file->Prefetch(0, data.size(), IOOptions(), nullptr)
+                        .IsNotSupported());
+    }
+    s = fs->DeleteFile(tmp, IOOptions(), nullptr);
+  }
+  return s.ok() && supported;
+}
+
+size_t GetLinesCount(const std::string& fname, const std::string& pattern) {
+  std::stringstream ssbuf;
+  std::string line;
+  size_t count = 0;
+
+  std::ifstream inFile(fname.c_str());
+  ssbuf << inFile.rdbuf();
+
+  while (getline(ssbuf, line)) {
+    if (line.find(pattern) != std::string::npos) {
+      count++;
+    }
+  }
+
+  return count;
+}
+
+Status CorruptFile(Env* env, const std::string& fname, int offset,
+                   int bytes_to_corrupt, bool verify_checksum /*=true*/) {
+  uint64_t size;
+  Status s = env->GetFileSize(fname, &size);
+  if (!s.ok()) {
+    return s;
+  } else if (offset < 0) {
+    // Relative to end of file; make it absolute
+    if (-offset > static_cast<int>(size)) {
+      offset = 0;
+    } else {
+      offset = static_cast<int>(size + offset);
+    }
+  }
+  if (offset > static_cast<int>(size)) {
+    offset = static_cast<int>(size);
+  }
+  if (offset + bytes_to_corrupt > static_cast<int>(size)) {
+    bytes_to_corrupt = static_cast<int>(size - offset);
+  }
+
+  // Do it
+  std::string contents;
+  s = ReadFileToString(env, fname, &contents);
+  if (s.ok()) {
+    for (int i = 0; i < bytes_to_corrupt; i++) {
+      contents[i + offset] ^= 0x80;
+    }
+    s = WriteStringToFile(env, contents, fname);
+  }
+  if (s.ok() && verify_checksum) {
+#ifndef ROCKSDB_LITE
+    Options options;
+    options.env = env;
+    EnvOptions env_options;
+    Status v = VerifySstFileChecksum(options, env_options, fname);
+    assert(!v.ok());
+#endif
+  }
+  return s;
+}
+
+Status TruncateFile(Env* env, const std::string& fname, uint64_t new_length) {
+  uint64_t old_length;
+  Status s = env->GetFileSize(fname, &old_length);
+  if (!s.ok() || new_length == old_length) {
+    return s;
+  }
+  // Do it
+  std::string contents;
+  s = ReadFileToString(env, fname, &contents);
+  if (s.ok()) {
+    contents.resize(static_cast<size_t>(new_length), 'b');
+    s = WriteStringToFile(env, contents, fname);
+  }
+  return s;
+}
+
+// Try and delete a directory if it exists
+Status TryDeleteDir(Env* env, const std::string& dirname) {
+  bool is_dir = false;
+  Status s = env->IsDirectory(dirname, &is_dir);
+  if (s.ok() && is_dir) {
+    s = env->DeleteDir(dirname);
+  }
+  return s;
+}
+
+// Delete a directory if it exists
+void DeleteDir(Env* env, const std::string& dirname) {
+  TryDeleteDir(env, dirname).PermitUncheckedError();
+}
+
+Status CreateEnvFromSystem(const ConfigOptions& config_options, Env** result,
+                           std::shared_ptr<Env>* guard) {
+  const char* env_uri = getenv("TEST_ENV_URI");
+  const char* fs_uri = getenv("TEST_FS_URI");
+  if (env_uri || fs_uri) {
+    return Env::CreateFromUri(config_options,
+                              (env_uri != nullptr) ? env_uri : "",
+                              (fs_uri != nullptr) ? fs_uri : "", result, guard);
+  } else {
+    // Neither specified.  Use the default
+    *result = config_options.env;
+    guard->reset();
+    return Status::OK();
+  }
+}
+namespace {
+// A hacky skip list mem table that triggers flush after number of entries.
+class SpecialMemTableRep : public MemTableRep {
+ public:
+  explicit SpecialMemTableRep(Allocator* allocator, MemTableRep* memtable,
+                              int num_entries_flush)
+      : MemTableRep(allocator),
+        memtable_(memtable),
+        num_entries_flush_(num_entries_flush),
+        num_entries_(0) {}
+
+  virtual KeyHandle Allocate(const size_t len, char** buf) override {
+    return memtable_->Allocate(len, buf);
+  }
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  virtual void Insert(KeyHandle handle) override {
+    num_entries_++;
+    memtable_->Insert(handle);
+  }
+
+  void InsertConcurrently(KeyHandle handle) override {
+    num_entries_++;
+    memtable_->Insert(handle);
+  }
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  virtual bool Contains(const char* key) const override {
+    return memtable_->Contains(key);
+  }
+
+  virtual size_t ApproximateMemoryUsage() override {
+    // Return a high memory usage when number of entries exceeds the threshold
+    // to trigger a flush.
+    return (num_entries_ < num_entries_flush_) ? 0 : 1024 * 1024 * 1024;
+  }
+
+  virtual void Get(const LookupKey& k, void* callback_args,
+                   bool (*callback_func)(void* arg,
+                                         const char* entry)) override {
+    memtable_->Get(k, callback_args, callback_func);
+  }
+
+  uint64_t ApproximateNumEntries(const Slice& start_ikey,
+                                 const Slice& end_ikey) override {
+    return memtable_->ApproximateNumEntries(start_ikey, end_ikey);
+  }
+
+  virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
+    return memtable_->GetIterator(arena);
+  }
+
+  virtual ~SpecialMemTableRep() override {}
+
+ private:
+  std::unique_ptr<MemTableRep> memtable_;
+  int num_entries_flush_;
+  int num_entries_;
+};
+class SpecialSkipListFactory : public MemTableRepFactory {
+ public:
+#ifndef ROCKSDB_LITE
+  static bool Register(ObjectLibrary& library, const std::string& /*arg*/) {
+    library.AddFactory<MemTableRepFactory>(
+        ObjectLibrary::PatternEntry(SpecialSkipListFactory::kClassName(), true)
+            .AddNumber(":"),
+        [](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
+           std::string* /* errmsg */) {
+          auto colon = uri.find(":");
+          if (colon != std::string::npos) {
+            auto count = ParseInt(uri.substr(colon + 1));
+            guard->reset(new SpecialSkipListFactory(count));
+          } else {
+            guard->reset(new SpecialSkipListFactory(2));
+          }
+          return guard->get();
+        });
+    return true;
+  }
+#endif  // ROCKSDB_LITE
+  // After number of inserts exceeds `num_entries_flush` in a mem table, trigger
+  // flush.
+  explicit SpecialSkipListFactory(int num_entries_flush)
+      : num_entries_flush_(num_entries_flush) {}
+
+  using MemTableRepFactory::CreateMemTableRep;
+  virtual MemTableRep* CreateMemTableRep(
+      const MemTableRep::KeyComparator& compare, Allocator* allocator,
+      const SliceTransform* transform, Logger* /*logger*/) override {
+    return new SpecialMemTableRep(
+        allocator,
+        factory_.CreateMemTableRep(compare, allocator, transform, nullptr),
+        num_entries_flush_);
+  }
+  static const char* kClassName() { return "SpecialSkipListFactory"; }
+  virtual const char* Name() const override { return kClassName(); }
+  std::string GetId() const override {
+    std::string id = Name();
+    if (num_entries_flush_ > 0) {
+      id.append(":").append(std::to_string(num_entries_flush_));
+    }
+    return id;
+  }
+
+  bool IsInsertConcurrentlySupported() const override {
+    return factory_.IsInsertConcurrentlySupported();
+  }
+
+ private:
+  SkipListFactory factory_;
+  int num_entries_flush_;
+};
+}  // namespace
+
+MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush) {
+  RegisterTestLibrary();
+  return new SpecialSkipListFactory(num_entries_per_flush);
+}
+
+#ifndef ROCKSDB_LITE
+// This method loads existing test classes into the ObjectRegistry
+int RegisterTestObjects(ObjectLibrary& library, const std::string& arg) {
+  size_t num_types;
+  library.AddFactory<const Comparator>(
+      test::SimpleSuffixReverseComparator::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const Comparator>* /*guard*/,
+         std::string* /* errmsg */) {
+        static test::SimpleSuffixReverseComparator ssrc;
+        return &ssrc;
+      });
+  SpecialSkipListFactory::Register(library, arg);
+  library.AddFactory<MergeOperator>(
+      "Changling",
+      [](const std::string& uri, std::unique_ptr<MergeOperator>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new test::ChanglingMergeOperator(uri));
+        return guard->get();
+      });
+  library.AddFactory<CompactionFilter>(
+      "Changling",
+      [](const std::string& uri, std::unique_ptr<CompactionFilter>* /*guard*/,
+         std::string* /* errmsg */) {
+        return new test::ChanglingCompactionFilter(uri);
+      });
+  library.AddFactory<CompactionFilterFactory>(
+      "Changling", [](const std::string& uri,
+                      std::unique_ptr<CompactionFilterFactory>* guard,
+                      std::string* /* errmsg */) {
+        guard->reset(new test::ChanglingCompactionFilterFactory(uri));
+        return guard->get();
+      });
+  library.AddFactory<SystemClock>(
+      MockSystemClock::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<SystemClock>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MockSystemClock(SystemClock::Default()));
+        return guard->get();
+      });
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+
+#endif  // ROCKSDB_LITE
+
+void RegisterTestLibrary(const std::string& arg) {
+  static bool registered = false;
+  if (!registered) {
+    registered = true;
+#ifndef ROCKSDB_LITE
+    ObjectRegistry::Default()->AddLibrary("test", RegisterTestObjects, arg);
+#else
+    (void)arg;
+#endif  // ROCKSDB_LITE
+  }
+}
+}  // namespace test
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/test_util/testutil.h b/src/rocksdb/test_util/testutil.h
new file mode 100644
index 000000000..1f43156ab
--- /dev/null
+++ b/src/rocksdb/test_util/testutil.h
@@ -0,0 +1,852 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table.h"
+#include "table/internal_iterator.h"
+#include "util/mutexlock.h"
+
+#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+extern "C" {
+void RegisterCustomObjects(int argc, char** argv);
+}
+#else
+void RegisterCustomObjects(int argc, char** argv);
+#endif  // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
+
+namespace ROCKSDB_NAMESPACE {
+class FileSystem;
+class MemTableRepFactory;
+class ObjectLibrary;
+class Random;
+class SequentialFile;
+class SequentialFileReader;
+
+namespace test {
+
+extern const uint32_t kDefaultFormatVersion;
+extern const std::set<uint32_t> kFooterFormatVersionsToTest;
+
+// Return a random key with the specified length that may contain interesting
+// characters (e.g. \x00, \xff, etc.).
+enum RandomKeyType : char { RANDOM, LARGEST, SMALLEST, MIDDLE };
+extern std::string RandomKey(Random* rnd, int len,
+                             RandomKeyType type = RandomKeyType::RANDOM);
+
+// Store in *dst a string of length "len" that will compress to
+// "N*compressed_fraction" bytes and return a Slice that references
+// the generated data.
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst);
+
+#ifndef NDEBUG
+// An internal comparator that just forward comparing results from the
+// user comparator in it. Can be used to test entities that have no dependency
+// on internal key structure but consumes InternalKeyComparator, like
+// BlockBasedTable.
+class PlainInternalKeyComparator : public InternalKeyComparator {
+ public:
+  explicit PlainInternalKeyComparator(const Comparator* c)
+      : InternalKeyComparator(c) {}
+
+  virtual ~PlainInternalKeyComparator() {}
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    return user_comparator()->Compare(a, b);
+  }
+};
+#endif
+
+// A test comparator which compare two strings in this way:
+// (1) first compare prefix of 8 bytes in alphabet order,
+// (2) if two strings share the same prefix, sort the other part of the string
+//     in the reverse alphabet order.
+// This helps simulate the case of compounded key of [entity][timestamp] and
+// latest timestamp first.
+class SimpleSuffixReverseComparator : public Comparator {
+ public:
+  SimpleSuffixReverseComparator() {}
+  static const char* kClassName() { return "SimpleSuffixReverseComparator"; }
+  virtual const char* Name() const override { return kClassName(); }
+
+  virtual int Compare(const Slice& a, const Slice& b) const override {
+    Slice prefix_a = Slice(a.data(), 8);
+    Slice prefix_b = Slice(b.data(), 8);
+    int prefix_comp = prefix_a.compare(prefix_b);
+    if (prefix_comp != 0) {
+      return prefix_comp;
+    } else {
+      Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
+      Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
+      return -(suffix_a.compare(suffix_b));
+    }
+  }
+  virtual void FindShortestSeparator(std::string* /*start*/,
+                                     const Slice& /*limit*/) const override {}
+
+  virtual void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+// Returns a user key comparator that can be used for comparing two uint64_t
+// slices. Instead of comparing slices byte-wise, it compares all the 8 bytes
+// at once. Assumes same endian-ness is used though the database's lifetime.
+// Symantics of comparison would differ from Bytewise comparator in little
+// endian machines.
+extern const Comparator* Uint64Comparator();
+
+// A wrapper api for getting the ComparatorWithU64Ts<BytewiseComparator>
+extern const Comparator* BytewiseComparatorWithU64TsWrapper();
+
+class StringSink : public FSWritableFile {
+ public:
+  std::string contents_;
+
+  explicit StringSink(Slice* reader_contents = nullptr)
+      : FSWritableFile(),
+        contents_(""),
+        reader_contents_(reader_contents),
+        last_flush_(0) {
+    if (reader_contents_ != nullptr) {
+      *reader_contents_ = Slice(contents_.data(), 0);
+    }
+  }
+
+  const std::string& contents() const { return contents_; }
+
+  IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/,
+                    IODebugContext* /*dbg*/) override {
+    contents_.resize(static_cast<size_t>(size));
+    return IOStatus::OK();
+  }
+  IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+  IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+    if (reader_contents_ != nullptr) {
+      assert(reader_contents_->size() <= last_flush_);
+      size_t offset = last_flush_ - reader_contents_->size();
+      *reader_contents_ =
+          Slice(contents_.data() + offset, contents_.size() - offset);
+      last_flush_ = contents_.size();
+    }
+
+    return IOStatus::OK();
+  }
+  IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  using FSWritableFile::Append;
+  IOStatus Append(const Slice& slice, const IOOptions& /*opts*/,
+                  IODebugContext* /*dbg*/) override {
+    contents_.append(slice.data(), slice.size());
+    return IOStatus::OK();
+  }
+  void Drop(size_t bytes) {
+    if (reader_contents_ != nullptr) {
+      contents_.resize(contents_.size() - bytes);
+      *reader_contents_ =
+          Slice(reader_contents_->data(), reader_contents_->size() - bytes);
+      last_flush_ = contents_.size();
+    }
+  }
+
+ private:
+  Slice* reader_contents_;
+  size_t last_flush_;
+};
+
+// A wrapper around a StringSink to give it a RandomRWFile interface
+class RandomRWStringSink : public FSRandomRWFile {
+ public:
+  explicit RandomRWStringSink(StringSink* ss) : ss_(ss) {}
+
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& /*opts*/,
+                 IODebugContext* /*dbg*/) override {
+    if (offset + data.size() > ss_->contents_.size()) {
+      ss_->contents_.resize(static_cast<size_t>(offset) + data.size(), '\0');
+    }
+
+    char* pos = const_cast<char*>(ss_->contents_.data() + offset);
+    memcpy(pos, data.data(), data.size());
+    return IOStatus::OK();
+  }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/,
+                Slice* result, char* /*scratch*/,
+                IODebugContext* /*dbg*/) const override {
+    *result = Slice(nullptr, 0);
+    if (offset < ss_->contents_.size()) {
+      size_t str_res_sz =
+          std::min(static_cast<size_t>(ss_->contents_.size() - offset), n);
+      *result = Slice(ss_->contents_.data() + offset, str_res_sz);
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  const std::string& contents() const { return ss_->contents(); }
+
+ private:
+  StringSink* ss_;
+};
+
+// Like StringSink, this writes into a string.  Unlink StringSink, it
+// has some initial content and overwrites it, just like a recycled
+// log file.
+class OverwritingStringSink : public FSWritableFile {
+ public:
+  explicit OverwritingStringSink(Slice* reader_contents)
+      : FSWritableFile(),
+        contents_(""),
+        reader_contents_(reader_contents),
+        last_flush_(0) {}
+
+  const std::string& contents() const { return contents_; }
+
+  IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/,
+                    IODebugContext* /*dbg*/) override {
+    contents_.resize(static_cast<size_t>(size));
+    return IOStatus::OK();
+  }
+  IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+  IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+    if (last_flush_ < contents_.size()) {
+      assert(reader_contents_->size() >= contents_.size());
+      memcpy((char*)reader_contents_->data() + last_flush_,
+             contents_.data() + last_flush_, contents_.size() - last_flush_);
+      last_flush_ = contents_.size();
+    }
+    return IOStatus::OK();
+  }
+  IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+    return IOStatus::OK();
+  }
+
+  using FSWritableFile::Append;
+  IOStatus Append(const Slice& slice, const IOOptions& /*opts*/,
+                  IODebugContext* /*dbg*/) override {
+    contents_.append(slice.data(), slice.size());
+    return IOStatus::OK();
+  }
+  void Drop(size_t bytes) {
+    contents_.resize(contents_.size() - bytes);
+    if (last_flush_ > contents_.size()) last_flush_ = contents_.size();
+  }
+
+ private:
+  std::string contents_;
+  Slice* reader_contents_;
+  size_t last_flush_;
+};
+
+class StringSource : public FSRandomAccessFile {
+ public:
+  explicit StringSource(const Slice& contents, uint64_t uniq_id = 0,
+                        bool mmap = false)
+      : contents_(contents.data(), contents.size()),
+        uniq_id_(uniq_id),
+        mmap_(mmap),
+        total_reads_(0) {}
+
+  virtual ~StringSource() {}
+
+  uint64_t Size() const { return contents_.size(); }
+
+  IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    // If we are using mmap_, it is equivalent to performing a prefetch
+    if (mmap_) {
+      return IOStatus::OK();
+    } else {
+      return IOStatus::NotSupported("Prefetch not supported");
+    }
+  }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/,
+                Slice* result, char* scratch,
+                IODebugContext* /*dbg*/) const override {
+    total_reads_++;
+    if (offset > contents_.size()) {
+      return IOStatus::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - static_cast<size_t>(offset);
+    }
+    if (!mmap_) {
+      memcpy(scratch, &contents_[static_cast<size_t>(offset)], n);
+      *result = Slice(scratch, n);
+    } else {
+      *result = Slice(&contents_[static_cast<size_t>(offset)], n);
+    }
+    return IOStatus::OK();
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    if (max_size < 20) {
+      return 0;
+    }
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, uniq_id_);
+    rid = EncodeVarint64(rid, 0);
+    return static_cast<size_t>(rid - id);
+  }
+
+  int total_reads() const { return total_reads_; }
+
+  void set_total_reads(int tr) { total_reads_ = tr; }
+
+ private:
+  std::string contents_;
+  uint64_t uniq_id_;
+  bool mmap_;
+  mutable int total_reads_;
+};
+
+class NullLogger : public Logger {
+ public:
+  using Logger::Logv;
+  virtual void Logv(const char* /*format*/, va_list /*ap*/) override {}
+  virtual size_t GetLogFileSize() const override { return 0; }
+};
+
+// Corrupts key by changing the type
+extern void CorruptKeyType(InternalKey* ikey);
+
+extern std::string KeyStr(const std::string& user_key,
+                          const SequenceNumber& seq, const ValueType& t,
+                          bool corrupt = false);
+
+extern std::string KeyStr(uint64_t ts, const std::string& user_key,
+                          const SequenceNumber& seq, const ValueType& t,
+                          bool corrupt = false);
+
+class SleepingBackgroundTask {
+ public:
+  SleepingBackgroundTask()
+      : bg_cv_(&mutex_),
+        should_sleep_(true),
+        done_with_sleep_(false),
+        sleeping_(false) {}
+
+  bool IsSleeping() {
+    MutexLock l(&mutex_);
+    return sleeping_;
+  }
+  void DoSleep() {
+    MutexLock l(&mutex_);
+    sleeping_ = true;
+    bg_cv_.SignalAll();
+    while (should_sleep_) {
+      bg_cv_.Wait();
+    }
+    sleeping_ = false;
+    done_with_sleep_ = true;
+    bg_cv_.SignalAll();
+  }
+  void WaitUntilSleeping() {
+    MutexLock l(&mutex_);
+    while (!sleeping_ || !should_sleep_) {
+      bg_cv_.Wait();
+    }
+  }
+  // Waits for the status to change to sleeping,
+  // otherwise times out.
+  // wait_time is in microseconds.
+  // Returns true when times out, false otherwise.
+  bool TimedWaitUntilSleeping(uint64_t wait_time);
+
+  void WakeUp() {
+    MutexLock l(&mutex_);
+    should_sleep_ = false;
+    bg_cv_.SignalAll();
+  }
+  void WaitUntilDone() {
+    MutexLock l(&mutex_);
+    while (!done_with_sleep_) {
+      bg_cv_.Wait();
+    }
+  }
+  // Similar to TimedWaitUntilSleeping.
+  // Waits until the task is done.
+  bool TimedWaitUntilDone(uint64_t wait_time);
+
+  bool WokenUp() {
+    MutexLock l(&mutex_);
+    return should_sleep_ == false;
+  }
+
+  void Reset() {
+    MutexLock l(&mutex_);
+    should_sleep_ = true;
+    done_with_sleep_ = false;
+  }
+
+  static void DoSleepTask(void* arg) {
+    reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep();
+  }
+
+ private:
+  port::Mutex mutex_;
+  port::CondVar bg_cv_;  // Signalled when background work finishes
+  bool should_sleep_;
+  bool done_with_sleep_;
+  bool sleeping_;
+};
+
+// Filters merge operands and values that are equal to `num`.
+class FilterNumber : public CompactionFilter {
+ public:
+  explicit FilterNumber(uint64_t num) : num_(num) {}
+
+  std::string last_merge_operand_key() { return last_merge_operand_key_; }
+
+  bool Filter(int /*level*/, const ROCKSDB_NAMESPACE::Slice& /*key*/,
+              const ROCKSDB_NAMESPACE::Slice& value, std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    if (value.size() == sizeof(uint64_t)) {
+      return num_ == DecodeFixed64(value.data());
+    }
+    return true;
+  }
+
+  bool FilterMergeOperand(
+      int /*level*/, const ROCKSDB_NAMESPACE::Slice& key,
+      const ROCKSDB_NAMESPACE::Slice& value) const override {
+    last_merge_operand_key_ = key.ToString();
+    if (value.size() == sizeof(uint64_t)) {
+      return num_ == DecodeFixed64(value.data());
+    }
+    return true;
+  }
+
+  const char* Name() const override { return "FilterBadMergeOperand"; }
+
+ private:
+  mutable std::string last_merge_operand_key_;
+  uint64_t num_;
+};
+
+inline std::string EncodeInt(uint64_t x) {
+  std::string result;
+  PutFixed64(&result, x);
+  return result;
+}
+
+class SeqStringSource : public FSSequentialFile {
+ public:
+  SeqStringSource(const std::string& data, std::atomic<int>* read_count)
+      : data_(data), offset_(0), read_count_(read_count) {}
+  ~SeqStringSource() override {}
+  IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result,
+                char* scratch, IODebugContext* /*dbg*/) override {
+    std::string output;
+    if (offset_ < data_.size()) {
+      n = std::min(data_.size() - offset_, n);
+      memcpy(scratch, data_.data() + offset_, n);
+      offset_ += n;
+      *result = Slice(scratch, n);
+    } else {
+      return IOStatus::InvalidArgument(
+          "Attempt to read when it already reached eof.");
+    }
+    (*read_count_)++;
+    return IOStatus::OK();
+  }
+
+  IOStatus Skip(uint64_t n) override {
+    if (offset_ >= data_.size()) {
+      return IOStatus::InvalidArgument(
+          "Attempt to read when it already reached eof.");
+    }
+    // TODO(yhchiang): Currently doesn't handle the overflow case.
+    offset_ += static_cast<size_t>(n);
+    return IOStatus::OK();
+  }
+
+ private:
+  std::string data_;
+  size_t offset_;
+  std::atomic<int>* read_count_;
+};
+
+class StringFS : public FileSystemWrapper {
+ public:
+  class StringSink : public FSWritableFile {
+   public:
+    explicit StringSink(std::string* contents)
+        : FSWritableFile(), contents_(contents) {}
+    IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/,
+                      IODebugContext* /*dbg*/) override {
+      contents_->resize(static_cast<size_t>(size));
+      return IOStatus::OK();
+    }
+    IOStatus Close(const IOOptions& /*opts*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Flush(const IOOptions& /*opts*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+
+    using FSWritableFile::Append;
+    IOStatus Append(const Slice& slice, const IOOptions& /*opts*/,
+                    IODebugContext* /*dbg*/) override {
+      contents_->append(slice.data(), slice.size());
+      return IOStatus::OK();
+    }
+
+   private:
+    std::string* contents_;
+  };
+
+  explicit StringFS(const std::shared_ptr<FileSystem>& t)
+      : FileSystemWrapper(t) {}
+  ~StringFS() override {}
+
+  static const char* kClassName() { return "StringFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  const std::string& GetContent(const std::string& f) { return files_[f]; }
+
+  const IOStatus WriteToNewFile(const std::string& file_name,
+                                const std::string& content) {
+    std::unique_ptr<FSWritableFile> r;
+    FileOptions file_opts;
+    IOOptions io_opts;
+
+    auto s = NewWritableFile(file_name, file_opts, &r, nullptr);
+    if (s.ok()) {
+      s = r->Append(content, io_opts, nullptr);
+    }
+    if (s.ok()) {
+      s = r->Flush(io_opts, nullptr);
+    }
+    if (s.ok()) {
+      s = r->Close(io_opts, nullptr);
+    }
+    assert(!s.ok() || files_[file_name] == content);
+    return s;
+  }
+
+  // The following text is boilerplate that forwards all methods to target()
+  IOStatus NewSequentialFile(const std::string& f,
+                             const FileOptions& /*options*/,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* /*dbg*/) override {
+    auto iter = files_.find(f);
+    if (iter == files_.end()) {
+      return IOStatus::NotFound("The specified file does not exist", f);
+    }
+    r->reset(new SeqStringSource(iter->second, &num_seq_file_read_));
+    return IOStatus::OK();
+  }
+
+  IOStatus NewRandomAccessFile(const std::string& /*f*/,
+                               const FileOptions& /*options*/,
+                               std::unique_ptr<FSRandomAccessFile>* /*r*/,
+                               IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& /*options*/,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* /*dbg*/) override {
+    auto iter = files_.find(f);
+    if (iter != files_.end()) {
+      return IOStatus::IOError("The specified file already exists", f);
+    }
+    r->reset(new StringSink(&files_[f]));
+    return IOStatus::OK();
+  }
+  IOStatus NewDirectory(const std::string& /*name*/,
+                        const IOOptions& /*options*/,
+                        std::unique_ptr<FSDirectory>* /*result*/,
+                        IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus FileExists(const std::string& f, const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    if (files_.find(f) == files_.end()) {
+      return IOStatus::NotFound();
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus GetChildren(const std::string& /*dir*/, const IOOptions& /*options*/,
+                       std::vector<std::string>* /*r*/,
+                       IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    files_.erase(f);
+    return IOStatus::OK();
+  }
+
+  IOStatus CreateDir(const std::string& /*d*/, const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus CreateDirIfMissing(const std::string& /*d*/,
+                              const IOOptions& /*options*/,
+                              IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus DeleteDir(const std::string& /*d*/, const IOOptions& /*options*/,
+                     IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/,
+                       uint64_t* s, IODebugContext* /*dbg*/) override {
+    auto iter = files_.find(f);
+    if (iter == files_.end()) {
+      return IOStatus::NotFound("The specified file does not exist:", f);
+    }
+    *s = iter->second.size();
+    return IOStatus::OK();
+  }
+
+  IOStatus GetFileModificationTime(const std::string& /*fname*/,
+                                   const IOOptions& /*options*/,
+                                   uint64_t* /*file_mtime*/,
+                                   IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus RenameFile(const std::string& /*s*/, const std::string& /*t*/,
+                      const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus LinkFile(const std::string& /*s*/, const std::string& /*t*/,
+                    const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus LockFile(const std::string& /*f*/, const IOOptions& /*options*/,
+                    FileLock** /*l*/, IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  IOStatus UnlockFile(FileLock* /*l*/, const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+    return IOStatus::NotSupported();
+  }
+
+  std::atomic<int> num_seq_file_read_;
+
+ protected:
+  std::unordered_map<std::string, std::string> files_;
+};
+
+// Randomly initialize the given DBOptions
+void RandomInitDBOptions(DBOptions* db_opt, Random* rnd);
+
+// Randomly initialize the given ColumnFamilyOptions
+// Note that the caller is responsible for releasing non-null
+// cf_opt->compaction_filter.
+void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions&, Random* rnd);
+
+// A dummy merge operator which can change its name
+class ChanglingMergeOperator : public MergeOperator {
+ public:
+  explicit ChanglingMergeOperator(const std::string& name)
+      : name_(name + "MergeOperator") {}
+  ~ChanglingMergeOperator() {}
+
+  void SetName(const std::string& name) { name_ = name; }
+
+  virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                           MergeOperationOutput* /*merge_out*/) const override {
+    return false;
+  }
+  virtual bool PartialMergeMulti(const Slice& /*key*/,
+                                 const std::deque<Slice>& /*operand_list*/,
+                                 std::string* /*new_value*/,
+                                 Logger* /*logger*/) const override {
+    return false;
+  }
+  static const char* kClassName() { return "ChanglingMergeOperator"; }
+  const char* NickName() const override { return kNickName(); }
+  static const char* kNickName() { return "Changling"; }
+  bool IsInstanceOf(const std::string& id) const override {
+    if (id == kClassName()) {
+      return true;
+    } else {
+      return MergeOperator::IsInstanceOf(id);
+    }
+  }
+
+  virtual const char* Name() const override { return name_.c_str(); }
+
+ protected:
+  std::string name_;
+};
+
+// Returns a dummy merge operator with random name.
+MergeOperator* RandomMergeOperator(Random* rnd);
+
+// A dummy compaction filter which can change its name
+class ChanglingCompactionFilter : public CompactionFilter {
+ public:
+  explicit ChanglingCompactionFilter(const std::string& name)
+      : name_(name + "CompactionFilter") {}
+  ~ChanglingCompactionFilter() {}
+
+  void SetName(const std::string& name) { name_ = name; }
+
+  bool Filter(int /*level*/, const Slice& /*key*/,
+              const Slice& /*existing_value*/, std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return false;
+  }
+
+  static const char* kClassName() { return "ChanglingCompactionFilter"; }
+  const char* NickName() const override { return kNickName(); }
+  static const char* kNickName() { return "Changling"; }
+
+  bool IsInstanceOf(const std::string& id) const override {
+    if (id == kClassName()) {
+      return true;
+    } else {
+      return CompactionFilter::IsInstanceOf(id);
+    }
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+ private:
+  std::string name_;
+};
+
+// Returns a dummy compaction filter with a random name.
+CompactionFilter* RandomCompactionFilter(Random* rnd);
+
+// A dummy compaction filter factory which can change its name
+class ChanglingCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ChanglingCompactionFilterFactory(const std::string& name)
+      : name_(name + "CompactionFilterFactory") {}
+  ~ChanglingCompactionFilterFactory() {}
+
+  void SetName(const std::string& name) { name_ = name; }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>();
+  }
+
+  // Returns a name that identifies this compaction filter factory.
+  const char* Name() const override { return name_.c_str(); }
+  static const char* kClassName() { return "ChanglingCompactionFilterFactory"; }
+  const char* NickName() const override { return kNickName(); }
+  static const char* kNickName() { return "Changling"; }
+
+  bool IsInstanceOf(const std::string& id) const override {
+    if (id == kClassName()) {
+      return true;
+    } else {
+      return CompactionFilterFactory::IsInstanceOf(id);
+    }
+  }
+
+ protected:
+  std::string name_;
+};
+
+// The factory for the hacky skip list mem table that triggers flush after
+// number of entries exceeds a threshold.
+extern MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush);
+
+CompressionType RandomCompressionType(Random* rnd);
+
+void RandomCompressionTypeVector(const size_t count,
+                                 std::vector<CompressionType>* types,
+                                 Random* rnd);
+
+CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd);
+
+const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined = -1);
+
+TableFactory* RandomTableFactory(Random* rnd, int pre_defined = -1);
+
+std::string RandomName(Random* rnd, const size_t len);
+
+bool IsDirectIOSupported(Env* env, const std::string& dir);
+
+bool IsPrefetchSupported(const std::shared_ptr<FileSystem>& fs,
+                         const std::string& dir);
+
+// Return the number of lines where a given pattern was found in a file.
+size_t GetLinesCount(const std::string& fname, const std::string& pattern);
+
+Status CorruptFile(Env* env, const std::string& fname, int offset,
+                   int bytes_to_corrupt, bool verify_checksum = true);
+Status TruncateFile(Env* env, const std::string& fname, uint64_t length);
+
+// Try and delete a directory if it exists
+Status TryDeleteDir(Env* env, const std::string& dirname);
+
+// Delete a directory if it exists
+void DeleteDir(Env* env, const std::string& dirname);
+
+// Creates an Env from the system environment by looking at the system
+// environment variables.
+Status CreateEnvFromSystem(const ConfigOptions& options, Env** result,
+                           std::shared_ptr<Env>* guard);
+
+#ifndef ROCKSDB_LITE
+// Registers the testutil classes with the ObjectLibrary
+int RegisterTestObjects(ObjectLibrary& library, const std::string& /*arg*/);
+#endif  // ROCKSDB_LITE
+
+// Register the testutil classes with the default ObjectRegistry/Library
+void RegisterTestLibrary(const std::string& arg = "");
+}  // namespace test
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/test_util/testutil_test.cc b/src/rocksdb/test_util/testutil_test.cc
new file mode 100644
index 000000000..41f26e389
--- /dev/null
+++ b/src/rocksdb/test_util/testutil_test.cc
@@ -0,0 +1,43 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testutil.h"
+
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CreateFile(Env* env, const std::string& path) {
+  std::unique_ptr<WritableFile> f;
+  ASSERT_OK(env->NewWritableFile(path, &f, EnvOptions()));
+  f->Close();
+}
+
+TEST(TestUtil, DestroyDirRecursively) {
+  auto env = Env::Default();
+  // test_util/file
+  //          /dir
+  //          /dir/file
+  std::string test_dir = test::PerThreadDBPath("test_util");
+  ASSERT_OK(env->CreateDir(test_dir));
+  CreateFile(env, test_dir + "/file");
+  ASSERT_OK(env->CreateDir(test_dir + "/dir"));
+  CreateFile(env, test_dir + "/dir/file");
+
+  ASSERT_OK(DestroyDir(env, test_dir));
+  auto s = env->FileExists(test_dir);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/test_util/transaction_test_util.cc b/src/rocksdb/test_util/transaction_test_util.cc
new file mode 100644
index 000000000..99286d836
--- /dev/null
+++ b/src/rocksdb/test_util/transaction_test_util.cc
@@ -0,0 +1,402 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "test_util/transaction_test_util.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <numeric>
+#include <random>
+#include <string>
+#include <thread>
+
+#include "db/dbformat.h"
+#include "db/snapshot_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+RandomTransactionInserter::RandomTransactionInserter(
+    Random64* rand, const WriteOptions& write_options,
+    const ReadOptions& read_options, uint64_t num_keys, uint16_t num_sets,
+    const uint64_t cmt_delay_ms, const uint64_t first_id)
+    : rand_(rand),
+      write_options_(write_options),
+      read_options_(read_options),
+      num_keys_(num_keys),
+      num_sets_(num_sets),
+      txn_id_(first_id),
+      cmt_delay_ms_(cmt_delay_ms) {}
+
+RandomTransactionInserter::~RandomTransactionInserter() {
+  if (txn_ != nullptr) {
+    delete txn_;
+  }
+  if (optimistic_txn_ != nullptr) {
+    delete optimistic_txn_;
+  }
+}
+
+bool RandomTransactionInserter::TransactionDBInsert(
+    TransactionDB* db, const TransactionOptions& txn_options) {
+  txn_ = db->BeginTransaction(write_options_, txn_options, txn_);
+
+  std::hash<std::thread::id> hasher;
+  char name[64];
+  snprintf(name, 64, "txn%" ROCKSDB_PRIszt "-%" PRIu64,
+           hasher(std::this_thread::get_id()), txn_id_++);
+  assert(strlen(name) < 64 - 1);
+  assert(txn_->SetName(name).ok());
+
+  // Take a snapshot if set_snapshot was not set or with 50% change otherwise
+  bool take_snapshot = txn_->GetSnapshot() == nullptr || rand_->OneIn(2);
+  if (take_snapshot) {
+    txn_->SetSnapshot();
+    read_options_.snapshot = txn_->GetSnapshot();
+  }
+  auto res = DoInsert(db, txn_, false);
+  if (take_snapshot) {
+    read_options_.snapshot = nullptr;
+  }
+  return res;
+}
+
+bool RandomTransactionInserter::OptimisticTransactionDBInsert(
+    OptimisticTransactionDB* db,
+    const OptimisticTransactionOptions& txn_options) {
+  optimistic_txn_ =
+      db->BeginTransaction(write_options_, txn_options, optimistic_txn_);
+
+  return DoInsert(db, optimistic_txn_, true);
+}
+
+bool RandomTransactionInserter::DBInsert(DB* db) {
+  return DoInsert(db, nullptr, false);
+}
+
+Status RandomTransactionInserter::DBGet(
+    DB* db, Transaction* txn, ReadOptions& read_options, uint16_t set_i,
+    uint64_t ikey, bool get_for_update, uint64_t* int_value,
+    std::string* full_key, bool* unexpected_error) {
+  Status s;
+  // Five digits (since the largest uint16_t is 65535) plus the NUL
+  // end char.
+  char prefix_buf[6] = {0};
+  // Pad prefix appropriately so we can iterate over each set
+  assert(set_i + 1 <= 9999);
+  snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", set_i + 1);
+  // key format:  [SET#][random#]
+  std::string skey = std::to_string(ikey);
+  Slice base_key(skey);
+  *full_key = std::string(prefix_buf) + base_key.ToString();
+  Slice key(*full_key);
+
+  std::string value;
+  if (txn != nullptr) {
+    if (get_for_update) {
+      s = txn->GetForUpdate(read_options, key, &value);
+    } else {
+      s = txn->Get(read_options, key, &value);
+    }
+  } else {
+    s = db->Get(read_options, key, &value);
+  }
+
+  if (s.ok()) {
+    // Found key, parse its value
+    *int_value = std::stoull(value);
+    if (*int_value == 0 || *int_value == ULONG_MAX) {
+      *unexpected_error = true;
+      fprintf(stderr, "Get returned unexpected value: %s\n", value.c_str());
+      s = Status::Corruption();
+    }
+  } else if (s.IsNotFound()) {
+    // Have not yet written to this key, so assume its value is 0
+    *int_value = 0;
+    s = Status::OK();
+  }
+  return s;
+}
+
+bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn,
+                                         bool is_optimistic) {
+  Status s;
+  WriteBatch batch;
+
+  // pick a random number to use to increment a key in each set
+  uint64_t incr = (rand_->Next() % 100) + 1;
+  bool unexpected_error = false;
+
+  std::vector<uint16_t> set_vec(num_sets_);
+  std::iota(set_vec.begin(), set_vec.end(), static_cast<uint16_t>(0));
+  RandomShuffle(set_vec.begin(), set_vec.end());
+
+  // For each set, pick a key at random and increment it
+  for (uint16_t set_i : set_vec) {
+    uint64_t int_value = 0;
+    std::string full_key;
+    uint64_t rand_key = rand_->Next() % num_keys_;
+    const bool get_for_update = txn ? rand_->OneIn(2) : false;
+    s = DBGet(db, txn, read_options_, set_i, rand_key, get_for_update,
+              &int_value, &full_key, &unexpected_error);
+    Slice key(full_key);
+    if (!s.ok()) {
+      // Optimistic transactions should never return non-ok status here.
+      // Non-optimistic transactions may return write-coflict/timeout errors.
+      if (is_optimistic || !(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) {
+        fprintf(stderr, "Get returned an unexpected error: %s\n",
+                s.ToString().c_str());
+        unexpected_error = true;
+      }
+      break;
+    }
+
+    if (s.ok()) {
+      // Increment key
+      std::string sum = std::to_string(int_value + incr);
+      if (txn != nullptr) {
+        if ((set_i % 4) != 0) {
+          s = txn->SingleDelete(key);
+        } else {
+          s = txn->Delete(key);
+        }
+        if (!get_for_update && (s.IsBusy() || s.IsTimedOut())) {
+          // If the initial get was not for update, then the key is not locked
+          // before put and put could fail due to concurrent writes.
+          break;
+        } else if (!s.ok()) {
+          // Since we did a GetForUpdate, SingleDelete should not fail.
+          fprintf(stderr, "SingleDelete returned an unexpected error: %s\n",
+                  s.ToString().c_str());
+          unexpected_error = true;
+        }
+        s = txn->Put(key, sum);
+        if (!s.ok()) {
+          // Since we did a GetForUpdate, Put should not fail.
+          fprintf(stderr, "Put returned an unexpected error: %s\n",
+                  s.ToString().c_str());
+          unexpected_error = true;
+        }
+      } else {
+        batch.Put(key, sum);
+      }
+      bytes_inserted_ += key.size() + sum.size();
+    }
+    if (txn != nullptr) {
+      ROCKS_LOG_DEBUG(db->GetDBOptions().info_log,
+                      "Insert (%s) %s snap: %" PRIu64 " key:%s value: %" PRIu64
+                      "+%" PRIu64 "=%" PRIu64,
+                      txn->GetName().c_str(), s.ToString().c_str(),
+                      txn->GetSnapshot()->GetSequenceNumber(), full_key.c_str(),
+                      int_value, incr, int_value + incr);
+    }
+  }
+
+  if (s.ok()) {
+    if (txn != nullptr) {
+      bool with_prepare = !is_optimistic && !rand_->OneIn(10);
+      if (with_prepare) {
+        // Also try commit without prepare
+        s = txn->Prepare();
+        if (!s.ok()) {
+          fprintf(stderr, "Prepare returned an unexpected error: %s\n",
+                  s.ToString().c_str());
+        }
+        assert(s.ok());
+        ROCKS_LOG_DEBUG(db->GetDBOptions().info_log,
+                        "Prepare of %" PRIu64 " %s (%s)", txn->GetId(),
+                        s.ToString().c_str(), txn->GetName().c_str());
+        if (rand_->OneIn(20)) {
+          // This currently only tests the mechanics of writing commit time
+          // write batch so the exact values would not matter.
+          s = txn_->GetCommitTimeWriteBatch()->Put("cat", "dog");
+          assert(s.ok());
+        }
+        db->GetDBOptions().env->SleepForMicroseconds(
+            static_cast<int>(cmt_delay_ms_ * 1000));
+      }
+      if (!rand_->OneIn(20)) {
+        s = txn->Commit();
+        assert(!with_prepare || s.ok());
+        ROCKS_LOG_DEBUG(db->GetDBOptions().info_log,
+                        "Commit of %" PRIu64 " %s (%s)", txn->GetId(),
+                        s.ToString().c_str(), txn->GetName().c_str());
+      } else {
+        // Also try 5% rollback
+        s = txn->Rollback();
+        ROCKS_LOG_DEBUG(db->GetDBOptions().info_log,
+                        "Rollback %" PRIu64 " %s %s", txn->GetId(),
+                        txn->GetName().c_str(), s.ToString().c_str());
+        assert(s.ok());
+      }
+      assert(is_optimistic || s.ok());
+
+      if (!s.ok()) {
+        if (is_optimistic) {
+          // Optimistic transactions can have write-conflict errors on commit.
+          // Any other error is unexpected.
+          if (!(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) {
+            unexpected_error = true;
+          }
+        } else {
+          // Non-optimistic transactions should only fail due to expiration
+          // or write failures.  For testing purproses, we do not expect any
+          // write failures.
+          if (!s.IsExpired()) {
+            unexpected_error = true;
+          }
+        }
+
+        if (unexpected_error) {
+          fprintf(stderr, "Commit returned an unexpected error: %s\n",
+                  s.ToString().c_str());
+        }
+      }
+    } else {
+      s = db->Write(write_options_, &batch);
+      if (!s.ok()) {
+        unexpected_error = true;
+        fprintf(stderr, "Write returned an unexpected error: %s\n",
+                s.ToString().c_str());
+      }
+    }
+  } else {
+    if (txn != nullptr) {
+      assert(txn->Rollback().ok());
+      ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Error %s for txn %s",
+                      s.ToString().c_str(), txn->GetName().c_str());
+    }
+  }
+
+  if (s.ok()) {
+    success_count_++;
+  } else {
+    failure_count_++;
+  }
+
+  last_status_ = s;
+
+  // return success if we didn't get any unexpected errors
+  return !unexpected_error;
+}
+
+// Verify that the sum of the keys in each set are equal
+Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets,
+                                         uint64_t num_keys_per_set,
+                                         bool take_snapshot, Random64* rand,
+                                         uint64_t delay_ms) {
+  // delay_ms is the delay between taking a snapshot and doing the reads. It
+  // emulates reads from a long-running backup job.
+  assert(delay_ms == 0 || take_snapshot);
+  uint64_t prev_total = 0;
+  uint32_t prev_i = 0;
+  bool prev_assigned = false;
+
+  ReadOptions roptions;
+  if (take_snapshot) {
+    roptions.snapshot = db->GetSnapshot();
+    db->GetDBOptions().env->SleepForMicroseconds(
+        static_cast<int>(delay_ms * 1000));
+  }
+
+  std::vector<uint16_t> set_vec(num_sets);
+  std::iota(set_vec.begin(), set_vec.end(), static_cast<uint16_t>(0));
+  RandomShuffle(set_vec.begin(), set_vec.end());
+
+  // For each set of keys with the same prefix, sum all the values
+  for (uint16_t set_i : set_vec) {
+    // Five digits (since the largest uint16_t is 65535) plus the NUL
+    // end char.
+    char prefix_buf[6];
+    assert(set_i + 1 <= 9999);
+    snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", set_i + 1);
+    uint64_t total = 0;
+
+    // Use either point lookup or iterator. Point lookups are slower so we use
+    // it less often.
+    const bool use_point_lookup =
+        num_keys_per_set != 0 && rand && rand->OneIn(10);
+    if (use_point_lookup) {
+      ReadOptions read_options;
+      for (uint64_t k = 0; k < num_keys_per_set; k++) {
+        std::string dont_care;
+        uint64_t int_value = 0;
+        bool unexpected_error = false;
+        const bool FOR_UPDATE = false;
+        Status s = DBGet(db, nullptr, roptions, set_i, k, FOR_UPDATE,
+                         &int_value, &dont_care, &unexpected_error);
+        assert(s.ok());
+        assert(!unexpected_error);
+        total += int_value;
+      }
+    } else {  // user iterators
+      Iterator* iter = db->NewIterator(roptions);
+      for (iter->Seek(Slice(prefix_buf, 4)); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        // stop when we reach a different prefix
+        if (key.ToString().compare(0, 4, prefix_buf) != 0) {
+          break;
+        }
+        Slice value = iter->value();
+        uint64_t int_value = std::stoull(value.ToString());
+        if (int_value == 0 || int_value == ULONG_MAX) {
+          fprintf(stderr, "Iter returned unexpected value: %s\n",
+                  value.ToString().c_str());
+          return Status::Corruption();
+        }
+        ROCKS_LOG_DEBUG(
+            db->GetDBOptions().info_log,
+            "VerifyRead at %" PRIu64 " (%" PRIu64 "): %.*s value: %" PRIu64,
+            roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul,
+            roptions.snapshot
+                ? ((SnapshotImpl*)roptions.snapshot)->min_uncommitted_
+                : 0ul,
+            static_cast<int>(key.size()), key.data(), int_value);
+        total += int_value;
+      }
+      iter->status().PermitUncheckedError();
+      delete iter;
+    }
+
+    if (prev_assigned && total != prev_total) {
+      db->GetDBOptions().info_log->Flush();
+      fprintf(stdout,
+              "RandomTransactionVerify found inconsistent totals using "
+              "pointlookup? %d "
+              "Set[%" PRIu32 "]: %" PRIu64 ", Set[%" PRIu32 "]: %" PRIu64
+              " at snapshot %" PRIu64 "\n",
+              use_point_lookup, prev_i, prev_total, set_i, total,
+              roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul);
+      fflush(stdout);
+      return Status::Corruption();
+    } else {
+      ROCKS_LOG_DEBUG(
+          db->GetDBOptions().info_log,
+          "RandomTransactionVerify pass pointlookup? %d total: %" PRIu64
+          " snap: %" PRIu64,
+          use_point_lookup, total,
+          roptions.snapshot ? roptions.snapshot->GetSequenceNumber() : 0ul);
+    }
+    prev_total = total;
+    prev_i = set_i;
+    prev_assigned = true;
+  }
+  if (take_snapshot) {
+    db->ReleaseSnapshot(roptions.snapshot);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/test_util/transaction_test_util.h b/src/rocksdb/test_util/transaction_test_util.h
new file mode 100644
index 000000000..7a38ab626
--- /dev/null
+++ b/src/rocksdb/test_util/transaction_test_util.h
@@ -0,0 +1,149 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "port/port.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+class Random64;
+
+// Utility class for stress testing transactions.  Can be used to write many
+// transactions in parallel and then validate that the data written is logically
+// consistent.  This class assumes the input DB is initially empty.
+//
+// Each call to TransactionDBInsert()/OptimisticTransactionDBInsert() will
+// increment the value of a key in #num_sets sets of keys.  Regardless of
+// whether the transaction succeeds, the total sum of values of keys in each
+// set is an invariant that should remain equal.
+//
+// After calling TransactionDBInsert()/OptimisticTransactionDBInsert() many
+// times, Verify() can be called to validate that the invariant holds.
+//
+// To test writing Transaction in parallel, multiple threads can create a
+// RandomTransactionInserter with similar arguments using the same DB.
+class RandomTransactionInserter {
+ public:
+  static bool RollbackDeletionTypeCallback(const Slice& key) {
+    // These are hard-coded atm. See how RandomTransactionInserter::DoInsert()
+    // determines whether to use SingleDelete or Delete for a key.
+    assert(key.size() >= 4);
+    const char* ptr = key.data();
+    assert(ptr);
+    while (ptr && ptr < key.data() + 4 && *ptr == '0') {
+      ++ptr;
+    }
+    std::string prefix(ptr, 4 - (ptr - key.data()));
+    unsigned long set_i = std::stoul(prefix);
+    assert(set_i > 0);
+    assert(set_i <= 9999);
+    --set_i;
+    return ((set_i % 4) != 0);
+  }
+
+  // num_keys is the number of keys in each set.
+  // num_sets is the number of sets of keys.
+  // cmt_delay_ms is the delay between prepare (if there is any) and commit
+  // first_id is the id of the first transaction
+  explicit RandomTransactionInserter(
+      Random64* rand, const WriteOptions& write_options = WriteOptions(),
+      const ReadOptions& read_options = ReadOptions(), uint64_t num_keys = 1000,
+      uint16_t num_sets = 3, const uint64_t cmt_delay_ms = 0,
+      const uint64_t first_id = 0);
+
+  ~RandomTransactionInserter();
+
+  // Increment a key in each set using a Transaction on a TransactionDB.
+  //
+  // Returns true if the transaction succeeded OR if any error encountered was
+  // expected (eg a write-conflict). Error status may be obtained by calling
+  // GetLastStatus();
+  bool TransactionDBInsert(
+      TransactionDB* db,
+      const TransactionOptions& txn_options = TransactionOptions());
+
+  // Increment a key in each set using a Transaction on an
+  // OptimisticTransactionDB
+  //
+  // Returns true if the transaction succeeded OR if any error encountered was
+  // expected (eg a write-conflict). Error status may be obtained by calling
+  // GetLastStatus();
+  bool OptimisticTransactionDBInsert(
+      OptimisticTransactionDB* db,
+      const OptimisticTransactionOptions& txn_options =
+          OptimisticTransactionOptions());
+  // Increment a key in each set without using a transaction.  If this function
+  // is called in parallel, then Verify() may fail.
+  //
+  // Returns true if the write succeeds.
+  // Error status may be obtained by calling GetLastStatus().
+  bool DBInsert(DB* db);
+
+  // Get the ikey'th key from set set_i
+  static Status DBGet(DB* db, Transaction* txn, ReadOptions& read_options,
+                      uint16_t set_i, uint64_t ikey, bool get_for_update,
+                      uint64_t* int_value, std::string* full_key,
+                      bool* unexpected_error);
+
+  // Returns OK if Invariant is true.
+  static Status Verify(DB* db, uint16_t num_sets, uint64_t num_keys_per_set = 0,
+                       bool take_snapshot = false, Random64* rand = nullptr,
+                       uint64_t delay_ms = 0);
+
+  // Returns the status of the previous Insert operation
+  Status GetLastStatus() { return last_status_; }
+
+  // Returns the number of successfully written calls to
+  // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert
+  uint64_t GetSuccessCount() { return success_count_; }
+
+  // Returns the number of calls to
+  // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert that did not
+  // write any data.
+  uint64_t GetFailureCount() { return failure_count_; }
+
+  // Returns the sum of user keys/values Put() to the DB.
+  size_t GetBytesInserted() { return bytes_inserted_; }
+
+ private:
+  // Input options
+  Random64* rand_;
+  const WriteOptions write_options_;
+  ReadOptions read_options_;
+  const uint64_t num_keys_;
+  const uint16_t num_sets_;
+
+  // Number of successful insert batches performed
+  uint64_t success_count_ = 0;
+
+  // Number of failed insert batches attempted
+  uint64_t failure_count_ = 0;
+
+  size_t bytes_inserted_ = 0;
+
+  // Status returned by most recent insert operation
+  Status last_status_;
+
+  // optimization: re-use allocated transaction objects.
+  Transaction* txn_ = nullptr;
+  Transaction* optimistic_txn_ = nullptr;
+
+  uint64_t txn_id_;
+  // The delay between ::Prepare and ::Commit
+  const uint64_t cmt_delay_ms_;
+
+  bool DoInsert(DB* db, Transaction* txn, bool is_optimistic);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/third-party/gcc/ppc-asm.h b/src/rocksdb/third-party/gcc/ppc-asm.h
new file mode 100644
index 000000000..e0bce9c5a
--- /dev/null
+++ b/src/rocksdb/third-party/gcc/ppc-asm.h
@@ -0,0 +1,390 @@
+/* PowerPC asm definitions for GNU C.
+
+Copyright (C) 2002-2020 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Under winnt, 1) gas supports the following as names and 2) in particular
+   defining "toc" breaks the FUNC_START macro as ".toc" becomes ".2" */
+
+#define r0	0
+#define sp	1
+#define toc	2
+#define r3	3
+#define r4	4
+#define r5	5
+#define r6	6
+#define r7	7
+#define r8	8
+#define r9	9
+#define r10	10
+#define r11	11
+#define r12	12
+#define r13	13
+#define r14	14
+#define r15	15
+#define r16	16
+#define r17	17
+#define r18	18
+#define r19     19
+#define r20	20
+#define r21	21
+#define r22	22
+#define r23	23
+#define r24	24
+#define r25	25
+#define r26	26
+#define r27	27
+#define r28	28
+#define r29	29
+#define r30	30
+#define r31	31
+
+#define cr0	0
+#define cr1	1
+#define cr2	2
+#define cr3	3
+#define cr4	4
+#define cr5	5
+#define cr6	6
+#define cr7	7
+
+#define f0	0
+#define f1	1
+#define f2	2
+#define f3	3
+#define f4	4
+#define f5	5
+#define f6	6
+#define f7	7
+#define f8	8
+#define f9	9
+#define f10	10
+#define f11	11
+#define f12	12
+#define f13	13
+#define f14	14
+#define f15	15
+#define f16	16
+#define f17	17
+#define f18	18
+#define f19	19
+#define f20	20
+#define f21	21
+#define f22	22
+#define f23	23
+#define f24	24
+#define f25	25
+#define f26	26
+#define f27	27
+#define f28	28
+#define f29	29
+#define f30	30
+#define f31	31
+
+#ifdef __VSX__
+#define f32	32
+#define f33	33
+#define f34	34
+#define f35	35
+#define f36	36
+#define f37	37
+#define f38	38
+#define f39	39
+#define f40	40
+#define f41	41
+#define f42	42
+#define f43	43
+#define f44	44
+#define f45	45
+#define f46	46
+#define f47	47
+#define f48	48
+#define f49	49
+#define f50	50
+#define f51	51
+#define f52	52
+#define f53	53
+#define f54	54
+#define f55	55
+#define f56	56
+#define f57	57
+#define f58	58
+#define f59	59
+#define f60	60
+#define f61	61
+#define f62	62
+#define f63	63
+#endif
+
+#ifdef __ALTIVEC__
+#define v0	0
+#define v1	1
+#define v2	2
+#define v3	3
+#define v4	4
+#define v5	5
+#define v6	6
+#define v7	7
+#define v8	8
+#define v9	9
+#define v10	10
+#define v11	11
+#define v12	12
+#define v13	13
+#define v14	14
+#define v15	15
+#define v16	16
+#define v17	17
+#define v18	18
+#define v19	19
+#define v20	20
+#define v21	21
+#define v22	22
+#define v23	23
+#define v24	24
+#define v25	25
+#define v26	26
+#define v27	27
+#define v28	28
+#define v29	29
+#define v30	30
+#define v31	31
+#endif
+
+#ifdef __VSX__
+#define vs0	0
+#define vs1	1
+#define vs2	2
+#define vs3	3
+#define vs4	4
+#define vs5	5
+#define vs6	6
+#define vs7	7
+#define vs8	8
+#define vs9	9
+#define vs10	10
+#define vs11	11
+#define vs12	12
+#define vs13	13
+#define vs14	14
+#define vs15	15
+#define vs16	16
+#define vs17	17
+#define vs18	18
+#define vs19	19
+#define vs20	20
+#define vs21	21
+#define vs22	22
+#define vs23	23
+#define vs24	24
+#define vs25	25
+#define vs26	26
+#define vs27	27
+#define vs28	28
+#define vs29	29
+#define vs30	30
+#define vs31	31
+#define vs32	32
+#define vs33	33
+#define vs34	34
+#define vs35	35
+#define vs36	36
+#define vs37	37
+#define vs38	38
+#define vs39	39
+#define vs40	40
+#define vs41	41
+#define vs42	42
+#define vs43	43
+#define vs44	44
+#define vs45	45
+#define vs46	46
+#define vs47	47
+#define vs48	48
+#define vs49	49
+#define vs50	50
+#define vs51	51
+#define vs52	52
+#define vs53	53
+#define vs54	54
+#define vs55	55
+#define vs56	56
+#define vs57	57
+#define vs58	58
+#define vs59	59
+#define vs60	60
+#define vs61	61
+#define vs62	62
+#define vs63	63
+#endif
+
+/*
+ * Macros to glue together two tokens.
+ */
+
+#ifdef __STDC__
+#define XGLUE(a,b) a##b
+#else
+#define XGLUE(a,b) a/**/b
+#endif
+
+#define GLUE(a,b) XGLUE(a,b)
+
+/*
+ * Macros to begin and end a function written in assembler.  If -mcall-aixdesc
+ * or -mcall-nt, create a function descriptor with the given name, and create
+ * the real function with one or two leading periods respectively.
+ */
+
+#if defined(__powerpc64__) && _CALL_ELF == 2
+
+/* Defining "toc" above breaks @toc in assembler code.  */
+#undef toc
+
+#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name)
+#ifdef __PCREL__
+#define JUMP_TARGET(name) GLUE(FUNC_NAME(name),@notoc)
+#define FUNC_START(name) \
+	.type FUNC_NAME(name),@function; \
+	.globl FUNC_NAME(name); \
+FUNC_NAME(name): \
+	.localentry FUNC_NAME(name),1
+#else
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#define FUNC_START(name) \
+	.type FUNC_NAME(name),@function; \
+	.globl FUNC_NAME(name); \
+FUNC_NAME(name): \
+0:	addis 2,12,(.TOC.-0b)@ha; \
+	addi 2,2,(.TOC.-0b)@l; \
+	.localentry FUNC_NAME(name),.-FUNC_NAME(name)
+#endif /* !__PCREL__ */
+
+#define HIDDEN_FUNC(name) \
+  FUNC_START(name) \
+  .hidden FUNC_NAME(name);
+
+#define FUNC_END(name) \
+	.size FUNC_NAME(name),.-FUNC_NAME(name)
+
+#elif defined (__powerpc64__)
+
+#define FUNC_NAME(name) GLUE(.,name)
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#define FUNC_START(name) \
+	.section ".opd","aw"; \
+name: \
+	.quad GLUE(.,name); \
+	.quad .TOC.@tocbase; \
+	.quad 0; \
+	.previous; \
+	.type GLUE(.,name),@function; \
+	.globl name; \
+	.globl GLUE(.,name); \
+GLUE(.,name):
+
+#define HIDDEN_FUNC(name) \
+  FUNC_START(name) \
+  .hidden name;	\
+  .hidden GLUE(.,name);
+
+#define FUNC_END(name) \
+GLUE(.L,name): \
+	.size GLUE(.,name),GLUE(.L,name)-GLUE(.,name)
+
+#elif defined(_CALL_AIXDESC)
+
+#ifdef _RELOCATABLE
+#define DESC_SECTION ".got2"
+#else
+#define DESC_SECTION ".got1"
+#endif
+
+#define FUNC_NAME(name) GLUE(.,name)
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#define FUNC_START(name) \
+	.section DESC_SECTION,"aw"; \
+name: \
+	.long GLUE(.,name); \
+	.long _GLOBAL_OFFSET_TABLE_; \
+	.long 0; \
+	.previous; \
+	.type GLUE(.,name),@function; \
+	.globl name; \
+	.globl GLUE(.,name); \
+GLUE(.,name):
+
+#define HIDDEN_FUNC(name) \
+  FUNC_START(name) \
+  .hidden name; \
+  .hidden GLUE(.,name);
+
+#define FUNC_END(name) \
+GLUE(.L,name): \
+	.size GLUE(.,name),GLUE(.L,name)-GLUE(.,name)
+
+#else
+
+#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name)
+#if defined __PIC__ || defined __pic__
+#define JUMP_TARGET(name) FUNC_NAME(name@plt)
+#else
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#endif
+#define FUNC_START(name) \
+	.type FUNC_NAME(name),@function; \
+	.globl FUNC_NAME(name); \
+FUNC_NAME(name):
+
+#define HIDDEN_FUNC(name) \
+  FUNC_START(name) \
+  .hidden FUNC_NAME(name);
+
+#define FUNC_END(name) \
+GLUE(.L,name): \
+	.size FUNC_NAME(name),GLUE(.L,name)-FUNC_NAME(name)
+#endif
+
+#ifdef IN_GCC
+/* For HAVE_GAS_CFI_DIRECTIVE.  */
+#include "auto-host.h"
+
+#ifdef HAVE_GAS_CFI_DIRECTIVE
+# define CFI_STARTPROC			.cfi_startproc
+# define CFI_ENDPROC			.cfi_endproc
+# define CFI_OFFSET(reg, off)		.cfi_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg)	.cfi_def_cfa_register reg
+# define CFI_RESTORE(reg)		.cfi_restore reg
+#else
+# define CFI_STARTPROC
+# define CFI_ENDPROC
+# define CFI_OFFSET(reg, off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_RESTORE(reg)
+#endif
+#endif
+
+#if defined __linux__ && !defined __powerpc64__
+	.section .note.GNU-stack
+	.previous
+#endif
diff --git a/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/CMakeLists.txt b/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/CMakeLists.txt
new file mode 100644
index 000000000..211e8a8e1
--- /dev/null
+++ b/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_library(gtest gtest-all.cc)
+
+# Add dependency of gtest on thread library
+target_link_libraries(gtest ${CMAKE_THREAD_LIBS_INIT})
diff --git a/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc b/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc
new file mode 100644
index 000000000..9f2b3d565
--- /dev/null
+++ b/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc
@@ -0,0 +1,11394 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Google C++ Testing and Mocking Framework (Google Test)
+//
+// Sometimes it's desirable to build Google Test by compiling a single file.
+// This file serves this purpose.
+
+// This line ensures that gtest.h can be compiled on its own, even
+// when it's fused.
+#include "gtest/gtest.h"
+
+// The following lines pull in the real gtest *.cc files.
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
+
+// GOOGLETEST_CM0004 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// This helper class can be used to mock out Google Test failure reporting
+// so that we can test Google Test or code that builds on Google Test.
+//
+// An object of this class appends a TestPartResult object to the
+// TestPartResultArray object given in the constructor whenever a Google Test
+// failure is reported. It can either intercept only failures that are
+// generated in the same thread that created this object or it can intercept
+// all generated failures. The scope of this mock object can be controlled with
+// the second argument to the two arguments constructor.
+class GTEST_API_ ScopedFakeTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  // The two possible mocking modes of this object.
+  enum InterceptMode {
+    INTERCEPT_ONLY_CURRENT_THREAD,  // Intercepts only thread local failures.
+    INTERCEPT_ALL_THREADS           // Intercepts all failures.
+  };
+
+  // The c'tor sets this object as the test part result reporter used
+  // by Google Test.  The 'result' parameter specifies where to report the
+  // results. This reporter will only catch failures generated in the current
+  // thread. DEPRECATED
+  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+
+  // Same as above, but you can choose the interception scope of this object.
+  ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
+                                   TestPartResultArray* result);
+
+  // The d'tor restores the previous test part result reporter.
+  virtual ~ScopedFakeTestPartResultReporter();
+
+  // Appends the TestPartResult object to the TestPartResultArray
+  // received in the constructor.
+  //
+  // This method is from the TestPartResultReporterInterface
+  // interface.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+ private:
+  void Init();
+
+  const InterceptMode intercept_mode_;
+  TestPartResultReporterInterface* old_reporter_;
+  TestPartResultArray* const result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+};
+
+namespace internal {
+
+// A helper class for implementing EXPECT_FATAL_FAILURE() and
+// EXPECT_NONFATAL_FAILURE().  Its destructor verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+class GTEST_API_ SingleFailureChecker {
+ public:
+  // The constructor remembers the arguments.
+  SingleFailureChecker(const TestPartResultArray* results,
+                       TestPartResult::Type type, const std::string& substr);
+  ~SingleFailureChecker();
+ private:
+  const TestPartResultArray* const results_;
+  const TestPartResult::Type type_;
+  const std::string substr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+// A set of macros for testing Google Test assertions or code that's expected
+// to generate Google Test fatal failures.  It verifies that the given
+// statement will cause exactly one fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - 'statement' cannot reference local non-static variables or
+//     non-static members of the current object.
+//   - 'statement' cannot return a value.
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ALL_THREADS, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+// A macro for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures.  It asserts that the given
+// statement will cause exactly one non-fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma.  The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
+//
+// For the same reason, we have to write
+//   if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+#include <ctype.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <map>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <vector>
+
+#if GTEST_OS_LINUX
+
+// FIXME: Use autoconf to detect availability of
+// gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+# include <fcntl.h>  // NOLINT
+# include <limits.h>  // NOLINT
+# include <sched.h>  // NOLINT
+// Declares vsnprintf().  This header is not available on Windows.
+# include <strings.h>  // NOLINT
+# include <sys/mman.h>  // NOLINT
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+# include <string>
+
+#elif GTEST_OS_SYMBIAN
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+#elif GTEST_OS_ZOS
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+# include <strings.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
+
+# include <windows.h>  // NOLINT
+# undef min
+
+#elif GTEST_OS_WINDOWS  // We are on Windows proper.
+
+# include <io.h>  // NOLINT
+# include <sys/timeb.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+# include <sys/stat.h>  // NOLINT
+
+# if GTEST_OS_WINDOWS_MINGW
+// MinGW has gettimeofday() but not _ftime64().
+// FIXME: Use autoconf to detect availability of
+//   gettimeofday().
+// FIXME: There are other ways to get the time on
+//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
+//   supports these.  consider using them instead.
+#  define GTEST_HAS_GETTIMEOFDAY_ 1
+#  include <sys/time.h>  // NOLINT
+# endif  // GTEST_OS_WINDOWS_MINGW
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <windows.h>  // NOLINT
+# undef min
+
+#else
+
+// Assume other platforms have gettimeofday().
+// FIXME: Use autoconf to detect availability of
+//   gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+# include <sys/socket.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+#endif
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+#ifndef _WIN32_WCE
+# include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kPrintUTF8Flag[] = "print_utf8";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+const char kFlagfileFlag[] = "flagfile";
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true iff Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information.  N.B.: due to the use the
+// non-reentrant localtime() function, this function is not thread safe.  Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(
+    const char* str, const char* flag, Int32* value);
+
+// Returns a random seed in range [1, kMaxRandomSeed] based on the
+// given --gtest_random_seed flag value.
+inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
+  const unsigned int raw_seed = (random_seed_flag == 0) ?
+      static_cast<unsigned int>(GetTimeInMillis()) :
+      static_cast<unsigned int>(random_seed_flag);
+
+  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+  // it's easy to type.
+  const int normalized_seed =
+      static_cast<int>((raw_seed - 1U) %
+                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+  return normalized_seed;
+}
+
+// Returns the first valid random seed after 'seed'.  The behavior is
+// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
+// considered to be 1.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  const int next_seed = seed + 1;
+  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
+}
+
+// This class saves the values of all Google Test flags in its c'tor, and
+// restores them in its d'tor.
+class GTestFlagSaver {
+ public:
+  // The c'tor.
+  GTestFlagSaver() {
+    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
+    break_on_failure_ = GTEST_FLAG(break_on_failure);
+    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
+    color_ = GTEST_FLAG(color);
+    death_test_style_ = GTEST_FLAG(death_test_style);
+    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    filter_ = GTEST_FLAG(filter);
+    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
+    list_tests_ = GTEST_FLAG(list_tests);
+    output_ = GTEST_FLAG(output);
+    print_time_ = GTEST_FLAG(print_time);
+    print_utf8_ = GTEST_FLAG(print_utf8);
+    random_seed_ = GTEST_FLAG(random_seed);
+    repeat_ = GTEST_FLAG(repeat);
+    shuffle_ = GTEST_FLAG(shuffle);
+    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
+    stream_result_to_ = GTEST_FLAG(stream_result_to);
+    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+  }
+
+  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
+  ~GTestFlagSaver() {
+    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+    GTEST_FLAG(break_on_failure) = break_on_failure_;
+    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+    GTEST_FLAG(color) = color_;
+    GTEST_FLAG(death_test_style) = death_test_style_;
+    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+    GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+    GTEST_FLAG(list_tests) = list_tests_;
+    GTEST_FLAG(output) = output_;
+    GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(print_utf8) = print_utf8_;
+    GTEST_FLAG(random_seed) = random_seed_;
+    GTEST_FLAG(repeat) = repeat_;
+    GTEST_FLAG(shuffle) = shuffle_;
+    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+    GTEST_FLAG(stream_result_to) = stream_result_to_;
+    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+  }
+
+ private:
+  // Fields for saving the original values of flags.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool print_time_;
+  bool print_utf8_;
+  internal::Int32 random_seed_;
+  internal::Int32 repeat_;
+  bool shuffle_;
+  internal::Int32 stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error and
+// and aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it))
+      ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle, from
+  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+  for (int range_width = end - begin; range_width >= 2; range_width--) {
+    const int last_in_range = begin + range_width - 1;
+    const int selected = begin + random->Generate(range_width);
+    std::swap((*v)[selected], (*v)[last_in_range]);
+  }
+}
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T* x) {
+  delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+  // Constructor.
+  //
+  // TestPropertyKeyIs has NO default constructor.
+  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+
+  // Returns true iff the test name of test property matches on key_.
+  bool operator()(const TestProperty& test_property) const {
+    return test_property.key() == key_;
+  }
+
+ private:
+  std::string key_;
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static std::string GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static std::string GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true iff the wildcard pattern matches the string.  The
+  // first ':' or '\0' character in pattern marks the end of it.
+  //
+  // This recursive algorithm isn't very efficient, but is clear and
+  // works well enough for matching test names, which are short.
+  static bool PatternMatchesString(const char *pattern, const char *str);
+
+  // Returns true iff the user-specified filter matches the test case
+  // name and the test name.
+  static bool FilterMatchesTest(const std::string &test_case_name,
+                                const std::string &test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exception flag.
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches the ':' separated list of glob-style
+  // filters in "filter".
+  static bool MatchesFilter(const std::string& name, const char* filter);
+};
+
+// Returns the current application's name, removing directory path if that
+// is present.  Used by UnitTestOptions::GetOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  virtual ~OsStackTraceGetterInterface() {}
+
+  // Returns the current OS stack trace as an std::string.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code. It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char* const kElidedFramesMarker;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetter() {}
+
+  virtual std::string CurrentStackTrace(int max_depth, int skip_count);
+  virtual void UponLeavingGTest();
+
+ private:
+#if GTEST_HAS_ABSL
+  Mutex mutex_;  // Protects all internal state.
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to the stack trace code from within the user code.
+  void* caller_frame_ = nullptr;
+#endif  // GTEST_HAS_ABSL
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+  const char* file;
+  int line;
+  std::string message;
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+  : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. Reports the test part
+  // result in the current test.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl. This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest* parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own repoter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result repoter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface* reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface* reporter);
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const {
+    return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const {
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    return index < 0 ? NULL : test_cases_[i];
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  TestCase* GetMutableTestCase(int i) {
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    return index < 0 ? NULL : test_cases_[index];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners* listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult* current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface* os_stack_trace_getter();
+
+  // Returns the current OS stack trace as an std::string.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+
+  // Finds and returns a TestCase with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_case_name: name of the test case
+  //   type_param:     the name of the test's type parameter, or NULL if
+  //                   this is not a typed or a type-parameterized test.
+  //   set_up_tc:      pointer to the function that sets up the test case
+  //   tear_down_tc:   pointer to the function that tears down the test case
+  TestCase* GetTestCase(const char* test_case_name,
+                        const char* type_param,
+                        Test::SetUpTestCaseFunc set_up_tc,
+                        Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Adds a TestInfo to the unit test.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  //   test_info:    the TestInfo object
+  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc,
+                   TestInfo* test_info) {
+    // In order to support thread-safe death tests, we need to
+    // remember the original working directory when the test program
+    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
+    // the user may have changed the current directory before calling
+    // RUN_ALL_TESTS().  Therefore we capture the current directory in
+    // AddTestInfo(), which is called to register a TEST or TEST_F
+    // before main() is reached.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+
+    GetTestCase(test_info->test_case_name(),
+                test_info->type_param(),
+                set_up_tc,
+                tear_down_tc)->AddTestInfo(test_info);
+  }
+
+  // Returns ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+
+  // Sets the TestCase object for the test that's currently running.
+  void set_current_test_case(TestCase* a_current_test_case) {
+    current_test_case_ = a_current_test_case;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo* a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
+  // combination. This method can be called more then once; it has guards
+  // protecting from registering the tests more then once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_cases_, TestCase::ClearTestCaseResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() {
+    ad_hoc_test_result_.Clear();
+  }
+
+  // Adds a TestProperty to the current TestResult object when invoked in a
+  // context of a test or a test case, or to the global property set. If the
+  // result already contains a property with the same key, the value will be
+  // updated.
+  void RecordProperty(const TestProperty& test_property);
+
+  enum ReactionToSharding {
+    HONOR_SHARDING_PROTOCOL,
+    IGNORE_SHARDING_PROTOCOL
+  };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestCase and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestCase* current_test_case() const { return current_test_case_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*>& environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo>& gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory* death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions. Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random* random() { return &random_; }
+
+  // Shuffles all test cases, and the tests within each test case,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test cases and tests to their order before the first shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest* const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_reporter_.
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface*>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*> environments_;
+
+  // The vector of TestCases in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestCase*> test_cases_;
+
+  // Provides a level of indirection for the test case list to allow
+  // easy shuffling and restoring the test case order.  The i-th
+  // element of this vector is the index of the i-th test case in the
+  // shuffled order.
+  std::vector<int> test_case_indices_;
+
+  // ParameterizedTestRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+
+  // Index of the last death test case registered.  Initially -1.
+  int last_death_test_case_;
+
+  // This points to the TestCase for the currently running test.  It
+  // changes as Google Test goes through one test case after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestCase* current_test_case_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo* current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is current running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+  // True iff PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // The time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl* GetUnitTestImpl() {
+  return UnitTest::GetInstance()->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+// Attempts to parse a string into a positive integer pointed to by the
+// number parameter.  Returns true if that is possible.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  // FIXME: Convert this to compile time assertion when it is
+  // available.
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test user but are required for testing. This class allow our tests
+// to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult* test_result,
+                             const std::string& xml_element,
+                             const TestProperty& property) {
+    test_result->RecordProperty(xml_element, property);
+  }
+
+  static void ClearTestPartResults(TestResult* test_result) {
+    test_result->ClearTestPartResults();
+  }
+
+  static const std::vector<testing::TestPartResult>& test_part_results(
+      const TestResult& test_result) {
+    return test_result.test_part_results();
+  }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Abstract base class for writing strings to a socket.
+  class AbstractSocketWriter {
+   public:
+    virtual ~AbstractSocketWriter() {}
+
+    // Sends a string to the socket.
+    virtual void Send(const std::string& message) = 0;
+
+    // Closes the socket.
+    virtual void CloseConnection() {}
+
+    // Sends a string and a newline to the socket.
+    void SendLn(const std::string& message) { Send(message + "\n"); }
+  };
+
+  // Concrete class for actually writing strings to a socket.
+  class SocketWriter : public AbstractSocketWriter {
+   public:
+    SocketWriter(const std::string& host, const std::string& port)
+        : sockfd_(-1), host_name_(host), port_num_(port) {
+      MakeConnection();
+    }
+
+    virtual ~SocketWriter() {
+      if (sockfd_ != -1)
+        CloseConnection();
+    }
+
+    // Sends a string to the socket.
+    virtual void Send(const std::string& message) {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "Send() can be called only when there is a connection.";
+
+      const int len = static_cast<int>(message.length());
+      if (write(sockfd_, message.c_str(), len) != len) {
+        GTEST_LOG_(WARNING)
+            << "stream_result_to: failed to stream to "
+            << host_name_ << ":" << port_num_;
+      }
+    }
+
+   private:
+    // Creates a client socket and connects to the server.
+    void MakeConnection();
+
+    // Closes the socket.
+    void CloseConnection() {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "CloseConnection() can be called only when there is a connection.";
+
+      close(sockfd_);
+      sockfd_ = -1;
+    }
+
+    int sockfd_;  // socket file descriptor
+    const std::string host_name_;
+    const std::string port_num_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+  };  // class SocketWriter
+
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
+  static std::string UrlEncode(const char* str);
+
+  StreamingListener(const std::string& host, const std::string& port)
+      : socket_writer_(new SocketWriter(host, port)) {
+    Start();
+  }
+
+  explicit StreamingListener(AbstractSocketWriter* socket_writer)
+      : socket_writer_(socket_writer) { Start(); }
+
+  void OnTestProgramStart(const UnitTest& /* unit_test */) {
+    SendLn("event=TestProgramStart");
+  }
+
+  void OnTestProgramEnd(const UnitTest& unit_test) {
+    // Note that Google Test current only report elapsed time for each
+    // test iteration, not for the entire test program.
+    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    socket_writer_->CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
+    SendLn("event=TestIterationStart&iteration=" +
+           StreamableToString(iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
+    SendLn("event=TestIterationEnd&passed=" +
+           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
+           StreamableToString(unit_test.elapsed_time()) + "ms");
+  }
+
+  void OnTestCaseStart(const TestCase& test_case) {
+    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+  }
+
+  void OnTestCaseEnd(const TestCase& test_case) {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed())
+           + "&elapsed_time=" + StreamableToString(test_case.elapsed_time())
+           + "ms");
+  }
+
+  void OnTestStart(const TestInfo& test_info) {
+    SendLn(std::string("event=TestStart&name=") + test_info.name());
+  }
+
+  void OnTestEnd(const TestInfo& test_info) {
+    SendLn("event=TestEnd&passed=" +
+           FormatBool((test_info.result())->Passed()) +
+           "&elapsed_time=" +
+           StreamableToString((test_info.result())->elapsed_time()) + "ms");
+  }
+
+  void OnTestPartResult(const TestPartResult& test_part_result) {
+    const char* file_name = test_part_result.file_name();
+    if (file_name == NULL)
+      file_name = "";
+    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+           "&line=" + StreamableToString(test_part_result.line_number()) +
+           "&message=" + UrlEncode(test_part_result.message()));
+  }
+
+ private:
+  // Sends the given message and a newline to the socket.
+  void SendLn(const std::string& message) { socket_writer_->SendLn(message); }
+
+  // Called at the start of streaming to notify the receiver what
+  // protocol we are using.
+  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+  std::string FormatBool(bool value) { return value ? "1" : "0"; }
+
+  const scoped_ptr<AbstractSocketWriter> socket_writer_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+};  // class StreamingListener
+
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+#if GTEST_OS_WINDOWS
+# define vsnprintf _vsnprintf
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+#include <crt_externs.h>
+#endif
+#endif
+
+#if GTEST_HAS_ABSL
+#include "absl/debugging/failure_signal_handler.h"
+#include "absl/debugging/stacktrace.h"
+#include "absl/debugging/symbolize.h"
+#include "absl/strings/str_cat.h"
+#endif  // GTEST_HAS_ABSL
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test case name or test name matches this filter is
+// disabled and not run.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test case whose name matches this filter is considered a death
+// test case and will be run before test cases whose name doesn't
+// match this filter.
+static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output format.
+static const char kDefaultOutputFormat[] = "xml";
+// The default output file.
+static const char kDefaultOutputFile[] = "test_detail";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+bool g_help_flag = false;
+
+// Utilty function to Open File for Writing
+static FILE* OpenFileForWriting(const std::string& output_file) {
+  FILE* fileout = NULL;
+  FilePath output_file_path(output_file);
+  FilePath output_dir(output_file_path.RemoveFileName());
+
+  if (output_dir.CreateDirectoriesRecursively()) {
+    fileout = posix::FOpen(output_file.c_str(), "w");
+  }
+  if (fileout == NULL) {
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\"";
+  }
+  return fileout;
+}
+
+}  // namespace internal
+
+// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY
+// environment variable.
+static const char* GetDefaultFilter() {
+  const char* const testbridge_test_only =
+      internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY");
+  if (testbridge_test_only != NULL) {
+    return testbridge_test_only;
+  }
+  return kUniversalFilter;
+}
+
+GTEST_DEFINE_bool_(
+    also_run_disabled_tests,
+    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+    "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+    break_on_failure,
+    internal::BoolFromGTestEnv("break_on_failure", false),
+    "True iff a failed assertion should be a debugger break-point.");
+
+GTEST_DEFINE_bool_(
+    catch_exceptions,
+    internal::BoolFromGTestEnv("catch_exceptions", true),
+    "True iff " GTEST_NAME_
+    " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+    color,
+    internal::StringFromGTestEnv("color", "auto"),
+    "Whether to use colors in the output.  Valid values: yes, no, "
+    "and auto.  'auto' means to use colors if the output is "
+    "being sent to a terminal and the TERM environment variable "
+    "is set to a terminal type that supports colors.");
+
+GTEST_DEFINE_string_(
+    filter,
+    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+    "A colon-separated list of glob (not regex) patterns "
+    "for filtering the tests to run, optionally followed by a "
+    "'-' and a : separated list of negative patterns (tests to "
+    "exclude).  A test is run if it matches one of the positive "
+    "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(
+    install_failure_signal_handler,
+    internal::BoolFromGTestEnv("install_failure_signal_handler", false),
+    "If true and supported on the current platform, " GTEST_NAME_ " should "
+    "install a signal handler that dumps debugging information when fatal "
+    "signals are raised.");
+
+GTEST_DEFINE_bool_(list_tests, false,
+                   "List all tests without running them.");
+
+// The net priority order after flag processing is thus:
+//   --gtest_output command line flag
+//   GTEST_OUTPUT environment variable
+//   XML_OUTPUT_FILE environment variable
+//   ''
+GTEST_DEFINE_string_(
+    output,
+    internal::StringFromGTestEnv("output",
+      internal::OutputFlagAlsoCheckEnvVar().c_str()),
+    "A format (defaults to \"xml\" but can be specified to be \"json\"), "
+    "optionally followed by a colon and an output file name or directory. "
+    "A directory is indicated by a trailing pathname separator. "
+    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+    "If a directory is specified, output files will be created "
+    "within that directory, with file-names based on the test "
+    "executable's name and, if necessary, made unique by adding "
+    "digits.");
+
+GTEST_DEFINE_bool_(
+    print_time,
+    internal::BoolFromGTestEnv("print_time", true),
+    "True iff " GTEST_NAME_
+    " should display elapsed time in text output.");
+
+GTEST_DEFINE_bool_(
+    print_utf8,
+    internal::BoolFromGTestEnv("print_utf8", true),
+    "True iff " GTEST_NAME_
+    " prints UTF8 characters as text.");
+
+GTEST_DEFINE_int32_(
+    random_seed,
+    internal::Int32FromGTestEnv("random_seed", 0),
+    "Random number seed to use when shuffling test orders.  Must be in range "
+    "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+    repeat,
+    internal::Int32FromGTestEnv("repeat", 1),
+    "How many times to repeat each test.  Specify a negative number "
+    "for repeating forever.  Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+    show_internal_stack_frames, false,
+    "True iff " GTEST_NAME_ " should include internal stack frames when "
+    "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(
+    shuffle,
+    internal::BoolFromGTestEnv("shuffle", false),
+    "True iff " GTEST_NAME_
+    " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+    stack_trace_depth,
+    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+    "The maximum number of stack frames to print when an "
+    "assertion fails.  The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+    stream_result_to,
+    internal::StringFromGTestEnv("stream_result_to", ""),
+    "This flag specifies the host name and the port number on which to stream "
+    "test results. Example: \"localhost:555\". The flag is effective only on "
+    "Linux.");
+
+GTEST_DEFINE_bool_(
+    throw_on_failure,
+    internal::BoolFromGTestEnv("throw_on_failure", false),
+    "When this flag is specified, a failed assertion will throw an exception "
+    "if exceptions are enabled or exit the program with a non-zero code "
+    "otherwise. For use with an external test framework.");
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+GTEST_DEFINE_string_(
+    flagfile,
+    internal::StringFromGTestEnv("flagfile", ""),
+    "This flag specifies the flagfile to read command-line flags from.");
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+namespace internal {
+
+// Generates a random number from [0, range), using a Linear
+// Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
+// than kMaxRange.
+UInt32 Random::Generate(UInt32 range) {
+  // These constants are the same as are used in glibc's rand(3).
+  // Use wider types than necessary to prevent unsigned overflow diagnostics.
+  state_ = static_cast<UInt32>(1103515245ULL*state_ + 12345U) % kMaxRange;
+
+  GTEST_CHECK_(range > 0)
+      << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range <= kMaxRange)
+      << "Generation of a number in [0, " << range << ") was requested, "
+      << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+  // Converting via modulus introduces a bit of downward bias, but
+  // it's simple, and a linear congruential generator isn't too good
+  // to begin with.
+  return state_ % range;
+}
+
+// GTestIsInitialized() returns true iff the user has initialized
+// Google Test.  Useful for catching the user mistake of not initializing
+// Google Test before calling RUN_ALL_TESTS().
+static bool GTestIsInitialized() { return GetArgvs().size() > 0; }
+
+// Iterates over a vector of TestCases, keeping a running sum of the
+// results of calling a given int-returning method on each.
+// Returns the sum.
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
+                               int (TestCase::*method)() const) {
+  int sum = 0;
+  for (size_t i = 0; i < case_list.size(); i++) {
+    sum += (case_list[i]->*method)();
+  }
+  return sum;
+}
+
+// Returns true iff the test case passed.
+static bool TestCasePassed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Passed();
+}
+
+// Returns true iff the test case failed.
+static bool TestCaseFailed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Failed();
+}
+
+// Returns true iff test_case contains at least one test that should
+// run.
+static bool ShouldRunTestCase(const TestCase* test_case) {
+  return test_case->should_run();
+}
+
+// AssertHelper constructor.
+AssertHelper::AssertHelper(TestPartResult::Type type,
+                           const char* file,
+                           int line,
+                           const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {
+}
+
+AssertHelper::~AssertHelper() {
+  delete data_;
+}
+
+// Message assignment, for assertion streaming support.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest::GetInstance()->
+    AddTestPartResult(data_->type, data_->file, data_->line,
+                      AppendUserMessage(data_->message, message),
+                      UnitTest::GetInstance()->impl()
+                      ->CurrentOsStackTraceExceptTop(1)
+                      // Skips the stack frame for this function itself.
+                      );  // NOLINT
+}
+
+// Mutex for linked pointers.
+GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+static ::std::vector<std::string> g_argvs;
+
+::std::vector<std::string> GetArgvs() {
+#if defined(GTEST_CUSTOM_GET_ARGVS_)
+  // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or
+  // ::string. This code converts it to the appropriate type.
+  const auto& custom = GTEST_CUSTOM_GET_ARGVS_();
+  return ::std::vector<std::string>(custom.begin(), custom.end());
+#else   // defined(GTEST_CUSTOM_GET_ARGVS_)
+  return g_argvs;
+#endif  // defined(GTEST_CUSTOM_GET_ARGVS_)
+}
+
+// Returns the current application's name, removing directory path if that
+// is present.
+FilePath GetCurrentExecutableName() {
+  FilePath result;
+
+#if GTEST_OS_WINDOWS
+  result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe"));
+#else
+  result.Set(FilePath(GetArgvs()[0]));
+#endif  // GTEST_OS_WINDOWS
+
+  return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, or "" for normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+  const char* const colon = strchr(gtest_output_flag, ':');
+  return (colon == NULL) ?
+      std::string(gtest_output_flag) :
+      std::string(gtest_output_flag, colon - gtest_output_flag);
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+
+  std::string format = GetOutputFormat();
+  if (format.empty())
+    format = std::string(kDefaultOutputFormat);
+
+  const char* const colon = strchr(gtest_output_flag, ':');
+  if (colon == NULL)
+    return internal::FilePath::MakeFileName(
+        internal::FilePath(
+            UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(kDefaultOutputFile), 0,
+        format.c_str()).string();
+
+  internal::FilePath output_name(colon + 1);
+  if (!output_name.IsAbsolutePath())
+    // FIXME: on Windows \some\path is not an absolute
+    // path (as its meaning depends on the current drive), yet the
+    // following logic for turning it into an absolute path is wrong.
+    // Fix it.
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(colon + 1));
+
+  if (!output_name.IsDirectory())
+    return output_name.string();
+
+  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()));
+  return result.string();
+}
+
+// Returns true iff the wildcard pattern matches the string.  The
+// first ':' or '\0' character in pattern marks the end of it.
+//
+// This recursive algorithm isn't very efficient, but is clear and
+// works well enough for matching test names, which are short.
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  switch (*pattern) {
+    case '\0':
+    case ':':  // Either ':' or '\0' marks the end of the pattern.
+      return *str == '\0';
+    case '?':  // Matches any single character.
+      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
+    case '*':  // Matches any string (possibly empty) of characters.
+      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
+          PatternMatchesString(pattern + 1, str);
+    default:  // Non-special character.  Matches itself.
+      return *pattern == *str &&
+          PatternMatchesString(pattern + 1, str + 1);
+  }
+}
+
+bool UnitTestOptions::MatchesFilter(
+    const std::string& name, const char* filter) {
+  const char *cur_pattern = filter;
+  for (;;) {
+    if (PatternMatchesString(cur_pattern, name.c_str())) {
+      return true;
+    }
+
+    // Finds the next pattern in the filter.
+    cur_pattern = strchr(cur_pattern, ':');
+
+    // Returns if no more pattern can be found.
+    if (cur_pattern == NULL) {
+      return false;
+    }
+
+    // Skips the pattern separater (the ':' character).
+    cur_pattern++;
+  }
+}
+
+// Returns true iff the user-specified filter matches the test case
+// name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name,
+                                        const std::string &test_name) {
+  const std::string& full_name = test_case_name + "." + test_name.c_str();
+
+  // Split --gtest_filter at '-', if there is one, to separate into
+  // positive filter and negative filter portions
+  const char* const p = GTEST_FLAG(filter).c_str();
+  const char* const dash = strchr(p, '-');
+  std::string positive;
+  std::string negative;
+  if (dash == NULL) {
+    positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
+    negative = "";
+  } else {
+    positive = std::string(p, dash);   // Everything up to the dash
+    negative = std::string(dash + 1);  // Everything after the dash
+    if (positive.empty()) {
+      // Treat '-test1' as the same as '*-test1'
+      positive = kUniversalFilter;
+    }
+  }
+
+  // A filter is a colon-separated list of patterns.  It matches a
+  // test if any pattern in it matches the test.
+  return (MatchesFilter(full_name, positive.c_str()) &&
+          !MatchesFilter(full_name, negative.c_str()));
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // Google Test should handle a SEH exception if:
+  //   1. the user wants it to, AND
+  //   2. this is not a breakpoint exception, AND
+  //   3. this is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  //
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  bool should_handle = true;
+
+  if (!GTEST_FLAG(catch_exceptions))
+    should_handle = false;
+  else if (exception_code == EXCEPTION_BREAKPOINT)
+    should_handle = false;
+  else if (exception_code == kCxxExceptionCode)
+    should_handle = false;
+
+  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test.  The 'result' parameter specifies where to report the
+// results. Intercepts only failures from the current thread.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    TestPartResultArray* result)
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
+      result_(result) {
+  Init();
+}
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test.  The 'result' parameter specifies where to report the
+// results.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    InterceptMode intercept_mode, TestPartResultArray* result)
+    : intercept_mode_(intercept_mode),
+      result_(result) {
+  Init();
+}
+
+void ScopedFakeTestPartResultReporter::Init() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+    old_reporter_ = impl->GetGlobalTestPartResultReporter();
+    impl->SetGlobalTestPartResultReporter(this);
+  } else {
+    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
+    impl->SetTestPartResultReporterForCurrentThread(this);
+  }
+}
+
+// The d'tor restores the test part result reporter used by Google Test
+// before.
+ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+    impl->SetGlobalTestPartResultReporter(old_reporter_);
+  } else {
+    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
+  }
+}
+
+// Increments the test part result count and remembers the result.
+// This method is from the TestPartResultReporterInterface interface.
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  result_->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test.  We should always call this
+// instead of GetTypeId< ::testing::Test>() to get the type ID of
+// testing::Test.  This is to work around a suspected linker bug when
+// using Google Test as a framework on Mac OS X.  The bug causes
+// GetTypeId< ::testing::Test>() to return different values depending
+// on whether the call is from the Google Test framework itself or
+// from user test code.  GetTestTypeId() is guaranteed to always
+// return the same value, as it always calls GetTypeId<>() from the
+// gtest.cc, which is within the Google Test framework.
+TypeId GetTestTypeId() {
+  return GetTypeId<Test>();
+}
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// This predicate-formatter checks that 'results' contains a test part
+// failure of the given type and that the failure message contains the
+// given substring.
+static AssertionResult HasOneFailure(const char* /* results_expr */,
+                                     const char* /* type_expr */,
+                                     const char* /* substr_expr */,
+                                     const TestPartResultArray& results,
+                                     TestPartResult::Type type,
+                                     const std::string& substr) {
+  const std::string expected(type == TestPartResult::kFatalFailure ?
+                        "1 fatal failure" :
+                        "1 non-fatal failure");
+  Message msg;
+  if (results.size() != 1) {
+    msg << "Expected: " << expected << "\n"
+        << "  Actual: " << results.size() << " failures";
+    for (int i = 0; i < results.size(); i++) {
+      msg << "\n" << results.GetTestPartResult(i);
+    }
+    return AssertionFailure() << msg;
+  }
+
+  const TestPartResult& r = results.GetTestPartResult(0);
+  if (r.type() != type) {
+    return AssertionFailure() << "Expected: " << expected << "\n"
+                              << "  Actual:\n"
+                              << r;
+  }
+
+  if (strstr(r.message(), substr.c_str()) == NULL) {
+    return AssertionFailure() << "Expected: " << expected << " containing \""
+                              << substr << "\"\n"
+                              << "  Actual:\n"
+                              << r;
+  }
+
+  return AssertionSuccess();
+}
+
+// The constructor of SingleFailureChecker remembers where to look up
+// test part results, what type of failure we expect, and what
+// substring the failure message should contain.
+SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results,
+                                           TestPartResult::Type type,
+                                           const std::string& substr)
+    : results_(results), type_(type), substr_(substr) {}
+
+// The destructor of SingleFailureChecker verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+SingleFailureChecker::~SingleFailureChecker() {
+  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
+}
+
+DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  unit_test_->current_test_result()->AddTestPartResult(result);
+  unit_test_->listeners()->repeater()->OnTestPartResult(result);
+}
+
+DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
+}
+
+// Returns the global test part result reporter.
+TestPartResultReporterInterface*
+UnitTestImpl::GetGlobalTestPartResultReporter() {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  return global_test_part_result_repoter_;
+}
+
+// Sets the global test part result reporter.
+void UnitTestImpl::SetGlobalTestPartResultReporter(
+    TestPartResultReporterInterface* reporter) {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  global_test_part_result_repoter_ = reporter;
+}
+
+// Returns the test part result reporter for the current thread.
+TestPartResultReporterInterface*
+UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
+  return per_thread_test_part_result_reporter_.get();
+}
+
+// Sets the test part result reporter for the current thread.
+void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
+    TestPartResultReporterInterface* reporter) {
+  per_thread_test_part_result_reporter_.set(reporter);
+}
+
+// Gets the number of successful test cases.
+int UnitTestImpl::successful_test_case_count() const {
+  return CountIf(test_cases_, TestCasePassed);
+}
+
+// Gets the number of failed test cases.
+int UnitTestImpl::failed_test_case_count() const {
+  return CountIf(test_cases_, TestCaseFailed);
+}
+
+// Gets the number of all test cases.
+int UnitTestImpl::total_test_case_count() const {
+  return static_cast<int>(test_cases_.size());
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTestImpl::test_case_to_run_count() const {
+  return CountIf(test_cases_, ShouldRunTestCase);
+}
+
+// Gets the number of successful tests.
+int UnitTestImpl::successful_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
+}
+
+// Gets the number of failed tests.
+int UnitTestImpl::failed_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTestImpl::reportable_disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_,
+                             &TestCase::reportable_disabled_test_count);
+}
+
+// Gets the number of disabled tests.
+int UnitTestImpl::disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTestImpl::reportable_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count);
+}
+
+// Gets the number of all tests.
+int UnitTestImpl::total_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
+}
+
+// Gets the number of tests that should run.
+int UnitTestImpl::test_to_run_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+  return os_stack_trace_getter()->CurrentStackTrace(
+      static_cast<int>(GTEST_FLAG(stack_trace_depth)),
+      skip_count + 1
+      // Skips the user-specified number of frames plus this function
+      // itself.
+      );  // NOLINT
+}
+
+// Returns the current time in milliseconds.
+TimeInMillis GetTimeInMillis() {
+#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
+  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
+  // http://analogous.blogspot.com/2005/04/epoch.html
+  const TimeInMillis kJavaEpochToWinFileTimeDelta =
+    static_cast<TimeInMillis>(116444736UL) * 100000UL;
+  const DWORD kTenthMicrosInMilliSecond = 10000;
+
+  SYSTEMTIME now_systime;
+  FILETIME now_filetime;
+  ULARGE_INTEGER now_int64;
+  // FIXME: Shouldn't this just use
+  //   GetSystemTimeAsFileTime()?
+  GetSystemTime(&now_systime);
+  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
+    now_int64.LowPart = now_filetime.dwLowDateTime;
+    now_int64.HighPart = now_filetime.dwHighDateTime;
+    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
+      kJavaEpochToWinFileTimeDelta;
+    return now_int64.QuadPart;
+  }
+  return 0;
+#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
+  __timeb64 now;
+
+  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
+  // (deprecated function) there.
+  // FIXME: Use GetTickCount()?  Or use
+  //   SystemTimeToFileTime()
+  GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
+  _ftime64(&now);
+  GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
+#elif GTEST_HAS_GETTIMEOFDAY_
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
+#else
+# error "Don't know how to get the current time on your system."
+#endif
+}
+
+// Utilities
+
+// class String.
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Creates a UTF-16 wide string from the given ANSI string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the wide string, or NULL if the
+// input is NULL.
+LPCWSTR String::AnsiToUtf16(const char* ansi) {
+  if (!ansi) return NULL;
+  const int length = strlen(ansi);
+  const int unicode_length =
+      MultiByteToWideChar(CP_ACP, 0, ansi, length,
+                          NULL, 0);
+  WCHAR* unicode = new WCHAR[unicode_length + 1];
+  MultiByteToWideChar(CP_ACP, 0, ansi, length,
+                      unicode, unicode_length);
+  unicode[unicode_length] = 0;
+  return unicode;
+}
+
+// Creates an ANSI string from the given wide string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the ANSI string, or NULL if the
+// input is NULL.
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
+  if (!utf16_str) return NULL;
+  const int ansi_length =
+      WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+                          NULL, 0, NULL, NULL);
+  char* ansi = new char[ansi_length + 1];
+  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+                      ansi, ansi_length, NULL, NULL);
+  ansi[ansi_length] = 0;
+  return ansi;
+}
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Compares two C strings.  Returns true iff they have the same content.
+//
+// Unlike strcmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CStringEquals(const char * lhs, const char * rhs) {
+  if ( lhs == NULL ) return rhs == NULL;
+
+  if ( rhs == NULL ) return false;
+
+  return strcmp(lhs, rhs) == 0;
+}
+
+#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+// Converts an array of wide chars to a narrow string using the UTF-8
+// encoding, and streams the result to the given Message object.
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
+                                     Message* msg) {
+  for (size_t i = 0; i != length; ) {  // NOLINT
+    if (wstr[i] != L'\0') {
+      *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
+      while (i != length && wstr[i] != L'\0')
+        i++;
+    } else {
+      *msg << '\0';
+      i++;
+    }
+  }
+}
+
+#endif  // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+void SplitString(const ::std::string& str, char delimiter,
+                 ::std::vector< ::std::string>* dest) {
+  ::std::vector< ::std::string> parsed;
+  ::std::string::size_type pos = 0;
+  while (::testing::internal::AlwaysTrue()) {
+    const ::std::string::size_type colon = str.find(delimiter, pos);
+    if (colon == ::std::string::npos) {
+      parsed.push_back(str.substr(pos));
+      break;
+    } else {
+      parsed.push_back(str.substr(pos, colon - pos));
+      pos = colon + 1;
+    }
+  }
+  dest->swap(parsed);
+}
+
+}  // namespace internal
+
+// Constructs an empty Message.
+// We allocate the stringstream separately because otherwise each use of
+// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
+// stack frame leading to huge stack frames in some cases; gcc does not reuse
+// the stack space.
+Message::Message() : ss_(new ::std::stringstream) {
+  // By default, we want there to be enough precision when printing
+  // a double to a Message.
+  *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
+}
+
+// These two overloads allow streaming a wide C string to a Message
+// using the UTF-8 encoding.
+Message& Message::operator <<(const wchar_t* wide_c_str) {
+  return *this << internal::String::ShowWideCString(wide_c_str);
+}
+Message& Message::operator <<(wchar_t* wide_c_str) {
+  return *this << internal::String::ShowWideCString(wide_c_str);
+}
+
+#if GTEST_HAS_STD_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::std::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Gets the text streamed to this object so far as an std::string.
+// Each '\0' character in the buffer is replaced with "\\0".
+std::string Message::GetString() const {
+  return internal::StringStreamToString(ss_.get());
+}
+
+// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+    : success_(other.success_),
+      message_(other.message_.get() != NULL ?
+               new ::std::string(*other.message_) :
+               static_cast< ::std::string*>(NULL)) {
+}
+
+// Swaps two AssertionResults.
+void AssertionResult::swap(AssertionResult& other) {
+  using std::swap;
+  swap(success_, other.success_);
+  swap(message_, other.message_);
+}
+
+// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+  AssertionResult negation(!success_);
+  if (message_.get() != NULL)
+    negation << *message_;
+  return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() {
+  return AssertionResult(true);
+}
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() {
+  return AssertionResult(false);
+}
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+  return AssertionFailure() << message;
+}
+
+namespace internal {
+
+namespace edit_distance {
+std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
+                                            const std::vector<size_t>& right) {
+  std::vector<std::vector<double> > costs(
+      left.size() + 1, std::vector<double>(right.size() + 1));
+  std::vector<std::vector<EditType> > best_move(
+      left.size() + 1, std::vector<EditType>(right.size() + 1));
+
+  // Populate for empty right.
+  for (size_t l_i = 0; l_i < costs.size(); ++l_i) {
+    costs[l_i][0] = static_cast<double>(l_i);
+    best_move[l_i][0] = kRemove;
+  }
+  // Populate for empty left.
+  for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) {
+    costs[0][r_i] = static_cast<double>(r_i);
+    best_move[0][r_i] = kAdd;
+  }
+
+  for (size_t l_i = 0; l_i < left.size(); ++l_i) {
+    for (size_t r_i = 0; r_i < right.size(); ++r_i) {
+      if (left[l_i] == right[r_i]) {
+        // Found a match. Consume it.
+        costs[l_i + 1][r_i + 1] = costs[l_i][r_i];
+        best_move[l_i + 1][r_i + 1] = kMatch;
+        continue;
+      }
+
+      const double add = costs[l_i + 1][r_i];
+      const double remove = costs[l_i][r_i + 1];
+      const double replace = costs[l_i][r_i];
+      if (add < remove && add < replace) {
+        costs[l_i + 1][r_i + 1] = add + 1;
+        best_move[l_i + 1][r_i + 1] = kAdd;
+      } else if (remove < add && remove < replace) {
+        costs[l_i + 1][r_i + 1] = remove + 1;
+        best_move[l_i + 1][r_i + 1] = kRemove;
+      } else {
+        // We make replace a little more expensive than add/remove to lower
+        // their priority.
+        costs[l_i + 1][r_i + 1] = replace + 1.00001;
+        best_move[l_i + 1][r_i + 1] = kReplace;
+      }
+    }
+  }
+
+  // Reconstruct the best path. We do it in reverse order.
+  std::vector<EditType> best_path;
+  for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) {
+    EditType move = best_move[l_i][r_i];
+    best_path.push_back(move);
+    l_i -= move != kAdd;
+    r_i -= move != kRemove;
+  }
+  std::reverse(best_path.begin(), best_path.end());
+  return best_path;
+}
+
+namespace {
+
+// Helper class to convert string into ids with deduplication.
+class InternalStrings {
+ public:
+  size_t GetId(const std::string& str) {
+    IdMap::iterator it = ids_.find(str);
+    if (it != ids_.end()) return it->second;
+    size_t id = ids_.size();
+    return ids_[str] = id;
+  }
+
+ private:
+  typedef std::map<std::string, size_t> IdMap;
+  IdMap ids_;
+};
+
+}  // namespace
+
+std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right) {
+  std::vector<size_t> left_ids, right_ids;
+  {
+    InternalStrings intern_table;
+    for (size_t i = 0; i < left.size(); ++i) {
+      left_ids.push_back(intern_table.GetId(left[i]));
+    }
+    for (size_t i = 0; i < right.size(); ++i) {
+      right_ids.push_back(intern_table.GetId(right[i]));
+    }
+  }
+  return CalculateOptimalEdits(left_ids, right_ids);
+}
+
+namespace {
+
+// Helper class that holds the state for one hunk and prints it out to the
+// stream.
+// It reorders adds/removes when possible to group all removes before all
+// adds. It also adds the hunk header before printint into the stream.
+class Hunk {
+ public:
+  Hunk(size_t left_start, size_t right_start)
+      : left_start_(left_start),
+        right_start_(right_start),
+        adds_(),
+        removes_(),
+        common_() {}
+
+  void PushLine(char edit, const char* line) {
+    switch (edit) {
+      case ' ':
+        ++common_;
+        FlushEdits();
+        hunk_.push_back(std::make_pair(' ', line));
+        break;
+      case '-':
+        ++removes_;
+        hunk_removes_.push_back(std::make_pair('-', line));
+        break;
+      case '+':
+        ++adds_;
+        hunk_adds_.push_back(std::make_pair('+', line));
+        break;
+    }
+  }
+
+  void PrintTo(std::ostream* os) {
+    PrintHeader(os);
+    FlushEdits();
+    for (std::list<std::pair<char, const char*> >::const_iterator it =
+             hunk_.begin();
+         it != hunk_.end(); ++it) {
+      *os << it->first << it->second << "\n";
+    }
+  }
+
+  bool has_edits() const { return adds_ || removes_; }
+
+ private:
+  void FlushEdits() {
+    hunk_.splice(hunk_.end(), hunk_removes_);
+    hunk_.splice(hunk_.end(), hunk_adds_);
+  }
+
+  // Print a unified diff header for one hunk.
+  // The format is
+  //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
+  // where the left/right parts are omitted if unnecessary.
+  void PrintHeader(std::ostream* ss) const {
+    *ss << "@@ ";
+    if (removes_) {
+      *ss << "-" << left_start_ << "," << (removes_ + common_);
+    }
+    if (removes_ && adds_) {
+      *ss << " ";
+    }
+    if (adds_) {
+      *ss << "+" << right_start_ << "," << (adds_ + common_);
+    }
+    *ss << " @@\n";
+  }
+
+  size_t left_start_, right_start_;
+  size_t adds_, removes_, common_;
+  std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
+};
+
+}  // namespace
+
+// Create a list of diff hunks in Unified diff format.
+// Each hunk has a header generated by PrintHeader above plus a body with
+// lines prefixed with ' ' for no change, '-' for deletion and '+' for
+// addition.
+// 'context' represents the desired unchanged prefix/suffix around the diff.
+// If two hunks are close enough that their contexts overlap, then they are
+// joined into one hunk.
+std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+                              const std::vector<std::string>& right,
+                              size_t context) {
+  const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
+
+  size_t l_i = 0, r_i = 0, edit_i = 0;
+  std::stringstream ss;
+  while (edit_i < edits.size()) {
+    // Find first edit.
+    while (edit_i < edits.size() && edits[edit_i] == kMatch) {
+      ++l_i;
+      ++r_i;
+      ++edit_i;
+    }
+
+    // Find the first line to include in the hunk.
+    const size_t prefix_context = std::min(l_i, context);
+    Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
+    for (size_t i = prefix_context; i > 0; --i) {
+      hunk.PushLine(' ', left[l_i - i].c_str());
+    }
+
+    // Iterate the edits until we found enough suffix for the hunk or the input
+    // is over.
+    size_t n_suffix = 0;
+    for (; edit_i < edits.size(); ++edit_i) {
+      if (n_suffix >= context) {
+        // Continue only if the next hunk is very close.
+        std::vector<EditType>::const_iterator it = edits.begin() + edit_i;
+        while (it != edits.end() && *it == kMatch) ++it;
+        if (it == edits.end() || (it - edits.begin()) - edit_i >= context) {
+          // There is no next edit or it is too far away.
+          break;
+        }
+      }
+
+      EditType edit = edits[edit_i];
+      // Reset count when a non match is found.
+      n_suffix = edit == kMatch ? n_suffix + 1 : 0;
+
+      if (edit == kMatch || edit == kRemove || edit == kReplace) {
+        hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str());
+      }
+      if (edit == kAdd || edit == kReplace) {
+        hunk.PushLine('+', right[r_i].c_str());
+      }
+
+      // Advance indices, depending on edit type.
+      l_i += edit != kAdd;
+      r_i += edit != kRemove;
+    }
+
+    if (!hunk.has_edits()) {
+      // We are done. We don't want this hunk.
+      break;
+    }
+
+    hunk.PrintTo(&ss);
+  }
+  return ss.str();
+}
+
+}  // namespace edit_distance
+
+namespace {
+
+// The string representation of the values received in EqFailure() are already
+// escaped. Split them on escaped '\n' boundaries. Leave all other escaped
+// characters the same.
+std::vector<std::string> SplitEscapedString(const std::string& str) {
+  std::vector<std::string> lines;
+  size_t start = 0, end = str.size();
+  if (end > 2 && str[0] == '"' && str[end - 1] == '"') {
+    ++start;
+    --end;
+  }
+  bool escaped = false;
+  for (size_t i = start; i + 1 < end; ++i) {
+    if (escaped) {
+      escaped = false;
+      if (str[i] == 'n') {
+        lines.push_back(str.substr(start, i - start - 1));
+        start = i + 1;
+      }
+    } else {
+      escaped = str[i] == '\\';
+    }
+  }
+  lines.push_back(str.substr(start, end - start));
+  return lines;
+}
+
+}  // namespace
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   lhs_expression: "foo"
+//   rhs_expression: "bar"
+//   lhs_value:      "5"
+//   rhs_value:      "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*.  When it's true, the string "Ignoring case" will
+// be inserted into the message.
+AssertionResult EqFailure(const char* lhs_expression,
+                          const char* rhs_expression,
+                          const std::string& lhs_value,
+                          const std::string& rhs_value,
+                          bool ignoring_case) {
+  Message msg;
+  msg << "Expected equality of these values:";
+  msg << "\n  " << lhs_expression;
+  if (lhs_value != lhs_expression) {
+    msg << "\n    Which is: " << lhs_value;
+  }
+  msg << "\n  " << rhs_expression;
+  if (rhs_value != rhs_expression) {
+    msg << "\n    Which is: " << rhs_value;
+  }
+
+  if (ignoring_case) {
+    msg << "\nIgnoring case";
+  }
+
+  if (!lhs_value.empty() && !rhs_value.empty()) {
+    const std::vector<std::string> lhs_lines =
+        SplitEscapedString(lhs_value);
+    const std::vector<std::string> rhs_lines =
+        SplitEscapedString(rhs_value);
+    if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
+      msg << "\nWith diff:\n"
+          << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
+    }
+  }
+
+  return AssertionFailure() << msg;
+}
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value) {
+  const char* actual_message = assertion_result.message();
+  Message msg;
+  msg << "Value of: " << expression_text
+      << "\n  Actual: " << actual_predicate_value;
+  if (actual_message[0] != '\0')
+    msg << " (" << actual_message << ")";
+  msg << "\nExpected: " << expected_predicate_value;
+  return msg.GetString();
+}
+
+// Helper function for implementing ASSERT_NEAR.
+AssertionResult DoubleNearPredFormat(const char* expr1,
+                                     const char* expr2,
+                                     const char* abs_error_expr,
+                                     double val1,
+                                     double val2,
+                                     double abs_error) {
+  const double diff = fabs(val1 - val2);
+  if (diff <= abs_error) return AssertionSuccess();
+
+  // FIXME: do not print the value of an expression if it's
+  // already a literal.
+  return AssertionFailure()
+      << "The difference between " << expr1 << " and " << expr2
+      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
+      << expr1 << " evaluates to " << val1 << ",\n"
+      << expr2 << " evaluates to " << val2 << ", and\n"
+      << abs_error_expr << " evaluates to " << abs_error << ".";
+}
+
+
+// Helper template for implementing FloatLE() and DoubleLE().
+template <typename RawType>
+AssertionResult FloatingPointLE(const char* expr1,
+                                const char* expr2,
+                                RawType val1,
+                                RawType val2) {
+  // Returns success if val1 is less than val2,
+  if (val1 < val2) {
+    return AssertionSuccess();
+  }
+
+  // or if val1 is almost equal to val2.
+  const FloatingPoint<RawType> lhs(val1), rhs(val2);
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  // Note that the above two checks will both fail if either val1 or
+  // val2 is NaN, as the IEEE floating-point standard requires that
+  // any predicate involving a NaN must return false.
+
+  ::std::stringstream val1_ss;
+  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val1;
+
+  ::std::stringstream val2_ss;
+  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val2;
+
+  return AssertionFailure()
+      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+      << StringStreamToString(&val2_ss);
+}
+
+}  // namespace internal
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+AssertionResult FloatLE(const char* expr1, const char* expr2,
+                        float val1, float val2) {
+  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                         double val1, double val2) {
+  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_EQ with int or enum
+// arguments.
+AssertionResult CmpHelperEQ(const char* lhs_expression,
+                            const char* rhs_expression,
+                            BiggestInt lhs,
+                            BiggestInt rhs) {
+  if (lhs == rhs) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   FormatForComparisonFailureMessage(lhs, rhs),
+                   FormatForComparisonFailureMessage(rhs, lhs),
+                   false);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
+// just to avoid copy-and-paste of similar code.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   BiggestInt val1, BiggestInt val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return AssertionFailure() \
+        << "Expected: (" << expr1 << ") " #op " (" << expr2\
+        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+  }\
+}
+
+// Implements the helper function for {ASSERT|EXPECT}_NE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Implements the helper function for {ASSERT|EXPECT}_LE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Implements the helper function for {ASSERT|EXPECT}_LT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LT, < )
+// Implements the helper function for {ASSERT|EXPECT}_GE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Implements the helper function for {ASSERT|EXPECT}_GT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GT, > )
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+                               const char* rhs_expression,
+                               const char* lhs,
+                               const char* rhs) {
+  if (String::CStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   PrintToString(lhs),
+                   PrintToString(rhs),
+                   false);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
+                                   const char* rhs_expression,
+                                   const char* lhs,
+                                   const char* rhs) {
+  if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   PrintToString(lhs),
+                   PrintToString(rhs),
+                   true);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const char* s1,
+                               const char* s2) {
+  if (!String::CStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                              << s2_expression << "), actual: \""
+                              << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                   const char* s2_expression,
+                                   const char* s1,
+                                   const char* s2) {
+  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+        << "Expected: (" << s1_expression << ") != ("
+        << s2_expression << ") (ignoring case), actual: \""
+        << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+}  // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubString() and IsNotSubstring().
+
+// This group of overloaded functions return true iff needle is a
+// substring of haystack.  NULL is considered a substring of itself
+// only.
+
+bool IsSubstringPred(const char* needle, const char* haystack) {
+  if (needle == NULL || haystack == NULL)
+    return needle == haystack;
+
+  return strstr(haystack, needle) != NULL;
+}
+
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+  if (needle == NULL || haystack == NULL)
+    return needle == haystack;
+
+  return wcsstr(haystack, needle) != NULL;
+}
+
+// StringType here can be either ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle,
+                     const StringType& haystack) {
+  return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(
+    bool expected_to_be_substring,
+    const char* needle_expr, const char* haystack_expr,
+    const StringType& needle, const StringType& haystack) {
+  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
+    return AssertionSuccess();
+
+  const bool is_wide_string = sizeof(needle[0]) > 1;
+  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
+  return AssertionFailure()
+      << "Value of: " << needle_expr << "\n"
+      << "  Actual: " << begin_string_quote << needle << "\"\n"
+      << "Expected: " << (expected_to_be_substring ? "" : "not ")
+      << "a substring of " << haystack_expr << "\n"
+      << "Which is: " << begin_string_quote << haystack << "\"";
+}
+
+}  // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Helper function for IsHRESULT{SuccessFailure} predicates
+AssertionResult HRESULTFailureHelper(const char* expr,
+                                     const char* expected,
+                                     long hr) {  // NOLINT
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+
+  // Windows CE doesn't support FormatMessage.
+  const char error_text[] = "";
+
+# else
+
+  // Looks up the human-readable system message for the HRESULT code
+  // and since we're not passing any params to FormatMessage, we don't
+  // want inserts expanded.
+  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
+                       FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kBufSize = 4096;
+  // Gets the system's human readable message string for this HRESULT.
+  char error_text[kBufSize] = { '\0' };
+  DWORD message_length = ::FormatMessageA(kFlags,
+                                          0,  // no source, we're asking system
+                                          hr,  // the error
+                                          0,  // no line width restrictions
+                                          error_text,  // output buffer
+                                          kBufSize,  // buf size
+                                          NULL);  // no arguments for inserts
+  // Trims tailing white space (FormatMessage leaves a trailing CR-LF)
+  for (; message_length && IsSpace(error_text[message_length - 1]);
+          --message_length) {
+    error_text[message_length - 1] = '\0';
+  }
+
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+  const std::string error_hex("0x" + String::FormatHexInt(hr));
+  return ::testing::AssertionFailure()
+      << "Expected: " << expr << " " << expected << ".\n"
+      << "  Actual: " << error_hex << " " << error_text << "\n";
+}
+
+}  // namespace
+
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
+  if (SUCCEEDED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "succeeds", hr);
+}
+
+AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
+  if (FAILED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "fails", hr);
+}
+
+#endif  // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length   Encoding
+//   0 -  7 bits       0xxxxxxx
+//   8 - 11 bits       110xxxxx 10xxxxxx
+//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
+//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
+
+// Chops off the n lowest bits from a bit pattern.  Returns the n
+// lowest bits.  As a side effect, the original bit pattern will be
+// shifted to the right by n bits.
+inline UInt32 ChopLowBits(UInt32* bits, int n) {
+  const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
+  *bits >>= n;
+  return low_bits;
+}
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+std::string CodePointToUtf8(UInt32 code_point) {
+  if (code_point > kMaxCodePoint4) {
+    return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")";
+  }
+
+  char str[5];  // Big enough for the largest valid code point.
+  if (code_point <= kMaxCodePoint1) {
+    str[1] = '\0';
+    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    str[2] = '\0';
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    str[3] = '\0';
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
+  } else {  // code_point <= kMaxCodePoint4
+    str[4] = '\0';
+    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
+  }
+  return str;
+}
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
+
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  return sizeof(wchar_t) == 2 &&
+      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+}
+
+// Creates a Unicode code point from UTF16 surrogate pair.
+inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                    wchar_t second) {
+  const UInt32 mask = (1 << 10) - 1;
+  return (sizeof(wchar_t) == 2) ?
+      (((first & mask) << 10) | (second & mask)) + 0x10000 :
+      // This function should not be called when the condition is
+      // false, but we provide a sensible default in case it is.
+      static_cast<UInt32>(first);
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
+  if (num_chars == -1)
+    num_chars = static_cast<int>(wcslen(str));
+
+  ::std::stringstream stream;
+  for (int i = 0; i < num_chars; ++i) {
+    UInt32 unicode_code_point;
+
+    if (str[i] == L'\0') {
+      break;
+    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
+                                                                 str[i + 1]);
+      i++;
+    } else {
+      unicode_code_point = static_cast<UInt32>(str[i]);
+    }
+
+    stream << CodePointToUtf8(unicode_code_point);
+  }
+  return StringStreamToString(&stream);
+}
+
+// Converts a wide C string to an std::string using the UTF-8 encoding.
+// NULL will be converted to "(null)".
+std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+  if (wide_c_str == NULL)  return "(null)";
+
+  return internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Compares two wide C strings.  Returns true iff they have the same
+// content.
+//
+// Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+  if (lhs == NULL) return rhs == NULL;
+
+  if (rhs == NULL) return false;
+
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Helper function for *_STREQ on wide strings.
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+                               const char* rhs_expression,
+                               const wchar_t* lhs,
+                               const wchar_t* rhs) {
+  if (String::WideCStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   PrintToString(lhs),
+                   PrintToString(rhs),
+                   false);
+}
+
+// Helper function for *_STRNE on wide strings.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const wchar_t* s1,
+                               const wchar_t* s2) {
+  if (!String::WideCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  }
+
+  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                            << s2_expression << "), actual: "
+                            << PrintToString(s1)
+                            << " vs " << PrintToString(s2);
+}
+
+// Compares two C strings, ignoring case.  Returns true iff they have
+// the same content.
+//
+// Unlike strcasecmp(), this function can handle NULL argument(s).  A
+// NULL C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+  if (lhs == NULL)
+    return rhs == NULL;
+  if (rhs == NULL)
+    return false;
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                              const wchar_t* rhs) {
+  if (lhs == NULL) return rhs == NULL;
+
+  if (rhs == NULL) return false;
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp.
+  // Other unknown OSes may not define it either.
+  wint_t left, right;
+  do {
+    left = towlower(*lhs++);
+    right = towlower(*rhs++);
+  } while (left && left == right);
+  return left == right;
+#endif  // OS selector
+}
+
+// Returns true iff str ends with the given suffix, ignoring case.
+// Any string is considered to end with an empty suffix.
+bool String::EndsWithCaseInsensitive(
+    const std::string& str, const std::string& suffix) {
+  const size_t str_len = str.length();
+  const size_t suffix_len = suffix.length();
+  return (str_len >= suffix_len) &&
+         CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len,
+                                      suffix.c_str());
+}
+
+// Formats an int value as "%02d".
+std::string String::FormatIntWidth2(int value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << value;
+  return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexInt(int value) {
+  std::stringstream ss;
+  ss << std::hex << std::uppercase << value;
+  return ss.str();
+}
+
+// Formats a byte as "%02X".
+std::string String::FormatByte(unsigned char value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase
+     << static_cast<unsigned int>(value);
+  return ss.str();
+}
+
+// Converts the buffer in a stringstream to an std::string, converting NUL
+// bytes to "\\0" along the way.
+std::string StringStreamToString(::std::stringstream* ss) {
+  const ::std::string& str = ss->str();
+  const char* const start = str.c_str();
+  const char* const end = start + str.length();
+
+  std::string result;
+  result.reserve(2 * (end - start));
+  for (const char* ch = start; ch != end; ++ch) {
+    if (*ch == '\0') {
+      result += "\\0";  // Replaces NUL with "\\0";
+    } else {
+      result += *ch;
+    }
+  }
+
+  return result;
+}
+
+// Appends the user-supplied message to the Google-Test-generated message.
+std::string AppendUserMessage(const std::string& gtest_msg,
+                              const Message& user_msg) {
+  // Appends the user message if it's non-empty.
+  const std::string user_msg_string = user_msg.GetString();
+  if (user_msg_string.empty()) {
+    return gtest_msg;
+  }
+
+  return gtest_msg + "\n" + user_msg_string;
+}
+
+}  // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult.
+TestResult::TestResult()
+    : death_test_count_(0),
+      elapsed_time_(0) {
+}
+
+// D'tor.
+TestResult::~TestResult() {
+}
+
+// Returns the i-th test part result among all the results. i can
+// range from 0 to total_part_count() - 1. If i is not in that range,
+// aborts the program.
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+  if (i < 0 || i >= total_part_count())
+    internal::posix::Abort();
+  return test_part_results_.at(i);
+}
+
+// Returns the i-th test property. i can range from 0 to
+// test_property_count() - 1. If i is not in that range, aborts the
+// program.
+const TestProperty& TestResult::GetTestProperty(int i) const {
+  if (i < 0 || i >= test_property_count())
+    internal::posix::Abort();
+  return test_properties_.at(i);
+}
+
+// Clears the test part results.
+void TestResult::ClearTestPartResults() {
+  test_part_results_.clear();
+}
+
+// Adds a test part result to the list.
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Adds a test property to the list. If a property with the same key as the
+// supplied property is already represented, the value of this test_property
+// replaces the old value for that key.
+void TestResult::RecordProperty(const std::string& xml_element,
+                                const TestProperty& test_property) {
+  if (!ValidateTestProperty(xml_element, test_property)) {
+    return;
+  }
+  internal::MutexLock lock(&test_properites_mutex_);
+  const std::vector<TestProperty>::iterator property_with_matching_key =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (property_with_matching_key == test_properties_.end()) {
+    test_properties_.push_back(test_property);
+    return;
+  }
+  property_with_matching_key->SetValue(test_property.value());
+}
+
+// The list of reserved attributes used in the <testsuites> element of XML
+// output.
+static const char* const kReservedTestSuitesAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "random_seed",
+  "tests",
+  "time",
+  "timestamp"
+};
+
+// The list of reserved attributes used in the <testsuite> element of XML
+// output.
+static const char* const kReservedTestSuiteAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "tests",
+  "time"
+};
+
+// The list of reserved attributes used in the <testcase> element of XML output.
+static const char* const kReservedTestCaseAttributes[] = {
+    "classname",  "name",        "status", "time",
+    "type_param", "value_param", "file",   "line"};
+
+template <int kSize>
+std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
+  return std::vector<std::string>(array, array + kSize);
+}
+
+static std::vector<std::string> GetReservedAttributesForElement(
+    const std::string& xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable but some compilers may not realizes that.
+  return std::vector<std::string>();
+}
+
+static std::string FormatWordList(const std::vector<std::string>& words) {
+  Message word_list;
+  for (size_t i = 0; i < words.size(); ++i) {
+    if (i > 0 && words.size() > 2) {
+      word_list << ", ";
+    }
+    if (i == words.size() - 1) {
+      word_list << "and ";
+    }
+    word_list << "'" << words[i] << "'";
+  }
+  return word_list.GetString();
+}
+
+static bool ValidateTestPropertyName(
+    const std::string& property_name,
+    const std::vector<std::string>& reserved_names) {
+  if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
+          reserved_names.end()) {
+    ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+                  << " (" << FormatWordList(reserved_names)
+                  << " are reserved by " << GTEST_NAME_ << ")";
+    return false;
+  }
+  return true;
+}
+
+// Adds a failure if the key is a reserved attribute of the element named
+// xml_element.  Returns true if the property is valid.
+bool TestResult::ValidateTestProperty(const std::string& xml_element,
+                                      const TestProperty& test_property) {
+  return ValidateTestPropertyName(test_property.key(),
+                                  GetReservedAttributesForElement(xml_element));
+}
+
+// Clears the object.
+void TestResult::Clear() {
+  test_part_results_.clear();
+  test_properties_.clear();
+  death_test_count_ = 0;
+  elapsed_time_ = 0;
+}
+
+// Returns true iff the test failed.
+bool TestResult::Failed() const {
+  for (int i = 0; i < total_part_count(); ++i) {
+    if (GetTestPartResult(i).failed())
+      return true;
+  }
+  return false;
+}
+
+// Returns true iff the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult& result) {
+  return result.fatally_failed();
+}
+
+// Returns true iff the test fatally failed.
+bool TestResult::HasFatalFailure() const {
+  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Returns true iff the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+  return result.nonfatally_failed();
+}
+
+// Returns true iff the test has a non-fatal failure.
+bool TestResult::HasNonfatalFailure() const {
+  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
+}
+
+// Gets the number of all test parts.  This is the sum of the number
+// of successful test parts and the number of failed test parts.
+int TestResult::total_part_count() const {
+  return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of the test properties.
+int TestResult::test_property_count() const {
+  return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// The c'tor saves the states of all flags.
+Test::Test()
+    : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
+}
+
+// The d'tor restores the states of all flags.  The actual work is
+// done by the d'tor of the gtest_flag_saver_ field, and thus not
+// visible here.
+Test::~Test() {
+}
+
+// Sets up the test fixture.
+//
+// A sub-class may override this.
+void Test::SetUp() {
+}
+
+// Tears down the test fixture.
+//
+// A sub-class may override this.
+void Test::TearDown() {
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, const std::string& value) {
+  UnitTest::GetInstance()->RecordProperty(key, value);
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, int value) {
+  Message value_message;
+  value_message << value;
+  RecordProperty(key, value_message.GetString().c_str());
+}
+
+namespace internal {
+
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message) {
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest::GetInstance()->AddTestPartResult(
+      result_type,
+      NULL,  // No info about the source file where the exception occurred.
+      -1,    // We have no info on which line caused the exception.
+      message,
+      "");   // No stack trace, either.
+}
+
+}  // namespace internal
+
+// Google Test requires all tests in the same test case to use the same test
+// fixture class.  This function checks if the current test has the
+// same fixture class as the first test in the current test case.  If
+// yes, it returns true; otherwise it generates a Google Test failure and
+// returns false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  const TestCase* const test_case = impl->current_test_case();
+
+  // Info about the first test in the current test case.
+  const TestInfo* const first_test_info = test_case->test_info_list()[0];
+  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+  const char* const first_test_name = first_test_info->name();
+
+  // Info about the current test.
+  const TestInfo* const this_test_info = impl->current_test_info();
+  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+  const char* const this_test_name = this_test_info->name();
+
+  if (this_fixture_id != first_fixture_id) {
+    // Is the first test defined using TEST?
+    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
+    // Is this test defined using TEST?
+    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
+
+    if (first_is_TEST || this_is_TEST) {
+      // Both TEST and TEST_F appear in same test case, which is incorrect.
+      // Tell the user how to fix this.
+
+      // Gets the name of the TEST and the name of the TEST_F.  Note
+      // that first_is_TEST and this_is_TEST cannot both be true, as
+      // the fixture IDs are different for the two tests.
+      const char* const TEST_name =
+          first_is_TEST ? first_test_name : this_test_name;
+      const char* const TEST_F_name =
+          first_is_TEST ? this_test_name : first_test_name;
+
+      ADD_FAILURE()
+          << "All tests in the same test case must use the same test fixture\n"
+          << "class, so mixing TEST_F and TEST in the same test case is\n"
+          << "illegal.  In test case " << this_test_info->test_case_name()
+          << ",\n"
+          << "test " << TEST_F_name << " is defined using TEST_F but\n"
+          << "test " << TEST_name << " is defined using TEST.  You probably\n"
+          << "want to change the TEST to TEST_F or move it to another test\n"
+          << "case.";
+    } else {
+      // Two fixture classes with the same name appear in two different
+      // namespaces, which is not allowed. Tell the user how to fix this.
+      ADD_FAILURE()
+          << "All tests in the same test case must use the same test fixture\n"
+          << "class.  However, in test case "
+          << this_test_info->test_case_name() << ",\n"
+          << "you defined test " << first_test_name
+          << " and test " << this_test_name << "\n"
+          << "using two different test fixture classes.  This can happen if\n"
+          << "the two classes are from different namespaces or translation\n"
+          << "units and have the same name.  You should probably rename one\n"
+          << "of the classes to put the tests into different test cases.";
+    }
+    return false;
+  }
+
+  return true;
+}
+
+#if GTEST_HAS_SEH
+
+// Adds an "exception thrown" fatal failure to the current test.  This
+// function returns its result via an output parameter pointer because VC++
+// prohibits creation of objects with destructors on stack in functions
+// using __try (see error C2712).
+static std::string* FormatSehExceptionMessage(DWORD exception_code,
+                                              const char* location) {
+  Message message;
+  message << "SEH exception with code 0x" << std::setbase(16) <<
+    exception_code << std::setbase(10) << " thrown in " << location << ".";
+
+  return new std::string(message.GetString());
+}
+
+#endif  // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Adds an "exception thrown" fatal failure to the current test.
+static std::string FormatCxxExceptionMessage(const char* description,
+                                             const char* location) {
+  Message message;
+  if (description != NULL) {
+    message << "C++ exception with description \"" << description << "\"";
+  } else {
+    message << "Unknown C++ exception";
+  }
+  message << " thrown in " << location << ".";
+
+  return message.GetString();
+}
+
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result);
+
+GoogleTestFailureException::GoogleTestFailureException(
+    const TestPartResult& failure)
+    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they were declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception.  (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function.  Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+#if GTEST_HAS_SEH
+  __try {
+    return (object->*method)();
+  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
+      GetExceptionCode())) {
+    // We create the exception message on the heap because VC++ prohibits
+    // creation of objects with destructors on stack in functions using __try
+    // (see error C2712).
+    std::string* exception_message = FormatSehExceptionMessage(
+        GetExceptionCode(), location);
+    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+                                             *exception_message);
+    delete exception_message;
+    return static_cast<Result>(0);
+  }
+#else
+  (void)location;
+  return (object->*method)();
+#endif  // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result in case of an SEH exception.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+  // NOTE: The user code can affect the way in which Google Test handles
+  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+  // after the exception is caught and either report or re-throw the
+  // exception based on the flag's value:
+  //
+  // try {
+  //   // Perform the test method.
+  // } catch (...) {
+  //   if (GTEST_FLAG(catch_exceptions))
+  //     // Report the exception as failure.
+  //   else
+  //     throw;  // Re-throws the original exception.
+  // }
+  //
+  // However, the purpose of this flag is to allow the program to drop into
+  // the debugger when the exception is thrown. On most platforms, once the
+  // control enters the catch block, the exception origin information is
+  // lost and the debugger will stop the program at the point of the
+  // re-throw in this function -- instead of at the point of the original
+  // throw statement in the code under test.  For this reason, we perform
+  // the check early, sacrificing the ability to affect Google Test's
+  // exception handling in the method where the exception is thrown.
+  if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+    try {
+      return HandleSehExceptionsInMethodIfSupported(object, method, location);
+    } catch (const AssertionException&) {  // NOLINT
+      // This failure was reported already.
+    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
+      // This exception type can only be thrown by a failed Google
+      // Test assertion with the intention of letting another testing
+      // framework catch it.  Therefore we just re-throw it.
+      throw;
+    } catch (const std::exception& e) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(e.what(), location));
+    } catch (...) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(NULL, location));
+    }
+    return static_cast<Result>(0);
+#else
+    return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif  // GTEST_HAS_EXCEPTIONS
+  } else {
+    return (object->*method)();
+  }
+}
+
+}  // namespace internal
+
+// Runs the test and updates the test result.
+void Test::Run() {
+  if (!HasSameFixtureClass()) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+  // We will run the test only if SetUp() was successful.
+  if (!HasFatalFailure()) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        this, &Test::TestBody, "the test body");
+  }
+
+  // However, we want to clean up as much as possible.  Hence we will
+  // always call TearDown(), even if SetUp() or the test body has
+  // failed.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &Test::TearDown, "TearDown()");
+}
+
+// Returns true iff the current test has a fatal failure.
+bool Test::HasFatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
+}
+
+// Returns true iff the current test has a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->
+      HasNonfatalFailure();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object. It assumes ownership of the test factory
+// object.
+TestInfo::TestInfo(const std::string& a_test_case_name,
+                   const std::string& a_name,
+                   const char* a_type_param,
+                   const char* a_value_param,
+                   internal::CodeLocation a_code_location,
+                   internal::TypeId fixture_class_id,
+                   internal::TestFactoryBase* factory)
+    : test_case_name_(a_test_case_name),
+      name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      value_param_(a_value_param ? new std::string(a_value_param) : NULL),
+      location_(a_code_location),
+      fixture_class_id_(fixture_class_id),
+      should_run_(false),
+      is_disabled_(false),
+      matches_filter_(false),
+      factory_(factory),
+      result_() {}
+
+// Destructs a TestInfo object.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   code_location:    code location where the test is defined
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    CodeLocation code_location,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory) {
+  TestInfo* const test_info =
+      new TestInfo(test_case_name, name, type_param, value_param,
+                   code_location, fixture_class_id, factory);
+  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
+  return test_info;
+}
+
+void ReportInvalidTestCaseType(const char* test_case_name,
+                               CodeLocation code_location) {
+  Message errors;
+  errors
+      << "Attempted redefinition of test case " << test_case_name << ".\n"
+      << "All tests in the same test case must use the same test fixture\n"
+      << "class.  However, in test case " << test_case_name << ", you tried\n"
+      << "to define a test using a fixture class different from the one\n"
+      << "used earlier. This can happen if the two fixture classes are\n"
+      << "from different namespaces and have the same name. You should\n"
+      << "probably rename one of the classes to put the tests into different\n"
+      << "test cases.";
+
+  GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(),
+                                          code_location.line)
+                    << " " << errors.GetString();
+}
+}  // namespace internal
+
+namespace {
+
+// A predicate that checks the test name of a TestInfo against a known
+// value.
+//
+// This is used for implementation of the TestCase class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestNameIs is copyable.
+class TestNameIs {
+ public:
+  // Constructor.
+  //
+  // TestNameIs has NO default constructor.
+  explicit TestNameIs(const char* name)
+      : name_(name) {}
+
+  // Returns true iff the test name of test_info matches name_.
+  bool operator()(const TestInfo * test_info) const {
+    return test_info && test_info->name() == name_;
+  }
+
+ private:
+  std::string name_;
+};
+
+}  // namespace
+
+namespace internal {
+
+// This method expands all parameterized tests registered with macros TEST_P
+// and INSTANTIATE_TEST_CASE_P into regular tests and registers those.
+// This will be done just once during the program runtime.
+void UnitTestImpl::RegisterParameterizedTests() {
+  if (!parameterized_tests_registered_) {
+    parameterized_test_registry_.RegisterTests();
+    parameterized_tests_registered_ = true;
+  }
+}
+
+}  // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+void TestInfo::Run() {
+  if (!should_run_) return;
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TimeInMillis start = internal::GetTimeInMillis();
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object.
+  Test* const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test if the constructor didn't generate a fatal failure.
+  // Note that the object will not be null
+  if (!Test::HasFatalFailure()) {
+    // This doesn't throw as all user code that can throw are wrapped into
+    // exception handling code.
+    test->Run();
+  }
+
+    // Deletes the test object.
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        test, &Test::DeleteSelf_, "the test fixture's destructor");
+
+  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Tells UnitTest to stop associating assertion results to this
+  // test.
+  impl->set_current_test_info(NULL);
+}
+
+// class TestCase
+
+// Gets the number of successful tests in this test case.
+int TestCase::successful_test_count() const {
+  return CountIf(test_info_list_, TestPassed);
+}
+
+// Gets the number of failed tests in this test case.
+int TestCase::failed_test_count() const {
+  return CountIf(test_info_list_, TestFailed);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int TestCase::reportable_disabled_test_count() const {
+  return CountIf(test_info_list_, TestReportableDisabled);
+}
+
+// Gets the number of disabled tests in this test case.
+int TestCase::disabled_test_count() const {
+  return CountIf(test_info_list_, TestDisabled);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int TestCase::reportable_test_count() const {
+  return CountIf(test_info_list_, TestReportable);
+}
+
+// Get the number of tests in this test case that should run.
+int TestCase::test_to_run_count() const {
+  return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests.
+int TestCase::total_test_count() const {
+  return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestCase with the given name.
+//
+// Arguments:
+//
+//   name:         name of the test case
+//   a_type_param: the name of the test case's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test case.
+//   set_up_tc:    pointer to the function that sets up the test case
+//   tear_down_tc: pointer to the function that tears down the test case
+TestCase::TestCase(const char* a_name, const char* a_type_param,
+                   Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc)
+    : name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      set_up_tc_(set_up_tc),
+      tear_down_tc_(tear_down_tc),
+      should_run_(false),
+      elapsed_time_(0) {
+}
+
+// Destructor of TestCase.
+TestCase::~TestCase() {
+  // Deletes every Test in the collection.
+  ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+const TestInfo* TestCase::GetTestInfo(int i) const {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+TestInfo* TestCase::GetMutableTestInfo(int i) {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Adds a test to this test case.  Will delete the test upon
+// destruction of the TestCase object.
+void TestCase::AddTestInfo(TestInfo * test_info) {
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(static_cast<int>(test_indices_.size()));
+}
+
+// Runs every test in this TestCase.
+void TestCase::Run() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_case(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  repeater->OnTestCaseStart(*this);
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
+
+  const internal::TimeInMillis start = internal::GetTimeInMillis();
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Run();
+  }
+  elapsed_time_ = internal::GetTimeInMillis() - start;
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
+
+  repeater->OnTestCaseEnd(*this);
+  impl->set_current_test_case(NULL);
+}
+
+// Clears the results of all tests in this test case.
+void TestCase::ClearResult() {
+  ad_hoc_test_result_.Clear();
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test case.
+void TestCase::ShuffleTests(internal::Random* random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to before the first shuffle.
+void TestCase::UnshuffleTests() {
+  for (size_t i = 0; i < test_indices_.size(); i++) {
+    test_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Formats a countable noun.  Depending on its quantity, either the
+// singular form or the plural form is used. e.g.
+//
+// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count,
+                                       const char * singular_form,
+                                       const char * plural_form) {
+  return internal::StreamableToString(count) + " " +
+      (count == 1 ? singular_form : plural_form);
+}
+
+// Formats the count of tests.
+static std::string FormatTestCount(int test_count) {
+  return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test cases.
+static std::string FormatTestCaseCount(int test_case_count) {
+  return FormatCountableNoun(test_case_count, "test case", "test cases");
+}
+
+// Converts a TestPartResult::Type enum to human-friendly string
+// representation.  Both kNonFatalFailure and kFatalFailure are translated
+// to "Failure", as the user usually doesn't care about the difference
+// between the two when viewing the test result.
+static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+  switch (type) {
+    case TestPartResult::kSuccess:
+      return "Success";
+
+    case TestPartResult::kNonFatalFailure:
+    case TestPartResult::kFatalFailure:
+#ifdef _MSC_VER
+      return "error: ";
+#else
+      return "Failure\n";
+#endif
+    default:
+      return "Unknown result type";
+  }
+}
+
+namespace internal {
+
+// Prints a TestPartResult to an std::string.
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result) {
+  return (Message()
+          << internal::FormatFileLocation(test_part_result.file_name(),
+                                          test_part_result.line_number())
+          << " " << TestPartResultTypeToString(test_part_result.type())
+          << test_part_result.message()).GetString();
+}
+
+// Prints a TestPartResult.
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+  const std::string& result =
+      PrintTestPartResultToString(test_part_result);
+  printf("%s\n", result.c_str());
+  fflush(stdout);
+  // If the test program runs in Visual Studio or a debugger, the
+  // following statements add the test part result message to the Output
+  // window such that the user can double-click on it to jump to the
+  // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // We don't call OutputDebugString*() on Windows Mobile, as printing
+  // to stdout is done by OutputDebugString() there already - we don't
+  // want the same message printed twice.
+  ::OutputDebugStringA(result.c_str());
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+
+enum GTestColor {
+  COLOR_DEFAULT,
+  COLOR_RED,
+  COLOR_GREEN,
+  COLOR_YELLOW
+};
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
+    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+
+// Returns the character attribute for the given color.
+static WORD GetColorAttribute(GTestColor color) {
+  switch (color) {
+    case COLOR_RED:    return FOREGROUND_RED;
+    case COLOR_GREEN:  return FOREGROUND_GREEN;
+    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
+    default:           return 0;
+  }
+}
+
+static int GetBitOffset(WORD color_mask) {
+  if (color_mask == 0) return 0;
+
+  int bitOffset = 0;
+  while ((color_mask & 1) == 0) {
+    color_mask >>= 1;
+    ++bitOffset;
+  }
+  return bitOffset;
+}
+
+static WORD GetNewColor(GTestColor color, WORD old_color_attrs) {
+  // Let's reuse the BG
+  static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN |
+                                      BACKGROUND_RED | BACKGROUND_INTENSITY;
+  static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN |
+                                      FOREGROUND_RED | FOREGROUND_INTENSITY;
+  const WORD existing_bg = old_color_attrs & background_mask;
+
+  WORD new_color =
+      GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY;
+  static const int bg_bitOffset = GetBitOffset(background_mask);
+  static const int fg_bitOffset = GetBitOffset(foreground_mask);
+
+  if (((new_color & background_mask) >> bg_bitOffset) ==
+      ((new_color & foreground_mask) >> fg_bitOffset)) {
+    new_color ^= FOREGROUND_INTENSITY;  // invert intensity
+  }
+  return new_color;
+}
+
+#else
+
+// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
+// an invalid input.
+static const char* GetAnsiColorCode(GTestColor color) {
+  switch (color) {
+    case COLOR_RED:     return "1";
+    case COLOR_GREEN:   return "2";
+    case COLOR_YELLOW:  return "3";
+    default:            return NULL;
+  };
+}
+
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns true iff Google Test should use colors in the output.
+bool ShouldUseColor(bool stdout_is_tty) {
+  const char* const gtest_color = GTEST_FLAG(color).c_str();
+
+  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+    // On Windows the TERM variable is usually not set, but the
+    // console there does support colors.
+    return stdout_is_tty;
+#else
+    // On non-Windows platforms, we rely on the TERM variable.
+    const char* const term = posix::GetEnv("TERM");
+    const bool term_supports_color =
+        String::CStringEquals(term, "xterm") ||
+        String::CStringEquals(term, "xterm-color") ||
+        String::CStringEquals(term, "xterm-256color") ||
+        String::CStringEquals(term, "screen") ||
+        String::CStringEquals(term, "screen-256color") ||
+        String::CStringEquals(term, "tmux") ||
+        String::CStringEquals(term, "tmux-256color") ||
+        String::CStringEquals(term, "rxvt-unicode") ||
+        String::CStringEquals(term, "rxvt-unicode-256color") ||
+        String::CStringEquals(term, "linux") ||
+        String::CStringEquals(term, "cygwin");
+    return stdout_is_tty && term_supports_color;
+#endif  // GTEST_OS_WINDOWS
+  }
+
+  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+      String::CStringEquals(gtest_color, "1");
+  // We take "yes", "true", "t", and "1" as meaning "yes".  If the
+  // value is neither one of these nor "auto", we treat it as "no" to
+  // be conservative.
+}
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+static void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || \
+    GTEST_OS_IOS || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+  const bool use_color = AlwaysFalse();
+#else
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
+  // The '!= 0' comparison is necessary to satisfy MSVC 7.1.
+
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
+    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+  const WORD new_color = GetNewColor(color, old_color_attrs);
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle, new_color);
+
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+// Text printed in Google Test's text output and --gtest_list_tests
+// output to label the type parameter and value parameter for a test.
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+static void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+  const char* const type_param = test_info.type_param();
+  const char* const value_param = test_info.value_param();
+
+  if (type_param != NULL || value_param != NULL) {
+    printf(", where ");
+    if (type_param != NULL) {
+      printf("%s = %s", kTypeParamLabel, type_param);
+      if (value_param != NULL)
+        printf(" and ");
+    }
+    if (value_param != NULL) {
+      printf("%s = %s", kValueParamLabel, value_param);
+    }
+  }
+}
+
+// This class implements the TestEventListener interface.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+  PrettyUnitTestResultPrinter() {}
+  static void PrintTestName(const char * test_case, const char * test) {
+    printf("%s.%s", test_case, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+
+ private:
+  static void PrintFailedTests(const UnitTest& unit_test);
+};
+
+  // Fired before each iteration of tests starts.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest& unit_test, int iteration) {
+  if (GTEST_FLAG(repeat) != 1)
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+
+  const char* const filter = GTEST_FLAG(filter).c_str();
+
+  // Prints the filter if it's not *.  This reminds the user that some
+  // tests may be skipped.
+  if (!String::CStringEquals(filter, kUniversalFilter)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: This is test shard %d of %s.\n",
+                  static_cast<int>(shard_index) + 1,
+                  internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG(shuffle)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("Running %s from %s.\n",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment set-up.\n");
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_case.name());
+  if (test_case.type_param() == NULL) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Called after an assertion failure.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  // If the test part succeeded, we don't need to do anything.
+  if (result.type() == TestPartResult::kSuccess)
+    return;
+
+  // Print failure message from the assertion (e.g. expected this and got that).
+  PrintTestPartResult(result);
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (test_info.result()->Passed()) {
+    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+  } else {
+    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  }
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  if (test_info.result()->Failed())
+    PrintFullTestCommentIfPresent(test_info);
+
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms)\n", internal::StreamableToString(
+           test_info.result()->elapsed_time()).c_str());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+  if (!GTEST_FLAG(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n",
+         counts.c_str(), test_case.name(),
+         internal::StreamableToString(test_case.elapsed_time()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment tear-down\n");
+  fflush(stdout);
+}
+
+// Internal helper for printing the list of failed tests.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+  const int failed_test_count = unit_test.failed_test_count();
+  if (failed_test_count == 0) {
+    return;
+  }
+
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    const TestCase& test_case = *unit_test.GetTestCase(i);
+    if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_case.total_test_count(); ++j) {
+      const TestInfo& test_info = *test_case.GetTestInfo(j);
+      if (!test_info.should_run() || test_info.result()->Passed()) {
+        continue;
+      }
+      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      printf("%s.%s", test_case.name(), test_info.name());
+      PrintFullTestCommentIfPresent(test_info);
+      printf("\n");
+    }
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                     int /*iteration*/) {
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  int num_failures = unit_test.failed_test_count();
+  if (!unit_test.Passed()) {
+    const int failed_test_count = unit_test.failed_test_count();
+    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
+    printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
+    PrintFailedTests(unit_test);
+    printf("\n%2d FAILED %s\n", num_failures,
+                        num_failures == 1 ? "TEST" : "TESTS");
+  }
+
+  int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+    if (!num_failures) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(COLOR_YELLOW,
+                  "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled,
+                  num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  virtual ~TestEventRepeater();
+  void Append(TestEventListener *listener);
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  virtual void OnTestProgramStart(const UnitTest& unit_test);
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& unit_test);
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events.
+  std::vector<TestEventListener*> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {
+  ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);
+}
+
+// FIXME: Factor the search functionality into Vector::Find.
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+  for (size_t i = 0; i < listeners_.size(); ++i) {
+    if (listeners_[i] == listener) {
+      listeners_.erase(listeners_.begin() + i);
+      return listener;
+    }
+  }
+
+  return NULL;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (size_t i = 0; i < listeners_.size(); i++) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+// This defines a member that forwards the call to all listeners in reverse
+// order.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = 0; i < listeners_.size(); i++) {
+      listeners_[i]->OnTestIterationStart(unit_test, iteration);
+    }
+  }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (forwarding_enabled_) {
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) {
+      listeners_[i]->OnTestIterationEnd(unit_test, iteration);
+    }
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char* output_file);
+
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  void ListTestsMatchingFilter(const std::vector<TestCase*>& test_cases);
+
+  // Prints an XML summary of all unit tests.
+  static void PrintXmlTestsList(std::ostream* stream,
+                                const std::vector<TestCase*>& test_cases);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) || c >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str.  If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string& str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char* str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(std::ostream* stream,
+                                 const std::string& element_name,
+                                 const std::string& name,
+                                 const std::string& value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams an XML representation of a TestInfo object.
+  static void OutputXmlTestInfo(::std::ostream* stream,
+                                const char* test_case_name,
+                                const TestInfo& test_info);
+
+  // Prints an XML representation of a TestCase object
+  static void PrintXmlTestCase(::std::ostream* stream,
+                               const TestCase& test_case);
+
+  // Prints an XML summary of unit_test to output stream out.
+  static void PrintXmlUnitTest(::std::ostream* stream,
+                               const UnitTest& unit_test);
+
+  // Produces a string representing the test properties in a result as space
+  // delimited XML attributes based on the property key="value" pairs.
+  // When the std::string is not empty, it includes a space at the beginning,
+  // to delimit this attribute from prior attributes.
+  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
+
+  // Streams an XML representation of the test properties of a TestResult
+  // object.
+  static void OutputXmlTestProperties(std::ostream* stream,
+                                      const TestResult& result);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+};
+
+// Creates a new XmlUnitTestResultPrinter.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file) {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "XML output file may not be null";
+  }
+}
+
+// Called after the unit test ends.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FILE* xmlout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintXmlUnitTest(&stream, unit_test);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
+    const std::vector<TestCase*>& test_cases) {
+  FILE* xmlout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintXmlTestsList(&stream, test_cases);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+// Returns an XML-escaped copy of the input string str.  If is_attribute
+// is true, the text is meant to appear as an attribute value, and
+// normalizable whitespace is preserved by replacing it with character
+// references.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
+// FIXME: It might be nice to have a minimally invasive, human-readable
+// escaping scheme for invalid characters, rather than dropping them.
+std::string XmlUnitTestResultPrinter::EscapeXml(
+    const std::string& str, bool is_attribute) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '<':
+        m << "&lt;";
+        break;
+      case '>':
+        m << "&gt;";
+        break;
+      case '&':
+        m << "&amp;";
+        break;
+      case '\'':
+        if (is_attribute)
+          m << "&apos;";
+        else
+          m << '\'';
+        break;
+      case '"':
+        if (is_attribute)
+          m << "&quot;";
+        else
+          m << '"';
+        break;
+      default:
+        if (IsValidXmlCharacter(ch)) {
+          if (is_attribute && IsNormalizableWhitespace(ch))
+            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+              << ";";
+          else
+            m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string& str) {
+  std::string output;
+  output.reserve(str.size());
+  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
+    if (IsValidXmlCharacter(*it))
+      output.push_back(*it);
+
+  return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+// GOOGLETEST_CM0009 DO NOT DELETE
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3);
+  return ss.str();
+}
+
+static bool PortableLocaltime(time_t seconds, struct tm* out) {
+#if defined(_MSC_VER)
+  return localtime_s(out, &seconds) == 0;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+  // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
+  // Windows' localtime(), which has a thread-local tm buffer.
+  struct tm* tm_ptr = localtime(&seconds);  // NOLINT
+  if (tm_ptr == NULL)
+    return false;
+  *out = *tm_ptr;
+  return true;
+#else
+  return localtime_r(&seconds, out) != NULL;
+#endif
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format, without the timezone information.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+      String::FormatIntWidth2(time_struct.tm_min) + ":" +
+      String::FormatIntWidth2(time_struct.tm_sec);
+}
+
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  const char* segment = data;
+  *stream << "<![CDATA[";
+  for (;;) {
+    const char* const next_segment = strstr(segment, "]]>");
+    if (next_segment != NULL) {
+      stream->write(
+          segment, static_cast<std::streamsize>(next_segment - segment));
+      *stream << "]]>]]&gt;<![CDATA[";
+      segment = next_segment + strlen("]]>");
+    } else {
+      *stream << segment;
+      break;
+    }
+  }
+  *stream << "]]>";
+}
+
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    const std::string& value) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+                   allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Prints an XML representation of a TestInfo object.
+// FIXME: There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_case_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  const std::string kTestcase = "testcase";
+
+  if (test_info.is_in_another_shard()) {
+    return;
+  }
+
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+
+  if (test_info.value_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "value_param",
+                       test_info.value_param());
+  }
+  if (test_info.type_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param());
+  }
+  if (GTEST_FLAG(list_tests)) {
+    OutputXmlAttribute(stream, kTestcase, "file", test_info.file());
+    OutputXmlAttribute(stream, kTestcase, "line",
+                       StreamableToString(test_info.line()));
+    *stream << " />\n";
+    return;
+  }
+
+  OutputXmlAttribute(stream, kTestcase, "status",
+                     test_info.should_run() ? "run" : "notrun");
+  OutputXmlAttribute(stream, kTestcase, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1) {
+        *stream << ">\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string summary = location + "\n" + part.summary();
+      *stream << "      <failure message=\""
+              << EscapeXmlAttribute(summary.c_str())
+              << "\" type=\"\">";
+      const std::string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  if (failures == 0 && result.test_property_count() == 0) {
+    *stream << " />\n";
+  } else {
+    if (failures == 0) {
+      *stream << ">\n";
+    }
+    OutputXmlTestProperties(stream, result);
+    *stream << "    </testcase>\n";
+  }
+}
+
+// Prints an XML representation of a TestCase object
+void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
+                                                const TestCase& test_case) {
+  const std::string kTestsuite = "testsuite";
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_case.reportable_test_count()));
+  if (!GTEST_FLAG(list_tests)) {
+    OutputXmlAttribute(stream, kTestsuite, "failures",
+                       StreamableToString(test_case.failed_test_count()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "disabled",
+        StreamableToString(test_case.reportable_disabled_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+    OutputXmlAttribute(stream, kTestsuite, "time",
+                       FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
+    *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result());
+  }
+  *stream << ">\n";
+  for (int i = 0; i < test_case.total_test_count(); ++i) {
+    if (test_case.GetTestInfo(i)->is_reportable())
+      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
+  }
+  *stream << "  </" << kTestsuite << ">\n";
+}
+
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+
+  if (GTEST_FLAG(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
+      PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+void XmlUnitTestResultPrinter::PrintXmlTestsList(
+    std::ostream* stream, const std::vector<TestCase*>& test_cases) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  int total_tests = 0;
+  for (size_t i = 0; i < test_cases.size(); ++i) {
+    total_tests += test_cases[i]->total_test_count();
+  }
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(total_tests));
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (size_t i = 0; i < test_cases.size(); ++i) {
+    PrintXmlTestCase(stream, *test_cases[i]);
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << " " << property.key() << "="
+        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+void XmlUnitTestResultPrinter::OutputXmlTestProperties(
+    std::ostream* stream, const TestResult& result) {
+  const std::string kProperties = "properties";
+  const std::string kProperty = "property";
+
+  if (result.test_property_count() <= 0) {
+    return;
+  }
+
+  *stream << "<" << kProperties << ">\n";
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    *stream << "<" << kProperty;
+    *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
+    *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
+    *stream << "/>\n";
+  }
+  *stream << "</" << kProperties << ">\n";
+}
+
+// End XmlUnitTestResultPrinter
+
+// This class generates an JSON output file.
+class JsonUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit JsonUnitTestResultPrinter(const char* output_file);
+
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+
+  // Prints an JSON summary of all unit tests.
+  static void PrintJsonTestList(::std::ostream* stream,
+                                const std::vector<TestCase*>& test_cases);
+
+ private:
+  // Returns an JSON-escaped copy of the input string str.
+  static std::string EscapeJson(const std::string& str);
+
+  //// Verifies that the given attribute belongs to the given element and
+  //// streams the attribute as JSON.
+  static void OutputJsonKey(std::ostream* stream,
+                            const std::string& element_name,
+                            const std::string& name,
+                            const std::string& value,
+                            const std::string& indent,
+                            bool comma = true);
+  static void OutputJsonKey(std::ostream* stream,
+                            const std::string& element_name,
+                            const std::string& name,
+                            int value,
+                            const std::string& indent,
+                            bool comma = true);
+
+  // Streams a JSON representation of a TestInfo object.
+  static void OutputJsonTestInfo(::std::ostream* stream,
+                                 const char* test_case_name,
+                                 const TestInfo& test_info);
+
+  // Prints a JSON representation of a TestCase object
+  static void PrintJsonTestCase(::std::ostream* stream,
+                                const TestCase& test_case);
+
+  // Prints a JSON summary of unit_test to output stream out.
+  static void PrintJsonUnitTest(::std::ostream* stream,
+                                const UnitTest& unit_test);
+
+  // Produces a string representing the test properties in a result as
+  // a JSON dictionary.
+  static std::string TestPropertiesAsJson(const TestResult& result,
+                                          const std::string& indent);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
+};
+
+// Creates a new JsonUnitTestResultPrinter.
+JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file) {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "JSON output file may not be null";
+  }
+}
+
+void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FILE* jsonout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintJsonUnitTest(&stream, unit_test);
+  fprintf(jsonout, "%s", StringStreamToString(&stream).c_str());
+  fclose(jsonout);
+}
+
+// Returns an JSON-escaped copy of the input string str.
+std::string JsonUnitTestResultPrinter::EscapeJson(const std::string& str) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '\\':
+      case '"':
+      case '/':
+        m << '\\' << ch;
+        break;
+      case '\b':
+        m << "\\b";
+        break;
+      case '\t':
+        m << "\\t";
+        break;
+      case '\n':
+        m << "\\n";
+        break;
+      case '\f':
+        m << "\\f";
+        break;
+      case '\r':
+        m << "\\r";
+        break;
+      default:
+        if (ch < ' ') {
+          m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch));
+        } else {
+          m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// The following routines generate an JSON representation of a UnitTest
+// object.
+
+// Formats the given time in milliseconds as seconds.
+static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3) << "s";
+  return ss.str();
+}
+
+// Converts the given epoch time in milliseconds to a date string in the
+// RFC3339 format, without the timezone information.
+static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+      String::FormatIntWidth2(time_struct.tm_min) + ":" +
+      String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+}
+
+static inline std::string Indent(int width) {
+  return std::string(width, ' ');
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    const std::string& value,
+    const std::string& indent,
+    bool comma) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+                   allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
+  if (comma)
+    *stream << ",\n";
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    int value,
+    const std::string& indent,
+    bool comma) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+                   allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": " << StreamableToString(value);
+  if (comma)
+    *stream << ",\n";
+}
+
+// Prints a JSON representation of a TestInfo object.
+void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
+                                                   const char* test_case_name,
+                                                   const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  const std::string kTestcase = "testcase";
+  const std::string kIndent = Indent(10);
+
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, kTestcase, "name", test_info.name(), kIndent);
+
+  if (test_info.value_param() != NULL) {
+    OutputJsonKey(stream, kTestcase, "value_param",
+                  test_info.value_param(), kIndent);
+  }
+  if (test_info.type_param() != NULL) {
+    OutputJsonKey(stream, kTestcase, "type_param", test_info.type_param(),
+                  kIndent);
+  }
+  if (GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, kTestcase, "file", test_info.file(), kIndent);
+    OutputJsonKey(stream, kTestcase, "line", test_info.line(), kIndent, false);
+    *stream << "\n" << Indent(8) << "}";
+    return;
+  }
+
+  OutputJsonKey(stream, kTestcase, "status",
+                test_info.should_run() ? "RUN" : "NOTRUN", kIndent);
+  OutputJsonKey(stream, kTestcase, "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent);
+  OutputJsonKey(stream, kTestcase, "classname", test_case_name, kIndent, false);
+  *stream << TestPropertiesAsJson(result, kIndent);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      *stream << ",\n";
+      if (++failures == 1) {
+        *stream << kIndent << "\"" << "failures" << "\": [\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string message = EscapeJson(location + "\n" + part.message());
+      *stream << kIndent << "  {\n"
+              << kIndent << "    \"failure\": \"" << message << "\",\n"
+              << kIndent << "    \"type\": \"\"\n"
+              << kIndent << "  }";
+    }
+  }
+
+  if (failures > 0)
+    *stream << "\n" << kIndent << "]";
+  *stream << "\n" << Indent(8) << "}";
+}
+
+// Prints an JSON representation of a TestCase object
+void JsonUnitTestResultPrinter::PrintJsonTestCase(std::ostream* stream,
+                                                  const TestCase& test_case) {
+  const std::string kTestsuite = "testsuite";
+  const std::string kIndent = Indent(6);
+
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_case.name(), kIndent);
+  OutputJsonKey(stream, kTestsuite, "tests", test_case.reportable_test_count(),
+                kIndent);
+  if (!GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, kTestsuite, "failures", test_case.failed_test_count(),
+                  kIndent);
+    OutputJsonKey(stream, kTestsuite, "disabled",
+                  test_case.reportable_disabled_test_count(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent);
+    OutputJsonKey(stream, kTestsuite, "time",
+                  FormatTimeInMillisAsDuration(test_case.elapsed_time()),
+                  kIndent, false);
+    *stream << TestPropertiesAsJson(test_case.ad_hoc_test_result(), kIndent)
+            << ",\n";
+  }
+
+  *stream << kIndent << "\"" << kTestsuite << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < test_case.total_test_count(); ++i) {
+    if (test_case.GetTestInfo(i)->is_reportable()) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      OutputJsonTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
+    }
+  }
+  *stream << "\n" << kIndent << "]\n" << Indent(4) << "}";
+}
+
+// Prints a JSON summary of unit_test to output stream out.
+void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
+                                                  const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+
+  OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "disabled",
+                unit_test.reportable_disabled_test_count(), kIndent);
+  OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
+  if (GTEST_FLAG(shuffle)) {
+    OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
+                  kIndent);
+  }
+  OutputJsonKey(stream, kTestsuites, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "time",
+                FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent,
+                false);
+
+  *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent)
+          << ",\n";
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    if (unit_test.GetTestCase(i)->reportable_test_count() > 0) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      PrintJsonTestCase(stream, *unit_test.GetTestCase(i));
+    }
+  }
+
+  *stream << "\n" << kIndent << "]\n" << "}\n";
+}
+
+void JsonUnitTestResultPrinter::PrintJsonTestList(
+    std::ostream* stream, const std::vector<TestCase*>& test_cases) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+  int total_tests = 0;
+  for (size_t i = 0; i < test_cases.size(); ++i) {
+    total_tests += test_cases[i]->total_test_count();
+  }
+  OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent);
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  for (size_t i = 0; i < test_cases.size(); ++i) {
+    if (i != 0) {
+      *stream << ",\n";
+    }
+    PrintJsonTestCase(stream, *test_cases[i]);
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+// Produces a string representing the test properties in a result as
+// a JSON dictionary.
+std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
+    const TestResult& result, const std::string& indent) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << ",\n" << indent << "\"" << property.key() << "\": "
+               << "\"" << EscapeJson(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End JsonUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
+std::string StreamingListener::UrlEncode(const char* str) {
+  std::string result;
+  result.reserve(strlen(str) + 1);
+  for (char ch = *str; ch != '\0'; ch = *++str) {
+    switch (ch) {
+      case '%':
+      case '=':
+      case '&':
+      case '\n':
+        result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+        break;
+      default:
+        result.push_back(ch);
+        break;
+    }
+  }
+  return result;
+}
+
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = NULL;
+
+  // Use the getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num = getaddrinfo(
+      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(
+        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+    if (sockfd_ != -1) {
+      // Connect the client socket to the server socket.
+      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+        close(sockfd_);
+        sockfd_ = -1;
+      }
+    }
+  }
+
+  freeaddrinfo(servinfo);  // all done with this structure
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class Streaming Listener
+#endif  // GTEST_CAN_STREAM_RESULTS__
+
+// class OsStackTraceGetter
+
+const char* const OsStackTraceGetterInterface::kElidedFramesMarker =
+    "... " GTEST_NAME_ " internal frames ...";
+
+std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  std::string result;
+
+  if (max_depth <= 0) {
+    return result;
+  }
+
+  max_depth = std::min(max_depth, kMaxStackTraceDepth);
+
+  std::vector<void*> raw_stack(max_depth);
+  // Skips the frames requested by the caller, plus this function.
+  const int raw_stack_size =
+      absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1);
+
+  void* caller_frame = nullptr;
+  {
+    MutexLock lock(&mutex_);
+    caller_frame = caller_frame_;
+  }
+
+  for (int i = 0; i < raw_stack_size; ++i) {
+    if (raw_stack[i] == caller_frame &&
+        !GTEST_FLAG(show_internal_stack_frames)) {
+      // Add a marker to the trace and stop adding frames.
+      absl::StrAppend(&result, kElidedFramesMarker, "\n");
+      break;
+    }
+
+    char tmp[1024];
+    const char* symbol = "(unknown)";
+    if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) {
+      symbol = tmp;
+    }
+
+    char line[1024];
+    snprintf(line, sizeof(line), "  %p: %s\n", raw_stack[i], symbol);
+    result += line;
+  }
+
+  return result;
+
+#else  // !GTEST_HAS_ABSL
+  static_cast<void>(max_depth);
+  static_cast<void>(skip_count);
+  return "";
+#endif  // GTEST_HAS_ABSL
+}
+
+void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  void* caller_frame = nullptr;
+  if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) {
+    caller_frame = nullptr;
+  }
+
+  MutexLock lock(&mutex_);
+  caller_frame_ = caller_frame;
+#endif  // GTEST_HAS_ABSL
+}
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+class ScopedPrematureExitFile {
+ public:
+  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
+      : premature_exit_filepath_(premature_exit_filepath ?
+                                 premature_exit_filepath : "") {
+    // If a path to the premature-exit file is specified...
+    if (!premature_exit_filepath_.empty()) {
+      // create the file with a single "0" character in it.  I/O
+      // errors are ignored as there's nothing better we can do and we
+      // don't want to fail the test because of this.
+      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+      fwrite("0", 1, 1, pfile);
+      fclose(pfile);
+    }
+  }
+
+  ~ScopedPrematureExitFile() {
+    if (!premature_exit_filepath_.empty()) {
+      int retval = remove(premature_exit_filepath_.c_str());
+      if (retval) {
+        GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \""
+                          << premature_exit_filepath_ << "\" with error "
+                          << retval;
+      }
+    }
+  }
+
+ private:
+  const std::string premature_exit_filepath_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+};
+
+}  // namespace internal
+
+// class TestEventListeners
+
+TestEventListeners::TestEventListeners()
+    : repeater_(new internal::TestEventRepeater()),
+      default_result_printer_(NULL),
+      default_xml_generator_(NULL) {
+}
+
+TestEventListeners::~TestEventListeners() { delete repeater_; }
+
+// Returns the standard listener responsible for the default console
+// output.  Can be removed from the listeners list to shut down default
+// console output.  Note that removing this object from the listener list
+// with Release transfers its ownership to the user.
+void TestEventListeners::Append(TestEventListener* listener) {
+  repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it.  It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+  if (listener == default_result_printer_)
+    default_result_printer_ = NULL;
+  else if (listener == default_xml_generator_)
+    default_xml_generator_ = NULL;
+  return repeater_->Release(listener);
+}
+
+// Returns repeater that broadcasts the TestEventListener events to all
+// subscribers.
+TestEventListener* TestEventListeners::repeater() { return repeater_; }
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+  if (default_result_printer_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_result_printer_);
+    default_result_printer_ = listener;
+    if (listener != NULL)
+      Append(listener);
+  }
+}
+
+// Sets the default_xml_generator attribute to the provided listener.  The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+  if (default_xml_generator_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_xml_generator_);
+    default_xml_generator_ = listener;
+    if (listener != NULL)
+      Append(listener);
+  }
+}
+
+// Controls whether events will be forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  return repeater_->forwarding_enabled();
+}
+
+void TestEventListeners::SuppressEventForwarding() {
+  repeater_->set_forwarding_enabled(false);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object.  The first time this method is
+// called, a UnitTest object is constructed and returned.  Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest* UnitTest::GetInstance() {
+  // When compiled with MSVC 7.1 in optimized mode, destroying the
+  // UnitTest object upon exiting the program messes up the exit code,
+  // causing successful tests to appear failed.  We have to use a
+  // different implementation in this case to bypass the compiler bug.
+  // This implementation makes the compiler happy, at the cost of
+  // leaking the UnitTest object.
+
+  // CodeGear C++Builder insists on a public destructor for the
+  // default implementation.  Use this implementation to keep good OO
+  // design with private destructor.
+
+#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+  static UnitTest* const instance = new UnitTest;
+  return instance;
+#else
+  static UnitTest instance;
+  return &instance;
+#endif  // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+}
+
+// Gets the number of successful test cases.
+int UnitTest::successful_test_case_count() const {
+  return impl()->successful_test_case_count();
+}
+
+// Gets the number of failed test cases.
+int UnitTest::failed_test_case_count() const {
+  return impl()->failed_test_case_count();
+}
+
+// Gets the number of all test cases.
+int UnitTest::total_test_case_count() const {
+  return impl()->total_test_case_count();
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTest::test_case_to_run_count() const {
+  return impl()->test_case_to_run_count();
+}
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+  return impl()->successful_test_count();
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTest::reportable_disabled_test_count() const {
+  return impl()->reportable_disabled_test_count();
+}
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+  return impl()->disabled_test_count();
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTest::reportable_test_count() const {
+  return impl()->reportable_test_count();
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const { return impl()->total_test_count(); }
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
+
+// Gets the time of the test program start, in ms from the start of the
+// UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+    return impl()->start_timestamp();
+}
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  return impl()->elapsed_time();
+}
+
+// Returns true iff the unit test passed (i.e. all test cases passed).
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true iff the unit test failed (i.e. some test case failed
+// or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+const TestCase* UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+
+// Returns the TestResult containing information on test failures and
+// properties logged outside of individual test cases.
+const TestResult& UnitTest::ad_hoc_test_result() const {
+  return *impl()->ad_hoc_test_result();
+}
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+TestCase* UnitTest::GetMutableTestCase(int i) {
+  return impl()->GetMutableTestCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners& UnitTest::listeners() {
+  return *impl()->listeners();
+}
+
+// Registers and returns a global test environment.  When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env == NULL) {
+    return NULL;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(
+    TestPartResult::Type result_type,
+    const char* file_name,
+    int line_number,
+    const std::string& message,
+    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+         i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result =
+    TestPartResult(result_type, file_name, line_number,
+                   msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
+
+  if (result_type != TestPartResult::kSuccess) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#elif (!defined(__native_client__)) &&            \
+    ((defined(__clang__) || defined(__GNUC__)) && \
+     (defined(__x86_64__) || defined(__i386__)))
+      // with clang/gcc we can achieve the same effect on x86 by invoking int3
+      asm("int3");
+#else
+      // Dereference NULL through a volatile pointer to prevent the compiler
+      // from removing. We use this rather than abort() or __builtin_trap() for
+      // portability: Symbian doesn't implement abort() well, and some debuggers
+      // don't correctly trap abort().
+      *static_cast<volatile int*>(NULL) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+// from SetUpTestCase or TearDownTestCase, or to the global property set
+// when invoked elsewhere.  If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
+  impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+  const bool in_death_test_child_process =
+      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+  // Google Test implements this protocol for catching that a test
+  // program exits before returning control to Google Test:
+  //
+  //   1. Upon start, Google Test creates a file whose absolute path
+  //      is specified by the environment variable
+  //      TEST_PREMATURE_EXIT_FILE.
+  //   2. When Google Test has finished its work, it deletes the file.
+  //
+  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+  // running a Google-Test-based test program and check the existence
+  // of the file at the end of the test execution to see if it has
+  // exited prematurely.
+
+  // If we are in the child process of a death test, don't
+  // create/delete the premature exit file, as doing so is unnecessary
+  // and will confuse the parent process.  Otherwise, create/delete
+  // the file upon entering/leaving this function.  If the program
+  // somehow exits before this function has a chance to return, the
+  // premature-exit file will be left undeleted, causing a test runner
+  // that understands the premature-exit-file protocol to report the
+  // test as having failed.
+  const internal::ScopedPrematureExitFile premature_exit_file(
+      in_death_test_child_process ?
+      NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_OS_WINDOWS
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+# endif  // !GTEST_OS_WINDOWS_MOBILE
+
+# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+# endif
+
+# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    //
+    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
+    // Users of prior VC versions shall suffer the agony and pain of
+    // clicking through the countless debug dialogs.
+    // FIXME: find a way to suppress the abort dialog() in the
+    // debug mode when compiled with VC 7.1 or lower.
+    if (!GTEST_FLAG(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+# endif
+  }
+#endif  // GTEST_OS_WINDOWS
+
+  return internal::HandleExceptionsInMethodIfSupported(
+      impl(),
+      &internal::UnitTestImpl::RunAllTests,
+      "auxiliary test code (environments or event listeners)") ? 0 : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char* UnitTest::original_working_dir() const {
+  return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestCase object for the test that's currently running,
+// or NULL if no test is running.
+const TestCase* UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_case();
+}
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+const TestInfo* UnitTest::current_test_info() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+// Returns ParameterizedTestCaseRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+internal::ParameterizedTestCaseRegistry&
+    UnitTest::parameterized_test_registry()
+        GTEST_LOCK_EXCLUDED_(mutex_) {
+  return impl_->parameterized_test_registry();
+}
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() {
+  impl_ = new internal::UnitTestImpl(this);
+}
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() {
+  delete impl_;
+}
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+void UnitTest::PopGTestTrace()
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+    : parent_(parent),
+      GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+      GTEST_DISABLE_MSC_WARNINGS_POP_()
+      global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+      parameterized_test_registry_(),
+      parameterized_tests_registered_(false),
+      last_death_test_case_(-1),
+      current_test_case_(NULL),
+      current_test_info_(NULL),
+      ad_hoc_test_result_(),
+      os_stack_trace_getter_(NULL),
+      post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),  // Will be reseeded before first use.
+      start_timestamp_(0),
+      elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestCase.
+  ForEach(test_cases_, internal::Delete<TestCase>);
+
+  // Deletes every Environment.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  delete os_stack_trace_getter_;
+}
+
+// Adds a TestProperty to the current TestResult object when invoked in a
+// context of a test, to current test case's ad_hoc_test_result when invoke
+// from SetUpTestCase/TearDownTestCase, or to the global property set
+// otherwise.  If the result already contains a property with the same key,
+// the value will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
+  std::string xml_element;
+  TestResult* test_result;  // TestResult appropriate for property recording.
+
+  if (current_test_info_ != NULL) {
+    xml_element = "testcase";
+    test_result = &(current_test_info_->result_);
+  } else if (current_test_case_ != NULL) {
+    xml_element = "testsuite";
+    test_result = &(current_test_case_->ad_hoc_test_result_);
+  } else {
+    xml_element = "testsuites";
+    test_result = &ad_hoc_test_result_;
+  }
+  test_result->RecordProperty(xml_element, test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  if (internal_run_death_test_flag_.get() != NULL)
+    listeners()->SuppressEventForwarding();
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const std::string& output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml") {
+    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format == "json") {
+    listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format != "") {
+    GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \""
+                        << output_format << "\" ignored.";
+  }
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Initializes event listeners for streaming test results in string form.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const std::string& target = GTEST_FLAG(stream_result_to);
+  if (!target.empty()) {
+    const size_t pos = target.find(':');
+    if (pos != std::string::npos) {
+      listeners()->Append(new StreamingListener(target.substr(0, pos),
+                                                target.substr(pos+1)));
+    } else {
+      GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
+                          << "\" ignored.";
+    }
+  }
+}
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Performs initialization dependent upon flag values obtained in
+// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+// this function is also called from RunAllTests.  Since this function can be
+// called more than once, it has to be idempotent.
+void UnitTestImpl::PostFlagParsingInit() {
+  // Ensures that this function does not execute more than once.
+  if (!post_flag_parse_init_performed_) {
+    post_flag_parse_init_performed_ = true;
+
+#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+    // Register to send notifications about key process state changes.
+    listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_());
+#endif  // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+
+#if GTEST_HAS_DEATH_TEST
+    InitDeathTestSubprocessControlInfo();
+    SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+    // Registers parameterized tests. This makes parameterized tests
+    // available to the UnitTest reflection API without running
+    // RUN_ALL_TESTS.
+    RegisterParameterizedTests();
+
+    // Configures listeners for XML output. This makes it possible for users
+    // to shut down the default XML output before invoking RUN_ALL_TESTS.
+    ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+    // Configures listeners for streaming test results to the specified server.
+    ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+#if GTEST_HAS_ABSL
+    if (GTEST_FLAG(install_failure_signal_handler)) {
+      absl::FailureSignalHandlerOptions options;
+      absl::InstallFailureSignalHandler(options);
+    }
+#endif  // GTEST_HAS_ABSL
+  }
+}
+
+// A predicate that checks the name of a TestCase against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestCaseNameIs is copyable.
+class TestCaseNameIs {
+ public:
+  // Constructor.
+  explicit TestCaseNameIs(const std::string& name)
+      : name_(name) {}
+
+  // Returns true iff the name of test_case matches name_.
+  bool operator()(const TestCase* test_case) const {
+    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Finds and returns a TestCase with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_case_name: name of the test case
+//   type_param:     the name of the test case's type parameter, or NULL if
+//                   this is not a typed or a type-parameterized test case.
+//   set_up_tc:      pointer to the function that sets up the test case
+//   tear_down_tc:   pointer to the function that tears down the test case
+TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
+                                    const char* type_param,
+                                    Test::SetUpTestCaseFunc set_up_tc,
+                                    Test::TearDownTestCaseFunc tear_down_tc) {
+  // Can we find a TestCase with the given name?
+  const std::vector<TestCase*>::const_reverse_iterator test_case =
+      std::find_if(test_cases_.rbegin(), test_cases_.rend(),
+                   TestCaseNameIs(test_case_name));
+
+  if (test_case != test_cases_.rend())
+    return *test_case;
+
+  // No.  Let's create one.
+  TestCase* const new_test_case =
+      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+
+  // Is this a death test case?
+  if (internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                               kDeathTestCaseFilter)) {
+    // Yes.  Inserts the test case after the last death test case
+    // defined so far.  This only works when the test cases haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_case_;
+    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
+                       new_test_case);
+  } else {
+    // No.  Appends to the end of the list.
+    test_cases_.push_back(new_test_case);
+  }
+
+  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
+  return new_test_case;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+  // True iff Google Test is initialized before RUN_ALL_TESTS() is called.
+  const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
+
+  // Do not run any test if the --help flag was specified.
+  if (g_help_flag)
+    return true;
+
+  // Repeats the call to the post-flag parsing initialization in case the
+  // user didn't call InitGoogleTest.
+  PostFlagParsingInit();
+
+  // Even if sharding is not on, test runners may want to use the
+  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+  // protocol.
+  internal::WriteToShardStatusFileIfNeeded();
+
+  // True iff we are in a subprocess for running a thread-safe-style
+  // death test.
+  bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+  in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
+# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+  if (in_subprocess_for_death_test) {
+    GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
+  }
+# endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif  // GTEST_HAS_DEATH_TEST
+
+  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+                                        in_subprocess_for_death_test);
+
+  // Compares the full test names with the filter to decide which
+  // tests to run.
+  const bool has_tests_to_run = FilterTests(should_shard
+                                              ? HONOR_SHARDING_PROTOCOL
+                                              : IGNORE_SHARDING_PROTOCOL) > 0;
+
+  // Lists the tests and exits if the --gtest_list_tests flag was specified.
+  if (GTEST_FLAG(list_tests)) {
+    // This must be called *after* FilterTests() has been called.
+    ListTestsMatchingFilter();
+    return true;
+  }
+
+  random_seed_ = GTEST_FLAG(shuffle) ?
+      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+
+  // True iff at least one test has failed.
+  bool failed = false;
+
+  TestEventListener* repeater = listeners()->repeater();
+
+  start_timestamp_ = GetTimeInMillis();
+  repeater->OnTestProgramStart(*parent_);
+
+  // How many times to repeat the tests?  We don't want to repeat them
+  // when we are inside the subprocess of a death test.
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+  // Repeats forever if the repeat count is negative.
+  const bool forever = repeat < 0;
+  for (int i = 0; forever || i != repeat; i++) {
+    // We want to preserve failures generated by ad-hoc test
+    // assertions executed before RUN_ALL_TESTS().
+    ClearNonAdHocTestResult();
+
+    const TimeInMillis start = GetTimeInMillis();
+
+    // Shuffles test cases and tests if requested.
+    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+      random()->Reseed(random_seed_);
+      // This should be done before calling OnTestIterationStart(),
+      // such that a test event listener can see the actual test order
+      // in the event.
+      ShuffleTests();
+    }
+
+    // Tells the unit test event listeners that the tests are about to start.
+    repeater->OnTestIterationStart(*parent_, i);
+
+    // Runs each test case if there is at least one test to run.
+    if (has_tests_to_run) {
+      // Sets up all environments beforehand.
+      repeater->OnEnvironmentsSetUpStart(*parent_);
+      ForEach(environments_, SetUpEnvironment);
+      repeater->OnEnvironmentsSetUpEnd(*parent_);
+
+      // Runs the tests only if there was no fatal failure during global
+      // set-up.
+      if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_case_count();
+             test_index++) {
+          GetMutableTestCase(test_index)->Run();
+        }
+      }
+
+      // Tears down all environments in reverse order afterwards.
+      repeater->OnEnvironmentsTearDownStart(*parent_);
+      std::for_each(environments_.rbegin(), environments_.rend(),
+                    TearDownEnvironment);
+      repeater->OnEnvironmentsTearDownEnd(*parent_);
+    }
+
+    elapsed_time_ = GetTimeInMillis() - start;
+
+    // Tells the unit test event listener that the tests have just finished.
+    repeater->OnTestIterationEnd(*parent_, i);
+
+    // Gets the result and clears it.
+    if (!Passed()) {
+      failed = true;
+    }
+
+    // Restores the original test order after the iteration.  This
+    // allows the user to quickly repro a failure that happens in the
+    // N-th iteration without repeating the first (N - 1) iterations.
+    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+    // case the user somehow changes the value of the flag somewhere
+    // (it's always safe to unshuffle the tests).
+    UnshuffleTests();
+
+    if (GTEST_FLAG(shuffle)) {
+      // Picks a new random seed for each iteration.
+      random_seed_ = GetNextRandomSeed(random_seed_);
+    }
+  }
+
+  repeater->OnTestProgramEnd(*parent_);
+
+  if (!gtest_is_initialized_before_run_all_tests) {
+    ColoredPrintf(
+        COLOR_RED,
+        "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
+        "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
+        "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
+        " will start to enforce the valid usage. "
+        "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
+#if GTEST_FOR_GOOGLE_
+    ColoredPrintf(COLOR_RED,
+                  "For more details, see http://wiki/Main/ValidGUnitMain.\n");
+#endif  // GTEST_FOR_GOOGLE_
+  }
+
+  return !failed;
+}
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded() {
+  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+  if (test_shard_file != NULL) {
+    FILE* const file = posix::FOpen(test_shard_file, "w");
+    if (file == NULL) {
+      ColoredPrintf(COLOR_RED,
+                    "Could not write to the test shard status file \"%s\" "
+                    "specified by the %s environment variable.\n",
+                    test_shard_file, kTestShardStatusFile);
+      fflush(stdout);
+      exit(EXIT_FAILURE);
+    }
+    fclose(file);
+  }
+}
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (i.e., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+bool ShouldShard(const char* total_shards_env,
+                 const char* shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) {
+    return false;
+  }
+
+  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+  if (total_shards == -1 && shard_index == -1) {
+    return false;
+  } else if (total_shards == -1 && shard_index != -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestShardIndex << " = " << shard_index
+      << ", but have left " << kTestTotalShards << " unset.\n";
+    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (total_shards != -1 && shard_index == -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestTotalShards << " = " << total_shards
+      << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (shard_index < 0 || shard_index >= total_shards) {
+    const Message msg = Message()
+      << "Invalid environment variables: we require 0 <= "
+      << kTestShardIndex << " < " << kTestTotalShards
+      << ", but you have " << kTestShardIndex << "=" << shard_index
+      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
+  const char* str_val = posix::GetEnv(var);
+  if (str_val == NULL) {
+    return default_val;
+  }
+
+  Int32 result;
+  if (!ParseInt32(Message() << "The value of environment variable " << var,
+                  str_val, &result)) {
+    exit(EXIT_FAILURE);
+  }
+  return result;
+}
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  return (test_id % total_shards) == shard_index;
+}
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestCase and TestInfo object.
+// If shard_tests == true, further filters tests based on sharding
+// variables in the environment - see
+// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
+// . Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
+  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
+
+  // num_runnable_tests are the number of tests that will
+  // run across all shards (i.e., match filter and are not disabled).
+  // num_selected_tests are the number of tests to be run on
+  // this shard.
+  int num_runnable_tests = 0;
+  int num_selected_tests = 0;
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    TestCase* const test_case = test_cases_[i];
+    const std::string &test_case_name = test_case->name();
+    test_case->set_should_run(false);
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      TestInfo* const test_info = test_case->test_info_list()[j];
+      const std::string test_name(test_info->name());
+      // A test is disabled if test case name or test name matches
+      // kDisableTestFilter.
+      const bool is_disabled =
+          internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                                   kDisableTestFilter) ||
+          internal::UnitTestOptions::MatchesFilter(test_name,
+                                                   kDisableTestFilter);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter =
+          internal::UnitTestOptions::FilterMatchesTest(test_case_name,
+                                                       test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      const bool is_runnable =
+          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+          matches_filter;
+
+      const bool is_in_another_shard =
+          shard_tests != IGNORE_SHARDING_PROTOCOL &&
+          !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests);
+      test_info->is_in_another_shard_ = is_in_another_shard;
+      const bool is_selected = is_runnable && !is_in_another_shard;
+
+      num_runnable_tests += is_runnable;
+      num_selected_tests += is_selected;
+
+      test_info->should_run_ = is_selected;
+      test_case->set_should_run(test_case->should_run() || is_selected);
+    }
+  }
+  return num_selected_tests;
+}
+
+// Prints the given C-string on a single line by replacing all '\n'
+// characters with string "\\n".  If the output takes more than
+// max_length characters, only prints the first max_length characters
+// and "...".
+static void PrintOnOneLine(const char* str, int max_length) {
+  if (str != NULL) {
+    for (int i = 0; *str != '\0'; ++str) {
+      if (i >= max_length) {
+        printf("...");
+        break;
+      }
+      if (*str == '\n') {
+        printf("\\n");
+        i += 2;
+      } else {
+        printf("%c", *str);
+        ++i;
+      }
+    }
+  }
+}
+
+// Prints the names of the tests matching the user-specified filter flag.
+void UnitTestImpl::ListTestsMatchingFilter() {
+  // Print at most this many characters for each type/value parameter.
+  const int kMaxParamLength = 250;
+
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    const TestCase* const test_case = test_cases_[i];
+    bool printed_test_case_name = false;
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      const TestInfo* const test_info =
+          test_case->test_info_list()[j];
+      if (test_info->matches_filter_) {
+        if (!printed_test_case_name) {
+          printed_test_case_name = true;
+          printf("%s.", test_case->name());
+          if (test_case->type_param() != NULL) {
+            printf("  # %s = ", kTypeParamLabel);
+            // We print the type parameter on a single line to make
+            // the output easy to parse by a program.
+            PrintOnOneLine(test_case->type_param(), kMaxParamLength);
+          }
+          printf("\n");
+        }
+        printf("  %s", test_info->name());
+        if (test_info->value_param() != NULL) {
+          printf("  # %s = ", kValueParamLabel);
+          // We print the value parameter on a single line to make the
+          // output easy to parse by a program.
+          PrintOnOneLine(test_info->value_param(), kMaxParamLength);
+        }
+        printf("\n");
+      }
+    }
+  }
+  fflush(stdout);
+  const std::string& output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml" || output_format == "json") {
+    FILE* fileout = OpenFileForWriting(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str());
+    std::stringstream stream;
+    if (output_format == "xml") {
+      XmlUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintXmlTestsList(&stream, test_cases_);
+    } else if (output_format == "json") {
+      JsonUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintJsonTestList(&stream, test_cases_);
+    }
+    fprintf(fileout, "%s", StringStreamToString(&stream).c_str());
+    fclose(fileout);
+  }
+}
+
+// Sets the OS stack trace getter.
+//
+// Does nothing if the input and the current OS stack trace getter are
+// the same; otherwise, deletes the old getter and makes the input the
+// current getter.
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ != getter) {
+    delete os_stack_trace_getter_;
+    os_stack_trace_getter_ = getter;
+  }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == NULL) {
+#ifdef GTEST_OS_STACK_TRACE_GETTER_
+    os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
+#else
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+#endif  // GTEST_OS_STACK_TRACE_GETTER_
+  }
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the most specific TestResult currently running.
+TestResult* UnitTestImpl::current_test_result() {
+  if (current_test_info_ != NULL) {
+    return &current_test_info_->result_;
+  }
+  if (current_test_case_ != NULL) {
+    return &current_test_case_->ad_hoc_test_result_;
+  }
+  return &ad_hoc_test_result_;
+}
+
+// Shuffles all test cases, and the tests within each test case,
+// making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Shuffles the death test cases.
+  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
+
+  // Shuffles the non-death test cases.
+  ShuffleRange(random(), last_death_test_case_ + 1,
+               static_cast<int>(test_cases_.size()), &test_case_indices_);
+
+  // Shuffles the tests inside each test case.
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    test_cases_[i]->ShuffleTests(random());
+  }
+}
+
+// Restores the test cases and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    // Unshuffles the tests in each test case.
+    test_cases_[i]->UnshuffleTests();
+    // Resets the index of each test case.
+    test_case_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
+                                            int skip_count) {
+  // We pass skip_count + 1 to skip this wrapper function in addition
+  // to what the user really wants to skip.
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};
+}
+
+bool IsTrue(bool condition) { return condition; }
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // This condition is always false so AlwaysTrue() never actually throws,
+  // but it makes the compiler think that it may throw.
+  if (IsTrue(false))
+    throw ClassUniqueToAlwaysTrue();
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) {
+  const size_t prefix_len = strlen(prefix);
+  if (strncmp(*pstr, prefix, prefix_len) == 0) {
+    *pstr += prefix_len;
+    return true;
+  }
+  return false;
+}
+
+// Parses a string as a command line flag.  The string should have
+// the format "--flag=value".  When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or NULL if the parsing failed.
+static const char* ParseFlagValue(const char* str, const char* flag,
+                                  bool def_optional) {
+  // str and flag must not be NULL.
+  if (str == NULL || flag == NULL) return NULL;
+
+  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
+  const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL;
+
+  // Skips the flag name.
+  const char* flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return NULL;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Converts the string value to a bool.
+  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  return true;
+}
+
+// Parses a string for an Int32 flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(Message() << "The value of flag --" << flag,
+                    value_str, value);
+}
+
+// Parses a string for a string flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+template <typename String>
+static bool ParseStringFlag(const char* str, const char* flag, String* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Sets *value to the value of the flag.
+  *value = value_str;
+  return true;
+}
+
+// Determines whether a string has a prefix that Google Test uses for its
+// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// If Google Test detects that a command line flag has its prefix but is not
+// recognized, it will print its help message. Flags starting with
+// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
+// internal flags and do not trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char* str) {
+  return (SkipPrefix("--", &str) ||
+          SkipPrefix("-", &str) ||
+          SkipPrefix("/", &str)) &&
+         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
+         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
+}
+
+// Prints a string containing code-encoded text.  The following escape
+// sequences can be used in the string to control the text color:
+//
+//   @@    prints a single '@' character.
+//   @R    changes the color to red.
+//   @G    changes the color to green.
+//   @Y    changes the color to yellow.
+//   @D    changes to the default terminal text color.
+//
+// FIXME: Write tests for this once we add stdout
+// capturing to Google Test.
+static void PrintColorEncoded(const char* str) {
+  GTestColor color = COLOR_DEFAULT;  // The current color.
+
+  // Conceptually, we split the string into segments divided by escape
+  // sequences.  Then we print one segment at a time.  At the end of
+  // each iteration, the str pointer advances to the beginning of the
+  // next segment.
+  for (;;) {
+    const char* p = strchr(str, '@');
+    if (p == NULL) {
+      ColoredPrintf(color, "%s", str);
+      return;
+    }
+
+    ColoredPrintf(color, "%s", std::string(str, p).c_str());
+
+    const char ch = p[1];
+    str = p + 2;
+    if (ch == '@') {
+      ColoredPrintf(color, "@");
+    } else if (ch == 'D') {
+      color = COLOR_DEFAULT;
+    } else if (ch == 'R') {
+      color = COLOR_RED;
+    } else if (ch == 'G') {
+      color = COLOR_GREEN;
+    } else if (ch == 'Y') {
+      color = COLOR_YELLOW;
+    } else {
+      --str;
+    }
+  }
+}
+
+static const char kColorEncodedHelpMessage[] =
+"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
+"following command line flags to control its behavior:\n"
+"\n"
+"Test Selection:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
+"      List the names of all tests instead of running them. The name of\n"
+"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
+    "[@G-@YNEGATIVE_PATTERNS]@D\n"
+"      Run only the tests whose name matches one of the positive patterns but\n"
+"      none of the negative patterns. '?' matches any single character; '*'\n"
+"      matches any substring; ':' separates two patterns.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
+"      Run all disabled tests too.\n"
+"\n"
+"Test Execution:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
+"      Run the tests repeatedly; use a negative count to repeat forever.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
+"      Randomize tests' orders on every iteration.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
+"      Random number seed to use for shuffling test orders (between 1 and\n"
+"      99999, or 0 to use a seed based on the current time).\n"
+"\n"
+"Test Output:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+"      Enable/disable colored output. The default is @Gauto@D.\n"
+"  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
+"      Don't print the elapsed time of each test.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G"
+    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
+"      Generate a JSON or XML report in the given directory or with the given\n"
+"      file name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+# if GTEST_CAN_STREAM_RESULTS_
+"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
+"      Stream test results to the given server.\n"
+# endif  // GTEST_CAN_STREAM_RESULTS_
+"\n"
+"Assertion Behavior:\n"
+# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+"      Set the default death test style.\n"
+# endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
+"      Turn assertion failures into debugger break-points.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
+"      Turn assertion failures into C++ exceptions for use by an external\n"
+"      test framework.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
+"      Do not report exceptions as test failures. Instead, allow them\n"
+"      to crash the program or throw a pop-up (on Windows).\n"
+"\n"
+"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+"environment variable of a flag (all letters in upper-case). For example, to\n"
+"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
+"\n"
+"For more information, please read the " GTEST_NAME_ " documentation at\n"
+"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+"(not one in your own code or tests), please report it to\n"
+"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+static bool ParseGoogleTestFlag(const char* const arg) {
+  return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+                       &GTEST_FLAG(also_run_disabled_tests)) ||
+      ParseBoolFlag(arg, kBreakOnFailureFlag,
+                    &GTEST_FLAG(break_on_failure)) ||
+      ParseBoolFlag(arg, kCatchExceptionsFlag,
+                    &GTEST_FLAG(catch_exceptions)) ||
+      ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+      ParseStringFlag(arg, kDeathTestStyleFlag,
+                      &GTEST_FLAG(death_test_style)) ||
+      ParseBoolFlag(arg, kDeathTestUseFork,
+                    &GTEST_FLAG(death_test_use_fork)) ||
+      ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+      ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                      &GTEST_FLAG(internal_run_death_test)) ||
+      ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+      ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+      ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+      ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
+      ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+      ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+      ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+      ParseInt32Flag(arg, kStackTraceDepthFlag,
+                     &GTEST_FLAG(stack_trace_depth)) ||
+      ParseStringFlag(arg, kStreamResultToFlag,
+                      &GTEST_FLAG(stream_result_to)) ||
+      ParseBoolFlag(arg, kThrowOnFailureFlag,
+                    &GTEST_FLAG(throw_on_failure));
+}
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+static void LoadFlagsFromFile(const std::string& path) {
+  FILE* flagfile = posix::FOpen(path.c_str(), "r");
+  if (!flagfile) {
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
+                      << "\"";
+  }
+  std::string contents(ReadEntireFile(flagfile));
+  posix::FClose(flagfile);
+  std::vector<std::string> lines;
+  SplitString(contents, '\n', &lines);
+  for (size_t i = 0; i < lines.size(); ++i) {
+    if (lines[i].empty())
+      continue;
+    if (!ParseGoogleTestFlag(lines[i].c_str()))
+      g_help_flag = true;
+  }
+}
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  for (int i = 1; i < *argc; i++) {
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char* const arg = arg_string.c_str();
+
+    using internal::ParseBoolFlag;
+    using internal::ParseInt32Flag;
+    using internal::ParseStringFlag;
+
+    bool remove_flag = false;
+    if (ParseGoogleTestFlag(arg)) {
+      remove_flag = true;
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
+      LoadFlagsFromFile(GTEST_FLAG(flagfile));
+      remove_flag = true;
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (arg_string == "--help" || arg_string == "-h" ||
+               arg_string == "-?" || arg_string == "/?" ||
+               HasGoogleTestFlagPrefix(arg)) {
+      // Both help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger help display.
+      g_help_flag = true;
+    }
+
+    if (remove_flag) {
+      // Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+
+  // Fix the value of *_NSGetArgc() on macOS, but iff
+  // *_NSGetArgv() == argv
+  // Only applicable to char** version of argv
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+  if (*_NSGetArgv() == argv) {
+    *_NSGetArgc() = *argc;
+  }
+#endif
+#endif
+}
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+  // We don't want to run the initialization code twice.
+  if (GTestIsInitialized()) return;
+
+  if (*argc <= 0) return;
+
+  g_argvs.clear();
+  for (int i = 0; i != *argc; i++) {
+    g_argvs.push_back(StreamableToString(argv[i]));
+  }
+
+#if GTEST_HAS_ABSL
+  absl::InitializeSymbolizer(g_argvs[0].c_str());
+#endif  // GTEST_HAS_ABSL
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int* argc, char** argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+void InitGoogleTest(int* argc, wchar_t** argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+std::string TempDir() {
+#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+  return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  return "\\temp\\";
+#elif GTEST_OS_WINDOWS
+  const char* temp_dir = internal::posix::GetEnv("TEMP");
+  if (temp_dir == NULL || temp_dir[0] == '\0')
+    return "\\temp\\";
+  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
+    return temp_dir;
+  else
+    return std::string(temp_dir) + "\\";
+#elif GTEST_OS_LINUX_ANDROID
+  return "/sdcard/";
+#else
+  return "/tmp/";
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
+  internal::TraceInfo trace;
+  trace.file = file;
+  trace.line = line;
+  trace.message.swap(message);
+
+  UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
+// Pops the info pushed by the c'tor.
+ScopedTrace::~ScopedTrace()
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest::GetInstance()->PopGTestTrace();
+}
+
+}  // namespace testing
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// This file implements death tests.
+
+
+#if GTEST_HAS_DEATH_TEST
+
+# if GTEST_OS_MAC
+#  include <crt_externs.h>
+# endif  // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+
+# if GTEST_OS_LINUX
+#  include <signal.h>
+# endif  // GTEST_OS_LINUX
+
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+#  include <windows.h>
+# else
+#  include <sys/mman.h>
+#  include <sys/wait.h>
+# endif  // GTEST_OS_WINDOWS
+
+# if GTEST_OS_QNX
+#  include <spawn.h>
+# endif  // GTEST_OS_QNX
+
+# if GTEST_OS_FUCHSIA
+#  include <lib/fdio/io.h>
+#  include <lib/fdio/spawn.h>
+#  include <zircon/processargs.h>
+#  include <zircon/syscalls.h>
+#  include <zircon/syscalls/port.h>
+# endif  // GTEST_OS_FUCHSIA
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+//
+// This is defined in internal/gtest-port.h as "fast", but can be overridden by
+// a definition in internal/custom/gtest-port.h. The recommended value, which is
+// used internally at Google, is "threadsafe".
+static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
+
+GTEST_DEFINE_string_(
+    death_test_style,
+    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+namespace internal {
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "the '|' characters.  This flag is specified if and only if the current "
+    "process is a sub-process launched for running a thread-safe "
+    "death test.  FOR INTERNAL USE ONLY.");
+}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+static bool g_in_fast_death_test_child = false;
+# endif
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+bool InDeathTestChild() {
+# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  // On Windows and Fuchsia, death tests are thread-safe regardless of the value
+  // of the death_test_style flag.
+  return !GTEST_FLAG(internal_run_death_test).empty();
+
+# else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe")
+    return !GTEST_FLAG(internal_run_death_test).empty();
+  else
+    return g_in_fast_death_test_child;
+#endif
+}
+
+}  // namespace internal
+
+// ExitedWithCode constructor.
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
+}
+
+// ExitedWithCode function-call operator.
+bool ExitedWithCode::operator()(int exit_status) const {
+# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  return exit_status == exit_code_;
+
+# else
+
+  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
+
+# endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+}
+
+# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+// KilledBySignal constructor.
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
+}
+
+// KilledBySignal function-call operator.
+bool KilledBySignal::operator()(int exit_status) const {
+#  if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+  {
+    bool result;
+    if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
+      return result;
+    }
+  }
+#  endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+  return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
+}
+# endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+
+namespace internal {
+
+// Utilities needed for death tests.
+
+// Generates a textual description of a given exit code, in the format
+// specified by wait(2).
+static std::string ExitSummary(int exit_code) {
+  Message m;
+
+# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  m << "Exited with exit status " << exit_code;
+
+# else
+
+  if (WIFEXITED(exit_code)) {
+    m << "Exited with exit status " << WEXITSTATUS(exit_code);
+  } else if (WIFSIGNALED(exit_code)) {
+    m << "Terminated by signal " << WTERMSIG(exit_code);
+  }
+#  ifdef WCOREDUMP
+  if (WCOREDUMP(exit_code)) {
+    m << " (core dumped)";
+  }
+#  endif
+# endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  return m.GetString();
+}
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+bool ExitedUnsuccessfully(int exit_status) {
+  return !ExitedWithCode(0)(exit_status);
+}
+
+# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+// Generates a textual failure message when a death test finds more than
+// one thread running, or cannot determine the number of threads, prior
+// to executing the given statement.  It is the responsibility of the
+// caller not to pass a thread_count of 1.
+static std::string DeathTestThreadWarning(size_t thread_count) {
+  Message msg;
+  msg << "Death tests use fork(), which is unsafe particularly"
+      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
+  if (thread_count == 0) {
+    msg << "couldn't detect the number of threads.";
+  } else {
+    msg << "detected " << thread_count << " threads.";
+  }
+  msg << " See "
+         "https://github.com/google/googletest/blob/master/googletest/docs/"
+         "advanced.md#death-tests-and-threads"
+      << " for more explanation and suggested solutions, especially if"
+      << " this is the last message you see before your test times out.";
+  return msg.GetString();
+}
+# endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+
+// Flag characters for reporting a death test that did not die.
+static const char kDeathTestLived = 'L';
+static const char kDeathTestReturned = 'R';
+static const char kDeathTestThrew = 'T';
+static const char kDeathTestInternalError = 'I';
+
+#if GTEST_OS_FUCHSIA
+
+// File descriptor used for the pipe in the child process.
+static const int kFuchsiaReadPipeFd = 3;
+
+#endif
+
+// An enumeration describing all of the possible ways that a death test can
+// conclude.  DIED means that the process died while executing the test
+// code; LIVED means that process lived beyond the end of the test code;
+// RETURNED means that the test statement attempted to execute a return
+// statement, which is not allowed; THREW means that the test statement
+// returned control by throwing an exception.  IN_PROGRESS means the test
+// has not yet concluded.
+// FIXME: Unify names and possibly values for
+// AbortReason, DeathTestOutcome, and flag characters above.
+enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
+
+// Routine for aborting the program which is safe to call from an
+// exec-style death test child process, in which case the error
+// message is propagated back to the parent process.  Otherwise, the
+// message is simply printed to stderr.  In either case, the program
+// then exits with status 1.
+static void DeathTestAbort(const std::string& message) {
+  // On a POSIX system, this function may be called from a threadsafe-style
+  // death test child process, which operates on a very small stack.  Use
+  // the heap for any additional non-minuscule memory requirements.
+  const InternalRunDeathTestFlag* const flag =
+      GetUnitTestImpl()->internal_run_death_test_flag();
+  if (flag != NULL) {
+    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
+    fputc(kDeathTestInternalError, parent);
+    fprintf(parent, "%s", message.c_str());
+    fflush(parent);
+    _exit(1);
+  } else {
+    fprintf(stderr, "%s", message.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// A replacement for CHECK that calls DeathTestAbort if the assertion
+// fails.
+# define GTEST_DEATH_TEST_CHECK_(expression) \
+  do { \
+    if (!::testing::internal::IsTrue(expression)) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
+
+// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
+// evaluating any system call that fulfills two conditions: it must return
+// -1 on failure, and set errno to EINTR when it is interrupted and
+// should be tried again.  The macro expands to a loop that repeatedly
+// evaluates the expression as long as it evaluates to -1 and sets
+// errno to EINTR.  If the expression evaluates to -1 but errno is
+// something other than EINTR, DeathTestAbort is called.
+# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+  do { \
+    int gtest_retval; \
+    do { \
+      gtest_retval = (expression); \
+    } while (gtest_retval == -1 && errno == EINTR); \
+    if (gtest_retval == -1) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression + " != -1"); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
+
+// Returns the message describing the last system error in errno.
+std::string GetLastErrnoDescription() {
+    return errno == 0 ? "" : posix::StrError(errno);
+}
+
+// This is called from a death test parent process to read a failure
+// message from the death test child process and log it with the FATAL
+// severity. On Windows, the message is read from a pipe handle. On other
+// platforms, it is read from a file descriptor.
+static void FailFromInternalError(int fd) {
+  Message error;
+  char buffer[256];
+  int num_read;
+
+  do {
+    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
+      buffer[num_read] = '\0';
+      error << buffer;
+    }
+  } while (num_read == -1 && errno == EINTR);
+
+  if (num_read == 0) {
+    GTEST_LOG_(FATAL) << error.GetString();
+  } else {
+    const int last_error = errno;
+    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+                      << GetLastErrnoDescription() << " [" << last_error << "]";
+  }
+}
+
+// Death test constructor.  Increments the running death test count
+// for the current test.
+DeathTest::DeathTest() {
+  TestInfo* const info = GetUnitTestImpl()->current_test_info();
+  if (info == NULL) {
+    DeathTestAbort("Cannot run a death test outside of a TEST or "
+                   "TEST_F construct");
+  }
+}
+
+// Creates and returns a death test by dispatching to the current
+// death test factory.
+bool DeathTest::Create(const char* statement, const RE* regex,
+                       const char* file, int line, DeathTest** test) {
+  return GetUnitTestImpl()->death_test_factory()->Create(
+      statement, regex, file, line, test);
+}
+
+const char* DeathTest::LastMessage() {
+  return last_death_test_message_.c_str();
+}
+
+void DeathTest::set_last_death_test_message(const std::string& message) {
+  last_death_test_message_ = message;
+}
+
+std::string DeathTest::last_death_test_message_;
+
+// Provides cross platform implementation for some death functionality.
+class DeathTestImpl : public DeathTest {
+ protected:
+  DeathTestImpl(const char* a_statement, const RE* a_regex)
+      : statement_(a_statement),
+        regex_(a_regex),
+        spawned_(false),
+        status_(-1),
+        outcome_(IN_PROGRESS),
+        read_fd_(-1),
+        write_fd_(-1) {}
+
+  // read_fd_ is expected to be closed and cleared by a derived class.
+  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+
+  void Abort(AbortReason reason);
+  virtual bool Passed(bool status_ok);
+
+  const char* statement() const { return statement_; }
+  const RE* regex() const { return regex_; }
+  bool spawned() const { return spawned_; }
+  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
+  int status() const { return status_; }
+  void set_status(int a_status) { status_ = a_status; }
+  DeathTestOutcome outcome() const { return outcome_; }
+  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
+  int read_fd() const { return read_fd_; }
+  void set_read_fd(int fd) { read_fd_ = fd; }
+  int write_fd() const { return write_fd_; }
+  void set_write_fd(int fd) { write_fd_ = fd; }
+
+  // Called in the parent process only. Reads the result code of the death
+  // test child process via a pipe, interprets it to set the outcome_
+  // member, and closes read_fd_.  Outputs diagnostics and terminates in
+  // case of unexpected codes.
+  void ReadAndInterpretStatusByte();
+
+ private:
+  // The textual content of the code this object is testing.  This class
+  // doesn't own this string and should not attempt to delete it.
+  const char* const statement_;
+  // The regular expression which test output must match.  DeathTestImpl
+  // doesn't own this object and should not attempt to delete it.
+  const RE* const regex_;
+  // True if the death test child process has been successfully spawned.
+  bool spawned_;
+  // The exit status of the child process.
+  int status_;
+  // How the death test concluded.
+  DeathTestOutcome outcome_;
+  // Descriptor to the read end of the pipe to the child process.  It is
+  // always -1 in the child process.  The child keeps its write end of the
+  // pipe in write_fd_.
+  int read_fd_;
+  // Descriptor to the child's write end of the pipe to the parent process.
+  // It is always -1 in the parent process.  The parent keeps its end of the
+  // pipe in read_fd_.
+  int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_.  Outputs diagnostics and terminates in
+// case of unexpected codes.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);
+
+  if (bytes_read == 0) {
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {
+    switch (flag) {
+      case kDeathTestReturned:
+        set_outcome(RETURNED);
+        break;
+      case kDeathTestThrew:
+        set_outcome(THREW);
+        break;
+      case kDeathTestLived:
+        set_outcome(LIVED);
+        break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(flag) << ")";
+    }
+  } else {
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);
+}
+
+// Signals that the death test code which should have exited, didn't.
+// Should be called only in a death test child process.
+// Writes a status byte to the child's status file descriptor, then
+// calls _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+  // The parent process considers the death test to be a failure if
+  // it finds any data in our pipe.  So, here we write a single flag byte
+  // to the pipe, then exit.
+  const char status_ch =
+      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
+      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // We are leaking the descriptor here because on some platforms (i.e.,
+  // when built as Windows DLL), destructors of global objects will still
+  // run after calling _exit(). On such systems, write_fd_ will be
+  // indirectly closed from the destructor of UnitTestImpl, causing double
+  // close if it is also closed here. On debug configurations, double close
+  // may assert. As there are no in-process buffers to flush here, we are
+  // relying on the OS to close the descriptor after the process terminates
+  // when the destructors are not run.
+  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Returns an indented copy of stderr output for a death test.
+// This makes distinguishing death test output lines from regular log lines
+// much easier.
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+  ::std::string ret;
+  for (size_t at = 0; ; ) {
+    const size_t line_end = output.find('\n', at);
+    ret += "[  DEATH   ] ";
+    if (line_end == ::std::string::npos) {
+      ret += output.substr(at);
+      break;
+    }
+    ret += output.substr(at, line_end + 1 - at);
+    at = line_end + 1;
+  }
+  return ret;
+}
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+//   outcome:  An enumeration describing how the death test
+//             concluded: DIED, LIVED, THREW, or RETURNED.  The death test
+//             fails in the latter three cases.
+//   status:   The exit status of the child process. On *nix, it is in the
+//             in the format specified by wait(2). On Windows, this is the
+//             value supplied to the ExitProcess() API or a numeric code
+//             of the exception that terminated the program.
+//   regex:    A regular expression object to be applied to
+//             the test's captured standard error output; the death test
+//             fails if it does not match.
+//
+// Argument:
+//   status_ok: true if exit_status is acceptable in the context of
+//              this particular death test, which fails if it is false
+//
+// Returns true iff all of the above conditions are met.  Otherwise, the
+// first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string.
+bool DeathTestImpl::Passed(bool status_ok) {
+  if (!spawned())
+    return false;
+
+  const std::string error_message = GetCapturedStderr();
+
+  bool success = false;
+  Message buffer;
+
+  buffer << "Death test: " << statement() << "\n";
+  switch (outcome()) {
+    case LIVED:
+      buffer << "    Result: failed to die.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case THREW:
+      buffer << "    Result: threw an exception.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case RETURNED:
+      buffer << "    Result: illegal return in test statement.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case DIED:
+      if (status_ok) {
+# if GTEST_USES_PCRE
+        // PCRE regexes support embedded NULs.
+        const bool matched = RE::PartialMatch(error_message, *regex());
+# else
+        const bool matched = RE::PartialMatch(error_message.c_str(), *regex());
+# endif  // GTEST_USES_PCRE
+        if (matched) {
+          success = true;
+        } else {
+          buffer << "    Result: died but not with expected error.\n"
+                 << "  Expected: " << regex()->pattern() << "\n"
+                 << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+        }
+      } else {
+        buffer << "    Result: died but not with expected exit code:\n"
+               << "            " << ExitSummary(status()) << "\n"
+               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+      }
+      break;
+    case IN_PROGRESS:
+    default:
+      GTEST_LOG_(FATAL)
+          << "DeathTest::Passed somehow called before conclusion of test";
+  }
+
+  DeathTest::set_last_death_test_message(buffer.GetString());
+  return success;
+}
+
+# if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes:  Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+//    ends of it.
+// 2. The parent starts the child and provides it with the information
+//    necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+//    using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+//    this is done before step 3, the object's reference count goes down to
+//    0 and it is destroyed, preventing the child from acquiring it. The
+//    parent now has to release it, or read operations on the read end of
+//    the pipe will not return when the child terminates.
+// 5. The parent reads child's output through the pipe (outcome code and
+//    any possible error messages) from the pipe, and its stderr and then
+//    determines whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+  WindowsDeathTest(const char* a_statement,
+                   const RE* a_regex,
+                   const char* file,
+                   int line)
+      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // Handle to the write end of the pipe to the child process.
+  AutoHandle write_handle_;
+  // Child process handle.
+  AutoHandle child_handle_;
+  // Event the child process uses to signal the parent that it has
+  // acquired the handle to the write end of the pipe. After seeing this
+  // event the parent can release its own handles to make sure its
+  // ReadFile() calls return when the child terminates.
+  AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int WindowsDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  // Wait until the child either signals that it has acquired the write end
+  // of the pipe or it dies.
+  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
+  switch (::WaitForMultipleObjects(2,
+                                   wait_handles,
+                                   FALSE,  // Waits for any of the handles.
+                                   INFINITE)) {
+    case WAIT_OBJECT_0:
+    case WAIT_OBJECT_0 + 1:
+      break;
+    default:
+      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+  }
+
+  // The child has acquired the write end of the pipe or exited.
+  // We release the handle on our side and continue.
+  write_handle_.Reset();
+  event_handle_.Reset();
+
+  ReadAndInterpretStatusByte();
+
+  // Waits for the child process to exit if it haven't already. This
+  // returns immediately if the child has already exited, regardless of
+  // whether previous calls to WaitForMultipleObjects synchronized on this
+  // handle or not.
+  GTEST_DEATH_TEST_CHECK_(
+      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
+                                             INFINITE));
+  DWORD status_code;
+  GTEST_DEATH_TEST_CHECK_(
+      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+  child_handle_.Reset();
+  set_status(static_cast<int>(status_code));
+  return status();
+}
+
+// The AssumeRole process for a Windows death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  // WindowsDeathTest uses an anonymous pipe to communicate results of
+  // a death test.
+  SECURITY_ATTRIBUTES handles_are_inheritable = {
+    sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
+  HANDLE read_handle, write_handle;
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
+                   0)  // Default buffer size.
+      != FALSE);
+  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
+                                O_RDONLY));
+  write_handle_.Reset(write_handle);
+  event_handle_.Reset(::CreateEvent(
+      &handles_are_inheritable,
+      TRUE,    // The event will automatically reset to non-signaled state.
+      FALSE,   // The initial state is non-signalled.
+      NULL));  // The even is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" +
+      info->test_case_name() + "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
+      "=" + file_ + "|" + StreamableToString(line_) + "|" +
+      StreamableToString(death_test_index) + "|" +
+      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+      // size_t has the same width as pointers on both 32-bit and 64-bit
+      // Windows platforms.
+      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
+      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+  char executable_path[_MAX_PATH + 1];  // NOLINT
+  GTEST_DEATH_TEST_CHECK_(
+      _MAX_PATH + 1 != ::GetModuleFileNameA(NULL,
+                                            executable_path,
+                                            _MAX_PATH));
+
+  std::string command_line =
+      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
+      internal_flag + "\"";
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // The child process will share the standard handles with the parent.
+  STARTUPINFOA startup_info;
+  memset(&startup_info, 0, sizeof(STARTUPINFO));
+  startup_info.dwFlags = STARTF_USESTDHANDLES;
+  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+  PROCESS_INFORMATION process_info;
+  GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
+      executable_path,
+      const_cast<char*>(command_line.c_str()),
+      NULL,   // Retuned process handle is not inheritable.
+      NULL,   // Retuned thread handle is not inheritable.
+      TRUE,   // Child inherits all inheritable handles (for write_handle_).
+      0x0,    // Default creation flags.
+      NULL,   // Inherit the parent's environment.
+      UnitTest::GetInstance()->original_working_dir(),
+      &startup_info,
+      &process_info) != FALSE);
+  child_handle_.Reset(process_info.hProcess);
+  ::CloseHandle(process_info.hThread);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+# elif GTEST_OS_FUCHSIA
+
+class FuchsiaDeathTest : public DeathTestImpl {
+ public:
+  FuchsiaDeathTest(const char* a_statement,
+                   const RE* a_regex,
+                   const char* file,
+                   int line)
+      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
+  virtual ~FuchsiaDeathTest() {
+    zx_status_t status = zx_handle_close(child_process_);
+    GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+    status = zx_handle_close(port_);
+    GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+  }
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+
+  zx_handle_t child_process_ = ZX_HANDLE_INVALID;
+  zx_handle_t port_ = ZX_HANDLE_INVALID;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+  Arguments() {
+    args_.push_back(NULL);
+  }
+
+  ~Arguments() {
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char* argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str>& arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end();
+         ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char* const* Argv() {
+    return &args_[0];
+  }
+
+  int size() {
+    return args_.size() - 1;
+  }
+
+ private:
+  std::vector<char*> args_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int FuchsiaDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  // Register to wait for the child process to terminate.
+  zx_status_t status_zx;
+  status_zx = zx_object_wait_async(child_process_,
+                                   port_,
+                                   0 /* key */,
+                                   ZX_PROCESS_TERMINATED,
+                                   ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Wait for it to terminate, or an exception to be received.
+  zx_port_packet_t packet;
+  status_zx = zx_port_wait(port_, ZX_TIME_INFINITE, &packet);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  if (ZX_PKT_IS_EXCEPTION(packet.type)) {
+    // Process encountered an exception. Kill it directly rather than letting
+    // other handlers process the event.
+    status_zx = zx_task_kill(child_process_);
+    GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+    // Now wait for |child_process_| to terminate.
+    zx_signals_t signals = 0;
+    status_zx = zx_object_wait_one(
+        child_process_, ZX_PROCESS_TERMINATED, ZX_TIME_INFINITE, &signals);
+    GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+    GTEST_DEATH_TEST_CHECK_(signals & ZX_PROCESS_TERMINATED);
+  } else {
+    // Process terminated.
+    GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type));
+    GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED);
+  }
+
+  ReadAndInterpretStatusByte();
+
+  zx_info_process_t buffer;
+  status_zx = zx_object_get_info(
+      child_process_,
+      ZX_INFO_PROCESS,
+      &buffer,
+      sizeof(buffer),
+      nullptr,
+      nullptr);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  GTEST_DEATH_TEST_CHECK_(buffer.exited);
+  set_status(buffer.return_code);
+  return status();
+}
+
+// The AssumeRole process for a Fuchsia death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(kFuchsiaReadPipeFd);
+    return EXECUTE_TEST;
+  }
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // Build the child process command line.
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
+      + info->test_case_name() + "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
+      + file_ + "|"
+      + StreamableToString(line_) + "|"
+      + StreamableToString(death_test_index);
+  Arguments args;
+  args.AddArguments(GetInjectableArgvs());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  // Build the pipe for communication with the child.
+  zx_status_t status;
+  zx_handle_t child_pipe_handle;
+  uint32_t type;
+  status = fdio_pipe_half(&child_pipe_handle, &type);
+  GTEST_DEATH_TEST_CHECK_(status >= 0);
+  set_read_fd(status);
+
+  // Set the pipe handle for the child.
+  fdio_spawn_action_t add_handle_action = {};
+  add_handle_action.action = FDIO_SPAWN_ACTION_ADD_HANDLE;
+  add_handle_action.h.id = PA_HND(type, kFuchsiaReadPipeFd);
+  add_handle_action.h.handle = child_pipe_handle;
+
+  // Spawn the child process.
+  status = fdio_spawn_etc(ZX_HANDLE_INVALID, FDIO_SPAWN_CLONE_ALL,
+                          args.Argv()[0], args.Argv(), nullptr, 1,
+                          &add_handle_action, &child_process_, nullptr);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  // Create an exception port and attach it to the |child_process_|, to allow
+  // us to suppress the system default exception handler from firing.
+  status = zx_port_create(0, &port_);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+  status = zx_task_bind_exception_port(
+      child_process_, port_, 0 /* key */, 0 /*options */);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+#else  // We are neither on Windows, nor on Fuchsia.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface.  Only the AssumeRole method is
+// left undefined.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+  ForkingDeathTest(const char* statement, const RE* regex);
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+
+ protected:
+  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+  // PID of child process during death test; 0 in the child process itself.
+  pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest.
+ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
+    : DeathTestImpl(a_statement, a_regex),
+      child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int ForkingDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  ReadAndInterpretStatusByte();
+
+  int status_value;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+  set_status(status_value);
+  return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process.
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+  NoExecDeathTest(const char* a_statement, const RE* a_regex) :
+      ForkingDeathTest(a_statement, a_regex) { }
+  virtual TestRole AssumeRole();
+};
+
+// The AssumeRole process for a fork-and-run death test.  It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+  const size_t thread_count = GetThreadCount();
+  if (thread_count != 1) {
+    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+  DeathTest::set_last_death_test_message("");
+  CaptureStderr();
+  // When we fork the process below, the log file buffers are copied, but the
+  // file descriptors are shared.  We flush all log files here so that closing
+  // the file descriptors in the child process doesn't throw off the
+  // synchronization between descriptors and buffers in the parent process.
+  // This is as close to the fork as possible to avoid a race condition in case
+  // there are multiple threads running before the death test, and another
+  // thread writes to the log file.
+  FlushInfoLog();
+
+  const pid_t child_pid = fork();
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  set_child_pid(child_pid);
+  if (child_pid == 0) {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+    set_write_fd(pipe_fd[1]);
+    // Redirects all logging to stderr in the child process to prevent
+    // concurrent writes to the log files.  We capture stderr in the parent
+    // process and append the child process' output to a log.
+    LogToStderr();
+    // Event forwarding to the listeners of event listener API mush be shut
+    // down in death test subprocesses.
+    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+    g_in_fast_death_test_child = true;
+    return EXECUTE_TEST;
+  } else {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+    set_read_fd(pipe_fd[0]);
+    set_spawned(true);
+    return OVERSEE_TEST;
+  }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+  ExecDeathTest(const char* a_statement, const RE* a_regex,
+                const char* file, int line) :
+      ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
+  virtual TestRole AssumeRole();
+ private:
+  static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
+    ::std::vector<std::string> args = GetInjectableArgvs();
+#  if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+    ::std::vector<std::string> extra_args =
+        GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
+    args.insert(args.end(), extra_args.begin(), extra_args.end());
+#  endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+    return args;
+  }
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+  Arguments() {
+    args_.push_back(NULL);
+  }
+
+  ~Arguments() {
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char* argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str>& arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end();
+         ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char* const* Argv() {
+    return &args_[0];
+  }
+
+ private:
+  std::vector<char*> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process.
+struct ExecDeathTestArgs {
+  char* const* argv;  // Command-line arguments for the child's call to exec
+  int close_fd;       // File descriptor to close; the read end of a pipe
+};
+
+#  if GTEST_OS_MAC
+inline char** GetEnviron() {
+  // When Google Test is built as a framework on MacOS X, the environ variable
+  // is unavailable. Apple's documentation (man environ) recommends using
+  // _NSGetEnviron() instead.
+  return *_NSGetEnviron();
+}
+#  else
+// Some POSIX platforms expect you to declare environ. extern "C" makes
+// it reside in the global namespace.
+extern "C" char** environ;
+inline char** GetEnviron() { return environ; }
+#  endif  // GTEST_OS_MAC
+
+#  if !GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+static int ExecDeathTestChildMain(void* child_arg) {
+  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  // We can safely call execve() as it's a direct system call.  We
+  // cannot use execvp() as it's a libc function and thus potentially
+  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // invoke the test program via a valid path that contains at least
+  // one path separator.
+  execve(args->argv[0], args->argv, GetEnviron());
+  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " +
+                 GetLastErrnoDescription());
+  return EXIT_FAILURE;
+}
+#  endif  // !GTEST_OS_QNX
+
+#  if GTEST_HAS_CLONE
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give
+// correct answer.
+static void StackLowerThanAddress(const void* ptr,
+                                  bool* result) GTEST_NO_INLINE_;
+static void StackLowerThanAddress(const void* ptr, bool* result) {
+  int dummy;
+  *result = (&dummy < ptr);
+}
+
+// Make sure AddressSanitizer does not tamper with the stack here.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+static bool StackGrowsDown() {
+  int dummy;
+  bool result;
+  StackLowerThanAddress(&dummy, &result);
+  return result;
+}
+#  endif  // GTEST_HAS_CLONE
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test.  The
+// implementation uses fork(2) + exec.  On systems where clone(2) is
+// available, it is used instead, being slightly more thread-safe.  On QNX,
+// fork supports only single-threaded environments, so this function uses
+// spawn(2) there instead.  The function dies with an error message if
+// anything goes wrong.
+static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
+  ExecDeathTestArgs args = { argv, close_fd };
+  pid_t child_pid = -1;
+
+#  if GTEST_OS_QNX
+  // Obtains the current directory and sets it to be closed in the child
+  // process.
+  const int cwd_fd = open(".", O_RDONLY);
+  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  int fd_flags;
+  // Set close_fd to be closed after spawn.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
+                                        fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = {0};
+  // spawn is a system call.
+  child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron());
+  // Restores the current working directory.
+  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+#  else   // GTEST_OS_QNX
+#   if GTEST_OS_LINUX
+  // When a SIGPROF signal is received while fork() or clone() are executing,
+  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+  // it after the call to fork()/clone() is complete.
+  struct sigaction saved_sigprof_action;
+  struct sigaction ignore_sigprof_action;
+  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+  sigemptyset(&ignore_sigprof_action.sa_mask);
+  ignore_sigprof_action.sa_handler = SIG_IGN;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
+      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#   endif  // GTEST_OS_LINUX
+
+#   if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG(death_test_use_fork);
+
+  if (!use_fork) {
+    static const bool stack_grows_down = StackGrowsDown();
+    const size_t stack_size = getpagesize();
+    // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_PRIVATE, -1, 0);
+    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+    // Maximum stack alignment in bytes:  For a downward-growing stack, this
+    // amount is subtracted from size of the stack space to get an address
+    // that is within the stack space and is aligned on all systems we care
+    // about.  As far as I know there is no ABI with stack alignment greater
+    // than 64.  We assume stack and stack_size already have alignment of
+    // kMaxStackAlignment.
+    const size_t kMaxStackAlignment = 64;
+    void* const stack_top =
+        static_cast<char*>(stack) +
+            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment &&
+        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+  }
+#   else
+  const bool use_fork = true;
+#   endif  // GTEST_HAS_CLONE
+
+  if (use_fork && (child_pid = fork()) == 0) {
+      ExecDeathTestChildMain(&args);
+      _exit(0);
+  }
+#  endif  // GTEST_OS_QNX
+#  if GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &saved_sigprof_action, NULL));
+#  endif  // GTEST_OS_LINUX
+
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test.  It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+  // Clear the close-on-exec flag on the write end of the pipe, lest
+  // it be closed when the child process does an exec:
+  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
+      + info->test_case_name() + "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
+      + file_ + "|" + StreamableToString(line_) + "|"
+      + StreamableToString(death_test_index) + "|"
+      + StreamableToString(pipe_fd[1]);
+  Arguments args;
+  args.AddArguments(GetArgvsForDeathTestChildProcess());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // See the comment in NoExecDeathTest::AssumeRole for why the next line
+  // is necessary.
+  FlushInfoLog();
+
+  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+  set_child_pid(child_pid);
+  set_read_fd(pipe_fd[0]);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+# endif  // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address.  If the test should be
+// skipped, sets that pointer to NULL.  Returns true, unless the
+// flag is set to an invalid value.
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
+                                     const char* file, int line,
+                                     DeathTest** test) {
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const int death_test_index = impl->current_test_info()
+      ->increment_death_test_count();
+
+  if (flag != NULL) {
+    if (death_test_index > flag->index()) {
+      DeathTest::set_last_death_test_message(
+          "Death test count (" + StreamableToString(death_test_index)
+          + ") somehow exceeded expected maximum ("
+          + StreamableToString(flag->index()) + ")");
+      return false;
+    }
+
+    if (!(flag->file() == file && flag->line() == line &&
+          flag->index() == death_test_index)) {
+      *test = NULL;
+      return true;
+    }
+  }
+
+# if GTEST_OS_WINDOWS
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new WindowsDeathTest(statement, regex, file, line);
+  }
+
+# elif GTEST_OS_FUCHSIA
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new FuchsiaDeathTest(statement, regex, file, line);
+  }
+
+# else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe") {
+    *test = new ExecDeathTest(statement, regex, file, line);
+  } else if (GTEST_FLAG(death_test_style) == "fast") {
+    *test = new NoExecDeathTest(statement, regex);
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
+    DeathTest::set_last_death_test_message(
+        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
+        + "\" encountered");
+    return false;
+  }
+
+  return true;
+}
+
+# if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+static int GetStatusFileDescriptor(unsigned int parent_process_id,
+                            size_t write_handle_as_size_t,
+                            size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                   FALSE,  // Non-inheritable.
+                                                   parent_process_id));
+  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  // FIXME: Replace the following check with a
+  // compile-time assertion when available.
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle =
+      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in the parent
+  // process. To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handler.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle,
+                         0x0,
+                         FALSE,
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the event handle " +
+                   StreamableToString(event_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort("Unable to convert pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " to a file descriptor");
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+# endif  // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
+
+  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+  // can use it here.
+  int line = -1;
+  int index = -1;
+  ::std::vector< ::std::string> fields;
+  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+  int write_fd = -1;
+
+# if GTEST_OS_WINDOWS
+
+  unsigned int parent_process_id = 0;
+  size_t write_handle_as_size_t = 0;
+  size_t event_handle_as_size_t = 0;
+
+  if (fields.size() != 6
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &parent_process_id)
+      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
+      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+  write_fd = GetStatusFileDescriptor(parent_process_id,
+                                     write_handle_as_size_t,
+                                     event_handle_as_size_t);
+
+# elif GTEST_OS_FUCHSIA
+
+  if (fields.size() != 3
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
+        + GTEST_FLAG(internal_run_death_test));
+  }
+
+# else
+
+  if (fields.size() != 4
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
+        + GTEST_FLAG(internal_run_death_test));
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+}  // namespace internal
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+#include <stdlib.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>
+#elif GTEST_OS_WINDOWS
+# include <direct.h>
+# include <io.h>
+#elif GTEST_OS_SYMBIAN
+// Symbian OpenC has PATH_MAX in sys/syslimits.h
+# include <sys/syslimits.h>
+#else
+# include <limits.h>
+# include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+# define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator. Unless otherwise
+// noted, a file path can contain either kind of path separators, or a mixture
+// of them.
+const char kPathSeparator = '\\';
+const char kAlternatePathSeparator = '/';
+const char kAlternatePathSeparatorString[] = "/";
+# if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory. You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
+const DWORD kInvalidFileAttributes = 0xffffffff;
+# else
+const char kCurrentDirectoryString[] = ".\\";
+# endif  // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';
+const char kCurrentDirectoryString[] = "./";
+#endif  // GTEST_OS_WINDOWS
+
+// Returns whether the given character is a valid path separator.
+static bool IsPathSeparator(char c) {
+#if GTEST_HAS_ALT_PATH_SEP_
+  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
+#else
+  return c == kPathSeparator;
+#endif
+}
+
+// Returns the current working directory, or "" if unsuccessful.
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+  // Windows CE doesn't have a current directory, so we just return
+  // something reasonable.
+  return FilePath(kCurrentDirectoryString);
+#elif GTEST_OS_WINDOWS
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
+#else
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  char* result = getcwd(cwd, sizeof(cwd));
+# if GTEST_OS_NACL
+  // getcwd will likely fail in NaCl due to the sandbox, so return something
+  // reasonable. The user may have provided a shim implementation for getcwd,
+  // however, so fallback only when failure is detected.
+  return FilePath(result == NULL ? kCurrentDirectoryString : cwd);
+# endif  // GTEST_OS_NACL
+  return FilePath(result == NULL ? "" : cwd);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns a copy of the FilePath with the case-insensitive extension removed.
+// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+// FilePath("dir/file"). If a case-insensitive extension is not
+// found, returns a copy of the original FilePath.
+FilePath FilePath::RemoveExtension(const char* extension) const {
+  const std::string dot_extension = std::string(".") + extension;
+  if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
+    return FilePath(pathname_.substr(
+        0, pathname_.length() - dot_extension.length()));
+  }
+  return *this;
+}
+
+// Returns a pointer to the last occurrence of a valid path separator in
+// the FilePath. On Windows, for example, both '/' and '\' are valid path
+// separators. Returns NULL if no path separator was found.
+const char* FilePath::FindLastPathSeparator() const {
+  const char* const last_sep = strrchr(c_str(), kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+  const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
+  // Comparing two pointers of which only one is NULL is undefined.
+  if (last_alt_sep != NULL &&
+      (last_sep == NULL || last_alt_sep > last_sep)) {
+    return last_alt_sep;
+  }
+#endif
+  return last_sep;
+}
+
+// Returns a copy of the FilePath with the directory part removed.
+// Example: FilePath("path/to/file").RemoveDirectoryName() returns
+// FilePath("file"). If there is no directory part ("just_a_file"), it returns
+// the FilePath unmodified. If there is no file part ("just_a_dir/") it
+// returns an empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+  const char* const last_sep = FindLastPathSeparator();
+  return last_sep ? FilePath(last_sep + 1) : *this;
+}
+
+// RemoveFileName returns the directory path with the filename removed.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+// not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+  const char* const last_sep = FindLastPathSeparator();
+  std::string dir;
+  if (last_sep) {
+    dir = std::string(c_str(), last_sep + 1 - c_str());
+  } else {
+    dir = kCurrentDirectoryString;
+  }
+  return FilePath(dir);
+}
+
+// Helper functions for naming files in a directory for xml output.
+
+// Given directory = "dir", base_name = "test", number = 0,
+// extension = "xml", returns "dir/test.xml". If number is greater
+// than zero (e.g., 12), returns "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath& directory,
+                                const FilePath& base_name,
+                                int number,
+                                const char* extension) {
+  std::string file;
+  if (number == 0) {
+    file = base_name.string() + "." + extension;
+  } else {
+    file = base_name.string() + "_" + StreamableToString(number)
+        + "." + extension;
+  }
+  return ConcatPaths(directory, FilePath(file));
+}
+
+// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
+// On Windows, uses \ as the separator rather than /.
+FilePath FilePath::ConcatPaths(const FilePath& directory,
+                               const FilePath& relative_path) {
+  if (directory.IsEmpty())
+    return relative_path;
+  const FilePath dir(directory.RemoveTrailingPathSeparator());
+  return FilePath(dir.string() + kPathSeparator + relative_path.string());
+}
+
+// Returns true if pathname describes something findable in the file-system,
+// either a file, directory, or whatever.
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete [] unicode;
+  return attributes != kInvalidFileAttributes;
+#else
+  posix::StatStruct file_stat;
+  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns true if pathname describes a directory in the file-system
+// that exists.
+bool FilePath::DirectoryExists() const {
+  bool result = false;
+#if GTEST_OS_WINDOWS
+  // Don't strip off trailing separator if path is a root directory on
+  // Windows (like "C:\\").
+  const FilePath& path(IsRootDirectory() ? *this :
+                                           RemoveTrailingPathSeparator());
+#else
+  const FilePath& path(*this);
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete [] unicode;
+  if ((attributes != kInvalidFileAttributes) &&
+      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+    result = true;
+  }
+#else
+  posix::StatStruct file_stat;
+  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
+      posix::IsDir(file_stat);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  return result;
+}
+
+// Returns true if pathname describes a root directory. (Windows has one
+// root directory per disk drive.)
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+  // FIXME: on Windows a network share like
+  // \\server\share can be a root directory, although it cannot be the
+  // current directory.  Handle this properly.
+  return pathname_.length() == 3 && IsAbsolutePath();
+#else
+  return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Returns true if pathname describes an absolute path.
+bool FilePath::IsAbsolutePath() const {
+  const char* const name = pathname_.c_str();
+#if GTEST_OS_WINDOWS
+  return pathname_.length() >= 3 &&
+     ((name[0] >= 'a' && name[0] <= 'z') ||
+      (name[0] >= 'A' && name[0] <= 'Z')) &&
+     name[1] == ':' &&
+     IsPathSeparator(name[2]);
+#else
+  return IsPathSeparator(name[0]);
+#endif
+}
+
+// Returns a pathname for a file that does not currently exist. The pathname
+// will be directory/base_name.extension or
+// directory/base_name_<number>.extension if directory/base_name.extension
+// already exists. The number will be incremented until a pathname is found
+// that does not already exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// There could be a race condition if two or more processes are calling this
+// function at the same time -- they could both pick the same filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
+                                          const FilePath& base_name,
+                                          const char* extension) {
+  FilePath full_pathname;
+  int number = 0;
+  do {
+    full_pathname.Set(MakeFileName(directory, base_name, number++, extension));
+  } while (full_pathname.FileOrDirectoryExists());
+  return full_pathname;
+}
+
+// Returns true if FilePath ends with a path separator, which indicates that
+// it is intended to represent a directory. Returns false otherwise.
+// This does NOT check that a directory (or file) actually exists.
+bool FilePath::IsDirectory() const {
+  return !pathname_.empty() &&
+         IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]);
+}
+
+// Create directories so that path exists. Returns true if successful or if
+// the directories already exist; returns false if unable to create directories
+// for any reason.
+bool FilePath::CreateDirectoriesRecursively() const {
+  if (!this->IsDirectory()) {
+    return false;
+  }
+
+  if (pathname_.length() == 0 || this->DirectoryExists()) {
+    return true;
+  }
+
+  const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName());
+  return parent.CreateDirectoriesRecursively() && this->CreateFolder();
+}
+
+// Create the directory so that path exists. Returns true if successful or
+// if the directory already exists; returns false if unable to create the
+// directory for any reason, including if the parent directory does not
+// exist. Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  FilePath removed_sep(this->RemoveTrailingPathSeparator());
+  LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
+  int result = CreateDirectory(unicode, NULL) ? 0 : -1;
+  delete [] unicode;
+#elif GTEST_OS_WINDOWS
+  int result = _mkdir(pathname_.c_str());
+#else
+  int result = mkdir(pathname_.c_str(), 0777);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  if (result == -1) {
+    return this->DirectoryExists();  // An error is OK if the directory exists.
+  }
+  return true;  // No error.
+}
+
+// If input name has a trailing separator character, remove it and return the
+// name, otherwise return the name string unmodified.
+// On Windows platform, uses \ as the separator, other platforms use /.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+  return IsDirectory()
+      ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+      : *this;
+}
+
+// Removes any redundant separators that might be in the pathname.
+// For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+// redundancies that might be in a pathname involving "." or "..".
+// FIXME: handle Windows network shares (e.g. \\server\share).
+void FilePath::Normalize() {
+  if (pathname_.c_str() == NULL) {
+    pathname_ = "";
+    return;
+  }
+  const char* src = pathname_.c_str();
+  char* const dest = new char[pathname_.length() + 1];
+  char* dest_ptr = dest;
+  memset(dest_ptr, 0, pathname_.length() + 1);
+
+  while (*src != '\0') {
+    *dest_ptr = *src;
+    if (!IsPathSeparator(*src)) {
+      src++;
+    } else {
+#if GTEST_HAS_ALT_PATH_SEP_
+      if (*dest_ptr == kAlternatePathSeparator) {
+        *dest_ptr = kPathSeparator;
+      }
+#endif
+      while (IsPathSeparator(*src))
+        src++;
+    }
+    dest_ptr++;
+  }
+  *dest_ptr = '\0';
+  pathname_ = dest;
+  delete[] dest;
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fstream>
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>
+# include <io.h>
+# include <sys/stat.h>
+# include <map>  // Used in ThreadLocal.
+#else
+# include <unistd.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_QNX
+# include <devctl.h>
+# include <fcntl.h>
+# include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_AIX
+# include <procinfo.h>
+# include <sys/types.h>
+#endif  // GTEST_OS_AIX
+
+#if GTEST_OS_FUCHSIA
+# include <zircon/process.h>
+# include <zircon/syscalls.h>
+#endif  // GTEST_OS_FUCHSIA
+
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_LINUX
+
+namespace {
+template <typename T>
+T ReadProcFileField(const std::string& filename, int field) {
+  std::string dummy;
+  std::ifstream file(filename.c_str());
+  while (field-- > 0) {
+    file >> dummy;
+  }
+  T output = 0;
+  file >> output;
+  return output;
+}
+}  // namespace
+
+// Returns the number of active threads, or 0 when there is an error.
+size_t GetThreadCount() {
+  const std::string filename =
+      (Message() << "/proc/" << getpid() << "/stat").GetString();
+  return ReadProcFileField<int>(filename, 19);
+}
+
+#elif GTEST_OS_MAC
+
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task,
+                  reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  const int fd = open("/proc/self/as", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+  procfs_info process_info;
+  const int status =
+      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL);
+  close(fd);
+  if (status == EOK) {
+    return static_cast<size_t>(process_info.num_threads);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_AIX
+
+size_t GetThreadCount() {
+  struct procentry64 entry;
+  pid_t pid = getpid();
+  int status = getprocs64(&entry, sizeof(entry), NULL, 0, &pid, 1);
+  if (status == 1) {
+    return entry.pi_thcount;
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_FUCHSIA
+
+size_t GetThreadCount() {
+  int dummy_buffer;
+  size_t avail;
+  zx_status_t status = zx_object_get_info(
+      zx_process_self(),
+      ZX_INFO_PROCESS_THREADS,
+      &dummy_buffer,
+      0,
+      nullptr,
+      &avail);
+  if (status == ZX_OK) {
+    return avail;
+  } else {
+    return 0;
+  }
+}
+
+#else
+
+size_t GetThreadCount() {
+  // There's no portable way to detect the number of threads, so we just
+  // return 0 to indicate that we cannot detect it.
+  return 0;
+}
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+void SleepMilliseconds(int n) {
+  ::Sleep(n);
+}
+
+AutoHandle::AutoHandle()
+    : handle_(INVALID_HANDLE_VALUE) {}
+
+AutoHandle::AutoHandle(Handle handle)
+    : handle_(handle) {}
+
+AutoHandle::~AutoHandle() {
+  Reset();
+}
+
+AutoHandle::Handle AutoHandle::Get() const {
+  return handle_;
+}
+
+void AutoHandle::Reset() {
+  Reset(INVALID_HANDLE_VALUE);
+}
+
+void AutoHandle::Reset(HANDLE handle) {
+  // Resetting with the same handle we already own is invalid.
+  if (handle_ != handle) {
+    if (IsCloseable()) {
+      ::CloseHandle(handle_);
+    }
+    handle_ = handle;
+  } else {
+    GTEST_CHECK_(!IsCloseable())
+        << "Resetting a valid handle to itself is likely a programmer error "
+            "and thus not allowed.";
+  }
+}
+
+bool AutoHandle::IsCloseable() const {
+  // Different Windows APIs may use either of these values to represent an
+  // invalid handle.
+  return handle_ != NULL && handle_ != INVALID_HANDLE_VALUE;
+}
+
+Notification::Notification()
+    : event_(::CreateEvent(NULL,   // Default security attributes.
+                           TRUE,   // Do not reset automatically.
+                           FALSE,  // Initially unset.
+                           NULL)) {  // Anonymous event.
+  GTEST_CHECK_(event_.Get() != NULL);
+}
+
+void Notification::Notify() {
+  GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
+}
+
+void Notification::WaitForNotification() {
+  GTEST_CHECK_(
+      ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
+}
+
+Mutex::Mutex()
+    : owner_thread_id_(0),
+      type_(kDynamic),
+      critical_section_init_phase_(0),
+      critical_section_(new CRITICAL_SECTION) {
+  ::InitializeCriticalSection(critical_section_);
+}
+
+Mutex::~Mutex() {
+  // Static mutexes are leaked intentionally. It is not thread-safe to try
+  // to clean them up.
+  // FIXME: Switch to Slim Reader/Writer (SRW) Locks, which requires
+  // nothing to clean it up but is available only on Vista and later.
+  // https://docs.microsoft.com/en-us/windows/desktop/Sync/slim-reader-writer--srw--locks
+  if (type_ == kDynamic) {
+    ::DeleteCriticalSection(critical_section_);
+    delete critical_section_;
+    critical_section_ = NULL;
+  }
+}
+
+void Mutex::Lock() {
+  ThreadSafeLazyInit();
+  ::EnterCriticalSection(critical_section_);
+  owner_thread_id_ = ::GetCurrentThreadId();
+}
+
+void Mutex::Unlock() {
+  ThreadSafeLazyInit();
+  // We don't protect writing to owner_thread_id_ here, as it's the
+  // caller's responsibility to ensure that the current thread holds the
+  // mutex when this is called.
+  owner_thread_id_ = 0;
+  ::LeaveCriticalSection(critical_section_);
+}
+
+// Does nothing if the current thread holds the mutex. Otherwise, crashes
+// with high probability.
+void Mutex::AssertHeld() {
+  ThreadSafeLazyInit();
+  GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId())
+      << "The current thread is not holding the mutex @" << this;
+}
+
+namespace {
+
+// Use the RAII idiom to flag mem allocs that are intentionally never
+// deallocated. The motivation is to silence the false positive mem leaks
+// that are reported by the debug version of MS's CRT which can only detect
+// if an alloc is missing a matching deallocation.
+// Example:
+//    MemoryIsNotDeallocated memory_is_not_deallocated;
+//    critical_section_ = new CRITICAL_SECTION;
+//
+class MemoryIsNotDeallocated
+{
+ public:
+  MemoryIsNotDeallocated() : old_crtdbg_flag_(0) {
+#ifdef _MSC_VER
+    old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
+    // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT
+    // doesn't report mem leak if there's no matching deallocation.
+    _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
+#endif  //  _MSC_VER
+  }
+
+  ~MemoryIsNotDeallocated() {
+#ifdef _MSC_VER
+    // Restore the original _CRTDBG_ALLOC_MEM_DF flag
+    _CrtSetDbgFlag(old_crtdbg_flag_);
+#endif  //  _MSC_VER
+  }
+
+ private:
+  int old_crtdbg_flag_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated);
+};
+
+}  // namespace
+
+// Initializes owner_thread_id_ and critical_section_ in static mutexes.
+void Mutex::ThreadSafeLazyInit() {
+  // Dynamic mutexes are initialized in the constructor.
+  if (type_ == kStatic) {
+    switch (
+        ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) {
+      case 0:
+        // If critical_section_init_phase_ was 0 before the exchange, we
+        // are the first to test it and need to perform the initialization.
+        owner_thread_id_ = 0;
+        {
+          // Use RAII to flag that following mem alloc is never deallocated.
+          MemoryIsNotDeallocated memory_is_not_deallocated;
+          critical_section_ = new CRITICAL_SECTION;
+        }
+        ::InitializeCriticalSection(critical_section_);
+        // Updates the critical_section_init_phase_ to 2 to signal
+        // initialization complete.
+        GTEST_CHECK_(::InterlockedCompareExchange(
+                          &critical_section_init_phase_, 2L, 1L) ==
+                      1L);
+        break;
+      case 1:
+        // Somebody else is already initializing the mutex; spin until they
+        // are done.
+        while (::InterlockedCompareExchange(&critical_section_init_phase_,
+                                            2L,
+                                            2L) != 2L) {
+          // Possibly yields the rest of the thread's time slice to other
+          // threads.
+          ::Sleep(0);
+        }
+        break;
+
+      case 2:
+        break;  // The mutex is already initialized and ready for use.
+
+      default:
+        GTEST_CHECK_(false)
+            << "Unexpected value of critical_section_init_phase_ "
+            << "while initializing a static mutex.";
+    }
+  }
+}
+
+namespace {
+
+class ThreadWithParamSupport : public ThreadWithParamBase {
+ public:
+  static HANDLE CreateThread(Runnable* runnable,
+                             Notification* thread_can_start) {
+    ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start);
+    DWORD thread_id;
+    // FIXME: Consider to use _beginthreadex instead.
+    HANDLE thread_handle = ::CreateThread(
+        NULL,    // Default security.
+        0,       // Default stack size.
+        &ThreadWithParamSupport::ThreadMain,
+        param,   // Parameter to ThreadMainStatic
+        0x0,     // Default creation flags.
+        &thread_id);  // Need a valid pointer for the call to work under Win98.
+    GTEST_CHECK_(thread_handle != NULL) << "CreateThread failed with error "
+                                        << ::GetLastError() << ".";
+    if (thread_handle == NULL) {
+      delete param;
+    }
+    return thread_handle;
+  }
+
+ private:
+  struct ThreadMainParam {
+    ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
+        : runnable_(runnable),
+          thread_can_start_(thread_can_start) {
+    }
+    scoped_ptr<Runnable> runnable_;
+    // Does not own.
+    Notification* thread_can_start_;
+  };
+
+  static DWORD WINAPI ThreadMain(void* ptr) {
+    // Transfers ownership.
+    scoped_ptr<ThreadMainParam> param(static_cast<ThreadMainParam*>(ptr));
+    if (param->thread_can_start_ != NULL)
+      param->thread_can_start_->WaitForNotification();
+    param->runnable_->Run();
+    return 0;
+  }
+
+  // Prohibit instantiation.
+  ThreadWithParamSupport();
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
+};
+
+}  // namespace
+
+ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
+                                         Notification* thread_can_start)
+      : thread_(ThreadWithParamSupport::CreateThread(runnable,
+                                                     thread_can_start)) {
+}
+
+ThreadWithParamBase::~ThreadWithParamBase() {
+  Join();
+}
+
+void ThreadWithParamBase::Join() {
+  GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
+      << "Failed to join the thread with error " << ::GetLastError() << ".";
+}
+
+// Maps a thread to a set of ThreadIdToThreadLocals that have values
+// instantiated on that thread and notifies them when the thread exits.  A
+// ThreadLocal instance is expected to persist until all threads it has
+// values on have terminated.
+class ThreadLocalRegistryImpl {
+ public:
+  // Registers thread_local_instance as having value on the current thread.
+  // Returns a value that can be used to identify the thread from other threads.
+  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance) {
+    DWORD current_thread = ::GetCurrentThreadId();
+    MutexLock lock(&mutex_);
+    ThreadIdToThreadLocals* const thread_to_thread_locals =
+        GetThreadLocalsMapLocked();
+    ThreadIdToThreadLocals::iterator thread_local_pos =
+        thread_to_thread_locals->find(current_thread);
+    if (thread_local_pos == thread_to_thread_locals->end()) {
+      thread_local_pos = thread_to_thread_locals->insert(
+          std::make_pair(current_thread, ThreadLocalValues())).first;
+      StartWatcherThreadFor(current_thread);
+    }
+    ThreadLocalValues& thread_local_values = thread_local_pos->second;
+    ThreadLocalValues::iterator value_pos =
+        thread_local_values.find(thread_local_instance);
+    if (value_pos == thread_local_values.end()) {
+      value_pos =
+          thread_local_values
+              .insert(std::make_pair(
+                  thread_local_instance,
+                  linked_ptr<ThreadLocalValueHolderBase>(
+                      thread_local_instance->NewValueForCurrentThread())))
+              .first;
+    }
+    return value_pos->second.get();
+  }
+
+  static void OnThreadLocalDestroyed(
+      const ThreadLocalBase* thread_local_instance) {
+    std::vector<linked_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadLocalValues data structure while holding the lock, but
+    // defer the destruction of the ThreadLocalValueHolderBases.
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals* const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      for (ThreadIdToThreadLocals::iterator it =
+          thread_to_thread_locals->begin();
+          it != thread_to_thread_locals->end();
+          ++it) {
+        ThreadLocalValues& thread_local_values = it->second;
+        ThreadLocalValues::iterator value_pos =
+            thread_local_values.find(thread_local_instance);
+        if (value_pos != thread_local_values.end()) {
+          value_holders.push_back(value_pos->second);
+          thread_local_values.erase(value_pos);
+          // This 'if' can only be successful at most once, so theoretically we
+          // could break out of the loop here, but we don't bother doing so.
+        }
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+  static void OnThreadExit(DWORD thread_id) {
+    GTEST_CHECK_(thread_id != 0) << ::GetLastError();
+    std::vector<linked_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadIdToThreadLocals data structure while holding the
+    // lock, but defer the destruction of the ThreadLocalValueHolderBases.
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals* const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      ThreadIdToThreadLocals::iterator thread_local_pos =
+          thread_to_thread_locals->find(thread_id);
+      if (thread_local_pos != thread_to_thread_locals->end()) {
+        ThreadLocalValues& thread_local_values = thread_local_pos->second;
+        for (ThreadLocalValues::iterator value_pos =
+            thread_local_values.begin();
+            value_pos != thread_local_values.end();
+            ++value_pos) {
+          value_holders.push_back(value_pos->second);
+        }
+        thread_to_thread_locals->erase(thread_local_pos);
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+ private:
+  // In a particular thread, maps a ThreadLocal object to its value.
+  typedef std::map<const ThreadLocalBase*,
+                   linked_ptr<ThreadLocalValueHolderBase> > ThreadLocalValues;
+  // Stores all ThreadIdToThreadLocals having values in a thread, indexed by
+  // thread's ID.
+  typedef std::map<DWORD, ThreadLocalValues> ThreadIdToThreadLocals;
+
+  // Holds the thread id and thread handle that we pass from
+  // StartWatcherThreadFor to WatcherThreadFunc.
+  typedef std::pair<DWORD, HANDLE> ThreadIdAndHandle;
+
+  static void StartWatcherThreadFor(DWORD thread_id) {
+    // The returned handle will be kept in thread_map and closed by
+    // watcher_thread in WatcherThreadFunc.
+    HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
+                                 FALSE,
+                                 thread_id);
+    GTEST_CHECK_(thread != NULL);
+    // We need to pass a valid thread ID pointer into CreateThread for it
+    // to work correctly under Win98.
+    DWORD watcher_thread_id;
+    HANDLE watcher_thread = ::CreateThread(
+        NULL,   // Default security.
+        0,      // Default stack size
+        &ThreadLocalRegistryImpl::WatcherThreadFunc,
+        reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
+        CREATE_SUSPENDED,
+        &watcher_thread_id);
+    GTEST_CHECK_(watcher_thread != NULL);
+    // Give the watcher thread the same priority as ours to avoid being
+    // blocked by it.
+    ::SetThreadPriority(watcher_thread,
+                        ::GetThreadPriority(::GetCurrentThread()));
+    ::ResumeThread(watcher_thread);
+    ::CloseHandle(watcher_thread);
+  }
+
+  // Monitors exit from a given thread and notifies those
+  // ThreadIdToThreadLocals about thread termination.
+  static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
+    const ThreadIdAndHandle* tah =
+        reinterpret_cast<const ThreadIdAndHandle*>(param);
+    GTEST_CHECK_(
+        ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+    OnThreadExit(tah->first);
+    ::CloseHandle(tah->second);
+    delete tah;
+    return 0;
+  }
+
+  // Returns map of thread local instances.
+  static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() {
+    mutex_.AssertHeld();
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+    static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals();
+    return map;
+  }
+
+  // Protects access to GetThreadLocalsMapLocked() and its return value.
+  static Mutex mutex_;
+  // Protects access to GetThreadMapLocked() and its return value.
+  static Mutex thread_map_mutex_;
+};
+
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
+
+ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance) {
+  return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
+      thread_local_instance);
+}
+
+void ThreadLocalRegistry::OnThreadLocalDestroyed(
+      const ThreadLocalBase* thread_local_instance) {
+  ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
+}
+
+#endif  // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE.  Currently only needed for death tests.
+
+RE::~RE() {
+  if (is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined. Since the regex's are essentially
+    // the same, one cannot be valid (or invalid) without the other
+    // being so too.
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  free(const_cast<char*>(pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char* const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // We want to call regcomp(&partial_regex_, ...) even if the
+  // previous expression returns false.  Otherwise partial_regex_ may
+  // not be properly initialized can may cause trouble when it's
+  // freed.
+  //
+  // Some implementation of POSIX regex (e.g. on at least some
+  // versions of Cygwin) doesn't accept the empty string as a valid
+  // regex.  We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+
+  delete[] full_pattern;
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Returns true iff ch appears anywhere in str (excluding the
+// terminating '\0' character).
+bool IsInSet(char ch, const char* str) {
+  return ch != '\0' && strchr(str, ch) != NULL;
+}
+
+// Returns true iff ch belongs to the given classification.  Unlike
+// similar functions in <ctype.h>, these aren't affected by the
+// current locale.
+bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
+      ('0' <= ch && ch <= '9') || ch == '_';
+}
+
+// Returns true iff "\\c" is a supported escape sequence.
+bool IsValidEscape(char c) {
+  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
+}
+
+// Returns true iff the given atom (specified by escaped and pattern)
+// matches ch.  The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (escaped) {  // "\\p" where p is pattern_char.
+    switch (pattern_char) {
+      case 'd': return IsAsciiDigit(ch);
+      case 'D': return !IsAsciiDigit(ch);
+      case 'f': return ch == '\f';
+      case 'n': return ch == '\n';
+      case 'r': return ch == '\r';
+      case 's': return IsAsciiWhiteSpace(ch);
+      case 'S': return !IsAsciiWhiteSpace(ch);
+      case 't': return ch == '\t';
+      case 'v': return ch == '\v';
+      case 'w': return IsAsciiWordChar(ch);
+      case 'W': return !IsAsciiWordChar(ch);
+    }
+    return IsAsciiPunct(pattern_char) && pattern_char == ch;
+  }
+
+  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+static std::string FormatRegexSyntaxError(const char* regex, int index) {
+  return (Message() << "Syntax error at index " << index
+          << " in simple regular expression \"" << regex << "\": ").GetString();
+}
+
+// Generates non-fatal failures and returns false if regex is invalid;
+// otherwise returns true.
+bool ValidateRegex(const char* regex) {
+  if (regex == NULL) {
+    // FIXME: fix the source file location in the
+    // assertion failures to match where the regex is used in user
+    // code.
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  bool is_valid = true;
+
+  // True iff ?, *, or + can follow the previous atom.
+  bool prev_repeatable = false;
+  for (int i = 0; regex[i]; i++) {
+    if (regex[i] == '\\') {  // An escape sequence
+      i++;
+      if (regex[i] == '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "'\\' cannot appear at the end.";
+        return false;
+      }
+
+      if (!IsValidEscape(regex[i])) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "invalid escape sequence \"\\" << regex[i] << "\".";
+        is_valid = false;
+      }
+      prev_repeatable = true;
+    } else {  // Not an escape sequence.
+      const char ch = regex[i];
+
+      if (ch == '^' && i > 0) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'^' can only appear at the beginning.";
+        is_valid = false;
+      } else if (ch == '$' && regex[i + 1] != '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'$' can only appear at the end.";
+        is_valid = false;
+      } else if (IsInSet(ch, "()[]{}|")) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' is unsupported.";
+        is_valid = false;
+      } else if (IsRepeat(ch) && !prev_repeatable) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' can only follow a repeatable token.";
+        is_valid = false;
+      }
+
+      prev_repeatable = !IsInSet(ch, "^$?*+");
+    }
+  }
+
+  return is_valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression.  The regex atom is defined as c if escaped is false,
+// or \c otherwise.  repeat is the repetition meta character (?, *,
+// or +).  The behavior is undefined if str contains too many
+// characters to be indexable by size_t, in which case the test will
+// probably time out anyway.  We are fine with this limitation as
+// std::string has it too.
+bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char c, char repeat, const char* regex,
+    const char* str) {
+  const size_t min_count = (repeat == '+') ? 1 : 0;
+  const size_t max_count = (repeat == '?') ? 1 :
+      static_cast<size_t>(-1) - 1;
+  // We cannot call numeric_limits::max() as it conflicts with the
+  // max() macro on Windows.
+
+  for (size_t i = 0; i <= max_count; ++i) {
+    // We know that the atom matches each of the first i characters in str.
+    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
+      // We have enough matches at the head, and the tail matches too.
+      // Since we only care about *whether* the pattern matches str
+      // (as opposed to *how* it matches), there is no need to find a
+      // greedy match.
+      return true;
+    }
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
+      return false;
+  }
+  return false;
+}
+
+// Returns true iff regex matches a prefix of str.  regex must be a
+// valid simple regular expression and not start with "^", or the
+// result is undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+  if (*regex == '\0')  // An empty regex matches a prefix of anything.
+    return true;
+
+  // "$" only matches the end of a string.  Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$')
+    return *str == '\0';
+
+  // Is the first thing in regex an escape sequence?
+  const bool escaped = *regex == '\\';
+  if (escaped)
+    ++regex;
+  if (IsRepeat(regex[1])) {
+    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+    // here's an indirect recursion.  It terminates as the regex gets
+    // shorter in each recursion.
+    return MatchRepetitionAndRegexAtHead(
+        escaped, regex[0], regex[1], regex + 2, str);
+  } else {
+    // regex isn't empty, isn't "$", and doesn't start with a
+    // repetition.  We match the first atom of regex with the first
+    // character of str and recurse.
+    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
+        MatchRegexAtHead(regex + 1, str + 1);
+  }
+}
+
+// Returns true iff regex matches any substring of str.  regex must be
+// a valid simple regular expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally.  In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's must faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+  if (regex == NULL || str == NULL)
+    return false;
+
+  if (*regex == '^')
+    return MatchRegexAtHead(regex + 1, str);
+
+  // A successful match can be anywhere in str.
+  do {
+    if (MatchRegexAtHead(regex, str))
+      return true;
+  } while (*str++ != '\0');
+  return false;
+}
+
+// Implements the RE class.
+
+RE::~RE() {
+  free(const_cast<char*>(pattern_));
+  free(const_cast<char*>(full_pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+  pattern_ = full_pattern_ = NULL;
+  if (regex != NULL) {
+    pattern_ = posix::StrDup(regex);
+  }
+
+  is_valid_ = ValidateRegex(regex);
+  if (!is_valid_) {
+    // No need to calculate the full pattern when the regex is invalid.
+    return;
+  }
+
+  const size_t len = strlen(regex);
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match: we need space to prepend a '^', append a '$', and
+  // terminate the string with '\0'.
+  char* buffer = static_cast<char*>(malloc(len + 3));
+  full_pattern_ = buffer;
+
+  if (*regex != '^')
+    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
+
+  // We don't use snprintf or strncpy, as they trigger a warning when
+  // compiled with VC++ 8.0.
+  memcpy(buffer, regex, len);
+  buffer += len;
+
+  if (len == 0 || regex[len - 1] != '$')
+    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
+
+  *buffer = '\0';
+}
+
+#endif  // GTEST_USES_POSIX_RE
+
+const char kUnknownFile[] = "unknown file";
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
+  const std::string file_name(file == NULL ? kUnknownFile : file);
+
+  if (line < 0) {
+    return file_name + ":";
+  }
+#ifdef _MSC_VER
+  return file_name + "(" + StreamableToString(line) + "):";
+#else
+  return file_name + ":" + StreamableToString(line) + ":";
+#endif  // _MSC_VER
+}
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append colon
+// to the file location it produces, unlike FormatFileLocation().
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
+    const char* file, int line) {
+  const std::string file_name(file == NULL ? kUnknownFile : file);
+
+  if (line < 0)
+    return file_name;
+  else
+    return file_name + ":" + StreamableToString(line);
+}
+
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+    : severity_(severity) {
+  const char* const marker =
+      severity == GTEST_INFO ?    "[  INFO ]" :
+      severity == GTEST_WARNING ? "[WARNING]" :
+      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
+  GetStream() << ::std::endl << marker << " "
+              << FormatFileLocation(file, line).c_str() << ": ";
+}
+
+// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ == GTEST_FATAL) {
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr).
+class CapturedStream {
+ public:
+  // The ctor redirects the stream to a temporary file.
+  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+# if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+
+    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+    const UINT success = ::GetTempFileNameA(temp_dir_path,
+                                            "gtest_redir",
+                                            0,  // Generate unique file name.
+                                            temp_file_path);
+    GTEST_CHECK_(success != 0)
+        << "Unable to create a temporary file in " << temp_dir_path;
+    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
+                                    << temp_file_path;
+    filename_ = temp_file_path;
+# else
+    // There's no guarantee that a test has write access to the current
+    // directory, so we create the temporary file in the /tmp directory
+    // instead. We use /tmp on most systems, and /sdcard on Android.
+    // That's because Android doesn't have /tmp.
+#  if GTEST_OS_LINUX_ANDROID
+    // Note: Android applications are expected to call the framework's
+    // Context.getExternalStorageDirectory() method through JNI to get
+    // the location of the world-writable SD Card directory. However,
+    // this requires a Context handle, which cannot be retrieved
+    // globally from native code. Doing so also precludes running the
+    // code as part of a regular standalone executable, which doesn't
+    // run in a Dalvik process (e.g. when running it through 'adb shell').
+    //
+    // The location /sdcard is directly accessible from native code
+    // and is the only location (unofficially) supported by the Android
+    // team. It's generally a symlink to the real SD Card mount point
+    // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or
+    // other OEM-customized locations. Never rely on these, and always
+    // use /sdcard.
+    char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX";
+#  else
+    char name_template[] = "/tmp/captured_stream.XXXXXX";
+#  endif  // GTEST_OS_LINUX_ANDROID
+    const int captured_fd = mkstemp(name_template);
+    filename_ = name_template;
+# endif  // GTEST_OS_WINDOWS
+    fflush(NULL);
+    dup2(captured_fd, fd_);
+    close(captured_fd);
+  }
+
+  ~CapturedStream() {
+    remove(filename_.c_str());
+  }
+
+  std::string GetCapturedString() {
+    if (uncaptured_fd_ != -1) {
+      // Restores the original stream.
+      fflush(NULL);
+      dup2(uncaptured_fd_, fd_);
+      close(uncaptured_fd_);
+      uncaptured_fd_ = -1;
+    }
+
+    FILE* const file = posix::FOpen(filename_.c_str(), "r");
+    const std::string content = ReadEntireFile(file);
+    posix::FClose(file);
+    return content;
+  }
+
+ private:
+  const int fd_;  // A stream to capture.
+  int uncaptured_fd_;
+  // Name of the temporary file holding the stderr output.
+  ::std::string filename_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+};
+
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+static CapturedStream* g_captured_stderr = NULL;
+static CapturedStream* g_captured_stdout = NULL;
+
+// Starts capturing an output stream (stdout/stderr).
+static void CaptureStream(int fd, const char* stream_name,
+                          CapturedStream** stream) {
+  if (*stream != NULL) {
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  }
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+static std::string GetCapturedStream(CapturedStream** captured_stream) {
+  const std::string content = (*captured_stream)->GetCapturedString();
+
+  delete *captured_stream;
+  *captured_stream = NULL;
+
+  return content;
+}
+
+// Starts capturing stdout.
+void CaptureStdout() {
+  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr.
+void CaptureStderr() {
+  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout and returns the captured string.
+std::string GetCapturedStdout() {
+  return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr and returns the captured string.
+std::string GetCapturedStderr() {
+  return GetCapturedStream(&g_captured_stderr);
+}
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+
+
+
+
+size_t GetFileSize(FILE* file) {
+  fseek(file, 0, SEEK_END);
+  return static_cast<size_t>(ftell(file));
+}
+
+std::string ReadEntireFile(FILE* file) {
+  const size_t file_size = GetFileSize(file);
+  char* const buffer = new char[file_size];
+
+  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
+  size_t bytes_read = 0;       // # of bytes read so far
+
+  fseek(file, 0, SEEK_SET);
+
+  // Keeps reading the file until we cannot read further or the
+  // pre-determined file size is reached.
+  do {
+    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+    bytes_read += bytes_last_read;
+  } while (bytes_last_read > 0 && bytes_read < file_size);
+
+  const std::string content(buffer, bytes_read);
+  delete[] buffer;
+
+  return content;
+}
+
+#if GTEST_HAS_DEATH_TEST
+static const std::vector<std::string>* g_injected_test_argvs = NULL;  // Owned.
+
+std::vector<std::string> GetInjectableArgvs() {
+  if (g_injected_test_argvs != NULL) {
+    return *g_injected_test_argvs;
+  }
+  return GetArgvs();
+}
+
+void SetInjectableArgvs(const std::vector<std::string>* new_argvs) {
+  if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs;
+  g_injected_test_argvs = new_argvs;
+}
+
+void SetInjectableArgvs(const std::vector<std::string>& new_argvs) {
+  SetInjectableArgvs(
+      new std::vector<std::string>(new_argvs.begin(), new_argvs.end()));
+}
+
+#if GTEST_HAS_GLOBAL_STRING
+void SetInjectableArgvs(const std::vector< ::string>& new_argvs) {
+  SetInjectableArgvs(
+      new std::vector<std::string>(new_argvs.begin(), new_argvs.end()));
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+void ClearInjectableArgvs() {
+  delete g_injected_test_argvs;
+  g_injected_test_argvs = NULL;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+void Abort() {
+  DebugBreak();
+  TerminateProcess(GetCurrentProcess(), 1);
+}
+}  // namespace posix
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag.  For example, FlagToEnvVar("foo") will return
+// "GTEST_FOO" in the open-source version.
+static std::string FlagToEnvVar(const char* flag) {
+  const std::string full_flag =
+      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+  Message env_var;
+  for (size_t i = 0; i != full_flag.length(); i++) {
+    env_var << ToUpper(full_flag.c_str()[i]);
+  }
+
+  return env_var.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
+  // Parses the environment variable as a decimal integer.
+  char* end = NULL;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value \"" << str << "\".\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  // Is the parsed value in the range of an Int32?
+  const Int32 result = static_cast<Int32>(long_value);
+  if (long_value == LONG_MAX || long_value == LONG_MIN ||
+      // The parsed value overflows as a long.  (strtol() returns
+      // LONG_MAX or LONG_MIN when the input overflows.)
+      result != long_value
+      // The parsed value overflows as an Int32.
+      ) {
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value " << str << ", which overflows.\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = result;
+  return true;
+}
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true iff it's not "0".
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
+#if defined(GTEST_GET_BOOL_FROM_ENV_)
+  return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  return string_value == NULL ?
+      default_value : strcmp(string_value, "0") != 0;
+#endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
+#if defined(GTEST_GET_INT32_FROM_ENV_)
+  return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  if (string_value == NULL) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  Int32 result = default_value;
+  if (!ParseInt32(Message() << "Environment variable " << env_var,
+                  string_value, &result)) {
+    printf("The default value %s is used.\n",
+           (Message() << default_value).GetString().c_str());
+    fflush(stdout);
+    return default_value;
+  }
+
+  return result;
+#endif  // defined(GTEST_GET_INT32_FROM_ENV_)
+}
+
+// As a special case for the 'output' flag, if GTEST_OUTPUT is not
+// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build
+// system.  The value of XML_OUTPUT_FILE is a filename without the
+// "xml:" prefix of GTEST_OUTPUT.
+// Note that this is meant to be called at the call site so it does
+// not check that the flag is 'output'
+// In essence this checks an env variable called XML_OUTPUT_FILE
+// and if it is set we prepend "xml:" to its value, if it not set we return ""
+std::string OutputFlagAlsoCheckEnvVar(){
+  std::string default_value_for_output_flag = "";
+  const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
+  if (NULL != xml_output_file_env) {
+    default_value_for_output_flag = std::string("xml:") + xml_output_file_env;
+  }
+  return default_value_for_output_flag;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+const char* StringFromGTestEnv(const char* flag, const char* default_value) {
+#if defined(GTEST_GET_STRING_FROM_ENV_)
+  return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value = posix::GetEnv(env_var.c_str());
+  return value == NULL ? default_value : value;
+#endif  // defined(GTEST_GET_STRING_FROM_ENV_)
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Test - The Google C++ Testing and Mocking Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise.  A user can override its behavior for a class
+// type Foo by defining either operator<<(::std::ostream&, const Foo&)
+// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include <stdio.h>
+#include <cctype>
+#include <cwchar>
+#include <ostream>  // NOLINT
+#include <string>
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints a segment of bytes in the given object.
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
+                                size_t count, ostream* os) {
+  char text[5] = "";
+  for (size_t i = 0; i != count; i++) {
+    const size_t j = start + i;
+    if (i != 0) {
+      // Organizes the bytes into groups of 2 for easy parsing by
+      // human.
+      if ((j % 2) == 0)
+        *os << ' ';
+      else
+        *os << '-';
+    }
+    GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
+    *os << text;
+  }
+}
+
+// Prints the bytes in the given value to the given ostream.
+void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
+                              ostream* os) {
+  // Tells the user how big the object is.
+  *os << count << "-byte object <";
+
+  const size_t kThreshold = 132;
+  const size_t kChunkSize = 64;
+  // If the object size is bigger than kThreshold, we'll have to omit
+  // some details by printing only the first and the last kChunkSize
+  // bytes.
+  // FIXME: let the user control the threshold using a flag.
+  if (count < kThreshold) {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
+  } else {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
+    *os << " ... ";
+    // Rounds up to 2-byte boundary.
+    const size_t resume_pos = (count - kChunkSize + 1)/2*2;
+    PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
+  }
+  *os << ">";
+}
+
+}  // namespace
+
+namespace internal2 {
+
+// Delegates to PrintBytesInObjectToImpl() to print the bytes in the
+// given object.  The delegation simplifies the implementation, which
+// uses the << operator and thus is easier done outside of the
+// ::testing::internal namespace, which contains a << operator that
+// sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
+                          ostream* os) {
+  PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+}  // namespace internal2
+
+namespace internal {
+
+// Depending on the value of a char (or wchar_t), we print it in one
+// of three formats:
+//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+//   - as a hexadecimal escape sequence (e.g. '\x7F'), or
+//   - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat {
+  kAsIs,
+  kHexEscape,
+  kSpecialEscape
+};
+
+// Returns true if c is a printable ASCII character.  We test the
+// value of c directly instead of calling isprint(), which is buggy on
+// Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) {
+  return 0x20 <= c && c <= 0x7E;
+}
+
+// Prints a wide or narrow char c as a character literal without the
+// quotes, escaping it when necessary; returns how c was formatted.
+// The template argument UnsignedChar is the unsigned version of Char,
+// which is the type of c.
+template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  switch (static_cast<wchar_t>(c)) {
+    case L'\0':
+      *os << "\\0";
+      break;
+    case L'\'':
+      *os << "\\'";
+      break;
+    case L'\\':
+      *os << "\\\\";
+      break;
+    case L'\a':
+      *os << "\\a";
+      break;
+    case L'\b':
+      *os << "\\b";
+      break;
+    case L'\f':
+      *os << "\\f";
+      break;
+    case L'\n':
+      *os << "\\n";
+      break;
+    case L'\r':
+      *os << "\\r";
+      break;
+    case L'\t':
+      *os << "\\t";
+      break;
+    case L'\v':
+      *os << "\\v";
+      break;
+    default:
+      if (IsPrintableAscii(c)) {
+        *os << static_cast<char>(c);
+        return kAsIs;
+      } else {
+        ostream::fmtflags flags = os->flags();
+        *os << "\\x" << std::hex << std::uppercase
+            << static_cast<int>(static_cast<UnsignedChar>(c));
+        os->flags(flags);
+        return kHexEscape;
+      }
+  }
+  return kSpecialEscape;
+}
+
+// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  switch (c) {
+    case L'\'':
+      *os << "'";
+      return kAsIs;
+    case L'"':
+      *os << "\\\"";
+      return kSpecialEscape;
+    default:
+      return PrintAsCharLiteralTo<wchar_t>(c, os);
+  }
+}
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+  return PrintAsStringLiteralTo(
+      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+}
+
+// Prints a wide or narrow character c and its code.  '\0' is printed
+// as "'\\0'", other unprintable characters are also properly escaped
+// using the standard C++ escape sequence.  The template argument
+// UnsignedChar is the unsigned version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // First, print c as a literal in the most readable form we can find.
+  *os << ((sizeof(c) > 1) ? "L'" : "'");
+  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << "'";
+
+  // To aid user debugging, we also print c's code in decimal, unless
+  // it's 0 (in which case c was printed as '\\0', making the code
+  // obvious).
+  if (c == 0)
+    return;
+  *os << " (" << static_cast<int>(c);
+
+  // For more convenience, we print c's code again in hexadecimal,
+  // unless c was already printed in the form '\x##' or the code is in
+  // [1, 9].
+  if (format == kHexEscape || (1 <= c && c <= 9)) {
+    // Do nothing.
+  } else {
+    *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
+  }
+  *os << ")";
+}
+
+void PrintTo(unsigned char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+void PrintTo(signed char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) {
+  PrintCharAndCodeTo<wchar_t>(wc, os);
+}
+
+// Prints the given array of characters to the ostream.  CharType must be either
+// char or wchar_t.
+// The array starts at begin, the length is len, it may include '\0' characters
+// and may not be NUL-terminated.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+static CharFormat PrintCharsAsStringTo(
+    const CharType* begin, size_t len, ostream* os) {
+  const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
+  *os << kQuoteBegin;
+  bool is_previous_hex = false;
+  CharFormat print_format = kAsIs;
+  for (size_t index = 0; index < len; ++index) {
+    const CharType cur = begin[index];
+    if (is_previous_hex && IsXDigit(cur)) {
+      // Previous character is of '\x..' form and this character can be
+      // interpreted as another hexadecimal digit in its number. Break string to
+      // disambiguate.
+      *os << "\" " << kQuoteBegin;
+    }
+    is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
+    // Remember if any characters required hex escaping.
+    if (is_previous_hex) {
+      print_format = kHexEscape;
+    }
+  }
+  *os << "\"";
+  return print_format;
+}
+
+// Prints a (const) char/wchar_t array of 'len' elements, starting at address
+// 'begin'.  CharType must be either char or wchar_t.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+static void UniversalPrintCharArray(
+    const CharType* begin, size_t len, ostream* os) {
+  // The code
+  //   const char kFoo[] = "foo";
+  // generates an array of 4, not 3, elements, with the last one being '\0'.
+  //
+  // Therefore when printing a char array, we don't print the last element if
+  // it's '\0', such that the output matches the string literal as it's
+  // written in the source code.
+  if (len > 0 && begin[len - 1] == '\0') {
+    PrintCharsAsStringTo(begin, len - 1, os);
+    return;
+  }
+
+  // If, however, the last element in the array is not '\0', e.g.
+  //    const char kFoo[] = { 'f', 'o', 'o' };
+  // we must print the entire array.  We also print a message to indicate
+  // that the array is not NUL-terminated.
+  PrintCharsAsStringTo(begin, len, os);
+  *os << " (no terminating NUL)";
+}
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) wchar_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints the given C string to the ostream.
+void PrintTo(const char* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+  } else {
+    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, strlen(s), os);
+  }
+}
+
+// MSVC compiler can be configured to define whar_t as a typedef
+// of unsigned short. Defining an overload for const wchar_t* in that case
+// would cause pointers to unsigned shorts be printed as wide strings,
+// possibly accessing more memory than intended and causing invalid
+// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when
+// wchar_t is implemented as a native type.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Prints the given wide C string to the ostream.
+void PrintTo(const wchar_t* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+  } else {
+    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, std::wcslen(s), os);
+  }
+}
+#endif  // wchar_t is native
+
+namespace {
+
+bool ContainsUnprintableControlCodes(const char* str, size_t length) {
+  const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+
+  for (size_t i = 0; i < length; i++) {
+    unsigned char ch = *s++;
+    if (std::iscntrl(ch)) {
+        switch (ch) {
+        case '\t':
+        case '\n':
+        case '\r':
+          break;
+        default:
+          return true;
+        }
+      }
+  }
+  return false;
+}
+
+bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; }
+
+bool IsValidUTF8(const char* str, size_t length) {
+  const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+
+  for (size_t i = 0; i < length;) {
+    unsigned char lead = s[i++];
+
+    if (lead <= 0x7f) {
+      continue;  // single-byte character (ASCII) 0..7F
+    }
+    if (lead < 0xc2) {
+      return false;  // trail byte or non-shortest form
+    } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
+      ++i;  // 2-byte character
+    } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
+               IsUTF8TrailByte(s[i]) &&
+               IsUTF8TrailByte(s[i + 1]) &&
+               // check for non-shortest form and surrogate
+               (lead != 0xe0 || s[i] >= 0xa0) &&
+               (lead != 0xed || s[i] < 0xa0)) {
+      i += 2;  // 3-byte character
+    } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
+               IsUTF8TrailByte(s[i]) &&
+               IsUTF8TrailByte(s[i + 1]) &&
+               IsUTF8TrailByte(s[i + 2]) &&
+               // check for non-shortest form
+               (lead != 0xf0 || s[i] >= 0x90) &&
+               (lead != 0xf4 || s[i] < 0x90)) {
+      i += 3;  // 4-byte character
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+void ConditionalPrintAsText(const char* str, size_t length, ostream* os) {
+  if (!ContainsUnprintableControlCodes(str, length) &&
+      IsValidUTF8(str, length)) {
+    *os << "\n    As Text: \"" << str << "\"";
+  }
+}
+
+}  // anonymous namespace
+
+// Prints a ::string object.
+#if GTEST_HAS_GLOBAL_STRING
+void PrintStringTo(const ::string& s, ostream* os) {
+  if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
+    if (GTEST_FLAG(print_utf8)) {
+      ConditionalPrintAsText(s.data(), s.size(), os);
+    }
+  }
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+void PrintStringTo(const ::std::string& s, ostream* os) {
+  if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
+    if (GTEST_FLAG(print_utf8)) {
+      ConditionalPrintAsText(s.data(), s.size(), os);
+    }
+  }
+}
+
+// Prints a ::wstring object.
+#if GTEST_HAS_GLOBAL_WSTRING
+void PrintWideStringTo(const ::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+
+
+namespace testing {
+
+using internal::GetUnitTestImpl;
+
+// Gets the summary of the failure message by omitting the stack trace
+// in it.
+std::string TestPartResult::ExtractSummary(const char* message) {
+  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
+  return stack_trace == NULL ? message :
+      std::string(message, stack_trace);
+}
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
+  return os
+      << result.file_name() << ":" << result.line_number() << ": "
+      << (result.type() == TestPartResult::kSuccess ? "Success" :
+          result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
+          "Non-fatal failure") << ":\n"
+      << result.message() << std::endl;
+}
+
+// Appends a TestPartResult to the array.
+void TestPartResultArray::Append(const TestPartResult& result) {
+  array_.push_back(result);
+}
+
+// Returns the TestPartResult at the given index (0-based).
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+  if (index < 0 || index >= size()) {
+    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
+    internal::posix::Abort();
+  }
+
+  return array_[index];
+}
+
+// Returns the number of TestPartResult objects in the array.
+int TestPartResultArray::size() const {
+  return static_cast<int>(array_.size());
+}
+
+namespace internal {
+
+HasNewFatalFailureHelper::HasNewFatalFailureHelper()
+    : has_new_fatal_failure_(false),
+      original_reporter_(GetUnitTestImpl()->
+                         GetTestPartResultReporterForCurrentThread()) {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
+}
+
+HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
+      original_reporter_);
+}
+
+void HasNewFatalFailureHelper::ReportTestPartResult(
+    const TestPartResult& result) {
+  if (result.fatally_failed())
+    has_new_fatal_failure_ = true;
+  original_reporter_->ReportTestPartResult(result);
+}
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+
+namespace testing {
+namespace internal {
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// Skips to the first non-space char in str. Returns an empty string if str
+// contains only whitespace characters.
+static const char* SkipSpaces(const char* str) {
+  while (IsSpace(*str))
+    str++;
+  return str;
+}
+
+static std::vector<std::string> SplitIntoTestNames(const char* src) {
+  std::vector<std::string> name_vec;
+  src = SkipSpaces(src);
+  for (; src != NULL; src = SkipComma(src)) {
+    name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src)));
+  }
+  return name_vec;
+}
+
+// Verifies that registered_tests match the test names in
+// registered_tests_; returns registered_tests if successful, or
+// aborts the program otherwise.
+const char* TypedTestCasePState::VerifyRegisteredTestNames(
+    const char* file, int line, const char* registered_tests) {
+  typedef RegisteredTestsMap::const_iterator RegisteredTestIter;
+  registered_ = true;
+
+  std::vector<std::string> name_vec = SplitIntoTestNames(registered_tests);
+
+  Message errors;
+
+  std::set<std::string> tests;
+  for (std::vector<std::string>::const_iterator name_it = name_vec.begin();
+       name_it != name_vec.end(); ++name_it) {
+    const std::string& name = *name_it;
+    if (tests.count(name) != 0) {
+      errors << "Test " << name << " is listed more than once.\n";
+      continue;
+    }
+
+    bool found = false;
+    for (RegisteredTestIter it = registered_tests_.begin();
+         it != registered_tests_.end();
+         ++it) {
+      if (name == it->first) {
+        found = true;
+        break;
+      }
+    }
+
+    if (found) {
+      tests.insert(name);
+    } else {
+      errors << "No test named " << name
+             << " can be found in this test case.\n";
+    }
+  }
+
+  for (RegisteredTestIter it = registered_tests_.begin();
+       it != registered_tests_.end();
+       ++it) {
+    if (tests.count(it->first) == 0) {
+      errors << "You forgot to list test " << it->first << ".\n";
+    }
+  }
+
+  const std::string& errors_str = errors.GetString();
+  if (errors_str != "") {
+    fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+            errors_str.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+
+  return registered_tests;
+}
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
diff --git a/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/gtest.h b/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/gtest.h
new file mode 100644
index 000000000..2d82d8e4d
--- /dev/null
+++ b/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/gtest.h
@@ -0,0 +1,22115 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the public API for Google Test.  It should be
+// included by any test program that uses Google Test.
+//
+// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+//
+// Acknowledgment: Google Test borrowed the idea of automatic test
+// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
+// easyUnit framework.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_H_
+
+#include <limits>
+#include <memory>
+#include <ostream>
+#include <vector>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test.  They are subject to change without notice.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Low-level types and utilities for porting Google Test to various
+// platforms.  All macros ending with _ and symbols defined in an
+// internal namespace are subject to change without notice.  Code
+// outside Google Test MUST NOT USE THEM DIRECTLY.  Macros that don't
+// end with _ are part of Google Test's public API and can be used by
+// code outside Google Test.
+//
+// This file is fundamental to Google Test.  All other Google Test source
+// files are expected to #include this.  Therefore, it cannot #include
+// any other Google Test header.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+// Environment-describing macros
+// -----------------------------
+//
+// Google Test can be used in many different environments.  Macros in
+// this section tell Google Test what kind of environment it is being
+// used in, such that Google Test can provide environment-specific
+// features and implementations.
+//
+// Google Test tries to automatically detect the properties of its
+// environment, so users usually don't need to worry about these
+// macros.  However, the automatic detection is not perfect.
+// Sometimes it's necessary for a user to define some of the following
+// macros in the build script to override Google Test's decisions.
+//
+// If the user doesn't define a macro in the list, Google Test will
+// provide a default definition.  After this header is #included, all
+// macros in this list will be defined to either 1 or 0.
+//
+// Notes to maintainers:
+//   - Each macro here is a user-tweakable knob; do not grow the list
+//     lightly.
+//   - Use #if to key off these macros.  Don't use #ifdef or "#if
+//     defined(...)", which will not work as these macros are ALWAYS
+//     defined.
+//
+//   GTEST_HAS_CLONE          - Define it to 1/0 to indicate that clone(2)
+//                              is/isn't available.
+//   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
+//                              are enabled.
+//   GTEST_HAS_GLOBAL_STRING  - Define it to 1/0 to indicate that ::string
+//                              is/isn't available
+//   GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring
+//                              is/isn't available
+//   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
+//                              expressions are/aren't available.
+//   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
+//                              is/isn't available.
+//   GTEST_HAS_RTTI           - Define it to 1/0 to indicate that RTTI is/isn't
+//                              enabled.
+//   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
+//                              std::wstring does/doesn't work (Google Test can
+//                              be used where std::wstring is unavailable).
+//   GTEST_HAS_TR1_TUPLE      - Define it to 1/0 to indicate tr1::tuple
+//                              is/isn't available.
+//   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
+//                              compiler supports Microsoft's "Structured
+//                              Exception Handling".
+//   GTEST_HAS_STREAM_REDIRECTION
+//                            - Define it to 1/0 to indicate whether the
+//                              platform supports I/O stream redirection using
+//                              dup() and dup2().
+//   GTEST_USE_OWN_TR1_TUPLE  - Define it to 1/0 to indicate whether Google
+//                              Test's own tr1 tuple implementation should be
+//                              used.  Unused when the user sets
+//                              GTEST_HAS_TR1_TUPLE to 0.
+//   GTEST_LANG_CXX11         - Define it to 1/0 to indicate that Google Test
+//                              is building in C++11/C++98 mode.
+//   GTEST_LINKED_AS_SHARED_LIBRARY
+//                            - Define to 1 when compiling tests that use
+//                              Google Test as a shared library (known as
+//                              DLL on Windows).
+//   GTEST_CREATE_SHARED_LIBRARY
+//                            - Define to 1 when compiling Google Test itself
+//                              as a shared library.
+//   GTEST_DEFAULT_DEATH_TEST_STYLE
+//                            - The default value of --gtest_death_test_style.
+//                              The legacy default has been "fast" in the open
+//                              source version since 2008. The recommended value
+//                              is "threadsafe", and can be set in
+//                              custom/gtest-port.h.
+
+// Platform-indicating macros
+// --------------------------
+//
+// Macros indicating the platform on which Google Test is being used
+// (a macro is defined to 1 if compiled on the given platform;
+// otherwise UNDEFINED -- it's never defined to 0.).  Google Test
+// defines these macros automatically.  Code outside Google Test MUST
+// NOT define them.
+//
+//   GTEST_OS_AIX      - IBM AIX
+//   GTEST_OS_CYGWIN   - Cygwin
+//   GTEST_OS_FREEBSD  - FreeBSD
+//   GTEST_OS_FUCHSIA  - Fuchsia
+//   GTEST_OS_HPUX     - HP-UX
+//   GTEST_OS_LINUX    - Linux
+//     GTEST_OS_LINUX_ANDROID - Google Android
+//   GTEST_OS_MAC      - Mac OS X
+//     GTEST_OS_IOS    - iOS
+//   GTEST_OS_NACL     - Google Native Client (NaCl)
+//   GTEST_OS_NETBSD   - NetBSD
+//   GTEST_OS_OPENBSD  - OpenBSD
+//   GTEST_OS_QNX      - QNX
+//   GTEST_OS_SOLARIS  - Sun Solaris
+//   GTEST_OS_SYMBIAN  - Symbian
+//   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
+//     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
+//     GTEST_OS_WINDOWS_MINGW    - MinGW
+//     GTEST_OS_WINDOWS_MOBILE   - Windows Mobile
+//     GTEST_OS_WINDOWS_PHONE    - Windows Phone
+//     GTEST_OS_WINDOWS_RT       - Windows Store App/WinRT
+//   GTEST_OS_ZOS      - z/OS
+//
+// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the
+// most stable support.  Since core members of the Google Test project
+// don't have access to other platforms, support for them may be less
+// stable.  If you notice any problems on your platform, please notify
+// googletestframework@googlegroups.com (patches for fixing them are
+// even more welcome!).
+//
+// It is possible that none of the GTEST_OS_* macros are defined.
+
+// Feature-indicating macros
+// -------------------------
+//
+// Macros indicating which Google Test features are available (a macro
+// is defined to 1 if the corresponding feature is supported;
+// otherwise UNDEFINED -- it's never defined to 0.).  Google Test
+// defines these macros automatically.  Code outside Google Test MUST
+// NOT define them.
+//
+// These macros are public so that portable tests can be written.
+// Such tests typically surround code using a feature with an #if
+// which controls that code.  For example:
+//
+// #if GTEST_HAS_DEATH_TEST
+//   EXPECT_DEATH(DoSomethingDeadly());
+// #endif
+//
+//   GTEST_HAS_COMBINE      - the Combine() function (for value-parameterized
+//                            tests)
+//   GTEST_HAS_DEATH_TEST   - death tests
+//   GTEST_HAS_TYPED_TEST   - typed tests
+//   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
+//   GTEST_IS_THREADSAFE    - Google Test is thread-safe.
+//   GOOGLETEST_CM0007 DO NOT DELETE
+//   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
+//                            GTEST_HAS_POSIX_RE (see above) which users can
+//                            define themselves.
+//   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
+//                            the above RE\b(s) are mutually exclusive.
+//   GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
+
+// Misc public macros
+// ------------------
+//
+//   GTEST_FLAG(flag_name)  - references the variable corresponding to
+//                            the given Google Test flag.
+
+// Internal utilities
+// ------------------
+//
+// The following macros and utilities are for Google Test's INTERNAL
+// use only.  Code outside Google Test MUST NOT USE THEM DIRECTLY.
+//
+// Macros for basic C++ coding:
+//   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
+//   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
+//                              variable don't have to be used.
+//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
+//   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
+//   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
+//   GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
+//                                        suppressed (constant conditional).
+//   GTEST_INTENTIONAL_CONST_COND_POP_  - finish code section where MSVC C4127
+//                                        is suppressed.
+//
+// C++11 feature wrappers:
+//
+//   testing::internal::forward - portability wrapper for std::forward.
+//   testing::internal::move  - portability wrapper for std::move.
+//
+// Synchronization:
+//   Mutex, MutexLock, ThreadLocal, GetThreadCount()
+//                            - synchronization primitives.
+//
+// Template meta programming:
+//   is_pointer     - as in TR1; needed on Symbian and IBM XL C/C++ only.
+//   IteratorTraits - partial implementation of std::iterator_traits, which
+//                    is not available in libCstd when compiled with Sun C++.
+//
+// Smart pointers:
+//   scoped_ptr     - as in TR2.
+//
+// Regular expressions:
+//   RE             - a simple regular expression class using the POSIX
+//                    Extended Regular Expression syntax on UNIX-like platforms
+//                    GOOGLETEST_CM0008 DO NOT DELETE
+//                    or a reduced regular exception syntax on other
+//                    platforms, including Windows.
+// Logging:
+//   GTEST_LOG_()   - logs messages at the specified severity level.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+//
+// Stdout and stderr capturing:
+//   CaptureStdout()     - starts capturing stdout.
+//   GetCapturedStdout() - stops capturing stdout and returns the captured
+//                         string.
+//   CaptureStderr()     - starts capturing stderr.
+//   GetCapturedStderr() - stops capturing stderr and returns the captured
+//                         string.
+//
+// Integer types:
+//   TypeWithSize   - maps an integer to a int type.
+//   Int32, UInt32, Int64, UInt64, TimeInMillis
+//                  - integers of known sizes.
+//   BiggestInt     - the biggest signed integer type.
+//
+// Command-line utilities:
+//   GTEST_DECLARE_*()  - declares a flag.
+//   GTEST_DEFINE_*()   - defines a flag.
+//   GetInjectableArgvs() - returns the command line as a vector of strings.
+//
+// Environment variable utilities:
+//   GetEnv()             - gets the value of an environment variable.
+//   BoolFromGTestEnv()   - parses a bool environment variable.
+//   Int32FromGTestEnv()  - parses an Int32 environment variable.
+//   StringFromGTestEnv() - parses a string environment variable.
+
+#include <ctype.h>   // for isspace, etc
+#include <stddef.h>  // for ptrdiff_t
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifndef _WIN32_WCE
+# include <sys/types.h>
+# include <sys/stat.h>
+#endif  // !_WIN32_WCE
+
+#if defined __APPLE__
+# include <AvailabilityMacros.h>
+# include <TargetConditionals.h>
+#endif
+
+// Brings in the definition of HAS_GLOBAL_STRING.  This must be done
+// BEFORE we test HAS_GLOBAL_STRING.
+#include <string>  // NOLINT
+#include <algorithm>  // NOLINT
+#include <iostream>  // NOLINT
+#include <sstream>  // NOLINT
+#include <utility>
+#include <vector>  // NOLINT
+
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the GTEST_OS_* macro.
+// It is separate from gtest-port.h so that custom/gtest-port.h can include it.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+
+// Determines the platform on which Google Test is compiled.
+#ifdef __CYGWIN__
+# define GTEST_OS_CYGWIN 1
+#elif defined __SYMBIAN32__
+# define GTEST_OS_SYMBIAN 1
+#elif defined _WIN32
+# define GTEST_OS_WINDOWS 1
+# ifdef _WIN32_WCE
+#  define GTEST_OS_WINDOWS_MOBILE 1
+# elif defined(__MINGW__) || defined(__MINGW32__)
+#  define GTEST_OS_WINDOWS_MINGW 1
+# elif defined(WINAPI_FAMILY)
+#  include <winapifamily.h>
+#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#   define GTEST_OS_WINDOWS_DESKTOP 1
+#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#   define GTEST_OS_WINDOWS_PHONE 1
+#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#   define GTEST_OS_WINDOWS_RT 1
+#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
+#   define GTEST_OS_WINDOWS_PHONE 1
+#   define GTEST_OS_WINDOWS_TV_TITLE 1
+#  else
+    // WINAPI_FAMILY defined but no known partition matched.
+    // Default to desktop.
+#   define GTEST_OS_WINDOWS_DESKTOP 1
+#  endif
+# else
+#  define GTEST_OS_WINDOWS_DESKTOP 1
+# endif  // _WIN32_WCE
+#elif defined __APPLE__
+# define GTEST_OS_MAC 1
+# if TARGET_OS_IPHONE
+#  define GTEST_OS_IOS 1
+# endif
+#elif defined __FreeBSD__
+# define GTEST_OS_FREEBSD 1
+#elif defined __Fuchsia__
+# define GTEST_OS_FUCHSIA 1
+#elif defined __linux__
+# define GTEST_OS_LINUX 1
+# if defined __ANDROID__
+#  define GTEST_OS_LINUX_ANDROID 1
+# endif
+#elif defined __MVS__
+# define GTEST_OS_ZOS 1
+#elif defined(__sun) && defined(__SVR4)
+# define GTEST_OS_SOLARIS 1
+#elif defined(_AIX)
+# define GTEST_OS_AIX 1
+#elif defined(__hpux)
+# define GTEST_OS_HPUX 1
+#elif defined __native_client__
+# define GTEST_OS_NACL 1
+#elif defined __NetBSD__
+# define GTEST_OS_NETBSD 1
+#elif defined __OpenBSD__
+# define GTEST_OS_OPENBSD 1
+#elif defined __QNX__
+# define GTEST_OS_QNX 1
+#endif  // __CYGWIN__
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+
+#if !defined(GTEST_DEV_EMAIL_)
+# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+# define GTEST_FLAG_PREFIX_ "gtest_"
+# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+# define GTEST_NAME_ "Google Test"
+# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+#endif  // !defined(GTEST_DEV_EMAIL_)
+
+#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
+# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+#endif  // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
+
+// Determines the version of gcc that is used to compile this.
+#ifdef __GNUC__
+// 40302 means version 4.3.2.
+# define GTEST_GCC_VER_ \
+    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#endif  // __GNUC__
+
+// Macros for disabling Microsoft Visual C++ warnings.
+//
+//   GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385)
+//   /* code that triggers warnings C4800 and C4385 */
+//   GTEST_DISABLE_MSC_WARNINGS_POP_()
+#if _MSC_VER >= 1400
+# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+    __pragma(warning(push))                        \
+    __pragma(warning(disable: warnings))
+# define GTEST_DISABLE_MSC_WARNINGS_POP_()          \
+    __pragma(warning(pop))
+#else
+// Older versions of MSVC don't have __pragma.
+# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+# define GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+// Clang on Windows does not understand MSVC's pragma warning.
+// We need clang-specific way to disable function deprecation warning.
+#ifdef __clang__
+# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_()                         \
+    _Pragma("clang diagnostic push")                                  \
+    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
+    _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
+    _Pragma("clang diagnostic pop")
+#else
+# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
+    GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+#ifndef GTEST_LANG_CXX11
+// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when
+// -std={c,gnu}++{0x,11} is passed.  The C++11 standard specifies a
+// value for __cplusplus, and recent versions of clang, gcc, and
+// probably other compilers set that too in C++11 mode.
+# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L || _MSC_VER >= 1900
+// Compiling in at least C++11 mode.
+#  define GTEST_LANG_CXX11 1
+# else
+#  define GTEST_LANG_CXX11 0
+# endif
+#endif
+
+// Distinct from C++11 language support, some environments don't provide
+// proper C++11 library support. Notably, it's possible to build in
+// C++11 mode when targeting Mac OS X 10.6, which has an old libstdc++
+// with no C++11 support.
+//
+// libstdc++ has sufficient C++11 support as of GCC 4.6.0, __GLIBCXX__
+// 20110325, but maintenance releases in the 4.4 and 4.5 series followed
+// this date, so check for those versions by their date stamps.
+// https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning
+#if GTEST_LANG_CXX11 && \
+    (!defined(__GLIBCXX__) || ( \
+        __GLIBCXX__ >= 20110325ul &&  /* GCC >= 4.6.0 */ \
+        /* Blacklist of patch releases of older branches: */ \
+        __GLIBCXX__ != 20110416ul &&  /* GCC 4.4.6 */ \
+        __GLIBCXX__ != 20120313ul &&  /* GCC 4.4.7 */ \
+        __GLIBCXX__ != 20110428ul &&  /* GCC 4.5.3 */ \
+        __GLIBCXX__ != 20120702ul))   /* GCC 4.5.4 */
+# define GTEST_STDLIB_CXX11 1
+#endif
+
+// Only use C++11 library features if the library provides them.
+#if GTEST_STDLIB_CXX11
+# define GTEST_HAS_STD_BEGIN_AND_END_ 1
+# define GTEST_HAS_STD_FORWARD_LIST_ 1
+# if !defined(_MSC_VER) || (_MSC_FULL_VER >= 190023824)
+// works only with VS2015U2 and better
+#   define GTEST_HAS_STD_FUNCTION_ 1
+# endif
+# define GTEST_HAS_STD_INITIALIZER_LIST_ 1
+# define GTEST_HAS_STD_MOVE_ 1
+# define GTEST_HAS_STD_UNIQUE_PTR_ 1
+# define GTEST_HAS_STD_SHARED_PTR_ 1
+# define GTEST_HAS_UNORDERED_MAP_ 1
+# define GTEST_HAS_UNORDERED_SET_ 1
+#endif
+
+// C++11 specifies that <tuple> provides std::tuple.
+// Some platforms still might not have it, however.
+#if GTEST_LANG_CXX11
+# define GTEST_HAS_STD_TUPLE_ 1
+# if defined(__clang__)
+// Inspired by
+// https://clang.llvm.org/docs/LanguageExtensions.html#include-file-checking-macros
+#  if defined(__has_include) && !__has_include(<tuple>)
+#   undef GTEST_HAS_STD_TUPLE_
+#  endif
+# elif defined(_MSC_VER)
+// Inspired by boost/config/stdlib/dinkumware.hpp
+#  if defined(_CPPLIB_VER) && _CPPLIB_VER < 520
+#   undef GTEST_HAS_STD_TUPLE_
+#  endif
+# elif defined(__GLIBCXX__)
+// Inspired by boost/config/stdlib/libstdcpp3.hpp,
+// http://gcc.gnu.org/gcc-4.2/changes.html and
+// https://web.archive.org/web/20140227044429/gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x
+#  if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
+#   undef GTEST_HAS_STD_TUPLE_
+#  endif
+# endif
+#endif
+
+// Brings in definitions for functions used in the testing::internal::posix
+// namespace (read, write, close, chdir, isatty, stat). We do not currently
+// use them on Windows Mobile.
+#if GTEST_OS_WINDOWS
+# if !GTEST_OS_WINDOWS_MOBILE
+#  include <direct.h>
+#  include <io.h>
+# endif
+// In order to avoid having to include <windows.h>, use forward declaration
+#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
+// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
+// separate (equivalent) structs, instead of using typedef
+typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION;
+#else
+// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION.
+// This assumption is verified by
+// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
+typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
+#endif
+#else
+// This assumes that non-Windows OSes provide unistd.h. For OSes where this
+// is not the case, we need to include headers that provide the functions
+// mentioned above.
+# include <unistd.h>
+# include <strings.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_LINUX_ANDROID
+// Used to define __ANDROID_API__ matching the target NDK API level.
+#  include <android/api-level.h>  // NOLINT
+#endif
+
+// Defines this to true iff Google Test can use POSIX regular expressions.
+#ifndef GTEST_HAS_POSIX_RE
+# if GTEST_OS_LINUX_ANDROID
+// On Android, <regex.h> is only available starting with Gingerbread.
+#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+# else
+#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
+# endif
+#endif
+
+#if GTEST_USES_PCRE
+// The appropriate headers have already been included.
+
+#elif GTEST_HAS_POSIX_RE
+
+// On some platforms, <regex.h> needs someone to define size_t, and
+// won't compile otherwise.  We can #include it here as we already
+// included <stdlib.h>, which is guaranteed to define size_t through
+// <stddef.h>.
+# include <regex.h>  // NOLINT
+
+# define GTEST_USES_POSIX_RE 1
+
+#elif GTEST_OS_WINDOWS
+
+// <regex.h> is not available on Windows.  Use our own simple regex
+// implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#else
+
+// <regex.h> may not be available on this platform.  Use our own
+// simple regex implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#endif  // GTEST_USES_PCRE
+
+#ifndef GTEST_HAS_EXCEPTIONS
+// The user didn't tell us whether exceptions are enabled, so we need
+// to figure it out.
+# if defined(_MSC_VER) && defined(_CPPUNWIND)
+// MSVC defines _CPPUNWIND to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__BORLANDC__)
+// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#  ifndef _HAS_EXCEPTIONS
+#   define _HAS_EXCEPTIONS 1
+#  endif  // _HAS_EXCEPTIONS
+#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+# elif defined(__clang__)
+// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714,
+// but iff cleanups are enabled after that. In Obj-C++ files, there can be
+// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions
+// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++
+// exceptions starting at clang r206352, but which checked for cleanups prior to
+// that. To reliably check for C++ exception availability with clang, check for
+// __EXCEPTIONS && __has_feature(cxx_exceptions).
+#  define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+# elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions.  However, there is no compile-time way of
+// detecting whether they are enabled or not.  Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__HP_aCC)
+// Exception handling is in effect by default in HP aCC compiler. It has to
+// be turned of by +noeh compiler option if desired.
+#  define GTEST_HAS_EXCEPTIONS 1
+# else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#  define GTEST_HAS_EXCEPTIONS 0
+# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#if !defined(GTEST_HAS_STD_STRING)
+// Even though we don't use this macro any longer, we keep it in case
+// some clients still depend on it.
+# define GTEST_HAS_STD_STRING 1
+#elif !GTEST_HAS_STD_STRING
+// The user told us that ::std::string isn't available.
+# error "::std::string isn't available."
+#endif  // !defined(GTEST_HAS_STD_STRING)
+
+#ifndef GTEST_HAS_GLOBAL_STRING
+# define GTEST_HAS_GLOBAL_STRING 0
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// FIXME: uses autoconf to detect whether ::std::wstring
+//   is available.
+
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either.  Android has
+// no support for it at least as recent as Froyo (2.2).
+# define GTEST_HAS_STD_WSTRING \
+    (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+#ifndef GTEST_HAS_GLOBAL_WSTRING
+// The user didn't tell us whether ::wstring is available, so we need
+// to figure it out.
+# define GTEST_HAS_GLOBAL_WSTRING \
+    (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+# ifdef _MSC_VER
+
+#  ifdef _CPPRTTI  // MSVC defines this macro iff RTTI is enabled.
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
+
+#  ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Note sure if STL or toolchain bug,
+// so disable RTTI when detected.
+#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
+       !defined(__EXCEPTIONS)
+#    define GTEST_HAS_RTTI 0
+#   else
+#    define GTEST_HAS_RTTI 1
+#   endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif  // __GXX_RTTI
+
+// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
+// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
+// first version with C++ support.
+# elif defined(__clang__)
+
+#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+
+// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
+// both the typeid and dynamic_cast features are present.
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+
+#  ifdef __RTTI_ALL__
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+# else
+
+// For all other compilers, we assume RTTI is enabled.
+#  define GTEST_HAS_RTTI 1
+
+# endif  // _MSC_VER
+
+#endif  // GTEST_HAS_RTTI
+
+// It's this header's responsibility to #include <typeinfo> when RTTI
+// is enabled.
+#if GTEST_HAS_RTTI
+# include <typeinfo>
+#endif
+
+// Determines whether Google Test can use the pthreads library.
+#ifndef GTEST_HAS_PTHREAD
+// The user didn't tell us explicitly, so we make reasonable assumptions about
+// which platforms have pthreads support.
+//
+// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
+// to your compiler flags.
+#define GTEST_HAS_PTHREAD                                             \
+  (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \
+   GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA)
+#endif  // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_PTHREAD
+// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
+// true.
+# include <pthread.h>  // NOLINT
+
+// For timespec and nanosleep, used below.
+# include <time.h>  // NOLINT
+#endif
+
+// Determines if hash_map/hash_set are available.
+// Only used for testing against those containers.
+#if !defined(GTEST_HAS_HASH_MAP_)
+# if defined(_MSC_VER) && (_MSC_VER < 1900)
+#  define GTEST_HAS_HASH_MAP_ 1  // Indicates that hash_map is available.
+#  define GTEST_HAS_HASH_SET_ 1  // Indicates that hash_set is available.
+# endif  // _MSC_VER
+#endif  // !defined(GTEST_HAS_HASH_MAP_)
+
+// Determines whether Google Test can use tr1/tuple.  You can define
+// this macro to 0 to prevent Google Test from using tuple (any
+// feature depending on tuple with be disabled in this mode).
+#ifndef GTEST_HAS_TR1_TUPLE
+# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
+// STLport, provided with the Android NDK, has neither <tr1/tuple> or <tuple>.
+#  define GTEST_HAS_TR1_TUPLE 0
+# elif defined(_MSC_VER) && (_MSC_VER >= 1910)
+// Prevent `warning C4996: 'std::tr1': warning STL4002:
+// The non-Standard std::tr1 namespace and TR1-only machinery
+// are deprecated and will be REMOVED.`
+#  define GTEST_HAS_TR1_TUPLE 0
+# elif GTEST_LANG_CXX11 && defined(_LIBCPP_VERSION)
+// libc++ doesn't support TR1.
+#  define GTEST_HAS_TR1_TUPLE 0
+# else
+// The user didn't tell us not to do it, so we assume it's OK.
+#  define GTEST_HAS_TR1_TUPLE 1
+# endif
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether Google Test's own tr1 tuple implementation
+// should be used.
+#ifndef GTEST_USE_OWN_TR1_TUPLE
+// We use our own tuple implementation on Symbian.
+# if GTEST_OS_SYMBIAN
+#  define GTEST_USE_OWN_TR1_TUPLE 1
+# else
+// The user didn't tell us, so we need to figure it out.
+
+// We use our own TR1 tuple if we aren't sure the user has an
+// implementation of it already.  At this time, libstdc++ 4.0.0+ and
+// MSVC 2010 are the only mainstream standard libraries that come
+// with a TR1 tuple implementation.  NVIDIA's CUDA NVCC compiler
+// pretends to be GCC by defining __GNUC__ and friends, but cannot
+// compile GCC's tuple implementation.  MSVC 2008 (9.0) provides TR1
+// tuple in a 323 MB Feature Pack download, which we cannot assume the
+// user has.  QNX's QCC compiler is a modified GCC but it doesn't
+// support TR1 tuple.  libc++ only provides std::tuple, in C++11 mode,
+// and it can be used with some compilers that define __GNUC__.
+# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
+      && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) \
+      || (_MSC_VER >= 1600 && _MSC_VER < 1900)
+#  define GTEST_ENV_HAS_TR1_TUPLE_ 1
+# endif
+
+// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
+// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
+// can build with clang but need to use gcc4.2's libstdc++).
+# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
+#  define GTEST_ENV_HAS_STD_TUPLE_ 1
+# endif
+
+# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
+#  define GTEST_USE_OWN_TR1_TUPLE 0
+# else
+#  define GTEST_USE_OWN_TR1_TUPLE 1
+# endif
+# endif  // GTEST_OS_SYMBIAN
+#endif  // GTEST_USE_OWN_TR1_TUPLE
+
+// To avoid conditional compilation we make it gtest-port.h's responsibility
+// to #include the header implementing tuple.
+#if GTEST_HAS_STD_TUPLE_
+# include <tuple>  // IWYU pragma: export
+# define GTEST_TUPLE_NAMESPACE_ ::std
+#endif  // GTEST_HAS_STD_TUPLE_
+
+// We include tr1::tuple even if std::tuple is available to define printers for
+// them.
+#if GTEST_HAS_TR1_TUPLE
+# ifndef GTEST_TUPLE_NAMESPACE_
+#  define GTEST_TUPLE_NAMESPACE_ ::std::tr1
+# endif  // GTEST_TUPLE_NAMESPACE_
+
+# if GTEST_USE_OWN_TR1_TUPLE
+// This file was GENERATED by command:
+//     pump.py gtest-tuple.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2009 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+
+#include <utility>  // For ::std::pair.
+
+// The compiler used in Symbian has a bug that prevents us from declaring the
+// tuple template as a friend (it complains that tuple is redefined).  This
+// bypasses the bug by declaring the members that should otherwise be
+// private as public.
+// Sun Studio versions < 12 also have the above bug.
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
+#else
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
+    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
+   private:
+#endif
+
+// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict
+// with our own definitions. Therefore using our own tuple does not work on
+// those compilers.
+#if defined(_MSC_VER) && _MSC_VER >= 1600  /* 1600 is Visual Studio 2010 */
+# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \
+GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers."
+#endif
+
+// GTEST_n_TUPLE_(T) is the type of an n-tuple.
+#define GTEST_0_TUPLE_(T) tuple<>
+#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \
+    void, void, void>
+#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \
+    void, void, void>
+#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
+    void, void, void>
+#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
+    void, void, void>
+#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    void, void, void>
+#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, void, void>
+#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, void>
+#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, T##9>
+
+// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
+#define GTEST_0_TYPENAMES_(T)
+#define GTEST_1_TYPENAMES_(T) typename T##0
+#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
+#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
+#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3
+#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4
+#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5
+#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6
+#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
+#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8
+#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8, typename T##9
+
+// In theory, defining stuff in the ::std namespace is undefined
+// behavior.  We can do this as we are playing the role of a standard
+// library vendor.
+namespace std {
+namespace tr1 {
+
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+    typename T3 = void, typename T4 = void, typename T5 = void,
+    typename T6 = void, typename T7 = void, typename T8 = void,
+    typename T9 = void>
+class tuple;
+
+// Anything in namespace gtest_internal is Google Test's INTERNAL
+// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
+namespace gtest_internal {
+
+// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
+template <typename T>
+struct ByRef { typedef const T& type; };  // NOLINT
+template <typename T>
+struct ByRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for ByRef.
+#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
+
+// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
+// is the same as tr1::add_reference<T>::type.
+template <typename T>
+struct AddRef { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for AddRef.
+#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
+
+// A helper for implementing get<k>().
+template <int k> class Get;
+
+// A helper for implementing tuple_element<k, T>.  kIndexValid is true
+// iff k < the number of fields in tuple type T.
+template <bool kIndexValid, int kIndex, class Tuple>
+struct TupleElement;
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > {
+  typedef T0 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > {
+  typedef T1 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > {
+  typedef T2 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > {
+  typedef T3 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > {
+  typedef T4 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > {
+  typedef T5 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > {
+  typedef T6 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > {
+  typedef T7 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > {
+  typedef T8 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 9, GTEST_10_TUPLE_(T) > {
+  typedef T9 type;
+};
+
+}  // namespace gtest_internal
+
+template <>
+class tuple<> {
+ public:
+  tuple() {}
+  tuple(const tuple& /* t */)  {}
+  tuple& operator=(const tuple& /* t */) { return *this; }
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+class GTEST_1_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}
+
+  tuple(const tuple& t) : f0_(t.f0_) {}
+
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    return *this;
+  }
+
+  T0 f0_;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+class GTEST_2_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
+      f1_(f1) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {}
+  template <typename U0, typename U1>
+  tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_2_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+  template <typename U0, typename U1>
+  tuple& operator=(const ::std::pair<U0, U1>& p) {
+    f0_ = p.first;
+    f1_ = p.second;
+    return *this;
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+class GTEST_3_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_3_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+class GTEST_4_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {}
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_4_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+class GTEST_5_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3,
+      GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_) {}
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_5_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+class GTEST_6_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_) {}
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_6_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+class GTEST_7_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_7_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+class GTEST_8_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6,
+      GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_8_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+class GTEST_9_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7), f8_(f8) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_9_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+class tuple {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(),
+      f9_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {}
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_),
+      f9_(t.f9_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_10_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    f9_ = t.f9_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+  T9 f9_;
+};
+
+// 6.1.3.2 Tuple creation functions.
+
+// Known limitations: we don't support passing an
+// std::tr1::reference_wrapper<T> to make_tuple().  And we don't
+// implement tie().
+
+inline tuple<> make_tuple() { return tuple<>(); }
+
+template <GTEST_1_TYPENAMES_(T)>
+inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) {
+  return GTEST_1_TUPLE_(T)(f0);
+}
+
+template <GTEST_2_TYPENAMES_(T)>
+inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) {
+  return GTEST_2_TUPLE_(T)(f0, f1);
+}
+
+template <GTEST_3_TYPENAMES_(T)>
+inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) {
+  return GTEST_3_TUPLE_(T)(f0, f1, f2);
+}
+
+template <GTEST_4_TYPENAMES_(T)>
+inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3) {
+  return GTEST_4_TUPLE_(T)(f0, f1, f2, f3);
+}
+
+template <GTEST_5_TYPENAMES_(T)>
+inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4) {
+  return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4);
+}
+
+template <GTEST_6_TYPENAMES_(T)>
+inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5) {
+  return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5);
+}
+
+template <GTEST_7_TYPENAMES_(T)>
+inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6) {
+  return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6);
+}
+
+template <GTEST_8_TYPENAMES_(T)>
+inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) {
+  return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7);
+}
+
+template <GTEST_9_TYPENAMES_(T)>
+inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8) {
+  return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8);
+}
+
+template <GTEST_10_TYPENAMES_(T)>
+inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8, const T9& f9) {
+  return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
+}
+
+// 6.1.3.3 Tuple helper classes.
+
+template <typename Tuple> struct tuple_size;
+
+template <GTEST_0_TYPENAMES_(T)>
+struct tuple_size<GTEST_0_TUPLE_(T) > {
+  static const int value = 0;
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+struct tuple_size<GTEST_1_TUPLE_(T) > {
+  static const int value = 1;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+struct tuple_size<GTEST_2_TUPLE_(T) > {
+  static const int value = 2;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+struct tuple_size<GTEST_3_TUPLE_(T) > {
+  static const int value = 3;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+struct tuple_size<GTEST_4_TUPLE_(T) > {
+  static const int value = 4;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+struct tuple_size<GTEST_5_TUPLE_(T) > {
+  static const int value = 5;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+struct tuple_size<GTEST_6_TUPLE_(T) > {
+  static const int value = 6;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+struct tuple_size<GTEST_7_TUPLE_(T) > {
+  static const int value = 7;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+struct tuple_size<GTEST_8_TUPLE_(T) > {
+  static const int value = 8;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+struct tuple_size<GTEST_9_TUPLE_(T) > {
+  static const int value = 9;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct tuple_size<GTEST_10_TUPLE_(T) > {
+  static const int value = 10;
+};
+
+template <int k, class Tuple>
+struct tuple_element {
+  typedef typename gtest_internal::TupleElement<
+      k < (tuple_size<Tuple>::value), k, Tuple>::type type;
+};
+
+#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
+
+// 6.1.3.4 Element access.
+
+namespace gtest_internal {
+
+template <>
+class Get<0> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  Field(Tuple& t) { return t.f0_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  ConstField(const Tuple& t) { return t.f0_; }
+};
+
+template <>
+class Get<1> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  Field(Tuple& t) { return t.f1_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  ConstField(const Tuple& t) { return t.f1_; }
+};
+
+template <>
+class Get<2> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  Field(Tuple& t) { return t.f2_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  ConstField(const Tuple& t) { return t.f2_; }
+};
+
+template <>
+class Get<3> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  Field(Tuple& t) { return t.f3_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  ConstField(const Tuple& t) { return t.f3_; }
+};
+
+template <>
+class Get<4> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  Field(Tuple& t) { return t.f4_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  ConstField(const Tuple& t) { return t.f4_; }
+};
+
+template <>
+class Get<5> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  Field(Tuple& t) { return t.f5_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  ConstField(const Tuple& t) { return t.f5_; }
+};
+
+template <>
+class Get<6> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  Field(Tuple& t) { return t.f6_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  ConstField(const Tuple& t) { return t.f6_; }
+};
+
+template <>
+class Get<7> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  Field(Tuple& t) { return t.f7_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  ConstField(const Tuple& t) { return t.f7_; }
+};
+
+template <>
+class Get<8> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  Field(Tuple& t) { return t.f8_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  ConstField(const Tuple& t) { return t.f8_; }
+};
+
+template <>
+class Get<9> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  Field(Tuple& t) { return t.f9_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  ConstField(const Tuple& t) { return t.f9_; }
+};
+
+}  // namespace gtest_internal
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
+get(GTEST_10_TUPLE_(T)& t) {
+  return gtest_internal::Get<k>::Field(t);
+}
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k,  GTEST_10_TUPLE_(T)))
+get(const GTEST_10_TUPLE_(T)& t) {
+  return gtest_internal::Get<k>::ConstField(t);
+}
+
+// 6.1.3.5 Relational operators
+
+// We only implement == and !=, as we don't have a need for the rest yet.
+
+namespace gtest_internal {
+
+// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the
+// first k fields of t1 equals the first k fields of t2.
+// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if
+// k1 != k2.
+template <int kSize1, int kSize2>
+struct SameSizeTuplePrefixComparator;
+
+template <>
+struct SameSizeTuplePrefixComparator<0, 0> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
+    return true;
+  }
+};
+
+template <int k>
+struct SameSizeTuplePrefixComparator<k, k> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& t1, const Tuple2& t2) {
+    return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
+        ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
+  }
+};
+
+}  // namespace gtest_internal
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator==(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) {
+  return gtest_internal::SameSizeTuplePrefixComparator<
+      tuple_size<GTEST_10_TUPLE_(T) >::value,
+      tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u);
+}
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator!=(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) { return !(t == u); }
+
+// 6.1.4 Pairs.
+// Unimplemented.
+
+}  // namespace tr1
+}  // namespace std
+
+#undef GTEST_0_TUPLE_
+#undef GTEST_1_TUPLE_
+#undef GTEST_2_TUPLE_
+#undef GTEST_3_TUPLE_
+#undef GTEST_4_TUPLE_
+#undef GTEST_5_TUPLE_
+#undef GTEST_6_TUPLE_
+#undef GTEST_7_TUPLE_
+#undef GTEST_8_TUPLE_
+#undef GTEST_9_TUPLE_
+#undef GTEST_10_TUPLE_
+
+#undef GTEST_0_TYPENAMES_
+#undef GTEST_1_TYPENAMES_
+#undef GTEST_2_TYPENAMES_
+#undef GTEST_3_TYPENAMES_
+#undef GTEST_4_TYPENAMES_
+#undef GTEST_5_TYPENAMES_
+#undef GTEST_6_TYPENAMES_
+#undef GTEST_7_TYPENAMES_
+#undef GTEST_8_TYPENAMES_
+#undef GTEST_9_TYPENAMES_
+#undef GTEST_10_TYPENAMES_
+
+#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
+#undef GTEST_BY_REF_
+#undef GTEST_ADD_REF_
+#undef GTEST_TUPLE_ELEMENT_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+# elif GTEST_OS_SYMBIAN
+
+// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
+// use STLport's tuple implementation, which unfortunately doesn't
+// work as the copy of STLport distributed with Symbian is incomplete.
+// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
+// use its own tuple implementation.
+#  ifdef BOOST_HAS_TR1_TUPLE
+#   undef BOOST_HAS_TR1_TUPLE
+#  endif  // BOOST_HAS_TR1_TUPLE
+
+// This prevents <boost/tr1/detail/config.hpp>, which defines
+// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
+#  define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
+#  include <tuple>  // IWYU pragma: export  // NOLINT
+
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
+// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header.  This does
+// not conform to the TR1 spec, which requires the header to be <tuple>.
+
+#  if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
+// which is #included by <tr1/tuple>, to not compile when RTTI is
+// disabled.  _TR1_FUNCTIONAL is the header guard for
+// <tr1/functional>.  Hence the following #define is used to prevent
+// <tr1/functional> from being included.
+#   define _TR1_FUNCTIONAL 1
+#   include <tr1/tuple>
+#   undef _TR1_FUNCTIONAL  // Allows the user to #include
+                        // <tr1/functional> if they choose to.
+#  else
+#   include <tr1/tuple>  // NOLINT
+#  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+
+// VS 2010 now has tr1 support.
+# elif _MSC_VER >= 1600
+#  include <tuple>  // IWYU pragma: export  // NOLINT
+
+# else  // GTEST_USE_OWN_TR1_TUPLE
+#  include <tr1/tuple>  // IWYU pragma: export  // NOLINT
+# endif  // GTEST_USE_OWN_TR1_TUPLE
+
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// The user didn't tell us, so we need to figure it out.
+
+# if GTEST_OS_LINUX && !defined(__ia64__)
+#  if GTEST_OS_LINUX_ANDROID
+// On Android, clone() became available at different API levels for each 32-bit
+// architecture.
+#    if defined(__LP64__) || \
+        (defined(__arm__) && __ANDROID_API__ >= 9) || \
+        (defined(__mips__) && __ANDROID_API__ >= 12) || \
+        (defined(__i386__) && __ANDROID_API__ >= 17)
+#     define GTEST_HAS_CLONE 1
+#    else
+#     define GTEST_HAS_CLONE 0
+#    endif
+#  else
+#   define GTEST_HAS_CLONE 1
+#  endif
+# else
+#  define GTEST_HAS_CLONE 0
+# endif  // GTEST_OS_LINUX && !defined(__ia64__)
+
+#endif  // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// By default, we assume that stream redirection is supported on all
+// platforms except known mobile ones.
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || \
+    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+#  define GTEST_HAS_STREAM_REDIRECTION 0
+# else
+#  define GTEST_HAS_STREAM_REDIRECTION 1
+# endif  // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// Google Test does not support death tests for VC 7.1 and earlier as
+// abort() in a VC 7.1 application compiled as GUI in debug config
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS ||   \
+     (GTEST_OS_MAC && !GTEST_OS_IOS) ||                         \
+     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) ||          \
+     GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
+     GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD || \
+     GTEST_OS_NETBSD || GTEST_OS_FUCHSIA)
+# define GTEST_HAS_DEATH_TEST 1
+#endif
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
+    defined(__IBMCPP__) || defined(__HP_aCC)
+# define GTEST_HAS_TYPED_TEST 1
+# define GTEST_HAS_TYPED_TEST_P 1
+#endif
+
+// Determines whether to support Combine(). This only makes sense when
+// value-parameterized tests are enabled.  The implementation doesn't
+// work on Sun Studio since it doesn't understand templated conversion
+// operators.
+#if (GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_) && !defined(__SUNPRO_CC)
+# define GTEST_HAS_COMBINE 1
+#endif
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+    (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
+
+// Determines whether test results can be streamed to a socket.
+#if GTEST_OS_LINUX
+# define GTEST_CAN_STREAM_RESULTS_ 1
+#endif
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding.  This leads to problems with code like:
+//
+//   if (gate)
+//     ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#ifdef __INTEL_COMPILER
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
+#endif
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used.  This is useful when all interesting logic happens inside the
+// c'tor and / or d'tor.  Example:
+//
+//   struct Foo {
+//     Foo() { ... }
+//   } GTEST_ATTRIBUTE_UNUSED_;
+//
+// Also use it after a variable or parameter declaration to tell the
+// compiler the variable/parameter does not have to be used.
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#elif defined(__clang__)
+# if __has_attribute(unused)
+#  define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+# endif
+#endif
+#ifndef GTEST_ATTRIBUTE_UNUSED_
+# define GTEST_ATTRIBUTE_UNUSED_
+#endif
+
+#if GTEST_LANG_CXX11
+# define GTEST_CXX11_EQUALS_DELETE_ = delete
+#else  // GTEST_LANG_CXX11
+# define GTEST_CXX11_EQUALS_DELETE_
+#endif  // GTEST_LANG_CXX11
+
+// Use this annotation before a function that takes a printf format string.
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
+# if defined(__MINGW_PRINTF_FORMAT)
+// MinGW has two different printf implementations. Ensure the format macro
+// matches the selected implementation. See
+// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
+#  define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+       __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \
+                                 first_to_check)))
+# else
+#  define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+       __attribute__((__format__(__printf__, string_index, first_to_check)))
+# endif
+#else
+# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
+#endif
+
+
+// A macro to disallow operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_ASSIGN_(type) \
+  void operator=(type const &) GTEST_CXX11_EQUALS_DELETE_
+
+// A macro to disallow copy constructor and operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \
+  type(type const &) GTEST_CXX11_EQUALS_DELETE_; \
+  GTEST_DISALLOW_ASSIGN_(type)
+
+// Tell the compiler to warn about unused return values for functions declared
+// with this macro.  The macro should be used on function declarations
+// following the argument list:
+//
+//   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
+#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC)
+# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#else
+# define GTEST_MUST_USE_RESULT_
+#endif  // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC
+
+// MS C++ compiler emits warning when a conditional expression is compile time
+// constant. In some contexts this warning is false positive and needs to be
+// suppressed. Use the following two macros in such cases:
+//
+// GTEST_INTENTIONAL_CONST_COND_PUSH_()
+// while (true) {
+// GTEST_INTENTIONAL_CONST_COND_POP_()
+// }
+# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+# define GTEST_INTENTIONAL_CONST_COND_POP_() \
+    GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+// Determine whether the compiler supports Microsoft's Structured Exception
+// Handling.  This is supported by several Windows compilers but generally
+// does not exist on any other system.
+#ifndef GTEST_HAS_SEH
+// The user didn't tell us, so we need to figure it out.
+
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// These two compilers are known to support SEH.
+#  define GTEST_HAS_SEH 1
+# else
+// Assume no SEH.
+#  define GTEST_HAS_SEH 0
+# endif
+
+#define GTEST_IS_THREADSAFE \
+    (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ \
+     || (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) \
+     || GTEST_HAS_PTHREAD)
+
+#endif  // GTEST_HAS_SEH
+
+// GTEST_API_ qualifies all symbols that must be exported. The definitions below
+// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in
+// gtest/internal/custom/gtest-port.h
+#ifndef GTEST_API_
+
+#ifdef _MSC_VER
+# if GTEST_LINKED_AS_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllimport)
+# elif GTEST_CREATE_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllexport)
+# endif
+#elif __GNUC__ >= 4 || defined(__clang__)
+# define GTEST_API_ __attribute__((visibility ("default")))
+#endif  // _MSC_VER
+
+#endif  // GTEST_API_
+
+#ifndef GTEST_API_
+# define GTEST_API_
+#endif  // GTEST_API_
+
+#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
+# define GTEST_DEFAULT_DEATH_TEST_STYLE  "fast"
+#endif  // GTEST_DEFAULT_DEATH_TEST_STYLE
+
+#ifdef __GNUC__
+// Ask the compiler to never inline a given function.
+# define GTEST_NO_INLINE_ __attribute__((noinline))
+#else
+# define GTEST_NO_INLINE_
+#endif
+
+// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
+#if !defined(GTEST_HAS_CXXABI_H_)
+# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#  define GTEST_HAS_CXXABI_H_ 1
+# else
+#  define GTEST_HAS_CXXABI_H_ 0
+# endif
+#endif
+
+// A function level attribute to disable checking for use of uninitialized
+// memory when built with MemorySanitizer.
+#if defined(__clang__)
+# if __has_feature(memory_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
+       __attribute__((no_sanitize_memory))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+# endif  // __has_feature(memory_sanitizer)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif  // __clang__
+
+// A function level attribute to disable AddressSanitizer instrumentation.
+#if defined(__clang__)
+# if __has_feature(address_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+       __attribute__((no_sanitize_address))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+# endif  // __has_feature(address_sanitizer)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif  // __clang__
+
+// A function level attribute to disable ThreadSanitizer instrumentation.
+#if defined(__clang__)
+# if __has_feature(thread_sanitizer)
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
+       __attribute__((no_sanitize_thread))
+# else
+#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+# endif  // __has_feature(thread_sanitizer)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif  // __clang__
+
+namespace testing {
+
+class Message;
+
+#if defined(GTEST_TUPLE_NAMESPACE_)
+// Import tuple and friends into the ::testing namespace.
+// It is part of our interface, having them in ::testing allows us to change
+// their types as needed.
+using GTEST_TUPLE_NAMESPACE_::get;
+using GTEST_TUPLE_NAMESPACE_::make_tuple;
+using GTEST_TUPLE_NAMESPACE_::tuple;
+using GTEST_TUPLE_NAMESPACE_::tuple_size;
+using GTEST_TUPLE_NAMESPACE_::tuple_element;
+#endif  // defined(GTEST_TUPLE_NAMESPACE_)
+
+namespace internal {
+
+// A secret type that Google Test users don't know about.  It has no
+// definition on purpose.  Therefore it's impossible to create a
+// Secret object, which is what we want.
+class Secret;
+
+// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
+//                         names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+#if GTEST_LANG_CXX11
+# define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
+#else  // !GTEST_LANG_CXX11
+template <bool>
+  struct CompileAssert {
+};
+
+# define GTEST_COMPILE_ASSERT_(expr, msg) \
+  typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
+      msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
+#endif  // !GTEST_LANG_CXX11
+
+// Implementation details of GTEST_COMPILE_ASSERT_:
+//
+// (In C++11, we simply use static_assert instead of the following)
+//
+// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//    #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is
+//                                      // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensures that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outter parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     GTEST_COMPILE_ASSERT_(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
+//
+// This template is declared, but intentionally undefined.
+template <typename T1, typename T2>
+struct StaticAssertTypeEqHelper;
+
+template <typename T>
+struct StaticAssertTypeEqHelper<T, T> {
+  enum { value = true };
+};
+
+// Same as std::is_same<>.
+template <typename T, typename U>
+struct IsSame {
+  enum { value = false };
+};
+template <typename T>
+struct IsSame<T, T> {
+  enum { value = true };
+};
+
+// Evaluates to the number of elements in 'array'.
+#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0]))
+
+#if GTEST_HAS_GLOBAL_STRING
+typedef ::string string;
+#else
+typedef ::std::string string;
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+typedef ::wstring wstring;
+#elif GTEST_HAS_STD_WSTRING
+typedef ::std::wstring wstring;
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// A helper for suppressing warnings on constant condition.  It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines scoped_ptr.
+
+// RocksDB: use unique_ptr to work around some clang-analyze false reports
+template <typename T>
+using scoped_ptr = std::unique_ptr<T>;
+/*
+// This implementation of scoped_ptr is PARTIAL - it only contains
+// enough stuff to satisfy Google Test's need.
+template <typename T>
+class scoped_ptr {
+ public:
+  typedef T element_type;
+
+  explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
+  ~scoped_ptr() { reset(); }
+
+  T& operator*() const { return *ptr_; }
+  T* operator->() const { return ptr_; }
+  T* get() const { return ptr_; }
+
+  T* release() {
+    T* const ptr = ptr_;
+    ptr_ = NULL;
+    return ptr;
+  }
+
+  void reset(T* p = NULL) {
+    if (p != ptr_) {
+      if (IsTrue(sizeof(T) > 0)) {  // Makes sure T is a complete type.
+        delete ptr_;
+      }
+      ptr_ = p;
+    }
+  }
+
+  friend void swap(scoped_ptr& a, scoped_ptr& b) {
+    using std::swap;
+    swap(a.ptr_, b.ptr_);
+  }
+
+ private:
+  T* ptr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
+};
+*/
+
+// Defines RE.
+
+#if GTEST_USES_PCRE
+// if used, PCRE is injected by custom/gtest-port.h
+#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
+
+// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+  // A copy constructor is required by the Standard to initialize object
+  // references from r-values.
+  RE(const RE& other) { Init(other.pattern()); }
+
+  // Constructs an RE from a string.
+  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+# if GTEST_HAS_GLOBAL_STRING
+
+  RE(const ::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+# endif  // GTEST_HAS_GLOBAL_STRING
+
+  RE(const char* regex) { Init(regex); }  // NOLINT
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char* pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true iff regular expression re matches
+  // the entire str.
+  // PartialMatch(str, re) returns true iff regular expression re
+  // matches a substring of str (including str itself).
+  //
+  // FIXME: make FullMatch() and PartialMatch() work
+  // when str contains NUL characters.
+  static bool FullMatch(const ::std::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+# if GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const ::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+# endif  // GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const char* str, const RE& re);
+  static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+  void Init(const char* regex);
+
+  // We use a const char* instead of an std::string, as Google Test used to be
+  // used where std::string is not available.  FIXME: change to
+  // std::string.
+  const char* pattern_;
+  bool is_valid_;
+
+# if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+# else  // GTEST_USES_SIMPLE_RE
+
+  const char* full_pattern_;  // For FullMatch();
+
+# endif
+
+  GTEST_DISALLOW_ASSIGN_(RE);
+};
+
+#endif  // GTEST_USES_PCRE
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity {
+  GTEST_INFO,
+  GTEST_WARNING,
+  GTEST_ERROR,
+  GTEST_FATAL
+};
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  ::std::ostream& GetStream() { return ::std::cerr; }
+
+ private:
+  const GTestLogSeverity severity_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+#if !defined(GTEST_LOG_)
+
+# define GTEST_LOG_(severity) \
+    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                  __FILE__, __LINE__).GetStream()
+
+inline void LogToStderr() {}
+inline void FlushInfoLog() { fflush(NULL); }
+
+#endif  // !defined(GTEST_LOG_)
+
+#if !defined(GTEST_CHECK_)
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+//  Synopsys:
+//    GTEST_CHECK_(boolean_condition);
+//     or
+//    GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+//    This checks the condition and if the condition is not satisfied
+//    it prints message about the condition violation, including the
+//    condition itself, plus additional message streamed into it, if any,
+//    and then it aborts the program. It aborts the program irrespective of
+//    whether it is built in the debug mode or not.
+# define GTEST_CHECK_(condition) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::IsTrue(condition)) \
+      ; \
+    else \
+      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#endif  // !defined(GTEST_CHECK_)
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success).  Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call)) \
+    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
+                      << gtest_error
+
+// Adds reference to a type if it is not a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::add_reference, which is not widely available yet.
+template <typename T>
+struct AddReference { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddReference<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper around AddReference that works when the argument T
+// depends on template parameters.
+#define GTEST_ADD_REFERENCE_(T) \
+    typename ::testing::internal::AddReference<T>::type
+
+// Transforms "T" into "const T&" according to standard reference collapsing
+// rules (this is only needed as a backport for C++98 compilers that do not
+// support reference collapsing). Specifically, it transforms:
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> char&
+//   const char&  ==> const char&
+//
+// Note that the non-const reference will not have "const" added. This is
+// standard, and necessary so that "T" can always bind to "const T&".
+template <typename T>
+struct ConstRef { typedef const T& type; };
+template <typename T>
+struct ConstRef<T&> { typedef T& type; };
+
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+  typename ::testing::internal::ConstRef<T>::type
+
+#if GTEST_HAS_STD_MOVE_
+using std::forward;
+using std::move;
+
+template <typename T>
+struct RvalueRef {
+  typedef T&& type;
+};
+#else  // GTEST_HAS_STD_MOVE_
+template <typename T>
+const T& move(const T& t) {
+  return t;
+}
+template <typename T>
+GTEST_ADD_REFERENCE_(T) forward(GTEST_ADD_REFERENCE_(T) t) { return t; }
+
+template <typename T>
+struct RvalueRef {
+  typedef const T& type;
+};
+#endif  // GTEST_HAS_STD_MOVE_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*).  When you use ImplicitCast_, the compiler checks that
+// the cast is safe.  Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertable to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To>
+inline To ImplicitCast_(To x) { return x; }
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed.  When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo?  It
+// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
+// when you downcast, you should use this macro.  In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not).  In normal mode, we do the efficient static_cast<>
+// instead.  Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (eg code like this:
+//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {  // so we only accept pointers
+  // Ensures that To is a sub-type of From *.  This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (false) {
+  GTEST_INTENTIONAL_CONST_COND_POP_()
+    const To to = NULL;
+    ::testing::internal::ImplicitCast_<From*>(to);
+  }
+
+#if GTEST_HAS_RTTI
+  // RTTI: debug mode only!
+  GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));
+#endif
+
+#if GTEST_HAS_DOWNCAST_
+  return ::down_cast<Derived*>(base);
+#elif GTEST_HAS_RTTI
+  return dynamic_cast<Derived*>(base);  // NOLINT
+#else
+  return static_cast<Derived*>(base);  // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stderr capturer:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+// Returns the size (in bytes) of a file.
+GTEST_API_ size_t GetFileSize(FILE* file);
+
+// Reads the entire content of a file as a string.
+GTEST_API_ std::string ReadEntireFile(FILE* file);
+
+// All command line arguments.
+GTEST_API_ std::vector<std::string> GetArgvs();
+
+#if GTEST_HAS_DEATH_TEST
+
+std::vector<std::string> GetInjectableArgvs();
+// Deprecated: pass the args vector by value instead.
+void SetInjectableArgvs(const std::vector<std::string>* new_argvs);
+void SetInjectableArgvs(const std::vector<std::string>& new_argvs);
+#if GTEST_HAS_GLOBAL_STRING
+void SetInjectableArgvs(const std::vector< ::string>& new_argvs);
+#endif  // GTEST_HAS_GLOBAL_STRING
+void ClearInjectableArgvs();
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+#if GTEST_IS_THREADSAFE
+# if GTEST_HAS_PTHREAD
+// Sleeps for (roughly) n milliseconds.  This function is only for testing
+// Google Test's own constructs.  Don't use it in user tests, either
+// directly or indirectly.
+inline void SleepMilliseconds(int n) {
+  const timespec time = {
+    0,                  // 0 seconds.
+    n * 1000L * 1000L,  // And n ms.
+  };
+  nanosleep(&time, NULL);
+}
+# endif  // GTEST_HAS_PTHREAD
+
+# if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+# elif GTEST_HAS_PTHREAD
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class Notification {
+ public:
+  Notification() : notified_(false) {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+  }
+  ~Notification() {
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  // Notifies all threads created with this notification to start. Must
+  // be called from the controller thread.
+  void Notify() {
+    pthread_mutex_lock(&mutex_);
+    notified_ = true;
+    pthread_mutex_unlock(&mutex_);
+  }
+
+  // Blocks until the controller thread notifies. Must be called from a test
+  // thread.
+  void WaitForNotification() {
+    for (;;) {
+      pthread_mutex_lock(&mutex_);
+      const bool notified = notified_;
+      pthread_mutex_unlock(&mutex_);
+      if (notified)
+        break;
+      SleepMilliseconds(10);
+    }
+  }
+
+ private:
+  pthread_mutex_t mutex_;
+  bool notified_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+GTEST_API_ void SleepMilliseconds(int n);
+
+// Provides leak-safe Windows kernel handle ownership.
+// Used in death tests and in threading support.
+class GTEST_API_ AutoHandle {
+ public:
+  // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to
+  // avoid including <windows.h> in this header file. Including <windows.h> is
+  // undesirable because it defines a lot of symbols and macros that tend to
+  // conflict with client code. This assumption is verified by
+  // WindowsTypesTest.HANDLEIsVoidStar.
+  typedef void* Handle;
+  AutoHandle();
+  explicit AutoHandle(Handle handle);
+
+  ~AutoHandle();
+
+  Handle Get() const;
+  void Reset();
+  void Reset(Handle handle);
+
+ private:
+  // Returns true iff the handle is a valid handle object that can be closed.
+  bool IsCloseable() const;
+
+  Handle handle_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class GTEST_API_ Notification {
+ public:
+  Notification();
+  void Notify();
+  void WaitForNotification();
+
+ private:
+  AutoHandle event_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+# endif  // GTEST_HAS_NOTIFICATION_
+
+// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
+// defined, but we don't want to use MinGW's pthreads implementation, which
+// has conformance problems with some versions of the POSIX standard.
+# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+  virtual ~ThreadWithParamBase() {}
+  virtual void Run() = 0;
+};
+
+// pthread_create() accepts a pointer to a function type with the C linkage.
+// According to the Standard (7.5/1), function types with different linkages
+// are different even if they are otherwise identical.  Some compilers (for
+// example, SunStudio) treat them as different types.  Since class methods
+// cannot be defined with C-linkage we need to define a free C-function to
+// pass into pthread_create().
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+  static_cast<ThreadWithParamBase*>(thread)->Run();
+  return NULL;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+//   void ThreadFunc(int param) { /* Do things with param */ }
+//   Notification thread_can_start;
+//   ...
+//   // The thread_can_start parameter is optional; you can supply NULL.
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+//   thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs. Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void UserThreadFunc(T);
+
+  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+      : func_(func),
+        param_(param),
+        thread_can_start_(thread_can_start),
+        finished_(false) {
+    ThreadWithParamBase* const base = this;
+    // The thread can be created only after all fields except thread_
+    // have been initialized.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
+  }
+  ~ThreadWithParam() { Join(); }
+
+  void Join() {
+    if (!finished_) {
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
+      finished_ = true;
+    }
+  }
+
+  virtual void Run() {
+    if (thread_can_start_ != NULL)
+      thread_can_start_->WaitForNotification();
+    func_(param_);
+  }
+
+ private:
+  UserThreadFunc* const func_;  // User-supplied thread function.
+  const T param_;  // User-supplied parameter to the thread function.
+  // When non-NULL, used to block execution until the controller thread
+  // notifies.
+  Notification* const thread_can_start_;
+  bool finished_;  // true iff we know that the thread function has finished.
+  pthread_t thread_;  // The native thread object.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+# endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
+         // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+
+# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+// Mutex and ThreadLocal have already been imported into the namespace.
+// Nothing to do here.
+
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+// Mutex implements mutex on Windows platforms.  It is used in conjunction
+// with class MutexLock:
+//
+//   Mutex mutex;
+//   ...
+//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the
+//                            // end of the current scope.
+//
+// A static Mutex *must* be defined or declared using one of the following
+// macros:
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// (A non-static Mutex is defined/declared in the usual way).
+class GTEST_API_ Mutex {
+ public:
+  enum MutexType { kStatic = 0, kDynamic = 1 };
+  // We rely on kStaticMutex being 0 as it is to what the linker initializes
+  // type_ in static mutexes.  critical_section_ will be initialized lazily
+  // in ThreadSafeLazyInit().
+  enum StaticConstructorSelector { kStaticMutex = 0 };
+
+  // This constructor intentionally does nothing.  It relies on type_ being
+  // statically initialized to 0 (effectively setting it to kStatic) and on
+  // ThreadSafeLazyInit() to lazily initialize the rest of the members.
+  explicit Mutex(StaticConstructorSelector /*dummy*/) {}
+
+  Mutex();
+  ~Mutex();
+
+  void Lock();
+
+  void Unlock();
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld();
+
+ private:
+  // Initializes owner_thread_id_ and critical_section_ in static mutexes.
+  void ThreadSafeLazyInit();
+
+  // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503,
+  // we assume that 0 is an invalid value for thread IDs.
+  unsigned int owner_thread_id_;
+
+  // For static mutexes, we rely on these members being initialized to zeros
+  // by the linker.
+  MutexType type_;
+  long critical_section_init_phase_;  // NOLINT
+  GTEST_CRITICAL_SECTION* critical_section_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+    extern ::testing::internal::Mutex mutex
+
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+    ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)".  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(Mutex* mutex)
+      : mutex_(mutex) { mutex_->Lock(); }
+
+  ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+  Mutex* const mutex_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Base class for ValueHolder<T>.  Allows a caller to hold and delete a value
+// without knowing its type.
+class ThreadLocalValueHolderBase {
+ public:
+  virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Provides a way for a thread to send notifications to a ThreadLocal
+// regardless of its parameter type.
+class ThreadLocalBase {
+ public:
+  // Creates a new ValueHolder<T> object holding a default value passed to
+  // this ThreadLocal<T>'s constructor and returns it.  It is the caller's
+  // responsibility not to call this when the ThreadLocal<T> instance already
+  // has a value on the current thread.
+  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0;
+
+ protected:
+  ThreadLocalBase() {}
+  virtual ~ThreadLocalBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase);
+};
+
+// Maps a thread to a set of ThreadLocals that have values instantiated on that
+// thread and notifies them when the thread exits.  A ThreadLocal instance is
+// expected to persist until all threads it has values on have terminated.
+class GTEST_API_ ThreadLocalRegistry {
+ public:
+  // Registers thread_local_instance as having value on the current thread.
+  // Returns a value that can be used to identify the thread from other threads.
+  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance);
+
+  // Invoked when a ThreadLocal instance is destroyed.
+  static void OnThreadLocalDestroyed(
+      const ThreadLocalBase* thread_local_instance);
+};
+
+class GTEST_API_ ThreadWithParamBase {
+ public:
+  void Join();
+
+ protected:
+  class Runnable {
+   public:
+    virtual ~Runnable() {}
+    virtual void Run() = 0;
+  };
+
+  ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
+  virtual ~ThreadWithParamBase();
+
+ private:
+  AutoHandle thread_;
+};
+
+// Helper class for testing Google Test's multi-threading constructs.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void UserThreadFunc(T);
+
+  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
+  }
+  virtual ~ThreadWithParam() {}
+
+ private:
+  class RunnableImpl : public Runnable {
+   public:
+    RunnableImpl(UserThreadFunc* func, T param)
+        : func_(func),
+          param_(param) {
+    }
+    virtual ~RunnableImpl() {}
+    virtual void Run() {
+      func_(param_);
+    }
+
+   private:
+    UserThreadFunc* const func_;
+    const T param_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
+  };
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+
+// Implements thread-local storage on Windows systems.
+//
+//   // Thread 1
+//   ThreadLocal<int> tl(100);  // 100 is the default value for each thread.
+//
+//   // Thread 2
+//   tl.set(150);  // Changes the value for thread 2 only.
+//   EXPECT_EQ(150, tl.get());
+//
+//   // Thread 1
+//   EXPECT_EQ(100, tl.get());  // In thread 1, tl has the original value.
+//   tl.set(200);
+//   EXPECT_EQ(200, tl.get());
+//
+// The template type argument T must have a public copy constructor.
+// In addition, the default ThreadLocal constructor requires T to have
+// a public default constructor.
+//
+// The users of a TheadLocal instance have to make sure that all but one
+// threads (including the main one) using that instance have exited before
+// destroying it. Otherwise, the per-thread objects managed for them by the
+// ThreadLocal instance are not guaranteed to be destroyed on all platforms.
+//
+// Google Test only uses global ThreadLocal objects.  That means they
+// will die after main() has returned.  Therefore, no per-thread
+// object managed by Google Test will be leaked as long as all threads
+// using Google Test have exited when main() returns.
+template <typename T>
+class ThreadLocal : public ThreadLocalBase {
+ public:
+  ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {}
+  explicit ThreadLocal(const T& value)
+      : default_factory_(new InstanceValueHolderFactory(value)) {}
+
+  ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
+
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
+
+ private:
+  // Holds a value of T.  Can be deleted via its base class without the caller
+  // knowing the type of T.
+  class ValueHolder : public ThreadLocalValueHolderBase {
+   public:
+    ValueHolder() : value_() {}
+    explicit ValueHolder(const T& value) : value_(value) {}
+
+    T* pointer() { return &value_; }
+
+   private:
+    T value_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+  };
+
+
+  T* GetOrCreateValue() const {
+    return static_cast<ValueHolder*>(
+        ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
+  }
+
+  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
+    return default_factory_->MakeNewHolder();
+  }
+
+  class ValueHolderFactory {
+   public:
+    ValueHolderFactory() {}
+    virtual ~ValueHolderFactory() {}
+    virtual ValueHolder* MakeNewHolder() const = 0;
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+  };
+
+  class DefaultValueHolderFactory : public ValueHolderFactory {
+   public:
+    DefaultValueHolderFactory() {}
+    virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+  };
+
+  class InstanceValueHolderFactory : public ValueHolderFactory {
+   public:
+    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+    virtual ValueHolder* MakeNewHolder() const {
+      return new ValueHolder(value_);
+    }
+
+   private:
+    const T value_;  // The value for each thread.
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+  };
+
+  scoped_ptr<ValueHolderFactory> default_factory_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+};
+
+# elif GTEST_HAS_PTHREAD
+
+// MutexBase and Mutex implement mutex on pthreads-based platforms.
+class MutexBase {
+ public:
+  // Acquires this mutex.
+  void Lock() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
+    owner_ = pthread_self();
+    has_owner_ = true;
+  }
+
+  // Releases this mutex.
+  void Unlock() {
+    // Since the lock is being released the owner_ field should no longer be
+    // considered valid. We don't protect writing to has_owner_ here, as it's
+    // the caller's responsibility to ensure that the current thread holds the
+    // mutex when this is called.
+    has_owner_ = false;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
+  }
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld() const {
+    GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
+        << "The current thread is not holding the mutex @" << this;
+  }
+
+  // A static mutex may be used before main() is entered.  It may even
+  // be used before the dynamic initialization stage.  Therefore we
+  // must be able to initialize a static mutex object at link time.
+  // This means MutexBase has to be a POD and its member variables
+  // have to be public.
+ public:
+  pthread_mutex_t mutex_;  // The underlying pthread mutex.
+  // has_owner_ indicates whether the owner_ field below contains a valid thread
+  // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
+  // accesses to the owner_ field should be protected by a check of this field.
+  // An alternative might be to memset() owner_ to all zeros, but there's no
+  // guarantee that a zero'd pthread_t is necessarily invalid or even different
+  // from pthread_self().
+  bool has_owner_;
+  pthread_t owner_;  // The thread holding the mutex.
+};
+
+// Forward-declares a static mutex.
+#  define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+     extern ::testing::internal::MutexBase mutex
+
+// Defines and statically (i.e. at link time) initializes a static mutex.
+// The initialization list here does not explicitly initialize each field,
+// instead relying on default initialization for the unspecified fields. In
+// particular, the owner_ field (a pthread_t) is not explicitly initialized.
+// This allows initialization to work whether pthread_t is a scalar or struct.
+// The flag -Wmissing-field-initializers must not be specified for this to work.
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+  ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0}
+
+// The Mutex class can only be used for mutexes created at runtime. It
+// shares its API with MutexBase otherwise.
+class Mutex : public MutexBase {
+ public:
+  Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+    has_owner_ = false;
+  }
+  ~Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)".  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(MutexBase* mutex)
+      : mutex_(mutex) { mutex_->Lock(); }
+
+  ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+  MutexBase* const mutex_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Helpers for ThreadLocal.
+
+// pthread_key_create() requires DeleteThreadLocalValue() to have
+// C-linkage.  Therefore it cannot be templatized to access
+// ThreadLocal<T>.  Hence the need for class
+// ThreadLocalValueHolderBase.
+class ThreadLocalValueHolderBase {
+ public:
+  virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Called by pthread to delete thread-local data stored by
+// pthread_setspecific().
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
+}
+
+// Implements thread-local storage on pthreads-based systems.
+template <typename T>
+class GTEST_API_ ThreadLocal {
+ public:
+  ThreadLocal()
+      : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {}
+  explicit ThreadLocal(const T& value)
+      : key_(CreateKey()),
+        default_factory_(new InstanceValueHolderFactory(value)) {}
+
+  ~ThreadLocal() {
+    // Destroys the managed object for the current thread, if any.
+    DeleteThreadLocalValue(pthread_getspecific(key_));
+
+    // Releases resources associated with the key.  This will *not*
+    // delete managed objects for other threads.
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
+  }
+
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
+
+ private:
+  // Holds a value of type T.
+  class ValueHolder : public ThreadLocalValueHolderBase {
+   public:
+    ValueHolder() : value_() {}
+    explicit ValueHolder(const T& value) : value_(value) {}
+
+    T* pointer() { return &value_; }
+
+   private:
+    T value_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+  };
+
+  static pthread_key_t CreateKey() {
+    pthread_key_t key;
+    // When a thread exits, DeleteThreadLocalValue() will be called on
+    // the object managed for that thread.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_key_create(&key, &DeleteThreadLocalValue));
+    return key;
+  }
+
+  T* GetOrCreateValue() const {
+    ThreadLocalValueHolderBase* const holder =
+        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
+    if (holder != NULL) {
+      return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
+    }
+
+    ValueHolder* const new_holder = default_factory_->MakeNewHolder();
+    ThreadLocalValueHolderBase* const holder_base = new_holder;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
+    return new_holder->pointer();
+  }
+
+  class ValueHolderFactory {
+   public:
+    ValueHolderFactory() {}
+    virtual ~ValueHolderFactory() {}
+    virtual ValueHolder* MakeNewHolder() const = 0;
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+  };
+
+  class DefaultValueHolderFactory : public ValueHolderFactory {
+   public:
+    DefaultValueHolderFactory() {}
+    virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+  };
+
+  class InstanceValueHolderFactory : public ValueHolderFactory {
+   public:
+    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+    virtual ValueHolder* MakeNewHolder() const {
+      return new ValueHolder(value_);
+    }
+
+   private:
+    const T value_;  // The value for each thread.
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+  };
+
+  // A key pthreads uses for looking up per-thread values.
+  const pthread_key_t key_;
+  scoped_ptr<ValueHolderFactory> default_factory_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+};
+
+# endif  // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+
+#else  // GTEST_IS_THREADSAFE
+
+// A dummy implementation of synchronization primitives (mutex, lock,
+// and thread-local variable).  Necessary for compiling Google Test where
+// mutex is not supported - using Google Test in multiple threads is not
+// supported on such platforms.
+
+class Mutex {
+ public:
+  Mutex() {}
+  void Lock() {}
+  void Unlock() {}
+  void AssertHeld() const {}
+};
+
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::Mutex mutex
+
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)".  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(Mutex*) {}  // NOLINT
+};
+
+typedef GTestMutexLock MutexLock;
+
+template <typename T>
+class GTEST_API_ ThreadLocal {
+ public:
+  ThreadLocal() : value_() {}
+  explicit ThreadLocal(const T& value) : value_(value) {}
+  T* pointer() { return &value_; }
+  const T* pointer() const { return &value_; }
+  const T& get() const { return value_; }
+  void set(const T& value) { value_ = value; }
+ private:
+  T value_;
+};
+
+#endif  // GTEST_IS_THREADSAFE
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+GTEST_API_ size_t GetThreadCount();
+
+// Passing non-POD classes through ellipsis (...) crashes the ARM
+// compiler and generates a warning in Sun Studio before 12u4. The Nokia Symbian
+// and the IBM XL C/C++ compiler try to instantiate a copy constructor
+// for objects passed through ellipsis (...), failing for uncopyable
+// objects.  We define this to ensure that only POD is passed through
+// ellipsis on these systems.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || \
+     (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5130)
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_ELLIPSIS_NEEDS_POD_ 1
+#else
+# define GTEST_CAN_COMPARE_NULL 1
+#endif
+
+// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between
+// const T& and const T* in a function template.  These compilers
+// _can_ decide between class template specializations for T and T*,
+// so a tr1::type_traits-like is_pointer works.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__)
+# define GTEST_NEEDS_IS_POINTER_ 1
+#endif
+
+template <bool bool_value>
+struct bool_constant {
+  typedef bool_constant<bool_value> type;
+  static const bool value = bool_value;
+};
+template <bool bool_value> const bool bool_constant<bool_value>::value;
+
+typedef bool_constant<false> false_type;
+typedef bool_constant<true> true_type;
+
+template <typename T, typename U>
+struct is_same : public false_type {};
+
+template <typename T>
+struct is_same<T, T> : public true_type {};
+
+
+template <typename T>
+struct is_pointer : public false_type {};
+
+template <typename T>
+struct is_pointer<T*> : public true_type {};
+
+template <typename Iterator>
+struct IteratorTraits {
+  typedef typename Iterator::value_type value_type;
+};
+
+
+template <typename T>
+struct IteratorTraits<T*> {
+  typedef T value_type;
+};
+
+template <typename T>
+struct IteratorTraits<const T*> {
+  typedef T value_type;
+};
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_SEP_ "\\"
+# define GTEST_HAS_ALT_PATH_SEP_ 1
+// The biggest signed integer type the compiler supports.
+typedef __int64 BiggestInt;
+#else
+# define GTEST_PATH_SEP_ "/"
+# define GTEST_HAS_ALT_PATH_SEP_ 0
+typedef long long BiggestInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+// Utilities for char.
+
+// isspace(int ch) and friends accept an unsigned char or EOF.  char
+// may be signed, depending on the compiler (or compiler flags).
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+  return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+  return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+  return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+  return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+  return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+  return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+
+inline char ToLower(char ch) {
+  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+inline std::string StripTrailingSpaces(std::string str) {
+  std::string::iterator it = str.end();
+  while (it != str.begin() && IsSpace(*--it))
+    it = str.erase(it);
+  return str;
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions.  These wrappers hide the differences between
+// Windows/MSVC and POSIX systems.  Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;
+
+# ifdef __BORLANDC__
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+# else  // !__BORLANDC__
+#  if GTEST_OS_WINDOWS_MOBILE
+inline int IsATTY(int /* fd */) { return 0; }
+#  else
+inline int IsATTY(int fd) { return _isatty(fd); }
+#  endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return _stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return _strdup(src); }
+# endif  // __BORLANDC__
+
+# if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+# else
+inline int FileNo(FILE* file) { return _fileno(file); }
+inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
+inline int RmDir(const char* dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct& st) {
+  return (_S_IFDIR & st.st_mode) != 0;
+}
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+#else
+
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#endif  // GTEST_OS_WINDOWS
+
+// Functions deprecated by MSVC 8.0.
+
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
+
+inline const char* StrNCpy(char* dest, const char* src, size_t n) {
+  return strncpy(dest, src, n);
+}
+
+// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
+// StrError() aren't needed on Windows CE at this time and thus not
+// defined there.
+
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+inline int ChDir(const char* dir) { return chdir(dir); }
+#endif
+inline FILE* FOpen(const char* path, const char* mode) {
+  return fopen(path, mode);
+}
+#if !GTEST_OS_WINDOWS_MOBILE
+inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+  return freopen(path, mode, stream);
+}
+inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
+#endif
+inline int FClose(FILE* fp) { return fclose(fp); }
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int Read(int fd, void* buf, unsigned int count) {
+  return static_cast<int>(read(fd, buf, count));
+}
+inline int Write(int fd, const void* buf, unsigned int count) {
+  return static_cast<int>(write(fd, buf, count));
+}
+inline int Close(int fd) { return close(fd); }
+inline const char* StrError(int errnum) { return strerror(errnum); }
+#endif
+inline const char* GetEnv(const char* name) {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+  // We are on Windows CE, which has no environment variables.
+  static_cast<void>(name);  // To prevent 'unused argument' warning.
+  return NULL;
+#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+  // Environment variables which we programmatically clear will be set to the
+  // empty string rather than unset (NULL).  Handle that case.
+  const char* const env = getenv(name);
+  return (env != NULL && env[0] != '\0') ? env : NULL;
+#else
+  return getenv(name);
+#endif
+}
+
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE has no C library. The abort() function is used in
+// several places in Google Test. This implementation provides a reasonable
+// imitation of standard behaviour.
+void Abort();
+#else
+inline void Abort() { abort(); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+}  // namespace posix
+
+// MSVC "deprecates" snprintf and issues warnings wherever it is used.  In
+// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
+// MSVC-based platforms.  We map the GTEST_SNPRINTF_ macro to the appropriate
+// function in order to achieve that.  We use macro definition here because
+// snprintf is a variadic function.
+#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+// MSVC 2005 and above support variadic macros.
+# define GTEST_SNPRINTF_(buffer, size, format, ...) \
+     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#elif defined(_MSC_VER)
+// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't
+// complain about _snprintf.
+# define GTEST_SNPRINTF_ _snprintf
+#else
+# define GTEST_SNPRINTF_ snprintf
+#endif
+
+// The maximum number a BiggestInt can represent.  This definition
+// works no matter BiggestInt is represented in one's complement or
+// two's complement.
+//
+// We cannot rely on numeric_limits in STL, as __int64 and long long
+// are not part of standard C++ and numeric_limits doesn't need to be
+// defined for them.
+const BiggestInt kMaxBiggestInt =
+    ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
+
+// This template class serves as a compile-time function from size to
+// type.  It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+//   TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// For now it only handles UInt (unsigned int) as that's all Google Test
+// needs.  Other types can be easily added in the future if need
+// arises.
+template <size_t size>
+class TypeWithSize {
+ public:
+  // This prevents the user from using TypeWithSize<N> with incorrect
+  // values of N.
+  typedef void UInt;
+};
+
+// The specialization for size 4.
+template <>
+class TypeWithSize<4> {
+ public:
+  // unsigned int has size 4 in both gcc and MSVC.
+  //
+  // As base/basictypes.h doesn't compile on Windows, we cannot use
+  // uint32, uint64, and etc here.
+  typedef int Int;
+  typedef unsigned int UInt;
+};
+
+// The specialization for size 8.
+template <>
+class TypeWithSize<8> {
+ public:
+#if GTEST_OS_WINDOWS
+  typedef __int64 Int;
+  typedef unsigned __int64 UInt;
+#else
+  typedef long long Int;  // NOLINT
+  typedef unsigned long long UInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+};
+
+// Integer types of known sizes.
+typedef TypeWithSize<4>::Int Int32;
+typedef TypeWithSize<4>::UInt UInt32;
+typedef TypeWithSize<8>::Int Int64;
+typedef TypeWithSize<8>::UInt UInt64;
+typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
+
+// Utilities for command line flags and environment variables.
+
+// Macro for referencing flags.
+#if !defined(GTEST_FLAG)
+# define GTEST_FLAG(name) FLAGS_gtest_##name
+#endif  // !defined(GTEST_FLAG)
+
+#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+#endif  // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+
+#if !defined(GTEST_DECLARE_bool_)
+# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+
+// Macros for declaring flags.
+# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
+# define GTEST_DECLARE_int32_(name) \
+    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
+# define GTEST_DECLARE_string_(name) \
+    GTEST_API_ extern ::std::string GTEST_FLAG(name)
+
+// Macros for defining flags.
+# define GTEST_DEFINE_bool_(name, default_val, doc) \
+    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
+# define GTEST_DEFINE_int32_(name, default_val, doc) \
+    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
+# define GTEST_DEFINE_string_(name, default_val, doc) \
+    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+
+#endif  // !defined(GTEST_DECLARE_bool_)
+
+// Thread annotations
+#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+# define GTEST_LOCK_EXCLUDED_(locks)
+#endif  // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+// FIXME: Find a better way to refactor flag and environment parsing
+// out of both gtest-port.cc and gtest.cc to avoid exporting this utility
+// function.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value);
+
+// Parses a bool/Int32/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
+std::string OutputFlagAlsoCheckEnvVar();
+const char* StringFromGTestEnv(const char* flag, const char* default_val);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+#if GTEST_OS_LINUX
+# include <stdlib.h>
+# include <sys/types.h>
+# include <sys/wait.h>
+# include <unistd.h>
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the Message class.
+//
+// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+
+#include <limits>
+
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Ensures that there is at least one operator<< in the global namespace.
+// See Message& operator<<(...) below for why.
+void operator<<(const testing::internal::Secret&, int);
+
+namespace testing {
+
+// The Message class works like an ostream repeater.
+//
+// Typical usage:
+//
+//   1. You stream a bunch of values to a Message object.
+//      It will remember the text in a stringstream.
+//   2. Then you stream the Message object to an ostream.
+//      This causes the text in the Message to be streamed
+//      to the ostream.
+//
+// For example;
+//
+//   testing::Message foo;
+//   foo << 1 << " != " << 2;
+//   std::cout << foo;
+//
+// will print "1 != 2".
+//
+// Message is not intended to be inherited from.  In particular, its
+// destructor is not virtual.
+//
+// Note that stringstream behaves differently in gcc and in MSVC.  You
+// can stream a NULL char pointer to it in the former, but not in the
+// latter (it causes an access violation if you do).  The Message
+// class hides this difference by treating a NULL char pointer as
+// "(null)".
+class GTEST_API_ Message {
+ private:
+  // The type of basic IO manipulators (endl, ends, and flush) for
+  // narrow streams.
+  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
+
+ public:
+  // Constructs an empty Message.
+  Message();
+
+  // Copy constructor.
+  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
+    *ss_ << msg.GetString();
+  }
+
+  // Constructs a Message from a C-string.
+  explicit Message(const char* str) : ss_(new ::std::stringstream) {
+    *ss_ << str;
+  }
+
+#if GTEST_OS_SYMBIAN
+  // Streams a value (either a pointer or not) to this object.
+  template <typename T>
+  inline Message& operator <<(const T& value) {
+    StreamHelper(typename internal::is_pointer<T>::type(), value);
+    return *this;
+  }
+#else
+  // Streams a non-pointer value to this object.
+  template <typename T>
+  inline Message& operator <<(const T& val) {
+    // Some libraries overload << for STL containers.  These
+    // overloads are defined in the global namespace instead of ::std.
+    //
+    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
+    // overloads are visible in either the std namespace or the global
+    // namespace, but not other namespaces, including the testing
+    // namespace which Google Test's Message class is in.
+    //
+    // To allow STL containers (and other types that has a << operator
+    // defined in the global namespace) to be used in Google Test
+    // assertions, testing::Message must access the custom << operator
+    // from the global namespace.  With this using declaration,
+    // overloads of << defined in the global namespace and those
+    // visible via Koenig lookup are both exposed in this function.
+    using ::operator <<;
+    *ss_ << val;
+    return *this;
+  }
+
+  // Streams a pointer value to this object.
+  //
+  // This function is an overload of the previous one.  When you
+  // stream a pointer to a Message, this definition will be used as it
+  // is more specialized.  (The C++ Standard, section
+  // [temp.func.order].)  If you stream a non-pointer, then the
+  // previous definition will be used.
+  //
+  // The reason for this overload is that streaming a NULL pointer to
+  // ostream is undefined behavior.  Depending on the compiler, you
+  // may get "0", "(nil)", "(null)", or an access violation.  To
+  // ensure consistent result across compilers, we always treat NULL
+  // as "(null)".
+  template <typename T>
+  inline Message& operator <<(T* const& pointer) {  // NOLINT
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+    return *this;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // Since the basic IO manipulators are overloaded for both narrow
+  // and wide streams, we have to provide this specialized definition
+  // of operator <<, even though its body is the same as the
+  // templatized version above.  Without this definition, streaming
+  // endl or other basic IO manipulators to Message will confuse the
+  // compiler.
+  Message& operator <<(BasicNarrowIoManip val) {
+    *ss_ << val;
+    return *this;
+  }
+
+  // Instead of 1/0, we want to see true/false for bool values.
+  Message& operator <<(bool b) {
+    return *this << (b ? "true" : "false");
+  }
+
+  // These two overloads allow streaming a wide C string to a Message
+  // using the UTF-8 encoding.
+  Message& operator <<(const wchar_t* wide_c_str);
+  Message& operator <<(wchar_t* wide_c_str);
+
+#if GTEST_HAS_STD_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::std::wstring& wstr);
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::wstring& wstr);
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+  // Gets the text streamed to this object so far as an std::string.
+  // Each '\0' character in the buffer is replaced with "\\0".
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  std::string GetString() const;
+
+ private:
+#if GTEST_OS_SYMBIAN
+  // These are needed as the Nokia Symbian Compiler cannot decide between
+  // const T& and const T* in a function template. The Nokia compiler _can_
+  // decide between class template specializations for T and T*, so a
+  // tr1::type_traits-like is_pointer works, and we can overload on that.
+  template <typename T>
+  inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) {
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+  }
+  template <typename T>
+  inline void StreamHelper(internal::false_type /*is_pointer*/,
+                           const T& value) {
+    // See the comments in Message& operator <<(const T&) above for why
+    // we need this using statement.
+    using ::operator <<;
+    *ss_ << value;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // We'll hold the text streamed to this object here.
+  const internal::scoped_ptr< ::std::stringstream> ss_;
+
+  // We declare (but don't implement) this to prevent the compiler
+  // from implementing the assignment operator.
+  void operator=(const Message&);
+};
+
+// Streams a Message to an ostream.
+inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+  return os << sb.GetString();
+}
+
+namespace internal {
+
+// Converts a streamable value to an std::string.  A NULL pointer is
+// converted to "(null)".  When the input value is a ::string,
+// ::std::string, ::wstring, or ::std::wstring object, each NUL
+// character in it is replaced with "\\0".
+template <typename T>
+std::string StreamableToString(const T& streamable) {
+  return (Message() << streamable).GetString();
+}
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Google Test filepath utilities
+//
+// This header file declares classes and functions used internally by
+// Google Test.  They are subject to change without notice.
+//
+// This file is #included in gtest/internal/gtest-internal.h.
+// Do not include this header file separately!
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file declares the String class and functions used internally by
+// Google Test.  They are subject to change without notice. They should not used
+// by code external to Google Test.
+//
+// This header file is #included by gtest-internal.h.
+// It should not be #included by other files.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+
+#ifdef __BORLANDC__
+// string.h is not guaranteed to provide strcpy on C++ Builder.
+# include <mem.h>
+#endif
+
+#include <string.h>
+#include <string>
+
+
+namespace testing {
+namespace internal {
+
+// String - an abstract class holding static string utilities.
+class GTEST_API_ String {
+ public:
+  // Static utility methods
+
+  // Clones a 0-terminated C string, allocating memory using new.  The
+  // caller is responsible for deleting the return value using
+  // delete[].  Returns the cloned string, or NULL if the input is
+  // NULL.
+  //
+  // This is different from strdup() in string.h, which allocates
+  // memory using malloc().
+  static const char* CloneCString(const char* c_str);
+
+#if GTEST_OS_WINDOWS_MOBILE
+  // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
+  // able to pass strings to Win32 APIs on CE we need to convert them
+  // to 'Unicode', UTF-16.
+
+  // Creates a UTF-16 wide string from the given ANSI string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the wide string, or NULL if the
+  // input is NULL.
+  //
+  // The wide string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static LPCWSTR AnsiToUtf16(const char* c_str);
+
+  // Creates an ANSI string from the given wide string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the ANSI string, or NULL if the
+  // input is NULL.
+  //
+  // The returned string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
+#endif
+
+  // Compares two C strings.  Returns true iff they have the same content.
+  //
+  // Unlike strcmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CStringEquals(const char* lhs, const char* rhs);
+
+  // Converts a wide C string to a String using the UTF-8 encoding.
+  // NULL will be converted to "(null)".  If an error occurred during
+  // the conversion, "(failed to convert from wide string)" is
+  // returned.
+  static std::string ShowWideCString(const wchar_t* wide_c_str);
+
+  // Compares two wide C strings.  Returns true iff they have the same
+  // content.
+  //
+  // Unlike wcscmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
+
+  // Compares two C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike strcasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CaseInsensitiveCStringEquals(const char* lhs,
+                                           const char* rhs);
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                               const wchar_t* rhs);
+
+  // Returns true iff the given string ends with the given suffix, ignoring
+  // case. Any string is considered to end with an empty suffix.
+  static bool EndsWithCaseInsensitive(
+      const std::string& str, const std::string& suffix);
+
+  // Formats an int value as "%02d".
+  static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
+
+  // Formats an int value as "%X".
+  static std::string FormatHexInt(int value);
+
+  // Formats a byte as "%02X".
+  static std::string FormatByte(unsigned char value);
+
+ private:
+  String();  // Not meant to be instantiated.
+};  // class String
+
+// Gets the content of the stringstream's buffer as an std::string.  Each '\0'
+// character in the buffer is replaced with "\\0".
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+namespace internal {
+
+// FilePath - a class for file and directory pathname manipulation which
+// handles platform-specific conventions (like the pathname separator).
+// Used for helper functions for naming files in a directory for xml output.
+// Except for Set methods, all methods are const or static, which provides an
+// "immutable value object" -- useful for peace of mind.
+// A FilePath with a value ending in a path separator ("like/this/") represents
+// a directory, otherwise it is assumed to represent a file. In either case,
+// it may or may not represent an actual file or directory in the file system.
+// Names are NOT checked for syntax correctness -- no checking for illegal
+// characters, malformed paths, etc.
+
+class GTEST_API_ FilePath {
+ public:
+  FilePath() : pathname_("") { }
+  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
+
+  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
+    Normalize();
+  }
+
+  FilePath& operator=(const FilePath& rhs) {
+    Set(rhs);
+    return *this;
+  }
+
+  void Set(const FilePath& rhs) {
+    pathname_ = rhs.pathname_;
+  }
+
+  const std::string& string() const { return pathname_; }
+  const char* c_str() const { return pathname_.c_str(); }
+
+  // Returns the current working directory, or "" if unsuccessful.
+  static FilePath GetCurrentDir();
+
+  // Given directory = "dir", base_name = "test", number = 0,
+  // extension = "xml", returns "dir/test.xml". If number is greater
+  // than zero (e.g., 12), returns "dir/test_12.xml".
+  // On Windows platform, uses \ as the separator rather than /.
+  static FilePath MakeFileName(const FilePath& directory,
+                               const FilePath& base_name,
+                               int number,
+                               const char* extension);
+
+  // Given directory = "dir", relative_path = "test.xml",
+  // returns "dir/test.xml".
+  // On Windows, uses \ as the separator rather than /.
+  static FilePath ConcatPaths(const FilePath& directory,
+                              const FilePath& relative_path);
+
+  // Returns a pathname for a file that does not currently exist. The pathname
+  // will be directory/base_name.extension or
+  // directory/base_name_<number>.extension if directory/base_name.extension
+  // already exists. The number will be incremented until a pathname is found
+  // that does not already exist.
+  // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+  // There could be a race condition if two or more processes are calling this
+  // function at the same time -- they could both pick the same filename.
+  static FilePath GenerateUniqueFileName(const FilePath& directory,
+                                         const FilePath& base_name,
+                                         const char* extension);
+
+  // Returns true iff the path is "".
+  bool IsEmpty() const { return pathname_.empty(); }
+
+  // If input name has a trailing separator character, removes it and returns
+  // the name, otherwise return the name string unmodified.
+  // On Windows platform, uses \ as the separator, other platforms use /.
+  FilePath RemoveTrailingPathSeparator() const;
+
+  // Returns a copy of the FilePath with the directory part removed.
+  // Example: FilePath("path/to/file").RemoveDirectoryName() returns
+  // FilePath("file"). If there is no directory part ("just_a_file"), it returns
+  // the FilePath unmodified. If there is no file part ("just_a_dir/") it
+  // returns an empty FilePath ("").
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveDirectoryName() const;
+
+  // RemoveFileName returns the directory path with the filename removed.
+  // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+  // If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+  // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+  // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveFileName() const;
+
+  // Returns a copy of the FilePath with the case-insensitive extension removed.
+  // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+  // FilePath("dir/file"). If a case-insensitive extension is not
+  // found, returns a copy of the original FilePath.
+  FilePath RemoveExtension(const char* extension) const;
+
+  // Creates directories so that path exists. Returns true if successful or if
+  // the directories already exist; returns false if unable to create
+  // directories for any reason. Will also return false if the FilePath does
+  // not represent a directory (that is, it doesn't end with a path separator).
+  bool CreateDirectoriesRecursively() const;
+
+  // Create the directory so that path exists. Returns true if successful or
+  // if the directory already exists; returns false if unable to create the
+  // directory for any reason, including if the parent directory does not
+  // exist. Not named "CreateDirectory" because that's a macro on Windows.
+  bool CreateFolder() const;
+
+  // Returns true if FilePath describes something in the file-system,
+  // either a file, directory, or whatever, and that something exists.
+  bool FileOrDirectoryExists() const;
+
+  // Returns true if pathname describes a directory in the file-system
+  // that exists.
+  bool DirectoryExists() const;
+
+  // Returns true if FilePath ends with a path separator, which indicates that
+  // it is intended to represent a directory. Returns false otherwise.
+  // This does NOT check that a directory (or file) actually exists.
+  bool IsDirectory() const;
+
+  // Returns true if pathname describes a root directory. (Windows has one
+  // root directory per disk drive.)
+  bool IsRootDirectory() const;
+
+  // Returns true if pathname describes an absolute path.
+  bool IsAbsolutePath() const;
+
+ private:
+  // Replaces multiple consecutive separators with a single separator.
+  // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+  // redundancies that might be in a pathname involving "." or "..".
+  //
+  // A pathname with multiple consecutive separators may occur either through
+  // user error or as a result of some scripts or APIs that generate a pathname
+  // with a trailing separator. On other platforms the same API or script
+  // may NOT generate a pathname with a trailing "/". Then elsewhere that
+  // pathname may have another "/" and pathname components added to it,
+  // without checking for the separator already being there.
+  // The script language and operating system may allow paths like "foo//bar"
+  // but some of the functions in FilePath will not handle that correctly. In
+  // particular, RemoveTrailingPathSeparator() only removes one separator, and
+  // it is called in CreateDirectoriesRecursively() assuming that it will change
+  // a pathname from directory syntax (trailing separator) to filename syntax.
+  //
+  // On Windows this method also replaces the alternate path separator '/' with
+  // the primary path separator '\\', so that for example "bar\\/\\foo" becomes
+  // "bar\\foo".
+
+  void Normalize();
+
+  // Returns a pointer to the last occurence of a valid path separator in
+  // the FilePath. On Windows, for example, both '/' and '\' are valid path
+  // separators. Returns NULL if no path separator was found.
+  const char* FindLastPathSeparator() const;
+
+  std::string pathname_;
+};  // class FilePath
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+// This file was GENERATED by command:
+//     pump.py gtest-type-util.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Type utilities needed for implementing typed and type-parameterized
+// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently we support at most 50 types in a list, and at most 50
+// type-parameterized tests in one type-parameterized test case.
+// Please contact googletestframework@googlegroups.com if you need
+// more.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+
+// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
+// libstdc++ (which is where cxxabi.h comes from).
+# if GTEST_HAS_CXXABI_H_
+#  include <cxxabi.h>
+# elif defined(__HP_aCC)
+#  include <acxx_demangle.h>
+# endif  // GTEST_HASH_CXXABI_H_
+
+namespace testing {
+namespace internal {
+
+// Canonicalizes a given name with respect to the Standard C++ Library.
+// This handles removing the inline namespace within `std` that is
+// used by various standard libraries (e.g., `std::__1`).  Names outside
+// of namespace std are returned unmodified.
+inline std::string CanonicalizeForStdLibVersioning(std::string s) {
+  static const char prefix[] = "std::__";
+  if (s.compare(0, strlen(prefix), prefix) == 0) {
+    std::string::size_type end = s.find("::", strlen(prefix));
+    if (end != s.npos) {
+      // Erase everything between the initial `std` and the second `::`.
+      s.erase(strlen("std"), end - strlen("std"));
+    }
+  }
+  return s;
+}
+
+// GetTypeName<T>() returns a human-readable name of type T.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+# if GTEST_HAS_RTTI
+
+  const char* const name = typeid(T).name();
+#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  int status = 0;
+  // gcc's implementation of typeid(T).name() mangles the type name,
+  // so we have to demangle it.
+#   if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+#   endif  // GTEST_HAS_CXXABI_H_
+  char* const readable_name = __cxa_demangle(name, 0, 0, &status);
+  const std::string name_str(status == 0 ? readable_name : name);
+  free(readable_name);
+  return CanonicalizeForStdLibVersioning(name_str);
+#  else
+  return name;
+#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+# else
+
+  return "<type>";
+
+# endif  // GTEST_HAS_RTTI
+}
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// AssertyTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
+// type.  This can be used as a compile-time assertion to ensure that
+// two types are equal.
+
+template <typename T1, typename T2>
+struct AssertTypeEq;
+
+template <typename T>
+struct AssertTypeEq<T, T> {
+  typedef bool type;
+};
+
+// A unique type used as the default value for the arguments of class
+// template Types.  This allows us to simulate variadic templates
+// (e.g. Types<int>, Type<int, double>, and etc), which C++ doesn't
+// support directly.
+struct None {};
+
+// The following family of struct and struct templates are used to
+// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
+// represents a type list with N types (T1, T2, ..., and TN) in it.
+// Except for Types0, every struct in the family has two member types:
+// Head for the first type in the list, and Tail for the rest of the
+// list.
+
+// The empty type list.
+struct Types0 {};
+
+// Type lists of length 1, 2, 3, and so on.
+
+template <typename T1>
+struct Types1 {
+  typedef T1 Head;
+  typedef Types0 Tail;
+};
+template <typename T1, typename T2>
+struct Types2 {
+  typedef T1 Head;
+  typedef Types1<T2> Tail;
+};
+
+template <typename T1, typename T2, typename T3>
+struct Types3 {
+  typedef T1 Head;
+  typedef Types2<T2, T3> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types4 {
+  typedef T1 Head;
+  typedef Types3<T2, T3, T4> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types5 {
+  typedef T1 Head;
+  typedef Types4<T2, T3, T4, T5> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+struct Types6 {
+  typedef T1 Head;
+  typedef Types5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+struct Types7 {
+  typedef T1 Head;
+  typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+struct Types8 {
+  typedef T1 Head;
+  typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+struct Types9 {
+  typedef T1 Head;
+  typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types10 {
+  typedef T1 Head;
+  typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+struct Types11 {
+  typedef T1 Head;
+  typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+struct Types12 {
+  typedef T1 Head;
+  typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+struct Types13 {
+  typedef T1 Head;
+  typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+struct Types14 {
+  typedef T1 Head;
+  typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types15 {
+  typedef T1 Head;
+  typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+struct Types16 {
+  typedef T1 Head;
+  typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+struct Types17 {
+  typedef T1 Head;
+  typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+struct Types18 {
+  typedef T1 Head;
+  typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+struct Types19 {
+  typedef T1 Head;
+  typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types20 {
+  typedef T1 Head;
+  typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+struct Types21 {
+  typedef T1 Head;
+  typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+struct Types22 {
+  typedef T1 Head;
+  typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+struct Types23 {
+  typedef T1 Head;
+  typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+struct Types24 {
+  typedef T1 Head;
+  typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types25 {
+  typedef T1 Head;
+  typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+struct Types26 {
+  typedef T1 Head;
+  typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+struct Types27 {
+  typedef T1 Head;
+  typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+struct Types28 {
+  typedef T1 Head;
+  typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+struct Types29 {
+  typedef T1 Head;
+  typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types30 {
+  typedef T1 Head;
+  typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types31 {
+  typedef T1 Head;
+  typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types32 {
+  typedef T1 Head;
+  typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types33 {
+  typedef T1 Head;
+  typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types34 {
+  typedef T1 Head;
+  typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types35 {
+  typedef T1 Head;
+  typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+struct Types36 {
+  typedef T1 Head;
+  typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+struct Types37 {
+  typedef T1 Head;
+  typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+struct Types38 {
+  typedef T1 Head;
+  typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types39 {
+  typedef T1 Head;
+  typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types40 {
+  typedef T1 Head;
+  typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types41 {
+  typedef T1 Head;
+  typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types42 {
+  typedef T1 Head;
+  typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types43 {
+  typedef T1 Head;
+  typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types44 {
+  typedef T1 Head;
+  typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types45 {
+  typedef T1 Head;
+  typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types46 {
+  typedef T1 Head;
+  typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types47 {
+  typedef T1 Head;
+  typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types48 {
+  typedef T1 Head;
+  typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types49 {
+  typedef T1 Head;
+  typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct Types50 {
+  typedef T1 Head;
+  typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49, T50> Tail;
+};
+
+
+}  // namespace internal
+
+// We don't want to require the users to write TypesN<...> directly,
+// as that would require them to count the length.  Types<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Types<int>
+// will appear as Types<int, None, None, ..., None> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Types<T1, ..., TN>, and Google Test will translate
+// that to TypesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Types template.
+template <typename T1 = internal::None, typename T2 = internal::None,
+    typename T3 = internal::None, typename T4 = internal::None,
+    typename T5 = internal::None, typename T6 = internal::None,
+    typename T7 = internal::None, typename T8 = internal::None,
+    typename T9 = internal::None, typename T10 = internal::None,
+    typename T11 = internal::None, typename T12 = internal::None,
+    typename T13 = internal::None, typename T14 = internal::None,
+    typename T15 = internal::None, typename T16 = internal::None,
+    typename T17 = internal::None, typename T18 = internal::None,
+    typename T19 = internal::None, typename T20 = internal::None,
+    typename T21 = internal::None, typename T22 = internal::None,
+    typename T23 = internal::None, typename T24 = internal::None,
+    typename T25 = internal::None, typename T26 = internal::None,
+    typename T27 = internal::None, typename T28 = internal::None,
+    typename T29 = internal::None, typename T30 = internal::None,
+    typename T31 = internal::None, typename T32 = internal::None,
+    typename T33 = internal::None, typename T34 = internal::None,
+    typename T35 = internal::None, typename T36 = internal::None,
+    typename T37 = internal::None, typename T38 = internal::None,
+    typename T39 = internal::None, typename T40 = internal::None,
+    typename T41 = internal::None, typename T42 = internal::None,
+    typename T43 = internal::None, typename T44 = internal::None,
+    typename T45 = internal::None, typename T46 = internal::None,
+    typename T47 = internal::None, typename T48 = internal::None,
+    typename T49 = internal::None, typename T50 = internal::None>
+struct Types {
+  typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+};
+
+template <>
+struct Types<internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types0 type;
+};
+template <typename T1>
+struct Types<T1, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types1<T1> type;
+};
+template <typename T1, typename T2>
+struct Types<T1, T2, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types2<T1, T2> type;
+};
+template <typename T1, typename T2, typename T3>
+struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types3<T1, T2, T3> type;
+};
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types4<T1, T2, T3, T4> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types5<T1, T2, T3, T4, T5> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, internal::None, internal::None, internal::None> {
+  typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, internal::None, internal::None> {
+  typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, T49, internal::None> {
+  typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49> type;
+};
+
+namespace internal {
+
+# define GTEST_TEMPLATE_ template <typename T> class
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>.  This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {
+  template <typename T>
+  struct Bind {
+    typedef Tmpl<T> type;
+  };
+};
+
+# define GTEST_BIND_(TmplSel, T) \
+  TmplSel::template Bind<T>::type
+
+// A unique struct template used as the default value for the
+// arguments of class template Templates.  This allows us to simulate
+// variadic templates (e.g. Templates<int>, Templates<int, double>,
+// and etc), which C++ doesn't support directly.
+template <typename T>
+struct NoneT {};
+
+// The following family of struct and struct templates are used to
+// represent template lists.  In particular, TemplatesN<T1, T2, ...,
+// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
+// for Templates0, every struct in the family has two member types:
+// Head for the selector of the first template in the list, and Tail
+// for the rest of the list.
+
+// The empty template list.
+struct Templates0 {};
+
+// Template lists of length 1, 2, 3, and so on.
+
+template <GTEST_TEMPLATE_ T1>
+struct Templates1 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates0 Tail;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates2 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates1<T2> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates3 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates2<T2, T3> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>
+struct Templates4 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates3<T2, T3, T4> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates5 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates4<T2, T3, T4, T5> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates6 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>
+struct Templates7 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates8 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates9 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>
+struct Templates10 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates11 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates12 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>
+struct Templates13 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates14 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates15 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>
+struct Templates16 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates17 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates18 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>
+struct Templates19 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates20 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates21 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>
+struct Templates22 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates23 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates24 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>
+struct Templates25 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates26 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates27 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>
+struct Templates28 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates29 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates30 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates31 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates32 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates33 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates34 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates35 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates36 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates37 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates38 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates39 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates40 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates41 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates42 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates43 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates44 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates45 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates46 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates47 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates48 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates49 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
+struct Templates50 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49, T50> Tail;
+};
+
+
+// We don't want to require the users to write TemplatesN<...> directly,
+// as that would require them to count the length.  Templates<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Templates<list>
+// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Templates<T1, ..., TN>, and Google Test will translate
+// that to TemplatesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Templates template.
+template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
+    GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
+    GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
+    GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
+    GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
+    GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
+    GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
+    GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
+    GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
+    GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
+    GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
+    GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
+    GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
+    GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
+    GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
+    GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
+    GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
+    GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
+    GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
+    GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
+    GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
+    GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
+    GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
+    GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
+    GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
+struct Templates {
+  typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+};
+
+template <>
+struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates0 type;
+};
+template <GTEST_TEMPLATE_ T1>
+struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates1<T1> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates2<T1, T2> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates3<T1, T2, T3> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>
+struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates4<T1, T2, T3, T4> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates5<T1, T2, T3, T4, T5> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates6<T1, T2, T3, T4, T5, T6> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, NoneT, NoneT, NoneT> {
+  typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, NoneT, NoneT> {
+  typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, T49, NoneT> {
+  typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49> type;
+};
+
+// The TypeList template makes it possible to use either a single type
+// or a Types<...> list in TYPED_TEST_CASE() and
+// INSTANTIATE_TYPED_TEST_CASE_P().
+
+template <typename T>
+struct TypeList {
+  typedef Types1<T> type;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49, T50> > {
+  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__.  Writing
+//
+//   foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number.  For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
+
+// Stringifies its argument.
+#define GTEST_STRINGIFY_(name) #name
+
+class ProtocolMessage;
+namespace proto2 { class Message; }
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult;                 // Result of an assertion.
+class Message;                         // Represents a failure message.
+class Test;                            // Represents a test.
+class TestInfo;                        // Information about a test.
+class TestPartResult;                  // Result of a test part.
+class UnitTest;                        // A collection of test cases.
+
+template <typename T>
+::std::string PrintToString(const T& value);
+
+namespace internal {
+
+struct TraceInfo;                      // Information about a trace point.
+class TestInfoImpl;                    // Opaque implementation of TestInfo
+class UnitTestImpl;                    // Opaque implementation of UnitTest
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
+
+// Two overloaded helpers for checking at compile time whether an
+// expression is a null pointer literal (i.e. NULL or any 0-valued
+// compile-time integral constant).  Their return values have
+// different sizes, so we can use sizeof() to test which version is
+// picked by the compiler.  These helpers have no implementations, as
+// we only need their signatures.
+//
+// Given IsNullLiteralHelper(x), the compiler will pick the first
+// version if x can be implicitly converted to Secret*, and pick the
+// second version otherwise.  Since Secret is a secret and incomplete
+// type, the only expression a user can write that has type Secret* is
+// a null pointer literal.  Therefore, we know that x is a null
+// pointer literal if and only if the first version is picked by the
+// compiler.
+char IsNullLiteralHelper(Secret* p);
+char (&IsNullLiteralHelper(...))[2];  // NOLINT
+
+// A compile-time bool constant that is true if and only if x is a
+// null pointer literal (i.e. NULL or any 0-valued compile-time
+// integral constant).
+#ifdef GTEST_ELLIPSIS_NEEDS_POD_
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_IS_NULL_LITERAL_(x) false
+#else
+# define GTEST_IS_NULL_LITERAL_(x) \
+    (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1)
+#endif  // GTEST_ELLIPSIS_NEEDS_POD_
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(
+    const std::string& gtest_msg, const Message& user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \
+/* an exported class was derived from a class that was not exported */)
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled).  We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time.  Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  explicit GoogleTestFailureException(const TestPartResult& failure);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4275
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+namespace edit_distance {
+// Returns the optimal edits to go from 'left' to 'right'.
+// All edits cost the same, with replace having lower priority than
+// add/remove.
+// Simple implementation of the Wagner-Fischer algorithm.
+// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
+enum EditType { kMatch, kAdd, kRemove, kReplace };
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<size_t>& left, const std::vector<size_t>& right);
+
+// Same as above, but the input is represented as strings.
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right);
+
+// Create a diff of the input strings in Unified diff format.
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+                                         const std::vector<std::string>& right,
+                                         size_t context = 2);
+
+}  // namespace edit_distance
+
+// Calculate the diff between 'left' and 'right' and return it in unified diff
+// format.
+// If not null, stores in 'total_line_count' the total number of lines found
+// in left + right.
+GTEST_API_ std::string DiffStrings(const std::string& left,
+                                   const std::string& right,
+                                   size_t* total_line_count);
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+                                     const char* actual_expression,
+                                     const std::string& expected_value,
+                                     const std::string& actual_value,
+                                     bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison.  (Due to round-off error, etc, it's very unlikely that
+// two floating-points will be equal exactly.  Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+//   The most-significant bit being the leftmost, an IEEE
+//   floating-point looks like
+//
+//     sign_bit exponent_bits fraction_bits
+//
+//   Here, sign_bit is a single bit that designates the sign of the
+//   number.
+//
+//   For float, there are 8 exponent bits and 23 fraction bits.
+//
+//   For double, there are 11 exponent bits and 52 fraction bits.
+//
+//   More details can be found at
+//   http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+template <typename RawType>
+class FloatingPoint {
+ public:
+  // Defines the unsigned integer type that has the same size as the
+  // floating point number.
+  typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+  // Constants.
+
+  // # of bits in a number.
+  static const size_t kBitCount = 8*sizeof(RawType);
+
+  // # of fraction bits in a number.
+  static const size_t kFractionBitCount =
+    std::numeric_limits<RawType>::digits - 1;
+
+  // # of exponent bits in a number.
+  static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+  // The mask for the sign bit.
+  static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+  // The mask for the fraction bits.
+  static const Bits kFractionBitMask =
+    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+
+  // The mask for the exponent bits.
+  static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+  // How many ULP's (Units in the Last Place) we want to tolerate when
+  // comparing two numbers.  The larger the value, the more error we
+  // allow.  A 0 value means that two numbers must be exactly the same
+  // to be considered equal.
+  //
+  // The maximum error of a single floating-point operation is 0.5
+  // units in the last place.  On Intel CPU's, all floating-point
+  // calculations are done with 80-bit precision, while double has 64
+  // bits.  Therefore, 4 should be enough for ordinary use.
+  //
+  // See the following article for more details on ULP:
+  // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+  static const size_t kMaxUlps = 4;
+
+  // Constructs a FloatingPoint from a raw floating-point number.
+  //
+  // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+  // around may change its bits, although the new value is guaranteed
+  // to be also a NAN.  Therefore, don't expect this constructor to
+  // preserve the bits in x when x is a NAN.
+  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+
+  // Static methods
+
+  // Reinterprets a bit pattern as a floating-point number.
+  //
+  // This function is needed to test the AlmostEquals() method.
+  static RawType ReinterpretBits(const Bits bits) {
+    FloatingPoint fp(0);
+    fp.u_.bits_ = bits;
+    return fp.u_.value_;
+  }
+
+  // Returns the floating-point number that represent positive infinity.
+  static RawType Infinity() {
+    return ReinterpretBits(kExponentBitMask);
+  }
+
+  // Returns the maximum representable finite floating-point number.
+  static RawType Max();
+
+  // Non-static methods
+
+  // Returns the bits that represents this number.
+  const Bits &bits() const { return u_.bits_; }
+
+  // Returns the exponent bits of this number.
+  Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+  // Returns the fraction bits of this number.
+  Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+  // Returns the sign bit of this number.
+  Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+  // Returns true iff this is NAN (not a number).
+  bool is_nan() const {
+    // It's a NAN if the exponent bits are all ones and the fraction
+    // bits are not entirely zeros.
+    return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+  }
+
+  // Returns true iff this number is at most kMaxUlps ULP's away from
+  // rhs.  In particular, this function:
+  //
+  //   - returns false if either number is (or both are) NAN.
+  //   - treats really large numbers as almost equal to infinity.
+  //   - thinks +0.0 and -0.0 are 0 DLP's apart.
+  bool AlmostEquals(const FloatingPoint& rhs) const {
+    // The IEEE standard says that any comparison operation involving
+    // a NAN must return false.
+    if (is_nan() || rhs.is_nan()) return false;
+
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
+        <= kMaxUlps;
+  }
+
+ private:
+  // The data type used to store the actual floating-point number.
+  union FloatingPointUnion {
+    RawType value_;  // The raw floating-point number.
+    Bits bits_;      // The bits that represent the number.
+  };
+
+  // Converts an integer from the sign-and-magnitude representation to
+  // the biased representation.  More precisely, let N be 2 to the
+  // power of (kBitCount - 1), an integer x is represented by the
+  // unsigned number x + N.
+  //
+  // For instance,
+  //
+  //   -N + 1 (the most negative number representable using
+  //          sign-and-magnitude) is represented by 1;
+  //   0      is represented by N; and
+  //   N - 1  (the biggest number representable using
+  //          sign-and-magnitude) is represented by 2N - 1.
+  //
+  // Read http://en.wikipedia.org/wiki/Signed_number_representations
+  // for more details on signed number representations.
+  static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+    if (kSignBitMask & sam) {
+      // sam represents a negative number.
+      return ~sam + 1;
+    } else {
+      // sam represents a positive number.
+      return kSignBitMask | sam;
+    }
+  }
+
+  // Given two numbers in the sign-and-magnitude representation,
+  // returns the distance between them as an unsigned number.
+  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
+                                                     const Bits &sam2) {
+    const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+    const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+    return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+  }
+
+  FloatingPointUnion u_;
+};
+
+// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
+// macro defined by <windows.h>.
+template <>
+inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+template <>
+inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+
+// Typedefs the instances of the FloatingPoint template class that we
+// care to use.
+typedef FloatingPoint<float> Float;
+typedef FloatingPoint<double> Double;
+
+// In order to catch the mistake of putting tests that use different
+// test fixture classes in the same test case, we need to assign
+// unique IDs to fixture classes and compare them.  The TypeId type is
+// used to hold such IDs.  The user should treat TypeId as an opaque
+// type: the only operation allowed on TypeId values is to compare
+// them for equality using the == operator.
+typedef const void* TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+  // dummy_ must not have a const type.  Otherwise an overly eager
+  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+  static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T.  Different values will be
+// returned for different types.  Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+  // The compiler is required to allocate a different
+  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+  // the template.  Therefore, the address of dummy_ is guaranteed to
+  // be unique.
+  return &(TypeIdHelper<T>::dummy_);
+}
+
+// Returns the type ID of ::testing::Test.  Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+  virtual ~TestFactoryBase() {}
+
+  // Creates a test instance to run. The instance is both created and destroyed
+  // within TestInfoImpl::Run()
+  virtual Test* CreateTest() = 0;
+
+ protected:
+  TestFactoryBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+};
+
+// This class provides implementation of TeastFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+  virtual Test* CreateTest() { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+                                            long hr);  // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+                                            long hr);  // NOLINT
+
+#endif  // GTEST_OS_WINDOWS
+
+// Types of SetUpTestCase() and TearDownTestCase() functions.
+typedef void (*SetUpTestCaseFunc)();
+typedef void (*TearDownTestCaseFunc)();
+
+struct CodeLocation {
+  CodeLocation(const std::string& a_file, int a_line)
+      : file(a_file), line(a_line) {}
+
+  std::string file;
+  int line;
+};
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param        the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param       text representation of the test's value parameter,
+//                     or NULL if this is not a type-parameterized test.
+//   code_location:    code location where the test is defined
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    CodeLocation code_location,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory);
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// State of the definition of a type-parameterized test case.
+class GTEST_API_ TypedTestCasePState {
+ public:
+  TypedTestCasePState() : registered_(false) {}
+
+  // Adds the given test name to defined_test_names_ and return true
+  // if the test case hasn't been registered; otherwise aborts the
+  // program.
+  bool AddTestName(const char* file, int line, const char* case_name,
+                   const char* test_name) {
+    if (registered_) {
+      fprintf(stderr, "%s Test %s must be defined before "
+              "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n",
+              FormatFileLocation(file, line).c_str(), test_name, case_name);
+      fflush(stderr);
+      posix::Abort();
+    }
+    registered_tests_.insert(
+        ::std::make_pair(test_name, CodeLocation(file, line)));
+    return true;
+  }
+
+  bool TestExists(const std::string& test_name) const {
+    return registered_tests_.count(test_name) > 0;
+  }
+
+  const CodeLocation& GetCodeLocation(const std::string& test_name) const {
+    RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name);
+    GTEST_CHECK_(it != registered_tests_.end());
+    return it->second;
+  }
+
+  // Verifies that registered_tests match the test names in
+  // defined_test_names_; returns registered_tests if successful, or
+  // aborts the program otherwise.
+  const char* VerifyRegisteredTestNames(
+      const char* file, int line, const char* registered_tests);
+
+ private:
+  typedef ::std::map<std::string, CodeLocation> RegisteredTestsMap;
+
+  bool registered_;
+  RegisteredTestsMap registered_tests_;
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+// Skips to the first non-space char after the first comma in 'str';
+// returns NULL if no comma is found in 'str'.
+inline const char* SkipComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  if (comma == NULL) {
+    return NULL;
+  }
+  while (IsSpace(*(++comma))) {}
+  return comma;
+}
+
+// Returns the prefix of 'str' before the first comma in it; returns
+// the entire string if it contains no comma.
+inline std::string GetPrefixUntilComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  return comma == NULL ? str : std::string(str, comma);
+}
+
+// Splits a given string on a given delimiter, populating a given
+// vector with the fields.
+void SplitString(const ::std::string& str, char delimiter,
+                 ::std::vector< ::std::string>* dest);
+
+// The default argument to the template below for the case when the user does
+// not provide a name generator.
+struct DefaultNameGenerator {
+  template <typename T>
+  static std::string GetName(int i) {
+    return StreamableToString(i);
+  }
+};
+
+template <typename Provided = DefaultNameGenerator>
+struct NameGeneratorSelector {
+  typedef Provided type;
+};
+
+template <typename NameGenerator>
+void GenerateNamesRecursively(Types0, std::vector<std::string>*, int) {}
+
+template <typename NameGenerator, typename Types>
+void GenerateNamesRecursively(Types, std::vector<std::string>* result, int i) {
+  result->push_back(NameGenerator::template GetName<typename Types::Head>(i));
+  GenerateNamesRecursively<NameGenerator>(typename Types::Tail(), result,
+                                          i + 1);
+}
+
+template <typename NameGenerator, typename Types>
+std::vector<std::string> GenerateNames() {
+  std::vector<std::string> result;
+  GenerateNamesRecursively<NameGenerator>(Types(), &result, 0);
+  return result;
+}
+
+// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
+// registers a list of type-parameterized tests with Google Test.  The
+// return value is insignificant - we just need to return something
+// such that we can call this function in a namespace scope.
+//
+// Implementation note: The GTEST_TEMPLATE_ macro declares a template
+// template parameter.  It's defined in gtest-type-util.h.
+template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
+class TypeParameterizedTest {
+ public:
+  // 'index' is the index of the test in the type list 'Types'
+  // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
+  // Types).  Valid values for 'index' are [0, N - 1] where N is the
+  // length of Types.
+  static bool Register(const char* prefix, const CodeLocation& code_location,
+                       const char* case_name, const char* test_names, int index,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
+    typedef typename Types::Head Type;
+    typedef Fixture<Type> FixtureClass;
+    typedef typename GTEST_BIND_(TestSel, Type) TestClass;
+
+    // First, registers the first type-parameterized test in the type
+    // list.
+    MakeAndRegisterTestInfo(
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name +
+         "/" + type_names[index])
+            .c_str(),
+        StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
+        GetTypeName<Type>().c_str(),
+        NULL,  // No value parameter.
+        code_location, GetTypeId<FixtureClass>(), TestClass::SetUpTestCase,
+        TestClass::TearDownTestCase, new TestFactoryImpl<TestClass>);
+
+    // Next, recurses (at compile time) with the tail of the type list.
+    return TypeParameterizedTest<Fixture, TestSel,
+                                 typename Types::Tail>::Register(prefix,
+                                                                 code_location,
+                                                                 case_name,
+                                                                 test_names,
+                                                                 index + 1,
+                                                                 type_names);
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, class TestSel>
+class TypeParameterizedTest<Fixture, TestSel, Types0> {
+ public:
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
+                       const char* /*case_name*/, const char* /*test_names*/,
+                       int /*index*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
+    return true;
+  }
+};
+
+// TypeParameterizedTestCase<Fixture, Tests, Types>::Register()
+// registers *all combinations* of 'Tests' and 'Types' with Google
+// Test.  The return value is insignificant - we just need to return
+// something such that we can call this function in a namespace scope.
+template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
+class TypeParameterizedTestCase {
+ public:
+  static bool Register(const char* prefix, CodeLocation code_location,
+                       const TypedTestCasePState* state, const char* case_name,
+                       const char* test_names,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
+    std::string test_name = StripTrailingSpaces(
+        GetPrefixUntilComma(test_names));
+    if (!state->TestExists(test_name)) {
+      fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
+              case_name, test_name.c_str(),
+              FormatFileLocation(code_location.file.c_str(),
+                                 code_location.line).c_str());
+      fflush(stderr);
+      posix::Abort();
+    }
+    const CodeLocation& test_location = state->GetCodeLocation(test_name);
+
+    typedef typename Tests::Head Head;
+
+    // First, register the first test in 'Test' for each type in 'Types'.
+    TypeParameterizedTest<Fixture, Head, Types>::Register(
+        prefix, test_location, case_name, test_names, 0, type_names);
+
+    // Next, recurses (at compile time) with the tail of the test list.
+    return TypeParameterizedTestCase<Fixture, typename Tests::Tail,
+                                     Types>::Register(prefix, code_location,
+                                                      state, case_name,
+                                                      SkipComma(test_names),
+                                                      type_names);
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, typename Types>
+class TypeParameterizedTestCase<Fixture, Templates0, Types> {
+ public:
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
+                       const TypedTestCasePState* /*state*/,
+                       const char* /*case_name*/, const char* /*test_names*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
+    return true;
+  }
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
+    UnitTest* unit_test, int skip_count);
+
+// Helpers for suppressing warnings on unreachable code or constant
+// condition.
+
+// Always returns true.
+GTEST_API_ bool AlwaysTrue();
+
+// Always returns false.
+inline bool AlwaysFalse() { return !AlwaysTrue(); }
+
+// Helper for suppressing false warning from Clang on a const char*
+// variable declared in a conditional expression always being NULL in
+// the else branch.
+struct GTEST_API_ ConstCharPtr {
+  ConstCharPtr(const char* str) : value(str) {}
+  operator bool() const { return true; }
+  const char* value;
+};
+
+// A simple Linear Congruential Generator for generating random
+// numbers with a uniform distribution.  Unlike rand() and srand(), it
+// doesn't use global state (and therefore can't interfere with user
+// code).  Unlike rand_r(), it's portable.  An LCG isn't very random,
+// but it's good enough for our purposes.
+class GTEST_API_ Random {
+ public:
+  static const UInt32 kMaxRange = 1u << 31;
+
+  explicit Random(UInt32 seed) : state_(seed) {}
+
+  void Reseed(UInt32 seed) { state_ = seed; }
+
+  // Generates a random number from [0, range).  Crashes if 'range' is
+  // 0 or greater than kMaxRange.
+  UInt32 Generate(UInt32 range);
+
+ private:
+  UInt32 state_;
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+};
+
+// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a
+// compiler error iff T1 and T2 are different types.
+template <typename T1, typename T2>
+struct CompileAssertTypesEqual;
+
+template <typename T>
+struct CompileAssertTypesEqual<T, T> {
+};
+
+// Removes the reference from a type if it is a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::remove_reference, which is not widely available yet.
+template <typename T>
+struct RemoveReference { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveReference<T&> { typedef T type; };  // NOLINT
+
+// A handy wrapper around RemoveReference that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_REFERENCE_(T) \
+    typename ::testing::internal::RemoveReference<T>::type
+
+// Removes const from a type if it is a const type, otherwise leaves
+// it unchanged.  This is the same as tr1::remove_const, which is not
+// widely available yet.
+template <typename T>
+struct RemoveConst { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveConst<const T> { typedef T type; };  // NOLINT
+
+// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
+// definition to fail to remove the const in 'const int[3]' and 'const
+// char[3][4]'.  The following specialization works around the bug.
+template <typename T, size_t N>
+struct RemoveConst<const T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+
+#if defined(_MSC_VER) && _MSC_VER < 1400
+// This is the only specialization that allows VC++ 7.1 to remove const in
+// 'const int[3] and 'const int[3][4]'.  However, it causes trouble with GCC
+// and thus needs to be conditionally compiled.
+template <typename T, size_t N>
+struct RemoveConst<T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+#endif
+
+// A handy wrapper around RemoveConst that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_CONST_(T) \
+    typename ::testing::internal::RemoveConst<T>::type
+
+// Turns const U&, U&, const U, and U all into U.
+#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
+    GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
+
+// ImplicitlyConvertible<From, To>::value is a compile-time bool
+// constant that's true iff type From can be implicitly converted to
+// type To.
+template <typename From, typename To>
+class ImplicitlyConvertible {
+ private:
+  // We need the following helper functions only for their types.
+  // They have no implementations.
+
+  // MakeFrom() is an expression whose type is From.  We cannot simply
+  // use From(), as the type From may not have a public default
+  // constructor.
+  static typename AddReference<From>::type MakeFrom();
+
+  // These two functions are overloaded.  Given an expression
+  // Helper(x), the compiler will pick the first version if x can be
+  // implicitly converted to type To; otherwise it will pick the
+  // second version.
+  //
+  // The first version returns a value of size 1, and the second
+  // version returns a value of size 2.  Therefore, by checking the
+  // size of Helper(x), which can be done at compile time, we can tell
+  // which version of Helper() is used, and hence whether x can be
+  // implicitly converted to type To.
+  static char Helper(To);
+  static char (&Helper(...))[2];  // NOLINT
+
+  // We have to put the 'public' section after the 'private' section,
+  // or MSVC refuses to compile the code.
+ public:
+#if defined(__BORLANDC__)
+  // C++Builder cannot use member overload resolution during template
+  // instantiation.  The simplest workaround is to use its C++0x type traits
+  // functions (C++Builder 2009 and above only).
+  static const bool value = __is_convertible(From, To);
+#else
+  // MSVC warns about implicitly converting from double to int for
+  // possible loss of data, so we need to temporarily disable the
+  // warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4244)
+  static const bool value =
+      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif  // __BORLANDC__
+};
+template <typename From, typename To>
+const bool ImplicitlyConvertible<From, To>::value;
+
+// IsAProtocolMessage<T>::value is a compile-time bool constant that's
+// true iff T is type ProtocolMessage, proto2::Message, or a subclass
+// of those.
+template <typename T>
+struct IsAProtocolMessage
+    : public bool_constant<
+  ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value ||
+  ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> {
+};
+
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them).  It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0.  If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked.  Therefore, we can determine whether C is
+// a container class by checking the type of IsContainerTest<C>(0).
+// The value of the expression is insignificant.
+//
+// In C++11 mode we check the existence of a const_iterator and that an
+// iterator is properly implemented for the container.
+//
+// For pre-C++11 that we look for both C::iterator and C::const_iterator.
+// The reason is that C++ injects the name of a class as a member of the
+// class itself (e.g. you can refer to class iterator as either
+// 'iterator' or 'iterator::iterator').  If we look for C::iterator
+// only, for example, we would mistakenly think that a class named
+// iterator is an STL container.
+//
+// Also note that the simpler approach of overloading
+// IsContainerTest(typename C::const_iterator*) and
+// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
+typedef int IsContainer;
+#if GTEST_LANG_CXX11
+template <class C,
+          class Iterator = decltype(::std::declval<const C&>().begin()),
+          class = decltype(::std::declval<const C&>().end()),
+          class = decltype(++::std::declval<Iterator&>()),
+          class = decltype(*::std::declval<Iterator>()),
+          class = typename C::const_iterator>
+IsContainer IsContainerTest(int /* dummy */) {
+  return 0;
+}
+#else
+template <class C>
+IsContainer IsContainerTest(int /* dummy */,
+                            typename C::iterator* /* it */ = NULL,
+                            typename C::const_iterator* /* const_it */ = NULL) {
+  return 0;
+}
+#endif  // GTEST_LANG_CXX11
+
+typedef char IsNotContainer;
+template <class C>
+IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+
+// Trait to detect whether a type T is a hash table.
+// The heuristic used is that the type contains an inner type `hasher` and does
+// not contain an inner type `reverse_iterator`.
+// If the container is iterable in reverse, then order might actually matter.
+template <typename T>
+struct IsHashTable {
+ private:
+  template <typename U>
+  static char test(typename U::hasher*, typename U::reverse_iterator*);
+  template <typename U>
+  static int test(typename U::hasher*, ...);
+  template <typename U>
+  static char test(...);
+
+ public:
+  static const bool value = sizeof(test<T>(0, 0)) == sizeof(int);
+};
+
+template <typename T>
+const bool IsHashTable<T>::value;
+
+template<typename T>
+struct VoidT {
+    typedef void value_type;
+};
+
+template <typename T, typename = void>
+struct HasValueType : false_type {};
+template <typename T>
+struct HasValueType<T, VoidT<typename T::value_type> > : true_type {
+};
+
+template <typename C,
+          bool = sizeof(IsContainerTest<C>(0)) == sizeof(IsContainer),
+          bool = HasValueType<C>::value>
+struct IsRecursiveContainerImpl;
+
+template <typename C, bool HV>
+struct IsRecursiveContainerImpl<C, false, HV> : public false_type {};
+
+// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to
+// obey the same inconsistencies as the IsContainerTest, namely check if
+// something is a container is relying on only const_iterator in C++11 and
+// is relying on both const_iterator and iterator otherwise
+template <typename C>
+struct IsRecursiveContainerImpl<C, true, false> : public false_type {};
+
+template <typename C>
+struct IsRecursiveContainerImpl<C, true, true> {
+  #if GTEST_LANG_CXX11
+  typedef typename IteratorTraits<typename C::const_iterator>::value_type
+      value_type;
+#else
+  typedef typename IteratorTraits<typename C::iterator>::value_type value_type;
+#endif
+  typedef is_same<value_type, C> type;
+};
+
+// IsRecursiveContainer<Type> is a unary compile-time predicate that
+// evaluates whether C is a recursive container type. A recursive container
+// type is a container type whose value_type is equal to the container type
+// itself. An example for a recursive container type is
+// boost::filesystem::path, whose iterator has a value_type that is equal to
+// boost::filesystem::path.
+template <typename C>
+struct IsRecursiveContainer : public IsRecursiveContainerImpl<C>::type {};
+
+// EnableIf<condition>::type is void when 'Cond' is true, and
+// undefined when 'Cond' is false.  To use SFINAE to make a function
+// overload only apply when a particular expression is true, add
+// "typename EnableIf<expression>::type* = 0" as the last parameter.
+template<bool> struct EnableIf;
+template<> struct EnableIf<true> { typedef void type; };  // NOLINT
+
+// Utilities for native arrays.
+
+// ArrayEq() compares two k-dimensional native arrays using the
+// elements' operator==, where k can be any integer >= 0.  When k is
+// 0, ArrayEq() degenerates into comparing a single pair of values.
+
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+  return internal::ArrayEq(lhs, N, rhs);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous ArrayEq() function, arrays with different sizes would
+// lead to different copies of the template code.
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+  for (size_t i = 0; i != size; i++) {
+    if (!internal::ArrayEq(lhs[i], rhs[i]))
+      return false;
+  }
+  return true;
+}
+
+// Finds the first element in the iterator range [begin, end) that
+// equals elem.  Element may be a native array type itself.
+template <typename Iter, typename Element>
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+  for (Iter it = begin; it != end; ++it) {
+    if (internal::ArrayEq(*it, elem))
+      return it;
+  }
+  return end;
+}
+
+// CopyArray() copies a k-dimensional native array using the elements'
+// operator=, where k can be any integer >= 0.  When k is 0,
+// CopyArray() degenerates into copying a single value.
+
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline void CopyArray(const T& from, U* to) { *to = from; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+  internal::CopyArray(from, N, *to);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous CopyArray() function, arrays with different sizes
+// would lead to different copies of the template code.
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to) {
+  for (size_t i = 0; i != size; i++) {
+    internal::CopyArray(from[i], to + i);
+  }
+}
+
+// The relation between an NativeArray object (see below) and the
+// native array it represents.
+// We use 2 different structs to allow non-copyable types to be used, as long
+// as RelationToSourceReference() is passed.
+struct RelationToSourceReference {};
+struct RelationToSourceCopy {};
+
+// Adapts a native array to a read-only STL-style container.  Instead
+// of the complete STL container concept, this adaptor only implements
+// members useful for Google Mock's container matchers.  New members
+// should be added as needed.  To simplify the implementation, we only
+// support Element being a raw type (i.e. having no top-level const or
+// reference modifier).  It's the client's responsibility to satisfy
+// this requirement.  Element can be an array type itself (hence
+// multi-dimensional arrays are supported).
+template <typename Element>
+class NativeArray {
+ public:
+  // STL-style container typedefs.
+  typedef Element value_type;
+  typedef Element* iterator;
+  typedef const Element* const_iterator;
+
+  // Constructs from a native array. References the source.
+  NativeArray(const Element* array, size_t count, RelationToSourceReference) {
+    InitRef(array, count);
+  }
+
+  // Constructs from a native array. Copies the source.
+  NativeArray(const Element* array, size_t count, RelationToSourceCopy) {
+    InitCopy(array, count);
+  }
+
+  // Copy constructor.
+  NativeArray(const NativeArray& rhs) {
+    (this->*rhs.clone_)(rhs.array_, rhs.size_);
+  }
+
+  ~NativeArray() {
+    if (clone_ != &NativeArray::InitRef)
+      delete[] array_;
+  }
+
+  // STL-style container methods.
+  size_t size() const { return size_; }
+  const_iterator begin() const { return array_; }
+  const_iterator end() const { return array_ + size_; }
+  bool operator==(const NativeArray& rhs) const {
+    return size() == rhs.size() &&
+        ArrayEq(begin(), size(), rhs.begin());
+  }
+
+ private:
+  enum {
+    kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper<
+        Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value
+  };
+
+  // Initializes this object with a copy of the input.
+  void InitCopy(const Element* array, size_t a_size) {
+    Element* const copy = new Element[a_size];
+    CopyArray(array, a_size, copy);
+    array_ = copy;
+    size_ = a_size;
+    clone_ = &NativeArray::InitCopy;
+  }
+
+  // Initializes this object with a reference of the input.
+  void InitRef(const Element* array, size_t a_size) {
+    array_ = array;
+    size_ = a_size;
+    clone_ = &NativeArray::InitRef;
+  }
+
+  const Element* array_;
+  size_t size_;
+  void (NativeArray::*clone_)(const Element*, size_t);
+
+  GTEST_DISALLOW_ASSIGN_(NativeArray);
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+  ::testing::internal::AssertHelper(result_type, file, line, message) \
+    = ::testing::Message()
+
+#define GTEST_MESSAGE_(message, result_type) \
+  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
+
+#define GTEST_FATAL_FAILURE_(message) \
+  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
+
+#define GTEST_NONFATAL_FAILURE_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
+
+#define GTEST_SUCCESS_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
+
+// Suppress MSVC warning 4702 (unreachable code) for the code following
+// statement if it returns or throws (or doesn't return or throw in some
+// situations).
+#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
+  if (::testing::internal::AlwaysTrue()) { statement; }
+
+#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
+    bool gtest_caught_expected = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (expected_exception const&) { \
+      gtest_caught_expected = true; \
+    } \
+    catch (...) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws a different type."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+    if (!gtest_caught_expected) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws nothing."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
+      fail(gtest_msg.value)
+
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
+      fail("Expected: " #statement " doesn't throw an exception.\n" \
+           "  Actual: it throws.")
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    bool gtest_caught_any = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      gtest_caught_any = true; \
+    } \
+    if (!gtest_caught_any) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
+      fail("Expected: " #statement " throws an exception.\n" \
+           "  Actual: it doesn't.")
+
+
+// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
+// either a boolean expression or an AssertionResult. text is a textual
+// represenation of expression as it was passed into the EXPECT_TRUE.
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar_ = \
+      ::testing::AssertionResult(expression)) \
+    ; \
+  else \
+    fail(::testing::internal::GetBoolAssertionFailureMessage(\
+        gtest_ar_, text, #actual, #expected).c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
+      fail("Expected: " #statement " doesn't generate new fatal " \
+           "failures in the current thread.\n" \
+           "  Actual: it does.")
+
+// Expands to the name of the class that implements the given test.
+#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+  test_case_name##_##test_name##_Test
+
+// Helper macro for defining tests.
+#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\
+class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
+ public:\
+  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
+ private:\
+  virtual void TestBody();\
+  static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\
+};\
+\
+::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\
+  ::test_info_ =\
+    ::testing::internal::MakeAndRegisterTestInfo(\
+        #test_case_name, #test_name, NULL, NULL, \
+        ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+        (parent_id), \
+        parent_class::SetUpTestCase, \
+        parent_class::TearDownTestCase, \
+        new ::testing::internal::TestFactoryImpl<\
+            GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
+void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the public API for death tests.  It is
+// #included by gtest.h so a user doesn't need to include this
+// directly.
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests.  They are subject to change without notice.
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+
+#include <stdio.h>
+
+namespace testing {
+namespace internal {
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro.  It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status:  The integer exit information in the format specified
+//               by wait(2)
+// exit code:    The integer code passed to exit(3), _exit(2), or
+//               returned from main()
+class GTEST_API_ DeathTest {
+ public:
+  // Create returns false if there was an error determining the
+  // appropriate action to take for the current death test; for example,
+  // if the gtest_death_test_style flag is set to an invalid value.
+  // The LastMessage method will return a more detailed message in that
+  // case.  Otherwise, the DeathTest pointer pointed to by the "test"
+  // argument is set.  If the death test should be skipped, the pointer
+  // is set to NULL; otherwise, it is set to the address of a new concrete
+  // DeathTest object that controls the execution of the current test.
+  static bool Create(const char* statement, const RE* regex,
+                     const char* file, int line, DeathTest** test);
+  DeathTest();
+  virtual ~DeathTest() { }
+
+  // A helper class that aborts a death test when it's deleted.
+  class ReturnSentinel {
+   public:
+    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+   private:
+    DeathTest* const test_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+  } GTEST_ATTRIBUTE_UNUSED_;
+
+  // An enumeration of possible roles that may be taken when a death
+  // test is encountered.  EXECUTE means that the death test logic should
+  // be executed immediately.  OVERSEE means that the program should prepare
+  // the appropriate environment for a child process to execute the death
+  // test, then wait for it to complete.
+  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+  // An enumeration of the three reasons that a test might be aborted.
+  enum AbortReason {
+    TEST_ENCOUNTERED_RETURN_STATEMENT,
+    TEST_THREW_EXCEPTION,
+    TEST_DID_NOT_DIE
+  };
+
+  // Assumes one of the above roles.
+  virtual TestRole AssumeRole() = 0;
+
+  // Waits for the death test to finish and returns its status.
+  virtual int Wait() = 0;
+
+  // Returns true if the death test passed; that is, the test process
+  // exited during the test, its exit status matches a user-supplied
+  // predicate, and its stderr output matches a user-supplied regular
+  // expression.
+  // The user-supplied predicate may be a macro expression rather
+  // than a function pointer or functor, or else Wait and Passed could
+  // be combined.
+  virtual bool Passed(bool exit_status_ok) = 0;
+
+  // Signals that the death test did not die as expected.
+  virtual void Abort(AbortReason reason) = 0;
+
+  // Returns a human-readable outcome message regarding the outcome of
+  // the last death test.
+  static const char* LastMessage();
+
+  static void set_last_death_test_message(const std::string& message);
+
+ private:
+  // A string containing a description of the outcome of the last death test.
+  static std::string last_death_test_message_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+// Factory interface for death tests.  May be mocked out for testing.
+class DeathTestFactory {
+ public:
+  virtual ~DeathTestFactory() { }
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test) = 0;
+};
+
+// A concrete DeathTestFactory implementation for normal use.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test);
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// Traps C++ exceptions escaping statement and reports them as test
+// failures. Note that trapping SEH exceptions is not implemented here.
+# if GTEST_HAS_EXCEPTIONS
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  try { \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } catch (const ::std::exception& gtest_exception) { \
+    fprintf(\
+        stderr, \
+        "\n%s: Caught std::exception-derived exception escaping the " \
+        "death test statement. Exception message: %s\n", \
+        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+        gtest_exception.what()); \
+    fflush(stderr); \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  } catch (...) { \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  }
+
+# else
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+# endif
+
+// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
+// ASSERT_EXIT*, and EXPECT_EXIT*.
+# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    const ::testing::internal::RE& gtest_regex = (regex); \
+    ::testing::internal::DeathTest* gtest_dt; \
+    if (!::testing::internal::DeathTest::Create(#statement, &gtest_regex, \
+        __FILE__, __LINE__, &gtest_dt)) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+    } \
+    if (gtest_dt != NULL) { \
+      ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \
+          gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) { \
+        case ::testing::internal::DeathTest::OVERSEE_TEST: \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+          } \
+          break; \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: { \
+          ::testing::internal::DeathTest::ReturnSentinel \
+              gtest_sentinel(gtest_dt); \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
+          break; \
+        } \
+        default: \
+          break; \
+      } \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \
+      fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode. In this case we need the statements to be executed and the macro
+// must accept a streamed message even though the message is never printed.
+// The regex object is not evaluated, but it is used to prevent "unused"
+// warnings and to avoid an expression that doesn't compile in debug mode.
+#define GTEST_EXECUTE_STATEMENT_(statement, regex)             \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                \
+  if (::testing::internal::AlwaysTrue()) {                     \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } else if (!::testing::internal::AlwaysTrue()) {             \
+    const ::testing::internal::RE& gtest_regex = (regex);      \
+    static_cast<void>(gtest_regex);                            \
+  } else                                                       \
+    ::testing::Message()
+
+// A class representing the parsed contents of the
+// --gtest_internal_run_death_test flag, as it existed when
+// RUN_ALL_TESTS was called.
+class InternalRunDeathTestFlag {
+ public:
+  InternalRunDeathTestFlag(const std::string& a_file,
+                           int a_line,
+                           int an_index,
+                           int a_write_fd)
+      : file_(a_file), line_(a_line), index_(an_index),
+        write_fd_(a_write_fd) {}
+
+  ~InternalRunDeathTestFlag() {
+    if (write_fd_ >= 0)
+      posix::Close(write_fd_);
+  }
+
+  const std::string& file() const { return file_; }
+  int line() const { return line_; }
+  int index() const { return index_; }
+  int write_fd() const { return write_fd_; }
+
+ private:
+  std::string file_;
+  int line_;
+  int index_;
+  int write_fd_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+};
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+namespace testing {
+
+// This flag controls the style of death tests.  Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+GTEST_API_ bool InDeathTestChild();
+
+}  // namespace internal
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+//   1. It generates a warning if there is more than one active
+//   thread.  This is because it's safe to fork() or clone() only
+//   when there is a single thread.
+//
+//   2. The parent process clone()s a sub-process and runs the death
+//   test in it; the sub-process exits with code 0 at the end of the
+//   death test, if it hasn't exited already.
+//
+//   3. The parent process waits for the sub-process to terminate.
+//
+//   4. The parent process checks the exit code and error message of
+//   the sub-process.
+//
+// Examples:
+//
+//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+//   for (int i = 0; i < 5; i++) {
+//     EXPECT_DEATH(server.ProcessRequest(i),
+//                  "Invalid request .* in ProcessRequest()")
+//                  << "Failed to die on request " << i;
+//   }
+//
+//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+//   bool KilledBySIGHUP(int exit_code) {
+//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+//   }
+//
+//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+//   GOOGLETEST_CM0005 DO NOT DELETE
+//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
+//   which uses the POSIX extended regex syntax.
+//
+//   On other platforms (e.g. Windows or Mac), we only support a simple regex
+//   syntax implemented as part of Google Test.  This limited
+//   implementation should be enough most of the time when writing
+//   death tests; though it lacks many features you can find in PCRE
+//   or POSIX extended regex syntax.  For example, we don't support
+//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+//   repetition count ("x{5,7}"), among others.
+//
+//   Below is the syntax that we do support.  We chose it to be a
+//   subset of both PCRE and POSIX extended regex, so it's easy to
+//   learn wherever you come from.  In the following: 'A' denotes a
+//   literal character, period (.), or a single \\ escape sequence;
+//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
+//   natural numbers.
+//
+//     c     matches any literal character c
+//     \\d   matches any decimal digit
+//     \\D   matches any character that's not a decimal digit
+//     \\f   matches \f
+//     \\n   matches \n
+//     \\r   matches \r
+//     \\s   matches any ASCII whitespace, including \n
+//     \\S   matches any character that's not a whitespace
+//     \\t   matches \t
+//     \\v   matches \v
+//     \\w   matches any letter, _, or decimal digit
+//     \\W   matches any character that \\w doesn't match
+//     \\c   matches any literal character c, which must be a punctuation
+//     .     matches any single character except \n
+//     A?    matches 0 or 1 occurrences of A
+//     A*    matches 0 or many occurrences of A
+//     A+    matches 1 or many occurrences of A
+//     ^     matches the beginning of a string (not that of each line)
+//     $     matches the end of a string (not that of each line)
+//     xy    matches x followed by y
+//
+//   If you accidentally use PCRE or POSIX extended regex features
+//   not implemented by us, you will get a run-time failure.  In that
+//   case, please try to rewrite your regular expression within the
+//   above syntax.
+//
+//   This implementation is *not* meant to be as highly tuned or robust
+//   as a compiled regex library, but should perform well enough for a
+//   death test, which already incurs significant overhead by launching
+//   a child process.
+//
+// Known caveats:
+//
+//   A "threadsafe" style death test obtains the path to the test
+//   program from argv[0] and re-executes it in the sub-process.  For
+//   simplicity, the current implementation doesn't search the PATH
+//   when launching the sub-process.  This means that the user must
+//   invoke the test program via a path that contains at least one
+//   path separator (e.g. path/to/foo_test and
+//   /absolute/path/to/bar_test are fine, but foo_test is not).  This
+//   is rarely a problem as people usually don't put the test binary
+//   directory in PATH.
+//
+// FIXME: make thread-safe death tests search the PATH.
+
+// Asserts that a given statement causes the program to exit, with an
+// integer exit status that satisfies predicate, and emitting error output
+// that matches regex.
+# define ASSERT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+
+// Like ASSERT_EXIT, but continues on to successive tests in the
+// test case, if any:
+# define EXPECT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+
+// Asserts that a given statement causes the program to exit, either by
+// explicitly exiting with a nonzero exit code or being killed by a
+// signal, and emitting error output that matches regex.
+# define ASSERT_DEATH(statement, regex) \
+    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Like ASSERT_DEATH, but continues on to successive tests in the
+// test case, if any:
+# define EXPECT_DEATH(statement, regex) \
+    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
+
+// Tests that an exit code describes a normal exit with a given exit code.
+class GTEST_API_ ExitedWithCode {
+ public:
+  explicit ExitedWithCode(int exit_code);
+  bool operator()(int exit_status) const;
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ExitedWithCode& other);
+
+  const int exit_code_;
+};
+
+# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+// Tests that an exit code describes an exit due to termination by a
+// given signal.
+// GOOGLETEST_CM0006 DO NOT DELETE
+class GTEST_API_ KilledBySignal {
+ public:
+  explicit KilledBySignal(int signum);
+  bool operator()(int exit_status) const;
+ private:
+  const int signum_;
+};
+# endif  // !GTEST_OS_WINDOWS
+
+// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
+// The death testing framework causes this to have interesting semantics,
+// since the sideeffects of the call are only visible in opt mode, and not
+// in debug mode.
+//
+// In practice, this can be used to test functions that utilize the
+// LOG(DFATAL) macro using the following style:
+//
+// int DieInDebugOr12(int* sideeffect) {
+//   if (sideeffect) {
+//     *sideeffect = 12;
+//   }
+//   LOG(DFATAL) << "death";
+//   return 12;
+// }
+//
+// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
+//   int sideeffect = 0;
+//   // Only asserts in dbg.
+//   EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
+//
+// #ifdef NDEBUG
+//   // opt-mode has sideeffect visible.
+//   EXPECT_EQ(12, sideeffect);
+// #else
+//   // dbg-mode no visible sideeffect.
+//   EXPECT_EQ(0, sideeffect);
+// #endif
+// }
+//
+// This will assert that DieInDebugReturn12InOpt() crashes in debug
+// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
+// appropriate fallback value (12 in this case) in opt mode. If you
+// need to test that a function has appropriate side-effects in opt
+// mode, include assertions against the side-effects.  A general
+// pattern for this is:
+//
+// EXPECT_DEBUG_DEATH({
+//   // Side-effects here will have an effect after this statement in
+//   // opt mode, but none in debug mode.
+//   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
+// }, "death");
+//
+# ifdef NDEBUG
+
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+# else
+
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  ASSERT_DEATH(statement, regex)
+
+# endif  // NDEBUG for EXPECT_DEBUG_DEATH
+#endif  // GTEST_HAS_DEATH_TEST
+
+// This macro is used for implementing macros such as
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
+// death tests are not supported. Those macros must compile on such systems
+// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
+// systems that support death tests. This allows one to write such a macro
+// on a system that does not support death tests and be sure that it will
+// compile on a death-test supporting system. It is exposed publicly so that
+// systems that have death-tests with stricter requirements than
+// GTEST_HAS_DEATH_TEST can write their own equivalent of
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED.
+//
+// Parameters:
+//   statement -  A statement that a macro such as EXPECT_DEATH would test
+//                for program termination. This macro has to make sure this
+//                statement is compiled but not executed, to ensure that
+//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
+//                parameter iff EXPECT_DEATH compiles with it.
+//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
+//                the output of statement.  This parameter has to be
+//                compiled but not evaluated by this macro, to ensure that
+//                this macro only accepts expressions that a macro such as
+//                EXPECT_DEATH would accept.
+//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
+//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
+//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
+//                compile inside functions where ASSERT_DEATH doesn't
+//                compile.
+//
+//  The branch that has an always false condition is used to ensure that
+//  statement and regex are compiled (and thus syntactically correct) but
+//  never executed. The unreachable code macro protects the terminator
+//  statement from generating an 'unreachable code' warning in case
+//  statement unconditionally returns or throws. The Message constructor at
+//  the end allows the syntax of streaming additional messages into the
+//  macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
+# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::AlwaysTrue()) { \
+      GTEST_LOG_(WARNING) \
+          << "Death tests are not supported on this platform.\n" \
+          << "Statement '" #statement "' cannot be verified."; \
+    } else if (::testing::internal::AlwaysFalse()) { \
+      ::testing::internal::RE::PartialMatch(".*", (regex)); \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+      terminator; \
+    } else \
+      ::testing::Message()
+
+// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
+// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
+// death tests are supported; otherwise they just issue a warning.  This is
+// useful when you are combining death test assertions with normal test
+// assertions in one test.
+#if GTEST_HAS_DEATH_TEST
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    EXPECT_DEATH(statement, regex)
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    ASSERT_DEATH(statement, regex)
+#else
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
+#endif
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-test.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// GOOGLETEST_CM0001 DO NOT DELETE
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+  // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+  // Inside a test, access the test parameter with the GetParam() method
+  // of the TestWithParam<T> class:
+  EXPECT_TRUE(foo.Blah(GetParam()));
+  ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+  ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a summary of them, which
+// are all in the testing namespace:
+//
+//
+//  Range(begin, end [, step]) - Yields values {begin, begin+step,
+//                               begin+step+step, ...}. The values do not
+//                               include end. step defaults to 1.
+//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
+//  ValuesIn(container)        - Yields values from a C-style array, an STL
+//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
+//  Bool()                     - Yields sequence {false, true}.
+//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
+//                               for the math savvy) of the values generated
+//                               by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test case
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_CASE_P(InstantiationName,
+                        FooTest,
+                        Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern, (yes, you
+// can instantiate it more then once) the first argument to the
+// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
+// actual test case name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+//    * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+//    * InstantiationName/FooTest.DoesBlah/1 for "miny"
+//    * InstantiationName/FooTest.DoesBlah/2 for "moe"
+//    * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+//    * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+//    * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+//    * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
+// in the given test case, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_CASE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user on one hand, to adjust generator parameters in order
+// to dynamically determine a set of tests to run and on the other hand,
+// give the user a chance to inspect the generated tests with Google Test
+// reflection API before RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+  // You can inherit all the usual members for a non-parameterized test
+  // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+  // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+  // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+  // GetParam works just the same here as if you inherit from TestWithParam.
+  EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif  // 0
+
+
+#if !GTEST_OS_SYMBIAN
+# include <utility>
+#endif
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Type and function utilities for implementing parameterized tests.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+
+#include <ctype.h>
+
+#include <iterator>
+#include <set>
+#include <utility>
+#include <vector>
+
+// Copyright 2003 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// A "smart" pointer type with reference tracking.  Every pointer to a
+// particular object is kept on a circular linked list.  When the last pointer
+// to an object is destroyed or reassigned, the object is deleted.
+//
+// Used properly, this deletes the object when the last reference goes away.
+// There are several caveats:
+// - Like all reference counting schemes, cycles lead to leaks.
+// - Each smart pointer is actually two pointers (8 bytes instead of 4).
+// - Every time a pointer is assigned, the entire list of pointers to that
+//   object is traversed.  This class is therefore NOT SUITABLE when there
+//   will often be more than two or three pointers to a particular object.
+// - References are only tracked as long as linked_ptr<> objects are copied.
+//   If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS
+//   will happen (double deletion).
+//
+// A good use of this class is storing object references in STL containers.
+// You can safely put linked_ptr<> in a vector<>.
+// Other uses may not be as good.
+//
+// Note: If you use an incomplete type with linked_ptr<>, the class
+// *containing* linked_ptr<> must have a constructor and destructor (even
+// if they do nothing!).
+//
+// Bill Gibbons suggested we use something like this.
+//
+// Thread Safety:
+//   Unlike other linked_ptr implementations, in this implementation
+//   a linked_ptr object is thread-safe in the sense that:
+//     - it's safe to copy linked_ptr objects concurrently,
+//     - it's safe to copy *from* a linked_ptr and read its underlying
+//       raw pointer (e.g. via get()) concurrently, and
+//     - it's safe to write to two linked_ptrs that point to the same
+//       shared object concurrently.
+// FIXME: rename this to safe_linked_ptr to avoid
+// confusion with normal linked_ptr.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+
+#include <stdlib.h>
+#include <assert.h>
+
+
+namespace testing {
+namespace internal {
+
+// Protects copying of all linked_ptr objects.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// This is used internally by all instances of linked_ptr<>.  It needs to be
+// a non-template class because different types of linked_ptr<> can refer to
+// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)).
+// So, it needs to be possible for different types of linked_ptr to participate
+// in the same circular linked list, so we need a single class type here.
+//
+// DO NOT USE THIS CLASS DIRECTLY YOURSELF.  Use linked_ptr<T>.
+class linked_ptr_internal {
+ public:
+  // Create a new circle that includes only this instance.
+  void join_new() {
+    next_ = this;
+  }
+
+  // Many linked_ptr operations may change p.link_ for some linked_ptr
+  // variable p in the same circle as this object.  Therefore we need
+  // to prevent two such operations from occurring concurrently.
+  //
+  // Note that different types of linked_ptr objects can coexist in a
+  // circle (e.g. linked_ptr<Base>, linked_ptr<Derived1>, and
+  // linked_ptr<Derived2>).  Therefore we must use a single mutex to
+  // protect all linked_ptr objects.  This can create serious
+  // contention in production code, but is acceptable in a testing
+  // framework.
+
+  // Join an existing circle.
+  void join(linked_ptr_internal const* ptr)
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    linked_ptr_internal const* p = ptr;
+    while (p->next_ != ptr) {
+      assert(p->next_ != this &&
+             "Trying to join() a linked ring we are already in. "
+             "Is GMock thread safety enabled?");
+      p = p->next_;
+    }
+    p->next_ = this;
+    next_ = ptr;
+  }
+
+  // Leave whatever circle we're part of.  Returns true if we were the
+  // last member of the circle.  Once this is done, you can join() another.
+  bool depart()
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    if (next_ == this) return true;
+    linked_ptr_internal const* p = next_;
+    while (p->next_ != this) {
+      assert(p->next_ != next_ &&
+             "Trying to depart() a linked ring we are not in. "
+             "Is GMock thread safety enabled?");
+      p = p->next_;
+    }
+    p->next_ = next_;
+    return false;
+  }
+
+ private:
+  mutable linked_ptr_internal const* next_;
+};
+
+template <typename T>
+class linked_ptr {
+ public:
+  typedef T element_type;
+
+  // Take over ownership of a raw pointer.  This should happen as soon as
+  // possible after the object is created.
+  explicit linked_ptr(T* ptr = NULL) { capture(ptr); }
+  ~linked_ptr() { depart(); }
+
+  // Copy an existing linked_ptr<>, adding ourselves to the list of references.
+  template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); }
+  linked_ptr(linked_ptr const& ptr) {  // NOLINT
+    assert(&ptr != this);
+    copy(&ptr);
+  }
+
+  // Assignment releases the old value and acquires the new.
+  template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) {
+    depart();
+    copy(&ptr);
+    return *this;
+  }
+
+  linked_ptr& operator=(linked_ptr const& ptr) {
+    if (&ptr != this) {
+      depart();
+      copy(&ptr);
+    }
+    return *this;
+  }
+
+  // Smart pointer members.
+  void reset(T* ptr = NULL) {
+    depart();
+    capture(ptr);
+  }
+  T* get() const { return value_; }
+  T* operator->() const { return value_; }
+  T& operator*() const { return *value_; }
+
+  bool operator==(T* p) const { return value_ == p; }
+  bool operator!=(T* p) const { return value_ != p; }
+  template <typename U>
+  bool operator==(linked_ptr<U> const& ptr) const {
+    return value_ == ptr.get();
+  }
+  template <typename U>
+  bool operator!=(linked_ptr<U> const& ptr) const {
+    return value_ != ptr.get();
+  }
+
+ private:
+  template <typename U>
+  friend class linked_ptr;
+
+  T* value_;
+  linked_ptr_internal link_;
+
+  void depart() {
+    if (link_.depart()) delete value_;
+  }
+
+  void capture(T* ptr) {
+    value_ = ptr;
+    link_.join_new();
+  }
+
+  template <typename U> void copy(linked_ptr<U> const* ptr) {
+    value_ = ptr->get();
+    if (value_)
+      link_.join(&ptr->link_);
+    else
+      link_.join_new();
+  }
+};
+
+template<typename T> inline
+bool operator==(T* ptr, const linked_ptr<T>& x) {
+  return ptr == x.get();
+}
+
+template<typename T> inline
+bool operator!=(T* ptr, const linked_ptr<T>& x) {
+  return ptr != x.get();
+}
+
+// A function to convert T* into linked_ptr<T>
+// Doing e.g. make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation
+// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg))
+template <typename T>
+linked_ptr<T> make_linked_ptr(T* ptr) {
+  return linked_ptr<T>(ptr);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Google Test - The Google C++ Testing and Mocking Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// A user can teach this function how to print a class type T by
+// defining either operator<<() or PrintTo() in the namespace that
+// defines T.  More specifically, the FIRST defined function in the
+// following list will be used (assuming T is defined in namespace
+// foo):
+//
+//   1. foo::PrintTo(const T&, ostream*)
+//   2. operator<<(ostream&, const T&) defined in either foo or the
+//      global namespace.
+//
+// However if T is an STL-style container then it is printed element-wise
+// unless foo::PrintTo(const T&, ostream*) is defined. Note that
+// operator<<() is ignored for container types.
+//
+// If none of the above is defined, it will print the debug string of
+// the value if it is a protocol buffer, or print the raw bytes in the
+// value otherwise.
+//
+// To aid debugging: when T is a reference type, the address of the
+// value is also printed; when T is a (const) char pointer, both the
+// pointer value and the NUL-terminated string it points to are
+// printed.
+//
+// We also provide some convenient wrappers:
+//
+//   // Prints a value to a string.  For a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   std::string ::testing::PrintToString(const T& value);
+//
+//   // Prints a value tersely: for a reference type, the referenced
+//   // value (but not the address) is printed; for a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
+//
+//   // Prints value using the type inferred by the compiler.  The difference
+//   // from UniversalTersePrint() is that this function prints both the
+//   // pointer and the NUL-terminated string for a (const or not) char pointer.
+//   void ::testing::internal::UniversalPrint(const T& value, ostream*);
+//
+//   // Prints the fields of a tuple tersely to a string vector, one
+//   // element for each field. Tuple support must be enabled in
+//   // gtest-port.h.
+//   std::vector<string> UniversalTersePrintTupleFieldsToStrings(
+//       const Tuple& value);
+//
+// Known limitation:
+//
+// The print primitives print the elements of an STL-style container
+// using the compiler-inferred type of *iter where iter is a
+// const_iterator of the container.  When const_iterator is an input
+// iterator but not a forward iterator, this inferred type may not
+// match value_type, and the print output may be incorrect.  In
+// practice, this is rarely a problem as for most containers
+// const_iterator is a forward iterator.  We'll fix this if there's an
+// actual need for it.  Note that this fix cannot rely on value_type
+// being defined as many user-defined container types don't have
+// value_type.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#if GTEST_HAS_STD_TUPLE_
+# include <tuple>
+#endif
+
+#if GTEST_HAS_ABSL
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "absl/types/variant.h"
+#endif  // GTEST_HAS_ABSL
+
+namespace testing {
+
+// Definitions in the 'internal' and 'internal2' name spaces are
+// subject to change without notice.  DO NOT USE THEM IN USER CODE!
+namespace internal2 {
+
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count,
+                                     ::std::ostream* os);
+
+// For selecting which printer to use when a given type has neither <<
+// nor PrintTo().
+enum TypeKind {
+  kProtobuf,              // a protobuf type
+  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
+                          // (e.g. a named or unnamed enum type)
+#if GTEST_HAS_ABSL
+  kConvertibleToStringView,  // a type implicitly convertible to
+                             // absl::string_view
+#endif
+  kOtherType  // anything else
+};
+
+// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
+// by the universal printer to print a value of type T when neither
+// operator<< nor PrintTo() is defined for T, where kTypeKind is the
+// "kind" of T as defined by enum TypeKind.
+template <typename T, TypeKind kTypeKind>
+class TypeWithoutFormatter {
+ public:
+  // This default version is called when kTypeKind is kOtherType.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    PrintBytesInObjectTo(static_cast<const unsigned char*>(
+                             reinterpret_cast<const void*>(&value)),
+                         sizeof(value), os);
+  }
+};
+
+// We print a protobuf using its ShortDebugString() when the string
+// doesn't exceed this many characters; otherwise we print it using
+// DebugString() for better readability.
+const size_t kProtobufOneLinerMaxLength = 50;
+
+template <typename T>
+class TypeWithoutFormatter<T, kProtobuf> {
+ public:
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    std::string pretty_str = value.ShortDebugString();
+    if (pretty_str.length() > kProtobufOneLinerMaxLength) {
+      pretty_str = "\n" + value.DebugString();
+    }
+    *os << ("<" + pretty_str + ">");
+  }
+};
+
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToInteger> {
+ public:
+  // Since T has no << operator or PrintTo() but can be implicitly
+  // converted to BiggestInt, we print it as a BiggestInt.
+  //
+  // Most likely T is an enum type (either named or unnamed), in which
+  // case printing it as an integer is the desired behavior.  In case
+  // T is not an enum, printing it as an integer is the best we can do
+  // given that it has no user-defined printer.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    const internal::BiggestInt kBigInt = value;
+    *os << kBigInt;
+  }
+};
+
+#if GTEST_HAS_ABSL
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToStringView> {
+ public:
+  // Since T has neither operator<< nor PrintTo() but can be implicitly
+  // converted to absl::string_view, we print it as a absl::string_view.
+  //
+  // Note: the implementation is further below, as it depends on
+  // internal::PrintTo symbol which is defined later in the file.
+  static void PrintValue(const T& value, ::std::ostream* os);
+};
+#endif
+
+// Prints the given value to the given ostream.  If the value is a
+// protocol message, its debug string is printed; if it's an enum or
+// of a type implicitly convertible to BiggestInt, it's printed as an
+// integer; otherwise the bytes in the value are printed.  This is
+// what UniversalPrinter<T>::Print() does when it knows nothing about
+// type T and T has neither << operator nor PrintTo().
+//
+// A user can override this behavior for a class type Foo by defining
+// a << operator in the namespace where Foo is defined.
+//
+// We put this operator in namespace 'internal2' instead of 'internal'
+// to simplify the implementation, as much code in 'internal' needs to
+// use << in STL, which would conflict with our own << were it defined
+// in 'internal'.
+//
+// Note that this operator<< takes a generic std::basic_ostream<Char,
+// CharTraits> type instead of the more restricted std::ostream.  If
+// we define it to take an std::ostream instead, we'll get an
+// "ambiguous overloads" compiler error when trying to print a type
+// Foo that supports streaming to std::basic_ostream<Char,
+// CharTraits>, as the compiler cannot tell whether
+// operator<<(std::ostream&, const T&) or
+// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
+// specific.
+template <typename Char, typename CharTraits, typename T>
+::std::basic_ostream<Char, CharTraits>& operator<<(
+    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
+  TypeWithoutFormatter<T, (internal::IsAProtocolMessage<T>::value
+                               ? kProtobuf
+                               : internal::ImplicitlyConvertible<
+                                     const T&, internal::BiggestInt>::value
+                                     ? kConvertibleToInteger
+                                     :
+#if GTEST_HAS_ABSL
+                                     internal::ImplicitlyConvertible<
+                                         const T&, absl::string_view>::value
+                                         ? kConvertibleToStringView
+                                         :
+#endif
+                                         kOtherType)>::PrintValue(x, &os);
+  return os;
+}
+
+}  // namespace internal2
+}  // namespace testing
+
+// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
+// magic needed for implementing UniversalPrinter won't work.
+namespace testing_internal {
+
+// Used to print a value that is not an STL-style container when the
+// user doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
+  // With the following statement, during unqualified name lookup,
+  // testing::internal2::operator<< appears as if it was declared in
+  // the nearest enclosing namespace that contains both
+  // ::testing_internal and ::testing::internal2, i.e. the global
+  // namespace.  For more details, refer to the C++ Standard section
+  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
+  // testing::internal2::operator<< in case T doesn't come with a <<
+  // operator.
+  //
+  // We cannot write 'using ::testing::internal2::operator<<;', which
+  // gcc 3.3 fails to compile due to a compiler bug.
+  using namespace ::testing::internal2;  // NOLINT
+
+  // Assuming T is defined in namespace foo, in the next statement,
+  // the compiler will consider all of:
+  //
+  //   1. foo::operator<< (thanks to Koenig look-up),
+  //   2. ::operator<< (as the current namespace is enclosed in ::),
+  //   3. testing::internal2::operator<< (thanks to the using statement above).
+  //
+  // The operator<< whose type matches T best will be picked.
+  //
+  // We deliberately allow #2 to be a candidate, as sometimes it's
+  // impossible to define #1 (e.g. when foo is ::std, defining
+  // anything in it is undefined behavior unless you are a compiler
+  // vendor.).
+  *os << value;
+}
+
+}  // namespace testing_internal
+
+namespace testing {
+namespace internal {
+
+// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
+// value of type ToPrint that is an operand of a comparison assertion
+// (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
+// the comparison, and is used to help determine the best way to
+// format the value.  In particular, when the value is a C string
+// (char pointer) and the other operand is an STL string object, we
+// want to format the C string as a string, since we know it is
+// compared by value with the string object.  If the value is a char
+// pointer but the other operand is not an STL string object, we don't
+// know whether the pointer is supposed to point to a NUL-terminated
+// string, and thus want to print it as a pointer to be safe.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// The default case.
+template <typename ToPrint, typename OtherOperand>
+class FormatForComparison {
+ public:
+  static ::std::string Format(const ToPrint& value) {
+    return ::testing::PrintToString(value);
+  }
+};
+
+// Array.
+template <typename ToPrint, size_t N, typename OtherOperand>
+class FormatForComparison<ToPrint[N], OtherOperand> {
+ public:
+  static ::std::string Format(const ToPrint* value) {
+    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+  }
+};
+
+// By default, print C string as pointers to be safe, as we don't know
+// whether they actually point to a NUL-terminated string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
+  template <typename OtherOperand>                                      \
+  class FormatForComparison<CharType*, OtherOperand> {                  \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(static_cast<const void*>(value)); \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
+
+// If a C string is compared with an STL string object, we know it's meant
+// to point to a NUL-terminated string, and thus can print it as a string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
+  template <>                                                           \
+  class FormatForComparison<CharType*, OtherStringType> {               \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(value);                           \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string);
+#endif
+
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring);
+#endif
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
+#endif
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
+
+// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc)
+// operand to be used in a failure message.  The type (but not value)
+// of the other operand may affect the format.  This allows us to
+// print a char* as a raw pointer when it is compared against another
+// char* or void*, and print it as a C string when it is compared
+// against an std::string object, for example.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename T1, typename T2>
+std::string FormatForComparisonFailureMessage(
+    const T1& value, const T2& /* other_operand */) {
+  return FormatForComparison<T1, T2>::Format(value);
+}
+
+// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
+// value to the given ostream.  The caller must ensure that
+// 'ostream_ptr' is not NULL, or the behavior is undefined.
+//
+// We define UniversalPrinter as a class template (as opposed to a
+// function template), as we need to partially specialize it for
+// reference types, which cannot be done with function templates.
+template <typename T>
+class UniversalPrinter;
+
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+enum DefaultPrinterType {
+  kPrintContainer,
+  kPrintPointer,
+  kPrintFunctionPointer,
+  kPrintOther,
+};
+template <DefaultPrinterType type> struct WrapPrinterType {};
+
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+template <typename C>
+void DefaultPrintTo(WrapPrinterType<kPrintContainer> /* dummy */,
+                    const C& container, ::std::ostream* os) {
+  const size_t kMaxCount = 32;  // The maximum number of elements to print.
+  *os << '{';
+  size_t count = 0;
+  for (typename C::const_iterator it = container.begin();
+       it != container.end(); ++it, ++count) {
+    if (count > 0) {
+      *os << ',';
+      if (count == kMaxCount) {  // Enough has been printed.
+        *os << " ...";
+        break;
+      }
+    }
+    *os << ' ';
+    // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
+    // handle *it being a native array.
+    internal::UniversalPrint(*it, os);
+  }
+
+  if (count > 0) {
+    *os << ' ';
+  }
+  *os << '}';
+}
+
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+template <typename T>
+void DefaultPrintTo(WrapPrinterType<kPrintPointer> /* dummy */,
+                    T* p, ::std::ostream* os) {
+  if (p == NULL) {
+    *os << "NULL";
+  } else {
+    // T is not a function type.  We just call << to print p,
+    // relying on ADL to pick up user-defined << for their pointer
+    // types, if any.
+    *os << p;
+  }
+}
+template <typename T>
+void DefaultPrintTo(WrapPrinterType<kPrintFunctionPointer> /* dummy */,
+                    T* p, ::std::ostream* os) {
+  if (p == NULL) {
+    *os << "NULL";
+  } else {
+    // T is a function type, so '*os << p' doesn't do what we want
+    // (it just prints p as bool).  We want to print p as a const
+    // void*.
+    *os << reinterpret_cast<const void*>(p);
+  }
+}
+
+// Used to print a non-container, non-pointer value when the user
+// doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintTo(WrapPrinterType<kPrintOther> /* dummy */,
+                    const T& value, ::std::ostream* os) {
+  ::testing_internal::DefaultPrintNonContainerTo(value, os);
+}
+
+// Prints the given value using the << operator if it has one;
+// otherwise prints the bytes in it.  This is what
+// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
+// or overloaded for type T.
+//
+// A user can override this behavior for a class type Foo by defining
+// an overload of PrintTo() in the namespace where Foo is defined.  We
+// give the user this option as sometimes defining a << operator for
+// Foo is not desirable (e.g. the coding style may prevent doing it,
+// or there is already a << operator but it doesn't do what the user
+// wants).
+template <typename T>
+void PrintTo(const T& value, ::std::ostream* os) {
+  // DefaultPrintTo() is overloaded.  The type of its first argument
+  // determines which version will be picked.
+  //
+  // Note that we check for container types here, prior to we check
+  // for protocol message types in our operator<<.  The rationale is:
+  //
+  // For protocol messages, we want to give people a chance to
+  // override Google Mock's format by defining a PrintTo() or
+  // operator<<.  For STL containers, other formats can be
+  // incompatible with Google Mock's format for the container
+  // elements; therefore we check for container types here to ensure
+  // that our format is used.
+  //
+  // Note that MSVC and clang-cl do allow an implicit conversion from
+  // pointer-to-function to pointer-to-object, but clang-cl warns on it.
+  // So don't use ImplicitlyConvertible if it can be helped since it will
+  // cause this warning, and use a separate overload of DefaultPrintTo for
+  // function pointers so that the `*os << p` in the object pointer overload
+  // doesn't cause that warning either.
+  DefaultPrintTo(
+      WrapPrinterType <
+                  (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
+              !IsRecursiveContainer<T>::value
+          ? kPrintContainer
+          : !is_pointer<T>::value
+                ? kPrintOther
+#if GTEST_LANG_CXX11
+                : std::is_function<typename std::remove_pointer<T>::type>::value
+#else
+                : !internal::ImplicitlyConvertible<T, const void*>::value
+#endif
+                      ? kPrintFunctionPointer
+                      : kPrintPointer > (),
+      value, os);
+}
+
+// The following list of PrintTo() overloads tells
+// UniversalPrinter<T>::Print() how to print standard types (built-in
+// types, strings, plain arrays, and pointers).
+
+// Overloads for various char types.
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
+  // When printing a plain char, we always treat it as unsigned.  This
+  // way, the output won't be affected by whether the compiler thinks
+  // char is signed or not.
+  PrintTo(static_cast<unsigned char>(c), os);
+}
+
+// Overloads for other simple built-in types.
+inline void PrintTo(bool x, ::std::ostream* os) {
+  *os << (x ? "true" : "false");
+}
+
+// Overload for wchar_t type.
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its decimal code (except for L'\0').
+// The L'\0' char is printed as "L'\\0'". The decimal code is printed
+// as signed integer when wchar_t is implemented by the compiler
+// as a signed type and is printed as an unsigned integer when wchar_t
+// is implemented as an unsigned type.
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+// Overloads for C strings.
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+inline void PrintTo(char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char*>(s), os);
+}
+
+// signed/unsigned char is often used for representing binary data, so
+// we print pointers to it as void* to be safe.
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+
+// MSVC can be configured to define wchar_t as a typedef of unsigned
+// short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
+// type.  When wchar_t is a typedef, defining an overload for const
+// wchar_t* would cause unsigned short* be printed as a wide string,
+// possibly causing invalid memory accesses.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Overloads for wide C strings
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const wchar_t*>(s), os);
+}
+#endif
+
+// Overload for C arrays.  Multi-dimensional arrays are printed
+// properly.
+
+// Prints the given number of elements in an array, without printing
+// the curly braces.
+template <typename T>
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+  UniversalPrint(a[0], os);
+  for (size_t i = 1; i != count; i++) {
+    *os << ", ";
+    UniversalPrint(a[i], os);
+  }
+}
+
+// Overloads for ::string and ::std::string.
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os);
+inline void PrintTo(const ::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+
+// Overloads for ::wstring and ::std::wstring.
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_ABSL
+// Overload for absl::string_view.
+inline void PrintTo(absl::string_view sp, ::std::ostream* os) {
+  PrintTo(::std::string(sp), os);
+}
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_LANG_CXX11
+inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
+#endif  // GTEST_LANG_CXX11
+
+#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os);
+#endif  // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+
+#if GTEST_HAS_TR1_TUPLE
+// Overload for ::std::tr1::tuple.  Needed for printing function arguments,
+// which are packed as tuples.
+
+// Overloaded PrintTo() for tuples of various arities.  We support
+// tuples of up-to 10 fields.  The following implementation works
+// regardless of whether tr1::tuple is implemented using the
+// non-standard variadic template feature or not.
+
+inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1>
+void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2>
+void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9, typename T10>
+void PrintTo(
+    const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t,
+    ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+#endif  // GTEST_HAS_TR1_TUPLE
+
+#if GTEST_HAS_STD_TUPLE_
+template <typename... Types>
+void PrintTo(const ::std::tuple<Types...>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+#endif  // GTEST_HAS_STD_TUPLE_
+
+// Overload for std::pair.
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+  *os << '(';
+  // We cannot use UniversalPrint(value.first, os) here, as T1 may be
+  // a reference type.  The same for printing value.second.
+  UniversalPrinter<T1>::Print(value.first, os);
+  *os << ", ";
+  UniversalPrinter<T2>::Print(value.second, os);
+  *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  // Note: we deliberately don't call this PrintTo(), as that name
+  // conflicts with ::testing::internal::PrintTo in the body of the
+  // function.
+  static void Print(const T& value, ::std::ostream* os) {
+    // By default, ::testing::internal::PrintTo() is used for printing
+    // the value.
+    //
+    // Thanks to Koenig look-up, if T is a class and has its own
+    // PrintTo() function defined in its namespace, that function will
+    // be visible here.  Since it is more specific than the generic ones
+    // in ::testing::internal, it will be picked by the compiler in the
+    // following statement - exactly what we want.
+    PrintTo(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+#if GTEST_HAS_ABSL
+
+// Printer for absl::optional
+
+template <typename T>
+class UniversalPrinter<::absl::optional<T>> {
+ public:
+  static void Print(const ::absl::optional<T>& value, ::std::ostream* os) {
+    *os << '(';
+    if (!value) {
+      *os << "nullopt";
+    } else {
+      UniversalPrint(*value, os);
+    }
+    *os << ')';
+  }
+};
+
+// Printer for absl::variant
+
+template <typename... T>
+class UniversalPrinter<::absl::variant<T...>> {
+ public:
+  static void Print(const ::absl::variant<T...>& value, ::std::ostream* os) {
+    *os << '(';
+    absl::visit(Visitor{os}, value);
+    *os << ')';
+  }
+
+ private:
+  struct Visitor {
+    template <typename U>
+    void operator()(const U& u) const {
+      *os << "'" << GetTypeName<U>() << "' with value ";
+      UniversalPrint(u, os);
+    }
+    ::std::ostream* os;
+  };
+};
+
+#endif  // GTEST_HAS_ABSL
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'.
+template <typename T>
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+  if (len == 0) {
+    *os << "{}";
+  } else {
+    *os << "{ ";
+    const size_t kThreshold = 18;
+    const size_t kChunkSize = 8;
+    // If the array has more than kThreshold elements, we'll have to
+    // omit some details by printing only the first and the last
+    // kChunkSize elements.
+    // FIXME: let the user control the threshold using a flag.
+    if (len <= kThreshold) {
+      PrintRawArrayTo(begin, len, os);
+    } else {
+      PrintRawArrayTo(begin, kChunkSize, os);
+      *os << ", ..., ";
+      PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+    }
+    *os << " }";
+  }
+}
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const char* begin, size_t len, ::std::ostream* os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const wchar_t* begin, size_t len, ::std::ostream* os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+  // Prints the given array, omitting some elements when there are too
+  // many.
+  static void Print(const T (&a)[N], ::std::ostream* os) {
+    UniversalPrintArray(a, N, os);
+  }
+};
+
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T&> {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  static void Print(const T& value, ::std::ostream* os) {
+    // Prints the address of the value.  We use reinterpret_cast here
+    // as static_cast doesn't compile when T is a function type.
+    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
+
+    // Then prints the value itself.
+    UniversalPrint(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T>
+class UniversalTersePrinter<T&> {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+  static void Print(const T (&value)[N], ::std::ostream* os) {
+    UniversalPrinter<T[N]>::Print(value, os);
+  }
+};
+template <>
+class UniversalTersePrinter<const char*> {
+ public:
+  static void Print(const char* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(std::string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char*> {
+ public:
+  static void Print(char* str, ::std::ostream* os) {
+    UniversalTersePrinter<const char*>::Print(str, os);
+  }
+};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t*> {
+ public:
+  static void Print(const wchar_t* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::wstring(str), os);
+    }
+  }
+};
+#endif
+
+template <>
+class UniversalTersePrinter<wchar_t*> {
+ public:
+  static void Print(wchar_t* str, ::std::ostream* os) {
+    UniversalTersePrinter<const wchar_t*>::Print(str, os);
+  }
+};
+
+template <typename T>
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
+  UniversalTersePrinter<T>::Print(value, os);
+}
+
+// Prints a value using the type inferred by the compiler.  The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os) {
+  // A workarond for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly.
+  typedef T T1;
+  UniversalPrinter<T1>::Print(value, os);
+}
+
+typedef ::std::vector< ::std::string> Strings;
+
+// TuplePolicy<TupleT> must provide:
+// - tuple_size
+//     size of tuple TupleT.
+// - get<size_t I>(const TupleT& t)
+//     static function extracting element I of tuple TupleT.
+// - tuple_element<size_t I>::type
+//     type of element I of tuple TupleT.
+template <typename TupleT>
+struct TuplePolicy;
+
+#if GTEST_HAS_TR1_TUPLE
+template <typename TupleT>
+struct TuplePolicy {
+  typedef TupleT Tuple;
+  static const size_t tuple_size = ::std::tr1::tuple_size<Tuple>::value;
+
+  template <size_t I>
+  struct tuple_element : ::std::tr1::tuple_element<static_cast<int>(I), Tuple> {
+  };
+
+  template <size_t I>
+  static typename AddReference<const typename ::std::tr1::tuple_element<
+      static_cast<int>(I), Tuple>::type>::type
+  get(const Tuple& tuple) {
+    return ::std::tr1::get<I>(tuple);
+  }
+};
+template <typename TupleT>
+const size_t TuplePolicy<TupleT>::tuple_size;
+#endif  // GTEST_HAS_TR1_TUPLE
+
+#if GTEST_HAS_STD_TUPLE_
+template <typename... Types>
+struct TuplePolicy< ::std::tuple<Types...> > {
+  typedef ::std::tuple<Types...> Tuple;
+  static const size_t tuple_size = ::std::tuple_size<Tuple>::value;
+
+  template <size_t I>
+  struct tuple_element : ::std::tuple_element<I, Tuple> {};
+
+  template <size_t I>
+  static const typename ::std::tuple_element<I, Tuple>::type& get(
+      const Tuple& tuple) {
+    return ::std::get<I>(tuple);
+  }
+};
+template <typename... Types>
+const size_t TuplePolicy< ::std::tuple<Types...> >::tuple_size;
+#endif  // GTEST_HAS_STD_TUPLE_
+
+#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+// This helper template allows PrintTo() for tuples and
+// UniversalTersePrintTupleFieldsToStrings() to be defined by
+// induction on the number of tuple fields.  The idea is that
+// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N
+// fields in tuple t, and can be defined in terms of
+// TuplePrefixPrinter<N - 1>.
+//
+// The inductive case.
+template <size_t N>
+struct TuplePrefixPrinter {
+  // Prints the first N fields of a tuple.
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+    TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os);
+    GTEST_INTENTIONAL_CONST_COND_PUSH_()
+    if (N > 1) {
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+      *os << ", ";
+    }
+    UniversalPrinter<
+        typename TuplePolicy<Tuple>::template tuple_element<N - 1>::type>
+        ::Print(TuplePolicy<Tuple>::template get<N - 1>(t), os);
+  }
+
+  // Tersely prints the first N fields of a tuple to a string vector,
+  // one element for each field.
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+    TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings);
+    ::std::stringstream ss;
+    UniversalTersePrint(TuplePolicy<Tuple>::template get<N - 1>(t), &ss);
+    strings->push_back(ss.str());
+  }
+};
+
+// Base case.
+template <>
+struct TuplePrefixPrinter<0> {
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple&, ::std::ostream*) {}
+
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple&, Strings*) {}
+};
+
+// Helper function for printing a tuple.
+// Tuple must be either std::tr1::tuple or std::tuple type.
+template <typename Tuple>
+void PrintTupleTo(const Tuple& t, ::std::ostream* os) {
+  *os << "(";
+  TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::PrintPrefixTo(t, os);
+  *os << ")";
+}
+
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field.  See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+  Strings result;
+  TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::
+      TersePrintPrefixToStrings(value, &result);
+  return result;
+}
+#endif  // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+
+}  // namespace internal
+
+#if GTEST_HAS_ABSL
+namespace internal2 {
+template <typename T>
+void TypeWithoutFormatter<T, kConvertibleToStringView>::PrintValue(
+    const T& value, ::std::ostream* os) {
+  internal::PrintTo(absl::string_view(value), os);
+}
+}  // namespace internal2
+#endif
+
+template <typename T>
+::std::string PrintToString(const T& value) {
+  ::std::stringstream ss;
+  internal::UniversalTersePrinter<T>::Print(value, &ss);
+  return ss.str();
+}
+
+}  // namespace testing
+
+// Include any custom printer added by the local installation.
+// We must include this header at the end to make sure it can use the
+// declarations from this file.
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// This file provides an injection point for custom printers in a local
+// installation of gTest.
+// It will be included from gtest-printers.h and the overrides in this file
+// will be visible to everyone.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+namespace testing {
+
+// Input to a parameterized test name generator, describing a test parameter.
+// Consists of the parameter value and the integer parameter index.
+template <class ParamType>
+struct TestParamInfo {
+  TestParamInfo(const ParamType& a_param, size_t an_index) :
+    param(a_param),
+    index(an_index) {}
+  ParamType param;
+  size_t index;
+};
+
+// A builtin parameterized test name generator which returns the result of
+// testing::PrintToString.
+struct PrintToStringParamName {
+  template <class ParamType>
+  std::string operator()(const TestParamInfo<ParamType>& info) const {
+    return PrintToString(info.param);
+  }
+};
+
+namespace internal {
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Outputs a message explaining invalid registration of different
+// fixture class for the same test case. This may happen when
+// TEST_P macro is used to define two tests with the same name
+// but in different namespaces.
+GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name,
+                                          CodeLocation code_location);
+
+template <typename> class ParamGeneratorInterface;
+template <typename> class ParamGenerator;
+
+// Interface for iterating over elements provided by an implementation
+// of ParamGeneratorInterface<T>.
+template <typename T>
+class ParamIteratorInterface {
+ public:
+  virtual ~ParamIteratorInterface() {}
+  // A pointer to the base generator instance.
+  // Used only for the purposes of iterator comparison
+  // to make sure that two iterators belong to the same generator.
+  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
+  // Advances iterator to point to the next element
+  // provided by the generator. The caller is responsible
+  // for not calling Advance() on an iterator equal to
+  // BaseGenerator()->End().
+  virtual void Advance() = 0;
+  // Clones the iterator object. Used for implementing copy semantics
+  // of ParamIterator<T>.
+  virtual ParamIteratorInterface* Clone() const = 0;
+  // Dereferences the current iterator and provides (read-only) access
+  // to the pointed value. It is the caller's responsibility not to call
+  // Current() on an iterator equal to BaseGenerator()->End().
+  // Used for implementing ParamGenerator<T>::operator*().
+  virtual const T* Current() const = 0;
+  // Determines whether the given iterator and other point to the same
+  // element in the sequence generated by the generator.
+  // Used for implementing ParamGenerator<T>::operator==().
+  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
+};
+
+// Class iterating over elements provided by an implementation of
+// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
+// and implements the const forward iterator concept.
+template <typename T>
+class ParamIterator {
+ public:
+  typedef T value_type;
+  typedef const T& reference;
+  typedef ptrdiff_t difference_type;
+
+  // ParamIterator assumes ownership of the impl_ pointer.
+  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
+  ParamIterator& operator=(const ParamIterator& other) {
+    if (this != &other)
+      impl_.reset(other.impl_->Clone());
+    return *this;
+  }
+
+  const T& operator*() const { return *impl_->Current(); }
+  const T* operator->() const { return impl_->Current(); }
+  // Prefix version of operator++.
+  ParamIterator& operator++() {
+    impl_->Advance();
+    return *this;
+  }
+  // Postfix version of operator++.
+  ParamIterator operator++(int /*unused*/) {
+    ParamIteratorInterface<T>* clone = impl_->Clone();
+    impl_->Advance();
+    return ParamIterator(clone);
+  }
+  bool operator==(const ParamIterator& other) const {
+    return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
+  }
+  bool operator!=(const ParamIterator& other) const {
+    return !(*this == other);
+  }
+
+ private:
+  friend class ParamGenerator<T>;
+  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
+  scoped_ptr<ParamIteratorInterface<T> > impl_;
+};
+
+// ParamGeneratorInterface<T> is the binary interface to access generators
+// defined in other translation units.
+template <typename T>
+class ParamGeneratorInterface {
+ public:
+  typedef T ParamType;
+
+  virtual ~ParamGeneratorInterface() {}
+
+  // Generator interface definition
+  virtual ParamIteratorInterface<T>* Begin() const = 0;
+  virtual ParamIteratorInterface<T>* End() const = 0;
+};
+
+// Wraps ParamGeneratorInterface<T> and provides general generator syntax
+// compatible with the STL Container concept.
+// This class implements copy initialization semantics and the contained
+// ParamGeneratorInterface<T> instance is shared among all copies
+// of the original object. This is possible because that instance is immutable.
+template<typename T>
+class ParamGenerator {
+ public:
+  typedef ParamIterator<T> iterator;
+
+  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
+  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
+
+  ParamGenerator& operator=(const ParamGenerator& other) {
+    impl_ = other.impl_;
+    return *this;
+  }
+
+  iterator begin() const { return iterator(impl_->Begin()); }
+  iterator end() const { return iterator(impl_->End()); }
+
+ private:
+  linked_ptr<const ParamGeneratorInterface<T> > impl_;
+};
+
+// Generates values from a range of two comparable values. Can be used to
+// generate sequences of user-defined types that implement operator+() and
+// operator<().
+// This class is used in the Range() function.
+template <typename T, typename IncrementT>
+class RangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  RangeGenerator(T begin, T end, IncrementT step)
+      : begin_(begin), end_(end),
+        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+  virtual ~RangeGenerator() {}
+
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, begin_, 0, step_);
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, end_, end_index_, step_);
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
+             IncrementT step)
+        : base_(base), value_(value), index_(index), step_(step) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      value_ = static_cast<T>(value_ + step_);
+      index_++;
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const T* Current() const { return &value_; }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const int other_index =
+          CheckedDowncastToActualType<const Iterator>(&other)->index_;
+      return index_ == other_index;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : ParamIteratorInterface<T>(),
+          base_(other.base_), value_(other.value_), index_(other.index_),
+          step_(other.step_) {}
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<T>* const base_;
+    T value_;
+    int index_;
+    const IncrementT step_;
+  };  // class RangeGenerator::Iterator
+
+  static int CalculateEndIndex(const T& begin,
+                               const T& end,
+                               const IncrementT& step) {
+    int end_index = 0;
+    for (T i = begin; i < end; i = static_cast<T>(i + step))
+      end_index++;
+    return end_index;
+  }
+
+  // No implementation - assignment is unsupported.
+  void operator=(const RangeGenerator& other);
+
+  const T begin_;
+  const T end_;
+  const IncrementT step_;
+  // The index for the end() iterator. All the elements in the generated
+  // sequence are indexed (0-based) to aid iterator comparison.
+  const int end_index_;
+};  // class RangeGenerator
+
+
+// Generates values from a pair of STL-style iterators. Used in the
+// ValuesIn() function. The elements are copied from the source range
+// since the source can be located on the stack, and the generator
+// is likely to persist beyond that stack frame.
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  template <typename ForwardIterator>
+  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+      : container_(begin, end) {}
+  virtual ~ValuesInIteratorRangeGenerator() {}
+
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, container_.begin());
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, container_.end());
+  }
+
+ private:
+  typedef typename ::std::vector<T> ContainerType;
+
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base,
+             typename ContainerType::const_iterator iterator)
+        : base_(base), iterator_(iterator) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      ++iterator_;
+      value_.reset();
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    // We need to use cached value referenced by iterator_ because *iterator_
+    // can return a temporary object (and of type other then T), so just
+    // having "return &*iterator_;" doesn't work.
+    // value_ is updated here and not in Advance() because Advance()
+    // can advance iterator_ beyond the end of the range, and we cannot
+    // detect that fact. The client code, on the other hand, is
+    // responsible for not calling Current() on an out-of-range iterator.
+    virtual const T* Current() const {
+      if (value_.get() == NULL)
+        value_.reset(new T(*iterator_));
+      return value_.get();
+    }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      return iterator_ ==
+          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+          // The explicit constructor call suppresses a false warning
+          // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
+          iterator_(other.iterator_) {}
+
+    const ParamGeneratorInterface<T>* const base_;
+    typename ContainerType::const_iterator iterator_;
+    // A cached value of *iterator_. We keep it here to allow access by
+    // pointer in the wrapping iterator's operator->().
+    // value_ needs to be mutable to be accessed in Current().
+    // Use of scoped_ptr helps manage cached value's lifetime,
+    // which is bound by the lifespan of the iterator itself.
+    mutable scoped_ptr<const T> value_;
+  };  // class ValuesInIteratorRangeGenerator::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const ValuesInIteratorRangeGenerator& other);
+
+  const ContainerType container_;
+};  // class ValuesInIteratorRangeGenerator
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Default parameterized test name generator, returns a string containing the
+// integer test parameter index.
+template <class ParamType>
+std::string DefaultParamName(const TestParamInfo<ParamType>& info) {
+  Message name_stream;
+  name_stream << info.index;
+  return name_stream.GetString();
+}
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Parameterized test name overload helpers, which help the
+// INSTANTIATE_TEST_CASE_P macro choose between the default parameterized
+// test name generator and user param name generator.
+template <class ParamType, class ParamNameGenFunctor>
+ParamNameGenFunctor GetParamNameGen(ParamNameGenFunctor func) {
+  return func;
+}
+
+template <class ParamType>
+struct ParamNameGenFunc {
+  typedef std::string Type(const TestParamInfo<ParamType>&);
+};
+
+template <class ParamType>
+typename ParamNameGenFunc<ParamType>::Type *GetParamNameGen() {
+  return DefaultParamName;
+}
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Stores a parameter value and later creates tests parameterized with that
+// value.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+  typedef typename TestClass::ParamType ParamType;
+  explicit ParameterizedTestFactory(ParamType parameter) :
+      parameter_(parameter) {}
+  virtual Test* CreateTest() {
+    TestClass::SetParam(&parameter_);
+    return new TestClass();
+  }
+
+ private:
+  const ParamType parameter_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories for passing into MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+  virtual ~TestMetaFactoryBase() {}
+
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates test factories for passing into
+// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
+// ownership of test factory pointer, same factory object cannot be passed
+// into that method twice. But ParameterizedTestCaseInfo is going to call
+// it for each Test/Parameter value combination. Thus it needs meta factory
+// creator class.
+template <class TestCase>
+class TestMetaFactory
+    : public TestMetaFactoryBase<typename TestCase::ParamType> {
+ public:
+  typedef typename TestCase::ParamType ParamType;
+
+  TestMetaFactory() {}
+
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) {
+    return new ParameterizedTestFactory<TestCase>(parameter);
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfoBase is a generic interface
+// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations
+// and uses that information to register all resulting test instances
+// in RegisterTests method. The ParameterizeTestCaseRegistry class holds
+// a collection of pointers to the ParameterizedTestCaseInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestCaseInfoBase {
+ public:
+  virtual ~ParameterizedTestCaseInfoBase() {}
+
+  // Base part of test case name for display purposes.
+  virtual const std::string& GetTestCaseName() const = 0;
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const = 0;
+  // UnitTest class invokes this method to register tests in this
+  // test case right before running them in RUN_ALL_TESTS macro.
+  // This method should not be called more then once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  virtual void RegisterTests() = 0;
+
+ protected:
+  ParameterizedTestCaseInfoBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test case and generators
+// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that
+// test case. It registers tests with all values generated by all
+// generators when asked.
+template <class TestCase>
+class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase {
+ public:
+  // ParamType and GeneratorCreationFunc are private types but are required
+  // for declarations of public methods AddTestPattern() and
+  // AddTestCaseInstantiation().
+  typedef typename TestCase::ParamType ParamType;
+  // A function that returns an instance of appropriate generator type.
+  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+  typedef typename ParamNameGenFunc<ParamType>::Type ParamNameGeneratorFunc;
+
+  explicit ParameterizedTestCaseInfo(
+      const char* name, CodeLocation code_location)
+      : test_case_name_(name), code_location_(code_location) {}
+
+  // Test case base name for display purposes.
+  virtual const std::string& GetTestCaseName() const { return test_case_name_; }
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); }
+  // TEST_P macro uses AddTestPattern() to record information
+  // about a single test in a LocalTestInfo structure.
+  // test_case_name is the base name of the test case (without invocation
+  // prefix). test_base_name is the name of an individual test without
+  // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
+  // test case base name and DoBar is test base name.
+  void AddTestPattern(const char* test_case_name,
+                      const char* test_base_name,
+                      TestMetaFactoryBase<ParamType>* meta_factory) {
+    tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name,
+                                                       test_base_name,
+                                                       meta_factory)));
+  }
+  // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information
+  // about a generator.
+  int AddTestCaseInstantiation(const std::string& instantiation_name,
+                               GeneratorCreationFunc* func,
+                               ParamNameGeneratorFunc* name_func,
+                               const char* file, int line) {
+    instantiations_.push_back(
+        InstantiationInfo(instantiation_name, func, name_func, file, line));
+    return 0;  // Return value used only to run this method in namespace scope.
+  }
+  // UnitTest class invokes this method to register tests in this test case
+  // test cases right before running tests in RUN_ALL_TESTS macro.
+  // This method should not be called more then once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  // UnitTest has a guard to prevent from calling this method more then once.
+  virtual void RegisterTests() {
+    for (typename TestInfoContainer::iterator test_it = tests_.begin();
+         test_it != tests_.end(); ++test_it) {
+      linked_ptr<TestInfo> test_info = *test_it;
+      for (typename InstantiationContainer::iterator gen_it =
+               instantiations_.begin(); gen_it != instantiations_.end();
+               ++gen_it) {
+        const std::string& instantiation_name = gen_it->name;
+        ParamGenerator<ParamType> generator((*gen_it->generator)());
+        ParamNameGeneratorFunc* name_func = gen_it->name_func;
+        const char* file = gen_it->file;
+        int line = gen_it->line;
+
+        std::string test_case_name;
+        if ( !instantiation_name.empty() )
+          test_case_name = instantiation_name + "/";
+        test_case_name += test_info->test_case_base_name;
+
+        size_t i = 0;
+        std::set<std::string> test_param_names;
+        for (typename ParamGenerator<ParamType>::iterator param_it =
+                 generator.begin();
+             param_it != generator.end(); ++param_it, ++i) {
+          Message test_name_stream;
+
+          std::string param_name = name_func(
+              TestParamInfo<ParamType>(*param_it, i));
+
+          GTEST_CHECK_(IsValidParamName(param_name))
+              << "Parameterized test name '" << param_name
+              << "' is invalid, in " << file
+              << " line " << line << std::endl;
+
+          GTEST_CHECK_(test_param_names.count(param_name) == 0)
+              << "Duplicate parameterized test name '" << param_name
+              << "', in " << file << " line " << line << std::endl;
+
+          test_param_names.insert(param_name);
+
+          test_name_stream << test_info->test_base_name << "/" << param_name;
+          MakeAndRegisterTestInfo(
+              test_case_name.c_str(),
+              test_name_stream.GetString().c_str(),
+              NULL,  // No type parameter.
+              PrintToString(*param_it).c_str(),
+              code_location_,
+              GetTestCaseTypeId(),
+              TestCase::SetUpTestCase,
+              TestCase::TearDownTestCase,
+              test_info->test_meta_factory->CreateTestFactory(*param_it));
+        }  // for param_it
+      }  // for gen_it
+    }  // for test_it
+  }  // RegisterTests
+
+ private:
+  // LocalTestInfo structure keeps information about a single test registered
+  // with TEST_P macro.
+  struct TestInfo {
+    TestInfo(const char* a_test_case_base_name,
+             const char* a_test_base_name,
+             TestMetaFactoryBase<ParamType>* a_test_meta_factory) :
+        test_case_base_name(a_test_case_base_name),
+        test_base_name(a_test_base_name),
+        test_meta_factory(a_test_meta_factory) {}
+
+    const std::string test_case_base_name;
+    const std::string test_base_name;
+    const scoped_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+  };
+  typedef ::std::vector<linked_ptr<TestInfo> > TestInfoContainer;
+  // Records data received from INSTANTIATE_TEST_CASE_P macros:
+  //  <Instantiation name, Sequence generator creation function,
+  //     Name generator function, Source file, Source line>
+  struct InstantiationInfo {
+      InstantiationInfo(const std::string &name_in,
+                        GeneratorCreationFunc* generator_in,
+                        ParamNameGeneratorFunc* name_func_in,
+                        const char* file_in,
+                        int line_in)
+          : name(name_in),
+            generator(generator_in),
+            name_func(name_func_in),
+            file(file_in),
+            line(line_in) {}
+
+      std::string name;
+      GeneratorCreationFunc* generator;
+      ParamNameGeneratorFunc* name_func;
+      const char* file;
+      int line;
+  };
+  typedef ::std::vector<InstantiationInfo> InstantiationContainer;
+
+  static bool IsValidParamName(const std::string& name) {
+    // Check for empty string
+    if (name.empty())
+      return false;
+
+    // Check for invalid characters
+    for (std::string::size_type index = 0; index < name.size(); ++index) {
+      if (!isalnum(name[index]) && name[index] != '_')
+        return false;
+    }
+
+    return true;
+  }
+
+  const std::string test_case_name_;
+  CodeLocation code_location_;
+  TestInfoContainer tests_;
+  InstantiationContainer instantiations_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo);
+};  // class ParameterizedTestCaseInfo
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase
+// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P
+// macros use it to locate their corresponding ParameterizedTestCaseInfo
+// descriptors.
+class ParameterizedTestCaseRegistry {
+ public:
+  ParameterizedTestCaseRegistry() {}
+  ~ParameterizedTestCaseRegistry() {
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      delete *it;
+    }
+  }
+
+  // Looks up or creates and returns a structure containing information about
+  // tests and instantiations of a particular test case.
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
+      const char* test_case_name,
+      CodeLocation code_location) {
+    ParameterizedTestCaseInfo<TestCase>* typed_test_info = NULL;
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      if ((*it)->GetTestCaseName() == test_case_name) {
+        if ((*it)->GetTestCaseTypeId() != GetTypeId<TestCase>()) {
+          // Complain about incorrect usage of Google Test facilities
+          // and terminate the program since we cannot guaranty correct
+          // test case setup and tear-down in this case.
+          ReportInvalidTestCaseType(test_case_name, code_location);
+          posix::Abort();
+        } else {
+          // At this point we are sure that the object we found is of the same
+          // type we are looking for, so we downcast it to that type
+          // without further checks.
+          typed_test_info = CheckedDowncastToActualType<
+              ParameterizedTestCaseInfo<TestCase> >(*it);
+        }
+        break;
+      }
+    }
+    if (typed_test_info == NULL) {
+      typed_test_info = new ParameterizedTestCaseInfo<TestCase>(
+          test_case_name, code_location);
+      test_case_infos_.push_back(typed_test_info);
+    }
+    return typed_test_info;
+  }
+  void RegisterTests() {
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      (*it)->RegisterTests();
+    }
+  }
+
+ private:
+  typedef ::std::vector<ParameterizedTestCaseInfoBase*> TestCaseInfoContainer;
+
+  TestCaseInfoContainer test_case_infos_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry);
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-util-generated.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Type and function utilities for implementing parameterized tests.
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently Google Test supports at most 50 arguments in Values,
+// and at most 10 arguments in Combine. Please contact
+// googletestframework@googlegroups.com if you need more.
+// Please note that the number of arguments to Combine is limited
+// by the maximum arity of the implementation of tuple which is
+// currently set at 10.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+
+namespace testing {
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end);
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container);
+
+namespace internal {
+
+// Used in the Values() function to provide polymorphic capabilities.
+template <typename T1>
+class ValueArray1 {
+ public:
+  explicit ValueArray1(T1 v1) : v1_(v1) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray1(const ValueArray1& other) : v1_(other.v1_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray1& other);
+
+  const T1 v1_;
+};
+
+template <typename T1, typename T2>
+class ValueArray2 {
+ public:
+  ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray2(const ValueArray2& other) : v1_(other.v1_), v2_(other.v2_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray2& other);
+
+  const T1 v1_;
+  const T2 v2_;
+};
+
+template <typename T1, typename T2, typename T3>
+class ValueArray3 {
+ public:
+  ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray3(const ValueArray3& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray3& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+class ValueArray4 {
+ public:
+  ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray4(const ValueArray4& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray4& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class ValueArray5 {
+ public:
+  ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray5(const ValueArray5& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray5& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class ValueArray6 {
+ public:
+  ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray6(const ValueArray6& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray6& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class ValueArray7 {
+ public:
+  ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray7(const ValueArray7& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray7& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class ValueArray8 {
+ public:
+  ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+      T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray8(const ValueArray8& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray8& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class ValueArray9 {
+ public:
+  ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray9(const ValueArray9& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray9& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class ValueArray10 {
+ public:
+  ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray10(const ValueArray10& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray10& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+class ValueArray11 {
+ public:
+  ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray11(const ValueArray11& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray11& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+class ValueArray12 {
+ public:
+  ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray12(const ValueArray12& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray12& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+class ValueArray13 {
+ public:
+  ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray13(const ValueArray13& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray13& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+class ValueArray14 {
+ public:
+  ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray14(const ValueArray14& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray14& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+class ValueArray15 {
+ public:
+  ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray15(const ValueArray15& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray15& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+class ValueArray16 {
+ public:
+  ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray16(const ValueArray16& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray16& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+class ValueArray17 {
+ public:
+  ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray17(const ValueArray17& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray17& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+class ValueArray18 {
+ public:
+  ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray18(const ValueArray18& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray18& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+class ValueArray19 {
+ public:
+  ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray19(const ValueArray19& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray19& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+class ValueArray20 {
+ public:
+  ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray20(const ValueArray20& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray20& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+class ValueArray21 {
+ public:
+  ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray21(const ValueArray21& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray21& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+class ValueArray22 {
+ public:
+  ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray22(const ValueArray22& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray22& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+class ValueArray23 {
+ public:
+  ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray23(const ValueArray23& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray23& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+class ValueArray24 {
+ public:
+  ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray24(const ValueArray24& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray24& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+class ValueArray25 {
+ public:
+  ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray25(const ValueArray25& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray25& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+class ValueArray26 {
+ public:
+  ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray26(const ValueArray26& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray26& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+class ValueArray27 {
+ public:
+  ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray27(const ValueArray27& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray27& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+class ValueArray28 {
+ public:
+  ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray28(const ValueArray28& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray28& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+class ValueArray29 {
+ public:
+  ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray29(const ValueArray29& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray29& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+class ValueArray30 {
+ public:
+  ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray30(const ValueArray30& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray30& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+class ValueArray31 {
+ public:
+  ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray31(const ValueArray31& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray31& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+class ValueArray32 {
+ public:
+  ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray32(const ValueArray32& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray32& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+class ValueArray33 {
+ public:
+  ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray33(const ValueArray33& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray33& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+class ValueArray34 {
+ public:
+  ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray34(const ValueArray34& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray34& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+class ValueArray35 {
+ public:
+  ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray35(const ValueArray35& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray35& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+class ValueArray36 {
+ public:
+  ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray36(const ValueArray36& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray36& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+class ValueArray37 {
+ public:
+  ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray37(const ValueArray37& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray37& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+class ValueArray38 {
+ public:
+  ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray38(const ValueArray38& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray38& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+class ValueArray39 {
+ public:
+  ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray39(const ValueArray39& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray39& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+class ValueArray40 {
+ public:
+  ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray40(const ValueArray40& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray40& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+class ValueArray41 {
+ public:
+  ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray41(const ValueArray41& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray41& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+class ValueArray42 {
+ public:
+  ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray42(const ValueArray42& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray42& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+class ValueArray43 {
+ public:
+  ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37),
+      v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray43(const ValueArray43& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_),
+      v43_(other.v43_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray43& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+class ValueArray44 {
+ public:
+  ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+      v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+      v43_(v43), v44_(v44) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray44(const ValueArray44& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_),
+      v43_(other.v43_), v44_(other.v44_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray44& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+class ValueArray45 {
+ public:
+  ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41),
+      v42_(v42), v43_(v43), v44_(v44), v45_(v45) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray45(const ValueArray45& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_),
+      v43_(other.v43_), v44_(other.v44_), v45_(other.v45_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray45& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+class ValueArray46 {
+ public:
+  ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray46(const ValueArray46& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_),
+      v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray46& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+class ValueArray47 {
+ public:
+  ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46),
+      v47_(v47) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray47(const ValueArray47& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_),
+      v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_),
+      v47_(other.v47_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray47& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+class ValueArray48 {
+ public:
+  ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45),
+      v46_(v46), v47_(v47), v48_(v48) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray48(const ValueArray48& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_),
+      v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_),
+      v47_(other.v47_), v48_(other.v48_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray48& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+class ValueArray49 {
+ public:
+  ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
+      T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray49(const ValueArray49& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_),
+      v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_),
+      v47_(other.v47_), v48_(other.v48_), v49_(other.v49_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray49& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+class ValueArray50 {
+ public:
+  ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49,
+      T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)};
+    return ValuesIn(array);
+  }
+
+  ValueArray50(const ValueArray50& other) : v1_(other.v1_), v2_(other.v2_),
+      v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_),
+      v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_),
+      v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_),
+      v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_),
+      v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_),
+      v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_),
+      v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_),
+      v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_),
+      v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_),
+      v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_),
+      v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_),
+      v47_(other.v47_), v48_(other.v48_), v49_(other.v49_), v50_(other.v50_) {}
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray50& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+  const T50 v50_;
+};
+
+# if GTEST_HAS_COMBINE
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Generates values from the Cartesian product of values produced
+// by the argument generators.
+//
+template <typename T1, typename T2>
+class CartesianProductGenerator2
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2> > {
+ public:
+  typedef ::testing::tuple<T1, T2> ParamType;
+
+  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2)
+      : g1_(g1), g2_(g2) {}
+  virtual ~CartesianProductGenerator2() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current2_;
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator2::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator2& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+};  // class CartesianProductGenerator2
+
+
+template <typename T1, typename T2, typename T3>
+class CartesianProductGenerator3
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3> ParamType;
+
+  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  virtual ~CartesianProductGenerator3() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current3_;
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator3::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator3& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+};  // class CartesianProductGenerator3
+
+
+template <typename T1, typename T2, typename T3, typename T4>
+class CartesianProductGenerator4
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4> ParamType;
+
+  CartesianProductGenerator4(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  virtual ~CartesianProductGenerator4() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current4_;
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_,
+            *current4_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator4::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator4& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+};  // class CartesianProductGenerator4
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class CartesianProductGenerator5
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5> ParamType;
+
+  CartesianProductGenerator5(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  virtual ~CartesianProductGenerator5() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current5_;
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator5::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator5& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+};  // class CartesianProductGenerator5
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class CartesianProductGenerator6
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5,
+        T6> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6> ParamType;
+
+  CartesianProductGenerator6(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  virtual ~CartesianProductGenerator6() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current6_;
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator6::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator6& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+};  // class CartesianProductGenerator6
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class CartesianProductGenerator7
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+        T7> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7> ParamType;
+
+  CartesianProductGenerator7(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  virtual ~CartesianProductGenerator7() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current7_;
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator7::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator7& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+};  // class CartesianProductGenerator7
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class CartesianProductGenerator8
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8> ParamType;
+
+  CartesianProductGenerator8(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+          g8_(g8) {}
+  virtual ~CartesianProductGenerator8() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current8_;
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator8::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator8& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+};  // class CartesianProductGenerator8
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class CartesianProductGenerator9
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> ParamType;
+
+  CartesianProductGenerator9(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9) {}
+  virtual ~CartesianProductGenerator9() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current9_;
+      if (current9_ == end9_) {
+        current9_ = begin9_;
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_ &&
+          current9_ == typed_other->current9_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_,
+            *current9_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_ ||
+          current9_ == end9_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator9::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator9& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+};  // class CartesianProductGenerator9
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class CartesianProductGenerator10
+    : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9, T10> > {
+ public:
+  typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> ParamType;
+
+  CartesianProductGenerator10(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9,
+      const ParamGenerator<T10>& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9), g10_(g10) {}
+  virtual ~CartesianProductGenerator10() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end(), g10_, g10_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9,
+      const ParamGenerator<T10>& g10,
+      const typename ParamGenerator<T10>::iterator& current10)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9),
+          begin10_(g10.begin()), end10_(g10.end()), current10_(current10)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current10_;
+      if (current10_ == end10_) {
+        current10_ = begin10_;
+        ++current9_;
+      }
+      if (current9_ == end9_) {
+        current9_ = begin9_;
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return current_value_.get(); }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_ &&
+          current9_ == typed_other->current9_ &&
+          current10_ == typed_other->current10_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_),
+        begin10_(other.begin10_),
+        end10_(other.end10_),
+        current10_(other.current10_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_.reset(new ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_,
+            *current9_, *current10_));
+    }
+    bool AtEnd() const {
+      // We must report iterator past the end of the range when either of the
+      // component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_ ||
+          current9_ == end9_ ||
+          current10_ == end10_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    const typename ParamGenerator<T10>::iterator begin10_;
+    const typename ParamGenerator<T10>::iterator end10_;
+    typename ParamGenerator<T10>::iterator current10_;
+    linked_ptr<ParamType> current_value_;
+  };  // class CartesianProductGenerator10::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator10& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+  const ParamGenerator<T10> g10_;
+};  // class CartesianProductGenerator10
+
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Helper classes providing Combine() with polymorphic features. They allow
+// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
+// convertible to U.
+//
+template <class Generator1, class Generator2>
+class CartesianProductHolder2 {
+ public:
+CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
+      : g1_(g1), g2_(g2) {}
+  template <typename T1, typename T2>
+  operator ParamGenerator< ::testing::tuple<T1, T2> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2> >(
+        new CartesianProductGenerator2<T1, T2>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder2& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+};  // class CartesianProductHolder2
+
+template <class Generator1, class Generator2, class Generator3>
+class CartesianProductHolder3 {
+ public:
+CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  template <typename T1, typename T2, typename T3>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3> >(
+        new CartesianProductGenerator3<T1, T2, T3>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder3& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+};  // class CartesianProductHolder3
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4>
+class CartesianProductHolder4 {
+ public:
+CartesianProductHolder4(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  template <typename T1, typename T2, typename T3, typename T4>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4> >(
+        new CartesianProductGenerator4<T1, T2, T3, T4>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder4& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+};  // class CartesianProductHolder4
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5>
+class CartesianProductHolder5 {
+ public:
+CartesianProductHolder5(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5> >(
+        new CartesianProductGenerator5<T1, T2, T3, T4, T5>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder5& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+};  // class CartesianProductHolder5
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6>
+class CartesianProductHolder6 {
+ public:
+CartesianProductHolder6(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6> >(
+        new CartesianProductGenerator6<T1, T2, T3, T4, T5, T6>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder6& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+};  // class CartesianProductHolder6
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7>
+class CartesianProductHolder7 {
+ public:
+CartesianProductHolder7(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+      T7> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7> >(
+        new CartesianProductGenerator7<T1, T2, T3, T4, T5, T6, T7>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder7& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+};  // class CartesianProductHolder7
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8>
+class CartesianProductHolder8 {
+ public:
+CartesianProductHolder8(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7, const Generator8& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+          g8_(g8) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7,
+      T8> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8> >(
+        new CartesianProductGenerator8<T1, T2, T3, T4, T5, T6, T7, T8>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_),
+        static_cast<ParamGenerator<T8> >(g8_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder8& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+};  // class CartesianProductHolder8
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9>
+class CartesianProductHolder9 {
+ public:
+CartesianProductHolder9(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7, const Generator8& g8,
+    const Generator9& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+      T9> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+        T9> >(
+        new CartesianProductGenerator9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_),
+        static_cast<ParamGenerator<T8> >(g8_),
+        static_cast<ParamGenerator<T9> >(g9_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder9& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+};  // class CartesianProductHolder9
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9, class Generator10>
+class CartesianProductHolder10 {
+ public:
+CartesianProductHolder10(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7, const Generator8& g8,
+    const Generator9& g9, const Generator10& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9), g10_(g10) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9, typename T10>
+  operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+      T10> >() const {
+    return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+        T10> >(
+        new CartesianProductGenerator10<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+            T10>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_),
+        static_cast<ParamGenerator<T8> >(g8_),
+        static_cast<ParamGenerator<T9> >(g9_),
+        static_cast<ParamGenerator<T10> >(g10_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder10& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+  const Generator10 g10_;
+};  // class CartesianProductHolder10
+
+# endif  // GTEST_HAS_COMBINE
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test case is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, tests from test case FooTest are instantiated
+// each three times with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+//   - returns a generator producing a sequence of values {start, start+1,
+//     start+2, ..., }.
+// Range(start, end, step)
+//   - returns a generator producing a sequence of values {start, start+step,
+//     start+step+step, ..., }.
+// Notes:
+//   * The generated sequences never include end. For example, Range(1, 5)
+//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+//     returns a generator producing {1, 3, 5, 7}.
+//   * start and end must have the same type. That type may be any integral or
+//     floating-point type or a user defined type satisfying these conditions:
+//     * It must be assignable (have operator=() defined).
+//     * It must have operator+() (operator+(int-compatible type) for
+//       two-operand version).
+//     * It must have operator<() defined.
+//     Elements in the resulting sequences will also have that type.
+//   * Condition start < end must be satisfied in order for resulting sequences
+//     to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+  return internal::ParamGenerator<T>(
+      new internal::RangeGenerator<T, IncrementT>(start, end, step));
+}
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+  return Range(start, end, 1);
+}
+
+// ValuesIn() function allows generation of tests with parameters coming from
+// a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+//   - returns a generator producing sequences with elements from
+//     a C-style array.
+// ValuesIn(const Container& container)
+//   - returns a generator producing sequences with elements from
+//     an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+//   - returns a generator producing sequences with elements from
+//     a range [begin, end) defined by a pair of STL-style iterators. These
+//     iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test case StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings));
+//
+// This instantiates tests from test case StlStringTest
+// each with STL strings with values "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+//   ::std::vector< ::std::string> v;
+//   v.push_back("a");
+//   v.push_back("b");
+//   return v;
+// }
+//
+// INSTANTIATE_TEST_CASE_P(CharSequence,
+//                         StlStringTest,
+//                         ValuesIn(GetParameterStrings()));
+//
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+//   ::std::list<char> list;
+//   list.push_back('a');
+//   list.push_back('b');
+//   return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_CASE_P(CharSequence2,
+//                         CharTest,
+//                         ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {
+  typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
+      ::value_type ParamType;
+  return internal::ParamGenerator<ParamType>(
+      new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
+}
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+  return ValuesIn(array, array + N);
+}
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container) {
+  return ValuesIn(container.begin(), container.end());
+}
+
+// Values() allows generating tests from explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+//   - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test case BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
+//
+// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+// Currently, Values() supports from 1 to 50 parameters.
+//
+template <typename T1>
+internal::ValueArray1<T1> Values(T1 v1) {
+  return internal::ValueArray1<T1>(v1);
+}
+
+template <typename T1, typename T2>
+internal::ValueArray2<T1, T2> Values(T1 v1, T2 v2) {
+  return internal::ValueArray2<T1, T2>(v1, v2);
+}
+
+template <typename T1, typename T2, typename T3>
+internal::ValueArray3<T1, T2, T3> Values(T1 v1, T2 v2, T3 v3) {
+  return internal::ValueArray3<T1, T2, T3>(v1, v2, v3);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+internal::ValueArray4<T1, T2, T3, T4> Values(T1 v1, T2 v2, T3 v3, T4 v4) {
+  return internal::ValueArray4<T1, T2, T3, T4>(v1, v2, v3, v4);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+internal::ValueArray5<T1, T2, T3, T4, T5> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5) {
+  return internal::ValueArray5<T1, T2, T3, T4, T5>(v1, v2, v3, v4, v5);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+internal::ValueArray6<T1, T2, T3, T4, T5, T6> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6) {
+  return internal::ValueArray6<T1, T2, T3, T4, T5, T6>(v1, v2, v3, v4, v5, v6);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7) {
+  return internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7>(v1, v2, v3, v4, v5,
+      v6, v7);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) {
+  return internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8>(v1, v2, v3, v4,
+      v5, v6, v7, v8);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) {
+  return internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) {
+  return internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11) {
+  return internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+    T12> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12) {
+  return internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+    T13> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13) {
+  return internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) {
+  return internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) {
+  return internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16) {
+  return internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17) {
+  return internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18) {
+  return internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) {
+  return internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) {
+  return internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) {
+  return internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22) {
+  return internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23) {
+  return internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24) {
+  return internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24>(v1, v2,
+      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+      v19, v20, v21, v22, v23, v24);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
+    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
+    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) {
+  return internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+      v18, v19, v20, v21, v22, v23, v24, v25);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+    T26> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26) {
+  return internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+    T27> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27) {
+  return internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
+      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+    T28> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28) {
+  return internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
+      v28);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29) {
+  return internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
+      v27, v28, v29);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) {
+  return internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
+      v26, v27, v28, v29, v30);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) {
+  return internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+      v25, v26, v27, v28, v29, v30, v31);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32) {
+  return internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33) {
+  return internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34) {
+  return internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) {
+  return internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) {
+  return internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+    T37 v37) {
+  return internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36, v37);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+    T37 v37, T38 v38) {
+  return internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38>(v1, v2,
+      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+      v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32,
+      v33, v34, v35, v36, v37, v38);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+    T37 v37, T38 v38, T39 v39) {
+  return internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+      v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+      v32, v33, v34, v35, v36, v37, v38, v39);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
+    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
+    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27,
+    T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) {
+  return internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
+      v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) {
+  return internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
+      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28,
+      v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+    T42> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+    T42 v42) {
+  return internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
+      v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41,
+      v42);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+    T43> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+    T42 v42, T43 v43) {
+  return internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
+      v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+    T42 v42, T43 v43, T44 v44) {
+  return internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
+      v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39,
+      v40, v41, v42, v43, v44);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+    T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) {
+  return internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+      v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38,
+      v39, v40, v41, v42, v43, v44, v45);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) {
+  return internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+      v38, v39, v40, v41, v42, v43, v44, v45, v46);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) {
+  return internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+      v38, v39, v40, v41, v42, v43, v44, v45, v46, v47);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47,
+    T48 v48) {
+  return internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36,
+      v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38,
+    T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46,
+    T47 v47, T48 v48, T49 v49) {
+  return internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35,
+      v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49, T50> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37,
+    T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+    T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) {
+  return internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+      v48, v49, v50);
+}
+
+// Bool() allows generating tests with parameters in a set of (false, true).
+//
+// Synopsis:
+// Bool()
+//   - returns a generator producing sequences with elements {false, true}.
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// Combine() function.
+//
+// In the following example all tests in the test case FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+//   virtual void SetUp() {
+//     external_flag = GetParam();
+//   }
+// }
+// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
+//
+inline internal::ParamGenerator<bool> Bool() {
+  return Values(false, true);
+}
+
+# if GTEST_HAS_COMBINE
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+//   - returns a generator producing sequences with elements coming from
+//     the Cartesian product of elements from the sequences generated by
+//     gen1, gen2, ..., genN. The sequence elements will have a type of
+//     tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+//     of elements from sequences produces by gen1, gen2, ..., genN.
+//
+// Combine can have up to 10 arguments. This number is currently limited
+// by the maximum number of elements in the tuple implementation used by Google
+// Test.
+//
+// Example:
+//
+// This will instantiate tests in test case AnimalTest each one with
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+//     : public testing::TestWithParam<tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
+//                         Combine(Values("cat", "dog"),
+//                                 Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+//     : public testing::TestWithParam<tuple<bool, bool> > {
+//   virtual void SetUp() {
+//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
+//     tie(external_flag_1, external_flag_2) = GetParam();
+//   }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+//   // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
+//                         Combine(Bool(), Bool()));
+//
+template <typename Generator1, typename Generator2>
+internal::CartesianProductHolder2<Generator1, Generator2> Combine(
+    const Generator1& g1, const Generator2& g2) {
+  return internal::CartesianProductHolder2<Generator1, Generator2>(
+      g1, g2);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3>
+internal::CartesianProductHolder3<Generator1, Generator2, Generator3> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3) {
+  return internal::CartesianProductHolder3<Generator1, Generator2, Generator3>(
+      g1, g2, g3);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4>
+internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+    Generator4> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4) {
+  return internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+      Generator4>(
+      g1, g2, g3, g4);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5>
+internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+    Generator4, Generator5> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5) {
+  return internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+      Generator4, Generator5>(
+      g1, g2, g3, g4, g5);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6>
+internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6) {
+  return internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6>(
+      g1, g2, g3, g4, g5, g6);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7>
+internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7) {
+  return internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7>(
+      g1, g2, g3, g4, g5, g6, g7);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8>
+internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8) {
+  return internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8>(
+      g1, g2, g3, g4, g5, g6, g7, g8);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9>
+internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8,
+    Generator9> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8, const Generator9& g9) {
+  return internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9,
+    typename Generator10>
+internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+    Generator10> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8, const Generator9& g9,
+        const Generator10& g10) {
+  return internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+      Generator10>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9, g10);
+}
+# endif  // GTEST_HAS_COMBINE
+
+# define TEST_P(test_case_name, test_name) \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+      : public test_case_name { \
+   public: \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
+    virtual void TestBody(); \
+   private: \
+    static int AddToRegistry() { \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, \
+              ::testing::internal::CodeLocation(\
+                  __FILE__, __LINE__))->AddTestPattern(\
+                      GTEST_STRINGIFY_(test_case_name), \
+                      GTEST_STRINGIFY_(test_name), \
+                      new ::testing::internal::TestMetaFactory< \
+                          GTEST_TEST_CLASS_NAME_(\
+                              test_case_name, test_name)>()); \
+      return 0; \
+    } \
+    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
+  }; \
+  int GTEST_TEST_CLASS_NAME_(test_case_name, \
+                             test_name)::gtest_registering_dummy_ = \
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
+  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user
+// to specify a function or functor that generates custom test name suffixes
+// based on the test parameters. The function should accept one argument of
+// type testing::TestParamInfo<class ParamType>, and return std::string.
+//
+// testing::PrintToStringParamName is a builtin test suffix generator that
+// returns the value of testing::PrintToString(GetParam()).
+//
+// Note: test names must be non-empty, unique, and may only contain ASCII
+// alphanumeric characters or underscore. Because PrintToString adds quotes
+// to std::string and C strings, it won't work for these types.
+
+# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \
+  static ::testing::internal::ParamGenerator<test_case_name::ParamType> \
+      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
+  static ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \
+      const ::testing::TestParamInfo<test_case_name::ParamType>& info) { \
+    return ::testing::internal::GetParamNameGen<test_case_name::ParamType> \
+        (__VA_ARGS__)(info); \
+  } \
+  static int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, \
+              ::testing::internal::CodeLocation(\
+                  __FILE__, __LINE__))->AddTestCaseInstantiation(\
+                      #prefix, \
+                      &gtest_##prefix##test_case_name##_EvalGenerator_, \
+                      &gtest_##prefix##test_case_name##_EvalGenerateName_, \
+                      __FILE__, __LINE__)
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Google C++ Testing and Mocking Framework definitions useful in production code.
+// GOOGLETEST_CM0003 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class.  For example:
+//
+// class MyClass {
+//  private:
+//   void PrivateMethod();
+//   FRIEND_TEST(MyClassTest, PrivateMethodWorks);
+// };
+//
+// class MyClassTest : public testing::Test {
+//   // ...
+// };
+//
+// TEST_F(MyClassTest, PrivateMethodWorks) {
+//   // Can call MyClass::PrivateMethod() here.
+// }
+//
+// Note: The test class must be in the same namespace as the class being tested.
+// For example, putting MyClassTest in an anonymous namespace will not work.
+
+#define FRIEND_TEST(test_case_name, test_name)\
+friend class test_case_name##_##test_name##_Test
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+
+#include <iosfwd>
+#include <vector>
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A copyable object representing the result of a test part (i.e. an
+// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()).
+//
+// Don't inherit from TestPartResult as its destructor is not virtual.
+class GTEST_API_ TestPartResult {
+ public:
+  // The possible outcomes of a test part (i.e. an assertion or an
+  // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
+  enum Type {
+    kSuccess,          // Succeeded.
+    kNonFatalFailure,  // Failed but the test can continue.
+    kFatalFailure      // Failed and the test should be terminated.
+  };
+
+  // C'tor.  TestPartResult does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestPartResult object.
+  TestPartResult(Type a_type,
+                 const char* a_file_name,
+                 int a_line_number,
+                 const char* a_message)
+      : type_(a_type),
+        file_name_(a_file_name == NULL ? "" : a_file_name),
+        line_number_(a_line_number),
+        summary_(ExtractSummary(a_message)),
+        message_(a_message) {
+  }
+
+  // Gets the outcome of the test part.
+  Type type() const { return type_; }
+
+  // Gets the name of the source file where the test part took place, or
+  // NULL if it's unknown.
+  const char* file_name() const {
+    return file_name_.empty() ? NULL : file_name_.c_str();
+  }
+
+  // Gets the line in the source file where the test part took place,
+  // or -1 if it's unknown.
+  int line_number() const { return line_number_; }
+
+  // Gets the summary of the failure message.
+  const char* summary() const { return summary_.c_str(); }
+
+  // Gets the message associated with the test part.
+  const char* message() const { return message_.c_str(); }
+
+  // Returns true iff the test part passed.
+  bool passed() const { return type_ == kSuccess; }
+
+  // Returns true iff the test part failed.
+  bool failed() const { return type_ != kSuccess; }
+
+  // Returns true iff the test part non-fatally failed.
+  bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
+
+  // Returns true iff the test part fatally failed.
+  bool fatally_failed() const { return type_ == kFatalFailure; }
+
+ private:
+  Type type_;
+
+  // Gets the summary of the failure message by omitting the stack
+  // trace in it.
+  static std::string ExtractSummary(const char* message);
+
+  // The name of the source file where the test part took place, or
+  // "" if the source file is unknown.
+  std::string file_name_;
+  // The line in the source file where the test part took place, or -1
+  // if the line number is unknown.
+  int line_number_;
+  std::string summary_;  // The test failure summary.
+  std::string message_;  // The test failure message.
+};
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
+
+// An array of TestPartResult objects.
+//
+// Don't inherit from TestPartResultArray as its destructor is not
+// virtual.
+class GTEST_API_ TestPartResultArray {
+ public:
+  TestPartResultArray() {}
+
+  // Appends the given TestPartResult to the array.
+  void Append(const TestPartResult& result);
+
+  // Returns the TestPartResult at the given index (0-based).
+  const TestPartResult& GetTestPartResult(int index) const;
+
+  // Returns the number of TestPartResult objects in the array.
+  int size() const;
+
+ private:
+  std::vector<TestPartResult> array_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+};
+
+// This interface knows how to report a test part result.
+class GTEST_API_ TestPartResultReporterInterface {
+ public:
+  virtual ~TestPartResultReporterInterface() {}
+
+  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
+};
+
+namespace internal {
+
+// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a
+// statement generates new fatal failures. To do so it registers itself as the
+// current test part result reporter. Besides checking if fatal failures were
+// reported, it only delegates the reporting to the former result reporter.
+// The original result reporter is restored in the destructor.
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+class GTEST_API_ HasNewFatalFailureHelper
+    : public TestPartResultReporterInterface {
+ public:
+  HasNewFatalFailureHelper();
+  virtual ~HasNewFatalFailureHelper();
+  virtual void ReportTestPartResult(const TestPartResult& result);
+  bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+ private:
+  bool has_new_fatal_failure_;
+  TestPartResultReporterInterface* original_reporter_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// This header implements typed tests and type-parameterized tests.
+
+// Typed (aka type-driven) tests repeat the same test for types in a
+// list.  You must know which types you want to test with when writing
+// typed tests. Here's how you do it:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ public:
+  ...
+  typedef std::list<T> List;
+  static T shared_;
+  T value_;
+};
+
+// Next, associate a list of types with the test case, which will be
+// repeated for each type in the list.  The typedef is necessary for
+// the macro to parse correctly.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+TYPED_TEST_CASE(FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   TYPED_TEST_CASE(FooTest, int);
+
+// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
+// tests for this test case as you want.
+TYPED_TEST(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  // Since we are inside a derived class template, C++ requires use to
+  // visit the members of FooTest via 'this'.
+  TypeParam n = this->value_;
+
+  // To visit static members of the fixture, add the TestFixture::
+  // prefix.
+  n += TestFixture::shared_;
+
+  // To refer to typedefs in the fixture, add the "typename
+  // TestFixture::" prefix.
+  typename TestFixture::List values;
+  values.push_back(n);
+  ...
+}
+
+TYPED_TEST(FooTest, HasPropertyA) { ... }
+
+// TYPED_TEST_CASE takes an optional third argument which allows to specify a
+// class that generates custom test name suffixes based on the type. This should
+// be a class which has a static template function GetName(int index) returning
+// a string for each type. The provided integer index equals the index of the
+// type in the provided type list. In many cases the index can be ignored.
+//
+// For example:
+//   class MyTypeNames {
+//    public:
+//     template <typename T>
+//     static std::string GetName(int) {
+//       if (std::is_same<T, char>()) return "char";
+//       if (std::is_same<T, int>()) return "int";
+//       if (std::is_same<T, unsigned int>()) return "unsignedInt";
+//     }
+//   };
+//   TYPED_TEST_CASE(FooTest, MyTypes, MyTypeNames);
+
+#endif  // 0
+
+// Type-parameterized tests are abstract test patterns parameterized
+// by a type.  Compared with typed tests, type-parameterized tests
+// allow you to define the test pattern without knowing what the type
+// parameters are.  The defined pattern can be instantiated with
+// different types any number of times, in any number of translation
+// units.
+//
+// If you are designing an interface or concept, you can define a
+// suite of type-parameterized tests to verify properties that any
+// valid implementation of the interface/concept should have.  Then,
+// each implementation can easily instantiate the test suite to verify
+// that it conforms to the requirements, without having to write
+// similar tests repeatedly.  Here's an example:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+  ...
+};
+
+// Next, declare that you will define a type-parameterized test case
+// (the _P suffix is for "parameterized" or "pattern", whichever you
+// prefer):
+TYPED_TEST_CASE_P(FooTest);
+
+// Then, use TYPED_TEST_P() to define as many type-parameterized tests
+// for this type-parameterized test case as you want.
+TYPED_TEST_P(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  TypeParam n = 0;
+  ...
+}
+
+TYPED_TEST_P(FooTest, HasPropertyA) { ... }
+
+// Now the tricky part: you need to register all test patterns before
+// you can instantiate them.  The first argument of the macro is the
+// test case name; the rest are the names of the tests in this test
+// case.
+REGISTER_TYPED_TEST_CASE_P(FooTest,
+                           DoesBlah, HasPropertyA);
+
+// Finally, you are free to instantiate the pattern with the types you
+// want.  If you put the above code in a header file, you can #include
+// it in multiple C++ source files and instantiate it multiple times.
+//
+// To distinguish different instances of the pattern, the first
+// argument to the INSTANTIATE_* macro is a prefix that will be added
+// to the actual test case name.  Remember to pick unique prefixes for
+// different instances.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int);
+//
+// Similar to the optional argument of TYPED_TEST_CASE above,
+// INSTANTIATE_TEST_CASE_P takes an optional fourth argument which allows to
+// generate custom names.
+//   INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes, MyTypeNames);
+
+#endif  // 0
+
+
+// Implements typed tests.
+
+#if GTEST_HAS_TYPED_TEST
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the typedef for the type parameters of the
+// given test case.
+# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_
+
+// Expands to the name of the typedef for the NameGenerator, responsible for
+// creating the suffixes of the name.
+#define GTEST_NAME_GENERATOR_(TestCaseName) \
+  gtest_type_params_##TestCaseName##_NameGenerator
+
+// The 'Types' template argument below must have spaces around it
+// since some compilers may choke on '>>' when passing a template
+// instance (e.g. Types<int>)
+# define TYPED_TEST_CASE(CaseName, Types, ...)                             \
+  typedef ::testing::internal::TypeList< Types >::type GTEST_TYPE_PARAMS_( \
+      CaseName);                                                           \
+  typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type    \
+      GTEST_NAME_GENERATOR_(CaseName)
+
+# define TYPED_TEST(CaseName, TestName)                                       \
+  template <typename gtest_TypeParam_>                                        \
+  class GTEST_TEST_CLASS_NAME_(CaseName, TestName)                            \
+      : public CaseName<gtest_TypeParam_> {                                   \
+   private:                                                                   \
+    typedef CaseName<gtest_TypeParam_> TestFixture;                           \
+    typedef gtest_TypeParam_ TypeParam;                                       \
+    virtual void TestBody();                                                  \
+  };                                                                          \
+  static bool gtest_##CaseName##_##TestName##_registered_                     \
+        GTEST_ATTRIBUTE_UNUSED_ =                                             \
+      ::testing::internal::TypeParameterizedTest<                             \
+          CaseName,                                                           \
+          ::testing::internal::TemplateSel<GTEST_TEST_CLASS_NAME_(CaseName,   \
+                                                                  TestName)>, \
+          GTEST_TYPE_PARAMS_(                                                 \
+              CaseName)>::Register("",                                        \
+                                   ::testing::internal::CodeLocation(         \
+                                       __FILE__, __LINE__),                   \
+                                   #CaseName, #TestName, 0,                   \
+                                   ::testing::internal::GenerateNames<        \
+                                       GTEST_NAME_GENERATOR_(CaseName),       \
+                                       GTEST_TYPE_PARAMS_(CaseName)>());      \
+  template <typename gtest_TypeParam_>                                        \
+  void GTEST_TEST_CLASS_NAME_(CaseName,                                       \
+                              TestName)<gtest_TypeParam_>::TestBody()
+
+#endif  // GTEST_HAS_TYPED_TEST
+
+// Implements type-parameterized tests.
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the namespace name that the type-parameterized tests for
+// the given type-parameterized test case are defined in.  The exact
+// name of the namespace is subject to change without notice.
+# define GTEST_CASE_NAMESPACE_(TestCaseName) \
+  gtest_case_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the variable used to remember the names of
+// the defined tests in the given test case.
+# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \
+  gtest_typed_test_case_p_state_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Expands to the name of the variable used to remember the names of
+// the registered tests in the given test case.
+# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \
+  gtest_registered_test_names_##TestCaseName##_
+
+// The variables defined in the type-parameterized test macros are
+// static as typically these macros are used in a .h file that can be
+// #included in multiple translation units linked together.
+# define TYPED_TEST_CASE_P(CaseName) \
+  static ::testing::internal::TypedTestCasePState \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName)
+
+# define TYPED_TEST_P(CaseName, TestName) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  template <typename gtest_TypeParam_> \
+  class TestName : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; \
+    typedef gtest_TypeParam_ TypeParam; \
+    virtual void TestBody(); \
+  }; \
+  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\
+          __FILE__, __LINE__, #CaseName, #TestName); \
+  } \
+  template <typename gtest_TypeParam_> \
+  void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody()
+
+# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
+  } \
+  static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) \
+      GTEST_ATTRIBUTE_UNUSED_ = \
+          GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames( \
+              __FILE__, __LINE__, #__VA_ARGS__)
+
+// The 'Types' template argument below must have spaces around it
+// since some compilers may choke on '>>' when passing a template
+// instance (e.g. Types<int>)
+# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types, ...)      \
+  static bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ =       \
+      ::testing::internal::TypeParameterizedTestCase<                     \
+          CaseName, GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_,     \
+          ::testing::internal::TypeList< Types >::type>::                 \
+          Register(#Prefix,                                               \
+                   ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+                   &GTEST_TYPED_TEST_CASE_P_STATE_(CaseName), #CaseName,  \
+                   GTEST_REGISTERED_TEST_NAMES_(CaseName),                \
+                   ::testing::internal::GenerateNames<                    \
+                       ::testing::internal::NameGeneratorSelector<        \
+                           __VA_ARGS__>::type,                            \
+                       ::testing::internal::TypeList< Types >::type>())
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Depending on the platform, different string classes are available.
+// On Linux, in addition to ::std::string, Google also makes use of
+// class ::string, which has the same interface as ::std::string, but
+// has a different implementation.
+//
+// You can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
+// ::string is available AND is a distinct type to ::std::string, or
+// define it to 0 to indicate otherwise.
+//
+// If ::std::string and ::string are the same class on your platform
+// due to aliasing, you should define GTEST_HAS_GLOBAL_STRING to 0.
+//
+// If you do not define GTEST_HAS_GLOBAL_STRING, it is defined
+// heuristically.
+
+namespace testing {
+
+// Silence C4100 (unreferenced formal parameter) and 4805
+// unsafe mix of type 'const int' and type 'const bool'
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable:4805)
+# pragma warning(disable:4100)
+#endif
+
+
+// Declares the flags.
+
+// This flag temporary enables the disabled tests.
+GTEST_DECLARE_bool_(also_run_disabled_tests);
+
+// This flag brings the debugger on an assertion failure.
+GTEST_DECLARE_bool_(break_on_failure);
+
+// This flag controls whether Google Test catches all test-thrown exceptions
+// and logs them as failures.
+GTEST_DECLARE_bool_(catch_exceptions);
+
+// This flag enables using colors in terminal output. Available values are
+// "yes" to enable colors, "no" (disable colors), or "auto" (the default)
+// to let Google Test decide.
+GTEST_DECLARE_string_(color);
+
+// This flag sets up the filter to select by name using a glob pattern
+// the tests to run. If the filter is not given all tests are executed.
+GTEST_DECLARE_string_(filter);
+
+// This flag controls whether Google Test installs a signal handler that dumps
+// debugging information when fatal signals are raised.
+GTEST_DECLARE_bool_(install_failure_signal_handler);
+
+// This flag causes the Google Test to list tests. None of the tests listed
+// are actually run if the flag is provided.
+GTEST_DECLARE_bool_(list_tests);
+
+// This flag controls whether Google Test emits a detailed XML report to a file
+// in addition to its normal textual output.
+GTEST_DECLARE_string_(output);
+
+// This flags control whether Google Test prints the elapsed time for each
+// test.
+GTEST_DECLARE_bool_(print_time);
+
+// This flags control whether Google Test prints UTF8 characters as text.
+GTEST_DECLARE_bool_(print_utf8);
+
+// This flag specifies the random number seed.
+GTEST_DECLARE_int32_(random_seed);
+
+// This flag sets how many times the tests are repeated. The default value
+// is 1. If the value is -1 the tests are repeating forever.
+GTEST_DECLARE_int32_(repeat);
+
+// This flag controls whether Google Test includes Google Test internal
+// stack frames in failure stack traces.
+GTEST_DECLARE_bool_(show_internal_stack_frames);
+
+// When this flag is specified, tests' order is randomized on every iteration.
+GTEST_DECLARE_bool_(shuffle);
+
+// This flag specifies the maximum number of stack frames to be
+// printed in a failure message.
+GTEST_DECLARE_int32_(stack_trace_depth);
+
+// When this flag is specified, a failed assertion will throw an
+// exception if exceptions are enabled, or exit the program with a
+// non-zero code otherwise. For use with an external test framework.
+GTEST_DECLARE_bool_(throw_on_failure);
+
+// When this flag is set with a "host:port" string, on supported
+// platforms test results are streamed to the specified port on
+// the specified host machine.
+GTEST_DECLARE_string_(stream_result_to);
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+GTEST_DECLARE_string_(flagfile);
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+// The upper limit for valid stack trace depths.
+const int kMaxStackTraceDepth = 100;
+
+namespace internal {
+
+class AssertHelper;
+class DefaultGlobalTestPartResultReporter;
+class ExecDeathTest;
+class NoExecDeathTest;
+class FinalSuccessChecker;
+class GTestFlagSaver;
+class StreamingListenerTest;
+class TestResultAccessor;
+class TestEventListenersAccessor;
+class TestEventRepeater;
+class UnitTestRecordPropertyTestHelper;
+class WindowsDeathTest;
+class FuchsiaDeathTest;
+class UnitTestImpl* GetUnitTestImpl();
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message);
+
+}  // namespace internal
+
+// The friend relationship of some of these classes is cyclic.
+// If we don't forward declare them the compiler might confuse the classes
+// in friendship clauses with same named classes on the scope.
+class Test;
+class TestCase;
+class TestInfo;
+class UnitTest;
+
+// A class for indicating whether an assertion was successful.  When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+//   1. Defining predicate functions to be used with Boolean test assertions
+//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+//   2. Defining predicate-format functions to be
+//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false (5 is odd)
+//   Expected: true
+//
+// instead of a more opaque
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false
+//   Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess() << n << " is even";
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+//   Value of: IsEven(Fib(6))
+//     Actual: true (8 is even)
+//   Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+//   // Verifies that Foo() returns an even number.
+//   EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+//   testing::AssertionResult IsEven(const char* expr, int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure()
+//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
+//   }
+//
+// If Foo() returns 5, you will see the following message:
+//
+//   Expected: Foo() is even
+//     Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+  // Copy constructor.
+  // Used in EXPECT_TRUE/FALSE(assertion_result).
+  AssertionResult(const AssertionResult& other);
+
+#if defined(_MSC_VER) && _MSC_VER < 1910
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+#endif
+
+  // Used in the EXPECT_TRUE/FALSE(bool_expression).
+  //
+  // T must be contextually convertible to bool.
+  //
+  // The second parameter prevents this overload from being considered if
+  // the argument is implicitly convertible to AssertionResult. In that case
+  // we want AssertionResult's copy constructor to be used.
+  template <typename T>
+  explicit AssertionResult(
+      const T& success,
+      typename internal::EnableIf<
+          !internal::ImplicitlyConvertible<T, AssertionResult>::value>::type*
+          /*enabler*/ = NULL)
+      : success_(success) {}
+
+#if defined(_MSC_VER) && _MSC_VER < 1910
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+  // Assignment operator.
+  AssertionResult& operator=(AssertionResult other) {
+    swap(other);
+    return *this;
+  }
+
+  // Returns true iff the assertion succeeded.
+  operator bool() const { return success_; }  // NOLINT
+
+  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+  AssertionResult operator!() const;
+
+  // Returns the text streamed into this AssertionResult. Test assertions
+  // use it when they fail (i.e., the predicate's outcome doesn't match the
+  // assertion's expectation). When nothing has been streamed into the
+  // object, returns an empty string.
+  const char* message() const {
+    return message_.get() != NULL ?  message_->c_str() : "";
+  }
+  // FIXME: Remove this after making sure no clients use it.
+  // Deprecated; please use message() instead.
+  const char* failure_message() const { return message(); }
+
+  // Streams a custom failure message into this object.
+  template <typename T> AssertionResult& operator<<(const T& value) {
+    AppendMessage(Message() << value);
+    return *this;
+  }
+
+  // Allows streaming basic output manipulators such as endl or flush into
+  // this object.
+  AssertionResult& operator<<(
+      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+    AppendMessage(Message() << basic_manipulator);
+    return *this;
+  }
+
+ private:
+  // Appends the contents of message to message_.
+  void AppendMessage(const Message& a_message) {
+    if (message_.get() == NULL)
+      message_.reset(new ::std::string);
+    message_->append(a_message.GetString().c_str());
+  }
+
+  // Swap the contents of this AssertionResult with other.
+  void swap(AssertionResult& other);
+
+  // Stores result of the assertion predicate.
+  bool success_;
+  // Stores the message describing the condition in case the expectation
+  // construct is not satisfied with the predicate's outcome.
+  // Referenced via a pointer to avoid taking too much stack frame space
+  // with test assertions.
+  internal::scoped_ptr< ::std::string> message_;
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+}  // namespace testing
+
+// Includes the auto-generated header that implements a family of generic
+// predicate assertion macros. This include comes late because it relies on
+// APIs declared above.
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is AUTOMATICALLY GENERATED on 01/02/2018 by command
+// 'gen_gtest_pred_impl.py 5'.  DO NOT EDIT BY HAND!
+//
+// Implements a family of generic predicate assertion macros.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+
+namespace testing {
+
+// This header implements a family of generic predicate assertion
+// macros:
+//
+//   ASSERT_PRED_FORMAT1(pred_format, v1)
+//   ASSERT_PRED_FORMAT2(pred_format, v1, v2)
+//   ...
+//
+// where pred_format is a function or functor that takes n (in the
+// case of ASSERT_PRED_FORMATn) values and their source expression
+// text, and returns a testing::AssertionResult.  See the definition
+// of ASSERT_EQ in gtest.h for an example.
+//
+// If you don't care about formatting, you can use the more
+// restrictive version:
+//
+//   ASSERT_PRED1(pred, v1)
+//   ASSERT_PRED2(pred, v1, v2)
+//   ...
+//
+// where pred is an n-ary function or functor that returns bool,
+// and the values v1, v2, ..., must support the << operator for
+// streaming to std::ostream.
+//
+// We also define the EXPECT_* variations.
+//
+// For now we only support predicates whose arity is at most 5.
+
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce.  Don't use this in your code.
+
+#define GTEST_ASSERT_(expression, on_failure) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar = (expression)) \
+    ; \
+  else \
+    on_failure(gtest_ar.failure_message())
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED1.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text,
+                                  const char* e1,
+                                  Pred pred,
+                                  const T1& v1) {
+  if (pred(v1)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, v1), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1.  Don't use
+// this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
+                                             #v1, \
+                                             pred, \
+                                             v1), on_failure)
+
+// Unary predicate assertion macros.
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED2.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2) {
+  if (pred(v1, v2)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2.  Don't use
+// this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             pred, \
+                                             v1, \
+                                             v2), on_failure)
+
+// Binary predicate assertion macros.
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED3.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3) {
+  if (pred(v1, v2, v3)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3.  Don't use
+// this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3), on_failure)
+
+// Ternary predicate assertion macros.
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED4.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4) {
+  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4.  Don't use
+// this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4), on_failure)
+
+// 4-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED5.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  const char* e5,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4,
+                                  const T5& v5) {
+  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ", "
+                            << e5 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4
+                            << "\n" << e5 << " evaluates to " << v5;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5.  Don't use
+// this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             #v5, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4, \
+                                             v5), on_failure)
+
+// 5-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+namespace testing {
+
+// The abstract class that all tests inherit from.
+//
+// In Google Test, a unit test program contains one or many TestCases, and
+// each TestCase contains one or many Tests.
+//
+// When you define a test using the TEST macro, you don't need to
+// explicitly derive from Test - the TEST macro automatically does
+// this for you.
+//
+// The only time you derive from Test is when defining a test fixture
+// to be used in a TEST_F.  For example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     void SetUp() override { ... }
+//     void TearDown() override { ... }
+//     ...
+//   };
+//
+//   TEST_F(FooTest, Bar) { ... }
+//   TEST_F(FooTest, Baz) { ... }
+//
+// Test is not copyable.
+class GTEST_API_ Test {
+ public:
+  friend class TestInfo;
+
+  // Defines types for pointers to functions that set up and tear down
+  // a test case.
+  typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc;
+  typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc;
+
+  // The d'tor is virtual as we intend to inherit from Test.
+  virtual ~Test();
+
+  // Sets up the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::SetUpTestCase() before running the first
+  // test in test case Foo.  Hence a sub-class can define its own
+  // SetUpTestCase() method to shadow the one defined in the super
+  // class.
+  static void SetUpTestCase() {}
+
+  // Tears down the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::TearDownTestCase() after running the last
+  // test in test case Foo.  Hence a sub-class can define its own
+  // TearDownTestCase() method to shadow the one defined in the super
+  // class.
+  static void TearDownTestCase() {}
+
+  // Returns true iff the current test has a fatal failure.
+  static bool HasFatalFailure();
+
+  // Returns true iff the current test has a non-fatal failure.
+  static bool HasNonfatalFailure();
+
+  // Returns true iff the current test has a (either fatal or
+  // non-fatal) failure.
+  static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
+
+  // Logs a property for the current test, test case, or for the entire
+  // invocation of the test program when used outside of the context of a
+  // test case.  Only the last value for a given key is remembered.  These
+  // are public static so they can be called from utility functions that are
+  // not members of the test fixture.  Calls to RecordProperty made during
+  // lifespan of the test (from the moment its constructor starts to the
+  // moment its destructor finishes) will be output in XML as attributes of
+  // the <testcase> element.  Properties recorded from fixture's
+  // SetUpTestCase or TearDownTestCase are logged as attributes of the
+  // corresponding <testsuite> element.  Calls to RecordProperty made in the
+  // global context (before or after invocation of RUN_ALL_TESTS and from
+  // SetUp/TearDown method of Environment objects registered with Google
+  // Test) will be output as attributes of the <testsuites> element.
+  static void RecordProperty(const std::string& key, const std::string& value);
+  static void RecordProperty(const std::string& key, int value);
+
+ protected:
+  // Creates a Test object.
+  Test();
+
+  // Sets up the test fixture.
+  virtual void SetUp();
+
+  // Tears down the test fixture.
+  virtual void TearDown();
+
+ private:
+  // Returns true iff the current test has the same fixture class as
+  // the first test in the current test case.
+  static bool HasSameFixtureClass();
+
+  // Runs the test after the test fixture has been set up.
+  //
+  // A sub-class must implement this to define the test logic.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM.
+  // Instead, use the TEST or TEST_F macro.
+  virtual void TestBody() = 0;
+
+  // Sets up, executes, and tears down the test.
+  void Run();
+
+  // Deletes self.  We deliberately pick an unusual name for this
+  // internal method to avoid clashing with names used in user TESTs.
+  void DeleteSelf_() { delete this; }
+
+  const internal::scoped_ptr< GTEST_FLAG_SAVER_ > gtest_flag_saver_;
+
+  // Often a user misspells SetUp() as Setup() and spends a long time
+  // wondering why it is never called by Google Test.  The declaration of
+  // the following method is solely for catching such an error at
+  // compile time:
+  //
+  //   - The return type is deliberately chosen to be not void, so it
+  //   will be a conflict if void Setup() is declared in the user's
+  //   test fixture.
+  //
+  //   - This method is private, so it will be another compiler error
+  //   if the method is called from the user's test fixture.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION.
+  //
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+
+  // We disallow copying Tests.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+};
+
+typedef internal::TimeInMillis TimeInMillis;
+
+// A copyable object representing a user specified test property which can be
+// output as a key/value string pair.
+//
+// Don't inherit from TestProperty as its destructor is not virtual.
+class TestProperty {
+ public:
+  // C'tor.  TestProperty does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestProperty object.
+  TestProperty(const std::string& a_key, const std::string& a_value) :
+    key_(a_key), value_(a_value) {
+  }
+
+  // Gets the user supplied key.
+  const char* key() const {
+    return key_.c_str();
+  }
+
+  // Gets the user supplied value.
+  const char* value() const {
+    return value_.c_str();
+  }
+
+  // Sets a new value, overriding the one supplied in the constructor.
+  void SetValue(const std::string& new_value) {
+    value_ = new_value;
+  }
+
+ private:
+  // The key supplied by the user.
+  std::string key_;
+  // The value supplied by the user.
+  std::string value_;
+};
+
+// The result of a single Test.  This includes a list of
+// TestPartResults, a list of TestProperties, a count of how many
+// death tests there are in the Test, and how much time it took to run
+// the Test.
+//
+// TestResult is not copyable.
+class GTEST_API_ TestResult {
+ public:
+  // Creates an empty TestResult.
+  TestResult();
+
+  // D'tor.  Do not inherit from TestResult.
+  ~TestResult();
+
+  // Gets the number of all test parts.  This is the sum of the number
+  // of successful test parts and the number of failed test parts.
+  int total_part_count() const;
+
+  // Returns the number of the test properties.
+  int test_property_count() const;
+
+  // Returns true iff the test passed (i.e. no test part failed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test failed.
+  bool Failed() const;
+
+  // Returns true iff the test fatally failed.
+  bool HasFatalFailure() const;
+
+  // Returns true iff the test has a non-fatal failure.
+  bool HasNonfatalFailure() const;
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test part result among all the results. i can range from 0
+  // to total_part_count() - 1. If i is not in that range, aborts the program.
+  const TestPartResult& GetTestPartResult(int i) const;
+
+  // Returns the i-th test property. i can range from 0 to
+  // test_property_count() - 1. If i is not in that range, aborts the
+  // program.
+  const TestProperty& GetTestProperty(int i) const;
+
+ private:
+  friend class TestInfo;
+  friend class TestCase;
+  friend class UnitTest;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::ExecDeathTest;
+  friend class internal::TestResultAccessor;
+  friend class internal::UnitTestImpl;
+  friend class internal::WindowsDeathTest;
+  friend class internal::FuchsiaDeathTest;
+
+  // Gets the vector of TestPartResults.
+  const std::vector<TestPartResult>& test_part_results() const {
+    return test_part_results_;
+  }
+
+  // Gets the vector of TestProperties.
+  const std::vector<TestProperty>& test_properties() const {
+    return test_properties_;
+  }
+
+  // Sets the elapsed time.
+  void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+  // Adds a test property to the list. The property is validated and may add
+  // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+  // key names). If a property is already recorded for the same key, the
+  // value will be updated, rather than storing multiple values for the same
+  // key.  xml_element specifies the element for which the property is being
+  // recorded and is used for validation.
+  void RecordProperty(const std::string& xml_element,
+                      const TestProperty& test_property);
+
+  // Adds a failure if the key is a reserved attribute of Google Test
+  // testcase tags.  Returns true if the property is valid.
+  // FIXME: Validate attribute names are legal and human readable.
+  static bool ValidateTestProperty(const std::string& xml_element,
+                                   const TestProperty& test_property);
+
+  // Adds a test part result to the list.
+  void AddTestPartResult(const TestPartResult& test_part_result);
+
+  // Returns the death test count.
+  int death_test_count() const { return death_test_count_; }
+
+  // Increments the death test count, returning the new count.
+  int increment_death_test_count() { return ++death_test_count_; }
+
+  // Clears the test part results.
+  void ClearTestPartResults();
+
+  // Clears the object.
+  void Clear();
+
+  // Protects mutable state of the property vector and of owned
+  // properties, whose values may be updated.
+  internal::Mutex test_properites_mutex_;
+
+  // The vector of TestPartResults
+  std::vector<TestPartResult> test_part_results_;
+  // The vector of TestProperties
+  std::vector<TestProperty> test_properties_;
+  // Running count of death tests.
+  int death_test_count_;
+  // The elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+  // We disallow copying TestResult.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+};  // class TestResult
+
+// A TestInfo object stores the following information about a test:
+//
+//   Test case name
+//   Test name
+//   Whether the test should be run
+//   A function pointer that creates the test object when invoked
+//   Test result
+//
+// The constructor of TestInfo registers itself with the UnitTest
+// singleton such that the RUN_ALL_TESTS() macro knows which tests to
+// run.
+class GTEST_API_ TestInfo {
+ public:
+  // Destructs a TestInfo object.  This function is not virtual, so
+  // don't inherit from TestInfo.
+  ~TestInfo();
+
+  // Returns the test case name.
+  const char* test_case_name() const { return test_case_name_.c_str(); }
+
+  // Returns the test name.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a typed
+  // or a type-parameterized test.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns the text representation of the value parameter, or NULL if this
+  // is not a value-parameterized test.
+  const char* value_param() const {
+    if (value_param_.get() != NULL)
+      return value_param_->c_str();
+    return NULL;
+  }
+
+  // Returns the file name where this test is defined.
+  const char* file() const { return location_.file.c_str(); }
+
+  // Returns the line where this test is defined.
+  int line() const { return location_.line; }
+
+  // Return true if this test should not be run because it's in another shard.
+  bool is_in_another_shard() const { return is_in_another_shard_; }
+
+  // Returns true if this test should run, that is if the test is not
+  // disabled (or it is disabled but the also_run_disabled_tests flag has
+  // been specified) and its full name matches the user-specified filter.
+  //
+  // Google Test allows the user to filter the tests by their full names.
+  // The full name of a test Bar in test case Foo is defined as
+  // "Foo.Bar".  Only the tests that match the filter will run.
+  //
+  // A filter is a colon-separated list of glob (not regex) patterns,
+  // optionally followed by a '-' and a colon-separated list of
+  // negative patterns (tests to exclude).  A test is run if it
+  // matches one of the positive patterns and does not match any of
+  // the negative patterns.
+  //
+  // For example, *A*:Foo.* is a filter that matches any string that
+  // contains the character 'A' or starts with "Foo.".
+  bool should_run() const { return should_run_; }
+
+  // Returns true iff this test will appear in the XML report.
+  bool is_reportable() const {
+    // The XML report includes tests matching the filter, excluding those
+    // run in other shards.
+    return matches_filter_ && !is_in_another_shard_;
+  }
+
+  // Returns the result of the test.
+  const TestResult* result() const { return &result_; }
+
+ private:
+#if GTEST_HAS_DEATH_TEST
+  friend class internal::DefaultDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+  friend class Test;
+  friend class TestCase;
+  friend class internal::UnitTestImpl;
+  friend class internal::StreamingListenerTest;
+  friend TestInfo* internal::MakeAndRegisterTestInfo(
+      const char* test_case_name,
+      const char* name,
+      const char* type_param,
+      const char* value_param,
+      internal::CodeLocation code_location,
+      internal::TypeId fixture_class_id,
+      Test::SetUpTestCaseFunc set_up_tc,
+      Test::TearDownTestCaseFunc tear_down_tc,
+      internal::TestFactoryBase* factory);
+
+  // Constructs a TestInfo object. The newly constructed instance assumes
+  // ownership of the factory object.
+  TestInfo(const std::string& test_case_name,
+           const std::string& name,
+           const char* a_type_param,   // NULL if not a type-parameterized test
+           const char* a_value_param,  // NULL if not a value-parameterized test
+           internal::CodeLocation a_code_location,
+           internal::TypeId fixture_class_id,
+           internal::TestFactoryBase* factory);
+
+  // Increments the number of death tests encountered in this test so
+  // far.
+  int increment_death_test_count() {
+    return result_.increment_death_test_count();
+  }
+
+  // Creates the test object, runs it, records its result, and then
+  // deletes it.
+  void Run();
+
+  static void ClearTestResult(TestInfo* test_info) {
+    test_info->result_.Clear();
+  }
+
+  // These fields are immutable properties of the test.
+  const std::string test_case_name_;     // Test case name
+  const std::string name_;               // Test name
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // Text representation of the value parameter, or NULL if this is not a
+  // value-parameterized test.
+  const internal::scoped_ptr<const ::std::string> value_param_;
+  internal::CodeLocation location_;
+  const internal::TypeId fixture_class_id_;   // ID of the test fixture class
+  bool should_run_;                 // True iff this test should run
+  bool is_disabled_;                // True iff this test is disabled
+  bool matches_filter_;             // True if this test matches the
+                                    // user-specified filter.
+  bool is_in_another_shard_;        // Will be run in another shard.
+  internal::TestFactoryBase* const factory_;  // The factory that creates
+                                              // the test object
+
+  // This field is mutable and needs to be reset before running the
+  // test for the second time.
+  TestResult result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+};
+
+// A test case, which consists of a vector of TestInfos.
+//
+// TestCase is not copyable.
+class GTEST_API_ TestCase {
+ public:
+  // Creates a TestCase with the given name.
+  //
+  // TestCase does NOT have a default constructor.  Always use this
+  // constructor to create a TestCase object.
+  //
+  // Arguments:
+  //
+  //   name:         name of the test case
+  //   a_type_param: the name of the test's type parameter, or NULL if
+  //                 this is not a type-parameterized test.
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  TestCase(const char* name, const char* a_type_param,
+           Test::SetUpTestCaseFunc set_up_tc,
+           Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Destructor of TestCase.
+  virtual ~TestCase();
+
+  // Gets the name of the TestCase.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a
+  // type-parameterized test case.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns true if any test in this test case should run.
+  bool should_run() const { return should_run_; }
+
+  // Gets the number of successful tests in this test case.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests in this test case.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests in this test case.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Get the number of tests in this test case that should run.
+  int test_to_run_count() const;
+
+  // Gets the number of all tests in this test case.
+  int total_test_count() const;
+
+  // Returns true iff the test case passed.
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test case failed.
+  bool Failed() const { return failed_test_count() > 0; }
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  const TestInfo* GetTestInfo(int i) const;
+
+  // Returns the TestResult that holds test properties recorded during
+  // execution of SetUpTestCase and TearDownTestCase.
+  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+  friend class Test;
+  friend class internal::UnitTestImpl;
+
+  // Gets the (mutable) vector of TestInfos in this TestCase.
+  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
+
+  // Gets the (immutable) vector of TestInfos in this TestCase.
+  const std::vector<TestInfo*>& test_info_list() const {
+    return test_info_list_;
+  }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  TestInfo* GetMutableTestInfo(int i);
+
+  // Sets the should_run member.
+  void set_should_run(bool should) { should_run_ = should; }
+
+  // Adds a TestInfo to this test case.  Will delete the TestInfo upon
+  // destruction of the TestCase object.
+  void AddTestInfo(TestInfo * test_info);
+
+  // Clears the results of all tests in this test case.
+  void ClearResult();
+
+  // Clears the results of all tests in the given test case.
+  static void ClearTestCaseResult(TestCase* test_case) {
+    test_case->ClearResult();
+  }
+
+  // Runs every test in this TestCase.
+  void Run();
+
+  // Runs SetUpTestCase() for this TestCase.  This wrapper is needed
+  // for catching exceptions thrown from SetUpTestCase().
+  void RunSetUpTestCase() { (*set_up_tc_)(); }
+
+  // Runs TearDownTestCase() for this TestCase.  This wrapper is
+  // needed for catching exceptions thrown from TearDownTestCase().
+  void RunTearDownTestCase() { (*tear_down_tc_)(); }
+
+  // Returns true iff test passed.
+  static bool TestPassed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Passed();
+  }
+
+  // Returns true iff test failed.
+  static bool TestFailed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Failed();
+  }
+
+  // Returns true iff the test is disabled and will be reported in the XML
+  // report.
+  static bool TestReportableDisabled(const TestInfo* test_info) {
+    return test_info->is_reportable() && test_info->is_disabled_;
+  }
+
+  // Returns true iff test is disabled.
+  static bool TestDisabled(const TestInfo* test_info) {
+    return test_info->is_disabled_;
+  }
+
+  // Returns true iff this test will appear in the XML report.
+  static bool TestReportable(const TestInfo* test_info) {
+    return test_info->is_reportable();
+  }
+
+  // Returns true if the given test should run.
+  static bool ShouldRunTest(const TestInfo* test_info) {
+    return test_info->should_run();
+  }
+
+  // Shuffles the tests in this test case.
+  void ShuffleTests(internal::Random* random);
+
+  // Restores the test order to before the first shuffle.
+  void UnshuffleTests();
+
+  // Name of the test case.
+  std::string name_;
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // The vector of TestInfos in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestInfo*> test_info_list_;
+  // Provides a level of indirection for the test list to allow easy
+  // shuffling and restoring the test order.  The i-th element in this
+  // vector is the index of the i-th test in the shuffled test list.
+  std::vector<int> test_indices_;
+  // Pointer to the function that sets up the test case.
+  Test::SetUpTestCaseFunc set_up_tc_;
+  // Pointer to the function that tears down the test case.
+  Test::TearDownTestCaseFunc tear_down_tc_;
+  // True iff any test in this test case should run.
+  bool should_run_;
+  // Elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+  // Holds test properties recorded during execution of SetUpTestCase and
+  // TearDownTestCase.
+  TestResult ad_hoc_test_result_;
+
+  // We disallow copying TestCases.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase);
+};
+
+// An Environment object is capable of setting up and tearing down an
+// environment.  You should subclass this to define your own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+//   1. You cannot safely throw from a destructor.  This is a problem
+//      as in some cases Google Test is used where exceptions are enabled, and
+//      we may want to implement ASSERT_* using exceptions where they are
+//      available.
+//   2. You cannot use ASSERT_* directly in a constructor or
+//      destructor.
+class Environment {
+ public:
+  // The d'tor is virtual as we need to subclass Environment.
+  virtual ~Environment() {}
+
+  // Override this to define how to set up the environment.
+  virtual void SetUp() {}
+
+  // Override this to define how to tear down the environment.
+  virtual void TearDown() {}
+ private:
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+};
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Exception which can be thrown from TestEventListener::OnTestPartResult.
+class GTEST_API_ AssertionException
+    : public internal::GoogleTestFailureException {
+ public:
+  explicit AssertionException(const TestPartResult& result)
+      : GoogleTestFailureException(result) {}
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// The interface for tracing execution of tests. The methods are organized in
+// the order the corresponding events are fired.
+class TestEventListener {
+ public:
+  virtual ~TestEventListener() {}
+
+  // Fired before any test activity starts.
+  virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
+
+  // Fired before each iteration of tests starts.  There may be more than
+  // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration
+  // index, starting from 0.
+  virtual void OnTestIterationStart(const UnitTest& unit_test,
+                                    int iteration) = 0;
+
+  // Fired before environment set-up for each iteration of tests starts.
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment set-up for each iteration of tests ends.
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
+
+  // Fired before the test case starts.
+  virtual void OnTestCaseStart(const TestCase& test_case) = 0;
+
+  // Fired before the test starts.
+  virtual void OnTestStart(const TestInfo& test_info) = 0;
+
+  // Fired after a failed assertion or a SUCCEED() invocation.
+  // If you want to throw an exception from this function to skip to the next
+  // TEST, it must be AssertionException defined above, or inherited from it.
+  virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
+
+  // Fired after the test ends.
+  virtual void OnTestEnd(const TestInfo& test_info) = 0;
+
+  // Fired after the test case ends.
+  virtual void OnTestCaseEnd(const TestCase& test_case) = 0;
+
+  // Fired before environment tear-down for each iteration of tests starts.
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment tear-down for each iteration of tests ends.
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
+
+  // Fired after each iteration of tests finishes.
+  virtual void OnTestIterationEnd(const UnitTest& unit_test,
+                                  int iteration) = 0;
+
+  // Fired after all test activities have ended.
+  virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
+};
+
+// The convenience class for users who need to override just one or two
+// methods and are not concerned that a possible change to a signature of
+// the methods they override will not be caught during the build.  For
+// comments about each method please see the definition of TestEventListener
+// above.
+class EmptyTestEventListener : public TestEventListener {
+ public:
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                                    int /*iteration*/) {}
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
+  virtual void OnTestStart(const TestInfo& /*test_info*/) {}
+  virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {}
+  virtual void OnTestEnd(const TestInfo& /*test_info*/) {}
+  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/,
+                                  int /*iteration*/) {}
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+};
+
+// TestEventListeners lets users add listeners to track events in Google Test.
+class GTEST_API_ TestEventListeners {
+ public:
+  TestEventListeners();
+  ~TestEventListeners();
+
+  // Appends an event listener to the end of the list. Google Test assumes
+  // the ownership of the listener (i.e. it will delete the listener when
+  // the test program finishes).
+  void Append(TestEventListener* listener);
+
+  // Removes the given event listener from the list and returns it.  It then
+  // becomes the caller's responsibility to delete the listener. Returns
+  // NULL if the listener is not found in the list.
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Returns the standard listener responsible for the default console
+  // output.  Can be removed from the listeners list to shut down default
+  // console output.  Note that removing this object from the listener list
+  // with Release transfers its ownership to the caller and makes this
+  // function return NULL the next time.
+  TestEventListener* default_result_printer() const {
+    return default_result_printer_;
+  }
+
+  // Returns the standard listener responsible for the default XML output
+  // controlled by the --gtest_output=xml flag.  Can be removed from the
+  // listeners list by users who want to shut down the default XML output
+  // controlled by this flag and substitute it with custom one.  Note that
+  // removing this object from the listener list with Release transfers its
+  // ownership to the caller and makes this function return NULL the next
+  // time.
+  TestEventListener* default_xml_generator() const {
+    return default_xml_generator_;
+  }
+
+ private:
+  friend class TestCase;
+  friend class TestInfo;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::NoExecDeathTest;
+  friend class internal::TestEventListenersAccessor;
+  friend class internal::UnitTestImpl;
+
+  // Returns repeater that broadcasts the TestEventListener events to all
+  // subscribers.
+  TestEventListener* repeater();
+
+  // Sets the default_result_printer attribute to the provided listener.
+  // The listener is also added to the listener list and previous
+  // default_result_printer is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultResultPrinter(TestEventListener* listener);
+
+  // Sets the default_xml_generator attribute to the provided listener.  The
+  // listener is also added to the listener list and previous
+  // default_xml_generator is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultXmlGenerator(TestEventListener* listener);
+
+  // Controls whether events will be forwarded by the repeater to the
+  // listeners in the list.
+  bool EventForwardingEnabled() const;
+  void SuppressEventForwarding();
+
+  // The actual list of listeners.
+  internal::TestEventRepeater* repeater_;
+  // Listener responsible for the standard result output.
+  TestEventListener* default_result_printer_;
+  // Listener responsible for the creation of the XML output file.
+  TestEventListener* default_xml_generator_;
+
+  // We disallow copying TestEventListeners.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+};
+
+// A UnitTest consists of a vector of TestCases.
+//
+// This is a singleton class.  The only instance of UnitTest is
+// created when UnitTest::GetInstance() is first called.  This
+// instance is never deleted.
+//
+// UnitTest is not copyable.
+//
+// This class is thread-safe as long as the methods are called
+// according to their specification.
+class GTEST_API_ UnitTest {
+ public:
+  // Gets the singleton UnitTest object.  The first time this method
+  // is called, a UnitTest object is constructed and returned.
+  // Consecutive calls will return the same object.
+  static UnitTest* GetInstance();
+
+  // Runs all tests in this UnitTest object and prints the result.
+  // Returns 0 if successful, or 1 otherwise.
+  //
+  // This method can only be called from the main thread.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  int Run() GTEST_MUST_USE_RESULT_;
+
+  // Returns the working directory when the first TEST() or TEST_F()
+  // was executed.  The UnitTest object owns the string.
+  const char* original_working_dir() const;
+
+  // Returns the TestCase object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestCase* current_test_case() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the TestInfo object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestInfo* current_test_info() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the random seed used at the start of the current test run.
+  int random_seed() const;
+
+  // Returns the ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const;
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const;
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const;
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const;
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const;
+
+  // Returns the TestResult containing information on test failures and
+  // properties logged outside of individual test cases.
+  const TestResult& ad_hoc_test_result() const;
+
+  // Returns the list of event listeners that can be used to track events
+  // inside Google Test.
+  TestEventListeners& listeners();
+
+ private:
+  // Registers and returns a global test environment.  When a test
+  // program is run, all global test environments will be set-up in
+  // the order they were registered.  After all tests in the program
+  // have finished, all global test environments will be torn-down in
+  // the *reverse* order they were registered.
+  //
+  // The UnitTest object takes ownership of the given environment.
+  //
+  // This method can only be called from the main thread.
+  Environment* AddEnvironment(Environment* env);
+
+  // Adds a TestPartResult to the current TestResult object.  All
+  // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
+  // eventually call this to report their results.  The user code
+  // should use the assertion macros instead of calling this directly.
+  void AddTestPartResult(TestPartResult::Type result_type,
+                         const char* file_name,
+                         int line_number,
+                         const std::string& message,
+                         const std::string& os_stack_trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Adds a TestProperty to the current TestResult object when invoked from
+  // inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+  // from SetUpTestCase or TearDownTestCase, or to the global property set
+  // when invoked elsewhere.  If the result already contains a property with
+  // the same key, the value will be updated.
+  void RecordProperty(const std::string& key, const std::string& value);
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  TestCase* GetMutableTestCase(int i);
+
+  // Accessors for the implementation object.
+  internal::UnitTestImpl* impl() { return impl_; }
+  const internal::UnitTestImpl* impl() const { return impl_; }
+
+  // These classes and functions are friends as they need to access private
+  // members of UnitTest.
+  friend class ScopedTrace;
+  friend class Test;
+  friend class internal::AssertHelper;
+  friend class internal::StreamingListenerTest;
+  friend class internal::UnitTestRecordPropertyTestHelper;
+  friend Environment* AddGlobalTestEnvironment(Environment* env);
+  friend internal::UnitTestImpl* internal::GetUnitTestImpl();
+  friend void internal::ReportFailureInUnknownLocation(
+      TestPartResult::Type result_type,
+      const std::string& message);
+
+  // Creates an empty UnitTest.
+  UnitTest();
+
+  // D'tor
+  virtual ~UnitTest();
+
+  // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+  // Google Test trace stack.
+  void PushGTestTrace(const internal::TraceInfo& trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Pops a trace from the per-thread Google Test trace stack.
+  void PopGTestTrace()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Protects mutable state in *impl_.  This is mutable as some const
+  // methods need to lock it too.
+  mutable internal::Mutex mutex_;
+
+  // Opaque implementation object.  This field is never changed once
+  // the object is constructed.  We don't mark it as const here, as
+  // doing so will cause a warning in the constructor of UnitTest.
+  // Mutable state in *impl_ is protected by mutex_.
+  internal::UnitTestImpl* impl_;
+
+  // We disallow copying UnitTest.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+};
+
+// A convenient wrapper for adding an environment for the test
+// program.
+//
+// You should call this before RUN_ALL_TESTS() is called, probably in
+// main().  If you use gtest_main, you need to call this before main()
+// starts for it to take effect.  For example, you can define a global
+// variable like this:
+//
+//   testing::Environment* const foo_env =
+//       testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// However, we strongly recommend you to write your own main() and
+// call AddGlobalTestEnvironment() there, as relying on initialization
+// of global variables makes the code harder to read and may cause
+// problems when you register multiple environments from different
+// translation units and the environments have dependencies among them
+// (remember that the compiler doesn't guarantee the order in which
+// global variables from different translation units are initialized).
+inline Environment* AddGlobalTestEnvironment(Environment* env) {
+  return UnitTest::GetInstance()->AddEnvironment(env);
+}
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
+
+namespace internal {
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers
+// when calling EXPECT_* in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQFailure(const char* lhs_expression,
+                                   const char* rhs_expression,
+                                   const T1& lhs, const T2& rhs) {
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   FormatForComparisonFailureMessage(lhs, rhs),
+                   FormatForComparisonFailureMessage(rhs, lhs),
+                   false);
+}
+
+// The helper function for {ASSERT|EXPECT}_EQ.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char* lhs_expression,
+                            const char* rhs_expression,
+                            const T1& lhs,
+                            const T2& rhs) {
+  if (lhs == rhs) {
+    return AssertionSuccess();
+  }
+
+  return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
+}
+
+// With this overloaded version, we allow anonymous enums to be used
+// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
+// can be implicitly cast to BiggestInt.
+GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression,
+                                       const char* rhs_expression,
+                                       BiggestInt lhs,
+                                       BiggestInt rhs);
+
+// The helper class for {ASSERT|EXPECT}_EQ.  The template argument
+// lhs_is_null_literal is true iff the first argument to ASSERT_EQ()
+// is a null pointer literal.  The following default implementation is
+// for lhs_is_null_literal being false.
+template <bool lhs_is_null_literal>
+class EqHelper {
+ public:
+  // This templatized version is for the general case.
+  template <typename T1, typename T2>
+  static AssertionResult Compare(const char* lhs_expression,
+                                 const char* rhs_expression,
+                                 const T1& lhs,
+                                 const T2& rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+
+  // With this overloaded version, we allow anonymous enums to be used
+  // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+  // enums can be implicitly cast to BiggestInt.
+  //
+  // Even though its body looks the same as the above version, we
+  // cannot merge the two, as it will make anonymous enums unhappy.
+  static AssertionResult Compare(const char* lhs_expression,
+                                 const char* rhs_expression,
+                                 BiggestInt lhs,
+                                 BiggestInt rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+};
+
+// This specialization is used when the first argument to ASSERT_EQ()
+// is a null pointer literal, like NULL, false, or 0.
+template <>
+class EqHelper<true> {
+ public:
+  // We define two overloaded versions of Compare().  The first
+  // version will be picked when the second argument to ASSERT_EQ() is
+  // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or
+  // EXPECT_EQ(false, a_bool).
+  template <typename T1, typename T2>
+  static AssertionResult Compare(
+      const char* lhs_expression,
+      const char* rhs_expression,
+      const T1& lhs,
+      const T2& rhs,
+      // The following line prevents this overload from being considered if T2
+      // is not a pointer type.  We need this because ASSERT_EQ(NULL, my_ptr)
+      // expands to Compare("", "", NULL, my_ptr), which requires a conversion
+      // to match the Secret* in the other overload, which would otherwise make
+      // this template match better.
+      typename EnableIf<!is_pointer<T2>::value>::type* = 0) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+
+  // This version will be picked when the second argument to ASSERT_EQ() is a
+  // pointer, e.g. ASSERT_EQ(NULL, a_pointer).
+  template <typename T>
+  static AssertionResult Compare(
+      const char* lhs_expression,
+      const char* rhs_expression,
+      // We used to have a second template parameter instead of Secret*.  That
+      // template parameter would deduce to 'long', making this a better match
+      // than the first overload even without the first overload's EnableIf.
+      // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to
+      // non-pointer argument" (even a deduced integral argument), so the old
+      // implementation caused warnings in user code.
+      Secret* /* lhs (NULL) */,
+      T* rhs) {
+    // We already know that 'lhs' is a null pointer.
+    return CmpHelperEQ(lhs_expression, rhs_expression,
+                       static_cast<T*>(NULL), rhs);
+  }
+};
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers
+// when calling EXPECT_OP in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
+                                   const T1& val1, const T2& val2,
+                                   const char* op) {
+  return AssertionFailure()
+         << "Expected: (" << expr1 << ") " << op << " (" << expr2
+         << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
+         << " vs " << FormatForComparisonFailureMessage(val2, val1);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
+// of similar code.
+//
+// For each templatized helper function, we also define an overloaded
+// version for BiggestInt in order to reduce code bloat and allow
+// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
+// with gcc 4.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+template <typename T1, typename T2>\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   const T1& val1, const T2& val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
+  }\
+}\
+GTEST_API_ AssertionResult CmpHelper##op_name(\
+    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// Implements the helper function for {ASSERT|EXPECT}_NE
+GTEST_IMPL_CMP_HELPER_(NE, !=);
+// Implements the helper function for {ASSERT|EXPECT}_LE
+GTEST_IMPL_CMP_HELPER_(LE, <=);
+// Implements the helper function for {ASSERT|EXPECT}_LT
+GTEST_IMPL_CMP_HELPER_(LT, <);
+// Implements the helper function for {ASSERT|EXPECT}_GE
+GTEST_IMPL_CMP_HELPER_(GE, >=);
+// Implements the helper function for {ASSERT|EXPECT}_GT
+GTEST_IMPL_CMP_HELPER_(GT, >);
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const char* s1,
+                                          const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
+                                              const char* s2_expression,
+                                              const char* s1,
+                                              const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const char* s1,
+                                          const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                              const char* s2_expression,
+                                              const char* s1,
+                                              const char* s2);
+
+
+// Helper function for *_STREQ on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const wchar_t* s1,
+                                          const wchar_t* s2);
+
+// Helper function for *_STRNE on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const wchar_t* s1,
+                                          const wchar_t* s2);
+
+}  // namespace internal
+
+// IsSubstring() and IsNotSubstring() are intended to be used as the
+// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
+// themselves.  They check whether needle is a substring of haystack
+// (NULL is considered a substring of itself only), and return an
+// appropriate error message when they fail.
+//
+// The {needle,haystack}_expr arguments are the stringified
+// expressions that generated the two real arguments.
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+// Helper template function for comparing floating-points.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         RawType lhs_value,
+                                         RawType rhs_value) {
+  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
+
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  ::std::stringstream lhs_ss;
+  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << lhs_value;
+
+  ::std::stringstream rhs_ss;
+  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << rhs_value;
+
+  return EqFailure(lhs_expression,
+                   rhs_expression,
+                   StringStreamToString(&lhs_ss),
+                   StringStreamToString(&rhs_ss),
+                   false);
+}
+
+// Helper function for implementing ASSERT_NEAR.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
+                                                const char* expr2,
+                                                const char* abs_error_expr,
+                                                double val1,
+                                                double val2,
+                                                double abs_error);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros
+class GTEST_API_ AssertHelper {
+ public:
+  // Constructor.
+  AssertHelper(TestPartResult::Type type,
+               const char* file,
+               int line,
+               const char* message);
+  ~AssertHelper();
+
+  // Message assignment is a semantic trick to enable assertion
+  // streaming; see the GTEST_MESSAGE_ macro below.
+  void operator=(const Message& message) const;
+
+ private:
+  // We put our data in a struct so that the size of the AssertHelper class can
+  // be as small as possible.  This is important because gcc is incapable of
+  // re-using stack space even for temporary variables, so every EXPECT_EQ
+  // reserves stack space for another AssertHelper.
+  struct AssertHelperData {
+    AssertHelperData(TestPartResult::Type t,
+                     const char* srcfile,
+                     int line_num,
+                     const char* msg)
+        : type(t), file(srcfile), line(line_num), message(msg) { }
+
+    TestPartResult::Type const type;
+    const char* const file;
+    int const line;
+    std::string const message;
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+  };
+
+  AssertHelperData* const data_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+};
+
+}  // namespace internal
+
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface. In most cases that just means inheriting
+// from ::testing::TestWithParam, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+//  protected:
+//   FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual ~FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void SetUp() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void TearDown {
+//     // Can use GetParam() here.
+//   }
+// };
+// TEST_P(FooTest, DoesBar) {
+//   // Can use GetParam() method here.
+//   Foo foo;
+//   ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+  typedef T ParamType;
+  virtual ~WithParamInterface() {}
+
+  // The current parameter value. Is also available in the test fixture's
+  // constructor. This member function is non-static, even though it only
+  // references static data, to reduce the opportunity for incorrect uses
+  // like writing 'WithParamInterface<bool>::GetParam()' for a test that
+  // uses a fixture whose parameter type is int.
+  const ParamType& GetParam() const {
+    GTEST_CHECK_(parameter_ != NULL)
+        << "GetParam() can only be called inside a value-parameterized test "
+        << "-- did you intend to write TEST_P instead of TEST_F?";
+    return *parameter_;
+  }
+
+ private:
+  // Sets parameter value. The caller is responsible for making sure the value
+  // remains alive and unchanged throughout the current test.
+  static void SetParam(const ParamType* parameter) {
+    parameter_ = parameter;
+  }
+
+  // Static value used for accessing parameter during a test lifetime.
+  static const ParamType* parameter_;
+
+  // TestClass must be a subclass of WithParamInterface<T> and Test.
+  template <class TestClass> friend class internal::ParameterizedTestFactory;
+};
+
+template <typename T>
+const T* WithParamInterface<T>::parameter_ = NULL;
+
+// Most value-parameterized classes can ignore the existence of
+// WithParamInterface, and can just inherit from ::testing::TestWithParam.
+
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {
+};
+
+// Macros for indicating success/failure in test code.
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied.  If not,
+// it behaves like ADD_FAILURE.  In particular:
+//
+//   EXPECT_TRUE  verifies that a Boolean condition is true.
+//   EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure.  People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with a generic message.
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// a generic message.
+#define ADD_FAILURE_AT(file, line) \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with a generic message.
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Define this macro to 1 to omit the definition of FAIL(), which is a
+// generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+# define FAIL() GTEST_FAIL()
+#endif
+
+// Generates a success with a generic message.
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define this macro to 1 to omit the definition of SUCCEED(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+# define SUCCEED() GTEST_SUCCEED()
+#endif
+
+// Macros for testing exceptions.
+//
+//    * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+//         Tests that the statement throws the expected exception.
+//    * {ASSERT|EXPECT}_NO_THROW(statement):
+//         Tests that the statement doesn't throw any exception.
+//    * {ASSERT|EXPECT}_ANY_THROW(statement):
+//         Tests that the statement throws an exception.
+
+#define EXPECT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
+
+// Boolean assertions. Condition can be either a Boolean expression or an
+// AssertionResult. For more information on how to use AssertionResult with
+// these macros see comments on that class.
+#define EXPECT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_NONFATAL_FAILURE_)
+#define EXPECT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_NONFATAL_FAILURE_)
+#define ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_FATAL_FAILURE_)
+
+// Macros for testing equalities and inequalities.
+//
+//    * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
+//    * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2
+//    * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2
+//    * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2
+//    * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2
+//    * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2
+//
+// When they are not, Google Test prints both the tested expressions and
+// their actual values.  The values must be compatible built-in types,
+// or you will get a compiler error.  By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+//   1. It is possible to make a user-defined type work with
+//   {ASSERT|EXPECT}_??(), but that requires overloading the
+//   comparison operators and is thus discouraged by the Google C++
+//   Usage Guide.  Therefore, you are advised to use the
+//   {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+//   equal.
+//
+//   2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+//   pointers (in particular, C strings).  Therefore, if you use it
+//   with two C strings, you are testing how their locations in memory
+//   are related, not how their content is related.  To compare two C
+//   strings by content, use {ASSERT|EXPECT}_STR*().
+//
+//   3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to
+//   {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you
+//   what the actual value is when it fails, and similarly for the
+//   other comparisons.
+//
+//   4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+//   evaluate their arguments, which is undefined.
+//
+//   5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+//   EXPECT_NE(Foo(), 5);
+//   EXPECT_EQ(a_pointer, NULL);
+//   ASSERT_LT(i, array_size);
+//   ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(val1)>::Compare, \
+                      val1, val2)
+#define EXPECT_NE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define EXPECT_LE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(val1)>::Compare, \
+                      val1, val2)
+#define GTEST_ASSERT_NE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of
+// ASSERT_XY(), which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif
+
+// C-string Comparisons.  All tests treat NULL and any non-NULL string
+// as different.  Two NULLs are equal.
+//
+//    * {ASSERT|EXPECT}_STREQ(s1, s2):     Tests that s1 == s2
+//    * {ASSERT|EXPECT}_STRNE(s1, s2):     Tests that s1 != s2
+//    * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
+//    * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define EXPECT_STRNE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define EXPECT_STRCASENE(s1, s2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define ASSERT_STRNE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define ASSERT_STRCASENE(s1, s2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+// Macros for comparing floating-point numbers.
+//
+//    * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2):
+//         Tests that two float values are almost equal.
+//    * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2):
+//         Tests that two double values are almost equal.
+//    * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
+//         Tests that v1 and v2 are within the given distance to each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands.  See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
+
+#define EXPECT_FLOAT_EQ(val1, val2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      val1, val2)
+
+#define EXPECT_DOUBLE_EQ(val1, val2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      val1, val2)
+
+#define ASSERT_FLOAT_EQ(val1, val2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      val1, val2)
+
+#define ASSERT_DOUBLE_EQ(val1, val2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      val1, val2)
+
+#define EXPECT_NEAR(val1, val2, abs_error)\
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error)\
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+// These predicate format functions work on floating-point values, and
+// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
+//
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
+                                   float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                                    double val1, double val2);
+
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success, these are only useful
+// on Windows, and rely on Windows SDK macros and APIs to compile.
+//
+//    * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result with both a human-readable
+// string representation of the error, if available, as well as the
+// hex result code.
+# define EXPECT_HRESULT_SUCCEEDED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define ASSERT_HRESULT_SUCCEEDED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define EXPECT_HRESULT_FAILED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+# define ASSERT_HRESULT_FAILED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#endif  // GTEST_OS_WINDOWS
+
+// Macros that execute statement and check that it doesn't generate new fatal
+// failures in the current thread.
+//
+//   * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+//   EXPECT_NO_FATAL_FAILURE(Process());
+//   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+//
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the given source file path and line number,
+// and the given message) to be included in every test failure message generated
+// by code in the scope of the lifetime of an instance of this class. The effect
+// is undone with the destruction of the instance.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// Example:
+//   testing::ScopedTrace trace("file.cc", 123, "message");
+//
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+
+  // Template version. Uses Message() to convert the values into strings.
+  // Slow, but flexible.
+  template <typename T>
+  ScopedTrace(const char* file, int line, const T& message) {
+    PushTrace(file, line, (Message() << message).GetString());
+  }
+
+  // Optimize for some known types.
+  ScopedTrace(const char* file, int line, const char* message) {
+    PushTrace(file, line, message ? message : "(null)");
+  }
+
+#if GTEST_HAS_GLOBAL_STRING
+  ScopedTrace(const char* file, int line, const ::string& message) {
+    PushTrace(file, line, message);
+  }
+#endif
+
+  ScopedTrace(const char* file, int line, const std::string& message) {
+    PushTrace(file, line, message);
+  }
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  void PushTrace(const char* file, int line, std::string message);
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor.  Therefore it doesn't
+                            // need to be used otherwise.
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope.  The effect is
+// undone when the control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+//
+// Assuming that each thread maintains its own stack of traces.
+// Therefore, a SCOPED_TRACE() would (correctly) only affect the
+// assertions in its own thread.
+#define SCOPED_TRACE(message) \
+  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
+    __FILE__, __LINE__, (message))
+
+
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
+// the same type.  The value it returns is not interesting.
+//
+// Instead of making StaticAssertTypeEq a class template, we make it a
+// function template that invokes a helper class template.  This
+// prevents a user from misusing StaticAssertTypeEq<T1, T2> by
+// defining objects of that type.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated.  For example, given:
+//
+//   template <typename T> class Foo {
+//    public:
+//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+//   };
+//
+// the code:
+//
+//   void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated.  Instead, you need:
+//
+//   void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+bool StaticAssertTypeEq() {
+  (void)internal::StaticAssertTypeEqHelper<T1, T2>();
+  return true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test case, and the second
+// parameter is the name of the test within the test case.
+//
+// The convention is to end the test case name with "Test".  For
+// example, a test case for the Foo class can be named FooTest.
+//
+// Test code should appear between braces after an invocation of
+// this macro.  Example:
+//
+//   TEST(FooTest, InitializesCorrectly) {
+//     Foo foo;
+//     EXPECT_TRUE(foo.StatusIsOK());
+//   }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test.  This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X.  The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code.  GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_case_name, test_name)\
+  GTEST_TEST_(test_case_name, test_name, \
+              ::testing::Test, ::testing::internal::GetTestTypeId())
+
+// Define this macro to 1 to omit the definition of TEST(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_TEST
+# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name)
+#endif
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the name of the test fixture class, which
+// also doubles as the test case name.  The second parameter is the
+// name of the test within the test case.
+//
+// A test fixture class must be declared earlier.  The user should put
+// the test code between braces after using this macro.  Example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     virtual void SetUp() { b_.AddElement(3); }
+//
+//     Foo a_;
+//     Foo b_;
+//   };
+//
+//   TEST_F(FooTest, InitializesCorrectly) {
+//     EXPECT_TRUE(a_.StatusIsOK());
+//   }
+//
+//   TEST_F(FooTest, ReturnsElementCountCorrectly) {
+//     EXPECT_EQ(a_.size(), 0);
+//     EXPECT_EQ(b_.size(), 1);
+//   }
+
+#define TEST_F(test_fixture, test_name)\
+  GTEST_TEST_(test_fixture, test_name, test_fixture, \
+              ::testing::internal::GetTypeId<test_fixture>())
+
+// Returns a path to temporary directory.
+// Tries to determine an appropriate directory for the platform.
+GTEST_API_ std::string TempDir();
+
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
+
+}  // namespace testing
+
+// Use this function in main() to run all tests.  It returns 0 if all
+// tests are successful, or 1 otherwise.
+//
+// RUN_ALL_TESTS() should be invoked after the command line has been
+// parsed by InitGoogleTest().
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
+
+inline int RUN_ALL_TESTS() {
+  return ::testing::UnitTest::GetInstance()->Run();
+}
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
diff --git a/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/gtest_main.cc b/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/gtest_main.cc
new file mode 100644
index 000000000..2113f621e
--- /dev/null
+++ b/src/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/gtest_main.cc
@@ -0,0 +1,37 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <stdio.h>
+#include "gtest/gtest.h"
+
+GTEST_API_ int main(int argc, char **argv) {
+  printf("Running main() from %s\n", __FILE__);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/thirdparty.inc b/src/rocksdb/thirdparty.inc
new file mode 100644
index 000000000..25ecdab88
--- /dev/null
+++ b/src/rocksdb/thirdparty.inc
@@ -0,0 +1,268 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Edit definitions below to specify paths to include files and libraries of all 3rd party libraries
+
+# TODO: Make this work with find_package and/or get rid of it
+#
+# This example assumes all the libraries locate in directories under THIRDPARTY_HOME environment variable
+# Set environment variable THIRDPARTY_HOME to point to your third party libraries home (Unix style dir separators)
+# or change the paths below to reflect where the libraries actually reside
+#
+set (THIRDPARTY_LIBS "")         # Initialization, don't touch
+
+#
+# Defaults
+#
+set(GFLAGS_HOME $ENV{THIRDPARTY_HOME}/Gflags.Library)
+set(GFLAGS_INCLUDE ${GFLAGS_HOME}/build/native/include)
+set(GFLAGS_LIB_DEBUG ${GFLAGS_HOME}/lib/native/debug/amd64/gflags.lib)
+set(GFLAGS_LIB_RELEASE ${GFLAGS_HOME}/lib/native/retail/amd64/gflags.lib)
+
+# ================================================== GFLAGS ==================================================
+# For compatibility
+if (GFLAGS)
+  set(WITH_GFLAGS ON)
+endif ()
+
+if (WITH_GFLAGS)
+  message(STATUS "GFLAGS library is enabled")
+  
+  if(DEFINED ENV{GFLAGS_INCLUDE})
+    set(GFLAGS_INCLUDE $ENV{GFLAGS_INCLUDE})
+  endif()
+  
+  if(DEFINED ENV{GFLAGS_LIB_DEBUG})
+    set(GFLAGS_LIB_DEBUG $ENV{GFLAGS_LIB_DEBUG})
+  endif()
+
+  if(DEFINED ENV{GFLAGS_LIB_RELEASE})
+    set(GFLAGS_LIB_RELEASE $ENV{GFLAGS_LIB_RELEASE})
+  endif()
+  
+  set(GFLAGS_CXX_FLAGS -DGFLAGS=gflags)
+  set(GFLAGS_LIBS debug ${GFLAGS_LIB_DEBUG} optimized ${GFLAGS_LIB_RELEASE})
+
+  add_definitions(${GFLAGS_CXX_FLAGS})
+  include_directories(${GFLAGS_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${GFLAGS_LIBS})
+else ()
+  message(STATUS "GFLAGS library is disabled")
+endif ()
+
+# ================================================== SNAPPY ==================================================
+#
+# Edit these 4 lines to define paths to Snappy
+#
+set(SNAPPY_HOME $ENV{THIRDPARTY_HOME}/Snappy.Library)
+set(SNAPPY_INCLUDE ${SNAPPY_HOME}/build/native/inc/inc)
+set(SNAPPY_LIB_DEBUG ${SNAPPY_HOME}/lib/native/debug/amd64/snappy.lib)
+set(SNAPPY_LIB_RELEASE ${SNAPPY_HOME}/lib/native/retail/amd64/snappy.lib)
+
+# For compatibility
+if(SNAPPY)
+  set(WITH_SNAPPY ON)
+endif ()
+
+if (WITH_SNAPPY)
+  message(STATUS "SNAPPY library is enabled")
+
+  if(DEFINED ENV{SNAPPY_INCLUDE})
+    set(SNAPPY_INCLUDE $ENV{SNAPPY_INCLUDE})
+  endif()
+
+  if(DEFINED ENV{SNAPPY_LIB_DEBUG})
+    set(SNAPPY_LIB_DEBUG $ENV{SNAPPY_LIB_DEBUG})
+  endif()
+
+  if(DEFINED ENV{SNAPPY_LIB_RELEASE})
+    set(SNAPPY_LIB_RELEASE $ENV{SNAPPY_LIB_RELEASE})
+  endif()
+
+  set(SNAPPY_CXX_FLAGS -DSNAPPY)
+  set(SNAPPY_LIBS debug ${SNAPPY_LIB_DEBUG} optimized ${SNAPPY_LIB_RELEASE})
+
+  add_definitions(${SNAPPY_CXX_FLAGS})
+  include_directories(${SNAPPY_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${SNAPPY_LIBS})
+else ()
+  message(STATUS "SNAPPY library is disabled")
+endif ()
+
+# ================================================== LZ4 ==================================================
+#
+# Edit these 4 lines to define paths to LZ4
+#
+set(LZ4_HOME $ENV{THIRDPARTY_HOME}/LZ4.Library)
+set(LZ4_INCLUDE ${LZ4_HOME}/build/native/inc/inc)
+set(LZ4_LIB_DEBUG ${LZ4_HOME}/lib/native/debug/amd64/lz4.lib)
+set(LZ4_LIB_RELEASE ${LZ4_HOME}/lib/native/retail/amd64/lz4.lib)
+
+
+# For compatibility
+if (LZ4)
+  set(WITH_LZ4 ON)
+endif ()
+
+if (WITH_LZ4)
+  message(STATUS "LZ4 library is enabled")
+  
+  if(DEFINED ENV{LZ4_INCLUDE})
+    set(LZ4_INCLUDE $ENV{LZ4_INCLUDE})
+  endif()
+  
+  if(DEFINED ENV{LZ4_LIB_DEBUG})
+    set(LZ4_LIB_DEBUG $ENV{LZ4_LIB_DEBUG})
+  endif()
+
+  if(DEFINED ENV{LZ4_LIB_RELEASE})
+    set(LZ4_LIB_RELEASE $ENV{LZ4_LIB_RELEASE})
+  endif()
+  
+  set(LZ4_CXX_FLAGS -DLZ4)
+  set(LZ4_LIBS debug ${LZ4_LIB_DEBUG} optimized ${LZ4_LIB_RELEASE})
+
+  add_definitions(${LZ4_CXX_FLAGS})
+  include_directories(${LZ4_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${LZ4_LIBS})
+else ()
+  message(STATUS "LZ4 library is disabled")
+endif ()
+
+# ================================================== ZLIB ==================================================
+#
+# Edit these 4 lines to define paths to ZLIB
+#
+set(ZLIB_HOME $ENV{THIRDPARTY_HOME}/ZLIB.Library)
+set(ZLIB_INCLUDE ${ZLIB_HOME}/build/native/inc/inc)
+set(ZLIB_LIB_DEBUG ${ZLIB_HOME}/lib/native/debug/amd64/zlib.lib)
+set(ZLIB_LIB_RELEASE ${ZLIB_HOME}/lib/native/retail/amd64/zlib.lib)
+
+# For compatibilty
+if (ZLIB)
+  set(WITH_ZLIB ON)
+endif ()
+
+if (WITH_ZLIB)
+  message(STATUS "ZLIB library is enabled")
+
+  if(DEFINED ENV{ZLIB_INCLUDE})
+    set(ZLIB_INCLUDE $ENV{ZLIB_INCLUDE})
+  endif()
+  
+  if(DEFINED ENV{ZLIB_LIB_DEBUG})
+    set(ZLIB_LIB_DEBUG $ENV{ZLIB_LIB_DEBUG})
+  endif()
+
+  if(DEFINED ENV{ZLIB_LIB_RELEASE})
+    set(ZLIB_LIB_RELEASE $ENV{ZLIB_LIB_RELEASE})
+  endif()
+  
+  set(ZLIB_CXX_FLAGS -DZLIB)
+  set(ZLIB_LIBS debug ${ZLIB_LIB_DEBUG} optimized ${ZLIB_LIB_RELEASE})
+
+  add_definitions(${ZLIB_CXX_FLAGS})
+  include_directories(${ZLIB_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${ZLIB_LIBS})
+else ()
+  message(STATUS "ZLIB library is disabled")
+endif ()
+
+# ================================================== XPRESS ==================================================
+# This makes use of built-in Windows API, no additional includes, links to a system lib
+
+# For compatibilty
+if (XPRESS)
+  set(WITH_XPRESS ON)
+endif ()
+
+if (WITH_XPRESS)
+  message(STATUS "XPRESS is enabled")
+
+  add_definitions(-DXPRESS)
+  
+  # We are using the implementation provided by the system
+  set (SYSTEM_LIBS ${SYSTEM_LIBS} Cabinet.lib)
+else ()
+  message(STATUS "XPRESS is disabled")
+endif ()
+
+
+# ================================================== ZSTD ==================================================
+#
+# Edit these 4 lines to define paths to ZSTD
+#
+set(ZSTD_HOME $ENV{THIRDPARTY_HOME}/ZSTD.Library)
+set(ZSTD_INCLUDE ${ZSTD_HOME}/build/native/inc)
+set(ZSTD_LIB_DEBUG ${ZSTD_HOME}/lib/native/debug/amd64/libzstd_static.lib)
+set(ZSTD_LIB_RELEASE ${ZSTD_HOME}/lib/native/retail/amd64/libzstd_static.lib)
+
+# For compatibility
+if (ZSTD)
+  set(WITH_ZSTD ON)
+endif ()
+
+if (WITH_ZSTD)
+  message(STATUS "ZSTD library is enabled")
+
+  if(DEFINED ENV{ZSTD_INCLUDE})
+    set(ZSTD_INCLUDE $ENV{ZSTD_INCLUDE})
+  endif()
+  
+  if(DEFINED ENV{ZSTD_LIB_DEBUG})
+    set(ZSTD_LIB_DEBUG $ENV{ZSTD_LIB_DEBUG})
+  endif()
+
+  if(DEFINED ENV{ZSTD_LIB_RELEASE})
+    set(ZSTD_LIB_RELEASE $ENV{ZSTD_LIB_RELEASE})
+  endif()
+
+  # ZSTD_STATIC_LINKING_ONLY only allows us to create an allocation functions override
+  # When jemalloc is in use
+  set(ZSTD_LIBS debug ${ZSTD_LIB_DEBUG} optimized ${ZSTD_LIB_RELEASE})
+
+  add_definitions(-DZSTD -DZSTD_STATIC_LINKING_ONLY)
+  include_directories(${ZSTD_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${ZSTD_LIBS})
+else ()
+  message(STATUS "ZSTD library is disabled")
+endif ()
+
+#
+# Edit these 4 lines to define paths to Jemalloc
+#
+set(JEMALLOC_HOME $ENV{THIRDPARTY_HOME}/Jemalloc.Library)
+set(JEMALLOC_INCLUDE ${JEMALLOC_HOME}/build/native/inc)
+set(JEMALLOC_LIB_DEBUG ${JEMALLOC_HOME}/lib/native/debug/amd64/jemalloc.lib)
+set(JEMALLOC_LIB_RELEASE ${JEMALLOC_HOME}/lib/native/retail/amd64/jemalloc.lib)
+
+# ================================================== JEMALLOC ==================================================
+if(JEMALLOC)
+  set(WITH_JEMALLOC ON)
+endif()
+
+if (WITH_JEMALLOC)
+  message(STATUS "JEMALLOC library is enabled")
+  set(JEMALLOC_CXX_FLAGS "-DROCKSDB_JEMALLOC -DJEMALLOC_EXPORT= -DJEMALLOC_NO_RENAME")
+  
+  if(DEFINED ENV{JEMALLOC_INCLUDE})
+    set(JEMALLOC_INCLUDE $ENV{JEMALLOC_INCLUDE})
+  endif()
+  
+  if(DEFINED ENV{JEMALLOC_LIB_DEBUG})
+    set(JEMALLOC_LIB_DEBUG $ENV{JEMALLOC_LIB_DEBUG})
+  endif()
+
+  if(DEFINED ENV{JEMALLOC_LIB_RELEASE})
+    set(JEMALLOC_LIB_RELEASE $ENV{JEMALLOC_LIB_RELEASE})
+  endif()
+
+  set(JEMALLOC_LIBS debug ${JEMALLOC_LIB_DEBUG} optimized ${JEMALLOC_LIB_RELEASE})
+
+  add_definitions(${JEMALLOC_CXX_FLAGS})
+  include_directories(${JEMALLOC_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${JEMALLOC_LIBS})
+  set (ARTIFACT_SUFFIX "_je")
+
+else ()
+  set (ARTIFACT_SUFFIX "")
+  message(STATUS "JEMALLOC library is disabled")
+endif ()
diff --git a/src/rocksdb/tools/CMakeLists.txt b/src/rocksdb/tools/CMakeLists.txt
new file mode 100644
index 000000000..19030e84b
--- /dev/null
+++ b/src/rocksdb/tools/CMakeLists.txt
@@ -0,0 +1,30 @@
+set(CORE_TOOLS
+  sst_dump.cc
+  ldb.cc)
+foreach(src ${CORE_TOOLS})
+  get_filename_component(exename ${src} NAME_WE)
+  add_executable(${exename}${ARTIFACT_SUFFIX}
+    ${src})
+  target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${ROCKSDB_LIB})
+  list(APPEND core_tool_deps ${exename})
+endforeach()
+
+if(WITH_TOOLS)
+  set(TOOLS
+    db_sanity_test.cc
+    write_stress.cc
+    db_repl_stress.cc
+    dump/rocksdb_dump.cc
+    dump/rocksdb_undump.cc)
+  foreach(src ${TOOLS})
+    get_filename_component(exename ${src} NAME_WE)
+    add_executable(${exename}${ARTIFACT_SUFFIX}
+      ${src})
+    target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS})
+    list(APPEND tool_deps ${exename})
+  endforeach()
+
+  add_custom_target(ldb_tests
+    COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/ldb_tests.py
+    DEPENDS ldb)
+endif()
diff --git a/src/rocksdb/tools/Dockerfile b/src/rocksdb/tools/Dockerfile
new file mode 100644
index 000000000..1d5ead7fd
--- /dev/null
+++ b/src/rocksdb/tools/Dockerfile
@@ -0,0 +1,5 @@
+FROM buildpack-deps:wheezy
+
+ADD ./ldb /rocksdb/tools/ldb
+
+CMD /rocksdb/tools/ldb
diff --git a/src/rocksdb/tools/advisor/README.md b/src/rocksdb/tools/advisor/README.md
new file mode 100644
index 000000000..b02d7ec50
--- /dev/null
+++ b/src/rocksdb/tools/advisor/README.md
@@ -0,0 +1,96 @@
+# Rocksdb Tuning Advisor
+
+## Motivation
+
+The performance of Rocksdb is contingent on its tuning. However,
+because of the complexity of its underlying technology and a large number of
+configurable parameters, a good configuration is sometimes hard to obtain. The aim of
+the python command-line tool, Rocksdb Advisor, is to automate the process of
+suggesting improvements in the configuration based on advice from Rocksdb
+experts.
+
+## Overview
+
+Experts share their wisdom as rules comprising of conditions and suggestions in the INI format (refer
+[rules.ini](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rules.ini)).
+Users provide the Rocksdb configuration that they want to improve upon (as the
+familiar Rocksdb OPTIONS file —
+[example](https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini))
+and the path of the file which contains Rocksdb logs and statistics.
+The [Advisor](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser_example.py)
+creates appropriate DataSource objects (for Rocksdb
+[logs](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_log_parser.py),
+[options](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_options_parser.py),
+[statistics](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_stats_fetcher.py) etc.)
+and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser.py).
+The Rules uses rules from experts to parse data-sources and trigger appropriate rules.
+The Advisor's output gives information about which rules were triggered,
+why they were triggered and what each of them suggests. Each suggestion
+provided by a triggered rule advises some action on a Rocksdb
+configuration option, for example, increase CFOptions.write_buffer_size,
+set bloom_bits to 2 etc.
+
+## Usage
+
+### Prerequisites
+The tool needs the following to run:
+* python3
+
+### Running the tool
+An example command to run the tool:
+
+```shell
+cd rocksdb/tools/advisor
+python3 -m advisor.rule_parser_example --rules_spec=advisor/rules.ini --rocksdb_options=test/input_files/OPTIONS-000005 --log_files_path_prefix=test/input_files/LOG-0 --stats_dump_period_sec=20
+```
+
+### Command-line arguments
+
+Most important amongst all the input that the Advisor needs, are the rules
+spec and starting Rocksdb configuration. The configuration is provided as the
+familiar Rocksdb Options file (refer [example](https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini)).
+The Rules spec is written in the INI format (more details in
+[rules.ini](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rules.ini)).
+
+In brief, a Rule is made of conditions and is triggered when all its
+constituent conditions are triggered. When triggered, a Rule suggests changes
+(increase/decrease/set to a suggested value) to certain Rocksdb options that
+aim to improve Rocksdb performance. Every Condition has a 'source' i.e.
+the data source that would be checked for triggering that condition.
+For example, a log Condition (with 'source=LOG') is triggered if a particular
+'regex' is found in the Rocksdb LOG files. As of now the Rules Engine
+supports 3 types of Conditions (and consequently data-sources):
+LOG, OPTIONS, TIME_SERIES. The TIME_SERIES data can be sourced from the
+Rocksdb [statistics](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/statistics.h)
+or [perf context](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/perf_context.h).
+
+For more information about the remaining command-line arguments, run:
+
+```shell
+cd rocksdb/tools/advisor
+python3 -m advisor.rule_parser_example --help
+```
+
+### Sample output
+
+Here, a Rocksdb log-based rule has been triggered:
+
+```shell
+Rule: stall-too-many-memtables
+LogCondition: stall-too-many-memtables regex: Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+Suggestion: inc-bg-flush option : DBOptions.max_background_flushes action : increase suggested_values : ['2']
+Suggestion: inc-write-buffer option : CFOptions.max_write_buffer_number action : increase
+scope: col_fam:
+{'default'}
+```
+
+## Running the tests
+
+Tests for the code have been added to the
+[test/](https://github.com/facebook/rocksdb/tree/main/tools/advisor/test)
+directory. For example, to run the unit tests for db_log_parser.py:
+
+```shell
+cd rocksdb/tools/advisor
+python3 -m unittest -v test.test_db_log_parser
+```
diff --git a/src/rocksdb/tools/advisor/advisor/__init__.py b/src/rocksdb/tools/advisor/advisor/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/__init__.py
diff --git a/src/rocksdb/tools/advisor/advisor/bench_runner.py b/src/rocksdb/tools/advisor/advisor/bench_runner.py
new file mode 100644
index 000000000..45d6c8313
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/bench_runner.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import re
+from abc import ABC, abstractmethod
+
+
+class BenchmarkRunner(ABC):
+    @staticmethod
+    @abstractmethod
+    def is_metric_better(new_metric, old_metric):
+        pass
+
+    @abstractmethod
+    def run_experiment(self):
+        # should return a list of DataSource objects
+        pass
+
+    @staticmethod
+    def get_info_log_file_name(log_dir, db_path):
+        # Example: DB Path = /dev/shm and OPTIONS file has option
+        # db_log_dir=/tmp/rocks/, then the name of the log file will be
+        # 'dev_shm_LOG' and its location will be /tmp/rocks. If db_log_dir is
+        # not specified in the OPTIONS file, then the location of the log file
+        # will be /dev/shm and the name of the file will be 'LOG'
+        file_name = ""
+        if log_dir:
+            # refer GetInfoLogPrefix() in rocksdb/util/filename.cc
+            # example db_path: /dev/shm/dbbench
+            file_name = db_path[1:]  # to ignore the leading '/' character
+            to_be_replaced = re.compile("[^0-9a-zA-Z\-_\.]")  # noqa
+            for character in to_be_replaced.findall(db_path):
+                file_name = file_name.replace(character, "_")
+            if not file_name.endswith("_"):
+                file_name += "_"
+        file_name += "LOG"
+        return file_name
diff --git a/src/rocksdb/tools/advisor/advisor/config_optimizer_example.py b/src/rocksdb/tools/advisor/advisor/config_optimizer_example.py
new file mode 100644
index 000000000..40e2bb953
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/config_optimizer_example.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import argparse
+
+from advisor.db_config_optimizer import ConfigOptimizer
+from advisor.db_log_parser import NO_COL_FAMILY
+from advisor.db_options_parser import DatabaseOptions
+from advisor.rule_parser import RulesSpec
+
+
+CONFIG_OPT_NUM_ITER = 10
+
+
+def main(args):
+    # initialise the RulesSpec parser
+    rule_spec_parser = RulesSpec(args.rules_spec)
+    # initialise the benchmark runner
+    bench_runner_module = __import__(
+        args.benchrunner_module, fromlist=[args.benchrunner_class]
+    )
+    bench_runner_class = getattr(bench_runner_module, args.benchrunner_class)
+    ods_args = {}
+    if args.ods_client and args.ods_entity:
+        ods_args["client_script"] = args.ods_client
+        ods_args["entity"] = args.ods_entity
+        if args.ods_key_prefix:
+            ods_args["key_prefix"] = args.ods_key_prefix
+    db_bench_runner = bench_runner_class(args.benchrunner_pos_args, ods_args)
+    # initialise the database configuration
+    db_options = DatabaseOptions(args.rocksdb_options, args.misc_options)
+    # set the frequency at which stats are dumped in the LOG file and the
+    # location of the LOG file.
+    db_log_dump_settings = {
+        "DBOptions.stats_dump_period_sec": {NO_COL_FAMILY: args.stats_dump_period_sec}
+    }
+    db_options.update_options(db_log_dump_settings)
+    # initialise the configuration optimizer
+    config_optimizer = ConfigOptimizer(
+        db_bench_runner, db_options, rule_spec_parser, args.base_db_path
+    )
+    # run the optimiser to improve the database configuration for given
+    # benchmarks, with the help of expert-specified rules
+    final_db_options = config_optimizer.run()
+    # generate the final rocksdb options file
+    print(
+        "Final configuration in: " + final_db_options.generate_options_config("final")
+    )
+    print("Final miscellaneous options: " + repr(final_db_options.get_misc_options()))
+
+
+if __name__ == "__main__":
+    """
+    An example run of this tool from the command-line would look like:
+    python3 -m advisor.config_optimizer_example
+    --base_db_path=/tmp/rocksdbtest-155919/dbbench
+    --rocksdb_options=temp/OPTIONS_boot.tmp --misc_options bloom_bits=2
+    --rules_spec=advisor/rules.ini --stats_dump_period_sec=20
+    --benchrunner_module=advisor.db_bench_runner
+    --benchrunner_class=DBBenchRunner --benchrunner_pos_args ./../../db_bench
+    readwhilewriting use_existing_db=true duration=90
+    """
+    parser = argparse.ArgumentParser(
+        description="This script is used for\
+        searching for a better database configuration"
+    )
+    parser.add_argument(
+        "--rocksdb_options",
+        required=True,
+        type=str,
+        help="path of the starting Rocksdb OPTIONS file",
+    )
+    # these are options that are column-family agnostic and are not yet
+    # supported by the Rocksdb Options file: eg. bloom_bits=2
+    parser.add_argument(
+        "--misc_options",
+        nargs="*",
+        help="whitespace-separated list of options that are not supported "
+        + "by the Rocksdb OPTIONS file, given in the "
+        + '<option_name>=<option_value> format eg. "bloom_bits=2 '
+        + 'rate_limiter_bytes_per_sec=128000000"',
+    )
+    parser.add_argument(
+        "--base_db_path", required=True, type=str, help="path for the Rocksdb database"
+    )
+    parser.add_argument(
+        "--rules_spec",
+        required=True,
+        type=str,
+        help="path of the file containing the expert-specified Rules",
+    )
+    parser.add_argument(
+        "--stats_dump_period_sec",
+        required=True,
+        type=int,
+        help="the frequency (in seconds) at which STATISTICS are printed to "
+        + "the Rocksdb LOG file",
+    )
+    # ODS arguments
+    parser.add_argument("--ods_client", type=str, help="the ODS client binary")
+    parser.add_argument(
+        "--ods_entity",
+        type=str,
+        help="the servers for which the ODS stats need to be fetched",
+    )
+    parser.add_argument(
+        "--ods_key_prefix",
+        type=str,
+        help="the prefix that needs to be attached to the keys of time "
+        + "series to be fetched from ODS",
+    )
+    # benchrunner_module example: advisor.db_benchmark_client
+    parser.add_argument(
+        "--benchrunner_module",
+        required=True,
+        type=str,
+        help="the module containing the BenchmarkRunner class to be used by "
+        + "the Optimizer, example: advisor.db_bench_runner",
+    )
+    # benchrunner_class example: DBBenchRunner
+    parser.add_argument(
+        "--benchrunner_class",
+        required=True,
+        type=str,
+        help="the name of the BenchmarkRunner class to be used by the "
+        + "Optimizer, should be present in the module provided in the "
+        + "benchrunner_module argument, example: DBBenchRunner",
+    )
+    parser.add_argument(
+        "--benchrunner_pos_args",
+        nargs="*",
+        help="whitespace-separated positional arguments that are passed on "
+        + "to the constructor of the BenchmarkRunner class provided in the "
+        + 'benchrunner_class argument, example: "use_existing_db=true '
+        + 'duration=900"',
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/src/rocksdb/tools/advisor/advisor/db_bench_runner.py b/src/rocksdb/tools/advisor/advisor/db_bench_runner.py
new file mode 100644
index 000000000..f5802ed15
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_bench_runner.py
@@ -0,0 +1,237 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import shutil
+import subprocess
+import time
+
+from advisor.bench_runner import BenchmarkRunner
+from advisor.db_log_parser import DatabaseLogs, DataSource, NO_COL_FAMILY
+from advisor.db_stats_fetcher import (
+    DatabasePerfContext,
+    LogStatsParser,
+    OdsStatsFetcher,
+)
+
+
+"""
+NOTE: This is not thread-safe, because the output file is simply overwritten.
+"""
+
+
+class DBBenchRunner(BenchmarkRunner):
+    OUTPUT_FILE = "temp/dbbench_out.tmp"
+    ERROR_FILE = "temp/dbbench_err.tmp"
+    DB_PATH = "DB path"
+    THROUGHPUT = "ops/sec"
+    PERF_CON = " PERF_CONTEXT:"
+
+    @staticmethod
+    def is_metric_better(new_metric, old_metric):
+        # for db_bench 'throughput' is the metric returned by run_experiment
+        return new_metric >= old_metric
+
+    @staticmethod
+    def get_opt_args_str(misc_options_dict):
+        # given a dictionary of options and their values, return a string
+        # that can be appended as command-line arguments
+        optional_args_str = ""
+        for option_name, option_value in misc_options_dict.items():
+            if option_value:
+                optional_args_str += " --" + option_name + "=" + str(option_value)
+        return optional_args_str
+
+    def __init__(self, positional_args, ods_args=None):
+        # parse positional_args list appropriately
+        self.db_bench_binary = positional_args[0]
+        self.benchmark = positional_args[1]
+        self.db_bench_args = None
+        if len(positional_args) > 2:
+            # options list with each option given as "<option>=<value>"
+            self.db_bench_args = positional_args[2:]
+        # save ods_args, if provided
+        self.ods_args = ods_args
+
+    def _parse_output(self, get_perf_context=False):
+        """
+        Sample db_bench output after running 'readwhilewriting' benchmark:
+        DB path: [/tmp/rocksdbtest-155919/dbbench]\n
+        readwhilewriting : 16.582 micros/op 60305 ops/sec; 4.2 MB/s (3433828\
+        of 5427999 found)\n
+        PERF_CONTEXT:\n
+        user_key_comparison_count = 500466712, block_cache_hit_count = ...\n
+        """
+        output = {self.THROUGHPUT: None, self.DB_PATH: None, self.PERF_CON: None}
+        perf_context_begins = False
+        with open(self.OUTPUT_FILE, "r") as fp:
+            for line in fp:
+                if line.startswith(self.benchmark):
+                    # line from sample output:
+                    # readwhilewriting : 16.582 micros/op 60305 ops/sec; \
+                    # 4.2 MB/s (3433828 of 5427999 found)\n
+                    print(line)  # print output of the benchmark run
+                    token_list = line.strip().split()
+                    for ix, token in enumerate(token_list):
+                        if token.startswith(self.THROUGHPUT):
+                            # in above example, throughput = 60305 ops/sec
+                            output[self.THROUGHPUT] = float(token_list[ix - 1])
+                            break
+                elif get_perf_context and line.startswith(self.PERF_CON):
+                    # the following lines in the output contain perf context
+                    # statistics (refer example above)
+                    perf_context_begins = True
+                elif get_perf_context and perf_context_begins:
+                    # Sample perf_context output:
+                    # user_key_comparison_count = 500, block_cache_hit_count =\
+                    # 468, block_read_count = 580, block_read_byte = 445, ...
+                    token_list = line.strip().split(",")
+                    # token_list = ['user_key_comparison_count = 500',
+                    # 'block_cache_hit_count = 468','block_read_count = 580'...
+                    perf_context = {
+                        tk.split("=")[0].strip(): tk.split("=")[1].strip()
+                        for tk in token_list
+                        if tk
+                    }
+                    # TODO(poojam23): this is a hack and should be replaced
+                    # with the timestamp that db_bench will provide per printed
+                    # perf_context
+                    timestamp = int(time.time())
+                    perf_context_ts = {}
+                    for stat in perf_context.keys():
+                        perf_context_ts[stat] = {timestamp: int(perf_context[stat])}
+                    output[self.PERF_CON] = perf_context_ts
+                    perf_context_begins = False
+                elif line.startswith(self.DB_PATH):
+                    # line from sample output:
+                    # DB path: [/tmp/rocksdbtest-155919/dbbench]\n
+                    output[self.DB_PATH] = line.split("[")[1].split("]")[0]
+        return output
+
+    def get_log_options(self, db_options, db_path):
+        # get the location of the LOG file and the frequency at which stats are
+        # dumped in the LOG file
+        log_dir_path = None
+        stats_freq_sec = None
+        logs_file_prefix = None
+
+        # fetch frequency at which the stats are dumped in the Rocksdb logs
+        dump_period = "DBOptions.stats_dump_period_sec"
+        # fetch the directory, if specified, in which the Rocksdb logs are
+        # dumped, by default logs are dumped in same location as database
+        log_dir = "DBOptions.db_log_dir"
+        log_options = db_options.get_options([dump_period, log_dir])
+        if dump_period in log_options:
+            stats_freq_sec = int(log_options[dump_period][NO_COL_FAMILY])
+        if log_dir in log_options:
+            log_dir_path = log_options[log_dir][NO_COL_FAMILY]
+
+        log_file_name = DBBenchRunner.get_info_log_file_name(log_dir_path, db_path)
+
+        if not log_dir_path:
+            log_dir_path = db_path
+        if not log_dir_path.endswith("/"):
+            log_dir_path += "/"
+
+        logs_file_prefix = log_dir_path + log_file_name
+        return (logs_file_prefix, stats_freq_sec)
+
+    def _get_options_command_line_args_str(self, curr_options):
+        """
+        This method uses the provided Rocksdb OPTIONS to create a string of
+        command-line arguments for db_bench.
+        The --options_file argument is always given and the options that are
+        not supported by the OPTIONS file are given as separate arguments.
+        """
+        optional_args_str = DBBenchRunner.get_opt_args_str(
+            curr_options.get_misc_options()
+        )
+        # generate an options configuration file
+        options_file = curr_options.generate_options_config(nonce="12345")
+        optional_args_str += " --options_file=" + options_file
+        return optional_args_str
+
+    def _setup_db_before_experiment(self, curr_options, db_path):
+        # remove destination directory if it already exists
+        try:
+            shutil.rmtree(db_path, ignore_errors=True)
+        except OSError as e:
+            print("Error: rmdir " + e.filename + " " + e.strerror)
+        # setup database with a million keys using the fillrandom benchmark
+        command = "%s --benchmarks=fillrandom --db=%s --num=1000000" % (
+            self.db_bench_binary,
+            db_path,
+        )
+        args_str = self._get_options_command_line_args_str(curr_options)
+        command += args_str
+        self._run_command(command)
+
+    def _build_experiment_command(self, curr_options, db_path):
+        command = "%s --benchmarks=%s --statistics --perf_level=3 --db=%s" % (
+            self.db_bench_binary,
+            self.benchmark,
+            db_path,
+        )
+        # fetch the command-line arguments string for providing Rocksdb options
+        args_str = self._get_options_command_line_args_str(curr_options)
+        # handle the command-line args passed in the constructor, these
+        # arguments are specific to db_bench
+        for cmd_line_arg in self.db_bench_args:
+            args_str += " --" + cmd_line_arg
+        command += args_str
+        return command
+
+    def _run_command(self, command):
+        out_file = open(self.OUTPUT_FILE, "w+")
+        err_file = open(self.ERROR_FILE, "w+")
+        print("executing... - " + command)
+        subprocess.call(command, shell=True, stdout=out_file, stderr=err_file)
+        out_file.close()
+        err_file.close()
+
+    def run_experiment(self, db_options, db_path):
+        # setup the Rocksdb database before running experiment
+        self._setup_db_before_experiment(db_options, db_path)
+        # get the command to run the experiment
+        command = self._build_experiment_command(db_options, db_path)
+        experiment_start_time = int(time.time())
+        # run experiment
+        self._run_command(command)
+        experiment_end_time = int(time.time())
+        # parse the db_bench experiment output
+        parsed_output = self._parse_output(get_perf_context=True)
+
+        # get the log files path prefix and frequency at which Rocksdb stats
+        # are dumped in the logs
+        logs_file_prefix, stats_freq_sec = self.get_log_options(
+            db_options, parsed_output[self.DB_PATH]
+        )
+        # create the Rocksbd LOGS object
+        db_logs = DatabaseLogs(logs_file_prefix, db_options.get_column_families())
+        # Create the Log STATS object
+        db_log_stats = LogStatsParser(logs_file_prefix, stats_freq_sec)
+        # Create the PerfContext STATS object
+        db_perf_context = DatabasePerfContext(parsed_output[self.PERF_CON], 0, False)
+        # create the data-sources dictionary
+        data_sources = {
+            DataSource.Type.DB_OPTIONS: [db_options],
+            DataSource.Type.LOG: [db_logs],
+            DataSource.Type.TIME_SERIES: [db_log_stats, db_perf_context],
+        }
+        # Create the ODS STATS object
+        if self.ods_args:
+            key_prefix = ""
+            if "key_prefix" in self.ods_args:
+                key_prefix = self.ods_args["key_prefix"]
+            data_sources[DataSource.Type.TIME_SERIES].append(
+                OdsStatsFetcher(
+                    self.ods_args["client_script"],
+                    self.ods_args["entity"],
+                    experiment_start_time,
+                    experiment_end_time,
+                    key_prefix,
+                )
+            )
+        # return the experiment's data-sources and throughput
+        return data_sources, parsed_output[self.THROUGHPUT]
diff --git a/src/rocksdb/tools/advisor/advisor/db_config_optimizer.py b/src/rocksdb/tools/advisor/advisor/db_config_optimizer.py
new file mode 100644
index 000000000..413778478
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_config_optimizer.py
@@ -0,0 +1,293 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import copy
+import random
+
+from advisor.db_log_parser import NO_COL_FAMILY
+from advisor.db_options_parser import DatabaseOptions
+from advisor.rule_parser import Suggestion
+
+
+class ConfigOptimizer:
+    SCOPE = "scope"
+    SUGG_VAL = "suggested values"
+
+    @staticmethod
+    def apply_action_on_value(old_value, action, suggested_values):
+        chosen_sugg_val = None
+        if suggested_values:
+            chosen_sugg_val = random.choice(list(suggested_values))
+        new_value = None
+        if action is Suggestion.Action.set or not old_value:
+            assert chosen_sugg_val
+            new_value = chosen_sugg_val
+        else:
+            # For increase/decrease actions, currently the code tries to make
+            # a 30% change in the option's value per iteration. An addend is
+            # also present (+1 or -1) to handle the cases when the option's
+            # old value was 0 or the final int() conversion suppressed the 30%
+            # change made to the option
+            old_value = float(old_value)
+            mul = 0
+            add = 0
+            if action is Suggestion.Action.increase:
+                if old_value < 0:
+                    mul = 0.7
+                    add = 2
+                else:
+                    mul = 1.3
+                    add = 2
+            elif action is Suggestion.Action.decrease:
+                if old_value < 0:
+                    mul = 1.3
+                    add = -2
+                else:
+                    mul = 0.7
+                    add = -2
+            new_value = int(old_value * mul + add)
+        return new_value
+
+    @staticmethod
+    def improve_db_config(options, rule, suggestions_dict):
+        # this method takes ONE 'rule' and applies all its suggestions on the
+        # appropriate options
+        required_options = []
+        rule_suggestions = []
+        for sugg_name in rule.get_suggestions():
+            option = suggestions_dict[sugg_name].option
+            action = suggestions_dict[sugg_name].action
+            # A Suggestion in the rules spec must have the 'option' and
+            # 'action' fields defined, always call perform_checks() method
+            # after parsing the rules file using RulesSpec
+            assert option
+            assert action
+            required_options.append(option)
+            rule_suggestions.append(suggestions_dict[sugg_name])
+        current_config = options.get_options(required_options)
+        # Create the updated configuration from the rule's suggestions
+        updated_config = {}
+        for sugg in rule_suggestions:
+            # case: when the option is not present in the current configuration
+            if sugg.option not in current_config:
+                try:
+                    new_value = ConfigOptimizer.apply_action_on_value(
+                        None, sugg.action, sugg.suggested_values
+                    )
+                    if sugg.option not in updated_config:
+                        updated_config[sugg.option] = {}
+                    if DatabaseOptions.is_misc_option(sugg.option):
+                        # this suggestion is on an option that is not yet
+                        # supported by the Rocksdb OPTIONS file and so it is
+                        # not prefixed by a section type.
+                        updated_config[sugg.option][NO_COL_FAMILY] = new_value
+                    else:
+                        for col_fam in rule.get_trigger_column_families():
+                            updated_config[sugg.option][col_fam] = new_value
+                except AssertionError:
+                    print(
+                        "WARNING(ConfigOptimizer): provide suggested_values "
+                        + "for "
+                        + sugg.option
+                    )
+                continue
+            # case: when the option is present in the current configuration
+            if NO_COL_FAMILY in current_config[sugg.option]:
+                old_value = current_config[sugg.option][NO_COL_FAMILY]
+                try:
+                    new_value = ConfigOptimizer.apply_action_on_value(
+                        old_value, sugg.action, sugg.suggested_values
+                    )
+                    if sugg.option not in updated_config:
+                        updated_config[sugg.option] = {}
+                    updated_config[sugg.option][NO_COL_FAMILY] = new_value
+                except AssertionError:
+                    print(
+                        "WARNING(ConfigOptimizer): provide suggested_values "
+                        + "for "
+                        + sugg.option
+                    )
+            else:
+                for col_fam in rule.get_trigger_column_families():
+                    old_value = None
+                    if col_fam in current_config[sugg.option]:
+                        old_value = current_config[sugg.option][col_fam]
+                    try:
+                        new_value = ConfigOptimizer.apply_action_on_value(
+                            old_value, sugg.action, sugg.suggested_values
+                        )
+                        if sugg.option not in updated_config:
+                            updated_config[sugg.option] = {}
+                        updated_config[sugg.option][col_fam] = new_value
+                    except AssertionError:
+                        print(
+                            "WARNING(ConfigOptimizer): provide "
+                            + "suggested_values for "
+                            + sugg.option
+                        )
+        return current_config, updated_config
+
+    @staticmethod
+    def pick_rule_to_apply(rules, last_rule_name, rules_tried, backtrack):
+        if not rules:
+            print("\nNo more rules triggered!")
+            return None
+        # if the last rule provided an improvement in the database performance,
+        # and it was triggered again (i.e. it is present in 'rules'), then pick
+        # the same rule for this iteration too.
+        if last_rule_name and not backtrack:
+            for rule in rules:
+                if rule.name == last_rule_name:
+                    return rule
+        # there was no previous rule OR the previous rule did not improve db
+        # performance OR it was not triggered for this iteration,
+        # then pick another rule that has not been tried yet
+        for rule in rules:
+            if rule.name not in rules_tried:
+                return rule
+        print("\nAll rules have been exhausted")
+        return None
+
+    @staticmethod
+    def apply_suggestions(
+        triggered_rules,
+        current_rule_name,
+        rules_tried,
+        backtrack,
+        curr_options,
+        suggestions_dict,
+    ):
+        curr_rule = ConfigOptimizer.pick_rule_to_apply(
+            triggered_rules, current_rule_name, rules_tried, backtrack
+        )
+        if not curr_rule:
+            return tuple([None] * 4)
+        # if a rule has been picked for improving db_config, update rules_tried
+        rules_tried.add(curr_rule.name)
+        # get updated config based on the picked rule
+        curr_conf, updated_conf = ConfigOptimizer.improve_db_config(
+            curr_options, curr_rule, suggestions_dict
+        )
+        conf_diff = DatabaseOptions.get_options_diff(curr_conf, updated_conf)
+        if not conf_diff:  # the current and updated configs are the same
+            (
+                curr_rule,
+                rules_tried,
+                curr_conf,
+                updated_conf,
+            ) = ConfigOptimizer.apply_suggestions(
+                triggered_rules,
+                None,
+                rules_tried,
+                backtrack,
+                curr_options,
+                suggestions_dict,
+            )
+        print("returning from apply_suggestions")
+        return (curr_rule, rules_tried, curr_conf, updated_conf)
+
+    # TODO(poojam23): check if this method is required or can we directly set
+    # the config equal to the curr_config
+    @staticmethod
+    def get_backtrack_config(curr_config, updated_config):
+        diff = DatabaseOptions.get_options_diff(curr_config, updated_config)
+        bt_config = {}
+        for option in diff:
+            bt_config[option] = {}
+            for col_fam in diff[option]:
+                bt_config[option][col_fam] = diff[option][col_fam][0]
+        print(bt_config)
+        return bt_config
+
+    def __init__(self, bench_runner, db_options, rule_parser, base_db):
+        self.bench_runner = bench_runner
+        self.db_options = db_options
+        self.rule_parser = rule_parser
+        self.base_db_path = base_db
+
+    def run(self):
+        # In every iteration of this method's optimization loop we pick ONE
+        # RULE from all the triggered rules and apply all its suggestions to
+        # the appropriate options.
+        # bootstrapping the optimizer
+        print("Bootstrapping optimizer:")
+        options = copy.deepcopy(self.db_options)
+        old_data_sources, old_metric = self.bench_runner.run_experiment(
+            options, self.base_db_path
+        )
+        print("Initial metric: " + str(old_metric))
+        self.rule_parser.load_rules_from_spec()
+        self.rule_parser.perform_section_checks()
+        triggered_rules = self.rule_parser.get_triggered_rules(
+            old_data_sources, options.get_column_families()
+        )
+        print("\nTriggered:")
+        self.rule_parser.print_rules(triggered_rules)
+        backtrack = False
+        rules_tried = set()
+        (
+            curr_rule,
+            rules_tried,
+            curr_conf,
+            updated_conf,
+        ) = ConfigOptimizer.apply_suggestions(
+            triggered_rules,
+            None,
+            rules_tried,
+            backtrack,
+            options,
+            self.rule_parser.get_suggestions_dict(),
+        )
+        # the optimizer loop
+        while curr_rule:
+            print("\nRule picked for next iteration:")
+            print(curr_rule.name)
+            print("\ncurrent config:")
+            print(curr_conf)
+            print("updated config:")
+            print(updated_conf)
+            options.update_options(updated_conf)
+            # run bench_runner with updated config
+            new_data_sources, new_metric = self.bench_runner.run_experiment(
+                options, self.base_db_path
+            )
+            print("\nnew metric: " + str(new_metric))
+            backtrack = not self.bench_runner.is_metric_better(new_metric, old_metric)
+            # update triggered_rules, metric, data_sources, if required
+            if backtrack:
+                # revert changes to options config
+                print("\nBacktracking to previous configuration")
+                backtrack_conf = ConfigOptimizer.get_backtrack_config(
+                    curr_conf, updated_conf
+                )
+                options.update_options(backtrack_conf)
+            else:
+                # run advisor on new data sources
+                self.rule_parser.load_rules_from_spec()  # reboot the advisor
+                self.rule_parser.perform_section_checks()
+                triggered_rules = self.rule_parser.get_triggered_rules(
+                    new_data_sources, options.get_column_families()
+                )
+                print("\nTriggered:")
+                self.rule_parser.print_rules(triggered_rules)
+                old_metric = new_metric
+                old_data_sources = new_data_sources
+                rules_tried = set()
+            # pick rule to work on and set curr_rule to that
+            (
+                curr_rule,
+                rules_tried,
+                curr_conf,
+                updated_conf,
+            ) = ConfigOptimizer.apply_suggestions(
+                triggered_rules,
+                curr_rule.name,
+                rules_tried,
+                backtrack,
+                options,
+                self.rule_parser.get_suggestions_dict(),
+            )
+        # return the final database options configuration
+        return options
diff --git a/src/rocksdb/tools/advisor/advisor/db_log_parser.py b/src/rocksdb/tools/advisor/advisor/db_log_parser.py
new file mode 100644
index 000000000..9ba541fc3
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_log_parser.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import glob
+import re
+import time
+from abc import ABC, abstractmethod
+from calendar import timegm
+from enum import Enum
+
+
+NO_COL_FAMILY = "DB_WIDE"
+
+
+class DataSource(ABC):
+    class Type(Enum):
+        LOG = 1
+        DB_OPTIONS = 2
+        TIME_SERIES = 3
+
+    def __init__(self, type):
+        self.type = type
+
+    @abstractmethod
+    def check_and_trigger_conditions(self, conditions):
+        pass
+
+
+class Log:
+    @staticmethod
+    def is_new_log(log_line):
+        # The assumption is that a new log will start with a date printed in
+        # the below regex format.
+        date_regex = "\d{4}/\d{2}/\d{2}-\d{2}:\d{2}:\d{2}\.\d{6}"  # noqa
+        return re.match(date_regex, log_line)
+
+    def __init__(self, log_line, column_families):
+        token_list = log_line.strip().split()
+        self.time = token_list[0]
+        self.context = token_list[1]
+        self.message = " ".join(token_list[2:])
+        self.column_family = None
+        # example log for 'default' column family:
+        # "2018/07/25-17:29:05.176080 7f969de68700 [db/compaction_job.cc:1634]
+        # [default] [JOB 3] Compacting 24@0 + 16@1 files to L1, score 6.00\n"
+        for col_fam in column_families:
+            search_for_str = "\[" + col_fam + "\]"  # noqa
+            if re.search(search_for_str, self.message):
+                self.column_family = col_fam
+                break
+        if not self.column_family:
+            self.column_family = NO_COL_FAMILY
+
+    def get_human_readable_time(self):
+        # example from a log line: '2018/07/25-11:25:45.782710'
+        return self.time
+
+    def get_column_family(self):
+        return self.column_family
+
+    def get_context(self):
+        return self.context
+
+    def get_message(self):
+        return self.message
+
+    def append_message(self, remaining_log):
+        self.message = self.message + "\n" + remaining_log.strip()
+
+    def get_timestamp(self):
+        # example: '2018/07/25-11:25:45.782710' will be converted to the GMT
+        # Unix timestamp 1532517945 (note: this method assumes that self.time
+        # is in GMT)
+        hr_time = self.time + "GMT"
+        timestamp = timegm(time.strptime(hr_time, "%Y/%m/%d-%H:%M:%S.%f%Z"))
+        return timestamp
+
+    def __repr__(self):
+        return (
+            "time: "
+            + self.time
+            + "; context: "
+            + self.context
+            + "; col_fam: "
+            + self.column_family
+            + "; message: "
+            + self.message
+        )
+
+
+class DatabaseLogs(DataSource):
+    def __init__(self, logs_path_prefix, column_families):
+        super().__init__(DataSource.Type.LOG)
+        self.logs_path_prefix = logs_path_prefix
+        self.column_families = column_families
+
+    def trigger_conditions_for_log(self, conditions, log):
+        # For a LogCondition object, trigger is:
+        # Dict[column_family_name, List[Log]]. This explains why the condition
+        # was triggered and for which column families.
+        for cond in conditions:
+            if re.search(cond.regex, log.get_message(), re.IGNORECASE):
+                trigger = cond.get_trigger()
+                if not trigger:
+                    trigger = {}
+                if log.get_column_family() not in trigger:
+                    trigger[log.get_column_family()] = []
+                trigger[log.get_column_family()].append(log)
+                cond.set_trigger(trigger)
+
+    def check_and_trigger_conditions(self, conditions):
+        for file_name in glob.glob(self.logs_path_prefix + "*"):
+            # TODO(poojam23): find a way to distinguish between log files
+            # - generated in the current experiment but are labeled 'old'
+            # because they LOGs exceeded the file size limit  AND
+            # - generated in some previous experiment that are also labeled
+            # 'old' and were not deleted for some reason
+            if re.search("old", file_name, re.IGNORECASE):
+                continue
+            with open(file_name, "r") as db_logs:
+                new_log = None
+                for line in db_logs:
+                    if Log.is_new_log(line):
+                        if new_log:
+                            self.trigger_conditions_for_log(conditions, new_log)
+                        new_log = Log(line, self.column_families)
+                    else:
+                        # To account for logs split into multiple lines
+                        new_log.append_message(line)
+            # Check for the last log in the file.
+            if new_log:
+                self.trigger_conditions_for_log(conditions, new_log)
diff --git a/src/rocksdb/tools/advisor/advisor/db_options_parser.py b/src/rocksdb/tools/advisor/advisor/db_options_parser.py
new file mode 100644
index 000000000..062aeeec4
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_options_parser.py
@@ -0,0 +1,348 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import copy
+import os
+
+from advisor.db_log_parser import DataSource, NO_COL_FAMILY
+from advisor.ini_parser import IniParser
+
+
+class OptionsSpecParser(IniParser):
+    @staticmethod
+    def is_new_option(line):
+        return "=" in line
+
+    @staticmethod
+    def get_section_type(line):
+        """
+        Example section header: [TableOptions/BlockBasedTable "default"]
+        Here ConfigurationOptimizer returned would be
+        'TableOptions.BlockBasedTable'
+        """
+        section_path = line.strip()[1:-1].split()[0]
+        section_type = ".".join(section_path.split("/"))
+        return section_type
+
+    @staticmethod
+    def get_section_name(line):
+        # example: get_section_name('[CFOptions "default"]')
+        token_list = line.strip()[1:-1].split('"')
+        # token_list = ['CFOptions', 'default', '']
+        if len(token_list) < 3:
+            return None
+        return token_list[1]  # return 'default'
+
+    @staticmethod
+    def get_section_str(section_type, section_name):
+        # Example:
+        # Case 1: get_section_str('DBOptions', NO_COL_FAMILY)
+        # Case 2: get_section_str('TableOptions.BlockBasedTable', 'default')
+        section_type = "/".join(section_type.strip().split("."))
+        # Case 1: section_type = 'DBOptions'
+        # Case 2: section_type = 'TableOptions/BlockBasedTable'
+        section_str = "[" + section_type
+        if section_name == NO_COL_FAMILY:
+            # Case 1: '[DBOptions]'
+            return section_str + "]"
+        else:
+            # Case 2: '[TableOptions/BlockBasedTable "default"]'
+            return section_str + ' "' + section_name + '"]'
+
+    @staticmethod
+    def get_option_str(key, values):
+        option_str = key + "="
+        # get_option_str('db_log_dir', None), returns 'db_log_dir='
+        if values:
+            # example:
+            # get_option_str('max_bytes_for_level_multiplier_additional',
+            # [1,1,1,1,1,1,1]), returned string:
+            # 'max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1'
+            if isinstance(values, list):
+                for value in values:
+                    option_str += str(value) + ":"
+                option_str = option_str[:-1]
+            else:
+                # example: get_option_str('write_buffer_size', 1048576)
+                # returned string: 'write_buffer_size=1048576'
+                option_str += str(values)
+        return option_str
+
+
+class DatabaseOptions(DataSource):
+    @staticmethod
+    def is_misc_option(option_name):
+        # these are miscellaneous options that are not yet supported by the
+        # Rocksdb options file, hence they are not prefixed with any section
+        # name
+        return "." not in option_name
+
+    @staticmethod
+    def get_options_diff(opt_old, opt_new):
+        # type: Dict[option, Dict[col_fam, value]] X 2 ->
+        # Dict[option, Dict[col_fam, Tuple(old_value, new_value)]]
+        # note: diff should contain a tuple of values only if they are
+        # different from each other
+        options_union = set(opt_old.keys()).union(set(opt_new.keys()))
+        diff = {}
+        for opt in options_union:
+            diff[opt] = {}
+            # if option in options_union, then it must be in one of the configs
+            if opt not in opt_old:
+                for col_fam in opt_new[opt]:
+                    diff[opt][col_fam] = (None, opt_new[opt][col_fam])
+            elif opt not in opt_new:
+                for col_fam in opt_old[opt]:
+                    diff[opt][col_fam] = (opt_old[opt][col_fam], None)
+            else:
+                for col_fam in opt_old[opt]:
+                    if col_fam in opt_new[opt]:
+                        if opt_old[opt][col_fam] != opt_new[opt][col_fam]:
+                            diff[opt][col_fam] = (
+                                opt_old[opt][col_fam],
+                                opt_new[opt][col_fam],
+                            )
+                    else:
+                        diff[opt][col_fam] = (opt_old[opt][col_fam], None)
+                for col_fam in opt_new[opt]:
+                    if col_fam in opt_old[opt]:
+                        if opt_old[opt][col_fam] != opt_new[opt][col_fam]:
+                            diff[opt][col_fam] = (
+                                opt_old[opt][col_fam],
+                                opt_new[opt][col_fam],
+                            )
+                    else:
+                        diff[opt][col_fam] = (None, opt_new[opt][col_fam])
+            if not diff[opt]:
+                diff.pop(opt)
+        return diff
+
+    def __init__(self, rocksdb_options, misc_options=None):
+        super().__init__(DataSource.Type.DB_OPTIONS)
+        # The options are stored in the following data structure:
+        # Dict[section_type, Dict[section_name, Dict[option_name, value]]]
+        self.options_dict = None
+        self.column_families = None
+        # Load the options from the given file to a dictionary.
+        self.load_from_source(rocksdb_options)
+        # Setup the miscellaneous options expected to be List[str], where each
+        # element in the List has the format "<option_name>=<option_value>"
+        # These options are the ones that are not yet supported by the Rocksdb
+        # OPTIONS file, so they are provided separately
+        self.setup_misc_options(misc_options)
+
+    def setup_misc_options(self, misc_options):
+        self.misc_options = {}
+        if misc_options:
+            for option_pair_str in misc_options:
+                option_name = option_pair_str.split("=")[0].strip()
+                option_value = option_pair_str.split("=")[1].strip()
+                self.misc_options[option_name] = option_value
+
+    def load_from_source(self, options_path):
+        self.options_dict = {}
+        with open(options_path, "r") as db_options:
+            for line in db_options:
+                line = OptionsSpecParser.remove_trailing_comment(line)
+                if not line:
+                    continue
+                if OptionsSpecParser.is_section_header(line):
+                    curr_sec_type = OptionsSpecParser.get_section_type(line)
+                    curr_sec_name = OptionsSpecParser.get_section_name(line)
+                    if curr_sec_type not in self.options_dict:
+                        self.options_dict[curr_sec_type] = {}
+                    if not curr_sec_name:
+                        curr_sec_name = NO_COL_FAMILY
+                    self.options_dict[curr_sec_type][curr_sec_name] = {}
+                    # example: if the line read from the Rocksdb OPTIONS file
+                    # is [CFOptions "default"], then the section type is
+                    # CFOptions and 'default' is the name of a column family
+                    # that for this database, so it's added to the list of
+                    # column families stored in this object
+                    if curr_sec_type == "CFOptions":
+                        if not self.column_families:
+                            self.column_families = []
+                        self.column_families.append(curr_sec_name)
+                elif OptionsSpecParser.is_new_option(line):
+                    key, value = OptionsSpecParser.get_key_value_pair(line)
+                    self.options_dict[curr_sec_type][curr_sec_name][key] = value
+                else:
+                    error = "Not able to parse line in Options file."
+                    OptionsSpecParser.exit_with_parse_error(line, error)
+
+    def get_misc_options(self):
+        # these are options that are not yet supported by the Rocksdb OPTIONS
+        # file, hence they are provided and stored separately
+        return self.misc_options
+
+    def get_column_families(self):
+        return self.column_families
+
+    def get_all_options(self):
+        # This method returns all the options that are stored in this object as
+        # a: Dict[<sec_type>.<option_name>: Dict[col_fam, option_value]]
+        all_options = []
+        # Example: in the section header '[CFOptions "default"]' read from the
+        # OPTIONS file, sec_type='CFOptions'
+        for sec_type in self.options_dict:
+            for col_fam in self.options_dict[sec_type]:
+                for opt_name in self.options_dict[sec_type][col_fam]:
+                    option = sec_type + "." + opt_name
+                    all_options.append(option)
+        all_options.extend(list(self.misc_options.keys()))
+        return self.get_options(all_options)
+
+    def get_options(self, reqd_options):
+        # type: List[str] -> Dict[str, Dict[str, Any]]
+        # List[option] -> Dict[option, Dict[col_fam, value]]
+        reqd_options_dict = {}
+        for option in reqd_options:
+            if DatabaseOptions.is_misc_option(option):
+                # the option is not prefixed by '<section_type>.' because it is
+                # not yet supported by the Rocksdb OPTIONS file; so it has to
+                # be fetched from the misc_options dictionary
+                if option not in self.misc_options:
+                    continue
+                if option not in reqd_options_dict:
+                    reqd_options_dict[option] = {}
+                reqd_options_dict[option][NO_COL_FAMILY] = self.misc_options[option]
+            else:
+                # Example: option = 'TableOptions.BlockBasedTable.block_align'
+                # then, sec_type = 'TableOptions.BlockBasedTable'
+                sec_type = ".".join(option.split(".")[:-1])
+                # opt_name = 'block_align'
+                opt_name = option.split(".")[-1]
+                if sec_type not in self.options_dict:
+                    continue
+                for col_fam in self.options_dict[sec_type]:
+                    if opt_name in self.options_dict[sec_type][col_fam]:
+                        if option not in reqd_options_dict:
+                            reqd_options_dict[option] = {}
+                        reqd_options_dict[option][col_fam] = self.options_dict[
+                            sec_type
+                        ][col_fam][opt_name]
+        return reqd_options_dict
+
+    def update_options(self, options):
+        # An example 'options' object looks like:
+        # {'DBOptions.max_background_jobs': {NO_COL_FAMILY: 2},
+        # 'CFOptions.write_buffer_size': {'default': 1048576, 'cf_A': 128000},
+        # 'bloom_bits': {NO_COL_FAMILY: 4}}
+        for option in options:
+            if DatabaseOptions.is_misc_option(option):
+                # this is a misc_option i.e. an option that is not yet
+                # supported by the Rocksdb OPTIONS file, so it is not prefixed
+                # by '<section_type>.' and must be stored in the separate
+                # misc_options dictionary
+                if NO_COL_FAMILY not in options[option]:
+                    print(
+                        "WARNING(DatabaseOptions.update_options): not "
+                        + "updating option "
+                        + option
+                        + " because it is in "
+                        + "misc_option format but its scope is not "
+                        + NO_COL_FAMILY
+                        + ". Check format of option."
+                    )
+                    continue
+                self.misc_options[option] = options[option][NO_COL_FAMILY]
+            else:
+                sec_name = ".".join(option.split(".")[:-1])
+                opt_name = option.split(".")[-1]
+                if sec_name not in self.options_dict:
+                    self.options_dict[sec_name] = {}
+                for col_fam in options[option]:
+                    # if the option is not already present in the dictionary,
+                    # it will be inserted, else it will be updated to the new
+                    # value
+                    if col_fam not in self.options_dict[sec_name]:
+                        self.options_dict[sec_name][col_fam] = {}
+                    self.options_dict[sec_name][col_fam][opt_name] = copy.deepcopy(
+                        options[option][col_fam]
+                    )
+
+    def generate_options_config(self, nonce):
+        # this method generates a Rocksdb OPTIONS file in the INI format from
+        # the options stored in self.options_dict
+        this_path = os.path.abspath(os.path.dirname(__file__))
+        file_name = "../temp/OPTIONS_" + str(nonce) + ".tmp"
+        file_path = os.path.join(this_path, file_name)
+        with open(file_path, "w") as fp:
+            for section in self.options_dict:
+                for col_fam in self.options_dict[section]:
+                    fp.write(OptionsSpecParser.get_section_str(section, col_fam) + "\n")
+                    for option in self.options_dict[section][col_fam]:
+                        values = self.options_dict[section][col_fam][option]
+                        fp.write(
+                            OptionsSpecParser.get_option_str(option, values) + "\n"
+                        )
+                fp.write("\n")
+        return file_path
+
+    def check_and_trigger_conditions(self, conditions):
+        for cond in conditions:
+            reqd_options_dict = self.get_options(cond.options)
+            # This contains the indices of options that are specific to some
+            # column family and are not database-wide options.
+            incomplete_option_ix = []
+            options = []
+            missing_reqd_option = False
+            for ix, option in enumerate(cond.options):
+                if option not in reqd_options_dict:
+                    print(
+                        "WARNING(DatabaseOptions.check_and_trigger): "
+                        + "skipping condition "
+                        + cond.name
+                        + " because it "
+                        "requires option "
+                        + option
+                        + " but this option is"
+                        + " not available"
+                    )
+                    missing_reqd_option = True
+                    break  # required option is absent
+                if NO_COL_FAMILY in reqd_options_dict[option]:
+                    options.append(reqd_options_dict[option][NO_COL_FAMILY])
+                else:
+                    options.append(None)
+                    incomplete_option_ix.append(ix)
+
+            if missing_reqd_option:
+                continue
+
+            # if all the options are database-wide options
+            if not incomplete_option_ix:
+                try:
+                    if eval(cond.eval_expr):
+                        cond.set_trigger({NO_COL_FAMILY: options})
+                except Exception as e:
+                    print("WARNING(DatabaseOptions) check_and_trigger:" + str(e))
+                continue
+
+            # for all the options that are not database-wide, we look for their
+            # values specific to column families
+            col_fam_options_dict = {}
+            for col_fam in self.column_families:
+                present = True
+                for ix in incomplete_option_ix:
+                    option = cond.options[ix]
+                    if col_fam not in reqd_options_dict[option]:
+                        present = False
+                        break
+                    options[ix] = reqd_options_dict[option][col_fam]
+                if present:
+                    try:
+                        if eval(cond.eval_expr):
+                            col_fam_options_dict[col_fam] = copy.deepcopy(options)
+                    except Exception as e:
+                        print("WARNING(DatabaseOptions) check_and_trigger: " + str(e))
+            # Trigger for an OptionCondition object is of the form:
+            # Dict[col_fam_name: List[option_value]]
+            # where col_fam_name is the name of a column family for which
+            # 'eval_expr' evaluated to True and List[option_value] is the list
+            # of values of the options specified in the condition's 'options'
+            # field
+            if col_fam_options_dict:
+                cond.set_trigger(col_fam_options_dict)
diff --git a/src/rocksdb/tools/advisor/advisor/db_stats_fetcher.py b/src/rocksdb/tools/advisor/advisor/db_stats_fetcher.py
new file mode 100755
index 000000000..30d1ad8b3
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_stats_fetcher.py
@@ -0,0 +1,346 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import copy
+import glob
+import re
+import subprocess
+import time
+from typing import List
+
+from advisor.db_log_parser import Log
+from advisor.db_timeseries_parser import NO_ENTITY, TimeSeriesData
+
+
+class LogStatsParser(TimeSeriesData):
+    STATS = "STATISTICS:"
+
+    @staticmethod
+    def parse_log_line_for_stats(log_line):
+        # Example stat line (from LOG file):
+        # "rocksdb.db.get.micros P50 : 8.4 P95 : 21.8 P99 : 33.9 P100 : 92.0\n"
+        token_list = log_line.strip().split()
+        # token_list = ['rocksdb.db.get.micros', 'P50', ':', '8.4', 'P95', ':',
+        # '21.8', 'P99', ':', '33.9', 'P100', ':', '92.0']
+        stat_prefix = token_list[0] + "."  # 'rocksdb.db.get.micros.'
+        stat_values = [token for token in token_list[1:] if token != ":"]
+        # stat_values = ['P50', '8.4', 'P95', '21.8', 'P99', '33.9', 'P100',
+        # '92.0']
+        stat_dict = {}
+        for ix, metric in enumerate(stat_values):
+            if ix % 2 == 0:
+                stat_name = stat_prefix + metric
+                stat_name = stat_name.lower()  # Note: case insensitive names
+            else:
+                stat_dict[stat_name] = float(metric)
+        # stat_dict = {'rocksdb.db.get.micros.p50': 8.4,
+        # 'rocksdb.db.get.micros.p95': 21.8, 'rocksdb.db.get.micros.p99': 33.9,
+        # 'rocksdb.db.get.micros.p100': 92.0}
+        return stat_dict
+
+    def __init__(self, logs_path_prefix, stats_freq_sec):
+        super().__init__()
+        self.logs_file_prefix = logs_path_prefix
+        self.stats_freq_sec = stats_freq_sec
+        self.duration_sec = 60
+
+    def get_keys_from_conditions(self, conditions):
+        # Note: case insensitive stat names
+        reqd_stats = []
+        for cond in conditions:
+            for key in cond.keys:
+                key = key.lower()
+                # some keys are prepended with '[]' for OdsStatsFetcher to
+                # replace this with the appropriate key_prefix, remove these
+                # characters here since the LogStatsParser does not need
+                # a prefix
+                if key.startswith("[]"):
+                    reqd_stats.append(key[2:])
+                else:
+                    reqd_stats.append(key)
+        return reqd_stats
+
+    def add_to_timeseries(self, log, reqd_stats):
+        # this method takes in the Log object that contains the Rocksdb stats
+        # and a list of required stats, then it parses the stats line by line
+        # to fetch required stats and add them to the keys_ts object
+        # Example: reqd_stats = ['rocksdb.block.cache.hit.count',
+        # 'rocksdb.db.get.micros.p99']
+        # Let log.get_message() returns following string:
+        # "[WARN] [db/db_impl.cc:485] STATISTICS:\n
+        # rocksdb.block.cache.miss COUNT : 1459\n
+        # rocksdb.block.cache.hit COUNT : 37\n
+        # ...
+        # rocksdb.db.get.micros P50 : 15.6 P95 : 39.7 P99 : 62.6 P100 : 148.0\n
+        # ..."
+        new_lines = log.get_message().split("\n")
+        # let log_ts = 1532518219
+        log_ts = log.get_timestamp()
+        # example updates to keys_ts:
+        # keys_ts[NO_ENTITY]['rocksdb.db.get.micros.p99'][1532518219] = 62.6
+        # keys_ts[NO_ENTITY]['rocksdb.block.cache.hit.count'][1532518219] = 37
+        for line in new_lines[1:]:  # new_lines[0] does not contain any stats
+            stats_on_line = self.parse_log_line_for_stats(line)
+            for stat in stats_on_line:
+                if stat in reqd_stats:
+                    if stat not in self.keys_ts[NO_ENTITY]:
+                        self.keys_ts[NO_ENTITY][stat] = {}
+                    self.keys_ts[NO_ENTITY][stat][log_ts] = stats_on_line[stat]
+
+    def fetch_timeseries(self, reqd_stats):
+        # this method parses the Rocksdb LOG file and generates timeseries for
+        # each of the statistic in the list reqd_stats
+        self.keys_ts = {NO_ENTITY: {}}
+        for file_name in glob.glob(self.logs_file_prefix + "*"):
+            # TODO(poojam23): find a way to distinguish between 'old' log files
+            # from current and previous experiments, present in the same
+            # directory
+            if re.search("old", file_name, re.IGNORECASE):
+                continue
+            with open(file_name, "r") as db_logs:
+                new_log = None
+                for line in db_logs:
+                    if Log.is_new_log(line):
+                        if new_log and re.search(self.STATS, new_log.get_message()):
+                            self.add_to_timeseries(new_log, reqd_stats)
+                        new_log = Log(line, column_families=[])
+                    else:
+                        # To account for logs split into multiple lines
+                        new_log.append_message(line)
+            # Check for the last log in the file.
+            if new_log and re.search(self.STATS, new_log.get_message()):
+                self.add_to_timeseries(new_log, reqd_stats)
+
+
+class DatabasePerfContext(TimeSeriesData):
+    # TODO(poojam23): check if any benchrunner provides PerfContext sampled at
+    # regular intervals
+    def __init__(self, perf_context_ts, stats_freq_sec, cumulative):
+        """
+        perf_context_ts is expected to be in the following format:
+        Dict[metric, Dict[timestamp, value]], where for
+        each (metric, timestamp) pair, the value is database-wide (i.e.
+        summed over all the threads involved)
+        if stats_freq_sec == 0, per-metric only one value is reported
+        """
+        super().__init__()
+        self.stats_freq_sec = stats_freq_sec
+        self.keys_ts = {NO_ENTITY: perf_context_ts}
+        if cumulative:
+            self.unaccumulate_metrics()
+
+    def unaccumulate_metrics(self):
+        # if the perf context metrics provided are cumulative in nature, this
+        # method can be used to convert them to a disjoint format
+        epoch_ts = copy.deepcopy(self.keys_ts)
+        for stat in self.keys_ts[NO_ENTITY]:
+            timeseries = sorted(
+                list(self.keys_ts[NO_ENTITY][stat].keys()), reverse=True
+            )
+            if len(timeseries) < 2:
+                continue
+            for ix, ts in enumerate(timeseries[:-1]):
+                epoch_ts[NO_ENTITY][stat][ts] = (
+                    epoch_ts[NO_ENTITY][stat][ts]
+                    - epoch_ts[NO_ENTITY][stat][timeseries[ix + 1]]
+                )
+                if epoch_ts[NO_ENTITY][stat][ts] < 0:
+                    raise ValueError("DBPerfContext: really cumulative?")
+            # drop the smallest timestamp in the timeseries for this metric
+            epoch_ts[NO_ENTITY][stat].pop(timeseries[-1])
+        self.keys_ts = epoch_ts
+
+    def get_keys_from_conditions(self, conditions):
+        reqd_stats = []
+        for cond in conditions:
+            reqd_stats.extend([key.lower() for key in cond.keys])
+        return reqd_stats
+
+    def fetch_timeseries(self, statistics):
+        # this method is redundant for DatabasePerfContext because the __init__
+        # does the job of populating 'keys_ts'
+        pass
+
+
+class OdsStatsFetcher(TimeSeriesData):
+    # class constants
+    OUTPUT_FILE = "temp/stats_out.tmp"
+    ERROR_FILE = "temp/stats_err.tmp"
+    RAPIDO_COMMAND = "%s --entity=%s --key=%s --tstart=%s --tend=%s --showtime"
+
+    # static methods
+    @staticmethod
+    def _get_string_in_quotes(value):
+        return '"' + str(value) + '"'
+
+    @staticmethod
+    def _get_time_value_pair(pair_string):
+        # example pair_string: '[1532544591, 97.3653601828]'
+        pair_string = pair_string.replace("[", "")
+        pair_string = pair_string.replace("]", "")
+        pair = pair_string.split(",")
+        first = int(pair[0].strip())
+        second = float(pair[1].strip())
+        return [first, second]
+
+    @staticmethod
+    def _get_ods_cli_stime(start_time):
+        diff = int(time.time() - int(start_time))
+        stime = str(diff) + "_s"
+        return stime
+
+    def __init__(self, client, entities, start_time, end_time, key_prefix=None):
+        super().__init__()
+        self.client = client
+        self.entities = entities
+        self.start_time = start_time
+        self.end_time = end_time
+        self.key_prefix = key_prefix
+        self.stats_freq_sec = 60
+        self.duration_sec = 60
+
+    def execute_script(self, command):
+        print("executing...")
+        print(command)
+        out_file = open(self.OUTPUT_FILE, "w+")
+        err_file = open(self.ERROR_FILE, "w+")
+        subprocess.call(command, shell=True, stdout=out_file, stderr=err_file)
+        out_file.close()
+        err_file.close()
+
+    def parse_rapido_output(self):
+        # Output looks like the following:
+        # <entity_name>\t<key_name>\t[[ts, value], [ts, value], ...]
+        # ts = timestamp; value = value of key_name in entity_name at time ts
+        self.keys_ts = {}
+        with open(self.OUTPUT_FILE, "r") as fp:
+            for line in fp:
+                token_list = line.strip().split("\t")
+                entity = token_list[0]
+                key = token_list[1]
+                if entity not in self.keys_ts:
+                    self.keys_ts[entity] = {}
+                if key not in self.keys_ts[entity]:
+                    self.keys_ts[entity][key] = {}
+                list_of_lists = [
+                    self._get_time_value_pair(pair_string)
+                    for pair_string in token_list[2].split("],")
+                ]
+                value = {pair[0]: pair[1] for pair in list_of_lists}
+                self.keys_ts[entity][key] = value
+
+    def parse_ods_output(self):
+        # Output looks like the following:
+        # <entity_name>\t<key_name>\t<timestamp>\t<value>
+        # there is one line per (entity_name, key_name, timestamp)
+        self.keys_ts = {}
+        with open(self.OUTPUT_FILE, "r") as fp:
+            for line in fp:
+                token_list = line.split()
+                entity = token_list[0]
+                if entity not in self.keys_ts:
+                    self.keys_ts[entity] = {}
+                key = token_list[1]
+                if key not in self.keys_ts[entity]:
+                    self.keys_ts[entity][key] = {}
+                self.keys_ts[entity][key][token_list[2]] = token_list[3]
+
+    def fetch_timeseries(self, statistics):
+        # this method fetches the timeseries of required stats from the ODS
+        # service and populates the 'keys_ts' object appropriately
+        print("OdsStatsFetcher: fetching " + str(statistics))
+        if re.search("rapido", self.client, re.IGNORECASE):
+            command = self.RAPIDO_COMMAND % (
+                self.client,
+                self._get_string_in_quotes(self.entities),
+                self._get_string_in_quotes(",".join(statistics)),
+                self._get_string_in_quotes(self.start_time),
+                self._get_string_in_quotes(self.end_time),
+            )
+            # Run the tool and fetch the time-series data
+            self.execute_script(command)
+            # Parse output and populate the 'keys_ts' map
+            self.parse_rapido_output()
+        elif re.search("ods", self.client, re.IGNORECASE):
+            command = (
+                self.client
+                + " "
+                + "--stime="
+                + self._get_ods_cli_stime(self.start_time)
+                + " "
+                + self._get_string_in_quotes(self.entities)
+                + " "
+                + self._get_string_in_quotes(",".join(statistics))
+            )
+            # Run the tool and fetch the time-series data
+            self.execute_script(command)
+            # Parse output and populate the 'keys_ts' map
+            self.parse_ods_output()
+
+    def get_keys_from_conditions(self, conditions):
+        reqd_stats = []
+        for cond in conditions:
+            for key in cond.keys:
+                use_prefix = False
+                if key.startswith("[]"):
+                    use_prefix = True
+                    key = key[2:]
+                # TODO(poojam23): this is very hacky and needs to be improved
+                if key.startswith("rocksdb"):
+                    key += ".60"
+                if use_prefix:
+                    if not self.key_prefix:
+                        print("Warning: OdsStatsFetcher might need key prefix")
+                        print("for the key: " + key)
+                    else:
+                        key = self.key_prefix + "." + key
+                reqd_stats.append(key)
+        return reqd_stats
+
+    def fetch_rate_url(
+        self,
+        entities: List[str],
+        keys: List[str],
+        window_len: str,
+        percent: str,
+        display: bool,
+    ) -> str:
+        transform_desc = (
+            "rate(" + str(window_len) + ",duration=" + str(self.duration_sec)
+        )
+        if percent:
+            transform_desc = transform_desc + ",%)"
+        else:
+            transform_desc = transform_desc + ")"
+        if re.search("rapido", self.client, re.IGNORECASE):
+            command = self.RAPIDO_COMMAND + " --transform=%s --url=%s"
+            command = command % (
+                self.client,
+                self._get_string_in_quotes(",".join(entities)),
+                self._get_string_in_quotes(",".join(keys)),
+                self._get_string_in_quotes(self.start_time),
+                self._get_string_in_quotes(self.end_time),
+                self._get_string_in_quotes(transform_desc),
+                self._get_string_in_quotes(display),
+            )
+        elif re.search("ods", self.client, re.IGNORECASE):
+            command = (
+                self.client
+                + " "
+                + "--stime="
+                + self._get_ods_cli_stime(self.start_time)
+                + " "
+                + "--fburlonly "
+                + self._get_string_in_quotes(entities)
+                + " "
+                + self._get_string_in_quotes(",".join(keys))
+                + " "
+                + self._get_string_in_quotes(transform_desc)
+            )
+        self.execute_script(command)
+        url = ""
+        with open(self.OUTPUT_FILE, "r") as fp:
+            url = fp.readline()
+        return url
diff --git a/src/rocksdb/tools/advisor/advisor/db_timeseries_parser.py b/src/rocksdb/tools/advisor/advisor/db_timeseries_parser.py
new file mode 100644
index 000000000..5840d7b90
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/db_timeseries_parser.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import math
+from abc import abstractmethod
+from enum import Enum
+from typing import Dict
+
+from advisor.db_log_parser import DataSource
+
+
+NO_ENTITY = "ENTITY_PLACEHOLDER"
+
+
+class TimeSeriesData(DataSource):
+    class Behavior(Enum):
+        bursty = 1
+        evaluate_expression = 2
+
+    class AggregationOperator(Enum):
+        avg = 1
+        max = 2
+        min = 3
+        latest = 4
+        oldest = 5
+
+    def __init__(self):
+        super().__init__(DataSource.Type.TIME_SERIES)
+        self.keys_ts = None  # Dict[entity, Dict[key, Dict[timestamp, value]]]
+        self.stats_freq_sec = None
+
+    @abstractmethod
+    def get_keys_from_conditions(self, conditions):
+        # This method takes in a list of time-series conditions; for each
+        # condition it manipulates the 'keys' in the way that is supported by
+        # the subclass implementing this method
+        pass
+
+    @abstractmethod
+    def fetch_timeseries(self, required_statistics):
+        # this method takes in a list of statistics and fetches the timeseries
+        # for each of them and populates the 'keys_ts' dictionary
+        pass
+
+    def fetch_burst_epochs(
+        self,
+        entities: str,
+        statistic: int,
+        window_sec: float,
+        threshold: bool,
+        percent: bool,
+    ) -> Dict[str, Dict[int, float]]:
+        # this method calculates the (percent) rate change in the 'statistic'
+        # for each entity (over 'window_sec' seconds) and returns the epochs
+        # where this rate change is greater than or equal to the 'threshold'
+        # value
+        if self.stats_freq_sec == 0:
+            # not time series data, cannot check for bursty behavior
+            return
+        if window_sec < self.stats_freq_sec:
+            window_sec = self.stats_freq_sec
+        # 'window_samples' is the number of windows to go back to
+        # compare the current window with, while calculating rate change.
+        window_samples = math.ceil(window_sec / self.stats_freq_sec)
+        burst_epochs = {}
+        # if percent = False:
+        # curr_val = value at window for which rate change is being calculated
+        # prev_val = value at window that is window_samples behind curr_window
+        # Then rate_without_percent =
+        # ((curr_val-prev_val)*duration_sec)/(curr_timestamp-prev_timestamp)
+        # if percent = True:
+        # rate_with_percent = (rate_without_percent * 100) / prev_val
+        # These calculations are in line with the rate() transform supported
+        # by ODS
+        for entity in entities:
+            if statistic not in self.keys_ts[entity]:
+                continue
+            timestamps = sorted(list(self.keys_ts[entity][statistic].keys()))
+            for ix in range(window_samples, len(timestamps), 1):
+                first_ts = timestamps[ix - window_samples]
+                last_ts = timestamps[ix]
+                first_val = self.keys_ts[entity][statistic][first_ts]
+                last_val = self.keys_ts[entity][statistic][last_ts]
+                diff = last_val - first_val
+                if percent:
+                    diff = diff * 100 / first_val
+                rate = (diff * self.duration_sec) / (last_ts - first_ts)
+                # if the rate change is greater than the provided threshold,
+                # then the condition is triggered for entity at time 'last_ts'
+                if rate >= threshold:
+                    if entity not in burst_epochs:
+                        burst_epochs[entity] = {}
+                    burst_epochs[entity][last_ts] = rate
+        return burst_epochs
+
+    def fetch_aggregated_values(self, entity, statistics, aggregation_op):
+        # this method performs the aggregation specified by 'aggregation_op'
+        # on the timeseries of 'statistics' for 'entity' and returns:
+        # Dict[statistic, aggregated_value]
+        result = {}
+        for stat in statistics:
+            if stat not in self.keys_ts[entity]:
+                continue
+            agg_val = None
+            if aggregation_op is self.AggregationOperator.latest:
+                latest_timestamp = max(list(self.keys_ts[entity][stat].keys()))
+                agg_val = self.keys_ts[entity][stat][latest_timestamp]
+            elif aggregation_op is self.AggregationOperator.oldest:
+                oldest_timestamp = min(list(self.keys_ts[entity][stat].keys()))
+                agg_val = self.keys_ts[entity][stat][oldest_timestamp]
+            elif aggregation_op is self.AggregationOperator.max:
+                agg_val = max(list(self.keys_ts[entity][stat].values()))
+            elif aggregation_op is self.AggregationOperator.min:
+                agg_val = min(list(self.keys_ts[entity][stat].values()))
+            elif aggregation_op is self.AggregationOperator.avg:
+                values = list(self.keys_ts[entity][stat].values())
+                agg_val = sum(values) / len(values)
+            result[stat] = agg_val
+        return result
+
+    def check_and_trigger_conditions(self, conditions):
+        # get the list of statistics that need to be fetched
+        reqd_keys = self.get_keys_from_conditions(conditions)
+        # fetch the required statistics and populate the map 'keys_ts'
+        self.fetch_timeseries(reqd_keys)
+        # Trigger the appropriate conditions
+        for cond in conditions:
+            complete_keys = self.get_keys_from_conditions([cond])
+            # Get the entities that have all statistics required by 'cond':
+            # an entity is checked for a given condition only if we possess all
+            # of the condition's 'keys' for that entity
+            entities_with_stats = []
+            for entity in self.keys_ts:
+                stat_missing = False
+                for stat in complete_keys:
+                    if stat not in self.keys_ts[entity]:
+                        stat_missing = True
+                        break
+                if not stat_missing:
+                    entities_with_stats.append(entity)
+            if not entities_with_stats:
+                continue
+            if cond.behavior is self.Behavior.bursty:
+                # for a condition that checks for bursty behavior, only one key
+                # should be present in the condition's 'keys' field
+                result = self.fetch_burst_epochs(
+                    entities_with_stats,
+                    complete_keys[0],  # there should be only one key
+                    cond.window_sec,
+                    cond.rate_threshold,
+                    True,
+                )
+                # Trigger in this case is:
+                # Dict[entity_name, Dict[timestamp, rate_change]]
+                # where the inner dictionary contains rate_change values when
+                # the rate_change >= threshold provided, with the
+                # corresponding timestamps
+                if result:
+                    cond.set_trigger(result)
+            elif cond.behavior is self.Behavior.evaluate_expression:
+                self.handle_evaluate_expression(
+                    cond, complete_keys, entities_with_stats
+                )
+
+    def handle_evaluate_expression(self, condition, statistics, entities):
+        trigger = {}
+        # check 'condition' for each of these entities
+        for entity in entities:
+            if hasattr(condition, "aggregation_op"):
+                # in this case, the aggregation operation is performed on each
+                # of the condition's 'keys' and then with aggregated values
+                # condition's 'expression' is evaluated; if it evaluates to
+                # True, then list of the keys values is added to the
+                # condition's trigger: Dict[entity_name, List[stats]]
+                result = self.fetch_aggregated_values(
+                    entity, statistics, condition.aggregation_op
+                )
+                keys = [result[key] for key in statistics]
+                try:
+                    if eval(condition.expression):
+                        trigger[entity] = keys
+                except Exception as e:
+                    print("WARNING(TimeSeriesData) check_and_trigger: " + str(e))
+            else:
+                # assumption: all stats have same series of timestamps
+                # this is similar to the above but 'expression' is evaluated at
+                # each timestamp, since there is no aggregation, and all the
+                # epochs are added to the trigger when the condition's
+                # 'expression' evaluated to true; so trigger is:
+                # Dict[entity, Dict[timestamp, List[stats]]]
+                for epoch in self.keys_ts[entity][statistics[0]].keys():
+                    keys = [self.keys_ts[entity][key][epoch] for key in statistics]
+                    try:
+                        if eval(condition.expression):
+                            if entity not in trigger:
+                                trigger[entity] = {}
+                            trigger[entity][epoch] = keys
+                    except Exception as e:
+                        print("WARNING(TimeSeriesData) check_and_trigger: " + str(e))
+        if trigger:
+            condition.set_trigger(trigger)
diff --git a/src/rocksdb/tools/advisor/advisor/ini_parser.py b/src/rocksdb/tools/advisor/advisor/ini_parser.py
new file mode 100644
index 000000000..3379ea3cd
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/ini_parser.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+from enum import Enum
+
+
+class IniParser:
+    class Element(Enum):
+        rule = 1
+        cond = 2
+        sugg = 3
+        key_val = 4
+        comment = 5
+
+    @staticmethod
+    def remove_trailing_comment(line):
+        line = line.strip()
+        comment_start = line.find("#")
+        if comment_start > -1:
+            return line[:comment_start]
+        return line
+
+    @staticmethod
+    def is_section_header(line):
+        # A section header looks like: [Rule "my-new-rule"]. Essentially,
+        # a line that is in square-brackets.
+        line = line.strip()
+        if line.startswith("[") and line.endswith("]"):
+            return True
+        return False
+
+    @staticmethod
+    def get_section_name(line):
+        # For a section header: [Rule "my-new-rule"], this method will return
+        # "my-new-rule".
+        token_list = line.strip()[1:-1].split('"')
+        if len(token_list) < 3:
+            error = 'needed section header: [<section_type> "<section_name>"]'
+            raise ValueError("Parsing error: " + error + "\n" + line)
+        return token_list[1]
+
+    @staticmethod
+    def get_element(line):
+        line = IniParser.remove_trailing_comment(line)
+        if not line:
+            return IniParser.Element.comment
+        if IniParser.is_section_header(line):
+            if line.strip()[1:-1].startswith("Suggestion"):
+                return IniParser.Element.sugg
+            if line.strip()[1:-1].startswith("Rule"):
+                return IniParser.Element.rule
+            if line.strip()[1:-1].startswith("Condition"):
+                return IniParser.Element.cond
+        if "=" in line:
+            return IniParser.Element.key_val
+        error = "not a recognizable RulesSpec element"
+        raise ValueError("Parsing error: " + error + "\n" + line)
+
+    @staticmethod
+    def get_key_value_pair(line):
+        line = line.strip()
+        key = line.split("=")[0].strip()
+        value = "=".join(line.split("=")[1:])
+        if value == "":  # if the option has no value
+            return (key, None)
+        values = IniParser.get_list_from_value(value)
+        if len(values) == 1:
+            return (key, value)
+        return (key, values)
+
+    @staticmethod
+    def get_list_from_value(value):
+        values = value.strip().split(":")
+        return values
diff --git a/src/rocksdb/tools/advisor/advisor/rule_parser.py b/src/rocksdb/tools/advisor/advisor/rule_parser.py
new file mode 100644
index 000000000..169a55363
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/rule_parser.py
@@ -0,0 +1,510 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import re
+from abc import ABC, abstractmethod
+from enum import Enum
+
+from advisor.db_log_parser import DataSource, NO_COL_FAMILY
+from advisor.db_timeseries_parser import TimeSeriesData
+from advisor.ini_parser import IniParser
+
+
+class Section(ABC):
+    def __init__(self, name):
+        self.name = name
+
+    @abstractmethod
+    def set_parameter(self, key, value):
+        pass
+
+    @abstractmethod
+    def perform_checks(self):
+        pass
+
+
+class Rule(Section):
+    def __init__(self, name):
+        super().__init__(name)
+        self.conditions = None
+        self.suggestions = None
+        self.overlap_time_seconds = None
+        self.trigger_entities = None
+        self.trigger_column_families = None
+
+    def set_parameter(self, key, value):
+        # If the Rule is associated with a single suggestion/condition, then
+        # value will be a string and not a list. Hence, convert it to a single
+        # element list before storing it in self.suggestions or
+        # self.conditions.
+        if key == "conditions":
+            if isinstance(value, str):
+                self.conditions = [value]
+            else:
+                self.conditions = value
+        elif key == "suggestions":
+            if isinstance(value, str):
+                self.suggestions = [value]
+            else:
+                self.suggestions = value
+        elif key == "overlap_time_period":
+            self.overlap_time_seconds = value
+
+    def get_suggestions(self):
+        return self.suggestions
+
+    def perform_checks(self):
+        if not self.conditions or len(self.conditions) < 1:
+            raise ValueError(self.name + ": rule must have at least one condition")
+        if not self.suggestions or len(self.suggestions) < 1:
+            raise ValueError(self.name + ": rule must have at least one suggestion")
+        if self.overlap_time_seconds:
+            if len(self.conditions) != 2:
+                raise ValueError(
+                    self.name
+                    + ": rule must be associated with 2 conditions\
+                    in order to check for a time dependency between them"
+                )
+            time_format = "^\d+[s|m|h|d]$"  # noqa
+            if not re.match(time_format, self.overlap_time_seconds, re.IGNORECASE):
+                raise ValueError(
+                    self.name + ": overlap_time_seconds format: \d+[s|m|h|d]"
+                )
+            else:  # convert to seconds
+                in_seconds = int(self.overlap_time_seconds[:-1])
+                if self.overlap_time_seconds[-1] == "m":
+                    in_seconds *= 60
+                elif self.overlap_time_seconds[-1] == "h":
+                    in_seconds *= 60 * 60
+                elif self.overlap_time_seconds[-1] == "d":
+                    in_seconds *= 24 * 60 * 60
+                self.overlap_time_seconds = in_seconds
+
+    def get_overlap_timestamps(self, key1_trigger_epochs, key2_trigger_epochs):
+        # this method takes in 2 timeseries i.e. timestamps at which the
+        # rule's 2 TIME_SERIES conditions were triggered and it finds
+        # (if present) the first pair of timestamps at which the 2 conditions
+        # were triggered within 'overlap_time_seconds' of each other
+        key1_lower_bounds = [
+            epoch - self.overlap_time_seconds for epoch in key1_trigger_epochs
+        ]
+        key1_lower_bounds.sort()
+        key2_trigger_epochs.sort()
+        trigger_ix = 0
+        overlap_pair = None
+        for key1_lb in key1_lower_bounds:
+            while key2_trigger_epochs[trigger_ix] < key1_lb and trigger_ix < len(
+                key2_trigger_epochs
+            ):
+                trigger_ix += 1
+            if trigger_ix >= len(key2_trigger_epochs):
+                break
+            if key2_trigger_epochs[trigger_ix] <= key1_lb + (
+                2 * self.overlap_time_seconds
+            ):
+                overlap_pair = (
+                    key2_trigger_epochs[trigger_ix],
+                    key1_lb + self.overlap_time_seconds,
+                )
+                break
+        return overlap_pair
+
+    def get_trigger_entities(self):
+        return self.trigger_entities
+
+    def get_trigger_column_families(self):
+        return self.trigger_column_families
+
+    def is_triggered(self, conditions_dict, column_families):
+        if self.overlap_time_seconds:
+            condition1 = conditions_dict[self.conditions[0]]
+            condition2 = conditions_dict[self.conditions[1]]
+            if not (
+                condition1.get_data_source() is DataSource.Type.TIME_SERIES
+                and condition2.get_data_source() is DataSource.Type.TIME_SERIES
+            ):
+                raise ValueError(self.name + ": need 2 timeseries conditions")
+
+            map1 = condition1.get_trigger()
+            map2 = condition2.get_trigger()
+            if not (map1 and map2):
+                return False
+
+            self.trigger_entities = {}
+            is_triggered = False
+            entity_intersection = set(map1.keys()).intersection(set(map2.keys()))
+            for entity in entity_intersection:
+                overlap_timestamps_pair = self.get_overlap_timestamps(
+                    list(map1[entity].keys()), list(map2[entity].keys())
+                )
+                if overlap_timestamps_pair:
+                    self.trigger_entities[entity] = overlap_timestamps_pair
+                    is_triggered = True
+            if is_triggered:
+                self.trigger_column_families = set(column_families)
+            return is_triggered
+        else:
+            all_conditions_triggered = True
+            self.trigger_column_families = set(column_families)
+            for cond_name in self.conditions:
+                cond = conditions_dict[cond_name]
+                if not cond.get_trigger():
+                    all_conditions_triggered = False
+                    break
+                if (
+                    cond.get_data_source() is DataSource.Type.LOG
+                    or cond.get_data_source() is DataSource.Type.DB_OPTIONS
+                ):
+                    cond_col_fam = set(cond.get_trigger().keys())
+                    if NO_COL_FAMILY in cond_col_fam:
+                        cond_col_fam = set(column_families)
+                    self.trigger_column_families = (
+                        self.trigger_column_families.intersection(cond_col_fam)
+                    )
+                elif cond.get_data_source() is DataSource.Type.TIME_SERIES:
+                    cond_entities = set(cond.get_trigger().keys())
+                    if self.trigger_entities is None:
+                        self.trigger_entities = cond_entities
+                    else:
+                        self.trigger_entities = self.trigger_entities.intersection(
+                            cond_entities
+                        )
+                if not (self.trigger_entities or self.trigger_column_families):
+                    all_conditions_triggered = False
+                    break
+            if not all_conditions_triggered:  # clean up if rule not triggered
+                self.trigger_column_families = None
+                self.trigger_entities = None
+            return all_conditions_triggered
+
+    def __repr__(self):
+        # Append conditions
+        rule_string = "Rule: " + self.name + " has conditions:: "
+        is_first = True
+        for cond in self.conditions:
+            if is_first:
+                rule_string += cond
+                is_first = False
+            else:
+                rule_string += " AND " + cond
+        # Append suggestions
+        rule_string += "\nsuggestions:: "
+        is_first = True
+        for sugg in self.suggestions:
+            if is_first:
+                rule_string += sugg
+                is_first = False
+            else:
+                rule_string += ", " + sugg
+        if self.trigger_entities:
+            rule_string += ", entities:: " + str(self.trigger_entities)
+        if self.trigger_column_families:
+            rule_string += ", col_fam:: " + str(self.trigger_column_families)
+        # Return constructed string
+        return rule_string
+
+
+class Suggestion(Section):
+    class Action(Enum):
+        set = 1
+        increase = 2
+        decrease = 3
+
+    def __init__(self, name):
+        super().__init__(name)
+        self.option = None
+        self.action = None
+        self.suggested_values = None
+        self.description = None
+
+    def set_parameter(self, key, value):
+        if key == "option":
+            # Note:
+            # case 1: 'option' is supported by Rocksdb OPTIONS file; in this
+            # case the option belongs to one of the sections in the config
+            # file and it's name is prefixed by "<section_type>."
+            # case 2: 'option' is not supported by Rocksdb OPTIONS file; the
+            # option is not expected to have the character '.' in its name
+            self.option = value
+        elif key == "action":
+            if self.option and not value:
+                raise ValueError(self.name + ": provide action for option")
+            self.action = self.Action[value]
+        elif key == "suggested_values":
+            if isinstance(value, str):
+                self.suggested_values = [value]
+            else:
+                self.suggested_values = value
+        elif key == "description":
+            self.description = value
+
+    def perform_checks(self):
+        if not self.description:
+            if not self.option:
+                raise ValueError(self.name + ": provide option or description")
+            if not self.action:
+                raise ValueError(self.name + ": provide action for option")
+            if self.action is self.Action.set and not self.suggested_values:
+                raise ValueError(self.name + ": provide suggested value for option")
+
+    def __repr__(self):
+        sugg_string = "Suggestion: " + self.name
+        if self.description:
+            sugg_string += " description : " + self.description
+        else:
+            sugg_string += " option : " + self.option + " action : " + self.action.name
+            if self.suggested_values:
+                sugg_string += " suggested_values : " + str(self.suggested_values)
+        return sugg_string
+
+
+class Condition(Section):
+    def __init__(self, name):
+        super().__init__(name)
+        self.data_source = None
+        self.trigger = None
+
+    def perform_checks(self):
+        if not self.data_source:
+            raise ValueError(self.name + ": condition not tied to data source")
+
+    def set_data_source(self, data_source):
+        self.data_source = data_source
+
+    def get_data_source(self):
+        return self.data_source
+
+    def reset_trigger(self):
+        self.trigger = None
+
+    def set_trigger(self, condition_trigger):
+        self.trigger = condition_trigger
+
+    def get_trigger(self):
+        return self.trigger
+
+    def is_triggered(self):
+        if self.trigger:
+            return True
+        return False
+
+    def set_parameter(self, key, value):
+        # must be defined by the subclass
+        raise NotImplementedError(self.name + ": provide source for condition")
+
+
+class LogCondition(Condition):
+    @classmethod
+    def create(cls, base_condition):
+        base_condition.set_data_source(DataSource.Type["LOG"])
+        base_condition.__class__ = cls
+        return base_condition
+
+    def set_parameter(self, key, value):
+        if key == "regex":
+            self.regex = value
+
+    def perform_checks(self):
+        super().perform_checks()
+        if not self.regex:
+            raise ValueError(self.name + ": provide regex for log condition")
+
+    def __repr__(self):
+        log_cond_str = "LogCondition: " + self.name
+        log_cond_str += " regex: " + self.regex
+        # if self.trigger:
+        #     log_cond_str += (" trigger: " + str(self.trigger))
+        return log_cond_str
+
+
+class OptionCondition(Condition):
+    @classmethod
+    def create(cls, base_condition):
+        base_condition.set_data_source(DataSource.Type["DB_OPTIONS"])
+        base_condition.__class__ = cls
+        return base_condition
+
+    def set_parameter(self, key, value):
+        if key == "options":
+            if isinstance(value, str):
+                self.options = [value]
+            else:
+                self.options = value
+        elif key == "evaluate":
+            self.eval_expr = value
+
+    def perform_checks(self):
+        super().perform_checks()
+        if not self.options:
+            raise ValueError(self.name + ": options missing in condition")
+        if not self.eval_expr:
+            raise ValueError(self.name + ": expression missing in condition")
+
+    def __repr__(self):
+        opt_cond_str = "OptionCondition: " + self.name
+        opt_cond_str += " options: " + str(self.options)
+        opt_cond_str += " expression: " + self.eval_expr
+        if self.trigger:
+            opt_cond_str += " trigger: " + str(self.trigger)
+        return opt_cond_str
+
+
+class TimeSeriesCondition(Condition):
+    @classmethod
+    def create(cls, base_condition):
+        base_condition.set_data_source(DataSource.Type["TIME_SERIES"])
+        base_condition.__class__ = cls
+        return base_condition
+
+    def set_parameter(self, key, value):
+        if key == "keys":
+            if isinstance(value, str):
+                self.keys = [value]
+            else:
+                self.keys = value
+        elif key == "behavior":
+            self.behavior = TimeSeriesData.Behavior[value]
+        elif key == "rate_threshold":
+            self.rate_threshold = float(value)
+        elif key == "window_sec":
+            self.window_sec = int(value)
+        elif key == "evaluate":
+            self.expression = value
+        elif key == "aggregation_op":
+            self.aggregation_op = TimeSeriesData.AggregationOperator[value]
+
+    def perform_checks(self):
+        if not self.keys:
+            raise ValueError(self.name + ": specify timeseries key")
+        if not self.behavior:
+            raise ValueError(self.name + ": specify triggering behavior")
+        if self.behavior is TimeSeriesData.Behavior.bursty:
+            if not self.rate_threshold:
+                raise ValueError(self.name + ": specify rate burst threshold")
+            if not self.window_sec:
+                self.window_sec = 300  # default window length is 5 minutes
+            if len(self.keys) > 1:
+                raise ValueError(self.name + ": specify only one key")
+        elif self.behavior is TimeSeriesData.Behavior.evaluate_expression:
+            if not (self.expression):
+                raise ValueError(self.name + ": specify evaluation expression")
+        else:
+            raise ValueError(self.name + ": trigger behavior not supported")
+
+    def __repr__(self):
+        ts_cond_str = "TimeSeriesCondition: " + self.name
+        ts_cond_str += " statistics: " + str(self.keys)
+        ts_cond_str += " behavior: " + self.behavior.name
+        if self.behavior is TimeSeriesData.Behavior.bursty:
+            ts_cond_str += " rate_threshold: " + str(self.rate_threshold)
+            ts_cond_str += " window_sec: " + str(self.window_sec)
+        if self.behavior is TimeSeriesData.Behavior.evaluate_expression:
+            ts_cond_str += " expression: " + self.expression
+            if hasattr(self, "aggregation_op"):
+                ts_cond_str += " aggregation_op: " + self.aggregation_op.name
+        if self.trigger:
+            ts_cond_str += " trigger: " + str(self.trigger)
+        return ts_cond_str
+
+
+class RulesSpec:
+    def __init__(self, rules_path):
+        self.file_path = rules_path
+
+    def initialise_fields(self):
+        self.rules_dict = {}
+        self.conditions_dict = {}
+        self.suggestions_dict = {}
+
+    def perform_section_checks(self):
+        for rule in self.rules_dict.values():
+            rule.perform_checks()
+        for cond in self.conditions_dict.values():
+            cond.perform_checks()
+        for sugg in self.suggestions_dict.values():
+            sugg.perform_checks()
+
+    def load_rules_from_spec(self):
+        self.initialise_fields()
+        with open(self.file_path, "r") as db_rules:
+            curr_section = None
+            for line in db_rules:
+                line = IniParser.remove_trailing_comment(line)
+                if not line:
+                    continue
+                element = IniParser.get_element(line)
+                if element is IniParser.Element.comment:
+                    continue
+                elif element is not IniParser.Element.key_val:
+                    curr_section = element  # it's a new IniParser header
+                    section_name = IniParser.get_section_name(line)
+                    if element is IniParser.Element.rule:
+                        new_rule = Rule(section_name)
+                        self.rules_dict[section_name] = new_rule
+                    elif element is IniParser.Element.cond:
+                        new_cond = Condition(section_name)
+                        self.conditions_dict[section_name] = new_cond
+                    elif element is IniParser.Element.sugg:
+                        new_suggestion = Suggestion(section_name)
+                        self.suggestions_dict[section_name] = new_suggestion
+                elif element is IniParser.Element.key_val:
+                    key, value = IniParser.get_key_value_pair(line)
+                    if curr_section is IniParser.Element.rule:
+                        new_rule.set_parameter(key, value)
+                    elif curr_section is IniParser.Element.cond:
+                        if key == "source":
+                            if value == "LOG":
+                                new_cond = LogCondition.create(new_cond)
+                            elif value == "OPTIONS":
+                                new_cond = OptionCondition.create(new_cond)
+                            elif value == "TIME_SERIES":
+                                new_cond = TimeSeriesCondition.create(new_cond)
+                        else:
+                            new_cond.set_parameter(key, value)
+                    elif curr_section is IniParser.Element.sugg:
+                        new_suggestion.set_parameter(key, value)
+
+    def get_rules_dict(self):
+        return self.rules_dict
+
+    def get_conditions_dict(self):
+        return self.conditions_dict
+
+    def get_suggestions_dict(self):
+        return self.suggestions_dict
+
+    def get_triggered_rules(self, data_sources, column_families):
+        self.trigger_conditions(data_sources)
+        triggered_rules = []
+        for rule in self.rules_dict.values():
+            if rule.is_triggered(self.conditions_dict, column_families):
+                triggered_rules.append(rule)
+        return triggered_rules
+
+    def trigger_conditions(self, data_sources):
+        for source_type in data_sources:
+            cond_subset = [
+                cond
+                for cond in self.conditions_dict.values()
+                if cond.get_data_source() is source_type
+            ]
+            if not cond_subset:
+                continue
+            for source in data_sources[source_type]:
+                source.check_and_trigger_conditions(cond_subset)
+
+    def print_rules(self, rules):
+        for rule in rules:
+            print("\nRule: " + rule.name)
+            for cond_name in rule.conditions:
+                print(repr(self.conditions_dict[cond_name]))
+            for sugg_name in rule.suggestions:
+                print(repr(self.suggestions_dict[sugg_name]))
+            if rule.trigger_entities:
+                print("scope: entities:")
+                print(rule.trigger_entities)
+            if rule.trigger_column_families:
+                print("scope: col_fam:")
+                print(rule.trigger_column_families)
diff --git a/src/rocksdb/tools/advisor/advisor/rule_parser_example.py b/src/rocksdb/tools/advisor/advisor/rule_parser_example.py
new file mode 100644
index 000000000..6c04ff2bf
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/rule_parser_example.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import argparse
+
+from advisor.db_log_parser import DatabaseLogs, DataSource
+from advisor.db_options_parser import DatabaseOptions
+from advisor.db_stats_fetcher import LogStatsParser, OdsStatsFetcher
+from advisor.rule_parser import RulesSpec
+
+
+def main(args):
+    # initialise the RulesSpec parser
+    rule_spec_parser = RulesSpec(args.rules_spec)
+    rule_spec_parser.load_rules_from_spec()
+    rule_spec_parser.perform_section_checks()
+    # initialize the DatabaseOptions object
+    db_options = DatabaseOptions(args.rocksdb_options)
+    # Create DatabaseLogs object
+    db_logs = DatabaseLogs(args.log_files_path_prefix, db_options.get_column_families())
+    # Create the Log STATS object
+    db_log_stats = LogStatsParser(
+        args.log_files_path_prefix, args.stats_dump_period_sec
+    )
+    data_sources = {
+        DataSource.Type.DB_OPTIONS: [db_options],
+        DataSource.Type.LOG: [db_logs],
+        DataSource.Type.TIME_SERIES: [db_log_stats],
+    }
+    if args.ods_client:
+        data_sources[DataSource.Type.TIME_SERIES].append(
+            OdsStatsFetcher(
+                args.ods_client,
+                args.ods_entity,
+                args.ods_tstart,
+                args.ods_tend,
+                args.ods_key_prefix,
+            )
+        )
+    triggered_rules = rule_spec_parser.get_triggered_rules(
+        data_sources, db_options.get_column_families()
+    )
+    rule_spec_parser.print_rules(triggered_rules)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Use this script to get\
+        suggestions for improving Rocksdb performance."
+    )
+    parser.add_argument(
+        "--rules_spec",
+        required=True,
+        type=str,
+        help="path of the file containing the expert-specified Rules",
+    )
+    parser.add_argument(
+        "--rocksdb_options",
+        required=True,
+        type=str,
+        help="path of the starting Rocksdb OPTIONS file",
+    )
+    parser.add_argument(
+        "--log_files_path_prefix",
+        required=True,
+        type=str,
+        help="path prefix of the Rocksdb LOG files",
+    )
+    parser.add_argument(
+        "--stats_dump_period_sec",
+        required=True,
+        type=int,
+        help="the frequency (in seconds) at which STATISTICS are printed to "
+        + "the Rocksdb LOG file",
+    )
+    # ODS arguments
+    parser.add_argument("--ods_client", type=str, help="the ODS client binary")
+    parser.add_argument(
+        "--ods_entity",
+        type=str,
+        help="the servers for which the ODS stats need to be fetched",
+    )
+    parser.add_argument(
+        "--ods_key_prefix",
+        type=str,
+        help="the prefix that needs to be attached to the keys of time "
+        + "series to be fetched from ODS",
+    )
+    parser.add_argument(
+        "--ods_tstart", type=int, help="start time of timeseries to be fetched from ODS"
+    )
+    parser.add_argument(
+        "--ods_tend", type=int, help="end time of timeseries to be fetched from ODS"
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/src/rocksdb/tools/advisor/advisor/rules.ini b/src/rocksdb/tools/advisor/advisor/rules.ini
new file mode 100644
index 000000000..ec7a07e60
--- /dev/null
+++ b/src/rocksdb/tools/advisor/advisor/rules.ini
@@ -0,0 +1,214 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+#
+# FORMAT: very similar to the Rocksdb ini file in terms of syntax
+# (refer rocksdb/examples/rocksdb_option_file_example.ini)
+#
+# The Rules INI file is made up of multiple sections and each section is made
+# up of multiple key-value pairs. The recognized section types are:
+# Rule, Suggestion, Condition. Each section must have a name specified in ""
+# in the section header. This name acts as an identifier in that section
+# type's namespace. A section header looks like:
+# [<section_type> "<section_name_identifier>"]
+#
+# There should be at least one Rule section in the file with its corresponding
+# Condition and Suggestion sections. A Rule is triggered only when all of its
+# conditions are triggered. The order in which a Rule's conditions and
+# suggestions are specified has no significance.
+#
+# A Condition must be associated with a data source specified by the parameter
+# 'source' and this must be the first parameter specified for the Condition.
+# A condition can be associated with one or more Rules.
+#
+# A Suggestion is an advised change to a Rocksdb option to improve the
+# performance of the database in some way. Every suggestion can be a part of
+# one or more Rules.
+
+[Rule "stall-too-many-memtables"]
+suggestions=inc-bg-flush:inc-write-buffer
+conditions=stall-too-many-memtables
+
+[Condition "stall-too-many-memtables"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Rule "stall-too-many-L0"]
+suggestions=inc-max-subcompactions:inc-max-bg-compactions:inc-write-buffer-size:dec-max-bytes-for-level-base:inc-l0-slowdown-writes-trigger
+conditions=stall-too-many-L0
+
+[Condition "stall-too-many-L0"]
+source=LOG
+regex=Stalling writes because we have \d+ level-0 files
+
+[Rule "stop-too-many-L0"]
+suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-l0-stop-writes-trigger
+conditions=stop-too-many-L0
+
+[Condition "stop-too-many-L0"]
+source=LOG
+regex=Stopping writes because we have \d+ level-0 files
+
+[Rule "stall-too-many-compaction-bytes"]
+suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-hard-pending-compaction-bytes-limit:inc-soft-pending-compaction-bytes-limit
+conditions=stall-too-many-compaction-bytes
+
+[Condition "stall-too-many-compaction-bytes"]
+source=LOG
+regex=Stalling writes because of estimated pending compaction bytes \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+suggested_values=2
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
+
+[Suggestion "inc-max-subcompactions"]
+option=DBOptions.max_subcompactions
+action=increase
+
+[Suggestion "inc-max-bg-compactions"]
+option=DBOptions.max_background_compactions
+action=increase
+suggested_values=2
+
+[Suggestion "inc-write-buffer-size"]
+option=CFOptions.write_buffer_size
+action=increase
+
+[Suggestion "dec-max-bytes-for-level-base"]
+option=CFOptions.max_bytes_for_level_base
+action=decrease
+
+[Suggestion "inc-l0-slowdown-writes-trigger"]
+option=CFOptions.level0_slowdown_writes_trigger
+action=increase
+
+[Suggestion "inc-l0-stop-writes-trigger"]
+option=CFOptions.level0_stop_writes_trigger
+action=increase
+
+[Suggestion "inc-hard-pending-compaction-bytes-limit"]
+option=CFOptions.hard_pending_compaction_bytes_limit
+action=increase
+
+[Suggestion "inc-soft-pending-compaction-bytes-limit"]
+option=CFOptions.soft_pending_compaction_bytes_limit
+action=increase
+
+[Rule "level0-level1-ratio"]
+conditions=level0-level1-ratio
+suggestions=inc-base-max-bytes
+
+[Condition "level0-level1-ratio"]
+source=OPTIONS
+options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base
+evaluate=int(options[0])*int(options[1])-int(options[2])>=1  # should evaluate to a boolean, condition triggered if evaluates to true
+
+[Suggestion "inc-base-max-bytes"]
+option=CFOptions.max_bytes_for_level_base
+action=increase
+
+[Rules "tuning-iostat-burst"]
+conditions=large-db-get-p99
+suggestions=bytes-per-sync-non0:wal-bytes-per-sync-non0:set-rate-limiter
+#overlap_time_period=10m
+
+[Condition "write-burst"]
+source=TIME_SERIES
+keys=dyno.flash_write_bytes_per_sec
+behavior=bursty
+window_sec=300  # the smaller this window, the more sensitivity to changes in the time series, so the rate_threshold should be bigger; when it's 60, then same as diff(%)
+rate_threshold=20
+
+[Condition "large-p99-read-latency"]
+source=TIME_SERIES
+keys=[]rocksdb.read.block.get.micros.p99
+behavior=bursty
+window_sec=300
+rate_threshold=10
+
+[Condition "large-db-get-p99"]
+source=TIME_SERIES
+keys=[]rocksdb.db.get.micros.p50:[]rocksdb.db.get.micros.p99
+behavior=evaluate_expression
+evaluate=(keys[1]/keys[0])>5
+
+[Suggestion "bytes-per-sync-non0"]
+option=DBOptions.bytes_per_sync
+action=set
+suggested_values=1048576
+
+[Suggestion "wal-bytes-per-sync-non0"]
+option=DBOptions.wal_bytes_per_sync
+action=set
+suggested_values=1048576
+
+[Suggestion "set-rate-limiter"]
+option=rate_limiter_bytes_per_sec
+action=set
+suggested_values=1024000
+
+[Rule "bloom-filter-percent-useful"]
+conditions=bloom-filter-percent-useful
+suggestions=inc-bloom-bits-per-key
+
+[Condition "bloom-filter-percent-useful"]
+source=TIME_SERIES
+keys=[]rocksdb.bloom.filter.useful.count:[]rocksdb.bloom.filter.full.positive.count:[]rocksdb.bloom.filter.full.true.positive.count
+behavior=evaluate_expression
+evaluate=((keys[0]+keys[2])/(keys[0]+keys[1]))<0.9  # should evaluate to a boolean
+aggregation_op=latest
+
+[Rule "bloom-not-enabled"]
+conditions=bloom-not-enabled
+suggestions=inc-bloom-bits-per-key
+
+[Condition "bloom-not-enabled"]
+source=TIME_SERIES
+keys=[]rocksdb.bloom.filter.useful.count:[]rocksdb.bloom.filter.full.positive.count:[]rocksdb.bloom.filter.full.true.positive.count
+behavior=evaluate_expression
+evaluate=keys[0]+keys[1]+keys[2]==0
+aggregation_op=avg
+
+[Suggestion "inc-bloom-bits-per-key"]
+option=bloom_bits
+action=increase
+suggested_values=2
+
+[Rule "small-l0-files"]
+conditions=small-l0-files
+suggestions=dec-max-bytes-for-level-base:inc-write-buffer-size
+
+[Condition "small-l0-files"]
+source=OPTIONS
+options=CFOptions.max_bytes_for_level_base:CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size
+evaluate=int(options[0])>(10*int(options[1])*int(options[2]))
+
+[Rule "decompress-time-long"]
+conditions=decompress-time-long
+suggestions=dec-block-size:inc-block-cache-size:faster-compression-type
+
+[Condition "decompress-time-long"]
+source=TIME_SERIES
+keys=block_decompress_time:block_read_time:block_checksum_time
+behavior=evaluate_expression
+evaluate=(keys[0]/(keys[0]+keys[1]+keys[2]))>0.3
+
+[Suggestion "dec-block-size"]
+option=TableOptions.BlockBasedTable.block_size
+action=decrease
+
+[Suggestion "inc-block-cache-size"]
+option=cache_size
+action=increase
+suggested_values=16000000
+
+[Suggestion "faster-compression-type"]
+option=CFOptions.compression
+action=set
+suggested_values=kLZ4Compression
diff --git a/src/rocksdb/tools/advisor/test/__init__.py b/src/rocksdb/tools/advisor/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/__init__.py
diff --git a/src/rocksdb/tools/advisor/test/input_files/LOG-0 b/src/rocksdb/tools/advisor/test/input_files/LOG-0
new file mode 100644
index 000000000..3c9d51641
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/LOG-0
@@ -0,0 +1,30 @@
+2018/05/25-14:30:05.601692 7f82bd676200 RocksDB version: 5.14.0
+2018/05/25-14:30:07.626719 7f82ba72e700 (Original Log Time 2018/05/25-14:30:07.621966) [db/db_impl_compaction_flush.cc:1424] Calling FlushMemTableToOutputFile with column family [default], flush slots available 1, compaction slots available 1, flush slots scheduled 1, compaction slots scheduled 0
+2018/05/25-14:30:07.626725 7f82ba72e700 [db/flush_job.cc:301] [default] [JOB 3] Flushing memtable with next log file: 8
+2018/05/25-14:30:07.626738 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283807626732, "job": 3, "event": "flush_started", "num_memtables": 1, "num_entries": 28018, "num_deletes": 0, "memory_usage": 4065512, "flush_reason": "Write Buffer Full"}
+2018/05/25-14:30:07.626740 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 3] Level-0 flush table #10: started
+2018/05/25-14:30:07.764232 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #11. Immutable memtables: 1.
+2018/05/25-14:30:07.764240 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2
+2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stalling writes because we have 4 level-0 files rate 39886
+2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stopping writes because we have 4 level-0 files rate 39886
+2018/05/25-14:30:09.398302 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283809398276, "cf_name": "default", "job": 3, "event": "table_file_creation", "file_number": 10, "file_size": 1890434, "table_properties": {"data_size": 1876749, "index_size": 23346, "filter_size": 0, "raw_key_size": 663120, "raw_average_key_size": 24, "raw_value_size": 2763000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27630, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:30:09.398351 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 3] Level-0 flush table #10: 1890434 bytes OK
+2018/05/25-14:30:25.491635 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table #23: started
+2018/05/25-14:30:25.643618 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #24. Immutable memtables: 1.
+2018/05/25-14:30:25.643633 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2
+2018/05/25-14:30:27.288181 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283827288158, "cf_name": "default", "job": 10, "event": "table_file_creation", "file_number": 23, "file_size": 1893200, "table_properties": {"data_size": 1879460, "index_size": 23340, "filter_size": 0, "raw_key_size": 663360, "raw_average_key_size": 24, "raw_value_size": 2764000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27640, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:30:27.288210 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 10] Level-0 flush table #23: 1893200 bytes OK
+2018/05/25-14:30:27.289353 7f82ba72e700 [WARN] [db/column_family.cc:764] [default] Stalling writes because of estimated pending compaction bytes 14410584
+2018/05/25-14:30:27.289390 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.288829) [db/memtable_list.cc:377] [default] Level-0 commit table #23 started
+2018/05/25-14:30:27.289393 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.289332) [db/memtable_list.cc:409] [default] Level-0 commit table #23: memtable #1 done
+2018/05/25-14:34:21.047206 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527284061047181, "cf_name": "default", "job": 44, "event": "table_file_creation", "file_number": 84, "file_size": 1890780, "table_properties": {"data_size": 1877100, "index_size": 23309, "filter_size": 0, "raw_key_size": 662808, "raw_average_key_size": 24, "raw_value_size": 2761700, "raw_average_value_size": 100, "num_data_blocks": 837, "num_entries": 27617, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 44] Level-0 flush table #84: 1890780 bytes OK
+2018/05/25-14:34:21.048017 7f82ba72e700 (Original Log Time 2018/05/25-14:34:21.048005) EVENT_LOG_v1 {"time_micros": 1527284061047997, "job": 44, "event": "flush_finished", "output_compression": "Snappy", "lsm_state": [2, 1, 0, 0, 0, 0, 0], "immutable_memtables": 1}
+2018/05/25-14:34:21.048592 7f82bd676200 [DEBUG] [db/db_impl_files.cc:261] [JOB 45] Delete /tmp/rocksdbtest-155919/dbbench/000084.sst type=2 #84 -- OK
+2018/05/25-14:34:21.048603 7f82bd676200 EVENT_LOG_v1 {"time_micros": 1527284061048600, "job": 45, "event": "table_file_deletion", "file_number": 84}
+2018/05/25-14:34:21.048981 7f82bd676200 [db/db_impl.cc:398] Shutdown complete
+2018/05/25-14:34:21.049000 7f82bd676200 [db/db_impl.cc:563] [col-fam-A] random log message for testing
+2018/05/25-14:34:21.049010 7f82bd676200 [db/db_impl.cc:234] [col-fam-B] log continuing on next line
+remaining part of the log
+2018/05/25-14:34:21.049020 7f82bd676200 [db/db_impl.cc:653] [col-fam-A] another random log message
+2018/05/25-14:34:21.049025 7f82bd676200 [db/db_impl.cc:331] [unknown] random log message no column family
diff --git a/src/rocksdb/tools/advisor/test/input_files/LOG-1 b/src/rocksdb/tools/advisor/test/input_files/LOG-1
new file mode 100644
index 000000000..b163f9a99
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/LOG-1
@@ -0,0 +1,25 @@
+2018/05/25-14:30:05.601692 7f82bd676200 RocksDB version: 5.14.0
+2018/05/25-14:30:07.626719 7f82ba72e700 (Original Log Time 2018/05/25-14:30:07.621966) [db/db_impl_compaction_flush.cc:1424] Calling FlushMemTableToOutputFile with column family [default], flush slots available 1, compaction slots available 1, flush slots scheduled 1, compaction slots scheduled 0
+2018/05/25-14:30:07.626725 7f82ba72e700 [db/flush_job.cc:301] [default] [JOB 3] Flushing memtable with next log file: 8
+2018/05/25-14:30:07.626738 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283807626732, "job": 3, "event": "flush_started", "num_memtables": 1, "num_entries": 28018, "num_deletes": 0, "memory_usage": 4065512, "flush_reason": "Write Buffer Full"}
+2018/05/25-14:30:07.626740 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 3] Level-0 flush table #10: started
+2018/05/25-14:30:07.764232 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #11. Immutable memtables: 1.
+2018/05/25-14:30:07.764240 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2
+2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stalling writes because we have 4 level-0 files rate 39886
+2018/05/23-11:53:12.800143 7f9f36b40700 [WARN] [db/column_family.cc:799] [default] Stopping writes because we have 4 level-0 files rate 39886
+2018/05/25-14:30:09.398302 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283809398276, "cf_name": "default", "job": 3, "event": "table_file_creation", "file_number": 10, "file_size": 1890434, "table_properties": {"data_size": 1876749, "index_size": 23346, "filter_size": 0, "raw_key_size": 663120, "raw_average_key_size": 24, "raw_value_size": 2763000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27630, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:30:09.398351 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 3] Level-0 flush table #10: 1890434 bytes OK
+2018/05/25-14:30:25.491635 7f82ba72e700 [db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table #23: started
+2018/05/25-14:30:25.643618 7f82b2f20700 [db/db_impl_write.cc:1373] [default] New memtable created with log file: #24. Immutable memtables: 1.
+2018/05/25-14:30:25.643633 7f82b2f20700 [WARN] [db/column_family.cc:743] [default] Stopping writes because we have 2 immutable memtables (waiting for flush), max_write_buffer_number is set to 2
+2018/05/25-14:30:27.288181 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527283827288158, "cf_name": "default", "job": 10, "event": "table_file_creation", "file_number": 23, "file_size": 1893200, "table_properties": {"data_size": 1879460, "index_size": 23340, "filter_size": 0, "raw_key_size": 663360, "raw_average_key_size": 24, "raw_value_size": 2764000, "raw_average_value_size": 100, "num_data_blocks": 838, "num_entries": 27640, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:30:27.288210 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 10] Level-0 flush table #23: 1893200 bytes OK
+2018/05/25-14:30:27.289353 7f82ba72e700 [WARN] [db/column_family.cc:764] [default] Stopping writes because of estimated pending compaction bytes 14410584
+2018/05/25-14:30:27.289390 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.288829) [db/memtable_list.cc:377] [default] Level-0 commit table #23 started
+2018/05/25-14:30:27.289393 7f82ba72e700 (Original Log Time 2018/05/25-14:30:27.289332) [db/memtable_list.cc:409] [default] Level-0 commit table #23: memtable #1 done
+2018/05/25-14:34:21.047206 7f82ba72e700 EVENT_LOG_v1 {"time_micros": 1527284061047181, "cf_name": "default", "job": 44, "event": "table_file_creation", "file_number": 84, "file_size": 1890780, "table_properties": {"data_size": 1877100, "index_size": 23309, "filter_size": 0, "raw_key_size": 662808, "raw_average_key_size": 24, "raw_value_size": 2761700, "raw_average_value_size": 100, "num_data_blocks": 837, "num_entries": 27617, "filter_policy_name": "", "kDeletedKeys": "0", "kMergeOperands": "0"}}
+2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] [default] [JOB 44] Level-0 flush table #84: 1890780 bytes OK
+2018/05/25-14:34:21.048017 7f82ba72e700 (Original Log Time 2018/05/25-14:34:21.048005) EVENT_LOG_v1 {"time_micros": 1527284061047997, "job": 44, "event": "flush_finished", "output_compression": "Snappy", "lsm_state": [2, 1, 0, 0, 0, 0, 0], "immutable_memtables": 1}
+2018/05/25-14:34:21.048592 7f82bd676200 [DEBUG] [db/db_impl_files.cc:261] [JOB 45] Delete /tmp/rocksdbtest-155919/dbbench/000084.sst type=2 #84 -- OK
+2018/05/25-14:34:21.048603 7f82bd676200 EVENT_LOG_v1 {"time_micros": 1527284061048600, "job": 45, "event": "table_file_deletion", "file_number": 84}
+2018/05/25-14:34:21.048981 7f82bd676200 [db/db_impl.cc:398] Shutdown complete
diff --git a/src/rocksdb/tools/advisor/test/input_files/OPTIONS-000005 b/src/rocksdb/tools/advisor/test/input_files/OPTIONS-000005
new file mode 100644
index 000000000..009edb04d
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/OPTIONS-000005
@@ -0,0 +1,49 @@
+# This is a RocksDB option file.
+#
+# For detailed file format spec, please refer to the example file
+# in examples/rocksdb_option_file_example.ini
+#
+
+[Version]
+  rocksdb_version=5.14.0
+  options_file_version=1.1
+
+[DBOptions]
+  manual_wal_flush=false
+  allow_ingest_behind=false
+  db_write_buffer_size=0
+  db_log_dir=
+  random_access_max_buffer_size=1048576
+
+[CFOptions "default"]
+  ttl=0
+  max_bytes_for_level_base=268435456
+  max_bytes_for_level_multiplier=10.000000
+  level0_file_num_compaction_trigger=4
+  level0_stop_writes_trigger=36
+  write_buffer_size=4194000
+  min_write_buffer_number_to_merge=1
+  num_levels=7
+  compaction_filter_factory=nullptr
+  compaction_style=kCompactionStyleLevel
+
+[TableOptions/BlockBasedTable "default"]
+  block_align=false
+  index_type=kBinarySearch
+
+[CFOptions "col_fam_A"]
+ttl=0
+max_bytes_for_level_base=268435456
+max_bytes_for_level_multiplier=10.000000
+level0_file_num_compaction_trigger=5
+level0_stop_writes_trigger=36
+write_buffer_size=1024000
+min_write_buffer_number_to_merge=1
+num_levels=5
+compaction_filter_factory=nullptr
+compaction_style=kCompactionStyleLevel
+
+[TableOptions/BlockBasedTable "col_fam_A"]
+block_align=true
+block_restart_interval=16
+index_type=kBinarySearch
diff --git a/src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts b/src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts
new file mode 100644
index 000000000..e8ade9e3e
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/log_stats_parser_keys_ts
@@ -0,0 +1,3 @@
+rocksdb.number.block.decompressed.count: 1530896335 88.0, 1530896361 788338.0, 1530896387 1539256.0, 1530896414 2255696.0, 1530896440 3009325.0, 1530896466 3767183.0, 1530896492 4529775.0, 1530896518 5297809.0, 1530896545 6033802.0, 1530896570 6794129.0
+rocksdb.db.get.micros.p50: 1530896335 295.5, 1530896361 16.561841, 1530896387 16.20677, 1530896414 16.31508, 1530896440 16.346602, 1530896466 16.284669, 1530896492 16.16005, 1530896518 16.069096, 1530896545 16.028746, 1530896570 15.9638
+rocksdb.manifest.file.sync.micros.p99: 1530896335 649.0, 1530896361 835.0, 1530896387 1435.0, 1530896414 9938.0, 1530896440 9938.0, 1530896466 9938.0, 1530896492 9938.0, 1530896518 1882.0, 1530896545 1837.0, 1530896570 1792.0
diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err1.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err1.ini
new file mode 100644
index 000000000..23be55dde
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/rules_err1.ini
@@ -0,0 +1,56 @@
+[Rule "missing-suggestions"]
+suggestions=
+conditions=missing-source
+
+[Condition "normal-rule"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
+
+[Rule "missing-conditions"]
+conditions=
+suggestions=missing-description
+
+[Condition "missing-options"]
+source=OPTIONS
+options=
+evaluate=int(options[0])*int(options[1])-int(options[2])<(-251659456)  # should evaluate to a boolean
+
+[Rule "missing-expression"]
+conditions=missing-expression
+suggestions=missing-description
+
+[Condition "missing-expression"]
+source=OPTIONS
+options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base
+evaluate=
+
+[Suggestion "missing-description"]
+description=
+
+[Rule "stop-too-many-L0"]
+suggestions=inc-max-bg-compactions:missing-action:inc-l0-stop-writes-trigger
+conditions=missing-regex
+
+[Condition "missing-regex"]
+source=LOG
+regex=
+
+[Suggestion "missing-option"]
+option=
+action=increase
+
+[Suggestion "normal-suggestion"]
+option=CFOptions.write_buffer_size
+action=increase
+
+[Suggestion "inc-l0-stop-writes-trigger"]
+option=CFOptions.level0_stop_writes_trigger
+action=increase
diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err2.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err2.ini
new file mode 100644
index 000000000..bce21dba9
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/rules_err2.ini
@@ -0,0 +1,15 @@
+[Rule "normal-rule"]
+suggestions=inc-bg-flush:inc-write-buffer
+conditions=missing-source
+
+[Condition "missing-source"]
+source=
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err3.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err3.ini
new file mode 100644
index 000000000..73c06e469
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/rules_err3.ini
@@ -0,0 +1,15 @@
+[Rule "normal-rule"]
+suggestions=missing-action:inc-write-buffer
+conditions=missing-source
+
+[Condition "normal-condition"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Suggestion "missing-action"]
+option=DBOptions.max_background_flushes
+action=
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
diff --git a/src/rocksdb/tools/advisor/test/input_files/rules_err4.ini b/src/rocksdb/tools/advisor/test/input_files/rules_err4.ini
new file mode 100644
index 000000000..4d4aa3c70
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/rules_err4.ini
@@ -0,0 +1,15 @@
+[Rule "normal-rule"]
+suggestions=inc-bg-flush
+conditions=missing-source
+
+[Condition "normal-condition"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion]  # missing section name
+option=CFOptions.max_write_buffer_number
+action=increase
diff --git a/src/rocksdb/tools/advisor/test/input_files/test_rules.ini b/src/rocksdb/tools/advisor/test/input_files/test_rules.ini
new file mode 100644
index 000000000..97b9374fc
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/test_rules.ini
@@ -0,0 +1,47 @@
+[Rule "single-condition-false"]
+suggestions=inc-bg-flush:inc-write-buffer
+conditions=log-4-false
+
+[Rule "multiple-conds-true"]
+suggestions=inc-write-buffer
+conditions=log-1-true:log-2-true:log-3-true
+
+[Rule "multiple-conds-one-false"]
+suggestions=inc-bg-flush
+conditions=log-1-true:log-4-false:log-3-true
+
+[Rule "multiple-conds-all-false"]
+suggestions=l0-l1-ratio-health-check
+conditions=log-4-false:options-1-false
+
+[Condition "log-1-true"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Condition "log-2-true"]
+source=LOG
+regex=Stalling writes because we have \d+ level-0 files
+
+[Condition "log-3-true"]
+source=LOG
+regex=Stopping writes because we have \d+ level-0 files
+
+[Condition "log-4-false"]
+source=LOG
+regex=Stalling writes because of estimated pending compaction bytes \d+
+
+[Condition "options-1-false"]
+source=OPTIONS
+options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:DBOptions.random_access_max_buffer_size
+evaluate=int(options[0])*int(options[1])-int(options[2])<0  # should evaluate to a boolean
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
+
+[Suggestion "l0-l1-ratio-health-check"]
+description='modify options such that (level0_file_num_compaction_trigger * write_buffer_size - max_bytes_for_level_base < 5) is satisfied'
diff --git a/src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini b/src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini
new file mode 100644
index 000000000..83b96da2b
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/input_files/triggered_rules.ini
@@ -0,0 +1,83 @@
+[Rule "stall-too-many-memtables"]
+suggestions=inc-bg-flush:inc-write-buffer
+conditions=stall-too-many-memtables
+
+[Condition "stall-too-many-memtables"]
+source=LOG
+regex=Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+
+
+[Rule "stall-too-many-L0"]
+suggestions=inc-max-subcompactions:inc-max-bg-compactions:inc-write-buffer-size:dec-max-bytes-for-level-base:inc-l0-slowdown-writes-trigger
+conditions=stall-too-many-L0
+
+[Condition "stall-too-many-L0"]
+source=LOG
+regex=Stalling writes because we have \d+ level-0 files
+
+[Rule "stop-too-many-L0"]
+suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-l0-stop-writes-trigger
+conditions=stop-too-many-L0
+
+[Condition "stop-too-many-L0"]
+source=LOG
+regex=Stopping writes because we have \d+ level-0 files
+
+[Rule "stall-too-many-compaction-bytes"]
+suggestions=inc-max-bg-compactions:inc-write-buffer-size:inc-hard-pending-compaction-bytes-limit:inc-soft-pending-compaction-bytes-limit
+conditions=stall-too-many-compaction-bytes
+
+[Condition "stall-too-many-compaction-bytes"]
+source=LOG
+regex=Stalling writes because of estimated pending compaction bytes \d+
+
+[Suggestion "inc-bg-flush"]
+option=DBOptions.max_background_flushes
+action=increase
+
+[Suggestion "inc-write-buffer"]
+option=CFOptions.max_write_buffer_number
+action=increase
+
+[Suggestion "inc-max-subcompactions"]
+option=DBOptions.max_subcompactions
+action=increase
+
+[Suggestion "inc-max-bg-compactions"]
+option=DBOptions.max_background_compactions
+action=increase
+
+[Suggestion "inc-write-buffer-size"]
+option=CFOptions.write_buffer_size
+action=increase
+
+[Suggestion "dec-max-bytes-for-level-base"]
+option=CFOptions.max_bytes_for_level_base
+action=decrease
+
+[Suggestion "inc-l0-slowdown-writes-trigger"]
+option=CFOptions.level0_slowdown_writes_trigger
+action=increase
+
+[Suggestion "inc-l0-stop-writes-trigger"]
+option=CFOptions.level0_stop_writes_trigger
+action=increase
+
+[Suggestion "inc-hard-pending-compaction-bytes-limit"]
+option=CFOptions.hard_pending_compaction_bytes_limit
+action=increase
+
+[Suggestion "inc-soft-pending-compaction-bytes-limit"]
+option=CFOptions.soft_pending_compaction_bytes_limit
+action=increase
+
+[Rule "level0-level1-ratio"]
+conditions=level0-level1-ratio
+suggestions=l0-l1-ratio-health-check
+
+[Condition "level0-level1-ratio"]
+source=OPTIONS
+options=CFOptions.level0_file_num_compaction_trigger:CFOptions.write_buffer_size:CFOptions.max_bytes_for_level_base
+evaluate=int(options[0])*int(options[1])-int(options[2])>=-268173312  # should evaluate to a boolean, condition triggered if evaluates to true
+
+[Suggestion "l0-l1-ratio-health-check"]
+description='modify options such that (level0_file_num_compaction_trigger * write_buffer_size - max_bytes_for_level_base < -268173312) is satisfied'
diff --git a/src/rocksdb/tools/advisor/test/test_db_bench_runner.py b/src/rocksdb/tools/advisor/test/test_db_bench_runner.py
new file mode 100644
index 000000000..57306c942
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_db_bench_runner.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import os
+import unittest
+
+from advisor.db_bench_runner import DBBenchRunner
+from advisor.db_log_parser import DataSource, NO_COL_FAMILY
+from advisor.db_options_parser import DatabaseOptions
+
+
+class TestDBBenchRunnerMethods(unittest.TestCase):
+    def setUp(self):
+        self.pos_args = [
+            "./../../db_bench",
+            "overwrite",
+            "use_existing_db=true",
+            "duration=10",
+        ]
+        self.bench_runner = DBBenchRunner(self.pos_args)
+        this_path = os.path.abspath(os.path.dirname(__file__))
+        options_path = os.path.join(this_path, "input_files/OPTIONS-000005")
+        self.db_options = DatabaseOptions(options_path)
+
+    def test_setup(self):
+        self.assertEqual(self.bench_runner.db_bench_binary, self.pos_args[0])
+        self.assertEqual(self.bench_runner.benchmark, self.pos_args[1])
+        self.assertSetEqual(
+            set(self.bench_runner.db_bench_args), set(self.pos_args[2:])
+        )
+
+    def test_get_info_log_file_name(self):
+        log_file_name = DBBenchRunner.get_info_log_file_name(None, "random_path")
+        self.assertEqual(log_file_name, "LOG")
+
+        log_file_name = DBBenchRunner.get_info_log_file_name(
+            "/dev/shm/", "/tmp/rocksdbtest-155919/dbbench/"
+        )
+        self.assertEqual(log_file_name, "tmp_rocksdbtest-155919_dbbench_LOG")
+
+    def test_get_opt_args_str(self):
+        misc_opt_dict = {"bloom_bits": 2, "empty_opt": None, "rate_limiter": 3}
+        optional_args_str = DBBenchRunner.get_opt_args_str(misc_opt_dict)
+        self.assertEqual(optional_args_str, " --bloom_bits=2 --rate_limiter=3")
+
+    def test_get_log_options(self):
+        db_path = "/tmp/rocksdb-155919/dbbench"
+        # when db_log_dir is present in the db_options
+        update_dict = {
+            "DBOptions.db_log_dir": {NO_COL_FAMILY: "/dev/shm"},
+            "DBOptions.stats_dump_period_sec": {NO_COL_FAMILY: "20"},
+        }
+        self.db_options.update_options(update_dict)
+        log_file_prefix, stats_freq = self.bench_runner.get_log_options(
+            self.db_options, db_path
+        )
+        self.assertEqual(log_file_prefix, "/dev/shm/tmp_rocksdb-155919_dbbench_LOG")
+        self.assertEqual(stats_freq, 20)
+
+        update_dict = {
+            "DBOptions.db_log_dir": {NO_COL_FAMILY: None},
+            "DBOptions.stats_dump_period_sec": {NO_COL_FAMILY: "30"},
+        }
+        self.db_options.update_options(update_dict)
+        log_file_prefix, stats_freq = self.bench_runner.get_log_options(
+            self.db_options, db_path
+        )
+        self.assertEqual(log_file_prefix, "/tmp/rocksdb-155919/dbbench/LOG")
+        self.assertEqual(stats_freq, 30)
+
+    def test_build_experiment_command(self):
+        # add some misc_options to db_options
+        update_dict = {
+            "bloom_bits": {NO_COL_FAMILY: 2},
+            "rate_limiter_bytes_per_sec": {NO_COL_FAMILY: 128000000},
+        }
+        self.db_options.update_options(update_dict)
+        db_path = "/dev/shm"
+        experiment_command = self.bench_runner._build_experiment_command(
+            self.db_options, db_path
+        )
+        opt_args_str = DBBenchRunner.get_opt_args_str(
+            self.db_options.get_misc_options()
+        )
+        opt_args_str += " --options_file=" + self.db_options.generate_options_config(
+            "12345"
+        )
+        for arg in self.pos_args[2:]:
+            opt_args_str += " --" + arg
+        expected_command = (
+            self.pos_args[0]
+            + " --benchmarks="
+            + self.pos_args[1]
+            + " --statistics --perf_level=3 --db="
+            + db_path
+            + opt_args_str
+        )
+        self.assertEqual(experiment_command, expected_command)
+
+
+class TestDBBenchRunner(unittest.TestCase):
+    def setUp(self):
+        # Note: the db_bench binary should be present in the rocksdb/ directory
+        self.pos_args = [
+            "./../../db_bench",
+            "overwrite",
+            "use_existing_db=true",
+            "duration=20",
+        ]
+        self.bench_runner = DBBenchRunner(self.pos_args)
+        this_path = os.path.abspath(os.path.dirname(__file__))
+        options_path = os.path.join(this_path, "input_files/OPTIONS-000005")
+        self.db_options = DatabaseOptions(options_path)
+
+    def test_experiment_output(self):
+        update_dict = {"bloom_bits": {NO_COL_FAMILY: 2}}
+        self.db_options.update_options(update_dict)
+        db_path = "/dev/shm"
+        data_sources, throughput = self.bench_runner.run_experiment(
+            self.db_options, db_path
+        )
+        self.assertEqual(
+            data_sources[DataSource.Type.DB_OPTIONS][0].type, DataSource.Type.DB_OPTIONS
+        )
+        self.assertEqual(data_sources[DataSource.Type.LOG][0].type, DataSource.Type.LOG)
+        self.assertEqual(len(data_sources[DataSource.Type.TIME_SERIES]), 2)
+        self.assertEqual(
+            data_sources[DataSource.Type.TIME_SERIES][0].type,
+            DataSource.Type.TIME_SERIES,
+        )
+        self.assertEqual(
+            data_sources[DataSource.Type.TIME_SERIES][1].type,
+            DataSource.Type.TIME_SERIES,
+        )
+        self.assertEqual(data_sources[DataSource.Type.TIME_SERIES][1].stats_freq_sec, 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/rocksdb/tools/advisor/test/test_db_log_parser.py b/src/rocksdb/tools/advisor/test/test_db_log_parser.py
new file mode 100644
index 000000000..6862691c1
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_db_log_parser.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import os
+import unittest
+
+from advisor.db_log_parser import DatabaseLogs, Log, NO_COL_FAMILY
+from advisor.rule_parser import Condition, LogCondition
+
+
+class TestLog(unittest.TestCase):
+    def setUp(self):
+        self.column_families = ["default", "col_fam_A"]
+
+    def test_get_column_family(self):
+        test_log = (
+            "2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] "
+            + "[col_fam_A] [JOB 44] Level-0 flush table #84: 1890780 bytes OK"
+        )
+        db_log = Log(test_log, self.column_families)
+        self.assertEqual("col_fam_A", db_log.get_column_family())
+
+        test_log = (
+            "2018/05/25-14:34:21.047233 7f82ba72e700 [db/flush_job.cc:371] "
+            + "[JOB 44] Level-0 flush table #84: 1890780 bytes OK"
+        )
+        db_log = Log(test_log, self.column_families)
+        db_log.append_message("[default] some remaining part of log")
+        self.assertEqual(NO_COL_FAMILY, db_log.get_column_family())
+
+    def test_get_methods(self):
+        hr_time = "2018/05/25-14:30:25.491635"
+        context = "7f82ba72e700"
+        message = (
+            "[db/flush_job.cc:331] [default] [JOB 10] Level-0 flush table "
+            + "#23: started"
+        )
+        test_log = hr_time + " " + context + " " + message
+        db_log = Log(test_log, self.column_families)
+        self.assertEqual(db_log.get_message(), message)
+        remaining_message = "[col_fam_A] some more logs"
+        db_log.append_message(remaining_message)
+        self.assertEqual(db_log.get_human_readable_time(), "2018/05/25-14:30:25.491635")
+        self.assertEqual(db_log.get_context(), "7f82ba72e700")
+        self.assertEqual(db_log.get_timestamp(), 1527258625)
+        self.assertEqual(db_log.get_message(), str(message + "\n" + remaining_message))
+
+    def test_is_new_log(self):
+        new_log = "2018/05/25-14:34:21.047233 context random new log"
+        remaining_log = "2018/05/25 not really a new log"
+        self.assertTrue(Log.is_new_log(new_log))
+        self.assertFalse(Log.is_new_log(remaining_log))
+
+
+class TestDatabaseLogs(unittest.TestCase):
+    def test_check_and_trigger_conditions(self):
+        this_path = os.path.abspath(os.path.dirname(__file__))
+        logs_path_prefix = os.path.join(this_path, "input_files/LOG-0")
+        column_families = ["default", "col-fam-A", "col-fam-B"]
+        db_logs = DatabaseLogs(logs_path_prefix, column_families)
+        # matches, has 2 col_fams
+        condition1 = LogCondition.create(Condition("cond-A"))
+        condition1.set_parameter("regex", "random log message")
+        # matches, multiple lines message
+        condition2 = LogCondition.create(Condition("cond-B"))
+        condition2.set_parameter("regex", "continuing on next line")
+        # does not match
+        condition3 = LogCondition.create(Condition("cond-C"))
+        condition3.set_parameter("regex", "this should match no log")
+        db_logs.check_and_trigger_conditions([condition1, condition2, condition3])
+        cond1_trigger = condition1.get_trigger()
+        self.assertEqual(2, len(cond1_trigger.keys()))
+        self.assertSetEqual({"col-fam-A", NO_COL_FAMILY}, set(cond1_trigger.keys()))
+        self.assertEqual(2, len(cond1_trigger["col-fam-A"]))
+        messages = [
+            "[db/db_impl.cc:563] [col-fam-A] random log message for testing",
+            "[db/db_impl.cc:653] [col-fam-A] another random log message",
+        ]
+        self.assertIn(cond1_trigger["col-fam-A"][0].get_message(), messages)
+        self.assertIn(cond1_trigger["col-fam-A"][1].get_message(), messages)
+        self.assertEqual(1, len(cond1_trigger[NO_COL_FAMILY]))
+        self.assertEqual(
+            cond1_trigger[NO_COL_FAMILY][0].get_message(),
+            "[db/db_impl.cc:331] [unknown] random log message no column family",
+        )
+        cond2_trigger = condition2.get_trigger()
+        self.assertEqual(["col-fam-B"], list(cond2_trigger.keys()))
+        self.assertEqual(1, len(cond2_trigger["col-fam-B"]))
+        self.assertEqual(
+            cond2_trigger["col-fam-B"][0].get_message(),
+            "[db/db_impl.cc:234] [col-fam-B] log continuing on next line\n"
+            + "remaining part of the log",
+        )
+        self.assertIsNone(condition3.get_trigger())
diff --git a/src/rocksdb/tools/advisor/test/test_db_options_parser.py b/src/rocksdb/tools/advisor/test/test_db_options_parser.py
new file mode 100644
index 000000000..cdeebaefa
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_db_options_parser.py
@@ -0,0 +1,214 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import os
+import unittest
+
+from advisor.db_log_parser import NO_COL_FAMILY
+from advisor.db_options_parser import DatabaseOptions
+from advisor.rule_parser import Condition, OptionCondition
+
+
+class TestDatabaseOptions(unittest.TestCase):
+    def setUp(self):
+        self.this_path = os.path.abspath(os.path.dirname(__file__))
+        self.og_options = os.path.join(self.this_path, "input_files/OPTIONS-000005")
+        misc_options = ["bloom_bits = 4", "rate_limiter_bytes_per_sec = 1024000"]
+        # create the options object
+        self.db_options = DatabaseOptions(self.og_options, misc_options)
+        # perform clean-up before running tests
+        self.generated_options = os.path.join(
+            self.this_path, "../temp/OPTIONS_testing.tmp"
+        )
+        if os.path.isfile(self.generated_options):
+            os.remove(self.generated_options)
+
+    def test_get_options_diff(self):
+        old_opt = {
+            "DBOptions.stats_dump_freq_sec": {NO_COL_FAMILY: "20"},
+            "CFOptions.write_buffer_size": {
+                "default": "1024000",
+                "col_fam_A": "128000",
+                "col_fam_B": "128000000",
+            },
+            "DBOptions.use_fsync": {NO_COL_FAMILY: "true"},
+            "DBOptions.max_log_file_size": {NO_COL_FAMILY: "128000000"},
+        }
+        new_opt = {
+            "bloom_bits": {NO_COL_FAMILY: "4"},
+            "CFOptions.write_buffer_size": {
+                "default": "128000000",
+                "col_fam_A": "128000",
+                "col_fam_C": "128000000",
+            },
+            "DBOptions.use_fsync": {NO_COL_FAMILY: "true"},
+            "DBOptions.max_log_file_size": {NO_COL_FAMILY: "0"},
+        }
+        diff = DatabaseOptions.get_options_diff(old_opt, new_opt)
+
+        expected_diff = {
+            "DBOptions.stats_dump_freq_sec": {NO_COL_FAMILY: ("20", None)},
+            "bloom_bits": {NO_COL_FAMILY: (None, "4")},
+            "CFOptions.write_buffer_size": {
+                "default": ("1024000", "128000000"),
+                "col_fam_B": ("128000000", None),
+                "col_fam_C": (None, "128000000"),
+            },
+            "DBOptions.max_log_file_size": {NO_COL_FAMILY: ("128000000", "0")},
+        }
+        self.assertDictEqual(diff, expected_diff)
+
+    def test_is_misc_option(self):
+        self.assertTrue(DatabaseOptions.is_misc_option("bloom_bits"))
+        self.assertFalse(
+            DatabaseOptions.is_misc_option("DBOptions.stats_dump_freq_sec")
+        )
+
+    def test_set_up(self):
+        options = self.db_options.get_all_options()
+        self.assertEqual(22, len(options.keys()))
+        expected_misc_options = {
+            "bloom_bits": "4",
+            "rate_limiter_bytes_per_sec": "1024000",
+        }
+        self.assertDictEqual(expected_misc_options, self.db_options.get_misc_options())
+        self.assertListEqual(
+            ["default", "col_fam_A"], self.db_options.get_column_families()
+        )
+
+    def test_get_options(self):
+        opt_to_get = [
+            "DBOptions.manual_wal_flush",
+            "DBOptions.db_write_buffer_size",
+            "bloom_bits",
+            "CFOptions.compaction_filter_factory",
+            "CFOptions.num_levels",
+            "rate_limiter_bytes_per_sec",
+            "TableOptions.BlockBasedTable.block_align",
+            "random_option",
+        ]
+        options = self.db_options.get_options(opt_to_get)
+        expected_options = {
+            "DBOptions.manual_wal_flush": {NO_COL_FAMILY: "false"},
+            "DBOptions.db_write_buffer_size": {NO_COL_FAMILY: "0"},
+            "bloom_bits": {NO_COL_FAMILY: "4"},
+            "CFOptions.compaction_filter_factory": {
+                "default": "nullptr",
+                "col_fam_A": "nullptr",
+            },
+            "CFOptions.num_levels": {"default": "7", "col_fam_A": "5"},
+            "rate_limiter_bytes_per_sec": {NO_COL_FAMILY: "1024000"},
+            "TableOptions.BlockBasedTable.block_align": {
+                "default": "false",
+                "col_fam_A": "true",
+            },
+        }
+        self.assertDictEqual(expected_options, options)
+
+    def test_update_options(self):
+        # add new, update old, set old
+        # before updating
+        expected_old_opts = {
+            "DBOptions.db_log_dir": {NO_COL_FAMILY: None},
+            "DBOptions.manual_wal_flush": {NO_COL_FAMILY: "false"},
+            "bloom_bits": {NO_COL_FAMILY: "4"},
+            "CFOptions.num_levels": {"default": "7", "col_fam_A": "5"},
+            "TableOptions.BlockBasedTable.block_restart_interval": {"col_fam_A": "16"},
+        }
+        get_opts = list(expected_old_opts.keys())
+        options = self.db_options.get_options(get_opts)
+        self.assertEqual(expected_old_opts, options)
+        # after updating options
+        update_opts = {
+            "DBOptions.db_log_dir": {NO_COL_FAMILY: "/dev/shm"},
+            "DBOptions.manual_wal_flush": {NO_COL_FAMILY: "true"},
+            "bloom_bits": {NO_COL_FAMILY: "2"},
+            "CFOptions.num_levels": {"col_fam_A": "7"},
+            "TableOptions.BlockBasedTable.block_restart_interval": {"default": "32"},
+            "random_misc_option": {NO_COL_FAMILY: "something"},
+        }
+        self.db_options.update_options(update_opts)
+        update_opts["CFOptions.num_levels"]["default"] = "7"
+        update_opts["TableOptions.BlockBasedTable.block_restart_interval"] = {
+            "default": "32",
+            "col_fam_A": "16",
+        }
+        get_opts.append("random_misc_option")
+        options = self.db_options.get_options(get_opts)
+        self.assertDictEqual(update_opts, options)
+        expected_misc_options = {
+            "bloom_bits": "2",
+            "rate_limiter_bytes_per_sec": "1024000",
+            "random_misc_option": "something",
+        }
+        self.assertDictEqual(expected_misc_options, self.db_options.get_misc_options())
+
+    def test_generate_options_config(self):
+        # make sure file does not exist from before
+        self.assertFalse(os.path.isfile(self.generated_options))
+        self.db_options.generate_options_config("testing")
+        self.assertTrue(os.path.isfile(self.generated_options))
+
+    def test_check_and_trigger_conditions(self):
+        # options only from CFOptions
+        # setup the OptionCondition objects to check and trigger
+        update_dict = {
+            "CFOptions.level0_file_num_compaction_trigger": {"col_fam_A": "4"},
+            "CFOptions.max_bytes_for_level_base": {"col_fam_A": "10"},
+        }
+        self.db_options.update_options(update_dict)
+        cond1 = Condition("opt-cond-1")
+        cond1 = OptionCondition.create(cond1)
+        cond1.set_parameter(
+            "options",
+            [
+                "CFOptions.level0_file_num_compaction_trigger",
+                "TableOptions.BlockBasedTable.block_restart_interval",
+                "CFOptions.max_bytes_for_level_base",
+            ],
+        )
+        cond1.set_parameter(
+            "evaluate", "int(options[0])*int(options[1])-int(options[2])>=0"
+        )
+        # only DBOptions
+        cond2 = Condition("opt-cond-2")
+        cond2 = OptionCondition.create(cond2)
+        cond2.set_parameter(
+            "options",
+            [
+                "DBOptions.db_write_buffer_size",
+                "bloom_bits",
+                "rate_limiter_bytes_per_sec",
+            ],
+        )
+        cond2.set_parameter(
+            "evaluate", "(int(options[2]) * int(options[1]) * int(options[0]))==0"
+        )
+        # mix of CFOptions and DBOptions
+        cond3 = Condition("opt-cond-3")
+        cond3 = OptionCondition.create(cond3)
+        cond3.set_parameter(
+            "options",
+            [
+                "DBOptions.db_write_buffer_size",  # 0
+                "CFOptions.num_levels",  # 5, 7
+                "bloom_bits",  # 4
+            ],
+        )
+        cond3.set_parameter(
+            "evaluate", "int(options[2])*int(options[0])+int(options[1])>6"
+        )
+        self.db_options.check_and_trigger_conditions([cond1, cond2, cond3])
+
+        cond1_trigger = {"col_fam_A": ["4", "16", "10"]}
+        self.assertDictEqual(cond1_trigger, cond1.get_trigger())
+        cond2_trigger = {NO_COL_FAMILY: ["0", "4", "1024000"]}
+        self.assertDictEqual(cond2_trigger, cond2.get_trigger())
+        cond3_trigger = {"default": ["0", "7", "4"]}
+        self.assertDictEqual(cond3_trigger, cond3.get_trigger())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py b/src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py
new file mode 100644
index 000000000..e2c29ab74
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_db_stats_fetcher.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import os
+import time
+import unittest
+from unittest.mock import MagicMock
+
+from advisor.db_stats_fetcher import DatabasePerfContext, LogStatsParser
+from advisor.db_timeseries_parser import NO_ENTITY
+from advisor.rule_parser import Condition, TimeSeriesCondition
+
+
+class TestLogStatsParser(unittest.TestCase):
+    def setUp(self):
+        this_path = os.path.abspath(os.path.dirname(__file__))
+        stats_file = os.path.join(this_path, "input_files/log_stats_parser_keys_ts")
+        # populate the keys_ts dictionary of LogStatsParser
+        self.stats_dict = {NO_ENTITY: {}}
+        with open(stats_file, "r") as fp:
+            for line in fp:
+                stat_name = line.split(":")[0].strip()
+                self.stats_dict[NO_ENTITY][stat_name] = {}
+                token_list = line.split(":")[1].strip().split(",")
+                for token in token_list:
+                    timestamp = int(token.split()[0])
+                    value = float(token.split()[1])
+                    self.stats_dict[NO_ENTITY][stat_name][timestamp] = value
+        self.log_stats_parser = LogStatsParser("dummy_log_file", 20)
+        self.log_stats_parser.keys_ts = self.stats_dict
+
+    def test_check_and_trigger_conditions_bursty(self):
+        # mock fetch_timeseries() because 'keys_ts' has been pre-populated
+        self.log_stats_parser.fetch_timeseries = MagicMock()
+        # condition: bursty
+        cond1 = Condition("cond-1")
+        cond1 = TimeSeriesCondition.create(cond1)
+        cond1.set_parameter("keys", "rocksdb.db.get.micros.p50")
+        cond1.set_parameter("behavior", "bursty")
+        cond1.set_parameter("window_sec", 40)
+        cond1.set_parameter("rate_threshold", 0)
+        self.log_stats_parser.check_and_trigger_conditions([cond1])
+        expected_cond_trigger = {NO_ENTITY: {1530896440: 0.9767546362322214}}
+        self.assertDictEqual(expected_cond_trigger, cond1.get_trigger())
+        # ensure that fetch_timeseries() was called once
+        self.log_stats_parser.fetch_timeseries.assert_called_once()
+
+    def test_check_and_trigger_conditions_eval_agg(self):
+        # mock fetch_timeseries() because 'keys_ts' has been pre-populated
+        self.log_stats_parser.fetch_timeseries = MagicMock()
+        # condition: evaluate_expression
+        cond1 = Condition("cond-1")
+        cond1 = TimeSeriesCondition.create(cond1)
+        cond1.set_parameter("keys", "rocksdb.db.get.micros.p50")
+        cond1.set_parameter("behavior", "evaluate_expression")
+        keys = ["rocksdb.manifest.file.sync.micros.p99", "rocksdb.db.get.micros.p50"]
+        cond1.set_parameter("keys", keys)
+        cond1.set_parameter("aggregation_op", "latest")
+        # condition evaluates to FALSE
+        cond1.set_parameter("evaluate", "keys[0]-(keys[1]*100)>200")
+        self.log_stats_parser.check_and_trigger_conditions([cond1])
+        expected_cond_trigger = {NO_ENTITY: [1792.0, 15.9638]}
+        self.assertIsNone(cond1.get_trigger())
+        # condition evaluates to TRUE
+        cond1.set_parameter("evaluate", "keys[0]-(keys[1]*100)<200")
+        self.log_stats_parser.check_and_trigger_conditions([cond1])
+        expected_cond_trigger = {NO_ENTITY: [1792.0, 15.9638]}
+        self.assertDictEqual(expected_cond_trigger, cond1.get_trigger())
+        # ensure that fetch_timeseries() was called
+        self.log_stats_parser.fetch_timeseries.assert_called()
+
+    def test_check_and_trigger_conditions_eval(self):
+        # mock fetch_timeseries() because 'keys_ts' has been pre-populated
+        self.log_stats_parser.fetch_timeseries = MagicMock()
+        # condition: evaluate_expression
+        cond1 = Condition("cond-1")
+        cond1 = TimeSeriesCondition.create(cond1)
+        cond1.set_parameter("keys", "rocksdb.db.get.micros.p50")
+        cond1.set_parameter("behavior", "evaluate_expression")
+        keys = ["rocksdb.manifest.file.sync.micros.p99", "rocksdb.db.get.micros.p50"]
+        cond1.set_parameter("keys", keys)
+        cond1.set_parameter("evaluate", "keys[0]-(keys[1]*100)>500")
+        self.log_stats_parser.check_and_trigger_conditions([cond1])
+        expected_trigger = {
+            NO_ENTITY: {
+                1530896414: [9938.0, 16.31508],
+                1530896440: [9938.0, 16.346602],
+                1530896466: [9938.0, 16.284669],
+                1530896492: [9938.0, 16.16005],
+            }
+        }
+        self.assertDictEqual(expected_trigger, cond1.get_trigger())
+        self.log_stats_parser.fetch_timeseries.assert_called_once()
+
+
+class TestDatabasePerfContext(unittest.TestCase):
+    def test_unaccumulate_metrics(self):
+        perf_dict = {
+            "user_key_comparison_count": 675903942,
+            "block_cache_hit_count": 830086,
+        }
+        timestamp = int(time.time())
+        perf_ts = {}
+        for key in perf_dict:
+            perf_ts[key] = {}
+            start_val = perf_dict[key]
+            for ix in range(5):
+                perf_ts[key][timestamp + (ix * 10)] = start_val + (2 * ix * ix)
+        db_perf_context = DatabasePerfContext(perf_ts, 10, True)
+        timestamps = [timestamp + (ix * 10) for ix in range(1, 5, 1)]
+        values = [val for val in range(2, 15, 4)]
+        inner_dict = {timestamps[ix]: values[ix] for ix in range(4)}
+        expected_keys_ts = {
+            NO_ENTITY: {
+                "user_key_comparison_count": inner_dict,
+                "block_cache_hit_count": inner_dict,
+            }
+        }
+        self.assertDictEqual(expected_keys_ts, db_perf_context.keys_ts)
diff --git a/src/rocksdb/tools/advisor/test/test_rule_parser.py b/src/rocksdb/tools/advisor/test/test_rule_parser.py
new file mode 100644
index 000000000..4ea4ca159
--- /dev/null
+++ b/src/rocksdb/tools/advisor/test/test_rule_parser.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+import os
+import unittest
+
+from advisor.db_log_parser import DatabaseLogs, DataSource
+from advisor.db_options_parser import DatabaseOptions
+from advisor.rule_parser import RulesSpec
+
+RuleToSuggestions = {
+    "stall-too-many-memtables": ["inc-bg-flush", "inc-write-buffer"],
+    "stall-too-many-L0": [
+        "inc-max-subcompactions",
+        "inc-max-bg-compactions",
+        "inc-write-buffer-size",
+        "dec-max-bytes-for-level-base",
+        "inc-l0-slowdown-writes-trigger",
+    ],
+    "stop-too-many-L0": [
+        "inc-max-bg-compactions",
+        "inc-write-buffer-size",
+        "inc-l0-stop-writes-trigger",
+    ],
+    "stall-too-many-compaction-bytes": [
+        "inc-max-bg-compactions",
+        "inc-write-buffer-size",
+        "inc-hard-pending-compaction-bytes-limit",
+        "inc-soft-pending-compaction-bytes-limit",
+    ],
+    "level0-level1-ratio": ["l0-l1-ratio-health-check"],
+}
+
+
+class TestAllRulesTriggered(unittest.TestCase):
+    def setUp(self):
+        # load the Rules
+        this_path = os.path.abspath(os.path.dirname(__file__))
+        ini_path = os.path.join(this_path, "input_files/triggered_rules.ini")
+        self.db_rules = RulesSpec(ini_path)
+        self.db_rules.load_rules_from_spec()
+        self.db_rules.perform_section_checks()
+        # load the data sources: LOG and OPTIONS
+        log_path = os.path.join(this_path, "input_files/LOG-0")
+        options_path = os.path.join(this_path, "input_files/OPTIONS-000005")
+        db_options_parser = DatabaseOptions(options_path)
+        self.column_families = db_options_parser.get_column_families()
+        db_logs_parser = DatabaseLogs(log_path, self.column_families)
+        self.data_sources = {
+            DataSource.Type.DB_OPTIONS: [db_options_parser],
+            DataSource.Type.LOG: [db_logs_parser],
+        }
+
+    def test_triggered_conditions(self):
+        conditions_dict = self.db_rules.get_conditions_dict()
+        rules_dict = self.db_rules.get_rules_dict()
+        # Make sure none of the conditions is triggered beforehand
+        for cond in conditions_dict.values():
+            self.assertFalse(cond.is_triggered(), repr(cond))
+        for rule in rules_dict.values():
+            self.assertFalse(
+                rule.is_triggered(conditions_dict, self.column_families), repr(rule)
+            )
+
+        # # Trigger the conditions as per the data sources.
+        # trigger_conditions(, conditions_dict)
+
+        # Get the set of rules that have been triggered
+        triggered_rules = self.db_rules.get_triggered_rules(
+            self.data_sources, self.column_families
+        )
+
+        # Make sure each condition and rule is triggered
+        for cond in conditions_dict.values():
+            if cond.get_data_source() is DataSource.Type.TIME_SERIES:
+                continue
+            self.assertTrue(cond.is_triggered(), repr(cond))
+
+        for rule in rules_dict.values():
+            self.assertIn(rule, triggered_rules)
+            # Check the suggestions made by the triggered rules
+            for sugg in rule.get_suggestions():
+                self.assertIn(sugg, RuleToSuggestions[rule.name])
+
+        for rule in triggered_rules:
+            self.assertIn(rule, rules_dict.values())
+            for sugg in RuleToSuggestions[rule.name]:
+                self.assertIn(sugg, rule.get_suggestions())
+
+
+class TestConditionsConjunctions(unittest.TestCase):
+    def setUp(self):
+        # load the Rules
+        this_path = os.path.abspath(os.path.dirname(__file__))
+        ini_path = os.path.join(this_path, "input_files/test_rules.ini")
+        self.db_rules = RulesSpec(ini_path)
+        self.db_rules.load_rules_from_spec()
+        self.db_rules.perform_section_checks()
+        # load the data sources: LOG and OPTIONS
+        log_path = os.path.join(this_path, "input_files/LOG-1")
+        options_path = os.path.join(this_path, "input_files/OPTIONS-000005")
+        db_options_parser = DatabaseOptions(options_path)
+        self.column_families = db_options_parser.get_column_families()
+        db_logs_parser = DatabaseLogs(log_path, self.column_families)
+        self.data_sources = {
+            DataSource.Type.DB_OPTIONS: [db_options_parser],
+            DataSource.Type.LOG: [db_logs_parser],
+        }
+
+    def test_condition_conjunctions(self):
+        conditions_dict = self.db_rules.get_conditions_dict()
+        rules_dict = self.db_rules.get_rules_dict()
+        # Make sure none of the conditions is triggered beforehand
+        for cond in conditions_dict.values():
+            self.assertFalse(cond.is_triggered(), repr(cond))
+        for rule in rules_dict.values():
+            self.assertFalse(
+                rule.is_triggered(conditions_dict, self.column_families), repr(rule)
+            )
+
+        # Trigger the conditions as per the data sources.
+        self.db_rules.trigger_conditions(self.data_sources)
+
+        # Check for the conditions
+        conds_triggered = ["log-1-true", "log-2-true", "log-3-true"]
+        conds_not_triggered = ["log-4-false", "options-1-false"]
+        for cond in conds_triggered:
+            self.assertTrue(conditions_dict[cond].is_triggered(), repr(cond))
+        for cond in conds_not_triggered:
+            self.assertFalse(conditions_dict[cond].is_triggered(), repr(cond))
+
+        # Check for the rules
+        rules_triggered = ["multiple-conds-true"]
+        rules_not_triggered = [
+            "single-condition-false",
+            "multiple-conds-one-false",
+            "multiple-conds-all-false",
+        ]
+        for rule_name in rules_triggered:
+            rule = rules_dict[rule_name]
+            self.assertTrue(
+                rule.is_triggered(conditions_dict, self.column_families), repr(rule)
+            )
+        for rule_name in rules_not_triggered:
+            rule = rules_dict[rule_name]
+            self.assertFalse(
+                rule.is_triggered(conditions_dict, self.column_families), repr(rule)
+            )
+
+
+class TestSanityChecker(unittest.TestCase):
+    def setUp(self):
+        this_path = os.path.abspath(os.path.dirname(__file__))
+        ini_path = os.path.join(this_path, "input_files/rules_err1.ini")
+        db_rules = RulesSpec(ini_path)
+        db_rules.load_rules_from_spec()
+        self.rules_dict = db_rules.get_rules_dict()
+        self.conditions_dict = db_rules.get_conditions_dict()
+        self.suggestions_dict = db_rules.get_suggestions_dict()
+
+    def test_rule_missing_suggestions(self):
+        regex = ".*rule must have at least one suggestion.*"
+        with self.assertRaisesRegex(ValueError, regex):
+            self.rules_dict["missing-suggestions"].perform_checks()
+
+    def test_rule_missing_conditions(self):
+        regex = ".*rule must have at least one condition.*"
+        with self.assertRaisesRegex(ValueError, regex):
+            self.rules_dict["missing-conditions"].perform_checks()
+
+    def test_condition_missing_regex(self):
+        regex = ".*provide regex for log condition.*"
+        with self.assertRaisesRegex(ValueError, regex):
+            self.conditions_dict["missing-regex"].perform_checks()
+
+    def test_condition_missing_options(self):
+        regex = ".*options missing in condition.*"
+        with self.assertRaisesRegex(ValueError, regex):
+            self.conditions_dict["missing-options"].perform_checks()
+
+    def test_condition_missing_expression(self):
+        regex = ".*expression missing in condition.*"
+        with self.assertRaisesRegex(ValueError, regex):
+            self.conditions_dict["missing-expression"].perform_checks()
+
+    def test_suggestion_missing_option(self):
+        regex = ".*provide option or description.*"
+        with self.assertRaisesRegex(ValueError, regex):
+            self.suggestions_dict["missing-option"].perform_checks()
+
+    def test_suggestion_missing_description(self):
+        regex = ".*provide option or description.*"
+        with self.assertRaisesRegex(ValueError, regex):
+            self.suggestions_dict["missing-description"].perform_checks()
+
+
+class TestParsingErrors(unittest.TestCase):
+    def setUp(self):
+        self.this_path = os.path.abspath(os.path.dirname(__file__))
+
+    def test_condition_missing_source(self):
+        ini_path = os.path.join(self.this_path, "input_files/rules_err2.ini")
+        db_rules = RulesSpec(ini_path)
+        regex = ".*provide source for condition.*"
+        with self.assertRaisesRegex(NotImplementedError, regex):
+            db_rules.load_rules_from_spec()
+
+    def test_suggestion_missing_action(self):
+        ini_path = os.path.join(self.this_path, "input_files/rules_err3.ini")
+        db_rules = RulesSpec(ini_path)
+        regex = ".*provide action for option.*"
+        with self.assertRaisesRegex(ValueError, regex):
+            db_rules.load_rules_from_spec()
+
+    def test_section_no_name(self):
+        ini_path = os.path.join(self.this_path, "input_files/rules_err4.ini")
+        db_rules = RulesSpec(ini_path)
+        regex = "Parsing error: needed section header:.*"
+        with self.assertRaisesRegex(ValueError, regex):
+            db_rules.load_rules_from_spec()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/rocksdb/tools/analyze_txn_stress_test.sh b/src/rocksdb/tools/analyze_txn_stress_test.sh
new file mode 100755
index 000000000..477b1fac5
--- /dev/null
+++ b/src/rocksdb/tools/analyze_txn_stress_test.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Usage:
+# 1. Enable ROCKS_LOG_DETAILS in util/logging.h
+# 2. Run ./transaction_test --gtest_filter="MySQLStyleTransactionTest/MySQLStyleTransactionTest.TransactionStressTest/*" --gtest_break_on_failure
+# 3. SET=1 # 2 or 3
+# 4. LOG=/dev/shm/transaction_testdb_8600601584148590297/LOG
+# 5. grep RandomTransactionVerify $LOG | cut -d' ' -f 12 | sort -n # to find verify snapshots
+# 5. vn=1345
+# 6. vn_1=1340
+# 4. . tools/tools/analyze_txn_stress_test.sh
+echo Input params:
+# The rocksdb LOG path
+echo $LOG
+# Snapshot at which we got RandomTransactionVerify failure
+echo $vn
+# The snapshot before that where RandomTransactionVerify passed
+echo $vn_1
+# The stress tests use 3 sets, one or more might have shown inconsistent results.
+SET=${SET-1} # 1 or 2 or 3
+echo Checking set number $SET
+
+# Find the txns that committed between the two snapshots, and gather their changes made by them in /tmp/changes.txt
+# 2019/02/28-15:25:51.655477 7fffec9ff700 [DEBUG] [ilities/transactions/write_prepared_txn_db.cc:416] Txn 68497 Committing with 68498
+grep Committing $LOG | awk '{if ($9 <= vn && $9 > vn_1) print $0}' vn=$vn vn_1=${vn_1} > /tmp/txn.txt
+# 2019/02/28-15:25:49.046464 7fffe81f5700 [DEBUG] [il/transaction_test_util.cc:216] Commit of 65541 OK (txn12936193128775589751-9089)
+for i in `cat /tmp/txn.txt | awk '{print $6}'`; do grep "Commit of $i " $LOG; done > /tmp/names.txt
+for n in `cat /tmp/names.txt | awk '{print $9}'`; do grep $n $LOG; done > /tmp/changes.txt
+echo "Sum of the changes:"
+cat /tmp/changes.txt | grep Insert | awk '{print $12}' | cut -d= -f1 | cut -d+ -f2 | awk '{sum+=$1} END{print sum}'
+
+# Gather read values at each snapshot
+# 2019/02/28-15:25:51.655926 7fffebbff700 [DEBUG] [il/transaction_test_util.cc:347] VerifyRead at 67972 (67693): 000230 value: 15983
+grep "VerifyRead at ${vn_1} (.*): 000${SET}" $LOG | cut -d' ' -f 9- > /tmp/va.txt
+grep "VerifyRead at ${vn} (.*): 000${SET}" $LOG | cut -d' ' -f 9- > /tmp/vb.txt
+
+# For each key in the 2nd snapshot, find the value read by 1st, do the adds, and see if the results match.
+IFS=$'\n'
+for l in `cat /tmp/vb.txt`; 
+do
+  grep $l /tmp/va.txt > /dev/null ; 
+  if [[ $? -ne 0 ]]; then 
+    #echo $l
+    k=`echo $l | awk '{print $1}'`;
+    v=`echo $l | awk '{print $3}'`;
+    # 2019/02/28-15:25:19.350111 7fffe81f5700 [DEBUG] [il/transaction_test_util.cc:194] Insert (txn12936193128775589751-2298) OK snap: 16289 key:000219 value: 3772+95=3867
+    exp=`grep "\<$k\>" /tmp/changes.txt | tail -1 | cut -d= -f2`;
+    if [[ $v -ne $exp ]]; then echo $l; fi
+  else
+    k=`echo $l | awk '{print $1}'`;
+    grep "\<$k\>" /tmp/changes.txt
+  fi;
+done
+
+# Check that all the keys read in the 1st snapshot are still visible in the 2nd
+for l in `cat /tmp/va.txt`; 
+do
+  k=`echo $l | awk '{print $1}'`;
+  grep "\<$k\>" /tmp/vb.txt > /dev/null
+  if [[ $? -ne 0 ]]; then
+    echo missing key $k
+  fi
+done
+
+# The following found a bug in ValidateSnapshot. It checks if the adds on each key match up.
+grep Insert /tmp/changes.txt | cut -d' ' -f 10 | sort | uniq > /tmp/keys.txt
+for k in `cat /tmp/keys.txt`;
+do
+  grep "\<$k\>" /tmp/changes.txt > /tmp/adds.txt;
+  # 2019/02/28-15:25:19.350111 7fffe81f5700 [DEBUG] [il/transaction_test_util.cc:194] Insert (txn12936193128775589751-2298) OK snap: 16289 key:000219 value: 3772+95=3867
+  START=`head -1 /tmp/adds.txt | cut -d' ' -f 12 | cut -d+ -f1`
+  END=`tail -1 /tmp/adds.txt | cut -d' ' -f 12 | cut -d= -f2`
+  ADDS=`cat /tmp/adds.txt | grep Insert | awk '{print $12}' | cut -d= -f1 | cut -d+ -f2 | awk '{sum+=$1} END{print sum}'`
+  EXP=$((START+ADDS))
+  # If first + all the adds != last then there was an issue with ValidateSnapshot.
+  if [[ $END -ne $EXP ]]; then echo inconsistent txn: $k $START+$ADDS=$END; cat /tmp/adds.txt; return 1; fi
+done
diff --git a/src/rocksdb/tools/auto_sanity_test.sh b/src/rocksdb/tools/auto_sanity_test.sh
new file mode 100755
index 000000000..4670ef9bb
--- /dev/null
+++ b/src/rocksdb/tools/auto_sanity_test.sh
@@ -0,0 +1,93 @@
+# shellcheck disable=SC2148
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+TMP_DIR="${TMPDIR:-/tmp}/rocksdb-sanity-test"
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]"
+  echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits."
+  recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'`
+  commit_new=`echo "$recent_commits" | head -n1`
+  commit_old=`echo "$recent_commits" | tail -n1`
+  echo "the most recent commits are:"
+  echo "$recent_commits"
+else
+  commit_new=$1
+  commit_old=$2
+fi
+
+if [ ! -d $TMP_DIR ]; then
+  mkdir $TMP_DIR
+fi
+dir_new="${TMP_DIR}/${commit_new}"
+dir_old="${TMP_DIR}/${commit_old}"
+
+function makestuff() {
+  echo "make clean"
+  make clean > /dev/null
+  echo "make db_sanity_test -j32"
+  make db_sanity_test -j32 > /dev/null
+  if [ $? -ne 0 ]; then
+    echo "[ERROR] Failed to perform 'make db_sanity_test'"
+    exit 1
+  fi
+}
+
+rm -r -f $dir_new
+rm -r -f $dir_old
+
+echo "Running db sanity check with commits $commit_new and $commit_old."
+
+echo "============================================================="
+echo "Making build $commit_new"
+git checkout $commit_new
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Can't checkout $commit_new"
+  exit 1
+fi
+makestuff
+mv db_sanity_test new_db_sanity_test
+echo "Creating db based on the new commit --- $commit_new"
+./new_db_sanity_test $dir_new create
+cp ./tools/db_sanity_test.cc $dir_new
+cp ./tools/auto_sanity_test.sh $dir_new
+
+echo "============================================================="
+echo "Making build $commit_old"
+git checkout $commit_old
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Can't checkout $commit_old"
+  exit 1
+fi
+cp -f $dir_new/db_sanity_test.cc ./tools/.
+cp -f $dir_new/auto_sanity_test.sh ./tools/.
+makestuff
+mv db_sanity_test old_db_sanity_test
+echo "Creating db based on the old commit --- $commit_old"
+./old_db_sanity_test $dir_old create
+
+echo "============================================================="
+echo "[Backward Compatibility Check]"
+echo "Verifying old db $dir_old using the new commit --- $commit_new"
+./new_db_sanity_test $dir_old verify
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Backward Compatibility Check fails:"
+  echo "    Verification of $dir_old using commit $commit_new failed."
+  exit 2
+fi
+
+echo "============================================================="
+echo "[Forward Compatibility Check]"
+echo "Verifying new db $dir_new using the old commit --- $commit_old"
+./old_db_sanity_test $dir_new verify
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Forward Compatibility Check fails:"
+  echo "    $dir_new using commit $commit_old failed."
+  exit 2
+fi
+
+rm old_db_sanity_test
+rm new_db_sanity_test
+rm -rf $dir_new
+rm -rf $dir_old
+
+echo "Auto sanity test passed!"
diff --git a/src/rocksdb/tools/backup_db.sh b/src/rocksdb/tools/backup_db.sh
new file mode 100755
index 000000000..aa82f1dba
--- /dev/null
+++ b/src/rocksdb/tools/backup_db.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+#
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: ${BASH_SOURCE[0]} <DB Path> <Backup Dir>"
+  exit 1
+fi
+
+db_dir="$1"
+backup_dir="$2"
+
+echo "== Backing up DB $db_dir to $backup_dir"
+./ldb backup --db="$db_dir" --backup_dir="$backup_dir"
diff --git a/src/rocksdb/tools/benchmark.sh b/src/rocksdb/tools/benchmark.sh
new file mode 100755
index 000000000..b41d25c78
--- /dev/null
+++ b/src/rocksdb/tools/benchmark.sh
@@ -0,0 +1,1173 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# REQUIRE: db_bench binary exists in the current directory
+
+# Exit Codes
+EXIT_INVALID_ARGS=1
+EXIT_NOT_COMPACTION_TEST=2
+EXIT_UNKNOWN_JOB=3
+
+# Size Constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+T=$((1024 * G))
+
+function display_usage() {
+  echo "usage: benchmark.sh [--help] <test>"
+  echo ""
+  echo "These are the available benchmark tests:"
+  echo -e "\tbulkload"
+  echo -e "\tfillseq_disable_wal\t\tSequentially fill the database with no WAL"
+  echo -e "\tfillseq_enable_wal\t\tSequentially fill the database with WAL"
+  echo -e "\toverwrite"
+  echo -e "\tupdaterandom"
+  echo -e "\treadrandom"
+  echo -e "\tmergerandom"
+  echo -e "\tfilluniquerandom"
+  echo -e "\tmultireadrandom"
+  echo -e "\tfwdrange"
+  echo -e "\trevrange"
+  echo -e "\treadwhilewriting"
+  echo -e "\treadwhilemerging"
+  echo -e "\tfwdrangewhilewriting"
+  echo -e "\trevrangewhilewriting"
+  echo -e "\tfwdrangewhilemerging"
+  echo -e "\trevrangewhilemerging"
+  echo -e "\trandomtransaction"
+  echo -e "\tuniversal_compaction"
+  echo -e "\tdebug"
+  echo ""
+  echo "Generic enviroment Variables:"
+  echo -e "\tJOB_ID\t\t\t\tAn identifier for the benchmark job, will appear in the results"
+  echo -e "\tDB_DIR\t\t\t\tPath to write the database data directory"
+  echo -e "\tWAL_DIR\t\t\t\tPath to write the database WAL directory"
+  echo -e "\tOUTPUT_DIR\t\t\tPath to write the benchmark results to (default: /tmp)"
+  echo -e "\tNUM_KEYS\t\t\tThe number of keys to use in the benchmark"
+  echo -e "\tKEY_SIZE\t\t\tThe size of the keys to use in the benchmark (default: 20 bytes)"
+  echo -e "\tVALUE_SIZE\t\t\tThe size of the values to use in the benchmark (default: 400 bytes)"
+  echo -e "\tBLOCK_SIZE\t\t\tThe size of the database blocks in the benchmark (default: 8 KB)"
+  echo -e "\tDB_BENCH_NO_SYNC\t\tDisable fsync on the WAL"
+  echo -e "\tNUMACTL\t\t\t\tWhen defined use numactl --interleave=all"
+  echo -e "\tNUM_THREADS\t\t\tThe number of threads to use (default: 64)"
+  echo -e "\tMB_WRITE_PER_SEC\t\t\tRate limit for background writer"
+  echo -e "\tNUM_NEXTS_PER_SEEK\t\t(default: 10)"
+  echo -e "\tCACHE_SIZE\t\t\tSize of the block cache (default: 16GB)"
+  echo -e "\tCACHE_NUMSHARDBITS\t\t\tNumber of shards for the block cache is 2 ** cache_numshardbits (default: 6)"
+  echo -e "\tCOMPRESSION_MAX_DICT_BYTES"
+  echo -e "\tCOMPRESSION_TYPE\t\tDefault compression(default: zstd)"
+  echo -e "\tBOTTOMMOST_COMPRESSION\t\t(default: none)"
+  echo -e "\tMIN_LEVEL_TO_COMPRESS\t\tValue for min_level_to_compress for Leveled"
+  echo -e "\tCOMPRESSION_SIZE_PERCENT\tValue for compression_size_percent for Universal"
+  echo -e "\tDURATION\t\t\tNumber of seconds for which the test runs"
+  echo -e "\tWRITES\t\t\t\tNumber of writes for which the test runs"
+  echo -e "\tWRITE_BUFFER_SIZE_MB\t\tThe size of the write buffer in MB (default: 128)"
+  echo -e "\tTARGET_FILE_SIZE_BASE_MB\tThe value for target_file_size_base in MB (default: 128)"
+  echo -e "\tMAX_BYTES_FOR_LEVEL_BASE_MB\tThe value for max_bytes_for_level_base in MB (default: 128)"
+  echo -e "\tMAX_BACKGROUND_JOBS\t\tThe value for max_background_jobs (default: 16)"
+  echo -e "\tCACHE_INDEX_AND_FILTER_BLOCKS\tThe value for cache_index_and_filter_blocks (default: 0)"
+  echo -e "\tUSE_O_DIRECT\t\t\tUse O_DIRECT for user reads and compaction"
+  echo -e "\tBYTES_PER_SYNC\t\t\tValue for bytes_per_sync, set to zero when USE_O_DIRECT is true"
+  echo -e "\tSTATS_INTERVAL_SECONDS\t\tValue for stats_interval_seconds"
+  echo -e "\tREPORT_INTERVAL_SECONDS\t\tValue for report_interval_seconds"
+  echo -e "\tSUBCOMPACTIONS\t\t\tValue for subcompactions"
+  echo -e "\tCOMPACTION_STYLE\t\tOne of leveled, universal, blob. Default is leveled."
+  echo -e "\nEnvironment variables (mostly) for leveled compaction:"
+  echo -e "\tLEVEL0_FILE_NUM_COMPACTION_TRIGGER\t\tValue for level0_file_num_compaction_trigger"
+  echo -e "\tLEVEL0_SLOWDOWN_WRITES_TRIGGER\t\t\tValue for level0_slowdown_writes_trigger"
+  echo -e "\tLEVEL0_STOP_WRITES_TRIGGER\t\t\tValue for level0_stop_writes_trigger"
+  echo -e "\tPER_LEVEL_FANOUT\t\t\t\tValue for max_bytes_for_level_multiplier"
+  echo -e "\tSOFT_PENDING_COMPACTION_BYTES_LIMIT_IN_GB\tThe value for soft_pending_compaction_bytes_limit in GB"
+  echo -e "\tHARD_PENDING_COMPACTION_BYTES_LIMIT_IN_GB\tThe value for hard_pending_compaction_bytes_limit in GB"
+  echo -e "\nEnvironment variables for universal compaction:"
+  echo -e "\tUNIVERSAL_MIN_MERGE_WIDTH\tValue of min_merge_width option for universal"
+  echo -e "\tUNIVERSAL_MAX_MERGE_WIDTH\tValue of min_merge_width option for universal"
+  echo -e "\tUNIVERSAL_SIZE_RATIO\t\tValue of size_ratio option for universal"
+  echo -e "\tUNIVERSAL_MAX_SIZE_AMP\t\tmax_size_amplification_percent for universal"
+  echo -e "\tUNIVERSAL_ALLOW_TRIVIAL_MOVE\tSet allow_trivial_move to true for universal, default is false"
+  echo -e "\nOptions for integrated BlobDB"
+  echo -e "\tMIN_BLOB_SIZE\tValue for min_blob_size"
+  echo -e "\tBLOB_FILE_SIZE\tValue for blob_file_size"
+  echo -e "\tBLOB_COMPRESSION_TYPE\tValue for blob_compression_type"
+  echo -e "\tBLOB_GC_AGE_CUTOFF\tValue for blob_garbage_collection_age_cutoff"
+  echo -e "\tBLOB_GC_FORCE_THRESHOLD\tValue for blob_garbage_collection_force_threshold"
+  echo -e "\tBLOB_FILE_STARTING_LEVEL\t\tBlob file starting level (default: 0)"
+  echo -e "\tUSE_BLOB_CACHE\t\t\tEnable blob cache (default: 1)"
+  echo -e "\tUSE_SHARED_BLOCK_AND_BLOB_CACHE\t\t\tUse the same backing cache for block cache and blob cache (default: 1)"
+  echo -e "\tBLOB_CACHE_SIZE\t\t\tSize of the blob cache (default: 16GB)"
+  echo -e "\tBLOB_CACHE_NUMSHARDBITS\t\t\tNumber of shards for the blob cache is 2 ** blob_cache_numshardbits (default: 6)"
+  echo -e "\tPREPOPULATE_BLOB_CACHE\t\t\tPre-populate hot/warm blobs in blob cache (default: 0)"
+}
+
+if [ $# -lt 1 ]; then
+  display_usage
+  exit $EXIT_INVALID_ARGS
+fi
+bench_cmd=$1
+shift
+bench_args=$*
+
+if [[ "$bench_cmd" == "--help" ]]; then
+  display_usage
+  exit
+fi
+
+job_id=${JOB_ID}
+
+# Make it easier to run only the compaction test. Getting valid data requires
+# a number of iterations and having an ability to run the test separately from
+# rest of the benchmarks helps.
+if [ "$COMPACTION_TEST" == "1" -a "$bench_cmd" != "universal_compaction" ]; then
+  echo "Skipping $1 because it's not a compaction test."
+  exit $EXIT_NOT_COMPACTION_TEST
+fi
+
+if [ -z $DB_DIR ]; then
+  echo "DB_DIR is not defined"
+  exit $EXIT_INVALID_ARGS
+fi
+
+if [ -z $WAL_DIR ]; then
+  echo "WAL_DIR is not defined"
+  exit $EXIT_INVALID_ARGS
+fi
+
+output_dir=${OUTPUT_DIR:-/tmp}
+if [ ! -d $output_dir ]; then
+  mkdir -p $output_dir
+fi
+
+report="$output_dir/report.tsv"
+schedule="$output_dir/schedule.txt"
+
+# all multithreaded tests run with sync=1 unless
+# $DB_BENCH_NO_SYNC is defined
+syncval="1"
+if [ ! -z $DB_BENCH_NO_SYNC ]; then
+  echo "Turning sync off for all multithreaded tests"
+  syncval="0";
+fi
+
+compaction_style=${COMPACTION_STYLE:-leveled}
+if [ $compaction_style = "leveled" ]; then
+  echo Use leveled compaction
+elif [ $compaction_style = "universal" ]; then
+  echo Use universal compaction
+elif [ $compaction_style = "blob" ]; then
+  echo Use blob compaction
+else
+  echo COMPACTION_STYLE is :: $COMPACTION_STYLE :: and must be one of leveled, universal, blob
+  exit $EXIT_INVALID_ARGS
+fi
+
+num_threads=${NUM_THREADS:-64}
+mb_written_per_sec=${MB_WRITE_PER_SEC:-0}
+# Only for tests that do range scans
+num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10}
+cache_size=${CACHE_SIZE:-$(( 16 * $G ))}
+cache_numshardbits=${CACHE_NUMSHARDBITS:-6}
+compression_max_dict_bytes=${COMPRESSION_MAX_DICT_BYTES:-0}
+compression_type=${COMPRESSION_TYPE:-zstd}
+min_level_to_compress=${MIN_LEVEL_TO_COMPRESS:-"-1"}
+compression_size_percent=${COMPRESSION_SIZE_PERCENT:-"-1"}
+
+duration=${DURATION:-0}
+writes=${WRITES:-0}
+
+num_keys=${NUM_KEYS:-8000000000}
+key_size=${KEY_SIZE:-20}
+value_size=${VALUE_SIZE:-400}
+block_size=${BLOCK_SIZE:-8192}
+write_buffer_mb=${WRITE_BUFFER_SIZE_MB:-128}
+target_file_mb=${TARGET_FILE_SIZE_BASE_MB:-128}
+l1_mb=${MAX_BYTES_FOR_LEVEL_BASE_MB:-1024}
+max_background_jobs=${MAX_BACKGROUND_JOBS:-16}
+stats_interval_seconds=${STATS_INTERVAL_SECONDS:-60}
+report_interval_seconds=${REPORT_INTERVAL_SECONDS:-1}
+subcompactions=${SUBCOMPACTIONS:-1}
+per_level_fanout=${PER_LEVEL_FANOUT:-8}
+
+cache_index_and_filter=${CACHE_INDEX_AND_FILTER_BLOCKS:-0}
+if [[ $cache_index_and_filter -eq 0 ]]; then
+  cache_meta_flags=""
+elif [[ $cache_index_and_filter -eq 1 ]]; then
+  cache_meta_flags="\
+  --cache_index_and_filter_blocks=$cache_index_and_filter \
+  --cache_high_pri_pool_ratio=0.5 --cache_low_pri_pool_ratio=0"
+else
+  echo CACHE_INDEX_AND_FILTER_BLOCKS was $CACHE_INDEX_AND_FILTER_BLOCKS but must be 0 or 1
+  exit $EXIT_INVALID_ARGS
+fi
+
+soft_pending_arg=""
+if [ ! -z $SOFT_PENDING_COMPACTION_BYTES_LIMIT_IN_GB ]; then
+  soft_pending_bytes=$( echo $SOFT_PENDING_COMPACTION_BYTES_LIMIT_IN_GB | \
+    awk '{ printf "%.0f", $1 * GB }' GB=$G )
+  soft_pending_arg="--soft_pending_compaction_bytes_limit=$soft_pending_bytes"
+fi
+
+hard_pending_arg=""
+if [ ! -z $HARD_PENDING_COMPACTION_BYTES_LIMIT_IN_GB ]; then
+  hard_pending_bytes=$( echo $HARD_PENDING_COMPACTION_BYTES_LIMIT_IN_GB | \
+    awk '{ printf "%.0f", $1 * GB }' GB=$G )
+  hard_pending_arg="--hard_pending_compaction_bytes_limit=$hard_pending_bytes"
+fi
+
+o_direct_flags=""
+if [ ! -z $USE_O_DIRECT ]; then
+  # Some of these flags are only supported in new versions and --undefok makes that work
+  o_direct_flags="--use_direct_reads --use_direct_io_for_flush_and_compaction --prepopulate_block_cache=1"
+  bytes_per_sync=0
+else
+  bytes_per_sync=${BYTES_PER_SYNC:-$(( 1 * M ))}
+fi
+
+univ_min_merge_width=${UNIVERSAL_MIN_MERGE_WIDTH:-2}
+univ_max_merge_width=${UNIVERSAL_MAX_MERGE_WIDTH:-20}
+univ_size_ratio=${UNIVERSAL_SIZE_RATIO:-1}
+univ_max_size_amp=${UNIVERSAL_MAX_SIZE_AMP:-200}
+
+if [ ! -z $UNIVERSAL_ALLOW_TRIVIAL_MOVE ]; then
+  univ_allow_trivial_move=1
+else
+  univ_allow_trivial_move=0
+fi
+
+min_blob_size=${MIN_BLOB_SIZE:-0}
+blob_file_size=${BLOB_FILE_SIZE:-$(( 256 * $M ))}
+blob_compression_type=${BLOB_COMPRESSION_TYPE:-${compression_type}}
+blob_gc_age_cutoff=${BLOB_GC_AGE_CUTOFF:-"0.25"}
+blob_gc_force_threshold=${BLOB_GC_FORCE_THRESHOLD:-1}
+blob_file_starting_level=${BLOB_FILE_STARTING_LEVEL:-0}
+use_blob_cache=${USE_BLOB_CACHE:-1}
+use_shared_block_and_blob_cache=${USE_SHARED_BLOCK_AND_BLOB_CACHE:-1}
+blob_cache_size=${BLOB_CACHE_SIZE:-$(( 16 * $G ))}
+blob_cache_numshardbits=${BLOB_CACHE_NUMSHARDBITS:-6}
+prepopulate_blob_cache=${PREPOPULATE_BLOB_CACHE:-0}
+
+# This script still works back to RocksDB 6.0
+undef_params="\
+use_blob_cache,\
+use_shared_block_and_blob_cache,\
+blob_cache_size,blob_cache_numshardbits,\
+prepopulate_blob_cache,\
+multiread_batched,\
+cache_low_pri_pool_ratio,\
+prepopulate_block_cache"
+
+const_params_base="
+  --undefok=$undef_params \
+  --db=$DB_DIR \
+  --wal_dir=$WAL_DIR \
+  \
+  --num=$num_keys \
+  --key_size=$key_size \
+  --value_size=$value_size \
+  --block_size=$block_size \
+  --cache_size=$cache_size \
+  --cache_numshardbits=$cache_numshardbits \
+  --compression_max_dict_bytes=$compression_max_dict_bytes \
+  --compression_ratio=0.5 \
+  --compression_type=$compression_type \
+  --bytes_per_sync=$bytes_per_sync \
+  $cache_meta_flags \
+  $o_direct_flags \
+  --benchmark_write_rate_limit=$(( 1024 * 1024 * $mb_written_per_sec )) \
+  \
+  --write_buffer_size=$(( $write_buffer_mb * M)) \
+  --target_file_size_base=$(( $target_file_mb * M)) \
+  --max_bytes_for_level_base=$(( $l1_mb * M)) \
+  \
+  --verify_checksum=1 \
+  --delete_obsolete_files_period_micros=$((60 * M)) \
+  --max_bytes_for_level_multiplier=$per_level_fanout \
+  \
+  --statistics=0 \
+  --stats_per_interval=1 \
+  --stats_interval_seconds=$stats_interval_seconds \
+  --report_interval_seconds=$report_interval_seconds \
+  --histogram=1 \
+  \
+  --memtablerep=skip_list \
+  --bloom_bits=10 \
+  --open_files=-1 \
+  --subcompactions=$subcompactions \
+  \
+  $bench_args"
+
+level_const_params="
+  $const_params_base \
+  --compaction_style=0 \
+  --num_levels=8 \
+  --min_level_to_compress=$min_level_to_compress \
+  --level_compaction_dynamic_level_bytes=true \
+  --pin_l0_filter_and_index_blocks_in_cache=1 \
+  $soft_pending_arg \
+  $hard_pending_arg \
+"
+
+# These inherit level_const_params because the non-blob LSM tree uses leveled compaction.
+blob_const_params="
+  $level_const_params \
+  --enable_blob_files=true \
+  --min_blob_size=$min_blob_size \
+  --blob_file_size=$blob_file_size \
+  --blob_compression_type=$blob_compression_type \
+  --enable_blob_garbage_collection=true \
+  --blob_garbage_collection_age_cutoff=$blob_gc_age_cutoff \
+  --blob_garbage_collection_force_threshold=$blob_gc_force_threshold \
+  --blob_file_starting_level=$blob_file_starting_level \
+  --use_blob_cache=$use_blob_cache \
+  --use_shared_block_and_blob_cache=$use_shared_block_and_blob_cache \
+  --blob_cache_size=$blob_cache_size \
+  --blob_cache_numshardbits=$blob_cache_numshardbits \
+  --prepopulate_blob_cache=$prepopulate_blob_cache \
+"
+
+# TODO:
+#   pin_l0_filter_and..., is this OK?
+univ_const_params="
+  $const_params_base \
+  --compaction_style=1 \
+  --num_levels=40 \
+  --universal_compression_size_percent=$compression_size_percent \
+  --pin_l0_filter_and_index_blocks_in_cache=1 \
+  --universal_min_merge_width=$univ_min_merge_width \
+  --universal_max_merge_width=$univ_max_merge_width \
+  --universal_size_ratio=$univ_size_ratio \
+  --universal_max_size_amplification_percent=$univ_max_size_amp \
+  --universal_allow_trivial_move=$univ_allow_trivial_move \
+"
+
+if [ $compaction_style == "leveled" ]; then
+  const_params="$level_const_params"
+  l0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-4}
+  l0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20}
+  l0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30}
+elif [ $compaction_style == "universal" ]; then
+  const_params="$univ_const_params"
+  l0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-8}
+  l0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20}
+  l0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30}
+else
+  # compaction_style == "blob"
+  const_params="$blob_const_params"
+  l0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-4}
+  l0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20}
+  l0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30}
+fi
+
+l0_config="
+  --level0_file_num_compaction_trigger=$l0_file_num_compaction_trigger \
+  --level0_slowdown_writes_trigger=$l0_slowdown_writes_trigger \
+  --level0_stop_writes_trigger=$l0_stop_writes_trigger"
+
+# You probably don't want to set both --writes and --duration
+if [ $duration -gt 0 ]; then
+  const_params="$const_params --duration=$duration"
+fi
+if [ $writes -gt 0 ]; then
+  const_params="$const_params --writes=$writes"
+fi
+
+params_w="$l0_config \
+          --max_background_jobs=$max_background_jobs \
+          --max_write_buffer_number=8 \
+          $const_params"
+
+params_bulkload="--max_background_jobs=$max_background_jobs \
+                 --max_write_buffer_number=8 \
+                 --allow_concurrent_memtable_write=false \
+                 --level0_file_num_compaction_trigger=$((10 * M)) \
+                 --level0_slowdown_writes_trigger=$((10 * M)) \
+                 --level0_stop_writes_trigger=$((10 * M)) \
+                 $const_params "
+
+params_fillseq="--allow_concurrent_memtable_write=false \
+                $params_w "
+
+#
+# Tune values for level and universal compaction.
+# For universal compaction, these level0_* options mean total sorted of runs in
+# LSM. In level-based compaction, it means number of L0 files.
+#
+params_level_compact="$const_params \
+                --max_background_flushes=4 \
+                --max_write_buffer_number=4 \
+                --level0_file_num_compaction_trigger=4 \
+                --level0_slowdown_writes_trigger=16 \
+                --level0_stop_writes_trigger=20"
+
+params_univ_compact="$const_params \
+                --max_background_flushes=4 \
+                --max_write_buffer_number=4 \
+                --level0_file_num_compaction_trigger=8 \
+                --level0_slowdown_writes_trigger=16 \
+                --level0_stop_writes_trigger=20"
+
+tsv_header="ops_sec\tmb_sec\tlsm_sz\tblob_sz\tc_wgb\tw_amp\tc_mbps\tc_wsecs\tc_csecs\tb_rgb\tb_wgb\tusec_op\tp50\tp99\tp99.9\tp99.99\tpmax\tuptime\tstall%\tNstall\tu_cpu\ts_cpu\trss\ttest\tdate\tversion\tjob_id\tgithash"
+
+function get_cmd() {
+  output=$1
+
+  numa=""
+  if [ ! -z $NUMACTL ]; then
+    numa="numactl --interleave=all "
+  fi
+
+  # Try to use timeout when duration is set because some tests (revrange*) hang
+  # for some versions (v6.10, v6.11).
+  timeout_cmd=""
+  if [ $duration -gt 0 ]; then
+    if hash timeout ; then
+      timeout_cmd="timeout $(( $duration + 600 ))"
+    fi
+  fi
+
+  echo "/usr/bin/time -f '%e %U %S' -o $output $numa $timeout_cmd"
+}
+
+function month_to_num() {
+    local date_str=$1
+    date_str="${date_str/Jan/01}"
+    date_str="${date_str/Feb/02}"
+    date_str="${date_str/Mar/03}"
+    date_str="${date_str/Apr/04}"
+    date_str="${date_str/May/05}"
+    date_str="${date_str/Jun/06}"
+    date_str="${date_str/Jul/07}"
+    date_str="${date_str/Aug/08}"
+    date_str="${date_str/Sep/09}"
+    date_str="${date_str/Oct/10}"
+    date_str="${date_str/Nov/11}"
+    date_str="${date_str/Dec/12}"
+    echo $date_str
+}
+
+function start_stats {
+  output=$1
+  iostat -y -mx 1  >& $output.io &
+  vmstat 1 >& $output.vm &
+  # tail -1 because "ps | grep db_bench" returns 2 entries and we want the second
+  while :; do ps aux | grep db_bench | grep -v grep | tail -1; sleep 10; done >& $output.ps &
+  # This sets a global value
+  pspid=$!
+
+  while :; do
+    b_gb=$( ls -l $DB_DIR 2> /dev/null | grep blob | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' )
+    s_gb=$( ls -l $DB_DIR 2> /dev/null | grep sst | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' )
+    l_gb=$( ls -l $WAL_DIR 2> /dev/null | grep log | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' )
+    a_gb=$( ls -l $DB_DIR 2> /dev/null | awk '{ c += 1; b += $5 } END { printf "%.1f", b / (1024*1024*1024) }' )
+    ts=$( date +%H%M%S )
+    echo -e "${a_gb}\t${s_gb}\t${l_gb}\t${b_gb}\t${ts}"
+    sleep 10
+  done >& $output.sizes &
+  # This sets a global value
+  szpid=$!
+}
+
+function stop_stats {
+  output=$1
+  kill $pspid
+  kill $szpid
+  killall iostat
+  killall vmstat
+  sleep 1
+  gzip $output.io
+  gzip $output.vm
+
+  am=$( sort -nk 1,1 $output.sizes | tail -1 | awk '{ print $1 }' )
+  sm=$( sort -nk 2,2 $output.sizes | tail -1 | awk '{ print $2 }' )
+  lm=$( sort -nk 3,3 $output.sizes | tail -1 | awk '{ print $3 }' )
+  bm=$( sort -nk 4,4 $output.sizes | tail -1 | awk '{ print $4 }' )
+  echo -e "max sizes (GB): $am all, $sm sst, $lm log, $bm blob" >> $output.sizes
+}
+
+function units_as_gb {
+  size=$1
+  units=$2
+
+  case $units in
+    MB)
+      echo "$size" | awk '{ printf "%.1f", $1 / 1024.0 }'
+      ;;
+    GB)
+      echo "$size"
+      ;;
+    TB)
+      echo "$size" | awk '{ printf "%.1f", $1 * 1024.0 }'
+      ;;
+    *)
+      echo "NA"
+      ;;
+  esac
+}
+
+function summarize_result {
+  test_out=$1
+  test_name=$2
+  bench_name=$3
+
+  # In recent versions these can be found directly via db_bench --version, --build_info but
+  # grepping from the log lets this work on older versions.
+  version="$( grep "RocksDB version:" "$DB_DIR"/LOG | head -1 | awk '{ printf "%s", $5 }' )"
+  git_hash="$( grep "Git sha" "$DB_DIR"/LOG | head -1 | awk '{ printf "%s", substr($5, 1, 10) }' )"
+
+  # Note that this function assumes that the benchmark executes long enough so
+  # that "Compaction Stats" is written to stdout at least once. If it won't
+  # happen then empty output from grep when searching for "Sum" will cause
+  # syntax errors.
+  date=$( grep ^Date: $test_out | awk '{ print $6 "-" $3 "-" $4 "T" $5 }' )
+  my_date=$( month_to_num $date )
+  uptime=$( grep ^Uptime\(secs $test_out | tail -1 | awk '{ printf "%.0f", $2 }' )
+  stall_pct=$( grep "^Cumulative stall" $test_out| tail -1  | awk '{  print $5 }' )
+  nstall=$( grep ^Stalls\(count\):  $test_out | tail -1 | awk '{ print $2 + $6 + $10 + $14 + $18 + $20 }' )
+
+  if ! grep ^"$bench_name" "$test_out" > /dev/null 2>&1 ; then
+    echo -e "failed\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t$test_name\t$my_date\t$version\t$job_id\t$git_hash"
+    return
+  fi
+
+  # Output formats
+  # V1: readrandom   :      10.218 micros/op 3131616 ops/sec; 1254.3 MB/s (176144999 of 176144999 found)
+  # The MB/s is mssing for multireadrandom
+  # V1a: multireadrandom :      10.164 micros/op 3148272 ops/sec; (177099990 of 177099990 found)
+  # V1: overwrite    :       7.939 micros/op 125963 ops/sec;   50.5 MB/s
+  # V2: overwrite    :       7.854 micros/op 127320 ops/sec 1800.001 seconds 229176999 operations;   51.0 MB/s
+
+  format_version=$( grep ^"$bench_name" "$test_out" \
+    | awk '{ if (NF >= 10 && $8 == "seconds") { print "V2" } else { print "V1" } }' )
+  if [ $format_version == "V1" ]; then
+    ops_sec=$( grep ^"$bench_name" "$test_out" | awk '{ print $5 }' )
+    usecs_op=$( grep ^"$bench_name" "$test_out" | awk '{ printf "%.1f", $3 }' )
+    if [ "$bench_name" == "multireadrandom" ]; then
+      mb_sec="NA"
+    else
+      mb_sec=$( grep ^"$bench_name" "$test_out" | awk '{ print $7 }' )
+    fi
+  else
+    ops_sec=$( grep ^"$bench_name" "$test_out" | awk '{ print $5 }' )
+    usecs_op=$( grep ^"$bench_name" "$test_out" | awk '{ printf "%.1f", $3 }' )
+    mb_sec=$( grep ^"$bench_name" "$test_out" | awk '{ print $11 }' )
+  fi
+
+  # For RocksDB version 4.x there are fewer fields but this still parses correctly
+  # Cumulative writes: 242M writes, 242M keys, 18M commit groups, 12.9 writes per commit group, ingest: 95.96 GB, 54.69 MB/s
+  cum_writes_gb_orig=$( grep "^Cumulative writes" "$test_out" | tail -1 | awk '{ for (x=1; x<=NF; x++) { if ($x == "ingest:") { printf "%.1f", $(x+1) } } }' )
+  cum_writes_units=$( grep "^Cumulative writes" "$test_out" | tail -1 | awk '{ for (x=1; x<=NF; x++) { if ($x == "ingest:") { print $(x+2) } } }' | sed 's/,//g' )
+  cum_writes_gb=$( units_as_gb "$cum_writes_gb_orig" "$cum_writes_units" )
+
+  # Cumulative compaction: 1159.74 GB write, 661.03 MB/s write, 1108.89 GB read, 632.04 MB/s read, 6284.3 seconds
+  cmb_ps=$( grep "^Cumulative compaction" "$test_out" | tail -1 | awk '{ printf "%.1f", $6 }' )
+  sum_wgb_orig=$( grep "^Cumulative compaction" "$test_out" | tail -1 | awk '{ printf "%.1f", $3 }' )
+  sum_wgb_units=$( grep "^Cumulative compaction" "$test_out" | tail -1 | awk '{ print $4 }' )
+  sum_wgb=$( units_as_gb "$sum_wgb_orig" "$sum_wgb_units" )
+
+  # Flush(GB): cumulative 97.193, interval 1.247
+  flush_wgb=$( grep "^Flush(GB)" "$test_out" | tail -1 | awk '{ print $3 }' | tr ',' ' ' | awk '{ print $1 }' )
+
+  if [[ "$sum_wgb" == "NA" || \
+        "$cum_writes_gb" == "NA" || \
+        "$cum_writes_gb_orig" == "0.0" || \
+        -z "$cum_writes_gb_orig" || \
+        -z "$flush_wgb" ]]; then
+    wamp="NA"
+  else
+    wamp=$( echo "( $sum_wgb + $flush_wgb ) / $cum_writes_gb" | bc -l | awk '{ printf "%.1f", $1 }' )
+  fi
+
+  c_wsecs=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $15 }' )
+  c_csecs=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $16 }' )
+
+  lsm_size=$( grep "^ Sum" "$test_out" | tail -1 | awk '{ printf "%.0f%s", $3, $4 }' )
+  blob_size=$( grep "^Blob file count:" "$test_out" | tail -1 | awk '{ printf "%.0f%s", $7, $8 }' )
+  # Remove the trailing comma from blob_size: 3.0GB, -> 3.0GB
+  blob_size="${blob_size/,/}"
+
+  b_rgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $21 }' )
+  b_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.0f", $22 }' )
+
+  p50=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.1f", $3 }' )
+  p99=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $7 }' )
+  p999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $9 }' )
+  p9999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $11 }' )
+  pmax=$( grep "^Min: " $test_out | grep Median: | grep Max: | awk '{ printf "%.0f", $6 }' )
+
+  # Use the last line because there might be extra lines when the db_bench process exits with an error
+  time_out="$test_out".time
+  u_cpu=$( tail -1 "$time_out" | awk '{ printf "%.1f", $2 / 1000.0 }' )
+  s_cpu=$( tail -1 "$time_out" | awk '{ printf "%.1f", $3 / 1000.0  }' )
+
+  rss="NA"
+  if [ -f $test_out.stats.ps ]; then
+    rss=$( awk '{ printf "%.1f\n", $6 / (1024 * 1024) }' "$test_out".stats.ps | sort -n | tail -1 )
+  fi
+
+  # if the report TSV (Tab Separate Values) file does not yet exist, create it and write the header row to it
+  if [ ! -f "$report" ]; then
+    echo -e "# ops_sec - operations per second" >> "$report"
+    echo -e "# mb_sec - ops_sec * size-of-operation-in-MB" >> "$report"
+    echo -e "# lsm_sz - size of LSM tree" >> "$report"
+    echo -e "# blob_sz - size of BlobDB logs" >> "$report"
+    echo -e "# c_wgb - GB written by compaction" >> "$report"
+    echo -e "# w_amp - Write-amplification as (bytes written by compaction / bytes written by memtable flush)" >> "$report"
+    echo -e "# c_mbps - Average write rate for compaction" >> "$report"
+    echo -e "# c_wsecs - Wall clock seconds doing compaction" >> "$report"
+    echo -e "# c_csecs - CPU seconds doing compaction" >> "$report"
+    echo -e "# b_rgb - Blob compaction read GB" >> "$report"
+    echo -e "# b_wgb - Blob compaction write GB" >> "$report"
+    echo -e "# usec_op - Microseconds per operation" >> "$report"
+    echo -e "# p50, p99, p99.9, p99.99 - 50th, 99th, 99.9th, 99.99th percentile response time in usecs" >> "$report"
+    echo -e "# pmax - max response time in usecs" >> "$report"
+    echo -e "# uptime - RocksDB uptime in seconds" >> "$report"
+    echo -e "# stall% - Percentage of time writes are stalled" >> "$report"
+    echo -e "# Nstall - Number of stalls" >> "$report"
+    echo -e "# u_cpu - #seconds/1000 of user CPU" >> "$report"
+    echo -e "# s_cpu - #seconds/1000 of system CPU" >> "$report"
+    echo -e "# rss - max RSS in GB for db_bench process" >> "$report"
+    echo -e "# test - Name of test" >> "$report"
+    echo -e "# date - Date/time of test" >> "$report"
+    echo -e "# version - RocksDB version" >> "$report"
+    echo -e "# job_id - User-provided job ID" >> "$report"
+    echo -e "# githash - git hash at which db_bench was compiled" >> "$report"
+    echo -e $tsv_header >> "$report"
+  fi
+
+  echo -e "$ops_sec\t$mb_sec\t$lsm_size\t$blob_size\t$sum_wgb\t$wamp\t$cmb_ps\t$c_wsecs\t$c_csecs\t$b_rgb\t$b_wgb\t$usecs_op\t$p50\t$p99\t$p999\t$p9999\t$pmax\t$uptime\t$stall_pct\t$nstall\t$u_cpu\t$s_cpu\t$rss\t$test_name\t$my_date\t$version\t$job_id\t$git_hash" \
+    >> "$report"
+}
+
+function run_bulkload {
+  # This runs with a vector memtable and the WAL disabled to load faster. It is still crash safe and the
+  # client can discover where to restart a load after a crash. I think this is a good way to load.
+  echo "Bulk loading $num_keys random keys"
+  log_file_name=$output_dir/benchmark_bulkload_fillrandom.log
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=fillrandom,stats \
+       --use_existing_db=0 \
+       --disable_auto_compactions=1 \
+       --sync=0 \
+       $params_bulkload \
+       --threads=1 \
+       --memtablerep=vector \
+       --allow_concurrent_memtable_write=false \
+       --disable_wal=1 \
+       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  eval $cmd
+  summarize_result $log_file_name bulkload fillrandom
+
+  echo "Compacting..."
+  log_file_name=$output_dir/benchmark_bulkload_compact.log
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=compact,stats \
+       --use_existing_db=1 \
+       --disable_auto_compactions=1 \
+       --sync=0 \
+       $params_w \
+       --threads=1 \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  eval $cmd
+}
+
+#
+# Parameter description:
+#
+# $1 - 1 if I/O statistics should be collected.
+# $2 - compaction type to use (level=0, universal=1).
+# $3 - number of subcompactions.
+# $4 - number of maximum background compactions.
+#
+function run_manual_compaction_worker {
+  # This runs with a vector memtable and the WAL disabled to load faster.
+  # It is still crash safe and the client can discover where to restart a
+  # load after a crash. I think this is a good way to load.
+  echo "Bulk loading $num_keys random keys for manual compaction."
+
+  log_file_name=$output_dir/benchmark_man_compact_fillrandom_$3.log
+
+  if [ "$2" == "1" ]; then
+    extra_params=$params_univ_compact
+  else
+    extra_params=$params_level_compact
+  fi
+
+  # Make sure that fillrandom uses the same compaction options as compact.
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=fillrandom,stats \
+       --use_existing_db=0 \
+       --disable_auto_compactions=0 \
+       --sync=0 \
+       $extra_params \
+       --threads=$num_threads \
+       --compaction_measure_io_stats=$1 \
+       --compaction_style=$2 \
+       --subcompactions=$3 \
+       --memtablerep=vector \
+       --allow_concurrent_memtable_write=false \
+       --disable_wal=1 \
+       --max_background_compactions=$4 \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $log_file_name"
+
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  eval $cmd
+
+  summarize_result $log_file_namefillrandom_output_file man_compact_fillrandom_$3 fillrandom
+
+  echo "Compacting with $3 subcompactions specified ..."
+
+  log_file_name=$output_dir/benchmark_man_compact_$3.log
+
+  # This is the part we're really interested in. Given that compact benchmark
+  # doesn't output regular statistics then we'll just use the time command to
+  # measure how long this step takes.
+  cmd="{ \
+       time ./db_bench --benchmarks=compact,stats \
+       --use_existing_db=1 \
+       --disable_auto_compactions=0 \
+       --sync=0 \
+       $extra_params \
+       --threads=$num_threads \
+       --compaction_measure_io_stats=$1 \
+       --compaction_style=$2 \
+       --subcompactions=$3 \
+       --max_background_compactions=$4 \
+       ;}
+       2>&1 | tee -a $log_file_name"
+
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  eval $cmd
+
+  # Can't use summarize_result here. One way to analyze the results is to run
+  # "grep real" on the resulting log files.
+}
+
+function run_univ_compaction {
+  # Always ask for I/O statistics to be measured.
+  io_stats=1
+
+  # Values: kCompactionStyleLevel = 0x0, kCompactionStyleUniversal = 0x1.
+  compaction_style=1
+
+  # Define a set of benchmarks.
+  subcompactions=(1 2 4 8 16)
+  max_background_compactions=(16 16 8 4 2)
+
+  i=0
+  total=${#subcompactions[@]}
+
+  # Execute a set of benchmarks to cover variety of scenarios.
+  while [ "$i" -lt "$total" ]
+  do
+    run_manual_compaction_worker $io_stats $compaction_style ${subcompactions[$i]} \
+      ${max_background_compactions[$i]}
+    ((i++))
+  done
+}
+
+function run_fillseq {
+  # This runs with a vector memtable. WAL can be either disabled or enabled
+  # depending on the input parameter (1 for disabled, 0 for enabled). The main
+  # benefit behind disabling WAL is to make loading faster. It is still crash
+  # safe and the client can discover where to restart a load after a crash. I
+  # think this is a good way to load.
+
+  # Make sure that we'll have unique names for all the files so that data won't
+  # be overwritten.
+  if [ $1 == 1 ]; then
+    log_file_name="${output_dir}/benchmark_fillseq.wal_disabled.v${value_size}.log"
+    test_name=fillseq.wal_disabled.v${value_size}
+  else
+    log_file_name="${output_dir}/benchmark_fillseq.wal_enabled.v${value_size}.log"
+    test_name=fillseq.wal_enabled.v${value_size}
+  fi
+
+  # For Leveled compaction hardwire this to 0 so that data that is trivial-moved
+  # to larger levels (3, 4, etc) will be compressed.
+  if [ $compaction_style == "leveled" ]; then
+    comp_arg="--min_level_to_compress=0"
+  elif [ $compaction_style == "universal" ]; then
+    if [ ! -z $UNIVERSAL_ALLOW_TRIVIAL_MOVE ]; then
+      # See GetCompressionFlush where compression_size_percent < 1 means use the default
+      # compression which is needed because trivial moves are enabled
+      comp_arg="--universal_compression_size_percent=-1"
+    else
+      # See GetCompressionFlush where compression_size_percent > 0 means no compression.
+      # Don't set anything here because compression_size_percent is set in univ_const_params
+      comp_arg=""
+    fi
+  else
+    # compaction_style == "blob"
+    comp_arg="--min_level_to_compress=0"
+  fi
+
+  echo "Loading $num_keys keys sequentially"
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=fillseq,stats \
+       $params_fillseq \
+       $comp_arg \
+       --use_existing_db=0 \
+       --sync=0 \
+       --threads=1 \
+       --memtablerep=vector \
+       --allow_concurrent_memtable_write=false \
+       --disable_wal=$1 \
+       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  eval $cmd
+  stop_stats $log_file_name.stats
+
+  # The constant "fillseq" which we pass to db_bench is the benchmark name.
+  summarize_result $log_file_name $test_name fillseq
+}
+
+function run_lsm {
+  # This flushes the memtable and L0 to get the LSM tree into a deterministic
+  # state for read-only tests that will follow.
+  echo "Flush memtable, wait, compact L0, wait"
+  job=$1
+
+  if [ $job = flush_mt_l0 ]; then
+    benchmarks=levelstats,flush,waitforcompaction,compact0,waitforcompaction,memstats,levelstats
+  elif [ $job = waitforcompaction ]; then
+    benchmarks=levelstats,waitforcompaction,memstats,levelstats
+  else
+    echo Job unknown: $job
+    exit $EXIT_NOT_COMPACTION_TEST
+  fi
+
+  log_file_name=$output_dir/benchmark_${job}.log
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=$benchmarks,stats \
+       --use_existing_db=1 \
+       --sync=0 \
+       $params_w \
+       --threads=1 \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  # waitforcompaction can hang with universal (compaction_style=1)
+  # see bug https://github.com/facebook/rocksdb/issues/9275
+  eval $cmd
+  stop_stats $log_file_name.stats
+  # Don't summarize, the log doesn't have the output needed for it
+}
+
+function run_change {
+  output_name=$1
+  grep_name=$2
+  benchmarks=$3
+  echo "Do $num_keys random $output_name"
+  log_file_name="$output_dir/benchmark_${output_name}.t${num_threads}.s${syncval}.log"
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=$benchmarks,stats \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --merge_operator=\"put\" \
+       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  eval $cmd
+  stop_stats $log_file_name.stats
+  summarize_result $log_file_name ${output_name}.t${num_threads}.s${syncval} $grep_name
+}
+
+function run_filluniquerandom {
+  echo "Loading $num_keys unique keys randomly"
+  log_file_name=$output_dir/benchmark_filluniquerandom.log
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=filluniquerandom,stats \
+       --use_existing_db=0 \
+       --sync=0 \
+       $params_w \
+       --threads=1 \
+       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  eval $cmd
+  stop_stats $log_file_name.stats
+  summarize_result $log_file_name filluniquerandom filluniquerandom
+}
+
+function run_readrandom {
+  echo "Reading $num_keys random keys"
+  log_file_name="${output_dir}/benchmark_readrandom.t${num_threads}.log"
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=readrandom,stats \
+       --use_existing_db=1 \
+       $params_w \
+       --threads=$num_threads \
+       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  eval $cmd
+  stop_stats $log_file_name.stats
+  summarize_result $log_file_name readrandom.t${num_threads} readrandom
+}
+
+function run_multireadrandom {
+  echo "Multi-Reading $num_keys random keys"
+  log_file_name="${output_dir}/benchmark_multireadrandom.t${num_threads}.log"
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=multireadrandom,stats \
+       --use_existing_db=1 \
+       --threads=$num_threads \
+       --batch_size=10 \
+       $params_w \
+       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  eval $cmd
+  stop_stats $log_file_name.stats
+  summarize_result $log_file_name multireadrandom.t${num_threads} multireadrandom
+}
+
+function run_readwhile {
+  operation=$1
+  echo "Reading $num_keys random keys while $operation"
+  log_file_name="${output_dir}/benchmark_readwhile${operation}.t${num_threads}.log"
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench --benchmarks=readwhile${operation},stats \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --merge_operator=\"put\" \
+       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  eval $cmd
+  stop_stats $log_file_name.stats
+  summarize_result $log_file_name readwhile${operation}.t${num_threads} readwhile${operation}
+}
+
+function run_rangewhile {
+  operation=$1
+  full_name=$2
+  reverse_arg=$3
+  log_file_name="${output_dir}/benchmark_${full_name}.t${num_threads}.log"
+  time_cmd=$( get_cmd $log_file_name.time )
+  echo "Range scan $num_keys random keys while ${operation} for reverse_iter=${reverse_arg}"
+  cmd="$time_cmd ./db_bench --benchmarks=seekrandomwhile${operation},stats \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --merge_operator=\"put\" \
+       --seek_nexts=$num_nexts_per_seek \
+       --reverse_iterator=$reverse_arg \
+       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee -a $log_file_name"
+  echo $cmd | tee $log_file_name
+  start_stats $log_file_name.stats
+  eval $cmd
+  stop_stats $log_file_name.stats
+  summarize_result $log_file_name ${full_name}.t${num_threads} seekrandomwhile${operation}
+}
+
+function run_range {
+  full_name=$1
+  reverse_arg=$2
+  log_file_name="${output_dir}/benchmark_${full_name}.t${num_threads}.log"
+  time_cmd=$( get_cmd $log_file_name.time )
+  echo "Range scan $num_keys random keys for reverse_iter=${reverse_arg}"
+  cmd="$time_cmd ./db_bench --benchmarks=seekrandom,stats \
+       --use_existing_db=1 \
+       $params_w \
+       --threads=$num_threads \
+       --seek_nexts=$num_nexts_per_seek \
+       --reverse_iterator=$reverse_arg \
+       --seed=$( date +%s ) \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee -a $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  eval $cmd
+  stop_stats $log_file_name.stats
+  summarize_result $log_file_name ${full_name}.t${num_threads} seekrandom
+}
+
+function run_randomtransaction {
+  echo "..."
+  log_file_name=$output_dir/benchmark_randomtransaction.log
+  time_cmd=$( get_cmd $log_file_name.time )
+  cmd="$time_cmd ./db_bench $params_w --benchmarks=randomtransaction,stats \
+       --num=$num_keys \
+       --transaction_db \
+       --threads=5 \
+       --transaction_sets=5 \
+       --report_file=${log_file_name}.r.csv \
+       2>&1 | tee $log_file_name"
+  if [[ "$job_id" != "" ]]; then
+    echo "Job ID: ${job_id}" > $log_file_name
+    echo $cmd | tee -a $log_file_name
+  else
+    echo $cmd | tee $log_file_name
+  fi
+  start_stats $log_file_name.stats
+  eval $cmd
+  stop_stats $log_file_name.stats
+}
+
+function now() {
+  echo `date +"%s"`
+}
+
+
+echo "===== Benchmark ====="
+
+# Run!!!
+IFS=',' read -a jobs <<< $bench_cmd
+# shellcheck disable=SC2068
+for job in ${jobs[@]}; do
+
+  if [ $job != debug ]; then
+    echo "Starting $job (ID: $job_id) at `date`" | tee -a $schedule
+  fi
+
+  start=$(now)
+  if [ $job = bulkload ]; then
+    run_bulkload
+  elif [ $job = flush_mt_l0 ]; then
+    run_lsm flush_mt_l0
+  elif [ $job = waitforcompaction ]; then
+    run_lsm waitforcompaction
+  elif [ $job = fillseq_disable_wal ]; then
+    run_fillseq 1
+  elif [ $job = fillseq_enable_wal ]; then
+    run_fillseq 0
+  elif [ $job = overwrite ]; then
+    run_change overwrite overwrite overwrite
+  elif [ $job = overwritesome ]; then
+    # This uses a different name for overwrite results so it can be run twice in one benchmark run.
+    run_change overwritesome overwrite overwrite
+  elif [ $job = overwriteandwait ]; then
+    run_change overwriteandwait overwrite overwrite,waitforcompaction
+  elif [ $job = updaterandom ]; then
+    run_change updaterandom updaterandom updaterandom
+  elif [ $job = mergerandom ]; then
+    run_change mergerandom mergerandom mergerandom
+  elif [ $job = filluniquerandom ]; then
+    run_filluniquerandom
+  elif [ $job = readrandom ]; then
+    run_readrandom
+  elif [ $job = multireadrandom ]; then
+    run_multireadrandom
+  elif [ $job = fwdrange ]; then
+    run_range $job false
+  elif [ $job = revrange ]; then
+    run_range $job true
+  elif [ $job = readwhilewriting ]; then
+    run_readwhile writing
+  elif [ $job = readwhilemerging ]; then
+    run_readwhile merging
+  elif [ $job = fwdrangewhilewriting ]; then
+    run_rangewhile writing $job false
+  elif [ $job = revrangewhilewriting ]; then
+    run_rangewhile writing $job true
+  elif [ $job = fwdrangewhilemerging ]; then
+    run_rangewhile merging $job false
+  elif [ $job = revrangewhilemerging ]; then
+    run_rangewhile merging $job true
+  elif [ $job = randomtransaction ]; then
+    run_randomtransaction
+  elif [ $job = universal_compaction ]; then
+    run_univ_compaction
+  elif [ $job = debug ]; then
+    num_keys=1000; # debug
+    echo "Setting num_keys to $num_keys"
+  else
+    echo "unknown job $job"
+    exit $EXIT_UNKNOWN_JOB
+  fi
+  end=$(now)
+
+  if [ $job != debug ]; then
+    echo "Completed $job (ID: $job_id) in $((end-start)) seconds" | tee -a $schedule
+  fi
+
+  echo -e $tsv_header
+  tail -1 $report
+
+done
diff --git a/src/rocksdb/tools/benchmark_ci.py b/src/rocksdb/tools/benchmark_ci.py
new file mode 100755
index 000000000..de9f69cf9
--- /dev/null
+++ b/src/rocksdb/tools/benchmark_ci.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+#  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+"""Run benchmark_compare.sh on the most recent build, for CI
+"""
+
+import argparse
+import glob
+import logging
+import os
+import re
+import shutil
+import subprocess
+import sys
+
+logging.basicConfig(level=logging.INFO)
+
+
+class Config:
+    def __init__(self, args):
+        self.version_file = "./include/rocksdb/version.h"
+        self.data_dir = os.path.expanduser(f"{args.db_dir}")
+        self.results_dir = os.path.expanduser(f"{args.output_dir}")
+        self.benchmark_script = f"{os.getcwd()}/tools/benchmark_compare.sh"
+        self.benchmark_cwd = f"{os.getcwd()}/tools"
+
+    benchmark_env_keys = [
+        "LD_LIBRARY_PATH",
+        "NUM_KEYS",
+        "KEY_SIZE",
+        "VALUE_SIZE",
+        "CACHE_SIZE_MB",
+        "DURATION_RW",
+        "DURATION_RO",
+        "MB_WRITE_PER_SEC",
+        "NUM_THREADS",
+        "COMPRESSION_TYPE",
+        "MIN_LEVEL_TO_COMPRESS",
+        "WRITE_BUFFER_SIZE_MB",
+        "TARGET_FILE_SIZE_BASE_MB",
+        "MAX_BYTES_FOR_LEVEL_BASE_MB",
+        "MAX_BACKGROUND_JOBS",
+        "CACHE_INDEX_AND_FILTER_BLOCKS",
+        "USE_O_DIRECT",
+        "STATS_INTERVAL_SECONDS",
+        "SUBCOMPACTIONS",
+        "COMPACTION_STYLE",
+        "CI_TESTS_ONLY",
+    ]
+
+
+def read_version(config):
+    majorRegex = re.compile(r"#define ROCKSDB_MAJOR\s([0-9]+)")
+    minorRegex = re.compile(r"#define ROCKSDB_MINOR\s([0-9]+)")
+    patchRegex = re.compile(r"#define ROCKSDB_PATCH\s([0-9]+)")
+    with open(config.version_file, "r") as reader:
+        major = None
+        minor = None
+        patch = None
+        for line in reader:
+            if major is None:
+                major = majorRegex.match(line)
+            elif minor is None:
+                minor = minorRegex.match(line)
+            elif patch is None:
+                patch = patchRegex.match(line)
+
+            if patch is not None:
+                break
+
+        if patch is not None:
+            return (major.group(1), minor.group(1), patch.group(1))
+
+    # Didn't complete a match
+    return None
+
+
+def prepare(version_str, config):
+    old_files = glob.glob(f"{config.results_dir}/{version_str}/**", recursive=True)
+    for f in old_files:
+        if os.path.isfile(f):
+            logging.debug(f"remove file {f}")
+            os.remove(f)
+    for f in old_files:
+        if os.path.isdir(f):
+            logging.debug(f"remove dir {f}")
+            os.rmdir(f)
+
+    db_bench_vers = f"{config.benchmark_cwd}/db_bench.{version_str}"
+
+    # Create a symlink to the db_bench executable
+    os.symlink(f"{os.getcwd()}/db_bench", db_bench_vers)
+
+
+def results(version_str, config):
+    # Copy the report TSV file back to the top level of results
+    shutil.copyfile(
+        f"{config.results_dir}/{version_str}/report.tsv",
+        f"{config.results_dir}/report.tsv",
+    )
+
+
+def cleanup(version_str, config):
+    # Remove the symlink to the db_bench executable
+    db_bench_vers = f"{config.benchmark_cwd}/db_bench.{version_str}"
+    os.remove(db_bench_vers)
+
+
+def get_benchmark_env():
+    env = []
+    for key in Config.benchmark_env_keys:
+        value = os.getenv(key)
+        if value is not None:
+            env.append((key, value))
+    return env
+
+
+def main():
+    """Tool for running benchmark_compare.sh on the most recent build, for CI
+    This tool will
+
+    (1) Work out the current version of RocksDB
+    (2) Run benchmark_compare with that version alone
+    """
+
+    parser = argparse.ArgumentParser(
+        description="benchmark_compare.sh Python wrapper for CI."
+    )
+
+    # --tsvfile is the name of the file to read results from
+    # --esdocument is the ElasticSearch document to push these results into
+    #
+    parser.add_argument(
+        "--db_dir",
+        default="~/tmp/rocksdb-benchmark-datadir",
+        help="Database directory hierarchy to use",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default="~/tmp/benchmark-results",
+        help="Benchmark output goes here",
+    )
+    parser.add_argument(
+        "--num_keys",
+        default="10000",
+        help="Number of database keys to use in benchmark test(s) (determines size of test job)",
+    )
+    args = parser.parse_args()
+    config = Config(args)
+
+    version = read_version(config)
+    if version is None:
+        raise Exception(f"Could not read RocksDB version from {config.version_file}")
+    version_str = f"{version[0]}.{version[1]}.{version[2]}"
+    logging.info(f"Run benchmark_ci with RocksDB version {version_str}")
+
+    prepare(version_str, config)
+
+    try:
+        env = get_benchmark_env()
+        env.append(("NUM_KEYS", args.num_keys))
+        cmd = [
+            config.benchmark_script,
+            config.data_dir,
+            config.results_dir,
+            version_str,
+        ]
+        logging.info(f"Run {cmd} env={env} cwd={config.benchmark_cwd}")
+        subprocess.run(cmd, env=dict(env), cwd=config.benchmark_cwd)
+
+        results(version_str, config)
+    finally:
+        cleanup(version_str, config)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/rocksdb/tools/benchmark_compare.sh b/src/rocksdb/tools/benchmark_compare.sh
new file mode 100755
index 000000000..ef7990279
--- /dev/null
+++ b/src/rocksdb/tools/benchmark_compare.sh
@@ -0,0 +1,342 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# REQUIRE: db_bench binary exists in the current directory
+
+dbdir=$1
+odir=$2
+
+# Size Constants
+K=1024
+M=$((1024 * K))
+
+# Dynamic loader configuration
+ld_library_path=${LD_LIBRARY_PATH:-""}
+
+# Benchmark configuration
+duration_rw=${DURATION_RW:-65}
+duration_ro=${DURATION_RO:-65}
+num_keys=${NUM_KEYS:-1000000}
+num_threads=${NUM_THREADS:-16}
+key_size=${KEY_SIZE:-20}
+value_size=${VALUE_SIZE:-400}
+mb_write_per_sec=${MB_WRITE_PER_SEC:-2}
+ci_tests_only=${CI_TESTS_ONLY:-"false"}
+
+# RocksDB configuration
+compression_type=${COMPRESSION_TYPE:-lz4}
+subcompactions=${SUBCOMPACTIONS:-1}
+write_buffer_size_mb=${WRITE_BUFFER_SIZE_MB:-32}
+target_file_size_base_mb=${TARGET_FILE_SIZE_BASE_MB:-32}
+max_bytes_for_level_base_mb=${MAX_BYTES_FOR_LEVEL_BASE_MB:-128}
+max_background_jobs=${MAX_BACKGROUND_JOBS:-8}
+stats_interval_seconds=${STATS_INTERVAL_SECONDS:-20}
+cache_index_and_filter_blocks=${CACHE_INDEX_AND_FILTER_BLOCKS:-0}
+# USE_O_DIRECT doesn't need a default
+bytes_per_sync=${BYTES_PER_SYNC:-$(( 1 * M ))}
+# CACHE_SIZE_MB doesn't need a default
+min_level_to_compress=${MIN_LEVEL_TO_COMPRESS:-"-1"}
+
+compaction_style=${COMPACTION_STYLE:-leveled}
+if [ "$compaction_style" = "leveled" ]; then
+  echo Use leveled compaction
+elif [ "$compaction_style" = "universal" ]; then
+  echo Use universal compaction
+elif [ "$compaction_style" = "blob" ]; then
+  echo Use blob compaction
+else
+  echo COMPACTION_STYLE is :: "$COMPACTION_STYLE" :: and must be one of leveled, universal, blob
+  exit 1
+fi
+
+# Leveled compaction configuration
+level0_file_num_compaction_trigger=${LEVEL0_FILE_NUM_COMPACTION_TRIGGER:-4}
+level0_slowdown_writes_trigger=${LEVEL0_SLOWDOWN_WRITES_TRIGGER:-20}
+level0_stop_writes_trigger=${LEVEL0_STOP_WRITES_TRIGGER:-30}
+per_level_fanout=${PER_LEVEL_FANOUT:-8}
+
+# Universal compaction configuration
+universal_min_merge_width=${UNIVERSAL_MIN_MERGE_WIDTH:-2}
+universal_max_merge_width=${UNIVERSAL_MAX_MERGE_WIDTH:-20}
+universal_size_ratio=${UNIVERSAL_SIZE_RATIO:-1}
+universal_max_size_amp=${UNIVERSAL_MAX_SIZE_AMP:-200}
+universal_compression_size_percent=${UNIVERSAL_COMPRESSION_SIZE_PERCENT:-"-1"}
+
+# Integrated BlobDB configuration
+
+min_blob_size=${MIN_BLOB_SIZE:-0}
+blob_file_size=${BLOB_FILE_SIZE:-$(( 256 * M ))}
+blob_compression_type=${BLOB_COMPRESSION_TYPE:-${compression_type}}
+blob_gc_age_cutoff=${BLOB_GC_AGE_CUTOFF:-"0.25"}
+blob_gc_force_threshold=${BLOB_GC_FORCE_THRESHOLD:-1}
+
+# Arguments for dynamic loading
+base_args=( LD_LIBRARY_PATH="$ld_library_path" )
+
+# Arguments used for all tests
+base_args+=( NUM_KEYS="$num_keys" )
+base_args+=( NUM_THREADS="$num_threads" )
+base_args+=( KEY_SIZE="$key_size" )
+base_args+=( VALUE_SIZE="$value_size" )
+
+base_args+=( SUBCOMPACTIONS="$subcompactions" )
+base_args+=( COMPRESSION_TYPE="$compression_type" )
+base_args+=( WRITE_BUFFER_SIZE_MB="$write_buffer_size_mb" )
+base_args+=( TARGET_FILE_SIZE_BASE_MB="$target_file_size_base_mb" )
+base_args+=( MAX_BYTES_FOR_LEVEL_BASE_MB="$max_bytes_for_level_base_mb" )
+base_args+=( MAX_BACKGROUND_JOBS="$max_background_jobs" )
+base_args+=( STATS_INTERVAL_SECONDS="$stats_interval_seconds" )
+base_args+=( CACHE_INDEX_AND_FILTER_BLOCKS="$cache_index_and_filter_blocks" )
+base_args+=( COMPACTION_STYLE="$compaction_style" )
+base_args+=( BYTES_PER_SYNC="$bytes_per_sync" )
+
+if [ -n "$USE_O_DIRECT" ]; then
+  base_args+=( USE_O_DIRECT=1 )
+fi
+
+if [ -n "$NUMA" ]; then
+  base_args+=( NUMACTL=1 )
+fi
+
+if [ -n "$CACHE_SIZE_MB" ]; then
+  cacheb=$(( CACHE_SIZE_MB * M ))
+  base_args+=( CACHE_SIZE="$cacheb" )
+fi
+
+if [ "$compaction_style" == "leveled" ]; then
+  base_args+=( LEVEL0_FILE_NUM_COMPACTION_TRIGGER="$level0_file_num_compaction_trigger" )
+  base_args+=( LEVEL0_SLOWDOWN_WRITES_TRIGGER="$level0_slowdown_writes_trigger" )
+  base_args+=( LEVEL0_STOP_WRITES_TRIGGER="$level0_stop_writes_trigger" )
+  base_args+=( PER_LEVEL_FANOUT="$per_level_fanout" )
+elif [ "$compaction_style" == "universal" ]; then
+  base_args+=( LEVEL0_FILE_NUM_COMPACTION_TRIGGER="$level0_file_num_compaction_trigger" )
+  base_args+=( LEVEL0_SLOWDOWN_WRITES_TRIGGER="$level0_slowdown_writes_trigger" )
+  base_args+=( LEVEL0_STOP_WRITES_TRIGGER="$level0_stop_writes_trigger" )
+  base_args+=( UNIVERSAL_MIN_MERGE_WIDTH="$universal_min_merge_width" )
+  base_args+=( UNIVERSAL_MAX_MERGE_WIDTH="$universal_max_merge_width" )
+  base_args+=( UNIVERSAL_SIZE_RATIO="$universal_size_ratio" )
+  base_args+=( UNIVERSAL_MAX_SIZE_AMP="$universal_max_size_amp" )
+  if [ -n "$UNIVERSAL_ALLOW_TRIVIAL_MOVE" ]; then
+    base_args+=( UNIVERSAL_ALLOW_TRIVIAL_MOVE=1 )
+  fi
+else
+  # Inherit settings for leveled because index uses leveled LSM
+  base_args+=( LEVEL0_FILE_NUM_COMPACTION_TRIGGER="$level0_file_num_compaction_trigger" )
+  base_args+=( LEVEL0_SLOWDOWN_WRITES_TRIGGER="$level0_slowdown_writes_trigger" )
+  base_args+=( LEVEL0_STOP_WRITES_TRIGGER="$level0_stop_writes_trigger" )
+  base_args+=( PER_LEVEL_FANOUT="$per_level_fanout" )
+  # Then add BlobDB specific settings
+  base_args+=( MIN_BLOB_SIZE="$min_blob_size" )
+  base_args+=( BLOB_FILE_SIZE="$blob_file_size" )
+  base_args+=( BLOB_COMPRESSION_TYPE="$blob_compression_type" )
+  base_args+=( BLOB_GC_AGE_CUTOFF="$blob_gc_age_cutoff" )
+  base_args+=( BLOB_GC_FORCE_THRESHOLD="$blob_gc_force_threshold" )
+fi
+
+function usage {
+  echo "usage: benchmark_compare.sh db_dir output_dir version+"
+  echo -e "\tdb_dir\t\tcreate RocksDB database in this directory"
+  echo -e "\toutput_dir\twrite output from performance tests in this directory"
+  echo -e "\tversion+\tspace separated sequence of RocksDB versions to test."
+  echo -e "\nThis expects that db_bench.\$version exists in \$PWD for each version in the sequence."
+  echo -e "An example value for version+ is 6.23.0 6.24.0"
+  echo ""
+  echo -e "Environment variables for options"
+  echo -e "\tNUM_KEYS\t\t\tnumber of keys to load"
+  echo -e "\tKEY_SIZE\t\t\tsize of key"
+  echo -e "\tVALUE_SIZE\t\t\tsize of value"
+  echo -e "\tCACHE_SIZE_MB\t\t\tsize of block cache in MB"
+  echo -e "\tDURATION_RW\t\t\tnumber of seconds for which each test runs, except for read-only tests"
+  echo -e "\tDURATION_RO\t\t\tnumber of seconds for which each read-only test runs"
+  echo -e "\tMB_WRITE_PER_SEC\t\trate limit for writer that runs concurrent with queries for some tests"
+  echo -e "\tNUM_THREADS\t\t\tnumber of user threads"
+  echo -e "\tCOMPRESSION_TYPE\t\tcompression type (zstd, lz4, none, etc)"
+  echo -e "\tMIN_LEVEL_TO_COMPRESS\t\tmin_level_to_compress for leveled"
+  echo -e "\tWRITE_BUFFER_SIZE_MB\t\tsize of write buffer in MB"
+  echo -e "\tTARGET_FILE_SIZE_BASE_MB\tvalue for target_file_size_base in MB"
+  echo -e "\tMAX_BYTES_FOR_LEVEL_BASE_MB\tvalue for max_bytes_for_level_base in MB"
+  echo -e "\tMAX_BACKGROUND_JOBS\t\tvalue for max_background_jobs"
+  echo -e "\tCACHE_INDEX_AND_FILTER_BLOCKS\tvalue for cache_index_and_filter_blocks"
+  echo -e "\tUSE_O_DIRECT\t\t\tUse O_DIRECT for user reads and compaction"
+  echo -e "\tBYTES_PER_SYNC\t\t\tValue for bytes_per_sync"
+  echo -e "\tSTATS_INTERVAL_SECONDS\t\tvalue for stats_interval_seconds"
+  echo -e "\tSUBCOMPACTIONS\t\t\tvalue for subcompactions"
+  echo -e "\tCOMPACTION_STYLE\t\tCompaction style to use, one of: leveled, universal, blob"
+  echo -e "\tCI_TESTS_ONLY\t\tRun a subset of tests tailored to a CI regression job, one of: true, false (default)"
+  echo ""
+  echo -e "\tOptions specific to leveled compaction:"
+  echo -e "\t\tLEVEL0_FILE_NUM_COMPACTION_TRIGGER\tvalue for level0_file_num_compaction_trigger"
+  echo -e "\t\tLEVEL0_SLOWDOWN_WRITES_TRIGGER\t\tvalue for level0_slowdown_writes_trigger"
+  echo -e "\t\tLEVEL0_STOP_WRITES_TRIGGER\t\tvalue for level0_stop_writes_trigger"
+  echo -e "\t\tPER_LEVEL_FANOUT\t\t\tvalue for max_bytes_for_level_multiplier"
+  echo ""
+  echo -e "\tOptions specific to universal compaction:"
+  echo -e "\t\tSee LEVEL0_*_TRIGGER above"
+  echo -e "\t\tUNIVERSAL_MIN_MERGE_WIDTH\t\tvalue of min_merge_width option for universal"
+  echo -e "\t\tUNIVERSAL_MAX_MERGE_WIDTH\t\tvalue of min_merge_width option for universal"
+  echo -e "\t\tUNIVERSAL_SIZE_RATIO\t\t\tvalue of size_ratio option for universal"
+  echo -e "\t\tUNIVERSAL_MAX_SIZE_AMP\t\t\tmax_size_amplification_percent for universal"
+  echo -e "\t\tUNIVERSAL_ALLOW_TRIVIAL_MOVE\t\tSet allow_trivial_move to true for universal, default is false"
+  echo -e "\t\tUNIVERSAL_COMPRESSION_SIZE_PERCENT\tpercentage of LSM tree that should be compressed"
+  echo ""
+  echo -e "\tOptions for integrated BlobDB:"
+  echo -e "\t\tMIN_BLOB_SIZE\t\t\t\tvalue for min_blob_size"
+  echo -e "\t\tBLOB_FILE_SIZE\t\t\t\tvalue for blob_file_size"
+  echo -e "\t\tBLOB_COMPRESSION_TYPE\t\t\tvalue for blob_compression_type"
+  echo -e "\t\tBLOB_GC_AGE_CUTOFF\t\t\tvalue for blog_garbage_collection_age_cutoff"
+  echo -e "\t\tBLOB_GC_FORCE_THRESHOLD\t\t\tvalue for blog_garbage_collection_force_threshold"
+}
+
+function dump_env {
+  echo "Base args" > "$odir"/args
+  echo "${base_args[@]}" | tr ' ' '\n' >> "$odir"/args
+
+  echo -e "\nOther args" >> "$odir"/args
+  echo -e "dbdir\t$dbdir" >> "$odir"/args
+  echo -e "duration_rw\t$duration_rw" >> "$odir"/args
+  echo -e "duration_ro\t$duration_ro" >> "$odir"/args
+  echo -e "per_level_fanout\t$per_level_fanout" >> "$odir"/args
+
+  echo -e "\nargs_load:" >> "$odir"/args
+  echo "${args_load[@]}" | tr ' ' '\n' >> "$odir"/args
+  echo -e "\nargs_nolim:" >> "$odir"/args
+  echo "${args_nolim[@]}" | tr ' ' '\n' >> "$odir"/args
+  echo -e "\nargs_lim:" >> "$odir"/args
+  echo "${args_lim[@]}" | tr ' ' '\n' >> "$odir"/args
+}
+
+if [ $# -lt 3 ]; then
+  usage
+  echo
+  echo "Need at least 3 arguments"
+  exit 1
+fi
+
+shift 2
+
+mkdir -p "$odir"
+
+echo Test versions: "$@"
+echo Test versions: "$@" >> "$odir"/args
+
+for v in "$@" ; do
+  my_odir="$odir"/"$v"
+
+  if [ -d "$my_odir" ]; then
+    echo Exiting because the output directory exists: "$my_odir"
+    exit 1
+  fi
+
+  args_common=("${base_args[@]}")
+
+  args_common+=( OUTPUT_DIR="$my_odir" DB_DIR="$dbdir" WAL_DIR="$dbdir" DB_BENCH_NO_SYNC=1 )
+
+  if [ "$compaction_style" == "leveled" ]; then
+    args_common+=( MIN_LEVEL_TO_COMPRESS="$min_level_to_compress" )
+  elif [ "$compaction_style" == "universal" ]; then
+    args_common+=( UNIVERSAL=1 COMPRESSION_SIZE_PERCENT="$universal_compression_size_percent" )
+  else
+    args_common+=( MIN_LEVEL_TO_COMPRESS="$min_level_to_compress" )
+  fi
+
+  args_load=("${args_common[@]}")
+
+  args_nolim=("${args_common[@]}")
+
+  args_lim=("${args_nolim[@]}")
+  args_lim+=( MB_WRITE_PER_SEC="$mb_write_per_sec" )
+
+  dump_env
+
+  echo Run benchmark for "$v" at "$( date )" with results at "$my_odir"
+  rm -f db_bench
+  echo ln -s db_bench."$v" db_bench
+  ln -s db_bench."$v" db_bench
+
+  find "$dbdir" -type f -exec rm \{\} \;
+
+  # Load in key order
+  echo env "${args_load[@]}" bash ./benchmark.sh fillseq_disable_wal
+  env -i "${args_load[@]}" bash ./benchmark.sh fillseq_disable_wal
+
+  # Read-only tests. The LSM tree shape is in a deterministic state if trivial move
+  # was used during the load.
+
+  # Add revrange with a fixed duration and hardwired number of keys and threads to give
+  # compaction debt leftover from fillseq a chance at being removed. Not using waitforcompaction
+  # here because it isn't supported on older db_bench versions.
+  env -i "${args_nolim[@]}" DURATION=300 NUM_KEYS=100 NUM_THREADS=1 bash ./benchmark.sh revrange
+  env -i "${args_nolim[@]}" DURATION="$duration_ro" bash ./benchmark.sh readrandom
+
+  # Skipped for CI - a single essentail readrandom is enough to set up for other tests
+  if [ "$ci_tests_only" != "true" ]; then
+    env -i "${args_nolim[@]}" DURATION="$duration_ro" bash ./benchmark.sh fwdrange
+    env -i "${args_lim[@]}"   DURATION="$duration_ro" bash ./benchmark.sh multireadrandom --multiread_batched
+  else
+    echo "CI_TESTS_ONLY is set, skipping optional read steps."
+  fi
+
+  # Write 10% of the keys. The goal is to randomize keys prior to Lmax
+  p10=$( echo "$num_keys" "$num_threads" | awk '{ printf "%.0f", $1 / $2 / 10.0 }' )
+  env -i "${args_nolim[@]}" WRITES="$p10"        bash ./benchmark.sh overwritesome
+
+  if [ "$compaction_style" == "leveled" ]; then
+    # These are not supported by older versions
+    # Flush memtable & L0 to get LSM tree into deterministic state
+    env -i "${args_nolim[@]}"                  bash ./benchmark.sh flush_mt_l0
+  elif [ "$compaction_style" == "universal" ]; then
+    # For universal don't compact L0 as can have too many sorted runs
+    # waitforcompaction can hang, see https://github.com/facebook/rocksdb/issues/9275
+    # While this is disabled the test that follows will have more variance from compaction debt.
+    # env -i "${args_nolim[@]}"                    bash ./benchmark.sh waitforcompaction
+    echo TODO enable when waitforcompaction hang is fixed
+  else
+    # These are not supported by older versions
+    # Flush memtable & L0 to get LSM tree into deterministic state
+    env -i "${args_nolim[@]}"                  bash ./benchmark.sh flush_mt_l0
+  fi
+
+  # Read-mostly tests with a rate-limited writer
+  env -i "${args_lim[@]}" DURATION="$duration_rw" bash ./benchmark.sh revrangewhilewriting
+  env -i "${args_lim[@]}" DURATION="$duration_rw" bash ./benchmark.sh fwdrangewhilewriting
+  env -i "${args_lim[@]}" DURATION="$duration_rw" bash ./benchmark.sh readwhilewriting
+
+  # Write-only tests
+
+  # This creates much compaction debt which will be a problem for tests added after it.
+  # Also, the compaction stats measured at test end can underestimate write-amp depending
+  # on how much compaction debt is allowed.
+  if [ "$compaction_style" == "leveled" ] && ./db_bench --benchmarks=waitforcompaction ; then
+    # Use waitforcompaction to get more accurate write-amp measurement
+    env -i "${args_nolim[@]}" DURATION="$duration_rw" bash ./benchmark.sh overwriteandwait
+  else
+    # waitforcompaction hangs with universal, see https://github.com/facebook/rocksdb/issues/9275
+    env -i "${args_nolim[@]}" DURATION="$duration_rw" bash ./benchmark.sh overwrite
+  fi
+
+  cp "$dbdir"/LOG* "$my_odir"
+  gzip -9 "$my_odir"/LOG*
+
+done
+
+# Generate a file that groups lines from the same test for all versions
+basev=$1
+nlines=$( awk '/^ops_sec/,/END/' "$odir"/"$basev"/report.tsv | grep -v ops_sec | wc -l )
+hline=$( awk '/^ops_sec/ { print NR }' "$odir"/"$basev"/report.tsv )
+sline=$(( hline + 1 ))
+eline=$(( sline + nlines - 1 ))
+
+sum_file="$odir"/summary.tsv
+
+for v in "$@" ; do
+  echo "$odir"/"$v"/report.tsv
+done >> "$sum_file"
+echo >> "$sum_file"
+
+for x in $( seq "$sline" "$eline" ); do
+  awk '{ if (NR == lno) { print $0 } }' lno="$hline" "$odir"/"$basev"/report.tsv >> "$sum_file"
+  for v in "$@" ; do
+    r="$odir"/"$v"/report.tsv
+    awk '{ if (NR == lno) { print $0 } }' lno="$x" "$r" >> "$sum_file"
+  done
+echo >> "$sum_file"
+done
diff --git a/src/rocksdb/tools/benchmark_leveldb.sh b/src/rocksdb/tools/benchmark_leveldb.sh
new file mode 100755
index 000000000..069b53a9f
--- /dev/null
+++ b/src/rocksdb/tools/benchmark_leveldb.sh
@@ -0,0 +1,187 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# REQUIRE: db_bench binary exists in the current directory
+#
+# This should be used with the LevelDB fork listed here to use additional test options.
+# For more details on the changes see the blog post listed below.
+#   https://github.com/mdcallag/leveldb-1
+#   http://smalldatum.blogspot.com/2015/04/comparing-leveldb-and-rocksdb-take-2.html
+
+if [ $# -ne 1 ]; then
+  echo -n "./benchmark.sh [fillseq/overwrite/readrandom/readwhilewriting]"
+  exit 0
+fi
+
+# size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+
+if [ -z $DB_DIR ]; then
+  echo "DB_DIR is not defined"
+  exit 0
+fi
+
+output_dir=${OUTPUT_DIR:-/tmp/}
+if [ ! -d $output_dir ]; then
+  mkdir -p $output_dir
+fi
+
+# all multithreaded tests run with sync=1 unless
+# $DB_BENCH_NO_SYNC is defined
+syncval="1"
+if [ ! -z $DB_BENCH_NO_SYNC ]; then
+  echo "Turning sync off for all multithreaded tests"
+  syncval="0";
+fi
+
+num_threads=${NUM_THREADS:-16}
+# Only for *whilewriting, *whilemerging
+writes_per_second=${WRITES_PER_SECOND:-$((10 * K))}
+cache_size=${CACHE_SIZE:-$((1 * G))}
+
+num_keys=${NUM_KEYS:-$((1 * G))}
+key_size=20
+value_size=${VALUE_SIZE:-400}
+block_size=${BLOCK_SIZE:-4096}
+
+const_params="
+  --db=$DB_DIR \
+  \
+  --num=$num_keys \
+  --value_size=$value_size \
+  --cache_size=$cache_size \
+  --compression_ratio=0.5 \
+  \
+  --write_buffer_size=$((2 * M)) \
+  \
+  --histogram=1 \
+  \
+  --bloom_bits=10 \
+  --open_files=$((20 * K))"
+
+params_w="$const_params "
+
+function summarize_result {
+  test_out=$1
+  test_name=$2
+  bench_name=$3
+  nthr=$4
+
+  usecs_op=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $3 }' )
+  mb_sec=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $5 }' )
+  ops=$( grep "^Count:" $test_out | awk '{ print $2 }' )
+  ops_sec=$( echo "scale=0; (1000000.0 * $nthr) / $usecs_op" | bc )
+  avg=$( grep "^Count:" $test_out | awk '{ printf "%.1f", $4 }' )
+  p50=$( grep "^Min:" $test_out | awk '{ printf "%.1f", $4 }' )
+  echo -e "$ops_sec\t$mb_sec\t$usecs_op\t$avg\t$p50\t$test_name" \
+    >> $output_dir/report.txt
+}
+
+function run_fillseq {
+  # This runs with a vector memtable and the WAL disabled to load faster. It is still crash safe and the
+  # client can discover where to restart a load after a crash. I think this is a good way to load.
+  echo "Loading $num_keys keys sequentially"
+  cmd="./db_bench --benchmarks=fillseq \
+       --use_existing_db=0 \
+       --sync=0 \
+       $params_w \
+       --threads=1 \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/benchmark_fillseq.v${value_size}.log"
+  echo $cmd | tee $output_dir/benchmark_fillseq.v${value_size}.log
+  eval $cmd
+  summarize_result $output_dir/benchmark_fillseq.v${value_size}.log fillseq.v${value_size} fillseq 1
+}
+
+function run_change {
+  operation=$1
+  echo "Do $num_keys random $operation"
+  out_name="benchmark_${operation}.t${num_threads}.s${syncval}.log"
+  cmd="./db_bench --benchmarks=$operation \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} ${operation}.t${num_threads}.s${syncval} $operation $num_threads
+}
+
+function run_readrandom {
+  echo "Reading $num_keys random keys"
+  out_name="benchmark_readrandom.t${num_threads}.log"
+  cmd="./db_bench --benchmarks=readrandom \
+       --use_existing_db=1 \
+       $params_w \
+       --threads=$num_threads \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} readrandom.t${num_threads} readrandom $num_threads
+}
+
+function run_readwhile {
+  operation=$1
+  echo "Reading $num_keys random keys while $operation"
+  out_name="benchmark_readwhile${operation}.t${num_threads}.log"
+  cmd="./db_bench --benchmarks=readwhile${operation} \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --writes_per_second=$writes_per_second \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} readwhile${operation}.t${num_threads} readwhile${operation} $num_threads
+}
+
+function now() {
+  echo `date +"%s"`
+}
+
+report="$output_dir/report.txt"
+schedule="$output_dir/schedule.txt"
+
+echo "===== Benchmark ====="
+
+# Run!!!
+IFS=',' read -a jobs <<< $1
+# shellcheck disable=SC2068
+for job in ${jobs[@]}; do
+
+  if [ $job != debug ]; then
+    echo "Start $job at `date`" | tee -a $schedule
+  fi
+
+  start=$(now)
+  if [ $job = fillseq ]; then
+    run_fillseq
+  elif [ $job = overwrite ]; then
+    run_change overwrite
+  elif [ $job = readrandom ]; then
+    run_readrandom
+  elif [ $job = readwhilewriting ]; then
+    run_readwhile writing
+  elif [ $job = debug ]; then
+    num_keys=1000; # debug
+    echo "Setting num_keys to $num_keys"
+  else
+    echo "unknown job $job"
+    exit
+  fi
+  end=$(now)
+
+  if [ $job != debug ]; then
+    echo "Complete $job in $((end-start)) seconds" | tee -a $schedule
+  fi
+
+  echo -e "ops/sec\tmb/sec\tusec/op\tavg\tp50\tTest"
+  tail -1 $output_dir/report.txt
+
+done
diff --git a/src/rocksdb/tools/blob_dump.cc b/src/rocksdb/tools/blob_dump.cc
new file mode 100644
index 000000000..1f75eb20d
--- /dev/null
+++ b/src/rocksdb/tools/blob_dump.cc
@@ -0,0 +1,111 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include <getopt.h>
+
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+#include "utilities/blob_db/blob_dump_tool.h"
+
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::blob_db::BlobDumpTool;
+
+int main(int argc, char** argv) {
+  using DisplayType = BlobDumpTool::DisplayType;
+  const std::unordered_map<std::string, DisplayType> display_types = {
+      {"none", DisplayType::kNone},
+      {"raw", DisplayType::kRaw},
+      {"hex", DisplayType::kHex},
+      {"detail", DisplayType::kDetail},
+  };
+  const struct option options[] = {
+      {"help", no_argument, nullptr, 'h'},
+      {"file", required_argument, nullptr, 'f'},
+      {"show_key", optional_argument, nullptr, 'k'},
+      {"show_blob", optional_argument, nullptr, 'b'},
+      {"show_uncompressed_blob", optional_argument, nullptr, 'r'},
+      {"show_summary", optional_argument, nullptr, 's'},
+  };
+  DisplayType show_key = DisplayType::kRaw;
+  DisplayType show_blob = DisplayType::kNone;
+  DisplayType show_uncompressed_blob = DisplayType::kNone;
+  bool show_summary = false;
+  std::string file;
+  while (true) {
+    int c = getopt_long(argc, argv, "hk::b::f:", options, nullptr);
+    if (c < 0) {
+      break;
+    }
+    std::string arg_str(optarg ? optarg : "");
+    switch (c) {
+      case 'h':
+        fprintf(stdout,
+                "Usage: blob_dump --file=filename "
+                "[--show_key[=none|raw|hex|detail]] "
+                "[--show_blob[=none|raw|hex|detail]] "
+                "[--show_uncompressed_blob[=none|raw|hex|detail]] "
+                "[--show_summary]\n");
+        return 0;
+      case 'f':
+        file = optarg;
+        break;
+      case 'k':
+        if (optarg) {
+          if (display_types.count(arg_str) == 0) {
+            fprintf(stderr, "Unrecognized key display type.\n");
+            return -1;
+          }
+          show_key = display_types.at(arg_str);
+        }
+        break;
+      case 'b':
+        if (optarg) {
+          if (display_types.count(arg_str) == 0) {
+            fprintf(stderr, "Unrecognized blob display type.\n");
+            return -1;
+          }
+          show_blob = display_types.at(arg_str);
+        } else {
+          show_blob = DisplayType::kHex;
+        }
+        break;
+      case 'r':
+        if (optarg) {
+          if (display_types.count(arg_str) == 0) {
+            fprintf(stderr, "Unrecognized blob display type.\n");
+            return -1;
+          }
+          show_uncompressed_blob = display_types.at(arg_str);
+        } else {
+          show_uncompressed_blob = DisplayType::kHex;
+        }
+        break;
+      case 's':
+        show_summary = true;
+        break;
+      default:
+        fprintf(stderr, "Unrecognized option.\n");
+        return -1;
+    }
+  }
+  BlobDumpTool tool;
+  Status s =
+      tool.Run(file, show_key, show_blob, show_uncompressed_blob, show_summary);
+  if (!s.ok()) {
+    fprintf(stderr, "Failed: %s\n", s.ToString().c_str());
+    return -1;
+  }
+  return 0;
+}
+#else
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return -1;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/block_cache_analyzer/__init__.py b/src/rocksdb/tools/block_cache_analyzer/__init__.py
new file mode 100644
index 000000000..8dbe96a78
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/__init__.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py
new file mode 100644
index 000000000..67307df53
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.py
@@ -0,0 +1,2000 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import gc
+import heapq
+import random
+import sys
+import time
+from collections import OrderedDict
+from os import path
+
+import numpy as np
+
+
+kSampleSize = 64  # The sample size used when performing eviction.
+kMicrosInSecond = 1000000
+kSecondsInMinute = 60
+kSecondsInHour = 3600
+
+
+class TraceRecord:
+    """
+    A trace record represents a block access.
+    It holds the same struct as BlockCacheTraceRecord in
+    trace_replay/block_cache_tracer.h
+    """
+
+    def __init__(
+        self,
+        access_time,
+        block_id,
+        block_type,
+        block_size,
+        cf_id,
+        cf_name,
+        level,
+        fd,
+        caller,
+        no_insert,
+        get_id,
+        key_id,
+        kv_size,
+        is_hit,
+        referenced_key_exist_in_block,
+        num_keys_in_block,
+        table_id,
+        seq_number,
+        block_key_size,
+        key_size,
+        block_offset_in_file,
+        next_access_seq_no,
+    ):
+        self.access_time = access_time
+        self.block_id = block_id
+        self.block_type = block_type
+        self.block_size = block_size + block_key_size
+        self.cf_id = cf_id
+        self.cf_name = cf_name
+        self.level = level
+        self.fd = fd
+        self.caller = caller
+        if no_insert == 1:
+            self.no_insert = True
+        else:
+            self.no_insert = False
+        self.get_id = get_id
+        self.key_id = key_id
+        self.kv_size = kv_size
+        if is_hit == 1:
+            self.is_hit = True
+        else:
+            self.is_hit = False
+        if referenced_key_exist_in_block == 1:
+            self.referenced_key_exist_in_block = True
+        else:
+            self.referenced_key_exist_in_block = False
+        self.num_keys_in_block = num_keys_in_block
+        self.table_id = table_id
+        self.seq_number = seq_number
+        self.block_key_size = block_key_size
+        self.key_size = key_size
+        self.block_offset_in_file = block_offset_in_file
+        self.next_access_seq_no = next_access_seq_no
+
+
+class CacheEntry:
+    """A cache entry stored in the cache."""
+
+    def __init__(
+        self,
+        value_size,
+        cf_id,
+        level,
+        block_type,
+        table_id,
+        access_number,
+        time_s,
+        num_hits=0,
+    ):
+        self.value_size = value_size
+        self.last_access_number = access_number
+        self.num_hits = num_hits
+        self.cf_id = 0
+        self.level = level
+        self.block_type = block_type
+        self.last_access_time = time_s
+        self.insertion_time = time_s
+        self.table_id = table_id
+
+    def __repr__(self):
+        """Debug string."""
+        return "(s={},last={},hits={},cf={},l={},bt={})\n".format(
+            self.value_size,
+            self.last_access_number,
+            self.num_hits,
+            self.cf_id,
+            self.level,
+            self.block_type,
+        )
+
+    def cost_class(self, cost_class_label):
+        if cost_class_label == "table_bt":
+            return "{}-{}".format(self.table_id, self.block_type)
+        elif cost_class_label == "table":
+            return "{}".format(self.table_id)
+        elif cost_class_label == "bt":
+            return "{}".format(self.block_type)
+        elif cost_class_label == "cf":
+            return "{}".format(self.cf_id)
+        elif cost_class_label == "cf_bt":
+            return "{}-{}".format(self.cf_id, self.block_type)
+        elif cost_class_label == "table_level_bt":
+            return "{}-{}-{}".format(self.table_id, self.level, self.block_type)
+        assert False, "Unknown cost class label {}".format(cost_class_label)
+        return None
+
+
+class HashEntry:
+    """A hash entry stored in a hash table."""
+
+    def __init__(self, key, hash, value):
+        self.key = key
+        self.hash = hash
+        self.value = value
+
+    def __repr__(self):
+        return "k={},h={},v=[{}]".format(self.key, self.hash, self.value)
+
+
+class HashTable:
+    """
+    A custom implementation of hash table to support fast random sampling.
+    It is closed hashing and uses chaining to resolve hash conflicts.
+    It grows/shrinks the hash table upon insertion/deletion to support
+    fast lookups and random samplings.
+    """
+
+    def __init__(self):
+        self.initial_size = 32
+        self.table = [None] * self.initial_size
+        self.elements = 0
+
+    def random_sample(self, sample_size):
+        """Randomly sample 'sample_size' hash entries from the table."""
+        samples = []
+        index = random.randint(0, len(self.table) - 1)
+        pos = index
+        # Starting from index, adding hash entries to the sample list until
+        # sample_size is met or we ran out of entries.
+        while True:
+            if self.table[pos] is not None:
+                for i in range(len(self.table[pos])):
+                    if self.table[pos][i] is None:
+                        continue
+                    samples.append(self.table[pos][i])
+                    if len(samples) == sample_size:
+                        break
+            pos += 1
+            pos = pos % len(self.table)
+            if pos == index or len(samples) == sample_size:
+                break
+        assert len(samples) <= sample_size
+        return samples
+
+    def __repr__(self):
+        all_entries = []
+        for i in range(len(self.table)):
+            if self.table[i] is None:
+                continue
+            for j in range(len(self.table[i])):
+                if self.table[i][j] is not None:
+                    all_entries.append(self.table[i][j])
+        return "{}".format(all_entries)
+
+    def values(self):
+        all_values = []
+        for i in range(len(self.table)):
+            if self.table[i] is None:
+                continue
+            for j in range(len(self.table[i])):
+                if self.table[i][j] is not None:
+                    all_values.append(self.table[i][j].value)
+        return all_values
+
+    def __len__(self):
+        return self.elements
+
+    def insert(self, key, hash, value):
+        """
+        Insert a hash entry in the table. Replace the old entry if it already
+        exists.
+        """
+        self.grow()
+        inserted = False
+        index = hash % len(self.table)
+        if self.table[index] is None:
+            self.table[index] = []
+        # Search for the entry first.
+        for i in range(len(self.table[index])):
+            if self.table[index][i] is None:
+                continue
+            if self.table[index][i].hash == hash and self.table[index][i].key == key:
+                # The entry already exists in the table.
+                self.table[index][i] = HashEntry(key, hash, value)
+                return
+
+        # Find an empty slot.
+        for i in range(len(self.table[index])):
+            if self.table[index][i] is None:
+                self.table[index][i] = HashEntry(key, hash, value)
+                inserted = True
+                break
+        if not inserted:
+            self.table[index].append(HashEntry(key, hash, value))
+        self.elements += 1
+
+    def resize(self, new_size):
+        if new_size == len(self.table):
+            return
+        if new_size < self.initial_size:
+            return
+        if self.elements < 100:
+            return
+        new_table = [None] * new_size
+        # Copy 'self.table' to new_table.
+        for i in range(len(self.table)):
+            entries = self.table[i]
+            if entries is None:
+                continue
+            for j in range(len(entries)):
+                if entries[j] is None:
+                    continue
+                index = entries[j].hash % new_size
+                if new_table[index] is None:
+                    new_table[index] = []
+                new_table[index].append(entries[j])
+        self.table = new_table
+        del new_table
+        # Manually call python gc here to free the memory as 'self.table'
+        # might be very large.
+        gc.collect()
+
+    def grow(self):
+        if self.elements < 4 * len(self.table):
+            return
+        new_size = int(len(self.table) * 1.5)
+        self.resize(new_size)
+
+    def delete(self, key, hash):
+        index = hash % len(self.table)
+        deleted = False
+        deleted_entry = None
+        if self.table[index] is None:
+            return
+        for i in range(len(self.table[index])):
+            if (
+                self.table[index][i] is not None
+                and self.table[index][i].hash == hash
+                and self.table[index][i].key == key
+            ):
+                deleted_entry = self.table[index][i]
+                self.table[index][i] = None
+                self.elements -= 1
+                deleted = True
+                break
+        if deleted:
+            self.shrink()
+        return deleted_entry
+
+    def shrink(self):
+        if self.elements * 2 >= len(self.table):
+            return
+        new_size = int(len(self.table) * 0.7)
+        self.resize(new_size)
+
+    def lookup(self, key, hash):
+        index = hash % len(self.table)
+        if self.table[index] is None:
+            return None
+        for i in range(len(self.table[index])):
+            if (
+                self.table[index][i] is not None
+                and self.table[index][i].hash == hash
+                and self.table[index][i].key == key
+            ):
+                return self.table[index][i].value
+        return None
+
+
+class MissRatioStats:
+    def __init__(self, time_unit):
+        self.num_misses = 0
+        self.num_accesses = 0
+        self.time_unit = time_unit
+        self.time_misses = {}
+        self.time_miss_bytes = {}
+        self.time_accesses = {}
+
+    def update_metrics(self, access_time, is_hit, miss_bytes):
+        access_time /= kMicrosInSecond * self.time_unit
+        self.num_accesses += 1
+        if access_time not in self.time_accesses:
+            self.time_accesses[access_time] = 0
+        self.time_accesses[access_time] += 1
+        if not is_hit:
+            self.num_misses += 1
+            if access_time not in self.time_misses:
+                self.time_misses[access_time] = 0
+                self.time_miss_bytes[access_time] = 0
+            self.time_misses[access_time] += 1
+            self.time_miss_bytes[access_time] += miss_bytes
+
+    def reset_counter(self):
+        self.num_misses = 0
+        self.num_accesses = 0
+        self.time_miss_bytes.clear()
+        self.time_misses.clear()
+        self.time_accesses.clear()
+
+    def compute_miss_bytes(self):
+        miss_bytes = []
+        for at in self.time_miss_bytes:
+            miss_bytes.append(self.time_miss_bytes[at])
+        miss_bytes = sorted(miss_bytes)
+        avg_miss_bytes = 0
+        p95_miss_bytes = 0
+        for i in range(len(miss_bytes)):
+            avg_miss_bytes += float(miss_bytes[i]) / float(len(miss_bytes))
+
+        p95_index = min(int(0.95 * float(len(miss_bytes))), len(miss_bytes) - 1)
+        p95_miss_bytes = miss_bytes[p95_index]
+        return avg_miss_bytes, p95_miss_bytes
+
+    def miss_ratio(self):
+        return float(self.num_misses) * 100.0 / float(self.num_accesses)
+
+    def write_miss_timeline(
+        self, cache_type, cache_size, target_cf_name, result_dir, start, end
+    ):
+        start /= kMicrosInSecond * self.time_unit
+        end /= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-miss-timeline-{}-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-miss-timeline-{}-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
+        )
+        with open(file_path, "w+") as file:
+            row = "{}".format(cache_type)
+            for trace_time in range(start, end):
+                row += ",{}".format(self.time_misses.get(trace_time, 0))
+            file.write(row + "\n")
+
+    def write_miss_ratio_timeline(
+        self, cache_type, cache_size, target_cf_name, result_dir, start, end
+    ):
+        start /= kMicrosInSecond * self.time_unit
+        end /= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-miss-ratio-timeline-{}-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
+        )
+        with open(file_path, "w+") as file:
+            row = "{}".format(cache_type)
+            for trace_time in range(start, end):
+                naccesses = self.time_accesses.get(trace_time, 0)
+                miss_ratio = 0
+                if naccesses > 0:
+                    miss_ratio = float(
+                        self.time_misses.get(trace_time, 0) * 100.0
+                    ) / float(naccesses)
+                row += ",{0:.2f}".format(miss_ratio)
+            file.write(row + "\n")
+
+
+class PolicyStats:
+    def __init__(self, time_unit, policies):
+        self.time_selected_polices = {}
+        self.time_accesses = {}
+        self.policy_names = {}
+        self.time_unit = time_unit
+        for i in range(len(policies)):
+            self.policy_names[i] = policies[i].policy_name()
+
+    def update_metrics(self, access_time, selected_policy):
+        access_time /= kMicrosInSecond * self.time_unit
+        if access_time not in self.time_accesses:
+            self.time_accesses[access_time] = 0
+        self.time_accesses[access_time] += 1
+        if access_time not in self.time_selected_polices:
+            self.time_selected_polices[access_time] = {}
+        policy_name = self.policy_names[selected_policy]
+        if policy_name not in self.time_selected_polices[access_time]:
+            self.time_selected_polices[access_time][policy_name] = 0
+        self.time_selected_polices[access_time][policy_name] += 1
+
+    def write_policy_timeline(
+        self, cache_type, cache_size, target_cf_name, result_dir, start, end
+    ):
+        start /= kMicrosInSecond * self.time_unit
+        end /= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-policy-timeline-{}-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-policy-timeline-{}-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
+        )
+        with open(file_path, "w+") as file:
+            for policy in self.policy_names:
+                policy_name = self.policy_names[policy]
+                row = "{}-{}".format(cache_type, policy_name)
+                for trace_time in range(start, end):
+                    row += ",{}".format(
+                        self.time_selected_polices.get(trace_time, {}).get(
+                            policy_name, 0
+                        )
+                    )
+                file.write(row + "\n")
+
+    def write_policy_ratio_timeline(
+        self, cache_type, cache_size, target_cf_name, file_path, start, end
+    ):
+        start /= kMicrosInSecond * self.time_unit
+        end /= kMicrosInSecond * self.time_unit
+        header_file_path = "{}/header-ml-policy-ratio-timeline-{}-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
+        )
+        if not path.exists(header_file_path):
+            with open(header_file_path, "w+") as header_file:
+                header = "time"
+                for trace_time in range(start, end):
+                    header += ",{}".format(trace_time)
+                header_file.write(header + "\n")
+        file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}-{}".format(
+            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
+        )
+        with open(file_path, "w+") as file:
+            for policy in self.policy_names:
+                policy_name = self.policy_names[policy]
+                row = "{}-{}".format(cache_type, policy_name)
+                for trace_time in range(start, end):
+                    naccesses = self.time_accesses.get(trace_time, 0)
+                    ratio = 0
+                    if naccesses > 0:
+                        ratio = float(
+                            self.time_selected_polices.get(trace_time, {}).get(
+                                policy_name, 0
+                            )
+                            * 100.0
+                        ) / float(naccesses)
+                    row += ",{0:.2f}".format(ratio)
+                file.write(row + "\n")
+
+
+class Policy(object):
+    """
+    A policy maintains a set of evicted keys. It returns a reward of one to
+    itself if it has not evicted a missing key. Otherwise, it gives itself 0
+    reward.
+    """
+
+    def __init__(self):
+        self.evicted_keys = {}
+
+    def evict(self, key, max_size):
+        self.evicted_keys[key] = 0
+
+    def delete(self, key):
+        self.evicted_keys.pop(key, None)
+
+    def prioritize_samples(self, samples, auxilliary_info):
+        raise NotImplementedError
+
+    def policy_name(self):
+        raise NotImplementedError
+
+    def generate_reward(self, key):
+        if key in self.evicted_keys:
+            return 0
+        return 1
+
+
+class LRUPolicy(Policy):
+    def prioritize_samples(self, samples, auxilliary_info):
+        return sorted(
+            samples,
+            cmp=lambda e1, e2: e1.value.last_access_number
+            - e2.value.last_access_number,
+        )
+
+    def policy_name(self):
+        return "lru"
+
+
+class MRUPolicy(Policy):
+    def prioritize_samples(self, samples, auxilliary_info):
+        return sorted(
+            samples,
+            cmp=lambda e1, e2: e2.value.last_access_number
+            - e1.value.last_access_number,
+        )
+
+    def policy_name(self):
+        return "mru"
+
+
+class LFUPolicy(Policy):
+    def prioritize_samples(self, samples, auxilliary_info):
+        return sorted(samples, cmp=lambda e1, e2: e1.value.num_hits - e2.value.num_hits)
+
+    def policy_name(self):
+        return "lfu"
+
+
+class HyperbolicPolicy(Policy):
+    """
+    An implementation of Hyperbolic caching.
+
+    Aaron Blankstein, Siddhartha Sen, and Michael J. Freedman. 2017.
+    Hyperbolic caching: flexible caching for web applications. In Proceedings
+    of the 2017 USENIX Conference on Usenix Annual Technical Conference
+    (USENIX ATC '17). USENIX Association, Berkeley, CA, USA, 499-511.
+    """
+
+    def compare(self, e1, e2, now):
+        e1_duration = max(0, (now - e1.value.insertion_time) / kMicrosInSecond) * float(
+            e1.value.value_size
+        )
+        e2_duration = max(0, (now - e2.value.insertion_time) / kMicrosInSecond) * float(
+            e2.value.value_size
+        )
+        if e1_duration == e2_duration:
+            return e1.value.num_hits - e2.value.num_hits
+        if e1_duration == 0:
+            return 1
+        if e2_duration == 0:
+            return 1
+        diff = (float(e1.value.num_hits) / (float(e1_duration))) - (
+            float(e2.value.num_hits) / float(e2_duration)
+        )
+        if diff == 0:
+            return 0
+        elif diff > 0:
+            return 1
+        else:
+            return -1
+
+    def prioritize_samples(self, samples, auxilliary_info):
+        assert len(auxilliary_info) == 3
+        now = auxilliary_info[0]
+        return sorted(samples, cmp=lambda e1, e2: self.compare(e1, e2, now))
+
+    def policy_name(self):
+        return "hb"
+
+
+class CostClassPolicy(Policy):
+    """
+    We calculate the hit density of a cost class as
+    number of hits / total size in cache * average duration in the cache.
+
+    An entry has a higher priority if its class's hit density is higher.
+    """
+
+    def compare(self, e1, e2, now, cost_classes, cost_class_label):
+        e1_class = e1.value.cost_class(cost_class_label)
+        e2_class = e2.value.cost_class(cost_class_label)
+
+        assert e1_class in cost_classes
+        assert e2_class in cost_classes
+
+        e1_entry = cost_classes[e1_class]
+        e2_entry = cost_classes[e2_class]
+        e1_density = e1_entry.density(now)
+        e2_density = e2_entry.density(now)
+        e1_hits = cost_classes[e1_class].hits
+        e2_hits = cost_classes[e2_class].hits
+
+        if e1_density == e2_density:
+            return e1_hits - e2_hits
+
+        if e1_entry.num_entries_in_cache == 0:
+            return -1
+        if e2_entry.num_entries_in_cache == 0:
+            return 1
+
+        if e1_density == 0:
+            return 1
+        if e2_density == 0:
+            return -1
+        diff = (float(e1_hits) / float(e1_density)) - (
+            float(e2_hits) / float(e2_density)
+        )
+        if diff == 0:
+            return 0
+        elif diff > 0:
+            return 1
+        else:
+            return -1
+
+    def prioritize_samples(self, samples, auxilliary_info):
+        assert len(auxilliary_info) == 3
+        now = auxilliary_info[0]
+        cost_classes = auxilliary_info[1]
+        cost_class_label = auxilliary_info[2]
+        return sorted(
+            samples,
+            cmp=lambda e1, e2: self.compare(
+                e1, e2, now, cost_classes, cost_class_label
+            ),
+        )
+
+    def policy_name(self):
+        return "cc"
+
+
+class Cache(object):
+    """
+    This is the base class for the implementations of alternative cache
+    replacement policies.
+    """
+
+    def __init__(self, cache_size, enable_cache_row_key):
+        self.cache_size = cache_size
+        self.used_size = 0
+        self.per_second_miss_ratio_stats = MissRatioStats(1)
+        self.miss_ratio_stats = MissRatioStats(kSecondsInMinute)
+        self.per_hour_miss_ratio_stats = MissRatioStats(kSecondsInHour)
+        # 0: disabled. 1: enabled. Insert both row and the refereneced data block.
+        # 2: enabled. Insert only the row but NOT the referenced data block.
+        self.enable_cache_row_key = enable_cache_row_key
+        self.get_id_row_key_map = {}
+        self.max_seen_get_id = 0
+        self.retain_get_id_range = 100000
+
+    def block_key(self, trace_record):
+        return "b{}".format(trace_record.block_id)
+
+    def row_key(self, trace_record):
+        return "g{}-{}".format(trace_record.fd, trace_record.key_id)
+
+    def _lookup(self, trace_record, key, hash):
+        """
+        Look up the key in the cache.
+        Returns true upon a cache hit, false otherwise.
+        """
+        raise NotImplementedError
+
+    def _evict(self, trace_record, key, hash, value_size):
+        """
+        Evict entries in the cache until there is enough room to insert the new
+        entry with 'value_size'.
+        """
+        raise NotImplementedError
+
+    def _insert(self, trace_record, key, hash, value_size):
+        """
+        Insert the new entry into the cache.
+        """
+        raise NotImplementedError
+
+    def _should_admit(self, trace_record, key, hash, value_size):
+        """
+        A custom admission policy to decide whether we should admit the new
+        entry upon a cache miss.
+        Returns true if the new entry should be admitted, false otherwise.
+        """
+        raise NotImplementedError
+
+    def cache_name(self):
+        """
+        The name of the replacement policy.
+        """
+        raise NotImplementedError
+
+    def is_ml_cache(self):
+        return False
+
+    def _update_stats(self, access_time, is_hit, miss_bytes):
+        self.per_second_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes)
+        self.miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes)
+        self.per_hour_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes)
+
+    def access(self, trace_record):
+        """
+        Access a trace record. The simulator calls this function to access a
+        trace record.
+        """
+        assert self.used_size <= self.cache_size
+        if (
+            self.enable_cache_row_key > 0
+            and trace_record.caller == 1
+            and trace_record.key_id != 0
+            and trace_record.get_id != 0
+        ):
+            # This is a get request.
+            self._access_row(trace_record)
+            return
+        is_hit = self._access_kv(
+            trace_record,
+            self.block_key(trace_record),
+            trace_record.block_id,
+            trace_record.block_size,
+            trace_record.no_insert,
+        )
+        self._update_stats(
+            trace_record.access_time, is_hit=is_hit, miss_bytes=trace_record.block_size
+        )
+
+    def _access_row(self, trace_record):
+        row_key = self.row_key(trace_record)
+        self.max_seen_get_id = max(self.max_seen_get_id, trace_record.get_id)
+        self.get_id_row_key_map.pop(
+            self.max_seen_get_id - self.retain_get_id_range, None
+        )
+        if trace_record.get_id not in self.get_id_row_key_map:
+            self.get_id_row_key_map[trace_record.get_id] = {}
+            self.get_id_row_key_map[trace_record.get_id]["h"] = False
+        if self.get_id_row_key_map[trace_record.get_id]["h"]:
+            # We treat future accesses as hits since this get request
+            # completes.
+            # print("row hit 1")
+            self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0)
+            return
+        if row_key not in self.get_id_row_key_map[trace_record.get_id]:
+            # First time seen this key.
+            is_hit = self._access_kv(
+                trace_record,
+                key=row_key,
+                hash=trace_record.key_id,
+                value_size=trace_record.kv_size,
+                no_insert=False,
+            )
+            inserted = False
+            if trace_record.kv_size > 0:
+                inserted = True
+            self.get_id_row_key_map[trace_record.get_id][row_key] = inserted
+            self.get_id_row_key_map[trace_record.get_id]["h"] = is_hit
+        if self.get_id_row_key_map[trace_record.get_id]["h"]:
+            # We treat future accesses as hits since this get request
+            # completes.
+            # print("row hit 2")
+            self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0)
+            return
+        # Access its blocks.
+        no_insert = trace_record.no_insert
+        if (
+            self.enable_cache_row_key == 2
+            and trace_record.kv_size > 0
+            and trace_record.block_type == 9
+        ):
+            no_insert = True
+        is_hit = self._access_kv(
+            trace_record,
+            key=self.block_key(trace_record),
+            hash=trace_record.block_id,
+            value_size=trace_record.block_size,
+            no_insert=no_insert,
+        )
+        self._update_stats(
+            trace_record.access_time, is_hit, miss_bytes=trace_record.block_size
+        )
+        if (
+            trace_record.kv_size > 0
+            and not self.get_id_row_key_map[trace_record.get_id][row_key]
+        ):
+            # Insert the row key-value pair.
+            self._access_kv(
+                trace_record,
+                key=row_key,
+                hash=trace_record.key_id,
+                value_size=trace_record.kv_size,
+                no_insert=False,
+            )
+            # Mark as inserted.
+            self.get_id_row_key_map[trace_record.get_id][row_key] = True
+
+    def _access_kv(self, trace_record, key, hash, value_size, no_insert):
+        # Sanity checks.
+        assert self.used_size <= self.cache_size
+        if self._lookup(trace_record, key, hash):
+            # A cache hit.
+            return True
+        if no_insert or value_size <= 0:
+            return False
+        # A cache miss.
+        if value_size > self.cache_size:
+            # The block is too large to fit into the cache.
+            return False
+        self._evict(trace_record, key, hash, value_size)
+        if self._should_admit(trace_record, key, hash, value_size):
+            self._insert(trace_record, key, hash, value_size)
+            self.used_size += value_size
+        return False
+
+
+class CostClassEntry:
+    """
+    A cost class maintains aggregated statistics of cached entries in a class.
+    For example, we may define block type as a class. Then, cached blocks of the
+    same type will share one cost class entry.
+    """
+
+    def __init__(self):
+        self.hits = 0
+        self.num_entries_in_cache = 0
+        self.size_in_cache = 0
+        self.sum_insertion_times = 0
+        self.sum_last_access_time = 0
+
+    def insert(self, trace_record, key, value_size):
+        self.size_in_cache += value_size
+        self.num_entries_in_cache += 1
+        self.sum_insertion_times += trace_record.access_time / kMicrosInSecond
+        self.sum_last_access_time += trace_record.access_time / kMicrosInSecond
+
+    def remove(self, insertion_time, last_access_time, key, value_size, num_hits):
+        self.hits -= num_hits
+        self.num_entries_in_cache -= 1
+        self.sum_insertion_times -= insertion_time / kMicrosInSecond
+        self.size_in_cache -= value_size
+        self.sum_last_access_time -= last_access_time / kMicrosInSecond
+
+    def update_on_hit(self, trace_record, last_access_time):
+        self.hits += 1
+        self.sum_last_access_time -= last_access_time / kMicrosInSecond
+        self.sum_last_access_time += trace_record.access_time / kMicrosInSecond
+
+    def avg_lifetime_in_cache(self, now):
+        avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache
+        return now / kMicrosInSecond - avg_insertion_time
+
+    def avg_last_access_time(self):
+        if self.num_entries_in_cache == 0:
+            return 0
+        return float(self.sum_last_access_time) / float(self.num_entries_in_cache)
+
+    def avg_size(self):
+        if self.num_entries_in_cache == 0:
+            return 0
+        return float(self.sum_last_access_time) / float(self.num_entries_in_cache)
+
+    def density(self, now):
+        avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache
+        in_cache_duration = now / kMicrosInSecond - avg_insertion_time
+        return self.size_in_cache * in_cache_duration
+
+
+class MLCache(Cache):
+    """
+    MLCache is the base class for implementations of alternative replacement
+    policies using reinforcement learning.
+    """
+
+    def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label):
+        super(MLCache, self).__init__(cache_size, enable_cache_row_key)
+        self.table = HashTable()
+        self.policy_stats = PolicyStats(kSecondsInMinute, policies)
+        self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies)
+        self.policies = policies
+        self.cost_classes = {}
+        self.cost_class_label = cost_class_label
+
+    def is_ml_cache(self):
+        return True
+
+    def _lookup(self, trace_record, key, hash):
+        value = self.table.lookup(key, hash)
+        if value is not None:
+            # Update the entry's cost class statistics.
+            if self.cost_class_label is not None:
+                cost_class = value.cost_class(self.cost_class_label)
+                assert cost_class in self.cost_classes
+                self.cost_classes[cost_class].update_on_hit(
+                    trace_record, value.last_access_time
+                )
+            # Update the entry's last access time.
+            self.table.insert(
+                key,
+                hash,
+                CacheEntry(
+                    value_size=value.value_size,
+                    cf_id=value.cf_id,
+                    level=value.level,
+                    block_type=value.block_type,
+                    table_id=value.table_id,
+                    access_number=self.miss_ratio_stats.num_accesses,
+                    time_s=trace_record.access_time,
+                    num_hits=value.num_hits + 1,
+                ),
+            )
+            return True
+        return False
+
+    def _evict(self, trace_record, key, hash, value_size):
+        # Select a policy, random sample kSampleSize keys from the cache, then
+        # evict keys in the sample set until we have enough room for the new
+        # entry.
+        policy_index = self._select_policy(trace_record, key)
+        assert policy_index < len(self.policies) and policy_index >= 0
+        self.policies[policy_index].delete(key)
+        self.policy_stats.update_metrics(trace_record.access_time, policy_index)
+        self.per_hour_policy_stats.update_metrics(
+            trace_record.access_time, policy_index
+        )
+        while self.used_size + value_size > self.cache_size:
+            # Randomly sample n entries.
+            samples = self.table.random_sample(kSampleSize)
+            samples = self.policies[policy_index].prioritize_samples(
+                samples,
+                [trace_record.access_time, self.cost_classes, self.cost_class_label],
+            )
+            for hash_entry in samples:
+                assert self.table.delete(hash_entry.key, hash_entry.hash) is not None
+                self.used_size -= hash_entry.value.value_size
+                self.policies[policy_index].evict(
+                    key=hash_entry.key, max_size=self.table.elements
+                )
+                # Update the entry's cost class statistics.
+                if self.cost_class_label is not None:
+                    cost_class = hash_entry.value.cost_class(self.cost_class_label)
+                    assert cost_class in self.cost_classes
+                    self.cost_classes[cost_class].remove(
+                        hash_entry.value.insertion_time,
+                        hash_entry.value.last_access_time,
+                        key,
+                        hash_entry.value.value_size,
+                        hash_entry.value.num_hits,
+                    )
+                if self.used_size + value_size <= self.cache_size:
+                    break
+
+    def _insert(self, trace_record, key, hash, value_size):
+        assert self.used_size + value_size <= self.cache_size
+        entry = CacheEntry(
+            value_size,
+            trace_record.cf_id,
+            trace_record.level,
+            trace_record.block_type,
+            trace_record.table_id,
+            self.miss_ratio_stats.num_accesses,
+            trace_record.access_time,
+        )
+        # Update the entry's cost class statistics.
+        if self.cost_class_label is not None:
+            cost_class = entry.cost_class(self.cost_class_label)
+            if cost_class not in self.cost_classes:
+                self.cost_classes[cost_class] = CostClassEntry()
+            self.cost_classes[cost_class].insert(trace_record, key, value_size)
+        self.table.insert(key, hash, entry)
+
+    def _should_admit(self, trace_record, key, hash, value_size):
+        return True
+
+    def _select_policy(self, trace_record, key):
+        raise NotImplementedError
+
+
+class ThompsonSamplingCache(MLCache):
+    """
+    An implementation of Thompson Sampling for the Bernoulli Bandit.
+
+    Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband,
+    and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found.
+    Trends Mach. Learn. 11, 1 (July 2018), 1-96.
+    DOI: https://doi.org/10.1561/2200000070
+    """
+
+    def __init__(
+        self,
+        cache_size,
+        enable_cache_row_key,
+        policies,
+        cost_class_label,
+        init_a=1,
+        init_b=1,
+    ):
+        super(ThompsonSamplingCache, self).__init__(
+            cache_size, enable_cache_row_key, policies, cost_class_label
+        )
+        self._as = {}
+        self._bs = {}
+        for _i in range(len(policies)):
+            self._as = [init_a] * len(self.policies)
+            self._bs = [init_b] * len(self.policies)
+
+    def _select_policy(self, trace_record, key):
+        if len(self.policies) == 1:
+            return 0
+        samples = [
+            np.random.beta(self._as[x], self._bs[x]) for x in range(len(self.policies))
+        ]
+        selected_policy = max(range(len(self.policies)), key=lambda x: samples[x])
+        reward = self.policies[selected_policy].generate_reward(key)
+        assert reward <= 1 and reward >= 0
+        self._as[selected_policy] += reward
+        self._bs[selected_policy] += 1 - reward
+        return selected_policy
+
+    def cache_name(self):
+        if self.enable_cache_row_key:
+            return "Hybrid ThompsonSampling with cost class {} (ts_hybrid)".format(
+                self.cost_class_label
+            )
+        return "ThompsonSampling with cost class {} (ts)".format(self.cost_class_label)
+
+
+class LinUCBCache(MLCache):
+    """
+    An implementation of LinUCB with disjoint linear models.
+
+    Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010.
+    A contextual-bandit approach to personalized news article recommendation.
+    In Proceedings of the 19th international conference on World wide web
+    (WWW '10). ACM, New York, NY, USA, 661-670.
+    DOI=http://dx.doi.org/10.1145/1772690.1772758
+    """
+
+    def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label):
+        super(LinUCBCache, self).__init__(
+            cache_size, enable_cache_row_key, policies, cost_class_label
+        )
+        self.nfeatures = 4  # Block type, level, cf.
+        self.th = np.zeros((len(self.policies), self.nfeatures))
+        self.eps = 0.2
+        self.b = np.zeros_like(self.th)
+        self.A = np.zeros((len(self.policies), self.nfeatures, self.nfeatures))
+        self.A_inv = np.zeros((len(self.policies), self.nfeatures, self.nfeatures))
+        for i in range(len(self.policies)):
+            self.A[i] = np.identity(self.nfeatures)
+        self.th_hat = np.zeros_like(self.th)
+        self.p = np.zeros(len(self.policies))
+        self.alph = 0.2
+
+    def _select_policy(self, trace_record, key):
+        if len(self.policies) == 1:
+            return 0
+        x_i = np.zeros(self.nfeatures)  # The current context vector
+        x_i[0] = trace_record.block_type
+        x_i[1] = trace_record.level
+        x_i[2] = trace_record.cf_id
+        p = np.zeros(len(self.policies))
+        for a in range(len(self.policies)):
+            self.th_hat[a] = self.A_inv[a].dot(self.b[a])
+            ta = x_i.dot(self.A_inv[a]).dot(x_i)
+            a_upper_ci = self.alph * np.sqrt(ta)
+            a_mean = self.th_hat[a].dot(x_i)
+            p[a] = a_mean + a_upper_ci
+        p = p + (np.random.random(len(p)) * 0.000001)
+        selected_policy = p.argmax()
+        reward = self.policies[selected_policy].generate_reward(key)
+        assert reward <= 1 and reward >= 0
+        self.A[selected_policy] += np.outer(x_i, x_i)
+        self.b[selected_policy] += reward * x_i
+        self.A_inv[selected_policy] = np.linalg.inv(self.A[selected_policy])
+        del x_i
+        return selected_policy
+
+    def cache_name(self):
+        if self.enable_cache_row_key:
+            return "Hybrid LinUCB with cost class {} (linucb_hybrid)".format(
+                self.cost_class_label
+            )
+        return "LinUCB with cost class {} (linucb)".format(self.cost_class_label)
+
+
+class OPTCacheEntry:
+    """
+    A cache entry for the OPT algorithm. The entries are sorted based on its
+    next access sequence number in reverse order, i.e., the entry which next
+    access is the furthest in the future is ordered before other entries.
+    """
+
+    def __init__(self, key, next_access_seq_no, value_size):
+        self.key = key
+        self.next_access_seq_no = next_access_seq_no
+        self.value_size = value_size
+        self.is_removed = False
+
+    def __cmp__(self, other):
+        if other.next_access_seq_no != self.next_access_seq_no:
+            return other.next_access_seq_no - self.next_access_seq_no
+        return self.value_size - other.value_size
+
+    def __repr__(self):
+        return "({} {} {} {})".format(
+            self.key, self.next_access_seq_no, self.value_size, self.is_removed
+        )
+
+
+class PQTable:
+    """
+    A hash table with a priority queue.
+    """
+
+    def __init__(self):
+        # A list of entries arranged in a heap sorted based on the entry custom
+        # implementation of __cmp__
+        self.pq = []
+        self.table = {}
+
+    def pqinsert(self, entry):
+        "Add a new key or update the priority of an existing key"
+        # Remove the entry from the table first.
+        removed_entry = self.table.pop(entry.key, None)
+        if removed_entry:
+            # Mark as removed since there is no 'remove' API in heappq.
+            # Instead, an entry in pq is removed lazily when calling pop.
+            removed_entry.is_removed = True
+        self.table[entry.key] = entry
+        heapq.heappush(self.pq, entry)
+        return removed_entry
+
+    def pqpop(self):
+        while self.pq:
+            entry = heapq.heappop(self.pq)
+            if not entry.is_removed:
+                del self.table[entry.key]
+                return entry
+        return None
+
+    def pqpeek(self):
+        while self.pq:
+            entry = self.pq[0]
+            if not entry.is_removed:
+                return entry
+            heapq.heappop(self.pq)
+        return
+
+    def __contains__(self, k):
+        return k in self.table
+
+    def __getitem__(self, k):
+        return self.table[k]
+
+    def __len__(self):
+        return len(self.table)
+
+    def values(self):
+        return self.table.values()
+
+
+class OPTCache(Cache):
+    """
+    An implementation of the Belady MIN algorithm. OPTCache evicts an entry
+    in the cache whose next access occurs furthest in the future.
+
+    Note that Belady MIN algorithm is optimal assuming all blocks having the
+    same size and a missing entry will be inserted in the cache.
+    These are NOT true for the block cache trace since blocks have different
+    sizes and we may not insert a block into the cache upon a cache miss.
+    However, it is still useful to serve as a "theoretical upper bound" on the
+    lowest miss ratio we can achieve given a cache size.
+
+    L. A. Belady. 1966. A Study of Replacement Algorithms for a
+    Virtual-storage Computer. IBM Syst. J. 5, 2 (June 1966), 78-101.
+    DOI=http://dx.doi.org/10.1147/sj.52.0078
+    """
+
+    def __init__(self, cache_size):
+        super(OPTCache, self).__init__(cache_size, enable_cache_row_key=0)
+        self.table = PQTable()
+
+    def _lookup(self, trace_record, key, hash):
+        if key not in self.table:
+            return False
+        # A cache hit. Update its next access time.
+        assert (
+            self.table.pqinsert(
+                OPTCacheEntry(
+                    key, trace_record.next_access_seq_no, self.table[key].value_size
+                )
+            )
+            is not None
+        )
+        return True
+
+    def _evict(self, trace_record, key, hash, value_size):
+        while self.used_size + value_size > self.cache_size:
+            evict_entry = self.table.pqpop()
+            assert evict_entry is not None
+            self.used_size -= evict_entry.value_size
+
+    def _insert(self, trace_record, key, hash, value_size):
+        assert (
+            self.table.pqinsert(
+                OPTCacheEntry(key, trace_record.next_access_seq_no, value_size)
+            )
+            is None
+        )
+
+    def _should_admit(self, trace_record, key, hash, value_size):
+        return True
+
+    def cache_name(self):
+        return "Belady MIN (opt)"
+
+
+class GDSizeEntry:
+    """
+    A cache entry for the greedy dual size replacement policy.
+    """
+
+    def __init__(self, key, value_size, priority):
+        self.key = key
+        self.value_size = value_size
+        self.priority = priority
+        self.is_removed = False
+
+    def __cmp__(self, other):
+        if other.priority != self.priority:
+            return self.priority - other.priority
+        return self.value_size - other.value_size
+
+    def __repr__(self):
+        return "({} {} {} {})".format(
+            self.key, self.next_access_seq_no, self.value_size, self.is_removed
+        )
+
+
+class GDSizeCache(Cache):
+    """
+    An implementation of the greedy dual size algorithm.
+    We define cost as an entry's size.
+
+    See https://www.usenix.org/legacy/publications/library/proceedings/usits97/full_papers/cao/cao_html/node8.html
+    and N. Young. The k-server dual and loose competitiveness for paging.
+    Algorithmica,June 1994, vol. 11,(no.6):525-41.
+    Rewritten version of ''On-line caching as cache size varies'',
+    in The 2nd Annual ACM-SIAM Symposium on Discrete Algorithms, 241-250, 1991.
+    """
+
+    def __init__(self, cache_size, enable_cache_row_key):
+        super(GDSizeCache, self).__init__(cache_size, enable_cache_row_key)
+        self.table = PQTable()
+        self.L = 0.0
+
+    def cache_name(self):
+        if self.enable_cache_row_key:
+            return "Hybrid GreedyDualSize (gdsize_hybrid)"
+        return "GreedyDualSize (gdsize)"
+
+    def _lookup(self, trace_record, key, hash):
+        if key not in self.table:
+            return False
+        # A cache hit. Update its priority.
+        entry = self.table[key]
+        assert (
+            self.table.pqinsert(
+                GDSizeEntry(key, entry.value_size, self.L + entry.value_size)
+            )
+            is not None
+        )
+        return True
+
+    def _evict(self, trace_record, key, hash, value_size):
+        while self.used_size + value_size > self.cache_size:
+            evict_entry = self.table.pqpop()
+            assert evict_entry is not None
+            self.L = evict_entry.priority
+            self.used_size -= evict_entry.value_size
+
+    def _insert(self, trace_record, key, hash, value_size):
+        assert (
+            self.table.pqinsert(GDSizeEntry(key, value_size, self.L + value_size))
+            is None
+        )
+
+    def _should_admit(self, trace_record, key, hash, value_size):
+        return True
+
+
+class Deque(object):
+    """A Deque class facilitates the implementation of LRU and ARC."""
+
+    def __init__(self):
+        self.od = OrderedDict()
+
+    def appendleft(self, k):
+        if k in self.od:
+            del self.od[k]
+        self.od[k] = None
+
+    def pop(self):
+        item = self.od.popitem(last=False) if self.od else None
+        if item is not None:
+            return item[0]
+        return None
+
+    def remove(self, k):
+        del self.od[k]
+
+    def __len__(self):
+        return len(self.od)
+
+    def __contains__(self, k):
+        return k in self.od
+
+    def __iter__(self):
+        return reversed(self.od)
+
+    def __repr__(self):
+        return "Deque(%r)" % (list(self),)
+
+
+class ARCCache(Cache):
+    """
+    An implementation of ARC. ARC assumes that all blocks are having the
+    same size. The size of index and filter blocks are variable. To accommodate
+    this, we modified ARC as follows:
+    1) We use 16 KB as the average block size and calculate the number of blocks
+       (c) in the cache.
+    2) When we insert an entry, the cache evicts entries in both t1 and t2
+       queues until it has enough space for the new entry. This also requires
+       modification of the algorithm to maintain a maximum of 2*c blocks.
+
+    Nimrod Megiddo and Dharmendra S. Modha. 2003. ARC: A Self-Tuning, Low
+    Overhead Replacement Cache. In Proceedings of the 2nd USENIX Conference on
+    File and Storage Technologies (FAST '03). USENIX Association, Berkeley, CA,
+    USA, 115-130.
+    """
+
+    def __init__(self, cache_size, enable_cache_row_key):
+        super(ARCCache, self).__init__(cache_size, enable_cache_row_key)
+        self.table = {}
+        self.c = cache_size / 16 * 1024  # Number of elements in the cache.
+        self.p = 0  # Target size for the list T1
+        # L1: only once recently
+        self.t1 = Deque()  # T1: recent cache entries
+        self.b1 = Deque()  # B1: ghost entries recently evicted from the T1 cache
+        # L2: at least twice recently
+        self.t2 = Deque()  # T2: frequent entries
+        self.b2 = Deque()  # B2: ghost entries recently evicted from the T2 cache
+
+    def _replace(self, key, value_size):
+        while self.used_size + value_size > self.cache_size:
+            if self.t1 and ((key in self.b2) or (len(self.t1) > self.p)):
+                old = self.t1.pop()
+                self.b1.appendleft(old)
+            else:
+                if self.t2:
+                    old = self.t2.pop()
+                    self.b2.appendleft(old)
+                else:
+                    old = self.t1.pop()
+                    self.b1.appendleft(old)
+            self.used_size -= self.table[old].value_size
+            del self.table[old]
+
+    def _lookup(self, trace_record, key, hash):
+        # Case I: key is in T1 or T2.
+        #   Move key to MRU position in T2.
+        if key in self.t1:
+            self.t1.remove(key)
+            self.t2.appendleft(key)
+            return True
+
+        if key in self.t2:
+            self.t2.remove(key)
+            self.t2.appendleft(key)
+            return True
+        return False
+
+    def _evict(self, trace_record, key, hash, value_size):
+        # Case II: key is in B1
+        #   Move x from B1 to the MRU position in T2 (also fetch x to the cache).
+        if key in self.b1:
+            self.p = min(self.c, self.p + max(len(self.b2) / len(self.b1), 1))
+            self._replace(key, value_size)
+            self.b1.remove(key)
+            self.t2.appendleft(key)
+            return
+
+        # Case III: key is in B2
+        #   Move x from B2 to the MRU position in T2 (also fetch x to the cache).
+        if key in self.b2:
+            self.p = max(0, self.p - max(len(self.b1) / len(self.b2), 1))
+            self._replace(key, value_size)
+            self.b2.remove(key)
+            self.t2.appendleft(key)
+            return
+
+        # Case IV: key is not in (T1 u B1 u T2 u B2)
+        self._replace(key, value_size)
+        while len(self.t1) + len(self.b1) >= self.c and self.b1:
+            self.b1.pop()
+
+        total = len(self.t1) + len(self.b1) + len(self.t2) + len(self.b2)
+        while total >= (2 * self.c) and self.b2:
+            self.b2.pop()
+            total -= 1
+        # Finally, move it to MRU position in T1.
+        self.t1.appendleft(key)
+        return
+
+    def _insert(self, trace_record, key, hash, value_size):
+        self.table[key] = CacheEntry(
+            value_size,
+            trace_record.cf_id,
+            trace_record.level,
+            trace_record.block_type,
+            trace_record.table_id,
+            0,
+            trace_record.access_time,
+        )
+
+    def _should_admit(self, trace_record, key, hash, value_size):
+        return True
+
+    def cache_name(self):
+        if self.enable_cache_row_key:
+            return "Hybrid Adaptive Replacement Cache (arc_hybrid)"
+        return "Adaptive Replacement Cache (arc)"
+
+
+class LRUCache(Cache):
+    """
+    A strict LRU queue.
+    """
+
+    def __init__(self, cache_size, enable_cache_row_key):
+        super(LRUCache, self).__init__(cache_size, enable_cache_row_key)
+        self.table = {}
+        self.lru = Deque()
+
+    def cache_name(self):
+        if self.enable_cache_row_key:
+            return "Hybrid LRU (lru_hybrid)"
+        return "LRU (lru)"
+
+    def _lookup(self, trace_record, key, hash):
+        if key not in self.table:
+            return False
+        # A cache hit. Update LRU queue.
+        self.lru.remove(key)
+        self.lru.appendleft(key)
+        return True
+
+    def _evict(self, trace_record, key, hash, value_size):
+        while self.used_size + value_size > self.cache_size:
+            evict_key = self.lru.pop()
+            self.used_size -= self.table[evict_key].value_size
+            del self.table[evict_key]
+
+    def _insert(self, trace_record, key, hash, value_size):
+        self.table[key] = CacheEntry(
+            value_size,
+            trace_record.cf_id,
+            trace_record.level,
+            trace_record.block_type,
+            trace_record.table_id,
+            0,
+            trace_record.access_time,
+        )
+        self.lru.appendleft(key)
+
+    def _should_admit(self, trace_record, key, hash, value_size):
+        return True
+
+
+class TraceCache(Cache):
+    """
+    A trace cache. Lookup returns true if the trace observes a cache hit.
+    It is used to maintain cache hits observed in the trace.
+    """
+
+    def __init__(self, cache_size):
+        super(TraceCache, self).__init__(cache_size, enable_cache_row_key=0)
+
+    def _lookup(self, trace_record, key, hash):
+        return trace_record.is_hit
+
+    def _evict(self, trace_record, key, hash, value_size):
+        pass
+
+    def _insert(self, trace_record, key, hash, value_size):
+        pass
+
+    def _should_admit(self, trace_record, key, hash, value_size):
+        return False
+
+    def cache_name(self):
+        return "Trace"
+
+
+def parse_cache_size(cs):
+    cs = cs.replace("\n", "")
+    if cs[-1] == "M":
+        return int(cs[: len(cs) - 1]) * 1024 * 1024
+    if cs[-1] == "G":
+        return int(cs[: len(cs) - 1]) * 1024 * 1024 * 1024
+    if cs[-1] == "T":
+        return int(cs[: len(cs) - 1]) * 1024 * 1024 * 1024 * 1024
+    return int(cs)
+
+
+def create_cache(cache_type, cache_size, downsample_size):
+    cache_size = cache_size / downsample_size
+    enable_cache_row_key = 0
+    if "hybridn" in cache_type:
+        enable_cache_row_key = 2
+        cache_type = cache_type[:-8]
+    if "hybrid" in cache_type:
+        enable_cache_row_key = 1
+        cache_type = cache_type[:-7]
+    if cache_type == "ts":
+        return ThompsonSamplingCache(
+            cache_size,
+            enable_cache_row_key,
+            [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()],
+            cost_class_label=None,
+        )
+    elif cache_type == "linucb":
+        return LinUCBCache(
+            cache_size,
+            enable_cache_row_key,
+            [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()],
+            cost_class_label=None,
+        )
+    elif cache_type == "pylru":
+        return ThompsonSamplingCache(
+            cache_size, enable_cache_row_key, [LRUPolicy()], cost_class_label=None
+        )
+    elif cache_type == "pymru":
+        return ThompsonSamplingCache(
+            cache_size, enable_cache_row_key, [MRUPolicy()], cost_class_label=None
+        )
+    elif cache_type == "pylfu":
+        return ThompsonSamplingCache(
+            cache_size, enable_cache_row_key, [LFUPolicy()], cost_class_label=None
+        )
+    elif cache_type == "pyhb":
+        return ThompsonSamplingCache(
+            cache_size,
+            enable_cache_row_key,
+            [HyperbolicPolicy()],
+            cost_class_label=None,
+        )
+    elif cache_type == "pycctbbt":
+        return ThompsonSamplingCache(
+            cache_size,
+            enable_cache_row_key,
+            [CostClassPolicy()],
+            cost_class_label="table_bt",
+        )
+    elif cache_type == "pycccf":
+        return ThompsonSamplingCache(
+            cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="cf"
+        )
+    elif cache_type == "pycctblevelbt":
+        return ThompsonSamplingCache(
+            cache_size,
+            enable_cache_row_key,
+            [CostClassPolicy()],
+            cost_class_label="table_level_bt",
+        )
+    elif cache_type == "pycccfbt":
+        return ThompsonSamplingCache(
+            cache_size,
+            enable_cache_row_key,
+            [CostClassPolicy()],
+            cost_class_label="cf_bt",
+        )
+    elif cache_type == "pycctb":
+        return ThompsonSamplingCache(
+            cache_size,
+            enable_cache_row_key,
+            [CostClassPolicy()],
+            cost_class_label="table",
+        )
+    elif cache_type == "pyccbt":
+        return ThompsonSamplingCache(
+            cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="bt"
+        )
+    elif cache_type == "opt":
+        if enable_cache_row_key:
+            print("opt does not support hybrid mode.")
+            assert False
+        return OPTCache(cache_size)
+    elif cache_type == "trace":
+        if enable_cache_row_key:
+            print("trace does not support hybrid mode.")
+            assert False
+        return TraceCache(cache_size)
+    elif cache_type == "lru":
+        return LRUCache(cache_size, enable_cache_row_key)
+    elif cache_type == "arc":
+        return ARCCache(cache_size, enable_cache_row_key)
+    elif cache_type == "gdsize":
+        return GDSizeCache(cache_size, enable_cache_row_key)
+    else:
+        print("Unknown cache type {}".format(cache_type))
+        assert False
+    return None
+
+
+class BlockAccessTimeline:
+    """
+    BlockAccessTimeline stores all accesses of a block.
+    """
+
+    def __init__(self):
+        self.accesses = []
+        self.current_access_index = 1
+
+    def get_next_access(self):
+        if self.current_access_index == len(self.accesses):
+            return sys.maxsize
+        next_access_seq_no = self.accesses[self.current_access_index]
+        self.current_access_index += 1
+        return next_access_seq_no
+
+
+def percent(e1, e2):
+    if e2 == 0:
+        return -1
+    return float(e1) * 100.0 / float(e2)
+
+
+def is_target_cf(access_cf, target_cf_name):
+    if target_cf_name == "all":
+        return True
+    return access_cf == target_cf_name
+
+
+def run(
+    trace_file_path,
+    cache_type,
+    cache,
+    warmup_seconds,
+    max_accesses_to_process,
+    target_cf_name,
+):
+    warmup_complete = False
+    trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute)
+    access_seq_no = 0
+    time_interval = 1
+    start_time = time.time()
+    trace_start_time = 0
+    trace_duration = 0
+    is_opt_cache = False
+    if cache.cache_name() == "Belady MIN (opt)":
+        is_opt_cache = True
+
+    block_access_timelines = {}
+    num_no_inserts = 0
+    num_blocks_with_no_size = 0
+    num_inserts_block_with_no_size = 0
+
+    if is_opt_cache:
+        # Read all blocks in memory and stores their access times so that OPT
+        # can use this information to evict the cached key which next access is
+        # the furthest in the future.
+        print("Preprocessing block traces.")
+        with open(trace_file_path, "r") as trace_file:
+            for line in trace_file:
+                if (
+                    max_accesses_to_process != -1
+                    and access_seq_no > max_accesses_to_process
+                ):
+                    break
+                ts = line.split(",")
+                timestamp = int(ts[0])
+                cf_name = ts[5]
+                if not is_target_cf(cf_name, target_cf_name):
+                    continue
+                if trace_start_time == 0:
+                    trace_start_time = timestamp
+                trace_duration = timestamp - trace_start_time
+                block_id = int(ts[1])
+                block_size = int(ts[3])
+                no_insert = int(ts[9])
+                if block_id not in block_access_timelines:
+                    block_access_timelines[block_id] = BlockAccessTimeline()
+                    if block_size == 0:
+                        num_blocks_with_no_size += 1
+                block_access_timelines[block_id].accesses.append(access_seq_no)
+                access_seq_no += 1
+                if no_insert == 1:
+                    num_no_inserts += 1
+                if no_insert == 0 and block_size == 0:
+                    num_inserts_block_with_no_size += 1
+                if access_seq_no % 100 != 0:
+                    continue
+                now = time.time()
+                if now - start_time > time_interval * 10:
+                    print(
+                        "Take {} seconds to process {} trace records with trace "
+                        "duration of {} seconds. Throughput: {} records/second.".format(
+                            now - start_time,
+                            access_seq_no,
+                            trace_duration / 1000000,
+                            access_seq_no / (now - start_time),
+                        )
+                    )
+                    time_interval += 1
+            print(
+                "Trace contains {0} blocks, {1}({2:.2f}%) blocks with no size."
+                "{3} accesses, {4}({5:.2f}%) accesses with no_insert,"
+                "{6}({7:.2f}%) accesses that want to insert but block size is 0.".format(
+                    len(block_access_timelines),
+                    num_blocks_with_no_size,
+                    percent(num_blocks_with_no_size, len(block_access_timelines)),
+                    access_seq_no,
+                    num_no_inserts,
+                    percent(num_no_inserts, access_seq_no),
+                    num_inserts_block_with_no_size,
+                    percent(num_inserts_block_with_no_size, access_seq_no),
+                )
+            )
+
+    access_seq_no = 0
+    time_interval = 1
+    start_time = time.time()
+    trace_start_time = 0
+    trace_duration = 0
+    print("Running simulated {} cache on block traces.".format(cache.cache_name()))
+    with open(trace_file_path, "r") as trace_file:
+        for line in trace_file:
+            if (
+                max_accesses_to_process != -1
+                and access_seq_no > max_accesses_to_process
+            ):
+                break
+            if access_seq_no % 1000000 == 0:
+                # Force a python gc periodically to reduce memory usage.
+                gc.collect()
+            ts = line.split(",")
+            timestamp = int(ts[0])
+            cf_name = ts[5]
+            if not is_target_cf(cf_name, target_cf_name):
+                continue
+            if trace_start_time == 0:
+                trace_start_time = timestamp
+            trace_duration = timestamp - trace_start_time
+            if (
+                not warmup_complete
+                and warmup_seconds > 0
+                and trace_duration > warmup_seconds * 1000000
+            ):
+                cache.miss_ratio_stats.reset_counter()
+                warmup_complete = True
+            next_access_seq_no = 0
+            block_id = int(ts[1])
+            if is_opt_cache:
+                next_access_seq_no = block_access_timelines[block_id].get_next_access()
+            record = TraceRecord(
+                access_time=int(ts[0]),
+                block_id=int(ts[1]),
+                block_type=int(ts[2]),
+                block_size=int(ts[3]),
+                cf_id=int(ts[4]),
+                cf_name=ts[5],
+                level=int(ts[6]),
+                fd=int(ts[7]),
+                caller=int(ts[8]),
+                no_insert=int(ts[9]),
+                get_id=int(ts[10]),
+                key_id=int(ts[11]),
+                kv_size=int(ts[12]),
+                is_hit=int(ts[13]),
+                referenced_key_exist_in_block=int(ts[14]),
+                num_keys_in_block=int(ts[15]),
+                table_id=int(ts[16]),
+                seq_number=int(ts[17]),
+                block_key_size=int(ts[18]),
+                key_size=int(ts[19]),
+                block_offset_in_file=int(ts[20]),
+                next_access_seq_no=next_access_seq_no,
+            )
+            trace_miss_ratio_stats.update_metrics(
+                record.access_time, is_hit=record.is_hit, miss_bytes=record.block_size
+            )
+            cache.access(record)
+            access_seq_no += 1
+            del record
+            del ts
+            if access_seq_no % 100 != 0:
+                continue
+            # Report progress every 10 seconds.
+            now = time.time()
+            if now - start_time > time_interval * 10:
+                print(
+                    "Take {} seconds to process {} trace records with trace "
+                    "duration of {} seconds. Throughput: {} records/second. "
+                    "Trace miss ratio {}".format(
+                        now - start_time,
+                        access_seq_no,
+                        trace_duration / 1000000,
+                        access_seq_no / (now - start_time),
+                        trace_miss_ratio_stats.miss_ratio(),
+                    )
+                )
+                time_interval += 1
+                print(
+                    "{},0,0,{},{},{}".format(
+                        cache_type,
+                        cache.cache_size,
+                        cache.miss_ratio_stats.miss_ratio(),
+                        cache.miss_ratio_stats.num_accesses,
+                    )
+                )
+    now = time.time()
+    print(
+        "Take {} seconds to process {} trace records with trace duration of {} "
+        "seconds. Throughput: {} records/second. Trace miss ratio {}".format(
+            now - start_time,
+            access_seq_no,
+            trace_duration / 1000000,
+            access_seq_no / (now - start_time),
+            trace_miss_ratio_stats.miss_ratio(),
+        )
+    )
+    print(
+        "{},0,0,{},{},{}".format(
+            cache_type,
+            cache.cache_size,
+            cache.miss_ratio_stats.miss_ratio(),
+            cache.miss_ratio_stats.num_accesses,
+        )
+    )
+    return trace_start_time, trace_duration
+
+
+def report_stats(
+    cache,
+    cache_type,
+    cache_size,
+    target_cf_name,
+    result_dir,
+    trace_start_time,
+    trace_end_time,
+):
+    cache_label = "{}-{}-{}".format(cache_type, cache_size, target_cf_name)
+    with open("{}/data-ml-mrc-{}".format(result_dir, cache_label), "w+") as mrc_file:
+        mrc_file.write(
+            "{},0,0,{},{},{}\n".format(
+                cache_type,
+                cache_size,
+                cache.miss_ratio_stats.miss_ratio(),
+                cache.miss_ratio_stats.num_accesses,
+            )
+        )
+
+    cache_stats = [
+        cache.per_second_miss_ratio_stats,
+        cache.miss_ratio_stats,
+        cache.per_hour_miss_ratio_stats,
+    ]
+    for i in range(len(cache_stats)):
+        avg_miss_bytes, p95_miss_bytes = cache_stats[i].compute_miss_bytes()
+
+        with open(
+            "{}/data-ml-avgmb-{}-{}".format(
+                result_dir, cache_stats[i].time_unit, cache_label
+            ),
+            "w+",
+        ) as mb_file:
+            mb_file.write(
+                "{},0,0,{},{}\n".format(cache_type, cache_size, avg_miss_bytes)
+            )
+
+        with open(
+            "{}/data-ml-p95mb-{}-{}".format(
+                result_dir, cache_stats[i].time_unit, cache_label
+            ),
+            "w+",
+        ) as mb_file:
+            mb_file.write(
+                "{},0,0,{},{}\n".format(cache_type, cache_size, p95_miss_bytes)
+            )
+
+        cache_stats[i].write_miss_timeline(
+            cache_type,
+            cache_size,
+            target_cf_name,
+            result_dir,
+            trace_start_time,
+            trace_end_time,
+        )
+        cache_stats[i].write_miss_ratio_timeline(
+            cache_type,
+            cache_size,
+            target_cf_name,
+            result_dir,
+            trace_start_time,
+            trace_end_time,
+        )
+
+    if not cache.is_ml_cache():
+        return
+
+    policy_stats = [cache.policy_stats, cache.per_hour_policy_stats]
+    for i in range(len(policy_stats)):
+        policy_stats[i].write_policy_timeline(
+            cache_type,
+            cache_size,
+            target_cf_name,
+            result_dir,
+            trace_start_time,
+            trace_end_time,
+        )
+        policy_stats[i].write_policy_ratio_timeline(
+            cache_type,
+            cache_size,
+            target_cf_name,
+            result_dir,
+            trace_start_time,
+            trace_end_time,
+        )
+
+
+if __name__ == "__main__":
+    if len(sys.argv) <= 8:
+        print(
+            "Must provide 8 arguments.\n"
+            "1) Cache type (ts, linucb, arc, lru, opt, pylru, pymru, pylfu, "
+            "pyhb, gdsize, trace). One may evaluate the hybrid row_block cache "
+            "by appending '_hybrid' to a cache_type, e.g., ts_hybrid. "
+            "Note that hybrid is not supported with opt and trace. \n"
+            "2) Cache size (xM, xG, xT).\n"
+            "3) The sampling frequency used to collect the trace. (The "
+            "simulation scales down the cache size by the sampling frequency).\n"
+            "4) Warmup seconds (The number of seconds used for warmup).\n"
+            "5) Trace file path.\n"
+            "6) Result directory (A directory that saves generated results)\n"
+            "7) Max number of accesses to process\n"
+            "8) The target column family. (The simulation will only run "
+            "accesses on the target column family. If it is set to all, "
+            "it will run against all accesses.)"
+        )
+        exit(1)
+    print("Arguments: {}".format(sys.argv))
+    cache_type = sys.argv[1]
+    cache_size = parse_cache_size(sys.argv[2])
+    downsample_size = int(sys.argv[3])
+    warmup_seconds = int(sys.argv[4])
+    trace_file_path = sys.argv[5]
+    result_dir = sys.argv[6]
+    max_accesses_to_process = int(sys.argv[7])
+    target_cf_name = sys.argv[8]
+    cache = create_cache(cache_type, cache_size, downsample_size)
+    trace_start_time, trace_duration = run(
+        trace_file_path,
+        cache_type,
+        cache,
+        warmup_seconds,
+        max_accesses_to_process,
+        target_cf_name,
+    )
+    trace_end_time = trace_start_time + trace_duration
+    report_stats(
+        cache,
+        cache_type,
+        cache_size,
+        target_cf_name,
+        result_dir,
+        trace_start_time,
+        trace_end_time,
+    )
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh
new file mode 100644
index 000000000..295f734aa
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to run a batch of pysims and combine individual pysim output files.
+#
+# Usage: bash block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs
+# trace_file_path: The file path that stores the traces.
+# result_dir: The directory to store pysim results. The output files from a pysim is stores in result_dir/ml
+# downsample_size: The downsample size used to collect the trace.
+# warmup_seconds: The number of seconds used for warmup.
+# max_jobs: The max number of concurrent pysims to run.
+
+# Install required packages to run simulations.
+# sudo dnf install -y numpy scipy python-matplotlib ipython python-pandas sympy python-nose atlas-devel
+ulimit -c 0
+
+if [ $# -ne 5 ]; then
+  echo "Usage: ./block_cache_pysim.sh trace_file_path result_dir downsample_size warmup_seconds max_jobs"
+  exit 0
+fi
+
+trace_file="$1"
+result_dir="$2"
+downsample_size="$3"
+warmup_seconds="$4"
+max_jobs="$5"
+max_num_accesses=100000000
+current_jobs=1
+
+ml_tmp_result_dir="$result_dir/ml"
+rm -rf "$ml_tmp_result_dir"
+mkdir -p "$result_dir"
+mkdir -p "$ml_tmp_result_dir"
+
+# Report miss ratio in the trace.
+current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep)
+for cf_name in "all"
+do
+for cache_size in "1G" "2G" "4G" "8G" "16G" #"12G" "16G" "1T"
+do
+for cache_type in "opt" "lru" "pylru" "pycctbbt" "pyhb" "ts" "trace" "lru_hybrid"  #"pycctblevelbt" #"lru_hybridn" "opt" #"pylru" "pylru_hybrid" "pycctbbt" "pycccfbt" "trace"
+do
+    if [[ $cache_type == "trace" && $cache_size != "16G" ]]; then
+      # We only need to collect miss ratios observed in the trace once.
+      continue
+    fi
+    while [ "$current_jobs" -ge "$max_jobs" ]
+    do
+      sleep 10
+      echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
+      current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep)
+      echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
+    done
+    output="log-ml-$cache_type-$cache_size-$cf_name"
+    echo "Running simulation for $cache_type, cache size $cache_size, and cf_name $cf_name. Number of running jobs: $current_jobs. "
+    nohup python block_cache_pysim.py "$cache_type" "$cache_size" "$downsample_size" "$warmup_seconds" "$trace_file" "$ml_tmp_result_dir" "$max_num_accesses" "$cf_name" >& "$ml_tmp_result_dir/$output" &
+    current_jobs=$((current_jobs+1))
+done
+done
+done
+
+# Wait for all jobs to complete.
+while [ $current_jobs -gt 0 ]
+do
+  sleep 10
+  echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
+  current_jobs=$(ps aux | grep pysim | grep python | grep -cv grep)
+  echo "Waiting jobs to complete. Number of running jobs: $current_jobs"
+done
+
+echo "Combine individual pysim output files"
+
+rm -rf "$result_dir/ml_*"
+for header in "header-" "data-"
+do
+for fn in "$ml_tmp_result_dir"/*
+do
+  sum_file=""
+  time_unit=""
+  capacity=""
+  target_cf_name=""
+  if [[ $fn == *"timeline"* ]]; then
+    tmpfn="$fn"
+    IFS='-' read -ra elements <<< "$tmpfn"
+    time_unit_index=0
+    capacity_index=0
+    for i in "${elements[@]}"
+    do
+       if [[ $i == "timeline" ]]; then
+         break
+       fi
+       time_unit_index=$((time_unit_index+1))
+    done
+    time_unit_index=$((time_unit_index+1))
+    capacity_index=$((time_unit_index+2))
+    target_cf_name_index=$((time_unit_index+3))
+    time_unit="${elements[$time_unit_index]}_"
+    capacity="${elements[$capacity_index]}_"
+    target_cf_name="${elements[$target_cf_name_index]}_"
+  fi
+
+  if [[ $fn == *"${header}ml-policy-timeline"* ]]; then
+    sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_timeline"
+  fi
+  if [[ $fn == *"${header}ml-policy-ratio-timeline"* ]]; then
+    sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}policy_ratio_timeline"
+  fi
+  if [[ $fn == *"${header}ml-miss-timeline"* ]]; then
+    sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_timeline"
+  fi
+  if [[ $fn == *"${header}ml-miss-ratio-timeline"* ]]; then
+    sum_file="$result_dir/ml_${target_cf_name}${capacity}${time_unit}miss_ratio_timeline"
+  fi
+  if [[ $fn == *"${header}ml-mrc"* ]]; then
+    tmpfn="$fn"
+    IFS='-' read -ra elements <<< "$tmpfn"
+    target_cf_name=${elements[-1]}
+    sum_file="${result_dir}/ml_${target_cf_name}_mrc"
+  fi
+  if [[ $fn == *"${header}ml-avgmb"* ]]; then
+    tmpfn="$fn"
+    IFS='-' read -ra elements <<< "$tmpfn"
+    time_unit=${elements[3]}
+    target_cf_name=${elements[-1]}
+    sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_avgmb"
+  fi
+  if [[ $fn == *"${header}ml-p95mb"* ]]; then
+    tmpfn="$fn"
+    IFS='-' read -ra elements <<< "$tmpfn"
+    time_unit=${elements[3]}
+    target_cf_name=${elements[-1]}
+    sum_file="${result_dir}/ml_${time_unit}_${target_cf_name}_p95mb"
+  fi
+  if [[ $sum_file == "" ]]; then
+    continue
+  fi
+  if [[ $header == "header-" ]]; then
+    if [ -e "$sum_file" ]; then
+      continue
+    fi
+  fi
+  cat "$fn" >> "$sum_file"
+done
+done
+
+echo "Done"
+for fn in $result_dir/*
+do
+  if [[ $fn == *"_mrc" || $fn == *"_avgmb" || $fn == *"_p95mb" ]]; then
+    # Sort MRC file by cache_type and cache_size.
+    tmp_file="$result_dir/tmp_mrc"
+    cat "$fn" | sort -t ',' -k1,1 -k4,4n > "$tmp_file"
+    cat "$tmp_file" > "$fn"
+    rm -rf "$tmp_file"
+  fi
+done
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py
new file mode 100644
index 000000000..eed1b94af
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_pysim_test.py
@@ -0,0 +1,734 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+import os
+import random
+import sys
+
+from block_cache_pysim import (
+    ARCCache,
+    CacheEntry,
+    create_cache,
+    GDSizeCache,
+    HashTable,
+    HyperbolicPolicy,
+    kMicrosInSecond,
+    kSampleSize,
+    LFUPolicy,
+    LinUCBCache,
+    LRUCache,
+    LRUPolicy,
+    MRUPolicy,
+    OPTCache,
+    OPTCacheEntry,
+    run,
+    ThompsonSamplingCache,
+    TraceCache,
+    TraceRecord,
+)
+
+
+def test_hash_table():
+    print("Test hash table")
+    table = HashTable()
+    data_size = 10000
+    for i in range(data_size):
+        table.insert("k{}".format(i), i, "v{}".format(i))
+    for i in range(data_size):
+        assert table.lookup("k{}".format(i), i) is not None
+    for i in range(data_size):
+        table.delete("k{}".format(i), i)
+    for i in range(data_size):
+        assert table.lookup("k{}".format(i), i) is None
+
+    truth_map = {}
+    n = 1000000
+    records = 100
+    for i in range(n):
+        key_id = random.randint(0, records)
+        v = random.randint(0, records)
+        key = "k{}".format(key_id)
+        value = CacheEntry(v, v, v, v, v, v, v)
+        action = random.randint(0, 10)
+        assert len(truth_map) == table.elements, "{} {} {}".format(
+            len(truth_map), table.elements, i
+        )
+        if action <= 8:
+            if key in truth_map:
+                assert table.lookup(key, key_id) is not None
+                assert truth_map[key].value_size == table.lookup(key, key_id).value_size
+            else:
+                assert table.lookup(key, key_id) is None
+            table.insert(key, key_id, value)
+            truth_map[key] = value
+        else:
+            deleted = table.delete(key, key_id)
+            if deleted:
+                assert key in truth_map
+            if key in truth_map:
+                del truth_map[key]
+
+    # Check all keys are unique in the sample set.
+    for _i in range(10):
+        samples = table.random_sample(kSampleSize)
+        unique_keys = {}
+        for sample in samples:
+            unique_keys[sample.key] = True
+        assert len(samples) == len(unique_keys)
+
+    assert len(table) == len(truth_map)
+    for key in truth_map:
+        assert table.lookup(key, int(key[1:])) is not None
+        assert truth_map[key].value_size == table.lookup(key, int(key[1:])).value_size
+    print("Test hash table: Success")
+
+
+def assert_metrics(cache, expected_value, expected_value_size=1, custom_hashtable=True):
+    assert cache.used_size == expected_value[0], "Expected {}, Actual {}".format(
+        expected_value[0], cache.used_size
+    )
+    assert (
+        cache.miss_ratio_stats.num_accesses == expected_value[1]
+    ), "Expected {}, Actual {}".format(
+        expected_value[1], cache.miss_ratio_stats.num_accesses
+    )
+    assert (
+        cache.miss_ratio_stats.num_misses == expected_value[2]
+    ), "Expected {}, Actual {}".format(
+        expected_value[2], cache.miss_ratio_stats.num_misses
+    )
+    assert len(cache.table) == len(expected_value[3]) + len(
+        expected_value[4]
+    ), "Expected {}, Actual {}".format(
+        len(expected_value[3]) + len(expected_value[4]), cache.table.elements
+    )
+    for expeceted_k in expected_value[3]:
+        if custom_hashtable:
+            val = cache.table.lookup("b{}".format(expeceted_k), expeceted_k)
+        else:
+            val = cache.table["b{}".format(expeceted_k)]
+        assert val is not None, "Expected {} Actual: Not Exist {}, Table: {}".format(
+            expeceted_k, expected_value, cache.table
+        )
+        assert val.value_size == expected_value_size
+    for expeceted_k in expected_value[4]:
+        if custom_hashtable:
+            val = cache.table.lookup("g0-{}".format(expeceted_k), expeceted_k)
+        else:
+            val = cache.table["g0-{}".format(expeceted_k)]
+        assert val is not None
+        assert val.value_size == expected_value_size
+
+
+# Access k1, k1, k2, k3, k3, k3, k4
+# When k4 is inserted,
+#   LRU should evict k1.
+#   LFU should evict k2.
+#   MRU should evict k3.
+def test_cache(cache, expected_value, custom_hashtable=True):
+    k1 = TraceRecord(
+        access_time=0,
+        block_id=1,
+        block_type=1,
+        block_size=1,
+        cf_id=0,
+        cf_name="",
+        level=0,
+        fd=0,
+        caller=1,
+        no_insert=0,
+        get_id=1,
+        key_id=1,
+        kv_size=5,
+        is_hit=1,
+        referenced_key_exist_in_block=1,
+        num_keys_in_block=0,
+        table_id=0,
+        seq_number=0,
+        block_key_size=0,
+        key_size=0,
+        block_offset_in_file=0,
+        next_access_seq_no=0,
+    )
+    k2 = TraceRecord(
+        access_time=1,
+        block_id=2,
+        block_type=1,
+        block_size=1,
+        cf_id=0,
+        cf_name="",
+        level=0,
+        fd=0,
+        caller=1,
+        no_insert=0,
+        get_id=1,
+        key_id=1,
+        kv_size=5,
+        is_hit=1,
+        referenced_key_exist_in_block=1,
+        num_keys_in_block=0,
+        table_id=0,
+        seq_number=0,
+        block_key_size=0,
+        key_size=0,
+        block_offset_in_file=0,
+        next_access_seq_no=0,
+    )
+    k3 = TraceRecord(
+        access_time=2,
+        block_id=3,
+        block_type=1,
+        block_size=1,
+        cf_id=0,
+        cf_name="",
+        level=0,
+        fd=0,
+        caller=1,
+        no_insert=0,
+        get_id=1,
+        key_id=1,
+        kv_size=5,
+        is_hit=1,
+        referenced_key_exist_in_block=1,
+        num_keys_in_block=0,
+        table_id=0,
+        seq_number=0,
+        block_key_size=0,
+        key_size=0,
+        block_offset_in_file=0,
+        next_access_seq_no=0,
+    )
+    k4 = TraceRecord(
+        access_time=3,
+        block_id=4,
+        block_type=1,
+        block_size=1,
+        cf_id=0,
+        cf_name="",
+        level=0,
+        fd=0,
+        caller=1,
+        no_insert=0,
+        get_id=1,
+        key_id=1,
+        kv_size=5,
+        is_hit=1,
+        referenced_key_exist_in_block=1,
+        num_keys_in_block=0,
+        table_id=0,
+        seq_number=0,
+        block_key_size=0,
+        key_size=0,
+        block_offset_in_file=0,
+        next_access_seq_no=0,
+    )
+    sequence = [k1, k1, k2, k3, k3, k3]
+    index = 0
+    expected_values = []
+    # Access k1, miss.
+    expected_values.append([1, 1, 1, [1], []])
+    # Access k1, hit.
+    expected_values.append([1, 2, 1, [1], []])
+    # Access k2, miss.
+    expected_values.append([2, 3, 2, [1, 2], []])
+    # Access k3, miss.
+    expected_values.append([3, 4, 3, [1, 2, 3], []])
+    # Access k3, hit.
+    expected_values.append([3, 5, 3, [1, 2, 3], []])
+    # Access k3, hit.
+    expected_values.append([3, 6, 3, [1, 2, 3], []])
+    access_time = 0
+    for access in sequence:
+        access.access_time = access_time
+        cache.access(access)
+        assert_metrics(
+            cache,
+            expected_values[index],
+            expected_value_size=1,
+            custom_hashtable=custom_hashtable,
+        )
+        access_time += 1
+        index += 1
+    k4.access_time = access_time
+    cache.access(k4)
+    assert_metrics(
+        cache, expected_value, expected_value_size=1, custom_hashtable=custom_hashtable
+    )
+
+
+def test_lru_cache(cache, custom_hashtable):
+    print("Test LRU cache")
+    # Access k4, miss. evict k1
+    test_cache(cache, [3, 7, 4, [2, 3, 4], []], custom_hashtable)
+    print("Test LRU cache: Success")
+
+
+def test_mru_cache():
+    print("Test MRU cache")
+    policies = []
+    policies.append(MRUPolicy())
+    # Access k4, miss. evict k3
+    test_cache(
+        ThompsonSamplingCache(3, False, policies, cost_class_label=None),
+        [3, 7, 4, [1, 2, 4], []],
+    )
+    print("Test MRU cache: Success")
+
+
+def test_lfu_cache():
+    print("Test LFU cache")
+    policies = []
+    policies.append(LFUPolicy())
+    # Access k4, miss. evict k2
+    test_cache(
+        ThompsonSamplingCache(3, False, policies, cost_class_label=None),
+        [3, 7, 4, [1, 3, 4], []],
+    )
+    print("Test LFU cache: Success")
+
+
+def test_mix(cache):
+    print("Test Mix {} cache".format(cache.cache_name()))
+    n = 100000
+    records = 100
+    block_size_table = {}
+    trace_num_misses = 0
+    for i in range(n):
+        key_id = random.randint(0, records)
+        vs = random.randint(0, 10)
+        now = i * kMicrosInSecond
+        block_size = vs
+        if key_id in block_size_table:
+            block_size = block_size_table[key_id]
+        else:
+            block_size_table[key_id] = block_size
+        is_hit = key_id % 2
+        if is_hit == 0:
+            trace_num_misses += 1
+        k = TraceRecord(
+            access_time=now,
+            block_id=key_id,
+            block_type=1,
+            block_size=block_size,
+            cf_id=0,
+            cf_name="",
+            level=0,
+            fd=0,
+            caller=1,
+            no_insert=0,
+            get_id=key_id,
+            key_id=key_id,
+            kv_size=5,
+            is_hit=is_hit,
+            referenced_key_exist_in_block=1,
+            num_keys_in_block=0,
+            table_id=0,
+            seq_number=0,
+            block_key_size=0,
+            key_size=0,
+            block_offset_in_file=0,
+            next_access_seq_no=vs,
+        )
+        cache.access(k)
+    assert cache.miss_ratio_stats.miss_ratio() > 0
+    if cache.cache_name() == "Trace":
+        assert cache.miss_ratio_stats.num_accesses == n
+        assert cache.miss_ratio_stats.num_misses == trace_num_misses
+    else:
+        assert cache.used_size <= cache.cache_size
+        all_values = cache.table.values()
+        cached_size = 0
+        for value in all_values:
+            cached_size += value.value_size
+        assert cached_size == cache.used_size, "Expeced {} Actual {}".format(
+            cache.used_size, cached_size
+        )
+    print("Test Mix {} cache: Success".format(cache.cache_name()))
+
+
+def test_end_to_end():
+    print("Test All caches")
+    n = 100000
+    nblocks = 1000
+    block_size = 16 * 1024
+    ncfs = 7
+    nlevels = 6
+    nfds = 100000
+    trace_file_path = "test_trace"
+    # All blocks are of the same size so that OPT must achieve the lowest miss
+    # ratio.
+    with open(trace_file_path, "w+") as trace_file:
+        access_records = ""
+        for i in range(n):
+            key_id = random.randint(0, nblocks)
+            cf_id = random.randint(0, ncfs)
+            level = random.randint(0, nlevels)
+            fd = random.randint(0, nfds)
+            now = i * kMicrosInSecond
+            access_record = ""
+            access_record += "{},".format(now)
+            access_record += "{},".format(key_id)
+            access_record += "{},".format(9)  # block type
+            access_record += "{},".format(block_size)  # block size
+            access_record += "{},".format(cf_id)
+            access_record += "cf_{},".format(cf_id)
+            access_record += "{},".format(level)
+            access_record += "{},".format(fd)
+            access_record += "{},".format(key_id % 3)  # caller
+            access_record += "{},".format(0)  # no insert
+            access_record += "{},".format(i)  # get_id
+            access_record += "{},".format(i)  # key_id
+            access_record += "{},".format(100)  # kv_size
+            access_record += "{},".format(1)  # is_hit
+            access_record += "{},".format(1)  # referenced_key_exist_in_block
+            access_record += "{},".format(10)  # num_keys_in_block
+            access_record += "{},".format(1)  # table_id
+            access_record += "{},".format(0)  # seq_number
+            access_record += "{},".format(10)  # block key size
+            access_record += "{},".format(20)  # key size
+            access_record += "{},".format(0)  # block offset
+            access_record = access_record[:-1]
+            access_records += access_record + "\n"
+        trace_file.write(access_records)
+
+    print("Test All caches: Start testing caches")
+    cache_size = block_size * nblocks / 10
+    downsample_size = 1
+    cache_ms = {}
+    for cache_type in [
+        "ts",
+        "opt",
+        "lru",
+        "pylru",
+        "linucb",
+        "gdsize",
+        "pyccbt",
+        "pycctbbt",
+    ]:
+        cache = create_cache(cache_type, cache_size, downsample_size)
+        run(trace_file_path, cache_type, cache, 0, -1, "all")
+        cache_ms[cache_type] = cache
+        assert cache.miss_ratio_stats.num_accesses == n
+
+    for cache_type in cache_ms:
+        cache = cache_ms[cache_type]
+        ms = cache.miss_ratio_stats.miss_ratio()
+        assert ms <= 100.0 and ms >= 0.0
+        # OPT should perform the best.
+        assert cache_ms["opt"].miss_ratio_stats.miss_ratio() <= ms
+        assert cache.used_size <= cache.cache_size
+        all_values = cache.table.values()
+        cached_size = 0
+        for value in all_values:
+            cached_size += value.value_size
+        assert cached_size == cache.used_size, "Expeced {} Actual {}".format(
+            cache.used_size, cached_size
+        )
+        print("Test All {}: Success".format(cache.cache_name()))
+
+    os.remove(trace_file_path)
+    print("Test All: Success")
+
+
+def test_hybrid(cache):
+    print("Test {} cache".format(cache.cache_name()))
+    k = TraceRecord(
+        access_time=0,
+        block_id=1,
+        block_type=1,
+        block_size=1,
+        cf_id=0,
+        cf_name="",
+        level=0,
+        fd=0,
+        caller=1,
+        no_insert=0,
+        get_id=1,  # the first get request.
+        key_id=1,
+        kv_size=0,  # no size.
+        is_hit=1,
+        referenced_key_exist_in_block=1,
+        num_keys_in_block=0,
+        table_id=0,
+        seq_number=0,
+        block_key_size=0,
+        key_size=0,
+        block_offset_in_file=0,
+        next_access_seq_no=0,
+    )
+    cache.access(k)  # Expect a miss.
+    # used size, num accesses, num misses, hash table size, blocks, get keys.
+    assert_metrics(cache, [1, 1, 1, [1], []])
+    k.access_time += 1
+    k.kv_size = 1
+    k.block_id = 2
+    cache.access(k)  # k should be inserted.
+    assert_metrics(cache, [3, 2, 2, [1, 2], [1]])
+    k.access_time += 1
+    k.block_id = 3
+    cache.access(k)  # k should not be inserted again.
+    assert_metrics(cache, [4, 3, 3, [1, 2, 3], [1]])
+    # A second get request referencing the same key.
+    k.access_time += 1
+    k.get_id = 2
+    k.block_id = 4
+    k.kv_size = 0
+    cache.access(k)  # k should observe a hit. No block access.
+    assert_metrics(cache, [4, 4, 3, [1, 2, 3], [1]])
+
+    # A third get request searches three files, three different keys.
+    # And the second key observes a hit.
+    k.access_time += 1
+    k.kv_size = 1
+    k.get_id = 3
+    k.block_id = 3
+    k.key_id = 2
+    cache.access(k)  # k should observe a miss. block 3 observes a hit.
+    assert_metrics(cache, [5, 5, 3, [1, 2, 3], [1, 2]])
+
+    k.access_time += 1
+    k.kv_size = 1
+    k.get_id = 3
+    k.block_id = 4
+    k.kv_size = 1
+    k.key_id = 1
+    cache.access(k)  # k1 should observe a hit.
+    assert_metrics(cache, [5, 6, 3, [1, 2, 3], [1, 2]])
+
+    k.access_time += 1
+    k.kv_size = 1
+    k.get_id = 3
+    k.block_id = 4
+    k.kv_size = 1
+    k.key_id = 3
+    # k3 should observe a miss.
+    # However, as the get already complete, we should not access k3 any more.
+    cache.access(k)
+    assert_metrics(cache, [5, 7, 3, [1, 2, 3], [1, 2]])
+
+    # A fourth get request searches one file and two blocks. One row key.
+    k.access_time += 1
+    k.get_id = 4
+    k.block_id = 5
+    k.key_id = 4
+    k.kv_size = 1
+    cache.access(k)
+    assert_metrics(cache, [7, 8, 4, [1, 2, 3, 5], [1, 2, 4]])
+
+    # A bunch of insertions which evict cached row keys.
+    for i in range(6, 100):
+        k.access_time += 1
+        k.get_id = 0
+        k.block_id = i
+        cache.access(k)
+
+    k.get_id = 4
+    k.block_id = 100  # A different block.
+    k.key_id = 4  # Same row key and should not be inserted again.
+    k.kv_size = 1
+    cache.access(k)
+    assert_metrics(
+        cache, [kSampleSize, 103, 99, [i for i in range(101 - kSampleSize, 101)], []]
+    )
+    print("Test {} cache: Success".format(cache.cache_name()))
+
+
+def test_opt_cache():
+    print("Test OPT cache")
+    cache = OPTCache(3)
+    # seq:         0,  1,  2,  3,  4,  5,  6,  7,  8
+    # key:         k1, k2, k3, k4, k5, k6, k7, k1, k8
+    # next_access: 7,  19, 18, M,  M,  17, 16, 25, M
+    k = TraceRecord(
+        access_time=0,
+        block_id=1,
+        block_type=1,
+        block_size=1,
+        cf_id=0,
+        cf_name="",
+        level=0,
+        fd=0,
+        caller=1,
+        no_insert=0,
+        get_id=1,  # the first get request.
+        key_id=1,
+        kv_size=0,  # no size.
+        is_hit=1,
+        referenced_key_exist_in_block=1,
+        num_keys_in_block=0,
+        table_id=0,
+        seq_number=0,
+        block_key_size=0,
+        key_size=0,
+        block_offset_in_file=0,
+        next_access_seq_no=7,
+    )
+    cache.access(k)
+    assert_metrics(
+        cache, [1, 1, 1, [1], []], expected_value_size=1, custom_hashtable=False
+    )
+    k.access_time += 1
+    k.block_id = 2
+    k.next_access_seq_no = 19
+    cache.access(k)
+    assert_metrics(
+        cache, [2, 2, 2, [1, 2], []], expected_value_size=1, custom_hashtable=False
+    )
+    k.access_time += 1
+    k.block_id = 3
+    k.next_access_seq_no = 18
+    cache.access(k)
+    assert_metrics(
+        cache, [3, 3, 3, [1, 2, 3], []], expected_value_size=1, custom_hashtable=False
+    )
+    k.access_time += 1
+    k.block_id = 4
+    k.next_access_seq_no = sys.maxsize  # Never accessed again.
+    cache.access(k)
+    # Evict 2 since its next access 19 is the furthest in the future.
+    assert_metrics(
+        cache, [3, 4, 4, [1, 3, 4], []], expected_value_size=1, custom_hashtable=False
+    )
+    k.access_time += 1
+    k.block_id = 5
+    k.next_access_seq_no = sys.maxsize  # Never accessed again.
+    cache.access(k)
+    # Evict 4 since its next access MAXINT is the furthest in the future.
+    assert_metrics(
+        cache, [3, 5, 5, [1, 3, 5], []], expected_value_size=1, custom_hashtable=False
+    )
+    k.access_time += 1
+    k.block_id = 6
+    k.next_access_seq_no = 17
+    cache.access(k)
+    # Evict 5 since its next access MAXINT is the furthest in the future.
+    assert_metrics(
+        cache, [3, 6, 6, [1, 3, 6], []], expected_value_size=1, custom_hashtable=False
+    )
+    k.access_time += 1
+    k.block_id = 7
+    k.next_access_seq_no = 16
+    cache.access(k)
+    # Evict 3 since its next access 18 is the furthest in the future.
+    assert_metrics(
+        cache, [3, 7, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False
+    )
+    k.access_time += 1
+    k.block_id = 1
+    k.next_access_seq_no = 25
+    cache.access(k)
+    assert_metrics(
+        cache, [3, 8, 7, [1, 6, 7], []], expected_value_size=1, custom_hashtable=False
+    )
+    k.access_time += 1
+    k.block_id = 8
+    k.next_access_seq_no = sys.maxsize
+    cache.access(k)
+    # Evict 1 since its next access 25 is the furthest in the future.
+    assert_metrics(
+        cache, [3, 9, 8, [6, 7, 8], []], expected_value_size=1, custom_hashtable=False
+    )
+
+    # Insert a large kv pair to evict all keys.
+    k.access_time += 1
+    k.block_id = 10
+    k.block_size = 3
+    k.next_access_seq_no = sys.maxsize
+    cache.access(k)
+    assert_metrics(
+        cache, [3, 10, 9, [10], []], expected_value_size=3, custom_hashtable=False
+    )
+    print("Test OPT cache: Success")
+
+
+def test_trace_cache():
+    print("Test trace cache")
+    cache = TraceCache(0)
+    k = TraceRecord(
+        access_time=0,
+        block_id=1,
+        block_type=1,
+        block_size=1,
+        cf_id=0,
+        cf_name="",
+        level=0,
+        fd=0,
+        caller=1,
+        no_insert=0,
+        get_id=1,
+        key_id=1,
+        kv_size=0,
+        is_hit=1,
+        referenced_key_exist_in_block=1,
+        num_keys_in_block=0,
+        table_id=0,
+        seq_number=0,
+        block_key_size=0,
+        key_size=0,
+        block_offset_in_file=0,
+        next_access_seq_no=7,
+    )
+    cache.access(k)
+    assert cache.miss_ratio_stats.num_accesses == 1
+    assert cache.miss_ratio_stats.num_misses == 0
+    k.is_hit = 0
+    cache.access(k)
+    assert cache.miss_ratio_stats.num_accesses == 2
+    assert cache.miss_ratio_stats.num_misses == 1
+    print("Test trace cache: Success")
+
+
+if __name__ == "__main__":
+    test_hash_table()
+    test_trace_cache()
+    test_opt_cache()
+    test_lru_cache(
+        ThompsonSamplingCache(
+            3, enable_cache_row_key=0, policies=[LRUPolicy()], cost_class_label=None
+        ),
+        custom_hashtable=True,
+    )
+    test_lru_cache(LRUCache(3, enable_cache_row_key=0), custom_hashtable=False)
+    test_mru_cache()
+    test_lfu_cache()
+    test_hybrid(
+        ThompsonSamplingCache(
+            kSampleSize,
+            enable_cache_row_key=1,
+            policies=[LRUPolicy()],
+            cost_class_label=None,
+        )
+    )
+    test_hybrid(
+        LinUCBCache(
+            kSampleSize,
+            enable_cache_row_key=1,
+            policies=[LRUPolicy()],
+            cost_class_label=None,
+        )
+    )
+    for cache_type in [
+        "ts",
+        "opt",
+        "arc",
+        "pylfu",
+        "pymru",
+        "trace",
+        "pyhb",
+        "lru",
+        "pylru",
+        "linucb",
+        "gdsize",
+        "pycctbbt",
+        "pycctb",
+        "pyccbt",
+    ]:
+        for enable_row_cache in [0, 1, 2]:
+            cache_type_str = cache_type
+            if cache_type != "opt" and cache_type != "trace":
+                if enable_row_cache == 1:
+                    cache_type_str += "_hybrid"
+                elif enable_row_cache == 2:
+                    cache_type_str += "_hybridn"
+            test_mix(create_cache(cache_type_str, cache_size=100, downsample_size=1))
+    test_end_to_end()
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc
new file mode 100644
index 000000000..f0bb6975b
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc
@@ -0,0 +1,2316 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifdef GFLAGS
+#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <sstream>
+
+#include "monitoring/histogram.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/trace_record.h"
+#include "util/gflags_compat.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_string(block_cache_trace_path, "", "The trace file path.");
+DEFINE_bool(is_block_cache_human_readable_trace, false,
+            "Is the trace file provided for analysis generated by running "
+            "block_cache_trace_analyzer with "
+            "FLAGS_human_readable_trace_file_path is specified.");
+DEFINE_string(
+    block_cache_sim_config_path, "",
+    "The config file path. One cache configuration per line. The format of a "
+    "cache configuration is "
+    "cache_name,num_shard_bits,ghost_capacity,cache_capacity_1,...,cache_"
+    "capacity_N. Supported cache names are lru, lru_priority, lru_hybrid, and "
+    "lru_hybrid_no_insert_on_row_miss. User may also add a prefix 'ghost_' to "
+    "a cache_name to add a ghost cache in front of the real cache. "
+    "ghost_capacity and cache_capacity can be xK, xM or xG where x is a "
+    "positive number.");
+DEFINE_int32(block_cache_trace_downsample_ratio, 1,
+             "The trace collected accesses on one in every "
+             "block_cache_trace_downsample_ratio blocks. We scale "
+             "down the simulated cache size by this ratio.");
+DEFINE_bool(print_block_size_stats, false,
+            "Print block size distribution and the distribution break down by "
+            "block type and column family.");
+DEFINE_bool(print_access_count_stats, false,
+            "Print access count distribution and the distribution break down "
+            "by block type and column family.");
+DEFINE_bool(print_data_block_access_count_stats, false,
+            "Print data block accesses by user Get and Multi-Get.");
+DEFINE_int32(cache_sim_warmup_seconds, 0,
+             "The number of seconds to warmup simulated caches. The hit/miss "
+             "counters are reset after the warmup completes.");
+DEFINE_int32(analyze_bottom_k_access_count_blocks, 0,
+             "Print out detailed access information for blocks with their "
+             "number of accesses are the bottom k among all blocks.");
+DEFINE_int32(analyze_top_k_access_count_blocks, 0,
+             "Print out detailed access information for blocks with their "
+             "number of accesses are the top k among all blocks.");
+DEFINE_string(block_cache_analysis_result_dir, "",
+              "The directory that saves block cache analysis results.");
+DEFINE_string(
+    timeline_labels, "",
+    "Group the number of accesses per block per second using these labels. "
+    "Possible labels are a combination of the following: cf (column family), "
+    "sst, level, bt (block type), caller, block. For example, label \"cf_bt\" "
+    "means the number of access per second is grouped by unique pairs of "
+    "\"cf_bt\". A label \"all\" contains the aggregated number of accesses per "
+    "second across all possible labels.");
+DEFINE_string(reuse_distance_labels, "",
+              "Group the reuse distance of a block using these labels. Reuse "
+              "distance is defined as the cumulated size of unique blocks read "
+              "between two consecutive accesses on the same block.");
+DEFINE_string(
+    reuse_distance_buckets, "",
+    "Group blocks by their reuse distances given these buckets. For "
+    "example, if 'reuse_distance_buckets' is '1K,1M,1G', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse distance less than 1KB, between 1K and 1M, between 1M "
+    "and 1G, respectively. The last bucket contains the number of blocks with "
+    "reuse distance larger than 1G. ");
+DEFINE_string(
+    reuse_interval_labels, "",
+    "Group the reuse interval of a block using these labels. Reuse "
+    "interval is defined as the time between two consecutive accesses "
+    "on the same block.");
+DEFINE_string(
+    reuse_interval_buckets, "",
+    "Group blocks by their reuse interval given these buckets. For "
+    "example, if 'reuse_distance_buckets' is '1,10,100', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse interval less than 1 second, between 1 second and 10 "
+    "seconds, between 10 seconds and 100 seconds, respectively. The last "
+    "bucket contains the number of blocks with reuse interval longer than 100 "
+    "seconds.");
+DEFINE_string(
+    reuse_lifetime_labels, "",
+    "Group the reuse lifetime of a block using these labels. Reuse "
+    "lifetime is defined as the time interval between the first access on a "
+    "block and the last access on the same block. For blocks that are only "
+    "accessed once, its lifetime is set to kMaxUint64.");
+DEFINE_string(
+    reuse_lifetime_buckets, "",
+    "Group blocks by their reuse lifetime given these buckets. For "
+    "example, if 'reuse_lifetime_buckets' is '1,10,100', we will "
+    "create four buckets. The first three buckets contain the number of "
+    "blocks with reuse lifetime less than 1 second, between 1 second and 10 "
+    "seconds, between 10 seconds and 100 seconds, respectively. The last "
+    "bucket contains the number of blocks with reuse lifetime longer than 100 "
+    "seconds.");
+DEFINE_string(
+    analyze_callers, "",
+    "The list of callers to perform a detailed analysis on. If speicfied, the "
+    "analyzer will output a detailed percentage of accesses for each caller "
+    "break down by column family, level, and block type. A list of available "
+    "callers are: Get, MultiGet, Iterator, ApproximateSize, VerifyChecksum, "
+    "SSTDumpTool, ExternalSSTIngestion, Repair, Prefetch, Compaction, "
+    "CompactionRefill, Flush, SSTFileReader, Uncategorized.");
+DEFINE_string(access_count_buckets, "",
+              "Group number of blocks by their access count given these "
+              "buckets. If specified, the analyzer will output a detailed "
+              "analysis on the number of blocks grouped by their access count "
+              "break down by block type and column family.");
+DEFINE_int32(analyze_blocks_reuse_k_reuse_window, 0,
+             "Analyze the percentage of blocks that are accessed in the "
+             "[k, 2*k] seconds are accessed again in the next [2*k, 3*k], "
+             "[3*k, 4*k],...,[k*(n-1), k*n] seconds. ");
+DEFINE_string(analyze_get_spatial_locality_labels, "",
+              "Group data blocks using these labels.");
+DEFINE_string(analyze_get_spatial_locality_buckets, "",
+              "Group data blocks by their statistics using these buckets.");
+DEFINE_string(skew_labels, "",
+              "Group the access count of a block using these labels.");
+DEFINE_string(skew_buckets, "", "Group the skew labels using these buckets.");
+DEFINE_bool(mrc_only, false,
+            "Evaluate alternative cache policies only. When this flag is true, "
+            "the analyzer does NOT maintain states of each block in memory for "
+            "analysis. It only feeds the accesses into the cache simulators.");
+DEFINE_string(
+    analyze_correlation_coefficients_labels, "",
+    "Analyze the correlation coefficients of features such as number of past "
+    "accesses with regard to the number of accesses till the next access.");
+DEFINE_int32(analyze_correlation_coefficients_max_number_of_values, 1000000,
+             "The maximum number of values for a feature. If the number of "
+             "values for a feature is larger than this max, it randomly "
+             "selects 'max' number of values.");
+DEFINE_string(human_readable_trace_file_path, "",
+              "The filt path that saves human readable access records.");
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+const std::string kMissRatioCurveFileName = "mrc";
+const std::string kGroupbyBlock = "block";
+const std::string kGroupbyTable = "table";
+const std::string kGroupbyColumnFamily = "cf";
+const std::string kGroupbySSTFile = "sst";
+const std::string kGroupbyBlockType = "bt";
+const std::string kGroupbyCaller = "caller";
+const std::string kGroupbyLevel = "level";
+const std::string kGroupbyAll = "all";
+const std::set<std::string> kGroupbyLabels{
+    kGroupbyBlock,     kGroupbyColumnFamily, kGroupbySSTFile, kGroupbyLevel,
+    kGroupbyBlockType, kGroupbyCaller,       kGroupbyAll};
+const std::string kSupportedCacheNames =
+    " lru ghost_lru lru_priority ghost_lru_priority lru_hybrid "
+    "ghost_lru_hybrid lru_hybrid_no_insert_on_row_miss "
+    "ghost_lru_hybrid_no_insert_on_row_miss ";
+
+// The suffix for the generated csv files.
+const std::string kFileNameSuffixMissRatioTimeline = "miss_ratio_timeline";
+const std::string kFileNameSuffixMissTimeline = "miss_timeline";
+const std::string kFileNameSuffixSkew = "skewness";
+const std::string kFileNameSuffixAccessTimeline = "access_timeline";
+const std::string kFileNameSuffixCorrelation = "correlation_input";
+const std::string kFileNameSuffixAvgReuseIntervalNaccesses =
+    "avg_reuse_interval_naccesses";
+const std::string kFileNameSuffixAvgReuseInterval = "avg_reuse_interval";
+const std::string kFileNameSuffixReuseInterval = "access_reuse_interval";
+const std::string kFileNameSuffixReuseLifetime = "reuse_lifetime";
+const std::string kFileNameSuffixAccessReuseBlocksTimeline =
+    "reuse_blocks_timeline";
+const std::string kFileNameSuffixPercentOfAccessSummary =
+    "percentage_of_accesses_summary";
+const std::string kFileNameSuffixPercentRefKeys = "percent_ref_keys";
+const std::string kFileNameSuffixPercentDataSizeOnRefKeys =
+    "percent_data_size_on_ref_keys";
+const std::string kFileNameSuffixPercentAccessesOnRefKeys =
+    "percent_accesses_on_ref_keys";
+const std::string kFileNameSuffixAccessCountSummary = "access_count_summary";
+
+std::string block_type_to_string(TraceType type) {
+  switch (type) {
+    case kBlockTraceFilterBlock:
+      return "Filter";
+    case kBlockTraceDataBlock:
+      return "Data";
+    case kBlockTraceIndexBlock:
+      return "Index";
+    case kBlockTraceRangeDeletionBlock:
+      return "RangeDeletion";
+    case kBlockTraceUncompressionDictBlock:
+      return "UncompressionDict";
+    default:
+      break;
+  }
+  // This cannot happen.
+  return "InvalidType";
+}
+
+std::string caller_to_string(TableReaderCaller caller) {
+  switch (caller) {
+    case kUserGet:
+      return "Get";
+    case kUserMultiGet:
+      return "MultiGet";
+    case kUserIterator:
+      return "Iterator";
+    case kUserApproximateSize:
+      return "ApproximateSize";
+    case kUserVerifyChecksum:
+      return "VerifyChecksum";
+    case kSSTDumpTool:
+      return "SSTDumpTool";
+    case kExternalSSTIngestion:
+      return "ExternalSSTIngestion";
+    case kRepair:
+      return "Repair";
+    case kPrefetch:
+      return "Prefetch";
+    case kCompaction:
+      return "Compaction";
+    case kCompactionRefill:
+      return "CompactionRefill";
+    case kFlush:
+      return "Flush";
+    case kSSTFileReader:
+      return "SSTFileReader";
+    case kUncategorized:
+      return "Uncategorized";
+    default:
+      break;
+  }
+  // This cannot happen.
+  return "InvalidCaller";
+}
+
+TableReaderCaller string_to_caller(std::string caller_str) {
+  if (caller_str == "Get") {
+    return kUserGet;
+  } else if (caller_str == "MultiGet") {
+    return kUserMultiGet;
+  } else if (caller_str == "Iterator") {
+    return kUserIterator;
+  } else if (caller_str == "ApproximateSize") {
+    return kUserApproximateSize;
+  } else if (caller_str == "VerifyChecksum") {
+    return kUserVerifyChecksum;
+  } else if (caller_str == "SSTDumpTool") {
+    return kSSTDumpTool;
+  } else if (caller_str == "ExternalSSTIngestion") {
+    return kExternalSSTIngestion;
+  } else if (caller_str == "Repair") {
+    return kRepair;
+  } else if (caller_str == "Prefetch") {
+    return kPrefetch;
+  } else if (caller_str == "Compaction") {
+    return kCompaction;
+  } else if (caller_str == "CompactionRefill") {
+    return kCompactionRefill;
+  } else if (caller_str == "Flush") {
+    return kFlush;
+  } else if (caller_str == "SSTFileReader") {
+    return kSSTFileReader;
+  } else if (caller_str == "Uncategorized") {
+    return kUncategorized;
+  }
+  return TableReaderCaller::kMaxBlockCacheLookupCaller;
+}
+
+bool is_user_access(TableReaderCaller caller) {
+  switch (caller) {
+    case kUserGet:
+    case kUserMultiGet:
+    case kUserIterator:
+    case kUserApproximateSize:
+    case kUserVerifyChecksum:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+const char kBreakLine[] =
+    "***************************************************************\n";
+
+void print_break_lines(uint32_t num_break_lines) {
+  for (uint32_t i = 0; i < num_break_lines; i++) {
+    fprintf(stdout, kBreakLine);
+  }
+}
+
+double percent(uint64_t numerator, uint64_t denomenator) {
+  if (denomenator == 0) {
+    return -1;
+  }
+  return static_cast<double>(numerator * 100.0 / denomenator);
+}
+
+std::map<uint64_t, uint64_t> adjust_time_unit(
+    const std::map<uint64_t, uint64_t>& time_stats, uint64_t time_unit) {
+  if (time_unit == 1) {
+    return time_stats;
+  }
+  std::map<uint64_t, uint64_t> adjusted_time_stats;
+  for (auto const& time : time_stats) {
+    adjusted_time_stats[static_cast<uint64_t>(time.first / time_unit)] +=
+        time.second;
+  }
+  return adjusted_time_stats;
+}
+}  // namespace
+
+void BlockCacheTraceAnalyzer::WriteMissRatioCurves() const {
+  if (!cache_simulator_) {
+    return;
+  }
+  if (output_dir_.empty()) {
+    return;
+  }
+  uint64_t trace_duration =
+      trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_;
+  uint64_t total_accesses = access_sequence_number_;
+  const std::string output_miss_ratio_curve_path =
+      output_dir_ + "/" + std::to_string(trace_duration) + "_" +
+      std::to_string(total_accesses) + "_" + kMissRatioCurveFileName;
+  std::ofstream out(output_miss_ratio_curve_path);
+  if (!out.is_open()) {
+    return;
+  }
+  // Write header.
+  const std::string header =
+      "cache_name,num_shard_bits,ghost_capacity,capacity,miss_ratio,total_"
+      "accesses";
+  out << header << std::endl;
+  for (auto const& config_caches : cache_simulator_->sim_caches()) {
+    const CacheConfiguration& config = config_caches.first;
+    for (uint32_t i = 0; i < config.cache_capacities.size(); i++) {
+      double miss_ratio =
+          config_caches.second[i]->miss_ratio_stats().miss_ratio();
+      // Write the body.
+      out << config.cache_name;
+      out << ",";
+      out << config.num_shard_bits;
+      out << ",";
+      out << config.ghost_cache_capacity;
+      out << ",";
+      out << config.cache_capacities[i];
+      out << ",";
+      out << std::fixed << std::setprecision(4) << miss_ratio;
+      out << ",";
+      out << config_caches.second[i]->miss_ratio_stats().total_accesses();
+      out << std::endl;
+    }
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::UpdateFeatureVectors(
+    const std::vector<uint64_t>& access_sequence_number_timeline,
+    const std::vector<uint64_t>& access_timeline, const std::string& label,
+    std::map<std::string, Features>* label_features,
+    std::map<std::string, Predictions>* label_predictions) const {
+  if (access_sequence_number_timeline.empty() || access_timeline.empty()) {
+    return;
+  }
+  assert(access_timeline.size() == access_sequence_number_timeline.size());
+  uint64_t prev_access_sequence_number = access_sequence_number_timeline[0];
+  uint64_t prev_access_timestamp = access_timeline[0];
+  for (uint32_t i = 0; i < access_sequence_number_timeline.size(); i++) {
+    uint64_t num_accesses_since_last_access =
+        access_sequence_number_timeline[i] - prev_access_sequence_number;
+    uint64_t elapsed_time_since_last_access =
+        access_timeline[i] - prev_access_timestamp;
+    prev_access_sequence_number = access_sequence_number_timeline[i];
+    prev_access_timestamp = access_timeline[i];
+    if (i < access_sequence_number_timeline.size() - 1) {
+      (*label_features)[label].num_accesses_since_last_access.push_back(
+          num_accesses_since_last_access);
+      (*label_features)[label].num_past_accesses.push_back(i);
+      (*label_features)[label].elapsed_time_since_last_access.push_back(
+          elapsed_time_since_last_access);
+    }
+    if (i >= 1) {
+      (*label_predictions)[label].num_accesses_till_next_access.push_back(
+          num_accesses_since_last_access);
+      (*label_predictions)[label].elapsed_time_till_next_access.push_back(
+          elapsed_time_since_last_access);
+    }
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteMissRatioTimeline(uint64_t time_unit) const {
+  if (!cache_simulator_ || output_dir_.empty()) {
+    return;
+  }
+  std::map<uint64_t, std::map<std::string, std::map<uint64_t, double>>>
+      cs_name_timeline;
+  uint64_t start_time = std::numeric_limits<uint64_t>::max();
+  uint64_t end_time = 0;
+  const std::map<uint64_t, uint64_t>& trace_num_misses =
+      adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit);
+  const std::map<uint64_t, uint64_t>& trace_num_accesses =
+      adjust_time_unit(miss_ratio_stats_.num_accesses_timeline(), time_unit);
+  assert(trace_num_misses.size() == trace_num_accesses.size());
+  for (auto const& num_miss : trace_num_misses) {
+    uint64_t time = num_miss.first;
+    start_time = std::min(start_time, time);
+    end_time = std::max(end_time, time);
+    uint64_t miss = num_miss.second;
+    auto it = trace_num_accesses.find(time);
+    assert(it != trace_num_accesses.end());
+    uint64_t access = it->second;
+    cs_name_timeline[std::numeric_limits<uint64_t>::max()]["trace"][time] =
+        percent(miss, access);
+  }
+  for (auto const& config_caches : cache_simulator_->sim_caches()) {
+    const CacheConfiguration& config = config_caches.first;
+    std::string cache_label = config.cache_name + "-" +
+                              std::to_string(config.num_shard_bits) + "-" +
+                              std::to_string(config.ghost_cache_capacity);
+    for (uint32_t i = 0; i < config.cache_capacities.size(); i++) {
+      const std::map<uint64_t, uint64_t>& num_misses = adjust_time_unit(
+          config_caches.second[i]->miss_ratio_stats().num_misses_timeline(),
+          time_unit);
+      const std::map<uint64_t, uint64_t>& num_accesses = adjust_time_unit(
+          config_caches.second[i]->miss_ratio_stats().num_accesses_timeline(),
+          time_unit);
+      assert(num_misses.size() == num_accesses.size());
+      for (auto const& num_miss : num_misses) {
+        uint64_t time = num_miss.first;
+        start_time = std::min(start_time, time);
+        end_time = std::max(end_time, time);
+        uint64_t miss = num_miss.second;
+        auto it = num_accesses.find(time);
+        assert(it != num_accesses.end());
+        uint64_t access = it->second;
+        cs_name_timeline[config.cache_capacities[i]][cache_label][time] =
+            percent(miss, access);
+      }
+    }
+  }
+  for (auto const& it : cs_name_timeline) {
+    const std::string output_miss_ratio_timeline_path =
+        output_dir_ + "/" + std::to_string(it.first) + "_" +
+        std::to_string(time_unit) + "_" + kFileNameSuffixMissRatioTimeline;
+    std::ofstream out(output_miss_ratio_timeline_path);
+    if (!out.is_open()) {
+      return;
+    }
+    std::string header("time");
+    for (uint64_t now = start_time; now <= end_time; now++) {
+      header += ",";
+      header += std::to_string(now);
+    }
+    out << header << std::endl;
+    for (auto const& label : it.second) {
+      std::string row(label.first);
+      for (uint64_t now = start_time; now <= end_time; now++) {
+        auto misses = label.second.find(now);
+        row += ",";
+        if (misses != label.second.end()) {
+          row += std::to_string(misses->second);
+        } else {
+          row += "0";
+        }
+      }
+      out << row << std::endl;
+    }
+    out.close();
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteMissTimeline(uint64_t time_unit) const {
+  if (!cache_simulator_ || output_dir_.empty()) {
+    return;
+  }
+  std::map<uint64_t, std::map<std::string, std::map<uint64_t, uint64_t>>>
+      cs_name_timeline;
+  uint64_t start_time = std::numeric_limits<uint64_t>::max();
+  uint64_t end_time = 0;
+  const std::map<uint64_t, uint64_t>& trace_num_misses =
+      adjust_time_unit(miss_ratio_stats_.num_misses_timeline(), time_unit);
+  for (auto const& num_miss : trace_num_misses) {
+    uint64_t time = num_miss.first;
+    start_time = std::min(start_time, time);
+    end_time = std::max(end_time, time);
+    uint64_t miss = num_miss.second;
+    cs_name_timeline[std::numeric_limits<uint64_t>::max()]["trace"][time] =
+        miss;
+  }
+  for (auto const& config_caches : cache_simulator_->sim_caches()) {
+    const CacheConfiguration& config = config_caches.first;
+    std::string cache_label = config.cache_name + "-" +
+                              std::to_string(config.num_shard_bits) + "-" +
+                              std::to_string(config.ghost_cache_capacity);
+    for (uint32_t i = 0; i < config.cache_capacities.size(); i++) {
+      const std::map<uint64_t, uint64_t>& num_misses = adjust_time_unit(
+          config_caches.second[i]->miss_ratio_stats().num_misses_timeline(),
+          time_unit);
+      for (auto const& num_miss : num_misses) {
+        uint64_t time = num_miss.first;
+        start_time = std::min(start_time, time);
+        end_time = std::max(end_time, time);
+        uint64_t miss = num_miss.second;
+        cs_name_timeline[config.cache_capacities[i]][cache_label][time] = miss;
+      }
+    }
+  }
+  for (auto const& it : cs_name_timeline) {
+    const std::string output_miss_ratio_timeline_path =
+        output_dir_ + "/" + std::to_string(it.first) + "_" +
+        std::to_string(time_unit) + "_" + kFileNameSuffixMissTimeline;
+    std::ofstream out(output_miss_ratio_timeline_path);
+    if (!out.is_open()) {
+      return;
+    }
+    std::string header("time");
+    for (uint64_t now = start_time; now <= end_time; now++) {
+      header += ",";
+      header += std::to_string(now);
+    }
+    out << header << std::endl;
+    for (auto const& label : it.second) {
+      std::string row(label.first);
+      for (uint64_t now = start_time; now <= end_time; now++) {
+        auto misses = label.second.find(now);
+        row += ",";
+        if (misses != label.second.end()) {
+          row += std::to_string(misses->second);
+        } else {
+          row += "0";
+        }
+      }
+      out << row << std::endl;
+    }
+    out.close();
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteSkewness(
+    const std::string& label_str, const std::vector<uint64_t>& percent_buckets,
+    TraceType target_block_type) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, uint64_t> label_naccesses;
+  uint64_t total_naccesses = 0;
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    if (target_block_type != TraceType::kTraceMax &&
+        target_block_type != type) {
+      return;
+    }
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+    label_naccesses[label] += block.num_accesses;
+    total_naccesses += block.num_accesses;
+  };
+  TraverseBlocks(block_callback, &labels);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_bucket_naccesses;
+  std::vector<std::pair<std::string, uint64_t>> pairs;
+  for (auto const& itr : label_naccesses) {
+    pairs.push_back(itr);
+  }
+  // Sort in descending order.
+  sort(pairs.begin(), pairs.end(),
+       [](const std::pair<std::string, uint64_t>& a,
+          const std::pair<std::string, uint64_t>& b) {
+         return b.second < a.second;
+       });
+
+  size_t prev_start_index = 0;
+  for (auto const& percent : percent_buckets) {
+    label_bucket_naccesses[label_str][percent] = 0;
+    size_t end_index = 0;
+    if (percent == std::numeric_limits<uint64_t>::max()) {
+      end_index = label_naccesses.size();
+    } else {
+      end_index = percent * label_naccesses.size() / 100;
+    }
+    for (size_t i = prev_start_index; i < end_index; i++) {
+      label_bucket_naccesses[label_str][percent] += pairs[i].second;
+    }
+    prev_start_index = end_index;
+  }
+  std::string filename_suffix;
+  if (target_block_type != TraceType::kTraceMax) {
+    filename_suffix = block_type_to_string(target_block_type);
+    filename_suffix += "_";
+  }
+  filename_suffix += kFileNameSuffixSkew;
+  WriteStatsToFile(label_str, percent_buckets, filename_suffix,
+                   label_bucket_naccesses, total_naccesses);
+}
+
+void BlockCacheTraceAnalyzer::WriteCorrelationFeatures(
+    const std::string& label_str, uint32_t max_number_of_values) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, Features> label_features;
+  std::map<std::string, Predictions> label_predictions;
+  auto block_callback =
+      [&](const std::string& cf_name, uint64_t fd, uint32_t level,
+          TraceType block_type, const std::string& /*block_key*/,
+          uint64_t /*block_key_id*/, const BlockAccessInfo& block) {
+        if (block.table_id == 0 && labels.find(kGroupbyTable) != labels.end()) {
+          // We only know table id information for get requests.
+          return;
+        }
+        if (labels.find(kGroupbyCaller) != labels.end()) {
+          // Group by caller.
+          for (auto const& caller_map : block.caller_access_timeline) {
+            const std::string label =
+                BuildLabel(labels, cf_name, fd, level, block_type,
+                           caller_map.first, /*block_id=*/0, block);
+            auto it = block.caller_access_sequence__number_timeline.find(
+                caller_map.first);
+            assert(it != block.caller_access_sequence__number_timeline.end());
+            UpdateFeatureVectors(it->second, caller_map.second, label,
+                                 &label_features, &label_predictions);
+          }
+          return;
+        }
+        const std::string label =
+            BuildLabel(labels, cf_name, fd, level, block_type,
+                       TableReaderCaller::kMaxBlockCacheLookupCaller,
+                       /*block_id=*/0, block);
+        UpdateFeatureVectors(block.access_sequence_number_timeline,
+                             block.access_timeline, label, &label_features,
+                             &label_predictions);
+      };
+  TraverseBlocks(block_callback, &labels);
+  WriteCorrelationFeaturesToFile(label_str, label_features, label_predictions,
+                                 max_number_of_values);
+}
+
+void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesToFile(
+    const std::string& label,
+    const std::map<std::string, Features>& label_features,
+    const std::map<std::string, Predictions>& label_predictions,
+    uint32_t max_number_of_values) const {
+  for (auto const& label_feature_vectors : label_features) {
+    const Features& past = label_feature_vectors.second;
+    auto it = label_predictions.find(label_feature_vectors.first);
+    assert(it != label_predictions.end());
+    const Predictions& future = it->second;
+    const std::string output_path = output_dir_ + "/" + label + "_" +
+                                    label_feature_vectors.first + "_" +
+                                    kFileNameSuffixCorrelation;
+    std::ofstream out(output_path);
+    if (!out.is_open()) {
+      return;
+    }
+    std::string header(
+        "num_accesses_since_last_access,elapsed_time_since_last_access,num_"
+        "past_accesses,num_accesses_till_next_access,elapsed_time_till_next_"
+        "access");
+    out << header << std::endl;
+    std::vector<uint32_t> indexes;
+    for (uint32_t i = 0; i < past.num_accesses_since_last_access.size(); i++) {
+      indexes.push_back(i);
+    }
+    RandomShuffle(indexes.begin(), indexes.end());
+    for (uint32_t i = 0; i < max_number_of_values && i < indexes.size(); i++) {
+      uint32_t rand_index = indexes[i];
+      out << std::to_string(past.num_accesses_since_last_access[rand_index])
+          << ",";
+      out << std::to_string(past.elapsed_time_since_last_access[rand_index])
+          << ",";
+      out << std::to_string(past.num_past_accesses[rand_index]) << ",";
+      out << std::to_string(future.num_accesses_till_next_access[rand_index])
+          << ",";
+      out << std::to_string(future.elapsed_time_till_next_access[rand_index])
+          << std::endl;
+    }
+    out.close();
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteCorrelationFeaturesForGet(
+    uint32_t max_number_of_values) const {
+  std::string label = "GetKeyInfo";
+  std::map<std::string, Features> label_features;
+  std::map<std::string, Predictions> label_predictions;
+  for (auto const& get_info : get_key_info_map_) {
+    const GetKeyInfo& info = get_info.second;
+    UpdateFeatureVectors(info.access_sequence_number_timeline,
+                         info.access_timeline, label, &label_features,
+                         &label_predictions);
+  }
+  WriteCorrelationFeaturesToFile(label, label_features, label_predictions,
+                                 max_number_of_values);
+}
+
+std::set<std::string> BlockCacheTraceAnalyzer::ParseLabelStr(
+    const std::string& label_str) const {
+  std::stringstream ss(label_str);
+  std::set<std::string> labels;
+  // label_str is in the form of "label1_label2_label3", e.g., cf_bt.
+  while (ss.good()) {
+    std::string label_name;
+    getline(ss, label_name, '_');
+    if (kGroupbyLabels.find(label_name) == kGroupbyLabels.end()) {
+      // Unknown label name.
+      fprintf(stderr, "Unknown label name %s, label string %s\n",
+              label_name.c_str(), label_str.c_str());
+      return {};
+    }
+    labels.insert(label_name);
+  }
+  return labels;
+}
+
+std::string BlockCacheTraceAnalyzer::BuildLabel(
+    const std::set<std::string>& labels, const std::string& cf_name,
+    uint64_t fd, uint32_t level, TraceType type, TableReaderCaller caller,
+    uint64_t block_key, const BlockAccessInfo& block) const {
+  std::map<std::string, std::string> label_value_map;
+  label_value_map[kGroupbyAll] = kGroupbyAll;
+  label_value_map[kGroupbyLevel] = std::to_string(level);
+  label_value_map[kGroupbyCaller] = caller_to_string(caller);
+  label_value_map[kGroupbySSTFile] = std::to_string(fd);
+  label_value_map[kGroupbyBlockType] = block_type_to_string(type);
+  label_value_map[kGroupbyColumnFamily] = cf_name;
+  label_value_map[kGroupbyBlock] = std::to_string(block_key);
+  label_value_map[kGroupbyTable] = std::to_string(block.table_id);
+  // Concatenate the label values.
+  std::string label;
+  for (auto const& l : labels) {
+    label += label_value_map[l];
+    label += "-";
+  }
+  if (!label.empty()) {
+    label.pop_back();
+  }
+  return label;
+}
+
+void BlockCacheTraceAnalyzer::TraverseBlocks(
+    std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/,
+                       uint32_t /*level*/, TraceType /*block_type*/,
+                       const std::string& /*block_key*/,
+                       uint64_t /*block_key_id*/,
+                       const BlockAccessInfo& /*block_access_info*/)>
+        block_callback,
+    std::set<std::string>* labels) const {
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          if (labels && block_access_info.second.table_id == 0 &&
+              labels->find(kGroupbyTable) != labels->end()) {
+            // We only know table id information for get requests.
+            return;
+          }
+          block_callback(cf_name, fd, level, type, block_access_info.first,
+                         block_access_info.second.block_id,
+                         block_access_info.second);
+        }
+      }
+    }
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteGetSpatialLocality(
+    const std::string& label_str,
+    const std::vector<uint64_t>& percent_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_pnrefkeys_nblocks;
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_pnrefs_nblocks;
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_pndatasize_nblocks;
+  uint64_t nblocks = 0;
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType /*block_type*/,
+                            const std::string& /*block_key*/,
+                            uint64_t /*block_key_id*/,
+                            const BlockAccessInfo& block) {
+    if (block.num_keys == 0) {
+      return;
+    }
+    uint64_t naccesses = 0;
+    for (auto const& key_access : block.key_num_access_map) {
+      for (auto const& caller_access : key_access.second) {
+        if (caller_access.first == TableReaderCaller::kUserGet) {
+          naccesses += caller_access.second;
+        }
+      }
+    }
+    const std::string label =
+        BuildLabel(labels, cf_name, fd, level, TraceType::kBlockTraceDataBlock,
+                   TableReaderCaller::kUserGet, /*block_id=*/0, block);
+
+    const uint64_t percent_referenced_for_existing_keys =
+        static_cast<uint64_t>(std::max(
+            percent(block.key_num_access_map.size(), block.num_keys), 0.0));
+    const uint64_t percent_accesses_for_existing_keys =
+        static_cast<uint64_t>(std::max(
+            percent(block.num_referenced_key_exist_in_block, naccesses), 0.0));
+    const uint64_t percent_referenced_data_size = static_cast<uint64_t>(
+        std::max(percent(block.referenced_data_size, block.block_size), 0.0));
+    if (label_pnrefkeys_nblocks.find(label) == label_pnrefkeys_nblocks.end()) {
+      for (auto const& percent_bucket : percent_buckets) {
+        label_pnrefkeys_nblocks[label][percent_bucket] = 0;
+        label_pnrefs_nblocks[label][percent_bucket] = 0;
+        label_pndatasize_nblocks[label][percent_bucket] = 0;
+      }
+    }
+    label_pnrefkeys_nblocks[label]
+        .upper_bound(percent_referenced_for_existing_keys)
+        ->second += 1;
+    label_pnrefs_nblocks[label]
+        .upper_bound(percent_accesses_for_existing_keys)
+        ->second += 1;
+    label_pndatasize_nblocks[label]
+        .upper_bound(percent_referenced_data_size)
+        ->second += 1;
+    nblocks += 1;
+  };
+  TraverseBlocks(block_callback, &labels);
+  WriteStatsToFile(label_str, percent_buckets, kFileNameSuffixPercentRefKeys,
+                   label_pnrefkeys_nblocks, nblocks);
+  WriteStatsToFile(label_str, percent_buckets,
+                   kFileNameSuffixPercentAccessesOnRefKeys,
+                   label_pnrefs_nblocks, nblocks);
+  WriteStatsToFile(label_str, percent_buckets,
+                   kFileNameSuffixPercentDataSizeOnRefKeys,
+                   label_pndatasize_nblocks, nblocks);
+}
+
+void BlockCacheTraceAnalyzer::WriteAccessTimeline(const std::string& label_str,
+                                                  uint64_t time_unit,
+                                                  bool user_access_only) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  uint64_t start_time = std::numeric_limits<uint64_t>::max();
+  uint64_t end_time = 0;
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_access_timeline;
+  std::map<uint64_t, std::vector<std::string>> access_count_block_id_map;
+
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    uint64_t naccesses = 0;
+    for (auto const& timeline : block.caller_num_accesses_timeline) {
+      const TableReaderCaller caller = timeline.first;
+      if (user_access_only && !is_user_access(caller)) {
+        continue;
+      }
+      const std::string label =
+          BuildLabel(labels, cf_name, fd, level, type, caller, block_id, block);
+      for (auto const& naccess : timeline.second) {
+        const uint64_t timestamp = naccess.first / time_unit;
+        const uint64_t num = naccess.second;
+        label_access_timeline[label][timestamp] += num;
+        start_time = std::min(start_time, timestamp);
+        end_time = std::max(end_time, timestamp);
+        naccesses += num;
+      }
+    }
+    if (naccesses > 0) {
+      access_count_block_id_map[naccesses].push_back(std::to_string(block_id));
+    }
+  };
+  TraverseBlocks(block_callback, &labels);
+
+  // We have label_access_timeline now. Write them into a file.
+  const std::string user_access_prefix =
+      user_access_only ? "user_access_only_" : "all_access_";
+  const std::string output_path = output_dir_ + "/" + user_access_prefix +
+                                  label_str + "_" + std::to_string(time_unit) +
+                                  "_" + kFileNameSuffixAccessTimeline;
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("time");
+  if (labels.find("block") != labels.end()) {
+    for (uint64_t now = start_time; now <= end_time; now++) {
+      header += ",";
+      header += std::to_string(now);
+    }
+    out << header << std::endl;
+    // Write the most frequently accessed blocks first.
+    for (auto naccess_it = access_count_block_id_map.rbegin();
+         naccess_it != access_count_block_id_map.rend(); naccess_it++) {
+      for (auto& block_id_it : naccess_it->second) {
+        std::string row(block_id_it);
+        for (uint64_t now = start_time; now <= end_time; now++) {
+          auto it = label_access_timeline[block_id_it].find(now);
+          row += ",";
+          if (it != label_access_timeline[block_id_it].end()) {
+            row += std::to_string(it->second);
+          } else {
+            row += "0";
+          }
+        }
+        out << row << std::endl;
+      }
+    }
+    out.close();
+    return;
+  }
+  for (uint64_t now = start_time; now <= end_time; now++) {
+    header += ",";
+    header += std::to_string(now);
+  }
+  out << header << std::endl;
+  for (auto const& label : label_access_timeline) {
+    std::string row(label.first);
+    for (uint64_t now = start_time; now <= end_time; now++) {
+      auto it = label.second.find(now);
+      row += ",";
+      if (it != label.second.end()) {
+        row += std::to_string(it->second);
+      } else {
+        row += "0";
+      }
+    }
+    out << row << std::endl;
+  }
+
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseDistance(
+    const std::string& label_str,
+    const std::vector<uint64_t>& distance_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_distance_num_reuses;
+  uint64_t total_num_reuses = 0;
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+    if (label_distance_num_reuses.find(label) ==
+        label_distance_num_reuses.end()) {
+      // The first time we encounter this label.
+      for (auto const& distance_bucket : distance_buckets) {
+        label_distance_num_reuses[label][distance_bucket] = 0;
+      }
+    }
+    for (auto const& reuse_distance : block.reuse_distance_count) {
+      label_distance_num_reuses[label]
+          .upper_bound(reuse_distance.first)
+          ->second += reuse_distance.second;
+      total_num_reuses += reuse_distance.second;
+    }
+  };
+  TraverseBlocks(block_callback, &labels);
+  // We have label_naccesses and label_distance_num_reuses now. Write them into
+  // a file.
+  const std::string output_path =
+      output_dir_ + "/" + label_str + "_reuse_distance";
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("bucket");
+  for (auto const& label_it : label_distance_num_reuses) {
+    header += ",";
+    header += label_it.first;
+  }
+  out << header << std::endl;
+  for (auto const& bucket : distance_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_distance_num_reuses) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(percent(it->second, total_num_reuses));
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::UpdateReuseIntervalStats(
+    const std::string& label, const std::vector<uint64_t>& time_buckets,
+    const std::map<uint64_t, uint64_t> timeline,
+    std::map<std::string, std::map<uint64_t, uint64_t>>* label_time_num_reuses,
+    uint64_t* total_num_reuses) const {
+  assert(label_time_num_reuses);
+  assert(total_num_reuses);
+  if (label_time_num_reuses->find(label) == label_time_num_reuses->end()) {
+    // The first time we encounter this label.
+    for (auto const& time_bucket : time_buckets) {
+      (*label_time_num_reuses)[label][time_bucket] = 0;
+    }
+  }
+  auto it = timeline.begin();
+  uint64_t prev_timestamp = it->first;
+  const uint64_t prev_num = it->second;
+  it++;
+  // Reused within one second.
+  if (prev_num > 1) {
+    (*label_time_num_reuses)[label].upper_bound(0)->second += prev_num - 1;
+    *total_num_reuses += prev_num - 1;
+  }
+  while (it != timeline.end()) {
+    const uint64_t timestamp = it->first;
+    const uint64_t num = it->second;
+    const uint64_t reuse_interval = timestamp - prev_timestamp;
+    (*label_time_num_reuses)[label].upper_bound(reuse_interval)->second += 1;
+    if (num > 1) {
+      (*label_time_num_reuses)[label].upper_bound(0)->second += num - 1;
+    }
+    prev_timestamp = timestamp;
+    *total_num_reuses += num;
+    it++;
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteStatsToFile(
+    const std::string& label_str, const std::vector<uint64_t>& time_buckets,
+    const std::string& filename_suffix,
+    const std::map<std::string, std::map<uint64_t, uint64_t>>& label_data,
+    uint64_t ntotal) const {
+  const std::string output_path =
+      output_dir_ + "/" + label_str + "_" + filename_suffix;
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("bucket");
+  for (auto const& label_it : label_data) {
+    header += ",";
+    header += label_it.first;
+  }
+  out << header << std::endl;
+  for (auto const& bucket : time_buckets) {
+    std::string row(std::to_string(bucket));
+    for (auto const& label_it : label_data) {
+      auto const& it = label_it.second.find(bucket);
+      assert(it != label_it.second.end());
+      row += ",";
+      row += std::to_string(percent(it->second, ntotal));
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseInterval(
+    const std::string& label_str,
+    const std::vector<uint64_t>& time_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_time_num_reuses;
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_avg_reuse_nblocks;
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_avg_reuse_naccesses;
+
+  uint64_t total_num_reuses = 0;
+  uint64_t total_nblocks = 0;
+  uint64_t total_accesses = 0;
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    total_nblocks++;
+    total_accesses += block.num_accesses;
+    uint64_t avg_reuse_interval = 0;
+    if (block.num_accesses > 1) {
+      avg_reuse_interval = ((block.last_access_time - block.first_access_time) /
+                            kMicrosInSecond) /
+                           block.num_accesses;
+    } else {
+      avg_reuse_interval = std::numeric_limits<uint64_t>::max() - 1;
+    }
+    if (labels.find(kGroupbyCaller) != labels.end()) {
+      for (auto const& timeline : block.caller_num_accesses_timeline) {
+        const TableReaderCaller caller = timeline.first;
+        const std::string label = BuildLabel(labels, cf_name, fd, level, type,
+                                             caller, block_id, block);
+        UpdateReuseIntervalStats(label, time_buckets, timeline.second,
+                                 &label_time_num_reuses, &total_num_reuses);
+      }
+      return;
+    }
+    // Does not group by caller so we need to flatten the access timeline.
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+    std::map<uint64_t, uint64_t> timeline;
+    for (auto const& caller_timeline : block.caller_num_accesses_timeline) {
+      for (auto const& time_naccess : caller_timeline.second) {
+        timeline[time_naccess.first] += time_naccess.second;
+      }
+    }
+    UpdateReuseIntervalStats(label, time_buckets, timeline,
+                             &label_time_num_reuses, &total_num_reuses);
+    if (label_avg_reuse_nblocks.find(label) == label_avg_reuse_nblocks.end()) {
+      for (auto const& time_bucket : time_buckets) {
+        label_avg_reuse_nblocks[label][time_bucket] = 0;
+        label_avg_reuse_naccesses[label][time_bucket] = 0;
+      }
+    }
+    label_avg_reuse_nblocks[label].upper_bound(avg_reuse_interval)->second += 1;
+    label_avg_reuse_naccesses[label].upper_bound(avg_reuse_interval)->second +=
+        block.num_accesses;
+  };
+  TraverseBlocks(block_callback, &labels);
+
+  // Write the stats into files.
+  WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseInterval,
+                   label_time_num_reuses, total_num_reuses);
+  WriteStatsToFile(label_str, time_buckets, kFileNameSuffixAvgReuseInterval,
+                   label_avg_reuse_nblocks, total_nblocks);
+  WriteStatsToFile(label_str, time_buckets,
+                   kFileNameSuffixAvgReuseIntervalNaccesses,
+                   label_avg_reuse_naccesses, total_accesses);
+}
+
+void BlockCacheTraceAnalyzer::WriteReuseLifetime(
+    const std::string& label_str,
+    const std::vector<uint64_t>& time_buckets) const {
+  std::set<std::string> labels = ParseLabelStr(label_str);
+  std::map<std::string, std::map<uint64_t, uint64_t>> label_lifetime_nblocks;
+  uint64_t total_nblocks = 0;
+  auto block_callback = [&](const std::string& cf_name, uint64_t fd,
+                            uint32_t level, TraceType type,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    uint64_t lifetime = 0;
+    if (block.num_accesses > 1) {
+      lifetime =
+          (block.last_access_time - block.first_access_time) / kMicrosInSecond;
+    } else {
+      lifetime = std::numeric_limits<uint64_t>::max() - 1;
+    }
+    const std::string label = BuildLabel(
+        labels, cf_name, fd, level, type,
+        TableReaderCaller::kMaxBlockCacheLookupCaller, block_id, block);
+
+    if (label_lifetime_nblocks.find(label) == label_lifetime_nblocks.end()) {
+      // The first time we encounter this label.
+      for (auto const& time_bucket : time_buckets) {
+        label_lifetime_nblocks[label][time_bucket] = 0;
+      }
+    }
+    label_lifetime_nblocks[label].upper_bound(lifetime)->second += 1;
+    total_nblocks += 1;
+  };
+  TraverseBlocks(block_callback, &labels);
+  WriteStatsToFile(label_str, time_buckets, kFileNameSuffixReuseLifetime,
+                   label_lifetime_nblocks, total_nblocks);
+}
+
+void BlockCacheTraceAnalyzer::WriteBlockReuseTimeline(
+    const uint64_t reuse_window, bool user_access_only,
+    TraceType block_type) const {
+  // A map from block key to an array of bools that states whether a block is
+  // accessed in a time window.
+  std::map<uint64_t, std::vector<bool>> block_accessed;
+  const uint64_t trace_duration =
+      trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_;
+  const uint64_t reuse_vector_size = (trace_duration / reuse_window);
+  if (reuse_vector_size < 2) {
+    // The reuse window is less than 2. We cannot calculate the reused
+    // percentage of blocks.
+    return;
+  }
+  auto block_callback = [&](const std::string& /*cf_name*/, uint64_t /*fd*/,
+                            uint32_t /*level*/, TraceType /*type*/,
+                            const std::string& /*block_key*/, uint64_t block_id,
+                            const BlockAccessInfo& block) {
+    if (block_accessed.find(block_id) == block_accessed.end()) {
+      block_accessed[block_id].resize(reuse_vector_size);
+      for (uint64_t i = 0; i < reuse_vector_size; i++) {
+        block_accessed[block_id][i] = false;
+      }
+    }
+    for (auto const& caller_num : block.caller_num_accesses_timeline) {
+      const TableReaderCaller caller = caller_num.first;
+      for (auto const& timeline : caller_num.second) {
+        const uint64_t timestamp = timeline.first;
+        const uint64_t elapsed_time =
+            timestamp - trace_start_timestamp_in_seconds_;
+        if (!user_access_only || is_user_access(caller)) {
+          uint64_t index =
+              std::min(elapsed_time / reuse_window, reuse_vector_size - 1);
+          block_accessed[block_id][index] = true;
+        }
+      }
+    }
+  };
+  TraverseBlocks(block_callback);
+
+  // A cell is the number of blocks accessed in a reuse window.
+  std::unique_ptr<uint64_t[]> reuse_table(
+      new uint64_t[reuse_vector_size * reuse_vector_size]);
+  for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) {
+    // Initialize the reuse_table.
+    for (uint64_t i = 0; i < reuse_vector_size; i++) {
+      reuse_table[start_time * reuse_vector_size + i] = 0;
+    }
+    // Examine all blocks.
+    for (auto const& block : block_accessed) {
+      for (uint64_t i = start_time; i < reuse_vector_size; i++) {
+        if (block.second[start_time] && block.second[i]) {
+          // This block is accessed at start time and at the current time. We
+          // increment reuse_table[start_time][i] since it is reused at the ith
+          // window.
+          reuse_table[start_time * reuse_vector_size + i]++;
+        }
+      }
+    }
+  }
+  const std::string user_access_prefix =
+      user_access_only ? "_user_access_only_" : "_all_access_";
+  const std::string output_path =
+      output_dir_ + "/" + block_type_to_string(block_type) +
+      user_access_prefix + std::to_string(reuse_window) + "_" +
+      kFileNameSuffixAccessReuseBlocksTimeline;
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("start_time");
+  for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) {
+    header += ",";
+    header += std::to_string(start_time);
+  }
+  out << header << std::endl;
+  for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) {
+    std::string row(std::to_string(start_time * reuse_window));
+    for (uint64_t j = 0; j < reuse_vector_size; j++) {
+      row += ",";
+      if (j < start_time) {
+        row += "100.0";
+      } else {
+        row += std::to_string(
+            percent(reuse_table[start_time * reuse_vector_size + j],
+                    reuse_table[start_time * reuse_vector_size + start_time]));
+      }
+    }
+    out << row << std::endl;
+  }
+  out.close();
+}
+
+std::string BlockCacheTraceAnalyzer::OutputPercentAccessStats(
+    uint64_t total_accesses,
+    const std::map<std::string, uint64_t>& cf_access_count) const {
+  std::string row;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    const std::string& cf_name = cf_aggregates.first;
+    const auto& naccess = cf_access_count.find(cf_name);
+    row += ",";
+    if (naccess != cf_access_count.end()) {
+      row += std::to_string(percent(naccess->second, total_accesses));
+    } else {
+      row += "0";
+    }
+  }
+  return row;
+}
+
+void BlockCacheTraceAnalyzer::WritePercentAccessSummaryStats() const {
+  std::map<TableReaderCaller, std::map<std::string, uint64_t>>
+      caller_cf_accesses;
+  uint64_t total_accesses = 0;
+  auto block_callback =
+      [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/,
+          TraceType /*type*/, const std::string& /*block_key*/,
+          uint64_t /*block_id*/, const BlockAccessInfo& block) {
+        for (auto const& caller_num : block.caller_num_access_map) {
+          const TableReaderCaller caller = caller_num.first;
+          const uint64_t naccess = caller_num.second;
+          caller_cf_accesses[caller][cf_name] += naccess;
+          total_accesses += naccess;
+        }
+      };
+  TraverseBlocks(block_callback);
+
+  const std::string output_path =
+      output_dir_ + "/" + kFileNameSuffixPercentOfAccessSummary;
+  std::ofstream out(output_path);
+  if (!out.is_open()) {
+    return;
+  }
+  std::string header("caller");
+  for (auto const& cf_name : cf_aggregates_map_) {
+    header += ",";
+    header += cf_name.first;
+  }
+  out << header << std::endl;
+  for (auto const& cf_naccess_it : caller_cf_accesses) {
+    const TableReaderCaller caller = cf_naccess_it.first;
+    std::string row;
+    row += caller_to_string(caller);
+    row += OutputPercentAccessStats(total_accesses, cf_naccess_it.second);
+    out << row << std::endl;
+  }
+  out.close();
+}
+
+void BlockCacheTraceAnalyzer::WriteDetailedPercentAccessSummaryStats(
+    TableReaderCaller analyzing_caller) const {
+  std::map<uint32_t, std::map<std::string, uint64_t>> level_cf_accesses;
+  std::map<TraceType, std::map<std::string, uint64_t>> bt_cf_accesses;
+  uint64_t total_accesses = 0;
+  auto block_callback =
+      [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t level,
+          TraceType type, const std::string& /*block_key*/,
+          uint64_t /*block_id*/, const BlockAccessInfo& block) {
+        for (auto const& caller_num : block.caller_num_access_map) {
+          const TableReaderCaller caller = caller_num.first;
+          if (caller == analyzing_caller) {
+            const uint64_t naccess = caller_num.second;
+            level_cf_accesses[level][cf_name] += naccess;
+            bt_cf_accesses[type][cf_name] += naccess;
+            total_accesses += naccess;
+          }
+        }
+      };
+  TraverseBlocks(block_callback);
+  {
+    const std::string output_path =
+        output_dir_ + "/" + caller_to_string(analyzing_caller) + "_level_" +
+        kFileNameSuffixPercentOfAccessSummary;
+    std::ofstream out(output_path);
+    if (!out.is_open()) {
+      return;
+    }
+    std::string header("level");
+    for (auto const& cf_name : cf_aggregates_map_) {
+      header += ",";
+      header += cf_name.first;
+    }
+    out << header << std::endl;
+    for (auto const& level_naccess_it : level_cf_accesses) {
+      const uint32_t level = level_naccess_it.first;
+      std::string row;
+      row += std::to_string(level);
+      row += OutputPercentAccessStats(total_accesses, level_naccess_it.second);
+      out << row << std::endl;
+    }
+    out.close();
+  }
+  {
+    const std::string output_path =
+        output_dir_ + "/" + caller_to_string(analyzing_caller) + "_bt_" +
+        kFileNameSuffixPercentOfAccessSummary;
+    std::ofstream out(output_path);
+    if (!out.is_open()) {
+      return;
+    }
+    std::string header("bt");
+    for (auto const& cf_name : cf_aggregates_map_) {
+      header += ",";
+      header += cf_name.first;
+    }
+    out << header << std::endl;
+    for (auto const& bt_naccess_it : bt_cf_accesses) {
+      const TraceType bt = bt_naccess_it.first;
+      std::string row;
+      row += block_type_to_string(bt);
+      row += OutputPercentAccessStats(total_accesses, bt_naccess_it.second);
+      out << row << std::endl;
+    }
+    out.close();
+  }
+}
+
+void BlockCacheTraceAnalyzer::WriteAccessCountSummaryStats(
+    const std::vector<uint64_t>& access_count_buckets,
+    bool user_access_only) const {
+  // x: buckets.
+  // y: # of accesses.
+  std::map<std::string, std::map<uint64_t, uint64_t>> bt_access_nblocks;
+  std::map<std::string, std::map<uint64_t, uint64_t>> cf_access_nblocks;
+  uint64_t total_nblocks = 0;
+  auto block_callback =
+      [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/,
+          TraceType type, const std::string& /*block_key*/,
+          uint64_t /*block_id*/, const BlockAccessInfo& block) {
+        const std::string type_str = block_type_to_string(type);
+        if (cf_access_nblocks.find(cf_name) == cf_access_nblocks.end()) {
+          // initialize.
+          for (auto& access : access_count_buckets) {
+            cf_access_nblocks[cf_name][access] = 0;
+          }
+        }
+        if (bt_access_nblocks.find(type_str) == bt_access_nblocks.end()) {
+          // initialize.
+          for (auto& access : access_count_buckets) {
+            bt_access_nblocks[type_str][access] = 0;
+          }
+        }
+        uint64_t naccesses = 0;
+        for (auto const& caller_access : block.caller_num_access_map) {
+          if (!user_access_only || is_user_access(caller_access.first)) {
+            naccesses += caller_access.second;
+          }
+        }
+        if (naccesses == 0) {
+          return;
+        }
+        total_nblocks += 1;
+        bt_access_nblocks[type_str].upper_bound(naccesses)->second += 1;
+        cf_access_nblocks[cf_name].upper_bound(naccesses)->second += 1;
+      };
+  TraverseBlocks(block_callback);
+  const std::string user_access_prefix =
+      user_access_only ? "user_access_only_" : "all_access_";
+  WriteStatsToFile("cf", access_count_buckets,
+                   user_access_prefix + kFileNameSuffixAccessCountSummary,
+                   cf_access_nblocks, total_nblocks);
+  WriteStatsToFile("bt", access_count_buckets,
+                   user_access_prefix + kFileNameSuffixAccessCountSummary,
+                   bt_access_nblocks, total_nblocks);
+}
+
+BlockCacheTraceAnalyzer::BlockCacheTraceAnalyzer(
+    const std::string& trace_file_path, const std::string& output_dir,
+    const std::string& human_readable_trace_file_path,
+    bool compute_reuse_distance, bool mrc_only,
+    bool is_human_readable_trace_file,
+    std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator)
+    : env_(ROCKSDB_NAMESPACE::Env::Default()),
+      trace_file_path_(trace_file_path),
+      output_dir_(output_dir),
+      human_readable_trace_file_path_(human_readable_trace_file_path),
+      compute_reuse_distance_(compute_reuse_distance),
+      mrc_only_(mrc_only),
+      is_human_readable_trace_file_(is_human_readable_trace_file),
+      cache_simulator_(std::move(cache_simulator)) {}
+
+void BlockCacheTraceAnalyzer::ComputeReuseDistance(
+    BlockAccessInfo* info) const {
+  assert(info);
+  if (info->num_accesses == 0) {
+    return;
+  }
+  uint64_t reuse_distance = 0;
+  for (auto const& block_key : info->unique_blocks_since_last_access) {
+    auto const& it = block_info_map_.find(block_key);
+    // This block must exist.
+    assert(it != block_info_map_.end());
+    reuse_distance += it->second->block_size;
+  }
+  info->reuse_distance_count[reuse_distance] += 1;
+  // We clear this hash set since this is the second access on this block.
+  info->unique_blocks_since_last_access.clear();
+}
+
+Status BlockCacheTraceAnalyzer::RecordAccess(
+    const BlockCacheTraceRecord& access) {
+  ColumnFamilyAccessInfoAggregate& cf_aggr = cf_aggregates_map_[access.cf_name];
+  SSTFileAccessInfoAggregate& file_aggr =
+      cf_aggr.fd_aggregates_map[access.sst_fd_number];
+  file_aggr.level = access.level;
+  BlockTypeAccessInfoAggregate& block_type_aggr =
+      file_aggr.block_type_aggregates_map[access.block_type];
+  if (block_type_aggr.block_access_info_map.find(access.block_key) ==
+      block_type_aggr.block_access_info_map.end()) {
+    block_type_aggr.block_access_info_map[access.block_key].block_id =
+        unique_block_id_;
+    unique_block_id_++;
+  }
+  BlockAccessInfo& block_access_info =
+      block_type_aggr.block_access_info_map[access.block_key];
+  if (compute_reuse_distance_) {
+    ComputeReuseDistance(&block_access_info);
+  }
+  block_access_info.AddAccess(access, access_sequence_number_);
+  block_info_map_[access.block_key] = &block_access_info;
+  uint64_t get_key_id = 0;
+  if (access.caller == TableReaderCaller::kUserGet &&
+      access.get_id != BlockCacheTraceHelper::kReservedGetId) {
+    std::string user_key = ExtractUserKey(access.referenced_key).ToString();
+    if (get_key_info_map_.find(user_key) == get_key_info_map_.end()) {
+      get_key_info_map_[user_key].key_id = unique_get_key_id_;
+      unique_get_key_id_++;
+    }
+    get_key_id = get_key_info_map_[user_key].key_id;
+    get_key_info_map_[user_key].AddAccess(access, access_sequence_number_);
+  }
+
+  if (compute_reuse_distance_) {
+    // Add this block to all existing blocks.
+    for (auto& cf_aggregates : cf_aggregates_map_) {
+      for (auto& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+        for (auto& block_type_aggregates :
+             file_aggregates.second.block_type_aggregates_map) {
+          for (auto& existing_block :
+               block_type_aggregates.second.block_access_info_map) {
+            existing_block.second.unique_blocks_since_last_access.insert(
+                access.block_key);
+          }
+        }
+      }
+    }
+  }
+  return human_readable_trace_writer_.WriteHumanReadableTraceRecord(
+      access, block_access_info.block_id, get_key_id);
+}
+
+Status BlockCacheTraceAnalyzer::Analyze() {
+  SystemClock* clock = env_->GetSystemClock().get();
+  std::unique_ptr<BlockCacheTraceReader> reader;
+  Status s = Status::OK();
+  if (is_human_readable_trace_file_) {
+    reader.reset(new BlockCacheHumanReadableTraceReader(trace_file_path_));
+  } else {
+    std::unique_ptr<TraceReader> trace_reader;
+    s = NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader);
+    if (!s.ok()) {
+      return s;
+    }
+    reader.reset(new BlockCacheTraceReader(std::move(trace_reader)));
+    s = reader->ReadHeader(&header_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  if (!human_readable_trace_file_path_.empty()) {
+    s = human_readable_trace_writer_.NewWritableFile(
+        human_readable_trace_file_path_, env_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  uint64_t start = clock->NowMicros();
+  uint64_t time_interval = 0;
+  while (s.ok()) {
+    BlockCacheTraceRecord access;
+    s = reader->ReadAccess(&access);
+    if (!s.ok()) {
+      break;
+    }
+    if (!mrc_only_) {
+      s = RecordAccess(access);
+      if (!s.ok()) {
+        break;
+      }
+    }
+    if (trace_start_timestamp_in_seconds_ == 0) {
+      trace_start_timestamp_in_seconds_ =
+          access.access_timestamp / kMicrosInSecond;
+    }
+    trace_end_timestamp_in_seconds_ = access.access_timestamp / kMicrosInSecond;
+    miss_ratio_stats_.UpdateMetrics(access.access_timestamp,
+                                    is_user_access(access.caller),
+                                    !access.is_cache_hit);
+    if (cache_simulator_) {
+      cache_simulator_->Access(access);
+    }
+    access_sequence_number_++;
+    uint64_t now = clock->NowMicros();
+    uint64_t duration = (now - start) / kMicrosInSecond;
+    if (duration > 10 * time_interval) {
+      uint64_t trace_duration =
+          trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_;
+      fprintf(stdout,
+              "Running for %" PRIu64 " seconds: Processed %" PRIu64
+              " records/second. Trace duration %" PRIu64
+              " seconds. Observed miss ratio %.2f\n",
+              duration, duration > 0 ? access_sequence_number_ / duration : 0,
+              trace_duration, miss_ratio_stats_.miss_ratio());
+      time_interval++;
+    }
+  }
+  uint64_t now = clock->NowMicros();
+  uint64_t duration = (now - start) / kMicrosInSecond;
+  uint64_t trace_duration =
+      trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_;
+  fprintf(stdout,
+          "Running for %" PRIu64 " seconds: Processed %" PRIu64
+          " records/second. Trace duration %" PRIu64
+          " seconds. Observed miss ratio %.2f\n",
+          duration, duration > 0 ? access_sequence_number_ / duration : 0,
+          trace_duration, miss_ratio_stats_.miss_ratio());
+  return s;
+}
+
+void BlockCacheTraceAnalyzer::PrintBlockSizeStats() const {
+  HistogramStat bs_stats;
+  std::map<TraceType, HistogramStat> bt_stats_map;
+  std::map<std::string, std::map<TraceType, HistogramStat>> cf_bt_stats_map;
+  auto block_callback =
+      [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/,
+          TraceType type, const std::string& /*block_key*/,
+          uint64_t /*block_id*/, const BlockAccessInfo& block) {
+        if (block.block_size == 0) {
+          // Block size may be 0 when 1) compaction observes a cache miss and
+          // does not insert the missing block into the cache again. 2)
+          // fetching filter blocks in SST files at the last level.
+          return;
+        }
+        bs_stats.Add(block.block_size);
+        bt_stats_map[type].Add(block.block_size);
+        cf_bt_stats_map[cf_name][type].Add(block.block_size);
+      };
+  TraverseBlocks(block_callback);
+  fprintf(stdout, "Block size stats: \n%s", bs_stats.ToString().c_str());
+  for (auto const& bt_stats : bt_stats_map) {
+    print_break_lines(/*num_break_lines=*/1);
+    fprintf(stdout, "Block size stats for block type %s: \n%s",
+            block_type_to_string(bt_stats.first).c_str(),
+            bt_stats.second.ToString().c_str());
+  }
+  for (auto const& cf_bt_stats : cf_bt_stats_map) {
+    const std::string& cf_name = cf_bt_stats.first;
+    for (auto const& bt_stats : cf_bt_stats.second) {
+      print_break_lines(/*num_break_lines=*/1);
+      fprintf(stdout,
+              "Block size stats for column family %s and block type %s: \n%s",
+              cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(),
+              bt_stats.second.ToString().c_str());
+    }
+  }
+}
+
+void BlockCacheTraceAnalyzer::PrintAccessCountStats(bool user_access_only,
+                                                    uint32_t bottom_k,
+                                                    uint32_t top_k) const {
+  HistogramStat access_stats;
+  std::map<TraceType, HistogramStat> bt_stats_map;
+  std::map<std::string, std::map<TraceType, HistogramStat>> cf_bt_stats_map;
+  std::map<uint64_t, std::vector<std::string>> access_count_blocks;
+  auto block_callback = [&](const std::string& cf_name, uint64_t /*fd*/,
+                            uint32_t /*level*/, TraceType type,
+                            const std::string& block_key, uint64_t /*block_id*/,
+                            const BlockAccessInfo& block) {
+    uint64_t naccesses = 0;
+    for (auto const& caller_access : block.caller_num_access_map) {
+      if (!user_access_only || is_user_access(caller_access.first)) {
+        naccesses += caller_access.second;
+      }
+    }
+    if (naccesses == 0) {
+      return;
+    }
+    if (type == TraceType::kBlockTraceDataBlock) {
+      access_count_blocks[naccesses].push_back(block_key);
+    }
+    access_stats.Add(naccesses);
+    bt_stats_map[type].Add(naccesses);
+    cf_bt_stats_map[cf_name][type].Add(naccesses);
+  };
+  TraverseBlocks(block_callback);
+  fprintf(stdout,
+          "Block access count stats: The number of accesses per block. %s\n%s",
+          user_access_only ? "User accesses only" : "All accesses",
+          access_stats.ToString().c_str());
+  uint32_t bottom_k_index = 0;
+  for (auto naccess_it = access_count_blocks.begin();
+       naccess_it != access_count_blocks.end(); naccess_it++) {
+    bottom_k_index++;
+    if (bottom_k_index >= bottom_k) {
+      break;
+    }
+    std::map<TableReaderCaller, uint64_t> caller_naccesses;
+    uint64_t naccesses = 0;
+    for (auto const& block_id : naccess_it->second) {
+      BlockAccessInfo* block = block_info_map_.find(block_id)->second;
+      for (auto const& caller_access : block->caller_num_access_map) {
+        if (!user_access_only || is_user_access(caller_access.first)) {
+          caller_naccesses[caller_access.first] += caller_access.second;
+          naccesses += caller_access.second;
+        }
+      }
+    }
+    std::string statistics("Caller:");
+    for (auto const& caller_naccessess_it : caller_naccesses) {
+      statistics += caller_to_string(caller_naccessess_it.first);
+      statistics += ":";
+      statistics +=
+          std::to_string(percent(caller_naccessess_it.second, naccesses));
+      statistics += ",";
+    }
+    fprintf(stdout,
+            "Bottom %" PRIu32 " access count. Access count=%" PRIu64
+            " nblocks=%" ROCKSDB_PRIszt " %s\n",
+            bottom_k, naccess_it->first, naccess_it->second.size(),
+            statistics.c_str());
+  }
+
+  uint32_t top_k_index = 0;
+  for (auto naccess_it = access_count_blocks.rbegin();
+       naccess_it != access_count_blocks.rend(); naccess_it++) {
+    top_k_index++;
+    if (top_k_index >= top_k) {
+      break;
+    }
+    for (auto const& block_id : naccess_it->second) {
+      BlockAccessInfo* block = block_info_map_.find(block_id)->second;
+      std::string statistics("Caller:");
+      uint64_t naccesses = 0;
+      for (auto const& caller_access : block->caller_num_access_map) {
+        if (!user_access_only || is_user_access(caller_access.first)) {
+          naccesses += caller_access.second;
+        }
+      }
+      assert(naccesses > 0);
+      for (auto const& caller_access : block->caller_num_access_map) {
+        if (!user_access_only || is_user_access(caller_access.first)) {
+          statistics += ",";
+          statistics += caller_to_string(caller_access.first);
+          statistics += ":";
+          statistics +=
+              std::to_string(percent(caller_access.second, naccesses));
+        }
+      }
+      uint64_t ref_keys_accesses = 0;
+      uint64_t ref_keys_does_not_exist_accesses = 0;
+      for (auto const& ref_key_caller_access : block->key_num_access_map) {
+        for (auto const& caller_access : ref_key_caller_access.second) {
+          if (!user_access_only || is_user_access(caller_access.first)) {
+            ref_keys_accesses += caller_access.second;
+          }
+        }
+      }
+      for (auto const& ref_key_caller_access :
+           block->non_exist_key_num_access_map) {
+        for (auto const& caller_access : ref_key_caller_access.second) {
+          if (!user_access_only || is_user_access(caller_access.first)) {
+            ref_keys_does_not_exist_accesses += caller_access.second;
+          }
+        }
+      }
+      statistics += ",nkeys=";
+      statistics += std::to_string(block->num_keys);
+      statistics += ",block_size=";
+      statistics += std::to_string(block->block_size);
+      statistics += ",num_ref_keys=";
+      statistics += std::to_string(block->key_num_access_map.size());
+      statistics += ",percent_access_ref_keys=";
+      statistics += std::to_string(percent(ref_keys_accesses, naccesses));
+      statistics += ",num_ref_keys_does_not_exist=";
+      statistics += std::to_string(block->non_exist_key_num_access_map.size());
+      statistics += ",percent_access_ref_keys_does_not_exist=";
+      statistics +=
+          std::to_string(percent(ref_keys_does_not_exist_accesses, naccesses));
+      statistics += ",ref_data_size=";
+      statistics += std::to_string(block->referenced_data_size);
+      fprintf(stdout,
+              "Top %" PRIu32 " access count blocks access_count=%" PRIu64
+              " %s\n",
+              top_k, naccess_it->first, statistics.c_str());
+    }
+  }
+
+  for (auto const& bt_stats : bt_stats_map) {
+    print_break_lines(/*num_break_lines=*/1);
+    fprintf(stdout, "Break down by block type %s: \n%s",
+            block_type_to_string(bt_stats.first).c_str(),
+            bt_stats.second.ToString().c_str());
+  }
+  for (auto const& cf_bt_stats : cf_bt_stats_map) {
+    const std::string& cf_name = cf_bt_stats.first;
+    for (auto const& bt_stats : cf_bt_stats.second) {
+      print_break_lines(/*num_break_lines=*/1);
+      fprintf(stdout,
+              "Break down by column family %s and block type "
+              "%s: \n%s",
+              cf_name.c_str(), block_type_to_string(bt_stats.first).c_str(),
+              bt_stats.second.ToString().c_str());
+    }
+  }
+}
+
+void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const {
+  HistogramStat existing_keys_stats;
+  std::map<std::string, HistogramStat> cf_existing_keys_stats_map;
+  HistogramStat non_existing_keys_stats;
+  std::map<std::string, HistogramStat> cf_non_existing_keys_stats_map;
+  HistogramStat block_access_stats;
+  std::map<std::string, HistogramStat> cf_block_access_info;
+  HistogramStat percent_referenced_bytes;
+  std::map<std::string, HistogramStat> cf_percent_referenced_bytes;
+  // Total number of accesses in a data block / number of keys in a data block.
+  HistogramStat avg_naccesses_per_key_in_a_data_block;
+  std::map<std::string, HistogramStat> cf_avg_naccesses_per_key_in_a_data_block;
+  // The standard deviation on the number of accesses of a key in a data block.
+  HistogramStat stdev_naccesses_per_key_in_a_data_block;
+  std::map<std::string, HistogramStat>
+      cf_stdev_naccesses_per_key_in_a_data_block;
+  auto block_callback =
+      [&](const std::string& cf_name, uint64_t /*fd*/, uint32_t /*level*/,
+          TraceType /*type*/, const std::string& /*block_key*/,
+          uint64_t /*block_id*/, const BlockAccessInfo& block) {
+        if (block.num_keys == 0) {
+          return;
+        }
+        // Use four decimal points.
+        uint64_t percent_referenced_for_existing_keys =
+            (uint64_t)(((double)block.key_num_access_map.size() /
+                        (double)block.num_keys) *
+                       10000.0);
+        uint64_t percent_referenced_for_non_existing_keys =
+            (uint64_t)(((double)block.non_exist_key_num_access_map.size() /
+                        (double)block.num_keys) *
+                       10000.0);
+        uint64_t percent_accesses_for_existing_keys =
+            (uint64_t)(((double)block.num_referenced_key_exist_in_block /
+                        (double)block.num_accesses) *
+                       10000.0);
+
+        HistogramStat hist_naccess_per_key;
+        for (auto const& key_access : block.key_num_access_map) {
+          for (auto const& caller_access : key_access.second) {
+            hist_naccess_per_key.Add(caller_access.second);
+          }
+        }
+        uint64_t avg_accesses =
+            static_cast<uint64_t>(hist_naccess_per_key.Average());
+        uint64_t stdev_accesses =
+            static_cast<uint64_t>(hist_naccess_per_key.StandardDeviation());
+        avg_naccesses_per_key_in_a_data_block.Add(avg_accesses);
+        cf_avg_naccesses_per_key_in_a_data_block[cf_name].Add(avg_accesses);
+        stdev_naccesses_per_key_in_a_data_block.Add(stdev_accesses);
+        cf_stdev_naccesses_per_key_in_a_data_block[cf_name].Add(stdev_accesses);
+
+        existing_keys_stats.Add(percent_referenced_for_existing_keys);
+        cf_existing_keys_stats_map[cf_name].Add(
+            percent_referenced_for_existing_keys);
+        non_existing_keys_stats.Add(percent_referenced_for_non_existing_keys);
+        cf_non_existing_keys_stats_map[cf_name].Add(
+            percent_referenced_for_non_existing_keys);
+        block_access_stats.Add(percent_accesses_for_existing_keys);
+        cf_block_access_info[cf_name].Add(percent_accesses_for_existing_keys);
+      };
+  TraverseBlocks(block_callback);
+  fprintf(stdout,
+          "Histogram on the number of referenced keys existing in a block over "
+          "the total number of keys in a block: \n%s",
+          existing_keys_stats.ToString().c_str());
+  for (auto const& cf_stats : cf_existing_keys_stats_map) {
+    print_break_lines(/*num_break_lines=*/1);
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+  print_break_lines(/*num_break_lines=*/1);
+  fprintf(
+      stdout,
+      "Histogram on the number of referenced keys DO NOT exist in a block over "
+      "the total number of keys in a block: \n%s",
+      non_existing_keys_stats.ToString().c_str());
+  for (auto const& cf_stats : cf_non_existing_keys_stats_map) {
+    print_break_lines(/*num_break_lines=*/1);
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+  print_break_lines(/*num_break_lines=*/1);
+  fprintf(stdout,
+          "Histogram on the number of accesses on keys exist in a block over "
+          "the total number of accesses in a block: \n%s",
+          block_access_stats.ToString().c_str());
+  for (auto const& cf_stats : cf_block_access_info) {
+    print_break_lines(/*num_break_lines=*/1);
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+  print_break_lines(/*num_break_lines=*/1);
+  fprintf(
+      stdout,
+      "Histogram on the average number of accesses per key in a block: \n%s",
+      avg_naccesses_per_key_in_a_data_block.ToString().c_str());
+  for (auto const& cf_stats : cf_avg_naccesses_per_key_in_a_data_block) {
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+  print_break_lines(/*num_break_lines=*/1);
+  fprintf(stdout,
+          "Histogram on the standard deviation of the number of accesses per "
+          "key in a block: \n%s",
+          stdev_naccesses_per_key_in_a_data_block.ToString().c_str());
+  for (auto const& cf_stats : cf_stdev_naccesses_per_key_in_a_data_block) {
+    fprintf(stdout, "Break down by column family %s: \n%s",
+            cf_stats.first.c_str(), cf_stats.second.ToString().c_str());
+  }
+}
+
+void BlockCacheTraceAnalyzer::PrintStatsSummary() const {
+  uint64_t total_num_files = 0;
+  uint64_t total_num_blocks = 0;
+  uint64_t total_num_accesses = 0;
+  std::map<TraceType, uint64_t> bt_num_blocks_map;
+  std::map<TableReaderCaller, uint64_t> caller_num_access_map;
+  std::map<TableReaderCaller, std::map<TraceType, uint64_t>>
+      caller_bt_num_access_map;
+  std::map<TableReaderCaller, std::map<uint32_t, uint64_t>>
+      caller_level_num_access_map;
+  for (auto const& cf_aggregates : cf_aggregates_map_) {
+    // Stats per column family.
+    const std::string& cf_name = cf_aggregates.first;
+    uint64_t cf_num_files = 0;
+    uint64_t cf_num_blocks = 0;
+    std::map<TraceType, uint64_t> cf_bt_blocks;
+    uint64_t cf_num_accesses = 0;
+    std::map<TableReaderCaller, uint64_t> cf_caller_num_accesses_map;
+    std::map<TableReaderCaller, std::map<uint64_t, uint64_t>>
+        cf_caller_level_num_accesses_map;
+    std::map<TableReaderCaller, std::map<uint64_t, uint64_t>>
+        cf_caller_file_num_accesses_map;
+    std::map<TableReaderCaller, std::map<TraceType, uint64_t>>
+        cf_caller_bt_num_accesses_map;
+    total_num_files += cf_aggregates.second.fd_aggregates_map.size();
+    for (auto const& file_aggregates : cf_aggregates.second.fd_aggregates_map) {
+      // Stats per SST file.
+      const uint64_t fd = file_aggregates.first;
+      const uint32_t level = file_aggregates.second.level;
+      cf_num_files++;
+      for (auto const& block_type_aggregates :
+           file_aggregates.second.block_type_aggregates_map) {
+        // Stats per block type.
+        const TraceType type = block_type_aggregates.first;
+        cf_bt_blocks[type] +=
+            block_type_aggregates.second.block_access_info_map.size();
+        total_num_blocks +=
+            block_type_aggregates.second.block_access_info_map.size();
+        bt_num_blocks_map[type] +=
+            block_type_aggregates.second.block_access_info_map.size();
+        for (auto const& block_access_info :
+             block_type_aggregates.second.block_access_info_map) {
+          // Stats per block.
+          cf_num_blocks++;
+          for (auto const& stats :
+               block_access_info.second.caller_num_access_map) {
+            // Stats per caller.
+            const TableReaderCaller caller = stats.first;
+            const uint64_t num_accesses = stats.second;
+            // Overall stats.
+            total_num_accesses += num_accesses;
+            caller_num_access_map[caller] += num_accesses;
+            caller_bt_num_access_map[caller][type] += num_accesses;
+            caller_level_num_access_map[caller][level] += num_accesses;
+            // Column Family stats.
+            cf_num_accesses += num_accesses;
+            cf_caller_num_accesses_map[caller] += num_accesses;
+            cf_caller_level_num_accesses_map[caller][level] += num_accesses;
+            cf_caller_file_num_accesses_map[caller][fd] += num_accesses;
+            cf_caller_bt_num_accesses_map[caller][type] += num_accesses;
+          }
+        }
+      }
+    }
+
+    // Print stats.
+    print_break_lines(/*num_break_lines=*/3);
+    fprintf(stdout, "Statistics for column family %s:\n", cf_name.c_str());
+    fprintf(stdout,
+            " Number of files:%" PRIu64 " Number of blocks: %" PRIu64
+            " Number of accesses: %" PRIu64 "\n",
+            cf_num_files, cf_num_blocks, cf_num_accesses);
+    for (auto block_type : cf_bt_blocks) {
+      fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n",
+              block_type_to_string(block_type.first).c_str(), block_type.second,
+              percent(block_type.second, cf_num_blocks));
+    }
+    for (auto caller : cf_caller_num_accesses_map) {
+      const uint64_t naccesses = caller.second;
+      print_break_lines(/*num_break_lines=*/1);
+      fprintf(stdout,
+              "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n",
+              caller_to_string(caller.first).c_str(), naccesses,
+              percent(naccesses, cf_num_accesses));
+      fprintf(stdout, "Caller %s: Number of accesses per level break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto naccess_level :
+           cf_caller_level_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t Level %" PRIu64 ": Number of accesses: %" PRIu64
+                " Percent: %.2f\n",
+                naccess_level.first, naccess_level.second,
+                percent(naccess_level.second, naccesses));
+      }
+      fprintf(stdout, "Caller %s: Number of accesses per file break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto naccess_file : cf_caller_file_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t File %" PRIu64 ": Number of accesses: %" PRIu64
+                " Percent: %.2f\n",
+                naccess_file.first, naccess_file.second,
+                percent(naccess_file.second, naccesses));
+      }
+      fprintf(stdout,
+              "Caller %s: Number of accesses per block type break down\n",
+              caller_to_string(caller.first).c_str());
+      for (auto naccess_type : cf_caller_bt_num_accesses_map[caller.first]) {
+        fprintf(stdout,
+                "\t Block Type %s: Number of accesses: %" PRIu64
+                " Percent: %.2f\n",
+                block_type_to_string(naccess_type.first).c_str(),
+                naccess_type.second, percent(naccess_type.second, naccesses));
+      }
+    }
+  }
+  print_break_lines(/*num_break_lines=*/3);
+  fprintf(stdout, "Overall statistics:\n");
+  fprintf(stdout,
+          "Number of files: %" PRIu64 " Number of blocks: %" PRIu64
+          " Number of accesses: %" PRIu64 "\n",
+          total_num_files, total_num_blocks, total_num_accesses);
+  for (auto block_type : bt_num_blocks_map) {
+    fprintf(stdout, "Number of %s blocks: %" PRIu64 " Percent: %.2f\n",
+            block_type_to_string(block_type.first).c_str(), block_type.second,
+            percent(block_type.second, total_num_blocks));
+  }
+  for (auto caller : caller_num_access_map) {
+    print_break_lines(/*num_break_lines=*/1);
+    uint64_t naccesses = caller.second;
+    fprintf(stdout, "Caller %s: Number of accesses %" PRIu64 " Percent: %.2f\n",
+            caller_to_string(caller.first).c_str(), naccesses,
+            percent(naccesses, total_num_accesses));
+    fprintf(stdout, "Caller %s: Number of accesses per level break down\n",
+            caller_to_string(caller.first).c_str());
+    for (auto naccess_level : caller_level_num_access_map[caller.first]) {
+      fprintf(stdout,
+              "\t Level %d: Number of accesses: %" PRIu64 " Percent: %.2f\n",
+              naccess_level.first, naccess_level.second,
+              percent(naccess_level.second, naccesses));
+    }
+    fprintf(stdout, "Caller %s: Number of accesses per block type break down\n",
+            caller_to_string(caller.first).c_str());
+    for (auto naccess_type : caller_bt_num_access_map[caller.first]) {
+      fprintf(stdout,
+              "\t Block Type %s: Number of accesses: %" PRIu64
+              " Percent: %.2f\n",
+              block_type_to_string(naccess_type.first).c_str(),
+              naccess_type.second, percent(naccess_type.second, naccesses));
+    }
+  }
+}
+
+std::vector<CacheConfiguration> parse_cache_config_file(
+    const std::string& config_path) {
+  std::ifstream file(config_path);
+  if (!file.is_open()) {
+    return {};
+  }
+  std::vector<CacheConfiguration> configs;
+  std::string line;
+  while (getline(file, line)) {
+    CacheConfiguration cache_config;
+    std::stringstream ss(line);
+    std::vector<std::string> config_strs;
+    while (ss.good()) {
+      std::string substr;
+      getline(ss, substr, ',');
+      config_strs.push_back(substr);
+    }
+    // Sanity checks.
+    if (config_strs.size() < 4) {
+      fprintf(stderr, "Invalid cache simulator configuration %s\n",
+              line.c_str());
+      exit(1);
+    }
+    if (kSupportedCacheNames.find(" " + config_strs[0] + " ") ==
+        std::string::npos) {
+      fprintf(stderr, "Invalid cache name %s. Supported cache names are %s\n",
+              line.c_str(), kSupportedCacheNames.c_str());
+      exit(1);
+    }
+    cache_config.cache_name = config_strs[0];
+    cache_config.num_shard_bits = ParseUint32(config_strs[1]);
+    cache_config.ghost_cache_capacity = ParseUint64(config_strs[2]);
+    for (uint32_t i = 3; i < config_strs.size(); i++) {
+      uint64_t capacity = ParseUint64(config_strs[i]);
+      if (capacity == 0) {
+        fprintf(stderr, "Invalid cache capacity %s, %s\n",
+                config_strs[i].c_str(), line.c_str());
+        exit(1);
+      }
+      cache_config.cache_capacities.push_back(capacity);
+    }
+    configs.push_back(cache_config);
+  }
+  file.close();
+  return configs;
+}
+
+std::vector<uint64_t> parse_buckets(const std::string& bucket_str) {
+  std::vector<uint64_t> buckets;
+  std::stringstream ss(bucket_str);
+  while (ss.good()) {
+    std::string bucket;
+    getline(ss, bucket, ',');
+    buckets.push_back(ParseUint64(bucket));
+  }
+  buckets.push_back(std::numeric_limits<uint64_t>::max());
+  return buckets;
+}
+
+int block_cache_trace_analyzer_tool(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+  if (FLAGS_block_cache_trace_path.empty()) {
+    fprintf(stderr, "block cache trace path is empty\n");
+    exit(1);
+  }
+  uint64_t warmup_seconds =
+      FLAGS_cache_sim_warmup_seconds > 0 ? FLAGS_cache_sim_warmup_seconds : 0;
+  uint32_t downsample_ratio = FLAGS_block_cache_trace_downsample_ratio > 0
+                                  ? FLAGS_block_cache_trace_downsample_ratio
+                                  : 0;
+  std::vector<CacheConfiguration> cache_configs =
+      parse_cache_config_file(FLAGS_block_cache_sim_config_path);
+  std::unique_ptr<BlockCacheTraceSimulator> cache_simulator;
+  if (!cache_configs.empty()) {
+    cache_simulator.reset(new BlockCacheTraceSimulator(
+        warmup_seconds, downsample_ratio, cache_configs));
+    Status s = cache_simulator->InitializeCaches();
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot initialize cache simulators %s\n",
+              s.ToString().c_str());
+      exit(1);
+    }
+  }
+  BlockCacheTraceAnalyzer analyzer(
+      FLAGS_block_cache_trace_path, FLAGS_block_cache_analysis_result_dir,
+      FLAGS_human_readable_trace_file_path,
+      !FLAGS_reuse_distance_labels.empty(), FLAGS_mrc_only,
+      FLAGS_is_block_cache_human_readable_trace, std::move(cache_simulator));
+  Status s = analyzer.Analyze();
+  if (!s.IsIncomplete() && !s.ok()) {
+    // Read all traces.
+    fprintf(stderr, "Cannot process the trace %s\n", s.ToString().c_str());
+    exit(1);
+  }
+  fprintf(stdout, "Status: %s\n", s.ToString().c_str());
+  analyzer.WriteMissRatioCurves();
+  analyzer.WriteMissRatioTimeline(1);
+  analyzer.WriteMissRatioTimeline(kSecondInMinute);
+  analyzer.WriteMissRatioTimeline(kSecondInHour);
+  analyzer.WriteMissTimeline(1);
+  analyzer.WriteMissTimeline(kSecondInMinute);
+  analyzer.WriteMissTimeline(kSecondInHour);
+
+  if (FLAGS_mrc_only) {
+    fprintf(stdout,
+            "Skipping the analysis statistics since the user wants to compute "
+            "MRC only");
+    return 0;
+  }
+
+  analyzer.PrintStatsSummary();
+  if (FLAGS_print_access_count_stats) {
+    print_break_lines(/*num_break_lines=*/3);
+    analyzer.PrintAccessCountStats(
+        /*user_access_only=*/false, FLAGS_analyze_bottom_k_access_count_blocks,
+        FLAGS_analyze_top_k_access_count_blocks);
+    print_break_lines(/*num_break_lines=*/3);
+    analyzer.PrintAccessCountStats(
+        /*user_access_only=*/true, FLAGS_analyze_bottom_k_access_count_blocks,
+        FLAGS_analyze_top_k_access_count_blocks);
+  }
+  if (FLAGS_print_block_size_stats) {
+    print_break_lines(/*num_break_lines=*/3);
+    analyzer.PrintBlockSizeStats();
+  }
+  if (FLAGS_print_data_block_access_count_stats) {
+    print_break_lines(/*num_break_lines=*/3);
+    analyzer.PrintDataBlockAccessStats();
+  }
+  print_break_lines(/*num_break_lines=*/3);
+
+  if (!FLAGS_timeline_labels.empty()) {
+    std::stringstream ss(FLAGS_timeline_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      if (label.find("block") != std::string::npos) {
+        analyzer.WriteAccessTimeline(label, kSecondInMinute, true);
+        analyzer.WriteAccessTimeline(label, kSecondInMinute, false);
+        analyzer.WriteAccessTimeline(label, kSecondInHour, true);
+        analyzer.WriteAccessTimeline(label, kSecondInHour, false);
+      } else {
+        analyzer.WriteAccessTimeline(label, kSecondInMinute, false);
+        analyzer.WriteAccessTimeline(label, kSecondInHour, false);
+      }
+    }
+  }
+
+  if (!FLAGS_analyze_callers.empty()) {
+    analyzer.WritePercentAccessSummaryStats();
+    std::stringstream ss(FLAGS_analyze_callers);
+    while (ss.good()) {
+      std::string caller;
+      getline(ss, caller, ',');
+      analyzer.WriteDetailedPercentAccessSummaryStats(string_to_caller(caller));
+    }
+  }
+
+  if (!FLAGS_access_count_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_access_count_buckets);
+    analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/true);
+    analyzer.WriteAccessCountSummaryStats(buckets, /*user_access_only=*/false);
+  }
+
+  if (!FLAGS_reuse_distance_labels.empty() &&
+      !FLAGS_reuse_distance_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_reuse_distance_buckets);
+    std::stringstream ss(FLAGS_reuse_distance_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      analyzer.WriteReuseDistance(label, buckets);
+    }
+  }
+
+  if (!FLAGS_reuse_interval_labels.empty() &&
+      !FLAGS_reuse_interval_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_reuse_interval_buckets);
+    std::stringstream ss(FLAGS_reuse_interval_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      analyzer.WriteReuseInterval(label, buckets);
+    }
+  }
+
+  if (!FLAGS_reuse_lifetime_labels.empty() &&
+      !FLAGS_reuse_lifetime_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_reuse_lifetime_buckets);
+    std::stringstream ss(FLAGS_reuse_lifetime_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      analyzer.WriteReuseLifetime(label, buckets);
+    }
+  }
+
+  if (FLAGS_analyze_blocks_reuse_k_reuse_window != 0) {
+    std::vector<TraceType> block_types{TraceType::kBlockTraceIndexBlock,
+                                       TraceType::kBlockTraceDataBlock,
+                                       TraceType::kBlockTraceFilterBlock};
+    for (auto block_type : block_types) {
+      analyzer.WriteBlockReuseTimeline(
+          FLAGS_analyze_blocks_reuse_k_reuse_window,
+          /*user_access_only=*/true, block_type);
+      analyzer.WriteBlockReuseTimeline(
+          FLAGS_analyze_blocks_reuse_k_reuse_window,
+          /*user_access_only=*/false, block_type);
+    }
+  }
+
+  if (!FLAGS_analyze_get_spatial_locality_labels.empty() &&
+      !FLAGS_analyze_get_spatial_locality_buckets.empty()) {
+    std::vector<uint64_t> buckets =
+        parse_buckets(FLAGS_analyze_get_spatial_locality_buckets);
+    std::stringstream ss(FLAGS_analyze_get_spatial_locality_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      analyzer.WriteGetSpatialLocality(label, buckets);
+    }
+  }
+
+  if (!FLAGS_analyze_correlation_coefficients_labels.empty()) {
+    std::stringstream ss(FLAGS_analyze_correlation_coefficients_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      analyzer.WriteCorrelationFeatures(
+          label, FLAGS_analyze_correlation_coefficients_max_number_of_values);
+    }
+    analyzer.WriteCorrelationFeaturesForGet(
+        FLAGS_analyze_correlation_coefficients_max_number_of_values);
+  }
+
+  if (!FLAGS_skew_labels.empty() && !FLAGS_skew_buckets.empty()) {
+    std::vector<uint64_t> buckets = parse_buckets(FLAGS_skew_buckets);
+    std::stringstream ss(FLAGS_skew_labels);
+    while (ss.good()) {
+      std::string label;
+      getline(ss, label, ',');
+      if (label.find("block") != std::string::npos) {
+        analyzer.WriteSkewness(label, buckets,
+                               TraceType::kBlockTraceIndexBlock);
+        analyzer.WriteSkewness(label, buckets,
+                               TraceType::kBlockTraceFilterBlock);
+        analyzer.WriteSkewness(label, buckets, TraceType::kBlockTraceDataBlock);
+        analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax);
+      } else {
+        analyzer.WriteSkewness(label, buckets, TraceType::kTraceMax);
+      }
+    }
+  }
+  return 0;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h
new file mode 100644
index 000000000..2f1ebd139
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h
@@ -0,0 +1,397 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/utilities/sim_cache.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "utilities/simulator_cache/cache_simulator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Statistics of a key refereneced by a Get.
+struct GetKeyInfo {
+  uint64_t key_id = 0;
+  std::vector<uint64_t> access_sequence_number_timeline;
+  std::vector<uint64_t> access_timeline;
+
+  void AddAccess(const BlockCacheTraceRecord& access,
+                 uint64_t access_sequnce_number) {
+    access_sequence_number_timeline.push_back(access_sequnce_number);
+    access_timeline.push_back(access.access_timestamp);
+  }
+};
+
+// Statistics of a block.
+struct BlockAccessInfo {
+  uint64_t block_id = 0;
+  uint64_t table_id = 0;
+  uint64_t block_offset = 0;
+  uint64_t num_accesses = 0;
+  uint64_t block_size = 0;
+  uint64_t first_access_time = 0;
+  uint64_t last_access_time = 0;
+  uint64_t num_keys = 0;
+  std::map<std::string, std::map<TableReaderCaller, uint64_t>>
+      key_num_access_map;  // for keys exist in this block.
+  std::map<std::string, std::map<TableReaderCaller, uint64_t>>
+      non_exist_key_num_access_map;  // for keys do not exist in this block.
+  uint64_t num_referenced_key_exist_in_block = 0;
+  uint64_t referenced_data_size = 0;
+  std::map<TableReaderCaller, uint64_t> caller_num_access_map;
+  // caller:timestamp:number_of_accesses. The granularity of the timestamp is
+  // seconds.
+  std::map<TableReaderCaller, std::map<uint64_t, uint64_t>>
+      caller_num_accesses_timeline;
+  // Unique blocks since the last access.
+  std::set<std::string> unique_blocks_since_last_access;
+  // Number of reuses grouped by reuse distance.
+  std::map<uint64_t, uint64_t> reuse_distance_count;
+
+  // The access sequence numbers of this block.
+  std::vector<uint64_t> access_sequence_number_timeline;
+  std::map<TableReaderCaller, std::vector<uint64_t>>
+      caller_access_sequence__number_timeline;
+  // The access timestamp in microseconds of this block.
+  std::vector<uint64_t> access_timeline;
+  std::map<TableReaderCaller, std::vector<uint64_t>> caller_access_timeline;
+
+  void AddAccess(const BlockCacheTraceRecord& access,
+                 uint64_t access_sequnce_number) {
+    if (block_size != 0 && access.block_size != 0) {
+      assert(block_size == access.block_size);
+    }
+    if (num_keys != 0 && access.num_keys_in_block != 0) {
+      assert(num_keys == access.num_keys_in_block);
+    }
+    if (first_access_time == 0) {
+      first_access_time = access.access_timestamp;
+    }
+    table_id = BlockCacheTraceHelper::GetTableId(access);
+    block_offset = BlockCacheTraceHelper::GetBlockOffsetInFile(access);
+    last_access_time = access.access_timestamp;
+    block_size = access.block_size;
+    caller_num_access_map[access.caller]++;
+    num_accesses++;
+    // access.access_timestamp is in microsecond.
+    const uint64_t timestamp_in_seconds =
+        access.access_timestamp / kMicrosInSecond;
+    caller_num_accesses_timeline[access.caller][timestamp_in_seconds] += 1;
+    // Populate the feature vectors.
+    access_sequence_number_timeline.push_back(access_sequnce_number);
+    caller_access_sequence__number_timeline[access.caller].push_back(
+        access_sequnce_number);
+    access_timeline.push_back(access.access_timestamp);
+    caller_access_timeline[access.caller].push_back(access.access_timestamp);
+    if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(access.block_type,
+                                                          access.caller)) {
+      num_keys = access.num_keys_in_block;
+      if (access.referenced_key_exist_in_block) {
+        if (key_num_access_map.find(access.referenced_key) ==
+            key_num_access_map.end()) {
+          referenced_data_size += access.referenced_data_size;
+        }
+        key_num_access_map[access.referenced_key][access.caller]++;
+        num_referenced_key_exist_in_block++;
+        if (referenced_data_size > block_size && block_size != 0) {
+          ParsedInternalKey internal_key;
+          Status s = ParseInternalKey(access.referenced_key, &internal_key,
+                                      false /* log_err_key */);  // TODO
+          assert(s.ok());                                        // TODO
+        }
+      } else {
+        non_exist_key_num_access_map[access.referenced_key][access.caller]++;
+      }
+    }
+  }
+};
+
+// Aggregates stats of a block given a block type.
+struct BlockTypeAccessInfoAggregate {
+  std::map<std::string, BlockAccessInfo> block_access_info_map;
+};
+
+// Aggregates BlockTypeAggregate given a SST file.
+struct SSTFileAccessInfoAggregate {
+  uint32_t level;
+  std::map<TraceType, BlockTypeAccessInfoAggregate> block_type_aggregates_map;
+};
+
+// Aggregates SSTFileAggregate given a column family.
+struct ColumnFamilyAccessInfoAggregate {
+  std::map<uint64_t, SSTFileAccessInfoAggregate> fd_aggregates_map;
+};
+
+struct Features {
+  std::vector<uint64_t> elapsed_time_since_last_access;
+  std::vector<uint64_t> num_accesses_since_last_access;
+  std::vector<uint64_t> num_past_accesses;
+};
+
+struct Predictions {
+  std::vector<uint64_t> elapsed_time_till_next_access;
+  std::vector<uint64_t> num_accesses_till_next_access;
+};
+
+class BlockCacheTraceAnalyzer {
+ public:
+  BlockCacheTraceAnalyzer(
+      const std::string& trace_file_path, const std::string& output_dir,
+      const std::string& human_readable_trace_file_path,
+      bool compute_reuse_distance, bool mrc_only,
+      bool is_human_readable_trace_file,
+      std::unique_ptr<BlockCacheTraceSimulator>&& cache_simulator);
+  ~BlockCacheTraceAnalyzer() = default;
+  // No copy and move.
+  BlockCacheTraceAnalyzer(const BlockCacheTraceAnalyzer&) = delete;
+  BlockCacheTraceAnalyzer& operator=(const BlockCacheTraceAnalyzer&) = delete;
+  BlockCacheTraceAnalyzer(BlockCacheTraceAnalyzer&&) = delete;
+  BlockCacheTraceAnalyzer& operator=(BlockCacheTraceAnalyzer&&) = delete;
+
+  // Read all access records in the given trace_file, maintains the stats of
+  // a block, and aggregates the information by block type, sst file, and column
+  // family. Subsequently, the caller may call Print* functions to print
+  // statistics.
+  Status Analyze();
+
+  // Print a summary of statistics of the trace, e.g.,
+  // Number of files: 2 Number of blocks: 50 Number of accesses: 50
+  // Number of Index blocks: 10
+  // Number of Filter blocks: 10
+  // Number of Data blocks: 10
+  // Number of UncompressionDict blocks: 10
+  // Number of RangeDeletion blocks: 10
+  // ***************************************************************
+  // Caller Get: Number of accesses 10
+  // Caller Get: Number of accesses per level break down
+  //          Level 0: Number of accesses: 10
+  // Caller Get: Number of accesses per block type break down
+  //          Block Type Index: Number of accesses: 2
+  //          Block Type Filter: Number of accesses: 2
+  //          Block Type Data: Number of accesses: 2
+  //          Block Type UncompressionDict: Number of accesses: 2
+  //          Block Type RangeDeletion: Number of accesses: 2
+  void PrintStatsSummary() const;
+
+  // Print block size distribution and the distribution break down by block type
+  // and column family.
+  void PrintBlockSizeStats() const;
+
+  // Print access count distribution and the distribution break down by block
+  // type and column family.
+  void PrintAccessCountStats(bool user_access_only, uint32_t bottom_k,
+                             uint32_t top_k) const;
+
+  // Print data block accesses by user Get and Multi-Get.
+  // It prints out 1) A histogram on the percentage of keys accessed in a data
+  // block break down by if a referenced key exists in the data block andthe
+  // histogram break down by column family. 2) A histogram on the percentage of
+  // accesses on keys exist in a data block and its break down by column family.
+  void PrintDataBlockAccessStats() const;
+
+  // Write the percentage of accesses break down by column family into a csv
+  // file saved in 'output_dir'.
+  //
+  // The file is named "percentage_of_accesses_summary". The file format is
+  // caller,cf_0,cf_1,...,cf_n where the cf_i is the column family name found in
+  // the trace.
+  void WritePercentAccessSummaryStats() const;
+
+  // Write the percentage of accesses for the given caller break down by column
+  // family, level, and block type into a csv file saved in 'output_dir'.
+  //
+  // It generates two files: 1) caller_level_percentage_of_accesses_summary and
+  // 2) caller_bt_percentage_of_accesses_summary which break down by the level
+  // and block type, respectively. The file format is
+  // level/bt,cf_0,cf_1,...,cf_n where cf_i is the column family name found in
+  // the trace.
+  void WriteDetailedPercentAccessSummaryStats(TableReaderCaller caller) const;
+
+  // Write the access count summary into a csv file saved in 'output_dir'.
+  // It groups blocks by their access count.
+  //
+  // It generates two files: 1) cf_access_count_summary and 2)
+  // bt_access_count_summary which break down the access count by column family
+  // and block type, respectively. The file format is
+  // cf/bt,bucket_0,bucket_1,...,bucket_N.
+  void WriteAccessCountSummaryStats(
+      const std::vector<uint64_t>& access_count_buckets,
+      bool user_access_only) const;
+
+  // Write miss ratio curves of simulated cache configurations into a csv file
+  // named "mrc" saved in 'output_dir'.
+  //
+  // The file format is
+  // "cache_name,num_shard_bits,capacity,miss_ratio,total_accesses".
+  void WriteMissRatioCurves() const;
+
+  // Write miss ratio timeline of simulated cache configurations into several
+  // csv files, one per cache capacity saved in 'output_dir'.
+  //
+  // The file format is
+  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+  // where N is the number of unique cache names
+  // (cache_name+num_shard_bits+ghost_capacity).
+  void WriteMissRatioTimeline(uint64_t time_unit) const;
+
+  // Write misses timeline of simulated cache configurations into several
+  // csv files, one per cache capacity saved in 'output_dir'.
+  //
+  // The file format is
+  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+  // where N is the number of unique cache names
+  // (cache_name+num_shard_bits+ghost_capacity).
+  void WriteMissTimeline(uint64_t time_unit) const;
+
+  // Write the access timeline into a csv file saved in 'output_dir'.
+  //
+  // The file is named "label_access_timeline".The file format is
+  // "time,label_1_access_per_second,label_2_access_per_second,...,label_N_access_per_second"
+  // where N is the number of unique labels found in the trace.
+  void WriteAccessTimeline(const std::string& label, uint64_t time_unit,
+                           bool user_access_only) const;
+
+  // Write the reuse distance into a csv file saved in 'output_dir'. Reuse
+  // distance is defined as the cumulated size of unique blocks read between two
+  // consective accesses on the same block.
+  //
+  // The file is named "label_reuse_distance". The file format is
+  // bucket,label_1,label_2,...,label_N.
+  void WriteReuseDistance(const std::string& label_str,
+                          const std::vector<uint64_t>& distance_buckets) const;
+
+  // Write the reuse interval into a csv file saved in 'output_dir'. Reuse
+  // interval is defined as the time between two consecutive accesses on the
+  // same block.
+  //
+  // The file is named "label_reuse_interval". The file format is
+  // bucket,label_1,label_2,...,label_N.
+  void WriteReuseInterval(const std::string& label_str,
+                          const std::vector<uint64_t>& time_buckets) const;
+
+  // Write the reuse lifetime into a csv file saved in 'output_dir'. Reuse
+  // lifetime is defined as the time interval between the first access of a
+  // block and its last access.
+  //
+  // The file is named "label_reuse_lifetime". The file format is
+  // bucket,label_1,label_2,...,label_N.
+  void WriteReuseLifetime(const std::string& label_str,
+                          const std::vector<uint64_t>& time_buckets) const;
+
+  // Write the reuse timeline into a csv file saved in 'output_dir'.
+  //
+  // The file is named
+  // "block_type_user_access_only_reuse_window_reuse_timeline". The file format
+  // is start_time,0,1,...,N where N equals trace_duration / reuse_window.
+  void WriteBlockReuseTimeline(const uint64_t reuse_window,
+                               bool user_access_only,
+                               TraceType block_type) const;
+
+  // Write the Get spatical locality into csv files saved in 'output_dir'.
+  //
+  // It generates three csv files. label_percent_ref_keys,
+  // label_percent_accesses_on_ref_keys, and
+  // label_percent_data_size_on_ref_keys.
+  void WriteGetSpatialLocality(
+      const std::string& label_str,
+      const std::vector<uint64_t>& percent_buckets) const;
+
+  void WriteCorrelationFeatures(const std::string& label_str,
+                                uint32_t max_number_of_values) const;
+
+  void WriteCorrelationFeaturesForGet(uint32_t max_number_of_values) const;
+
+  void WriteSkewness(const std::string& label_str,
+                     const std::vector<uint64_t>& percent_buckets,
+                     TraceType target_block_type) const;
+
+  const std::map<std::string, ColumnFamilyAccessInfoAggregate>&
+  TEST_cf_aggregates_map() const {
+    return cf_aggregates_map_;
+  }
+
+ private:
+  std::set<std::string> ParseLabelStr(const std::string& label_str) const;
+
+  std::string BuildLabel(const std::set<std::string>& labels,
+                         const std::string& cf_name, uint64_t fd,
+                         uint32_t level, TraceType type,
+                         TableReaderCaller caller, uint64_t block_key,
+                         const BlockAccessInfo& block) const;
+
+  void ComputeReuseDistance(BlockAccessInfo* info) const;
+
+  Status RecordAccess(const BlockCacheTraceRecord& access);
+
+  void UpdateReuseIntervalStats(
+      const std::string& label, const std::vector<uint64_t>& time_buckets,
+      const std::map<uint64_t, uint64_t> timeline,
+      std::map<std::string, std::map<uint64_t, uint64_t>>*
+          label_time_num_reuses,
+      uint64_t* total_num_reuses) const;
+
+  std::string OutputPercentAccessStats(
+      uint64_t total_accesses,
+      const std::map<std::string, uint64_t>& cf_access_count) const;
+
+  void WriteStatsToFile(
+      const std::string& label_str, const std::vector<uint64_t>& time_buckets,
+      const std::string& filename_suffix,
+      const std::map<std::string, std::map<uint64_t, uint64_t>>& label_data,
+      uint64_t ntotal) const;
+
+  void TraverseBlocks(
+      std::function<void(const std::string& /*cf_name*/, uint64_t /*fd*/,
+                         uint32_t /*level*/, TraceType /*block_type*/,
+                         const std::string& /*block_key*/,
+                         uint64_t /*block_key_id*/,
+                         const BlockAccessInfo& /*block_access_info*/)>
+          block_callback,
+      std::set<std::string>* labels = nullptr) const;
+
+  void UpdateFeatureVectors(
+      const std::vector<uint64_t>& access_sequence_number_timeline,
+      const std::vector<uint64_t>& access_timeline, const std::string& label,
+      std::map<std::string, Features>* label_features,
+      std::map<std::string, Predictions>* label_predictions) const;
+
+  void WriteCorrelationFeaturesToFile(
+      const std::string& label,
+      const std::map<std::string, Features>& label_features,
+      const std::map<std::string, Predictions>& label_predictions,
+      uint32_t max_number_of_values) const;
+
+  ROCKSDB_NAMESPACE::Env* env_;
+  const std::string trace_file_path_;
+  const std::string output_dir_;
+  std::string human_readable_trace_file_path_;
+  const bool compute_reuse_distance_;
+  const bool mrc_only_;
+  const bool is_human_readable_trace_file_;
+
+  BlockCacheTraceHeader header_;
+  std::unique_ptr<BlockCacheTraceSimulator> cache_simulator_;
+  std::map<std::string, ColumnFamilyAccessInfoAggregate> cf_aggregates_map_;
+  std::map<std::string, BlockAccessInfo*> block_info_map_;
+  std::unordered_map<std::string, GetKeyInfo> get_key_info_map_;
+  uint64_t access_sequence_number_ = 0;
+  uint64_t trace_start_timestamp_in_seconds_ = 0;
+  uint64_t trace_end_timestamp_in_seconds_ = 0;
+  MissRatioStats miss_ratio_stats_;
+  uint64_t unique_block_id_ = 1;
+  uint64_t unique_get_key_id_ = 1;
+  BlockCacheHumanReadableTraceWriter human_readable_trace_writer_;
+};
+
+int block_cache_trace_analyzer_tool(int argc, char** argv);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py
new file mode 100644
index 000000000..37166bcb4
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py
@@ -0,0 +1,729 @@
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+#  This source code is licensed under both the GPLv2 (found in the
+#  COPYING file in the root directory) and Apache 2.0 License
+#  (found in the LICENSE.Apache file in the root directory).
+
+#!/usr/bin/env python3
+
+import csv
+import math
+import os
+import random
+import sys
+
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.backends.backend_pdf
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+
+
+# Make sure a legend has the same color across all generated graphs.
+def get_cmap(n, name="hsv"):
+    """Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
+    RGB color; the keyword argument name must be a standard mpl colormap name."""
+    return plt.cm.get_cmap(name, n)
+
+
+color_index = 0
+bar_color_maps = {}
+colors = []
+n_colors = 360
+linear_colors = get_cmap(n_colors)
+for i in range(n_colors):
+    colors.append(linear_colors(i))
+# Shuffle the colors so that adjacent bars in a graph are obvious to differentiate.
+random.shuffle(colors)
+
+
+def num_to_gb(n):
+    one_gb = 1024 * 1024 * 1024
+    if float(n) % one_gb == 0:
+        return "{}".format(n / one_gb)
+    # Keep two decimal points.
+    return "{0:.2f}".format(float(n) / one_gb)
+
+
+def plot_miss_stats_graphs(
+    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
+):
+    miss_ratios = {}
+    for file in os.listdir(csv_result_dir):
+        if not file.startswith(file_prefix):
+            continue
+        if not file.endswith(file_suffix):
+            continue
+        print("Processing file {}/{}".format(csv_result_dir, file))
+        mrc_file_path = csv_result_dir + "/" + file
+        with open(mrc_file_path, "r") as csvfile:
+            rows = csv.reader(csvfile, delimiter=",")
+            for row in rows:
+                cache_name = row[0]
+                num_shard_bits = int(row[1])
+                ghost_capacity = int(row[2])
+                capacity = int(row[3])
+                miss_ratio = float(row[4])
+                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
+                if config not in miss_ratios:
+                    miss_ratios[config] = {}
+                    miss_ratios[config]["x"] = []
+                    miss_ratios[config]["y"] = []
+                miss_ratios[config]["x"].append(capacity)
+                miss_ratios[config]["y"].append(miss_ratio)
+            fig = plt.figure()
+            for config in miss_ratios:
+                plt.plot(
+                    miss_ratios[config]["x"], miss_ratios[config]["y"], label=config
+                )
+            plt.xlabel("Cache capacity")
+            plt.ylabel(ylabel)
+            plt.xscale("log", basex=2)
+            plt.ylim(ymin=0)
+            plt.title("{}".format(file))
+            plt.legend()
+            fig.savefig(
+                output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
+            )
+
+
+def plot_miss_stats_diff_lru_graphs(
+    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
+):
+    miss_ratios = {}
+    for file in os.listdir(csv_result_dir):
+        if not file.startswith(file_prefix):
+            continue
+        if not file.endswith(file_suffix):
+            continue
+        print("Processing file {}/{}".format(csv_result_dir, file))
+        mrc_file_path = csv_result_dir + "/" + file
+        with open(mrc_file_path, "r") as csvfile:
+            rows = csv.reader(csvfile, delimiter=",")
+            for row in rows:
+                cache_name = row[0]
+                num_shard_bits = int(row[1])
+                ghost_capacity = int(row[2])
+                capacity = int(row[3])
+                miss_ratio = float(row[4])
+                config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity)
+                if config not in miss_ratios:
+                    miss_ratios[config] = {}
+                    miss_ratios[config]["x"] = []
+                    miss_ratios[config]["y"] = []
+                miss_ratios[config]["x"].append(capacity)
+                miss_ratios[config]["y"].append(miss_ratio)
+    if "lru-0-0" not in miss_ratios:
+        return
+    fig = plt.figure()
+    for config in miss_ratios:
+        diffs = [0] * len(miss_ratios["lru-0-0"]["x"])
+        for i in range(len(miss_ratios["lru-0-0"]["x"])):
+            for j in range(len(miss_ratios[config]["x"])):
+                if miss_ratios["lru-0-0"]["x"][i] == miss_ratios[config]["x"][j]:
+                    diffs[i] = (
+                        miss_ratios[config]["y"][j] - miss_ratios["lru-0-0"]["y"][i]
+                    )
+                    break
+        plt.plot(miss_ratios["lru-0-0"]["x"], diffs, label=config)
+    plt.xlabel("Cache capacity")
+    plt.ylabel(ylabel)
+    plt.xscale("log", basex=2)
+    plt.title("{}".format(file))
+    plt.legend()
+    fig.savefig(
+        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
+    )
+
+
+def sanitize(label):
+    # matplotlib cannot plot legends that is prefixed with "_"
+    # so we need to remove them here.
+    index = 0
+    for i in range(len(label)):
+        if label[i] == "_":
+            index += 1
+        else:
+            break
+    data = label[index:]
+    # The value of uint64_max in c++.
+    if "18446744073709551615" in data:
+        return "max"
+    return data
+
+
+# Read the csv file vertically, i.e., group the data by columns.
+def read_data_for_plot_vertical(csvfile):
+    x = []
+    labels = []
+    label_stats = {}
+    csv_rows = csv.reader(csvfile, delimiter=",")
+    data_rows = []
+    for row in csv_rows:
+        data_rows.append(row)
+    # header
+    for i in range(1, len(data_rows[0])):
+        labels.append(sanitize(data_rows[0][i]))
+        label_stats[i - 1] = []
+    for i in range(1, len(data_rows)):
+        for j in range(len(data_rows[i])):
+            if j == 0:
+                x.append(sanitize(data_rows[i][j]))
+                continue
+            label_stats[j - 1].append(float(data_rows[i][j]))
+    return x, labels, label_stats
+
+
+# Read the csv file horizontally, i.e., group the data by rows.
+def read_data_for_plot_horizontal(csvfile):
+    x = []
+    labels = []
+    label_stats = {}
+    csv_rows = csv.reader(csvfile, delimiter=",")
+    data_rows = []
+    for row in csv_rows:
+        data_rows.append(row)
+    # header
+    for i in range(1, len(data_rows)):
+        labels.append(sanitize(data_rows[i][0]))
+        label_stats[i - 1] = []
+    for i in range(1, len(data_rows[0])):
+        x.append(sanitize(data_rows[0][i]))
+    for i in range(1, len(data_rows)):
+        for j in range(len(data_rows[i])):
+            if j == 0:
+                # label
+                continue
+            label_stats[i - 1].append(float(data_rows[i][j]))
+    return x, labels, label_stats
+
+
+def read_data_for_plot(csvfile, vertical):
+    if vertical:
+        return read_data_for_plot_vertical(csvfile)
+    return read_data_for_plot_horizontal(csvfile)
+
+
+def plot_line_charts(
+    csv_result_dir,
+    output_result_dir,
+    filename_prefix,
+    filename_suffix,
+    pdf_name,
+    xlabel,
+    ylabel,
+    title,
+    vertical,
+    legend,
+):
+    global color_index, bar_color_maps, colors
+    pdf = matplotlib.backends.backend_pdf.PdfPages(output_result_dir + "/" + pdf_name)
+    for file in os.listdir(csv_result_dir):
+        if not file.endswith(filename_suffix):
+            continue
+        if not file.startswith(filename_prefix):
+            continue
+        print("Processing file {}/{}".format(csv_result_dir, file))
+        with open(csv_result_dir + "/" + file, "r") as csvfile:
+            x, labels, label_stats = read_data_for_plot(csvfile, vertical)
+            if len(x) == 0 or len(labels) == 0:
+                continue
+            # plot figure
+            fig = plt.figure()
+            for label_index in label_stats:
+                # Assign a unique color to this label.
+                if labels[label_index] not in bar_color_maps:
+                    bar_color_maps[labels[label_index]] = colors[color_index]
+                    color_index += 1
+                plt.plot(
+                    [int(x[i]) for i in range(len(x) - 1)],
+                    label_stats[label_index][:-1],
+                    label=labels[label_index],
+                    color=bar_color_maps[labels[label_index]],
+                )
+
+            # Translate time unit into x labels.
+            if "_60" in file:
+                plt.xlabel("{} (Minute)".format(xlabel))
+            if "_3600" in file:
+                plt.xlabel("{} (Hour)".format(xlabel))
+            plt.ylabel(ylabel)
+            plt.title("{} {}".format(title, file))
+            if legend:
+                plt.legend()
+            pdf.savefig(fig)
+    pdf.close()
+
+
+def plot_stacked_bar_charts(
+    csv_result_dir,
+    output_result_dir,
+    filename_suffix,
+    pdf_name,
+    xlabel,
+    ylabel,
+    title,
+    vertical,
+    x_prefix,
+):
+    global color_index, bar_color_maps, colors
+    pdf = matplotlib.backends.backend_pdf.PdfPages(
+        "{}/{}".format(output_result_dir, pdf_name)
+    )
+    for file in os.listdir(csv_result_dir):
+        if not file.endswith(filename_suffix):
+            continue
+        with open(csv_result_dir + "/" + file, "r") as csvfile:
+            print("Processing file {}/{}".format(csv_result_dir, file))
+            x, labels, label_stats = read_data_for_plot(csvfile, vertical)
+            if len(x) == 0 or len(label_stats) == 0:
+                continue
+            # Plot figure
+            fig = plt.figure()
+            ind = np.arange(len(x))  # the x locations for the groups
+            width = 0.5  # the width of the bars: can also be len(x) sequence
+            bars = []
+            bottom_bars = []
+            for _i in label_stats[0]:
+                bottom_bars.append(0)
+            for i in range(0, len(label_stats)):
+                # Assign a unique color to this label.
+                if labels[i] not in bar_color_maps:
+                    bar_color_maps[labels[i]] = colors[color_index]
+                    color_index += 1
+                p = plt.bar(
+                    ind,
+                    label_stats[i],
+                    width,
+                    bottom=bottom_bars,
+                    color=bar_color_maps[labels[i]],
+                )
+                bars.append(p[0])
+                for j in range(len(label_stats[i])):
+                    bottom_bars[j] += label_stats[i][j]
+            plt.xlabel(xlabel)
+            plt.ylabel(ylabel)
+            plt.xticks(
+                ind, [x_prefix + x[i] for i in range(len(x))], rotation=20, fontsize=8
+            )
+            plt.legend(bars, labels)
+            plt.title("{} filename:{}".format(title, file))
+            pdf.savefig(fig)
+    pdf.close()
+
+
+def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title):
+    pdf = matplotlib.backends.backend_pdf.PdfPages(
+        "{}/{}".format(output_result_dir, pdf_name)
+    )
+    for file in os.listdir(csv_result_dir):
+        if not file.endswith(filename_suffix):
+            continue
+        csv_file_name = "{}/{}".format(csv_result_dir, file)
+        print("Processing file {}/{}".format(csv_result_dir, file))
+        corr_table = pd.read_csv(csv_file_name)
+        corr_table = corr_table.pivot("label", "corr", "value")
+        fig = plt.figure()
+        sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2")
+        plt.title("{} filename:{}".format(title, file))
+        pdf.savefig(fig)
+    pdf.close()
+
+
+def plot_timeline(csv_result_dir, output_result_dir):
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="access_timeline",
+        pdf_name="access_time.pdf",
+        xlabel="Time",
+        ylabel="Throughput",
+        title="Access timeline with group by label",
+        vertical=False,
+        legend=True,
+    )
+
+
+def convert_to_0_if_nan(n):
+    if math.isnan(n):
+        return 0.0
+    return n
+
+
+def plot_correlation(csv_result_dir, output_result_dir):
+    # Processing the correlation input first.
+    label_str_file = {}
+    for file in os.listdir(csv_result_dir):
+        if not file.endswith("correlation_input"):
+            continue
+        csv_file_name = "{}/{}".format(csv_result_dir, file)
+        print("Processing file {}/{}".format(csv_result_dir, file))
+        corr_table = pd.read_csv(csv_file_name)
+        label_str = file.split("_")[0]
+        label = file[len(label_str) + 1 :]
+        label = label[: len(label) - len("_correlation_input")]
+
+        output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str)
+        if output_file not in label_str_file:
+            f = open("{}/{}_correlation_output".format(csv_result_dir, label_str), "w+")
+            label_str_file[output_file] = f
+            f.write("label,corr,value\n")
+        f = label_str_file[output_file]
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "LA+A",
+                convert_to_0_if_nan(
+                    corr_table["num_accesses_since_last_access"].corr(
+                        corr_table["num_accesses_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "PA+A",
+                convert_to_0_if_nan(
+                    corr_table["num_past_accesses"].corr(
+                        corr_table["num_accesses_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "LT+A",
+                convert_to_0_if_nan(
+                    corr_table["elapsed_time_since_last_access"].corr(
+                        corr_table["num_accesses_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "LA+T",
+                convert_to_0_if_nan(
+                    corr_table["num_accesses_since_last_access"].corr(
+                        corr_table["elapsed_time_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "LT+T",
+                convert_to_0_if_nan(
+                    corr_table["elapsed_time_since_last_access"].corr(
+                        corr_table["elapsed_time_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+        f.write(
+            "{},{},{}\n".format(
+                label,
+                "PA+T",
+                convert_to_0_if_nan(
+                    corr_table["num_past_accesses"].corr(
+                        corr_table["elapsed_time_till_next_access"], method="spearman"
+                    )
+                ),
+            )
+        )
+    for label_str in label_str_file:
+        label_str_file[label_str].close()
+
+    plot_heatmap(
+        csv_result_dir,
+        output_result_dir,
+        "correlation_output",
+        "correlation.pdf",
+        "Correlation",
+    )
+
+
+def plot_reuse_graphs(csv_result_dir, output_result_dir):
+    plot_stacked_bar_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_suffix="avg_reuse_interval_naccesses",
+        pdf_name="avg_reuse_interval_naccesses.pdf",
+        xlabel="",
+        ylabel="Percentage of accesses",
+        title="Average reuse interval",
+        vertical=True,
+        x_prefix="< ",
+    )
+    plot_stacked_bar_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_suffix="avg_reuse_interval",
+        pdf_name="avg_reuse_interval.pdf",
+        xlabel="",
+        ylabel="Percentage of blocks",
+        title="Average reuse interval",
+        vertical=True,
+        x_prefix="< ",
+    )
+    plot_stacked_bar_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_suffix="access_reuse_interval",
+        pdf_name="reuse_interval.pdf",
+        xlabel="Seconds",
+        ylabel="Percentage of accesses",
+        title="Reuse interval",
+        vertical=True,
+        x_prefix="< ",
+    )
+    plot_stacked_bar_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_suffix="reuse_lifetime",
+        pdf_name="reuse_lifetime.pdf",
+        xlabel="Seconds",
+        ylabel="Percentage of blocks",
+        title="Reuse lifetime",
+        vertical=True,
+        x_prefix="< ",
+    )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="reuse_blocks_timeline",
+        pdf_name="reuse_blocks_timeline.pdf",
+        xlabel="",
+        ylabel="Percentage of blocks",
+        title="Reuse blocks timeline",
+        vertical=False,
+        legend=False,
+    )
+
+
+def plot_percentage_access_summary(csv_result_dir, output_result_dir):
+    plot_stacked_bar_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_suffix="percentage_of_accesses_summary",
+        pdf_name="percentage_access.pdf",
+        xlabel="",
+        ylabel="Percentage of accesses",
+        title="",
+        vertical=True,
+        x_prefix="",
+    )
+    plot_stacked_bar_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_suffix="percent_ref_keys",
+        pdf_name="percent_ref_keys.pdf",
+        xlabel="",
+        ylabel="Percentage of blocks",
+        title="",
+        vertical=True,
+        x_prefix="",
+    )
+    plot_stacked_bar_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_suffix="percent_data_size_on_ref_keys",
+        pdf_name="percent_data_size_on_ref_keys.pdf",
+        xlabel="",
+        ylabel="Percentage of blocks",
+        title="",
+        vertical=True,
+        x_prefix="",
+    )
+    plot_stacked_bar_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_suffix="percent_accesses_on_ref_keys",
+        pdf_name="percent_accesses_on_ref_keys.pdf",
+        xlabel="",
+        ylabel="Percentage of blocks",
+        title="",
+        vertical=True,
+        x_prefix="",
+    )
+
+
+def plot_access_count_summary(csv_result_dir, output_result_dir):
+    plot_stacked_bar_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_suffix="access_count_summary",
+        pdf_name="access_count_summary.pdf",
+        xlabel="Access count",
+        ylabel="Percentage of blocks",
+        title="",
+        vertical=True,
+        x_prefix="< ",
+    )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="skewness",
+        pdf_name="skew.pdf",
+        xlabel="",
+        ylabel="Percentage of accesses",
+        title="Skewness",
+        vertical=True,
+        legend=False,
+    )
+
+
+def plot_miss_ratio_timeline(csv_result_dir, output_result_dir):
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="3600_miss_ratio_timeline",
+        pdf_name="miss_ratio_timeline.pdf",
+        xlabel="Time",
+        ylabel="Miss Ratio (%)",
+        title="Miss ratio timeline",
+        vertical=False,
+        legend=True,
+    )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="3600_miss_timeline",
+        pdf_name="miss_timeline.pdf",
+        xlabel="Time",
+        ylabel="# of misses ",
+        title="Miss timeline",
+        vertical=False,
+        legend=True,
+    )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="3600_miss_timeline",
+        pdf_name="miss_timeline.pdf",
+        xlabel="Time",
+        ylabel="# of misses ",
+        title="Miss timeline",
+        vertical=False,
+        legend=True,
+    )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="3600_policy_timeline",
+        pdf_name="policy_timeline.pdf",
+        xlabel="Time",
+        ylabel="# of times a policy is selected ",
+        title="Policy timeline",
+        vertical=False,
+        legend=True,
+    )
+    plot_line_charts(
+        csv_result_dir,
+        output_result_dir,
+        filename_prefix="",
+        filename_suffix="3600_policy_ratio_timeline",
+        pdf_name="policy_ratio_timeline.pdf",
+        xlabel="Time",
+        ylabel="Percentage of times a policy is selected ",
+        title="Policy timeline",
+        vertical=False,
+        legend=True,
+    )
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print(
+            "Must provide two arguments: \n"
+            "1) The directory that saves a list of "
+            "directories which contain block cache trace analyzer result files. \n"
+            "2) the directory to save plotted graphs. \n"
+        )
+        exit(1)
+    csv_result_dir = sys.argv[1]
+    output_result_dir = sys.argv[2]
+    print(
+        "Processing directory {} and save graphs to {}.".format(
+            csv_result_dir, output_result_dir
+        )
+    )
+    for csv_relative_dir in os.listdir(csv_result_dir):
+        csv_abs_dir = csv_result_dir + "/" + csv_relative_dir
+        result_dir = output_result_dir + "/" + csv_relative_dir
+        if not os.path.isdir(csv_abs_dir):
+            print("{} is not a directory".format(csv_abs_dir))
+            continue
+        print("Processing experiment dir: {}".format(csv_relative_dir))
+        if not os.path.exists(result_dir):
+            os.makedirs(result_dir)
+        plot_access_count_summary(csv_abs_dir, result_dir)
+        plot_timeline(csv_abs_dir, result_dir)
+        plot_miss_ratio_timeline(csv_result_dir, output_result_dir)
+        plot_correlation(csv_abs_dir, result_dir)
+        plot_reuse_graphs(csv_abs_dir, result_dir)
+        plot_percentage_access_summary(csv_abs_dir, result_dir)
+        plot_miss_stats_graphs(
+            csv_abs_dir,
+            result_dir,
+            file_prefix="",
+            file_suffix="mrc",
+            ylabel="Miss ratio (%)",
+            pdf_file_name="mrc",
+        )
+        plot_miss_stats_diff_lru_graphs(
+            csv_abs_dir,
+            result_dir,
+            file_prefix="",
+            file_suffix="mrc",
+            ylabel="Miss ratio (%)",
+            pdf_file_name="mrc_diff_lru",
+        )
+        # The following stats are only available in pysim.
+        for time_unit in ["1", "60", "3600"]:
+            plot_miss_stats_graphs(
+                csv_abs_dir,
+                result_dir,
+                file_prefix="ml_{}_".format(time_unit),
+                file_suffix="p95mb",
+                ylabel="p95 number of byte miss per {} seconds".format(time_unit),
+                pdf_file_name="p95mb_per{}_seconds".format(time_unit),
+            )
+            plot_miss_stats_graphs(
+                csv_abs_dir,
+                result_dir,
+                file_prefix="ml_{}_".format(time_unit),
+                file_suffix="avgmb",
+                ylabel="Average number of byte miss per {} seconds".format(time_unit),
+                pdf_file_name="avgmb_per{}_seconds".format(time_unit),
+            )
+            plot_miss_stats_diff_lru_graphs(
+                csv_abs_dir,
+                result_dir,
+                file_prefix="ml_{}_".format(time_unit),
+                file_suffix="p95mb",
+                ylabel="p95 number of byte miss per {} seconds".format(time_unit),
+                pdf_file_name="p95mb_per{}_seconds_diff_lru".format(time_unit),
+            )
+            plot_miss_stats_diff_lru_graphs(
+                csv_abs_dir,
+                result_dir,
+                file_prefix="ml_{}_".format(time_unit),
+                file_suffix="avgmb",
+                ylabel="Average number of byte miss per {} seconds".format(time_unit),
+                pdf_file_name="avgmb_per{}_seconds_diff_lru".format(time_unit),
+            )
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
new file mode 100644
index 000000000..c5d9b1452
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
@@ -0,0 +1,735 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr,
+          "Please install gflags to run block_cache_trace_analyzer_test\n");
+  return 0;
+}
+#else
+
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const uint64_t kBlockSize = 1024;
+const std::string kBlockKeyPrefix = "test-block-";
+const uint32_t kCFId = 0;
+const uint32_t kLevel = 1;
+const uint64_t kSSTStoringEvenKeys = 100;
+const uint64_t kSSTStoringOddKeys = 101;
+const std::string kRefKeyPrefix = "test-get-";
+const uint64_t kNumKeysInBlock = 1024;
+const int kMaxArgCount = 100;
+const size_t kArgBufferSize = 100000;
+}  // namespace
+
+class BlockCacheTracerTest : public testing::Test {
+ public:
+  BlockCacheTracerTest() {
+    test_path_ = test::PerThreadDBPath("block_cache_trace_analyzer_test");
+    env_ = ROCKSDB_NAMESPACE::Env::Default();
+    EXPECT_OK(env_->CreateDir(test_path_));
+    trace_file_path_ = test_path_ + "/block_cache_trace";
+    block_cache_sim_config_path_ = test_path_ + "/block_cache_sim_config";
+    timeline_labels_ =
+        "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller";
+    reuse_distance_labels_ =
+        "block,all,cf,sst,level,bt,caller,cf_sst,cf_level,cf_bt,cf_caller";
+    reuse_distance_buckets_ = "1,1K,1M,1G";
+    reuse_interval_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt";
+    reuse_interval_buckets_ = "1,10,100,1000";
+    reuse_lifetime_labels_ = "block,all,cf,sst,level,bt,cf_sst,cf_level,cf_bt";
+    reuse_lifetime_buckets_ = "1,10,100,1000";
+    analyzing_callers_ = "Get,Iterator";
+    access_count_buckets_ = "2,3,4,5,10";
+    analyze_get_spatial_locality_labels_ = "all";
+    analyze_get_spatial_locality_buckets_ = "10,20,30,40,50,60,70,80,90,100";
+  }
+
+  ~BlockCacheTracerTest() override {
+    if (getenv("KEEP_DB")) {
+      printf("The trace file is still at %s\n", trace_file_path_.c_str());
+      return;
+    }
+    EXPECT_OK(env_->DeleteFile(trace_file_path_));
+    EXPECT_OK(env_->DeleteDir(test_path_));
+  }
+
+  TableReaderCaller GetCaller(uint32_t key_id) {
+    uint32_t n = key_id % 5;
+    switch (n) {
+      case 0:
+        return TableReaderCaller::kPrefetch;
+      case 1:
+        return TableReaderCaller::kCompaction;
+      case 2:
+        return TableReaderCaller::kUserGet;
+      case 3:
+        return TableReaderCaller::kUserMultiGet;
+      case 4:
+        return TableReaderCaller::kUserIterator;
+    }
+    // This cannot happend.
+    assert(false);
+    return TableReaderCaller::kMaxBlockCacheLookupCaller;
+  }
+
+  void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id,
+                        TraceType block_type, uint32_t nblocks) {
+    assert(writer);
+    for (uint32_t i = 0; i < nblocks; i++) {
+      uint32_t key_id = from_key_id + i;
+      uint64_t timestamp = (key_id + 1) * kMicrosInSecond;
+      BlockCacheTraceRecord record;
+      record.block_type = block_type;
+      record.block_size = kBlockSize + key_id;
+      record.block_key = kBlockKeyPrefix + std::to_string(key_id);
+      record.access_timestamp = timestamp;
+      record.cf_id = kCFId;
+      record.cf_name = kDefaultColumnFamilyName;
+      record.caller = GetCaller(key_id);
+      record.level = kLevel;
+      if (key_id % 2 == 0) {
+        record.sst_fd_number = kSSTStoringEvenKeys;
+      } else {
+        record.sst_fd_number = kSSTStoringOddKeys;
+      }
+      record.is_cache_hit = false;
+      record.no_insert = false;
+      // Provide these fields for all block types.
+      // The writer should only write these fields for data blocks and the
+      // caller is either GET or MGET.
+      record.referenced_key =
+          kRefKeyPrefix + std::to_string(key_id) + std::string(8, 0);
+      record.referenced_key_exist_in_block = true;
+      record.num_keys_in_block = kNumKeysInBlock;
+      ASSERT_OK(writer->WriteBlockAccess(
+          record, record.block_key, record.cf_name, record.referenced_key));
+    }
+  }
+
+  void AssertBlockAccessInfo(
+      uint32_t key_id, TraceType type,
+      const std::map<std::string, BlockAccessInfo>& block_access_info_map) {
+    auto key_id_str = kBlockKeyPrefix + std::to_string(key_id);
+    ASSERT_TRUE(block_access_info_map.find(key_id_str) !=
+                block_access_info_map.end());
+    auto& block_access_info = block_access_info_map.find(key_id_str)->second;
+    ASSERT_EQ(1, block_access_info.num_accesses);
+    ASSERT_EQ(kBlockSize + key_id, block_access_info.block_size);
+    ASSERT_GT(block_access_info.first_access_time, 0);
+    ASSERT_GT(block_access_info.last_access_time, 0);
+    ASSERT_EQ(1, block_access_info.caller_num_access_map.size());
+    TableReaderCaller expected_caller = GetCaller(key_id);
+    ASSERT_TRUE(block_access_info.caller_num_access_map.find(expected_caller) !=
+                block_access_info.caller_num_access_map.end());
+    ASSERT_EQ(
+        1,
+        block_access_info.caller_num_access_map.find(expected_caller)->second);
+
+    if ((expected_caller == TableReaderCaller::kUserGet ||
+         expected_caller == TableReaderCaller::kUserMultiGet) &&
+        type == TraceType::kBlockTraceDataBlock) {
+      ASSERT_EQ(kNumKeysInBlock, block_access_info.num_keys);
+      ASSERT_EQ(1, block_access_info.key_num_access_map.size());
+      ASSERT_EQ(0, block_access_info.non_exist_key_num_access_map.size());
+      ASSERT_EQ(1, block_access_info.num_referenced_key_exist_in_block);
+    }
+  }
+
+  void RunBlockCacheTraceAnalyzer() {
+    std::vector<std::string> params = {
+        "./block_cache_trace_analyzer",
+        "-block_cache_trace_path=" + trace_file_path_,
+        "-block_cache_sim_config_path=" + block_cache_sim_config_path_,
+        "-block_cache_analysis_result_dir=" + test_path_,
+        "-print_block_size_stats",
+        "-print_access_count_stats",
+        "-print_data_block_access_count_stats",
+        "-cache_sim_warmup_seconds=0",
+        "-analyze_bottom_k_access_count_blocks=5",
+        "-analyze_top_k_access_count_blocks=5",
+        "-analyze_blocks_reuse_k_reuse_window=5",
+        "-timeline_labels=" + timeline_labels_,
+        "-reuse_distance_labels=" + reuse_distance_labels_,
+        "-reuse_distance_buckets=" + reuse_distance_buckets_,
+        "-reuse_interval_labels=" + reuse_interval_labels_,
+        "-reuse_interval_buckets=" + reuse_interval_buckets_,
+        "-reuse_lifetime_labels=" + reuse_lifetime_labels_,
+        "-reuse_lifetime_buckets=" + reuse_lifetime_buckets_,
+        "-analyze_callers=" + analyzing_callers_,
+        "-access_count_buckets=" + access_count_buckets_,
+        "-analyze_get_spatial_locality_labels=" +
+            analyze_get_spatial_locality_labels_,
+        "-analyze_get_spatial_locality_buckets=" +
+            analyze_get_spatial_locality_buckets_,
+        "-analyze_correlation_coefficients_labels=all",
+        "-skew_labels=all",
+        "-skew_buckets=10,50,100"};
+    char arg_buffer[kArgBufferSize];
+    char* argv[kMaxArgCount];
+    int argc = 0;
+    int cursor = 0;
+    for (const auto& arg : params) {
+      ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
+      ASSERT_LE(argc + 1, kMaxArgCount);
+      snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());
+
+      argv[argc++] = arg_buffer + cursor;
+      cursor += static_cast<int>(arg.size()) + 1;
+    }
+    ASSERT_EQ(0,
+              ROCKSDB_NAMESPACE::block_cache_trace_analyzer_tool(argc, argv));
+  }
+
+  Env* env_;
+  EnvOptions env_options_;
+  std::string block_cache_sim_config_path_;
+  std::string trace_file_path_;
+  std::string test_path_;
+  std::string timeline_labels_;
+  std::string reuse_distance_labels_;
+  std::string reuse_distance_buckets_;
+  std::string reuse_interval_labels_;
+  std::string reuse_interval_buckets_;
+  std::string reuse_lifetime_labels_;
+  std::string reuse_lifetime_buckets_;
+  std::string analyzing_callers_;
+  std::string access_count_buckets_;
+  std::string analyze_get_spatial_locality_labels_;
+  std::string analyze_get_spatial_locality_buckets_;
+};
+
+TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) {
+  {
+    // Generate a trace file.
+    BlockCacheTraceWriterOptions trace_writer_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    const auto& clock = env_->GetSystemClock();
+    std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+        NewBlockCacheTraceWriter(clock.get(), trace_writer_opt,
+                                 std::move(trace_writer));
+    ASSERT_NE(block_cache_trace_writer, nullptr);
+    ASSERT_OK(block_cache_trace_writer->WriteHeader());
+    WriteBlockAccess(block_cache_trace_writer.get(), 0,
+                     TraceType::kBlockTraceDataBlock, 50);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+  {
+    // Generate a cache sim config.
+    std::string config = "lru,1,0,1K,1M,1G";
+    std::ofstream out(block_cache_sim_config_path_);
+    ASSERT_TRUE(out.is_open());
+    out << config << std::endl;
+    out.close();
+  }
+  RunBlockCacheTraceAnalyzer();
+  {
+    // Validate the cache miss ratios.
+    std::vector<uint64_t> expected_capacities{1024, 1024 * 1024,
+                                              1024 * 1024 * 1024};
+    const std::string mrc_path = test_path_ + "/49_50_mrc";
+    std::ifstream infile(mrc_path);
+    uint32_t config_index = 0;
+    std::string line;
+    // Read header.
+    ASSERT_TRUE(getline(infile, line));
+    while (getline(infile, line)) {
+      std::stringstream ss(line);
+      std::vector<std::string> result_strs;
+      while (ss.good()) {
+        std::string substr;
+        getline(ss, substr, ',');
+        result_strs.push_back(substr);
+      }
+      ASSERT_EQ(6, result_strs.size());
+      ASSERT_LT(config_index, expected_capacities.size());
+      ASSERT_EQ("lru", result_strs[0]);  // cache_name
+      ASSERT_EQ("1", result_strs[1]);    // num_shard_bits
+      ASSERT_EQ("0", result_strs[2]);    // ghost_cache_capacity
+      ASSERT_EQ(std::to_string(expected_capacities[config_index]),
+                result_strs[3]);              // cache_capacity
+      ASSERT_EQ("100.0000", result_strs[4]);  // miss_ratio
+      ASSERT_EQ("50", result_strs[5]);        // number of accesses.
+      config_index++;
+    }
+    ASSERT_EQ(expected_capacities.size(), config_index);
+    infile.close();
+    ASSERT_OK(env_->DeleteFile(mrc_path));
+
+    const std::vector<std::string> time_units{"1", "60", "3600"};
+    expected_capacities.push_back(std::numeric_limits<uint64_t>::max());
+    for (auto const& expected_capacity : expected_capacities) {
+      for (auto const& time_unit : time_units) {
+        const std::string miss_ratio_timeline_path =
+            test_path_ + "/" + std::to_string(expected_capacity) + "_" +
+            time_unit + "_miss_ratio_timeline";
+        std::ifstream mrt_file(miss_ratio_timeline_path);
+        // Read header.
+        ASSERT_TRUE(getline(mrt_file, line));
+        ASSERT_TRUE(getline(mrt_file, line));
+        std::stringstream ss(line);
+        bool read_header = false;
+        while (ss.good()) {
+          std::string substr;
+          getline(ss, substr, ',');
+          if (!read_header) {
+            if (expected_capacity == std::numeric_limits<uint64_t>::max()) {
+              ASSERT_EQ("trace", substr);
+            } else {
+              ASSERT_EQ("lru-1-0", substr);
+            }
+            read_header = true;
+            continue;
+          }
+          ASSERT_DOUBLE_EQ(100.0, ParseDouble(substr));
+        }
+        ASSERT_FALSE(getline(mrt_file, line));
+        mrt_file.close();
+        ASSERT_OK(env_->DeleteFile(miss_ratio_timeline_path));
+      }
+      for (auto const& time_unit : time_units) {
+        const std::string miss_timeline_path =
+            test_path_ + "/" + std::to_string(expected_capacity) + "_" +
+            time_unit + "_miss_timeline";
+        std::ifstream mt_file(miss_timeline_path);
+        // Read header.
+        ASSERT_TRUE(getline(mt_file, line));
+        ASSERT_TRUE(getline(mt_file, line));
+        std::stringstream ss(line);
+        uint32_t num_misses = 0;
+        while (ss.good()) {
+          std::string substr;
+          getline(ss, substr, ',');
+          if (num_misses == 0) {
+            if (expected_capacity == std::numeric_limits<uint64_t>::max()) {
+              ASSERT_EQ("trace", substr);
+            } else {
+              ASSERT_EQ("lru-1-0", substr);
+            }
+            num_misses++;
+            continue;
+          }
+          num_misses += ParseInt(substr);
+        }
+        ASSERT_EQ(51u, num_misses);
+        ASSERT_FALSE(getline(mt_file, line));
+        mt_file.close();
+        ASSERT_OK(env_->DeleteFile(miss_timeline_path));
+      }
+    }
+  }
+  {
+    // Validate the skewness csv file.
+    const std::string skewness_file_path = test_path_ + "/all_skewness";
+    std::ifstream skew_file(skewness_file_path);
+    // Read header.
+    std::string line;
+    ASSERT_TRUE(getline(skew_file, line));
+    std::stringstream ss(line);
+    double sum_percent = 0;
+    while (getline(skew_file, line)) {
+      std::stringstream ss_naccess(line);
+      std::string substr;
+      bool read_label = false;
+      while (ss_naccess.good()) {
+        ASSERT_TRUE(getline(ss_naccess, substr, ','));
+        if (!read_label) {
+          read_label = true;
+          continue;
+        }
+        sum_percent += ParseDouble(substr);
+      }
+    }
+    ASSERT_EQ(100.0, sum_percent);
+    ASSERT_FALSE(getline(skew_file, line));
+    skew_file.close();
+    ASSERT_OK(env_->DeleteFile(skewness_file_path));
+  }
+  {
+    // Validate the timeline csv files.
+    const std::vector<std::string> time_units{"_60", "_3600"};
+    const std::vector<std::string> user_access_only_flags{"user_access_only_",
+                                                          "all_access_"};
+    for (auto const& user_access_only : user_access_only_flags) {
+      for (auto const& unit : time_units) {
+        std::stringstream ss(timeline_labels_);
+        while (ss.good()) {
+          std::string l;
+          ASSERT_TRUE(getline(ss, l, ','));
+          if (l.find("block") == std::string::npos) {
+            if (user_access_only != "all_access_") {
+              continue;
+            }
+          }
+          const std::string timeline_file = test_path_ + "/" +
+                                            user_access_only + l + unit +
+                                            "_access_timeline";
+          std::ifstream infile(timeline_file);
+          std::string line;
+          const uint64_t expected_naccesses = 50;
+          const uint64_t expected_user_accesses = 30;
+          ASSERT_TRUE(getline(infile, line)) << timeline_file;
+          uint32_t naccesses = 0;
+          while (getline(infile, line)) {
+            std::stringstream ss_naccess(line);
+            std::string substr;
+            bool read_label = false;
+            while (ss_naccess.good()) {
+              ASSERT_TRUE(getline(ss_naccess, substr, ','));
+              if (!read_label) {
+                read_label = true;
+                continue;
+              }
+              naccesses += ParseUint32(substr);
+            }
+          }
+          if (user_access_only == "user_access_only_") {
+            ASSERT_EQ(expected_user_accesses, naccesses) << timeline_file;
+          } else {
+            ASSERT_EQ(expected_naccesses, naccesses) << timeline_file;
+          }
+          ASSERT_OK(env_->DeleteFile(timeline_file));
+        }
+      }
+    }
+  }
+  {
+    // Validate the reuse_interval and reuse_distance csv files.
+    std::map<std::string, std::string> test_reuse_csv_files;
+    test_reuse_csv_files["_access_reuse_interval"] = reuse_interval_labels_;
+    test_reuse_csv_files["_reuse_distance"] = reuse_distance_labels_;
+    test_reuse_csv_files["_reuse_lifetime"] = reuse_lifetime_labels_;
+    test_reuse_csv_files["_avg_reuse_interval"] = reuse_interval_labels_;
+    test_reuse_csv_files["_avg_reuse_interval_naccesses"] =
+        reuse_interval_labels_;
+    for (auto const& test : test_reuse_csv_files) {
+      const std::string& file_suffix = test.first;
+      const std::string& labels = test.second;
+      const uint32_t expected_num_rows = 5;
+      std::stringstream ss(labels);
+      while (ss.good()) {
+        std::string l;
+        ASSERT_TRUE(getline(ss, l, ','));
+        const std::string reuse_csv_file = test_path_ + "/" + l + file_suffix;
+        std::ifstream infile(reuse_csv_file);
+        std::string line;
+        ASSERT_TRUE(getline(infile, line));
+        double npercentage = 0;
+        uint32_t nrows = 0;
+        while (getline(infile, line)) {
+          std::stringstream ss_naccess(line);
+          bool label_read = false;
+          nrows++;
+          while (ss_naccess.good()) {
+            std::string substr;
+            ASSERT_TRUE(getline(ss_naccess, substr, ','));
+            if (!label_read) {
+              label_read = true;
+              continue;
+            }
+            npercentage += ParseDouble(substr);
+          }
+        }
+        ASSERT_EQ(expected_num_rows, nrows);
+        if ("_reuse_lifetime" == test.first ||
+            "_avg_reuse_interval" == test.first ||
+            "_avg_reuse_interval_naccesses" == test.first) {
+          ASSERT_EQ(100, npercentage) << reuse_csv_file;
+        } else {
+          ASSERT_LT(npercentage, 0);
+        }
+        ASSERT_OK(env_->DeleteFile(reuse_csv_file));
+      }
+    }
+  }
+
+  {
+    // Validate the percentage of accesses summary.
+    const std::string percent_access_summary_file =
+        test_path_ + "/percentage_of_accesses_summary";
+    std::ifstream infile(percent_access_summary_file);
+    std::string line;
+    ASSERT_TRUE(getline(infile, line));
+    std::set<std::string> callers;
+    std::set<std::string> expected_callers{"Get", "MultiGet", "Iterator",
+                                           "Prefetch", "Compaction"};
+    while (getline(infile, line)) {
+      std::stringstream caller_percent(line);
+      std::string caller;
+      ASSERT_TRUE(getline(caller_percent, caller, ','));
+      std::string percent;
+      ASSERT_TRUE(getline(caller_percent, percent, ','));
+      ASSERT_FALSE(caller_percent.good());
+      callers.insert(caller);
+      ASSERT_EQ(20, ParseDouble(percent));
+    }
+    ASSERT_EQ(expected_callers.size(), callers.size());
+    for (auto caller : callers) {
+      ASSERT_TRUE(expected_callers.find(caller) != expected_callers.end());
+    }
+    ASSERT_OK(env_->DeleteFile(percent_access_summary_file));
+  }
+  {
+    // Validate the percentage of accesses summary by analyzing callers.
+    std::stringstream analyzing_callers(analyzing_callers_);
+    while (analyzing_callers.good()) {
+      std::string caller;
+      ASSERT_TRUE(getline(analyzing_callers, caller, ','));
+      std::vector<std::string> breakdowns{"level", "bt"};
+      for (auto breakdown : breakdowns) {
+        const std::string file_name = test_path_ + "/" + caller + "_" +
+                                      breakdown +
+                                      "_percentage_of_accesses_summary";
+        std::ifstream infile(file_name);
+        std::string line;
+        ASSERT_TRUE(getline(infile, line));
+        double sum = 0;
+        while (getline(infile, line)) {
+          std::stringstream label_percent(line);
+          std::string label;
+          ASSERT_TRUE(getline(label_percent, label, ','));
+          std::string percent;
+          ASSERT_TRUE(getline(label_percent, percent, ','));
+          ASSERT_FALSE(label_percent.good());
+          sum += ParseDouble(percent);
+        }
+        ASSERT_EQ(100, sum);
+        ASSERT_OK(env_->DeleteFile(file_name));
+      }
+    }
+  }
+  const std::vector<std::string> access_types{"user_access_only", "all_access"};
+  const std::vector<std::string> prefix{"bt", "cf"};
+  for (auto const& pre : prefix) {
+    for (auto const& access_type : access_types) {
+      {
+        // Validate the access count summary.
+        const std::string bt_access_count_summary = test_path_ + "/" + pre +
+                                                    "_" + access_type +
+                                                    "_access_count_summary";
+        std::ifstream infile(bt_access_count_summary);
+        std::string line;
+        ASSERT_TRUE(getline(infile, line));
+        double sum_percent = 0;
+        while (getline(infile, line)) {
+          std::stringstream bt_percent(line);
+          std::string bt;
+          ASSERT_TRUE(getline(bt_percent, bt, ','));
+          std::string percent;
+          ASSERT_TRUE(getline(bt_percent, percent, ','));
+          sum_percent += ParseDouble(percent);
+        }
+        ASSERT_EQ(100.0, sum_percent);
+        ASSERT_OK(env_->DeleteFile(bt_access_count_summary));
+      }
+    }
+  }
+  for (auto const& access_type : access_types) {
+    std::vector<std::string> block_types{"Index", "Data", "Filter"};
+    for (auto block_type : block_types) {
+      // Validate reuse block timeline.
+      const std::string reuse_blocks_timeline = test_path_ + "/" + block_type +
+                                                "_" + access_type +
+                                                "_5_reuse_blocks_timeline";
+      std::ifstream infile(reuse_blocks_timeline);
+      std::string line;
+      ASSERT_TRUE(getline(infile, line)) << reuse_blocks_timeline;
+      uint32_t index = 0;
+      while (getline(infile, line)) {
+        std::stringstream timeline(line);
+        bool start_time = false;
+        double sum = 0;
+        while (timeline.good()) {
+          std::string value;
+          ASSERT_TRUE(getline(timeline, value, ','));
+          if (!start_time) {
+            start_time = true;
+            continue;
+          }
+          sum += ParseDouble(value);
+        }
+        index++;
+        ASSERT_LT(sum, 100.0 * index + 1) << reuse_blocks_timeline;
+      }
+      ASSERT_OK(env_->DeleteFile(reuse_blocks_timeline));
+    }
+  }
+
+  std::stringstream ss(analyze_get_spatial_locality_labels_);
+  while (ss.good()) {
+    std::string l;
+    ASSERT_TRUE(getline(ss, l, ','));
+    const std::vector<std::string> spatial_locality_files{
+        "_percent_ref_keys", "_percent_accesses_on_ref_keys",
+        "_percent_data_size_on_ref_keys"};
+    for (auto const& spatial_locality_file : spatial_locality_files) {
+      const std::string filename = test_path_ + "/" + l + spatial_locality_file;
+      std::ifstream infile(filename);
+      std::string line;
+      ASSERT_TRUE(getline(infile, line));
+      double sum_percent = 0;
+      uint32_t nrows = 0;
+      while (getline(infile, line)) {
+        std::stringstream bt_percent(line);
+        std::string bt;
+        ASSERT_TRUE(getline(bt_percent, bt, ','));
+        std::string percent;
+        ASSERT_TRUE(getline(bt_percent, percent, ','));
+        sum_percent += ParseDouble(percent);
+        nrows++;
+      }
+      ASSERT_EQ(11u, nrows);
+      ASSERT_EQ(100.0, sum_percent);
+      ASSERT_OK(env_->DeleteFile(filename));
+    }
+  }
+  ASSERT_OK(env_->DeleteFile(block_cache_sim_config_path_));
+}
+
+TEST_F(BlockCacheTracerTest, MixedBlocks) {
+  {
+    // Generate a trace file containing a mix of blocks.
+    // It contains two SST files with 25 blocks of odd numbered block_key in
+    // kSSTStoringOddKeys and 25 blocks of even numbered blocks_key in
+    // kSSTStoringEvenKeys.
+    BlockCacheTraceWriterOptions trace_writer_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    const auto& clock = env_->GetSystemClock();
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+        NewBlockCacheTraceWriter(clock.get(), trace_writer_opt,
+                                 std::move(trace_writer));
+    ASSERT_NE(block_cache_trace_writer, nullptr);
+    ASSERT_OK(block_cache_trace_writer->WriteHeader());
+    // Write blocks of different types.
+    WriteBlockAccess(block_cache_trace_writer.get(), 0,
+                     TraceType::kBlockTraceUncompressionDictBlock, 10);
+    WriteBlockAccess(block_cache_trace_writer.get(), 10,
+                     TraceType::kBlockTraceDataBlock, 10);
+    WriteBlockAccess(block_cache_trace_writer.get(), 20,
+                     TraceType::kBlockTraceFilterBlock, 10);
+    WriteBlockAccess(block_cache_trace_writer.get(), 30,
+                     TraceType::kBlockTraceIndexBlock, 10);
+    WriteBlockAccess(block_cache_trace_writer.get(), 40,
+                     TraceType::kBlockTraceRangeDeletionBlock, 10);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+
+  {
+    // Verify trace file is generated correctly.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    BlockCacheTraceReader reader(std::move(trace_reader));
+    BlockCacheTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(static_cast<uint32_t>(kMajorVersion),
+              header.rocksdb_major_version);
+    ASSERT_EQ(static_cast<uint32_t>(kMinorVersion),
+              header.rocksdb_minor_version);
+    // Read blocks.
+    BlockCacheTraceAnalyzer analyzer(
+        trace_file_path_,
+        /*output_miss_ratio_curve_path=*/"",
+        /*human_readable_trace_file_path=*/"",
+        /*compute_reuse_distance=*/true,
+        /*mrc_only=*/false,
+        /*is_block_cache_human_readable_trace=*/false,
+        /*simulator=*/nullptr);
+    // The analyzer ends when it detects an incomplete access record.
+    ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze());
+    const uint64_t expected_num_cfs = 1;
+    std::vector<uint64_t> expected_fds{kSSTStoringOddKeys, kSSTStoringEvenKeys};
+    const std::vector<TraceType> expected_types{
+        TraceType::kBlockTraceUncompressionDictBlock,
+        TraceType::kBlockTraceDataBlock, TraceType::kBlockTraceFilterBlock,
+        TraceType::kBlockTraceIndexBlock,
+        TraceType::kBlockTraceRangeDeletionBlock};
+    const uint64_t expected_num_keys_per_type = 5;
+
+    auto& stats = analyzer.TEST_cf_aggregates_map();
+    ASSERT_EQ(expected_num_cfs, stats.size());
+    ASSERT_TRUE(stats.find(kDefaultColumnFamilyName) != stats.end());
+    auto& cf_stats = stats.find(kDefaultColumnFamilyName)->second;
+    ASSERT_EQ(expected_fds.size(), cf_stats.fd_aggregates_map.size());
+    for (auto fd_id : expected_fds) {
+      ASSERT_TRUE(cf_stats.fd_aggregates_map.find(fd_id) !=
+                  cf_stats.fd_aggregates_map.end());
+      ASSERT_EQ(kLevel, cf_stats.fd_aggregates_map.find(fd_id)->second.level);
+      auto& block_type_aggregates_map = cf_stats.fd_aggregates_map.find(fd_id)
+                                            ->second.block_type_aggregates_map;
+      ASSERT_EQ(expected_types.size(), block_type_aggregates_map.size());
+      uint32_t key_id = 0;
+      for (auto type : expected_types) {
+        ASSERT_TRUE(block_type_aggregates_map.find(type) !=
+                    block_type_aggregates_map.end());
+        auto& block_access_info_map =
+            block_type_aggregates_map.find(type)->second.block_access_info_map;
+        // Each block type has 5 blocks.
+        ASSERT_EQ(expected_num_keys_per_type, block_access_info_map.size());
+        for (uint32_t i = 0; i < 10; i++) {
+          // Verify that odd numbered blocks are stored in kSSTStoringOddKeys
+          // and even numbered blocks are stored in kSSTStoringEvenKeys.
+          auto key_id_str = kBlockKeyPrefix + std::to_string(key_id);
+          if (fd_id == kSSTStoringOddKeys) {
+            if (key_id % 2 == 1) {
+              AssertBlockAccessInfo(key_id, type, block_access_info_map);
+            } else {
+              ASSERT_TRUE(block_access_info_map.find(key_id_str) ==
+                          block_access_info_map.end());
+            }
+          } else {
+            if (key_id % 2 == 1) {
+              ASSERT_TRUE(block_access_info_map.find(key_id_str) ==
+                          block_access_info_map.end());
+            } else {
+              AssertBlockAccessInfo(key_id, type, block_access_info_map);
+            }
+          }
+          key_id++;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+#endif  // GFLAG
+#else
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "block_cache_trace_analyzer_test is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc
new file mode 100644
index 000000000..44fec5598
--- /dev/null
+++ b/src/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc
@@ -0,0 +1,25 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else  // GFLAGS
+#include "tools/block_cache_analyzer/block_cache_trace_analyzer.h"
+int main(int argc, char** argv) {
+  return ROCKSDB_NAMESPACE::block_cache_trace_analyzer_tool(argc, argv);
+}
+#endif  // GFLAGS
+#else   // ROCKSDB_LITE
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/check_all_python.py b/src/rocksdb/tools/check_all_python.py
new file mode 100755
index 000000000..708339a67
--- /dev/null
+++ b/src/rocksdb/tools/check_all_python.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import glob
+
+# Checks that all python files in the repository are at least free of syntax
+# errors. This provides a minimal pre-/post-commit check for python file
+# modifications.
+
+filenames = []
+# Avoid scanning all of ./ because there might be other external repos
+# linked in.
+for base in ["buckifier", "build_tools", "coverage", "tools"]:
+    # Clean this up when we finally upgrade to Python 3
+    for suff in ["*", "*/*", "*/*/*"]:
+        filenames += glob.glob(base + "/" + suff + ".py")
+
+for filename in filenames:
+    source = open(filename, "r").read() + "\n"
+    # Parses and syntax checks the file, throwing on error. (No pyc written.)
+    _ = compile(source, filename, "exec")
+
+print("No syntax errors in {0} .py files".format(len(filenames)))
diff --git a/src/rocksdb/tools/check_format_compatible.sh b/src/rocksdb/tools/check_format_compatible.sh
new file mode 100755
index 000000000..8a3f1b379
--- /dev/null
+++ b/src/rocksdb/tools/check_format_compatible.sh
@@ -0,0 +1,379 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to build and run different versions of ldb to check for
+# expected forward and backward compatibility with "current" version. The
+# working copy must have no uncommitted changes.
+#
+# Usage: <SCRIPT> [ref_for_current]
+# `ref_for_current` can be a revision, tag, commit or branch name. Default is HEAD.
+#
+# Return value 0 means all regression tests pass. 1 if not pass.
+#
+# Environment options:
+#  SHORT_TEST=1 - Test only the oldest branch for each kind of test. This is
+#    a good choice for PR validation as it is relatively fast and will find
+#    most issues.
+#  USE_SSH=1 - Connect to GitHub with ssh instead of https
+
+if ! git diff-index --quiet HEAD; then
+  echo "You have uncommitted changes. Aborting."
+  exit 1
+fi
+
+current_checkout_name=${1:-HEAD}
+# This allows the script to work even if with transient refs like "HEAD"
+current_checkout_hash="$(git rev-parse --quiet --verify $current_checkout_name)"
+
+if [ "$current_checkout_hash" == "" ]; then
+  echo "Not a recognized ref: $current_checkout_name"
+  exit 1
+fi
+
+# To restore to prior branch at the end
+orig_branch="$(git rev-parse --abbrev-ref HEAD)"
+tmp_branch=_tmp_format_compatible
+tmp_origin=_tmp_origin
+
+# Don't depend on what current "origin" might be
+set -e
+git remote remove $tmp_origin 2>/dev/null || true
+if [ "$USE_SSH" ]; then
+  git remote add $tmp_origin "git@github.com:facebook/rocksdb.git"
+else
+  git remote add $tmp_origin "https://github.com/facebook/rocksdb.git"
+fi
+git fetch $tmp_origin
+
+# Used in building some ancient RocksDB versions where by default it tries to
+# use a precompiled libsnappy.a checked in to the repo.
+export SNAPPY_LDFLAGS=-lsnappy
+
+cleanup() {
+  echo "== Cleaning up"
+  git reset --hard || true
+  git checkout "$orig_branch" || true
+  git branch -D $tmp_branch || true
+  git remote remove $tmp_origin || true
+}
+trap cleanup EXIT # Always clean up, even on failure or Ctrl+C
+
+scriptpath=`dirname ${BASH_SOURCE[0]}`
+
+test_dir=${TEST_TMPDIR:-"/tmp"}"/rocksdb_format_compatible_$USER"
+rm -rf ${test_dir:?}
+
+# Prevent 'make clean' etc. from wiping out test_dir
+export TEST_TMPDIR=$test_dir"/misc"
+
+# For saving current version of scripts as we checkout different versions to test
+script_copy_dir=$test_dir"/script_copy"
+mkdir -p $script_copy_dir
+cp -f $scriptpath/*.sh $script_copy_dir
+
+# For shared raw input data
+input_data_path=$test_dir"/test_data_input"
+mkdir -p $input_data_path
+# For external sst ingestion test
+ext_test_dir=$test_dir"/ext"
+mkdir -p $ext_test_dir
+# For DB dump test
+db_test_dir=$test_dir"/db"
+mkdir -p $db_test_dir
+# For backup/restore test (uses DB test)
+bak_test_dir=$test_dir"/bak"
+mkdir -p $bak_test_dir
+
+python_bin=$(which python3 || which python || echo python3)
+
+# Generate random files.
+for i in {1..6}
+do
+  input_data[$i]=$input_data_path/data$i
+  echo == Generating random input file ${input_data[$i]}
+  $python_bin - <<EOF
+import random
+random.seed($i)
+symbols=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+with open('${input_data[$i]}', 'w') as f:
+  for i in range(1,1024):
+    k = ""
+    for j in range(1, random.randint(1,32)):
+      k=k + symbols[random.randint(0, len(symbols) - 1)]
+    vb = ""
+    for j in range(1, random.randint(0,128)):
+      vb = vb + symbols[random.randint(0, len(symbols) - 1)]
+    v = ""
+    for j in range(1, random.randint(1, 5)):
+      v = v + vb
+    print(k + " ==> " + v, file=f)
+EOF
+done
+
+# Generate file(s) with sorted keys.
+sorted_input_data=$input_data_path/sorted_data
+echo == Generating file with sorted keys ${sorted_input_data}
+$python_bin - <<EOF
+with open('${sorted_input_data}', 'w') as f:
+  for i in range(0,10):
+    k = str(i)
+    v = "value" + k
+    print(k + " ==> " + v, file=f)
+EOF
+
+# db_backward_only_refs defined below the rest
+
+# To check for DB forward compatibility with loading options (old version
+# reading data from new), as well as backward compatibility
+declare -a db_forward_with_options_refs=("6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb")
+# To check for DB forward compatibility without loading options (in addition
+# to the "with loading options" set), as well as backward compatibility
+declare -a db_forward_no_options_refs=() # N/A at the moment
+
+# To check for SST ingestion backward compatibility (new version reading
+# data from old) (ldb ingest_extern_sst added in 5.16.x, back-ported to
+# 5.14.x, 5.15.x)
+declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb")
+# To check for SST ingestion forward compatibility (old version reading
+# data from new) as well as backward compatibility
+declare -a ext_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_with_options_refs[@]}")
+
+# To check for backup backward compatibility (new version reading data
+# from old) (ldb backup/restore added in 4.11.x)
+declare -a bak_backward_only_refs=("4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb" "5.7.fb" "5.8.fb" "5.9.fb" "5.10.fb" "5.11.fb" "5.12.fb" "5.13.fb" "${ext_backward_only_refs[@]}")
+# To check for backup forward compatibility (old version reading data
+# from new) as well as backward compatibility
+declare -a bak_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_with_options_refs[@]}")
+
+# Branches (git refs) to check for DB backward compatibility (new version
+# reading data from old) (in addition to the "forward compatible" list)
+# NOTE: 2.7.fb.branch shows assertion violation in some configurations
+declare -a db_backward_only_refs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.7.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "${bak_backward_only_refs[@]}")
+
+if [ "$SHORT_TEST" ]; then
+  # Use only the first (if exists) of each list
+  db_backward_only_refs=(${db_backward_only_refs[0]})
+  db_forward_no_options_refs=(${db_forward_no_options_refs[0]})
+  db_forward_with_options_refs=(${db_forward_with_options_refs[0]})
+  ext_backward_only_refs=(${ext_backward_only_refs[0]})
+  ext_forward_refs=(${ext_forward_refs[0]})
+  bak_backward_only_refs=(${bak_backward_only_refs[0]})
+  bak_forward_refs=(${bak_forward_refs[0]})
+fi
+
+# De-duplicate & accumulate
+declare -a checkout_refs=()
+for checkout_ref in "${db_backward_only_refs[@]}" "${db_forward_no_options_refs[@]}" "${db_forward_with_options_refs[@]}" "${ext_backward_only_refs[@]}" "${ext_forward_refs[@]}" "${bak_backward_only_refs[@]}" "${bak_forward_refs[@]}"
+do
+  if [ ! -e $db_test_dir/$checkout_ref ]; then
+    mkdir -p $db_test_dir/$checkout_ref
+    checkout_refs+=($checkout_ref)
+  fi
+done
+
+generate_db()
+{
+    set +e
+    bash "$script_copy_dir"/generate_random_db.sh "$1" "$2"
+    if [ $? -ne 0 ]; then
+        echo ==== Error loading data from $2 to $1 ====
+        exit 1
+    fi
+    set -e
+}
+
+compare_db()
+{
+    set +e
+    bash "$script_copy_dir"/verify_random_db.sh "$1" "$2" "$3" "$4" "$5"
+    if [ $? -ne 0 ]; then
+        echo ==== Read different content from $1 and $2 or error happened. ====
+        exit 1
+    fi
+    set -e
+}
+
+write_external_sst()
+{
+    set +e
+    bash "$script_copy_dir"/write_external_sst.sh "$1" "$2" "$3"
+    if [ $? -ne 0 ]; then
+        echo ==== Error writing external SST file using data from $1 to $3 ====
+        exit 1
+    fi
+    set -e
+}
+
+ingest_external_sst()
+{
+    set +e
+    bash "$script_copy_dir"/ingest_external_sst.sh "$1" "$2"
+    if [ $? -ne 0 ]; then
+        echo ==== Error ingesting external SST in $2 to DB at $1 ====
+        exit 1
+    fi
+    set -e
+}
+
+backup_db()
+{
+    set +e
+    bash "$script_copy_dir"/backup_db.sh "$1" "$2"
+    if [ $? -ne 0 ]; then
+        echo ==== Error backing up DB $1 to $2 ====
+        exit 1
+    fi
+    set -e
+}
+
+restore_db()
+{
+    set +e
+    bash "$script_copy_dir"/restore_db.sh "$1" "$2"
+    if [ $? -ne 0 ]; then
+        echo ==== Error restoring from $1 to $2 ====
+        exit 1
+    fi
+    set -e
+}
+
+member_of_array()
+{
+  local e match="$1"
+  shift
+  for e; do [[ "$e" == "$match" ]] && return 0; done
+  return 1
+}
+
+force_no_fbcode()
+{
+  # Not all branches recognize ROCKSDB_NO_FBCODE and we should not need
+  # to patch old branches for changes to available FB compilers.
+  sed -i -e 's|-d /mnt/gvfs/third-party|"$ROCKSDB_FORCE_FBCODE"|' build_tools/build_detect_platform
+}
+
+# General structure from here:
+# * Check out, build, and do stuff with the "current" branch.
+# * For each older branch under consideration,
+#   * Check out, build, and do stuff with it, potentially using data
+#     generated from "current" branch.
+# * (Again) check out, build, and do (other) stuff with the "current"
+#    branch, potentially using data from older branches.
+#
+# This way, we only do at most n+1 checkout+build steps, without the
+# need to stash away executables.
+
+# Decorate name
+current_checkout_name="$current_checkout_name ($current_checkout_hash)"
+
+echo "== Building $current_checkout_name debug"
+git checkout -B $tmp_branch $current_checkout_hash
+force_no_fbcode
+make clean
+DISABLE_WARNING_AS_ERROR=1 make ldb -j32
+
+echo "== Using $current_checkout_name, generate DB with extern SST and ingest"
+current_ext_test_dir=$ext_test_dir"/current"
+write_external_sst $input_data_path ${current_ext_test_dir}_pointless $current_ext_test_dir
+ingest_external_sst ${current_ext_test_dir}_ingest $current_ext_test_dir
+
+echo "== Generating DB from $current_checkout_name ..."
+current_db_test_dir=$db_test_dir"/current"
+generate_db $input_data_path $current_db_test_dir
+
+echo "== Creating backup of DB from $current_checkout_name ..."
+current_bak_test_dir=$bak_test_dir"/current"
+backup_db $current_db_test_dir $current_bak_test_dir
+
+for checkout_ref in "${checkout_refs[@]}"
+do
+  echo "== Building $checkout_ref debug"
+  git reset --hard $tmp_origin/$checkout_ref
+  force_no_fbcode
+  make clean
+  DISABLE_WARNING_AS_ERROR=1 make ldb -j32
+
+  # We currently assume DB backward compatibility for every branch listed
+  echo "== Use $checkout_ref to generate a DB ..."
+  generate_db $input_data_path $db_test_dir/$checkout_ref
+
+  if member_of_array "$checkout_ref" "${ext_backward_only_refs[@]}" ||
+    member_of_array "$checkout_ref" "${ext_forward_refs[@]}"
+  then
+    echo "== Use $checkout_ref to generate DB with extern SST file"
+    write_external_sst $input_data_path $ext_test_dir/${checkout_ref}_pointless $ext_test_dir/$checkout_ref
+  fi
+
+  if member_of_array "$checkout_ref" "${ext_forward_refs[@]}"
+  then
+    echo "== Use $checkout_ref to ingest extern SST file and compare vs. $current_checkout_name"
+    ingest_external_sst $ext_test_dir/${checkout_ref}_ingest $ext_test_dir/$checkout_ref
+    compare_db $ext_test_dir/${checkout_ref}_ingest ${current_ext_test_dir}_ingest db_dump.txt 1 1
+
+    rm -rf ${ext_test_dir:?}/${checkout_ref}_ingest
+    echo "== Use $checkout_ref to ingest extern SST file from $current_checkout_name"
+    ingest_external_sst $ext_test_dir/${checkout_ref}_ingest $current_ext_test_dir
+    compare_db $ext_test_dir/${checkout_ref}_ingest ${current_ext_test_dir}_ingest db_dump.txt 1 1
+  fi
+
+  if member_of_array "$checkout_ref" "${db_forward_no_options_refs[@]}" ||
+    member_of_array "$checkout_ref" "${db_forward_with_options_refs[@]}"
+  then
+    echo "== Use $checkout_ref to open DB generated using $current_checkout_name..."
+    compare_db $db_test_dir/$checkout_ref $current_db_test_dir forward_${checkout_ref}_dump.txt 0
+  fi
+
+  if member_of_array "$checkout_ref" "${db_forward_with_options_refs[@]}"
+  then
+    echo "== Use $checkout_ref to open DB generated using $current_checkout_name with its options..."
+    compare_db $db_test_dir/$checkout_ref $current_db_test_dir forward_${checkout_ref}_dump.txt 1 1
+  fi
+
+  if member_of_array "$checkout_ref" "${bak_backward_only_refs[@]}" ||
+    member_of_array "$checkout_ref" "${bak_forward_refs[@]}"
+  then
+    echo "== Use $checkout_ref to backup DB"
+    backup_db $db_test_dir/$checkout_ref $bak_test_dir/$checkout_ref
+  fi
+
+  if member_of_array "$checkout_ref" "${bak_forward_refs[@]}"
+  then
+    echo "== Use $checkout_ref to restore DB from $current_checkout_name"
+    rm -rf ${db_test_dir:?}/$checkout_ref
+    restore_db $current_bak_test_dir $db_test_dir/$checkout_ref
+    compare_db $db_test_dir/$checkout_ref $current_db_test_dir forward_${checkout_ref}_dump.txt 0
+  fi
+done
+
+echo "== Building $current_checkout_name debug (again, final)"
+git reset --hard $current_checkout_hash
+force_no_fbcode
+make clean
+DISABLE_WARNING_AS_ERROR=1 make ldb -j32
+
+for checkout_ref in "${checkout_refs[@]}"
+do
+  # We currently assume DB backward compatibility for every branch listed
+  echo "== Use $current_checkout_name to open DB generated using $checkout_ref..."
+  compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump.txt 1 0
+
+  if member_of_array "$checkout_ref" "${ext_backward_only_refs[@]}" ||
+    member_of_array "$checkout_ref" "${ext_forward_refs[@]}"
+  then
+    rm -rf ${ext_test_dir:?}/${checkout_ref}_ingest
+    echo "== Use $current_checkout_name to ingest extern SST file from $checkout_ref"
+    ingest_external_sst $ext_test_dir/${checkout_ref}_ingest $current_ext_test_dir
+    compare_db $ext_test_dir/${checkout_ref}_ingest ${current_ext_test_dir}_ingest db_dump.txt 1 1
+  fi
+
+  if member_of_array "$checkout_ref" "${bak_backward_only_refs[@]}" ||
+    member_of_array "$checkout_ref" "${bak_forward_refs[@]}"
+  then
+    echo "== Use $current_checkout_name to restore DB from $checkout_ref"
+    rm -rf ${db_test_dir:?}/$checkout_ref
+    restore_db $bak_test_dir/$checkout_ref $db_test_dir/$checkout_ref
+    compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump.txt 1 0
+  fi
+done
+
+echo ==== Compatibility Test PASSED ====
diff --git a/src/rocksdb/tools/db_bench.cc b/src/rocksdb/tools/db_bench.cc
new file mode 100644
index 000000000..f13de83fe
--- /dev/null
+++ b/src/rocksdb/tools/db_bench.cc
@@ -0,0 +1,21 @@
+//  Copyright (c) 2013-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+#include "rocksdb/db_bench_tool.h"
+int main(int argc, char** argv) {
+  return ROCKSDB_NAMESPACE::db_bench_tool(argc, argv);
+}
+#endif  // GFLAGS
diff --git a/src/rocksdb/tools/db_bench_tool.cc b/src/rocksdb/tools/db_bench_tool.cc
new file mode 100644
index 000000000..7182528b3
--- /dev/null
+++ b/src/rocksdb/tools/db_bench_tool.cc
@@ -0,0 +1,8707 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifdef GFLAGS
+#ifdef NUMA
+#include <numa.h>
+#endif
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#ifdef __APPLE__
+#include <mach/host_info.h>
+#include <mach/mach_host.h>
+#include <sys/sysctl.h>
+#endif
+#ifdef __FreeBSD__
+#include <sys/sysctl.h>
+#endif
+#include <atomic>
+#include <cinttypes>
+#include <condition_variable>
+#include <cstddef>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <unordered_map>
+
+#include "db/db_impl/db_impl.h"
+#include "db/malloc_stats.h"
+#include "db/version_set.h"
+#include "monitoring/histogram.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/secondary_cache.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/stats_history.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/options_type.h"
+#include "rocksdb/utilities/options_util.h"
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/replayer.h"
+#endif  // ROCKSDB_LITE
+#include "rocksdb/utilities/sim_cache.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "tools/simulated_hybrid_file_system.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/file_checksum_helper.h"
+#include "util/gflags_compat.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stderr_logger.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+#include "utilities/blob_db/blob_db.h"
+#include "utilities/counted_fs.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/bytesxor.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/persistent_cache/block_cache_tier.h"
+
+#ifdef MEMKIND
+#include "memory/memkind_kmem_allocator.h"
+#endif
+
+#ifdef OS_WIN
+#include <io.h>  // open/close
+#endif
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+using GFLAGS_NAMESPACE::SetVersionString;
+
+#ifdef ROCKSDB_LITE
+#define IF_ROCKSDB_LITE(Then, Else) Then
+#else
+#define IF_ROCKSDB_LITE(Then, Else) Else
+#endif
+
+DEFINE_string(
+    benchmarks,
+    "fillseq,"
+    "fillseqdeterministic,"
+    "fillsync,"
+    "fillrandom,"
+    "filluniquerandomdeterministic,"
+    "overwrite,"
+    "readrandom,"
+    "newiterator,"
+    "newiteratorwhilewriting,"
+    "seekrandom,"
+    "seekrandomwhilewriting,"
+    "seekrandomwhilemerging,"
+    "readseq,"
+    "readreverse,"
+    "compact,"
+    "compactall,"
+    "flush,"
+IF_ROCKSDB_LITE("",
+    "compact0,"
+    "compact1,"
+    "waitforcompaction,"
+)
+    "multireadrandom,"
+    "mixgraph,"
+    "readseq,"
+    "readtorowcache,"
+    "readtocache,"
+    "readreverse,"
+    "readwhilewriting,"
+    "readwhilemerging,"
+    "readwhilescanning,"
+    "readrandomwriterandom,"
+    "updaterandom,"
+    "xorupdaterandom,"
+    "approximatesizerandom,"
+    "randomwithverify,"
+    "fill100K,"
+    "crc32c,"
+    "xxhash,"
+    "xxhash64,"
+    "xxh3,"
+    "compress,"
+    "uncompress,"
+    "acquireload,"
+    "fillseekseq,"
+    "randomtransaction,"
+    "randomreplacekeys,"
+    "timeseries,"
+    "getmergeoperands,",
+    "readrandomoperands,"
+    "backup,"
+    "restore"
+
+    "Comma-separated list of operations to run in the specified"
+    " order. Available benchmarks:\n"
+    "\tfillseq       -- write N values in sequential key"
+    " order in async mode\n"
+    "\tfillseqdeterministic       -- write N values in the specified"
+    " key order and keep the shape of the LSM tree\n"
+    "\tfillrandom    -- write N values in random key order in async"
+    " mode\n"
+    "\tfilluniquerandomdeterministic       -- write N values in a random"
+    " key order and keep the shape of the LSM tree\n"
+    "\toverwrite     -- overwrite N values in random key order in "
+    "async mode\n"
+    "\tfillsync      -- write N/1000 values in random key order in "
+    "sync mode\n"
+    "\tfill100K      -- write N/1000 100K values in random order in"
+    " async mode\n"
+    "\tdeleteseq     -- delete N keys in sequential order\n"
+    "\tdeleterandom  -- delete N keys in random order\n"
+    "\treadseq       -- read N times sequentially\n"
+    "\treadtocache   -- 1 thread reading database sequentially\n"
+    "\treadreverse   -- read N times in reverse order\n"
+    "\treadrandom    -- read N times in random order\n"
+    "\treadmissing   -- read N missing keys in random order\n"
+    "\treadwhilewriting      -- 1 writer, N threads doing random "
+    "reads\n"
+    "\treadwhilemerging      -- 1 merger, N threads doing random "
+    "reads\n"
+    "\treadwhilescanning     -- 1 thread doing full table scan, "
+    "N threads doing random reads\n"
+    "\treadrandomwriterandom -- N threads doing random-read, "
+    "random-write\n"
+    "\tupdaterandom  -- N threads doing read-modify-write for random "
+    "keys\n"
+    "\txorupdaterandom  -- N threads doing read-XOR-write for "
+    "random keys\n"
+    "\tappendrandom  -- N threads doing read-modify-write with "
+    "growing values\n"
+    "\tmergerandom   -- same as updaterandom/appendrandom using merge"
+    " operator. "
+    "Must be used with merge_operator\n"
+    "\treadrandommergerandom -- perform N random read-or-merge "
+    "operations. Must be used with merge_operator\n"
+    "\tnewiterator   -- repeated iterator creation\n"
+    "\tseekrandom    -- N random seeks, call Next seek_nexts times "
+    "per seek\n"
+    "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
+    "overwrite\n"
+    "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
+    "merge\n"
+    "\tcrc32c        -- repeated crc32c of <block size> data\n"
+    "\txxhash        -- repeated xxHash of <block size> data\n"
+    "\txxhash64      -- repeated xxHash64 of <block size> data\n"
+    "\txxh3          -- repeated XXH3 of <block size> data\n"
+    "\tacquireload   -- load N*1000 times\n"
+    "\tfillseekseq   -- write N values in sequential key, then read "
+    "them by seeking to each key\n"
+    "\trandomtransaction     -- execute N random transactions and "
+    "verify correctness\n"
+    "\trandomreplacekeys     -- randomly replaces N keys by deleting "
+    "the old version and putting the new version\n\n"
+    "\ttimeseries            -- 1 writer generates time series data "
+    "and multiple readers doing random reads on id\n\n"
+    "Meta operations:\n"
+    "\tcompact     -- Compact the entire DB; If multiple, randomly choose one\n"
+    "\tcompactall  -- Compact the entire DB\n"
+IF_ROCKSDB_LITE("",
+    "\tcompact0  -- compact L0 into L1\n"
+    "\tcompact1  -- compact L1 into L2\n"
+    "\twaitforcompaction - pause until compaction is (probably) done\n"
+)
+    "\tflush - flush the memtable\n"
+    "\tstats       -- Print DB stats\n"
+    "\tresetstats  -- Reset DB stats\n"
+    "\tlevelstats  -- Print the number of files and bytes per level\n"
+    "\tmemstats  -- Print memtable stats\n"
+    "\tsstables    -- Print sstable info\n"
+    "\theapprofile -- Dump a heap profile (if supported by this port)\n"
+IF_ROCKSDB_LITE("",
+    "\treplay      -- replay the trace file specified with trace_file\n"
+)
+    "\tgetmergeoperands -- Insert lots of merge records which are a list of "
+    "sorted ints for a key and then compare performance of lookup for another "
+    "key by doing a Get followed by binary searching in the large sorted list "
+    "vs doing a GetMergeOperands and binary searching in the operands which "
+    "are sorted sub-lists. The MergeOperator used is sortlist.h\n"
+    "\treadrandomoperands -- read random keys using `GetMergeOperands()`. An "
+    "operation includes a rare but possible retry in case it got "
+    "`Status::Incomplete()`. This happens upon encountering more keys than "
+    "have ever been seen by the thread (or eight initially)\n"
+    "\tbackup --  Create a backup of the current DB and verify that a new backup is corrected. "
+    "Rate limit can be specified through --backup_rate_limit\n"
+    "\trestore -- Restore the DB from the latest backup available, rate limit can be specified through --restore_rate_limit\n");
+
+DEFINE_int64(num, 1000000, "Number of key/values to place in database");
+
+DEFINE_int64(numdistinct, 1000,
+             "Number of distinct keys to use. Used in RandomWithVerify to "
+             "read/write on fewer keys so that gets are more likely to find the"
+             " key and puts are more likely to update the same key");
+
+DEFINE_int64(merge_keys, -1,
+             "Number of distinct keys to use for MergeRandom and "
+             "ReadRandomMergeRandom. "
+             "If negative, there will be FLAGS_num keys.");
+DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
+
+DEFINE_int32(
+    num_hot_column_families, 0,
+    "Number of Hot Column Families. If more than 0, only write to this "
+    "number of column families. After finishing all the writes to them, "
+    "create new set of column families and insert to them. Only used "
+    "when num_column_families > 1.");
+
+DEFINE_string(column_family_distribution, "",
+              "Comma-separated list of percentages, where the ith element "
+              "indicates the probability of an op using the ith column family. "
+              "The number of elements must be `num_hot_column_families` if "
+              "specified; otherwise, it must be `num_column_families`. The "
+              "sum of elements must be 100. E.g., if `num_column_families=4`, "
+              "and `num_hot_column_families=0`, a valid list could be "
+              "\"10,20,30,40\".");
+
+DEFINE_int64(reads, -1,
+             "Number of read operations to do.  "
+             "If negative, do FLAGS_num reads.");
+
+DEFINE_int64(deletes, -1,
+             "Number of delete operations to do.  "
+             "If negative, do FLAGS_num deletions.");
+
+DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
+
+DEFINE_int64(seed, 0,
+             "Seed base for random number generators. "
+             "When 0 it is derived from the current time.");
+static int64_t seed_base;
+
+DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
+
+DEFINE_int32(duration, 0,
+             "Time in seconds for the random-ops tests to run."
+             " When 0 then num & reads determine the test duration");
+
+DEFINE_string(value_size_distribution_type, "fixed",
+              "Value size distribution type: fixed, uniform, normal");
+
+DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
+static unsigned int value_size = 100;
+
+DEFINE_int32(value_size_min, 100, "Min size of random value");
+
+DEFINE_int32(value_size_max, 102400, "Max size of random value");
+
+DEFINE_int32(seek_nexts, 0,
+             "How many times to call Next() after Seek() in "
+             "fillseekseq, seekrandom, seekrandomwhilewriting and "
+             "seekrandomwhilemerging");
+
+DEFINE_bool(reverse_iterator, false,
+            "When true use Prev rather than Next for iterators that do "
+            "Seek and then Next");
+
+DEFINE_bool(auto_prefix_mode, false, "Set auto_prefix_mode for seek benchmark");
+
+DEFINE_int64(max_scan_distance, 0,
+             "Used to define iterate_upper_bound (or iterate_lower_bound "
+             "if FLAGS_reverse_iterator is set to true) when value is nonzero");
+
+DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
+
+DEFINE_int64(batch_size, 1, "Batch size");
+
+static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
+  return true;
+}
+
+static bool ValidateUint32Range(const char* flagname, uint64_t value) {
+  if (value > std::numeric_limits<uint32_t>::max()) {
+    fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
+            (unsigned long)value);
+    return false;
+  }
+  return true;
+}
+
+DEFINE_int32(key_size, 16, "size of each key");
+
+DEFINE_int32(user_timestamp_size, 0,
+             "number of bytes in a user-defined timestamp");
+
+DEFINE_int32(num_multi_db, 0,
+             "Number of DBs used in the benchmark. 0 means single DB.");
+
+DEFINE_double(compression_ratio, 0.5,
+              "Arrange to generate values that shrink to this fraction of "
+              "their original size after compression");
+
+DEFINE_double(
+    overwrite_probability, 0.0,
+    "Used in 'filluniquerandom' benchmark: for each write operation, "
+    "we give a probability to perform an overwrite instead. The key used for "
+    "the overwrite is randomly chosen from the last 'overwrite_window_size' "
+    "keys previously inserted into the DB. "
+    "Valid overwrite_probability values: [0.0, 1.0].");
+
+DEFINE_uint32(overwrite_window_size, 1,
+              "Used in 'filluniquerandom' benchmark. For each write operation,"
+              " when the overwrite_probability flag is set by the user, the "
+              "key used to perform an overwrite is randomly chosen from the "
+              "last 'overwrite_window_size' keys previously inserted into DB. "
+              "Warning: large values can affect throughput. "
+              "Valid overwrite_window_size values: [1, kMaxUint32].");
+
+DEFINE_uint64(
+    disposable_entries_delete_delay, 0,
+    "Minimum delay in microseconds for the series of Deletes "
+    "to be issued. When 0 the insertion of the last disposable entry is "
+    "immediately followed by the issuance of the Deletes. "
+    "(only compatible with fillanddeleteuniquerandom benchmark).");
+
+DEFINE_uint64(disposable_entries_batch_size, 0,
+              "Number of consecutively inserted disposable KV entries "
+              "that will be deleted after 'delete_delay' microseconds. "
+              "A series of Deletes is always issued once all the "
+              "disposable KV entries it targets have been inserted "
+              "into the DB. When 0 no deletes are issued and a "
+              "regular 'filluniquerandom' benchmark occurs. "
+              "(only compatible with fillanddeleteuniquerandom benchmark)");
+
+DEFINE_int32(disposable_entries_value_size, 64,
+             "Size of the values (in bytes) of the entries targeted by "
+             "selective deletes. "
+             "(only compatible with fillanddeleteuniquerandom benchmark)");
+
+DEFINE_uint64(
+    persistent_entries_batch_size, 0,
+    "Number of KV entries being inserted right before the deletes "
+    "targeting the disposable KV entries are issued. These "
+    "persistent keys are not targeted by the deletes, and will always "
+    "remain valid in the DB. (only compatible with "
+    "--benchmarks='fillanddeleteuniquerandom' "
+    "and used when--disposable_entries_batch_size is > 0).");
+
+DEFINE_int32(persistent_entries_value_size, 64,
+             "Size of the values (in bytes) of the entries not targeted by "
+             "deletes. (only compatible with "
+             "--benchmarks='fillanddeleteuniquerandom' "
+             "and used when--disposable_entries_batch_size is > 0).");
+
+DEFINE_double(read_random_exp_range, 0.0,
+              "Read random's key will be generated using distribution of "
+              "num * exp(-r) where r is uniform number from 0 to this value. "
+              "The larger the number is, the more skewed the reads are. "
+              "Only used in readrandom and multireadrandom benchmarks.");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_bool(confidence_interval_only, false,
+            "Print 95% confidence interval upper and lower bounds only for "
+            "aggregate stats.");
+
+DEFINE_bool(enable_numa, false,
+            "Make operations aware of NUMA architecture and bind memory "
+            "and cpus corresponding to nodes together. In NUMA, memory "
+            "in same node as CPUs are closer when compared to memory in "
+            "other nodes. Reads can be faster when the process is bound to "
+            "CPU and memory of same node. Use \"$numactl --hardware\" command "
+            "to see NUMA memory architecture.");
+
+DEFINE_int64(db_write_buffer_size,
+             ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
+             "Number of bytes to buffer in all memtables before compacting");
+
+DEFINE_bool(cost_write_buffer_to_cache, false,
+            "The usage of memtable is costed to the block cache");
+
+DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size,
+             "The size, in bytes, of one block in arena memory allocation.");
+
+DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
+             "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+             ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
+             "The number of in-memory memtables. Each memtable is of size"
+             " write_buffer_size bytes.");
+
+DEFINE_int32(min_write_buffer_number_to_merge,
+             ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
+             "The minimum number of write buffers that will be merged together"
+             "before writing to storage. This is cheap because it is an"
+             "in-memory merge. If this feature is not enabled, then all these"
+             "write buffers are flushed to L0 as separate files and this "
+             "increases read amplification because a get request has to check"
+             " in all of these files. Also, an in-memory merge may result in"
+             " writing less data to storage if there are duplicate records "
+             " in each of these individual write buffers.");
+
+DEFINE_int32(max_write_buffer_number_to_maintain,
+             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
+             "The total maximum number of write buffers to maintain in memory "
+             "including copies of buffers that have already been flushed. "
+             "Unlike max_write_buffer_number, this parameter does not affect "
+             "flushing. This controls the minimum amount of write history "
+             "that will be available in memory for conflict checking when "
+             "Transactions are used. If this value is too low, some "
+             "transactions may fail at commit time due to not being able to "
+             "determine whether there were any write conflicts. Setting this "
+             "value to 0 will cause write buffers to be freed immediately "
+             "after they are flushed.  If this value is set to -1, "
+             "'max_write_buffer_number' will be used.");
+
+DEFINE_int64(max_write_buffer_size_to_maintain,
+             ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
+             "The total maximum size of write buffers to maintain in memory "
+             "including copies of buffers that have already been flushed. "
+             "Unlike max_write_buffer_number, this parameter does not affect "
+             "flushing. This controls the minimum amount of write history "
+             "that will be available in memory for conflict checking when "
+             "Transactions are used. If this value is too low, some "
+             "transactions may fail at commit time due to not being able to "
+             "determine whether there were any write conflicts. Setting this "
+             "value to 0 will cause write buffers to be freed immediately "
+             "after they are flushed.  If this value is set to -1, "
+             "'max_write_buffer_number' will be used.");
+
+DEFINE_int32(max_background_jobs,
+             ROCKSDB_NAMESPACE::Options().max_background_jobs,
+             "The maximum number of concurrent background jobs that can occur "
+             "in parallel.");
+
+DEFINE_int32(num_bottom_pri_threads, 0,
+             "The number of threads in the bottom-priority thread pool (used "
+             "by universal compaction only).");
+
+DEFINE_int32(num_high_pri_threads, 0,
+             "The maximum number of concurrent background compactions"
+             " that can occur in parallel.");
+
+DEFINE_int32(num_low_pri_threads, 0,
+             "The maximum number of concurrent background compactions"
+             " that can occur in parallel.");
+
+DEFINE_int32(max_background_compactions,
+             ROCKSDB_NAMESPACE::Options().max_background_compactions,
+             "The maximum number of concurrent background compactions"
+             " that can occur in parallel.");
+
+DEFINE_uint64(subcompactions, 1,
+              "Maximum number of subcompactions to divide L0-L1 compactions "
+              "into.");
+static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
+
+DEFINE_int32(max_background_flushes,
+             ROCKSDB_NAMESPACE::Options().max_background_flushes,
+             "The maximum number of concurrent background flushes"
+             " that can occur in parallel.");
+
+static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
+DEFINE_int32(compaction_style,
+             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
+             "style of compaction: level-based, universal and fifo");
+
+static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
+DEFINE_int32(compaction_pri,
+             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
+             "priority of files to compaction: by size or by data age");
+
+DEFINE_int32(universal_size_ratio, 0,
+             "Percentage flexibility while comparing file size "
+             "(for universal compaction only).");
+
+DEFINE_int32(universal_min_merge_width, 0,
+             "The minimum number of files in a single compaction run "
+             "(for universal compaction only).");
+
+DEFINE_int32(universal_max_merge_width, 0,
+             "The max number of files to compact in universal style "
+             "compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+             "The max size amplification for universal style compaction");
+
+DEFINE_int32(universal_compression_size_percent, -1,
+             "The percentage of the database to compress for universal "
+             "compaction. -1 means compress everything.");
+
+DEFINE_bool(universal_allow_trivial_move, false,
+            "Allow trivial move in universal compaction.");
+
+DEFINE_bool(universal_incremental, false,
+            "Enable incremental compactions in universal compaction.");
+
+DEFINE_int64(cache_size, 8 << 20,  // 8MB
+             "Number of bytes to use as a cache of uncompressed data");
+
+DEFINE_int32(cache_numshardbits, -1,
+             "Number of shards for the block cache"
+             " is 2 ** cache_numshardbits. Negative means use default settings."
+             " This is applied only if FLAGS_cache_size is non-negative.");
+
+DEFINE_double(cache_high_pri_pool_ratio, 0.0,
+              "Ratio of block cache reserve for high pri blocks. "
+              "If > 0.0, we also enable "
+              "cache_index_and_filter_blocks_with_high_priority.");
+
+DEFINE_double(cache_low_pri_pool_ratio, 0.0,
+              "Ratio of block cache reserve for low pri blocks.");
+
+DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
+
+DEFINE_bool(use_compressed_secondary_cache, false,
+            "Use the CompressedSecondaryCache as the secondary cache.");
+
+DEFINE_int64(compressed_secondary_cache_size, 8 << 20,  // 8MB
+             "Number of bytes to use as a cache of data");
+
+DEFINE_int32(compressed_secondary_cache_numshardbits, 6,
+             "Number of shards for the block cache"
+             " is 2 ** compressed_secondary_cache_numshardbits."
+             " Negative means use default settings."
+             " This is applied only if FLAGS_cache_size is non-negative.");
+
+DEFINE_double(compressed_secondary_cache_high_pri_pool_ratio, 0.0,
+              "Ratio of block cache reserve for high pri blocks. "
+              "If > 0.0, we also enable "
+              "cache_index_and_filter_blocks_with_high_priority.");
+
+DEFINE_double(compressed_secondary_cache_low_pri_pool_ratio, 0.0,
+              "Ratio of block cache reserve for low pri blocks.");
+
+DEFINE_string(compressed_secondary_cache_compression_type, "lz4",
+              "The compression algorithm to use for large "
+              "values stored in CompressedSecondaryCache.");
+static enum ROCKSDB_NAMESPACE::CompressionType
+    FLAGS_compressed_secondary_cache_compression_type_e =
+        ROCKSDB_NAMESPACE::kLZ4Compression;
+
+DEFINE_uint32(
+    compressed_secondary_cache_compress_format_version, 2,
+    "compress_format_version can have two values: "
+    "compress_format_version == 1 -- decompressed size is not included"
+    " in the block header."
+    "compress_format_version == 2 -- decompressed size is included"
+    " in the block header in varint32 format.");
+
+DEFINE_int64(simcache_size, -1,
+             "Number of bytes to use as a simcache of "
+             "uncompressed data. Nagative value disables simcache.");
+
+DEFINE_bool(cache_index_and_filter_blocks, false,
+            "Cache index/filter blocks in block cache.");
+
+DEFINE_bool(use_cache_jemalloc_no_dump_allocator, false,
+            "Use JemallocNodumpAllocator for block/blob cache.");
+
+DEFINE_bool(use_cache_memkind_kmem_allocator, false,
+            "Use memkind kmem allocator for block/blob cache.");
+
+DEFINE_bool(partition_index_and_filters, false,
+            "Partition index and filter blocks.");
+
+DEFINE_bool(partition_index, false, "Partition index blocks");
+
+DEFINE_bool(index_with_first_key, false, "Include first key in the index");
+
+DEFINE_bool(
+    optimize_filters_for_memory,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory,
+    "Minimize memory footprint of filters");
+
+DEFINE_int64(
+    index_shortening_mode, 2,
+    "mode to shorten index: 0 for no shortening; 1 for only shortening "
+    "separaters; 2 for shortening shortening and successor");
+
+DEFINE_int64(metadata_block_size,
+             ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
+             "Max partition size when partitioning index/filters");
+
+// The default reduces the overhead of reading time with flash. With HDD, which
+// offers much less throughput, however, this number better to be set to 1.
+DEFINE_int32(ops_between_duration_checks, 1000,
+             "Check duration limit every x ops");
+
+DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
+            "Pin index/filter blocks of L0 files in block cache.");
+
+DEFINE_bool(
+    pin_top_level_index_and_filter, false,
+    "Pin top-level index of partitioned index/filter blocks in block cache.");
+
+DEFINE_int32(block_size,
+             static_cast<int32_t>(
+                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
+             "Number of bytes in a block.");
+
+DEFINE_int32(format_version,
+             static_cast<int32_t>(
+                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
+             "Format version of SST files.");
+
+DEFINE_int32(block_restart_interval,
+             ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
+             "Number of keys between restart points "
+             "for delta encoding of keys in data block.");
+
+DEFINE_int32(
+    index_block_restart_interval,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
+    "Number of keys between restart points "
+    "for delta encoding of keys in index block.");
+
+DEFINE_int32(read_amp_bytes_per_bit,
+             ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
+             "Number of bytes per bit to be used in block read-amp bitmap");
+
+DEFINE_bool(
+    enable_index_compression,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
+    "Compress the index block");
+
+DEFINE_bool(block_align,
+            ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
+            "Align data blocks on page size");
+
+DEFINE_int64(prepopulate_block_cache, 0,
+             "Pre-populate hot/warm blocks in block cache. 0 to disable and 1 "
+             "to insert during flush");
+
+DEFINE_bool(use_data_block_hash_index, false,
+            "if use kDataBlockBinaryAndHash "
+            "instead of kDataBlockBinarySearch. "
+            "This is valid if only we use BlockTable");
+
+DEFINE_double(data_block_hash_table_util_ratio, 0.75,
+              "util ratio for data block hash index table. "
+              "This is only valid if use_data_block_hash_index is "
+              "set to true");
+
+DEFINE_int64(compressed_cache_size, -1,
+             "Number of bytes to use as a cache of compressed data.");
+
+DEFINE_int64(row_cache_size, 0,
+             "Number of bytes to use as a cache of individual rows"
+             " (0 = disabled).");
+
+DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
+             "Maximum number of files to keep open at the same time"
+             " (use default if == 0)");
+
+DEFINE_int32(file_opening_threads,
+             ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
+             "If open_files is set to -1, this option set the number of "
+             "threads that will be used to open files during DB::Open()");
+
+DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
+
+DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");
+
+DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
+             "Maximum windows randomaccess buffer size");
+
+DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
+             "Maximum write buffer for Writable File");
+
+DEFINE_int32(bloom_bits, -1,
+             "Bloom filter bits per key. Negative means use default."
+             "Zero disables.");
+
+DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter");
+
+DEFINE_double(memtable_bloom_size_ratio, 0,
+              "Ratio of memtable size used for bloom filter. 0 means no bloom "
+              "filter.");
+DEFINE_bool(memtable_whole_key_filtering, false,
+            "Try to use whole key bloom filter in memtables.");
+DEFINE_bool(memtable_use_huge_page, false,
+            "Try to use huge page in memtables.");
+
+DEFINE_bool(whole_key_filtering,
+            ROCKSDB_NAMESPACE::BlockBasedTableOptions().whole_key_filtering,
+            "Use whole keys (in addition to prefixes) in SST bloom filter.");
+
+DEFINE_bool(use_existing_db, false,
+            "If true, do not destroy the existing database.  If you set this "
+            "flag and also specify a benchmark that wants a fresh database, "
+            "that benchmark will fail.");
+
+DEFINE_bool(use_existing_keys, false,
+            "If true, uses existing keys in the DB, "
+            "rather than generating new ones. This involves some startup "
+            "latency to load all keys into memory. It is supported for the "
+            "same read/overwrite benchmarks as `-use_existing_db=true`, which "
+            "must also be set for this flag to be enabled. When this flag is "
+            "set, the value for `-num` will be ignored.");
+
+DEFINE_bool(show_table_properties, false,
+            "If true, then per-level table"
+            " properties will be printed on every stats-interval when"
+            " stats_interval is set and stats_per_interval is on.");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
+DEFINE_bool(progress_reports, true,
+            "If true, db_bench will report number of finished operations.");
+
+// Read cache flags
+
+DEFINE_string(read_cache_path, "",
+              "If not empty string, a read cache will be used in this path");
+
+DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
+             "Maximum size of the read cache");
+
+DEFINE_bool(read_cache_direct_write, true,
+            "Whether to use Direct IO for writing to the read cache");
+
+DEFINE_bool(read_cache_direct_read, true,
+            "Whether to use Direct IO for reading from read cache");
+
+DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");
+
+static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
+  if (value >= 20) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
+            value);
+    return false;
+  }
+  return true;
+}
+
+DEFINE_bool(verify_checksum, true,
+            "Verify checksum for every block read from storage");
+
+DEFINE_int32(checksum_type,
+             ROCKSDB_NAMESPACE::BlockBasedTableOptions().checksum,
+             "ChecksumType as an int");
+
+DEFINE_bool(statistics, false, "Database statistics");
+DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
+             "stats level for statistics");
+DEFINE_string(statistics_string, "", "Serialized statistics string");
+static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
+
+DEFINE_int64(writes, -1,
+             "Number of write operations to do. If negative, do --num reads.");
+
+DEFINE_bool(finish_after_writes, false,
+            "Write thread terminates after all writes are finished");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
+
+DEFINE_bool(manual_wal_flush, false,
+            "If true, buffer WAL until buffer is full or a manual FlushWAL().");
+
+DEFINE_string(wal_compression, "none",
+              "Algorithm to use for WAL compression. none to disable.");
+static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_wal_compression_e =
+    ROCKSDB_NAMESPACE::kNoCompression;
+
+DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
+
+DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
+              "Truth key/values used when using verify");
+
+DEFINE_int32(num_levels, 7, "The total number of levels");
+
+DEFINE_int64(target_file_size_base,
+             ROCKSDB_NAMESPACE::Options().target_file_size_base,
+             "Target file size at level-1");
+
+DEFINE_int32(target_file_size_multiplier,
+             ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
+             "A multiplier to compute target level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base,
+              ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
+              "Max bytes for level-1");
+
+DEFINE_bool(level_compaction_dynamic_level_bytes, false,
+            "Whether level size base is dynamic");
+
+DEFINE_double(max_bytes_for_level_multiplier, 10,
+              "A multiplier to compute max bytes for level-N (N >= 2)");
+
+static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
+DEFINE_string(max_bytes_for_level_multiplier_additional, "",
+              "A vector that specifies additional fanout per level");
+
+DEFINE_int32(level0_stop_writes_trigger,
+             ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
+             "Number of files in level-0 that will trigger put stop.");
+
+DEFINE_int32(level0_slowdown_writes_trigger,
+             ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
+             "Number of files in level-0 that will slow down writes.");
+
+DEFINE_int32(level0_file_num_compaction_trigger,
+             ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
+             "Number of files in level-0 when compactions start.");
+
+DEFINE_uint64(periodic_compaction_seconds,
+              ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds,
+              "Files older than this will be picked up for compaction and"
+              " rewritten to the same level");
+
+DEFINE_uint64(ttl_seconds, ROCKSDB_NAMESPACE::Options().ttl, "Set options.ttl");
+
+static bool ValidateInt32Percent(const char* flagname, int32_t value) {
+  if (value <= 0 || value >= 100) {
+    fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", flagname,
+            value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(readwritepercent, 90,
+             "Ratio of reads to reads/writes (expressed as percentage) for "
+             "the ReadRandomWriteRandom workload. The default value 90 means "
+             "90% operations out of all reads and writes operations are "
+             "reads. In other words, 9 gets for every 1 put.");
+
+DEFINE_int32(mergereadpercent, 70,
+             "Ratio of merges to merges&reads (expressed as percentage) for "
+             "the ReadRandomMergeRandom workload. The default value 70 means "
+             "70% out of all read and merge operations are merges. In other "
+             "words, 7 merges for every 3 gets.");
+
+DEFINE_int32(deletepercent, 2,
+             "Percentage of deletes out of reads/writes/deletes (used in "
+             "RandomWithVerify only). RandomWithVerify "
+             "calculates writepercent as (100 - FLAGS_readwritepercent - "
+             "deletepercent), so deletepercent must be smaller than (100 - "
+             "FLAGS_readwritepercent)");
+
+DEFINE_bool(optimize_filters_for_hits,
+            ROCKSDB_NAMESPACE::Options().optimize_filters_for_hits,
+            "Optimizes bloom filters for workloads for most lookups return "
+            "a value. For now this doesn't create bloom filters for the max "
+            "level of the LSM to reduce metadata that should fit in RAM. ");
+
+DEFINE_bool(paranoid_checks, ROCKSDB_NAMESPACE::Options().paranoid_checks,
+            "RocksDB will aggressively check consistency of the data.");
+
+DEFINE_bool(force_consistency_checks,
+            ROCKSDB_NAMESPACE::Options().force_consistency_checks,
+            "Runs consistency checks on the LSM every time a change is "
+            "applied.");
+
+DEFINE_bool(check_flush_compaction_key_order,
+            ROCKSDB_NAMESPACE::Options().check_flush_compaction_key_order,
+            "During flush or compaction, check whether keys inserted to "
+            "output files are in order.");
+
+DEFINE_uint64(delete_obsolete_files_period_micros, 0,
+              "Ignored. Left here for backward compatibility");
+
+DEFINE_int64(writes_before_delete_range, 0,
+             "Number of writes before DeleteRange is called regularly.");
+
+DEFINE_int64(writes_per_range_tombstone, 0,
+             "Number of writes between range tombstones");
+
+DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");
+
+DEFINE_int64(max_num_range_tombstones, 0,
+             "Maximum number of range tombstones to insert.");
+
+DEFINE_bool(expand_range_tombstones, false,
+            "Expand range tombstone into sequential regular tombstones.");
+
+#ifndef ROCKSDB_LITE
+// Transactions Options
+DEFINE_bool(optimistic_transaction_db, false,
+            "Open a OptimisticTransactionDB instance. "
+            "Required for randomtransaction benchmark.");
+
+DEFINE_bool(transaction_db, false,
+            "Open a TransactionDB instance. "
+            "Required for randomtransaction benchmark.");
+
+DEFINE_uint64(transaction_sets, 2,
+              "Number of keys each transaction will "
+              "modify (use in RandomTransaction only).  Max: 9999");
+
+DEFINE_bool(transaction_set_snapshot, false,
+            "Setting to true will have each transaction call SetSnapshot()"
+            " upon creation.");
+
+DEFINE_int32(transaction_sleep, 0,
+             "Max microseconds to sleep in between "
+             "reading and writing a value (used in RandomTransaction only). ");
+
+DEFINE_uint64(transaction_lock_timeout, 100,
+              "If using a transaction_db, specifies the lock wait timeout in"
+              " milliseconds before failing a transaction waiting on a lock");
+DEFINE_string(
+    options_file, "",
+    "The path to a RocksDB options file.  If specified, then db_bench will "
+    "run with the RocksDB options in the default column family of the "
+    "specified options file. "
+    "Note that with this setting, db_bench will ONLY accept the following "
+    "RocksDB options related command-line arguments, all other arguments "
+    "that are related to RocksDB options will be ignored:\n"
+    "\t--use_existing_db\n"
+    "\t--use_existing_keys\n"
+    "\t--statistics\n"
+    "\t--row_cache_size\n"
+    "\t--row_cache_numshardbits\n"
+    "\t--enable_io_prio\n"
+    "\t--dump_malloc_stats\n"
+    "\t--num_multi_db\n");
+
+// FIFO Compaction Options
+DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
+              "The limit of total table file sizes to trigger FIFO compaction");
+
+DEFINE_bool(fifo_compaction_allow_compaction, true,
+            "Allow compaction in FIFO compaction.");
+
+DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
+
+DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction.");
+
+// Stacked BlobDB Options
+DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");
+
+DEFINE_bool(
+    blob_db_enable_gc,
+    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
+    "[Stacked BlobDB] Enable BlobDB garbage collection.");
+
+DEFINE_double(
+    blob_db_gc_cutoff,
+    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
+    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
+
+DEFINE_bool(blob_db_is_fifo,
+            ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
+            "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");
+
+DEFINE_uint64(blob_db_max_db_size,
+              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
+              "[Stacked BlobDB] Max size limit of the directory where blob "
+              "files are stored.");
+
+DEFINE_uint64(blob_db_max_ttl_range, 0,
+              "[Stacked BlobDB] TTL range to generate BlobDB data (in "
+              "seconds). 0 means no TTL.");
+
+DEFINE_uint64(
+    blob_db_ttl_range_secs,
+    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
+    "[Stacked BlobDB] TTL bucket size to use when creating blob files.");
+
+DEFINE_uint64(
+    blob_db_min_blob_size,
+    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
+    "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
+    "smaller than this will be inlined with the key in the LSM tree.");
+
+DEFINE_uint64(blob_db_bytes_per_sync,
+              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
+              "[Stacked BlobDB] Bytes to sync blob file at.");
+
+DEFINE_uint64(blob_db_file_size,
+              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
+              "[Stacked BlobDB] Target size of each blob file.");
+
+DEFINE_string(
+    blob_db_compression_type, "snappy",
+    "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
+static enum ROCKSDB_NAMESPACE::CompressionType
+    FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;
+
+#endif  // ROCKSDB_LITE
+
+// Integrated BlobDB options
+DEFINE_bool(
+    enable_blob_files,
+    ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files,
+    "[Integrated BlobDB] Enable writing large values to separate blob files.");
+
+DEFINE_uint64(min_blob_size,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size,
+              "[Integrated BlobDB] The size of the smallest value to be stored "
+              "separately in a blob file.");
+
+DEFINE_uint64(blob_file_size,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size,
+              "[Integrated BlobDB] The size limit for blob files.");
+
+DEFINE_string(blob_compression_type, "none",
+              "[Integrated BlobDB] The compression algorithm to use for large "
+              "values stored in blob files.");
+
+DEFINE_bool(enable_blob_garbage_collection,
+            ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                .enable_blob_garbage_collection,
+            "[Integrated BlobDB] Enable blob garbage collection.");
+
+DEFINE_double(blob_garbage_collection_age_cutoff,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .blob_garbage_collection_age_cutoff,
+              "[Integrated BlobDB] The cutoff in terms of blob file age for "
+              "garbage collection.");
+
+DEFINE_double(blob_garbage_collection_force_threshold,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .blob_garbage_collection_force_threshold,
+              "[Integrated BlobDB] The threshold for the ratio of garbage in "
+              "the oldest blob files for forcing garbage collection.");
+
+DEFINE_uint64(blob_compaction_readahead_size,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .blob_compaction_readahead_size,
+              "[Integrated BlobDB] Compaction readahead for blob files.");
+
+DEFINE_int32(
+    blob_file_starting_level,
+    ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_starting_level,
+    "[Integrated BlobDB] The starting level for blob files.");
+
+DEFINE_bool(use_blob_cache, false, "[Integrated BlobDB] Enable blob cache.");
+
+DEFINE_bool(
+    use_shared_block_and_blob_cache, true,
+    "[Integrated BlobDB] Use a shared backing cache for both block "
+    "cache and blob cache. It only takes effect if use_blob_cache is enabled.");
+
+DEFINE_uint64(
+    blob_cache_size, 8 << 20,
+    "[Integrated BlobDB] Number of bytes to use as a cache of blobs. It only "
+    "takes effect if the block and blob caches are different "
+    "(use_shared_block_and_blob_cache = false).");
+
+DEFINE_int32(blob_cache_numshardbits, 6,
+             "[Integrated BlobDB] Number of shards for the blob cache is 2 ** "
+             "blob_cache_numshardbits. Negative means use default settings. "
+             "It only takes effect if blob_cache_size is greater than 0, and "
+             "the block and blob caches are different "
+             "(use_shared_block_and_blob_cache = false).");
+
+DEFINE_int32(prepopulate_blob_cache, 0,
+             "[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 0 "
+             "to disable and 1 to insert during flush.");
+
+#ifndef ROCKSDB_LITE
+
+// Secondary DB instance Options
+DEFINE_bool(use_secondary_db, false,
+            "Open a RocksDB secondary instance. A primary instance can be "
+            "running in another db_bench process.");
+
+DEFINE_string(secondary_path, "",
+              "Path to a directory used by the secondary instance to store "
+              "private files, e.g. info log.");
+
+DEFINE_int32(secondary_update_interval, 5,
+             "Secondary instance attempts to catch up with the primary every "
+             "secondary_update_interval seconds.");
+
+#endif  // ROCKSDB_LITE
+
+DEFINE_bool(report_bg_io_stats, false,
+            "Measure times spents on I/Os while in compactions. ");
+
+DEFINE_bool(use_stderr_info_logger, false,
+            "Write info logs to stderr instead of to LOG file. ");
+
+#ifndef ROCKSDB_LITE
+
+DEFINE_string(trace_file, "", "Trace workload to a file. ");
+
+DEFINE_double(trace_replay_fast_forward, 1.0,
+              "Fast forward trace replay, must > 0.0.");
+DEFINE_int32(block_cache_trace_sampling_frequency, 1,
+             "Block cache trace sampling frequency, termed s. It uses spatial "
+             "downsampling and samples accesses to one out of s blocks.");
+DEFINE_int64(
+    block_cache_trace_max_trace_file_size_in_bytes,
+    uint64_t{64} * 1024 * 1024 * 1024,
+    "The maximum block cache trace file size in bytes. Block cache accesses "
+    "will not be logged if the trace file size exceeds this threshold. Default "
+    "is 64 GB.");
+DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
+DEFINE_int32(trace_replay_threads, 1,
+             "The number of threads to replay, must >=1.");
+
+DEFINE_bool(io_uring_enabled, true,
+            "If true, enable the use of IO uring if the platform supports it");
+extern "C" bool RocksDbIOUringEnable() { return FLAGS_io_uring_enabled; }
+#endif  // ROCKSDB_LITE
+
+DEFINE_bool(adaptive_readahead, false,
+            "carry forward internal auto readahead size from one file to next "
+            "file at each level during iteration");
+
+DEFINE_bool(rate_limit_user_ops, false,
+            "When true use Env::IO_USER priority level to charge internal rate "
+            "limiter for reads associated with user operations.");
+
+DEFINE_bool(file_checksum, false,
+            "When true use FileChecksumGenCrc32cFactory for "
+            "file_checksum_gen_factory.");
+
+DEFINE_bool(rate_limit_auto_wal_flush, false,
+            "When true use Env::IO_USER priority level to charge internal rate "
+            "limiter for automatic WAL flush (`Options::manual_wal_flush` == "
+            "false) after the user write operation.");
+
+DEFINE_bool(async_io, false,
+            "When set true, RocksDB does asynchronous reads for internal auto "
+            "readahead prefetching.");
+
+DEFINE_bool(optimize_multiget_for_io, true,
+            "When set true, RocksDB does asynchronous reads for SST files in "
+            "multiple levels for MultiGet.");
+
+DEFINE_bool(charge_compression_dictionary_building_buffer, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "CacheEntryRole::kCompressionDictionaryBuildingBuffer");
+
+DEFINE_bool(charge_filter_construction, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "CacheEntryRole::kFilterConstruction");
+
+DEFINE_bool(charge_table_reader, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "CacheEntryRole::kBlockBasedTableReader");
+
+DEFINE_bool(charge_file_metadata, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "CacheEntryRole::kFileMetadata");
+
+DEFINE_bool(charge_blob_cache, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of "
+            "CacheEntryRole::kBlobCache");
+
+DEFINE_uint64(backup_rate_limit, 0ull,
+              "If non-zero, db_bench will rate limit reads and writes for DB "
+              "backup. This "
+              "is the global rate in ops/second.");
+
+DEFINE_uint64(restore_rate_limit, 0ull,
+              "If non-zero, db_bench will rate limit reads and writes for DB "
+              "restore. This "
+              "is the global rate in ops/second.");
+
+DEFINE_string(backup_dir, "",
+              "If not empty string, use the given dir for backup.");
+
+DEFINE_string(restore_dir, "",
+              "If not empty string, use the given dir for restore.");
+
+DEFINE_uint64(
+    initial_auto_readahead_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().initial_auto_readahead_size,
+    "RocksDB does auto-readahead for iterators on noticing more than two reads "
+    "for a table file if user doesn't provide readahead_size. The readahead "
+    "size starts at initial_auto_readahead_size");
+
+DEFINE_uint64(
+    max_auto_readahead_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().max_auto_readahead_size,
+    "Rocksdb implicit readahead starts at "
+    "BlockBasedTableOptions.initial_auto_readahead_size and doubles on every "
+    "additional read upto max_auto_readahead_size");
+
+DEFINE_uint64(
+    num_file_reads_for_auto_readahead,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+        .num_file_reads_for_auto_readahead,
+    "Rocksdb implicit readahead is enabled if reads are sequential and "
+    "num_file_reads_for_auto_readahead indicates after how many sequential "
+    "reads into that file internal auto prefetching should be start.");
+
+static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
+    const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "none"))
+    return ROCKSDB_NAMESPACE::kNoCompression;
+  else if (!strcasecmp(ctype, "snappy"))
+    return ROCKSDB_NAMESPACE::kSnappyCompression;
+  else if (!strcasecmp(ctype, "zlib"))
+    return ROCKSDB_NAMESPACE::kZlibCompression;
+  else if (!strcasecmp(ctype, "bzip2"))
+    return ROCKSDB_NAMESPACE::kBZip2Compression;
+  else if (!strcasecmp(ctype, "lz4"))
+    return ROCKSDB_NAMESPACE::kLZ4Compression;
+  else if (!strcasecmp(ctype, "lz4hc"))
+    return ROCKSDB_NAMESPACE::kLZ4HCCompression;
+  else if (!strcasecmp(ctype, "xpress"))
+    return ROCKSDB_NAMESPACE::kXpressCompression;
+  else if (!strcasecmp(ctype, "zstd"))
+    return ROCKSDB_NAMESPACE::kZSTD;
+  else {
+    fprintf(stderr, "Cannot parse compression type '%s'\n", ctype);
+    exit(1);
+  }
+}
+
+static std::string ColumnFamilyName(size_t i) {
+  if (i == 0) {
+    return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
+  } else {
+    char name[100];
+    snprintf(name, sizeof(name), "column_family_name_%06zu", i);
+    return std::string(name);
+  }
+}
+
+DEFINE_string(compression_type, "snappy",
+              "Algorithm to use to compress the database");
+static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
+    ROCKSDB_NAMESPACE::kSnappyCompression;
+
+DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression");
+
+DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
+             "Compression level. The meaning of this value is library-"
+             "dependent. If unset, we try to use the default for the library "
+             "specified in `--compression_type`");
+
+DEFINE_int32(compression_max_dict_bytes,
+             ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
+             "Maximum size of dictionary used to prime the compression "
+             "library.");
+
+DEFINE_int32(compression_zstd_max_train_bytes,
+             ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
+             "Maximum size of training data passed to zstd's dictionary "
+             "trainer.");
+
+DEFINE_int32(min_level_to_compress, -1,
+             "If non-negative, compression starts"
+             " from this level. Levels with number < min_level_to_compress are"
+             " not compressed. Otherwise, apply compression_type to "
+             "all levels.");
+
+DEFINE_int32(compression_parallel_threads, 1,
+             "Number of threads for parallel compression.");
+
+DEFINE_uint64(compression_max_dict_buffer_bytes,
+              ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes,
+              "Maximum bytes to buffer to collect samples for dictionary.");
+
+DEFINE_bool(compression_use_zstd_dict_trainer,
+            ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer,
+            "If true, use ZSTD_TrainDictionary() to create dictionary, else"
+            "use ZSTD_FinalizeDictionary() to create dictionary");
+
+static bool ValidateTableCacheNumshardbits(const char* flagname,
+                                           int32_t value) {
+  if (0 >= value || value >= 20) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be  0 < val < 20\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(table_cache_numshardbits, 4, "");
+
+#ifndef ROCKSDB_LITE
+DEFINE_string(env_uri, "",
+              "URI for registry Env lookup. Mutually exclusive with --fs_uri");
+DEFINE_string(fs_uri, "",
+              "URI for registry Filesystem lookup. Mutually exclusive"
+              " with --env_uri."
+              " Creates a default environment with the specified filesystem.");
+#endif  // ROCKSDB_LITE
+DEFINE_string(simulate_hybrid_fs_file, "",
+              "File for Store Metadata for Simulate hybrid FS. Empty means "
+              "disable the feature. Now, if it is set, last_level_temperature "
+              "is set to kWarm.");
+DEFINE_int32(simulate_hybrid_hdd_multipliers, 1,
+             "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs "
+             "are simulated.");
+DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD.");
+
+DEFINE_int64(
+    preclude_last_level_data_seconds, 0,
+    "Preclude the latest data from the last level. (Used for tiered storage)");
+
+DEFINE_int64(preserve_internal_time_seconds, 0,
+             "Preserve the internal time information which stores with SST.");
+
+static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
+
+static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();
+
+DEFINE_int64(stats_interval, 0,
+             "Stats are reported every N operations when this is greater than "
+             "zero. When 0 the interval grows over time.");
+
+DEFINE_int64(stats_interval_seconds, 0,
+             "Report stats every N seconds. This overrides stats_interval when"
+             " both are > 0.");
+
+DEFINE_int32(stats_per_interval, 0,
+             "Reports additional stats per interval when this is greater than "
+             "0.");
+
+DEFINE_uint64(slow_usecs, 1000000,
+              "A message is printed for operations that take at least this "
+              "many microseconds.");
+
+DEFINE_int64(report_interval_seconds, 0,
+             "If greater than zero, it will write simple stats in CSV format "
+             "to --report_file every N seconds");
+
+DEFINE_string(report_file, "report.csv",
+              "Filename where some simple stats are reported to (if "
+              "--report_interval_seconds is bigger than 0)");
+
+DEFINE_int32(thread_status_per_interval, 0,
+             "Takes and report a snapshot of the current status of each thread"
+             " when this is greater than 0.");
+
+DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
+             "Level of perf collection");
+
+DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
+              "Slowdown writes if pending compaction bytes exceed this number");
+
+DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
+              "Stop writes if pending compaction bytes exceed this number");
+
+DEFINE_uint64(delayed_write_rate, 8388608u,
+              "Limited bytes allowed to DB when soft_rate_limit or "
+              "level0_slowdown_writes_trigger triggers");
+
+DEFINE_bool(enable_pipelined_write, true,
+            "Allow WAL and memtable writes to be pipelined");
+
+DEFINE_bool(
+    unordered_write, false,
+    "Enable the unordered write feature, which provides higher throughput but "
+    "relaxes the guarantees around atomic reads and immutable snapshots");
+
+DEFINE_bool(allow_concurrent_memtable_write, true,
+            "Allow multi-writers to update mem tables in parallel.");
+
+DEFINE_double(experimental_mempurge_threshold, 0.0,
+              "Maximum useful payload ratio estimate that triggers a mempurge "
+              "(memtable garbage collection).");
+
+DEFINE_bool(inplace_update_support,
+            ROCKSDB_NAMESPACE::Options().inplace_update_support,
+            "Support in-place memtable update for smaller or same-size values");
+
+DEFINE_uint64(inplace_update_num_locks,
+              ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
+              "Number of RW locks to protect in-place memtable updates");
+
+DEFINE_bool(enable_write_thread_adaptive_yield, true,
+            "Use a yielding spin loop for brief writer thread waits.");
+
+DEFINE_uint64(
+    write_thread_max_yield_usec, 100,
+    "Maximum microseconds for enable_write_thread_adaptive_yield operation.");
+
+DEFINE_uint64(write_thread_slow_yield_usec, 3,
+              "The threshold at which a slow yield is considered a signal that "
+              "other processes or threads want the core.");
+
+DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");
+
+DEFINE_int64(rate_limiter_refill_period_us, 100 * 1000,
+             "Set refill period on rate limiter.");
+
+DEFINE_bool(rate_limiter_auto_tuned, false,
+            "Enable dynamic adjustment of rate limit according to demand for "
+            "background I/O");
+
+DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit");
+
+DEFINE_uint64(
+    sine_write_rate_interval_milliseconds, 10000,
+    "Interval of which the sine wave write_rate_limit is recalculated");
+
+DEFINE_double(sine_a, 1, "A in f(x) = A sin(bx + c) + d");
+
+DEFINE_double(sine_b, 1, "B in f(x) = A sin(bx + c) + d");
+
+DEFINE_double(sine_c, 0, "C in f(x) = A sin(bx + c) + d");
+
+DEFINE_double(sine_d, 1, "D in f(x) = A sin(bx + c) + d");
+
+DEFINE_bool(rate_limit_bg_reads, false,
+            "Use options.rate_limiter on compaction reads");
+
+DEFINE_uint64(
+    benchmark_write_rate_limit, 0,
+    "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
+    "is the global rate in bytes/second.");
+
+// the parameters of mix_graph
+DEFINE_double(keyrange_dist_a, 0.0,
+              "The parameter 'a' of prefix average access distribution "
+              "f(x)=a*exp(b*x)+c*exp(d*x)");
+DEFINE_double(keyrange_dist_b, 0.0,
+              "The parameter 'b' of prefix average access distribution "
+              "f(x)=a*exp(b*x)+c*exp(d*x)");
+DEFINE_double(keyrange_dist_c, 0.0,
+              "The parameter 'c' of prefix average access distribution"
+              "f(x)=a*exp(b*x)+c*exp(d*x)");
+DEFINE_double(keyrange_dist_d, 0.0,
+              "The parameter 'd' of prefix average access distribution"
+              "f(x)=a*exp(b*x)+c*exp(d*x)");
+DEFINE_int64(keyrange_num, 1,
+             "The number of key ranges that are in the same prefix "
+             "group, each prefix range will have its key access distribution");
+DEFINE_double(key_dist_a, 0.0,
+              "The parameter 'a' of key access distribution model f(x)=a*x^b");
+DEFINE_double(key_dist_b, 0.0,
+              "The parameter 'b' of key access distribution model f(x)=a*x^b");
+DEFINE_double(value_theta, 0.0,
+              "The parameter 'theta' of Generized Pareto Distribution "
+              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+// Use reasonable defaults based on the mixgraph paper
+DEFINE_double(value_k, 0.2615,
+              "The parameter 'k' of Generized Pareto Distribution "
+              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+// Use reasonable defaults based on the mixgraph paper
+DEFINE_double(value_sigma, 25.45,
+              "The parameter 'theta' of Generized Pareto Distribution "
+              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+DEFINE_double(iter_theta, 0.0,
+              "The parameter 'theta' of Generized Pareto Distribution "
+              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+// Use reasonable defaults based on the mixgraph paper
+DEFINE_double(iter_k, 2.517,
+              "The parameter 'k' of Generized Pareto Distribution "
+              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+// Use reasonable defaults based on the mixgraph paper
+DEFINE_double(iter_sigma, 14.236,
+              "The parameter 'sigma' of Generized Pareto Distribution "
+              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
+DEFINE_double(mix_get_ratio, 1.0,
+              "The ratio of Get queries of mix_graph workload");
+DEFINE_double(mix_put_ratio, 0.0,
+              "The ratio of Put queries of mix_graph workload");
+DEFINE_double(mix_seek_ratio, 0.0,
+              "The ratio of Seek queries of mix_graph workload");
+DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
+DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
+DEFINE_double(
+    sine_mix_rate_noise, 0.0,
+    "Add the noise ratio to the sine rate, it is between 0.0 and 1.0");
+DEFINE_bool(sine_mix_rate, false,
+            "Enable the sine QPS control on the mix workload");
+DEFINE_uint64(
+    sine_mix_rate_interval_milliseconds, 10000,
+    "Interval of which the sine wave read_rate_limit is recalculated");
+DEFINE_int64(mix_accesses, -1,
+             "The total query accesses of mix_graph workload");
+
+DEFINE_uint64(
+    benchmark_read_rate_limit, 0,
+    "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
+    "is the global rate in ops/second.");
+
+DEFINE_uint64(max_compaction_bytes,
+              ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
+              "Max bytes allowed in one compaction");
+
+#ifndef ROCKSDB_LITE
+DEFINE_bool(readonly, false, "Run read only benchmarks.");
+
+DEFINE_bool(print_malloc_stats, false,
+            "Print malloc stats to stdout after benchmarks finish.");
+#endif  // ROCKSDB_LITE
+
+DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");
+
+DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
+DEFINE_uint64(wal_size_limit_MB, 0,
+              "Set the size limit for the WAL Files in MB.");
+DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
+
+DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
+            "Allow reads to occur via mmap-ing files");
+
+DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
+            "Allow writes to occur via mmap-ing files");
+
+DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
+            "Use O_DIRECT for reading data");
+
+DEFINE_bool(use_direct_io_for_flush_and_compaction,
+            ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
+            "Use O_DIRECT for background flush and compaction writes");
+
+DEFINE_bool(advise_random_on_open,
+            ROCKSDB_NAMESPACE::Options().advise_random_on_open,
+            "Advise random access on table file open");
+
+DEFINE_string(compaction_fadvice, "NORMAL",
+              "Access pattern advice when a file is compacted");
+static auto FLAGS_compaction_fadvice_e =
+    ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;
+
+DEFINE_bool(use_tailing_iterator, false,
+            "Use tailing iterator to access a series of keys instead of get");
+
+DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
+            "Use adaptive mutex");
+
+DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
+              "Allows OS to incrementally sync SST files to disk while they are"
+              " being written, in the background. Issue one request for every"
+              " bytes_per_sync written. 0 turns it off.");
+
+DEFINE_uint64(wal_bytes_per_sync,
+              ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
+              "Allows OS to incrementally sync WAL files to disk while they are"
+              " being written, in the background. Issue one request for every"
+              " wal_bytes_per_sync written. 0 turns it off.");
+
+DEFINE_bool(use_single_deletes, true,
+            "Use single deletes (used in RandomReplaceKeys only).");
+
+DEFINE_double(stddev, 2000.0,
+              "Standard deviation of normal distribution used for picking keys"
+              " (used in RandomReplaceKeys only).");
+
+DEFINE_int32(key_id_range, 100000,
+             "Range of possible value of key id (used in TimeSeries only).");
+
+DEFINE_string(expire_style, "none",
+              "Style to remove expired time entries. Can be one of the options "
+              "below: none (do not expired data), compaction_filter (use a "
+              "compaction filter to remove expired data), delete (seek IDs and "
+              "remove expired data) (used in TimeSeries only).");
+
+DEFINE_uint64(
+    time_range, 100000,
+    "Range of timestamp that store in the database (used in TimeSeries"
+    " only).");
+
+DEFINE_int32(num_deletion_threads, 1,
+             "Number of threads to do deletion (used in TimeSeries and delete "
+             "expire_style only).");
+
+DEFINE_int32(max_successive_merges, 0,
+             "Maximum number of successive merge operations on a key in the "
+             "memtable");
+
+static bool ValidatePrefixSize(const char* flagname, int32_t value) {
+  if (value < 0 || value >= 2000000000) {
+    fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+
+DEFINE_int32(prefix_size, 0,
+             "control the prefix size for HashSkipList and plain table");
+DEFINE_int64(keys_per_prefix, 0,
+             "control average number of keys generated per prefix, 0 means no "
+             "special handling of the prefix, i.e. use the prefix comes with "
+             "the generated random number.");
+DEFINE_bool(total_order_seek, false,
+            "Enable total order seek regardless of index format.");
+DEFINE_bool(prefix_same_as_start, false,
+            "Enforce iterator to return keys with prefix same as seek key.");
+DEFINE_bool(
+    seek_missing_prefix, false,
+    "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8");
+
+DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
+             "If non-zero, enable "
+             "memtable insert with hint with the given prefix size.");
+DEFINE_bool(enable_io_prio, false,
+            "Lower the background flush/compaction threads' IO priority");
+DEFINE_bool(enable_cpu_prio, false,
+            "Lower the background flush/compaction threads' CPU priority");
+DEFINE_bool(identity_as_first_hash, false,
+            "the first hash function of cuckoo table becomes an identity "
+            "function. This is only valid when key is 8 bytes");
+DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
+DEFINE_uint64(stats_dump_period_sec,
+              ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
+              "Gap between printing stats to log in seconds");
+DEFINE_uint64(stats_persist_period_sec,
+              ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
+              "Gap between persisting stats in seconds");
+DEFINE_bool(persist_stats_to_disk,
+            ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
+            "whether to persist stats to disk");
+DEFINE_uint64(stats_history_buffer_size,
+              ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
+              "Max number of stats snapshots to keep in memory");
+DEFINE_bool(avoid_flush_during_recovery,
+            ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery,
+            "If true, avoids flushing the recovered WAL data where possible.");
+DEFINE_int64(multiread_stride, 0,
+             "Stride length for the keys in a MultiGet batch");
+DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
+
+DEFINE_string(memtablerep, "skip_list", "");
+DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
+DEFINE_bool(use_plain_table, false,
+            "if use plain table instead of block-based table format");
+DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
+DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
+DEFINE_bool(use_hash_search, false,
+            "if use kHashSearch instead of kBinarySearch. "
+            "This is valid if only we use BlockTable");
+DEFINE_string(merge_operator, "",
+              "The merge operator to use with the database."
+              "If a new merge operator is specified, be sure to use fresh"
+              " database The possible merge operators are defined in"
+              " utilities/merge_operators.h");
+DEFINE_int32(skip_list_lookahead, 0,
+             "Used with skip_list memtablerep; try linear search first for "
+             "this many steps from the previous position");
+DEFINE_bool(report_file_operations, false,
+            "if report number of file operations");
+DEFINE_bool(report_open_timing, false, "if report open timing");
+DEFINE_int32(readahead_size, 0, "Iterator readahead size");
+
+DEFINE_bool(read_with_latest_user_timestamp, true,
+            "If true, always use the current latest timestamp for read. If "
+            "false, choose a random timestamp from the past.");
+
+#ifndef ROCKSDB_LITE
+DEFINE_string(secondary_cache_uri, "",
+              "Full URI for creating a custom secondary cache object");
+static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
+#endif  // ROCKSDB_LITE
+
+static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
+
+static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_cache_numshardbits,
+                          &ValidateCacheNumshardbits);
+
+static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
+
+DEFINE_int32(disable_seek_compaction, false,
+             "Not used, left here for backwards compatibility");
+
+DEFINE_bool(allow_data_in_errors,
+            ROCKSDB_NAMESPACE::Options().allow_data_in_errors,
+            "If true, allow logging data, e.g. key, value in LOG files.");
+
+static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
+static const bool FLAGS_table_cache_numshardbits_dummy
+    __attribute__((__unused__)) = RegisterFlagValidator(
+        &FLAGS_table_cache_numshardbits, &ValidateTableCacheNumshardbits);
+
+DEFINE_uint32(write_batch_protection_bytes_per_key, 0,
+              "Size of per-key-value checksum in each write batch. Currently "
+              "only value 0 and 8 are supported.");
+
+DEFINE_uint32(
+    memtable_protection_bytes_per_key, 0,
+    "Enable memtable per key-value checksum protection. "
+    "Each entry in memtable will be suffixed by a per key-value checksum. "
+    "This options determines the size of such checksums. "
+    "Supported values: 0, 1, 2, 4, 8.");
+
+DEFINE_bool(build_info, false,
+            "Print the build info via GetRocksBuildInfoAsString");
+
+DEFINE_bool(track_and_verify_wals_in_manifest, false,
+            "If true, enable WAL tracking in the MANIFEST");
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static Status CreateMemTableRepFactory(
+    const ConfigOptions& config_options,
+    std::shared_ptr<MemTableRepFactory>* factory) {
+  Status s;
+  if (!strcasecmp(FLAGS_memtablerep.c_str(), SkipListFactory::kNickName())) {
+    factory->reset(new SkipListFactory(FLAGS_skip_list_lookahead));
+#ifndef ROCKSDB_LITE
+  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) {
+    factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
+  } else if (!strcasecmp(FLAGS_memtablerep.c_str(),
+                         VectorRepFactory::kNickName())) {
+    factory->reset(new VectorRepFactory());
+  } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) {
+    factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count));
+#endif  // ROCKSDB_LITE
+  } else {
+    std::unique_ptr<MemTableRepFactory> unique;
+    s = MemTableRepFactory::CreateFromString(config_options, FLAGS_memtablerep,
+                                             &unique);
+    if (s.ok()) {
+      factory->reset(unique.release());
+    }
+  }
+  return s;
+}
+
+}  // namespace
+
+enum DistributionType : unsigned char { kFixed = 0, kUniform, kNormal };
+
+static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;
+
+static enum DistributionType StringToDistributionType(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "fixed"))
+    return kFixed;
+  else if (!strcasecmp(ctype, "uniform"))
+    return kUniform;
+  else if (!strcasecmp(ctype, "normal"))
+    return kNormal;
+
+  fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
+  exit(1);
+}
+
+class BaseDistribution {
+ public:
+  BaseDistribution(unsigned int _min, unsigned int _max)
+      : min_value_size_(_min), max_value_size_(_max) {}
+  virtual ~BaseDistribution() {}
+
+  unsigned int Generate() {
+    auto val = Get();
+    if (NeedTruncate()) {
+      val = std::max(min_value_size_, val);
+      val = std::min(max_value_size_, val);
+    }
+    return val;
+  }
+
+ private:
+  virtual unsigned int Get() = 0;
+  virtual bool NeedTruncate() { return true; }
+  unsigned int min_value_size_;
+  unsigned int max_value_size_;
+};
+
+class FixedDistribution : public BaseDistribution {
+ public:
+  FixedDistribution(unsigned int size)
+      : BaseDistribution(size, size), size_(size) {}
+
+ private:
+  virtual unsigned int Get() override { return size_; }
+  virtual bool NeedTruncate() override { return false; }
+  unsigned int size_;
+};
+
+class NormalDistribution : public BaseDistribution,
+                           public std::normal_distribution<double> {
+ public:
+  NormalDistribution(unsigned int _min, unsigned int _max)
+      : BaseDistribution(_min, _max),
+        // 99.7% values within the range [min, max].
+        std::normal_distribution<double>(
+            (double)(_min + _max) / 2.0 /*mean*/,
+            (double)(_max - _min) / 6.0 /*stddev*/),
+        gen_(rd_()) {}
+
+ private:
+  virtual unsigned int Get() override {
+    return static_cast<unsigned int>((*this)(gen_));
+  }
+  std::random_device rd_;
+  std::mt19937 gen_;
+};
+
+class UniformDistribution : public BaseDistribution,
+                            public std::uniform_int_distribution<unsigned int> {
+ public:
+  UniformDistribution(unsigned int _min, unsigned int _max)
+      : BaseDistribution(_min, _max),
+        std::uniform_int_distribution<unsigned int>(_min, _max),
+        gen_(rd_()) {}
+
+ private:
+  virtual unsigned int Get() override { return (*this)(gen_); }
+  virtual bool NeedTruncate() override { return false; }
+  std::random_device rd_;
+  std::mt19937 gen_;
+};
+
+// Helper for quickly generating random data.
+class RandomGenerator {
+ private:
+  std::string data_;
+  unsigned int pos_;
+  std::unique_ptr<BaseDistribution> dist_;
+
+ public:
+  RandomGenerator() {
+    auto max_value_size = FLAGS_value_size_max;
+    switch (FLAGS_value_size_distribution_type_e) {
+      case kUniform:
+        dist_.reset(new UniformDistribution(FLAGS_value_size_min,
+                                            FLAGS_value_size_max));
+        break;
+      case kNormal:
+        dist_.reset(
+            new NormalDistribution(FLAGS_value_size_min, FLAGS_value_size_max));
+        break;
+      case kFixed:
+      default:
+        dist_.reset(new FixedDistribution(value_size));
+        max_value_size = value_size;
+    }
+    // We use a limited amount of data over and over again and ensure
+    // that it is larger than the compression window (32KB), and also
+    // large enough to serve all typical value sizes we want to write.
+    Random rnd(301);
+    std::string piece;
+    while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
+      // Add a short fragment that is as compressible as specified
+      // by FLAGS_compression_ratio.
+      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
+      data_.append(piece);
+    }
+    pos_ = 0;
+  }
+
+  Slice Generate(unsigned int len) {
+    assert(len <= data_.size());
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
+
+  Slice Generate() {
+    auto len = dist_->Generate();
+    return Generate(len);
+  }
+};
+
+static void AppendWithSpace(std::string* str, Slice msg) {
+  if (msg.empty()) return;
+  if (!str->empty()) {
+    str->push_back(' ');
+  }
+  str->append(msg.data(), msg.size());
+}
+
+struct DBWithColumnFamilies {
+  std::vector<ColumnFamilyHandle*> cfh;
+  DB* db;
+#ifndef ROCKSDB_LITE
+  OptimisticTransactionDB* opt_txn_db;
+#endif                              // ROCKSDB_LITE
+  std::atomic<size_t> num_created;  // Need to be updated after all the
+                                    // new entries in cfh are set.
+  size_t num_hot;  // Number of column families to be queried at each moment.
+                   // After each CreateNewCf(), another num_hot number of new
+                   // Column families will be created and used to be queried.
+  port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
+  std::vector<int> cfh_idx_to_prob;  // ith index holds probability of operating
+                                     // on cfh[i].
+
+  DBWithColumnFamilies()
+      : db(nullptr)
+#ifndef ROCKSDB_LITE
+        ,
+        opt_txn_db(nullptr)
+#endif  // ROCKSDB_LITE
+  {
+    cfh.clear();
+    num_created = 0;
+    num_hot = 0;
+  }
+
+  DBWithColumnFamilies(const DBWithColumnFamilies& other)
+      : cfh(other.cfh),
+        db(other.db),
+#ifndef ROCKSDB_LITE
+        opt_txn_db(other.opt_txn_db),
+#endif  // ROCKSDB_LITE
+        num_created(other.num_created.load()),
+        num_hot(other.num_hot),
+        cfh_idx_to_prob(other.cfh_idx_to_prob) {
+  }
+
+  void DeleteDBs() {
+    std::for_each(cfh.begin(), cfh.end(),
+                  [](ColumnFamilyHandle* cfhi) { delete cfhi; });
+    cfh.clear();
+#ifndef ROCKSDB_LITE
+    if (opt_txn_db) {
+      delete opt_txn_db;
+      opt_txn_db = nullptr;
+    } else {
+      delete db;
+      db = nullptr;
+    }
+#else
+    delete db;
+    db = nullptr;
+#endif  // ROCKSDB_LITE
+  }
+
+  ColumnFamilyHandle* GetCfh(int64_t rand_num) {
+    assert(num_hot > 0);
+    size_t rand_offset = 0;
+    if (!cfh_idx_to_prob.empty()) {
+      assert(cfh_idx_to_prob.size() == num_hot);
+      int sum = 0;
+      while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
+        sum += cfh_idx_to_prob[rand_offset];
+        ++rand_offset;
+      }
+      assert(rand_offset < cfh_idx_to_prob.size());
+    } else {
+      rand_offset = rand_num % num_hot;
+    }
+    return cfh[num_created.load(std::memory_order_acquire) - num_hot +
+               rand_offset];
+  }
+
+  // stage: assume CF from 0 to stage * num_hot has be created. Need to create
+  //        stage * num_hot + 1 to stage * (num_hot + 1).
+  void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
+    MutexLock l(&create_cf_mutex);
+    if ((stage + 1) * num_hot <= num_created) {
+      // Already created.
+      return;
+    }
+    auto new_num_created = num_created + num_hot;
+    assert(new_num_created <= cfh.size());
+    for (size_t i = num_created; i < new_num_created; i++) {
+      Status s =
+          db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
+      if (!s.ok()) {
+        fprintf(stderr, "create column family error: %s\n",
+                s.ToString().c_str());
+        abort();
+      }
+    }
+    num_created.store(new_num_created, std::memory_order_release);
+  }
+};
+
+// A class that reports stats to CSV file.
+class ReporterAgent {
+ public:
+  ReporterAgent(Env* env, const std::string& fname,
+                uint64_t report_interval_secs)
+      : env_(env),
+        total_ops_done_(0),
+        last_report_(0),
+        report_interval_secs_(report_interval_secs),
+        stop_(false) {
+    auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
+    if (s.ok()) {
+      s = report_file_->Append(Header() + "\n");
+    }
+    if (s.ok()) {
+      s = report_file_->Flush();
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
+              s.ToString().c_str());
+      abort();
+    }
+
+    reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
+  }
+
+  ~ReporterAgent() {
+    {
+      std::unique_lock<std::mutex> lk(mutex_);
+      stop_ = true;
+      stop_cv_.notify_all();
+    }
+    reporting_thread_.join();
+  }
+
+  // thread safe
+  void ReportFinishedOps(int64_t num_ops) {
+    total_ops_done_.fetch_add(num_ops);
+  }
+
+ private:
+  std::string Header() const { return "secs_elapsed,interval_qps"; }
+  void SleepAndReport() {
+    auto* clock = env_->GetSystemClock().get();
+    auto time_started = clock->NowMicros();
+    while (true) {
+      {
+        std::unique_lock<std::mutex> lk(mutex_);
+        if (stop_ ||
+            stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
+                              [&]() { return stop_; })) {
+          // stopping
+          break;
+        }
+        // else -> timeout, which means time for a report!
+      }
+      auto total_ops_done_snapshot = total_ops_done_.load();
+      // round the seconds elapsed
+      auto secs_elapsed =
+          (clock->NowMicros() - time_started + kMicrosInSecond / 2) /
+          kMicrosInSecond;
+      std::string report =
+          std::to_string(secs_elapsed) + "," +
+          std::to_string(total_ops_done_snapshot - last_report_) + "\n";
+      auto s = report_file_->Append(report);
+      if (s.ok()) {
+        s = report_file_->Flush();
+      }
+      if (!s.ok()) {
+        fprintf(stderr,
+                "Can't write to report file (%s), stopping the reporting\n",
+                s.ToString().c_str());
+        break;
+      }
+      last_report_ = total_ops_done_snapshot;
+    }
+  }
+
+  Env* env_;
+  std::unique_ptr<WritableFile> report_file_;
+  std::atomic<int64_t> total_ops_done_;
+  int64_t last_report_;
+  const uint64_t report_interval_secs_;
+  ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
+  std::mutex mutex_;
+  // will notify on stop
+  std::condition_variable stop_cv_;
+  bool stop_;
+};
+
+enum OperationType : unsigned char {
+  kRead = 0,
+  kWrite,
+  kDelete,
+  kSeek,
+  kMerge,
+  kUpdate,
+  kCompress,
+  kUncompress,
+  kCrc,
+  kHash,
+  kOthers
+};
+
+static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
+    OperationTypeString = {{kRead, "read"},         {kWrite, "write"},
+                           {kDelete, "delete"},     {kSeek, "seek"},
+                           {kMerge, "merge"},       {kUpdate, "update"},
+                           {kCompress, "compress"}, {kCompress, "uncompress"},
+                           {kCrc, "crc"},           {kHash, "hash"},
+                           {kOthers, "op"}};
+
+class CombinedStats;
+class Stats {
+ private:
+  SystemClock* clock_;
+  int id_;
+  uint64_t start_ = 0;
+  uint64_t sine_interval_;
+  uint64_t finish_;
+  double seconds_;
+  uint64_t done_;
+  uint64_t last_report_done_;
+  uint64_t next_report_;
+  uint64_t bytes_;
+  uint64_t last_op_finish_;
+  uint64_t last_report_finish_;
+  std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
+                     std::hash<unsigned char>>
+      hist_;
+  std::string message_;
+  bool exclude_from_merge_;
+  ReporterAgent* reporter_agent_;  // does not own
+  friend class CombinedStats;
+
+ public:
+  Stats() : clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); }
+
+  void SetReporterAgent(ReporterAgent* reporter_agent) {
+    reporter_agent_ = reporter_agent;
+  }
+
+  void Start(int id) {
+    id_ = id;
+    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
+    last_op_finish_ = start_;
+    hist_.clear();
+    done_ = 0;
+    last_report_done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = clock_->NowMicros();
+    sine_interval_ = clock_->NowMicros();
+    finish_ = start_;
+    last_report_finish_ = start_;
+    message_.clear();
+    // When set, stats from this thread won't be merged with others.
+    exclude_from_merge_ = false;
+  }
+
+  void Merge(const Stats& other) {
+    if (other.exclude_from_merge_) return;
+
+    for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
+      auto this_it = hist_.find(it->first);
+      if (this_it != hist_.end()) {
+        this_it->second->Merge(*(other.hist_.at(it->first)));
+      } else {
+        hist_.insert({it->first, it->second});
+      }
+    }
+
+    done_ += other.done_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread.
+    if (message_.empty()) message_ = other.message_;
+  }
+
+  void Stop() {
+    finish_ = clock_->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  void AddMessage(Slice msg) { AppendWithSpace(&message_, msg); }
+
+  void SetId(int id) { id_ = id; }
+  void SetExcludeFromMerge() { exclude_from_merge_ = true; }
+
+  void PrintThreadStatus() {
+    std::vector<ThreadStatus> thread_list;
+    FLAGS_env->GetThreadList(&thread_list);
+
+    fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n", "ThreadID",
+            "ThreadType", "cfName", "Operation", "ElapsedTime", "Stage",
+            "State", "OperationProperties");
+
+    int64_t current_time = 0;
+    clock_->GetCurrentTime(&current_time).PermitUncheckedError();
+    for (auto ts : thread_list) {
+      fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
+              ts.thread_id,
+              ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
+              ts.cf_name.c_str(),
+              ThreadStatus::GetOperationName(ts.operation_type).c_str(),
+              ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
+              ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
+              ThreadStatus::GetStateName(ts.state_type).c_str());
+
+      auto op_properties = ThreadStatus::InterpretOperationProperties(
+          ts.operation_type, ts.op_properties);
+      for (const auto& op_prop : op_properties) {
+        fprintf(stderr, " %s %" PRIu64 " |", op_prop.first.c_str(),
+                op_prop.second);
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+
+  void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); }
+
+  uint64_t GetSineInterval() { return sine_interval_; }
+
+  uint64_t GetStart() { return start_; }
+
+  void ResetLastOpTime() {
+    // Set to now to avoid latency from calls to SleepForMicroseconds.
+    last_op_finish_ = clock_->NowMicros();
+  }
+
+  void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
+                   enum OperationType op_type = kOthers) {
+    if (reporter_agent_) {
+      reporter_agent_->ReportFinishedOps(num_ops);
+    }
+    if (FLAGS_histogram) {
+      uint64_t now = clock_->NowMicros();
+      uint64_t micros = now - last_op_finish_;
+
+      if (hist_.find(op_type) == hist_.end()) {
+        auto hist_temp = std::make_shared<HistogramImpl>();
+        hist_.insert({op_type, std::move(hist_temp)});
+      }
+      hist_[op_type]->Add(micros);
+
+      if (micros >= FLAGS_slow_usecs && !FLAGS_stats_interval) {
+        fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
+        fflush(stderr);
+      }
+      last_op_finish_ = now;
+    }
+
+    done_ += num_ops;
+    if (done_ >= next_report_ && FLAGS_progress_reports) {
+      if (!FLAGS_stats_interval) {
+        if (next_report_ < 1000)
+          next_report_ += 100;
+        else if (next_report_ < 5000)
+          next_report_ += 500;
+        else if (next_report_ < 10000)
+          next_report_ += 1000;
+        else if (next_report_ < 50000)
+          next_report_ += 5000;
+        else if (next_report_ < 100000)
+          next_report_ += 10000;
+        else if (next_report_ < 500000)
+          next_report_ += 50000;
+        else
+          next_report_ += 100000;
+        fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
+      } else {
+        uint64_t now = clock_->NowMicros();
+        int64_t usecs_since_last = now - last_report_finish_;
+
+        // Determine whether to print status where interval is either
+        // each N operations or each N seconds.
+
+        if (FLAGS_stats_interval_seconds &&
+            usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
+          // Don't check again for this many operations.
+          next_report_ += FLAGS_stats_interval;
+
+        } else {
+          fprintf(stderr,
+                  "%s ... thread %d: (%" PRIu64 ",%" PRIu64
+                  ") ops and "
+                  "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
+                  clock_->TimeToString(now / 1000000).c_str(), id_,
+                  done_ - last_report_done_, done_,
+                  (done_ - last_report_done_) / (usecs_since_last / 1000000.0),
+                  done_ / ((now - start_) / 1000000.0),
+                  (now - last_report_finish_) / 1000000.0,
+                  (now - start_) / 1000000.0);
+
+          if (id_ == 0 && FLAGS_stats_per_interval) {
+            std::string stats;
+
+            if (db_with_cfh && db_with_cfh->num_created.load()) {
+              for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
+                if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
+                                    &stats))
+                  fprintf(stderr, "%s\n", stats.c_str());
+                if (FLAGS_show_table_properties) {
+                  for (int level = 0; level < FLAGS_num_levels; ++level) {
+                    if (db->GetProperty(
+                            db_with_cfh->cfh[i],
+                            "rocksdb.aggregated-table-properties-at-level" +
+                                std::to_string(level),
+                            &stats)) {
+                      if (stats.find("# entries=0") == std::string::npos) {
+                        fprintf(stderr, "Level[%d]: %s\n", level,
+                                stats.c_str());
+                      }
+                    }
+                  }
+                }
+              }
+            } else if (db) {
+              if (db->GetProperty("rocksdb.stats", &stats)) {
+                fprintf(stderr, "%s", stats.c_str());
+              }
+              if (db->GetProperty("rocksdb.num-running-compactions", &stats)) {
+                fprintf(stderr, "num-running-compactions: %s\n", stats.c_str());
+              }
+              if (db->GetProperty("rocksdb.num-running-flushes", &stats)) {
+                fprintf(stderr, "num-running-flushes: %s\n\n", stats.c_str());
+              }
+              if (FLAGS_show_table_properties) {
+                for (int level = 0; level < FLAGS_num_levels; ++level) {
+                  if (db->GetProperty(
+                          "rocksdb.aggregated-table-properties-at-level" +
+                              std::to_string(level),
+                          &stats)) {
+                    if (stats.find("# entries=0") == std::string::npos) {
+                      fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
+                    }
+                  }
+                }
+              }
+            }
+          }
+
+          next_report_ += FLAGS_stats_interval;
+          last_report_finish_ = now;
+          last_report_done_ = done_;
+        }
+      }
+      if (id_ == 0 && FLAGS_thread_status_per_interval) {
+        PrintThreadStatus();
+      }
+      fflush(stderr);
+    }
+  }
+
+  void AddBytes(int64_t n) { bytes_ += n; }
+
+  void Report(const Slice& name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedOps().
+    if (done_ < 1) done_ = 1;
+
+    std::string extra;
+    double elapsed = (finish_ - start_) * 1e-6;
+    if (bytes_ > 0) {
+      // Rate is computed on actual elapsed time, not the sum of per-thread
+      // elapsed times.
+      char rate[100];
+      snprintf(rate, sizeof(rate), "%6.1f MB/s",
+               (bytes_ / 1048576.0) / elapsed);
+      extra = rate;
+    }
+    AppendWithSpace(&extra, message_);
+    double throughput = (double)done_ / elapsed;
+
+    fprintf(stdout,
+            "%-12s : %11.3f micros/op %ld ops/sec %.3f seconds %" PRIu64
+            " operations;%s%s\n",
+            name.ToString().c_str(), seconds_ * 1e6 / done_, (long)throughput,
+            elapsed, done_, (extra.empty() ? "" : " "), extra.c_str());
+    if (FLAGS_histogram) {
+      for (auto it = hist_.begin(); it != hist_.end(); ++it) {
+        fprintf(stdout, "Microseconds per %s:\n%s\n",
+                OperationTypeString[it->first].c_str(),
+                it->second->ToString().c_str());
+      }
+    }
+    if (FLAGS_report_file_operations) {
+      auto* counted_fs =
+          FLAGS_env->GetFileSystem()->CheckedCast<CountedFileSystem>();
+      assert(counted_fs);
+      fprintf(stdout, "%s", counted_fs->PrintCounters().c_str());
+      counted_fs->ResetCounters();
+    }
+    fflush(stdout);
+  }
+};
+
+class CombinedStats {
+ public:
+  void AddStats(const Stats& stat) {
+    uint64_t total_ops = stat.done_;
+    uint64_t total_bytes_ = stat.bytes_;
+    double elapsed;
+
+    if (total_ops < 1) {
+      total_ops = 1;
+    }
+
+    elapsed = (stat.finish_ - stat.start_) * 1e-6;
+    throughput_ops_.emplace_back(total_ops / elapsed);
+
+    if (total_bytes_ > 0) {
+      double mbs = (total_bytes_ / 1048576.0);
+      throughput_mbs_.emplace_back(mbs / elapsed);
+    }
+  }
+
+  void Report(const std::string& bench_name) {
+    if (throughput_ops_.size() < 2) {
+      // skip if there are not enough samples
+      return;
+    }
+
+    const char* name = bench_name.c_str();
+    int num_runs = static_cast<int>(throughput_ops_.size());
+
+    if (throughput_mbs_.size() == throughput_ops_.size()) {
+      fprintf(stdout,
+              "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
+              "%.1f) MB/sec\n",
+              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
+              static_cast<int>(CalcConfidence95(throughput_ops_)),
+              CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_));
+    } else {
+      fprintf(stdout, "%s [AVG %d runs] : %d (\xC2\xB1 %d) ops/sec\n", name,
+              num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
+              static_cast<int>(CalcConfidence95(throughput_ops_)));
+    }
+  }
+
+  void ReportWithConfidenceIntervals(const std::string& bench_name) {
+    if (throughput_ops_.size() < 2) {
+      // skip if there are not enough samples
+      return;
+    }
+
+    const char* name = bench_name.c_str();
+    int num_runs = static_cast<int>(throughput_ops_.size());
+
+    int ops_avg = static_cast<int>(CalcAvg(throughput_ops_));
+    int ops_confidence_95 = static_cast<int>(CalcConfidence95(throughput_ops_));
+
+    if (throughput_mbs_.size() == throughput_ops_.size()) {
+      double mbs_avg = CalcAvg(throughput_mbs_);
+      double mbs_confidence_95 = CalcConfidence95(throughput_mbs_);
+      fprintf(stdout,
+              "%s [CI95 %d runs] : (%d, %d) ops/sec; (%.1f, %.1f) MB/sec\n",
+              name, num_runs, ops_avg - ops_confidence_95,
+              ops_avg + ops_confidence_95, mbs_avg - mbs_confidence_95,
+              mbs_avg + mbs_confidence_95);
+    } else {
+      fprintf(stdout, "%s [CI95 %d runs] : (%d, %d) ops/sec\n", name, num_runs,
+              ops_avg - ops_confidence_95, ops_avg + ops_confidence_95);
+    }
+  }
+
+  void ReportFinal(const std::string& bench_name) {
+    if (throughput_ops_.size() < 2) {
+      // skip if there are not enough samples
+      return;
+    }
+
+    const char* name = bench_name.c_str();
+    int num_runs = static_cast<int>(throughput_ops_.size());
+
+    if (throughput_mbs_.size() == throughput_ops_.size()) {
+      // \xC2\xB1 is +/- character in UTF-8
+      fprintf(stdout,
+              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
+              "%.1f) MB/sec\n"
+              "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
+              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
+              static_cast<int>(CalcConfidence95(throughput_ops_)),
+              CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_), name,
+              num_runs, static_cast<int>(CalcMedian(throughput_ops_)),
+              CalcMedian(throughput_mbs_));
+    } else {
+      fprintf(stdout,
+              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec\n"
+              "%s [MEDIAN %d runs] : %d ops/sec\n",
+              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
+              static_cast<int>(CalcConfidence95(throughput_ops_)), name,
+              num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
+    }
+  }
+
+ private:
+  double CalcAvg(std::vector<double>& data) {
+    double avg = 0;
+    for (double x : data) {
+      avg += x;
+    }
+    avg = avg / data.size();
+    return avg;
+  }
+
+  // Calculates 95% CI assuming a normal distribution of samples.
+  // Samples are not from a normal distribution, but it still
+  // provides useful approximation.
+  double CalcConfidence95(std::vector<double>& data) {
+    assert(data.size() > 1);
+    double avg = CalcAvg(data);
+    double std_error = CalcStdDev(data, avg) / std::sqrt(data.size());
+
+    // Z score for the 97.5 percentile
+    // see https://en.wikipedia.org/wiki/1.96
+    return 1.959964 * std_error;
+  }
+
+  double CalcMedian(std::vector<double>& data) {
+    assert(data.size() > 0);
+    std::sort(data.begin(), data.end());
+
+    size_t mid = data.size() / 2;
+    if (data.size() % 2 == 1) {
+      // Odd number of entries
+      return data[mid];
+    } else {
+      // Even number of entries
+      return (data[mid] + data[mid - 1]) / 2;
+    }
+  }
+
+  double CalcStdDev(std::vector<double>& data, double average) {
+    assert(data.size() > 1);
+    double squared_sum = 0.0;
+    for (double x : data) {
+      squared_sum += std::pow(x - average, 2);
+    }
+
+    // using samples count - 1 following Bessel's correction
+    // see https://en.wikipedia.org/wiki/Bessel%27s_correction
+    return std::sqrt(squared_sum / (data.size() - 1));
+  }
+
+  std::vector<double> throughput_ops_;
+  std::vector<double> throughput_mbs_;
+};
+
+class TimestampEmulator {
+ private:
+  std::atomic<uint64_t> timestamp_;
+
+ public:
+  TimestampEmulator() : timestamp_(0) {}
+  uint64_t Get() const { return timestamp_.load(); }
+  void Inc() { timestamp_++; }
+  Slice Allocate(char* scratch) {
+    // TODO: support larger timestamp sizes
+    assert(FLAGS_user_timestamp_size == 8);
+    assert(scratch);
+    uint64_t ts = timestamp_.fetch_add(1);
+    EncodeFixed64(scratch, ts);
+    return Slice(scratch, FLAGS_user_timestamp_size);
+  }
+  Slice GetTimestampForRead(Random64& rand, char* scratch) {
+    assert(FLAGS_user_timestamp_size == 8);
+    assert(scratch);
+    if (FLAGS_read_with_latest_user_timestamp) {
+      return Allocate(scratch);
+    }
+    // Choose a random timestamp from the past.
+    uint64_t ts = rand.Next() % Get();
+    EncodeFixed64(scratch, ts);
+    return Slice(scratch, FLAGS_user_timestamp_size);
+  }
+};
+
+// State shared by all concurrent executions of the same benchmark.
+struct SharedState {
+  port::Mutex mu;
+  port::CondVar cv;
+  int total;
+  int perf_level;
+  std::shared_ptr<RateLimiter> write_rate_limiter;
+  std::shared_ptr<RateLimiter> read_rate_limiter;
+
+  // Each thread goes through the following states:
+  //    (1) initializing
+  //    (2) waiting for others to be initialized
+  //    (3) running
+  //    (4) done
+
+  long num_initialized;
+  long num_done;
+  bool start;
+
+  SharedState() : cv(&mu), perf_level(FLAGS_perf_level) {}
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  int tid;        // 0..n-1 when running in n threads
+  Random64 rand;  // Has different seeds for different threads
+  Stats stats;
+  SharedState* shared;
+
+  explicit ThreadState(int index, int my_seed)
+      : tid(index), rand(seed_base + my_seed) {}
+};
+
+class Duration {
+ public:
+  Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
+    max_seconds_ = max_seconds;
+    max_ops_ = max_ops;
+    ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
+    ops_ = 0;
+    start_at_ = FLAGS_env->NowMicros();
+  }
+
+  int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }
+
+  bool Done(int64_t increment) {
+    if (increment <= 0) increment = 1;  // avoid Done(0) and infinite loops
+    ops_ += increment;
+
+    if (max_seconds_) {
+      // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
+      auto granularity = FLAGS_ops_between_duration_checks;
+      if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
+        uint64_t now = FLAGS_env->NowMicros();
+        return ((now - start_at_) / 1000000) >= max_seconds_;
+      } else {
+        return false;
+      }
+    } else {
+      return ops_ > max_ops_;
+    }
+  }
+
+ private:
+  uint64_t max_seconds_;
+  int64_t max_ops_;
+  int64_t ops_per_stage_;
+  int64_t ops_;
+  uint64_t start_at_;
+};
+
+class Benchmark {
+ private:
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<Cache> compressed_cache_;
+  std::shared_ptr<const SliceTransform> prefix_extractor_;
+  DBWithColumnFamilies db_;
+  std::vector<DBWithColumnFamilies> multi_dbs_;
+  int64_t num_;
+  int key_size_;
+  int user_timestamp_size_;
+  int prefix_size_;
+  int total_thread_count_;
+  int64_t keys_per_prefix_;
+  int64_t entries_per_batch_;
+  int64_t writes_before_delete_range_;
+  int64_t writes_per_range_tombstone_;
+  int64_t range_tombstone_width_;
+  int64_t max_num_range_tombstones_;
+  ReadOptions read_options_;
+  WriteOptions write_options_;
+  Options open_options_;  // keep options around to properly destroy db later
+#ifndef ROCKSDB_LITE
+  TraceOptions trace_options_;
+  TraceOptions block_cache_trace_options_;
+#endif
+  int64_t reads_;
+  int64_t deletes_;
+  double read_random_exp_range_;
+  int64_t writes_;
+  int64_t readwrites_;
+  int64_t merge_keys_;
+  bool report_file_operations_;
+  bool use_blob_db_;    // Stacked BlobDB
+  bool read_operands_;  // read via GetMergeOperands()
+  std::vector<std::string> keys_;
+
+  class ErrorHandlerListener : public EventListener {
+   public:
+#ifndef ROCKSDB_LITE
+    ErrorHandlerListener()
+        : mutex_(),
+          cv_(&mutex_),
+          no_auto_recovery_(false),
+          recovery_complete_(false) {}
+
+    ~ErrorHandlerListener() override {}
+
+    const char* Name() const override { return kClassName(); }
+    static const char* kClassName() { return "ErrorHandlerListener"; }
+
+    void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
+                              Status /*bg_error*/,
+                              bool* auto_recovery) override {
+      if (*auto_recovery && no_auto_recovery_) {
+        *auto_recovery = false;
+      }
+    }
+
+    void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
+      InstrumentedMutexLock l(&mutex_);
+      recovery_complete_ = true;
+      cv_.SignalAll();
+    }
+
+    bool WaitForRecovery(uint64_t abs_time_us) {
+      InstrumentedMutexLock l(&mutex_);
+      if (!recovery_complete_) {
+        cv_.TimedWait(abs_time_us);
+      }
+      if (recovery_complete_) {
+        recovery_complete_ = false;
+        return true;
+      }
+      return false;
+    }
+
+    void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
+
+   private:
+    InstrumentedMutex mutex_;
+    InstrumentedCondVar cv_;
+    bool no_auto_recovery_;
+    bool recovery_complete_;
+#else   // ROCKSDB_LITE
+    bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
+    void EnableAutoRecovery(bool /*enable*/) {}
+#endif  // ROCKSDB_LITE
+  };
+
+  std::shared_ptr<ErrorHandlerListener> listener_;
+
+  std::unique_ptr<TimestampEmulator> mock_app_clock_;
+
+  bool SanityCheck() {
+    if (FLAGS_compression_ratio > 1) {
+      fprintf(stderr, "compression_ratio should be between 0 and 1\n");
+      return false;
+    }
+    return true;
+  }
+
+  inline bool CompressSlice(const CompressionInfo& compression_info,
+                            const Slice& input, std::string* compressed) {
+    constexpr uint32_t compress_format_version = 2;
+
+    return CompressData(input, compression_info, compress_format_version,
+                        compressed);
+  }
+
+  void PrintHeader(const Options& options) {
+    PrintEnvironment();
+    fprintf(stdout,
+            "Keys:       %d bytes each (+ %d bytes user-defined timestamp)\n",
+            FLAGS_key_size, FLAGS_user_timestamp_size);
+    auto avg_value_size = FLAGS_value_size;
+    if (FLAGS_value_size_distribution_type_e == kFixed) {
+      fprintf(stdout,
+              "Values:     %d bytes each (%d bytes after compression)\n",
+              avg_value_size,
+              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
+    } else {
+      avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
+      fprintf(stdout,
+              "Values:     %d avg bytes each (%d bytes after compression)\n",
+              avg_value_size,
+              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
+      fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
+              FLAGS_value_size_distribution_type.c_str(), FLAGS_value_size_min,
+              FLAGS_value_size_max);
+    }
+    fprintf(stdout, "Entries:    %" PRIu64 "\n", num_);
+    fprintf(stdout, "Prefix:    %d bytes\n", FLAGS_prefix_size);
+    fprintf(stdout, "Keys per prefix:    %" PRIu64 "\n", keys_per_prefix_);
+    fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
+            ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_) /
+             1048576.0));
+    fprintf(
+        stdout, "FileSize:   %.1f MB (estimated)\n",
+        (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) * num_) /
+         1048576.0));
+    fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
+            FLAGS_benchmark_write_rate_limit);
+    fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
+            FLAGS_benchmark_read_rate_limit);
+    if (FLAGS_enable_numa) {
+      fprintf(stderr, "Running in NUMA enabled mode.\n");
+#ifndef NUMA
+      fprintf(stderr, "NUMA is not defined in the system.\n");
+      exit(1);
+#else
+      if (numa_available() == -1) {
+        fprintf(stderr, "NUMA is not supported by the system.\n");
+        exit(1);
+      }
+#endif
+    }
+
+    auto compression = CompressionTypeToString(FLAGS_compression_type_e);
+    fprintf(stdout, "Compression: %s\n", compression.c_str());
+    fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
+            FLAGS_sample_for_compression);
+    if (options.memtable_factory != nullptr) {
+      fprintf(stdout, "Memtablerep: %s\n",
+              options.memtable_factory->GetId().c_str());
+    }
+    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
+
+    PrintWarnings(compression.c_str());
+    fprintf(stdout, "------------------------------------------------\n");
+  }
+
+  void PrintWarnings(const char* compression) {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+    fprintf(
+        stdout,
+        "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
+#endif
+#ifndef NDEBUG
+    fprintf(stdout,
+            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+    if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
+      // The test string should not be too small.
+      const int len = FLAGS_block_size;
+      std::string input_str(len, 'y');
+      std::string compressed;
+      CompressionOptions opts;
+      CompressionContext context(FLAGS_compression_type_e);
+      CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                           FLAGS_compression_type_e,
+                           FLAGS_sample_for_compression);
+      bool result = CompressSlice(info, Slice(input_str), &compressed);
+
+      if (!result) {
+        fprintf(stdout, "WARNING: %s compression is not enabled\n",
+                compression);
+      } else if (compressed.size() >= input_str.size()) {
+        fprintf(stdout, "WARNING: %s compression is not effective\n",
+                compression);
+      }
+    }
+  }
+
+// Current the following isn't equivalent to OS_LINUX.
+#if defined(__linux)
+  static Slice TrimSpace(Slice s) {
+    unsigned int start = 0;
+    while (start < s.size() && isspace(s[start])) {
+      start++;
+    }
+    unsigned int limit = static_cast<unsigned int>(s.size());
+    while (limit > start && isspace(s[limit - 1])) {
+      limit--;
+    }
+    return Slice(s.data() + start, limit - start);
+  }
+#endif
+
+  void PrintEnvironment() {
+    fprintf(stderr, "RocksDB:    version %s\n",
+            GetRocksVersionAsString(true).c_str());
+
+#if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
+    time_t now = time(nullptr);
+    char buf[52];
+    // Lint complains about ctime() usage, so replace it with ctime_r(). The
+    // requirement is to provide a buffer which is at least 26 bytes.
+    fprintf(stderr, "Date:       %s",
+            ctime_r(&now, buf));  // ctime_r() adds newline
+
+#if defined(__linux)
+    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
+    if (cpuinfo != nullptr) {
+      char line[1000];
+      int num_cpus = 0;
+      std::string cpu_type;
+      std::string cache_size;
+      while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
+        const char* sep = strchr(line, ':');
+        if (sep == nullptr) {
+          continue;
+        }
+        Slice key = TrimSpace(Slice(line, sep - 1 - line));
+        Slice val = TrimSpace(Slice(sep + 1));
+        if (key == "model name") {
+          ++num_cpus;
+          cpu_type = val.ToString();
+        } else if (key == "cache size") {
+          cache_size = val.ToString();
+        }
+      }
+      fclose(cpuinfo);
+      fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
+      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
+    }
+#elif defined(__APPLE__)
+    struct host_basic_info h;
+    size_t hlen = HOST_BASIC_INFO_COUNT;
+    if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h,
+                  (uint32_t*)&hlen) == KERN_SUCCESS) {
+      std::string cpu_type;
+      std::string cache_size;
+      size_t hcache_size;
+      hlen = sizeof(hcache_size);
+      if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) == 0) {
+        cache_size = std::to_string(hcache_size);
+      }
+      switch (h.cpu_type) {
+        case CPU_TYPE_X86_64:
+          cpu_type = "x86_64";
+          break;
+        case CPU_TYPE_ARM64:
+          cpu_type = "arm64";
+          break;
+        default:
+          break;
+      }
+      fprintf(stderr, "CPU:        %d * %s\n", h.max_cpus, cpu_type.c_str());
+      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
+    }
+#elif defined(__FreeBSD__)
+    int ncpus;
+    size_t len = sizeof(ncpus);
+    int mib[2] = {CTL_HW, HW_NCPU};
+    if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) {
+      char cpu_type[16];
+      len = sizeof(cpu_type) - 1;
+      mib[1] = HW_MACHINE;
+      if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0;
+
+      fprintf(stderr, "CPU:        %d * %s\n", ncpus, cpu_type);
+      // no programmatic way to get the cache line size except on PPC
+    }
+#endif
+#endif
+  }
+
+  static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
+                         const Slice& key) {
+    const char* pos = key.data();
+    pos += 8;
+    uint64_t timestamp = 0;
+    if (port::kLittleEndian) {
+      int bytes_to_fill = 8;
+      for (int i = 0; i < bytes_to_fill; ++i) {
+        timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
+                      << ((bytes_to_fill - i - 1) << 3));
+      }
+    } else {
+      memcpy(&timestamp, pos, sizeof(timestamp));
+    }
+    return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
+  }
+
+  class ExpiredTimeFilter : public CompactionFilter {
+   public:
+    explicit ExpiredTimeFilter(
+        const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
+        : timestamp_emulator_(timestamp_emulator) {}
+    bool Filter(int /*level*/, const Slice& key,
+                const Slice& /*existing_value*/, std::string* /*new_value*/,
+                bool* /*value_changed*/) const override {
+      return KeyExpired(timestamp_emulator_.get(), key);
+    }
+    const char* Name() const override { return "ExpiredTimeFilter"; }
+
+   private:
+    std::shared_ptr<TimestampEmulator> timestamp_emulator_;
+  };
+
+  class KeepFilter : public CompactionFilter {
+   public:
+    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+                std::string* /*new_value*/,
+                bool* /*value_changed*/) const override {
+      return false;
+    }
+
+    const char* Name() const override { return "KeepFilter"; }
+  };
+
+  static std::shared_ptr<MemoryAllocator> GetCacheAllocator() {
+    std::shared_ptr<MemoryAllocator> allocator;
+
+    if (FLAGS_use_cache_jemalloc_no_dump_allocator) {
+      JemallocAllocatorOptions jemalloc_options;
+      if (!NewJemallocNodumpAllocator(jemalloc_options, &allocator).ok()) {
+        fprintf(stderr, "JemallocNodumpAllocator not supported.\n");
+        exit(1);
+      }
+    } else if (FLAGS_use_cache_memkind_kmem_allocator) {
+#ifdef MEMKIND
+      allocator = std::make_shared<MemkindKmemAllocator>();
+#else
+      fprintf(stderr, "Memkind library is not linked with the binary.\n");
+      exit(1);
+#endif
+    }
+
+    return allocator;
+  }
+
+  static std::shared_ptr<Cache> NewCache(int64_t capacity) {
+    if (capacity <= 0) {
+      return nullptr;
+    }
+    if (FLAGS_cache_type == "clock_cache") {
+      fprintf(stderr, "Old clock cache implementation has been removed.\n");
+      exit(1);
+    } else if (FLAGS_cache_type == "hyper_clock_cache") {
+      return HyperClockCacheOptions(static_cast<size_t>(capacity),
+                                    FLAGS_block_size /*estimated_entry_charge*/,
+                                    FLAGS_cache_numshardbits)
+          .MakeSharedCache();
+    } else if (FLAGS_cache_type == "lru_cache") {
+      LRUCacheOptions opts(
+          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
+          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
+          GetCacheAllocator(), kDefaultToAdaptiveMutex,
+          kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio);
+
+#ifndef ROCKSDB_LITE
+      if (!FLAGS_secondary_cache_uri.empty()) {
+        Status s = SecondaryCache::CreateFromString(
+            ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
+        if (secondary_cache == nullptr) {
+          fprintf(
+              stderr,
+              "No secondary cache registered matching string: %s status=%s\n",
+              FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
+          exit(1);
+        }
+        opts.secondary_cache = secondary_cache;
+      }
+#endif  // ROCKSDB_LITE
+
+      if (FLAGS_use_compressed_secondary_cache) {
+        CompressedSecondaryCacheOptions secondary_cache_opts;
+        secondary_cache_opts.capacity = FLAGS_compressed_secondary_cache_size;
+        secondary_cache_opts.num_shard_bits =
+            FLAGS_compressed_secondary_cache_numshardbits;
+        secondary_cache_opts.high_pri_pool_ratio =
+            FLAGS_compressed_secondary_cache_high_pri_pool_ratio;
+        secondary_cache_opts.low_pri_pool_ratio =
+            FLAGS_compressed_secondary_cache_low_pri_pool_ratio;
+        secondary_cache_opts.compression_type =
+            FLAGS_compressed_secondary_cache_compression_type_e;
+        secondary_cache_opts.compress_format_version =
+            FLAGS_compressed_secondary_cache_compress_format_version;
+        opts.secondary_cache =
+            NewCompressedSecondaryCache(secondary_cache_opts);
+      }
+
+      return NewLRUCache(opts);
+    } else {
+      fprintf(stderr, "Cache type not supported.");
+      exit(1);
+    }
+  }
+
+ public:
+  Benchmark()
+      : cache_(NewCache(FLAGS_cache_size)),
+        compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
+        prefix_extractor_(FLAGS_prefix_size != 0
+                              ? NewFixedPrefixTransform(FLAGS_prefix_size)
+                              : nullptr),
+        num_(FLAGS_num),
+        key_size_(FLAGS_key_size),
+        user_timestamp_size_(FLAGS_user_timestamp_size),
+        prefix_size_(FLAGS_prefix_size),
+        total_thread_count_(0),
+        keys_per_prefix_(FLAGS_keys_per_prefix),
+        entries_per_batch_(1),
+        reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
+        read_random_exp_range_(0.0),
+        writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
+        readwrites_(
+            (FLAGS_writes < 0 && FLAGS_reads < 0)
+                ? FLAGS_num
+                : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
+        merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
+        report_file_operations_(FLAGS_report_file_operations),
+#ifndef ROCKSDB_LITE
+        use_blob_db_(FLAGS_use_blob_db),  // Stacked BlobDB
+#else
+        use_blob_db_(false),  // Stacked BlobDB
+#endif  // !ROCKSDB_LITE
+        read_operands_(false) {
+    // use simcache instead of cache
+    if (FLAGS_simcache_size >= 0) {
+      if (FLAGS_cache_numshardbits >= 1) {
+        cache_ =
+            NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
+      } else {
+        cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
+      }
+    }
+
+    if (report_file_operations_) {
+      FLAGS_env = new CompositeEnvWrapper(
+          FLAGS_env,
+          std::make_shared<CountedFileSystem>(FLAGS_env->GetFileSystem()));
+    }
+
+    if (FLAGS_prefix_size > FLAGS_key_size) {
+      fprintf(stderr, "prefix size is larger than key size");
+      exit(1);
+    }
+
+    std::vector<std::string> files;
+    FLAGS_env->GetChildren(FLAGS_db, &files);
+    for (size_t i = 0; i < files.size(); i++) {
+      if (Slice(files[i]).starts_with("heap-")) {
+        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
+      }
+    }
+    if (!FLAGS_use_existing_db) {
+      Options options;
+      options.env = FLAGS_env;
+      if (!FLAGS_wal_dir.empty()) {
+        options.wal_dir = FLAGS_wal_dir;
+      }
+#ifndef ROCKSDB_LITE
+      if (use_blob_db_) {
+        // Stacked BlobDB
+        blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
+      }
+#endif  // !ROCKSDB_LITE
+      DestroyDB(FLAGS_db, options);
+      if (!FLAGS_wal_dir.empty()) {
+        FLAGS_env->DeleteDir(FLAGS_wal_dir);
+      }
+
+      if (FLAGS_num_multi_db > 1) {
+        FLAGS_env->CreateDir(FLAGS_db);
+        if (!FLAGS_wal_dir.empty()) {
+          FLAGS_env->CreateDir(FLAGS_wal_dir);
+        }
+      }
+    }
+
+    listener_.reset(new ErrorHandlerListener());
+    if (user_timestamp_size_ > 0) {
+      mock_app_clock_.reset(new TimestampEmulator());
+    }
+  }
+
+  void DeleteDBs() {
+    db_.DeleteDBs();
+    for (const DBWithColumnFamilies& dbwcf : multi_dbs_) {
+      delete dbwcf.db;
+    }
+  }
+
+  ~Benchmark() {
+    DeleteDBs();
+    if (cache_.get() != nullptr) {
+      // Clear cache reference first
+      open_options_.write_buffer_manager.reset();
+      // this will leak, but we're shutting down so nobody cares
+      cache_->DisownData();
+    }
+  }
+
+  Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
+    char* data = new char[key_size_];
+    const char* const_data = data;
+    key_guard->reset(const_data);
+    return Slice(key_guard->get(), key_size_);
+  }
+
+  // Generate key according to the given specification and random number.
+  // The resulting key will have the following format:
+  //   - If keys_per_prefix_ is positive, extra trailing bytes are either cut
+  //     off or padded with '0'.
+  //     The prefix value is derived from key value.
+  //     ----------------------------
+  //     | prefix 00000 | key 00000 |
+  //     ----------------------------
+  //
+  //   - If keys_per_prefix_ is 0, the key is simply a binary representation of
+  //     random number followed by trailing '0's
+  //     ----------------------------
+  //     |        key 00000         |
+  //     ----------------------------
+  void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
+    if (!keys_.empty()) {
+      assert(FLAGS_use_existing_keys);
+      assert(keys_.size() == static_cast<size_t>(num_keys));
+      assert(v < static_cast<uint64_t>(num_keys));
+      *key = keys_[v];
+      return;
+    }
+    char* start = const_cast<char*>(key->data());
+    char* pos = start;
+    if (keys_per_prefix_ > 0) {
+      int64_t num_prefix = num_keys / keys_per_prefix_;
+      int64_t prefix = v % num_prefix;
+      int bytes_to_fill = std::min(prefix_size_, 8);
+      if (port::kLittleEndian) {
+        for (int i = 0; i < bytes_to_fill; ++i) {
+          pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
+        }
+      } else {
+        memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
+      }
+      if (prefix_size_ > 8) {
+        // fill the rest with 0s
+        memset(pos + 8, '0', prefix_size_ - 8);
+      }
+      pos += prefix_size_;
+    }
+
+    int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
+    if (port::kLittleEndian) {
+      for (int i = 0; i < bytes_to_fill; ++i) {
+        pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
+      }
+    } else {
+      memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
+    }
+    pos += bytes_to_fill;
+    if (key_size_ > pos - start) {
+      memset(pos, '0', key_size_ - (pos - start));
+    }
+  }
+
+  void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
+    GenerateKeyFromInt(v, num_keys, key);
+    if (FLAGS_seek_missing_prefix) {
+      assert(prefix_size_ > 8);
+      char* key_ptr = const_cast<char*>(key->data());
+      // This rely on GenerateKeyFromInt filling paddings with '0's.
+      // Putting a '1' will create a non-existing prefix.
+      key_ptr[8] = '1';
+    }
+  }
+
+  std::string GetPathForMultiple(std::string base_name, size_t id) {
+    if (!base_name.empty()) {
+#ifndef OS_WIN
+      if (base_name.back() != '/') {
+        base_name += '/';
+      }
+#else
+      if (base_name.back() != '\\') {
+        base_name += '\\';
+      }
+#endif
+    }
+    return base_name + std::to_string(id);
+  }
+
+  void VerifyDBFromDB(std::string& truth_db_name) {
+    DBWithColumnFamilies truth_db;
+    auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
+    std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
+    // Verify that all the key/values in truth_db are retrivable in db with
+    // ::Get
+    fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
+    for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
+      std::string value;
+      s = db_.db->Get(ro, truth_iter->key(), &value);
+      assert(s.ok());
+      // TODO(myabandeh): provide debugging hints
+      assert(Slice(value) == truth_iter->value());
+    }
+    // Verify that the db iterator does not give any extra key/value
+    fprintf(stderr, "Verifying db == truth_db...\n");
+    for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
+         db_iter->Next(), truth_iter->Next()) {
+      assert(truth_iter->Valid());
+      assert(truth_iter->value() == db_iter->value());
+    }
+    // No more key should be left unchecked in truth_db
+    assert(!truth_iter->Valid());
+    fprintf(stderr, "...Verified\n");
+  }
+
+  void ErrorExit() {
+    DeleteDBs();
+    exit(1);
+  }
+
+  void Run() {
+    if (!SanityCheck()) {
+      ErrorExit();
+    }
+    Open(&open_options_);
+    PrintHeader(open_options_);
+    std::stringstream benchmark_stream(FLAGS_benchmarks);
+    std::string name;
+    std::unique_ptr<ExpiredTimeFilter> filter;
+    while (std::getline(benchmark_stream, name, ',')) {
+      // Sanitize parameters
+      num_ = FLAGS_num;
+      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
+      writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
+      deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
+      value_size = FLAGS_value_size;
+      key_size_ = FLAGS_key_size;
+      entries_per_batch_ = FLAGS_batch_size;
+      writes_before_delete_range_ = FLAGS_writes_before_delete_range;
+      writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
+      range_tombstone_width_ = FLAGS_range_tombstone_width;
+      max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
+      write_options_ = WriteOptions();
+      read_random_exp_range_ = FLAGS_read_random_exp_range;
+      if (FLAGS_sync) {
+        write_options_.sync = true;
+      }
+      write_options_.disableWAL = FLAGS_disable_wal;
+      write_options_.rate_limiter_priority =
+          FLAGS_rate_limit_auto_wal_flush ? Env::IO_USER : Env::IO_TOTAL;
+      read_options_ = ReadOptions(FLAGS_verify_checksum, true);
+      read_options_.total_order_seek = FLAGS_total_order_seek;
+      read_options_.prefix_same_as_start = FLAGS_prefix_same_as_start;
+      read_options_.rate_limiter_priority =
+          FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+      read_options_.tailing = FLAGS_use_tailing_iterator;
+      read_options_.readahead_size = FLAGS_readahead_size;
+      read_options_.adaptive_readahead = FLAGS_adaptive_readahead;
+      read_options_.async_io = FLAGS_async_io;
+      read_options_.optimize_multiget_for_io = FLAGS_optimize_multiget_for_io;
+
+      void (Benchmark::*method)(ThreadState*) = nullptr;
+      void (Benchmark::*post_process_method)() = nullptr;
+
+      bool fresh_db = false;
+      int num_threads = FLAGS_threads;
+
+      int num_repeat = 1;
+      int num_warmup = 0;
+      if (!name.empty() && *name.rbegin() == ']') {
+        auto it = name.find('[');
+        if (it == std::string::npos) {
+          fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
+          ErrorExit();
+        }
+        std::string args = name.substr(it + 1);
+        args.resize(args.size() - 1);
+        name.resize(it);
+
+        std::string bench_arg;
+        std::stringstream args_stream(args);
+        while (std::getline(args_stream, bench_arg, '-')) {
+          if (bench_arg.empty()) {
+            continue;
+          }
+          if (bench_arg[0] == 'X') {
+            // Repeat the benchmark n times
+            std::string num_str = bench_arg.substr(1);
+            num_repeat = std::stoi(num_str);
+          } else if (bench_arg[0] == 'W') {
+            // Warm up the benchmark for n times
+            std::string num_str = bench_arg.substr(1);
+            num_warmup = std::stoi(num_str);
+          }
+        }
+      }
+
+      // Both fillseqdeterministic and filluniquerandomdeterministic
+      // fill the levels except the max level with UNIQUE_RANDOM
+      // and fill the max level with fillseq and filluniquerandom, respectively
+      if (name == "fillseqdeterministic" ||
+          name == "filluniquerandomdeterministic") {
+        if (!FLAGS_disable_auto_compactions) {
+          fprintf(stderr,
+                  "Please disable_auto_compactions in FillDeterministic "
+                  "benchmark\n");
+          ErrorExit();
+        }
+        if (num_threads > 1) {
+          fprintf(stderr,
+                  "filldeterministic multithreaded not supported"
+                  ", use 1 thread\n");
+          num_threads = 1;
+        }
+        fresh_db = true;
+        if (name == "fillseqdeterministic") {
+          method = &Benchmark::WriteSeqDeterministic;
+        } else {
+          method = &Benchmark::WriteUniqueRandomDeterministic;
+        }
+      } else if (name == "fillseq") {
+        fresh_db = true;
+        method = &Benchmark::WriteSeq;
+      } else if (name == "fillbatch") {
+        fresh_db = true;
+        entries_per_batch_ = 1000;
+        method = &Benchmark::WriteSeq;
+      } else if (name == "fillrandom") {
+        fresh_db = true;
+        method = &Benchmark::WriteRandom;
+      } else if (name == "filluniquerandom" ||
+                 name == "fillanddeleteuniquerandom") {
+        fresh_db = true;
+        if (num_threads > 1) {
+          fprintf(stderr,
+                  "filluniquerandom and fillanddeleteuniquerandom "
+                  "multithreaded not supported, use 1 thread");
+          num_threads = 1;
+        }
+        method = &Benchmark::WriteUniqueRandom;
+      } else if (name == "overwrite") {
+        method = &Benchmark::WriteRandom;
+      } else if (name == "fillsync") {
+        fresh_db = true;
+        num_ /= 1000;
+        write_options_.sync = true;
+        method = &Benchmark::WriteRandom;
+      } else if (name == "fill100K") {
+        fresh_db = true;
+        num_ /= 1000;
+        value_size = 100 * 1000;
+        method = &Benchmark::WriteRandom;
+      } else if (name == "readseq") {
+        method = &Benchmark::ReadSequential;
+      } else if (name == "readtorowcache") {
+        if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
+          fprintf(stderr,
+                  "Please set use_existing_keys to true and specify a "
+                  "row cache size in readtorowcache benchmark\n");
+          ErrorExit();
+        }
+        method = &Benchmark::ReadToRowCache;
+      } else if (name == "readtocache") {
+        method = &Benchmark::ReadSequential;
+        num_threads = 1;
+        reads_ = num_;
+      } else if (name == "readreverse") {
+        method = &Benchmark::ReadReverse;
+      } else if (name == "readrandom") {
+        if (FLAGS_multiread_stride) {
+          fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
+                  entries_per_batch_);
+        }
+        method = &Benchmark::ReadRandom;
+      } else if (name == "readrandomfast") {
+        method = &Benchmark::ReadRandomFast;
+      } else if (name == "multireadrandom") {
+        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
+                entries_per_batch_);
+        method = &Benchmark::MultiReadRandom;
+      } else if (name == "multireadwhilewriting") {
+        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
+                entries_per_batch_);
+        num_threads++;
+        method = &Benchmark::MultiReadWhileWriting;
+      } else if (name == "approximatesizerandom") {
+        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
+                entries_per_batch_);
+        method = &Benchmark::ApproximateSizeRandom;
+      } else if (name == "mixgraph") {
+        method = &Benchmark::MixGraph;
+      } else if (name == "readmissing") {
+        ++key_size_;
+        method = &Benchmark::ReadRandom;
+      } else if (name == "newiterator") {
+        method = &Benchmark::IteratorCreation;
+      } else if (name == "newiteratorwhilewriting") {
+        num_threads++;  // Add extra thread for writing
+        method = &Benchmark::IteratorCreationWhileWriting;
+      } else if (name == "seekrandom") {
+        method = &Benchmark::SeekRandom;
+      } else if (name == "seekrandomwhilewriting") {
+        num_threads++;  // Add extra thread for writing
+        method = &Benchmark::SeekRandomWhileWriting;
+      } else if (name == "seekrandomwhilemerging") {
+        num_threads++;  // Add extra thread for merging
+        method = &Benchmark::SeekRandomWhileMerging;
+      } else if (name == "readrandomsmall") {
+        reads_ /= 1000;
+        method = &Benchmark::ReadRandom;
+      } else if (name == "deleteseq") {
+        method = &Benchmark::DeleteSeq;
+      } else if (name == "deleterandom") {
+        method = &Benchmark::DeleteRandom;
+      } else if (name == "readwhilewriting") {
+        num_threads++;  // Add extra thread for writing
+        method = &Benchmark::ReadWhileWriting;
+      } else if (name == "readwhilemerging") {
+        num_threads++;  // Add extra thread for writing
+        method = &Benchmark::ReadWhileMerging;
+      } else if (name == "readwhilescanning") {
+        num_threads++;  // Add extra thread for scaning
+        method = &Benchmark::ReadWhileScanning;
+      } else if (name == "readrandomwriterandom") {
+        method = &Benchmark::ReadRandomWriteRandom;
+      } else if (name == "readrandommergerandom") {
+        if (FLAGS_merge_operator.empty()) {
+          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
+                  name.c_str());
+          ErrorExit();
+        }
+        method = &Benchmark::ReadRandomMergeRandom;
+      } else if (name == "updaterandom") {
+        method = &Benchmark::UpdateRandom;
+      } else if (name == "xorupdaterandom") {
+        method = &Benchmark::XORUpdateRandom;
+      } else if (name == "appendrandom") {
+        method = &Benchmark::AppendRandom;
+      } else if (name == "mergerandom") {
+        if (FLAGS_merge_operator.empty()) {
+          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
+                  name.c_str());
+          exit(1);
+        }
+        method = &Benchmark::MergeRandom;
+      } else if (name == "randomwithverify") {
+        method = &Benchmark::RandomWithVerify;
+      } else if (name == "fillseekseq") {
+        method = &Benchmark::WriteSeqSeekSeq;
+      } else if (name == "compact") {
+        method = &Benchmark::Compact;
+      } else if (name == "compactall") {
+        CompactAll();
+#ifndef ROCKSDB_LITE
+      } else if (name == "compact0") {
+        CompactLevel(0);
+      } else if (name == "compact1") {
+        CompactLevel(1);
+      } else if (name == "waitforcompaction") {
+        WaitForCompaction();
+#endif
+      } else if (name == "flush") {
+        Flush();
+      } else if (name == "crc32c") {
+        method = &Benchmark::Crc32c;
+      } else if (name == "xxhash") {
+        method = &Benchmark::xxHash;
+      } else if (name == "xxhash64") {
+        method = &Benchmark::xxHash64;
+      } else if (name == "xxh3") {
+        method = &Benchmark::xxh3;
+      } else if (name == "acquireload") {
+        method = &Benchmark::AcquireLoad;
+      } else if (name == "compress") {
+        method = &Benchmark::Compress;
+      } else if (name == "uncompress") {
+        method = &Benchmark::Uncompress;
+#ifndef ROCKSDB_LITE
+      } else if (name == "randomtransaction") {
+        method = &Benchmark::RandomTransaction;
+        post_process_method = &Benchmark::RandomTransactionVerify;
+#endif  // ROCKSDB_LITE
+      } else if (name == "randomreplacekeys") {
+        fresh_db = true;
+        method = &Benchmark::RandomReplaceKeys;
+      } else if (name == "timeseries") {
+        timestamp_emulator_.reset(new TimestampEmulator());
+        if (FLAGS_expire_style == "compaction_filter") {
+          filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
+          fprintf(stdout, "Compaction filter is used to remove expired data");
+          open_options_.compaction_filter = filter.get();
+        }
+        fresh_db = true;
+        method = &Benchmark::TimeSeries;
+      } else if (name == "block_cache_entry_stats") {
+        // DB::Properties::kBlockCacheEntryStats
+        PrintStats("rocksdb.block-cache-entry-stats");
+      } else if (name == "stats") {
+        PrintStats("rocksdb.stats");
+      } else if (name == "resetstats") {
+        ResetStats();
+      } else if (name == "verify") {
+        VerifyDBFromDB(FLAGS_truth_db);
+      } else if (name == "levelstats") {
+        PrintStats("rocksdb.levelstats");
+      } else if (name == "memstats") {
+        std::vector<std::string> keys{"rocksdb.num-immutable-mem-table",
+                                      "rocksdb.cur-size-active-mem-table",
+                                      "rocksdb.cur-size-all-mem-tables",
+                                      "rocksdb.size-all-mem-tables",
+                                      "rocksdb.num-entries-active-mem-table",
+                                      "rocksdb.num-entries-imm-mem-tables"};
+        PrintStats(keys);
+      } else if (name == "sstables") {
+        PrintStats("rocksdb.sstables");
+      } else if (name == "stats_history") {
+        PrintStatsHistory();
+#ifndef ROCKSDB_LITE
+      } else if (name == "replay") {
+        if (num_threads > 1) {
+          fprintf(stderr, "Multi-threaded replay is not yet supported\n");
+          ErrorExit();
+        }
+        if (FLAGS_trace_file == "") {
+          fprintf(stderr, "Please set --trace_file to be replayed from\n");
+          ErrorExit();
+        }
+        method = &Benchmark::Replay;
+#endif  // ROCKSDB_LITE
+      } else if (name == "getmergeoperands") {
+        method = &Benchmark::GetMergeOperands;
+#ifndef ROCKSDB_LITE
+      } else if (name == "verifychecksum") {
+        method = &Benchmark::VerifyChecksum;
+      } else if (name == "verifyfilechecksums") {
+        method = &Benchmark::VerifyFileChecksums;
+#endif  // ROCKSDB_LITE
+      } else if (name == "readrandomoperands") {
+        read_operands_ = true;
+        method = &Benchmark::ReadRandom;
+#ifndef ROCKSDB_LITE
+      } else if (name == "backup") {
+        method = &Benchmark::Backup;
+      } else if (name == "restore") {
+        method = &Benchmark::Restore;
+#endif
+      } else if (!name.empty()) {  // No error message for empty name
+        fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
+        ErrorExit();
+      }
+
+      if (fresh_db) {
+        if (FLAGS_use_existing_db) {
+          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
+                  name.c_str());
+          method = nullptr;
+        } else {
+          if (db_.db != nullptr) {
+            db_.DeleteDBs();
+            DestroyDB(FLAGS_db, open_options_);
+          }
+          Options options = open_options_;
+          for (size_t i = 0; i < multi_dbs_.size(); i++) {
+            delete multi_dbs_[i].db;
+            if (!open_options_.wal_dir.empty()) {
+              options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
+            }
+            DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
+          }
+          multi_dbs_.clear();
+        }
+        Open(&open_options_);  // use open_options for the last accessed
+      }
+
+      if (method != nullptr) {
+        fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
+
+#ifndef ROCKSDB_LITE
+        if (name == "backup") {
+          std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl;
+        } else if (name == "restore") {
+          std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl;
+          std::cout << "Restore path: [" << FLAGS_restore_dir << "]"
+                    << std::endl;
+        }
+        // A trace_file option can be provided both for trace and replay
+        // operations. But db_bench does not support tracing and replaying at
+        // the same time, for now. So, start tracing only when it is not a
+        // replay.
+        if (FLAGS_trace_file != "" && name != "replay") {
+          std::unique_ptr<TraceWriter> trace_writer;
+          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
+                                        FLAGS_trace_file, &trace_writer);
+          if (!s.ok()) {
+            fprintf(stderr, "Encountered an error starting a trace, %s\n",
+                    s.ToString().c_str());
+            ErrorExit();
+          }
+          s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
+          if (!s.ok()) {
+            fprintf(stderr, "Encountered an error starting a trace, %s\n",
+                    s.ToString().c_str());
+            ErrorExit();
+          }
+          fprintf(stdout, "Tracing the workload to: [%s]\n",
+                  FLAGS_trace_file.c_str());
+        }
+        // Start block cache tracing.
+        if (!FLAGS_block_cache_trace_file.empty()) {
+          // Sanity checks.
+          if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
+            fprintf(stderr,
+                    "Block cache trace sampling frequency must be higher than "
+                    "0.\n");
+            ErrorExit();
+          }
+          if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
+            fprintf(stderr,
+                    "The maximum file size for block cache tracing must be "
+                    "higher than 0.\n");
+            ErrorExit();
+          }
+          block_cache_trace_options_.max_trace_file_size =
+              FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
+          block_cache_trace_options_.sampling_frequency =
+              FLAGS_block_cache_trace_sampling_frequency;
+          std::unique_ptr<TraceWriter> block_cache_trace_writer;
+          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
+                                        FLAGS_block_cache_trace_file,
+                                        &block_cache_trace_writer);
+          if (!s.ok()) {
+            fprintf(stderr,
+                    "Encountered an error when creating trace writer, %s\n",
+                    s.ToString().c_str());
+            ErrorExit();
+          }
+          s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
+                                           std::move(block_cache_trace_writer));
+          if (!s.ok()) {
+            fprintf(
+                stderr,
+                "Encountered an error when starting block cache tracing, %s\n",
+                s.ToString().c_str());
+            ErrorExit();
+          }
+          fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
+                  FLAGS_block_cache_trace_file.c_str());
+        }
+#endif  // ROCKSDB_LITE
+
+        if (num_warmup > 0) {
+          printf("Warming up benchmark by running %d times\n", num_warmup);
+        }
+
+        for (int i = 0; i < num_warmup; i++) {
+          RunBenchmark(num_threads, name, method);
+        }
+
+        if (num_repeat > 1) {
+          printf("Running benchmark for %d times\n", num_repeat);
+        }
+
+        CombinedStats combined_stats;
+        for (int i = 0; i < num_repeat; i++) {
+          Stats stats = RunBenchmark(num_threads, name, method);
+          combined_stats.AddStats(stats);
+          if (FLAGS_confidence_interval_only) {
+            combined_stats.ReportWithConfidenceIntervals(name);
+          } else {
+            combined_stats.Report(name);
+          }
+        }
+        if (num_repeat > 1) {
+          combined_stats.ReportFinal(name);
+        }
+      }
+      if (post_process_method != nullptr) {
+        (this->*post_process_method)();
+      }
+    }
+
+    if (secondary_update_thread_) {
+      secondary_update_stopped_.store(1, std::memory_order_relaxed);
+      secondary_update_thread_->join();
+      secondary_update_thread_.reset();
+    }
+
+#ifndef ROCKSDB_LITE
+    if (name != "replay" && FLAGS_trace_file != "") {
+      Status s = db_.db->EndTrace();
+      if (!s.ok()) {
+        fprintf(stderr, "Encountered an error ending the trace, %s\n",
+                s.ToString().c_str());
+      }
+    }
+    if (!FLAGS_block_cache_trace_file.empty()) {
+      Status s = db_.db->EndBlockCacheTrace();
+      if (!s.ok()) {
+        fprintf(stderr,
+                "Encountered an error ending the block cache tracing, %s\n",
+                s.ToString().c_str());
+      }
+    }
+#endif  // ROCKSDB_LITE
+
+    if (FLAGS_statistics) {
+      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+    }
+    if (FLAGS_simcache_size >= 0) {
+      fprintf(
+          stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
+          static_cast_with_check<SimCache>(cache_.get())->ToString().c_str());
+    }
+
+#ifndef ROCKSDB_LITE
+    if (FLAGS_use_secondary_db) {
+      fprintf(stdout, "Secondary instance updated  %" PRIu64 " times.\n",
+              secondary_db_updates_);
+    }
+#endif  // ROCKSDB_LITE
+  }
+
+ private:
+  std::shared_ptr<TimestampEmulator> timestamp_emulator_;
+  std::unique_ptr<port::Thread> secondary_update_thread_;
+  std::atomic<int> secondary_update_stopped_{0};
+#ifndef ROCKSDB_LITE
+  uint64_t secondary_db_updates_ = 0;
+#endif  // ROCKSDB_LITE
+  struct ThreadArg {
+    Benchmark* bm;
+    SharedState* shared;
+    ThreadState* thread;
+    void (Benchmark::*method)(ThreadState*);
+  };
+
+  static void ThreadBody(void* v) {
+    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
+    SharedState* shared = arg->shared;
+    ThreadState* thread = arg->thread;
+    {
+      MutexLock l(&shared->mu);
+      shared->num_initialized++;
+      if (shared->num_initialized >= shared->total) {
+        shared->cv.SignalAll();
+      }
+      while (!shared->start) {
+        shared->cv.Wait();
+      }
+    }
+
+    SetPerfLevel(static_cast<PerfLevel>(shared->perf_level));
+    perf_context.EnablePerLevelPerfContext();
+    thread->stats.Start(thread->tid);
+    (arg->bm->*(arg->method))(thread);
+    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
+      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
+                               get_perf_context()->ToString());
+    }
+    thread->stats.Stop();
+
+    {
+      MutexLock l(&shared->mu);
+      shared->num_done++;
+      if (shared->num_done >= shared->total) {
+        shared->cv.SignalAll();
+      }
+    }
+  }
+
+  Stats RunBenchmark(int n, Slice name,
+                     void (Benchmark::*method)(ThreadState*)) {
+    SharedState shared;
+    shared.total = n;
+    shared.num_initialized = 0;
+    shared.num_done = 0;
+    shared.start = false;
+    if (FLAGS_benchmark_write_rate_limit > 0) {
+      shared.write_rate_limiter.reset(
+          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
+    }
+    if (FLAGS_benchmark_read_rate_limit > 0) {
+      shared.read_rate_limiter.reset(NewGenericRateLimiter(
+          FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
+          10 /* fairness */, RateLimiter::Mode::kReadsOnly));
+    }
+
+    std::unique_ptr<ReporterAgent> reporter_agent;
+    if (FLAGS_report_interval_seconds > 0) {
+      reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
+                                             FLAGS_report_interval_seconds));
+    }
+
+    ThreadArg* arg = new ThreadArg[n];
+
+    for (int i = 0; i < n; i++) {
+#ifdef NUMA
+      if (FLAGS_enable_numa) {
+        // Performs a local allocation of memory to threads in numa node.
+        int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
+        numa_exit_on_error = 1;
+        int numa_node = i % n_nodes;
+        bitmask* nodes = numa_allocate_nodemask();
+        numa_bitmask_clearall(nodes);
+        numa_bitmask_setbit(nodes, numa_node);
+        // numa_bind() call binds the process to the node and these
+        // properties are passed on to the thread that is created in
+        // StartThread method called later in the loop.
+        numa_bind(nodes);
+        numa_set_strict(1);
+        numa_free_nodemask(nodes);
+      }
+#endif
+      arg[i].bm = this;
+      arg[i].method = method;
+      arg[i].shared = &shared;
+      total_thread_count_++;
+      arg[i].thread = new ThreadState(i, total_thread_count_);
+      arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
+      arg[i].thread->shared = &shared;
+      FLAGS_env->StartThread(ThreadBody, &arg[i]);
+    }
+
+    shared.mu.Lock();
+    while (shared.num_initialized < n) {
+      shared.cv.Wait();
+    }
+
+    shared.start = true;
+    shared.cv.SignalAll();
+    while (shared.num_done < n) {
+      shared.cv.Wait();
+    }
+    shared.mu.Unlock();
+
+    // Stats for some threads can be excluded.
+    Stats merge_stats;
+    for (int i = 0; i < n; i++) {
+      merge_stats.Merge(arg[i].thread->stats);
+    }
+    merge_stats.Report(name);
+
+    for (int i = 0; i < n; i++) {
+      delete arg[i].thread;
+    }
+    delete[] arg;
+
+    return merge_stats;
+  }
+
+  template <OperationType kOpType, typename FnType, typename... Args>
+  static inline void ChecksumBenchmark(FnType fn, ThreadState* thread,
+                                       Args... args) {
+    const int size = FLAGS_block_size;  // use --block_size option for db_bench
+    std::string labels = "(" + std::to_string(FLAGS_block_size) + " per op)";
+    const char* label = labels.c_str();
+
+    std::string data(size, 'x');
+    uint64_t bytes = 0;
+    uint32_t val = 0;
+    while (bytes < 5000U * uint64_t{1048576}) {  // ~5GB
+      val += static_cast<uint32_t>(fn(data.data(), size, args...));
+      thread->stats.FinishedOps(nullptr, nullptr, 1, kOpType);
+      bytes += size;
+    }
+    // Print so result is not dead
+    fprintf(stderr, "... val=0x%x\r", static_cast<unsigned int>(val));
+
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(label);
+  }
+
+  void Crc32c(ThreadState* thread) {
+    ChecksumBenchmark<kCrc>(crc32c::Value, thread);
+  }
+
+  void xxHash(ThreadState* thread) {
+    ChecksumBenchmark<kHash>(XXH32, thread, /*seed*/ 0);
+  }
+
+  void xxHash64(ThreadState* thread) {
+    ChecksumBenchmark<kHash>(XXH64, thread, /*seed*/ 0);
+  }
+
+  void xxh3(ThreadState* thread) {
+    ChecksumBenchmark<kHash>(XXH3_64bits, thread);
+  }
+
+  void AcquireLoad(ThreadState* thread) {
+    int dummy;
+    std::atomic<void*> ap(&dummy);
+    int count = 0;
+    void* ptr = nullptr;
+    thread->stats.AddMessage("(each op is 1000 loads)");
+    while (count < 100000) {
+      for (int i = 0; i < 1000; i++) {
+        ptr = ap.load(std::memory_order_acquire);
+      }
+      count++;
+      thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
+    }
+    if (ptr == nullptr) exit(1);  // Disable unused variable warning.
+  }
+
+  void Compress(ThreadState* thread) {
+    RandomGenerator gen;
+    Slice input = gen.Generate(FLAGS_block_size);
+    int64_t bytes = 0;
+    int64_t produced = 0;
+    bool ok = true;
+    std::string compressed;
+    CompressionOptions opts;
+    CompressionContext context(FLAGS_compression_type_e);
+    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+                         FLAGS_compression_type_e,
+                         FLAGS_sample_for_compression);
+    // Compress 1G
+    while (ok && bytes < int64_t(1) << 30) {
+      compressed.clear();
+      ok = CompressSlice(info, input, &compressed);
+      produced += compressed.size();
+      bytes += input.size();
+      thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
+    }
+
+    if (!ok) {
+      thread->stats.AddMessage("(compression failure)");
+    } else {
+      char buf[340];
+      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
+               (produced * 100.0) / bytes);
+      thread->stats.AddMessage(buf);
+      thread->stats.AddBytes(bytes);
+    }
+  }
+
+  void Uncompress(ThreadState* thread) {
+    RandomGenerator gen;
+    Slice input = gen.Generate(FLAGS_block_size);
+    std::string compressed;
+
+    CompressionContext compression_ctx(FLAGS_compression_type_e);
+    CompressionOptions compression_opts;
+    CompressionInfo compression_info(
+        compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
+        FLAGS_compression_type_e, FLAGS_sample_for_compression);
+    UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
+    UncompressionInfo uncompression_info(uncompression_ctx,
+                                         UncompressionDict::GetEmptyDict(),
+                                         FLAGS_compression_type_e);
+
+    bool ok = CompressSlice(compression_info, input, &compressed);
+    int64_t bytes = 0;
+    size_t uncompressed_size = 0;
+    while (ok && bytes < 1024 * 1048576) {
+      constexpr uint32_t compress_format_version = 2;
+
+      CacheAllocationPtr uncompressed = UncompressData(
+          uncompression_info, compressed.data(), compressed.size(),
+          &uncompressed_size, compress_format_version);
+
+      ok = uncompressed.get() != nullptr;
+      bytes += input.size();
+      thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
+    }
+
+    if (!ok) {
+      thread->stats.AddMessage("(compression failure)");
+    } else {
+      thread->stats.AddBytes(bytes);
+    }
+  }
+
+  // Returns true if the options is initialized from the specified
+  // options file.
+  bool InitializeOptionsFromFile(Options* opts) {
+#ifndef ROCKSDB_LITE
+    printf("Initializing RocksDB Options from the specified file\n");
+    DBOptions db_opts;
+    std::vector<ColumnFamilyDescriptor> cf_descs;
+    if (FLAGS_options_file != "") {
+      auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
+                                   &cf_descs);
+      db_opts.env = FLAGS_env;
+      if (s.ok()) {
+        *opts = Options(db_opts, cf_descs[0].options);
+        return true;
+      }
+      fprintf(stderr, "Unable to load options file %s --- %s\n",
+              FLAGS_options_file.c_str(), s.ToString().c_str());
+      exit(1);
+    }
+#else
+    (void)opts;
+#endif
+    return false;
+  }
+
+  void InitializeOptionsFromFlags(Options* opts) {
+    printf("Initializing RocksDB Options from command-line flags\n");
+    Options& options = *opts;
+    ConfigOptions config_options(options);
+    config_options.ignore_unsupported_options = false;
+
+    assert(db_.db == nullptr);
+
+    options.env = FLAGS_env;
+    options.wal_dir = FLAGS_wal_dir;
+    options.dump_malloc_stats = FLAGS_dump_malloc_stats;
+    options.stats_dump_period_sec =
+        static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
+    options.stats_persist_period_sec =
+        static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
+    options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
+    options.stats_history_buffer_size =
+        static_cast<size_t>(FLAGS_stats_history_buffer_size);
+    options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;
+
+    options.compression_opts.level = FLAGS_compression_level;
+    options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
+    options.compression_opts.zstd_max_train_bytes =
+        FLAGS_compression_zstd_max_train_bytes;
+    options.compression_opts.parallel_threads =
+        FLAGS_compression_parallel_threads;
+    options.compression_opts.max_dict_buffer_bytes =
+        FLAGS_compression_max_dict_buffer_bytes;
+    options.compression_opts.use_zstd_dict_trainer =
+        FLAGS_compression_use_zstd_dict_trainer;
+
+    options.max_open_files = FLAGS_open_files;
+    if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
+      options.write_buffer_manager.reset(
+          new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
+    }
+    options.arena_block_size = FLAGS_arena_block_size;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+        FLAGS_min_write_buffer_number_to_merge;
+    options.max_write_buffer_number_to_maintain =
+        FLAGS_max_write_buffer_number_to_maintain;
+    options.max_write_buffer_size_to_maintain =
+        FLAGS_max_write_buffer_size_to_maintain;
+    options.max_background_jobs = FLAGS_max_background_jobs;
+    options.max_background_compactions = FLAGS_max_background_compactions;
+    options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
+    options.max_background_flushes = FLAGS_max_background_flushes;
+    options.compaction_style = FLAGS_compaction_style_e;
+    options.compaction_pri = FLAGS_compaction_pri_e;
+    options.allow_mmap_reads = FLAGS_mmap_read;
+    options.allow_mmap_writes = FLAGS_mmap_write;
+    options.use_direct_reads = FLAGS_use_direct_reads;
+    options.use_direct_io_for_flush_and_compaction =
+        FLAGS_use_direct_io_for_flush_and_compaction;
+    options.manual_wal_flush = FLAGS_manual_wal_flush;
+    options.wal_compression = FLAGS_wal_compression_e;
+#ifndef ROCKSDB_LITE
+    options.ttl = FLAGS_fifo_compaction_ttl;
+    options.compaction_options_fifo = CompactionOptionsFIFO(
+        FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
+        FLAGS_fifo_compaction_allow_compaction);
+    options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
+#endif  // ROCKSDB_LITE
+    options.prefix_extractor = prefix_extractor_;
+    if (FLAGS_use_uint64_comparator) {
+      options.comparator = test::Uint64Comparator();
+      if (FLAGS_key_size != 8) {
+        fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
+        exit(1);
+      }
+    }
+    if (FLAGS_use_stderr_info_logger) {
+      options.info_log.reset(new StderrLogger());
+    }
+    options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
+    options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
+    options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
+    if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
+      options.memtable_insert_with_hint_prefix_extractor.reset(
+          NewCappedPrefixTransform(
+              FLAGS_memtable_insert_with_hint_prefix_size));
+    }
+    options.bloom_locality = FLAGS_bloom_locality;
+    options.max_file_opening_threads = FLAGS_file_opening_threads;
+    options.compaction_readahead_size = FLAGS_compaction_readahead_size;
+    options.log_readahead_size = FLAGS_log_readahead_size;
+    options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
+    options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
+    options.use_fsync = FLAGS_use_fsync;
+    options.num_levels = FLAGS_num_levels;
+    options.target_file_size_base = FLAGS_target_file_size_base;
+    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
+    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+    options.level_compaction_dynamic_level_bytes =
+        FLAGS_level_compaction_dynamic_level_bytes;
+    options.max_bytes_for_level_multiplier =
+        FLAGS_max_bytes_for_level_multiplier;
+    Status s =
+        CreateMemTableRepFactory(config_options, &options.memtable_factory);
+    if (!s.ok()) {
+      fprintf(stderr, "Could not create memtable factory: %s\n",
+              s.ToString().c_str());
+      exit(1);
+    } else if ((FLAGS_prefix_size == 0) &&
+               (options.memtable_factory->IsInstanceOf("prefix_hash") ||
+                options.memtable_factory->IsInstanceOf("hash_linkedlist"))) {
+      fprintf(stderr,
+              "prefix_size should be non-zero if PrefixHash or "
+              "HashLinkedList memtablerep is used\n");
+      exit(1);
+    }
+    if (FLAGS_use_plain_table) {
+#ifndef ROCKSDB_LITE
+      if (!options.memtable_factory->IsInstanceOf("prefix_hash") &&
+          !options.memtable_factory->IsInstanceOf("hash_linkedlist")) {
+        fprintf(stderr, "Warning: plain table is used with %s\n",
+                options.memtable_factory->Name());
+      }
+
+      int bloom_bits_per_key = FLAGS_bloom_bits;
+      if (bloom_bits_per_key < 0) {
+        bloom_bits_per_key = PlainTableOptions().bloom_bits_per_key;
+      }
+
+      PlainTableOptions plain_table_options;
+      plain_table_options.user_key_len = FLAGS_key_size;
+      plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
+      plain_table_options.hash_table_ratio = 0.75;
+      options.table_factory = std::shared_ptr<TableFactory>(
+          NewPlainTableFactory(plain_table_options));
+#else
+      fprintf(stderr, "Plain table is not supported in lite mode\n");
+      exit(1);
+#endif  // ROCKSDB_LITE
+    } else if (FLAGS_use_cuckoo_table) {
+#ifndef ROCKSDB_LITE
+      if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
+        fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
+        exit(1);
+      }
+
+      if (!FLAGS_mmap_read) {
+        fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
+        exit(1);
+      }
+
+      ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
+      table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
+      table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
+      options.table_factory =
+          std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
+#else
+      fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
+      exit(1);
+#endif  // ROCKSDB_LITE
+    } else {
+      BlockBasedTableOptions block_based_options;
+      block_based_options.checksum =
+          static_cast<ChecksumType>(FLAGS_checksum_type);
+      if (FLAGS_use_hash_search) {
+        if (FLAGS_prefix_size == 0) {
+          fprintf(stderr,
+                  "prefix_size not assigned when enable use_hash_search \n");
+          exit(1);
+        }
+        block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
+      } else {
+        block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
+      }
+      if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
+        if (FLAGS_index_with_first_key) {
+          fprintf(stderr,
+                  "--index_with_first_key is not compatible with"
+                  " partition index.");
+        }
+        if (FLAGS_use_hash_search) {
+          fprintf(stderr,
+                  "use_hash_search is incompatible with "
+                  "partition index and is ignored");
+        }
+        block_based_options.index_type =
+            BlockBasedTableOptions::kTwoLevelIndexSearch;
+        block_based_options.metadata_block_size = FLAGS_metadata_block_size;
+        if (FLAGS_partition_index_and_filters) {
+          block_based_options.partition_filters = true;
+        }
+      } else if (FLAGS_index_with_first_key) {
+        block_based_options.index_type =
+            BlockBasedTableOptions::kBinarySearchWithFirstKey;
+      }
+      BlockBasedTableOptions::IndexShorteningMode index_shortening =
+          block_based_options.index_shortening;
+      switch (FLAGS_index_shortening_mode) {
+        case 0:
+          index_shortening =
+              BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+          break;
+        case 1:
+          index_shortening =
+              BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators;
+          break;
+        case 2:
+          index_shortening = BlockBasedTableOptions::IndexShorteningMode::
+              kShortenSeparatorsAndSuccessor;
+          break;
+        default:
+          fprintf(stderr, "Unknown key shortening mode\n");
+      }
+      block_based_options.optimize_filters_for_memory =
+          FLAGS_optimize_filters_for_memory;
+      block_based_options.index_shortening = index_shortening;
+      if (cache_ == nullptr) {
+        block_based_options.no_block_cache = true;
+      }
+      block_based_options.cache_index_and_filter_blocks =
+          FLAGS_cache_index_and_filter_blocks;
+      block_based_options.pin_l0_filter_and_index_blocks_in_cache =
+          FLAGS_pin_l0_filter_and_index_blocks_in_cache;
+      block_based_options.pin_top_level_index_and_filter =
+          FLAGS_pin_top_level_index_and_filter;
+      if (FLAGS_cache_high_pri_pool_ratio > 1e-6) {  // > 0.0 + eps
+        block_based_options.cache_index_and_filter_blocks_with_high_priority =
+            true;
+      }
+      if (FLAGS_cache_high_pri_pool_ratio + FLAGS_cache_low_pri_pool_ratio >
+          1.0) {
+        fprintf(stderr,
+                "Sum of high_pri_pool_ratio and low_pri_pool_ratio "
+                "cannot exceed 1.0.\n");
+      }
+      block_based_options.block_cache = cache_;
+      block_based_options.cache_usage_options.options_overrides.insert(
+          {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
+           {/*.charged = */ FLAGS_charge_compression_dictionary_building_buffer
+                ? CacheEntryRoleOptions::Decision::kEnabled
+                : CacheEntryRoleOptions::Decision::kDisabled}});
+      block_based_options.cache_usage_options.options_overrides.insert(
+          {CacheEntryRole::kFilterConstruction,
+           {/*.charged = */ FLAGS_charge_filter_construction
+                ? CacheEntryRoleOptions::Decision::kEnabled
+                : CacheEntryRoleOptions::Decision::kDisabled}});
+      block_based_options.cache_usage_options.options_overrides.insert(
+          {CacheEntryRole::kBlockBasedTableReader,
+           {/*.charged = */ FLAGS_charge_table_reader
+                ? CacheEntryRoleOptions::Decision::kEnabled
+                : CacheEntryRoleOptions::Decision::kDisabled}});
+      block_based_options.cache_usage_options.options_overrides.insert(
+          {CacheEntryRole::kFileMetadata,
+           {/*.charged = */ FLAGS_charge_file_metadata
+                ? CacheEntryRoleOptions::Decision::kEnabled
+                : CacheEntryRoleOptions::Decision::kDisabled}});
+      block_based_options.cache_usage_options.options_overrides.insert(
+          {CacheEntryRole::kBlobCache,
+           {/*.charged = */ FLAGS_charge_blob_cache
+                ? CacheEntryRoleOptions::Decision::kEnabled
+                : CacheEntryRoleOptions::Decision::kDisabled}});
+      block_based_options.block_cache_compressed = compressed_cache_;
+      block_based_options.block_size = FLAGS_block_size;
+      block_based_options.block_restart_interval = FLAGS_block_restart_interval;
+      block_based_options.index_block_restart_interval =
+          FLAGS_index_block_restart_interval;
+      block_based_options.format_version =
+          static_cast<uint32_t>(FLAGS_format_version);
+      block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
+      block_based_options.enable_index_compression =
+          FLAGS_enable_index_compression;
+      block_based_options.block_align = FLAGS_block_align;
+      block_based_options.whole_key_filtering = FLAGS_whole_key_filtering;
+      block_based_options.max_auto_readahead_size =
+          FLAGS_max_auto_readahead_size;
+      block_based_options.initial_auto_readahead_size =
+          FLAGS_initial_auto_readahead_size;
+      block_based_options.num_file_reads_for_auto_readahead =
+          FLAGS_num_file_reads_for_auto_readahead;
+      BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache =
+          block_based_options.prepopulate_block_cache;
+      switch (FLAGS_prepopulate_block_cache) {
+        case 0:
+          prepopulate_block_cache =
+              BlockBasedTableOptions::PrepopulateBlockCache::kDisable;
+          break;
+        case 1:
+          prepopulate_block_cache =
+              BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+          break;
+        default:
+          fprintf(stderr, "Unknown prepopulate block cache mode\n");
+      }
+      block_based_options.prepopulate_block_cache = prepopulate_block_cache;
+      if (FLAGS_use_data_block_hash_index) {
+        block_based_options.data_block_index_type =
+            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
+      } else {
+        block_based_options.data_block_index_type =
+            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
+      }
+      block_based_options.data_block_hash_table_util_ratio =
+          FLAGS_data_block_hash_table_util_ratio;
+      if (FLAGS_read_cache_path != "") {
+#ifndef ROCKSDB_LITE
+        Status rc_status;
+
+        // Read cache need to be provided with a the Logger, we will put all
+        // reac cache logs in the read cache path in a file named rc_LOG
+        rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
+        std::shared_ptr<Logger> read_cache_logger;
+        if (rc_status.ok()) {
+          rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
+                                           &read_cache_logger);
+        }
+
+        if (rc_status.ok()) {
+          PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
+                                       FLAGS_read_cache_size,
+                                       read_cache_logger);
+
+          rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
+          rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
+          rc_cfg.writer_qdepth = 4;
+          rc_cfg.writer_dispatch_size = 4 * 1024;
+
+          auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
+          block_based_options.persistent_cache = pcache;
+          rc_status = pcache->Open();
+        }
+
+        if (!rc_status.ok()) {
+          fprintf(stderr, "Error initializing read cache, %s\n",
+                  rc_status.ToString().c_str());
+          exit(1);
+        }
+#else
+        fprintf(stderr, "Read cache is not supported in LITE\n");
+        exit(1);
+
+#endif
+      }
+
+      if (FLAGS_use_blob_cache) {
+        if (FLAGS_use_shared_block_and_blob_cache) {
+          options.blob_cache = cache_;
+        } else {
+          if (FLAGS_blob_cache_size > 0) {
+            LRUCacheOptions co;
+            co.capacity = FLAGS_blob_cache_size;
+            co.num_shard_bits = FLAGS_blob_cache_numshardbits;
+            co.memory_allocator = GetCacheAllocator();
+
+            options.blob_cache = NewLRUCache(co);
+          } else {
+            fprintf(
+                stderr,
+                "Unable to create a standalone blob cache if blob_cache_size "
+                "<= 0.\n");
+            exit(1);
+          }
+        }
+        switch (FLAGS_prepopulate_blob_cache) {
+          case 0:
+            options.prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
+            break;
+          case 1:
+            options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+            break;
+          default:
+            fprintf(stderr, "Unknown prepopulate blob cache mode\n");
+            exit(1);
+        }
+
+        fprintf(stdout,
+                "Integrated BlobDB: blob cache enabled"
+                ", block and blob caches shared: %d",
+                FLAGS_use_shared_block_and_blob_cache);
+        if (!FLAGS_use_shared_block_and_blob_cache) {
+          fprintf(stdout,
+                  ", blob cache size %" PRIu64
+                  ", blob cache num shard bits: %d",
+                  FLAGS_blob_cache_size, FLAGS_blob_cache_numshardbits);
+        }
+        fprintf(stdout, ", blob cache prepopulated: %d\n",
+                FLAGS_prepopulate_blob_cache);
+      } else {
+        fprintf(stdout, "Integrated BlobDB: blob cache disabled\n");
+      }
+
+      options.table_factory.reset(
+          NewBlockBasedTableFactory(block_based_options));
+    }
+    if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
+      if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
+          static_cast<unsigned int>(FLAGS_num_levels)) {
+        fprintf(stderr, "Insufficient number of fanouts specified %d\n",
+                static_cast<int>(
+                    FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
+        exit(1);
+      }
+      options.max_bytes_for_level_multiplier_additional =
+          FLAGS_max_bytes_for_level_multiplier_additional_v;
+    }
+    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
+    options.level0_file_num_compaction_trigger =
+        FLAGS_level0_file_num_compaction_trigger;
+    options.level0_slowdown_writes_trigger =
+        FLAGS_level0_slowdown_writes_trigger;
+    options.compression = FLAGS_compression_type_e;
+    if (FLAGS_simulate_hybrid_fs_file != "") {
+      options.bottommost_temperature = Temperature::kWarm;
+    }
+    options.preclude_last_level_data_seconds =
+        FLAGS_preclude_last_level_data_seconds;
+    options.preserve_internal_time_seconds =
+        FLAGS_preserve_internal_time_seconds;
+    options.sample_for_compression = FLAGS_sample_for_compression;
+    options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+    options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+    options.max_total_wal_size = FLAGS_max_total_wal_size;
+
+    if (FLAGS_min_level_to_compress >= 0) {
+      assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
+      options.compression_per_level.resize(FLAGS_num_levels);
+      for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
+        options.compression_per_level[i] = kNoCompression;
+      }
+      for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) {
+        options.compression_per_level[i] = FLAGS_compression_type_e;
+      }
+    }
+    options.soft_pending_compaction_bytes_limit =
+        FLAGS_soft_pending_compaction_bytes_limit;
+    options.hard_pending_compaction_bytes_limit =
+        FLAGS_hard_pending_compaction_bytes_limit;
+    options.delayed_write_rate = FLAGS_delayed_write_rate;
+    options.allow_concurrent_memtable_write =
+        FLAGS_allow_concurrent_memtable_write;
+    options.experimental_mempurge_threshold =
+        FLAGS_experimental_mempurge_threshold;
+    options.inplace_update_support = FLAGS_inplace_update_support;
+    options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
+    options.enable_write_thread_adaptive_yield =
+        FLAGS_enable_write_thread_adaptive_yield;
+    options.enable_pipelined_write = FLAGS_enable_pipelined_write;
+    options.unordered_write = FLAGS_unordered_write;
+    options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
+    options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
+    options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
+    options.max_compaction_bytes = FLAGS_max_compaction_bytes;
+    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
+    options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
+    options.paranoid_checks = FLAGS_paranoid_checks;
+    options.force_consistency_checks = FLAGS_force_consistency_checks;
+    options.check_flush_compaction_key_order =
+        FLAGS_check_flush_compaction_key_order;
+    options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
+    options.ttl = FLAGS_ttl_seconds;
+    // fill storage options
+    options.advise_random_on_open = FLAGS_advise_random_on_open;
+    options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
+    options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
+    options.bytes_per_sync = FLAGS_bytes_per_sync;
+    options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;
+
+    // merge operator options
+    if (!FLAGS_merge_operator.empty()) {
+      s = MergeOperator::CreateFromString(config_options, FLAGS_merge_operator,
+                                          &options.merge_operator);
+      if (!s.ok()) {
+        fprintf(stderr, "invalid merge operator[%s]: %s\n",
+                FLAGS_merge_operator.c_str(), s.ToString().c_str());
+        exit(1);
+      }
+    }
+    options.max_successive_merges = FLAGS_max_successive_merges;
+    options.report_bg_io_stats = FLAGS_report_bg_io_stats;
+
+    // set universal style compaction configurations, if applicable
+    if (FLAGS_universal_size_ratio != 0) {
+      options.compaction_options_universal.size_ratio =
+          FLAGS_universal_size_ratio;
+    }
+    if (FLAGS_universal_min_merge_width != 0) {
+      options.compaction_options_universal.min_merge_width =
+          FLAGS_universal_min_merge_width;
+    }
+    if (FLAGS_universal_max_merge_width != 0) {
+      options.compaction_options_universal.max_merge_width =
+          FLAGS_universal_max_merge_width;
+    }
+    if (FLAGS_universal_max_size_amplification_percent != 0) {
+      options.compaction_options_universal.max_size_amplification_percent =
+          FLAGS_universal_max_size_amplification_percent;
+    }
+    if (FLAGS_universal_compression_size_percent != -1) {
+      options.compaction_options_universal.compression_size_percent =
+          FLAGS_universal_compression_size_percent;
+    }
+    options.compaction_options_universal.allow_trivial_move =
+        FLAGS_universal_allow_trivial_move;
+    options.compaction_options_universal.incremental =
+        FLAGS_universal_incremental;
+    if (FLAGS_thread_status_per_interval > 0) {
+      options.enable_thread_tracking = true;
+    }
+
+    if (FLAGS_user_timestamp_size > 0) {
+      if (FLAGS_user_timestamp_size != 8) {
+        fprintf(stderr, "Only 64 bits timestamps are supported.\n");
+        exit(1);
+      }
+      options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+    }
+
+    options.allow_data_in_errors = FLAGS_allow_data_in_errors;
+    options.track_and_verify_wals_in_manifest =
+        FLAGS_track_and_verify_wals_in_manifest;
+
+    // Integrated BlobDB
+    options.enable_blob_files = FLAGS_enable_blob_files;
+    options.min_blob_size = FLAGS_min_blob_size;
+    options.blob_file_size = FLAGS_blob_file_size;
+    options.blob_compression_type =
+        StringToCompressionType(FLAGS_blob_compression_type.c_str());
+    options.enable_blob_garbage_collection =
+        FLAGS_enable_blob_garbage_collection;
+    options.blob_garbage_collection_age_cutoff =
+        FLAGS_blob_garbage_collection_age_cutoff;
+    options.blob_garbage_collection_force_threshold =
+        FLAGS_blob_garbage_collection_force_threshold;
+    options.blob_compaction_readahead_size =
+        FLAGS_blob_compaction_readahead_size;
+    options.blob_file_starting_level = FLAGS_blob_file_starting_level;
+
+#ifndef ROCKSDB_LITE
+    if (FLAGS_readonly && FLAGS_transaction_db) {
+      fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
+      exit(1);
+    }
+    if (FLAGS_use_secondary_db &&
+        (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
+      fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
+      exit(1);
+    }
+#endif  // ROCKSDB_LITE
+    options.memtable_protection_bytes_per_key =
+        FLAGS_memtable_protection_bytes_per_key;
+  }
+
+  void InitializeOptionsGeneral(Options* opts) {
+    // Be careful about what is set here to avoid accidentally overwriting
+    // settings already configured by OPTIONS file. Only configure settings that
+    // are needed for the benchmark to run, settings for shared objects that
+    // were not configured already, settings that require dynamically invoking
+    // APIs, and settings for the benchmark itself.
+    Options& options = *opts;
+
+    // Always set these since they are harmless when not needed and prevent
+    // a guaranteed failure when they are needed.
+    options.create_missing_column_families = true;
+    options.create_if_missing = true;
+
+    if (options.statistics == nullptr) {
+      options.statistics = dbstats;
+    }
+
+    auto table_options =
+        options.table_factory->GetOptions<BlockBasedTableOptions>();
+    if (table_options != nullptr) {
+      if (FLAGS_cache_size > 0) {
+        // This violates this function's rules on when to set options. But we
+        // have to do it because the case of unconfigured block cache in OPTIONS
+        // file is indistinguishable (it is sanitized to 8MB by this point, not
+        // nullptr), and our regression tests assume this will be the shared
+        // block cache, even with OPTIONS file provided.
+        table_options->block_cache = cache_;
+      }
+      if (table_options->filter_policy == nullptr) {
+        if (FLAGS_bloom_bits < 0) {
+          table_options->filter_policy = BlockBasedTableOptions().filter_policy;
+        } else if (FLAGS_bloom_bits == 0) {
+          table_options->filter_policy.reset();
+        } else {
+          table_options->filter_policy.reset(
+              FLAGS_use_ribbon_filter ? NewRibbonFilterPolicy(FLAGS_bloom_bits)
+                                      : NewBloomFilterPolicy(FLAGS_bloom_bits));
+        }
+      }
+    }
+
+    if (options.row_cache == nullptr) {
+      if (FLAGS_row_cache_size) {
+        if (FLAGS_cache_numshardbits >= 1) {
+          options.row_cache =
+              NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
+        } else {
+          options.row_cache = NewLRUCache(FLAGS_row_cache_size);
+        }
+      }
+    }
+
+    if (options.env == Env::Default()) {
+      options.env = FLAGS_env;
+    }
+    if (FLAGS_enable_io_prio) {
+      options.env->LowerThreadPoolIOPriority(Env::LOW);
+      options.env->LowerThreadPoolIOPriority(Env::HIGH);
+    }
+    if (FLAGS_enable_cpu_prio) {
+      options.env->LowerThreadPoolCPUPriority(Env::LOW);
+      options.env->LowerThreadPoolCPUPriority(Env::HIGH);
+    }
+
+    if (FLAGS_sine_write_rate) {
+      FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
+    }
+
+    if (options.rate_limiter == nullptr) {
+      if (FLAGS_rate_limiter_bytes_per_sec > 0) {
+        options.rate_limiter.reset(NewGenericRateLimiter(
+            FLAGS_rate_limiter_bytes_per_sec,
+            FLAGS_rate_limiter_refill_period_us, 10 /* fairness */,
+            // TODO: replace this with a more general FLAG for deciding
+            // RateLimiter::Mode as now we also rate-limit foreground reads e.g,
+            // Get()/MultiGet()
+            FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
+                                      : RateLimiter::Mode::kWritesOnly,
+            FLAGS_rate_limiter_auto_tuned));
+      }
+    }
+
+    options.listeners.emplace_back(listener_);
+
+    if (options.file_checksum_gen_factory == nullptr) {
+      if (FLAGS_file_checksum) {
+        options.file_checksum_gen_factory.reset(
+            new FileChecksumGenCrc32cFactory());
+      }
+    }
+
+    if (FLAGS_num_multi_db <= 1) {
+      OpenDb(options, FLAGS_db, &db_);
+    } else {
+      multi_dbs_.clear();
+      multi_dbs_.resize(FLAGS_num_multi_db);
+      auto wal_dir = options.wal_dir;
+      for (int i = 0; i < FLAGS_num_multi_db; i++) {
+        if (!wal_dir.empty()) {
+          options.wal_dir = GetPathForMultiple(wal_dir, i);
+        }
+        OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
+      }
+      options.wal_dir = wal_dir;
+    }
+
+    // KeepFilter is a noop filter, this can be used to test compaction filter
+    if (options.compaction_filter == nullptr) {
+      if (FLAGS_use_keep_filter) {
+        options.compaction_filter = new KeepFilter();
+        fprintf(stdout, "A noop compaction filter is used\n");
+      }
+    }
+
+    if (FLAGS_use_existing_keys) {
+      // Only work on single database
+      assert(db_.db != nullptr);
+      ReadOptions read_opts;  // before read_options_ initialized
+      read_opts.total_order_seek = true;
+      Iterator* iter = db_.db->NewIterator(read_opts);
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        keys_.emplace_back(iter->key().ToString());
+      }
+      delete iter;
+      FLAGS_num = keys_.size();
+    }
+  }
+
+  void Open(Options* opts) {
+    if (!InitializeOptionsFromFile(opts)) {
+      InitializeOptionsFromFlags(opts);
+    }
+
+    InitializeOptionsGeneral(opts);
+  }
+
+  void OpenDb(Options options, const std::string& db_name,
+              DBWithColumnFamilies* db) {
+    uint64_t open_start = FLAGS_report_open_timing ? FLAGS_env->NowNanos() : 0;
+    Status s;
+    // Open with column families if necessary.
+    if (FLAGS_num_column_families > 1) {
+      size_t num_hot = FLAGS_num_column_families;
+      if (FLAGS_num_hot_column_families > 0 &&
+          FLAGS_num_hot_column_families < FLAGS_num_column_families) {
+        num_hot = FLAGS_num_hot_column_families;
+      } else {
+        FLAGS_num_hot_column_families = FLAGS_num_column_families;
+      }
+      std::vector<ColumnFamilyDescriptor> column_families;
+      for (size_t i = 0; i < num_hot; i++) {
+        column_families.push_back(ColumnFamilyDescriptor(
+            ColumnFamilyName(i), ColumnFamilyOptions(options)));
+      }
+      std::vector<int> cfh_idx_to_prob;
+      if (!FLAGS_column_family_distribution.empty()) {
+        std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
+        std::string cf_prob;
+        int sum = 0;
+        while (std::getline(cf_prob_stream, cf_prob, ',')) {
+          cfh_idx_to_prob.push_back(std::stoi(cf_prob));
+          sum += cfh_idx_to_prob.back();
+        }
+        if (sum != 100) {
+          fprintf(stderr, "column_family_distribution items must sum to 100\n");
+          exit(1);
+        }
+        if (cfh_idx_to_prob.size() != num_hot) {
+          fprintf(stderr,
+                  "got %" ROCKSDB_PRIszt
+                  " column_family_distribution items; expected "
+                  "%" ROCKSDB_PRIszt "\n",
+                  cfh_idx_to_prob.size(), num_hot);
+          exit(1);
+        }
+      }
+#ifndef ROCKSDB_LITE
+      if (FLAGS_readonly) {
+        s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh,
+                                &db->db);
+      } else if (FLAGS_optimistic_transaction_db) {
+        s = OptimisticTransactionDB::Open(options, db_name, column_families,
+                                          &db->cfh, &db->opt_txn_db);
+        if (s.ok()) {
+          db->db = db->opt_txn_db->GetBaseDB();
+        }
+      } else if (FLAGS_transaction_db) {
+        TransactionDB* ptr;
+        TransactionDBOptions txn_db_options;
+        if (options.unordered_write) {
+          options.two_write_queues = true;
+          txn_db_options.skip_concurrency_control = true;
+          txn_db_options.write_policy = WRITE_PREPARED;
+        }
+        s = TransactionDB::Open(options, txn_db_options, db_name,
+                                column_families, &db->cfh, &ptr);
+        if (s.ok()) {
+          db->db = ptr;
+        }
+      } else {
+        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
+      }
+#else
+      s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
+#endif  // ROCKSDB_LITE
+      db->cfh.resize(FLAGS_num_column_families);
+      db->num_created = num_hot;
+      db->num_hot = num_hot;
+      db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
+#ifndef ROCKSDB_LITE
+    } else if (FLAGS_readonly) {
+      s = DB::OpenForReadOnly(options, db_name, &db->db);
+    } else if (FLAGS_optimistic_transaction_db) {
+      s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
+      if (s.ok()) {
+        db->db = db->opt_txn_db->GetBaseDB();
+      }
+    } else if (FLAGS_transaction_db) {
+      TransactionDB* ptr = nullptr;
+      TransactionDBOptions txn_db_options;
+      if (options.unordered_write) {
+        options.two_write_queues = true;
+        txn_db_options.skip_concurrency_control = true;
+        txn_db_options.write_policy = WRITE_PREPARED;
+      }
+      s = CreateLoggerFromOptions(db_name, options, &options.info_log);
+      if (s.ok()) {
+        s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
+      }
+      if (s.ok()) {
+        db->db = ptr;
+      }
+    } else if (FLAGS_use_blob_db) {
+      // Stacked BlobDB
+      blob_db::BlobDBOptions blob_db_options;
+      blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
+      blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
+      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
+      blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
+      blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
+      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
+      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
+      blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
+      blob_db_options.compression = FLAGS_blob_db_compression_type_e;
+      blob_db::BlobDB* ptr = nullptr;
+      s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
+      if (s.ok()) {
+        db->db = ptr;
+      }
+    } else if (FLAGS_use_secondary_db) {
+      if (FLAGS_secondary_path.empty()) {
+        std::string default_secondary_path;
+        FLAGS_env->GetTestDirectory(&default_secondary_path);
+        default_secondary_path += "/dbbench_secondary";
+        FLAGS_secondary_path = default_secondary_path;
+      }
+      s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
+      if (s.ok() && FLAGS_secondary_update_interval > 0) {
+        secondary_update_thread_.reset(new port::Thread(
+            [this](int interval, DBWithColumnFamilies* _db) {
+              while (0 == secondary_update_stopped_.load(
+                              std::memory_order_relaxed)) {
+                Status secondary_update_status =
+                    _db->db->TryCatchUpWithPrimary();
+                if (!secondary_update_status.ok()) {
+                  fprintf(stderr, "Failed to catch up with primary: %s\n",
+                          secondary_update_status.ToString().c_str());
+                  break;
+                }
+                ++secondary_db_updates_;
+                FLAGS_env->SleepForMicroseconds(interval * 1000000);
+              }
+            },
+            FLAGS_secondary_update_interval, db));
+      }
+#endif  // ROCKSDB_LITE
+    } else {
+      s = DB::Open(options, db_name, &db->db);
+    }
+    if (FLAGS_report_open_timing) {
+      std::cout << "OpenDb:     "
+                << (FLAGS_env->NowNanos() - open_start) / 1000000.0
+                << " milliseconds\n";
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM };
+
+  void WriteSeqDeterministic(ThreadState* thread) {
+    DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL);
+  }
+
+  void WriteUniqueRandomDeterministic(ThreadState* thread) {
+    DoDeterministicCompact(thread, open_options_.compaction_style,
+                           UNIQUE_RANDOM);
+  }
+
+  void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); }
+
+  void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); }
+
+  void WriteUniqueRandom(ThreadState* thread) {
+    DoWrite(thread, UNIQUE_RANDOM);
+  }
+
+  class KeyGenerator {
+   public:
+    KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
+                 uint64_t /*num_per_set*/ = 64 * 1024)
+        : rand_(rand), mode_(mode), num_(num), next_(0) {
+      if (mode_ == UNIQUE_RANDOM) {
+        // NOTE: if memory consumption of this approach becomes a concern,
+        // we can either break it into pieces and only random shuffle a section
+        // each time. Alternatively, use a bit map implementation
+        // (https://reviews.facebook.net/differential/diff/54627/)
+        values_.resize(num_);
+        for (uint64_t i = 0; i < num_; ++i) {
+          values_[i] = i;
+        }
+        RandomShuffle(values_.begin(), values_.end(),
+                      static_cast<uint32_t>(seed_base));
+      }
+    }
+
+    uint64_t Next() {
+      switch (mode_) {
+        case SEQUENTIAL:
+          return next_++;
+        case RANDOM:
+          return rand_->Next() % num_;
+        case UNIQUE_RANDOM:
+          assert(next_ < num_);
+          return values_[next_++];
+      }
+      assert(false);
+      return std::numeric_limits<uint64_t>::max();
+    }
+
+    // Only available for UNIQUE_RANDOM mode.
+    uint64_t Fetch(uint64_t index) {
+      assert(mode_ == UNIQUE_RANDOM);
+      assert(index < values_.size());
+      return values_[index];
+    }
+
+   private:
+    Random64* rand_;
+    WriteMode mode_;
+    const uint64_t num_;
+    uint64_t next_;
+    std::vector<uint64_t> values_;
+  };
+
+  DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; }
+
+  DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
+    return SelectDBWithCfh(thread->rand.Next());
+  }
+
+  DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
+    if (db_.db != nullptr) {
+      return &db_;
+    } else {
+      return &multi_dbs_[rand_int % multi_dbs_.size()];
+    }
+  }
+
+  double SineRate(double x) {
+    return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) + FLAGS_sine_d;
+  }
+
+  void DoWrite(ThreadState* thread, WriteMode write_mode) {
+    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
+    const int64_t num_ops = writes_ == 0 ? num_ : writes_;
+
+    size_t num_key_gens = 1;
+    if (db_.db == nullptr) {
+      num_key_gens = multi_dbs_.size();
+    }
+    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
+    int64_t max_ops = num_ops * num_key_gens;
+    int64_t ops_per_stage = max_ops;
+    if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
+      ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
+                                       FLAGS_num_hot_column_families) +
+                      1;
+    }
+
+    Duration duration(test_duration, max_ops, ops_per_stage);
+    const uint64_t num_per_key_gen = num_ + max_num_range_tombstones_;
+    for (size_t i = 0; i < num_key_gens; i++) {
+      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
+                                         num_per_key_gen, ops_per_stage));
+    }
+
+    if (num_ != FLAGS_num) {
+      char msg[100];
+      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
+      thread->stats.AddMessage(msg);
+    }
+
+    RandomGenerator gen;
+    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
+                     FLAGS_write_batch_protection_bytes_per_key,
+                     user_timestamp_size_);
+    Status s;
+    int64_t bytes = 0;
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::unique_ptr<const char[]> begin_key_guard;
+    Slice begin_key = AllocateKey(&begin_key_guard);
+    std::unique_ptr<const char[]> end_key_guard;
+    Slice end_key = AllocateKey(&end_key_guard);
+    double p = 0.0;
+    uint64_t num_overwrites = 0, num_unique_keys = 0, num_selective_deletes = 0;
+    // If user set overwrite_probability flag,
+    // check if value is in [0.0,1.0].
+    if (FLAGS_overwrite_probability > 0.0) {
+      p = FLAGS_overwrite_probability > 1.0 ? 1.0 : FLAGS_overwrite_probability;
+      // If overwrite set by user, and UNIQUE_RANDOM mode on,
+      // the overwrite_window_size must be > 0.
+      if (write_mode == UNIQUE_RANDOM && FLAGS_overwrite_window_size == 0) {
+        fprintf(stderr,
+                "Overwrite_window_size must be  strictly greater than 0.\n");
+        ErrorExit();
+      }
+    }
+
+    // Default_random_engine provides slightly
+    // improved throughput over mt19937.
+    std::default_random_engine overwrite_gen{
+        static_cast<unsigned int>(seed_base)};
+    std::bernoulli_distribution overwrite_decider(p);
+
+    // Inserted key window is filled with the last N
+    // keys previously inserted into the DB (with
+    // N=FLAGS_overwrite_window_size).
+    // We use a deque struct because:
+    // - random access is O(1)
+    // - insertion/removal at beginning/end is also O(1).
+    std::deque<int64_t> inserted_key_window;
+    Random64 reservoir_id_gen(seed_base);
+
+    // --- Variables used in disposable/persistent keys simulation:
+    // The following variables are used when
+    // disposable_entries_batch_size is >0. We simualte a workload
+    // where the following sequence is repeated multiple times:
+    // "A set of keys S1 is inserted ('disposable entries'), then after
+    // some delay another set of keys S2 is inserted ('persistent entries')
+    // and the first set of keys S1 is deleted. S2 artificially represents
+    // the insertion of hypothetical results from some undefined computation
+    // done on the first set of keys S1. The next sequence can start as soon
+    // as the last disposable entry in the set S1 of this sequence is
+    // inserted, if the delay is non negligible"
+    bool skip_for_loop = false, is_disposable_entry = true;
+    std::vector<uint64_t> disposable_entries_index(num_key_gens, 0);
+    std::vector<uint64_t> persistent_ent_and_del_index(num_key_gens, 0);
+    const uint64_t kNumDispAndPersEntries =
+        FLAGS_disposable_entries_batch_size +
+        FLAGS_persistent_entries_batch_size;
+    if (kNumDispAndPersEntries > 0) {
+      if ((write_mode != UNIQUE_RANDOM) || (writes_per_range_tombstone_ > 0) ||
+          (p > 0.0)) {
+        fprintf(
+            stderr,
+            "Disposable/persistent deletes are not compatible with overwrites "
+            "and DeleteRanges; and are only supported in filluniquerandom.\n");
+        ErrorExit();
+      }
+      if (FLAGS_disposable_entries_value_size < 0 ||
+          FLAGS_persistent_entries_value_size < 0) {
+        fprintf(
+            stderr,
+            "disposable_entries_value_size and persistent_entries_value_size"
+            "have to be positive.\n");
+        ErrorExit();
+      }
+    }
+    Random rnd_disposable_entry(static_cast<uint32_t>(seed_base));
+    std::string random_value;
+    // Queue that stores scheduled timestamp of disposable entries deletes,
+    // along with starting index of disposable entry keys to delete.
+    std::vector<std::queue<std::pair<uint64_t, uint64_t>>> disposable_entries_q(
+        num_key_gens);
+    // --- End of variables used in disposable/persistent keys simulation.
+
+    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
+    std::vector<Slice> expanded_keys;
+    if (FLAGS_expand_range_tombstones) {
+      expanded_key_guards.resize(range_tombstone_width_);
+      for (auto& expanded_key_guard : expanded_key_guards) {
+        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
+      }
+    }
+
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+
+    int64_t stage = 0;
+    int64_t num_written = 0;
+    int64_t next_seq_db_at = num_ops;
+    size_t id = 0;
+    int64_t num_range_deletions = 0;
+
+    while ((num_per_key_gen != 0) && !duration.Done(entries_per_batch_)) {
+      if (duration.GetStage() != stage) {
+        stage = duration.GetStage();
+        if (db_.db != nullptr) {
+          db_.CreateNewCf(open_options_, stage);
+        } else {
+          for (auto& db : multi_dbs_) {
+            db.CreateNewCf(open_options_, stage);
+          }
+        }
+      }
+
+      if (write_mode != SEQUENTIAL) {
+        id = thread->rand.Next() % num_key_gens;
+      } else {
+        // When doing a sequential load with multiple databases, load them in
+        // order rather than all at the same time to avoid:
+        // 1) long delays between flushing memtables
+        // 2) flushing memtables for all of them at the same point in time
+        // 3) not putting the same number of keys in each database
+        if (num_written >= next_seq_db_at) {
+          next_seq_db_at += num_ops;
+          id++;
+          if (id >= num_key_gens) {
+            fprintf(stderr, "Logic error. Filled all databases\n");
+            ErrorExit();
+          }
+        }
+      }
+      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
+
+      batch.Clear();
+      int64_t batch_bytes = 0;
+
+      for (int64_t j = 0; j < entries_per_batch_; j++) {
+        int64_t rand_num = 0;
+        if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
+          if ((inserted_key_window.size() > 0) &&
+              overwrite_decider(overwrite_gen)) {
+            num_overwrites++;
+            rand_num = inserted_key_window[reservoir_id_gen.Next() %
+                                           inserted_key_window.size()];
+          } else {
+            num_unique_keys++;
+            rand_num = key_gens[id]->Next();
+            if (inserted_key_window.size() < FLAGS_overwrite_window_size) {
+              inserted_key_window.push_back(rand_num);
+            } else {
+              inserted_key_window.pop_front();
+              inserted_key_window.push_back(rand_num);
+            }
+          }
+        } else if (kNumDispAndPersEntries > 0) {
+          // Check if queue is non-empty and if we need to insert
+          // 'persistent' KV entries (KV entries that are never deleted)
+          // and delete disposable entries previously inserted.
+          if (!disposable_entries_q[id].empty() &&
+              (disposable_entries_q[id].front().first <
+               FLAGS_env->NowMicros())) {
+            // If we need to perform a "merge op" pattern,
+            // we first write all the persistent KV entries not targeted
+            // by deletes, and then we write the disposable entries deletes.
+            if (persistent_ent_and_del_index[id] <
+                FLAGS_persistent_entries_batch_size) {
+              // Generate key to insert.
+              rand_num =
+                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
+                                      FLAGS_disposable_entries_batch_size +
+                                      persistent_ent_and_del_index[id]);
+              persistent_ent_and_del_index[id]++;
+              is_disposable_entry = false;
+              skip_for_loop = false;
+            } else if (persistent_ent_and_del_index[id] <
+                       kNumDispAndPersEntries) {
+              // Find key of the entry to delete.
+              rand_num =
+                  key_gens[id]->Fetch(disposable_entries_q[id].front().second +
+                                      (persistent_ent_and_del_index[id] -
+                                       FLAGS_persistent_entries_batch_size));
+              persistent_ent_and_del_index[id]++;
+              GenerateKeyFromInt(rand_num, FLAGS_num, &key);
+              // For the delete operation, everything happens here and we
+              // skip the rest of the for-loop, which is designed for
+              // inserts.
+              if (FLAGS_num_column_families <= 1) {
+                batch.Delete(key);
+              } else {
+                // We use same rand_num as seed for key and column family so
+                // that we can deterministically find the cfh corresponding to a
+                // particular key while reading the key.
+                batch.Delete(db_with_cfh->GetCfh(rand_num), key);
+              }
+              // A delete only includes Key+Timestamp (no value).
+              batch_bytes += key_size_ + user_timestamp_size_;
+              bytes += key_size_ + user_timestamp_size_;
+              num_selective_deletes++;
+              // Skip rest of the for-loop (j=0, j<entries_per_batch_,j++).
+              skip_for_loop = true;
+            } else {
+              assert(false);  // should never reach this point.
+            }
+            // If disposable_entries_q needs to be updated (ie: when a selective
+            // insert+delete was successfully completed, pop the job out of the
+            // queue).
+            if (!disposable_entries_q[id].empty() &&
+                (disposable_entries_q[id].front().first <
+                 FLAGS_env->NowMicros()) &&
+                persistent_ent_and_del_index[id] == kNumDispAndPersEntries) {
+              disposable_entries_q[id].pop();
+              persistent_ent_and_del_index[id] = 0;
+            }
+
+            // If we are deleting disposable entries, skip the rest of the
+            // for-loop since there is no key-value inserts at this moment in
+            // time.
+            if (skip_for_loop) {
+              continue;
+            }
+
+          }
+          // If no job is in the queue, then we keep inserting disposable KV
+          // entries that will be deleted later by a series of deletes.
+          else {
+            rand_num = key_gens[id]->Fetch(disposable_entries_index[id]);
+            disposable_entries_index[id]++;
+            is_disposable_entry = true;
+            if ((disposable_entries_index[id] %
+                 FLAGS_disposable_entries_batch_size) == 0) {
+              // Skip the persistent KV entries inserts for now
+              disposable_entries_index[id] +=
+                  FLAGS_persistent_entries_batch_size;
+            }
+          }
+        } else {
+          rand_num = key_gens[id]->Next();
+        }
+        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
+        Slice val;
+        if (kNumDispAndPersEntries > 0) {
+          random_value = rnd_disposable_entry.RandomString(
+              is_disposable_entry ? FLAGS_disposable_entries_value_size
+                                  : FLAGS_persistent_entries_value_size);
+          val = Slice(random_value);
+          num_unique_keys++;
+        } else {
+          val = gen.Generate();
+        }
+        if (use_blob_db_) {
+#ifndef ROCKSDB_LITE
+          // Stacked BlobDB
+          blob_db::BlobDB* blobdb =
+              static_cast<blob_db::BlobDB*>(db_with_cfh->db);
+          if (FLAGS_blob_db_max_ttl_range > 0) {
+            int ttl = rand() % FLAGS_blob_db_max_ttl_range;
+            s = blobdb->PutWithTTL(write_options_, key, val, ttl);
+          } else {
+            s = blobdb->Put(write_options_, key, val);
+          }
+#endif  //  ROCKSDB_LITE
+        } else if (FLAGS_num_column_families <= 1) {
+          batch.Put(key, val);
+        } else {
+          // We use same rand_num as seed for key and column family so that we
+          // can deterministically find the cfh corresponding to a particular
+          // key while reading the key.
+          batch.Put(db_with_cfh->GetCfh(rand_num), key, val);
+        }
+        batch_bytes += val.size() + key_size_ + user_timestamp_size_;
+        bytes += val.size() + key_size_ + user_timestamp_size_;
+        ++num_written;
+
+        // If all disposable entries have been inserted, then we need to
+        // add in the job queue a call for 'persistent entry insertions +
+        // disposable entry deletions'.
+        if (kNumDispAndPersEntries > 0 && is_disposable_entry &&
+            ((disposable_entries_index[id] % kNumDispAndPersEntries) == 0)) {
+          // Queue contains [timestamp, starting_idx],
+          // timestamp = current_time + delay (minimum aboslute time when to
+          // start inserting the selective deletes) starting_idx = index in the
+          // keygen of the rand_num to generate the key of the first KV entry to
+          // delete (= key of the first selective delete).
+          disposable_entries_q[id].push(std::make_pair(
+              FLAGS_env->NowMicros() +
+                  FLAGS_disposable_entries_delete_delay /* timestamp */,
+              disposable_entries_index[id] - kNumDispAndPersEntries
+              /*starting idx*/));
+        }
+        if (writes_per_range_tombstone_ > 0 &&
+            num_written > writes_before_delete_range_ &&
+            (num_written - writes_before_delete_range_) /
+                    writes_per_range_tombstone_ <=
+                max_num_range_tombstones_ &&
+            (num_written - writes_before_delete_range_) %
+                    writes_per_range_tombstone_ ==
+                0) {
+          num_range_deletions++;
+          int64_t begin_num = key_gens[id]->Next();
+          if (FLAGS_expand_range_tombstones) {
+            for (int64_t offset = 0; offset < range_tombstone_width_;
+                 ++offset) {
+              GenerateKeyFromInt(begin_num + offset, FLAGS_num,
+                                 &expanded_keys[offset]);
+              if (use_blob_db_) {
+#ifndef ROCKSDB_LITE
+                // Stacked BlobDB
+                s = db_with_cfh->db->Delete(write_options_,
+                                            expanded_keys[offset]);
+#endif  //  ROCKSDB_LITE
+              } else if (FLAGS_num_column_families <= 1) {
+                batch.Delete(expanded_keys[offset]);
+              } else {
+                batch.Delete(db_with_cfh->GetCfh(rand_num),
+                             expanded_keys[offset]);
+              }
+            }
+          } else {
+            GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
+            GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
+                               &end_key);
+            if (use_blob_db_) {
+#ifndef ROCKSDB_LITE
+              // Stacked BlobDB
+              s = db_with_cfh->db->DeleteRange(
+                  write_options_, db_with_cfh->db->DefaultColumnFamily(),
+                  begin_key, end_key);
+#endif  //  ROCKSDB_LITE
+            } else if (FLAGS_num_column_families <= 1) {
+              batch.DeleteRange(begin_key, end_key);
+            } else {
+              batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
+                                end_key);
+            }
+          }
+        }
+      }
+      if (thread->shared->write_rate_limiter.get() != nullptr) {
+        thread->shared->write_rate_limiter->Request(
+            batch_bytes, Env::IO_HIGH, nullptr /* stats */,
+            RateLimiter::OpType::kWrite);
+        // Set time at which last op finished to Now() to hide latency and
+        // sleep from rate limiter. Also, do the check once per batch, not
+        // once per write.
+        thread->stats.ResetLastOpTime();
+      }
+      if (user_timestamp_size_ > 0) {
+        Slice user_ts = mock_app_clock_->Allocate(ts_guard.get());
+        s = batch.UpdateTimestamps(
+            user_ts, [this](uint32_t) { return user_timestamp_size_; });
+        if (!s.ok()) {
+          fprintf(stderr, "assign timestamp to write batch: %s\n",
+                  s.ToString().c_str());
+          ErrorExit();
+        }
+      }
+      if (!use_blob_db_) {
+        // Not stacked BlobDB
+        s = db_with_cfh->db->Write(write_options_, &batch);
+      }
+      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
+                                entries_per_batch_, kWrite);
+      if (FLAGS_sine_write_rate) {
+        uint64_t now = FLAGS_env->NowMicros();
+
+        uint64_t usecs_since_last;
+        if (now > thread->stats.GetSineInterval()) {
+          usecs_since_last = now - thread->stats.GetSineInterval();
+        } else {
+          usecs_since_last = 0;
+        }
+
+        if (usecs_since_last >
+            (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
+          double usecs_since_start =
+              static_cast<double>(now - thread->stats.GetStart());
+          thread->stats.ResetSineInterval();
+          uint64_t write_rate =
+              static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
+          thread->shared->write_rate_limiter.reset(
+              NewGenericRateLimiter(write_rate));
+        }
+      }
+      if (!s.ok()) {
+        s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
+      }
+
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        ErrorExit();
+      }
+    }
+    if ((write_mode == UNIQUE_RANDOM) && (p > 0.0)) {
+      fprintf(stdout,
+              "Number of unique keys inserted: %" PRIu64
+              ".\nNumber of overwrites: %" PRIu64 "\n",
+              num_unique_keys, num_overwrites);
+    } else if (kNumDispAndPersEntries > 0) {
+      fprintf(stdout,
+              "Number of unique keys inserted (disposable+persistent): %" PRIu64
+              ".\nNumber of 'disposable entry delete': %" PRIu64 "\n",
+              num_written, num_selective_deletes);
+    }
+    if (num_range_deletions > 0) {
+      std::cout << "Number of range deletions: " << num_range_deletions
+                << std::endl;
+    }
+    thread->stats.AddBytes(bytes);
+  }
+
+  Status DoDeterministicCompact(ThreadState* thread,
+                                CompactionStyle compaction_style,
+                                WriteMode write_mode) {
+#ifndef ROCKSDB_LITE
+    ColumnFamilyMetaData meta;
+    std::vector<DB*> db_list;
+    if (db_.db != nullptr) {
+      db_list.push_back(db_.db);
+    } else {
+      for (auto& db : multi_dbs_) {
+        db_list.push_back(db.db);
+      }
+    }
+    std::vector<Options> options_list;
+    for (auto db : db_list) {
+      options_list.push_back(db->GetOptions());
+      if (compaction_style != kCompactionStyleFIFO) {
+        db->SetOptions({{"disable_auto_compactions", "1"},
+                        {"level0_slowdown_writes_trigger", "400000000"},
+                        {"level0_stop_writes_trigger", "400000000"}});
+      } else {
+        db->SetOptions({{"disable_auto_compactions", "1"}});
+      }
+    }
+
+    assert(!db_list.empty());
+    auto num_db = db_list.size();
+    size_t num_levels = static_cast<size_t>(open_options_.num_levels);
+    size_t output_level = open_options_.num_levels - 1;
+    std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
+    std::vector<size_t> num_files_at_level0(num_db, 0);
+    if (compaction_style == kCompactionStyleLevel) {
+      if (num_levels == 0) {
+        return Status::InvalidArgument("num_levels should be larger than 1");
+      }
+      bool should_stop = false;
+      while (!should_stop) {
+        if (sorted_runs[0].empty()) {
+          DoWrite(thread, write_mode);
+        } else {
+          DoWrite(thread, UNIQUE_RANDOM);
+        }
+        for (size_t i = 0; i < num_db; i++) {
+          auto db = db_list[i];
+          db->Flush(FlushOptions());
+          db->GetColumnFamilyMetaData(&meta);
+          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
+              writes_ == 0) {
+            should_stop = true;
+            continue;
+          }
+          sorted_runs[i].emplace_back(
+              meta.levels[0].files.begin(),
+              meta.levels[0].files.end() - num_files_at_level0[i]);
+          num_files_at_level0[i] = meta.levels[0].files.size();
+          if (sorted_runs[i].back().size() == 1) {
+            should_stop = true;
+            continue;
+          }
+          if (sorted_runs[i].size() == output_level) {
+            auto& L1 = sorted_runs[i].back();
+            L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
+            should_stop = true;
+            continue;
+          }
+        }
+        writes_ /=
+            static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
+      }
+      for (size_t i = 0; i < num_db; i++) {
+        if (sorted_runs[i].size() < num_levels - 1) {
+          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
+                  num_levels);
+          exit(1);
+        }
+      }
+      for (size_t i = 0; i < num_db; i++) {
+        auto db = db_list[i];
+        auto compactionOptions = CompactionOptions();
+        compactionOptions.compression = FLAGS_compression_type_e;
+        auto options = db->GetOptions();
+        MutableCFOptions mutable_cf_options(options);
+        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
+          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
+              mutable_cf_options, static_cast<int>(output_level),
+              compaction_style);
+          std::cout << sorted_runs[i][j].size() << std::endl;
+          db->CompactFiles(
+              compactionOptions,
+              {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
+              static_cast<int>(output_level - j) /*level*/);
+        }
+      }
+    } else if (compaction_style == kCompactionStyleUniversal) {
+      auto ratio = open_options_.compaction_options_universal.size_ratio;
+      bool should_stop = false;
+      while (!should_stop) {
+        if (sorted_runs[0].empty()) {
+          DoWrite(thread, write_mode);
+        } else {
+          DoWrite(thread, UNIQUE_RANDOM);
+        }
+        for (size_t i = 0; i < num_db; i++) {
+          auto db = db_list[i];
+          db->Flush(FlushOptions());
+          db->GetColumnFamilyMetaData(&meta);
+          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
+              writes_ == 0) {
+            should_stop = true;
+            continue;
+          }
+          sorted_runs[i].emplace_back(
+              meta.levels[0].files.begin(),
+              meta.levels[0].files.end() - num_files_at_level0[i]);
+          num_files_at_level0[i] = meta.levels[0].files.size();
+          if (sorted_runs[i].back().size() == 1) {
+            should_stop = true;
+            continue;
+          }
+          num_files_at_level0[i] = meta.levels[0].files.size();
+        }
+        writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) /
+                                       (ratio + 200));
+      }
+      for (size_t i = 0; i < num_db; i++) {
+        if (sorted_runs[i].size() < num_levels) {
+          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
+                  num_levels);
+          exit(1);
+        }
+      }
+      for (size_t i = 0; i < num_db; i++) {
+        auto db = db_list[i];
+        auto compactionOptions = CompactionOptions();
+        compactionOptions.compression = FLAGS_compression_type_e;
+        auto options = db->GetOptions();
+        MutableCFOptions mutable_cf_options(options);
+        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
+          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
+              mutable_cf_options, static_cast<int>(output_level),
+              compaction_style);
+          db->CompactFiles(
+              compactionOptions,
+              {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
+              (output_level > j ? static_cast<int>(output_level - j)
+                                : 0) /*level*/);
+        }
+      }
+    } else if (compaction_style == kCompactionStyleFIFO) {
+      if (num_levels != 1) {
+        return Status::InvalidArgument(
+            "num_levels should be 1 for FIFO compaction");
+      }
+      if (FLAGS_num_multi_db != 0) {
+        return Status::InvalidArgument("Doesn't support multiDB");
+      }
+      auto db = db_list[0];
+      std::vector<std::string> file_names;
+      while (true) {
+        if (sorted_runs[0].empty()) {
+          DoWrite(thread, write_mode);
+        } else {
+          DoWrite(thread, UNIQUE_RANDOM);
+        }
+        db->Flush(FlushOptions());
+        db->GetColumnFamilyMetaData(&meta);
+        auto total_size = meta.levels[0].size;
+        if (total_size >=
+            db->GetOptions().compaction_options_fifo.max_table_files_size) {
+          for (auto file_meta : meta.levels[0].files) {
+            file_names.emplace_back(file_meta.name);
+          }
+          break;
+        }
+      }
+      // TODO(shuzhang1989): Investigate why CompactFiles not working
+      // auto compactionOptions = CompactionOptions();
+      // db->CompactFiles(compactionOptions, file_names, 0);
+      auto compactionOptions = CompactRangeOptions();
+      db->CompactRange(compactionOptions, nullptr, nullptr);
+    } else {
+      fprintf(stdout,
+              "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n",
+              "filldeterministic");
+      return Status::InvalidArgument("None compaction is not supported");
+    }
+
+// Verify seqno and key range
+// Note: the seqno get changed at the max level by implementation
+// optimization, so skip the check of the max level.
+#ifndef NDEBUG
+    for (size_t k = 0; k < num_db; k++) {
+      auto db = db_list[k];
+      db->GetColumnFamilyMetaData(&meta);
+      // verify the number of sorted runs
+      if (compaction_style == kCompactionStyleLevel) {
+        assert(num_levels - 1 == sorted_runs[k].size());
+      } else if (compaction_style == kCompactionStyleUniversal) {
+        assert(meta.levels[0].files.size() + num_levels - 1 ==
+               sorted_runs[k].size());
+      } else if (compaction_style == kCompactionStyleFIFO) {
+        // TODO(gzh): FIFO compaction
+        db->GetColumnFamilyMetaData(&meta);
+        auto total_size = meta.levels[0].size;
+        assert(total_size <=
+               db->GetOptions().compaction_options_fifo.max_table_files_size);
+        break;
+      }
+
+      // verify smallest/largest seqno and key range of each sorted run
+      auto max_level = num_levels - 1;
+      int level;
+      for (size_t i = 0; i < sorted_runs[k].size(); i++) {
+        level = static_cast<int>(max_level - i);
+        SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
+        SequenceNumber sorted_run_largest_seqno = 0;
+        std::string sorted_run_smallest_key, sorted_run_largest_key;
+        bool first_key = true;
+        for (auto fileMeta : sorted_runs[k][i]) {
+          sorted_run_smallest_seqno =
+              std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
+          sorted_run_largest_seqno =
+              std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
+          if (first_key ||
+              db->DefaultColumnFamily()->GetComparator()->Compare(
+                  fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
+            sorted_run_smallest_key = fileMeta.smallestkey;
+          }
+          if (first_key ||
+              db->DefaultColumnFamily()->GetComparator()->Compare(
+                  fileMeta.largestkey, sorted_run_largest_key) > 0) {
+            sorted_run_largest_key = fileMeta.largestkey;
+          }
+          first_key = false;
+        }
+        if (compaction_style == kCompactionStyleLevel ||
+            (compaction_style == kCompactionStyleUniversal && level > 0)) {
+          SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
+          SequenceNumber level_largest_seqno = 0;
+          for (auto fileMeta : meta.levels[level].files) {
+            level_smallest_seqno =
+                std::min(level_smallest_seqno, fileMeta.smallest_seqno);
+            level_largest_seqno =
+                std::max(level_largest_seqno, fileMeta.largest_seqno);
+          }
+          assert(sorted_run_smallest_key ==
+                 meta.levels[level].files.front().smallestkey);
+          assert(sorted_run_largest_key ==
+                 meta.levels[level].files.back().largestkey);
+          if (level != static_cast<int>(max_level)) {
+            // compaction at max_level would change sequence number
+            assert(sorted_run_smallest_seqno == level_smallest_seqno);
+            assert(sorted_run_largest_seqno == level_largest_seqno);
+          }
+        } else if (compaction_style == kCompactionStyleUniversal) {
+          // level <= 0 means sorted runs on level 0
+          auto level0_file =
+              meta.levels[0].files[sorted_runs[k].size() - 1 - i];
+          assert(sorted_run_smallest_key == level0_file.smallestkey);
+          assert(sorted_run_largest_key == level0_file.largestkey);
+          if (level != static_cast<int>(max_level)) {
+            assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
+            assert(sorted_run_largest_seqno == level0_file.largest_seqno);
+          }
+        }
+      }
+    }
+#endif
+    // print the size of each sorted_run
+    for (size_t k = 0; k < num_db; k++) {
+      auto db = db_list[k];
+      fprintf(stdout,
+              "---------------------- DB %" ROCKSDB_PRIszt
+              " LSM ---------------------\n",
+              k);
+      db->GetColumnFamilyMetaData(&meta);
+      for (auto& levelMeta : meta.levels) {
+        if (levelMeta.files.empty()) {
+          continue;
+        }
+        if (levelMeta.level == 0) {
+          for (auto& fileMeta : levelMeta.files) {
+            fprintf(stdout, "Level[%d]: %s(size: %" PRIi64 " bytes)\n",
+                    levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
+          }
+        } else {
+          fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
+                  levelMeta.level, levelMeta.files.front().name.c_str(),
+                  levelMeta.files.back().name.c_str(), levelMeta.size);
+        }
+      }
+    }
+    for (size_t i = 0; i < num_db; i++) {
+      db_list[i]->SetOptions(
+          {{"disable_auto_compactions",
+            std::to_string(options_list[i].disable_auto_compactions)},
+           {"level0_slowdown_writes_trigger",
+            std::to_string(options_list[i].level0_slowdown_writes_trigger)},
+           {"level0_stop_writes_trigger",
+            std::to_string(options_list[i].level0_stop_writes_trigger)}});
+    }
+    return Status::OK();
+#else
+    (void)thread;
+    (void)compaction_style;
+    (void)write_mode;
+    fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
+    return Status::NotSupported(
+        "Rocksdb Lite doesn't support filldeterministic");
+#endif  // ROCKSDB_LITE
+  }
+
+  void ReadSequential(ThreadState* thread) {
+    if (db_.db != nullptr) {
+      ReadSequential(thread, db_.db);
+    } else {
+      for (const auto& db_with_cfh : multi_dbs_) {
+        ReadSequential(thread, db_with_cfh.db);
+      }
+    }
+  }
+
+  void ReadSequential(ThreadState* thread, DB* db) {
+    ReadOptions options = read_options_;
+    std::unique_ptr<char[]> ts_guard;
+    Slice ts;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
+      options.timestamp = &ts;
+    }
+
+    options.adaptive_readahead = FLAGS_adaptive_readahead;
+    options.async_io = FLAGS_async_io;
+
+    Iterator* iter = db->NewIterator(options);
+    int64_t i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedOps(nullptr, db, 1, kRead);
+      ++i;
+
+      if (thread->shared->read_rate_limiter.get() != nullptr &&
+          i % 1024 == 1023) {
+        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
+                                                   nullptr /* stats */,
+                                                   RateLimiter::OpType::kRead);
+      }
+    }
+
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadToRowCache(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t found = 0;
+    int64_t bytes = 0;
+    int64_t key_rand = 0;
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    PinnableSlice pinnable_val;
+
+    while (key_rand < FLAGS_num) {
+      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
+      // We use same key_rand as seed for key and column family so that we can
+      // deterministically find the cfh corresponding to a particular key, as it
+      // is done in DoWrite method.
+      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
+      key_rand++;
+      read++;
+      Status s;
+      if (FLAGS_num_column_families > 1) {
+        s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
+                                 key, &pinnable_val);
+      } else {
+        pinnable_val.Reset();
+        s = db_with_cfh->db->Get(read_options_,
+                                 db_with_cfh->db->DefaultColumnFamily(), key,
+                                 &pinnable_val);
+      }
+
+      if (s.ok()) {
+        found++;
+        bytes += key.size() + pinnable_val.size();
+      } else if (!s.IsNotFound()) {
+        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
+        abort();
+      }
+
+      if (thread->shared->read_rate_limiter.get() != nullptr &&
+          read % 256 == 255) {
+        thread->shared->read_rate_limiter->Request(
+            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+      }
+
+      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
+             read);
+
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(msg);
+  }
+
+  void ReadReverse(ThreadState* thread) {
+    if (db_.db != nullptr) {
+      ReadReverse(thread, db_.db);
+    } else {
+      for (const auto& db_with_cfh : multi_dbs_) {
+        ReadReverse(thread, db_with_cfh.db);
+      }
+    }
+  }
+
+  void ReadReverse(ThreadState* thread, DB* db) {
+    Iterator* iter = db->NewIterator(read_options_);
+    int64_t i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedOps(nullptr, db, 1, kRead);
+      ++i;
+      if (thread->shared->read_rate_limiter.get() != nullptr &&
+          i % 1024 == 1023) {
+        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
+                                                   nullptr /* stats */,
+                                                   RateLimiter::OpType::kRead);
+      }
+    }
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadRandomFast(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t found = 0;
+    int64_t nonexist = 0;
+    ReadOptions options = read_options_;
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::string value;
+    Slice ts;
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+    DB* db = SelectDBWithCfh(thread)->db;
+
+    int64_t pot = 1;
+    while (pot < FLAGS_num) {
+      pot <<= 1;
+    }
+
+    Duration duration(FLAGS_duration, reads_);
+    do {
+      for (int i = 0; i < 100; ++i) {
+        int64_t key_rand = thread->rand.Next() & (pot - 1);
+        GenerateKeyFromInt(key_rand, FLAGS_num, &key);
+        ++read;
+        std::string ts_ret;
+        std::string* ts_ptr = nullptr;
+        if (user_timestamp_size_ > 0) {
+          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
+                                                    ts_guard.get());
+          options.timestamp = &ts;
+          ts_ptr = &ts_ret;
+        }
+        auto status = db->Get(options, key, &value, ts_ptr);
+        if (status.ok()) {
+          ++found;
+        } else if (!status.IsNotFound()) {
+          fprintf(stderr, "Get returned an error: %s\n",
+                  status.ToString().c_str());
+          abort();
+        }
+        if (key_rand >= FLAGS_num) {
+          ++nonexist;
+        }
+      }
+      if (thread->shared->read_rate_limiter.get() != nullptr) {
+        thread->shared->read_rate_limiter->Request(
+            100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+      }
+
+      thread->stats.FinishedOps(nullptr, db, 100, kRead);
+    } while (!duration.Done(100));
+
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "(%" PRIu64 " of %" PRIu64
+             " found, "
+             "issued %" PRIu64 " non-exist keys)\n",
+             found, read, nonexist);
+
+    thread->stats.AddMessage(msg);
+  }
+
+  int64_t GetRandomKey(Random64* rand) {
+    uint64_t rand_int = rand->Next();
+    int64_t key_rand;
+    if (read_random_exp_range_ == 0) {
+      key_rand = rand_int % FLAGS_num;
+    } else {
+      const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
+      long double order = -static_cast<long double>(rand_int % kBigInt) /
+                          static_cast<long double>(kBigInt) *
+                          read_random_exp_range_;
+      long double exp_ran = std::exp(order);
+      uint64_t rand_num =
+          static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
+      // Map to a different number to avoid locality.
+      const uint64_t kBigPrime = 0x5bd1e995;
+      // Overflow is like %(2^64). Will have little impact of results.
+      key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
+    }
+    return key_rand;
+  }
+
+  void ReadRandom(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t found = 0;
+    int64_t bytes = 0;
+    int num_keys = 0;
+    int64_t key_rand = 0;
+    ReadOptions options = read_options_;
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    PinnableSlice pinnable_val;
+    std::vector<PinnableSlice> pinnable_vals;
+    if (read_operands_) {
+      // Start off with a small-ish value that'll be increased later if
+      // `GetMergeOperands()` tells us it is not large enough.
+      pinnable_vals.resize(8);
+    }
+    std::unique_ptr<char[]> ts_guard;
+    Slice ts;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+
+    Duration duration(FLAGS_duration, reads_);
+    while (!duration.Done(1)) {
+      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
+      // We use same key_rand as seed for key and column family so that we can
+      // deterministically find the cfh corresponding to a particular key, as it
+      // is done in DoWrite method.
+      if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
+        if (++num_keys == entries_per_batch_) {
+          num_keys = 0;
+          key_rand = GetRandomKey(&thread->rand);
+          if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
+              FLAGS_num) {
+            key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
+          }
+        } else {
+          key_rand += FLAGS_multiread_stride;
+        }
+      } else {
+        key_rand = GetRandomKey(&thread->rand);
+      }
+      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
+      read++;
+      std::string ts_ret;
+      std::string* ts_ptr = nullptr;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
+        options.timestamp = &ts;
+        ts_ptr = &ts_ret;
+      }
+      Status s;
+      pinnable_val.Reset();
+      for (size_t i = 0; i < pinnable_vals.size(); ++i) {
+        pinnable_vals[i].Reset();
+      }
+      ColumnFamilyHandle* cfh;
+      if (FLAGS_num_column_families > 1) {
+        cfh = db_with_cfh->GetCfh(key_rand);
+      } else {
+        cfh = db_with_cfh->db->DefaultColumnFamily();
+      }
+      if (read_operands_) {
+        GetMergeOperandsOptions get_merge_operands_options;
+        get_merge_operands_options.expected_max_number_of_operands =
+            static_cast<int>(pinnable_vals.size());
+        int number_of_operands;
+        s = db_with_cfh->db->GetMergeOperands(
+            options, cfh, key, pinnable_vals.data(),
+            &get_merge_operands_options, &number_of_operands);
+        if (s.IsIncomplete()) {
+          // Should only happen a few times when we encounter a key that had
+          // more merge operands than any key seen so far. Production use case
+          // would typically retry in such event to get all the operands so do
+          // that here.
+          pinnable_vals.resize(number_of_operands);
+          get_merge_operands_options.expected_max_number_of_operands =
+              static_cast<int>(pinnable_vals.size());
+          s = db_with_cfh->db->GetMergeOperands(
+              options, cfh, key, pinnable_vals.data(),
+              &get_merge_operands_options, &number_of_operands);
+        }
+      } else {
+        s = db_with_cfh->db->Get(options, cfh, key, &pinnable_val, ts_ptr);
+      }
+
+      if (s.ok()) {
+        found++;
+        bytes += key.size() + pinnable_val.size() + user_timestamp_size_;
+        for (size_t i = 0; i < pinnable_vals.size(); ++i) {
+          bytes += pinnable_vals[i].size();
+          pinnable_vals[i].Reset();
+        }
+      } else if (!s.IsNotFound()) {
+        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
+        abort();
+      }
+
+      if (thread->shared->read_rate_limiter.get() != nullptr &&
+          read % 256 == 255) {
+        thread->shared->read_rate_limiter->Request(
+            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+      }
+
+      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
+             read);
+
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Calls MultiGet over a list of keys from a random distribution.
+  // Returns the total number of keys found.
+  void MultiReadRandom(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t bytes = 0;
+    int64_t num_multireads = 0;
+    int64_t found = 0;
+    ReadOptions options = read_options_;
+    std::vector<Slice> keys;
+    std::vector<std::unique_ptr<const char[]>> key_guards;
+    std::vector<std::string> values(entries_per_batch_);
+    PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
+    std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
+    std::vector<Status> stat_list(entries_per_batch_);
+    while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
+      key_guards.push_back(std::unique_ptr<const char[]>());
+      keys.push_back(AllocateKey(&key_guards.back()));
+    }
+
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+
+    Duration duration(FLAGS_duration, reads_);
+    while (!duration.Done(entries_per_batch_)) {
+      DB* db = SelectDB(thread);
+      if (FLAGS_multiread_stride) {
+        int64_t key = GetRandomKey(&thread->rand);
+        if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
+            static_cast<int64_t>(FLAGS_num)) {
+          key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
+        }
+        for (int64_t i = 0; i < entries_per_batch_; ++i) {
+          GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
+          key += FLAGS_multiread_stride;
+        }
+      } else {
+        for (int64_t i = 0; i < entries_per_batch_; ++i) {
+          GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
+        }
+      }
+      Slice ts;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
+        options.timestamp = &ts;
+      }
+      if (!FLAGS_multiread_batched) {
+        std::vector<Status> statuses = db->MultiGet(options, keys, &values);
+        assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);
+
+        read += entries_per_batch_;
+        num_multireads++;
+        for (int64_t i = 0; i < entries_per_batch_; ++i) {
+          if (statuses[i].ok()) {
+            bytes += keys[i].size() + values[i].size() + user_timestamp_size_;
+            ++found;
+          } else if (!statuses[i].IsNotFound()) {
+            fprintf(stderr, "MultiGet returned an error: %s\n",
+                    statuses[i].ToString().c_str());
+            abort();
+          }
+        }
+      } else {
+        db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
+                     keys.data(), pin_values, stat_list.data());
+
+        read += entries_per_batch_;
+        num_multireads++;
+        for (int64_t i = 0; i < entries_per_batch_; ++i) {
+          if (stat_list[i].ok()) {
+            bytes +=
+                keys[i].size() + pin_values[i].size() + user_timestamp_size_;
+            ++found;
+          } else if (!stat_list[i].IsNotFound()) {
+            fprintf(stderr, "MultiGet returned an error: %s\n",
+                    stat_list[i].ToString().c_str());
+            abort();
+          }
+          stat_list[i] = Status::OK();
+          pin_values[i].Reset();
+        }
+      }
+      if (thread->shared->read_rate_limiter.get() != nullptr &&
+          num_multireads % 256 == 255) {
+        thread->shared->read_rate_limiter->Request(
+            256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
+            RateLimiter::OpType::kRead);
+      }
+      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
+             read);
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Calls ApproximateSize over random key ranges.
+  void ApproximateSizeRandom(ThreadState* thread) {
+    int64_t size_sum = 0;
+    int64_t num_sizes = 0;
+    const size_t batch_size = entries_per_batch_;
+    std::vector<Range> ranges;
+    std::vector<Slice> lkeys;
+    std::vector<std::unique_ptr<const char[]>> lkey_guards;
+    std::vector<Slice> rkeys;
+    std::vector<std::unique_ptr<const char[]>> rkey_guards;
+    std::vector<uint64_t> sizes;
+    while (ranges.size() < batch_size) {
+      // Ugly without C++17 return from emplace_back
+      lkey_guards.emplace_back();
+      rkey_guards.emplace_back();
+      lkeys.emplace_back(AllocateKey(&lkey_guards.back()));
+      rkeys.emplace_back(AllocateKey(&rkey_guards.back()));
+      ranges.emplace_back(lkeys.back(), rkeys.back());
+      sizes.push_back(0);
+    }
+    Duration duration(FLAGS_duration, reads_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      for (size_t i = 0; i < batch_size; ++i) {
+        int64_t lkey = GetRandomKey(&thread->rand);
+        int64_t rkey = GetRandomKey(&thread->rand);
+        if (lkey > rkey) {
+          std::swap(lkey, rkey);
+        }
+        GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]);
+        GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]);
+      }
+      db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
+                              &sizes[0]);
+      num_sizes += entries_per_batch_;
+      for (int64_t size : sizes) {
+        size_sum += size;
+      }
+      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
+             static_cast<double>(size_sum) / static_cast<double>(num_sizes));
+    thread->stats.AddMessage(msg);
+  }
+
+  // The inverse function of Pareto distribution
+  int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
+    double ret;
+    if (k == 0.0) {
+      ret = theta - sigma * std::log(u);
+    } else {
+      ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
+    }
+    return static_cast<int64_t>(ceil(ret));
+  }
+  // The inverse function of power distribution (y=ax^b)
+  int64_t PowerCdfInversion(double u, double a, double b) {
+    double ret;
+    ret = std::pow((u / a), (1 / b));
+    return static_cast<int64_t>(ceil(ret));
+  }
+
+  // Add the noice to the QPS
+  double AddNoise(double origin, double noise_ratio) {
+    if (noise_ratio < 0.0 || noise_ratio > 1.0) {
+      return origin;
+    }
+    int band_int = static_cast<int>(FLAGS_sine_a);
+    double delta = (rand() % band_int - band_int / 2) * noise_ratio;
+    if (origin + delta < 0) {
+      return origin;
+    } else {
+      return (origin + delta);
+    }
+  }
+
+  // Decide the ratio of different query types
+  // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge
+  class QueryDecider {
+   public:
+    std::vector<int> type_;
+    std::vector<double> ratio_;
+    int range_;
+
+    QueryDecider() {}
+    ~QueryDecider() {}
+
+    Status Initiate(std::vector<double> ratio_input) {
+      int range_max = 1000;
+      double sum = 0.0;
+      for (auto& ratio : ratio_input) {
+        sum += ratio;
+      }
+      range_ = 0;
+      for (auto& ratio : ratio_input) {
+        range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
+        type_.push_back(range_);
+        ratio_.push_back(ratio / sum);
+      }
+      return Status::OK();
+    }
+
+    int GetType(int64_t rand_num) {
+      if (rand_num < 0) {
+        rand_num = rand_num * (-1);
+      }
+      assert(range_ != 0);
+      int pos = static_cast<int>(rand_num % range_);
+      for (int i = 0; i < static_cast<int>(type_.size()); i++) {
+        if (pos < type_[i]) {
+          return i;
+        }
+      }
+      return 0;
+    }
+  };
+
+  // KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
+  // to transfer a random value to one keyrange based on the hotness.
+  struct KeyrangeUnit {
+    int64_t keyrange_start;
+    int64_t keyrange_access;
+    int64_t keyrange_keys;
+  };
+
+  // From our observations, the prefix hotness (key-range hotness) follows
+  // the two-term-exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
+  // However, we cannot directly use the inverse function to decide a
+  // key-range from a random distribution. To achieve it, we create a list of
+  // KeyrangeUnit, each KeyrangeUnit occupies a range of integers whose size is
+  // decided based on the hotness of the key-range. When a random value is
+  // generated based on uniform distribution, we map it to the KeyrangeUnit Vec
+  // and one KeyrangeUnit is selected. The probability of a  KeyrangeUnit being
+  // selected is the same as the hotness of this KeyrangeUnit. After that, the
+  // key can be randomly allocated to the key-range of this KeyrangeUnit, or we
+  // can based on the power distribution (y=ax^b) to generate the offset of
+  // the key in the selected key-range. In this way, we generate the keyID
+  // based on the hotness of the prefix and also the key hotness distribution.
+  class GenerateTwoTermExpKeys {
+   public:
+    // Avoid uninitialized warning-as-error in some compilers
+    int64_t keyrange_rand_max_ = 0;
+    int64_t keyrange_size_ = 0;
+    int64_t keyrange_num_ = 0;
+    std::vector<KeyrangeUnit> keyrange_set_;
+
+    // Initiate the KeyrangeUnit vector and calculate the size of each
+    // KeyrangeUnit.
+    Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
+                                   double prefix_b, double prefix_c,
+                                   double prefix_d) {
+      int64_t amplify = 0;
+      int64_t keyrange_start = 0;
+      if (FLAGS_keyrange_num <= 0) {
+        keyrange_num_ = 1;
+      } else {
+        keyrange_num_ = FLAGS_keyrange_num;
+      }
+      keyrange_size_ = total_keys / keyrange_num_;
+
+      // Calculate the key-range shares size based on the input parameters
+      for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
+        // Step 1. Calculate the probability that this key range will be
+        // accessed in a query. It is based on the two-term expoential
+        // distribution
+        double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
+                            prefix_c * std::exp(prefix_d * pfx);
+        if (keyrange_p < std::pow(10.0, -16.0)) {
+          keyrange_p = 0.0;
+        }
+        // Step 2. Calculate the amplify
+        // In order to allocate a query to a key-range based on the random
+        // number generated for this query, we need to extend the probability
+        // of each key range from [0,1] to [0, amplify]. Amplify is calculated
+        // by 1/(smallest key-range probability). In this way, we ensure that
+        // all key-ranges are assigned with an Integer that  >=0
+        if (amplify == 0 && keyrange_p > 0) {
+          amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
+        }
+
+        // Step 3. For each key-range, we calculate its position in the
+        // [0, amplify] range, including the start, the size (keyrange_access)
+        KeyrangeUnit p_unit;
+        p_unit.keyrange_start = keyrange_start;
+        if (0.0 >= keyrange_p) {
+          p_unit.keyrange_access = 0;
+        } else {
+          p_unit.keyrange_access =
+              static_cast<int64_t>(std::floor(amplify * keyrange_p));
+        }
+        p_unit.keyrange_keys = keyrange_size_;
+        keyrange_set_.push_back(p_unit);
+        keyrange_start += p_unit.keyrange_access;
+      }
+      keyrange_rand_max_ = keyrange_start;
+
+      // Step 4. Shuffle the key-ranges randomly
+      // Since the access probability is calculated from small to large,
+      // If we do not re-allocate them, hot key-ranges are always at the end
+      // and cold key-ranges are at the begin of the key space. Therefore, the
+      // key-ranges are shuffled and the rand seed is only decide by the
+      // key-range hotness distribution. With the same distribution parameters
+      // the shuffle results are the same.
+      Random64 rand_loca(keyrange_rand_max_);
+      for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
+        int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
+        assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
+               pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
+        std::swap(keyrange_set_[i], keyrange_set_[pos]);
+      }
+
+      // Step 5. Recalculate the prefix start postion after shuffling
+      int64_t offset = 0;
+      for (auto& p_unit : keyrange_set_) {
+        p_unit.keyrange_start = offset;
+        offset += p_unit.keyrange_access;
+      }
+
+      return Status::OK();
+    }
+
+    // Generate the Key ID according to the input ini_rand and key distribution
+    int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
+                         double key_dist_b) {
+      int64_t keyrange_rand = ini_rand % keyrange_rand_max_;
+
+      // Calculate and select one key-range that contains the new key
+      int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
+      while (start + 1 < end) {
+        int64_t mid = start + (end - start) / 2;
+        assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
+        if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
+          end = mid;
+        } else {
+          start = mid;
+        }
+      }
+      int64_t keyrange_id = start;
+
+      // Select one key in the key-range and compose the keyID
+      int64_t key_offset = 0, key_seed;
+      if (key_dist_a == 0.0 || key_dist_b == 0.0) {
+        key_offset = ini_rand % keyrange_size_;
+      } else {
+        double u =
+            static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
+        key_seed = static_cast<int64_t>(
+            ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
+        Random64 rand_key(key_seed);
+        key_offset = rand_key.Next() % keyrange_size_;
+      }
+      return keyrange_size_ * keyrange_id + key_offset;
+    }
+  };
+
+  // The social graph workload mixed with Get, Put, Iterator queries.
+  // The value size and iterator length follow Pareto distribution.
+  // The overall key access follow power distribution. If user models the
+  // workload based on different key-ranges (or different prefixes), user
+  // can use two-term-exponential distribution to fit the workload. User
+  // needs to decide the ratio between Get, Put, Iterator queries before
+  // starting the benchmark.
+  void MixGraph(ThreadState* thread) {
+    int64_t gets = 0;
+    int64_t puts = 0;
+    int64_t get_found = 0;
+    int64_t seek = 0;
+    int64_t seek_found = 0;
+    int64_t bytes = 0;
+    double total_scan_length = 0;
+    double total_val_size = 0;
+    const int64_t default_value_max = 1 * 1024 * 1024;
+    int64_t value_max = default_value_max;
+    int64_t scan_len_max = FLAGS_mix_max_scan_len;
+    double write_rate = 1000000.0;
+    double read_rate = 1000000.0;
+    bool use_prefix_modeling = false;
+    bool use_random_modeling = false;
+    GenerateTwoTermExpKeys gen_exp;
+    std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
+                              FLAGS_mix_seek_ratio};
+    char value_buffer[default_value_max];
+    QueryDecider query;
+    RandomGenerator gen;
+    Status s;
+    if (value_max > FLAGS_mix_max_value_size) {
+      value_max = FLAGS_mix_max_value_size;
+    }
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    PinnableSlice pinnable_val;
+    query.Initiate(ratio);
+
+    // the limit of qps initiation
+    if (FLAGS_sine_mix_rate) {
+      thread->shared->read_rate_limiter.reset(
+          NewGenericRateLimiter(static_cast<int64_t>(read_rate)));
+      thread->shared->write_rate_limiter.reset(
+          NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
+    }
+
+    // Decide if user wants to use prefix based key generation
+    if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
+        FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
+      use_prefix_modeling = true;
+      gen_exp.InitiateExpDistribution(
+          FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
+          FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
+    }
+    if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
+      use_random_modeling = true;
+    }
+
+    Duration duration(FLAGS_duration, reads_);
+    while (!duration.Done(1)) {
+      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
+      int64_t ini_rand, rand_v, key_rand, key_seed;
+      ini_rand = GetRandomKey(&thread->rand);
+      rand_v = ini_rand % FLAGS_num;
+      double u = static_cast<double>(rand_v) / FLAGS_num;
+
+      // Generate the keyID based on the key hotness and prefix hotness
+      if (use_random_modeling) {
+        key_rand = ini_rand;
+      } else if (use_prefix_modeling) {
+        key_rand =
+            gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
+      } else {
+        key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
+        Random64 rand(key_seed);
+        key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
+      }
+      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
+      int query_type = query.GetType(rand_v);
+
+      // change the qps
+      uint64_t now = FLAGS_env->NowMicros();
+      uint64_t usecs_since_last;
+      if (now > thread->stats.GetSineInterval()) {
+        usecs_since_last = now - thread->stats.GetSineInterval();
+      } else {
+        usecs_since_last = 0;
+      }
+
+      if (FLAGS_sine_mix_rate &&
+          usecs_since_last >
+              (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
+        double usecs_since_start =
+            static_cast<double>(now - thread->stats.GetStart());
+        thread->stats.ResetSineInterval();
+        double mix_rate_with_noise = AddNoise(
+            SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
+        read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
+        write_rate = mix_rate_with_noise * query.ratio_[1];
+
+        if (read_rate > 0) {
+          thread->shared->read_rate_limiter->SetBytesPerSecond(
+              static_cast<int64_t>(read_rate));
+        }
+        if (write_rate > 0) {
+          thread->shared->write_rate_limiter->SetBytesPerSecond(
+              static_cast<int64_t>(write_rate));
+        }
+      }
+      // Start the query
+      if (query_type == 0) {
+        // the Get query
+        gets++;
+        if (FLAGS_num_column_families > 1) {
+          s = db_with_cfh->db->Get(read_options_, db_with_cfh->GetCfh(key_rand),
+                                   key, &pinnable_val);
+        } else {
+          pinnable_val.Reset();
+          s = db_with_cfh->db->Get(read_options_,
+                                   db_with_cfh->db->DefaultColumnFamily(), key,
+                                   &pinnable_val);
+        }
+
+        if (s.ok()) {
+          get_found++;
+          bytes += key.size() + pinnable_val.size();
+        } else if (!s.IsNotFound()) {
+          fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
+          abort();
+        }
+
+        if (thread->shared->read_rate_limiter && (gets + seek) % 100 == 0) {
+          thread->shared->read_rate_limiter->Request(100, Env::IO_HIGH,
+                                                     nullptr /*stats*/);
+        }
+        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
+      } else if (query_type == 1) {
+        // the Put query
+        puts++;
+        int64_t val_size = ParetoCdfInversion(u, FLAGS_value_theta,
+                                              FLAGS_value_k, FLAGS_value_sigma);
+        if (val_size < 10) {
+          val_size = 10;
+        } else if (val_size > value_max) {
+          val_size = val_size % value_max;
+        }
+        total_val_size += val_size;
+
+        s = db_with_cfh->db->Put(
+            write_options_, key,
+            gen.Generate(static_cast<unsigned int>(val_size)));
+        if (!s.ok()) {
+          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+          ErrorExit();
+        }
+
+        if (thread->shared->write_rate_limiter && puts % 100 == 0) {
+          thread->shared->write_rate_limiter->Request(100, Env::IO_HIGH,
+                                                      nullptr /*stats*/);
+        }
+        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
+      } else if (query_type == 2) {
+        // Seek query
+        if (db_with_cfh->db != nullptr) {
+          Iterator* single_iter = nullptr;
+          single_iter = db_with_cfh->db->NewIterator(read_options_);
+          if (single_iter != nullptr) {
+            single_iter->Seek(key);
+            seek++;
+            if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
+              seek_found++;
+            }
+            int64_t scan_length =
+                ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
+                                   FLAGS_iter_sigma) %
+                scan_len_max;
+            for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
+              Slice value = single_iter->value();
+              memcpy(value_buffer, value.data(),
+                     std::min(value.size(), sizeof(value_buffer)));
+              bytes += single_iter->key().size() + single_iter->value().size();
+              single_iter->Next();
+              assert(single_iter->status().ok());
+              total_scan_length++;
+            }
+          }
+          delete single_iter;
+        }
+        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
+      }
+    }
+    char msg[256];
+    snprintf(msg, sizeof(msg),
+             "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64
+             ", reads %" PRIu64 " in %" PRIu64
+             " found, "
+             "avg size: %.1f value, %.1f scan)\n",
+             gets, puts, seek, get_found + seek_found, gets + seek,
+             total_val_size / puts, total_scan_length / seek);
+
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(msg);
+  }
+
+  void IteratorCreation(ThreadState* thread) {
+    Duration duration(FLAGS_duration, reads_);
+    ReadOptions options = read_options_;
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      Slice ts;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
+        options.timestamp = &ts;
+      }
+      Iterator* iter = db->NewIterator(options);
+      delete iter;
+      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
+    }
+  }
+
+  void IteratorCreationWhileWriting(ThreadState* thread) {
+    if (thread->tid > 0) {
+      IteratorCreation(thread);
+    } else {
+      BGWriter(thread, kWrite);
+    }
+  }
+
+  void SeekRandom(ThreadState* thread) {
+    int64_t read = 0;
+    int64_t found = 0;
+    int64_t bytes = 0;
+    ReadOptions options = read_options_;
+    std::unique_ptr<char[]> ts_guard;
+    Slice ts;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
+      options.timestamp = &ts;
+    }
+
+    std::vector<Iterator*> tailing_iters;
+    if (FLAGS_use_tailing_iterator) {
+      if (db_.db != nullptr) {
+        tailing_iters.push_back(db_.db->NewIterator(options));
+      } else {
+        for (const auto& db_with_cfh : multi_dbs_) {
+          tailing_iters.push_back(db_with_cfh.db->NewIterator(options));
+        }
+      }
+    }
+    options.auto_prefix_mode = FLAGS_auto_prefix_mode;
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+
+    std::unique_ptr<const char[]> upper_bound_key_guard;
+    Slice upper_bound = AllocateKey(&upper_bound_key_guard);
+    std::unique_ptr<const char[]> lower_bound_key_guard;
+    Slice lower_bound = AllocateKey(&lower_bound_key_guard);
+
+    Duration duration(FLAGS_duration, reads_);
+    char value_buffer[256];
+    while (!duration.Done(1)) {
+      int64_t seek_pos = thread->rand.Next() % FLAGS_num;
+      GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
+                                &key);
+      if (FLAGS_max_scan_distance != 0) {
+        if (FLAGS_reverse_iterator) {
+          GenerateKeyFromInt(
+              static_cast<uint64_t>(std::max(
+                  static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
+              FLAGS_num, &lower_bound);
+          options.iterate_lower_bound = &lower_bound;
+        } else {
+          auto min_num =
+              std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
+          GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
+                             &upper_bound);
+          options.iterate_upper_bound = &upper_bound;
+        }
+      } else if (FLAGS_auto_prefix_mode && prefix_extractor_ &&
+                 !FLAGS_reverse_iterator) {
+        // Set upper bound to next prefix
+        auto mutable_upper_bound = const_cast<char*>(upper_bound.data());
+        std::memcpy(mutable_upper_bound, key.data(), prefix_size_);
+        mutable_upper_bound[prefix_size_ - 1]++;
+        upper_bound = Slice(upper_bound.data(), prefix_size_);
+        options.iterate_upper_bound = &upper_bound;
+      }
+
+      // Pick a Iterator to use
+      uint64_t db_idx_to_use =
+          (db_.db == nullptr)
+              ? (uint64_t{thread->rand.Next()} % multi_dbs_.size())
+              : 0;
+      std::unique_ptr<Iterator> single_iter;
+      Iterator* iter_to_use;
+      if (FLAGS_use_tailing_iterator) {
+        iter_to_use = tailing_iters[db_idx_to_use];
+      } else {
+        if (db_.db != nullptr) {
+          single_iter.reset(db_.db->NewIterator(options));
+        } else {
+          single_iter.reset(multi_dbs_[db_idx_to_use].db->NewIterator(options));
+        }
+        iter_to_use = single_iter.get();
+      }
+
+      iter_to_use->Seek(key);
+      read++;
+      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
+        found++;
+      }
+
+      for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
+        // Copy out iterator's value to make sure we read them.
+        Slice value = iter_to_use->value();
+        memcpy(value_buffer, value.data(),
+               std::min(value.size(), sizeof(value_buffer)));
+        bytes += iter_to_use->key().size() + iter_to_use->value().size();
+
+        if (!FLAGS_reverse_iterator) {
+          iter_to_use->Next();
+        } else {
+          iter_to_use->Prev();
+        }
+        assert(iter_to_use->status().ok());
+      }
+
+      if (thread->shared->read_rate_limiter.get() != nullptr &&
+          read % 256 == 255) {
+        thread->shared->read_rate_limiter->Request(
+            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+      }
+
+      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
+    }
+    for (auto iter : tailing_iters) {
+      delete iter;
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
+             read);
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(msg);
+  }
+
+  void SeekRandomWhileWriting(ThreadState* thread) {
+    if (thread->tid > 0) {
+      SeekRandom(thread);
+    } else {
+      BGWriter(thread, kWrite);
+    }
+  }
+
+  void SeekRandomWhileMerging(ThreadState* thread) {
+    if (thread->tid > 0) {
+      SeekRandom(thread);
+    } else {
+      BGWriter(thread, kMerge);
+    }
+  }
+
+  void DoDelete(ThreadState* thread, bool seq) {
+    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
+                     FLAGS_write_batch_protection_bytes_per_key,
+                     user_timestamp_size_);
+    Duration duration(seq ? 0 : FLAGS_duration, deletes_);
+    int64_t i = 0;
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::unique_ptr<char[]> ts_guard;
+    Slice ts;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+
+    while (!duration.Done(entries_per_batch_)) {
+      DB* db = SelectDB(thread);
+      batch.Clear();
+      for (int64_t j = 0; j < entries_per_batch_; ++j) {
+        const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
+        GenerateKeyFromInt(k, FLAGS_num, &key);
+        batch.Delete(key);
+      }
+      Status s;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->Allocate(ts_guard.get());
+        s = batch.UpdateTimestamps(
+            ts, [this](uint32_t) { return user_timestamp_size_; });
+        if (!s.ok()) {
+          fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
+          ErrorExit();
+        }
+      }
+      s = db->Write(write_options_, &batch);
+      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
+      if (!s.ok()) {
+        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      i += entries_per_batch_;
+    }
+  }
+
+  void DeleteSeq(ThreadState* thread) { DoDelete(thread, true); }
+
+  void DeleteRandom(ThreadState* thread) { DoDelete(thread, false); }
+
+  void ReadWhileWriting(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      BGWriter(thread, kWrite);
+    }
+  }
+
+  void MultiReadWhileWriting(ThreadState* thread) {
+    if (thread->tid > 0) {
+      MultiReadRandom(thread);
+    } else {
+      BGWriter(thread, kWrite);
+    }
+  }
+
+  void ReadWhileMerging(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      BGWriter(thread, kMerge);
+    }
+  }
+
+  void BGWriter(ThreadState* thread, enum OperationType write_merge) {
+    // Special thread that keeps writing until other threads are done.
+    RandomGenerator gen;
+    int64_t bytes = 0;
+
+    std::unique_ptr<RateLimiter> write_rate_limiter;
+    if (FLAGS_benchmark_write_rate_limit > 0) {
+      write_rate_limiter.reset(
+          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
+    }
+
+    // Don't merge stats from this thread with the readers.
+    thread->stats.SetExcludeFromMerge();
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::unique_ptr<char[]> ts_guard;
+    std::unique_ptr<const char[]> begin_key_guard;
+    Slice begin_key = AllocateKey(&begin_key_guard);
+    std::unique_ptr<const char[]> end_key_guard;
+    Slice end_key = AllocateKey(&end_key_guard);
+    uint64_t num_range_deletions = 0;
+    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
+    std::vector<Slice> expanded_keys;
+    if (FLAGS_expand_range_tombstones) {
+      expanded_key_guards.resize(range_tombstone_width_);
+      for (auto& expanded_key_guard : expanded_key_guards) {
+        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
+      }
+    }
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+    uint32_t written = 0;
+    bool hint_printed = false;
+
+    while (true) {
+      DB* db = SelectDB(thread);
+      {
+        MutexLock l(&thread->shared->mu);
+        if (FLAGS_finish_after_writes && written == writes_) {
+          fprintf(stderr, "Exiting the writer after %u writes...\n", written);
+          break;
+        }
+        if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
+          // Other threads have finished
+          if (FLAGS_finish_after_writes) {
+            // Wait for the writes to be finished
+            if (!hint_printed) {
+              fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
+                      static_cast<int>(writes_) - written);
+              hint_printed = true;
+            }
+          } else {
+            // Finish the write immediately
+            break;
+          }
+        }
+      }
+
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      Status s;
+
+      Slice val = gen.Generate();
+      Slice ts;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->Allocate(ts_guard.get());
+      }
+      if (write_merge == kWrite) {
+        if (user_timestamp_size_ == 0) {
+          s = db->Put(write_options_, key, val);
+        } else {
+          s = db->Put(write_options_, key, ts, val);
+        }
+      } else {
+        s = db->Merge(write_options_, key, val);
+      }
+      // Restore write_options_
+      written++;
+
+      if (!s.ok()) {
+        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      bytes += key.size() + val.size() + user_timestamp_size_;
+      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
+
+      if (FLAGS_benchmark_write_rate_limit > 0) {
+        write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
+                                    nullptr /* stats */,
+                                    RateLimiter::OpType::kWrite);
+      }
+
+      if (writes_per_range_tombstone_ > 0 &&
+          written > writes_before_delete_range_ &&
+          (written - writes_before_delete_range_) /
+                  writes_per_range_tombstone_ <=
+              max_num_range_tombstones_ &&
+          (written - writes_before_delete_range_) %
+                  writes_per_range_tombstone_ ==
+              0) {
+        num_range_deletions++;
+        int64_t begin_num = thread->rand.Next() % FLAGS_num;
+        if (FLAGS_expand_range_tombstones) {
+          for (int64_t offset = 0; offset < range_tombstone_width_; ++offset) {
+            GenerateKeyFromInt(begin_num + offset, FLAGS_num,
+                               &expanded_keys[offset]);
+            if (!db->Delete(write_options_, expanded_keys[offset]).ok()) {
+              fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
+              exit(1);
+            }
+          }
+        } else {
+          GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
+          GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
+                             &end_key);
+          if (!db->DeleteRange(write_options_, db->DefaultColumnFamily(),
+                               begin_key, end_key)
+                   .ok()) {
+            fprintf(stderr, "deleterange error: %s\n", s.ToString().c_str());
+            exit(1);
+          }
+        }
+        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
+        // TODO: DeleteRange is not included in calculcation of bytes/rate
+        // limiter request
+      }
+    }
+    if (num_range_deletions > 0) {
+      std::cout << "Number of range deletions: " << num_range_deletions
+                << std::endl;
+    }
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadWhileScanning(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      BGScan(thread);
+    }
+  }
+
+  void BGScan(ThreadState* thread) {
+    if (FLAGS_num_multi_db > 0) {
+      fprintf(stderr, "Not supporting multiple DBs.\n");
+      abort();
+    }
+    assert(db_.db != nullptr);
+    ReadOptions read_options = read_options_;
+    std::unique_ptr<char[]> ts_guard;
+    Slice ts;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
+      read_options.timestamp = &ts;
+    }
+    Iterator* iter = db_.db->NewIterator(read_options);
+
+    fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
+    Duration duration(FLAGS_duration, reads_);
+    uint64_t num_seek_to_first = 0;
+    uint64_t num_next = 0;
+    while (!duration.Done(1)) {
+      if (!iter->Valid()) {
+        iter->SeekToFirst();
+        num_seek_to_first++;
+      } else if (!iter->status().ok()) {
+        fprintf(stderr, "Iterator error: %s\n",
+                iter->status().ToString().c_str());
+        abort();
+      } else {
+        iter->Next();
+        num_next++;
+      }
+
+      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
+    }
+    delete iter;
+  }
+
+  // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
+  // in DB atomically i.e in a single batch. Also refer GetMany.
+  Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
+                 const Slice& value) {
+    std::string suffixes[3] = {"2", "1", "0"};
+    std::string keys[3];
+
+    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
+                     FLAGS_write_batch_protection_bytes_per_key,
+                     user_timestamp_size_);
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      batch.Put(keys[i], value);
+    }
+
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+      Slice ts = mock_app_clock_->Allocate(ts_guard.get());
+      s = batch.UpdateTimestamps(
+          ts, [this](uint32_t) { return user_timestamp_size_; });
+      if (!s.ok()) {
+        fprintf(stderr, "assign timestamp to batch: %s\n",
+                s.ToString().c_str());
+        ErrorExit();
+      }
+    }
+
+    s = db->Write(writeoptions, &batch);
+    return s;
+  }
+
+  // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
+  // in DB atomically i.e in a single batch. Also refer GetMany.
+  Status DeleteMany(DB* db, const WriteOptions& writeoptions,
+                    const Slice& key) {
+    std::string suffixes[3] = {"1", "2", "0"};
+    std::string keys[3];
+
+    WriteBatch batch(0, 0, FLAGS_write_batch_protection_bytes_per_key,
+                     user_timestamp_size_);
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      batch.Delete(keys[i]);
+    }
+
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+      Slice ts = mock_app_clock_->Allocate(ts_guard.get());
+      s = batch.UpdateTimestamps(
+          ts, [this](uint32_t) { return user_timestamp_size_; });
+      if (!s.ok()) {
+        fprintf(stderr, "assign timestamp to batch: %s\n",
+                s.ToString().c_str());
+        ErrorExit();
+      }
+    }
+
+    s = db->Write(writeoptions, &batch);
+    return s;
+  }
+
+  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
+  // in the same snapshot, and verifies that all the values are identical.
+  // ASSUMES that PutMany was used to put (K, V) into the DB.
+  Status GetMany(DB* db, const Slice& key, std::string* value) {
+    std::string suffixes[3] = {"0", "1", "2"};
+    std::string keys[3];
+    Slice key_slices[3];
+    std::string values[3];
+    ReadOptions readoptionscopy = read_options_;
+
+    std::unique_ptr<char[]> ts_guard;
+    Slice ts;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+      ts = mock_app_clock_->Allocate(ts_guard.get());
+      readoptionscopy.timestamp = &ts;
+    }
+
+    readoptionscopy.snapshot = db->GetSnapshot();
+    Status s;
+    for (int i = 0; i < 3; i++) {
+      keys[i] = key.ToString() + suffixes[i];
+      key_slices[i] = keys[i];
+      s = db->Get(readoptionscopy, key_slices[i], value);
+      if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+        values[i] = "";
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      } else if (s.IsNotFound()) {
+        values[i] = "";
+      } else {
+        values[i] = *value;
+      }
+    }
+    db->ReleaseSnapshot(readoptionscopy.snapshot);
+
+    if ((values[0] != values[1]) || (values[1] != values[2])) {
+      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
+              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
+              values[2].c_str());
+      // we continue after error rather than exiting so that we can
+      // find more errors if any
+    }
+
+    return s;
+  }
+
+  // Differs from readrandomwriterandom in the following ways:
+  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
+  // (b) Does deletes as well (per FLAGS_deletepercent)
+  // (c) In order to achieve high % of 'found' during lookups, and to do
+  //     multiple writes (including puts and deletes) it uses upto
+  //     FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
+  // (d) Does not have a MultiGet option.
+  void RandomWithVerify(ThreadState* thread) {
+    RandomGenerator gen;
+    std::string value;
+    int64_t found = 0;
+    int get_weight = 0;
+    int put_weight = 0;
+    int delete_weight = 0;
+    int64_t gets_done = 0;
+    int64_t puts_done = 0;
+    int64_t deletes_done = 0;
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+
+    // the number of iterations is the larger of read_ or write_
+    for (int64_t i = 0; i < readwrites_; i++) {
+      DB* db = SelectDB(thread);
+      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
+        // one batch completed, reinitialize for next batch
+        get_weight = FLAGS_readwritepercent;
+        delete_weight = FLAGS_deletepercent;
+        put_weight = 100 - get_weight - delete_weight;
+      }
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
+                         FLAGS_numdistinct, &key);
+      if (get_weight > 0) {
+        // do all the gets first
+        Status s = GetMany(db, key, &value);
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          found++;
+        }
+        get_weight--;
+        gets_done++;
+        thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
+      } else if (put_weight > 0) {
+        // then do all the corresponding number of puts
+        // for all the gets we have done earlier
+        Status s = PutMany(db, write_options_, key, gen.Generate());
+        if (!s.ok()) {
+          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        put_weight--;
+        puts_done++;
+        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
+      } else if (delete_weight > 0) {
+        Status s = DeleteMany(db, write_options_, key);
+        if (!s.ok()) {
+          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        delete_weight--;
+        deletes_done++;
+        thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
+      }
+    }
+    char msg[128];
+    snprintf(msg, sizeof(msg),
+             "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64
+             " found:%" PRIu64 ")",
+             gets_done, puts_done, deletes_done, readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  // This is different from ReadWhileWriting because it does not use
+  // an extra thread.
+  void ReadRandomWriteRandom(ThreadState* thread) {
+    ReadOptions options = read_options_;
+    RandomGenerator gen;
+    std::string value;
+    int64_t found = 0;
+    int get_weight = 0;
+    int put_weight = 0;
+    int64_t reads_done = 0;
+    int64_t writes_done = 0;
+    Duration duration(FLAGS_duration, readwrites_);
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+
+    // the number of iterations is the larger of read_ or write_
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      if (get_weight == 0 && put_weight == 0) {
+        // one batch completed, reinitialize for next batch
+        get_weight = FLAGS_readwritepercent;
+        put_weight = 100 - get_weight;
+      }
+      if (get_weight > 0) {
+        // do all the gets first
+        Slice ts;
+        if (user_timestamp_size_ > 0) {
+          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
+                                                    ts_guard.get());
+          options.timestamp = &ts;
+        }
+        Status s = db->Get(options, key, &value);
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          found++;
+        }
+        get_weight--;
+        reads_done++;
+        thread->stats.FinishedOps(nullptr, db, 1, kRead);
+      } else if (put_weight > 0) {
+        // then do all the corresponding number of puts
+        // for all the gets we have done earlier
+        Status s;
+        if (user_timestamp_size_ > 0) {
+          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
+          s = db->Put(write_options_, key, ts, gen.Generate());
+        } else {
+          s = db->Put(write_options_, key, gen.Generate());
+        }
+        if (!s.ok()) {
+          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+          ErrorExit();
+        }
+        put_weight--;
+        writes_done++;
+        thread->stats.FinishedOps(nullptr, db, 1, kWrite);
+      }
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64
+             " found:%" PRIu64 ")",
+             reads_done, writes_done, readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  //
+  // Read-modify-write for random keys
+  void UpdateRandom(ThreadState* thread) {
+    ReadOptions options = read_options_;
+    RandomGenerator gen;
+    std::string value;
+    int64_t found = 0;
+    int64_t bytes = 0;
+    Duration duration(FLAGS_duration, readwrites_);
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+    // the number of iterations is the larger of read_ or write_
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      Slice ts;
+      if (user_timestamp_size_ > 0) {
+        // Read with newest timestamp because we are doing rmw.
+        ts = mock_app_clock_->Allocate(ts_guard.get());
+        options.timestamp = &ts;
+      }
+
+      auto status = db->Get(options, key, &value);
+      if (status.ok()) {
+        ++found;
+        bytes += key.size() + value.size() + user_timestamp_size_;
+      } else if (!status.IsNotFound()) {
+        fprintf(stderr, "Get returned an error: %s\n",
+                status.ToString().c_str());
+        abort();
+      }
+
+      if (thread->shared->write_rate_limiter) {
+        thread->shared->write_rate_limiter->Request(
+            key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
+            RateLimiter::OpType::kWrite);
+      }
+
+      Slice val = gen.Generate();
+      Status s;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->Allocate(ts_guard.get());
+        s = db->Put(write_options_, key, ts, val);
+      } else {
+        s = db->Put(write_options_, key, val);
+      }
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      bytes += key.size() + val.size() + user_timestamp_size_;
+      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
+             readwrites_, found);
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read-XOR-write for random keys. Xors the existing value with a randomly
+  // generated value, and stores the result. Assuming A in the array of bytes
+  // representing the existing value, we generate an array B of the same size,
+  // then compute C = A^B as C[i]=A[i]^B[i], and store C
+  void XORUpdateRandom(ThreadState* thread) {
+    ReadOptions options = read_options_;
+    RandomGenerator gen;
+    std::string existing_value;
+    int64_t found = 0;
+    Duration duration(FLAGS_duration, readwrites_);
+
+    BytesXOROperator xor_operator;
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+    // the number of iterations is the larger of read_ or write_
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      Slice ts;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->Allocate(ts_guard.get());
+        options.timestamp = &ts;
+      }
+
+      auto status = db->Get(options, key, &existing_value);
+      if (status.ok()) {
+        ++found;
+      } else if (!status.IsNotFound()) {
+        fprintf(stderr, "Get returned an error: %s\n",
+                status.ToString().c_str());
+        exit(1);
+      }
+
+      Slice value =
+          gen.Generate(static_cast<unsigned int>(existing_value.size()));
+      std::string new_value;
+
+      if (status.ok()) {
+        Slice existing_value_slice = Slice(existing_value);
+        xor_operator.XOR(&existing_value_slice, value, &new_value);
+      } else {
+        xor_operator.XOR(nullptr, value, &new_value);
+      }
+
+      Status s;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->Allocate(ts_guard.get());
+        s = db->Put(write_options_, key, ts, Slice(new_value));
+      } else {
+        s = db->Put(write_options_, key, Slice(new_value));
+      }
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        ErrorExit();
+      }
+      thread->stats.FinishedOps(nullptr, db, 1);
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
+             readwrites_, found);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read-modify-write for random keys.
+  // Each operation causes the key grow by value_size (simulating an append).
+  // Generally used for benchmarking against merges of similar type
+  void AppendRandom(ThreadState* thread) {
+    ReadOptions options = read_options_;
+    RandomGenerator gen;
+    std::string value;
+    int64_t found = 0;
+    int64_t bytes = 0;
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+    // The number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      Slice ts;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->Allocate(ts_guard.get());
+        options.timestamp = &ts;
+      }
+
+      auto status = db->Get(options, key, &value);
+      if (status.ok()) {
+        ++found;
+        bytes += key.size() + value.size() + user_timestamp_size_;
+      } else if (!status.IsNotFound()) {
+        fprintf(stderr, "Get returned an error: %s\n",
+                status.ToString().c_str());
+        abort();
+      } else {
+        // If not existing, then just assume an empty string of data
+        value.clear();
+      }
+
+      // Update the value (by appending data)
+      Slice operand = gen.Generate();
+      if (value.size() > 0) {
+        // Use a delimiter to match the semantics for StringAppendOperator
+        value.append(1, ',');
+      }
+      value.append(operand.data(), operand.size());
+
+      Status s;
+      if (user_timestamp_size_ > 0) {
+        ts = mock_app_clock_->Allocate(ts_guard.get());
+        s = db->Put(write_options_, key, ts, value);
+      } else {
+        // Write back to the database
+        s = db->Put(write_options_, key, value);
+      }
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        ErrorExit();
+      }
+      bytes += key.size() + value.size() + user_timestamp_size_;
+      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
+             readwrites_, found);
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read-modify-write for random keys (using MergeOperator)
+  // The merge operator to use should be defined by FLAGS_merge_operator
+  // Adjust FLAGS_value_size so that the keys are reasonable for this operator
+  // Assumes that the merge operator is non-null (i.e.: is well-defined)
+  //
+  // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
+  // to simulate random additions over 64-bit integers using merge.
+  //
+  // The number of merges on the same key can be controlled by adjusting
+  // FLAGS_merge_keys.
+  void MergeRandom(ThreadState* thread) {
+    RandomGenerator gen;
+    int64_t bytes = 0;
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    // The number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
+      int64_t key_rand = thread->rand.Next() % merge_keys_;
+      GenerateKeyFromInt(key_rand, merge_keys_, &key);
+
+      Status s;
+      Slice val = gen.Generate();
+      if (FLAGS_num_column_families > 1) {
+        s = db_with_cfh->db->Merge(write_options_,
+                                   db_with_cfh->GetCfh(key_rand), key, val);
+      } else {
+        s = db_with_cfh->db->Merge(
+            write_options_, db_with_cfh->db->DefaultColumnFamily(), key, val);
+      }
+
+      if (!s.ok()) {
+        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+      bytes += key.size() + val.size();
+      thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
+    }
+
+    // Print some statistics
+    char msg[100];
+    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(msg);
+  }
+
+  // Read and merge random keys. The amount of reads and merges are controlled
+  // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
+  // keys (and thus also the number of reads and merges on the same key) can be
+  // adjusted with FLAGS_merge_keys.
+  //
+  // As with MergeRandom, the merge operator to use should be defined by
+  // FLAGS_merge_operator.
+  void ReadRandomMergeRandom(ThreadState* thread) {
+    RandomGenerator gen;
+    std::string value;
+    int64_t num_hits = 0;
+    int64_t num_gets = 0;
+    int64_t num_merges = 0;
+    size_t max_length = 0;
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    // the number of iterations is the larger of read_ or write_
+    Duration duration(FLAGS_duration, readwrites_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
+
+      bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
+
+      if (do_merge) {
+        Status s = db->Merge(write_options_, key, gen.Generate());
+        if (!s.ok()) {
+          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+        num_merges++;
+        thread->stats.FinishedOps(nullptr, db, 1, kMerge);
+      } else {
+        Status s = db->Get(read_options_, key, &value);
+        if (value.length() > max_length) max_length = value.length();
+
+        if (!s.ok() && !s.IsNotFound()) {
+          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        } else if (!s.IsNotFound()) {
+          num_hits++;
+        }
+        num_gets++;
+        thread->stats.FinishedOps(nullptr, db, 1, kRead);
+      }
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg),
+             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
+             " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
+             num_gets, num_merges, readwrites_, num_hits, max_length);
+    thread->stats.AddMessage(msg);
+  }
+
+  void WriteSeqSeekSeq(ThreadState* thread) {
+    writes_ = FLAGS_num;
+    DoWrite(thread, SEQUENTIAL);
+    // exclude writes from the ops/sec calculation
+    thread->stats.Start(thread->tid);
+
+    DB* db = SelectDB(thread);
+    ReadOptions read_opts = read_options_;
+    std::unique_ptr<char[]> ts_guard;
+    Slice ts;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
+      read_opts.timestamp = &ts;
+    }
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_opts));
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    for (int64_t i = 0; i < FLAGS_num; ++i) {
+      GenerateKeyFromInt(i, FLAGS_num, &key);
+      iter->Seek(key);
+      assert(iter->Valid() && iter->key() == key);
+      thread->stats.FinishedOps(nullptr, db, 1, kSeek);
+
+      for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
+        if (!FLAGS_reverse_iterator) {
+          iter->Next();
+        } else {
+          iter->Prev();
+        }
+        GenerateKeyFromInt(++i, FLAGS_num, &key);
+        assert(iter->Valid() && iter->key() == key);
+        thread->stats.FinishedOps(nullptr, db, 1, kSeek);
+      }
+
+      iter->Seek(key);
+      assert(iter->Valid() && iter->key() == key);
+      thread->stats.FinishedOps(nullptr, db, 1, kSeek);
+    }
+  }
+
+  bool binary_search(std::vector<int>& data, int start, int end, int key) {
+    if (data.empty()) return false;
+    if (start > end) return false;
+    int mid = start + (end - start) / 2;
+    if (mid > static_cast<int>(data.size()) - 1) return false;
+    if (data[mid] == key) {
+      return true;
+    } else if (data[mid] > key) {
+      return binary_search(data, start, mid - 1, key);
+    } else {
+      return binary_search(data, mid + 1, end, key);
+    }
+  }
+
+  // Does a bunch of merge operations for a key(key1) where the merge operand
+  // is a sorted list. Next performance comparison is done between doing a Get
+  // for key1 followed by searching for another key(key2) in the large sorted
+  // list vs calling GetMergeOperands for key1 and then searching for the key2
+  // in all the sorted sub-lists. Later case is expected to be a lot faster.
+  void GetMergeOperands(ThreadState* thread) {
+    DB* db = SelectDB(thread);
+    const int kTotalValues = 100000;
+    const int kListSize = 100;
+    std::string key = "my_key";
+    std::string value;
+
+    for (int i = 1; i < kTotalValues; i++) {
+      if (i % kListSize == 0) {
+        // Remove trailing ','
+        value.pop_back();
+        db->Merge(WriteOptions(), key, value);
+        value.clear();
+      } else {
+        value.append(std::to_string(i)).append(",");
+      }
+    }
+
+    SortList s;
+    std::vector<int> data;
+    // This value can be experimented with and it will demonstrate the
+    // perf difference between doing a Get and searching for lookup_key in the
+    // resultant large sorted list vs doing GetMergeOperands and searching
+    // for lookup_key within this resultant sorted sub-lists.
+    int lookup_key = 1;
+
+    // Get API call
+    std::cout << "--- Get API call --- \n";
+    PinnableSlice p_slice;
+    uint64_t st = FLAGS_env->NowNanos();
+    db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
+    s.MakeVector(data, p_slice);
+    bool found =
+        binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
+    std::cout << "Found key? " << std::to_string(found) << "\n";
+    uint64_t sp = FLAGS_env->NowNanos();
+    std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
+    std::string* dat_ = p_slice.GetSelf();
+    std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
+              << "\n";
+    data.clear();
+
+    // GetMergeOperands API call
+    std::cout << "--- GetMergeOperands API --- \n";
+    std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
+    st = FLAGS_env->NowNanos();
+    int number_of_operands = 0;
+    GetMergeOperandsOptions get_merge_operands_options;
+    get_merge_operands_options.expected_max_number_of_operands =
+        (kTotalValues / 100) + 1;
+    db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
+                         a_slice.data(), &get_merge_operands_options,
+                         &number_of_operands);
+    for (PinnableSlice& psl : a_slice) {
+      s.MakeVector(data, psl);
+      found =
+          binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
+      data.clear();
+      if (found) break;
+    }
+    std::cout << "Found key? " << std::to_string(found) << "\n";
+    sp = FLAGS_env->NowNanos();
+    std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
+              << " seconds \n";
+    int to_print = 0;
+    std::cout << "Sample data from GetMergeOperands API call: ";
+    for (PinnableSlice& psl : a_slice) {
+      std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
+      if (to_print++ > 2) break;
+    }
+  }
+
+#ifndef ROCKSDB_LITE
+  void VerifyChecksum(ThreadState* thread) {
+    DB* db = SelectDB(thread);
+    ReadOptions ro;
+    ro.adaptive_readahead = FLAGS_adaptive_readahead;
+    ro.async_io = FLAGS_async_io;
+    ro.rate_limiter_priority =
+        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+    ro.readahead_size = FLAGS_readahead_size;
+    Status s = db->VerifyChecksum(ro);
+    if (!s.ok()) {
+      fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  void VerifyFileChecksums(ThreadState* thread) {
+    DB* db = SelectDB(thread);
+    ReadOptions ro;
+    ro.adaptive_readahead = FLAGS_adaptive_readahead;
+    ro.async_io = FLAGS_async_io;
+    ro.rate_limiter_priority =
+        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
+    ro.readahead_size = FLAGS_readahead_size;
+    Status s = db->VerifyFileChecksums(ro);
+    if (!s.ok()) {
+      fprintf(stderr, "VerifyFileChecksums() failed: %s\n",
+              s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  // This benchmark stress tests Transactions.  For a given --duration (or
+  // total number of --writes, a Transaction will perform a read-modify-write
+  // to increment the value of a key in each of N(--transaction-sets) sets of
+  // keys (where each set has --num keys).  If --threads is set, this will be
+  // done in parallel.
+  //
+  // To test transactions, use --transaction_db=true.  Not setting this
+  // parameter
+  // will run the same benchmark without transactions.
+  //
+  // RandomTransactionVerify() will then validate the correctness of the results
+  // by checking if the sum of all keys in each set is the same.
+  void RandomTransaction(ThreadState* thread) {
+    Duration duration(FLAGS_duration, readwrites_);
+    uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
+    uint64_t transactions_done = 0;
+
+    if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
+      fprintf(stderr, "invalid value for transaction_sets\n");
+      abort();
+    }
+
+    TransactionOptions txn_options;
+    txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
+    txn_options.set_snapshot = FLAGS_transaction_set_snapshot;
+
+    RandomTransactionInserter inserter(&thread->rand, write_options_,
+                                       read_options_, FLAGS_num,
+                                       num_prefix_ranges);
+
+    if (FLAGS_num_multi_db > 1) {
+      fprintf(stderr,
+              "Cannot run RandomTransaction benchmark with "
+              "FLAGS_multi_db > 1.");
+      abort();
+    }
+
+    while (!duration.Done(1)) {
+      bool success;
+
+      // RandomTransactionInserter will attempt to insert a key for each
+      // # of FLAGS_transaction_sets
+      if (FLAGS_optimistic_transaction_db) {
+        success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
+      } else if (FLAGS_transaction_db) {
+        TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
+        success = inserter.TransactionDBInsert(txn_db, txn_options);
+      } else {
+        success = inserter.DBInsert(db_.db);
+      }
+
+      if (!success) {
+        fprintf(stderr, "Unexpected error: %s\n",
+                inserter.GetLastStatus().ToString().c_str());
+        abort();
+      }
+
+      thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
+      transactions_done++;
+    }
+
+    char msg[100];
+    if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
+      snprintf(msg, sizeof(msg),
+               "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
+               transactions_done, inserter.GetFailureCount());
+    } else {
+      snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
+    }
+    thread->stats.AddMessage(msg);
+    thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
+  }
+
+  // Verifies consistency of data after RandomTransaction() has been run.
+  // Since each iteration of RandomTransaction() incremented a key in each set
+  // by the same value, the sum of the keys in each set should be the same.
+  void RandomTransactionVerify() {
+    if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
+      // transactions not used, nothing to verify.
+      return;
+    }
+
+    Status s = RandomTransactionInserter::Verify(
+        db_.db, static_cast<uint16_t>(FLAGS_transaction_sets));
+
+    if (s.ok()) {
+      fprintf(stdout, "RandomTransactionVerify Success.\n");
+    } else {
+      fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
+    }
+  }
+#endif  // ROCKSDB_LITE
+
+  // Writes and deletes random keys without overwriting keys.
+  //
+  // This benchmark is intended to partially replicate the behavior of MyRocks
+  // secondary indices: All data is stored in keys and updates happen by
+  // deleting the old version of the key and inserting the new version.
+  void RandomReplaceKeys(ThreadState* thread) {
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::unique_ptr<char[]> ts_guard;
+    if (user_timestamp_size_ > 0) {
+      ts_guard.reset(new char[user_timestamp_size_]);
+    }
+    std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
+    size_t max_counter = 50;
+    RandomGenerator gen;
+
+    Status s;
+    DB* db = SelectDB(thread);
+    for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
+      GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
+      if (user_timestamp_size_ > 0) {
+        Slice ts = mock_app_clock_->Allocate(ts_guard.get());
+        s = db->Put(write_options_, key, ts, gen.Generate());
+      } else {
+        s = db->Put(write_options_, key, gen.Generate());
+      }
+      if (!s.ok()) {
+        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+    }
+
+    db->GetSnapshot();
+
+    std::default_random_engine generator;
+    std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
+                                                  FLAGS_stddev);
+    Duration duration(FLAGS_duration, FLAGS_num);
+    while (!duration.Done(1)) {
+      int64_t rnd_id = static_cast<int64_t>(distribution(generator));
+      int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
+                                static_cast<int64_t>(0));
+      GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
+                         &key);
+      if (user_timestamp_size_ > 0) {
+        Slice ts = mock_app_clock_->Allocate(ts_guard.get());
+        s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key, ts)
+                                     : db->Delete(write_options_, key, ts);
+      } else {
+        s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
+                                     : db->Delete(write_options_, key);
+      }
+      if (s.ok()) {
+        counters[key_id] = (counters[key_id] + 1) % max_counter;
+        GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
+                           &key);
+        if (user_timestamp_size_ > 0) {
+          Slice ts = mock_app_clock_->Allocate(ts_guard.get());
+          s = db->Put(write_options_, key, ts, Slice());
+        } else {
+          s = db->Put(write_options_, key, Slice());
+        }
+      }
+
+      if (!s.ok()) {
+        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+
+      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
+    }
+
+    char msg[200];
+    snprintf(msg, sizeof(msg),
+             "use single deletes: %d, "
+             "standard deviation: %lf\n",
+             FLAGS_use_single_deletes, FLAGS_stddev);
+    thread->stats.AddMessage(msg);
+  }
+
+  void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
+    int64_t read = 0;
+    int64_t found = 0;
+    int64_t bytes = 0;
+
+    Iterator* iter = nullptr;
+    // Only work on single database
+    assert(db_.db != nullptr);
+    iter = db_.db->NewIterator(read_options_);
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+
+    char value_buffer[256];
+    while (true) {
+      {
+        MutexLock l(&thread->shared->mu);
+        if (thread->shared->num_done >= 1) {
+          // Write thread have finished
+          break;
+        }
+      }
+      if (!FLAGS_use_tailing_iterator) {
+        delete iter;
+        iter = db_.db->NewIterator(read_options_);
+      }
+      // Pick a Iterator to use
+
+      int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
+      GenerateKeyFromInt(key_id, FLAGS_num, &key);
+      // Reset last 8 bytes to 0
+      char* start = const_cast<char*>(key.data());
+      start += key.size() - 8;
+      memset(start, 0, 8);
+      ++read;
+
+      bool key_found = false;
+      // Seek the prefix
+      for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
+           iter->Next()) {
+        key_found = true;
+        // Copy out iterator's value to make sure we read them.
+        if (do_deletion) {
+          bytes += iter->key().size();
+          if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
+            thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
+            db_.db->Delete(write_options_, iter->key());
+          } else {
+            break;
+          }
+        } else {
+          bytes += iter->key().size() + iter->value().size();
+          thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
+          Slice value = iter->value();
+          memcpy(value_buffer, value.data(),
+                 std::min(value.size(), sizeof(value_buffer)));
+
+          assert(iter->status().ok());
+        }
+      }
+      found += key_found;
+
+      if (thread->shared->read_rate_limiter.get() != nullptr) {
+        thread->shared->read_rate_limiter->Request(
+            1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+      }
+    }
+    delete iter;
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
+             read);
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(msg);
+  }
+
+  void TimeSeriesWrite(ThreadState* thread) {
+    // Special thread that keeps writing until other threads are done.
+    RandomGenerator gen;
+    int64_t bytes = 0;
+
+    // Don't merge stats from this thread with the readers.
+    thread->stats.SetExcludeFromMerge();
+
+    std::unique_ptr<RateLimiter> write_rate_limiter;
+    if (FLAGS_benchmark_write_rate_limit > 0) {
+      write_rate_limiter.reset(
+          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
+    }
+
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+
+    Duration duration(FLAGS_duration, writes_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+
+      uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
+      // Write key id
+      GenerateKeyFromInt(key_id, FLAGS_num, &key);
+      // Write timestamp
+
+      char* start = const_cast<char*>(key.data());
+      char* pos = start + 8;
+      int bytes_to_fill =
+          std::min(key_size_ - static_cast<int>(pos - start), 8);
+      uint64_t timestamp_value = timestamp_emulator_->Get();
+      if (port::kLittleEndian) {
+        for (int i = 0; i < bytes_to_fill; ++i) {
+          pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
+        }
+      } else {
+        memcpy(pos, static_cast<void*>(&timestamp_value), bytes_to_fill);
+      }
+
+      timestamp_emulator_->Inc();
+
+      Status s;
+      Slice val = gen.Generate();
+      s = db->Put(write_options_, key, val);
+
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        ErrorExit();
+      }
+      bytes = key.size() + val.size();
+      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
+      thread->stats.AddBytes(bytes);
+
+      if (FLAGS_benchmark_write_rate_limit > 0) {
+        write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH,
+                                    nullptr /* stats */,
+                                    RateLimiter::OpType::kWrite);
+      }
+    }
+  }
+
+  void TimeSeries(ThreadState* thread) {
+    if (thread->tid > 0) {
+      bool do_deletion = FLAGS_expire_style == "delete" &&
+                         thread->tid <= FLAGS_num_deletion_threads;
+      TimeSeriesReadOrDelete(thread, do_deletion);
+    } else {
+      TimeSeriesWrite(thread);
+      thread->stats.Stop();
+      thread->stats.Report("timeseries write");
+    }
+  }
+
+  void Compact(ThreadState* thread) {
+    DB* db = SelectDB(thread);
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction =
+        BottommostLevelCompaction::kForceOptimized;
+    db->CompactRange(cro, nullptr, nullptr);
+  }
+
+  void CompactAll() {
+    if (db_.db != nullptr) {
+      db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    }
+    for (const auto& db_with_cfh : multi_dbs_) {
+      db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    }
+  }
+
+#ifndef ROCKSDB_LITE
+  void WaitForCompactionHelper(DBWithColumnFamilies& db) {
+    // This is an imperfect way of waiting for compaction. The loop and sleep
+    // is done because a thread that finishes a compaction job should get a
+    // chance to pickup a new compaction job.
+
+    std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending,
+                                     DB::Properties::kNumRunningFlushes,
+                                     DB::Properties::kCompactionPending,
+                                     DB::Properties::kNumRunningCompactions};
+
+    fprintf(stdout, "waitforcompaction(%s): started\n",
+            db.db->GetName().c_str());
+
+    while (true) {
+      bool retry = false;
+
+      for (const auto& k : keys) {
+        uint64_t v;
+        if (!db.db->GetIntProperty(k, &v)) {
+          fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n",
+                  db.db->GetName().c_str(), k.c_str());
+          exit(1);
+        } else if (v > 0) {
+          fprintf(stdout,
+                  "waitforcompaction(%s): active(%s). Sleep 10 seconds\n",
+                  db.db->GetName().c_str(), k.c_str());
+          FLAGS_env->SleepForMicroseconds(10 * 1000000);
+          retry = true;
+          break;
+        }
+      }
+
+      if (!retry) {
+        fprintf(stdout, "waitforcompaction(%s): finished\n",
+                db.db->GetName().c_str());
+        return;
+      }
+    }
+  }
+
+  void WaitForCompaction() {
+    // Give background threads a chance to wake
+    FLAGS_env->SleepForMicroseconds(5 * 1000000);
+
+    // I am skeptical that this check race free. I hope that checking twice
+    // reduces the chance.
+    if (db_.db != nullptr) {
+      WaitForCompactionHelper(db_);
+      WaitForCompactionHelper(db_);
+    } else {
+      for (auto& db_with_cfh : multi_dbs_) {
+        WaitForCompactionHelper(db_with_cfh);
+        WaitForCompactionHelper(db_with_cfh);
+      }
+    }
+  }
+
+  bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) {
+    std::vector<LiveFileMetaData> files;
+    db_with_cfh.db->GetLiveFilesMetaData(&files);
+
+    assert(from_level == 0 || from_level == 1);
+
+    int real_from_level = from_level;
+    if (real_from_level > 0) {
+      // With dynamic leveled compaction the first level with data beyond L0
+      // might not be L1.
+      real_from_level = std::numeric_limits<int>::max();
+
+      for (auto& f : files) {
+        if (f.level > 0 && f.level < real_from_level) real_from_level = f.level;
+      }
+
+      if (real_from_level == std::numeric_limits<int>::max()) {
+        fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
+        return true;
+      }
+    }
+
+    // The goal is to compact from from_level to the level that follows it,
+    // and with dynamic leveled compaction the next level might not be
+    // real_from_level+1
+    int next_level = std::numeric_limits<int>::max();
+
+    std::vector<std::string> files_to_compact;
+    for (auto& f : files) {
+      if (f.level == real_from_level)
+        files_to_compact.push_back(f.name);
+      else if (f.level > real_from_level && f.level < next_level)
+        next_level = f.level;
+    }
+
+    if (files_to_compact.empty()) {
+      fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
+      return true;
+    } else if (next_level == std::numeric_limits<int>::max()) {
+      // There is no data beyond real_from_level. So we are done.
+      fprintf(stdout, "compact%d found no data beyond L%d\n", from_level,
+              real_from_level);
+      return true;
+    }
+
+    fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n",
+            from_level, static_cast<int>(files_to_compact.size()),
+            real_from_level, next_level);
+
+    ROCKSDB_NAMESPACE::CompactionOptions options;
+    // Lets RocksDB use the configured compression for this level
+    options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption;
+
+    ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc;
+    db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc);
+    options.output_file_size_limit = cfDesc.options.target_file_size_base;
+
+    Status status =
+        db_with_cfh.db->CompactFiles(options, files_to_compact, next_level);
+    if (!status.ok()) {
+      // This can fail for valid reasons including the operation was aborted
+      // or a filename is invalid because background compaction removed it.
+      // Having read the current cases for which an error is raised I prefer
+      // not to figure out whether an exception should be thrown here.
+      fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level,
+              status.ToString().c_str());
+      return false;
+    }
+    return true;
+  }
+
+  void CompactLevel(int from_level) {
+    if (db_.db != nullptr) {
+      while (!CompactLevelHelper(db_, from_level)) WaitForCompaction();
+    }
+    for (auto& db_with_cfh : multi_dbs_) {
+      while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction();
+    }
+  }
+#endif
+
+  void Flush() {
+    FlushOptions flush_opt;
+    flush_opt.wait = true;
+
+    if (db_.db != nullptr) {
+      Status s;
+      if (FLAGS_num_column_families > 1) {
+        s = db_.db->Flush(flush_opt, db_.cfh);
+      } else {
+        s = db_.db->Flush(flush_opt, db_.db->DefaultColumnFamily());
+      }
+
+      if (!s.ok()) {
+        fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+    } else {
+      for (const auto& db_with_cfh : multi_dbs_) {
+        Status s;
+        if (FLAGS_num_column_families > 1) {
+          s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh);
+        } else {
+          s = db_with_cfh.db->Flush(flush_opt,
+                                    db_with_cfh.db->DefaultColumnFamily());
+        }
+
+        if (!s.ok()) {
+          fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+      }
+    }
+    fprintf(stdout, "flush memtable\n");
+  }
+
+  void ResetStats() {
+    if (db_.db != nullptr) {
+      db_.db->ResetStats();
+    }
+    for (const auto& db_with_cfh : multi_dbs_) {
+      db_with_cfh.db->ResetStats();
+    }
+  }
+
+  void PrintStatsHistory() {
+    if (db_.db != nullptr) {
+      PrintStatsHistoryImpl(db_.db, false);
+    }
+    for (const auto& db_with_cfh : multi_dbs_) {
+      PrintStatsHistoryImpl(db_with_cfh.db, true);
+    }
+  }
+
+  void PrintStatsHistoryImpl(DB* db, bool print_header) {
+    if (print_header) {
+      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
+    }
+
+    std::unique_ptr<StatsHistoryIterator> shi;
+    Status s =
+        db->GetStatsHistory(0, std::numeric_limits<uint64_t>::max(), &shi);
+    if (!s.ok()) {
+      fprintf(stdout, "%s\n", s.ToString().c_str());
+      return;
+    }
+    assert(shi);
+    while (shi->Valid()) {
+      uint64_t stats_time = shi->GetStatsTime();
+      fprintf(stdout, "------ %s ------\n",
+              TimeToHumanString(static_cast<int>(stats_time)).c_str());
+      for (auto& entry : shi->GetStatsMap()) {
+        fprintf(stdout, " %" PRIu64 "   %s  %" PRIu64 "\n", stats_time,
+                entry.first.c_str(), entry.second);
+      }
+      shi->Next();
+    }
+  }
+
+  void PrintStats(const char* key) {
+    if (db_.db != nullptr) {
+      PrintStats(db_.db, key, false);
+    }
+    for (const auto& db_with_cfh : multi_dbs_) {
+      PrintStats(db_with_cfh.db, key, true);
+    }
+  }
+
+  void PrintStats(DB* db, const char* key, bool print_header = false) {
+    if (print_header) {
+      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
+    }
+    std::string stats;
+    if (!db->GetProperty(key, &stats)) {
+      stats = "(failed)";
+    }
+    fprintf(stdout, "\n%s\n", stats.c_str());
+  }
+
+  void PrintStats(const std::vector<std::string>& keys) {
+    if (db_.db != nullptr) {
+      PrintStats(db_.db, keys);
+    }
+    for (const auto& db_with_cfh : multi_dbs_) {
+      PrintStats(db_with_cfh.db, keys, true);
+    }
+  }
+
+  void PrintStats(DB* db, const std::vector<std::string>& keys,
+                  bool print_header = false) {
+    if (print_header) {
+      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
+    }
+
+    for (const auto& key : keys) {
+      std::string stats;
+      if (!db->GetProperty(key, &stats)) {
+        stats = "(failed)";
+      }
+      fprintf(stdout, "%s: %s\n", key.c_str(), stats.c_str());
+    }
+  }
+
+#ifndef ROCKSDB_LITE
+
+  void Replay(ThreadState* thread) {
+    if (db_.db != nullptr) {
+      Replay(thread, &db_);
+    }
+  }
+
+  void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
+    Status s;
+    std::unique_ptr<TraceReader> trace_reader;
+    s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
+                           &trace_reader);
+    if (!s.ok()) {
+      fprintf(
+          stderr,
+          "Encountered an error creating a TraceReader from the trace file. "
+          "Error: %s\n",
+          s.ToString().c_str());
+      exit(1);
+    }
+    std::unique_ptr<Replayer> replayer;
+    s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh,
+                                            std::move(trace_reader), &replayer);
+    if (!s.ok()) {
+      fprintf(stderr,
+              "Encountered an error creating a default Replayer. "
+              "Error: %s\n",
+              s.ToString().c_str());
+      exit(1);
+    }
+    s = replayer->Prepare();
+    if (!s.ok()) {
+      fprintf(stderr, "Prepare for replay failed. Error: %s\n",
+              s.ToString().c_str());
+    }
+    s = replayer->Replay(
+        ReplayOptions(static_cast<uint32_t>(FLAGS_trace_replay_threads),
+                      FLAGS_trace_replay_fast_forward),
+        nullptr);
+    replayer.reset();
+    if (s.ok()) {
+      fprintf(stdout, "Replay completed from trace_file: %s\n",
+              FLAGS_trace_file.c_str());
+    } else {
+      fprintf(stderr, "Replay failed. Error: %s\n", s.ToString().c_str());
+    }
+  }
+
+  void Backup(ThreadState* thread) {
+    DB* db = SelectDB(thread);
+    std::unique_ptr<BackupEngineOptions> engine_options(
+        new BackupEngineOptions(FLAGS_backup_dir));
+    Status s;
+    BackupEngine* backup_engine;
+    if (FLAGS_backup_rate_limit > 0) {
+      engine_options->backup_rate_limiter.reset(NewGenericRateLimiter(
+          FLAGS_backup_rate_limit, 100000 /* refill_period_us */,
+          10 /* fairness */, RateLimiter::Mode::kAllIo));
+    }
+    // Build new backup of the entire DB
+    engine_options->destroy_old_data = true;
+    s = BackupEngine::Open(FLAGS_env, *engine_options, &backup_engine);
+    assert(s.ok());
+    s = backup_engine->CreateNewBackup(db);
+    assert(s.ok());
+    std::vector<BackupInfo> backup_info;
+    backup_engine->GetBackupInfo(&backup_info);
+    // Verify that a new backup is created
+    assert(backup_info.size() == 1);
+  }
+
+  void Restore(ThreadState* /* thread */) {
+    std::unique_ptr<BackupEngineOptions> engine_options(
+        new BackupEngineOptions(FLAGS_backup_dir));
+    if (FLAGS_restore_rate_limit > 0) {
+      engine_options->restore_rate_limiter.reset(NewGenericRateLimiter(
+          FLAGS_restore_rate_limit, 100000 /* refill_period_us */,
+          10 /* fairness */, RateLimiter::Mode::kAllIo));
+    }
+    BackupEngineReadOnly* backup_engine;
+    Status s =
+        BackupEngineReadOnly::Open(FLAGS_env, *engine_options, &backup_engine);
+    assert(s.ok());
+    s = backup_engine->RestoreDBFromLatestBackup(FLAGS_restore_dir,
+                                                 FLAGS_restore_dir);
+    assert(s.ok());
+    delete backup_engine;
+  }
+
+#endif  // ROCKSDB_LITE
+};
+
+int db_bench_tool(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ConfigOptions config_options;
+  static bool initialized = false;
+  if (!initialized) {
+    SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                    " [OPTIONS]...");
+    SetVersionString(GetRocksVersionAsString(true));
+    initialized = true;
+  }
+  ParseCommandLineFlags(&argc, &argv, true);
+  FLAGS_compaction_style_e =
+      (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
+#ifndef ROCKSDB_LITE
+  if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
+    fprintf(stderr,
+            "Cannot provide both --statistics and --statistics_string.\n");
+    exit(1);
+  }
+  if (!FLAGS_statistics_string.empty()) {
+    Status s = Statistics::CreateFromString(config_options,
+                                            FLAGS_statistics_string, &dbstats);
+    if (dbstats == nullptr) {
+      fprintf(stderr,
+              "No Statistics registered matching string: %s status=%s\n",
+              FLAGS_statistics_string.c_str(), s.ToString().c_str());
+      exit(1);
+    }
+  }
+#endif  // ROCKSDB_LITE
+  if (FLAGS_statistics) {
+    dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  }
+  if (dbstats) {
+    dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
+  }
+  FLAGS_compaction_pri_e =
+      (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;
+
+  std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(
+      FLAGS_max_bytes_for_level_multiplier_additional, ',');
+  for (size_t j = 0; j < fanout.size(); j++) {
+    FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
+#ifndef CYGWIN
+        std::stoi(fanout[j]));
+#else
+        stoi(fanout[j]));
+#endif
+  }
+
+  FLAGS_compression_type_e =
+      StringToCompressionType(FLAGS_compression_type.c_str());
+
+  FLAGS_wal_compression_e =
+      StringToCompressionType(FLAGS_wal_compression.c_str());
+
+  FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType(
+      FLAGS_compressed_secondary_cache_compression_type.c_str());
+
+#ifndef ROCKSDB_LITE
+  // Stacked BlobDB
+  FLAGS_blob_db_compression_type_e =
+      StringToCompressionType(FLAGS_blob_db_compression_type.c_str());
+
+  int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
+  if (env_opts > 1) {
+    fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
+    exit(1);
+  }
+
+  if (env_opts == 1) {
+    Status s = Env::CreateFromUri(config_options, FLAGS_env_uri, FLAGS_fs_uri,
+                                  &FLAGS_env, &env_guard);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
+    //**TODO: Make the simulate fs something that can be loaded
+    // from the ObjectRegistry...
+    static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
+        NewCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
+            FileSystem::Default(), FLAGS_simulate_hybrid_fs_file,
+            /*throughput_multiplier=*/
+            int{FLAGS_simulate_hybrid_hdd_multipliers},
+            /*is_full_fs_warm=*/FLAGS_simulate_hdd));
+    FLAGS_env = composite_env.get();
+  }
+
+  // Let -readonly imply -use_existing_db
+  FLAGS_use_existing_db |= FLAGS_readonly;
+#endif  // ROCKSDB_LITE
+
+  if (FLAGS_build_info) {
+    std::string build_info;
+    std::cout << GetRocksBuildInfoAsString(build_info, true) << std::endl;
+    // Similar to --version, nothing else will be done when this flag is set
+    exit(0);
+  }
+
+  if (!FLAGS_seed) {
+    uint64_t now = FLAGS_env->GetSystemClock()->NowMicros();
+    seed_base = static_cast<int64_t>(now);
+    fprintf(stdout, "Set seed to %" PRIu64 " because --seed was 0\n",
+            seed_base);
+  } else {
+    seed_base = FLAGS_seed;
+  }
+
+  if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
+    fprintf(stderr,
+            "`-use_existing_db` must be true for `-use_existing_keys` to be "
+            "settable\n");
+    exit(1);
+  }
+
+  if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
+    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
+  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
+    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
+  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
+    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
+  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
+    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
+  else {
+    fprintf(stdout, "Unknown compaction fadvice:%s\n",
+            FLAGS_compaction_fadvice.c_str());
+    exit(1);
+  }
+
+  FLAGS_value_size_distribution_type_e =
+      StringToDistributionType(FLAGS_value_size_distribution_type.c_str());
+
+  // Note options sanitization may increase thread pool sizes according to
+  // max_background_flushes/max_background_compactions/max_background_jobs
+  FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
+                                  ROCKSDB_NAMESPACE::Env::Priority::HIGH);
+  FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
+                                  ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
+  FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
+                                  ROCKSDB_NAMESPACE::Env::Priority::LOW);
+
+  // Choose a location for the test database if none given with --db=<path>
+  if (FLAGS_db.empty()) {
+    std::string default_db_path;
+    FLAGS_env->GetTestDirectory(&default_db_path);
+    default_db_path += "/dbbench";
+    FLAGS_db = default_db_path;
+  }
+
+  if (FLAGS_backup_dir.empty()) {
+    FLAGS_backup_dir = FLAGS_db + "/backup";
+  }
+
+  if (FLAGS_restore_dir.empty()) {
+    FLAGS_restore_dir = FLAGS_db + "/restore";
+  }
+
+  if (FLAGS_stats_interval_seconds > 0) {
+    // When both are set then FLAGS_stats_interval determines the frequency
+    // at which the timer is checked for FLAGS_stats_interval_seconds
+    FLAGS_stats_interval = 1000;
+  }
+
+  if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
+    fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
+    exit(1);
+  }
+
+  ROCKSDB_NAMESPACE::Benchmark benchmark;
+  benchmark.Run();
+
+#ifndef ROCKSDB_LITE
+  if (FLAGS_print_malloc_stats) {
+    std::string stats_string;
+    ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
+    fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
+  }
+#endif  // ROCKSDB_LITE
+
+  return 0;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/tools/db_bench_tool_test.cc b/src/rocksdb/tools/db_bench_tool_test.cc
new file mode 100644
index 000000000..a406ff66c
--- /dev/null
+++ b/src/rocksdb/tools/db_bench_tool_test.cc
@@ -0,0 +1,334 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db_bench_tool.h"
+
+#include "db/db_impl/db_impl.h"
+#include "options/options_parser.h"
+#include "rocksdb/utilities/options_util.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+#ifdef GFLAGS
+#include "util/gflags_compat.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static const int kMaxArgCount = 100;
+static const size_t kArgBufferSize = 100000;
+}  // namespace
+
+class DBBenchTest : public testing::Test {
+ public:
+  DBBenchTest() : rnd_(0xFB) {
+    test_path_ = test::PerThreadDBPath("db_bench_test");
+    Env::Default()->CreateDir(test_path_);
+    db_path_ = test_path_ + "/db";
+    wal_path_ = test_path_ + "/wal";
+  }
+
+  ~DBBenchTest() {
+    //  DestroyDB(db_path_, Options());
+  }
+
+  void ResetArgs() {
+    argc_ = 0;
+    cursor_ = 0;
+    memset(arg_buffer_, 0, kArgBufferSize);
+  }
+
+  void AppendArgs(const std::vector<std::string>& args) {
+    for (const auto& arg : args) {
+      ASSERT_LE(cursor_ + arg.size() + 1, kArgBufferSize);
+      ASSERT_LE(argc_ + 1, kMaxArgCount);
+      snprintf(arg_buffer_ + cursor_, arg.size() + 1, "%s", arg.c_str());
+
+      argv_[argc_++] = arg_buffer_ + cursor_;
+      cursor_ += arg.size() + 1;
+    }
+  }
+
+  // Gets the default options for this test/db_bench.
+  // Note that db_bench may change some of the default option values and that
+  // the database might as well.  The options changed by db_bench are
+  // specified here; the ones by the DB are set via SanitizeOptions
+  Options GetDefaultOptions(CompactionStyle style = kCompactionStyleLevel,
+                            int levels = 7) const {
+    Options opt;
+
+    opt.create_if_missing = true;
+    opt.max_open_files = 256;
+    opt.max_background_compactions = 10;
+    opt.dump_malloc_stats = true;  // db_bench uses a different default
+    opt.compaction_style = style;
+    opt.num_levels = levels;
+    opt.compression = kNoCompression;
+    opt.arena_block_size = 8388608;
+
+    return SanitizeOptions(db_path_, opt);
+  }
+
+  void RunDbBench(const std::string& options_file_name) {
+    AppendArgs({"./db_bench", "--benchmarks=fillseq", "--use_existing_db=0",
+                "--num=1000", "--compression_type=none",
+                std::string(std::string("--db=") + db_path_).c_str(),
+                std::string(std::string("--wal_dir=") + wal_path_).c_str(),
+                std::string(std::string("--options_file=") + options_file_name)
+                    .c_str()});
+    ASSERT_EQ(0, db_bench_tool(argc(), argv()));
+  }
+
+  void VerifyOptions(const Options& opt) {
+    DBOptions loaded_db_opts;
+    std::vector<ColumnFamilyDescriptor> cf_descs;
+    ASSERT_OK(LoadLatestOptions(db_path_, Env::Default(), &loaded_db_opts,
+                                &cf_descs));
+
+    ConfigOptions exact;
+    exact.input_strings_escaped = false;
+    exact.sanity_level = ConfigOptions::kSanityLevelExactMatch;
+    ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, DBOptions(opt),
+                                                    loaded_db_opts));
+    ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+        exact, ColumnFamilyOptions(opt), cf_descs[0].options));
+
+    // check with the default rocksdb options and expect failure
+    ASSERT_NOK(RocksDBOptionsParser::VerifyDBOptions(exact, DBOptions(),
+                                                     loaded_db_opts));
+    ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+        exact, ColumnFamilyOptions(), cf_descs[0].options));
+  }
+
+  char** argv() { return argv_; }
+
+  int argc() { return argc_; }
+
+  std::string db_path_;
+  std::string test_path_;
+  std::string wal_path_;
+
+  char arg_buffer_[kArgBufferSize];
+  char* argv_[kMaxArgCount];
+  int argc_ = 0;
+  int cursor_ = 0;
+  Random rnd_;
+};
+
+namespace {}  // namespace
+
+TEST_F(DBBenchTest, OptionsFile) {
+  const std::string kOptionsFileName = test_path_ + "/OPTIONS_test";
+  Options opt = GetDefaultOptions();
+  ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"},
+                                  {ColumnFamilyOptions(opt)}, kOptionsFileName,
+                                  opt.env->GetFileSystem().get()));
+
+  // override the following options as db_bench will not take these
+  // options from the options file
+  opt.wal_dir = wal_path_;
+
+  RunDbBench(kOptionsFileName);
+  opt.delayed_write_rate = 16 * 1024 * 1024;  // Set by SanitizeOptions
+
+  VerifyOptions(opt);
+}
+
+TEST_F(DBBenchTest, OptionsFileUniversal) {
+  const std::string kOptionsFileName = test_path_ + "/OPTIONS_test";
+
+  Options opt = GetDefaultOptions(kCompactionStyleUniversal, 1);
+
+  ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"},
+                                  {ColumnFamilyOptions(opt)}, kOptionsFileName,
+                                  opt.env->GetFileSystem().get()));
+
+  // override the following options as db_bench will not take these
+  // options from the options file
+  opt.wal_dir = wal_path_;
+  RunDbBench(kOptionsFileName);
+
+  VerifyOptions(opt);
+}
+
+TEST_F(DBBenchTest, OptionsFileMultiLevelUniversal) {
+  const std::string kOptionsFileName = test_path_ + "/OPTIONS_test";
+
+  Options opt = GetDefaultOptions(kCompactionStyleUniversal, 12);
+
+  ASSERT_OK(PersistRocksDBOptions(DBOptions(opt), {"default"},
+                                  {ColumnFamilyOptions(opt)}, kOptionsFileName,
+                                  opt.env->GetFileSystem().get()));
+
+  // override the following options as db_bench will not take these
+  // options from the options file
+  opt.wal_dir = wal_path_;
+
+  RunDbBench(kOptionsFileName);
+  VerifyOptions(opt);
+}
+
+const std::string options_file_content = R"OPTIONS_FILE(
+[Version]
+  rocksdb_version=4.3.1
+  options_file_version=1.1
+
+[DBOptions]
+  wal_bytes_per_sync=1048576
+  delete_obsolete_files_period_micros=0
+  WAL_ttl_seconds=0
+  WAL_size_limit_MB=0
+  db_write_buffer_size=0
+  max_subcompactions=1
+  table_cache_numshardbits=4
+  max_open_files=-1
+  max_file_opening_threads=10
+  max_background_compactions=5
+  use_fsync=false
+  use_adaptive_mutex=false
+  max_total_wal_size=18446744073709551615
+  compaction_readahead_size=0
+  keep_log_file_num=10
+  skip_stats_update_on_db_open=false
+  max_manifest_file_size=18446744073709551615
+  db_log_dir=
+  writable_file_max_buffer_size=1048576
+  paranoid_checks=true
+  is_fd_close_on_exec=true
+  bytes_per_sync=1048576
+  enable_thread_tracking=true
+  recycle_log_file_num=0
+  create_missing_column_families=false
+  log_file_time_to_roll=0
+  max_background_flushes=1
+  create_if_missing=true
+  error_if_exists=false
+  delayed_write_rate=1048576
+  manifest_preallocation_size=4194304
+  allow_mmap_reads=false
+  allow_mmap_writes=false
+  use_direct_reads=false
+  use_direct_io_for_flush_and_compaction=false
+  stats_dump_period_sec=600
+  allow_fallocate=true
+  max_log_file_size=83886080
+  random_access_max_buffer_size=1048576
+  advise_random_on_open=true
+  dump_malloc_stats=true
+
+[CFOptions "default"]
+  compaction_filter_factory=nullptr
+  table_factory=BlockBasedTable
+  prefix_extractor=nullptr
+  comparator=leveldb.BytewiseComparator
+  compression_per_level=
+  max_bytes_for_level_base=104857600
+  bloom_locality=0
+  target_file_size_base=10485760
+  memtable_huge_page_size=0
+  max_successive_merges=1000
+  max_sequential_skip_in_iterations=8
+  arena_block_size=52428800
+  target_file_size_multiplier=1
+  source_compaction_factor=1
+  min_write_buffer_number_to_merge=1
+  max_write_buffer_number=2
+  write_buffer_size=419430400
+  max_grandparent_overlap_factor=10
+  max_bytes_for_level_multiplier=10
+  memtable_factory=SkipListFactory
+  compression=kNoCompression
+  min_partial_merge_operands=2
+  level0_stop_writes_trigger=100
+  num_levels=1
+  level0_slowdown_writes_trigger=50
+  level0_file_num_compaction_trigger=10
+  expanded_compaction_factor=25
+  max_write_buffer_number_to_maintain=0
+  max_write_buffer_size_to_maintain=0
+  verify_checksums_in_compaction=true
+  merge_operator=nullptr
+  memtable_prefix_bloom_bits=0
+  memtable_whole_key_filtering=true
+  paranoid_file_checks=false
+  inplace_update_num_locks=10000
+  optimize_filters_for_hits=false
+  level_compaction_dynamic_level_bytes=false
+  inplace_update_support=false
+  compaction_style=kCompactionStyleUniversal
+  memtable_prefix_bloom_probes=6
+  filter_deletes=false
+  hard_pending_compaction_bytes_limit=0
+  disable_auto_compactions=false
+  compaction_measure_io_stats=false
+  enable_blob_files=true
+  min_blob_size=16
+  blob_file_size=10485760
+  blob_compression_type=kNoCompression
+  enable_blob_garbage_collection=true
+  blob_garbage_collection_age_cutoff=0.5
+  blob_garbage_collection_force_threshold=0.75
+  blob_compaction_readahead_size=262144
+  blob_file_starting_level=0
+  prepopulate_blob_cache=kDisable;
+
+[TableOptions/BlockBasedTable "default"]
+  format_version=0
+  skip_table_builder_flush=false
+  cache_index_and_filter_blocks=false
+  flush_block_policy_factory=FlushBlockBySizePolicyFactory
+  index_type=kBinarySearch
+  whole_key_filtering=true
+  checksum=kCRC32c
+  no_block_cache=false
+  block_size=32768
+  block_size_deviation=10
+  block_restart_interval=16
+  filter_policy=rocksdb.BuiltinBloomFilter
+)OPTIONS_FILE";
+
+TEST_F(DBBenchTest, OptionsFileFromFile) {
+  const std::string kOptionsFileName = test_path_ + "/OPTIONS_flash";
+  std::unique_ptr<WritableFile> writable;
+  ASSERT_OK(Env::Default()->NewWritableFile(kOptionsFileName, &writable,
+                                            EnvOptions()));
+  ASSERT_OK(writable->Append(options_file_content));
+  ASSERT_OK(writable->Close());
+
+  DBOptions db_opt;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  ASSERT_OK(LoadOptionsFromFile(kOptionsFileName, Env::Default(), &db_opt,
+                                &cf_descs));
+  Options opt(db_opt, cf_descs[0].options);
+  opt.create_if_missing = true;
+
+  // override the following options as db_bench will not take these
+  // options from the options file
+  opt.wal_dir = wal_path_;
+
+  RunDbBench(kOptionsFileName);
+
+  VerifyOptions(SanitizeOptions(db_path_, opt));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+  return RUN_ALL_TESTS();
+}
+
+#else
+
+int main(int argc, char** argv) {
+  printf("Skip db_bench_tool_test as the required library GFLAG is missing.");
+}
+#endif  // #ifdef GFLAGS
diff --git a/src/rocksdb/tools/db_crashtest.py b/src/rocksdb/tools/db_crashtest.py
new file mode 100644
index 000000000..7035908cb
--- /dev/null
+++ b/src/rocksdb/tools/db_crashtest.py
@@ -0,0 +1,1016 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+
+import os
+import random
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+
+# params overwrite priority:
+#   for default:
+#       default_params < {blackbox,whitebox}_default_params < args
+#   for simple:
+#       default_params < {blackbox,whitebox}_default_params <
+#       simple_default_params <
+#       {blackbox,whitebox}_simple_default_params < args
+#   for cf_consistency:
+#       default_params < {blackbox,whitebox}_default_params <
+#       cf_consistency_params < args
+#   for txn:
+#       default_params < {blackbox,whitebox}_default_params < txn_params < args
+#   for ts:
+#       default_params < {blackbox,whitebox}_default_params < ts_params < args
+#   for multiops_txn:
+#       default_params < {blackbox,whitebox}_default_params < multiops_txn_params < args
+
+
+default_params = {
+    "acquire_snapshot_one_in": 10000,
+    "backup_max_size": 100 * 1024 * 1024,
+    # Consider larger number when backups considered more stable
+    "backup_one_in": 100000,
+    "batch_protection_bytes_per_key": lambda: random.choice([0, 8]),
+    "memtable_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]),
+    "block_size": 16384,
+    "bloom_bits": lambda: random.choice(
+        [random.randint(0, 19), random.lognormvariate(2.3, 1.3)]
+    ),
+    "cache_index_and_filter_blocks": lambda: random.randint(0, 1),
+    "cache_size": 8388608,
+    "charge_compression_dictionary_building_buffer": lambda: random.choice([0, 1]),
+    "charge_filter_construction": lambda: random.choice([0, 1]),
+    "charge_table_reader": lambda: random.choice([0, 1]),
+    "charge_file_metadata": lambda: random.choice([0, 1]),
+    "checkpoint_one_in": 1000000,
+    "compression_type": lambda: random.choice(
+        ["none", "snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]
+    ),
+    "bottommost_compression_type": lambda: "disable"
+    if random.randint(0, 1) == 0
+    else random.choice(["none", "snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]),
+    "checksum_type": lambda: random.choice(
+        ["kCRC32c", "kxxHash", "kxxHash64", "kXXH3"]
+    ),
+    "compression_max_dict_bytes": lambda: 16384 * random.randint(0, 1),
+    "compression_zstd_max_train_bytes": lambda: 65536 * random.randint(0, 1),
+    # Disabled compression_parallel_threads as the feature is not stable
+    # lambda: random.choice([1] * 9 + [4])
+    "compression_parallel_threads": 1,
+    "compression_max_dict_buffer_bytes": lambda: (1 << random.randint(0, 40)) - 1,
+    "compression_use_zstd_dict_trainer": lambda: random.randint(0, 1),
+    "clear_column_family_one_in": 0,
+    "compact_files_one_in": 1000000,
+    "compact_range_one_in": 1000000,
+    "compaction_pri": random.randint(0, 4),
+    "data_block_index_type": lambda: random.choice([0, 1]),
+    "delpercent": 4,
+    "delrangepercent": 1,
+    "destroy_db_initially": 0,
+    "enable_pipelined_write": lambda: random.randint(0, 1),
+    "enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]),
+    "expected_values_dir": lambda: setup_expected_values_dir(),
+    "fail_if_options_file_error": lambda: random.randint(0, 1),
+    "flush_one_in": 1000000,
+    "manual_wal_flush_one_in": lambda: random.choice([0, 0, 1000, 1000000]),
+    "file_checksum_impl": lambda: random.choice(["none", "crc32c", "xxh64", "big"]),
+    "get_live_files_one_in": 1000000,
+    # Note: the following two are intentionally disabled as the corresponding
+    # APIs are not guaranteed to succeed.
+    "get_sorted_wal_files_one_in": 0,
+    "get_current_wal_file_one_in": 0,
+    # Temporarily disable hash index
+    "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]),
+    "ingest_external_file_one_in": 1000000,
+    "iterpercent": 10,
+    "mark_for_compaction_one_file_in": lambda: 10 * random.randint(0, 1),
+    "max_background_compactions": 20,
+    "max_bytes_for_level_base": 10485760,
+    "max_key": 25000000,
+    "max_write_buffer_number": 3,
+    "mmap_read": lambda: random.randint(0, 1),
+    # Setting `nooverwritepercent > 0` is only possible because we do not vary
+    # the random seed, so the same keys are chosen by every run for disallowing
+    # overwrites.
+    "nooverwritepercent": 1,
+    "open_files": lambda: random.choice([-1, -1, 100, 500000]),
+    "optimize_filters_for_memory": lambda: random.randint(0, 1),
+    "partition_filters": lambda: random.randint(0, 1),
+    "partition_pinning": lambda: random.randint(0, 3),
+    "pause_background_one_in": 1000000,
+    "prefix_size": lambda: random.choice([-1, 1, 5, 7, 8]),
+    "prefixpercent": 5,
+    "progress_reports": 0,
+    "readpercent": 45,
+    "recycle_log_file_num": lambda: random.randint(0, 1),
+    "snapshot_hold_ops": 100000,
+    "sst_file_manager_bytes_per_sec": lambda: random.choice([0, 104857600]),
+    "sst_file_manager_bytes_per_truncate": lambda: random.choice([0, 1048576]),
+    "long_running_snapshots": lambda: random.randint(0, 1),
+    "subcompactions": lambda: random.randint(1, 4),
+    "target_file_size_base": 2097152,
+    "target_file_size_multiplier": 2,
+    "test_batches_snapshots": random.randint(0, 1),
+    "top_level_index_pinning": lambda: random.randint(0, 3),
+    "unpartitioned_pinning": lambda: random.randint(0, 3),
+    "use_direct_reads": lambda: random.randint(0, 1),
+    "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
+    "mock_direct_io": False,
+    "cache_type": lambda: random.choice(["lru_cache", "hyper_clock_cache"]),
+    "use_full_merge_v1": lambda: random.randint(0, 1),
+    "use_merge": lambda: random.randint(0, 1),
+    # use_put_entity_one_in has to be the same across invocations for verification to work, hence no lambda
+    "use_put_entity_one_in": random.choice([0] * 7 + [1, 5, 10]),
+    # 999 -> use Bloom API
+    "ribbon_starting_level": lambda: random.choice([random.randint(-1, 10), 999]),
+    "value_size_mult": 32,
+    "verify_checksum": 1,
+    "write_buffer_size": 4 * 1024 * 1024,
+    "writepercent": 35,
+    "format_version": lambda: random.choice([2, 3, 4, 5, 5]),
+    "index_block_restart_interval": lambda: random.choice(range(1, 16)),
+    "use_multiget": lambda: random.randint(0, 1),
+    "periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
+    # 0 = never (used by some), 10 = often (for threading bugs), 600 = default
+    "stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
+    "compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
+    # Test small max_manifest_file_size in a smaller chance, as most of the
+    # time we wnat manifest history to be preserved to help debug
+    "max_manifest_file_size": lambda: random.choice(
+        [t * 16384 if t < 3 else 1024 * 1024 * 1024 for t in range(1, 30)]
+    ),
+    # Sync mode might make test runs slower so running it in a smaller chance
+    "sync": lambda: random.choice([1 if t == 0 else 0 for t in range(0, 20)]),
+    "bytes_per_sync": lambda: random.choice([0, 262144]),
+    "wal_bytes_per_sync": lambda: random.choice([0, 524288]),
+    # Disable compaction_readahead_size because the test is not passing.
+    # "compaction_readahead_size" : lambda : random.choice(
+    #    [0, 0, 1024 * 1024]),
+    "db_write_buffer_size": lambda: random.choice(
+        [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024]
+    ),
+    "avoid_unnecessary_blocking_io": random.randint(0, 1),
+    "write_dbid_to_manifest": random.randint(0, 1),
+    "avoid_flush_during_recovery": lambda: random.choice(
+        [1 if t == 0 else 0 for t in range(0, 8)]
+    ),
+    "max_write_batch_group_size_bytes": lambda: random.choice(
+        [16, 64, 1024 * 1024, 16 * 1024 * 1024]
+    ),
+    "level_compaction_dynamic_level_bytes": True,
+    "verify_checksum_one_in": 1000000,
+    "verify_db_one_in": 100000,
+    "continuous_verification_interval": 0,
+    "max_key_len": 3,
+    "key_len_percent_dist": "1,30,69",
+    "read_fault_one_in": lambda: random.choice([0, 32, 1000]),
+    "open_metadata_write_fault_one_in": lambda: random.choice([0, 0, 8]),
+    "open_write_fault_one_in": lambda: random.choice([0, 0, 16]),
+    "open_read_fault_one_in": lambda: random.choice([0, 0, 32]),
+    "sync_fault_injection": lambda: random.randint(0, 1),
+    "get_property_one_in": 1000000,
+    "paranoid_file_checks": lambda: random.choice([0, 1, 1, 1]),
+    "max_write_buffer_size_to_maintain": lambda: random.choice(
+        [0, 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024]
+    ),
+    "user_timestamp_size": 0,
+    "secondary_cache_fault_one_in": lambda: random.choice([0, 0, 32]),
+    "prepopulate_block_cache": lambda: random.choice([0, 1]),
+    "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]),
+    "memtable_whole_key_filtering": lambda: random.randint(0, 1),
+    "detect_filter_construct_corruption": lambda: random.choice([0, 1]),
+    "adaptive_readahead": lambda: random.choice([0, 1]),
+    "async_io": lambda: random.choice([0, 1]),
+    "wal_compression": lambda: random.choice(["none", "zstd"]),
+    "verify_sst_unique_id_in_manifest": 1,  # always do unique_id verification
+    "secondary_cache_uri": lambda: random.choice(
+        [
+            "",
+            "compressed_secondary_cache://capacity=8388608",
+            "compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true",
+        ]
+    ),
+    "allow_data_in_errors": True,
+    "readahead_size": lambda: random.choice([0, 16384, 524288]),
+    "initial_auto_readahead_size": lambda: random.choice([0, 16384, 524288]),
+    "max_auto_readahead_size": lambda: random.choice([0, 16384, 524288]),
+    "num_file_reads_for_auto_readahead": lambda: random.choice([0, 1, 2]),
+    "min_write_buffer_number_to_merge": lambda: random.choice([1, 2]),
+    "preserve_internal_time_seconds": lambda: random.choice([0, 60, 3600, 36000]),
+}
+
+_TEST_DIR_ENV_VAR = "TEST_TMPDIR"
+_DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL"
+
+stress_cmd = "./db_stress"
+cleanup_cmd = None
+
+
+def is_release_mode():
+    return os.environ.get(_DEBUG_LEVEL_ENV_VAR) == "0"
+
+
+def get_dbname(test_name):
+    test_dir_name = "rocksdb_crashtest_" + test_name
+    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
+    if test_tmpdir is None or test_tmpdir == "":
+        dbname = tempfile.mkdtemp(prefix=test_dir_name)
+    else:
+        dbname = test_tmpdir + "/" + test_dir_name
+        shutil.rmtree(dbname, True)
+        if cleanup_cmd is not None:
+            print("Running DB cleanup command - %s\n" % cleanup_cmd)
+            # Ignore failure
+            os.system(cleanup_cmd)
+        os.mkdir(dbname)
+    return dbname
+
+
+expected_values_dir = None
+
+
+def setup_expected_values_dir():
+    global expected_values_dir
+    if expected_values_dir is not None:
+        return expected_values_dir
+    expected_dir_prefix = "rocksdb_crashtest_expected_"
+    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
+    if test_tmpdir is None or test_tmpdir == "":
+        expected_values_dir = tempfile.mkdtemp(prefix=expected_dir_prefix)
+    else:
+        # if tmpdir is specified, store the expected_values_dir under that dir
+        expected_values_dir = test_tmpdir + "/rocksdb_crashtest_expected"
+        if os.path.exists(expected_values_dir):
+            shutil.rmtree(expected_values_dir)
+        os.mkdir(expected_values_dir)
+    return expected_values_dir
+
+
+multiops_txn_key_spaces_file = None
+
+
+def setup_multiops_txn_key_spaces_file():
+    global multiops_txn_key_spaces_file
+    if multiops_txn_key_spaces_file is not None:
+        return multiops_txn_key_spaces_file
+    key_spaces_file_prefix = "rocksdb_crashtest_multiops_txn_key_spaces"
+    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
+    if test_tmpdir is None or test_tmpdir == "":
+        multiops_txn_key_spaces_file = tempfile.mkstemp(prefix=key_spaces_file_prefix)[
+            1
+        ]
+    else:
+        if not os.path.exists(test_tmpdir):
+            os.mkdir(test_tmpdir)
+        multiops_txn_key_spaces_file = tempfile.mkstemp(
+            prefix=key_spaces_file_prefix, dir=test_tmpdir
+        )[1]
+    return multiops_txn_key_spaces_file
+
+
+def is_direct_io_supported(dbname):
+    with tempfile.NamedTemporaryFile(dir=dbname) as f:
+        try:
+            os.open(f.name, os.O_DIRECT)
+        except BaseException:
+            return False
+        return True
+
+
+blackbox_default_params = {
+    "disable_wal": lambda: random.choice([0, 0, 0, 1]),
+    # total time for this script to test db_stress
+    "duration": 6000,
+    # time for one db_stress instance to run
+    "interval": 120,
+    # since we will be killing anyway, use large value for ops_per_thread
+    "ops_per_thread": 100000000,
+    "reopen": 0,
+    "set_options_one_in": 10000,
+}
+
+whitebox_default_params = {
+    # TODO: enable this once we figure out how to adjust kill odds for WAL-
+    # disabled runs, and either (1) separate full `db_stress` runs out of
+    # whitebox crash or (2) support verification at end of `db_stress` runs
+    # that ran with WAL disabled.
+    "disable_wal": 0,
+    "duration": 10000,
+    "log2_keys_per_lock": 10,
+    "ops_per_thread": 200000,
+    "random_kill_odd": 888887,
+    "reopen": 20,
+}
+
+simple_default_params = {
+    "allow_concurrent_memtable_write": lambda: random.randint(0, 1),
+    "column_families": 1,
+    # TODO: re-enable once internal task T124324915 is fixed.
+    # "experimental_mempurge_threshold": lambda: 10.0*random.random(),
+    "max_background_compactions": 1,
+    "max_bytes_for_level_base": 67108864,
+    "memtablerep": "skip_list",
+    "target_file_size_base": 16777216,
+    "target_file_size_multiplier": 1,
+    "test_batches_snapshots": 0,
+    "write_buffer_size": 32 * 1024 * 1024,
+    "level_compaction_dynamic_level_bytes": False,
+    "paranoid_file_checks": lambda: random.choice([0, 1, 1, 1]),
+    "verify_iterator_with_expected_state_one_in": 5,  # this locks a range of keys
+}
+
+blackbox_simple_default_params = {
+    "open_files": -1,
+    "set_options_one_in": 0,
+}
+
+whitebox_simple_default_params = {}
+
+cf_consistency_params = {
+    "disable_wal": lambda: random.randint(0, 1),
+    "reopen": 0,
+    "test_cf_consistency": 1,
+    # use small value for write_buffer_size so that RocksDB triggers flush
+    # more frequently
+    "write_buffer_size": 1024 * 1024,
+    "enable_pipelined_write": lambda: random.randint(0, 1),
+    # Snapshots are used heavily in this test mode, while they are incompatible
+    # with compaction filter.
+    "enable_compaction_filter": 0,
+    # `CfConsistencyStressTest::TestIngestExternalFile()` is not implemented.
+    "ingest_external_file_one_in": 0,
+}
+
+txn_params = {
+    "use_txn": 1,
+    # Avoid lambda to set it once for the entire test
+    "txn_write_policy": random.randint(0, 2),
+    "unordered_write": random.randint(0, 1),
+    # TODO: there is such a thing as transactions with WAL disabled. We should
+    # cover that case.
+    "disable_wal": 0,
+    # OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns
+    "checkpoint_one_in": 0,
+    # pipeline write is not currnetly compatible with WritePrepared txns
+    "enable_pipelined_write": 0,
+    "create_timestamped_snapshot_one_in": random.choice([0, 20]),
+    # PutEntity in transactions is not yet implemented
+    "use_put_entity_one_in" : 0,
+}
+
+best_efforts_recovery_params = {
+    "best_efforts_recovery": 1,
+    "atomic_flush": 0,
+    "disable_wal": 1,
+    "column_families": 1,
+}
+
+blob_params = {
+    "allow_setting_blob_options_dynamically": 1,
+    # Enable blob files and GC with a 75% chance initially; note that they might still be
+    # enabled/disabled during the test via SetOptions
+    "enable_blob_files": lambda: random.choice([0] + [1] * 3),
+    "min_blob_size": lambda: random.choice([0, 8, 16]),
+    "blob_file_size": lambda: random.choice([1048576, 16777216, 268435456, 1073741824]),
+    "blob_compression_type": lambda: random.choice(["none", "snappy", "lz4", "zstd"]),
+    "enable_blob_garbage_collection": lambda: random.choice([0] + [1] * 3),
+    "blob_garbage_collection_age_cutoff": lambda: random.choice(
+        [0.0, 0.25, 0.5, 0.75, 1.0]
+    ),
+    "blob_garbage_collection_force_threshold": lambda: random.choice([0.5, 0.75, 1.0]),
+    "blob_compaction_readahead_size": lambda: random.choice([0, 1048576, 4194304]),
+    "blob_file_starting_level": lambda: random.choice(
+        [0] * 4 + [1] * 3 + [2] * 2 + [3]
+    ),
+    "use_blob_cache": lambda: random.randint(0, 1),
+    "use_shared_block_and_blob_cache": lambda: random.randint(0, 1),
+    "blob_cache_size": lambda: random.choice([1048576, 2097152, 4194304, 8388608]),
+    "prepopulate_blob_cache": lambda: random.randint(0, 1),
+}
+
+ts_params = {
+    "test_cf_consistency": 0,
+    "test_batches_snapshots": 0,
+    "user_timestamp_size": 8,
+    "use_merge": 0,
+    "use_full_merge_v1": 0,
+    "use_txn": 0,
+    "enable_blob_files": 0,
+    "use_blob_db": 0,
+    "ingest_external_file_one_in": 0,
+    # PutEntity with timestamps is not yet implemented
+    "use_put_entity_one_in" : 0,
+}
+
+tiered_params = {
+    "enable_tiered_storage": 1,
+    # Set tiered compaction hot data time as: 1 minute, 1 hour, 10 hour
+    "preclude_last_level_data_seconds": lambda: random.choice([60, 3600, 36000]),
+    # only test universal compaction for now, level has known issue of
+    # endless compaction
+    "compaction_style": 1,
+    # tiered storage doesn't support blob db yet
+    "enable_blob_files": 0,
+    "use_blob_db": 0,
+}
+
+multiops_txn_default_params = {
+    "test_cf_consistency": 0,
+    "test_batches_snapshots": 0,
+    "test_multi_ops_txns": 1,
+    "use_txn": 1,
+    "two_write_queues": lambda: random.choice([0, 1]),
+    # TODO: enable write-prepared
+    "disable_wal": 0,
+    "use_only_the_last_commit_time_batch_for_recovery": lambda: random.choice([0, 1]),
+    "clear_column_family_one_in": 0,
+    "column_families": 1,
+    "enable_pipelined_write": lambda: random.choice([0, 1]),
+    # This test already acquires snapshots in reads
+    "acquire_snapshot_one_in": 0,
+    "backup_one_in": 0,
+    "writepercent": 0,
+    "delpercent": 0,
+    "delrangepercent": 0,
+    "customopspercent": 80,
+    "readpercent": 5,
+    "iterpercent": 15,
+    "prefixpercent": 0,
+    "verify_db_one_in": 1000,
+    "continuous_verification_interval": 1000,
+    "delay_snapshot_read_one_in": 3,
+    # 65536 is the smallest possible value for write_buffer_size. Smaller
+    # values will be sanitized to 65536 during db open. SetOptions currently
+    # does not sanitize options, but very small write_buffer_size may cause
+    # assertion failure in
+    # https://github.com/facebook/rocksdb/blob/7.0.fb/db/memtable.cc#L117.
+    "write_buffer_size": 65536,
+    # flush more frequently to generate more files, thus trigger more
+    # compactions.
+    "flush_one_in": 1000,
+    "key_spaces_path": setup_multiops_txn_key_spaces_file(),
+    "rollback_one_in": 4,
+    # Re-enable once we have a compaction for MultiOpsTxnStressTest
+    "enable_compaction_filter": 0,
+    "create_timestamped_snapshot_one_in": 50,
+    "sync_fault_injection": 0,
+    # PutEntity in transactions is not yet implemented
+    "use_put_entity_one_in" : 0,
+}
+
+multiops_wc_txn_params = {
+    "txn_write_policy": 0,
+    # TODO re-enable pipelined write. Not well tested atm
+    "enable_pipelined_write": 0,
+}
+
+multiops_wp_txn_params = {
+    "txn_write_policy": 1,
+    "wp_snapshot_cache_bits": 1,
+    # try small wp_commit_cache_bits, e.g. 0 once we explore storing full
+    # commit sequence numbers in commit cache
+    "wp_commit_cache_bits": 10,
+    # pipeline write is not currnetly compatible with WritePrepared txns
+    "enable_pipelined_write": 0,
+    # OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns
+    "checkpoint_one_in": 0,
+    # Required to be 1 in order to use commit-time-batch
+    "use_only_the_last_commit_time_batch_for_recovery": 1,
+    "clear_wp_commit_cache_one_in": 10,
+    "create_timestamped_snapshot_one_in": 0,
+}
+
+
+def finalize_and_sanitize(src_params):
+    dest_params = {k: v() if callable(v) else v for (k, v) in src_params.items()}
+    if is_release_mode():
+        dest_params["read_fault_one_in"] = 0
+    if dest_params.get("compression_max_dict_bytes") == 0:
+        dest_params["compression_zstd_max_train_bytes"] = 0
+        dest_params["compression_max_dict_buffer_bytes"] = 0
+    if dest_params.get("compression_type") != "zstd":
+        dest_params["compression_zstd_max_train_bytes"] = 0
+    if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
+        dest_params["memtablerep"] = "skip_list"
+    if dest_params["mmap_read"] == 1:
+        dest_params["use_direct_io_for_flush_and_compaction"] = 0
+        dest_params["use_direct_reads"] = 0
+        if dest_params["file_checksum_impl"] != "none":
+            # TODO(T109283569): there is a bug in `GenerateOneFileChecksum()`,
+            # used by `IngestExternalFile()`, causing it to fail with mmap
+            # reads. Remove this once it is fixed.
+            dest_params["ingest_external_file_one_in"] = 0
+    if (
+        dest_params["use_direct_io_for_flush_and_compaction"] == 1
+        or dest_params["use_direct_reads"] == 1
+    ) and not is_direct_io_supported(dest_params["db"]):
+        if is_release_mode():
+            print(
+                "{} does not support direct IO. Disabling use_direct_reads and "
+                "use_direct_io_for_flush_and_compaction.\n".format(dest_params["db"])
+            )
+            dest_params["use_direct_reads"] = 0
+            dest_params["use_direct_io_for_flush_and_compaction"] = 0
+        else:
+            dest_params["mock_direct_io"] = True
+
+    if dest_params["test_batches_snapshots"] == 1:
+        dest_params["enable_compaction_filter"] = 0
+        if dest_params["prefix_size"] < 0:
+            dest_params["prefix_size"] = 1
+
+    # Multi-key operations are not currently compatible with transactions or
+    # timestamp.
+    if (dest_params.get("test_batches_snapshots") == 1 or
+        dest_params.get("use_txn") == 1 or
+        dest_params.get("user_timestamp_size") > 0):
+        dest_params["ingest_external_file_one_in"] = 0
+    if (dest_params.get("test_batches_snapshots") == 1 or
+        dest_params.get("use_txn") == 1):
+        dest_params["delpercent"] += dest_params["delrangepercent"]
+        dest_params["delrangepercent"] = 0
+    if (
+        dest_params.get("disable_wal") == 1
+        or dest_params.get("sync_fault_injection") == 1
+        or dest_params.get("manual_wal_flush_one_in") > 0
+    ):
+        # File ingestion does not guarantee prefix-recoverability when unsynced
+        # data can be lost. Ingesting a file syncs data immediately that is
+        # newer than unsynced memtable data that can be lost on restart.
+        #
+        # Even if the above issue is fixed or worked around, our
+        # trace-and-replay does not trace file ingestion, so in its current form
+        # it would not recover the expected state to the correct point in time.
+        dest_params["ingest_external_file_one_in"] = 0
+        # The `DbStressCompactionFilter` can apply memtable updates to SST
+        # files, which would be problematic when unsynced data can be lost in
+        # crash recoveries.
+        dest_params["enable_compaction_filter"] = 0
+    # Only under WritePrepared txns, unordered_write would provide the same guarnatees as vanilla rocksdb
+    if dest_params.get("unordered_write", 0) == 1:
+        dest_params["txn_write_policy"] = 1
+        dest_params["allow_concurrent_memtable_write"] = 1
+    if dest_params.get("disable_wal", 0) == 1:
+        dest_params["atomic_flush"] = 1
+        dest_params["sync"] = 0
+        dest_params["write_fault_one_in"] = 0
+    if dest_params.get("open_files", 1) != -1:
+        # Compaction TTL and periodic compactions are only compatible
+        # with open_files = -1
+        dest_params["compaction_ttl"] = 0
+        dest_params["periodic_compaction_seconds"] = 0
+    if dest_params.get("compaction_style", 0) == 2:
+        # Disable compaction TTL in FIFO compaction, because right
+        # now assertion failures are triggered.
+        dest_params["compaction_ttl"] = 0
+        dest_params["periodic_compaction_seconds"] = 0
+    if dest_params["partition_filters"] == 1:
+        if dest_params["index_type"] != 2:
+            dest_params["partition_filters"] = 0
+    if dest_params.get("atomic_flush", 0) == 1:
+        # disable pipelined write when atomic flush is used.
+        dest_params["enable_pipelined_write"] = 0
+    if dest_params.get("sst_file_manager_bytes_per_sec", 0) == 0:
+        dest_params["sst_file_manager_bytes_per_truncate"] = 0
+    if dest_params.get("enable_compaction_filter", 0) == 1:
+        # Compaction filter is incompatible with snapshots. Need to avoid taking
+        # snapshots, as well as avoid operations that use snapshots for
+        # verification.
+        dest_params["acquire_snapshot_one_in"] = 0
+        dest_params["compact_range_one_in"] = 0
+        # Give the iterator ops away to reads.
+        dest_params["readpercent"] += dest_params.get("iterpercent", 10)
+        dest_params["iterpercent"] = 0
+    if dest_params.get("prefix_size") == -1:
+        dest_params["readpercent"] += dest_params.get("prefixpercent", 20)
+        dest_params["prefixpercent"] = 0
+    if (
+        dest_params.get("prefix_size") == -1
+        and dest_params.get("memtable_whole_key_filtering") == 0
+    ):
+        dest_params["memtable_prefix_bloom_size_ratio"] = 0
+    if dest_params.get("two_write_queues") == 1:
+        dest_params["enable_pipelined_write"] = 0
+    if dest_params.get("best_efforts_recovery") == 1:
+        dest_params["disable_wal"] = 1
+        dest_params["atomic_flush"] = 0
+        dest_params["enable_compaction_filter"] = 0
+        dest_params["sync"] = 0
+        dest_params["write_fault_one_in"] = 0
+    if dest_params["secondary_cache_uri"] != "":
+        # Currently the only cache type compatible with a secondary cache is LRUCache
+        dest_params["cache_type"] = "lru_cache"
+    # Remove the following once write-prepared/write-unprepared with/without
+    # unordered write supports timestamped snapshots
+    if dest_params.get("create_timestamped_snapshot_one_in", 0) > 0:
+        dest_params["txn_write_policy"] = 0
+        dest_params["unordered_write"] = 0
+    # For TransactionDB, correctness testing with unsync data loss is currently
+    # compatible with only write committed policy
+    if (dest_params.get("use_txn") == 1 and dest_params.get("txn_write_policy") != 0):
+        dest_params["sync_fault_injection"] = 0
+        dest_params["manual_wal_flush_one_in"] = 0
+    # PutEntity is currently not supported by SstFileWriter or in conjunction with Merge
+    if dest_params["use_put_entity_one_in"] != 0:
+        dest_params["ingest_external_file_one_in"] = 0
+        dest_params["use_merge"] = 0
+        dest_params["use_full_merge_v1"] = 0
+
+    return dest_params
+
+
+def gen_cmd_params(args):
+    params = {}
+
+    params.update(default_params)
+    if args.test_type == "blackbox":
+        params.update(blackbox_default_params)
+    if args.test_type == "whitebox":
+        params.update(whitebox_default_params)
+    if args.simple:
+        params.update(simple_default_params)
+        if args.test_type == "blackbox":
+            params.update(blackbox_simple_default_params)
+        if args.test_type == "whitebox":
+            params.update(whitebox_simple_default_params)
+    if args.cf_consistency:
+        params.update(cf_consistency_params)
+    if args.txn:
+        params.update(txn_params)
+    if args.test_best_efforts_recovery:
+        params.update(best_efforts_recovery_params)
+    if args.enable_ts:
+        params.update(ts_params)
+    if args.test_multiops_txn:
+        params.update(multiops_txn_default_params)
+        if args.write_policy == "write_committed":
+            params.update(multiops_wc_txn_params)
+        elif args.write_policy == "write_prepared":
+            params.update(multiops_wp_txn_params)
+    if args.test_tiered_storage:
+        params.update(tiered_params)
+
+    # Best-effort recovery, user defined timestamp, tiered storage are currently
+    # incompatible with BlobDB. Test BE recovery if specified on the command
+    # line; otherwise, apply BlobDB related overrides with a 10% chance.
+    if (
+        not args.test_best_efforts_recovery
+        and not args.enable_ts
+        and not args.test_tiered_storage
+        and random.choice([0] * 9 + [1]) == 1
+    ):
+        params.update(blob_params)
+
+    for k, v in vars(args).items():
+        if v is not None:
+            params[k] = v
+    return params
+
+
+def gen_cmd(params, unknown_params):
+    finalzied_params = finalize_and_sanitize(params)
+    cmd = (
+        [stress_cmd]
+        + [
+            "--{0}={1}".format(k, v)
+            for k, v in [(k, finalzied_params[k]) for k in sorted(finalzied_params)]
+            if k
+            not in {
+                "test_type",
+                "simple",
+                "duration",
+                "interval",
+                "random_kill_odd",
+                "cf_consistency",
+                "txn",
+                "test_best_efforts_recovery",
+                "enable_ts",
+                "test_multiops_txn",
+                "write_policy",
+                "stress_cmd",
+                "test_tiered_storage",
+                "cleanup_cmd",
+            }
+            and v is not None
+        ]
+        + unknown_params
+    )
+    return cmd
+
+
+def execute_cmd(cmd, timeout):
+    child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+    print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd)))
+
+    try:
+        outs, errs = child.communicate(timeout=timeout)
+        hit_timeout = False
+        print("WARNING: db_stress ended before kill: exitcode=%d\n" % child.returncode)
+    except subprocess.TimeoutExpired:
+        hit_timeout = True
+        child.kill()
+        print("KILLED %d\n" % child.pid)
+        outs, errs = child.communicate()
+
+    return hit_timeout, child.returncode, outs.decode("utf-8"), errs.decode("utf-8")
+
+
+# This script runs and kills db_stress multiple times. It checks consistency
+# in case of unsafe crashes in RocksDB.
+def blackbox_crash_main(args, unknown_args):
+    cmd_params = gen_cmd_params(args)
+    dbname = get_dbname("blackbox")
+    exit_time = time.time() + cmd_params["duration"]
+
+    print(
+        "Running blackbox-crash-test with \n"
+        + "interval_between_crash="
+        + str(cmd_params["interval"])
+        + "\n"
+        + "total-duration="
+        + str(cmd_params["duration"])
+        + "\n"
+    )
+
+    while time.time() < exit_time:
+        cmd = gen_cmd(
+            dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args
+        )
+
+        hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["interval"])
+
+        if not hit_timeout:
+            print("Exit Before Killing")
+            print("stdout:")
+            print(outs)
+            print("stderr:")
+            print(errs)
+            sys.exit(2)
+
+        for line in errs.split("\n"):
+            if line != "" and not line.startswith("WARNING"):
+                print("stderr has error message:")
+                print("***" + line + "***")
+
+        time.sleep(1)  # time to stabilize before the next run
+
+        time.sleep(1)  # time to stabilize before the next run
+
+    # we need to clean up after ourselves -- only do this on test success
+    shutil.rmtree(dbname, True)
+
+
+# This python script runs db_stress multiple times. Some runs with
+# kill_random_test that causes rocksdb to crash at various points in code.
+def whitebox_crash_main(args, unknown_args):
+    cmd_params = gen_cmd_params(args)
+    dbname = get_dbname("whitebox")
+
+    cur_time = time.time()
+    exit_time = cur_time + cmd_params["duration"]
+    half_time = cur_time + cmd_params["duration"] // 2
+
+    print(
+        "Running whitebox-crash-test with \n"
+        + "total-duration="
+        + str(cmd_params["duration"])
+        + "\n"
+    )
+
+    total_check_mode = 4
+    check_mode = 0
+    kill_random_test = cmd_params["random_kill_odd"]
+    kill_mode = 0
+    prev_compaction_style = -1
+    while time.time() < exit_time:
+        if check_mode == 0:
+            additional_opts = {
+                # use large ops per thread since we will kill it anyway
+                "ops_per_thread": 100
+                * cmd_params["ops_per_thread"],
+            }
+            # run with kill_random_test, with three modes.
+            # Mode 0 covers all kill points. Mode 1 covers less kill points but
+            # increases change of triggering them. Mode 2 covers even less
+            # frequent kill points and further increases triggering change.
+            if kill_mode == 0:
+                additional_opts.update(
+                    {
+                        "kill_random_test": kill_random_test,
+                    }
+                )
+            elif kill_mode == 1:
+                if cmd_params.get("disable_wal", 0) == 1:
+                    my_kill_odd = kill_random_test // 50 + 1
+                else:
+                    my_kill_odd = kill_random_test // 10 + 1
+                additional_opts.update(
+                    {
+                        "kill_random_test": my_kill_odd,
+                        "kill_exclude_prefixes": "WritableFileWriter::Append,"
+                        + "WritableFileWriter::WriteBuffered",
+                    }
+                )
+            elif kill_mode == 2:
+                # TODO: May need to adjust random odds if kill_random_test
+                # is too small.
+                additional_opts.update(
+                    {
+                        "kill_random_test": (kill_random_test // 5000 + 1),
+                        "kill_exclude_prefixes": "WritableFileWriter::Append,"
+                        "WritableFileWriter::WriteBuffered,"
+                        "PosixMmapFile::Allocate,WritableFileWriter::Flush",
+                    }
+                )
+            # Run kill mode 0, 1 and 2 by turn.
+            kill_mode = (kill_mode + 1) % 3
+        elif check_mode == 1:
+            # normal run with universal compaction mode
+            additional_opts = {
+                "kill_random_test": None,
+                "ops_per_thread": cmd_params["ops_per_thread"],
+                "compaction_style": 1,
+            }
+            # Single level universal has a lot of special logic. Ensure we cover
+            # it sometimes.
+            if random.randint(0, 1) == 1:
+                additional_opts.update(
+                    {
+                        "num_levels": 1,
+                    }
+                )
+        elif check_mode == 2:
+            # normal run with FIFO compaction mode
+            # ops_per_thread is divided by 5 because FIFO compaction
+            # style is quite a bit slower on reads with lot of files
+            additional_opts = {
+                "kill_random_test": None,
+                "ops_per_thread": cmd_params["ops_per_thread"] // 5,
+                "compaction_style": 2,
+            }
+        else:
+            # normal run
+            additional_opts = {
+                "kill_random_test": None,
+                "ops_per_thread": cmd_params["ops_per_thread"],
+            }
+
+        cur_compaction_style = additional_opts.get("compaction_style", cmd_params.get("compaction_style", 0))
+        if prev_compaction_style != -1 and prev_compaction_style != cur_compaction_style:
+            print("`compaction_style` is changed in current run so `destroy_db_initially` is set to 1 as a short-term solution to avoid cycling through previous db of different compaction style." + "\n")
+            additional_opts["destroy_db_initially"] = 1
+        prev_compaction_style = cur_compaction_style
+
+        cmd = gen_cmd(
+            dict(
+                list(cmd_params.items())
+                + list(additional_opts.items())
+                + list({"db": dbname}.items())
+            ),
+            unknown_args,
+        )
+
+        print(
+            "Running:" + " ".join(cmd) + "\n"
+        )  # noqa: E999 T25377293 Grandfathered in
+
+        # If the running time is 15 minutes over the run time, explicit kill and
+        # exit even if white box kill didn't hit. This is to guarantee run time
+        # limit, as if it runs as a job, running too long will create problems
+        # for job scheduling or execution.
+        # TODO detect a hanging condition. The job might run too long as RocksDB
+        # hits a hanging bug.
+        hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd(
+            cmd, exit_time - time.time() + 900
+        )
+        msg = "check_mode={0}, kill option={1}, exitcode={2}\n".format(
+            check_mode, additional_opts["kill_random_test"], retncode
+        )
+
+        print(msg)
+        print(stdoutdata)
+        print(stderrdata)
+
+        if hit_timeout:
+            print("Killing the run for running too long")
+            break
+
+        expected = False
+        if additional_opts["kill_random_test"] is None and (retncode == 0):
+            # we expect zero retncode if no kill option
+            expected = True
+        elif additional_opts["kill_random_test"] is not None and retncode <= 0:
+            # When kill option is given, the test MIGHT kill itself.
+            # If it does, negative retncode is expected. Otherwise 0.
+            expected = True
+
+        if not expected:
+            print("TEST FAILED. See kill option and exit code above!!!\n")
+            sys.exit(1)
+
+        stderrdata = stderrdata.lower()
+        errorcount = stderrdata.count("error") - stderrdata.count("got errors 0 times")
+        print("#times error occurred in output is " + str(errorcount) + "\n")
+
+        if errorcount > 0:
+            print("TEST FAILED. Output has 'error'!!!\n")
+            sys.exit(2)
+        if stderrdata.find("fail") >= 0:
+            print("TEST FAILED. Output has 'fail'!!!\n")
+            sys.exit(2)
+
+        # First half of the duration, keep doing kill test. For the next half,
+        # try different modes.
+        if time.time() > half_time:
+            # we need to clean up after ourselves -- only do this on test
+            # success
+            shutil.rmtree(dbname, True)
+            if cleanup_cmd is not None:
+                print("Running DB cleanup command - %s\n" % cleanup_cmd)
+                ret = os.system(cleanup_cmd)
+                if ret != 0:
+                    print("TEST FAILED. DB cleanup returned error %d\n" % ret)
+                    sys.exit(1)
+            os.mkdir(dbname)
+            if (expected_values_dir is not None):
+                shutil.rmtree(expected_values_dir, True)
+                os.mkdir(expected_values_dir)
+
+            check_mode = (check_mode + 1) % total_check_mode
+
+        time.sleep(1)  # time to stabilize after a kill
+
+
+def main():
+    global stress_cmd
+    global cleanup_cmd
+
+    parser = argparse.ArgumentParser(
+        description="This script runs and kills \
+        db_stress multiple times"
+    )
+    parser.add_argument("test_type", choices=["blackbox", "whitebox"])
+    parser.add_argument("--simple", action="store_true")
+    parser.add_argument("--cf_consistency", action="store_true")
+    parser.add_argument("--txn", action="store_true")
+    parser.add_argument("--test_best_efforts_recovery", action="store_true")
+    parser.add_argument("--enable_ts", action="store_true")
+    parser.add_argument("--test_multiops_txn", action="store_true")
+    parser.add_argument("--write_policy", choices=["write_committed", "write_prepared"])
+    parser.add_argument("--stress_cmd")
+    parser.add_argument("--test_tiered_storage", action="store_true")
+    parser.add_argument("--cleanup_cmd")
+
+    all_params = dict(
+        list(default_params.items())
+        + list(blackbox_default_params.items())
+        + list(whitebox_default_params.items())
+        + list(simple_default_params.items())
+        + list(blackbox_simple_default_params.items())
+        + list(whitebox_simple_default_params.items())
+        + list(blob_params.items())
+        + list(ts_params.items())
+        + list(multiops_txn_default_params.items())
+        + list(multiops_wc_txn_params.items())
+        + list(multiops_wp_txn_params.items())
+        + list(best_efforts_recovery_params.items())
+        + list(cf_consistency_params.items())
+        + list(tiered_params.items())
+        + list(txn_params.items())
+    )
+
+    for k, v in all_params.items():
+        parser.add_argument("--" + k, type=type(v() if callable(v) else v))
+    # unknown_args are passed directly to db_stress
+    args, unknown_args = parser.parse_known_args()
+
+    test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
+    if test_tmpdir is not None and not os.path.isdir(test_tmpdir):
+        print(
+            "%s env var is set to a non-existent directory: %s"
+            % (_TEST_DIR_ENV_VAR, test_tmpdir)
+        )
+        sys.exit(1)
+
+    if args.stress_cmd:
+        stress_cmd = args.stress_cmd
+    if args.cleanup_cmd:
+        cleanup_cmd = args.cleanup_cmd
+    if args.test_type == "blackbox":
+        blackbox_crash_main(args, unknown_args)
+    if args.test_type == "whitebox":
+        whitebox_crash_main(args, unknown_args)
+    # Only delete the `expected_values_dir` if test passes
+    if expected_values_dir is not None:
+        shutil.rmtree(expected_values_dir)
+    if multiops_txn_key_spaces_file is not None:
+        os.remove(multiops_txn_key_spaces_file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/rocksdb/tools/db_repl_stress.cc b/src/rocksdb/tools/db_repl_stress.cc
new file mode 100644
index 000000000..ba680f4f2
--- /dev/null
+++ b/src/rocksdb/tools/db_repl_stress.cc
@@ -0,0 +1,140 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <atomic>
+#include <cstdio>
+
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+
+// Run a thread to perform Put's.
+// Another thread uses GetUpdatesSince API to keep getting the updates.
+// options :
+// --num_inserts = the num of inserts the first thread should perform.
+// --wal_ttl = the wal ttl for the run.
+
+DEFINE_uint64(num_inserts, 1000,
+              "the num of inserts the first thread should"
+              " perform.");
+DEFINE_uint64(wal_ttl_seconds, 1000, "the wal ttl for the run(in seconds)");
+DEFINE_uint64(wal_size_limit_MB, 10,
+              "the wal size limit for the run"
+              "(in MB)");
+
+using ROCKSDB_NAMESPACE::BatchResult;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DestroyDB;
+using ROCKSDB_NAMESPACE::Env;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::Random;
+using ROCKSDB_NAMESPACE::SequenceNumber;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::TransactionLogIterator;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+struct DataPumpThread {
+  DB* db;  // Assumption DB is Open'ed already.
+};
+
+static void DataPumpThreadBody(void* arg) {
+  DataPumpThread* t = reinterpret_cast<DataPumpThread*>(arg);
+  DB* db = t->db;
+  Random rnd(301);
+  uint64_t i = 0;
+  while (i++ < FLAGS_num_inserts) {
+    if (!db->Put(WriteOptions(), Slice(rnd.RandomString(500)),
+                 Slice(rnd.RandomString(500)))
+             .ok()) {
+      fprintf(stderr, "Error in put\n");
+      exit(1);
+    }
+  }
+}
+
+int main(int argc, const char** argv) {
+  SetUsageMessage(
+      std::string("\nUSAGE:\n") + std::string(argv[0]) +
+      " --num_inserts=<num_inserts> --wal_ttl_seconds=<WAL_ttl_seconds>" +
+      " --wal_size_limit_MB=<WAL_size_limit_MB>");
+  ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true);
+
+  Env* env = Env::Default();
+  std::string default_db_path;
+  env->GetTestDirectory(&default_db_path);
+  default_db_path += "db_repl_stress";
+  Options options;
+  options.create_if_missing = true;
+  options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+  options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+  DB* db;
+  DestroyDB(default_db_path, options);
+
+  Status s = DB::Open(options, default_db_path, &db);
+
+  if (!s.ok()) {
+    fprintf(stderr, "Could not open DB due to %s\n", s.ToString().c_str());
+    exit(1);
+  }
+
+  DataPumpThread dataPump;
+  dataPump.db = db;
+  env->StartThread(DataPumpThreadBody, &dataPump);
+
+  std::unique_ptr<TransactionLogIterator> iter;
+  SequenceNumber currentSeqNum = 1;
+  uint64_t num_read = 0;
+  for (;;) {
+    iter.reset();
+    // Continue to probe a bit more after all received
+    size_t probes = 0;
+    while (!db->GetUpdatesSince(currentSeqNum, &iter).ok()) {
+      probes++;
+      if (probes > 100 && num_read >= FLAGS_num_inserts) {
+        if (num_read > FLAGS_num_inserts) {
+          fprintf(stderr, "Too many updates read: %ld expected: %ld\n",
+                  (long)num_read, (long)FLAGS_num_inserts);
+          exit(1);
+        }
+        fprintf(stderr, "Successful!\n");
+        return 0;
+      }
+    }
+    fprintf(stderr, "Refreshing iterator\n");
+    for (; iter->Valid(); iter->Next(), num_read++, currentSeqNum++) {
+      BatchResult res = iter->GetBatch();
+      if (res.sequence != currentSeqNum) {
+        fprintf(stderr, "Missed a seq no. b/w %ld and %ld\n",
+                (long)currentSeqNum, (long)res.sequence);
+        exit(1);
+      }
+    }
+  }
+}
+
+#endif  // GFLAGS
+
+#else  // ROCKSDB_LITE
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/db_sanity_test.cc b/src/rocksdb/tools/db_sanity_test.cc
new file mode 100644
index 000000000..8cc67f5d5
--- /dev/null
+++ b/src/rocksdb/tools/db_sanity_test.cc
@@ -0,0 +1,300 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SanityTest {
+ public:
+  explicit SanityTest(const std::string& path)
+      : env_(Env::Default()), path_(path) {
+    env_->CreateDirIfMissing(path);
+  }
+  virtual ~SanityTest() {}
+
+  virtual std::string Name() const = 0;
+  virtual Options GetOptions() const = 0;
+
+  Status Create() {
+    Options options = GetOptions();
+    options.create_if_missing = true;
+    std::string dbname = path_ + Name();
+    Status s = DestroyDB(dbname, options);
+    if (!s.ok()) {
+      return s;
+    }
+    DB* db = nullptr;
+    s = DB::Open(options, dbname, &db);
+    std::unique_ptr<DB> db_guard(db);
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < 1000000; ++i) {
+      std::string k = "key" + std::to_string(i);
+      std::string v = "value" + std::to_string(i);
+      s = db->Put(WriteOptions(), Slice(k), Slice(v));
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    return db->Flush(FlushOptions());
+  }
+  Status Verify() {
+    DB* db = nullptr;
+    std::string dbname = path_ + Name();
+    Status s = DB::Open(GetOptions(), dbname, &db);
+    std::unique_ptr<DB> db_guard(db);
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < 1000000; ++i) {
+      std::string k = "key" + std::to_string(i);
+      std::string v = "value" + std::to_string(i);
+      std::string result;
+      s = db->Get(ReadOptions(), Slice(k), &result);
+      if (!s.ok()) {
+        return s;
+      }
+      if (result != v) {
+        return Status::Corruption("Unexpected value for key " + k);
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  Env* env_;
+  std::string const path_;
+};
+
+class SanityTestBasic : public SanityTest {
+ public:
+  explicit SanityTestBasic(const std::string& path) : SanityTest(path) {}
+  virtual Options GetOptions() const override {
+    Options options;
+    options.create_if_missing = true;
+    return options;
+  }
+  virtual std::string Name() const override { return "Basic"; }
+};
+
+class SanityTestSpecialComparator : public SanityTest {
+ public:
+  explicit SanityTestSpecialComparator(const std::string& path)
+      : SanityTest(path) {
+    options_.comparator = new NewComparator();
+  }
+  ~SanityTestSpecialComparator() { delete options_.comparator; }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "SpecialComparator"; }
+
+ private:
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const override {
+      return "rocksdb.NewComparator";
+    }
+    virtual int Compare(const Slice& a, const Slice& b) const override {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s,
+                                       const Slice& l) const override {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const override {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  Options options_;
+};
+
+class SanityTestZlibCompression : public SanityTest {
+ public:
+  explicit SanityTestZlibCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZlibCompression;
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "ZlibCompression"; }
+
+ private:
+  Options options_;
+};
+
+class SanityTestZlibCompressionVersion2 : public SanityTest {
+ public:
+  explicit SanityTestZlibCompressionVersion2(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZlibCompression;
+    BlockBasedTableOptions table_options;
+#if ROCKSDB_MAJOR > 3 || (ROCKSDB_MAJOR == 3 && ROCKSDB_MINOR >= 10)
+    table_options.format_version = 2;
+#endif
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override {
+    return "ZlibCompressionVersion2";
+  }
+
+ private:
+  Options options_;
+};
+
+class SanityTestLZ4Compression : public SanityTest {
+ public:
+  explicit SanityTestLZ4Compression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kLZ4Compression;
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "LZ4Compression"; }
+
+ private:
+  Options options_;
+};
+
+class SanityTestLZ4HCCompression : public SanityTest {
+ public:
+  explicit SanityTestLZ4HCCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kLZ4HCCompression;
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "LZ4HCCompression"; }
+
+ private:
+  Options options_;
+};
+
+class SanityTestZSTDCompression : public SanityTest {
+ public:
+  explicit SanityTestZSTDCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZSTD;
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "ZSTDCompression"; }
+
+ private:
+  Options options_;
+};
+
+#ifndef ROCKSDB_LITE
+class SanityTestPlainTableFactory : public SanityTest {
+ public:
+  explicit SanityTestPlainTableFactory(const std::string& path)
+      : SanityTest(path) {
+    options_.table_factory.reset(NewPlainTableFactory());
+    options_.prefix_extractor.reset(NewFixedPrefixTransform(2));
+    options_.allow_mmap_reads = true;
+  }
+  ~SanityTestPlainTableFactory() {}
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "PlainTable"; }
+
+ private:
+  Options options_;
+};
+#endif  // ROCKSDB_LITE
+
+class SanityTestBloomFilter : public SanityTest {
+ public:
+  explicit SanityTestBloomFilter(const std::string& path) : SanityTest(path) {
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+  ~SanityTestBloomFilter() {}
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "BloomFilter"; }
+
+ private:
+  Options options_;
+};
+
+namespace {
+bool RunSanityTests(const std::string& command, const std::string& path) {
+  bool result = true;
+// Suppress false positive clang static anaylzer warnings.
+#ifndef __clang_analyzer__
+  std::vector<SanityTest*> sanity_tests = {
+      new SanityTestBasic(path),
+      new SanityTestSpecialComparator(path),
+      new SanityTestZlibCompression(path),
+      new SanityTestZlibCompressionVersion2(path),
+      new SanityTestLZ4Compression(path),
+      new SanityTestLZ4HCCompression(path),
+      new SanityTestZSTDCompression(path),
+#ifndef ROCKSDB_LITE
+      new SanityTestPlainTableFactory(path),
+#endif  // ROCKSDB_LITE
+      new SanityTestBloomFilter(path)};
+
+  if (command == "create") {
+    fprintf(stderr, "Creating...\n");
+  } else {
+    fprintf(stderr, "Verifying...\n");
+  }
+  for (auto sanity_test : sanity_tests) {
+    Status s;
+    fprintf(stderr, "%s -- ", sanity_test->Name().c_str());
+    if (command == "create") {
+      s = sanity_test->Create();
+    } else {
+      assert(command == "verify");
+      s = sanity_test->Verify();
+    }
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+    if (!s.ok()) {
+      fprintf(stderr, "FAIL\n");
+      result = false;
+    }
+
+    delete sanity_test;
+  }
+#endif  // __clang_analyzer__
+  return result;
+}
+}  // namespace
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  std::string path, command;
+  bool ok = (argc == 3);
+  if (ok) {
+    path = std::string(argv[1]);
+    command = std::string(argv[2]);
+    ok = (command == "create" || command == "verify");
+  }
+  if (!ok) {
+    fprintf(stderr, "Usage: %s <path> [create|verify] \n", argv[0]);
+    exit(1);
+  }
+  if (path.back() != '/') {
+    path += "/";
+  }
+
+  bool sanity_ok = ROCKSDB_NAMESPACE::RunSanityTests(command, path);
+
+  return sanity_ok ? 0 : 1;
+}
diff --git a/src/rocksdb/tools/dbench_monitor b/src/rocksdb/tools/dbench_monitor
new file mode 100755
index 000000000..d85f9d070
--- /dev/null
+++ b/src/rocksdb/tools/dbench_monitor
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+#
+#(c) 2004-present, Facebook Inc. All rights reserved.
+#
+#see LICENSE file for more information on use/redistribution rights.
+#
+
+#
+#dbench_monitor: monitor db_bench process for violation of memory utilization
+#
+#default usage will monitor 'virtual memory size'. See below for standard options
+#passed to db_bench during this test.
+#
+# See also: ./pflag for the actual monitoring script that does the work
+#
+#NOTE:
+#  You may end up with some /tmp/ files if db_bench OR
+#  this script OR ./pflag was killed unceremoniously
+#
+#  If you see the script taking a long time, trying "kill"
+#  will usually cleanly exit.
+#
+#
+DIR=`dirname $0`
+LOG=/tmp/`basename $0`.$$
+DB_BENCH="$DIR/../db_bench";
+PFLAG=${DIR}/pflag
+
+usage() {
+    cat <<HELP; exit
+
+Usage: $0  [-h]
+
+-h: prints this help message
+
+This program will run the db_bench script to monitor memory usage
+using the 'pflag' program. It launches db_bench with default settings
+for certain arguments. You can change the defaults passed to
+'db_bench' program, by setting the following environment 
+variables:
+
+  bs [block_size]
+  ztype [compression_type]
+  benches [benchmarks]
+  reads [reads]
+  threads [threads]
+  cs [cache_size]
+  vsize [value_size]
+  comp [compression_ratio]
+  num [num]
+
+See the code for more info
+
+HELP
+
+}
+
+[ ! -x ${DB_BENCH} ] && echo "WARNING: ${DB_BENCH} doesn't exist, abort!" && exit -1;
+
+[ "x$1" = "x-h" ] && usage;
+
+trap 'rm -f ${LOG}; kill ${PID}; echo "Interrupted, exiting";' 1 2 3 15
+
+touch $LOG;
+
+: ${bs:=16384}
+: ${ztype:=zlib}
+: ${benches:=readwhilewriting}
+: ${reads:=$((1*1024*1024))};
+: ${threads:=8}
+: ${vsize:=2000}
+: ${comp:=0.5}
+: ${num:=10000}
+: ${cs:=$((1*1024*1024*1024))};
+
+DEBUG=1    #Set to 0 to remove chattiness 
+
+
+if [ "x$DEBUG" != "x" ]; then
+  #
+  #NOTE: under some circumstances, --use_existing_db may leave LOCK files under ${TMPDIR}/rocksdb/*
+  #cleanup the dir and re-run
+  #
+  echo DEBUG: Will run $DB_BENCH --block_size=$bs --compression_type=$ztype --benchmarks="$benches" --reads="$reads" --threads="$threads" --cache_size=$cs  --value_size=$vsize --compression_ratio=$comp --num=$num --use_existing_db 
+
+fi
+
+$DB_BENCH --block_size=$bs --compression_type=$ztype --benchmarks="$benches" --reads="$reads" --threads="$threads" --cache_size=$cs  --value_size=$vsize --compression_ratio=$comp --num=$num --use_existing_db >$LOG 2>&1 &
+
+if [ $? -ne 0 ]; then
+  warn "WARNING: ${DB_BENCH} did not launch successfully! Abort!";
+  exit;
+fi
+PID=$!
+
+#
+#Start the monitoring. Default is "vsz" monitoring for upto cache_size ($cs) value of virtual mem
+#You could also monitor RSS and CPUTIME (bsdtime). Try 'pflag -h' for how to do this
+#
+${PFLAG} -p $PID -v
+
+rm -f $LOG;
diff --git a/src/rocksdb/tools/dump/db_dump_tool.cc b/src/rocksdb/tools/dump/db_dump_tool.cc
new file mode 100644
index 000000000..427a54d99
--- /dev/null
+++ b/src/rocksdb/tools/dump/db_dump_tool.cc
@@ -0,0 +1,260 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db_dump_tool.h"
+
+#include <cinttypes>
+#include <iostream>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DbDumpTool::Run(const DumpOptions& dump_options,
+                     ROCKSDB_NAMESPACE::Options options) {
+  ROCKSDB_NAMESPACE::DB* dbptr;
+  ROCKSDB_NAMESPACE::Status status;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> dumpfile;
+  char hostname[1024];
+  int64_t timesec = 0;
+  std::string abspath;
+  char json[4096];
+
+  static const char* magicstr = "ROCKDUMP";
+  static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
+
+  ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default();
+
+  // Open the database
+  options.create_if_missing = false;
+  status = ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, dump_options.db_path,
+                                                  &dbptr);
+  if (!status.ok()) {
+    std::cerr << "Unable to open database '" << dump_options.db_path
+              << "' for reading: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr);
+
+  status = env->NewWritableFile(dump_options.dump_location, &dumpfile,
+                                ROCKSDB_NAMESPACE::EnvOptions());
+  if (!status.ok()) {
+    std::cerr << "Unable to open dump file '" << dump_options.dump_location
+              << "' for writing: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  ROCKSDB_NAMESPACE::Slice magicslice(magicstr, 8);
+  status = dumpfile->Append(magicslice);
+  if (!status.ok()) {
+    std::cerr << "Append failed: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  ROCKSDB_NAMESPACE::Slice versionslice(versionstr, 8);
+  status = dumpfile->Append(versionslice);
+  if (!status.ok()) {
+    std::cerr << "Append failed: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  if (dump_options.anonymous) {
+    snprintf(json, sizeof(json), "{}");
+  } else {
+    status = env->GetHostName(hostname, sizeof(hostname));
+    status = env->GetCurrentTime(&timesec);
+    status = env->GetAbsolutePath(dump_options.db_path, &abspath);
+    snprintf(json, sizeof(json),
+             "{ \"database-path\": \"%s\", \"hostname\": \"%s\", "
+             "\"creation-time\": %" PRIi64 " }",
+             abspath.c_str(), hostname, timesec);
+  }
+
+  ROCKSDB_NAMESPACE::Slice infoslice(json, strlen(json));
+  char infosize[4];
+  ROCKSDB_NAMESPACE::EncodeFixed32(infosize, (uint32_t)infoslice.size());
+  ROCKSDB_NAMESPACE::Slice infosizeslice(infosize, 4);
+  status = dumpfile->Append(infosizeslice);
+  if (!status.ok()) {
+    std::cerr << "Append failed: " << status.ToString() << std::endl;
+    return false;
+  }
+  status = dumpfile->Append(infoslice);
+  if (!status.ok()) {
+    std::cerr << "Append failed: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  const std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(
+      db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()));
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    char keysize[4];
+    ROCKSDB_NAMESPACE::EncodeFixed32(keysize, (uint32_t)it->key().size());
+    ROCKSDB_NAMESPACE::Slice keysizeslice(keysize, 4);
+    status = dumpfile->Append(keysizeslice);
+    if (!status.ok()) {
+      std::cerr << "Append failed: " << status.ToString() << std::endl;
+      return false;
+    }
+    status = dumpfile->Append(it->key());
+    if (!status.ok()) {
+      std::cerr << "Append failed: " << status.ToString() << std::endl;
+      return false;
+    }
+
+    char valsize[4];
+    ROCKSDB_NAMESPACE::EncodeFixed32(valsize, (uint32_t)it->value().size());
+    ROCKSDB_NAMESPACE::Slice valsizeslice(valsize, 4);
+    status = dumpfile->Append(valsizeslice);
+    if (!status.ok()) {
+      std::cerr << "Append failed: " << status.ToString() << std::endl;
+      return false;
+    }
+    status = dumpfile->Append(it->value());
+    if (!status.ok()) {
+      std::cerr << "Append failed: " << status.ToString() << std::endl;
+      return false;
+    }
+  }
+  if (!it->status().ok()) {
+    std::cerr << "Database iteration failed: " << status.ToString()
+              << std::endl;
+    return false;
+  }
+  return true;
+}
+
+bool DbUndumpTool::Run(const UndumpOptions& undump_options,
+                       ROCKSDB_NAMESPACE::Options options) {
+  ROCKSDB_NAMESPACE::DB* dbptr;
+  ROCKSDB_NAMESPACE::Status status;
+  ROCKSDB_NAMESPACE::Env* env;
+  std::unique_ptr<ROCKSDB_NAMESPACE::SequentialFile> dumpfile;
+  ROCKSDB_NAMESPACE::Slice slice;
+  char scratch8[8];
+
+  static const char* magicstr = "ROCKDUMP";
+  static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
+
+  env = ROCKSDB_NAMESPACE::Env::Default();
+
+  status = env->NewSequentialFile(undump_options.dump_location, &dumpfile,
+                                  ROCKSDB_NAMESPACE::EnvOptions());
+  if (!status.ok()) {
+    std::cerr << "Unable to open dump file '" << undump_options.dump_location
+              << "' for reading: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  status = dumpfile->Read(8, &slice, scratch8);
+  if (!status.ok() || slice.size() != 8 ||
+      memcmp(slice.data(), magicstr, 8) != 0) {
+    std::cerr << "File '" << undump_options.dump_location
+              << "' is not a recognizable dump file." << std::endl;
+    return false;
+  }
+
+  status = dumpfile->Read(8, &slice, scratch8);
+  if (!status.ok() || slice.size() != 8 ||
+      memcmp(slice.data(), versionstr, 8) != 0) {
+    std::cerr << "File '" << undump_options.dump_location
+              << "' version not recognized." << std::endl;
+    return false;
+  }
+
+  status = dumpfile->Read(4, &slice, scratch8);
+  if (!status.ok() || slice.size() != 4) {
+    std::cerr << "Unable to read info blob size." << std::endl;
+    return false;
+  }
+  uint32_t infosize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data());
+  status = dumpfile->Skip(infosize);
+  if (!status.ok()) {
+    std::cerr << "Unable to skip info blob: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  options.create_if_missing = true;
+  status = ROCKSDB_NAMESPACE::DB::Open(options, undump_options.db_path, &dbptr);
+  if (!status.ok()) {
+    std::cerr << "Unable to open database '" << undump_options.db_path
+              << "' for writing: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr);
+
+  uint32_t last_keysize = 64;
+  size_t last_valsize = 1 << 20;
+  std::unique_ptr<char[]> keyscratch(new char[last_keysize]);
+  std::unique_ptr<char[]> valscratch(new char[last_valsize]);
+
+  while (1) {
+    uint32_t keysize, valsize;
+    ROCKSDB_NAMESPACE::Slice keyslice;
+    ROCKSDB_NAMESPACE::Slice valslice;
+
+    status = dumpfile->Read(4, &slice, scratch8);
+    if (!status.ok() || slice.size() != 4) break;
+    keysize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data());
+    if (keysize > last_keysize) {
+      while (keysize > last_keysize) last_keysize *= 2;
+      keyscratch = std::unique_ptr<char[]>(new char[last_keysize]);
+    }
+
+    status = dumpfile->Read(keysize, &keyslice, keyscratch.get());
+    if (!status.ok() || keyslice.size() != keysize) {
+      std::cerr << "Key read failure: "
+                << (status.ok() ? "insufficient data" : status.ToString())
+                << std::endl;
+      return false;
+    }
+
+    status = dumpfile->Read(4, &slice, scratch8);
+    if (!status.ok() || slice.size() != 4) {
+      std::cerr << "Unable to read value size: "
+                << (status.ok() ? "insufficient data" : status.ToString())
+                << std::endl;
+      return false;
+    }
+    valsize = ROCKSDB_NAMESPACE::DecodeFixed32(slice.data());
+    if (valsize > last_valsize) {
+      while (valsize > last_valsize) last_valsize *= 2;
+      valscratch = std::unique_ptr<char[]>(new char[last_valsize]);
+    }
+
+    status = dumpfile->Read(valsize, &valslice, valscratch.get());
+    if (!status.ok() || valslice.size() != valsize) {
+      std::cerr << "Unable to read value: "
+                << (status.ok() ? "insufficient data" : status.ToString())
+                << std::endl;
+      return false;
+    }
+
+    status = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), keyslice, valslice);
+    if (!status.ok()) {
+      fprintf(stderr, "Unable to write database entry\n");
+      return false;
+    }
+  }
+
+  if (undump_options.compact_db) {
+    status = db->CompactRange(ROCKSDB_NAMESPACE::CompactRangeOptions(), nullptr,
+                              nullptr);
+    if (!status.ok()) {
+      fprintf(stderr,
+              "Unable to compact the database after loading the dumped file\n");
+      return false;
+    }
+  }
+  return true;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/dump/rocksdb_dump.cc b/src/rocksdb/tools/dump/rocksdb_dump.cc
new file mode 100644
index 000000000..358457e92
--- /dev/null
+++ b/src/rocksdb/tools/dump/rocksdb_dump.cc
@@ -0,0 +1,63 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
+
+#include <cstdio>
+int main() {
+#ifndef GFLAGS
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+#endif
+#ifdef ROCKSDB_LITE
+  fprintf(stderr, "DbDumpTool is not supported in ROCKSDB_LITE\n");
+#endif
+  return 1;
+}
+
+#else
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db_dump_tool.h"
+#include "util/gflags_compat.h"
+
+DEFINE_string(db_path, "", "Path to the db that will be dumped");
+DEFINE_string(dump_location, "", "Path to where the dump file location");
+DEFINE_bool(anonymous, false,
+            "Remove information like db path, creation time from dumped file");
+DEFINE_string(db_options, "",
+              "Options string used to open the database that will be dumped");
+
+int main(int argc, char** argv) {
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
+    fprintf(stderr, "Please set --db_path and --dump_location\n");
+    return 1;
+  }
+
+  ROCKSDB_NAMESPACE::DumpOptions dump_options;
+  dump_options.db_path = FLAGS_db_path;
+  dump_options.dump_location = FLAGS_dump_location;
+  dump_options.anonymous = FLAGS_anonymous;
+
+  ROCKSDB_NAMESPACE::Options db_options;
+  if (FLAGS_db_options != "") {
+    ROCKSDB_NAMESPACE::Options parsed_options;
+    ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::GetOptionsFromString(
+        db_options, FLAGS_db_options, &parsed_options);
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot parse provided db_options\n");
+      return 1;
+    }
+    db_options = parsed_options;
+  }
+
+  ROCKSDB_NAMESPACE::DbDumpTool tool;
+  if (!tool.Run(dump_options, db_options)) {
+    return 1;
+  }
+  return 0;
+}
+#endif  // !(defined GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/tools/dump/rocksdb_undump.cc b/src/rocksdb/tools/dump/rocksdb_undump.cc
new file mode 100644
index 000000000..2ff128548
--- /dev/null
+++ b/src/rocksdb/tools/dump/rocksdb_undump.cc
@@ -0,0 +1,62 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
+
+#include <cstdio>
+int main() {
+#ifndef GFLAGS
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+#endif
+#ifdef ROCKSDB_LITE
+  fprintf(stderr, "DbUndumpTool is not supported in ROCKSDB_LITE\n");
+#endif
+  return 1;
+}
+
+#else
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db_dump_tool.h"
+#include "util/gflags_compat.h"
+
+DEFINE_string(dump_location, "", "Path to the dump file that will be loaded");
+DEFINE_string(db_path, "", "Path to the db that we will undump the file into");
+DEFINE_bool(compact, false, "Compact the db after loading the dumped file");
+DEFINE_string(db_options, "",
+              "Options string used to open the database that will be loaded");
+
+int main(int argc, char **argv) {
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
+    fprintf(stderr, "Please set --db_path and --dump_location\n");
+    return 1;
+  }
+
+  ROCKSDB_NAMESPACE::UndumpOptions undump_options;
+  undump_options.db_path = FLAGS_db_path;
+  undump_options.dump_location = FLAGS_dump_location;
+  undump_options.compact_db = FLAGS_compact;
+
+  ROCKSDB_NAMESPACE::Options db_options;
+  if (FLAGS_db_options != "") {
+    ROCKSDB_NAMESPACE::Options parsed_options;
+    ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::GetOptionsFromString(
+        db_options, FLAGS_db_options, &parsed_options);
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot parse provided db_options\n");
+      return 1;
+    }
+    db_options = parsed_options;
+  }
+
+  ROCKSDB_NAMESPACE::DbUndumpTool tool;
+  if (!tool.Run(undump_options, db_options)) {
+    return 1;
+  }
+  return 0;
+}
+#endif  // !(defined GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/tools/generate_random_db.sh b/src/rocksdb/tools/generate_random_db.sh
new file mode 100755
index 000000000..5b5962617
--- /dev/null
+++ b/src/rocksdb/tools/generate_random_db.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to load some pre generated data file to a DB using ldb tool
+# ./ldb needs to be avaible to be executed.
+#
+# Usage: <SCRIPT> <input_data_path> <DB Path>
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: $BASH_SOURCE <input_data_path> <DB Path>"
+  exit 1
+fi
+
+input_data_dir=$1
+db_dir=$2
+rm -rf $db_dir
+
+echo == Loading data from $input_data_dir to $db_dir
+
+declare -a compression_opts=("no" "snappy" "zlib" "bzip2")
+
+set -e
+
+n=0
+
+for f in `ls -1 $input_data_dir`
+do
+  echo == Loading $f with compression ${compression_opts[n % 4]}
+  ./ldb load --db=$db_dir --compression_type=${compression_opts[n % 4]} --bloom_bits=10 --auto_compaction=false --create_if_missing < $input_data_dir/$f
+  let "n = n + 1"
+done
diff --git a/src/rocksdb/tools/ingest_external_sst.sh b/src/rocksdb/tools/ingest_external_sst.sh
new file mode 100755
index 000000000..8e2fed7ce
--- /dev/null
+++ b/src/rocksdb/tools/ingest_external_sst.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+#
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: $BASH_SOURCE <DB Path> <External SST Dir>"
+  exit 1
+fi
+
+db_dir=$1
+external_sst_dir=$2
+
+for f in `find $external_sst_dir -name extern_sst*`
+do
+  echo == Ingesting external SST file $f to DB at $db_dir
+  ./ldb --db=$db_dir --create_if_missing ingest_extern_sst $f
+done
diff --git a/src/rocksdb/tools/io_tracer_parser.cc b/src/rocksdb/tools/io_tracer_parser.cc
new file mode 100644
index 000000000..41ef45d97
--- /dev/null
+++ b/src/rocksdb/tools/io_tracer_parser.cc
@@ -0,0 +1,25 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else  // GFLAGS
+#include "tools/io_tracer_parser_tool.h"
+int main(int argc, char** argv) {
+  return ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv);
+}
+#endif  // GFLAGS
+#else   // ROCKSDB_LITE
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/io_tracer_parser_test.cc b/src/rocksdb/tools/io_tracer_parser_test.cc
new file mode 100644
index 000000000..41be5fa96
--- /dev/null
+++ b/src/rocksdb/tools/io_tracer_parser_test.cc
@@ -0,0 +1,190 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run io_tracer_parser_test\n");
+  return 0;
+}
+#else
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "tools/io_tracer_parser_tool.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const int kMaxArgCount = 100;
+const size_t kArgBufferSize = 100000;
+}  // namespace
+
+class IOTracerParserTest : public testing::Test {
+ public:
+  IOTracerParserTest() {
+    test_path_ = test::PerThreadDBPath("io_tracer_parser_test");
+    env_ = ROCKSDB_NAMESPACE::Env::Default();
+    EXPECT_OK(env_->CreateDirIfMissing(test_path_));
+    trace_file_path_ = test_path_ + "/io_trace_file";
+    dbname_ = test_path_ + "/db";
+    Options options;
+    options.create_if_missing = true;
+    EXPECT_OK(DB::Open(options, dbname_, &db_));
+  }
+
+  ~IOTracerParserTest() {
+    if (env_->FileExists(trace_file_path_).ok()) {
+      EXPECT_OK(env_->DeleteFile(trace_file_path_));
+    }
+    if (db_ != nullptr) {
+      Options options;
+      options.env = env_;
+      delete db_;
+      db_ = nullptr;
+      EXPECT_OK(DestroyDB(dbname_, options));
+    }
+    EXPECT_OK(env_->DeleteDir(test_path_));
+  }
+
+  void GenerateIOTrace() {
+    WriteOptions write_opt;
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+
+    ASSERT_OK(db_->StartIOTrace(trace_opt, std::move(trace_writer)));
+
+    for (int i = 0; i < 10; i++) {
+      ASSERT_OK(db_->Put(write_opt, "key_" + std::to_string(i),
+                         "value_" + std::to_string(i)));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+
+    ASSERT_OK(db_->EndIOTrace());
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+
+  void RunIOTracerParserTool() {
+    std::vector<std::string> params = {"./io_tracer_parser",
+                                       "-io_trace_file=" + trace_file_path_};
+
+    char arg_buffer[kArgBufferSize];
+    char* argv[kMaxArgCount];
+    int argc = 0;
+    int cursor = 0;
+    for (const auto& arg : params) {
+      ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
+      ASSERT_LE(argc + 1, kMaxArgCount);
+
+      snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());
+
+      argv[argc++] = arg_buffer + cursor;
+      cursor += static_cast<int>(arg.size()) + 1;
+    }
+    ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv));
+  }
+
+  DB* db_;
+  Env* env_;
+  EnvOptions env_options_;
+  std::string trace_file_path_;
+  std::string output_file_;
+  std::string test_path_;
+  std::string dbname_;
+};
+
+TEST_F(IOTracerParserTest, InvalidArguments) {
+  {
+    std::vector<std::string> params = {"./io_tracer_parser"};
+    char arg_buffer[kArgBufferSize];
+    char* argv[kMaxArgCount];
+    int argc = 0;
+    int cursor = 0;
+    for (const auto& arg : params) {
+      ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
+      ASSERT_LE(argc + 1, kMaxArgCount);
+
+      snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());
+
+      argv[argc++] = arg_buffer + cursor;
+      cursor += static_cast<int>(arg.size()) + 1;
+    }
+    ASSERT_EQ(1, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv));
+  }
+}
+
+TEST_F(IOTracerParserTest, DumpAndParseIOTraceRecords) {
+  GenerateIOTrace();
+  RunIOTracerParserTool();
+}
+
+TEST_F(IOTracerParserTest, NoRecordingAfterEndIOTrace) {
+  uint64_t file_size = 0;
+  // Generate IO trace records and parse them.
+  {
+    GenerateIOTrace();
+    RunIOTracerParserTool();
+    ASSERT_OK(env_->GetFileSize(trace_file_path_, &file_size));
+  }
+  // Once DB::EndIOTrace is invoked in GenerateIOTrace(), no new records should
+  // be appended.
+  {
+    WriteOptions write_opt;
+    for (int i = 10; i < 20; i++) {
+      ASSERT_OK(db_->Put(write_opt, "key_" + std::to_string(i),
+                         "value_" + std::to_string(i)));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+  }
+
+  uint64_t new_file_size = 0;
+  ASSERT_OK(env_->GetFileSize(trace_file_path_, &new_file_size));
+  ASSERT_EQ(file_size, new_file_size);
+}
+
+TEST_F(IOTracerParserTest, NoRecordingBeforeStartIOTrace) {
+  {
+    WriteOptions write_opt;
+    for (int i = 10; i < 20; i++) {
+      ASSERT_OK(db_->Put(write_opt, "key_" + std::to_string(i),
+                         "value_" + std::to_string(i)));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+    // IO trace file doesn't exist
+    ASSERT_NOK(env_->FileExists(trace_file_path_));
+  }
+  // Generate IO trace records and parse them.
+  {
+    GenerateIOTrace();
+    RunIOTracerParserTool();
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+#endif  // GFLAGS
+#else
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "io_tracer_parser_test is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/io_tracer_parser_tool.cc b/src/rocksdb/tools/io_tracer_parser_tool.cc
new file mode 100644
index 000000000..01b920f3b
--- /dev/null
+++ b/src/rocksdb/tools/io_tracer_parser_tool.cc
@@ -0,0 +1,144 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//    This source code is licensed under both the GPLv2 (found in the
+//    COPYING file in the root directory) and Apache 2.0 License
+//    (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifdef GFLAGS
+#include "tools/io_tracer_parser_tool.h"
+
+#include <cinttypes>
+#include <cstdio>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+
+#include "port/lang.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "trace_replay/io_tracer.h"
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_string(io_trace_file, "", "The IO trace file path.");
+
+namespace ROCKSDB_NAMESPACE {
+
+IOTraceRecordParser::IOTraceRecordParser(const std::string& input_file)
+    : input_file_(input_file) {}
+
+void IOTraceRecordParser::PrintHumanReadableHeader(
+    const IOTraceHeader& header) {
+  std::stringstream ss;
+  ss << "Start Time: " << header.start_time
+     << "\nRocksDB Major Version: " << header.rocksdb_major_version
+     << "\nRocksDB Minor Version: " << header.rocksdb_minor_version << "\n";
+  fprintf(stdout, "%s", ss.str().c_str());
+}
+
+void IOTraceRecordParser::PrintHumanReadableIOTraceRecord(
+    const IOTraceRecord& record) {
+  std::stringstream ss;
+  ss << "Access Time : " << std::setw(20) << std::left
+     << record.access_timestamp << ", File Name: " << std::setw(20) << std::left
+     << record.file_name.c_str() << ", File Operation: " << std::setw(18)
+     << std::left << record.file_operation.c_str()
+     << ", Latency: " << std::setw(10) << std::left << record.latency
+     << ", IO Status: " << record.io_status.c_str();
+
+  // Each bit in io_op_data stores which corresponding info from IOTraceOp will
+  // be added in the trace. Foreg, if bit at position 1 is set then
+  // IOTraceOp::kIOLen (length) will be logged in the record (Since
+  // IOTraceOp::kIOLen = 1 in the enum). So find all the set positions in
+  // io_op_data one by one and, update corresponsing info in the trace record,
+  // unset that bit to find other set bits until io_op_data = 0.
+  /* Read remaining options based on io_op_data set by file operation */
+  int64_t io_op_data = static_cast<int64_t>(record.io_op_data);
+  while (io_op_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(io_op_data & -io_op_data));
+    switch (set_pos) {
+      case IOTraceOp::kIOFileSize:
+        ss << ", File Size: " << record.file_size;
+        break;
+      case IOTraceOp::kIOLen:
+        ss << ", Length: " << record.len;
+        break;
+      case IOTraceOp::kIOOffset:
+        ss << ", Offset: " << record.offset;
+        break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
+    io_op_data &= (io_op_data - 1);
+  }
+
+  int64_t trace_data = static_cast<int64_t>(record.trace_data);
+  while (trace_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(trace_data & -trace_data));
+    switch (set_pos) {
+      case IODebugContext::TraceData::kRequestID:
+        ss << ", Request Id: " << record.request_id;
+        break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
+    trace_data &= (trace_data - 1);
+  }
+
+  ss << "\n";
+  fprintf(stdout, "%s", ss.str().c_str());
+}
+
+int IOTraceRecordParser::ReadIOTraceRecords() {
+  Status status;
+  Env* env(Env::Default());
+  std::unique_ptr<TraceReader> trace_reader;
+  std::unique_ptr<IOTraceReader> io_trace_reader;
+
+  status = NewFileTraceReader(env, EnvOptions(), input_file_, &trace_reader);
+  if (!status.ok()) {
+    fprintf(stderr, "%s: %s\n", input_file_.c_str(), status.ToString().c_str());
+    return 1;
+  }
+  io_trace_reader.reset(new IOTraceReader(std::move(trace_reader)));
+
+  // Read the header and dump it in a file.
+  IOTraceHeader header;
+  status = io_trace_reader->ReadHeader(&header);
+  if (!status.ok()) {
+    fprintf(stderr, "%s: %s\n", input_file_.c_str(), status.ToString().c_str());
+    return 1;
+  }
+  PrintHumanReadableHeader(header);
+
+  // Read the records one by one and print them in human readable format.
+  while (status.ok()) {
+    IOTraceRecord record;
+    status = io_trace_reader->ReadIOOp(&record);
+    if (!status.ok()) {
+      break;
+    }
+    PrintHumanReadableIOTraceRecord(record);
+  }
+  return 0;
+}
+
+int io_tracer_parser(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_io_trace_file.empty()) {
+    fprintf(stderr, "IO Trace file path is empty\n");
+    return 1;
+  }
+
+  IOTraceRecordParser io_tracer_parser(FLAGS_io_trace_file);
+  return io_tracer_parser.ReadIOTraceRecords();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // GFLAGS
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/io_tracer_parser_tool.h b/src/rocksdb/tools/io_tracer_parser_tool.h
new file mode 100644
index 000000000..6c22c8f89
--- /dev/null
+++ b/src/rocksdb/tools/io_tracer_parser_tool.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct IOTraceHeader;
+struct IOTraceRecord;
+
+// IOTraceRecordParser class reads the IO trace file (in binary format) and
+// dumps the human readable records in output_file_.
+class IOTraceRecordParser {
+ public:
+  explicit IOTraceRecordParser(const std::string& input_file);
+
+  // ReadIOTraceRecords reads the binary trace file records one by one and
+  // invoke PrintHumanReadableIOTraceRecord to dump the records in output_file_.
+  int ReadIOTraceRecords();
+
+ private:
+  void PrintHumanReadableHeader(const IOTraceHeader& header);
+  void PrintHumanReadableIOTraceRecord(const IOTraceRecord& record);
+
+  // Binary file that contains IO trace records.
+  std::string input_file_;
+};
+
+int io_tracer_parser(int argc, char** argv);
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/ldb.cc b/src/rocksdb/tools/ldb.cc
new file mode 100644
index 000000000..482383be8
--- /dev/null
+++ b/src/rocksdb/tools/ldb.cc
@@ -0,0 +1,21 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/ldb_tool.h"
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::LDBTool tool;
+  tool.Run(argc, argv);
+  return 0;
+}
+#else
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/ldb_cmd.cc b/src/rocksdb/tools/ldb_cmd.cc
new file mode 100644
index 000000000..ecd2d2977
--- /dev/null
+++ b/src/rocksdb/tools/ldb_cmd.cc
@@ -0,0 +1,4263 @@
+
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/ldb_cmd.h"
+
+#include <cinttypes>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include "db/blob/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/log_reader.h"
+#include "db/version_util.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/debug.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_dumper.h"
+#include "tools/ldb_cmd_impl.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/file_checksum_helper.h"
+#include "util/stderr_logger.h"
+#include "util/string_util.h"
+#include "utilities/blob_db/blob_dump_tool.h"
+#include "utilities/merge_operators.h"
+#include "utilities/ttl/db_ttl_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileChecksumGenCrc32c;
+class FileChecksumGenCrc32cFactory;
+
+const std::string LDBCommand::ARG_ENV_URI = "env_uri";
+const std::string LDBCommand::ARG_FS_URI = "fs_uri";
+const std::string LDBCommand::ARG_DB = "db";
+const std::string LDBCommand::ARG_PATH = "path";
+const std::string LDBCommand::ARG_SECONDARY_PATH = "secondary_path";
+const std::string LDBCommand::ARG_HEX = "hex";
+const std::string LDBCommand::ARG_KEY_HEX = "key_hex";
+const std::string LDBCommand::ARG_VALUE_HEX = "value_hex";
+const std::string LDBCommand::ARG_CF_NAME = "column_family";
+const std::string LDBCommand::ARG_TTL = "ttl";
+const std::string LDBCommand::ARG_TTL_START = "start_time";
+const std::string LDBCommand::ARG_TTL_END = "end_time";
+const std::string LDBCommand::ARG_TIMESTAMP = "timestamp";
+const std::string LDBCommand::ARG_TRY_LOAD_OPTIONS = "try_load_options";
+const std::string LDBCommand::ARG_DISABLE_CONSISTENCY_CHECKS =
+    "disable_consistency_checks";
+const std::string LDBCommand::ARG_IGNORE_UNKNOWN_OPTIONS =
+    "ignore_unknown_options";
+const std::string LDBCommand::ARG_FROM = "from";
+const std::string LDBCommand::ARG_TO = "to";
+const std::string LDBCommand::ARG_MAX_KEYS = "max_keys";
+const std::string LDBCommand::ARG_BLOOM_BITS = "bloom_bits";
+const std::string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len";
+const std::string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type";
+const std::string LDBCommand::ARG_COMPRESSION_MAX_DICT_BYTES =
+    "compression_max_dict_bytes";
+const std::string LDBCommand::ARG_BLOCK_SIZE = "block_size";
+const std::string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction";
+const std::string LDBCommand::ARG_DB_WRITE_BUFFER_SIZE = "db_write_buffer_size";
+const std::string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size";
+const std::string LDBCommand::ARG_FILE_SIZE = "file_size";
+const std::string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing";
+const std::string LDBCommand::ARG_NO_VALUE = "no_value";
+const std::string LDBCommand::ARG_ENABLE_BLOB_FILES = "enable_blob_files";
+const std::string LDBCommand::ARG_MIN_BLOB_SIZE = "min_blob_size";
+const std::string LDBCommand::ARG_BLOB_FILE_SIZE = "blob_file_size";
+const std::string LDBCommand::ARG_BLOB_COMPRESSION_TYPE =
+    "blob_compression_type";
+const std::string LDBCommand::ARG_ENABLE_BLOB_GARBAGE_COLLECTION =
+    "enable_blob_garbage_collection";
+const std::string LDBCommand::ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF =
+    "blob_garbage_collection_age_cutoff";
+const std::string LDBCommand::ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD =
+    "blob_garbage_collection_force_threshold";
+const std::string LDBCommand::ARG_BLOB_COMPACTION_READAHEAD_SIZE =
+    "blob_compaction_readahead_size";
+const std::string LDBCommand::ARG_BLOB_FILE_STARTING_LEVEL =
+    "blob_file_starting_level";
+const std::string LDBCommand::ARG_PREPOPULATE_BLOB_CACHE =
+    "prepopulate_blob_cache";
+const std::string LDBCommand::ARG_DECODE_BLOB_INDEX = "decode_blob_index";
+const std::string LDBCommand::ARG_DUMP_UNCOMPRESSED_BLOBS =
+    "dump_uncompressed_blobs";
+
+const char* LDBCommand::DELIM = " ==> ";
+
+namespace {
+
+void DumpWalFile(Options options, std::string wal_file, bool print_header,
+                 bool print_values, bool is_write_committed,
+                 LDBCommandExecuteResult* exec_state);
+
+void DumpSstFile(Options options, std::string filename, bool output_hex,
+                 bool show_properties, bool decode_blob_index,
+                 std::string from_key = "", std::string to_key = "");
+
+void DumpBlobFile(const std::string& filename, bool is_key_hex,
+                  bool is_value_hex, bool dump_uncompressed_blobs);
+};  // namespace
+
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+    int argc, char const* const* argv, const Options& options,
+    const LDBOptions& ldb_options,
+    const std::vector<ColumnFamilyDescriptor>* column_families) {
+  std::vector<std::string> args;
+  for (int i = 1; i < argc; i++) {
+    args.push_back(argv[i]);
+  }
+  return InitFromCmdLineArgs(args, options, ldb_options, column_families,
+                             SelectCommand);
+}
+
+/**
+ * Parse the command-line arguments and create the appropriate LDBCommand2
+ * instance.
+ * The command line arguments must be in the following format:
+ * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] ..
+ *        COMMAND <PARAM1> <PARAM2> ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] ..
+ * This is similar to the command line format used by HBaseClientTool.
+ * Command name is not included in args.
+ * Returns nullptr if the command-line cannot be parsed.
+ */
+LDBCommand* LDBCommand::InitFromCmdLineArgs(
+    const std::vector<std::string>& args, const Options& options,
+    const LDBOptions& ldb_options,
+    const std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+    const std::function<LDBCommand*(const ParsedParams&)>& selector) {
+  // --x=y command line arguments are added as x->y map entries in
+  // parsed_params.option_map.
+  //
+  // Command-line arguments of the form --hex end up in this array as hex to
+  // parsed_params.flags
+  ParsedParams parsed_params;
+
+  // Everything other than option_map and flags. Represents commands
+  // and their parameters.  For eg: put key1 value1 go into this vector.
+  std::vector<std::string> cmdTokens;
+
+  const std::string OPTION_PREFIX = "--";
+
+  for (const auto& arg : args) {
+    if (arg[0] == '-' && arg[1] == '-') {
+      std::vector<std::string> splits = StringSplit(arg, '=');
+      // --option_name=option_value
+      if (splits.size() == 2) {
+        std::string optionKey = splits[0].substr(OPTION_PREFIX.size());
+        parsed_params.option_map[optionKey] = splits[1];
+      } else if (splits.size() == 1) {
+        // --flag_name
+        std::string optionKey = splits[0].substr(OPTION_PREFIX.size());
+        parsed_params.flags.push_back(optionKey);
+      } else {
+        // --option_name=option_value, option_value contains '='
+        std::string optionKey = splits[0].substr(OPTION_PREFIX.size());
+        parsed_params.option_map[optionKey] =
+            arg.substr(splits[0].length() + 1);
+      }
+    } else {
+      cmdTokens.push_back(arg);
+    }
+  }
+
+  if (cmdTokens.size() < 1) {
+    fprintf(stderr, "Command not specified!");
+    return nullptr;
+  }
+
+  parsed_params.cmd = cmdTokens[0];
+  parsed_params.cmd_params.assign(cmdTokens.begin() + 1, cmdTokens.end());
+
+  LDBCommand* command = selector(parsed_params);
+
+  if (command) {
+    command->SetDBOptions(options);
+    command->SetLDBOptions(ldb_options);
+  }
+  return command;
+}
+
+LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) {
+  if (parsed_params.cmd == GetCommand::Name()) {
+    return new GetCommand(parsed_params.cmd_params, parsed_params.option_map,
+                          parsed_params.flags);
+  } else if (parsed_params.cmd == PutCommand::Name()) {
+    return new PutCommand(parsed_params.cmd_params, parsed_params.option_map,
+                          parsed_params.flags);
+  } else if (parsed_params.cmd == BatchPutCommand::Name()) {
+    return new BatchPutCommand(parsed_params.cmd_params,
+                               parsed_params.option_map, parsed_params.flags);
+  } else if (parsed_params.cmd == ScanCommand::Name()) {
+    return new ScanCommand(parsed_params.cmd_params, parsed_params.option_map,
+                           parsed_params.flags);
+  } else if (parsed_params.cmd == DeleteCommand::Name()) {
+    return new DeleteCommand(parsed_params.cmd_params, parsed_params.option_map,
+                             parsed_params.flags);
+  } else if (parsed_params.cmd == SingleDeleteCommand::Name()) {
+    return new SingleDeleteCommand(parsed_params.cmd_params,
+                                   parsed_params.option_map,
+                                   parsed_params.flags);
+  } else if (parsed_params.cmd == DeleteRangeCommand::Name()) {
+    return new DeleteRangeCommand(parsed_params.cmd_params,
+                                  parsed_params.option_map,
+                                  parsed_params.flags);
+  } else if (parsed_params.cmd == ApproxSizeCommand::Name()) {
+    return new ApproxSizeCommand(parsed_params.cmd_params,
+                                 parsed_params.option_map, parsed_params.flags);
+  } else if (parsed_params.cmd == DBQuerierCommand::Name()) {
+    return new DBQuerierCommand(parsed_params.cmd_params,
+                                parsed_params.option_map, parsed_params.flags);
+  } else if (parsed_params.cmd == CompactorCommand::Name()) {
+    return new CompactorCommand(parsed_params.cmd_params,
+                                parsed_params.option_map, parsed_params.flags);
+  } else if (parsed_params.cmd == WALDumperCommand::Name()) {
+    return new WALDumperCommand(parsed_params.cmd_params,
+                                parsed_params.option_map, parsed_params.flags);
+  } else if (parsed_params.cmd == ReduceDBLevelsCommand::Name()) {
+    return new ReduceDBLevelsCommand(parsed_params.cmd_params,
+                                     parsed_params.option_map,
+                                     parsed_params.flags);
+  } else if (parsed_params.cmd == ChangeCompactionStyleCommand::Name()) {
+    return new ChangeCompactionStyleCommand(parsed_params.cmd_params,
+                                            parsed_params.option_map,
+                                            parsed_params.flags);
+  } else if (parsed_params.cmd == DBDumperCommand::Name()) {
+    return new DBDumperCommand(parsed_params.cmd_params,
+                               parsed_params.option_map, parsed_params.flags);
+  } else if (parsed_params.cmd == DBLoaderCommand::Name()) {
+    return new DBLoaderCommand(parsed_params.cmd_params,
+                               parsed_params.option_map, parsed_params.flags);
+  } else if (parsed_params.cmd == ManifestDumpCommand::Name()) {
+    return new ManifestDumpCommand(parsed_params.cmd_params,
+                                   parsed_params.option_map,
+                                   parsed_params.flags);
+  } else if (parsed_params.cmd == FileChecksumDumpCommand::Name()) {
+    return new FileChecksumDumpCommand(parsed_params.cmd_params,
+                                       parsed_params.option_map,
+                                       parsed_params.flags);
+  } else if (parsed_params.cmd == GetPropertyCommand::Name()) {
+    return new GetPropertyCommand(parsed_params.cmd_params,
+                                  parsed_params.option_map,
+                                  parsed_params.flags);
+  } else if (parsed_params.cmd == ListColumnFamiliesCommand::Name()) {
+    return new ListColumnFamiliesCommand(parsed_params.cmd_params,
+                                         parsed_params.option_map,
+                                         parsed_params.flags);
+  } else if (parsed_params.cmd == CreateColumnFamilyCommand::Name()) {
+    return new CreateColumnFamilyCommand(parsed_params.cmd_params,
+                                         parsed_params.option_map,
+                                         parsed_params.flags);
+  } else if (parsed_params.cmd == DropColumnFamilyCommand::Name()) {
+    return new DropColumnFamilyCommand(parsed_params.cmd_params,
+                                       parsed_params.option_map,
+                                       parsed_params.flags);
+  } else if (parsed_params.cmd == DBFileDumperCommand::Name()) {
+    return new DBFileDumperCommand(parsed_params.cmd_params,
+                                   parsed_params.option_map,
+                                   parsed_params.flags);
+  } else if (parsed_params.cmd == DBLiveFilesMetadataDumperCommand::Name()) {
+    return new DBLiveFilesMetadataDumperCommand(parsed_params.cmd_params,
+                                                parsed_params.option_map,
+                                                parsed_params.flags);
+  } else if (parsed_params.cmd == InternalDumpCommand::Name()) {
+    return new InternalDumpCommand(parsed_params.cmd_params,
+                                   parsed_params.option_map,
+                                   parsed_params.flags);
+  } else if (parsed_params.cmd == CheckConsistencyCommand::Name()) {
+    return new CheckConsistencyCommand(parsed_params.cmd_params,
+                                       parsed_params.option_map,
+                                       parsed_params.flags);
+  } else if (parsed_params.cmd == CheckPointCommand::Name()) {
+    return new CheckPointCommand(parsed_params.cmd_params,
+                                 parsed_params.option_map, parsed_params.flags);
+  } else if (parsed_params.cmd == RepairCommand::Name()) {
+    return new RepairCommand(parsed_params.cmd_params, parsed_params.option_map,
+                             parsed_params.flags);
+  } else if (parsed_params.cmd == BackupCommand::Name()) {
+    return new BackupCommand(parsed_params.cmd_params, parsed_params.option_map,
+                             parsed_params.flags);
+  } else if (parsed_params.cmd == RestoreCommand::Name()) {
+    return new RestoreCommand(parsed_params.cmd_params,
+                              parsed_params.option_map, parsed_params.flags);
+  } else if (parsed_params.cmd == WriteExternalSstFilesCommand::Name()) {
+    return new WriteExternalSstFilesCommand(parsed_params.cmd_params,
+                                            parsed_params.option_map,
+                                            parsed_params.flags);
+  } else if (parsed_params.cmd == IngestExternalSstFilesCommand::Name()) {
+    return new IngestExternalSstFilesCommand(parsed_params.cmd_params,
+                                             parsed_params.option_map,
+                                             parsed_params.flags);
+  } else if (parsed_params.cmd == ListFileRangeDeletesCommand::Name()) {
+    return new ListFileRangeDeletesCommand(parsed_params.option_map,
+                                           parsed_params.flags);
+  } else if (parsed_params.cmd == UnsafeRemoveSstFileCommand::Name()) {
+    return new UnsafeRemoveSstFileCommand(parsed_params.cmd_params,
+                                          parsed_params.option_map,
+                                          parsed_params.flags);
+  } else if (parsed_params.cmd == UpdateManifestCommand::Name()) {
+    return new UpdateManifestCommand(parsed_params.cmd_params,
+                                     parsed_params.option_map,
+                                     parsed_params.flags);
+  }
+  return nullptr;
+}
+
+/* Run the command, and return the execute result. */
+void LDBCommand::Run() {
+  if (!exec_state_.IsNotStarted()) {
+    return;
+  }
+
+  if (!options_.env || options_.env == Env::Default()) {
+    Env* env = Env::Default();
+    Status s = Env::CreateFromUri(config_options_, env_uri_, fs_uri_, &env,
+                                  &env_guard_);
+    if (!s.ok()) {
+      fprintf(stderr, "%s\n", s.ToString().c_str());
+      exec_state_ = LDBCommandExecuteResult::Failed(s.ToString());
+      return;
+    }
+    options_.env = env;
+  }
+
+  if (db_ == nullptr && !NoDBOpen()) {
+    OpenDB();
+    if (exec_state_.IsFailed() && try_load_options_) {
+      // We don't always return if there is a failure because a WAL file or
+      // manifest file can be given to "dump" command so we should continue.
+      // --try_load_options is not valid in those cases.
+      return;
+    }
+  }
+
+  // We'll intentionally proceed even if the DB can't be opened because users
+  // can also specify a filename, not just a directory.
+  DoCommand();
+
+  if (exec_state_.IsNotStarted()) {
+    exec_state_ = LDBCommandExecuteResult::Succeed("");
+  }
+
+  if (db_ != nullptr) {
+    CloseDB();
+  }
+}
+
+LDBCommand::LDBCommand(const std::map<std::string, std::string>& options,
+                       const std::vector<std::string>& flags, bool is_read_only,
+                       const std::vector<std::string>& valid_cmd_line_options)
+    : db_(nullptr),
+      db_ttl_(nullptr),
+      is_read_only_(is_read_only),
+      is_key_hex_(false),
+      is_value_hex_(false),
+      is_db_ttl_(false),
+      timestamp_(false),
+      try_load_options_(false),
+      create_if_missing_(false),
+      option_map_(options),
+      flags_(flags),
+      valid_cmd_line_options_(valid_cmd_line_options) {
+  auto itr = options.find(ARG_DB);
+  if (itr != options.end()) {
+    db_path_ = itr->second;
+  }
+
+  itr = options.find(ARG_ENV_URI);
+  if (itr != options.end()) {
+    env_uri_ = itr->second;
+  }
+
+  itr = options.find(ARG_FS_URI);
+  if (itr != options.end()) {
+    fs_uri_ = itr->second;
+  }
+
+  itr = options.find(ARG_CF_NAME);
+  if (itr != options.end()) {
+    column_family_name_ = itr->second;
+  } else {
+    column_family_name_ = kDefaultColumnFamilyName;
+  }
+
+  itr = options.find(ARG_SECONDARY_PATH);
+  secondary_path_ = "";
+  if (itr != options.end()) {
+    secondary_path_ = itr->second;
+  }
+
+  is_key_hex_ = IsKeyHex(options, flags);
+  is_value_hex_ = IsValueHex(options, flags);
+  is_db_ttl_ = IsFlagPresent(flags, ARG_TTL);
+  timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP);
+  try_load_options_ = IsTryLoadOptions(options, flags);
+  force_consistency_checks_ =
+      !IsFlagPresent(flags, ARG_DISABLE_CONSISTENCY_CHECKS);
+  enable_blob_files_ = IsFlagPresent(flags, ARG_ENABLE_BLOB_FILES);
+  enable_blob_garbage_collection_ =
+      IsFlagPresent(flags, ARG_ENABLE_BLOB_GARBAGE_COLLECTION);
+  config_options_.ignore_unknown_options =
+      IsFlagPresent(flags, ARG_IGNORE_UNKNOWN_OPTIONS);
+}
+
+void LDBCommand::OpenDB() {
+  PrepareOptions();
+  if (!exec_state_.IsNotStarted()) {
+    return;
+  }
+  if (column_families_.empty() && !options_.merge_operator) {
+    // No harm to add a general merge operator if it is not specified.
+    options_.merge_operator = MergeOperators::CreateStringAppendOperator(':');
+  }
+  // Open the DB.
+  Status st;
+  std::vector<ColumnFamilyHandle*> handles_opened;
+  if (is_db_ttl_) {
+    // ldb doesn't yet support TTL DB with multiple column families
+    if (!column_family_name_.empty() || !column_families_.empty()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "ldb doesn't support TTL DB with multiple column families");
+    }
+    if (!secondary_path_.empty()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Open as secondary is not supported for TTL DB yet.");
+    }
+    if (is_read_only_) {
+      st = DBWithTTL::Open(options_, db_path_, &db_ttl_, 0, true);
+    } else {
+      st = DBWithTTL::Open(options_, db_path_, &db_ttl_);
+    }
+    db_ = db_ttl_;
+  } else {
+    if (is_read_only_ && secondary_path_.empty()) {
+      if (column_families_.empty()) {
+        st = DB::OpenForReadOnly(options_, db_path_, &db_);
+      } else {
+        st = DB::OpenForReadOnly(options_, db_path_, column_families_,
+                                 &handles_opened, &db_);
+      }
+    } else {
+      if (column_families_.empty()) {
+        if (secondary_path_.empty()) {
+          st = DB::Open(options_, db_path_, &db_);
+        } else {
+          st = DB::OpenAsSecondary(options_, db_path_, secondary_path_, &db_);
+        }
+      } else {
+        if (secondary_path_.empty()) {
+          st = DB::Open(options_, db_path_, column_families_, &handles_opened,
+                        &db_);
+        } else {
+          st = DB::OpenAsSecondary(options_, db_path_, secondary_path_,
+                                   column_families_, &handles_opened, &db_);
+        }
+      }
+    }
+  }
+  if (!st.ok()) {
+    std::string msg = st.ToString();
+    exec_state_ = LDBCommandExecuteResult::Failed(msg);
+  } else if (!handles_opened.empty()) {
+    assert(handles_opened.size() == column_families_.size());
+    bool found_cf_name = false;
+    for (size_t i = 0; i < handles_opened.size(); i++) {
+      cf_handles_[column_families_[i].name] = handles_opened[i];
+      if (column_family_name_ == column_families_[i].name) {
+        found_cf_name = true;
+      }
+    }
+    if (!found_cf_name) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Non-existing column family " + column_family_name_);
+      CloseDB();
+    }
+  } else {
+    // We successfully opened DB in single column family mode.
+    assert(column_families_.empty());
+    if (column_family_name_ != kDefaultColumnFamilyName) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Non-existing column family " + column_family_name_);
+      CloseDB();
+    }
+  }
+}
+
+void LDBCommand::CloseDB() {
+  if (db_ != nullptr) {
+    for (auto& pair : cf_handles_) {
+      delete pair.second;
+    }
+    Status s = db_->Close();
+    s.PermitUncheckedError();
+    delete db_;
+    db_ = nullptr;
+  }
+}
+
+ColumnFamilyHandle* LDBCommand::GetCfHandle() {
+  if (!cf_handles_.empty()) {
+    auto it = cf_handles_.find(column_family_name_);
+    if (it == cf_handles_.end()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Cannot find column family " + column_family_name_);
+    } else {
+      return it->second;
+    }
+  }
+  return db_->DefaultColumnFamily();
+}
+
+std::vector<std::string> LDBCommand::BuildCmdLineOptions(
+    std::vector<std::string> options) {
+  std::vector<std::string> ret = {ARG_ENV_URI,
+                                  ARG_FS_URI,
+                                  ARG_DB,
+                                  ARG_SECONDARY_PATH,
+                                  ARG_BLOOM_BITS,
+                                  ARG_BLOCK_SIZE,
+                                  ARG_AUTO_COMPACTION,
+                                  ARG_COMPRESSION_TYPE,
+                                  ARG_COMPRESSION_MAX_DICT_BYTES,
+                                  ARG_WRITE_BUFFER_SIZE,
+                                  ARG_FILE_SIZE,
+                                  ARG_FIX_PREFIX_LEN,
+                                  ARG_TRY_LOAD_OPTIONS,
+                                  ARG_DISABLE_CONSISTENCY_CHECKS,
+                                  ARG_ENABLE_BLOB_FILES,
+                                  ARG_MIN_BLOB_SIZE,
+                                  ARG_BLOB_FILE_SIZE,
+                                  ARG_BLOB_COMPRESSION_TYPE,
+                                  ARG_ENABLE_BLOB_GARBAGE_COLLECTION,
+                                  ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF,
+                                  ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD,
+                                  ARG_BLOB_COMPACTION_READAHEAD_SIZE,
+                                  ARG_BLOB_FILE_STARTING_LEVEL,
+                                  ARG_PREPOPULATE_BLOB_CACHE,
+                                  ARG_IGNORE_UNKNOWN_OPTIONS,
+                                  ARG_CF_NAME};
+  ret.insert(ret.end(), options.begin(), options.end());
+  return ret;
+}
+
+/**
+ * Parses the specific double option and fills in the value.
+ * Returns true if the option is found.
+ * Returns false if the option is not found or if there is an error parsing the
+ * value.  If there is an error, the specified exec_state is also
+ * updated.
+ */
+bool LDBCommand::ParseDoubleOption(
+    const std::map<std::string, std::string>& /*options*/,
+    const std::string& option, double& value,
+    LDBCommandExecuteResult& exec_state) {
+  auto itr = option_map_.find(option);
+  if (itr != option_map_.end()) {
+#if defined(CYGWIN)
+    char* str_end = nullptr;
+    value = std::strtod(itr->second.c_str(), &str_end);
+    if (str_end == itr->second.c_str()) {
+      exec_state =
+          LDBCommandExecuteResult::Failed(option + " has an invalid value.");
+    } else if (errno == ERANGE) {
+      exec_state = LDBCommandExecuteResult::Failed(
+          option + " has a value out-of-range.");
+    } else {
+      return true;
+    }
+#else
+    try {
+      value = std::stod(itr->second);
+      return true;
+    } catch (const std::invalid_argument&) {
+      exec_state =
+          LDBCommandExecuteResult::Failed(option + " has an invalid value.");
+    } catch (const std::out_of_range&) {
+      exec_state = LDBCommandExecuteResult::Failed(
+          option + " has a value out-of-range.");
+    }
+#endif
+  }
+  return false;
+}
+
+/**
+ * Parses the specific integer option and fills in the value.
+ * Returns true if the option is found.
+ * Returns false if the option is not found or if there is an error parsing the
+ * value.  If there is an error, the specified exec_state is also
+ * updated.
+ */
+bool LDBCommand::ParseIntOption(
+    const std::map<std::string, std::string>& /*options*/,
+    const std::string& option, int& value,
+    LDBCommandExecuteResult& exec_state) {
+  auto itr = option_map_.find(option);
+  if (itr != option_map_.end()) {
+#if defined(CYGWIN)
+    char* str_end = nullptr;
+    value = strtol(itr->second.c_str(), &str_end, 10);
+    if (str_end == itr->second.c_str()) {
+      exec_state =
+          LDBCommandExecuteResult::Failed(option + " has an invalid value.");
+    } else if (errno == ERANGE) {
+      exec_state = LDBCommandExecuteResult::Failed(
+          option + " has a value out-of-range.");
+    } else {
+      return true;
+    }
+#else
+    try {
+      value = std::stoi(itr->second);
+      return true;
+    } catch (const std::invalid_argument&) {
+      exec_state =
+          LDBCommandExecuteResult::Failed(option + " has an invalid value.");
+    } catch (const std::out_of_range&) {
+      exec_state = LDBCommandExecuteResult::Failed(
+          option + " has a value out-of-range.");
+    }
+#endif
+  }
+  return false;
+}
+
+/**
+ * Parses the specified option and fills in the value.
+ * Returns true if the option is found.
+ * Returns false otherwise.
+ */
+bool LDBCommand::ParseStringOption(
+    const std::map<std::string, std::string>& /*options*/,
+    const std::string& option, std::string* value) {
+  auto itr = option_map_.find(option);
+  if (itr != option_map_.end()) {
+    *value = itr->second;
+    return true;
+  }
+  return false;
+}
+
+/**
+ * Parses the specified compression type and fills in the value.
+ * Returns true if the compression type is found.
+ * Returns false otherwise.
+ */
+bool LDBCommand::ParseCompressionTypeOption(
+    const std::map<std::string, std::string>& /*options*/,
+    const std::string& option, CompressionType& value,
+    LDBCommandExecuteResult& exec_state) {
+  auto itr = option_map_.find(option);
+  if (itr != option_map_.end()) {
+    const std::string& comp = itr->second;
+    if (comp == "no") {
+      value = kNoCompression;
+      return true;
+    } else if (comp == "snappy") {
+      value = kSnappyCompression;
+      return true;
+    } else if (comp == "zlib") {
+      value = kZlibCompression;
+      return true;
+    } else if (comp == "bzip2") {
+      value = kBZip2Compression;
+      return true;
+    } else if (comp == "lz4") {
+      value = kLZ4Compression;
+      return true;
+    } else if (comp == "lz4hc") {
+      value = kLZ4HCCompression;
+      return true;
+    } else if (comp == "xpress") {
+      value = kXpressCompression;
+      return true;
+    } else if (comp == "zstd") {
+      value = kZSTD;
+      return true;
+    } else {
+      // Unknown compression.
+      exec_state = LDBCommandExecuteResult::Failed(
+          "Unknown compression algorithm: " + comp);
+    }
+  }
+  return false;
+}
+
+void LDBCommand::OverrideBaseOptions() {
+  options_.create_if_missing = false;
+
+  int db_write_buffer_size;
+  if (ParseIntOption(option_map_, ARG_DB_WRITE_BUFFER_SIZE,
+                     db_write_buffer_size, exec_state_)) {
+    if (db_write_buffer_size >= 0) {
+      options_.db_write_buffer_size = db_write_buffer_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_DB_WRITE_BUFFER_SIZE +
+                                                    " must be >= 0.");
+    }
+  }
+
+  if (options_.db_paths.size() == 0) {
+    options_.db_paths.emplace_back(db_path_,
+                                   std::numeric_limits<uint64_t>::max());
+  }
+
+  OverrideBaseCFOptions(static_cast<ColumnFamilyOptions*>(&options_));
+}
+
+void LDBCommand::OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) {
+  BlockBasedTableOptions table_options;
+  bool use_table_options = false;
+  int bits;
+  if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) {
+    if (bits > 0) {
+      use_table_options = true;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(bits));
+    } else {
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_BLOOM_BITS + " must be > 0.");
+    }
+  }
+
+  int block_size;
+  if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) {
+    if (block_size > 0) {
+      use_table_options = true;
+      table_options.block_size = block_size;
+    } else {
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_BLOCK_SIZE + " must be > 0.");
+    }
+  }
+
+  cf_opts->force_consistency_checks = force_consistency_checks_;
+  if (use_table_options) {
+    cf_opts->table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+
+  cf_opts->enable_blob_files = enable_blob_files_;
+
+  int min_blob_size;
+  if (ParseIntOption(option_map_, ARG_MIN_BLOB_SIZE, min_blob_size,
+                     exec_state_)) {
+    if (min_blob_size >= 0) {
+      cf_opts->min_blob_size = min_blob_size;
+    } else {
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_MIN_BLOB_SIZE + " must be >= 0.");
+    }
+  }
+
+  int blob_file_size;
+  if (ParseIntOption(option_map_, ARG_BLOB_FILE_SIZE, blob_file_size,
+                     exec_state_)) {
+    if (blob_file_size > 0) {
+      cf_opts->blob_file_size = blob_file_size;
+    } else {
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_BLOB_FILE_SIZE + " must be > 0.");
+    }
+  }
+
+  cf_opts->enable_blob_garbage_collection = enable_blob_garbage_collection_;
+
+  double blob_garbage_collection_age_cutoff;
+  if (ParseDoubleOption(option_map_, ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF,
+                        blob_garbage_collection_age_cutoff, exec_state_)) {
+    if (blob_garbage_collection_age_cutoff >= 0 &&
+        blob_garbage_collection_age_cutoff <= 1) {
+      cf_opts->blob_garbage_collection_age_cutoff =
+          blob_garbage_collection_age_cutoff;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF + " must be >= 0 and <= 1.");
+    }
+  }
+
+  double blob_garbage_collection_force_threshold;
+  if (ParseDoubleOption(option_map_,
+                        ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD,
+                        blob_garbage_collection_force_threshold, exec_state_)) {
+    if (blob_garbage_collection_force_threshold >= 0 &&
+        blob_garbage_collection_force_threshold <= 1) {
+      cf_opts->blob_garbage_collection_force_threshold =
+          blob_garbage_collection_force_threshold;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD +
+          " must be >= 0 and <= 1.");
+    }
+  }
+
+  int blob_compaction_readahead_size;
+  if (ParseIntOption(option_map_, ARG_BLOB_COMPACTION_READAHEAD_SIZE,
+                     blob_compaction_readahead_size, exec_state_)) {
+    if (blob_compaction_readahead_size > 0) {
+      cf_opts->blob_compaction_readahead_size = blob_compaction_readahead_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_BLOB_COMPACTION_READAHEAD_SIZE + " must be > 0.");
+    }
+  }
+
+  int blob_file_starting_level;
+  if (ParseIntOption(option_map_, ARG_BLOB_FILE_STARTING_LEVEL,
+                     blob_file_starting_level, exec_state_)) {
+    if (blob_file_starting_level >= 0) {
+      cf_opts->blob_file_starting_level = blob_file_starting_level;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_BLOB_FILE_STARTING_LEVEL + " must be >= 0.");
+    }
+  }
+
+  int prepopulate_blob_cache;
+  if (ParseIntOption(option_map_, ARG_PREPOPULATE_BLOB_CACHE,
+                     prepopulate_blob_cache, exec_state_)) {
+    switch (prepopulate_blob_cache) {
+      case 0:
+        cf_opts->prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
+        break;
+      case 1:
+        cf_opts->prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+        break;
+      default:
+        exec_state_ = LDBCommandExecuteResult::Failed(
+            ARG_PREPOPULATE_BLOB_CACHE +
+            " must be 0 (disable) or 1 (flush only).");
+    }
+  }
+
+  auto itr = option_map_.find(ARG_AUTO_COMPACTION);
+  if (itr != option_map_.end()) {
+    cf_opts->disable_auto_compactions = !StringToBool(itr->second);
+  }
+
+  CompressionType compression_type;
+  if (ParseCompressionTypeOption(option_map_, ARG_COMPRESSION_TYPE,
+                                 compression_type, exec_state_)) {
+    cf_opts->compression = compression_type;
+  }
+
+  CompressionType blob_compression_type;
+  if (ParseCompressionTypeOption(option_map_, ARG_BLOB_COMPRESSION_TYPE,
+                                 blob_compression_type, exec_state_)) {
+    cf_opts->blob_compression_type = blob_compression_type;
+  }
+
+  int compression_max_dict_bytes;
+  if (ParseIntOption(option_map_, ARG_COMPRESSION_MAX_DICT_BYTES,
+                     compression_max_dict_bytes, exec_state_)) {
+    if (compression_max_dict_bytes >= 0) {
+      cf_opts->compression_opts.max_dict_bytes = compression_max_dict_bytes;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_COMPRESSION_MAX_DICT_BYTES + " must be >= 0.");
+    }
+  }
+
+  int write_buffer_size;
+  if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size,
+                     exec_state_)) {
+    if (write_buffer_size > 0) {
+      cf_opts->write_buffer_size = write_buffer_size;
+    } else {
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_WRITE_BUFFER_SIZE +
+                                                    " must be > 0.");
+    }
+  }
+
+  int file_size;
+  if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) {
+    if (file_size > 0) {
+      cf_opts->target_file_size_base = file_size;
+    } else {
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_FILE_SIZE + " must be > 0.");
+    }
+  }
+
+  int fix_prefix_len;
+  if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, fix_prefix_len,
+                     exec_state_)) {
+    if (fix_prefix_len > 0) {
+      cf_opts->prefix_extractor.reset(
+          NewFixedPrefixTransform(static_cast<size_t>(fix_prefix_len)));
+    } else {
+      exec_state_ =
+          LDBCommandExecuteResult::Failed(ARG_FIX_PREFIX_LEN + " must be > 0.");
+    }
+  }
+}
+
+// First, initializes the options state using the OPTIONS file when enabled.
+// Second, overrides the options according to the CLI arguments and the
+// specific subcommand being run.
+void LDBCommand::PrepareOptions() {
+  if (!create_if_missing_ && try_load_options_) {
+    config_options_.env = options_.env;
+    Status s = LoadLatestOptions(config_options_, db_path_, &options_,
+                                 &column_families_);
+    if (!s.ok() && !s.IsNotFound()) {
+      // Option file exists but load option file error.
+      std::string msg = s.ToString();
+      exec_state_ = LDBCommandExecuteResult::Failed(msg);
+      db_ = nullptr;
+      return;
+    }
+    if (!options_.wal_dir.empty()) {
+      if (options_.env->FileExists(options_.wal_dir).IsNotFound()) {
+        options_.wal_dir = db_path_;
+        fprintf(
+            stderr,
+            "wal_dir loaded from the option file doesn't exist. Ignore it.\n");
+      }
+    }
+
+    // If merge operator is not set, set a string append operator.
+    for (auto& cf_entry : column_families_) {
+      if (!cf_entry.options.merge_operator) {
+        cf_entry.options.merge_operator =
+            MergeOperators::CreateStringAppendOperator(':');
+      }
+    }
+  }
+
+  if (options_.env == Env::Default()) {
+    options_.env = config_options_.env;
+  }
+
+  OverrideBaseOptions();
+  if (exec_state_.IsFailed()) {
+    return;
+  }
+
+  if (column_families_.empty()) {
+    // Reads the MANIFEST to figure out what column families exist. In this
+    // case, the option overrides from the CLI argument/specific subcommand
+    // apply to all column families.
+    std::vector<std::string> cf_list;
+    Status st = DB::ListColumnFamilies(options_, db_path_, &cf_list);
+    // It is possible the DB doesn't exist yet, for "create if not
+    // existing" case. The failure is ignored here. We rely on DB::Open()
+    // to give us the correct error message for problem with opening
+    // existing DB.
+    if (st.ok() && cf_list.size() > 1) {
+      // Ignore single column family DB.
+      for (auto cf_name : cf_list) {
+        column_families_.emplace_back(cf_name, options_);
+      }
+    }
+  } else {
+    // We got column families from the OPTIONS file. In this case, the option
+    // overrides from the CLI argument/specific subcommand only apply to the
+    // column family specified by `--column_family_name`.
+    auto column_families_iter =
+        std::find_if(column_families_.begin(), column_families_.end(),
+                     [this](const ColumnFamilyDescriptor& cf_desc) {
+                       return cf_desc.name == column_family_name_;
+                     });
+    if (column_families_iter == column_families_.end()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Non-existing column family " + column_family_name_);
+      return;
+    }
+    OverrideBaseCFOptions(&column_families_iter->options);
+  }
+}
+
+bool LDBCommand::ParseKeyValue(const std::string& line, std::string* key,
+                               std::string* value, bool is_key_hex,
+                               bool is_value_hex) {
+  size_t pos = line.find(DELIM);
+  if (pos != std::string::npos) {
+    *key = line.substr(0, pos);
+    *value = line.substr(pos + strlen(DELIM));
+    if (is_key_hex) {
+      *key = HexToString(*key);
+    }
+    if (is_value_hex) {
+      *value = HexToString(*value);
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+/**
+ * Make sure that ONLY the command-line options and flags expected by this
+ * command are specified on the command-line.  Extraneous options are usually
+ * the result of user error.
+ * Returns true if all checks pass.  Else returns false, and prints an
+ * appropriate error msg to stderr.
+ */
+bool LDBCommand::ValidateCmdLineOptions() {
+  for (auto itr = option_map_.begin(); itr != option_map_.end(); ++itr) {
+    if (std::find(valid_cmd_line_options_.begin(),
+                  valid_cmd_line_options_.end(),
+                  itr->first) == valid_cmd_line_options_.end()) {
+      fprintf(stderr, "Invalid command-line option %s\n", itr->first.c_str());
+      return false;
+    }
+  }
+
+  for (std::vector<std::string>::const_iterator itr = flags_.begin();
+       itr != flags_.end(); ++itr) {
+    if (std::find(valid_cmd_line_options_.begin(),
+                  valid_cmd_line_options_.end(),
+                  *itr) == valid_cmd_line_options_.end()) {
+      fprintf(stderr, "Invalid command-line flag %s\n", itr->c_str());
+      return false;
+    }
+  }
+
+  if (!NoDBOpen() && option_map_.find(ARG_DB) == option_map_.end() &&
+      option_map_.find(ARG_PATH) == option_map_.end()) {
+    fprintf(stderr, "Either %s or %s must be specified.\n", ARG_DB.c_str(),
+            ARG_PATH.c_str());
+    return false;
+  }
+
+  return true;
+}
+
+std::string LDBCommand::HexToString(const std::string& str) {
+  std::string result;
+  std::string::size_type len = str.length();
+  if (len < 2 || str[0] != '0' || str[1] != 'x') {
+    fprintf(stderr, "Invalid hex input %s.  Must start with 0x\n", str.c_str());
+    throw "Invalid hex input";
+  }
+  if (!Slice(str.data() + 2, len - 2).DecodeHex(&result)) {
+    throw "Invalid hex input";
+  }
+  return result;
+}
+
+std::string LDBCommand::StringToHex(const std::string& str) {
+  std::string result("0x");
+  result.append(Slice(str).ToString(true));
+  return result;
+}
+
+std::string LDBCommand::PrintKeyValue(const std::string& key,
+                                      const std::string& value, bool is_key_hex,
+                                      bool is_value_hex) {
+  std::string result;
+  result.append(is_key_hex ? StringToHex(key) : key);
+  result.append(DELIM);
+  result.append(is_value_hex ? StringToHex(value) : value);
+  return result;
+}
+
+std::string LDBCommand::PrintKeyValue(const std::string& key,
+                                      const std::string& value, bool is_hex) {
+  return PrintKeyValue(key, value, is_hex, is_hex);
+}
+
+std::string LDBCommand::HelpRangeCmdArgs() {
+  std::ostringstream str_stream;
+  str_stream << " ";
+  str_stream << "[--" << ARG_FROM << "] ";
+  str_stream << "[--" << ARG_TO << "] ";
+  return str_stream.str();
+}
+
+bool LDBCommand::IsKeyHex(const std::map<std::string, std::string>& options,
+                          const std::vector<std::string>& flags) {
+  return (IsFlagPresent(flags, ARG_HEX) || IsFlagPresent(flags, ARG_KEY_HEX) ||
+          ParseBooleanOption(options, ARG_HEX, false) ||
+          ParseBooleanOption(options, ARG_KEY_HEX, false));
+}
+
+bool LDBCommand::IsValueHex(const std::map<std::string, std::string>& options,
+                            const std::vector<std::string>& flags) {
+  return (IsFlagPresent(flags, ARG_HEX) ||
+          IsFlagPresent(flags, ARG_VALUE_HEX) ||
+          ParseBooleanOption(options, ARG_HEX, false) ||
+          ParseBooleanOption(options, ARG_VALUE_HEX, false));
+}
+
+bool LDBCommand::IsTryLoadOptions(
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags) {
+  if (IsFlagPresent(flags, ARG_TRY_LOAD_OPTIONS)) {
+    return true;
+  }
+  // if `DB` is specified and not explicitly to create a new db, default
+  // `try_load_options` to true. The user could still disable that by set
+  // `try_load_options=false`.
+  // Note: Opening as TTL DB doesn't support `try_load_options`, so it's default
+  // to false. TODO: TTL_DB may need to fix that, otherwise it's unable to open
+  // DB which has incompatible setting with default options.
+  bool default_val = (options.find(ARG_DB) != options.end()) &&
+                     !IsFlagPresent(flags, ARG_CREATE_IF_MISSING) &&
+                     !IsFlagPresent(flags, ARG_TTL);
+  return ParseBooleanOption(options, ARG_TRY_LOAD_OPTIONS, default_val);
+}
+
+bool LDBCommand::ParseBooleanOption(
+    const std::map<std::string, std::string>& options,
+    const std::string& option, bool default_val) {
+  auto itr = options.find(option);
+  if (itr != options.end()) {
+    std::string option_val = itr->second;
+    return StringToBool(itr->second);
+  }
+  return default_val;
+}
+
+bool LDBCommand::StringToBool(std::string val) {
+  std::transform(val.begin(), val.end(), val.begin(),
+                 [](char ch) -> char { return (char)::tolower(ch); });
+
+  if (val == "true") {
+    return true;
+  } else if (val == "false") {
+    return false;
+  } else {
+    throw "Invalid value for boolean argument";
+  }
+}
+
+CompactorCommand::CompactorCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX,
+                                      ARG_VALUE_HEX, ARG_TTL})),
+      null_from_(true),
+      null_to_(true) {
+  auto itr = options.find(ARG_FROM);
+  if (itr != options.end()) {
+    null_from_ = false;
+    from_ = itr->second;
+  }
+
+  itr = options.find(ARG_TO);
+  if (itr != options.end()) {
+    null_to_ = false;
+    to_ = itr->second;
+  }
+
+  if (is_key_hex_) {
+    if (!null_from_) {
+      from_ = HexToString(from_);
+    }
+    if (!null_to_) {
+      to_ = HexToString(to_);
+    }
+  }
+}
+
+void CompactorCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(CompactorCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append("\n");
+}
+
+void CompactorCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  Slice* begin = nullptr;
+  Slice* end = nullptr;
+  if (!null_from_) {
+    begin = new Slice(from_);
+  }
+  if (!null_to_) {
+    end = new Slice(to_);
+  }
+
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+
+  Status s = db_->CompactRange(cro, GetCfHandle(), begin, end);
+  if (!s.ok()) {
+    std::stringstream oss;
+    oss << "Compaction failed: " << s.ToString();
+    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Succeed("");
+  }
+
+  delete begin;
+  delete end;
+}
+
+// ---------------------------------------------------------------------------
+const std::string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal";
+const std::string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load";
+const std::string DBLoaderCommand::ARG_COMPACT = "compact";
+
+DBLoaderCommand::DBLoaderCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, false,
+          BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
+                               ARG_TO, ARG_CREATE_IF_MISSING, ARG_DISABLE_WAL,
+                               ARG_BULK_LOAD, ARG_COMPACT})),
+      disable_wal_(false),
+      bulk_load_(false),
+      compact_(false) {
+  create_if_missing_ = IsFlagPresent(flags, ARG_CREATE_IF_MISSING);
+  disable_wal_ = IsFlagPresent(flags, ARG_DISABLE_WAL);
+  bulk_load_ = IsFlagPresent(flags, ARG_BULK_LOAD);
+  compact_ = IsFlagPresent(flags, ARG_COMPACT);
+}
+
+void DBLoaderCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(DBLoaderCommand::Name());
+  ret.append(" [--" + ARG_CREATE_IF_MISSING + "]");
+  ret.append(" [--" + ARG_DISABLE_WAL + "]");
+  ret.append(" [--" + ARG_BULK_LOAD + "]");
+  ret.append(" [--" + ARG_COMPACT + "]");
+  ret.append("\n");
+}
+
+void DBLoaderCommand::OverrideBaseOptions() {
+  LDBCommand::OverrideBaseOptions();
+  options_.create_if_missing = create_if_missing_;
+  if (bulk_load_) {
+    options_.PrepareForBulkLoad();
+  }
+}
+
+void DBLoaderCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  WriteOptions write_options;
+  if (disable_wal_) {
+    write_options.disableWAL = true;
+  }
+
+  int bad_lines = 0;
+  std::string line;
+  // prefer ifstream getline performance vs that from std::cin istream
+  std::ifstream ifs_stdin("/dev/stdin");
+  std::istream* istream_p = ifs_stdin.is_open() ? &ifs_stdin : &std::cin;
+  Status s;
+  while (s.ok() && getline(*istream_p, line, '\n')) {
+    std::string key;
+    std::string value;
+    if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) {
+      s = db_->Put(write_options, GetCfHandle(), Slice(key), Slice(value));
+    } else if (0 == line.find("Keys in range:")) {
+      // ignore this line
+    } else if (0 == line.find("Created bg thread 0x")) {
+      // ignore this line
+    } else {
+      bad_lines++;
+    }
+  }
+
+  if (bad_lines > 0) {
+    std::cout << "Warning: " << bad_lines << " bad lines ignored." << std::endl;
+  }
+  if (!s.ok()) {
+    std::stringstream oss;
+    oss << "Load failed: " << s.ToString();
+    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
+  }
+  if (compact_ && s.ok()) {
+    s = db_->CompactRange(CompactRangeOptions(), GetCfHandle(), nullptr,
+                          nullptr);
+  }
+  if (!s.ok()) {
+    std::stringstream oss;
+    oss << "Compaction failed: " << s.ToString();
+    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+namespace {
+
+void DumpManifestFile(Options options, std::string file, bool verbose, bool hex,
+                      bool json) {
+  EnvOptions sopt;
+  std::string dbname("dummy");
+  std::shared_ptr<Cache> tc(NewLRUCache(options.max_open_files - 10,
+                                        options.table_cache_numshardbits));
+  // Notice we are using the default options not through SanitizeOptions(),
+  // if VersionSet::DumpManifest() depends on any option done by
+  // SanitizeOptions(), we need to initialize it manually.
+  options.db_paths.emplace_back("dummy", 0);
+  options.num_levels = 64;
+  WriteController wc(options.delayed_write_rate);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  ImmutableDBOptions immutable_db_options(options);
+  VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
+                      /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                      /*db_id*/ "", /*db_session_id*/ "");
+  Status s = versions.DumpManifest(options, file, verbose, hex, json);
+  if (!s.ok()) {
+    fprintf(stderr, "Error in processing file %s %s\n", file.c_str(),
+            s.ToString().c_str());
+  }
+}
+
+}  // namespace
+
+const std::string ManifestDumpCommand::ARG_VERBOSE = "verbose";
+const std::string ManifestDumpCommand::ARG_JSON = "json";
+const std::string ManifestDumpCommand::ARG_PATH = "path";
+
+void ManifestDumpCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(ManifestDumpCommand::Name());
+  ret.append(" [--" + ARG_VERBOSE + "]");
+  ret.append(" [--" + ARG_JSON + "]");
+  ret.append(" [--" + ARG_PATH + "=<path_to_manifest_file>]");
+  ret.append("\n");
+}
+
+ManifestDumpCommand::ManifestDumpCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, false,
+          BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX, ARG_JSON})),
+      verbose_(false),
+      json_(false),
+      path_("") {
+  verbose_ = IsFlagPresent(flags, ARG_VERBOSE);
+  json_ = IsFlagPresent(flags, ARG_JSON);
+
+  auto itr = options.find(ARG_PATH);
+  if (itr != options.end()) {
+    path_ = itr->second;
+    if (path_.empty()) {
+      exec_state_ = LDBCommandExecuteResult::Failed("--path: missing pathname");
+    }
+  }
+}
+
+void ManifestDumpCommand::DoCommand() {
+  std::string manifestfile;
+
+  if (!path_.empty()) {
+    manifestfile = path_;
+  } else {
+    // We need to find the manifest file by searching the directory
+    // containing the db for files of the form MANIFEST_[0-9]+
+
+    std::vector<std::string> files;
+    Status s = options_.env->GetChildren(db_path_, &files);
+    if (!s.ok()) {
+      std::string err_msg = s.ToString();
+      err_msg.append(": Failed to list the content of ");
+      err_msg.append(db_path_);
+      exec_state_ = LDBCommandExecuteResult::Failed(err_msg);
+      return;
+    }
+    const std::string kManifestNamePrefix = "MANIFEST-";
+    std::string matched_file;
+#ifdef OS_WIN
+    const char kPathDelim = '\\';
+#else
+    const char kPathDelim = '/';
+#endif
+    for (const auto& file_path : files) {
+      // Some Env::GetChildren() return absolute paths. Some directories' path
+      // end with path delim, e.g. '/' or '\\'.
+      size_t pos = file_path.find_last_of(kPathDelim);
+      if (pos == file_path.size() - 1) {
+        continue;
+      }
+      std::string fname;
+      if (pos != std::string::npos) {
+        // Absolute path.
+        fname.assign(file_path, pos + 1, file_path.size() - pos - 1);
+      } else {
+        fname = file_path;
+      }
+      uint64_t file_num = 0;
+      FileType file_type = kWalFile;  // Just for initialization
+      if (ParseFileName(fname, &file_num, &file_type) &&
+          file_type == kDescriptorFile) {
+        if (!matched_file.empty()) {
+          exec_state_ = LDBCommandExecuteResult::Failed(
+              "Multiple MANIFEST files found; use --path to select one");
+          return;
+        } else {
+          matched_file.swap(fname);
+        }
+      }
+    }
+    if (matched_file.empty()) {
+      std::string err_msg("No MANIFEST found in ");
+      err_msg.append(db_path_);
+      exec_state_ = LDBCommandExecuteResult::Failed(err_msg);
+      return;
+    }
+    if (db_path_.back() != '/') {
+      db_path_.append("/");
+    }
+    manifestfile = db_path_ + matched_file;
+  }
+
+  if (verbose_) {
+    fprintf(stdout, "Processing Manifest file %s\n", manifestfile.c_str());
+  }
+
+  DumpManifestFile(options_, manifestfile, verbose_, is_key_hex_, json_);
+
+  if (verbose_) {
+    fprintf(stdout, "Processing Manifest file %s done\n", manifestfile.c_str());
+  }
+}
+
+// ----------------------------------------------------------------------------
+namespace {
+
+Status GetLiveFilesChecksumInfoFromVersionSet(Options options,
+                                              const std::string& db_path,
+                                              FileChecksumList* checksum_list) {
+  EnvOptions sopt;
+  Status s;
+  std::string dbname(db_path);
+  std::shared_ptr<Cache> tc(NewLRUCache(options.max_open_files - 10,
+                                        options.table_cache_numshardbits));
+  // Notice we are using the default options not through SanitizeOptions(),
+  // if VersionSet::GetLiveFilesChecksumInfo depends on any option done by
+  // SanitizeOptions(), we need to initialize it manually.
+  options.db_paths.emplace_back(db_path, 0);
+  options.num_levels = 64;
+  WriteController wc(options.delayed_write_rate);
+  WriteBufferManager wb(options.db_write_buffer_size);
+  ImmutableDBOptions immutable_db_options(options);
+  VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
+                      /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                      /*db_id*/ "", /*db_session_id*/ "");
+  std::vector<std::string> cf_name_list;
+  s = versions.ListColumnFamilies(&cf_name_list, db_path,
+                                  immutable_db_options.fs.get());
+  if (s.ok()) {
+    std::vector<ColumnFamilyDescriptor> cf_list;
+    for (const auto& name : cf_name_list) {
+      cf_list.emplace_back(name, ColumnFamilyOptions(options));
+    }
+    s = versions.Recover(cf_list, true);
+  }
+  if (s.ok()) {
+    s = versions.GetLiveFilesChecksumInfo(checksum_list);
+  }
+  return s;
+}
+
+}  // namespace
+
+const std::string FileChecksumDumpCommand::ARG_PATH = "path";
+
+void FileChecksumDumpCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(FileChecksumDumpCommand::Name());
+  ret.append(" [--" + ARG_PATH + "=<path_to_manifest_file>]");
+  ret.append("\n");
+}
+
+FileChecksumDumpCommand::FileChecksumDumpCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_PATH, ARG_HEX})),
+      path_("") {
+  auto itr = options.find(ARG_PATH);
+  if (itr != options.end()) {
+    path_ = itr->second;
+    if (path_.empty()) {
+      exec_state_ = LDBCommandExecuteResult::Failed("--path: missing pathname");
+    }
+  }
+  is_checksum_hex_ = IsFlagPresent(flags, ARG_HEX);
+}
+
+void FileChecksumDumpCommand::DoCommand() {
+  // print out the checksum information in the following format:
+  //  sst file number, checksum function name, checksum value
+  //  sst file number, checksum function name, checksum value
+  //  ......
+
+  std::unique_ptr<FileChecksumList> checksum_list(NewFileChecksumList());
+  Status s = GetLiveFilesChecksumInfoFromVersionSet(options_, db_path_,
+                                                    checksum_list.get());
+  if (s.ok() && checksum_list != nullptr) {
+    std::vector<uint64_t> file_numbers;
+    std::vector<std::string> checksums;
+    std::vector<std::string> checksum_func_names;
+    s = checksum_list->GetAllFileChecksums(&file_numbers, &checksums,
+                                           &checksum_func_names);
+    if (s.ok()) {
+      for (size_t i = 0; i < file_numbers.size(); i++) {
+        assert(i < file_numbers.size());
+        assert(i < checksums.size());
+        assert(i < checksum_func_names.size());
+        std::string checksum;
+        if (is_checksum_hex_) {
+          checksum = StringToHex(checksums[i]);
+        } else {
+          checksum = std::move(checksums[i]);
+        }
+        fprintf(stdout, "%" PRId64 ", %s, %s\n", file_numbers[i],
+                checksum_func_names[i].c_str(), checksum.c_str());
+      }
+      fprintf(stdout, "Print SST file checksum information finished \n");
+    }
+  }
+
+  if (!s.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(s.ToString());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+void GetPropertyCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(GetPropertyCommand::Name());
+  ret.append(" <property_name>");
+  ret.append("\n");
+}
+
+GetPropertyCommand::GetPropertyCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {
+  if (params.size() != 1) {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("property name must be specified");
+  } else {
+    property_ = params[0];
+  }
+}
+
+void GetPropertyCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  std::map<std::string, std::string> value_map;
+  std::string value;
+
+  // Rather than having different ldb command for map properties vs. string
+  // properties, we simply try Map property first. (This order only chosen
+  // because I prefer the map-style output for
+  // "rocksdb.aggregated-table-properties".)
+  if (db_->GetMapProperty(GetCfHandle(), property_, &value_map)) {
+    if (value_map.empty()) {
+      fprintf(stdout, "%s: <empty map>\n", property_.c_str());
+    } else {
+      for (auto& e : value_map) {
+        fprintf(stdout, "%s.%s: %s\n", property_.c_str(), e.first.c_str(),
+                e.second.c_str());
+      }
+    }
+  } else if (db_->GetProperty(GetCfHandle(), property_, &value)) {
+    fprintf(stdout, "%s: %s\n", property_.c_str(), value.c_str());
+  } else {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("failed to get property: " + property_);
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+void ListColumnFamiliesCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(ListColumnFamiliesCommand::Name());
+  ret.append("\n");
+}
+
+ListColumnFamiliesCommand::ListColumnFamiliesCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {}
+
+void ListColumnFamiliesCommand::DoCommand() {
+  std::vector<std::string> column_families;
+  Status s = DB::ListColumnFamilies(options_, db_path_, &column_families);
+  if (!s.ok()) {
+    fprintf(stderr, "Error in processing db %s %s\n", db_path_.c_str(),
+            s.ToString().c_str());
+  } else {
+    fprintf(stdout, "Column families in %s: \n{", db_path_.c_str());
+    bool first = true;
+    for (auto cf : column_families) {
+      if (!first) {
+        fprintf(stdout, ", ");
+      }
+      first = false;
+      fprintf(stdout, "%s", cf.c_str());
+    }
+    fprintf(stdout, "}\n");
+  }
+}
+
+void CreateColumnFamilyCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(CreateColumnFamilyCommand::Name());
+  ret.append(" --db=<db_path> <new_column_family_name>");
+  ret.append("\n");
+}
+
+CreateColumnFamilyCommand::CreateColumnFamilyCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true, {ARG_DB}) {
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "new column family name must be specified");
+  } else {
+    new_cf_name_ = params[0];
+  }
+}
+
+void CreateColumnFamilyCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  ColumnFamilyHandle* new_cf_handle = nullptr;
+  Status st = db_->CreateColumnFamily(options_, new_cf_name_, &new_cf_handle);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Fail to create new column family: " + st.ToString());
+  }
+  delete new_cf_handle;
+  CloseDB();
+}
+
+void DropColumnFamilyCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(DropColumnFamilyCommand::Name());
+  ret.append(" --db=<db_path> <column_family_name_to_drop>");
+  ret.append("\n");
+}
+
+DropColumnFamilyCommand::DropColumnFamilyCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true, {ARG_DB}) {
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "The name of column family to drop must be specified");
+  } else {
+    cf_name_to_drop_ = params[0];
+  }
+}
+
+void DropColumnFamilyCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  auto iter = cf_handles_.find(cf_name_to_drop_);
+  if (iter == cf_handles_.end()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Column family: " + cf_name_to_drop_ + " doesn't exist in db.");
+    return;
+  }
+  ColumnFamilyHandle* cf_handle_to_drop = iter->second;
+  Status st = db_->DropColumnFamily(cf_handle_to_drop);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Fail to drop column family: " + st.ToString());
+  }
+  CloseDB();
+}
+
+// ----------------------------------------------------------------------------
+namespace {
+
+// This function only called when it's the sane case of >1 buckets in time-range
+// Also called only when timekv falls between ttl_start and ttl_end provided
+void IncBucketCounts(std::vector<uint64_t>& bucket_counts, int ttl_start,
+                     int time_range, int bucket_size, int timekv,
+                     int num_buckets) {
+#ifdef NDEBUG
+  (void)time_range;
+  (void)num_buckets;
+#endif
+  assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 &&
+         timekv < (ttl_start + time_range) && num_buckets > 1);
+  int bucket = (timekv - ttl_start) / bucket_size;
+  bucket_counts[bucket]++;
+}
+
+void PrintBucketCounts(const std::vector<uint64_t>& bucket_counts,
+                       int ttl_start, int ttl_end, int bucket_size,
+                       int num_buckets) {
+  int time_point = ttl_start;
+  for (int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) {
+    fprintf(stdout, "Keys in range %s to %s : %lu\n",
+            TimeToHumanString(time_point).c_str(),
+            TimeToHumanString(time_point + bucket_size).c_str(),
+            (unsigned long)bucket_counts[i]);
+  }
+  fprintf(stdout, "Keys in range %s to %s : %lu\n",
+          TimeToHumanString(time_point).c_str(),
+          TimeToHumanString(ttl_end).c_str(),
+          (unsigned long)bucket_counts[num_buckets - 1]);
+}
+
+}  // namespace
+
+const std::string InternalDumpCommand::ARG_COUNT_ONLY = "count_only";
+const std::string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim";
+const std::string InternalDumpCommand::ARG_STATS = "stats";
+const std::string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex";
+
+InternalDumpCommand::InternalDumpCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true,
+                 BuildCmdLineOptions(
+                     {ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO,
+                      ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
+                      ARG_INPUT_KEY_HEX, ARG_DECODE_BLOB_INDEX})),
+      has_from_(false),
+      has_to_(false),
+      max_keys_(-1),
+      delim_("."),
+      count_only_(false),
+      count_delim_(false),
+      print_stats_(false),
+      is_input_key_hex_(false),
+      decode_blob_index_(false) {
+  has_from_ = ParseStringOption(options, ARG_FROM, &from_);
+  has_to_ = ParseStringOption(options, ARG_TO, &to_);
+
+  ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_);
+  auto itr = options.find(ARG_COUNT_DELIM);
+  if (itr != options.end()) {
+    delim_ = itr->second;
+    count_delim_ = true;
+    // fprintf(stdout,"delim = %c\n",delim_[0]);
+  } else {
+    count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM);
+    delim_ = ".";
+  }
+
+  print_stats_ = IsFlagPresent(flags, ARG_STATS);
+  count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY);
+  is_input_key_hex_ = IsFlagPresent(flags, ARG_INPUT_KEY_HEX);
+  decode_blob_index_ = IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX);
+
+  if (is_input_key_hex_) {
+    if (has_from_) {
+      from_ = HexToString(from_);
+    }
+    if (has_to_) {
+      to_ = HexToString(to_);
+    }
+  }
+}
+
+void InternalDumpCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(InternalDumpCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_INPUT_KEY_HEX + "]");
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
+  ret.append(" [--" + ARG_COUNT_ONLY + "]");
+  ret.append(" [--" + ARG_COUNT_DELIM + "=<char>]");
+  ret.append(" [--" + ARG_STATS + "]");
+  ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "]");
+  ret.append("\n");
+}
+
+void InternalDumpCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  if (print_stats_) {
+    std::string stats;
+    if (db_->GetProperty(GetCfHandle(), "rocksdb.stats", &stats)) {
+      fprintf(stdout, "%s\n", stats.c_str());
+    }
+  }
+
+  // Cast as DBImpl to get internal iterator
+  std::vector<KeyVersion> key_versions;
+  Status st = GetAllKeyVersions(db_, GetCfHandle(), from_, to_, max_keys_,
+                                &key_versions);
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+    return;
+  }
+  std::string rtype1, rtype2, row, val;
+  rtype2 = "";
+  uint64_t c = 0;
+  uint64_t s1 = 0, s2 = 0;
+
+  long long count = 0;
+  for (auto& key_version : key_versions) {
+    ValueType value_type = static_cast<ValueType>(key_version.type);
+    InternalKey ikey(key_version.user_key, key_version.sequence, value_type);
+    if (has_to_ && ikey.user_key() == to_) {
+      // GetAllKeyVersions() includes keys with user key `to_`, but idump has
+      // traditionally excluded such keys.
+      break;
+    }
+    ++count;
+    int k;
+    if (count_delim_) {
+      rtype1 = "";
+      s1 = 0;
+      row = ikey.Encode().ToString();
+      val = key_version.value;
+      for (k = 0; row[k] != '\x01' && row[k] != '\0'; k++) s1++;
+      for (k = 0; val[k] != '\x01' && val[k] != '\0'; k++) s1++;
+      for (int j = 0; row[j] != delim_[0] && row[j] != '\0' && row[j] != '\x01';
+           j++)
+        rtype1 += row[j];
+      if (rtype2.compare("") && rtype2.compare(rtype1) != 0) {
+        fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
+                rtype2.c_str(), c, s2);
+        c = 1;
+        s2 = s1;
+        rtype2 = rtype1;
+      } else {
+        c++;
+        s2 += s1;
+        rtype2 = rtype1;
+      }
+    }
+
+    if (!count_only_ && !count_delim_) {
+      std::string key = ikey.DebugString(is_key_hex_);
+      Slice value(key_version.value);
+      if (!decode_blob_index_ || value_type != kTypeBlobIndex) {
+        fprintf(stdout, "%s => %s\n", key.c_str(),
+                value.ToString(is_value_hex_).c_str());
+      } else {
+        BlobIndex blob_index;
+
+        const Status s = blob_index.DecodeFrom(value);
+        if (!s.ok()) {
+          fprintf(stderr, "%s => error decoding blob index =>\n", key.c_str());
+        } else {
+          fprintf(stdout, "%s => %s\n", key.c_str(),
+                  blob_index.DebugString(is_value_hex_).c_str());
+        }
+      }
+    }
+
+    // Terminate if maximum number of keys have been dumped
+    if (max_keys_ > 0 && count >= max_keys_) break;
+  }
+  if (count_delim_) {
+    fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
+            rtype2.c_str(), c, s2);
+  } else {
+    fprintf(stdout, "Internal keys in range: %lld\n", count);
+  }
+}
+
+const std::string DBDumperCommand::ARG_COUNT_ONLY = "count_only";
+const std::string DBDumperCommand::ARG_COUNT_DELIM = "count_delim";
+const std::string DBDumperCommand::ARG_STATS = "stats";
+const std::string DBDumperCommand::ARG_TTL_BUCKET = "bucket";
+
+DBDumperCommand::DBDumperCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, true,
+          BuildCmdLineOptions(
+              {ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO,
+               ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
+               ARG_TTL_START, ARG_TTL_END, ARG_TTL_BUCKET, ARG_TIMESTAMP,
+               ARG_PATH, ARG_DECODE_BLOB_INDEX, ARG_DUMP_UNCOMPRESSED_BLOBS})),
+      null_from_(true),
+      null_to_(true),
+      max_keys_(-1),
+      count_only_(false),
+      count_delim_(false),
+      print_stats_(false),
+      decode_blob_index_(false) {
+  auto itr = options.find(ARG_FROM);
+  if (itr != options.end()) {
+    null_from_ = false;
+    from_ = itr->second;
+  }
+
+  itr = options.find(ARG_TO);
+  if (itr != options.end()) {
+    null_to_ = false;
+    to_ = itr->second;
+  }
+
+  itr = options.find(ARG_MAX_KEYS);
+  if (itr != options.end()) {
+    try {
+#if defined(CYGWIN)
+      max_keys_ = strtol(itr->second.c_str(), 0, 10);
+#else
+      max_keys_ = std::stoi(itr->second);
+#endif
+    } catch (const std::invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
+                                                    " has an invalid value");
+    } catch (const std::out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_MAX_KEYS + " has a value out-of-range");
+    }
+  }
+  itr = options.find(ARG_COUNT_DELIM);
+  if (itr != options.end()) {
+    delim_ = itr->second;
+    count_delim_ = true;
+  } else {
+    count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM);
+    delim_ = ".";
+  }
+
+  print_stats_ = IsFlagPresent(flags, ARG_STATS);
+  count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY);
+  decode_blob_index_ = IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX);
+  dump_uncompressed_blobs_ = IsFlagPresent(flags, ARG_DUMP_UNCOMPRESSED_BLOBS);
+
+  if (is_key_hex_) {
+    if (!null_from_) {
+      from_ = HexToString(from_);
+    }
+    if (!null_to_) {
+      to_ = HexToString(to_);
+    }
+  }
+
+  itr = options.find(ARG_PATH);
+  if (itr != options.end()) {
+    path_ = itr->second;
+    if (db_path_.empty()) {
+      db_path_ = path_;
+    }
+  }
+}
+
+void DBDumperCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(DBDumperCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
+  ret.append(" [--" + ARG_TIMESTAMP + "]");
+  ret.append(" [--" + ARG_COUNT_ONLY + "]");
+  ret.append(" [--" + ARG_COUNT_DELIM + "=<char>]");
+  ret.append(" [--" + ARG_STATS + "]");
+  ret.append(" [--" + ARG_TTL_BUCKET + "=<N>]");
+  ret.append(" [--" + ARG_TTL_START + "=<N>:- is inclusive]");
+  ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
+  ret.append(" [--" + ARG_PATH + "=<path_to_a_file>]");
+  ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "]");
+  ret.append(" [--" + ARG_DUMP_UNCOMPRESSED_BLOBS + "]");
+  ret.append("\n");
+}
+
+/**
+ * Handles two separate cases:
+ *
+ * 1) --db is specified - just dump the database.
+ *
+ * 2) --path is specified - determine based on file extension what dumping
+ *    function to call. Please note that we intentionally use the extension
+ *    and avoid probing the file contents under the assumption that renaming
+ *    the files is not a supported scenario.
+ *
+ */
+void DBDumperCommand::DoCommand() {
+  if (!db_) {
+    assert(!path_.empty());
+    std::string fileName = GetFileNameFromPath(path_);
+    uint64_t number;
+    FileType type;
+
+    exec_state_ = LDBCommandExecuteResult::Succeed("");
+
+    if (!ParseFileName(fileName, &number, &type)) {
+      exec_state_ =
+          LDBCommandExecuteResult::Failed("Can't parse file type: " + path_);
+      return;
+    }
+
+    switch (type) {
+      case kWalFile:
+        // TODO(myabandeh): allow configuring is_write_commited
+        DumpWalFile(options_, path_, /* print_header_ */ true,
+                    /* print_values_ */ true, true /* is_write_commited */,
+                    &exec_state_);
+        break;
+      case kTableFile:
+        DumpSstFile(options_, path_, is_key_hex_, /* show_properties */ true,
+                    decode_blob_index_, from_, to_);
+        break;
+      case kDescriptorFile:
+        DumpManifestFile(options_, path_, /* verbose_ */ false, is_key_hex_,
+                         /*  json_ */ false);
+        break;
+      case kBlobFile:
+        DumpBlobFile(path_, is_key_hex_, is_value_hex_,
+                     dump_uncompressed_blobs_);
+        break;
+      default:
+        exec_state_ = LDBCommandExecuteResult::Failed(
+            "File type not supported: " + path_);
+        break;
+    }
+
+  } else {
+    DoDumpCommand();
+  }
+}
+
+void DBDumperCommand::DoDumpCommand() {
+  assert(nullptr != db_);
+  assert(path_.empty());
+
+  // Parse command line args
+  uint64_t count = 0;
+  if (print_stats_) {
+    std::string stats;
+    if (db_->GetProperty("rocksdb.stats", &stats)) {
+      fprintf(stdout, "%s\n", stats.c_str());
+    }
+  }
+
+  // Setup key iterator
+  ReadOptions scan_read_opts;
+  scan_read_opts.total_order_seek = true;
+  Iterator* iter = db_->NewIterator(scan_read_opts, GetCfHandle());
+  Status st = iter->status();
+  if (!st.ok()) {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Iterator error." + st.ToString());
+  }
+
+  if (!null_from_) {
+    iter->Seek(from_);
+  } else {
+    iter->SeekToFirst();
+  }
+
+  int max_keys = max_keys_;
+  int ttl_start;
+  if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) {
+    ttl_start = DBWithTTLImpl::kMinTimestamp;  // TTL introduction time
+  }
+  int ttl_end;
+  if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) {
+    ttl_end = DBWithTTLImpl::kMaxTimestamp;  // Max time allowed by TTL feature
+  }
+  if (ttl_end < ttl_start) {
+    fprintf(stderr, "Error: End time can't be less than start time\n");
+    delete iter;
+    return;
+  }
+  int time_range = ttl_end - ttl_start;
+  int bucket_size;
+  if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) ||
+      bucket_size <= 0) {
+    bucket_size = time_range;  // Will have just 1 bucket by default
+  }
+  // cretaing variables for row count of each type
+  std::string rtype1, rtype2, row, val;
+  rtype2 = "";
+  uint64_t c = 0;
+  uint64_t s1 = 0, s2 = 0;
+
+  // At this point, bucket_size=0 => time_range=0
+  int num_buckets = (bucket_size >= time_range)
+                        ? 1
+                        : ((time_range + bucket_size - 1) / bucket_size);
+  std::vector<uint64_t> bucket_counts(num_buckets, 0);
+  if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) {
+    fprintf(stdout, "Dumping key-values from %s to %s\n",
+            TimeToHumanString(ttl_start).c_str(),
+            TimeToHumanString(ttl_end).c_str());
+  }
+
+  HistogramImpl vsize_hist;
+
+  for (; iter->Valid(); iter->Next()) {
+    int rawtime = 0;
+    // If end marker was specified, we stop before it
+    if (!null_to_ && (iter->key().ToString() >= to_)) break;
+    // Terminate if maximum number of keys have been dumped
+    if (max_keys == 0) break;
+    if (is_db_ttl_) {
+      TtlIterator* it_ttl = static_cast_with_check<TtlIterator>(iter);
+      rawtime = it_ttl->ttl_timestamp();
+      if (rawtime < ttl_start || rawtime >= ttl_end) {
+        continue;
+      }
+    }
+    if (max_keys > 0) {
+      --max_keys;
+    }
+    if (is_db_ttl_ && num_buckets > 1) {
+      IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size,
+                      rawtime, num_buckets);
+    }
+    ++count;
+    if (count_delim_) {
+      rtype1 = "";
+      row = iter->key().ToString();
+      val = iter->value().ToString();
+      s1 = row.size() + val.size();
+      for (int j = 0; row[j] != delim_[0] && row[j] != '\0'; j++)
+        rtype1 += row[j];
+      if (rtype2.compare("") && rtype2.compare(rtype1) != 0) {
+        fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
+                rtype2.c_str(), c, s2);
+        c = 1;
+        s2 = s1;
+        rtype2 = rtype1;
+      } else {
+        c++;
+        s2 += s1;
+        rtype2 = rtype1;
+      }
+    }
+
+    if (count_only_) {
+      vsize_hist.Add(iter->value().size());
+    }
+
+    if (!count_only_ && !count_delim_) {
+      if (is_db_ttl_ && timestamp_) {
+        fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str());
+      }
+      std::string str =
+          PrintKeyValue(iter->key().ToString(), iter->value().ToString(),
+                        is_key_hex_, is_value_hex_);
+      fprintf(stdout, "%s\n", str.c_str());
+    }
+  }
+
+  if (num_buckets > 1 && is_db_ttl_) {
+    PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size,
+                      num_buckets);
+  } else if (count_delim_) {
+    fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n",
+            rtype2.c_str(), c, s2);
+  } else {
+    fprintf(stdout, "Keys in range: %" PRIu64 "\n", count);
+  }
+
+  if (count_only_) {
+    fprintf(stdout, "Value size distribution: \n");
+    fprintf(stdout, "%s\n", vsize_hist.ToString().c_str());
+  }
+  // Clean up
+  delete iter;
+}
+
+const std::string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels";
+const std::string ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS =
+    "print_old_levels";
+
+ReduceDBLevelsCommand::ReduceDBLevelsCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})),
+      old_levels_(1 << 7),
+      new_levels_(-1),
+      print_old_levels_(false) {
+  ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_);
+  print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS);
+
+  if (new_levels_ <= 0) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n");
+  }
+}
+
+std::vector<std::string> ReduceDBLevelsCommand::PrepareArgs(
+    const std::string& db_path, int new_levels, bool print_old_level) {
+  std::vector<std::string> ret;
+  ret.push_back("reduce_levels");
+  ret.push_back("--" + ARG_DB + "=" + db_path);
+  ret.push_back("--" + ARG_NEW_LEVELS + "=" + std::to_string(new_levels));
+  if (print_old_level) {
+    ret.push_back("--" + ARG_PRINT_OLD_LEVELS);
+  }
+  return ret;
+}
+
+void ReduceDBLevelsCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(ReduceDBLevelsCommand::Name());
+  ret.append(" --" + ARG_NEW_LEVELS + "=<New number of levels>");
+  ret.append(" [--" + ARG_PRINT_OLD_LEVELS + "]");
+  ret.append("\n");
+}
+
+void ReduceDBLevelsCommand::OverrideBaseCFOptions(
+    ColumnFamilyOptions* cf_opts) {
+  LDBCommand::OverrideBaseCFOptions(cf_opts);
+  cf_opts->num_levels = old_levels_;
+  cf_opts->max_bytes_for_level_multiplier_additional.resize(cf_opts->num_levels,
+                                                            1);
+  // Disable size compaction
+  cf_opts->max_bytes_for_level_base = 1ULL << 50;
+  cf_opts->max_bytes_for_level_multiplier = 1;
+}
+
+Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) {
+  ImmutableDBOptions db_options(opt);
+  EnvOptions soptions;
+  std::shared_ptr<Cache> tc(
+      NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits));
+  const InternalKeyComparator cmp(opt.comparator);
+  WriteController wc(opt.delayed_write_rate);
+  WriteBufferManager wb(opt.db_write_buffer_size);
+  VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc,
+                      /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+                      /*db_id*/ "", /*db_session_id*/ "");
+  std::vector<ColumnFamilyDescriptor> dummy;
+  ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+                                          ColumnFamilyOptions(opt));
+  dummy.push_back(dummy_descriptor);
+  // We rely the VersionSet::Recover to tell us the internal data structures
+  // in the db. And the Recover() should never do any change
+  // (like LogAndApply) to the manifest file.
+  Status st = versions.Recover(dummy);
+  if (!st.ok()) {
+    return st;
+  }
+  int max = -1;
+  auto default_cfd = versions.GetColumnFamilySet()->GetDefault();
+  for (int i = 0; i < default_cfd->NumberLevels(); i++) {
+    if (default_cfd->current()->storage_info()->NumLevelFiles(i)) {
+      max = i;
+    }
+  }
+
+  *levels = max + 1;
+  return st;
+}
+
+void ReduceDBLevelsCommand::DoCommand() {
+  if (new_levels_ <= 1) {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("Invalid number of levels.\n");
+    return;
+  }
+
+  Status st;
+  PrepareOptions();
+  int old_level_num = -1;
+  st = GetOldNumOfLevels(options_, &old_level_num);
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+    return;
+  }
+
+  if (print_old_levels_) {
+    fprintf(stdout, "The old number of levels in use is %d\n", old_level_num);
+  }
+
+  if (old_level_num <= new_levels_) {
+    return;
+  }
+
+  old_levels_ = old_level_num;
+
+  OpenDB();
+  if (exec_state_.IsFailed()) {
+    return;
+  }
+  assert(db_ != nullptr);
+  // Compact the whole DB to put all files to the highest level.
+  fprintf(stdout, "Compacting the db...\n");
+  st =
+      db_->CompactRange(CompactRangeOptions(), GetCfHandle(), nullptr, nullptr);
+
+  CloseDB();
+
+  if (st.ok()) {
+    EnvOptions soptions;
+    st = VersionSet::ReduceNumberOfLevels(db_path_, &options_, soptions,
+                                          new_levels_);
+  }
+  if (!st.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+    return;
+  }
+}
+
+const std::string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE =
+    "old_compaction_style";
+const std::string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE =
+    "new_compaction_style";
+
+ChangeCompactionStyleCommand::ChangeCompactionStyleCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions(
+                     {ARG_OLD_COMPACTION_STYLE, ARG_NEW_COMPACTION_STYLE})),
+      old_compaction_style_(-1),
+      new_compaction_style_(-1) {
+  ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_,
+                 exec_state_);
+  if (old_compaction_style_ != kCompactionStyleLevel &&
+      old_compaction_style_ != kCompactionStyleUniversal) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " +
+        "style. Check ldb help for proper compaction style value.\n");
+    return;
+  }
+
+  ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_,
+                 exec_state_);
+  if (new_compaction_style_ != kCompactionStyleLevel &&
+      new_compaction_style_ != kCompactionStyleUniversal) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " +
+        "style. Check ldb help for proper compaction style value.\n");
+    return;
+  }
+
+  if (new_compaction_style_ == old_compaction_style_) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Old compaction style is the same as new compaction style. "
+        "Nothing to do.\n");
+    return;
+  }
+
+  if (old_compaction_style_ == kCompactionStyleUniversal &&
+      new_compaction_style_ == kCompactionStyleLevel) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Convert from universal compaction to level compaction. "
+        "Nothing to do.\n");
+    return;
+  }
+}
+
+void ChangeCompactionStyleCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(ChangeCompactionStyleCommand::Name());
+  ret.append(" --" + ARG_OLD_COMPACTION_STYLE + "=<Old compaction style: 0 " +
+             "for level compaction, 1 for universal compaction>");
+  ret.append(" --" + ARG_NEW_COMPACTION_STYLE + "=<New compaction style: 0 " +
+             "for level compaction, 1 for universal compaction>");
+  ret.append("\n");
+}
+
+void ChangeCompactionStyleCommand::OverrideBaseCFOptions(
+    ColumnFamilyOptions* cf_opts) {
+  LDBCommand::OverrideBaseCFOptions(cf_opts);
+  if (old_compaction_style_ == kCompactionStyleLevel &&
+      new_compaction_style_ == kCompactionStyleUniversal) {
+    // In order to convert from level compaction to universal compaction, we
+    // need to compact all data into a single file and move it to level 0.
+    cf_opts->disable_auto_compactions = true;
+    cf_opts->target_file_size_base = INT_MAX;
+    cf_opts->target_file_size_multiplier = 1;
+    cf_opts->max_bytes_for_level_base = INT_MAX;
+    cf_opts->max_bytes_for_level_multiplier = 1;
+  }
+}
+
+void ChangeCompactionStyleCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  // print db stats before we have made any change
+  std::string property;
+  std::string files_per_level;
+  for (int i = 0; i < db_->NumberLevels(GetCfHandle()); i++) {
+    db_->GetProperty(GetCfHandle(),
+                     "rocksdb.num-files-at-level" + std::to_string(i),
+                     &property);
+
+    // format print string
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str());
+    files_per_level += buf;
+  }
+  fprintf(stdout, "files per level before compaction: %s\n",
+          files_per_level.c_str());
+
+  // manual compact into a single file and move the file to level 0
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 0;
+  Status s =
+      db_->CompactRange(compact_options, GetCfHandle(), nullptr, nullptr);
+  if (!s.ok()) {
+    std::stringstream oss;
+    oss << "Compaction failed: " << s.ToString();
+    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
+    return;
+  }
+
+  // verify compaction result
+  files_per_level = "";
+  int num_files = 0;
+  for (int i = 0; i < db_->NumberLevels(GetCfHandle()); i++) {
+    db_->GetProperty(GetCfHandle(),
+                     "rocksdb.num-files-at-level" + std::to_string(i),
+                     &property);
+
+    // format print string
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str());
+    files_per_level += buf;
+
+    num_files = atoi(property.c_str());
+
+    // level 0 should have only 1 file
+    if (i == 0 && num_files != 1) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Number of db files at "
+          "level 0 after compaction is " +
+          std::to_string(num_files) + ", not 1.\n");
+      return;
+    }
+    // other levels should have no file
+    if (i > 0 && num_files != 0) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Number of db files at "
+          "level " +
+          std::to_string(i) + " after compaction is " +
+          std::to_string(num_files) + ", not 0.\n");
+      return;
+    }
+  }
+
+  fprintf(stdout, "files per level after compaction: %s\n",
+          files_per_level.c_str());
+}
+
+// ----------------------------------------------------------------------------
+
+namespace {
+
+struct StdErrReporter : public log::Reader::Reporter {
+  void Corruption(size_t /*bytes*/, const Status& s) override {
+    std::cerr << "Corruption detected in log file " << s.ToString() << "\n";
+  }
+};
+
+class InMemoryHandler : public WriteBatch::Handler {
+ public:
+  InMemoryHandler(std::stringstream& row, bool print_values,
+                  bool write_after_commit = false)
+      : Handler(),
+        row_(row),
+        print_values_(print_values),
+        write_after_commit_(write_after_commit) {}
+
+  void commonPutMerge(const Slice& key, const Slice& value) {
+    std::string k = LDBCommand::StringToHex(key.ToString());
+    if (print_values_) {
+      std::string v = LDBCommand::StringToHex(value.ToString());
+      row_ << k << " : ";
+      row_ << v << " ";
+    } else {
+      row_ << k << " ";
+    }
+  }
+
+  Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+    row_ << "PUT(" << cf << ") : ";
+    commonPutMerge(key, value);
+    return Status::OK();
+  }
+
+  Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+    row_ << "MERGE(" << cf << ") : ";
+    commonPutMerge(key, value);
+    return Status::OK();
+  }
+
+  Status MarkNoop(bool) override {
+    row_ << "NOOP ";
+    return Status::OK();
+  }
+
+  Status DeleteCF(uint32_t cf, const Slice& key) override {
+    row_ << "DELETE(" << cf << ") : ";
+    row_ << LDBCommand::StringToHex(key.ToString()) << " ";
+    return Status::OK();
+  }
+
+  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+    row_ << "SINGLE_DELETE(" << cf << ") : ";
+    row_ << LDBCommand::StringToHex(key.ToString()) << " ";
+    return Status::OK();
+  }
+
+  Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+                       const Slice& end_key) override {
+    row_ << "DELETE_RANGE(" << cf << ") : ";
+    row_ << LDBCommand::StringToHex(begin_key.ToString()) << " ";
+    row_ << LDBCommand::StringToHex(end_key.ToString()) << " ";
+    return Status::OK();
+  }
+
+  Status MarkBeginPrepare(bool unprepare) override {
+    row_ << "BEGIN_PREPARE(";
+    row_ << (unprepare ? "true" : "false") << ") ";
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice& xid) override {
+    row_ << "END_PREPARE(";
+    row_ << LDBCommand::StringToHex(xid.ToString()) << ") ";
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice& xid) override {
+    row_ << "ROLLBACK(";
+    row_ << LDBCommand::StringToHex(xid.ToString()) << ") ";
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice& xid) override {
+    row_ << "COMMIT(";
+    row_ << LDBCommand::StringToHex(xid.ToString()) << ") ";
+    return Status::OK();
+  }
+
+  Status MarkCommitWithTimestamp(const Slice& xid,
+                                 const Slice& commit_ts) override {
+    row_ << "COMMIT_WITH_TIMESTAMP(";
+    row_ << LDBCommand::StringToHex(xid.ToString()) << ", ";
+    row_ << LDBCommand::StringToHex(commit_ts.ToString()) << ") ";
+    return Status::OK();
+  }
+
+  ~InMemoryHandler() override {}
+
+ protected:
+  Handler::OptionState WriteAfterCommit() const override {
+    return write_after_commit_ ? Handler::OptionState::kEnabled
+                               : Handler::OptionState::kDisabled;
+  }
+
+ private:
+  std::stringstream& row_;
+  bool print_values_;
+  bool write_after_commit_;
+};
+
+void DumpWalFile(Options options, std::string wal_file, bool print_header,
+                 bool print_values, bool is_write_committed,
+                 LDBCommandExecuteResult* exec_state) {
+  const auto& fs = options.env->GetFileSystem();
+  FileOptions soptions(options);
+  std::unique_ptr<SequentialFileReader> wal_file_reader;
+  Status status = SequentialFileReader::Create(
+      fs, wal_file, soptions, &wal_file_reader, nullptr /* dbg */,
+      nullptr /* rate_limiter */);
+  if (!status.ok()) {
+    if (exec_state) {
+      *exec_state = LDBCommandExecuteResult::Failed("Failed to open WAL file " +
+                                                    status.ToString());
+    } else {
+      std::cerr << "Error: Failed to open WAL file " << status.ToString()
+                << std::endl;
+    }
+  } else {
+    StdErrReporter reporter;
+    uint64_t log_number;
+    FileType type;
+
+    // we need the log number, but ParseFilename expects dbname/NNN.log.
+    std::string sanitized = wal_file;
+    size_t lastslash = sanitized.rfind('/');
+    if (lastslash != std::string::npos)
+      sanitized = sanitized.substr(lastslash + 1);
+    if (!ParseFileName(sanitized, &log_number, &type)) {
+      // bogus input, carry on as best we can
+      log_number = 0;
+    }
+    log::Reader reader(options.info_log, std::move(wal_file_reader), &reporter,
+                       true /* checksum */, log_number);
+    std::string scratch;
+    WriteBatch batch;
+    Slice record;
+    std::stringstream row;
+    if (print_header) {
+      std::cout << "Sequence,Count,ByteSize,Physical Offset,Key(s)";
+      if (print_values) {
+        std::cout << " : value ";
+      }
+      std::cout << "\n";
+    }
+    while (status.ok() && reader.ReadRecord(&record, &scratch)) {
+      row.str("");
+      if (record.size() < WriteBatchInternal::kHeader) {
+        reporter.Corruption(record.size(),
+                            Status::Corruption("log record too small"));
+      } else {
+        status = WriteBatchInternal::SetContents(&batch, record);
+        if (!status.ok()) {
+          std::stringstream oss;
+          oss << "Parsing write batch failed: " << status.ToString();
+          if (exec_state) {
+            *exec_state = LDBCommandExecuteResult::Failed(oss.str());
+          } else {
+            std::cerr << oss.str() << std::endl;
+          }
+          break;
+        }
+        row << WriteBatchInternal::Sequence(&batch) << ",";
+        row << WriteBatchInternal::Count(&batch) << ",";
+        row << WriteBatchInternal::ByteSize(&batch) << ",";
+        row << reader.LastRecordOffset() << ",";
+        InMemoryHandler handler(row, print_values, is_write_committed);
+        status = batch.Iterate(&handler);
+        if (!status.ok()) {
+          if (exec_state) {
+            std::stringstream oss;
+            oss << "Print write batch error: " << status.ToString();
+            *exec_state = LDBCommandExecuteResult::Failed(oss.str());
+          }
+          row << "error: " << status.ToString();
+          break;
+        }
+        row << "\n";
+      }
+      std::cout << row.str();
+    }
+  }
+}
+
+}  // namespace
+
+const std::string WALDumperCommand::ARG_WAL_FILE = "walfile";
+const std::string WALDumperCommand::ARG_WRITE_COMMITTED = "write_committed";
+const std::string WALDumperCommand::ARG_PRINT_VALUE = "print_value";
+const std::string WALDumperCommand::ARG_PRINT_HEADER = "header";
+
+WALDumperCommand::WALDumperCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true,
+                 BuildCmdLineOptions({ARG_WAL_FILE, ARG_WRITE_COMMITTED,
+                                      ARG_PRINT_HEADER, ARG_PRINT_VALUE})),
+      print_header_(false),
+      print_values_(false),
+      is_write_committed_(false) {
+  wal_file_.clear();
+
+  auto itr = options.find(ARG_WAL_FILE);
+  if (itr != options.end()) {
+    wal_file_ = itr->second;
+  }
+
+  print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER);
+  print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE);
+  is_write_committed_ = ParseBooleanOption(options, ARG_WRITE_COMMITTED, true);
+
+  if (wal_file_.empty()) {
+    exec_state_ = LDBCommandExecuteResult::Failed("Argument " + ARG_WAL_FILE +
+                                                  " must be specified.");
+  }
+}
+
+void WALDumperCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(WALDumperCommand::Name());
+  ret.append(" --" + ARG_WAL_FILE + "=<write_ahead_log_file_path>");
+  ret.append(" [--" + ARG_PRINT_HEADER + "] ");
+  ret.append(" [--" + ARG_PRINT_VALUE + "] ");
+  ret.append(" [--" + ARG_WRITE_COMMITTED + "=true|false] ");
+  ret.append("\n");
+}
+
+void WALDumperCommand::DoCommand() {
+  DumpWalFile(options_, wal_file_, print_header_, print_values_,
+              is_write_committed_, &exec_state_);
+}
+
+// ----------------------------------------------------------------------------
+
+GetCommand::GetCommand(const std::vector<std::string>& params,
+                       const std::map<std::string, std::string>& options,
+                       const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, true,
+          BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "<key> must be specified for the get command");
+  } else {
+    key_ = params.at(0);
+  }
+
+  if (is_key_hex_) {
+    key_ = HexToString(key_);
+  }
+}
+
+void GetCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(GetCommand::Name());
+  ret.append(" <key>");
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append("\n");
+}
+
+void GetCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  std::string value;
+  Status st = db_->Get(ReadOptions(), GetCfHandle(), key_, &value);
+  if (st.ok()) {
+    fprintf(stdout, "%s\n",
+            (is_value_hex_ ? StringToHex(value) : value).c_str());
+  } else {
+    std::stringstream oss;
+    oss << "Get failed: " << st.ToString();
+    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+ApproxSizeCommand::ApproxSizeCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true,
+                 BuildCmdLineOptions(
+                     {ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO})) {
+  if (options.find(ARG_FROM) != options.end()) {
+    start_key_ = options.find(ARG_FROM)->second;
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        ARG_FROM + " must be specified for approxsize command");
+    return;
+  }
+
+  if (options.find(ARG_TO) != options.end()) {
+    end_key_ = options.find(ARG_TO)->second;
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        ARG_TO + " must be specified for approxsize command");
+    return;
+  }
+
+  if (is_key_hex_) {
+    start_key_ = HexToString(start_key_);
+    end_key_ = HexToString(end_key_);
+  }
+}
+
+void ApproxSizeCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(ApproxSizeCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append("\n");
+}
+
+void ApproxSizeCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Range ranges[1];
+  ranges[0] = Range(start_key_, end_key_);
+  uint64_t sizes[1];
+  Status s = db_->GetApproximateSizes(GetCfHandle(), ranges, 1, sizes);
+  if (!s.ok()) {
+    std::stringstream oss;
+    oss << "ApproximateSize failed: " << s.ToString();
+    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
+  } else {
+    fprintf(stdout, "%lu\n", (unsigned long)sizes[0]);
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+BatchPutCommand::BatchPutCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
+                                      ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
+  if (params.size() < 2) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "At least one <key> <value> pair must be specified batchput.");
+  } else if (params.size() % 2 != 0) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Equal number of <key>s and <value>s must be specified for batchput.");
+  } else {
+    for (size_t i = 0; i < params.size(); i += 2) {
+      std::string key = params.at(i);
+      std::string value = params.at(i + 1);
+      key_values_.push_back(std::pair<std::string, std::string>(
+          is_key_hex_ ? HexToString(key) : key,
+          is_value_hex_ ? HexToString(value) : value));
+    }
+  }
+  create_if_missing_ = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING);
+}
+
+void BatchPutCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(BatchPutCommand::Name());
+  ret.append(" <key> <value> [<key> <value>] [..]");
+  ret.append(" [--" + ARG_CREATE_IF_MISSING + "]");
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append("\n");
+}
+
+void BatchPutCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  WriteBatch batch;
+
+  Status st;
+  std::stringstream oss;
+  for (std::vector<std::pair<std::string, std::string>>::const_iterator itr =
+           key_values_.begin();
+       itr != key_values_.end(); ++itr) {
+    st = batch.Put(GetCfHandle(), itr->first, itr->second);
+    if (!st.ok()) {
+      oss << "Put to write batch failed: " << itr->first << "=>" << itr->second
+          << " error: " << st.ToString();
+      break;
+    }
+  }
+  if (st.ok()) {
+    st = db_->Write(WriteOptions(), &batch);
+    if (!st.ok()) {
+      oss << "Write failed: " << st.ToString();
+    }
+  }
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
+  }
+}
+
+void BatchPutCommand::OverrideBaseOptions() {
+  LDBCommand::OverrideBaseOptions();
+  options_.create_if_missing = create_if_missing_;
+}
+
+// ----------------------------------------------------------------------------
+
+ScanCommand::ScanCommand(const std::vector<std::string>& /*params*/,
+                         const std::map<std::string, std::string>& options,
+                         const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, true,
+          BuildCmdLineOptions({ARG_TTL, ARG_NO_VALUE, ARG_HEX, ARG_KEY_HEX,
+                               ARG_TO, ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP,
+                               ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})),
+      start_key_specified_(false),
+      end_key_specified_(false),
+      max_keys_scanned_(-1),
+      no_value_(false) {
+  auto itr = options.find(ARG_FROM);
+  if (itr != options.end()) {
+    start_key_ = itr->second;
+    if (is_key_hex_) {
+      start_key_ = HexToString(start_key_);
+    }
+    start_key_specified_ = true;
+  }
+  itr = options.find(ARG_TO);
+  if (itr != options.end()) {
+    end_key_ = itr->second;
+    if (is_key_hex_) {
+      end_key_ = HexToString(end_key_);
+    }
+    end_key_specified_ = true;
+  }
+
+  std::vector<std::string>::const_iterator vitr =
+      std::find(flags.begin(), flags.end(), ARG_NO_VALUE);
+  if (vitr != flags.end()) {
+    no_value_ = true;
+  }
+
+  itr = options.find(ARG_MAX_KEYS);
+  if (itr != options.end()) {
+    try {
+#if defined(CYGWIN)
+      max_keys_scanned_ = strtol(itr->second.c_str(), 0, 10);
+#else
+      max_keys_scanned_ = std::stoi(itr->second);
+#endif
+    } catch (const std::invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
+                                                    " has an invalid value");
+    } catch (const std::out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_MAX_KEYS + " has a value out-of-range");
+    }
+  }
+}
+
+void ScanCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(ScanCommand::Name());
+  ret.append(HelpRangeCmdArgs());
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append(" [--" + ARG_TIMESTAMP + "]");
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>q] ");
+  ret.append(" [--" + ARG_TTL_START + "=<N>:- is inclusive]");
+  ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
+  ret.append(" [--" + ARG_NO_VALUE + "]");
+  ret.append("\n");
+}
+
+void ScanCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  int num_keys_scanned = 0;
+  ReadOptions scan_read_opts;
+  scan_read_opts.total_order_seek = true;
+  Iterator* it = db_->NewIterator(scan_read_opts, GetCfHandle());
+  if (start_key_specified_) {
+    it->Seek(start_key_);
+  } else {
+    it->SeekToFirst();
+  }
+  int ttl_start;
+  if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) {
+    ttl_start = DBWithTTLImpl::kMinTimestamp;  // TTL introduction time
+  }
+  int ttl_end;
+  if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) {
+    ttl_end = DBWithTTLImpl::kMaxTimestamp;  // Max time allowed by TTL feature
+  }
+  if (ttl_end < ttl_start) {
+    fprintf(stderr, "Error: End time can't be less than start time\n");
+    delete it;
+    return;
+  }
+  if (is_db_ttl_ && timestamp_) {
+    fprintf(stdout, "Scanning key-values from %s to %s\n",
+            TimeToHumanString(ttl_start).c_str(),
+            TimeToHumanString(ttl_end).c_str());
+  }
+  for (;
+       it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_);
+       it->Next()) {
+    if (is_db_ttl_) {
+      TtlIterator* it_ttl = static_cast_with_check<TtlIterator>(it);
+      int rawtime = it_ttl->ttl_timestamp();
+      if (rawtime < ttl_start || rawtime >= ttl_end) {
+        continue;
+      }
+      if (timestamp_) {
+        fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str());
+      }
+    }
+
+    Slice key_slice = it->key();
+
+    std::string formatted_key;
+    if (is_key_hex_) {
+      formatted_key = "0x" + key_slice.ToString(true /* hex */);
+      key_slice = formatted_key;
+    } else if (ldb_options_.key_formatter) {
+      formatted_key = ldb_options_.key_formatter->Format(key_slice);
+      key_slice = formatted_key;
+    }
+
+    if (no_value_) {
+      fprintf(stdout, "%.*s\n", static_cast<int>(key_slice.size()),
+              key_slice.data());
+    } else {
+      Slice val_slice = it->value();
+      std::string formatted_value;
+      if (is_value_hex_) {
+        formatted_value = "0x" + val_slice.ToString(true /* hex */);
+        val_slice = formatted_value;
+      }
+      fprintf(stdout, "%.*s : %.*s\n", static_cast<int>(key_slice.size()),
+              key_slice.data(), static_cast<int>(val_slice.size()),
+              val_slice.data());
+    }
+
+    num_keys_scanned++;
+    if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) {
+      break;
+    }
+  }
+  if (!it->status().ok()) {  // Check for any errors found during the scan
+    exec_state_ = LDBCommandExecuteResult::Failed(it->status().ToString());
+  }
+  delete it;
+}
+
+// ----------------------------------------------------------------------------
+
+DeleteCommand::DeleteCommand(const std::vector<std::string>& params,
+                             const std::map<std::string, std::string>& options,
+                             const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "KEY must be specified for the delete command");
+  } else {
+    key_ = params.at(0);
+    if (is_key_hex_) {
+      key_ = HexToString(key_);
+    }
+  }
+}
+
+void DeleteCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(DeleteCommand::Name() + " <key>");
+  ret.append("\n");
+}
+
+void DeleteCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Status st = db_->Delete(WriteOptions(), GetCfHandle(), key_);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+  }
+}
+
+SingleDeleteCommand::SingleDeleteCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "KEY must be specified for the single delete command");
+  } else {
+    key_ = params.at(0);
+    if (is_key_hex_) {
+      key_ = HexToString(key_);
+    }
+  }
+}
+
+void SingleDeleteCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(SingleDeleteCommand::Name() + " <key>");
+  ret.append("\n");
+}
+
+void SingleDeleteCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Status st = db_->SingleDelete(WriteOptions(), GetCfHandle(), key_);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+  }
+}
+
+DeleteRangeCommand::DeleteRangeCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+  if (params.size() != 2) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "begin and end keys must be specified for the delete command");
+  } else {
+    begin_key_ = params.at(0);
+    end_key_ = params.at(1);
+    if (is_key_hex_) {
+      begin_key_ = HexToString(begin_key_);
+      end_key_ = HexToString(end_key_);
+    }
+  }
+}
+
+void DeleteRangeCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(DeleteRangeCommand::Name() + " <begin key> <end key>");
+  ret.append("\n");
+}
+
+void DeleteRangeCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Status st =
+      db_->DeleteRange(WriteOptions(), GetCfHandle(), begin_key_, end_key_);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+  }
+}
+
+PutCommand::PutCommand(const std::vector<std::string>& params,
+                       const std::map<std::string, std::string>& options,
+                       const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false,
+                 BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
+                                      ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
+  if (params.size() != 2) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "<key> and <value> must be specified for the put command");
+  } else {
+    key_ = params.at(0);
+    value_ = params.at(1);
+  }
+
+  if (is_key_hex_) {
+    key_ = HexToString(key_);
+  }
+
+  if (is_value_hex_) {
+    value_ = HexToString(value_);
+  }
+  create_if_missing_ = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING);
+}
+
+void PutCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(PutCommand::Name());
+  ret.append(" <key> <value>");
+  ret.append(" [--" + ARG_CREATE_IF_MISSING + "]");
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append("\n");
+}
+
+void PutCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Status st = db_->Put(WriteOptions(), GetCfHandle(), key_, value_);
+  if (st.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
+  }
+}
+
+void PutCommand::OverrideBaseOptions() {
+  LDBCommand::OverrideBaseOptions();
+  options_.create_if_missing = create_if_missing_;
+}
+
+// ----------------------------------------------------------------------------
+
+const char* DBQuerierCommand::HELP_CMD = "help";
+const char* DBQuerierCommand::GET_CMD = "get";
+const char* DBQuerierCommand::PUT_CMD = "put";
+const char* DBQuerierCommand::DELETE_CMD = "delete";
+
+DBQuerierCommand::DBQuerierCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, false,
+          BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
+
+}
+
+void DBQuerierCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(DBQuerierCommand::Name());
+  ret.append(" [--" + ARG_TTL + "]");
+  ret.append("\n");
+  ret.append(
+      "    Starts a REPL shell.  Type help for list of available "
+      "commands.");
+  ret.append("\n");
+}
+
+void DBQuerierCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  ReadOptions read_options;
+  WriteOptions write_options;
+
+  std::string line;
+  std::string key;
+  std::string value;
+  Status s;
+  std::stringstream oss;
+  while (s.ok() && getline(std::cin, line, '\n')) {
+    // Parse line into std::vector<std::string>
+    std::vector<std::string> tokens;
+    size_t pos = 0;
+    while (true) {
+      size_t pos2 = line.find(' ', pos);
+      if (pos2 == std::string::npos) {
+        break;
+      }
+      tokens.push_back(line.substr(pos, pos2 - pos));
+      pos = pos2 + 1;
+    }
+    tokens.push_back(line.substr(pos));
+
+    const std::string& cmd = tokens[0];
+
+    if (cmd == HELP_CMD) {
+      fprintf(stdout,
+              "get <key>\n"
+              "put <key> <value>\n"
+              "delete <key>\n");
+    } else if (cmd == DELETE_CMD && tokens.size() == 2) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      s = db_->Delete(write_options, GetCfHandle(), Slice(key));
+      if (s.ok()) {
+        fprintf(stdout, "Successfully deleted %s\n", tokens[1].c_str());
+      } else {
+        oss << "delete " << key << " failed: " << s.ToString();
+      }
+    } else if (cmd == PUT_CMD && tokens.size() == 3) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      value = (is_value_hex_ ? HexToString(tokens[2]) : tokens[2]);
+      s = db_->Put(write_options, GetCfHandle(), Slice(key), Slice(value));
+      if (s.ok()) {
+        fprintf(stdout, "Successfully put %s %s\n", tokens[1].c_str(),
+                tokens[2].c_str());
+      } else {
+        oss << "put " << key << "=>" << value << " failed: " << s.ToString();
+      }
+    } else if (cmd == GET_CMD && tokens.size() == 2) {
+      key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]);
+      s = db_->Get(read_options, GetCfHandle(), Slice(key), &value);
+      if (s.ok()) {
+        fprintf(stdout, "%s\n",
+                PrintKeyValue(key, value, is_key_hex_, is_value_hex_).c_str());
+      } else {
+        if (s.IsNotFound()) {
+          fprintf(stdout, "Not found %s\n", tokens[1].c_str());
+        } else {
+          oss << "get " << key << " error: " << s.ToString();
+        }
+      }
+    } else {
+      fprintf(stdout, "Unknown command %s\n", line.c_str());
+    }
+  }
+  if (!s.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(oss.str());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+CheckConsistencyCommand::CheckConsistencyCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {}
+
+void CheckConsistencyCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(CheckConsistencyCommand::Name());
+  ret.append("\n");
+}
+
+void CheckConsistencyCommand::DoCommand() {
+  options_.paranoid_checks = true;
+  options_.num_levels = 64;
+  OpenDB();
+  if (exec_state_.IsSucceed() || exec_state_.IsNotStarted()) {
+    fprintf(stdout, "OK\n");
+  }
+  CloseDB();
+}
+
+// ----------------------------------------------------------------------------
+
+const std::string CheckPointCommand::ARG_CHECKPOINT_DIR = "checkpoint_dir";
+
+CheckPointCommand::CheckPointCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false /* is_read_only */,
+                 BuildCmdLineOptions({ARG_CHECKPOINT_DIR})) {
+  auto itr = options.find(ARG_CHECKPOINT_DIR);
+  if (itr != options.end()) {
+    checkpoint_dir_ = itr->second;
+  }
+}
+
+void CheckPointCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(CheckPointCommand::Name());
+  ret.append(" [--" + ARG_CHECKPOINT_DIR + "] ");
+  ret.append("\n");
+}
+
+void CheckPointCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Checkpoint* checkpoint;
+  Status status = Checkpoint::Create(db_, &checkpoint);
+  status = checkpoint->CreateCheckpoint(checkpoint_dir_);
+  if (status.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+const std::string RepairCommand::ARG_VERBOSE = "verbose";
+
+RepairCommand::RepairCommand(const std::vector<std::string>& /*params*/,
+                             const std::map<std::string, std::string>& options,
+                             const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_VERBOSE})) {
+  verbose_ = IsFlagPresent(flags, ARG_VERBOSE);
+}
+
+void RepairCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(RepairCommand::Name());
+  ret.append(" [--" + ARG_VERBOSE + "]");
+  ret.append("\n");
+}
+
+void RepairCommand::OverrideBaseOptions() {
+  LDBCommand::OverrideBaseOptions();
+  auto level = verbose_ ? InfoLogLevel::INFO_LEVEL : InfoLogLevel::WARN_LEVEL;
+  options_.info_log.reset(new StderrLogger(level));
+}
+
+void RepairCommand::DoCommand() {
+  PrepareOptions();
+  Status status = RepairDB(db_path_, options_);
+  if (status.ok()) {
+    fprintf(stdout, "OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+const std::string BackupEngineCommand::ARG_NUM_THREADS = "num_threads";
+const std::string BackupEngineCommand::ARG_BACKUP_ENV_URI = "backup_env_uri";
+const std::string BackupEngineCommand::ARG_BACKUP_FS_URI = "backup_fs_uri";
+const std::string BackupEngineCommand::ARG_BACKUP_DIR = "backup_dir";
+const std::string BackupEngineCommand::ARG_STDERR_LOG_LEVEL =
+    "stderr_log_level";
+
+BackupEngineCommand::BackupEngineCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false /* is_read_only */,
+                 BuildCmdLineOptions({ARG_BACKUP_ENV_URI, ARG_BACKUP_FS_URI,
+                                      ARG_BACKUP_DIR, ARG_NUM_THREADS,
+                                      ARG_STDERR_LOG_LEVEL})),
+      num_threads_(1) {
+  auto itr = options.find(ARG_NUM_THREADS);
+  if (itr != options.end()) {
+    num_threads_ = std::stoi(itr->second);
+  }
+  itr = options.find(ARG_BACKUP_ENV_URI);
+  if (itr != options.end()) {
+    backup_env_uri_ = itr->second;
+  }
+  itr = options.find(ARG_BACKUP_FS_URI);
+  if (itr != options.end()) {
+    backup_fs_uri_ = itr->second;
+  }
+  if (!backup_env_uri_.empty() && !backup_fs_uri_.empty()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "you may not specity both --" + ARG_BACKUP_ENV_URI + " and --" +
+        ARG_BACKUP_FS_URI);
+  }
+  itr = options.find(ARG_BACKUP_DIR);
+  if (itr == options.end()) {
+    exec_state_ = LDBCommandExecuteResult::Failed("--" + ARG_BACKUP_DIR +
+                                                  ": missing backup directory");
+  } else {
+    backup_dir_ = itr->second;
+  }
+
+  itr = options.find(ARG_STDERR_LOG_LEVEL);
+  if (itr != options.end()) {
+    int stderr_log_level = std::stoi(itr->second);
+    if (stderr_log_level < 0 ||
+        stderr_log_level >= InfoLogLevel::NUM_INFO_LOG_LEVELS) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_STDERR_LOG_LEVEL + " must be >= 0 and < " +
+          std::to_string(InfoLogLevel::NUM_INFO_LOG_LEVELS) + ".");
+    } else {
+      logger_.reset(
+          new StderrLogger(static_cast<InfoLogLevel>(stderr_log_level)));
+    }
+  }
+}
+
+void BackupEngineCommand::Help(const std::string& name, std::string& ret) {
+  ret.append("  ");
+  ret.append(name);
+  ret.append(" [--" + ARG_BACKUP_ENV_URI + " | --" + ARG_BACKUP_FS_URI + "] ");
+  ret.append(" [--" + ARG_BACKUP_DIR + "] ");
+  ret.append(" [--" + ARG_NUM_THREADS + "] ");
+  ret.append(" [--" + ARG_STDERR_LOG_LEVEL + "=<int (InfoLogLevel)>] ");
+  ret.append("\n");
+}
+
+// ----------------------------------------------------------------------------
+
+BackupCommand::BackupCommand(const std::vector<std::string>& params,
+                             const std::map<std::string, std::string>& options,
+                             const std::vector<std::string>& flags)
+    : BackupEngineCommand(params, options, flags) {}
+
+void BackupCommand::Help(std::string& ret) {
+  BackupEngineCommand::Help(Name(), ret);
+}
+
+void BackupCommand::DoCommand() {
+  BackupEngine* backup_engine;
+  Status status;
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  fprintf(stdout, "open db OK\n");
+
+  Env* custom_env = backup_env_guard_.get();
+  if (custom_env == nullptr) {
+    Status s =
+        Env::CreateFromUri(config_options_, backup_env_uri_, backup_fs_uri_,
+                           &custom_env, &backup_env_guard_);
+    if (!s.ok()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(s.ToString());
+      return;
+    }
+  }
+  assert(custom_env != nullptr);
+
+  BackupEngineOptions backup_options =
+      BackupEngineOptions(backup_dir_, custom_env);
+  backup_options.info_log = logger_.get();
+  backup_options.max_background_operations = num_threads_;
+  status = BackupEngine::Open(options_.env, backup_options, &backup_engine);
+  if (status.ok()) {
+    fprintf(stdout, "open backup engine OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+    return;
+  }
+  status = backup_engine->CreateNewBackup(db_);
+  if (status.ok()) {
+    fprintf(stdout, "create new backup OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+    return;
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+RestoreCommand::RestoreCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : BackupEngineCommand(params, options, flags) {}
+
+void RestoreCommand::Help(std::string& ret) {
+  BackupEngineCommand::Help(Name(), ret);
+}
+
+void RestoreCommand::DoCommand() {
+  Env* custom_env = backup_env_guard_.get();
+  if (custom_env == nullptr) {
+    Status s =
+        Env::CreateFromUri(config_options_, backup_env_uri_, backup_fs_uri_,
+                           &custom_env, &backup_env_guard_);
+    if (!s.ok()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(s.ToString());
+      return;
+    }
+  }
+  assert(custom_env != nullptr);
+
+  std::unique_ptr<BackupEngineReadOnly> restore_engine;
+  Status status;
+  {
+    BackupEngineOptions opts(backup_dir_, custom_env);
+    opts.info_log = logger_.get();
+    opts.max_background_operations = num_threads_;
+    BackupEngineReadOnly* raw_restore_engine_ptr;
+    status =
+        BackupEngineReadOnly::Open(options_.env, opts, &raw_restore_engine_ptr);
+    if (status.ok()) {
+      restore_engine.reset(raw_restore_engine_ptr);
+    }
+  }
+  if (status.ok()) {
+    fprintf(stdout, "open restore engine OK\n");
+    status = restore_engine->RestoreDBFromLatestBackup(db_path_, db_path_);
+  }
+  if (status.ok()) {
+    fprintf(stdout, "restore from backup OK\n");
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
+  }
+}
+
+// ----------------------------------------------------------------------------
+
+namespace {
+
+void DumpSstFile(Options options, std::string filename, bool output_hex,
+                 bool show_properties, bool decode_blob_index,
+                 std::string from_key, std::string to_key) {
+  if (filename.length() <= 4 ||
+      filename.rfind(".sst") != filename.length() - 4) {
+    std::cout << "Invalid sst file name." << std::endl;
+    return;
+  }
+  // no verification
+  ROCKSDB_NAMESPACE::SstFileDumper dumper(
+      options, filename, Temperature::kUnknown,
+      2 * 1024 * 1024 /* readahead_size */,
+      /* verify_checksum */ false, output_hex, decode_blob_index);
+  Status st = dumper.ReadSequential(true, std::numeric_limits<uint64_t>::max(),
+                                    !from_key.empty(), from_key,
+                                    !to_key.empty(), to_key);
+  if (!st.ok()) {
+    std::cerr << "Error in reading SST file " << filename << st.ToString()
+              << std::endl;
+    return;
+  }
+
+  if (show_properties) {
+    const ROCKSDB_NAMESPACE::TableProperties* table_properties;
+
+    std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties>
+        table_properties_from_reader;
+    st = dumper.ReadTableProperties(&table_properties_from_reader);
+    if (!st.ok()) {
+      std::cerr << filename << ": " << st.ToString()
+                << ". Try to use initial table properties" << std::endl;
+      table_properties = dumper.GetInitTableProperties();
+    } else {
+      table_properties = table_properties_from_reader.get();
+    }
+    if (table_properties != nullptr) {
+      std::cout << std::endl << "Table Properties:" << std::endl;
+      std::cout << table_properties->ToString("\n") << std::endl;
+    }
+  }
+}
+
+void DumpBlobFile(const std::string& filename, bool is_key_hex,
+                  bool is_value_hex, bool dump_uncompressed_blobs) {
+  using ROCKSDB_NAMESPACE::blob_db::BlobDumpTool;
+  BlobDumpTool tool;
+  BlobDumpTool::DisplayType blob_type = is_value_hex
+                                            ? BlobDumpTool::DisplayType::kHex
+                                            : BlobDumpTool::DisplayType::kRaw;
+  BlobDumpTool::DisplayType show_uncompressed_blob =
+      dump_uncompressed_blobs ? blob_type : BlobDumpTool::DisplayType::kNone;
+  BlobDumpTool::DisplayType show_blob =
+      dump_uncompressed_blobs ? BlobDumpTool::DisplayType::kNone : blob_type;
+
+  BlobDumpTool::DisplayType show_key = is_key_hex
+                                           ? BlobDumpTool::DisplayType::kHex
+                                           : BlobDumpTool::DisplayType::kRaw;
+  Status s = tool.Run(filename, show_key, show_blob, show_uncompressed_blob,
+                      /* show_summary */ true);
+  if (!s.ok()) {
+    fprintf(stderr, "Failed: %s\n", s.ToString().c_str());
+  }
+}
+}  // namespace
+
+DBFileDumperCommand::DBFileDumperCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true,
+                 BuildCmdLineOptions(
+                     {ARG_DECODE_BLOB_INDEX, ARG_DUMP_UNCOMPRESSED_BLOBS})),
+      decode_blob_index_(IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX)),
+      dump_uncompressed_blobs_(
+          IsFlagPresent(flags, ARG_DUMP_UNCOMPRESSED_BLOBS)) {}
+
+void DBFileDumperCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(DBFileDumperCommand::Name());
+  ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "] ");
+  ret.append(" [--" + ARG_DUMP_UNCOMPRESSED_BLOBS + "] ");
+  ret.append("\n");
+}
+
+void DBFileDumperCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Status s;
+
+  // TODO: Use --hex, --key_hex, --value_hex flags consistently for
+  // dumping manifest file, sst files and blob files.
+  std::cout << "Manifest File" << std::endl;
+  std::cout << "==============================" << std::endl;
+  std::string manifest_filename;
+  s = ReadFileToString(db_->GetEnv(), CurrentFileName(db_->GetName()),
+                       &manifest_filename);
+  if (!s.ok() || manifest_filename.empty() ||
+      manifest_filename.back() != '\n') {
+    std::cerr << "Error when reading CURRENT file "
+              << CurrentFileName(db_->GetName()) << std::endl;
+  }
+  // remove the trailing '\n'
+  manifest_filename.resize(manifest_filename.size() - 1);
+  std::string manifest_filepath = db_->GetName() + "/" + manifest_filename;
+  // Correct concatenation of filepath and filename:
+  // Check that there is no double slashes (or more!) when concatenation
+  // happens.
+  manifest_filepath = NormalizePath(manifest_filepath);
+
+  std::cout << manifest_filepath << std::endl;
+  DumpManifestFile(options_, manifest_filepath, false, false, false);
+  std::cout << std::endl;
+
+  std::vector<ColumnFamilyMetaData> column_families;
+  db_->GetAllColumnFamilyMetaData(&column_families);
+  for (const auto& column_family : column_families) {
+    std::cout << "Column family name: " << column_family.name << std::endl;
+    std::cout << "==============================" << std::endl;
+    std::cout << std::endl;
+    std::cout << "SST Files" << std::endl;
+    std::cout << "==============================" << std::endl;
+    for (const LevelMetaData& level : column_family.levels) {
+      for (const SstFileMetaData& sst_file : level.files) {
+        std::string filename = sst_file.db_path + "/" + sst_file.name;
+        // Correct concatenation of filepath and filename:
+        // Check that there is no double slashes (or more!) when concatenation
+        // happens.
+        filename = NormalizePath(filename);
+        std::cout << filename << " level:" << level.level << std::endl;
+        std::cout << "------------------------------" << std::endl;
+        DumpSstFile(options_, filename, false, true, decode_blob_index_);
+        std::cout << std::endl;
+      }
+    }
+    std::cout << "Blob Files" << std::endl;
+    std::cout << "==============================" << std::endl;
+    for (const BlobMetaData& blob_file : column_family.blob_files) {
+      std::string filename =
+          blob_file.blob_file_path + "/" + blob_file.blob_file_name;
+      // Correct concatenation of filepath and filename:
+      // Check that there is no double slashes (or more!) when concatenation
+      // happens.
+      filename = NormalizePath(filename);
+      std::cout << filename << std::endl;
+      std::cout << "------------------------------" << std::endl;
+      DumpBlobFile(filename, /* is_key_hex */ false, /* is_value_hex */ false,
+                   dump_uncompressed_blobs_);
+      std::cout << std::endl;
+    }
+  }
+  std::cout << std::endl;
+
+  std::cout << "Write Ahead Log Files" << std::endl;
+  std::cout << "==============================" << std::endl;
+  ROCKSDB_NAMESPACE::VectorLogPtr wal_files;
+  s = db_->GetSortedWalFiles(wal_files);
+  if (!s.ok()) {
+    std::cerr << "Error when getting WAL files" << std::endl;
+  } else {
+    std::string wal_dir;
+    if (options_.wal_dir.empty()) {
+      wal_dir = db_->GetName();
+    } else {
+      wal_dir = NormalizePath(options_.wal_dir + "/");
+    }
+    for (auto& wal : wal_files) {
+      // TODO(qyang): option.wal_dir should be passed into ldb command
+      std::string filename = wal_dir + wal->PathName();
+      std::cout << filename << std::endl;
+      // TODO(myabandeh): allow configuring is_write_commited
+      DumpWalFile(options_, filename, true, true, true /* is_write_commited */,
+                  &exec_state_);
+    }
+  }
+}
+
+const std::string DBLiveFilesMetadataDumperCommand::ARG_SORT_BY_FILENAME =
+    "sort_by_filename";
+
+DBLiveFilesMetadataDumperCommand::DBLiveFilesMetadataDumperCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true,
+                 BuildCmdLineOptions({ARG_SORT_BY_FILENAME})) {
+  sort_by_filename_ = IsFlagPresent(flags, ARG_SORT_BY_FILENAME);
+}
+
+void DBLiveFilesMetadataDumperCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(DBLiveFilesMetadataDumperCommand::Name());
+  ret.append(" [--" + ARG_SORT_BY_FILENAME + "] ");
+  ret.append("\n");
+}
+
+void DBLiveFilesMetadataDumperCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  Status s;
+
+  std::vector<ColumnFamilyMetaData> metadata;
+  db_->GetAllColumnFamilyMetaData(&metadata);
+  if (sort_by_filename_) {
+    std::cout << "Live SST and Blob Files:" << std::endl;
+    // tuple of <file path, level, column family name>
+    std::vector<std::tuple<std::string, int, std::string>> all_files;
+
+    for (const auto& column_metadata : metadata) {
+      // Iterate Levels
+      const auto& levels = column_metadata.levels;
+      const std::string& cf = column_metadata.name;
+      for (const auto& level_metadata : levels) {
+        // Iterate SST files
+        const auto& sst_files = level_metadata.files;
+        int level = level_metadata.level;
+        for (const auto& sst_metadata : sst_files) {
+          // The SstFileMetaData.name always starts with "/",
+          // however SstFileMetaData.db_path is the string provided by
+          // the user as an input. Therefore we check if we can
+          // concantenate the two strings directly or if we need to
+          // drop a possible extra "/" at the end of SstFileMetaData.db_path.
+          std::string filename =
+              NormalizePath(sst_metadata.db_path + "/" + sst_metadata.name);
+          all_files.emplace_back(filename, level, cf);
+        }  // End of for-loop over sst files
+      }    // End of for-loop over levels
+
+      const auto& blob_files = column_metadata.blob_files;
+      for (const auto& blob_metadata : blob_files) {
+        // The BlobMetaData.blob_file_name always starts with "/",
+        // however BlobMetaData.blob_file_path is the string provided by
+        // the user as an input. Therefore we check if we can
+        // concantenate the two strings directly or if we need to
+        // drop a possible extra "/" at the end of BlobMetaData.blob_file_path.
+        std::string filename = NormalizePath(
+            blob_metadata.blob_file_path + "/" + blob_metadata.blob_file_name);
+        // Level for blob files is encoded as -1
+        all_files.emplace_back(filename, -1, cf);
+      }  // End of for-loop over blob files
+    }    // End of for-loop over column metadata
+
+    // Sort by filename (i.e. first entry in tuple)
+    std::sort(all_files.begin(), all_files.end());
+
+    for (const auto& item : all_files) {
+      const std::string& filename = std::get<0>(item);
+      int level = std::get<1>(item);
+      const std::string& cf = std::get<2>(item);
+      if (level == -1) {  // Blob File
+        std::cout << filename << ", column family '" << cf << "'" << std::endl;
+      } else {  // SST file
+        std::cout << filename << " : level " << level << ", column family '"
+                  << cf << "'" << std::endl;
+      }
+    }
+  } else {
+    for (const auto& column_metadata : metadata) {
+      std::cout << "===== Column Family: " << column_metadata.name
+                << " =====" << std::endl;
+
+      std::cout << "Live SST Files:" << std::endl;
+      // Iterate levels
+      const auto& levels = column_metadata.levels;
+      for (const auto& level_metadata : levels) {
+        std::cout << "---------- level " << level_metadata.level
+                  << " ----------" << std::endl;
+        // Iterate SST files
+        const auto& sst_files = level_metadata.files;
+        for (const auto& sst_metadata : sst_files) {
+          // The SstFileMetaData.name always starts with "/",
+          // however SstFileMetaData.db_path is the string provided by
+          // the user as an input. Therefore we check if we can
+          // concantenate the two strings directly or if we need to
+          // drop a possible extra "/" at the end of SstFileMetaData.db_path.
+          std::string filename =
+              NormalizePath(sst_metadata.db_path + "/" + sst_metadata.name);
+          std::cout << filename << std::endl;
+        }  // End of for-loop over sst files
+      }    // End of for-loop over levels
+
+      std::cout << "Live Blob Files:" << std::endl;
+      const auto& blob_files = column_metadata.blob_files;
+      for (const auto& blob_metadata : blob_files) {
+        // The BlobMetaData.blob_file_name always starts with "/",
+        // however BlobMetaData.blob_file_path is the string provided by
+        // the user as an input. Therefore we check if we can
+        // concantenate the two strings directly or if we need to
+        // drop a possible extra "/" at the end of BlobMetaData.blob_file_path.
+        std::string filename = NormalizePath(
+            blob_metadata.blob_file_path + "/" + blob_metadata.blob_file_name);
+        std::cout << filename << std::endl;
+      }  // End of for-loop over blob files
+    }    // End of for-loop over column metadata
+  }      // End of else ("not sort_by_filename")
+  std::cout << "------------------------------" << std::endl;
+}
+
+void WriteExternalSstFilesCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(WriteExternalSstFilesCommand::Name());
+  ret.append(" <output_sst_path>");
+  ret.append("\n");
+}
+
+WriteExternalSstFilesCommand::WriteExternalSstFilesCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, false /* is_read_only */,
+          BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
+                               ARG_TO, ARG_CREATE_IF_MISSING})) {
+  create_if_missing_ =
+      IsFlagPresent(flags, ARG_CREATE_IF_MISSING) ||
+      ParseBooleanOption(options, ARG_CREATE_IF_MISSING, false);
+  if (params.size() != 1) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "output SST file path must be specified");
+  } else {
+    output_sst_path_ = params.at(0);
+  }
+}
+
+void WriteExternalSstFilesCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  ColumnFamilyHandle* cfh = GetCfHandle();
+  SstFileWriter sst_file_writer(EnvOptions(), db_->GetOptions(), cfh);
+  Status status = sst_file_writer.Open(output_sst_path_);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed("failed to open SST file: " +
+                                                  status.ToString());
+    return;
+  }
+
+  int bad_lines = 0;
+  std::string line;
+  std::ifstream ifs_stdin("/dev/stdin");
+  std::istream* istream_p = ifs_stdin.is_open() ? &ifs_stdin : &std::cin;
+  while (getline(*istream_p, line, '\n')) {
+    std::string key;
+    std::string value;
+    if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) {
+      status = sst_file_writer.Put(key, value);
+      if (!status.ok()) {
+        exec_state_ = LDBCommandExecuteResult::Failed(
+            "failed to write record to file: " + status.ToString());
+        return;
+      }
+    } else if (0 == line.find("Keys in range:")) {
+      // ignore this line
+    } else if (0 == line.find("Created bg thread 0x")) {
+      // ignore this line
+    } else {
+      bad_lines++;
+    }
+  }
+
+  status = sst_file_writer.Finish();
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "Failed to finish writing to file: " + status.ToString());
+    return;
+  }
+
+  if (bad_lines > 0) {
+    fprintf(stderr, "Warning: %d bad lines ignored.\n", bad_lines);
+  }
+  exec_state_ = LDBCommandExecuteResult::Succeed(
+      "external SST file written to " + output_sst_path_);
+}
+
+void WriteExternalSstFilesCommand::OverrideBaseOptions() {
+  LDBCommand::OverrideBaseOptions();
+  options_.create_if_missing = create_if_missing_;
+}
+
+const std::string IngestExternalSstFilesCommand::ARG_MOVE_FILES = "move_files";
+const std::string IngestExternalSstFilesCommand::ARG_SNAPSHOT_CONSISTENCY =
+    "snapshot_consistency";
+const std::string IngestExternalSstFilesCommand::ARG_ALLOW_GLOBAL_SEQNO =
+    "allow_global_seqno";
+const std::string IngestExternalSstFilesCommand::ARG_ALLOW_BLOCKING_FLUSH =
+    "allow_blocking_flush";
+const std::string IngestExternalSstFilesCommand::ARG_INGEST_BEHIND =
+    "ingest_behind";
+const std::string IngestExternalSstFilesCommand::ARG_WRITE_GLOBAL_SEQNO =
+    "write_global_seqno";
+
+void IngestExternalSstFilesCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(IngestExternalSstFilesCommand::Name());
+  ret.append(" <input_sst_path>");
+  ret.append(" [--" + ARG_MOVE_FILES + "] ");
+  ret.append(" [--" + ARG_SNAPSHOT_CONSISTENCY + "] ");
+  ret.append(" [--" + ARG_ALLOW_GLOBAL_SEQNO + "] ");
+  ret.append(" [--" + ARG_ALLOW_BLOCKING_FLUSH + "] ");
+  ret.append(" [--" + ARG_INGEST_BEHIND + "] ");
+  ret.append(" [--" + ARG_WRITE_GLOBAL_SEQNO + "] ");
+  ret.append("\n");
+}
+
+IngestExternalSstFilesCommand::IngestExternalSstFilesCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(
+          options, flags, false /* is_read_only */,
+          BuildCmdLineOptions({ARG_MOVE_FILES, ARG_SNAPSHOT_CONSISTENCY,
+                               ARG_ALLOW_GLOBAL_SEQNO, ARG_CREATE_IF_MISSING,
+                               ARG_ALLOW_BLOCKING_FLUSH, ARG_INGEST_BEHIND,
+                               ARG_WRITE_GLOBAL_SEQNO})),
+      move_files_(false),
+      snapshot_consistency_(true),
+      allow_global_seqno_(true),
+      allow_blocking_flush_(true),
+      ingest_behind_(false),
+      write_global_seqno_(true) {
+  create_if_missing_ =
+      IsFlagPresent(flags, ARG_CREATE_IF_MISSING) ||
+      ParseBooleanOption(options, ARG_CREATE_IF_MISSING, false);
+  move_files_ = IsFlagPresent(flags, ARG_MOVE_FILES) ||
+                ParseBooleanOption(options, ARG_MOVE_FILES, false);
+  snapshot_consistency_ =
+      IsFlagPresent(flags, ARG_SNAPSHOT_CONSISTENCY) ||
+      ParseBooleanOption(options, ARG_SNAPSHOT_CONSISTENCY, true);
+  allow_global_seqno_ =
+      IsFlagPresent(flags, ARG_ALLOW_GLOBAL_SEQNO) ||
+      ParseBooleanOption(options, ARG_ALLOW_GLOBAL_SEQNO, true);
+  allow_blocking_flush_ =
+      IsFlagPresent(flags, ARG_ALLOW_BLOCKING_FLUSH) ||
+      ParseBooleanOption(options, ARG_ALLOW_BLOCKING_FLUSH, true);
+  ingest_behind_ = IsFlagPresent(flags, ARG_INGEST_BEHIND) ||
+                   ParseBooleanOption(options, ARG_INGEST_BEHIND, false);
+  write_global_seqno_ =
+      IsFlagPresent(flags, ARG_WRITE_GLOBAL_SEQNO) ||
+      ParseBooleanOption(options, ARG_WRITE_GLOBAL_SEQNO, true);
+
+  if (allow_global_seqno_) {
+    if (!write_global_seqno_) {
+      fprintf(stderr,
+              "Warning: not writing global_seqno to the ingested SST can\n"
+              "prevent older versions of RocksDB from being able to open it\n");
+    }
+  } else {
+    if (write_global_seqno_) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "ldb cannot write global_seqno to the ingested SST when global_seqno "
+          "is not allowed");
+    }
+  }
+
+  if (params.size() != 1) {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("input SST path must be specified");
+  } else {
+    input_sst_path_ = params.at(0);
+  }
+}
+
+void IngestExternalSstFilesCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+  if (GetExecuteState().IsFailed()) {
+    return;
+  }
+  ColumnFamilyHandle* cfh = GetCfHandle();
+  IngestExternalFileOptions ifo;
+  ifo.move_files = move_files_;
+  ifo.snapshot_consistency = snapshot_consistency_;
+  ifo.allow_global_seqno = allow_global_seqno_;
+  ifo.allow_blocking_flush = allow_blocking_flush_;
+  ifo.ingest_behind = ingest_behind_;
+  ifo.write_global_seqno = write_global_seqno_;
+  Status status = db_->IngestExternalFile(cfh, {input_sst_path_}, ifo);
+  if (!status.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "failed to ingest external SST: " + status.ToString());
+  } else {
+    exec_state_ =
+        LDBCommandExecuteResult::Succeed("external SST files ingested");
+  }
+}
+
+void IngestExternalSstFilesCommand::OverrideBaseOptions() {
+  LDBCommand::OverrideBaseOptions();
+  options_.create_if_missing = create_if_missing_;
+}
+
+ListFileRangeDeletesCommand::ListFileRangeDeletesCommand(
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_MAX_KEYS})) {
+  auto itr = options.find(ARG_MAX_KEYS);
+  if (itr != options.end()) {
+    try {
+#if defined(CYGWIN)
+      max_keys_ = strtol(itr->second.c_str(), 0, 10);
+#else
+      max_keys_ = std::stoi(itr->second);
+#endif
+    } catch (const std::invalid_argument&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS +
+                                                    " has an invalid value");
+    } catch (const std::out_of_range&) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          ARG_MAX_KEYS + " has a value out-of-range");
+    }
+  }
+}
+
+void ListFileRangeDeletesCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(ListFileRangeDeletesCommand::Name());
+  ret.append(" [--" + ARG_MAX_KEYS + "=<N>]");
+  ret.append(" : print tombstones in SST files.\n");
+}
+
+void ListFileRangeDeletesCommand::DoCommand() {
+  if (!db_) {
+    assert(GetExecuteState().IsFailed());
+    return;
+  }
+
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+
+  std::string out_str;
+
+  Status st =
+      db_impl->TablesRangeTombstoneSummary(GetCfHandle(), max_keys_, &out_str);
+  if (st.ok()) {
+    TEST_SYNC_POINT_CALLBACK(
+        "ListFileRangeDeletesCommand::DoCommand:BeforePrint", &out_str);
+    fprintf(stdout, "%s\n", out_str.c_str());
+  }
+}
+
+void UnsafeRemoveSstFileCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(UnsafeRemoveSstFileCommand::Name());
+  ret.append(" <SST file number>");
+  ret.append("  ");
+  ret.append("    MUST NOT be used on a live DB.");
+  ret.append("\n");
+}
+
+UnsafeRemoveSstFileCommand::UnsafeRemoveSstFileCommand(
+    const std::vector<std::string>& params,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false /* is_read_only */,
+                 BuildCmdLineOptions({})) {
+  if (params.size() != 1) {
+    exec_state_ =
+        LDBCommandExecuteResult::Failed("SST file number must be specified");
+  } else {
+    char* endptr = nullptr;
+    sst_file_number_ = strtoull(params.at(0).c_str(), &endptr, 10 /* base */);
+    if (endptr == nullptr || *endptr != '\0') {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Failed to parse SST file number " + params.at(0));
+    }
+  }
+}
+
+void UnsafeRemoveSstFileCommand::DoCommand() {
+  PrepareOptions();
+
+  OfflineManifestWriter w(options_, db_path_);
+  if (column_families_.empty()) {
+    column_families_.emplace_back(kDefaultColumnFamilyName, options_);
+  }
+  Status s = w.Recover(column_families_);
+
+  ColumnFamilyData* cfd = nullptr;
+  int level = -1;
+  if (s.ok()) {
+    FileMetaData* metadata = nullptr;
+    s = w.Versions().GetMetadataForFile(sst_file_number_, &level, &metadata,
+                                        &cfd);
+  }
+
+  if (s.ok()) {
+    VersionEdit edit;
+    edit.SetColumnFamily(cfd->GetID());
+    edit.DeleteFile(level, sst_file_number_);
+    std::unique_ptr<FSDirectory> db_dir;
+    s = options_.env->GetFileSystem()->NewDirectory(db_path_, IOOptions(),
+                                                    &db_dir, nullptr);
+    if (s.ok()) {
+      s = w.LogAndApply(cfd, &edit, db_dir.get());
+    }
+  }
+
+  if (!s.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "failed to unsafely remove SST file: " + s.ToString());
+  } else {
+    exec_state_ = LDBCommandExecuteResult::Succeed("unsafely removed SST file");
+  }
+}
+
+const std::string UpdateManifestCommand::ARG_VERBOSE = "verbose";
+const std::string UpdateManifestCommand::ARG_UPDATE_TEMPERATURES =
+    "update_temperatures";
+
+void UpdateManifestCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(UpdateManifestCommand::Name());
+  ret.append(" [--update_temperatures]");
+  ret.append("  ");
+  ret.append("    MUST NOT be used on a live DB.");
+  ret.append("\n");
+}
+
+UpdateManifestCommand::UpdateManifestCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false /* is_read_only */,
+                 BuildCmdLineOptions({ARG_VERBOSE, ARG_UPDATE_TEMPERATURES})) {
+  verbose_ = IsFlagPresent(flags, ARG_VERBOSE) ||
+             ParseBooleanOption(options, ARG_VERBOSE, false);
+  update_temperatures_ =
+      IsFlagPresent(flags, ARG_UPDATE_TEMPERATURES) ||
+      ParseBooleanOption(options, ARG_UPDATE_TEMPERATURES, false);
+
+  if (!update_temperatures_) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "No action like --update_temperatures specified for update_manifest");
+  }
+}
+
+void UpdateManifestCommand::DoCommand() {
+  PrepareOptions();
+
+  auto level = verbose_ ? InfoLogLevel::INFO_LEVEL : InfoLogLevel::WARN_LEVEL;
+  options_.info_log.reset(new StderrLogger(level));
+
+  experimental::UpdateManifestForFilesStateOptions opts;
+  opts.update_temperatures = update_temperatures_;
+  if (column_families_.empty()) {
+    column_families_.emplace_back(kDefaultColumnFamilyName, options_);
+  }
+  Status s = experimental::UpdateManifestForFilesState(options_, db_path_,
+                                                       column_families_);
+
+  if (!s.ok()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "failed to update manifest: " + s.ToString());
+  } else {
+    exec_state_ =
+        LDBCommandExecuteResult::Succeed("Manifest updates successful");
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/ldb_cmd_impl.h b/src/rocksdb/tools/ldb_cmd_impl.h
new file mode 100644
index 000000000..97de981b1
--- /dev/null
+++ b/src/rocksdb/tools/ldb_cmd_impl.h
@@ -0,0 +1,744 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/utilities/ldb_cmd.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactorCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "compact"; }
+
+  CompactorCommand(const std::vector<std::string>& params,
+                   const std::map<std::string, std::string>& options,
+                   const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  bool null_from_;
+  std::string from_;
+  bool null_to_;
+  std::string to_;
+};
+
+class DBFileDumperCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "dump_live_files"; }
+
+  DBFileDumperCommand(const std::vector<std::string>& params,
+                      const std::map<std::string, std::string>& options,
+                      const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  bool decode_blob_index_;
+  bool dump_uncompressed_blobs_;
+};
+
+class DBLiveFilesMetadataDumperCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "list_live_files_metadata"; }
+
+  DBLiveFilesMetadataDumperCommand(
+      const std::vector<std::string>& params,
+      const std::map<std::string, std::string>& options,
+      const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  bool sort_by_filename_;
+
+  static const std::string ARG_SORT_BY_FILENAME;
+};
+
+class DBDumperCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "dump"; }
+
+  DBDumperCommand(const std::vector<std::string>& params,
+                  const std::map<std::string, std::string>& options,
+                  const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  /**
+   * Extract file name from the full path. We handle both the forward slash (/)
+   * and backslash (\) to make sure that different OS-s are supported.
+   */
+  static std::string GetFileNameFromPath(const std::string& s) {
+    std::size_t n = s.find_last_of("/\\");
+
+    if (std::string::npos == n) {
+      return s;
+    } else {
+      return s.substr(n + 1);
+    }
+  }
+
+  void DoDumpCommand();
+
+  bool null_from_;
+  std::string from_;
+  bool null_to_;
+  std::string to_;
+  int max_keys_;
+  std::string delim_;
+  bool count_only_;
+  bool count_delim_;
+  bool print_stats_;
+  std::string path_;
+  bool decode_blob_index_;
+  bool dump_uncompressed_blobs_;
+
+  static const std::string ARG_COUNT_ONLY;
+  static const std::string ARG_COUNT_DELIM;
+  static const std::string ARG_STATS;
+  static const std::string ARG_TTL_BUCKET;
+};
+
+class InternalDumpCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "idump"; }
+
+  InternalDumpCommand(const std::vector<std::string>& params,
+                      const std::map<std::string, std::string>& options,
+                      const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  bool has_from_;
+  std::string from_;
+  bool has_to_;
+  std::string to_;
+  int max_keys_;
+  std::string delim_;
+  bool count_only_;
+  bool count_delim_;
+  bool print_stats_;
+  bool is_input_key_hex_;
+  bool decode_blob_index_;
+
+  static const std::string ARG_DELIM;
+  static const std::string ARG_COUNT_ONLY;
+  static const std::string ARG_COUNT_DELIM;
+  static const std::string ARG_STATS;
+  static const std::string ARG_INPUT_KEY_HEX;
+};
+
+class DBLoaderCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "load"; }
+
+  DBLoaderCommand(std::string& db_name, std::vector<std::string>& args);
+
+  DBLoaderCommand(const std::vector<std::string>& params,
+                  const std::map<std::string, std::string>& options,
+                  const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+  void OverrideBaseOptions() override;
+
+ private:
+  bool disable_wal_;
+  bool bulk_load_;
+  bool compact_;
+
+  static const std::string ARG_DISABLE_WAL;
+  static const std::string ARG_BULK_LOAD;
+  static const std::string ARG_COMPACT;
+};
+
+class ManifestDumpCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "manifest_dump"; }
+
+  ManifestDumpCommand(const std::vector<std::string>& params,
+                      const std::map<std::string, std::string>& options,
+                      const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+ private:
+  bool verbose_;
+  bool json_;
+  std::string path_;
+
+  static const std::string ARG_VERBOSE;
+  static const std::string ARG_JSON;
+  static const std::string ARG_PATH;
+};
+
+class UpdateManifestCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "update_manifest"; }
+
+  UpdateManifestCommand(const std::vector<std::string>& params,
+                        const std::map<std::string, std::string>& options,
+                        const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+  virtual void DoCommand() override;
+
+  virtual bool NoDBOpen() override { return true; }
+
+ private:
+  bool verbose_;
+  bool update_temperatures_;
+  // TODO future: checksum_func for populating checksums
+
+  static const std::string ARG_VERBOSE;
+  static const std::string ARG_UPDATE_TEMPERATURES;
+};
+
+class FileChecksumDumpCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "file_checksum_dump"; }
+
+  FileChecksumDumpCommand(const std::vector<std::string>& params,
+                          const std::map<std::string, std::string>& options,
+                          const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+ private:
+  std::string path_;
+  bool is_checksum_hex_;
+
+  static const std::string ARG_PATH;
+};
+
+class GetPropertyCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "get_property"; }
+
+  GetPropertyCommand(const std::vector<std::string>& params,
+                     const std::map<std::string, std::string>& options,
+                     const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+  void DoCommand() override;
+
+ private:
+  std::string property_;
+};
+
+class ListColumnFamiliesCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "list_column_families"; }
+
+  ListColumnFamiliesCommand(const std::vector<std::string>& params,
+                            const std::map<std::string, std::string>& options,
+                            const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+};
+
+class CreateColumnFamilyCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "create_column_family"; }
+
+  CreateColumnFamilyCommand(const std::vector<std::string>& params,
+                            const std::map<std::string, std::string>& options,
+                            const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return false; }
+
+ private:
+  std::string new_cf_name_;
+};
+
+class DropColumnFamilyCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "drop_column_family"; }
+
+  DropColumnFamilyCommand(const std::vector<std::string>& params,
+                          const std::map<std::string, std::string>& options,
+                          const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return false; }
+
+ private:
+  std::string cf_name_to_drop_;
+};
+
+class ReduceDBLevelsCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "reduce_levels"; }
+
+  ReduceDBLevelsCommand(const std::vector<std::string>& params,
+                        const std::map<std::string, std::string>& options,
+                        const std::vector<std::string>& flags);
+
+  void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) override;
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+  static void Help(std::string& msg);
+
+  static std::vector<std::string> PrepareArgs(const std::string& db_path,
+                                              int new_levels,
+                                              bool print_old_level = false);
+
+ private:
+  int old_levels_;
+  int new_levels_;
+  bool print_old_levels_;
+
+  static const std::string ARG_NEW_LEVELS;
+  static const std::string ARG_PRINT_OLD_LEVELS;
+
+  Status GetOldNumOfLevels(Options& opt, int* levels);
+};
+
+class ChangeCompactionStyleCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "change_compaction_style"; }
+
+  ChangeCompactionStyleCommand(
+      const std::vector<std::string>& params,
+      const std::map<std::string, std::string>& options,
+      const std::vector<std::string>& flags);
+
+  void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) override;
+
+  void DoCommand() override;
+
+  static void Help(std::string& msg);
+
+ private:
+  int old_compaction_style_;
+  int new_compaction_style_;
+
+  static const std::string ARG_OLD_COMPACTION_STYLE;
+  static const std::string ARG_NEW_COMPACTION_STYLE;
+};
+
+class WALDumperCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "dump_wal"; }
+
+  WALDumperCommand(const std::vector<std::string>& params,
+                   const std::map<std::string, std::string>& options,
+                   const std::vector<std::string>& flags);
+
+  bool NoDBOpen() override { return true; }
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  bool print_header_;
+  std::string wal_file_;
+  bool print_values_;
+  bool is_write_committed_;  // default will be set to true
+
+  static const std::string ARG_WAL_FILE;
+  static const std::string ARG_WRITE_COMMITTED;
+  static const std::string ARG_PRINT_HEADER;
+  static const std::string ARG_PRINT_VALUE;
+};
+
+class GetCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "get"; }
+
+  GetCommand(const std::vector<std::string>& params,
+             const std::map<std::string, std::string>& options,
+             const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string key_;
+};
+
+class ApproxSizeCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "approxsize"; }
+
+  ApproxSizeCommand(const std::vector<std::string>& params,
+                    const std::map<std::string, std::string>& options,
+                    const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string start_key_;
+  std::string end_key_;
+};
+
+class BatchPutCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "batchput"; }
+
+  BatchPutCommand(const std::vector<std::string>& params,
+                  const std::map<std::string, std::string>& options,
+                  const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+  void OverrideBaseOptions() override;
+
+ private:
+  /**
+   * The key-values to be inserted.
+   */
+  std::vector<std::pair<std::string, std::string>> key_values_;
+};
+
+class ScanCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "scan"; }
+
+  ScanCommand(const std::vector<std::string>& params,
+              const std::map<std::string, std::string>& options,
+              const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string start_key_;
+  std::string end_key_;
+  bool start_key_specified_;
+  bool end_key_specified_;
+  int max_keys_scanned_;
+  bool no_value_;
+};
+
+class DeleteCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "delete"; }
+
+  DeleteCommand(const std::vector<std::string>& params,
+                const std::map<std::string, std::string>& options,
+                const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string key_;
+};
+
+class SingleDeleteCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "singledelete"; }
+
+  SingleDeleteCommand(const std::vector<std::string>& params,
+                      const std::map<std::string, std::string>& options,
+                      const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string key_;
+};
+
+class DeleteRangeCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "deleterange"; }
+
+  DeleteRangeCommand(const std::vector<std::string>& params,
+                     const std::map<std::string, std::string>& options,
+                     const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string begin_key_;
+  std::string end_key_;
+};
+
+class PutCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "put"; }
+
+  PutCommand(const std::vector<std::string>& params,
+             const std::map<std::string, std::string>& options,
+             const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+  void OverrideBaseOptions() override;
+
+ private:
+  std::string key_;
+  std::string value_;
+};
+
+/**
+ * Command that starts up a REPL shell that allows
+ * get/put/delete.
+ */
+class DBQuerierCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "query"; }
+
+  DBQuerierCommand(const std::vector<std::string>& params,
+                   const std::map<std::string, std::string>& options,
+                   const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+ private:
+  static const char* HELP_CMD;
+  static const char* GET_CMD;
+  static const char* PUT_CMD;
+  static const char* DELETE_CMD;
+};
+
+class CheckConsistencyCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "checkconsistency"; }
+
+  CheckConsistencyCommand(const std::vector<std::string>& params,
+                          const std::map<std::string, std::string>& options,
+                          const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+  static void Help(std::string& ret);
+};
+
+class CheckPointCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "checkpoint"; }
+
+  CheckPointCommand(const std::vector<std::string>& params,
+                    const std::map<std::string, std::string>& options,
+                    const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+  std::string checkpoint_dir_;
+
+ private:
+  static const std::string ARG_CHECKPOINT_DIR;
+};
+
+class RepairCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "repair"; }
+
+  RepairCommand(const std::vector<std::string>& params,
+                const std::map<std::string, std::string>& options,
+                const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+  void OverrideBaseOptions() override;
+
+  static void Help(std::string& ret);
+
+ protected:
+  bool verbose_;
+
+ private:
+  static const std::string ARG_VERBOSE;
+};
+
+class BackupEngineCommand : public LDBCommand {
+ public:
+  BackupEngineCommand(const std::vector<std::string>& params,
+                      const std::map<std::string, std::string>& options,
+                      const std::vector<std::string>& flags);
+
+ protected:
+  static void Help(const std::string& name, std::string& ret);
+  std::string backup_env_uri_;
+  std::string backup_fs_uri_;
+  std::string backup_dir_;
+  int num_threads_;
+  std::unique_ptr<Logger> logger_;
+  std::shared_ptr<Env> backup_env_guard_;
+
+ private:
+  static const std::string ARG_BACKUP_DIR;
+  static const std::string ARG_BACKUP_ENV_URI;
+  static const std::string ARG_BACKUP_FS_URI;
+  static const std::string ARG_NUM_THREADS;
+  static const std::string ARG_STDERR_LOG_LEVEL;
+};
+
+class BackupCommand : public BackupEngineCommand {
+ public:
+  static std::string Name() { return "backup"; }
+  BackupCommand(const std::vector<std::string>& params,
+                const std::map<std::string, std::string>& options,
+                const std::vector<std::string>& flags);
+  void DoCommand() override;
+  static void Help(std::string& ret);
+};
+
+class RestoreCommand : public BackupEngineCommand {
+ public:
+  static std::string Name() { return "restore"; }
+  RestoreCommand(const std::vector<std::string>& params,
+                 const std::map<std::string, std::string>& options,
+                 const std::vector<std::string>& flags);
+  void DoCommand() override;
+  bool NoDBOpen() override { return true; }
+  static void Help(std::string& ret);
+};
+
+class WriteExternalSstFilesCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "write_extern_sst"; }
+  WriteExternalSstFilesCommand(
+      const std::vector<std::string>& params,
+      const std::map<std::string, std::string>& options,
+      const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return false; }
+
+  void OverrideBaseOptions() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string output_sst_path_;
+};
+
+class IngestExternalSstFilesCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "ingest_extern_sst"; }
+  IngestExternalSstFilesCommand(
+      const std::vector<std::string>& params,
+      const std::map<std::string, std::string>& options,
+      const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return false; }
+
+  void OverrideBaseOptions() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  std::string input_sst_path_;
+  bool move_files_;
+  bool snapshot_consistency_;
+  bool allow_global_seqno_;
+  bool allow_blocking_flush_;
+  bool ingest_behind_;
+  bool write_global_seqno_;
+
+  static const std::string ARG_MOVE_FILES;
+  static const std::string ARG_SNAPSHOT_CONSISTENCY;
+  static const std::string ARG_ALLOW_GLOBAL_SEQNO;
+  static const std::string ARG_ALLOW_BLOCKING_FLUSH;
+  static const std::string ARG_INGEST_BEHIND;
+  static const std::string ARG_WRITE_GLOBAL_SEQNO;
+};
+
+// Command that prints out range delete tombstones in SST files.
+class ListFileRangeDeletesCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "list_file_range_deletes"; }
+
+  ListFileRangeDeletesCommand(const std::map<std::string, std::string>& options,
+                              const std::vector<std::string>& flags);
+
+  void DoCommand() override;
+
+  static void Help(std::string& ret);
+
+ private:
+  int max_keys_ = 1000;
+};
+
+// Command that removes the SST file forcibly from the manifest.
+class UnsafeRemoveSstFileCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "unsafe_remove_sst_file"; }
+
+  UnsafeRemoveSstFileCommand(const std::vector<std::string>& params,
+                             const std::map<std::string, std::string>& options,
+                             const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+ private:
+  uint64_t sst_file_number_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/tools/ldb_cmd_test.cc b/src/rocksdb/tools/ldb_cmd_test.cc
new file mode 100644
index 000000000..5d83a6cd9
--- /dev/null
+++ b/src/rocksdb/tools/ldb_cmd_test.cc
@@ -0,0 +1,1226 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/ldb_cmd.h"
+
+#include <cinttypes>
+
+#include "db/db_test_util.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/utilities/options_util.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/file_checksum_helper.h"
+#include "util/random.h"
+
+using std::map;
+using std::string;
+using std::vector;
+
+namespace ROCKSDB_NAMESPACE {
+
+class LdbCmdTest : public testing::Test {
+ public:
+  LdbCmdTest() : testing::Test() {}
+
+  Env* TryLoadCustomOrDefaultEnv() {
+    Env* env = Env::Default();
+    EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard_));
+    return env;
+  }
+
+ private:
+  std::shared_ptr<Env> env_guard_;
+};
+
+TEST_F(LdbCmdTest, HelpAndVersion) {
+  Options o;
+  o.env = TryLoadCustomOrDefaultEnv();
+  LDBOptions lo;
+  static const char* help[] = {"./ldb", "--help"};
+  ASSERT_EQ(0, LDBCommandRunner::RunCommand(2, help, o, lo, nullptr));
+  static const char* version[] = {"./ldb", "--version"};
+  ASSERT_EQ(0, LDBCommandRunner::RunCommand(2, version, o, lo, nullptr));
+  static const char* bad[] = {"./ldb", "--not_an_option"};
+  ASSERT_NE(0, LDBCommandRunner::RunCommand(2, bad, o, lo, nullptr));
+}
+
+TEST_F(LdbCmdTest, HexToString) {
+  // map input to expected outputs.
+  // odd number of "hex" half bytes doesn't make sense
+  map<string, vector<int>> inputMap = {
+      {"0x07", {7}},        {"0x5050", {80, 80}},          {"0xFF", {-1}},
+      {"0x1234", {18, 52}}, {"0xaaAbAC", {-86, -85, -84}}, {"0x1203", {18, 3}},
+  };
+
+  for (const auto& inPair : inputMap) {
+    auto actual = ROCKSDB_NAMESPACE::LDBCommand::HexToString(inPair.first);
+    auto expected = inPair.second;
+    for (unsigned int i = 0; i < actual.length(); i++) {
+      EXPECT_EQ(expected[i], static_cast<int>((signed char)actual[i]));
+    }
+    auto reverse = ROCKSDB_NAMESPACE::LDBCommand::StringToHex(actual);
+    EXPECT_STRCASEEQ(inPair.first.c_str(), reverse.c_str());
+  }
+}
+
+TEST_F(LdbCmdTest, HexToStringBadInputs) {
+  const vector<string> badInputs = {
+      "0xZZ", "123", "0xx5", "0x111G", "0x123", "Ox12", "0xT", "0x1Q1",
+  };
+  for (const auto& badInput : badInputs) {
+    try {
+      ROCKSDB_NAMESPACE::LDBCommand::HexToString(badInput);
+      std::cerr << "Should fail on bad hex value: " << badInput << "\n";
+      FAIL();
+    } catch (...) {
+    }
+  }
+}
+
+TEST_F(LdbCmdTest, MemEnv) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+
+  DB* db = nullptr;
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  WriteOptions wopts;
+  for (int i = 0; i < 100; i++) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", i);
+    ASSERT_OK(db->Put(wopts, buf, buf));
+  }
+  FlushOptions fopts;
+  fopts.wait = true;
+  ASSERT_OK(db->Flush(fopts));
+
+  delete db;
+
+  char arg1[] = "./ldb";
+  char arg2[1024];
+  snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+  char arg3[] = "dump_live_files";
+  char* argv[] = {arg1, arg2, arg3};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
+}
+
+class FileChecksumTestHelper {
+ private:
+  Options options_;
+  DB* db_;
+  std::string dbname_;
+
+  Status VerifyChecksum(LiveFileMetaData& file_meta) {
+    std::string cur_checksum;
+    std::string checksum_func_name;
+
+    Status s;
+    EnvOptions soptions;
+    std::unique_ptr<SequentialFile> file_reader;
+    std::string file_path = dbname_ + "/" + file_meta.name;
+    s = options_.env->NewSequentialFile(file_path, &file_reader, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+    std::unique_ptr<char[]> scratch(new char[2048]);
+    Slice result;
+    FileChecksumGenFactory* file_checksum_gen_factory =
+        options_.file_checksum_gen_factory.get();
+    if (file_checksum_gen_factory == nullptr) {
+      cur_checksum = kUnknownFileChecksum;
+      checksum_func_name = kUnknownFileChecksumFuncName;
+    } else {
+      FileChecksumGenContext gen_context;
+      gen_context.file_name = file_meta.name;
+      std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+          file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context);
+      checksum_func_name = file_checksum_gen->Name();
+      s = file_reader->Read(2048, &result, scratch.get());
+      if (!s.ok()) {
+        return s;
+      }
+      while (result.size() != 0) {
+        file_checksum_gen->Update(scratch.get(), result.size());
+        s = file_reader->Read(2048, &result, scratch.get());
+        if (!s.ok()) {
+          return s;
+        }
+      }
+      file_checksum_gen->Finalize();
+      cur_checksum = file_checksum_gen->GetChecksum();
+    }
+
+    std::string stored_checksum = file_meta.file_checksum;
+    std::string stored_checksum_func_name = file_meta.file_checksum_func_name;
+    if ((cur_checksum != stored_checksum) ||
+        (checksum_func_name != stored_checksum_func_name)) {
+      return Status::Corruption(
+          "Checksum does not match! The file: " + file_meta.name +
+          ", checksum name: " + stored_checksum_func_name + " and checksum " +
+          stored_checksum + ". However, expected checksum name: " +
+          checksum_func_name + " and checksum " + cur_checksum);
+    }
+    return Status::OK();
+  }
+
+ public:
+  FileChecksumTestHelper(Options& options, DB* db, std::string db_name)
+      : options_(options), db_(db), dbname_(db_name) {}
+  ~FileChecksumTestHelper() {}
+
+  // Verify the checksum information in Manifest.
+  Status VerifyChecksumInManifest(
+      const std::vector<LiveFileMetaData>& live_files) {
+    // Step 1: verify if the dbname_ is correct
+    if (dbname_.back() != '/') {
+      dbname_.append("/");
+    }
+
+    // Step 2, get the the checksum information by recovering the VersionSet
+    // from Manifest.
+    std::unique_ptr<FileChecksumList> checksum_list(NewFileChecksumList());
+    EnvOptions sopt;
+    std::shared_ptr<Cache> tc(NewLRUCache(options_.max_open_files - 10,
+                                          options_.table_cache_numshardbits));
+    options_.db_paths.emplace_back(dbname_, 0);
+    options_.num_levels = 64;
+    WriteController wc(options_.delayed_write_rate);
+    WriteBufferManager wb(options_.db_write_buffer_size);
+    ImmutableDBOptions immutable_db_options(options_);
+    VersionSet versions(dbname_, &immutable_db_options, sopt, tc.get(), &wb,
+                        &wc, nullptr, nullptr, "", "");
+    std::vector<std::string> cf_name_list;
+    Status s;
+    s = versions.ListColumnFamilies(&cf_name_list, dbname_,
+                                    immutable_db_options.fs.get());
+    if (s.ok()) {
+      std::vector<ColumnFamilyDescriptor> cf_list;
+      for (const auto& name : cf_name_list) {
+        fprintf(stdout, "cf_name: %s", name.c_str());
+        cf_list.emplace_back(name, ColumnFamilyOptions(options_));
+      }
+      s = versions.Recover(cf_list, true);
+    }
+    if (s.ok()) {
+      s = versions.GetLiveFilesChecksumInfo(checksum_list.get());
+    }
+    if (!s.ok()) {
+      return s;
+    }
+
+    // Step 3 verify the checksum
+    if (live_files.size() != checksum_list->size()) {
+      return Status::Corruption("The number of files does not match!");
+    }
+    for (size_t i = 0; i < live_files.size(); i++) {
+      std::string stored_checksum = "";
+      std::string stored_func_name = "";
+      s = checksum_list->SearchOneFileChecksum(
+          live_files[i].file_number, &stored_checksum, &stored_func_name);
+      if (s.IsNotFound()) {
+        return s;
+      }
+      if (live_files[i].file_checksum != stored_checksum ||
+          live_files[i].file_checksum_func_name != stored_func_name) {
+        return Status::Corruption(
+            "Checksum does not match! The file: " +
+            std::to_string(live_files[i].file_number) +
+            ". In Manifest, checksum name: " + stored_func_name +
+            " and checksum " + stored_checksum +
+            ". However, expected checksum name: " +
+            live_files[i].file_checksum_func_name + " and checksum " +
+            live_files[i].file_checksum);
+      }
+    }
+    return Status::OK();
+  }
+
+  // Verify the checksum of each file by recalculting the checksum and
+  // comparing it with the one being generated when a SST file is created.
+  Status VerifyEachFileChecksum() {
+    assert(db_ != nullptr);
+    EXPECT_OK(db_->DisableFileDeletions());
+    std::vector<LiveFileMetaData> live_files;
+    db_->GetLiveFilesMetaData(&live_files);
+    Status cs;
+    for (auto a_file : live_files) {
+      cs = VerifyChecksum(a_file);
+      if (!cs.ok()) {
+        break;
+      }
+    }
+    EXPECT_OK(db_->EnableFileDeletions());
+    return cs;
+  }
+};
+
+TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+
+  DB* db = nullptr;
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  WriteOptions wopts;
+  FlushOptions fopts;
+  fopts.wait = true;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 200; i++) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, buf, v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 100; i < 300; i++) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, buf, v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 200; i < 400; i++) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, buf, v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 300; i < 400; i++) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, buf, v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  ASSERT_OK(db->Close());
+  delete db;
+
+  char arg1[] = "./ldb";
+  char arg2[1024];
+  snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+  char arg3[] = "file_checksum_dump";
+  char arg4[] = "--hex";
+  char* argv[] = {arg1, arg2, arg3, arg4};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  // Verify each sst file checksum value and checksum name
+  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  ASSERT_OK(fct_helper.VerifyEachFileChecksum());
+
+  // Manually trigger compaction
+  char b_buf[16];
+  snprintf(b_buf, sizeof(b_buf), "%08d", 0);
+  char e_buf[16];
+  snprintf(e_buf, sizeof(e_buf), "%08d", 399);
+  Slice begin(b_buf);
+  Slice end(e_buf);
+  CompactRangeOptions options;
+  ASSERT_OK(db->CompactRange(options, &begin, &end));
+  // Verify each sst file checksum after compaction
+  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
+
+  ASSERT_OK(db->Close());
+  delete db;
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  // Verify the checksum information in memory is the same as that in Manifest;
+  std::vector<LiveFileMetaData> live_files;
+  db->GetLiveFilesMetaData(&live_files);
+  delete db;
+  ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files));
+}
+
+TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+  opts.enable_blob_files = true;
+
+  DB* db = nullptr;
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  WriteOptions wopts;
+  FlushOptions fopts;
+  fopts.wait = true;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 200; i++) {
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, oss.str(), v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 100; i < 300; i++) {
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, oss.str(), v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 200; i < 400; i++) {
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, oss.str(), v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 300; i < 400; i++) {
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, oss.str(), v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  ASSERT_OK(db->Close());
+  delete db;
+
+  char arg1[] = "./ldb";
+  std::string arg2_str = "--db=" + dbname;
+  char arg3[] = "file_checksum_dump";
+  char arg4[] = "--hex";
+  char* argv[] = {arg1, const_cast<char*>(arg2_str.c_str()), arg3, arg4};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  // Verify each sst and blob file checksum value and checksum name
+  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  ASSERT_OK(fct_helper.VerifyEachFileChecksum());
+
+  // Manually trigger compaction
+  std::ostringstream oss_b_buf;
+  oss_b_buf << std::setfill('0') << std::setw(8) << std::fixed << 0;
+  std::ostringstream oss_e_buf;
+  oss_e_buf << std::setfill('0') << std::setw(8) << std::fixed << 399;
+  std::string b_buf = oss_b_buf.str();
+  std::string e_buf = oss_e_buf.str();
+  Slice begin(b_buf);
+  Slice end(e_buf);
+
+  CompactRangeOptions options;
+  ASSERT_OK(db->CompactRange(options, &begin, &end));
+  // Verify each sst file checksum after compaction
+  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
+
+  ASSERT_OK(db->Close());
+  delete db;
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+}
+
+TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+  opts.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+
+  DB* db = nullptr;
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  WriteOptions wopts;
+  FlushOptions fopts;
+  fopts.wait = true;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 100; i++) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, buf, v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 50; i < 150; i++) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, buf, v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 100; i < 200; i++) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, buf, v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 150; i < 250; i++) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%08d", i);
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, buf, v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  ASSERT_OK(db->Close());
+  delete db;
+
+  char arg1[] = "./ldb";
+  char arg2[1024];
+  snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+  char arg3[] = "file_checksum_dump";
+  char arg4[] = "--hex";
+  char* argv[] = {arg1, arg2, arg3, arg4};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  // Verify each sst file checksum value and checksum name
+  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  ASSERT_OK(fct_helper.VerifyEachFileChecksum());
+
+  // Manually trigger compaction
+  char b_buf[16];
+  snprintf(b_buf, sizeof(b_buf), "%08d", 0);
+  char e_buf[16];
+  snprintf(e_buf, sizeof(e_buf), "%08d", 249);
+  Slice begin(b_buf);
+  Slice end(e_buf);
+  CompactRangeOptions options;
+  ASSERT_OK(db->CompactRange(options, &begin, &end));
+  // Verify each sst file checksum after compaction
+  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
+
+  ASSERT_OK(db->Close());
+  delete db;
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  // Verify the checksum information in memory is the same as that in Manifest;
+  std::vector<LiveFileMetaData> live_files;
+  db->GetLiveFilesMetaData(&live_files);
+  ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files));
+
+  ASSERT_OK(db->Close());
+  delete db;
+}
+
+TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+  opts.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  opts.enable_blob_files = true;
+
+  DB* db = nullptr;
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  WriteOptions wopts;
+  FlushOptions fopts;
+  fopts.wait = true;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 100; i++) {
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, oss.str(), v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 50; i < 150; i++) {
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, oss.str(), v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 100; i < 200; i++) {
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, oss.str(), v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  for (int i = 150; i < 250; i++) {
+    std::ostringstream oss;
+    oss << std::setfill('0') << std::setw(8) << std::fixed << i;
+    std::string v = rnd.RandomString(100);
+    ASSERT_OK(db->Put(wopts, oss.str(), v));
+  }
+  ASSERT_OK(db->Flush(fopts));
+  ASSERT_OK(db->Close());
+  delete db;
+
+  char arg1[] = "./ldb";
+  std::string arg2_str = "--db=" + dbname;
+  char arg3[] = "file_checksum_dump";
+  char arg4[] = "--hex";
+  char* argv[] = {arg1, const_cast<char*>(arg2_str.c_str()), arg3, arg4};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  // Verify each sst and blob file checksum value and checksum name
+  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  ASSERT_OK(fct_helper.VerifyEachFileChecksum());
+
+  // Manually trigger compaction
+  std::ostringstream oss_b_buf;
+  oss_b_buf << std::setfill('0') << std::setw(8) << std::fixed << 0;
+  std::ostringstream oss_e_buf;
+  oss_e_buf << std::setfill('0') << std::setw(8) << std::fixed << 249;
+  std::string b_buf = oss_b_buf.str();
+  std::string e_buf = oss_e_buf.str();
+  Slice begin(b_buf);
+  Slice end(e_buf);
+
+  CompactRangeOptions options;
+  ASSERT_OK(db->CompactRange(options, &begin, &end));
+  // Verify each sst file checksum after compaction
+  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
+
+  ASSERT_OK(db->Close());
+  delete db;
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+}
+
+TEST_F(LdbCmdTest, OptionParsing) {
+  // test parsing flags
+  Options opts;
+  opts.env = TryLoadCustomOrDefaultEnv();
+  {
+    std::vector<std::string> args;
+    args.push_back("scan");
+    args.push_back("--ttl");
+    args.push_back("--timestamp");
+    LDBCommand* command = ROCKSDB_NAMESPACE::LDBCommand::InitFromCmdLineArgs(
+        args, opts, LDBOptions(), nullptr);
+    const std::vector<std::string> flags = command->TEST_GetFlags();
+    EXPECT_EQ(flags.size(), 2);
+    EXPECT_EQ(flags[0], "ttl");
+    EXPECT_EQ(flags[1], "timestamp");
+    delete command;
+  }
+  // test parsing options which contains equal sign in the option value
+  {
+    std::vector<std::string> args;
+    args.push_back("scan");
+    args.push_back("--db=/dev/shm/ldbtest/");
+    args.push_back(
+        "--from='abcd/efg/hijk/lmn/"
+        "opq:__rst.uvw.xyz?a=3+4+bcd+efghi&jk=lm_no&pq=rst-0&uv=wx-8&yz=a&bcd_"
+        "ef=gh.ijk'");
+    LDBCommand* command = ROCKSDB_NAMESPACE::LDBCommand::InitFromCmdLineArgs(
+        args, opts, LDBOptions(), nullptr);
+    const std::map<std::string, std::string> option_map =
+        command->TEST_GetOptionMap();
+    EXPECT_EQ(option_map.at("db"), "/dev/shm/ldbtest/");
+    EXPECT_EQ(option_map.at("from"),
+              "'abcd/efg/hijk/lmn/"
+              "opq:__rst.uvw.xyz?a=3+4+bcd+efghi&jk=lm_no&pq=rst-0&uv=wx-8&yz="
+              "a&bcd_ef=gh.ijk'");
+    delete command;
+  }
+}
+
+TEST_F(LdbCmdTest, ListFileTombstone) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+
+  DB* db = nullptr;
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  WriteOptions wopts;
+  ASSERT_OK(db->Put(wopts, "foo", "1"));
+  ASSERT_OK(db->Put(wopts, "bar", "2"));
+
+  FlushOptions fopts;
+  fopts.wait = true;
+  ASSERT_OK(db->Flush(fopts));
+
+  ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "foo", "foo2"));
+  ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "bar", "foo2"));
+  ASSERT_OK(db->Flush(fopts));
+
+  delete db;
+
+  {
+    char arg1[] = "./ldb";
+    char arg2[1024];
+    snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+    char arg3[] = "list_file_range_deletes";
+    char* argv[] = {arg1, arg2, arg3};
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) {
+          std::string* out_str = reinterpret_cast<std::string*>(arg);
+
+          // Count number of tombstones printed
+          int num_tb = 0;
+          const std::string kFingerprintStr = "start: ";
+          auto offset = out_str->find(kFingerprintStr);
+          while (offset != std::string::npos) {
+            num_tb++;
+            offset =
+                out_str->find(kFingerprintStr, offset + kFingerprintStr.size());
+          }
+          EXPECT_EQ(2, num_tb);
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    ASSERT_EQ(
+        0, LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+
+  // Test the case of limiting tombstones
+  {
+    char arg1[] = "./ldb";
+    char arg2[1024];
+    snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+    char arg3[] = "list_file_range_deletes";
+    char arg4[] = "--max_keys=1";
+    char* argv[] = {arg1, arg2, arg3, arg4};
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) {
+          std::string* out_str = reinterpret_cast<std::string*>(arg);
+
+          // Count number of tombstones printed
+          int num_tb = 0;
+          const std::string kFingerprintStr = "start: ";
+          auto offset = out_str->find(kFingerprintStr);
+          while (offset != std::string::npos) {
+            num_tb++;
+            offset =
+                out_str->find(kFingerprintStr, offset + kFingerprintStr.size());
+          }
+          EXPECT_EQ(1, num_tb);
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    ASSERT_EQ(
+        0, LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(LdbCmdTest, DisableConsistencyChecks) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+
+  {
+    DB* db = nullptr;
+    ASSERT_OK(DB::Open(opts, dbname, &db));
+
+    WriteOptions wopts;
+    FlushOptions fopts;
+    fopts.wait = true;
+
+    ASSERT_OK(db->Put(wopts, "foo1", "1"));
+    ASSERT_OK(db->Put(wopts, "bar1", "2"));
+    ASSERT_OK(db->Flush(fopts));
+
+    ASSERT_OK(db->Put(wopts, "foo2", "3"));
+    ASSERT_OK(db->Put(wopts, "bar2", "4"));
+    ASSERT_OK(db->Flush(fopts));
+
+    delete db;
+  }
+
+  {
+    char arg1[] = "./ldb";
+    char arg2[1024];
+    snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+    char arg3[] = "checkconsistency";
+    char* argv[] = {arg1, arg2, arg3};
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "Version::PrepareAppend:forced_check", [&](void* arg) {
+          bool* forced = reinterpret_cast<bool*>(arg);
+          ASSERT_TRUE(*forced);
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    ASSERT_EQ(
+        0, LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
+
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->DisableProcessing();
+  }
+  {
+    char arg1[] = "./ldb";
+    char arg2[1024];
+    snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+    char arg3[] = "scan";
+    char* argv[] = {arg1, arg2, arg3};
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "Version::PrepareAppend:forced_check", [&](void* arg) {
+          bool* forced = reinterpret_cast<bool*>(arg);
+          ASSERT_TRUE(*forced);
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    ASSERT_EQ(
+        0, LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr));
+
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->DisableProcessing();
+  }
+  {
+    char arg1[] = "./ldb";
+    char arg2[1024];
+    snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+    char arg3[] = "scan";
+    char arg4[] = "--disable_consistency_checks";
+    char* argv[] = {arg1, arg2, arg3, arg4};
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "ColumnFamilyData::ColumnFamilyData", [&](void* arg) {
+          ColumnFamilyOptions* cfo =
+              reinterpret_cast<ColumnFamilyOptions*>(arg);
+          ASSERT_FALSE(cfo->force_consistency_checks);
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    ASSERT_EQ(
+        0, LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(LdbCmdTest, TestBadDbPath) {
+  Env* base_env = TryLoadCustomOrDefaultEnv();
+  std::unique_ptr<Env> env(NewMemEnv(base_env));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+  char arg1[] = "./ldb";
+  char arg2[1024];
+  snprintf(arg2, sizeof(arg2), "--db=%s/.no_such_dir", dbname.c_str());
+  char arg3[1024];
+  snprintf(arg3, sizeof(arg3), "create_column_family");
+  char arg4[] = "bad cf";
+  char* argv[] = {arg1, arg2, arg3, arg4};
+
+  ASSERT_EQ(1,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+  snprintf(arg3, sizeof(arg3), "drop_column_family");
+  ASSERT_EQ(1,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+}
+namespace {
+class WrappedEnv : public EnvWrapper {
+ public:
+  explicit WrappedEnv(Env* t) : EnvWrapper(t) {}
+  static const char* kClassName() { return "WrappedEnv"; }
+  const char* Name() const override { return kClassName(); }
+};
+}  // namespace
+TEST_F(LdbCmdTest, LoadCFOptionsAndOverride) {
+  // Env* base_env = TryLoadCustomOrDefaultEnv();
+  // std::unique_ptr<Env> env(NewMemEnv(base_env));
+  std::unique_ptr<Env> env(new WrappedEnv(Env::Default()));
+  Options opts;
+  opts.env = env.get();
+  opts.create_if_missing = true;
+
+  DB* db = nullptr;
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+  ASSERT_OK(DestroyDB(dbname, opts));
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  ColumnFamilyHandle* cf_handle;
+  ColumnFamilyOptions cf_opts;
+  cf_opts.num_levels = 20;
+  ASSERT_OK(db->CreateColumnFamily(cf_opts, "cf1", &cf_handle));
+
+  delete cf_handle;
+  delete db;
+
+  char arg1[] = "./ldb";
+  char arg2[1024];
+  snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+  char arg3[] = "put";
+  char arg4[] = "key1";
+  char arg5[] = "value1";
+  char arg6[] = "--try_load_options";
+  char arg7[] = "--column_family=cf1";
+  char arg8[] = "--write_buffer_size=268435456";
+  char* argv[] = {arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(8, argv, opts, LDBOptions(), nullptr));
+
+  ConfigOptions config_opts;
+  Options options;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  config_opts.env = env.get();
+  ASSERT_OK(LoadLatestOptions(config_opts, dbname, &options, &column_families));
+  ASSERT_EQ(column_families.size(), 2);
+  ASSERT_EQ(options.num_levels, opts.num_levels);
+  ASSERT_EQ(column_families[1].options.num_levels, cf_opts.num_levels);
+  ASSERT_EQ(column_families[1].options.write_buffer_size, 268435456);
+}
+
+TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
+  Options opts;
+  opts.level0_file_num_compaction_trigger = 10;
+  opts.create_if_missing = true;
+
+  DB* db = nullptr;
+  std::string dbname = test::PerThreadDBPath(Env::Default(), "ldb_cmd_test");
+  ASSERT_OK(DestroyDB(dbname, opts));
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  // Create three SST files
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), std::to_string(i)));
+    ASSERT_OK(db->Flush(FlushOptions()));
+  }
+
+  // Determine which is the "middle" one
+  std::vector<LiveFileMetaData> sst_files;
+  db->GetLiveFilesMetaData(&sst_files);
+
+  std::vector<uint64_t> numbers;
+  for (auto& f : sst_files) {
+    numbers.push_back(f.file_number);
+  }
+  ASSERT_EQ(numbers.size(), 3);
+  std::sort(numbers.begin(), numbers.end());
+  uint64_t to_remove = numbers[1];
+
+  // Close for unsafe_remove_sst_file
+  delete db;
+  db = nullptr;
+
+  char arg1[] = "./ldb";
+  char arg2[1024];
+  snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+  char arg3[] = "unsafe_remove_sst_file";
+  char arg4[20];
+  snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove);
+  char* argv[] = {arg1, arg2, arg3, arg4};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  // Re-open, and verify with Get that middle file is gone
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  std::string val;
+  ASSERT_OK(db->Get(ReadOptions(), "0", &val));
+  ASSERT_EQ(val, "0");
+
+  ASSERT_OK(db->Get(ReadOptions(), "2", &val));
+  ASSERT_EQ(val, "2");
+
+  ASSERT_TRUE(db->Get(ReadOptions(), "1", &val).IsNotFound());
+
+  // Now with extra CF, two more files
+  ColumnFamilyHandle* cf_handle;
+  ColumnFamilyOptions cf_opts;
+  ASSERT_OK(db->CreateColumnFamily(cf_opts, "cf1", &cf_handle));
+  for (size_t i = 3; i < 5; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), cf_handle, std::to_string(i),
+                      std::to_string(i)));
+    ASSERT_OK(db->Flush(FlushOptions(), cf_handle));
+  }
+
+  // Determine which is the "last" one
+  sst_files.clear();
+  db->GetLiveFilesMetaData(&sst_files);
+
+  numbers.clear();
+  for (auto& f : sst_files) {
+    numbers.push_back(f.file_number);
+  }
+  ASSERT_EQ(numbers.size(), 4);
+  std::sort(numbers.begin(), numbers.end());
+  to_remove = numbers.back();
+
+  // Close for unsafe_remove_sst_file
+  delete cf_handle;
+  delete db;
+  db = nullptr;
+
+  snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove);
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  std::vector<ColumnFamilyDescriptor> cfds = {{kDefaultColumnFamilyName, opts},
+                                              {"cf1", cf_opts}};
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(opts, dbname, cfds, &handles, &db));
+
+  ASSERT_OK(db->Get(ReadOptions(), handles[1], "3", &val));
+  ASSERT_EQ(val, "3");
+
+  ASSERT_TRUE(db->Get(ReadOptions(), handles[1], "4", &val).IsNotFound());
+
+  ASSERT_OK(db->Get(ReadOptions(), handles[0], "0", &val));
+  ASSERT_EQ(val, "0");
+
+  // Determine which is the "first" one (most likely to be opened in recovery)
+  sst_files.clear();
+  db->GetLiveFilesMetaData(&sst_files);
+
+  numbers.clear();
+  for (auto& f : sst_files) {
+    numbers.push_back(f.file_number);
+  }
+  ASSERT_EQ(numbers.size(), 3);
+  std::sort(numbers.begin(), numbers.end());
+  to_remove = numbers.front();
+
+  // This time physically delete the file before unsafe_remove
+  {
+    std::string f = dbname + "/" + MakeTableFileName(to_remove);
+    ASSERT_OK(Env::Default()->DeleteFile(f));
+  }
+
+  // Close for unsafe_remove_sst_file
+  for (auto& h : handles) {
+    delete h;
+  }
+  delete db;
+  db = nullptr;
+
+  snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove);
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  ASSERT_OK(DB::Open(opts, dbname, cfds, &handles, &db));
+
+  ASSERT_OK(db->Get(ReadOptions(), handles[1], "3", &val));
+  ASSERT_EQ(val, "3");
+
+  ASSERT_TRUE(db->Get(ReadOptions(), handles[0], "0", &val).IsNotFound());
+
+  for (auto& h : handles) {
+    delete h;
+  }
+  delete db;
+}
+
+TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
+  auto test_fs = std::make_shared<FileTemperatureTestFS>(FileSystem::Default());
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(Env::Default(), test_fs));
+  Options opts;
+  opts.bottommost_temperature = Temperature::kWarm;
+  opts.level0_file_num_compaction_trigger = 10;
+  opts.create_if_missing = true;
+  opts.env = env.get();
+
+  DB* db = nullptr;
+  std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
+  ASSERT_OK(DestroyDB(dbname, opts));
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  std::array<Temperature, 5> kTestTemps = {
+      Temperature::kCold, Temperature::kWarm, Temperature::kHot,
+      Temperature::kWarm, Temperature::kCold};
+  std::map<uint64_t, Temperature> number_to_temp;
+  for (size_t i = 0; i < kTestTemps.size(); ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), std::to_string(i)));
+    ASSERT_OK(db->Flush(FlushOptions()));
+
+    std::map<uint64_t, Temperature> current_temps;
+    test_fs->CopyCurrentSstFileTemperatures(&current_temps);
+    for (auto e : current_temps) {
+      if (e.second == Temperature::kUnknown) {
+        test_fs->OverrideSstFileTemperature(e.first, kTestTemps[i]);
+        number_to_temp[e.first] = kTestTemps[i];
+      }
+    }
+  }
+
+  // Close & reopen
+  delete db;
+  db = nullptr;
+  test_fs->PopRequestedSstFileTemperatures();
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  for (size_t i = 0; i < kTestTemps.size(); ++i) {
+    std::string val;
+    ASSERT_OK(db->Get(ReadOptions(), std::to_string(i), &val));
+    ASSERT_EQ(val, std::to_string(i));
+  }
+
+  // Still all unknown
+  std::vector<std::pair<uint64_t, Temperature>> requests;
+  test_fs->PopRequestedSstFileTemperatures(&requests);
+  ASSERT_EQ(requests.size(), kTestTemps.size());
+  for (auto& r : requests) {
+    ASSERT_EQ(r.second, Temperature::kUnknown);
+  }
+
+  // Close for update_manifest
+  delete db;
+  db = nullptr;
+
+  char arg1[] = "./ldb";
+  char arg2[1024];
+  snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str());
+  char arg3[] = "update_manifest";
+  char arg4[] = "--update_temperatures";
+  char* argv[] = {arg1, arg2, arg3, arg4};
+
+  ASSERT_EQ(0,
+            LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
+
+  // Re-open, get, and verify manifest temps (based on request)
+  test_fs->PopRequestedSstFileTemperatures();
+  ASSERT_OK(DB::Open(opts, dbname, &db));
+
+  for (size_t i = 0; i < kTestTemps.size(); ++i) {
+    std::string val;
+    ASSERT_OK(db->Get(ReadOptions(), std::to_string(i), &val));
+    ASSERT_EQ(val, std::to_string(i));
+  }
+
+  requests.clear();
+  test_fs->PopRequestedSstFileTemperatures(&requests);
+  ASSERT_EQ(requests.size(), kTestTemps.size());
+  for (auto& r : requests) {
+    ASSERT_EQ(r.second, number_to_temp[r.first]);
+  }
+  delete db;
+}
+
+TEST_F(LdbCmdTest, RenameDbAndLoadOptions) {
+  Env* env = TryLoadCustomOrDefaultEnv();
+  Options opts;
+  opts.env = env;
+  opts.create_if_missing = false;
+
+  std::string old_dbname = test::PerThreadDBPath(env, "ldb_cmd_test");
+  std::string new_dbname = old_dbname + "_2";
+  ASSERT_OK(DestroyDB(old_dbname, opts));
+  ASSERT_OK(DestroyDB(new_dbname, opts));
+
+  char old_arg[1024];
+  snprintf(old_arg, sizeof(old_arg), "--db=%s", old_dbname.c_str());
+  char new_arg[1024];
+  snprintf(new_arg, sizeof(old_arg), "--db=%s", new_dbname.c_str());
+  const char* argv1[] = {"./ldb",
+                         old_arg,
+                         "put",
+                         "key1",
+                         "value1",
+                         "--try_load_options",
+                         "--create_if_missing"};
+
+  const char* argv2[] = {"./ldb", old_arg, "get", "key1", "--try_load_options"};
+  const char* argv3[] = {"./ldb", new_arg,  "put",
+                         "key2",  "value2", "--try_load_options"};
+
+  const char* argv4[] = {"./ldb", new_arg, "get", "key1", "--try_load_options"};
+  const char* argv5[] = {"./ldb", new_arg, "get", "key2", "--try_load_options"};
+
+  ASSERT_EQ(
+      0, LDBCommandRunner::RunCommand(7, argv1, opts, LDBOptions(), nullptr));
+  ASSERT_EQ(
+      0, LDBCommandRunner::RunCommand(5, argv2, opts, LDBOptions(), nullptr));
+  ConfigOptions config_opts;
+  Options options;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  config_opts.env = env;
+  ASSERT_OK(
+      LoadLatestOptions(config_opts, old_dbname, &options, &column_families));
+  ASSERT_EQ(options.wal_dir, "");
+
+  ASSERT_OK(env->RenameFile(old_dbname, new_dbname));
+  ASSERT_NE(
+      0, LDBCommandRunner::RunCommand(6, argv1, opts, LDBOptions(), nullptr));
+  ASSERT_NE(
+      0, LDBCommandRunner::RunCommand(5, argv2, opts, LDBOptions(), nullptr));
+  ASSERT_EQ(
+      0, LDBCommandRunner::RunCommand(6, argv3, opts, LDBOptions(), nullptr));
+  ASSERT_EQ(
+      0, LDBCommandRunner::RunCommand(5, argv4, opts, LDBOptions(), nullptr));
+  ASSERT_EQ(
+      0, LDBCommandRunner::RunCommand(5, argv5, opts, LDBOptions(), nullptr));
+  ASSERT_OK(DestroyDB(new_dbname, opts));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as LDBCommand is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/ldb_test.py b/src/rocksdb/tools/ldb_test.py
new file mode 100644
index 000000000..e243d69c0
--- /dev/null
+++ b/src/rocksdb/tools/ldb_test.py
@@ -0,0 +1,955 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import glob
+
+import os
+import os.path
+import re
+import shutil
+import subprocess
+import tempfile
+import time
+import unittest
+
+
+def my_check_output(*popenargs, **kwargs):
+    """
+    If we had python 2.7, we should simply use subprocess.check_output.
+    This is a stop-gap solution for python 2.6
+    """
+    if "stdout" in kwargs:
+        raise ValueError("stdout argument not allowed, it will be overridden.")
+    process = subprocess.Popen(
+        stderr=subprocess.PIPE, stdout=subprocess.PIPE, *popenargs, **kwargs
+    )
+    output, unused_err = process.communicate()
+    retcode = process.poll()
+    if retcode:
+        cmd = kwargs.get("args")
+        if cmd is None:
+            cmd = popenargs[0]
+        raise Exception("Exit code is not 0.  It is %d.  Command: %s" % (retcode, cmd))
+    return output.decode("utf-8")
+
+
+def run_err_null(cmd):
+    return os.system(cmd + " 2>/dev/null ")
+
+
+class LDBTestCase(unittest.TestCase):
+    def setUp(self):
+        self.TMP_DIR = tempfile.mkdtemp(prefix="ldb_test_")
+        self.DB_NAME = "testdb"
+
+    def tearDown(self):
+        assert (
+            self.TMP_DIR.strip() != "/"
+            and self.TMP_DIR.strip() != "/tmp"
+            and self.TMP_DIR.strip() != "/tmp/"
+        )  # Just some paranoia
+
+        shutil.rmtree(self.TMP_DIR)
+
+    def dbParam(self, dbName):
+        return "--db=%s" % os.path.join(self.TMP_DIR, dbName)
+
+    def assertRunOKFull(
+        self, params, expectedOutput, unexpected=False, isPattern=False
+    ):
+        """
+        All command-line params must be specified.
+        Allows full flexibility in testing; for example: missing db param.
+        """
+        output = my_check_output(
+            './ldb %s |grep -v "Created bg thread"' % params, shell=True
+        )
+        if not unexpected:
+            if isPattern:
+                self.assertNotEqual(expectedOutput.search(output.strip()), None)
+            else:
+                self.assertEqual(output.strip(), expectedOutput.strip())
+        else:
+            if isPattern:
+                self.assertEqual(expectedOutput.search(output.strip()), None)
+            else:
+                self.assertNotEqual(output.strip(), expectedOutput.strip())
+
+    def assertRunFAILFull(self, params):
+        """
+        All command-line params must be specified.
+        Allows full flexibility in testing; for example: missing db param.
+        """
+        try:
+
+            my_check_output(
+                './ldb %s >/dev/null 2>&1 |grep -v "Created bg \
+                thread"'
+                % params,
+                shell=True,
+            )
+        except Exception:
+            return
+        self.fail(
+            "Exception should have been raised for command with params: %s" % params
+        )
+
+    def assertRunOK(self, params, expectedOutput, unexpected=False):
+        """
+        Uses the default test db.
+        """
+        self.assertRunOKFull(
+            "%s %s" % (self.dbParam(self.DB_NAME), params), expectedOutput, unexpected
+        )
+
+    def assertRunFAIL(self, params):
+        """
+        Uses the default test db.
+        """
+        self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params))
+
+    def testSimpleStringPutGet(self):
+        print("Running testSimpleStringPutGet...")
+        self.assertRunFAIL("put x1 y1")
+        self.assertRunOK("put --create_if_missing x1 y1", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunFAIL("get x2")
+
+        self.assertRunOK("put x2 y2", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("get x2", "y2")
+        self.assertRunFAIL("get x3")
+
+        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2")
+        self.assertRunOK("put x3 y3", "OK")
+
+        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3")
+
+        self.assertRunOK("scan --to=x2", "x1 : y1")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=2", "x1 : y1\nx2 : y2")
+
+        self.assertRunOK(
+            "scan --from=x1 --to=z --max_keys=3", "x1 : y1\nx2 : y2\nx3 : y3"
+        )
+        self.assertRunOK(
+            "scan --from=x1 --to=z --max_keys=4", "x1 : y1\nx2 : y2\nx3 : y3"
+        )
+        self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1")
+        self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3")
+        self.assertRunFAIL("scan --from=x4 --to=z")  # No results => FAIL
+        self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo")
+
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+
+        self.assertRunOK("delete x1", "OK")
+        self.assertRunOK("scan", "x2 : y2\nx3 : y3")
+
+        self.assertRunOK("delete NonExistentKey", "OK")
+        # It is weird that GET and SCAN raise exception for
+        # non-existent key, while delete does not
+
+        self.assertRunOK("checkconsistency", "OK")
+
+    def dumpDb(self, params, dumpFile):
+        return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile))
+
+    def loadDb(self, params, dumpFile):
+        return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params))
+
+    def writeExternSst(self, params, inputDumpFile, outputSst):
+        return 0 == run_err_null(
+            "cat %s | ./ldb write_extern_sst %s %s" % (inputDumpFile, outputSst, params)
+        )
+
+    def ingestExternSst(self, params, inputSst):
+        return 0 == run_err_null("./ldb ingest_extern_sst %s %s" % (inputSst, params))
+
+    def testStringBatchPut(self):
+        print("Running testStringBatchPut...")
+        self.assertRunOK("batchput x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("scan", "x1 : y1")
+        self.assertRunOK('batchput x2 y2 x3 y3 "x4 abc" "y4 xyz"', "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz")
+        self.assertRunFAIL("batchput")
+        self.assertRunFAIL("batchput k1")
+        self.assertRunFAIL("batchput k1 v1 k2")
+
+    def testBlobBatchPut(self):
+        print("Running testBlobBatchPut...")
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("batchput x1 y1 --create_if_missing --enable_blob_files", "OK")
+        self.assertRunOK("scan", "x1 : y1")
+        self.assertRunOK(
+            'batchput --enable_blob_files x2 y2 x3 y3 "x4 abc" "y4 xyz"', "OK"
+        )
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz")
+
+        blob_files = self.getBlobFiles(dbPath)
+        self.assertTrue(len(blob_files) >= 1)
+
+    def testBlobPut(self):
+        print("Running testBlobPut...")
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put --create_if_missing --enable_blob_files x1 y1", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("put --enable_blob_files x2 y2", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("get x2", "y2")
+        self.assertRunFAIL("get x3")
+
+        blob_files = self.getBlobFiles(dbPath)
+        self.assertTrue(len(blob_files) >= 1)
+
+    def testBlobStartingLevel(self):
+        print("Running testBlobStartingLevel...")
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK(
+            "put --create_if_missing --enable_blob_files --blob_file_starting_level=10 x1 y1",
+            "OK",
+        )
+        self.assertRunOK("get x1", "y1")
+
+        blob_files = self.getBlobFiles(dbPath)
+        self.assertTrue(len(blob_files) == 0)
+
+        self.assertRunOK(
+            "put --enable_blob_files --blob_file_starting_level=0 x2 y2", "OK"
+        )
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("get x2", "y2")
+        self.assertRunFAIL("get x3")
+
+        blob_files = self.getBlobFiles(dbPath)
+        self.assertTrue(len(blob_files) >= 1)
+
+    def testCountDelimDump(self):
+        print("Running testCountDelimDump...")
+        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+        self.assertRunOK(
+            "dump --count_delim",
+            "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8",
+        )
+        self.assertRunOK(
+            'dump --count_delim="."',
+            "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8",
+        )
+        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+        self.assertRunOK(
+            'dump --count_delim=","',
+            "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8",
+        )
+
+    def testCountDelimIDump(self):
+        print("Running testCountDelimIDump...")
+        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+        self.assertRunOK(
+            "idump --count_delim",
+            "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8",
+        )
+        self.assertRunOK(
+            'idump --count_delim="."',
+            "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8",
+        )
+        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+        self.assertRunOK(
+            'idump --count_delim=","',
+            "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8",
+        )
+
+    def testInvalidCmdLines(self):
+        print("Running testInvalidCmdLines...")
+        # db not specified
+        self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
+        # No param called he
+        self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
+        # max_keys is not applicable for put
+        self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
+        # hex has invalid boolean value
+
+    def testHexPutGet(self):
+        print("Running testHexPutGet...")
+        self.assertRunOK("put a1 b1 --create_if_missing", "OK")
+        self.assertRunOK("scan", "a1 : b1")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231")
+        self.assertRunFAIL("put --hex 6132 6232")
+        self.assertRunOK("put --hex 0x6132 0x6232", "OK")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2")
+        self.assertRunOK("get a1", "b1")
+        self.assertRunOK("get --hex 0x6131", "0x6231")
+        self.assertRunOK("get a2", "b2")
+        self.assertRunOK("get --hex 0x6132", "0x6232")
+        self.assertRunOK("get --key_hex 0x6132", "b2")
+        self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232")
+        self.assertRunOK("get --value_hex a2", "0x6232")
+        self.assertRunOK(
+            "scan --key_hex --value_hex", "0x6131 : 0x6231\n0x6132 : 0x6232"
+        )
+        self.assertRunOK(
+            "scan --hex --from=0x6131 --to=0x6133", "0x6131 : 0x6231\n0x6132 : 0x6232"
+        )
+        self.assertRunOK("scan --hex --from=0x6131 --to=0x6132", "0x6131 : 0x6231")
+        self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2")
+        self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232")
+        self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4")
+        self.assertRunOK("delete --hex 0x6133", "OK")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4")
+        self.assertRunOK("checkconsistency", "OK")
+
+    def testTtlPutGet(self):
+        print("Running testTtlPutGet...")
+        self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231", True)
+        self.assertRunOK("dump --ttl ", "a1 ==> b1", True)
+        self.assertRunOK("dump --hex --ttl ", "0x6131 ==> 0x6231\nKeys in range: 1")
+        self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231")
+        self.assertRunOK("get --value_hex a1", "0x6231", True)
+        self.assertRunOK("get --ttl a1", "b1")
+        self.assertRunOK("put a3 b3 --create_if_missing", "OK")
+        # fails because timstamp's length is greater than value's
+        self.assertRunFAIL("get --ttl a3")
+        self.assertRunOK("checkconsistency", "OK")
+
+    def testInvalidCmdLines(self):  # noqa: F811 T25377293 Grandfathered in
+        print("Running testInvalidCmdLines...")
+        # db not specified
+        self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
+        # No param called he
+        self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
+        # max_keys is not applicable for put
+        self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
+        # hex has invalid boolean value
+        self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing")
+
+    def testDumpLoad(self):
+        print("Running testDumpLoad...")
+        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+
+        # Dump and load without any additional params specified
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1")
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(
+            self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)
+        )
+        self.assertRunOKFull(
+            "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4"
+        )
+
+        # Dump and load in hex
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump2")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2")
+        self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath))
+        self.assertTrue(
+            self.loadDb(
+                "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath
+            )
+        )
+        self.assertRunOKFull(
+            "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4"
+        )
+
+        # Dump only a portion of the key range
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump3")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3")
+        self.assertTrue(
+            self.dumpDb("--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath)
+        )
+        self.assertTrue(
+            self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)
+        )
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2")
+
+        # Dump upto max_keys rows
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump4")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4")
+        self.assertTrue(self.dumpDb("--db=%s --max_keys=3" % origDbPath, dumpFilePath))
+        self.assertTrue(
+            self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)
+        )
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3")
+
+        # Load into an existing db, create_if_missing is not specified
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull(
+            "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4"
+        )
+
+        # Dump and load with WAL disabled
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump5")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5")
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(
+            self.loadDb(
+                "--db=%s --disable_wal --create_if_missing" % loadedDbPath, dumpFilePath
+            )
+        )
+        self.assertRunOKFull(
+            "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4"
+        )
+
+        # Dump and load with lots of extra params specified
+        extraParams = " ".join(
+            [
+                "--bloom_bits=14",
+                "--block_size=1024",
+                "--auto_compaction=true",
+                "--write_buffer_size=4194304",
+                "--file_size=2097152",
+            ]
+        )
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump6")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6")
+        self.assertTrue(
+            self.dumpDb("--db=%s %s" % (origDbPath, extraParams), dumpFilePath)
+        )
+        self.assertTrue(
+            self.loadDb(
+                "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams),
+                dumpFilePath,
+            )
+        )
+        self.assertRunOKFull(
+            "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4"
+        )
+
+        # Dump with count_only
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump7")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7")
+        self.assertTrue(self.dumpDb("--db=%s --count_only" % origDbPath, dumpFilePath))
+        self.assertTrue(
+            self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath)
+        )
+        # DB should have atleast one value for scan to work
+        self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK")
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1")
+
+        # Dump command fails because of typo in params
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump8")
+        self.assertFalse(
+            self.dumpDb("--db=%s --create_if_missing" % origDbPath, dumpFilePath)
+        )
+
+        # Dump and load with BlobDB enabled
+        blobParams = " ".join(
+            ["--enable_blob_files", "--min_blob_size=1", "--blob_file_size=2097152"]
+        )
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump9")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump9")
+        self.assertTrue(self.dumpDb("--db=%s" % (origDbPath), dumpFilePath))
+        self.assertTrue(
+            self.loadDb(
+                "--db=%s %s --create_if_missing --disable_wal"
+                % (loadedDbPath, blobParams),
+                dumpFilePath,
+            )
+        )
+        self.assertRunOKFull(
+            "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4"
+        )
+        blob_files = self.getBlobFiles(loadedDbPath)
+        self.assertTrue(len(blob_files) >= 1)
+
+    def testIDumpBasics(self):
+        print("Running testIDumpBasics...")
+        self.assertRunOK("put a val --create_if_missing", "OK")
+        self.assertRunOK("put b val", "OK")
+        self.assertRunOK(
+            "idump",
+            "'a' seq:1, type:1 => val\n"
+            "'b' seq:2, type:1 => val\nInternal keys in range: 2",
+        )
+        self.assertRunOK(
+            "idump --input_key_hex --from=%s --to=%s" % (hex(ord("a")), hex(ord("b"))),
+            "'a' seq:1, type:1 => val\nInternal keys in range: 1",
+        )
+
+    def testIDumpDecodeBlobIndex(self):
+        print("Running testIDumpDecodeBlobIndex...")
+        self.assertRunOK("put a val --create_if_missing", "OK")
+        self.assertRunOK("put b val --enable_blob_files", "OK")
+
+        # Pattern to expect from dump with decode_blob_index flag enabled.
+        regex = ".*\[blob ref\].*"
+        expected_pattern = re.compile(regex)
+        cmd = "idump %s --decode_blob_index"
+        self.assertRunOKFull(
+            (cmd) % (self.dbParam(self.DB_NAME)),
+            expected_pattern,
+            unexpected=False,
+            isPattern=True,
+        )
+
+    def testMiscAdminTask(self):
+        print("Running testMiscAdminTask...")
+        # These tests need to be improved; for example with asserts about
+        # whether compaction or level reduction actually took place.
+        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+
+        self.assertTrue(0 == run_err_null("./ldb compact --db=%s" % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(
+            0 == run_err_null("./ldb reduce_levels --db=%s --new_levels=2" % origDbPath)
+        )
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(
+            0 == run_err_null("./ldb reduce_levels --db=%s --new_levels=3" % origDbPath)
+        )
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(
+            0 == run_err_null("./ldb compact --db=%s --from=x1 --to=x3" % origDbPath)
+        )
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(
+            0
+            == run_err_null(
+                "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134" % origDbPath
+            )
+        )
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # TODO(dilip): Not sure what should be passed to WAL.Currently corrupted.
+        self.assertTrue(
+            0
+            == run_err_null(
+                "./ldb dump_wal --db=%s --walfile=%s --header"
+                % (origDbPath, os.path.join(origDbPath, "LOG"))
+            )
+        )
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+    def testCheckConsistency(self):
+        print("Running testCheckConsistency...")
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("put x2 y2", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("checkconsistency", "OK")
+
+        sstFilePath = my_check_output(
+            "ls %s" % os.path.join(dbPath, "*.sst"), shell=True
+        )
+
+        # Modify the file
+        my_check_output("echo 'evil' > %s" % sstFilePath, shell=True)
+        self.assertRunFAIL("checkconsistency")
+
+        # Delete the file
+        my_check_output("rm -f %s" % sstFilePath, shell=True)
+        self.assertRunFAIL("checkconsistency")
+
+    def dumpLiveFiles(self, params, dumpFile):
+        return 0 == run_err_null("./ldb dump_live_files %s > %s" % (params, dumpFile))
+
+    def testDumpLiveFiles(self):
+        print("Running testDumpLiveFiles...")
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("put x2 y2 --enable_blob_files", "OK")
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+        self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath))
+        self.assertRunOK("delete x1", "OK")
+        self.assertRunOK("put x3 y3", "OK")
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump2")
+
+        # Test that if the user provides a db path that ends with
+        # a slash '/', there is no double (or more!) slashes in the
+        # SST and manifest file names.
+
+        # Add a '/' at the end of dbPath (which normally shouldnt contain any)
+        if dbPath[-1] != "/":
+            dbPath += "/"
+
+        # Call the dump_live_files function with the edited dbPath name.
+        self.assertTrue(
+            self.dumpLiveFiles(
+                "--db=%s --decode_blob_index --dump_uncompressed_blobs" % dbPath,
+                dumpFilePath,
+            )
+        )
+
+        # Investigate the output
+        with open(dumpFilePath, "r") as tmp:
+            data = tmp.read()
+
+        # Check that all the SST filenames have a correct full path (no multiple '/').
+        sstFileList = re.findall(r"%s.*\d+.sst" % dbPath, data)
+        self.assertTrue(len(sstFileList) >= 1)
+        for sstFilename in sstFileList:
+            filenumber = re.findall(r"\d+.sst", sstFilename)[0]
+            self.assertEqual(sstFilename, dbPath + filenumber)
+
+        # Check that all the Blob filenames have a correct full path (no multiple '/').
+        blobFileList = re.findall(r"%s.*\d+.blob" % dbPath, data)
+        self.assertTrue(len(blobFileList) >= 1)
+        for blobFilename in blobFileList:
+            filenumber = re.findall(r"\d+.blob", blobFilename)[0]
+            self.assertEqual(blobFilename, dbPath + filenumber)
+
+        # Check that all the manifest filenames
+        # have a correct full path (no multiple '/').
+        manifestFileList = re.findall(r"%s.*MANIFEST-\d+" % dbPath, data)
+        self.assertTrue(len(manifestFileList) >= 1)
+        for manifestFilename in manifestFileList:
+            filenumber = re.findall(r"(?<=MANIFEST-)\d+", manifestFilename)[0]
+            self.assertEqual(manifestFilename, dbPath + "MANIFEST-" + filenumber)
+
+        # Check that the blob file index is decoded.
+        decodedBlobIndex = re.findall(r"\[blob ref\]", data)
+        self.assertTrue(len(decodedBlobIndex) >= 1)
+
+    def listLiveFilesMetadata(self, params, dumpFile):
+        return 0 == run_err_null(
+            "./ldb list_live_files_metadata %s > %s" % (params, dumpFile)
+        )
+
+    def testListLiveFilesMetadata(self):
+        print("Running testListLiveFilesMetadata...")
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("put x2 y2", "OK")
+
+        # Compare the SST filename and the level of list_live_files_metadata
+        # with the data collected from dump_live_files.
+        dumpFilePath1 = os.path.join(self.TMP_DIR, "dump1")
+        self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath1))
+        dumpFilePath2 = os.path.join(self.TMP_DIR, "dump2")
+        self.assertTrue(
+            self.listLiveFilesMetadata(
+                "--sort_by_filename --db=%s" % dbPath, dumpFilePath2
+            )
+        )
+
+        # Collect SST filename and level from dump_live_files
+        with open(dumpFilePath1, "r") as tmp:
+            data = tmp.read()
+            filename1 = re.findall(r".*\d+\.sst", data)[0]
+            level1 = re.findall(r"level:\d+", data)[0].split(":")[1]
+
+        # Collect SST filename and level from list_live_files_metadata
+        with open(dumpFilePath2, "r") as tmp:
+            data = tmp.read()
+            filename2 = re.findall(r".*\d+\.sst", data)[0]
+            level2 = re.findall(r"level \d+", data)[0].split(" ")[1]
+
+        # Assert equality between filenames and levels.
+        self.assertEqual(filename1, filename2)
+        self.assertEqual(level1, level2)
+
+        # Create multiple column families and compare the output
+        # of list_live_files_metadata with dump_live_files once again.
+        # Create new CF, and insert data:
+        self.assertRunOK("create_column_family mycol1", "OK")
+        self.assertRunOK("put --column_family=mycol1 v1 v2", "OK")
+        self.assertRunOK("create_column_family mycol2", "OK")
+        self.assertRunOK("put --column_family=mycol2 h1 h2", "OK")
+        self.assertRunOK("put --column_family=mycol2 h3 h4", "OK")
+
+        # Call dump_live_files and list_live_files_metadata
+        # and pipe the output to compare them later.
+        dumpFilePath3 = os.path.join(self.TMP_DIR, "dump3")
+        self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath3))
+        dumpFilePath4 = os.path.join(self.TMP_DIR, "dump4")
+        self.assertTrue(
+            self.listLiveFilesMetadata(
+                "--sort_by_filename --db=%s" % dbPath, dumpFilePath4
+            )
+        )
+
+        # dump_live_files:
+        # parse the output and create a map:
+        # [key: sstFilename]->[value:[LSM level, Column Family Name]]
+        referenceMap = {}
+        with open(dumpFilePath3, "r") as tmp:
+            data = tmp.read()
+            # Note: the following regex are contingent on what the
+            # dump_live_files outputs.
+            namesAndLevels = re.findall(r"\d+.sst level:\d+", data)
+            cfs = re.findall(r"(?<=column family name=)\w+", data)
+            # re.findall should not reorder the data.
+            # Therefore namesAndLevels[i] matches the data from cfs[i].
+            for count, nameAndLevel in enumerate(namesAndLevels):
+                sstFilename = re.findall(r"\d+.sst", nameAndLevel)[0]
+                sstLevel = re.findall(r"(?<=level:)\d+", nameAndLevel)[0]
+                cf = cfs[count]
+                referenceMap[sstFilename] = [sstLevel, cf]
+
+        # list_live_files_metadata:
+        # parse the output and create a map:
+        # [key: sstFilename]->[value:[LSM level, Column Family Name]]
+        testMap = {}
+        with open(dumpFilePath4, "r") as tmp:
+            data = tmp.read()
+            # Since for each SST file, all the information is contained
+            # on one line, the parsing is easy to perform and relies on
+            # the appearance of an "00xxx.sst" pattern.
+            sstLines = re.findall(r".*\d+.sst.*", data)
+            for line in sstLines:
+                sstFilename = re.findall(r"\d+.sst", line)[0]
+                sstLevel = re.findall(r"(?<=level )\d+", line)[0]
+                cf = re.findall(r"(?<=column family \')\w+(?=\')", line)[0]
+                testMap[sstFilename] = [sstLevel, cf]
+
+        # Compare the map obtained from dump_live_files and the map
+        # obtained from list_live_files_metadata. Everything should match.
+        self.assertEqual(referenceMap, testMap)
+
+    def getManifests(self, directory):
+        return glob.glob(directory + "/MANIFEST-*")
+
+    def getSSTFiles(self, directory):
+        return glob.glob(directory + "/*.sst")
+
+    def getWALFiles(self, directory):
+        return glob.glob(directory + "/*.log")
+
+    def getBlobFiles(self, directory):
+        return glob.glob(directory + "/*.blob")
+
+    def copyManifests(self, src, dest):
+        return 0 == run_err_null("cp " + src + " " + dest)
+
+    def testManifestDump(self):
+        print("Running testManifestDump...")
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put 1 1 --create_if_missing", "OK")
+        self.assertRunOK("put 2 2", "OK")
+        self.assertRunOK("put 3 3", "OK")
+        # Pattern to expect from manifest_dump.
+        num = "[0-9]+"
+        st = ".*"
+        subpat = st + " seq:" + num + ", type:" + num
+        regex = num + ":" + num + "\[" + subpat + ".." + subpat + "\]"
+        expected_pattern = re.compile(regex)
+        cmd = "manifest_dump --db=%s"
+        manifest_files = self.getManifests(dbPath)
+        self.assertTrue(len(manifest_files) == 1)
+        # Test with the default manifest file in dbPath.
+        self.assertRunOKFull(
+            cmd % dbPath, expected_pattern, unexpected=False, isPattern=True
+        )
+        self.copyManifests(manifest_files[0], manifest_files[0] + "1")
+        manifest_files = self.getManifests(dbPath)
+        self.assertTrue(len(manifest_files) == 2)
+        # Test with multiple manifest files in dbPath.
+        self.assertRunFAILFull(cmd % dbPath)
+        # Running it with the copy we just created should pass.
+        self.assertRunOKFull(
+            (cmd + " --path=%s") % (dbPath, manifest_files[1]),
+            expected_pattern,
+            unexpected=False,
+            isPattern=True,
+        )
+        # Make sure that using the dump with --path will result in identical
+        # output as just using manifest_dump.
+        cmd = "dump --path=%s"
+        self.assertRunOKFull(
+            (cmd) % (manifest_files[1]),
+            expected_pattern,
+            unexpected=False,
+            isPattern=True,
+        )
+
+        # Check if null characters doesn't infer with output format.
+        self.assertRunOK("put a1 b1", "OK")
+        self.assertRunOK("put a2 b2", "OK")
+        self.assertRunOK("put --hex 0x12000DA0 0x80C0000B", "OK")
+        self.assertRunOK("put --hex 0x7200004f 0x80000004", "OK")
+        self.assertRunOK("put --hex 0xa000000a 0xf000000f", "OK")
+        self.assertRunOK("put a3 b3", "OK")
+        self.assertRunOK("put a4 b4", "OK")
+
+        # Verifies that all "levels" are printed out.
+        # There should be 66 mentions of levels.
+        expected_verbose_output = re.compile("matched")
+        # Test manifest_dump verbose and verify that key 0x7200004f
+        # is present. Note that we are forced to use grep here because
+        # an output with a non-terminating null character in it isn't piped
+        # correctly through the Python subprocess object.
+        # Also note that 0x72=r and 0x4f=O, hence the regex \'r.{2}O\'
+        # (we cannot use null character in the subprocess input either,
+        # so we have to use '.{2}')
+        cmd_verbose = (
+            "manifest_dump --verbose --db=%s | grep -aq $''r.{2}O'' && echo 'matched' || echo 'not matched'"
+            % dbPath
+        )
+
+        self.assertRunOKFull(
+            cmd_verbose, expected_verbose_output, unexpected=False, isPattern=True
+        )
+
+    def testGetProperty(self):
+        print("Running testGetProperty...")
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put 1 1 --create_if_missing", "OK")
+        self.assertRunOK("put 2 2", "OK")
+        # A "string" property
+        cmd = "--db=%s get_property rocksdb.estimate-num-keys"
+        self.assertRunOKFull(cmd % dbPath, "rocksdb.estimate-num-keys: 2")
+        # A "map" property
+        # FIXME: why doesn't this pick up two entries?
+        cmd = "--db=%s get_property rocksdb.aggregated-table-properties"
+        part = "rocksdb.aggregated-table-properties.num_entries: "
+        expected_pattern = re.compile(part)
+        self.assertRunOKFull(
+            cmd % dbPath, expected_pattern, unexpected=False, isPattern=True
+        )
+        # An invalid property
+        cmd = "--db=%s get_property rocksdb.this-property-does-not-exist"
+        self.assertRunFAILFull(cmd % dbPath)
+
+    def testSSTDump(self):
+        print("Running testSSTDump...")
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put sst1 sst1_val --create_if_missing", "OK")
+        self.assertRunOK("put sst2 sst2_val --enable_blob_files", "OK")
+        self.assertRunOK("get sst1", "sst1_val")
+
+        # Pattern to expect from SST dump.
+        regex = ".*Sst file format:.*\n.*\[blob ref\].*"
+        expected_pattern = re.compile(regex)
+
+        sst_files = self.getSSTFiles(dbPath)
+        self.assertTrue(len(sst_files) >= 1)
+        cmd = "dump --path=%s --decode_blob_index"
+        self.assertRunOKFull(
+            (cmd) % (sst_files[0]), expected_pattern, unexpected=False, isPattern=True
+        )
+
+    def testBlobDump(self):
+        print("Running testBlobDump")
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("batchput x1 y1 --create_if_missing --enable_blob_files", "OK")
+        self.assertRunOK(
+            'batchput --enable_blob_files x2 y2 x3 y3 "x4 abc" "y4 xyz"', "OK"
+        )
+
+        # Pattern to expect from blob file dump.
+        regex = ".*Blob log header[\s\S]*Blob log footer[\s\S]*Read record[\s\S]*Summary"  # noqa
+        expected_pattern = re.compile(regex)
+        blob_files = self.getBlobFiles(dbPath)
+        self.assertTrue(len(blob_files) >= 1)
+        cmd = "dump --path=%s --dump_uncompressed_blobs"
+        self.assertRunOKFull(
+            (cmd) % (blob_files[0]), expected_pattern, unexpected=False, isPattern=True
+        )
+
+    def testWALDump(self):
+        print("Running testWALDump...")
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put wal1 wal1_val --create_if_missing", "OK")
+        self.assertRunOK("put wal2 wal2_val", "OK")
+        self.assertRunOK("get wal1", "wal1_val")
+
+        # Pattern to expect from WAL dump.
+        regex = "^Sequence,Count,ByteSize,Physical Offset,Key\(s\).*"
+        expected_pattern = re.compile(regex)
+
+        wal_files = self.getWALFiles(dbPath)
+        self.assertTrue(len(wal_files) >= 1)
+        cmd = "dump --path=%s"
+        self.assertRunOKFull(
+            (cmd) % (wal_files[0]), expected_pattern, unexpected=False, isPattern=True
+        )
+
+    def testListColumnFamilies(self):
+        print("Running testListColumnFamilies...")
+        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+        cmd = 'list_column_families | grep -v "Column families"'
+        # Test on valid dbPath.
+        self.assertRunOK(cmd, "{default}")
+        # Test on empty path.
+        self.assertRunFAIL(cmd)
+
+    def testColumnFamilies(self):
+        print("Running testColumnFamilies...")
+        _ = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put cf1_1 1 --create_if_missing", "OK")
+        self.assertRunOK("put cf1_2 2 --create_if_missing", "OK")
+        self.assertRunOK("put cf1_3 3 --try_load_options", "OK")
+        # Given non-default column family to single CF DB.
+        self.assertRunFAIL("get cf1_1 --column_family=two")
+        self.assertRunOK("create_column_family two", "OK")
+        self.assertRunOK("put cf2_1 1 --create_if_missing --column_family=two", "OK")
+        self.assertRunOK("put cf2_2 2 --create_if_missing --column_family=two", "OK")
+        self.assertRunOK("delete cf1_2", "OK")
+        self.assertRunOK("create_column_family three", "OK")
+        self.assertRunOK("delete cf2_2 --column_family=two", "OK")
+        self.assertRunOK("put cf3_1 3 --create_if_missing --column_family=three", "OK")
+        self.assertRunOK("get cf1_1 --column_family=default", "1")
+        self.assertRunOK("dump --column_family=two", "cf2_1 ==> 1\nKeys in range: 1")
+        self.assertRunOK(
+            "dump --column_family=two --try_load_options",
+            "cf2_1 ==> 1\nKeys in range: 1",
+        )
+        self.assertRunOK("dump", "cf1_1 ==> 1\ncf1_3 ==> 3\nKeys in range: 2")
+        self.assertRunOK("get cf2_1 --column_family=two", "1")
+        self.assertRunOK("get cf3_1 --column_family=three", "3")
+        self.assertRunOK("drop_column_family three", "OK")
+        # non-existing column family.
+        self.assertRunFAIL("get cf3_1 --column_family=four")
+        self.assertRunFAIL("drop_column_family four")
+
+    def testIngestExternalSst(self):
+        print("Running testIngestExternalSst...")
+
+        # Dump, load, write external sst and ingest it in another db
+        dbPath = os.path.join(self.TMP_DIR, "db1")
+        self.assertRunOK(
+            "batchput --db=%s --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4" % dbPath,
+            "OK",
+        )
+        self.assertRunOK("scan --db=%s" % dbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+        with open(dumpFilePath, "w") as f:
+            f.write("x1 ==> y10\nx2 ==> y20\nx3 ==> y30\nx4 ==> y40")
+        externSstPath = os.path.join(self.TMP_DIR, "extern_data1.sst")
+        self.assertTrue(
+            self.writeExternSst(
+                "--create_if_missing --db=%s" % dbPath, dumpFilePath, externSstPath
+            )
+        )
+        # cannot ingest if allow_global_seqno is false
+        self.assertFalse(
+            self.ingestExternSst(
+                "--create_if_missing --allow_global_seqno=false --db=%s" % dbPath,
+                externSstPath,
+            )
+        )
+        self.assertTrue(
+            self.ingestExternSst(
+                "--create_if_missing --allow_global_seqno --db=%s" % dbPath,
+                externSstPath,
+            )
+        )
+        self.assertRunOKFull(
+            "scan --db=%s" % dbPath, "x1 : y10\nx2 : y20\nx3 : y30\nx4 : y40"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/rocksdb/tools/ldb_tool.cc b/src/rocksdb/tools/ldb_tool.cc
new file mode 100644
index 000000000..eadb6a095
--- /dev/null
+++ b/src/rocksdb/tools/ldb_tool.cc
@@ -0,0 +1,184 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#include "rocksdb/ldb_tool.h"
+
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "tools/ldb_cmd_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+LDBOptions::LDBOptions() {}
+
+void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options,
+                                 const char* /*exec_name*/, bool to_stderr) {
+  std::string ret;
+
+  ret.append(ldb_options.print_help_header);
+  ret.append("\n\n");
+  ret.append("commands MUST specify --" + LDBCommand::ARG_DB +
+             "=<full_path_to_db_directory> when necessary\n");
+  ret.append("\n");
+  ret.append("commands can optionally specify\n");
+  ret.append("  --" + LDBCommand::ARG_ENV_URI + "=<uri_of_environment> or --" +
+             LDBCommand::ARG_FS_URI + "=<uri_of_filesystem> if necessary");
+  ret.append("\n");
+  ret.append("  --" + LDBCommand::ARG_SECONDARY_PATH +
+             "=<secondary_path> to open DB as secondary instance. Operations "
+             "not supported in secondary instance will fail.\n\n");
+  ret.append(
+      "The following optional parameters control if keys/values are "
+      "input/output as hex or as plain strings:\n");
+  ret.append("  --" + LDBCommand::ARG_KEY_HEX +
+             " : Keys are input/output as hex\n");
+  ret.append("  --" + LDBCommand::ARG_VALUE_HEX +
+             " : Values are input/output as hex\n");
+  ret.append("  --" + LDBCommand::ARG_HEX +
+             " : Both keys and values are input/output as hex\n");
+  ret.append("\n");
+
+  ret.append(
+      "The following optional parameters control the database "
+      "internals:\n");
+  ret.append(
+      "  --" + LDBCommand::ARG_CF_NAME +
+      "=<string> : name of the column family to operate on. default: default "
+      "column family\n");
+  ret.append("  --" + LDBCommand::ARG_TTL +
+             " with 'put','get','scan','dump','query','batchput'"
+             " : DB supports ttl and value is internally timestamp-suffixed\n");
+  ret.append("  --" + LDBCommand::ARG_TRY_LOAD_OPTIONS +
+             " : Try to load option file from DB. Default to true if " +
+             LDBCommand::ARG_DB +
+             " is specified and not creating a new DB and not open as TTL DB. "
+             "Can be set to false explicitly.\n");
+  ret.append("  --" + LDBCommand::ARG_DISABLE_CONSISTENCY_CHECKS +
+             " : Set options.force_consistency_checks = false.\n");
+  ret.append("  --" + LDBCommand::ARG_IGNORE_UNKNOWN_OPTIONS +
+             " : Ignore unknown options when loading option file.\n");
+  ret.append("  --" + LDBCommand::ARG_BLOOM_BITS + "=<int,e.g.:14>\n");
+  ret.append("  --" + LDBCommand::ARG_FIX_PREFIX_LEN + "=<int,e.g.:14>\n");
+  ret.append("  --" + LDBCommand::ARG_COMPRESSION_TYPE +
+             "=<no|snappy|zlib|bzip2|lz4|lz4hc|xpress|zstd>\n");
+  ret.append("  --" + LDBCommand::ARG_COMPRESSION_MAX_DICT_BYTES +
+             "=<int,e.g.:16384>\n");
+  ret.append("  --" + LDBCommand::ARG_BLOCK_SIZE + "=<block_size_in_bytes>\n");
+  ret.append("  --" + LDBCommand::ARG_AUTO_COMPACTION + "=<true|false>\n");
+  ret.append("  --" + LDBCommand::ARG_DB_WRITE_BUFFER_SIZE +
+             "=<int,e.g.:16777216>\n");
+  ret.append("  --" + LDBCommand::ARG_WRITE_BUFFER_SIZE +
+             "=<int,e.g.:4194304>\n");
+  ret.append("  --" + LDBCommand::ARG_FILE_SIZE + "=<int,e.g.:2097152>\n");
+  ret.append("  --" + LDBCommand::ARG_ENABLE_BLOB_FILES +
+             " : Enable key-value separation using BlobDB\n");
+  ret.append("  --" + LDBCommand::ARG_MIN_BLOB_SIZE + "=<int,e.g.:2097152>\n");
+  ret.append("  --" + LDBCommand::ARG_BLOB_FILE_SIZE + "=<int,e.g.:2097152>\n");
+  ret.append("  --" + LDBCommand::ARG_BLOB_COMPRESSION_TYPE +
+             "=<no|snappy|zlib|bzip2|lz4|lz4hc|xpress|zstd>\n");
+  ret.append("  --" + LDBCommand::ARG_ENABLE_BLOB_GARBAGE_COLLECTION +
+             " : Enable blob garbage collection\n");
+  ret.append("  --" + LDBCommand::ARG_BLOB_GARBAGE_COLLECTION_AGE_CUTOFF +
+             "=<double,e.g.:0.25>\n");
+  ret.append("  --" + LDBCommand::ARG_BLOB_GARBAGE_COLLECTION_FORCE_THRESHOLD +
+             "=<double,e.g.:0.25>\n");
+  ret.append("  --" + LDBCommand::ARG_BLOB_COMPACTION_READAHEAD_SIZE +
+             "=<int,e.g.:2097152>\n");
+
+  ret.append("\n\n");
+  ret.append("Data Access Commands:\n");
+  PutCommand::Help(ret);
+  GetCommand::Help(ret);
+  BatchPutCommand::Help(ret);
+  ScanCommand::Help(ret);
+  DeleteCommand::Help(ret);
+  DeleteRangeCommand::Help(ret);
+  DBQuerierCommand::Help(ret);
+  ApproxSizeCommand::Help(ret);
+  CheckConsistencyCommand::Help(ret);
+  ListFileRangeDeletesCommand::Help(ret);
+
+  ret.append("\n\n");
+  ret.append("Admin Commands:\n");
+  WALDumperCommand::Help(ret);
+  CompactorCommand::Help(ret);
+  ReduceDBLevelsCommand::Help(ret);
+  ChangeCompactionStyleCommand::Help(ret);
+  DBDumperCommand::Help(ret);
+  DBLoaderCommand::Help(ret);
+  ManifestDumpCommand::Help(ret);
+  UpdateManifestCommand::Help(ret);
+  FileChecksumDumpCommand::Help(ret);
+  GetPropertyCommand::Help(ret);
+  ListColumnFamiliesCommand::Help(ret);
+  CreateColumnFamilyCommand::Help(ret);
+  DropColumnFamilyCommand::Help(ret);
+  DBFileDumperCommand::Help(ret);
+  InternalDumpCommand::Help(ret);
+  DBLiveFilesMetadataDumperCommand::Help(ret);
+  RepairCommand::Help(ret);
+  BackupCommand::Help(ret);
+  RestoreCommand::Help(ret);
+  CheckPointCommand::Help(ret);
+  WriteExternalSstFilesCommand::Help(ret);
+  IngestExternalSstFilesCommand::Help(ret);
+  UnsafeRemoveSstFileCommand::Help(ret);
+
+  fprintf(to_stderr ? stderr : stdout, "%s\n", ret.c_str());
+}
+
+int LDBCommandRunner::RunCommand(
+    int argc, char const* const* argv, Options options,
+    const LDBOptions& ldb_options,
+    const std::vector<ColumnFamilyDescriptor>* column_families) {
+  if (argc <= 2) {
+    if (argc <= 1) {
+      PrintHelp(ldb_options, argv[0], /*to_stderr*/ true);
+      return 1;
+    } else if (std::string(argv[1]) == "--version") {
+      printf("ldb from RocksDB %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR,
+             ROCKSDB_PATCH);
+      return 0;
+    } else if (std::string(argv[1]) == "--help") {
+      PrintHelp(ldb_options, argv[0], /*to_stderr*/ false);
+      return 0;
+    } else {
+      PrintHelp(ldb_options, argv[0], /*to_stderr*/ true);
+      return 1;
+    }
+  }
+
+  LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(
+      argc, argv, options, ldb_options, column_families);
+  if (cmdObj == nullptr) {
+    fprintf(stderr, "Unknown command\n");
+    PrintHelp(ldb_options, argv[0], /*to_stderr*/ true);
+    return 1;
+  }
+
+  if (!cmdObj->ValidateCmdLineOptions()) {
+    return 1;
+  }
+
+  cmdObj->Run();
+  LDBCommandExecuteResult ret = cmdObj->GetExecuteState();
+  if (!ret.ToString().empty()) {
+    fprintf(stderr, "%s\n", ret.ToString().c_str());
+  }
+  delete cmdObj;
+
+  return ret.IsFailed() ? 1 : 0;
+}
+
+void LDBTool::Run(int argc, char** argv, Options options,
+                  const LDBOptions& ldb_options,
+                  const std::vector<ColumnFamilyDescriptor>* column_families) {
+  int error_code = LDBCommandRunner::RunCommand(argc, argv, options,
+                                                ldb_options, column_families);
+  exit(error_code);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/pflag b/src/rocksdb/tools/pflag
new file mode 100755
index 000000000..f3394a666
--- /dev/null
+++ b/src/rocksdb/tools/pflag
@@ -0,0 +1,217 @@
+#!/usr/bin/env bash
+#
+#(c) 2004-present, Facebook, all rights reserved. 
+# See the LICENSE file for usage and distribution rights.
+#
+
+trap 'echo "Caught exception, dying"; exit' 1 2 3 15
+
+ME=`basename $0`
+SERVER=`hostname`
+
+#parameters used
+#
+Dump_Config=0
+DEBUG=
+OS=`/bin/uname -s`
+VMEM=
+RSS=
+CPU=
+VERBOSE=
+VAR=
+LIMIT=
+ACTION=
+N=
+WAIT=
+
+#
+#supported OS: Linux only for now. Easy to add
+#
+oscheck() {
+  case ${OS} in
+    Linux)
+     VMEM=vsz
+     RSS=rss
+     CPU=bsdtime
+     ;;
+    *)
+      die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
+      ;;
+  esac
+}
+
+
+verbose() {
+  if [ "x$DEBUG" != "x" ]; then
+    echo "$@" >&2
+  fi
+}
+
+warn() {
+  echo "$@" >&2
+}
+
+die() {
+    echo "ERROR: " "$@" >&2;
+    exit;
+}
+
+dump_config() {
+  cat <<EOCONFIG;
+$ME running on ${HOSTNAME} at `date`
+
+Configuration for this run:
+  PID to monitor     : ${PID}
+  Resource monitored : ${VAR}
+  Resource limit     : ${LIMIT}
+  Check every        : ${WAIT} seconds
+  No. of times run   : ${N}
+  What to do         : ${ACTION}
+EOCONFIG
+
+}
+
+usage() {
+  cat <<USAGE; exit
+$@
+
+Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait]
+
+Monitor a process for set of violations. Options:
+
+  -p: PID of process to monitor
+
+  -x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM
+
+  -l: what is the threshold/limit for the metric that is being sensed.
+    Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
+    NOTE: defaults to 1GB
+
+  -a: action. Currently {warn|die|kill} are supported. 
+    The default action is to 'warn'. Here is the behavior:
+
+    warn: complain if usage exceeds threshold, but continue monitoring
+    kill: complain, kill the db_bench process and exit
+    die:  if usage exceeds threshold, die immediately
+
+  -n: number of cycles to monitor. Default is to monitor until PID no longer exists.
+
+  -w: wait time per cycle of monitoring. Default is 5 seconds.
+
+  -v: verbose messaging
+
+USAGE
+
+}
+
+#set default values if none given
+set_defaults_if_noopt_given() {
+
+  : ${VAR:=vsz}
+  : ${LIMIT:=1024000}
+  : ${WAIT:=5}
+  : ${N:=999999}
+  : ${ACTION:=warn}
+}
+
+validate_options() {
+  if [ "x$PID" = "x" -a $Dump_Config -ne 1 ]; then
+    usage "PID is mandatory"
+  fi
+}
+
+###### START
+
+
+  while getopts ":p:x:l:a:n:t:vhd" opt; do
+    case $opt in
+      d)
+          Dump_Config=1
+          ;;
+      h)
+          usage;
+          ;;
+      a)
+        ACTION=${OPTARG};
+        ;;
+      v)
+        DEBUG=1;
+        ;;
+      p)
+        PID=$OPTARG;
+        ;;
+      x)
+        VAR=$OPTARG;
+        ;;
+      l)
+        LIMIT=$OPTARG;
+        ;;
+      w)
+        WAIT=$OPTARG;
+        ;;
+      n)
+        N=$OPTARG;
+        ;;
+      \?) 
+        usage;
+        ;;
+    esac
+  done
+
+oscheck;
+set_defaults_if_noopt_given;
+validate_options;
+
+if [ $Dump_Config -eq 1 ]; then
+    dump_config;
+    exit;
+fi
+
+Done=0
+
+verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration";
+
+while [ $Done -eq 0 ]; do
+  VAL=`/bin/ps h -p $PID -o ${VAR} | perl -pe 'chomp; s/(.*)m/$1 * 1024/e; s/(.*)g/$1 * 1024 * 1024/e;'`
+  if [ ${VAL:=0} -eq 0 ]; then
+    warn "Process $PID ended without incident."
+    Done=1;
+    break;
+  fi
+
+  if [ $VAL -ge $LIMIT ]; then
+    Done=1;
+  else
+    echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
+    sleep $WAIT;
+  fi
+  if [ $Done -eq 1 ]; then
+
+    if [ "$ACTION" = "kill" ]; then
+        kill ${PID} || kill -3 ${PID}
+        exit;
+
+    elif [ "$ACTION" = "warn" ]; then
+
+      # go back to monitoring.
+
+      warn "`date` WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
+      Done=0  #go back to monitoring
+
+    elif [ "$ACTION" = "die" ]; then
+      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
+      warn "The process details are below: "
+      warn "`ps -p ${PID} -o pid,ppid,bsdtime,rss,vsz,cmd,args`"
+      warn ""
+
+      #should we send email/notify someone? TODO... for now, bail.
+
+      exit -1;
+
+    fi
+  else
+      :
+      #warn "INFO: PID $PID, $VAR = $VAL, limit ($LIMIT) not exceeded";
+  fi
+done
+
diff --git a/src/rocksdb/tools/reduce_levels_test.cc b/src/rocksdb/tools/reduce_levels_test.cc
new file mode 100644
index 000000000..c8604bf43
--- /dev/null
+++ b/src/rocksdb/tools/reduce_levels_test.cc
@@ -0,0 +1,222 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "tools/ldb_cmd_impl.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ReduceLevelTest : public testing::Test {
+ public:
+  ReduceLevelTest() {
+    dbname_ = test::PerThreadDBPath("db_reduce_levels_test");
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+  }
+
+  Status OpenDB(bool create_if_missing, int levels);
+
+  Status Put(const std::string& k, const std::string& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  std::string Get(const std::string& k) {
+    ReadOptions options;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  Status Flush() {
+    if (db_ == nullptr) {
+      return Status::InvalidArgument("DB not opened.");
+    }
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_);
+    return db_impl->TEST_FlushMemTable();
+  }
+
+  void MoveL0FileToLevel(int level) {
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_);
+    for (int i = 0; i < level; ++i) {
+      ASSERT_OK(db_impl->TEST_CompactRange(i, nullptr, nullptr));
+    }
+  }
+
+  void CloseDB() {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  bool ReduceLevels(int target_level);
+
+  int FilesOnLevel(int level) {
+    std::string property;
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + std::to_string(level), &property));
+    return atoi(property.c_str());
+  }
+
+ private:
+  std::string dbname_;
+  DB* db_;
+};
+
+Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels) {
+  ROCKSDB_NAMESPACE::Options opt;
+  opt.num_levels = num_levels;
+  opt.create_if_missing = create_if_missing;
+  ROCKSDB_NAMESPACE::Status st =
+      ROCKSDB_NAMESPACE::DB::Open(opt, dbname_, &db_);
+  if (!st.ok()) {
+    fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str());
+  }
+  return st;
+}
+
+bool ReduceLevelTest::ReduceLevels(int target_level) {
+  std::vector<std::string> args =
+      ROCKSDB_NAMESPACE::ReduceDBLevelsCommand::PrepareArgs(
+          dbname_, target_level, false);
+  LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs(
+      args, Options(), LDBOptions(), nullptr, LDBCommand::SelectCommand);
+  level_reducer->Run();
+  bool is_succeed = level_reducer->GetExecuteState().IsSucceed();
+  delete level_reducer;
+  return is_succeed;
+}
+
+TEST_F(ReduceLevelTest, Last_Level) {
+  ASSERT_OK(OpenDB(true, 4));
+  ASSERT_OK(Put("aaaa", "11111"));
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(3);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3));
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2));
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  CloseDB();
+}
+
+TEST_F(ReduceLevelTest, Top_Level) {
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("aaaa", "11111"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(FilesOnLevel(0), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(4));
+  ASSERT_OK(OpenDB(true, 4));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2));
+  CloseDB();
+}
+
+TEST_F(ReduceLevelTest, All_Levels) {
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("a", "a11111"));
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(4);
+  ASSERT_EQ(FilesOnLevel(4), 1);
+  CloseDB();
+
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("b", "b11111"));
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(3);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  ASSERT_EQ(FilesOnLevel(4), 1);
+  CloseDB();
+
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("c", "c11111"));
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(2);
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  ASSERT_EQ(FilesOnLevel(4), 1);
+  CloseDB();
+
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("d", "d11111"));
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(1);
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  ASSERT_EQ(FilesOnLevel(4), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(4));
+  ASSERT_OK(OpenDB(true, 4));
+  ASSERT_EQ("a11111", Get("a"));
+  ASSERT_EQ("b11111", Get("b"));
+  ASSERT_EQ("c11111", Get("c"));
+  ASSERT_EQ("d11111", Get("d"));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3));
+  ASSERT_EQ("a11111", Get("a"));
+  ASSERT_EQ("b11111", Get("b"));
+  ASSERT_EQ("c11111", Get("c"));
+  ASSERT_EQ("d11111", Get("d"));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2));
+  ASSERT_EQ("a11111", Get("a"));
+  ASSERT_EQ("b11111", Get("b"));
+  ASSERT_EQ("c11111", Get("c"));
+  ASSERT_EQ("d11111", Get("d"));
+  CloseDB();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as LDBCommand is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/tools/regression_test.sh b/src/rocksdb/tools/regression_test.sh
new file mode 100755
index 000000000..2743c5aee
--- /dev/null
+++ b/src/rocksdb/tools/regression_test.sh
@@ -0,0 +1,477 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# The RocksDB regression test script.
+# REQUIREMENT: must be able to run make db_bench in the current directory
+#
+# This script will do the following things in order:
+#
+# 1. check out the specified rocksdb commit.
+# 2. build db_bench using the specified commit
+# 3. setup test directory $TEST_PATH.  If not specified, then the test directory
+#    will be "/tmp/rocksdb/regression_test"
+# 4. run set of benchmarks on the specified host
+#    (can be either locally or remotely)
+# 5. generate report in the $RESULT_PATH.  If RESULT_PATH is not specified,
+#    RESULT_PATH will be set to $TEST_PATH/current_time
+#
+# = Examples =
+# * Run the regression test using rocksdb commit abcdef that outputs results
+#   and temp files in "/my/output/dir"
+#r
+#   TEST_PATH=/my/output/dir COMMIT_ID=abcdef ./tools/regression_test.sh
+#
+# * Run the regression test on a remost host under "/my/output/dir" directory
+#   and stores the result locally in "/my/benchmark/results" using commit
+#   abcdef and with the rocksdb options specified in /my/path/to/OPTIONS-012345
+#   with 1000000000 keys in each benchmark in the regression test where each
+#   key and value are 100 and 900 bytes respectively:
+#
+#   REMOTE_USER_AT_HOST=yhchiang@my.remote.host \
+#       TEST_PATH=/my/output/dir \
+#       RESULT_PATH=/my/benchmark/results \
+#       COMMIT_ID=abcdef \
+#       OPTIONS_FILE=/my/path/to/OPTIONS-012345 \
+#       NUM_KEYS=1000000000 \
+#       KEY_SIZE=100 \
+#       VALUE_SIZE=900 \
+#       ./tools/regression_test.sh
+#
+# = Regression test environmental parameters =
+#   DEBUG: If true, then the script will not build db_bench if db_bench already
+#       exists
+#       Default: 0
+#   TEST_MODE: If 1, run fillseqdeterminstic and benchmarks both
+#       if 0, only run fillseqdeterministc
+#       if 2, only run benchmarks
+#       Default: 1
+#   TEST_PATH: the root directory of the regression test.
+#       Default: "/tmp/rocksdb/regression_test"
+#       !!! NOTE !!! - a DB will also be saved in $TEST_PATH/../db
+#   RESULT_PATH: the directory where the regression results will be generated.
+#       Default: "$TEST_PATH/current_time"
+#   REMOTE_USER_AT_HOST: If set, then test will run on the specified host under
+#       TEST_PATH directory and outputs test results locally in RESULT_PATH
+#       The REMOTE_USER_AT_HOST should follow the format user-id@host.name
+#   DB_PATH: the path where the rocksdb database will be created during the
+#       regression test.  Default:  $TEST_PATH/db
+#   WAL_PATH: the path where the rocksdb WAL will be outputed.
+#       Default:  $TEST_PATH/wal
+#   OPTIONS_FILE:  If specified, then the regression test will use the specified
+#       file to initialize the RocksDB options in its benchmarks.  Note that
+#       this feature only work for commits after 88acd93 or rocksdb version
+#       later than 4.9.
+#   DELETE_TEST_PATH: If true, then the test directory will be deleted
+#       after the script ends.
+#       Default: 0
+#
+# = db_bench parameters =
+#   NUM_THREADS:  The number of concurrent foreground threads that will issue
+#       database operations in the benchmark.  Default: 16.
+#   NUM_KEYS:  The key range that will be used in the entire regression test.
+#       Default: 1G.
+#   NUM_OPS:  The number of operations (reads, writes, or deletes) that will
+#       be issued in EACH thread.
+#       Default: $NUM_KEYS / $NUM_THREADS
+#   KEY_SIZE:  The size of each key in bytes in db_bench.  Default: 100.
+#   VALUE_SIZE:  The size of each value in bytes in db_bench.  Default: 900.
+#   CACHE_SIZE:  The size of RocksDB block cache used in db_bench.  Default: 1G
+#   STATISTICS:  If 1, then statistics is on in db_bench.  Default: 0.
+#   COMPRESSION_RATIO:  The compression ratio of the key generated in db_bench.
+#       Default: 0.5.
+#   HISTOGRAM:  If 1, then the histogram feature on performance feature is on.
+#   STATS_PER_INTERVAL:  If 1, then the statistics will be reported for every
+#       STATS_INTERVAL_SECONDS seconds.  Default 1.
+#   STATS_INTERVAL_SECONDS:  If STATS_PER_INTERVAL is set to 1, then statistics
+#       will be reported for every STATS_INTERVAL_SECONDS.  Default 60.
+#   MAX_BACKGROUND_FLUSHES:  The maxinum number of concurrent flushes in
+#       db_bench.  Default: 4.
+#   MAX_BACKGROUND_COMPACTIONS:  The maximum number of concurrent compactions
+#       in db_bench.  Default: 16.
+#   NUM_HIGH_PRI_THREADS:  The number of high-pri threads available for
+#       concurrent flushes in db_bench.  Default: 4.
+#   NUM_LOW_PRI_THREADS:  The number of low-pri threads available for
+#       concurrent compactions in db_bench.  Default: 16.
+#   SEEK_NEXTS:  Controls how many Next() will be called after seek.
+#       Default: 10.
+#   SEED:  random seed that controls the randomness of the benchmark.
+#       Default: $( date +%s )
+
+#==============================================================================
+#  CONSTANT
+#==============================================================================
+TITLE_FORMAT="%40s,%25s,%30s,%7s,%9s,%8s,"
+TITLE_FORMAT+="%10s,%13s,%14s,%11s,%12s,"
+TITLE_FORMAT+="%7s,%11s,"
+TITLE_FORMAT+="%9s,%10s,%10s,%10s,%10s,%10s,%5s,"
+TITLE_FORMAT+="%5s,%5s,%5s" # time
+TITLE_FORMAT+="\n"
+
+DATA_FORMAT="%40s,%25s,%30s,%7s,%9s,%8s,"
+DATA_FORMAT+="%10s,%13.0f,%14s,%11s,%12s,"
+DATA_FORMAT+="%7s,%11s,"
+DATA_FORMAT+="%9.0f,%10.0f,%10.0f,%10.0f,%10.0f,%10.0f,%5.0f,"
+DATA_FORMAT+="%5.0f,%5.0f,%5.0f" # time
+DATA_FORMAT+="\n"
+
+MAIN_PATTERN="$1""[[:blank:]]+:.*[[:blank:]]+([0-9\.]+)[[:blank:]]+ops/sec"
+PERC_PATTERN="Percentiles: P50: ([0-9\.]+) P75: ([0-9\.]+) "
+PERC_PATTERN+="P99: ([0-9\.]+) P99.9: ([0-9\.]+) P99.99: ([0-9\.]+)"
+#==============================================================================
+
+function main {
+  TEST_ROOT_DIR=${TEST_PATH:-"/tmp/rocksdb/regression_test"}
+  init_arguments $TEST_ROOT_DIR
+
+  build_db_bench_and_ldb
+
+  setup_test_directory
+  if [ $TEST_MODE -le 1 ]; then
+      test_remote "test -d $ORIGIN_PATH"
+      if [[ $? -ne 0 ]]; then
+          echo "Building DB..."
+          # compactall alone will not print ops or threads, which will fail update_report
+          run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
+          # only save for future use on success
+          test_remote "mv $DB_PATH $ORIGIN_PATH"
+      fi
+  fi
+  if [ $TEST_MODE -ge 1 ]; then
+      build_checkpoint
+      run_db_bench "readrandom"
+      run_db_bench "readwhilewriting"
+      run_db_bench "deleterandom"
+      run_db_bench "seekrandom"
+      run_db_bench "seekrandomwhilewriting"
+      run_db_bench "multireadrandom"
+  fi
+
+  cleanup_test_directory $TEST_ROOT_DIR
+  echo ""
+  echo "Benchmark completed!  Results are available in $RESULT_PATH"
+}
+
+############################################################################
+function init_arguments {
+  K=1024
+  M=$((1024 * K))
+  G=$((1024 * M))
+
+  current_time=$(date +"%F-%H:%M:%S")
+  RESULT_PATH=${RESULT_PATH:-"$1/results/$current_time"}
+  COMMIT_ID=`hg id -i 2>/dev/null || git rev-parse HEAD 2>/dev/null || echo 'unknown'`
+  SUMMARY_FILE="$RESULT_PATH/SUMMARY.csv"
+
+  DB_PATH=${3:-"$1/db"}
+  ORIGIN_PATH=${ORIGIN_PATH:-"$(dirname $(dirname $DB_PATH))/db"}
+  WAL_PATH=${4:-""}
+  if [ -z "$REMOTE_USER_AT_HOST" ]; then
+    DB_BENCH_DIR=${5:-"."}
+  else
+    DB_BENCH_DIR=${5:-"$1/db_bench"}
+  fi
+
+  DEBUG=${DEBUG:-0}
+  TEST_MODE=${TEST_MODE:-1}
+  SCP=${SCP:-"scp"}
+  SSH=${SSH:-"ssh"}
+  NUM_THREADS=${NUM_THREADS:-16}
+  NUM_KEYS=${NUM_KEYS:-$((1 * G))}  # key range
+  NUM_OPS=${NUM_OPS:-$(($NUM_KEYS / $NUM_THREADS))}
+  KEY_SIZE=${KEY_SIZE:-100}
+  VALUE_SIZE=${VALUE_SIZE:-900}
+  CACHE_SIZE=${CACHE_SIZE:-$((1 * G))}
+  STATISTICS=${STATISTICS:-0}
+  COMPRESSION_RATIO=${COMPRESSION_RATIO:-0.5}
+  HISTOGRAM=${HISTOGRAM:-1}
+  NUM_MULTI_DB=${NUM_MULTI_DB:-1}
+  STATS_PER_INTERVAL=${STATS_PER_INTERVAL:-1}
+  STATS_INTERVAL_SECONDS=${STATS_INTERVAL_SECONDS:-600}
+  MAX_BACKGROUND_FLUSHES=${MAX_BACKGROUND_FLUSHES:-4}
+  MAX_BACKGROUND_COMPACTIONS=${MAX_BACKGROUND_COMPACTIONS:-16}
+  NUM_HIGH_PRI_THREADS=${NUM_HIGH_PRI_THREADS:-4}
+  NUM_LOW_PRI_THREADS=${NUM_LOW_PRI_THREADS:-16}
+  DELETE_TEST_PATH=${DELETE_TEST_PATH:-0}
+  SEEK_NEXTS=${SEEK_NEXTS:-10}
+  SEED=${SEED:-$( date +%s )}
+  MULTIREAD_BATCH_SIZE=${MULTIREAD_BATCH_SIZE:-128}
+  MULTIREAD_STRIDE=${MULTIREAD_STRIDE:-12}
+  PERF_LEVEL=${PERF_LEVEL:-1}
+}
+
+# $1 --- benchmark name
+# $2 --- number of operations.  Default: $NUM_KEYS
+# $3 --- number of threads.  Default $NUM_THREADS
+# $4 --- use_existing_db.  Default: 1
+# $5 --- update_report. Default: 1
+function run_db_bench {
+  # Make sure no other db_bench is running. (Make sure command succeeds if pidof
+  # command exists but finds nothing.)
+  pids_cmd='pidof db_bench || pidof --version > /dev/null'
+  # But first, make best effort to kill any db_bench that have run for more
+  # than 12 hours, as that indicates a hung or runaway process.
+  kill_old_cmd='for PID in $(pidof db_bench); do [ "$(($(stat -c %Y /proc/$PID) + 43200))" -lt "$(date +%s)" ] && echo "Killing old db_bench $PID" && kill $PID && sleep 5 && kill -9 $PID && sleep 5; done; pidof --version > /dev/null'
+  if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+    pids_cmd="$SSH $REMOTE_USER_AT_HOST '$pids_cmd'"
+    kill_old_cmd="$SSH $REMOTE_USER_AT_HOST '$kill_old_cmd'"
+  fi
+
+  eval $kill_old_cmd
+  exit_on_error $? "$kill_old_cmd"
+
+  pids_output="$(eval $pids_cmd)"
+  exit_on_error $? "$pids_cmd"
+
+  if [ "$pids_output" != "" ]; then
+    echo "Stopped regression_test.sh as there're still recent db_bench "
+    echo "processes running: $pids_output"
+    echo "Clean up test directory"
+    cleanup_test_directory $TEST_ROOT_DIR
+    exit 2
+  fi
+
+  # Build db_bench command
+  ops=${2:-$NUM_OPS}
+  threads=${3:-$NUM_THREADS}
+  USE_EXISTING_DB=${4:-1}
+  UPDATE_REPORT=${5:-1}
+  echo ""
+  echo "======================================================================="
+  echo "Benchmark $1"
+  echo "======================================================================="
+  echo ""
+  db_bench_error=0
+  options_file_arg=$(setup_options_file)
+  echo "$options_file_arg"
+  # use `which time` to avoid using bash's internal time command
+  db_bench_cmd="\$(which time) -p $DB_BENCH_DIR/db_bench \
+      --benchmarks=$1 --db=$DB_PATH --wal_dir=$WAL_PATH \
+      --use_existing_db=$USE_EXISTING_DB \
+      --perf_level=$PERF_LEVEL \
+      --disable_auto_compactions \
+      --threads=$threads \
+      --num=$NUM_KEYS \
+      --reads=$ops \
+      --writes=$ops \
+      --deletes=$ops \
+      --key_size=$KEY_SIZE \
+      --value_size=$VALUE_SIZE \
+      --cache_size=$CACHE_SIZE \
+      --statistics=$STATISTICS \
+      $options_file_arg \
+      --compression_ratio=$COMPRESSION_RATIO \
+      --histogram=$HISTOGRAM \
+      --seek_nexts=$SEEK_NEXTS \
+      --stats_per_interval=$STATS_PER_INTERVAL \
+      --stats_interval_seconds=$STATS_INTERVAL_SECONDS \
+      --max_background_flushes=$MAX_BACKGROUND_FLUSHES \
+      --num_multi_db=$NUM_MULTI_DB \
+      --max_background_compactions=$MAX_BACKGROUND_COMPACTIONS \
+      --num_high_pri_threads=$NUM_HIGH_PRI_THREADS \
+      --num_low_pri_threads=$NUM_LOW_PRI_THREADS \
+      --seed=$SEED \
+      --multiread_batched=true \
+      --batch_size=$MULTIREAD_BATCH_SIZE \
+      --multiread_stride=$MULTIREAD_STRIDE 2>&1"
+  if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+    echo "Running benchmark remotely on $REMOTE_USER_AT_HOST"
+    db_bench_cmd="$SSH $REMOTE_USER_AT_HOST '$db_bench_cmd'"
+  fi
+  echo db_bench_cmd="$db_bench_cmd"
+
+  # Run the db_bench command
+  eval $db_bench_cmd | tee -a "$RESULT_PATH/$1"
+  exit_on_error ${PIPESTATUS[0]} db_bench
+  if [ $UPDATE_REPORT -ne 0 ]; then
+    update_report "$1" "$RESULT_PATH/$1" $ops $threads
+  fi
+}
+
+function build_checkpoint {
+    cmd_prefix=""
+    if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+        cmd_prefix="$SSH $REMOTE_USER_AT_HOST "
+    fi
+    if [ $NUM_MULTI_DB -gt 1 ]; then
+        dirs=$($cmd_prefix find $ORIGIN_PATH -type d -links 2)
+        for dir in $dirs; do
+            db_index=$(basename $dir)
+            echo "Building checkpoints: $ORIGIN_PATH/$db_index -> $DB_PATH/$db_index ..."
+            $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH/$db_index \
+                        --db=$ORIGIN_PATH/$db_index --try_load_options 2>&1
+            exit_on_error $?
+        done
+    else
+        # checkpoint cannot build in directory already exists
+        $cmd_prefix rm -rf $DB_PATH
+        echo "Building checkpoint: $ORIGIN_PATH -> $DB_PATH ..."
+        $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH \
+                    --db=$ORIGIN_PATH --try_load_options 2>&1
+        exit_on_error $?
+    fi
+}
+
+function multiply {
+  echo "$1 * $2" | bc
+}
+
+# $1 --- name of the benchmark
+# $2 --- the filename of the output log of db_bench
+function update_report {
+  main_result=`cat $2 | grep $1`
+  exit_on_error $?
+  perc_statement=`cat $2 | grep Percentile`
+  exit_on_error $?
+
+  # Obtain micros / op
+
+  [[ $main_result =~ $MAIN_PATTERN ]]
+  ops_per_s=${BASH_REMATCH[1]}
+
+  # Obtain percentile information
+  [[ $perc_statement =~ $PERC_PATTERN ]]
+  perc[0]=${BASH_REMATCH[1]}  # p50
+  perc[1]=${BASH_REMATCH[2]}  # p75
+  perc[2]=${BASH_REMATCH[3]}  # p99
+  perc[3]=${BASH_REMATCH[4]}  # p99.9
+  perc[4]=${BASH_REMATCH[5]}  # p99.99
+
+  # Parse the output of the time command
+  real_sec=`tail -3 $2 | grep real | awk '{print $2}'`
+  user_sec=`tail -3 $2 | grep user | awk '{print $2}'`
+  sys_sec=`tail -3 $2 | grep sys | awk '{print $2}'`
+
+  (printf "$DATA_FORMAT" \
+    $COMMIT_ID $1 $REMOTE_USER_AT_HOST $NUM_MULTI_DB $NUM_KEYS $KEY_SIZE $VALUE_SIZE \
+       $(multiply $COMPRESSION_RATIO 100) \
+       $3 $4 $CACHE_SIZE \
+       $MAX_BACKGROUND_FLUSHES $MAX_BACKGROUND_COMPACTIONS \
+       $ops_per_s \
+       $(multiply ${perc[0]} 1000) \
+       $(multiply ${perc[1]} 1000) \
+       $(multiply ${perc[2]} 1000) \
+       $(multiply ${perc[3]} 1000) \
+       $(multiply ${perc[4]} 1000) \
+       $DEBUG \
+       $real_sec \
+       $user_sec \
+       $sys_sec \
+       >> $SUMMARY_FILE)
+  exit_on_error $?
+}
+
+function exit_on_error {
+  if [ $1 -ne 0 ]; then
+    echo ""
+    echo "ERROR: Benchmark did not complete successfully."
+    if ! [ -z "$2" ]; then
+      echo "Failure command: $2"
+    fi
+    echo "Partial results are output to $RESULT_PATH"
+    echo "ERROR" >> $SUMMARY_FILE
+    exit $1
+  fi
+}
+
+function build_db_bench_and_ldb {
+  echo "Building db_bench & ldb ..."
+
+  make clean
+  exit_on_error $?
+
+  DEBUG_LEVEL=0 make db_bench ldb -j32
+  exit_on_error $?
+}
+
+function run_remote {
+  test_remote "$1"
+  exit_on_error $? "$1"
+}
+
+function test_remote {
+  if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+      cmd="$SSH $REMOTE_USER_AT_HOST '$1'"
+  else
+      cmd="$1"
+  fi
+  eval "$cmd"
+}
+
+function run_local {
+  eval "$1"
+  exit_on_error $? "$1"
+}
+
+function setup_options_file {
+  if ! [ -z "$OPTIONS_FILE" ]; then
+    if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+      options_file="$DB_BENCH_DIR/OPTIONS_FILE"
+      run_local "$SCP $OPTIONS_FILE $REMOTE_USER_AT_HOST:$options_file"
+    else
+      options_file="$OPTIONS_FILE"
+    fi
+    echo "--options_file=$options_file"
+  fi
+  echo ""
+}
+
+function setup_test_directory {
+  echo "Deleting old regression test directories and creating new ones"
+
+  run_local 'test "$DB_PATH" != "."'
+  run_remote "rm -rf $DB_PATH"
+
+  if [ "$DB_BENCH_DIR" != "." ]; then
+    run_remote "rm -rf $DB_BENCH_DIR"
+  fi
+
+  run_local 'test "$RESULT_PATH" != "."'
+  run_local "rm -rf $RESULT_PATH"
+
+  if ! [ -z "$WAL_PATH" ]; then
+    run_remote "rm -rf $WAL_PATH"
+    run_remote "mkdir -p $WAL_PATH"
+  fi
+
+  run_remote "mkdir -p $DB_PATH"
+
+  run_remote "mkdir -p $DB_BENCH_DIR"
+  run_remote "ls -l $DB_BENCH_DIR"
+
+  if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+      run_local "$SCP ./db_bench $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/db_bench"
+      run_local "$SCP ./ldb $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/ldb"
+  fi
+
+  run_local "mkdir -p $RESULT_PATH"
+
+  (printf $TITLE_FORMAT \
+      "commit id" "benchmark" "user@host" "num-dbs" "key-range" "key-size" \
+      "value-size" "compress-rate" "ops-per-thread" "num-threads" "cache-size" \
+      "flushes" "compactions" \
+      "ops-per-s" "p50" "p75" "p99" "p99.9" "p99.99" "debug" \
+      "real-sec" "user-sec" "sys-sec" \
+      >> $SUMMARY_FILE)
+  exit_on_error $?
+}
+
+function cleanup_test_directory {
+
+  if [ $DELETE_TEST_PATH -ne 0 ]; then
+    echo "Clear old regression test directories and creating new ones"
+    run_remote "rm -rf $DB_PATH"
+    run_remote "rm -rf $WAL_PATH"
+    if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+      run_remote "rm -rf $DB_BENCH_DIR"
+    fi
+    run_remote "rm -rf $1"
+  else
+    echo "------------ DEBUG MODE ------------"
+    echo "DB  PATH: $DB_PATH"
+    echo "WAL PATH: $WAL_PATH"
+  fi
+}
+
+############################################################################
+
+# shellcheck disable=SC2068
+main $@
diff --git a/src/rocksdb/tools/restore_db.sh b/src/rocksdb/tools/restore_db.sh
new file mode 100755
index 000000000..ed89794b2
--- /dev/null
+++ b/src/rocksdb/tools/restore_db.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+#
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: ${BASH_SOURCE[0]} <Backup Dir> <DB Path>"
+  exit 1
+fi
+
+backup_dir="$1"
+db_dir="$2"
+
+echo "== Restoring latest from $backup_dir to $db_dir"
+./ldb restore --db="$db_dir" --backup_dir="$backup_dir"
diff --git a/src/rocksdb/tools/rocksdb_dump_test.sh b/src/rocksdb/tools/rocksdb_dump_test.sh
new file mode 100755
index 000000000..532c53267
--- /dev/null
+++ b/src/rocksdb/tools/rocksdb_dump_test.sh
@@ -0,0 +1,9 @@
+# shellcheck disable=SC2148
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+TESTDIR=`mktemp -d ${TMPDIR:-/tmp}/rocksdb-dump-test.XXXXX`
+DUMPFILE="tools/sample-dump.dmp"
+
+# Verify that the sample dump file is undumpable and then redumpable.
+./rocksdb_undump --dump_location=$DUMPFILE --db_path=$TESTDIR/db
+./rocksdb_dump --anonymous --db_path=$TESTDIR/db --dump_location=$TESTDIR/dump
+cmp $DUMPFILE $TESTDIR/dump
diff --git a/src/rocksdb/tools/run_blob_bench.sh b/src/rocksdb/tools/run_blob_bench.sh
new file mode 100755
index 000000000..3755a9e56
--- /dev/null
+++ b/src/rocksdb/tools/run_blob_bench.sh
@@ -0,0 +1,223 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# BlobDB benchmark script
+#
+# REQUIRES: benchmark.sh is in the tools subdirectory
+#
+# After the execution of this script, log files are available in $output_dir.
+# report.tsv provides high level statistics.
+#
+# Should be run from the parent of the tools directory. The command line is:
+#   [$env_vars] tools/run_blob_bench.sh
+#
+# This runs the following sequence of BlobDB performance tests:
+#   phase 1) write-only - bulkload+compact, overwrite+waitforcompaction
+#   phase 2) read-write - readwhilewriting, fwdrangewhilewriting
+#   phase 3) read-only - readrandom, fwdrange
+#
+
+# Exit Codes
+EXIT_INVALID_ARGS=1
+
+# Size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+T=$((1024 * G))
+
+function display_usage() {
+  echo "usage: run_blob_bench.sh [--help]"
+  echo ""
+  echo "Runs the following sequence of BlobDB benchmark tests using tools/benchmark.sh:"
+  echo -e "\tPhase 1: write-only tests: bulkload+compact, overwrite+waitforcompaction"
+  echo -e "\tPhase 2: read-write tests: readwhilewriting, fwdrangewhilewriting"
+  echo -e "\tPhase 3: read-only tests: readrandom, fwdrange"
+  echo ""
+  echo "Environment Variables:"
+  echo -e "\tJOB_ID\t\t\t\tIdentifier for the benchmark job, will appear in the results (default: empty)"
+  echo -e "\tDB_DIR\t\t\t\tPath for the RocksDB data directory (mandatory)"
+  echo -e "\tWAL_DIR\t\t\t\tPath for the RocksDB WAL directory (mandatory)"
+  echo -e "\tOUTPUT_DIR\t\t\tPath for the benchmark results (mandatory)"
+  echo -e "\tNUM_THREADS\t\t\tNumber of threads (default: 16)"
+  echo -e "\tCOMPRESSION_TYPE\t\tCompression type for the SST files (default: lz4)"
+  echo -e "\tDB_SIZE\t\t\t\tRaw (uncompressed) database size (default: 1 TB)"
+  echo -e "\tVALUE_SIZE\t\t\tValue size (default: 1 KB)"
+  echo -e "\tNUM_KEYS\t\t\tNumber of keys (default: raw database size divided by value size)"
+  echo -e "\tDURATION\t\t\tIndividual duration for read-write/read-only tests in seconds (default: 1800)"
+  echo -e "\tWRITE_BUFFER_SIZE\t\tWrite buffer (memtable) size (default: 1 GB)"
+  echo -e "\tENABLE_BLOB_FILES\t\tEnable blob files (default: 1)"
+  echo -e "\tMIN_BLOB_SIZE\t\t\tSize threshold for storing values in blob files (default: 0)"
+  echo -e "\tBLOB_FILE_SIZE\t\t\tBlob file size (default: same as write buffer size)"
+  echo -e "\tBLOB_COMPRESSION_TYPE\t\tCompression type for the blob files (default: lz4)"
+  echo -e "\tENABLE_BLOB_GC\t\t\tEnable blob garbage collection (default: 1)"
+  echo -e "\tBLOB_GC_AGE_CUTOFF\t\tBlob garbage collection age cutoff (default: 0.25)"
+  echo -e "\tBLOB_GC_FORCE_THRESHOLD\t\tThreshold for forcing garbage collection of the oldest blob files (default: 1.0)"
+  echo -e "\tBLOB_COMPACTION_READAHEAD_SIZE\tBlob compaction readahead size (default: 0)"
+  echo -e "\tBLOB_FILE_STARTING_LEVEL\t\tBlob file starting level (default: 0)"
+  echo -e "\tUSE_BLOB_CACHE\t\t\tEnable blob cache. (default: 1)"
+  echo -e "\tUSE_SHARED_BLOCK_AND_BLOB_CACHE\t\t\tUse the same backing cache for block cache and blob cache. (default: 1)"
+  echo -e "\tBLOB_CACHE_SIZE\t\t\tSize of the blob cache (default: 16GB)"
+  echo -e "\tBLOB_CACHE_NUMSHARDBITS\t\t\tNumber of shards for the blob cache is 2 ** blob_cache_numshardbits (default: 6)"
+  echo -e "\tPREPOPULATE_BLOB_CACHE\t\t\tPre-populate hot/warm blobs in blob cache (default: 0)"
+  echo -e "\tTARGET_FILE_SIZE_BASE\t\tTarget SST file size for compactions (default: write buffer size, scaled down if blob files are enabled)"
+  echo -e "\tMAX_BYTES_FOR_LEVEL_BASE\tMaximum size for the base level (default: 8 * target SST file size)"
+}
+
+if [ $# -ge 1 ]; then
+  display_usage
+
+  if [ "$1" == "--help" ]; then
+    exit
+  else
+    exit $EXIT_INVALID_ARGS
+  fi
+fi
+
+# shellcheck disable=SC2153
+if [ -z "$DB_DIR" ]; then
+  echo "DB_DIR is not defined"
+  exit $EXIT_INVALID_ARGS
+fi
+
+# shellcheck disable=SC2153
+if [ -z "$WAL_DIR" ]; then
+  echo "WAL_DIR is not defined"
+  exit $EXIT_INVALID_ARGS
+fi
+
+# shellcheck disable=SC2153
+if [ -z "$OUTPUT_DIR" ]; then
+  echo "OUTPUT_DIR is not defined"
+  exit $EXIT_INVALID_ARGS
+fi
+
+# shellcheck disable=SC2153
+job_id=$JOB_ID
+
+db_dir=$DB_DIR
+wal_dir=$WAL_DIR
+output_dir=$OUTPUT_DIR
+
+num_threads=${NUM_THREADS:-16}
+
+compression_type=${COMPRESSION_TYPE:-lz4}
+
+db_size=${DB_SIZE:-$((1 * T))}
+value_size=${VALUE_SIZE:-$((1 * K))}
+num_keys=${NUM_KEYS:-$((db_size / value_size))}
+
+duration=${DURATION:-1800}
+
+write_buffer_size=${WRITE_BUFFER_SIZE:-$((1 * G))}
+
+enable_blob_files=${ENABLE_BLOB_FILES:-1}
+min_blob_size=${MIN_BLOB_SIZE:-0}
+blob_file_size=${BLOB_FILE_SIZE:-$write_buffer_size}
+blob_compression_type=${BLOB_COMPRESSION_TYPE:-lz4}
+enable_blob_garbage_collection=${ENABLE_BLOB_GC:-1}
+blob_garbage_collection_age_cutoff=${BLOB_GC_AGE_CUTOFF:-0.25}
+blob_garbage_collection_force_threshold=${BLOB_GC_FORCE_THRESHOLD:-1.0}
+blob_compaction_readahead_size=${BLOB_COMPACTION_READAHEAD_SIZE:-0}
+blob_file_starting_level=${BLOB_FILE_STARTING_LEVEL:-0}
+use_blob_cache=${USE_BLOB_CACHE:-1}
+use_shared_block_and_blob_cache=${USE_SHARED_BLOCK_AND_BLOB_CACHE:-1}
+blob_cache_size=${BLOB_CACHE_SIZE:-$((16 * G))}
+blob_cache_numshardbits=${BLOB_CACHE_NUMSHARDBITS:-6}
+prepopulate_blob_cache=${PREPOPULATE_BLOB_CACHE:-0}
+
+if [ "$enable_blob_files" == "1" ]; then
+  target_file_size_base=${TARGET_FILE_SIZE_BASE:-$((32 * write_buffer_size / value_size))}
+else
+  target_file_size_base=${TARGET_FILE_SIZE_BASE:-$write_buffer_size}
+fi
+
+max_bytes_for_level_base=${MAX_BYTES_FOR_LEVEL_BASE:-$((8 * target_file_size_base))}
+
+echo "======================== Benchmark setup ========================"
+echo -e "Job ID:\t\t\t\t\t$job_id"
+echo -e "Data directory:\t\t\t\t$db_dir"
+echo -e "WAL directory:\t\t\t\t$wal_dir"
+echo -e "Output directory:\t\t\t$output_dir"
+echo -e "Number of threads:\t\t\t$num_threads"
+echo -e "Compression type for SST files:\t\t$compression_type"
+echo -e "Raw database size:\t\t\t$db_size"
+echo -e "Value size:\t\t\t\t$value_size"
+echo -e "Number of keys:\t\t\t\t$num_keys"
+echo -e "Duration of read-write/read-only tests:\t$duration"
+echo -e "Write buffer size:\t\t\t$write_buffer_size"
+echo -e "Blob files enabled:\t\t\t$enable_blob_files"
+echo -e "Blob size threshold:\t\t\t$min_blob_size"
+echo -e "Blob file size:\t\t\t\t$blob_file_size"
+echo -e "Compression type for blob files:\t$blob_compression_type"
+echo -e "Blob GC enabled:\t\t\t$enable_blob_garbage_collection"
+echo -e "Blob GC age cutoff:\t\t\t$blob_garbage_collection_age_cutoff"
+echo -e "Blob GC force threshold:\t\t$blob_garbage_collection_force_threshold"
+echo -e "Blob compaction readahead size:\t\t$blob_compaction_readahead_size"
+echo -e "Blob file starting level:\t\t$blob_file_starting_level"
+echo -e "Blob cache enabled:\t\t\t$use_blob_cache"
+echo -e "Blob cache and block cache shared:\t\t\t$use_shared_block_and_blob_cache"
+echo -e "Blob cache size:\t\t$blob_cache_size"
+echo -e "Blob cache number of shard bits:\t\t$blob_cache_numshardbits"
+echo -e "Blob cache prepopulated:\t\t\t$prepopulate_blob_cache"
+echo -e "Target SST file size:\t\t\t$target_file_size_base"
+echo -e "Maximum size of base level:\t\t$max_bytes_for_level_base"
+echo "================================================================="
+
+rm -rf "$db_dir"
+rm -rf "$wal_dir"
+rm -rf "$output_dir"
+
+ENV_VARS="\
+  JOB_ID=$job_id \
+  DB_DIR=$db_dir \
+  WAL_DIR=$wal_dir \
+  OUTPUT_DIR=$output_dir \
+  NUM_THREADS=$num_threads \
+  COMPRESSION_TYPE=$compression_type \
+  VALUE_SIZE=$value_size \
+  NUM_KEYS=$num_keys"
+
+ENV_VARS_D="$ENV_VARS DURATION=$duration"
+
+PARAMS="\
+  --enable_blob_files=$enable_blob_files \
+  --min_blob_size=$min_blob_size \
+  --blob_file_size=$blob_file_size \
+  --blob_compression_type=$blob_compression_type \
+  --blob_file_starting_level=$blob_file_starting_level \
+  --use_blob_cache=$use_blob_cache \
+  --use_shared_block_and_blob_cache=$use_shared_block_and_blob_cache \
+  --blob_cache_size=$blob_cache_size \
+  --blob_cache_numshardbits=$blob_cache_numshardbits \
+  --prepopulate_blob_cache=$prepopulate_blob_cache \
+  --write_buffer_size=$write_buffer_size \
+  --target_file_size_base=$target_file_size_base \
+  --max_bytes_for_level_base=$max_bytes_for_level_base"
+
+PARAMS_GC="$PARAMS \
+  --enable_blob_garbage_collection=$enable_blob_garbage_collection \
+  --blob_garbage_collection_age_cutoff=$blob_garbage_collection_age_cutoff \
+  --blob_garbage_collection_force_threshold=$blob_garbage_collection_force_threshold \
+  --blob_compaction_readahead_size=$blob_compaction_readahead_size"
+
+# bulk load (using fillrandom) + compact
+env -u DURATION -S "$ENV_VARS" ./tools/benchmark.sh bulkload "$PARAMS"
+
+# overwrite + waitforcompaction
+env -u DURATION -S "$ENV_VARS" ./tools/benchmark.sh overwrite "$PARAMS_GC"
+
+# readwhilewriting
+env -S "$ENV_VARS_D" ./tools/benchmark.sh readwhilewriting "$PARAMS_GC"
+
+# fwdrangewhilewriting
+env -S "$ENV_VARS_D" ./tools/benchmark.sh fwdrangewhilewriting "$PARAMS_GC"
+
+# readrandom
+env -S "$ENV_VARS_D" ./tools/benchmark.sh readrandom "$PARAMS_GC"
+
+# fwdrange
+env -S "$ENV_VARS_D" ./tools/benchmark.sh fwdrange "$PARAMS_GC"
+
+# save logs to output directory
+cp "$db_dir"/LOG* "$output_dir/"
diff --git a/src/rocksdb/tools/run_flash_bench.sh b/src/rocksdb/tools/run_flash_bench.sh
new file mode 100755
index 000000000..26e253843
--- /dev/null
+++ b/src/rocksdb/tools/run_flash_bench.sh
@@ -0,0 +1,359 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# REQUIRE: benchmark.sh exists in the current directory
+# After execution of this script, log files are generated in $output_dir.
+# report.txt provides a high level statistics
+
+# This should be run from the parent of the tools directory. The command line is:
+#   [$env_vars] tools/run_flash_bench.sh [list-of-threads]
+#
+# This runs a sequence of tests in the following sequence:
+#   step 1) load - bulkload, compact, fillseq, overwrite
+#   step 2) read-only for each number of threads
+#   step 3) read-write for each number of threads
+#   step 4) merge for each number of threads
+#
+# The list of threads is optional and when not set is equivalent to "24". 
+# Were list-of-threads specified as "1 2 4" then the tests in steps 2, 3 and
+# 4 above would be repeated for 1, 2 and 4 threads. The tests in step 1 are
+# only run for 1 thread.
+
+# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
+# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
+# test and the tests are listed below.
+#
+# The environment variables are also optional. The variables are:
+#
+#   NKEYS         - number of key/value pairs to load
+#   BG_MBWRITEPERSEC - write rate limit in MB/second for tests in which
+#                   there is one thread doing writes and stats are
+#                   reported for read threads. "BG" stands for background.
+#                   If this is too large then the non-writer threads can get
+#                   starved. This is used for the "readwhile" tests.
+#   FG_MBWRITEPERSEC - write rate limit in MB/second for tests like overwrite
+#                   where stats are reported for the write threads.
+#   NSECONDS      - number of seconds for which to run each test in steps 2,
+#                   3 and 4. There are currently 15 tests in those steps and
+#                   they are repeated for each entry in list-of-threads so
+#                   this variable lets you control the total duration to
+#                   finish the benchmark.
+#   RANGE_LIMIT   - the number of rows to read per range query for tests that
+#                   do range queries.
+#   VAL_SIZE      - the length of the value in the key/value pairs loaded.
+#                   You can estimate the size of the test database from this,
+#                   NKEYS and the compression rate (--compression_ratio) set
+#                   in tools/benchmark.sh
+#   BLOCK_LENGTH  - value for db_bench --block_size
+#   CACHE_BYTES   - the size of the RocksDB block cache in bytes
+#   DATA_DIR      - directory in which to create database files
+#   LOG_DIR       - directory in which to create WAL files, may be the same
+#                   as DATA_DIR
+#   DO_SETUP      - when set to 0 then a backup of the database is copied from
+#                   $DATA_DIR.bak to $DATA_DIR and the load tests from step 1
+#                   The WAL directory is also copied from a backup if
+#                   DATA_DIR != LOG_DIR. This allows tests from steps 2, 3, 4
+#                   to be repeated faster.
+#   SAVE_SETUP    - saves a copy of the database at the end of step 1 to
+#                   $DATA_DIR.bak. When LOG_DIR != DATA_DIR then it is copied
+#                   to $LOG_DIR.bak.
+#   SKIP_LOW_PRI_TESTS - skip some of the tests which aren't crucial for getting
+#                   actionable benchmarking data (look for keywords "bulkload",
+#                   "sync=1", and "while merging").
+#
+
+# Size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+
+num_keys=${NKEYS:-$((1 * G))}
+# write rate for readwhile... tests
+bg_mbwps=${BG_MBWRITEPERSEC:-4}
+# write rate for tests other than readwhile, 0 means no limit
+fg_mbwps=${FG_MBWRITEPERSEC:-0}
+duration=${NSECONDS:-$((60 * 60))}
+nps=${RANGE_LIMIT:-10}
+vs=${VAL_SIZE:-400}
+cs=${CACHE_BYTES:-$(( 1 * G ))}
+bs=${BLOCK_LENGTH:-8192}
+
+# If no command line arguments then run for 24 threads.
+if [[ $# -eq 0 ]]; then
+  nthreads=( 24 )
+else
+  nthreads=( "$@" )
+fi
+
+for num_thr in "${nthreads[@]}" ; do
+  echo Will run for $num_thr threads
+done
+
+# Update these parameters before execution !!!
+db_dir=${DATA_DIR:-"/tmp/rocksdb/"}
+wal_dir=${LOG_DIR:-"/tmp/rocksdb/"}
+
+do_setup=${DO_SETUP:-1}
+save_setup=${SAVE_SETUP:-0}
+
+# By default we'll run all the tests. Set this to skip a set of tests which
+# aren't critical for getting key metrics.
+skip_low_pri_tests=${SKIP_LOW_PRI_TESTS:-0}
+
+if [[ $skip_low_pri_tests == 1 ]]; then
+  echo "Skipping some non-critical tests because SKIP_LOW_PRI_TESTS is set."
+fi
+
+output_dir="${TMPDIR:-/tmp}/output"
+
+ARGS="\
+OUTPUT_DIR=$output_dir \
+NUM_KEYS=$num_keys \
+DB_DIR=$db_dir \
+WAL_DIR=$wal_dir \
+VALUE_SIZE=$vs \
+BLOCK_SIZE=$bs \
+CACHE_SIZE=$cs"
+
+mkdir -p $output_dir
+echo -e "ops/sec\tmb/sec\tSize-GB\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tp99.9\tp99.99\tUptime\tStall-time\tStall%\tTest" \
+  > $output_dir/report.txt
+
+# Notes on test sequence:
+#   step 1) Setup database via sequential fill followed by overwrite to fragment it.
+#           Done without setting DURATION to make sure that overwrite does $num_keys writes
+#   step 2) read-only tests for all levels of concurrency requested
+#   step 3) non read-only tests for all levels of concurrency requested
+#   step 4) merge tests for all levels of concurrency requested. These must come last.
+
+###### Setup the database
+
+if [[ $do_setup != 0 ]]; then
+  echo Doing setup
+
+  if [[ $skip_low_pri_tests != 1 ]]; then
+    # Test 1: bulk load
+    env $ARGS ./tools/benchmark.sh bulkload
+  fi
+
+  # Test 2a: sequential fill with large values to get peak ingest
+  #          adjust NUM_KEYS given the use of larger values
+  env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
+       ./tools/benchmark.sh fillseq_disable_wal
+
+  # Test 2b: sequential fill with the configured value size
+  env $ARGS ./tools/benchmark.sh fillseq_disable_wal
+
+  # Test 2c: same as 2a, but with WAL being enabled.
+  env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
+       ./tools/benchmark.sh fillseq_enable_wal
+
+  # Test 2d: same as 2b, but with WAL being enabled.
+  env $ARGS ./tools/benchmark.sh fillseq_enable_wal
+
+  # Test 3: single-threaded overwrite
+  env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite
+
+else
+  echo Restoring from backup
+
+  rm -rf $db_dir
+
+  if [ ! -d ${db_dir}.bak ]; then
+    echo Database backup does not exist at ${db_dir}.bak
+    exit -1
+  fi
+
+  echo Restore database from ${db_dir}.bak
+  cp -p -r ${db_dir}.bak $db_dir
+
+  if [[ $db_dir != $wal_dir ]]; then
+    rm -rf $wal_dir
+
+    if [ ! -d ${wal_dir}.bak ]; then
+      echo WAL backup does not exist at ${wal_dir}.bak
+      exit -1
+    fi
+
+    echo Restore WAL from ${wal_dir}.bak
+    cp -p -r ${wal_dir}.bak $wal_dir
+  fi
+fi
+
+if [[ $save_setup != 0 ]]; then
+  echo Save database to ${db_dir}.bak
+  cp -p -r $db_dir ${db_dir}.bak
+
+  if [[ $db_dir != $wal_dir ]]; then
+    echo Save WAL to ${wal_dir}.bak
+    cp -p -r $wal_dir ${wal_dir}.bak
+  fi
+fi
+
+###### Read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 4: random read
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh readrandom
+
+  # Test 5: random range scans
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
+    ./tools/benchmark.sh fwdrange
+
+  # Test 6: random reverse range scans
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
+    ./tools/benchmark.sh revrange
+done
+
+###### Non read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 7: overwrite with sync=0
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
+    DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite
+
+  if [[ $skip_low_pri_tests != 1 ]]; then
+    # Test 8: overwrite with sync=1
+    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
+      ./tools/benchmark.sh overwrite
+  fi
+
+  # Test 9: random update with sync=0
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+      ./tools/benchmark.sh updaterandom
+
+  if [[ $skip_low_pri_tests != 1 ]]; then
+    # Test 10: random update with sync=1
+   env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh updaterandom
+  fi
+
+  # Test 11: random read while writing
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+    DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilewriting
+
+  # Test 12: range scan while writing
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+    DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilewriting
+
+  # Test 13: reverse range scan while writing
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+    DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilewriting
+done
+
+###### Merge tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 14: random merge with sync=0
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
+    DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh mergerandom
+
+  if [[ $skip_low_pri_tests != 1 ]]; then
+    # Test 15: random merge with sync=1
+    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \
+      ./tools/benchmark.sh mergerandom
+
+    # Test 16: random read while merging 
+    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+      DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilemerging
+
+    # Test 17: range scan while merging 
+    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+      DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilemerging
+
+    # Test 18: reverse range scan while merging 
+    env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \
+      DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilemerging
+  fi
+done
+
+###### Universal compaction tests.
+
+# Use a single thread to reduce the variability in the benchmark.
+env $ARGS COMPACTION_TEST=1 NUM_THREADS=1 ./tools/benchmark.sh universal_compaction
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+  echo bulkload > $output_dir/report2.txt
+  head -1 $output_dir/report.txt >> $output_dir/report2.txt
+  grep bulkload $output_dir/report.txt >> $output_dir/report2.txt
+fi
+
+echo fillseq_wal_disabled >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fillseq.wal_disabled $output_dir/report.txt >> $output_dir/report2.txt
+
+echo fillseq_wal_enabled >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fillseq.wal_enabled $output_dir/report.txt >> $output_dir/report2.txt
+
+echo overwrite sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s0  >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+  echo overwrite sync=1 >> $output_dir/report2.txt
+  head -1 $output_dir/report.txt >> $output_dir/report2.txt
+  grep overwrite $output_dir/report.txt | grep \.s1  >> $output_dir/report2.txt
+fi
+
+echo updaterandom sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep updaterandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+  echo updaterandom sync=1 >> $output_dir/report2.txt
+  head -1 $output_dir/report.txt >> $output_dir/report2.txt
+  grep updaterandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
+fi
+
+echo mergerandom sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep mergerandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+  echo mergerandom sync=1 >> $output_dir/report2.txt
+  head -1 $output_dir/report.txt >> $output_dir/report2.txt
+  grep mergerandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
+fi
+
+echo readrandom >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readrandom $output_dir/report.txt  >> $output_dir/report2.txt
+
+echo fwdrange >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fwdrange\.t $output_dir/report.txt >> $output_dir/report2.txt
+
+echo revrange >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep revrange\.t $output_dir/report.txt >> $output_dir/report2.txt
+
+echo readwhile >> $output_dir/report2.txt >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readwhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+  echo readwhile >> $output_dir/report2.txt
+  head -1 $output_dir/report.txt >> $output_dir/report2.txt
+  grep readwhilemerging $output_dir/report.txt >> $output_dir/report2.txt
+fi
+
+echo fwdreadwhilewriting >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fwdrangewhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+  echo fwdreadwhilemerging >> $output_dir/report2.txt
+  head -1 $output_dir/report.txt >> $output_dir/report2.txt
+  grep fwdrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt
+fi
+
+echo revreadwhilewriting >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep revrangewhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+
+if [[ $skip_low_pri_tests != 1 ]]; then
+  echo revreadwhilemerging >> $output_dir/report2.txt
+  head -1 $output_dir/report.txt >> $output_dir/report2.txt
+  grep revrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt
+fi
+
+cat $output_dir/report2.txt
diff --git a/src/rocksdb/tools/run_leveldb.sh b/src/rocksdb/tools/run_leveldb.sh
new file mode 100755
index 000000000..2fce8b12d
--- /dev/null
+++ b/src/rocksdb/tools/run_leveldb.sh
@@ -0,0 +1,175 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# REQUIRE: benchmark_leveldb.sh exists in the current directory
+# After execution of this script, log files are generated in $output_dir.
+# report.txt provides a high level statistics
+#
+# This should be used with the LevelDB fork listed here to use additional test options.
+# For more details on the changes see the blog post listed below.
+#   https://github.com/mdcallag/leveldb-1
+#   http://smalldatum.blogspot.com/2015/04/comparing-leveldb-and-rocksdb-take-2.html
+#
+# This should be run from the parent of the tools directory. The command line is:
+#   [$env_vars] tools/run_flash_bench.sh [list-of-threads]
+#
+# This runs a sequence of tests in the following sequence:
+#   step 1) load - bulkload, compact, fillseq, overwrite
+#   step 2) read-only for each number of threads
+#   step 3) read-write for each number of threads
+#
+# The list of threads is optional and when not set is equivalent to "24". 
+# Were list-of-threads specified as "1 2 4" then the tests in steps 2, 3 and
+# 4 above would be repeated for 1, 2 and 4 threads. The tests in step 1 are
+# only run for 1 thread.
+
+# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
+# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
+# test and the tests are listed below.
+#
+# The environment variables are also optional. The variables are:
+#   NKEYS         - number of key/value pairs to load
+#   NWRITESPERSEC - the writes/second rate limit for the *whilewriting* tests.
+#                   If this is too large then the non-writer threads can get
+#                   starved.
+#   VAL_SIZE      - the length of the value in the key/value pairs loaded.
+#                   You can estimate the size of the test database from this,
+#                   NKEYS and the compression rate (--compression_ratio) set
+#                   in tools/benchmark_leveldb.sh
+#   BLOCK_LENGTH  - value for db_bench --block_size
+#   CACHE_BYTES   - the size of the RocksDB block cache in bytes
+#   DATA_DIR      - directory in which to create database files
+#   DO_SETUP      - when set to 0 then a backup of the database is copied from
+#                   $DATA_DIR.bak to $DATA_DIR and the load tests from step 1
+#                   This allows tests from steps 2, 3 to be repeated faster.
+#   SAVE_SETUP    - saves a copy of the database at the end of step 1 to
+#                   $DATA_DIR.bak.
+
+# Size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+
+num_keys=${NKEYS:-$((1 * G))}
+wps=${NWRITESPERSEC:-$((10 * K))}
+vs=${VAL_SIZE:-400}
+cs=${CACHE_BYTES:-$(( 1 * G ))}
+bs=${BLOCK_LENGTH:-4096}
+
+# If no command line arguments then run for 24 threads.
+if [[ $# -eq 0 ]]; then
+  nthreads=( 24 )
+else
+  nthreads=( "$@" )
+fi
+
+for num_thr in "${nthreads[@]}" ; do
+  echo Will run for $num_thr threads
+done
+
+# Update these parameters before execution !!!
+db_dir=${DATA_DIR:-"/tmp/rocksdb/"}
+
+do_setup=${DO_SETUP:-1}
+save_setup=${SAVE_SETUP:-0}
+
+output_dir="${TMPDIR:-/tmp}/output"
+
+ARGS="\
+OUTPUT_DIR=$output_dir \
+NUM_KEYS=$num_keys \
+DB_DIR=$db_dir \
+VALUE_SIZE=$vs \
+BLOCK_SIZE=$bs \
+CACHE_SIZE=$cs"
+
+mkdir -p $output_dir
+echo -e "ops/sec\tmb/sec\tusec/op\tavg\tp50\tTest" \
+  > $output_dir/report.txt
+
+# Notes on test sequence:
+#   step 1) Setup database via sequential fill followed by overwrite to fragment it.
+#           Done without setting DURATION to make sure that overwrite does $num_keys writes
+#   step 2) read-only tests for all levels of concurrency requested
+#   step 3) non read-only tests for all levels of concurrency requested
+
+###### Setup the database
+
+if [[ $do_setup != 0 ]]; then
+  echo Doing setup
+
+  # Test 2a: sequential fill with large values to get peak ingest
+  #          adjust NUM_KEYS given the use of larger values
+  env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
+       ./tools/benchmark_leveldb.sh fillseq
+
+  # Test 2b: sequential fill with the configured value size
+  env $ARGS ./tools/benchmark_leveldb.sh fillseq
+
+  # Test 3: single-threaded overwrite
+  env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark_leveldb.sh overwrite
+
+else
+  echo Restoring from backup
+
+  rm -rf $db_dir
+
+  if [ ! -d ${db_dir}.bak ]; then
+    echo Database backup does not exist at ${db_dir}.bak
+    exit -1
+  fi
+
+  echo Restore database from ${db_dir}.bak
+  cp -p -r ${db_dir}.bak $db_dir
+fi
+
+if [[ $save_setup != 0 ]]; then
+  echo Save database to ${db_dir}.bak
+  cp -p -r $db_dir ${db_dir}.bak
+fi
+
+###### Read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 4: random read
+  env $ARGS NUM_THREADS=$num_thr ./tools/benchmark_leveldb.sh readrandom
+
+done
+
+###### Non read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 7: overwrite with sync=0
+  env $ARGS NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+    ./tools/benchmark_leveldb.sh overwrite
+
+  # Test 8: overwrite with sync=1
+  # Not run for now because LevelDB db_bench doesn't have an option to limit the
+  # test run to X seconds and doing sync-per-commit for --num can take too long.
+  # env $ARGS NUM_THREADS=$num_thr ./tools/benchmark_leveldb.sh overwrite
+
+  # Test 11: random read while writing
+  env $ARGS NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    ./tools/benchmark_leveldb.sh readwhilewriting
+
+done
+
+echo bulkload > $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep bulkload $output_dir/report.txt >> $output_dir/report2.txt
+echo fillseq >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fillseq $output_dir/report.txt >> $output_dir/report2.txt
+echo overwrite sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s0  >> $output_dir/report2.txt
+echo overwrite sync=1 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s1  >> $output_dir/report2.txt
+echo readrandom >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readrandom $output_dir/report.txt  >> $output_dir/report2.txt
+echo readwhile >> $output_dir/report2.txt >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readwhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+
+cat $output_dir/report2.txt
diff --git a/src/rocksdb/tools/sample-dump.dmp b/src/rocksdb/tools/sample-dump.dmp
new file mode 100644
index 000000000..4ec3a7732
--- /dev/null
+++ b/src/rocksdb/tools/sample-dump.dmp
diff --git a/src/rocksdb/tools/simulated_hybrid_file_system.cc b/src/rocksdb/tools/simulated_hybrid_file_system.cc
new file mode 100644
index 000000000..a474417c7
--- /dev/null
+++ b/src/rocksdb/tools/simulated_hybrid_file_system.cc
@@ -0,0 +1,245 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/stop_watch.h"
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <sstream>
+#include <string>
+
+#include "rocksdb/rate_limiter.h"
+#include "tools/simulated_hybrid_file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const int64_t kUsPerSec = 1000000;
+const int64_t kDummyBytesPerUs = 1024;
+
+namespace {
+// From bytes to read/write, calculate service time needed by an HDD.
+// This is used to simulate latency from HDD.
+int CalculateServeTimeUs(size_t bytes) {
+  return 12200 + static_cast<int>(static_cast<double>(bytes) * 0.005215);
+}
+
+// There is a bug in rater limiter that would crash with small requests
+// Hack to get it around.
+void RateLimiterRequest(RateLimiter* rater_limiter, int64_t amount) {
+  int64_t left = amount * kDummyBytesPerUs;
+  const int64_t kMaxToRequest = kDummyBytesPerUs * kUsPerSec / 1024;
+  while (left > 0) {
+    int64_t to_request = std::min(kMaxToRequest, left);
+    rater_limiter->Request(to_request, Env::IOPriority::IO_LOW, nullptr);
+    left -= to_request;
+  }
+}
+}  // namespace
+
+// The metadata file format: each line is a full filename of a file which is
+// warm
+SimulatedHybridFileSystem::SimulatedHybridFileSystem(
+    const std::shared_ptr<FileSystem>& base,
+    const std::string& metadata_file_name, int throughput_multiplier,
+    bool is_full_fs_warm)
+    : FileSystemWrapper(base),
+      // Limit to 100 requests per second.
+      rate_limiter_(NewGenericRateLimiter(
+          int64_t{throughput_multiplier} * kDummyBytesPerUs *
+              kUsPerSec /* rate_bytes_per_sec */,
+          1000 /* refill_period_us */)),
+      metadata_file_name_(metadata_file_name),
+      name_("SimulatedHybridFileSystem: " + std::string(target()->Name())),
+      is_full_fs_warm_(is_full_fs_warm) {
+  IOStatus s = base->FileExists(metadata_file_name, IOOptions(), nullptr);
+  if (s.IsNotFound()) {
+    return;
+  }
+  std::string metadata;
+  s = ReadFileToString(base.get(), metadata_file_name, &metadata);
+  if (!s.ok()) {
+    fprintf(stderr, "Error reading from file %s: %s",
+            metadata_file_name.c_str(), s.ToString().c_str());
+    // Exit rather than assert as this file system is built to run with
+    // benchmarks, which usually run on release mode.
+    std::exit(1);
+  }
+  std::istringstream input;
+  input.str(metadata);
+  std::string line;
+  while (std::getline(input, line)) {
+    fprintf(stderr, "Warm file %s\n", line.c_str());
+    warm_file_set_.insert(line);
+  }
+}
+
+// Need to write out the metadata file to file. See comment of
+// SimulatedHybridFileSystem::SimulatedHybridFileSystem() for format of the
+// file.
+SimulatedHybridFileSystem::~SimulatedHybridFileSystem() {
+  if (metadata_file_name_.empty()) {
+    return;
+  }
+  std::string metadata;
+  for (const auto& f : warm_file_set_) {
+    metadata += f;
+    metadata += "\n";
+  }
+  IOStatus s = WriteStringToFile(target(), metadata, metadata_file_name_, true);
+  if (!s.ok()) {
+    fprintf(stderr, "Error writing to file %s: %s", metadata_file_name_.c_str(),
+            s.ToString().c_str());
+  }
+}
+
+IOStatus SimulatedHybridFileSystem::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+  Temperature temperature = Temperature::kUnknown;
+  if (is_full_fs_warm_) {
+    temperature = Temperature::kWarm;
+  } else {
+    const std::lock_guard<std::mutex> lock(mutex_);
+    if (warm_file_set_.find(fname) != warm_file_set_.end()) {
+      temperature = Temperature::kWarm;
+    }
+    assert(temperature == file_opts.temperature);
+  }
+  IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
+  result->reset(
+      new SimulatedHybridRaf(std::move(*result), rate_limiter_, temperature));
+  return s;
+}
+
+IOStatus SimulatedHybridFileSystem::NewWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+  if (file_opts.temperature == Temperature::kWarm) {
+    const std::lock_guard<std::mutex> lock(mutex_);
+    warm_file_set_.insert(fname);
+  }
+
+  IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
+  if (file_opts.temperature == Temperature::kWarm || is_full_fs_warm_) {
+    result->reset(new SimulatedWritableFile(std::move(*result), rate_limiter_));
+  }
+  return s;
+}
+
+IOStatus SimulatedHybridFileSystem::DeleteFile(const std::string& fname,
+                                               const IOOptions& options,
+                                               IODebugContext* dbg) {
+  {
+    const std::lock_guard<std::mutex> lock(mutex_);
+    warm_file_set_.erase(fname);
+  }
+  return target()->DeleteFile(fname, options, dbg);
+}
+
+IOStatus SimulatedHybridRaf::Read(uint64_t offset, size_t n,
+                                  const IOOptions& options, Slice* result,
+                                  char* scratch, IODebugContext* dbg) const {
+  if (temperature_ == Temperature::kWarm) {
+    SimulateIOWait(n);
+  }
+  return target()->Read(offset, n, options, result, scratch, dbg);
+}
+
+IOStatus SimulatedHybridRaf::MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                                       const IOOptions& options,
+                                       IODebugContext* dbg) {
+  if (temperature_ == Temperature::kWarm) {
+    for (size_t i = 0; i < num_reqs; i++) {
+      SimulateIOWait(reqs[i].len);
+    }
+  }
+  return target()->MultiRead(reqs, num_reqs, options, dbg);
+}
+
+IOStatus SimulatedHybridRaf::Prefetch(uint64_t offset, size_t n,
+                                      const IOOptions& options,
+                                      IODebugContext* dbg) {
+  if (temperature_ == Temperature::kWarm) {
+    SimulateIOWait(n);
+  }
+  return target()->Prefetch(offset, n, options, dbg);
+}
+
+void SimulatedHybridRaf::SimulateIOWait(int64_t bytes) const {
+  int serve_time = CalculateServeTimeUs(bytes);
+  {
+    StopWatchNano stop_watch(Env::Default()->GetSystemClock().get(),
+                             /*auto_start=*/true);
+    RateLimiterRequest(rate_limiter_.get(), serve_time);
+    int time_passed_us = static_cast<int>(stop_watch.ElapsedNanos() / 1000);
+    if (time_passed_us < serve_time) {
+      Env::Default()->SleepForMicroseconds(serve_time - time_passed_us);
+    }
+  }
+}
+
+void SimulatedWritableFile::SimulateIOWait(int64_t bytes) const {
+  int serve_time = CalculateServeTimeUs(bytes);
+  Env::Default()->SleepForMicroseconds(serve_time);
+  RateLimiterRequest(rate_limiter_.get(), serve_time);
+}
+
+IOStatus SimulatedWritableFile::Append(const Slice& data, const IOOptions& ioo,
+                                       IODebugContext* idc) {
+  if (use_direct_io()) {
+    SimulateIOWait(data.size());
+  } else {
+    unsynced_bytes += data.size();
+  }
+  return target()->Append(data, ioo, idc);
+}
+
+IOStatus SimulatedWritableFile::Append(
+    const Slice& data, const IOOptions& options,
+    const DataVerificationInfo& verification_info, IODebugContext* dbg) {
+  if (use_direct_io()) {
+    SimulateIOWait(data.size());
+  } else {
+    unsynced_bytes += data.size();
+  }
+  return target()->Append(data, options, verification_info, dbg);
+}
+
+IOStatus SimulatedWritableFile::PositionedAppend(const Slice& data,
+                                                 uint64_t offset,
+                                                 const IOOptions& options,
+                                                 IODebugContext* dbg) {
+  if (use_direct_io()) {
+    SimulateIOWait(data.size());
+  } else {
+    // This might be overcalculated, but it's probably OK.
+    unsynced_bytes += data.size();
+  }
+  return target()->PositionedAppend(data, offset, options, dbg);
+}
+IOStatus SimulatedWritableFile::PositionedAppend(
+    const Slice& data, uint64_t offset, const IOOptions& options,
+    const DataVerificationInfo& verification_info, IODebugContext* dbg) {
+  if (use_direct_io()) {
+    SimulateIOWait(data.size());
+  } else {
+    // This might be overcalculated, but it's probably OK.
+    unsynced_bytes += data.size();
+  }
+  return target()->PositionedAppend(data, offset, options, verification_info,
+                                    dbg);
+}
+
+IOStatus SimulatedWritableFile::Sync(const IOOptions& options,
+                                     IODebugContext* dbg) {
+  if (unsynced_bytes > 0) {
+    SimulateIOWait(unsynced_bytes);
+    unsynced_bytes = 0;
+  }
+  return target()->Sync(options, dbg);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/simulated_hybrid_file_system.h b/src/rocksdb/tools/simulated_hybrid_file_system.h
new file mode 100644
index 000000000..251d89df7
--- /dev/null
+++ b/src/rocksdb/tools/simulated_hybrid_file_system.h
@@ -0,0 +1,126 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <utility>
+
+#include "rocksdb/file_system.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A FileSystem simulates hybrid file system by ingesting latency and limit
+// IOPs.
+// This class is only used for development purpose and should not be used
+// in production.
+// Right now we ingest 15ms latency and allow 100 requests per second when
+// the file is for warm temperature.
+// When the object is destroyed, the list of warm files are written to a
+// file, which can be used to reopen a FileSystem and still recover the
+// list. This is to allow the information to preserve between db_bench
+// runs.
+class SimulatedHybridFileSystem : public FileSystemWrapper {
+ public:
+  // metadata_file_name stores metadata of the files, so that it can be
+  // loaded after process restarts. If the file doesn't exist, create
+  // one. The file is written when the class is destroyed.
+  // throughput_multiplier: multiplier of throughput. For example, 1 is to
+  //      simulate single disk spindle. 4 is to simualte 4 disk spindles.
+  // is_full_fs_warm: if true, all files are all included in slow I/O
+  // simulation.
+  SimulatedHybridFileSystem(const std::shared_ptr<FileSystem>& base,
+                            const std::string& metadata_file_name,
+                            int throughput_multiplier, bool is_full_fs_warm);
+
+  ~SimulatedHybridFileSystem() override;
+
+ public:
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override;
+  IOStatus NewWritableFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override;
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  const char* Name() const override { return name_.c_str(); }
+
+ private:
+  // Limit 100 requests per second. Rate limiter is designed to byte but
+  // we use it as fixed bytes is one request.
+  std::shared_ptr<RateLimiter> rate_limiter_;
+  std::mutex mutex_;
+  std::unordered_set<std::string> warm_file_set_;
+  std::string metadata_file_name_;
+  std::string name_;
+  bool is_full_fs_warm_;
+};
+
+// Simulated random access file that can control IOPs and latency to simulate
+// specific storage media
+class SimulatedHybridRaf : public FSRandomAccessFileOwnerWrapper {
+ public:
+  SimulatedHybridRaf(std::unique_ptr<FSRandomAccessFile>&& t,
+                     std::shared_ptr<RateLimiter> rate_limiter,
+                     Temperature temperature)
+      : FSRandomAccessFileOwnerWrapper(std::move(t)),
+        rate_limiter_(rate_limiter),
+        temperature_(temperature) {}
+
+  ~SimulatedHybridRaf() override {}
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+                    IODebugContext* dbg) override;
+
+ private:
+  std::shared_ptr<RateLimiter> rate_limiter_;
+  Temperature temperature_;
+
+  void SimulateIOWait(int64_t num_requests) const;
+};
+
+class SimulatedWritableFile : public FSWritableFileWrapper {
+ public:
+  SimulatedWritableFile(std::unique_ptr<FSWritableFile>&& t,
+                        std::shared_ptr<RateLimiter> rate_limiter)
+      : FSWritableFileWrapper(t.get()),
+        file_guard_(std::move(t)),
+        rate_limiter_(rate_limiter) {}
+  IOStatus Append(const Slice& data, const IOOptions&,
+                  IODebugContext*) override;
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& verification_info,
+                  IODebugContext* dbg) override;
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override;
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& verification_info,
+                            IODebugContext* dbg) override;
+
+ private:
+  std::unique_ptr<FSWritableFile> file_guard_;
+  std::shared_ptr<RateLimiter> rate_limiter_;
+  size_t unsynced_bytes = 0;
+
+  void SimulateIOWait(int64_t num_requests) const;
+};
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/sst_dump.cc b/src/rocksdb/tools/sst_dump.cc
new file mode 100644
index 000000000..becf67316
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump.cc
@@ -0,0 +1,20 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/sst_dump_tool.h"
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  return tool.Run(argc, argv);
+}
+#else
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/sst_dump_test.cc b/src/rocksdb/tools/sst_dump_test.cc
new file mode 100644
index 000000000..aa1ff810f
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump_test.cc
@@ -0,0 +1,481 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include <stdint.h>
+
+#include "file/random_access_file_reader.h"
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/sst_dump_tool.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint32_t kOptLength = 1024;
+
+namespace {
+static std::string MakeKey(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "k_%04d", i);
+  InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+  return key.Encode().ToString();
+}
+
+static std::string MakeKeyWithTimeStamp(int i, uint64_t ts) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "k_%04d", i);
+  return test::KeyStr(ts, std::string(buf), /*seq=*/0, kTypeValue);
+}
+
+static std::string MakeValue(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "v_%04d", i);
+  InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
+  return key.Encode().ToString();
+}
+
+void cleanup(const Options& opts, const std::string& file_name) {
+  Env* env = opts.env;
+  ASSERT_OK(env->DeleteFile(file_name));
+  std::string outfile_name = file_name.substr(0, file_name.length() - 4);
+  outfile_name.append("_dump.txt");
+  env->DeleteFile(outfile_name).PermitUncheckedError();
+}
+}  // namespace
+
+// Test for sst dump tool "raw" mode
+class SSTDumpToolTest : public testing::Test {
+  std::string test_dir_;
+  Env* env_;
+  std::shared_ptr<Env> env_guard_;
+
+ public:
+  SSTDumpToolTest() : env_(Env::Default()) {
+    EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_));
+    test_dir_ = test::PerThreadDBPath(env_, "sst_dump_test_db");
+    Status s = env_->CreateDirIfMissing(test_dir_);
+    EXPECT_OK(s);
+  }
+
+  ~SSTDumpToolTest() override {
+    if (getenv("KEEP_DB")) {
+      fprintf(stdout, "Data is still at %s\n", test_dir_.c_str());
+    } else {
+      EXPECT_OK(env_->DeleteDir(test_dir_));
+    }
+  }
+
+  Env* env() { return env_; }
+
+  std::string MakeFilePath(const std::string& file_name) const {
+    std::string path(test_dir_);
+    path.append("/").append(file_name);
+    return path;
+  }
+
+  template <std::size_t N>
+  void PopulateCommandArgs(const std::string& file_path, const char* command,
+                           char* (&usage)[N]) const {
+    for (int i = 0; i < static_cast<int>(N); ++i) {
+      usage[i] = new char[kOptLength];
+    }
+    snprintf(usage[0], kOptLength, "./sst_dump");
+    snprintf(usage[1], kOptLength, "%s", command);
+    snprintf(usage[2], kOptLength, "--file=%s", file_path.c_str());
+  }
+
+  void createSST(const Options& opts, const std::string& file_name) {
+    Env* test_env = opts.env;
+    FileOptions file_options(opts);
+    ReadOptions read_options;
+    const ImmutableOptions imoptions(opts);
+    const MutableCFOptions moptions(opts);
+    ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
+    std::unique_ptr<TableBuilder> tb;
+
+    IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+    std::unique_ptr<WritableFileWriter> file_writer;
+    ASSERT_OK(WritableFileWriter::Create(test_env->GetFileSystem(), file_name,
+                                         file_options, &file_writer, nullptr));
+
+    std::string column_family_name;
+    int unknown_level = -1;
+    tb.reset(opts.table_factory->NewTableBuilder(
+        TableBuilderOptions(
+            imoptions, moptions, ikc, &int_tbl_prop_collector_factories,
+            CompressionType::kNoCompression, CompressionOptions(),
+            TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+            column_family_name, unknown_level),
+        file_writer.get()));
+
+    // Populate slightly more than 1K keys
+    uint32_t num_keys = kNumKey;
+    const char* comparator_name = ikc.user_comparator()->Name();
+    if (strcmp(comparator_name, ReverseBytewiseComparator()->Name()) == 0) {
+      for (int32_t i = num_keys; i >= 0; i--) {
+        tb->Add(MakeKey(i), MakeValue(i));
+      }
+    } else if (strcmp(comparator_name,
+                      test::BytewiseComparatorWithU64TsWrapper()->Name()) ==
+               0) {
+      for (uint32_t i = 0; i < num_keys; i++) {
+        tb->Add(MakeKeyWithTimeStamp(i, 100 + i), MakeValue(i));
+      }
+    } else {
+      for (uint32_t i = 0; i < num_keys; i++) {
+        tb->Add(MakeKey(i), MakeValue(i));
+      }
+    }
+    ASSERT_OK(tb->Finish());
+    ASSERT_OK(file_writer->Close());
+  }
+
+ protected:
+  constexpr static int kNumKey = 1024;
+};
+
+constexpr int SSTDumpToolTest::kNumKey;
+
+TEST_F(SSTDumpToolTest, HelpAndVersion) {
+  Options opts;
+  opts.env = env();
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+
+  static const char* help[] = {"./sst_dump", "--help"};
+  ASSERT_TRUE(!tool.Run(2, help, opts));
+  static const char* version[] = {"./sst_dump", "--version"};
+  ASSERT_TRUE(!tool.Run(2, version, opts));
+  static const char* bad[] = {"./sst_dump", "--not_an_option"};
+  ASSERT_TRUE(tool.Run(2, bad, opts));
+}
+
+TEST_F(SSTDumpToolTest, EmptyFilter) {
+  Options opts;
+  opts.env = env();
+  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=raw", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, SstDumpReverseBytewiseComparator) {
+  Options opts;
+  opts.env = env();
+  opts.comparator = ReverseBytewiseComparator();
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  std::string file_path =
+      MakeFilePath("rocksdb_sst_reverse_bytewise_comparator.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=raw", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, SstDumpComparatorWithU64Ts) {
+  Options opts;
+  opts.env = env();
+  opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  std::string file_path =
+      MakeFilePath("rocksdb_sst_comparator_with_u64_ts.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=raw", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, FilterBlock) {
+  Options opts;
+  opts.env = env();
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, true));
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=raw", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, FullFilterBlock) {
+  Options opts;
+  opts.env = env();
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=raw", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, GetProperties) {
+  Options opts;
+  opts.env = env();
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--show_properties", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, CompressedSizes) {
+  Options opts;
+  opts.env = env();
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=recompress", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, MemEnv) {
+  std::unique_ptr<Env> mem_env(NewMemEnv(env()));
+  Options opts;
+  opts.env = mem_env.get();
+  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=verify_checksum", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, ReadaheadSize) {
+  Options opts;
+  opts.env = env();
+  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[4];
+  PopulateCommandArgs(file_path, "--command=verify", usage);
+  snprintf(usage[3], kOptLength, "--readahead_size=4000000");
+
+  int num_reads = 0;
+  SyncPoint::GetInstance()->SetCallBack("RandomAccessFileReader::Read",
+                                        [&](void*) { num_reads++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(4, usage, opts));
+
+  // The file is approximately 10MB. Readahead is 4MB.
+  // We usually need 3 reads + one metadata read.
+  // One extra read is needed before opening the file for metadata.
+  ASSERT_EQ(5, num_reads);
+
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 4; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, NoSstFile) {
+  Options opts;
+  opts.env = env();
+  std::string file_path = MakeFilePath("no_such_file.sst");
+  char* usage[3];
+  PopulateCommandArgs(file_path, "", usage);
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  for (const auto& command :
+       {"--command=check", "--command=dump", "--command=raw",
+        "--command=verify", "--command=recompress", "--command=verify_checksum",
+        "--show_properties"}) {
+    snprintf(usage[1], kOptLength, "%s", command);
+    ASSERT_TRUE(tool.Run(3, usage, opts));
+  }
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, ValidSSTPath) {
+  Options opts;
+  opts.env = env();
+  char* usage[3];
+  PopulateCommandArgs("", "", usage);
+  SSTDumpTool tool;
+  std::string file_not_exists = MakeFilePath("file_not_exists.sst");
+  std::string sst_file = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, sst_file);
+  std::string text_file = MakeFilePath("text_file");
+  ASSERT_OK(WriteStringToFile(opts.env, "Hello World!", text_file));
+  std::string fake_sst = MakeFilePath("fake_sst.sst");
+  ASSERT_OK(WriteStringToFile(opts.env, "Not an SST file!", fake_sst));
+
+  for (const auto& command_arg : {"--command=verify", "--command=identify"}) {
+    snprintf(usage[1], kOptLength, "%s", command_arg);
+
+    snprintf(usage[2], kOptLength, "--file=%s", file_not_exists.c_str());
+    ASSERT_TRUE(tool.Run(3, usage, opts));
+
+    snprintf(usage[2], kOptLength, "--file=%s", sst_file.c_str());
+    ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+    snprintf(usage[2], kOptLength, "--file=%s", text_file.c_str());
+    ASSERT_TRUE(tool.Run(3, usage, opts));
+
+    snprintf(usage[2], kOptLength, "--file=%s", fake_sst.c_str());
+    ASSERT_TRUE(tool.Run(3, usage, opts));
+  }
+  ASSERT_OK(opts.env->DeleteFile(sst_file));
+  ASSERT_OK(opts.env->DeleteFile(text_file));
+  ASSERT_OK(opts.env->DeleteFile(fake_sst));
+
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+TEST_F(SSTDumpToolTest, RawOutput) {
+  Options opts;
+  opts.env = env();
+  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path);
+
+  char* usage[3];
+  PopulateCommandArgs(file_path, "--command=raw", usage);
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+  const std::string raw_path = MakeFilePath("rocksdb_sst_test_dump.txt");
+  std::ifstream raw_file(raw_path);
+
+  std::string tp;
+  bool is_data_block = false;
+  int key_count = 0;
+  while (getline(raw_file, tp)) {
+    if (tp.find("Data Block #") != std::string::npos) {
+      is_data_block = true;
+    }
+
+    if (is_data_block && tp.find("HEX") != std::string::npos) {
+      key_count++;
+    }
+  }
+
+  ASSERT_EQ(kNumKey, key_count);
+
+  raw_file.close();
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as SSTDumpTool is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE  return RUN_ALL_TESTS();
diff --git a/src/rocksdb/tools/sst_dump_tool.cc b/src/rocksdb/tools/sst_dump_tool.cc
new file mode 100644
index 000000000..0a2c28280
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump_tool.cc
@@ -0,0 +1,584 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/sst_dump_tool.h"
+
+#include <cinttypes>
+#include <iostream>
+
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "table/sst_file_dumper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const std::vector<std::pair<CompressionType, const char*>>
+    kCompressions = {
+        {CompressionType::kNoCompression, "kNoCompression"},
+        {CompressionType::kSnappyCompression, "kSnappyCompression"},
+        {CompressionType::kZlibCompression, "kZlibCompression"},
+        {CompressionType::kBZip2Compression, "kBZip2Compression"},
+        {CompressionType::kLZ4Compression, "kLZ4Compression"},
+        {CompressionType::kLZ4HCCompression, "kLZ4HCCompression"},
+        {CompressionType::kXpressCompression, "kXpressCompression"},
+        {CompressionType::kZSTD, "kZSTD"}};
+
+namespace {
+
+void print_help(bool to_stderr) {
+  std::string supported_compressions;
+  for (CompressionType ct : GetSupportedCompressions()) {
+    if (!supported_compressions.empty()) {
+      supported_compressions += ", ";
+    }
+    std::string str;
+    Status s = GetStringFromCompressionType(&str, ct);
+    assert(s.ok());
+    supported_compressions += str;
+  }
+  fprintf(
+      to_stderr ? stderr : stdout,
+      R"(sst_dump --file=<data_dir_OR_sst_file> [--command=check|scan|raw|recompress|identify]
+    --file=<data_dir_OR_sst_file>
+      Path to SST file or directory containing SST files
+
+    --env_uri=<uri of underlying Env>
+      URI of underlying Env, mutually exclusive with fs_uri
+
+    --fs_uri=<uri of underlying FileSystem>
+      URI of underlying FileSystem, mutually exclusive with env_uri
+
+    --command=check|scan|raw|verify|identify
+        check: Iterate over entries in files but don't print anything except if an error is encountered (default command)
+        scan: Iterate over entries in files and print them to screen
+        raw: Dump all the table contents to <file_name>_dump.txt
+        verify: Iterate all the blocks in files verifying checksum to detect possible corruption but don't print anything except if a corruption is encountered
+        recompress: reports the SST file size if recompressed with different
+                    compression types
+        identify: Reports a file is a valid SST file or lists all valid SST files under a directory
+
+    --output_hex
+      Can be combined with scan command to print the keys and values in Hex
+
+    --decode_blob_index
+      Decode blob indexes and print them in a human-readable format during scans.
+
+    --from=<user_key>
+      Key to start reading from when executing check|scan
+
+    --to=<user_key>
+      Key to stop reading at when executing check|scan
+
+    --prefix=<user_key>
+      Returns all keys with this prefix when executing check|scan
+      Cannot be used in conjunction with --from
+
+    --read_num=<num>
+      Maximum number of entries to read when executing check|scan
+
+    --verify_checksum
+      Verify file checksum when executing check|scan
+
+    --input_key_hex
+      Can be combined with --from and --to to indicate that these values are encoded in Hex
+
+    --show_properties
+      Print table properties after iterating over the file when executing
+      check|scan|raw|identify
+
+    --set_block_size=<block_size>
+      Can be combined with --command=recompress to set the block size that will
+      be used when trying different compression algorithms
+
+    --compression_types=<comma-separated list of CompressionType members, e.g.,
+      kSnappyCompression>
+      Can be combined with --command=recompress to run recompression for this
+      list of compression types
+      Supported compression types: %s
+
+    --parse_internal_key=<0xKEY>
+      Convenience option to parse an internal key on the command line. Dumps the
+      internal key in hex format {'key' @ SN: type}
+
+    --compression_level_from=<compression_level>
+      Compression level to start compressing when executing recompress. One compression type
+      and compression_level_to must also be specified
+
+    --compression_level_to=<compression_level>
+      Compression level to stop compressing when executing recompress. One compression type
+      and compression_level_from must also be specified
+
+    --compression_max_dict_bytes=<uint32_t>
+      Maximum size of dictionary used to prime the compression library
+
+    --compression_zstd_max_train_bytes=<uint32_t>
+      Maximum size of training data passed to zstd's dictionary trainer
+
+    --compression_max_dict_buffer_bytes=<int64_t>
+      Limit on buffer size from which we collect samples for dictionary generation.
+
+    --compression_use_zstd_finalize_dict
+      Use zstd's finalizeDictionary() API instead of zstd's dictionary trainer to generate dictionary.
+)",
+      supported_compressions.c_str());
+}
+
+// arg_name would include all prefix, e.g. "--my_arg="
+// arg_val is the parses value.
+// True if there is a match. False otherwise.
+// Woud exit after printing errmsg if cannot be parsed.
+bool ParseIntArg(const char* arg, const std::string arg_name,
+                 const std::string err_msg, int64_t* arg_val) {
+  if (strncmp(arg, arg_name.c_str(), arg_name.size()) == 0) {
+    std::string input_str = arg + arg_name.size();
+    std::istringstream iss(input_str);
+    iss >> *arg_val;
+    if (iss.fail()) {
+      fprintf(stderr, "%s\n", err_msg.c_str());
+      exit(1);
+    }
+    return true;
+  }
+  return false;
+}
+}  // namespace
+
+int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
+  std::string env_uri, fs_uri;
+  const char* dir_or_file = nullptr;
+  uint64_t read_num = std::numeric_limits<uint64_t>::max();
+  std::string command;
+
+  char junk;
+  uint64_t n;
+  bool verify_checksum = false;
+  bool output_hex = false;
+  bool decode_blob_index = false;
+  bool input_key_hex = false;
+  bool has_from = false;
+  bool has_to = false;
+  bool use_from_as_prefix = false;
+  bool show_properties = false;
+  bool show_summary = false;
+  bool set_block_size = false;
+  bool has_compression_level_from = false;
+  bool has_compression_level_to = false;
+  bool has_specified_compression_types = false;
+  std::string from_key;
+  std::string to_key;
+  std::string block_size_str;
+  std::string compression_level_from_str;
+  std::string compression_level_to_str;
+  size_t block_size = 0;
+  size_t readahead_size = 2 * 1024 * 1024;
+  std::vector<std::pair<CompressionType, const char*>> compression_types;
+  uint64_t total_num_files = 0;
+  uint64_t total_num_data_blocks = 0;
+  uint64_t total_data_block_size = 0;
+  uint64_t total_index_block_size = 0;
+  uint64_t total_filter_block_size = 0;
+  int32_t compress_level_from = CompressionOptions::kDefaultCompressionLevel;
+  int32_t compress_level_to = CompressionOptions::kDefaultCompressionLevel;
+  uint32_t compression_max_dict_bytes =
+      ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes;
+  uint32_t compression_zstd_max_train_bytes =
+      ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes;
+  uint64_t compression_max_dict_buffer_bytes =
+      ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes;
+  bool compression_use_zstd_finalize_dict =
+      !ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer;
+
+  int64_t tmp_val;
+
+  for (int i = 1; i < argc; i++) {
+    if (strncmp(argv[i], "--env_uri=", 10) == 0) {
+      env_uri = argv[i] + 10;
+    } else if (strncmp(argv[i], "--fs_uri=", 9) == 0) {
+      fs_uri = argv[i] + 9;
+    } else if (strncmp(argv[i], "--file=", 7) == 0) {
+      dir_or_file = argv[i] + 7;
+    } else if (strcmp(argv[i], "--output_hex") == 0) {
+      output_hex = true;
+    } else if (strcmp(argv[i], "--decode_blob_index") == 0) {
+      decode_blob_index = true;
+    } else if (strcmp(argv[i], "--input_key_hex") == 0) {
+      input_key_hex = true;
+    } else if (sscanf(argv[i], "--read_num=%lu%c", (unsigned long*)&n, &junk) ==
+               1) {
+      read_num = n;
+    } else if (strcmp(argv[i], "--verify_checksum") == 0) {
+      verify_checksum = true;
+    } else if (strncmp(argv[i], "--command=", 10) == 0) {
+      command = argv[i] + 10;
+    } else if (strncmp(argv[i], "--from=", 7) == 0) {
+      from_key = argv[i] + 7;
+      has_from = true;
+    } else if (strncmp(argv[i], "--to=", 5) == 0) {
+      to_key = argv[i] + 5;
+      has_to = true;
+    } else if (strncmp(argv[i], "--prefix=", 9) == 0) {
+      from_key = argv[i] + 9;
+      use_from_as_prefix = true;
+    } else if (strcmp(argv[i], "--show_properties") == 0) {
+      show_properties = true;
+    } else if (strcmp(argv[i], "--show_summary") == 0) {
+      show_summary = true;
+    } else if (ParseIntArg(argv[i], "--set_block_size=",
+                           "block size must be numeric", &tmp_val)) {
+      set_block_size = true;
+      block_size = static_cast<size_t>(tmp_val);
+    } else if (ParseIntArg(argv[i], "--readahead_size=",
+                           "readahead_size must be numeric", &tmp_val)) {
+      readahead_size = static_cast<size_t>(tmp_val);
+    } else if (strncmp(argv[i], "--compression_types=", 20) == 0) {
+      std::string compression_types_csv = argv[i] + 20;
+      std::istringstream iss(compression_types_csv);
+      std::string compression_type;
+      has_specified_compression_types = true;
+      while (std::getline(iss, compression_type, ',')) {
+        auto iter = std::find_if(
+            kCompressions.begin(), kCompressions.end(),
+            [&compression_type](std::pair<CompressionType, const char*> curr) {
+              return curr.second == compression_type;
+            });
+        if (iter == kCompressions.end()) {
+          fprintf(stderr, "%s is not a valid CompressionType\n",
+                  compression_type.c_str());
+          exit(1);
+        }
+        compression_types.emplace_back(*iter);
+      }
+    } else if (strncmp(argv[i], "--parse_internal_key=", 21) == 0) {
+      std::string in_key(argv[i] + 21);
+      try {
+        in_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(in_key);
+      } catch (...) {
+        std::cerr << "ERROR: Invalid key input '" << in_key
+                  << "' Use 0x{hex representation of internal rocksdb key}"
+                  << std::endl;
+        return -1;
+      }
+      Slice sl_key = ROCKSDB_NAMESPACE::Slice(in_key);
+      ParsedInternalKey ikey;
+      int retc = 0;
+      Status pik_status =
+          ParseInternalKey(sl_key, &ikey, true /* log_err_key */);
+      if (!pik_status.ok()) {
+        std::cerr << pik_status.getState() << "\n";
+        retc = -1;
+      }
+      fprintf(stdout, "key=%s\n", ikey.DebugString(true, true).c_str());
+      return retc;
+    } else if (ParseIntArg(argv[i], "--compression_level_from=",
+                           "compression_level_from must be numeric",
+                           &tmp_val)) {
+      has_compression_level_from = true;
+      compress_level_from = static_cast<int>(tmp_val);
+    } else if (ParseIntArg(argv[i], "--compression_level_to=",
+                           "compression_level_to must be numeric", &tmp_val)) {
+      has_compression_level_to = true;
+      compress_level_to = static_cast<int>(tmp_val);
+    } else if (ParseIntArg(argv[i], "--compression_max_dict_bytes=",
+                           "compression_max_dict_bytes must be numeric",
+                           &tmp_val)) {
+      if (tmp_val < 0 || tmp_val > std::numeric_limits<uint32_t>::max()) {
+        fprintf(stderr, "compression_max_dict_bytes must be a uint32_t: '%s'\n",
+                argv[i]);
+        print_help(/*to_stderr*/ true);
+        return 1;
+      }
+      compression_max_dict_bytes = static_cast<uint32_t>(tmp_val);
+    } else if (ParseIntArg(argv[i], "--compression_zstd_max_train_bytes=",
+                           "compression_zstd_max_train_bytes must be numeric",
+                           &tmp_val)) {
+      if (tmp_val < 0 || tmp_val > std::numeric_limits<uint32_t>::max()) {
+        fprintf(stderr,
+                "compression_zstd_max_train_bytes must be a uint32_t: '%s'\n",
+                argv[i]);
+        print_help(/*to_stderr*/ true);
+        return 1;
+      }
+      compression_zstd_max_train_bytes = static_cast<uint32_t>(tmp_val);
+    } else if (ParseIntArg(argv[i], "--compression_max_dict_buffer_bytes=",
+                           "compression_max_dict_buffer_bytes must be numeric",
+                           &tmp_val)) {
+      if (tmp_val < 0) {
+        fprintf(stderr,
+                "compression_max_dict_buffer_bytes must be positive: '%s'\n",
+                argv[i]);
+        print_help(/*to_stderr*/ true);
+        return 1;
+      }
+      compression_max_dict_buffer_bytes = static_cast<uint64_t>(tmp_val);
+    } else if (strcmp(argv[i], "--compression_use_zstd_finalize_dict") == 0) {
+      compression_use_zstd_finalize_dict = true;
+    } else if (strcmp(argv[i], "--help") == 0) {
+      print_help(/*to_stderr*/ false);
+      return 0;
+    } else if (strcmp(argv[i], "--version") == 0) {
+      printf("%s\n", GetRocksBuildInfoAsString("sst_dump").c_str());
+      return 0;
+    } else {
+      fprintf(stderr, "Unrecognized argument '%s'\n\n", argv[i]);
+      print_help(/*to_stderr*/ true);
+      return 1;
+    }
+  }
+
+  if (has_compression_level_from && has_compression_level_to) {
+    if (!has_specified_compression_types || compression_types.size() != 1) {
+      fprintf(stderr, "Specify one compression type.\n\n");
+      exit(1);
+    }
+  } else if (has_compression_level_from || has_compression_level_to) {
+    fprintf(stderr,
+            "Specify both --compression_level_from and "
+            "--compression_level_to.\n\n");
+    exit(1);
+  }
+
+  if (use_from_as_prefix && has_from) {
+    fprintf(stderr, "Cannot specify --prefix and --from\n\n");
+    exit(1);
+  }
+
+  if (input_key_hex) {
+    if (has_from || use_from_as_prefix) {
+      from_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(from_key);
+    }
+    if (has_to) {
+      to_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(to_key);
+    }
+  }
+
+  if (dir_or_file == nullptr) {
+    fprintf(stderr, "file or directory must be specified.\n\n");
+    print_help(/*to_stderr*/ true);
+    exit(1);
+  }
+
+  std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
+
+  // If caller of SSTDumpTool::Run(...) does not specify a different env other
+  // than Env::Default(), then try to load custom env based on env_uri/fs_uri.
+  // Otherwise, the caller is responsible for creating custom env.
+  {
+    ConfigOptions config_options;
+    config_options.env = options.env;
+    Status s = Env::CreateFromUri(config_options, env_uri, fs_uri, &options.env,
+                                  &env_guard);
+    if (!s.ok()) {
+      fprintf(stderr, "CreateEnvFromUri: %s\n", s.ToString().c_str());
+      exit(1);
+    } else {
+      fprintf(stdout, "options.env is %p\n", options.env);
+    }
+  }
+
+  std::vector<std::string> filenames;
+  ROCKSDB_NAMESPACE::Env* env = options.env;
+  ROCKSDB_NAMESPACE::Status st = env->GetChildren(dir_or_file, &filenames);
+  bool dir = true;
+  if (!st.ok() || filenames.empty()) {
+    // dir_or_file does not exist or does not contain children
+    // Check its existence first
+    Status s = env->FileExists(dir_or_file);
+    // dir_or_file does not exist
+    if (!s.ok()) {
+      fprintf(stderr, "%s%s: No such file or directory\n", s.ToString().c_str(),
+              dir_or_file);
+      return 1;
+    }
+    // dir_or_file exists and is treated as a "file"
+    // since it has no children
+    // This is ok since later it will be checked
+    // that whether it is a valid sst or not
+    // (A directory "file" is not a valid sst)
+    filenames.clear();
+    filenames.push_back(dir_or_file);
+    dir = false;
+  }
+
+  uint64_t total_read = 0;
+  // List of RocksDB SST file without corruption
+  std::vector<std::string> valid_sst_files;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    std::string filename = filenames.at(i);
+    if (filename.length() <= 4 ||
+        filename.rfind(".sst") != filename.length() - 4) {
+      // ignore
+      continue;
+    }
+
+    if (dir) {
+      filename = std::string(dir_or_file) + "/" + filename;
+    }
+
+    ROCKSDB_NAMESPACE::SstFileDumper dumper(
+        options, filename, Temperature::kUnknown, readahead_size,
+        verify_checksum, output_hex, decode_blob_index);
+    // Not a valid SST
+    if (!dumper.getStatus().ok()) {
+      fprintf(stderr, "%s: %s\n", filename.c_str(),
+              dumper.getStatus().ToString().c_str());
+      continue;
+    } else {
+      valid_sst_files.push_back(filename);
+      // Print out from and to key information once
+      // where there is at least one valid SST
+      if (valid_sst_files.size() == 1) {
+        // from_key and to_key are only used for "check", "scan", or ""
+        if (command == "check" || command == "scan" || command == "") {
+          fprintf(stdout, "from [%s] to [%s]\n",
+                  ROCKSDB_NAMESPACE::Slice(from_key).ToString(true).c_str(),
+                  ROCKSDB_NAMESPACE::Slice(to_key).ToString(true).c_str());
+        }
+      }
+    }
+
+    if (command == "recompress") {
+      st = dumper.ShowAllCompressionSizes(
+          set_block_size ? block_size : 16384,
+          compression_types.empty() ? kCompressions : compression_types,
+          compress_level_from, compress_level_to, compression_max_dict_bytes,
+          compression_zstd_max_train_bytes, compression_max_dict_buffer_bytes,
+          !compression_use_zstd_finalize_dict);
+      if (!st.ok()) {
+        fprintf(stderr, "Failed to recompress: %s\n", st.ToString().c_str());
+        exit(1);
+      }
+      return 0;
+    }
+
+    if (command == "raw") {
+      std::string out_filename = filename.substr(0, filename.length() - 4);
+      out_filename.append("_dump.txt");
+
+      st = dumper.DumpTable(out_filename);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
+        exit(1);
+      } else {
+        fprintf(stdout, "raw dump written to file %s\n", &out_filename[0]);
+      }
+      continue;
+    }
+
+    // scan all files in give file path.
+    if (command == "" || command == "scan" || command == "check") {
+      st = dumper.ReadSequential(
+          command == "scan", read_num > 0 ? (read_num - total_read) : read_num,
+          has_from || use_from_as_prefix, from_key, has_to, to_key,
+          use_from_as_prefix);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
+      }
+      total_read += dumper.GetReadNumber();
+      if (read_num > 0 && total_read > read_num) {
+        break;
+      }
+    }
+
+    if (command == "verify") {
+      st = dumper.VerifyChecksum();
+      if (!st.ok()) {
+        fprintf(stderr, "%s is corrupted: %s\n", filename.c_str(),
+                st.ToString().c_str());
+      } else {
+        fprintf(stdout, "The file is ok\n");
+      }
+      continue;
+    }
+
+    if (show_properties || show_summary) {
+      const ROCKSDB_NAMESPACE::TableProperties* table_properties;
+
+      std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties>
+          table_properties_from_reader;
+      st = dumper.ReadTableProperties(&table_properties_from_reader);
+      if (!st.ok()) {
+        fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str());
+        fprintf(stderr, "Try to use initial table properties\n");
+        table_properties = dumper.GetInitTableProperties();
+      } else {
+        table_properties = table_properties_from_reader.get();
+      }
+      if (table_properties != nullptr) {
+        if (show_properties) {
+          fprintf(stdout,
+                  "Table Properties:\n"
+                  "------------------------------\n"
+                  "  %s",
+                  table_properties->ToString("\n  ", ": ").c_str());
+        }
+        total_num_files += 1;
+        total_num_data_blocks += table_properties->num_data_blocks;
+        total_data_block_size += table_properties->data_size;
+        total_index_block_size += table_properties->index_size;
+        total_filter_block_size += table_properties->filter_size;
+        if (show_properties) {
+          fprintf(stdout,
+                  "Raw user collected properties\n"
+                  "------------------------------\n");
+          for (const auto& kv : table_properties->user_collected_properties) {
+            std::string prop_name = kv.first;
+            std::string prop_val = Slice(kv.second).ToString(true);
+            fprintf(stdout, "  # %s: 0x%s\n", prop_name.c_str(),
+                    prop_val.c_str());
+          }
+        }
+      } else {
+        fprintf(stderr, "Reader unexpectedly returned null properties\n");
+      }
+    }
+  }
+  if (show_summary) {
+    fprintf(stdout, "total number of files: %" PRIu64 "\n", total_num_files);
+    fprintf(stdout, "total number of data blocks: %" PRIu64 "\n",
+            total_num_data_blocks);
+    fprintf(stdout, "total data block size: %" PRIu64 "\n",
+            total_data_block_size);
+    fprintf(stdout, "total index block size: %" PRIu64 "\n",
+            total_index_block_size);
+    fprintf(stdout, "total filter block size: %" PRIu64 "\n",
+            total_filter_block_size);
+  }
+
+  if (valid_sst_files.empty()) {
+    // No valid SST files are found
+    // Exit with an error state
+    if (dir) {
+      fprintf(stdout, "------------------------------\n");
+      fprintf(stderr, "No valid SST files found in %s\n", dir_or_file);
+    } else {
+      fprintf(stderr, "%s is not a valid SST file\n", dir_or_file);
+    }
+    return 1;
+  } else {
+    if (command == "identify") {
+      if (dir) {
+        fprintf(stdout, "------------------------------\n");
+        fprintf(stdout, "List of valid SST files found in %s:\n", dir_or_file);
+        for (const auto& f : valid_sst_files) {
+          fprintf(stdout, "%s\n", f.c_str());
+        }
+        fprintf(stdout, "Number of valid SST files: %zu\n",
+                valid_sst_files.size());
+      } else {
+        fprintf(stdout, "%s is a valid SST file\n", dir_or_file);
+      }
+    }
+    // At least one valid SST
+    // exit with a success state
+    return 0;
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/trace_analyzer.cc b/src/rocksdb/tools/trace_analyzer.cc
new file mode 100644
index 000000000..958078d1c
--- /dev/null
+++ b/src/rocksdb/tools/trace_analyzer.cc
@@ -0,0 +1,25 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+#include "tools/trace_analyzer_tool.h"
+int main(int argc, char** argv) {
+  return ROCKSDB_NAMESPACE::trace_analyzer_tool(argc, argv);
+}
+#endif
+#else
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/trace_analyzer_test.cc b/src/rocksdb/tools/trace_analyzer_test.cc
new file mode 100644
index 000000000..d7f9e4da8
--- /dev/null
+++ b/src/rocksdb/tools/trace_analyzer_test.cc
@@ -0,0 +1,890 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run trace_analyzer test\n");
+  return 0;
+}
+#else
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <thread>
+
+#include "db/db_test_util.h"
+#include "file/line_file_reader.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "tools/trace_analyzer_tool.h"
+#include "trace_replay/trace_replay.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+static const int kMaxArgCount = 100;
+static const size_t kArgBufferSize = 100000;
+}  // namespace
+
+// Note that, the QPS part verification of the analyzing result is not robost
+// enough and causes the failure in some rare cases. Disable them temporally and
+// wait for future refactor.
+
+// The helper functions for the test
+class TraceAnalyzerTest : public testing::Test {
+ public:
+  TraceAnalyzerTest() : rnd_(0xFB) {
+    // test_path_ = test::TmpDir() + "trace_analyzer_test";
+    test_path_ = test::PerThreadDBPath("trace_analyzer_test");
+    env_ = ROCKSDB_NAMESPACE::Env::Default();
+    env_->CreateDir(test_path_).PermitUncheckedError();
+    dbname_ = test_path_ + "/db";
+  }
+
+  ~TraceAnalyzerTest() override {}
+
+  void GenerateTrace(std::string trace_path) {
+    Options options;
+    options.create_if_missing = true;
+    options.merge_operator = MergeOperators::CreatePutOperator();
+    Slice upper_bound("a");
+    Slice lower_bound("abce");
+    ReadOptions ro;
+    ro.iterate_upper_bound = &upper_bound;
+    ro.iterate_lower_bound = &lower_bound;
+    WriteOptions wo;
+    TraceOptions trace_opt;
+    DB* db_ = nullptr;
+    std::string value;
+    std::unique_ptr<TraceWriter> trace_writer;
+    Iterator* single_iter = nullptr;
+
+    ASSERT_OK(
+        NewFileTraceWriter(env_, env_options_, trace_path, &trace_writer));
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    ASSERT_OK(db_->StartTrace(trace_opt, std::move(trace_writer)));
+
+    WriteBatch batch;
+    ASSERT_OK(batch.Put("a", "aaaaaaaaa"));
+    ASSERT_OK(batch.Merge("b", "aaaaaaaaaaaaaaaaaaaa"));
+    ASSERT_OK(batch.Delete("c"));
+    ASSERT_OK(batch.SingleDelete("d"));
+    ASSERT_OK(batch.DeleteRange("e", "f"));
+    ASSERT_OK(db_->Write(wo, &batch));
+    std::vector<Slice> keys;
+    keys.push_back("a");
+    keys.push_back("b");
+    keys.push_back("df");
+    keys.push_back("gege");
+    keys.push_back("hjhjhj");
+    std::vector<std::string> values;
+    std::vector<Status> ss = db_->MultiGet(ro, keys, &values);
+    ASSERT_GE(ss.size(), 0);
+    ASSERT_OK(ss[0]);
+    ASSERT_NOK(ss[2]);
+    std::vector<ColumnFamilyHandle*> cfs(2, db_->DefaultColumnFamily());
+    std::vector<PinnableSlice> values2(keys.size());
+    db_->MultiGet(ro, 2, cfs.data(), keys.data(), values2.data(), ss.data(),
+                  false);
+    ASSERT_OK(ss[0]);
+    db_->MultiGet(ro, db_->DefaultColumnFamily(), 2, keys.data() + 3,
+                  values2.data(), ss.data(), false);
+    ASSERT_OK(db_->Get(ro, "a", &value));
+
+    single_iter = db_->NewIterator(ro);
+    single_iter->Seek("a");
+    ASSERT_OK(single_iter->status());
+    single_iter->SeekForPrev("b");
+    ASSERT_OK(single_iter->status());
+    delete single_iter;
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+
+    db_->Get(ro, "g", &value).PermitUncheckedError();
+
+    ASSERT_OK(db_->EndTrace());
+
+    ASSERT_OK(env_->FileExists(trace_path));
+
+    std::unique_ptr<WritableFile> whole_f;
+    std::string whole_path = test_path_ + "/0.txt";
+    ASSERT_OK(env_->NewWritableFile(whole_path, &whole_f, env_options_));
+    std::string whole_str = "0x61\n0x62\n0x63\n0x64\n0x65\n0x66\n";
+    ASSERT_OK(whole_f->Append(whole_str));
+    delete db_;
+    ASSERT_OK(DestroyDB(dbname_, options));
+  }
+
+  void RunTraceAnalyzer(const std::vector<std::string>& args) {
+    char arg_buffer[kArgBufferSize];
+    char* argv[kMaxArgCount];
+    int argc = 0;
+    int cursor = 0;
+
+    for (const auto& arg : args) {
+      ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
+      ASSERT_LE(argc + 1, kMaxArgCount);
+      snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());
+
+      argv[argc++] = arg_buffer + cursor;
+      cursor += static_cast<int>(arg.size()) + 1;
+    }
+
+    ASSERT_EQ(0, ROCKSDB_NAMESPACE::trace_analyzer_tool(argc, argv));
+  }
+
+  void CheckFileContent(const std::vector<std::string>& cnt,
+                        std::string file_path, bool full_content) {
+    const auto& fs = env_->GetFileSystem();
+    FileOptions fopts(env_options_);
+
+    ASSERT_OK(fs->FileExists(file_path, fopts.io_options, nullptr));
+    std::unique_ptr<FSSequentialFile> file;
+    ASSERT_OK(fs->NewSequentialFile(file_path, fopts, &file, nullptr));
+
+    LineFileReader lf_reader(std::move(file), file_path,
+                             4096 /* filereadahead_size */);
+
+    std::vector<std::string> result;
+    std::string line;
+    while (
+        lf_reader.ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)) {
+      result.push_back(line);
+    }
+
+    ASSERT_OK(lf_reader.GetStatus());
+
+    size_t min_size = std::min(cnt.size(), result.size());
+    for (size_t i = 0; i < min_size; i++) {
+      if (full_content) {
+        ASSERT_EQ(result[i], cnt[i]);
+      } else {
+        ASSERT_EQ(result[i][0], cnt[i][0]);
+      }
+    }
+
+    return;
+  }
+
+  void AnalyzeTrace(std::vector<std::string>& paras_diff,
+                    std::string output_path, std::string trace_path) {
+    std::vector<std::string> paras = {"./trace_analyzer",
+                                      "-convert_to_human_readable_trace",
+                                      "-output_key_stats",
+                                      "-output_access_count_stats",
+                                      "-output_prefix=test",
+                                      "-output_prefix_cut=1",
+                                      "-output_time_series",
+                                      "-output_value_distribution",
+                                      "-output_qps_stats",
+                                      "-no_key",
+                                      "-no_print"};
+    for (auto& para : paras_diff) {
+      paras.push_back(para);
+    }
+    Status s = env_->FileExists(trace_path);
+    if (!s.ok()) {
+      GenerateTrace(trace_path);
+    }
+    ASSERT_OK(env_->CreateDir(output_path));
+    RunTraceAnalyzer(paras);
+  }
+
+  ROCKSDB_NAMESPACE::Env* env_;
+  EnvOptions env_options_;
+  std::string test_path_;
+  std::string dbname_;
+  Random rnd_;
+};
+
+TEST_F(TraceAnalyzerTest, Get) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/get";
+  std::string file_path;
+  std::vector<std::string> paras = {
+      "-analyze_get=true",           "-analyze_put=false",
+      "-analyze_delete=false",       "-analyze_single_delete=false",
+      "-analyze_range_delete=false", "-analyze_iterator=false",
+      "-analyze_multiget=false"};
+  paras.push_back("-output_dir=" + output_path);
+  paras.push_back("-trace_path=" + trace_path);
+  paras.push_back("-key_space_dir=" + test_path_);
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // check the key_stats file
+  std::vector<std::string> k_stats = {"0 10 0 1 1.000000", "0 10 1 1 1.000000"};
+  file_path = output_path + "/test-get-0-accessed_key_stats.txt";
+  CheckFileContent(k_stats, file_path, true);
+
+  // Check the access count distribution
+  std::vector<std::string> k_dist = {"access_count: 1 num: 2"};
+  file_path = output_path + "/test-get-0-accessed_key_count_distribution.txt";
+  CheckFileContent(k_dist, file_path, true);
+
+  // Check the trace sequence
+  std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8",
+                                         "8", "8", "8", "8", "8", "8",
+                                         "8", "8", "0", "6", "7", "0"};
+  file_path = output_path + "/test-human_readable_trace.txt";
+  CheckFileContent(k_sequence, file_path, false);
+
+  // Check the prefix
+  std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30",
+                                       "1 1 1 1.000000 1.000000 0x61"};
+  file_path = output_path + "/test-get-0-accessed_key_prefix_cut.txt";
+  CheckFileContent(k_prefix, file_path, true);
+
+  // Check the time series
+  std::vector<std::string> k_series = {"0 1533000630 0", "0 1533000630 1"};
+  file_path = output_path + "/test-get-0-time_series.txt";
+  CheckFileContent(k_series, file_path, false);
+
+  // Check the accessed key in whole key space
+  std::vector<std::string> k_whole_access = {"0 1"};
+  file_path = output_path + "/test-get-0-whole_key_stats.txt";
+  CheckFileContent(k_whole_access, file_path, true);
+
+  // Check the whole key prefix cut
+  std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63",
+                                             "3 0x64", "4 0x65", "5 0x66"};
+  file_path = output_path + "/test-get-0-whole_key_prefix_cut.txt";
+  CheckFileContent(k_whole_prefix, file_path, true);
+
+  /*
+  // Check the overall qps
+  std::vector<std::string> all_qps = {"1 0 0 0 0 0 0 0 0 1"};
+  file_path = output_path + "/test-qps_stats.txt";
+  CheckFileContent(all_qps, file_path, true);
+
+  // Check the qps of get
+  std::vector<std::string> get_qps = {"1"};
+  file_path = output_path + "/test-get-0-qps_stats.txt";
+  CheckFileContent(get_qps, file_path, true);
+
+  // Check the top k qps prefix cut
+  std::vector<std::string> top_qps = {"At time: 0 with QPS: 1",
+                                      "The prefix: 0x61 Access count: 1"};
+  file_path = output_path + "/test-get-0-accessed_top_k_qps_prefix_cut.txt";
+  CheckFileContent(top_qps, file_path, true);
+  */
+}
+
+// Test analyzing of Put
+TEST_F(TraceAnalyzerTest, Put) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/put";
+  std::string file_path;
+  std::vector<std::string> paras = {
+      "-analyze_get=false",          "-analyze_put=true",
+      "-analyze_delete=false",       "-analyze_single_delete=false",
+      "-analyze_range_delete=false", "-analyze_iterator=false",
+      "-analyze_multiget=false"};
+  paras.push_back("-output_dir=" + output_path);
+  paras.push_back("-trace_path=" + trace_path);
+  paras.push_back("-key_space_dir=" + test_path_);
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // check the key_stats file
+  std::vector<std::string> k_stats = {"0 9 0 1 1.000000"};
+  file_path = output_path + "/test-put-0-accessed_key_stats.txt";
+  CheckFileContent(k_stats, file_path, true);
+
+  // Check the access count distribution
+  std::vector<std::string> k_dist = {"access_count: 1 num: 1"};
+  file_path = output_path + "/test-put-0-accessed_key_count_distribution.txt";
+  CheckFileContent(k_dist, file_path, true);
+
+  // Check the trace sequence
+  std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8",
+                                         "8", "8", "8", "8", "8", "8",
+                                         "8", "8", "0", "6", "7", "0"};
+  file_path = output_path + "/test-human_readable_trace.txt";
+  CheckFileContent(k_sequence, file_path, false);
+
+  // Check the prefix
+  std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"};
+  file_path = output_path + "/test-put-0-accessed_key_prefix_cut.txt";
+  CheckFileContent(k_prefix, file_path, true);
+
+  // Check the time series
+  std::vector<std::string> k_series = {"1 1533056278 0"};
+  file_path = output_path + "/test-put-0-time_series.txt";
+  CheckFileContent(k_series, file_path, false);
+
+  // Check the accessed key in whole key space
+  std::vector<std::string> k_whole_access = {"0 1"};
+  file_path = output_path + "/test-put-0-whole_key_stats.txt";
+  CheckFileContent(k_whole_access, file_path, true);
+
+  // Check the whole key prefix cut
+  std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63",
+                                             "3 0x64", "4 0x65", "5 0x66"};
+  file_path = output_path + "/test-put-0-whole_key_prefix_cut.txt";
+  CheckFileContent(k_whole_prefix, file_path, true);
+
+  // Check the overall qps
+  std::vector<std::string> all_qps = {"0 1 0 0 0 0 0 0 0 1"};
+  file_path = output_path + "/test-qps_stats.txt";
+  CheckFileContent(all_qps, file_path, true);
+
+  /*
+  // Check the qps of Put
+  std::vector<std::string> get_qps = {"1"};
+  file_path = output_path + "/test-put-0-qps_stats.txt";
+  CheckFileContent(get_qps, file_path, true);
+
+  // Check the top k qps prefix cut
+  std::vector<std::string> top_qps = {"At time: 0 with QPS: 1",
+                                      "The prefix: 0x61 Access count: 1"};
+  file_path = output_path + "/test-put-0-accessed_top_k_qps_prefix_cut.txt";
+  CheckFileContent(top_qps, file_path, true);
+
+  // Check the value size distribution
+  std::vector<std::string> value_dist = {
+      "Number_of_value_size_between 0 and 16 is: 1"};
+  file_path = output_path + "/test-put-0-accessed_value_size_distribution.txt";
+  CheckFileContent(value_dist, file_path, true);
+  */
+}
+
+// Test analyzing of delete
+TEST_F(TraceAnalyzerTest, Delete) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/delete";
+  std::string file_path;
+  std::vector<std::string> paras = {
+      "-analyze_get=false",          "-analyze_put=false",
+      "-analyze_delete=true",        "-analyze_single_delete=false",
+      "-analyze_range_delete=false", "-analyze_iterator=false",
+      "-analyze_multiget=false"};
+  paras.push_back("-output_dir=" + output_path);
+  paras.push_back("-trace_path=" + trace_path);
+  paras.push_back("-key_space_dir=" + test_path_);
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // check the key_stats file
+  std::vector<std::string> k_stats = {"0 10 0 1 1.000000"};
+  file_path = output_path + "/test-delete-0-accessed_key_stats.txt";
+  CheckFileContent(k_stats, file_path, true);
+
+  // Check the access count distribution
+  std::vector<std::string> k_dist = {"access_count: 1 num: 1"};
+  file_path =
+      output_path + "/test-delete-0-accessed_key_count_distribution.txt";
+  CheckFileContent(k_dist, file_path, true);
+
+  // Check the trace sequence
+  std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8",
+                                         "8", "8", "8", "8", "8", "8",
+                                         "8", "8", "0", "6", "7", "0"};
+  file_path = output_path + "/test-human_readable_trace.txt";
+  CheckFileContent(k_sequence, file_path, false);
+
+  // Check the prefix
+  std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"};
+  file_path = output_path + "/test-delete-0-accessed_key_prefix_cut.txt";
+  CheckFileContent(k_prefix, file_path, true);
+
+  // Check the time series
+  std::vector<std::string> k_series = {"2 1533000630 0"};
+  file_path = output_path + "/test-delete-0-time_series.txt";
+  CheckFileContent(k_series, file_path, false);
+
+  // Check the accessed key in whole key space
+  std::vector<std::string> k_whole_access = {"2 1"};
+  file_path = output_path + "/test-delete-0-whole_key_stats.txt";
+  CheckFileContent(k_whole_access, file_path, true);
+
+  // Check the whole key prefix cut
+  std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63",
+                                             "3 0x64", "4 0x65", "5 0x66"};
+  file_path = output_path + "/test-delete-0-whole_key_prefix_cut.txt";
+  CheckFileContent(k_whole_prefix, file_path, true);
+
+  /*
+  // Check the overall qps
+  std::vector<std::string> all_qps = {"0 0 1 0 0 0 0 0 0 1"};
+  file_path = output_path + "/test-qps_stats.txt";
+  CheckFileContent(all_qps, file_path, true);
+
+  // Check the qps of Delete
+  std::vector<std::string> get_qps = {"1"};
+  file_path = output_path + "/test-delete-0-qps_stats.txt";
+  CheckFileContent(get_qps, file_path, true);
+
+  // Check the top k qps prefix cut
+  std::vector<std::string> top_qps = {"At time: 0 with QPS: 1",
+                                      "The prefix: 0x63 Access count: 1"};
+  file_path = output_path + "/test-delete-0-accessed_top_k_qps_prefix_cut.txt";
+  CheckFileContent(top_qps, file_path, true);
+  */
+}
+
+// Test analyzing of Merge
+TEST_F(TraceAnalyzerTest, Merge) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/merge";
+  std::string file_path;
+  std::vector<std::string> paras = {
+      "-analyze_get=false",           "-analyze_put=false",
+      "-analyze_delete=false",        "-analyze_merge=true",
+      "-analyze_single_delete=false", "-analyze_range_delete=false",
+      "-analyze_iterator=false",      "-analyze_multiget=false"};
+  paras.push_back("-output_dir=" + output_path);
+  paras.push_back("-trace_path=" + trace_path);
+  paras.push_back("-key_space_dir=" + test_path_);
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // check the key_stats file
+  std::vector<std::string> k_stats = {"0 20 0 1 1.000000"};
+  file_path = output_path + "/test-merge-0-accessed_key_stats.txt";
+  CheckFileContent(k_stats, file_path, true);
+
+  // Check the access count distribution
+  std::vector<std::string> k_dist = {"access_count: 1 num: 1"};
+  file_path = output_path + "/test-merge-0-accessed_key_count_distribution.txt";
+  CheckFileContent(k_dist, file_path, true);
+
+  // Check the trace sequence
+  std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8",
+                                         "8", "8", "8", "8", "8", "8",
+                                         "8", "8", "0", "6", "7", "0"};
+  file_path = output_path + "/test-human_readable_trace.txt";
+  CheckFileContent(k_sequence, file_path, false);
+
+  // Check the prefix
+  std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"};
+  file_path = output_path + "/test-merge-0-accessed_key_prefix_cut.txt";
+  CheckFileContent(k_prefix, file_path, true);
+
+  // Check the time series
+  std::vector<std::string> k_series = {"5 1533000630 0"};
+  file_path = output_path + "/test-merge-0-time_series.txt";
+  CheckFileContent(k_series, file_path, false);
+
+  // Check the accessed key in whole key space
+  std::vector<std::string> k_whole_access = {"1 1"};
+  file_path = output_path + "/test-merge-0-whole_key_stats.txt";
+  CheckFileContent(k_whole_access, file_path, true);
+
+  // Check the whole key prefix cut
+  std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63",
+                                             "3 0x64", "4 0x65", "5 0x66"};
+  file_path = output_path + "/test-merge-0-whole_key_prefix_cut.txt";
+  CheckFileContent(k_whole_prefix, file_path, true);
+
+  /*
+  // Check the overall qps
+  std::vector<std::string> all_qps = {"0 0 0 0 0 1 0 0 0 1"};
+  file_path = output_path + "/test-qps_stats.txt";
+  CheckFileContent(all_qps, file_path, true);
+
+  // Check the qps of Merge
+  std::vector<std::string> get_qps = {"1"};
+  file_path = output_path + "/test-merge-0-qps_stats.txt";
+  CheckFileContent(get_qps, file_path, true);
+
+  // Check the top k qps prefix cut
+  std::vector<std::string> top_qps = {"At time: 0 with QPS: 1",
+                                      "The prefix: 0x62 Access count: 1"};
+  file_path = output_path + "/test-merge-0-accessed_top_k_qps_prefix_cut.txt";
+  CheckFileContent(top_qps, file_path, true);
+  */
+
+  // Check the value size distribution
+  std::vector<std::string> value_dist = {
+      "Number_of_value_size_between 0 and 24 is: 1"};
+  file_path =
+      output_path + "/test-merge-0-accessed_value_size_distribution.txt";
+  CheckFileContent(value_dist, file_path, true);
+}
+
+// Test analyzing of SingleDelete
+TEST_F(TraceAnalyzerTest, SingleDelete) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/single_delete";
+  std::string file_path;
+  std::vector<std::string> paras = {
+      "-analyze_get=false",          "-analyze_put=false",
+      "-analyze_delete=false",       "-analyze_merge=false",
+      "-analyze_single_delete=true", "-analyze_range_delete=false",
+      "-analyze_iterator=false",     "-analyze_multiget=false"};
+  paras.push_back("-output_dir=" + output_path);
+  paras.push_back("-trace_path=" + trace_path);
+  paras.push_back("-key_space_dir=" + test_path_);
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // check the key_stats file
+  std::vector<std::string> k_stats = {"0 10 0 1 1.000000"};
+  file_path = output_path + "/test-single_delete-0-accessed_key_stats.txt";
+  CheckFileContent(k_stats, file_path, true);
+
+  // Check the access count distribution
+  std::vector<std::string> k_dist = {"access_count: 1 num: 1"};
+  file_path =
+      output_path + "/test-single_delete-0-accessed_key_count_distribution.txt";
+  CheckFileContent(k_dist, file_path, true);
+
+  // Check the trace sequence
+  std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8",
+                                         "8", "8", "8", "8", "8", "8",
+                                         "8", "8", "0", "6", "7", "0"};
+  file_path = output_path + "/test-human_readable_trace.txt";
+  CheckFileContent(k_sequence, file_path, false);
+
+  // Check the prefix
+  std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"};
+  file_path = output_path + "/test-single_delete-0-accessed_key_prefix_cut.txt";
+  CheckFileContent(k_prefix, file_path, true);
+
+  // Check the time series
+  std::vector<std::string> k_series = {"3 1533000630 0"};
+  file_path = output_path + "/test-single_delete-0-time_series.txt";
+  CheckFileContent(k_series, file_path, false);
+
+  // Check the accessed key in whole key space
+  std::vector<std::string> k_whole_access = {"3 1"};
+  file_path = output_path + "/test-single_delete-0-whole_key_stats.txt";
+  CheckFileContent(k_whole_access, file_path, true);
+
+  // Check the whole key prefix cut
+  std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63",
+                                             "3 0x64", "4 0x65", "5 0x66"};
+  file_path = output_path + "/test-single_delete-0-whole_key_prefix_cut.txt";
+  CheckFileContent(k_whole_prefix, file_path, true);
+
+  /*
+  // Check the overall qps
+  std::vector<std::string> all_qps = {"0 0 0 1 0 0 0 0 0 1"};
+  file_path = output_path + "/test-qps_stats.txt";
+  CheckFileContent(all_qps, file_path, true);
+
+  // Check the qps of SingleDelete
+  std::vector<std::string> get_qps = {"1"};
+  file_path = output_path + "/test-single_delete-0-qps_stats.txt";
+  CheckFileContent(get_qps, file_path, true);
+
+  // Check the top k qps prefix cut
+  std::vector<std::string> top_qps = {"At time: 0 with QPS: 1",
+                                      "The prefix: 0x64 Access count: 1"};
+  file_path =
+      output_path + "/test-single_delete-0-accessed_top_k_qps_prefix_cut.txt";
+  CheckFileContent(top_qps, file_path, true);
+  */
+}
+
+// Test analyzing of delete
+TEST_F(TraceAnalyzerTest, DeleteRange) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/range_delete";
+  std::string file_path;
+  std::vector<std::string> paras = {
+      "-analyze_get=false",           "-analyze_put=false",
+      "-analyze_delete=false",        "-analyze_merge=false",
+      "-analyze_single_delete=false", "-analyze_range_delete=true",
+      "-analyze_iterator=false",      "-analyze_multiget=false"};
+  paras.push_back("-output_dir=" + output_path);
+  paras.push_back("-trace_path=" + trace_path);
+  paras.push_back("-key_space_dir=" + test_path_);
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // check the key_stats file
+  std::vector<std::string> k_stats = {"0 10 0 1 1.000000", "0 10 1 1 1.000000"};
+  file_path = output_path + "/test-range_delete-0-accessed_key_stats.txt";
+  CheckFileContent(k_stats, file_path, true);
+
+  // Check the access count distribution
+  std::vector<std::string> k_dist = {"access_count: 1 num: 2"};
+  file_path =
+      output_path + "/test-range_delete-0-accessed_key_count_distribution.txt";
+  CheckFileContent(k_dist, file_path, true);
+
+  // Check the trace sequence
+  std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8",
+                                         "8", "8", "8", "8", "8", "8",
+                                         "8", "8", "0", "6", "7", "0"};
+  file_path = output_path + "/test-human_readable_trace.txt";
+  CheckFileContent(k_sequence, file_path, false);
+
+  // Check the prefix
+  std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30",
+                                       "1 1 1 1.000000 1.000000 0x65"};
+  file_path = output_path + "/test-range_delete-0-accessed_key_prefix_cut.txt";
+  CheckFileContent(k_prefix, file_path, true);
+
+  // Check the time series
+  std::vector<std::string> k_series = {"4 1533000630 0", "4 1533060100 1"};
+  file_path = output_path + "/test-range_delete-0-time_series.txt";
+  CheckFileContent(k_series, file_path, false);
+
+  // Check the accessed key in whole key space
+  std::vector<std::string> k_whole_access = {"4 1", "5 1"};
+  file_path = output_path + "/test-range_delete-0-whole_key_stats.txt";
+  CheckFileContent(k_whole_access, file_path, true);
+
+  // Check the whole key prefix cut
+  std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63",
+                                             "3 0x64", "4 0x65", "5 0x66"};
+  file_path = output_path + "/test-range_delete-0-whole_key_prefix_cut.txt";
+  CheckFileContent(k_whole_prefix, file_path, true);
+
+  /*
+  // Check the overall qps
+  std::vector<std::string> all_qps = {"0 0 0 0 2 0 0 0 0 2"};
+  file_path = output_path + "/test-qps_stats.txt";
+  CheckFileContent(all_qps, file_path, true);
+
+  // Check the qps of DeleteRange
+  std::vector<std::string> get_qps = {"2"};
+  file_path = output_path + "/test-range_delete-0-qps_stats.txt";
+  CheckFileContent(get_qps, file_path, true);
+
+  // Check the top k qps prefix cut
+  std::vector<std::string> top_qps = {"At time: 0 with QPS: 2",
+                                      "The prefix: 0x65 Access count: 1",
+                                      "The prefix: 0x66 Access count: 1"};
+  file_path =
+      output_path + "/test-range_delete-0-accessed_top_k_qps_prefix_cut.txt";
+  CheckFileContent(top_qps, file_path, true);
+  */
+}
+
+// Test analyzing of Iterator
+TEST_F(TraceAnalyzerTest, Iterator) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/iterator";
+  std::string file_path;
+  std::vector<std::string> paras = {
+      "-analyze_get=false",           "-analyze_put=false",
+      "-analyze_delete=false",        "-analyze_merge=false",
+      "-analyze_single_delete=false", "-analyze_range_delete=false",
+      "-analyze_iterator=true",       "-analyze_multiget=false"};
+  paras.push_back("-output_dir=" + output_path);
+  paras.push_back("-trace_path=" + trace_path);
+  paras.push_back("-key_space_dir=" + test_path_);
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // Check the output of Seek
+  // check the key_stats file
+  std::vector<std::string> k_stats = {"0 10 0 1 1.000000"};
+  file_path = output_path + "/test-iterator_Seek-0-accessed_key_stats.txt";
+  CheckFileContent(k_stats, file_path, true);
+
+  // Check the access count distribution
+  std::vector<std::string> k_dist = {"access_count: 1 num: 1"};
+  file_path =
+      output_path + "/test-iterator_Seek-0-accessed_key_count_distribution.txt";
+  CheckFileContent(k_dist, file_path, true);
+
+  // Check the trace sequence
+  std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8",
+                                         "8", "8", "8", "8", "8", "8",
+                                         "8", "8", "0", "6", "7", "0"};
+  file_path = output_path + "/test-human_readable_trace.txt";
+  CheckFileContent(k_sequence, file_path, false);
+
+  // Check the prefix
+  std::vector<std::string> k_prefix = {"0 0 0 0.000000 0.000000 0x30"};
+  file_path = output_path + "/test-iterator_Seek-0-accessed_key_prefix_cut.txt";
+  CheckFileContent(k_prefix, file_path, true);
+
+  // Check the time series
+  std::vector<std::string> k_series = {"6 1 0"};
+  file_path = output_path + "/test-iterator_Seek-0-time_series.txt";
+  CheckFileContent(k_series, file_path, false);
+
+  // Check the accessed key in whole key space
+  std::vector<std::string> k_whole_access = {"0 1"};
+  file_path = output_path + "/test-iterator_Seek-0-whole_key_stats.txt";
+  CheckFileContent(k_whole_access, file_path, true);
+
+  // Check the whole key prefix cut
+  std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63",
+                                             "3 0x64", "4 0x65", "5 0x66"};
+  file_path = output_path + "/test-iterator_Seek-0-whole_key_prefix_cut.txt";
+  CheckFileContent(k_whole_prefix, file_path, true);
+
+  /*
+  // Check the overall qps
+  std::vector<std::string> all_qps = {"0 0 0 0 0 0 1 1 0 2"};
+  file_path = output_path + "/test-qps_stats.txt";
+  CheckFileContent(all_qps, file_path, true);
+
+  // Check the qps of Iterator_Seek
+  std::vector<std::string> get_qps = {"1"};
+  file_path = output_path + "/test-iterator_Seek-0-qps_stats.txt";
+  CheckFileContent(get_qps, file_path, true);
+
+  // Check the top k qps prefix cut
+  std::vector<std::string> top_qps = {"At time: 0 with QPS: 1",
+                                      "The prefix: 0x61 Access count: 1"};
+  file_path =
+      output_path + "/test-iterator_Seek-0-accessed_top_k_qps_prefix_cut.txt";
+  CheckFileContent(top_qps, file_path, true);
+  */
+
+  // Check the output of SeekForPrev
+  // check the key_stats file
+  k_stats = {"0 10 0 1 1.000000"};
+  file_path =
+      output_path + "/test-iterator_SeekForPrev-0-accessed_key_stats.txt";
+  CheckFileContent(k_stats, file_path, true);
+
+  // Check the access count distribution
+  k_dist = {"access_count: 1 num: 1"};
+  file_path =
+      output_path +
+      "/test-iterator_SeekForPrev-0-accessed_key_count_distribution.txt";
+  CheckFileContent(k_dist, file_path, true);
+
+  // Check the prefix
+  k_prefix = {"0 0 0 0.000000 0.000000 0x30"};
+  file_path =
+      output_path + "/test-iterator_SeekForPrev-0-accessed_key_prefix_cut.txt";
+  CheckFileContent(k_prefix, file_path, true);
+
+  // Check the time series
+  k_series = {"7 0 0"};
+  file_path = output_path + "/test-iterator_SeekForPrev-0-time_series.txt";
+  CheckFileContent(k_series, file_path, false);
+
+  // Check the accessed key in whole key space
+  k_whole_access = {"1 1"};
+  file_path = output_path + "/test-iterator_SeekForPrev-0-whole_key_stats.txt";
+  CheckFileContent(k_whole_access, file_path, true);
+
+  // Check the whole key prefix cut
+  k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63", "3 0x64", "4 0x65", "5 0x66"};
+  file_path =
+      output_path + "/test-iterator_SeekForPrev-0-whole_key_prefix_cut.txt";
+  CheckFileContent(k_whole_prefix, file_path, true);
+
+  /*
+  // Check the qps of Iterator_SeekForPrev
+  get_qps = {"1"};
+  file_path = output_path + "/test-iterator_SeekForPrev-0-qps_stats.txt";
+  CheckFileContent(get_qps, file_path, true);
+
+  // Check the top k qps prefix cut
+  top_qps = {"At time: 0 with QPS: 1", "The prefix: 0x62 Access count: 1"};
+  file_path = output_path +
+              "/test-iterator_SeekForPrev-0-accessed_top_k_qps_prefix_cut.txt";
+  CheckFileContent(top_qps, file_path, true);
+  */
+}
+
+// Test analyzing of multiget
+TEST_F(TraceAnalyzerTest, MultiGet) {
+  std::string trace_path = test_path_ + "/trace";
+  std::string output_path = test_path_ + "/multiget";
+  std::string file_path;
+  std::vector<std::string> paras = {
+      "-analyze_get=false",           "-analyze_put=false",
+      "-analyze_delete=false",        "-analyze_merge=false",
+      "-analyze_single_delete=false", "-analyze_range_delete=true",
+      "-analyze_iterator=false",      "-analyze_multiget=true"};
+  paras.push_back("-output_dir=" + output_path);
+  paras.push_back("-trace_path=" + trace_path);
+  paras.push_back("-key_space_dir=" + test_path_);
+  AnalyzeTrace(paras, output_path, trace_path);
+
+  // check the key_stats file
+  std::vector<std::string> k_stats = {"0 10 0 2 1.000000", "0 10 1 2 1.000000",
+                                      "0 10 2 1 1.000000", "0 10 3 2 1.000000",
+                                      "0 10 4 2 1.000000"};
+  file_path = output_path + "/test-multiget-0-accessed_key_stats.txt";
+  CheckFileContent(k_stats, file_path, true);
+
+  // Check the access count distribution
+  std::vector<std::string> k_dist = {"access_count: 1 num: 1",
+                                     "access_count: 2 num: 4"};
+  file_path =
+      output_path + "/test-multiget-0-accessed_key_count_distribution.txt";
+  CheckFileContent(k_dist, file_path, true);
+
+  // Check the trace sequence
+  std::vector<std::string> k_sequence = {"1", "5", "2", "3", "4", "8",
+                                         "8", "8", "8", "8", "8", "8",
+                                         "8", "8", "0", "6", "7", "0"};
+  file_path = output_path + "/test-human_readable_trace.txt";
+  CheckFileContent(k_sequence, file_path, false);
+
+  // Check the prefix
+  std::vector<std::string> k_prefix = {
+      "0 0 0 0.000000 0.000000 0x30", "1 2 1 2.000000 1.000000 0x61",
+      "2 2 1 2.000000 1.000000 0x62", "3 1 1 1.000000 1.000000 0x64",
+      "4 2 1 2.000000 1.000000 0x67"};
+  file_path = output_path + "/test-multiget-0-accessed_key_prefix_cut.txt";
+  CheckFileContent(k_prefix, file_path, true);
+
+  // Check the time series
+  std::vector<std::string> k_series = {"8 0 0", "8 0 1", "8 0 2",
+                                       "8 0 3", "8 0 4", "8 0 0",
+                                       "8 0 1", "8 0 3", "8 0 4"};
+  file_path = output_path + "/test-multiget-0-time_series.txt";
+  CheckFileContent(k_series, file_path, false);
+
+  // Check the accessed key in whole key space
+  std::vector<std::string> k_whole_access = {"0 2", "1 2"};
+  file_path = output_path + "/test-multiget-0-whole_key_stats.txt";
+  CheckFileContent(k_whole_access, file_path, true);
+
+  // Check the whole key prefix cut
+  std::vector<std::string> k_whole_prefix = {"0 0x61", "1 0x62", "2 0x63",
+                                             "3 0x64", "4 0x65", "5 0x66"};
+  file_path = output_path + "/test-multiget-0-whole_key_prefix_cut.txt";
+  CheckFileContent(k_whole_prefix, file_path, true);
+
+  /*
+  // Check the overall qps. We have 3 MultiGet queries and it requested 9 keys
+  // in total
+  std::vector<std::string> all_qps = {"0 0 0 0 2 0 0 0 9 11"};
+  file_path = output_path + "/test-qps_stats.txt";
+  CheckFileContent(all_qps, file_path, true);
+
+  // Check the qps of DeleteRange
+  std::vector<std::string> get_qps = {"9"};
+  file_path = output_path + "/test-multiget-0-qps_stats.txt";
+  CheckFileContent(get_qps, file_path, true);
+
+  // Check the top k qps prefix cut
+  std::vector<std::string> top_qps = {
+      "At time: 0 with QPS: 9",           "The prefix: 0x61 Access count: 2",
+      "The prefix: 0x62 Access count: 2", "The prefix: 0x64 Access count: 1",
+      "The prefix: 0x67 Access count: 2", "The prefix: 0x68 Access count: 2"};
+  file_path =
+      output_path + "/test-multiget-0-accessed_top_k_qps_prefix_cut.txt";
+  CheckFileContent(top_qps, file_path, true);
+  */
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+#endif  // GFLAG
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "Trace_analyzer test is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE  return RUN_ALL_TESTS();
diff --git a/src/rocksdb/tools/trace_analyzer_tool.cc b/src/rocksdb/tools/trace_analyzer_tool.cc
new file mode 100644
index 000000000..5a6d67864
--- /dev/null
+++ b/src/rocksdb/tools/trace_analyzer_tool.cc
@@ -0,0 +1,1925 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#ifdef GFLAGS
+#ifdef NUMA
+#include <numa.h>
+#endif
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+
+#include "db/db_impl/db_impl.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "env/composite_env_wrapper.h"
+#include "file/line_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "rocksdb/write_batch.h"
+#include "table/meta_blocks.h"
+#include "table/table_reader.h"
+#include "tools/trace_analyzer_tool.h"
+#include "trace_replay/trace_replay.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_string(trace_path, "", "The trace file path.");
+DEFINE_string(output_dir, "", "The directory to store the output files.");
+DEFINE_string(output_prefix, "trace",
+              "The prefix used for all the output files.");
+DEFINE_bool(output_key_stats, false,
+            "Output the key access count statistics to file\n"
+            "for accessed keys:\n"
+            "file name: <prefix>-<query_type>-<cf_id>-accessed_key_stats.txt\n"
+            "Format:[cf_id value_size access_keyid access_count]\n"
+            "for the whole key space keys:\n"
+            "File name: <prefix>-<query_type>-<cf_id>-whole_key_stats.txt\n"
+            "Format:[whole_key_space_keyid access_count]");
+DEFINE_bool(output_access_count_stats, false,
+            "Output the access count distribution statistics to file.\n"
+            "File name:  <prefix>-<query_type>-<cf_id>-accessed_"
+            "key_count_distribution.txt \n"
+            "Format:[access_count number_of_access_count]");
+DEFINE_bool(output_time_series, false,
+            "Output the access time in second of each key, "
+            "such that we can have the time series data of the queries \n"
+            "File name: <prefix>-<query_type>-<cf_id>-time_series.txt\n"
+            "Format:[type_id time_in_sec access_keyid].");
+DEFINE_bool(try_process_corrupted_trace, false,
+            "In default, trace_analyzer will exit if the trace file is "
+            "corrupted due to the unexpected tracing cases. If this option "
+            "is enabled, trace_analyzer will stop reading the trace file, "
+            "and start analyzing the read-in data.");
+DEFINE_int32(output_prefix_cut, 0,
+             "The number of bytes as prefix to cut the keys.\n"
+             "If it is enabled, it will generate the following:\n"
+             "For accessed keys:\n"
+             "File name: <prefix>-<query_type>-<cf_id>-"
+             "accessed_key_prefix_cut.txt \n"
+             "Format:[acessed_keyid access_count_of_prefix "
+             "number_of_keys_in_prefix average_key_access "
+             "prefix_succ_ratio prefix]\n"
+             "For whole key space keys:\n"
+             "File name: <prefix>-<query_type>-<cf_id>"
+             "-whole_key_prefix_cut.txt\n"
+             "Format:[start_keyid_in_whole_keyspace prefix]\n"
+             "if 'output_qps_stats' and 'top_k' are enabled, it will output:\n"
+             "File name: <prefix>-<query_type>-<cf_id>"
+             "-accessed_top_k_qps_prefix_cut.txt\n"
+             "Format:[the_top_ith_qps_time QPS], [prefix qps_of_this_second].");
+DEFINE_bool(convert_to_human_readable_trace, false,
+            "Convert the binary trace file to a human readable txt file "
+            "for further processing. "
+            "This file will be extremely large "
+            "(similar size as the original binary trace file). "
+            "You can specify 'no_key' to reduce the size, if key is not "
+            "needed in the next step.\n"
+            "File name: <prefix>_human_readable_trace.txt\n"
+            "Format:[<key> type_id cf_id value_size time_in_micorsec].");
+DEFINE_bool(output_qps_stats, false,
+            "Output the query per second(qps) statistics \n"
+            "For the overall qps, it will contain all qps of each query type. "
+            "The time is started from the first trace record\n"
+            "File name: <prefix>_qps_stats.txt\n"
+            "Format: [qps_type_1 qps_type_2 ...... overall_qps]\n"
+            "For each cf and query, it will have its own qps output.\n"
+            "File name: <prefix>-<query_type>-<cf_id>_qps_stats.txt \n"
+            "Format:[query_count_in_this_second].");
+DEFINE_bool(no_print, false, "Do not print out any result");
+DEFINE_string(
+    print_correlation, "",
+    "intput format: [correlation pairs][.,.]\n"
+    "Output the query correlations between the pairs of query types "
+    "listed in the parameter, input should select the operations from:\n"
+    "get, put, delete, single_delete, rangle_delete, merge. No space "
+    "between the pairs separated by commar. Example: =[get,get]... "
+    "It will print out the number of pairs of 'A after B' and "
+    "the average time interval between the two query.");
+DEFINE_string(key_space_dir, "",
+              "<the directory stores full key space files> \n"
+              "The key space files should be: <column family id>.txt");
+DEFINE_bool(analyze_get, false, "Analyze the Get query.");
+DEFINE_bool(analyze_put, false, "Analyze the Put query.");
+DEFINE_bool(analyze_delete, false, "Analyze the Delete query.");
+DEFINE_bool(analyze_single_delete, false, "Analyze the SingleDelete query.");
+DEFINE_bool(analyze_range_delete, false, "Analyze the DeleteRange query.");
+DEFINE_bool(analyze_merge, false, "Analyze the Merge query.");
+DEFINE_bool(analyze_iterator, false,
+            " Analyze the iterate query like Seek() and SeekForPrev().");
+DEFINE_bool(analyze_multiget, false,
+            " Analyze the MultiGet query. NOTE: for"
+            " MultiGet, we analyze each KV-pair read in one MultiGet query. "
+            "Therefore, the total queries and QPS are calculated based on "
+            "the number of KV-pairs being accessed not the number of MultiGet."
+            "It can be improved in the future if needed");
+DEFINE_bool(no_key, false,
+            " Does not output the key to the result files to make smaller.");
+DEFINE_bool(print_overall_stats, true,
+            " Print the stats of the whole trace, "
+            "like total requests, keys, and etc.");
+DEFINE_bool(output_key_distribution, false, "Print the key size distribution.");
+DEFINE_bool(
+    output_value_distribution, false,
+    "Out put the value size distribution, only available for Put and Merge.\n"
+    "File name: <prefix>-<query_type>-<cf_id>"
+    "-accessed_value_size_distribution.txt\n"
+    "Format:[Number_of_value_size_between x and "
+    "x+value_interval is: <the count>]");
+DEFINE_int32(print_top_k_access, 1,
+             "<top K of the variables to be printed> "
+             "Print the top k accessed keys, top k accessed prefix "
+             "and etc.");
+DEFINE_int32(output_ignore_count, 0,
+             "<threshold>, ignores the access count <= this value, "
+             "it will shorter the output.");
+DEFINE_int32(value_interval, 8,
+             "To output the value distribution, we need to set the value "
+             "intervals and make the statistic of the value size distribution "
+             "in different intervals. The default is 8.");
+DEFINE_double(sample_ratio, 1.0,
+              "If the trace size is extremely huge or user want to sample "
+              "the trace when analyzing, sample ratio can be set (0, 1.0]");
+
+namespace ROCKSDB_NAMESPACE {
+
+const size_t kShadowValueSize = 10;
+
+std::map<std::string, int> taOptToIndex = {
+    {"get", 0},           {"put", 1},
+    {"delete", 2},        {"single_delete", 3},
+    {"range_delete", 4},  {"merge", 5},
+    {"iterator_Seek", 6}, {"iterator_SeekForPrev", 7},
+    {"multiget", 8}};
+
+std::map<int, std::string> taIndexToOpt = {
+    {0, "get"},           {1, "put"},
+    {2, "delete"},        {3, "single_delete"},
+    {4, "range_delete"},  {5, "merge"},
+    {6, "iterator_Seek"}, {7, "iterator_SeekForPrev"},
+    {8, "multiget"}};
+
+namespace {
+
+uint64_t MultiplyCheckOverflow(uint64_t op1, uint64_t op2) {
+  if (op1 == 0 || op2 == 0) {
+    return 0;
+  }
+  if (std::numeric_limits<uint64_t>::max() / op1 < op2) {
+    return op1;
+  }
+  return (op1 * op2);
+}
+
+}  // namespace
+
+// The default constructor of AnalyzerOptions
+AnalyzerOptions::AnalyzerOptions()
+    : correlation_map(kTaTypeNum, std::vector<int>(kTaTypeNum, -1)) {}
+
+AnalyzerOptions::~AnalyzerOptions() {}
+
+void AnalyzerOptions::SparseCorrelationInput(const std::string& in_str) {
+  std::string cur = in_str;
+  if (cur.size() == 0) {
+    return;
+  }
+  while (!cur.empty()) {
+    if (cur.compare(0, 1, "[") != 0) {
+      fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str());
+      exit(1);
+    }
+    std::string opt1, opt2;
+    std::size_t split = cur.find_first_of(",");
+    if (split != std::string::npos) {
+      opt1 = cur.substr(1, split - 1);
+    } else {
+      fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str());
+      exit(1);
+    }
+    std::size_t end = cur.find_first_of("]");
+    if (end != std::string::npos) {
+      opt2 = cur.substr(split + 1, end - split - 1);
+    } else {
+      fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str());
+      exit(1);
+    }
+    cur = cur.substr(end + 1);
+
+    if (taOptToIndex.find(opt1) != taOptToIndex.end() &&
+        taOptToIndex.find(opt2) != taOptToIndex.end()) {
+      correlation_list.push_back(
+          std::make_pair(taOptToIndex[opt1], taOptToIndex[opt2]));
+    } else {
+      fprintf(stderr, "Invalid correlation input: %s\n", in_str.c_str());
+      exit(1);
+    }
+  }
+
+  int sequence = 0;
+  for (auto& it : correlation_list) {
+    correlation_map[it.first][it.second] = sequence;
+    sequence++;
+  }
+  return;
+}
+
+// The trace statistic struct constructor
+TraceStats::TraceStats() {
+  cf_id = 0;
+  cf_name = "0";
+  a_count = 0;
+  a_key_id = 0;
+  a_key_size_sqsum = 0;
+  a_key_size_sum = 0;
+  a_key_mid = 0;
+  a_value_size_sqsum = 0;
+  a_value_size_sum = 0;
+  a_value_mid = 0;
+  a_peak_qps = 0;
+  a_ave_qps = 0.0;
+}
+
+TraceStats::~TraceStats() {}
+
+// The trace analyzer constructor
+TraceAnalyzer::TraceAnalyzer(std::string& trace_path, std::string& output_path,
+                             AnalyzerOptions _analyzer_opts)
+    : write_batch_ts_(0),
+      trace_name_(trace_path),
+      output_path_(output_path),
+      analyzer_opts_(_analyzer_opts) {
+  ROCKSDB_NAMESPACE::EnvOptions env_options;
+  env_ = ROCKSDB_NAMESPACE::Env::Default();
+  offset_ = 0;
+  total_requests_ = 0;
+  total_access_keys_ = 0;
+  total_gets_ = 0;
+  total_writes_ = 0;
+  total_seeks_ = 0;
+  total_seek_prevs_ = 0;
+  total_multigets_ = 0;
+  trace_create_time_ = 0;
+  begin_time_ = 0;
+  end_time_ = 0;
+  time_series_start_ = 0;
+  cur_time_sec_ = 0;
+  if (FLAGS_sample_ratio > 1.0 || FLAGS_sample_ratio <= 0) {
+    sample_max_ = 1;
+  } else {
+    sample_max_ = static_cast<uint32_t>(1.0 / FLAGS_sample_ratio);
+  }
+
+  ta_.resize(kTaTypeNum);
+  ta_[0].type_name = "get";
+  if (FLAGS_analyze_get) {
+    ta_[0].enabled = true;
+  } else {
+    ta_[0].enabled = false;
+  }
+  ta_[1].type_name = "put";
+  if (FLAGS_analyze_put) {
+    ta_[1].enabled = true;
+  } else {
+    ta_[1].enabled = false;
+  }
+  ta_[2].type_name = "delete";
+  if (FLAGS_analyze_delete) {
+    ta_[2].enabled = true;
+  } else {
+    ta_[2].enabled = false;
+  }
+  ta_[3].type_name = "single_delete";
+  if (FLAGS_analyze_single_delete) {
+    ta_[3].enabled = true;
+  } else {
+    ta_[3].enabled = false;
+  }
+  ta_[4].type_name = "range_delete";
+  if (FLAGS_analyze_range_delete) {
+    ta_[4].enabled = true;
+  } else {
+    ta_[4].enabled = false;
+  }
+  ta_[5].type_name = "merge";
+  if (FLAGS_analyze_merge) {
+    ta_[5].enabled = true;
+  } else {
+    ta_[5].enabled = false;
+  }
+  ta_[6].type_name = "iterator_Seek";
+  if (FLAGS_analyze_iterator) {
+    ta_[6].enabled = true;
+  } else {
+    ta_[6].enabled = false;
+  }
+  ta_[7].type_name = "iterator_SeekForPrev";
+  if (FLAGS_analyze_iterator) {
+    ta_[7].enabled = true;
+  } else {
+    ta_[7].enabled = false;
+  }
+  ta_[8].type_name = "multiget";
+  if (FLAGS_analyze_multiget) {
+    ta_[8].enabled = true;
+  } else {
+    ta_[8].enabled = false;
+  }
+  for (int i = 0; i < kTaTypeNum; i++) {
+    ta_[i].sample_count = 0;
+  }
+}
+
+TraceAnalyzer::~TraceAnalyzer() {}
+
+// Prepare the processing
+// Initiate the global trace reader and writer here
+Status TraceAnalyzer::PrepareProcessing() {
+  Status s;
+  // Prepare the trace reader
+  if (trace_reader_ == nullptr) {
+    s = NewFileTraceReader(env_, env_options_, trace_name_, &trace_reader_);
+  } else {
+    s = trace_reader_->Reset();
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Prepare and open the trace sequence file writer if needed
+  if (FLAGS_convert_to_human_readable_trace) {
+    std::string trace_sequence_name;
+    trace_sequence_name =
+        output_path_ + "/" + FLAGS_output_prefix + "-human_readable_trace.txt";
+    s = env_->NewWritableFile(trace_sequence_name, &trace_sequence_f_,
+                              env_options_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // prepare the general QPS file writer
+  if (FLAGS_output_qps_stats) {
+    std::string qps_stats_name;
+    qps_stats_name =
+        output_path_ + "/" + FLAGS_output_prefix + "-qps_stats.txt";
+    s = env_->NewWritableFile(qps_stats_name, &qps_f_, env_options_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    qps_stats_name =
+        output_path_ + "/" + FLAGS_output_prefix + "-cf_qps_stats.txt";
+    s = env_->NewWritableFile(qps_stats_name, &cf_qps_f_, env_options_);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
+Status TraceAnalyzer::ReadTraceHeader(Trace* header) {
+  assert(header != nullptr);
+  std::string encoded_trace;
+  // Read the trace head
+  Status s = trace_reader_->Read(&encoded_trace);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = TracerHelper::DecodeTrace(encoded_trace, header);
+
+  if (header->type != kTraceBegin) {
+    return Status::Corruption("Corrupted trace file. Incorrect header.");
+  }
+  if (header->payload.substr(0, kTraceMagic.length()) != kTraceMagic) {
+    return Status::Corruption("Corrupted trace file. Incorrect magic.");
+  }
+
+  return s;
+}
+
+Status TraceAnalyzer::ReadTraceFooter(Trace* footer) {
+  assert(footer != nullptr);
+  Status s = ReadTraceRecord(footer);
+  if (!s.ok()) {
+    return s;
+  }
+  if (footer->type != kTraceEnd) {
+    return Status::Corruption("Corrupted trace file. Incorrect footer.");
+  }
+  return s;
+}
+
+Status TraceAnalyzer::ReadTraceRecord(Trace* trace) {
+  assert(trace != nullptr);
+  std::string encoded_trace;
+  Status s = trace_reader_->Read(&encoded_trace);
+  if (!s.ok()) {
+    return s;
+  }
+  return TracerHelper::DecodeTrace(encoded_trace, trace);
+}
+
+// process the trace itself and redirect the trace content
+// to different operation type handler. With different race
+// format, this function can be changed
+Status TraceAnalyzer::StartProcessing() {
+  Status s;
+  Trace header;
+  s = ReadTraceHeader(&header);
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot read the header\n");
+    return s;
+  }
+  // Set the default trace file version as version 0.2
+  int trace_file_version = 2;
+  s = TracerHelper::ParseTraceHeader(header, &trace_file_version, &db_version_);
+  if (!s.ok()) {
+    return s;
+  }
+  trace_create_time_ = header.ts;
+  if (FLAGS_output_time_series) {
+    time_series_start_ = header.ts;
+  }
+
+  Trace trace;
+  std::unique_ptr<TraceRecord> record;
+  while (s.ok()) {
+    trace.reset();
+    s = ReadTraceRecord(&trace);
+    if (!s.ok()) {
+      break;
+    }
+
+    end_time_ = trace.ts;
+    if (trace.type == kTraceEnd) {
+      break;
+    }
+    // Do not count TraceEnd (if there is one)
+    total_requests_++;
+
+    s = TracerHelper::DecodeTraceRecord(&trace, trace_file_version, &record);
+    if (s.IsNotSupported()) {
+      continue;
+    }
+    if (!s.ok()) {
+      return s;
+    }
+    s = record->Accept(this, nullptr);
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot process the TraceRecord\n");
+      return s;
+    }
+  }
+  if (s.IsIncomplete()) {
+    // Fix it: Reaching eof returns Incomplete status at the moment.
+    return Status::OK();
+  }
+  return s;
+}
+
+// After the trace is processed by StartProcessing, the statistic data
+// is stored in the map or other in memory data structures. To get the
+// other statistic result such as key size distribution, value size
+// distribution, these data structures are re-processed here.
+Status TraceAnalyzer::MakeStatistics() {
+  int ret;
+  Status s;
+  for (int type = 0; type < kTaTypeNum; type++) {
+    if (!ta_[type].enabled) {
+      continue;
+    }
+    for (auto& stat : ta_[type].stats) {
+      stat.second.a_key_id = 0;
+      for (auto& record : stat.second.a_key_stats) {
+        record.second.key_id = stat.second.a_key_id;
+        stat.second.a_key_id++;
+        if (record.second.access_count <=
+            static_cast<uint64_t>(FLAGS_output_ignore_count)) {
+          continue;
+        }
+
+        // Generate the key access count distribution data
+        if (FLAGS_output_access_count_stats) {
+          if (stat.second.a_count_stats.find(record.second.access_count) ==
+              stat.second.a_count_stats.end()) {
+            stat.second.a_count_stats[record.second.access_count] = 1;
+          } else {
+            stat.second.a_count_stats[record.second.access_count]++;
+          }
+        }
+
+        // Generate the key size distribution data
+        if (FLAGS_output_key_distribution) {
+          if (stat.second.a_key_size_stats.find(record.first.size()) ==
+              stat.second.a_key_size_stats.end()) {
+            stat.second.a_key_size_stats[record.first.size()] = 1;
+          } else {
+            stat.second.a_key_size_stats[record.first.size()]++;
+          }
+        }
+
+        if (!FLAGS_print_correlation.empty()) {
+          s = MakeStatisticCorrelation(stat.second, record.second);
+          if (!s.ok()) {
+            return s;
+          }
+        }
+      }
+
+      // Output the prefix cut or the whole content of the accessed key space
+      if (FLAGS_output_key_stats || FLAGS_output_prefix_cut > 0) {
+        s = MakeStatisticKeyStatsOrPrefix(stat.second);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+
+      // output the access count distribution
+      if (FLAGS_output_access_count_stats && stat.second.a_count_dist_f) {
+        for (auto& record : stat.second.a_count_stats) {
+          ret = snprintf(buffer_, sizeof(buffer_),
+                         "access_count: %" PRIu64 " num: %" PRIu64 "\n",
+                         record.first, record.second);
+          if (ret < 0) {
+            return Status::IOError("Format the output failed");
+          }
+          std::string printout(buffer_);
+          s = stat.second.a_count_dist_f->Append(printout);
+          if (!s.ok()) {
+            fprintf(stderr, "Write access count distribution file failed\n");
+            return s;
+          }
+        }
+      }
+
+      // find the medium of the key size
+      uint64_t k_count = 0;
+      bool get_mid = false;
+      for (auto& record : stat.second.a_key_size_stats) {
+        k_count += record.second;
+        if (!get_mid && k_count >= stat.second.a_key_mid) {
+          stat.second.a_key_mid = record.first;
+          get_mid = true;
+        }
+        if (FLAGS_output_key_distribution && stat.second.a_key_size_f) {
+          ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %" PRIu64 "\n",
+                         record.first, record.second);
+          if (ret < 0) {
+            return Status::IOError("Format output failed");
+          }
+          std::string printout(buffer_);
+          s = stat.second.a_key_size_f->Append(printout);
+          if (!s.ok()) {
+            fprintf(stderr, "Write key size distribution file failed\n");
+            return s;
+          }
+        }
+      }
+
+      // output the value size distribution
+      uint64_t v_begin = 0, v_end = 0, v_count = 0;
+      get_mid = false;
+      for (auto& record : stat.second.a_value_size_stats) {
+        v_begin = v_end;
+        v_end = (record.first + 1) * FLAGS_value_interval;
+        v_count += record.second;
+        if (!get_mid && v_count >= stat.second.a_count / 2) {
+          stat.second.a_value_mid = (v_begin + v_end) / 2;
+          get_mid = true;
+        }
+        if (FLAGS_output_value_distribution && stat.second.a_value_size_f &&
+            (type == TraceOperationType::kPut ||
+             type == TraceOperationType::kMerge)) {
+          ret = snprintf(buffer_, sizeof(buffer_),
+                         "Number_of_value_size_between %" PRIu64 " and %" PRIu64
+                         " is: %" PRIu64 "\n",
+                         v_begin, v_end, record.second);
+          if (ret < 0) {
+            return Status::IOError("Format output failed");
+          }
+          std::string printout(buffer_);
+          s = stat.second.a_value_size_f->Append(printout);
+          if (!s.ok()) {
+            fprintf(stderr, "Write value size distribution file failed\n");
+            return s;
+          }
+        }
+      }
+    }
+  }
+
+  // Make the QPS statistics
+  if (FLAGS_output_qps_stats) {
+    s = MakeStatisticQPS();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+// Process the statistics of the key access and
+// prefix of the accessed keys if required
+Status TraceAnalyzer::MakeStatisticKeyStatsOrPrefix(TraceStats& stats) {
+  int ret;
+  Status s;
+  std::string prefix = "0";
+  uint64_t prefix_access = 0;
+  uint64_t prefix_count = 0;
+  uint64_t prefix_succ_access = 0;
+  double prefix_ave_access = 0.0;
+  stats.a_succ_count = 0;
+  for (auto& record : stats.a_key_stats) {
+    // write the key access statistic file
+    if (!stats.a_key_f) {
+      return Status::IOError("Failed to open accessed_key_stats file.");
+    }
+    stats.a_succ_count += record.second.succ_count;
+    double succ_ratio = 0.0;
+    if (record.second.access_count > 0) {
+      succ_ratio = (static_cast<double>(record.second.succ_count)) /
+                   record.second.access_count;
+    }
+    ret = snprintf(buffer_, sizeof(buffer_),
+                   "%u %zu %" PRIu64 " %" PRIu64 " %f\n", record.second.cf_id,
+                   record.second.value_size, record.second.key_id,
+                   record.second.access_count, succ_ratio);
+    if (ret < 0) {
+      return Status::IOError("Format output failed");
+    }
+    std::string printout(buffer_);
+    s = stats.a_key_f->Append(printout);
+    if (!s.ok()) {
+      fprintf(stderr, "Write key access file failed\n");
+      return s;
+    }
+
+    // write the prefix cut of the accessed keys
+    if (FLAGS_output_prefix_cut > 0 && stats.a_prefix_cut_f) {
+      if (record.first.compare(0, FLAGS_output_prefix_cut, prefix) != 0) {
+        std::string prefix_out =
+            ROCKSDB_NAMESPACE::LDBCommand::StringToHex(prefix);
+        if (prefix_count == 0) {
+          prefix_ave_access = 0.0;
+        } else {
+          prefix_ave_access =
+              (static_cast<double>(prefix_access)) / prefix_count;
+        }
+        double prefix_succ_ratio = 0.0;
+        if (prefix_access > 0) {
+          prefix_succ_ratio =
+              (static_cast<double>(prefix_succ_access)) / prefix_access;
+        }
+        ret =
+            snprintf(buffer_, sizeof(buffer_),
+                     "%" PRIu64 " %" PRIu64 " %" PRIu64 " %f %f %s\n",
+                     record.second.key_id, prefix_access, prefix_count,
+                     prefix_ave_access, prefix_succ_ratio, prefix_out.c_str());
+        if (ret < 0) {
+          return Status::IOError("Format output failed");
+        }
+        std::string pout(buffer_);
+        s = stats.a_prefix_cut_f->Append(pout);
+        if (!s.ok()) {
+          fprintf(stderr, "Write accessed key prefix file failed\n");
+          return s;
+        }
+
+        // make the top k statistic for the prefix
+        if (static_cast<int32_t>(stats.top_k_prefix_access.size()) <
+            FLAGS_print_top_k_access) {
+          stats.top_k_prefix_access.push(
+              std::make_pair(prefix_access, prefix_out));
+        } else {
+          if (prefix_access > stats.top_k_prefix_access.top().first) {
+            stats.top_k_prefix_access.pop();
+            stats.top_k_prefix_access.push(
+                std::make_pair(prefix_access, prefix_out));
+          }
+        }
+
+        if (static_cast<int32_t>(stats.top_k_prefix_ave.size()) <
+            FLAGS_print_top_k_access) {
+          stats.top_k_prefix_ave.push(
+              std::make_pair(prefix_ave_access, prefix_out));
+        } else {
+          if (prefix_ave_access > stats.top_k_prefix_ave.top().first) {
+            stats.top_k_prefix_ave.pop();
+            stats.top_k_prefix_ave.push(
+                std::make_pair(prefix_ave_access, prefix_out));
+          }
+        }
+
+        prefix = record.first.substr(0, FLAGS_output_prefix_cut);
+        prefix_access = 0;
+        prefix_count = 0;
+        prefix_succ_access = 0;
+      }
+      prefix_access += record.second.access_count;
+      prefix_count += 1;
+      prefix_succ_access += record.second.succ_count;
+    }
+  }
+  return Status::OK();
+}
+
+// Process the statistics of different query type
+// correlations
+Status TraceAnalyzer::MakeStatisticCorrelation(TraceStats& stats,
+                                               StatsUnit& unit) {
+  if (stats.correlation_output.size() !=
+      analyzer_opts_.correlation_list.size()) {
+    return Status::Corruption("Cannot make the statistic of correlation.");
+  }
+
+  for (int i = 0; i < static_cast<int>(analyzer_opts_.correlation_list.size());
+       i++) {
+    if (i >= static_cast<int>(stats.correlation_output.size()) ||
+        i >= static_cast<int>(unit.v_correlation.size())) {
+      break;
+    }
+    stats.correlation_output[i].first += unit.v_correlation[i].count;
+    stats.correlation_output[i].second += unit.v_correlation[i].total_ts;
+  }
+  return Status::OK();
+}
+
+// Process the statistics of QPS
+Status TraceAnalyzer::MakeStatisticQPS() {
+  if (begin_time_ == 0) {
+    begin_time_ = trace_create_time_;
+  }
+  uint32_t duration =
+      static_cast<uint32_t>((end_time_ - begin_time_) / 1000000);
+  int ret;
+  Status s;
+  std::vector<std::vector<uint32_t>> type_qps(
+      duration, std::vector<uint32_t>(kTaTypeNum + 1, 0));
+  std::vector<uint64_t> qps_sum(kTaTypeNum + 1, 0);
+  std::vector<uint32_t> qps_peak(kTaTypeNum + 1, 0);
+  qps_ave_.resize(kTaTypeNum + 1);
+
+  for (int type = 0; type < kTaTypeNum; type++) {
+    if (!ta_[type].enabled) {
+      continue;
+    }
+    for (auto& stat : ta_[type].stats) {
+      uint32_t time_line = 0;
+      uint64_t cf_qps_sum = 0;
+      for (auto& time_it : stat.second.a_qps_stats) {
+        if (time_it.first >= duration) {
+          continue;
+        }
+        type_qps[time_it.first][kTaTypeNum] += time_it.second;
+        type_qps[time_it.first][type] += time_it.second;
+        cf_qps_sum += time_it.second;
+        if (time_it.second > stat.second.a_peak_qps) {
+          stat.second.a_peak_qps = time_it.second;
+        }
+        if (stat.second.a_qps_f) {
+          while (time_line < time_it.first) {
+            ret = snprintf(buffer_, sizeof(buffer_), "%u\n", 0);
+            if (ret < 0) {
+              return Status::IOError("Format the output failed");
+            }
+            std::string printout(buffer_);
+            s = stat.second.a_qps_f->Append(printout);
+            if (!s.ok()) {
+              fprintf(stderr, "Write QPS file failed\n");
+              return s;
+            }
+            time_line++;
+          }
+          ret = snprintf(buffer_, sizeof(buffer_), "%u\n", time_it.second);
+          if (ret < 0) {
+            return Status::IOError("Format the output failed");
+          }
+          std::string printout(buffer_);
+          s = stat.second.a_qps_f->Append(printout);
+          if (!s.ok()) {
+            fprintf(stderr, "Write QPS file failed\n");
+            return s;
+          }
+          if (time_line == time_it.first) {
+            time_line++;
+          }
+        }
+
+        // Process the top k QPS peaks
+        if (FLAGS_output_prefix_cut > 0) {
+          if (static_cast<int32_t>(stat.second.top_k_qps_sec.size()) <
+              FLAGS_print_top_k_access) {
+            stat.second.top_k_qps_sec.push(
+                std::make_pair(time_it.second, time_it.first));
+          } else {
+            if (stat.second.top_k_qps_sec.size() > 0 &&
+                stat.second.top_k_qps_sec.top().first < time_it.second) {
+              stat.second.top_k_qps_sec.pop();
+              stat.second.top_k_qps_sec.push(
+                  std::make_pair(time_it.second, time_it.first));
+            }
+          }
+        }
+      }
+      if (duration == 0) {
+        stat.second.a_ave_qps = 0;
+      } else {
+        stat.second.a_ave_qps = (static_cast<double>(cf_qps_sum)) / duration;
+      }
+
+      // Output the accessed unique key number change overtime
+      if (stat.second.a_key_num_f) {
+        uint64_t cur_uni_key =
+            static_cast<uint64_t>(stat.second.a_key_stats.size());
+        double cur_ratio = 0.0;
+        uint64_t cur_num = 0;
+        for (uint32_t i = 0; i < duration; i++) {
+          auto find_time = stat.second.uni_key_num.find(i);
+          if (find_time != stat.second.uni_key_num.end()) {
+            cur_ratio = (static_cast<double>(find_time->second)) / cur_uni_key;
+            cur_num = find_time->second;
+          }
+          ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %.12f\n",
+                         cur_num, cur_ratio);
+          if (ret < 0) {
+            return Status::IOError("Format the output failed");
+          }
+          std::string printout(buffer_);
+          s = stat.second.a_key_num_f->Append(printout);
+          if (!s.ok()) {
+            fprintf(stderr,
+                    "Write accessed unique key number change file failed\n");
+            return s;
+          }
+        }
+      }
+
+      // output the prefix of top k access peak
+      if (FLAGS_output_prefix_cut > 0 && stat.second.a_top_qps_prefix_f) {
+        while (!stat.second.top_k_qps_sec.empty()) {
+          ret = snprintf(buffer_, sizeof(buffer_), "At time: %u with QPS: %u\n",
+                         stat.second.top_k_qps_sec.top().second,
+                         stat.second.top_k_qps_sec.top().first);
+          if (ret < 0) {
+            return Status::IOError("Format the output failed");
+          }
+          std::string printout(buffer_);
+          s = stat.second.a_top_qps_prefix_f->Append(printout);
+          if (!s.ok()) {
+            fprintf(stderr, "Write prefix QPS top K file failed\n");
+            return s;
+          }
+          uint32_t qps_time = stat.second.top_k_qps_sec.top().second;
+          stat.second.top_k_qps_sec.pop();
+          if (stat.second.a_qps_prefix_stats.find(qps_time) !=
+              stat.second.a_qps_prefix_stats.end()) {
+            for (auto& qps_prefix : stat.second.a_qps_prefix_stats[qps_time]) {
+              std::string qps_prefix_out =
+                  ROCKSDB_NAMESPACE::LDBCommand::StringToHex(qps_prefix.first);
+              ret = snprintf(buffer_, sizeof(buffer_),
+                             "The prefix: %s Access count: %u\n",
+                             qps_prefix_out.c_str(), qps_prefix.second);
+              if (ret < 0) {
+                return Status::IOError("Format the output failed");
+              }
+              std::string pout(buffer_);
+              s = stat.second.a_top_qps_prefix_f->Append(pout);
+              if (!s.ok()) {
+                fprintf(stderr, "Write prefix QPS top K file failed\n");
+                return s;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (qps_f_) {
+    for (uint32_t i = 0; i < duration; i++) {
+      for (int type = 0; type <= kTaTypeNum; type++) {
+        if (type < kTaTypeNum) {
+          ret = snprintf(buffer_, sizeof(buffer_), "%u ", type_qps[i][type]);
+        } else {
+          ret = snprintf(buffer_, sizeof(buffer_), "%u\n", type_qps[i][type]);
+        }
+        if (ret < 0) {
+          return Status::IOError("Format the output failed");
+        }
+        std::string printout(buffer_);
+        s = qps_f_->Append(printout);
+        if (!s.ok()) {
+          return s;
+        }
+        qps_sum[type] += type_qps[i][type];
+        if (type_qps[i][type] > qps_peak[type]) {
+          qps_peak[type] = type_qps[i][type];
+        }
+      }
+    }
+  }
+
+  if (cf_qps_f_) {
+    int cfs_size = static_cast<uint32_t>(cfs_.size());
+    uint32_t v;
+    for (uint32_t i = 0; i < duration; i++) {
+      for (int cf = 0; cf < cfs_size; cf++) {
+        if (cfs_[cf].cf_qps.find(i) != cfs_[cf].cf_qps.end()) {
+          v = cfs_[cf].cf_qps[i];
+        } else {
+          v = 0;
+        }
+        if (cf < cfs_size - 1) {
+          ret = snprintf(buffer_, sizeof(buffer_), "%u ", v);
+        } else {
+          ret = snprintf(buffer_, sizeof(buffer_), "%u\n", v);
+        }
+        if (ret < 0) {
+          return Status::IOError("Format the output failed");
+        }
+        std::string printout(buffer_);
+        s = cf_qps_f_->Append(printout);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+    }
+  }
+
+  qps_peak_ = qps_peak;
+  for (int type = 0; type <= kTaTypeNum; type++) {
+    if (duration == 0) {
+      qps_ave_[type] = 0;
+    } else {
+      qps_ave_[type] = (static_cast<double>(qps_sum[type])) / duration;
+    }
+  }
+
+  return Status::OK();
+}
+
+// In reprocessing, if we have the whole key space
+// we can output the access count of all keys in a cf
+// we can make some statistics of the whole key space
+// also, we output the top k accessed keys here
+Status TraceAnalyzer::ReProcessing() {
+  int ret;
+  Status s;
+  for (auto& cf_it : cfs_) {
+    uint32_t cf_id = cf_it.first;
+
+    // output the time series;
+    if (FLAGS_output_time_series) {
+      for (int type = 0; type < kTaTypeNum; type++) {
+        if (!ta_[type].enabled ||
+            ta_[type].stats.find(cf_id) == ta_[type].stats.end()) {
+          continue;
+        }
+        TraceStats& stat = ta_[type].stats[cf_id];
+        if (!stat.time_series_f) {
+          fprintf(stderr, "Cannot write time_series of '%s' in '%u'\n",
+                  ta_[type].type_name.c_str(), cf_id);
+          continue;
+        }
+        while (!stat.time_series.empty()) {
+          uint64_t key_id = 0;
+          auto found = stat.a_key_stats.find(stat.time_series.front().key);
+          if (found != stat.a_key_stats.end()) {
+            key_id = found->second.key_id;
+          }
+          ret =
+              snprintf(buffer_, sizeof(buffer_), "%u %" PRIu64 " %" PRIu64 "\n",
+                       stat.time_series.front().type,
+                       stat.time_series.front().ts, key_id);
+          if (ret < 0) {
+            return Status::IOError("Format the output failed");
+          }
+          std::string printout(buffer_);
+          s = stat.time_series_f->Append(printout);
+          if (!s.ok()) {
+            fprintf(stderr, "Write time series file failed\n");
+            return s;
+          }
+          stat.time_series.pop_front();
+        }
+      }
+    }
+
+    // process the whole key space if needed
+    if (!FLAGS_key_space_dir.empty()) {
+      std::string whole_key_path =
+          FLAGS_key_space_dir + "/" + std::to_string(cf_id) + ".txt";
+      std::string input_key, get_key;
+      std::vector<std::string> prefix(kTaTypeNum);
+      std::unique_ptr<FSSequentialFile> file;
+
+      s = env_->GetFileSystem()->NewSequentialFile(
+          whole_key_path, FileOptions(env_options_), &file, nullptr);
+      if (!s.ok()) {
+        fprintf(stderr, "Cannot open the whole key space file of CF: %u\n",
+                cf_id);
+        file.reset();
+      }
+
+      if (file) {
+        size_t kTraceFileReadaheadSize = 2 * 1024 * 1024;
+        LineFileReader lf_reader(
+            std::move(file), whole_key_path,
+            kTraceFileReadaheadSize /* filereadahead_size */);
+        for (cfs_[cf_id].w_count = 0; lf_reader.ReadLine(
+                 &get_key, Env::IO_TOTAL /* rate_limiter_priority */);
+             ++cfs_[cf_id].w_count) {
+          input_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(get_key);
+          for (int type = 0; type < kTaTypeNum; type++) {
+            if (!ta_[type].enabled) {
+              continue;
+            }
+            TraceStats& stat = ta_[type].stats[cf_id];
+            if (stat.w_key_f) {
+              if (stat.a_key_stats.find(input_key) != stat.a_key_stats.end()) {
+                ret = snprintf(buffer_, sizeof(buffer_),
+                               "%" PRIu64 " %" PRIu64 "\n", cfs_[cf_id].w_count,
+                               stat.a_key_stats[input_key].access_count);
+                if (ret < 0) {
+                  return Status::IOError("Format the output failed");
+                }
+                std::string printout(buffer_);
+                s = stat.w_key_f->Append(printout);
+                if (!s.ok()) {
+                  fprintf(stderr, "Write whole key space access file failed\n");
+                  return s;
+                }
+              }
+            }
+
+            // Output the prefix cut file of the whole key space
+            if (FLAGS_output_prefix_cut > 0 && stat.w_prefix_cut_f) {
+              if (input_key.compare(0, FLAGS_output_prefix_cut, prefix[type]) !=
+                  0) {
+                prefix[type] = input_key.substr(0, FLAGS_output_prefix_cut);
+                std::string prefix_out =
+                    ROCKSDB_NAMESPACE::LDBCommand::StringToHex(prefix[type]);
+                ret = snprintf(buffer_, sizeof(buffer_), "%" PRIu64 " %s\n",
+                               cfs_[cf_id].w_count, prefix_out.c_str());
+                if (ret < 0) {
+                  return Status::IOError("Format the output failed");
+                }
+                std::string printout(buffer_);
+                s = stat.w_prefix_cut_f->Append(printout);
+                if (!s.ok()) {
+                  fprintf(stderr,
+                          "Write whole key space prefix cut file failed\n");
+                  return s;
+                }
+              }
+            }
+          }
+
+          // Make the statistics fo the key size distribution
+          if (FLAGS_output_key_distribution) {
+            if (cfs_[cf_id].w_key_size_stats.find(input_key.size()) ==
+                cfs_[cf_id].w_key_size_stats.end()) {
+              cfs_[cf_id].w_key_size_stats[input_key.size()] = 1;
+            } else {
+              cfs_[cf_id].w_key_size_stats[input_key.size()]++;
+            }
+          }
+        }
+        s = lf_reader.GetStatus();
+        if (!s.ok()) {
+          fprintf(stderr, "Read whole key space file failed\n");
+          return s;
+        }
+      }
+    }
+
+    // process the top k accessed keys
+    if (FLAGS_print_top_k_access > 0) {
+      for (int type = 0; type < kTaTypeNum; type++) {
+        if (!ta_[type].enabled ||
+            ta_[type].stats.find(cf_id) == ta_[type].stats.end()) {
+          continue;
+        }
+        TraceStats& stat = ta_[type].stats[cf_id];
+        for (auto& record : stat.a_key_stats) {
+          if (static_cast<int32_t>(stat.top_k_queue.size()) <
+              FLAGS_print_top_k_access) {
+            stat.top_k_queue.push(
+                std::make_pair(record.second.access_count, record.first));
+          } else {
+            if (record.second.access_count > stat.top_k_queue.top().first) {
+              stat.top_k_queue.pop();
+              stat.top_k_queue.push(
+                  std::make_pair(record.second.access_count, record.first));
+            }
+          }
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// End the processing, print the requested results
+Status TraceAnalyzer::EndProcessing() {
+  Status s;
+  if (trace_sequence_f_) {
+    s = trace_sequence_f_->Close();
+  }
+  if (FLAGS_no_print) {
+    return s;
+  }
+  PrintStatistics();
+  if (s.ok()) {
+    s = CloseOutputFiles();
+  }
+  return s;
+}
+
+// Insert the corresponding key statistics to the correct type
+// and correct CF, output the time-series file if needed
+Status TraceAnalyzer::KeyStatsInsertion(const uint32_t& type,
+                                        const uint32_t& cf_id,
+                                        const std::string& key,
+                                        const size_t value_size,
+                                        const uint64_t ts) {
+  Status s;
+  StatsUnit unit;
+  unit.key_id = 0;
+  unit.cf_id = cf_id;
+  unit.value_size = value_size;
+  unit.access_count = 1;
+  unit.latest_ts = ts;
+  if ((type != TraceOperationType::kGet &&
+       type != TraceOperationType::kMultiGet) ||
+      value_size > 0) {
+    unit.succ_count = 1;
+  } else {
+    unit.succ_count = 0;
+  }
+  unit.v_correlation.resize(analyzer_opts_.correlation_list.size());
+  for (int i = 0;
+       i < (static_cast<int>(analyzer_opts_.correlation_list.size())); i++) {
+    unit.v_correlation[i].count = 0;
+    unit.v_correlation[i].total_ts = 0;
+  }
+  std::string prefix;
+  if (FLAGS_output_prefix_cut > 0) {
+    prefix = key.substr(0, FLAGS_output_prefix_cut);
+  }
+
+  if (begin_time_ == 0) {
+    begin_time_ = ts;
+  }
+  uint32_t time_in_sec;
+  if (ts < begin_time_) {
+    time_in_sec = 0;
+  } else {
+    time_in_sec = static_cast<uint32_t>((ts - begin_time_) / 1000000);
+  }
+
+  uint64_t dist_value_size = value_size / FLAGS_value_interval;
+  auto found_stats = ta_[type].stats.find(cf_id);
+  if (found_stats == ta_[type].stats.end()) {
+    ta_[type].stats[cf_id].cf_id = cf_id;
+    ta_[type].stats[cf_id].cf_name = std::to_string(cf_id);
+    ta_[type].stats[cf_id].a_count = 1;
+    ta_[type].stats[cf_id].a_key_id = 0;
+    ta_[type].stats[cf_id].a_key_size_sqsum = MultiplyCheckOverflow(
+        static_cast<uint64_t>(key.size()), static_cast<uint64_t>(key.size()));
+    ta_[type].stats[cf_id].a_key_size_sum = key.size();
+    ta_[type].stats[cf_id].a_value_size_sqsum = MultiplyCheckOverflow(
+        static_cast<uint64_t>(value_size), static_cast<uint64_t>(value_size));
+    ta_[type].stats[cf_id].a_value_size_sum = value_size;
+    s = OpenStatsOutputFiles(ta_[type].type_name, ta_[type].stats[cf_id]);
+    if (!FLAGS_print_correlation.empty()) {
+      s = StatsUnitCorrelationUpdate(unit, type, ts, key);
+    }
+    ta_[type].stats[cf_id].a_key_stats[key] = unit;
+    ta_[type].stats[cf_id].a_value_size_stats[dist_value_size] = 1;
+    ta_[type].stats[cf_id].a_qps_stats[time_in_sec] = 1;
+    ta_[type].stats[cf_id].correlation_output.resize(
+        analyzer_opts_.correlation_list.size());
+    if (FLAGS_output_prefix_cut > 0) {
+      std::map<std::string, uint32_t> tmp_qps_map;
+      tmp_qps_map[prefix] = 1;
+      ta_[type].stats[cf_id].a_qps_prefix_stats[time_in_sec] = tmp_qps_map;
+    }
+    if (time_in_sec != cur_time_sec_) {
+      ta_[type].stats[cf_id].uni_key_num[cur_time_sec_] =
+          static_cast<uint64_t>(ta_[type].stats[cf_id].a_key_stats.size());
+      cur_time_sec_ = time_in_sec;
+    }
+  } else {
+    found_stats->second.a_count++;
+    found_stats->second.a_key_size_sqsum += MultiplyCheckOverflow(
+        static_cast<uint64_t>(key.size()), static_cast<uint64_t>(key.size()));
+    found_stats->second.a_key_size_sum += key.size();
+    found_stats->second.a_value_size_sqsum += MultiplyCheckOverflow(
+        static_cast<uint64_t>(value_size), static_cast<uint64_t>(value_size));
+    found_stats->second.a_value_size_sum += value_size;
+    auto found_key = found_stats->second.a_key_stats.find(key);
+    if (found_key == found_stats->second.a_key_stats.end()) {
+      found_stats->second.a_key_stats[key] = unit;
+    } else {
+      found_key->second.access_count++;
+      if (type != TraceOperationType::kGet || value_size > 0) {
+        found_key->second.succ_count++;
+      }
+      if (!FLAGS_print_correlation.empty()) {
+        s = StatsUnitCorrelationUpdate(found_key->second, type, ts, key);
+      }
+    }
+    if (time_in_sec != cur_time_sec_) {
+      found_stats->second.uni_key_num[cur_time_sec_] =
+          static_cast<uint64_t>(found_stats->second.a_key_stats.size());
+      cur_time_sec_ = time_in_sec;
+    }
+
+    auto found_value =
+        found_stats->second.a_value_size_stats.find(dist_value_size);
+    if (found_value == found_stats->second.a_value_size_stats.end()) {
+      found_stats->second.a_value_size_stats[dist_value_size] = 1;
+    } else {
+      found_value->second++;
+    }
+
+    auto found_qps = found_stats->second.a_qps_stats.find(time_in_sec);
+    if (found_qps == found_stats->second.a_qps_stats.end()) {
+      found_stats->second.a_qps_stats[time_in_sec] = 1;
+    } else {
+      found_qps->second++;
+    }
+
+    if (FLAGS_output_prefix_cut > 0) {
+      auto found_qps_prefix =
+          found_stats->second.a_qps_prefix_stats.find(time_in_sec);
+      if (found_qps_prefix == found_stats->second.a_qps_prefix_stats.end()) {
+        std::map<std::string, uint32_t> tmp_qps_map;
+        found_stats->second.a_qps_prefix_stats[time_in_sec] = tmp_qps_map;
+      }
+      if (found_stats->second.a_qps_prefix_stats[time_in_sec].find(prefix) ==
+          found_stats->second.a_qps_prefix_stats[time_in_sec].end()) {
+        found_stats->second.a_qps_prefix_stats[time_in_sec][prefix] = 1;
+      } else {
+        found_stats->second.a_qps_prefix_stats[time_in_sec][prefix]++;
+      }
+    }
+  }
+
+  if (cfs_.find(cf_id) == cfs_.end()) {
+    CfUnit cf_unit;
+    cf_unit.cf_id = cf_id;
+    cf_unit.w_count = 0;
+    cf_unit.a_count = 0;
+    cfs_[cf_id] = cf_unit;
+  }
+
+  if (FLAGS_output_qps_stats) {
+    cfs_[cf_id].cf_qps[time_in_sec]++;
+  }
+
+  if (FLAGS_output_time_series) {
+    TraceUnit trace_u;
+    trace_u.type = type;
+    trace_u.key = key;
+    trace_u.value_size = value_size;
+    trace_u.ts = (ts - time_series_start_) / 1000000;
+    trace_u.cf_id = cf_id;
+    ta_[type].stats[cf_id].time_series.push_back(trace_u);
+  }
+
+  return s;
+}
+
+// Update the correlation unit of each key if enabled
+Status TraceAnalyzer::StatsUnitCorrelationUpdate(StatsUnit& unit,
+                                                 const uint32_t& type_second,
+                                                 const uint64_t& ts,
+                                                 const std::string& key) {
+  if (type_second >= kTaTypeNum) {
+    fprintf(stderr, "Unknown Type Id: %u\n", type_second);
+    return Status::NotFound();
+  }
+
+  for (int type_first = 0; type_first < kTaTypeNum; type_first++) {
+    if (type_first >= static_cast<int>(ta_.size()) ||
+        type_first >= static_cast<int>(analyzer_opts_.correlation_map.size())) {
+      break;
+    }
+    if (analyzer_opts_.correlation_map[type_first][type_second] < 0 ||
+        ta_[type_first].stats.find(unit.cf_id) == ta_[type_first].stats.end() ||
+        ta_[type_first].stats[unit.cf_id].a_key_stats.find(key) ==
+            ta_[type_first].stats[unit.cf_id].a_key_stats.end() ||
+        ta_[type_first].stats[unit.cf_id].a_key_stats[key].latest_ts == ts) {
+      continue;
+    }
+
+    int correlation_id =
+        analyzer_opts_.correlation_map[type_first][type_second];
+
+    // after get the x-y operation time or x, update;
+    if (correlation_id < 0 ||
+        correlation_id >= static_cast<int>(unit.v_correlation.size())) {
+      continue;
+    }
+    unit.v_correlation[correlation_id].count++;
+    unit.v_correlation[correlation_id].total_ts +=
+        (ts - ta_[type_first].stats[unit.cf_id].a_key_stats[key].latest_ts);
+  }
+
+  unit.latest_ts = ts;
+  return Status::OK();
+}
+
+// when a new trace statistic is created, the file handler
+// pointers should be initiated if needed according to
+// the trace analyzer options
+Status TraceAnalyzer::OpenStatsOutputFiles(const std::string& type,
+                                           TraceStats& new_stats) {
+  Status s;
+  if (FLAGS_output_key_stats) {
+    s = CreateOutputFile(type, new_stats.cf_name, "accessed_key_stats.txt",
+                         &new_stats.a_key_f);
+    s = CreateOutputFile(type, new_stats.cf_name,
+                         "accessed_unique_key_num_change.txt",
+                         &new_stats.a_key_num_f);
+    if (!FLAGS_key_space_dir.empty()) {
+      s = CreateOutputFile(type, new_stats.cf_name, "whole_key_stats.txt",
+                           &new_stats.w_key_f);
+    }
+  }
+
+  if (FLAGS_output_access_count_stats) {
+    s = CreateOutputFile(type, new_stats.cf_name,
+                         "accessed_key_count_distribution.txt",
+                         &new_stats.a_count_dist_f);
+  }
+
+  if (FLAGS_output_prefix_cut > 0) {
+    s = CreateOutputFile(type, new_stats.cf_name, "accessed_key_prefix_cut.txt",
+                         &new_stats.a_prefix_cut_f);
+    if (!FLAGS_key_space_dir.empty()) {
+      s = CreateOutputFile(type, new_stats.cf_name, "whole_key_prefix_cut.txt",
+                           &new_stats.w_prefix_cut_f);
+    }
+
+    if (FLAGS_output_qps_stats) {
+      s = CreateOutputFile(type, new_stats.cf_name,
+                           "accessed_top_k_qps_prefix_cut.txt",
+                           &new_stats.a_top_qps_prefix_f);
+    }
+  }
+
+  if (FLAGS_output_time_series) {
+    s = CreateOutputFile(type, new_stats.cf_name, "time_series.txt",
+                         &new_stats.time_series_f);
+  }
+
+  if (FLAGS_output_value_distribution) {
+    s = CreateOutputFile(type, new_stats.cf_name,
+                         "accessed_value_size_distribution.txt",
+                         &new_stats.a_value_size_f);
+  }
+
+  if (FLAGS_output_key_distribution) {
+    s = CreateOutputFile(type, new_stats.cf_name,
+                         "accessed_key_size_distribution.txt",
+                         &new_stats.a_key_size_f);
+  }
+
+  if (FLAGS_output_qps_stats) {
+    s = CreateOutputFile(type, new_stats.cf_name, "qps_stats.txt",
+                         &new_stats.a_qps_f);
+  }
+
+  return s;
+}
+
+// create the output path of the files to be opened
+Status TraceAnalyzer::CreateOutputFile(
+    const std::string& type, const std::string& cf_name,
+    const std::string& ending,
+    std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>* f_ptr) {
+  std::string path;
+  path = output_path_ + "/" + FLAGS_output_prefix + "-" + type + "-" + cf_name +
+         "-" + ending;
+  Status s;
+  s = env_->NewWritableFile(path, f_ptr, env_options_);
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot open file: %s\n", path.c_str());
+    exit(1);
+  }
+  return Status::OK();
+}
+
+// Close the output files in the TraceStats if they are opened
+Status TraceAnalyzer::CloseOutputFiles() {
+  Status s;
+  for (int type = 0; type < kTaTypeNum; type++) {
+    if (!ta_[type].enabled) {
+      continue;
+    }
+    for (auto& stat : ta_[type].stats) {
+      if (s.ok() && stat.second.time_series_f) {
+        s = stat.second.time_series_f->Close();
+      }
+
+      if (s.ok() && stat.second.a_key_f) {
+        s = stat.second.a_key_f->Close();
+      }
+
+      if (s.ok() && stat.second.a_key_num_f) {
+        s = stat.second.a_key_num_f->Close();
+      }
+
+      if (s.ok() && stat.second.a_count_dist_f) {
+        s = stat.second.a_count_dist_f->Close();
+      }
+
+      if (s.ok() && stat.second.a_prefix_cut_f) {
+        s = stat.second.a_prefix_cut_f->Close();
+      }
+
+      if (s.ok() && stat.second.a_value_size_f) {
+        s = stat.second.a_value_size_f->Close();
+      }
+
+      if (s.ok() && stat.second.a_key_size_f) {
+        s = stat.second.a_key_size_f->Close();
+      }
+
+      if (s.ok() && stat.second.a_qps_f) {
+        s = stat.second.a_qps_f->Close();
+      }
+
+      if (s.ok() && stat.second.a_top_qps_prefix_f) {
+        s = stat.second.a_top_qps_prefix_f->Close();
+      }
+
+      if (s.ok() && stat.second.w_key_f) {
+        s = stat.second.w_key_f->Close();
+      }
+      if (s.ok() && stat.second.w_prefix_cut_f) {
+        s = stat.second.w_prefix_cut_f->Close();
+      }
+    }
+  }
+  return s;
+}
+
+Status TraceAnalyzer::Handle(const WriteQueryTraceRecord& record,
+                             std::unique_ptr<TraceRecordResult>* /*result*/) {
+  total_writes_++;
+  // Note that, if the write happens in a transaction,
+  // 'Write' will be called twice, one for Prepare, one for
+  // Commit. Thus, in the trace, for the same WriteBatch, there
+  // will be two records if it is in a transaction. Here, we only
+  // process the reord that is committed. If write is non-transaction,
+  // HasBeginPrepare()==false, so we process it normally.
+  WriteBatch batch(record.GetWriteBatchRep().ToString());
+  if (batch.Count() == 0 || (batch.HasBeginPrepare() && !batch.HasCommit())) {
+    return Status::OK();
+  }
+  write_batch_ts_ = record.GetTimestamp();
+
+  // write_result_ will be updated in batch's handler during iteration.
+  Status s = batch.Iterate(this);
+  write_batch_ts_ = 0;
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot process the write batch in the trace\n");
+    return s;
+  }
+
+  return Status::OK();
+}
+
+Status TraceAnalyzer::Handle(const GetQueryTraceRecord& record,
+                             std::unique_ptr<TraceRecordResult>* /*result*/) {
+  total_gets_++;
+  return OutputAnalysisResult(TraceOperationType::kGet, record.GetTimestamp(),
+                              record.GetColumnFamilyID(),
+                              std::move(record.GetKey()), 0);
+}
+
+Status TraceAnalyzer::Handle(const IteratorSeekQueryTraceRecord& record,
+                             std::unique_ptr<TraceRecordResult>* /*result*/) {
+  TraceOperationType op_type;
+  if (record.GetSeekType() == IteratorSeekQueryTraceRecord::kSeek) {
+    op_type = TraceOperationType::kIteratorSeek;
+    total_seeks_++;
+  } else {
+    op_type = TraceOperationType::kIteratorSeekForPrev;
+    total_seek_prevs_++;
+  }
+
+  // To do: shall we add lower/upper bounds?
+
+  return OutputAnalysisResult(op_type, record.GetTimestamp(),
+                              record.GetColumnFamilyID(),
+                              std::move(record.GetKey()), 0);
+}
+
+Status TraceAnalyzer::Handle(const MultiGetQueryTraceRecord& record,
+                             std::unique_ptr<TraceRecordResult>* /*result*/) {
+  total_multigets_++;
+
+  std::vector<uint32_t> cf_ids = record.GetColumnFamilyIDs();
+  std::vector<Slice> keys = record.GetKeys();
+  std::vector<size_t> value_sizes;
+
+  // If the size does not match is not the error of tracing and anayzing, we
+  // just report it to the user. The analyzing continues.
+  if (cf_ids.size() > keys.size()) {
+    printf("The CF ID vector size does not match the keys vector size!\n");
+    // Make the sure the 2 vectors are of the same (smaller) size.
+    cf_ids.resize(keys.size());
+  } else if (cf_ids.size() < keys.size()) {
+    printf("The CF ID vector size does not match the keys vector size!\n");
+    // Make the sure the 2 vectors are of the same (smaller) size.
+    keys.resize(cf_ids.size());
+  }
+  // Now the 2 vectors must be of the same size.
+  value_sizes.resize(cf_ids.size(), 0);
+
+  return OutputAnalysisResult(TraceOperationType::kMultiGet,
+                              record.GetTimestamp(), std::move(cf_ids),
+                              std::move(keys), std::move(value_sizes));
+}
+
+// Handle the Put request in the write batch of the trace
+Status TraceAnalyzer::PutCF(uint32_t column_family_id, const Slice& key,
+                            const Slice& value) {
+  return OutputAnalysisResult(TraceOperationType::kPut, write_batch_ts_,
+                              column_family_id, key, value.size());
+}
+
+// Handle the Delete request in the write batch of the trace
+Status TraceAnalyzer::DeleteCF(uint32_t column_family_id, const Slice& key) {
+  return OutputAnalysisResult(TraceOperationType::kDelete, write_batch_ts_,
+                              column_family_id, key, 0);
+}
+
+// Handle the SingleDelete request in the write batch of the trace
+Status TraceAnalyzer::SingleDeleteCF(uint32_t column_family_id,
+                                     const Slice& key) {
+  return OutputAnalysisResult(TraceOperationType::kSingleDelete,
+                              write_batch_ts_, column_family_id, key, 0);
+}
+
+// Handle the DeleteRange request in the write batch of the trace
+Status TraceAnalyzer::DeleteRangeCF(uint32_t column_family_id,
+                                    const Slice& begin_key,
+                                    const Slice& end_key) {
+  return OutputAnalysisResult(TraceOperationType::kRangeDelete, write_batch_ts_,
+                              {column_family_id, column_family_id},
+                              {begin_key, end_key}, {0, 0});
+}
+
+// Handle the Merge request in the write batch of the trace
+Status TraceAnalyzer::MergeCF(uint32_t column_family_id, const Slice& key,
+                              const Slice& value) {
+  return OutputAnalysisResult(TraceOperationType::kMerge, write_batch_ts_,
+                              column_family_id, key, value.size());
+}
+
+Status TraceAnalyzer::OutputAnalysisResult(TraceOperationType op_type,
+                                           uint64_t timestamp,
+                                           std::vector<uint32_t> cf_ids,
+                                           std::vector<Slice> keys,
+                                           std::vector<size_t> value_sizes) {
+  assert(!cf_ids.empty());
+  assert(cf_ids.size() == keys.size());
+  assert(cf_ids.size() == value_sizes.size());
+
+  Status s;
+
+  if (FLAGS_convert_to_human_readable_trace && trace_sequence_f_) {
+    // DeleteRane only writes the begin_key.
+    size_t cnt =
+        op_type == TraceOperationType::kRangeDelete ? 1 : cf_ids.size();
+    for (size_t i = 0; i < cnt; i++) {
+      s = WriteTraceSequence(op_type, cf_ids[i], keys[i], value_sizes[i],
+                             timestamp);
+      if (!s.ok()) {
+        return Status::Corruption("Failed to write the trace sequence to file");
+      }
+    }
+  }
+
+  if (ta_[op_type].sample_count >= sample_max_) {
+    ta_[op_type].sample_count = 0;
+  }
+  if (ta_[op_type].sample_count > 0) {
+    ta_[op_type].sample_count++;
+    return Status::OK();
+  }
+  ta_[op_type].sample_count++;
+
+  if (!ta_[op_type].enabled) {
+    return Status::OK();
+  }
+
+  for (size_t i = 0; i < cf_ids.size(); i++) {
+    // Get query does not have value part, just give a fixed value 10 for easy
+    // calculation.
+    s = KeyStatsInsertion(
+        op_type, cf_ids[i], keys[i].ToString(),
+        value_sizes[i] == 0 ? kShadowValueSize : value_sizes[i], timestamp);
+    if (!s.ok()) {
+      return Status::Corruption("Failed to insert key statistics");
+    }
+  }
+
+  return Status::OK();
+}
+
+Status TraceAnalyzer::OutputAnalysisResult(TraceOperationType op_type,
+                                           uint64_t timestamp, uint32_t cf_id,
+                                           const Slice& key,
+                                           size_t value_size) {
+  return OutputAnalysisResult(
+      op_type, timestamp, std::vector<uint32_t>({cf_id}),
+      std::vector<Slice>({key}), std::vector<size_t>({value_size}));
+}
+
+// Before the analyzer is closed, the requested general statistic results are
+// printed out here. In current stage, these information are not output to
+// the files.
+// -----type
+//          |__cf_id
+//                |_statistics
+void TraceAnalyzer::PrintStatistics() {
+  for (int type = 0; type < kTaTypeNum; type++) {
+    if (!ta_[type].enabled) {
+      continue;
+    }
+    ta_[type].total_keys = 0;
+    ta_[type].total_access = 0;
+    ta_[type].total_succ_access = 0;
+    printf("\n################# Operation Type: %s #####################\n",
+           ta_[type].type_name.c_str());
+    if (qps_ave_.size() == kTaTypeNum + 1) {
+      printf("Peak QPS is: %u Average QPS is: %f\n", qps_peak_[type],
+             qps_ave_[type]);
+    }
+    for (auto& stat_it : ta_[type].stats) {
+      if (stat_it.second.a_count == 0) {
+        continue;
+      }
+      TraceStats& stat = stat_it.second;
+      uint64_t total_a_keys = static_cast<uint64_t>(stat.a_key_stats.size());
+      double key_size_ave = 0.0;
+      double value_size_ave = 0.0;
+      double key_size_vari = 0.0;
+      double value_size_vari = 0.0;
+      if (stat.a_count > 0) {
+        key_size_ave =
+            (static_cast<double>(stat.a_key_size_sum)) / stat.a_count;
+        value_size_ave =
+            (static_cast<double>(stat.a_value_size_sum)) / stat.a_count;
+        key_size_vari = std::sqrt((static_cast<double>(stat.a_key_size_sqsum)) /
+                                      stat.a_count -
+                                  key_size_ave * key_size_ave);
+        value_size_vari = std::sqrt(
+            (static_cast<double>(stat.a_value_size_sqsum)) / stat.a_count -
+            value_size_ave * value_size_ave);
+      }
+      if (value_size_ave == 0.0) {
+        stat.a_value_mid = 0;
+      }
+      cfs_[stat.cf_id].a_count += total_a_keys;
+      ta_[type].total_keys += total_a_keys;
+      ta_[type].total_access += stat.a_count;
+      ta_[type].total_succ_access += stat.a_succ_count;
+      printf("*********************************************************\n");
+      printf("colume family id: %u\n", stat.cf_id);
+      printf("Total number of queries to this cf by %s: %" PRIu64 "\n",
+             ta_[type].type_name.c_str(), stat.a_count);
+      printf("Total unique keys in this cf: %" PRIu64 "\n", total_a_keys);
+      printf("Average key size: %f key size medium: %" PRIu64
+             " Key size Variation: %f\n",
+             key_size_ave, stat.a_key_mid, key_size_vari);
+      if (type == kPut || type == kMerge) {
+        printf("Average value size: %f Value size medium: %" PRIu64
+               " Value size variation: %f\n",
+               value_size_ave, stat.a_value_mid, value_size_vari);
+      }
+      printf("Peak QPS is: %u Average QPS is: %f\n", stat.a_peak_qps,
+             stat.a_ave_qps);
+
+      // print the top k accessed key and its access count
+      if (FLAGS_print_top_k_access > 0) {
+        printf("The Top %d keys that are accessed:\n",
+               FLAGS_print_top_k_access);
+        while (!stat.top_k_queue.empty()) {
+          std::string hex_key = ROCKSDB_NAMESPACE::LDBCommand::StringToHex(
+              stat.top_k_queue.top().second);
+          printf("Access_count: %" PRIu64 " %s\n", stat.top_k_queue.top().first,
+                 hex_key.c_str());
+          stat.top_k_queue.pop();
+        }
+      }
+
+      // print the top k access prefix range and
+      // top k prefix range with highest average access per key
+      if (FLAGS_output_prefix_cut > 0) {
+        printf("The Top %d accessed prefix range:\n", FLAGS_print_top_k_access);
+        while (!stat.top_k_prefix_access.empty()) {
+          printf("Prefix: %s Access count: %" PRIu64 "\n",
+                 stat.top_k_prefix_access.top().second.c_str(),
+                 stat.top_k_prefix_access.top().first);
+          stat.top_k_prefix_access.pop();
+        }
+
+        printf("The Top %d prefix with highest access per key:\n",
+               FLAGS_print_top_k_access);
+        while (!stat.top_k_prefix_ave.empty()) {
+          printf("Prefix: %s access per key: %f\n",
+                 stat.top_k_prefix_ave.top().second.c_str(),
+                 stat.top_k_prefix_ave.top().first);
+          stat.top_k_prefix_ave.pop();
+        }
+      }
+
+      // print the operation correlations
+      if (!FLAGS_print_correlation.empty()) {
+        for (int correlation = 0;
+             correlation <
+             static_cast<int>(analyzer_opts_.correlation_list.size());
+             correlation++) {
+          printf(
+              "The correlation statistics of '%s' after '%s' is:",
+              taIndexToOpt[analyzer_opts_.correlation_list[correlation].second]
+                  .c_str(),
+              taIndexToOpt[analyzer_opts_.correlation_list[correlation].first]
+                  .c_str());
+          double correlation_ave = 0.0;
+          if (stat.correlation_output[correlation].first > 0) {
+            correlation_ave =
+                (static_cast<double>(
+                    stat.correlation_output[correlation].second)) /
+                (stat.correlation_output[correlation].first * 1000);
+          }
+          printf(" total numbers: %" PRIu64 " average time: %f(ms)\n",
+                 stat.correlation_output[correlation].first, correlation_ave);
+        }
+      }
+    }
+    printf("*********************************************************\n");
+    printf("Total keys of '%s' is: %" PRIu64 "\n", ta_[type].type_name.c_str(),
+           ta_[type].total_keys);
+    printf("Total access is: %" PRIu64 "\n", ta_[type].total_access);
+    total_access_keys_ += ta_[type].total_keys;
+  }
+
+  // Print the overall statistic information of the trace
+  printf("\n*********************************************************\n");
+  printf("*********************************************************\n");
+  printf("The column family based statistics\n");
+  for (auto& cf : cfs_) {
+    printf("The column family id: %u\n", cf.first);
+    printf("The whole key space key numbers: %" PRIu64 "\n", cf.second.w_count);
+    printf("The accessed key space key numbers: %" PRIu64 "\n",
+           cf.second.a_count);
+  }
+
+  if (FLAGS_print_overall_stats) {
+    printf("\n*********************************************************\n");
+    printf("*********************************************************\n");
+    if (qps_peak_.size() == kTaTypeNum + 1) {
+      printf("Average QPS per second: %f Peak QPS: %u\n", qps_ave_[kTaTypeNum],
+             qps_peak_[kTaTypeNum]);
+    }
+    printf("The statistics related to query number need to times: %u\n",
+           sample_max_);
+    printf("Total_requests: %" PRIu64 " Total_accessed_keys: %" PRIu64
+           " Total_gets: %" PRIu64 " Total_write_batches: %" PRIu64
+           " Total_seeks: %" PRIu64 " Total_seek_for_prevs: %" PRIu64
+           " Total_multigets: %" PRIu64 "\n",
+           total_requests_, total_access_keys_, total_gets_, total_writes_,
+           total_seeks_, total_seek_prevs_, total_multigets_);
+    for (int type = 0; type < kTaTypeNum; type++) {
+      if (!ta_[type].enabled) {
+        continue;
+      }
+      printf("Operation: '%s' has: %" PRIu64 "\n", ta_[type].type_name.c_str(),
+             ta_[type].total_access);
+    }
+  }
+}
+
+// Write the trace sequence to file
+Status TraceAnalyzer::WriteTraceSequence(const uint32_t& type,
+                                         const uint32_t& cf_id,
+                                         const Slice& key,
+                                         const size_t value_size,
+                                         const uint64_t ts) {
+  std::string hex_key =
+      ROCKSDB_NAMESPACE::LDBCommand::StringToHex(key.ToString());
+  int ret;
+  ret = snprintf(buffer_, sizeof(buffer_), "%u %u %zu %" PRIu64 "\n", type,
+                 cf_id, value_size, ts);
+  if (ret < 0) {
+    return Status::IOError("failed to format the output");
+  }
+  std::string printout(buffer_);
+  if (!FLAGS_no_key) {
+    printout = hex_key + " " + printout;
+  }
+  return trace_sequence_f_->Append(printout);
+}
+
+// The entrance function of Trace_Analyzer
+int trace_analyzer_tool(int argc, char** argv) {
+  std::string trace_path;
+  std::string output_path;
+
+  AnalyzerOptions analyzer_opts;
+
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  if (!FLAGS_print_correlation.empty()) {
+    analyzer_opts.SparseCorrelationInput(FLAGS_print_correlation);
+  }
+
+  std::unique_ptr<TraceAnalyzer> analyzer(
+      new TraceAnalyzer(FLAGS_trace_path, FLAGS_output_dir, analyzer_opts));
+
+  if (!analyzer) {
+    fprintf(stderr, "Cannot initiate the trace analyzer\n");
+    exit(1);
+  }
+
+  ROCKSDB_NAMESPACE::Status s = analyzer->PrepareProcessing();
+  if (!s.ok()) {
+    fprintf(stderr, "%s\n", s.getState());
+    fprintf(stderr, "Cannot initiate the trace reader\n");
+    exit(1);
+  }
+
+  s = analyzer->StartProcessing();
+  if (!s.ok() && !FLAGS_try_process_corrupted_trace) {
+    fprintf(stderr, "%s\n", s.getState());
+    fprintf(stderr, "Cannot process the trace\n");
+    exit(1);
+  }
+
+  s = analyzer->MakeStatistics();
+  if (!s.ok()) {
+    fprintf(stderr, "%s\n", s.getState());
+    analyzer->EndProcessing();
+    fprintf(stderr, "Cannot make the statistics\n");
+    exit(1);
+  }
+
+  s = analyzer->ReProcessing();
+  if (!s.ok()) {
+    fprintf(stderr, "%s\n", s.getState());
+    fprintf(stderr, "Cannot re-process the trace for more statistics\n");
+    analyzer->EndProcessing();
+    exit(1);
+  }
+
+  s = analyzer->EndProcessing();
+  if (!s.ok()) {
+    fprintf(stderr, "%s\n", s.getState());
+    fprintf(stderr, "Cannot close the trace analyzer\n");
+    exit(1);
+  }
+
+  return 0;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // Endif of Gflag
+#endif  // RocksDB LITE
diff --git a/src/rocksdb/tools/trace_analyzer_tool.h b/src/rocksdb/tools/trace_analyzer_tool.h
new file mode 100644
index 000000000..4b885b18c
--- /dev/null
+++ b/src/rocksdb/tools/trace_analyzer_tool.h
@@ -0,0 +1,326 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <list>
+#include <map>
+#include <queue>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/write_batch.h"
+#include "trace_replay/trace_replay.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Value sizes may be used as denominators. Replacing 0 value sizes with this
+// positive integer avoids division error.
+extern const size_t kShadowValueSize /* = 10*/;
+
+enum TraceOperationType : int {
+  kGet = 0,
+  kPut = 1,
+  kDelete = 2,
+  kSingleDelete = 3,
+  kRangeDelete = 4,
+  kMerge = 5,
+  kIteratorSeek = 6,
+  kIteratorSeekForPrev = 7,
+  kMultiGet = 8,
+  kTaTypeNum = 9
+};
+
+struct TraceUnit {
+  uint64_t ts;
+  uint32_t type;
+  uint32_t cf_id;
+  size_t value_size;
+  std::string key;
+};
+
+struct TypeCorrelation {
+  uint64_t count;
+  uint64_t total_ts;
+};
+
+struct StatsUnit {
+  uint64_t key_id;
+  uint64_t access_count;
+  uint64_t latest_ts;
+  uint64_t succ_count;  // current only used to count Get if key found
+  uint32_t cf_id;
+  size_t value_size;
+  std::vector<TypeCorrelation> v_correlation;
+};
+
+class AnalyzerOptions {
+ public:
+  std::vector<std::vector<int>> correlation_map;
+  std::vector<std::pair<int, int>> correlation_list;
+
+  AnalyzerOptions();
+
+  ~AnalyzerOptions();
+
+  void SparseCorrelationInput(const std::string& in_str);
+};
+
+// Note that, for the variable names  in the trace_analyzer,
+// Starting with 'a_' means the variable is used for 'accessed_keys'.
+// Starting with 'w_' means it is used for 'the whole key space'.
+// Ending with '_f' means a file write or reader pointer.
+// For example, 'a_count' means 'accessed_keys_count',
+// 'w_key_f' means 'whole_key_space_file'.
+
+struct TraceStats {
+  uint32_t cf_id;
+  std::string cf_name;
+  uint64_t a_count;
+  uint64_t a_succ_count;
+  uint64_t a_key_id;
+  uint64_t a_key_size_sqsum;
+  uint64_t a_key_size_sum;
+  uint64_t a_key_mid;
+  uint64_t a_value_size_sqsum;
+  uint64_t a_value_size_sum;
+  uint64_t a_value_mid;
+  uint32_t a_peak_qps;
+  double a_ave_qps;
+  std::map<std::string, StatsUnit> a_key_stats;
+  std::map<uint64_t, uint64_t> a_count_stats;
+  std::map<uint64_t, uint64_t> a_key_size_stats;
+  std::map<uint64_t, uint64_t> a_value_size_stats;
+  std::map<uint32_t, uint32_t> a_qps_stats;
+  std::map<uint32_t, std::map<std::string, uint32_t>> a_qps_prefix_stats;
+  std::priority_queue<std::pair<uint64_t, std::string>,
+                      std::vector<std::pair<uint64_t, std::string>>,
+                      std::greater<std::pair<uint64_t, std::string>>>
+      top_k_queue;
+  std::priority_queue<std::pair<uint64_t, std::string>,
+                      std::vector<std::pair<uint64_t, std::string>>,
+                      std::greater<std::pair<uint64_t, std::string>>>
+      top_k_prefix_access;
+  std::priority_queue<std::pair<double, std::string>,
+                      std::vector<std::pair<double, std::string>>,
+                      std::greater<std::pair<double, std::string>>>
+      top_k_prefix_ave;
+  std::priority_queue<std::pair<uint32_t, uint32_t>,
+                      std::vector<std::pair<uint32_t, uint32_t>>,
+                      std::greater<std::pair<uint32_t, uint32_t>>>
+      top_k_qps_sec;
+  std::list<TraceUnit> time_series;
+  std::vector<std::pair<uint64_t, uint64_t>> correlation_output;
+  std::map<uint32_t, uint64_t> uni_key_num;
+
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> time_series_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_key_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_count_dist_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_prefix_cut_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_value_size_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_key_size_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_key_num_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_qps_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> a_top_qps_prefix_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> w_key_f;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> w_prefix_cut_f;
+
+  TraceStats();
+  ~TraceStats();
+  TraceStats(const TraceStats&) = delete;
+  TraceStats& operator=(const TraceStats&) = delete;
+  TraceStats(TraceStats&&) = default;
+  TraceStats& operator=(TraceStats&&) = default;
+};
+
+struct TypeUnit {
+  std::string type_name;
+  bool enabled;
+  uint64_t total_keys;
+  uint64_t total_access;
+  uint64_t total_succ_access;
+  uint32_t sample_count;
+  std::map<uint32_t, TraceStats> stats;
+  TypeUnit() = default;
+  ~TypeUnit() = default;
+  TypeUnit(const TypeUnit&) = delete;
+  TypeUnit& operator=(const TypeUnit&) = delete;
+  TypeUnit(TypeUnit&&) = default;
+  TypeUnit& operator=(TypeUnit&&) = default;
+};
+
+struct CfUnit {
+  uint32_t cf_id;
+  uint64_t w_count;  // total keys in this cf if we use the whole key space
+  uint64_t a_count;  // the total keys in this cf that are accessed
+  std::map<uint64_t, uint64_t> w_key_size_stats;  // whole key space key size
+                                                  // statistic this cf
+  std::map<uint32_t, uint32_t> cf_qps;
+};
+
+class TraceAnalyzer : private TraceRecord::Handler,
+                      private WriteBatch::Handler {
+ public:
+  TraceAnalyzer(std::string& trace_path, std::string& output_path,
+                AnalyzerOptions _analyzer_opts);
+  ~TraceAnalyzer();
+
+  Status PrepareProcessing();
+
+  Status StartProcessing();
+
+  Status MakeStatistics();
+
+  Status ReProcessing();
+
+  Status EndProcessing();
+
+  Status WriteTraceUnit(TraceUnit& unit);
+
+  std::vector<TypeUnit>& GetTaVector() { return ta_; }
+
+ private:
+  using TraceRecord::Handler::Handle;
+  Status Handle(const WriteQueryTraceRecord& record,
+                std::unique_ptr<TraceRecordResult>* result) override;
+  Status Handle(const GetQueryTraceRecord& record,
+                std::unique_ptr<TraceRecordResult>* result) override;
+  Status Handle(const IteratorSeekQueryTraceRecord& record,
+                std::unique_ptr<TraceRecordResult>* result) override;
+  Status Handle(const MultiGetQueryTraceRecord& record,
+                std::unique_ptr<TraceRecordResult>* result) override;
+
+  using WriteBatch::Handler::PutCF;
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+               const Slice& value) override;
+
+  using WriteBatch::Handler::DeleteCF;
+  Status DeleteCF(uint32_t column_family_id, const Slice& key) override;
+
+  using WriteBatch::Handler::SingleDeleteCF;
+  Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override;
+
+  using WriteBatch::Handler::DeleteRangeCF;
+  Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+                       const Slice& end_key) override;
+
+  using WriteBatch::Handler::MergeCF;
+  Status MergeCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override;
+
+  // The following hanlders are not implemented, return Status::OK() to avoid
+  // the running time assertion and other irrelevant falures.
+  using WriteBatch::Handler::PutBlobIndexCF;
+  Status PutBlobIndexCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+                        const Slice& /*value*/) override {
+    return Status::OK();
+  }
+
+  // The default implementation of LogData does nothing.
+  using WriteBatch::Handler::LogData;
+  void LogData(const Slice& /*blob*/) override {}
+
+  using WriteBatch::Handler::MarkBeginPrepare;
+  Status MarkBeginPrepare(bool = false) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkEndPrepare;
+  Status MarkEndPrepare(const Slice& /*xid*/) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkNoop;
+  Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkRollback;
+  Status MarkRollback(const Slice& /*xid*/) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkCommit;
+  Status MarkCommit(const Slice& /*xid*/) override { return Status::OK(); }
+
+  using WriteBatch::Handler::MarkCommitWithTimestamp;
+  Status MarkCommitWithTimestamp(const Slice& /*xid*/,
+                                 const Slice& /*commit_ts*/) override {
+    return Status::OK();
+  }
+
+  // Process each trace operation and output the analysis result to
+  // stdout/files.
+  Status OutputAnalysisResult(TraceOperationType op_type, uint64_t timestamp,
+                              std::vector<uint32_t> cf_ids,
+                              std::vector<Slice> keys,
+                              std::vector<size_t> value_sizes);
+
+  Status OutputAnalysisResult(TraceOperationType op_type, uint64_t timestamp,
+                              uint32_t cf_id, const Slice& key,
+                              size_t value_size);
+
+  ROCKSDB_NAMESPACE::Env* env_;
+  EnvOptions env_options_;
+  std::unique_ptr<TraceReader> trace_reader_;
+  size_t offset_;
+  char buffer_[1024];
+  // Timestamp of a WriteBatch, used in its iteration.
+  uint64_t write_batch_ts_;
+  std::string trace_name_;
+  std::string output_path_;
+  AnalyzerOptions analyzer_opts_;
+  uint64_t total_requests_;
+  uint64_t total_access_keys_;
+  uint64_t total_gets_;
+  uint64_t total_writes_;
+  uint64_t total_seeks_;
+  uint64_t total_seek_prevs_;
+  uint64_t total_multigets_;
+  uint64_t trace_create_time_;
+  uint64_t begin_time_;
+  uint64_t end_time_;
+  uint64_t time_series_start_;
+  uint32_t sample_max_;
+  uint32_t cur_time_sec_;
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>
+      trace_sequence_f_;                                    // readable trace
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> qps_f_;  // overall qps
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>
+      cf_qps_f_;              // The qps of each CF>
+  std::vector<TypeUnit> ta_;  // The main statistic collecting data structure
+  std::map<uint32_t, CfUnit> cfs_;  // All the cf_id appears in this trace;
+  std::vector<uint32_t> qps_peak_;
+  std::vector<double> qps_ave_;
+
+  Status ReadTraceHeader(Trace* header);
+  Status ReadTraceFooter(Trace* footer);
+  Status ReadTraceRecord(Trace* trace);
+  Status KeyStatsInsertion(const uint32_t& type, const uint32_t& cf_id,
+                           const std::string& key, const size_t value_size,
+                           const uint64_t ts);
+  Status StatsUnitCorrelationUpdate(StatsUnit& unit, const uint32_t& type,
+                                    const uint64_t& ts, const std::string& key);
+  Status OpenStatsOutputFiles(const std::string& type, TraceStats& new_stats);
+  Status CreateOutputFile(
+      const std::string& type, const std::string& cf_name,
+      const std::string& ending,
+      std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>* f_ptr);
+  Status CloseOutputFiles();
+
+  void PrintStatistics();
+  Status TraceUnitWriter(
+      std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>& f_ptr, TraceUnit& unit);
+  Status WriteTraceSequence(const uint32_t& type, const uint32_t& cf_id,
+                            const Slice& key, const size_t value_size,
+                            const uint64_t ts);
+  Status MakeStatisticKeyStatsOrPrefix(TraceStats& stats);
+  Status MakeStatisticCorrelation(TraceStats& stats, StatsUnit& unit);
+  Status MakeStatisticQPS();
+  int db_version_;
+};
+
+int trace_analyzer_tool(int argc, char** argv);
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/verify_random_db.sh b/src/rocksdb/tools/verify_random_db.sh
new file mode 100755
index 000000000..fbe5b75fd
--- /dev/null
+++ b/src/rocksdb/tools/verify_random_db.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to verify DB generated by generate_random_db.sh cannot opened and read correct data.
+# ./ldb needs to be avaible to be executed.
+#
+# Usage: <SCRIPT> <DB Path>
+
+scriptpath=`dirname $BASH_SOURCE`
+if [ "$#" -lt 2 ]; then
+  echo "usage: $BASH_SOURCE <db_directory> <compare_base_db_directory> [dump_file_name] [if_try_load_options] [if_ignore_unknown_options]"
+  exit 1
+fi
+
+db_dir=$1
+base_db_dir=$2
+dump_file_name=${3:-"dump_file.txt"}
+try_load_options=${4:-"1"}
+ignore_unknown_options=${5:-"0"}
+db_dump=$db_dir"/"$dump_file_name
+base_db_dump=$base_db_dir"/"$dump_file_name
+extra_params=
+
+if [ "$try_load_options" = "0" ]; then
+ extra_params=" --try_load_options=false"
+elif [ "$try_load_options" = "1" ]; then
+ extra_params=" --try_load_options=true"
+fi
+
+if [ "$ignore_unknown_options" = "1" ]; then
+ extra_params="$extra_params --ignore_unknown_options"
+fi
+
+set -e
+echo == Dumping data from $db_dir to $db_dump
+./ldb dump --db=$db_dir $extra_params > $db_dump
+
+echo == Dumping data from $base_db_dir to $base_db_dump
+./ldb dump --db=$base_db_dir $extra_params > $base_db_dump
+
+diff $db_dump $base_db_dump
diff --git a/src/rocksdb/tools/write_external_sst.sh b/src/rocksdb/tools/write_external_sst.sh
new file mode 100755
index 000000000..be01ae022
--- /dev/null
+++ b/src/rocksdb/tools/write_external_sst.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+#
+
+if [ "$#" -lt 3 ]; then
+  echo "usage: $BASH_SOURCE <input_data_path> <DB Path> <extern SST dir>"
+  exit 1
+fi
+
+input_data_dir=$1
+db_dir=$2
+extern_sst_dir=$3
+rm -rf $db_dir
+mkdir -p $extern_sst_dir
+
+set -e
+
+n=0
+
+for f in `find $input_data_dir -name sorted_data*`
+do
+  echo == Writing external SST file $f to $extern_sst_dir/extern_sst${n}
+  ./ldb --db=$db_dir --create_if_missing write_extern_sst $extern_sst_dir/extern_sst${n} < $f
+  let "n = n + 1"
+done
diff --git a/src/rocksdb/tools/write_stress.cc b/src/rocksdb/tools/write_stress.cc
new file mode 100644
index 000000000..ba5bd3f4f
--- /dev/null
+++ b/src/rocksdb/tools/write_stress.cc
@@ -0,0 +1,309 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+//
+// The goal of this tool is to be a simple stress test with focus on catching:
+// * bugs in compaction/flush processes, especially the ones that cause
+// assertion errors
+// * bugs in the code that deletes obsolete files
+//
+// There are two parts of the test:
+// * write_stress, a binary that writes to the database
+// * write_stress_runner.py, a script that invokes and kills write_stress
+//
+// Here are some interesting parts of write_stress:
+// * Runs with very high concurrency of compactions and flushes (32 threads
+// total) and tries to create a huge amount of small files
+// * The keys written to the database are not uniformly distributed -- there is
+// a 3-character prefix that mutates occasionally (in prefix mutator thread), in
+// such a way that the first character mutates slower than second, which mutates
+// slower than third character. That way, the compaction stress tests some
+// interesting compaction features like trivial moves and bottommost level
+// calculation
+// * There is a thread that creates an iterator, holds it for couple of seconds
+// and then iterates over all keys. This is supposed to test RocksDB's abilities
+// to keep the files alive when there are references to them.
+// * Some writes trigger WAL sync. This is stress testing our WAL sync code.
+// * At the end of the run, we make sure that we didn't leak any of the sst
+// files
+//
+// write_stress_runner.py changes the mode in which we run write_stress and also
+// kills and restarts it. There are some interesting characteristics:
+// * At the beginning we divide the full test runtime into smaller parts --
+// shorter runtimes (couple of seconds) and longer runtimes (100, 1000) seconds
+// * The first time we run write_stress, we destroy the old DB. Every next time
+// during the test, we use the same DB.
+// * We can run in kill mode or clean-restart mode. Kill mode kills the
+// write_stress violently.
+// * We can run in mode where delete_obsolete_files_with_fullscan is true or
+// false
+// * We can run with low_open_files mode turned on or off. When it's turned on,
+// we configure table cache to only hold a couple of files -- that way we need
+// to reopen files every time we access them.
+//
+// Another goal was to create a stress test without a lot of parameters. So
+// tools/write_stress_runner.py should only take one parameter -- runtime_sec
+// and it should figure out everything else on its own.
+
+#include <cstdio>
+
+#ifndef GFLAGS
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <atomic>
+#include <cinttypes>
+#include <random>
+#include <set>
+#include <string>
+#include <thread>
+
+#include "file/filename.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/system_clock.h"
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+DEFINE_int32(key_size, 10, "Key size");
+DEFINE_int32(value_size, 100, "Value size");
+DEFINE_string(db, "", "Use the db with the following name.");
+DEFINE_bool(destroy_db, true,
+            "Destroy the existing DB before running the test");
+
+DEFINE_int32(runtime_sec, 10 * 60, "How long are we running for, in seconds");
+DEFINE_int32(seed, 139, "Random seed");
+
+DEFINE_double(prefix_mutate_period_sec, 1.0,
+              "How often are we going to mutate the prefix");
+DEFINE_double(first_char_mutate_probability, 0.1,
+              "How likely are we to mutate the first char every period");
+DEFINE_double(second_char_mutate_probability, 0.2,
+              "How likely are we to mutate the second char every period");
+DEFINE_double(third_char_mutate_probability, 0.5,
+              "How likely are we to mutate the third char every period");
+
+DEFINE_int32(iterator_hold_sec, 5,
+             "How long will the iterator hold files before it gets destroyed");
+
+DEFINE_double(sync_probability, 0.01, "How often are we syncing writes");
+DEFINE_bool(delete_obsolete_files_with_fullscan, false,
+            "If true, we delete obsolete files after each compaction/flush "
+            "using GetChildren() API");
+DEFINE_bool(low_open_files_mode, false,
+            "If true, we set max_open_files to 20, so that every file access "
+            "needs to reopen it");
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kPrefixSize = 3;
+
+class WriteStress {
+ public:
+  WriteStress() : stop_(false) {
+    // initialize key_prefix
+    for (int i = 0; i < kPrefixSize; ++i) {
+      key_prefix_[i].store('a');
+    }
+
+    // Choose a location for the test database if none given with --db=<path>
+    if (FLAGS_db.empty()) {
+      std::string default_db_path;
+      Env::Default()->GetTestDirectory(&default_db_path);
+      default_db_path += "/write_stress";
+      FLAGS_db = default_db_path;
+    }
+
+    Options options;
+    if (FLAGS_destroy_db) {
+      DestroyDB(FLAGS_db, options);  // ignore
+    }
+
+    // make the LSM tree deep, so that we have many concurrent flushes and
+    // compactions
+    options.create_if_missing = true;
+    options.write_buffer_size = 256 * 1024;              // 256k
+    options.max_bytes_for_level_base = 1 * 1024 * 1024;  // 1MB
+    options.target_file_size_base = 100 * 1024;          // 100k
+    options.max_write_buffer_number = 16;
+    options.max_background_compactions = 16;
+    options.max_background_flushes = 16;
+    options.max_open_files = FLAGS_low_open_files_mode ? 20 : -1;
+    if (FLAGS_delete_obsolete_files_with_fullscan) {
+      options.delete_obsolete_files_period_micros = 0;
+    }
+
+    // open DB
+    DB* db;
+    Status s = DB::Open(options, FLAGS_db, &db);
+    if (!s.ok()) {
+      fprintf(stderr, "Can't open database: %s\n", s.ToString().c_str());
+      std::abort();
+    }
+    db_.reset(db);
+  }
+
+  void WriteThread() {
+    std::mt19937 rng(static_cast<unsigned int>(FLAGS_seed));
+    std::uniform_real_distribution<double> dist(0, 1);
+
+    auto random_string = [](std::mt19937& r, int len) {
+      std::uniform_int_distribution<int> char_dist('a', 'z');
+      std::string ret;
+      for (int i = 0; i < len; ++i) {
+        ret += static_cast<char>(char_dist(r));
+      }
+      return ret;
+    };
+
+    while (!stop_.load(std::memory_order_relaxed)) {
+      std::string prefix;
+      prefix.resize(kPrefixSize);
+      for (int i = 0; i < kPrefixSize; ++i) {
+        prefix[i] = key_prefix_[i].load(std::memory_order_relaxed);
+      }
+      auto key = prefix + random_string(rng, FLAGS_key_size - kPrefixSize);
+      auto value = random_string(rng, FLAGS_value_size);
+      WriteOptions woptions;
+      woptions.sync = dist(rng) < FLAGS_sync_probability;
+      auto s = db_->Put(woptions, key, value);
+      if (!s.ok()) {
+        fprintf(stderr, "Write to DB failed: %s\n", s.ToString().c_str());
+        std::abort();
+      }
+    }
+  }
+
+  void IteratorHoldThread() {
+    while (!stop_.load(std::memory_order_relaxed)) {
+      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+      SystemClock::Default()->SleepForMicroseconds(FLAGS_iterator_hold_sec *
+                                                   1000 * 1000LL);
+      for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+      }
+      if (!iterator->status().ok()) {
+        fprintf(stderr, "Iterator statuts not OK: %s\n",
+                iterator->status().ToString().c_str());
+        std::abort();
+      }
+    }
+  }
+
+  void PrefixMutatorThread() {
+    std::mt19937 rng(static_cast<unsigned int>(FLAGS_seed));
+    std::uniform_real_distribution<double> dist(0, 1);
+    std::uniform_int_distribution<int> char_dist('a', 'z');
+    while (!stop_.load(std::memory_order_relaxed)) {
+      SystemClock::Default()->SleepForMicroseconds(
+          static_cast<int>(FLAGS_prefix_mutate_period_sec * 1000 * 1000LL));
+      if (dist(rng) < FLAGS_first_char_mutate_probability) {
+        key_prefix_[0].store(static_cast<char>(char_dist(rng)),
+                             std::memory_order_relaxed);
+      }
+      if (dist(rng) < FLAGS_second_char_mutate_probability) {
+        key_prefix_[1].store(static_cast<char>(char_dist(rng)),
+                             std::memory_order_relaxed);
+      }
+      if (dist(rng) < FLAGS_third_char_mutate_probability) {
+        key_prefix_[2].store(static_cast<char>(char_dist(rng)),
+                             std::memory_order_relaxed);
+      }
+    }
+  }
+
+  int Run() {
+    threads_.emplace_back([&]() { WriteThread(); });
+    threads_.emplace_back([&]() { PrefixMutatorThread(); });
+    threads_.emplace_back([&]() { IteratorHoldThread(); });
+
+    if (FLAGS_runtime_sec == -1) {
+      // infinite runtime, until we get killed
+      while (true) {
+        SystemClock::Default()->SleepForMicroseconds(1000 * 1000);
+      }
+    }
+
+    SystemClock::Default()->SleepForMicroseconds(FLAGS_runtime_sec * 1000 *
+                                                 1000);
+
+    stop_.store(true, std::memory_order_relaxed);
+    for (auto& t : threads_) {
+      t.join();
+    }
+    threads_.clear();
+
+// Skip checking for leaked files in ROCKSDB_LITE since we don't have access to
+// function GetLiveFilesMetaData
+#ifndef ROCKSDB_LITE
+    // let's see if we leaked some files
+    db_->PauseBackgroundWork();
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    std::set<uint64_t> sst_file_numbers;
+    for (const auto& file : metadata) {
+      uint64_t number;
+      FileType type;
+      if (!ParseFileName(file.name, &number, "LOG", &type)) {
+        continue;
+      }
+      if (type == kTableFile) {
+        sst_file_numbers.insert(number);
+      }
+    }
+
+    std::vector<std::string> children;
+    Env::Default()->GetChildren(FLAGS_db, &children);
+    for (const auto& child : children) {
+      uint64_t number;
+      FileType type;
+      if (!ParseFileName(child, &number, "LOG", &type)) {
+        continue;
+      }
+      if (type == kTableFile) {
+        if (sst_file_numbers.find(number) == sst_file_numbers.end()) {
+          fprintf(stderr,
+                  "Found a table file in DB path that should have been "
+                  "deleted: %s\n",
+                  child.c_str());
+          std::abort();
+        }
+      }
+    }
+    db_->ContinueBackgroundWork();
+#endif  // !ROCKSDB_LITE
+
+    return 0;
+  }
+
+ private:
+  // each key is prepended with this prefix. we occasionally change it. third
+  // letter is changed more frequently than second, which is changed more
+  // frequently than the first one.
+  std::atomic<char> key_prefix_[kPrefixSize];
+  std::atomic<bool> stop_;
+  std::vector<port::Thread> threads_;
+  std::unique_ptr<DB> db_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+  ROCKSDB_NAMESPACE::WriteStress write_stress;
+  return write_stress.Run();
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/tools/write_stress_runner.py b/src/rocksdb/tools/write_stress_runner.py
new file mode 100644
index 000000000..f39f79cd4
--- /dev/null
+++ b/src/rocksdb/tools/write_stress_runner.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import random
+
+import subprocess
+import sys
+import time
+
+
+def generate_runtimes(total_runtime):
+    # combination of short runtimes and long runtimes, with heavier
+    # weight on short runtimes
+    possible_runtimes_sec = list(range(1, 10)) + list(range(1, 20)) + [100, 1000]
+    runtimes = []
+    while total_runtime > 0:
+        chosen = random.choice(possible_runtimes_sec)
+        chosen = min(chosen, total_runtime)
+        runtimes.append(chosen)
+        total_runtime -= chosen
+    return runtimes
+
+
+def main(args):
+    runtimes = generate_runtimes(int(args.runtime_sec))
+    print(
+        "Going to execute write stress for " + str(runtimes)
+    )  # noqa: E999 T25377293 Grandfathered in
+    first_time = True
+
+    for runtime in runtimes:
+        kill = random.choice([False, True])
+
+        cmd = "./write_stress --runtime_sec=" + ("-1" if kill else str(runtime))
+
+        if len(args.db) > 0:
+            cmd = cmd + " --db=" + args.db
+
+        if first_time:
+            first_time = False
+        else:
+            # use current db
+            cmd = cmd + " --destroy_db=false"
+        if random.choice([False, True]):
+            cmd = cmd + " --delete_obsolete_files_with_fullscan=true"
+        if random.choice([False, True]):
+            cmd = cmd + " --low_open_files_mode=true"
+
+        print(
+            "Running write_stress for %d seconds (%s): %s"
+            % (runtime, ("kill-mode" if kill else "clean-shutdown-mode"), cmd)
+        )
+
+        child = subprocess.Popen([cmd], shell=True)
+        killtime = time.time() + runtime
+        while not kill or time.time() < killtime:
+            time.sleep(1)
+            if child.poll() is not None:
+                if child.returncode == 0:
+                    break
+                else:
+                    print(
+                        "ERROR: write_stress died with exitcode=%d\n" % child.returncode
+                    )
+                    sys.exit(1)
+        if kill:
+            child.kill()
+        # breathe
+        time.sleep(3)
+
+
+if __name__ == "__main__":
+    random.seed(time.time())
+    parser = argparse.ArgumentParser(
+        description="This script runs and kills \
+        write_stress multiple times"
+    )
+    parser.add_argument("--runtime_sec", default="1000")
+    parser.add_argument("--db", default="")
+    args = parser.parse_args()
+    main(args)
diff --git a/src/rocksdb/trace_replay/block_cache_tracer.cc b/src/rocksdb/trace_replay/block_cache_tracer.cc
new file mode 100644
index 000000000..508596913
--- /dev/null
+++ b/src/rocksdb/trace_replay/block_cache_tracer.cc
@@ -0,0 +1,504 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "trace_replay/block_cache_tracer.h"
+
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/trace_record.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+bool ShouldTrace(const Slice& block_key,
+                 const BlockCacheTraceOptions& trace_options) {
+  if (trace_options.sampling_frequency == 0 ||
+      trace_options.sampling_frequency == 1) {
+    return true;
+  }
+  // We use spatial downsampling so that we have a complete access history for a
+  // block.
+  return 0 == GetSliceRangedNPHash(block_key, trace_options.sampling_frequency);
+}
+}  // namespace
+
+const uint64_t kMicrosInSecond = 1000 * 1000;
+const uint64_t kSecondInMinute = 60;
+const uint64_t kSecondInHour = 3600;
+const std::string BlockCacheTraceHelper::kUnknownColumnFamilyName =
+    "UnknownColumnFamily";
+const uint64_t BlockCacheTraceRecord::kReservedGetId = 0;
+const uint64_t BlockCacheTraceHelper::kReservedGetId = 0;
+
+bool BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
+    TraceType block_type, TableReaderCaller caller) {
+  return (block_type == TraceType::kBlockTraceDataBlock) &&
+         IsGetOrMultiGet(caller);
+}
+
+bool BlockCacheTraceHelper::IsGetOrMultiGet(TableReaderCaller caller) {
+  return caller == TableReaderCaller::kUserGet ||
+         caller == TableReaderCaller::kUserMultiGet;
+}
+
+bool BlockCacheTraceHelper::IsUserAccess(TableReaderCaller caller) {
+  return caller == TableReaderCaller::kUserGet ||
+         caller == TableReaderCaller::kUserMultiGet ||
+         caller == TableReaderCaller::kUserIterator ||
+         caller == TableReaderCaller::kUserApproximateSize ||
+         caller == TableReaderCaller::kUserVerifyChecksum;
+}
+
+std::string BlockCacheTraceHelper::ComputeRowKey(
+    const BlockCacheTraceRecord& access) {
+  if (!IsGetOrMultiGet(access.caller)) {
+    return "";
+  }
+  Slice key = ExtractUserKey(access.referenced_key);
+  return std::to_string(access.sst_fd_number) + "_" + key.ToString();
+}
+
+uint64_t BlockCacheTraceHelper::GetTableId(
+    const BlockCacheTraceRecord& access) {
+  if (!IsGetOrMultiGet(access.caller) || access.referenced_key.size() < 4) {
+    return 0;
+  }
+  return static_cast<uint64_t>(DecodeFixed32(access.referenced_key.data())) + 1;
+}
+
+uint64_t BlockCacheTraceHelper::GetSequenceNumber(
+    const BlockCacheTraceRecord& access) {
+  if (!IsGetOrMultiGet(access.caller)) {
+    return 0;
+  }
+  return access.get_from_user_specified_snapshot
+             ? 1 + GetInternalKeySeqno(access.referenced_key)
+             : 0;
+}
+
+uint64_t BlockCacheTraceHelper::GetBlockOffsetInFile(
+    const BlockCacheTraceRecord& access) {
+  Slice input(access.block_key);
+  uint64_t offset = 0;
+  while (true) {
+    uint64_t tmp = 0;
+    if (GetVarint64(&input, &tmp)) {
+      offset = tmp;
+    } else {
+      break;
+    }
+  }
+  return offset;
+}
+
+BlockCacheTraceWriterImpl::BlockCacheTraceWriterImpl(
+    SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options,
+    std::unique_ptr<TraceWriter>&& trace_writer)
+    : clock_(clock),
+      trace_options_(trace_options),
+      trace_writer_(std::move(trace_writer)) {}
+
+Status BlockCacheTraceWriterImpl::WriteBlockAccess(
+    const BlockCacheTraceRecord& record, const Slice& block_key,
+    const Slice& cf_name, const Slice& referenced_key) {
+  uint64_t trace_file_size = trace_writer_->GetFileSize();
+  if (trace_file_size > trace_options_.max_trace_file_size) {
+    return Status::OK();
+  }
+  Trace trace;
+  trace.ts = record.access_timestamp;
+  trace.type = record.block_type;
+  PutLengthPrefixedSlice(&trace.payload, block_key);
+  PutFixed64(&trace.payload, record.block_size);
+  PutFixed64(&trace.payload, record.cf_id);
+  PutLengthPrefixedSlice(&trace.payload, cf_name);
+  PutFixed32(&trace.payload, record.level);
+  PutFixed64(&trace.payload, record.sst_fd_number);
+  trace.payload.push_back(record.caller);
+  trace.payload.push_back(record.is_cache_hit);
+  trace.payload.push_back(record.no_insert);
+  if (BlockCacheTraceHelper::IsGetOrMultiGet(record.caller)) {
+    PutFixed64(&trace.payload, record.get_id);
+    trace.payload.push_back(record.get_from_user_specified_snapshot);
+    PutLengthPrefixedSlice(&trace.payload, referenced_key);
+  }
+  if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(record.block_type,
+                                                        record.caller)) {
+    PutFixed64(&trace.payload, record.referenced_data_size);
+    PutFixed64(&trace.payload, record.num_keys_in_block);
+    trace.payload.push_back(record.referenced_key_exist_in_block);
+  }
+  std::string encoded_trace;
+  TracerHelper::EncodeTrace(trace, &encoded_trace);
+  return trace_writer_->Write(encoded_trace);
+}
+
+Status BlockCacheTraceWriterImpl::WriteHeader() {
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = TraceType::kTraceBegin;
+  PutLengthPrefixedSlice(&trace.payload, kTraceMagic);
+  PutFixed32(&trace.payload, kMajorVersion);
+  PutFixed32(&trace.payload, kMinorVersion);
+  std::string encoded_trace;
+  TracerHelper::EncodeTrace(trace, &encoded_trace);
+  return trace_writer_->Write(encoded_trace);
+}
+
+BlockCacheTraceReader::BlockCacheTraceReader(
+    std::unique_ptr<TraceReader>&& reader)
+    : trace_reader_(std::move(reader)) {}
+
+Status BlockCacheTraceReader::ReadHeader(BlockCacheTraceHeader* header) {
+  assert(header != nullptr);
+  std::string encoded_trace;
+  Status s = trace_reader_->Read(&encoded_trace);
+  if (!s.ok()) {
+    return s;
+  }
+  Trace trace;
+  s = TracerHelper::DecodeTrace(encoded_trace, &trace);
+  if (!s.ok()) {
+    return s;
+  }
+  header->start_time = trace.ts;
+  Slice enc_slice = Slice(trace.payload);
+  Slice magnic_number;
+  if (!GetLengthPrefixedSlice(&enc_slice, &magnic_number)) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: Failed to read the magic number.");
+  }
+  if (magnic_number.ToString() != kTraceMagic) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: Magic number does not match.");
+  }
+  if (!GetFixed32(&enc_slice, &header->rocksdb_major_version)) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: Failed to read rocksdb major "
+        "version number.");
+  }
+  if (!GetFixed32(&enc_slice, &header->rocksdb_minor_version)) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: Failed to read rocksdb minor "
+        "version number.");
+  }
+  // We should have retrieved all information in the header.
+  if (!enc_slice.empty()) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: The length of header is too "
+        "long.");
+  }
+  return Status::OK();
+}
+
+Status BlockCacheTraceReader::ReadAccess(BlockCacheTraceRecord* record) {
+  assert(record);
+  std::string encoded_trace;
+  Status s = trace_reader_->Read(&encoded_trace);
+  if (!s.ok()) {
+    return s;
+  }
+  Trace trace;
+  s = TracerHelper::DecodeTrace(encoded_trace, &trace);
+  if (!s.ok()) {
+    return s;
+  }
+  record->access_timestamp = trace.ts;
+  record->block_type = trace.type;
+  Slice enc_slice = Slice(trace.payload);
+
+  const unsigned int kCharSize = 1;
+
+  Slice block_key;
+  if (!GetLengthPrefixedSlice(&enc_slice, &block_key)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read block key.");
+  }
+  record->block_key = block_key.ToString();
+  if (!GetFixed64(&enc_slice, &record->block_size)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read block size.");
+  }
+  if (!GetFixed64(&enc_slice, &record->cf_id)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read column family ID.");
+  }
+  Slice cf_name;
+  if (!GetLengthPrefixedSlice(&enc_slice, &cf_name)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read column family name.");
+  }
+  record->cf_name = cf_name.ToString();
+  if (!GetFixed32(&enc_slice, &record->level)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read level.");
+  }
+  if (!GetFixed64(&enc_slice, &record->sst_fd_number)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read SST file number.");
+  }
+  if (enc_slice.empty()) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read caller.");
+  }
+  record->caller = static_cast<TableReaderCaller>(enc_slice[0]);
+  enc_slice.remove_prefix(kCharSize);
+  if (enc_slice.empty()) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read is_cache_hit.");
+  }
+  record->is_cache_hit = static_cast<char>(enc_slice[0]);
+  enc_slice.remove_prefix(kCharSize);
+  if (enc_slice.empty()) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read no_insert.");
+  }
+  record->no_insert = static_cast<char>(enc_slice[0]);
+  enc_slice.remove_prefix(kCharSize);
+  if (BlockCacheTraceHelper::IsGetOrMultiGet(record->caller)) {
+    if (!GetFixed64(&enc_slice, &record->get_id)) {
+      return Status::Incomplete(
+          "Incomplete access record: Failed to read the get id.");
+    }
+    if (enc_slice.empty()) {
+      return Status::Incomplete(
+          "Incomplete access record: Failed to read "
+          "get_from_user_specified_snapshot.");
+    }
+    record->get_from_user_specified_snapshot = static_cast<char>(enc_slice[0]);
+    enc_slice.remove_prefix(kCharSize);
+    Slice referenced_key;
+    if (!GetLengthPrefixedSlice(&enc_slice, &referenced_key)) {
+      return Status::Incomplete(
+          "Incomplete access record: Failed to read the referenced key.");
+    }
+    record->referenced_key = referenced_key.ToString();
+  }
+  if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(record->block_type,
+                                                        record->caller)) {
+    if (!GetFixed64(&enc_slice, &record->referenced_data_size)) {
+      return Status::Incomplete(
+          "Incomplete access record: Failed to read the referenced data size.");
+    }
+    if (!GetFixed64(&enc_slice, &record->num_keys_in_block)) {
+      return Status::Incomplete(
+          "Incomplete access record: Failed to read the number of keys in the "
+          "block.");
+    }
+    if (enc_slice.empty()) {
+      return Status::Incomplete(
+          "Incomplete access record: Failed to read "
+          "referenced_key_exist_in_block.");
+    }
+    record->referenced_key_exist_in_block = static_cast<char>(enc_slice[0]);
+  }
+  return Status::OK();
+}
+
+BlockCacheHumanReadableTraceWriter::~BlockCacheHumanReadableTraceWriter() {
+  if (human_readable_trace_file_writer_) {
+    human_readable_trace_file_writer_->Flush().PermitUncheckedError();
+    human_readable_trace_file_writer_->Close().PermitUncheckedError();
+  }
+}
+
+Status BlockCacheHumanReadableTraceWriter::NewWritableFile(
+    const std::string& human_readable_trace_file_path,
+    ROCKSDB_NAMESPACE::Env* env) {
+  if (human_readable_trace_file_path.empty()) {
+    return Status::InvalidArgument(
+        "The provided human_readable_trace_file_path is null.");
+  }
+  return env->NewWritableFile(human_readable_trace_file_path,
+                              &human_readable_trace_file_writer_, EnvOptions());
+}
+
+Status BlockCacheHumanReadableTraceWriter::WriteHumanReadableTraceRecord(
+    const BlockCacheTraceRecord& access, uint64_t block_id,
+    uint64_t get_key_id) {
+  if (!human_readable_trace_file_writer_) {
+    return Status::OK();
+  }
+  int ret = snprintf(
+      trace_record_buffer_, sizeof(trace_record_buffer_),
+      "%" PRIu64 ",%" PRIu64 ",%u,%" PRIu64 ",%" PRIu64 ",%s,%" PRIu32
+      ",%" PRIu64 ",%u,%u,%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%u,%u,%" PRIu64
+      ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n",
+      access.access_timestamp, block_id, access.block_type, access.block_size,
+      access.cf_id, access.cf_name.c_str(), access.level, access.sst_fd_number,
+      access.caller, access.no_insert, access.get_id, get_key_id,
+      access.referenced_data_size, access.is_cache_hit,
+      access.referenced_key_exist_in_block, access.num_keys_in_block,
+      BlockCacheTraceHelper::GetTableId(access),
+      BlockCacheTraceHelper::GetSequenceNumber(access),
+      static_cast<uint64_t>(access.block_key.size()),
+      static_cast<uint64_t>(access.referenced_key.size()),
+      BlockCacheTraceHelper::GetBlockOffsetInFile(access));
+  if (ret < 0) {
+    return Status::IOError("failed to format the output");
+  }
+  std::string printout(trace_record_buffer_);
+  return human_readable_trace_file_writer_->Append(printout);
+}
+
+BlockCacheHumanReadableTraceReader::BlockCacheHumanReadableTraceReader(
+    const std::string& trace_file_path)
+    : BlockCacheTraceReader(/*trace_reader=*/nullptr) {
+  human_readable_trace_reader_.open(trace_file_path, std::ifstream::in);
+}
+
+BlockCacheHumanReadableTraceReader::~BlockCacheHumanReadableTraceReader() {
+  human_readable_trace_reader_.close();
+}
+
+Status BlockCacheHumanReadableTraceReader::ReadHeader(
+    BlockCacheTraceHeader* /*header*/) {
+  return Status::OK();
+}
+
+Status BlockCacheHumanReadableTraceReader::ReadAccess(
+    BlockCacheTraceRecord* record) {
+  std::string line;
+  if (!std::getline(human_readable_trace_reader_, line)) {
+    return Status::Incomplete("No more records to read.");
+  }
+  std::stringstream ss(line);
+  std::vector<std::string> record_strs;
+  while (ss.good()) {
+    std::string substr;
+    getline(ss, substr, ',');
+    record_strs.push_back(substr);
+  }
+  if (record_strs.size() != 21) {
+    return Status::Incomplete("Records format is wrong.");
+  }
+
+  record->access_timestamp = ParseUint64(record_strs[0]);
+  uint64_t block_key = ParseUint64(record_strs[1]);
+  record->block_type = static_cast<TraceType>(ParseUint64(record_strs[2]));
+  record->block_size = ParseUint64(record_strs[3]);
+  record->cf_id = ParseUint64(record_strs[4]);
+  record->cf_name = record_strs[5];
+  record->level = static_cast<uint32_t>(ParseUint64(record_strs[6]));
+  record->sst_fd_number = ParseUint64(record_strs[7]);
+  record->caller = static_cast<TableReaderCaller>(ParseUint64(record_strs[8]));
+  record->no_insert = static_cast<char>(ParseUint64(record_strs[9]));
+  record->get_id = ParseUint64(record_strs[10]);
+  uint64_t get_key_id = ParseUint64(record_strs[11]);
+
+  record->referenced_data_size = ParseUint64(record_strs[12]);
+  record->is_cache_hit = static_cast<char>(ParseUint64(record_strs[13]));
+  record->referenced_key_exist_in_block =
+      static_cast<char>(ParseUint64(record_strs[14]));
+  record->num_keys_in_block = ParseUint64(record_strs[15]);
+  uint64_t table_id = ParseUint64(record_strs[16]);
+  if (table_id > 0) {
+    // Decrement since valid table id in the trace file equals traced table id
+    // + 1.
+    table_id -= 1;
+  }
+  uint64_t get_sequence_number = ParseUint64(record_strs[17]);
+  if (get_sequence_number > 0) {
+    record->get_from_user_specified_snapshot = true;
+    // Decrement since valid seq number in the trace file equals traced seq
+    // number + 1.
+    get_sequence_number -= 1;
+  }
+  uint64_t block_key_size = ParseUint64(record_strs[18]);
+  uint64_t get_key_size = ParseUint64(record_strs[19]);
+  uint64_t block_offset = ParseUint64(record_strs[20]);
+
+  std::string tmp_block_key;
+  PutVarint64(&tmp_block_key, block_key);
+  PutVarint64(&tmp_block_key, block_offset);
+  // Append 1 until the size is the same as traced block key size.
+  while (record->block_key.size() < block_key_size - tmp_block_key.size()) {
+    record->block_key += "1";
+  }
+  record->block_key += tmp_block_key;
+
+  if (get_key_id != 0) {
+    std::string tmp_get_key;
+    PutFixed64(&tmp_get_key, get_key_id);
+    PutFixed64(&tmp_get_key, get_sequence_number << 8);
+    PutFixed32(&record->referenced_key, static_cast<uint32_t>(table_id));
+    // Append 1 until the size is the same as traced key size.
+    while (record->referenced_key.size() < get_key_size - tmp_get_key.size()) {
+      record->referenced_key += "1";
+    }
+    record->referenced_key += tmp_get_key;
+  }
+  return Status::OK();
+}
+
+BlockCacheTracer::BlockCacheTracer() { writer_.store(nullptr); }
+
+BlockCacheTracer::~BlockCacheTracer() { EndTrace(); }
+
+Status BlockCacheTracer::StartTrace(
+    const BlockCacheTraceOptions& trace_options,
+    std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) {
+  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
+  if (writer_.load()) {
+    return Status::Busy();
+  }
+  get_id_counter_.store(1);
+  trace_options_ = trace_options;
+  writer_.store(trace_writer.release());
+  return writer_.load()->WriteHeader();
+}
+
+void BlockCacheTracer::EndTrace() {
+  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
+  if (!writer_.load()) {
+    return;
+  }
+  delete writer_.load();
+  writer_.store(nullptr);
+}
+
+Status BlockCacheTracer::WriteBlockAccess(const BlockCacheTraceRecord& record,
+                                          const Slice& block_key,
+                                          const Slice& cf_name,
+                                          const Slice& referenced_key) {
+  if (!writer_.load() || !ShouldTrace(block_key, trace_options_)) {
+    return Status::OK();
+  }
+  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
+  if (!writer_.load()) {
+    return Status::OK();
+  }
+  return writer_.load()->WriteBlockAccess(record, block_key, cf_name,
+                                          referenced_key);
+}
+
+uint64_t BlockCacheTracer::NextGetId() {
+  if (!writer_.load(std::memory_order_relaxed)) {
+    return BlockCacheTraceHelper::kReservedGetId;
+  }
+  uint64_t prev_value = get_id_counter_.fetch_add(1);
+  if (prev_value == BlockCacheTraceHelper::kReservedGetId) {
+    // fetch and add again.
+    return get_id_counter_.fetch_add(1);
+  }
+  return prev_value;
+}
+
+std::unique_ptr<BlockCacheTraceWriter> NewBlockCacheTraceWriter(
+    SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options,
+    std::unique_ptr<TraceWriter>&& trace_writer) {
+  return std::unique_ptr<BlockCacheTraceWriter>(new BlockCacheTraceWriterImpl(
+      clock, trace_options, std::move(trace_writer)));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/trace_replay/block_cache_tracer.h b/src/rocksdb/trace_replay/block_cache_tracer.h
new file mode 100644
index 000000000..4a749608f
--- /dev/null
+++ b/src/rocksdb/trace_replay/block_cache_tracer.h
@@ -0,0 +1,239 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <fstream>
+
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/block_cache_trace_writer.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_reader_caller.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "trace_replay/trace_replay.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Env;
+class SystemClock;
+
+extern const uint64_t kMicrosInSecond;
+extern const uint64_t kSecondInMinute;
+extern const uint64_t kSecondInHour;
+
+struct BlockCacheTraceRecord;
+
+class BlockCacheTraceHelper {
+ public:
+  static bool IsGetOrMultiGetOnDataBlock(TraceType block_type,
+                                         TableReaderCaller caller);
+  static bool IsGetOrMultiGet(TableReaderCaller caller);
+  static bool IsUserAccess(TableReaderCaller caller);
+  // Row key is a concatenation of the access's fd_number and the referenced
+  // user key.
+  static std::string ComputeRowKey(const BlockCacheTraceRecord& access);
+  // The first four bytes of the referenced key in a Get request is the table
+  // id.
+  static uint64_t GetTableId(const BlockCacheTraceRecord& access);
+  // The sequence number of a get request is the last part of the referenced
+  // key.
+  static uint64_t GetSequenceNumber(const BlockCacheTraceRecord& access);
+  // Block offset in a file is the last varint64 in the block key.
+  static uint64_t GetBlockOffsetInFile(const BlockCacheTraceRecord& access);
+
+  static const std::string kUnknownColumnFamilyName;
+  static const uint64_t kReservedGetId;
+};
+
+// Lookup context for tracing block cache accesses.
+// We trace block accesses at five places:
+// 1. BlockBasedTable::GetFilter
+// 2. BlockBasedTable::GetUncompressedDict.
+// 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index,
+// and range deletion block.)
+// 4. BlockBasedTable::Get. (To trace the referenced key and whether the
+// referenced key exists in a fetched data block.)
+// 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the
+// referenced key exists in a fetched data block.)
+// The context is created at:
+// 1. BlockBasedTable::Get. (kUserGet)
+// 2. BlockBasedTable::MultiGet. (kUserMGet)
+// 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or
+// external SST ingestion calls this function.)
+// 4. BlockBasedTable::Open. (kPrefetch)
+// 5. Index/Filter::CacheDependencies. (kPrefetch)
+// 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or
+// kUserApproximateSize).
+struct BlockCacheLookupContext {
+  BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {}
+  BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id,
+                          bool _get_from_user_specified_snapshot)
+      : caller(_caller),
+        get_id(_get_id),
+        get_from_user_specified_snapshot(_get_from_user_specified_snapshot) {}
+  const TableReaderCaller caller;
+  // These are populated when we perform lookup/insert on block cache. The block
+  // cache tracer uses these inforation when logging the block access at
+  // BlockBasedTable::GET and BlockBasedTable::MultiGet.
+  bool is_cache_hit = false;
+  bool no_insert = false;
+  TraceType block_type = TraceType::kTraceMax;
+  uint64_t block_size = 0;
+  std::string block_key;
+  uint64_t num_keys_in_block = 0;
+  // The unique id associated with Get and MultiGet. This enables us to track
+  // how many blocks a Get/MultiGet request accesses. We can also measure the
+  // impact of row cache vs block cache.
+  uint64_t get_id = 0;
+  std::string referenced_key;
+  bool get_from_user_specified_snapshot = false;
+
+  void FillLookupContext(bool _is_cache_hit, bool _no_insert,
+                         TraceType _block_type, uint64_t _block_size,
+                         const std::string& _block_key,
+                         uint64_t _num_keys_in_block) {
+    is_cache_hit = _is_cache_hit;
+    no_insert = _no_insert;
+    block_type = _block_type;
+    block_size = _block_size;
+    block_key = _block_key;
+    num_keys_in_block = _num_keys_in_block;
+  }
+};
+
+struct BlockCacheTraceHeader {
+  uint64_t start_time;
+  uint32_t rocksdb_major_version;
+  uint32_t rocksdb_minor_version;
+};
+
+// BlockCacheTraceWriter captures all RocksDB block cache accesses using a
+// user-provided TraceWriter. Every RocksDB operation is written as a single
+// trace. Each trace will have a timestamp and type, followed by the trace
+// payload.
+class BlockCacheTraceWriterImpl : public BlockCacheTraceWriter {
+ public:
+  BlockCacheTraceWriterImpl(SystemClock* clock,
+                            const BlockCacheTraceWriterOptions& trace_options,
+                            std::unique_ptr<TraceWriter>&& trace_writer);
+  ~BlockCacheTraceWriterImpl() = default;
+  // No copy and move.
+  BlockCacheTraceWriterImpl(const BlockCacheTraceWriterImpl&) = delete;
+  BlockCacheTraceWriterImpl& operator=(const BlockCacheTraceWriterImpl&) =
+      delete;
+  BlockCacheTraceWriterImpl(BlockCacheTraceWriterImpl&&) = delete;
+  BlockCacheTraceWriterImpl& operator=(BlockCacheTraceWriterImpl&&) = delete;
+
+  // Pass Slice references to avoid copy.
+  Status WriteBlockAccess(const BlockCacheTraceRecord& record,
+                          const Slice& block_key, const Slice& cf_name,
+                          const Slice& referenced_key);
+
+  // Write a trace header at the beginning, typically on initiating a trace,
+  // with some metadata like a magic number and RocksDB version.
+  Status WriteHeader();
+
+ private:
+  SystemClock* clock_;
+  BlockCacheTraceWriterOptions trace_options_;
+  std::unique_ptr<TraceWriter> trace_writer_;
+};
+
+// Write a trace record in human readable format, see
+// https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format
+// for details.
+class BlockCacheHumanReadableTraceWriter {
+ public:
+  ~BlockCacheHumanReadableTraceWriter();
+
+  Status NewWritableFile(const std::string& human_readable_trace_file_path,
+                         ROCKSDB_NAMESPACE::Env* env);
+
+  Status WriteHumanReadableTraceRecord(const BlockCacheTraceRecord& access,
+                                       uint64_t block_id, uint64_t get_key_id);
+
+ private:
+  char trace_record_buffer_[1024 * 1024];
+  std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>
+      human_readable_trace_file_writer_;
+};
+
+// BlockCacheTraceReader helps read the trace file generated by
+// BlockCacheTraceWriter using a user provided TraceReader.
+class BlockCacheTraceReader {
+ public:
+  BlockCacheTraceReader(std::unique_ptr<TraceReader>&& reader);
+  virtual ~BlockCacheTraceReader() = default;
+  // No copy and move.
+  BlockCacheTraceReader(const BlockCacheTraceReader&) = delete;
+  BlockCacheTraceReader& operator=(const BlockCacheTraceReader&) = delete;
+  BlockCacheTraceReader(BlockCacheTraceReader&&) = delete;
+  BlockCacheTraceReader& operator=(BlockCacheTraceReader&&) = delete;
+
+  Status ReadHeader(BlockCacheTraceHeader* header);
+
+  Status ReadAccess(BlockCacheTraceRecord* record);
+
+ private:
+  std::unique_ptr<TraceReader> trace_reader_;
+};
+
+// Read a trace record in human readable format, see
+// https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format
+// for detailed.
+class BlockCacheHumanReadableTraceReader : public BlockCacheTraceReader {
+ public:
+  BlockCacheHumanReadableTraceReader(const std::string& trace_file_path);
+
+  ~BlockCacheHumanReadableTraceReader();
+
+  Status ReadHeader(BlockCacheTraceHeader* header);
+
+  Status ReadAccess(BlockCacheTraceRecord* record);
+
+ private:
+  std::ifstream human_readable_trace_reader_;
+};
+
+// A block cache tracer. It downsamples the accesses according to
+// trace_options and uses BlockCacheTraceWriter to write the access record to
+// the trace file.
+class BlockCacheTracer {
+ public:
+  BlockCacheTracer();
+  ~BlockCacheTracer();
+  // No copy and move.
+  BlockCacheTracer(const BlockCacheTracer&) = delete;
+  BlockCacheTracer& operator=(const BlockCacheTracer&) = delete;
+  BlockCacheTracer(BlockCacheTracer&&) = delete;
+  BlockCacheTracer& operator=(BlockCacheTracer&&) = delete;
+
+  // Start writing block cache accesses to the trace_writer.
+  Status StartTrace(const BlockCacheTraceOptions& trace_options,
+                    std::unique_ptr<BlockCacheTraceWriter>&& trace_writer);
+
+  // Stop writing block cache accesses to the trace_writer.
+  void EndTrace();
+
+  bool is_tracing_enabled() const {
+    return writer_.load(std::memory_order_relaxed);
+  }
+
+  Status WriteBlockAccess(const BlockCacheTraceRecord& record,
+                          const Slice& block_key, const Slice& cf_name,
+                          const Slice& referenced_key);
+
+  // GetId cycles from 1 to std::numeric_limits<uint64_t>::max().
+  uint64_t NextGetId();
+
+ private:
+  BlockCacheTraceOptions trace_options_;
+  // A mutex protects the writer_.
+  InstrumentedMutex trace_writer_mutex_;
+  std::atomic<BlockCacheTraceWriter*> writer_;
+  std::atomic<uint64_t> get_id_counter_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/trace_replay/block_cache_tracer_test.cc b/src/rocksdb/trace_replay/block_cache_tracer_test.cc
new file mode 100644
index 000000000..f9d0773bf
--- /dev/null
+++ b/src/rocksdb/trace_replay/block_cache_tracer_test.cc
@@ -0,0 +1,421 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "trace_replay/block_cache_tracer.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const uint64_t kBlockSize = 1024;
+const std::string kBlockKeyPrefix = "test-block-";
+const uint32_t kCFId = 0;
+const uint32_t kLevel = 1;
+const uint64_t kSSTFDNumber = 100;
+const std::string kRefKeyPrefix = "test-get-";
+const uint64_t kNumKeysInBlock = 1024;
+const uint64_t kReferencedDataSize = 10;
+}  // namespace
+
+class BlockCacheTracerTest : public testing::Test {
+ public:
+  BlockCacheTracerTest() {
+    test_path_ = test::PerThreadDBPath("block_cache_tracer_test");
+    env_ = ROCKSDB_NAMESPACE::Env::Default();
+    clock_ = env_->GetSystemClock().get();
+    EXPECT_OK(env_->CreateDir(test_path_));
+    trace_file_path_ = test_path_ + "/block_cache_trace";
+  }
+
+  ~BlockCacheTracerTest() override {
+    EXPECT_OK(env_->DeleteFile(trace_file_path_));
+    EXPECT_OK(env_->DeleteDir(test_path_));
+  }
+
+  TableReaderCaller GetCaller(uint32_t key_id) {
+    uint32_t n = key_id % 5;
+    switch (n) {
+      case 0:
+        return TableReaderCaller::kPrefetch;
+      case 1:
+        return TableReaderCaller::kCompaction;
+      case 2:
+        return TableReaderCaller::kUserGet;
+      case 3:
+        return TableReaderCaller::kUserMultiGet;
+      case 4:
+        return TableReaderCaller::kUserIterator;
+    }
+    assert(false);
+    return TableReaderCaller::kMaxBlockCacheLookupCaller;
+  }
+
+  void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id,
+                        TraceType block_type, uint32_t nblocks) {
+    assert(writer);
+    for (uint32_t i = 0; i < nblocks; i++) {
+      uint32_t key_id = from_key_id + i;
+      BlockCacheTraceRecord record;
+      record.block_type = block_type;
+      record.block_size = kBlockSize + key_id;
+      record.block_key = (kBlockKeyPrefix + std::to_string(key_id));
+      record.access_timestamp = clock_->NowMicros();
+      record.cf_id = kCFId;
+      record.cf_name = kDefaultColumnFamilyName;
+      record.caller = GetCaller(key_id);
+      record.level = kLevel;
+      record.sst_fd_number = kSSTFDNumber + key_id;
+      record.is_cache_hit = false;
+      record.no_insert = false;
+      // Provide get_id for all callers. The writer should only write get_id
+      // when the caller is either GET or MGET.
+      record.get_id = key_id + 1;
+      record.get_from_user_specified_snapshot = true;
+      // Provide these fields for all block types.
+      // The writer should only write these fields for data blocks and the
+      // caller is either GET or MGET.
+      record.referenced_key = (kRefKeyPrefix + std::to_string(key_id));
+      record.referenced_key_exist_in_block = true;
+      record.num_keys_in_block = kNumKeysInBlock;
+      record.referenced_data_size = kReferencedDataSize + key_id;
+      ASSERT_OK(writer->WriteBlockAccess(
+          record, record.block_key, record.cf_name, record.referenced_key));
+    }
+  }
+
+  BlockCacheTraceRecord GenerateAccessRecord() {
+    uint32_t key_id = 0;
+    BlockCacheTraceRecord record;
+    record.block_type = TraceType::kBlockTraceDataBlock;
+    record.block_size = kBlockSize;
+    record.block_key = kBlockKeyPrefix + std::to_string(key_id);
+    record.access_timestamp = clock_->NowMicros();
+    record.cf_id = kCFId;
+    record.cf_name = kDefaultColumnFamilyName;
+    record.caller = GetCaller(key_id);
+    record.level = kLevel;
+    record.sst_fd_number = kSSTFDNumber + key_id;
+    record.is_cache_hit = false;
+    record.no_insert = false;
+    record.referenced_key = kRefKeyPrefix + std::to_string(key_id);
+    record.referenced_key_exist_in_block = true;
+    record.num_keys_in_block = kNumKeysInBlock;
+    return record;
+  }
+
+  void VerifyAccess(BlockCacheTraceReader* reader, uint32_t from_key_id,
+                    TraceType block_type, uint32_t nblocks) {
+    assert(reader);
+    for (uint32_t i = 0; i < nblocks; i++) {
+      uint32_t key_id = from_key_id + i;
+      BlockCacheTraceRecord record;
+      ASSERT_OK(reader->ReadAccess(&record));
+      ASSERT_EQ(block_type, record.block_type);
+      ASSERT_EQ(kBlockSize + key_id, record.block_size);
+      ASSERT_EQ(kBlockKeyPrefix + std::to_string(key_id), record.block_key);
+      ASSERT_EQ(kCFId, record.cf_id);
+      ASSERT_EQ(kDefaultColumnFamilyName, record.cf_name);
+      ASSERT_EQ(GetCaller(key_id), record.caller);
+      ASSERT_EQ(kLevel, record.level);
+      ASSERT_EQ(kSSTFDNumber + key_id, record.sst_fd_number);
+      ASSERT_FALSE(record.is_cache_hit);
+      ASSERT_FALSE(record.no_insert);
+      if (record.caller == TableReaderCaller::kUserGet ||
+          record.caller == TableReaderCaller::kUserMultiGet) {
+        ASSERT_EQ(key_id + 1, record.get_id);
+        ASSERT_TRUE(record.get_from_user_specified_snapshot);
+        ASSERT_EQ(kRefKeyPrefix + std::to_string(key_id),
+                  record.referenced_key);
+      } else {
+        ASSERT_EQ(BlockCacheTraceHelper::kReservedGetId, record.get_id);
+        ASSERT_FALSE(record.get_from_user_specified_snapshot);
+        ASSERT_EQ("", record.referenced_key);
+      }
+      if (block_type == TraceType::kBlockTraceDataBlock &&
+          (record.caller == TableReaderCaller::kUserGet ||
+           record.caller == TableReaderCaller::kUserMultiGet)) {
+        ASSERT_TRUE(record.referenced_key_exist_in_block);
+        ASSERT_EQ(kNumKeysInBlock, record.num_keys_in_block);
+        ASSERT_EQ(kReferencedDataSize + key_id, record.referenced_data_size);
+        continue;
+      }
+      ASSERT_FALSE(record.referenced_key_exist_in_block);
+      ASSERT_EQ(0, record.num_keys_in_block);
+      ASSERT_EQ(0, record.referenced_data_size);
+    }
+  }
+
+  Env* env_;
+  SystemClock* clock_;
+  EnvOptions env_options_;
+  std::string trace_file_path_;
+  std::string test_path_;
+};
+
+TEST_F(BlockCacheTracerTest, AtomicWriteBeforeStartTrace) {
+  BlockCacheTraceRecord record = GenerateAccessRecord();
+  {
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    BlockCacheTracer writer;
+    // The record should be written to the trace_file since StartTrace is not
+    // called.
+    ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name,
+                                      record.referenced_key));
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+  {
+    // Verify trace file contains nothing.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    BlockCacheTraceReader reader(std::move(trace_reader));
+    BlockCacheTraceHeader header;
+    ASSERT_NOK(reader.ReadHeader(&header));
+  }
+}
+
+TEST_F(BlockCacheTracerTest, AtomicWrite) {
+  BlockCacheTraceRecord record = GenerateAccessRecord();
+  {
+    BlockCacheTraceWriterOptions trace_writer_opt;
+    BlockCacheTraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+        NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+                                 std::move(trace_writer));
+    ASSERT_NE(block_cache_trace_writer, nullptr);
+    BlockCacheTracer writer;
+    ASSERT_OK(
+        writer.StartTrace(trace_opt, std::move(block_cache_trace_writer)));
+    ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name,
+                                      record.referenced_key));
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+  {
+    // Verify trace file contains one record.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    BlockCacheTraceReader reader(std::move(trace_reader));
+    BlockCacheTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+    ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+    VerifyAccess(&reader, 0, TraceType::kBlockTraceDataBlock, 1);
+    ASSERT_NOK(reader.ReadAccess(&record));
+  }
+}
+
+TEST_F(BlockCacheTracerTest, ConsecutiveStartTrace) {
+  BlockCacheTraceWriterOptions trace_writer_opt;
+  BlockCacheTraceOptions trace_opt;
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(
+      NewFileTraceWriter(env_, env_options_, trace_file_path_, &trace_writer));
+  std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+      NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+                               std::move(trace_writer));
+  ASSERT_NE(block_cache_trace_writer, nullptr);
+  BlockCacheTracer writer;
+  ASSERT_OK(writer.StartTrace(trace_opt, std::move(block_cache_trace_writer)));
+  ASSERT_NOK(writer.StartTrace(trace_opt, std::move(block_cache_trace_writer)));
+  ASSERT_OK(env_->FileExists(trace_file_path_));
+}
+
+TEST_F(BlockCacheTracerTest, AtomicNoWriteAfterEndTrace) {
+  BlockCacheTraceRecord record = GenerateAccessRecord();
+  {
+    BlockCacheTraceWriterOptions trace_writer_opt;
+    BlockCacheTraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+        NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+                                 std::move(trace_writer));
+    ASSERT_NE(block_cache_trace_writer, nullptr);
+    BlockCacheTracer writer;
+    ASSERT_OK(
+        writer.StartTrace(trace_opt, std::move(block_cache_trace_writer)));
+    ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name,
+                                      record.referenced_key));
+    writer.EndTrace();
+    // Write the record again. This time the record should not be written since
+    // EndTrace is called.
+    ASSERT_OK(writer.WriteBlockAccess(record, record.block_key, record.cf_name,
+                                      record.referenced_key));
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+  {
+    // Verify trace file contains one record.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    BlockCacheTraceReader reader(std::move(trace_reader));
+    BlockCacheTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+    ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+    VerifyAccess(&reader, 0, TraceType::kBlockTraceDataBlock, 1);
+    ASSERT_NOK(reader.ReadAccess(&record));
+  }
+}
+
+TEST_F(BlockCacheTracerTest, NextGetId) {
+  BlockCacheTracer writer;
+  {
+    BlockCacheTraceWriterOptions trace_writer_opt;
+    BlockCacheTraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+        NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+                                 std::move(trace_writer));
+    ASSERT_NE(block_cache_trace_writer, nullptr);
+    // next get id should always return 0 before we call StartTrace.
+    ASSERT_EQ(0, writer.NextGetId());
+    ASSERT_EQ(0, writer.NextGetId());
+    ASSERT_OK(
+        writer.StartTrace(trace_opt, std::move(block_cache_trace_writer)));
+    ASSERT_EQ(1, writer.NextGetId());
+    ASSERT_EQ(2, writer.NextGetId());
+    writer.EndTrace();
+    // next get id should return 0.
+    ASSERT_EQ(0, writer.NextGetId());
+  }
+
+  // Start trace again and next get id should return 1.
+  {
+    BlockCacheTraceWriterOptions trace_writer_opt;
+    BlockCacheTraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+        NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+                                 std::move(trace_writer));
+    ASSERT_NE(block_cache_trace_writer, nullptr);
+    ASSERT_OK(
+        writer.StartTrace(trace_opt, std::move(block_cache_trace_writer)));
+    ASSERT_EQ(1, writer.NextGetId());
+  }
+}
+
+TEST_F(BlockCacheTracerTest, MixedBlocks) {
+  {
+    // Generate a trace file containing a mix of blocks.
+    BlockCacheTraceWriterOptions trace_writer_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+        NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+                                 std::move(trace_writer));
+    ASSERT_NE(block_cache_trace_writer, nullptr);
+    ASSERT_OK(block_cache_trace_writer->WriteHeader());
+    // Write blocks of different types.
+    WriteBlockAccess(block_cache_trace_writer.get(), 0,
+                     TraceType::kBlockTraceUncompressionDictBlock, 10);
+    WriteBlockAccess(block_cache_trace_writer.get(), 10,
+                     TraceType::kBlockTraceDataBlock, 10);
+    WriteBlockAccess(block_cache_trace_writer.get(), 20,
+                     TraceType::kBlockTraceFilterBlock, 10);
+    WriteBlockAccess(block_cache_trace_writer.get(), 30,
+                     TraceType::kBlockTraceIndexBlock, 10);
+    WriteBlockAccess(block_cache_trace_writer.get(), 40,
+                     TraceType::kBlockTraceRangeDeletionBlock, 10);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+
+  {
+    // Verify trace file is generated correctly.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    BlockCacheTraceReader reader(std::move(trace_reader));
+    BlockCacheTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+    ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+    // Read blocks.
+    VerifyAccess(&reader, 0, TraceType::kBlockTraceUncompressionDictBlock, 10);
+    VerifyAccess(&reader, 10, TraceType::kBlockTraceDataBlock, 10);
+    VerifyAccess(&reader, 20, TraceType::kBlockTraceFilterBlock, 10);
+    VerifyAccess(&reader, 30, TraceType::kBlockTraceIndexBlock, 10);
+    VerifyAccess(&reader, 40, TraceType::kBlockTraceRangeDeletionBlock, 10);
+    // Read one more record should report an error.
+    BlockCacheTraceRecord record;
+    ASSERT_NOK(reader.ReadAccess(&record));
+  }
+}
+
+TEST_F(BlockCacheTracerTest, HumanReadableTrace) {
+  BlockCacheTraceRecord record = GenerateAccessRecord();
+  record.get_id = 1;
+  record.referenced_key = "";
+  record.caller = TableReaderCaller::kUserGet;
+  record.get_from_user_specified_snapshot = true;
+  record.referenced_data_size = kReferencedDataSize;
+  PutFixed32(&record.referenced_key, 111);
+  PutLengthPrefixedSlice(&record.referenced_key, "get_key");
+  PutFixed64(&record.referenced_key, 2 << 8);
+  PutLengthPrefixedSlice(&record.block_key, "block_key");
+  PutVarint64(&record.block_key, 333);
+  {
+    // Generate a human readable trace file.
+    BlockCacheHumanReadableTraceWriter writer;
+    ASSERT_OK(writer.NewWritableFile(trace_file_path_, env_));
+    ASSERT_OK(writer.WriteHumanReadableTraceRecord(record, 1, 1));
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+  {
+    BlockCacheHumanReadableTraceReader reader(trace_file_path_);
+    BlockCacheTraceHeader header;
+    BlockCacheTraceRecord read_record;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_OK(reader.ReadAccess(&read_record));
+    ASSERT_EQ(TraceType::kBlockTraceDataBlock, read_record.block_type);
+    ASSERT_EQ(kBlockSize, read_record.block_size);
+    ASSERT_EQ(kCFId, read_record.cf_id);
+    ASSERT_EQ(kDefaultColumnFamilyName, read_record.cf_name);
+    ASSERT_EQ(TableReaderCaller::kUserGet, read_record.caller);
+    ASSERT_EQ(kLevel, read_record.level);
+    ASSERT_EQ(kSSTFDNumber, read_record.sst_fd_number);
+    ASSERT_FALSE(read_record.is_cache_hit);
+    ASSERT_FALSE(read_record.no_insert);
+    ASSERT_EQ(1, read_record.get_id);
+    ASSERT_TRUE(read_record.get_from_user_specified_snapshot);
+    ASSERT_TRUE(read_record.referenced_key_exist_in_block);
+    ASSERT_EQ(kNumKeysInBlock, read_record.num_keys_in_block);
+    ASSERT_EQ(kReferencedDataSize, read_record.referenced_data_size);
+    ASSERT_EQ(record.block_key.size(), read_record.block_key.size());
+    ASSERT_EQ(record.referenced_key.size(), record.referenced_key.size());
+    ASSERT_EQ(112, BlockCacheTraceHelper::GetTableId(read_record));
+    ASSERT_EQ(3, BlockCacheTraceHelper::GetSequenceNumber(read_record));
+    ASSERT_EQ(333, BlockCacheTraceHelper::GetBlockOffsetInFile(read_record));
+    // Read again should fail.
+    ASSERT_NOK(reader.ReadAccess(&read_record));
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/trace_replay/io_tracer.cc b/src/rocksdb/trace_replay/io_tracer.cc
new file mode 100644
index 000000000..a860130f8
--- /dev/null
+++ b/src/rocksdb/trace_replay/io_tracer.cc
@@ -0,0 +1,303 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "trace_replay/io_tracer.h"
+
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+IOTraceWriter::IOTraceWriter(SystemClock* clock,
+                             const TraceOptions& trace_options,
+                             std::unique_ptr<TraceWriter>&& trace_writer)
+    : clock_(clock),
+      trace_options_(trace_options),
+      trace_writer_(std::move(trace_writer)) {}
+
+Status IOTraceWriter::WriteIOOp(const IOTraceRecord& record,
+                                IODebugContext* dbg) {
+  uint64_t trace_file_size = trace_writer_->GetFileSize();
+  if (trace_file_size > trace_options_.max_trace_file_size) {
+    return Status::OK();
+  }
+  Trace trace;
+  trace.ts = record.access_timestamp;
+  trace.type = record.trace_type;
+  PutFixed64(&trace.payload, record.io_op_data);
+  Slice file_operation(record.file_operation);
+  PutLengthPrefixedSlice(&trace.payload, file_operation);
+  PutFixed64(&trace.payload, record.latency);
+  Slice io_status(record.io_status);
+  PutLengthPrefixedSlice(&trace.payload, io_status);
+  Slice file_name(record.file_name);
+  PutLengthPrefixedSlice(&trace.payload, file_name);
+
+  // Each bit in io_op_data stores which corresponding info from IOTraceOp will
+  // be added in the trace. Foreg, if bit at position 1 is set then
+  // IOTraceOp::kIOLen (length) will be logged in the record (Since
+  // IOTraceOp::kIOLen = 1 in the enum). So find all the set positions in
+  // io_op_data one by one and, update corresponsing info in the trace record,
+  // unset that bit to find other set bits until io_op_data = 0.
+  /* Write remaining options based on io_op_data set by file operation */
+  int64_t io_op_data = static_cast<int64_t>(record.io_op_data);
+  while (io_op_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(io_op_data & -io_op_data));
+    switch (set_pos) {
+      case IOTraceOp::kIOFileSize:
+        PutFixed64(&trace.payload, record.file_size);
+        break;
+      case IOTraceOp::kIOLen:
+        PutFixed64(&trace.payload, record.len);
+        break;
+      case IOTraceOp::kIOOffset:
+        PutFixed64(&trace.payload, record.offset);
+        break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
+    io_op_data &= (io_op_data - 1);
+  }
+
+  int64_t trace_data = 0;
+  if (dbg) {
+    trace_data = static_cast<int64_t>(dbg->trace_data);
+  }
+  PutFixed64(&trace.payload, trace_data);
+  while (trace_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(trace_data & -trace_data));
+    switch (set_pos) {
+      case IODebugContext::TraceData::kRequestID: {
+        Slice request_id(dbg->request_id);
+        PutLengthPrefixedSlice(&trace.payload, request_id);
+      } break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
+    trace_data &= (trace_data - 1);
+  }
+
+  std::string encoded_trace;
+  TracerHelper::EncodeTrace(trace, &encoded_trace);
+  return trace_writer_->Write(encoded_trace);
+}
+
+Status IOTraceWriter::WriteHeader() {
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = TraceType::kTraceBegin;
+  PutLengthPrefixedSlice(&trace.payload, kTraceMagic);
+  PutFixed32(&trace.payload, kMajorVersion);
+  PutFixed32(&trace.payload, kMinorVersion);
+  std::string encoded_trace;
+  TracerHelper::EncodeTrace(trace, &encoded_trace);
+  return trace_writer_->Write(encoded_trace);
+}
+
+IOTraceReader::IOTraceReader(std::unique_ptr<TraceReader>&& reader)
+    : trace_reader_(std::move(reader)) {}
+
+Status IOTraceReader::ReadHeader(IOTraceHeader* header) {
+  assert(header != nullptr);
+  std::string encoded_trace;
+  Status s = trace_reader_->Read(&encoded_trace);
+  if (!s.ok()) {
+    return s;
+  }
+  Trace trace;
+  s = TracerHelper::DecodeTrace(encoded_trace, &trace);
+  if (!s.ok()) {
+    return s;
+  }
+  header->start_time = trace.ts;
+  Slice enc_slice = Slice(trace.payload);
+  Slice magic_number;
+  if (!GetLengthPrefixedSlice(&enc_slice, &magic_number)) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: Failed to read the magic number.");
+  }
+  if (magic_number.ToString() != kTraceMagic) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: Magic number does not match.");
+  }
+  if (!GetFixed32(&enc_slice, &header->rocksdb_major_version)) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: Failed to read rocksdb major "
+        "version number.");
+  }
+  if (!GetFixed32(&enc_slice, &header->rocksdb_minor_version)) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: Failed to read rocksdb minor "
+        "version number.");
+  }
+  // We should have retrieved all information in the header.
+  if (!enc_slice.empty()) {
+    return Status::Corruption(
+        "Corrupted header in the trace file: The length of header is too "
+        "long.");
+  }
+  return Status::OK();
+}
+
+Status IOTraceReader::ReadIOOp(IOTraceRecord* record) {
+  assert(record);
+  std::string encoded_trace;
+  Status s = trace_reader_->Read(&encoded_trace);
+  if (!s.ok()) {
+    return s;
+  }
+  Trace trace;
+  s = TracerHelper::DecodeTrace(encoded_trace, &trace);
+  if (!s.ok()) {
+    return s;
+  }
+  record->access_timestamp = trace.ts;
+  record->trace_type = trace.type;
+  Slice enc_slice = Slice(trace.payload);
+
+  if (!GetFixed64(&enc_slice, &record->io_op_data)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read trace data.");
+  }
+  Slice file_operation;
+  if (!GetLengthPrefixedSlice(&enc_slice, &file_operation)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read file operation.");
+  }
+  record->file_operation = file_operation.ToString();
+  if (!GetFixed64(&enc_slice, &record->latency)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read latency.");
+  }
+  Slice io_status;
+  if (!GetLengthPrefixedSlice(&enc_slice, &io_status)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read IO status.");
+  }
+  record->io_status = io_status.ToString();
+  Slice file_name;
+  if (!GetLengthPrefixedSlice(&enc_slice, &file_name)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read file name.");
+  }
+  record->file_name = file_name.ToString();
+
+  // Each bit in io_op_data stores which corresponding info from IOTraceOp will
+  // be added in the trace. Foreg, if bit at position 1 is set then
+  // IOTraceOp::kIOLen (length) will be logged in the record (Since
+  // IOTraceOp::kIOLen = 1 in the enum). So find all the set positions in
+  // io_op_data one by one and, update corresponsing info in the trace record,
+  // unset that bit to find other set bits until io_op_data = 0.
+  /* Read remaining options based on io_op_data set by file operation */
+  // Assuming 63 bits will be used at max.
+  int64_t io_op_data = static_cast<int64_t>(record->io_op_data);
+  while (io_op_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(io_op_data & -io_op_data));
+    switch (set_pos) {
+      case IOTraceOp::kIOFileSize:
+        if (!GetFixed64(&enc_slice, &record->file_size)) {
+          return Status::Incomplete(
+              "Incomplete access record: Failed to read file size.");
+        }
+        break;
+      case IOTraceOp::kIOLen:
+        if (!GetFixed64(&enc_slice, &record->len)) {
+          return Status::Incomplete(
+              "Incomplete access record: Failed to read length.");
+        }
+        break;
+      case IOTraceOp::kIOOffset:
+        if (!GetFixed64(&enc_slice, &record->offset)) {
+          return Status::Incomplete(
+              "Incomplete access record: Failed to read offset.");
+        }
+        break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
+    io_op_data &= (io_op_data - 1);
+  }
+
+  if (!GetFixed64(&enc_slice, &record->trace_data)) {
+    return Status::Incomplete(
+        "Incomplete access record: Failed to read trace op.");
+  }
+  int64_t trace_data = static_cast<int64_t>(record->trace_data);
+  while (trace_data) {
+    // Find the rightmost set bit.
+    uint32_t set_pos = static_cast<uint32_t>(log2(trace_data & -trace_data));
+    switch (set_pos) {
+      case IODebugContext::TraceData::kRequestID: {
+        Slice request_id;
+        if (!GetLengthPrefixedSlice(&enc_slice, &request_id)) {
+          return Status::Incomplete(
+              "Incomplete access record: Failed to request id.");
+        }
+        record->request_id = request_id.ToString();
+      } break;
+      default:
+        assert(false);
+    }
+    // unset the rightmost bit.
+    trace_data &= (trace_data - 1);
+  }
+
+  return Status::OK();
+}
+
+IOTracer::IOTracer() : tracing_enabled(false) { writer_.store(nullptr); }
+
+IOTracer::~IOTracer() { EndIOTrace(); }
+
+Status IOTracer::StartIOTrace(SystemClock* clock,
+                              const TraceOptions& trace_options,
+                              std::unique_ptr<TraceWriter>&& trace_writer) {
+  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
+  if (writer_.load()) {
+    return Status::Busy();
+  }
+  trace_options_ = trace_options;
+  writer_.store(
+      new IOTraceWriter(clock, trace_options, std::move(trace_writer)));
+  tracing_enabled = true;
+  return writer_.load()->WriteHeader();
+}
+
+void IOTracer::EndIOTrace() {
+  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
+  if (!writer_.load()) {
+    return;
+  }
+  delete writer_.load();
+  writer_.store(nullptr);
+  tracing_enabled = false;
+}
+
+void IOTracer::WriteIOOp(const IOTraceRecord& record, IODebugContext* dbg) {
+  if (!writer_.load()) {
+    return;
+  }
+  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
+  if (!writer_.load()) {
+    return;
+  }
+  writer_.load()->WriteIOOp(record, dbg).PermitUncheckedError();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/trace_replay/io_tracer.h b/src/rocksdb/trace_replay/io_tracer.h
new file mode 100644
index 000000000..3fc7cdba0
--- /dev/null
+++ b/src/rocksdb/trace_replay/io_tracer.h
@@ -0,0 +1,185 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <fstream>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/lang.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/trace_record.h"
+#include "trace_replay/trace_replay.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+class TraceReader;
+class TraceWriter;
+
+/* In order to log new data in trace record for specified operations, do
+   following:
+   1. Add new data in IOTraceOP (say kIONewData= 3)
+   2. Log it in IOTraceWriter::WriteIOOp, and read that in
+   IOTraceReader::ReadIOOp and
+   IOTraceRecordParser::PrintHumanReadableIOTraceRecord in the switch case.
+   3. In the FileSystemTracer APIs where this data will be logged with, update
+   io_op_data |= (1 << IOTraceOp::kIONewData).
+*/
+enum IOTraceOp : char {
+  // The value of each enum represents the bitwise position for
+  // IOTraceRecord.io_op_data.
+  kIOFileSize = 0,
+  kIOLen = 1,
+  kIOOffset = 2,
+};
+
+struct IOTraceRecord {
+  // Required fields for all accesses.
+  uint64_t access_timestamp = 0;
+  TraceType trace_type = TraceType::kTraceMax;
+  // Each bit in io_op_data stores which corresponding info from IOTraceOp will
+  // be added in the trace. Foreg, if bit at position 1 is set then
+  // IOTraceOp::kIOLen (length) will be logged in the record.
+  uint64_t io_op_data = 0;
+  std::string file_operation;
+  uint64_t latency = 0;
+  std::string io_status;
+  // Stores file name instead of full path.
+  std::string file_name;
+
+  // Fields added to record based on IO operation.
+  uint64_t len = 0;
+  uint64_t offset = 0;
+  uint64_t file_size = 0;
+
+  // Additional information passed in IODebugContext.
+  uint64_t trace_data = 0;
+  std::string request_id;
+
+  IOTraceRecord() {}
+
+  IOTraceRecord(const uint64_t& _access_timestamp, const TraceType& _trace_type,
+                const uint64_t& _io_op_data, const std::string& _file_operation,
+                const uint64_t& _latency, const std::string& _io_status,
+                const std::string& _file_name, const uint64_t& _file_size = 0)
+      : access_timestamp(_access_timestamp),
+        trace_type(_trace_type),
+        io_op_data(_io_op_data),
+        file_operation(_file_operation),
+        latency(_latency),
+        io_status(_io_status),
+        file_name(_file_name),
+        file_size(_file_size) {}
+
+  IOTraceRecord(const uint64_t& _access_timestamp, const TraceType& _trace_type,
+                const uint64_t& _io_op_data, const std::string& _file_operation,
+                const uint64_t& _latency, const std::string& _io_status,
+                const std::string& _file_name, const uint64_t& _len,
+                const uint64_t& _offset)
+      : access_timestamp(_access_timestamp),
+        trace_type(_trace_type),
+        io_op_data(_io_op_data),
+        file_operation(_file_operation),
+        latency(_latency),
+        io_status(_io_status),
+        file_name(_file_name),
+        len(_len),
+        offset(_offset) {}
+};
+
+struct IOTraceHeader {
+  uint64_t start_time;
+  uint32_t rocksdb_major_version;
+  uint32_t rocksdb_minor_version;
+};
+
+// IOTraceWriter writes IO operation as a single trace. Each trace will have a
+// timestamp and type, followed by the trace payload.
+class IOTraceWriter {
+ public:
+  IOTraceWriter(SystemClock* clock, const TraceOptions& trace_options,
+                std::unique_ptr<TraceWriter>&& trace_writer);
+  ~IOTraceWriter() = default;
+  // No copy and move.
+  IOTraceWriter(const IOTraceWriter&) = delete;
+  IOTraceWriter& operator=(const IOTraceWriter&) = delete;
+  IOTraceWriter(IOTraceWriter&&) = delete;
+  IOTraceWriter& operator=(IOTraceWriter&&) = delete;
+
+  Status WriteIOOp(const IOTraceRecord& record, IODebugContext* dbg);
+
+  // Write a trace header at the beginning, typically on initiating a trace,
+  // with some metadata like a magic number and RocksDB version.
+  Status WriteHeader();
+
+ private:
+  SystemClock* clock_;
+  TraceOptions trace_options_;
+  std::unique_ptr<TraceWriter> trace_writer_;
+};
+
+// IOTraceReader helps read the trace file generated by IOTraceWriter.
+class IOTraceReader {
+ public:
+  explicit IOTraceReader(std::unique_ptr<TraceReader>&& reader);
+  ~IOTraceReader() = default;
+  // No copy and move.
+  IOTraceReader(const IOTraceReader&) = delete;
+  IOTraceReader& operator=(const IOTraceReader&) = delete;
+  IOTraceReader(IOTraceReader&&) = delete;
+  IOTraceReader& operator=(IOTraceReader&&) = delete;
+
+  Status ReadHeader(IOTraceHeader* header);
+
+  Status ReadIOOp(IOTraceRecord* record);
+
+ private:
+  std::unique_ptr<TraceReader> trace_reader_;
+};
+
+// An IO tracer. It uses IOTraceWriter to write the access record to the
+// trace file.
+class IOTracer {
+ public:
+  IOTracer();
+  ~IOTracer();
+  // No copy and move.
+  IOTracer(const IOTracer&) = delete;
+  IOTracer& operator=(const IOTracer&) = delete;
+  IOTracer(IOTracer&&) = delete;
+  IOTracer& operator=(IOTracer&&) = delete;
+
+  // no_sanitize is added for tracing_enabled. writer_ is protected under mutex
+  // so even if user call Start/EndIOTrace and tracing_enabled is not updated in
+  // the meanwhile, WriteIOOp will anyways check the writer_ protected under
+  // mutex and ignore the operation if writer_is null. So its ok if
+  // tracing_enabled shows non updated value.
+
+  // Start writing IO operations to the trace_writer.
+  TSAN_SUPPRESSION Status
+  StartIOTrace(SystemClock* clock, const TraceOptions& trace_options,
+               std::unique_ptr<TraceWriter>&& trace_writer);
+
+  // Stop writing IO operations to the trace_writer.
+  TSAN_SUPPRESSION void EndIOTrace();
+
+  TSAN_SUPPRESSION bool is_tracing_enabled() const { return tracing_enabled; }
+
+  void WriteIOOp(const IOTraceRecord& record, IODebugContext* dbg);
+
+ private:
+  TraceOptions trace_options_;
+  // A mutex protects the writer_.
+  InstrumentedMutex trace_writer_mutex_;
+  std::atomic<IOTraceWriter*> writer_;
+  // bool tracing_enabled is added to avoid costly operation of checking atomic
+  // variable 'writer_' is nullptr or not in is_tracing_enabled().
+  // is_tracing_enabled() is invoked multiple times by FileSystem classes.
+  bool tracing_enabled;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/trace_replay/io_tracer_test.cc b/src/rocksdb/trace_replay/io_tracer_test.cc
new file mode 100644
index 000000000..be3af4fb3
--- /dev/null
+++ b/src/rocksdb/trace_replay/io_tracer_test.cc
@@ -0,0 +1,353 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "trace_replay/io_tracer.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const std::string kDummyFile = "/dummy/file";
+
+}  // namespace
+
+class IOTracerTest : public testing::Test {
+ public:
+  IOTracerTest() {
+    test_path_ = test::PerThreadDBPath("io_tracer_test");
+    env_ = ROCKSDB_NAMESPACE::Env::Default();
+    clock_ = env_->GetSystemClock().get();
+    EXPECT_OK(env_->CreateDir(test_path_));
+    trace_file_path_ = test_path_ + "/io_trace";
+  }
+
+  ~IOTracerTest() override {
+    EXPECT_OK(env_->DeleteFile(trace_file_path_));
+    EXPECT_OK(env_->DeleteDir(test_path_));
+  }
+
+  std::string GetFileOperation(uint64_t id) {
+    id = id % 4;
+    switch (id) {
+      case 0:
+        return "CreateDir";
+      case 1:
+        return "GetChildren";
+      case 2:
+        return "FileSize";
+      case 3:
+        return "DeleteDir";
+      default:
+        assert(false);
+    }
+    return "";
+  }
+
+  void WriteIOOp(IOTraceWriter* writer, uint64_t nrecords) {
+    assert(writer);
+    for (uint64_t i = 0; i < nrecords; i++) {
+      IOTraceRecord record;
+      record.io_op_data = 0;
+      record.trace_type = TraceType::kIOTracer;
+      record.io_op_data |= (1 << IOTraceOp::kIOLen);
+      record.io_op_data |= (1 << IOTraceOp::kIOOffset);
+      record.file_operation = GetFileOperation(i);
+      record.io_status = IOStatus::OK().ToString();
+      record.file_name = kDummyFile + std::to_string(i);
+      record.len = i;
+      record.offset = i + 20;
+      EXPECT_OK(writer->WriteIOOp(record, nullptr));
+    }
+  }
+
+  void VerifyIOOp(IOTraceReader* reader, uint32_t nrecords) {
+    assert(reader);
+    for (uint32_t i = 0; i < nrecords; i++) {
+      IOTraceRecord record;
+      ASSERT_OK(reader->ReadIOOp(&record));
+      ASSERT_EQ(record.file_operation, GetFileOperation(i));
+      ASSERT_EQ(record.io_status, IOStatus::OK().ToString());
+      ASSERT_EQ(record.len, i);
+      ASSERT_EQ(record.offset, i + 20);
+    }
+  }
+
+  Env* env_;
+  SystemClock* clock_;
+  EnvOptions env_options_;
+  std::string trace_file_path_;
+  std::string test_path_;
+};
+
+TEST_F(IOTracerTest, MultipleRecordsWithDifferentIOOpOptions) {
+  std::string file_name = kDummyFile + std::to_string(5);
+  {
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    IOTracer writer;
+    ASSERT_OK(writer.StartIOTrace(clock_, trace_opt, std::move(trace_writer)));
+
+    // Write general record.
+    IOTraceRecord record0(0, TraceType::kIOTracer, 0 /*io_op_data*/,
+                          GetFileOperation(0), 155 /*latency*/,
+                          IOStatus::OK().ToString(), file_name);
+    writer.WriteIOOp(record0, nullptr);
+
+    // Write record with FileSize.
+    uint64_t io_op_data = 0;
+    io_op_data |= (1 << IOTraceOp::kIOFileSize);
+    IOTraceRecord record1(0, TraceType::kIOTracer, io_op_data,
+                          GetFileOperation(1), 10 /*latency*/,
+                          IOStatus::OK().ToString(), file_name,
+                          256 /*file_size*/);
+    writer.WriteIOOp(record1, nullptr);
+
+    // Write record with Length.
+    io_op_data = 0;
+    io_op_data |= (1 << IOTraceOp::kIOLen);
+    IOTraceRecord record2(0, TraceType::kIOTracer, io_op_data,
+                          GetFileOperation(2), 10 /*latency*/,
+                          IOStatus::OK().ToString(), file_name, 100 /*length*/,
+                          200 /*offset*/);
+    writer.WriteIOOp(record2, nullptr);
+
+    // Write record with Length and offset.
+    io_op_data = 0;
+    io_op_data |= (1 << IOTraceOp::kIOLen);
+    io_op_data |= (1 << IOTraceOp::kIOOffset);
+    IOTraceRecord record3(0, TraceType::kIOTracer, io_op_data,
+                          GetFileOperation(3), 10 /*latency*/,
+                          IOStatus::OK().ToString(), file_name, 120 /*length*/,
+                          17 /*offset*/);
+    writer.WriteIOOp(record3, nullptr);
+
+    // Write record with offset.
+    io_op_data = 0;
+    io_op_data |= (1 << IOTraceOp::kIOOffset);
+    IOTraceRecord record4(0, TraceType::kIOTracer, io_op_data,
+                          GetFileOperation(4), 10 /*latency*/,
+                          IOStatus::OK().ToString(), file_name, 13 /*length*/,
+                          50 /*offset*/);
+    writer.WriteIOOp(record4, nullptr);
+
+    // Write record with IODebugContext.
+    io_op_data = 0;
+    IODebugContext dbg;
+    dbg.SetRequestId("request_id_1");
+    IOTraceRecord record5(0, TraceType::kIOTracer, io_op_data,
+                          GetFileOperation(5), 10 /*latency*/,
+                          IOStatus::OK().ToString(), file_name);
+    writer.WriteIOOp(record5, &dbg);
+
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+  {
+    // Verify trace file is generated correctly.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    IOTraceReader reader(std::move(trace_reader));
+    IOTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+    ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+
+    // Read general record.
+    IOTraceRecord record0;
+    ASSERT_OK(reader.ReadIOOp(&record0));
+    ASSERT_EQ(record0.file_operation, GetFileOperation(0));
+    ASSERT_EQ(record0.latency, 155);
+    ASSERT_EQ(record0.file_name, file_name);
+
+    // Read record with FileSize.
+    IOTraceRecord record1;
+    ASSERT_OK(reader.ReadIOOp(&record1));
+    ASSERT_EQ(record1.file_size, 256);
+    ASSERT_EQ(record1.len, 0);
+    ASSERT_EQ(record1.offset, 0);
+
+    // Read record with Length.
+    IOTraceRecord record2;
+    ASSERT_OK(reader.ReadIOOp(&record2));
+    ASSERT_EQ(record2.len, 100);
+    ASSERT_EQ(record2.file_size, 0);
+    ASSERT_EQ(record2.offset, 0);
+
+    // Read record with Length and offset.
+    IOTraceRecord record3;
+    ASSERT_OK(reader.ReadIOOp(&record3));
+    ASSERT_EQ(record3.len, 120);
+    ASSERT_EQ(record3.file_size, 0);
+    ASSERT_EQ(record3.offset, 17);
+
+    // Read record with offset.
+    IOTraceRecord record4;
+    ASSERT_OK(reader.ReadIOOp(&record4));
+    ASSERT_EQ(record4.len, 0);
+    ASSERT_EQ(record4.file_size, 0);
+    ASSERT_EQ(record4.offset, 50);
+
+    IOTraceRecord record5;
+    ASSERT_OK(reader.ReadIOOp(&record5));
+    ASSERT_EQ(record5.len, 0);
+    ASSERT_EQ(record5.file_size, 0);
+    ASSERT_EQ(record5.offset, 0);
+    ASSERT_EQ(record5.request_id, "request_id_1");
+    // Read one more record and it should report error.
+    IOTraceRecord record6;
+    ASSERT_NOK(reader.ReadIOOp(&record6));
+  }
+}
+
+TEST_F(IOTracerTest, AtomicWrite) {
+  std::string file_name = kDummyFile + std::to_string(0);
+  {
+    IOTraceRecord record(0, TraceType::kIOTracer, 0 /*io_op_data*/,
+                         GetFileOperation(0), 10 /*latency*/,
+                         IOStatus::OK().ToString(), file_name);
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    IOTracer writer;
+    ASSERT_OK(writer.StartIOTrace(clock_, trace_opt, std::move(trace_writer)));
+    writer.WriteIOOp(record, nullptr);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+  {
+    // Verify trace file contains one record.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    IOTraceReader reader(std::move(trace_reader));
+    IOTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+    ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+    // Read record and verify data.
+    IOTraceRecord access_record;
+    ASSERT_OK(reader.ReadIOOp(&access_record));
+    ASSERT_EQ(access_record.file_operation, GetFileOperation(0));
+    ASSERT_EQ(access_record.io_status, IOStatus::OK().ToString());
+    ASSERT_EQ(access_record.file_name, file_name);
+    ASSERT_NOK(reader.ReadIOOp(&access_record));
+  }
+}
+
+TEST_F(IOTracerTest, AtomicWriteBeforeStartTrace) {
+  std::string file_name = kDummyFile + std::to_string(0);
+  {
+    IOTraceRecord record(0, TraceType::kIOTracer, 0 /*io_op_data*/,
+                         GetFileOperation(0), 0, IOStatus::OK().ToString(),
+                         file_name);
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    IOTracer writer;
+    // The record should not be written to the trace_file since StartIOTrace is
+    // not called.
+    writer.WriteIOOp(record, nullptr);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+  {
+    // Verify trace file contains nothing.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    IOTraceReader reader(std::move(trace_reader));
+    IOTraceHeader header;
+    ASSERT_NOK(reader.ReadHeader(&header));
+  }
+}
+
+TEST_F(IOTracerTest, AtomicNoWriteAfterEndTrace) {
+  std::string file_name = kDummyFile + std::to_string(0);
+  {
+    uint64_t io_op_data = 0;
+    io_op_data |= (1 << IOTraceOp::kIOFileSize);
+    IOTraceRecord record(
+        0, TraceType::kIOTracer, io_op_data, GetFileOperation(2), 0 /*latency*/,
+        IOStatus::OK().ToString(), file_name, 10 /*file_size*/);
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    IOTracer writer;
+    ASSERT_OK(writer.StartIOTrace(clock_, trace_opt, std::move(trace_writer)));
+    writer.WriteIOOp(record, nullptr);
+    writer.EndIOTrace();
+    // Write the record again. This time the record should not be written since
+    // EndIOTrace is called.
+    writer.WriteIOOp(record, nullptr);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+  {
+    // Verify trace file contains one record.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    IOTraceReader reader(std::move(trace_reader));
+    IOTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+    ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+
+    IOTraceRecord access_record;
+    ASSERT_OK(reader.ReadIOOp(&access_record));
+    ASSERT_EQ(access_record.file_operation, GetFileOperation(2));
+    ASSERT_EQ(access_record.io_status, IOStatus::OK().ToString());
+    ASSERT_EQ(access_record.file_size, 10);
+    // No more record.
+    ASSERT_NOK(reader.ReadIOOp(&access_record));
+  }
+}
+
+TEST_F(IOTracerTest, AtomicMultipleWrites) {
+  {
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
+                                 &trace_writer));
+    IOTraceWriter writer(clock_, trace_opt, std::move(trace_writer));
+    ASSERT_OK(writer.WriteHeader());
+    // Write 10 records
+    WriteIOOp(&writer, 10);
+    ASSERT_OK(env_->FileExists(trace_file_path_));
+  }
+
+  {
+    // Verify trace file is generated correctly.
+    std::unique_ptr<TraceReader> trace_reader;
+    ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_,
+                                 &trace_reader));
+    IOTraceReader reader(std::move(trace_reader));
+    IOTraceHeader header;
+    ASSERT_OK(reader.ReadHeader(&header));
+    ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+    ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+    // Read 10 records.
+    VerifyIOOp(&reader, 10);
+    // Read one more and record and it should report error.
+    IOTraceRecord record;
+    ASSERT_NOK(reader.ReadIOOp(&record));
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/trace_replay/trace_record.cc b/src/rocksdb/trace_replay/trace_record.cc
new file mode 100644
index 000000000..21df0275d
--- /dev/null
+++ b/src/rocksdb/trace_replay/trace_record.cc
@@ -0,0 +1,206 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/trace_record.h"
+
+#include <utility>
+
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_record_result.h"
+#include "trace_replay/trace_record_handler.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TraceRecord
+TraceRecord::TraceRecord(uint64_t timestamp) : timestamp_(timestamp) {}
+
+uint64_t TraceRecord::GetTimestamp() const { return timestamp_; }
+
+TraceRecord::Handler* TraceRecord::NewExecutionHandler(
+    DB* db, const std::vector<ColumnFamilyHandle*>& handles) {
+  return new TraceExecutionHandler(db, handles);
+}
+
+// QueryTraceRecord
+QueryTraceRecord::QueryTraceRecord(uint64_t timestamp)
+    : TraceRecord(timestamp) {}
+
+// WriteQueryTraceRecord
+WriteQueryTraceRecord::WriteQueryTraceRecord(PinnableSlice&& write_batch_rep,
+                                             uint64_t timestamp)
+    : QueryTraceRecord(timestamp), rep_(std::move(write_batch_rep)) {}
+
+WriteQueryTraceRecord::WriteQueryTraceRecord(const std::string& write_batch_rep,
+                                             uint64_t timestamp)
+    : QueryTraceRecord(timestamp) {
+  rep_.PinSelf(write_batch_rep);
+}
+
+WriteQueryTraceRecord::~WriteQueryTraceRecord() { rep_.clear(); }
+
+Slice WriteQueryTraceRecord::GetWriteBatchRep() const { return Slice(rep_); }
+
+Status WriteQueryTraceRecord::Accept(
+    Handler* handler, std::unique_ptr<TraceRecordResult>* result) {
+  assert(handler != nullptr);
+  return handler->Handle(*this, result);
+}
+
+// GetQueryTraceRecord
+GetQueryTraceRecord::GetQueryTraceRecord(uint32_t column_family_id,
+                                         PinnableSlice&& key,
+                                         uint64_t timestamp)
+    : QueryTraceRecord(timestamp),
+      cf_id_(column_family_id),
+      key_(std::move(key)) {}
+
+GetQueryTraceRecord::GetQueryTraceRecord(uint32_t column_family_id,
+                                         const std::string& key,
+                                         uint64_t timestamp)
+    : QueryTraceRecord(timestamp), cf_id_(column_family_id) {
+  key_.PinSelf(key);
+}
+
+GetQueryTraceRecord::~GetQueryTraceRecord() { key_.clear(); }
+
+uint32_t GetQueryTraceRecord::GetColumnFamilyID() const { return cf_id_; }
+
+Slice GetQueryTraceRecord::GetKey() const { return Slice(key_); }
+
+Status GetQueryTraceRecord::Accept(Handler* handler,
+                                   std::unique_ptr<TraceRecordResult>* result) {
+  assert(handler != nullptr);
+  return handler->Handle(*this, result);
+}
+
+// IteratorQueryTraceRecord
+IteratorQueryTraceRecord::IteratorQueryTraceRecord(uint64_t timestamp)
+    : QueryTraceRecord(timestamp) {}
+
+IteratorQueryTraceRecord::IteratorQueryTraceRecord(PinnableSlice&& lower_bound,
+                                                   PinnableSlice&& upper_bound,
+                                                   uint64_t timestamp)
+    : QueryTraceRecord(timestamp),
+      lower_(std::move(lower_bound)),
+      upper_(std::move(upper_bound)) {}
+
+IteratorQueryTraceRecord::IteratorQueryTraceRecord(
+    const std::string& lower_bound, const std::string& upper_bound,
+    uint64_t timestamp)
+    : QueryTraceRecord(timestamp) {
+  lower_.PinSelf(lower_bound);
+  upper_.PinSelf(upper_bound);
+}
+
+IteratorQueryTraceRecord::~IteratorQueryTraceRecord() {}
+
+Slice IteratorQueryTraceRecord::GetLowerBound() const { return Slice(lower_); }
+
+Slice IteratorQueryTraceRecord::GetUpperBound() const { return Slice(upper_); }
+
+// IteratorSeekQueryTraceRecord
+IteratorSeekQueryTraceRecord::IteratorSeekQueryTraceRecord(
+    SeekType seek_type, uint32_t column_family_id, PinnableSlice&& key,
+    uint64_t timestamp)
+    : IteratorQueryTraceRecord(timestamp),
+      type_(seek_type),
+      cf_id_(column_family_id),
+      key_(std::move(key)) {}
+
+IteratorSeekQueryTraceRecord::IteratorSeekQueryTraceRecord(
+    SeekType seek_type, uint32_t column_family_id, const std::string& key,
+    uint64_t timestamp)
+    : IteratorQueryTraceRecord(timestamp),
+      type_(seek_type),
+      cf_id_(column_family_id) {
+  key_.PinSelf(key);
+}
+
+IteratorSeekQueryTraceRecord::IteratorSeekQueryTraceRecord(
+    SeekType seek_type, uint32_t column_family_id, PinnableSlice&& key,
+    PinnableSlice&& lower_bound, PinnableSlice&& upper_bound,
+    uint64_t timestamp)
+    : IteratorQueryTraceRecord(std::move(lower_bound), std::move(upper_bound),
+                               timestamp),
+      type_(seek_type),
+      cf_id_(column_family_id),
+      key_(std::move(key)) {}
+
+IteratorSeekQueryTraceRecord::IteratorSeekQueryTraceRecord(
+    SeekType seek_type, uint32_t column_family_id, const std::string& key,
+    const std::string& lower_bound, const std::string& upper_bound,
+    uint64_t timestamp)
+    : IteratorQueryTraceRecord(lower_bound, upper_bound, timestamp),
+      type_(seek_type),
+      cf_id_(column_family_id) {
+  key_.PinSelf(key);
+}
+
+IteratorSeekQueryTraceRecord::~IteratorSeekQueryTraceRecord() { key_.clear(); }
+
+TraceType IteratorSeekQueryTraceRecord::GetTraceType() const {
+  return static_cast<TraceType>(type_);
+}
+
+IteratorSeekQueryTraceRecord::SeekType
+IteratorSeekQueryTraceRecord::GetSeekType() const {
+  return type_;
+}
+
+uint32_t IteratorSeekQueryTraceRecord::GetColumnFamilyID() const {
+  return cf_id_;
+}
+
+Slice IteratorSeekQueryTraceRecord::GetKey() const { return Slice(key_); }
+
+Status IteratorSeekQueryTraceRecord::Accept(
+    Handler* handler, std::unique_ptr<TraceRecordResult>* result) {
+  assert(handler != nullptr);
+  return handler->Handle(*this, result);
+}
+
+// MultiGetQueryTraceRecord
+MultiGetQueryTraceRecord::MultiGetQueryTraceRecord(
+    std::vector<uint32_t> column_family_ids, std::vector<PinnableSlice>&& keys,
+    uint64_t timestamp)
+    : QueryTraceRecord(timestamp),
+      cf_ids_(column_family_ids),
+      keys_(std::move(keys)) {}
+
+MultiGetQueryTraceRecord::MultiGetQueryTraceRecord(
+    std::vector<uint32_t> column_family_ids,
+    const std::vector<std::string>& keys, uint64_t timestamp)
+    : QueryTraceRecord(timestamp), cf_ids_(column_family_ids) {
+  keys_.reserve(keys.size());
+  for (const std::string& key : keys) {
+    PinnableSlice ps;
+    ps.PinSelf(key);
+    keys_.push_back(std::move(ps));
+  }
+}
+
+MultiGetQueryTraceRecord::~MultiGetQueryTraceRecord() {
+  cf_ids_.clear();
+  keys_.clear();
+}
+
+std::vector<uint32_t> MultiGetQueryTraceRecord::GetColumnFamilyIDs() const {
+  return cf_ids_;
+}
+
+std::vector<Slice> MultiGetQueryTraceRecord::GetKeys() const {
+  return std::vector<Slice>(keys_.begin(), keys_.end());
+}
+
+Status MultiGetQueryTraceRecord::Accept(
+    Handler* handler, std::unique_ptr<TraceRecordResult>* result) {
+  assert(handler != nullptr);
+  return handler->Handle(*this, result);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/trace_replay/trace_record_handler.cc b/src/rocksdb/trace_replay/trace_record_handler.cc
new file mode 100644
index 000000000..ca179e870
--- /dev/null
+++ b/src/rocksdb/trace_replay/trace_record_handler.cc
@@ -0,0 +1,190 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "trace_replay/trace_record_handler.h"
+
+#include "rocksdb/iterator.h"
+#include "rocksdb/trace_record_result.h"
+#include "rocksdb/write_batch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TraceExecutionHandler
+TraceExecutionHandler::TraceExecutionHandler(
+    DB* db, const std::vector<ColumnFamilyHandle*>& handles)
+    : TraceRecord::Handler(),
+      db_(db),
+      write_opts_(WriteOptions()),
+      read_opts_(ReadOptions()) {
+  assert(db != nullptr);
+  assert(!handles.empty());
+  cf_map_.reserve(handles.size());
+  for (ColumnFamilyHandle* handle : handles) {
+    assert(handle != nullptr);
+    cf_map_.insert({handle->GetID(), handle});
+  }
+  clock_ = db_->GetEnv()->GetSystemClock().get();
+}
+
+TraceExecutionHandler::~TraceExecutionHandler() { cf_map_.clear(); }
+
+Status TraceExecutionHandler::Handle(
+    const WriteQueryTraceRecord& record,
+    std::unique_ptr<TraceRecordResult>* result) {
+  if (result != nullptr) {
+    result->reset(nullptr);
+  }
+  uint64_t start = clock_->NowMicros();
+
+  WriteBatch batch(record.GetWriteBatchRep().ToString());
+  Status s = db_->Write(write_opts_, &batch);
+
+  uint64_t end = clock_->NowMicros();
+
+  if (s.ok() && result != nullptr) {
+    result->reset(new StatusOnlyTraceExecutionResult(s, start, end,
+                                                     record.GetTraceType()));
+  }
+
+  return s;
+}
+
+Status TraceExecutionHandler::Handle(
+    const GetQueryTraceRecord& record,
+    std::unique_ptr<TraceRecordResult>* result) {
+  if (result != nullptr) {
+    result->reset(nullptr);
+  }
+  auto it = cf_map_.find(record.GetColumnFamilyID());
+  if (it == cf_map_.end()) {
+    return Status::Corruption("Invalid Column Family ID.");
+  }
+
+  uint64_t start = clock_->NowMicros();
+
+  std::string value;
+  Status s = db_->Get(read_opts_, it->second, record.GetKey(), &value);
+
+  uint64_t end = clock_->NowMicros();
+
+  // Treat not found as ok, return other errors.
+  if (!s.ok() && !s.IsNotFound()) {
+    return s;
+  }
+
+  if (result != nullptr) {
+    // Report the actual opetation status in TraceExecutionResult
+    result->reset(new SingleValueTraceExecutionResult(
+        std::move(s), std::move(value), start, end, record.GetTraceType()));
+  }
+  return Status::OK();
+}
+
+Status TraceExecutionHandler::Handle(
+    const IteratorSeekQueryTraceRecord& record,
+    std::unique_ptr<TraceRecordResult>* result) {
+  if (result != nullptr) {
+    result->reset(nullptr);
+  }
+  auto it = cf_map_.find(record.GetColumnFamilyID());
+  if (it == cf_map_.end()) {
+    return Status::Corruption("Invalid Column Family ID.");
+  }
+
+  ReadOptions r_opts = read_opts_;
+  Slice lower = record.GetLowerBound();
+  if (!lower.empty()) {
+    r_opts.iterate_lower_bound = &lower;
+  }
+  Slice upper = record.GetUpperBound();
+  if (!upper.empty()) {
+    r_opts.iterate_upper_bound = &upper;
+  }
+  Iterator* single_iter = db_->NewIterator(r_opts, it->second);
+
+  uint64_t start = clock_->NowMicros();
+
+  switch (record.GetSeekType()) {
+    case IteratorSeekQueryTraceRecord::kSeekForPrev: {
+      single_iter->SeekForPrev(record.GetKey());
+      break;
+    }
+    default: {
+      single_iter->Seek(record.GetKey());
+      break;
+    }
+  }
+
+  uint64_t end = clock_->NowMicros();
+
+  Status s = single_iter->status();
+  if (s.ok() && result != nullptr) {
+    if (single_iter->Valid()) {
+      PinnableSlice ps_key;
+      ps_key.PinSelf(single_iter->key());
+      PinnableSlice ps_value;
+      ps_value.PinSelf(single_iter->value());
+      result->reset(new IteratorTraceExecutionResult(
+          true, s, std::move(ps_key), std::move(ps_value), start, end,
+          record.GetTraceType()));
+    } else {
+      result->reset(new IteratorTraceExecutionResult(
+          false, s, "", "", start, end, record.GetTraceType()));
+    }
+  }
+  delete single_iter;
+
+  return s;
+}
+
+Status TraceExecutionHandler::Handle(
+    const MultiGetQueryTraceRecord& record,
+    std::unique_ptr<TraceRecordResult>* result) {
+  if (result != nullptr) {
+    result->reset(nullptr);
+  }
+  std::vector<ColumnFamilyHandle*> handles;
+  handles.reserve(record.GetColumnFamilyIDs().size());
+  for (uint32_t cf_id : record.GetColumnFamilyIDs()) {
+    auto it = cf_map_.find(cf_id);
+    if (it == cf_map_.end()) {
+      return Status::Corruption("Invalid Column Family ID.");
+    }
+    handles.push_back(it->second);
+  }
+
+  std::vector<Slice> keys = record.GetKeys();
+
+  if (handles.empty() || keys.empty()) {
+    return Status::InvalidArgument("Empty MultiGet cf_ids or keys.");
+  }
+  if (handles.size() != keys.size()) {
+    return Status::InvalidArgument("MultiGet cf_ids and keys size mismatch.");
+  }
+
+  uint64_t start = clock_->NowMicros();
+
+  std::vector<std::string> values;
+  std::vector<Status> ss = db_->MultiGet(read_opts_, handles, keys, &values);
+
+  uint64_t end = clock_->NowMicros();
+
+  // Treat not found as ok, return other errors.
+  for (const Status& s : ss) {
+    if (!s.ok() && !s.IsNotFound()) {
+      return s;
+    }
+  }
+
+  if (result != nullptr) {
+    // Report the actual opetation status in TraceExecutionResult
+    result->reset(new MultiValuesTraceExecutionResult(
+        std::move(ss), std::move(values), start, end, record.GetTraceType()));
+  }
+
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/trace_replay/trace_record_handler.h b/src/rocksdb/trace_replay/trace_record_handler.h
new file mode 100644
index 000000000..88cf317dd
--- /dev/null
+++ b/src/rocksdb/trace_replay/trace_record_handler.h
@@ -0,0 +1,46 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Handler to execute TraceRecord.
+class TraceExecutionHandler : public TraceRecord::Handler {
+ public:
+  TraceExecutionHandler(DB* db,
+                        const std::vector<ColumnFamilyHandle*>& handles);
+  virtual ~TraceExecutionHandler() override;
+
+  virtual Status Handle(const WriteQueryTraceRecord& record,
+                        std::unique_ptr<TraceRecordResult>* result) override;
+  virtual Status Handle(const GetQueryTraceRecord& record,
+                        std::unique_ptr<TraceRecordResult>* result) override;
+  virtual Status Handle(const IteratorSeekQueryTraceRecord& record,
+                        std::unique_ptr<TraceRecordResult>* result) override;
+  virtual Status Handle(const MultiGetQueryTraceRecord& record,
+                        std::unique_ptr<TraceRecordResult>* result) override;
+
+ private:
+  DB* db_;
+  std::unordered_map<uint32_t, ColumnFamilyHandle*> cf_map_;
+  WriteOptions write_opts_;
+  ReadOptions read_opts_;
+  SystemClock* clock_;
+};
+
+// To do: Handler for trace_analyzer.
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/trace_replay/trace_record_result.cc b/src/rocksdb/trace_replay/trace_record_result.cc
new file mode 100644
index 000000000..9c0cb43ad
--- /dev/null
+++ b/src/rocksdb/trace_replay/trace_record_result.cc
@@ -0,0 +1,146 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/trace_record_result.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TraceRecordResult
+TraceRecordResult::TraceRecordResult(TraceType trace_type)
+    : trace_type_(trace_type) {}
+
+TraceType TraceRecordResult::GetTraceType() const { return trace_type_; }
+
+// TraceExecutionResult
+TraceExecutionResult::TraceExecutionResult(uint64_t start_timestamp,
+                                           uint64_t end_timestamp,
+                                           TraceType trace_type)
+    : TraceRecordResult(trace_type),
+      ts_start_(start_timestamp),
+      ts_end_(end_timestamp) {
+  assert(ts_start_ <= ts_end_);
+}
+
+uint64_t TraceExecutionResult::GetStartTimestamp() const { return ts_start_; }
+
+uint64_t TraceExecutionResult::GetEndTimestamp() const { return ts_end_; }
+
+// StatusOnlyTraceExecutionResult
+StatusOnlyTraceExecutionResult::StatusOnlyTraceExecutionResult(
+    Status status, uint64_t start_timestamp, uint64_t end_timestamp,
+    TraceType trace_type)
+    : TraceExecutionResult(start_timestamp, end_timestamp, trace_type),
+      status_(std::move(status)) {}
+
+const Status& StatusOnlyTraceExecutionResult::GetStatus() const {
+  return status_;
+}
+
+Status StatusOnlyTraceExecutionResult::Accept(Handler* handler) {
+  assert(handler != nullptr);
+  return handler->Handle(*this);
+}
+
+// SingleValueTraceExecutionResult
+SingleValueTraceExecutionResult::SingleValueTraceExecutionResult(
+    Status status, const std::string& value, uint64_t start_timestamp,
+    uint64_t end_timestamp, TraceType trace_type)
+    : TraceExecutionResult(start_timestamp, end_timestamp, trace_type),
+      status_(std::move(status)),
+      value_(value) {}
+
+SingleValueTraceExecutionResult::SingleValueTraceExecutionResult(
+    Status status, std::string&& value, uint64_t start_timestamp,
+    uint64_t end_timestamp, TraceType trace_type)
+    : TraceExecutionResult(start_timestamp, end_timestamp, trace_type),
+      status_(std::move(status)),
+      value_(std::move(value)) {}
+
+SingleValueTraceExecutionResult::~SingleValueTraceExecutionResult() {
+  value_.clear();
+}
+
+const Status& SingleValueTraceExecutionResult::GetStatus() const {
+  return status_;
+}
+
+const std::string& SingleValueTraceExecutionResult::GetValue() const {
+  return value_;
+}
+
+Status SingleValueTraceExecutionResult::Accept(Handler* handler) {
+  assert(handler != nullptr);
+  return handler->Handle(*this);
+}
+
+// MultiValuesTraceExecutionResult
+MultiValuesTraceExecutionResult::MultiValuesTraceExecutionResult(
+    std::vector<Status> multi_status, std::vector<std::string> values,
+    uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type)
+    : TraceExecutionResult(start_timestamp, end_timestamp, trace_type),
+      multi_status_(std::move(multi_status)),
+      values_(std::move(values)) {}
+
+MultiValuesTraceExecutionResult::~MultiValuesTraceExecutionResult() {
+  multi_status_.clear();
+  values_.clear();
+}
+
+const std::vector<Status>& MultiValuesTraceExecutionResult::GetMultiStatus()
+    const {
+  return multi_status_;
+}
+
+const std::vector<std::string>& MultiValuesTraceExecutionResult::GetValues()
+    const {
+  return values_;
+}
+
+Status MultiValuesTraceExecutionResult::Accept(Handler* handler) {
+  assert(handler != nullptr);
+  return handler->Handle(*this);
+}
+
+// IteratorTraceExecutionResult
+IteratorTraceExecutionResult::IteratorTraceExecutionResult(
+    bool valid, Status status, PinnableSlice&& key, PinnableSlice&& value,
+    uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type)
+    : TraceExecutionResult(start_timestamp, end_timestamp, trace_type),
+      valid_(valid),
+      status_(std::move(status)),
+      key_(std::move(key)),
+      value_(std::move(value)) {}
+
+IteratorTraceExecutionResult::IteratorTraceExecutionResult(
+    bool valid, Status status, const std::string& key, const std::string& value,
+    uint64_t start_timestamp, uint64_t end_timestamp, TraceType trace_type)
+    : TraceExecutionResult(start_timestamp, end_timestamp, trace_type),
+      valid_(valid),
+      status_(std::move(status)) {
+  key_.PinSelf(key);
+  value_.PinSelf(value);
+}
+
+IteratorTraceExecutionResult::~IteratorTraceExecutionResult() {
+  key_.clear();
+  value_.clear();
+}
+
+bool IteratorTraceExecutionResult::GetValid() const { return valid_; }
+
+const Status& IteratorTraceExecutionResult::GetStatus() const {
+  return status_;
+}
+
+Slice IteratorTraceExecutionResult::GetKey() const { return Slice(key_); }
+
+Slice IteratorTraceExecutionResult::GetValue() const { return Slice(value_); }
+
+Status IteratorTraceExecutionResult::Accept(Handler* handler) {
+  assert(handler != nullptr);
+  return handler->Handle(*this);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/trace_replay/trace_replay.cc b/src/rocksdb/trace_replay/trace_replay.cc
new file mode 100644
index 000000000..37b95852b
--- /dev/null
+++ b/src/rocksdb/trace_replay/trace_replay.cc
@@ -0,0 +1,622 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "trace_replay/trace_replay.h"
+
+#include <chrono>
+#include <sstream>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/write_batch.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kTraceMagic = "feedcafedeadbeef";
+
+namespace {
+void DecodeCFAndKey(std::string& buffer, uint32_t* cf_id, Slice* key) {
+  Slice buf(buffer);
+  GetFixed32(&buf, cf_id);
+  GetLengthPrefixedSlice(&buf, key);
+}
+}  // namespace
+
+Status TracerHelper::ParseVersionStr(std::string& v_string, int* v_num) {
+  if (v_string.find_first_of('.') == std::string::npos ||
+      v_string.find_first_of('.') != v_string.find_last_of('.')) {
+    return Status::Corruption(
+        "Corrupted trace file. Incorrect version format.");
+  }
+  int tmp_num = 0;
+  for (int i = 0; i < static_cast<int>(v_string.size()); i++) {
+    if (v_string[i] == '.') {
+      continue;
+    } else if (isdigit(v_string[i])) {
+      tmp_num = tmp_num * 10 + (v_string[i] - '0');
+    } else {
+      return Status::Corruption(
+          "Corrupted trace file. Incorrect version format");
+    }
+  }
+  *v_num = tmp_num;
+  return Status::OK();
+}
+
+Status TracerHelper::ParseTraceHeader(const Trace& header, int* trace_version,
+                                      int* db_version) {
+  std::vector<std::string> s_vec;
+  int begin = 0, end;
+  for (int i = 0; i < 3; i++) {
+    assert(header.payload.find("\t", begin) != std::string::npos);
+    end = static_cast<int>(header.payload.find("\t", begin));
+    s_vec.push_back(header.payload.substr(begin, end - begin));
+    begin = end + 1;
+  }
+
+  std::string t_v_str, db_v_str;
+  assert(s_vec.size() == 3);
+  assert(s_vec[1].find("Trace Version: ") != std::string::npos);
+  t_v_str = s_vec[1].substr(15);
+  assert(s_vec[2].find("RocksDB Version: ") != std::string::npos);
+  db_v_str = s_vec[2].substr(17);
+
+  Status s;
+  s = ParseVersionStr(t_v_str, trace_version);
+  if (s != Status::OK()) {
+    return s;
+  }
+  s = ParseVersionStr(db_v_str, db_version);
+  return s;
+}
+
+void TracerHelper::EncodeTrace(const Trace& trace, std::string* encoded_trace) {
+  assert(encoded_trace);
+  PutFixed64(encoded_trace, trace.ts);
+  encoded_trace->push_back(trace.type);
+  PutFixed32(encoded_trace, static_cast<uint32_t>(trace.payload.size()));
+  encoded_trace->append(trace.payload);
+}
+
+Status TracerHelper::DecodeTrace(const std::string& encoded_trace,
+                                 Trace* trace) {
+  assert(trace != nullptr);
+  Slice enc_slice = Slice(encoded_trace);
+  if (!GetFixed64(&enc_slice, &trace->ts)) {
+    return Status::Incomplete("Decode trace string failed");
+  }
+  if (enc_slice.size() < kTraceTypeSize + kTracePayloadLengthSize) {
+    return Status::Incomplete("Decode trace string failed");
+  }
+  trace->type = static_cast<TraceType>(enc_slice[0]);
+  enc_slice.remove_prefix(kTraceTypeSize + kTracePayloadLengthSize);
+  trace->payload = enc_slice.ToString();
+  return Status::OK();
+}
+
+Status TracerHelper::DecodeHeader(const std::string& encoded_trace,
+                                  Trace* header) {
+  Status s = TracerHelper::DecodeTrace(encoded_trace, header);
+
+  if (header->type != kTraceBegin) {
+    return Status::Corruption("Corrupted trace file. Incorrect header.");
+  }
+  if (header->payload.substr(0, kTraceMagic.length()) != kTraceMagic) {
+    return Status::Corruption("Corrupted trace file. Incorrect magic.");
+  }
+
+  return s;
+}
+
+bool TracerHelper::SetPayloadMap(uint64_t& payload_map,
+                                 const TracePayloadType payload_type) {
+  uint64_t old_state = payload_map;
+  uint64_t tmp = 1;
+  payload_map |= (tmp << payload_type);
+  return old_state != payload_map;
+}
+
+Status TracerHelper::DecodeTraceRecord(Trace* trace, int trace_file_version,
+                                       std::unique_ptr<TraceRecord>* record) {
+  assert(trace != nullptr);
+
+  if (record != nullptr) {
+    record->reset(nullptr);
+  }
+
+  switch (trace->type) {
+    // Write
+    case kTraceWrite: {
+      PinnableSlice rep;
+      if (trace_file_version < 2) {
+        rep.PinSelf(trace->payload);
+      } else {
+        Slice buf(trace->payload);
+        GetFixed64(&buf, &trace->payload_map);
+        int64_t payload_map = static_cast<int64_t>(trace->payload_map);
+        Slice write_batch_data;
+        while (payload_map) {
+          // Find the rightmost set bit.
+          uint32_t set_pos =
+              static_cast<uint32_t>(log2(payload_map & -payload_map));
+          switch (set_pos) {
+            case TracePayloadType::kWriteBatchData: {
+              GetLengthPrefixedSlice(&buf, &write_batch_data);
+              break;
+            }
+            default: {
+              assert(false);
+            }
+          }
+          // unset the rightmost bit.
+          payload_map &= (payload_map - 1);
+        }
+        rep.PinSelf(write_batch_data);
+      }
+
+      if (record != nullptr) {
+        record->reset(new WriteQueryTraceRecord(std::move(rep), trace->ts));
+      }
+
+      return Status::OK();
+    }
+    // Get
+    case kTraceGet: {
+      uint32_t cf_id = 0;
+      Slice get_key;
+
+      if (trace_file_version < 2) {
+        DecodeCFAndKey(trace->payload, &cf_id, &get_key);
+      } else {
+        Slice buf(trace->payload);
+        GetFixed64(&buf, &trace->payload_map);
+        int64_t payload_map = static_cast<int64_t>(trace->payload_map);
+        while (payload_map) {
+          // Find the rightmost set bit.
+          uint32_t set_pos =
+              static_cast<uint32_t>(log2(payload_map & -payload_map));
+          switch (set_pos) {
+            case TracePayloadType::kGetCFID: {
+              GetFixed32(&buf, &cf_id);
+              break;
+            }
+            case TracePayloadType::kGetKey: {
+              GetLengthPrefixedSlice(&buf, &get_key);
+              break;
+            }
+            default: {
+              assert(false);
+            }
+          }
+          // unset the rightmost bit.
+          payload_map &= (payload_map - 1);
+        }
+      }
+
+      if (record != nullptr) {
+        PinnableSlice ps;
+        ps.PinSelf(get_key);
+        record->reset(new GetQueryTraceRecord(cf_id, std::move(ps), trace->ts));
+      }
+
+      return Status::OK();
+    }
+    // Iterator Seek and SeekForPrev
+    case kTraceIteratorSeek:
+    case kTraceIteratorSeekForPrev: {
+      uint32_t cf_id = 0;
+      Slice iter_key;
+      Slice lower_bound;
+      Slice upper_bound;
+
+      if (trace_file_version < 2) {
+        DecodeCFAndKey(trace->payload, &cf_id, &iter_key);
+      } else {
+        Slice buf(trace->payload);
+        GetFixed64(&buf, &trace->payload_map);
+        int64_t payload_map = static_cast<int64_t>(trace->payload_map);
+        while (payload_map) {
+          // Find the rightmost set bit.
+          uint32_t set_pos =
+              static_cast<uint32_t>(log2(payload_map & -payload_map));
+          switch (set_pos) {
+            case TracePayloadType::kIterCFID: {
+              GetFixed32(&buf, &cf_id);
+              break;
+            }
+            case TracePayloadType::kIterKey: {
+              GetLengthPrefixedSlice(&buf, &iter_key);
+              break;
+            }
+            case TracePayloadType::kIterLowerBound: {
+              GetLengthPrefixedSlice(&buf, &lower_bound);
+              break;
+            }
+            case TracePayloadType::kIterUpperBound: {
+              GetLengthPrefixedSlice(&buf, &upper_bound);
+              break;
+            }
+            default: {
+              assert(false);
+            }
+          }
+          // unset the rightmost bit.
+          payload_map &= (payload_map - 1);
+        }
+      }
+
+      if (record != nullptr) {
+        PinnableSlice ps_key;
+        ps_key.PinSelf(iter_key);
+        PinnableSlice ps_lower;
+        ps_lower.PinSelf(lower_bound);
+        PinnableSlice ps_upper;
+        ps_upper.PinSelf(upper_bound);
+        record->reset(new IteratorSeekQueryTraceRecord(
+            static_cast<IteratorSeekQueryTraceRecord::SeekType>(trace->type),
+            cf_id, std::move(ps_key), std::move(ps_lower), std::move(ps_upper),
+            trace->ts));
+      }
+
+      return Status::OK();
+    }
+    // MultiGet
+    case kTraceMultiGet: {
+      if (trace_file_version < 2) {
+        return Status::Corruption("MultiGet is not supported.");
+      }
+
+      uint32_t multiget_size = 0;
+      std::vector<uint32_t> cf_ids;
+      std::vector<PinnableSlice> multiget_keys;
+
+      Slice cfids_payload;
+      Slice keys_payload;
+      Slice buf(trace->payload);
+      GetFixed64(&buf, &trace->payload_map);
+      int64_t payload_map = static_cast<int64_t>(trace->payload_map);
+      while (payload_map) {
+        // Find the rightmost set bit.
+        uint32_t set_pos =
+            static_cast<uint32_t>(log2(payload_map & -payload_map));
+        switch (set_pos) {
+          case TracePayloadType::kMultiGetSize: {
+            GetFixed32(&buf, &multiget_size);
+            break;
+          }
+          case TracePayloadType::kMultiGetCFIDs: {
+            GetLengthPrefixedSlice(&buf, &cfids_payload);
+            break;
+          }
+          case TracePayloadType::kMultiGetKeys: {
+            GetLengthPrefixedSlice(&buf, &keys_payload);
+            break;
+          }
+          default: {
+            assert(false);
+          }
+        }
+        // unset the rightmost bit.
+        payload_map &= (payload_map - 1);
+      }
+      if (multiget_size == 0) {
+        return Status::InvalidArgument("Empty MultiGet cf_ids or keys.");
+      }
+
+      // Decode the cfids_payload and keys_payload
+      cf_ids.reserve(multiget_size);
+      multiget_keys.reserve(multiget_size);
+      for (uint32_t i = 0; i < multiget_size; i++) {
+        uint32_t tmp_cfid;
+        Slice tmp_key;
+        GetFixed32(&cfids_payload, &tmp_cfid);
+        GetLengthPrefixedSlice(&keys_payload, &tmp_key);
+        cf_ids.push_back(tmp_cfid);
+        Slice s(tmp_key);
+        PinnableSlice ps;
+        ps.PinSelf(s);
+        multiget_keys.push_back(std::move(ps));
+      }
+
+      if (record != nullptr) {
+        record->reset(new MultiGetQueryTraceRecord(
+            std::move(cf_ids), std::move(multiget_keys), trace->ts));
+      }
+
+      return Status::OK();
+    }
+    default:
+      return Status::NotSupported("Unsupported trace type.");
+  }
+}
+
+Tracer::Tracer(SystemClock* clock, const TraceOptions& trace_options,
+               std::unique_ptr<TraceWriter>&& trace_writer)
+    : clock_(clock),
+      trace_options_(trace_options),
+      trace_writer_(std::move(trace_writer)),
+      trace_request_count_(0) {
+  // TODO: What if this fails?
+  WriteHeader().PermitUncheckedError();
+}
+
+Tracer::~Tracer() { trace_writer_.reset(); }
+
+Status Tracer::Write(WriteBatch* write_batch) {
+  TraceType trace_type = kTraceWrite;
+  if (ShouldSkipTrace(trace_type)) {
+    return Status::OK();
+  }
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = trace_type;
+  TracerHelper::SetPayloadMap(trace.payload_map,
+                              TracePayloadType::kWriteBatchData);
+  PutFixed64(&trace.payload, trace.payload_map);
+  PutLengthPrefixedSlice(&trace.payload, Slice(write_batch->Data()));
+  return WriteTrace(trace);
+}
+
+Status Tracer::Get(ColumnFamilyHandle* column_family, const Slice& key) {
+  TraceType trace_type = kTraceGet;
+  if (ShouldSkipTrace(trace_type)) {
+    return Status::OK();
+  }
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = trace_type;
+  // Set the payloadmap of the struct member that will be encoded in the
+  // payload.
+  TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kGetCFID);
+  TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kGetKey);
+  // Encode the Get struct members into payload. Make sure add them in order.
+  PutFixed64(&trace.payload, trace.payload_map);
+  PutFixed32(&trace.payload, column_family->GetID());
+  PutLengthPrefixedSlice(&trace.payload, key);
+  return WriteTrace(trace);
+}
+
+Status Tracer::IteratorSeek(const uint32_t& cf_id, const Slice& key,
+                            const Slice& lower_bound, const Slice upper_bound) {
+  TraceType trace_type = kTraceIteratorSeek;
+  if (ShouldSkipTrace(trace_type)) {
+    return Status::OK();
+  }
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = trace_type;
+  // Set the payloadmap of the struct member that will be encoded in the
+  // payload.
+  TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kIterCFID);
+  TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kIterKey);
+  if (lower_bound.size() > 0) {
+    TracerHelper::SetPayloadMap(trace.payload_map,
+                                TracePayloadType::kIterLowerBound);
+  }
+  if (upper_bound.size() > 0) {
+    TracerHelper::SetPayloadMap(trace.payload_map,
+                                TracePayloadType::kIterUpperBound);
+  }
+  // Encode the Iterator struct members into payload. Make sure add them in
+  // order.
+  PutFixed64(&trace.payload, trace.payload_map);
+  PutFixed32(&trace.payload, cf_id);
+  PutLengthPrefixedSlice(&trace.payload, key);
+  if (lower_bound.size() > 0) {
+    PutLengthPrefixedSlice(&trace.payload, lower_bound);
+  }
+  if (upper_bound.size() > 0) {
+    PutLengthPrefixedSlice(&trace.payload, upper_bound);
+  }
+  return WriteTrace(trace);
+}
+
+Status Tracer::IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
+                                   const Slice& lower_bound,
+                                   const Slice upper_bound) {
+  TraceType trace_type = kTraceIteratorSeekForPrev;
+  if (ShouldSkipTrace(trace_type)) {
+    return Status::OK();
+  }
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = trace_type;
+  // Set the payloadmap of the struct member that will be encoded in the
+  // payload.
+  TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kIterCFID);
+  TracerHelper::SetPayloadMap(trace.payload_map, TracePayloadType::kIterKey);
+  if (lower_bound.size() > 0) {
+    TracerHelper::SetPayloadMap(trace.payload_map,
+                                TracePayloadType::kIterLowerBound);
+  }
+  if (upper_bound.size() > 0) {
+    TracerHelper::SetPayloadMap(trace.payload_map,
+                                TracePayloadType::kIterUpperBound);
+  }
+  // Encode the Iterator struct members into payload. Make sure add them in
+  // order.
+  PutFixed64(&trace.payload, trace.payload_map);
+  PutFixed32(&trace.payload, cf_id);
+  PutLengthPrefixedSlice(&trace.payload, key);
+  if (lower_bound.size() > 0) {
+    PutLengthPrefixedSlice(&trace.payload, lower_bound);
+  }
+  if (upper_bound.size() > 0) {
+    PutLengthPrefixedSlice(&trace.payload, upper_bound);
+  }
+  return WriteTrace(trace);
+}
+
+Status Tracer::MultiGet(const size_t num_keys,
+                        ColumnFamilyHandle** column_families,
+                        const Slice* keys) {
+  if (num_keys == 0) {
+    return Status::OK();
+  }
+  std::vector<ColumnFamilyHandle*> v_column_families;
+  std::vector<Slice> v_keys;
+  v_column_families.resize(num_keys);
+  v_keys.resize(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    v_column_families[i] = column_families[i];
+    v_keys[i] = keys[i];
+  }
+  return MultiGet(v_column_families, v_keys);
+}
+
+Status Tracer::MultiGet(const size_t num_keys,
+                        ColumnFamilyHandle* column_family, const Slice* keys) {
+  if (num_keys == 0) {
+    return Status::OK();
+  }
+  std::vector<ColumnFamilyHandle*> column_families;
+  std::vector<Slice> v_keys;
+  column_families.resize(num_keys);
+  v_keys.resize(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    column_families[i] = column_family;
+    v_keys[i] = keys[i];
+  }
+  return MultiGet(column_families, v_keys);
+}
+
+Status Tracer::MultiGet(const std::vector<ColumnFamilyHandle*>& column_families,
+                        const std::vector<Slice>& keys) {
+  if (column_families.size() != keys.size()) {
+    return Status::Corruption("the CFs size and keys size does not match!");
+  }
+  TraceType trace_type = kTraceMultiGet;
+  if (ShouldSkipTrace(trace_type)) {
+    return Status::OK();
+  }
+  uint32_t multiget_size = static_cast<uint32_t>(keys.size());
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = trace_type;
+  // Set the payloadmap of the struct member that will be encoded in the
+  // payload.
+  TracerHelper::SetPayloadMap(trace.payload_map,
+                              TracePayloadType::kMultiGetSize);
+  TracerHelper::SetPayloadMap(trace.payload_map,
+                              TracePayloadType::kMultiGetCFIDs);
+  TracerHelper::SetPayloadMap(trace.payload_map,
+                              TracePayloadType::kMultiGetKeys);
+  // Encode the CFIDs inorder
+  std::string cfids_payload;
+  std::string keys_payload;
+  for (uint32_t i = 0; i < multiget_size; i++) {
+    assert(i < column_families.size());
+    assert(i < keys.size());
+    PutFixed32(&cfids_payload, column_families[i]->GetID());
+    PutLengthPrefixedSlice(&keys_payload, keys[i]);
+  }
+  // Encode the Get struct members into payload. Make sure add them in order.
+  PutFixed64(&trace.payload, trace.payload_map);
+  PutFixed32(&trace.payload, multiget_size);
+  PutLengthPrefixedSlice(&trace.payload, cfids_payload);
+  PutLengthPrefixedSlice(&trace.payload, keys_payload);
+  return WriteTrace(trace);
+}
+
+bool Tracer::ShouldSkipTrace(const TraceType& trace_type) {
+  if (IsTraceFileOverMax()) {
+    return true;
+  }
+
+  TraceFilterType filter_mask = kTraceFilterNone;
+  switch (trace_type) {
+    case kTraceNone:
+    case kTraceBegin:
+    case kTraceEnd:
+      filter_mask = kTraceFilterNone;
+      break;
+    case kTraceWrite:
+      filter_mask = kTraceFilterWrite;
+      break;
+    case kTraceGet:
+      filter_mask = kTraceFilterGet;
+      break;
+    case kTraceIteratorSeek:
+      filter_mask = kTraceFilterIteratorSeek;
+      break;
+    case kTraceIteratorSeekForPrev:
+      filter_mask = kTraceFilterIteratorSeekForPrev;
+      break;
+    case kBlockTraceIndexBlock:
+    case kBlockTraceFilterBlock:
+    case kBlockTraceDataBlock:
+    case kBlockTraceUncompressionDictBlock:
+    case kBlockTraceRangeDeletionBlock:
+    case kIOTracer:
+      filter_mask = kTraceFilterNone;
+      break;
+    case kTraceMultiGet:
+      filter_mask = kTraceFilterMultiGet;
+      break;
+    case kTraceMax:
+      assert(false);
+      filter_mask = kTraceFilterNone;
+      break;
+  }
+  if (filter_mask != kTraceFilterNone && trace_options_.filter & filter_mask) {
+    return true;
+  }
+
+  ++trace_request_count_;
+  if (trace_request_count_ < trace_options_.sampling_frequency) {
+    return true;
+  }
+  trace_request_count_ = 0;
+  return false;
+}
+
+bool Tracer::IsTraceFileOverMax() {
+  uint64_t trace_file_size = trace_writer_->GetFileSize();
+  return (trace_file_size > trace_options_.max_trace_file_size);
+}
+
+Status Tracer::WriteHeader() {
+  std::ostringstream s;
+  s << kTraceMagic << "\t"
+    << "Trace Version: " << kTraceFileMajorVersion << "."
+    << kTraceFileMinorVersion << "\t"
+    << "RocksDB Version: " << kMajorVersion << "." << kMinorVersion << "\t"
+    << "Format: Timestamp OpType Payload\n";
+  std::string header(s.str());
+
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = kTraceBegin;
+  trace.payload = header;
+  return WriteTrace(trace);
+}
+
+Status Tracer::WriteFooter() {
+  Trace trace;
+  trace.ts = clock_->NowMicros();
+  trace.type = kTraceEnd;
+  TracerHelper::SetPayloadMap(trace.payload_map,
+                              TracePayloadType::kEmptyPayload);
+  trace.payload = "";
+  return WriteTrace(trace);
+}
+
+Status Tracer::WriteTrace(const Trace& trace) {
+  std::string encoded_trace;
+  TracerHelper::EncodeTrace(trace, &encoded_trace);
+  return trace_writer_->Write(Slice(encoded_trace));
+}
+
+Status Tracer::Close() { return WriteFooter(); }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/trace_replay/trace_replay.h b/src/rocksdb/trace_replay/trace_replay.h
new file mode 100644
index 000000000..9aba5ceb7
--- /dev/null
+++ b/src/rocksdb/trace_replay/trace_replay.h
@@ -0,0 +1,183 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+
+#include "rocksdb/options.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/utilities/replayer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains Tracer and Replayer classes that enable capturing and
+// replaying RocksDB traces.
+
+class ColumnFamilyHandle;
+class ColumnFamilyData;
+class DB;
+class DBImpl;
+class Env;
+class Slice;
+class SystemClock;
+class TraceReader;
+class TraceWriter;
+class WriteBatch;
+
+struct ReadOptions;
+struct TraceOptions;
+struct WriteOptions;
+
+extern const std::string kTraceMagic;
+const unsigned int kTraceTimestampSize = 8;
+const unsigned int kTraceTypeSize = 1;
+const unsigned int kTracePayloadLengthSize = 4;
+const unsigned int kTraceMetadataSize =
+    kTraceTimestampSize + kTraceTypeSize + kTracePayloadLengthSize;
+
+static const int kTraceFileMajorVersion = 0;
+static const int kTraceFileMinorVersion = 2;
+
+// The data structure that defines a single trace.
+struct Trace {
+  uint64_t ts;  // timestamp
+  TraceType type;
+  // Each bit in payload_map stores which corresponding struct member added in
+  // the payload. Each TraceType has its corresponding payload struct. For
+  // example, if bit at position 0 is set in write payload, then the write batch
+  // will be addedd.
+  uint64_t payload_map = 0;
+  // Each trace type has its own payload_struct, which will be serilized in the
+  // payload.
+  std::string payload;
+
+  void reset() {
+    ts = 0;
+    type = kTraceMax;
+    payload_map = 0;
+    payload.clear();
+  }
+};
+
+enum TracePayloadType : char {
+  // Each member of all query payload structs should have a corresponding flag
+  // here. Make sure to add them sequentially in the order of it is added.
+  kEmptyPayload = 0,
+  kWriteBatchData = 1,
+  kGetCFID = 2,
+  kGetKey = 3,
+  kIterCFID = 4,
+  kIterKey = 5,
+  kIterLowerBound = 6,
+  kIterUpperBound = 7,
+  kMultiGetSize = 8,
+  kMultiGetCFIDs = 9,
+  kMultiGetKeys = 10,
+};
+
+class TracerHelper {
+ public:
+  // Parse the string with major and minor version only
+  static Status ParseVersionStr(std::string& v_string, int* v_num);
+
+  // Parse the trace file version and db version in trace header
+  static Status ParseTraceHeader(const Trace& header, int* trace_version,
+                                 int* db_version);
+
+  // Encode a version 0.1 trace object into the given string.
+  static void EncodeTrace(const Trace& trace, std::string* encoded_trace);
+
+  // Decode a string into the given trace object.
+  static Status DecodeTrace(const std::string& encoded_trace, Trace* trace);
+
+  // Decode a string into the given trace header.
+  static Status DecodeHeader(const std::string& encoded_trace, Trace* header);
+
+  // Set the payload map based on the payload type
+  static bool SetPayloadMap(uint64_t& payload_map,
+                            const TracePayloadType payload_type);
+
+  // Decode a Trace object into the corresponding TraceRecord.
+  // Return Status::OK() if nothing is wrong, record will be set accordingly.
+  // Return Status::NotSupported() if the trace type is not support, or the
+  // corresponding error status, record will be set to nullptr.
+  static Status DecodeTraceRecord(Trace* trace, int trace_file_version,
+                                  std::unique_ptr<TraceRecord>* record);
+};
+
+// Tracer captures all RocksDB operations using a user-provided TraceWriter.
+// Every RocksDB operation is written as a single trace. Each trace will have a
+// timestamp and type, followed by the trace payload.
+class Tracer {
+ public:
+  Tracer(SystemClock* clock, const TraceOptions& trace_options,
+         std::unique_ptr<TraceWriter>&& trace_writer);
+  ~Tracer();
+
+  // Trace all write operations -- Put, Merge, Delete, SingleDelete, Write
+  Status Write(WriteBatch* write_batch);
+
+  // Trace Get operations.
+  Status Get(ColumnFamilyHandle* cfname, const Slice& key);
+
+  // Trace Iterators.
+  Status IteratorSeek(const uint32_t& cf_id, const Slice& key,
+                      const Slice& lower_bound, const Slice upper_bound);
+  Status IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
+                             const Slice& lower_bound, const Slice upper_bound);
+
+  // Trace MultiGet
+
+  Status MultiGet(const size_t num_keys, ColumnFamilyHandle** column_families,
+                  const Slice* keys);
+
+  Status MultiGet(const size_t num_keys, ColumnFamilyHandle* column_family,
+                  const Slice* keys);
+
+  Status MultiGet(const std::vector<ColumnFamilyHandle*>& column_family,
+                  const std::vector<Slice>& keys);
+
+  // Returns true if the trace is over the configured max trace file limit.
+  // False otherwise.
+  bool IsTraceFileOverMax();
+
+  // Returns true if the order of write trace records must match the order of
+  // the corresponding records logged to WAL and applied to the DB.
+  bool IsWriteOrderPreserved() { return trace_options_.preserve_write_order; }
+
+  // Writes a trace footer at the end of the tracing
+  Status Close();
+
+ private:
+  // Write a trace header at the beginning, typically on initiating a trace,
+  // with some metadata like a magic number, trace version, RocksDB version, and
+  // trace format.
+  Status WriteHeader();
+
+  // Write a trace footer, typically on ending a trace, with some metadata.
+  Status WriteFooter();
+
+  // Write a single trace using the provided TraceWriter to the underlying
+  // system, say, a filesystem or a streaming service.
+  Status WriteTrace(const Trace& trace);
+
+  // Helps in filtering and sampling of traces.
+  // Returns true if a trace should be skipped, false otherwise.
+  bool ShouldSkipTrace(const TraceType& type);
+
+  SystemClock* clock_;
+  TraceOptions trace_options_;
+  std::unique_ptr<TraceWriter> trace_writer_;
+  uint64_t trace_request_count_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/aligned_buffer.h b/src/rocksdb/util/aligned_buffer.h
new file mode 100644
index 000000000..95ee5dfe8
--- /dev/null
+++ b/src/rocksdb/util/aligned_buffer.h
@@ -0,0 +1,234 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <algorithm>
+
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains utilities to handle the alignment of pages and buffers.
+
+// Truncate to a multiple of page_size, which is also a page boundary. This
+// helps to figuring out the right alignment.
+// Example:
+//   TruncateToPageBoundary(4096, 5000)  => 4096
+//   TruncateToPageBoundary((4096, 10000) => 8192
+inline size_t TruncateToPageBoundary(size_t page_size, size_t s) {
+  s -= (s & (page_size - 1));
+  assert((s % page_size) == 0);
+  return s;
+}
+
+// Round up x to a multiple of y.
+// Example:
+//   Roundup(13, 5)   => 15
+//   Roundup(201, 16) => 208
+inline size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
+
+// Round down x to a multiple of y.
+// Example:
+//   Rounddown(13, 5)   => 10
+//   Rounddown(201, 16) => 192
+inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; }
+
+// AlignedBuffer manages a buffer by taking alignment into consideration, and
+// aligns the buffer start and end positions. It is mainly used for direct I/O,
+// though it can be used other purposes as well.
+// It also supports expanding the managed buffer, and copying whole or part of
+// the data from old buffer into the new expanded buffer. Such a copy especially
+// helps in cases avoiding an IO to re-fetch the data from disk.
+//
+// Example:
+//   AlignedBuffer buf;
+//   buf.Alignment(alignment);
+//   buf.AllocateNewBuffer(user_requested_buf_size);
+//   ...
+//   buf.AllocateNewBuffer(2*user_requested_buf_size, /*copy_data*/ true,
+//                         copy_offset, copy_len);
+class AlignedBuffer {
+  size_t alignment_;
+  std::unique_ptr<char[]> buf_;
+  size_t capacity_;
+  size_t cursize_;
+  char* bufstart_;
+
+ public:
+  AlignedBuffer()
+      : alignment_(), capacity_(0), cursize_(0), bufstart_(nullptr) {}
+
+  AlignedBuffer(AlignedBuffer&& o) noexcept { *this = std::move(o); }
+
+  AlignedBuffer& operator=(AlignedBuffer&& o) noexcept {
+    alignment_ = std::move(o.alignment_);
+    buf_ = std::move(o.buf_);
+    capacity_ = std::move(o.capacity_);
+    cursize_ = std::move(o.cursize_);
+    bufstart_ = std::move(o.bufstart_);
+    return *this;
+  }
+
+  AlignedBuffer(const AlignedBuffer&) = delete;
+
+  AlignedBuffer& operator=(const AlignedBuffer&) = delete;
+
+  static bool isAligned(const void* ptr, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+  }
+
+  static bool isAligned(size_t n, size_t alignment) {
+    return n % alignment == 0;
+  }
+
+  size_t Alignment() const { return alignment_; }
+
+  size_t Capacity() const { return capacity_; }
+
+  size_t CurrentSize() const { return cursize_; }
+
+  const char* BufferStart() const { return bufstart_; }
+
+  char* BufferStart() { return bufstart_; }
+
+  void Clear() { cursize_ = 0; }
+
+  char* Release() {
+    cursize_ = 0;
+    capacity_ = 0;
+    bufstart_ = nullptr;
+    return buf_.release();
+  }
+
+  void Alignment(size_t alignment) {
+    assert(alignment > 0);
+    assert((alignment & (alignment - 1)) == 0);
+    alignment_ = alignment;
+  }
+
+  // Allocates a new buffer and sets the start position to the first aligned
+  // byte.
+  //
+  // requested_capacity: requested new buffer capacity. This capacity will be
+  //     rounded up based on alignment.
+  // copy_data: Copy data from old buffer to new buffer. If copy_offset and
+  //     copy_len are not passed in and the new requested capacity is bigger
+  //     than the existing buffer's capacity, the data in the exising buffer is
+  //     fully copied over to the new buffer.
+  // copy_offset: Copy data from this offset in old buffer.
+  // copy_len: Number of bytes to copy.
+  //
+  // The function does nothing if the new requested_capacity is smaller than
+  // the current buffer capacity and copy_data is true i.e. the old buffer is
+  // retained as is.
+  void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false,
+                         uint64_t copy_offset = 0, size_t copy_len = 0) {
+    assert(alignment_ > 0);
+    assert((alignment_ & (alignment_ - 1)) == 0);
+
+    copy_len = copy_len > 0 ? copy_len : cursize_;
+    if (copy_data && requested_capacity < copy_len) {
+      // If we are downsizing to a capacity that is smaller than the current
+      // data in the buffer -- Ignore the request.
+      return;
+    }
+
+    size_t new_capacity = Roundup(requested_capacity, alignment_);
+    char* new_buf = new char[new_capacity + alignment_];
+    char* new_bufstart = reinterpret_cast<char*>(
+        (reinterpret_cast<uintptr_t>(new_buf) + (alignment_ - 1)) &
+        ~static_cast<uintptr_t>(alignment_ - 1));
+
+    if (copy_data) {
+      assert(bufstart_ + copy_offset + copy_len <= bufstart_ + cursize_);
+      memcpy(new_bufstart, bufstart_ + copy_offset, copy_len);
+      cursize_ = copy_len;
+    } else {
+      cursize_ = 0;
+    }
+
+    bufstart_ = new_bufstart;
+    capacity_ = new_capacity;
+    buf_.reset(new_buf);
+  }
+
+  // Append to the buffer.
+  //
+  // src         : source to copy the data from.
+  // append_size : number of bytes to copy from src.
+  // Returns the number of bytes appended.
+  //
+  // If append_size is more than the remaining buffer size only the
+  // remaining-size worth of bytes are copied.
+  size_t Append(const char* src, size_t append_size) {
+    size_t buffer_remaining = capacity_ - cursize_;
+    size_t to_copy = std::min(append_size, buffer_remaining);
+
+    if (to_copy > 0) {
+      memcpy(bufstart_ + cursize_, src, to_copy);
+      cursize_ += to_copy;
+    }
+    return to_copy;
+  }
+
+  // Read from the buffer.
+  //
+  // dest      : destination buffer to copy the data to.
+  // offset    : the buffer offset to start reading from.
+  // read_size : the number of bytes to copy from the buffer to dest.
+  // Returns the number of bytes read/copied to dest.
+  size_t Read(char* dest, size_t offset, size_t read_size) const {
+    assert(offset < cursize_);
+
+    size_t to_read = 0;
+    if (offset < cursize_) {
+      to_read = std::min(cursize_ - offset, read_size);
+    }
+    if (to_read > 0) {
+      memcpy(dest, bufstart_ + offset, to_read);
+    }
+    return to_read;
+  }
+
+  // Pad to the end of alignment with "padding"
+  void PadToAlignmentWith(int padding) {
+    size_t total_size = Roundup(cursize_, alignment_);
+    size_t pad_size = total_size - cursize_;
+
+    if (pad_size > 0) {
+      assert((pad_size + cursize_) <= capacity_);
+      memset(bufstart_ + cursize_, padding, pad_size);
+      cursize_ += pad_size;
+    }
+  }
+
+  void PadWith(size_t pad_size, int padding) {
+    assert((pad_size + cursize_) <= capacity_);
+    memset(bufstart_ + cursize_, padding, pad_size);
+    cursize_ += pad_size;
+  }
+
+  // After a partial flush move the tail to the beginning of the buffer.
+  void RefitTail(size_t tail_offset, size_t tail_size) {
+    if (tail_size > 0) {
+      memmove(bufstart_, bufstart_ + tail_offset, tail_size);
+    }
+    cursize_ = tail_size;
+  }
+
+  // Returns a place to start appending.
+  // WARNING: Note that it is possible to write past the end of the buffer if
+  // the buffer is modified without using the write APIs or encapsulation
+  // offered by AlignedBuffer. It is up to the user to guard against such
+  // errors.
+  char* Destination() { return bufstart_ + cursize_; }
+
+  void Size(size_t cursize) { cursize_ = cursize; }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/async_file_reader.cc b/src/rocksdb/util/async_file_reader.cc
new file mode 100644
index 000000000..8401a6b44
--- /dev/null
+++ b/src/rocksdb/util/async_file_reader.cc
@@ -0,0 +1,73 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#if USE_COROUTINES
+#include "util/async_file_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) {
+  if (tail_) {
+    tail_->next_ = awaiter;
+  }
+  tail_ = awaiter;
+  if (!head_) {
+    head_ = awaiter;
+  }
+  num_reqs_ += awaiter->num_reqs_;
+  awaiter->io_handle_.resize(awaiter->num_reqs_);
+  awaiter->del_fn_.resize(awaiter->num_reqs_);
+  for (size_t i = 0; i < awaiter->num_reqs_; ++i) {
+    awaiter->file_
+        ->ReadAsync(
+            awaiter->read_reqs_[i], awaiter->opts_,
+            [](const FSReadRequest& req, void* cb_arg) {
+              FSReadRequest* read_req = static_cast<FSReadRequest*>(cb_arg);
+              read_req->status = req.status;
+              read_req->result = req.result;
+            },
+            &awaiter->read_reqs_[i], &awaiter->io_handle_[i],
+            &awaiter->del_fn_[i], /*aligned_buf=*/nullptr)
+        .PermitUncheckedError();
+  }
+  return true;
+}
+
+void AsyncFileReader::Wait() {
+  if (!head_) {
+    return;
+  }
+  ReadAwaiter* waiter;
+  std::vector<void*> io_handles;
+  io_handles.reserve(num_reqs_);
+  waiter = head_;
+  do {
+    for (size_t i = 0; i < waiter->num_reqs_; ++i) {
+      if (waiter->io_handle_[i]) {
+        io_handles.push_back(waiter->io_handle_[i]);
+      }
+    }
+  } while (waiter != tail_ && (waiter = waiter->next_));
+  if (io_handles.size() > 0) {
+    StopWatch sw(SystemClock::Default().get(), stats_, POLL_WAIT_MICROS);
+    fs_->Poll(io_handles, io_handles.size()).PermitUncheckedError();
+  }
+  do {
+    waiter = head_;
+    head_ = waiter->next_;
+
+    for (size_t i = 0; i < waiter->num_reqs_; ++i) {
+      if (waiter->io_handle_[i] && waiter->del_fn_[i]) {
+        waiter->del_fn_[i](waiter->io_handle_[i]);
+      }
+    }
+    waiter->awaiting_coro_.resume();
+  } while (waiter != tail_);
+  head_ = tail_ = nullptr;
+  RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_reqs_);
+  num_reqs_ = 0;
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // USE_COROUTINES
diff --git a/src/rocksdb/util/async_file_reader.h b/src/rocksdb/util/async_file_reader.h
new file mode 100644
index 000000000..df69a840e
--- /dev/null
+++ b/src/rocksdb/util/async_file_reader.h
@@ -0,0 +1,144 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).#pragma once
+#pragma once
+
+#if USE_COROUTINES
+#include "file/random_access_file_reader.h"
+#include "folly/experimental/coro/ViaIfAsync.h"
+#include "port/port.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/statistics.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SingleThreadExecutor;
+
+// AsyncFileReader implements the Awaitable concept, which allows calling
+// coroutines to co_await it. When the AsyncFileReader Awaitable is
+// resumed, it initiates the fie reads requested by the awaiting caller
+// by calling RandomAccessFileReader's ReadAsync. It then suspends the
+// awaiting coroutine. The suspended awaiter is later resumed by Wait().
+class AsyncFileReader {
+  class ReadAwaiter;
+  template <typename Awaiter>
+  class ReadOperation;
+
+ public:
+  AsyncFileReader(FileSystem* fs, Statistics* stats) : fs_(fs), stats_(stats) {}
+
+  ~AsyncFileReader() {}
+
+  ReadOperation<ReadAwaiter> MultiReadAsync(RandomAccessFileReader* file,
+                                            const IOOptions& opts,
+                                            FSReadRequest* read_reqs,
+                                            size_t num_reqs,
+                                            AlignedBuf* aligned_buf) noexcept {
+    return ReadOperation<ReadAwaiter>{*this,     file,     opts,
+                                      read_reqs, num_reqs, aligned_buf};
+  }
+
+ private:
+  friend SingleThreadExecutor;
+
+  // Implementation of the Awaitable concept
+  class ReadAwaiter {
+   public:
+    explicit ReadAwaiter(AsyncFileReader& reader, RandomAccessFileReader* file,
+                         const IOOptions& opts, FSReadRequest* read_reqs,
+                         size_t num_reqs, AlignedBuf* /*aligned_buf*/) noexcept
+        : reader_(reader),
+          file_(file),
+          opts_(opts),
+          read_reqs_(read_reqs),
+          num_reqs_(num_reqs),
+          next_(nullptr) {}
+
+    bool await_ready() noexcept { return false; }
+
+    // A return value of true means suspend the awaiter (calling coroutine). The
+    // awaiting_coro parameter is the handle of the awaiter. The handle can be
+    // resumed later, so we cache it here.
+    bool await_suspend(
+        folly::coro::impl::coroutine_handle<> awaiting_coro) noexcept {
+      awaiting_coro_ = awaiting_coro;
+      // MultiReadAsyncImpl always returns true, so caller will be suspended
+      return reader_.MultiReadAsyncImpl(this);
+    }
+
+    void await_resume() noexcept {}
+
+   private:
+    friend AsyncFileReader;
+
+    // The parameters passed to MultiReadAsync are cached here when the caller
+    // calls MultiReadAsync. Later, when the execution of this awaitable is
+    // started, these are used to do the actual IO
+    AsyncFileReader& reader_;
+    RandomAccessFileReader* file_;
+    const IOOptions& opts_;
+    FSReadRequest* read_reqs_;
+    size_t num_reqs_;
+    autovector<void*, 32> io_handle_;
+    autovector<IOHandleDeleter, 32> del_fn_;
+    folly::coro::impl::coroutine_handle<> awaiting_coro_;
+    // Use this to link to the next ReadAwaiter in the suspended coroutine
+    // list. The head and tail of the list are tracked by AsyncFileReader.
+    // We use this approach rather than an STL container in order to avoid
+    // extra memory allocations. The coroutine call already allocates a
+    // ReadAwaiter object.
+    ReadAwaiter* next_;
+  };
+
+  // An instance of ReadOperation is returned to the caller of MultiGetAsync.
+  // This represents an awaitable that can be started later.
+  template <typename Awaiter>
+  class ReadOperation {
+   public:
+    explicit ReadOperation(AsyncFileReader& reader,
+                           RandomAccessFileReader* file, const IOOptions& opts,
+                           FSReadRequest* read_reqs, size_t num_reqs,
+                           AlignedBuf* aligned_buf) noexcept
+        : reader_(reader),
+          file_(file),
+          opts_(opts),
+          read_reqs_(read_reqs),
+          num_reqs_(num_reqs),
+          aligned_buf_(aligned_buf) {}
+
+    auto viaIfAsync(folly::Executor::KeepAlive<> executor) const {
+      return folly::coro::co_viaIfAsync(
+          std::move(executor),
+          Awaiter{reader_, file_, opts_, read_reqs_, num_reqs_, aligned_buf_});
+    }
+
+   private:
+    AsyncFileReader& reader_;
+    RandomAccessFileReader* file_;
+    const IOOptions& opts_;
+    FSReadRequest* read_reqs_;
+    size_t num_reqs_;
+    AlignedBuf* aligned_buf_;
+  };
+
+  // This function does the actual work when this awaitable starts execution
+  bool MultiReadAsyncImpl(ReadAwaiter* awaiter);
+
+  // Called by the SingleThreadExecutor to poll for async IO completion.
+  // This also resumes the awaiting coroutines.
+  void Wait();
+
+  // Head of the queue of awaiters waiting for async IO completion
+  ReadAwaiter* head_ = nullptr;
+  // Tail of the awaiter queue
+  ReadAwaiter* tail_ = nullptr;
+  // Total number of pending async IOs
+  size_t num_reqs_ = 0;
+  FileSystem* fs_;
+  Statistics* stats_;
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // USE_COROUTINES
diff --git a/src/rocksdb/util/autovector.h b/src/rocksdb/util/autovector.h
new file mode 100644
index 000000000..f758473b7
--- /dev/null
+++ b/src/rocksdb/util/autovector.h
@@ -0,0 +1,406 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <initializer_list>
+#include <iterator>
+#include <stdexcept>
+#include <vector>
+
+#include "port/lang.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_LITE
+template <class T, size_t kSize = 8>
+class autovector : public std::vector<T> {
+  using std::vector<T>::vector;
+
+ public:
+  autovector() {
+    // Make sure the initial vector has space for kSize elements
+    std::vector<T>::reserve(kSize);
+  }
+};
+#else
+// A vector that leverages pre-allocated stack-based array to achieve better
+// performance for array with small amount of items.
+//
+// The interface resembles that of vector, but with less features since we aim
+// to solve the problem that we have in hand, rather than implementing a
+// full-fledged generic container.
+//
+// Currently we don't support:
+//  * shrink_to_fit()
+//     If used correctly, in most cases, people should not touch the
+//     underlying vector at all.
+//  * random insert()/erase(), please only use push_back()/pop_back().
+//  * No move/swap operations. Each autovector instance has a
+//     stack-allocated array and if we want support move/swap operations, we
+//     need to copy the arrays other than just swapping the pointers. In this
+//     case we'll just explicitly forbid these operations since they may
+//     lead users to make false assumption by thinking they are inexpensive
+//     operations.
+//
+// Naming style of public methods almost follows that of the STL's.
+template <class T, size_t kSize = 8>
+class autovector {
+ public:
+  // General STL-style container member types.
+  using value_type = T;
+  using difference_type = typename std::vector<T>::difference_type;
+  using size_type = typename std::vector<T>::size_type;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+
+  // This class is the base for regular/const iterator
+  template <class TAutoVector, class TValueType>
+  class iterator_impl {
+   public:
+    // -- iterator traits
+    using self_type = iterator_impl<TAutoVector, TValueType>;
+    using value_type = TValueType;
+    using reference = TValueType&;
+    using pointer = TValueType*;
+    using difference_type = typename TAutoVector::difference_type;
+    using iterator_category = std::random_access_iterator_tag;
+
+    iterator_impl(TAutoVector* vect, size_t index)
+        : vect_(vect), index_(index){};
+    iterator_impl(const iterator_impl&) = default;
+    ~iterator_impl() {}
+    iterator_impl& operator=(const iterator_impl&) = default;
+
+    // -- Advancement
+    // ++iterator
+    self_type& operator++() {
+      ++index_;
+      return *this;
+    }
+
+    // iterator++
+    self_type operator++(int) {
+      auto old = *this;
+      ++index_;
+      return old;
+    }
+
+    // --iterator
+    self_type& operator--() {
+      --index_;
+      return *this;
+    }
+
+    // iterator--
+    self_type operator--(int) {
+      auto old = *this;
+      --index_;
+      return old;
+    }
+
+    self_type operator-(difference_type len) const {
+      return self_type(vect_, index_ - len);
+    }
+
+    difference_type operator-(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ - other.index_;
+    }
+
+    self_type operator+(difference_type len) const {
+      return self_type(vect_, index_ + len);
+    }
+
+    self_type& operator+=(difference_type len) {
+      index_ += len;
+      return *this;
+    }
+
+    self_type& operator-=(difference_type len) {
+      index_ -= len;
+      return *this;
+    }
+
+    // -- Reference
+    reference operator*() const {
+      assert(vect_->size() >= index_);
+      return (*vect_)[index_];
+    }
+
+    pointer operator->() const {
+      assert(vect_->size() >= index_);
+      return &(*vect_)[index_];
+    }
+
+    reference operator[](difference_type len) const { return *(*this + len); }
+
+    // -- Logical Operators
+    bool operator==(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ == other.index_;
+    }
+
+    bool operator!=(const self_type& other) const { return !(*this == other); }
+
+    bool operator>(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ > other.index_;
+    }
+
+    bool operator<(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ < other.index_;
+    }
+
+    bool operator>=(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ >= other.index_;
+    }
+
+    bool operator<=(const self_type& other) const {
+      assert(vect_ == other.vect_);
+      return index_ <= other.index_;
+    }
+
+   private:
+    TAutoVector* vect_ = nullptr;
+    size_t index_ = 0;
+  };
+
+  using iterator = iterator_impl<autovector, value_type>;
+  using const_iterator = iterator_impl<const autovector, const value_type>;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+  autovector() : values_(reinterpret_cast<pointer>(buf_)) {}
+
+  autovector(std::initializer_list<T> init_list)
+      : values_(reinterpret_cast<pointer>(buf_)) {
+    for (const T& item : init_list) {
+      push_back(item);
+    }
+  }
+
+  ~autovector() { clear(); }
+
+  // -- Immutable operations
+  // Indicate if all data resides in in-stack data structure.
+  bool only_in_stack() const {
+    // If no element was inserted at all, the vector's capacity will be `0`.
+    return vect_.capacity() == 0;
+  }
+
+  size_type size() const { return num_stack_items_ + vect_.size(); }
+
+  // resize does not guarantee anything about the contents of the newly
+  // available elements
+  void resize(size_type n) {
+    if (n > kSize) {
+      vect_.resize(n - kSize);
+      while (num_stack_items_ < kSize) {
+        new ((void*)(&values_[num_stack_items_++])) value_type();
+      }
+      num_stack_items_ = kSize;
+    } else {
+      vect_.clear();
+      while (num_stack_items_ < n) {
+        new ((void*)(&values_[num_stack_items_++])) value_type();
+      }
+      while (num_stack_items_ > n) {
+        values_[--num_stack_items_].~value_type();
+      }
+    }
+  }
+
+  bool empty() const { return size() == 0; }
+
+  size_type capacity() const { return kSize + vect_.capacity(); }
+
+  void reserve(size_t cap) {
+    if (cap > kSize) {
+      vect_.reserve(cap - kSize);
+    }
+
+    assert(cap <= capacity());
+  }
+
+  const_reference operator[](size_type n) const {
+    assert(n < size());
+    if (n < kSize) {
+      return values_[n];
+    }
+    return vect_[n - kSize];
+  }
+
+  reference operator[](size_type n) {
+    assert(n < size());
+    if (n < kSize) {
+      return values_[n];
+    }
+    return vect_[n - kSize];
+  }
+
+  const_reference at(size_type n) const {
+    assert(n < size());
+    return (*this)[n];
+  }
+
+  reference at(size_type n) {
+    assert(n < size());
+    return (*this)[n];
+  }
+
+  reference front() {
+    assert(!empty());
+    return *begin();
+  }
+
+  const_reference front() const {
+    assert(!empty());
+    return *begin();
+  }
+
+  reference back() {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  const_reference back() const {
+    assert(!empty());
+    return *(end() - 1);
+  }
+
+  // -- Mutable Operations
+  void push_back(T&& item) {
+    if (num_stack_items_ < kSize) {
+      new ((void*)(&values_[num_stack_items_])) value_type();
+      values_[num_stack_items_++] = std::move(item);
+    } else {
+      vect_.push_back(item);
+    }
+  }
+
+  void push_back(const T& item) {
+    if (num_stack_items_ < kSize) {
+      new ((void*)(&values_[num_stack_items_])) value_type();
+      values_[num_stack_items_++] = item;
+    } else {
+      vect_.push_back(item);
+    }
+  }
+
+  template <class... Args>
+#if _LIBCPP_STD_VER > 14
+  reference emplace_back(Args&&... args) {
+    if (num_stack_items_ < kSize) {
+      return *(new ((void*)(&values_[num_stack_items_++]))
+                   value_type(std::forward<Args>(args)...));
+    } else {
+      return vect_.emplace_back(std::forward<Args>(args)...);
+    }
+  }
+#else
+  void emplace_back(Args&&... args) {
+    if (num_stack_items_ < kSize) {
+      new ((void*)(&values_[num_stack_items_++]))
+          value_type(std::forward<Args>(args)...);
+    } else {
+      vect_.emplace_back(std::forward<Args>(args)...);
+    }
+  }
+#endif
+
+  void pop_back() {
+    assert(!empty());
+    if (!vect_.empty()) {
+      vect_.pop_back();
+    } else {
+      values_[--num_stack_items_].~value_type();
+    }
+  }
+
+  void clear() {
+    while (num_stack_items_ > 0) {
+      values_[--num_stack_items_].~value_type();
+    }
+    vect_.clear();
+  }
+
+  // -- Copy and Assignment
+  autovector& assign(const autovector& other);
+
+  autovector(const autovector& other) { assign(other); }
+
+  autovector& operator=(const autovector& other) { return assign(other); }
+
+  autovector(autovector&& other) noexcept { *this = std::move(other); }
+  autovector& operator=(autovector&& other);
+
+  // -- Iterator Operations
+  iterator begin() { return iterator(this, 0); }
+
+  const_iterator begin() const { return const_iterator(this, 0); }
+
+  iterator end() { return iterator(this, this->size()); }
+
+  const_iterator end() const { return const_iterator(this, this->size()); }
+
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+ private:
+  size_type num_stack_items_ = 0;  // current number of items
+  alignas(alignof(
+      value_type)) char buf_[kSize *
+                             sizeof(value_type)];  // the first `kSize` items
+  pointer values_;
+  // used only if there are more than `kSize` items.
+  std::vector<T> vect_;
+};
+
+template <class T, size_t kSize>
+autovector<T, kSize>& autovector<T, kSize>::assign(
+    const autovector<T, kSize>& other) {
+  values_ = reinterpret_cast<pointer>(buf_);
+  // copy the internal vector
+  vect_.assign(other.vect_.begin(), other.vect_.end());
+
+  // copy array
+  num_stack_items_ = other.num_stack_items_;
+  std::copy(other.values_, other.values_ + num_stack_items_, values_);
+
+  return *this;
+}
+
+template <class T, size_t kSize>
+autovector<T, kSize>& autovector<T, kSize>::operator=(
+    autovector<T, kSize>&& other) {
+  values_ = reinterpret_cast<pointer>(buf_);
+  vect_ = std::move(other.vect_);
+  size_t n = other.num_stack_items_;
+  num_stack_items_ = n;
+  other.num_stack_items_ = 0;
+  for (size_t i = 0; i < n; ++i) {
+    values_[i] = std::move(other.values_[i]);
+  }
+  return *this;
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/autovector_test.cc b/src/rocksdb/util/autovector_test.cc
new file mode 100644
index 000000000..8c7c39ce6
--- /dev/null
+++ b/src/rocksdb/util/autovector_test.cc
@@ -0,0 +1,331 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/autovector.h"
+
+#include <atomic>
+#include <iostream>
+#include <string>
+#include <utility>
+
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+using std::cout;
+using std::endl;
+
+namespace ROCKSDB_NAMESPACE {
+
+class AutoVectorTest : public testing::Test {};
+const unsigned long kSize = 8;
+
+namespace {
+template <class T>
+void AssertAutoVectorOnlyInStack(autovector<T, kSize>* vec, bool result) {
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(vec->only_in_stack(), result);
+#else
+  (void)vec;
+  (void)result;
+#endif  // !ROCKSDB_LITE
+}
+}  // namespace
+
+TEST_F(AutoVectorTest, PushBackAndPopBack) {
+  autovector<size_t, kSize> vec;
+  ASSERT_TRUE(vec.empty());
+  ASSERT_EQ(0ul, vec.size());
+
+  for (size_t i = 0; i < 1000 * kSize; ++i) {
+    vec.push_back(i);
+    ASSERT_TRUE(!vec.empty());
+    if (i < kSize) {
+      AssertAutoVectorOnlyInStack(&vec, true);
+    } else {
+      AssertAutoVectorOnlyInStack(&vec, false);
+    }
+    ASSERT_EQ(i + 1, vec.size());
+    ASSERT_EQ(i, vec[i]);
+    ASSERT_EQ(i, vec.at(i));
+  }
+
+  size_t size = vec.size();
+  while (size != 0) {
+    vec.pop_back();
+    // will always be in heap
+    AssertAutoVectorOnlyInStack(&vec, false);
+    ASSERT_EQ(--size, vec.size());
+  }
+
+  ASSERT_TRUE(vec.empty());
+}
+
+TEST_F(AutoVectorTest, EmplaceBack) {
+  using ValType = std::pair<size_t, std::string>;
+  autovector<ValType, kSize> vec;
+
+  for (size_t i = 0; i < 1000 * kSize; ++i) {
+    vec.emplace_back(i, std::to_string(i + 123));
+    ASSERT_TRUE(!vec.empty());
+    if (i < kSize) {
+      AssertAutoVectorOnlyInStack(&vec, true);
+    } else {
+      AssertAutoVectorOnlyInStack(&vec, false);
+    }
+
+    ASSERT_EQ(i + 1, vec.size());
+    ASSERT_EQ(i, vec[i].first);
+    ASSERT_EQ(std::to_string(i + 123), vec[i].second);
+  }
+
+  vec.clear();
+  ASSERT_TRUE(vec.empty());
+  AssertAutoVectorOnlyInStack(&vec, false);
+}
+
+TEST_F(AutoVectorTest, Resize) {
+  autovector<size_t, kSize> vec;
+
+  vec.resize(kSize);
+  AssertAutoVectorOnlyInStack(&vec, true);
+  for (size_t i = 0; i < kSize; ++i) {
+    vec[i] = i;
+  }
+
+  vec.resize(kSize * 2);
+  AssertAutoVectorOnlyInStack(&vec, false);
+  for (size_t i = 0; i < kSize; ++i) {
+    ASSERT_EQ(vec[i], i);
+  }
+  for (size_t i = 0; i < kSize; ++i) {
+    vec[i + kSize] = i;
+  }
+
+  vec.resize(1);
+  ASSERT_EQ(1U, vec.size());
+}
+
+namespace {
+void AssertEqual(const autovector<size_t, kSize>& a,
+                 const autovector<size_t, kSize>& b) {
+  ASSERT_EQ(a.size(), b.size());
+  ASSERT_EQ(a.empty(), b.empty());
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(a.only_in_stack(), b.only_in_stack());
+#endif  // !ROCKSDB_LITE
+  for (size_t i = 0; i < a.size(); ++i) {
+    ASSERT_EQ(a[i], b[i]);
+  }
+}
+}  // namespace
+
+TEST_F(AutoVectorTest, CopyAndAssignment) {
+  // Test both heap-allocated and stack-allocated cases.
+  for (auto size : {kSize / 2, kSize * 1000}) {
+    autovector<size_t, kSize> vec;
+    for (size_t i = 0; i < size; ++i) {
+      vec.push_back(i);
+    }
+
+    {
+      autovector<size_t, kSize> other;
+      other = vec;
+      AssertEqual(other, vec);
+    }
+
+    {
+      autovector<size_t, kSize> other(vec);
+      AssertEqual(other, vec);
+    }
+  }
+}
+
+TEST_F(AutoVectorTest, Iterators) {
+  autovector<std::string, kSize> vec;
+  for (size_t i = 0; i < kSize * 1000; ++i) {
+    vec.push_back(std::to_string(i));
+  }
+
+  // basic operator test
+  ASSERT_EQ(vec.front(), *vec.begin());
+  ASSERT_EQ(vec.back(), *(vec.end() - 1));
+  ASSERT_TRUE(vec.begin() < vec.end());
+
+  // non-const iterator
+  size_t index = 0;
+  for (const auto& item : vec) {
+    ASSERT_EQ(vec[index++], item);
+  }
+
+  index = vec.size() - 1;
+  for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) {
+    ASSERT_EQ(vec[index--], *pos);
+  }
+
+  // const iterator
+  const auto& cvec = vec;
+  index = 0;
+  for (const auto& item : cvec) {
+    ASSERT_EQ(cvec[index++], item);
+  }
+
+  index = vec.size() - 1;
+  for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) {
+    ASSERT_EQ(cvec[index--], *pos);
+  }
+
+  // forward and backward
+  auto pos = vec.begin();
+  while (pos != vec.end()) {
+    auto old_val = *pos;
+    auto old = pos++;
+    // HACK: make sure -> works
+    ASSERT_TRUE(!old->empty());
+    ASSERT_EQ(old_val, *old);
+    ASSERT_TRUE(pos == vec.end() || old_val != *pos);
+  }
+
+  pos = vec.begin();
+  for (size_t i = 0; i < vec.size(); i += 2) {
+    // Cannot use ASSERT_EQ since that macro depends on iostream serialization
+    ASSERT_TRUE(pos + 2 - 2 == pos);
+    pos += 2;
+    ASSERT_TRUE(pos >= vec.begin());
+    ASSERT_TRUE(pos <= vec.end());
+
+    size_t diff = static_cast<size_t>(pos - vec.begin());
+    ASSERT_EQ(i + 2, diff);
+  }
+}
+
+namespace {
+std::vector<std::string> GetTestKeys(size_t size) {
+  std::vector<std::string> keys;
+  keys.resize(size);
+
+  int index = 0;
+  for (auto& key : keys) {
+    key = "item-" + std::to_string(index++);
+  }
+  return keys;
+}
+}  // namespace
+
+template <class TVector>
+void BenchmarkVectorCreationAndInsertion(
+    std::string name, size_t ops, size_t item_size,
+    const std::vector<typename TVector::value_type>& items) {
+  auto env = Env::Default();
+
+  int index = 0;
+  auto start_time = env->NowNanos();
+  auto ops_remaining = ops;
+  while (ops_remaining--) {
+    TVector v;
+    for (size_t i = 0; i < item_size; ++i) {
+      v.push_back(items[index++]);
+    }
+  }
+  auto elapsed = env->NowNanos() - start_time;
+  cout << "created " << ops << " " << name << " instances:\n\t"
+       << "each was inserted with " << item_size << " elements\n\t"
+       << "total time elapsed: " << elapsed << " (ns)" << endl;
+}
+
+template <class TVector>
+size_t BenchmarkSequenceAccess(std::string name, size_t ops, size_t elem_size) {
+  TVector v;
+  for (const auto& item : GetTestKeys(elem_size)) {
+    v.push_back(item);
+  }
+  auto env = Env::Default();
+
+  auto ops_remaining = ops;
+  auto start_time = env->NowNanos();
+  size_t total = 0;
+  while (ops_remaining--) {
+    auto end = v.end();
+    for (auto pos = v.begin(); pos != end; ++pos) {
+      total += pos->size();
+    }
+  }
+  auto elapsed = env->NowNanos() - start_time;
+  cout << "performed " << ops << " sequence access against " << name << "\n\t"
+       << "size: " << elem_size << "\n\t"
+       << "total time elapsed: " << elapsed << " (ns)" << endl;
+  // HACK avoid compiler's optimization to ignore total
+  return total;
+}
+
+// This test case only reports the performance between std::vector<std::string>
+// and autovector<std::string>. We chose string for comparison because in most
+// of our use cases we used std::vector<std::string>.
+TEST_F(AutoVectorTest, PerfBench) {
+  // We run same operations for kOps times in order to get a more fair result.
+  size_t kOps = 100000;
+
+  // Creation and insertion test
+  // Test the case when there is:
+  //  * no element inserted: internal array of std::vector may not really get
+  //    initialize.
+  //  * one element inserted: internal array of std::vector must have
+  //    initialized.
+  //  * kSize elements inserted. This shows the most time we'll spend if we
+  //    keep everything in stack.
+  //  * 2 * kSize elements inserted. The internal vector of
+  //    autovector must have been initialized.
+  cout << "=====================================================" << endl;
+  cout << "Creation and Insertion Test (value type: std::string)" << endl;
+  cout << "=====================================================" << endl;
+
+  // pre-generated unique keys
+  auto string_keys = GetTestKeys(kOps * 2 * kSize);
+  for (auto insertions : {0ul, 1ul, kSize / 2, kSize, 2 * kSize}) {
+    BenchmarkVectorCreationAndInsertion<std::vector<std::string>>(
+        "std::vector<std::string>", kOps, insertions, string_keys);
+    BenchmarkVectorCreationAndInsertion<autovector<std::string, kSize>>(
+        "autovector<std::string>", kOps, insertions, string_keys);
+    cout << "-----------------------------------" << endl;
+  }
+
+  cout << "=====================================================" << endl;
+  cout << "Creation and Insertion Test (value type: uint64_t)" << endl;
+  cout << "=====================================================" << endl;
+
+  // pre-generated unique keys
+  std::vector<uint64_t> int_keys(kOps * 2 * kSize);
+  for (size_t i = 0; i < kOps * 2 * kSize; ++i) {
+    int_keys[i] = i;
+  }
+  for (auto insertions : {0ul, 1ul, kSize / 2, kSize, 2 * kSize}) {
+    BenchmarkVectorCreationAndInsertion<std::vector<uint64_t>>(
+        "std::vector<uint64_t>", kOps, insertions, int_keys);
+    BenchmarkVectorCreationAndInsertion<autovector<uint64_t, kSize>>(
+        "autovector<uint64_t>", kOps, insertions, int_keys);
+    cout << "-----------------------------------" << endl;
+  }
+
+  // Sequence Access Test
+  cout << "=====================================================" << endl;
+  cout << "Sequence Access Test" << endl;
+  cout << "=====================================================" << endl;
+  for (auto elem_size : {kSize / 2, kSize, 2 * kSize}) {
+    BenchmarkSequenceAccess<std::vector<std::string>>("std::vector", kOps,
+                                                      elem_size);
+    BenchmarkSequenceAccess<autovector<std::string, kSize>>("autovector", kOps,
+                                                            elem_size);
+    cout << "-----------------------------------" << endl;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/bloom_impl.h b/src/rocksdb/util/bloom_impl.h
new file mode 100644
index 000000000..fadd012d3
--- /dev/null
+++ b/src/rocksdb/util/bloom_impl.h
@@ -0,0 +1,489 @@
+//  Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Implementation details of various Bloom filter implementations used in
+// RocksDB. (DynamicBloom is in a separate file for now because it
+// supports concurrent write.)
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cmath>
+
+#include "port/port.h"  // for PREFETCH
+#include "rocksdb/slice.h"
+#include "util/hash.h"
+
+#ifdef HAVE_AVX2
+#include <immintrin.h>
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class BloomMath {
+ public:
+  // False positive rate of a standard Bloom filter, for given ratio of
+  // filter memory bits to added keys, and number of probes per operation.
+  // (The false positive rate is effectively independent of scale, assuming
+  // the implementation scales OK.)
+  static double StandardFpRate(double bits_per_key, int num_probes) {
+    // Standard very-good-estimate formula. See
+    // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
+    return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
+  }
+
+  // False positive rate of a "blocked"/"shareded"/"cache-local" Bloom filter,
+  // for given ratio of filter memory bits to added keys, number of probes per
+  // operation (all within the given block or cache line size), and block or
+  // cache line size.
+  static double CacheLocalFpRate(double bits_per_key, int num_probes,
+                                 int cache_line_bits) {
+    if (bits_per_key <= 0.0) {
+      // Fix a discontinuity
+      return 1.0;
+    }
+    double keys_per_cache_line = cache_line_bits / bits_per_key;
+    // A reasonable estimate is the average of the FP rates for one standard
+    // deviation above and below the mean bucket occupancy. See
+    // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math
+    double keys_stddev = std::sqrt(keys_per_cache_line);
+    double crowded_fp = StandardFpRate(
+        cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes);
+    double uncrowded_fp = StandardFpRate(
+        cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes);
+    return (crowded_fp + uncrowded_fp) / 2;
+  }
+
+  // False positive rate of querying a new item against `num_keys` items, all
+  // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes
+  // themselves are stored losslessly. See Section 4 of
+  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
+  static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) {
+    double inv_fingerprint_space = std::pow(0.5, fingerprint_bits);
+    // Base estimate assumes each key maps to a unique fingerprint.
+    // Could be > 1 in extreme cases.
+    double base_estimate = num_keys * inv_fingerprint_space;
+    // To account for potential overlap, we choose between two formulas
+    if (base_estimate > 0.0001) {
+      // A very good formula assuming we don't construct a floating point
+      // number extremely close to 1. Always produces a probability < 1.
+      return 1.0 - std::exp(-base_estimate);
+    } else {
+      // A very good formula when base_estimate is far below 1. (Subtract
+      // away the integral-approximated sum that some key has same hash as
+      // one coming before it in a list.)
+      return base_estimate - (base_estimate * base_estimate * 0.5);
+    }
+  }
+
+  // Returns the probably of either of two independent(-ish) events
+  // happening, given their probabilities. (This is useful for combining
+  // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate
+  // for a hash-efficient Bloom filter's FP rate. See Section 4 of
+  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
+  static double IndependentProbabilitySum(double rate1, double rate2) {
+    // Use formula that avoids floating point extremely close to 1 if
+    // rates are extremely small.
+    return rate1 + rate2 - (rate1 * rate2);
+  }
+};
+
+// A fast, flexible, and accurate cache-local Bloom implementation with
+// SIMD-optimized query performance (currently using AVX2 on Intel). Write
+// performance and non-SIMD read are very good, benefiting from FastRange32
+// used in place of % and single-cycle multiplication on recent processors.
+//
+// Most other SIMD Bloom implementations sacrifice flexibility and/or
+// accuracy by requiring num_probes to be a power of two and restricting
+// where each probe can occur in a cache line. This implementation sacrifices
+// SIMD-optimization for add (might still be possible, especially with AVX512)
+// in favor of allowing any num_probes, not crossing cache line boundary,
+// and accuracy close to theoretical best accuracy for a cache-local Bloom.
+// E.g. theoretical best for 10 bits/key, num_probes=6, and 512-bit bucket
+// (Intel cache line size) is 0.9535% FP rate. This implementation yields
+// about 0.957%. (Compare to LegacyLocalityBloomImpl<false> at 1.138%, or
+// about 0.951% for 1024-bit buckets, cache line size for some ARM CPUs.)
+//
+// This implementation can use a 32-bit hash (let h2 be h1 * 0x9e3779b9) or
+// a 64-bit hash (split into two uint32s). With many millions of keys, the
+// false positive rate associated with using a 32-bit hash can dominate the
+// false positive rate of the underlying filter. At 10 bits/key setting, the
+// inflection point is about 40 million keys, so 32-bit hash is a bad idea
+// with 10s of millions of keys or more.
+//
+// Despite accepting a 64-bit hash, this implementation uses 32-bit fastrange
+// to pick a cache line, which can be faster than 64-bit in some cases.
+// This only hurts accuracy as you get into 10s of GB for a single filter,
+// and accuracy abruptly breaks down at 256GB (2^32 cache lines). Switch to
+// 64-bit fastrange if you need filters so big. ;)
+//
+// Using only a 32-bit input hash within each cache line has negligible
+// impact for any reasonable cache line / bucket size, for arbitrary filter
+// size, and potentially saves intermediate data size in some cases vs.
+// tracking full 64 bits. (Even in an implementation using 64-bit arithmetic
+// to generate indices, I might do the same, as a single multiplication
+// suffices to generate a sufficiently mixed 64 bits from 32 bits.)
+//
+// This implementation is currently tied to Intel cache line size, 64 bytes ==
+// 512 bits. If there's sufficient demand for other cache line sizes, this is
+// a pretty good implementation to extend, but slight performance enhancements
+// are possible with an alternate implementation (probably not very compatible
+// with SIMD):
+// (1) Use rotation in addition to multiplication for remixing
+// (like murmur hash). (Using multiplication alone *slightly* hurts accuracy
+// because lower bits never depend on original upper bits.)
+// (2) Extract more than one bit index from each re-mix. (Only if rotation
+// or similar is part of remix, because otherwise you're making the
+// multiplication-only problem worse.)
+// (3) Re-mix full 64 bit hash, to get maximum number of bit indices per
+// re-mix.
+//
+class FastLocalBloomImpl {
+ public:
+  // NOTE: this has only been validated to enough accuracy for producing
+  // reasonable warnings / user feedback, not for making functional decisions.
+  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes,
+                                int hash_bits) {
+    return BloomMath::IndependentProbabilitySum(
+        BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes,
+                                    /*cache line bits*/ 512),
+        BloomMath::FingerprintFpRate(keys, hash_bits));
+  }
+
+  static inline int ChooseNumProbes(int millibits_per_key) {
+    // Since this implementation can (with AVX2) make up to 8 probes
+    // for the same cost, we pick the most accurate num_probes, based
+    // on actual tests of the implementation. Note that for higher
+    // bits/key, the best choice for cache-local Bloom can be notably
+    // smaller than standard bloom, e.g. 9 instead of 11 @ 16 b/k.
+    if (millibits_per_key <= 2080) {
+      return 1;
+    } else if (millibits_per_key <= 3580) {
+      return 2;
+    } else if (millibits_per_key <= 5100) {
+      return 3;
+    } else if (millibits_per_key <= 6640) {
+      return 4;
+    } else if (millibits_per_key <= 8300) {
+      return 5;
+    } else if (millibits_per_key <= 10070) {
+      return 6;
+    } else if (millibits_per_key <= 11720) {
+      return 7;
+    } else if (millibits_per_key <= 14001) {
+      // Would be something like <= 13800 but sacrificing *slightly* for
+      // more settings using <= 8 probes.
+      return 8;
+    } else if (millibits_per_key <= 16050) {
+      return 9;
+    } else if (millibits_per_key <= 18300) {
+      return 10;
+    } else if (millibits_per_key <= 22001) {
+      return 11;
+    } else if (millibits_per_key <= 25501) {
+      return 12;
+    } else if (millibits_per_key > 50000) {
+      // Top out at 24 probes (three sets of 8)
+      return 24;
+    } else {
+      // Roughly optimal choices for remaining range
+      // e.g.
+      // 28000 -> 12, 28001 -> 13
+      // 50000 -> 23, 50001 -> 24
+      return (millibits_per_key - 1) / 2000 - 1;
+    }
+  }
+
+  static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes,
+                             int num_probes, char *data) {
+    uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6;
+    AddHashPrepared(h2, num_probes, data + bytes_to_cache_line);
+  }
+
+  static inline void AddHashPrepared(uint32_t h2, int num_probes,
+                                     char *data_at_cache_line) {
+    uint32_t h = h2;
+    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
+      // 9-bit address within 512 bit cache line
+      int bitpos = h >> (32 - 9);
+      data_at_cache_line[bitpos >> 3] |= (uint8_t{1} << (bitpos & 7));
+    }
+  }
+
+  static inline void PrepareHash(uint32_t h1, uint32_t len_bytes,
+                                 const char *data,
+                                 uint32_t /*out*/ *byte_offset) {
+    uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6;
+    PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */);
+    PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */);
+    *byte_offset = bytes_to_cache_line;
+  }
+
+  static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes,
+                                  int num_probes, const char *data) {
+    uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6;
+    return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line);
+  }
+
+  static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes,
+                                          const char *data_at_cache_line) {
+    uint32_t h = h2;
+#ifdef HAVE_AVX2
+    int rem_probes = num_probes;
+
+    // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18,
+    // etc.} one can insert specialized code for rem_probes <= 2, bypassing
+    // the SIMD code in those cases. There is a detectable but minor overhead
+    // applied to other values of num_probes (when not statically determined),
+    // but smoother performance curve vs. num_probes. But for now, when
+    // in doubt, don't add unnecessary code.
+
+    // Powers of 32-bit golden ratio, mod 2**32.
+    const __m256i multipliers =
+        _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9,
+                          0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749);
+
+    for (;;) {
+      // Eight copies of hash
+      __m256i hash_vector = _mm256_set1_epi32(h);
+
+      // Same effect as repeated multiplication by 0x9e3779b9 thanks to
+      // associativity of multiplication.
+      hash_vector = _mm256_mullo_epi32(hash_vector, multipliers);
+
+      // Now the top 9 bits of each of the eight 32-bit values in
+      // hash_vector are bit addresses for probes within the cache line.
+      // While the platform-independent code uses byte addressing (6 bits
+      // to pick a byte + 3 bits to pick a bit within a byte), here we work
+      // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit
+      // within a word) because that works well with AVX2 and is equivalent
+      // under little-endian.
+
+      // Shift each right by 28 bits to get 4-bit word addresses.
+      const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28);
+
+      // Gather 32-bit values spread over 512 bits by 4-bit address. In
+      // essence, we are dereferencing eight pointers within the cache
+      // line.
+      //
+      // Option 1: AVX2 gather (seems to be a little slow - understandable)
+      // const __m256i value_vector =
+      //     _mm256_i32gather_epi32(static_cast<const int
+      //     *>(data_at_cache_line),
+      //                            word_addresses,
+      //                            /*bytes / i32*/ 4);
+      // END Option 1
+      // Potentially unaligned as we're not *always* cache-aligned -> loadu
+      const __m256i *mm_data =
+          reinterpret_cast<const __m256i *>(data_at_cache_line);
+      __m256i lower = _mm256_loadu_si256(mm_data);
+      __m256i upper = _mm256_loadu_si256(mm_data + 1);
+      // Option 2: AVX512VL permute hack
+      // Only negligibly faster than Option 3, so not yet worth supporting
+      // const __m256i value_vector =
+      //    _mm256_permutex2var_epi32(lower, word_addresses, upper);
+      // END Option 2
+      // Option 3: AVX2 permute+blend hack
+      // Use lowest three bits to order probing values, as if all from same
+      // 256 bit piece.
+      lower = _mm256_permutevar8x32_epi32(lower, word_addresses);
+      upper = _mm256_permutevar8x32_epi32(upper, word_addresses);
+      // Just top 1 bit of address, to select between lower and upper.
+      const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31);
+      // Finally: the next 8 probed 32-bit values, in probing sequence order.
+      const __m256i value_vector =
+          _mm256_blendv_epi8(lower, upper, upper_lower_selector);
+      // END Option 3
+
+      // We might not need to probe all 8, so build a mask for selecting only
+      // what we need. (The k_selector(s) could be pre-computed but that
+      // doesn't seem to make a noticeable performance difference.)
+      const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+      // Subtract rem_probes from each of those constants
+      __m256i k_selector =
+          _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes));
+      // Negative after subtract -> use/select
+      // Keep only high bit (logical shift right each by 31).
+      k_selector = _mm256_srli_epi32(k_selector, 31);
+
+      // Strip off the 4 bit word address (shift left)
+      __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4);
+      // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses.
+      bit_addresses = _mm256_srli_epi32(bit_addresses, 27);
+      // Build a bit mask
+      const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses);
+
+      // Like ((~value_vector) & bit_mask) == 0)
+      bool match = _mm256_testc_si256(value_vector, bit_mask) != 0;
+
+      // This check first so that it's easy for branch predictor to optimize
+      // num_probes <= 8 case, making it free of unpredictable branches.
+      if (rem_probes <= 8) {
+        return match;
+      } else if (!match) {
+        return false;
+      }
+      // otherwise
+      // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power
+      h *= 0xab25f4c1;
+      rem_probes -= 8;
+    }
+#else
+    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
+      // 9-bit address within 512 bit cache line
+      int bitpos = h >> (32 - 9);
+      if ((data_at_cache_line[bitpos >> 3] & (char(1) << (bitpos & 7))) == 0) {
+        return false;
+      }
+    }
+    return true;
+#endif
+  }
+};
+
+// A legacy Bloom filter implementation with no locality of probes (slow).
+// It uses double hashing to generate a sequence of hash values.
+// Asymptotic analysis is in [Kirsch,Mitzenmacher 2006], but known to have
+// subtle accuracy flaws for practical sizes [Dillinger,Manolios 2004].
+//
+// DO NOT REUSE
+//
+class LegacyNoLocalityBloomImpl {
+ public:
+  static inline int ChooseNumProbes(int bits_per_key) {
+    // We intentionally round down to reduce probing cost a little bit
+    int num_probes = static_cast<int>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
+    if (num_probes < 1) num_probes = 1;
+    if (num_probes > 30) num_probes = 30;
+    return num_probes;
+  }
+
+  static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes,
+                             char *data) {
+    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (int i = 0; i < num_probes; i++) {
+      const uint32_t bitpos = h % total_bits;
+      data[bitpos / 8] |= (1 << (bitpos % 8));
+      h += delta;
+    }
+  }
+
+  static inline bool HashMayMatch(uint32_t h, uint32_t total_bits,
+                                  int num_probes, const char *data) {
+    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
+    for (int i = 0; i < num_probes; i++) {
+      const uint32_t bitpos = h % total_bits;
+      if ((data[bitpos / 8] & (1 << (bitpos % 8))) == 0) {
+        return false;
+      }
+      h += delta;
+    }
+    return true;
+  }
+};
+
+// A legacy Bloom filter implementation with probes local to a single
+// cache line (fast). Because SST files might be transported between
+// platforms, the cache line size is a parameter rather than hard coded.
+// (But if specified as a constant parameter, an optimizing compiler
+// should take advantage of that.)
+//
+// When ExtraRotates is false, this implementation is notably deficient in
+// accuracy. Specifically, it uses double hashing with a 1/512 chance of the
+// increment being zero (when cache line size is 512 bits). Thus, there's a
+// 1/512 chance of probing only one index, which we'd expect to incur about
+// a 1/2 * 1/512 or absolute 0.1% FP rate penalty. More detail at
+// https://github.com/facebook/rocksdb/issues/4120
+//
+// DO NOT REUSE
+//
+template <bool ExtraRotates>
+class LegacyLocalityBloomImpl {
+ private:
+  static inline uint32_t GetLine(uint32_t h, uint32_t num_lines) {
+    uint32_t offset_h = ExtraRotates ? (h >> 11) | (h << 21) : h;
+    return offset_h % num_lines;
+  }
+
+ public:
+  // NOTE: this has only been validated to enough accuracy for producing
+  // reasonable warnings / user feedback, not for making functional decisions.
+  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) {
+    double bits_per_key = 8.0 * bytes / keys;
+    double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes,
+                                                     /*cache line bits*/ 512);
+    if (!ExtraRotates) {
+      // Good estimate of impact of flaw in index computation.
+      // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key.
+      // The + 22 shifts it nicely to fit for lower bits/key.
+      filter_rate += 0.1 / (bits_per_key * 0.75 + 22);
+    } else {
+      // Not yet validated
+      assert(false);
+    }
+    // Always uses 32-bit hash
+    double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32);
+    return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate);
+  }
+
+  static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes,
+                             char *data, int log2_cache_line_bytes) {
+    const int log2_cache_line_bits = log2_cache_line_bytes + 3;
+
+    char *data_at_offset =
+        data + (GetLine(h, num_lines) << log2_cache_line_bytes);
+    const uint32_t delta = (h >> 17) | (h << 15);
+    for (int i = 0; i < num_probes; ++i) {
+      // Mask to bit-within-cache-line address
+      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
+      data_at_offset[bitpos / 8] |= (1 << (bitpos % 8));
+      if (ExtraRotates) {
+        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
+      }
+      h += delta;
+    }
+  }
+
+  static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines,
+                                         const char *data,
+                                         uint32_t /*out*/ *byte_offset,
+                                         int log2_cache_line_bytes) {
+    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
+    PREFETCH(data + b, 0 /* rw */, 1 /* locality */);
+    PREFETCH(data + b + ((1 << log2_cache_line_bytes) - 1), 0 /* rw */,
+             1 /* locality */);
+    *byte_offset = b;
+  }
+
+  static inline bool HashMayMatch(uint32_t h, uint32_t num_lines,
+                                  int num_probes, const char *data,
+                                  int log2_cache_line_bytes) {
+    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
+    return HashMayMatchPrepared(h, num_probes, data + b, log2_cache_line_bytes);
+  }
+
+  static inline bool HashMayMatchPrepared(uint32_t h, int num_probes,
+                                          const char *data_at_offset,
+                                          int log2_cache_line_bytes) {
+    const int log2_cache_line_bits = log2_cache_line_bytes + 3;
+
+    const uint32_t delta = (h >> 17) | (h << 15);
+    for (int i = 0; i < num_probes; ++i) {
+      // Mask to bit-within-cache-line address
+      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
+      if (((data_at_offset[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
+        return false;
+      }
+      if (ExtraRotates) {
+        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
+      }
+      h += delta;
+    }
+    return true;
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/bloom_test.cc b/src/rocksdb/util/bloom_test.cc
new file mode 100644
index 000000000..9d509ac3d
--- /dev/null
+++ b/src/rocksdb/util/bloom_test.cc
@@ -0,0 +1,1175 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+  return 0;
+}
+#else
+
+#include <array>
+#include <cmath>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
+#include "memory/arena.h"
+#include "port/jemalloc_helper.h"
+#include "rocksdb/filter_policy.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+#include "util/hash.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+// The test is not fully designed for bits_per_key other than 10, but with
+// this parameter you can easily explore the behavior of other bits_per_key.
+// See also filter_bench.
+DEFINE_int32(bits_per_key, 10, "");
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const std::string kLegacyBloom = test::LegacyBloomFilterPolicy::kClassName();
+const std::string kFastLocalBloom =
+    test::FastLocalBloomFilterPolicy::kClassName();
+const std::string kStandard128Ribbon =
+    test::Standard128RibbonFilterPolicy::kClassName();
+}  // namespace
+
+static const int kVerbose = 1;
+
+static Slice Key(int i, char* buffer) {
+  std::string s;
+  PutFixed32(&s, static_cast<uint32_t>(i));
+  memcpy(buffer, s.c_str(), sizeof(i));
+  return Slice(buffer, sizeof(i));
+}
+
+static int NextLength(int length) {
+  if (length < 10) {
+    length += 1;
+  } else if (length < 100) {
+    length += 10;
+  } else if (length < 1000) {
+    length += 100;
+  } else {
+    length += 1000;
+  }
+  return length;
+}
+
+class FullBloomTest : public testing::TestWithParam<std::string> {
+ protected:
+  BlockBasedTableOptions table_options_;
+
+ private:
+  std::shared_ptr<const FilterPolicy>& policy_;
+  std::unique_ptr<FilterBitsBuilder> bits_builder_;
+  std::unique_ptr<FilterBitsReader> bits_reader_;
+  std::unique_ptr<const char[]> buf_;
+  size_t filter_size_;
+
+ public:
+  FullBloomTest() : policy_(table_options_.filter_policy), filter_size_(0) {
+    ResetPolicy();
+  }
+
+  BuiltinFilterBitsBuilder* GetBuiltinFilterBitsBuilder() {
+    // Throws on bad cast
+    return dynamic_cast<BuiltinFilterBitsBuilder*>(bits_builder_.get());
+  }
+
+  const BloomLikeFilterPolicy* GetBloomLikeFilterPolicy() {
+    // Throws on bad cast
+    return &dynamic_cast<const BloomLikeFilterPolicy&>(*policy_);
+  }
+
+  void Reset() {
+    bits_builder_.reset(BloomFilterPolicy::GetBuilderFromContext(
+        FilterBuildingContext(table_options_)));
+    bits_reader_.reset(nullptr);
+    buf_.reset(nullptr);
+    filter_size_ = 0;
+  }
+
+  void ResetPolicy(double bits_per_key) {
+    policy_ = BloomLikeFilterPolicy::Create(GetParam(), bits_per_key);
+    Reset();
+  }
+
+  void ResetPolicy() { ResetPolicy(FLAGS_bits_per_key); }
+
+  void Add(const Slice& s) { bits_builder_->AddKey(s); }
+
+  void OpenRaw(const Slice& s) {
+    bits_reader_.reset(policy_->GetFilterBitsReader(s));
+  }
+
+  void Build() {
+    Slice filter = bits_builder_->Finish(&buf_);
+    bits_reader_.reset(policy_->GetFilterBitsReader(filter));
+    filter_size_ = filter.size();
+  }
+
+  size_t FilterSize() const { return filter_size_; }
+
+  Slice FilterData() { return Slice(buf_.get(), filter_size_); }
+
+  int GetNumProbesFromFilterData() {
+    assert(filter_size_ >= 5);
+    int8_t raw_num_probes = static_cast<int8_t>(buf_.get()[filter_size_ - 5]);
+    if (raw_num_probes == -1) {  // New bloom filter marker
+      return static_cast<uint8_t>(buf_.get()[filter_size_ - 3]);
+    } else {
+      return raw_num_probes;
+    }
+  }
+
+  int GetRibbonSeedFromFilterData() {
+    assert(filter_size_ >= 5);
+    // Check for ribbon marker
+    assert(-2 == static_cast<int8_t>(buf_.get()[filter_size_ - 5]));
+    return static_cast<uint8_t>(buf_.get()[filter_size_ - 4]);
+  }
+
+  bool Matches(const Slice& s) {
+    if (bits_reader_ == nullptr) {
+      Build();
+    }
+    return bits_reader_->MayMatch(s);
+  }
+
+  // Provides a kind of fingerprint on the Bloom filter's
+  // behavior, for reasonbly high FP rates.
+  uint64_t PackedMatches() {
+    char buffer[sizeof(int)];
+    uint64_t result = 0;
+    for (int i = 0; i < 64; i++) {
+      if (Matches(Key(i + 12345, buffer))) {
+        result |= uint64_t{1} << i;
+      }
+    }
+    return result;
+  }
+
+  // Provides a kind of fingerprint on the Bloom filter's
+  // behavior, for lower FP rates.
+  std::string FirstFPs(int count) {
+    char buffer[sizeof(int)];
+    std::string rv;
+    int fp_count = 0;
+    for (int i = 0; i < 1000000; i++) {
+      // Pack four match booleans into each hexadecimal digit
+      if (Matches(Key(i + 1000000, buffer))) {
+        ++fp_count;
+        rv += std::to_string(i);
+        if (fp_count == count) {
+          break;
+        }
+        rv += ',';
+      }
+    }
+    return rv;
+  }
+
+  double FalsePositiveRate() {
+    char buffer[sizeof(int)];
+    int result = 0;
+    for (int i = 0; i < 10000; i++) {
+      if (Matches(Key(i + 1000000000, buffer))) {
+        result++;
+      }
+    }
+    return result / 10000.0;
+  }
+};
+
+TEST_P(FullBloomTest, FilterSize) {
+  // In addition to checking the consistency of space computation, we are
+  // checking that denoted and computed doubles are interpreted as expected
+  // as bits_per_key values.
+  bool some_computed_less_than_denoted = false;
+  // Note: to avoid unproductive configurations, bits_per_key < 0.5 is rounded
+  // down to 0 (no filter), and 0.5 <= bits_per_key < 1.0 is rounded up to 1
+  // bit per key (1000 millibits). Also, enforced maximum is 100 bits per key
+  // (100000 millibits).
+  for (auto bpk : std::vector<std::pair<double, int> >{{-HUGE_VAL, 0},
+                                                       {-INFINITY, 0},
+                                                       {0.0, 0},
+                                                       {0.499, 0},
+                                                       {0.5, 1000},
+                                                       {1.234, 1234},
+                                                       {3.456, 3456},
+                                                       {9.5, 9500},
+                                                       {10.0, 10000},
+                                                       {10.499, 10499},
+                                                       {21.345, 21345},
+                                                       {99.999, 99999},
+                                                       {1234.0, 100000},
+                                                       {HUGE_VAL, 100000},
+                                                       {INFINITY, 100000},
+                                                       {NAN, 100000}}) {
+    ResetPolicy(bpk.first);
+    auto bfp = GetBloomLikeFilterPolicy();
+    EXPECT_EQ(bpk.second, bfp->GetMillibitsPerKey());
+    EXPECT_EQ((bpk.second + 500) / 1000, bfp->GetWholeBitsPerKey());
+
+    double computed = bpk.first;
+    // This transforms e.g. 9.5 -> 9.499999999999998, which we still
+    // round to 10 for whole bits per key.
+    computed += 0.5;
+    computed /= 1234567.0;
+    computed *= 1234567.0;
+    computed -= 0.5;
+    some_computed_less_than_denoted |= (computed < bpk.first);
+    ResetPolicy(computed);
+    bfp = GetBloomLikeFilterPolicy();
+    EXPECT_EQ(bpk.second, bfp->GetMillibitsPerKey());
+    EXPECT_EQ((bpk.second + 500) / 1000, bfp->GetWholeBitsPerKey());
+
+    auto bits_builder = GetBuiltinFilterBitsBuilder();
+    if (bpk.second == 0) {
+      ASSERT_EQ(bits_builder, nullptr);
+      continue;
+    }
+
+    size_t n = 1;
+    size_t space = 0;
+    for (; n < 1000000; n += 1 + n / 1000) {
+      // Ensure consistency between CalculateSpace and ApproximateNumEntries
+      space = bits_builder->CalculateSpace(n);
+      size_t n2 = bits_builder->ApproximateNumEntries(space);
+      EXPECT_GE(n2, n);
+      size_t space2 = bits_builder->CalculateSpace(n2);
+      if (n > 12000 && GetParam() == kStandard128Ribbon) {
+        // TODO(peterd): better approximation?
+        EXPECT_GE(space2, space);
+        EXPECT_LE(space2 * 0.998, space * 1.0);
+      } else {
+        EXPECT_EQ(space2, space);
+      }
+    }
+    // Until size_t overflow
+    for (; n < (n + n / 3); n += n / 3) {
+      // Ensure space computation is not overflowing; capped is OK
+      size_t space2 = bits_builder->CalculateSpace(n);
+      EXPECT_GE(space2, space);
+      space = space2;
+    }
+  }
+  // Check that the compiler hasn't optimized our computation into nothing
+  EXPECT_TRUE(some_computed_less_than_denoted);
+  ResetPolicy();
+}
+
+TEST_P(FullBloomTest, FullEmptyFilter) {
+  // Empty filter is not match, at this level
+  ASSERT_TRUE(!Matches("hello"));
+  ASSERT_TRUE(!Matches("world"));
+}
+
+TEST_P(FullBloomTest, FullSmall) {
+  Add("hello");
+  Add("world");
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+  ASSERT_TRUE(!Matches("x"));
+  ASSERT_TRUE(!Matches("foo"));
+}
+
+TEST_P(FullBloomTest, FullVaryingLengths) {
+  char buffer[sizeof(int)];
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+
+  for (int length = 1; length <= 10000; length = NextLength(length)) {
+    Reset();
+    for (int i = 0; i < length; i++) {
+      Add(Key(i, buffer));
+    }
+    Build();
+
+    EXPECT_LE(FilterSize(), (size_t)((length * FLAGS_bits_per_key / 8) +
+                                     CACHE_LINE_SIZE * 2 + 5));
+
+    // All added keys must match
+    for (int i = 0; i < length; i++) {
+      ASSERT_TRUE(Matches(Key(i, buffer)))
+          << "Length " << length << "; key " << i;
+    }
+
+    // Check false positive rate
+    double rate = FalsePositiveRate();
+    if (kVerbose >= 1) {
+      fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
+              rate * 100.0, length, static_cast<int>(FilterSize()));
+    }
+    if (FLAGS_bits_per_key == 10) {
+      EXPECT_LE(rate, 0.02);  // Must not be over 2%
+      if (rate > 0.0125) {
+        mediocre_filters++;  // Allowed, but not too often
+      } else {
+        good_filters++;
+      }
+    }
+  }
+  if (kVerbose >= 1) {
+    fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters,
+            mediocre_filters);
+  }
+  EXPECT_LE(mediocre_filters, good_filters / 5);
+}
+
+TEST_P(FullBloomTest, OptimizeForMemory) {
+  char buffer[sizeof(int)];
+  for (bool offm : {true, false}) {
+    table_options_.optimize_filters_for_memory = offm;
+    ResetPolicy();
+    Random32 rnd(12345);
+    uint64_t total_size = 0;
+    uint64_t total_mem = 0;
+    int64_t total_keys = 0;
+    double total_fp_rate = 0;
+    constexpr int nfilters = 100;
+    for (int i = 0; i < nfilters; ++i) {
+      int nkeys = static_cast<int>(rnd.Uniformish(10000)) + 100;
+      Reset();
+      for (int j = 0; j < nkeys; ++j) {
+        Add(Key(j, buffer));
+      }
+      Build();
+      size_t size = FilterData().size();
+      total_size += size;
+      // optimize_filters_for_memory currently depends on malloc_usable_size
+      // but we run the rest of the test to ensure no bad behavior without it.
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      size = malloc_usable_size(const_cast<char*>(FilterData().data()));
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+      total_mem += size;
+      total_keys += nkeys;
+      total_fp_rate += FalsePositiveRate();
+    }
+    if (FLAGS_bits_per_key == 10) {
+      EXPECT_LE(total_fp_rate / double{nfilters}, 0.011);
+      EXPECT_GE(total_fp_rate / double{nfilters},
+                CACHE_LINE_SIZE >= 256 ? 0.007 : 0.008);
+    }
+
+    int64_t ex_min_total_size = int64_t{FLAGS_bits_per_key} * total_keys / 8;
+    if (GetParam() == kStandard128Ribbon) {
+      // ~ 30% savings vs. Bloom filter
+      ex_min_total_size = 7 * ex_min_total_size / 10;
+    }
+    EXPECT_GE(static_cast<int64_t>(total_size), ex_min_total_size);
+
+    int64_t blocked_bloom_overhead = nfilters * (CACHE_LINE_SIZE + 5);
+    if (GetParam() == kLegacyBloom) {
+      // this config can add extra cache line to make odd number
+      blocked_bloom_overhead += nfilters * CACHE_LINE_SIZE;
+    }
+
+    EXPECT_GE(total_mem, total_size);
+
+    // optimize_filters_for_memory not implemented with legacy Bloom
+    if (offm && GetParam() != kLegacyBloom) {
+      // This value can include a small extra penalty for kExtraPadding
+      fprintf(stderr, "Internal fragmentation (optimized): %g%%\n",
+              (total_mem - total_size) * 100.0 / total_size);
+      // Less than 1% internal fragmentation
+      EXPECT_LE(total_mem, total_size * 101 / 100);
+      // Up to 2% storage penalty
+      EXPECT_LE(static_cast<int64_t>(total_size),
+                ex_min_total_size * 102 / 100 + blocked_bloom_overhead);
+    } else {
+      fprintf(stderr, "Internal fragmentation (not optimized): %g%%\n",
+              (total_mem - total_size) * 100.0 / total_size);
+      // TODO: add control checks for more allocators?
+#ifdef ROCKSDB_JEMALLOC
+      fprintf(stderr, "Jemalloc detected? %d\n", HasJemalloc());
+      if (HasJemalloc()) {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+        // More than 5% internal fragmentation
+        EXPECT_GE(total_mem, total_size * 105 / 100);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+      }
+#endif  // ROCKSDB_JEMALLOC
+      // No storage penalty, just usual overhead
+      EXPECT_LE(static_cast<int64_t>(total_size),
+                ex_min_total_size + blocked_bloom_overhead);
+    }
+  }
+}
+
+class ChargeFilterConstructionTest : public testing::Test {};
+TEST_F(ChargeFilterConstructionTest, RibbonFilterFallBackOnLargeBanding) {
+  constexpr std::size_t kCacheCapacity =
+      8 * CacheReservationManagerImpl<
+              CacheEntryRole::kFilterConstruction>::GetDummyEntrySize();
+  constexpr std::size_t num_entries_for_cache_full = kCacheCapacity / 8;
+
+  for (CacheEntryRoleOptions::Decision charge_filter_construction_mem :
+       {CacheEntryRoleOptions::Decision::kEnabled,
+        CacheEntryRoleOptions::Decision::kDisabled}) {
+    bool will_fall_back = charge_filter_construction_mem ==
+                          CacheEntryRoleOptions::Decision::kEnabled;
+
+    BlockBasedTableOptions table_options;
+    table_options.cache_usage_options.options_overrides.insert(
+        {CacheEntryRole::kFilterConstruction,
+         {/*.charged = */ charge_filter_construction_mem}});
+    LRUCacheOptions lo;
+    lo.capacity = kCacheCapacity;
+    lo.num_shard_bits = 0;  // 2^0 shard
+    lo.strict_capacity_limit = true;
+    std::shared_ptr<Cache> cache(NewLRUCache(lo));
+    table_options.block_cache = cache;
+    table_options.filter_policy =
+        BloomLikeFilterPolicy::Create(kStandard128Ribbon, FLAGS_bits_per_key);
+    FilterBuildingContext ctx(table_options);
+    std::unique_ptr<FilterBitsBuilder> filter_bits_builder(
+        table_options.filter_policy->GetBuilderWithContext(ctx));
+
+    char key_buffer[sizeof(int)];
+    for (std::size_t i = 0; i < num_entries_for_cache_full; ++i) {
+      filter_bits_builder->AddKey(Key(static_cast<int>(i), key_buffer));
+    }
+
+    std::unique_ptr<const char[]> buf;
+    Slice filter = filter_bits_builder->Finish(&buf);
+
+    // To verify Ribbon Filter fallbacks to Bloom Filter properly
+    // based on cache charging result
+    // See BloomFilterPolicy::GetBloomBitsReader re: metadata
+    // -1 = Marker for newer Bloom implementations
+    // -2 = Marker for Standard128 Ribbon
+    if (will_fall_back) {
+      EXPECT_EQ(filter.data()[filter.size() - 5], static_cast<char>(-1));
+    } else {
+      EXPECT_EQ(filter.data()[filter.size() - 5], static_cast<char>(-2));
+    }
+
+    if (charge_filter_construction_mem ==
+        CacheEntryRoleOptions::Decision::kEnabled) {
+      const size_t dummy_entry_num = static_cast<std::size_t>(std::ceil(
+          filter.size() * 1.0 /
+          CacheReservationManagerImpl<
+              CacheEntryRole::kFilterConstruction>::GetDummyEntrySize()));
+      EXPECT_GE(
+          cache->GetPinnedUsage(),
+          dummy_entry_num *
+              CacheReservationManagerImpl<
+                  CacheEntryRole::kFilterConstruction>::GetDummyEntrySize());
+      EXPECT_LT(
+          cache->GetPinnedUsage(),
+          (dummy_entry_num + 1) *
+              CacheReservationManagerImpl<
+                  CacheEntryRole::kFilterConstruction>::GetDummyEntrySize());
+    } else {
+      EXPECT_EQ(cache->GetPinnedUsage(), 0);
+    }
+  }
+}
+
+namespace {
+inline uint32_t SelectByCacheLineSize(uint32_t for64, uint32_t for128,
+                                      uint32_t for256) {
+  (void)for64;
+  (void)for128;
+  (void)for256;
+#if CACHE_LINE_SIZE == 64
+  return for64;
+#elif CACHE_LINE_SIZE == 128
+  return for128;
+#elif CACHE_LINE_SIZE == 256
+  return for256;
+#else
+#error "CACHE_LINE_SIZE unknown or unrecognized"
+#endif
+}
+}  // namespace
+
+// Ensure the implementation doesn't accidentally change in an
+// incompatible way. This test doesn't check the reading side
+// (FirstFPs/PackedMatches) for LegacyBloom because it requires the
+// ability to read filters generated using other cache line sizes.
+// See RawSchema.
+TEST_P(FullBloomTest, Schema) {
+#define EXPECT_EQ_Bloom(a, b)               \
+  {                                         \
+    if (GetParam() != kStandard128Ribbon) { \
+      EXPECT_EQ(a, b);                      \
+    }                                       \
+  }
+#define EXPECT_EQ_Ribbon(a, b)              \
+  {                                         \
+    if (GetParam() == kStandard128Ribbon) { \
+      EXPECT_EQ(a, b);                      \
+    }                                       \
+  }
+#define EXPECT_EQ_FastBloom(a, b)        \
+  {                                      \
+    if (GetParam() == kFastLocalBloom) { \
+      EXPECT_EQ(a, b);                   \
+    }                                    \
+  }
+#define EXPECT_EQ_LegacyBloom(a, b)   \
+  {                                   \
+    if (GetParam() == kLegacyBloom) { \
+      EXPECT_EQ(a, b);                \
+    }                                 \
+  }
+#define EXPECT_EQ_NotLegacy(a, b)     \
+  {                                   \
+    if (GetParam() != kLegacyBloom) { \
+      EXPECT_EQ(a, b);                \
+    }                                 \
+  }
+
+  char buffer[sizeof(int)];
+
+  // First do a small number of keys, where Ribbon config will fall back on
+  // fast Bloom filter and generate the same data
+  ResetPolicy(5);  // num_probes = 3
+  for (int key = 0; key < 87; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ(GetNumProbesFromFilterData(), 3);
+
+  EXPECT_EQ_NotLegacy(BloomHash(FilterData()), 4130687756U);
+
+  EXPECT_EQ_NotLegacy("31,38,40,43,61,83,86,112,125,131", FirstFPs(10));
+
+  // Now use enough keys so that changing bits / key by 1 is guaranteed to
+  // change number of allocated cache lines. So keys > max cache line bits.
+
+  // Note that the first attempted Ribbon seed is determined by the hash
+  // of the first key added (for pseudorandomness in practice, determinism in
+  // testing)
+
+  ResetPolicy(2);  // num_probes = 1
+  for (int key = 0; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 1);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(1567096579, 1964771444, 2659542661U));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 3817481309U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1705851228U);
+
+  EXPECT_EQ_FastBloom("11,13,17,25,29,30,35,37,45,53", FirstFPs(10));
+  EXPECT_EQ_Ribbon("3,8,10,17,19,20,23,28,31,32", FirstFPs(10));
+
+  ResetPolicy(3);  // num_probes = 2
+  for (int key = 0; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 2);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(2707206547U, 2571983456U, 218344685));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2807269961U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1095342358U);
+
+  EXPECT_EQ_FastBloom("4,15,17,24,27,28,29,53,63,70", FirstFPs(10));
+  EXPECT_EQ_Ribbon("3,17,20,28,32,33,36,43,49,54", FirstFPs(10));
+
+  ResetPolicy(5);  // num_probes = 3
+  for (int key = 0; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 3);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(515748486, 94611728, 2436112214U));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 204628445U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 3971337699U);
+
+  EXPECT_EQ_FastBloom("15,24,29,39,53,87,89,100,103,104", FirstFPs(10));
+  EXPECT_EQ_Ribbon("3,33,36,43,67,70,76,78,84,102", FirstFPs(10));
+
+  ResetPolicy(8);  // num_probes = 5
+  for (int key = 0; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 5);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(1302145999, 2811644657U, 756553699));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 355564975U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 3651449053U);
+
+  EXPECT_EQ_FastBloom("16,60,66,126,220,238,244,256,265,287", FirstFPs(10));
+  EXPECT_EQ_Ribbon("33,187,203,296,300,322,411,419,547,582", FirstFPs(10));
+
+  ResetPolicy(9);  // num_probes = 6
+  for (int key = 0; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(2092755149, 661139132, 1182970461));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2137566013U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1005676675U);
+
+  EXPECT_EQ_FastBloom("156,367,791,872,945,1015,1139,1159,1265", FirstFPs(9));
+  EXPECT_EQ_Ribbon("33,187,203,296,411,419,604,612,615,619", FirstFPs(10));
+
+  ResetPolicy(11);  // num_probes = 7
+  for (int key = 0; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 7);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(3755609649U, 1812694762, 1449142939));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2561502687U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 3129900846U);
+
+  EXPECT_EQ_FastBloom("34,74,130,236,643,882,962,1015,1035,1110", FirstFPs(10));
+  EXPECT_EQ_Ribbon("411,419,623,665,727,794,955,1052,1323,1330", FirstFPs(10));
+
+  // This used to be 9 probes, but 8 is a better choice for speed,
+  // especially with SIMD groups of 8 probes, with essentially no
+  // change in FP rate.
+  // FP rate @ 9 probes, old Bloom: 0.4321%
+  // FP rate @ 9 probes, new Bloom: 0.1846%
+  // FP rate @ 8 probes, new Bloom: 0.1843%
+  ResetPolicy(14);  // num_probes = 8 (new), 9 (old)
+  for (int key = 0; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_LegacyBloom(GetNumProbesFromFilterData(), 9);
+  EXPECT_EQ_FastBloom(GetNumProbesFromFilterData(), 8);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(178861123, 379087593, 2574136516U));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 3709876890U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1855638875U);
+
+  EXPECT_EQ_FastBloom("130,240,522,565,989,2002,2526,3147,3543", FirstFPs(9));
+  EXPECT_EQ_Ribbon("665,727,1323,1755,3866,4232,4442,4492,4736", FirstFPs(9));
+
+  // This used to be 11 probes, but 9 is a better choice for speed
+  // AND accuracy.
+  // FP rate @ 11 probes, old Bloom: 0.3571%
+  // FP rate @ 11 probes, new Bloom: 0.0884%
+  // FP rate @  9 probes, new Bloom: 0.0843%
+  ResetPolicy(16);  // num_probes = 9 (new), 11 (old)
+  for (int key = 0; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_LegacyBloom(GetNumProbesFromFilterData(), 11);
+  EXPECT_EQ_FastBloom(GetNumProbesFromFilterData(), 9);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(1129406313, 3049154394U, 1727750964));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 1087138490U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 459379967U);
+
+  EXPECT_EQ_FastBloom("3299,3611,3916,6620,7822,8079,8482,8942", FirstFPs(8));
+  EXPECT_EQ_Ribbon("727,1323,1755,4442,4736,5386,6974,7154,8222", FirstFPs(9));
+
+  ResetPolicy(10);  // num_probes = 6, but different memory ratio vs. 9
+  for (int key = 0; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(1478976371, 2910591341U, 1182970461));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2498541272U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1273231667U);
+
+  EXPECT_EQ_FastBloom("16,126,133,422,466,472,813,1002,1035", FirstFPs(9));
+  EXPECT_EQ_Ribbon("296,411,419,612,619,623,630,665,686,727", FirstFPs(10));
+
+  ResetPolicy(10);
+  for (int key = /*CHANGED*/ 1; key < 2087; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), /*CHANGED*/ 184);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(4205696321U, 1132081253U, 2385981855U));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2058382345U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 3007790572U);
+
+  EXPECT_EQ_FastBloom("16,126,133,422,466,472,813,1002,1035", FirstFPs(9));
+  EXPECT_EQ_Ribbon("33,152,383,497,589,633,737,781,911,990", FirstFPs(10));
+
+  ResetPolicy(10);
+  for (int key = 1; key < /*CHANGED*/ 2088; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 184);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      SelectByCacheLineSize(2885052954U, 769447944, 4175124908U));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 23699164U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1942323379U);
+
+  EXPECT_EQ_FastBloom("16,126,133,422,466,472,813,1002,1035", FirstFPs(9));
+  EXPECT_EQ_Ribbon("33,95,360,589,737,911,990,1048,1081,1414", FirstFPs(10));
+
+  // With new fractional bits_per_key, check that we are rounding to
+  // whole bits per key for old Bloom filters but fractional for
+  // new Bloom filter.
+  ResetPolicy(9.5);
+  for (int key = 1; key < 2088; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 184);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      /*SAME*/ SelectByCacheLineSize(2885052954U, 769447944, 4175124908U));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 3166884174U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1148258663U);
+
+  EXPECT_EQ_FastBloom("126,156,367,444,458,791,813,976,1015", FirstFPs(9));
+  EXPECT_EQ_Ribbon("33,54,95,360,589,693,737,911,990,1048", FirstFPs(10));
+
+  ResetPolicy(10.499);
+  for (int key = 1; key < 2088; key++) {
+    Add(Key(key, buffer));
+  }
+  Build();
+  EXPECT_EQ_LegacyBloom(GetNumProbesFromFilterData(), 6);
+  EXPECT_EQ_FastBloom(GetNumProbesFromFilterData(), 7);
+  EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 184);
+
+  EXPECT_EQ_LegacyBloom(
+      BloomHash(FilterData()),
+      /*SAME*/ SelectByCacheLineSize(2885052954U, 769447944, 4175124908U));
+  EXPECT_EQ_FastBloom(BloomHash(FilterData()), 4098502778U);
+  EXPECT_EQ_Ribbon(BloomHash(FilterData()), 792138188U);
+
+  EXPECT_EQ_FastBloom("16,236,240,472,1015,1045,1111,1409,1465", FirstFPs(9));
+  EXPECT_EQ_Ribbon("33,95,360,589,737,990,1048,1081,1414,1643", FirstFPs(10));
+
+  ResetPolicy();
+}
+
+// A helper class for testing custom or corrupt filter bits as read by
+// built-in FilterBitsReaders.
+struct RawFilterTester {
+  // Buffer, from which we always return a tail Slice, so the
+  // last five bytes are always the metadata bytes.
+  std::array<char, 3000> data_;
+  // Points five bytes from the end
+  char* metadata_ptr_;
+
+  RawFilterTester() : metadata_ptr_(&*(data_.end() - 5)) {}
+
+  Slice ResetNoFill(uint32_t len_without_metadata, uint32_t num_lines,
+                    uint32_t num_probes) {
+    metadata_ptr_[0] = static_cast<char>(num_probes);
+    EncodeFixed32(metadata_ptr_ + 1, num_lines);
+    uint32_t len = len_without_metadata + /*metadata*/ 5;
+    assert(len <= data_.size());
+    return Slice(metadata_ptr_ - len_without_metadata, len);
+  }
+
+  Slice Reset(uint32_t len_without_metadata, uint32_t num_lines,
+              uint32_t num_probes, bool fill_ones) {
+    data_.fill(fill_ones ? 0xff : 0);
+    return ResetNoFill(len_without_metadata, num_lines, num_probes);
+  }
+
+  Slice ResetWeirdFill(uint32_t len_without_metadata, uint32_t num_lines,
+                       uint32_t num_probes) {
+    for (uint32_t i = 0; i < data_.size(); ++i) {
+      data_[i] = static_cast<char>(0x7b7b >> (i % 7));
+    }
+    return ResetNoFill(len_without_metadata, num_lines, num_probes);
+  }
+};
+
+TEST_P(FullBloomTest, RawSchema) {
+  RawFilterTester cft;
+  // Legacy Bloom configurations
+  // Two probes, about 3/4 bits set: ~50% "FP" rate
+  // One 256-byte cache line.
+  OpenRaw(cft.ResetWeirdFill(256, 1, 2));
+  EXPECT_EQ(uint64_t{11384799501900898790U}, PackedMatches());
+
+  // Two 128-byte cache lines.
+  OpenRaw(cft.ResetWeirdFill(256, 2, 2));
+  EXPECT_EQ(uint64_t{10157853359773492589U}, PackedMatches());
+
+  // Four 64-byte cache lines.
+  OpenRaw(cft.ResetWeirdFill(256, 4, 2));
+  EXPECT_EQ(uint64_t{7123594913907464682U}, PackedMatches());
+
+  // Fast local Bloom configurations (marker 255 -> -1)
+  // Two probes, about 3/4 bits set: ~50% "FP" rate
+  // Four 64-byte cache lines.
+  OpenRaw(cft.ResetWeirdFill(256, 2U << 8, 255));
+  EXPECT_EQ(uint64_t{9957045189927952471U}, PackedMatches());
+
+  // Ribbon configurations (marker 254 -> -2)
+
+  // Even though the builder never builds configurations this
+  // small (preferring Bloom), we can test that the configuration
+  // can be read, for possible future-proofing.
+
+  // 256 slots, one result column = 32 bytes (2 blocks, seed 0)
+  // ~50% FP rate:
+  // 0b0101010111110101010000110000011011011111100100001110010011101010
+  OpenRaw(cft.ResetWeirdFill(32, 2U << 8, 254));
+  EXPECT_EQ(uint64_t{6193930559317665002U}, PackedMatches());
+
+  // 256 slots, three-to-four result columns = 112 bytes
+  // ~ 1 in 10 FP rate:
+  // 0b0000000000100000000000000000000001000001000000010000101000000000
+  OpenRaw(cft.ResetWeirdFill(112, 2U << 8, 254));
+  EXPECT_EQ(uint64_t{9007200345328128U}, PackedMatches());
+}
+
+TEST_P(FullBloomTest, CorruptFilters) {
+  RawFilterTester cft;
+
+  for (bool fill : {false, true}) {
+    // Legacy Bloom configurations
+    // Good filter bits - returns same as fill
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 6, fill));
+    ASSERT_EQ(fill, Matches("hello"));
+    ASSERT_EQ(fill, Matches("world"));
+
+    // Good filter bits - returns same as fill
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE * 3, 3, 6, fill));
+    ASSERT_EQ(fill, Matches("hello"));
+    ASSERT_EQ(fill, Matches("world"));
+
+    // Good filter bits - returns same as fill
+    // 256 is unusual but legal cache line size
+    OpenRaw(cft.Reset(256 * 3, 3, 6, fill));
+    ASSERT_EQ(fill, Matches("hello"));
+    ASSERT_EQ(fill, Matches("world"));
+
+    // Good filter bits - returns same as fill
+    // 30 should be max num_probes
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 30, fill));
+    ASSERT_EQ(fill, Matches("hello"));
+    ASSERT_EQ(fill, Matches("world"));
+
+    // Good filter bits - returns same as fill
+    // 1 should be min num_probes
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 1, fill));
+    ASSERT_EQ(fill, Matches("hello"));
+    ASSERT_EQ(fill, Matches("world"));
+
+    // Type 1 trivial filter bits - returns true as if FP by zero probes
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 0, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    // Type 2 trivial filter bits - returns false as if built from zero keys
+    OpenRaw(cft.Reset(0, 0, 6, fill));
+    ASSERT_FALSE(Matches("hello"));
+    ASSERT_FALSE(Matches("world"));
+
+    // Type 2 trivial filter bits - returns false as if built from zero keys
+    OpenRaw(cft.Reset(0, 37, 6, fill));
+    ASSERT_FALSE(Matches("hello"));
+    ASSERT_FALSE(Matches("world"));
+
+    // Type 2 trivial filter bits - returns false as 0 size trumps 0 probes
+    OpenRaw(cft.Reset(0, 0, 0, fill));
+    ASSERT_FALSE(Matches("hello"));
+    ASSERT_FALSE(Matches("world"));
+
+    // Bad filter bits - returns true for safety
+    // No solution to 0 * x == CACHE_LINE_SIZE
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 0, 6, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    // Bad filter bits - returns true for safety
+    // Can't have 3 * x == 4 for integer x
+    OpenRaw(cft.Reset(4, 3, 6, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    // Bad filter bits - returns true for safety
+    // 97 bytes is not a power of two, so not a legal cache line size
+    OpenRaw(cft.Reset(97 * 3, 3, 6, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    // Bad filter bits - returns true for safety
+    // 65 bytes is not a power of two, so not a legal cache line size
+    OpenRaw(cft.Reset(65 * 3, 3, 6, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    // Bad filter bits - returns false as if built from zero keys
+    // < 5 bytes overall means missing even metadata
+    OpenRaw(cft.Reset(static_cast<uint32_t>(-1), 3, 6, fill));
+    ASSERT_FALSE(Matches("hello"));
+    ASSERT_FALSE(Matches("world"));
+
+    OpenRaw(cft.Reset(static_cast<uint32_t>(-5), 3, 6, fill));
+    ASSERT_FALSE(Matches("hello"));
+    ASSERT_FALSE(Matches("world"));
+
+    // Dubious filter bits - returns same as fill (for now)
+    // 31 is not a useful num_probes, nor generated by RocksDB unless directly
+    // using filter bits API without BloomFilterPolicy.
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 31, fill));
+    ASSERT_EQ(fill, Matches("hello"));
+    ASSERT_EQ(fill, Matches("world"));
+
+    // Dubious filter bits - returns same as fill (for now)
+    // Similar, with 127, largest positive char
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 127, fill));
+    ASSERT_EQ(fill, Matches("hello"));
+    ASSERT_EQ(fill, Matches("world"));
+
+    // Dubious filter bits - returns true (for now)
+    // num_probes set to 128 / -128, lowest negative char
+    // NB: Bug in implementation interprets this as negative and has same
+    // effect as zero probes, but effectively reserves negative char values
+    // for future use.
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 128, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    // Dubious filter bits - returns true (for now)
+    // Similar, with 253 / -3
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 253, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    // #########################################################
+    // Fast local Bloom configurations (marker 255 -> -1)
+    // Good config with six probes
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 6U << 8, 255, fill));
+    ASSERT_EQ(fill, Matches("hello"));
+    ASSERT_EQ(fill, Matches("world"));
+
+    // Becomes bad/reserved config (always true) if any other byte set
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, (6U << 8) | 1U, 255, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, (6U << 8) | (1U << 16), 255, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, (6U << 8) | (1U << 24), 255, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    // Good config, max 30 probes
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 30U << 8, 255, fill));
+    ASSERT_EQ(fill, Matches("hello"));
+    ASSERT_EQ(fill, Matches("world"));
+
+    // Bad/reserved config (always true) if more than 30
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 31U << 8, 255, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 33U << 8, 255, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 66U << 8, 255, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+
+    OpenRaw(cft.Reset(CACHE_LINE_SIZE, 130U << 8, 255, fill));
+    ASSERT_TRUE(Matches("hello"));
+    ASSERT_TRUE(Matches("world"));
+  }
+
+  // #########################################################
+  // Ribbon configurations (marker 254 -> -2)
+  // ("fill" doesn't work to detect good configurations, we just
+  // have to rely on TN probability)
+
+  // Good: 2 blocks * 16 bytes / segment * 4 columns = 128 bytes
+  // seed = 123
+  OpenRaw(cft.Reset(128, (2U << 8) + 123U, 254, false));
+  ASSERT_FALSE(Matches("hello"));
+  ASSERT_FALSE(Matches("world"));
+
+  // Good: 2 blocks * 16 bytes / segment * 8 columns = 256 bytes
+  OpenRaw(cft.Reset(256, (2U << 8) + 123U, 254, false));
+  ASSERT_FALSE(Matches("hello"));
+  ASSERT_FALSE(Matches("world"));
+
+  // Surprisingly OK: 5000 blocks (640,000 slots) in only 1024 bits
+  // -> average close to 0 columns
+  OpenRaw(cft.Reset(128, (5000U << 8) + 123U, 254, false));
+  // *Almost* all FPs
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+  // Need many queries to find a "true negative"
+  for (int i = 0; Matches(std::to_string(i)); ++i) {
+    ASSERT_LT(i, 1000);
+  }
+
+  // Bad: 1 block not allowed (for implementation detail reasons)
+  OpenRaw(cft.Reset(128, (1U << 8) + 123U, 254, false));
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+
+  // Bad: 0 blocks not allowed
+  OpenRaw(cft.Reset(128, (0U << 8) + 123U, 254, false));
+  ASSERT_TRUE(Matches("hello"));
+  ASSERT_TRUE(Matches("world"));
+}
+
+INSTANTIATE_TEST_CASE_P(Full, FullBloomTest,
+                        testing::Values(kLegacyBloom, kFastLocalBloom,
+                                        kStandard128Ribbon));
+
+static double GetEffectiveBitsPerKey(FilterBitsBuilder* builder) {
+  union {
+    uint64_t key_value = 0;
+    char key_bytes[8];
+  };
+
+  const unsigned kNumKeys = 1000;
+
+  Slice key_slice{key_bytes, 8};
+  for (key_value = 0; key_value < kNumKeys; ++key_value) {
+    builder->AddKey(key_slice);
+  }
+
+  std::unique_ptr<const char[]> buf;
+  auto filter = builder->Finish(&buf);
+  return filter.size() * /*bits per byte*/ 8 / (1.0 * kNumKeys);
+}
+
+static void SetTestingLevel(int levelish, FilterBuildingContext* ctx) {
+  if (levelish == -1) {
+    // Flush is treated as level -1 for this option but actually level 0
+    ctx->level_at_creation = 0;
+    ctx->reason = TableFileCreationReason::kFlush;
+  } else {
+    ctx->level_at_creation = levelish;
+    ctx->reason = TableFileCreationReason::kCompaction;
+  }
+}
+
+TEST(RibbonTest, RibbonTestLevelThreshold) {
+  BlockBasedTableOptions opts;
+  FilterBuildingContext ctx(opts);
+  // A few settings
+  for (CompactionStyle cs : {kCompactionStyleLevel, kCompactionStyleUniversal,
+                             kCompactionStyleFIFO, kCompactionStyleNone}) {
+    ctx.compaction_style = cs;
+    for (int bloom_before_level : {-1, 0, 1, 10}) {
+      std::vector<std::unique_ptr<const FilterPolicy> > policies;
+      policies.emplace_back(NewRibbonFilterPolicy(10, bloom_before_level));
+
+      if (bloom_before_level == 0) {
+        // Also test new API default
+        policies.emplace_back(NewRibbonFilterPolicy(10));
+      }
+
+      for (std::unique_ptr<const FilterPolicy>& policy : policies) {
+        // Claim to be generating filter for this level
+        SetTestingLevel(bloom_before_level, &ctx);
+
+        std::unique_ptr<FilterBitsBuilder> builder{
+            policy->GetBuilderWithContext(ctx)};
+
+        // Must be Ribbon (more space efficient than 10 bits per key)
+        ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8);
+
+        if (bloom_before_level >= 0) {
+          // Claim to be generating filter for previous level
+          SetTestingLevel(bloom_before_level - 1, &ctx);
+
+          builder.reset(policy->GetBuilderWithContext(ctx));
+
+          if (cs == kCompactionStyleLevel || cs == kCompactionStyleUniversal) {
+            // Level is considered.
+            // Must be Bloom (~ 10 bits per key)
+            ASSERT_GT(GetEffectiveBitsPerKey(builder.get()), 9);
+          } else {
+            // Level is ignored under non-traditional compaction styles.
+            // Must be Ribbon (more space efficient than 10 bits per key)
+            ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8);
+          }
+        }
+
+        // Like SST file writer
+        ctx.level_at_creation = -1;
+        ctx.reason = TableFileCreationReason::kMisc;
+
+        builder.reset(policy->GetBuilderWithContext(ctx));
+
+        // Must be Ribbon (more space efficient than 10 bits per key)
+        ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8);
+      }
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  return RUN_ALL_TESTS();
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/util/build_version.cc.in b/src/rocksdb/util/build_version.cc.in
new file mode 100644
index 000000000..c1706dc1f
--- /dev/null
+++ b/src/rocksdb/util/build_version.cc.in
@@ -0,0 +1,81 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <memory>
+
+#include "rocksdb/version.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "util/string_util.h"
+
+// The build script may replace these values with real values based
+// on whether or not GIT is available and the platform settings
+static const std::string rocksdb_build_git_sha  = "rocksdb_build_git_sha:@GIT_SHA@";
+static const std::string rocksdb_build_git_tag = "rocksdb_build_git_tag:@GIT_TAG@";
+#define HAS_GIT_CHANGES @GIT_MOD@
+#if HAS_GIT_CHANGES == 0
+// If HAS_GIT_CHANGES is 0, the GIT date is used.
+// Use the time the branch/tag was last modified
+static const std::string rocksdb_build_date = "rocksdb_build_date:@GIT_DATE@";
+#else
+// If HAS_GIT_CHANGES is > 0, the branch/tag has modifications.
+// Use the time the build was created.
+static const std::string rocksdb_build_date = "rocksdb_build_date:@BUILD_DATE@";
+#endif
+
+#ifndef ROCKSDB_LITE
+extern "C" {
+@ROCKSDB_PLUGIN_EXTERNS@
+} // extern "C"
+
+std::unordered_map<std::string, ROCKSDB_NAMESPACE::RegistrarFunc> ROCKSDB_NAMESPACE::ObjectRegistry::builtins_ = {
+  @ROCKSDB_PLUGIN_BUILTINS@
+};
+#endif //ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+static void AddProperty(std::unordered_map<std::string, std::string> *props, const std::string& name) {
+  size_t colon = name.find(":");
+  if (colon != std::string::npos && colon > 0 && colon < name.length() - 1) {
+    // If we found a "@:", then this property was a build-time substitution that failed.  Skip it
+    size_t at = name.find("@", colon);
+    if (at != colon + 1) {
+      // Everything before the colon is the name, after is the value
+      (*props)[name.substr(0, colon)] = name.substr(colon + 1);
+    }
+  }
+}
+
+static std::unordered_map<std::string, std::string>* LoadPropertiesSet() {
+  auto * properties = new std::unordered_map<std::string, std::string>();
+  AddProperty(properties, rocksdb_build_git_sha);
+  AddProperty(properties, rocksdb_build_git_tag);
+  AddProperty(properties, rocksdb_build_date);
+  return properties;
+}
+
+const std::unordered_map<std::string, std::string>& GetRocksBuildProperties() {
+  static std::unique_ptr<std::unordered_map<std::string, std::string>> props(LoadPropertiesSet());
+  return *props;
+}
+
+std::string GetRocksVersionAsString(bool with_patch) {
+  std::string version = std::to_string(ROCKSDB_MAJOR) + "." + std::to_string(ROCKSDB_MINOR);
+  if (with_patch) {
+    return version + "." + std::to_string(ROCKSDB_PATCH);
+  } else {
+    return version;
+ }
+}
+
+std::string GetRocksBuildInfoAsString(const std::string& program, bool verbose) {
+  std::string info = program + " (RocksDB) " + GetRocksVersionAsString(true);
+  if (verbose) {
+    for (const auto& it : GetRocksBuildProperties()) {
+      info.append("\n    ");
+      info.append(it.first);
+      info.append(": ");
+      info.append(it.second);
+    }
+  }
+  return info;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/cast_util.h b/src/rocksdb/util/cast_util.h
new file mode 100644
index 000000000..c91b6ff1e
--- /dev/null
+++ b/src/rocksdb/util/cast_util.h
@@ -0,0 +1,42 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <type_traits>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The helper function to assert the move from dynamic_cast<> to
+// static_cast<> is correct. This function is to deal with legacy code.
+// It is not recommended to add new code to issue class casting. The preferred
+// solution is to implement the functionality without a need of casting.
+template <class DestClass, class SrcClass>
+inline DestClass* static_cast_with_check(SrcClass* x) {
+  DestClass* ret = static_cast<DestClass*>(x);
+#ifdef ROCKSDB_USE_RTTI
+  assert(ret == dynamic_cast<DestClass*>(x));
+#endif
+  return ret;
+}
+
+// A wrapper around static_cast for lossless conversion between integral
+// types, including enum types. For example, this can be used for converting
+// between signed/unsigned or enum type and underlying type without fear of
+// stripping away data, now or in the future.
+template <typename To, typename From>
+inline To lossless_cast(From x) {
+  using FromValue = typename std::remove_reference<From>::type;
+  static_assert(
+      std::is_integral<FromValue>::value || std::is_enum<FromValue>::value,
+      "Only works on integral types");
+  static_assert(std::is_integral<To>::value || std::is_enum<To>::value,
+                "Only works on integral types");
+  static_assert(sizeof(To) >= sizeof(FromValue), "Must be lossless");
+  return static_cast<To>(x);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/channel.h b/src/rocksdb/util/channel.h
new file mode 100644
index 000000000..19b956297
--- /dev/null
+++ b/src/rocksdb/util/channel.h
@@ -0,0 +1,69 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <utility>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class T>
+class channel {
+ public:
+  explicit channel() : eof_(false) {}
+
+  channel(const channel&) = delete;
+  void operator=(const channel&) = delete;
+
+  void sendEof() {
+    std::lock_guard<std::mutex> lk(lock_);
+    eof_ = true;
+    cv_.notify_all();
+  }
+
+  bool eof() {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.empty() && eof_;
+  }
+
+  size_t size() const {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.size();
+  }
+
+  // writes elem to the queue
+  void write(T&& elem) {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.emplace(std::forward<T>(elem));
+    cv_.notify_one();
+  }
+
+  /// Moves a dequeued element onto elem, blocking until an element
+  /// is available.
+  // returns false if EOF
+  bool read(T& elem) {
+    std::unique_lock<std::mutex> lk(lock_);
+    cv_.wait(lk, [&] { return eof_ || !buffer_.empty(); });
+    if (eof_ && buffer_.empty()) {
+      return false;
+    }
+    elem = std::move(buffer_.front());
+    buffer_.pop();
+    cv_.notify_one();
+    return true;
+  }
+
+ private:
+  std::condition_variable cv_;
+  mutable std::mutex lock_;
+  std::queue<T> buffer_;
+  bool eof_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/cleanable.cc b/src/rocksdb/util/cleanable.cc
new file mode 100644
index 000000000..89a7ab9be
--- /dev/null
+++ b/src/rocksdb/util/cleanable.cc
@@ -0,0 +1,181 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cleanable.h"
+
+#include <atomic>
+#include <cassert>
+#include <utility>
+
+namespace ROCKSDB_NAMESPACE {
+
+Cleanable::Cleanable() {
+  cleanup_.function = nullptr;
+  cleanup_.next = nullptr;
+}
+
+Cleanable::~Cleanable() { DoCleanup(); }
+
+Cleanable::Cleanable(Cleanable&& other) noexcept { *this = std::move(other); }
+
+Cleanable& Cleanable::operator=(Cleanable&& other) noexcept {
+  assert(this != &other);  // https://stackoverflow.com/a/9322542/454544
+  cleanup_ = other.cleanup_;
+  other.cleanup_.function = nullptr;
+  other.cleanup_.next = nullptr;
+  return *this;
+}
+
+// If the entire linked list was on heap we could have simply add attach one
+// link list to another. However the head is an embeded object to avoid the cost
+// of creating objects for most of the use cases when the Cleanable has only one
+// Cleanup to do. We could put evernything on heap if benchmarks show no
+// negative impact on performance.
+// Also we need to iterate on the linked list since there is no pointer to the
+// tail. We can add the tail pointer but maintainin it might negatively impact
+// the perforamnce for the common case of one cleanup where tail pointer is not
+// needed. Again benchmarks could clarify that.
+// Even without a tail pointer we could iterate on the list, find the tail, and
+// have only that node updated without the need to insert the Cleanups one by
+// one. This however would be redundant when the source Cleanable has one or a
+// few Cleanups which is the case most of the time.
+// TODO(myabandeh): if the list is too long we should maintain a tail pointer
+// and have the entire list (minus the head that has to be inserted separately)
+// merged with the target linked list at once.
+void Cleanable::DelegateCleanupsTo(Cleanable* other) {
+  assert(other != nullptr);
+  if (cleanup_.function == nullptr) {
+    return;
+  }
+  Cleanup* c = &cleanup_;
+  other->RegisterCleanup(c->function, c->arg1, c->arg2);
+  c = c->next;
+  while (c != nullptr) {
+    Cleanup* next = c->next;
+    other->RegisterCleanup(c);
+    c = next;
+  }
+  cleanup_.function = nullptr;
+  cleanup_.next = nullptr;
+}
+
+void Cleanable::RegisterCleanup(Cleanable::Cleanup* c) {
+  assert(c != nullptr);
+  if (cleanup_.function == nullptr) {
+    cleanup_.function = c->function;
+    cleanup_.arg1 = c->arg1;
+    cleanup_.arg2 = c->arg2;
+    delete c;
+  } else {
+    c->next = cleanup_.next;
+    cleanup_.next = c;
+  }
+}
+
+void Cleanable::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
+  assert(func != nullptr);
+  Cleanup* c;
+  if (cleanup_.function == nullptr) {
+    c = &cleanup_;
+  } else {
+    c = new Cleanup;
+    c->next = cleanup_.next;
+    cleanup_.next = c;
+  }
+  c->function = func;
+  c->arg1 = arg1;
+  c->arg2 = arg2;
+}
+
+struct SharedCleanablePtr::Impl : public Cleanable {
+  std::atomic<unsigned> ref_count{1};  // Start with 1 ref
+  void Ref() { ref_count.fetch_add(1, std::memory_order_relaxed); }
+  void Unref() {
+    if (ref_count.fetch_sub(1, std::memory_order_relaxed) == 1) {
+      // Last ref
+      delete this;
+    }
+  }
+  static void UnrefWrapper(void* arg1, void* /*arg2*/) {
+    static_cast<SharedCleanablePtr::Impl*>(arg1)->Unref();
+  }
+};
+
+void SharedCleanablePtr::Reset() {
+  if (ptr_) {
+    ptr_->Unref();
+    ptr_ = nullptr;
+  }
+}
+
+void SharedCleanablePtr::Allocate() {
+  Reset();
+  ptr_ = new Impl();
+}
+
+SharedCleanablePtr::SharedCleanablePtr(const SharedCleanablePtr& from) {
+  *this = from;
+}
+
+SharedCleanablePtr::SharedCleanablePtr(SharedCleanablePtr&& from) noexcept {
+  *this = std::move(from);
+}
+
+SharedCleanablePtr& SharedCleanablePtr::operator=(
+    const SharedCleanablePtr& from) {
+  if (this != &from) {
+    Reset();
+    ptr_ = from.ptr_;
+    if (ptr_) {
+      ptr_->Ref();
+    }
+  }
+  return *this;
+}
+
+SharedCleanablePtr& SharedCleanablePtr::operator=(
+    SharedCleanablePtr&& from) noexcept {
+  assert(this != &from);  // https://stackoverflow.com/a/9322542/454544
+  Reset();
+  ptr_ = from.ptr_;
+  from.ptr_ = nullptr;
+  return *this;
+}
+
+SharedCleanablePtr::~SharedCleanablePtr() { Reset(); }
+
+Cleanable& SharedCleanablePtr::operator*() {
+  return *ptr_;  // implicit upcast
+}
+
+Cleanable* SharedCleanablePtr::operator->() {
+  return ptr_;  // implicit upcast
+}
+
+Cleanable* SharedCleanablePtr::get() {
+  return ptr_;  // implicit upcast
+}
+
+void SharedCleanablePtr::RegisterCopyWith(Cleanable* target) {
+  if (ptr_) {
+    // "Virtual" copy of the pointer
+    ptr_->Ref();
+    target->RegisterCleanup(&Impl::UnrefWrapper, ptr_, nullptr);
+  }
+}
+
+void SharedCleanablePtr::MoveAsCleanupTo(Cleanable* target) {
+  if (ptr_) {
+    // "Virtual" move of the pointer
+    target->RegisterCleanup(&Impl::UnrefWrapper, ptr_, nullptr);
+    ptr_ = nullptr;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/coding.cc b/src/rocksdb/util/coding.cc
new file mode 100644
index 000000000..3da8afaa2
--- /dev/null
+++ b/src/rocksdb/util/coding.cc
@@ -0,0 +1,90 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include <algorithm>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// conversion' conversion from 'type1' to 'type2', possible loss of data
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4244)
+#endif
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on characters as unsigneds
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  static const int B = 128;
+  if (v < (1 << 7)) {
+    *(ptr++) = v;
+  } else if (v < (1 << 14)) {
+    *(ptr++) = v | B;
+    *(ptr++) = v >> 7;
+  } else if (v < (1 << 21)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = v >> 14;
+  } else if (v < (1 << 28)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = v >> 21;
+  } else {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = (v >> 21) | B;
+    *(ptr++) = v >> 28;
+  }
+  return reinterpret_cast<char*>(ptr);
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+const char* GetVarint32PtrFallback(const char* p, const char* limit,
+                                   uint32_t* value) {
+  uint32_t result = 0;
+  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
+    uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
+  uint64_t result = 0;
+  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
+    uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
+    p++;
+    if (byte & 128) {
+      // More bytes are present
+      result |= ((byte & 127) << shift);
+    } else {
+      result |= (byte << shift);
+      *value = result;
+      return reinterpret_cast<const char*>(p);
+    }
+  }
+  return nullptr;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/coding.h b/src/rocksdb/util/coding.h
new file mode 100644
index 000000000..3168fd2fd
--- /dev/null
+++ b/src/rocksdb/util/coding.h
@@ -0,0 +1,389 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Encoding independent of machine byte order:
+// * Fixed-length numbers are encoded with least-significant byte first
+//   (little endian, native order on Intel and others)
+// * In addition we support variable length "varint" encoding
+// * Strings are encoded prefixed by their length in varint format
+//
+// Some related functions are provided in coding_lean.h
+
+#pragma once
+#include <algorithm>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/coding_lean.h"
+
+// Some processors does not allow unaligned access to memory
+#if defined(__sparc)
+#define PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// The maximum length of a varint in bytes for 64-bit.
+const uint32_t kMaxVarint64Length = 10;
+
+// Standard Put... routines append to a string
+extern void PutFixed16(std::string* dst, uint16_t value);
+extern void PutFixed32(std::string* dst, uint32_t value);
+extern void PutFixed64(std::string* dst, uint64_t value);
+extern void PutVarint32(std::string* dst, uint32_t value);
+extern void PutVarint32Varint32(std::string* dst, uint32_t value1,
+                                uint32_t value2);
+extern void PutVarint32Varint32Varint32(std::string* dst, uint32_t value1,
+                                        uint32_t value2, uint32_t value3);
+extern void PutVarint64(std::string* dst, uint64_t value);
+extern void PutVarint64Varint64(std::string* dst, uint64_t value1,
+                                uint64_t value2);
+extern void PutVarint32Varint64(std::string* dst, uint32_t value1,
+                                uint64_t value2);
+extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1,
+                                        uint32_t value2, uint64_t value3);
+extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+extern void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts);
+extern void PutLengthPrefixedSlicePartsWithPadding(
+    std::string* dst, const SliceParts& slice_parts, size_t pad_sz);
+
+// Standard Get... routines parse a value from the beginning of a Slice
+// and advance the slice past the parsed value.
+extern bool GetFixed64(Slice* input, uint64_t* value);
+extern bool GetFixed32(Slice* input, uint32_t* value);
+extern bool GetFixed16(Slice* input, uint16_t* value);
+extern bool GetVarint32(Slice* input, uint32_t* value);
+extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetVarsignedint64(Slice* input, int64_t* value);
+extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+// This function assumes data is well-formed.
+extern Slice GetLengthPrefixedSlice(const char* data);
+
+extern Slice GetSliceUntil(Slice* slice, char delimiter);
+
+// Borrowed from
+// https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208
+constexpr inline uint64_t i64ToZigzag(const int64_t l) {
+  return (static_cast<uint64_t>(l) << 1) ^ static_cast<uint64_t>(l >> 63);
+}
+inline int64_t zigzagToI64(uint64_t n) {
+  return (n >> 1) ^ -static_cast<int64_t>(n & 1);
+}
+
+// Pointer-based variants of GetVarint...  These either store a value
+// in *v and return a pointer just past the parsed value, or return
+// nullptr on error.  These routines only look at bytes in the range
+// [p..limit-1]
+extern const char* GetVarint32Ptr(const char* p, const char* limit,
+                                  uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p, const char* limit,
+                                  uint64_t* v);
+inline const char* GetVarsignedint64Ptr(const char* p, const char* limit,
+                                        int64_t* value) {
+  uint64_t u = 0;
+  const char* ret = GetVarint64Ptr(p, limit, &u);
+  *value = zigzagToI64(u);
+  return ret;
+}
+
+// Returns the length of the varint32 or varint64 encoding of "v"
+extern int VarintLength(uint64_t v);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
+// REQUIRES: dst has enough space for the value being written
+extern char* EncodeVarint32(char* dst, uint32_t value);
+extern char* EncodeVarint64(char* dst, uint64_t value);
+
+// Internal routine for use by fallback path of GetVarint32Ptr
+extern const char* GetVarint32PtrFallback(const char* p, const char* limit,
+                                          uint32_t* value);
+inline const char* GetVarint32Ptr(const char* p, const char* limit,
+                                  uint32_t* value) {
+  if (p < limit) {
+    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
+    if ((result & 128) == 0) {
+      *value = result;
+      return p + 1;
+    }
+  }
+  return GetVarint32PtrFallback(p, limit, value);
+}
+
+// Pull the last 8 bits and cast it to a character
+inline void PutFixed16(std::string* dst, uint16_t value) {
+  if (port::kLittleEndian) {
+    dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)),
+                sizeof(value));
+  } else {
+    char buf[sizeof(value)];
+    EncodeFixed16(buf, value);
+    dst->append(buf, sizeof(buf));
+  }
+}
+
+inline void PutFixed32(std::string* dst, uint32_t value) {
+  if (port::kLittleEndian) {
+    dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)),
+                sizeof(value));
+  } else {
+    char buf[sizeof(value)];
+    EncodeFixed32(buf, value);
+    dst->append(buf, sizeof(buf));
+  }
+}
+
+inline void PutFixed64(std::string* dst, uint64_t value) {
+  if (port::kLittleEndian) {
+    dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)),
+                sizeof(value));
+  } else {
+    char buf[sizeof(value)];
+    EncodeFixed64(buf, value);
+    dst->append(buf, sizeof(buf));
+  }
+}
+
+inline void PutVarint32(std::string* dst, uint32_t v) {
+  char buf[5];
+  char* ptr = EncodeVarint32(buf, v);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarint32Varint32(std::string* dst, uint32_t v1, uint32_t v2) {
+  char buf[10];
+  char* ptr = EncodeVarint32(buf, v1);
+  ptr = EncodeVarint32(ptr, v2);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarint32Varint32Varint32(std::string* dst, uint32_t v1,
+                                        uint32_t v2, uint32_t v3) {
+  char buf[15];
+  char* ptr = EncodeVarint32(buf, v1);
+  ptr = EncodeVarint32(ptr, v2);
+  ptr = EncodeVarint32(ptr, v3);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline char* EncodeVarint64(char* dst, uint64_t v) {
+  static const unsigned int B = 128;
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  while (v >= B) {
+    *(ptr++) = (v & (B - 1)) | B;
+    v >>= 7;
+  }
+  *(ptr++) = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(ptr);
+}
+
+inline void PutVarint64(std::string* dst, uint64_t v) {
+  char buf[kMaxVarint64Length];
+  char* ptr = EncodeVarint64(buf, v);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarsignedint64(std::string* dst, int64_t v) {
+  char buf[kMaxVarint64Length];
+  // Using Zigzag format to convert signed to unsigned
+  char* ptr = EncodeVarint64(buf, i64ToZigzag(v));
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarint64Varint64(std::string* dst, uint64_t v1, uint64_t v2) {
+  char buf[20];
+  char* ptr = EncodeVarint64(buf, v1);
+  ptr = EncodeVarint64(ptr, v2);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarint32Varint64(std::string* dst, uint32_t v1, uint64_t v2) {
+  char buf[15];
+  char* ptr = EncodeVarint32(buf, v1);
+  ptr = EncodeVarint64(ptr, v2);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutVarint32Varint32Varint64(std::string* dst, uint32_t v1,
+                                        uint32_t v2, uint64_t v3) {
+  char buf[20];
+  char* ptr = EncodeVarint32(buf, v1);
+  ptr = EncodeVarint32(ptr, v2);
+  ptr = EncodeVarint64(ptr, v3);
+  dst->append(buf, static_cast<size_t>(ptr - buf));
+}
+
+inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
+  PutVarint32(dst, static_cast<uint32_t>(value.size()));
+  dst->append(value.data(), value.size());
+}
+
+inline void PutLengthPrefixedSliceParts(std::string* dst, size_t total_bytes,
+                                        const SliceParts& slice_parts) {
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    total_bytes += slice_parts.parts[i].size();
+  }
+  PutVarint32(dst, static_cast<uint32_t>(total_bytes));
+  for (int i = 0; i < slice_parts.num_parts; ++i) {
+    dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size());
+  }
+}
+
+inline void PutLengthPrefixedSliceParts(std::string* dst,
+                                        const SliceParts& slice_parts) {
+  PutLengthPrefixedSliceParts(dst, /*total_bytes=*/0, slice_parts);
+}
+
+inline void PutLengthPrefixedSlicePartsWithPadding(
+    std::string* dst, const SliceParts& slice_parts, size_t pad_sz) {
+  PutLengthPrefixedSliceParts(dst, /*total_bytes=*/pad_sz, slice_parts);
+  dst->append(pad_sz, '\0');
+}
+
+inline int VarintLength(uint64_t v) {
+  int len = 1;
+  while (v >= 128) {
+    v >>= 7;
+    len++;
+  }
+  return len;
+}
+
+inline bool GetFixed64(Slice* input, uint64_t* value) {
+  if (input->size() < sizeof(uint64_t)) {
+    return false;
+  }
+  *value = DecodeFixed64(input->data());
+  input->remove_prefix(sizeof(uint64_t));
+  return true;
+}
+
+inline bool GetFixed32(Slice* input, uint32_t* value) {
+  if (input->size() < sizeof(uint32_t)) {
+    return false;
+  }
+  *value = DecodeFixed32(input->data());
+  input->remove_prefix(sizeof(uint32_t));
+  return true;
+}
+
+inline bool GetFixed16(Slice* input, uint16_t* value) {
+  if (input->size() < sizeof(uint16_t)) {
+    return false;
+  }
+  *value = DecodeFixed16(input->data());
+  input->remove_prefix(sizeof(uint16_t));
+  return true;
+}
+
+inline bool GetVarint32(Slice* input, uint32_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint32Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, static_cast<size_t>(limit - q));
+    return true;
+  }
+}
+
+inline bool GetVarint64(Slice* input, uint64_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarint64Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, static_cast<size_t>(limit - q));
+    return true;
+  }
+}
+
+inline bool GetVarsignedint64(Slice* input, int64_t* value) {
+  const char* p = input->data();
+  const char* limit = p + input->size();
+  const char* q = GetVarsignedint64Ptr(p, limit, value);
+  if (q == nullptr) {
+    return false;
+  } else {
+    *input = Slice(q, static_cast<size_t>(limit - q));
+    return true;
+  }
+}
+
+inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
+  uint32_t len = 0;
+  if (GetVarint32(input, &len) && input->size() >= len) {
+    *result = Slice(input->data(), len);
+    input->remove_prefix(len);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+inline Slice GetLengthPrefixedSlice(const char* data) {
+  uint32_t len = 0;
+  // +5: we assume "data" is not corrupted
+  // unsigned char is 7 bits, uint32_t is 32 bits, need 5 unsigned char
+  auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len);
+  return Slice(p, len);
+}
+
+inline Slice GetSliceUntil(Slice* slice, char delimiter) {
+  uint32_t len = 0;
+  for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) {
+    // nothing
+  }
+
+  Slice ret(slice->data(), len);
+  slice->remove_prefix(len + ((len < slice->size()) ? 1 : 0));
+  return ret;
+}
+
+template <class T>
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+inline void
+PutUnaligned(T* memory, const T& value) {
+#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
+  char* nonAlignedMemory = reinterpret_cast<char*>(memory);
+  memcpy(nonAlignedMemory, reinterpret_cast<const char*>(&value), sizeof(T));
+#else
+  *memory = value;
+#endif
+}
+
+template <class T>
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+inline void
+GetUnaligned(const T* memory, T* value) {
+#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
+  char* nonAlignedMemory = reinterpret_cast<char*>(value);
+  memcpy(nonAlignedMemory, reinterpret_cast<const char*>(memory), sizeof(T));
+#else
+  *value = *memory;
+#endif
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/coding_lean.h b/src/rocksdb/util/coding_lean.h
new file mode 100644
index 000000000..6966f7a66
--- /dev/null
+++ b/src/rocksdb/util/coding_lean.h
@@ -0,0 +1,101 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// Encoding independent of machine byte order:
+// * Fixed-length numbers are encoded with least-significant byte first
+//   (little endian, native order on Intel and others)
+//
+// More functions in coding.h
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+#include "port/port.h"  // for port::kLittleEndian
+
+namespace ROCKSDB_NAMESPACE {
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+// -- Implementation of the functions declared above
+inline void EncodeFixed16(char* buf, uint16_t value) {
+  if (port::kLittleEndian) {
+    memcpy(buf, &value, sizeof(value));
+  } else {
+    buf[0] = value & 0xff;
+    buf[1] = (value >> 8) & 0xff;
+  }
+}
+
+inline void EncodeFixed32(char* buf, uint32_t value) {
+  if (port::kLittleEndian) {
+    memcpy(buf, &value, sizeof(value));
+  } else {
+    buf[0] = value & 0xff;
+    buf[1] = (value >> 8) & 0xff;
+    buf[2] = (value >> 16) & 0xff;
+    buf[3] = (value >> 24) & 0xff;
+  }
+}
+
+inline void EncodeFixed64(char* buf, uint64_t value) {
+  if (port::kLittleEndian) {
+    memcpy(buf, &value, sizeof(value));
+  } else {
+    buf[0] = value & 0xff;
+    buf[1] = (value >> 8) & 0xff;
+    buf[2] = (value >> 16) & 0xff;
+    buf[3] = (value >> 24) & 0xff;
+    buf[4] = (value >> 32) & 0xff;
+    buf[5] = (value >> 40) & 0xff;
+    buf[6] = (value >> 48) & 0xff;
+    buf[7] = (value >> 56) & 0xff;
+  }
+}
+
+// Lower-level versions of Get... that read directly from a character buffer
+// without any bounds checking.
+
+inline uint16_t DecodeFixed16(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint16_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    return ((static_cast<uint16_t>(static_cast<unsigned char>(ptr[0]))) |
+            (static_cast<uint16_t>(static_cast<unsigned char>(ptr[1])) << 8));
+  }
+}
+
+inline uint32_t DecodeFixed32(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint32_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0]))) |
+            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8) |
+            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16) |
+            (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
+  }
+}
+
+inline uint64_t DecodeFixed64(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint64_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    uint64_t lo = DecodeFixed32(ptr);
+    uint64_t hi = DecodeFixed32(ptr + 4);
+    return (hi << 32) | lo;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/coding_test.cc b/src/rocksdb/util/coding_test.cc
new file mode 100644
index 000000000..79dd7b82e
--- /dev/null
+++ b/src/rocksdb/util/coding_test.cc
@@ -0,0 +1,217 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Coding {};
+TEST(Coding, Fixed16) {
+  std::string s;
+  for (uint16_t v = 0; v < 0xFFFF; v++) {
+    PutFixed16(&s, v);
+  }
+
+  const char* p = s.data();
+  for (uint16_t v = 0; v < 0xFFFF; v++) {
+    uint16_t actual = DecodeFixed16(p);
+    ASSERT_EQ(v, actual);
+    p += sizeof(uint16_t);
+  }
+}
+
+TEST(Coding, Fixed32) {
+  std::string s;
+  for (uint32_t v = 0; v < 100000; v++) {
+    PutFixed32(&s, v);
+  }
+
+  const char* p = s.data();
+  for (uint32_t v = 0; v < 100000; v++) {
+    uint32_t actual = DecodeFixed32(p);
+    ASSERT_EQ(v, actual);
+    p += sizeof(uint32_t);
+  }
+}
+
+TEST(Coding, Fixed64) {
+  std::string s;
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    PutFixed64(&s, v - 1);
+    PutFixed64(&s, v + 0);
+    PutFixed64(&s, v + 1);
+  }
+
+  const char* p = s.data();
+  for (int power = 0; power <= 63; power++) {
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    uint64_t actual = 0;
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v - 1, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v + 0, actual);
+    p += sizeof(uint64_t);
+
+    actual = DecodeFixed64(p);
+    ASSERT_EQ(v + 1, actual);
+    p += sizeof(uint64_t);
+  }
+}
+
+// Test that encoding routines generate little-endian encodings
+TEST(Coding, EncodingOutput) {
+  std::string dst;
+  PutFixed32(&dst, 0x04030201);
+  ASSERT_EQ(4U, dst.size());
+  ASSERT_EQ(0x01, static_cast<int>(dst[0]));
+  ASSERT_EQ(0x02, static_cast<int>(dst[1]));
+  ASSERT_EQ(0x03, static_cast<int>(dst[2]));
+  ASSERT_EQ(0x04, static_cast<int>(dst[3]));
+
+  dst.clear();
+  PutFixed64(&dst, 0x0807060504030201ull);
+  ASSERT_EQ(8U, dst.size());
+  ASSERT_EQ(0x01, static_cast<int>(dst[0]));
+  ASSERT_EQ(0x02, static_cast<int>(dst[1]));
+  ASSERT_EQ(0x03, static_cast<int>(dst[2]));
+  ASSERT_EQ(0x04, static_cast<int>(dst[3]));
+  ASSERT_EQ(0x05, static_cast<int>(dst[4]));
+  ASSERT_EQ(0x06, static_cast<int>(dst[5]));
+  ASSERT_EQ(0x07, static_cast<int>(dst[6]));
+  ASSERT_EQ(0x08, static_cast<int>(dst[7]));
+}
+
+TEST(Coding, Varint32) {
+  std::string s;
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t v = (i / 32) << (i % 32);
+    PutVarint32(&s, v);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t expected = (i / 32) << (i % 32);
+    uint32_t actual = 0;
+    const char* start = p;
+    p = GetVarint32Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != nullptr);
+    ASSERT_EQ(expected, actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, s.data() + s.size());
+}
+
+TEST(Coding, Varint64) {
+  // Construct the list of values to check
+  std::vector<uint64_t> values;
+  // Some special values
+  values.push_back(0);
+  values.push_back(100);
+  values.push_back(~static_cast<uint64_t>(0));
+  values.push_back(~static_cast<uint64_t>(0) - 1);
+  for (uint32_t k = 0; k < 64; k++) {
+    // Test values near powers of two
+    const uint64_t power = 1ull << k;
+    values.push_back(power);
+    values.push_back(power - 1);
+    values.push_back(power + 1);
+  };
+
+  std::string s;
+  for (unsigned int i = 0; i < values.size(); i++) {
+    PutVarint64(&s, values[i]);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (unsigned int i = 0; i < values.size(); i++) {
+    ASSERT_TRUE(p < limit);
+    uint64_t actual = 0;
+    const char* start = p;
+    p = GetVarint64Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != nullptr);
+    ASSERT_EQ(values[i], actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, limit);
+}
+
+TEST(Coding, Varint32Overflow) {
+  uint32_t result;
+  std::string input("\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(),
+                             &result) == nullptr);
+}
+
+TEST(Coding, Varint32Truncation) {
+  uint32_t large_value = (1u << 31) + 100;
+  std::string s;
+  PutVarint32(&s, large_value);
+  uint32_t result;
+  for (unsigned int len = 0; len + 1 < s.size(); len++) {
+    ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr);
+  }
+  ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) !=
+              nullptr);
+  ASSERT_EQ(large_value, result);
+}
+
+TEST(Coding, Varint64Overflow) {
+  uint64_t result;
+  std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
+  ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(),
+                             &result) == nullptr);
+}
+
+TEST(Coding, Varint64Truncation) {
+  uint64_t large_value = (1ull << 63) + 100ull;
+  std::string s;
+  PutVarint64(&s, large_value);
+  uint64_t result;
+  for (unsigned int len = 0; len + 1 < s.size(); len++) {
+    ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr);
+  }
+  ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) !=
+              nullptr);
+  ASSERT_EQ(large_value, result);
+}
+
+TEST(Coding, Strings) {
+  std::string s;
+  PutLengthPrefixedSlice(&s, Slice(""));
+  PutLengthPrefixedSlice(&s, Slice("foo"));
+  PutLengthPrefixedSlice(&s, Slice("bar"));
+  PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x')));
+
+  Slice input(s);
+  Slice v;
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("foo", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ("bar", v.ToString());
+  ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+  ASSERT_EQ(std::string(200, 'x'), v.ToString());
+  ASSERT_EQ("", input.ToString());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/compaction_job_stats_impl.cc b/src/rocksdb/util/compaction_job_stats_impl.cc
new file mode 100644
index 000000000..cfab2a4fe
--- /dev/null
+++ b/src/rocksdb/util/compaction_job_stats_impl.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/compaction_job_stats.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+void CompactionJobStats::Reset() {
+  elapsed_micros = 0;
+  cpu_micros = 0;
+
+  num_input_records = 0;
+  num_blobs_read = 0;
+  num_input_files = 0;
+  num_input_files_at_output_level = 0;
+
+  num_output_records = 0;
+  num_output_files = 0;
+  num_output_files_blob = 0;
+
+  is_full_compaction = false;
+  is_manual_compaction = false;
+
+  total_input_bytes = 0;
+  total_blob_bytes_read = 0;
+  total_output_bytes = 0;
+  total_output_bytes_blob = 0;
+
+  num_records_replaced = 0;
+
+  total_input_raw_key_bytes = 0;
+  total_input_raw_value_bytes = 0;
+
+  num_input_deletion_records = 0;
+  num_expired_deletion_records = 0;
+
+  num_corrupt_keys = 0;
+
+  file_write_nanos = 0;
+  file_range_sync_nanos = 0;
+  file_fsync_nanos = 0;
+  file_prepare_write_nanos = 0;
+
+  smallest_output_key_prefix.clear();
+  largest_output_key_prefix.clear();
+
+  num_single_del_fallthru = 0;
+  num_single_del_mismatch = 0;
+}
+
+void CompactionJobStats::Add(const CompactionJobStats& stats) {
+  elapsed_micros += stats.elapsed_micros;
+  cpu_micros += stats.cpu_micros;
+
+  num_input_records += stats.num_input_records;
+  num_blobs_read += stats.num_blobs_read;
+  num_input_files += stats.num_input_files;
+  num_input_files_at_output_level += stats.num_input_files_at_output_level;
+
+  num_output_records += stats.num_output_records;
+  num_output_files += stats.num_output_files;
+  num_output_files_blob += stats.num_output_files_blob;
+
+  total_input_bytes += stats.total_input_bytes;
+  total_blob_bytes_read += stats.total_blob_bytes_read;
+  total_output_bytes += stats.total_output_bytes;
+  total_output_bytes_blob += stats.total_output_bytes_blob;
+
+  num_records_replaced += stats.num_records_replaced;
+
+  total_input_raw_key_bytes += stats.total_input_raw_key_bytes;
+  total_input_raw_value_bytes += stats.total_input_raw_value_bytes;
+
+  num_input_deletion_records += stats.num_input_deletion_records;
+  num_expired_deletion_records += stats.num_expired_deletion_records;
+
+  num_corrupt_keys += stats.num_corrupt_keys;
+
+  file_write_nanos += stats.file_write_nanos;
+  file_range_sync_nanos += stats.file_range_sync_nanos;
+  file_fsync_nanos += stats.file_fsync_nanos;
+  file_prepare_write_nanos += stats.file_prepare_write_nanos;
+
+  num_single_del_fallthru += stats.num_single_del_fallthru;
+  num_single_del_mismatch += stats.num_single_del_mismatch;
+}
+
+#else
+
+void CompactionJobStats::Reset() {}
+
+void CompactionJobStats::Add(const CompactionJobStats& /*stats*/) {}
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/comparator.cc b/src/rocksdb/util/comparator.cc
new file mode 100644
index 000000000..f85ed69ee
--- /dev/null
+++ b/src/rocksdb/util/comparator.cc
@@ -0,0 +1,391 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/comparator.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <sstream>
+
+#include "db/dbformat.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+class BytewiseComparatorImpl : public Comparator {
+ public:
+  BytewiseComparatorImpl() {}
+  static const char* kClassName() { return "leveldb.BytewiseComparator"; }
+  const char* Name() const override { return kClassName(); }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    return a.compare(b);
+  }
+
+  bool Equal(const Slice& a, const Slice& b) const override { return a == b; }
+
+  void FindShortestSeparator(std::string* start,
+                             const Slice& limit) const override {
+    // Find length of common prefix
+    size_t min_length = std::min(start->size(), limit.size());
+    size_t diff_index = 0;
+    while ((diff_index < min_length) &&
+           ((*start)[diff_index] == limit[diff_index])) {
+      diff_index++;
+    }
+
+    if (diff_index >= min_length) {
+      // Do not shorten if one string is a prefix of the other
+    } else {
+      uint8_t start_byte = static_cast<uint8_t>((*start)[diff_index]);
+      uint8_t limit_byte = static_cast<uint8_t>(limit[diff_index]);
+      if (start_byte >= limit_byte) {
+        // Cannot shorten since limit is smaller than start or start is
+        // already the shortest possible.
+        return;
+      }
+      assert(start_byte < limit_byte);
+
+      if (diff_index < limit.size() - 1 || start_byte + 1 < limit_byte) {
+        (*start)[diff_index]++;
+        start->resize(diff_index + 1);
+      } else {
+        //     v
+        // A A 1 A A A
+        // A A 2
+        //
+        // Incrementing the current byte will make start bigger than limit, we
+        // will skip this byte, and find the first non 0xFF byte in start and
+        // increment it.
+        diff_index++;
+
+        while (diff_index < start->size()) {
+          // Keep moving until we find the first non 0xFF byte to
+          // increment it
+          if (static_cast<uint8_t>((*start)[diff_index]) <
+              static_cast<uint8_t>(0xff)) {
+            (*start)[diff_index]++;
+            start->resize(diff_index + 1);
+            break;
+          }
+          diff_index++;
+        }
+      }
+      assert(Compare(*start, limit) < 0);
+    }
+  }
+
+  void FindShortSuccessor(std::string* key) const override {
+    // Find first character that can be incremented
+    size_t n = key->size();
+    for (size_t i = 0; i < n; i++) {
+      const uint8_t byte = (*key)[i];
+      if (byte != static_cast<uint8_t>(0xff)) {
+        (*key)[i] = byte + 1;
+        key->resize(i + 1);
+        return;
+      }
+    }
+    // *key is a run of 0xffs.  Leave it alone.
+  }
+
+  bool IsSameLengthImmediateSuccessor(const Slice& s,
+                                      const Slice& t) const override {
+    if (s.size() != t.size() || s.size() == 0) {
+      return false;
+    }
+    size_t diff_ind = s.difference_offset(t);
+    // same slice
+    if (diff_ind >= s.size()) return false;
+    uint8_t byte_s = static_cast<uint8_t>(s[diff_ind]);
+    uint8_t byte_t = static_cast<uint8_t>(t[diff_ind]);
+    // first different byte must be consecutive, and remaining bytes must be
+    // 0xff for s and 0x00 for t
+    if (byte_s != uint8_t{0xff} && byte_s + 1 == byte_t) {
+      for (size_t i = diff_ind + 1; i < s.size(); ++i) {
+        byte_s = static_cast<uint8_t>(s[i]);
+        byte_t = static_cast<uint8_t>(t[i]);
+        if (byte_s != uint8_t{0xff} || byte_t != uint8_t{0x00}) {
+          return false;
+        }
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  bool CanKeysWithDifferentByteContentsBeEqual() const override {
+    return false;
+  }
+
+  using Comparator::CompareWithoutTimestamp;
+  int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, const Slice& b,
+                              bool /*b_has_ts*/) const override {
+    return a.compare(b);
+  }
+
+  bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const override {
+    return a == b;
+  }
+};
+
+class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
+ public:
+  ReverseBytewiseComparatorImpl() {}
+
+  static const char* kClassName() {
+    return "rocksdb.ReverseBytewiseComparator";
+  }
+  const char* Name() const override { return kClassName(); }
+
+  int Compare(const Slice& a, const Slice& b) const override {
+    return -a.compare(b);
+  }
+
+  void FindShortestSeparator(std::string* start,
+                             const Slice& limit) const override {
+    // Find length of common prefix
+    size_t min_length = std::min(start->size(), limit.size());
+    size_t diff_index = 0;
+    while ((diff_index < min_length) &&
+           ((*start)[diff_index] == limit[diff_index])) {
+      diff_index++;
+    }
+
+    assert(diff_index <= min_length);
+    if (diff_index == min_length) {
+      // Do not shorten if one string is a prefix of the other
+      //
+      // We could handle cases like:
+      //     V
+      // A A 2 X Y
+      // A A 2
+      // in a similar way as BytewiseComparator::FindShortestSeparator().
+      // We keep it simple by not implementing it. We can come back to it
+      // later when needed.
+    } else {
+      uint8_t start_byte = static_cast<uint8_t>((*start)[diff_index]);
+      uint8_t limit_byte = static_cast<uint8_t>(limit[diff_index]);
+      if (start_byte > limit_byte && diff_index < start->size() - 1) {
+        // Case like
+        //     V
+        // A A 3 A A
+        // A A 1 B B
+        //
+        // or
+        //     v
+        // A A 2 A A
+        // A A 1 B B
+        // In this case "AA2" will be good.
+#ifndef NDEBUG
+        std::string old_start = *start;
+#endif
+        start->resize(diff_index + 1);
+#ifndef NDEBUG
+        assert(old_start >= *start);
+#endif
+        assert(Slice(*start).compare(limit) > 0);
+      }
+    }
+  }
+
+  void FindShortSuccessor(std::string* /*key*/) const override {
+    // Don't do anything for simplicity.
+  }
+
+  bool IsSameLengthImmediateSuccessor(const Slice& s,
+                                      const Slice& t) const override {
+    // Always returning false to prevent surfacing design flaws in
+    // auto_prefix_mode
+    (void)s, (void)t;
+    return false;
+    // "Correct" implementation:
+    // return BytewiseComparatorImpl::IsSameLengthImmediateSuccessor(t, s);
+  }
+
+  bool CanKeysWithDifferentByteContentsBeEqual() const override {
+    return false;
+  }
+
+  using Comparator::CompareWithoutTimestamp;
+  int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, const Slice& b,
+                              bool /*b_has_ts*/) const override {
+    return -a.compare(b);
+  }
+};
+
+// EXPERIMENTAL
+// Comparator with 64-bit integer timestamp.
+// We did not performance test this yet.
+template <typename TComparator>
+class ComparatorWithU64TsImpl : public Comparator {
+  static_assert(std::is_base_of<Comparator, TComparator>::value,
+                "template type must be a inherited type of comparator");
+
+ public:
+  explicit ComparatorWithU64TsImpl() : Comparator(/*ts_sz=*/sizeof(uint64_t)) {
+    assert(cmp_without_ts_.timestamp_size() == 0);
+  }
+
+  static const char* kClassName() {
+    static std::string class_name = kClassNameInternal();
+    return class_name.c_str();
+  }
+
+  const char* Name() const override { return kClassName(); }
+
+  void FindShortSuccessor(std::string*) const override {}
+  void FindShortestSeparator(std::string*, const Slice&) const override {}
+  int Compare(const Slice& a, const Slice& b) const override {
+    int ret = CompareWithoutTimestamp(a, b);
+    size_t ts_sz = timestamp_size();
+    if (ret != 0) {
+      return ret;
+    }
+    // Compare timestamp.
+    // For the same user key with different timestamps, larger (newer) timestamp
+    // comes first.
+    return -CompareTimestamp(ExtractTimestampFromUserKey(a, ts_sz),
+                             ExtractTimestampFromUserKey(b, ts_sz));
+  }
+  using Comparator::CompareWithoutTimestamp;
+  int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+                              bool b_has_ts) const override {
+    const size_t ts_sz = timestamp_size();
+    assert(!a_has_ts || a.size() >= ts_sz);
+    assert(!b_has_ts || b.size() >= ts_sz);
+    Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, ts_sz) : a;
+    Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, ts_sz) : b;
+    return cmp_without_ts_.Compare(lhs, rhs);
+  }
+  int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
+    assert(ts1.size() == sizeof(uint64_t));
+    assert(ts2.size() == sizeof(uint64_t));
+    uint64_t lhs = DecodeFixed64(ts1.data());
+    uint64_t rhs = DecodeFixed64(ts2.data());
+    if (lhs < rhs) {
+      return -1;
+    } else if (lhs > rhs) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
+ private:
+  static std::string kClassNameInternal() {
+    std::stringstream ss;
+    ss << TComparator::kClassName() << ".u64ts";
+    return ss.str();
+  }
+
+  TComparator cmp_without_ts_;
+};
+
+}  // namespace
+
+const Comparator* BytewiseComparator() {
+  STATIC_AVOID_DESTRUCTION(BytewiseComparatorImpl, bytewise);
+  return &bytewise;
+}
+
+const Comparator* ReverseBytewiseComparator() {
+  STATIC_AVOID_DESTRUCTION(ReverseBytewiseComparatorImpl, rbytewise);
+  return &rbytewise;
+}
+
+const Comparator* BytewiseComparatorWithU64Ts() {
+  STATIC_AVOID_DESTRUCTION(ComparatorWithU64TsImpl<BytewiseComparatorImpl>,
+                           comp_with_u64_ts);
+  return &comp_with_u64_ts;
+}
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinComparators(ObjectLibrary& library,
+                                      const std::string& /*arg*/) {
+  library.AddFactory<const Comparator>(
+      BytewiseComparatorImpl::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const Comparator>* /*guard */,
+         std::string* /* errmsg */) { return BytewiseComparator(); });
+  library.AddFactory<const Comparator>(
+      ReverseBytewiseComparatorImpl::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const Comparator>* /*guard */,
+         std::string* /* errmsg */) { return ReverseBytewiseComparator(); });
+  library.AddFactory<const Comparator>(
+      ComparatorWithU64TsImpl<BytewiseComparatorImpl>::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const Comparator>* /*guard */,
+         std::string* /* errmsg */) { return BytewiseComparatorWithU64Ts(); });
+  return 3;
+}
+#endif  // ROCKSDB_LITE
+
+Status Comparator::CreateFromString(const ConfigOptions& config_options,
+                                    const std::string& value,
+                                    const Comparator** result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinComparators(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, *result, value,
+                                              &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  }
+  if (id == BytewiseComparatorImpl::kClassName()) {
+    *result = BytewiseComparator();
+  } else if (id == ReverseBytewiseComparatorImpl::kClassName()) {
+    *result = ReverseBytewiseComparator();
+  } else if (id ==
+             ComparatorWithU64TsImpl<BytewiseComparatorImpl>::kClassName()) {
+    *result = BytewiseComparatorWithU64Ts();
+  } else if (value.empty()) {
+    // No Id and no options.  Clear the object
+    *result = nullptr;
+    return Status::OK();
+  } else if (id.empty()) {  // We have no Id but have options.  Not good
+    return Status::NotSupported("Cannot reset object ", id);
+  } else {
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->NewStaticObject(id, result);
+#else
+    status = Status::NotSupported("Cannot load object in LITE mode ", id);
+#endif  // ROCKSDB_LITE
+    if (!status.ok()) {
+      if (config_options.ignore_unsupported_options &&
+          status.IsNotSupported()) {
+        return Status::OK();
+      } else {
+        return status;
+      }
+    } else {
+      Comparator* comparator = const_cast<Comparator*>(*result);
+      status =
+          Customizable::ConfigureNewObject(config_options, comparator, opt_map);
+    }
+  }
+  return status;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/compression.cc b/src/rocksdb/util/compression.cc
new file mode 100644
index 000000000..8e2f01b12
--- /dev/null
+++ b/src/rocksdb/util/compression.cc
@@ -0,0 +1,122 @@
+// Copyright (c) 2022-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+StreamingCompress* StreamingCompress::Create(CompressionType compression_type,
+                                             const CompressionOptions& opts,
+                                             uint32_t compress_format_version,
+                                             size_t max_output_len) {
+  switch (compression_type) {
+    case kZSTD: {
+      if (!ZSTD_Streaming_Supported()) {
+        return nullptr;
+      }
+      return new ZSTDStreamingCompress(opts, compress_format_version,
+                                       max_output_len);
+    }
+    default:
+      return nullptr;
+  }
+}
+
+StreamingUncompress* StreamingUncompress::Create(
+    CompressionType compression_type, uint32_t compress_format_version,
+    size_t max_output_len) {
+  switch (compression_type) {
+    case kZSTD: {
+      if (!ZSTD_Streaming_Supported()) {
+        return nullptr;
+      }
+      return new ZSTDStreamingUncompress(compress_format_version,
+                                         max_output_len);
+    }
+    default:
+      return nullptr;
+  }
+}
+
+int ZSTDStreamingCompress::Compress(const char* input, size_t input_size,
+                                    char* output, size_t* output_pos) {
+  assert(input != nullptr && output != nullptr && output_pos != nullptr);
+  *output_pos = 0;
+  // Don't need to compress an empty input
+  if (input_size == 0) {
+    return 0;
+  }
+#ifndef ZSTD_STREAMING
+  (void)input;
+  (void)input_size;
+  (void)output;
+  return -1;
+#else
+  if (input_buffer_.src == nullptr || input_buffer_.src != input) {
+    // New input
+    // Catch errors where the previous input was not fully decompressed.
+    assert(input_buffer_.pos == input_buffer_.size);
+    input_buffer_ = {input, input_size, /*pos=*/0};
+  } else if (input_buffer_.src == input) {
+    // Same input, not fully compressed.
+  }
+  ZSTD_outBuffer output_buffer = {output, max_output_len_, /*pos=*/0};
+  const size_t remaining =
+      ZSTD_compressStream2(cctx_, &output_buffer, &input_buffer_, ZSTD_e_end);
+  if (ZSTD_isError(remaining)) {
+    // Failure
+    Reset();
+    return -1;
+  }
+  // Success
+  *output_pos = output_buffer.pos;
+  return (int)remaining;
+#endif
+}
+
+void ZSTDStreamingCompress::Reset() {
+#ifdef ZSTD_STREAMING
+  ZSTD_CCtx_reset(cctx_, ZSTD_ResetDirective::ZSTD_reset_session_only);
+  input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0};
+#endif
+}
+
+int ZSTDStreamingUncompress::Uncompress(const char* input, size_t input_size,
+                                        char* output, size_t* output_pos) {
+  assert(input != nullptr && output != nullptr && output_pos != nullptr);
+  *output_pos = 0;
+  // Don't need to uncompress an empty input
+  if (input_size == 0) {
+    return 0;
+  }
+#ifdef ZSTD_STREAMING
+  if (input_buffer_.src != input) {
+    // New input
+    input_buffer_ = {input, input_size, /*pos=*/0};
+  }
+  ZSTD_outBuffer output_buffer = {output, max_output_len_, /*pos=*/0};
+  size_t ret = ZSTD_decompressStream(dctx_, &output_buffer, &input_buffer_);
+  if (ZSTD_isError(ret)) {
+    Reset();
+    return -1;
+  }
+  *output_pos = output_buffer.pos;
+  return (int)(input_buffer_.size - input_buffer_.pos);
+#else
+  (void)input;
+  (void)input_size;
+  (void)output;
+  return -1;
+#endif
+}
+
+void ZSTDStreamingUncompress::Reset() {
+#ifdef ZSTD_STREAMING
+  ZSTD_DCtx_reset(dctx_, ZSTD_ResetDirective::ZSTD_reset_session_only);
+  input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0};
+#endif
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/compression.h b/src/rocksdb/util/compression.h
new file mode 100644
index 000000000..0d4febcfb
--- /dev/null
+++ b/src/rocksdb/util/compression.h
@@ -0,0 +1,1786 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+
+#include <algorithm>
+#include <limits>
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#ifdef OS_FREEBSD
+#include <malloc_np.h>
+#else  // OS_FREEBSD
+#include <malloc.h>
+#endif  // OS_FREEBSD
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+#include <string>
+
+#include "memory/memory_allocator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/compression_context_cache.h"
+#include "util/string_util.h"
+
+#ifdef SNAPPY
+#include <snappy.h>
+#endif
+
+#ifdef ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef BZIP2
+#include <bzlib.h>
+#endif
+
+#if defined(LZ4)
+#include <lz4.h>
+#include <lz4hc.h>
+#endif
+
+#if defined(ZSTD)
+#include <zstd.h>
+// v1.1.3+
+#if ZSTD_VERSION_NUMBER >= 10103
+#include <zdict.h>
+#endif  // ZSTD_VERSION_NUMBER >= 10103
+// v1.4.0+
+#if ZSTD_VERSION_NUMBER >= 10400
+#define ZSTD_STREAMING
+#endif  // ZSTD_VERSION_NUMBER >= 10400
+namespace ROCKSDB_NAMESPACE {
+// Need this for the context allocation override
+// On windows we need to do this explicitly
+#if (ZSTD_VERSION_NUMBER >= 500)
+#if defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && \
+    defined(ZSTD_STATIC_LINKING_ONLY)
+#define ROCKSDB_ZSTD_CUSTOM_MEM
+namespace port {
+ZSTD_customMem GetJeZstdAllocationOverrides();
+}  // namespace port
+#endif  // defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) &&
+        // defined(ZSTD_STATIC_LINKING_ONLY)
+
+// We require `ZSTD_sizeof_DDict` and `ZSTD_createDDict_byReference` to use
+// `ZSTD_DDict`. The former was introduced in v1.0.0 and the latter was
+// introduced in v1.1.3. But an important bug fix for `ZSTD_sizeof_DDict` came
+// in v1.1.4, so that is the version we require. As of today's latest version
+// (v1.3.8), they are both still in the experimental API, which means they are
+// only exported when the compiler flag `ZSTD_STATIC_LINKING_ONLY` is set.
+#if defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104
+#define ROCKSDB_ZSTD_DDICT
+#endif  // defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104
+
+// Cached data represents a portion that can be re-used
+// If, in the future we have more than one native context to
+// cache we can arrange this as a tuple
+class ZSTDUncompressCachedData {
+ public:
+  using ZSTDNativeContext = ZSTD_DCtx*;
+  ZSTDUncompressCachedData() {}
+  // Init from cache
+  ZSTDUncompressCachedData(const ZSTDUncompressCachedData& o) = delete;
+  ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete;
+  ZSTDUncompressCachedData(ZSTDUncompressCachedData&& o) noexcept
+      : ZSTDUncompressCachedData() {
+    *this = std::move(o);
+  }
+  ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&& o) noexcept {
+    assert(zstd_ctx_ == nullptr);
+    std::swap(zstd_ctx_, o.zstd_ctx_);
+    std::swap(cache_idx_, o.cache_idx_);
+    return *this;
+  }
+  ZSTDNativeContext Get() const { return zstd_ctx_; }
+  int64_t GetCacheIndex() const { return cache_idx_; }
+  void CreateIfNeeded() {
+    if (zstd_ctx_ == nullptr) {
+#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
+      zstd_ctx_ =
+          ZSTD_createDCtx_advanced(port::GetJeZstdAllocationOverrides());
+#else   // ROCKSDB_ZSTD_CUSTOM_MEM
+      zstd_ctx_ = ZSTD_createDCtx();
+#endif  // ROCKSDB_ZSTD_CUSTOM_MEM
+      cache_idx_ = -1;
+    }
+  }
+  void InitFromCache(const ZSTDUncompressCachedData& o, int64_t idx) {
+    zstd_ctx_ = o.zstd_ctx_;
+    cache_idx_ = idx;
+  }
+  ~ZSTDUncompressCachedData() {
+    if (zstd_ctx_ != nullptr && cache_idx_ == -1) {
+      ZSTD_freeDCtx(zstd_ctx_);
+    }
+  }
+
+ private:
+  ZSTDNativeContext zstd_ctx_ = nullptr;
+  int64_t cache_idx_ = -1;  // -1 means this instance owns the context
+};
+#endif  // (ZSTD_VERSION_NUMBER >= 500)
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ZSTD
+
+#if !(defined ZSTD) || !(ZSTD_VERSION_NUMBER >= 500)
+namespace ROCKSDB_NAMESPACE {
+class ZSTDUncompressCachedData {
+  void* padding;  // unused
+ public:
+  using ZSTDNativeContext = void*;
+  ZSTDUncompressCachedData() {}
+  ZSTDUncompressCachedData(const ZSTDUncompressCachedData&) {}
+  ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete;
+  ZSTDUncompressCachedData(ZSTDUncompressCachedData&&) noexcept = default;
+  ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&&) noexcept =
+      default;
+  ZSTDNativeContext Get() const { return nullptr; }
+  int64_t GetCacheIndex() const { return -1; }
+  void CreateIfNeeded() {}
+  void InitFromCache(const ZSTDUncompressCachedData&, int64_t) {}
+
+ private:
+  void ignore_padding__() { padding = nullptr; }
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif
+
+#if defined(XPRESS)
+#include "port/xpress.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Holds dictionary and related data, like ZSTD's digested compression
+// dictionary.
+struct CompressionDict {
+#if ZSTD_VERSION_NUMBER >= 700
+  ZSTD_CDict* zstd_cdict_ = nullptr;
+#endif  // ZSTD_VERSION_NUMBER >= 700
+  std::string dict_;
+
+ public:
+#if ZSTD_VERSION_NUMBER >= 700
+  CompressionDict(std::string dict, CompressionType type, int level) {
+#else   // ZSTD_VERSION_NUMBER >= 700
+  CompressionDict(std::string dict, CompressionType /*type*/, int /*level*/) {
+#endif  // ZSTD_VERSION_NUMBER >= 700
+    dict_ = std::move(dict);
+#if ZSTD_VERSION_NUMBER >= 700
+    zstd_cdict_ = nullptr;
+    if (!dict_.empty() && (type == kZSTD || type == kZSTDNotFinalCompression)) {
+      if (level == CompressionOptions::kDefaultCompressionLevel) {
+        // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see
+        // https://github.com/facebook/zstd/issues/1148
+        level = 3;
+      }
+      // Should be safe (but slower) if below call fails as we'll use the
+      // raw dictionary to compress.
+      zstd_cdict_ = ZSTD_createCDict(dict_.data(), dict_.size(), level);
+      assert(zstd_cdict_ != nullptr);
+    }
+#endif  // ZSTD_VERSION_NUMBER >= 700
+  }
+
+  ~CompressionDict() {
+#if ZSTD_VERSION_NUMBER >= 700
+    size_t res = 0;
+    if (zstd_cdict_ != nullptr) {
+      res = ZSTD_freeCDict(zstd_cdict_);
+    }
+    assert(res == 0);  // Last I checked they can't fail
+    (void)res;         // prevent unused var warning
+#endif                 // ZSTD_VERSION_NUMBER >= 700
+  }
+
+#if ZSTD_VERSION_NUMBER >= 700
+  const ZSTD_CDict* GetDigestedZstdCDict() const { return zstd_cdict_; }
+#endif  // ZSTD_VERSION_NUMBER >= 700
+
+  Slice GetRawDict() const { return dict_; }
+
+  static const CompressionDict& GetEmptyDict() {
+    static CompressionDict empty_dict{};
+    return empty_dict;
+  }
+
+  CompressionDict() = default;
+  // Disable copy/move
+  CompressionDict(const CompressionDict&) = delete;
+  CompressionDict& operator=(const CompressionDict&) = delete;
+  CompressionDict(CompressionDict&&) = delete;
+  CompressionDict& operator=(CompressionDict&&) = delete;
+};
+
+// Holds dictionary and related data, like ZSTD's digested uncompression
+// dictionary.
+struct UncompressionDict {
+  // Block containing the data for the compression dictionary in case the
+  // constructor that takes a string parameter is used.
+  std::string dict_;
+
+  // Block containing the data for the compression dictionary in case the
+  // constructor that takes a Slice parameter is used and the passed in
+  // CacheAllocationPtr is not nullptr.
+  CacheAllocationPtr allocation_;
+
+  // Slice pointing to the compression dictionary data. Can point to
+  // dict_, allocation_, or some other memory location, depending on how
+  // the object was constructed.
+  Slice slice_;
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  // Processed version of the contents of slice_ for ZSTD compression.
+  ZSTD_DDict* zstd_ddict_ = nullptr;
+#endif  // ROCKSDB_ZSTD_DDICT
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  UncompressionDict(std::string dict, bool using_zstd)
+#else   // ROCKSDB_ZSTD_DDICT
+  UncompressionDict(std::string dict, bool /* using_zstd */)
+#endif  // ROCKSDB_ZSTD_DDICT
+      : dict_(std::move(dict)), slice_(dict_) {
+#ifdef ROCKSDB_ZSTD_DDICT
+    if (!slice_.empty() && using_zstd) {
+      zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size());
+      assert(zstd_ddict_ != nullptr);
+    }
+#endif  // ROCKSDB_ZSTD_DDICT
+  }
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  UncompressionDict(Slice slice, CacheAllocationPtr&& allocation,
+                    bool using_zstd)
+#else   // ROCKSDB_ZSTD_DDICT
+  UncompressionDict(Slice slice, CacheAllocationPtr&& allocation,
+                    bool /* using_zstd */)
+#endif  // ROCKSDB_ZSTD_DDICT
+      : allocation_(std::move(allocation)), slice_(std::move(slice)) {
+#ifdef ROCKSDB_ZSTD_DDICT
+    if (!slice_.empty() && using_zstd) {
+      zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size());
+      assert(zstd_ddict_ != nullptr);
+    }
+#endif  // ROCKSDB_ZSTD_DDICT
+  }
+
+  UncompressionDict(UncompressionDict&& rhs)
+      : dict_(std::move(rhs.dict_)),
+        allocation_(std::move(rhs.allocation_)),
+        slice_(std::move(rhs.slice_))
+#ifdef ROCKSDB_ZSTD_DDICT
+        ,
+        zstd_ddict_(rhs.zstd_ddict_)
+#endif
+  {
+#ifdef ROCKSDB_ZSTD_DDICT
+    rhs.zstd_ddict_ = nullptr;
+#endif
+  }
+
+  ~UncompressionDict() {
+#ifdef ROCKSDB_ZSTD_DDICT
+    size_t res = 0;
+    if (zstd_ddict_ != nullptr) {
+      res = ZSTD_freeDDict(zstd_ddict_);
+    }
+    assert(res == 0);  // Last I checked they can't fail
+    (void)res;         // prevent unused var warning
+#endif                 // ROCKSDB_ZSTD_DDICT
+  }
+
+  UncompressionDict& operator=(UncompressionDict&& rhs) {
+    if (this == &rhs) {
+      return *this;
+    }
+
+    dict_ = std::move(rhs.dict_);
+    allocation_ = std::move(rhs.allocation_);
+    slice_ = std::move(rhs.slice_);
+
+#ifdef ROCKSDB_ZSTD_DDICT
+    zstd_ddict_ = rhs.zstd_ddict_;
+    rhs.zstd_ddict_ = nullptr;
+#endif
+
+    return *this;
+  }
+
+  // The object is self-contained if the string constructor is used, or the
+  // Slice constructor is invoked with a non-null allocation. Otherwise, it
+  // is the caller's responsibility to ensure that the underlying storage
+  // outlives this object.
+  bool own_bytes() const { return !dict_.empty() || allocation_; }
+
+  const Slice& GetRawDict() const { return slice_; }
+
+#ifdef ROCKSDB_ZSTD_DDICT
+  const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; }
+#endif  // ROCKSDB_ZSTD_DDICT
+
+  static const UncompressionDict& GetEmptyDict() {
+    static UncompressionDict empty_dict{};
+    return empty_dict;
+  }
+
+  size_t ApproximateMemoryUsage() const {
+    size_t usage = sizeof(struct UncompressionDict);
+    usage += dict_.size();
+    if (allocation_) {
+      auto allocator = allocation_.get_deleter().allocator;
+      if (allocator) {
+        usage += allocator->UsableSize(allocation_.get(), slice_.size());
+      } else {
+        usage += slice_.size();
+      }
+    }
+#ifdef ROCKSDB_ZSTD_DDICT
+    usage += ZSTD_sizeof_DDict(zstd_ddict_);
+#endif  // ROCKSDB_ZSTD_DDICT
+    return usage;
+  }
+
+  UncompressionDict() = default;
+  // Disable copy
+  UncompressionDict(const CompressionDict&) = delete;
+  UncompressionDict& operator=(const CompressionDict&) = delete;
+};
+
+class CompressionContext {
+ private:
+#if defined(ZSTD) && (ZSTD_VERSION_NUMBER >= 500)
+  ZSTD_CCtx* zstd_ctx_ = nullptr;
+  void CreateNativeContext(CompressionType type) {
+    if (type == kZSTD || type == kZSTDNotFinalCompression) {
+#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
+      zstd_ctx_ =
+          ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides());
+#else   // ROCKSDB_ZSTD_CUSTOM_MEM
+      zstd_ctx_ = ZSTD_createCCtx();
+#endif  // ROCKSDB_ZSTD_CUSTOM_MEM
+    }
+  }
+  void DestroyNativeContext() {
+    if (zstd_ctx_ != nullptr) {
+      ZSTD_freeCCtx(zstd_ctx_);
+    }
+  }
+
+ public:
+  // callable inside ZSTD_Compress
+  ZSTD_CCtx* ZSTDPreallocCtx() const {
+    assert(zstd_ctx_ != nullptr);
+    return zstd_ctx_;
+  }
+
+#else   // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
+ private:
+  void CreateNativeContext(CompressionType /* type */) {}
+  void DestroyNativeContext() {}
+#endif  // ZSTD && (ZSTD_VERSION_NUMBER >= 500)
+ public:
+  explicit CompressionContext(CompressionType type) {
+    CreateNativeContext(type);
+  }
+  ~CompressionContext() { DestroyNativeContext(); }
+  CompressionContext(const CompressionContext&) = delete;
+  CompressionContext& operator=(const CompressionContext&) = delete;
+};
+
+class CompressionInfo {
+  const CompressionOptions& opts_;
+  const CompressionContext& context_;
+  const CompressionDict& dict_;
+  const CompressionType type_;
+  const uint64_t sample_for_compression_;
+
+ public:
+  CompressionInfo(const CompressionOptions& _opts,
+                  const CompressionContext& _context,
+                  const CompressionDict& _dict, CompressionType _type,
+                  uint64_t _sample_for_compression)
+      : opts_(_opts),
+        context_(_context),
+        dict_(_dict),
+        type_(_type),
+        sample_for_compression_(_sample_for_compression) {}
+
+  const CompressionOptions& options() const { return opts_; }
+  const CompressionContext& context() const { return context_; }
+  const CompressionDict& dict() const { return dict_; }
+  CompressionType type() const { return type_; }
+  uint64_t SampleForCompression() const { return sample_for_compression_; }
+};
+
+class UncompressionContext {
+ private:
+  CompressionContextCache* ctx_cache_ = nullptr;
+  ZSTDUncompressCachedData uncomp_cached_data_;
+
+ public:
+  explicit UncompressionContext(CompressionType type) {
+    if (type == kZSTD || type == kZSTDNotFinalCompression) {
+      ctx_cache_ = CompressionContextCache::Instance();
+      uncomp_cached_data_ = ctx_cache_->GetCachedZSTDUncompressData();
+    }
+  }
+  ~UncompressionContext() {
+    if (uncomp_cached_data_.GetCacheIndex() != -1) {
+      assert(ctx_cache_ != nullptr);
+      ctx_cache_->ReturnCachedZSTDUncompressData(
+          uncomp_cached_data_.GetCacheIndex());
+    }
+  }
+  UncompressionContext(const UncompressionContext&) = delete;
+  UncompressionContext& operator=(const UncompressionContext&) = delete;
+
+  ZSTDUncompressCachedData::ZSTDNativeContext GetZSTDContext() const {
+    return uncomp_cached_data_.Get();
+  }
+};
+
+class UncompressionInfo {
+  const UncompressionContext& context_;
+  const UncompressionDict& dict_;
+  const CompressionType type_;
+
+ public:
+  UncompressionInfo(const UncompressionContext& _context,
+                    const UncompressionDict& _dict, CompressionType _type)
+      : context_(_context), dict_(_dict), type_(_type) {}
+
+  const UncompressionContext& context() const { return context_; }
+  const UncompressionDict& dict() const { return dict_; }
+  CompressionType type() const { return type_; }
+};
+
+inline bool Snappy_Supported() {
+#ifdef SNAPPY
+  return true;
+#else
+  return false;
+#endif
+}
+
+inline bool Zlib_Supported() {
+#ifdef ZLIB
+  return true;
+#else
+  return false;
+#endif
+}
+
+inline bool BZip2_Supported() {
+#ifdef BZIP2
+  return true;
+#else
+  return false;
+#endif
+}
+
+inline bool LZ4_Supported() {
+#ifdef LZ4
+  return true;
+#else
+  return false;
+#endif
+}
+
+inline bool XPRESS_Supported() {
+#ifdef XPRESS
+  return true;
+#else
+  return false;
+#endif
+}
+
+inline bool ZSTD_Supported() {
+#ifdef ZSTD
+  // ZSTD format is finalized since version 0.8.0.
+  return (ZSTD_versionNumber() >= 800);
+#else
+  return false;
+#endif
+}
+
+inline bool ZSTDNotFinal_Supported() {
+#ifdef ZSTD
+  return true;
+#else
+  return false;
+#endif
+}
+
+inline bool ZSTD_Streaming_Supported() {
+#if defined(ZSTD) && defined(ZSTD_STREAMING)
+  return true;
+#else
+  return false;
+#endif
+}
+
+inline bool StreamingCompressionTypeSupported(
+    CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return true;
+    case kZSTD:
+      return ZSTD_Streaming_Supported();
+    default:
+      return false;
+  }
+}
+
+inline bool CompressionTypeSupported(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return true;
+    case kSnappyCompression:
+      return Snappy_Supported();
+    case kZlibCompression:
+      return Zlib_Supported();
+    case kBZip2Compression:
+      return BZip2_Supported();
+    case kLZ4Compression:
+      return LZ4_Supported();
+    case kLZ4HCCompression:
+      return LZ4_Supported();
+    case kXpressCompression:
+      return XPRESS_Supported();
+    case kZSTDNotFinalCompression:
+      return ZSTDNotFinal_Supported();
+    case kZSTD:
+      return ZSTD_Supported();
+    default:
+      assert(false);
+      return false;
+  }
+}
+
+inline bool DictCompressionTypeSupported(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return false;
+    case kSnappyCompression:
+      return false;
+    case kZlibCompression:
+      return Zlib_Supported();
+    case kBZip2Compression:
+      return false;
+    case kLZ4Compression:
+    case kLZ4HCCompression:
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+      return LZ4_Supported();
+#else
+      return false;
+#endif
+    case kXpressCompression:
+      return false;
+    case kZSTDNotFinalCompression:
+#if ZSTD_VERSION_NUMBER >= 500  // v0.5.0+
+      return ZSTDNotFinal_Supported();
+#else
+      return false;
+#endif
+    case kZSTD:
+#if ZSTD_VERSION_NUMBER >= 500  // v0.5.0+
+      return ZSTD_Supported();
+#else
+      return false;
+#endif
+    default:
+      assert(false);
+      return false;
+  }
+}
+
+inline std::string CompressionTypeToString(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return "NoCompression";
+    case kSnappyCompression:
+      return "Snappy";
+    case kZlibCompression:
+      return "Zlib";
+    case kBZip2Compression:
+      return "BZip2";
+    case kLZ4Compression:
+      return "LZ4";
+    case kLZ4HCCompression:
+      return "LZ4HC";
+    case kXpressCompression:
+      return "Xpress";
+    case kZSTD:
+      return "ZSTD";
+    case kZSTDNotFinalCompression:
+      return "ZSTDNotFinal";
+    case kDisableCompressionOption:
+      return "DisableOption";
+    default:
+      assert(false);
+      return "";
+  }
+}
+
+inline std::string CompressionOptionsToString(
+    CompressionOptions& compression_options) {
+  std::string result;
+  result.reserve(512);
+  result.append("window_bits=")
+      .append(std::to_string(compression_options.window_bits))
+      .append("; ");
+  result.append("level=")
+      .append(std::to_string(compression_options.level))
+      .append("; ");
+  result.append("strategy=")
+      .append(std::to_string(compression_options.strategy))
+      .append("; ");
+  result.append("max_dict_bytes=")
+      .append(std::to_string(compression_options.max_dict_bytes))
+      .append("; ");
+  result.append("zstd_max_train_bytes=")
+      .append(std::to_string(compression_options.zstd_max_train_bytes))
+      .append("; ");
+  result.append("enabled=")
+      .append(std::to_string(compression_options.enabled))
+      .append("; ");
+  result.append("max_dict_buffer_bytes=")
+      .append(std::to_string(compression_options.max_dict_buffer_bytes))
+      .append("; ");
+  result.append("use_zstd_dict_trainer=")
+      .append(std::to_string(compression_options.use_zstd_dict_trainer))
+      .append("; ");
+  return result;
+}
+
+// compress_format_version can have two values:
+// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed
+// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent
+// way.
+// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the
+// start of compressed block. Snappy format is the same as version 1.
+
+inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input,
+                            size_t length, ::std::string* output) {
+#ifdef SNAPPY
+  output->resize(snappy::MaxCompressedLength(length));
+  size_t outlen;
+  snappy::RawCompress(input, length, &(*output)[0], &outlen);
+  output->resize(outlen);
+  return true;
+#else
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+inline CacheAllocationPtr Snappy_Uncompress(
+    const char* input, size_t length, size_t* uncompressed_size,
+    MemoryAllocator* allocator = nullptr) {
+#ifdef SNAPPY
+  size_t uncompressed_length = 0;
+  if (!snappy::GetUncompressedLength(input, length, &uncompressed_length)) {
+    return nullptr;
+  }
+
+  CacheAllocationPtr output = AllocateBlock(uncompressed_length, allocator);
+
+  if (!snappy::RawUncompress(input, length, output.get())) {
+    return nullptr;
+  }
+
+  *uncompressed_size = uncompressed_length;
+
+  return output;
+#else
+  (void)input;
+  (void)length;
+  (void)uncompressed_size;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+namespace compression {
+// returns size
+inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) {
+  PutVarint32(output, length);
+  return output->size();
+}
+
+inline bool GetDecompressedSizeInfo(const char** input_data,
+                                    size_t* input_length,
+                                    uint32_t* output_len) {
+  auto new_input_data =
+      GetVarint32Ptr(*input_data, *input_data + *input_length, output_len);
+  if (new_input_data == nullptr) {
+    return false;
+  }
+  *input_length -= (new_input_data - *input_data);
+  *input_data = new_input_data;
+  return true;
+}
+}  // namespace compression
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+//    dictionary.
+inline bool Zlib_Compress(const CompressionInfo& info,
+                          uint32_t compress_format_version, const char* input,
+                          size_t length, ::std::string* output) {
+#ifdef ZLIB
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  }
+
+  // The memLevel parameter specifies how much memory should be allocated for
+  // the internal compression state.
+  // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
+  // memLevel=9 uses maximum memory for optimal speed.
+  // The default value is 8. See zconf.h for more details.
+  static const int memLevel = 8;
+  int level;
+  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
+    level = Z_DEFAULT_COMPRESSION;
+  } else {
+    level = info.options().level;
+  }
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+  int st = deflateInit2(&_stream, level, Z_DEFLATED, info.options().window_bits,
+                        memLevel, info.options().strategy);
+  if (st != Z_OK) {
+    return false;
+  }
+
+  Slice compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    // Initialize the compression library's dictionary
+    st = deflateSetDictionary(
+        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
+        static_cast<unsigned int>(compression_dict.size()));
+    if (st != Z_OK) {
+      deflateEnd(&_stream);
+      return false;
+    }
+  }
+
+  // Get an upper bound on the compressed size.
+  size_t upper_bound =
+      deflateBound(&_stream, static_cast<unsigned long>(length));
+  output->resize(output_header_len + upper_bound);
+
+  // Compress the input, and put compressed data in output.
+  _stream.next_in = (Bytef*)input;
+  _stream.avail_in = static_cast<unsigned int>(length);
+
+  // Initialize the output size.
+  _stream.avail_out = static_cast<unsigned int>(upper_bound);
+  _stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);
+
+  bool compressed = false;
+  st = deflate(&_stream, Z_FINISH);
+  if (st == Z_STREAM_END) {
+    compressed = true;
+    output->resize(output->size() - _stream.avail_out);
+  }
+  // The only return value we really care about is Z_STREAM_END.
+  // Z_OK means insufficient output space. This means the compression is
+  // bigger than decompressed size. Just fail the compression in that case.
+
+  deflateEnd(&_stream);
+  return compressed;
+#else
+  (void)info;
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+//    dictionary.
+inline CacheAllocationPtr Zlib_Uncompress(
+    const UncompressionInfo& info, const char* input_data, size_t input_length,
+    size_t* uncompressed_size, uint32_t compress_format_version,
+    MemoryAllocator* allocator = nullptr, int windowBits = -14) {
+#ifdef ZLIB
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data size will 5x of compressed size, but round
+    // to the page size
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+
+  // For raw inflate, the windowBits should be -8..-15.
+  // If windowBits is bigger than zero, it will use either zlib
+  // header or gzip header. Adding 32 to it will do automatic detection.
+  int st =
+      inflateInit2(&_stream, windowBits > 0 ? windowBits + 32 : windowBits);
+  if (st != Z_OK) {
+    return nullptr;
+  }
+
+  const Slice& compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    // Initialize the compression library's dictionary
+    st = inflateSetDictionary(
+        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
+        static_cast<unsigned int>(compression_dict.size()));
+    if (st != Z_OK) {
+      return nullptr;
+    }
+  }
+
+  _stream.next_in = (Bytef*)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  _stream.next_out = (Bytef*)output.get();
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = inflate(&_stream, Z_SYNC_FLUSH);
+    switch (st) {
+      case Z_STREAM_END:
+        done = true;
+        break;
+      case Z_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        size_t old_sz = output_len;
+        uint32_t output_len_delta = output_len / 5;
+        output_len += output_len_delta < 10 ? 10 : output_len_delta;
+        auto tmp = AllocateBlock(output_len, allocator);
+        memcpy(tmp.get(), output.get(), old_sz);
+        output = std::move(tmp);
+
+        // Set more output.
+        _stream.next_out = (Bytef*)(output.get() + old_sz);
+        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
+        break;
+      }
+      case Z_BUF_ERROR:
+      default:
+        inflateEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  // If we encoded decompressed block size, we should have no bytes left
+  assert(compress_format_version != 2 || _stream.avail_out == 0);
+  assert(output_len >= _stream.avail_out);
+  *uncompressed_size = output_len - _stream.avail_out;
+  inflateEnd(&_stream);
+  return output;
+#else
+  (void)info;
+  (void)input_data;
+  (void)input_length;
+  (void)uncompressed_size;
+  (void)compress_format_version;
+  (void)allocator;
+  (void)windowBits;
+  return nullptr;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline bool BZip2_Compress(const CompressionInfo& /*info*/,
+                           uint32_t compress_format_version, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef BZIP2
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  }
+  // Resize output to be the plain data length.
+  // This may not be big enough if the compression actually expands data.
+  output->resize(output_header_len + length);
+
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  // Block size 1 is 100K.
+  // 0 is for silent.
+  // 30 is the default workFactor
+  int st = BZ2_bzCompressInit(&_stream, 1, 0, 30);
+  if (st != BZ_OK) {
+    return false;
+  }
+
+  // Compress the input, and put compressed data in output.
+  _stream.next_in = (char*)input;
+  _stream.avail_in = static_cast<unsigned int>(length);
+
+  // Initialize the output size.
+  _stream.avail_out = static_cast<unsigned int>(length);
+  _stream.next_out = reinterpret_cast<char*>(&(*output)[output_header_len]);
+
+  bool compressed = false;
+  st = BZ2_bzCompress(&_stream, BZ_FINISH);
+  if (st == BZ_STREAM_END) {
+    compressed = true;
+    output->resize(output->size() - _stream.avail_out);
+  }
+  // The only return value we really care about is BZ_STREAM_END.
+  // BZ_FINISH_OK means insufficient output space. This means the compression
+  // is bigger than decompressed size. Just fail the compression in that case.
+
+  BZ2_bzCompressEnd(&_stream);
+  return compressed;
+#else
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is not included in the
+// block header
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+inline CacheAllocationPtr BZip2_Uncompress(
+    const char* input_data, size_t input_length, size_t* uncompressed_size,
+    uint32_t compress_format_version, MemoryAllocator* allocator = nullptr) {
+#ifdef BZIP2
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // Assume the decompressed data size will 5x of compressed size, but round
+    // to the next page size
+    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
+    output_len = static_cast<uint32_t>(
+        std::min(proposed_output_len,
+                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
+  }
+
+  bz_stream _stream;
+  memset(&_stream, 0, sizeof(bz_stream));
+
+  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
+  if (st != BZ_OK) {
+    return nullptr;
+  }
+
+  _stream.next_in = (char*)input_data;
+  _stream.avail_in = static_cast<unsigned int>(input_length);
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  _stream.next_out = (char*)output.get();
+  _stream.avail_out = static_cast<unsigned int>(output_len);
+
+  bool done = false;
+  while (!done) {
+    st = BZ2_bzDecompress(&_stream);
+    switch (st) {
+      case BZ_STREAM_END:
+        done = true;
+        break;
+      case BZ_OK: {
+        // No output space. Increase the output space by 20%.
+        // We should never run out of output space if
+        // compress_format_version == 2
+        assert(compress_format_version != 2);
+        uint32_t old_sz = output_len;
+        output_len = output_len * 1.2;
+        auto tmp = AllocateBlock(output_len, allocator);
+        memcpy(tmp.get(), output.get(), old_sz);
+        output = std::move(tmp);
+
+        // Set more output.
+        _stream.next_out = (char*)(output.get() + old_sz);
+        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
+        break;
+      }
+      default:
+        BZ2_bzDecompressEnd(&_stream);
+        return nullptr;
+    }
+  }
+
+  // If we encoded decompressed block size, we should have no bytes left
+  assert(compress_format_version != 2 || _stream.avail_out == 0);
+  assert(output_len >= _stream.avail_out);
+  *uncompressed_size = output_len - _stream.avail_out;
+  BZ2_bzDecompressEnd(&_stream);
+  return output;
+#else
+  (void)input_data;
+  (void)input_length;
+  (void)uncompressed_size;
+  (void)compress_format_version;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes database non-portable)
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+//    dictionary.
+inline bool LZ4_Compress(const CompressionInfo& info,
+                         uint32_t compress_format_version, const char* input,
+                         size_t length, ::std::string* output) {
+#ifdef LZ4
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    output_header_len = 8;
+    output->resize(output_header_len);
+    char* p = const_cast<char*>(output->c_str());
+    memcpy(p, &length, sizeof(length));
+  }
+  int compress_bound = LZ4_compressBound(static_cast<int>(length));
+  output->resize(static_cast<size_t>(output_header_len + compress_bound));
+
+  int outlen;
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_stream_t* stream = LZ4_createStream();
+  Slice compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    LZ4_loadDict(stream, compression_dict.data(),
+                 static_cast<int>(compression_dict.size()));
+  }
+#if LZ4_VERSION_NUMBER >= 10700  // r129+
+  outlen =
+      LZ4_compress_fast_continue(stream, input, &(*output)[output_header_len],
+                                 static_cast<int>(length), compress_bound, 1);
+#else  // up to r128
+  outlen = LZ4_compress_limitedOutput_continue(
+      stream, input, &(*output)[output_header_len], static_cast<int>(length),
+      compress_bound);
+#endif
+  LZ4_freeStream(stream);
+#else   // up to r123
+  outlen = LZ4_compress_limitedOutput(input, &(*output)[output_header_len],
+                                      static_cast<int>(length), compress_bound);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(static_cast<size_t>(output_header_len + outlen));
+  return true;
+#else  // LZ4
+  (void)info;
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes database non-portable)
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+//    dictionary.
+inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info,
+                                         const char* input_data,
+                                         size_t input_length,
+                                         size_t* uncompressed_size,
+                                         uint32_t compress_format_version,
+                                         MemoryAllocator* allocator = nullptr) {
+#ifdef LZ4
+  uint32_t output_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                              &output_len)) {
+      return nullptr;
+    }
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    if (input_length < 8) {
+      return nullptr;
+    }
+    if (port::kLittleEndian) {
+      memcpy(&output_len, input_data, sizeof(output_len));
+    } else {
+      memcpy(&output_len, input_data + 4, sizeof(output_len));
+    }
+    input_length -= 8;
+    input_data += 8;
+  }
+
+  auto output = AllocateBlock(output_len, allocator);
+
+  int decompress_bytes = 0;
+
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_streamDecode_t* stream = LZ4_createStreamDecode();
+  const Slice& compression_dict = info.dict().GetRawDict();
+  if (compression_dict.size()) {
+    LZ4_setStreamDecode(stream, compression_dict.data(),
+                        static_cast<int>(compression_dict.size()));
+  }
+  decompress_bytes = LZ4_decompress_safe_continue(
+      stream, input_data, output.get(), static_cast<int>(input_length),
+      static_cast<int>(output_len));
+  LZ4_freeStreamDecode(stream);
+#else   // up to r123
+  decompress_bytes = LZ4_decompress_safe(input_data, output.get(),
+                                         static_cast<int>(input_length),
+                                         static_cast<int>(output_len));
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (decompress_bytes < 0) {
+    return nullptr;
+  }
+  assert(decompress_bytes == static_cast<int>(output_len));
+  *uncompressed_size = decompress_bytes;
+  return output;
+#else  // LZ4
+  (void)info;
+  (void)input_data;
+  (void)input_length;
+  (void)uncompressed_size;
+  (void)compress_format_version;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+// compress_format_version == 1 -- decompressed size is included in the
+// block header using memcpy, which makes database non-portable)
+// compress_format_version == 2 -- decompressed size is included in the block
+// header in varint32 format
+// @param compression_dict Data for presetting the compression library's
+//    dictionary.
+inline bool LZ4HC_Compress(const CompressionInfo& info,
+                           uint32_t compress_format_version, const char* input,
+                           size_t length, ::std::string* output) {
+#ifdef LZ4
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = 0;
+  if (compress_format_version == 2) {
+    // new encoding, using varint32 to store size information
+    output_header_len = compression::PutDecompressedSizeInfo(
+        output, static_cast<uint32_t>(length));
+  } else {
+    // legacy encoding, which is not really portable (depends on big/little
+    // endianness)
+    output_header_len = 8;
+    output->resize(output_header_len);
+    char* p = const_cast<char*>(output->c_str());
+    memcpy(p, &length, sizeof(length));
+  }
+  int compress_bound = LZ4_compressBound(static_cast<int>(length));
+  output->resize(static_cast<size_t>(output_header_len + compress_bound));
+
+  int outlen;
+  int level;
+  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
+    level = 0;  // lz4hc.h says any value < 1 will be sanitized to default
+  } else {
+    level = info.options().level;
+  }
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_streamHC_t* stream = LZ4_createStreamHC();
+  LZ4_resetStreamHC(stream, level);
+  Slice compression_dict = info.dict().GetRawDict();
+  const char* compression_dict_data =
+      compression_dict.size() > 0 ? compression_dict.data() : nullptr;
+  size_t compression_dict_size = compression_dict.size();
+  if (compression_dict_data != nullptr) {
+    LZ4_loadDictHC(stream, compression_dict_data,
+                   static_cast<int>(compression_dict_size));
+  }
+
+#if LZ4_VERSION_NUMBER >= 10700  // r129+
+  outlen =
+      LZ4_compress_HC_continue(stream, input, &(*output)[output_header_len],
+                               static_cast<int>(length), compress_bound);
+#else   // r124-r128
+  outlen = LZ4_compressHC_limitedOutput_continue(
+      stream, input, &(*output)[output_header_len], static_cast<int>(length),
+      compress_bound);
+#endif  // LZ4_VERSION_NUMBER >= 10700
+  LZ4_freeStreamHC(stream);
+
+#elif LZ4_VERSION_MAJOR  // r113-r123
+  outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len],
+                                         static_cast<int>(length),
+                                         compress_bound, level);
+#else                    // up to r112
+  outlen =
+      LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len],
+                                   static_cast<int>(length), compress_bound);
+#endif                   // LZ4_VERSION_NUMBER >= 10400
+
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(static_cast<size_t>(output_header_len + outlen));
+  return true;
+#else  // LZ4
+  (void)info;
+  (void)compress_format_version;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+#ifdef XPRESS
+inline bool XPRESS_Compress(const char* input, size_t length,
+                            std::string* output) {
+  return port::xpress::Compress(input, length, output);
+}
+#else
+inline bool XPRESS_Compress(const char* /*input*/, size_t /*length*/,
+                            std::string* /*output*/) {
+  return false;
+}
+#endif
+
+#ifdef XPRESS
+inline char* XPRESS_Uncompress(const char* input_data, size_t input_length,
+                               size_t* uncompressed_size) {
+  return port::xpress::Decompress(input_data, input_length, uncompressed_size);
+}
+#else
+inline char* XPRESS_Uncompress(const char* /*input_data*/,
+                               size_t /*input_length*/,
+                               size_t* /*uncompressed_size*/) {
+  return nullptr;
+}
+#endif
+
+inline bool ZSTD_Compress(const CompressionInfo& info, const char* input,
+                          size_t length, ::std::string* output) {
+#ifdef ZSTD
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = compression::PutDecompressedSizeInfo(
+      output, static_cast<uint32_t>(length));
+
+  size_t compressBound = ZSTD_compressBound(length);
+  output->resize(static_cast<size_t>(output_header_len + compressBound));
+  size_t outlen = 0;
+  int level;
+  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
+    // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see
+    // https://github.com/facebook/zstd/issues/1148
+    level = 3;
+  } else {
+    level = info.options().level;
+  }
+#if ZSTD_VERSION_NUMBER >= 500  // v0.5.0+
+  ZSTD_CCtx* context = info.context().ZSTDPreallocCtx();
+  assert(context != nullptr);
+#if ZSTD_VERSION_NUMBER >= 700  // v0.7.0+
+  if (info.dict().GetDigestedZstdCDict() != nullptr) {
+    outlen = ZSTD_compress_usingCDict(context, &(*output)[output_header_len],
+                                      compressBound, input, length,
+                                      info.dict().GetDigestedZstdCDict());
+  }
+#endif  // ZSTD_VERSION_NUMBER >= 700
+  if (outlen == 0) {
+    outlen = ZSTD_compress_usingDict(context, &(*output)[output_header_len],
+                                     compressBound, input, length,
+                                     info.dict().GetRawDict().data(),
+                                     info.dict().GetRawDict().size(), level);
+  }
+#else   // up to v0.4.x
+  outlen = ZSTD_compress(&(*output)[output_header_len], compressBound, input,
+                         length, level);
+#endif  // ZSTD_VERSION_NUMBER >= 500
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(output_header_len + outlen);
+  return true;
+#else  // ZSTD
+  (void)info;
+  (void)input;
+  (void)length;
+  (void)output;
+  return false;
+#endif
+}
+
+// @param compression_dict Data for presetting the compression library's
+//    dictionary.
+inline CacheAllocationPtr ZSTD_Uncompress(
+    const UncompressionInfo& info, const char* input_data, size_t input_length,
+    size_t* uncompressed_size, MemoryAllocator* allocator = nullptr) {
+#ifdef ZSTD
+  uint32_t output_len = 0;
+  if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                            &output_len)) {
+    return nullptr;
+  }
+
+  auto output = AllocateBlock(output_len, allocator);
+  size_t actual_output_length = 0;
+#if ZSTD_VERSION_NUMBER >= 500  // v0.5.0+
+  ZSTD_DCtx* context = info.context().GetZSTDContext();
+  assert(context != nullptr);
+#ifdef ROCKSDB_ZSTD_DDICT
+  if (info.dict().GetDigestedZstdDDict() != nullptr) {
+    actual_output_length = ZSTD_decompress_usingDDict(
+        context, output.get(), output_len, input_data, input_length,
+        info.dict().GetDigestedZstdDDict());
+  }
+#endif  // ROCKSDB_ZSTD_DDICT
+  if (actual_output_length == 0) {
+    actual_output_length = ZSTD_decompress_usingDict(
+        context, output.get(), output_len, input_data, input_length,
+        info.dict().GetRawDict().data(), info.dict().GetRawDict().size());
+  }
+#else   // up to v0.4.x
+  (void)info;
+  actual_output_length =
+      ZSTD_decompress(output.get(), output_len, input_data, input_length);
+#endif  // ZSTD_VERSION_NUMBER >= 500
+  assert(actual_output_length == output_len);
+  *uncompressed_size = actual_output_length;
+  return output;
+#else  // ZSTD
+  (void)info;
+  (void)input_data;
+  (void)input_length;
+  (void)uncompressed_size;
+  (void)allocator;
+  return nullptr;
+#endif
+}
+
+inline bool ZSTD_TrainDictionarySupported() {
+#ifdef ZSTD
+  // Dictionary trainer is available since v0.6.1 for static linking, but not
+  // available for dynamic linking until v1.1.3. For now we enable the feature
+  // in v1.1.3+ only.
+  return (ZSTD_versionNumber() >= 10103);
+#else
+  return false;
+#endif
+}
+
+inline std::string ZSTD_TrainDictionary(const std::string& samples,
+                                        const std::vector<size_t>& sample_lens,
+                                        size_t max_dict_bytes) {
+  // Dictionary trainer is available since v0.6.1 for static linking, but not
+  // available for dynamic linking until v1.1.3. For now we enable the feature
+  // in v1.1.3+ only.
+#if ZSTD_VERSION_NUMBER >= 10103  // v1.1.3+
+  assert(samples.empty() == sample_lens.empty());
+  if (samples.empty()) {
+    return "";
+  }
+  std::string dict_data(max_dict_bytes, '\0');
+  size_t dict_len = ZDICT_trainFromBuffer(
+      &dict_data[0], max_dict_bytes, &samples[0], &sample_lens[0],
+      static_cast<unsigned>(sample_lens.size()));
+  if (ZDICT_isError(dict_len)) {
+    return "";
+  }
+  assert(dict_len <= max_dict_bytes);
+  dict_data.resize(dict_len);
+  return dict_data;
+#else   // up to v1.1.2
+  assert(false);
+  (void)samples;
+  (void)sample_lens;
+  (void)max_dict_bytes;
+  return "";
+#endif  // ZSTD_VERSION_NUMBER >= 10103
+}
+
+inline std::string ZSTD_TrainDictionary(const std::string& samples,
+                                        size_t sample_len_shift,
+                                        size_t max_dict_bytes) {
+  // Dictionary trainer is available since v0.6.1, but ZSTD was marked stable
+  // only since v0.8.0. For now we enable the feature in stable versions only.
+#if ZSTD_VERSION_NUMBER >= 10103  // v1.1.3+
+  // skips potential partial sample at the end of "samples"
+  size_t num_samples = samples.size() >> sample_len_shift;
+  std::vector<size_t> sample_lens(num_samples, size_t(1) << sample_len_shift);
+  return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes);
+#else   // up to v1.1.2
+  assert(false);
+  (void)samples;
+  (void)sample_len_shift;
+  (void)max_dict_bytes;
+  return "";
+#endif  // ZSTD_VERSION_NUMBER >= 10103
+}
+
+inline bool ZSTD_FinalizeDictionarySupported() {
+#ifdef ZSTD
+  // ZDICT_finalizeDictionary API is stable since v1.4.5
+  return (ZSTD_versionNumber() >= 10405);
+#else
+  return false;
+#endif
+}
+
+inline std::string ZSTD_FinalizeDictionary(
+    const std::string& samples, const std::vector<size_t>& sample_lens,
+    size_t max_dict_bytes, int level) {
+  // ZDICT_finalizeDictionary is stable since version v1.4.5
+#if ZSTD_VERSION_NUMBER >= 10405  // v1.4.5+
+  assert(samples.empty() == sample_lens.empty());
+  if (samples.empty()) {
+    return "";
+  }
+  if (level == CompressionOptions::kDefaultCompressionLevel) {
+    // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see
+    // https://github.com/facebook/zstd/issues/1148
+    level = 3;
+  }
+  std::string dict_data(max_dict_bytes, '\0');
+  size_t dict_len = ZDICT_finalizeDictionary(
+      dict_data.data(), max_dict_bytes, samples.data(),
+      std::min(static_cast<size_t>(samples.size()), max_dict_bytes),
+      samples.data(), sample_lens.data(),
+      static_cast<unsigned>(sample_lens.size()),
+      {level, 0 /* notificationLevel */, 0 /* dictID */});
+  if (ZDICT_isError(dict_len)) {
+    return "";
+  } else {
+    assert(dict_len <= max_dict_bytes);
+    dict_data.resize(dict_len);
+    return dict_data;
+  }
+#else   // up to v1.4.4
+  (void)samples;
+  (void)sample_lens;
+  (void)max_dict_bytes;
+  (void)level;
+  return "";
+#endif  // ZSTD_VERSION_NUMBER >= 10405
+}
+
+inline bool CompressData(const Slice& raw,
+                         const CompressionInfo& compression_info,
+                         uint32_t compress_format_version,
+                         std::string* compressed_output) {
+  bool ret = false;
+
+  // Will return compressed block contents if (1) the compression method is
+  // supported in this platform and (2) the compression rate is "good enough".
+  switch (compression_info.type()) {
+    case kSnappyCompression:
+      ret = Snappy_Compress(compression_info, raw.data(), raw.size(),
+                            compressed_output);
+      break;
+    case kZlibCompression:
+      ret = Zlib_Compress(compression_info, compress_format_version, raw.data(),
+                          raw.size(), compressed_output);
+      break;
+    case kBZip2Compression:
+      ret = BZip2_Compress(compression_info, compress_format_version,
+                           raw.data(), raw.size(), compressed_output);
+      break;
+    case kLZ4Compression:
+      ret = LZ4_Compress(compression_info, compress_format_version, raw.data(),
+                         raw.size(), compressed_output);
+      break;
+    case kLZ4HCCompression:
+      ret = LZ4HC_Compress(compression_info, compress_format_version,
+                           raw.data(), raw.size(), compressed_output);
+      break;
+    case kXpressCompression:
+      ret = XPRESS_Compress(raw.data(), raw.size(), compressed_output);
+      break;
+    case kZSTD:
+    case kZSTDNotFinalCompression:
+      ret = ZSTD_Compress(compression_info, raw.data(), raw.size(),
+                          compressed_output);
+      break;
+    default:
+      // Do not recognize this compression type
+      break;
+  }
+
+  TEST_SYNC_POINT_CALLBACK("CompressData:TamperWithReturnValue",
+                           static_cast<void*>(&ret));
+
+  return ret;
+}
+
+inline CacheAllocationPtr UncompressData(
+    const UncompressionInfo& uncompression_info, const char* data, size_t n,
+    size_t* uncompressed_size, uint32_t compress_format_version,
+    MemoryAllocator* allocator = nullptr) {
+  switch (uncompression_info.type()) {
+    case kSnappyCompression:
+      return Snappy_Uncompress(data, n, uncompressed_size, allocator);
+    case kZlibCompression:
+      return Zlib_Uncompress(uncompression_info, data, n, uncompressed_size,
+                             compress_format_version, allocator);
+    case kBZip2Compression:
+      return BZip2_Uncompress(data, n, uncompressed_size,
+                              compress_format_version, allocator);
+    case kLZ4Compression:
+    case kLZ4HCCompression:
+      return LZ4_Uncompress(uncompression_info, data, n, uncompressed_size,
+                            compress_format_version, allocator);
+    case kXpressCompression:
+      // XPRESS allocates memory internally, thus no support for custom
+      // allocator.
+      return CacheAllocationPtr(XPRESS_Uncompress(data, n, uncompressed_size));
+    case kZSTD:
+    case kZSTDNotFinalCompression:
+      return ZSTD_Uncompress(uncompression_info, data, n, uncompressed_size,
+                             allocator);
+    default:
+      return CacheAllocationPtr();
+  }
+}
+
+// Records the compression type for subsequent WAL records.
+class CompressionTypeRecord {
+ public:
+  explicit CompressionTypeRecord(CompressionType compression_type)
+      : compression_type_(compression_type) {}
+
+  CompressionType GetCompressionType() const { return compression_type_; }
+
+  inline void EncodeTo(std::string* dst) const {
+    assert(dst != nullptr);
+    PutFixed32(dst, compression_type_);
+  }
+
+  inline Status DecodeFrom(Slice* src) {
+    constexpr char class_name[] = "CompressionTypeRecord";
+
+    uint32_t val;
+    if (!GetFixed32(src, &val)) {
+      return Status::Corruption(class_name,
+                                "Error decoding WAL compression type");
+    }
+    CompressionType compression_type = static_cast<CompressionType>(val);
+    if (!StreamingCompressionTypeSupported(compression_type)) {
+      return Status::Corruption(class_name,
+                                "WAL compression type not supported");
+    }
+    compression_type_ = compression_type;
+    return Status::OK();
+  }
+
+  inline std::string DebugString() const {
+    return "compression_type: " + CompressionTypeToString(compression_type_);
+  }
+
+ private:
+  CompressionType compression_type_;
+};
+
+// Base class to implement compression for a stream of buffers.
+// Instantiate an implementation of the class using Create() with the
+// compression type and use Compress() repeatedly.
+// The output buffer needs to be at least max_output_len.
+// Call Reset() in between frame boundaries or in case of an error.
+// NOTE: This class is not thread safe.
+class StreamingCompress {
+ public:
+  StreamingCompress(CompressionType compression_type,
+                    const CompressionOptions& opts,
+                    uint32_t compress_format_version, size_t max_output_len)
+      : compression_type_(compression_type),
+        opts_(opts),
+        compress_format_version_(compress_format_version),
+        max_output_len_(max_output_len) {}
+  virtual ~StreamingCompress() = default;
+  // compress should be called repeatedly with the same input till the method
+  // returns 0
+  // Parameters:
+  // input - buffer to compress
+  // input_size - size of input buffer
+  // output - compressed buffer allocated by caller, should be at least
+  // max_output_len
+  // output_size - size of the output buffer
+  // Returns -1 for errors, the remaining size of the input buffer that needs to
+  // be compressed
+  virtual int Compress(const char* input, size_t input_size, char* output,
+                       size_t* output_pos) = 0;
+  // static method to create object of a class inherited from StreamingCompress
+  // based on the actual compression type.
+  static StreamingCompress* Create(CompressionType compression_type,
+                                   const CompressionOptions& opts,
+                                   uint32_t compress_format_version,
+                                   size_t max_output_len);
+  virtual void Reset() = 0;
+
+ protected:
+  const CompressionType compression_type_;
+  const CompressionOptions opts_;
+  const uint32_t compress_format_version_;
+  const size_t max_output_len_;
+};
+
+// Base class to uncompress a stream of compressed buffers.
+// Instantiate an implementation of the class using Create() with the
+// compression type and use Uncompress() repeatedly.
+// The output buffer needs to be at least max_output_len.
+// Call Reset() in between frame boundaries or in case of an error.
+// NOTE: This class is not thread safe.
+class StreamingUncompress {
+ public:
+  StreamingUncompress(CompressionType compression_type,
+                      uint32_t compress_format_version, size_t max_output_len)
+      : compression_type_(compression_type),
+        compress_format_version_(compress_format_version),
+        max_output_len_(max_output_len) {}
+  virtual ~StreamingUncompress() = default;
+  // uncompress should be called again with the same input if output_size is
+  // equal to max_output_len or with the next input fragment.
+  // Parameters:
+  // input - buffer to uncompress
+  // input_size - size of input buffer
+  // output - uncompressed buffer allocated by caller, should be at least
+  // max_output_len
+  // output_size - size of the output buffer
+  // Returns -1 for errors, remaining input to be processed otherwise.
+  virtual int Uncompress(const char* input, size_t input_size, char* output,
+                         size_t* output_pos) = 0;
+  static StreamingUncompress* Create(CompressionType compression_type,
+                                     uint32_t compress_format_version,
+                                     size_t max_output_len);
+  virtual void Reset() = 0;
+
+ protected:
+  CompressionType compression_type_;
+  uint32_t compress_format_version_;
+  size_t max_output_len_;
+};
+
+class ZSTDStreamingCompress final : public StreamingCompress {
+ public:
+  explicit ZSTDStreamingCompress(const CompressionOptions& opts,
+                                 uint32_t compress_format_version,
+                                 size_t max_output_len)
+      : StreamingCompress(kZSTD, opts, compress_format_version,
+                          max_output_len) {
+#ifdef ZSTD_STREAMING
+    cctx_ = ZSTD_createCCtx();
+    // Each compressed frame will have a checksum
+    ZSTD_CCtx_setParameter(cctx_, ZSTD_c_checksumFlag, 1);
+    assert(cctx_ != nullptr);
+    input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0};
+#endif
+  }
+  ~ZSTDStreamingCompress() override {
+#ifdef ZSTD_STREAMING
+    ZSTD_freeCCtx(cctx_);
+#endif
+  }
+  int Compress(const char* input, size_t input_size, char* output,
+               size_t* output_pos) override;
+  void Reset() override;
+#ifdef ZSTD_STREAMING
+  ZSTD_CCtx* cctx_;
+  ZSTD_inBuffer input_buffer_;
+#endif
+};
+
+class ZSTDStreamingUncompress final : public StreamingUncompress {
+ public:
+  explicit ZSTDStreamingUncompress(uint32_t compress_format_version,
+                                   size_t max_output_len)
+      : StreamingUncompress(kZSTD, compress_format_version, max_output_len) {
+#ifdef ZSTD_STREAMING
+    dctx_ = ZSTD_createDCtx();
+    assert(dctx_ != nullptr);
+    input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0};
+#endif
+  }
+  ~ZSTDStreamingUncompress() override {
+#ifdef ZSTD_STREAMING
+    ZSTD_freeDCtx(dctx_);
+#endif
+  }
+  int Uncompress(const char* input, size_t input_size, char* output,
+                 size_t* output_size) override;
+  void Reset() override;
+
+ private:
+#ifdef ZSTD_STREAMING
+  ZSTD_DCtx* dctx_;
+  ZSTD_inBuffer input_buffer_;
+#endif
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/compression_context_cache.cc b/src/rocksdb/util/compression_context_cache.cc
new file mode 100644
index 000000000..52c3fac72
--- /dev/null
+++ b/src/rocksdb/util/compression_context_cache.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#include "util/compression_context_cache.h"
+
+#include <atomic>
+
+#include "util/compression.h"
+#include "util/core_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace compression_cache {
+
+void* const SentinelValue = nullptr;
+// Cache ZSTD uncompression contexts for reads
+// if needed we can add ZSTD compression context caching
+// which is currently is not done since BlockBasedTableBuilder
+// simply creates one compression context per new SST file.
+struct ZSTDCachedData {
+  // We choose to cache the below structure instead of a ptr
+  // because we want to avoid a) native types leak b) make
+  // cache use transparent for the user
+  ZSTDUncompressCachedData uncomp_cached_data_;
+  std::atomic<void*> zstd_uncomp_sentinel_;
+
+  char
+      padding[(CACHE_LINE_SIZE -
+               (sizeof(ZSTDUncompressCachedData) + sizeof(std::atomic<void*>)) %
+                   CACHE_LINE_SIZE)];  // unused padding field
+
+  ZSTDCachedData() : zstd_uncomp_sentinel_(&uncomp_cached_data_) {}
+  ZSTDCachedData(const ZSTDCachedData&) = delete;
+  ZSTDCachedData& operator=(const ZSTDCachedData&) = delete;
+
+  ZSTDUncompressCachedData GetUncompressData(int64_t idx) {
+    ZSTDUncompressCachedData result;
+    void* expected = &uncomp_cached_data_;
+    if (zstd_uncomp_sentinel_.compare_exchange_strong(expected,
+                                                      SentinelValue)) {
+      uncomp_cached_data_.CreateIfNeeded();
+      result.InitFromCache(uncomp_cached_data_, idx);
+    } else {
+      // Creates one time use data
+      result.CreateIfNeeded();
+    }
+    return result;
+  }
+  // Return the entry back into circulation
+  // This is executed only when we successfully obtained
+  // in the first place
+  void ReturnUncompressData() {
+    if (zstd_uncomp_sentinel_.exchange(&uncomp_cached_data_) != SentinelValue) {
+      // Means we are returning while not having it acquired.
+      assert(false);
+    }
+  }
+};
+static_assert(sizeof(ZSTDCachedData) % CACHE_LINE_SIZE == 0,
+              "Expected CACHE_LINE_SIZE alignment");
+}  // namespace compression_cache
+
+class CompressionContextCache::Rep {
+ public:
+  Rep() {}
+  ZSTDUncompressCachedData GetZSTDUncompressData() {
+    auto p = per_core_uncompr_.AccessElementAndIndex();
+    int64_t idx = static_cast<int64_t>(p.second);
+    return p.first->GetUncompressData(idx);
+  }
+  void ReturnZSTDUncompressData(int64_t idx) {
+    assert(idx >= 0);
+    auto* cn = per_core_uncompr_.AccessAtCore(static_cast<size_t>(idx));
+    cn->ReturnUncompressData();
+  }
+
+ private:
+  CoreLocalArray<compression_cache::ZSTDCachedData> per_core_uncompr_;
+};
+
+CompressionContextCache::CompressionContextCache() : rep_(new Rep()) {}
+
+CompressionContextCache* CompressionContextCache::Instance() {
+  static CompressionContextCache instance;
+  return &instance;
+}
+
+void CompressionContextCache::InitSingleton() { Instance(); }
+
+ZSTDUncompressCachedData
+CompressionContextCache::GetCachedZSTDUncompressData() {
+  return rep_->GetZSTDUncompressData();
+}
+
+void CompressionContextCache::ReturnCachedZSTDUncompressData(int64_t idx) {
+  rep_->ReturnZSTDUncompressData(idx);
+}
+
+CompressionContextCache::~CompressionContextCache() { delete rep_; }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/compression_context_cache.h b/src/rocksdb/util/compression_context_cache.h
new file mode 100644
index 000000000..7b7b2d507
--- /dev/null
+++ b/src/rocksdb/util/compression_context_cache.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+// Compression context cache allows to cache compression/uncompression contexts
+// This helps with Random Read latencies and reduces CPU utilization
+// Caching is implemented using CoreLocal facility. Compression/Uncompression
+// instances are cached on a per core basis using CoreLocalArray. A borrowed
+// instance is atomically replaced with a sentinel value for the time of being
+// used. If it turns out that another thread is already makes use of the
+// instance we still create one on the heap which is later is destroyed.
+
+#pragma once
+
+#include <stdint.h>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ZSTDUncompressCachedData;
+
+class CompressionContextCache {
+ public:
+  // Singleton
+  static CompressionContextCache* Instance();
+  static void InitSingleton();
+  CompressionContextCache(const CompressionContextCache&) = delete;
+  CompressionContextCache& operator=(const CompressionContextCache&) = delete;
+
+  ZSTDUncompressCachedData GetCachedZSTDUncompressData();
+  void ReturnCachedZSTDUncompressData(int64_t idx);
+
+ private:
+  // Singleton
+  CompressionContextCache();
+  ~CompressionContextCache();
+
+  class Rep;
+  Rep* rep_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.cc b/src/rocksdb/util/concurrent_task_limiter_impl.cc
new file mode 100644
index 000000000..a0fc7331f
--- /dev/null
+++ b/src/rocksdb/util/concurrent_task_limiter_impl.cc
@@ -0,0 +1,64 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/concurrent_task_limiter_impl.h"
+
+#include "rocksdb/concurrent_task_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ConcurrentTaskLimiterImpl::ConcurrentTaskLimiterImpl(
+    const std::string& name, int32_t max_outstanding_task)
+    : name_(name),
+      max_outstanding_tasks_{max_outstanding_task},
+      outstanding_tasks_{0} {}
+
+ConcurrentTaskLimiterImpl::~ConcurrentTaskLimiterImpl() {
+  assert(outstanding_tasks_ == 0);
+}
+
+const std::string& ConcurrentTaskLimiterImpl::GetName() const { return name_; }
+
+void ConcurrentTaskLimiterImpl::SetMaxOutstandingTask(int32_t limit) {
+  max_outstanding_tasks_.store(limit, std::memory_order_relaxed);
+}
+
+void ConcurrentTaskLimiterImpl::ResetMaxOutstandingTask() {
+  max_outstanding_tasks_.store(-1, std::memory_order_relaxed);
+}
+
+int32_t ConcurrentTaskLimiterImpl::GetOutstandingTask() const {
+  return outstanding_tasks_.load(std::memory_order_relaxed);
+}
+
+std::unique_ptr<TaskLimiterToken> ConcurrentTaskLimiterImpl::GetToken(
+    bool force) {
+  int32_t limit = max_outstanding_tasks_.load(std::memory_order_relaxed);
+  int32_t tasks = outstanding_tasks_.load(std::memory_order_relaxed);
+  // force = true, bypass the throttle.
+  // limit < 0 means unlimited tasks.
+  while (force || limit < 0 || tasks < limit) {
+    if (outstanding_tasks_.compare_exchange_weak(tasks, tasks + 1)) {
+      return std::unique_ptr<TaskLimiterToken>(new TaskLimiterToken(this));
+    }
+  }
+  return nullptr;
+}
+
+ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name,
+                                                int32_t limit) {
+  return new ConcurrentTaskLimiterImpl(name, limit);
+}
+
+TaskLimiterToken::~TaskLimiterToken() {
+  --limiter_->outstanding_tasks_;
+  assert(limiter_->outstanding_tasks_ >= 0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.h b/src/rocksdb/util/concurrent_task_limiter_impl.h
new file mode 100644
index 000000000..4952ae23a
--- /dev/null
+++ b/src/rocksdb/util/concurrent_task_limiter_impl.h
@@ -0,0 +1,67 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TaskLimiterToken;
+
+class ConcurrentTaskLimiterImpl : public ConcurrentTaskLimiter {
+ public:
+  explicit ConcurrentTaskLimiterImpl(const std::string& name,
+                                     int32_t max_outstanding_task);
+  // No copying allowed
+  ConcurrentTaskLimiterImpl(const ConcurrentTaskLimiterImpl&) = delete;
+  ConcurrentTaskLimiterImpl& operator=(const ConcurrentTaskLimiterImpl&) =
+      delete;
+
+  virtual ~ConcurrentTaskLimiterImpl();
+
+  virtual const std::string& GetName() const override;
+
+  virtual void SetMaxOutstandingTask(int32_t limit) override;
+
+  virtual void ResetMaxOutstandingTask() override;
+
+  virtual int32_t GetOutstandingTask() const override;
+
+  // Request token for adding a new task.
+  // If force == true, it requests a token bypassing throttle.
+  // Returns nullptr if it got throttled.
+  virtual std::unique_ptr<TaskLimiterToken> GetToken(bool force);
+
+ private:
+  friend class TaskLimiterToken;
+
+  std::string name_;
+  std::atomic<int32_t> max_outstanding_tasks_;
+  std::atomic<int32_t> outstanding_tasks_;
+};
+
+class TaskLimiterToken {
+ public:
+  explicit TaskLimiterToken(ConcurrentTaskLimiterImpl* limiter)
+      : limiter_(limiter) {}
+  ~TaskLimiterToken();
+
+ private:
+  ConcurrentTaskLimiterImpl* limiter_;
+
+  // no copying allowed
+  TaskLimiterToken(const TaskLimiterToken&) = delete;
+  void operator=(const TaskLimiterToken&) = delete;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/core_local.h b/src/rocksdb/util/core_local.h
new file mode 100644
index 000000000..b444a1152
--- /dev/null
+++ b/src/rocksdb/util/core_local.h
@@ -0,0 +1,83 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "port/likely.h"
+#include "port/port.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An array of core-local values. Ideally the value type, T, is cache aligned to
+// prevent false sharing.
+template <typename T>
+class CoreLocalArray {
+ public:
+  CoreLocalArray();
+
+  size_t Size() const;
+  // returns pointer to the element corresponding to the core that the thread
+  // currently runs on.
+  T* Access() const;
+  // same as above, but also returns the core index, which the client can cache
+  // to reduce how often core ID needs to be retrieved. Only do this if some
+  // inaccuracy is tolerable, as the thread may migrate to a different core.
+  std::pair<T*, size_t> AccessElementAndIndex() const;
+  // returns pointer to element for the specified core index. This can be used,
+  // e.g., for aggregation, or if the client caches core index.
+  T* AccessAtCore(size_t core_idx) const;
+
+ private:
+  std::unique_ptr<T[]> data_;
+  int size_shift_;
+};
+
+template <typename T>
+CoreLocalArray<T>::CoreLocalArray() {
+  int num_cpus = static_cast<int>(std::thread::hardware_concurrency());
+  // find a power of two >= num_cpus and >= 8
+  size_shift_ = 3;
+  while (1 << size_shift_ < num_cpus) {
+    ++size_shift_;
+  }
+  data_.reset(new T[static_cast<size_t>(1) << size_shift_]);
+}
+
+template <typename T>
+size_t CoreLocalArray<T>::Size() const {
+  return static_cast<size_t>(1) << size_shift_;
+}
+
+template <typename T>
+T* CoreLocalArray<T>::Access() const {
+  return AccessElementAndIndex().first;
+}
+
+template <typename T>
+std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const {
+  int cpuid = port::PhysicalCoreID();
+  size_t core_idx;
+  if (UNLIKELY(cpuid < 0)) {
+    // cpu id unavailable, just pick randomly
+    core_idx = Random::GetTLSInstance()->Uniform(1 << size_shift_);
+  } else {
+    core_idx = static_cast<size_t>(cpuid & ((1 << size_shift_) - 1));
+  }
+  return {AccessAtCore(core_idx), core_idx};
+}
+
+template <typename T>
+T* CoreLocalArray<T>::AccessAtCore(size_t core_idx) const {
+  assert(core_idx < static_cast<size_t>(1) << size_shift_);
+  return &data_[core_idx];
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/coro_utils.h b/src/rocksdb/util/coro_utils.h
new file mode 100644
index 000000000..5b4211135
--- /dev/null
+++ b/src/rocksdb/util/coro_utils.h
@@ -0,0 +1,112 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#if defined(USE_COROUTINES)
+#include "folly/experimental/coro/Coroutine.h"
+#include "folly/experimental/coro/Task.h"
+#endif
+#include "rocksdb/rocksdb_namespace.h"
+
+// This file has two sctions. The first section applies to all instances of
+// header file inclusion and has an include guard. The second section is
+// meant for multiple inclusions in the same source file, and is idempotent.
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef UTIL_CORO_UTILS_H_
+#define UTIL_CORO_UTILS_H_
+
+#if defined(USE_COROUTINES)
+
+// The follwoing macros expand to regular and coroutine function
+// declarations for a given function
+#define DECLARE_SYNC_AND_ASYNC(__ret_type__, __func_name__, ...) \
+  __ret_type__ __func_name__(__VA_ARGS__);                       \
+  folly::coro::Task<__ret_type__> __func_name__##Coroutine(__VA_ARGS__);
+
+#define DECLARE_SYNC_AND_ASYNC_OVERRIDE(__ret_type__, __func_name__, ...) \
+  __ret_type__ __func_name__(__VA_ARGS__) override;                       \
+  folly::coro::Task<__ret_type__> __func_name__##Coroutine(__VA_ARGS__)   \
+      override;
+
+#define DECLARE_SYNC_AND_ASYNC_CONST(__ret_type__, __func_name__, ...) \
+  __ret_type__ __func_name__(__VA_ARGS__) const;                       \
+  folly::coro::Task<__ret_type__> __func_name__##Coroutine(__VA_ARGS__) const;
+
+constexpr bool using_coroutines() { return true; }
+#else  // !USE_COROUTINES
+
+// The follwoing macros expand to a regular function declaration for a given
+// function
+#define DECLARE_SYNC_AND_ASYNC(__ret_type__, __func_name__, ...) \
+  __ret_type__ __func_name__(__VA_ARGS__);
+
+#define DECLARE_SYNC_AND_ASYNC_OVERRIDE(__ret_type__, __func_name__, ...) \
+  __ret_type__ __func_name__(__VA_ARGS__) override;
+
+#define DECLARE_SYNC_AND_ASYNC_CONST(__ret_type__, __func_name__, ...) \
+  __ret_type__ __func_name__(__VA_ARGS__) const;
+
+constexpr bool using_coroutines() { return false; }
+#endif  // USE_COROUTINES
+#endif  // UTIL_CORO_UTILS_H_
+
+// The following section of the file is meant to be included twice in a
+// source file - once defining WITH_COROUTINES and once defining
+// WITHOUT_COROUTINES
+#undef DEFINE_SYNC_AND_ASYNC
+#undef CO_AWAIT
+#undef CO_RETURN
+
+#if defined(WITH_COROUTINES) && defined(USE_COROUTINES)
+
+// This macro should be used in the beginning of the function
+// definition. The declaration should have been done using one of the
+// DECLARE_SYNC_AND_ASYNC* macros. It expands to the return type and
+// the function name with the Coroutine suffix. For example -
+// DEFINE_SYNC_AND_ASYNC(int, foo)(bool bar) {}
+// would expand to -
+// folly::coro::Task<int> fooCoroutine(bool bar) {}
+#define DEFINE_SYNC_AND_ASYNC(__ret_type__, __func_name__) \
+  folly::coro::Task<__ret_type__> __func_name__##Coroutine
+
+// This macro should be used to call a function that might be a
+// coroutine. It expands to the correct function name and prefixes
+// the co_await operator if necessary. For example -
+// s = CO_AWAIT(foo)(true);
+// if the code is compiled WITH_COROUTINES, would expand to
+// s = co_await fooCoroutine(true);
+// if compiled WITHOUT_COROUTINES, would expand to
+// s = foo(true);
+#define CO_AWAIT(__func_name__) co_await __func_name__##Coroutine
+
+#define CO_RETURN co_return
+
+#elif defined(WITHOUT_COROUTINES)
+
+// This macro should be used in the beginning of the function
+// definition. The declaration should have been done using one of the
+// DECLARE_SYNC_AND_ASYNC* macros. It expands to the return type and
+// the function name without the Coroutine suffix. For example -
+// DEFINE_SYNC_AND_ASYNC(int, foo)(bool bar) {}
+// would expand to -
+// int foo(bool bar) {}
+#define DEFINE_SYNC_AND_ASYNC(__ret_type__, __func_name__) \
+  __ret_type__ __func_name__
+
+// This macro should be used to call a function that might be a
+// coroutine. It expands to the correct function name and prefixes
+// the co_await operator if necessary. For example -
+// s = CO_AWAIT(foo)(true);
+// if the code is compiled WITH_COROUTINES, would expand to
+// s = co_await fooCoroutine(true);
+// if compiled WITHOUT_COROUTINES, would expand to
+// s = foo(true);
+#define CO_AWAIT(__func_name__) __func_name__
+
+#define CO_RETURN return
+
+#endif  // DO_NOT_USE_COROUTINES
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/crc32c.cc b/src/rocksdb/util/crc32c.cc
new file mode 100644
index 000000000..d71c71c2e
--- /dev/null
+++ b/src/rocksdb/util/crc32c.cc
@@ -0,0 +1,1351 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A portable implementation of crc32c, optimized to handle
+// four bytes at a time.
+#include "util/crc32c.h"
+
+#include <stdint.h>
+
+#include <array>
+#include <utility>
+#ifdef HAVE_SSE42
+#include <nmmintrin.h>
+#include <wmmintrin.h>
+#endif
+
+#include "port/lang.h"
+#include "util/coding.h"
+#include "util/crc32c_arm64.h"
+#include "util/math.h"
+
+#ifdef __powerpc64__
+#include "util/crc32c_ppc.h"
+#include "util/crc32c_ppc_constants.h"
+
+#if __linux__
+#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
+#include <sys/auxv.h>
+#endif
+
+#ifndef PPC_FEATURE2_VEC_CRYPTO
+#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
+#endif
+
+#ifndef AT_HWCAP2
+#define AT_HWCAP2 26
+#endif
+
+#elif __FreeBSD__
+#include <machine/cpu.h>
+#include <sys/auxv.h>
+#include <sys/elf_common.h>
+#endif /* __linux__ */
+
+#endif
+
+#if defined(HAVE_ARM64_CRC)
+bool pmull_runtime_flag = false;
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+namespace crc32c {
+
+#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
+#ifdef __powerpc64__
+static int arch_ppc_crc32 = 0;
+#endif /* __powerpc64__ */
+#endif
+
+static const uint32_t table0_[256] = {
+    0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c,
+    0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+    0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c,
+    0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+    0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc,
+    0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+    0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512,
+    0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+    0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad,
+    0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+    0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf,
+    0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+    0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f,
+    0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+    0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f,
+    0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+    0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e,
+    0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+    0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e,
+    0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+    0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de,
+    0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+    0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4,
+    0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+    0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b,
+    0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+    0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5,
+    0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+    0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975,
+    0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+    0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905,
+    0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+    0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8,
+    0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+    0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8,
+    0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+    0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78,
+    0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+    0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6,
+    0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+    0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69,
+    0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+    0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351};
+static const uint32_t table1_[256] = {
+    0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab,
+    0x69cf5132, 0x7a6dc945, 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
+    0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, 0x3fc5f181, 0x2c6769f6,
+    0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+    0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92,
+    0xcb1e630b, 0xd8bcfb7c, 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
+    0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, 0xe29f20ba, 0xf13db8cd,
+    0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+    0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28,
+    0x298143b1, 0x3a23dbc6, 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
+    0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, 0xff17c604, 0xecb55e73,
+    0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+    0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17,
+    0x0bcc548e, 0x186eccf9, 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
+    0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, 0x5dc6f43d, 0x4e646c4a,
+    0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+    0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad,
+    0xe9537434, 0xfaf1ec43, 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
+    0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, 0xbf59d487, 0xacfb4cf0,
+    0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+    0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94,
+    0x4b82460d, 0x5820de7a, 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
+    0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, 0x66d73941, 0x7575a136,
+    0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+    0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3,
+    0xadc95a4a, 0xbe6bc23d, 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
+    0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, 0x844819fb, 0x97ea818c,
+    0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+    0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8,
+    0x70938b71, 0x63311306, 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
+    0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, 0x26992bc2, 0x353bb3b5,
+    0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+    0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556,
+    0x6d1b6dcf, 0x7eb9f5b8, 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
+    0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, 0x3b11cd7c, 0x28b3550b,
+    0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+    0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f,
+    0xcfca5ff6, 0xdc68c781, 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
+    0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, 0xe64b1c47, 0xf5e98430,
+    0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+    0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5,
+    0x2d557f4c, 0x3ef7e73b, 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
+    0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483};
+static const uint32_t table2_[256] = {
+    0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664,
+    0xd1b1f617, 0x74f06469, 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
+    0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, 0x70a27d8a, 0xd5e3eff4,
+    0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+    0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b,
+    0x9942b558, 0x3c032726, 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
+    0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, 0xd915c5d1, 0x7c5457af,
+    0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+    0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa,
+    0x40577089, 0xe516e2f7, 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
+    0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, 0xc76580d9, 0x622412a7,
+    0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+    0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878,
+    0x2e85480b, 0x8bc4da75, 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
+    0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, 0x8f96c396, 0x2ad751e8,
+    0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+    0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9,
+    0xf7908dda, 0x52d11fa4, 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
+    0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, 0x56830647, 0xf3c29439,
+    0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+    0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6,
+    0xbf63ce95, 0x1a225ceb, 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
+    0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, 0xb3764986, 0x1637dbf8,
+    0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+    0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad,
+    0x2a34fcde, 0x8f756ea0, 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
+    0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, 0x6a638c57, 0xcf221e29,
+    0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+    0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6,
+    0x83834485, 0x26c2d6fb, 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
+    0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, 0x2290cf18, 0x87d15d66,
+    0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+    0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe,
+    0x9df3018d, 0x38b293f3, 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
+    0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, 0x3ce08a10, 0x99a1186e,
+    0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+    0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1,
+    0xd50042c2, 0x7041d0bc, 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
+    0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, 0x9557324b, 0x3016a035,
+    0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+    0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760,
+    0x0c158713, 0xa954156d, 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
+    0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8};
+static const uint32_t table3_[256] = {
+    0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b,
+    0xc4451272, 0x1900b8ca, 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
+    0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, 0xe964b13d, 0x34211b85,
+    0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+    0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990,
+    0xdb65c0a9, 0x06206a11, 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
+    0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, 0x2161776d, 0xfc24ddd5,
+    0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+    0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd,
+    0xfa04b7c4, 0x27411d7c, 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
+    0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, 0xaba65fe7, 0x76e3f55f,
+    0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+    0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a,
+    0x99a72e73, 0x44e284cb, 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
+    0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, 0xb4868d3c, 0x69c32784,
+    0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+    0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027,
+    0xb8c6591e, 0x6583f3a6, 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
+    0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, 0x95e7fa51, 0x48a250e9,
+    0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+    0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc,
+    0xa7e68bc5, 0x7aa3217d, 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
+    0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, 0xa4e4aad9, 0x79a10061,
+    0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+    0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349,
+    0x7f816a70, 0xa2c4c0c8, 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
+    0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, 0x8585ddb4, 0x58c0770c,
+    0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+    0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519,
+    0xb784ac20, 0x6ac10698, 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
+    0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, 0x9aa50f6f, 0x47e0a5d7,
+    0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+    0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93,
+    0x3d4384aa, 0xe0062e12, 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
+    0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, 0x106227e5, 0xcd278d5d,
+    0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+    0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48,
+    0x22635671, 0xff26fcc9, 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
+    0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, 0xd867e1b5, 0x05224b0d,
+    0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+    0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825,
+    0x0302211c, 0xde478ba4, 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
+    0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842};
+
+// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
+static inline uint32_t LE_LOAD32(const uint8_t* p) {
+  return DecodeFixed32(reinterpret_cast<const char*>(p));
+}
+
+#if defined(HAVE_SSE42) && (defined(__LP64__) || defined(_WIN64))
+static inline uint64_t LE_LOAD64(const uint8_t* p) {
+  return DecodeFixed64(reinterpret_cast<const char*>(p));
+}
+#endif
+
+static inline void Slow_CRC32(uint64_t* l, uint8_t const** p) {
+  uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
+  *p += 4;
+  *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^
+       table1_[(c >> 16) & 0xff] ^ table0_[c >> 24];
+  // DO it twice.
+  c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
+  *p += 4;
+  *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^
+       table1_[(c >> 16) & 0xff] ^ table0_[c >> 24];
+}
+
+#if (!(defined(HAVE_POWER8) && defined(HAS_ALTIVEC))) && \
+        (!defined(HAVE_ARM64_CRC)) ||                    \
+    defined(NO_THREEWAY_CRC32C)
+static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
+#ifndef HAVE_SSE42
+  Slow_CRC32(l, p);
+#elif defined(__LP64__) || defined(_WIN64)
+  *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
+  *p += 8;
+#else
+  *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+  *p += 4;
+  *l = _mm_crc32_u32(static_cast<unsigned int>(*l), LE_LOAD32(*p));
+  *p += 4;
+#endif
+}
+#endif
+
+template <void (*CRC32)(uint64_t*, uint8_t const**)>
+uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
+  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
+  const uint8_t* e = p + size;
+  uint64_t l = crc ^ 0xffffffffu;
+
+// Align n to (1 << m) byte boundary
+#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
+
+#define STEP1                  \
+  do {                         \
+    int c = (l & 0xff) ^ *p++; \
+    l = table0_[c] ^ (l >> 8); \
+  } while (0)
+
+  // Point x at first 16-byte aligned byte in string.  This might be
+  // just past the end of the string.
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
+  if (x <= e) {
+    // Process bytes until finished or p is 16-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e - p) >= 16) {
+    CRC32(&l, &p);
+    CRC32(&l, &p);
+  }
+  // Process bytes 8 at a time
+  while ((e - p) >= 8) {
+    CRC32(&l, &p);
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP1
+#undef ALIGN
+  return static_cast<uint32_t>(l ^ 0xffffffffu);
+}
+
+// Detect if ARM64 CRC or not.
+#ifndef HAVE_ARM64_CRC
+// Detect if SS42 or not.
+#ifndef HAVE_POWER8
+
+static bool isSSE42() {
+#ifndef HAVE_SSE42
+  return false;
+#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+  uint32_t c_;
+  __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
+  return c_ & (1U << 20);  // copied from CpuId.h in Folly. Test SSE42
+#elif defined(_WIN64)
+  int info[4];
+  __cpuidex(info, 0x00000001, 0);
+  return (info[2] & ((int)1 << 20)) != 0;
+#else
+  return false;
+#endif
+}
+
+static bool isPCLMULQDQ() {
+#ifndef HAVE_SSE42
+  // in build_detect_platform we set this macro when both SSE42 and PCLMULQDQ
+  // are supported by compiler
+  return false;
+#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+  uint32_t c_;
+  __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
+  return c_ & (1U << 1);  // PCLMULQDQ is in bit 1 (not bit 0)
+#elif defined(_WIN64)
+  int info[4];
+  __cpuidex(info, 0x00000001, 0);
+  return (info[2] & ((int)1 << 1)) != 0;
+#else
+  return false;
+#endif
+}
+
+#endif  // HAVE_POWER8
+#endif  // HAVE_ARM64_CRC
+
+using Function = uint32_t (*)(uint32_t, const char*, size_t);
+
+#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
+uint32_t ExtendPPCImpl(uint32_t crc, const char* buf, size_t size) {
+  return crc32c_ppc(crc, (const unsigned char*)buf, size);
+}
+
+#if __linux__
+static int arch_ppc_probe(void) {
+  arch_ppc_crc32 = 0;
+
+#if defined(__powerpc64__) && defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
+  if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1;
+#endif /* __powerpc64__ */
+
+  return arch_ppc_crc32;
+}
+#elif __FreeBSD__
+static int arch_ppc_probe(void) {
+  unsigned long cpufeatures;
+  arch_ppc_crc32 = 0;
+
+#if defined(__powerpc64__)
+  elf_aux_info(AT_HWCAP2, &cpufeatures, sizeof(cpufeatures));
+  if (cpufeatures & PPC_FEATURE2_HAS_VEC_CRYPTO) arch_ppc_crc32 = 1;
+#endif  /* __powerpc64__ */
+
+  return arch_ppc_crc32;
+}
+#endif  // __linux__
+
+static bool isAltiVec() {
+  if (arch_ppc_probe()) {
+    return true;
+  } else {
+    return false;
+  }
+}
+#endif
+
+#if defined(HAVE_ARM64_CRC)
+uint32_t ExtendARMImpl(uint32_t crc, const char* buf, size_t size) {
+  return crc32c_arm64(crc, (const unsigned char*)buf, size);
+}
+#endif
+
+std::string IsFastCrc32Supported() {
+  bool has_fast_crc = false;
+  std::string fast_zero_msg;
+  std::string arch;
+#ifdef HAVE_POWER8
+#ifdef HAS_ALTIVEC
+  if (arch_ppc_probe()) {
+    has_fast_crc = true;
+    arch = "PPC";
+  }
+#else
+  has_fast_crc = false;
+  arch = "PPC";
+#endif
+#elif defined(HAVE_ARM64_CRC)
+  if (crc32c_runtime_check()) {
+    has_fast_crc = true;
+    arch = "Arm64";
+    pmull_runtime_flag = crc32c_pmull_runtime_check();
+  } else {
+    has_fast_crc = false;
+    arch = "Arm64";
+  }
+#else
+  has_fast_crc = isSSE42();
+  arch = "x86";
+#endif
+  if (has_fast_crc) {
+    fast_zero_msg.append("Supported on " + arch);
+  } else {
+    fast_zero_msg.append("Not supported on " + arch);
+  }
+  return fast_zero_msg;
+}
+
+/*
+ * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
+ *  This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the author be held liable for any damages
+ * arising from the use of this software.
+ *  Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *  1. The origin of this software must not be misrepresented; you must not
+ *   claim that you wrote the original software. If you use this software
+ *   in a product, an acknowledgment in the product documentation would be
+ *   appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *   misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *  Ferry Toth
+ * ftoth@exalondelft.nl
+ *
+ * https://github.com/htot/crc32c
+ *
+ * Modified by Facebook
+ *
+ * Original intel whitepaper:
+ * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
+ * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ *
+ * This version is from the folly library, created by Dave Watson
+ * <davejwatson@fb.com>
+ *
+ */
+#if defined HAVE_SSE42 && defined HAVE_PCLMUL
+
+#define CRCtriplet(crc, buf, offset)                  \
+  crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
+  crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \
+  crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset));
+
+#define CRCduplet(crc, buf, offset)                   \
+  crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \
+  crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset));
+
+#define CRCsinglet(crc, buf, offset) \
+  crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset));
+
+// Numbers taken directly from intel whitepaper.
+// clang-format off
+const uint64_t clmul_constants[] = {
+    0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6,
+    0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e,
+    0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da,
+    0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8,
+    0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296,
+    0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2,
+    0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6,
+    0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092,
+    0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0,
+    0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456,
+    0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e,
+    0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a,
+    0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574,
+    0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832,
+    0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124,
+    0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86,
+    0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e,
+    0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a,
+    0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46,
+    0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a,
+    0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a,
+    0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4,
+    0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56,
+    0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2,
+    0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c,
+    0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac,
+    0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64,
+    0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e,
+    0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c,
+    0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28,
+    0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26,
+    0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c,
+    0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c,
+    0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c,
+    0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4,
+    0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844,
+    0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c,
+    0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730,
+    0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c,
+    0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2,
+    0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2,
+    0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e,
+    0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a,
+    0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a,
+    0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a,
+    0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768,
+    0x1ed5a407a, 0x0e78eb416, 0x0d2c3ed1a, 0x13c2b89c4,
+    0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c,
+    0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba,
+    0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312,
+    0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544,
+    0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a,
+    0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e,
+    0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a,
+    0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c,
+    0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a,
+    0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6,
+    0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca,
+    0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888,
+    0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e,
+    0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528,
+    0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a,
+    0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e,
+    0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa,
+};
+
+// Compute the crc32c value for buffer smaller than 8
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+inline void align_to_8(
+    size_t len,
+    uint64_t& crc0, // crc so far, updated on return
+    const unsigned char*& next) { // next data pointer, updated on return
+  uint32_t crc32bit = static_cast<uint32_t>(crc0);
+  if (len & 0x04) {
+    crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next);
+    next += sizeof(uint32_t);
+  }
+  if (len & 0x02) {
+    crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next);
+    next += sizeof(uint16_t);
+  }
+  if (len & 0x01) {
+    crc32bit = _mm_crc32_u8(crc32bit, *(next));
+    next++;
+  }
+  crc0 = crc32bit;
+}
+
+//
+// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well
+// chosen constant and xor's these with the remaining CRC.
+//
+inline uint64_t CombineCRC(
+    size_t block_size,
+    uint64_t crc0,
+    uint64_t crc1,
+    uint64_t crc2,
+    const uint64_t* next2) {
+  const auto multiplier =
+      *(reinterpret_cast<const __m128i*>(clmul_constants) + block_size - 1);
+  const auto crc0_xmm = _mm_set_epi64x(0, crc0);
+  const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00);
+  const auto crc1_xmm = _mm_set_epi64x(0, crc1);
+  const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10);
+  const auto res = _mm_xor_si128(res0, res1);
+  crc0 = _mm_cvtsi128_si64(res);
+  crc0 = crc0 ^ *((uint64_t*)next2 - 1);
+  crc2 = _mm_crc32_u64(crc2, crc0);
+  return crc2;
+}
+
+// Compute CRC-32C using the Intel hardware instruction.
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) {
+  const unsigned char* next = (const unsigned char*)buf;
+  uint64_t count;
+  uint64_t crc0, crc1, crc2;
+  crc0 = crc ^ 0xffffffffu;
+
+
+  if (len >= 8) {
+    // if len > 216 then align and use triplets
+    if (len > 216) {
+      {
+        // Work on the bytes (< 8) before the first 8-byte alignment addr starts
+        uint64_t align_bytes = (8 - (uintptr_t)next) & 7;
+        len -= align_bytes;
+        align_to_8(align_bytes, crc0, next);
+      }
+
+      // Now work on the remaining blocks
+      count = len / 24; // number of triplets
+      len %= 24; // bytes remaining
+      uint64_t n = count >> 7; // #blocks = first block + full blocks
+      uint64_t block_size = count & 127;
+      if (block_size == 0) {
+        block_size = 128;
+      } else {
+        n++;
+      }
+      // points to the first byte of the next block
+      const uint64_t* next0 = (uint64_t*)next + block_size;
+      const uint64_t* next1 = next0 + block_size;
+      const uint64_t* next2 = next1 + block_size;
+
+      crc1 = crc2 = 0;
+      // Use Duff's device, a for() loop inside a switch()
+      // statement. This needs to execute at least once, round len
+      // down to nearest triplet multiple
+      switch (block_size) {
+        case 128:
+          do {
+            // jumps here for a full block of len 128
+            CRCtriplet(crc, next, -128);
+            FALLTHROUGH_INTENDED;
+            case 127:
+              // jumps here or below for the first block smaller
+              CRCtriplet(crc, next, -127);
+              FALLTHROUGH_INTENDED;
+            case 126:
+              CRCtriplet(crc, next, -126); // than 128
+              FALLTHROUGH_INTENDED;
+            case 125:
+              CRCtriplet(crc, next, -125);
+              FALLTHROUGH_INTENDED;
+            case 124:
+              CRCtriplet(crc, next, -124);
+              FALLTHROUGH_INTENDED;
+            case 123:
+              CRCtriplet(crc, next, -123);
+              FALLTHROUGH_INTENDED;
+            case 122:
+              CRCtriplet(crc, next, -122);
+              FALLTHROUGH_INTENDED;
+            case 121:
+              CRCtriplet(crc, next, -121);
+              FALLTHROUGH_INTENDED;
+            case 120:
+              CRCtriplet(crc, next, -120);
+              FALLTHROUGH_INTENDED;
+            case 119:
+              CRCtriplet(crc, next, -119);
+              FALLTHROUGH_INTENDED;
+            case 118:
+              CRCtriplet(crc, next, -118);
+              FALLTHROUGH_INTENDED;
+            case 117:
+              CRCtriplet(crc, next, -117);
+              FALLTHROUGH_INTENDED;
+            case 116:
+              CRCtriplet(crc, next, -116);
+              FALLTHROUGH_INTENDED;
+            case 115:
+              CRCtriplet(crc, next, -115);
+              FALLTHROUGH_INTENDED;
+            case 114:
+              CRCtriplet(crc, next, -114);
+              FALLTHROUGH_INTENDED;
+            case 113:
+              CRCtriplet(crc, next, -113);
+              FALLTHROUGH_INTENDED;
+            case 112:
+              CRCtriplet(crc, next, -112);
+              FALLTHROUGH_INTENDED;
+            case 111:
+              CRCtriplet(crc, next, -111);
+              FALLTHROUGH_INTENDED;
+            case 110:
+              CRCtriplet(crc, next, -110);
+              FALLTHROUGH_INTENDED;
+            case 109:
+              CRCtriplet(crc, next, -109);
+              FALLTHROUGH_INTENDED;
+            case 108:
+              CRCtriplet(crc, next, -108);
+              FALLTHROUGH_INTENDED;
+            case 107:
+              CRCtriplet(crc, next, -107);
+              FALLTHROUGH_INTENDED;
+            case 106:
+              CRCtriplet(crc, next, -106);
+              FALLTHROUGH_INTENDED;
+            case 105:
+              CRCtriplet(crc, next, -105);
+              FALLTHROUGH_INTENDED;
+            case 104:
+              CRCtriplet(crc, next, -104);
+              FALLTHROUGH_INTENDED;
+            case 103:
+              CRCtriplet(crc, next, -103);
+              FALLTHROUGH_INTENDED;
+            case 102:
+              CRCtriplet(crc, next, -102);
+              FALLTHROUGH_INTENDED;
+            case 101:
+              CRCtriplet(crc, next, -101);
+              FALLTHROUGH_INTENDED;
+            case 100:
+              CRCtriplet(crc, next, -100);
+              FALLTHROUGH_INTENDED;
+            case 99:
+              CRCtriplet(crc, next, -99);
+              FALLTHROUGH_INTENDED;
+            case 98:
+              CRCtriplet(crc, next, -98);
+              FALLTHROUGH_INTENDED;
+            case 97:
+              CRCtriplet(crc, next, -97);
+              FALLTHROUGH_INTENDED;
+            case 96:
+              CRCtriplet(crc, next, -96);
+              FALLTHROUGH_INTENDED;
+            case 95:
+              CRCtriplet(crc, next, -95);
+              FALLTHROUGH_INTENDED;
+            case 94:
+              CRCtriplet(crc, next, -94);
+              FALLTHROUGH_INTENDED;
+            case 93:
+              CRCtriplet(crc, next, -93);
+              FALLTHROUGH_INTENDED;
+            case 92:
+              CRCtriplet(crc, next, -92);
+              FALLTHROUGH_INTENDED;
+            case 91:
+              CRCtriplet(crc, next, -91);
+              FALLTHROUGH_INTENDED;
+            case 90:
+              CRCtriplet(crc, next, -90);
+              FALLTHROUGH_INTENDED;
+            case 89:
+              CRCtriplet(crc, next, -89);
+              FALLTHROUGH_INTENDED;
+            case 88:
+              CRCtriplet(crc, next, -88);
+              FALLTHROUGH_INTENDED;
+            case 87:
+              CRCtriplet(crc, next, -87);
+              FALLTHROUGH_INTENDED;
+            case 86:
+              CRCtriplet(crc, next, -86);
+              FALLTHROUGH_INTENDED;
+            case 85:
+              CRCtriplet(crc, next, -85);
+              FALLTHROUGH_INTENDED;
+            case 84:
+              CRCtriplet(crc, next, -84);
+              FALLTHROUGH_INTENDED;
+            case 83:
+              CRCtriplet(crc, next, -83);
+              FALLTHROUGH_INTENDED;
+            case 82:
+              CRCtriplet(crc, next, -82);
+              FALLTHROUGH_INTENDED;
+            case 81:
+              CRCtriplet(crc, next, -81);
+              FALLTHROUGH_INTENDED;
+            case 80:
+              CRCtriplet(crc, next, -80);
+              FALLTHROUGH_INTENDED;
+            case 79:
+              CRCtriplet(crc, next, -79);
+              FALLTHROUGH_INTENDED;
+            case 78:
+              CRCtriplet(crc, next, -78);
+              FALLTHROUGH_INTENDED;
+            case 77:
+              CRCtriplet(crc, next, -77);
+              FALLTHROUGH_INTENDED;
+            case 76:
+              CRCtriplet(crc, next, -76);
+              FALLTHROUGH_INTENDED;
+            case 75:
+              CRCtriplet(crc, next, -75);
+              FALLTHROUGH_INTENDED;
+            case 74:
+              CRCtriplet(crc, next, -74);
+              FALLTHROUGH_INTENDED;
+            case 73:
+              CRCtriplet(crc, next, -73);
+              FALLTHROUGH_INTENDED;
+            case 72:
+              CRCtriplet(crc, next, -72);
+              FALLTHROUGH_INTENDED;
+            case 71:
+              CRCtriplet(crc, next, -71);
+              FALLTHROUGH_INTENDED;
+            case 70:
+              CRCtriplet(crc, next, -70);
+              FALLTHROUGH_INTENDED;
+            case 69:
+              CRCtriplet(crc, next, -69);
+              FALLTHROUGH_INTENDED;
+            case 68:
+              CRCtriplet(crc, next, -68);
+              FALLTHROUGH_INTENDED;
+            case 67:
+              CRCtriplet(crc, next, -67);
+              FALLTHROUGH_INTENDED;
+            case 66:
+              CRCtriplet(crc, next, -66);
+              FALLTHROUGH_INTENDED;
+            case 65:
+              CRCtriplet(crc, next, -65);
+              FALLTHROUGH_INTENDED;
+            case 64:
+              CRCtriplet(crc, next, -64);
+              FALLTHROUGH_INTENDED;
+            case 63:
+              CRCtriplet(crc, next, -63);
+              FALLTHROUGH_INTENDED;
+            case 62:
+              CRCtriplet(crc, next, -62);
+              FALLTHROUGH_INTENDED;
+            case 61:
+              CRCtriplet(crc, next, -61);
+              FALLTHROUGH_INTENDED;
+            case 60:
+              CRCtriplet(crc, next, -60);
+              FALLTHROUGH_INTENDED;
+            case 59:
+              CRCtriplet(crc, next, -59);
+              FALLTHROUGH_INTENDED;
+            case 58:
+              CRCtriplet(crc, next, -58);
+              FALLTHROUGH_INTENDED;
+            case 57:
+              CRCtriplet(crc, next, -57);
+              FALLTHROUGH_INTENDED;
+            case 56:
+              CRCtriplet(crc, next, -56);
+              FALLTHROUGH_INTENDED;
+            case 55:
+              CRCtriplet(crc, next, -55);
+              FALLTHROUGH_INTENDED;
+            case 54:
+              CRCtriplet(crc, next, -54);
+              FALLTHROUGH_INTENDED;
+            case 53:
+              CRCtriplet(crc, next, -53);
+              FALLTHROUGH_INTENDED;
+            case 52:
+              CRCtriplet(crc, next, -52);
+              FALLTHROUGH_INTENDED;
+            case 51:
+              CRCtriplet(crc, next, -51);
+              FALLTHROUGH_INTENDED;
+            case 50:
+              CRCtriplet(crc, next, -50);
+              FALLTHROUGH_INTENDED;
+            case 49:
+              CRCtriplet(crc, next, -49);
+              FALLTHROUGH_INTENDED;
+            case 48:
+              CRCtriplet(crc, next, -48);
+              FALLTHROUGH_INTENDED;
+            case 47:
+              CRCtriplet(crc, next, -47);
+              FALLTHROUGH_INTENDED;
+            case 46:
+              CRCtriplet(crc, next, -46);
+              FALLTHROUGH_INTENDED;
+            case 45:
+              CRCtriplet(crc, next, -45);
+              FALLTHROUGH_INTENDED;
+            case 44:
+              CRCtriplet(crc, next, -44);
+              FALLTHROUGH_INTENDED;
+            case 43:
+              CRCtriplet(crc, next, -43);
+              FALLTHROUGH_INTENDED;
+            case 42:
+              CRCtriplet(crc, next, -42);
+              FALLTHROUGH_INTENDED;
+            case 41:
+              CRCtriplet(crc, next, -41);
+              FALLTHROUGH_INTENDED;
+            case 40:
+              CRCtriplet(crc, next, -40);
+              FALLTHROUGH_INTENDED;
+            case 39:
+              CRCtriplet(crc, next, -39);
+              FALLTHROUGH_INTENDED;
+            case 38:
+              CRCtriplet(crc, next, -38);
+              FALLTHROUGH_INTENDED;
+            case 37:
+              CRCtriplet(crc, next, -37);
+              FALLTHROUGH_INTENDED;
+            case 36:
+              CRCtriplet(crc, next, -36);
+              FALLTHROUGH_INTENDED;
+            case 35:
+              CRCtriplet(crc, next, -35);
+              FALLTHROUGH_INTENDED;
+            case 34:
+              CRCtriplet(crc, next, -34);
+              FALLTHROUGH_INTENDED;
+            case 33:
+              CRCtriplet(crc, next, -33);
+              FALLTHROUGH_INTENDED;
+            case 32:
+              CRCtriplet(crc, next, -32);
+              FALLTHROUGH_INTENDED;
+            case 31:
+              CRCtriplet(crc, next, -31);
+              FALLTHROUGH_INTENDED;
+            case 30:
+              CRCtriplet(crc, next, -30);
+              FALLTHROUGH_INTENDED;
+            case 29:
+              CRCtriplet(crc, next, -29);
+              FALLTHROUGH_INTENDED;
+            case 28:
+              CRCtriplet(crc, next, -28);
+              FALLTHROUGH_INTENDED;
+            case 27:
+              CRCtriplet(crc, next, -27);
+              FALLTHROUGH_INTENDED;
+            case 26:
+              CRCtriplet(crc, next, -26);
+              FALLTHROUGH_INTENDED;
+            case 25:
+              CRCtriplet(crc, next, -25);
+              FALLTHROUGH_INTENDED;
+            case 24:
+              CRCtriplet(crc, next, -24);
+              FALLTHROUGH_INTENDED;
+            case 23:
+              CRCtriplet(crc, next, -23);
+              FALLTHROUGH_INTENDED;
+            case 22:
+              CRCtriplet(crc, next, -22);
+              FALLTHROUGH_INTENDED;
+            case 21:
+              CRCtriplet(crc, next, -21);
+              FALLTHROUGH_INTENDED;
+            case 20:
+              CRCtriplet(crc, next, -20);
+              FALLTHROUGH_INTENDED;
+            case 19:
+              CRCtriplet(crc, next, -19);
+              FALLTHROUGH_INTENDED;
+            case 18:
+              CRCtriplet(crc, next, -18);
+              FALLTHROUGH_INTENDED;
+            case 17:
+              CRCtriplet(crc, next, -17);
+              FALLTHROUGH_INTENDED;
+            case 16:
+              CRCtriplet(crc, next, -16);
+              FALLTHROUGH_INTENDED;
+            case 15:
+              CRCtriplet(crc, next, -15);
+              FALLTHROUGH_INTENDED;
+            case 14:
+              CRCtriplet(crc, next, -14);
+              FALLTHROUGH_INTENDED;
+            case 13:
+              CRCtriplet(crc, next, -13);
+              FALLTHROUGH_INTENDED;
+            case 12:
+              CRCtriplet(crc, next, -12);
+              FALLTHROUGH_INTENDED;
+            case 11:
+              CRCtriplet(crc, next, -11);
+              FALLTHROUGH_INTENDED;
+            case 10:
+              CRCtriplet(crc, next, -10);
+              FALLTHROUGH_INTENDED;
+            case 9:
+              CRCtriplet(crc, next, -9);
+              FALLTHROUGH_INTENDED;
+            case 8:
+              CRCtriplet(crc, next, -8);
+              FALLTHROUGH_INTENDED;
+            case 7:
+              CRCtriplet(crc, next, -7);
+              FALLTHROUGH_INTENDED;
+            case 6:
+              CRCtriplet(crc, next, -6);
+              FALLTHROUGH_INTENDED;
+            case 5:
+              CRCtriplet(crc, next, -5);
+              FALLTHROUGH_INTENDED;
+            case 4:
+              CRCtriplet(crc, next, -4);
+              FALLTHROUGH_INTENDED;
+            case 3:
+              CRCtriplet(crc, next, -3);
+              FALLTHROUGH_INTENDED;
+            case 2:
+              CRCtriplet(crc, next, -2);
+              FALLTHROUGH_INTENDED;
+            case 1:
+              CRCduplet(crc, next, -1); // the final triplet is actually only 2
+              //{ CombineCRC(); }
+              crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2);
+              if (--n > 0) {
+                crc1 = crc2 = 0;
+                block_size = 128;
+                // points to the first byte of the next block
+                next0 = next2 + 128;
+                next1 = next0 + 128; // from here on all blocks are 128 long
+                next2 = next1 + 128;
+              }
+              FALLTHROUGH_INTENDED;
+            case 0:;
+          } while (n > 0);
+      }
+      next = (const unsigned char*)next2;
+    }
+    uint64_t count2 = len >> 3; // 216 of less bytes is 27 or less singlets
+    len = len & 7;
+    next += (count2 * 8);
+    switch (count2) {
+      case 27:
+        CRCsinglet(crc0, next, -27 * 8);
+        FALLTHROUGH_INTENDED;
+      case 26:
+        CRCsinglet(crc0, next, -26 * 8);
+        FALLTHROUGH_INTENDED;
+      case 25:
+        CRCsinglet(crc0, next, -25 * 8);
+        FALLTHROUGH_INTENDED;
+      case 24:
+        CRCsinglet(crc0, next, -24 * 8);
+        FALLTHROUGH_INTENDED;
+      case 23:
+        CRCsinglet(crc0, next, -23 * 8);
+        FALLTHROUGH_INTENDED;
+      case 22:
+        CRCsinglet(crc0, next, -22 * 8);
+        FALLTHROUGH_INTENDED;
+      case 21:
+        CRCsinglet(crc0, next, -21 * 8);
+        FALLTHROUGH_INTENDED;
+      case 20:
+        CRCsinglet(crc0, next, -20 * 8);
+        FALLTHROUGH_INTENDED;
+      case 19:
+        CRCsinglet(crc0, next, -19 * 8);
+        FALLTHROUGH_INTENDED;
+      case 18:
+        CRCsinglet(crc0, next, -18 * 8);
+        FALLTHROUGH_INTENDED;
+      case 17:
+        CRCsinglet(crc0, next, -17 * 8);
+        FALLTHROUGH_INTENDED;
+      case 16:
+        CRCsinglet(crc0, next, -16 * 8);
+        FALLTHROUGH_INTENDED;
+      case 15:
+        CRCsinglet(crc0, next, -15 * 8);
+        FALLTHROUGH_INTENDED;
+      case 14:
+        CRCsinglet(crc0, next, -14 * 8);
+        FALLTHROUGH_INTENDED;
+      case 13:
+        CRCsinglet(crc0, next, -13 * 8);
+        FALLTHROUGH_INTENDED;
+      case 12:
+        CRCsinglet(crc0, next, -12 * 8);
+        FALLTHROUGH_INTENDED;
+      case 11:
+        CRCsinglet(crc0, next, -11 * 8);
+        FALLTHROUGH_INTENDED;
+      case 10:
+        CRCsinglet(crc0, next, -10 * 8);
+        FALLTHROUGH_INTENDED;
+      case 9:
+        CRCsinglet(crc0, next, -9 * 8);
+        FALLTHROUGH_INTENDED;
+      case 8:
+        CRCsinglet(crc0, next, -8 * 8);
+        FALLTHROUGH_INTENDED;
+      case 7:
+        CRCsinglet(crc0, next, -7 * 8);
+        FALLTHROUGH_INTENDED;
+      case 6:
+        CRCsinglet(crc0, next, -6 * 8);
+        FALLTHROUGH_INTENDED;
+      case 5:
+        CRCsinglet(crc0, next, -5 * 8);
+        FALLTHROUGH_INTENDED;
+      case 4:
+        CRCsinglet(crc0, next, -4 * 8);
+        FALLTHROUGH_INTENDED;
+      case 3:
+        CRCsinglet(crc0, next, -3 * 8);
+        FALLTHROUGH_INTENDED;
+      case 2:
+        CRCsinglet(crc0, next, -2 * 8);
+        FALLTHROUGH_INTENDED;
+      case 1:
+        CRCsinglet(crc0, next, -1 * 8);
+        FALLTHROUGH_INTENDED;
+      case 0:;
+    }
+  }
+  {
+    align_to_8(len, crc0, next);
+    return (uint32_t)crc0 ^ 0xffffffffu;
+  }
+}
+
+#endif //HAVE_SSE42 && HAVE_PCLMUL
+
+static inline Function Choose_Extend() {
+#ifdef HAVE_POWER8
+  return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>;
+#elif defined(HAVE_ARM64_CRC)
+  if(crc32c_runtime_check()) {
+    pmull_runtime_flag = crc32c_pmull_runtime_check();
+    return ExtendARMImpl;
+  } else {
+    return ExtendImpl<Slow_CRC32>;
+  }
+#else
+  if (isSSE42()) {
+    if (isPCLMULQDQ()) {
+#if (defined HAVE_SSE42 && defined HAVE_PCLMUL) && !defined NO_THREEWAY_CRC32C
+      return crc32c_3way;
+#else
+    return ExtendImpl<Fast_CRC32>; // Fast_CRC32 will check HAVE_SSE42 itself
+#endif
+    }
+    else {  // no runtime PCLMULQDQ support but has SSE42 support
+      return ExtendImpl<Fast_CRC32>;
+    }
+  } // end of isSSE42()
+  else {
+    return ExtendImpl<Slow_CRC32>;
+  }
+#endif
+}
+
+static Function ChosenExtend = Choose_Extend();
+uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
+  return ChosenExtend(crc, buf, size);
+}
+
+// The code for crc32c combine, copied with permission from folly
+
+// Standard galois-field multiply.  The only modification is that a,
+// b, m, and p are all bit-reflected.
+//
+// https://en.wikipedia.org/wiki/Finite_field_arithmetic
+static constexpr uint32_t gf_multiply_sw_1(
+    size_t i, uint32_t p, uint32_t a, uint32_t b, uint32_t m) {
+  // clang-format off
+  return i == 32 ? p : gf_multiply_sw_1(
+      /* i = */ i + 1,
+      /* p = */ p ^ ((0u-((b >> 31) & 1)) & a),
+      /* a = */ (a >> 1) ^ ((0u-(a & 1)) & m),
+      /* b = */ b << 1,
+      /* m = */ m);
+  // clang-format on
+}
+static constexpr uint32_t gf_multiply_sw(uint32_t a, uint32_t b, uint32_t m) {
+  return gf_multiply_sw_1(/* i = */ 0, /* p = */ 0, a, b, m);
+}
+
+static constexpr uint32_t gf_square_sw(uint32_t a, uint32_t m) {
+  return gf_multiply_sw(a, a, m);
+}
+
+template <size_t i, uint32_t m>
+struct gf_powers_memo {
+  static constexpr uint32_t value =
+      gf_square_sw(gf_powers_memo<i - 1, m>::value, m);
+};
+template <uint32_t m>
+struct gf_powers_memo<0, m> {
+  static constexpr uint32_t value = m;
+};
+
+template <typename T, T... Ints>
+struct integer_sequence {
+  using value_type = T;
+  static constexpr size_t size() { return sizeof...(Ints); }
+};
+
+template <typename T, std::size_t N, T... Is>
+struct make_integer_sequence : make_integer_sequence<T, N - 1, N - 1, Is...> {};
+
+template <typename T, T... Is>
+struct make_integer_sequence<T, 0, Is...> : integer_sequence<T, Is...> {};
+
+template <std::size_t N>
+using make_index_sequence = make_integer_sequence<std::size_t, N>;
+
+template <uint32_t m>
+struct gf_powers_make {
+  template <size_t... i>
+  using index_sequence = integer_sequence<size_t, i...>;
+  template <size_t... i>
+  constexpr std::array<uint32_t, sizeof...(i)> operator()(
+      index_sequence<i...>) const {
+    return std::array<uint32_t, sizeof...(i)>{{gf_powers_memo<i, m>::value...}};
+  }
+};
+
+static constexpr uint32_t crc32c_m = 0x82f63b78;
+
+static constexpr std::array<uint32_t, 62> const crc32c_powers =
+    gf_powers_make<crc32c_m>{}(make_index_sequence<62>{});
+
+// Expects a "pure" crc (see Crc32cCombine)
+static uint32_t Crc32AppendZeroes(
+    uint32_t crc, size_t len_over_4, uint32_t polynomial,
+    std::array<uint32_t, 62> const& powers_array) {
+  auto powers = powers_array.data();
+  // Append by multiplying by consecutive powers of two of the zeroes
+  // array
+  size_t len_bits = len_over_4;
+
+  while (len_bits) {
+    // Advance directly to next bit set.
+    auto r = CountTrailingZeroBits(len_bits);
+    len_bits >>= r;
+    powers += r;
+
+    crc = gf_multiply_sw(crc, *powers, polynomial);
+
+    len_bits >>= 1;
+    powers++;
+  }
+
+  return crc;
+}
+
+static inline uint32_t InvertedToPure(uint32_t crc) { return ~crc; }
+
+static inline uint32_t PureToInverted(uint32_t crc) { return ~crc; }
+
+static inline uint32_t PureExtend(uint32_t crc, const char* buf, size_t size) {
+  return InvertedToPure(Extend(PureToInverted(crc), buf, size));
+}
+
+// Background:
+// RocksDB uses two kinds of crc32c values: masked and unmasked. Neither is
+// a "pure" CRC because a pure CRC satisfies (^ for xor)
+//  crc(a ^ b) = crc(a) ^ crc(b)
+// The unmasked is closest, and this function takes unmasked crc32c values.
+// The unmasked values are impure in two ways:
+// * The initial setting at the start of CRC computation is all 1 bits
+// (like -1) instead of zero.
+// * The result has all bits invered.
+// Note that together, these result in the empty string having a crc32c of
+// zero. See
+// https://en.wikipedia.org/wiki/Computation_of_cyclic_redundancy_checks#CRC_variants
+//
+// Simplified version of strategy, using xor through pure CRCs (+ for concat):
+//
+// pure_crc(str1 + str2) = pure_crc(str1 + zeros(len(str2))) ^
+//                         pure_crc(zeros(len(str1)) + str2)
+//
+// because the xor of these two zero-padded strings is str1 + str2. For pure
+// CRC, leading zeros don't affect the result, so we only need
+//
+// pure_crc(str1 + str2) = pure_crc(str1 + zeros(len(str2))) ^
+//                         pure_crc(str2)
+//
+// Considering we aren't working with pure CRCs, what is actually in the input?
+//
+// crc1 = PureToInverted(PureExtendCrc32c(-1, zeros, crc1len) ^
+//                       PureCrc32c(str1, crc1len))
+// crc2 = PureToInverted(PureExtendCrc32c(-1, zeros, crc2len) ^
+//                       PureCrc32c(str2, crc2len))
+//
+// The result we want to compute is
+// combined = PureToInverted(PureExtendCrc32c(PureExtendCrc32c(-1, zeros,
+//                                                             crc1len) ^
+//                                            PureCrc32c(str1, crc1len),
+//                                            zeros, crc2len) ^
+//                           PureCrc32c(str2, crc2len))
+//
+// Thus, in addition to extending crc1 over the length of str2 in (virtual)
+// zeros, we need to cancel out the -1 initializer that was used in computing
+// crc2. To cancel it out, we also need to extend it over crc2len in zeros.
+// To simplify, since the end of str1 and that -1 initializer for crc2 are at
+// the same logical position, we can combine them before we extend over the
+// zeros.
+uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len) {
+  uint32_t pure_crc1_with_init = InvertedToPure(crc1);
+  uint32_t pure_crc2_with_init = InvertedToPure(crc2);
+  uint32_t pure_crc2_init = static_cast<uint32_t>(-1);
+
+  // Append up to 32 bits of zeroes in the normal way
+  char zeros[4] = {0, 0, 0, 0};
+  auto len = crc2len & 3;
+  uint32_t tmp = pure_crc1_with_init ^ pure_crc2_init;
+  if (len) {
+    tmp = PureExtend(tmp, zeros, len);
+  }
+  return PureToInverted(
+      Crc32AppendZeroes(tmp, crc2len / 4, crc32c_m, crc32c_powers) ^
+      pure_crc2_with_init);
+}
+
+}  // namespace crc32c
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/crc32c.h b/src/rocksdb/util/crc32c.h
new file mode 100644
index 000000000..a08ad60af
--- /dev/null
+++ b/src/rocksdb/util/crc32c.h
@@ -0,0 +1,56 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace crc32c {
+
+extern std::string IsFastCrc32Supported();
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A.  Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
+// Takes two unmasked crc32c values, and the length of the string from
+// which `crc2` was computed, and computes a crc32c value for the
+// concatenation of the original two input strings. Running time is
+// ~ log(crc2len).
+extern uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len);
+
+// Return the crc32c of data[0,n-1]
+inline uint32_t Value(const char* data, size_t n) { return Extend(0, data, n); }
+
+static const uint32_t kMaskDelta = 0xa282ead8ul;
+
+// Return a masked representation of crc.
+//
+// Motivation: it is problematic to compute the CRC of a string that
+// contains embedded CRCs.  Therefore we recommend that CRCs stored
+// somewhere (e.g., in files) should be masked before being stored.
+inline uint32_t Mask(uint32_t crc) {
+  // Rotate right by 15 bits and add a constant.
+  return ((crc >> 15) | (crc << 17)) + kMaskDelta;
+}
+
+// Return the crc whose masked representation is masked_crc.
+inline uint32_t Unmask(uint32_t masked_crc) {
+  uint32_t rot = masked_crc - kMaskDelta;
+  return ((rot >> 17) | (rot << 15));
+}
+
+}  // namespace crc32c
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/crc32c_arm64.cc b/src/rocksdb/util/crc32c_arm64.cc
new file mode 100644
index 000000000..4885f4fe1
--- /dev/null
+++ b/src/rocksdb/util/crc32c_arm64.cc
@@ -0,0 +1,215 @@
+//  Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/crc32c_arm64.h"
+
+#if defined(HAVE_ARM64_CRC)
+
+#if defined(__linux__)
+#include <asm/hwcap.h>
+#endif
+#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
+#include <sys/auxv.h>
+#endif
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+#ifndef HWCAP_PMULL
+#define HWCAP_PMULL (1 << 4)
+#endif
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+#if defined(__OpenBSD__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#include <machine/armreg.h>
+#endif
+
+#ifdef HAVE_ARM64_CRYPTO
+/* unfolding to compute 8 * 3 = 24 bytes parallelly */
+#define CRC32C24BYTES(ITR)                                    \
+  crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH + (ITR)));     \
+  crc2 = crc32c_u64(crc2, *(buf64 + BLK_LENGTH * 2 + (ITR))); \
+  crc0 = crc32c_u64(crc0, *(buf64 + (ITR)));
+
+/* unfolding to compute 24 * 7 = 168 bytes parallelly */
+#define CRC32C7X24BYTES(ITR)   \
+  do {                         \
+    CRC32C24BYTES((ITR)*7 + 0) \
+    CRC32C24BYTES((ITR)*7 + 1) \
+    CRC32C24BYTES((ITR)*7 + 2) \
+    CRC32C24BYTES((ITR)*7 + 3) \
+    CRC32C24BYTES((ITR)*7 + 4) \
+    CRC32C24BYTES((ITR)*7 + 5) \
+    CRC32C24BYTES((ITR)*7 + 6) \
+  } while (0)
+#endif
+
+extern bool pmull_runtime_flag;
+
+uint32_t crc32c_runtime_check(void) {
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
+  uint64_t auxv = 0;
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
+  auxv = getauxval(AT_HWCAP);
+#elif defined(__FreeBSD__)
+  elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));
+#endif
+  return (auxv & HWCAP_CRC32) != 0;
+#elif defined(__APPLE__)
+  int r;
+  size_t l = sizeof(r);
+  if (sysctlbyname("hw.optional.armv8_crc32", &r, &l, NULL, 0) == -1) return 0;
+  return r == 1;
+#elif defined(__OpenBSD__)
+  int r = 0;
+  const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+  uint64_t isar0;
+  size_t len = sizeof(isar0);
+
+  if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+      if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE)
+        r = 1;
+  }
+  return r;
+#else
+  return 0;
+#endif
+}
+
+bool crc32c_pmull_runtime_check(void) {
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
+  uint64_t auxv = 0;
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
+  auxv = getauxval(AT_HWCAP);
+#elif defined(__FreeBSD__)
+  elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));
+#endif
+  return (auxv & HWCAP_PMULL) != 0;
+#elif defined(__APPLE__)
+  return true;
+#elif defined(__OpenBSD__)
+  bool r = false;
+  const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 };
+  uint64_t isar0;
+  size_t len = sizeof(isar0);
+
+  if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) {
+      if (ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_PMULL)
+        r = true;
+  }
+  return r;
+#else
+  return false;
+#endif
+}
+
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+uint32_t
+crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) {
+  const uint8_t *buf8;
+  const uint64_t *buf64 = (uint64_t *)data;
+  int length = (int)len;
+  crc ^= 0xffffffff;
+
+  /*
+   * Pmull runtime check here.
+   * Raspberry Pi supports crc32 but doesn't support pmull.
+   * Skip Crc32c Parallel computation if no crypto extension available.
+   */
+  if (pmull_runtime_flag) {
+/* Macro (HAVE_ARM64_CRYPTO) is used for compiling check  */
+#ifdef HAVE_ARM64_CRYPTO
+/* Crc32c Parallel computation
+ *   Algorithm comes from Intel whitepaper:
+ *   crc-iscsi-polynomial-crc32-instruction-paper
+ *
+ * Input data is divided into three equal-sized blocks
+ *   Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
+ *   One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
+ */
+#define BLK_LENGTH 42
+    while (length >= 1024) {
+      uint64_t t0, t1;
+      uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
+
+      /* Parallel Param:
+       *   k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
+       *   k1 = CRC32(x ^ (42 * 8 * 8 - 1));
+       */
+      uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
+
+      /* Prefetch data for following block to avoid cache miss */
+      PREF1KL1((uint8_t *)buf64, 1024);
+
+      /* First 8 byte for better pipelining */
+      crc0 = crc32c_u64(crc, *buf64++);
+
+      /* 3 blocks crc32c parallel computation
+       * Macro unfolding to compute parallelly
+       * 168 * 6 = 1008 (bytes)
+       */
+      CRC32C7X24BYTES(0);
+      CRC32C7X24BYTES(1);
+      CRC32C7X24BYTES(2);
+      CRC32C7X24BYTES(3);
+      CRC32C7X24BYTES(4);
+      CRC32C7X24BYTES(5);
+      buf64 += (BLK_LENGTH * 3);
+
+      /* Last 8 bytes */
+      crc = crc32c_u64(crc2, *buf64++);
+
+      t0 = (uint64_t)vmull_p64(crc0, k0);
+      t1 = (uint64_t)vmull_p64(crc1, k1);
+
+      /* Merge (crc0, crc1, crc2) -> crc */
+      crc1 = crc32c_u64(0, t1);
+      crc ^= crc1;
+      crc0 = crc32c_u64(0, t0);
+      crc ^= crc0;
+
+      length -= 1024;
+    }
+
+    if (length == 0) return crc ^ (0xffffffffU);
+#endif
+  }  // if Pmull runtime check here
+
+  buf8 = (const uint8_t *)buf64;
+  while (length >= 8) {
+    crc = crc32c_u64(crc, *(const uint64_t *)buf8);
+    buf8 += 8;
+    length -= 8;
+  }
+
+  /* The following is more efficient than the straight loop */
+  if (length >= 4) {
+    crc = crc32c_u32(crc, *(const uint32_t *)buf8);
+    buf8 += 4;
+    length -= 4;
+  }
+
+  if (length >= 2) {
+    crc = crc32c_u16(crc, *(const uint16_t *)buf8);
+    buf8 += 2;
+    length -= 2;
+  }
+
+  if (length >= 1) crc = crc32c_u8(crc, *buf8);
+
+  crc ^= 0xffffffff;
+  return crc;
+}
+
+#endif
diff --git a/src/rocksdb/util/crc32c_arm64.h b/src/rocksdb/util/crc32c_arm64.h
new file mode 100644
index 000000000..4b27fe871
--- /dev/null
+++ b/src/rocksdb/util/crc32c_arm64.h
@@ -0,0 +1,52 @@
+//  Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef UTIL_CRC32C_ARM64_H
+#define UTIL_CRC32C_ARM64_H
+
+#include <cinttypes>
+#include <cstddef>
+
+#if defined(__aarch64__) || defined(__AARCH64__)
+
+#ifdef __ARM_FEATURE_CRC32
+#define HAVE_ARM64_CRC
+#include <arm_acle.h>
+#define crc32c_u8(crc, v) __crc32cb(crc, v)
+#define crc32c_u16(crc, v) __crc32ch(crc, v)
+#define crc32c_u32(crc, v) __crc32cw(crc, v)
+#define crc32c_u64(crc, v) __crc32cd(crc, v)
+// clang-format off
+#define PREF4X64L1(buffer, PREF_OFFSET, ITR)                \
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \
+          [c] "I"((PREF_OFFSET) + ((ITR) + 0) * 64));       \
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \
+          [c] "I"((PREF_OFFSET) + ((ITR) + 1) * 64));       \
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \
+          [c] "I"((PREF_OFFSET) + ((ITR) + 2) * 64));       \
+  __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \
+          [c] "I"((PREF_OFFSET) + ((ITR) + 3) * 64));
+// clang-format on
+
+#define PREF1KL1(buffer, PREF_OFFSET)  \
+  PREF4X64L1(buffer, (PREF_OFFSET), 0) \
+  PREF4X64L1(buffer, (PREF_OFFSET), 4) \
+  PREF4X64L1(buffer, (PREF_OFFSET), 8) \
+  PREF4X64L1(buffer, (PREF_OFFSET), 12)
+
+extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
+                             size_t len);
+extern uint32_t crc32c_runtime_check(void);
+extern bool crc32c_pmull_runtime_check(void);
+
+#ifdef __ARM_FEATURE_CRYPTO
+#define HAVE_ARM64_CRYPTO
+#include <arm_neon.h>
+#endif  // __ARM_FEATURE_CRYPTO
+#endif  // __ARM_FEATURE_CRC32
+
+#endif  // defined(__aarch64__) || defined(__AARCH64__)
+
+#endif
diff --git a/src/rocksdb/util/crc32c_ppc.c b/src/rocksdb/util/crc32c_ppc.c
new file mode 100644
index 000000000..b37dfb158
--- /dev/null
+++ b/src/rocksdb/util/crc32c_ppc.c
@@ -0,0 +1,94 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  Copyright (c) 2017 International Business Machines Corp.
+//  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#define CRC_TABLE
+#include <stdint.h>
+#include <stdlib.h>
+#include <strings.h>
+#include "util/crc32c_ppc_constants.h"
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN - 1)
+
+#ifdef REFLECT
+static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
+                                unsigned long len) {
+  while (len--) crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
+  return crc;
+}
+#endif
+
+#ifdef HAVE_POWER8
+unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p,
+                            unsigned long len);
+
+static uint32_t crc32_vpmsum(uint32_t crc, unsigned char const *data,
+                             size_t len) {
+  unsigned int prealign;
+  unsigned int tail;
+
+#ifdef CRC_XOR
+  crc ^= 0xffffffff;
+#endif
+
+  if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
+    crc = crc32_align(crc, data, (unsigned long)len);
+    goto out;
+  }
+
+  if ((unsigned long)data & VMX_ALIGN_MASK) {
+    prealign = VMX_ALIGN - ((unsigned long)data & VMX_ALIGN_MASK);
+    crc = crc32_align(crc, data, prealign);
+    len -= prealign;
+    data += prealign;
+  }
+
+  crc = __crc32_vpmsum(crc, data, (unsigned long)len & ~VMX_ALIGN_MASK);
+
+  tail = len & VMX_ALIGN_MASK;
+  if (tail) {
+    data += len & ~VMX_ALIGN_MASK;
+    crc = crc32_align(crc, data, tail);
+  }
+
+out:
+#ifdef CRC_XOR
+  crc ^= 0xffffffff;
+#endif
+
+  return crc;
+}
+
+/* This wrapper function works around the fact that crc32_vpmsum
+ * does not gracefully handle the case where the data pointer is NULL.  There
+ * may be room for performance improvement here.
+ */
+uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, size_t len) {
+  unsigned char *buf2;
+
+  if (!data) {
+    buf2 = (unsigned char *)malloc(len);
+    bzero(buf2, len);
+    crc = crc32_vpmsum(crc, buf2, len);
+    free(buf2);
+  } else {
+    crc = crc32_vpmsum(crc, data, (unsigned long)len);
+  }
+  return crc;
+}
+
+#else /* HAVE_POWER8 */
+
+/* This symbol has to exist on non-ppc architectures (and on legacy
+ * ppc systems using power7 or below) in order to compile properly
+ * there, even though it won't be called.
+ */
+uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, size_t len) {
+  return 0;
+}
+
+#endif /* HAVE_POWER8 */
diff --git a/src/rocksdb/util/crc32c_ppc.h b/src/rocksdb/util/crc32c_ppc.h
new file mode 100644
index 000000000..f0b0b66d5
--- /dev/null
+++ b/src/rocksdb/util/crc32c_ppc.h
@@ -0,0 +1,22 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  Copyright (c) 2017 International Business Machines Corp.
+//  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer,
+                           size_t len);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/rocksdb/util/crc32c_ppc_asm.S b/src/rocksdb/util/crc32c_ppc_asm.S
new file mode 100644
index 000000000..6959ba839
--- /dev/null
+++ b/src/rocksdb/util/crc32c_ppc_asm.S
@@ -0,0 +1,756 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  Copyright (c) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+//  Copyright (c) 2017 International Business Machines Corp.
+//  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#if defined (__clang__)
+#include "third-party/gcc/ppc-asm.h"
+#else
+#include <ppc-asm.h>
+#endif
+#include "ppc-opcode.h"
+
+#undef toc
+
+#ifndef r1
+#define r1 1
+#endif
+
+#ifndef r2
+#define r2 2
+#endif
+
+	.section	.rodata
+.balign 16
+
+.byteswap_constant:
+	/* byte reverse permute constant */
+	.octa 0x0F0E0D0C0B0A09080706050403020100
+
+#define __ASSEMBLY__
+#include "crc32c_ppc_constants.h"
+
+	.text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16		r25
+#define off32		r26
+#define off48		r27
+#define off64		r28
+#define off80		r29
+#define off96		r30
+#define off112		r31
+
+#define const1		v24
+#define const2		v25
+
+#define byteswap	v26
+#define	mask_32bit	v27
+#define	mask_64bit	v28
+#define zeroes		v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm	A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(__crc32_vpmsum)
+	std	r31,-8(r1)
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+	std	r26,-48(r1)
+	std	r25,-56(r1)
+
+	li	off16,16
+	li	off32,32
+	li	off48,48
+	li	off64,64
+	li	off80,80
+	li	off96,96
+	li	off112,112
+	li	r0,0
+
+	/* Enough room for saving 10 non volatile VMX registers */
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	stvx	v20,0,r6
+	stvx	v21,off16,r6
+	stvx	v22,off32,r6
+	stvx	v23,off48,r6
+	stvx	v24,off64,r6
+	stvx	v25,off80,r6
+	stvx	v26,off96,r6
+	stvx	v27,off112,r6
+	stvx	v28,0,r7
+	stvx	v29,off16,r7
+
+	mr	r10,r3
+
+	vxor	zeroes,zeroes,zeroes
+	vspltisw v0,-1
+
+	vsldoi	mask_32bit,zeroes,v0,4
+	vsldoi	mask_64bit,zeroes,v0,8
+
+	/* Get the initial value into v8 */
+	vxor	v8,v8,v8
+	MTVRD(v8, r3)
+#ifdef REFLECT
+	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
+#else
+	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+	addis	r3,r2,.byteswap_constant@toc@ha
+	addi	r3,r3,.byteswap_constant@toc@l
+
+	lvx	byteswap,0,r3
+	addi	r3,r3,16
+#endif
+
+	cmpdi	r5,256
+	blt	.Lshort
+
+	rldicr	r6,r5,0,56
+
+	/* Checksum in blocks of MAX_SIZE */
+1:	lis	r7,MAX_SIZE@h
+	ori	r7,r7,MAX_SIZE@l
+	mr	r9,r7
+	cmpd	r6,r7
+	bgt	2f
+	mr	r7,r6
+2:	subf	r6,r7,r6
+
+	/* our main loop does 128 bytes at a time */
+	srdi	r7,r7,7
+
+	/*
+	 * Work out the offset into the constants table to start at. Each
+	 * constant is 16 bytes, and it is used against 128 bytes of input
+	 * data - 128 / 16 = 8
+	 */
+	sldi	r8,r7,4
+	srdi	r9,r9,3
+	subf	r8,r8,r9
+
+	/* We reduce our final 128 bytes in a separate step */
+	addi	r7,r7,-1
+	mtctr	r7
+
+	addis	r3,r2,.constants@toc@ha
+	addi	r3,r3,.constants@toc@l
+
+	/* Find the start of our constants */
+	add	r3,r3,r8
+
+	/* zero v0-v7 which will contain our checksums */
+	vxor	v0,v0,v0
+	vxor	v1,v1,v1
+	vxor	v2,v2,v2
+	vxor	v3,v3,v3
+	vxor	v4,v4,v4
+	vxor	v5,v5,v5
+	vxor	v6,v6,v6
+	vxor	v7,v7,v7
+
+	lvx	const1,0,r3
+
+	/*
+	 * If we are looping back to consume more data we use the values
+	 * already in v16-v23.
+	 */
+	cmpdi	r0,1
+	beq	2f
+
+	/* First warm up pass */
+	lvx	v16,0,r4
+	lvx	v17,off16,r4
+	VPERM(v16,v16,v16,byteswap)
+	VPERM(v17,v17,v17,byteswap)
+	lvx	v18,off32,r4
+	lvx	v19,off48,r4
+	VPERM(v18,v18,v18,byteswap)
+	VPERM(v19,v19,v19,byteswap)
+	lvx	v20,off64,r4
+	lvx	v21,off80,r4
+	VPERM(v20,v20,v20,byteswap)
+	VPERM(v21,v21,v21,byteswap)
+	lvx	v22,off96,r4
+	lvx	v23,off112,r4
+	VPERM(v22,v22,v22,byteswap)
+	VPERM(v23,v23,v23,byteswap)
+	addi	r4,r4,8*16
+
+	/* xor in initial value */
+	vxor	v16,v16,v8
+
+2:	bdz	.Lfirst_warm_up_done
+
+	addi	r3,r3,16
+	lvx	const2,0,r3
+
+	/* Second warm up pass */
+	VPMSUMD(v8,v16,const1)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v9,v17,const1)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v10,v18,const1)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v11,v19,const1)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdz	.Lfirst_cool_down
+
+	/*
+	 * main loop. We modulo schedule it such that it takes three iterations
+	 * to complete - first iteration load, second iteration vpmsum, third
+	 * iteration xor.
+	 */
+	.balign	16
+4:	lvx	const1,0,r3
+	addi	r3,r3,16
+	ori	r2,r2,0
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const2)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const2)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const2)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const2)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	lvx	const2,0,r3
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdnz	4b
+
+.Lfirst_cool_down:
+	/* First cool down pass */
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const1)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const1)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const1)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const1)
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	ori	r2,r2,0
+
+.Lsecond_cool_down:
+	/* Second cool down pass */
+	vxor	v0,v0,v8
+	vxor	v1,v1,v9
+	vxor	v2,v2,v10
+	vxor	v3,v3,v11
+	vxor	v4,v4,v12
+	vxor	v5,v5,v13
+	vxor	v6,v6,v14
+	vxor	v7,v7,v15
+
+#ifdef REFLECT
+	/*
+	 * vpmsumd produces a 96 bit result in the least significant bits
+	 * of the register. Since we are bit reflected we have to shift it
+	 * left 32 bits so it occupies the least significant bits in the
+	 * bit reflected domain.
+	 */
+	vsldoi	v0,v0,zeroes,4
+	vsldoi	v1,v1,zeroes,4
+	vsldoi	v2,v2,zeroes,4
+	vsldoi	v3,v3,zeroes,4
+	vsldoi	v4,v4,zeroes,4
+	vsldoi	v5,v5,zeroes,4
+	vsldoi	v6,v6,zeroes,4
+	vsldoi	v7,v7,zeroes,4
+#endif
+
+	/* xor with last 1024 bits */
+	lvx	v8,0,r4
+	lvx	v9,off16,r4
+	VPERM(v8,v8,v8,byteswap)
+	VPERM(v9,v9,v9,byteswap)
+	lvx	v10,off32,r4
+	lvx	v11,off48,r4
+	VPERM(v10,v10,v10,byteswap)
+	VPERM(v11,v11,v11,byteswap)
+	lvx	v12,off64,r4
+	lvx	v13,off80,r4
+	VPERM(v12,v12,v12,byteswap)
+	VPERM(v13,v13,v13,byteswap)
+	lvx	v14,off96,r4
+	lvx	v15,off112,r4
+	VPERM(v14,v14,v14,byteswap)
+	VPERM(v15,v15,v15,byteswap)
+
+	addi	r4,r4,8*16
+
+	vxor	v16,v0,v8
+	vxor	v17,v1,v9
+	vxor	v18,v2,v10
+	vxor	v19,v3,v11
+	vxor	v20,v4,v12
+	vxor	v21,v5,v13
+	vxor	v22,v6,v14
+	vxor	v23,v7,v15
+
+	li	r0,1
+	cmpdi	r6,0
+	addi	r6,r6,128
+	bne	1b
+
+	/* Work out how many bytes we have left */
+	andi.	r5,r5,127
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,128
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks are in the tail */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	/*
+	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
+	 * 32 bits to include the trailing 32 bits of zeros
+	 */
+	lvx	v0,0,r3
+	lvx	v1,off16,r3
+	lvx	v2,off32,r3
+	lvx	v3,off48,r3
+	lvx	v4,off64,r3
+	lvx	v5,off80,r3
+	lvx	v6,off96,r3
+	lvx	v7,off112,r3
+	addi	r3,r3,8*16
+
+	VPMSUMW(v0,v16,v0)
+	VPMSUMW(v1,v17,v1)
+	VPMSUMW(v2,v18,v2)
+	VPMSUMW(v3,v19,v3)
+	VPMSUMW(v4,v20,v4)
+	VPMSUMW(v5,v21,v5)
+	VPMSUMW(v6,v22,v6)
+	VPMSUMW(v7,v23,v7)
+
+	/* Now reduce the tail (0 - 112 bytes) */
+	cmpdi	r7,0
+	beq	1f
+
+	lvx	v16,0,r4
+	lvx	v17,0,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off32,r4
+	lvx	v17,off32,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off64,r4
+	lvx	v17,off64,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off96,r4
+	lvx	v17,off96,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+
+	/* Now xor all the parallel chunks together */
+1:	vxor	v0,v0,v1
+	vxor	v2,v2,v3
+	vxor	v4,v4,v5
+	vxor	v6,v6,v7
+
+	vxor	v0,v0,v2
+	vxor	v4,v4,v6
+
+	vxor	v0,v0,v4
+
+.Lbarrett_reduction:
+	/* Barrett constants */
+	addis	r3,r2,.barrett_constants@toc@ha
+	addi	r3,r3,.barrett_constants@toc@l
+
+	lvx	const1,0,r3
+	lvx	const2,off16,r3
+
+	vsldoi	v1,v0,v0,8
+	vxor	v0,v0,v1		/* xor two 64 bit results together */
+
+#ifdef REFLECT
+	/* shift left one bit */
+	vspltisb v1,1
+	vsl	v0,v0,v1
+#endif
+
+	vand	v0,v0,mask_64bit
+
+#ifndef REFLECT
+	/*
+	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v1,v0,const1)	/* ma */
+	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v1,v1,const2)	/* qn */
+	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
+#else
+	/*
+	 * The reflected version of Barrett reduction. Instead of bit
+	 * reflecting our data (which is expensive to do), we bit reflect our
+	 * constants and our algorithm, which means the intermediate data in
+	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
+	 * the algorithm because we don't carry in mod 2 arithmetic.
+	 */
+	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
+	VPMSUMD(v1,v1,const1)		/* ma */
+	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
+	VPMSUMD(v1,v1,const2)		/* qn */
+	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Since we are bit reflected, the result (ie the low 32 bits) is in
+	 * the high 32 bits. We just need to shift it left 4 bytes
+	 * V0 [ 0 1 X 3 ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of */
+#endif
+
+	/* Get it into r3 */
+	MFVRD(r3, v0)
+
+.Lout:
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16
+
+	lvx	v20,0,r6
+	lvx	v21,off16,r6
+	lvx	v22,off32,r6
+	lvx	v23,off48,r6
+	lvx	v24,off64,r6
+	lvx	v25,off80,r6
+	lvx	v26,off96,r6
+	lvx	v27,off112,r6
+	lvx	v28,0,r7
+	lvx	v29,off16,r7
+
+	ld	r31,-8(r1)
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	ld	r26,-48(r1)
+	ld	r25,-56(r1)
+
+	blr
+
+.Lfirst_warm_up_done:
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	VPMSUMD(v8,v16,const1)
+	VPMSUMD(v9,v17,const1)
+	VPMSUMD(v10,v18,const1)
+	VPMSUMD(v11,v19,const1)
+	VPMSUMD(v12,v20,const1)
+	VPMSUMD(v13,v21,const1)
+	VPMSUMD(v14,v22,const1)
+	VPMSUMD(v15,v23,const1)
+
+	b	.Lsecond_cool_down
+
+.Lshort:
+	cmpdi	r5,0
+	beq	.Lzero
+
+	addis	r3,r2,.short_constants@toc@ha
+	addi	r3,r3,.short_constants@toc@l
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,256
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks? */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	vxor	v19,v19,v19
+	vxor	v20,v20,v20
+
+	lvx	v0,0,r4
+	lvx	v16,0,r3
+	VPERM(v0,v0,v16,byteswap)
+	vxor	v0,v0,v8	/* xor in initial value */
+	VPMSUMW(v0,v0,v16)
+	bdz	.Lv0
+
+	lvx	v1,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v1,v1,v17,byteswap)
+	VPMSUMW(v1,v1,v17)
+	bdz	.Lv1
+
+	lvx	v2,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v2,v2,v16,byteswap)
+	VPMSUMW(v2,v2,v16)
+	bdz	.Lv2
+
+	lvx	v3,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v3,v3,v17,byteswap)
+	VPMSUMW(v3,v3,v17)
+	bdz	.Lv3
+
+	lvx	v4,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v4,v4,v16,byteswap)
+	VPMSUMW(v4,v4,v16)
+	bdz	.Lv4
+
+	lvx	v5,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v5,v5,v17,byteswap)
+	VPMSUMW(v5,v5,v17)
+	bdz	.Lv5
+
+	lvx	v6,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v6,v6,v16,byteswap)
+	VPMSUMW(v6,v6,v16)
+	bdz	.Lv6
+
+	lvx	v7,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v7,v7,v17,byteswap)
+	VPMSUMW(v7,v7,v17)
+	bdz	.Lv7
+
+	addi	r3,r3,128
+	addi	r4,r4,128
+
+	lvx	v8,0,r4
+	lvx	v16,0,r3
+	VPERM(v8,v8,v16,byteswap)
+	VPMSUMW(v8,v8,v16)
+	bdz	.Lv8
+
+	lvx	v9,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v9,v9,v17,byteswap)
+	VPMSUMW(v9,v9,v17)
+	bdz	.Lv9
+
+	lvx	v10,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v10,v10,v16,byteswap)
+	VPMSUMW(v10,v10,v16)
+	bdz	.Lv10
+
+	lvx	v11,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v11,v11,v17,byteswap)
+	VPMSUMW(v11,v11,v17)
+	bdz	.Lv11
+
+	lvx	v12,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v12,v12,v16,byteswap)
+	VPMSUMW(v12,v12,v16)
+	bdz	.Lv12
+
+	lvx	v13,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v13,v13,v17,byteswap)
+	VPMSUMW(v13,v13,v17)
+	bdz	.Lv13
+
+	lvx	v14,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v14,v14,v16,byteswap)
+	VPMSUMW(v14,v14,v16)
+	bdz	.Lv14
+
+	lvx	v15,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v15,v15,v17,byteswap)
+	VPMSUMW(v15,v15,v17)
+
+.Lv15:	vxor	v19,v19,v15
+.Lv14:	vxor	v20,v20,v14
+.Lv13:	vxor	v19,v19,v13
+.Lv12:	vxor	v20,v20,v12
+.Lv11:	vxor	v19,v19,v11
+.Lv10:	vxor	v20,v20,v10
+.Lv9:	vxor	v19,v19,v9
+.Lv8:	vxor	v20,v20,v8
+.Lv7:	vxor	v19,v19,v7
+.Lv6:	vxor	v20,v20,v6
+.Lv5:	vxor	v19,v19,v5
+.Lv4:	vxor	v20,v20,v4
+.Lv3:	vxor	v19,v19,v3
+.Lv2:	vxor	v20,v20,v2
+.Lv1:	vxor	v19,v19,v1
+.Lv0:	vxor	v20,v20,v0
+
+	vxor	v0,v19,v20
+
+	b	.Lbarrett_reduction
+
+.Lzero:
+	mr	r3,r10
+	b	.Lout
+
+FUNC_END(__crc32_vpmsum)
diff --git a/src/rocksdb/util/crc32c_ppc_constants.h b/src/rocksdb/util/crc32c_ppc_constants.h
new file mode 100644
index 000000000..f6494cd01
--- /dev/null
+++ b/src/rocksdb/util/crc32c_ppc_constants.h
@@ -0,0 +1,900 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  Copyright (C) 2015, 2017 International Business Machines Corp.
+//  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#define CRC 0x1edc6f41
+#define REFLECT
+#define CRC_XOR
+
+#ifndef __ASSEMBLY__
+#ifdef CRC_TABLE
+static const unsigned int crc_table[] = {
+    0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c,
+    0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+    0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c,
+    0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+    0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc,
+    0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+    0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512,
+    0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+    0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad,
+    0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+    0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf,
+    0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+    0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f,
+    0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+    0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f,
+    0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+    0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e,
+    0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+    0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e,
+    0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+    0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de,
+    0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+    0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4,
+    0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+    0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b,
+    0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+    0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5,
+    0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+    0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975,
+    0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+    0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905,
+    0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+    0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8,
+    0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+    0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8,
+    0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+    0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78,
+    0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+    0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6,
+    0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+    0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69,
+    0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+    0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
+};
+
+#endif
+
+#else
+#define MAX_SIZE 32768
+.constants :
+
+        /* Reduce 262144 kbits to 1024 bits */
+        /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+        .octa 0x00000000b6ca9e20000000009c37c408
+
+        /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+        .octa 0x00000000350249a800000001b51df26c
+
+        /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+        .octa 0x00000001862dac54000000000724b9d0
+
+        /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+        .octa 0x00000001d87fb48c00000001c00532fe
+
+        /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+        .octa 0x00000001f39b699e00000000f05a9362
+
+        /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+        .octa 0x0000000101da11b400000001e1007970
+
+        /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+        .octa 0x00000001cab571e000000000a57366ee
+
+        /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+        .octa 0x00000000c7020cfe0000000192011284
+
+        /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+        .octa 0x00000000cdaed1ae0000000162716d9a
+
+        /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+        .octa 0x00000001e804effc00000000cd97ecde
+
+        /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+        .octa 0x0000000077c3ea3a0000000058812bc0
+
+        /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+        .octa 0x0000000068df31b40000000088b8c12e
+
+        /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+        .octa 0x00000000b059b6c200000001230b234c
+
+        /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+        .octa 0x0000000145fb8ed800000001120b416e
+
+        /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+        .octa 0x00000000cbc0916800000001974aecb0
+
+        /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+        .octa 0x000000005ceeedc2000000008ee3f226
+
+        /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+        .octa 0x0000000047d74e8600000001089aba9a
+
+        /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+        .octa 0x00000001407e9e220000000065113872
+
+        /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+        .octa 0x00000001da967bda000000005c07ec10
+
+        /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+        .octa 0x000000006c8983680000000187590924
+
+        /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+        .octa 0x00000000f2d14c9800000000e35da7c6
+
+        /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+        .octa 0x00000001993c6ad4000000000415855a
+
+        /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+        .octa 0x000000014683d1ac0000000073617758
+
+        /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+        .octa 0x00000001a7c93e6c0000000176021d28
+
+        /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+        .octa 0x000000010211e90a00000001c358fd0a
+
+        /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+        .octa 0x000000001119403e00000001ff7a2c18
+
+        /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+        .octa 0x000000001c3261aa00000000f2d9f7e4
+
+        /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+        .octa 0x000000014e37a634000000016cf1f9c8
+
+        /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+        .octa 0x0000000073786c0c000000010af9279a
+
+        /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+        .octa 0x000000011dc037f80000000004f101e8
+
+        /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+        .octa 0x0000000031433dfc0000000070bcf184
+
+        /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+        .octa 0x000000009cde8348000000000a8de642
+
+        /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+        .octa 0x0000000038d3c2a60000000062ea130c
+
+        /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+        .octa 0x000000011b25f26000000001eb31cbb2
+
+        /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+        .octa 0x000000001629e6f00000000170783448
+
+        /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+        .octa 0x0000000160838b4c00000001a684b4c6
+
+        /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+        .octa 0x000000007a44011c00000000253ca5b4
+
+        /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+        .octa 0x00000000226f417a0000000057b4b1e2
+
+        /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+        .octa 0x0000000045eb2eb400000000b6bd084c
+
+        /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+        .octa 0x000000014459d70c0000000123c2d592
+
+        /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+        .octa 0x00000001d406ed8200000000159dafce
+
+        /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+        .octa 0x0000000160c8e1a80000000127e1a64e
+
+        /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+        .octa 0x0000000027ba80980000000056860754
+
+        /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+        .octa 0x000000006d92d01800000001e661aae8
+
+        /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+        .octa 0x000000012ed7e3f200000000f82c6166
+
+        /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+        .octa 0x000000002dc8778800000000c4f9c7ae
+
+        /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+        .octa 0x0000000018240bb80000000074203d20
+
+        /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+        .octa 0x000000001ad381580000000198173052
+
+        /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+        .octa 0x00000001396b78f200000001ce8aba54
+
+        /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+        .octa 0x000000011a68133400000001850d5d94
+
+        /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+        .octa 0x000000012104732e00000001d609239c
+
+        /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+        .octa 0x00000000a140d90c000000001595f048
+
+        /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+        .octa 0x00000001b7215eda0000000042ccee08
+
+        /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+        .octa 0x00000001aaf1df3c000000010a389d74
+
+        /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+        .octa 0x0000000029d15b8a000000012a840da6
+
+        /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+        .octa 0x00000000f1a96922000000001d181c0c
+
+        /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+        .octa 0x00000001ac80d03c0000000068b7d1f6
+
+        /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+        .octa 0x000000000f11d56a000000005b0f14fc
+
+        /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+        .octa 0x00000001f1c022a20000000179e9e730
+
+        /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+        .octa 0x0000000173d00ae200000001ce1368d6
+
+        /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+        .octa 0x00000001d4ffe4ac0000000112c3a84c
+
+        /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+        .octa 0x000000016edc5ae400000000de940fee
+
+        /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+        .octa 0x00000001f1a0214000000000fe896b7e
+
+        /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+        .octa 0x00000000ca0b28a000000001f797431c
+
+        /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+        .octa 0x00000001928e30a20000000053e989ba
+
+        /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+        .octa 0x0000000097b1b002000000003920cd16
+
+        /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+        .octa 0x00000000b15bf90600000001e6f579b8
+
+        /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+        .octa 0x00000000411c5d52000000007493cb0a
+
+        /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+        .octa 0x00000001c36f330000000001bdd376d8
+
+        /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+        .octa 0x00000001119227e0000000016badfee6
+
+        /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+        .octa 0x00000000114d47020000000071de5c58
+
+        /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+        .octa 0x00000000458b5b9800000000453f317c
+
+        /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+        .octa 0x000000012e31fb8e0000000121675cce
+
+        /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+        .octa 0x000000005cf619d800000001f409ee92
+
+        /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+        .octa 0x0000000063f4d8b200000000f36b9c88
+
+        /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+        .octa 0x000000004138dc8a0000000036b398f4
+
+        /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+        .octa 0x00000001d29ee8e000000001748f9adc
+
+        /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+        .octa 0x000000006a08ace800000001be94ec00
+
+        /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+        .octa 0x0000000127d4201000000000b74370d6
+
+        /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+        .octa 0x0000000019d76b6200000001174d0b98
+
+        /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+        .octa 0x00000001b1471f6e00000000befc06a4
+
+        /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+        .octa 0x00000001f64c19cc00000001ae125288
+
+        /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+        .octa 0x00000000003c0ea00000000095c19b34
+
+        /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+        .octa 0x000000014d73abf600000001a78496f2
+
+        /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+        .octa 0x00000001620eb84400000001ac5390a0
+
+        /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+        .octa 0x0000000147655048000000002a80ed6e
+
+        /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+        .octa 0x0000000067b5077e00000001fa9b0128
+
+        /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+        .octa 0x0000000010ffe20600000001ea94929e
+
+        /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+        .octa 0x000000000fee8f1e0000000125f4305c
+
+        /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+        .octa 0x00000001da26fbae00000001471e2002
+
+        /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+        .octa 0x00000001b3a8bd880000000132d2253a
+
+        /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+        .octa 0x00000000e8f3898e00000000f26b3592
+
+        /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+        .octa 0x00000000b0d0d28c00000000bc8b67b0
+
+        /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+        .octa 0x0000000030f2a798000000013a826ef2
+
+        /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+        .octa 0x000000000fba10020000000081482c84
+
+        /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+        .octa 0x00000000bdb9bd7200000000e77307c2
+
+        /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+        .octa 0x0000000075d3bf5a00000000d4a07ec8
+
+        /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+        .octa 0x00000000ef1f98a00000000017102100
+
+        /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+        .octa 0x00000000689c760200000000db406486
+
+        /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+        .octa 0x000000016d5fa5fe0000000192db7f88
+
+        /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+        .octa 0x00000001d0d2b9ca000000018bf67b1e
+
+        /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+        .octa 0x0000000041e7b470000000007c09163e
+
+        /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+        .octa 0x00000001cbb6495e000000000adac060
+
+        /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+        .octa 0x000000010052a0b000000000bd8316ae
+
+        /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+        .octa 0x00000001d8effb5c000000019f09ab54
+
+        /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+        .octa 0x00000001d969853c0000000125155542
+
+        /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+        .octa 0x00000000523ccce2000000018fdb5882
+
+        /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+        .octa 0x000000001e2436bc00000000e794b3f4
+
+        /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+        .octa 0x00000000ddd1c3a2000000016f9bb022
+
+        /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+        .octa 0x0000000019fcfe3800000000290c9978
+
+        /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+        .octa 0x00000001ce95db640000000083c0f350
+
+        /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+        .octa 0x00000000af5828060000000173ea6628
+
+        /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+        .octa 0x00000001006388f600000001c8b4e00a
+
+        /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+        .octa 0x0000000179eca00a00000000de95d6aa
+
+        /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+        .octa 0x0000000122410a6a000000010b7f7248
+
+        /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+        .octa 0x000000004288e87c00000001326e3a06
+
+        /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+        .octa 0x000000016c5490da00000000bb62c2e6
+
+        /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+        .octa 0x00000000d1c71f6e0000000156a4b2c2
+
+        /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+        .octa 0x00000001b4ce08a6000000011dfe763a
+
+        /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+        .octa 0x00000001466ba60c000000007bcca8e2
+
+        /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+        .octa 0x00000001f6c488a40000000186118faa
+
+        /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+        .octa 0x000000013bfb06820000000111a65a88
+
+        /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+        .octa 0x00000000690e9e54000000003565e1c4
+
+        /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+        .octa 0x00000000281346b6000000012ed02a82
+
+        /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+        .octa 0x000000015646402400000000c486ecfc
+
+        /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+        .octa 0x000000016063a8dc0000000001b951b2
+
+        /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+        .octa 0x0000000116a663620000000048143916
+
+        /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+        .octa 0x000000017e8aa4d200000001dc2ae124
+
+        /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+        .octa 0x00000001728eb10c00000001416c58d6
+
+        /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+        .octa 0x00000001b08fd7fa00000000a479744a
+
+        /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+        .octa 0x00000001092a16e80000000096ca3a26
+
+        /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+        .octa 0x00000000a505637c00000000ff223d4e
+
+        /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+        .octa 0x00000000d94869b2000000010e84da42
+
+        /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+        .octa 0x00000001c8b203ae00000001b61ba3d0
+
+        /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+        .octa 0x000000005704aea000000000680f2de8
+
+        /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+        .octa 0x000000012e295fa2000000008772a9a8
+
+        /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+        .octa 0x000000011d0908bc0000000155f295bc
+
+        /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+        .octa 0x0000000193ed97ea00000000595f9282
+
+        /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+        .octa 0x000000013a0f1c520000000164b1c25a
+
+        /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+        .octa 0x000000010c2c40c000000000fbd67c50
+
+        /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+        .octa 0x00000000ff6fac3e0000000096076268
+
+        /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+        .octa 0x000000017b3609c000000001d288e4cc
+
+        /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+        .octa 0x0000000088c8c92200000001eaac1bdc
+
+        /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+        .octa 0x00000001751baae600000001f1ea39e2
+
+        /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+        .octa 0x000000010795297200000001eb6506fc
+
+        /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+        .octa 0x0000000162b00abe000000010f806ffe
+
+        /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+        .octa 0x000000000d7b404c000000010408481e
+
+        /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+        .octa 0x00000000763b13d40000000188260534
+
+        /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+        .octa 0x00000000f6dc22d80000000058fc73e0
+
+        /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+        .octa 0x000000007daae06000000000391c59b8
+
+        /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+        .octa 0x000000013359ab7c000000018b638400
+
+        /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+        .octa 0x000000008add438a000000011738f5c4
+
+        /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+        .octa 0x00000001edbefdea000000008cf7c6da
+
+        /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+        .octa 0x000000004104e0f800000001ef97fb16
+
+        /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+        .octa 0x00000000b48a82220000000102130e20
+
+        /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+        .octa 0x00000001bcb4684400000000db968898
+
+        /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+        .octa 0x000000013293ce0a00000000b5047b5e
+
+        /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+        .octa 0x00000001710d0844000000010b90fdb2
+
+        /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+        .octa 0x0000000117907f6e000000004834a32e
+
+        /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+        .octa 0x0000000087ddf93e0000000059c8f2b0
+
+        /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+        .octa 0x000000005970e9b00000000122cec508
+
+        /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+        .octa 0x0000000185b2b7d0000000000a330cda
+
+        /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+        .octa 0x00000001dcee0efc000000014a47148c
+
+        /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+        .octa 0x0000000030da27220000000042c61cb8
+
+        /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+        .octa 0x000000012f925a180000000012fe6960
+
+        /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+        .octa 0x00000000dd2e357c00000000dbda2c20
+
+        /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+        .octa 0x00000000071c80de000000011122410c
+
+        /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+        .octa 0x000000011513140a00000000977b2070
+
+        /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+        .octa 0x00000001df876e8e000000014050438e
+
+        /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+        .octa 0x000000015f81d6ce0000000147c840e8
+
+        /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+        .octa 0x000000019dd94dbe00000001cc7c88ce
+
+        /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+        .octa 0x00000001373d206e00000001476b35a4
+
+        /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+        .octa 0x00000000668ccade000000013d52d508
+
+        /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+        .octa 0x00000001b192d268000000008e4be32e
+
+        /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+        .octa 0x00000000e30f3a7800000000024120fe
+
+        /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+        .octa 0x000000010ef1f7bc00000000ddecddb4
+
+        /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+        .octa 0x00000001f5ac738000000000d4d403bc
+
+        /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+        .octa 0x000000011822ea7000000001734b89aa
+
+        /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+        .octa 0x00000000c3a33848000000010e7a58d6
+
+        /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+        .octa 0x00000001bd151c2400000001f9f04e9c
+
+        /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+        .octa 0x0000000056002d7600000000b692225e
+
+        /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+        .octa 0x000000014657c4f4000000019b8d3f3e
+
+        /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+        .octa 0x0000000113742d7c00000001a874f11e
+
+        /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+        .octa 0x000000019c5920ba000000010d5a4254
+
+        /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+        .octa 0x000000005216d2d600000000bbb2f5d6
+
+        /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+        .octa 0x0000000136f5ad8a0000000179cc0e36
+
+        /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+        .octa 0x000000018b07beb600000001dca1da4a
+
+        /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+        .octa 0x00000000db1e93b000000000feb1a192
+
+        /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+        .octa 0x000000000b96fa3a00000000d1eeedd6
+
+        /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+        .octa 0x00000001d9968af0000000008fad9bb4
+
+        /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+        .octa 0x000000000e4a77a200000001884938e4
+
+        /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+        .octa 0x00000000508c2ac800000001bc2e9bc0
+
+        /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+        .octa 0x0000000021572a8000000001f9658a68
+
+        /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+        .octa 0x00000001b859daf2000000001b9224fc
+
+        /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+        .octa 0x000000016f7884740000000055b2fb84
+
+        /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+        .octa 0x00000001b438810e000000018b090348
+
+        /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+        .octa 0x0000000095ddc6f2000000011ccbd5ea
+
+        /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+        .octa 0x00000001d977c20c0000000007ae47f8
+
+        /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+        .octa 0x00000000ebedb99a0000000172acbec0
+
+        /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+        .octa 0x00000001df9e9e9200000001c6e3ff20
+
+        /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+        .octa 0x00000001a4a3f95200000000e1b38744
+
+        /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+        .octa 0x00000000e2f5122000000000791585b2
+
+        /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+        .octa 0x000000004aa01f3e00000000ac53b894
+
+        /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+        .octa 0x00000000b3e90a5800000001ed5f2cf4
+
+        /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+        .octa 0x000000000c9ca2aa00000001df48b2e0
+
+        /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+        .octa 0x000000015168231600000000049c1c62
+
+        /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+        .octa 0x0000000036fce78c000000017c460c12
+
+        /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+        .octa 0x000000009037dc10000000015be4da7e
+
+        /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+        .octa 0x00000000d3298582000000010f38f668
+
+        /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+        .octa 0x00000001b42e8ad60000000039f40a00
+
+        /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+        .octa 0x00000000142a983800000000bd4c10c4
+
+        /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+        .octa 0x0000000109c7f1900000000042db1d98
+
+        /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+        .octa 0x0000000056ff931000000001c905bae6
+
+        /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+        .octa 0x00000001594513aa00000000069d40ea
+
+        /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+        .octa 0x00000001e3b5b1e8000000008e4fbad0
+
+        /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+        .octa 0x000000011dd5fc080000000047bedd46
+
+        /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+        .octa 0x00000001675f0cc20000000026396bf8
+
+        /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+        .octa 0x00000000d1c8dd4400000000379beb92
+
+        /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+        .octa 0x0000000115ebd3d8000000000abae54a
+
+        /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+        .octa 0x00000001ecbd0dac0000000007e6a128
+
+        /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+        .octa 0x00000000cdf67af2000000000ade29d2
+
+        /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+        .octa 0x000000004c01ff4c00000000f974c45c
+
+        /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+        .octa 0x00000000f2d8657e00000000e77ac60a
+
+        /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+        .octa 0x000000006bae74c40000000145895816
+
+        /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+        .octa 0x0000000152af8aa00000000038e362be
+
+        /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+        .octa 0x0000000004663802000000007f991a64
+
+        /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+        .octa 0x00000001ab2f5afc00000000fa366d3a
+
+        /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+        .octa 0x0000000074a4ebd400000001a2bb34f0
+
+        /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+        .octa 0x00000001d7ab3a4c0000000028a9981e
+
+        /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+        .octa 0x00000001a8da60c600000001dbc672be
+
+        /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+        .octa 0x000000013cf6382000000000b04d77f6
+
+        /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+        .octa 0x00000000bec12e1e0000000124400d96
+
+        /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+        .octa 0x00000001c6368010000000014ca4b414
+
+        /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+        .octa 0x00000001e6e78758000000012fe2c938
+
+        /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+        .octa 0x000000008d7f2b3c00000001faed01e6
+
+        /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+        .octa 0x000000016b4a156e000000007e80ecfe
+
+        /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+        .octa 0x00000001c63cfeb60000000098daee94
+
+        /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+        .octa 0x000000015f902670000000010a04edea
+
+        /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+        .octa 0x00000001cd5de11e00000001c00b4524
+
+        /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+        .octa 0x000000001acaec540000000170296550
+
+        /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+        .octa 0x000000002bd0ca780000000181afaa48
+
+        /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+        .octa 0x0000000032d63d5c0000000185a31ffa
+
+        /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+        .octa 0x000000001c6d4e4c000000002469f608
+
+        /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+        .octa 0x0000000106a60b92000000006980102a
+
+        /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+        .octa 0x00000000d3855e120000000111ea9ca8
+
+        /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+        .octa 0x00000000e312563600000001bd1d29ce
+
+        /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+        .octa 0x000000009e8f7ea400000001b34b9580
+
+        /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+        .octa 0x00000001c82e562c000000003076054e
+
+        /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+        .octa 0x00000000ca9f09ce000000012a608ea4
+
+        /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+        .octa 0x00000000c63764e600000000784d05fe
+
+        /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+        .octa 0x0000000168d2e49e000000016ef0d82a
+
+        /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+        .octa 0x00000000e986c1480000000075bda454
+
+        /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+        .octa 0x00000000cfb65894000000003dc0a1c4
+
+        /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+        .octa 0x0000000111cadee400000000e9a5d8be
+
+        /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+        .octa 0x0000000171fb63ce00000001609bc4b4
+
+        .short_constants :
+
+        /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include
+           the trailing 32 bits of zeros */
+        /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod
+           p(x)` */
+        .octa 0x7fec2963e5bf80485cf015c388e56f72
+
+        /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod
+           p(x)` */
+        .octa 0x38e888d4844752a9963a18920246e2e6
+
+        /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod
+           p(x)` */
+        .octa 0x42316c00730206ad419a441956993a31
+
+        /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod
+           p(x)` */
+        .octa 0x543d5c543e65ddf9924752ba2b830011
+
+        /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod
+           p(x)` */
+        .octa 0x78e87aaf56767c9255bd7f9518e4a304
+
+        /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod
+           p(x)` */
+        .octa 0x8f68fcec1903da7f6d76739fe0553f1e
+
+        /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod
+           p(x)` */
+        .octa 0x3f4840246791d588c133722b1fe0b5c3
+
+        /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod
+           p(x)` */
+        .octa 0x34c96751b04de25a64b67ee0e55ef1f3
+
+        /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)`
+         */
+        .octa 0x156c8e180b4a395b069db049b8fdb1e7
+
+        /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
+        .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
+
+        /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
+        .octa 0x041d37768cd75659817cdc5119b29a35
+
+        /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
+        .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
+
+        /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
+        .octa 0x0e148e8252377a554f256efcb82be955
+
+        /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
+        .octa 0x9c25531d19e65ddeec1631edb2dea967
+
+        /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
+        .octa 0x790606ff9957c0a65d27e147510ac59a
+
+        /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
+        .octa 0x82f63b786ea2d55ca66805eb18b8ea18
+
+        .barrett_constants :
+        /* 33 bit reflected Barrett constant m - (4^32)/n */
+        .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
+        /* 33 bit reflected Barrett constant n */
+        .octa 0x00000000000000000000000105ec76f1
+#endif
diff --git a/src/rocksdb/util/crc32c_test.cc b/src/rocksdb/util/crc32c_test.cc
new file mode 100644
index 000000000..715d63e2d
--- /dev/null
+++ b/src/rocksdb/util/crc32c_test.cc
@@ -0,0 +1,213 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "util/crc32c.h"
+
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace crc32c {
+
+class CRC {};
+
+// Tests for 3-way crc32c algorithm. We need these tests because it uses
+// different lookup tables than the original Fast_CRC32
+const unsigned int BUFFER_SIZE = 512 * 1024 * sizeof(uint64_t);
+char buffer[BUFFER_SIZE];
+
+struct ExpectedResult {
+  size_t offset;
+  size_t length;
+  uint32_t crc32c;
+};
+
+ExpectedResult expectedResults[] = {
+    // Zero-byte input
+    {0, 0, ~0U},
+    // Small aligned inputs to test special cases in SIMD implementations
+    {8, 1, 1543413366},
+    {8, 2, 523493126},
+    {8, 3, 1560427360},
+    {8, 4, 3422504776},
+    {8, 5, 447841138},
+    {8, 6, 3910050499},
+    {8, 7, 3346241981},
+    // Small unaligned inputs
+    {9, 1, 3855826643},
+    {10, 2, 560880875},
+    {11, 3, 1479707779},
+    {12, 4, 2237687071},
+    {13, 5, 4063855784},
+    {14, 6, 2553454047},
+    {15, 7, 1349220140},
+    // Larger inputs to test leftover chunks at the end of aligned blocks
+    {8, 8, 627613930},
+    {8, 9, 2105929409},
+    {8, 10, 2447068514},
+    {8, 11, 863807079},
+    {8, 12, 292050879},
+    {8, 13, 1411837737},
+    {8, 14, 2614515001},
+    {8, 15, 3579076296},
+    {8, 16, 2897079161},
+    {8, 17, 675168386},
+    // // Much larger inputs
+    {0, BUFFER_SIZE, 2096790750},
+    {1, BUFFER_SIZE / 2, 3854797577},
+
+};
+
+TEST(CRC, StandardResults) {
+  // Original Fast_CRC32 tests.
+  // From rfc3720 section B.4.
+  char buf[32];
+
+  memset(buf, 0, sizeof(buf));
+  ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf)));
+
+  memset(buf, 0xff, sizeof(buf));
+  ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = static_cast<char>(i);
+  }
+  ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = static_cast<char>(31 - i);
+  }
+  ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf)));
+
+  unsigned char data[48] = {
+      0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00,
+      0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  };
+  ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+
+  // 3-Way Crc32c tests ported from folly.
+  // Test 1: single computation
+  for (auto expected : expectedResults) {
+    uint32_t result = Value(buffer + expected.offset, expected.length);
+    EXPECT_EQ(~expected.crc32c, result);
+  }
+
+  // Test 2: stitching two computations
+  for (auto expected : expectedResults) {
+    size_t partialLength = expected.length / 2;
+    uint32_t partialChecksum = Value(buffer + expected.offset, partialLength);
+    uint32_t result =
+        Extend(partialChecksum, buffer + expected.offset + partialLength,
+               expected.length - partialLength);
+    EXPECT_EQ(~expected.crc32c, result);
+  }
+}
+
+TEST(CRC, Values) { ASSERT_NE(Value("a", 1), Value("foo", 3)); }
+
+TEST(CRC, Extend) {
+  ASSERT_EQ(Value("hello world", 11), Extend(Value("hello ", 6), "world", 5));
+}
+
+TEST(CRC, Mask) {
+  uint32_t crc = Value("foo", 3);
+  ASSERT_NE(crc, Mask(crc));
+  ASSERT_NE(crc, Mask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
+}
+
+TEST(CRC, Crc32cCombineBasicTest) {
+  uint32_t crc1 = Value("hello ", 6);
+  uint32_t crc2 = Value("world", 5);
+  uint32_t crc3 = Value("hello world", 11);
+  uint32_t crc1_2_combine = Crc32cCombine(crc1, crc2, 5);
+  ASSERT_EQ(crc3, crc1_2_combine);
+}
+
+TEST(CRC, Crc32cCombineOrderMattersTest) {
+  uint32_t crc1 = Value("hello ", 6);
+  uint32_t crc2 = Value("world", 5);
+  uint32_t crc3 = Value("hello world", 11);
+  uint32_t crc2_1_combine = Crc32cCombine(crc2, crc1, 6);
+  ASSERT_NE(crc3, crc2_1_combine);
+}
+
+TEST(CRC, Crc32cCombineFullCoverTest) {
+  int scale = 4 * 1024;
+  Random rnd(test::RandomSeed());
+  int size_1 = 1024 * 1024;
+  std::string s1 = rnd.RandomBinaryString(size_1);
+  uint32_t crc1 = Value(s1.data(), size_1);
+  for (int i = 0; i < scale; i++) {
+    int size_2 = i;
+    std::string s2 = rnd.RandomBinaryString(size_2);
+    uint32_t crc2 = Value(s2.data(), s2.size());
+    uint32_t crc1_2 = Extend(crc1, s2.data(), s2.size());
+    uint32_t crc1_2_combine = Crc32cCombine(crc1, crc2, size_2);
+    ASSERT_EQ(crc1_2, crc1_2_combine);
+  }
+}
+
+TEST(CRC, Crc32cCombineBigSizeTest) {
+  Random rnd(test::RandomSeed());
+  int size_1 = 1024 * 1024;
+  std::string s1 = rnd.RandomBinaryString(size_1);
+  uint32_t crc1 = Value(s1.data(), size_1);
+  int size_2 = 16 * 1024 * 1024 - 1;
+  std::string s2 = rnd.RandomBinaryString(size_2);
+  uint32_t crc2 = Value(s2.data(), s2.size());
+  uint32_t crc1_2 = Extend(crc1, s2.data(), s2.size());
+  uint32_t crc1_2_combine = Crc32cCombine(crc1, crc2, size_2);
+  ASSERT_EQ(crc1_2, crc1_2_combine);
+}
+
+}  // namespace crc32c
+}  // namespace ROCKSDB_NAMESPACE
+
+// copied from folly
+const uint64_t FNV_64_HASH_START = 14695981039346656037ULL;
+inline uint64_t fnv64_buf(const void* buf, size_t n,
+                          uint64_t hash = FNV_64_HASH_START) {
+  // forcing signed char, since other platforms can use unsigned
+  const signed char* char_buf = reinterpret_cast<const signed char*>(buf);
+
+  for (size_t i = 0; i < n; ++i) {
+    hash += (hash << 1) + (hash << 4) + (hash << 5) + (hash << 7) +
+            (hash << 8) + (hash << 40);
+    hash ^= char_buf[i];
+  }
+  return hash;
+}
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // Populate a buffer with a deterministic pattern
+  // on which to compute checksums
+
+  const uint8_t* src = (uint8_t*)ROCKSDB_NAMESPACE::crc32c::buffer;
+  uint64_t* dst = (uint64_t*)ROCKSDB_NAMESPACE::crc32c::buffer;
+  const uint64_t* end =
+      (const uint64_t*)(ROCKSDB_NAMESPACE::crc32c::buffer +
+                        ROCKSDB_NAMESPACE::crc32c::BUFFER_SIZE);
+  *dst++ = 0;
+  while (dst < end) {
+    ROCKSDB_NAMESPACE::EncodeFixed64(
+        reinterpret_cast<char*>(dst),
+        fnv64_buf((const char*)src, sizeof(uint64_t)));
+    dst++;
+    src += sizeof(uint64_t);
+  }
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/defer.h b/src/rocksdb/util/defer.h
new file mode 100644
index 000000000..f71e67ba9
--- /dev/null
+++ b/src/rocksdb/util/defer.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Defers the execution of the provided function until the Defer
+// object goes out of scope.
+//
+// Usage example:
+//
+// Status DeferTest() {
+//   Status s;
+//   Defer defer([&s]() {
+//     if (!s.ok()) {
+//       // do cleanups ...
+//     }
+//   });
+//   // do something ...
+//   if (!s.ok()) return;
+//   // do some other things ...
+//   return s;
+// }
+//
+// The above code ensures that cleanups will always happen on returning.
+//
+// Without the help of Defer, you can
+// 1. every time when !s.ok(), do the cleanup;
+// 2. instead of returning when !s.ok(), continue the work only when s.ok(),
+//    but sometimes, this might lead to nested blocks of "if (s.ok()) {...}".
+//
+// With the help of Defer, you can centralize the cleanup logic inside the
+// lambda passed to Defer, and you can return immediately on failure when
+// necessary.
+class Defer final {
+ public:
+  explicit Defer(std::function<void()>&& fn) : fn_(std::move(fn)) {}
+  ~Defer() { fn_(); }
+
+  // Disallow copy.
+  Defer(const Defer&) = delete;
+  Defer& operator=(const Defer&) = delete;
+
+ private:
+  std::function<void()> fn_;
+};
+
+// An RAII utility object that saves the current value of an object so that
+// it can be overwritten, and restores it to the saved value when the
+// SaveAndRestore object goes out of scope.
+template <typename T>
+class SaveAndRestore {
+ public:
+  // obj is non-null pointer to value to be saved and later restored.
+  explicit SaveAndRestore(T* obj) : obj_(obj), saved_(*obj) {}
+  // new_value is stored in *obj
+  SaveAndRestore(T* obj, const T& new_value)
+      : obj_(obj), saved_(std::move(*obj)) {
+    *obj = new_value;
+  }
+  SaveAndRestore(T* obj, T&& new_value) : obj_(obj), saved_(std::move(*obj)) {
+    *obj = std::move(new_value);
+  }
+  ~SaveAndRestore() { *obj_ = std::move(saved_); }
+
+  // No copies
+  SaveAndRestore(const SaveAndRestore&) = delete;
+  SaveAndRestore& operator=(const SaveAndRestore&) = delete;
+
+ private:
+  T* const obj_;
+  T saved_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/defer_test.cc b/src/rocksdb/util/defer_test.cc
new file mode 100644
index 000000000..0e98f68b6
--- /dev/null
+++ b/src/rocksdb/util/defer_test.cc
@@ -0,0 +1,51 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/defer.h"
+
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeferTest {};
+
+TEST(DeferTest, BlockScope) {
+  int v = 1;
+  {
+    Defer defer([&v]() { v *= 2; });
+  }
+  ASSERT_EQ(2, v);
+}
+
+TEST(DeferTest, FunctionScope) {
+  int v = 1;
+  auto f = [&v]() {
+    Defer defer([&v]() { v *= 2; });
+    v = 2;
+  };
+  f();
+  ASSERT_EQ(4, v);
+}
+
+TEST(SaveAndRestoreTest, BlockScope) {
+  int v = 1;
+  {
+    SaveAndRestore<int> sr(&v);
+    ASSERT_EQ(v, 1);
+    v = 2;
+    ASSERT_EQ(v, 2);
+  }
+  ASSERT_EQ(v, 1);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/distributed_mutex.h b/src/rocksdb/util/distributed_mutex.h
new file mode 100644
index 000000000..9675a1e2d
--- /dev/null
+++ b/src/rocksdb/util/distributed_mutex.h
@@ -0,0 +1,48 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+// This file declares a wrapper around the efficient folly DistributedMutex
+// that falls back on a standard mutex when not available. See
+// https://github.com/facebook/folly/blob/main/folly/synchronization/DistributedMutex.h
+// for benefits and limitations.
+
+// At the moment, only scoped locking is supported using DMutexLock
+// RAII wrapper, because lock/unlock APIs will vary.
+
+#ifdef USE_FOLLY
+
+#include <folly/synchronization/DistributedMutex.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+class DMutex : public folly::DistributedMutex {
+ public:
+  static const char* kName() { return "folly::DistributedMutex"; }
+
+  explicit DMutex(bool IGNORED_adaptive = false) { (void)IGNORED_adaptive; }
+
+  // currently no-op
+  void AssertHeld() {}
+};
+using DMutexLock = std::lock_guard<folly::DistributedMutex>;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#else
+
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using DMutex = port::Mutex;
+using DMutexLock = std::lock_guard<DMutex>;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/util/duplicate_detector.h b/src/rocksdb/util/duplicate_detector.h
new file mode 100644
index 000000000..d778622db
--- /dev/null
+++ b/src/rocksdb/util/duplicate_detector.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "util/set_comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+// During recovery if the memtable is flushed we cannot rely on its help on
+// duplicate key detection and as key insert will not be attempted. This class
+// will be used as a emulator of memtable to tell if insertion of a key/seq
+// would have resulted in duplication.
+class DuplicateDetector {
+ public:
+  explicit DuplicateDetector(DBImpl* db) : db_(db) {}
+  bool IsDuplicateKeySeq(uint32_t cf, const Slice& key, SequenceNumber seq) {
+    assert(seq >= batch_seq_);
+    if (batch_seq_ != seq) {  // it is a new batch
+      keys_.clear();
+    }
+    batch_seq_ = seq;
+    CFKeys& cf_keys = keys_[cf];
+    if (cf_keys.size() == 0) {  // just inserted
+      InitWithComp(cf);
+    }
+    auto it = cf_keys.insert(key);
+    if (it.second == false) {  // second is false if a element already existed.
+      keys_.clear();
+      InitWithComp(cf);
+      keys_[cf].insert(key);
+      return true;
+    }
+    return false;
+  }
+
+ private:
+  SequenceNumber batch_seq_ = 0;
+  DBImpl* db_;
+  using CFKeys = std::set<Slice, SetComparator>;
+  std::map<uint32_t, CFKeys> keys_;
+  void InitWithComp(const uint32_t cf) {
+    auto h = db_->GetColumnFamilyHandle(cf);
+    if (!h) {
+      // TODO(myabandeh): This is not a concern in MyRocks as drop cf is not
+      // implemented yet. When it does, we should return proper error instead
+      // of throwing exception.
+      ROCKS_LOG_FATAL(
+          db_->immutable_db_options().info_log,
+          "Recovering an entry from the dropped column family %" PRIu32
+          ". WAL must must have been emptied before dropping the column "
+          "family",
+          cf);
+#ifndef ROCKSDB_LITE
+      throw std::runtime_error(
+          "Recovering an entry from a dropped column family. "
+          "WAL must must have been flushed before dropping the column "
+          "family");
+#endif
+      return;
+    }
+    auto cmp = h->GetComparator();
+    keys_[cf] = CFKeys(SetComparator(cmp));
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/dynamic_bloom.cc b/src/rocksdb/util/dynamic_bloom.cc
new file mode 100644
index 000000000..0ff3b4a75
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "dynamic_bloom.h"
+
+#include <algorithm>
+
+#include "memory/allocator.h"
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint32_t roundUpToPow2(uint32_t x) {
+  uint32_t rv = 1;
+  while (rv < x) {
+    rv <<= 1;
+  }
+  return rv;
+}
+}  // namespace
+
+DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
+                           uint32_t num_probes, size_t huge_page_tlb_size,
+                           Logger* logger)
+    // Round down, except round up with 1
+    : kNumDoubleProbes((num_probes + (num_probes == 1)) / 2) {
+  assert(num_probes % 2 == 0);  // limitation of current implementation
+  assert(num_probes <= 10);     // limitation of current implementation
+  assert(kNumDoubleProbes > 0);
+
+  // Determine how much to round off + align by so that x ^ i (that's xor) is
+  // a valid u64 index if x is a valid u64 index and 0 <= i < kNumDoubleProbes.
+  uint32_t block_bytes = /*bytes/u64*/ 8 *
+                         /*u64s*/ std::max(1U, roundUpToPow2(kNumDoubleProbes));
+  uint32_t block_bits = block_bytes * 8;
+  uint32_t blocks = (total_bits + block_bits - 1) / block_bits;
+  uint32_t sz = blocks * block_bytes;
+  kLen = sz / /*bytes/u64*/ 8;
+  assert(kLen > 0);
+#ifndef NDEBUG
+  for (uint32_t i = 0; i < kNumDoubleProbes; ++i) {
+    // Ensure probes starting at last word are in range
+    assert(((kLen - 1) ^ i) < kLen);
+  }
+#endif
+
+  // Padding to correct for allocation not originally aligned on block_bytes
+  // boundary
+  sz += block_bytes - 1;
+  assert(allocator);
+
+  char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger);
+  memset(raw, 0, sz);
+  auto block_offset = reinterpret_cast<uintptr_t>(raw) % block_bytes;
+  if (block_offset > 0) {
+    // Align on block_bytes boundary
+    raw += block_bytes - block_offset;
+  }
+  static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
+                "Expecting zero-space-overhead atomic");
+  data_ = reinterpret_cast<std::atomic<uint64_t>*>(raw);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/dynamic_bloom.h b/src/rocksdb/util/dynamic_bloom.h
new file mode 100644
index 000000000..40cd29404
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom.h
@@ -0,0 +1,214 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <atomic>
+#include <memory>
+#include <string>
+
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "table/multiget_context.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+class Allocator;
+class Logger;
+
+// A Bloom filter intended only to be used in memory, never serialized in a way
+// that could lead to schema incompatibility. Supports opt-in lock-free
+// concurrent access.
+//
+// This implementation is also intended for applications generally preferring
+// speed vs. maximum accuracy: roughly 0.9x BF op latency for 1.1x FP rate.
+// For 1% FP rate, that means that the latency of a look-up triggered by an FP
+// should be less than roughly 100x the cost of a Bloom filter op.
+//
+// For simplicity and performance, the current implementation requires
+// num_probes to be a multiple of two and <= 10.
+//
+class DynamicBloom {
+ public:
+  // allocator: pass allocator to bloom filter, hence trace the usage of memory
+  // total_bits: fixed total bits for the bloom
+  // num_probes: number of hash probes for a single key
+  // hash_func:  customized hash function
+  // huge_page_tlb_size:  if >0, try to allocate bloom bytes from huge page TLB
+  //                      within this page size. Need to reserve huge pages for
+  //                      it to be allocated, like:
+  //                         sysctl -w vm.nr_hugepages=20
+  //                     See linux doc Documentation/vm/hugetlbpage.txt
+  explicit DynamicBloom(Allocator* allocator, uint32_t total_bits,
+                        uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
+                        Logger* logger = nullptr);
+
+  ~DynamicBloom() {}
+
+  // Assuming single threaded access to this function.
+  void Add(const Slice& key);
+
+  // Like Add, but may be called concurrent with other functions.
+  void AddConcurrently(const Slice& key);
+
+  // Assuming single threaded access to this function.
+  void AddHash(uint32_t hash);
+
+  // Like AddHash, but may be called concurrent with other functions.
+  void AddHashConcurrently(uint32_t hash);
+
+  // Multithreaded access to this function is OK
+  bool MayContain(const Slice& key) const;
+
+  void MayContain(int num_keys, Slice* keys, bool* may_match) const;
+
+  // Multithreaded access to this function is OK
+  bool MayContainHash(uint32_t hash) const;
+
+  void Prefetch(uint32_t h);
+
+ private:
+  // Length of the structure, in 64-bit words. For this structure, "word"
+  // will always refer to 64-bit words.
+  uint32_t kLen;
+  // We make the k probes in pairs, two for each 64-bit read/write. Thus,
+  // this stores k/2, the number of words to double-probe.
+  const uint32_t kNumDoubleProbes;
+
+  std::atomic<uint64_t>* data_;
+
+  // or_func(ptr, mask) should effect *ptr |= mask with the appropriate
+  // concurrency safety, working with bytes.
+  template <typename OrFunc>
+  void AddHash(uint32_t hash, const OrFunc& or_func);
+
+  bool DoubleProbe(uint32_t h32, size_t a) const;
+};
+
+inline void DynamicBloom::Add(const Slice& key) { AddHash(BloomHash(key)); }
+
+inline void DynamicBloom::AddConcurrently(const Slice& key) {
+  AddHashConcurrently(BloomHash(key));
+}
+
+inline void DynamicBloom::AddHash(uint32_t hash) {
+  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
+    ptr->store(ptr->load(std::memory_order_relaxed) | mask,
+               std::memory_order_relaxed);
+  });
+}
+
+inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
+  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
+    // Happens-before between AddHash and MaybeContains is handled by
+    // access to versions_->LastSequence(), so all we have to do here is
+    // avoid races (so we don't give the compiler a license to mess up
+    // our code) and not lose bits.  std::memory_order_relaxed is enough
+    // for that.
+    if ((mask & ptr->load(std::memory_order_relaxed)) != mask) {
+      ptr->fetch_or(mask, std::memory_order_relaxed);
+    }
+  });
+}
+
+inline bool DynamicBloom::MayContain(const Slice& key) const {
+  return (MayContainHash(BloomHash(key)));
+}
+
+inline void DynamicBloom::MayContain(int num_keys, Slice* keys,
+                                     bool* may_match) const {
+  std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
+  std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
+  for (int i = 0; i < num_keys; ++i) {
+    hashes[i] = BloomHash(keys[i]);
+    size_t a = FastRange32(kLen, hashes[i]);
+    PREFETCH(data_ + a, 0, 3);
+    byte_offsets[i] = a;
+  }
+
+  for (int i = 0; i < num_keys; i++) {
+    may_match[i] = DoubleProbe(hashes[i], byte_offsets[i]);
+  }
+}
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// local variable is initialized but not referenced
+#pragma warning(disable : 4189)
+#endif
+inline void DynamicBloom::Prefetch(uint32_t h32) {
+  size_t a = FastRange32(kLen, h32);
+  PREFETCH(data_ + a, 0, 3);
+}
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+// Speed hacks in this implementation:
+// * Uses fastrange instead of %
+// * Minimum logic to determine first (and all) probed memory addresses.
+//   (Uses constant bit-xor offsets from the starting probe address.)
+// * (Major) Two probes per 64-bit memory fetch/write.
+//   Code simplification / optimization: only allow even number of probes.
+// * Very fast and effective (murmur-like) hash expansion/re-mixing. (At
+// least on recent CPUs, integer multiplication is very cheap. Each 64-bit
+// remix provides five pairs of bit addresses within a uint64_t.)
+//   Code simplification / optimization: only allow up to 10 probes, from a
+//   single 64-bit remix.
+//
+// The FP rate penalty for this implementation, vs. standard Bloom filter, is
+// roughly 1.12x on top of the 1.15x penalty for a 512-bit cache-local Bloom.
+// This implementation does not explicitly use the cache line size, but is
+// effectively cache-local (up to 16 probes) because of the bit-xor offsetting.
+//
+// NB: could easily be upgraded to support a 64-bit hash and
+// total_bits > 2^32 (512MB). (The latter is a bad idea without the former,
+// because of false positives.)
+
+inline bool DynamicBloom::MayContainHash(uint32_t h32) const {
+  size_t a = FastRange32(kLen, h32);
+  PREFETCH(data_ + a, 0, 3);
+  return DoubleProbe(h32, a);
+}
+
+inline bool DynamicBloom::DoubleProbe(uint32_t h32, size_t byte_offset) const {
+  // Expand/remix with 64-bit golden ratio
+  uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
+  for (unsigned i = 0;; ++i) {
+    // Two bit probes per uint64_t probe
+    uint64_t mask =
+        ((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63));
+    uint64_t val = data_[byte_offset ^ i].load(std::memory_order_relaxed);
+    if (i + 1 >= kNumDoubleProbes) {
+      return (val & mask) == mask;
+    } else if ((val & mask) != mask) {
+      return false;
+    }
+    h = (h >> 12) | (h << 52);
+  }
+}
+
+template <typename OrFunc>
+inline void DynamicBloom::AddHash(uint32_t h32, const OrFunc& or_func) {
+  size_t a = FastRange32(kLen, h32);
+  PREFETCH(data_ + a, 0, 3);
+  // Expand/remix with 64-bit golden ratio
+  uint64_t h = 0x9e3779b97f4a7c13ULL * h32;
+  for (unsigned i = 0;; ++i) {
+    // Two bit probes per uint64_t probe
+    uint64_t mask =
+        ((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63));
+    or_func(&data_[a ^ i], mask);
+    if (i + 1 >= kNumDoubleProbes) {
+      return;
+    }
+    h = (h >> 12) | (h << 52);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/dynamic_bloom_test.cc b/src/rocksdb/util/dynamic_bloom_test.cc
new file mode 100644
index 000000000..925c5479a
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom_test.cc
@@ -0,0 +1,325 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+  return 0;
+}
+#else
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "dynamic_bloom.h"
+#include "memory/arena.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+#include "util/stop_watch.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(bits_per_key, 10, "");
+DEFINE_int32(num_probes, 6, "");
+DEFINE_bool(enable_perf, false, "");
+
+namespace ROCKSDB_NAMESPACE {
+
+struct KeyMaker {
+  uint64_t a;
+  uint64_t b;
+
+  // Sequential, within a hash function block
+  inline Slice Seq(uint64_t i) {
+    a = i;
+    return Slice(reinterpret_cast<char *>(&a), sizeof(a));
+  }
+  // Not quite sequential, varies across hash function blocks
+  inline Slice Nonseq(uint64_t i) {
+    a = i;
+    b = i * 123;
+    return Slice(reinterpret_cast<char *>(this), sizeof(*this));
+  }
+  inline Slice Key(uint64_t i, bool nonseq) {
+    return nonseq ? Nonseq(i) : Seq(i);
+  }
+};
+
+class DynamicBloomTest : public testing::Test {};
+
+TEST_F(DynamicBloomTest, EmptyFilter) {
+  Arena arena;
+  DynamicBloom bloom1(&arena, 100, 2);
+  ASSERT_TRUE(!bloom1.MayContain("hello"));
+  ASSERT_TRUE(!bloom1.MayContain("world"));
+
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
+  ASSERT_TRUE(!bloom2.MayContain("hello"));
+  ASSERT_TRUE(!bloom2.MayContain("world"));
+}
+
+TEST_F(DynamicBloomTest, Small) {
+  Arena arena;
+  DynamicBloom bloom1(&arena, 100, 2);
+  bloom1.Add("hello");
+  bloom1.Add("world");
+  ASSERT_TRUE(bloom1.MayContain("hello"));
+  ASSERT_TRUE(bloom1.MayContain("world"));
+  ASSERT_TRUE(!bloom1.MayContain("x"));
+  ASSERT_TRUE(!bloom1.MayContain("foo"));
+
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
+  bloom2.Add("hello");
+  bloom2.Add("world");
+  ASSERT_TRUE(bloom2.MayContain("hello"));
+  ASSERT_TRUE(bloom2.MayContain("world"));
+  ASSERT_TRUE(!bloom2.MayContain("x"));
+  ASSERT_TRUE(!bloom2.MayContain("foo"));
+}
+
+TEST_F(DynamicBloomTest, SmallConcurrentAdd) {
+  Arena arena;
+  DynamicBloom bloom1(&arena, 100, 2);
+  bloom1.AddConcurrently("hello");
+  bloom1.AddConcurrently("world");
+  ASSERT_TRUE(bloom1.MayContain("hello"));
+  ASSERT_TRUE(bloom1.MayContain("world"));
+  ASSERT_TRUE(!bloom1.MayContain("x"));
+  ASSERT_TRUE(!bloom1.MayContain("foo"));
+
+  DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2);
+  bloom2.AddConcurrently("hello");
+  bloom2.AddConcurrently("world");
+  ASSERT_TRUE(bloom2.MayContain("hello"));
+  ASSERT_TRUE(bloom2.MayContain("world"));
+  ASSERT_TRUE(!bloom2.MayContain("x"));
+  ASSERT_TRUE(!bloom2.MayContain("foo"));
+}
+
+static uint32_t NextNum(uint32_t num) {
+  if (num < 10) {
+    num += 1;
+  } else if (num < 100) {
+    num += 10;
+  } else if (num < 1000) {
+    num += 100;
+  } else {
+    num = num * 26 / 10;
+  }
+  return num;
+}
+
+TEST_F(DynamicBloomTest, VaryingLengths) {
+  KeyMaker km;
+
+  // Count number of filters that significantly exceed the false positive rate
+  int mediocre_filters = 0;
+  int good_filters = 0;
+  uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+  fprintf(stderr, "bits_per_key: %d  num_probes: %d\n", FLAGS_bits_per_key,
+          num_probes);
+
+  // NB: FP rate impact of 32-bit hash is noticeable starting around 10M keys.
+  // But that effect is hidden if using sequential keys (unique hashes).
+  for (bool nonseq : {false, true}) {
+    const uint32_t max_num = FLAGS_enable_perf ? 40000000 : 400000;
+    for (uint32_t num = 1; num <= max_num; num = NextNum(num)) {
+      uint32_t bloom_bits = 0;
+      Arena arena;
+      bloom_bits = num * FLAGS_bits_per_key;
+      DynamicBloom bloom(&arena, bloom_bits, num_probes);
+      for (uint64_t i = 0; i < num; i++) {
+        bloom.Add(km.Key(i, nonseq));
+        ASSERT_TRUE(bloom.MayContain(km.Key(i, nonseq)));
+      }
+
+      // All added keys must match
+      for (uint64_t i = 0; i < num; i++) {
+        ASSERT_TRUE(bloom.MayContain(km.Key(i, nonseq)));
+      }
+
+      // Check false positive rate
+      int result = 0;
+      for (uint64_t i = 0; i < 30000; i++) {
+        if (bloom.MayContain(km.Key(i + 1000000000, nonseq))) {
+          result++;
+        }
+      }
+      double rate = result / 30000.0;
+
+      fprintf(stderr,
+              "False positives (%s keys): "
+              "%5.2f%% @ num = %6u, bloom_bits = %6u\n",
+              nonseq ? "nonseq" : "seq", rate * 100.0, num, bloom_bits);
+
+      if (rate > 0.0125)
+        mediocre_filters++;  // Allowed, but not too often
+      else
+        good_filters++;
+    }
+  }
+
+  fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters,
+          mediocre_filters);
+  ASSERT_LE(mediocre_filters, good_filters / 25);
+}
+
+TEST_F(DynamicBloomTest, perf) {
+  KeyMaker km;
+  StopWatchNano timer(SystemClock::Default().get());
+  uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+  if (!FLAGS_enable_perf) {
+    return;
+  }
+
+  for (uint32_t m = 1; m <= 8; ++m) {
+    Arena arena;
+    const uint32_t num_keys = m * 8 * 1024 * 1024;
+    fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8);
+
+    DynamicBloom std_bloom(&arena, num_keys * 10, num_probes);
+
+    timer.Start();
+    for (uint64_t i = 1; i <= num_keys; ++i) {
+      std_bloom.Add(km.Seq(i));
+    }
+
+    uint64_t elapsed = timer.ElapsedNanos();
+    fprintf(stderr, "dynamic bloom, avg add latency %3g\n",
+            static_cast<double>(elapsed) / num_keys);
+
+    uint32_t count = 0;
+    timer.Start();
+    for (uint64_t i = 1; i <= num_keys; ++i) {
+      if (std_bloom.MayContain(km.Seq(i))) {
+        ++count;
+      }
+    }
+    ASSERT_EQ(count, num_keys);
+    elapsed = timer.ElapsedNanos();
+    assert(count > 0);
+    fprintf(stderr, "dynamic bloom, avg query latency %3g\n",
+            static_cast<double>(elapsed) / count);
+  }
+}
+
+TEST_F(DynamicBloomTest, concurrent_with_perf) {
+  uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
+
+  uint32_t m_limit = FLAGS_enable_perf ? 8 : 1;
+
+  uint32_t num_threads = 4;
+  std::vector<port::Thread> threads;
+
+  // NB: Uses sequential keys for speed, but that hides the FP rate
+  // impact of 32-bit hash, which is noticeable starting around 10M keys
+  // when they vary across hashing blocks.
+  for (uint32_t m = 1; m <= m_limit; ++m) {
+    Arena arena;
+    const uint32_t num_keys = m * 8 * 1024 * 1024;
+    fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8);
+
+    DynamicBloom std_bloom(&arena, num_keys * 10, num_probes);
+
+    std::atomic<uint64_t> elapsed(0);
+
+    std::function<void(size_t)> adder([&](size_t t) {
+      KeyMaker km;
+      StopWatchNano timer(SystemClock::Default().get());
+      timer.Start();
+      for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
+        std_bloom.AddConcurrently(km.Seq(i));
+      }
+      elapsed += timer.ElapsedNanos();
+    });
+    for (size_t t = 0; t < num_threads; ++t) {
+      threads.emplace_back(adder, t);
+    }
+    while (threads.size() > 0) {
+      threads.back().join();
+      threads.pop_back();
+    }
+
+    fprintf(stderr,
+            "dynamic bloom, avg parallel add latency %3g"
+            " nanos/key\n",
+            static_cast<double>(elapsed) / num_threads / num_keys);
+
+    elapsed = 0;
+    std::function<void(size_t)> hitter([&](size_t t) {
+      KeyMaker km;
+      StopWatchNano timer(SystemClock::Default().get());
+      timer.Start();
+      for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) {
+        bool f = std_bloom.MayContain(km.Seq(i));
+        ASSERT_TRUE(f);
+      }
+      elapsed += timer.ElapsedNanos();
+    });
+    for (size_t t = 0; t < num_threads; ++t) {
+      threads.emplace_back(hitter, t);
+    }
+    while (threads.size() > 0) {
+      threads.back().join();
+      threads.pop_back();
+    }
+
+    fprintf(stderr,
+            "dynamic bloom, avg parallel hit latency %3g"
+            " nanos/key\n",
+            static_cast<double>(elapsed) / num_threads / num_keys);
+
+    elapsed = 0;
+    std::atomic<uint32_t> false_positives(0);
+    std::function<void(size_t)> misser([&](size_t t) {
+      KeyMaker km;
+      StopWatchNano timer(SystemClock::Default().get());
+      timer.Start();
+      for (uint64_t i = num_keys + 1 + t; i <= 2 * num_keys; i += num_threads) {
+        bool f = std_bloom.MayContain(km.Seq(i));
+        if (f) {
+          ++false_positives;
+        }
+      }
+      elapsed += timer.ElapsedNanos();
+    });
+    for (size_t t = 0; t < num_threads; ++t) {
+      threads.emplace_back(misser, t);
+    }
+    while (threads.size() > 0) {
+      threads.back().join();
+      threads.pop_back();
+    }
+
+    fprintf(stderr,
+            "dynamic bloom, avg parallel miss latency %3g"
+            " nanos/key, %f%% false positive rate\n",
+            static_cast<double>(elapsed) / num_threads / num_keys,
+            false_positives.load() * 100.0 / num_keys);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char **argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  return RUN_ALL_TESTS();
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/util/fastrange.h b/src/rocksdb/util/fastrange.h
new file mode 100644
index 000000000..a70a980f6
--- /dev/null
+++ b/src/rocksdb/util/fastrange.h
@@ -0,0 +1,114 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// fastrange/FastRange: A faster alternative to % for mapping a hash value
+// to an arbitrary range. See https://github.com/lemire/fastrange
+//
+// Generally recommended are FastRange32 for mapping results of 32-bit
+// hash functions and FastRange64 for mapping results of 64-bit hash
+// functions. FastRange is less forgiving than % if the input hashes are
+// not well distributed over the full range of the type (32 or 64 bits).
+//
+// Also included is a templated implementation FastRangeGeneric for use
+// in generic algorithms, but not otherwise recommended because of
+// potential ambiguity. Unlike with %, it is critical to use the right
+// FastRange variant for the output size of your hash function.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#ifdef TEST_UINT128_COMPAT
+#undef HAVE_UINT128_EXTENSION
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace detail {
+
+// Using a class template to support partial specialization
+template <typename Hash, typename Range>
+struct FastRangeGenericImpl {
+  // only reach this on no supported specialization
+};
+
+template <typename Range>
+struct FastRangeGenericImpl<uint32_t, Range> {
+  static inline Range Fn(uint32_t hash, Range range) {
+    static_assert(std::is_unsigned<Range>::value, "must be unsigned");
+    static_assert(sizeof(Range) <= sizeof(uint32_t),
+                  "cannot be larger than hash (32 bits)");
+
+    uint64_t product = uint64_t{range} * hash;
+    return static_cast<Range>(product >> 32);
+  }
+};
+
+template <typename Range>
+struct FastRangeGenericImpl<uint64_t, Range> {
+  static inline Range Fn(uint64_t hash, Range range) {
+    static_assert(std::is_unsigned<Range>::value, "must be unsigned");
+    static_assert(sizeof(Range) <= sizeof(uint64_t),
+                  "cannot be larger than hash (64 bits)");
+
+#ifdef HAVE_UINT128_EXTENSION
+    // Can use compiler's 128-bit type. Trust it to do the right thing.
+    __uint128_t wide = __uint128_t{range} * hash;
+    return static_cast<Range>(wide >> 64);
+#else
+    // Fall back: full decomposition.
+    // NOTE: GCC seems to fully understand this code as 64-bit x 64-bit
+    // -> 128-bit multiplication and optimize it appropriately
+    uint64_t range64 = range;  // ok to shift by 32, even if Range is 32-bit
+    uint64_t tmp = uint64_t{range64 & 0xffffFFFF} * uint64_t{hash & 0xffffFFFF};
+    tmp >>= 32;
+    tmp += uint64_t{range64 & 0xffffFFFF} * uint64_t{hash >> 32};
+    // Avoid overflow: first add lower 32 of tmp2, and later upper 32
+    uint64_t tmp2 = uint64_t{range64 >> 32} * uint64_t{hash & 0xffffFFFF};
+    tmp += static_cast<uint32_t>(tmp2);
+    tmp >>= 32;
+    tmp += (tmp2 >> 32);
+    tmp += uint64_t{range64 >> 32} * uint64_t{hash >> 32};
+    return static_cast<Range>(tmp);
+#endif
+  }
+};
+
+}  // namespace detail
+
+// Now an omnibus templated function (yay parameter inference).
+//
+// NOTICE:
+// This templated version is not recommended for typical use because
+// of the potential to mix a 64-bit FastRange with a 32-bit bit hash,
+// most likely because you put your 32-bit hash in an "unsigned long"
+// which is 64 bits on some platforms. That doesn't really matter for
+// an operation like %, but 64-bit FastRange gives extremely bad results,
+// mostly zero, on 32-bit hash values. And because good hashing is not
+// generally required for correctness, this kind of mistake could go
+// unnoticed with just unit tests. Plus it could vary by platform.
+template <typename Hash, typename Range>
+inline Range FastRangeGeneric(Hash hash, Range range) {
+  return detail::FastRangeGenericImpl<Hash, Range>::Fn(hash, range);
+}
+
+// The most popular / convenient / recommended variants:
+
+// Map a quality 64-bit hash value down to an arbitrary size_t range.
+// (size_t is standard for mapping to things in memory.)
+inline size_t FastRange64(uint64_t hash, size_t range) {
+  return FastRangeGeneric(hash, range);
+}
+
+// Map a quality 32-bit hash value down to an arbitrary uint32_t range.
+inline uint32_t FastRange32(uint32_t hash, uint32_t range) {
+  return FastRangeGeneric(hash, range);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/file_checksum_helper.cc b/src/rocksdb/util/file_checksum_helper.cc
new file mode 100644
index 000000000..a73920352
--- /dev/null
+++ b/src/rocksdb/util/file_checksum_helper.cc
@@ -0,0 +1,172 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+//  Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+//  Use of this source code is governed by a BSD-style license that can be
+//  found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/file_checksum_helper.h"
+
+#include <unordered_set>
+
+#include "db/log_reader.h"
+#include "db/version_edit.h"
+#include "db/version_edit_handler.h"
+#include "file/sequence_file_reader.h"
+#include "rocksdb/utilities/customizable_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FileChecksumListImpl::reset() { checksum_map_.clear(); }
+
+size_t FileChecksumListImpl::size() const { return checksum_map_.size(); }
+
+Status FileChecksumListImpl::GetAllFileChecksums(
+    std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums,
+    std::vector<std::string>* checksum_func_names) {
+  if (file_numbers == nullptr || checksums == nullptr ||
+      checksum_func_names == nullptr) {
+    return Status::InvalidArgument("Pointer has not been initiated");
+  }
+
+  for (auto i : checksum_map_) {
+    file_numbers->push_back(i.first);
+    checksums->push_back(i.second.first);
+    checksum_func_names->push_back(i.second.second);
+  }
+  return Status::OK();
+}
+
+Status FileChecksumListImpl::SearchOneFileChecksum(
+    uint64_t file_number, std::string* checksum,
+    std::string* checksum_func_name) {
+  if (checksum == nullptr || checksum_func_name == nullptr) {
+    return Status::InvalidArgument("Pointer has not been initiated");
+  }
+
+  auto it = checksum_map_.find(file_number);
+  if (it == checksum_map_.end()) {
+    return Status::NotFound();
+  } else {
+    *checksum = it->second.first;
+    *checksum_func_name = it->second.second;
+  }
+  return Status::OK();
+}
+
+Status FileChecksumListImpl::InsertOneFileChecksum(
+    uint64_t file_number, const std::string& checksum,
+    const std::string& checksum_func_name) {
+  auto it = checksum_map_.find(file_number);
+  if (it == checksum_map_.end()) {
+    checksum_map_.insert(std::make_pair(
+        file_number, std::make_pair(checksum, checksum_func_name)));
+  } else {
+    it->second.first = checksum;
+    it->second.second = checksum_func_name;
+  }
+  return Status::OK();
+}
+
+Status FileChecksumListImpl::RemoveOneFileChecksum(uint64_t file_number) {
+  auto it = checksum_map_.find(file_number);
+  if (it == checksum_map_.end()) {
+    return Status::NotFound();
+  } else {
+    checksum_map_.erase(it);
+  }
+  return Status::OK();
+}
+
+FileChecksumList* NewFileChecksumList() {
+  FileChecksumListImpl* checksum_list = new FileChecksumListImpl();
+  return checksum_list;
+}
+
+std::shared_ptr<FileChecksumGenFactory> GetFileChecksumGenCrc32cFactory() {
+  static std::shared_ptr<FileChecksumGenFactory> default_crc32c_gen_factory(
+      new FileChecksumGenCrc32cFactory());
+  return default_crc32c_gen_factory;
+}
+
+Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path,
+                                    uint64_t manifest_file_size,
+                                    FileChecksumList* checksum_list) {
+  if (checksum_list == nullptr) {
+    return Status::InvalidArgument("checksum_list is nullptr");
+  }
+  assert(checksum_list);
+  checksum_list->reset();
+  Status s;
+
+  std::unique_ptr<SequentialFileReader> file_reader;
+  {
+    std::unique_ptr<FSSequentialFile> file;
+    const std::shared_ptr<FileSystem>& fs = src_env->GetFileSystem();
+    s = fs->NewSequentialFile(abs_path,
+                              fs->OptimizeForManifestRead(FileOptions()), &file,
+                              nullptr /* dbg */);
+    if (!s.ok()) {
+      return s;
+    }
+    file_reader.reset(new SequentialFileReader(std::move(file), abs_path));
+  }
+
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status_ptr;
+    virtual void Corruption(size_t /*bytes*/, const Status& st) override {
+      if (status_ptr->ok()) {
+        *status_ptr = st;
+      }
+    }
+  } reporter;
+  reporter.status_ptr = &s;
+  log::Reader reader(nullptr, std::move(file_reader), &reporter,
+                     true /* checksum */, 0 /* log_number */);
+  FileChecksumRetriever retriever(manifest_file_size, *checksum_list);
+  retriever.Iterate(reader, &s);
+  assert(!retriever.status().ok() ||
+         manifest_file_size == std::numeric_limits<uint64_t>::max() ||
+         reader.LastRecordEnd() == manifest_file_size);
+
+  return retriever.status();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+static int RegisterFileChecksumGenFactories(ObjectLibrary& library,
+                                            const std::string& /*arg*/) {
+  library.AddFactory<FileChecksumGenFactory>(
+      FileChecksumGenCrc32cFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<FileChecksumGenFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new FileChecksumGenCrc32cFactory());
+        return guard->get();
+      });
+  return 1;
+}
+}  // namespace
+#endif  // !ROCKSDB_LITE
+
+Status FileChecksumGenFactory::CreateFromString(
+    const ConfigOptions& options, const std::string& value,
+    std::shared_ptr<FileChecksumGenFactory>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterFileChecksumGenFactories(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  if (value == FileChecksumGenCrc32cFactory::kClassName()) {
+    *result = GetFileChecksumGenCrc32cFactory();
+    return Status::OK();
+  } else {
+    Status s = LoadSharedObject<FileChecksumGenFactory>(options, value, nullptr,
+                                                        result);
+    return s;
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/file_checksum_helper.h b/src/rocksdb/util/file_checksum_helper.h
new file mode 100644
index 000000000..d622e9bba
--- /dev/null
+++ b/src/rocksdb/util/file_checksum_helper.h
@@ -0,0 +1,100 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <cassert>
+#include <unordered_map>
+
+#include "port/port.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/status.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/math.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is the class to generate the file checksum based on Crc32. It
+// will be used as the default checksum method for SST file checksum
+class FileChecksumGenCrc32c : public FileChecksumGenerator {
+ public:
+  FileChecksumGenCrc32c(const FileChecksumGenContext& /*context*/) {
+    checksum_ = 0;
+  }
+
+  void Update(const char* data, size_t n) override {
+    checksum_ = crc32c::Extend(checksum_, data, n);
+  }
+
+  void Finalize() override {
+    assert(checksum_str_.empty());
+    // Store as big endian raw bytes
+    PutFixed32(&checksum_str_, EndianSwapValue(checksum_));
+  }
+
+  std::string GetChecksum() const override {
+    assert(!checksum_str_.empty());
+    return checksum_str_;
+  }
+
+  const char* Name() const override { return "FileChecksumCrc32c"; }
+
+ private:
+  uint32_t checksum_;
+  std::string checksum_str_;
+};
+
+class FileChecksumGenCrc32cFactory : public FileChecksumGenFactory {
+ public:
+  std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+      const FileChecksumGenContext& context) override {
+    if (context.requested_checksum_func_name.empty() ||
+        context.requested_checksum_func_name == "FileChecksumCrc32c") {
+      return std::unique_ptr<FileChecksumGenerator>(
+          new FileChecksumGenCrc32c(context));
+    } else {
+      return nullptr;
+    }
+  }
+
+  static const char* kClassName() { return "FileChecksumGenCrc32cFactory"; }
+  const char* Name() const override { return kClassName(); }
+};
+
+// The default implementaion of FileChecksumList
+class FileChecksumListImpl : public FileChecksumList {
+ public:
+  FileChecksumListImpl() {}
+  void reset() override;
+
+  size_t size() const override;
+
+  Status GetAllFileChecksums(
+      std::vector<uint64_t>* file_numbers, std::vector<std::string>* checksums,
+      std::vector<std::string>* checksum_func_names) override;
+
+  Status SearchOneFileChecksum(uint64_t file_number, std::string* checksum,
+                               std::string* checksum_func_name) override;
+
+  Status InsertOneFileChecksum(uint64_t file_number,
+                               const std::string& checksum,
+                               const std::string& checksum_func_name) override;
+
+  Status RemoveOneFileChecksum(uint64_t file_number) override;
+
+ private:
+  // Key is the file number, the first portion of the value is checksum, the
+  // second portion of the value is checksum function name.
+  std::unordered_map<uint64_t, std::pair<std::string, std::string>>
+      checksum_map_;
+};
+
+// If manifest_file_size < std::numeric_limits<uint64_t>::max(), only use
+// that length prefix of the manifest file.
+Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path,
+                                    uint64_t manifest_file_size,
+                                    FileChecksumList* checksum_list);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/file_reader_writer_test.cc b/src/rocksdb/util/file_reader_writer_test.cc
new file mode 100644
index 000000000..e778efc3c
--- /dev/null
+++ b/src/rocksdb/util/file_reader_writer_test.cc
@@ -0,0 +1,1066 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include <algorithm>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "file/line_file_reader.h"
+#include "file/random_access_file_reader.h"
+#include "file/read_write_util.h"
+#include "file/readahead_raf.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/file_system.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriterTest : public testing::Test {};
+
+constexpr uint32_t kMb = static_cast<uint32_t>(1) << 20;
+
+TEST_F(WritableFileWriterTest, RangeSync) {
+  class FakeWF : public FSWritableFile {
+   public:
+    explicit FakeWF() : size_(0), last_synced_(0) {}
+    ~FakeWF() override {}
+
+    using FSWritableFile::Append;
+    IOStatus Append(const Slice& data, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+      size_ += data.size();
+      return IOStatus::OK();
+    }
+    IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Close(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      EXPECT_GE(size_, last_synced_ + kMb);
+      EXPECT_LT(size_, last_synced_ + 2 * kMb);
+      // Make sure random writes generated enough writes.
+      EXPECT_GT(size_, 10 * kMb);
+      return IOStatus::OK();
+    }
+    IOStatus Flush(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Sync(const IOOptions& /*options*/,
+                  IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Fsync(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    void SetIOPriority(Env::IOPriority /*pri*/) override {}
+    uint64_t GetFileSize(const IOOptions& /*options*/,
+                         IODebugContext* /*dbg*/) override {
+      return size_;
+    }
+    void GetPreallocationStatus(size_t* /*block_size*/,
+                                size_t* /*last_allocated_block*/) override {}
+    size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override {
+      return 0;
+    }
+    IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) override {
+      return IOStatus::OK();
+    }
+
+   protected:
+    IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/,
+                      const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus RangeSync(uint64_t offset, uint64_t nbytes,
+                       const IOOptions& /*options*/,
+                       IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(offset % 4096, 0u);
+      EXPECT_EQ(nbytes % 4096, 0u);
+
+      EXPECT_EQ(offset, last_synced_);
+      last_synced_ = offset + nbytes;
+      EXPECT_GE(size_, last_synced_ + kMb);
+      if (size_ > 2 * kMb) {
+        EXPECT_LT(size_, last_synced_ + 2 * kMb);
+      }
+      return IOStatus::OK();
+    }
+
+    uint64_t size_;
+    uint64_t last_synced_;
+  };
+
+  EnvOptions env_options;
+  env_options.bytes_per_sync = kMb;
+  std::unique_ptr<FakeWF> wf(new FakeWF);
+  std::unique_ptr<WritableFileWriter> writer(
+      new WritableFileWriter(std::move(wf), "" /* don't care */, env_options));
+  Random r(301);
+  Status s;
+  std::unique_ptr<char[]> large_buf(new char[10 * kMb]);
+  for (int i = 0; i < 1000; i++) {
+    int skew_limit = (i < 700) ? 10 : 15;
+    uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100);
+    s = writer->Append(Slice(large_buf.get(), num));
+    ASSERT_OK(s);
+
+    // Flush in a chance of 1/10.
+    if (r.Uniform(10) == 0) {
+      s = writer->Flush();
+      ASSERT_OK(s);
+    }
+  }
+  s = writer->Close();
+  ASSERT_OK(s);
+}
+
+TEST_F(WritableFileWriterTest, IncrementalBuffer) {
+  class FakeWF : public FSWritableFile {
+   public:
+    explicit FakeWF(std::string* _file_data, bool _use_direct_io,
+                    bool _no_flush)
+        : file_data_(_file_data),
+          use_direct_io_(_use_direct_io),
+          no_flush_(_no_flush) {}
+    ~FakeWF() override {}
+
+    using FSWritableFile::Append;
+    IOStatus Append(const Slice& data, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+      file_data_->append(data.data(), data.size());
+      size_ += data.size();
+      return IOStatus::OK();
+    }
+    using FSWritableFile::PositionedAppend;
+    IOStatus PositionedAppend(const Slice& data, uint64_t pos,
+                              const IOOptions& /*options*/,
+                              IODebugContext* /*dbg*/) override {
+      EXPECT_TRUE(pos % 512 == 0);
+      EXPECT_TRUE(data.size() % 512 == 0);
+      file_data_->resize(pos);
+      file_data_->append(data.data(), data.size());
+      size_ += data.size();
+      return IOStatus::OK();
+    }
+
+    IOStatus Truncate(uint64_t size, const IOOptions& /*options*/,
+                      IODebugContext* /*dbg*/) override {
+      file_data_->resize(size);
+      return IOStatus::OK();
+    }
+    IOStatus Close(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Flush(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Sync(const IOOptions& /*options*/,
+                  IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Fsync(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    void SetIOPriority(Env::IOPriority /*pri*/) override {}
+    uint64_t GetFileSize(const IOOptions& /*options*/,
+                         IODebugContext* /*dbg*/) override {
+      return size_;
+    }
+    void GetPreallocationStatus(size_t* /*block_size*/,
+                                size_t* /*last_allocated_block*/) override {}
+    size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override {
+      return 0;
+    }
+    IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) override {
+      return IOStatus::OK();
+    }
+    bool use_direct_io() const override { return use_direct_io_; }
+
+    std::string* file_data_;
+    bool use_direct_io_;
+    bool no_flush_;
+    size_t size_ = 0;
+  };
+
+  Random r(301);
+  const int kNumAttempts = 50;
+  for (int attempt = 0; attempt < kNumAttempts; attempt++) {
+    bool no_flush = (attempt % 3 == 0);
+    EnvOptions env_options;
+    env_options.writable_file_max_buffer_size =
+        (attempt < kNumAttempts / 2) ? 512 * 1024 : 700 * 1024;
+    std::string actual;
+    std::unique_ptr<FakeWF> wf(new FakeWF(&actual,
+#ifndef ROCKSDB_LITE
+                                          attempt % 2 == 1,
+#else
+                                          false,
+#endif
+                                          no_flush));
+    std::unique_ptr<WritableFileWriter> writer(new WritableFileWriter(
+        std::move(wf), "" /* don't care */, env_options));
+
+    std::string target;
+    for (int i = 0; i < 20; i++) {
+      uint32_t num = r.Skewed(16) * 100 + r.Uniform(100);
+      std::string random_string = r.RandomString(num);
+      ASSERT_OK(writer->Append(Slice(random_string.c_str(), num)));
+      target.append(random_string.c_str(), num);
+
+      // In some attempts, flush in a chance of 1/10.
+      if (!no_flush && r.Uniform(10) == 0) {
+        ASSERT_OK(writer->Flush());
+      }
+    }
+    ASSERT_OK(writer->Flush());
+    ASSERT_OK(writer->Close());
+    ASSERT_EQ(target.size(), actual.size());
+    ASSERT_EQ(target, actual);
+  }
+}
+
+TEST_F(WritableFileWriterTest, BufferWithZeroCapacityDirectIO) {
+  EnvOptions env_opts;
+  env_opts.use_direct_writes = true;
+  env_opts.writable_file_max_buffer_size = 0;
+  {
+    std::unique_ptr<WritableFileWriter> writer;
+    const Status s =
+        WritableFileWriter::Create(FileSystem::Default(), /*fname=*/"dont_care",
+                                   FileOptions(env_opts), &writer,
+                                   /*dbg=*/nullptr);
+    ASSERT_TRUE(s.IsInvalidArgument());
+  }
+}
+
+class DBWritableFileWriterTest : public DBTestBase {
+ public:
+  DBWritableFileWriterTest()
+      : DBTestBase("db_secondary_cache_test", /*env_do_fsync=*/true) {
+    fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem()));
+    fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_));
+  }
+
+  std::shared_ptr<FaultInjectionTestFS> fault_fs_;
+  std::unique_ptr<Env> fault_env_;
+};
+
+TEST_F(DBWritableFileWriterTest, AppendWithChecksum) {
+  FileOptions file_options = FileOptions();
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  std::string fname = dbname_ + "/test_file";
+  std::unique_ptr<FSWritableFile> writable_file_ptr;
+  ASSERT_OK(fault_fs_->NewWritableFile(fname, file_options, &writable_file_ptr,
+                                       /*dbg*/ nullptr));
+  std::unique_ptr<TestFSWritableFile> file;
+  file.reset(new TestFSWritableFile(
+      fname, file_options, std::move(writable_file_ptr), fault_fs_.get()));
+  std::unique_ptr<WritableFileWriter> file_writer;
+  ImmutableOptions ioptions(options);
+  file_writer.reset(new WritableFileWriter(
+      std::move(file), fname, file_options, SystemClock::Default().get(),
+      nullptr, ioptions.stats, ioptions.listeners,
+      ioptions.file_checksum_gen_factory.get(), true, true));
+
+  Random rnd(301);
+  std::string data = rnd.RandomString(1000);
+  uint32_t data_crc32c = crc32c::Value(data.c_str(), data.size());
+  fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+
+  ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c));
+  ASSERT_OK(file_writer->Flush());
+  Random size_r(47);
+  for (int i = 0; i < 2000; i++) {
+    data = rnd.RandomString((static_cast<int>(size_r.Next()) % 10000));
+    data_crc32c = crc32c::Value(data.c_str(), data.size());
+    ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c));
+
+    data = rnd.RandomString((static_cast<int>(size_r.Next()) % 97));
+    ASSERT_OK(file_writer->Append(Slice(data.c_str())));
+    ASSERT_OK(file_writer->Flush());
+  }
+  ASSERT_OK(file_writer->Close());
+  Destroy(options);
+}
+
+TEST_F(DBWritableFileWriterTest, AppendVerifyNoChecksum) {
+  FileOptions file_options = FileOptions();
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  std::string fname = dbname_ + "/test_file";
+  std::unique_ptr<FSWritableFile> writable_file_ptr;
+  ASSERT_OK(fault_fs_->NewWritableFile(fname, file_options, &writable_file_ptr,
+                                       /*dbg*/ nullptr));
+  std::unique_ptr<TestFSWritableFile> file;
+  file.reset(new TestFSWritableFile(
+      fname, file_options, std::move(writable_file_ptr), fault_fs_.get()));
+  std::unique_ptr<WritableFileWriter> file_writer;
+  ImmutableOptions ioptions(options);
+  // Enable checksum handoff for this file, but do not enable buffer checksum.
+  // So Append with checksum logic will not be triggered
+  file_writer.reset(new WritableFileWriter(
+      std::move(file), fname, file_options, SystemClock::Default().get(),
+      nullptr, ioptions.stats, ioptions.listeners,
+      ioptions.file_checksum_gen_factory.get(), true, false));
+
+  Random rnd(301);
+  std::string data = rnd.RandomString(1000);
+  uint32_t data_crc32c = crc32c::Value(data.c_str(), data.size());
+  fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+
+  ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c));
+  ASSERT_OK(file_writer->Flush());
+  Random size_r(47);
+  for (int i = 0; i < 1000; i++) {
+    data = rnd.RandomString((static_cast<int>(size_r.Next()) % 10000));
+    data_crc32c = crc32c::Value(data.c_str(), data.size());
+    ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c));
+
+    data = rnd.RandomString((static_cast<int>(size_r.Next()) % 97));
+    ASSERT_OK(file_writer->Append(Slice(data.c_str())));
+    ASSERT_OK(file_writer->Flush());
+  }
+  ASSERT_OK(file_writer->Close());
+  Destroy(options);
+}
+
+TEST_F(DBWritableFileWriterTest, AppendWithChecksumRateLimiter) {
+  FileOptions file_options = FileOptions();
+  file_options.rate_limiter = nullptr;
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  std::string fname = dbname_ + "/test_file";
+  std::unique_ptr<FSWritableFile> writable_file_ptr;
+  ASSERT_OK(fault_fs_->NewWritableFile(fname, file_options, &writable_file_ptr,
+                                       /*dbg*/ nullptr));
+  std::unique_ptr<TestFSWritableFile> file;
+  file.reset(new TestFSWritableFile(
+      fname, file_options, std::move(writable_file_ptr), fault_fs_.get()));
+  std::unique_ptr<WritableFileWriter> file_writer;
+  ImmutableOptions ioptions(options);
+  // Enable checksum handoff for this file, but do not enable buffer checksum.
+  // So Append with checksum logic will not be triggered
+  file_writer.reset(new WritableFileWriter(
+      std::move(file), fname, file_options, SystemClock::Default().get(),
+      nullptr, ioptions.stats, ioptions.listeners,
+      ioptions.file_checksum_gen_factory.get(), true, true));
+  fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+
+  Random rnd(301);
+  std::string data;
+  uint32_t data_crc32c;
+  uint64_t start = fault_env_->NowMicros();
+  Random size_r(47);
+  uint64_t bytes_written = 0;
+  for (int i = 0; i < 100; i++) {
+    data = rnd.RandomString((static_cast<int>(size_r.Next()) % 10000));
+    data_crc32c = crc32c::Value(data.c_str(), data.size());
+    ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c));
+    bytes_written += static_cast<uint64_t>(data.size());
+
+    data = rnd.RandomString((static_cast<int>(size_r.Next()) % 97));
+    ASSERT_OK(file_writer->Append(Slice(data.c_str())));
+    ASSERT_OK(file_writer->Flush());
+    bytes_written += static_cast<uint64_t>(data.size());
+  }
+  uint64_t elapsed = fault_env_->NowMicros() - start;
+  double raw_rate = bytes_written * 1000000.0 / elapsed;
+  ASSERT_OK(file_writer->Close());
+
+  // Set the rate-limiter
+  FileOptions file_options1 = FileOptions();
+  file_options1.rate_limiter =
+      NewGenericRateLimiter(static_cast<int64_t>(0.5 * raw_rate));
+  fname = dbname_ + "/test_file_1";
+  std::unique_ptr<FSWritableFile> writable_file_ptr1;
+  ASSERT_OK(fault_fs_->NewWritableFile(fname, file_options1,
+                                       &writable_file_ptr1,
+                                       /*dbg*/ nullptr));
+  file.reset(new TestFSWritableFile(
+      fname, file_options1, std::move(writable_file_ptr1), fault_fs_.get()));
+  // Enable checksum handoff for this file, but do not enable buffer checksum.
+  // So Append with checksum logic will not be triggered
+  file_writer.reset(new WritableFileWriter(
+      std::move(file), fname, file_options1, SystemClock::Default().get(),
+      nullptr, ioptions.stats, ioptions.listeners,
+      ioptions.file_checksum_gen_factory.get(), true, true));
+
+  for (int i = 0; i < 1000; i++) {
+    data = rnd.RandomString((static_cast<int>(size_r.Next()) % 10000));
+    data_crc32c = crc32c::Value(data.c_str(), data.size());
+    ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c));
+
+    data = rnd.RandomString((static_cast<int>(size_r.Next()) % 97));
+    ASSERT_OK(file_writer->Append(Slice(data.c_str())));
+    ASSERT_OK(file_writer->Flush());
+  }
+  ASSERT_OK(file_writer->Close());
+  if (file_options1.rate_limiter != nullptr) {
+    delete file_options1.rate_limiter;
+  }
+
+  Destroy(options);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(WritableFileWriterTest, AppendStatusReturn) {
+  class FakeWF : public FSWritableFile {
+   public:
+    explicit FakeWF() : use_direct_io_(false), io_error_(false) {}
+
+    bool use_direct_io() const override { return use_direct_io_; }
+
+    using FSWritableFile::Append;
+    IOStatus Append(const Slice& /*data*/, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+      if (io_error_) {
+        return IOStatus::IOError("Fake IO error");
+      }
+      return IOStatus::OK();
+    }
+    using FSWritableFile::PositionedAppend;
+    IOStatus PositionedAppend(const Slice& /*data*/, uint64_t,
+                              const IOOptions& /*options*/,
+                              IODebugContext* /*dbg*/) override {
+      if (io_error_) {
+        return IOStatus::IOError("Fake IO error");
+      }
+      return IOStatus::OK();
+    }
+    IOStatus Close(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Flush(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Sync(const IOOptions& /*options*/,
+                  IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    void Setuse_direct_io(bool val) { use_direct_io_ = val; }
+    void SetIOError(bool val) { io_error_ = val; }
+
+   protected:
+    bool use_direct_io_;
+    bool io_error_;
+  };
+  std::unique_ptr<FakeWF> wf(new FakeWF());
+  wf->Setuse_direct_io(true);
+  std::unique_ptr<WritableFileWriter> writer(
+      new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions()));
+
+  ASSERT_OK(writer->Append(std::string(2 * kMb, 'a')));
+
+  // Next call to WritableFile::Append() should fail
+  FakeWF* fwf = static_cast<FakeWF*>(writer->writable_file());
+  fwf->SetIOError(true);
+  ASSERT_NOK(writer->Append(std::string(2 * kMb, 'b')));
+}
+#endif
+
+class ReadaheadRandomAccessFileTest
+    : public testing::Test,
+      public testing::WithParamInterface<size_t> {
+ public:
+  static std::vector<size_t> GetReadaheadSizeList() {
+    return {1lu << 12, 1lu << 16};
+  }
+  void SetUp() override {
+    readahead_size_ = GetParam();
+    scratch_.reset(new char[2 * readahead_size_]);
+    ResetSourceStr();
+  }
+  ReadaheadRandomAccessFileTest() : control_contents_() {}
+  std::string Read(uint64_t offset, size_t n) {
+    Slice result;
+    Status s = test_read_holder_->Read(offset, n, IOOptions(), &result,
+                                       scratch_.get(), nullptr);
+    EXPECT_TRUE(s.ok() || s.IsInvalidArgument());
+    return std::string(result.data(), result.size());
+  }
+  void ResetSourceStr(const std::string& str = "") {
+    std::unique_ptr<FSWritableFile> sink(
+        new test::StringSink(&control_contents_));
+    std::unique_ptr<WritableFileWriter> write_holder(new WritableFileWriter(
+        std::move(sink), "" /* don't care */, FileOptions()));
+    Status s = write_holder->Append(Slice(str));
+    EXPECT_OK(s);
+    s = write_holder->Flush();
+    EXPECT_OK(s);
+    std::unique_ptr<FSRandomAccessFile> read_holder(
+        new test::StringSource(control_contents_));
+    test_read_holder_ =
+        NewReadaheadRandomAccessFile(std::move(read_holder), readahead_size_);
+  }
+  size_t GetReadaheadSize() const { return readahead_size_; }
+
+ private:
+  size_t readahead_size_;
+  Slice control_contents_;
+  std::unique_ptr<FSRandomAccessFile> test_read_holder_;
+  std::unique_ptr<char[]> scratch_;
+};
+
+TEST_P(ReadaheadRandomAccessFileTest, EmptySourceStr) {
+  ASSERT_EQ("", Read(0, 1));
+  ASSERT_EQ("", Read(0, 0));
+  ASSERT_EQ("", Read(13, 13));
+}
+
+TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenLessThanReadaheadSize) {
+  std::string str = "abcdefghijklmnopqrs";
+  ResetSourceStr(str);
+  ASSERT_EQ(str.substr(3, 4), Read(3, 4));
+  ASSERT_EQ(str.substr(0, 3), Read(0, 3));
+  ASSERT_EQ(str, Read(0, str.size()));
+  ASSERT_EQ(str.substr(7, std::min(static_cast<int>(str.size()) - 7, 30)),
+            Read(7, 30));
+  ASSERT_EQ("", Read(100, 100));
+}
+
+TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenGreaterThanReadaheadSize) {
+  Random rng(42);
+  for (int k = 0; k < 100; ++k) {
+    size_t strLen = k * GetReadaheadSize() +
+                    rng.Uniform(static_cast<int>(GetReadaheadSize()));
+    std::string str = rng.HumanReadableString(static_cast<int>(strLen));
+    ResetSourceStr(str);
+    for (int test = 1; test <= 100; ++test) {
+      size_t offset = rng.Uniform(static_cast<int>(strLen));
+      size_t n = rng.Uniform(static_cast<int>(GetReadaheadSize()));
+      ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)),
+                Read(offset, n));
+    }
+  }
+}
+
+TEST_P(ReadaheadRandomAccessFileTest, ReadExceedsReadaheadSize) {
+  Random rng(7);
+  size_t strLen = 4 * GetReadaheadSize() +
+                  rng.Uniform(static_cast<int>(GetReadaheadSize()));
+  std::string str = rng.HumanReadableString(static_cast<int>(strLen));
+  ResetSourceStr(str);
+  for (int test = 1; test <= 100; ++test) {
+    size_t offset = rng.Uniform(static_cast<int>(strLen));
+    size_t n =
+        GetReadaheadSize() + rng.Uniform(static_cast<int>(GetReadaheadSize()));
+    ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)),
+              Read(offset, n));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    EmptySourceStr, ReadaheadRandomAccessFileTest,
+    ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+    SourceStrLenLessThanReadaheadSize, ReadaheadRandomAccessFileTest,
+    ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+    SourceStrLenGreaterThanReadaheadSize, ReadaheadRandomAccessFileTest,
+    ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+    ReadExceedsReadaheadSize, ReadaheadRandomAccessFileTest,
+    ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList()));
+
+class ReadaheadSequentialFileTest : public testing::Test,
+                                    public testing::WithParamInterface<size_t> {
+ public:
+  static std::vector<size_t> GetReadaheadSizeList() {
+    return {1lu << 8, 1lu << 12, 1lu << 16, 1lu << 18};
+  }
+  void SetUp() override {
+    readahead_size_ = GetParam();
+    scratch_.reset(new char[2 * readahead_size_]);
+    ResetSourceStr();
+  }
+  ReadaheadSequentialFileTest() {}
+  std::string Read(size_t n) {
+    Slice result;
+    Status s = test_read_holder_->Read(
+        n, &result, scratch_.get(), Env::IO_TOTAL /* rate_limiter_priority*/);
+    EXPECT_TRUE(s.ok() || s.IsInvalidArgument());
+    return std::string(result.data(), result.size());
+  }
+  void Skip(size_t n) { test_read_holder_->Skip(n); }
+  void ResetSourceStr(const std::string& str = "") {
+    auto read_holder = std::unique_ptr<FSSequentialFile>(
+        new test::SeqStringSource(str, &seq_read_count_));
+    test_read_holder_.reset(new SequentialFileReader(std::move(read_holder),
+                                                     "test", readahead_size_));
+  }
+  size_t GetReadaheadSize() const { return readahead_size_; }
+
+ private:
+  size_t readahead_size_;
+  std::unique_ptr<SequentialFileReader> test_read_holder_;
+  std::unique_ptr<char[]> scratch_;
+  std::atomic<int> seq_read_count_;
+};
+
+TEST_P(ReadaheadSequentialFileTest, EmptySourceStr) {
+  ASSERT_EQ("", Read(0));
+  ASSERT_EQ("", Read(1));
+  ASSERT_EQ("", Read(13));
+}
+
+TEST_P(ReadaheadSequentialFileTest, SourceStrLenLessThanReadaheadSize) {
+  std::string str = "abcdefghijklmnopqrs";
+  ResetSourceStr(str);
+  ASSERT_EQ(str.substr(0, 3), Read(3));
+  ASSERT_EQ(str.substr(3, 1), Read(1));
+  ASSERT_EQ(str.substr(4), Read(str.size()));
+  ASSERT_EQ("", Read(100));
+}
+
+TEST_P(ReadaheadSequentialFileTest, SourceStrLenGreaterThanReadaheadSize) {
+  Random rng(42);
+  for (int s = 0; s < 1; ++s) {
+    for (int k = 0; k < 100; ++k) {
+      size_t strLen = k * GetReadaheadSize() +
+                      rng.Uniform(static_cast<int>(GetReadaheadSize()));
+      std::string str = rng.HumanReadableString(static_cast<int>(strLen));
+      ResetSourceStr(str);
+      size_t offset = 0;
+      for (int test = 1; test <= 100; ++test) {
+        size_t n = rng.Uniform(static_cast<int>(GetReadaheadSize()));
+        if (s && test % 2) {
+          Skip(n);
+        } else {
+          ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(n));
+        }
+        offset = std::min(offset + n, strLen);
+      }
+    }
+  }
+}
+
+TEST_P(ReadaheadSequentialFileTest, ReadExceedsReadaheadSize) {
+  Random rng(42);
+  for (int s = 0; s < 1; ++s) {
+    for (int k = 0; k < 100; ++k) {
+      size_t strLen = k * GetReadaheadSize() +
+                      rng.Uniform(static_cast<int>(GetReadaheadSize()));
+      std::string str = rng.HumanReadableString(static_cast<int>(strLen));
+      ResetSourceStr(str);
+      size_t offset = 0;
+      for (int test = 1; test <= 100; ++test) {
+        size_t n = GetReadaheadSize() +
+                   rng.Uniform(static_cast<int>(GetReadaheadSize()));
+        if (s && test % 2) {
+          Skip(n);
+        } else {
+          ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(n));
+        }
+        offset = std::min(offset + n, strLen);
+      }
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    EmptySourceStr, ReadaheadSequentialFileTest,
+    ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+    SourceStrLenLessThanReadaheadSize, ReadaheadSequentialFileTest,
+    ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+    SourceStrLenGreaterThanReadaheadSize, ReadaheadSequentialFileTest,
+    ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList()));
+INSTANTIATE_TEST_CASE_P(
+    ReadExceedsReadaheadSize, ReadaheadSequentialFileTest,
+    ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList()));
+
+namespace {
+std::string GenerateLine(int n) {
+  std::string rv;
+  // Multiples of 17 characters per line, for likely bad buffer alignment
+  for (int i = 0; i < n; ++i) {
+    rv.push_back(static_cast<char>('0' + (i % 10)));
+    rv.append("xxxxxxxxxxxxxxxx");
+  }
+  return rv;
+}
+}  // namespace
+
+TEST(LineFileReaderTest, LineFileReaderTest) {
+  const int nlines = 1000;
+
+  std::unique_ptr<Env> mem_env(MockEnv::Create(Env::Default()));
+  std::shared_ptr<FileSystem> fs = mem_env->GetFileSystem();
+  // Create an input file
+  {
+    std::unique_ptr<FSWritableFile> file;
+    ASSERT_OK(
+        fs->NewWritableFile("testfile", FileOptions(), &file, /*dbg*/ nullptr));
+
+    for (int i = 0; i < nlines; ++i) {
+      std::string line = GenerateLine(i);
+      line.push_back('\n');
+      ASSERT_OK(file->Append(line, IOOptions(), /*dbg*/ nullptr));
+    }
+  }
+
+  // Verify with no I/O errors
+  {
+    std::unique_ptr<LineFileReader> reader;
+    ASSERT_OK(LineFileReader::Create(fs, "testfile", FileOptions(), &reader,
+                                     nullptr /* dbg */,
+                                     nullptr /* rate_limiter */));
+    std::string line;
+    int count = 0;
+    while (reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)) {
+      ASSERT_EQ(line, GenerateLine(count));
+      ++count;
+      ASSERT_EQ(static_cast<int>(reader->GetLineNumber()), count);
+    }
+    ASSERT_OK(reader->GetStatus());
+    ASSERT_EQ(count, nlines);
+    ASSERT_EQ(static_cast<int>(reader->GetLineNumber()), count);
+    // And still
+    ASSERT_FALSE(
+        reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */));
+    ASSERT_OK(reader->GetStatus());
+    ASSERT_EQ(static_cast<int>(reader->GetLineNumber()), count);
+  }
+
+  // Verify with injected I/O error
+  {
+    std::unique_ptr<LineFileReader> reader;
+    ASSERT_OK(LineFileReader::Create(fs, "testfile", FileOptions(), &reader,
+                                     nullptr /* dbg */,
+                                     nullptr /* rate_limiter */));
+    std::string line;
+    int count = 0;
+    // Read part way through the file
+    while (count < nlines / 4) {
+      ASSERT_TRUE(
+          reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */));
+      ASSERT_EQ(line, GenerateLine(count));
+      ++count;
+      ASSERT_EQ(static_cast<int>(reader->GetLineNumber()), count);
+    }
+    ASSERT_OK(reader->GetStatus());
+
+    // Inject error
+    int callback_count = 0;
+    SyncPoint::GetInstance()->SetCallBack(
+        "MemFile::Read:IOStatus", [&](void* arg) {
+          IOStatus* status = static_cast<IOStatus*>(arg);
+          *status = IOStatus::Corruption("test");
+          ++callback_count;
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    while (reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)) {
+      ASSERT_EQ(line, GenerateLine(count));
+      ++count;
+      ASSERT_EQ(static_cast<int>(reader->GetLineNumber()), count);
+    }
+    ASSERT_TRUE(reader->GetStatus().IsCorruption());
+    ASSERT_LT(count, nlines / 2);
+    ASSERT_EQ(callback_count, 1);
+
+    // Still get error & no retry
+    ASSERT_FALSE(
+        reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */));
+    ASSERT_TRUE(reader->GetStatus().IsCorruption());
+    ASSERT_EQ(callback_count, 1);
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+}
+
+#ifndef ROCKSDB_LITE
+class IOErrorEventListener : public EventListener {
+ public:
+  IOErrorEventListener() { notify_error_.store(0); }
+
+  void OnIOError(const IOErrorInfo& io_error_info) override {
+    notify_error_++;
+    EXPECT_FALSE(io_error_info.file_path.empty());
+    EXPECT_FALSE(io_error_info.io_status.ok());
+  }
+
+  size_t NotifyErrorCount() { return notify_error_; }
+
+  bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+ private:
+  std::atomic<size_t> notify_error_;
+};
+
+TEST_F(DBWritableFileWriterTest, IOErrorNotification) {
+  class FakeWF : public FSWritableFile {
+   public:
+    explicit FakeWF() : io_error_(false) {
+      file_append_errors_.store(0);
+      file_flush_errors_.store(0);
+    }
+
+    using FSWritableFile::Append;
+    IOStatus Append(const Slice& /*data*/, const IOOptions& /*options*/,
+                    IODebugContext* /*dbg*/) override {
+      if (io_error_) {
+        file_append_errors_++;
+        return IOStatus::IOError("Fake IO error");
+      }
+      return IOStatus::OK();
+    }
+
+    using FSWritableFile::PositionedAppend;
+    IOStatus PositionedAppend(const Slice& /*data*/, uint64_t,
+                              const IOOptions& /*options*/,
+                              IODebugContext* /*dbg*/) override {
+      if (io_error_) {
+        return IOStatus::IOError("Fake IO error");
+      }
+      return IOStatus::OK();
+    }
+    IOStatus Close(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+    IOStatus Flush(const IOOptions& /*options*/,
+                   IODebugContext* /*dbg*/) override {
+      if (io_error_) {
+        file_flush_errors_++;
+        return IOStatus::IOError("Fake IO error");
+      }
+      return IOStatus::OK();
+    }
+    IOStatus Sync(const IOOptions& /*options*/,
+                  IODebugContext* /*dbg*/) override {
+      return IOStatus::OK();
+    }
+
+    void SetIOError(bool val) { io_error_ = val; }
+
+    void CheckCounters(int file_append_errors, int file_flush_errors) {
+      ASSERT_EQ(file_append_errors, file_append_errors_);
+      ASSERT_EQ(file_flush_errors_, file_flush_errors);
+    }
+
+   protected:
+    bool io_error_;
+    std::atomic<size_t> file_append_errors_;
+    std::atomic<size_t> file_flush_errors_;
+  };
+
+  FileOptions file_options = FileOptions();
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  IOErrorEventListener* listener = new IOErrorEventListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+  ImmutableOptions ioptions(options);
+
+  std::string fname = dbname_ + "/test_file";
+  std::unique_ptr<FakeWF> writable_file_ptr(new FakeWF);
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  writable_file_ptr->SetIOError(true);
+
+  file_writer.reset(new WritableFileWriter(
+      std::move(writable_file_ptr), fname, file_options,
+      SystemClock::Default().get(), nullptr, ioptions.stats, ioptions.listeners,
+      ioptions.file_checksum_gen_factory.get(), true, true));
+
+  FakeWF* fwf = static_cast<FakeWF*>(file_writer->writable_file());
+
+  fwf->SetIOError(true);
+  ASSERT_NOK(file_writer->Append(std::string(2 * kMb, 'a')));
+  fwf->CheckCounters(1, 0);
+  ASSERT_EQ(listener->NotifyErrorCount(), 1);
+
+  file_writer->reset_seen_error();
+  fwf->SetIOError(true);
+  ASSERT_NOK(file_writer->Flush());
+  fwf->CheckCounters(1, 1);
+  ASSERT_EQ(listener->NotifyErrorCount(), 2);
+
+  /* No error generation */
+  file_writer->reset_seen_error();
+  fwf->SetIOError(false);
+  ASSERT_OK(file_writer->Append(std::string(2 * kMb, 'b')));
+  ASSERT_EQ(listener->NotifyErrorCount(), 2);
+  fwf->CheckCounters(1, 1);
+}
+#endif  // ROCKSDB_LITE
+
+class WritableFileWriterIOPriorityTest : public testing::Test {
+ protected:
+  // This test is to check whether the rate limiter priority can be passed
+  // correctly from WritableFileWriter functions to FSWritableFile functions.
+
+  void SetUp() override {
+    // When op_rate_limiter_priority parameter in WritableFileWriter functions
+    // is the default (Env::IO_TOTAL).
+    std::unique_ptr<FakeWF> wf{new FakeWF(Env::IO_HIGH)};
+    FileOptions file_options;
+    writer_.reset(new WritableFileWriter(std::move(wf), "" /* don't care */,
+                                         file_options));
+  }
+
+  class FakeWF : public FSWritableFile {
+   public:
+    explicit FakeWF(Env::IOPriority io_priority) { SetIOPriority(io_priority); }
+    ~FakeWF() override {}
+
+    IOStatus Append(const Slice& /*data*/, const IOOptions& options,
+                    IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+    IOStatus Append(const Slice& data, const IOOptions& options,
+                    const DataVerificationInfo& /* verification_info */,
+                    IODebugContext* dbg) override {
+      return Append(data, options, dbg);
+    }
+    IOStatus PositionedAppend(const Slice& /*data*/, uint64_t /*offset*/,
+                              const IOOptions& options,
+                              IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+    IOStatus PositionedAppend(
+        const Slice& /* data */, uint64_t /* offset */,
+        const IOOptions& options,
+        const DataVerificationInfo& /* verification_info */,
+        IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+    IOStatus Truncate(uint64_t /*size*/, const IOOptions& options,
+                      IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+    IOStatus Close(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+    IOStatus Flush(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+    IOStatus Sync(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+    IOStatus Fsync(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+    uint64_t GetFileSize(const IOOptions& options,
+                         IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return 0;
+    }
+    void GetPreallocationStatus(size_t* /*block_size*/,
+                                size_t* /*last_allocated_block*/) override {}
+    size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override {
+      return 0;
+    }
+    IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) override {
+      return IOStatus::OK();
+    }
+
+    IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/,
+                      const IOOptions& options,
+                      IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+    IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/,
+                       const IOOptions& options,
+                       IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+      return IOStatus::OK();
+    }
+
+    void PrepareWrite(size_t /*offset*/, size_t /*len*/,
+                      const IOOptions& options,
+                      IODebugContext* /*dbg*/) override {
+      EXPECT_EQ(options.rate_limiter_priority, io_priority_);
+    }
+
+    bool IsSyncThreadSafe() const override { return true; }
+  };
+
+  std::unique_ptr<WritableFileWriter> writer_;
+};
+
+TEST_F(WritableFileWriterIOPriorityTest, Append) {
+  ASSERT_OK(writer_->Append(Slice("abc")));
+}
+
+TEST_F(WritableFileWriterIOPriorityTest, Pad) { ASSERT_OK(writer_->Pad(500)); }
+
+TEST_F(WritableFileWriterIOPriorityTest, Flush) { ASSERT_OK(writer_->Flush()); }
+
+TEST_F(WritableFileWriterIOPriorityTest, Close) { ASSERT_OK(writer_->Close()); }
+
+TEST_F(WritableFileWriterIOPriorityTest, Sync) {
+  ASSERT_OK(writer_->Sync(false));
+  ASSERT_OK(writer_->Sync(true));
+}
+
+TEST_F(WritableFileWriterIOPriorityTest, SyncWithoutFlush) {
+  ASSERT_OK(writer_->SyncWithoutFlush(false));
+  ASSERT_OK(writer_->SyncWithoutFlush(true));
+}
+
+TEST_F(WritableFileWriterIOPriorityTest, BasicOp) {
+  EnvOptions env_options;
+  env_options.bytes_per_sync = kMb;
+  std::unique_ptr<FakeWF> wf(new FakeWF(Env::IO_HIGH));
+  std::unique_ptr<WritableFileWriter> writer(
+      new WritableFileWriter(std::move(wf), "" /* don't care */, env_options));
+  Random r(301);
+  Status s;
+  std::unique_ptr<char[]> large_buf(new char[10 * kMb]);
+  for (int i = 0; i < 1000; i++) {
+    int skew_limit = (i < 700) ? 10 : 15;
+    uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100);
+    s = writer->Append(Slice(large_buf.get(), num));
+    ASSERT_OK(s);
+
+    // Flush in a chance of 1/10.
+    if (r.Uniform(10) == 0) {
+      s = writer->Flush();
+      ASSERT_OK(s);
+    }
+  }
+  s = writer->Close();
+  ASSERT_OK(s);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/filelock_test.cc b/src/rocksdb/util/filelock_test.cc
new file mode 100644
index 000000000..69947a732
--- /dev/null
+++ b/src/rocksdb/util/filelock_test.cc
@@ -0,0 +1,148 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include <fcntl.h>
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#ifdef __FreeBSD__
+#include <sys/types.h>
+#include <sys/wait.h>
+#endif
+#include <vector>
+
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LockTest : public testing::Test {
+ public:
+  static LockTest* current_;
+  std::string file_;
+  ROCKSDB_NAMESPACE::Env* env_;
+
+  LockTest()
+      : file_(test::PerThreadDBPath("db_testlock_file")),
+        env_(ROCKSDB_NAMESPACE::Env::Default()) {
+    current_ = this;
+  }
+
+  ~LockTest() override {}
+
+  Status LockFile(FileLock** db_lock) { return env_->LockFile(file_, db_lock); }
+
+  Status UnlockFile(FileLock* db_lock) { return env_->UnlockFile(db_lock); }
+
+  bool AssertFileIsLocked() {
+    return CheckFileLock(/* lock_expected = */ true);
+  }
+
+  bool AssertFileIsNotLocked() {
+    return CheckFileLock(/* lock_expected = */ false);
+  }
+
+  bool CheckFileLock(bool lock_expected) {
+    // We need to fork to check the fcntl lock as we need
+    // to open and close the file from a different process
+    // to avoid either releasing the lock on close, or not
+    // contending for it when requesting a lock.
+
+#ifdef OS_WIN
+
+    // WaitForSingleObject and GetExitCodeProcess can do what waitpid does.
+    // TODO - implement on Windows
+    return true;
+
+#else
+
+    pid_t pid = fork();
+    if (0 == pid) {
+      // child process
+      int exit_val = EXIT_FAILURE;
+      int fd = open(file_.c_str(), O_RDWR | O_CREAT, 0644);
+      if (fd < 0) {
+        // could not open file, could not check if it was locked
+        fprintf(stderr, "Open on on file %s failed.\n", file_.c_str());
+        exit(exit_val);
+      }
+
+      struct flock f;
+      memset(&f, 0, sizeof(f));
+      f.l_type = (F_WRLCK);
+      f.l_whence = SEEK_SET;
+      f.l_start = 0;
+      f.l_len = 0;  // Lock/unlock entire file
+      int value = fcntl(fd, F_SETLK, &f);
+      if (value == -1) {
+        if (lock_expected) {
+          exit_val = EXIT_SUCCESS;
+        }
+      } else {
+        if (!lock_expected) {
+          exit_val = EXIT_SUCCESS;
+        }
+      }
+      close(fd);  // lock is released for child process
+      exit(exit_val);
+    } else if (pid > 0) {
+      // parent process
+      int status;
+      while (-1 == waitpid(pid, &status, 0))
+        ;
+      if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+        // child process exited with non success status
+        return false;
+      } else {
+        return true;
+      }
+    } else {
+      fprintf(stderr, "Fork failed\n");
+      return false;
+    }
+    return false;
+
+#endif
+  }
+};
+LockTest* LockTest::current_;
+
+TEST_F(LockTest, LockBySameThread) {
+  FileLock* lock1;
+  FileLock* lock2;
+
+  // acquire a lock on a file
+  ASSERT_OK(LockFile(&lock1));
+
+  // check the file is locked
+  ASSERT_TRUE(AssertFileIsLocked());
+
+  // re-acquire the lock on the same file. This should fail.
+  Status s = LockFile(&lock2);
+  ASSERT_TRUE(s.IsIOError());
+#ifndef OS_WIN
+  // Validate that error message contains current thread ID.
+  ASSERT_TRUE(s.ToString().find(std::to_string(
+                  Env::Default()->GetThreadID())) != std::string::npos);
+#endif
+
+  // check the file is locked
+  ASSERT_TRUE(AssertFileIsLocked());
+
+  // release the lock
+  ASSERT_OK(UnlockFile(lock1));
+
+  // check the file is not locked
+  ASSERT_TRUE(AssertFileIsNotLocked());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/filter_bench.cc b/src/rocksdb/util/filter_bench.cc
new file mode 100644
index 000000000..93186cd08
--- /dev/null
+++ b/src/rocksdb/util/filter_bench.cc
@@ -0,0 +1,840 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
+#include <cstdio>
+int main() {
+  fprintf(stderr, "filter_bench requires gflags and !ROCKSDB_LITE\n");
+  return 1;
+}
+#else
+
+#include <cinttypes>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "memory/arena.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/block_based/full_filter_block.h"
+#include "table/block_based/mock_block_based_table.h"
+#include "table/plain/plain_table_bloom.h"
+#include "util/cast_util.h"
+#include "util/gflags_compat.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/stderr_logger.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::RegisterFlagValidator;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+DEFINE_uint32(seed, 0, "Seed for random number generators");
+
+DEFINE_double(working_mem_size_mb, 200,
+              "MB of memory to get up to among all filters, unless "
+              "m_keys_total_max is specified.");
+
+DEFINE_uint32(average_keys_per_filter, 10000,
+              "Average number of keys per filter");
+
+DEFINE_double(vary_key_count_ratio, 0.4,
+              "Vary number of keys by up to +/- vary_key_count_ratio * "
+              "average_keys_per_filter.");
+
+DEFINE_uint32(key_size, 24, "Average number of bytes for each key");
+
+DEFINE_bool(vary_key_alignment, true,
+            "Whether to vary key alignment (default: at least 32-bit "
+            "alignment)");
+
+DEFINE_uint32(vary_key_size_log2_interval, 5,
+              "Use same key size 2^n times, then change. Key size varies from "
+              "-2 to +2 bytes vs. average, unless n>=30 to fix key size.");
+
+DEFINE_uint32(batch_size, 8, "Number of keys to group in each batch");
+
+DEFINE_double(bits_per_key, 10.0, "Bits per key setting for filters");
+
+DEFINE_double(m_queries, 200, "Millions of queries for each test mode");
+
+DEFINE_double(m_keys_total_max, 0,
+              "Maximum total keys added to filters, in millions. "
+              "0 (default) disables. Non-zero overrides working_mem_size_mb "
+              "option.");
+
+DEFINE_bool(use_full_block_reader, false,
+            "Use FullFilterBlockReader interface rather than FilterBitsReader");
+
+DEFINE_bool(use_plain_table_bloom, false,
+            "Use PlainTableBloom structure and interface rather than "
+            "FilterBitsReader/FullFilterBlockReader");
+
+DEFINE_bool(new_builder, false,
+            "Whether to create a new builder for each new filter");
+
+DEFINE_uint32(impl, 0,
+              "Select filter implementation. Without -use_plain_table_bloom:"
+              "0 = legacy full Bloom filter, "
+              "1 = format_version 5 Bloom filter, 2 = Ribbon128 filter. With "
+              "-use_plain_table_bloom: 0 = no locality, 1 = locality.");
+
+DEFINE_bool(net_includes_hashing, false,
+            "Whether query net ns/op times should include hashing. "
+            "(if not, dry run will include hashing) "
+            "(build times always include hashing)");
+
+DEFINE_bool(optimize_filters_for_memory, false,
+            "Setting for BlockBasedTableOptions::optimize_filters_for_memory");
+
+DEFINE_bool(detect_filter_construct_corruption, false,
+            "Setting for "
+            "BlockBasedTableOptions::detect_filter_construct_corruption");
+
+DEFINE_uint32(block_cache_capacity_MB, 8,
+              "Setting for "
+              "LRUCacheOptions::capacity");
+
+DEFINE_bool(charge_filter_construction, false,
+            "Setting for "
+            "CacheEntryRoleOptions::charged of"
+            "CacheEntryRole::kFilterConstruction");
+
+DEFINE_bool(strict_capacity_limit, false,
+            "Setting for "
+            "LRUCacheOptions::strict_capacity_limit");
+
+DEFINE_bool(quick, false, "Run more limited set of tests, fewer queries");
+
+DEFINE_bool(best_case, false, "Run limited tests only for best-case");
+
+DEFINE_bool(allow_bad_fp_rate, false, "Continue even if FP rate is bad");
+
+DEFINE_bool(legend, false,
+            "Print more information about interpreting results instead of "
+            "running tests");
+
+DEFINE_uint32(runs, 1, "Number of times to rebuild and run benchmark tests");
+
+void _always_assert_fail(int line, const char *file, const char *expr) {
+  fprintf(stderr, "%s: %d: Assertion %s failed\n", file, line, expr);
+  abort();
+}
+
+#define ALWAYS_ASSERT(cond) \
+  ((cond) ? (void)0 : ::_always_assert_fail(__LINE__, __FILE__, #cond))
+
+#ifndef NDEBUG
+// This could affect build times enough that we should not include it for
+// accurate speed tests
+#define PREDICT_FP_RATE
+#endif
+
+using ROCKSDB_NAMESPACE::Arena;
+using ROCKSDB_NAMESPACE::BlockContents;
+using ROCKSDB_NAMESPACE::BloomFilterPolicy;
+using ROCKSDB_NAMESPACE::BloomHash;
+using ROCKSDB_NAMESPACE::BloomLikeFilterPolicy;
+using ROCKSDB_NAMESPACE::BuiltinFilterBitsBuilder;
+using ROCKSDB_NAMESPACE::CachableEntry;
+using ROCKSDB_NAMESPACE::Cache;
+using ROCKSDB_NAMESPACE::CacheEntryRole;
+using ROCKSDB_NAMESPACE::CacheEntryRoleOptions;
+using ROCKSDB_NAMESPACE::EncodeFixed32;
+using ROCKSDB_NAMESPACE::Env;
+using ROCKSDB_NAMESPACE::FastRange32;
+using ROCKSDB_NAMESPACE::FilterBitsReader;
+using ROCKSDB_NAMESPACE::FilterBuildingContext;
+using ROCKSDB_NAMESPACE::FilterPolicy;
+using ROCKSDB_NAMESPACE::FullFilterBlockReader;
+using ROCKSDB_NAMESPACE::GetSliceHash;
+using ROCKSDB_NAMESPACE::GetSliceHash64;
+using ROCKSDB_NAMESPACE::Lower32of64;
+using ROCKSDB_NAMESPACE::LRUCacheOptions;
+using ROCKSDB_NAMESPACE::ParsedFullFilterBlock;
+using ROCKSDB_NAMESPACE::PlainTableBloomV1;
+using ROCKSDB_NAMESPACE::Random32;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::static_cast_with_check;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::StderrLogger;
+using ROCKSDB_NAMESPACE::mock::MockBlockBasedTableTester;
+
+struct KeyMaker {
+  KeyMaker(size_t avg_size)
+      : smallest_size_(avg_size -
+                       (FLAGS_vary_key_size_log2_interval >= 30 ? 2 : 0)),
+        buf_size_(avg_size + 11),  // pad to vary key size and alignment
+        buf_(new char[buf_size_]) {
+    memset(buf_.get(), 0, buf_size_);
+    assert(smallest_size_ > 8);
+  }
+  size_t smallest_size_;
+  size_t buf_size_;
+  std::unique_ptr<char[]> buf_;
+
+  // Returns a unique(-ish) key based on the given parameter values. Each
+  // call returns a Slice from the same buffer so previously returned
+  // Slices should be considered invalidated.
+  Slice Get(uint32_t filter_num, uint32_t val_num) {
+    size_t start = FLAGS_vary_key_alignment ? val_num % 4 : 0;
+    size_t len = smallest_size_;
+    if (FLAGS_vary_key_size_log2_interval < 30) {
+      // To get range [avg_size - 2, avg_size + 2]
+      // use range [smallest_size, smallest_size + 4]
+      len += FastRange32(
+          (val_num >> FLAGS_vary_key_size_log2_interval) * 1234567891, 5);
+    }
+    char *data = buf_.get() + start;
+    // Populate key data such that all data makes it into a key of at
+    // least 8 bytes. We also don't want all the within-filter key
+    // variance confined to a contiguous 32 bits, because then a 32 bit
+    // hash function can "cheat" the false positive rate by
+    // approximating a perfect hash.
+    EncodeFixed32(data, val_num);
+    EncodeFixed32(data + 4, filter_num + val_num);
+    // ensure clearing leftovers from different alignment
+    EncodeFixed32(data + 8, 0);
+    return Slice(data, len);
+  }
+};
+
+void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+  fprintf(stdout,
+          "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
+#endif
+#ifndef NDEBUG
+  fprintf(stdout,
+          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+}
+
+void PrintError(const char *error) { fprintf(stderr, "ERROR: %s\n", error); }
+
+struct FilterInfo {
+  uint32_t filter_id_ = 0;
+  std::unique_ptr<const char[]> owner_;
+  Slice filter_;
+  Status filter_construction_status = Status::OK();
+  uint32_t keys_added_ = 0;
+  std::unique_ptr<FilterBitsReader> reader_;
+  std::unique_ptr<FullFilterBlockReader> full_block_reader_;
+  std::unique_ptr<PlainTableBloomV1> plain_table_bloom_;
+  uint64_t outside_queries_ = 0;
+  uint64_t false_positives_ = 0;
+};
+
+enum TestMode {
+  kSingleFilter,
+  kBatchPrepared,
+  kBatchUnprepared,
+  kFiftyOneFilter,
+  kEightyTwentyFilter,
+  kRandomFilter,
+};
+
+static const std::vector<TestMode> allTestModes = {
+    kSingleFilter,   kBatchPrepared,      kBatchUnprepared,
+    kFiftyOneFilter, kEightyTwentyFilter, kRandomFilter,
+};
+
+static const std::vector<TestMode> quickTestModes = {
+    kSingleFilter,
+    kRandomFilter,
+};
+
+static const std::vector<TestMode> bestCaseTestModes = {
+    kSingleFilter,
+};
+
+const char *TestModeToString(TestMode tm) {
+  switch (tm) {
+    case kSingleFilter:
+      return "Single filter";
+    case kBatchPrepared:
+      return "Batched, prepared";
+    case kBatchUnprepared:
+      return "Batched, unprepared";
+    case kFiftyOneFilter:
+      return "Skewed 50% in 1%";
+    case kEightyTwentyFilter:
+      return "Skewed 80% in 20%";
+    case kRandomFilter:
+      return "Random filter";
+  }
+  return "Bad TestMode";
+}
+
+// Do just enough to keep some data dependence for the
+// compiler / CPU
+static uint32_t DryRunNoHash(Slice &s) {
+  uint32_t sz = static_cast<uint32_t>(s.size());
+  if (sz >= 4) {
+    return sz + s.data()[3];
+  } else {
+    return sz;
+  }
+}
+
+static uint32_t DryRunHash32(Slice &s) {
+  // Same perf characteristics as GetSliceHash()
+  return BloomHash(s);
+}
+
+static uint32_t DryRunHash64(Slice &s) {
+  return Lower32of64(GetSliceHash64(s));
+}
+
+const std::shared_ptr<const FilterPolicy> &GetPolicy() {
+  static std::shared_ptr<const FilterPolicy> policy;
+  if (!policy) {
+    policy = BloomLikeFilterPolicy::Create(
+        BloomLikeFilterPolicy::GetAllFixedImpls().at(FLAGS_impl),
+        FLAGS_bits_per_key);
+  }
+  return policy;
+}
+
+struct FilterBench : public MockBlockBasedTableTester {
+  std::vector<KeyMaker> kms_;
+  std::vector<FilterInfo> infos_;
+  Random32 random_;
+  std::ostringstream fp_rate_report_;
+  Arena arena_;
+  double m_queries_;
+  StderrLogger stderr_logger_;
+
+  FilterBench()
+      : MockBlockBasedTableTester(GetPolicy()),
+        random_(FLAGS_seed),
+        m_queries_(0) {
+    for (uint32_t i = 0; i < FLAGS_batch_size; ++i) {
+      kms_.emplace_back(FLAGS_key_size < 8 ? 8 : FLAGS_key_size);
+    }
+    ioptions_.logger = &stderr_logger_;
+    table_options_.optimize_filters_for_memory =
+        FLAGS_optimize_filters_for_memory;
+    table_options_.detect_filter_construct_corruption =
+        FLAGS_detect_filter_construct_corruption;
+    table_options_.cache_usage_options.options_overrides.insert(
+        {CacheEntryRole::kFilterConstruction,
+         {/*.charged = */ FLAGS_charge_filter_construction
+              ? CacheEntryRoleOptions::Decision::kEnabled
+              : CacheEntryRoleOptions::Decision::kDisabled}});
+    if (FLAGS_charge_filter_construction) {
+      table_options_.no_block_cache = false;
+      LRUCacheOptions lo;
+      lo.capacity = FLAGS_block_cache_capacity_MB * 1024 * 1024;
+      lo.num_shard_bits = 0;  // 2^0 shard
+      lo.strict_capacity_limit = FLAGS_strict_capacity_limit;
+      std::shared_ptr<Cache> cache(NewLRUCache(lo));
+      table_options_.block_cache = cache;
+    }
+  }
+
+  void Go();
+
+  double RandomQueryTest(uint32_t inside_threshold, bool dry_run,
+                         TestMode mode);
+};
+
+void FilterBench::Go() {
+  if (FLAGS_use_plain_table_bloom && FLAGS_use_full_block_reader) {
+    throw std::runtime_error(
+        "Can't combine -use_plain_table_bloom and -use_full_block_reader");
+  }
+  if (FLAGS_use_plain_table_bloom) {
+    if (FLAGS_impl > 1) {
+      throw std::runtime_error(
+          "-impl must currently be >= 0 and <= 1 for Plain table");
+    }
+  } else {
+    if (FLAGS_impl > 2) {
+      throw std::runtime_error(
+          "-impl must currently be >= 0 and <= 2 for Block-based table");
+    }
+  }
+
+  if (FLAGS_vary_key_count_ratio < 0.0 || FLAGS_vary_key_count_ratio > 1.0) {
+    throw std::runtime_error("-vary_key_count_ratio must be >= 0.0 and <= 1.0");
+  }
+
+  // For example, average_keys_per_filter = 100, vary_key_count_ratio = 0.1.
+  // Varys up to +/- 10 keys. variance_range = 21 (generating value 0..20).
+  // variance_offset = 10, so value - offset average value is always 0.
+  const uint32_t variance_range =
+      1 + 2 * static_cast<uint32_t>(FLAGS_vary_key_count_ratio *
+                                    FLAGS_average_keys_per_filter);
+  const uint32_t variance_offset = variance_range / 2;
+
+  const std::vector<TestMode> &testModes = FLAGS_best_case ? bestCaseTestModes
+                                           : FLAGS_quick   ? quickTestModes
+                                                           : allTestModes;
+
+  m_queries_ = FLAGS_m_queries;
+  double working_mem_size_mb = FLAGS_working_mem_size_mb;
+  if (FLAGS_quick) {
+    m_queries_ /= 7.0;
+  } else if (FLAGS_best_case) {
+    m_queries_ /= 3.0;
+    working_mem_size_mb /= 10.0;
+  }
+
+  std::cout << "Building..." << std::endl;
+
+  std::unique_ptr<BuiltinFilterBitsBuilder> builder;
+
+  size_t total_memory_used = 0;
+  size_t total_size = 0;
+  size_t total_keys_added = 0;
+#ifdef PREDICT_FP_RATE
+  double weighted_predicted_fp_rate = 0.0;
+#endif
+  size_t max_total_keys;
+  size_t max_mem;
+  if (FLAGS_m_keys_total_max > 0) {
+    max_total_keys = static_cast<size_t>(1000000 * FLAGS_m_keys_total_max);
+    max_mem = SIZE_MAX;
+  } else {
+    max_total_keys = SIZE_MAX;
+    max_mem = static_cast<size_t>(1024 * 1024 * working_mem_size_mb);
+  }
+
+  ROCKSDB_NAMESPACE::StopWatchNano timer(
+      ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
+
+  infos_.clear();
+  while ((working_mem_size_mb == 0 || total_size < max_mem) &&
+         total_keys_added < max_total_keys) {
+    uint32_t filter_id = random_.Next();
+    uint32_t keys_to_add = FLAGS_average_keys_per_filter +
+                           FastRange32(random_.Next(), variance_range) -
+                           variance_offset;
+    if (max_total_keys - total_keys_added < keys_to_add) {
+      keys_to_add = static_cast<uint32_t>(max_total_keys - total_keys_added);
+    }
+    infos_.emplace_back();
+    FilterInfo &info = infos_.back();
+    info.filter_id_ = filter_id;
+    info.keys_added_ = keys_to_add;
+    if (FLAGS_use_plain_table_bloom) {
+      info.plain_table_bloom_.reset(new PlainTableBloomV1());
+      info.plain_table_bloom_->SetTotalBits(
+          &arena_, static_cast<uint32_t>(keys_to_add * FLAGS_bits_per_key),
+          FLAGS_impl, 0 /*huge_page*/, nullptr /*logger*/);
+      for (uint32_t i = 0; i < keys_to_add; ++i) {
+        uint32_t hash = GetSliceHash(kms_[0].Get(filter_id, i));
+        info.plain_table_bloom_->AddHash(hash);
+      }
+      info.filter_ = info.plain_table_bloom_->GetRawData();
+    } else {
+      if (!builder) {
+        builder.reset(
+            static_cast_with_check<BuiltinFilterBitsBuilder>(GetBuilder()));
+      }
+      for (uint32_t i = 0; i < keys_to_add; ++i) {
+        builder->AddKey(kms_[0].Get(filter_id, i));
+      }
+      info.filter_ =
+          builder->Finish(&info.owner_, &info.filter_construction_status);
+      if (info.filter_construction_status.ok()) {
+        info.filter_construction_status =
+            builder->MaybePostVerify(info.filter_);
+      }
+      if (!info.filter_construction_status.ok()) {
+        PrintError(info.filter_construction_status.ToString().c_str());
+      }
+#ifdef PREDICT_FP_RATE
+      weighted_predicted_fp_rate +=
+          keys_to_add *
+          builder->EstimatedFpRate(keys_to_add, info.filter_.size());
+#endif
+      if (FLAGS_new_builder) {
+        builder.reset();
+      }
+      info.reader_.reset(
+          table_options_.filter_policy->GetFilterBitsReader(info.filter_));
+      CachableEntry<ParsedFullFilterBlock> block(
+          new ParsedFullFilterBlock(table_options_.filter_policy.get(),
+                                    BlockContents(info.filter_)),
+          nullptr /* cache */, nullptr /* cache_handle */,
+          true /* own_value */);
+      info.full_block_reader_.reset(
+          new FullFilterBlockReader(table_.get(), std::move(block)));
+    }
+    total_size += info.filter_.size();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    total_memory_used +=
+        malloc_usable_size(const_cast<char *>(info.filter_.data()));
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    total_keys_added += keys_to_add;
+  }
+
+  uint64_t elapsed_nanos = timer.ElapsedNanos();
+  double ns = double(elapsed_nanos) / total_keys_added;
+  std::cout << "Build avg ns/key: " << ns << std::endl;
+  std::cout << "Number of filters: " << infos_.size() << std::endl;
+  std::cout << "Total size (MB): " << total_size / 1024.0 / 1024.0 << std::endl;
+  if (total_memory_used > 0) {
+    std::cout << "Reported total allocated memory (MB): "
+              << total_memory_used / 1024.0 / 1024.0 << std::endl;
+    std::cout << "Reported internal fragmentation: "
+              << (total_memory_used - total_size) * 100.0 / total_size << "%"
+              << std::endl;
+  }
+
+  double bpk = total_size * 8.0 / total_keys_added;
+  std::cout << "Bits/key stored: " << bpk << std::endl;
+#ifdef PREDICT_FP_RATE
+  std::cout << "Predicted FP rate %: "
+            << 100.0 * (weighted_predicted_fp_rate / total_keys_added)
+            << std::endl;
+#endif
+  if (!FLAGS_quick && !FLAGS_best_case) {
+    double tolerable_rate = std::pow(2.0, -(bpk - 1.0) / (1.4 + bpk / 50.0));
+    std::cout << "Best possible FP rate %: " << 100.0 * std::pow(2.0, -bpk)
+              << std::endl;
+    std::cout << "Tolerable FP rate %: " << 100.0 * tolerable_rate << std::endl;
+
+    std::cout << "----------------------------" << std::endl;
+    std::cout << "Verifying..." << std::endl;
+
+    uint32_t outside_q_per_f =
+        static_cast<uint32_t>(m_queries_ * 1000000 / infos_.size());
+    uint64_t fps = 0;
+    for (uint32_t i = 0; i < infos_.size(); ++i) {
+      FilterInfo &info = infos_[i];
+      for (uint32_t j = 0; j < info.keys_added_; ++j) {
+        if (FLAGS_use_plain_table_bloom) {
+          uint32_t hash = GetSliceHash(kms_[0].Get(info.filter_id_, j));
+          ALWAYS_ASSERT(info.plain_table_bloom_->MayContainHash(hash));
+        } else {
+          ALWAYS_ASSERT(
+              info.reader_->MayMatch(kms_[0].Get(info.filter_id_, j)));
+        }
+      }
+      for (uint32_t j = 0; j < outside_q_per_f; ++j) {
+        if (FLAGS_use_plain_table_bloom) {
+          uint32_t hash =
+              GetSliceHash(kms_[0].Get(info.filter_id_, j | 0x80000000));
+          fps += info.plain_table_bloom_->MayContainHash(hash);
+        } else {
+          fps += info.reader_->MayMatch(
+              kms_[0].Get(info.filter_id_, j | 0x80000000));
+        }
+      }
+    }
+    std::cout << " No FNs :)" << std::endl;
+    double prelim_rate = double(fps) / outside_q_per_f / infos_.size();
+    std::cout << " Prelim FP rate %: " << (100.0 * prelim_rate) << std::endl;
+
+    if (!FLAGS_allow_bad_fp_rate) {
+      ALWAYS_ASSERT(prelim_rate < tolerable_rate);
+    }
+  }
+
+  std::cout << "----------------------------" << std::endl;
+  std::cout << "Mixed inside/outside queries..." << std::endl;
+  // 50% each inside and outside
+  uint32_t inside_threshold = UINT32_MAX / 2;
+  for (TestMode tm : testModes) {
+    random_.Seed(FLAGS_seed + 1);
+    double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
+    random_.Seed(FLAGS_seed + 1);
+    double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
+    std::cout << "  " << TestModeToString(tm) << " net ns/op: " << (f - d)
+              << std::endl;
+  }
+
+  if (!FLAGS_quick) {
+    std::cout << "----------------------------" << std::endl;
+    std::cout << "Inside queries (mostly)..." << std::endl;
+    // Do about 95% inside queries rather than 100% so that branch predictor
+    // can't give itself an artifically crazy advantage.
+    inside_threshold = UINT32_MAX / 20 * 19;
+    for (TestMode tm : testModes) {
+      random_.Seed(FLAGS_seed + 1);
+      double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
+      random_.Seed(FLAGS_seed + 1);
+      double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
+      std::cout << "  " << TestModeToString(tm) << " net ns/op: " << (f - d)
+                << std::endl;
+    }
+
+    std::cout << "----------------------------" << std::endl;
+    std::cout << "Outside queries (mostly)..." << std::endl;
+    // Do about 95% outside queries rather than 100% so that branch predictor
+    // can't give itself an artifically crazy advantage.
+    inside_threshold = UINT32_MAX / 20;
+    for (TestMode tm : testModes) {
+      random_.Seed(FLAGS_seed + 2);
+      double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm);
+      random_.Seed(FLAGS_seed + 2);
+      double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm);
+      std::cout << "  " << TestModeToString(tm) << " net ns/op: " << (f - d)
+                << std::endl;
+    }
+  }
+  std::cout << fp_rate_report_.str();
+
+  std::cout << "----------------------------" << std::endl;
+  std::cout << "Done. (For more info, run with -legend or -help.)" << std::endl;
+}
+
+double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
+                                    TestMode mode) {
+  for (auto &info : infos_) {
+    info.outside_queries_ = 0;
+    info.false_positives_ = 0;
+  }
+
+  auto dry_run_hash_fn = DryRunNoHash;
+  if (!FLAGS_net_includes_hashing) {
+    if (FLAGS_impl == 0 || FLAGS_use_plain_table_bloom) {
+      dry_run_hash_fn = DryRunHash32;
+    } else {
+      dry_run_hash_fn = DryRunHash64;
+    }
+  }
+
+  uint32_t num_infos = static_cast<uint32_t>(infos_.size());
+  uint32_t dry_run_hash = 0;
+  uint64_t max_queries = static_cast<uint64_t>(m_queries_ * 1000000 + 0.50);
+  // Some filters may be considered secondary in order to implement skewed
+  // queries. num_primary_filters is the number that are to be treated as
+  // equal, and any remainder will be treated as secondary.
+  uint32_t num_primary_filters = num_infos;
+  // The proportion (when divided by 2^32 - 1) of filter queries going to
+  // the primary filters (default = all). The remainder of queries are
+  // against secondary filters.
+  uint32_t primary_filter_threshold = 0xffffffff;
+  if (mode == kSingleFilter) {
+    // 100% of queries to 1 filter
+    num_primary_filters = 1;
+  } else if (mode == kFiftyOneFilter) {
+    if (num_infos < 50) {
+      return 0.0;  // skip
+    }
+    // 50% of queries
+    primary_filter_threshold /= 2;
+    // to 1% of filters
+    num_primary_filters = (num_primary_filters + 99) / 100;
+  } else if (mode == kEightyTwentyFilter) {
+    if (num_infos < 5) {
+      return 0.0;  // skip
+    }
+    // 80% of queries
+    primary_filter_threshold = primary_filter_threshold / 5 * 4;
+    // to 20% of filters
+    num_primary_filters = (num_primary_filters + 4) / 5;
+  } else if (mode == kRandomFilter) {
+    if (num_infos == 1) {
+      return 0.0;  // skip
+    }
+  }
+  uint32_t batch_size = 1;
+  std::unique_ptr<Slice[]> batch_slices;
+  std::unique_ptr<Slice *[]> batch_slice_ptrs;
+  std::unique_ptr<bool[]> batch_results;
+  if (mode == kBatchPrepared || mode == kBatchUnprepared) {
+    batch_size = static_cast<uint32_t>(kms_.size());
+  }
+
+  batch_slices.reset(new Slice[batch_size]);
+  batch_slice_ptrs.reset(new Slice *[batch_size]);
+  batch_results.reset(new bool[batch_size]);
+  for (uint32_t i = 0; i < batch_size; ++i) {
+    batch_results[i] = false;
+    batch_slice_ptrs[i] = &batch_slices[i];
+  }
+
+  ROCKSDB_NAMESPACE::StopWatchNano timer(
+      ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
+
+  for (uint64_t q = 0; q < max_queries; q += batch_size) {
+    bool inside_this_time = random_.Next() <= inside_threshold;
+
+    uint32_t filter_index;
+    if (random_.Next() <= primary_filter_threshold) {
+      filter_index = random_.Uniformish(num_primary_filters);
+    } else {
+      // secondary
+      filter_index = num_primary_filters +
+                     random_.Uniformish(num_infos - num_primary_filters);
+    }
+    FilterInfo &info = infos_[filter_index];
+    for (uint32_t i = 0; i < batch_size; ++i) {
+      if (inside_this_time) {
+        batch_slices[i] =
+            kms_[i].Get(info.filter_id_, random_.Uniformish(info.keys_added_));
+      } else {
+        batch_slices[i] =
+            kms_[i].Get(info.filter_id_, random_.Uniformish(info.keys_added_) |
+                                             uint32_t{0x80000000});
+        info.outside_queries_++;
+      }
+    }
+    // TODO: implement batched interface to full block reader
+    // TODO: implement batched interface to plain table bloom
+    if (mode == kBatchPrepared && !FLAGS_use_full_block_reader &&
+        !FLAGS_use_plain_table_bloom) {
+      for (uint32_t i = 0; i < batch_size; ++i) {
+        batch_results[i] = false;
+      }
+      if (dry_run) {
+        for (uint32_t i = 0; i < batch_size; ++i) {
+          batch_results[i] = true;
+          dry_run_hash += dry_run_hash_fn(batch_slices[i]);
+        }
+      } else {
+        info.reader_->MayMatch(batch_size, batch_slice_ptrs.get(),
+                               batch_results.get());
+      }
+      for (uint32_t i = 0; i < batch_size; ++i) {
+        if (inside_this_time) {
+          ALWAYS_ASSERT(batch_results[i]);
+        } else {
+          info.false_positives_ += batch_results[i];
+        }
+      }
+    } else {
+      for (uint32_t i = 0; i < batch_size; ++i) {
+        bool may_match;
+        if (FLAGS_use_plain_table_bloom) {
+          if (dry_run) {
+            dry_run_hash += dry_run_hash_fn(batch_slices[i]);
+            may_match = true;
+          } else {
+            uint32_t hash = GetSliceHash(batch_slices[i]);
+            may_match = info.plain_table_bloom_->MayContainHash(hash);
+          }
+        } else if (FLAGS_use_full_block_reader) {
+          if (dry_run) {
+            dry_run_hash += dry_run_hash_fn(batch_slices[i]);
+            may_match = true;
+          } else {
+            may_match = info.full_block_reader_->KeyMayMatch(
+                batch_slices[i],
+                /*no_io=*/false, /*const_ikey_ptr=*/nullptr,
+                /*get_context=*/nullptr,
+                /*lookup_context=*/nullptr, Env::IO_TOTAL);
+          }
+        } else {
+          if (dry_run) {
+            dry_run_hash += dry_run_hash_fn(batch_slices[i]);
+            may_match = true;
+          } else {
+            may_match = info.reader_->MayMatch(batch_slices[i]);
+          }
+        }
+        if (inside_this_time) {
+          ALWAYS_ASSERT(may_match);
+        } else {
+          info.false_positives_ += may_match;
+        }
+      }
+    }
+  }
+
+  uint64_t elapsed_nanos = timer.ElapsedNanos();
+  double ns = double(elapsed_nanos) / max_queries;
+
+  if (!FLAGS_quick) {
+    if (dry_run) {
+      // Printing part of hash prevents dry run components from being optimized
+      // away by compiler
+      std::cout << "    Dry run (" << std::hex << (dry_run_hash & 0xfffff)
+                << std::dec << ") ";
+    } else {
+      std::cout << "    Gross filter    ";
+    }
+    std::cout << "ns/op: " << ns << std::endl;
+  }
+
+  if (!dry_run) {
+    fp_rate_report_.str("");
+    uint64_t q = 0;
+    uint64_t fp = 0;
+    double worst_fp_rate = 0.0;
+    double best_fp_rate = 1.0;
+    for (auto &info : infos_) {
+      q += info.outside_queries_;
+      fp += info.false_positives_;
+      if (info.outside_queries_ > 0) {
+        double fp_rate = double(info.false_positives_) / info.outside_queries_;
+        worst_fp_rate = std::max(worst_fp_rate, fp_rate);
+        best_fp_rate = std::min(best_fp_rate, fp_rate);
+      }
+    }
+    fp_rate_report_ << "    Average FP rate %: " << 100.0 * fp / q << std::endl;
+    if (!FLAGS_quick && !FLAGS_best_case) {
+      fp_rate_report_ << "    Worst   FP rate %: " << 100.0 * worst_fp_rate
+                      << std::endl;
+      fp_rate_report_ << "    Best    FP rate %: " << 100.0 * best_fp_rate
+                      << std::endl;
+      fp_rate_report_ << "    Best possible bits/key: "
+                      << -std::log(double(fp) / q) / std::log(2.0) << std::endl;
+    }
+  }
+  return ns;
+}
+
+int main(int argc, char **argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [-quick] [OTHER OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  PrintWarnings();
+
+  if (FLAGS_legend) {
+    std::cout
+        << "Legend:" << std::endl
+        << "  \"Inside\" - key that was added to filter" << std::endl
+        << "  \"Outside\" - key that was not added to filter" << std::endl
+        << "  \"FN\" - false negative query (must not happen)" << std::endl
+        << "  \"FP\" - false positive query (OK at low rate)" << std::endl
+        << "  \"Dry run\" - cost of testing and hashing overhead." << std::endl
+        << "  \"Gross filter\" - cost of filter queries including testing "
+        << "\n     and hashing overhead." << std::endl
+        << "  \"net\" - best estimate of time in filter operation, without "
+        << "\n     testing and hashing overhead (gross filter - dry run)"
+        << std::endl
+        << "  \"ns/op\" - nanoseconds per operation (key query or add)"
+        << std::endl
+        << "  \"Single filter\" - essentially minimum cost, assuming filter"
+        << "\n     fits easily in L1 CPU cache." << std::endl
+        << "  \"Batched, prepared\" - several queries at once against a"
+        << "\n     randomly chosen filter, using multi-query interface."
+        << std::endl
+        << "  \"Batched, unprepared\" - similar, but using serial calls"
+        << "\n     to single query interface." << std::endl
+        << "  \"Random filter\" - a filter is chosen at random as target"
+        << "\n     of each query." << std::endl
+        << "  \"Skewed X% in Y%\" - like \"Random filter\" except Y% of"
+        << "\n      the filters are designated as \"hot\" and receive X%"
+        << "\n      of queries." << std::endl;
+  } else {
+    FilterBench b;
+    for (uint32_t i = 0; i < FLAGS_runs; ++i) {
+      b.Go();
+      FLAGS_seed += 100;
+      b.random_.Seed(FLAGS_seed);
+    }
+  }
+
+  return 0;
+}
+
+#endif  // !defined(GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/util/gflags_compat.h b/src/rocksdb/util/gflags_compat.h
new file mode 100644
index 000000000..b6f88a5bc
--- /dev/null
+++ b/src/rocksdb/util/gflags_compat.h
@@ -0,0 +1,30 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <gflags/gflags.h>
+
+#include <functional>
+
+#ifndef GFLAGS_NAMESPACE
+// in case it's not defined in old versions, that's probably because it was
+// still google by default.
+#define GFLAGS_NAMESPACE google
+#endif
+
+#ifndef DEFINE_uint32
+// DEFINE_uint32 does not appear in older versions of gflags. This should be
+// a sane definition for those versions.
+#include <cstdint>
+#define DEFINE_uint32(name, val, txt)             \
+  namespace gflags_compat {                       \
+  DEFINE_int32(name, val, txt);                   \
+  }                                               \
+  std::reference_wrapper<uint32_t> FLAGS_##name = \
+      std::ref(*reinterpret_cast<uint32_t *>(&gflags_compat::FLAGS_##name));
+
+#define DECLARE_uint32(name) \
+  extern std::reference_wrapper<uint32_t> FLAGS_##name;
+#endif  // !DEFINE_uint32
diff --git a/src/rocksdb/util/hash.cc b/src/rocksdb/util/hash.cc
new file mode 100644
index 000000000..0f7f2edc1
--- /dev/null
+++ b/src/rocksdb/util/hash.cc
@@ -0,0 +1,201 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/hash.h"
+
+#include <string>
+
+#include "port/lang.h"
+#include "util/coding.h"
+#include "util/hash128.h"
+#include "util/math128.h"
+#include "util/xxhash.h"
+#include "util/xxph3.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&) = &GetSliceHash64;
+
+uint32_t Hash(const char* data, size_t n, uint32_t seed) {
+  // MurmurHash1 - fast but mediocre quality
+  // https://github.com/aappleby/smhasher/wiki/MurmurHash1
+  //
+  const uint32_t m = 0xc6a4a793;
+  const uint32_t r = 24;
+  const char* limit = data + n;
+  uint32_t h = static_cast<uint32_t>(seed ^ (n * m));
+
+  // Pick up four bytes at a time
+  while (data + 4 <= limit) {
+    uint32_t w = DecodeFixed32(data);
+    data += 4;
+    h += w;
+    h *= m;
+    h ^= (h >> 16);
+  }
+
+  // Pick up remaining bytes
+  switch (limit - data) {
+    // Note: The original hash implementation used data[i] << shift, which
+    // promotes the char to int and then performs the shift. If the char is
+    // negative, the shift is undefined behavior in C++. The hash algorithm is
+    // part of the format definition, so we cannot change it; to obtain the same
+    // behavior in a legal way we just cast to uint32_t, which will do
+    // sign-extension. To guarantee compatibility with architectures where chars
+    // are unsigned we first cast the char to int8_t.
+    case 3:
+      h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
+      FALLTHROUGH_INTENDED;
+    case 2:
+      h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
+      FALLTHROUGH_INTENDED;
+    case 1:
+      h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
+      h *= m;
+      h ^= (h >> r);
+      break;
+  }
+  return h;
+}
+
+// We are standardizing on a preview release of XXH3, because that's
+// the best available at time of standardizing.
+//
+// In testing (mostly Intel Skylake), this hash function is much more
+// thorough than Hash32 and is almost universally faster. Hash() only
+// seems faster when passing runtime-sized keys of the same small size
+// (less than about 24 bytes) thousands of times in a row; this seems
+// to allow the branch predictor to work some magic. XXH3's speed is
+// much less dependent on branch prediction.
+//
+// Hashing with a prefix extractor is potentially a common case of
+// hashing objects of small, predictable size. We could consider
+// bundling hash functions specialized for particular lengths with
+// the prefix extractors.
+uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
+  return XXPH3_64bits_withSeed(data, n, seed);
+}
+
+uint64_t Hash64(const char* data, size_t n) {
+  // Same as seed = 0
+  return XXPH3_64bits(data, n);
+}
+
+uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) {
+  // TODO(ajkr): use XXH3 streaming APIs to avoid the copy/allocation.
+  size_t concat_len = 0;
+  for (int i = 0; i < data.num_parts; ++i) {
+    concat_len += data.parts[i].size();
+  }
+  std::string concat_data;
+  concat_data.reserve(concat_len);
+  for (int i = 0; i < data.num_parts; ++i) {
+    concat_data.append(data.parts[i].data(), data.parts[i].size());
+  }
+  assert(concat_data.size() == concat_len);
+  return NPHash64(concat_data.data(), concat_len, seed);
+}
+
+Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) {
+  auto h = XXH3_128bits_withSeed(data, n, seed);
+  return (Unsigned128{h.high64} << 64) | (h.low64);
+}
+
+Unsigned128 Hash128(const char* data, size_t n) {
+  // Same as seed = 0
+  auto h = XXH3_128bits(data, n);
+  return (Unsigned128{h.high64} << 64) | (h.low64);
+}
+
+void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64) {
+  // Same as seed = 0
+  auto h = XXH3_128bits(data, n);
+  *high64 = h.high64;
+  *low64 = h.low64;
+}
+
+void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64,
+              uint64_t* low64) {
+  auto h = XXH3_128bits_withSeed(data, n, seed);
+  *high64 = h.high64;
+  *low64 = h.low64;
+}
+
+namespace {
+
+inline uint64_t XXH3_avalanche(uint64_t h64) {
+  h64 ^= h64 >> 37;
+  h64 *= 0x165667919E3779F9U;
+  h64 ^= h64 >> 32;
+  return h64;
+}
+
+inline uint64_t XXH3_unavalanche(uint64_t h64) {
+  h64 ^= h64 >> 32;
+  h64 *= 0x8da8ee41d6df849U;  // inverse of 0x165667919E3779F9U
+  h64 ^= h64 >> 37;
+  return h64;
+}
+
+}  // namespace
+
+void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
+                       uint64_t* out_high64, uint64_t* out_low64) {
+  // Adapted from XXH3_len_9to16_128b
+  const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
+  const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
+  Unsigned128 tmp128 =
+      Multiply64to128(in_low64 ^ in_high64 ^ bitflipl, 0x9E3779B185EBCA87U);
+  uint64_t lo = Lower64of128(tmp128);
+  uint64_t hi = Upper64of128(tmp128);
+  lo += 0x3c0000000000000U;  // (len - 1) << 54
+  in_high64 ^= bitfliph;
+  hi += in_high64 + (Lower32of64(in_high64) * uint64_t{0x85EBCA76});
+  lo ^= EndianSwapValue(hi);
+  tmp128 = Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU);
+  lo = Lower64of128(tmp128);
+  hi = Upper64of128(tmp128) + (hi * 0xC2B2AE3D27D4EB4FU);
+  *out_low64 = XXH3_avalanche(lo);
+  *out_high64 = XXH3_avalanche(hi);
+}
+
+void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
+                         uint64_t* out_high64, uint64_t* out_low64) {
+  // Inverted above (also consulting XXH3_len_9to16_128b)
+  const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
+  const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
+  uint64_t lo = XXH3_unavalanche(in_low64);
+  uint64_t hi = XXH3_unavalanche(in_high64);
+  lo *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
+  hi -= Upper64of128(Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU));
+  hi *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
+  lo ^= EndianSwapValue(hi);
+  lo -= 0x3c0000000000000U;
+  lo *= 0x887493432badb37U;  // inverse of 0x9E3779B185EBCA87U
+  hi -= Upper64of128(Multiply64to128(lo, 0x9E3779B185EBCA87U));
+  uint32_t tmp32 = Lower32of64(hi) * 0xb6c92f47;  // inverse of 0x85EBCA77
+  hi -= tmp32;
+  hi = (hi & 0xFFFFFFFF00000000U) -
+       ((tmp32 * uint64_t{0x85EBCA76}) & 0xFFFFFFFF00000000U) + tmp32;
+  hi ^= bitfliph;
+  lo ^= hi ^ bitflipl;
+  *out_high64 = hi;
+  *out_low64 = lo;
+}
+
+void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64,
+                       uint64_t* out_high64, uint64_t* out_low64) {
+  BijectiveHash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
+}
+
+void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64,
+                         uint64_t* out_high64, uint64_t* out_low64) {
+  BijectiveUnhash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/hash.h b/src/rocksdb/util/hash.h
new file mode 100644
index 000000000..eafa47f34
--- /dev/null
+++ b/src/rocksdb/util/hash.h
@@ -0,0 +1,137 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Common hash functions with convenient interfaces. If hashing a
+// statically-sized input in a performance-critical context, consider
+// calling a specific hash implementation directly, such as
+// XXH3_64bits from xxhash.h.
+//
+// Since this is a very common header, implementation details are kept
+// out-of-line. Out-of-lining also aids in tracking the time spent in
+// hashing functions. Inlining is of limited benefit for runtime-sized
+// hash inputs.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include "rocksdb/slice.h"
+#include "util/fastrange.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Stable/persistent 64-bit hash. Higher quality and generally faster than
+// Hash(), especially for inputs > 24 bytes.
+// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent
+// results from previous seed. Recommend incrementing by a large odd number.
+extern uint64_t Hash64(const char* data, size_t n, uint64_t seed);
+
+// Specific optimization without seed (same as seed = 0)
+extern uint64_t Hash64(const char* data, size_t n);
+
+// Non-persistent hash. Must only used for in-memory data structures.
+// The hash results are thus subject to change between releases,
+// architectures, build configuration, etc. (Thus, it rarely makes sense
+// to specify a seed for this function, except for a "rolling" hash.)
+// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent
+// results from previous seed. Recommend incrementing by a large odd number.
+inline uint64_t NPHash64(const char* data, size_t n, uint64_t seed) {
+#ifdef ROCKSDB_MODIFY_NPHASH
+  // For testing "subject to change"
+  return Hash64(data, n, seed + 123456789);
+#else
+  // Currently same as Hash64
+  return Hash64(data, n, seed);
+#endif
+}
+
+// Specific optimization without seed (same as seed = 0)
+inline uint64_t NPHash64(const char* data, size_t n) {
+#ifdef ROCKSDB_MODIFY_NPHASH
+  // For testing "subject to change"
+  return Hash64(data, n, 123456789);
+#else
+  // Currently same as Hash64
+  return Hash64(data, n);
+#endif
+}
+
+// Convenient and equivalent version of Hash128 without depending on 128-bit
+// scalars
+void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64);
+void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64,
+              uint64_t* low64);
+
+// Hash 128 bits to 128 bits, guaranteed not to lose data (equivalent to
+// Hash2x64 on 16 bytes little endian)
+void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64,
+                       uint64_t* out_high64, uint64_t* out_low64);
+void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
+                       uint64_t* out_high64, uint64_t* out_low64);
+
+// Inverse of above (mostly for testing)
+void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64,
+                         uint64_t* out_high64, uint64_t* out_low64);
+void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
+                         uint64_t* out_high64, uint64_t* out_low64);
+
+// Stable/persistent 32-bit hash. Moderate quality and high speed on
+// small inputs.
+// TODO: consider rename to Hash32
+// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent
+// results from previous seed. Recommend pseudorandom or hashed seeds.
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+// TODO: consider rename to LegacyBloomHash32
+inline uint32_t BloomHash(const Slice& key) {
+  return Hash(key.data(), key.size(), 0xbc9f1d34);
+}
+
+inline uint64_t GetSliceHash64(const Slice& key) {
+  return Hash64(key.data(), key.size());
+}
+// Provided for convenience for use with template argument deduction, where a
+// specific overload needs to be used.
+extern uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&);
+
+inline uint64_t GetSliceNPHash64(const Slice& s) {
+  return NPHash64(s.data(), s.size());
+}
+
+inline uint64_t GetSliceNPHash64(const Slice& s, uint64_t seed) {
+  return NPHash64(s.data(), s.size(), seed);
+}
+
+// Similar to `GetSliceNPHash64()` with `seed`, but input comes from
+// concatenation of `Slice`s in `data`.
+extern uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed);
+
+inline size_t GetSliceRangedNPHash(const Slice& s, size_t range) {
+  return FastRange64(NPHash64(s.data(), s.size()), range);
+}
+
+// TODO: consider rename to GetSliceHash32
+inline uint32_t GetSliceHash(const Slice& s) {
+  return Hash(s.data(), s.size(), 397);
+}
+
+// Useful for splitting up a 64-bit hash
+inline uint32_t Upper32of64(uint64_t v) {
+  return static_cast<uint32_t>(v >> 32);
+}
+inline uint32_t Lower32of64(uint64_t v) { return static_cast<uint32_t>(v); }
+
+// std::hash compatible interface.
+// TODO: consider rename to SliceHasher32
+struct SliceHasher {
+  uint32_t operator()(const Slice& s) const { return GetSliceHash(s); }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/hash128.h b/src/rocksdb/util/hash128.h
new file mode 100644
index 000000000..305caa14a
--- /dev/null
+++ b/src/rocksdb/util/hash128.h
@@ -0,0 +1,26 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+// 128-bit hash gets it own header so that more popular hash.h doesn't
+// depend on math128.h
+
+#include "rocksdb/slice.h"
+#include "util/math128.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Stable/persistent 128-bit hash for non-cryptographic applications.
+Unsigned128 Hash128(const char* data, size_t n, uint64_t seed);
+
+// Specific optimization without seed (same as seed = 0)
+Unsigned128 Hash128(const char* data, size_t n);
+
+inline Unsigned128 GetSliceHash128(const Slice& key) {
+  return Hash128(key.data(), key.size());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/hash_containers.h b/src/rocksdb/util/hash_containers.h
new file mode 100644
index 000000000..52be3718c
--- /dev/null
+++ b/src/rocksdb/util/hash_containers.h
@@ -0,0 +1,51 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// This header establishes compile-time pluggable implementations of hashed
+// container structures, so that deployments have the option of minimal
+// dependencies with ok performance (e.g. std::unordered_map) or more
+// dependencies with optimized performance (e.g. folly::F14FastMap).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#ifdef USE_FOLLY
+
+#include <folly/container/F14Map.h>
+#include <folly/container/F14Set.h>
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename K, typename V>
+using UnorderedMap = folly::F14FastMap<K, V>;
+
+template <typename K, typename V, typename H>
+using UnorderedMapH = folly::F14FastMap<K, V, H>;
+
+template <typename K>
+using UnorderedSet = folly::F14FastSet<K>;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#else
+
+#include <unordered_map>
+#include <unordered_set>
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename K, typename V>
+using UnorderedMap = std::unordered_map<K, V>;
+
+template <typename K, typename V, typename H>
+using UnorderedMapH = std::unordered_map<K, V, H>;
+
+template <typename K>
+using UnorderedSet = std::unordered_set<K>;
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/util/hash_map.h b/src/rocksdb/util/hash_map.h
new file mode 100644
index 000000000..e3ad2584f
--- /dev/null
+++ b/src/rocksdb/util/hash_map.h
@@ -0,0 +1,67 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <utility>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This is similar to std::unordered_map, except that it tries to avoid
+// allocating or deallocating memory as much as possible. With
+// std::unordered_map, an allocation/deallocation is made for every insertion
+// or deletion because of the requirement that iterators remain valid even
+// with insertions or deletions. This means that the hash chains will be
+// implemented as linked lists.
+//
+// This implementation uses autovector as hash chains insteads.
+//
+template <typename K, typename V, size_t size = 128>
+class HashMap {
+  std::array<autovector<std::pair<K, V>, 1>, size> table_;
+
+ public:
+  bool Contains(K key) {
+    auto& bucket = table_[key % size];
+    auto it = std::find_if(
+        bucket.begin(), bucket.end(),
+        [key](const std::pair<K, V>& p) { return p.first == key; });
+    return it != bucket.end();
+  }
+
+  void Insert(K key, const V& value) {
+    auto& bucket = table_[key % size];
+    bucket.push_back({key, value});
+  }
+
+  void Delete(K key) {
+    auto& bucket = table_[key % size];
+    auto it = std::find_if(
+        bucket.begin(), bucket.end(),
+        [key](const std::pair<K, V>& p) { return p.first == key; });
+    if (it != bucket.end()) {
+      auto last = bucket.end() - 1;
+      if (it != last) {
+        *it = *last;
+      }
+      bucket.pop_back();
+    }
+  }
+
+  V& Get(K key) {
+    auto& bucket = table_[key % size];
+    auto it = std::find_if(
+        bucket.begin(), bucket.end(),
+        [key](const std::pair<K, V>& p) { return p.first == key; });
+    return it->second;
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/hash_test.cc b/src/rocksdb/util/hash_test.cc
new file mode 100644
index 000000000..72112b044
--- /dev/null
+++ b/src/rocksdb/util/hash_test.cc
@@ -0,0 +1,853 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/hash.h"
+
+#include <cstring>
+#include <type_traits>
+#include <vector>
+
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/coding_lean.h"
+#include "util/hash128.h"
+#include "util/math.h"
+#include "util/math128.h"
+
+using ROCKSDB_NAMESPACE::BijectiveHash2x64;
+using ROCKSDB_NAMESPACE::BijectiveUnhash2x64;
+using ROCKSDB_NAMESPACE::DecodeFixed64;
+using ROCKSDB_NAMESPACE::EncodeFixed32;
+using ROCKSDB_NAMESPACE::EndianSwapValue;
+using ROCKSDB_NAMESPACE::GetSliceHash64;
+using ROCKSDB_NAMESPACE::Hash;
+using ROCKSDB_NAMESPACE::Hash128;
+using ROCKSDB_NAMESPACE::Hash2x64;
+using ROCKSDB_NAMESPACE::Hash64;
+using ROCKSDB_NAMESPACE::Lower32of64;
+using ROCKSDB_NAMESPACE::Lower64of128;
+using ROCKSDB_NAMESPACE::ReverseBits;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::Unsigned128;
+using ROCKSDB_NAMESPACE::Upper32of64;
+using ROCKSDB_NAMESPACE::Upper64of128;
+
+// The hash algorithm is part of the file format, for example for the Bloom
+// filters. Test that the hash values are stable for a set of random strings of
+// varying lengths.
+TEST(HashTest, Values) {
+  constexpr uint32_t kSeed = 0xbc9f1d34;  // Same as BloomHash.
+
+  EXPECT_EQ(Hash("", 0, kSeed), 3164544308u);
+  EXPECT_EQ(Hash("\x08", 1, kSeed), 422599524u);
+  EXPECT_EQ(Hash("\x17", 1, kSeed), 3168152998u);
+  EXPECT_EQ(Hash("\x9a", 1, kSeed), 3195034349u);
+  EXPECT_EQ(Hash("\x1c", 1, kSeed), 2651681383u);
+  EXPECT_EQ(Hash("\x4d\x76", 2, kSeed), 2447836956u);
+  EXPECT_EQ(Hash("\x52\xd5", 2, kSeed), 3854228105u);
+  EXPECT_EQ(Hash("\x91\xf7", 2, kSeed), 31066776u);
+  EXPECT_EQ(Hash("\xd6\x27", 2, kSeed), 1806091603u);
+  EXPECT_EQ(Hash("\x30\x46\x0b", 3, kSeed), 3808221797u);
+  EXPECT_EQ(Hash("\x56\xdc\xd6", 3, kSeed), 2157698265u);
+  EXPECT_EQ(Hash("\xd4\x52\x33", 3, kSeed), 1721992661u);
+  EXPECT_EQ(Hash("\x6a\xb5\xf4", 3, kSeed), 2469105222u);
+  EXPECT_EQ(Hash("\x67\x53\x81\x1c", 4, kSeed), 118283265u);
+  EXPECT_EQ(Hash("\x69\xb8\xc0\x88", 4, kSeed), 3416318611u);
+  EXPECT_EQ(Hash("\x1e\x84\xaf\x2d", 4, kSeed), 3315003572u);
+  EXPECT_EQ(Hash("\x46\xdc\x54\xbe", 4, kSeed), 447346355u);
+  EXPECT_EQ(Hash("\xd0\x7a\x6e\xea\x56", 5, kSeed), 4255445370u);
+  EXPECT_EQ(Hash("\x86\x83\xd5\xa4\xd8", 5, kSeed), 2390603402u);
+  EXPECT_EQ(Hash("\xb7\x46\xbb\x77\xce", 5, kSeed), 2048907743u);
+  EXPECT_EQ(Hash("\x6c\xa8\xbc\xe5\x99", 5, kSeed), 2177978500u);
+  EXPECT_EQ(Hash("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed), 1036846008u);
+  EXPECT_EQ(Hash("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed), 229980482u);
+  EXPECT_EQ(Hash("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed), 3655585422u);
+  EXPECT_EQ(Hash("\x73\xe1\xff\x56\x9c\xce", 6, kSeed), 3502708029u);
+  EXPECT_EQ(Hash("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed), 815120748u);
+  EXPECT_EQ(Hash("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed), 3056033698u);
+  EXPECT_EQ(Hash("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed), 587205227u);
+  EXPECT_EQ(Hash("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed), 2030937252u);
+  EXPECT_EQ(Hash("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed), 469635402u);
+  EXPECT_EQ(Hash("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed), 3530274698u);
+  EXPECT_EQ(Hash("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed), 1974545809u);
+  EXPECT_EQ(Hash("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed), 3563570120u);
+  EXPECT_EQ(Hash("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed),
+            2706087434u);
+  EXPECT_EQ(Hash("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed),
+            1534654151u);
+  EXPECT_EQ(Hash("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed),
+            2355554696u);
+  EXPECT_EQ(Hash("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed),
+            1400800912u);
+  EXPECT_EQ(Hash("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed),
+            3420325137u);
+  EXPECT_EQ(Hash("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed),
+            3427803584u);
+  EXPECT_EQ(Hash("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed),
+            1152407945u);
+  EXPECT_EQ(Hash("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed),
+            3382479516u);
+}
+
+// The hash algorithm is part of the file format, for example for the Bloom
+// filters.
+TEST(HashTest, Hash64Misc) {
+  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash64
+
+  for (char fill : {'\0', 'a', '1', '\xff'}) {
+    const size_t max_size = 1000;
+    const std::string str(max_size, fill);
+
+    for (size_t size = 0; size <= max_size; ++size) {
+      uint64_t here = Hash64(str.data(), size, kSeed);
+
+      // Must be same as unseeded Hash64 and GetSliceHash64
+      EXPECT_EQ(here, Hash64(str.data(), size));
+      EXPECT_EQ(here, GetSliceHash64(Slice(str.data(), size)));
+
+      // Upper and Lower must reconstruct hash
+      EXPECT_EQ(here, (uint64_t{Upper32of64(here)} << 32) | Lower32of64(here));
+      EXPECT_EQ(here, (uint64_t{Upper32of64(here)} << 32) + Lower32of64(here));
+      EXPECT_EQ(here, (uint64_t{Upper32of64(here)} << 32) ^ Lower32of64(here));
+
+      // Seed changes hash value (with high probability)
+      for (uint64_t var_seed = 1; var_seed != 0; var_seed <<= 1) {
+        EXPECT_NE(here, Hash64(str.data(), size, var_seed));
+      }
+
+      // Size changes hash value (with high probability)
+      size_t max_smaller_by = std::min(size_t{30}, size);
+      for (size_t smaller_by = 1; smaller_by <= max_smaller_by; ++smaller_by) {
+        EXPECT_NE(here, Hash64(str.data(), size - smaller_by, kSeed));
+      }
+    }
+  }
+}
+
+// Test that hash values are "non-trivial" for "trivial" inputs
+TEST(HashTest, Hash64Trivial) {
+  // Thorough test too slow for regression testing
+  constexpr bool thorough = false;
+
+  // For various seeds, make sure hash of empty string is not zero.
+  constexpr uint64_t max_seed = thorough ? 0x1000000 : 0x10000;
+  for (uint64_t seed = 0; seed < max_seed; ++seed) {
+    uint64_t here = Hash64("", 0, seed);
+    EXPECT_NE(Lower32of64(here), 0u);
+    EXPECT_NE(Upper32of64(here), 0u);
+  }
+
+  // For standard seed, make sure hash of small strings are not zero
+  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash64
+  char input[4];
+  constexpr int max_len = thorough ? 3 : 2;
+  for (int len = 1; len <= max_len; ++len) {
+    for (uint32_t i = 0; (i >> (len * 8)) == 0; ++i) {
+      EncodeFixed32(input, i);
+      uint64_t here = Hash64(input, len, kSeed);
+      EXPECT_NE(Lower32of64(here), 0u);
+      EXPECT_NE(Upper32of64(here), 0u);
+    }
+  }
+}
+
+// Test that the hash values are stable for a set of random strings of
+// varying small lengths.
+TEST(HashTest, Hash64SmallValueSchema) {
+  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash64
+
+  EXPECT_EQ(Hash64("", 0, kSeed), uint64_t{5999572062939766020u});
+  EXPECT_EQ(Hash64("\x08", 1, kSeed), uint64_t{583283813901344696u});
+  EXPECT_EQ(Hash64("\x17", 1, kSeed), uint64_t{16175549975585474943u});
+  EXPECT_EQ(Hash64("\x9a", 1, kSeed), uint64_t{16322991629225003903u});
+  EXPECT_EQ(Hash64("\x1c", 1, kSeed), uint64_t{13269285487706833447u});
+  EXPECT_EQ(Hash64("\x4d\x76", 2, kSeed), uint64_t{6859542833406258115u});
+  EXPECT_EQ(Hash64("\x52\xd5", 2, kSeed), uint64_t{4919611532550636959u});
+  EXPECT_EQ(Hash64("\x91\xf7", 2, kSeed), uint64_t{14199427467559720719u});
+  EXPECT_EQ(Hash64("\xd6\x27", 2, kSeed), uint64_t{12292689282614532691u});
+  EXPECT_EQ(Hash64("\x30\x46\x0b", 3, kSeed), uint64_t{11404699285340020889u});
+  EXPECT_EQ(Hash64("\x56\xdc\xd6", 3, kSeed), uint64_t{12404347133785524237u});
+  EXPECT_EQ(Hash64("\xd4\x52\x33", 3, kSeed), uint64_t{15853805298481534034u});
+  EXPECT_EQ(Hash64("\x6a\xb5\xf4", 3, kSeed), uint64_t{16863488758399383382u});
+  EXPECT_EQ(Hash64("\x67\x53\x81\x1c", 4, kSeed),
+            uint64_t{9010661983527562386u});
+  EXPECT_EQ(Hash64("\x69\xb8\xc0\x88", 4, kSeed),
+            uint64_t{6611781377647041447u});
+  EXPECT_EQ(Hash64("\x1e\x84\xaf\x2d", 4, kSeed),
+            uint64_t{15290969111616346501u});
+  EXPECT_EQ(Hash64("\x46\xdc\x54\xbe", 4, kSeed),
+            uint64_t{7063754590279313623u});
+  EXPECT_EQ(Hash64("\xd0\x7a\x6e\xea\x56", 5, kSeed),
+            uint64_t{6384167718754869899u});
+  EXPECT_EQ(Hash64("\x86\x83\xd5\xa4\xd8", 5, kSeed),
+            uint64_t{16874407254108011067u});
+  EXPECT_EQ(Hash64("\xb7\x46\xbb\x77\xce", 5, kSeed),
+            uint64_t{16809880630149135206u});
+  EXPECT_EQ(Hash64("\x6c\xa8\xbc\xe5\x99", 5, kSeed),
+            uint64_t{1249038833153141148u});
+  EXPECT_EQ(Hash64("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed),
+            uint64_t{17358142495308219330u});
+  EXPECT_EQ(Hash64("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed),
+            uint64_t{4237646583134806322u});
+  EXPECT_EQ(Hash64("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed),
+            uint64_t{4373664924115234051u});
+  EXPECT_EQ(Hash64("\x73\xe1\xff\x56\x9c\xce", 6, kSeed),
+            uint64_t{12012981210634596029u});
+  EXPECT_EQ(Hash64("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed),
+            uint64_t{5716522398211028826u});
+  EXPECT_EQ(Hash64("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed),
+            uint64_t{15604531309862565013u});
+  EXPECT_EQ(Hash64("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed),
+            uint64_t{8601330687345614172u});
+  EXPECT_EQ(Hash64("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed),
+            uint64_t{8088079329364056942u});
+  EXPECT_EQ(Hash64("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed),
+            uint64_t{9844314944338447628u});
+  EXPECT_EQ(Hash64("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed),
+            uint64_t{10973293517982163143u});
+  EXPECT_EQ(Hash64("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed),
+            uint64_t{9986007080564743219u});
+  EXPECT_EQ(Hash64("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed),
+            uint64_t{1729303145008254458u});
+  EXPECT_EQ(Hash64("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed),
+            uint64_t{13253403748084181481u});
+  EXPECT_EQ(Hash64("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed),
+            uint64_t{7768754303876232188u});
+  EXPECT_EQ(Hash64("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed),
+            uint64_t{12439346786701492u});
+  EXPECT_EQ(Hash64("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed),
+            uint64_t{10841838338450144690u});
+  EXPECT_EQ(Hash64("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed),
+            uint64_t{12883919702069153152u});
+  EXPECT_EQ(Hash64("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed),
+            uint64_t{12692903507676842188u});
+  EXPECT_EQ(Hash64("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed),
+            uint64_t{6540985900674032620u});
+  EXPECT_EQ(Hash64("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed),
+            uint64_t{10551812464348219044u});
+}
+
+std::string Hash64TestDescriptor(const char *repeat, size_t limit) {
+  const char *mod61_encode =
+      "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+  std::string input;
+  while (input.size() < limit) {
+    input.append(repeat);
+  }
+  std::string rv;
+  for (size_t i = 0; i < limit; ++i) {
+    uint64_t h = GetSliceHash64(Slice(input.data(), i));
+    rv.append(1, mod61_encode[static_cast<size_t>(h % 61)]);
+  }
+  return rv;
+}
+
+// XXPH3 changes its algorithm for various sizes up through 250 bytes, so
+// we need to check the stability of larger sizes also.
+TEST(HashTest, Hash64LargeValueSchema) {
+  // Each of these derives a "descriptor" from the hash values for all
+  // lengths up to 430.
+  // Note that "c" is common for the zero-length string.
+  EXPECT_EQ(
+      Hash64TestDescriptor("foo", 430),
+      "cRhyWsY67B6klRA1udmOuiYuX7IthyGBKqbeosz2hzVglWCmQx8nEdnpkvPfYX56Up2OWOTV"
+      "lTzfAoYwvtqKzjD8E9xttR2unelbXbIV67NUe6bOO23BxaSFRcA3njGu5cUWfgwOqNoTsszp"
+      "uPvKRP6qaUR5VdoBkJUCFIefd7edlNK5mv6JYWaGdwxehg65hTkTmjZoPKxTZo4PLyzbL9U4"
+      "xt12ITSfeP2MfBHuLI2z2pDlBb44UQKVMx27LEoAHsdLp3WfWfgH3sdRBRCHm33UxCM4QmE2"
+      "xJ7gqSvNwTeH7v9GlC8zWbGroyD3UVNeShMLx29O7tH1biemLULwAHyIw8zdtLMDpEJ8m2ic"
+      "l6Lb4fDuuFNAs1GCVUthjK8CV8SWI8Rsz5THSwn5CGhpqUwSZcFknjwWIl5rNCvDxXJqYr");
+  // Note that "1EeRk" is common for "Rocks"
+  EXPECT_EQ(
+      Hash64TestDescriptor("Rocks", 430),
+      "c1EeRkrzgOYWLA8PuhJrwTePJewoB44WdXYDfhbk3ZxTqqg25WlPExDl7IKIQLJvnA6gJxxn"
+      "9TCSLkFGfJeXehaSS1GBqWSzfhEH4VXiXIUCuxJXxtKXcSC6FrNIQGTZbYDiUOLD6Y5inzrF"
+      "9etwQhXUBanw55xAUdNMFQAm2GjJ6UDWp2mISLiMMkLjANWMKLaZMqaFLX37qB4MRO1ooVRv"
+      "zSvaNRSCLxlggQCasQq8icWjzf3HjBlZtU6pd4rkaUxSzHqmo9oM5MghbU5Rtxg8wEfO7lVN"
+      "5wdMONYecslQTwjZUpO1K3LDf3K3XK6sUXM6ShQQ3RHmMn2acB4YtTZ3QQcHYJSOHn2DuWpa"
+      "Q8RqzX5lab92YmOLaCdOHq1BPsM7SIBzMdLgePNsJ1vvMALxAaoDUHPxoFLO2wx18IXnyX");
+  EXPECT_EQ(
+      Hash64TestDescriptor("RocksDB", 430),
+      "c1EeRkukbkb28wLTahwD2sfUhZzaBEnF8SVrxnPVB6A7b8CaAl3UKsDZISF92GSq2wDCukOq"
+      "Jgrsp7A3KZhDiLW8dFXp8UPqPxMCRlMdZeVeJ2dJxrmA6cyt99zkQFj7ELbut6jAeVqARFnw"
+      "fnWVXOsaLrq7bDCbMcns2DKvTaaqTCLMYxI7nhtLpFN1jR755FRQFcOzrrDbh7QhypjdvlYw"
+      "cdAMSZgp9JMHxbM23wPSuH6BOFgxejz35PScZfhDPvTOxIy1jc3MZsWrMC3P324zNolO7JdW"
+      "CX2I5UDKjjaEJfxbgVgJIXxtQGlmj2xkO5sPpjULQV4X2HlY7FQleJ4QRaJIB4buhCA4vUTF"
+      "eMFlxCIYUpTCsal2qsmnGOWa8WCcefrohMjDj1fjzSvSaQwlpyR1GZHF2uPOoQagiCpHpm");
+}
+
+TEST(HashTest, Hash128Misc) {
+  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash128
+
+  for (char fill : {'\0', 'a', '1', '\xff', 'e'}) {
+    const size_t max_size = 1000;
+    std::string str(max_size, fill);
+
+    if (fill == 'e') {
+      // Use different characters to check endianness handling
+      for (size_t i = 0; i < str.size(); ++i) {
+        str[i] += static_cast<char>(i);
+      }
+    }
+
+    for (size_t size = 0; size <= max_size; ++size) {
+      Unsigned128 here = Hash128(str.data(), size, kSeed);
+
+      // Must be same as unseeded Hash128 and GetSliceHash128
+      EXPECT_EQ(here, Hash128(str.data(), size));
+      EXPECT_EQ(here, GetSliceHash128(Slice(str.data(), size)));
+      {
+        uint64_t hi, lo;
+        Hash2x64(str.data(), size, &hi, &lo);
+        EXPECT_EQ(Lower64of128(here), lo);
+        EXPECT_EQ(Upper64of128(here), hi);
+      }
+      if (size == 16) {
+        const uint64_t in_hi = DecodeFixed64(str.data() + 8);
+        const uint64_t in_lo = DecodeFixed64(str.data());
+        uint64_t hi, lo;
+        BijectiveHash2x64(in_hi, in_lo, &hi, &lo);
+        EXPECT_EQ(Lower64of128(here), lo);
+        EXPECT_EQ(Upper64of128(here), hi);
+        uint64_t un_hi, un_lo;
+        BijectiveUnhash2x64(hi, lo, &un_hi, &un_lo);
+        EXPECT_EQ(in_lo, un_lo);
+        EXPECT_EQ(in_hi, un_hi);
+      }
+
+      // Upper and Lower must reconstruct hash
+      EXPECT_EQ(here,
+                (Unsigned128{Upper64of128(here)} << 64) | Lower64of128(here));
+      EXPECT_EQ(here,
+                (Unsigned128{Upper64of128(here)} << 64) ^ Lower64of128(here));
+
+      // Seed changes hash value (with high probability)
+      for (uint64_t var_seed = 1; var_seed != 0; var_seed <<= 1) {
+        Unsigned128 seeded = Hash128(str.data(), size, var_seed);
+        EXPECT_NE(here, seeded);
+        // Must match seeded Hash2x64
+        {
+          uint64_t hi, lo;
+          Hash2x64(str.data(), size, var_seed, &hi, &lo);
+          EXPECT_EQ(Lower64of128(seeded), lo);
+          EXPECT_EQ(Upper64of128(seeded), hi);
+        }
+        if (size == 16) {
+          const uint64_t in_hi = DecodeFixed64(str.data() + 8);
+          const uint64_t in_lo = DecodeFixed64(str.data());
+          uint64_t hi, lo;
+          BijectiveHash2x64(in_hi, in_lo, var_seed, &hi, &lo);
+          EXPECT_EQ(Lower64of128(seeded), lo);
+          EXPECT_EQ(Upper64of128(seeded), hi);
+          uint64_t un_hi, un_lo;
+          BijectiveUnhash2x64(hi, lo, var_seed, &un_hi, &un_lo);
+          EXPECT_EQ(in_lo, un_lo);
+          EXPECT_EQ(in_hi, un_hi);
+        }
+      }
+
+      // Size changes hash value (with high probability)
+      size_t max_smaller_by = std::min(size_t{30}, size);
+      for (size_t smaller_by = 1; smaller_by <= max_smaller_by; ++smaller_by) {
+        EXPECT_NE(here, Hash128(str.data(), size - smaller_by, kSeed));
+      }
+    }
+  }
+}
+
+// Test that hash values are "non-trivial" for "trivial" inputs
+TEST(HashTest, Hash128Trivial) {
+  // Thorough test too slow for regression testing
+  constexpr bool thorough = false;
+
+  // For various seeds, make sure hash of empty string is not zero.
+  constexpr uint64_t max_seed = thorough ? 0x1000000 : 0x10000;
+  for (uint64_t seed = 0; seed < max_seed; ++seed) {
+    Unsigned128 here = Hash128("", 0, seed);
+    EXPECT_NE(Lower64of128(here), 0u);
+    EXPECT_NE(Upper64of128(here), 0u);
+  }
+
+  // For standard seed, make sure hash of small strings are not zero
+  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash128
+  char input[4];
+  constexpr int max_len = thorough ? 3 : 2;
+  for (int len = 1; len <= max_len; ++len) {
+    for (uint32_t i = 0; (i >> (len * 8)) == 0; ++i) {
+      EncodeFixed32(input, i);
+      Unsigned128 here = Hash128(input, len, kSeed);
+      EXPECT_NE(Lower64of128(here), 0u);
+      EXPECT_NE(Upper64of128(here), 0u);
+    }
+  }
+}
+
+std::string Hash128TestDescriptor(const char *repeat, size_t limit) {
+  const char *mod61_encode =
+      "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+  std::string input;
+  while (input.size() < limit) {
+    input.append(repeat);
+  }
+  std::string rv;
+  for (size_t i = 0; i < limit; ++i) {
+    auto h = GetSliceHash128(Slice(input.data(), i));
+    uint64_t h2 = Upper64of128(h) + Lower64of128(h);
+    rv.append(1, mod61_encode[static_cast<size_t>(h2 % 61)]);
+  }
+  return rv;
+}
+
+// XXH3 changes its algorithm for various sizes up through 250 bytes, so
+// we need to check the stability of larger sizes also.
+TEST(HashTest, Hash128ValueSchema) {
+  // Each of these derives a "descriptor" from the hash values for all
+  // lengths up to 430.
+  // Note that "b" is common for the zero-length string.
+  EXPECT_EQ(
+      Hash128TestDescriptor("foo", 430),
+      "bUMA3As8n9I4vNGhThXlEevxZlyMcbb6TYAlIKJ2f5ponsv99q962rYclQ7u3gfnRdCDQ5JI"
+      "2LrGUaCycbXrvLFe4SjgRb9RQwCfrnmNQ7VSEwSKMnkGCK3bDbXSrnIh5qLXdtvIZklbJpGH"
+      "Dqr93BlqF9ubTnOSYkSdx89XvQqflMIW8bjfQp9BPjQejWOeEQspnN1D3sfgVdFhpaQdHYA5"
+      "pI2XcPlCMFPxvrFuRr7joaDvjNe9IUZaunLPMewuXmC3EL95h52Ju3D7y9RNKhgYxMTrA84B"
+      "yJrMvyjdm3vlBxet4EN7v2GEyjbGuaZW9UL6lrX6PghJDg7ACfLGdxNbH3qXM4zaiG2RKnL5"
+      "S3WXKR78RBB5fRFQ8KDIEQjHFvSNsc3GrAEi6W8P2lv8JMTzjBODO2uN4wadVQFT9wpGfV");
+  // Note that "35D2v" is common for "Rocks"
+  EXPECT_EQ(
+      Hash128TestDescriptor("Rocks", 430),
+      "b35D2vzvklFVDqJmyLRXyApwGGO3EAT3swhe8XJAN3mY2UVPglzdmydxcba6JI2tSvwO6zSu"
+      "ANpjSM7tc9G5iMhsa7R8GfyCXRO1TnLg7HvdWNdgGGBirxZR68BgT7TQsYJt6zyEyISeXI1n"
+      "MXA48Xo7dWfJeYN6Z4KWlqZY7TgFXGbks9AX4ehZNSGtIhdO5i58qlgVX1bEejeOVaCcjC79"
+      "67DrMfOKds7rUQzjBa77sMPcoPW1vu6ljGJPZH3XkRyDMZ1twxXKkNxN3tE8nR7JHwyqBAxE"
+      "fTcjbOWrLZ1irWxRSombD8sGDEmclgF11IxqEhe3Rt7gyofO3nExGckKkS9KfRqsCHbiUyva"
+      "JGkJwUHRXaZnh58b4i1Ei9aQKZjXlvIVDixoZrjcNaH5XJIJlRZce9Z9t82wYapTpckYSg");
+  EXPECT_EQ(
+      Hash128TestDescriptor("RocksDB", 430),
+      "b35D2vFUst3XDZCRlSrhmYYakmqImV97LbBsV6EZlOEQpUPH1d1sD3xMKAPlA5UErHehg5O7"
+      "n966fZqhAf3hRc24kGCLfNAWjyUa7vSNOx3IcPoTyVRFZeFlcCtfl7t1QJumHOCpS33EBmBF"
+      "hvK13QjBbDWYWeHQhJhgV9Mqbx17TIcvUkEnYZxb8IzWNmjVsJG44Z7v52DjGj1ZzS62S2Vv"
+      "qWcDO7apvH5VHg68E9Wl6nXP21vlmUqEH9GeWRehfWVvY7mUpsAg5drHHQyDSdiMceiUuUxJ"
+      "XJqHFcDdzbbPk7xDvbLgWCKvH8k3MpQNWOmbSSRDdAP6nGlDjoTToYkcqVREHJzztSWAAq5h"
+      "GHSUNJ6OxsMHhf8EhXfHtKyUzRmPtjYyeckQcGmrQfFFLidc6cjMDKCdBG6c6HVBrS7H2R");
+}
+
+TEST(FastRange32Test, Values) {
+  using ROCKSDB_NAMESPACE::FastRange32;
+  // Zero range
+  EXPECT_EQ(FastRange32(0, 0), 0U);
+  EXPECT_EQ(FastRange32(123, 0), 0U);
+  EXPECT_EQ(FastRange32(0xffffffff, 0), 0U);
+
+  // One range
+  EXPECT_EQ(FastRange32(0, 1), 0U);
+  EXPECT_EQ(FastRange32(123, 1), 0U);
+  EXPECT_EQ(FastRange32(0xffffffff, 1), 0U);
+
+  // Two range
+  EXPECT_EQ(FastRange32(0, 2), 0U);
+  EXPECT_EQ(FastRange32(123, 2), 0U);
+  EXPECT_EQ(FastRange32(0x7fffffff, 2), 0U);
+  EXPECT_EQ(FastRange32(0x80000000, 2), 1U);
+  EXPECT_EQ(FastRange32(0xffffffff, 2), 1U);
+
+  // Seven range
+  EXPECT_EQ(FastRange32(0, 7), 0U);
+  EXPECT_EQ(FastRange32(123, 7), 0U);
+  EXPECT_EQ(FastRange32(613566756, 7), 0U);
+  EXPECT_EQ(FastRange32(613566757, 7), 1U);
+  EXPECT_EQ(FastRange32(1227133513, 7), 1U);
+  EXPECT_EQ(FastRange32(1227133514, 7), 2U);
+  // etc.
+  EXPECT_EQ(FastRange32(0xffffffff, 7), 6U);
+
+  // Big
+  EXPECT_EQ(FastRange32(1, 0x80000000), 0U);
+  EXPECT_EQ(FastRange32(2, 0x80000000), 1U);
+  EXPECT_EQ(FastRange32(4, 0x7fffffff), 1U);
+  EXPECT_EQ(FastRange32(4, 0x80000000), 2U);
+  EXPECT_EQ(FastRange32(0xffffffff, 0x7fffffff), 0x7ffffffeU);
+  EXPECT_EQ(FastRange32(0xffffffff, 0x80000000), 0x7fffffffU);
+}
+
+TEST(FastRange64Test, Values) {
+  using ROCKSDB_NAMESPACE::FastRange64;
+  // Zero range
+  EXPECT_EQ(FastRange64(0, 0), 0U);
+  EXPECT_EQ(FastRange64(123, 0), 0U);
+  EXPECT_EQ(FastRange64(0xffffFFFF, 0), 0U);
+  EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 0), 0U);
+
+  // One range
+  EXPECT_EQ(FastRange64(0, 1), 0U);
+  EXPECT_EQ(FastRange64(123, 1), 0U);
+  EXPECT_EQ(FastRange64(0xffffFFFF, 1), 0U);
+  EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 1), 0U);
+
+  // Two range
+  EXPECT_EQ(FastRange64(0, 2), 0U);
+  EXPECT_EQ(FastRange64(123, 2), 0U);
+  EXPECT_EQ(FastRange64(0xffffFFFF, 2), 0U);
+  EXPECT_EQ(FastRange64(0x7fffFFFFffffFFFF, 2), 0U);
+  EXPECT_EQ(FastRange64(0x8000000000000000, 2), 1U);
+  EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 2), 1U);
+
+  // Seven range
+  EXPECT_EQ(FastRange64(0, 7), 0U);
+  EXPECT_EQ(FastRange64(123, 7), 0U);
+  EXPECT_EQ(FastRange64(0xffffFFFF, 7), 0U);
+  EXPECT_EQ(FastRange64(2635249153387078802, 7), 0U);
+  EXPECT_EQ(FastRange64(2635249153387078803, 7), 1U);
+  EXPECT_EQ(FastRange64(5270498306774157604, 7), 1U);
+  EXPECT_EQ(FastRange64(5270498306774157605, 7), 2U);
+  EXPECT_EQ(FastRange64(0x7fffFFFFffffFFFF, 7), 3U);
+  EXPECT_EQ(FastRange64(0x8000000000000000, 7), 3U);
+  EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 7), 6U);
+
+  // Big but 32-bit range
+  EXPECT_EQ(FastRange64(0x100000000, 0x80000000), 0U);
+  EXPECT_EQ(FastRange64(0x200000000, 0x80000000), 1U);
+  EXPECT_EQ(FastRange64(0x400000000, 0x7fffFFFF), 1U);
+  EXPECT_EQ(FastRange64(0x400000000, 0x80000000), 2U);
+  EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 0x7fffFFFF), 0x7fffFFFEU);
+  EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 0x80000000), 0x7fffFFFFU);
+
+  // Big, > 32-bit range
+#if SIZE_MAX == UINT64_MAX
+  EXPECT_EQ(FastRange64(0x7fffFFFFffffFFFF, 0x4200000002), 0x2100000000U);
+  EXPECT_EQ(FastRange64(0x8000000000000000, 0x4200000002), 0x2100000001U);
+
+  EXPECT_EQ(FastRange64(0x0000000000000000, 420000000002), 0U);
+  EXPECT_EQ(FastRange64(0x7fffFFFFffffFFFF, 420000000002), 210000000000U);
+  EXPECT_EQ(FastRange64(0x8000000000000000, 420000000002), 210000000001U);
+  EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 420000000002), 420000000001U);
+
+  EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 0xffffFFFFffffFFFF),
+            0xffffFFFFffffFFFEU);
+#endif
+}
+
+TEST(FastRangeGenericTest, Values) {
+  using ROCKSDB_NAMESPACE::FastRangeGeneric;
+  // Generic (including big and small)
+  // Note that FastRangeGeneric is also tested indirectly above via
+  // FastRange32 and FastRange64.
+  EXPECT_EQ(
+      FastRangeGeneric(uint64_t{0x8000000000000000}, uint64_t{420000000002}),
+      uint64_t{210000000001});
+  EXPECT_EQ(FastRangeGeneric(uint64_t{0x8000000000000000}, uint16_t{12468}),
+            uint16_t{6234});
+  EXPECT_EQ(FastRangeGeneric(uint32_t{0x80000000}, uint16_t{12468}),
+            uint16_t{6234});
+  // Not recommended for typical use because for example this could fail on
+  // some platforms and pass on others:
+  // EXPECT_EQ(FastRangeGeneric(static_cast<unsigned long>(0x80000000),
+  //                           uint16_t{12468}),
+  //          uint16_t{6234});
+}
+
+// for inspection of disassembly
+uint32_t FastRange32(uint32_t hash, uint32_t range) {
+  return ROCKSDB_NAMESPACE::FastRange32(hash, range);
+}
+
+// for inspection of disassembly
+size_t FastRange64(uint64_t hash, size_t range) {
+  return ROCKSDB_NAMESPACE::FastRange64(hash, range);
+}
+
+// Tests for math.h / math128.h (not worth a separate test binary)
+using ROCKSDB_NAMESPACE::BitParity;
+using ROCKSDB_NAMESPACE::BitsSetToOne;
+using ROCKSDB_NAMESPACE::ConstexprFloorLog2;
+using ROCKSDB_NAMESPACE::CountTrailingZeroBits;
+using ROCKSDB_NAMESPACE::DecodeFixed128;
+using ROCKSDB_NAMESPACE::DecodeFixedGeneric;
+using ROCKSDB_NAMESPACE::DownwardInvolution;
+using ROCKSDB_NAMESPACE::EncodeFixed128;
+using ROCKSDB_NAMESPACE::EncodeFixedGeneric;
+using ROCKSDB_NAMESPACE::FloorLog2;
+using ROCKSDB_NAMESPACE::Lower64of128;
+using ROCKSDB_NAMESPACE::Multiply64to128;
+using ROCKSDB_NAMESPACE::Unsigned128;
+using ROCKSDB_NAMESPACE::Upper64of128;
+
+int blah(int x) { return DownwardInvolution(x); }
+
+template <typename T>
+static void test_BitOps() {
+  // This complex code is to generalize to 128-bit values. Otherwise
+  // we could just use = static_cast<T>(0x5555555555555555ULL);
+  T everyOtherBit = 0;
+  for (unsigned i = 0; i < sizeof(T); ++i) {
+    everyOtherBit = (everyOtherBit << 8) | T{0x55};
+  }
+
+  // This one built using bit operations, as our 128-bit layer
+  // might not implement arithmetic such as subtraction.
+  T vm1 = 0;  // "v minus one"
+
+  for (int i = 0; i < int{8 * sizeof(T)}; ++i) {
+    T v = T{1} << i;
+    // If we could directly use arithmetic:
+    // T vm1 = static_cast<T>(v - 1);
+
+    // FloorLog2
+    if (v > 0) {
+      EXPECT_EQ(FloorLog2(v), i);
+      EXPECT_EQ(ConstexprFloorLog2(v), i);
+    }
+    if (vm1 > 0) {
+      EXPECT_EQ(FloorLog2(vm1), i - 1);
+      EXPECT_EQ(ConstexprFloorLog2(vm1), i - 1);
+      EXPECT_EQ(FloorLog2(everyOtherBit & vm1), (i - 1) & ~1);
+      EXPECT_EQ(ConstexprFloorLog2(everyOtherBit & vm1), (i - 1) & ~1);
+    }
+
+    // CountTrailingZeroBits
+    if (v != 0) {
+      EXPECT_EQ(CountTrailingZeroBits(v), i);
+    }
+    if (vm1 != 0) {
+      EXPECT_EQ(CountTrailingZeroBits(vm1), 0);
+    }
+    if (i < int{8 * sizeof(T)} - 1) {
+      EXPECT_EQ(CountTrailingZeroBits(~vm1 & everyOtherBit), (i + 1) & ~1);
+    }
+
+    // BitsSetToOne
+    EXPECT_EQ(BitsSetToOne(v), 1);
+    EXPECT_EQ(BitsSetToOne(vm1), i);
+    EXPECT_EQ(BitsSetToOne(vm1 & everyOtherBit), (i + 1) / 2);
+
+    // BitParity
+    EXPECT_EQ(BitParity(v), 1);
+    EXPECT_EQ(BitParity(vm1), i & 1);
+    EXPECT_EQ(BitParity(vm1 & everyOtherBit), ((i + 1) / 2) & 1);
+
+    // EndianSwapValue
+    T ev = T{1} << (((sizeof(T) - 1 - (i / 8)) * 8) + i % 8);
+    EXPECT_EQ(EndianSwapValue(v), ev);
+
+    // ReverseBits
+    EXPECT_EQ(ReverseBits(v), static_cast<T>(T{1} << (8 * sizeof(T) - 1 - i)));
+#ifdef HAVE_UINT128_EXTENSION          // Uses multiplication
+    if (std::is_unsigned<T>::value) {  // Technical UB on signed type
+      T rv = T{1} << (8 * sizeof(T) - 1 - i);
+      EXPECT_EQ(ReverseBits(vm1), static_cast<T>(rv * ~T{1}));
+    }
+#endif
+
+    // DownwardInvolution
+    {
+      T misc = static_cast<T>(/*random*/ 0xc682cd153d0e3279U +
+                              i * /*random*/ 0x9b3972f3bea0baa3U);
+      if constexpr (sizeof(T) > 8) {
+        misc = (misc << 64) | (/*random*/ 0x52af031a38ced62dU +
+                               i * /*random*/ 0x936f803d9752ddc3U);
+      }
+      T misc_masked = misc & vm1;
+      EXPECT_LE(misc_masked, vm1);
+      T di_misc_masked = DownwardInvolution(misc_masked);
+      EXPECT_LE(di_misc_masked, vm1);
+      if (misc_masked > 0) {
+        // Highest-order 1 in same position
+        EXPECT_EQ(FloorLog2(misc_masked), FloorLog2(di_misc_masked));
+      }
+      // Validate involution property on short value
+      EXPECT_EQ(DownwardInvolution(di_misc_masked), misc_masked);
+
+      // Validate involution property on large value
+      T di_misc = DownwardInvolution(misc);
+      EXPECT_EQ(DownwardInvolution(di_misc), misc);
+      // Highest-order 1 in same position
+      if (misc > 0) {
+        EXPECT_EQ(FloorLog2(misc), FloorLog2(di_misc));
+      }
+
+      // Validate distributes over xor.
+      // static_casts to avoid numerical promotion effects.
+      EXPECT_EQ(DownwardInvolution(static_cast<T>(misc_masked ^ vm1)),
+                static_cast<T>(di_misc_masked ^ DownwardInvolution(vm1)));
+      T misc2 = static_cast<T>(misc >> 1);
+      EXPECT_EQ(DownwardInvolution(static_cast<T>(misc ^ misc2)),
+                static_cast<T>(di_misc ^ DownwardInvolution(misc2)));
+
+      // Choose some small number of bits to pull off to test combined
+      // uniqueness guarantee
+      int in_bits = i % 7;
+      unsigned in_mask = (unsigned{1} << in_bits) - 1U;
+      // IMPLICIT: int out_bits = 8 - in_bits;
+      std::vector<bool> seen(256, false);
+      for (int j = 0; j < 255; ++j) {
+        T t_in = misc ^ static_cast<T>(j);
+        unsigned in = static_cast<unsigned>(t_in);
+        unsigned out = static_cast<unsigned>(DownwardInvolution(t_in));
+        unsigned val = ((out << in_bits) | (in & in_mask)) & 255U;
+        EXPECT_FALSE(seen[val]);
+        seen[val] = true;
+      }
+
+      if (i + 8 < int{8 * sizeof(T)}) {
+        // Also test manipulating bits in the middle of input is
+        // bijective in bottom of output
+        seen = std::vector<bool>(256, false);
+        for (int j = 0; j < 255; ++j) {
+          T in = misc ^ (static_cast<T>(j) << i);
+          unsigned val = static_cast<unsigned>(DownwardInvolution(in)) & 255U;
+          EXPECT_FALSE(seen[val]);
+          seen[val] = true;
+        }
+      }
+    }
+
+    vm1 = (vm1 << 1) | 1;
+  }
+
+  EXPECT_EQ(ConstexprFloorLog2(T{1}), 0);
+  EXPECT_EQ(ConstexprFloorLog2(T{2}), 1);
+  EXPECT_EQ(ConstexprFloorLog2(T{3}), 1);
+  EXPECT_EQ(ConstexprFloorLog2(T{42}), 5);
+}
+
+TEST(MathTest, BitOps) {
+  test_BitOps<uint32_t>();
+  test_BitOps<uint64_t>();
+  test_BitOps<uint16_t>();
+  test_BitOps<uint8_t>();
+  test_BitOps<unsigned char>();
+  test_BitOps<unsigned short>();
+  test_BitOps<unsigned int>();
+  test_BitOps<unsigned long>();
+  test_BitOps<unsigned long long>();
+  test_BitOps<char>();
+  test_BitOps<size_t>();
+  test_BitOps<int32_t>();
+  test_BitOps<int64_t>();
+  test_BitOps<int16_t>();
+  test_BitOps<int8_t>();
+  test_BitOps<signed char>();
+  test_BitOps<short>();
+  test_BitOps<int>();
+  test_BitOps<long>();
+  test_BitOps<long long>();
+  test_BitOps<ptrdiff_t>();
+}
+
+TEST(MathTest, BitOps128) { test_BitOps<Unsigned128>(); }
+
+TEST(MathTest, Math128) {
+  const Unsigned128 sixteenHexOnes = 0x1111111111111111U;
+  const Unsigned128 thirtyHexOnes = (sixteenHexOnes << 56) | sixteenHexOnes;
+  const Unsigned128 sixteenHexTwos = 0x2222222222222222U;
+  const Unsigned128 thirtyHexTwos = (sixteenHexTwos << 56) | sixteenHexTwos;
+
+  // v will slide from all hex ones to all hex twos
+  Unsigned128 v = thirtyHexOnes;
+  for (int i = 0; i <= 30; ++i) {
+    // Test bitwise operations
+    EXPECT_EQ(BitsSetToOne(v), 30);
+    EXPECT_EQ(BitsSetToOne(~v), 128 - 30);
+    EXPECT_EQ(BitsSetToOne(v & thirtyHexOnes), 30 - i);
+    EXPECT_EQ(BitsSetToOne(v | thirtyHexOnes), 30 + i);
+    EXPECT_EQ(BitsSetToOne(v ^ thirtyHexOnes), 2 * i);
+    EXPECT_EQ(BitsSetToOne(v & thirtyHexTwos), i);
+    EXPECT_EQ(BitsSetToOne(v | thirtyHexTwos), 60 - i);
+    EXPECT_EQ(BitsSetToOne(v ^ thirtyHexTwos), 60 - 2 * i);
+
+    // Test comparisons
+    EXPECT_EQ(v == thirtyHexOnes, i == 0);
+    EXPECT_EQ(v == thirtyHexTwos, i == 30);
+    EXPECT_EQ(v > thirtyHexOnes, i > 0);
+    EXPECT_EQ(v > thirtyHexTwos, false);
+    EXPECT_EQ(v >= thirtyHexOnes, true);
+    EXPECT_EQ(v >= thirtyHexTwos, i == 30);
+    EXPECT_EQ(v < thirtyHexOnes, false);
+    EXPECT_EQ(v < thirtyHexTwos, i < 30);
+    EXPECT_EQ(v <= thirtyHexOnes, i == 0);
+    EXPECT_EQ(v <= thirtyHexTwos, true);
+
+    // Update v, clearing upper-most byte
+    v = ((v << 12) >> 8) | 0x2;
+  }
+
+  for (int i = 0; i < 128; ++i) {
+    // Test shifts
+    Unsigned128 sl = thirtyHexOnes << i;
+    Unsigned128 sr = thirtyHexOnes >> i;
+    EXPECT_EQ(BitsSetToOne(sl), std::min(30, 32 - i / 4));
+    EXPECT_EQ(BitsSetToOne(sr), std::max(0, 30 - (i + 3) / 4));
+    EXPECT_EQ(BitsSetToOne(sl & sr), i % 2 ? 0 : std::max(0, 30 - i / 2));
+  }
+
+  // Test 64x64->128 multiply
+  Unsigned128 product =
+      Multiply64to128(0x1111111111111111U, 0x2222222222222222U);
+  EXPECT_EQ(Lower64of128(product), 2295594818061633090U);
+  EXPECT_EQ(Upper64of128(product), 163971058432973792U);
+}
+
+TEST(MathTest, Coding128) {
+  const char *in = "_1234567890123456";
+  // Note: in + 1 is likely unaligned
+  Unsigned128 decoded = DecodeFixed128(in + 1);
+  EXPECT_EQ(Lower64of128(decoded), 0x3837363534333231U);
+  EXPECT_EQ(Upper64of128(decoded), 0x3635343332313039U);
+  char out[18];
+  out[0] = '_';
+  EncodeFixed128(out + 1, decoded);
+  out[17] = '\0';
+  EXPECT_EQ(std::string(in), std::string(out));
+}
+
+TEST(MathTest, CodingGeneric) {
+  const char *in = "_1234567890123456";
+  // Decode
+  // Note: in + 1 is likely unaligned
+  Unsigned128 decoded128 = DecodeFixedGeneric<Unsigned128>(in + 1);
+  EXPECT_EQ(Lower64of128(decoded128), 0x3837363534333231U);
+  EXPECT_EQ(Upper64of128(decoded128), 0x3635343332313039U);
+
+  uint64_t decoded64 = DecodeFixedGeneric<uint64_t>(in + 1);
+  EXPECT_EQ(decoded64, 0x3837363534333231U);
+
+  uint32_t decoded32 = DecodeFixedGeneric<uint32_t>(in + 1);
+  EXPECT_EQ(decoded32, 0x34333231U);
+
+  uint16_t decoded16 = DecodeFixedGeneric<uint16_t>(in + 1);
+  EXPECT_EQ(decoded16, 0x3231U);
+
+  // Encode
+  char out[18];
+  out[0] = '_';
+  memset(out + 1, '\0', 17);
+  EncodeFixedGeneric(out + 1, decoded128);
+  EXPECT_EQ(std::string(in), std::string(out));
+
+  memset(out + 1, '\0', 9);
+  EncodeFixedGeneric(out + 1, decoded64);
+  EXPECT_EQ(std::string("_12345678"), std::string(out));
+
+  memset(out + 1, '\0', 5);
+  EncodeFixedGeneric(out + 1, decoded32);
+  EXPECT_EQ(std::string("_1234"), std::string(out));
+
+  memset(out + 1, '\0', 3);
+  EncodeFixedGeneric(out + 1, decoded16);
+  EXPECT_EQ(std::string("_12"), std::string(out));
+}
+
+int main(int argc, char **argv) {
+  fprintf(stderr, "NPHash64 id: %x\n",
+          static_cast<int>(ROCKSDB_NAMESPACE::GetSliceNPHash64("RocksDB")));
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/heap.h b/src/rocksdb/util/heap.h
new file mode 100644
index 000000000..f221fc732
--- /dev/null
+++ b/src/rocksdb/util/heap.h
@@ -0,0 +1,174 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Binary heap implementation optimized for use in multi-way merge sort.
+// Comparison to std::priority_queue:
+// - In libstdc++, std::priority_queue::pop() usually performs just over logN
+//   comparisons but never fewer.
+// - std::priority_queue does not have a replace-top operation, requiring a
+//   pop+push.  If the replacement element is the new top, this requires
+//   around 2logN comparisons.
+// - This heap's pop() uses a "schoolbook" downheap which requires up to ~2logN
+//   comparisons.
+// - This heap provides a replace_top() operation which requires [1, 2logN]
+//   comparisons.  When the replacement element is also the new top, this
+//   takes just 1 or 2 comparisons.
+//
+// The last property can yield an order-of-magnitude performance improvement
+// when merge-sorting real-world non-random data.  If the merge operation is
+// likely to take chunks of elements from the same input stream, only 1
+// comparison per element is needed.  In RocksDB-land, this happens when
+// compacting a database where keys are not randomly distributed across L0
+// files but nearby keys are likely to be in the same L0 file.
+//
+// The container uses the same counterintuitive ordering as
+// std::priority_queue: the comparison operator is expected to provide the
+// less-than relation, but top() will return the maximum.
+
+template <typename T, typename Compare = std::less<T>>
+class BinaryHeap {
+ public:
+  BinaryHeap() {}
+  explicit BinaryHeap(Compare cmp) : cmp_(std::move(cmp)) {}
+
+  void push(const T& value) {
+    data_.push_back(value);
+    upheap(data_.size() - 1);
+  }
+
+  void push(T&& value) {
+    data_.push_back(std::move(value));
+    upheap(data_.size() - 1);
+  }
+
+  const T& top() const {
+    assert(!empty());
+    return data_.front();
+  }
+
+  void replace_top(const T& value) {
+    assert(!empty());
+    data_.front() = value;
+    downheap(get_root());
+  }
+
+  void replace_top(T&& value) {
+    assert(!empty());
+    data_.front() = std::move(value);
+    downheap(get_root());
+  }
+
+  void pop() {
+    assert(!empty());
+    if (data_.size() > 1) {
+      // Avoid self-move-assign, because it could cause problems with
+      // classes which are not prepared for this and it trips up the
+      // STL debugger when activated.
+      data_.front() = std::move(data_.back());
+    }
+    data_.pop_back();
+    if (!empty()) {
+      downheap(get_root());
+    } else {
+      reset_root_cmp_cache();
+    }
+  }
+
+  void swap(BinaryHeap& other) {
+    std::swap(cmp_, other.cmp_);
+    data_.swap(other.data_);
+    std::swap(root_cmp_cache_, other.root_cmp_cache_);
+  }
+
+  void clear() {
+    data_.clear();
+    reset_root_cmp_cache();
+  }
+
+  bool empty() const { return data_.empty(); }
+
+  size_t size() const { return data_.size(); }
+
+  void reset_root_cmp_cache() {
+    root_cmp_cache_ = std::numeric_limits<size_t>::max();
+  }
+
+ private:
+  static inline size_t get_root() { return 0; }
+  static inline size_t get_parent(size_t index) { return (index - 1) / 2; }
+  static inline size_t get_left(size_t index) { return 2 * index + 1; }
+  static inline size_t get_right(size_t index) { return 2 * index + 2; }
+
+  void upheap(size_t index) {
+    T v = std::move(data_[index]);
+    while (index > get_root()) {
+      const size_t parent = get_parent(index);
+      if (!cmp_(data_[parent], v)) {
+        break;
+      }
+      data_[index] = std::move(data_[parent]);
+      index = parent;
+    }
+    data_[index] = std::move(v);
+    reset_root_cmp_cache();
+  }
+
+  void downheap(size_t index) {
+    T v = std::move(data_[index]);
+
+    size_t picked_child = std::numeric_limits<size_t>::max();
+    while (1) {
+      const size_t left_child = get_left(index);
+      if (get_left(index) >= data_.size()) {
+        break;
+      }
+      const size_t right_child = left_child + 1;
+      assert(right_child == get_right(index));
+      picked_child = left_child;
+      if (index == 0 && root_cmp_cache_ < data_.size()) {
+        picked_child = root_cmp_cache_;
+      } else if (right_child < data_.size() &&
+                 cmp_(data_[left_child], data_[right_child])) {
+        picked_child = right_child;
+      }
+      if (!cmp_(v, data_[picked_child])) {
+        break;
+      }
+      data_[index] = std::move(data_[picked_child]);
+      index = picked_child;
+    }
+
+    if (index == 0) {
+      // We did not change anything in the tree except for the value
+      // of the root node, left and right child did not change, we can
+      // cache that `picked_child` is the smallest child
+      // so next time we compare againist it directly
+      root_cmp_cache_ = picked_child;
+    } else {
+      // the tree changed, reset cache
+      reset_root_cmp_cache();
+    }
+
+    data_[index] = std::move(v);
+  }
+
+  Compare cmp_;
+  autovector<T> data_;
+  // Used to reduce number of cmp_ calls in downheap()
+  size_t root_cmp_cache_ = std::numeric_limits<size_t>::max();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/heap_test.cc b/src/rocksdb/util/heap_test.cc
new file mode 100644
index 000000000..bbb93324f
--- /dev/null
+++ b/src/rocksdb/util/heap_test.cc
@@ -0,0 +1,131 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/heap.h"
+
+#include <gtest/gtest.h>
+
+#include <climits>
+#include <queue>
+#include <random>
+#include <utility>
+
+#include "port/stack_trace.h"
+
+#ifndef GFLAGS
+const int64_t FLAGS_iters = 100000;
+#else
+#include "util/gflags_compat.h"
+DEFINE_int64(iters, 100000, "number of pseudo-random operations in each test");
+#endif  // GFLAGS
+
+/*
+ * Compares the custom heap implementation in util/heap.h against
+ * std::priority_queue on a pseudo-random sequence of operations.
+ */
+
+namespace ROCKSDB_NAMESPACE {
+
+using HeapTestValue = uint64_t;
+using Params = std::tuple<size_t, HeapTestValue, int64_t>;
+
+class HeapTest : public ::testing::TestWithParam<Params> {};
+
+TEST_P(HeapTest, Test) {
+  // This test performs the same pseudorandom sequence of operations on a
+  // BinaryHeap and an std::priority_queue, comparing output.  The three
+  // possible operations are insert, replace top and pop.
+  //
+  // Insert is chosen slightly more often than the others so that the size of
+  // the heap slowly grows.  Once the size heats the MAX_HEAP_SIZE limit, we
+  // disallow inserting until the heap becomes empty, testing the "draining"
+  // scenario.
+
+  const auto MAX_HEAP_SIZE = std::get<0>(GetParam());
+  const auto MAX_VALUE = std::get<1>(GetParam());
+  const auto RNG_SEED = std::get<2>(GetParam());
+
+  BinaryHeap<HeapTestValue> heap;
+  std::priority_queue<HeapTestValue> ref;
+
+  std::mt19937 rng(static_cast<unsigned int>(RNG_SEED));
+  std::uniform_int_distribution<HeapTestValue> value_dist(0, MAX_VALUE);
+  int ndrains = 0;
+  bool draining = false;  // hit max size, draining until we empty the heap
+  size_t size = 0;
+  for (int64_t i = 0; i < FLAGS_iters; ++i) {
+    if (size == 0) {
+      draining = false;
+    }
+
+    if (!draining && (size == 0 || std::bernoulli_distribution(0.4)(rng))) {
+      // insert
+      HeapTestValue val = value_dist(rng);
+      heap.push(val);
+      ref.push(val);
+      ++size;
+      if (size == MAX_HEAP_SIZE) {
+        draining = true;
+        ++ndrains;
+      }
+    } else if (std::bernoulli_distribution(0.5)(rng)) {
+      // replace top
+      HeapTestValue val = value_dist(rng);
+      heap.replace_top(val);
+      ref.pop();
+      ref.push(val);
+    } else {
+      // pop
+      assert(size > 0);
+      heap.pop();
+      ref.pop();
+      --size;
+    }
+
+    // After every operation, check that the public methods give the same
+    // results
+    assert((size == 0) == ref.empty());
+    ASSERT_EQ(size == 0, heap.empty());
+    if (size > 0) {
+      ASSERT_EQ(ref.top(), heap.top());
+    }
+  }
+
+  // Probabilities should be set up to occasionally hit the max heap size and
+  // drain it
+  assert(ndrains > 0);
+
+  heap.clear();
+  ASSERT_TRUE(heap.empty());
+}
+
+// Basic test, MAX_VALUE = 3*MAX_HEAP_SIZE (occasional duplicates)
+INSTANTIATE_TEST_CASE_P(Basic, HeapTest,
+                        ::testing::Values(Params(1000, 3000,
+                                                 0x1b575cf05b708945)));
+// Mid-size heap with small values (many duplicates)
+INSTANTIATE_TEST_CASE_P(SmallValues, HeapTest,
+                        ::testing::Values(Params(100, 10, 0x5ae213f7bd5dccd0)));
+// Small heap, large value range (no duplicates)
+INSTANTIATE_TEST_CASE_P(SmallHeap, HeapTest,
+                        ::testing::Values(Params(10, ULLONG_MAX,
+                                                 0x3e1fa8f4d01707cf)));
+// Two-element heap
+INSTANTIATE_TEST_CASE_P(TwoElementHeap, HeapTest,
+                        ::testing::Values(Params(2, 5, 0x4b5e13ea988c6abc)));
+// One-element heap
+INSTANTIATE_TEST_CASE_P(OneElementHeap, HeapTest,
+                        ::testing::Values(Params(1, 3, 0x176a1019ab0b612e)));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/kv_map.h b/src/rocksdb/util/kv_map.h
new file mode 100644
index 000000000..62be6d18e
--- /dev/null
+++ b/src/rocksdb/util/kv_map.h
@@ -0,0 +1,33 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace stl_wrappers {
+
+struct LessOfComparator {
+  explicit LessOfComparator(const Comparator* c = BytewiseComparator())
+      : cmp(c) {}
+
+  bool operator()(const std::string& a, const std::string& b) const {
+    return cmp->Compare(Slice(a), Slice(b)) < 0;
+  }
+  bool operator()(const Slice& a, const Slice& b) const {
+    return cmp->Compare(a, b) < 0;
+  }
+
+  const Comparator* cmp;
+};
+
+using KVMap = std::map<std::string, std::string, LessOfComparator>;
+}  // namespace stl_wrappers
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/log_write_bench.cc b/src/rocksdb/util/log_write_bench.cc
new file mode 100644
index 000000000..c1637db15
--- /dev/null
+++ b/src/rocksdb/util/log_write_bench.cc
@@ -0,0 +1,88 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include "file/writable_file_writer.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/env.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+using GFLAGS_NAMESPACE::SetUsageMessage;
+
+// A simple benchmark to simulate transactional logs
+
+DEFINE_int32(num_records, 6000, "Number of records.");
+DEFINE_int32(record_size, 249, "Size of each record.");
+DEFINE_int32(record_interval, 10000, "Interval between records (microSec)");
+DEFINE_int32(bytes_per_sync, 0, "bytes_per_sync parameter in EnvOptions");
+DEFINE_bool(enable_sync, false, "sync after each write.");
+
+namespace ROCKSDB_NAMESPACE {
+void RunBenchmark() {
+  std::string file_name = test::PerThreadDBPath("log_write_benchmark.log");
+  DBOptions options;
+  Env* env = Env::Default();
+  const auto& clock = env->GetSystemClock();
+  EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions(), options);
+  env_options.bytes_per_sync = FLAGS_bytes_per_sync;
+  std::unique_ptr<WritableFile> file;
+  env->NewWritableFile(file_name, &file, env_options);
+  std::unique_ptr<WritableFileWriter> writer;
+  writer.reset(new WritableFileWriter(std::move(file), file_name, env_options,
+                                      clock, nullptr /* stats */,
+                                      options.listeners));
+
+  std::string record;
+  record.assign(FLAGS_record_size, 'X');
+
+  HistogramImpl hist;
+
+  uint64_t start_time = clock->NowMicros();
+  for (int i = 0; i < FLAGS_num_records; i++) {
+    uint64_t start_nanos = clock->NowNanos();
+    writer->Append(record);
+    writer->Flush();
+    if (FLAGS_enable_sync) {
+      writer->Sync(false);
+    }
+    hist.Add(clock->NowNanos() - start_nanos);
+
+    if (i % 1000 == 1) {
+      fprintf(stderr, "Wrote %d records...\n", i);
+    }
+
+    int time_to_sleep =
+        (i + 1) * FLAGS_record_interval - (clock->NowMicros() - start_time);
+    if (time_to_sleep > 0) {
+      clock->SleepForMicroseconds(time_to_sleep);
+    }
+  }
+
+  fprintf(stderr, "Distribution of latency of append+flush: \n%s",
+          hist.ToString().c_str());
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  ROCKSDB_NAMESPACE::RunBenchmark();
+  return 0;
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/util/math.h b/src/rocksdb/util/math.h
new file mode 100644
index 000000000..da31b43ec
--- /dev/null
+++ b/src/rocksdb/util/math.h
@@ -0,0 +1,294 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <assert.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include <cstdint>
+#include <type_traits>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Fast implementation of floor(log2(v)). Undefined for 0 or negative
+// numbers (in case of signed type).
+template <typename T>
+inline int FloorLog2(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+  assert(v > 0);
+#ifdef _MSC_VER
+  static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
+  unsigned long idx = 0;
+  if (sizeof(T) <= sizeof(uint32_t)) {
+    _BitScanReverse(&idx, static_cast<uint32_t>(v));
+  } else {
+#if defined(_M_X64) || defined(_M_ARM64)
+    _BitScanReverse64(&idx, static_cast<uint64_t>(v));
+#else
+    const auto vh = static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32);
+    if (vh != 0) {
+      _BitScanReverse(&idx, static_cast<uint32_t>(vh));
+      idx += 32;
+    } else {
+      _BitScanReverse(&idx, static_cast<uint32_t>(v));
+    }
+#endif
+  }
+  return idx;
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) <= sizeof(unsigned int)) {
+    int lz = __builtin_clz(static_cast<unsigned int>(v));
+    return int{sizeof(unsigned int)} * 8 - 1 - lz;
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    int lz = __builtin_clzl(static_cast<unsigned long>(v));
+    return int{sizeof(unsigned long)} * 8 - 1 - lz;
+  } else {
+    int lz = __builtin_clzll(static_cast<unsigned long long>(v));
+    return int{sizeof(unsigned long long)} * 8 - 1 - lz;
+  }
+#endif
+}
+
+// Constexpr version of FloorLog2
+template <typename T>
+constexpr int ConstexprFloorLog2(T v) {
+  int rv = 0;
+  while (v > T{1}) {
+    ++rv;
+    v >>= 1;
+  }
+  return rv;
+}
+
+// Number of low-order zero bits before the first 1 bit. Undefined for 0.
+template <typename T>
+inline int CountTrailingZeroBits(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+  assert(v != 0);
+#ifdef _MSC_VER
+  static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
+  unsigned long tz = 0;
+  if (sizeof(T) <= sizeof(uint32_t)) {
+    _BitScanForward(&tz, static_cast<uint32_t>(v));
+  } else {
+#if defined(_M_X64) || defined(_M_ARM64)
+    _BitScanForward64(&tz, static_cast<uint64_t>(v));
+#else
+    _BitScanForward(&tz, static_cast<uint32_t>(v));
+    if (tz == 0) {
+      _BitScanForward(&tz,
+                      static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32));
+      tz += 32;
+    }
+#endif
+  }
+  return static_cast<int>(tz);
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) <= sizeof(unsigned int)) {
+    return __builtin_ctz(static_cast<unsigned int>(v));
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    return __builtin_ctzl(static_cast<unsigned long>(v));
+  } else {
+    return __builtin_ctzll(static_cast<unsigned long long>(v));
+  }
+#endif
+}
+
+// Not all MSVC compile settings will use `BitsSetToOneFallback()`. We include
+// the following code at coarse granularity for simpler macros. It's important
+// to exclude at least so our non-MSVC unit test coverage tool doesn't see it.
+#ifdef _MSC_VER
+
+namespace detail {
+
+template <typename T>
+int BitsSetToOneFallback(T v) {
+  const int kBits = static_cast<int>(sizeof(T)) * 8;
+  static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits");
+  // we static_cast these bit patterns in order to truncate them to the correct
+  // size. Warning C4309 dislikes this technique, so disable it here.
+#pragma warning(disable : 4309)
+  v = static_cast<T>(v - ((v >> 1) & static_cast<T>(0x5555555555555555ull)));
+  v = static_cast<T>((v & static_cast<T>(0x3333333333333333ull)) +
+                     ((v >> 2) & static_cast<T>(0x3333333333333333ull)));
+  v = static_cast<T>((v + (v >> 4)) & static_cast<T>(0x0F0F0F0F0F0F0F0Full));
+#pragma warning(default : 4309)
+  for (int shift_bits = 8; shift_bits < kBits; shift_bits <<= 1) {
+    v += static_cast<T>(v >> shift_bits);
+  }
+  // we want the bottom "slot" that's big enough to represent a value up to
+  // (and including) kBits.
+  return static_cast<int>(v & static_cast<T>(kBits | (kBits - 1)));
+}
+
+}  // namespace detail
+
+#endif  // _MSC_VER
+
+// Number of bits set to 1. Also known as "population count".
+template <typename T>
+inline int BitsSetToOne(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+#ifdef _MSC_VER
+  static_assert(sizeof(T) <= sizeof(uint64_t), "type too big");
+  if (sizeof(T) < sizeof(uint32_t)) {
+    // This bit mask is to avoid a compiler warning on unused path
+    constexpr auto mm = 8 * sizeof(uint32_t) - 1;
+    // The bit mask is to neutralize sign extension on small signed types
+    constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1;
+#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
+    return static_cast<int>(__popcnt(static_cast<uint32_t>(v) & m));
+#else
+    return static_cast<int>(detail::BitsSetToOneFallback(v) & m);
+#endif
+  } else if (sizeof(T) == sizeof(uint32_t)) {
+#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
+    return static_cast<int>(__popcnt(static_cast<uint32_t>(v)));
+#else
+    return detail::BitsSetToOneFallback(static_cast<uint32_t>(v));
+#endif
+  } else {
+#if defined(HAVE_SSE42) && defined(_M_X64)
+    return static_cast<int>(__popcnt64(static_cast<uint64_t>(v)));
+#elif defined(HAVE_SSE42) && defined(_M_IX86)
+    return static_cast<int>(
+        __popcnt(static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32) +
+                 __popcnt(static_cast<uint32_t>(v))));
+#else
+    return detail::BitsSetToOneFallback(static_cast<uint64_t>(v));
+#endif
+  }
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) < sizeof(unsigned int)) {
+    // This bit mask is to avoid a compiler warning on unused path
+    constexpr auto mm = 8 * sizeof(unsigned int) - 1;
+    // This bit mask is to neutralize sign extension on small signed types
+    constexpr unsigned int m = (1U << ((8 * sizeof(T)) & mm)) - 1;
+    return __builtin_popcount(static_cast<unsigned int>(v) & m);
+  } else if (sizeof(T) == sizeof(unsigned int)) {
+    return __builtin_popcount(static_cast<unsigned int>(v));
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    return __builtin_popcountl(static_cast<unsigned long>(v));
+  } else {
+    return __builtin_popcountll(static_cast<unsigned long long>(v));
+  }
+#endif
+}
+
+template <typename T>
+inline int BitParity(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+#ifdef _MSC_VER
+  // bit parity == oddness of popcount
+  return BitsSetToOne(v) & 1;
+#else
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big");
+  if (sizeof(T) <= sizeof(unsigned int)) {
+    // On any sane systen, potential sign extension here won't change parity
+    return __builtin_parity(static_cast<unsigned int>(v));
+  } else if (sizeof(T) <= sizeof(unsigned long)) {
+    return __builtin_parityl(static_cast<unsigned long>(v));
+  } else {
+    return __builtin_parityll(static_cast<unsigned long long>(v));
+  }
+#endif
+}
+
+// Swaps between big and little endian. Can be used in combination with the
+// little-endian encoding/decoding functions in coding_lean.h and coding.h to
+// encode/decode big endian.
+template <typename T>
+inline T EndianSwapValue(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+
+#ifdef _MSC_VER
+  if (sizeof(T) == 2) {
+    return static_cast<T>(_byteswap_ushort(static_cast<uint16_t>(v)));
+  } else if (sizeof(T) == 4) {
+    return static_cast<T>(_byteswap_ulong(static_cast<uint32_t>(v)));
+  } else if (sizeof(T) == 8) {
+    return static_cast<T>(_byteswap_uint64(static_cast<uint64_t>(v)));
+  }
+#else
+  if (sizeof(T) == 2) {
+    return static_cast<T>(__builtin_bswap16(static_cast<uint16_t>(v)));
+  } else if (sizeof(T) == 4) {
+    return static_cast<T>(__builtin_bswap32(static_cast<uint32_t>(v)));
+  } else if (sizeof(T) == 8) {
+    return static_cast<T>(__builtin_bswap64(static_cast<uint64_t>(v)));
+  }
+#endif
+  // Recognized by clang as bswap, but not by gcc :(
+  T ret_val = 0;
+  for (std::size_t i = 0; i < sizeof(T); ++i) {
+    ret_val |= ((v >> (8 * i)) & 0xff) << (8 * (sizeof(T) - 1 - i));
+  }
+  return ret_val;
+}
+
+// Reverses the order of bits in an integral value
+template <typename T>
+inline T ReverseBits(T v) {
+  T r = EndianSwapValue(v);
+  const T kHighestByte = T{1} << ((sizeof(T) - 1) * 8);
+  const T kEveryByte = kHighestByte | (kHighestByte / 255);
+
+  r = ((r & (kEveryByte * 0x0f)) << 4) | ((r >> 4) & (kEveryByte * 0x0f));
+  r = ((r & (kEveryByte * 0x33)) << 2) | ((r >> 2) & (kEveryByte * 0x33));
+  r = ((r & (kEveryByte * 0x55)) << 1) | ((r >> 1) & (kEveryByte * 0x55));
+
+  return r;
+}
+
+// Every output bit depends on many input bits in the same and higher
+// positions, but not lower positions. Specifically, this function
+// * Output highest bit set to 1 is same as input (same FloorLog2, or
+//   equivalently, same number of leading zeros)
+// * Is its own inverse (an involution)
+// * Guarantees that b bottom bits of v and c bottom bits of
+//   DownwardInvolution(v) uniquely identify b + c bottom bits of v
+//   (which is all of v if v < 2**(b + c)).
+// ** A notable special case is that modifying c adjacent bits at
+//    some chosen position in the input is bijective with the bottom c
+//    output bits.
+// * Distributes over xor, as in DI(a ^ b) == DI(a) ^ DI(b)
+//
+// This transformation is equivalent to a matrix*vector multiplication in
+// GF(2) where the matrix is recursively defined by the pattern matrix
+// P = | 1 1 |
+//     | 0 1 |
+// and replacing 1's with P and 0's with 2x2 zero matices to some depth,
+// e.g. depth of 6 for 64-bit T. An essential feature of this matrix
+// is that all square sub-matrices that include the top row are invertible.
+template <typename T>
+inline T DownwardInvolution(T v) {
+  static_assert(std::is_integral<T>::value, "non-integral type");
+  static_assert(sizeof(T) <= 8, "only supported up to 64 bits");
+
+  uint64_t r = static_cast<uint64_t>(v);
+  if constexpr (sizeof(T) > 4) {
+    r ^= r >> 32;
+  }
+  if constexpr (sizeof(T) > 2) {
+    r ^= (r & 0xffff0000ffff0000U) >> 16;
+  }
+  if constexpr (sizeof(T) > 1) {
+    r ^= (r & 0xff00ff00ff00ff00U) >> 8;
+  }
+  r ^= (r & 0xf0f0f0f0f0f0f0f0U) >> 4;
+  r ^= (r & 0xccccccccccccccccU) >> 2;
+  r ^= (r & 0xaaaaaaaaaaaaaaaaU) >> 1;
+  return static_cast<T>(r);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/math128.h b/src/rocksdb/util/math128.h
new file mode 100644
index 000000000..ae490051a
--- /dev/null
+++ b/src/rocksdb/util/math128.h
@@ -0,0 +1,316 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "util/coding_lean.h"
+#include "util/math.h"
+
+#ifdef TEST_UINT128_COMPAT
+#undef HAVE_UINT128_EXTENSION
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Unsigned128 is a 128 bit value supporting (at least) bitwise operators,
+// shifts, and comparisons. __uint128_t is not always available.
+
+#ifdef HAVE_UINT128_EXTENSION
+using Unsigned128 = __uint128_t;
+#else
+struct Unsigned128 {
+  uint64_t lo;
+  uint64_t hi;
+
+  inline Unsigned128() {
+    static_assert(sizeof(Unsigned128) == 2 * sizeof(uint64_t),
+                  "unexpected overhead in representation");
+    lo = 0;
+    hi = 0;
+  }
+
+  inline Unsigned128(uint64_t lower) {
+    lo = lower;
+    hi = 0;
+  }
+
+  inline Unsigned128(uint64_t lower, uint64_t upper) {
+    lo = lower;
+    hi = upper;
+  }
+
+  explicit operator uint64_t() { return lo; }
+
+  explicit operator uint32_t() { return static_cast<uint32_t>(lo); }
+
+  explicit operator uint16_t() { return static_cast<uint16_t>(lo); }
+
+  explicit operator uint8_t() { return static_cast<uint8_t>(lo); }
+};
+
+inline Unsigned128 operator<<(const Unsigned128& lhs, unsigned shift) {
+  shift &= 127;
+  Unsigned128 rv;
+  if (shift >= 64) {
+    rv.lo = 0;
+    rv.hi = lhs.lo << (shift & 63);
+  } else {
+    uint64_t tmp = lhs.lo;
+    rv.lo = tmp << shift;
+    // Ensure shift==0 shifts away everything. (This avoids another
+    // conditional branch on shift == 0.)
+    tmp = tmp >> 1 >> (63 - shift);
+    rv.hi = tmp | (lhs.hi << shift);
+  }
+  return rv;
+}
+
+inline Unsigned128& operator<<=(Unsigned128& lhs, unsigned shift) {
+  lhs = lhs << shift;
+  return lhs;
+}
+
+inline Unsigned128 operator>>(const Unsigned128& lhs, unsigned shift) {
+  shift &= 127;
+  Unsigned128 rv;
+  if (shift >= 64) {
+    rv.hi = 0;
+    rv.lo = lhs.hi >> (shift & 63);
+  } else {
+    uint64_t tmp = lhs.hi;
+    rv.hi = tmp >> shift;
+    // Ensure shift==0 shifts away everything
+    tmp = tmp << 1 << (63 - shift);
+    rv.lo = tmp | (lhs.lo >> shift);
+  }
+  return rv;
+}
+
+inline Unsigned128& operator>>=(Unsigned128& lhs, unsigned shift) {
+  lhs = lhs >> shift;
+  return lhs;
+}
+
+inline Unsigned128 operator&(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return Unsigned128(lhs.lo & rhs.lo, lhs.hi & rhs.hi);
+}
+
+inline Unsigned128& operator&=(Unsigned128& lhs, const Unsigned128& rhs) {
+  lhs = lhs & rhs;
+  return lhs;
+}
+
+inline Unsigned128 operator|(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return Unsigned128(lhs.lo | rhs.lo, lhs.hi | rhs.hi);
+}
+
+inline Unsigned128& operator|=(Unsigned128& lhs, const Unsigned128& rhs) {
+  lhs = lhs | rhs;
+  return lhs;
+}
+
+inline Unsigned128 operator^(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return Unsigned128(lhs.lo ^ rhs.lo, lhs.hi ^ rhs.hi);
+}
+
+inline Unsigned128& operator^=(Unsigned128& lhs, const Unsigned128& rhs) {
+  lhs = lhs ^ rhs;
+  return lhs;
+}
+
+inline Unsigned128 operator~(const Unsigned128& v) {
+  return Unsigned128(~v.lo, ~v.hi);
+}
+
+inline bool operator==(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.lo == rhs.lo && lhs.hi == rhs.hi;
+}
+
+inline bool operator!=(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.lo != rhs.lo || lhs.hi != rhs.hi;
+}
+
+inline bool operator>(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi > rhs.hi || (lhs.hi == rhs.hi && lhs.lo > rhs.lo);
+}
+
+inline bool operator<(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi < rhs.hi || (lhs.hi == rhs.hi && lhs.lo < rhs.lo);
+}
+
+inline bool operator>=(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi > rhs.hi || (lhs.hi == rhs.hi && lhs.lo >= rhs.lo);
+}
+
+inline bool operator<=(const Unsigned128& lhs, const Unsigned128& rhs) {
+  return lhs.hi < rhs.hi || (lhs.hi == rhs.hi && lhs.lo <= rhs.lo);
+}
+#endif
+
+inline uint64_t Lower64of128(Unsigned128 v) {
+#ifdef HAVE_UINT128_EXTENSION
+  return static_cast<uint64_t>(v);
+#else
+  return v.lo;
+#endif
+}
+
+inline uint64_t Upper64of128(Unsigned128 v) {
+#ifdef HAVE_UINT128_EXTENSION
+  return static_cast<uint64_t>(v >> 64);
+#else
+  return v.hi;
+#endif
+}
+
+// This generally compiles down to a single fast instruction on 64-bit.
+// This doesn't really make sense as operator* because it's not a
+// general 128x128 multiply and provides more output than 64x64 multiply.
+inline Unsigned128 Multiply64to128(uint64_t a, uint64_t b) {
+#ifdef HAVE_UINT128_EXTENSION
+  return Unsigned128{a} * Unsigned128{b};
+#else
+  // Full decomposition
+  // NOTE: GCC seems to fully understand this code as 64-bit x 64-bit
+  // -> 128-bit multiplication and optimize it appropriately.
+  uint64_t tmp = uint64_t{b & 0xffffFFFF} * uint64_t{a & 0xffffFFFF};
+  uint64_t lower = tmp & 0xffffFFFF;
+  tmp >>= 32;
+  tmp += uint64_t{b & 0xffffFFFF} * uint64_t{a >> 32};
+  // Avoid overflow: first add lower 32 of tmp2, and later upper 32
+  uint64_t tmp2 = uint64_t{b >> 32} * uint64_t{a & 0xffffFFFF};
+  tmp += tmp2 & 0xffffFFFF;
+  lower |= tmp << 32;
+  tmp >>= 32;
+  tmp += tmp2 >> 32;
+  tmp += uint64_t{b >> 32} * uint64_t{a >> 32};
+  return Unsigned128(lower, tmp);
+#endif
+}
+
+template <>
+inline int FloorLog2(Unsigned128 v) {
+  if (Upper64of128(v) == 0) {
+    return FloorLog2(Lower64of128(v));
+  } else {
+    return FloorLog2(Upper64of128(v)) + 64;
+  }
+}
+
+template <>
+inline int CountTrailingZeroBits(Unsigned128 v) {
+  if (Lower64of128(v) != 0) {
+    return CountTrailingZeroBits(Lower64of128(v));
+  } else {
+    return CountTrailingZeroBits(Upper64of128(v)) + 64;
+  }
+}
+
+template <>
+inline int BitsSetToOne(Unsigned128 v) {
+  return BitsSetToOne(Lower64of128(v)) + BitsSetToOne(Upper64of128(v));
+}
+
+template <>
+inline int BitParity(Unsigned128 v) {
+  return BitParity(Lower64of128(v) ^ Upper64of128(v));
+}
+
+template <>
+inline Unsigned128 EndianSwapValue(Unsigned128 v) {
+  return (Unsigned128{EndianSwapValue(Lower64of128(v))} << 64) |
+         EndianSwapValue(Upper64of128(v));
+}
+
+template <>
+inline Unsigned128 ReverseBits(Unsigned128 v) {
+  return (Unsigned128{ReverseBits(Lower64of128(v))} << 64) |
+         ReverseBits(Upper64of128(v));
+}
+
+template <>
+inline Unsigned128 DownwardInvolution(Unsigned128 v) {
+  return (Unsigned128{DownwardInvolution(Upper64of128(v))} << 64) |
+         DownwardInvolution(Upper64of128(v) ^ Lower64of128(v));
+}
+
+template <typename T>
+struct IsUnsignedUpTo128
+    : std::integral_constant<bool, std::is_unsigned<T>::value ||
+                                       std::is_same<T, Unsigned128>::value> {};
+
+inline void EncodeFixed128(char* dst, Unsigned128 value) {
+  EncodeFixed64(dst, Lower64of128(value));
+  EncodeFixed64(dst + 8, Upper64of128(value));
+}
+
+inline Unsigned128 DecodeFixed128(const char* ptr) {
+  Unsigned128 rv = DecodeFixed64(ptr + 8);
+  return (rv << 64) | DecodeFixed64(ptr);
+}
+
+// A version of EncodeFixed* for generic algorithms. Likely to be used
+// with Unsigned128, so lives here for now.
+template <typename T>
+inline void EncodeFixedGeneric(char* /*dst*/, T /*value*/) {
+  // Unfortunately, GCC does not appear to optimize this simple code down
+  // to a trivial load on Intel:
+  //
+  // T ret_val = 0;
+  // for (size_t i = 0; i < sizeof(T); ++i) {
+  //   ret_val |= (static_cast<T>(static_cast<unsigned char>(ptr[i])) << (8 *
+  //   i));
+  // }
+  // return ret_val;
+  //
+  // But does unroll the loop, and does optimize manually unrolled version
+  // for specific sizes down to a trivial load. I have no idea why it doesn't
+  // do both on this code.
+
+  // So instead, we rely on specializations
+  static_assert(sizeof(T) == 0, "No specialization provided for this type");
+}
+
+template <>
+inline void EncodeFixedGeneric(char* dst, uint16_t value) {
+  return EncodeFixed16(dst, value);
+}
+template <>
+inline void EncodeFixedGeneric(char* dst, uint32_t value) {
+  return EncodeFixed32(dst, value);
+}
+template <>
+inline void EncodeFixedGeneric(char* dst, uint64_t value) {
+  return EncodeFixed64(dst, value);
+}
+template <>
+inline void EncodeFixedGeneric(char* dst, Unsigned128 value) {
+  return EncodeFixed128(dst, value);
+}
+
+// A version of EncodeFixed* for generic algorithms.
+template <typename T>
+inline T DecodeFixedGeneric(const char* /*dst*/) {
+  static_assert(sizeof(T) == 0, "No specialization provided for this type");
+}
+
+template <>
+inline uint16_t DecodeFixedGeneric(const char* dst) {
+  return DecodeFixed16(dst);
+}
+template <>
+inline uint32_t DecodeFixedGeneric(const char* dst) {
+  return DecodeFixed32(dst);
+}
+template <>
+inline uint64_t DecodeFixedGeneric(const char* dst) {
+  return DecodeFixed64(dst);
+}
+template <>
+inline Unsigned128 DecodeFixedGeneric(const char* dst) {
+  return DecodeFixed128(dst);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/murmurhash.cc b/src/rocksdb/util/murmurhash.cc
new file mode 100644
index 000000000..a69f3918a
--- /dev/null
+++ b/src/rocksdb/util/murmurhash.cc
@@ -0,0 +1,196 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash
+  is under the MIT license.
+*/
+#include "murmurhash.h"
+
+#include "port/lang.h"
+
+#if defined(__x86_64__)
+
+// -------------------------------------------------------------------
+//
+// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+// and endian-ness issues if used across multiple platforms.
+//
+// 64-bit hash for 64-bit platforms
+
+#ifdef ROCKSDB_UBSAN_RUN
+#if defined(__clang__)
+__attribute__((__no_sanitize__("alignment")))
+#elif defined(__GNUC__)
+__attribute__((__no_sanitize_undefined__))
+#endif
+#endif
+// clang-format off
+uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed )
+{
+    const uint64_t m = 0xc6a4a7935bd1e995;
+    const int r = 47;
+
+    uint64_t h = seed ^ (len * m);
+
+    const uint64_t * data = (const uint64_t *)key;
+    const uint64_t * end = data + (len/8);
+
+    while(data != end)
+    {
+        uint64_t k = *data++;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h ^= k;
+        h *= m;
+    }
+
+    const unsigned char * data2 = (const unsigned char*)data;
+
+    switch(len & 7)
+    {
+    case 7: h ^= ((uint64_t)data2[6]) << 48; FALLTHROUGH_INTENDED;
+    case 6: h ^= ((uint64_t)data2[5]) << 40; FALLTHROUGH_INTENDED;
+    case 5: h ^= ((uint64_t)data2[4]) << 32; FALLTHROUGH_INTENDED;
+    case 4: h ^= ((uint64_t)data2[3]) << 24; FALLTHROUGH_INTENDED;
+    case 3: h ^= ((uint64_t)data2[2]) << 16; FALLTHROUGH_INTENDED;
+    case 2: h ^= ((uint64_t)data2[1]) << 8;  FALLTHROUGH_INTENDED;
+    case 1: h ^= ((uint64_t)data2[0]);
+        h *= m;
+    };
+
+    h ^= h >> r;
+    h *= m;
+    h ^= h >> r;
+
+    return h;
+}
+// clang-format on
+
+#elif defined(__i386__)
+
+// -------------------------------------------------------------------
+//
+// Note - This code makes a few assumptions about how your machine behaves -
+//
+// 1. We can read a 4-byte value from any address without crashing
+// 2. sizeof(int) == 4
+//
+// And it has a few limitations -
+//
+// 1. It will not work incrementally.
+// 2. It will not produce the same results on little-endian and big-endian
+//    machines.
+// clang-format off
+unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )
+{
+    // 'm' and 'r' are mixing constants generated offline.
+    // They're not really 'magic', they just happen to work well.
+
+    const unsigned int m = 0x5bd1e995;
+    const int r = 24;
+
+    // Initialize the hash to a 'random' value
+
+    unsigned int h = seed ^ len;
+
+    // Mix 4 bytes at a time into the hash
+
+    const unsigned char * data = (const unsigned char *)key;
+
+    while(len >= 4)
+    {
+        unsigned int k = *(unsigned int *)data;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h *= m;
+        h ^= k;
+
+        data += 4;
+        len -= 4;
+    }
+
+    // Handle the last few bytes of the input array
+
+    switch(len)
+    {
+    case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED;
+    case 2: h ^= data[1] << 8;  FALLTHROUGH_INTENDED;
+    case 1: h ^= data[0];
+        h *= m;
+    };
+
+    // Do a few final mixes of the hash to ensure the last few
+    // bytes are well-incorporated.
+
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return h;
+}
+// clang-format on
+
+#else
+
+// -------------------------------------------------------------------
+//
+// Same as MurmurHash2, but endian- and alignment-neutral.
+// Half the speed though, alas.
+// clang-format off
+unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed )
+{
+    const unsigned int m = 0x5bd1e995;
+    const int r = 24;
+
+    unsigned int h = seed ^ len;
+
+    const unsigned char * data = (const unsigned char *)key;
+
+    while(len >= 4)
+    {
+        unsigned int k;
+
+        k  = data[0];
+        k |= data[1] << 8;
+        k |= data[2] << 16;
+        k |= data[3] << 24;
+
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+
+        h *= m;
+        h ^= k;
+
+        data += 4;
+        len -= 4;
+    }
+
+    switch(len)
+    {
+    case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED;
+    case 2: h ^= data[1] << 8;  FALLTHROUGH_INTENDED;
+    case 1: h ^= data[0];
+        h *= m;
+    };
+
+    h ^= h >> 13;
+    h *= m;
+    h ^= h >> 15;
+
+    return h;
+}
+// clang-format on
+
+#endif
diff --git a/src/rocksdb/util/murmurhash.h b/src/rocksdb/util/murmurhash.h
new file mode 100644
index 000000000..7ef4cbbec
--- /dev/null
+++ b/src/rocksdb/util/murmurhash.h
@@ -0,0 +1,43 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+/*
+  Murmurhash from http://sites.google.com/site/murmurhash/
+
+  All code is released to the public domain. For business purposes, Murmurhash
+  is under the MIT license.
+*/
+#pragma once
+#include <stdint.h>
+
+#include "rocksdb/slice.h"
+
+#if defined(__x86_64__)
+#define MURMUR_HASH MurmurHash64A
+uint64_t MurmurHash64A(const void* key, int len, unsigned int seed);
+#define MurmurHash MurmurHash64A
+using murmur_t = uint64_t;
+
+#elif defined(__i386__)
+#define MURMUR_HASH MurmurHash2
+unsigned int MurmurHash2(const void* key, int len, unsigned int seed);
+#define MurmurHash MurmurHash2
+using murmur_t = unsigned int;
+
+#else
+#define MURMUR_HASH MurmurHashNeutral2
+unsigned int MurmurHashNeutral2(const void* key, int len, unsigned int seed);
+#define MurmurHash MurmurHashNeutral2
+using murmur_t = unsigned int;
+#endif
+
+// Allow slice to be hashable by murmur hash.
+namespace ROCKSDB_NAMESPACE {
+struct murmur_hash {
+  size_t operator()(const Slice& slice) const {
+    return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0);
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/mutexlock.h b/src/rocksdb/util/mutexlock.h
new file mode 100644
index 000000000..94066b29e
--- /dev/null
+++ b/src/rocksdb/util/mutexlock.h
@@ -0,0 +1,180 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <assert.h>
+
+#include <atomic>
+#include <mutex>
+#include <thread>
+
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper class that locks a mutex on construction and unlocks the mutex when
+// the destructor of the MutexLock object is invoked.
+//
+// Typical usage:
+//
+//   void MyClass::MyMethod() {
+//     MutexLock l(&mu_);       // mu_ is an instance variable
+//     ... some complex code, possibly with multiple return paths ...
+//   }
+
+class MutexLock {
+ public:
+  explicit MutexLock(port::Mutex *mu) : mu_(mu) { this->mu_->Lock(); }
+  // No copying allowed
+  MutexLock(const MutexLock &) = delete;
+  void operator=(const MutexLock &) = delete;
+
+  ~MutexLock() { this->mu_->Unlock(); }
+
+ private:
+  port::Mutex *const mu_;
+};
+
+//
+// Acquire a ReadLock on the specified RWMutex.
+// The Lock will be automatically released when the
+// object goes out of scope.
+//
+class ReadLock {
+ public:
+  explicit ReadLock(port::RWMutex *mu) : mu_(mu) { this->mu_->ReadLock(); }
+  // No copying allowed
+  ReadLock(const ReadLock &) = delete;
+  void operator=(const ReadLock &) = delete;
+
+  ~ReadLock() { this->mu_->ReadUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+};
+
+//
+// Automatically unlock a locked mutex when the object is destroyed
+//
+class ReadUnlock {
+ public:
+  explicit ReadUnlock(port::RWMutex *mu) : mu_(mu) { mu->AssertHeld(); }
+  // No copying allowed
+  ReadUnlock(const ReadUnlock &) = delete;
+  ReadUnlock &operator=(const ReadUnlock &) = delete;
+
+  ~ReadUnlock() { mu_->ReadUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+};
+
+//
+// Acquire a WriteLock on the specified RWMutex.
+// The Lock will be automatically released then the
+// object goes out of scope.
+//
+class WriteLock {
+ public:
+  explicit WriteLock(port::RWMutex *mu) : mu_(mu) { this->mu_->WriteLock(); }
+  // No copying allowed
+  WriteLock(const WriteLock &) = delete;
+  void operator=(const WriteLock &) = delete;
+
+  ~WriteLock() { this->mu_->WriteUnlock(); }
+
+ private:
+  port::RWMutex *const mu_;
+};
+
+//
+// SpinMutex has very low overhead for low-contention cases.  Method names
+// are chosen so you can use std::unique_lock or std::lock_guard with it.
+//
+class SpinMutex {
+ public:
+  SpinMutex() : locked_(false) {}
+
+  bool try_lock() {
+    auto currently_locked = locked_.load(std::memory_order_relaxed);
+    return !currently_locked &&
+           locked_.compare_exchange_weak(currently_locked, true,
+                                         std::memory_order_acquire,
+                                         std::memory_order_relaxed);
+  }
+
+  void lock() {
+    for (size_t tries = 0;; ++tries) {
+      if (try_lock()) {
+        // success
+        break;
+      }
+      port::AsmVolatilePause();
+      if (tries > 100) {
+        std::this_thread::yield();
+      }
+    }
+  }
+
+  void unlock() { locked_.store(false, std::memory_order_release); }
+
+ private:
+  std::atomic<bool> locked_;
+};
+
+// We want to prevent false sharing
+template <class T>
+struct ALIGN_AS(CACHE_LINE_SIZE) LockData {
+  T lock_;
+};
+
+//
+// Inspired by Guava: https://github.com/google/guava/wiki/StripedExplained
+// A striped Lock. This offers the underlying lock striping similar
+// to that of ConcurrentHashMap in a reusable form, and extends it for
+// semaphores and read-write locks. Conceptually, lock striping is the technique
+// of dividing a lock into many <i>stripes</i>, increasing the granularity of a
+// single lock and allowing independent operations to lock different stripes and
+// proceed concurrently, instead of creating contention for a single lock.
+//
+template <class T, class P>
+class Striped {
+ public:
+  Striped(size_t stripes, std::function<uint64_t(const P &)> hash)
+      : stripes_(stripes), hash_(hash) {
+    locks_ = reinterpret_cast<LockData<T> *>(
+        port::cacheline_aligned_alloc(sizeof(LockData<T>) * stripes));
+    for (size_t i = 0; i < stripes; i++) {
+      new (&locks_[i]) LockData<T>();
+    }
+  }
+
+  virtual ~Striped() {
+    if (locks_ != nullptr) {
+      assert(stripes_ > 0);
+      for (size_t i = 0; i < stripes_; i++) {
+        locks_[i].~LockData<T>();
+      }
+      port::cacheline_aligned_free(locks_);
+    }
+  }
+
+  T *get(const P &key) {
+    uint64_t h = hash_(key);
+    size_t index = h % stripes_;
+    return &reinterpret_cast<LockData<T> *>(&locks_[index])->lock_;
+  }
+
+ private:
+  size_t stripes_;
+  LockData<T> *locks_;
+  std::function<uint64_t(const P &)> hash_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/ppc-opcode.h b/src/rocksdb/util/ppc-opcode.h
new file mode 100644
index 000000000..5cc5af0e3
--- /dev/null
+++ b/src/rocksdb/util/ppc-opcode.h
@@ -0,0 +1,27 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  Copyright (c) 2017 International Business Machines Corp.
+//  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#define __PPC_RA(a) (((a)&0x1f) << 16)
+#define __PPC_RB(b) (((b)&0x1f) << 11)
+#define __PPC_XA(a) ((((a)&0x1f) << 16) | (((a)&0x20) >> 3))
+#define __PPC_XB(b) ((((b)&0x1f) << 11) | (((b)&0x20) >> 4))
+#define __PPC_XS(s) ((((s)&0x1f) << 21) | (((s)&0x20) >> 5))
+#define __PPC_XT(s) __PPC_XS(s)
+#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b))
+#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b))
+
+#define PPC_INST_VPMSUMW 0x10000488
+#define PPC_INST_VPMSUMD 0x100004c8
+#define PPC_INST_MFVSRD 0x7c000066
+#define PPC_INST_MTVSRD 0x7c000166
+
+#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b)
+#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b)
+#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t) + 32, a, 0)
+#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t) + 32, a, 0)
diff --git a/src/rocksdb/util/random.cc b/src/rocksdb/util/random.cc
new file mode 100644
index 000000000..c94c28dfb
--- /dev/null
+++ b/src/rocksdb/util/random.cc
@@ -0,0 +1,62 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "util/random.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <thread>
+#include <utility>
+
+#include "port/likely.h"
+#include "util/thread_local.h"
+
+#define STORAGE_DECL static thread_local
+
+namespace ROCKSDB_NAMESPACE {
+
+Random* Random::GetTLSInstance() {
+  STORAGE_DECL Random* tls_instance;
+  STORAGE_DECL std::aligned_storage<sizeof(Random)>::type tls_instance_bytes;
+
+  auto rv = tls_instance;
+  if (UNLIKELY(rv == nullptr)) {
+    size_t seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    rv = new (&tls_instance_bytes) Random((uint32_t)seed);
+    tls_instance = rv;
+  }
+  return rv;
+}
+
+std::string Random::HumanReadableString(int len) {
+  std::string ret;
+  ret.resize(len);
+  for (int i = 0; i < len; ++i) {
+    ret[i] = static_cast<char>('a' + Uniform(26));
+  }
+  return ret;
+}
+
+std::string Random::RandomString(int len) {
+  std::string ret;
+  ret.resize(len);
+  for (int i = 0; i < len; i++) {
+    ret[i] = static_cast<char>(' ' + Uniform(95));  // ' ' .. '~'
+  }
+  return ret;
+}
+
+std::string Random::RandomBinaryString(int len) {
+  std::string ret;
+  ret.resize(len);
+  for (int i = 0; i < len; i++) {
+    ret[i] = static_cast<char>(Uniform(CHAR_MAX));
+  }
+  return ret;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/random.h b/src/rocksdb/util/random.h
new file mode 100644
index 000000000..8923bdc4f
--- /dev/null
+++ b/src/rocksdb/util/random.h
@@ -0,0 +1,190 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <algorithm>
+#include <random>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A very simple random number generator.  Not especially good at
+// generating truly random bits, but good enough for our needs in this
+// package.
+class Random {
+ private:
+  enum : uint32_t {
+    M = 2147483647L  // 2^31-1
+  };
+  enum : uint64_t {
+    A = 16807  // bits 14, 8, 7, 5, 2, 1, 0
+  };
+
+  uint32_t seed_;
+
+  static uint32_t GoodSeed(uint32_t s) { return (s & M) != 0 ? (s & M) : 1; }
+
+ public:
+  // This is the largest value that can be returned from Next()
+  enum : uint32_t { kMaxNext = M };
+
+  explicit Random(uint32_t s) : seed_(GoodSeed(s)) {}
+
+  void Reset(uint32_t s) { seed_ = GoodSeed(s); }
+
+  uint32_t Next() {
+    // We are computing
+    //       seed_ = (seed_ * A) % M,    where M = 2^31-1
+    //
+    // seed_ must not be zero or M, or else all subsequent computed values
+    // will be zero or M respectively.  For all other values, seed_ will end
+    // up cycling through every number in [1,M-1]
+    uint64_t product = seed_ * A;
+
+    // Compute (product % M) using the fact that ((x << 31) % M) == x.
+    seed_ = static_cast<uint32_t>((product >> 31) + (product & M));
+    // The first reduction may overflow by 1 bit, so we may need to
+    // repeat.  mod == M is not possible; using > allows the faster
+    // sign-bit-based test.
+    if (seed_ > M) {
+      seed_ -= M;
+    }
+    return seed_;
+  }
+
+  uint64_t Next64() { return (uint64_t{Next()} << 32) | Next(); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(int n) { return Next() % n; }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(int n) { return Uniform(n) == 0; }
+
+  // "Optional" one-in-n, where 0 or negative always returns false
+  // (may or may not consume a random value)
+  bool OneInOpt(int n) { return n > 0 && OneIn(n); }
+
+  // Returns random bool that is true for the given percentage of
+  // calls on average. Zero or less is always false and 100 or more
+  // is always true (may or may not consume a random value)
+  bool PercentTrue(int percentage) {
+    return static_cast<int>(Uniform(100)) < percentage;
+  }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) { return Uniform(1 << Uniform(max_log + 1)); }
+
+  // Returns a random string of length "len"
+  std::string RandomString(int len);
+
+  // Generates a random string of len bytes using human-readable characters
+  std::string HumanReadableString(int len);
+
+  // Generates a random binary data
+  std::string RandomBinaryString(int len);
+
+  // Returns a Random instance for use by the current thread without
+  // additional locking
+  static Random* GetTLSInstance();
+};
+
+// A good 32-bit random number generator based on std::mt19937.
+// This exists in part to avoid compiler variance in warning about coercing
+// uint_fast32_t from mt19937 to uint32_t.
+class Random32 {
+ private:
+  std::mt19937 generator_;
+
+ public:
+  explicit Random32(uint32_t s) : generator_(s) {}
+
+  // Generates the next random number
+  uint32_t Next() { return static_cast<uint32_t>(generator_()); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(uint32_t n) {
+    return static_cast<uint32_t>(
+        std::uniform_int_distribution<std::mt19937::result_type>(
+            0, n - 1)(generator_));
+  }
+
+  // Returns an *almost* uniformly distributed value in the range [0..n-1].
+  // Much faster than Uniform().
+  // REQUIRES: n > 0
+  uint32_t Uniformish(uint32_t n) {
+    // fastrange (without the header)
+    return static_cast<uint32_t>((uint64_t(generator_()) * uint64_t(n)) >> 32);
+  }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(uint32_t n) { return Uniform(n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) {
+    return Uniform(uint32_t{1} << Uniform(max_log + 1));
+  }
+
+  // Reset the seed of the generator to the given value
+  void Seed(uint32_t new_seed) { generator_.seed(new_seed); }
+};
+
+// A good 64-bit random number generator based on std::mt19937_64
+class Random64 {
+ private:
+  std::mt19937_64 generator_;
+
+ public:
+  explicit Random64(uint64_t s) : generator_(s) {}
+
+  // Generates the next random number
+  uint64_t Next() { return generator_(); }
+
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint64_t Uniform(uint64_t n) {
+    return std::uniform_int_distribution<uint64_t>(0, n - 1)(generator_);
+  }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(uint64_t n) { return Uniform(n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint64_t Skewed(int max_log) {
+    return Uniform(uint64_t(1) << Uniform(max_log + 1));
+  }
+};
+
+// A seeded replacement for removed std::random_shuffle
+template <class RandomIt>
+void RandomShuffle(RandomIt first, RandomIt last, uint32_t seed) {
+  std::mt19937 rng(seed);
+  std::shuffle(first, last, rng);
+}
+
+// A replacement for removed std::random_shuffle
+template <class RandomIt>
+void RandomShuffle(RandomIt first, RandomIt last) {
+  RandomShuffle(first, last, std::random_device{}());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/random_test.cc b/src/rocksdb/util/random_test.cc
new file mode 100644
index 000000000..1aa62c5da
--- /dev/null
+++ b/src/rocksdb/util/random_test.cc
@@ -0,0 +1,107 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/random.h"
+
+#include <cstring>
+#include <vector>
+
+#include "test_util/testharness.h"
+
+using ROCKSDB_NAMESPACE::Random;
+
+TEST(RandomTest, Uniform) {
+  const int average = 20;
+  for (uint32_t seed : {0, 1, 2, 37, 4096}) {
+    Random r(seed);
+    for (int range : {1, 2, 8, 12, 100}) {
+      std::vector<int> counts(range, 0);
+
+      for (int i = 0; i < range * average; ++i) {
+        ++counts.at(r.Uniform(range));
+      }
+      int max_variance = static_cast<int>(std::sqrt(range) * 2 + 4);
+      for (int i = 0; i < range; ++i) {
+        EXPECT_GE(counts[i], std::max(1, average - max_variance));
+        EXPECT_LE(counts[i], average + max_variance + 1);
+      }
+    }
+  }
+}
+
+TEST(RandomTest, OneIn) {
+  Random r(42);
+  for (int range : {1, 2, 8, 12, 100, 1234}) {
+    const int average = 100;
+    int count = 0;
+    for (int i = 0; i < average * range; ++i) {
+      if (r.OneIn(range)) {
+        ++count;
+      }
+    }
+    if (range == 1) {
+      EXPECT_EQ(count, average);
+    } else {
+      int max_variance = static_cast<int>(std::sqrt(average) * 1.5);
+      EXPECT_GE(count, average - max_variance);
+      EXPECT_LE(count, average + max_variance);
+    }
+  }
+}
+
+TEST(RandomTest, OneInOpt) {
+  Random r(42);
+  for (int range : {-12, 0, 1, 2, 8, 12, 100, 1234}) {
+    const int average = 100;
+    int count = 0;
+    for (int i = 0; i < average * range; ++i) {
+      if (r.OneInOpt(range)) {
+        ++count;
+      }
+    }
+    if (range < 1) {
+      EXPECT_EQ(count, 0);
+    } else if (range == 1) {
+      EXPECT_EQ(count, average);
+    } else {
+      int max_variance = static_cast<int>(std::sqrt(average) * 1.5);
+      EXPECT_GE(count, average - max_variance);
+      EXPECT_LE(count, average + max_variance);
+    }
+  }
+}
+
+TEST(RandomTest, PercentTrue) {
+  Random r(42);
+  for (int pct : {-12, 0, 1, 2, 10, 50, 90, 98, 99, 100, 1234}) {
+    const int samples = 10000;
+
+    int count = 0;
+    for (int i = 0; i < samples; ++i) {
+      if (r.PercentTrue(pct)) {
+        ++count;
+      }
+    }
+    if (pct <= 0) {
+      EXPECT_EQ(count, 0);
+    } else if (pct >= 100) {
+      EXPECT_EQ(count, samples);
+    } else {
+      int est = (count * 100 + (samples / 2)) / samples;
+      EXPECT_EQ(est, pct);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/rate_limiter.cc b/src/rocksdb/util/rate_limiter.cc
new file mode 100644
index 000000000..6bbcabfae
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter.cc
@@ -0,0 +1,378 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/rate_limiter.h"
+
+#include <algorithm>
+
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
+
+namespace ROCKSDB_NAMESPACE {
+size_t RateLimiter::RequestToken(size_t bytes, size_t alignment,
+                                 Env::IOPriority io_priority, Statistics* stats,
+                                 RateLimiter::OpType op_type) {
+  if (io_priority < Env::IO_TOTAL && IsRateLimited(op_type)) {
+    bytes = std::min(bytes, static_cast<size_t>(GetSingleBurstBytes()));
+
+    if (alignment > 0) {
+      // Here we may actually require more than burst and block
+      // as we can not write/read less than one page at a time on direct I/O
+      // thus we do not want to be strictly constrained by burst
+      bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
+    }
+    Request(bytes, io_priority, stats, op_type);
+  }
+  return bytes;
+}
+
+// Pending request
+struct GenericRateLimiter::Req {
+  explicit Req(int64_t _bytes, port::Mutex* _mu)
+      : request_bytes(_bytes), bytes(_bytes), cv(_mu), granted(false) {}
+  int64_t request_bytes;
+  int64_t bytes;
+  port::CondVar cv;
+  bool granted;
+};
+
+GenericRateLimiter::GenericRateLimiter(
+    int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness,
+    RateLimiter::Mode mode, const std::shared_ptr<SystemClock>& clock,
+    bool auto_tuned)
+    : RateLimiter(mode),
+      refill_period_us_(refill_period_us),
+      rate_bytes_per_sec_(auto_tuned ? rate_bytes_per_sec / 2
+                                     : rate_bytes_per_sec),
+      refill_bytes_per_period_(
+          CalculateRefillBytesPerPeriodLocked(rate_bytes_per_sec_)),
+      clock_(clock),
+      stop_(false),
+      exit_cv_(&request_mutex_),
+      requests_to_wait_(0),
+      available_bytes_(0),
+      next_refill_us_(NowMicrosMonotonicLocked()),
+      fairness_(fairness > 100 ? 100 : fairness),
+      rnd_((uint32_t)time(nullptr)),
+      wait_until_refill_pending_(false),
+      auto_tuned_(auto_tuned),
+      num_drains_(0),
+      max_bytes_per_sec_(rate_bytes_per_sec),
+      tuned_time_(NowMicrosMonotonicLocked()) {
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    total_requests_[i] = 0;
+    total_bytes_through_[i] = 0;
+  }
+}
+
+GenericRateLimiter::~GenericRateLimiter() {
+  MutexLock g(&request_mutex_);
+  stop_ = true;
+  std::deque<Req*>::size_type queues_size_sum = 0;
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    queues_size_sum += queue_[i].size();
+  }
+  requests_to_wait_ = static_cast<int32_t>(queues_size_sum);
+
+  for (int i = Env::IO_TOTAL - 1; i >= Env::IO_LOW; --i) {
+    std::deque<Req*> queue = queue_[i];
+    for (auto& r : queue) {
+      r->cv.Signal();
+    }
+  }
+
+  while (requests_to_wait_ > 0) {
+    exit_cv_.Wait();
+  }
+}
+
+// This API allows user to dynamically change rate limiter's bytes per second.
+void GenericRateLimiter::SetBytesPerSecond(int64_t bytes_per_second) {
+  MutexLock g(&request_mutex_);
+  SetBytesPerSecondLocked(bytes_per_second);
+}
+
+void GenericRateLimiter::SetBytesPerSecondLocked(int64_t bytes_per_second) {
+  assert(bytes_per_second > 0);
+  rate_bytes_per_sec_.store(bytes_per_second, std::memory_order_relaxed);
+  refill_bytes_per_period_.store(
+      CalculateRefillBytesPerPeriodLocked(bytes_per_second),
+      std::memory_order_relaxed);
+}
+
+void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri,
+                                 Statistics* stats) {
+  assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed));
+  bytes = std::max(static_cast<int64_t>(0), bytes);
+  TEST_SYNC_POINT("GenericRateLimiter::Request");
+  TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:1",
+                           &rate_bytes_per_sec_);
+  MutexLock g(&request_mutex_);
+
+  if (auto_tuned_) {
+    static const int kRefillsPerTune = 100;
+    std::chrono::microseconds now(NowMicrosMonotonicLocked());
+    if (now - tuned_time_ >=
+        kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) {
+      Status s = TuneLocked();
+      s.PermitUncheckedError();  //**TODO: What to do on error?
+    }
+  }
+
+  if (stop_) {
+    // It is now in the clean-up of ~GenericRateLimiter().
+    // Therefore any new incoming request will exit from here
+    // and not get satiesfied.
+    return;
+  }
+
+  ++total_requests_[pri];
+
+  if (available_bytes_ >= bytes) {
+    // Refill thread assigns quota and notifies requests waiting on
+    // the queue under mutex. So if we get here, that means nobody
+    // is waiting?
+    available_bytes_ -= bytes;
+    total_bytes_through_[pri] += bytes;
+    return;
+  }
+
+  // Request cannot be satisfied at this moment, enqueue
+  Req r(bytes, &request_mutex_);
+  queue_[pri].push_back(&r);
+  TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:PostEnqueueRequest",
+                           &request_mutex_);
+  // A thread representing a queued request coordinates with other such threads.
+  // There are two main duties.
+  //
+  // (1) Waiting for the next refill time.
+  // (2) Refilling the bytes and granting requests.
+  do {
+    int64_t time_until_refill_us = next_refill_us_ - NowMicrosMonotonicLocked();
+    if (time_until_refill_us > 0) {
+      if (wait_until_refill_pending_) {
+        // Somebody is performing (1). Trust we'll be woken up when our request
+        // is granted or we are needed for future duties.
+        r.cv.Wait();
+      } else {
+        // Whichever thread reaches here first performs duty (1) as described
+        // above.
+        int64_t wait_until = clock_->NowMicros() + time_until_refill_us;
+        RecordTick(stats, NUMBER_RATE_LIMITER_DRAINS);
+        ++num_drains_;
+        wait_until_refill_pending_ = true;
+        r.cv.TimedWait(wait_until);
+        TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:PostTimedWait",
+                                 &time_until_refill_us);
+        wait_until_refill_pending_ = false;
+      }
+    } else {
+      // Whichever thread reaches here first performs duty (2) as described
+      // above.
+      RefillBytesAndGrantRequestsLocked();
+      if (r.granted) {
+        // If there is any remaining requests, make sure there exists at least
+        // one candidate is awake for future duties by signaling a front request
+        // of a queue.
+        for (int i = Env::IO_TOTAL - 1; i >= Env::IO_LOW; --i) {
+          std::deque<Req*> queue = queue_[i];
+          if (!queue.empty()) {
+            queue.front()->cv.Signal();
+            break;
+          }
+        }
+      }
+    }
+    // Invariant: non-granted request is always in one queue, and granted
+    // request is always in zero queues.
+#ifndef NDEBUG
+    int num_found = 0;
+    for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+      if (std::find(queue_[i].begin(), queue_[i].end(), &r) !=
+          queue_[i].end()) {
+        ++num_found;
+      }
+    }
+    if (r.granted) {
+      assert(num_found == 0);
+    } else {
+      assert(num_found == 1);
+    }
+#endif  // NDEBUG
+  } while (!stop_ && !r.granted);
+
+  if (stop_) {
+    // It is now in the clean-up of ~GenericRateLimiter().
+    // Therefore any woken-up request will have come out of the loop and then
+    // exit here. It might or might not have been satisfied.
+    --requests_to_wait_;
+    exit_cv_.Signal();
+  }
+}
+
+std::vector<Env::IOPriority>
+GenericRateLimiter::GeneratePriorityIterationOrderLocked() {
+  std::vector<Env::IOPriority> pri_iteration_order(Env::IO_TOTAL /* 4 */);
+  // We make Env::IO_USER a superior priority by always iterating its queue
+  // first
+  pri_iteration_order[0] = Env::IO_USER;
+
+  bool high_pri_iterated_after_mid_low_pri = rnd_.OneIn(fairness_);
+  TEST_SYNC_POINT_CALLBACK(
+      "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+      "PostRandomOneInFairnessForHighPri",
+      &high_pri_iterated_after_mid_low_pri);
+  bool mid_pri_itereated_after_low_pri = rnd_.OneIn(fairness_);
+  TEST_SYNC_POINT_CALLBACK(
+      "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+      "PostRandomOneInFairnessForMidPri",
+      &mid_pri_itereated_after_low_pri);
+
+  if (high_pri_iterated_after_mid_low_pri) {
+    pri_iteration_order[3] = Env::IO_HIGH;
+    pri_iteration_order[2] =
+        mid_pri_itereated_after_low_pri ? Env::IO_MID : Env::IO_LOW;
+    pri_iteration_order[1] =
+        (pri_iteration_order[2] == Env::IO_MID) ? Env::IO_LOW : Env::IO_MID;
+  } else {
+    pri_iteration_order[1] = Env::IO_HIGH;
+    pri_iteration_order[3] =
+        mid_pri_itereated_after_low_pri ? Env::IO_MID : Env::IO_LOW;
+    pri_iteration_order[2] =
+        (pri_iteration_order[3] == Env::IO_MID) ? Env::IO_LOW : Env::IO_MID;
+  }
+
+  TEST_SYNC_POINT_CALLBACK(
+      "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+      "PreReturnPriIterationOrder",
+      &pri_iteration_order);
+  return pri_iteration_order;
+}
+
+void GenericRateLimiter::RefillBytesAndGrantRequestsLocked() {
+  TEST_SYNC_POINT_CALLBACK(
+      "GenericRateLimiter::RefillBytesAndGrantRequestsLocked", &request_mutex_);
+  next_refill_us_ = NowMicrosMonotonicLocked() + refill_period_us_;
+  // Carry over the left over quota from the last period
+  auto refill_bytes_per_period =
+      refill_bytes_per_period_.load(std::memory_order_relaxed);
+  if (available_bytes_ < refill_bytes_per_period) {
+    available_bytes_ += refill_bytes_per_period;
+  }
+
+  std::vector<Env::IOPriority> pri_iteration_order =
+      GeneratePriorityIterationOrderLocked();
+
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    assert(!pri_iteration_order.empty());
+    Env::IOPriority current_pri = pri_iteration_order[i];
+    auto* queue = &queue_[current_pri];
+    while (!queue->empty()) {
+      auto* next_req = queue->front();
+      if (available_bytes_ < next_req->request_bytes) {
+        // Grant partial request_bytes to avoid starvation of requests
+        // that become asking for more bytes than available_bytes_
+        // due to dynamically reduced rate limiter's bytes_per_second that
+        // leads to reduced refill_bytes_per_period hence available_bytes_
+        next_req->request_bytes -= available_bytes_;
+        available_bytes_ = 0;
+        break;
+      }
+      available_bytes_ -= next_req->request_bytes;
+      next_req->request_bytes = 0;
+      total_bytes_through_[current_pri] += next_req->bytes;
+      queue->pop_front();
+
+      next_req->granted = true;
+      // Quota granted, signal the thread to exit
+      next_req->cv.Signal();
+    }
+  }
+}
+
+int64_t GenericRateLimiter::CalculateRefillBytesPerPeriodLocked(
+    int64_t rate_bytes_per_sec) {
+  if (std::numeric_limits<int64_t>::max() / rate_bytes_per_sec <
+      refill_period_us_) {
+    // Avoid unexpected result in the overflow case. The result now is still
+    // inaccurate but is a number that is large enough.
+    return std::numeric_limits<int64_t>::max() / 1000000;
+  } else {
+    return rate_bytes_per_sec * refill_period_us_ / 1000000;
+  }
+}
+
+Status GenericRateLimiter::TuneLocked() {
+  const int kLowWatermarkPct = 50;
+  const int kHighWatermarkPct = 90;
+  const int kAdjustFactorPct = 5;
+  // computed rate limit will be in
+  // `[max_bytes_per_sec_ / kAllowedRangeFactor, max_bytes_per_sec_]`.
+  const int kAllowedRangeFactor = 20;
+
+  std::chrono::microseconds prev_tuned_time = tuned_time_;
+  tuned_time_ = std::chrono::microseconds(NowMicrosMonotonicLocked());
+
+  int64_t elapsed_intervals = (tuned_time_ - prev_tuned_time +
+                               std::chrono::microseconds(refill_period_us_) -
+                               std::chrono::microseconds(1)) /
+                              std::chrono::microseconds(refill_period_us_);
+  // We tune every kRefillsPerTune intervals, so the overflow and division-by-
+  // zero conditions should never happen.
+  assert(num_drains_ <= std::numeric_limits<int64_t>::max() / 100);
+  assert(elapsed_intervals > 0);
+  int64_t drained_pct = num_drains_ * 100 / elapsed_intervals;
+
+  int64_t prev_bytes_per_sec = GetBytesPerSecond();
+  int64_t new_bytes_per_sec;
+  if (drained_pct == 0) {
+    new_bytes_per_sec = max_bytes_per_sec_ / kAllowedRangeFactor;
+  } else if (drained_pct < kLowWatermarkPct) {
+    // sanitize to prevent overflow
+    int64_t sanitized_prev_bytes_per_sec =
+        std::min(prev_bytes_per_sec, std::numeric_limits<int64_t>::max() / 100);
+    new_bytes_per_sec =
+        std::max(max_bytes_per_sec_ / kAllowedRangeFactor,
+                 sanitized_prev_bytes_per_sec * 100 / (100 + kAdjustFactorPct));
+  } else if (drained_pct > kHighWatermarkPct) {
+    // sanitize to prevent overflow
+    int64_t sanitized_prev_bytes_per_sec =
+        std::min(prev_bytes_per_sec, std::numeric_limits<int64_t>::max() /
+                                         (100 + kAdjustFactorPct));
+    new_bytes_per_sec =
+        std::min(max_bytes_per_sec_,
+                 sanitized_prev_bytes_per_sec * (100 + kAdjustFactorPct) / 100);
+  } else {
+    new_bytes_per_sec = prev_bytes_per_sec;
+  }
+  if (new_bytes_per_sec != prev_bytes_per_sec) {
+    SetBytesPerSecondLocked(new_bytes_per_sec);
+  }
+  num_drains_ = 0;
+  return Status::OK();
+}
+
+RateLimiter* NewGenericRateLimiter(
+    int64_t rate_bytes_per_sec, int64_t refill_period_us /* = 100 * 1000 */,
+    int32_t fairness /* = 10 */,
+    RateLimiter::Mode mode /* = RateLimiter::Mode::kWritesOnly */,
+    bool auto_tuned /* = false */) {
+  assert(rate_bytes_per_sec > 0);
+  assert(refill_period_us > 0);
+  assert(fairness > 0);
+  std::unique_ptr<RateLimiter> limiter(
+      new GenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness,
+                             mode, SystemClock::Default(), auto_tuned));
+  return limiter.release();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/rate_limiter.h b/src/rocksdb/util/rate_limiter.h
new file mode 100644
index 000000000..4c078f5a0
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter.h
@@ -0,0 +1,146 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <deque>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class GenericRateLimiter : public RateLimiter {
+ public:
+  GenericRateLimiter(int64_t refill_bytes, int64_t refill_period_us,
+                     int32_t fairness, RateLimiter::Mode mode,
+                     const std::shared_ptr<SystemClock>& clock,
+                     bool auto_tuned);
+
+  virtual ~GenericRateLimiter();
+
+  // This API allows user to dynamically change rate limiter's bytes per second.
+  virtual void SetBytesPerSecond(int64_t bytes_per_second) override;
+
+  // Request for token to write bytes. If this request can not be satisfied,
+  // the call is blocked. Caller is responsible to make sure
+  // bytes <= GetSingleBurstBytes() and bytes >= 0. Negative bytes
+  // passed in will be rounded up to 0.
+  using RateLimiter::Request;
+  virtual void Request(const int64_t bytes, const Env::IOPriority pri,
+                       Statistics* stats) override;
+
+  virtual int64_t GetSingleBurstBytes() const override {
+    return refill_bytes_per_period_.load(std::memory_order_relaxed);
+  }
+
+  virtual int64_t GetTotalBytesThrough(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      int64_t total_bytes_through_sum = 0;
+      for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+        total_bytes_through_sum += total_bytes_through_[i];
+      }
+      return total_bytes_through_sum;
+    }
+    return total_bytes_through_[pri];
+  }
+
+  virtual int64_t GetTotalRequests(
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      int64_t total_requests_sum = 0;
+      for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+        total_requests_sum += total_requests_[i];
+      }
+      return total_requests_sum;
+    }
+    return total_requests_[pri];
+  }
+
+  virtual Status GetTotalPendingRequests(
+      int64_t* total_pending_requests,
+      const Env::IOPriority pri = Env::IO_TOTAL) const override {
+    assert(total_pending_requests != nullptr);
+    MutexLock g(&request_mutex_);
+    if (pri == Env::IO_TOTAL) {
+      int64_t total_pending_requests_sum = 0;
+      for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+        total_pending_requests_sum += static_cast<int64_t>(queue_[i].size());
+      }
+      *total_pending_requests = total_pending_requests_sum;
+    } else {
+      *total_pending_requests = static_cast<int64_t>(queue_[pri].size());
+    }
+    return Status::OK();
+  }
+
+  virtual int64_t GetBytesPerSecond() const override {
+    return rate_bytes_per_sec_.load(std::memory_order_relaxed);
+  }
+
+  virtual void TEST_SetClock(std::shared_ptr<SystemClock> clock) {
+    MutexLock g(&request_mutex_);
+    clock_ = std::move(clock);
+    next_refill_us_ = NowMicrosMonotonicLocked();
+  }
+
+ private:
+  void RefillBytesAndGrantRequestsLocked();
+  std::vector<Env::IOPriority> GeneratePriorityIterationOrderLocked();
+  int64_t CalculateRefillBytesPerPeriodLocked(int64_t rate_bytes_per_sec);
+  Status TuneLocked();
+  void SetBytesPerSecondLocked(int64_t bytes_per_second);
+
+  uint64_t NowMicrosMonotonicLocked() {
+    return clock_->NowNanos() / std::milli::den;
+  }
+
+  // This mutex guard all internal states
+  mutable port::Mutex request_mutex_;
+
+  const int64_t refill_period_us_;
+
+  std::atomic<int64_t> rate_bytes_per_sec_;
+  std::atomic<int64_t> refill_bytes_per_period_;
+  std::shared_ptr<SystemClock> clock_;
+
+  bool stop_;
+  port::CondVar exit_cv_;
+  int32_t requests_to_wait_;
+
+  int64_t total_requests_[Env::IO_TOTAL];
+  int64_t total_bytes_through_[Env::IO_TOTAL];
+  int64_t available_bytes_;
+  int64_t next_refill_us_;
+
+  int32_t fairness_;
+  Random rnd_;
+
+  struct Req;
+  std::deque<Req*> queue_[Env::IO_TOTAL];
+  bool wait_until_refill_pending_;
+
+  bool auto_tuned_;
+  int64_t num_drains_;
+  const int64_t max_bytes_per_sec_;
+  std::chrono::microseconds tuned_time_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/rate_limiter_test.cc b/src/rocksdb/util/rate_limiter_test.cc
new file mode 100644
index 000000000..cda134867
--- /dev/null
+++ b/src/rocksdb/util/rate_limiter_test.cc
@@ -0,0 +1,476 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/rate_limiter.h"
+
+#include <chrono>
+#include <cinttypes>
+#include <cstdint>
+#include <limits>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(yhchiang): the rate will not be accurate when we run test in parallel.
+class RateLimiterTest : public testing::Test {
+ protected:
+  ~RateLimiterTest() override {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+};
+
+TEST_F(RateLimiterTest, OverflowRate) {
+  GenericRateLimiter limiter(std::numeric_limits<int64_t>::max(), 1000, 10,
+                             RateLimiter::Mode::kWritesOnly,
+                             SystemClock::Default(), false /* auto_tuned */);
+  ASSERT_GT(limiter.GetSingleBurstBytes(), 1000000000ll);
+}
+
+TEST_F(RateLimiterTest, StartStop) {
+  std::unique_ptr<RateLimiter> limiter(NewGenericRateLimiter(100, 100, 10));
+}
+
+TEST_F(RateLimiterTest, GetTotalBytesThrough) {
+  std::unique_ptr<RateLimiter> limiter(NewGenericRateLimiter(
+      200 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */,
+      10 /* fairness */));
+  for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) {
+    ASSERT_EQ(limiter->GetTotalBytesThrough(static_cast<Env::IOPriority>(i)),
+              0);
+  }
+
+  std::int64_t request_byte = 200;
+  std::int64_t request_byte_sum = 0;
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    limiter->Request(request_byte, static_cast<Env::IOPriority>(i),
+                     nullptr /* stats */, RateLimiter::OpType::kWrite);
+    request_byte_sum += request_byte;
+  }
+
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    EXPECT_EQ(limiter->GetTotalBytesThrough(static_cast<Env::IOPriority>(i)),
+              request_byte)
+        << "Failed to track total_bytes_through_ correctly when IOPriority = "
+        << static_cast<Env::IOPriority>(i);
+  }
+  EXPECT_EQ(limiter->GetTotalBytesThrough(Env::IO_TOTAL), request_byte_sum)
+      << "Failed to track total_bytes_through_ correctly when IOPriority = "
+         "Env::IO_TOTAL";
+}
+
+TEST_F(RateLimiterTest, GetTotalRequests) {
+  std::unique_ptr<RateLimiter> limiter(NewGenericRateLimiter(
+      200 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */,
+      10 /* fairness */));
+  for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) {
+    ASSERT_EQ(limiter->GetTotalRequests(static_cast<Env::IOPriority>(i)), 0);
+  }
+
+  std::int64_t total_requests_sum = 0;
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    limiter->Request(200, static_cast<Env::IOPriority>(i), nullptr /* stats */,
+                     RateLimiter::OpType::kWrite);
+    total_requests_sum += 1;
+  }
+
+  for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
+    EXPECT_EQ(limiter->GetTotalRequests(static_cast<Env::IOPriority>(i)), 1)
+        << "Failed to track total_requests_ correctly when IOPriority = "
+        << static_cast<Env::IOPriority>(i);
+  }
+  EXPECT_EQ(limiter->GetTotalRequests(Env::IO_TOTAL), total_requests_sum)
+      << "Failed to track total_requests_ correctly when IOPriority = "
+         "Env::IO_TOTAL";
+}
+
+TEST_F(RateLimiterTest, GetTotalPendingRequests) {
+  std::unique_ptr<RateLimiter> limiter(NewGenericRateLimiter(
+      200 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */,
+      10 /* fairness */));
+  int64_t total_pending_requests = 0;
+  for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) {
+    ASSERT_OK(limiter->GetTotalPendingRequests(
+        &total_pending_requests, static_cast<Env::IOPriority>(i)));
+    ASSERT_EQ(total_pending_requests, 0);
+  }
+  // This is a variable for making sure the following callback is called
+  // and the assertions in it are indeed excuted
+  bool nonzero_pending_requests_verified = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Request:PostEnqueueRequest", [&](void* arg) {
+        port::Mutex* request_mutex = (port::Mutex*)arg;
+        // We temporarily unlock the mutex so that the following
+        // GetTotalPendingRequests() can acquire it
+        request_mutex->Unlock();
+        for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) {
+          EXPECT_OK(limiter->GetTotalPendingRequests(
+              &total_pending_requests, static_cast<Env::IOPriority>(i)))
+              << "Failed to return total pending requests for priority level = "
+              << static_cast<Env::IOPriority>(i);
+          if (i == Env::IO_USER || i == Env::IO_TOTAL) {
+            EXPECT_EQ(total_pending_requests, 1)
+                << "Failed to correctly return total pending requests for "
+                   "priority level = "
+                << static_cast<Env::IOPriority>(i);
+          } else {
+            EXPECT_EQ(total_pending_requests, 0)
+                << "Failed to correctly return total pending requests for "
+                   "priority level = "
+                << static_cast<Env::IOPriority>(i);
+          }
+        }
+        // We lock the mutex again so that the request thread can resume running
+        // with the mutex locked
+        request_mutex->Lock();
+        nonzero_pending_requests_verified = true;
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  limiter->Request(200, Env::IO_USER, nullptr /* stats */,
+                   RateLimiter::OpType::kWrite);
+  ASSERT_EQ(nonzero_pending_requests_verified, true);
+  for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) {
+    EXPECT_OK(limiter->GetTotalPendingRequests(&total_pending_requests,
+                                               static_cast<Env::IOPriority>(i)))
+        << "Failed to return total pending requests for priority level = "
+        << static_cast<Env::IOPriority>(i);
+    EXPECT_EQ(total_pending_requests, 0)
+        << "Failed to correctly return total pending requests for priority "
+           "level = "
+        << static_cast<Env::IOPriority>(i);
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearCallBack(
+      "GenericRateLimiter::Request:PostEnqueueRequest");
+}
+
+TEST_F(RateLimiterTest, Modes) {
+  for (auto mode : {RateLimiter::Mode::kWritesOnly,
+                    RateLimiter::Mode::kReadsOnly, RateLimiter::Mode::kAllIo}) {
+    GenericRateLimiter limiter(2000 /* rate_bytes_per_sec */,
+                               1000 * 1000 /* refill_period_us */,
+                               10 /* fairness */, mode, SystemClock::Default(),
+                               false /* auto_tuned */);
+    limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */,
+                    RateLimiter::OpType::kRead);
+    if (mode == RateLimiter::Mode::kWritesOnly) {
+      ASSERT_EQ(0, limiter.GetTotalBytesThrough(Env::IO_HIGH));
+    } else {
+      ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH));
+    }
+
+    limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */,
+                    RateLimiter::OpType::kWrite);
+    if (mode == RateLimiter::Mode::kAllIo) {
+      ASSERT_EQ(2000, limiter.GetTotalBytesThrough(Env::IO_HIGH));
+    } else {
+      ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH));
+    }
+  }
+}
+
+TEST_F(RateLimiterTest, GeneratePriorityIterationOrder) {
+  std::unique_ptr<RateLimiter> limiter(NewGenericRateLimiter(
+      200 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */,
+      10 /* fairness */));
+
+  bool possible_random_one_in_fairness_results_for_high_mid_pri[4][2] = {
+      {false, false}, {false, true}, {true, false}, {true, true}};
+  std::vector<Env::IOPriority> possible_priority_iteration_orders[4] = {
+      {Env::IO_USER, Env::IO_HIGH, Env::IO_MID, Env::IO_LOW},
+      {Env::IO_USER, Env::IO_HIGH, Env::IO_LOW, Env::IO_MID},
+      {Env::IO_USER, Env::IO_MID, Env::IO_LOW, Env::IO_HIGH},
+      {Env::IO_USER, Env::IO_LOW, Env::IO_MID, Env::IO_HIGH}};
+
+  for (int i = 0; i < 4; ++i) {
+    // These are variables for making sure the following callbacks are called
+    // and the assertion in the last callback is indeed excuted
+    bool high_pri_iterated_after_mid_low_pri_set = false;
+    bool mid_pri_itereated_after_low_pri_set = false;
+    bool pri_iteration_order_verified = false;
+    SyncPoint::GetInstance()->SetCallBack(
+        "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+        "PostRandomOneInFairnessForHighPri",
+        [&](void* arg) {
+          bool* high_pri_iterated_after_mid_low_pri = (bool*)arg;
+          *high_pri_iterated_after_mid_low_pri =
+              possible_random_one_in_fairness_results_for_high_mid_pri[i][0];
+          high_pri_iterated_after_mid_low_pri_set = true;
+        });
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+        "PostRandomOneInFairnessForMidPri",
+        [&](void* arg) {
+          bool* mid_pri_itereated_after_low_pri = (bool*)arg;
+          *mid_pri_itereated_after_low_pri =
+              possible_random_one_in_fairness_results_for_high_mid_pri[i][1];
+          mid_pri_itereated_after_low_pri_set = true;
+        });
+
+    SyncPoint::GetInstance()->SetCallBack(
+        "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+        "PreReturnPriIterationOrder",
+        [&](void* arg) {
+          std::vector<Env::IOPriority>* pri_iteration_order =
+              (std::vector<Env::IOPriority>*)arg;
+          EXPECT_EQ(*pri_iteration_order, possible_priority_iteration_orders[i])
+              << "Failed to generate priority iteration order correctly when "
+                 "high_pri_iterated_after_mid_low_pri = "
+              << possible_random_one_in_fairness_results_for_high_mid_pri[i][0]
+              << ", mid_pri_itereated_after_low_pri = "
+              << possible_random_one_in_fairness_results_for_high_mid_pri[i][1]
+              << std::endl;
+          pri_iteration_order_verified = true;
+        });
+
+    SyncPoint::GetInstance()->EnableProcessing();
+    limiter->Request(200 /* request max bytes to drain so that refill and order
+                           generation will be triggered every time
+                           GenericRateLimiter::Request() is called */
+                     ,
+                     Env::IO_USER, nullptr /* stats */,
+                     RateLimiter::OpType::kWrite);
+    ASSERT_EQ(high_pri_iterated_after_mid_low_pri_set, true);
+    ASSERT_EQ(mid_pri_itereated_after_low_pri_set, true);
+    ASSERT_EQ(pri_iteration_order_verified, true);
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearCallBack(
+        "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+        "PreReturnPriIterationOrder");
+    SyncPoint::GetInstance()->ClearCallBack(
+        "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+        "PostRandomOneInFairnessForMidPri");
+    SyncPoint::GetInstance()->ClearCallBack(
+        "GenericRateLimiter::GeneratePriorityIterationOrderLocked::"
+        "PostRandomOneInFairnessForHighPri");
+  }
+}
+
+TEST_F(RateLimiterTest, Rate) {
+  auto* env = Env::Default();
+  struct Arg {
+    Arg(int32_t _target_rate, int _burst)
+        : limiter(NewGenericRateLimiter(_target_rate /* rate_bytes_per_sec */,
+                                        100 * 1000 /* refill_period_us */,
+                                        10 /* fairness */)),
+          request_size(_target_rate /
+                       10 /* refill period here is 1/10 second */),
+          burst(_burst) {}
+    std::unique_ptr<RateLimiter> limiter;
+    int32_t request_size;
+    int burst;
+  };
+
+  auto writer = [](void* p) {
+    const auto& thread_clock = SystemClock::Default();
+    auto* arg = static_cast<Arg*>(p);
+    // Test for 2 seconds
+    auto until = thread_clock->NowMicros() + 2 * 1000000;
+    Random r((uint32_t)(thread_clock->NowNanos() %
+                        std::numeric_limits<uint32_t>::max()));
+    while (thread_clock->NowMicros() < until) {
+      for (int i = 0; i < static_cast<int>(r.Skewed(arg->burst * 2) + 1); ++i) {
+        arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1,
+                              Env::IO_USER, nullptr /* stats */,
+                              RateLimiter::OpType::kWrite);
+      }
+
+      for (int i = 0; i < static_cast<int>(r.Skewed(arg->burst) + 1); ++i) {
+        arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1,
+                              Env::IO_HIGH, nullptr /* stats */,
+                              RateLimiter::OpType::kWrite);
+      }
+
+      for (int i = 0; i < static_cast<int>(r.Skewed(arg->burst / 2 + 1) + 1);
+           ++i) {
+        arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_MID,
+                              nullptr /* stats */, RateLimiter::OpType::kWrite);
+      }
+
+      arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_LOW,
+                            nullptr /* stats */, RateLimiter::OpType::kWrite);
+    }
+  };
+
+  int samples = 0;
+  int samples_at_minimum = 0;
+
+  for (int i = 1; i <= 16; i *= 2) {
+    int32_t target = i * 1024 * 10;
+    Arg arg(target, i / 4 + 1);
+    int64_t old_total_bytes_through = 0;
+    for (int iter = 1; iter <= 2; ++iter) {
+      // second iteration changes the target dynamically
+      if (iter == 2) {
+        target *= 2;
+        arg.limiter->SetBytesPerSecond(target);
+      }
+      auto start = env->NowMicros();
+      for (int t = 0; t < i; ++t) {
+        env->StartThread(writer, &arg);
+      }
+      env->WaitForJoin();
+
+      auto elapsed = env->NowMicros() - start;
+      double rate =
+          (arg.limiter->GetTotalBytesThrough() - old_total_bytes_through) *
+          1000000.0 / elapsed;
+      old_total_bytes_through = arg.limiter->GetTotalBytesThrough();
+      fprintf(stderr,
+              "request size [1 - %" PRIi32 "], limit %" PRIi32
+              " KB/sec, actual rate: %lf KB/sec, elapsed %.2lf seconds\n",
+              arg.request_size - 1, target / 1024, rate / 1024,
+              elapsed / 1000000.0);
+
+      ++samples;
+      if (rate / target >= 0.80) {
+        ++samples_at_minimum;
+      }
+      ASSERT_LE(rate / target, 1.25);
+    }
+  }
+
+  // This can fail due to slow execution speed, like when using valgrind or in
+  // heavily loaded CI environments
+  bool skip_minimum_rate_check =
+#if (defined(CIRCLECI) && defined(OS_MACOSX)) || defined(ROCKSDB_VALGRIND_RUN)
+      true;
+#else
+      getenv("SANDCASTLE");
+#endif
+  if (skip_minimum_rate_check) {
+    fprintf(stderr, "Skipped minimum rate check (%d / %d passed)\n",
+            samples_at_minimum, samples);
+  } else {
+    ASSERT_EQ(samples_at_minimum, samples);
+  }
+}
+
+TEST_F(RateLimiterTest, LimitChangeTest) {
+  // starvation test when limit changes to a smaller value
+  int64_t refill_period = 1000 * 1000;
+  auto* env = Env::Default();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  struct Arg {
+    Arg(int32_t _request_size, Env::IOPriority _pri,
+        std::shared_ptr<RateLimiter> _limiter)
+        : request_size(_request_size), pri(_pri), limiter(_limiter) {}
+    int32_t request_size;
+    Env::IOPriority pri;
+    std::shared_ptr<RateLimiter> limiter;
+  };
+
+  auto writer = [](void* p) {
+    auto* arg = static_cast<Arg*>(p);
+    arg->limiter->Request(arg->request_size, arg->pri, nullptr /* stats */,
+                          RateLimiter::OpType::kWrite);
+  };
+
+  for (uint32_t i = 1; i <= 16; i <<= 1) {
+    int32_t target = i * 1024 * 10;
+    // refill per second
+    for (int iter = 0; iter < 2; iter++) {
+      std::shared_ptr<RateLimiter> limiter =
+          std::make_shared<GenericRateLimiter>(
+              target, refill_period, 10, RateLimiter::Mode::kWritesOnly,
+              SystemClock::Default(), false /* auto_tuned */);
+      // After "GenericRateLimiter::Request:1" the mutex is held until the bytes
+      // are refilled. This test could be improved to change the limit when lock
+      // is released in `TimedWait()`.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"GenericRateLimiter::Request",
+            "RateLimiterTest::LimitChangeTest:changeLimitStart"},
+           {"RateLimiterTest::LimitChangeTest:changeLimitEnd",
+            "GenericRateLimiter::Request:1"}});
+      Arg arg(target, Env::IO_HIGH, limiter);
+      // The idea behind is to start a request first, then before it refills,
+      // update limit to a different value (2X/0.5X). No starvation should
+      // be guaranteed under any situation
+      // TODO(lightmark): more test cases are welcome.
+      env->StartThread(writer, &arg);
+      int32_t new_limit = (target << 1) >> (iter << 1);
+      TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitStart");
+      arg.limiter->SetBytesPerSecond(new_limit);
+      TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitEnd");
+      env->WaitForJoin();
+      fprintf(stderr,
+              "[COMPLETE] request size %" PRIi32 " KB, new limit %" PRIi32
+              "KB/sec, refill period %" PRIi64 " ms\n",
+              target / 1024, new_limit / 1024, refill_period / 1000);
+    }
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) {
+  const std::chrono::seconds kTimePerRefill(1);
+  const int kRefillsPerTune = 100;  // needs to match util/rate_limiter.cc
+
+  SpecialEnv special_env(Env::Default(), /*time_elapse_only_sleep*/ true);
+
+  auto stats = CreateDBStatistics();
+  std::unique_ptr<RateLimiter> rate_limiter(new GenericRateLimiter(
+      1000 /* rate_bytes_per_sec */,
+      std::chrono::microseconds(kTimePerRefill).count(), 10 /* fairness */,
+      RateLimiter::Mode::kWritesOnly, special_env.GetSystemClock(),
+      true /* auto_tuned */));
+
+  // Rate limiter uses `CondVar::TimedWait()`, which does not have access to the
+  // `Env` to advance its time according to the fake wait duration. The
+  // workaround is to install a callback that advance the `Env`'s mock time.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) {
+        int64_t time_waited_us = *static_cast<int64_t*>(arg);
+        special_env.SleepForMicroseconds(static_cast<int>(time_waited_us));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // verify rate limit increases after a sequence of periods where rate limiter
+  // is always drained
+  int64_t orig_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+  rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(),
+                        RateLimiter::OpType::kWrite);
+  while (std::chrono::microseconds(special_env.NowMicros()) <=
+         kRefillsPerTune * kTimePerRefill) {
+    rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(),
+                          RateLimiter::OpType::kWrite);
+  }
+  int64_t new_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+  ASSERT_GT(new_bytes_per_sec, orig_bytes_per_sec);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "GenericRateLimiter::Request:PostTimedWait");
+
+  // decreases after a sequence of periods where rate limiter is not drained
+  orig_bytes_per_sec = new_bytes_per_sec;
+  special_env.SleepForMicroseconds(static_cast<int>(
+      kRefillsPerTune * std::chrono::microseconds(kTimePerRefill).count()));
+  // make a request so tuner can be triggered
+  rate_limiter->Request(1 /* bytes */, Env::IO_HIGH, stats.get(),
+                        RateLimiter::OpType::kWrite);
+  new_bytes_per_sec = rate_limiter->GetSingleBurstBytes();
+  ASSERT_LT(new_bytes_per_sec, orig_bytes_per_sec);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/repeatable_thread.h b/src/rocksdb/util/repeatable_thread.h
new file mode 100644
index 000000000..c75ad7c49
--- /dev/null
+++ b/src/rocksdb/util/repeatable_thread.h
@@ -0,0 +1,149 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+#include <string>
+
+#include "monitoring/instrumented_mutex.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple wrapper around port::Thread that supports calling a callback every
+// X seconds. If you pass in 0, then it will call your callback repeatedly
+// without delay.
+class RepeatableThread {
+ public:
+  RepeatableThread(std::function<void()> function,
+                   const std::string& thread_name, SystemClock* clock,
+                   uint64_t delay_us, uint64_t initial_delay_us = 0)
+      : function_(function),
+        thread_name_("rocksdb:" + thread_name),
+        clock_(clock),
+        delay_us_(delay_us),
+        initial_delay_us_(initial_delay_us),
+        mutex_(clock),
+        cond_var_(&mutex_),
+        running_(true),
+#ifndef NDEBUG
+        waiting_(false),
+        run_count_(0),
+#endif
+        thread_([this] { thread(); }) {
+  }
+
+  void cancel() {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      if (!running_) {
+        return;
+      }
+      running_ = false;
+      cond_var_.SignalAll();
+    }
+    thread_.join();
+  }
+
+  bool IsRunning() { return running_; }
+
+  ~RepeatableThread() { cancel(); }
+
+#ifndef NDEBUG
+  // Wait until RepeatableThread starting waiting, call the optional callback,
+  // then wait for one run of RepeatableThread. Tests can use provide a
+  // custom clock object to mock time, and use the callback here to bump current
+  // time and trigger RepeatableThread. See repeatable_thread_test for example.
+  //
+  // Note: only support one caller of this method.
+  void TEST_WaitForRun(std::function<void()> callback = nullptr) {
+    InstrumentedMutexLock l(&mutex_);
+    while (!waiting_) {
+      cond_var_.Wait();
+    }
+    uint64_t prev_count = run_count_;
+    if (callback != nullptr) {
+      callback();
+    }
+    cond_var_.SignalAll();
+    while (!(run_count_ > prev_count)) {
+      cond_var_.Wait();
+    }
+  }
+#endif
+
+ private:
+  bool wait(uint64_t delay) {
+    InstrumentedMutexLock l(&mutex_);
+    if (running_ && delay > 0) {
+      uint64_t wait_until = clock_->NowMicros() + delay;
+#ifndef NDEBUG
+      waiting_ = true;
+      cond_var_.SignalAll();
+#endif
+      while (running_) {
+        cond_var_.TimedWait(wait_until);
+        if (clock_->NowMicros() >= wait_until) {
+          break;
+        }
+      }
+#ifndef NDEBUG
+      waiting_ = false;
+#endif
+    }
+    return running_;
+  }
+
+  void thread() {
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+    // Set thread name.
+    auto thread_handle = thread_.native_handle();
+    int ret __attribute__((__unused__)) =
+        pthread_setname_np(thread_handle, thread_name_.c_str());
+    assert(ret == 0);
+#endif
+#endif
+
+    assert(delay_us_ > 0);
+    if (!wait(initial_delay_us_)) {
+      return;
+    }
+    do {
+      function_();
+#ifndef NDEBUG
+      {
+        InstrumentedMutexLock l(&mutex_);
+        run_count_++;
+        cond_var_.SignalAll();
+      }
+#endif
+    } while (wait(delay_us_));
+  }
+
+  const std::function<void()> function_;
+  const std::string thread_name_;
+  SystemClock* clock_;
+  const uint64_t delay_us_;
+  const uint64_t initial_delay_us_;
+
+  // Mutex lock should be held when accessing running_, waiting_
+  // and run_count_.
+  InstrumentedMutex mutex_;
+  InstrumentedCondVar cond_var_;
+  bool running_;
+#ifndef NDEBUG
+  // RepeatableThread waiting for timeout.
+  bool waiting_;
+  // Times function_ had run.
+  uint64_t run_count_;
+#endif
+  port::Thread thread_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/repeatable_thread_test.cc b/src/rocksdb/util/repeatable_thread_test.cc
new file mode 100644
index 000000000..0b3e95464
--- /dev/null
+++ b/src/rocksdb/util/repeatable_thread_test.cc
@@ -0,0 +1,111 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/repeatable_thread.h"
+
+#include <atomic>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "test_util/mock_time_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+
+class RepeatableThreadTest : public testing::Test {
+ public:
+  RepeatableThreadTest()
+      : mock_clock_(std::make_shared<ROCKSDB_NAMESPACE::MockSystemClock>(
+            ROCKSDB_NAMESPACE::SystemClock::Default())) {}
+
+ protected:
+  std::shared_ptr<ROCKSDB_NAMESPACE::MockSystemClock> mock_clock_;
+};
+
+TEST_F(RepeatableThreadTest, TimedTest) {
+  constexpr uint64_t kSecond = 1000000;  // 1s = 1000000us
+  constexpr int kIteration = 3;
+  const auto& clock = ROCKSDB_NAMESPACE::SystemClock::Default();
+  ROCKSDB_NAMESPACE::port::Mutex mutex;
+  ROCKSDB_NAMESPACE::port::CondVar test_cv(&mutex);
+  int count = 0;
+  uint64_t prev_time = clock->NowMicros();
+  ROCKSDB_NAMESPACE::RepeatableThread thread(
+      [&] {
+        ROCKSDB_NAMESPACE::MutexLock l(&mutex);
+        count++;
+        uint64_t now = clock->NowMicros();
+        assert(count == 1 || prev_time + 1 * kSecond <= now);
+        prev_time = now;
+        if (count >= kIteration) {
+          test_cv.SignalAll();
+        }
+      },
+      "rt_test", clock.get(), 1 * kSecond);
+  // Wait for execution finish.
+  {
+    ROCKSDB_NAMESPACE::MutexLock l(&mutex);
+    while (count < kIteration) {
+      test_cv.Wait();
+    }
+  }
+
+  // Test cancel
+  thread.cancel();
+}
+
+TEST_F(RepeatableThreadTest, MockEnvTest) {
+  constexpr uint64_t kSecond = 1000000;  // 1s = 1000000us
+  constexpr int kIteration = 3;
+  mock_clock_->SetCurrentTime(0);  // in seconds
+  std::atomic<int> count{0};
+
+#if defined(OS_MACOSX) && !defined(NDEBUG)
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+        // Obtain the current (real) time in seconds and add 1000 extra seconds
+        // to ensure that RepeatableThread::wait invokes TimedWait with a time
+        // greater than (real) current time. This is to prevent the TimedWait
+        // function from returning immediately without sleeping and releasing
+        // the mutex on certain platforms, e.g. OS X. If TimedWait returns
+        // immediately, the mutex will not be released, and
+        // RepeatableThread::TEST_WaitForRun never has a chance to execute the
+        // callback which, in this case, updates the result returned by
+        // mock_clock->NowMicros. Consequently, RepeatableThread::wait cannot
+        // break out of the loop, causing test to hang. The extra 1000 seconds
+        // is a best-effort approach because there seems no reliable and
+        // deterministic way to provide the aforementioned guarantee. By the
+        // time RepeatableThread::wait is called, it is no guarantee that the
+        // delay + mock_clock->NowMicros will be greater than the current real
+        // time. However, 1000 seconds should be sufficient in most cases.
+        uint64_t time_us = *reinterpret_cast<uint64_t*>(arg);
+        if (time_us < mock_clock_->RealNowMicros()) {
+          *reinterpret_cast<uint64_t*>(arg) =
+              mock_clock_->RealNowMicros() + 1000;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+#endif  // OS_MACOSX && !NDEBUG
+
+  ROCKSDB_NAMESPACE::RepeatableThread thread(
+      [&] { count++; }, "rt_test", mock_clock_.get(), 1 * kSecond, 1 * kSecond);
+  for (int i = 1; i <= kIteration; i++) {
+    // Bump current time
+    thread.TEST_WaitForRun([&] { mock_clock_->SetCurrentTime(i); });
+  }
+  // Test function should be exectued exactly kIteraion times.
+  ASSERT_EQ(kIteration, count.load());
+
+  // Test cancel
+  thread.cancel();
+}
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/ribbon_alg.h b/src/rocksdb/util/ribbon_alg.h
new file mode 100644
index 000000000..f9afefc23
--- /dev/null
+++ b/src/rocksdb/util/ribbon_alg.h
@@ -0,0 +1,1225 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/math128.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace ribbon {
+
+// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
+//
+// ribbon_alg.h: generic versions of core algorithms.
+//
+// Ribbon is a Perfect Hash Static Function construction useful as a compact
+// static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear
+// system construction that approximates a Band Matrix with hashing,
+// (b) an incremental, on-the-fly Gaussian Elimination algorithm that is
+// remarkably efficient and adaptable at constructing an upper-triangular
+// band matrix from a set of band-approximating inputs from (a), and
+// (c) a storage layout that is fast and adaptable as a filter.
+//
+// Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices
+// with One Short Random Block per Row, with Applications" by Stefan
+// Walzer and Martin Dietzfelbinger ("DW paper")
+// (b) developed by Peter C. Dillinger, though not the first on-the-fly
+// GE algorithm. See "On the fly Gaussian Elimination for LT codes" by
+// Bioglio, Grangetto, Gaeta, and Sereno.
+// (c) see "interleaved" solution storage below.
+//
+// See ribbon_impl.h for high-level behavioral summary. This file focuses
+// on the core design details.
+//
+// ######################################################################
+// ################# PHSF -> static filter reduction ####################
+//
+// A Perfect Hash Static Function is a data structure representing a
+// map from anything hashable (a "key") to values of some fixed size.
+// Crucially, it is allowed to return garbage values for anything not in
+// the original set of map keys, and it is a "static" structure: entries
+// cannot be added or deleted after construction. PHSFs representing n
+// mappings to b-bit values (assume uniformly distributed) require at least
+// n * b bits to represent, or at least b bits per entry. We typically
+// describe the compactness of a PHSF by typical bits per entry as some
+// function of b. For example, the MWHC construction (k=3 "peeling")
+// requires about 1.0222*b and a variant called Xor+ requires about
+// 1.08*b + 0.5 bits per entry.
+//
+// With more hashing, a PHSF can over-approximate a set as a Bloom filter
+// does, with no FN queries and predictable false positive (FP) query
+// rate. Instead of the user providing a value to map each input key to,
+// a hash function provides the value. Keys in the original set will
+// return a positive membership query because the underlying PHSF returns
+// the same value as hashing the key. When a key is not in the original set,
+// the PHSF returns a "garbage" value, which is only equal to the key's
+// hash with (false positive) probability 1 in 2^b.
+//
+// For a matching false positive rate, standard Bloom filters require
+// 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h)
+// require a bit more, around 1.5*b bits per entry. Thus, a Bloom
+// alternative could save up to or nearly 1/3rd of memory and storage
+// that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter
+// is dynamic.)
+//
+// Recommended reading:
+// "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters"
+// by Graf and Lemire
+// First three sections of "Fast Scalable Construction of (Minimal
+// Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna
+//
+// ######################################################################
+// ################## PHSF vs. hash table vs. Bloom #####################
+//
+// You can think of traditional hash tables and related filter variants
+// such as Cuckoo filters as utilizing an "OR" construction: a hash
+// function associates a key with some slots and the data is returned if
+// the data is found in any one of those slots. The collision resolution
+// is visible in the final data structure and requires extra information.
+// For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and
+// Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data
+// structure associates each input key with data in one slot, the
+// structure implicitly constructs a (near-)minimal (near-)perfect hash
+// (MPH) of the keys, which requires at least 1.44 bits per key to
+// represent. This is why approaches with visible collision resolution
+// have a fixed + 1.5 or more in storage overhead per entry, often in
+// addition to an overhead multiplier on b.
+//
+// By contrast Bloom filters utilize an "AND" construction: a query only
+// returns true if all bit positions associated with a key are set to 1.
+// There is no collision resolution, so Bloom filters do not suffer a
+// fixed bits per entry overhead like the above structures.
+//
+// PHSFs typically use a bitwise XOR construction: the data you want is
+// not in a single slot, but in a linear combination of several slots.
+// For static data, this gives the best of "AND" and "OR" constructions:
+// avoids the +1.44 or more fixed overhead by not approximating a MPH and
+// can do much better than Bloom's 1.44 factor on b with collision
+// resolution, which here is done ahead of time and invisible at query
+// time.
+//
+// ######################################################################
+// ######################## PHSF construction ###########################
+//
+// For a typical PHSF, construction is solving a linear system of
+// equations, typically in GF(2), which is to say that values are boolean
+// and XOR serves both as addition and subtraction. We can use matrices to
+// represent the problem:
+//
+//    C    *    S    =    R
+// (n x m)   (m x b)   (n x b)
+// where C = coefficients, S = solution, R = results
+// and solving for S given C and R.
+//
+// Note that C and R each have n rows, one for each input entry for the
+// PHSF. A row in C is given by a hash function on the PHSF input key,
+// and the corresponding row in R is the b-bit value to associate with
+// that input key. (In a filter, rows of R are given by another hash
+// function on the input key.)
+//
+// On solving, the matrix S (solution) is the final PHSF data, as it
+// maps any row from the original C to its corresponding desired result
+// in R. We just have to hash our query inputs and compute a linear
+// combination of rows in S.
+//
+// In theory, we could chose m = n and let a hash function associate
+// each input key with random rows in C. A solution exists with high
+// probability, and uses essentially minimum space, b bits per entry
+// (because we set m = n) but this has terrible scaling, something
+// like O(n^2) space and O(n^3) time during construction (Gaussian
+// elimination) and O(n) query time. But computational efficiency is
+// key, and the core of this is avoiding scanning all of S to answer
+// each query.
+//
+// The traditional approach (MWHC, aka Xor filter) starts with setting
+// only some small fixed number of columns (typically k=3) to 1 for each
+// row of C, with remaining entries implicitly 0. This is implemented as
+// three hash functions over [0,m), and S can be implemented as a vector
+// of b-bit values. Now, a query only involves looking up k rows
+// (values) in S and computing their bitwise XOR. Additionally, this
+// construction can use a linear time algorithm called "peeling" for
+// finding a solution in many cases of one existing, but peeling
+// generally requires a larger space overhead factor in the solution
+// (m/n) than is required with Gaussian elimination.
+//
+// Recommended reading:
+// "Peeling Close to the Orientability Threshold - Spatial Coupling in
+// Hashing-Based Data Structures" by Stefan Walzer
+//
+// ######################################################################
+// ##################### Ribbon PHSF construction #######################
+//
+// Ribbon constructs coefficient rows essentially the same as in the
+// Walzer/Dietzfelbinger paper cited above: for some chosen fixed width
+// r (kCoeffBits in code), each key is hashed to a starting column in
+// [0, m - r] (GetStart() in code) and an r-bit sequence of boolean
+// coefficients (GetCoeffRow() in code). If you sort the rows by start,
+// the C matrix would look something like this:
+//
+// [####00000000000000000000]
+// [####00000000000000000000]
+// [000####00000000000000000]
+// [0000####0000000000000000]
+// [0000000####0000000000000]
+// [000000000####00000000000]
+// [000000000####00000000000]
+// [0000000000000####0000000]
+// [0000000000000000####0000]
+// [00000000000000000####000]
+// [00000000000000000000####]
+//
+// where each # could be a 0 or 1, chosen uniformly by a hash function.
+// (Except we typically set the start column value to 1.) This scheme
+// uses hashing to approximate a band matrix, and it has a solution iff
+// it reduces to an upper-triangular boolean r-band matrix, like this:
+//
+// [1###00000000000000000000]
+// [01##00000000000000000000]
+// [000000000000000000000000]
+// [0001###00000000000000000]
+// [000000000000000000000000]
+// [000001##0000000000000000]
+// [000000000000000000000000]
+// [00000001###0000000000000]
+// [000000001###000000000000]
+// [0000000001##000000000000]
+// ...
+// [00000000000000000000001#]
+// [000000000000000000000001]
+//
+// where we have expanded to an m x m matrix by filling with rows of
+// all zeros as needed. As in Gaussian elimination, this form is ready for
+// generating a solution through back-substitution.
+//
+// The awesome thing about the Ribbon construction (from the DW paper) is
+// how row reductions keep each row representable as a start column and
+// r coefficients, because row reductions are only needed when two rows
+// have the same number of leading zero columns. Thus, the combination
+// of those rows, the bitwise XOR of the r-bit coefficient rows, cancels
+// out the leading 1s, so starts (at least) one column later and only
+// needs (at most) r - 1 coefficients.
+//
+// ######################################################################
+// ###################### Ribbon PHSF scalability #######################
+//
+// Although more practical detail is in ribbon_impl.h, it's worth
+// understanding some of the overall benefits and limitations of the
+// Ribbon PHSFs.
+//
+// High-end scalability is a primary issue for Ribbon PHSFs, because in
+// a single Ribbon linear system with fixed r and fixed m/n ratio, the
+// solution probability approaches zero as n approaches infinity.
+// For a given n, solution probability improves with larger r and larger
+// m/n.
+//
+// By contrast, peeling-based PHSFs have somewhat worse storage ratio
+// or solution probability for small n (less than ~1000). This is
+// especially true with spatial-coupling, where benefits are only
+// notable for n on the order of 100k or 1m or more.
+//
+// To make best use of current hardware, r=128 seems to be closest to
+// a "generally good" choice for Ribbon, at least in RocksDB where SST
+// Bloom filters typically hold around 10-100k keys, and almost always
+// less than 10m keys. r=128 ribbon has a high chance of encoding success
+// (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05)
+// for roughly 10k - 10m keys in a single linear system. r=64 only scales
+// up to about 10k keys with the same storage overhead. Construction and
+// access times for r=128 are similar to r=64. r=128 tracks nearly
+// twice as much data during construction, but in most cases we expect
+// the scalability benefits of r=128 vs. r=64 to make it preferred.
+//
+// A natural approach to scaling Ribbon beyond ~10m keys is splitting
+// (or "sharding") the inputs into multiple linear systems with their
+// own hash seeds. This can also help to control peak memory consumption.
+// TODO: much more to come
+//
+// ######################################################################
+// #################### Ribbon on-the-fly banding #######################
+//
+// "Banding" is what we call the process of reducing the inputs to an
+// upper-triangular r-band matrix ready for finishing a solution with
+// back-substitution. Although the DW paper presents an algorithm for
+// this ("SGauss"), the awesome properties of their construction enable
+// an even simpler, faster, and more backtrackable algorithm. In simplest
+// terms, the SGauss algorithm requires sorting the inputs by start
+// columns, but it's possible to make Gaussian elimination resemble hash
+// table insertion!
+//
+// The enhanced algorithm is based on these observations:
+// - When processing a coefficient row with first 1 in column j,
+//   - If it's the first at column j to be processed, it can be part of
+//     the banding at row j. (And that decision never overwritten, with
+//     no loss of generality!)
+//   - Else, it can be combined with existing row j and re-processed,
+//     which will look for a later "empty" row or reach "no solution".
+//
+// We call our banding algorithm "incremental" and "on-the-fly" because
+// (like hash table insertion) we are "finished" after each input
+// processed, with respect to all inputs processed so far. Although the
+// band matrix is an intermediate step to the solution structure, we have
+// eliminated intermediate steps and unnecessary data tracking for
+// banding.
+//
+// Building on "incremental" and "on-the-fly", the banding algorithm is
+// easily backtrackable because no (non-empty) rows are overwritten in
+// the banding. Thus, if we want to "try" adding an additional set of
+// inputs to the banding, we only have to record which rows were written
+// in order to efficiently backtrack to our state before considering
+// the additional set. (TODO: how this can mitigate scalability and
+// reach sub-1% overheads)
+//
+// Like in a linear-probed hash table, as the occupancy approaches and
+// surpasses 90-95%, collision resolution dominates the construction
+// time. (Ribbon doesn't usually pay at query time; see solution
+// storage below.) This means that we can speed up construction time
+// by using a higher m/n ratio, up to negative returns around 1.2.
+// At m/n ~= 1.2, which still saves memory substantially vs. Bloom
+// filter's 1.5, construction speed (including back-substitution) is not
+// far from sorting speed, but still a few times slower than cache-local
+// Bloom construction speed.
+//
+// Back-substitution from an upper-triangular boolean band matrix is
+// especially fast and easy. All the memory accesses are sequential or at
+// least local, no random. If the number of result bits (b) is a
+// compile-time constant, the back-substitution state can even be tracked
+// in CPU registers. Regardless of the solution representation, we prefer
+// column-major representation for tracking back-substitution state, as
+// r (the band width) will typically be much larger than b (result bits
+// or columns), so better to handle r-bit values b times (per solution
+// row) than b-bit values r times.
+//
+// ######################################################################
+// ##################### Ribbon solution storage ########################
+//
+// Row-major layout is typical for boolean (bit) matrices, including for
+// MWHC (Xor) filters where a query combines k b-bit values, and k is
+// typically smaller than b. Even for k=4 and b=2, at least k=4 random
+// look-ups are required regardless of layout.
+//
+// Ribbon PHSFs are quite different, however, because
+// (a) all of the solution rows relevant to a query are within a single
+// range of r rows, and
+// (b) the number of solution rows involved (r/2 on average, or r if
+// avoiding conditional accesses) is typically much greater than
+// b, the number of solution columns.
+//
+// Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead
+// by processing (up to) r entries of b bits each, where b is typically
+// less than 10 for filter applications.
+//
+// Column-major layout has poor locality because of accessing up to b
+// memory locations in different pages (and obviously cache lines). Note
+// that negative filter queries do not typically need to access all
+// solution columns, as they can return when a mismatch is found in any
+// result/solution column. This optimization doesn't always pay off on
+// recent hardware, where the penalty for unpredictable conditional
+// branching can exceed the penalty for unnecessary work, but the
+// optimization is essentially unavailable with row-major layout.
+//
+// The best compromise seems to be interleaving column-major on the small
+// scale with row-major on the large scale. For example, let a solution
+// "block" be r rows column-major encoded as b r-bit values in sequence.
+// Each query accesses (up to) 2 adjacent blocks, which will typically
+// span 1-3 cache lines in adjacent memory. We get very close to the same
+// locality as row-major, but with much faster reconstruction of each
+// result column, at least for filter applications where b is relatively
+// small and negative queries can return early.
+//
+// ######################################################################
+// ###################### Fractional result bits ########################
+//
+// Bloom filters have great flexibility that alternatives mostly do not
+// have. One of those flexibilities is in utilizing any ratio of data
+// structure bits per key. With a typical memory allocator like jemalloc,
+// this flexibility can save roughly 10% of the filters' footprint in
+// DRAM by rounding up and down filter sizes to minimize memory internal
+// fragmentation (see optimize_filters_for_memory RocksDB option).
+//
+// At first glance, PHSFs only offer a whole number of bits per "slot"
+// (m rather than number of keys n), but coefficient locality in the
+// Ribbon construction makes fractional bits/key quite possible and
+// attractive for filter applications. This works by a prefix of the
+// structure using b-1 solution columns and the rest using b solution
+// columns. See InterleavedSolutionStorage below for more detail.
+//
+// Because false positive rates are non-linear in bits/key, this approach
+// is not quite optimal in terms of information theory. In common cases,
+// we see additional space overhead up to about 1.5% vs. theoretical
+// optimal to achieve the same FP rate. We consider this a quite acceptable
+// overhead for very efficiently utilizing space that might otherwise be
+// wasted.
+//
+// This property of Ribbon even makes it "elastic." A Ribbon filter and
+// its small metadata for answering queries can be adapted into another
+// Ribbon filter filling any smaller multiple of r bits (plus small
+// metadata), with a correspondingly higher FP rate. None of the data
+// thrown away during construction needs to be recalled for this reduction.
+// Similarly a single Ribbon construction can be separated (by solution
+// column) into two or more structures (or "layers" or "levels") with
+// independent filtering ability (no FP correlation, just as solution or
+// result columns in a single structure) despite being constructed as part
+// of a single linear system. (TODO: implement)
+// See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards
+// Efficient Read for LSM-tree-based KV Stores."
+//
+
+// ######################################################################
+// ################### CODE: Ribbon core algorithms #####################
+// ######################################################################
+//
+// These algorithms are templatized for genericity but near-maximum
+// performance in a given application. The template parameters
+// adhere to informal class/struct type concepts outlined below. (This
+// code is written for C++11 so does not use formal C++ concepts.)
+
+// Rough architecture for these algorithms:
+//
+// +-----------+     +---+     +-----------------+
+// | AddInputs | --> | H | --> | BandingStorage  |
+// +-----------+     | a |     +-----------------+
+//                   | s |             |
+//                   | h |      Back substitution
+//                   | e |             V
+// +-----------+     | r |     +-----------------+
+// | Query Key | --> |   | >+< | SolutionStorage |
+// +-----------+     +---+  |  +-----------------+
+//                          V
+//                     Query result
+
+// Common to other concepts
+// concept RibbonTypes {
+//   // An unsigned integer type for an r-bit subsequence of coefficients.
+//   // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would
+//   // generally only hurt scalability to leave bits of CoeffRow unused.
+//   typename CoeffRow;
+//   // An unsigned integer type big enough to hold a result row (b bits,
+//   // or number of solution/result columns).
+//   // In many applications, especially filters, the number of result
+//   // columns is decided at run time, so ResultRow simply needs to be
+//   // big enough for the largest number of columns allowed.
+//   typename ResultRow;
+//   // An unsigned integer type sufficient for representing the number of
+//   // rows in the solution structure, and at least the arithmetic
+//   // promotion size (usually 32 bits). uint32_t recommended because a
+//   // single Ribbon construction doesn't really scale to billions of
+//   // entries.
+//   typename Index;
+// };
+
+// ######################################################################
+// ######################## Hashers and Banding #########################
+
+// Hasher concepts abstract out hashing details.
+
+// concept PhsfQueryHasher extends RibbonTypes {
+//   // Type for a lookup key, which is hashable.
+//   typename Key;
+//
+//   // Type for hashed summary of a Key. uint64_t is recommended.
+//   typename Hash;
+//
+//   // Compute a hash value summarizing a Key
+//   Hash GetHash(const Key &) const;
+//
+//   // Given a hash value and a number of columns that can start an
+//   // r-sequence of coefficients (== m - r + 1), return the start
+//   // column to associate with that hash value. (Starts can be chosen
+//   // uniformly or "smash" extra entries into the beginning and end for
+//   // better utilization at those extremes of the structure. Details in
+//   // ribbon.impl.h)
+//   Index GetStart(Hash, Index num_starts) const;
+//
+//   // Given a hash value, return the r-bit sequence of coefficients to
+//   // associate with it. It's generally OK if
+//   //   sizeof(CoeffRow) > sizeof(Hash)
+//   // as long as the hash itself is not too prone to collisions for the
+//   // applications and the CoeffRow is generated uniformly from
+//   // available hash data, but relatively independent of the start.
+//   //
+//   // Must be non-zero, because that's required for a solution to exist
+//   // when mapping to non-zero result row. (Note: BandingAdd could be
+//   // modified to allow 0 coeff row if that only occurs with 0 result
+//   // row, which really only makes sense for filter implementation,
+//   // where both values are hash-derived. Or BandingAdd could reject 0
+//   // coeff row, forcing next seed, but that has potential problems with
+//   // generality/scalability.)
+//   CoeffRow GetCoeffRow(Hash) const;
+// };
+
+// concept FilterQueryHasher extends PhsfQueryHasher {
+//   // For building or querying a filter, this returns the expected
+//   // result row associated with a hashed input. For general PHSF,
+//   // this must return 0.
+//   //
+//   // Although not strictly required, there's a slightly better chance of
+//   // solver success if result row is masked down here to only the bits
+//   // actually needed.
+//   ResultRow GetResultRowFromHash(Hash) const;
+// }
+
+// concept BandingHasher extends FilterQueryHasher {
+//   // For a filter, this will generally be the same as Key.
+//   // For a general PHSF, it must either
+//   // (a) include a key and a result it maps to (e.g. in a std::pair), or
+//   // (b) GetResultRowFromInput looks up the result somewhere rather than
+//   // extracting it.
+//   typename AddInput;
+//
+//   // Instead of requiring a way to extract a Key from an
+//   // AddInput, we require getting the hash of the Key part
+//   // of an AddInput, which is trivial if AddInput == Key.
+//   Hash GetHash(const AddInput &) const;
+//
+//   // For building a non-filter PHSF, this extracts or looks up the result
+//   // row to associate with an input. For filter PHSF, this must return 0.
+//   ResultRow GetResultRowFromInput(const AddInput &) const;
+//
+//   // Whether the solver can assume the lowest bit of GetCoeffRow is
+//   // always 1. When true, it should improve solver efficiency slightly.
+//   static bool kFirstCoeffAlwaysOne;
+// }
+
+// Abstract storage for the the result of "banding" the inputs (Gaussian
+// elimination to an upper-triangular boolean band matrix). Because the
+// banding is an incremental / on-the-fly algorithm, this also represents
+// all the intermediate state between input entries.
+//
+// concept BandingStorage extends RibbonTypes {
+//   // Tells the banding algorithm to prefetch memory associated with
+//   // the next input before processing the current input. Generally
+//   // recommended iff the BandingStorage doesn't easily fit in CPU
+//   // cache.
+//   bool UsePrefetch() const;
+//
+//   // Prefetches (e.g. __builtin_prefetch) memory associated with a
+//   // slot index i.
+//   void Prefetch(Index i) const;
+//
+//   // Load or store CoeffRow and ResultRow for slot index i.
+//   // (Gaussian row operations involve both sides of the equation.)
+//   // Bool `for_back_subst` indicates that customizing values for
+//   // unconstrained solution rows (cr == 0) is allowed.
+//   void LoadRow(Index i, CoeffRow *cr, ResultRow *rr, bool for_back_subst)
+//        const;
+//   void StoreRow(Index i, CoeffRow cr, ResultRow rr);
+//
+//   // Returns the number of columns that can start an r-sequence of
+//   // coefficients, which is the number of slots minus r (kCoeffBits)
+//   // plus one. (m - r + 1)
+//   Index GetNumStarts() const;
+// };
+
+// Optional storage for backtracking data in banding a set of input
+// entries. It exposes an array structure which will generally be
+// used as a stack. It must be able to accommodate as many entries
+// as are passed in as inputs to `BandingAddRange`.
+//
+// concept BacktrackStorage extends RibbonTypes {
+//   // If false, backtracking support will be disabled in the algorithm.
+//   // This should preferably be an inline compile-time constant function.
+//   bool UseBacktrack() const;
+//
+//   // Records `to_save` as the `i`th backtrack entry
+//   void BacktrackPut(Index i, Index to_save);
+//
+//   // Recalls the `i`th backtrack entry
+//   Index BacktrackGet(Index i) const;
+// }
+
+// Adds a single entry to BandingStorage (and optionally, BacktrackStorage),
+// returning true if successful or false if solution is impossible with
+// current hasher (and presumably its seed) and number of "slots" (solution
+// or banding rows). (A solution is impossible when there is a linear
+// dependence among the inputs that doesn't "cancel out".)
+//
+// Pre- and post-condition: the BandingStorage represents a band matrix
+// ready for back substitution (row echelon form except for zero rows),
+// augmented with result values such that back substitution would give a
+// solution satisfying all the cr@start -> rr entries added.
+template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
+          typename BacktrackStorage>
+bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
+                typename BandingStorage::ResultRow rr,
+                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
+                typename BandingStorage::Index *backtrack_pos) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using ResultRow = typename BandingStorage::ResultRow;
+  using Index = typename BandingStorage::Index;
+
+  Index i = start;
+
+  if (!kFirstCoeffAlwaysOne) {
+    // Requires/asserts that cr != 0
+    int tz = CountTrailingZeroBits(cr);
+    i += static_cast<Index>(tz);
+    cr >>= tz;
+  }
+
+  for (;;) {
+    assert((cr & 1) == 1);
+    CoeffRow cr_at_i;
+    ResultRow rr_at_i;
+    bs->LoadRow(i, &cr_at_i, &rr_at_i, /* for_back_subst */ false);
+    if (cr_at_i == 0) {
+      bs->StoreRow(i, cr, rr);
+      bts->BacktrackPut(*backtrack_pos, i);
+      ++*backtrack_pos;
+      return true;
+    }
+    assert((cr_at_i & 1) == 1);
+    // Gaussian row reduction
+    cr ^= cr_at_i;
+    rr ^= rr_at_i;
+    if (cr == 0) {
+      // Inconsistency or (less likely) redundancy
+      break;
+    }
+    // Find relative offset of next non-zero coefficient.
+    int tz = CountTrailingZeroBits(cr);
+    i += static_cast<Index>(tz);
+    cr >>= tz;
+  }
+
+  // Failed, unless result row == 0 because e.g. a duplicate input or a
+  // stock hash collision, with same result row. (For filter, stock hash
+  // collision implies same result row.) Or we could have a full equation
+  // equal to sum of other equations, which is very possible with
+  // small range of values for result row.
+  return rr == 0;
+}
+
+// Adds a range of entries to BandingStorage returning true if successful
+// or false if solution is impossible with current hasher (and presumably
+// its seed) and number of "slots" (solution or banding rows). (A solution
+// is impossible when there is a linear dependence among the inputs that
+// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
+//
+// If UseBacktrack in the BacktrackStorage, this function call rolls back
+// to prior state on failure. If !UseBacktrack, some subset of the entries
+// will have been added to the BandingStorage, so best considered to be in
+// an indeterminate state.
+//
+template <typename BandingStorage, typename BacktrackStorage,
+          typename BandingHasher, typename InputIterator>
+bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
+                     const BandingHasher &bh, InputIterator begin,
+                     InputIterator end) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+  using ResultRow = typename BandingStorage::ResultRow;
+  using Hash = typename BandingHasher::Hash;
+
+  static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
+  static_assert(IsUnsignedUpTo128<Index>::value, "must be unsigned");
+  static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");
+
+  constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;
+
+  if (begin == end) {
+    // trivial
+    return true;
+  }
+
+  const Index num_starts = bs->GetNumStarts();
+
+  InputIterator cur = begin;
+  Index backtrack_pos = 0;
+  if (!bs->UsePrefetch()) {
+    // Simple version, no prefetch
+    for (;;) {
+      Hash h = bh.GetHash(*cur);
+      Index start = bh.GetStart(h, num_starts);
+      ResultRow rr =
+          bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
+      CoeffRow cr = bh.GetCoeffRow(h);
+
+      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
+        break;
+      }
+      if ((++cur) == end) {
+        return true;
+      }
+    }
+  } else {
+    // Pipelined w/prefetch
+    // Prime the pipeline
+    Hash h = bh.GetHash(*cur);
+    Index start = bh.GetStart(h, num_starts);
+    ResultRow rr = bh.GetResultRowFromInput(*cur);
+    bs->Prefetch(start);
+
+    // Pipeline
+    for (;;) {
+      rr |= bh.GetResultRowFromHash(h);
+      CoeffRow cr = bh.GetCoeffRow(h);
+      if ((++cur) == end) {
+        if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
+          break;
+        }
+        return true;
+      }
+      Hash next_h = bh.GetHash(*cur);
+      Index next_start = bh.GetStart(next_h, num_starts);
+      ResultRow next_rr = bh.GetResultRowFromInput(*cur);
+      bs->Prefetch(next_start);
+      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
+        break;
+      }
+      h = next_h;
+      start = next_start;
+      rr = next_rr;
+    }
+  }
+  // failed; backtrack (if implemented)
+  if (bts->UseBacktrack()) {
+    while (backtrack_pos > 0) {
+      --backtrack_pos;
+      Index i = bts->BacktrackGet(backtrack_pos);
+      // Clearing the ResultRow is not strictly required, but is required
+      // for good FP rate on inputs that might have been backtracked out.
+      // (We don't want anything we've backtracked on to leak into final
+      // result, as that might not be "harmless".)
+      bs->StoreRow(i, 0, 0);
+    }
+  }
+  return false;
+}
+
+// Adds a range of entries to BandingStorage returning true if successful
+// or false if solution is impossible with current hasher (and presumably
+// its seed) and number of "slots" (solution or banding rows). (A solution
+// is impossible when there is a linear dependence among the inputs that
+// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
+//
+// On failure, some subset of the entries will have been added to the
+// BandingStorage, so best considered to be in an indeterminate state.
+//
+template <typename BandingStorage, typename BandingHasher,
+          typename InputIterator>
+bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
+                     InputIterator begin, InputIterator end) {
+  using Index = typename BandingStorage::Index;
+  struct NoopBacktrackStorage {
+    bool UseBacktrack() { return false; }
+    void BacktrackPut(Index, Index) {}
+    Index BacktrackGet(Index) {
+      assert(false);
+      return 0;
+    }
+  } nbts;
+  return BandingAddRange(bs, &nbts, bh, begin, end);
+}
+
+// ######################################################################
+// ######################### Solution Storage ###########################
+
+// Back-substitution and query algorithms unfortunately depend on some
+// details of data layout in the final data structure ("solution"). Thus,
+// there is no common SolutionStorage covering all the reasonable
+// possibilities.
+
+// ###################### SimpleSolutionStorage #########################
+
+// SimpleSolutionStorage is for a row-major storage, typically with no
+// unused bits in each ResultRow. This is mostly for demonstration
+// purposes as the simplest solution storage scheme. It is relatively slow
+// for filter queries.
+
+// concept SimpleSolutionStorage extends RibbonTypes {
+//   // This is called at the beginning of back-substitution for the
+//   // solution storage to do any remaining configuration before data
+//   // is stored to it. If configuration is previously finalized, this
+//   // could be a simple assertion or even no-op. Ribbon algorithms
+//   // only call this from back-substitution, and only once per call,
+//   // before other functions here.
+//   void PrepareForNumStarts(Index num_starts) const;
+//   // Must return num_starts passed to PrepareForNumStarts, or the most
+//   // recent call to PrepareForNumStarts if this storage object can be
+//   // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
+//   // there must be a run of kCoeffBits slots starting from each start.
+//   Index GetNumStarts() const;
+//   // Load the solution row (type ResultRow) for a slot
+//   ResultRow Load(Index slot_num) const;
+//   // Store the solution row (type ResultRow) for a slot
+//   void Store(Index slot_num, ResultRow data);
+// };
+
+// Back-substitution for generating a solution from BandingStorage to
+// SimpleSolutionStorage.
+template <typename SimpleSolutionStorage, typename BandingStorage>
+void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+  using ResultRow = typename BandingStorage::ResultRow;
+
+  static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index),
+                "must be same");
+  static_assert(
+      sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow),
+      "must be same");
+  static_assert(
+      sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow),
+      "must be same");
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+  constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);
+
+  // A column-major buffer of the solution matrix, containing enough
+  // recently-computed solution data to compute the next solution row
+  // (based also on banding data).
+  std::array<CoeffRow, kResultBits> state;
+  state.fill(0);
+
+  const Index num_starts = bs.GetNumStarts();
+  sss->PrepareForNumStarts(num_starts);
+  const Index num_slots = num_starts + kCoeffBits - 1;
+
+  for (Index i = num_slots; i > 0;) {
+    --i;
+    CoeffRow cr;
+    ResultRow rr;
+    bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
+    // solution row
+    ResultRow sr = 0;
+    for (Index j = 0; j < kResultBits; ++j) {
+      // Compute next solution bit at row i, column j (see derivation below)
+      CoeffRow tmp = state[j] << 1;
+      bool bit = (BitParity(tmp & cr) ^ ((rr >> j) & 1)) != 0;
+      tmp |= bit ? CoeffRow{1} : CoeffRow{0};
+
+      // Now tmp is solution at column j from row i for next kCoeffBits
+      // more rows. Thus, for valid solution, the dot product of the
+      // solution column with the coefficient row has to equal the result
+      // at that column,
+      //   BitParity(tmp & cr) == ((rr >> j) & 1)
+
+      // Update state.
+      state[j] = tmp;
+      // add to solution row
+      sr |= (bit ? ResultRow{1} : ResultRow{0}) << j;
+    }
+    sss->Store(i, sr);
+  }
+}
+
+// Common functionality for querying a key (already hashed) in
+// SimpleSolutionStorage.
+template <typename SimpleSolutionStorage>
+typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
+    typename SimpleSolutionStorage::Index start_slot,
+    typename SimpleSolutionStorage::CoeffRow cr,
+    const SimpleSolutionStorage &sss) {
+  using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
+  using ResultRow = typename SimpleSolutionStorage::ResultRow;
+
+  constexpr unsigned kCoeffBits = static_cast<unsigned>(sizeof(CoeffRow) * 8U);
+
+  ResultRow result = 0;
+  for (unsigned i = 0; i < kCoeffBits; ++i) {
+    // Bit masking whole value is generally faster here than 'if'
+    result ^= sss.Load(start_slot + i) &
+              (ResultRow{0} - (static_cast<ResultRow>(cr >> i) & ResultRow{1}));
+  }
+  return result;
+}
+
+// General PHSF query a key from SimpleSolutionStorage.
+template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
+typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
+    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
+    const SimpleSolutionStorage &sss) {
+  const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key);
+
+  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
+                    sizeof(typename PhsfQueryHasher::Index),
+                "must be same");
+  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
+                    sizeof(typename PhsfQueryHasher::CoeffRow),
+                "must be same");
+
+  return SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
+                           hasher.GetCoeffRow(hash), sss);
+}
+
+// Filter query a key from SimpleSolutionStorage.
+template <typename SimpleSolutionStorage, typename FilterQueryHasher>
+bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
+                       const FilterQueryHasher &hasher,
+                       const SimpleSolutionStorage &sss) {
+  const typename FilterQueryHasher::Hash hash = hasher.GetHash(key);
+  const typename SimpleSolutionStorage::ResultRow expected =
+      hasher.GetResultRowFromHash(hash);
+
+  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
+                    sizeof(typename FilterQueryHasher::Index),
+                "must be same");
+  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
+                    sizeof(typename FilterQueryHasher::CoeffRow),
+                "must be same");
+  static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) ==
+                    sizeof(typename FilterQueryHasher::ResultRow),
+                "must be same");
+
+  return expected ==
+         SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
+                           hasher.GetCoeffRow(hash), sss);
+}
+
+// #################### InterleavedSolutionStorage ######################
+
+// InterleavedSolutionStorage is row-major at a high level, for good
+// locality, and column-major at a low level, for CPU efficiency
+// especially in filter queries or relatively small number of result bits
+// (== solution columns). The storage is a sequence of "blocks" where a
+// block has one CoeffRow-sized segment for each solution column. Each
+// query spans at most two blocks; the starting solution row is typically
+// in the row-logical middle of a block and spans to the middle of the
+// next block. (See diagram below.)
+//
+// InterleavedSolutionStorage supports choosing b (number of result or
+// solution columns) at run time, and even supports mixing b and b-1 solution
+// columns in a single linear system solution, for filters that can
+// effectively utilize any size space (multiple of CoeffRow) for minimizing
+// FP rate for any number of added keys. To simplify query implementation
+// (with lower-index columns first), the b-bit portion comes after the b-1
+// portion of the structure.
+//
+// Diagram (=== marks logical block boundary; b=4; ### is data used by a
+// query crossing the b-1 to b boundary, each Segment has type CoeffRow):
+//  ...
+// +======================+
+// | S e g m e n t  col=0 |
+// +----------------------+
+// | S e g m e n t  col=1 |
+// +----------------------+
+// | S e g m e n t  col=2 |
+// +======================+
+// | S e g m e n #########|
+// +----------------------+
+// | S e g m e n #########|
+// +----------------------+
+// | S e g m e n #########|
+// +======================+ Result/solution columns: above = 3, below = 4
+// |#############t  col=0 |
+// +----------------------+
+// |#############t  col=1 |
+// +----------------------+
+// |#############t  col=2 |
+// +----------------------+
+// | S e g m e n t  col=3 |
+// +======================+
+// | S e g m e n t  col=0 |
+// +----------------------+
+// | S e g m e n t  col=1 |
+// +----------------------+
+// | S e g m e n t  col=2 |
+// +----------------------+
+// | S e g m e n t  col=3 |
+// +======================+
+//  ...
+//
+// InterleavedSolutionStorage will be adapted by the algorithms from
+// simple array-like segment storage. That array-like storage is templatized
+// in part so that an implementation may choose to handle byte ordering
+// at access time.
+//
+// concept InterleavedSolutionStorage extends RibbonTypes {
+//   // This is called at the beginning of back-substitution for the
+//   // solution storage to do any remaining configuration before data
+//   // is stored to it. If configuration is previously finalized, this
+//   // could be a simple assertion or even no-op. Ribbon algorithms
+//   // only call this from back-substitution, and only once per call,
+//   // before other functions here.
+//   void PrepareForNumStarts(Index num_starts) const;
+//   // Must return num_starts passed to PrepareForNumStarts, or the most
+//   // recent call to PrepareForNumStarts if this storage object can be
+//   // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
+//   // there must be a run of kCoeffBits slots starting from each start.
+//   Index GetNumStarts() const;
+//   // The larger number of solution columns used (called "b" above).
+//   Index GetUpperNumColumns() const;
+//   // If returns > 0, then block numbers below that use
+//   // GetUpperNumColumns() - 1 columns per solution row, and the rest
+//   // use GetUpperNumColumns(). A block represents kCoeffBits "slots",
+//   // where all but the last kCoeffBits - 1 slots are also starts. And
+//   // a block contains a segment for each solution column.
+//   // An implementation may only support uniform columns per solution
+//   // row and return constant 0 here.
+//   Index GetUpperStartBlock() const;
+//
+//   // ### "Array of segments" portion of API ###
+//   // The number of values of type CoeffRow used in this solution
+//   // representation. (This value can be inferred from the previous
+//   // three functions, but is expected at least for sanity / assertion
+//   // checking.)
+//   Index GetNumSegments() const;
+//   // Load an entry from the logical array of segments
+//   CoeffRow LoadSegment(Index segment_num) const;
+//   // Store an entry to the logical array of segments
+//   void StoreSegment(Index segment_num, CoeffRow data);
+// };
+
+// A helper for InterleavedBackSubst.
+template <typename BandingStorage>
+inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
+                           typename BandingStorage::Index num_columns,
+                           const BandingStorage &bs,
+                           typename BandingStorage::Index start_slot) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+  using ResultRow = typename BandingStorage::ResultRow;
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  for (Index i = start_slot + kCoeffBits; i > start_slot;) {
+    --i;
+    CoeffRow cr;
+    ResultRow rr;
+    bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
+    for (Index j = 0; j < num_columns; ++j) {
+      // Compute next solution bit at row i, column j (see derivation below)
+      CoeffRow tmp = state[j] << 1;
+      int bit = BitParity(tmp & cr) ^ ((rr >> j) & 1);
+      tmp |= static_cast<CoeffRow>(bit);
+
+      // Now tmp is solution at column j from row i for next kCoeffBits
+      // more rows. Thus, for valid solution, the dot product of the
+      // solution column with the coefficient row has to equal the result
+      // at that column,
+      //   BitParity(tmp & cr) == ((rr >> j) & 1)
+
+      // Update state.
+      state[j] = tmp;
+    }
+  }
+}
+
+// Back-substitution for generating a solution from BandingStorage to
+// InterleavedSolutionStorage.
+template <typename InterleavedSolutionStorage, typename BandingStorage>
+void InterleavedBackSubst(InterleavedSolutionStorage *iss,
+                          const BandingStorage &bs) {
+  using CoeffRow = typename BandingStorage::CoeffRow;
+  using Index = typename BandingStorage::Index;
+
+  static_assert(
+      sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index),
+      "must be same");
+  static_assert(
+      sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow),
+      "must be same");
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  const Index num_starts = bs.GetNumStarts();
+  // Although it might be nice to have a filter that returns "always false"
+  // when no key is added, we aren't specifically supporting that here
+  // because it would require another condition branch in the query.
+  assert(num_starts > 0);
+  iss->PrepareForNumStarts(num_starts);
+
+  const Index num_slots = num_starts + kCoeffBits - 1;
+  assert(num_slots % kCoeffBits == 0);
+  const Index num_blocks = num_slots / kCoeffBits;
+  const Index num_segments = iss->GetNumSegments();
+
+  // For now upper, then lower
+  Index num_columns = iss->GetUpperNumColumns();
+  const Index upper_start_block = iss->GetUpperStartBlock();
+
+  if (num_columns == 0) {
+    // Nothing to do, presumably because there's not enough space for even
+    // a single segment.
+    assert(num_segments == 0);
+    // When num_columns == 0, a Ribbon filter query will always return true,
+    // or a PHSF query always 0.
+    return;
+  }
+
+  // We should be utilizing all available segments
+  assert(num_segments == (upper_start_block * (num_columns - 1)) +
+                             ((num_blocks - upper_start_block) * num_columns));
+
+  // TODO: consider fixed-column specializations with stack-allocated state
+
+  // A column-major buffer of the solution matrix, containing enough
+  // recently-computed solution data to compute the next solution row
+  // (based also on banding data).
+  std::unique_ptr<CoeffRow[]> state{new CoeffRow[num_columns]()};
+
+  Index block = num_blocks;
+  Index segment_num = num_segments;
+  while (block > upper_start_block) {
+    --block;
+    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
+    segment_num -= num_columns;
+    for (Index i = 0; i < num_columns; ++i) {
+      iss->StoreSegment(segment_num + i, state[i]);
+    }
+  }
+  // Now (if applicable), region using lower number of columns
+  // (This should be optimized away if GetUpperStartBlock() returns
+  // constant 0.)
+  --num_columns;
+  while (block > 0) {
+    --block;
+    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
+    segment_num -= num_columns;
+    for (Index i = 0; i < num_columns; ++i) {
+      iss->StoreSegment(segment_num + i, state[i]);
+    }
+  }
+  // Verify everything processed
+  assert(block == 0);
+  assert(segment_num == 0);
+}
+
+// Prefetch memory for a key in InterleavedSolutionStorage.
+template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
+inline void InterleavedPrepareQuery(
+    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
+    const InterleavedSolutionStorage &iss,
+    typename PhsfQueryHasher::Hash *saved_hash,
+    typename InterleavedSolutionStorage::Index *saved_segment_num,
+    typename InterleavedSolutionStorage::Index *saved_num_columns,
+    typename InterleavedSolutionStorage::Index *saved_start_bit) {
+  using Hash = typename PhsfQueryHasher::Hash;
+  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
+  using Index = typename InterleavedSolutionStorage::Index;
+
+  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
+                "must be same");
+
+  const Hash hash = hasher.GetHash(key);
+  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  const Index upper_start_block = iss.GetUpperStartBlock();
+  Index num_columns = iss.GetUpperNumColumns();
+  Index start_block_num = start_slot / kCoeffBits;
+  Index segment_num = start_block_num * num_columns -
+                      std::min(start_block_num, upper_start_block);
+  // Change to lower num columns if applicable.
+  // (This should not compile to a conditional branch.)
+  num_columns -= (start_block_num < upper_start_block) ? 1 : 0;
+
+  Index start_bit = start_slot % kCoeffBits;
+
+  Index segment_count = num_columns + (start_bit == 0 ? 0 : num_columns);
+
+  iss.PrefetchSegmentRange(segment_num, segment_num + segment_count);
+
+  *saved_hash = hash;
+  *saved_segment_num = segment_num;
+  *saved_num_columns = num_columns;
+  *saved_start_bit = start_bit;
+}
+
+// General PHSF query from InterleavedSolutionStorage, using data for
+// the query key from InterleavedPrepareQuery
+template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
+inline typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
+    typename PhsfQueryHasher::Hash hash,
+    typename InterleavedSolutionStorage::Index segment_num,
+    typename InterleavedSolutionStorage::Index num_columns,
+    typename InterleavedSolutionStorage::Index start_bit,
+    const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
+  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
+  using Index = typename InterleavedSolutionStorage::Index;
+  using ResultRow = typename InterleavedSolutionStorage::ResultRow;
+
+  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
+                "must be same");
+  static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow),
+                "must be same");
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  const CoeffRow cr = hasher.GetCoeffRow(hash);
+
+  ResultRow sr = 0;
+  const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
+  for (Index i = 0; i < num_columns; ++i) {
+    sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_left) << i;
+  }
+
+  if (start_bit > 0) {
+    segment_num += num_columns;
+    const CoeffRow cr_right =
+        cr >> static_cast<unsigned>(kCoeffBits - start_bit);
+    for (Index i = 0; i < num_columns; ++i) {
+      sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_right) << i;
+    }
+  }
+
+  return sr;
+}
+
+// Filter query a key from InterleavedFilterQuery.
+template <typename InterleavedSolutionStorage, typename FilterQueryHasher>
+inline bool InterleavedFilterQuery(
+    typename FilterQueryHasher::Hash hash,
+    typename InterleavedSolutionStorage::Index segment_num,
+    typename InterleavedSolutionStorage::Index num_columns,
+    typename InterleavedSolutionStorage::Index start_bit,
+    const FilterQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
+  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
+  using Index = typename InterleavedSolutionStorage::Index;
+  using ResultRow = typename InterleavedSolutionStorage::ResultRow;
+
+  static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index),
+                "must be same");
+  static_assert(
+      sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow),
+      "must be same");
+  static_assert(
+      sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow),
+      "must be same");
+
+  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
+
+  const CoeffRow cr = hasher.GetCoeffRow(hash);
+  const ResultRow expected = hasher.GetResultRowFromHash(hash);
+
+  // TODO: consider optimizations such as
+  // * get rid of start_bit == 0 condition with careful fetching & shifting
+  if (start_bit == 0) {
+    for (Index i = 0; i < num_columns; ++i) {
+      if (BitParity(iss.LoadSegment(segment_num + i) & cr) !=
+          (static_cast<int>(expected >> i) & 1)) {
+        return false;
+      }
+    }
+  } else {
+    const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
+    const CoeffRow cr_right =
+        cr >> static_cast<unsigned>(kCoeffBits - start_bit);
+
+    for (Index i = 0; i < num_columns; ++i) {
+      CoeffRow soln_data =
+          (iss.LoadSegment(segment_num + i) & cr_left) ^
+          (iss.LoadSegment(segment_num + num_columns + i) & cr_right);
+      if (BitParity(soln_data) != (static_cast<int>(expected >> i) & 1)) {
+        return false;
+      }
+    }
+  }
+  // otherwise, all match
+  return true;
+}
+
+// TODO: refactor Interleaved*Query so that queries can be "prepared" by
+// prefetching memory, to hide memory latency for multiple queries in a
+// single thread.
+
+}  // namespace ribbon
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/ribbon_config.cc b/src/rocksdb/util/ribbon_config.cc
new file mode 100644
index 000000000..c1046f4aa
--- /dev/null
+++ b/src/rocksdb/util/ribbon_config.cc
@@ -0,0 +1,506 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/ribbon_config.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace ribbon {
+
+namespace detail {
+
+// Each instantiation of this struct is sufficiently unique for configuration
+// purposes, and is only instantiated for settings where we support the
+// configuration API. An application might only reference one instantiation,
+// meaning the rest could be pruned at link time.
+template <ConstructionFailureChance kCfc, uint64_t kCoeffBits, bool kUseSmash>
+struct BandingConfigHelperData {
+  static constexpr size_t kKnownSize = 18U;
+
+  // Because of complexity in the data, for smaller numbers of slots
+  // (powers of two up to 2^17), we record known numbers that can be added
+  // with kCfc chance of construction failure and settings in template
+  // parameters. Zero means "unsupported (too small) number of slots".
+  // (GetNumToAdd below will use interpolation for numbers of slots
+  // between powers of two; double rather than integer values here make
+  // that more accurate.)
+  static const std::array<double, kKnownSize> kKnownToAddByPow2;
+
+  // For sufficiently large number of slots, doubling the number of
+  // slots will increase the expected overhead (slots over number added)
+  // by approximately this constant.
+  // (This is roughly constant regardless of ConstructionFailureChance and
+  // smash setting.)
+  // (Would be a constant if we had partial template specialization for
+  // static const members.)
+  static inline double GetFactorPerPow2() {
+    if (kCoeffBits == 128U) {
+      return 0.0038;
+    } else {
+      assert(kCoeffBits == 64U);
+      return 0.0083;
+    }
+  }
+
+  // Overhead factor for 2^(kKnownSize-1) slots
+  // (Would be a constant if we had partial template specialization for
+  // static const members.)
+  static inline double GetFinalKnownFactor() {
+    return 1.0 * (uint32_t{1} << (kKnownSize - 1)) /
+           kKnownToAddByPow2[kKnownSize - 1];
+  }
+
+  // GetFinalKnownFactor() - (kKnownSize-1) * GetFactorPerPow2()
+  // (Would be a constant if we had partial template specialization for
+  // static const members.)
+  static inline double GetBaseFactor() {
+    return GetFinalKnownFactor() - (kKnownSize - 1) * GetFactorPerPow2();
+  }
+
+  // Get overhead factor (slots over number to add) for sufficiently large
+  // number of slots (by log base 2)
+  static inline double GetFactorForLarge(double log2_num_slots) {
+    return GetBaseFactor() + log2_num_slots * GetFactorPerPow2();
+  }
+
+  // For a given power of two number of slots (specified by whole number
+  // log base 2), implements GetNumToAdd for such limited case, returning
+  // double for better interpolation in GetNumToAdd and GetNumSlots.
+  static inline double GetNumToAddForPow2(uint32_t log2_num_slots) {
+    assert(log2_num_slots <= 32);  // help clang-analyze
+    if (log2_num_slots < kKnownSize) {
+      return kKnownToAddByPow2[log2_num_slots];
+    } else {
+      return 1.0 * (uint64_t{1} << log2_num_slots) /
+             GetFactorForLarge(1.0 * log2_num_slots);
+    }
+  }
+};
+
+// Based on data from FindOccupancy in ribbon_test
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn2, 128U, false>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        252.984,
+        506.109,
+        1013.71,
+        2029.47,
+        4060.43,
+        8115.63,
+        16202.2,
+        32305.1,
+        64383.5,
+        128274,
+    }};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn2, 128U, /*smash*/ true>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        126.274,
+        254.279,
+        510.27,
+        1022.24,
+        2046.02,
+        4091.99,
+        8154.98,
+        16244.3,
+        32349.7,
+        64426.6,
+        128307,
+    }};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn2, 64U, false>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        124.94,
+        249.968,
+        501.234,
+        1004.06,
+        2006.15,
+        3997.89,
+        7946.99,
+        15778.4,
+        31306.9,
+        62115.3,
+        123284,
+    }};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn2, 64U, /*smash*/ true>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        62.2683,
+        126.259,
+        254.268,
+        509.975,
+        1019.98,
+        2026.16,
+        4019.75,
+        7969.8,
+        15798.2,
+        31330.3,
+        62134.2,
+        123255,
+    }};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn20, 128U, false>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        248.851,
+        499.532,
+        1001.26,
+        2003.97,
+        4005.59,
+        8000.39,
+        15966.6,
+        31828.1,
+        63447.3,
+        126506,
+    }};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn20, 128U, /*smash*/ true>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        122.637,
+        250.651,
+        506.625,
+        1018.54,
+        2036.43,
+        4041.6,
+        8039.25,
+        16005,
+        31869.6,
+        63492.8,
+        126537,
+    }};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn20, 64U, false>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        120.659,
+        243.346,
+        488.168,
+        976.373,
+        1948.86,
+        3875.85,
+        7704.97,
+        15312.4,
+        30395.1,
+        60321.8,
+        119813,
+    }};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn20, 64U, /*smash*/ true>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        58.6016,
+        122.619,
+        250.641,
+        503.595,
+        994.165,
+        1967.36,
+        3898.17,
+        7727.21,
+        15331.5,
+        30405.8,
+        60376.2,
+        119836,
+    }};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn1000, 128U, false>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        242.61,
+        491.887,
+        983.603,
+        1968.21,
+        3926.98,
+        7833.99,
+        15629,
+        31199.9,
+        62307.8,
+        123870,
+    }};
+
+template <>
+const std::array<double, 18> BandingConfigHelperData<
+    kOneIn1000, 128U, /*smash*/ true>::kKnownToAddByPow2{{
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,  // unsupported
+    117.19,
+    245.105,
+    500.748,
+    1010.67,
+    1993.4,
+    3950.01,
+    7863.31,
+    15652,
+    31262.1,
+    62462.8,
+    124095,
+}};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn1000, 64U, false>::kKnownToAddByPow2{{
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,  // unsupported
+        114,
+        234.8,
+        471.498,
+        940.165,
+        1874,
+        3721.5,
+        7387.5,
+        14592,
+        29160,
+        57745,
+        115082,
+    }};
+
+template <>
+const std::array<double, 18>
+    BandingConfigHelperData<kOneIn1000, 64U, /*smash*/ true>::kKnownToAddByPow2{
+        {
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,  // unsupported
+            53.0434,
+            117,
+            245.312,
+            483.571,
+            950.251,
+            1878,
+            3736.34,
+            7387.97,
+            14618,
+            29142.9,
+            57838.8,
+            114932,
+        }};
+
+// We hide these implementation details from the .h file with explicit
+// instantiations below these partial specializations.
+
+template <ConstructionFailureChance kCfc, uint64_t kCoeffBits, bool kUseSmash,
+          bool kHomogeneous>
+uint32_t BandingConfigHelper1MaybeSupported<
+    kCfc, kCoeffBits, kUseSmash, kHomogeneous,
+    true /* kIsSupported */>::GetNumToAdd(uint32_t num_slots) {
+  using Data = detail::BandingConfigHelperData<kCfc, kCoeffBits, kUseSmash>;
+  if (num_slots == 0) {
+    return 0;
+  }
+  uint32_t num_to_add;
+  double log2_num_slots = std::log(num_slots) * 1.4426950409;
+  uint32_t floor_log2 = static_cast<uint32_t>(log2_num_slots);
+  if (floor_log2 + 1 < Data::kKnownSize) {
+    double ceil_portion = 1.0 * num_slots / (uint32_t{1} << floor_log2) - 1.0;
+    // Must be a supported number of slots
+    assert(Data::kKnownToAddByPow2[floor_log2] > 0.0);
+    // Weighted average of two nearest known data points
+    num_to_add = static_cast<uint32_t>(
+        ceil_portion * Data::kKnownToAddByPow2[floor_log2 + 1] +
+        (1.0 - ceil_portion) * Data::kKnownToAddByPow2[floor_log2]);
+  } else {
+    // Use formula for large values
+    double factor = Data::GetFactorForLarge(log2_num_slots);
+    assert(factor >= 1.0);
+    num_to_add = static_cast<uint32_t>(num_slots / factor);
+  }
+  if (kHomogeneous) {
+    // Even when standard filter construction would succeed, we might
+    // have loaded things up too much for Homogeneous filter. (Complete
+    // explanation not known but observed empirically.) This seems to
+    // correct for that, mostly affecting small filter configurations.
+    if (num_to_add >= 8) {
+      num_to_add -= 8;
+    } else {
+      assert(false);
+    }
+  }
+  return num_to_add;
+}
+
+template <ConstructionFailureChance kCfc, uint64_t kCoeffBits, bool kUseSmash,
+          bool kHomogeneous>
+uint32_t BandingConfigHelper1MaybeSupported<
+    kCfc, kCoeffBits, kUseSmash, kHomogeneous,
+    true /* kIsSupported */>::GetNumSlots(uint32_t num_to_add) {
+  using Data = detail::BandingConfigHelperData<kCfc, kCoeffBits, kUseSmash>;
+
+  if (num_to_add == 0) {
+    return 0;
+  }
+  if (kHomogeneous) {
+    // Reverse of above in GetNumToAdd
+    num_to_add += 8;
+  }
+  double log2_num_to_add = std::log(num_to_add) * 1.4426950409;
+  uint32_t approx_log2_slots = static_cast<uint32_t>(log2_num_to_add + 0.5);
+  assert(approx_log2_slots <= 32);  // help clang-analyze
+
+  double lower_num_to_add = Data::GetNumToAddForPow2(approx_log2_slots);
+  double upper_num_to_add;
+  if (approx_log2_slots == 0 || lower_num_to_add == /* unsupported */ 0) {
+    // Return minimum non-zero slots in standard implementation
+    return kUseSmash ? kCoeffBits : 2 * kCoeffBits;
+  } else if (num_to_add < lower_num_to_add) {
+    upper_num_to_add = lower_num_to_add;
+    --approx_log2_slots;
+    lower_num_to_add = Data::GetNumToAddForPow2(approx_log2_slots);
+  } else {
+    upper_num_to_add = Data::GetNumToAddForPow2(approx_log2_slots + 1);
+  }
+
+  assert(num_to_add >= lower_num_to_add);
+  assert(num_to_add < upper_num_to_add);
+
+  double upper_portion =
+      (num_to_add - lower_num_to_add) / (upper_num_to_add - lower_num_to_add);
+
+  double lower_num_slots = 1.0 * (uint64_t{1} << approx_log2_slots);
+
+  // Interpolation, round up
+  return static_cast<uint32_t>(upper_portion * lower_num_slots +
+                               lower_num_slots + 0.999999999);
+}
+
+// These explicit instantiations enable us to hide most of the
+// implementation details from the .h file. (The .h file currently
+// needs to determine whether settings are "supported" or not.)
+
+template struct BandingConfigHelper1MaybeSupported<kOneIn2, 128U, /*sm*/ false,
+                                                   /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn2, 128U, /*sm*/ true,
+                                                   /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn2, 128U, /*sm*/ false,
+                                                   /*hm*/ true, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn2, 128U, /*sm*/ true,
+                                                   /*hm*/ true, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn2, 64U, /*sm*/ false,
+                                                   /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn2, 64U, /*sm*/ true,
+                                                   /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn2, 64U, /*sm*/ false,
+                                                   /*hm*/ true, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn2, 64U, /*sm*/ true,
+                                                   /*hm*/ true, /*sup*/ true>;
+
+template struct BandingConfigHelper1MaybeSupported<kOneIn20, 128U, /*sm*/ false,
+                                                   /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn20, 128U, /*sm*/ true,
+                                                   /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn20, 128U, /*sm*/ false,
+                                                   /*hm*/ true, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn20, 128U, /*sm*/ true,
+                                                   /*hm*/ true, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn20, 64U, /*sm*/ false,
+                                                   /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn20, 64U, /*sm*/ true,
+                                                   /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn20, 64U, /*sm*/ false,
+                                                   /*hm*/ true, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn20, 64U, /*sm*/ true,
+                                                   /*hm*/ true, /*sup*/ true>;
+
+template struct BandingConfigHelper1MaybeSupported<
+    kOneIn1000, 128U, /*sm*/ false, /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<
+    kOneIn1000, 128U, /*sm*/ true, /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<
+    kOneIn1000, 128U, /*sm*/ false, /*hm*/ true, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<
+    kOneIn1000, 128U, /*sm*/ true, /*hm*/ true, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<
+    kOneIn1000, 64U, /*sm*/ false, /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn1000, 64U, /*sm*/ true,
+                                                   /*hm*/ false, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<
+    kOneIn1000, 64U, /*sm*/ false, /*hm*/ true, /*sup*/ true>;
+template struct BandingConfigHelper1MaybeSupported<kOneIn1000, 64U, /*sm*/ true,
+                                                   /*hm*/ true, /*sup*/ true>;
+
+}  // namespace detail
+
+}  // namespace ribbon
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/ribbon_config.h b/src/rocksdb/util/ribbon_config.h
new file mode 100644
index 000000000..0e3edf073
--- /dev/null
+++ b/src/rocksdb/util/ribbon_config.h
@@ -0,0 +1,182 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+
+#include "port/lang.h"  // for FALLTHROUGH_INTENDED
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace ribbon {
+
+// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
+//
+// ribbon_config.h: APIs for relating numbers of slots with numbers of
+// additions for tolerable construction failure probabilities. This is
+// separate from ribbon_impl.h because it might not be needed for
+// some applications.
+//
+// This API assumes uint32_t for number of slots, as a single Ribbon
+// linear system should not normally overflow that without big penalties.
+//
+// Template parameter kCoeffBits uses uint64_t for convenience in case it
+// comes from size_t.
+//
+// Most of the complexity here is trying to optimize speed and
+// compiled code size, using templates to minimize table look-ups and
+// the compiled size of all linked look-up tables. Look-up tables are
+// required because we don't have good formulas, and the data comes
+// from running FindOccupancy in ribbon_test.
+
+// Represents a chosen chance of successful Ribbon construction for a single
+// seed. Allowing higher chance of failed construction can reduce space
+// overhead but takes extra time in construction.
+enum ConstructionFailureChance {
+  kOneIn2,
+  kOneIn20,
+  // When using kHomogeneous==true, construction failure chance should
+  // not generally exceed target FP rate, so it unlikely useful to
+  // allow a higher "failure" chance. In some cases, even more overhead
+  // is appropriate. (TODO)
+  kOneIn1000,
+};
+
+namespace detail {
+
+// It is useful to compile ribbon_test linking to BandingConfigHelper with
+// settings for which we do not have configuration data, as long as we don't
+// run the code. This template hack supports that.
+template <ConstructionFailureChance kCfc, uint64_t kCoeffBits, bool kUseSmash,
+          bool kHomogeneous, bool kIsSupported>
+struct BandingConfigHelper1MaybeSupported {
+ public:
+  static uint32_t GetNumToAdd(uint32_t num_slots) {
+    // Unsupported
+    assert(num_slots == 0);
+    (void)num_slots;
+    return 0;
+  }
+
+  static uint32_t GetNumSlots(uint32_t num_to_add) {
+    // Unsupported
+    assert(num_to_add == 0);
+    (void)num_to_add;
+    return 0;
+  }
+};
+
+// Base class for BandingConfigHelper1 and helper for BandingConfigHelper
+// with core implementations built on above data
+template <ConstructionFailureChance kCfc, uint64_t kCoeffBits, bool kUseSmash,
+          bool kHomogeneous>
+struct BandingConfigHelper1MaybeSupported<
+    kCfc, kCoeffBits, kUseSmash, kHomogeneous, true /* kIsSupported */> {
+ public:
+  // See BandingConfigHelper1. Implementation in ribbon_config.cc
+  static uint32_t GetNumToAdd(uint32_t num_slots);
+
+  // See BandingConfigHelper1. Implementation in ribbon_config.cc
+  static uint32_t GetNumSlots(uint32_t num_to_add);
+};
+
+}  // namespace detail
+
+template <ConstructionFailureChance kCfc, uint64_t kCoeffBits, bool kUseSmash,
+          bool kHomogeneous>
+struct BandingConfigHelper1
+    : public detail::BandingConfigHelper1MaybeSupported<
+          kCfc, kCoeffBits, kUseSmash, kHomogeneous,
+          /* kIsSupported */ kCoeffBits == 64 || kCoeffBits == 128> {
+ public:
+  // Returns a number of entries that can be added to a given number of
+  // slots, with roughly kCfc chance of construction failure per seed,
+  // or better. Does NOT do rounding for InterleavedSoln; call
+  // RoundUpNumSlots for that.
+  //
+  // inherited:
+  // static uint32_t GetNumToAdd(uint32_t num_slots);
+
+  // Returns a number of slots for a given number of entries to add
+  // that should have roughly kCfc chance of construction failure per
+  // seed, or better. Does NOT do rounding for InterleavedSoln; call
+  // RoundUpNumSlots for that.
+  //
+  // num_to_add should not exceed roughly 2/3rds of the maximum value
+  // of the uint32_t type to avoid overflow.
+  //
+  // inherited:
+  // static uint32_t GetNumSlots(uint32_t num_to_add);
+};
+
+// Configured using TypesAndSettings as in ribbon_impl.h
+template <ConstructionFailureChance kCfc, class TypesAndSettings>
+struct BandingConfigHelper1TS
+    : public BandingConfigHelper1<
+          kCfc,
+          /* kCoeffBits */ sizeof(typename TypesAndSettings::CoeffRow) * 8U,
+          TypesAndSettings::kUseSmash, TypesAndSettings::kHomogeneous> {};
+
+// Like BandingConfigHelper1TS except failure chance can be a runtime rather
+// than compile time value.
+template <class TypesAndSettings>
+struct BandingConfigHelper {
+ public:
+  static constexpr ConstructionFailureChance kDefaultFailureChance =
+      TypesAndSettings::kHomogeneous ? kOneIn1000 : kOneIn20;
+
+  static uint32_t GetNumToAdd(
+      uint32_t num_slots,
+      ConstructionFailureChance max_failure = kDefaultFailureChance) {
+    switch (max_failure) {
+      default:
+        assert(false);
+        FALLTHROUGH_INTENDED;
+      case kOneIn20: {
+        using H1 = BandingConfigHelper1TS<kOneIn20, TypesAndSettings>;
+        return H1::GetNumToAdd(num_slots);
+      }
+      case kOneIn2: {
+        using H1 = BandingConfigHelper1TS<kOneIn2, TypesAndSettings>;
+        return H1::GetNumToAdd(num_slots);
+      }
+      case kOneIn1000: {
+        using H1 = BandingConfigHelper1TS<kOneIn1000, TypesAndSettings>;
+        return H1::GetNumToAdd(num_slots);
+      }
+    }
+  }
+
+  static uint32_t GetNumSlots(
+      uint32_t num_to_add,
+      ConstructionFailureChance max_failure = kDefaultFailureChance) {
+    switch (max_failure) {
+      default:
+        assert(false);
+        FALLTHROUGH_INTENDED;
+      case kOneIn20: {
+        using H1 = BandingConfigHelper1TS<kOneIn20, TypesAndSettings>;
+        return H1::GetNumSlots(num_to_add);
+      }
+      case kOneIn2: {
+        using H1 = BandingConfigHelper1TS<kOneIn2, TypesAndSettings>;
+        return H1::GetNumSlots(num_to_add);
+      }
+      case kOneIn1000: {
+        using H1 = BandingConfigHelper1TS<kOneIn1000, TypesAndSettings>;
+        return H1::GetNumSlots(num_to_add);
+      }
+    }
+  }
+};
+
+}  // namespace ribbon
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/ribbon_impl.h b/src/rocksdb/util/ribbon_impl.h
new file mode 100644
index 000000000..0afecc67d
--- /dev/null
+++ b/src/rocksdb/util/ribbon_impl.h
@@ -0,0 +1,1137 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cmath>
+
+#include "port/port.h"  // for PREFETCH
+#include "util/fastrange.h"
+#include "util/ribbon_alg.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace ribbon {
+
+// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
+//
+// ribbon_impl.h: templated (parameterized) standard implementations
+//
+// Ribbon is a Perfect Hash Static Function construction useful as a compact
+// static Bloom filter alternative. See ribbon_alg.h for core algorithms
+// and core design details.
+//
+// TODO: more details on trade-offs and practical issues.
+//
+// APIs for configuring Ribbon are in ribbon_config.h
+
+// Ribbon implementations in this file take these parameters, which must be
+// provided in a class/struct type with members expressed in this concept:
+
+// concept TypesAndSettings {
+//   // See RibbonTypes and *Hasher in ribbon_alg.h, except here we have
+//   // the added constraint that Hash be equivalent to either uint32_t or
+//   // uint64_t.
+//   typename Hash;
+//   typename CoeffRow;
+//   typename ResultRow;
+//   typename Index;
+//   typename Key;
+//   static constexpr bool kFirstCoeffAlwaysOne;
+//
+//   // An unsigned integer type for identifying a hash seed, typically
+//   // uint32_t or uint64_t. Importantly, this is the amount of data
+//   // stored in memory for identifying a raw seed. See StandardHasher.
+//   typename Seed;
+//
+//   // When true, the PHSF implements a static filter, expecting just
+//   // keys as inputs for construction. When false, implements a general
+//   // PHSF and expects std::pair<Key, ResultRow> as inputs for
+//   // construction.
+//   static constexpr bool kIsFilter;
+//
+//   // When true, enables a special "homogeneous" filter implementation that
+//   // is slightly faster to construct, and never fails to construct though
+//   // FP rate can quickly explode in cases where corresponding
+//   // non-homogeneous filter would fail (or nearly fail?) to construct.
+//   // For smaller filters, you can configure with ConstructionFailureChance
+//   // smaller than desired FP rate to largely counteract this effect.
+//   // TODO: configuring Homogeneous Ribbon for arbitrarily large filters
+//   // based on data from OptimizeHomogAtScale
+//   static constexpr bool kHomogeneous;
+//
+//   // When true, adds a tiny bit more hashing logic on queries and
+//   // construction to improve utilization at the beginning and end of
+//   // the structure.  Recommended when CoeffRow is only 64 bits (or
+//   // less), so typical num_starts < 10k. Although this is compatible
+//   // with kHomogeneous, the competing space vs. time priorities might
+//   // not be useful.
+//   static constexpr bool kUseSmash;
+//
+//   // When true, allows number of "starts" to be zero, for best support
+//   // of the "no keys to add" case by always returning false for filter
+//   // queries. (This is distinct from the "keys added but no space for
+//   // any data" case, in which a filter always returns true.) The cost
+//   // supporting this is a conditional branch (probably predictable) in
+//   // queries.
+//   static constexpr bool kAllowZeroStarts;
+//
+//   // A seedable stock hash function on Keys. All bits of Hash must
+//   // be reasonably high quality. XXH functions recommended, but
+//   // Murmur, City, Farm, etc. also work.
+//   static Hash HashFn(const Key &, Seed raw_seed);
+// };
+
+// A bit of a hack to automatically construct the type for
+// AddInput based on a constexpr bool.
+template <typename Key, typename ResultRow, bool IsFilter>
+struct AddInputSelector {
+  // For general PHSF, not filter
+  using T = std::pair<Key, ResultRow>;
+};
+
+template <typename Key, typename ResultRow>
+struct AddInputSelector<Key, ResultRow, true /*IsFilter*/> {
+  // For Filter
+  using T = Key;
+};
+
+// To avoid writing 'typename' everywhere that we use types like 'Index'
+#define IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings)                   \
+  using CoeffRow = typename TypesAndSettings::CoeffRow;                      \
+  using ResultRow = typename TypesAndSettings::ResultRow;                    \
+  using Index = typename TypesAndSettings::Index;                            \
+  using Hash = typename TypesAndSettings::Hash;                              \
+  using Key = typename TypesAndSettings::Key;                                \
+  using Seed = typename TypesAndSettings::Seed;                              \
+                                                                             \
+  /* Some more additions */                                                  \
+  using QueryInput = Key;                                                    \
+  using AddInput = typename ROCKSDB_NAMESPACE::ribbon::AddInputSelector<     \
+      Key, ResultRow, TypesAndSettings::kIsFilter>::T;                       \
+  static constexpr auto kCoeffBits =                                         \
+      static_cast<Index>(sizeof(CoeffRow) * 8U);                             \
+                                                                             \
+  /* Export to algorithm */                                                  \
+  static constexpr bool kFirstCoeffAlwaysOne =                               \
+      TypesAndSettings::kFirstCoeffAlwaysOne;                                \
+                                                                             \
+  static_assert(sizeof(CoeffRow) + sizeof(ResultRow) + sizeof(Index) +       \
+                        sizeof(Hash) + sizeof(Key) + sizeof(Seed) +          \
+                        sizeof(QueryInput) + sizeof(AddInput) + kCoeffBits + \
+                        kFirstCoeffAlwaysOne >                               \
+                    0,                                                       \
+                "avoid unused warnings, semicolon expected after macro call")
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4309)  // cast truncating constant
+#pragma warning(disable : 4307)  // arithmetic constant overflow
+#endif
+
+// StandardHasher: A standard implementation of concepts RibbonTypes,
+// PhsfQueryHasher, FilterQueryHasher, and BandingHasher from ribbon_alg.h.
+//
+// This implementation should be suitable for most all practical purposes
+// as it "behaves" across a wide range of settings, with little room left
+// for improvement. The key functionality in this hasher is generating
+// CoeffRows, starts, and (for filters) ResultRows, which could be ~150
+// bits of data or more, from a modest hash of 64 or even just 32 bits, with
+// enough uniformity and bitwise independence to be close to "the best you
+// can do" with available hash information in terms of FP rate and
+// compactness. (64 bits recommended and sufficient for PHSF practical
+// purposes.)
+//
+// Another feature of this hasher is a minimal "premixing" of seeds before
+// they are provided to TypesAndSettings::HashFn in case that function does
+// not provide sufficiently independent hashes when iterating merely
+// sequentially on seeds. (This for example works around a problem with the
+// preview version 0.7.2 of XXH3 used in RocksDB, a.k.a. XXPH3 or Hash64, and
+// MurmurHash1 used in RocksDB, a.k.a. Hash.) We say this pre-mixing step
+// translates "ordinal seeds," which we iterate sequentially to find a
+// solution, into "raw seeds," with many more bits changing for each
+// iteration. The translation is an easily reversible lightweight mixing,
+// not suitable for hashing on its own. An advantage of this approach is that
+// StandardHasher can store just the raw seed (e.g. 64 bits) for fast query
+// times, while from the application perspective, we can limit to a small
+// number of ordinal keys (e.g. 64 in 6 bits) for saving in metadata.
+//
+// The default constructor initializes the seed to ordinal seed zero, which
+// is equal to raw seed zero.
+//
+template <class TypesAndSettings>
+class StandardHasher {
+ public:
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
+
+  inline Hash GetHash(const Key& key) const {
+    return TypesAndSettings::HashFn(key, raw_seed_);
+  };
+  // For when AddInput == pair<Key, ResultRow> (kIsFilter == false)
+  inline Hash GetHash(const std::pair<Key, ResultRow>& bi) const {
+    return GetHash(bi.first);
+  };
+  inline Index GetStart(Hash h, Index num_starts) const {
+    // This is "critical path" code because it's required before memory
+    // lookup.
+    //
+    // FastRange gives us a fast and effective mapping from h to the
+    // appropriate range. This depends most, sometimes exclusively, on
+    // upper bits of h.
+    //
+    if (TypesAndSettings::kUseSmash) {
+      // Extra logic to "smash" entries at beginning and end, for
+      // better utilization. For example, without smash and with
+      // kFirstCoeffAlwaysOne, there's about a 30% chance that the
+      // first slot in the banding will be unused, and worse without
+      // kFirstCoeffAlwaysOne. The ending slots are even less utilized
+      // without smash.
+      //
+      // But since this only affects roughly kCoeffBits of the slots,
+      // it's usually small enough to be ignorable (less computation in
+      // this function) when number of slots is roughly 10k or larger.
+      //
+      // The best values for these smash weights might depend on how
+      // densely you're packing entries, and also kCoeffBits, but this
+      // seems to work well for roughly 95% success probability.
+      //
+      constexpr Index kFrontSmash = kCoeffBits / 4;
+      constexpr Index kBackSmash = kCoeffBits / 4;
+      Index start = FastRangeGeneric(h, num_starts + kFrontSmash + kBackSmash);
+      start = std::max(start, kFrontSmash);
+      start -= kFrontSmash;
+      start = std::min(start, num_starts - 1);
+      return start;
+    } else {
+      // For query speed, we allow small number of initial and final
+      // entries to be under-utilized.
+      // NOTE: This call statically enforces that Hash is equivalent to
+      // either uint32_t or uint64_t.
+      return FastRangeGeneric(h, num_starts);
+    }
+  }
+  inline CoeffRow GetCoeffRow(Hash h) const {
+    // This is not so much "critical path" code because it can be done in
+    // parallel (instruction level) with memory lookup.
+    //
+    // When we might have many entries squeezed into a single start,
+    // we need reasonably good remixing for CoeffRow.
+    if (TypesAndSettings::kUseSmash) {
+      // Reasonably good, reasonably fast, reasonably general.
+      // Probably not 1:1 but probably close enough.
+      Unsigned128 a = Multiply64to128(h, kAltCoeffFactor1);
+      Unsigned128 b = Multiply64to128(h, kAltCoeffFactor2);
+      auto cr = static_cast<CoeffRow>(b ^ (a << 64) ^ (a >> 64));
+
+      // Now ensure the value is non-zero
+      if (kFirstCoeffAlwaysOne) {
+        cr |= 1;
+      } else {
+        // Still have to ensure some bit is non-zero
+        cr |= (cr == 0) ? 1 : 0;
+      }
+      return cr;
+    }
+    // If not kUseSmash, we ensure we're not squeezing many entries into a
+    // single start, in part by ensuring num_starts > num_slots / 2. Thus,
+    // here we do not need good remixing for CoeffRow, but just enough that
+    // (a) every bit is reasonably independent from Start.
+    // (b) every Hash-length bit subsequence of the CoeffRow has full or
+    // nearly full entropy from h.
+    // (c) if nontrivial bit subsequences within are correlated, it needs to
+    // be more complicated than exact copy or bitwise not (at least without
+    // kFirstCoeffAlwaysOne), or else there seems to be a kind of
+    // correlated clustering effect.
+    // (d) the CoeffRow is not zero, so that no one input on its own can
+    // doom construction success. (Preferably a mix of 1's and 0's if
+    // satisfying above.)
+
+    // First, establish sufficient bitwise independence from Start, with
+    // multiplication by a large random prime.
+    // Note that we cast to Hash because if we use product bits beyond
+    // original input size, that's going to correlate with Start (FastRange)
+    // even with a (likely) different multiplier here.
+    Hash a = h * kCoeffAndResultFactor;
+
+    static_assert(
+        sizeof(Hash) == sizeof(uint64_t) || sizeof(Hash) == sizeof(uint32_t),
+        "Supported sizes");
+    // If that's big enough, we're done. If not, we have to expand it,
+    // maybe up to 4x size.
+    uint64_t b;
+    if (sizeof(Hash) < sizeof(uint64_t)) {
+      // Almost-trivial hash expansion (OK - see above), favoring roughly
+      // equal number of 1's and 0's in result
+      b = (uint64_t{a} << 32) ^ (a ^ kCoeffXor32);
+    } else {
+      b = a;
+    }
+    static_assert(sizeof(CoeffRow) <= sizeof(Unsigned128), "Supported sizes");
+    Unsigned128 c;
+    if (sizeof(uint64_t) < sizeof(CoeffRow)) {
+      // Almost-trivial hash expansion (OK - see above), favoring roughly
+      // equal number of 1's and 0's in result
+      c = (Unsigned128{b} << 64) ^ (b ^ kCoeffXor64);
+    } else {
+      c = b;
+    }
+    auto cr = static_cast<CoeffRow>(c);
+
+    // Now ensure the value is non-zero
+    if (kFirstCoeffAlwaysOne) {
+      cr |= 1;
+    } else if (sizeof(CoeffRow) == sizeof(Hash)) {
+      // Still have to ensure some bit is non-zero
+      cr |= (cr == 0) ? 1 : 0;
+    } else {
+      // (We did trivial expansion with constant xor, which ensures some
+      // bits are non-zero.)
+    }
+    return cr;
+  }
+  inline ResultRow GetResultRowMask() const {
+    // TODO: will be used with InterleavedSolutionStorage?
+    // For now, all bits set (note: might be a small type so might need to
+    // narrow after promotion)
+    return static_cast<ResultRow>(~ResultRow{0});
+  }
+  inline ResultRow GetResultRowFromHash(Hash h) const {
+    if (TypesAndSettings::kIsFilter && !TypesAndSettings::kHomogeneous) {
+      // This is not so much "critical path" code because it can be done in
+      // parallel (instruction level) with memory lookup.
+      //
+      // ResultRow bits only needs to be independent from CoeffRow bits if
+      // many entries might have the same start location, where "many" is
+      // comparable to number of hash bits or kCoeffBits. If !kUseSmash
+      // and num_starts > kCoeffBits, it is safe and efficient to draw from
+      // the same bits computed for CoeffRow, which are reasonably
+      // independent from Start. (Inlining and common subexpression
+      // elimination with GetCoeffRow should make this
+      // a single shared multiplication in generated code when !kUseSmash.)
+      Hash a = h * kCoeffAndResultFactor;
+
+      // The bits here that are *most* independent of Start are the highest
+      // order bits (as in Knuth multiplicative hash). To make those the
+      // most preferred for use in the result row, we do a bswap here.
+      auto rr = static_cast<ResultRow>(EndianSwapValue(a));
+      return rr & GetResultRowMask();
+    } else {
+      // Must be zero
+      return 0;
+    }
+  }
+  // For when AddInput == Key (kIsFilter == true)
+  inline ResultRow GetResultRowFromInput(const Key&) const {
+    // Must be zero
+    return 0;
+  }
+  // For when AddInput == pair<Key, ResultRow> (kIsFilter == false)
+  inline ResultRow GetResultRowFromInput(
+      const std::pair<Key, ResultRow>& bi) const {
+    // Simple extraction
+    return bi.second;
+  }
+
+  // Seed tracking APIs - see class comment
+  void SetRawSeed(Seed seed) { raw_seed_ = seed; }
+  Seed GetRawSeed() { return raw_seed_; }
+  void SetOrdinalSeed(Seed count) {
+    // A simple, reversible mixing of any size (whole bytes) up to 64 bits.
+    // This allows casting the raw seed to any smaller size we use for
+    // ordinal seeds without risk of duplicate raw seeds for unique ordinal
+    // seeds.
+
+    // Seed type might be smaller than numerical promotion size, but Hash
+    // should be at least that size, so we use Hash as intermediate type.
+    static_assert(sizeof(Seed) <= sizeof(Hash),
+                  "Hash must be at least size of Seed");
+
+    // Multiply by a large random prime (one-to-one for any prefix of bits)
+    Hash tmp = count * kToRawSeedFactor;
+    // Within-byte one-to-one mixing
+    static_assert((kSeedMixMask & (kSeedMixMask >> kSeedMixShift)) == 0,
+                  "Illegal mask+shift");
+    tmp ^= (tmp & kSeedMixMask) >> kSeedMixShift;
+    raw_seed_ = static_cast<Seed>(tmp);
+    // dynamic verification
+    assert(GetOrdinalSeed() == count);
+  }
+  Seed GetOrdinalSeed() {
+    Hash tmp = raw_seed_;
+    // Within-byte one-to-one mixing (its own inverse)
+    tmp ^= (tmp & kSeedMixMask) >> kSeedMixShift;
+    // Multiply by 64-bit multiplicative inverse
+    static_assert(kToRawSeedFactor * kFromRawSeedFactor == Hash{1},
+                  "Must be inverses");
+    return static_cast<Seed>(tmp * kFromRawSeedFactor);
+  }
+
+ protected:
+  // For expanding hash:
+  // large random prime
+  static constexpr Hash kCoeffAndResultFactor =
+      static_cast<Hash>(0xc28f82822b650bedULL);
+  static constexpr uint64_t kAltCoeffFactor1 = 0x876f170be4f1fcb9U;
+  static constexpr uint64_t kAltCoeffFactor2 = 0xf0433a4aecda4c5fU;
+  // random-ish data
+  static constexpr uint32_t kCoeffXor32 = 0xa6293635U;
+  static constexpr uint64_t kCoeffXor64 = 0xc367844a6e52731dU;
+
+  // For pre-mixing seeds
+  static constexpr Hash kSeedMixMask = static_cast<Hash>(0xf0f0f0f0f0f0f0f0ULL);
+  static constexpr unsigned kSeedMixShift = 4U;
+  static constexpr Hash kToRawSeedFactor =
+      static_cast<Hash>(0xc78219a23eeadd03ULL);
+  static constexpr Hash kFromRawSeedFactor =
+      static_cast<Hash>(0xfe1a137d14b475abULL);
+
+  // See class description
+  Seed raw_seed_ = 0;
+};
+
+// StandardRehasher (and StandardRehasherAdapter): A variant of
+// StandardHasher that uses the same type for keys as for hashes.
+// This is primarily intended for building a Ribbon filter
+// from existing hashes without going back to original inputs in
+// order to apply a different seed. This hasher seeds a 1-to-1 mixing
+// transformation to apply a seed to an existing hash. (Untested for
+// hash-sized keys that are not already uniformly distributed.) This
+// transformation builds on the seed pre-mixing done in StandardHasher.
+//
+// Testing suggests essentially no degradation of solution success rate
+// vs. going back to original inputs when changing hash seeds. For example:
+// Average re-seeds for solution with r=128, 1.02x overhead, and ~100k keys
+// is about 1.10 for both StandardHasher and StandardRehasher.
+//
+// StandardRehasher is not really recommended for general PHSFs (not
+// filters) because a collision in the original hash could prevent
+// construction despite re-seeding the Rehasher. (Such collisions
+// do not interfere with filter construction.)
+//
+// concept RehasherTypesAndSettings: like TypesAndSettings but
+// does not require Key or HashFn.
+template <class RehasherTypesAndSettings>
+class StandardRehasherAdapter : public RehasherTypesAndSettings {
+ public:
+  using Hash = typename RehasherTypesAndSettings::Hash;
+  using Key = Hash;
+  using Seed = typename RehasherTypesAndSettings::Seed;
+
+  static Hash HashFn(const Hash& input, Seed raw_seed) {
+    // Note: raw_seed is already lightly pre-mixed, and this multiplication
+    // by a large prime is sufficient mixing (low-to-high bits) on top of
+    // that for good FastRange results, which depends primarily on highest
+    // bits. (The hashed CoeffRow and ResultRow are less sensitive to
+    // mixing than Start.)
+    // Also note: did consider adding ^ (input >> some) before the
+    // multiplication, but doesn't appear to be necessary.
+    return (input ^ raw_seed) * kRehashFactor;
+  }
+
+ private:
+  static constexpr Hash kRehashFactor =
+      static_cast<Hash>(0x6193d459236a3a0dULL);
+};
+
+// See comment on StandardRehasherAdapter
+template <class RehasherTypesAndSettings>
+using StandardRehasher =
+    StandardHasher<StandardRehasherAdapter<RehasherTypesAndSettings>>;
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Especially with smaller hashes (e.g. 32 bit), there can be noticeable
+// false positives due to collisions in the Hash returned by GetHash.
+// This function returns the expected FP rate due to those collisions,
+// which can be added to the expected FP rate from the underlying data
+// structure. (Note: technically, a + b is only a good approximation of
+// 1-(1-a)(1-b) == a + b - a*b, if a and b are much closer to 0 than to 1.)
+// The number of entries added can be a double here in case it's an
+// average.
+template <class Hasher, typename Numerical>
+double ExpectedCollisionFpRate(const Hasher& hasher, Numerical added) {
+  // Standardize on the 'double' specialization
+  return ExpectedCollisionFpRate(hasher, 1.0 * added);
+}
+template <class Hasher>
+double ExpectedCollisionFpRate(const Hasher& /*hasher*/, double added) {
+  // Technically, there could be overlap among the added, but ignoring that
+  // is typically close enough.
+  return added / std::pow(256.0, sizeof(typename Hasher::Hash));
+}
+
+// StandardBanding: a canonical implementation of BandingStorage and
+// BacktrackStorage, with convenience API for banding (solving with on-the-fly
+// Gaussian elimination) with and without backtracking.
+template <class TypesAndSettings>
+class StandardBanding : public StandardHasher<TypesAndSettings> {
+ public:
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
+
+  StandardBanding(Index num_slots = 0, Index backtrack_size = 0) {
+    Reset(num_slots, backtrack_size);
+  }
+
+  void Reset(Index num_slots, Index backtrack_size = 0) {
+    if (num_slots == 0) {
+      // Unusual (TypesAndSettings::kAllowZeroStarts) or "uninitialized"
+      num_starts_ = 0;
+    } else {
+      // Normal
+      assert(num_slots >= kCoeffBits);
+      if (num_slots > num_slots_allocated_) {
+        coeff_rows_.reset(new CoeffRow[num_slots]());
+        if (!TypesAndSettings::kHomogeneous) {
+          // Note: don't strictly have to zero-init result_rows,
+          // except possible information leakage, etc ;)
+          result_rows_.reset(new ResultRow[num_slots]());
+        }
+        num_slots_allocated_ = num_slots;
+      } else {
+        for (Index i = 0; i < num_slots; ++i) {
+          coeff_rows_[i] = 0;
+          if (!TypesAndSettings::kHomogeneous) {
+            // Note: don't strictly have to zero-init result_rows,
+            // except possible information leakage, etc ;)
+            result_rows_[i] = 0;
+          }
+        }
+      }
+      num_starts_ = num_slots - kCoeffBits + 1;
+    }
+    EnsureBacktrackSize(backtrack_size);
+  }
+
+  void EnsureBacktrackSize(Index backtrack_size) {
+    if (backtrack_size > backtrack_size_) {
+      backtrack_.reset(new Index[backtrack_size]);
+      backtrack_size_ = backtrack_size;
+    }
+  }
+
+  // ********************************************************************
+  // From concept BandingStorage
+
+  inline bool UsePrefetch() const {
+    // A rough guesstimate of when prefetching during construction pays off.
+    // TODO: verify/validate
+    return num_starts_ > 1500;
+  }
+  inline void Prefetch(Index i) const {
+    PREFETCH(&coeff_rows_[i], 1 /* rw */, 1 /* locality */);
+    if (!TypesAndSettings::kHomogeneous) {
+      PREFETCH(&result_rows_[i], 1 /* rw */, 1 /* locality */);
+    }
+  }
+  inline void LoadRow(Index i, CoeffRow* cr, ResultRow* rr,
+                      bool for_back_subst) const {
+    *cr = coeff_rows_[i];
+    if (TypesAndSettings::kHomogeneous) {
+      if (for_back_subst && *cr == 0) {
+        // Cheap pseudorandom data to fill unconstrained solution rows
+        *rr = static_cast<ResultRow>(i * 0x9E3779B185EBCA87ULL);
+      } else {
+        *rr = 0;
+      }
+    } else {
+      *rr = result_rows_[i];
+    }
+  }
+  inline void StoreRow(Index i, CoeffRow cr, ResultRow rr) {
+    coeff_rows_[i] = cr;
+    if (TypesAndSettings::kHomogeneous) {
+      assert(rr == 0);
+    } else {
+      result_rows_[i] = rr;
+    }
+  }
+  inline Index GetNumStarts() const { return num_starts_; }
+
+  // from concept BacktrackStorage, for when backtracking is used
+  inline bool UseBacktrack() const { return true; }
+  inline void BacktrackPut(Index i, Index to_save) { backtrack_[i] = to_save; }
+  inline Index BacktrackGet(Index i) const { return backtrack_[i]; }
+
+  // ********************************************************************
+  // Some useful API, still somewhat low level. Here an input is
+  // a Key for filters, or std::pair<Key, ResultRow> for general PHSF.
+
+  // Adds a range of inputs to the banding, returning true if successful.
+  // False means none or some may have been successfully added, so it's
+  // best to Reset this banding before any further use.
+  //
+  // Adding can fail even before all the "slots" are completely "full".
+  //
+  template <typename InputIterator>
+  bool AddRange(InputIterator begin, InputIterator end) {
+    assert(num_starts_ > 0 || TypesAndSettings::kAllowZeroStarts);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual. Can't add any in this case.
+      return begin == end;
+    }
+    // Normal
+    return BandingAddRange(this, *this, begin, end);
+  }
+
+  // Adds a range of inputs to the banding, returning true if successful,
+  // or if unsuccessful, rolls back to state before this call and returns
+  // false. Caller guarantees that the number of inputs in this batch
+  // does not exceed `backtrack_size` provided to Reset.
+  //
+  // Adding can fail even before all the "slots" are completely "full".
+  //
+  template <typename InputIterator>
+  bool AddRangeOrRollBack(InputIterator begin, InputIterator end) {
+    assert(num_starts_ > 0 || TypesAndSettings::kAllowZeroStarts);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual. Can't add any in this case.
+      return begin == end;
+    }
+    // else Normal
+    return BandingAddRange(this, this, *this, begin, end);
+  }
+
+  // Adds a single input to the banding, returning true if successful.
+  // If unsuccessful, returns false and banding state is unchanged.
+  //
+  // Adding can fail even before all the "slots" are completely "full".
+  //
+  bool Add(const AddInput& input) {
+    // Pointer can act as iterator
+    return AddRange(&input, &input + 1);
+  }
+
+  // Return the number of "occupied" rows (with non-zero coefficients stored).
+  Index GetOccupiedCount() const {
+    Index count = 0;
+    if (num_starts_ > 0) {
+      const Index num_slots = num_starts_ + kCoeffBits - 1;
+      for (Index i = 0; i < num_slots; ++i) {
+        if (coeff_rows_[i] != 0) {
+          ++count;
+        }
+      }
+    }
+    return count;
+  }
+
+  // Returns whether a row is "occupied" in the banding (non-zero
+  // coefficients stored). (Only recommended for debug/test)
+  bool IsOccupied(Index i) { return coeff_rows_[i] != 0; }
+
+  // ********************************************************************
+  // High-level API
+
+  // Iteratively (a) resets the structure for `num_slots`, (b) attempts
+  // to add the range of inputs, and (c) if unsuccessful, chooses next
+  // hash seed, until either successful or unsuccessful with all the
+  // allowed seeds. Returns true if successful. In that case, use
+  // GetOrdinalSeed() or GetRawSeed() to get the successful seed.
+  //
+  // The allowed sequence of hash seeds is determined by
+  // `starting_ordinal_seed,` the first ordinal seed to be attempted
+  // (see StandardHasher), and `ordinal_seed_mask,` a bit mask (power of
+  // two minus one) for the range of ordinal seeds to consider. The
+  // max number of seeds considered will be ordinal_seed_mask + 1.
+  // For filters we suggest `starting_ordinal_seed` be chosen randomly
+  // or round-robin, to minimize false positive correlations between keys.
+  //
+  // If unsuccessful, how best to continue is going to be application
+  // specific. It should be possible to choose parameters such that
+  // failure is extremely unlikely, using max_seed around 32 to 64.
+  // (TODO: APIs to help choose parameters) One option for fallback in
+  // constructing a filter is to construct a Bloom filter instead.
+  // Increasing num_slots is an option, but should not be used often
+  // unless construction maximum latency is a concern (rather than
+  // average running time of construction). Instead, choose parameters
+  // appropriately and trust that seeds are independent. (Also,
+  // increasing num_slots without changing hash seed would have a
+  // significant correlation in success, rather than independence.)
+  template <typename InputIterator>
+  bool ResetAndFindSeedToSolve(Index num_slots, InputIterator begin,
+                               InputIterator end,
+                               Seed starting_ordinal_seed = 0U,
+                               Seed ordinal_seed_mask = 63U) {
+    // power of 2 minus 1
+    assert((ordinal_seed_mask & (ordinal_seed_mask + 1)) == 0);
+    // starting seed is within mask
+    assert((starting_ordinal_seed & ordinal_seed_mask) ==
+           starting_ordinal_seed);
+    starting_ordinal_seed &= ordinal_seed_mask;  // if not debug
+
+    Seed cur_ordinal_seed = starting_ordinal_seed;
+    do {
+      StandardHasher<TypesAndSettings>::SetOrdinalSeed(cur_ordinal_seed);
+      Reset(num_slots);
+      bool success = AddRange(begin, end);
+      if (success) {
+        return true;
+      }
+      cur_ordinal_seed = (cur_ordinal_seed + 1) & ordinal_seed_mask;
+    } while (cur_ordinal_seed != starting_ordinal_seed);
+    // Reached limit by circling around
+    return false;
+  }
+
+  static std::size_t EstimateMemoryUsage(uint32_t num_slots) {
+    std::size_t bytes_coeff_rows = num_slots * sizeof(CoeffRow);
+    std::size_t bytes_result_rows = num_slots * sizeof(ResultRow);
+    std::size_t bytes_backtrack = 0;
+    std::size_t bytes_banding =
+        bytes_coeff_rows + bytes_result_rows + bytes_backtrack;
+
+    return bytes_banding;
+  }
+
+ protected:
+  // TODO: explore combining in a struct
+  std::unique_ptr<CoeffRow[]> coeff_rows_;
+  std::unique_ptr<ResultRow[]> result_rows_;
+  // We generally store "starts" instead of slots for speed of GetStart(),
+  // as in StandardHasher.
+  Index num_starts_ = 0;
+  Index num_slots_allocated_ = 0;
+  std::unique_ptr<Index[]> backtrack_;
+  Index backtrack_size_ = 0;
+};
+
+// Implements concept SimpleSolutionStorage, mostly for demonstration
+// purposes. This is "in memory" only because it does not handle byte
+// ordering issues for serialization.
+template <class TypesAndSettings>
+class InMemSimpleSolution {
+ public:
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
+
+  void PrepareForNumStarts(Index num_starts) {
+    if (TypesAndSettings::kAllowZeroStarts && num_starts == 0) {
+      // Unusual
+      num_starts_ = 0;
+    } else {
+      // Normal
+      const Index num_slots = num_starts + kCoeffBits - 1;
+      assert(num_slots >= kCoeffBits);
+      if (num_slots > num_slots_allocated_) {
+        // Do not need to init the memory
+        solution_rows_.reset(new ResultRow[num_slots]);
+        num_slots_allocated_ = num_slots;
+      }
+      num_starts_ = num_starts;
+    }
+  }
+
+  Index GetNumStarts() const { return num_starts_; }
+
+  ResultRow Load(Index slot_num) const { return solution_rows_[slot_num]; }
+
+  void Store(Index slot_num, ResultRow solution_row) {
+    solution_rows_[slot_num] = solution_row;
+  }
+
+  // ********************************************************************
+  // High-level API
+
+  template <typename BandingStorage>
+  void BackSubstFrom(const BandingStorage& bs) {
+    if (TypesAndSettings::kAllowZeroStarts && bs.GetNumStarts() == 0) {
+      // Unusual
+      PrepareForNumStarts(0);
+    } else {
+      // Normal
+      SimpleBackSubst(this, bs);
+    }
+  }
+
+  template <typename PhsfQueryHasher>
+  ResultRow PhsfQuery(const Key& input, const PhsfQueryHasher& hasher) const {
+    // assert(!TypesAndSettings::kIsFilter);  Can be useful in testing
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual
+      return 0;
+    } else {
+      // Normal
+      return SimplePhsfQuery(input, hasher, *this);
+    }
+  }
+
+  template <typename FilterQueryHasher>
+  bool FilterQuery(const Key& input, const FilterQueryHasher& hasher) const {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual. Zero starts presumes no keys added -> always false
+      return false;
+    } else {
+      // Normal, or upper_num_columns_ == 0 means "no space for data" and
+      // thus will always return true.
+      return SimpleFilterQuery(input, hasher, *this);
+    }
+  }
+
+  double ExpectedFpRate() const {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual, but we don't have FPs if we always return false.
+      return 0.0;
+    }
+    // else Normal
+
+    // Each result (solution) bit (column) cuts FP rate in half
+    return std::pow(0.5, 8U * sizeof(ResultRow));
+  }
+
+  // ********************************************************************
+  // Static high-level API
+
+  // Round up to a number of slots supported by this structure. Note that
+  // this needs to be must be taken into account for the banding if this
+  // solution layout/storage is to be used.
+  static Index RoundUpNumSlots(Index num_slots) {
+    // Must be at least kCoeffBits for at least one start
+    // Or if not smash, even more because hashing not equipped
+    // for stacking up so many entries on a single start location
+    auto min_slots = kCoeffBits * (TypesAndSettings::kUseSmash ? 1 : 2);
+    return std::max(num_slots, static_cast<Index>(min_slots));
+  }
+
+ protected:
+  // We generally store "starts" instead of slots for speed of GetStart(),
+  // as in StandardHasher.
+  Index num_starts_ = 0;
+  Index num_slots_allocated_ = 0;
+  std::unique_ptr<ResultRow[]> solution_rows_;
+};
+
+// Implements concept InterleavedSolutionStorage always using little-endian
+// byte order, so easy for serialization/deserialization. This implementation
+// fully supports fractional bits per key, where any number of segments
+// (number of bytes multiple of sizeof(CoeffRow)) can be used with any number
+// of slots that is a multiple of kCoeffBits.
+//
+// The structure is passed an externally allocated/de-allocated byte buffer
+// that is optionally pre-populated (from storage) for answering queries,
+// or can be populated by BackSubstFrom.
+//
+template <class TypesAndSettings>
+class SerializableInterleavedSolution {
+ public:
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
+
+  // Does not take ownership of `data` but uses it (up to `data_len` bytes)
+  // throughout lifetime
+  SerializableInterleavedSolution(char* data, size_t data_len)
+      : data_(data), data_len_(data_len) {}
+
+  void PrepareForNumStarts(Index num_starts) {
+    assert(num_starts == 0 || (num_starts % kCoeffBits == 1));
+    num_starts_ = num_starts;
+
+    InternalConfigure();
+  }
+
+  Index GetNumStarts() const { return num_starts_; }
+
+  Index GetNumBlocks() const {
+    const Index num_slots = num_starts_ + kCoeffBits - 1;
+    return num_slots / kCoeffBits;
+  }
+
+  Index GetUpperNumColumns() const { return upper_num_columns_; }
+
+  Index GetUpperStartBlock() const { return upper_start_block_; }
+
+  Index GetNumSegments() const {
+    return static_cast<Index>(data_len_ / sizeof(CoeffRow));
+  }
+
+  CoeffRow LoadSegment(Index segment_num) const {
+    assert(data_ != nullptr);  // suppress clang analyzer report
+    return DecodeFixedGeneric<CoeffRow>(data_ + segment_num * sizeof(CoeffRow));
+  }
+  void StoreSegment(Index segment_num, CoeffRow val) {
+    assert(data_ != nullptr);  // suppress clang analyzer report
+    EncodeFixedGeneric(data_ + segment_num * sizeof(CoeffRow), val);
+  }
+  void PrefetchSegmentRange(Index begin_segment_num,
+                            Index end_segment_num) const {
+    if (end_segment_num == begin_segment_num) {
+      // Nothing to do
+      return;
+    }
+    char* cur = data_ + begin_segment_num * sizeof(CoeffRow);
+    char* last = data_ + (end_segment_num - 1) * sizeof(CoeffRow);
+    while (cur < last) {
+      PREFETCH(cur, 0 /* rw */, 1 /* locality */);
+      cur += CACHE_LINE_SIZE;
+    }
+    PREFETCH(last, 0 /* rw */, 1 /* locality */);
+  }
+
+  // ********************************************************************
+  // High-level API
+
+  void ConfigureForNumBlocks(Index num_blocks) {
+    if (num_blocks == 0) {
+      PrepareForNumStarts(0);
+    } else {
+      PrepareForNumStarts(num_blocks * kCoeffBits - kCoeffBits + 1);
+    }
+  }
+
+  void ConfigureForNumSlots(Index num_slots) {
+    assert(num_slots % kCoeffBits == 0);
+    ConfigureForNumBlocks(num_slots / kCoeffBits);
+  }
+
+  template <typename BandingStorage>
+  void BackSubstFrom(const BandingStorage& bs) {
+    if (TypesAndSettings::kAllowZeroStarts && bs.GetNumStarts() == 0) {
+      // Unusual
+      PrepareForNumStarts(0);
+    } else {
+      // Normal
+      InterleavedBackSubst(this, bs);
+    }
+  }
+
+  template <typename PhsfQueryHasher>
+  ResultRow PhsfQuery(const Key& input, const PhsfQueryHasher& hasher) const {
+    // assert(!TypesAndSettings::kIsFilter);  Can be useful in testing
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual
+      return 0;
+    } else {
+      // Normal
+      // NOTE: not using a struct to encourage compiler optimization
+      Hash hash;
+      Index segment_num;
+      Index num_columns;
+      Index start_bit;
+      InterleavedPrepareQuery(input, hasher, *this, &hash, &segment_num,
+                              &num_columns, &start_bit);
+      return InterleavedPhsfQuery(hash, segment_num, num_columns, start_bit,
+                                  hasher, *this);
+    }
+  }
+
+  template <typename FilterQueryHasher>
+  bool FilterQuery(const Key& input, const FilterQueryHasher& hasher) const {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual. Zero starts presumes no keys added -> always false
+      return false;
+    } else {
+      // Normal, or upper_num_columns_ == 0 means "no space for data" and
+      // thus will always return true.
+      // NOTE: not using a struct to encourage compiler optimization
+      Hash hash;
+      Index segment_num;
+      Index num_columns;
+      Index start_bit;
+      InterleavedPrepareQuery(input, hasher, *this, &hash, &segment_num,
+                              &num_columns, &start_bit);
+      return InterleavedFilterQuery(hash, segment_num, num_columns, start_bit,
+                                    hasher, *this);
+    }
+  }
+
+  double ExpectedFpRate() const {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual. Zero starts presumes no keys added -> always false
+      return 0.0;
+    }
+    // else Normal
+
+    // Note: Ignoring smash setting; still close enough in that case
+    double lower_portion =
+        (upper_start_block_ * 1.0 * kCoeffBits) / num_starts_;
+
+    // Each result (solution) bit (column) cuts FP rate in half. Weight that
+    // for upper and lower number of bits (columns).
+    return lower_portion * std::pow(0.5, upper_num_columns_ - 1) +
+           (1.0 - lower_portion) * std::pow(0.5, upper_num_columns_);
+  }
+
+  // ********************************************************************
+  // Static high-level API
+
+  // Round up to a number of slots supported by this structure. Note that
+  // this needs to be must be taken into account for the banding if this
+  // solution layout/storage is to be used.
+  static Index RoundUpNumSlots(Index num_slots) {
+    // Must be multiple of kCoeffBits
+    Index corrected = (num_slots + kCoeffBits - 1) / kCoeffBits * kCoeffBits;
+
+    // Do not use num_starts==1 unless kUseSmash, because the hashing
+    // might not be equipped for stacking up so many entries on a
+    // single start location.
+    if (!TypesAndSettings::kUseSmash && corrected == kCoeffBits) {
+      corrected += kCoeffBits;
+    }
+    return corrected;
+  }
+
+  // Round down to a number of slots supported by this structure. Note that
+  // this needs to be must be taken into account for the banding if this
+  // solution layout/storage is to be used.
+  static Index RoundDownNumSlots(Index num_slots) {
+    // Must be multiple of kCoeffBits
+    Index corrected = num_slots / kCoeffBits * kCoeffBits;
+
+    // Do not use num_starts==1 unless kUseSmash, because the hashing
+    // might not be equipped for stacking up so many entries on a
+    // single start location.
+    if (!TypesAndSettings::kUseSmash && corrected == kCoeffBits) {
+      corrected = 0;
+    }
+    return corrected;
+  }
+
+  // Compute the number of bytes for a given number of slots and desired
+  // FP rate. Since desired FP rate might not be exactly achievable,
+  // rounding_bias32==0 means to always round toward lower FP rate
+  // than desired (more bytes); rounding_bias32==max uint32_t means always
+  // round toward higher FP rate than desired (fewer bytes); other values
+  // act as a proportional threshold or bias between the two.
+  static size_t GetBytesForFpRate(Index num_slots, double desired_fp_rate,
+                                  uint32_t rounding_bias32) {
+    return InternalGetBytesForFpRate(num_slots, desired_fp_rate,
+                                     1.0 / desired_fp_rate, rounding_bias32);
+  }
+
+  // The same, but specifying desired accuracy as 1.0 / FP rate, or
+  // one_in_fp_rate. E.g. desired_one_in_fp_rate=100 means 1% FP rate.
+  static size_t GetBytesForOneInFpRate(Index num_slots,
+                                       double desired_one_in_fp_rate,
+                                       uint32_t rounding_bias32) {
+    return InternalGetBytesForFpRate(num_slots, 1.0 / desired_one_in_fp_rate,
+                                     desired_one_in_fp_rate, rounding_bias32);
+  }
+
+ protected:
+  static size_t InternalGetBytesForFpRate(Index num_slots,
+                                          double desired_fp_rate,
+                                          double desired_one_in_fp_rate,
+                                          uint32_t rounding_bias32) {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts) {
+      if (num_slots == 0) {
+        // Unusual. Zero starts presumes no keys added -> always false (no FPs)
+        return 0U;
+      }
+    } else {
+      assert(num_slots > 0);
+    }
+    // Must be rounded up already.
+    assert(RoundUpNumSlots(num_slots) == num_slots);
+
+    if (desired_one_in_fp_rate > 1.0 && desired_fp_rate < 1.0) {
+      // Typical: less than 100% FP rate
+      if (desired_one_in_fp_rate <= static_cast<ResultRow>(-1)) {
+        // Typical: Less than maximum result row entropy
+        ResultRow rounded = static_cast<ResultRow>(desired_one_in_fp_rate);
+        int lower_columns = FloorLog2(rounded);
+        double lower_columns_fp_rate = std::pow(2.0, -lower_columns);
+        double upper_columns_fp_rate = std::pow(2.0, -(lower_columns + 1));
+        // Floating point don't let me down!
+        assert(lower_columns_fp_rate >= desired_fp_rate);
+        assert(upper_columns_fp_rate <= desired_fp_rate);
+
+        double lower_portion = (desired_fp_rate - upper_columns_fp_rate) /
+                               (lower_columns_fp_rate - upper_columns_fp_rate);
+        // Floating point don't let me down!
+        assert(lower_portion >= 0.0);
+        assert(lower_portion <= 1.0);
+
+        double rounding_bias = (rounding_bias32 + 0.5) / double{0x100000000};
+        assert(rounding_bias > 0.0);
+        assert(rounding_bias < 1.0);
+
+        // Note: Ignoring smash setting; still close enough in that case
+        Index num_starts = num_slots - kCoeffBits + 1;
+        // Lower upper_start_block means lower FP rate (higher accuracy)
+        Index upper_start_block = static_cast<Index>(
+            (lower_portion * num_starts + rounding_bias) / kCoeffBits);
+        Index num_blocks = num_slots / kCoeffBits;
+        assert(upper_start_block < num_blocks);
+
+        // Start by assuming all blocks use lower number of columns
+        Index num_segments = num_blocks * static_cast<Index>(lower_columns);
+        // Correct by 1 each for blocks using upper number of columns
+        num_segments += (num_blocks - upper_start_block);
+        // Total bytes
+        return num_segments * sizeof(CoeffRow);
+      } else {
+        // one_in_fp_rate too big, thus requested FP rate is smaller than
+        // supported. Use max number of columns for minimum supported FP rate.
+        return num_slots * sizeof(ResultRow);
+      }
+    } else {
+      // Effectively asking for 100% FP rate, or NaN etc.
+      if (TypesAndSettings::kAllowZeroStarts) {
+        // Zero segments
+        return 0U;
+      } else {
+        // One segment (minimum size, maximizing FP rate)
+        return sizeof(CoeffRow);
+      }
+    }
+  }
+
+  void InternalConfigure() {
+    const Index num_blocks = GetNumBlocks();
+    Index num_segments = GetNumSegments();
+
+    if (num_blocks == 0) {
+      // Exceptional
+      upper_num_columns_ = 0;
+      upper_start_block_ = 0;
+    } else {
+      // Normal
+      upper_num_columns_ =
+          (num_segments + /*round up*/ num_blocks - 1) / num_blocks;
+      upper_start_block_ = upper_num_columns_ * num_blocks - num_segments;
+      // Unless that's more columns than supported by ResultRow data type
+      if (upper_num_columns_ > 8U * sizeof(ResultRow)) {
+        // Use maximum columns (there will be space unused)
+        upper_num_columns_ = static_cast<Index>(8U * sizeof(ResultRow));
+        upper_start_block_ = 0;
+        num_segments = num_blocks * upper_num_columns_;
+      }
+    }
+    // Update data_len_ for correct rounding and/or unused space
+    // NOTE: unused space stays gone if we PrepareForNumStarts again.
+    // We are prioritizing minimizing the number of fields over making
+    // the "unusued space" feature work well.
+    data_len_ = num_segments * sizeof(CoeffRow);
+  }
+
+  char* const data_;
+  size_t data_len_;
+  Index num_starts_ = 0;
+  Index upper_num_columns_ = 0;
+  Index upper_start_block_ = 0;
+};
+
+}  // namespace ribbon
+
+}  // namespace ROCKSDB_NAMESPACE
+
+// For convenience working with templates
+#define IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings)                            \
+  using Hasher = ROCKSDB_NAMESPACE::ribbon::StandardHasher<TypesAndSettings>; \
+  using Banding =                                                             \
+      ROCKSDB_NAMESPACE::ribbon::StandardBanding<TypesAndSettings>;           \
+  using SimpleSoln =                                                          \
+      ROCKSDB_NAMESPACE::ribbon::InMemSimpleSolution<TypesAndSettings>;       \
+  using InterleavedSoln =                                                     \
+      ROCKSDB_NAMESPACE::ribbon::SerializableInterleavedSolution<             \
+          TypesAndSettings>;                                                  \
+  static_assert(sizeof(Hasher) + sizeof(Banding) + sizeof(SimpleSoln) +       \
+                        sizeof(InterleavedSoln) >                             \
+                    0,                                                        \
+                "avoid unused warnings, semicolon expected after macro call")
diff --git a/src/rocksdb/util/ribbon_test.cc b/src/rocksdb/util/ribbon_test.cc
new file mode 100644
index 000000000..6519df3d5
--- /dev/null
+++ b/src/rocksdb/util/ribbon_test.cc
@@ -0,0 +1,1308 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+#include "util/bloom_impl.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/ribbon_config.h"
+#include "util/ribbon_impl.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+
+#ifndef GFLAGS
+uint32_t FLAGS_thoroughness = 5;
+uint32_t FLAGS_max_add = 0;
+uint32_t FLAGS_min_check = 4000;
+uint32_t FLAGS_max_check = 100000;
+bool FLAGS_verbose = false;
+
+bool FLAGS_find_occ = false;
+bool FLAGS_find_slot_occ = false;
+double FLAGS_find_next_factor = 1.618;
+uint32_t FLAGS_find_iters = 10000;
+uint32_t FLAGS_find_min_slots = 128;
+uint32_t FLAGS_find_max_slots = 1000000;
+
+bool FLAGS_optimize_homog = false;
+uint32_t FLAGS_optimize_homog_slots = 30000000;
+uint32_t FLAGS_optimize_homog_check = 200000;
+double FLAGS_optimize_homog_granularity = 0.002;
+#else
+#include "util/gflags_compat.h"
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+// Using 500 is a good test when you have time to be thorough.
+// Default is for general RocksDB regression test runs.
+DEFINE_uint32(thoroughness, 5, "iterations per configuration");
+DEFINE_uint32(max_add, 0,
+              "Add up to this number of entries to a single filter in "
+              "CompactnessAndBacktrackAndFpRate; 0 == reasonable default");
+DEFINE_uint32(min_check, 4000,
+              "Minimum number of novel entries for testing FP rate");
+DEFINE_uint32(max_check, 10000,
+              "Maximum number of novel entries for testing FP rate");
+DEFINE_bool(verbose, false, "Print extra details");
+
+// Options for FindOccupancy, which is more of a tool than a test.
+DEFINE_bool(find_occ, false, "whether to run the FindOccupancy tool");
+DEFINE_bool(find_slot_occ, false,
+            "whether to show individual slot occupancies with "
+            "FindOccupancy tool");
+DEFINE_double(find_next_factor, 1.618,
+              "factor to next num_slots for FindOccupancy");
+DEFINE_uint32(find_iters, 10000, "number of samples for FindOccupancy");
+DEFINE_uint32(find_min_slots, 128, "number of slots for FindOccupancy");
+DEFINE_uint32(find_max_slots, 1000000, "number of slots for FindOccupancy");
+
+// Options for OptimizeHomogAtScale, which is more of a tool than a test.
+DEFINE_bool(optimize_homog, false,
+            "whether to run the OptimizeHomogAtScale tool");
+DEFINE_uint32(optimize_homog_slots, 30000000,
+              "number of slots for OptimizeHomogAtScale");
+DEFINE_uint32(optimize_homog_check, 200000,
+              "number of queries for checking FP rate in OptimizeHomogAtScale");
+DEFINE_double(
+    optimize_homog_granularity, 0.002,
+    "overhead change between FP rate checking in OptimizeHomogAtScale");
+
+#endif  // GFLAGS
+
+template <typename TypesAndSettings>
+class RibbonTypeParamTest : public ::testing::Test {};
+
+class RibbonTest : public ::testing::Test {};
+
+namespace {
+
+// Different ways of generating keys for testing
+
+// Generate semi-sequential keys
+struct StandardKeyGen {
+  StandardKeyGen(const std::string& prefix, uint64_t id)
+      : id_(id), str_(prefix) {
+    ROCKSDB_NAMESPACE::PutFixed64(&str_, /*placeholder*/ 0);
+  }
+
+  // Prefix (only one required)
+  StandardKeyGen& operator++() {
+    ++id_;
+    return *this;
+  }
+
+  StandardKeyGen& operator+=(uint64_t i) {
+    id_ += i;
+    return *this;
+  }
+
+  const std::string& operator*() {
+    // Use multiplication to mix things up a little in the key
+    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
+                                     id_ * uint64_t{0x1500000001});
+    return str_;
+  }
+
+  bool operator==(const StandardKeyGen& other) {
+    // Same prefix is assumed
+    return id_ == other.id_;
+  }
+  bool operator!=(const StandardKeyGen& other) {
+    // Same prefix is assumed
+    return id_ != other.id_;
+  }
+
+  uint64_t id_;
+  std::string str_;
+};
+
+// Generate small sequential keys, that can misbehave with sequential seeds
+// as in https://github.com/Cyan4973/xxHash/issues/469.
+// These keys are only heuristically unique, but that's OK with 64 bits,
+// for testing purposes.
+struct SmallKeyGen {
+  SmallKeyGen(const std::string& prefix, uint64_t id) : id_(id) {
+    // Hash the prefix for a heuristically unique offset
+    id_ += ROCKSDB_NAMESPACE::GetSliceHash64(prefix);
+    ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
+  }
+
+  // Prefix (only one required)
+  SmallKeyGen& operator++() {
+    ++id_;
+    return *this;
+  }
+
+  SmallKeyGen& operator+=(uint64_t i) {
+    id_ += i;
+    return *this;
+  }
+
+  const std::string& operator*() {
+    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], id_);
+    return str_;
+  }
+
+  bool operator==(const SmallKeyGen& other) { return id_ == other.id_; }
+  bool operator!=(const SmallKeyGen& other) { return id_ != other.id_; }
+
+  uint64_t id_;
+  std::string str_;
+};
+
+template <typename KeyGen>
+struct Hash32KeyGenWrapper : public KeyGen {
+  Hash32KeyGenWrapper(const std::string& prefix, uint64_t id)
+      : KeyGen(prefix, id) {}
+  uint32_t operator*() {
+    auto& key = *static_cast<KeyGen&>(*this);
+    // unseeded
+    return ROCKSDB_NAMESPACE::GetSliceHash(key);
+  }
+};
+
+template <typename KeyGen>
+struct Hash64KeyGenWrapper : public KeyGen {
+  Hash64KeyGenWrapper(const std::string& prefix, uint64_t id)
+      : KeyGen(prefix, id) {}
+  uint64_t operator*() {
+    auto& key = *static_cast<KeyGen&>(*this);
+    // unseeded
+    return ROCKSDB_NAMESPACE::GetSliceHash64(key);
+  }
+};
+
+using ROCKSDB_NAMESPACE::ribbon::ConstructionFailureChance;
+
+const std::vector<ConstructionFailureChance> kFailureOnly50Pct = {
+    ROCKSDB_NAMESPACE::ribbon::kOneIn2};
+
+const std::vector<ConstructionFailureChance> kFailureOnlyRare = {
+    ROCKSDB_NAMESPACE::ribbon::kOneIn1000};
+
+const std::vector<ConstructionFailureChance> kFailureAll = {
+    ROCKSDB_NAMESPACE::ribbon::kOneIn2, ROCKSDB_NAMESPACE::ribbon::kOneIn20,
+    ROCKSDB_NAMESPACE::ribbon::kOneIn1000};
+
+}  // namespace
+
+using ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate;
+using ROCKSDB_NAMESPACE::ribbon::StandardHasher;
+using ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter;
+
+struct DefaultTypesAndSettings {
+  using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
+  using ResultRow = uint8_t;
+  using Index = uint32_t;
+  using Hash = uint64_t;
+  using Seed = uint32_t;
+  using Key = ROCKSDB_NAMESPACE::Slice;
+  static constexpr bool kIsFilter = true;
+  static constexpr bool kHomogeneous = false;
+  static constexpr bool kFirstCoeffAlwaysOne = true;
+  static constexpr bool kUseSmash = false;
+  static constexpr bool kAllowZeroStarts = false;
+  static Hash HashFn(const Key& key, uint64_t raw_seed) {
+    // This version 0.7.2 preview of XXH3 (a.k.a. XXPH3) function does
+    // not pass SmallKeyGen tests below without some seed premixing from
+    // StandardHasher. See https://github.com/Cyan4973/xxHash/issues/469
+    return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed);
+  }
+  // For testing
+  using KeyGen = StandardKeyGen;
+  static const std::vector<ConstructionFailureChance>& FailureChanceToTest() {
+    return kFailureAll;
+  }
+};
+
+using TypesAndSettings_Coeff128 = DefaultTypesAndSettings;
+struct TypesAndSettings_Coeff128Smash : public DefaultTypesAndSettings {
+  static constexpr bool kUseSmash = true;
+};
+struct TypesAndSettings_Coeff64 : public DefaultTypesAndSettings {
+  using CoeffRow = uint64_t;
+};
+struct TypesAndSettings_Coeff64Smash : public TypesAndSettings_Coeff64 {
+  static constexpr bool kUseSmash = true;
+};
+struct TypesAndSettings_Coeff64Smash0 : public TypesAndSettings_Coeff64Smash {
+  static constexpr bool kFirstCoeffAlwaysOne = false;
+};
+
+// Homogeneous Ribbon configurations
+struct TypesAndSettings_Coeff128_Homog : public DefaultTypesAndSettings {
+  static constexpr bool kHomogeneous = true;
+  // Since our best construction success setting still has 1/1000 failure
+  // rate, the best FP rate we test is 1/256
+  using ResultRow = uint8_t;
+  // Homogeneous only makes sense with sufficient slots for equivalent of
+  // almost sure construction success
+  static const std::vector<ConstructionFailureChance>& FailureChanceToTest() {
+    return kFailureOnlyRare;
+  }
+};
+struct TypesAndSettings_Coeff128Smash_Homog
+    : public TypesAndSettings_Coeff128_Homog {
+  // Smash (extra time to save space) + Homog (extra space to save time)
+  // doesn't make much sense in practice, but we minimally test it
+  static constexpr bool kUseSmash = true;
+};
+struct TypesAndSettings_Coeff64_Homog : public TypesAndSettings_Coeff128_Homog {
+  using CoeffRow = uint64_t;
+};
+struct TypesAndSettings_Coeff64Smash_Homog
+    : public TypesAndSettings_Coeff64_Homog {
+  // Smash (extra time to save space) + Homog (extra space to save time)
+  // doesn't make much sense in practice, but we minimally test it
+  static constexpr bool kUseSmash = true;
+};
+
+// Less exhaustive mix of coverage, but still covering the most stressful case
+// (only 50% construction success)
+struct AbridgedTypesAndSettings : public DefaultTypesAndSettings {
+  static const std::vector<ConstructionFailureChance>& FailureChanceToTest() {
+    return kFailureOnly50Pct;
+  }
+};
+struct TypesAndSettings_Result16 : public AbridgedTypesAndSettings {
+  using ResultRow = uint16_t;
+};
+struct TypesAndSettings_Result32 : public AbridgedTypesAndSettings {
+  using ResultRow = uint32_t;
+};
+struct TypesAndSettings_IndexSizeT : public AbridgedTypesAndSettings {
+  using Index = size_t;
+};
+struct TypesAndSettings_Hash32 : public AbridgedTypesAndSettings {
+  using Hash = uint32_t;
+  static Hash HashFn(const Key& key, Hash raw_seed) {
+    // This MurmurHash1 function does not pass tests below without the
+    // seed premixing from StandardHasher. In fact, it needs more than
+    // just a multiplication mixer on the ordinal seed.
+    return ROCKSDB_NAMESPACE::Hash(key.data(), key.size(), raw_seed);
+  }
+};
+struct TypesAndSettings_Hash32_Result16 : public AbridgedTypesAndSettings {
+  using ResultRow = uint16_t;
+};
+struct TypesAndSettings_KeyString : public AbridgedTypesAndSettings {
+  using Key = std::string;
+};
+struct TypesAndSettings_Seed8 : public AbridgedTypesAndSettings {
+  // This is not a generally recommended configuration. With the configured
+  // hash function, it would fail with SmallKeyGen due to insufficient
+  // independence among the seeds.
+  using Seed = uint8_t;
+};
+struct TypesAndSettings_NoAlwaysOne : public AbridgedTypesAndSettings {
+  static constexpr bool kFirstCoeffAlwaysOne = false;
+};
+struct TypesAndSettings_AllowZeroStarts : public AbridgedTypesAndSettings {
+  static constexpr bool kAllowZeroStarts = true;
+};
+struct TypesAndSettings_Seed64 : public AbridgedTypesAndSettings {
+  using Seed = uint64_t;
+};
+struct TypesAndSettings_Rehasher
+    : public StandardRehasherAdapter<AbridgedTypesAndSettings> {
+  using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
+};
+struct TypesAndSettings_Rehasher_Result16 : public TypesAndSettings_Rehasher {
+  using ResultRow = uint16_t;
+};
+struct TypesAndSettings_Rehasher_Result32 : public TypesAndSettings_Rehasher {
+  using ResultRow = uint32_t;
+};
+struct TypesAndSettings_Rehasher_Seed64
+    : public StandardRehasherAdapter<TypesAndSettings_Seed64> {
+  using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
+  // Note: 64-bit seed with Rehasher gives slightly better average reseeds
+};
+struct TypesAndSettings_Rehasher32
+    : public StandardRehasherAdapter<TypesAndSettings_Hash32> {
+  using KeyGen = Hash32KeyGenWrapper<StandardKeyGen>;
+};
+struct TypesAndSettings_Rehasher32_Coeff64
+    : public TypesAndSettings_Rehasher32 {
+  using CoeffRow = uint64_t;
+};
+struct TypesAndSettings_SmallKeyGen : public AbridgedTypesAndSettings {
+  // SmallKeyGen stresses the independence of different hash seeds
+  using KeyGen = SmallKeyGen;
+};
+struct TypesAndSettings_Hash32_SmallKeyGen : public TypesAndSettings_Hash32 {
+  // SmallKeyGen stresses the independence of different hash seeds
+  using KeyGen = SmallKeyGen;
+};
+struct TypesAndSettings_Coeff32 : public DefaultTypesAndSettings {
+  using CoeffRow = uint32_t;
+};
+struct TypesAndSettings_Coeff32Smash : public TypesAndSettings_Coeff32 {
+  static constexpr bool kUseSmash = true;
+};
+struct TypesAndSettings_Coeff16 : public DefaultTypesAndSettings {
+  using CoeffRow = uint16_t;
+};
+struct TypesAndSettings_Coeff16Smash : public TypesAndSettings_Coeff16 {
+  static constexpr bool kUseSmash = true;
+};
+
+using TestTypesAndSettings = ::testing::Types<
+    TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash,
+    TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash,
+    TypesAndSettings_Coeff64Smash0, TypesAndSettings_Coeff128_Homog,
+    TypesAndSettings_Coeff128Smash_Homog, TypesAndSettings_Coeff64_Homog,
+    TypesAndSettings_Coeff64Smash_Homog, TypesAndSettings_Result16,
+    TypesAndSettings_Result32, TypesAndSettings_IndexSizeT,
+    TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16,
+    TypesAndSettings_KeyString, TypesAndSettings_Seed8,
+    TypesAndSettings_NoAlwaysOne, TypesAndSettings_AllowZeroStarts,
+    TypesAndSettings_Seed64, TypesAndSettings_Rehasher,
+    TypesAndSettings_Rehasher_Result16, TypesAndSettings_Rehasher_Result32,
+    TypesAndSettings_Rehasher_Seed64, TypesAndSettings_Rehasher32,
+    TypesAndSettings_Rehasher32_Coeff64, TypesAndSettings_SmallKeyGen,
+    TypesAndSettings_Hash32_SmallKeyGen, TypesAndSettings_Coeff32,
+    TypesAndSettings_Coeff32Smash, TypesAndSettings_Coeff16,
+    TypesAndSettings_Coeff16Smash>;
+TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings);
+
+namespace {
+
+// For testing Poisson-distributed (or similar) statistics, get value for
+// `stddevs_allowed` standard deviations above expected mean
+// `expected_count`.
+// (Poisson approximates Binomial only if probability of a trial being
+// in the count is low.)
+uint64_t PoissonUpperBound(double expected_count, double stddevs_allowed) {
+  return static_cast<uint64_t>(
+      expected_count + stddevs_allowed * std::sqrt(expected_count) + 1.0);
+}
+
+uint64_t PoissonLowerBound(double expected_count, double stddevs_allowed) {
+  return static_cast<uint64_t>(std::max(
+      0.0, expected_count - stddevs_allowed * std::sqrt(expected_count)));
+}
+
+uint64_t FrequentPoissonUpperBound(double expected_count) {
+  // Allow up to 5.0 standard deviations for frequently checked statistics
+  return PoissonUpperBound(expected_count, 5.0);
+}
+
+uint64_t FrequentPoissonLowerBound(double expected_count) {
+  return PoissonLowerBound(expected_count, 5.0);
+}
+
+uint64_t InfrequentPoissonUpperBound(double expected_count) {
+  // Allow up to 3 standard deviations for infrequently checked statistics
+  return PoissonUpperBound(expected_count, 3.0);
+}
+
+uint64_t InfrequentPoissonLowerBound(double expected_count) {
+  return PoissonLowerBound(expected_count, 3.0);
+}
+
+}  // namespace
+
+TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
+  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
+  using KeyGen = typename TypeParam::KeyGen;
+  using ConfigHelper =
+      ROCKSDB_NAMESPACE::ribbon::BandingConfigHelper<TypeParam>;
+
+  if (sizeof(CoeffRow) < 8) {
+    ROCKSDB_GTEST_BYPASS("Not fully supported");
+    return;
+  }
+
+  const auto log2_thoroughness =
+      static_cast<uint32_t>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));
+
+  // We are going to choose num_to_add using an exponential distribution,
+  // so that we have good representation of small-to-medium filters.
+  // Here we just pick some reasonable, practical upper bound based on
+  // kCoeffBits or option.
+  const double log_max_add = std::log(
+      FLAGS_max_add > 0 ? FLAGS_max_add
+                        : static_cast<uint32_t>(kCoeffBits * kCoeffBits) *
+                              std::max(FLAGS_thoroughness, uint32_t{32}));
+
+  // This needs to be enough below the minimum number of slots to get a
+  // reasonable number of samples with the minimum number of slots.
+  const double log_min_add = std::log(0.66 * SimpleSoln::RoundUpNumSlots(1));
+
+  ASSERT_GT(log_max_add, log_min_add);
+
+  const double diff_log_add = log_max_add - log_min_add;
+
+  for (ConstructionFailureChance cs : TypeParam::FailureChanceToTest()) {
+    double expected_reseeds;
+    switch (cs) {
+      default:
+        assert(false);
+        FALLTHROUGH_INTENDED;
+      case ROCKSDB_NAMESPACE::ribbon::kOneIn2:
+        fprintf(stderr, "== Failure: 50 percent\n");
+        expected_reseeds = 1.0;
+        break;
+      case ROCKSDB_NAMESPACE::ribbon::kOneIn20:
+        fprintf(stderr, "== Failure: 95 percent\n");
+        expected_reseeds = 0.053;
+        break;
+      case ROCKSDB_NAMESPACE::ribbon::kOneIn1000:
+        fprintf(stderr, "== Failure: 1/1000\n");
+        expected_reseeds = 0.001;
+        break;
+    }
+
+    uint64_t total_reseeds = 0;
+    uint64_t total_singles = 0;
+    uint64_t total_single_failures = 0;
+    uint64_t total_batch = 0;
+    uint64_t total_batch_successes = 0;
+    uint64_t total_fp_count = 0;
+    uint64_t total_added = 0;
+    uint64_t total_expand_trials = 0;
+    uint64_t total_expand_failures = 0;
+    double total_expand_overhead = 0.0;
+
+    uint64_t soln_query_nanos = 0;
+    uint64_t soln_query_count = 0;
+    uint64_t bloom_query_nanos = 0;
+    uint64_t isoln_query_nanos = 0;
+    uint64_t isoln_query_count = 0;
+
+    // Take different samples if you change thoroughness
+    ROCKSDB_NAMESPACE::Random32 rnd(FLAGS_thoroughness);
+
+    for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) {
+      // We are going to choose num_to_add using an exponential distribution
+      // as noted above, but instead of randomly choosing them, we generate
+      // samples linearly using the golden ratio, which ensures a nice spread
+      // even for a small number of samples, and starting with the minimum
+      // number of slots to ensure it is tested.
+      double log_add =
+          std::fmod(0.6180339887498948482 * diff_log_add * i, diff_log_add) +
+          log_min_add;
+      uint32_t num_to_add = static_cast<uint32_t>(std::exp(log_add));
+
+      // Most of the time, test the Interleaved solution storage, but when
+      // we do we have to make num_slots a multiple of kCoeffBits. So
+      // sometimes we want to test without that limitation.
+      bool test_interleaved = (i % 7) != 6;
+
+      // Compute num_slots, and re-adjust num_to_add to get as close as possible
+      // to next num_slots, to stress that num_slots in terms of construction
+      // success. Ensure at least one iteration:
+      Index num_slots = Index{0} - 1;
+      --num_to_add;
+      for (;;) {
+        Index next_num_slots = SimpleSoln::RoundUpNumSlots(
+            ConfigHelper::GetNumSlots(num_to_add + 1, cs));
+        if (test_interleaved) {
+          next_num_slots = InterleavedSoln::RoundUpNumSlots(next_num_slots);
+          // assert idempotent
+          EXPECT_EQ(next_num_slots,
+                    InterleavedSoln::RoundUpNumSlots(next_num_slots));
+        }
+        // assert idempotent with InterleavedSoln::RoundUpNumSlots
+        EXPECT_EQ(next_num_slots, SimpleSoln::RoundUpNumSlots(next_num_slots));
+
+        if (next_num_slots > num_slots) {
+          break;
+        }
+        num_slots = next_num_slots;
+        ++num_to_add;
+      }
+      assert(num_slots < Index{0} - 1);
+
+      total_added += num_to_add;
+
+      std::string prefix;
+      ROCKSDB_NAMESPACE::PutFixed32(&prefix, rnd.Next());
+
+      // Batch that must be added
+      std::string added_str = prefix + "added";
+      KeyGen keys_begin(added_str, 0);
+      KeyGen keys_end(added_str, num_to_add);
+
+      // A couple more that will probably be added
+      KeyGen one_more(prefix + "more", 1);
+      KeyGen two_more(prefix + "more", 2);
+
+      // Batch that may or may not be added
+      uint32_t batch_size =
+          static_cast<uint32_t>(2.0 * std::sqrt(num_slots - num_to_add));
+      if (batch_size < 10U) {
+        batch_size = 0;
+      }
+      std::string batch_str = prefix + "batch";
+      KeyGen batch_begin(batch_str, 0);
+      KeyGen batch_end(batch_str, batch_size);
+
+      // Batch never (successfully) added, but used for querying FP rate
+      std::string not_str = prefix + "not";
+      KeyGen other_keys_begin(not_str, 0);
+      KeyGen other_keys_end(not_str, FLAGS_max_check);
+
+      double overhead_ratio = 1.0 * num_slots / num_to_add;
+      if (FLAGS_verbose) {
+        fprintf(stderr, "Adding(%s) %u / %u   Overhead: %g   Batch size: %u\n",
+                test_interleaved ? "i" : "s", (unsigned)num_to_add,
+                (unsigned)num_slots, overhead_ratio, (unsigned)batch_size);
+      }
+
+      // Vary bytes for InterleavedSoln to use number of solution columns
+      // from 0 to max allowed by ResultRow type (and used by SimpleSoln).
+      // Specifically include 0 and max, and otherwise skew toward max.
+      uint32_t max_ibytes =
+          static_cast<uint32_t>(sizeof(ResultRow) * num_slots);
+      size_t ibytes;
+      if (i == 0) {
+        ibytes = 0;
+      } else if (i == 1) {
+        ibytes = max_ibytes;
+      } else {
+        // Skewed
+        ibytes =
+            std::max(rnd.Uniformish(max_ibytes), rnd.Uniformish(max_ibytes));
+      }
+      std::unique_ptr<char[]> idata(new char[ibytes]);
+      InterleavedSoln isoln(idata.get(), ibytes);
+
+      SimpleSoln soln;
+      Hasher hasher;
+      bool first_single;
+      bool second_single;
+      bool batch_success;
+      {
+        Banding banding;
+        // Traditional solve for a fixed set.
+        ASSERT_TRUE(
+            banding.ResetAndFindSeedToSolve(num_slots, keys_begin, keys_end));
+
+        Index occupied_count = banding.GetOccupiedCount();
+        Index more_added = 0;
+
+        if (TypeParam::kHomogeneous || overhead_ratio < 1.01 ||
+            batch_size == 0) {
+          // Homogeneous not compatible with backtracking because add
+          // doesn't fail. Small overhead ratio too packed to expect more
+          first_single = false;
+          second_single = false;
+          batch_success = false;
+        } else {
+          // Now to test backtracking, starting with guaranteed fail. By using
+          // the keys that will be used to test FP rate, we are then doing an
+          // extra check that after backtracking there are no remnants (e.g. in
+          // result side of banding) of these entries.
+          KeyGen other_keys_too_big_end = other_keys_begin;
+          other_keys_too_big_end += num_to_add;
+          banding.EnsureBacktrackSize(std::max(num_to_add, batch_size));
+          EXPECT_FALSE(banding.AddRangeOrRollBack(other_keys_begin,
+                                                  other_keys_too_big_end));
+          EXPECT_EQ(occupied_count, banding.GetOccupiedCount());
+
+          // Check that we still have a good chance of adding a couple more
+          // individually
+          first_single = banding.Add(*one_more);
+          second_single = banding.Add(*two_more);
+          more_added += (first_single ? 1 : 0) + (second_single ? 1 : 0);
+          total_singles += 2U;
+          total_single_failures += 2U - more_added;
+
+          // Or as a batch
+          batch_success = banding.AddRangeOrRollBack(batch_begin, batch_end);
+          ++total_batch;
+          if (batch_success) {
+            more_added += batch_size;
+            ++total_batch_successes;
+          }
+          EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
+        }
+
+        // Also verify that redundant adds are OK (no effect)
+        ASSERT_TRUE(
+            banding.AddRange(keys_begin, KeyGen(added_str, num_to_add / 8)));
+        EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
+
+        // Now back-substitution
+        soln.BackSubstFrom(banding);
+        if (test_interleaved) {
+          isoln.BackSubstFrom(banding);
+        }
+
+        Seed reseeds = banding.GetOrdinalSeed();
+        total_reseeds += reseeds;
+
+        EXPECT_LE(reseeds, 8 + log2_thoroughness);
+        if (reseeds > log2_thoroughness + 1) {
+          fprintf(
+              stderr, "%s high reseeds at %u, %u/%u: %u\n",
+              reseeds > log2_thoroughness + 8 ? "ERROR Extremely" : "Somewhat",
+              static_cast<unsigned>(i), static_cast<unsigned>(num_to_add),
+              static_cast<unsigned>(num_slots), static_cast<unsigned>(reseeds));
+        }
+
+        if (reseeds > 0) {
+          // "Expand" test: given a failed construction, how likely is it to
+          // pass with same seed and more slots. At each step, we increase
+          // enough to ensure there is at least one shift within each coeff
+          // block.
+          ++total_expand_trials;
+          Index expand_count = 0;
+          Index ex_slots = num_slots;
+          banding.SetOrdinalSeed(0);
+          for (;; ++expand_count) {
+            ASSERT_LE(expand_count, log2_thoroughness);
+            ex_slots += ex_slots / kCoeffBits;
+            if (test_interleaved) {
+              ex_slots = InterleavedSoln::RoundUpNumSlots(ex_slots);
+            }
+            banding.Reset(ex_slots);
+            bool success = banding.AddRange(keys_begin, keys_end);
+            if (success) {
+              break;
+            }
+          }
+          total_expand_failures += expand_count;
+          total_expand_overhead += 1.0 * (ex_slots - num_slots) / num_slots;
+        }
+
+        hasher.SetOrdinalSeed(reseeds);
+      }
+      // soln and hasher now independent of Banding object
+
+      // Verify keys added
+      KeyGen cur = keys_begin;
+      while (cur != keys_end) {
+        ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
+        ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
+        ++cur;
+      }
+      // We (maybe) snuck these in!
+      if (first_single) {
+        ASSERT_TRUE(soln.FilterQuery(*one_more, hasher));
+        ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*one_more, hasher));
+      }
+      if (second_single) {
+        ASSERT_TRUE(soln.FilterQuery(*two_more, hasher));
+        ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*two_more, hasher));
+      }
+      if (batch_success) {
+        cur = batch_begin;
+        while (cur != batch_end) {
+          ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
+          ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
+          ++cur;
+        }
+      }
+
+      // Check FP rate (depends only on number of result bits == solution
+      // columns)
+      Index fp_count = 0;
+      cur = other_keys_begin;
+      {
+        ROCKSDB_NAMESPACE::StopWatchNano timer(
+            ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
+        while (cur != other_keys_end) {
+          bool fp = soln.FilterQuery(*cur, hasher);
+          fp_count += fp ? 1 : 0;
+          ++cur;
+        }
+        soln_query_nanos += timer.ElapsedNanos();
+        soln_query_count += FLAGS_max_check;
+      }
+      {
+        double expected_fp_count = soln.ExpectedFpRate() * FLAGS_max_check;
+        // For expected FP rate, also include false positives due to collisions
+        // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
+        double correction =
+            FLAGS_max_check * ExpectedCollisionFpRate(hasher, num_to_add);
+
+        // NOTE: rare violations expected with kHomogeneous
+        EXPECT_LE(fp_count,
+                  FrequentPoissonUpperBound(expected_fp_count + correction));
+        EXPECT_GE(fp_count,
+                  FrequentPoissonLowerBound(expected_fp_count + correction));
+      }
+      total_fp_count += fp_count;
+
+      // And also check FP rate for isoln
+      if (test_interleaved) {
+        Index ifp_count = 0;
+        cur = other_keys_begin;
+        ROCKSDB_NAMESPACE::StopWatchNano timer(
+            ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
+        while (cur != other_keys_end) {
+          ifp_count += isoln.FilterQuery(*cur, hasher) ? 1 : 0;
+          ++cur;
+        }
+        isoln_query_nanos += timer.ElapsedNanos();
+        isoln_query_count += FLAGS_max_check;
+        {
+          double expected_fp_count = isoln.ExpectedFpRate() * FLAGS_max_check;
+          // For expected FP rate, also include false positives due to
+          // collisions in Hash value. (Negligible for 64-bit, can matter for
+          // 32-bit.)
+          double correction =
+              FLAGS_max_check * ExpectedCollisionFpRate(hasher, num_to_add);
+
+          // NOTE: rare violations expected with kHomogeneous
+          EXPECT_LE(ifp_count,
+                    FrequentPoissonUpperBound(expected_fp_count + correction));
+
+          // FIXME: why sometimes can we slightly "beat the odds"?
+          // (0.95 factor should not be needed)
+          EXPECT_GE(ifp_count, FrequentPoissonLowerBound(
+                                   0.95 * expected_fp_count + correction));
+        }
+        // Since the bits used in isoln are a subset of the bits used in soln,
+        // it cannot have fewer FPs
+        EXPECT_GE(ifp_count, fp_count);
+      }
+
+      // And compare to Bloom time, for fun
+      if (ibytes >= /* minimum Bloom impl bytes*/ 64) {
+        Index bfp_count = 0;
+        cur = other_keys_begin;
+        ROCKSDB_NAMESPACE::StopWatchNano timer(
+            ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
+        while (cur != other_keys_end) {
+          uint64_t h = hasher.GetHash(*cur);
+          uint32_t h1 = ROCKSDB_NAMESPACE::Lower32of64(h);
+          uint32_t h2 = sizeof(Hash) >= 8 ? ROCKSDB_NAMESPACE::Upper32of64(h)
+                                          : h1 * 0x9e3779b9;
+          bfp_count +=
+              ROCKSDB_NAMESPACE::FastLocalBloomImpl::HashMayMatch(
+                  h1, h2, static_cast<uint32_t>(ibytes), 6, idata.get())
+                  ? 1
+                  : 0;
+          ++cur;
+        }
+        bloom_query_nanos += timer.ElapsedNanos();
+        // ensure bfp_count is used
+        ASSERT_LT(bfp_count, FLAGS_max_check);
+      }
+    }
+
+    // "outside" == key not in original set so either negative or false positive
+    fprintf(stderr,
+            "Simple      outside query, hot, incl hashing, ns/key: %g\n",
+            1.0 * soln_query_nanos / soln_query_count);
+    fprintf(stderr,
+            "Interleaved outside query, hot, incl hashing, ns/key: %g\n",
+            1.0 * isoln_query_nanos / isoln_query_count);
+    fprintf(stderr,
+            "Bloom       outside query, hot, incl hashing, ns/key: %g\n",
+            1.0 * bloom_query_nanos / soln_query_count);
+
+    if (TypeParam::kHomogeneous) {
+      EXPECT_EQ(total_reseeds, 0U);
+    } else {
+      double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness;
+      fprintf(stderr, "Average re-seeds: %g\n", average_reseeds);
+      // Values above were chosen to target around 50% chance of encoding
+      // success rate (average of 1.0 re-seeds) or slightly better. But 1.15 is
+      // also close enough.
+      EXPECT_LE(total_reseeds,
+                InfrequentPoissonUpperBound(1.15 * expected_reseeds *
+                                            FLAGS_thoroughness));
+      // Would use 0.85 here instead of 0.75, but
+      // TypesAndSettings_Hash32_SmallKeyGen can "beat the odds" because of
+      // sequential keys with a small, cheap hash function. We accept that
+      // there are surely inputs that are somewhat bad for this setup, but
+      // these somewhat good inputs are probably more likely.
+      EXPECT_GE(total_reseeds,
+                InfrequentPoissonLowerBound(0.75 * expected_reseeds *
+                                            FLAGS_thoroughness));
+    }
+
+    if (total_expand_trials > 0) {
+      double average_expand_failures =
+          1.0 * total_expand_failures / total_expand_trials;
+      fprintf(stderr, "Average expand failures, and overhead: %g, %g\n",
+              average_expand_failures,
+              total_expand_overhead / total_expand_trials);
+      // Seems to be a generous allowance
+      EXPECT_LE(total_expand_failures,
+                InfrequentPoissonUpperBound(1.0 * total_expand_trials));
+    } else {
+      fprintf(stderr, "Average expand failures: N/A\n");
+    }
+
+    if (total_singles > 0) {
+      double single_failure_rate = 1.0 * total_single_failures / total_singles;
+      fprintf(stderr, "Add'l single, failure rate: %g\n", single_failure_rate);
+      // A rough bound (one sided) based on nothing in particular
+      double expected_single_failures = 1.0 * total_singles /
+                                        (sizeof(CoeffRow) == 16 ? 128
+                                         : TypeParam::kUseSmash ? 64
+                                                                : 32);
+      EXPECT_LE(total_single_failures,
+                InfrequentPoissonUpperBound(expected_single_failures));
+    }
+
+    if (total_batch > 0) {
+      // Counting successes here for Poisson to approximate the Binomial
+      // distribution.
+      // A rough bound (one sided) based on nothing in particular.
+      double expected_batch_successes = 1.0 * total_batch / 2;
+      uint64_t lower_bound =
+          InfrequentPoissonLowerBound(expected_batch_successes);
+      fprintf(stderr, "Add'l batch, success rate: %g (>= %g)\n",
+              1.0 * total_batch_successes / total_batch,
+              1.0 * lower_bound / total_batch);
+      EXPECT_GE(total_batch_successes, lower_bound);
+    }
+
+    {
+      uint64_t total_checked = uint64_t{FLAGS_max_check} * FLAGS_thoroughness;
+      double expected_total_fp_count =
+          total_checked * std::pow(0.5, 8U * sizeof(ResultRow));
+      // For expected FP rate, also include false positives due to collisions
+      // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
+      double average_added = 1.0 * total_added / FLAGS_thoroughness;
+      expected_total_fp_count +=
+          total_checked * ExpectedCollisionFpRate(Hasher(), average_added);
+
+      uint64_t upper_bound =
+          InfrequentPoissonUpperBound(expected_total_fp_count);
+      uint64_t lower_bound =
+          InfrequentPoissonLowerBound(expected_total_fp_count);
+      fprintf(stderr, "Average FP rate: %g (~= %g, <= %g, >= %g)\n",
+              1.0 * total_fp_count / total_checked,
+              expected_total_fp_count / total_checked,
+              1.0 * upper_bound / total_checked,
+              1.0 * lower_bound / total_checked);
+      EXPECT_LE(total_fp_count, upper_bound);
+      EXPECT_GE(total_fp_count, lower_bound);
+    }
+  }
+}
+
+TYPED_TEST(RibbonTypeParamTest, Extremes) {
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
+  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
+  using KeyGen = typename TypeParam::KeyGen;
+
+  size_t bytes = 128 * 1024;
+  std::unique_ptr<char[]> buf(new char[bytes]);
+  InterleavedSoln isoln(buf.get(), bytes);
+  SimpleSoln soln;
+  Hasher hasher;
+  Banding banding;
+
+  // ########################################
+  // Add zero keys to minimal number of slots
+  KeyGen begin_and_end("foo", 123);
+  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
+      /*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*first seed*/ 0,
+      /* seed mask*/ 0));
+
+  soln.BackSubstFrom(banding);
+  isoln.BackSubstFrom(banding);
+
+  // Because there's plenty of memory, we expect the interleaved solution to
+  // use maximum supported columns (same as simple solution)
+  ASSERT_EQ(isoln.GetUpperNumColumns(), 8U * sizeof(ResultRow));
+  ASSERT_EQ(isoln.GetUpperStartBlock(), 0U);
+
+  // Somewhat oddly, we expect same FP rate as if we had essentially filled
+  // up the slots.
+  KeyGen other_keys_begin("not", 0);
+  KeyGen other_keys_end("not", FLAGS_max_check);
+
+  Index fp_count = 0;
+  KeyGen cur = other_keys_begin;
+  while (cur != other_keys_end) {
+    bool isoln_query_result = isoln.FilterQuery(*cur, hasher);
+    bool soln_query_result = soln.FilterQuery(*cur, hasher);
+    // Solutions are equivalent
+    ASSERT_EQ(isoln_query_result, soln_query_result);
+    if (!TypeParam::kHomogeneous) {
+      // And in fact we only expect an FP when ResultRow is 0
+      // (except Homogeneous)
+      ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash(
+                                       hasher.GetHash(*cur)) == ResultRow{0});
+    }
+    fp_count += soln_query_result ? 1 : 0;
+    ++cur;
+  }
+  {
+    ASSERT_EQ(isoln.ExpectedFpRate(), soln.ExpectedFpRate());
+    double expected_fp_count = isoln.ExpectedFpRate() * FLAGS_max_check;
+    EXPECT_LE(fp_count, InfrequentPoissonUpperBound(expected_fp_count));
+    if (TypeParam::kHomogeneous) {
+      // Pseudorandom garbage in Homogeneous filter can "beat the odds" if
+      // nothing added
+    } else {
+      EXPECT_GE(fp_count, InfrequentPoissonLowerBound(expected_fp_count));
+    }
+  }
+
+  // ######################################################
+  // Use zero bytes for interleaved solution (key(s) added)
+
+  // Add one key
+  KeyGen key_begin("added", 0);
+  KeyGen key_end("added", 1);
+  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
+      /*slots*/ kCoeffBits, key_begin, key_end, /*first seed*/ 0,
+      /* seed mask*/ 0));
+
+  InterleavedSoln isoln2(nullptr, /*bytes*/ 0);
+
+  isoln2.BackSubstFrom(banding);
+
+  ASSERT_EQ(isoln2.GetUpperNumColumns(), 0U);
+  ASSERT_EQ(isoln2.GetUpperStartBlock(), 0U);
+
+  // All queries return true
+  ASSERT_TRUE(isoln2.FilterQuery(*other_keys_begin, hasher));
+  ASSERT_EQ(isoln2.ExpectedFpRate(), 1.0);
+}
+
+TEST(RibbonTest, AllowZeroStarts) {
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings_AllowZeroStarts);
+  IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings_AllowZeroStarts);
+  using KeyGen = StandardKeyGen;
+
+  InterleavedSoln isoln(nullptr, /*bytes*/ 0);
+  SimpleSoln soln;
+  Hasher hasher;
+  Banding banding;
+
+  KeyGen begin("foo", 0);
+  KeyGen end("foo", 1);
+  // Can't add 1 entry
+  ASSERT_FALSE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end));
+
+  KeyGen begin_and_end("foo", 123);
+  // Can add 0 entries
+  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin_and_end,
+                                              begin_and_end));
+
+  Seed reseeds = banding.GetOrdinalSeed();
+  ASSERT_EQ(reseeds, 0U);
+  hasher.SetOrdinalSeed(reseeds);
+
+  // Can construct 0-slot solutions
+  isoln.BackSubstFrom(banding);
+  soln.BackSubstFrom(banding);
+
+  // Should always return false
+  ASSERT_FALSE(isoln.FilterQuery(*begin, hasher));
+  ASSERT_FALSE(soln.FilterQuery(*begin, hasher));
+
+  // And report that in FP rate
+  ASSERT_EQ(isoln.ExpectedFpRate(), 0.0);
+  ASSERT_EQ(soln.ExpectedFpRate(), 0.0);
+}
+
+TEST(RibbonTest, RawAndOrdinalSeeds) {
+  StandardHasher<TypesAndSettings_Seed64> hasher64;
+  StandardHasher<DefaultTypesAndSettings> hasher64_32;
+  StandardHasher<TypesAndSettings_Hash32> hasher32;
+  StandardHasher<TypesAndSettings_Seed8> hasher8;
+
+  for (uint32_t limit : {0xffU, 0xffffU}) {
+    std::vector<bool> seen(limit + 1);
+    for (uint32_t i = 0; i < limit; ++i) {
+      hasher64.SetOrdinalSeed(i);
+      auto raw64 = hasher64.GetRawSeed();
+      hasher32.SetOrdinalSeed(i);
+      auto raw32 = hasher32.GetRawSeed();
+      hasher8.SetOrdinalSeed(static_cast<uint8_t>(i));
+      auto raw8 = hasher8.GetRawSeed();
+      {
+        hasher64_32.SetOrdinalSeed(i);
+        auto raw64_32 = hasher64_32.GetRawSeed();
+        ASSERT_EQ(raw64_32, raw32);  // Same size seed
+      }
+      if (i == 0) {
+        // Documented that ordinal seed 0 == raw seed 0
+        ASSERT_EQ(raw64, 0U);
+        ASSERT_EQ(raw32, 0U);
+        ASSERT_EQ(raw8, 0U);
+      } else {
+        // Extremely likely that upper bits are set
+        ASSERT_GT(raw64, raw32);
+        ASSERT_GT(raw32, raw8);
+      }
+      // Hashers agree on lower bits
+      ASSERT_EQ(static_cast<uint32_t>(raw64), raw32);
+      ASSERT_EQ(static_cast<uint8_t>(raw32), raw8);
+
+      // The translation is one-to-one for this size prefix
+      uint32_t v = static_cast<uint32_t>(raw32 & limit);
+      ASSERT_EQ(raw64 & limit, v);
+      ASSERT_FALSE(seen[v]);
+      seen[v] = true;
+    }
+  }
+}
+
+namespace {
+
+struct PhsfInputGen {
+  PhsfInputGen(const std::string& prefix, uint64_t id) : id_(id) {
+    val_.first = prefix;
+    ROCKSDB_NAMESPACE::PutFixed64(&val_.first, /*placeholder*/ 0);
+  }
+
+  // Prefix (only one required)
+  PhsfInputGen& operator++() {
+    ++id_;
+    return *this;
+  }
+
+  const std::pair<std::string, uint8_t>& operator*() {
+    // Use multiplication to mix things up a little in the key
+    ROCKSDB_NAMESPACE::EncodeFixed64(&val_.first[val_.first.size() - 8],
+                                     id_ * uint64_t{0x1500000001});
+    // Occasionally repeat values etc.
+    val_.second = static_cast<uint8_t>(id_ * 7 / 8);
+    return val_;
+  }
+
+  const std::pair<std::string, uint8_t>* operator->() { return &**this; }
+
+  bool operator==(const PhsfInputGen& other) {
+    // Same prefix is assumed
+    return id_ == other.id_;
+  }
+  bool operator!=(const PhsfInputGen& other) {
+    // Same prefix is assumed
+    return id_ != other.id_;
+  }
+
+  uint64_t id_;
+  std::pair<std::string, uint8_t> val_;
+};
+
+struct PhsfTypesAndSettings : public DefaultTypesAndSettings {
+  static constexpr bool kIsFilter = false;
+};
+}  // namespace
+
+TEST(RibbonTest, PhsfBasic) {
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(PhsfTypesAndSettings);
+  IMPORT_RIBBON_IMPL_TYPES(PhsfTypesAndSettings);
+
+  Index num_slots = 12800;
+  Index num_to_add = static_cast<Index>(num_slots / 1.02);
+
+  PhsfInputGen begin("in", 0);
+  PhsfInputGen end("in", num_to_add);
+
+  std::unique_ptr<char[]> idata(new char[/*bytes*/ num_slots]);
+  InterleavedSoln isoln(idata.get(), /*bytes*/ num_slots);
+  SimpleSoln soln;
+  Hasher hasher;
+
+  {
+    Banding banding;
+    ASSERT_TRUE(banding.ResetAndFindSeedToSolve(num_slots, begin, end));
+
+    soln.BackSubstFrom(banding);
+    isoln.BackSubstFrom(banding);
+
+    hasher.SetOrdinalSeed(banding.GetOrdinalSeed());
+  }
+
+  for (PhsfInputGen cur = begin; cur != end; ++cur) {
+    ASSERT_EQ(cur->second, soln.PhsfQuery(cur->first, hasher));
+    ASSERT_EQ(cur->second, isoln.PhsfQuery(cur->first, hasher));
+  }
+}
+
+// Not a real test, but a tool used to build APIs in ribbon_config.h
+TYPED_TEST(RibbonTypeParamTest, FindOccupancy) {
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
+  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
+  using KeyGen = typename TypeParam::KeyGen;
+
+  if (!FLAGS_find_occ) {
+    ROCKSDB_GTEST_BYPASS("Tool disabled during unit test runs");
+    return;
+  }
+
+  KeyGen cur(std::to_string(testing::UnitTest::GetInstance()->random_seed()),
+             0);
+
+  Banding banding;
+  Index num_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_min_slots);
+  Index max_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_max_slots);
+  while (num_slots <= max_slots) {
+    std::map<int32_t, uint32_t> rem_histogram;
+    std::map<Index, uint32_t> slot_histogram;
+    if (FLAGS_find_slot_occ) {
+      for (Index i = 0; i < kCoeffBits; ++i) {
+        slot_histogram[i] = 0;
+        slot_histogram[num_slots - 1 - i] = 0;
+        slot_histogram[num_slots / 2 - kCoeffBits / 2 + i] = 0;
+      }
+    }
+    uint64_t total_added = 0;
+    for (uint32_t i = 0; i < FLAGS_find_iters; ++i) {
+      banding.Reset(num_slots);
+      uint32_t j = 0;
+      KeyGen end = cur;
+      end += num_slots + num_slots / 10;
+      for (; cur != end; ++cur) {
+        if (banding.Add(*cur)) {
+          ++j;
+        } else {
+          break;
+        }
+      }
+      total_added += j;
+      for (auto& slot : slot_histogram) {
+        slot.second += banding.IsOccupied(slot.first);
+      }
+
+      int32_t bucket =
+          static_cast<int32_t>(num_slots) - static_cast<int32_t>(j);
+      rem_histogram[bucket]++;
+      if (FLAGS_verbose) {
+        fprintf(stderr, "num_slots: %u i: %u / %u avg_overhead: %g\r",
+                static_cast<unsigned>(num_slots), static_cast<unsigned>(i),
+                static_cast<unsigned>(FLAGS_find_iters),
+                1.0 * (i + 1) * num_slots / total_added);
+      }
+    }
+    if (FLAGS_verbose) {
+      fprintf(stderr, "\n");
+    }
+
+    uint32_t cumulative = 0;
+
+    double p50_rem = 0;
+    double p95_rem = 0;
+    double p99_9_rem = 0;
+
+    for (auto& h : rem_histogram) {
+      double before = 1.0 * cumulative / FLAGS_find_iters;
+      double not_after = 1.0 * (cumulative + h.second) / FLAGS_find_iters;
+      if (FLAGS_verbose) {
+        fprintf(stderr, "overhead: %g before: %g not_after: %g\n",
+                1.0 * num_slots / (num_slots - h.first), before, not_after);
+      }
+      cumulative += h.second;
+      if (before < 0.5 && 0.5 <= not_after) {
+        // fake it with linear interpolation
+        double portion = (0.5 - before) / (not_after - before);
+        p50_rem = h.first + portion;
+      } else if (before < 0.95 && 0.95 <= not_after) {
+        // fake it with linear interpolation
+        double portion = (0.95 - before) / (not_after - before);
+        p95_rem = h.first + portion;
+      } else if (before < 0.999 && 0.999 <= not_after) {
+        // fake it with linear interpolation
+        double portion = (0.999 - before) / (not_after - before);
+        p99_9_rem = h.first + portion;
+      }
+    }
+    for (auto& slot : slot_histogram) {
+      fprintf(stderr, "slot[%u] occupied: %g\n", (unsigned)slot.first,
+              1.0 * slot.second / FLAGS_find_iters);
+    }
+
+    double mean_rem =
+        (1.0 * FLAGS_find_iters * num_slots - total_added) / FLAGS_find_iters;
+    fprintf(
+        stderr,
+        "num_slots: %u iters: %u mean_ovr: %g p50_ovr: %g p95_ovr: %g "
+        "p99.9_ovr: %g mean_rem: %g p50_rem: %g p95_rem: %g p99.9_rem: %g\n",
+        static_cast<unsigned>(num_slots),
+        static_cast<unsigned>(FLAGS_find_iters),
+        1.0 * num_slots / (num_slots - mean_rem),
+        1.0 * num_slots / (num_slots - p50_rem),
+        1.0 * num_slots / (num_slots - p95_rem),
+        1.0 * num_slots / (num_slots - p99_9_rem), mean_rem, p50_rem, p95_rem,
+        p99_9_rem);
+
+    num_slots = std::max(
+        num_slots + 1, static_cast<Index>(num_slots * FLAGS_find_next_factor));
+    num_slots = InterleavedSoln::RoundUpNumSlots(num_slots);
+  }
+}
+
+// Not a real test, but a tool to understand Homogeneous Ribbon
+// behavior (TODO: configuration APIs & tests)
+TYPED_TEST(RibbonTypeParamTest, OptimizeHomogAtScale) {
+  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
+  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
+  using KeyGen = typename TypeParam::KeyGen;
+
+  if (!FLAGS_optimize_homog) {
+    ROCKSDB_GTEST_BYPASS("Tool disabled during unit test runs");
+    return;
+  }
+
+  if (!TypeParam::kHomogeneous) {
+    ROCKSDB_GTEST_BYPASS("Only for Homogeneous Ribbon");
+    return;
+  }
+
+  KeyGen cur(std::to_string(testing::UnitTest::GetInstance()->random_seed()),
+             0);
+
+  Banding banding;
+  Index num_slots = SimpleSoln::RoundUpNumSlots(FLAGS_optimize_homog_slots);
+  banding.Reset(num_slots);
+
+  // This and "band_ovr" is the "allocated overhead", or slots over added.
+  // It does not take into account FP rates.
+  double target_overhead = 1.20;
+  uint32_t num_added = 0;
+
+  do {
+    do {
+      (void)banding.Add(*cur);
+      ++cur;
+      ++num_added;
+    } while (1.0 * num_slots / num_added > target_overhead);
+
+    SimpleSoln soln;
+    soln.BackSubstFrom(banding);
+
+    std::array<uint32_t, 8U * sizeof(ResultRow)> fp_counts_by_cols;
+    fp_counts_by_cols.fill(0U);
+    for (uint32_t i = 0; i < FLAGS_optimize_homog_check; ++i) {
+      ResultRow r = soln.PhsfQuery(*cur, banding);
+      ++cur;
+      for (size_t j = 0; j < fp_counts_by_cols.size(); ++j) {
+        if ((r & 1) == 1) {
+          break;
+        }
+        fp_counts_by_cols[j]++;
+        r /= 2;
+      }
+    }
+    fprintf(stderr, "band_ovr: %g ", 1.0 * num_slots / num_added);
+    for (unsigned j = 0; j < fp_counts_by_cols.size(); ++j) {
+      double inv_fp_rate =
+          1.0 * FLAGS_optimize_homog_check / fp_counts_by_cols[j];
+      double equiv_cols = std::log(inv_fp_rate) * 1.4426950409;
+      // Overhead vs. information-theoretic minimum based on observed
+      // FP rate (subject to sampling error, especially for low FP rates)
+      double actual_overhead =
+          1.0 * (j + 1) * num_slots / (equiv_cols * num_added);
+      fprintf(stderr, "ovr_%u: %g ", j + 1, actual_overhead);
+    }
+    fprintf(stderr, "\n");
+    target_overhead -= FLAGS_optimize_homog_granularity;
+  } while (target_overhead > 1.0);
+}
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/set_comparator.h b/src/rocksdb/util/set_comparator.h
new file mode 100644
index 000000000..e0e64436a
--- /dev/null
+++ b/src/rocksdb/util/set_comparator.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A comparator to be used in std::set
+struct SetComparator {
+  explicit SetComparator() : user_comparator_(BytewiseComparator()) {}
+  explicit SetComparator(const Comparator* user_comparator)
+      : user_comparator_(user_comparator ? user_comparator
+                                         : BytewiseComparator()) {}
+  bool operator()(const Slice& lhs, const Slice& rhs) const {
+    return user_comparator_->Compare(lhs, rhs) < 0;
+  }
+
+ private:
+  const Comparator* user_comparator_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/single_thread_executor.h b/src/rocksdb/util/single_thread_executor.h
new file mode 100644
index 000000000..c69f2a292
--- /dev/null
+++ b/src/rocksdb/util/single_thread_executor.h
@@ -0,0 +1,56 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#if USE_COROUTINES
+#include <atomic>
+
+#include "folly/CPortability.h"
+#include "folly/CppAttributes.h"
+#include "folly/Executor.h"
+#include "util/async_file_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Implements a simple executor that runs callback functions in the same
+// thread, unlike CPUThreadExecutor which may schedule the callback on
+// another thread. Runs in a tight loop calling the queued callbacks,
+// and polls for async IO completions when idle. The completions will
+// resume suspended coroutines and they get added to the queue, which
+// will get picked up by this loop.
+// Any possibility of deadlock is precluded because the file system
+// guarantees that async IO completion callbacks will not be scheduled
+// to run in this thread or this executor.
+class SingleThreadExecutor : public folly::Executor {
+ public:
+  explicit SingleThreadExecutor(AsyncFileReader& reader)
+      : reader_(reader), busy_(false) {}
+
+  void add(folly::Func callback) override {
+    auto& q = q_;
+    q.push(std::move(callback));
+    if (q.size() == 1 && !busy_) {
+      while (!q.empty()) {
+        q.front()();
+        q.pop();
+
+        if (q.empty()) {
+          // Prevent recursion, as the Wait may queue resumed coroutines
+          busy_ = true;
+          reader_.Wait();
+          busy_ = false;
+        }
+      }
+    }
+  }
+
+ private:
+  std::queue<folly::Func> q_;
+  AsyncFileReader& reader_;
+  bool busy_;
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // USE_COROUTINES
diff --git a/src/rocksdb/util/slice.cc b/src/rocksdb/util/slice.cc
new file mode 100644
index 000000000..1fa21afcb
--- /dev/null
+++ b/src/rocksdb/util/slice.cc
@@ -0,0 +1,405 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/slice.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class FixedPrefixTransform : public SliceTransform {
+ private:
+  size_t prefix_len_;
+  std::string id_;
+
+ public:
+  explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) {
+    id_ = std::string(kClassName()) + "." + std::to_string(prefix_len_);
+  }
+
+  static const char* kClassName() { return "rocksdb.FixedPrefix"; }
+  static const char* kNickName() { return "fixed"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == id_) {
+      return true;
+    } else if (StartsWith(name, kNickName())) {
+      std::string alt_id =
+          std::string(kNickName()) + ":" + std::to_string(prefix_len_);
+      if (name == alt_id) {
+        return true;
+      }
+    }
+    return SliceTransform::IsInstanceOf(name);
+  }
+
+  std::string GetId() const override { return id_; }
+
+  Slice Transform(const Slice& src) const override {
+    assert(InDomain(src));
+    return Slice(src.data(), prefix_len_);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    return (src.size() >= prefix_len_);
+  }
+
+  bool InRange(const Slice& dst) const override {
+    return (dst.size() == prefix_len_);
+  }
+
+  bool FullLengthEnabled(size_t* len) const override {
+    *len = prefix_len_;
+    return true;
+  }
+
+  bool SameResultWhenAppended(const Slice& prefix) const override {
+    return InDomain(prefix);
+  }
+};
+
+class CappedPrefixTransform : public SliceTransform {
+ private:
+  size_t cap_len_;
+  std::string id_;
+
+ public:
+  explicit CappedPrefixTransform(size_t cap_len) : cap_len_(cap_len) {
+    id_ = std::string(kClassName()) + "." + std::to_string(cap_len_);
+  }
+
+  static const char* kClassName() { return "rocksdb.CappedPrefix"; }
+  static const char* kNickName() { return "capped"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+  std::string GetId() const override { return id_; }
+
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == id_) {
+      return true;
+    } else if (StartsWith(name, kNickName())) {
+      std::string alt_id =
+          std::string(kNickName()) + ":" + std::to_string(cap_len_);
+      if (name == alt_id) {
+        return true;
+      }
+    }
+    return SliceTransform::IsInstanceOf(name);
+  }
+
+  Slice Transform(const Slice& src) const override {
+    assert(InDomain(src));
+    return Slice(src.data(), std::min(cap_len_, src.size()));
+  }
+
+  bool InDomain(const Slice& /*src*/) const override { return true; }
+
+  bool InRange(const Slice& dst) const override {
+    return (dst.size() <= cap_len_);
+  }
+
+  bool FullLengthEnabled(size_t* len) const override {
+    *len = cap_len_;
+    return true;
+  }
+
+  bool SameResultWhenAppended(const Slice& prefix) const override {
+    return prefix.size() >= cap_len_;
+  }
+};
+
+class NoopTransform : public SliceTransform {
+ public:
+  explicit NoopTransform() {}
+
+  static const char* kClassName() { return "rocksdb.Noop"; }
+  const char* Name() const override { return kClassName(); }
+
+  Slice Transform(const Slice& src) const override { return src; }
+
+  bool InDomain(const Slice& /*src*/) const override { return true; }
+
+  bool InRange(const Slice& /*dst*/) const override { return true; }
+
+  bool SameResultWhenAppended(const Slice& /*prefix*/) const override {
+    return false;
+  }
+};
+
+}  // end namespace
+
+const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) {
+  return new FixedPrefixTransform(prefix_len);
+}
+
+const SliceTransform* NewCappedPrefixTransform(size_t cap_len) {
+  return new CappedPrefixTransform(cap_len);
+}
+
+const SliceTransform* NewNoopTransform() { return new NoopTransform; }
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinSliceTransform(ObjectLibrary& library,
+                                         const std::string& /*arg*/) {
+  // For the builtin transforms, the format is typically
+  // [Name].[0-9]+ or [NickName]:[0-9]+
+  library.AddFactory<const SliceTransform>(
+      NoopTransform::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(NewNoopTransform());
+        return guard->get();
+      });
+  library.AddFactory<const SliceTransform>(
+      ObjectLibrary::PatternEntry(FixedPrefixTransform::kNickName(), false)
+          .AddNumber(":"),
+      [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        auto colon = uri.find(":");
+        auto len = ParseSizeT(uri.substr(colon + 1));
+        guard->reset(NewFixedPrefixTransform(len));
+        return guard->get();
+      });
+  library.AddFactory<const SliceTransform>(
+      ObjectLibrary::PatternEntry(FixedPrefixTransform::kClassName(), false)
+          .AddNumber("."),
+      [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        auto len = ParseSizeT(
+            uri.substr(strlen(FixedPrefixTransform::kClassName()) + 1));
+        guard->reset(NewFixedPrefixTransform(len));
+        return guard->get();
+      });
+  library.AddFactory<const SliceTransform>(
+      ObjectLibrary::PatternEntry(CappedPrefixTransform::kNickName(), false)
+          .AddNumber(":"),
+      [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        auto colon = uri.find(":");
+        auto len = ParseSizeT(uri.substr(colon + 1));
+        guard->reset(NewCappedPrefixTransform(len));
+        return guard->get();
+      });
+  library.AddFactory<const SliceTransform>(
+      ObjectLibrary::PatternEntry(CappedPrefixTransform::kClassName(), false)
+          .AddNumber("."),
+      [](const std::string& uri, std::unique_ptr<const SliceTransform>* guard,
+         std::string* /*errmsg*/) {
+        auto len = ParseSizeT(
+            uri.substr(strlen(CappedPrefixTransform::kClassName()) + 1));
+        guard->reset(NewCappedPrefixTransform(len));
+        return guard->get();
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+
+Status SliceTransform::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<const SliceTransform>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinSliceTransform(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (id.empty() && opt_map.empty()) {
+    result->reset();
+  } else {
+#ifndef ROCKSDB_LITE
+    status = config_options.registry->NewSharedObject(id, result);
+#else
+    auto Matches = [](const std::string& input, size_t size,
+                      const char* pattern, char sep) {
+      auto plen = strlen(pattern);
+      return (size > plen + 2 && input[plen] == sep &&
+              StartsWith(input, pattern));
+    };
+
+    auto size = id.size();
+    if (id == NoopTransform::kClassName()) {
+      result->reset(NewNoopTransform());
+    } else if (Matches(id, size, FixedPrefixTransform::kNickName(), ':')) {
+      auto fixed = strlen(FixedPrefixTransform::kNickName());
+      auto len = ParseSizeT(id.substr(fixed + 1));
+      result->reset(NewFixedPrefixTransform(len));
+    } else if (Matches(id, size, CappedPrefixTransform::kNickName(), ':')) {
+      auto capped = strlen(CappedPrefixTransform::kNickName());
+      auto len = ParseSizeT(id.substr(capped + 1));
+      result->reset(NewCappedPrefixTransform(len));
+    } else if (Matches(id, size, CappedPrefixTransform::kClassName(), '.')) {
+      auto capped = strlen(CappedPrefixTransform::kClassName());
+      auto len = ParseSizeT(id.substr(capped + 1));
+      result->reset(NewCappedPrefixTransform(len));
+    } else if (Matches(id, size, FixedPrefixTransform::kClassName(), '.')) {
+      auto fixed = strlen(FixedPrefixTransform::kClassName());
+      auto len = ParseSizeT(id.substr(fixed + 1));
+      result->reset(NewFixedPrefixTransform(len));
+    } else {
+      status = Status::NotSupported("Cannot load object in LITE mode ", id);
+    }
+#endif  // ROCKSDB_LITE
+    if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+      return Status::OK();
+    } else if (status.ok()) {
+      SliceTransform* transform = const_cast<SliceTransform*>(result->get());
+      status =
+          Customizable::ConfigureNewObject(config_options, transform, opt_map);
+    }
+  }
+  return status;
+}
+
+std::string SliceTransform::AsString() const {
+#ifndef ROCKSDB_LITE
+  if (HasRegisteredOptions()) {
+    ConfigOptions opts;
+    opts.delimiter = ";";
+    return ToString(opts);
+  }
+#endif  // ROCKSDB_LITE
+  return GetId();
+}
+
+// 2 small internal utility functions, for efficient hex conversions
+// and no need for snprintf, toupper etc...
+// Originally from wdt/util/EncryptionUtils.cpp - for
+// std::to_string(true)/DecodeHex:
+char toHex(unsigned char v) {
+  if (v <= 9) {
+    return '0' + v;
+  }
+  return 'A' + v - 10;
+}
+// most of the code is for validation/error check
+int fromHex(char c) {
+  // toupper:
+  if (c >= 'a' && c <= 'f') {
+    c -= ('a' - 'A');  // aka 0x20
+  }
+  // validation
+  if (c < '0' || (c > '9' && (c < 'A' || c > 'F'))) {
+    return -1;  // invalid not 0-9A-F hex char
+  }
+  if (c <= '9') {
+    return c - '0';
+  }
+  return c - 'A' + 10;
+}
+
+Slice::Slice(const SliceParts& parts, std::string* buf) {
+  size_t length = 0;
+  for (int i = 0; i < parts.num_parts; ++i) {
+    length += parts.parts[i].size();
+  }
+  buf->reserve(length);
+
+  for (int i = 0; i < parts.num_parts; ++i) {
+    buf->append(parts.parts[i].data(), parts.parts[i].size());
+  }
+  data_ = buf->data();
+  size_ = buf->size();
+}
+
+// Return a string that contains the copy of the referenced data.
+std::string Slice::ToString(bool hex) const {
+  std::string result;  // RVO/NRVO/move
+  if (hex) {
+    result.reserve(2 * size_);
+    for (size_t i = 0; i < size_; ++i) {
+      unsigned char c = data_[i];
+      result.push_back(toHex(c >> 4));
+      result.push_back(toHex(c & 0xf));
+    }
+    return result;
+  } else {
+    result.assign(data_, size_);
+    return result;
+  }
+}
+
+// Originally from rocksdb/utilities/ldb_cmd.h
+bool Slice::DecodeHex(std::string* result) const {
+  std::string::size_type len = size_;
+  if (len % 2) {
+    // Hex string must be even number of hex digits to get complete bytes back
+    return false;
+  }
+  if (!result) {
+    return false;
+  }
+  result->clear();
+  result->reserve(len / 2);
+
+  for (size_t i = 0; i < len;) {
+    int h1 = fromHex(data_[i++]);
+    if (h1 < 0) {
+      return false;
+    }
+    int h2 = fromHex(data_[i++]);
+    if (h2 < 0) {
+      return false;
+    }
+    result->push_back(static_cast<char>((h1 << 4) | h2));
+  }
+  return true;
+}
+
+PinnableSlice::PinnableSlice(PinnableSlice&& other) {
+  *this = std::move(other);
+}
+
+PinnableSlice& PinnableSlice::operator=(PinnableSlice&& other) {
+  if (this != &other) {
+    Cleanable::Reset();
+    Cleanable::operator=(std::move(other));
+    size_ = other.size_;
+    pinned_ = other.pinned_;
+    if (pinned_) {
+      data_ = other.data_;
+      // When it's pinned, buf should no longer be of use.
+    } else {
+      if (other.buf_ == &other.self_space_) {
+        self_space_ = std::move(other.self_space_);
+        buf_ = &self_space_;
+        data_ = buf_->data();
+      } else {
+        buf_ = other.buf_;
+        data_ = other.data_;
+      }
+    }
+    other.self_space_.clear();
+    other.buf_ = &other.self_space_;
+    other.pinned_ = false;
+    other.PinSelf();
+  }
+  return *this;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/slice_test.cc b/src/rocksdb/util/slice_test.cc
new file mode 100644
index 000000000..e1c35d567
--- /dev/null
+++ b/src/rocksdb/util/slice_test.cc
@@ -0,0 +1,191 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/slice.h"
+
+#include <gtest/gtest.h>
+
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/data_structure.h"
+#include "rocksdb/types.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(SliceTest, StringView) {
+  std::string s = "foo";
+  std::string_view sv = s;
+  ASSERT_EQ(Slice(s), Slice(sv));
+  ASSERT_EQ(Slice(s), Slice(std::move(sv)));
+}
+
+// Use this to keep track of the cleanups that were actually performed
+void Multiplier(void* arg1, void* arg2) {
+  int* res = reinterpret_cast<int*>(arg1);
+  int* num = reinterpret_cast<int*>(arg2);
+  *res *= *num;
+}
+
+class PinnableSliceTest : public testing::Test {
+ public:
+  void AssertSameData(const std::string& expected, const PinnableSlice& slice) {
+    std::string got;
+    got.assign(slice.data(), slice.size());
+    ASSERT_EQ(expected, got);
+  }
+};
+
+// Test that the external buffer is moved instead of being copied.
+TEST_F(PinnableSliceTest, MoveExternalBuffer) {
+  Slice s("123");
+  std::string buf;
+  PinnableSlice v1(&buf);
+  v1.PinSelf(s);
+
+  PinnableSlice v2(std::move(v1));
+  ASSERT_EQ(buf.data(), v2.data());
+  ASSERT_EQ(&buf, v2.GetSelf());
+
+  PinnableSlice v3;
+  v3 = std::move(v2);
+  ASSERT_EQ(buf.data(), v3.data());
+  ASSERT_EQ(&buf, v3.GetSelf());
+}
+
+TEST_F(PinnableSliceTest, Move) {
+  int n2 = 2;
+  int res = 1;
+  const std::string const_str1 = "123";
+  const std::string const_str2 = "ABC";
+  Slice slice1(const_str1);
+  Slice slice2(const_str2);
+
+  {
+    // Test move constructor on a pinned slice.
+    res = 1;
+    PinnableSlice v1;
+    v1.PinSlice(slice1, Multiplier, &res, &n2);
+    PinnableSlice v2(std::move(v1));
+
+    // Since v1's Cleanable has been moved to v2,
+    // no cleanup should happen in Reset.
+    v1.Reset();
+    ASSERT_EQ(1, res);
+
+    AssertSameData(const_str1, v2);
+  }
+  // v2 is cleaned up.
+  ASSERT_EQ(2, res);
+
+  {
+    // Test move constructor on an unpinned slice.
+    PinnableSlice v1;
+    v1.PinSelf(slice1);
+    PinnableSlice v2(std::move(v1));
+
+    AssertSameData(const_str1, v2);
+  }
+
+  {
+    // Test move assignment from a pinned slice to
+    // another pinned slice.
+    res = 1;
+    PinnableSlice v1;
+    v1.PinSlice(slice1, Multiplier, &res, &n2);
+    PinnableSlice v2;
+    v2.PinSlice(slice2, Multiplier, &res, &n2);
+    v2 = std::move(v1);
+
+    // v2's Cleanable will be Reset before moving
+    // anything from v1.
+    ASSERT_EQ(2, res);
+    // Since v1's Cleanable has been moved to v2,
+    // no cleanup should happen in Reset.
+    v1.Reset();
+    ASSERT_EQ(2, res);
+
+    AssertSameData(const_str1, v2);
+  }
+  // The Cleanable moved from v1 to v2 will be Reset.
+  ASSERT_EQ(4, res);
+
+  {
+    // Test move assignment from a pinned slice to
+    // an unpinned slice.
+    res = 1;
+    PinnableSlice v1;
+    v1.PinSlice(slice1, Multiplier, &res, &n2);
+    PinnableSlice v2;
+    v2.PinSelf(slice2);
+    v2 = std::move(v1);
+
+    // Since v1's Cleanable has been moved to v2,
+    // no cleanup should happen in Reset.
+    v1.Reset();
+    ASSERT_EQ(1, res);
+
+    AssertSameData(const_str1, v2);
+  }
+  // The Cleanable moved from v1 to v2 will be Reset.
+  ASSERT_EQ(2, res);
+
+  {
+    // Test move assignment from an upinned slice to
+    // another unpinned slice.
+    PinnableSlice v1;
+    v1.PinSelf(slice1);
+    PinnableSlice v2;
+    v2.PinSelf(slice2);
+    v2 = std::move(v1);
+
+    AssertSameData(const_str1, v2);
+  }
+
+  {
+    // Test move assignment from an upinned slice to
+    // a pinned slice.
+    res = 1;
+    PinnableSlice v1;
+    v1.PinSelf(slice1);
+    PinnableSlice v2;
+    v2.PinSlice(slice2, Multiplier, &res, &n2);
+    v2 = std::move(v1);
+
+    // v2's Cleanable will be Reset before moving
+    // anything from v1.
+    ASSERT_EQ(2, res);
+
+    AssertSameData(const_str1, v2);
+  }
+  // No Cleanable is moved from v1 to v2, so no more cleanup.
+  ASSERT_EQ(2, res);
+}
+
+// ***************************************************************** //
+// Unit test for SmallEnumSet
+class SmallEnumSetTest : public testing::Test {
+ public:
+  SmallEnumSetTest() {}
+  ~SmallEnumSetTest() {}
+};
+
+TEST_F(SmallEnumSetTest, SmallSetTest) {
+  FileTypeSet fs;
+  ASSERT_TRUE(fs.Add(FileType::kIdentityFile));
+  ASSERT_FALSE(fs.Add(FileType::kIdentityFile));
+  ASSERT_TRUE(fs.Add(FileType::kInfoLogFile));
+  ASSERT_TRUE(fs.Contains(FileType::kIdentityFile));
+  ASSERT_FALSE(fs.Contains(FileType::kDBLockFile));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/slice_transform_test.cc b/src/rocksdb/util/slice_transform_test.cc
new file mode 100644
index 000000000..64ac8bb1f
--- /dev/null
+++ b/src/rocksdb/util/slice_transform_test.cc
@@ -0,0 +1,154 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/slice_transform.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SliceTransformTest : public testing::Test {};
+
+TEST_F(SliceTransformTest, CapPrefixTransform) {
+  std::string s;
+  s = "abcdefge";
+
+  std::unique_ptr<const SliceTransform> transform;
+
+  transform.reset(NewCappedPrefixTransform(6));
+  ASSERT_EQ(transform->Transform(s).ToString(), "abcdef");
+  ASSERT_TRUE(transform->SameResultWhenAppended("123456"));
+  ASSERT_TRUE(transform->SameResultWhenAppended("1234567"));
+  ASSERT_TRUE(!transform->SameResultWhenAppended("12345"));
+
+  transform.reset(NewCappedPrefixTransform(8));
+  ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge");
+
+  transform.reset(NewCappedPrefixTransform(10));
+  ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge");
+
+  transform.reset(NewCappedPrefixTransform(0));
+  ASSERT_EQ(transform->Transform(s).ToString(), "");
+
+  transform.reset(NewCappedPrefixTransform(0));
+  ASSERT_EQ(transform->Transform("").ToString(), "");
+}
+
+class SliceTransformDBTest : public testing::Test {
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+ public:
+  SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) {
+    dbname_ = test::PerThreadDBPath("slice_transform_db_test");
+    EXPECT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  ~SliceTransformDBTest() override {
+    delete db_;
+    EXPECT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  DB* db() { return db_; }
+
+  // Return the current option configuration.
+  Options* GetOptions() { return &last_options_; }
+
+  void DestroyAndReopen() {
+    // Destroy using last options
+    Destroy();
+    ASSERT_OK(TryReopen());
+  }
+
+  void Destroy() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  Status TryReopen() {
+    delete db_;
+    db_ = nullptr;
+    last_options_.create_if_missing = true;
+
+    return DB::Open(last_options_, dbname_, &db_);
+  }
+
+  Options last_options_;
+};
+
+namespace {
+uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
+  return options.statistics->getTickerCount(ticker_type);
+}
+}  // namespace
+
+TEST_F(SliceTransformDBTest, CapPrefix) {
+  last_options_.prefix_extractor.reset(NewCappedPrefixTransform(8));
+  last_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  last_options_.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ASSERT_OK(TryReopen());
+
+  ReadOptions ro;
+  FlushOptions fo;
+  WriteOptions wo;
+
+  ASSERT_OK(db()->Put(wo, "barbarbar", "foo"));
+  ASSERT_OK(db()->Put(wo, "barbarbar2", "foo2"));
+  ASSERT_OK(db()->Put(wo, "foo", "bar"));
+  ASSERT_OK(db()->Put(wo, "foo3", "bar3"));
+  ASSERT_OK(db()->Flush(fo));
+
+  std::unique_ptr<Iterator> iter(db()->NewIterator(ro));
+
+  iter->Seek("foo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->value().ToString(), "bar");
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U);
+
+  iter->Seek("foo2");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U);
+
+  iter->Seek("barbarbar");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->value().ToString(), "foo");
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U);
+
+  iter->Seek("barfoofoo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U);
+
+  iter->Seek("foobarbar");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/status.cc b/src/rocksdb/util/status.cc
new file mode 100644
index 000000000..72fdfdbcc
--- /dev/null
+++ b/src/rocksdb/util/status.cc
@@ -0,0 +1,154 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/status.h"
+
+#include <stdio.h>
+#ifdef OS_WIN
+#include <string.h>
+#endif
+#include <cstring>
+
+#include "port/port.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<const char[]> Status::CopyState(const char* s) {
+  const size_t cch = std::strlen(s) + 1;  // +1 for the null terminator
+  char* rv = new char[cch];
+  std::strncpy(rv, s, cch);
+  return std::unique_ptr<const char[]>(rv);
+}
+
+static const char* msgs[static_cast<int>(Status::kMaxSubCode)] = {
+    "",                                                   // kNone
+    "Timeout Acquiring Mutex",                            // kMutexTimeout
+    "Timeout waiting to lock key",                        // kLockTimeout
+    "Failed to acquire lock due to max_num_locks limit",  // kLockLimit
+    "No space left on device",                            // kNoSpace
+    "Deadlock",                                           // kDeadlock
+    "Stale file handle",                                  // kStaleFile
+    "Memory limit reached",                               // kMemoryLimit
+    "Space limit reached",                                // kSpaceLimit
+    "No such file or directory",                          // kPathNotFound
+    // KMergeOperandsInsufficientCapacity
+    "Insufficient capacity for merge operands",
+    // kManualCompactionPaused
+    "Manual compaction paused",
+    " (overwritten)",    // kOverwritten, subcode of OK
+    "Txn not prepared",  // kTxnNotPrepared
+    "IO fenced off",     // kIOFenced
+};
+
+Status::Status(Code _code, SubCode _subcode, const Slice& msg,
+               const Slice& msg2, Severity sev)
+    : code_(_code),
+      subcode_(_subcode),
+      sev_(sev),
+      retryable_(false),
+      data_loss_(false),
+      scope_(0) {
+  assert(subcode_ != kMaxSubCode);
+  const size_t len1 = msg.size();
+  const size_t len2 = msg2.size();
+  const size_t size = len1 + (len2 ? (2 + len2) : 0);
+  char* const result = new char[size + 1];  // +1 for null terminator
+  memcpy(result, msg.data(), len1);
+  if (len2) {
+    result[len1] = ':';
+    result[len1 + 1] = ' ';
+    memcpy(result + len1 + 2, msg2.data(), len2);
+  }
+  result[size] = '\0';  // null terminator for C style string
+  state_.reset(result);
+}
+
+std::string Status::ToString() const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+  checked_ = true;
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  const char* type = nullptr;
+  switch (code_) {
+    case kOk:
+      return "OK";
+    case kNotFound:
+      type = "NotFound: ";
+      break;
+    case kCorruption:
+      type = "Corruption: ";
+      break;
+    case kNotSupported:
+      type = "Not implemented: ";
+      break;
+    case kInvalidArgument:
+      type = "Invalid argument: ";
+      break;
+    case kIOError:
+      type = "IO error: ";
+      break;
+    case kMergeInProgress:
+      type = "Merge in progress: ";
+      break;
+    case kIncomplete:
+      type = "Result incomplete: ";
+      break;
+    case kShutdownInProgress:
+      type = "Shutdown in progress: ";
+      break;
+    case kTimedOut:
+      type = "Operation timed out: ";
+      break;
+    case kAborted:
+      type = "Operation aborted: ";
+      break;
+    case kBusy:
+      type = "Resource busy: ";
+      break;
+    case kExpired:
+      type = "Operation expired: ";
+      break;
+    case kTryAgain:
+      type = "Operation failed. Try again.: ";
+      break;
+    case kCompactionTooLarge:
+      type = "Compaction too large: ";
+      break;
+    case kColumnFamilyDropped:
+      type = "Column family dropped: ";
+      break;
+    case kMaxCode:
+      assert(false);
+      break;
+  }
+  char tmp[30];
+  if (type == nullptr) {
+    // This should not happen since `code_` should be a valid non-`kMaxCode`
+    // member of the `Code` enum. The above switch-statement should have had a
+    // case assigning `type` to a corresponding string.
+    assert(false);
+    snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", static_cast<int>(code()));
+    type = tmp;
+  }
+  std::string result(type);
+  if (subcode_ != kNone) {
+    uint32_t index = static_cast<int32_t>(subcode_);
+    assert(sizeof(msgs) / sizeof(msgs[0]) > index);
+    result.append(msgs[index]);
+  }
+
+  if (state_ != nullptr) {
+    if (subcode_ != kNone) {
+      result.append(": ");
+    }
+    result.append(state_.get());
+  }
+  return result;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/stderr_logger.cc b/src/rocksdb/util/stderr_logger.cc
new file mode 100644
index 000000000..6044b8b93
--- /dev/null
+++ b/src/rocksdb/util/stderr_logger.cc
@@ -0,0 +1,30 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/stderr_logger.h"
+
+#include "port/sys_time.h"
+
+namespace ROCKSDB_NAMESPACE {
+StderrLogger::~StderrLogger() {}
+
+void StderrLogger::Logv(const char* format, va_list ap) {
+  const uint64_t thread_id = Env::Default()->GetThreadID();
+
+  port::TimeVal now_tv;
+  port::GetTimeOfDay(&now_tv, nullptr);
+  const time_t seconds = now_tv.tv_sec;
+  struct tm t;
+  port::LocalTimeR(&seconds, &t);
+  fprintf(stderr, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", t.tm_year + 1900,
+          t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec,
+          static_cast<int>(now_tv.tv_usec),
+          static_cast<long long unsigned int>(thread_id));
+
+  vfprintf(stderr, format, ap);
+  fprintf(stderr, "\n");
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/stderr_logger.h b/src/rocksdb/util/stderr_logger.h
new file mode 100644
index 000000000..c3b01210c
--- /dev/null
+++ b/src/rocksdb/util/stderr_logger.h
@@ -0,0 +1,31 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Prints logs to stderr for faster debugging
+class StderrLogger : public Logger {
+ public:
+  explicit StderrLogger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL)
+      : Logger(log_level) {}
+
+  ~StderrLogger() override;
+
+  // Brings overloaded Logv()s into scope so they're not hidden when we override
+  // a subset of them.
+  using Logger::Logv;
+
+  virtual void Logv(const char* format, va_list ap) override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/stop_watch.h b/src/rocksdb/util/stop_watch.h
new file mode 100644
index 000000000..e26380d97
--- /dev/null
+++ b/src/rocksdb/util/stop_watch.h
@@ -0,0 +1,118 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "monitoring/statistics.h"
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Auto-scoped.
+// Records the measure time into the corresponding histogram if statistics
+// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr
+// and overwrite is true, it will be added to *elapsed if overwrite is false.
+class StopWatch {
+ public:
+  StopWatch(SystemClock* clock, Statistics* statistics,
+            const uint32_t hist_type, uint64_t* elapsed = nullptr,
+            bool overwrite = true, bool delay_enabled = false)
+      : clock_(clock),
+        statistics_(statistics),
+        hist_type_(hist_type),
+        elapsed_(elapsed),
+        overwrite_(overwrite),
+        stats_enabled_(statistics &&
+                       statistics->get_stats_level() >=
+                           StatsLevel::kExceptTimers &&
+                       statistics->HistEnabledForType(hist_type)),
+        delay_enabled_(delay_enabled),
+        total_delay_(0),
+        delay_start_time_(0),
+        start_time_((stats_enabled_ || elapsed != nullptr) ? clock->NowMicros()
+                                                           : 0) {}
+
+  ~StopWatch() {
+    if (elapsed_) {
+      if (overwrite_) {
+        *elapsed_ = clock_->NowMicros() - start_time_;
+      } else {
+        *elapsed_ += clock_->NowMicros() - start_time_;
+      }
+    }
+    if (elapsed_ && delay_enabled_) {
+      *elapsed_ -= total_delay_;
+    }
+    if (stats_enabled_) {
+      statistics_->reportTimeToHistogram(
+          hist_type_, (elapsed_ != nullptr)
+                          ? *elapsed_
+                          : (clock_->NowMicros() - start_time_));
+    }
+  }
+
+  void DelayStart() {
+    // if delay_start_time_ is not 0, it means we are already tracking delay,
+    // so delay_start_time_ should not be overwritten
+    if (elapsed_ && delay_enabled_ && delay_start_time_ == 0) {
+      delay_start_time_ = clock_->NowMicros();
+    }
+  }
+
+  void DelayStop() {
+    if (elapsed_ && delay_enabled_ && delay_start_time_ != 0) {
+      total_delay_ += clock_->NowMicros() - delay_start_time_;
+    }
+    // reset to 0 means currently no delay is being tracked, so two consecutive
+    // calls to DelayStop will not increase total_delay_
+    delay_start_time_ = 0;
+  }
+
+  uint64_t GetDelay() const { return delay_enabled_ ? total_delay_ : 0; }
+
+  uint64_t start_time() const { return start_time_; }
+
+ private:
+  SystemClock* clock_;
+  Statistics* statistics_;
+  const uint32_t hist_type_;
+  uint64_t* elapsed_;
+  bool overwrite_;
+  bool stats_enabled_;
+  bool delay_enabled_;
+  uint64_t total_delay_;
+  uint64_t delay_start_time_;
+  const uint64_t start_time_;
+};
+
+// a nano second precision stopwatch
+class StopWatchNano {
+ public:
+  explicit StopWatchNano(SystemClock* clock, bool auto_start = false)
+      : clock_(clock), start_(0) {
+    if (auto_start) {
+      Start();
+    }
+  }
+
+  void Start() { start_ = clock_->NowNanos(); }
+
+  uint64_t ElapsedNanos(bool reset = false) {
+    auto now = clock_->NowNanos();
+    auto elapsed = now - start_;
+    if (reset) {
+      start_ = now;
+    }
+    return elapsed;
+  }
+
+  uint64_t ElapsedNanosSafe(bool reset = false) {
+    return (clock_ != nullptr) ? ElapsedNanos(reset) : 0U;
+  }
+
+ private:
+  SystemClock* clock_;
+  uint64_t start_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/string_util.cc b/src/rocksdb/util/string_util.cc
new file mode 100644
index 000000000..324482a4c
--- /dev/null
+++ b/src/rocksdb/util/string_util.cc
@@ -0,0 +1,504 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "util/string_util.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <cmath>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "port/port.h"
+#include "port/sys_time.h"
+#include "rocksdb/slice.h"
+
+#ifndef __has_cpp_attribute
+#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) 0
+#else
+#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#endif
+
+#if ROCKSDB_HAS_CPP_ATTRIBUTE(maybe_unused) && __cplusplus >= 201703L
+#define ROCKSDB_MAYBE_UNUSED [[maybe_unused]]
+#elif ROCKSDB_HAS_CPP_ATTRIBUTE(gnu::unused) || __GNUC__
+#define ROCKSDB_MAYBE_UNUSED [[gnu::unused]]
+#else
+#define ROCKSDB_MAYBE_UNUSED
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kNullptrString = "nullptr";
+
+std::vector<std::string> StringSplit(const std::string& arg, char delim) {
+  std::vector<std::string> splits;
+  std::stringstream ss(arg);
+  std::string item;
+  while (std::getline(ss, item, delim)) {
+    splits.push_back(item);
+  }
+  return splits;
+}
+
+// for micros < 10ms, print "XX us".
+// for micros < 10sec, print "XX ms".
+// for micros >= 10 sec, print "XX sec".
+// for micros <= 1 hour, print Y:X M:S".
+// for micros > 1 hour, print Z:Y:X H:M:S".
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+                      bool fixed_format) {
+  if (micros < 10000 && !fixed_format) {
+    return snprintf(output, len, "%" PRIu64 " us", micros);
+  } else if (micros < 10000000 && !fixed_format) {
+    return snprintf(output, len, "%.3lf ms",
+                    static_cast<double>(micros) / 1000);
+  } else if (micros < 1000000l * 60 && !fixed_format) {
+    return snprintf(output, len, "%.3lf sec",
+                    static_cast<double>(micros) / 1000000);
+  } else if (micros < 1000000ll * 60 * 60 && !fixed_format) {
+    return snprintf(output, len, "%02" PRIu64 ":%05.3f M:S",
+                    micros / 1000000 / 60,
+                    static_cast<double>(micros % 60000000) / 1000000);
+  } else {
+    return snprintf(output, len, "%02" PRIu64 ":%02" PRIu64 ":%05.3f H:M:S",
+                    micros / 1000000 / 3600, (micros / 1000000 / 60) % 60,
+                    static_cast<double>(micros % 60000000) / 1000000);
+  }
+}
+
+// for sizes >=10TB, print "XXTB"
+// for sizes >=10GB, print "XXGB"
+// etc.
+// append file size summary to output and return the len
+int AppendHumanBytes(uint64_t bytes, char* output, int len) {
+  const uint64_t ull10 = 10;
+  if (bytes >= ull10 << 40) {
+    return snprintf(output, len, "%" PRIu64 "TB", bytes >> 40);
+  } else if (bytes >= ull10 << 30) {
+    return snprintf(output, len, "%" PRIu64 "GB", bytes >> 30);
+  } else if (bytes >= ull10 << 20) {
+    return snprintf(output, len, "%" PRIu64 "MB", bytes >> 20);
+  } else if (bytes >= ull10 << 10) {
+    return snprintf(output, len, "%" PRIu64 "KB", bytes >> 10);
+  } else {
+    return snprintf(output, len, "%" PRIu64 "B", bytes);
+  }
+}
+
+void AppendNumberTo(std::string* str, uint64_t num) {
+  char buf[30];
+  snprintf(buf, sizeof(buf), "%" PRIu64, num);
+  str->append(buf);
+}
+
+void AppendEscapedStringTo(std::string* str, const Slice& value) {
+  for (size_t i = 0; i < value.size(); i++) {
+    char c = value[i];
+    if (c >= ' ' && c <= '~') {
+      str->push_back(c);
+    } else {
+      char buf[10];
+      snprintf(buf, sizeof(buf), "\\x%02x",
+               static_cast<unsigned int>(c) & 0xff);
+      str->append(buf);
+    }
+  }
+}
+
+std::string NumberToHumanString(int64_t num) {
+  char buf[19];
+  int64_t absnum = num < 0 ? -num : num;
+  if (absnum < 10000) {
+    snprintf(buf, sizeof(buf), "%" PRIi64, num);
+  } else if (absnum < 10000000) {
+    snprintf(buf, sizeof(buf), "%" PRIi64 "K", num / 1000);
+  } else if (absnum < 10000000000LL) {
+    snprintf(buf, sizeof(buf), "%" PRIi64 "M", num / 1000000);
+  } else {
+    snprintf(buf, sizeof(buf), "%" PRIi64 "G", num / 1000000000);
+  }
+  return std::string(buf);
+}
+
+std::string BytesToHumanString(uint64_t bytes) {
+  const char* size_name[] = {"KB", "MB", "GB", "TB"};
+  double final_size = static_cast<double>(bytes);
+  size_t size_idx;
+
+  // always start with KB
+  final_size /= 1024;
+  size_idx = 0;
+
+  while (size_idx < 3 && final_size >= 1024) {
+    final_size /= 1024;
+    size_idx++;
+  }
+
+  char buf[20];
+  snprintf(buf, sizeof(buf), "%.2f %s", final_size, size_name[size_idx]);
+  return std::string(buf);
+}
+
+std::string TimeToHumanString(int unixtime) {
+  char time_buffer[80];
+  time_t rawtime = unixtime;
+  struct tm tInfo;
+  struct tm* timeinfo = port::LocalTimeR(&rawtime, &tInfo);
+  assert(timeinfo == &tInfo);
+  strftime(time_buffer, 80, "%c", timeinfo);
+  return std::string(time_buffer);
+}
+
+std::string EscapeString(const Slice& value) {
+  std::string r;
+  AppendEscapedStringTo(&r, value);
+  return r;
+}
+
+bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
+  uint64_t v = 0;
+  int digits = 0;
+  while (!in->empty()) {
+    char c = (*in)[0];
+    if (c >= '0' && c <= '9') {
+      ++digits;
+      const unsigned int delta = (c - '0');
+      static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0);
+      if (v > kMaxUint64 / 10 ||
+          (v == kMaxUint64 / 10 && delta > kMaxUint64 % 10)) {
+        // Overflow
+        return false;
+      }
+      v = (v * 10) + delta;
+      in->remove_prefix(1);
+    } else {
+      break;
+    }
+  }
+  *val = v;
+  return (digits > 0);
+}
+
+bool isSpecialChar(const char c) {
+  if (c == '\\' || c == '#' || c == ':' || c == '\r' || c == '\n') {
+    return true;
+  }
+  return false;
+}
+
+namespace {
+using CharMap = std::pair<char, char>;
+}
+
+char UnescapeChar(const char c) {
+  static const CharMap convert_map[] = {{'r', '\r'}, {'n', '\n'}};
+
+  auto iter = std::find_if(std::begin(convert_map), std::end(convert_map),
+                           [c](const CharMap& p) { return p.first == c; });
+
+  if (iter == std::end(convert_map)) {
+    return c;
+  }
+  return iter->second;
+}
+
+char EscapeChar(const char c) {
+  static const CharMap convert_map[] = {{'\n', 'n'}, {'\r', 'r'}};
+
+  auto iter = std::find_if(std::begin(convert_map), std::end(convert_map),
+                           [c](const CharMap& p) { return p.first == c; });
+
+  if (iter == std::end(convert_map)) {
+    return c;
+  }
+  return iter->second;
+}
+
+std::string EscapeOptionString(const std::string& raw_string) {
+  std::string output;
+  for (auto c : raw_string) {
+    if (isSpecialChar(c)) {
+      output += '\\';
+      output += EscapeChar(c);
+    } else {
+      output += c;
+    }
+  }
+
+  return output;
+}
+
+std::string UnescapeOptionString(const std::string& escaped_string) {
+  bool escaped = false;
+  std::string output;
+
+  for (auto c : escaped_string) {
+    if (escaped) {
+      output += UnescapeChar(c);
+      escaped = false;
+    } else {
+      if (c == '\\') {
+        escaped = true;
+        continue;
+      }
+      output += c;
+    }
+  }
+  return output;
+}
+
+std::string trim(const std::string& str) {
+  if (str.empty()) return std::string();
+  size_t start = 0;
+  size_t end = str.size() - 1;
+  while (isspace(str[start]) != 0 && start < end) {
+    ++start;
+  }
+  while (isspace(str[end]) != 0 && start < end) {
+    --end;
+  }
+  if (start <= end) {
+    return str.substr(start, end - start + 1);
+  }
+  return std::string();
+}
+
+bool EndsWith(const std::string& string, const std::string& pattern) {
+  size_t plen = pattern.size();
+  size_t slen = string.size();
+  if (plen <= slen) {
+    return string.compare(slen - plen, plen, pattern) == 0;
+  } else {
+    return false;
+  }
+}
+
+bool StartsWith(const std::string& string, const std::string& pattern) {
+  return string.compare(0, pattern.size(), pattern) == 0;
+}
+
+#ifndef ROCKSDB_LITE
+
+bool ParseBoolean(const std::string& type, const std::string& value) {
+  if (value == "true" || value == "1") {
+    return true;
+  } else if (value == "false" || value == "0") {
+    return false;
+  }
+  throw std::invalid_argument(type);
+}
+
+uint8_t ParseUint8(const std::string& value) {
+  uint64_t num = ParseUint64(value);
+  if ((num >> 8LL) == 0) {
+    return static_cast<uint8_t>(num);
+  } else {
+    throw std::out_of_range(value);
+  }
+}
+
+uint32_t ParseUint32(const std::string& value) {
+  uint64_t num = ParseUint64(value);
+  if ((num >> 32LL) == 0) {
+    return static_cast<uint32_t>(num);
+  } else {
+    throw std::out_of_range(value);
+  }
+}
+
+int32_t ParseInt32(const std::string& value) {
+  int64_t num = ParseInt64(value);
+  if (num <= std::numeric_limits<int32_t>::max() &&
+      num >= std::numeric_limits<int32_t>::min()) {
+    return static_cast<int32_t>(num);
+  } else {
+    throw std::out_of_range(value);
+  }
+}
+
+#endif
+
+uint64_t ParseUint64(const std::string& value) {
+  size_t endchar;
+#ifndef CYGWIN
+  uint64_t num = std::stoull(value.c_str(), &endchar);
+#else
+  char* endptr;
+  uint64_t num = std::strtoul(value.c_str(), &endptr, 0);
+  endchar = endptr - value.c_str();
+#endif
+
+  if (endchar < value.length()) {
+    char c = value[endchar];
+    if (c == 'k' || c == 'K')
+      num <<= 10LL;
+    else if (c == 'm' || c == 'M')
+      num <<= 20LL;
+    else if (c == 'g' || c == 'G')
+      num <<= 30LL;
+    else if (c == 't' || c == 'T')
+      num <<= 40LL;
+  }
+
+  return num;
+}
+
+int64_t ParseInt64(const std::string& value) {
+  size_t endchar;
+#ifndef CYGWIN
+  int64_t num = std::stoll(value.c_str(), &endchar);
+#else
+  char* endptr;
+  int64_t num = std::strtoll(value.c_str(), &endptr, 0);
+  endchar = endptr - value.c_str();
+#endif
+
+  if (endchar < value.length()) {
+    char c = value[endchar];
+    if (c == 'k' || c == 'K')
+      num <<= 10LL;
+    else if (c == 'm' || c == 'M')
+      num <<= 20LL;
+    else if (c == 'g' || c == 'G')
+      num <<= 30LL;
+    else if (c == 't' || c == 'T')
+      num <<= 40LL;
+  }
+
+  return num;
+}
+
+int ParseInt(const std::string& value) {
+  size_t endchar;
+#ifndef CYGWIN
+  int num = std::stoi(value.c_str(), &endchar);
+#else
+  char* endptr;
+  int num = std::strtoul(value.c_str(), &endptr, 0);
+  endchar = endptr - value.c_str();
+#endif
+
+  if (endchar < value.length()) {
+    char c = value[endchar];
+    if (c == 'k' || c == 'K')
+      num <<= 10;
+    else if (c == 'm' || c == 'M')
+      num <<= 20;
+    else if (c == 'g' || c == 'G')
+      num <<= 30;
+  }
+
+  return num;
+}
+
+double ParseDouble(const std::string& value) {
+#ifndef CYGWIN
+  return std::stod(value);
+#else
+  return std::strtod(value.c_str(), 0);
+#endif
+}
+
+size_t ParseSizeT(const std::string& value) {
+  return static_cast<size_t>(ParseUint64(value));
+}
+
+std::vector<int> ParseVectorInt(const std::string& value) {
+  std::vector<int> result;
+  size_t start = 0;
+  while (start < value.size()) {
+    size_t end = value.find(':', start);
+    if (end == std::string::npos) {
+      result.push_back(ParseInt(value.substr(start)));
+      break;
+    } else {
+      result.push_back(ParseInt(value.substr(start, end - start)));
+      start = end + 1;
+    }
+  }
+  return result;
+}
+
+bool SerializeIntVector(const std::vector<int>& vec, std::string* value) {
+  *value = "";
+  for (size_t i = 0; i < vec.size(); ++i) {
+    if (i > 0) {
+      *value += ":";
+    }
+    *value += std::to_string(vec[i]);
+  }
+  return true;
+}
+
+// Copied from folly/string.cpp:
+// https://github.com/facebook/folly/blob/0deef031cb8aab76dc7e736f8b7c22d701d5f36b/folly/String.cpp#L457
+// There are two variants of `strerror_r` function, one returns
+// `int`, and another returns `char*`. Selecting proper version using
+// preprocessor macros portably is extremely hard.
+//
+// For example, on Android function signature depends on `__USE_GNU` and
+// `__ANDROID_API__` macros (https://git.io/fjBBE).
+//
+// So we are using C++ overloading trick: we pass a pointer of
+// `strerror_r` to `invoke_strerror_r` function, and C++ compiler
+// selects proper function.
+
+#if !(defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER)))
+ROCKSDB_MAYBE_UNUSED
+static std::string invoke_strerror_r(int (*strerror_r)(int, char*, size_t),
+                                     int err, char* buf, size_t buflen) {
+  // Using XSI-compatible strerror_r
+  int r = strerror_r(err, buf, buflen);
+
+  // OSX/FreeBSD use EINVAL and Linux uses -1 so just check for non-zero
+  if (r != 0) {
+    snprintf(buf, buflen, "Unknown error %d (strerror_r failed with error %d)",
+             err, errno);
+  }
+  return buf;
+}
+
+ROCKSDB_MAYBE_UNUSED
+static std::string invoke_strerror_r(char* (*strerror_r)(int, char*, size_t),
+                                     int err, char* buf, size_t buflen) {
+  // Using GNU strerror_r
+  return strerror_r(err, buf, buflen);
+}
+#endif  // !(defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER)))
+
+std::string errnoStr(int err) {
+  char buf[1024];
+  buf[0] = '\0';
+
+  std::string result;
+
+  // https://developer.apple.com/library/mac/documentation/Darwin/Reference/ManPages/man3/strerror_r.3.html
+  // http://www.kernel.org/doc/man-pages/online/pages/man3/strerror.3.html
+#if defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER))
+  // mingw64 has no strerror_r, but Windows has strerror_s, which C11 added
+  // as well. So maybe we should use this across all platforms (together
+  // with strerrorlen_s). Note strerror_r and _s have swapped args.
+  int r = strerror_s(buf, sizeof(buf), err);
+  if (r != 0) {
+    snprintf(buf, sizeof(buf),
+             "Unknown error %d (strerror_r failed with error %d)", err, errno);
+  }
+  result.assign(buf);
+#else
+  // Using any strerror_r
+  result.assign(invoke_strerror_r(strerror_r, err, buf, sizeof(buf)));
+#endif
+
+  return result;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/string_util.h b/src/rocksdb/util/string_util.h
new file mode 100644
index 000000000..11178fd1d
--- /dev/null
+++ b/src/rocksdb/util/string_util.h
@@ -0,0 +1,177 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <cstdint>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+extern std::vector<std::string> StringSplit(const std::string& arg, char delim);
+
+// Append a human-readable printout of "num" to *str
+extern void AppendNumberTo(std::string* str, uint64_t num);
+
+// Append a human-readable printout of "value" to *str.
+// Escapes any non-printable characters found in "value".
+extern void AppendEscapedStringTo(std::string* str, const Slice& value);
+
+// Put n digits from v in base kBase to (*buf)[0] to (*buf)[n-1] and
+// advance *buf to the position after what was written.
+template <size_t kBase>
+inline void PutBaseChars(char** buf, size_t n, uint64_t v, bool uppercase) {
+  const char* digitChars = uppercase ? "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                     : "0123456789abcdefghijklmnopqrstuvwxyz";
+  for (size_t i = n; i > 0; --i) {
+    (*buf)[i - 1] = digitChars[static_cast<size_t>(v % kBase)];
+    v /= kBase;
+  }
+  *buf += n;
+}
+
+// Parse n digits from *buf in base kBase to *v and advance *buf to the
+// position after what was read. On success, true is returned. On failure,
+// false is returned, *buf is placed at the first bad character, and *v
+// contains the partial parsed data. Overflow is not checked but the
+// result is accurate mod 2^64. Requires the starting value of *v to be
+// zero or previously accumulated parsed digits, i.e.
+//   ParseBaseChars(&b, n, &v);
+// is equivalent to n calls to
+//   ParseBaseChars(&b, 1, &v);
+template <int kBase>
+inline bool ParseBaseChars(const char** buf, size_t n, uint64_t* v) {
+  while (n) {
+    char c = **buf;
+    *v *= static_cast<uint64_t>(kBase);
+    if (c >= '0' && (kBase >= 10 ? c <= '9' : c < '0' + kBase)) {
+      *v += static_cast<uint64_t>(c - '0');
+    } else if (kBase > 10 && c >= 'A' && c < 'A' + kBase - 10) {
+      *v += static_cast<uint64_t>(c - 'A' + 10);
+    } else if (kBase > 10 && c >= 'a' && c < 'a' + kBase - 10) {
+      *v += static_cast<uint64_t>(c - 'a' + 10);
+    } else {
+      return false;
+    }
+    --n;
+    ++*buf;
+  }
+  return true;
+}
+
+// Return a human-readable version of num.
+// for num >= 10.000, prints "xxK"
+// for num >= 10.000.000, prints "xxM"
+// for num >= 10.000.000.000, prints "xxG"
+extern std::string NumberToHumanString(int64_t num);
+
+// Return a human-readable version of bytes
+// ex: 1048576 -> 1.00 GB
+extern std::string BytesToHumanString(uint64_t bytes);
+
+// Return a human-readable version of unix time
+// ex: 1562116015 -> "Tue Jul  2 18:06:55 2019"
+extern std::string TimeToHumanString(int unixtime);
+
+// Append a human-readable time in micros.
+int AppendHumanMicros(uint64_t micros, char* output, int len,
+                      bool fixed_format);
+
+// Append a human-readable size in bytes
+int AppendHumanBytes(uint64_t bytes, char* output, int len);
+
+// Return a human-readable version of "value".
+// Escapes any non-printable characters found in "value".
+extern std::string EscapeString(const Slice& value);
+
+// Parse a human-readable number from "*in" into *value.  On success,
+// advances "*in" past the consumed number and sets "*val" to the
+// numeric value.  Otherwise, returns false and leaves *in in an
+// unspecified state.
+extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val);
+
+// Returns true if the input char "c" is considered as a special character
+// that will be escaped when EscapeOptionString() is called.
+//
+// @param c the input char
+// @return true if the input char "c" is considered as a special character.
+// @see EscapeOptionString
+bool isSpecialChar(const char c);
+
+// If the input char is an escaped char, it will return the its
+// associated raw-char.  Otherwise, the function will simply return
+// the original input char.
+char UnescapeChar(const char c);
+
+// If the input char is a control char, it will return the its
+// associated escaped char.  Otherwise, the function will simply return
+// the original input char.
+char EscapeChar(const char c);
+
+// Converts a raw string to an escaped string.  Escaped-characters are
+// defined via the isSpecialChar() function.  When a char in the input
+// string "raw_string" is classified as a special characters, then it
+// will be prefixed by '\' in the output.
+//
+// It's inverse function is UnescapeOptionString().
+// @param raw_string the input string
+// @return the '\' escaped string of the input "raw_string"
+// @see isSpecialChar, UnescapeOptionString
+std::string EscapeOptionString(const std::string& raw_string);
+
+// The inverse function of EscapeOptionString.  It converts
+// an '\' escaped string back to a raw string.
+//
+// @param escaped_string the input '\' escaped string
+// @return the raw string of the input "escaped_string"
+std::string UnescapeOptionString(const std::string& escaped_string);
+
+std::string trim(const std::string& str);
+
+// Returns true if "string" ends with "pattern"
+bool EndsWith(const std::string& string, const std::string& pattern);
+
+// Returns true if "string" starts with "pattern"
+bool StartsWith(const std::string& string, const std::string& pattern);
+
+#ifndef ROCKSDB_LITE
+bool ParseBoolean(const std::string& type, const std::string& value);
+
+uint8_t ParseUint8(const std::string& value);
+
+uint32_t ParseUint32(const std::string& value);
+
+int32_t ParseInt32(const std::string& value);
+#endif
+
+uint64_t ParseUint64(const std::string& value);
+
+int ParseInt(const std::string& value);
+
+int64_t ParseInt64(const std::string& value);
+
+double ParseDouble(const std::string& value);
+
+size_t ParseSizeT(const std::string& value);
+
+std::vector<int> ParseVectorInt(const std::string& value);
+
+bool SerializeIntVector(const std::vector<int>& vec, std::string* value);
+
+extern const std::string kNullptrString;
+
+// errnoStr() function returns a string that describes the error code passed in
+// the argument err
+extern std::string errnoStr(int err);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/thread_guard.h b/src/rocksdb/util/thread_guard.h
new file mode 100644
index 000000000..b2bb06a1b
--- /dev/null
+++ b/src/rocksdb/util/thread_guard.h
@@ -0,0 +1,41 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "port/port.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Resource management object for threads that joins the thread upon
+// destruction. Has unique ownership of the thread object, so copying it is not
+// allowed, while moving it transfers ownership.
+class ThreadGuard {
+ public:
+  ThreadGuard() = default;
+
+  explicit ThreadGuard(port::Thread&& thread) : thread_(std::move(thread)) {}
+
+  ThreadGuard(const ThreadGuard&) = delete;
+  ThreadGuard& operator=(const ThreadGuard&) = delete;
+
+  ThreadGuard(ThreadGuard&&) noexcept = default;
+  ThreadGuard& operator=(ThreadGuard&&) noexcept = default;
+
+  ~ThreadGuard() {
+    if (thread_.joinable()) {
+      thread_.join();
+    }
+  }
+
+  const port::Thread& GetThread() const { return thread_; }
+  port::Thread& GetThread() { return thread_; }
+
+ private:
+  port::Thread thread_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/thread_list_test.cc b/src/rocksdb/util/thread_list_test.cc
new file mode 100644
index 000000000..af4e62355
--- /dev/null
+++ b/src/rocksdb/util/thread_list_test.cc
@@ -0,0 +1,360 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <condition_variable>
+#include <mutex>
+
+#include "monitoring/thread_status_updater.h"
+#include "rocksdb/db.h"
+#include "test_util/testharness.h"
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+namespace ROCKSDB_NAMESPACE {
+
+class SimulatedBackgroundTask {
+ public:
+  SimulatedBackgroundTask(
+      const void* db_key, const std::string& db_name, const void* cf_key,
+      const std::string& cf_name,
+      const ThreadStatus::OperationType operation_type =
+          ThreadStatus::OP_UNKNOWN,
+      const ThreadStatus::StateType state_type = ThreadStatus::STATE_UNKNOWN)
+      : db_key_(db_key),
+        db_name_(db_name),
+        cf_key_(cf_key),
+        cf_name_(cf_name),
+        operation_type_(operation_type),
+        state_type_(state_type),
+        should_run_(true),
+        running_count_(0) {
+    Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo(
+        db_key_, db_name_, cf_key_, cf_name_);
+  }
+
+  ~SimulatedBackgroundTask() {
+    Env::Default()->GetThreadStatusUpdater()->EraseDatabaseInfo(db_key_);
+  }
+
+  void Run() {
+    std::unique_lock<std::mutex> l(mutex_);
+    running_count_++;
+    bg_cv_.notify_all();
+    Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_);
+    Env::Default()->GetThreadStatusUpdater()->SetThreadOperation(
+        operation_type_);
+    Env::Default()->GetThreadStatusUpdater()->SetThreadState(state_type_);
+    while (should_run_) {
+      bg_cv_.wait(l);
+    }
+    Env::Default()->GetThreadStatusUpdater()->ClearThreadState();
+    Env::Default()->GetThreadStatusUpdater()->ClearThreadOperation();
+    Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(nullptr);
+    running_count_--;
+    bg_cv_.notify_all();
+  }
+
+  void FinishAllTasks() {
+    std::unique_lock<std::mutex> l(mutex_);
+    should_run_ = false;
+    bg_cv_.notify_all();
+  }
+
+  void WaitUntilScheduled(int job_count) {
+    std::unique_lock<std::mutex> l(mutex_);
+    while (running_count_ < job_count) {
+      bg_cv_.wait(l);
+    }
+  }
+
+  void WaitUntilDone() {
+    std::unique_lock<std::mutex> l(mutex_);
+    while (running_count_ > 0) {
+      bg_cv_.wait(l);
+    }
+  }
+
+  static void DoSimulatedTask(void* arg) {
+    reinterpret_cast<SimulatedBackgroundTask*>(arg)->Run();
+  }
+
+ private:
+  const void* db_key_;
+  const std::string db_name_;
+  const void* cf_key_;
+  const std::string cf_name_;
+  const ThreadStatus::OperationType operation_type_;
+  const ThreadStatus::StateType state_type_;
+  std::mutex mutex_;
+  std::condition_variable bg_cv_;
+  bool should_run_;
+  std::atomic<int> running_count_;
+};
+
+class ThreadListTest : public testing::Test {
+ public:
+  ThreadListTest() {}
+};
+
+TEST_F(ThreadListTest, GlobalTables) {
+  // verify the global tables for operations and states are properly indexed.
+  for (int type = 0; type != ThreadStatus::NUM_OP_TYPES; ++type) {
+    ASSERT_EQ(global_operation_table[type].type, type);
+    ASSERT_EQ(
+        global_operation_table[type].name,
+        ThreadStatus::GetOperationName(ThreadStatus::OperationType(type)));
+  }
+
+  for (int type = 0; type != ThreadStatus::NUM_STATE_TYPES; ++type) {
+    ASSERT_EQ(global_state_table[type].type, type);
+    ASSERT_EQ(global_state_table[type].name,
+              ThreadStatus::GetStateName(ThreadStatus::StateType(type)));
+  }
+
+  for (int stage = 0; stage != ThreadStatus::NUM_OP_STAGES; ++stage) {
+    ASSERT_EQ(global_op_stage_table[stage].stage, stage);
+    ASSERT_EQ(global_op_stage_table[stage].name,
+              ThreadStatus::GetOperationStageName(
+                  ThreadStatus::OperationStage(stage)));
+  }
+}
+
+TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) {
+  Env* env = Env::Default();
+  const int kHighPriorityThreads = 3;
+  const int kLowPriorityThreads = 5;
+  const int kSimulatedHighPriThreads = kHighPriorityThreads - 1;
+  const int kSimulatedLowPriThreads = kLowPriorityThreads / 3;
+  const int kDelayMicros = 1000000;
+  env->SetBackgroundThreads(kHighPriorityThreads, Env::HIGH);
+  env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW);
+  // Wait 1 second so that threads start
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  SimulatedBackgroundTask running_task(reinterpret_cast<void*>(1234), "running",
+                                       reinterpret_cast<void*>(5678),
+                                       "pikachu");
+
+  for (int test = 0; test < kSimulatedHighPriThreads; ++test) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &running_task,
+                  Env::Priority::HIGH);
+  }
+
+  for (int test = 0; test < kSimulatedLowPriThreads; ++test) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &running_task,
+                  Env::Priority::LOW);
+  }
+  running_task.WaitUntilScheduled(kSimulatedHighPriThreads +
+                                  kSimulatedLowPriThreads);
+  // We can only reserve limited number of waiting threads
+  ASSERT_EQ(kHighPriorityThreads - kSimulatedHighPriThreads,
+            env->ReserveThreads(kHighPriorityThreads, Env::Priority::HIGH));
+  ASSERT_EQ(kLowPriorityThreads - kSimulatedLowPriThreads,
+            env->ReserveThreads(kLowPriorityThreads, Env::Priority::LOW));
+
+  // Reservation shall not affect the existing thread list
+  std::vector<ThreadStatus> thread_list;
+
+  // Verify the number of running threads in each pool.
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  int running_count[ThreadStatus::NUM_THREAD_TYPES] = {0};
+  for (auto thread_status : thread_list) {
+    if (thread_status.cf_name == "pikachu" &&
+        thread_status.db_name == "running") {
+      running_count[thread_status.thread_type]++;
+    }
+  }
+  // Cannot reserve more threads
+  ASSERT_EQ(0, env->ReserveThreads(kHighPriorityThreads, Env::Priority::HIGH));
+  ASSERT_EQ(0, env->ReserveThreads(kLowPriorityThreads, Env::Priority::LOW));
+
+  ASSERT_EQ(running_count[ThreadStatus::HIGH_PRIORITY],
+            kSimulatedHighPriThreads);
+  ASSERT_EQ(running_count[ThreadStatus::LOW_PRIORITY], kSimulatedLowPriThreads);
+  ASSERT_EQ(running_count[ThreadStatus::USER], 0);
+
+  running_task.FinishAllTasks();
+  running_task.WaitUntilDone();
+
+  ASSERT_EQ(kHighPriorityThreads - kSimulatedHighPriThreads,
+            env->ReleaseThreads(kHighPriorityThreads, Env::Priority::HIGH));
+  ASSERT_EQ(kLowPriorityThreads - kSimulatedLowPriThreads,
+            env->ReleaseThreads(kLowPriorityThreads, Env::Priority::LOW));
+  // Verify none of the threads are running
+  ASSERT_OK(env->GetThreadList(&thread_list));
+
+  for (int i = 0; i < ThreadStatus::NUM_THREAD_TYPES; ++i) {
+    running_count[i] = 0;
+  }
+  for (auto thread_status : thread_list) {
+    if (thread_status.cf_name == "pikachu" &&
+        thread_status.db_name == "running") {
+      running_count[thread_status.thread_type]++;
+    }
+  }
+
+  ASSERT_EQ(running_count[ThreadStatus::HIGH_PRIORITY], 0);
+  ASSERT_EQ(running_count[ThreadStatus::LOW_PRIORITY], 0);
+  ASSERT_EQ(running_count[ThreadStatus::USER], 0);
+}
+
+namespace {
+void UpdateStatusCounts(const std::vector<ThreadStatus>& thread_list,
+                        int operation_counts[], int state_counts[]) {
+  for (auto thread_status : thread_list) {
+    operation_counts[thread_status.operation_type]++;
+    state_counts[thread_status.state_type]++;
+  }
+}
+
+void VerifyAndResetCounts(const int correct_counts[], int collected_counts[],
+                          int size) {
+  for (int i = 0; i < size; ++i) {
+    ASSERT_EQ(collected_counts[i], correct_counts[i]);
+    collected_counts[i] = 0;
+  }
+}
+
+void UpdateCount(int operation_counts[], int from_event, int to_event,
+                 int amount) {
+  operation_counts[from_event] -= amount;
+  operation_counts[to_event] += amount;
+}
+}  // namespace
+
+TEST_F(ThreadListTest, SimpleEventTest) {
+  Env* env = Env::Default();
+
+  // simulated tasks
+  const int kFlushWriteTasks = 3;
+  SimulatedBackgroundTask flush_write_task(
+      reinterpret_cast<void*>(1234), "running", reinterpret_cast<void*>(5678),
+      "pikachu", ThreadStatus::OP_FLUSH);
+
+  const int kCompactionWriteTasks = 4;
+  SimulatedBackgroundTask compaction_write_task(
+      reinterpret_cast<void*>(1234), "running", reinterpret_cast<void*>(5678),
+      "pikachu", ThreadStatus::OP_COMPACTION);
+
+  const int kCompactionReadTasks = 5;
+  SimulatedBackgroundTask compaction_read_task(
+      reinterpret_cast<void*>(1234), "running", reinterpret_cast<void*>(5678),
+      "pikachu", ThreadStatus::OP_COMPACTION);
+
+  const int kCompactionWaitTasks = 6;
+  SimulatedBackgroundTask compaction_wait_task(
+      reinterpret_cast<void*>(1234), "running", reinterpret_cast<void*>(5678),
+      "pikachu", ThreadStatus::OP_COMPACTION);
+
+  // setup right answers
+  int correct_operation_counts[ThreadStatus::NUM_OP_TYPES] = {0};
+  correct_operation_counts[ThreadStatus::OP_FLUSH] = kFlushWriteTasks;
+  correct_operation_counts[ThreadStatus::OP_COMPACTION] =
+      kCompactionWriteTasks + kCompactionReadTasks + kCompactionWaitTasks;
+
+  env->SetBackgroundThreads(correct_operation_counts[ThreadStatus::OP_FLUSH],
+                            Env::HIGH);
+  env->SetBackgroundThreads(
+      correct_operation_counts[ThreadStatus::OP_COMPACTION], Env::LOW);
+
+  // schedule the simulated tasks
+  for (int t = 0; t < kFlushWriteTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &flush_write_task,
+                  Env::Priority::HIGH);
+  }
+  flush_write_task.WaitUntilScheduled(kFlushWriteTasks);
+
+  for (int t = 0; t < kCompactionWriteTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+                  &compaction_write_task, Env::Priority::LOW);
+  }
+  compaction_write_task.WaitUntilScheduled(kCompactionWriteTasks);
+
+  for (int t = 0; t < kCompactionReadTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+                  &compaction_read_task, Env::Priority::LOW);
+  }
+  compaction_read_task.WaitUntilScheduled(kCompactionReadTasks);
+
+  for (int t = 0; t < kCompactionWaitTasks; ++t) {
+    env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask,
+                  &compaction_wait_task, Env::Priority::LOW);
+  }
+  compaction_wait_task.WaitUntilScheduled(kCompactionWaitTasks);
+
+  // verify the thread-status
+  int operation_counts[ThreadStatus::NUM_OP_TYPES] = {0};
+  int state_counts[ThreadStatus::NUM_STATE_TYPES] = {0};
+
+  std::vector<ThreadStatus> thread_list;
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-wait tasks and see if the thread-status
+  // reflects this update
+  compaction_wait_task.FinishAllTasks();
+  compaction_wait_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionWaitTasks);
+
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate flush-write tasks and see if the thread-status
+  // reflects this update
+  flush_write_task.FinishAllTasks();
+  flush_write_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_FLUSH,
+              ThreadStatus::OP_UNKNOWN, kFlushWriteTasks);
+
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-write tasks and see if the thread-status
+  // reflects this update
+  compaction_write_task.FinishAllTasks();
+  compaction_write_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionWriteTasks);
+
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+
+  // terminate compaction-write tasks and see if the thread-status
+  // reflects this update
+  compaction_read_task.FinishAllTasks();
+  compaction_read_task.WaitUntilDone();
+  UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION,
+              ThreadStatus::OP_UNKNOWN, kCompactionReadTasks);
+
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  UpdateStatusCounts(thread_list, operation_counts, state_counts);
+  VerifyAndResetCounts(correct_operation_counts, operation_counts,
+                       ThreadStatus::NUM_OP_TYPES);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return 0;
+}
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
diff --git a/src/rocksdb/util/thread_local.cc b/src/rocksdb/util/thread_local.cc
new file mode 100644
index 000000000..969639d9b
--- /dev/null
+++ b/src/rocksdb/util/thread_local.cc
@@ -0,0 +1,521 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/thread_local.h"
+
+#include <stdlib.h>
+
+#include "port/likely.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct Entry {
+  Entry() : ptr(nullptr) {}
+  Entry(const Entry& e) : ptr(e.ptr.load(std::memory_order_relaxed)) {}
+  std::atomic<void*> ptr;
+};
+
+class StaticMeta;
+
+// This is the structure that is declared as "thread_local" storage.
+// The vector keep list of atomic pointer for all instances for "current"
+// thread. The vector is indexed by an Id that is unique in process and
+// associated with one ThreadLocalPtr instance. The Id is assigned by a
+// global StaticMeta singleton. So if we instantiated 3 ThreadLocalPtr
+// instances, each thread will have a ThreadData with a vector of size 3:
+//     ---------------------------------------------------
+//     |          | instance 1 | instance 2 | instance 3 |
+//     ---------------------------------------------------
+//     | thread 1 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+//     | thread 2 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+//     | thread 3 |    void*   |    void*   |    void*   | <- ThreadData
+//     ---------------------------------------------------
+struct ThreadData {
+  explicit ThreadData(ThreadLocalPtr::StaticMeta* _inst)
+      : entries(), next(nullptr), prev(nullptr), inst(_inst) {}
+  std::vector<Entry> entries;
+  ThreadData* next;
+  ThreadData* prev;
+  ThreadLocalPtr::StaticMeta* inst;
+};
+
+class ThreadLocalPtr::StaticMeta {
+ public:
+  StaticMeta();
+
+  // Return the next available Id
+  uint32_t GetId();
+  // Return the next available Id without claiming it
+  uint32_t PeekId() const;
+  // Return the given Id back to the free pool. This also triggers
+  // UnrefHandler for associated pointer value (if not NULL) for all threads.
+  void ReclaimId(uint32_t id);
+
+  // Return the pointer value for the given id for the current thread.
+  void* Get(uint32_t id) const;
+  // Reset the pointer value for the given id for the current thread.
+  void Reset(uint32_t id, void* ptr);
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(uint32_t id, void* ptr);
+  // Atomically compare and swap the provided value only if it equals
+  // to expected value.
+  bool CompareAndSwap(uint32_t id, void* ptr, void*& expected);
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(uint32_t id, autovector<void*>* ptrs, void* const replacement);
+  // Update res by applying func on each thread-local value. Holds a lock that
+  // prevents unref handler from running during this call, but clients must
+  // still provide external synchronization since the owning thread can
+  // access the values without internal locking, e.g., via Get() and Reset().
+  void Fold(uint32_t id, FoldFunc func, void* res);
+
+  // Register the UnrefHandler for id
+  void SetHandler(uint32_t id, UnrefHandler handler);
+
+  // protect inst, next_instance_id_, free_instance_ids_, head_,
+  // ThreadData.entries
+  //
+  // Note that here we prefer function static variable instead of the usual
+  // global static variable.  The reason is that c++ destruction order of
+  // static variables in the reverse order of their construction order.
+  // However, C++ does not guarantee any construction order when global
+  // static variables are defined in different files, while the function
+  // static variables are initialized when their function are first called.
+  // As a result, the construction order of the function static variables
+  // can be controlled by properly invoke their first function calls in
+  // the right order.
+  //
+  // For instance, the following function contains a function static
+  // variable.  We place a dummy function call of this inside
+  // Env::Default() to ensure the construction order of the construction
+  // order.
+  static port::Mutex* Mutex();
+
+  // Returns the member mutex of the current StaticMeta.  In general,
+  // Mutex() should be used instead of this one.  However, in case where
+  // the static variable inside Instance() goes out of scope, MemberMutex()
+  // should be used.  One example is OnThreadExit() function.
+  port::Mutex* MemberMutex() { return &mutex_; }
+
+ private:
+  // Get UnrefHandler for id with acquiring mutex
+  // REQUIRES: mutex locked
+  UnrefHandler GetHandler(uint32_t id);
+
+  // Triggered before a thread terminates
+  static void OnThreadExit(void* ptr);
+
+  // Add current thread's ThreadData to the global chain
+  // REQUIRES: mutex locked
+  void AddThreadData(ThreadData* d);
+
+  // Remove current thread's ThreadData from the global chain
+  // REQUIRES: mutex locked
+  void RemoveThreadData(ThreadData* d);
+
+  static ThreadData* GetThreadLocal();
+
+  uint32_t next_instance_id_;
+  // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed
+  // frequently. This also prevents it from blowing up the vector space.
+  autovector<uint32_t> free_instance_ids_;
+  // Chain all thread local structure together. This is necessary since
+  // when one ThreadLocalPtr gets destroyed, we need to loop over each
+  // thread's version of pointer corresponding to that instance and
+  // call UnrefHandler for it.
+  ThreadData head_;
+
+  std::unordered_map<uint32_t, UnrefHandler> handler_map_;
+
+  // The private mutex.  Developers should always use Mutex() instead of
+  // using this variable directly.
+  port::Mutex mutex_;
+  // Thread local storage
+  static thread_local ThreadData* tls_;
+
+  // Used to make thread exit trigger possible if !defined(OS_MACOSX).
+  // Otherwise, used to retrieve thread data.
+  pthread_key_t pthread_key_;
+};
+
+thread_local ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr;
+
+// Windows doesn't support a per-thread destructor with its
+// TLS primitives.  So, we build it manually by inserting a
+// function to be called on each thread's exit.
+// See http://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
+// and http://www.nynaeve.net/?p=183
+//
+// really we do this to have clear conscience since using TLS with thread-pools
+// is iffy
+// although OK within a request. But otherwise, threads have no identity in its
+// modern use.
+
+// This runs on windows only called from the System Loader
+#ifdef OS_WIN
+
+// Windows cleanup routine is invoked from a System Loader with a different
+// signature so we can not directly hookup the original OnThreadExit which is
+// private member
+// so we make StaticMeta class share with the us the address of the function so
+// we can invoke it.
+namespace wintlscleanup {
+
+// This is set to OnThreadExit in StaticMeta singleton constructor
+UnrefHandler thread_local_inclass_routine = nullptr;
+pthread_key_t thread_local_key = pthread_key_t(-1);
+
+// Static callback function to call with each thread termination.
+void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) {
+  // We decided to punt on PROCESS_EXIT
+  if (DLL_THREAD_DETACH == reason) {
+    if (thread_local_key != pthread_key_t(-1) &&
+        thread_local_inclass_routine != nullptr) {
+      void* tls = TlsGetValue(thread_local_key);
+      if (tls != nullptr) {
+        thread_local_inclass_routine(tls);
+      }
+    }
+  }
+}
+
+}  // namespace wintlscleanup
+
+// extern "C" suppresses C++ name mangling so we know the symbol name for the
+// linker /INCLUDE:symbol pragma above.
+extern "C" {
+
+#ifdef _MSC_VER
+// The linker must not discard thread_callback_on_exit.  (We force a reference
+// to this variable with a linker /include:symbol pragma to ensure that.) If
+// this variable is discarded, the OnThreadExit function will never be called.
+#ifndef _X86_
+
+// .CRT section is merged with .rdata on x64 so it must be constant data.
+#pragma const_seg(".CRT$XLB")
+// When defining a const variable, it must have external linkage to be sure the
+// linker doesn't discard it.
+extern const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit;
+const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit =
+    wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma const_seg()
+
+#pragma comment(linker, "/include:_tls_used")
+#pragma comment(linker, "/include:p_thread_callback_on_exit")
+
+#else  // _X86_
+
+#pragma data_seg(".CRT$XLB")
+PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma data_seg()
+
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_on_exit")
+
+#endif  // _X86_
+
+#else
+// https://github.com/couchbase/gperftools/blob/master/src/windows/port.cc
+BOOL WINAPI DllMain(HINSTANCE h, DWORD dwReason, PVOID pv) {
+  if (dwReason == DLL_THREAD_DETACH)
+    wintlscleanup::WinOnThreadExit(h, dwReason, pv);
+  return TRUE;
+}
+#endif
+}  // extern "C"
+
+#endif  // OS_WIN
+
+void ThreadLocalPtr::InitSingletons() { ThreadLocalPtr::Instance(); }
+
+ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() {
+  // Here we prefer function static variable instead of global
+  // static variable as function static variable is initialized
+  // when the function is first call.  As a result, we can properly
+  // control their construction order by properly preparing their
+  // first function call.
+  //
+  // Note that here we decide to make "inst" a static pointer w/o deleting
+  // it at the end instead of a static variable.  This is to avoid the following
+  // destruction order disaster happens when a child thread using ThreadLocalPtr
+  // dies AFTER the main thread dies:  When a child thread happens to use
+  // ThreadLocalPtr, it will try to delete its thread-local data on its
+  // OnThreadExit when the child thread dies.  However, OnThreadExit depends
+  // on the following variable.  As a result, if the main thread dies before any
+  // child thread happen to use ThreadLocalPtr dies, then the destruction of
+  // the following variable will go first, then OnThreadExit, therefore causing
+  // invalid access.
+  //
+  // The above problem can be solved by using thread_local to store tls_.
+  // thread_local supports dynamic construction and destruction of
+  // non-primitive typed variables.  As a result, we can guarantee the
+  // destruction order even when the main thread dies before any child threads.
+  static ThreadLocalPtr::StaticMeta* inst = new ThreadLocalPtr::StaticMeta();
+  return inst;
+}
+
+port::Mutex* ThreadLocalPtr::StaticMeta::Mutex() { return &Instance()->mutex_; }
+
+void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
+  auto* tls = static_cast<ThreadData*>(ptr);
+  assert(tls != nullptr);
+
+  // Use the cached StaticMeta::Instance() instead of directly calling
+  // the variable inside StaticMeta::Instance() might already go out of
+  // scope here in case this OnThreadExit is called after the main thread
+  // dies.
+  auto* inst = tls->inst;
+  pthread_setspecific(inst->pthread_key_, nullptr);
+
+  MutexLock l(inst->MemberMutex());
+  inst->RemoveThreadData(tls);
+  // Unref stored pointers of current thread from all instances
+  uint32_t id = 0;
+  for (auto& e : tls->entries) {
+    void* raw = e.ptr.load();
+    if (raw != nullptr) {
+      auto unref = inst->GetHandler(id);
+      if (unref != nullptr) {
+        unref(raw);
+      }
+    }
+    ++id;
+  }
+  // Delete thread local structure no matter if it is Mac platform
+  delete tls;
+}
+
+ThreadLocalPtr::StaticMeta::StaticMeta()
+    : next_instance_id_(0), head_(this), pthread_key_(0) {
+  if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
+    abort();
+  }
+
+  // OnThreadExit is not getting called on the main thread.
+  // Call through the static destructor mechanism to avoid memory leak.
+  //
+  // Caveats: ~A() will be invoked _after_ ~StaticMeta for the global
+  // singleton (destructors are invoked in reverse order of constructor
+  // _completion_); the latter must not mutate internal members. This
+  // cleanup mechanism inherently relies on use-after-release of the
+  // StaticMeta, and is brittle with respect to compiler-specific handling
+  // of memory backing destructed statically-scoped objects. Perhaps
+  // registering with atexit(3) would be more robust.
+  //
+// This is not required on Windows.
+#if !defined(OS_WIN)
+  static struct A {
+    ~A() {
+      if (tls_) {
+        OnThreadExit(tls_);
+      }
+    }
+  } a;
+#endif  // !defined(OS_WIN)
+
+  head_.next = &head_;
+  head_.prev = &head_;
+
+#ifdef OS_WIN
+  // Share with Windows its cleanup routine and the key
+  wintlscleanup::thread_local_inclass_routine = OnThreadExit;
+  wintlscleanup::thread_local_key = pthread_key_;
+#endif
+}
+
+void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadData* d) {
+  Mutex()->AssertHeld();
+  d->next = &head_;
+  d->prev = head_.prev;
+  head_.prev->next = d;
+  head_.prev = d;
+}
+
+void ThreadLocalPtr::StaticMeta::RemoveThreadData(ThreadData* d) {
+  Mutex()->AssertHeld();
+  d->next->prev = d->prev;
+  d->prev->next = d->next;
+  d->next = d->prev = d;
+}
+
+ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
+  if (UNLIKELY(tls_ == nullptr)) {
+    auto* inst = Instance();
+    tls_ = new ThreadData(inst);
+    {
+      // Register it in the global chain, needs to be done before thread exit
+      // handler registration
+      MutexLock l(Mutex());
+      inst->AddThreadData(tls_);
+    }
+    // Even it is not OS_MACOSX, need to register value for pthread_key_ so that
+    // its exit handler will be triggered.
+    if (pthread_setspecific(inst->pthread_key_, tls_) != 0) {
+      {
+        MutexLock l(Mutex());
+        inst->RemoveThreadData(tls_);
+      }
+      delete tls_;
+      abort();
+    }
+  }
+  return tls_;
+}
+
+void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    return nullptr;
+  }
+  return tls->entries[id].ptr.load(std::memory_order_acquire);
+}
+
+void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  tls->entries[id].ptr.store(ptr, std::memory_order_release);
+}
+
+void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.exchange(ptr, std::memory_order_acquire);
+}
+
+bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr,
+                                                void*& expected) {
+  auto* tls = GetThreadLocal();
+  if (UNLIKELY(id >= tls->entries.size())) {
+    // Need mutex to protect entries access within ReclaimId
+    MutexLock l(Mutex());
+    tls->entries.resize(id + 1);
+  }
+  return tls->entries[id].ptr.compare_exchange_strong(
+      expected, ptr, std::memory_order_release, std::memory_order_relaxed);
+}
+
+void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector<void*>* ptrs,
+                                        void* const replacement) {
+  MutexLock l(Mutex());
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr =
+          t->entries[id].ptr.exchange(replacement, std::memory_order_acquire);
+      if (ptr != nullptr) {
+        ptrs->push_back(ptr);
+      }
+    }
+  }
+}
+
+void ThreadLocalPtr::StaticMeta::Fold(uint32_t id, FoldFunc func, void* res) {
+  MutexLock l(Mutex());
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr = t->entries[id].ptr.load();
+      if (ptr != nullptr) {
+        func(ptr, res);
+      }
+    }
+  }
+}
+
+uint32_t ThreadLocalPtr::TEST_PeekId() { return Instance()->PeekId(); }
+
+void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) {
+  MutexLock l(Mutex());
+  handler_map_[id] = handler;
+}
+
+UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) {
+  Mutex()->AssertHeld();
+  auto iter = handler_map_.find(id);
+  if (iter == handler_map_.end()) {
+    return nullptr;
+  }
+  return iter->second;
+}
+
+uint32_t ThreadLocalPtr::StaticMeta::GetId() {
+  MutexLock l(Mutex());
+  if (free_instance_ids_.empty()) {
+    return next_instance_id_++;
+  }
+
+  uint32_t id = free_instance_ids_.back();
+  free_instance_ids_.pop_back();
+  return id;
+}
+
+uint32_t ThreadLocalPtr::StaticMeta::PeekId() const {
+  MutexLock l(Mutex());
+  if (!free_instance_ids_.empty()) {
+    return free_instance_ids_.back();
+  }
+  return next_instance_id_;
+}
+
+void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) {
+  // This id is not used, go through all thread local data and release
+  // corresponding value
+  MutexLock l(Mutex());
+  auto unref = GetHandler(id);
+  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
+    if (id < t->entries.size()) {
+      void* ptr = t->entries[id].ptr.exchange(nullptr);
+      if (ptr != nullptr && unref != nullptr) {
+        unref(ptr);
+      }
+    }
+  }
+  handler_map_[id] = nullptr;
+  free_instance_ids_.push_back(id);
+}
+
+ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler)
+    : id_(Instance()->GetId()) {
+  if (handler != nullptr) {
+    Instance()->SetHandler(id_, handler);
+  }
+}
+
+ThreadLocalPtr::~ThreadLocalPtr() { Instance()->ReclaimId(id_); }
+
+void* ThreadLocalPtr::Get() const { return Instance()->Get(id_); }
+
+void ThreadLocalPtr::Reset(void* ptr) { Instance()->Reset(id_, ptr); }
+
+void* ThreadLocalPtr::Swap(void* ptr) { return Instance()->Swap(id_, ptr); }
+
+bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) {
+  return Instance()->CompareAndSwap(id_, ptr, expected);
+}
+
+void ThreadLocalPtr::Scrape(autovector<void*>* ptrs, void* const replacement) {
+  Instance()->Scrape(id_, ptrs, replacement);
+}
+
+void ThreadLocalPtr::Fold(FoldFunc func, void* res) {
+  Instance()->Fold(id_, func, res);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/thread_local.h b/src/rocksdb/util/thread_local.h
new file mode 100644
index 000000000..fde68f86f
--- /dev/null
+++ b/src/rocksdb/util/thread_local.h
@@ -0,0 +1,100 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Cleanup function that will be called for a stored thread local
+// pointer (if not NULL) when one of the following happens:
+// (1) a thread terminates
+// (2) a ThreadLocalPtr is destroyed
+//
+// Warning: this function is called while holding a global mutex. The same mutex
+// is used (at least in some cases) by most methods of ThreadLocalPtr, and it's
+// shared across all instances of ThreadLocalPtr. Thereforere extra care
+// is needed to avoid deadlocks. In particular, the handler shouldn't lock any
+// mutexes and shouldn't call any methods of any ThreadLocalPtr instances,
+// unless you know what you're doing.
+using UnrefHandler = void (*)(void* ptr);
+
+// ThreadLocalPtr stores only values of pointer type.  Different from
+// the usual thread-local-storage, ThreadLocalPtr has the ability to
+// distinguish data coming from different threads and different
+// ThreadLocalPtr instances.  For example, if a regular thread_local
+// variable A is declared in DBImpl, two DBImpl objects would share
+// the same A.  However, a ThreadLocalPtr that is defined under the
+// scope of DBImpl can avoid such confliction.  As a result, its memory
+// usage would be O(# of threads * # of ThreadLocalPtr instances).
+class ThreadLocalPtr {
+ public:
+  explicit ThreadLocalPtr(UnrefHandler handler = nullptr);
+
+  ThreadLocalPtr(const ThreadLocalPtr&) = delete;
+  ThreadLocalPtr& operator=(const ThreadLocalPtr&) = delete;
+
+  ~ThreadLocalPtr();
+
+  // Return the current pointer stored in thread local
+  void* Get() const;
+
+  // Set a new pointer value to the thread local storage.
+  void Reset(void* ptr);
+
+  // Atomically swap the supplied ptr and return the previous value
+  void* Swap(void* ptr);
+
+  // Atomically compare the stored value with expected. Set the new
+  // pointer value to thread local only if the comparison is true.
+  // Otherwise, expected returns the stored value.
+  // Return true on success, false on failure
+  bool CompareAndSwap(void* ptr, void*& expected);
+
+  // Reset all thread local data to replacement, and return non-nullptr
+  // data for all existing threads
+  void Scrape(autovector<void*>* ptrs, void* const replacement);
+
+  using FoldFunc = std::function<void(void*, void*)>;
+  // Update res by applying func on each thread-local value. Holds a lock that
+  // prevents unref handler from running during this call, but clients must
+  // still provide external synchronization since the owning thread can
+  // access the values without internal locking, e.g., via Get() and Reset().
+  void Fold(FoldFunc func, void* res);
+
+  // Add here for testing
+  // Return the next available Id without claiming it
+  static uint32_t TEST_PeekId();
+
+  // Initialize the static singletons of the ThreadLocalPtr.
+  //
+  // If this function is not called, then the singletons will be
+  // automatically initialized when they are used.
+  //
+  // Calling this function twice or after the singletons have been
+  // initialized will be no-op.
+  static void InitSingletons();
+
+  class StaticMeta;
+
+ private:
+  static StaticMeta* Instance();
+
+  const uint32_t id_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/thread_local_test.cc b/src/rocksdb/util/thread_local_test.cc
new file mode 100644
index 000000000..25ef5c0ee
--- /dev/null
+++ b/src/rocksdb/util/thread_local_test.cc
@@ -0,0 +1,582 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/thread_local.h"
+
+#include <atomic>
+#include <string>
+#include <thread>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ThreadLocalTest : public testing::Test {
+ public:
+  ThreadLocalTest() : env_(Env::Default()) {}
+
+  Env* env_;
+};
+
+namespace {
+
+struct Params {
+  Params(port::Mutex* m, port::CondVar* c, int* u, int n,
+         UnrefHandler handler = nullptr)
+      : mu(m),
+        cv(c),
+        unref(u),
+        total(n),
+        started(0),
+        completed(0),
+        doWrite(false),
+        tls1(handler),
+        tls2(nullptr) {}
+
+  port::Mutex* mu;
+  port::CondVar* cv;
+  int* unref;
+  int total;
+  int started;
+  int completed;
+  bool doWrite;
+  ThreadLocalPtr tls1;
+  ThreadLocalPtr* tls2;
+};
+
+class IDChecker : public ThreadLocalPtr {
+ public:
+  static uint32_t PeekId() { return TEST_PeekId(); }
+};
+
+}  // anonymous namespace
+
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
+TEST_F(ThreadLocalTest, UniqueIdTest) {
+  port::Mutex mu;
+  port::CondVar cv(&mu);
+
+  uint32_t base_id = IDChecker::PeekId();
+  // New ThreadLocal instance bumps id by 1
+  {
+    // Id used 0
+    Params p1(&mu, &cv, nullptr, 1u);
+    ASSERT_EQ(IDChecker::PeekId(), base_id + 1u);
+    // Id used 1
+    Params p2(&mu, &cv, nullptr, 1u);
+    ASSERT_EQ(IDChecker::PeekId(), base_id + 2u);
+    // Id used 2
+    Params p3(&mu, &cv, nullptr, 1u);
+    ASSERT_EQ(IDChecker::PeekId(), base_id + 3u);
+    // Id used 3
+    Params p4(&mu, &cv, nullptr, 1u);
+    ASSERT_EQ(IDChecker::PeekId(), base_id + 4u);
+  }
+  // id 3, 2, 1, 0 are in the free queue in order
+  ASSERT_EQ(IDChecker::PeekId(), base_id + 0u);
+
+  // pick up 0
+  Params p1(&mu, &cv, nullptr, 1u);
+  ASSERT_EQ(IDChecker::PeekId(), base_id + 1u);
+  // pick up 1
+  Params* p2 = new Params(&mu, &cv, nullptr, 1u);
+  ASSERT_EQ(IDChecker::PeekId(), base_id + 2u);
+  // pick up 2
+  Params p3(&mu, &cv, nullptr, 1u);
+  ASSERT_EQ(IDChecker::PeekId(), base_id + 3u);
+  // return up 1
+  delete p2;
+  ASSERT_EQ(IDChecker::PeekId(), base_id + 1u);
+  // Now we have 3, 1 in queue
+  // pick up 1
+  Params p4(&mu, &cv, nullptr, 1u);
+  ASSERT_EQ(IDChecker::PeekId(), base_id + 3u);
+  // pick up 3
+  Params p5(&mu, &cv, nullptr, 1u);
+  // next new id
+  ASSERT_EQ(IDChecker::PeekId(), base_id + 4u);
+  // After exit, id sequence in queue:
+  // 3, 1, 2, 0
+}
+#endif  // __clang_analyzer__
+
+TEST_F(ThreadLocalTest, SequentialReadWriteTest) {
+  // global id list carries over 3, 1, 2, 0
+  uint32_t base_id = IDChecker::PeekId();
+
+  port::Mutex mu;
+  port::CondVar cv(&mu);
+  Params p(&mu, &cv, nullptr, 1);
+  ThreadLocalPtr tls2;
+  p.tls2 = &tls2;
+
+  ASSERT_GT(IDChecker::PeekId(), base_id);
+  base_id = IDChecker::PeekId();
+
+  auto func = [](Params* ptr) {
+    Params& params = *ptr;
+    ASSERT_TRUE(params.tls1.Get() == nullptr);
+    params.tls1.Reset(reinterpret_cast<int*>(1));
+    ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(1));
+    params.tls1.Reset(reinterpret_cast<int*>(2));
+    ASSERT_TRUE(params.tls1.Get() == reinterpret_cast<int*>(2));
+
+    ASSERT_TRUE(params.tls2->Get() == nullptr);
+    params.tls2->Reset(reinterpret_cast<int*>(1));
+    ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(1));
+    params.tls2->Reset(reinterpret_cast<int*>(2));
+    ASSERT_TRUE(params.tls2->Get() == reinterpret_cast<int*>(2));
+
+    params.mu->Lock();
+    ++(params.completed);
+    params.cv->SignalAll();
+    params.mu->Unlock();
+  };
+
+  for (int iter = 0; iter < 1024; ++iter) {
+    ASSERT_EQ(IDChecker::PeekId(), base_id);
+    // Another new thread, read/write should not see value from previous thread
+    env_->StartThreadTyped(func, &p);
+
+    mu.Lock();
+    while (p.completed != iter + 1) {
+      cv.Wait();
+    }
+    mu.Unlock();
+    ASSERT_EQ(IDChecker::PeekId(), base_id);
+  }
+}
+
+TEST_F(ThreadLocalTest, ConcurrentReadWriteTest) {
+  // global id list carries over 3, 1, 2, 0
+  uint32_t base_id = IDChecker::PeekId();
+
+  ThreadLocalPtr tls2;
+  port::Mutex mu1;
+  port::CondVar cv1(&mu1);
+  Params p1(&mu1, &cv1, nullptr, 16);
+  p1.tls2 = &tls2;
+
+  port::Mutex mu2;
+  port::CondVar cv2(&mu2);
+  Params p2(&mu2, &cv2, nullptr, 16);
+  p2.doWrite = true;
+  p2.tls2 = &tls2;
+
+  auto func = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    p.mu->Lock();
+    // Size_T switches size along with the ptr size
+    // we want to cast to.
+    size_t own = ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+
+    // Let write threads write a different value from the read threads
+    if (p.doWrite) {
+      own += 8192;
+    }
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    auto* env = Env::Default();
+    auto start = env->NowMicros();
+
+    p.tls1.Reset(reinterpret_cast<size_t*>(own));
+    p.tls2->Reset(reinterpret_cast<size_t*>(own + 1));
+    // Loop for 1 second
+    while (env->NowMicros() - start < 1000 * 1000) {
+      for (int iter = 0; iter < 100000; ++iter) {
+        ASSERT_TRUE(p.tls1.Get() == reinterpret_cast<size_t*>(own));
+        ASSERT_TRUE(p.tls2->Get() == reinterpret_cast<size_t*>(own + 1));
+        if (p.doWrite) {
+          p.tls1.Reset(reinterpret_cast<size_t*>(own));
+          p.tls2->Reset(reinterpret_cast<size_t*>(own + 1));
+        }
+      }
+    }
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+    p.mu->Unlock();
+  };
+
+  // Initiate 2 instnaces: one keeps writing and one keeps reading.
+  // The read instance should not see data from the write instance.
+  // Each thread local copy of the value are also different from each
+  // other.
+  for (int th = 0; th < p1.total; ++th) {
+    env_->StartThreadTyped(func, &p1);
+  }
+  for (int th = 0; th < p2.total; ++th) {
+    env_->StartThreadTyped(func, &p2);
+  }
+
+  mu1.Lock();
+  while (p1.completed != p1.total) {
+    cv1.Wait();
+  }
+  mu1.Unlock();
+
+  mu2.Lock();
+  while (p2.completed != p2.total) {
+    cv2.Wait();
+  }
+  mu2.Unlock();
+
+  ASSERT_EQ(IDChecker::PeekId(), base_id + 3u);
+}
+
+TEST_F(ThreadLocalTest, Unref) {
+  auto unref = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+    p.mu->Lock();
+    ++(*p.unref);
+    p.mu->Unlock();
+  };
+
+  // Case 0: no unref triggered if ThreadLocalPtr is never accessed
+  auto func0 = [](Params* ptr) {
+    auto& p = *ptr;
+    p.mu->Lock();
+    ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+  };
+
+  for (int th = 1; th <= 128; th += th) {
+    port::Mutex mu;
+    port::CondVar cv(&mu);
+    int unref_count = 0;
+    Params p(&mu, &cv, &unref_count, th, unref);
+
+    for (int i = 0; i < p.total; ++i) {
+      env_->StartThreadTyped(func0, &p);
+    }
+    env_->WaitForJoin();
+    ASSERT_EQ(unref_count, 0);
+  }
+
+  // Case 1: unref triggered by thread exit
+  auto func1 = [](Params* ptr) {
+    auto& p = *ptr;
+
+    p.mu->Lock();
+    ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+  };
+
+  for (int th = 1; th <= 128; th += th) {
+    port::Mutex mu;
+    port::CondVar cv(&mu);
+    int unref_count = 0;
+    ThreadLocalPtr tls2(unref);
+    Params p(&mu, &cv, &unref_count, th, unref);
+    p.tls2 = &tls2;
+
+    for (int i = 0; i < p.total; ++i) {
+      env_->StartThreadTyped(func1, &p);
+    }
+
+    env_->WaitForJoin();
+
+    // N threads x 2 ThreadLocal instance cleanup on thread exit
+    ASSERT_EQ(unref_count, 2 * p.total);
+  }
+
+  // Case 2: unref triggered by ThreadLocal instance destruction
+  auto func2 = [](Params* ptr) {
+    auto& p = *ptr;
+
+    p.mu->Lock();
+    ++(p.started);
+    p.cv->SignalAll();
+    while (p.started != p.total) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+
+    // Waiting for instruction to exit thread
+    while (p.completed != 0) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+  };
+
+  for (int th = 1; th <= 128; th += th) {
+    port::Mutex mu;
+    port::CondVar cv(&mu);
+    int unref_count = 0;
+    Params p(&mu, &cv, &unref_count, th, unref);
+    p.tls2 = new ThreadLocalPtr(unref);
+
+    for (int i = 0; i < p.total; ++i) {
+      env_->StartThreadTyped(func2, &p);
+    }
+
+    // Wait for all threads to finish using Params
+    mu.Lock();
+    while (p.completed != p.total) {
+      cv.Wait();
+    }
+    mu.Unlock();
+
+    // Now destroy one ThreadLocal instance
+    delete p.tls2;
+    p.tls2 = nullptr;
+    // instance destroy for N threads
+    ASSERT_EQ(unref_count, p.total);
+
+    // Signal to exit
+    mu.Lock();
+    p.completed = 0;
+    cv.SignalAll();
+    mu.Unlock();
+    env_->WaitForJoin();
+    // additional N threads exit unref for the left instance
+    ASSERT_EQ(unref_count, 2 * p.total);
+  }
+}
+
+TEST_F(ThreadLocalTest, Swap) {
+  ThreadLocalPtr tls;
+  tls.Reset(reinterpret_cast<void*>(1));
+  ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(nullptr)), 1);
+  ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(2)) == nullptr);
+  ASSERT_EQ(reinterpret_cast<int64_t>(tls.Get()), 2);
+  ASSERT_EQ(reinterpret_cast<int64_t>(tls.Swap(reinterpret_cast<void*>(3))), 2);
+}
+
+TEST_F(ThreadLocalTest, Scrape) {
+  auto unref = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+    p.mu->Lock();
+    ++(*p.unref);
+    p.mu->Unlock();
+  };
+
+  auto func = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    ASSERT_TRUE(p.tls2->Get() == nullptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.tls1.Reset(ptr);
+    p.tls2->Reset(ptr);
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+
+    // Waiting for instruction to exit thread
+    while (p.completed != 0) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+  };
+
+  for (int th = 1; th <= 128; th += th) {
+    port::Mutex mu;
+    port::CondVar cv(&mu);
+    int unref_count = 0;
+    Params p(&mu, &cv, &unref_count, th, unref);
+    p.tls2 = new ThreadLocalPtr(unref);
+
+    for (int i = 0; i < p.total; ++i) {
+      env_->StartThreadTyped(func, &p);
+    }
+
+    // Wait for all threads to finish using Params
+    mu.Lock();
+    while (p.completed != p.total) {
+      cv.Wait();
+    }
+    mu.Unlock();
+
+    ASSERT_EQ(unref_count, 0);
+
+    // Scrape all thread local data. No unref at thread
+    // exit or ThreadLocalPtr destruction
+    autovector<void*> ptrs;
+    p.tls1.Scrape(&ptrs, nullptr);
+    p.tls2->Scrape(&ptrs, nullptr);
+    delete p.tls2;
+    // Signal to exit
+    mu.Lock();
+    p.completed = 0;
+    cv.SignalAll();
+    mu.Unlock();
+    env_->WaitForJoin();
+
+    ASSERT_EQ(unref_count, 0);
+  }
+}
+
+TEST_F(ThreadLocalTest, Fold) {
+  auto unref = [](void* ptr) {
+    delete static_cast<std::atomic<int64_t>*>(ptr);
+  };
+  static const int kNumThreads = 16;
+  static const int kItersPerThread = 10;
+  port::Mutex mu;
+  port::CondVar cv(&mu);
+  Params params(&mu, &cv, nullptr, kNumThreads, unref);
+  auto func = [](void* ptr) {
+    auto& p = *static_cast<Params*>(ptr);
+    ASSERT_TRUE(p.tls1.Get() == nullptr);
+    p.tls1.Reset(new std::atomic<int64_t>(0));
+
+    for (int i = 0; i < kItersPerThread; ++i) {
+      static_cast<std::atomic<int64_t>*>(p.tls1.Get())->fetch_add(1);
+    }
+
+    p.mu->Lock();
+    ++(p.completed);
+    p.cv->SignalAll();
+
+    // Waiting for instruction to exit thread
+    while (p.completed != 0) {
+      p.cv->Wait();
+    }
+    p.mu->Unlock();
+  };
+
+  for (int th = 0; th < params.total; ++th) {
+    env_->StartThread(func, &params);
+  }
+
+  // Wait for all threads to finish using Params
+  mu.Lock();
+  while (params.completed != params.total) {
+    cv.Wait();
+  }
+  mu.Unlock();
+
+  // Verify Fold() behavior
+  int64_t sum = 0;
+  params.tls1.Fold(
+      [](void* ptr, void* res) {
+        auto sum_ptr = static_cast<int64_t*>(res);
+        *sum_ptr += static_cast<std::atomic<int64_t>*>(ptr)->load();
+      },
+      &sum);
+  ASSERT_EQ(sum, kNumThreads * kItersPerThread);
+
+  // Signal to exit
+  mu.Lock();
+  params.completed = 0;
+  cv.SignalAll();
+  mu.Unlock();
+  env_->WaitForJoin();
+}
+
+TEST_F(ThreadLocalTest, CompareAndSwap) {
+  ThreadLocalPtr tls;
+  ASSERT_TRUE(tls.Swap(reinterpret_cast<void*>(1)) == nullptr);
+  void* expected = reinterpret_cast<void*>(1);
+  // Swap in 2
+  ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast<void*>(2), expected));
+  expected = reinterpret_cast<void*>(100);
+  // Fail Swap, still 2
+  ASSERT_TRUE(!tls.CompareAndSwap(reinterpret_cast<void*>(2), expected));
+  ASSERT_EQ(expected, reinterpret_cast<void*>(2));
+  // Swap in 3
+  expected = reinterpret_cast<void*>(2);
+  ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast<void*>(3), expected));
+  ASSERT_EQ(tls.Get(), reinterpret_cast<void*>(3));
+}
+
+namespace {
+
+void* AccessThreadLocal(void* /*arg*/) {
+  TEST_SYNC_POINT("AccessThreadLocal:Start");
+  ThreadLocalPtr tlp;
+  tlp.Reset(new std::string("hello RocksDB"));
+  TEST_SYNC_POINT("AccessThreadLocal:End");
+  return nullptr;
+}
+
+}  // namespace
+
+// The following test is disabled as it requires manual steps to run it
+// correctly.
+//
+// Currently we have no way to acess SyncPoint w/o ASAN error when the
+// child thread dies after the main thread dies.  So if you manually enable
+// this test and only see an ASAN error on SyncPoint, it means you pass the
+// test.
+TEST_F(ThreadLocalTest, DISABLED_MainThreadDiesFirst) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"AccessThreadLocal:Start", "MainThreadDiesFirst:End"},
+       {"PosixEnv::~PosixEnv():End", "AccessThreadLocal:End"}});
+
+  // Triggers the initialization of singletons.
+  Env::Default();
+
+#ifndef ROCKSDB_LITE
+  try {
+#endif  // ROCKSDB_LITE
+    ROCKSDB_NAMESPACE::port::Thread th(&AccessThreadLocal, nullptr);
+    th.detach();
+    TEST_SYNC_POINT("MainThreadDiesFirst:End");
+#ifndef ROCKSDB_LITE
+  } catch (const std::system_error& ex) {
+    std::cerr << "Start thread: " << ex.code() << std::endl;
+    FAIL();
+  }
+#endif  // ROCKSDB_LITE
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/thread_operation.h b/src/rocksdb/util/thread_operation.h
new file mode 100644
index 000000000..c24fccd5c
--- /dev/null
+++ b/src/rocksdb/util/thread_operation.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines the structures for thread operation and state.
+// Thread operations are used to describe high level action of a
+// thread such as doing compaction or flush, while thread state
+// are used to describe lower-level action such as reading /
+// writing a file or waiting for a mutex.  Operations and states
+// are designed to be independent.  Typically, a thread usually involves
+// in one operation and one state at any specific point in time.
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/thread_status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+// The structure that describes a major thread operation.
+struct OperationInfo {
+  const ThreadStatus::OperationType type;
+  const std::string name;
+};
+
+// The global operation table.
+//
+// When updating a status of a thread, the pointer of the OperationInfo
+// of the current ThreadStatusData will be pointing to one of the
+// rows in this global table.
+//
+// Note that it's not designed to be constant as in the future we
+// might consider adding global count to the OperationInfo.
+static OperationInfo global_operation_table[] = {
+    {ThreadStatus::OP_UNKNOWN, ""},
+    {ThreadStatus::OP_COMPACTION, "Compaction"},
+    {ThreadStatus::OP_FLUSH, "Flush"}};
+
+struct OperationStageInfo {
+  const ThreadStatus::OperationStage stage;
+  const std::string name;
+};
+
+// A table maintains the mapping from stage type to stage string.
+// Note that the string must be changed accordingly when the
+// associated function name changed.
+static OperationStageInfo global_op_stage_table[] = {
+    {ThreadStatus::STAGE_UNKNOWN, ""},
+    {ThreadStatus::STAGE_FLUSH_RUN, "FlushJob::Run"},
+    {ThreadStatus::STAGE_FLUSH_WRITE_L0, "FlushJob::WriteLevel0Table"},
+    {ThreadStatus::STAGE_COMPACTION_PREPARE, "CompactionJob::Prepare"},
+    {ThreadStatus::STAGE_COMPACTION_RUN, "CompactionJob::Run"},
+    {ThreadStatus::STAGE_COMPACTION_PROCESS_KV,
+     "CompactionJob::ProcessKeyValueCompaction"},
+    {ThreadStatus::STAGE_COMPACTION_INSTALL, "CompactionJob::Install"},
+    {ThreadStatus::STAGE_COMPACTION_SYNC_FILE,
+     "CompactionJob::FinishCompactionOutputFile"},
+    {ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH,
+     "MemTableList::PickMemtablesToFlush"},
+    {ThreadStatus::STAGE_MEMTABLE_ROLLBACK,
+     "MemTableList::RollbackMemtableFlush"},
+    {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS,
+     "MemTableList::TryInstallMemtableFlushResults"},
+};
+
+// The structure that describes a state.
+struct StateInfo {
+  const ThreadStatus::StateType type;
+  const std::string name;
+};
+
+// The global state table.
+//
+// When updating a status of a thread, the pointer of the StateInfo
+// of the current ThreadStatusData will be pointing to one of the
+// rows in this global table.
+static StateInfo global_state_table[] = {
+    {ThreadStatus::STATE_UNKNOWN, ""},
+    {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"},
+};
+
+struct OperationProperty {
+  int code;
+  std::string name;
+};
+
+static OperationProperty compaction_operation_properties[] = {
+    {ThreadStatus::COMPACTION_JOB_ID, "JobID"},
+    {ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, "InputOutputLevel"},
+    {ThreadStatus::COMPACTION_PROP_FLAGS, "Manual/Deletion/Trivial"},
+    {ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, "TotalInputBytes"},
+    {ThreadStatus::COMPACTION_BYTES_READ, "BytesRead"},
+    {ThreadStatus::COMPACTION_BYTES_WRITTEN, "BytesWritten"},
+};
+
+static OperationProperty flush_operation_properties[] = {
+    {ThreadStatus::FLUSH_JOB_ID, "JobID"},
+    {ThreadStatus::FLUSH_BYTES_MEMTABLES, "BytesMemtables"},
+    {ThreadStatus::FLUSH_BYTES_WRITTEN, "BytesWritten"}};
+
+#else
+
+struct OperationInfo {};
+
+struct StateInfo {};
+
+#endif  // ROCKSDB_USING_THREAD_STATUS
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/threadpool_imp.cc b/src/rocksdb/util/threadpool_imp.cc
new file mode 100644
index 000000000..09706cac5
--- /dev/null
+++ b/src/rocksdb/util/threadpool_imp.cc
@@ -0,0 +1,551 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/threadpool_imp.h"
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+
+#ifdef OS_LINUX
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#endif
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void ThreadPoolImpl::PthreadCall(const char* label, int result) {
+  if (result != 0) {
+    fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str());
+    abort();
+  }
+}
+
+struct ThreadPoolImpl::Impl {
+  Impl();
+  ~Impl();
+
+  void JoinThreads(bool wait_for_jobs_to_complete);
+
+  void SetBackgroundThreadsInternal(int num, bool allow_reduce);
+  int GetBackgroundThreads();
+
+  unsigned int GetQueueLen() const {
+    return queue_len_.load(std::memory_order_relaxed);
+  }
+
+  void LowerIOPriority();
+
+  void LowerCPUPriority(CpuPriority pri);
+
+  void WakeUpAllThreads() { bgsignal_.notify_all(); }
+
+  void BGThread(size_t thread_id);
+
+  void StartBGThreads();
+
+  void Submit(std::function<void()>&& schedule,
+              std::function<void()>&& unschedule, void* tag);
+
+  int UnSchedule(void* arg);
+
+  void SetHostEnv(Env* env) { env_ = env; }
+
+  Env* GetHostEnv() const { return env_; }
+
+  bool HasExcessiveThread() const {
+    return static_cast<int>(bgthreads_.size()) > total_threads_limit_;
+  }
+
+  // Return true iff the current thread is the excessive thread to terminate.
+  // Always terminate the running thread that is added last, even if there are
+  // more than one thread to terminate.
+  bool IsLastExcessiveThread(size_t thread_id) const {
+    return HasExcessiveThread() && thread_id == bgthreads_.size() - 1;
+  }
+
+  bool IsExcessiveThread(size_t thread_id) const {
+    return static_cast<int>(thread_id) >= total_threads_limit_;
+  }
+
+  // Return the thread priority.
+  // This would allow its member-thread to know its priority.
+  Env::Priority GetThreadPriority() const { return priority_; }
+
+  // Set the thread priority.
+  void SetThreadPriority(Env::Priority priority) { priority_ = priority; }
+
+  int ReserveThreads(int threads_to_be_reserved) {
+    std::unique_lock<std::mutex> lock(mu_);
+    // We can reserve at most num_waiting_threads_ in total so the number of
+    // threads that can be reserved might be fewer than the desired one. In
+    // rare cases, num_waiting_threads_ could be less than reserved_threads
+    // due to SetBackgroundThreadInternal or last excessive threads. If that
+    // happens, we cannot reserve any other threads.
+    int reserved_threads_in_success =
+        std::min(std::max(num_waiting_threads_ - reserved_threads_, 0),
+                 threads_to_be_reserved);
+    reserved_threads_ += reserved_threads_in_success;
+    return reserved_threads_in_success;
+  }
+
+  int ReleaseThreads(int threads_to_be_released) {
+    std::unique_lock<std::mutex> lock(mu_);
+    // We cannot release more than reserved_threads_
+    int released_threads_in_success =
+        std::min(reserved_threads_, threads_to_be_released);
+    reserved_threads_ -= released_threads_in_success;
+    WakeUpAllThreads();
+    return released_threads_in_success;
+  }
+
+ private:
+  static void BGThreadWrapper(void* arg);
+
+  bool low_io_priority_;
+  CpuPriority cpu_priority_;
+  Env::Priority priority_;
+  Env* env_;
+
+  int total_threads_limit_;
+  std::atomic_uint queue_len_;  // Queue length. Used for stats reporting
+  // Number of reserved threads, managed by ReserveThreads(..) and
+  // ReleaseThreads(..), if num_waiting_threads_ is no larger than
+  // reserved_threads_, its thread will be blocked to ensure the reservation
+  // mechanism
+  int reserved_threads_;
+  // Number of waiting threads (Maximum number of threads that can be
+  // reserved), in rare cases, num_waiting_threads_ could be less than
+  // reserved_threads due to SetBackgroundThreadInternal or last
+  // excessive threads.
+  int num_waiting_threads_;
+  bool exit_all_threads_;
+  bool wait_for_jobs_to_complete_;
+
+  // Entry per Schedule()/Submit() call
+  struct BGItem {
+    void* tag = nullptr;
+    std::function<void()> function;
+    std::function<void()> unschedFunction;
+  };
+
+  using BGQueue = std::deque<BGItem>;
+  BGQueue queue_;
+
+  std::mutex mu_;
+  std::condition_variable bgsignal_;
+  std::vector<port::Thread> bgthreads_;
+};
+
+inline ThreadPoolImpl::Impl::Impl()
+    : low_io_priority_(false),
+      cpu_priority_(CpuPriority::kNormal),
+      priority_(Env::LOW),
+      env_(nullptr),
+      total_threads_limit_(0),
+      queue_len_(),
+      reserved_threads_(0),
+      num_waiting_threads_(0),
+      exit_all_threads_(false),
+      wait_for_jobs_to_complete_(false),
+      queue_(),
+      mu_(),
+      bgsignal_(),
+      bgthreads_() {}
+
+inline ThreadPoolImpl::Impl::~Impl() { assert(bgthreads_.size() == 0U); }
+
+void ThreadPoolImpl::Impl::JoinThreads(bool wait_for_jobs_to_complete) {
+  std::unique_lock<std::mutex> lock(mu_);
+  assert(!exit_all_threads_);
+
+  wait_for_jobs_to_complete_ = wait_for_jobs_to_complete;
+  exit_all_threads_ = true;
+  // prevent threads from being recreated right after they're joined, in case
+  // the user is concurrently submitting jobs.
+  total_threads_limit_ = 0;
+  reserved_threads_ = 0;
+  num_waiting_threads_ = 0;
+
+  lock.unlock();
+
+  bgsignal_.notify_all();
+
+  for (auto& th : bgthreads_) {
+    th.join();
+  }
+
+  bgthreads_.clear();
+
+  exit_all_threads_ = false;
+  wait_for_jobs_to_complete_ = false;
+}
+
+inline void ThreadPoolImpl::Impl::LowerIOPriority() {
+  std::lock_guard<std::mutex> lock(mu_);
+  low_io_priority_ = true;
+}
+
+inline void ThreadPoolImpl::Impl::LowerCPUPriority(CpuPriority pri) {
+  std::lock_guard<std::mutex> lock(mu_);
+  cpu_priority_ = pri;
+}
+
+void ThreadPoolImpl::Impl::BGThread(size_t thread_id) {
+  bool low_io_priority = false;
+  CpuPriority current_cpu_priority = CpuPriority::kNormal;
+
+  while (true) {
+    // Wait until there is an item that is ready to run
+    std::unique_lock<std::mutex> lock(mu_);
+    // Stop waiting if the thread needs to do work or needs to terminate.
+    // Increase num_waiting_threads_ once this task has started waiting
+    num_waiting_threads_++;
+
+    TEST_SYNC_POINT("ThreadPoolImpl::BGThread::WaitingThreadsInc");
+    TEST_IDX_SYNC_POINT("ThreadPoolImpl::BGThread::Start:th", thread_id);
+    // When not exist_all_threads and the current thread id is not the last
+    // excessive thread, it may be blocked due to 3 reasons: 1) queue is empty
+    // 2) it is the excessive thread (not the last one)
+    // 3) the number of waiting threads is not greater than reserved threads
+    // (i.e, no available threads due to full reservation")
+    while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) &&
+           (queue_.empty() || IsExcessiveThread(thread_id) ||
+            num_waiting_threads_ <= reserved_threads_)) {
+      bgsignal_.wait(lock);
+    }
+    // Decrease num_waiting_threads_ once the thread is not waiting
+    num_waiting_threads_--;
+
+    if (exit_all_threads_) {  // mechanism to let BG threads exit safely
+
+      if (!wait_for_jobs_to_complete_ || queue_.empty()) {
+        break;
+      }
+    } else if (IsLastExcessiveThread(thread_id)) {
+      // Current thread is the last generated one and is excessive.
+      // We always terminate excessive thread in the reverse order of
+      // generation time. But not when `exit_all_threads_ == true`,
+      // otherwise `JoinThreads()` could try to `join()` a `detach()`ed
+      // thread.
+      auto& terminating_thread = bgthreads_.back();
+      terminating_thread.detach();
+      bgthreads_.pop_back();
+      if (HasExcessiveThread()) {
+        // There is still at least more excessive thread to terminate.
+        WakeUpAllThreads();
+      }
+      TEST_IDX_SYNC_POINT("ThreadPoolImpl::BGThread::Termination:th",
+                          thread_id);
+      TEST_SYNC_POINT("ThreadPoolImpl::BGThread::Termination");
+      break;
+    }
+
+    auto func = std::move(queue_.front().function);
+    queue_.pop_front();
+
+    queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                     std::memory_order_relaxed);
+
+    bool decrease_io_priority = (low_io_priority != low_io_priority_);
+    CpuPriority cpu_priority = cpu_priority_;
+    lock.unlock();
+
+    if (cpu_priority < current_cpu_priority) {
+      TEST_SYNC_POINT_CALLBACK("ThreadPoolImpl::BGThread::BeforeSetCpuPriority",
+                               &current_cpu_priority);
+      // 0 means current thread.
+      port::SetCpuPriority(0, cpu_priority);
+      current_cpu_priority = cpu_priority;
+      TEST_SYNC_POINT_CALLBACK("ThreadPoolImpl::BGThread::AfterSetCpuPriority",
+                               &current_cpu_priority);
+    }
+
+#ifdef OS_LINUX
+    if (decrease_io_priority) {
+#define IOPRIO_CLASS_SHIFT (13)
+#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data)
+      // Put schedule into IOPRIO_CLASS_IDLE class (lowest)
+      // These system calls only have an effect when used in conjunction
+      // with an I/O scheduler that supports I/O priorities. As at
+      // kernel 2.6.17 the only such scheduler is the Completely
+      // Fair Queuing (CFQ) I/O scheduler.
+      // To change scheduler:
+      //  echo cfq > /sys/block/<device_name>/queue/schedule
+      // Tunables to consider:
+      //  /sys/block/<device_name>/queue/slice_idle
+      //  /sys/block/<device_name>/queue/slice_sync
+      syscall(SYS_ioprio_set, 1,  // IOPRIO_WHO_PROCESS
+              0,                  // current thread
+              IOPRIO_PRIO_VALUE(3, 0));
+      low_io_priority = true;
+    }
+#else
+    (void)decrease_io_priority;  // avoid 'unused variable' error
+#endif
+
+    TEST_SYNC_POINT_CALLBACK("ThreadPoolImpl::Impl::BGThread:BeforeRun",
+                             &priority_);
+
+    func();
+  }
+}
+
+// Helper struct for passing arguments when creating threads.
+struct BGThreadMetadata {
+  ThreadPoolImpl::Impl* thread_pool_;
+  size_t thread_id_;  // Thread count in the thread.
+  BGThreadMetadata(ThreadPoolImpl::Impl* thread_pool, size_t thread_id)
+      : thread_pool_(thread_pool), thread_id_(thread_id) {}
+};
+
+void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) {
+  BGThreadMetadata* meta = reinterpret_cast<BGThreadMetadata*>(arg);
+  size_t thread_id = meta->thread_id_;
+  ThreadPoolImpl::Impl* tp = meta->thread_pool_;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  // initialize it because compiler isn't good enough to see we don't use it
+  // uninitialized
+  ThreadStatus::ThreadType thread_type = ThreadStatus::NUM_THREAD_TYPES;
+  switch (tp->GetThreadPriority()) {
+    case Env::Priority::HIGH:
+      thread_type = ThreadStatus::HIGH_PRIORITY;
+      break;
+    case Env::Priority::LOW:
+      thread_type = ThreadStatus::LOW_PRIORITY;
+      break;
+    case Env::Priority::BOTTOM:
+      thread_type = ThreadStatus::BOTTOM_PRIORITY;
+      break;
+    case Env::Priority::USER:
+      thread_type = ThreadStatus::USER;
+      break;
+    case Env::Priority::TOTAL:
+      assert(false);
+      return;
+  }
+  assert(thread_type != ThreadStatus::NUM_THREAD_TYPES);
+  ThreadStatusUtil::RegisterThread(tp->GetHostEnv(), thread_type);
+#endif
+  delete meta;
+  tp->BGThread(thread_id);
+#ifdef ROCKSDB_USING_THREAD_STATUS
+  ThreadStatusUtil::UnregisterThread();
+#endif
+  return;
+}
+
+void ThreadPoolImpl::Impl::SetBackgroundThreadsInternal(int num,
+                                                        bool allow_reduce) {
+  std::lock_guard<std::mutex> lock(mu_);
+  if (exit_all_threads_) {
+    return;
+  }
+  if (num > total_threads_limit_ ||
+      (num < total_threads_limit_ && allow_reduce)) {
+    total_threads_limit_ = std::max(0, num);
+    WakeUpAllThreads();
+    StartBGThreads();
+  }
+}
+
+int ThreadPoolImpl::Impl::GetBackgroundThreads() {
+  std::unique_lock<std::mutex> lock(mu_);
+  return total_threads_limit_;
+}
+
+void ThreadPoolImpl::Impl::StartBGThreads() {
+  // Start background thread if necessary
+  while ((int)bgthreads_.size() < total_threads_limit_) {
+    port::Thread p_t(&BGThreadWrapper,
+                     new BGThreadMetadata(this, bgthreads_.size()));
+
+// Set the thread name to aid debugging
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+    auto th_handle = p_t.native_handle();
+    std::string thread_priority = Env::PriorityToString(GetThreadPriority());
+    std::ostringstream thread_name_stream;
+    thread_name_stream << "rocksdb:";
+    for (char c : thread_priority) {
+      thread_name_stream << static_cast<char>(tolower(c));
+    }
+    pthread_setname_np(th_handle, thread_name_stream.str().c_str());
+#endif
+#endif
+    bgthreads_.push_back(std::move(p_t));
+  }
+}
+
+void ThreadPoolImpl::Impl::Submit(std::function<void()>&& schedule,
+                                  std::function<void()>&& unschedule,
+                                  void* tag) {
+  std::lock_guard<std::mutex> lock(mu_);
+
+  if (exit_all_threads_) {
+    return;
+  }
+
+  StartBGThreads();
+
+  // Add to priority queue
+  queue_.push_back(BGItem());
+  TEST_SYNC_POINT("ThreadPoolImpl::Submit::Enqueue");
+  auto& item = queue_.back();
+  item.tag = tag;
+  item.function = std::move(schedule);
+  item.unschedFunction = std::move(unschedule);
+
+  queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                   std::memory_order_relaxed);
+
+  if (!HasExcessiveThread()) {
+    // Wake up at least one waiting thread.
+    bgsignal_.notify_one();
+  } else {
+    // Need to wake up all threads to make sure the one woken
+    // up is not the one to terminate.
+    WakeUpAllThreads();
+  }
+}
+
+int ThreadPoolImpl::Impl::UnSchedule(void* arg) {
+  int count = 0;
+
+  std::vector<std::function<void()>> candidates;
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    // Remove from priority queue
+    BGQueue::iterator it = queue_.begin();
+    while (it != queue_.end()) {
+      if (arg == (*it).tag) {
+        if (it->unschedFunction) {
+          candidates.push_back(std::move(it->unschedFunction));
+        }
+        it = queue_.erase(it);
+        count++;
+      } else {
+        ++it;
+      }
+    }
+    queue_len_.store(static_cast<unsigned int>(queue_.size()),
+                     std::memory_order_relaxed);
+  }
+
+  // Run unschedule functions outside the mutex
+  for (auto& f : candidates) {
+    f();
+  }
+
+  return count;
+}
+
+ThreadPoolImpl::ThreadPoolImpl() : impl_(new Impl()) {}
+
+ThreadPoolImpl::~ThreadPoolImpl() {}
+
+void ThreadPoolImpl::JoinAllThreads() { impl_->JoinThreads(false); }
+
+void ThreadPoolImpl::SetBackgroundThreads(int num) {
+  impl_->SetBackgroundThreadsInternal(num, true);
+}
+
+int ThreadPoolImpl::GetBackgroundThreads() {
+  return impl_->GetBackgroundThreads();
+}
+
+unsigned int ThreadPoolImpl::GetQueueLen() const {
+  return impl_->GetQueueLen();
+}
+
+void ThreadPoolImpl::WaitForJobsAndJoinAllThreads() {
+  impl_->JoinThreads(true);
+}
+
+void ThreadPoolImpl::LowerIOPriority() { impl_->LowerIOPriority(); }
+
+void ThreadPoolImpl::LowerCPUPriority(CpuPriority pri) {
+  impl_->LowerCPUPriority(pri);
+}
+
+void ThreadPoolImpl::IncBackgroundThreadsIfNeeded(int num) {
+  impl_->SetBackgroundThreadsInternal(num, false);
+}
+
+void ThreadPoolImpl::SubmitJob(const std::function<void()>& job) {
+  auto copy(job);
+  impl_->Submit(std::move(copy), std::function<void()>(), nullptr);
+}
+
+void ThreadPoolImpl::SubmitJob(std::function<void()>&& job) {
+  impl_->Submit(std::move(job), std::function<void()>(), nullptr);
+}
+
+void ThreadPoolImpl::Schedule(void (*function)(void* arg1), void* arg,
+                              void* tag, void (*unschedFunction)(void* arg)) {
+  if (unschedFunction == nullptr) {
+    impl_->Submit(std::bind(function, arg), std::function<void()>(), tag);
+  } else {
+    impl_->Submit(std::bind(function, arg), std::bind(unschedFunction, arg),
+                  tag);
+  }
+}
+
+int ThreadPoolImpl::UnSchedule(void* arg) { return impl_->UnSchedule(arg); }
+
+void ThreadPoolImpl::SetHostEnv(Env* env) { impl_->SetHostEnv(env); }
+
+Env* ThreadPoolImpl::GetHostEnv() const { return impl_->GetHostEnv(); }
+
+// Return the thread priority.
+// This would allow its member-thread to know its priority.
+Env::Priority ThreadPoolImpl::GetThreadPriority() const {
+  return impl_->GetThreadPriority();
+}
+
+// Set the thread priority.
+void ThreadPoolImpl::SetThreadPriority(Env::Priority priority) {
+  impl_->SetThreadPriority(priority);
+}
+
+// Reserve a specific number of threads, prevent them from running other
+// functions The number of reserved threads could be fewer than the desired one
+int ThreadPoolImpl::ReserveThreads(int threads_to_be_reserved) {
+  return impl_->ReserveThreads(threads_to_be_reserved);
+}
+
+// Release a specific number of threads
+int ThreadPoolImpl::ReleaseThreads(int threads_to_be_released) {
+  return impl_->ReleaseThreads(threads_to_be_released);
+}
+
+ThreadPool* NewThreadPool(int num_threads) {
+  ThreadPoolImpl* thread_pool = new ThreadPoolImpl();
+  thread_pool->SetBackgroundThreads(num_threads);
+  return thread_pool;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/threadpool_imp.h b/src/rocksdb/util/threadpool_imp.h
new file mode 100644
index 000000000..a5109e38f
--- /dev/null
+++ b/src/rocksdb/util/threadpool_imp.h
@@ -0,0 +1,120 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/env.h"
+#include "rocksdb/threadpool.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ThreadPoolImpl : public ThreadPool {
+ public:
+  ThreadPoolImpl();
+  ~ThreadPoolImpl();
+
+  ThreadPoolImpl(ThreadPoolImpl&&) = delete;
+  ThreadPoolImpl& operator=(ThreadPoolImpl&&) = delete;
+
+  // Implement ThreadPool interfaces
+
+  // Wait for all threads to finish.
+  // Discards all the jobs that did not
+  // start executing and waits for those running
+  // to complete
+  void JoinAllThreads() override;
+
+  // Set the number of background threads that will be executing the
+  // scheduled jobs.
+  void SetBackgroundThreads(int num) override;
+  int GetBackgroundThreads() override;
+
+  // Get the number of jobs scheduled in the ThreadPool queue.
+  unsigned int GetQueueLen() const override;
+
+  // Waits for all jobs to complete those
+  // that already started running and those that did not
+  // start yet
+  void WaitForJobsAndJoinAllThreads() override;
+
+  // Make threads to run at a lower kernel IO priority
+  // Currently only has effect on Linux
+  void LowerIOPriority();
+
+  // Make threads to run at a lower kernel CPU priority
+  // Currently only has effect on Linux
+  void LowerCPUPriority(CpuPriority pri);
+
+  // Ensure there is at aleast num threads in the pool
+  // but do not kill threads if there are more
+  void IncBackgroundThreadsIfNeeded(int num);
+
+  // Submit a fire and forget job
+  // These jobs can not be unscheduled
+
+  // This allows to submit the same job multiple times
+  void SubmitJob(const std::function<void()>&) override;
+  // This moves the function in for efficiency
+  void SubmitJob(std::function<void()>&&) override;
+
+  // Schedule a job with an unschedule tag and unschedule function
+  // Can be used to filter and unschedule jobs by a tag
+  // that are still in the queue and did not start running
+  void Schedule(void (*function)(void* arg1), void* arg, void* tag,
+                void (*unschedFunction)(void* arg));
+
+  // Filter jobs that are still in a queue and match
+  // the given tag. Remove them from a queue if any
+  // and for each such job execute an unschedule function
+  // if such was given at scheduling time.
+  int UnSchedule(void* tag);
+
+  void SetHostEnv(Env* env);
+
+  Env* GetHostEnv() const;
+
+  // Return the thread priority.
+  // This would allow its member-thread to know its priority.
+  Env::Priority GetThreadPriority() const;
+
+  // Set the thread priority.
+  void SetThreadPriority(Env::Priority priority);
+
+  // Reserve a specific number of threads, prevent them from running other
+  // functions The number of reserved threads could be fewer than the desired
+  // one
+  int ReserveThreads(int threads_to_be_reserved) override;
+
+  // Release a specific number of threads
+  int ReleaseThreads(int threads_to_be_released) override;
+
+  static void PthreadCall(const char* label, int result);
+
+  struct Impl;
+
+ private:
+  // Current public virtual interface does not provide usable
+  // functionality and thus can not be used internally to
+  // facade different implementations.
+  //
+  // We propose a pimpl idiom in order to easily replace the thread pool impl
+  // w/o touching the header file but providing a different .cc potentially
+  // CMake option driven.
+  //
+  // Another option is to introduce a Env::MakeThreadPool() virtual interface
+  // and override the environment. This would require refactoring ThreadPool
+  // usage.
+  //
+  // We can also combine these two approaches
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/timer.h b/src/rocksdb/util/timer.h
new file mode 100644
index 000000000..db71cefaf
--- /dev/null
+++ b/src/rocksdb/util/timer.h
@@ -0,0 +1,340 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <queue>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A Timer class to handle repeated work.
+//
+// `Start()` and `Shutdown()` are currently not thread-safe. The client must
+// serialize calls to these two member functions.
+//
+// A single timer instance can handle multiple functions via a single thread.
+// It is better to leave long running work to a dedicated thread pool.
+//
+// Timer can be started by calling `Start()`, and ended by calling `Shutdown()`.
+// Work (in terms of a `void function`) can be scheduled by calling `Add` with
+// a unique function name and de-scheduled by calling `Cancel`.
+// Many functions can be added.
+//
+// Impl Details:
+// A heap is used to keep track of when the next timer goes off.
+// A map from a function name to the function keeps track of all the functions.
+class Timer {
+ public:
+  explicit Timer(SystemClock* clock)
+      : clock_(clock),
+        mutex_(clock),
+        cond_var_(&mutex_),
+        running_(false),
+        executing_task_(false) {}
+
+  ~Timer() { Shutdown(); }
+
+  // Add a new function to run.
+  // fn_name has to be identical, otherwise it will fail to add and return false
+  // start_after_us is the initial delay.
+  // repeat_every_us is the interval between ending time of the last call and
+  // starting time of the next call. For example, repeat_every_us = 2000 and
+  // the function takes 1000us to run. If it starts at time [now]us, then it
+  // finishes at [now]+1000us, 2nd run starting time will be at [now]+3000us.
+  // repeat_every_us == 0 means do not repeat.
+  bool Add(std::function<void()> fn, const std::string& fn_name,
+           uint64_t start_after_us, uint64_t repeat_every_us) {
+    auto fn_info = std::make_unique<FunctionInfo>(std::move(fn), fn_name, 0,
+                                                  repeat_every_us);
+    InstrumentedMutexLock l(&mutex_);
+    // Assign time within mutex to make sure the next_run_time is larger than
+    // the current running one
+    fn_info->next_run_time_us = clock_->NowMicros() + start_after_us;
+    // the new task start time should never before the current task executing
+    // time, as the executing task can only be running if it's next_run_time_us
+    // is due (<= clock_->NowMicros()).
+    if (executing_task_ &&
+        fn_info->next_run_time_us < heap_.top()->next_run_time_us) {
+      return false;
+    }
+    auto it = map_.find(fn_name);
+    if (it == map_.end()) {
+      heap_.push(fn_info.get());
+      map_.try_emplace(fn_name, std::move(fn_info));
+    } else {
+      // timer doesn't support duplicated function name
+      return false;
+    }
+    cond_var_.SignalAll();
+    return true;
+  }
+
+  void Cancel(const std::string& fn_name) {
+    InstrumentedMutexLock l(&mutex_);
+
+    // Mark the function with fn_name as invalid so that it will not be
+    // requeued.
+    auto it = map_.find(fn_name);
+    if (it != map_.end() && it->second) {
+      it->second->Cancel();
+    }
+
+    // If the currently running function is fn_name, then we need to wait
+    // until it finishes before returning to caller.
+    while (!heap_.empty() && executing_task_) {
+      FunctionInfo* func_info = heap_.top();
+      assert(func_info);
+      if (func_info->name == fn_name) {
+        WaitForTaskCompleteIfNecessary();
+      } else {
+        break;
+      }
+    }
+  }
+
+  void CancelAll() {
+    InstrumentedMutexLock l(&mutex_);
+    CancelAllWithLock();
+  }
+
+  // Start the Timer
+  bool Start() {
+    InstrumentedMutexLock l(&mutex_);
+    if (running_) {
+      return false;
+    }
+
+    running_ = true;
+    thread_ = std::make_unique<port::Thread>(&Timer::Run, this);
+    return true;
+  }
+
+  // Shutdown the Timer
+  bool Shutdown() {
+    {
+      InstrumentedMutexLock l(&mutex_);
+      if (!running_) {
+        return false;
+      }
+      running_ = false;
+      CancelAllWithLock();
+      cond_var_.SignalAll();
+    }
+
+    if (thread_) {
+      thread_->join();
+    }
+    return true;
+  }
+
+  bool HasPendingTask() const {
+    InstrumentedMutexLock l(&mutex_);
+    for (const auto& fn_info : map_) {
+      if (fn_info.second->IsValid()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+#ifndef NDEBUG
+  // Wait until Timer starting waiting, call the optional callback, then wait
+  // for Timer waiting again.
+  // Tests can provide a custom Clock object to mock time, and use the callback
+  // here to bump current time and trigger Timer. See timer_test for example.
+  //
+  // Note: only support one caller of this method.
+  void TEST_WaitForRun(const std::function<void()>& callback = nullptr) {
+    InstrumentedMutexLock l(&mutex_);
+    // It act as a spin lock
+    while (executing_task_ ||
+           (!heap_.empty() &&
+            heap_.top()->next_run_time_us <= clock_->NowMicros())) {
+      cond_var_.TimedWait(clock_->NowMicros() + 1000);
+    }
+    if (callback != nullptr) {
+      callback();
+    }
+    cond_var_.SignalAll();
+    do {
+      cond_var_.TimedWait(clock_->NowMicros() + 1000);
+    } while (executing_task_ ||
+             (!heap_.empty() &&
+              heap_.top()->next_run_time_us <= clock_->NowMicros()));
+  }
+
+  size_t TEST_GetPendingTaskNum() const {
+    InstrumentedMutexLock l(&mutex_);
+    size_t ret = 0;
+    for (const auto& fn_info : map_) {
+      if (fn_info.second->IsValid()) {
+        ret++;
+      }
+    }
+    return ret;
+  }
+
+  void TEST_OverrideTimer(SystemClock* clock) {
+    InstrumentedMutexLock l(&mutex_);
+    clock_ = clock;
+  }
+#endif  // NDEBUG
+
+ private:
+  void Run() {
+    InstrumentedMutexLock l(&mutex_);
+
+    while (running_) {
+      if (heap_.empty()) {
+        // wait
+        TEST_SYNC_POINT("Timer::Run::Waiting");
+        cond_var_.Wait();
+        continue;
+      }
+
+      FunctionInfo* current_fn = heap_.top();
+      assert(current_fn);
+
+      if (!current_fn->IsValid()) {
+        heap_.pop();
+        map_.erase(current_fn->name);
+        continue;
+      }
+
+      if (current_fn->next_run_time_us <= clock_->NowMicros()) {
+        // make a copy of the function so it won't be changed after
+        // mutex_.unlock.
+        std::function<void()> fn = current_fn->fn;
+        executing_task_ = true;
+        mutex_.Unlock();
+        // Execute the work
+        fn();
+        mutex_.Lock();
+        executing_task_ = false;
+        cond_var_.SignalAll();
+
+        // Remove the work from the heap once it is done executing, make sure
+        // it's the same function after executing the work while mutex is
+        // released.
+        // Note that we are just removing the pointer from the heap. Its
+        // memory is still managed in the map (as it holds a unique ptr).
+        // So current_fn is still a valid ptr.
+        assert(heap_.top() == current_fn);
+        heap_.pop();
+
+        // current_fn may be cancelled already.
+        if (current_fn->IsValid() && current_fn->repeat_every_us > 0) {
+          assert(running_);
+          current_fn->next_run_time_us =
+              clock_->NowMicros() + current_fn->repeat_every_us;
+
+          // Schedule new work into the heap with new time.
+          heap_.push(current_fn);
+        } else {
+          // if current_fn is cancelled or no need to repeat, remove it from the
+          // map to avoid leak.
+          map_.erase(current_fn->name);
+        }
+      } else {
+        cond_var_.TimedWait(current_fn->next_run_time_us);
+      }
+    }
+  }
+
+  void CancelAllWithLock() {
+    mutex_.AssertHeld();
+    if (map_.empty() && heap_.empty()) {
+      return;
+    }
+
+    // With mutex_ held, set all tasks to invalid so that they will not be
+    // re-queued.
+    for (auto& elem : map_) {
+      auto& func_info = elem.second;
+      assert(func_info);
+      func_info->Cancel();
+    }
+
+    // WaitForTaskCompleteIfNecessary() may release mutex_
+    WaitForTaskCompleteIfNecessary();
+
+    while (!heap_.empty()) {
+      heap_.pop();
+    }
+    map_.clear();
+  }
+
+  // A wrapper around std::function to keep track when it should run next
+  // and at what frequency.
+  struct FunctionInfo {
+    // the actual work
+    std::function<void()> fn;
+    // name of the function
+    std::string name;
+    // when the function should run next
+    uint64_t next_run_time_us;
+    // repeat interval
+    uint64_t repeat_every_us;
+    // controls whether this function is valid.
+    // A function is valid upon construction and until someone explicitly
+    // calls `Cancel()`.
+    bool valid;
+
+    FunctionInfo(std::function<void()>&& _fn, std::string _name,
+                 const uint64_t _next_run_time_us, uint64_t _repeat_every_us)
+        : fn(std::move(_fn)),
+          name(std::move(_name)),
+          next_run_time_us(_next_run_time_us),
+          repeat_every_us(_repeat_every_us),
+          valid(true) {}
+
+    void Cancel() { valid = false; }
+
+    bool IsValid() const { return valid; }
+  };
+
+  void WaitForTaskCompleteIfNecessary() {
+    mutex_.AssertHeld();
+    while (executing_task_) {
+      TEST_SYNC_POINT("Timer::WaitForTaskCompleteIfNecessary:TaskExecuting");
+      cond_var_.Wait();
+    }
+  }
+
+  struct RunTimeOrder {
+    bool operator()(const FunctionInfo* f1, const FunctionInfo* f2) {
+      return f1->next_run_time_us > f2->next_run_time_us;
+    }
+  };
+
+  SystemClock* clock_;
+  // This mutex controls both the heap_ and the map_. It needs to be held for
+  // making any changes in them.
+  mutable InstrumentedMutex mutex_;
+  InstrumentedCondVar cond_var_;
+  std::unique_ptr<port::Thread> thread_;
+  bool running_;
+  bool executing_task_;
+
+  std::priority_queue<FunctionInfo*, std::vector<FunctionInfo*>, RunTimeOrder>
+      heap_;
+
+  // In addition to providing a mapping from a function name to a function,
+  // it is also responsible for memory management.
+  std::unordered_map<std::string, std::unique_ptr<FunctionInfo>> map_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/timer_queue.h b/src/rocksdb/util/timer_queue.h
new file mode 100644
index 000000000..36a1744ac
--- /dev/null
+++ b/src/rocksdb/util/timer_queue.h
@@ -0,0 +1,231 @@
+//  Portions Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+// Timer Queue
+//
+// License
+//
+// The source code in this article is licensed under the CC0 license, so feel
+// free to copy, modify, share, do whatever you want with it.
+// No attribution is required, but Ill be happy if you do.
+// CC0 license
+
+// The person who associated a work with this deed has dedicated the work to the
+// public domain by waiving all of his or her rights to the work worldwide
+// under copyright law, including all related and neighboring rights, to the
+// extent allowed by law.  You can copy, modify, distribute and perform the
+// work, even for commercial purposes, all without asking permission.
+
+#pragma once
+
+#include <assert.h>
+
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <queue>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "port/port.h"
+#include "test_util/sync_point.h"
+
+// Allows execution of handlers at a specified time in the future
+// Guarantees:
+//  - All handlers are executed ONCE, even if cancelled (aborted parameter will
+// be set to true)
+//      - If TimerQueue is destroyed, it will cancel all handlers.
+//  - Handlers are ALWAYS executed in the Timer Queue worker thread.
+//  - Handlers execution order is NOT guaranteed
+//
+////////////////////////////////////////////////////////////////////////////////
+// borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+class TimerQueue {
+ public:
+  TimerQueue() : m_th(&TimerQueue::run, this) {}
+
+  ~TimerQueue() { shutdown(); }
+
+  // This function is not thread-safe.
+  void shutdown() {
+    if (closed_) {
+      return;
+    }
+    cancelAll();
+    // Abusing the timer queue to trigger the shutdown.
+    add(0, [this](bool) {
+      m_finish = true;
+      return std::make_pair(false, 0);
+    });
+    m_th.join();
+    closed_ = true;
+  }
+
+  // Adds a new timer
+  // \return
+  //  Returns the ID of the new timer. You can use this ID to cancel the
+  // timer
+  uint64_t add(int64_t milliseconds,
+               std::function<std::pair<bool, int64_t>(bool)> handler) {
+    WorkItem item;
+    Clock::time_point tp = Clock::now();
+    item.end = tp + std::chrono::milliseconds(milliseconds);
+    TEST_SYNC_POINT_CALLBACK("TimeQueue::Add:item.end", &item.end);
+    item.period = milliseconds;
+    item.handler = std::move(handler);
+
+    std::unique_lock<std::mutex> lk(m_mtx);
+    uint64_t id = ++m_idcounter;
+    item.id = id;
+    m_items.push(std::move(item));
+
+    // Something changed, so wake up timer thread
+    m_checkWork.notify_one();
+    return id;
+  }
+
+  // Cancels the specified timer
+  // \return
+  //  1 if the timer was cancelled.
+  //  0 if you were too late to cancel (or the timer ID was never valid to
+  // start with)
+  size_t cancel(uint64_t id) {
+    // Instead of removing the item from the container (thus breaking the
+    // heap integrity), we set the item as having no handler, and put
+    // that handler on a new item at the top for immediate execution
+    // The timer thread will then ignore the original item, since it has no
+    // handler.
+    std::unique_lock<std::mutex> lk(m_mtx);
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id == id && item.handler) {
+        WorkItem newItem;
+        // Zero time, so it stays at the top for immediate execution
+        newItem.end = Clock::time_point();
+        newItem.id = 0;  // Means it is a canceled item
+        // Move the handler from item to newitem (thus clearing item)
+        newItem.handler = std::move(item.handler);
+        m_items.push(std::move(newItem));
+
+        // Something changed, so wake up timer thread
+        m_checkWork.notify_one();
+        return 1;
+      }
+    }
+    return 0;
+  }
+
+  // Cancels all timers
+  // \return
+  //  The number of timers cancelled
+  size_t cancelAll() {
+    // Setting all "end" to 0 (for immediate execution) is ok,
+    // since it maintains the heap integrity
+    std::unique_lock<std::mutex> lk(m_mtx);
+    m_cancel = true;
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id && item.handler) {
+        item.end = Clock::time_point();
+        item.id = 0;
+      }
+    }
+    auto ret = m_items.size();
+
+    m_checkWork.notify_one();
+    return ret;
+  }
+
+ private:
+  using Clock = std::chrono::steady_clock;
+  TimerQueue(const TimerQueue&) = delete;
+  TimerQueue& operator=(const TimerQueue&) = delete;
+
+  void run() {
+    std::unique_lock<std::mutex> lk(m_mtx);
+    while (!m_finish) {
+      auto end = calcWaitTime_lock();
+      if (end.first) {
+        // Timers found, so wait until it expires (or something else
+        // changes)
+        m_checkWork.wait_until(lk, end.second);
+      } else {
+        // No timers exist, so wait forever until something changes
+        m_checkWork.wait(lk);
+      }
+
+      // Check and execute as much work as possible, such as, all expired
+      // timers
+      checkWork(&lk);
+    }
+
+    // If we are shutting down, we should not have any items left,
+    // since the shutdown cancels all items
+    assert(m_items.size() == 0);
+  }
+
+  std::pair<bool, Clock::time_point> calcWaitTime_lock() {
+    while (m_items.size()) {
+      if (m_items.top().handler) {
+        // Item present, so return the new wait time
+        return std::make_pair(true, m_items.top().end);
+      } else {
+        // Discard empty handlers (they were cancelled)
+        m_items.pop();
+      }
+    }
+
+    // No items found, so return no wait time (causes the thread to wait
+    // indefinitely)
+    return std::make_pair(false, Clock::time_point());
+  }
+
+  void checkWork(std::unique_lock<std::mutex>* lk) {
+    while (m_items.size() && m_items.top().end <= Clock::now()) {
+      WorkItem item(m_items.top());
+      m_items.pop();
+
+      if (item.handler) {
+        (*lk).unlock();
+        auto reschedule_pair = item.handler(item.id == 0);
+        (*lk).lock();
+        if (!m_cancel && reschedule_pair.first) {
+          int64_t new_period = (reschedule_pair.second == -1)
+                                   ? item.period
+                                   : reschedule_pair.second;
+
+          item.period = new_period;
+          item.end = Clock::now() + std::chrono::milliseconds(new_period);
+          m_items.push(std::move(item));
+        }
+      }
+    }
+  }
+
+  bool m_finish = false;
+  bool m_cancel = false;
+  uint64_t m_idcounter = 0;
+  std::condition_variable m_checkWork;
+
+  struct WorkItem {
+    Clock::time_point end;
+    int64_t period;
+    uint64_t id;  // id==0 means it was cancelled
+    std::function<std::pair<bool, int64_t>(bool)> handler;
+    bool operator>(const WorkItem& other) const { return end > other.end; }
+  };
+
+  std::mutex m_mtx;
+  // Inheriting from priority_queue, so we can access the internal container
+  class Queue : public std::priority_queue<WorkItem, std::vector<WorkItem>,
+                                           std::greater<WorkItem>> {
+   public:
+    std::vector<WorkItem>& getContainer() { return this->c; }
+  } m_items;
+  ROCKSDB_NAMESPACE::port::Thread m_th;
+  bool closed_ = false;
+};
diff --git a/src/rocksdb/util/timer_queue_test.cc b/src/rocksdb/util/timer_queue_test.cc
new file mode 100644
index 000000000..b3c3768ec
--- /dev/null
+++ b/src/rocksdb/util/timer_queue_test.cc
@@ -0,0 +1,73 @@
+//  Portions Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+// Timer Queue
+//
+// License
+//
+// The source code in this article is licensed under the CC0 license, so feel
+// free
+// to copy, modify, share, do whatever you want with it.
+// No attribution is required, but Ill be happy if you do.
+// CC0 license
+
+// The person who associated a work with this deed has dedicated the work to the
+// public domain by waiving all of his or her rights to the work worldwide
+// under copyright law, including all related and neighboring rights, to the
+// extent allowed by law.  You can copy, modify, distribute and perform the
+// work, even for
+// commercial purposes, all without asking permission. See Other Information
+// below.
+//
+
+#include "util/timer_queue.h"
+
+#include <future>
+
+namespace Timing {
+
+using Clock = std::chrono::high_resolution_clock;
+double now() {
+  static auto start = Clock::now();
+  return std::chrono::duration<double, std::milli>(Clock::now() - start)
+      .count();
+}
+
+}  // namespace Timing
+
+int main() {
+  TimerQueue q;
+
+  double tnow = Timing::now();
+
+  q.add(10000, [tnow](bool aborted) mutable {
+    printf("T 1: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(false, 0);
+  });
+  q.add(10001, [tnow](bool aborted) mutable {
+    printf("T 2: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(false, 0);
+  });
+
+  q.add(1000, [tnow](bool aborted) mutable {
+    printf("T 3: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(!aborted, 1000);
+  });
+
+  auto id = q.add(2000, [tnow](bool aborted) mutable {
+    printf("T 4: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(!aborted, 2000);
+  });
+
+  (void)id;
+  // auto ret = q.cancel(id);
+  // assert(ret == 1);
+  // q.cancelAll();
+
+  return 0;
+}
+//////////////////////////////////////////
diff --git a/src/rocksdb/util/timer_test.cc b/src/rocksdb/util/timer_test.cc
new file mode 100644
index 000000000..0ebfa9f3d
--- /dev/null
+++ b/src/rocksdb/util/timer_test.cc
@@ -0,0 +1,402 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "util/timer.h"
+
+#include "db/db_test_util.h"
+#include "rocksdb/file_system.h"
+#include "test_util/mock_time_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TimerTest : public testing::Test {
+ public:
+  TimerTest()
+      : mock_clock_(std::make_shared<MockSystemClock>(SystemClock::Default())) {
+  }
+
+ protected:
+  std::shared_ptr<MockSystemClock> mock_clock_;
+
+  void SetUp() override { mock_clock_->InstallTimedWaitFixCallback(); }
+
+  const int kUsPerSec = 1000000;
+};
+
+TEST_F(TimerTest, SingleScheduleOnce) {
+  const int kInitDelayUs = 1 * kUsPerSec;
+  Timer timer(mock_clock_.get());
+
+  int count = 0;
+  timer.Add([&] { count++; }, "fn_sch_test", kInitDelayUs, 0);
+
+  ASSERT_TRUE(timer.Start());
+
+  ASSERT_EQ(0, count);
+  // Wait for execution to finish
+  timer.TEST_WaitForRun(
+      [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); });
+  ASSERT_EQ(1, count);
+
+  ASSERT_TRUE(timer.Shutdown());
+}
+
+TEST_F(TimerTest, MultipleScheduleOnce) {
+  const int kInitDelay1Us = 1 * kUsPerSec;
+  const int kInitDelay2Us = 3 * kUsPerSec;
+  Timer timer(mock_clock_.get());
+
+  int count1 = 0;
+  timer.Add([&] { count1++; }, "fn_sch_test1", kInitDelay1Us, 0);
+
+  int count2 = 0;
+  timer.Add([&] { count2++; }, "fn_sch_test2", kInitDelay2Us, 0);
+
+  ASSERT_TRUE(timer.Start());
+  ASSERT_EQ(0, count1);
+  ASSERT_EQ(0, count2);
+
+  timer.TEST_WaitForRun(
+      [&] { mock_clock_->SleepForMicroseconds(kInitDelay1Us); });
+
+  ASSERT_EQ(1, count1);
+  ASSERT_EQ(0, count2);
+
+  timer.TEST_WaitForRun([&] {
+    mock_clock_->SleepForMicroseconds(kInitDelay2Us - kInitDelay1Us);
+  });
+
+  ASSERT_EQ(1, count1);
+  ASSERT_EQ(1, count2);
+
+  ASSERT_TRUE(timer.Shutdown());
+}
+
+TEST_F(TimerTest, SingleScheduleRepeatedly) {
+  const int kIterations = 5;
+  const int kInitDelayUs = 1 * kUsPerSec;
+  const int kRepeatUs = 1 * kUsPerSec;
+
+  Timer timer(mock_clock_.get());
+  int count = 0;
+  timer.Add([&] { count++; }, "fn_sch_test", kInitDelayUs, kRepeatUs);
+
+  ASSERT_TRUE(timer.Start());
+  ASSERT_EQ(0, count);
+
+  timer.TEST_WaitForRun(
+      [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); });
+
+  ASSERT_EQ(1, count);
+
+  // Wait for execution to finish
+  for (int i = 1; i < kIterations; i++) {
+    timer.TEST_WaitForRun(
+        [&] { mock_clock_->SleepForMicroseconds(kRepeatUs); });
+  }
+  ASSERT_EQ(kIterations, count);
+
+  ASSERT_TRUE(timer.Shutdown());
+}
+
+TEST_F(TimerTest, MultipleScheduleRepeatedly) {
+  const int kIterations = 5;
+  const int kInitDelay1Us = 0 * kUsPerSec;
+  const int kInitDelay2Us = 1 * kUsPerSec;
+  const int kInitDelay3Us = 0 * kUsPerSec;
+  const int kRepeatUs = 2 * kUsPerSec;
+  const int kLargeRepeatUs = 100 * kUsPerSec;
+
+  Timer timer(mock_clock_.get());
+
+  int count1 = 0;
+  timer.Add([&] { count1++; }, "fn_sch_test1", kInitDelay1Us, kRepeatUs);
+
+  int count2 = 0;
+  timer.Add([&] { count2++; }, "fn_sch_test2", kInitDelay2Us, kRepeatUs);
+
+  // Add a function with relatively large repeat interval
+  int count3 = 0;
+  timer.Add([&] { count3++; }, "fn_sch_test3", kInitDelay3Us, kLargeRepeatUs);
+
+  ASSERT_TRUE(timer.Start());
+
+  ASSERT_EQ(0, count2);
+  // Wait for execution to finish
+  for (int i = 1; i < kIterations * (kRepeatUs / kUsPerSec); i++) {
+    timer.TEST_WaitForRun(
+        [&] { mock_clock_->SleepForMicroseconds(1 * kUsPerSec); });
+    ASSERT_EQ((i + 2) / (kRepeatUs / kUsPerSec), count1);
+    ASSERT_EQ((i + 1) / (kRepeatUs / kUsPerSec), count2);
+
+    // large interval function should only run once (the first one).
+    ASSERT_EQ(1, count3);
+  }
+
+  timer.Cancel("fn_sch_test1");
+
+  // Wait for execution to finish
+  timer.TEST_WaitForRun(
+      [&] { mock_clock_->SleepForMicroseconds(1 * kUsPerSec); });
+  ASSERT_EQ(kIterations, count1);
+  ASSERT_EQ(kIterations, count2);
+  ASSERT_EQ(1, count3);
+
+  timer.Cancel("fn_sch_test2");
+
+  ASSERT_EQ(kIterations, count1);
+  ASSERT_EQ(kIterations, count2);
+
+  // execute the long interval one
+  timer.TEST_WaitForRun([&] {
+    mock_clock_->SleepForMicroseconds(
+        kLargeRepeatUs - static_cast<int>(mock_clock_->NowMicros()));
+  });
+  ASSERT_EQ(2, count3);
+
+  ASSERT_TRUE(timer.Shutdown());
+}
+
+TEST_F(TimerTest, AddAfterStartTest) {
+  const int kIterations = 5;
+  const int kInitDelayUs = 1 * kUsPerSec;
+  const int kRepeatUs = 1 * kUsPerSec;
+
+  // wait timer to run and then add a new job
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"Timer::Run::Waiting", "TimerTest:AddAfterStartTest:1"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Timer timer(mock_clock_.get());
+
+  ASSERT_TRUE(timer.Start());
+
+  TEST_SYNC_POINT("TimerTest:AddAfterStartTest:1");
+  int count = 0;
+  timer.Add([&] { count++; }, "fn_sch_test", kInitDelayUs, kRepeatUs);
+  ASSERT_EQ(0, count);
+  // Wait for execution to finish
+  timer.TEST_WaitForRun(
+      [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); });
+  ASSERT_EQ(1, count);
+
+  for (int i = 1; i < kIterations; i++) {
+    timer.TEST_WaitForRun(
+        [&] { mock_clock_->SleepForMicroseconds(kRepeatUs); });
+  }
+  ASSERT_EQ(kIterations, count);
+
+  ASSERT_TRUE(timer.Shutdown());
+}
+
+TEST_F(TimerTest, CancelRunningTask) {
+  static constexpr char kTestFuncName[] = "test_func";
+  const int kRepeatUs = 1 * kUsPerSec;
+  Timer timer(mock_clock_.get());
+  ASSERT_TRUE(timer.Start());
+  int* value = new int;
+  *value = 0;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"TimerTest::CancelRunningTask:test_func:0",
+       "TimerTest::CancelRunningTask:BeforeCancel"},
+      {"Timer::WaitForTaskCompleteIfNecessary:TaskExecuting",
+       "TimerTest::CancelRunningTask:test_func:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+  timer.Add(
+      [&]() {
+        *value = 1;
+        TEST_SYNC_POINT("TimerTest::CancelRunningTask:test_func:0");
+        TEST_SYNC_POINT("TimerTest::CancelRunningTask:test_func:1");
+      },
+      kTestFuncName, 0, kRepeatUs);
+  port::Thread control_thr([&]() {
+    TEST_SYNC_POINT("TimerTest::CancelRunningTask:BeforeCancel");
+    timer.Cancel(kTestFuncName);
+    // Verify that *value has been set to 1.
+    ASSERT_EQ(1, *value);
+    delete value;
+    value = nullptr;
+  });
+  mock_clock_->SleepForMicroseconds(kRepeatUs);
+  control_thr.join();
+  ASSERT_TRUE(timer.Shutdown());
+}
+
+TEST_F(TimerTest, ShutdownRunningTask) {
+  const int kRepeatUs = 1 * kUsPerSec;
+  constexpr char kTestFunc1Name[] = "test_func1";
+  constexpr char kTestFunc2Name[] = "test_func2";
+  Timer timer(mock_clock_.get());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"TimerTest::ShutdownRunningTest:test_func:0",
+       "TimerTest::ShutdownRunningTest:BeforeShutdown"},
+      {"Timer::WaitForTaskCompleteIfNecessary:TaskExecuting",
+       "TimerTest::ShutdownRunningTest:test_func:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(timer.Start());
+
+  int* value = new int;
+  *value = 0;
+  timer.Add(
+      [&]() {
+        TEST_SYNC_POINT("TimerTest::ShutdownRunningTest:test_func:0");
+        *value = 1;
+        TEST_SYNC_POINT("TimerTest::ShutdownRunningTest:test_func:1");
+      },
+      kTestFunc1Name, 0, kRepeatUs);
+
+  timer.Add([&]() { ++(*value); }, kTestFunc2Name, 0, kRepeatUs);
+
+  port::Thread control_thr([&]() {
+    TEST_SYNC_POINT("TimerTest::ShutdownRunningTest:BeforeShutdown");
+    timer.Shutdown();
+  });
+  mock_clock_->SleepForMicroseconds(kRepeatUs);
+  control_thr.join();
+  delete value;
+}
+
+TEST_F(TimerTest, AddSameFuncName) {
+  const int kInitDelayUs = 1 * kUsPerSec;
+  const int kRepeat1Us = 5 * kUsPerSec;
+  const int kRepeat2Us = 4 * kUsPerSec;
+
+  Timer timer(mock_clock_.get());
+  ASSERT_TRUE(timer.Start());
+
+  int func_counter1 = 0;
+  ASSERT_TRUE(timer.Add([&] { func_counter1++; }, "duplicated_func",
+                        kInitDelayUs, kRepeat1Us));
+
+  int func2_counter = 0;
+  ASSERT_TRUE(
+      timer.Add([&] { func2_counter++; }, "func2", kInitDelayUs, kRepeat2Us));
+
+  // New function with the same name should fail to add
+  int func_counter2 = 0;
+  ASSERT_FALSE(timer.Add([&] { func_counter2++; }, "duplicated_func",
+                         kInitDelayUs, kRepeat1Us));
+
+  ASSERT_EQ(0, func_counter1);
+  ASSERT_EQ(0, func2_counter);
+
+  timer.TEST_WaitForRun(
+      [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); });
+
+  ASSERT_EQ(1, func_counter1);
+  ASSERT_EQ(1, func2_counter);
+
+  timer.TEST_WaitForRun([&] { mock_clock_->SleepForMicroseconds(kRepeat1Us); });
+
+  ASSERT_EQ(2, func_counter1);
+  ASSERT_EQ(2, func2_counter);
+  ASSERT_EQ(0, func_counter2);
+
+  ASSERT_TRUE(timer.Shutdown());
+}
+
+TEST_F(TimerTest, RepeatIntervalWithFuncRunningTime) {
+  const int kInitDelayUs = 1 * kUsPerSec;
+  const int kRepeatUs = 5 * kUsPerSec;
+  const int kFuncRunningTimeUs = 1 * kUsPerSec;
+
+  Timer timer(mock_clock_.get());
+  ASSERT_TRUE(timer.Start());
+
+  int func_counter = 0;
+  timer.Add(
+      [&] {
+        mock_clock_->SleepForMicroseconds(kFuncRunningTimeUs);
+        func_counter++;
+      },
+      "func", kInitDelayUs, kRepeatUs);
+
+  ASSERT_EQ(0, func_counter);
+  timer.TEST_WaitForRun(
+      [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); });
+  ASSERT_EQ(1, func_counter);
+  ASSERT_EQ(kInitDelayUs + kFuncRunningTimeUs, mock_clock_->NowMicros());
+
+  // After repeat interval time, the function is not executed, as running
+  // the function takes some time (`kFuncRunningTimeSec`). The repeat interval
+  // is the time between ending time of the last call and starting time of the
+  // next call.
+  uint64_t next_abs_interval_time_us = kInitDelayUs + kRepeatUs;
+  timer.TEST_WaitForRun([&] {
+    mock_clock_->SetCurrentTime(next_abs_interval_time_us / kUsPerSec);
+  });
+  ASSERT_EQ(1, func_counter);
+
+  // After the function running time, it's executed again
+  timer.TEST_WaitForRun(
+      [&] { mock_clock_->SleepForMicroseconds(kFuncRunningTimeUs); });
+  ASSERT_EQ(2, func_counter);
+
+  ASSERT_TRUE(timer.Shutdown());
+}
+
+TEST_F(TimerTest, DestroyRunningTimer) {
+  const int kInitDelayUs = 1 * kUsPerSec;
+  const int kRepeatUs = 1 * kUsPerSec;
+
+  auto timer_ptr = new Timer(mock_clock_.get());
+
+  int count = 0;
+  timer_ptr->Add([&] { count++; }, "fn_sch_test", kInitDelayUs, kRepeatUs);
+  ASSERT_TRUE(timer_ptr->Start());
+
+  timer_ptr->TEST_WaitForRun(
+      [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); });
+
+  // delete a running timer should not cause any exception
+  delete timer_ptr;
+}
+
+TEST_F(TimerTest, DestroyTimerWithRunningFunc) {
+  const int kRepeatUs = 1 * kUsPerSec;
+  auto timer_ptr = new Timer(mock_clock_.get());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency({
+      {"TimerTest::DestroyTimerWithRunningFunc:test_func:0",
+       "TimerTest::DestroyTimerWithRunningFunc:BeforeDelete"},
+      {"Timer::WaitForTaskCompleteIfNecessary:TaskExecuting",
+       "TimerTest::DestroyTimerWithRunningFunc:test_func:1"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_TRUE(timer_ptr->Start());
+
+  int count = 0;
+  timer_ptr->Add(
+      [&]() {
+        TEST_SYNC_POINT("TimerTest::DestroyTimerWithRunningFunc:test_func:0");
+        count++;
+        TEST_SYNC_POINT("TimerTest::DestroyTimerWithRunningFunc:test_func:1");
+      },
+      "fn_running_test", 0, kRepeatUs);
+
+  port::Thread control_thr([&] {
+    TEST_SYNC_POINT("TimerTest::DestroyTimerWithRunningFunc:BeforeDelete");
+    delete timer_ptr;
+  });
+  mock_clock_->SleepForMicroseconds(kRepeatUs);
+  control_thr.join();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/user_comparator_wrapper.h b/src/rocksdb/util/user_comparator_wrapper.h
new file mode 100644
index 000000000..59ebada12
--- /dev/null
+++ b/src/rocksdb/util/user_comparator_wrapper.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Wrapper of user comparator, with auto increment to
+// perf_context.user_key_comparison_count.
+class UserComparatorWrapper {
+ public:
+  // `UserComparatorWrapper`s constructed with the default constructor are not
+  // usable and will segfault on any attempt to use them for comparisons.
+  UserComparatorWrapper() : user_comparator_(nullptr) {}
+
+  explicit UserComparatorWrapper(const Comparator* const user_cmp)
+      : user_comparator_(user_cmp) {}
+
+  ~UserComparatorWrapper() = default;
+
+  const Comparator* user_comparator() const { return user_comparator_; }
+
+  int Compare(const Slice& a, const Slice& b) const {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    return user_comparator_->Compare(a, b);
+  }
+
+  bool Equal(const Slice& a, const Slice& b) const {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    return user_comparator_->Equal(a, b);
+  }
+
+  int CompareTimestamp(const Slice& ts1, const Slice& ts2) const {
+    return user_comparator_->CompareTimestamp(ts1, ts2);
+  }
+
+  int CompareWithoutTimestamp(const Slice& a, const Slice& b) const {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    return user_comparator_->CompareWithoutTimestamp(a, b);
+  }
+
+  int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+                              bool b_has_ts) const {
+    PERF_COUNTER_ADD(user_key_comparison_count, 1);
+    return user_comparator_->CompareWithoutTimestamp(a, a_has_ts, b, b_has_ts);
+  }
+
+  bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const {
+    return user_comparator_->EqualWithoutTimestamp(a, b);
+  }
+
+ private:
+  const Comparator* user_comparator_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/vector_iterator.h b/src/rocksdb/util/vector_iterator.h
new file mode 100644
index 000000000..c4cc01d56
--- /dev/null
+++ b/src/rocksdb/util/vector_iterator.h
@@ -0,0 +1,118 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Iterator over a vector of keys/values
+class VectorIterator : public InternalIterator {
+ public:
+  VectorIterator(std::vector<std::string> keys, std::vector<std::string> values,
+                 const CompareInterface* icmp = nullptr)
+      : keys_(std::move(keys)),
+        values_(std::move(values)),
+        current_(keys_.size()),
+        indexed_cmp_(icmp, &keys_) {
+    assert(keys_.size() == values_.size());
+
+    indices_.reserve(keys_.size());
+    for (size_t i = 0; i < keys_.size(); i++) {
+      indices_.push_back(i);
+    }
+    if (icmp != nullptr) {
+      std::sort(indices_.begin(), indices_.end(), indexed_cmp_);
+    }
+  }
+
+  virtual bool Valid() const override {
+    return !indices_.empty() && current_ < indices_.size();
+  }
+
+  virtual void SeekToFirst() override { current_ = 0; }
+  virtual void SeekToLast() override { current_ = indices_.size() - 1; }
+
+  virtual void Seek(const Slice& target) override {
+    if (indexed_cmp_.cmp != nullptr) {
+      current_ = std::lower_bound(indices_.begin(), indices_.end(), target,
+                                  indexed_cmp_) -
+                 indices_.begin();
+    } else {
+      current_ =
+          std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) -
+          keys_.begin();
+    }
+  }
+
+  virtual void SeekForPrev(const Slice& target) override {
+    if (indexed_cmp_.cmp != nullptr) {
+      current_ = std::upper_bound(indices_.begin(), indices_.end(), target,
+                                  indexed_cmp_) -
+                 indices_.begin();
+    } else {
+      current_ =
+          std::upper_bound(keys_.begin(), keys_.end(), target.ToString()) -
+          keys_.begin();
+    }
+    if (!Valid()) {
+      SeekToLast();
+    } else {
+      Prev();
+    }
+  }
+
+  virtual void Next() override { current_++; }
+  virtual void Prev() override { current_--; }
+
+  virtual Slice key() const override {
+    return Slice(keys_[indices_[current_]]);
+  }
+  virtual Slice value() const override {
+    return Slice(values_[indices_[current_]]);
+  }
+
+  virtual Status status() const override { return Status::OK(); }
+
+  virtual bool IsKeyPinned() const override { return true; }
+  virtual bool IsValuePinned() const override { return true; }
+
+ protected:
+  std::vector<std::string> keys_;
+  std::vector<std::string> values_;
+  size_t current_;
+
+ private:
+  struct IndexedKeyComparator {
+    IndexedKeyComparator(const CompareInterface* c,
+                         const std::vector<std::string>* ks)
+        : cmp(c), keys(ks) {}
+
+    bool operator()(size_t a, size_t b) const {
+      return cmp->Compare((*keys)[a], (*keys)[b]) < 0;
+    }
+
+    bool operator()(size_t a, const Slice& b) const {
+      return cmp->Compare((*keys)[a], b) < 0;
+    }
+
+    bool operator()(const Slice& a, size_t b) const {
+      return cmp->Compare(a, (*keys)[b]) < 0;
+    }
+
+    const CompareInterface* cmp;
+    const std::vector<std::string>* keys;
+  };
+
+  IndexedKeyComparator indexed_cmp_;
+  std::vector<size_t> indices_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/work_queue.h b/src/rocksdb/util/work_queue.h
new file mode 100644
index 000000000..94ece85d9
--- /dev/null
+++ b/src/rocksdb/util/work_queue.h
@@ -0,0 +1,150 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+/*
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ */
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <condition_variable>
+#include <cstddef>
+#include <functional>
+#include <mutex>
+#include <queue>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/// Unbounded thread-safe work queue.
+//
+// This file is an excerpt from Facebook's zstd repo at
+// https://github.com/facebook/zstd/. The relevant file is
+// contrib/pzstd/utils/WorkQueue.h.
+
+template <typename T>
+class WorkQueue {
+  // Protects all member variable access
+  std::mutex mutex_;
+  std::condition_variable readerCv_;
+  std::condition_variable writerCv_;
+  std::condition_variable finishCv_;
+
+  std::queue<T> queue_;
+  bool done_;
+  std::size_t maxSize_;
+
+  // Must have lock to call this function
+  bool full() const {
+    if (maxSize_ == 0) {
+      return false;
+    }
+    return queue_.size() >= maxSize_;
+  }
+
+ public:
+  /**
+   * Constructs an empty work queue with an optional max size.
+   * If `maxSize == 0` the queue size is unbounded.
+   *
+   * @param maxSize The maximum allowed size of the work queue.
+   */
+  WorkQueue(std::size_t maxSize = 0) : done_(false), maxSize_(maxSize) {}
+
+  /**
+   * Push an item onto the work queue.  Notify a single thread that work is
+   * available.  If `finish()` has been called, do nothing and return false.
+   * If `push()` returns false, then `item` has not been copied from.
+   *
+   * @param item  Item to push onto the queue.
+   * @returns     True upon success, false if `finish()` has been called.  An
+   *               item was pushed iff `push()` returns true.
+   */
+  template <typename U>
+  bool push(U&& item) {
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      while (full() && !done_) {
+        writerCv_.wait(lock);
+      }
+      if (done_) {
+        return false;
+      }
+      queue_.push(std::forward<U>(item));
+    }
+    readerCv_.notify_one();
+    return true;
+  }
+
+  /**
+   * Attempts to pop an item off the work queue.  It will block until data is
+   * available or `finish()` has been called.
+   *
+   * @param[out] item  If `pop` returns `true`, it contains the popped item.
+   *                    If `pop` returns `false`, it is unmodified.
+   * @returns          True upon success.  False if the queue is empty and
+   *                    `finish()` has been called.
+   */
+  bool pop(T& item) {
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      while (queue_.empty() && !done_) {
+        readerCv_.wait(lock);
+      }
+      if (queue_.empty()) {
+        assert(done_);
+        return false;
+      }
+      item = queue_.front();
+      queue_.pop();
+    }
+    writerCv_.notify_one();
+    return true;
+  }
+
+  /**
+   * Sets the maximum queue size.  If `maxSize == 0` then it is unbounded.
+   *
+   * @param maxSize The new maximum queue size.
+   */
+  void setMaxSize(std::size_t maxSize) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      maxSize_ = maxSize;
+    }
+    writerCv_.notify_all();
+  }
+
+  /**
+   * Promise that `push()` won't be called again, so once the queue is empty
+   * there will never any more work.
+   */
+  void finish() {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      assert(!done_);
+      done_ = true;
+    }
+    readerCv_.notify_all();
+    writerCv_.notify_all();
+    finishCv_.notify_all();
+  }
+
+  /// Blocks until `finish()` has been called (but the queue may not be empty).
+  void waitUntilFinished() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (!done_) {
+      finishCv_.wait(lock);
+    }
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/work_queue_test.cc b/src/rocksdb/util/work_queue_test.cc
new file mode 100644
index 000000000..c23a51279
--- /dev/null
+++ b/src/rocksdb/util/work_queue_test.cc
@@ -0,0 +1,272 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+/*
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ */
+#include "util/work_queue.h"
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Unit test for work_queue.h.
+//
+// This file is an excerpt from Facebook's zstd repo at
+// https://github.com/facebook/zstd/. The relevant file is
+// contrib/pzstd/utils/test/WorkQueueTest.cpp.
+
+struct Popper {
+  WorkQueue<int>* queue;
+  int* results;
+  std::mutex* mutex;
+
+  void operator()() {
+    int result;
+    while (queue->pop(result)) {
+      std::lock_guard<std::mutex> lock(*mutex);
+      results[result] = result;
+    }
+  }
+};
+
+TEST(WorkQueue, SingleThreaded) {
+  WorkQueue<int> queue;
+  int result;
+
+  queue.push(5);
+  EXPECT_TRUE(queue.pop(result));
+  EXPECT_EQ(5, result);
+
+  queue.push(1);
+  queue.push(2);
+  EXPECT_TRUE(queue.pop(result));
+  EXPECT_EQ(1, result);
+  EXPECT_TRUE(queue.pop(result));
+  EXPECT_EQ(2, result);
+
+  queue.push(1);
+  queue.push(2);
+  queue.finish();
+  EXPECT_TRUE(queue.pop(result));
+  EXPECT_EQ(1, result);
+  EXPECT_TRUE(queue.pop(result));
+  EXPECT_EQ(2, result);
+  EXPECT_FALSE(queue.pop(result));
+
+  queue.waitUntilFinished();
+}
+
+TEST(WorkQueue, SPSC) {
+  WorkQueue<int> queue;
+  const int max = 100;
+
+  for (int i = 0; i < 10; ++i) {
+    queue.push(i);
+  }
+
+  std::thread thread([&queue, max] {
+    int result;
+    for (int i = 0;; ++i) {
+      if (!queue.pop(result)) {
+        EXPECT_EQ(i, max);
+        break;
+      }
+      EXPECT_EQ(i, result);
+    }
+  });
+
+  std::this_thread::yield();
+  for (int i = 10; i < max; ++i) {
+    queue.push(i);
+  }
+  queue.finish();
+
+  thread.join();
+}
+
+TEST(WorkQueue, SPMC) {
+  WorkQueue<int> queue;
+  std::vector<int> results(50, -1);
+  std::mutex mutex;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < 5; ++i) {
+    threads.emplace_back(Popper{&queue, results.data(), &mutex});
+  }
+
+  for (int i = 0; i < 50; ++i) {
+    queue.push(i);
+  }
+  queue.finish();
+
+  for (auto& thread : threads) {
+    thread.join();
+  }
+
+  for (int i = 0; i < 50; ++i) {
+    EXPECT_EQ(i, results[i]);
+  }
+}
+
+TEST(WorkQueue, MPMC) {
+  WorkQueue<int> queue;
+  std::vector<int> results(100, -1);
+  std::mutex mutex;
+  std::vector<std::thread> popperThreads;
+  for (int i = 0; i < 4; ++i) {
+    popperThreads.emplace_back(Popper{&queue, results.data(), &mutex});
+  }
+
+  std::vector<std::thread> pusherThreads;
+  for (int i = 0; i < 2; ++i) {
+    auto min = i * 50;
+    auto max = (i + 1) * 50;
+    pusherThreads.emplace_back([&queue, min, max] {
+      for (int j = min; j < max; ++j) {
+        queue.push(j);
+      }
+    });
+  }
+
+  for (auto& thread : pusherThreads) {
+    thread.join();
+  }
+  queue.finish();
+
+  for (auto& thread : popperThreads) {
+    thread.join();
+  }
+
+  for (int i = 0; i < 100; ++i) {
+    EXPECT_EQ(i, results[i]);
+  }
+}
+
+TEST(WorkQueue, BoundedSizeWorks) {
+  WorkQueue<int> queue(1);
+  int result;
+  queue.push(5);
+  queue.pop(result);
+  queue.push(5);
+  queue.pop(result);
+  queue.push(5);
+  queue.finish();
+  queue.pop(result);
+  EXPECT_EQ(5, result);
+}
+
+TEST(WorkQueue, BoundedSizePushAfterFinish) {
+  WorkQueue<int> queue(1);
+  int result;
+  queue.push(5);
+  std::thread pusher([&queue] { queue.push(6); });
+  // Dirtily try and make sure that pusher has run.
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  queue.finish();
+  EXPECT_TRUE(queue.pop(result));
+  EXPECT_EQ(5, result);
+  EXPECT_FALSE(queue.pop(result));
+
+  pusher.join();
+}
+
+TEST(WorkQueue, SetMaxSize) {
+  WorkQueue<int> queue(2);
+  int result;
+  queue.push(5);
+  queue.push(6);
+  queue.setMaxSize(1);
+  std::thread pusher([&queue] { queue.push(7); });
+  // Dirtily try and make sure that pusher has run.
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+  queue.finish();
+  EXPECT_TRUE(queue.pop(result));
+  EXPECT_EQ(5, result);
+  EXPECT_TRUE(queue.pop(result));
+  EXPECT_EQ(6, result);
+  EXPECT_FALSE(queue.pop(result));
+
+  pusher.join();
+}
+
+TEST(WorkQueue, BoundedSizeMPMC) {
+  WorkQueue<int> queue(10);
+  std::vector<int> results(200, -1);
+  std::mutex mutex;
+  std::cerr << "Creating popperThreads" << std::endl;
+  std::vector<std::thread> popperThreads;
+  for (int i = 0; i < 4; ++i) {
+    popperThreads.emplace_back(Popper{&queue, results.data(), &mutex});
+  }
+
+  std::cerr << "Creating pusherThreads" << std::endl;
+  std::vector<std::thread> pusherThreads;
+  for (int i = 0; i < 2; ++i) {
+    auto min = i * 100;
+    auto max = (i + 1) * 100;
+    pusherThreads.emplace_back([&queue, min, max] {
+      for (int j = min; j < max; ++j) {
+        queue.push(j);
+      }
+    });
+  }
+
+  std::cerr << "Joining pusherThreads" << std::endl;
+  for (auto& thread : pusherThreads) {
+    thread.join();
+  }
+  std::cerr << "Finishing queue" << std::endl;
+  queue.finish();
+
+  std::cerr << "Joining popperThreads" << std::endl;
+  for (auto& thread : popperThreads) {
+    thread.join();
+  }
+
+  std::cerr << "Inspecting results" << std::endl;
+  for (int i = 0; i < 200; ++i) {
+    EXPECT_EQ(i, results[i]);
+  }
+}
+
+TEST(WorkQueue, FailedPush) {
+  WorkQueue<int> queue;
+  EXPECT_TRUE(queue.push(1));
+  queue.finish();
+  EXPECT_FALSE(queue.push(1));
+}
+
+TEST(WorkQueue, FailedPop) {
+  WorkQueue<int> queue;
+  int x = 5;
+  EXPECT_TRUE(queue.push(x));
+  queue.finish();
+  x = 0;
+  EXPECT_TRUE(queue.pop(x));
+  EXPECT_EQ(5, x);
+  EXPECT_FALSE(queue.pop(x));
+  EXPECT_EQ(5, x);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/xxhash.cc b/src/rocksdb/util/xxhash.cc
new file mode 100644
index 000000000..88852c330
--- /dev/null
+++ b/src/rocksdb/util/xxhash.cc
@@ -0,0 +1,48 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2020 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*
+ * xxhash.c instantiates functions defined in xxhash.h
+ */
+// clang-format off
+#ifndef XXH_STATIC_LINKING_ONLY
+#define XXH_STATIC_LINKING_ONLY   /* access advanced declarations */
+#endif // !defined(XXH_STATIC_LINKING_ONLY)
+#define XXH_IMPLEMENTATION   /* access definitions */
+
+#include "xxhash.h"
diff --git a/src/rocksdb/util/xxhash.h b/src/rocksdb/util/xxhash.h
new file mode 100644
index 000000000..195f06b39
--- /dev/null
+++ b/src/rocksdb/util/xxhash.h
@@ -0,0 +1,5346 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+/* BEGIN RocksDB customizations */
+#ifndef XXH_STATIC_LINKING_ONLY
+// Using compiled xxhash.cc
+#define XXH_STATIC_LINKING_ONLY 1
+#endif  // !defined(XXH_STATIC_LINKING_ONLY)
+#ifndef XXH_NAMESPACE
+#define XXH_NAMESPACE ROCKSDB_
+#endif  // !defined(XXH_NAMESPACE)
+
+// for FALLTHROUGH_INTENDED, inserted as appropriate
+#include "port/lang.h"
+/* END RocksDB customizations */
+
+// clang-format off
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2020 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+/*!
+ * @mainpage xxHash
+ *
+ * @file xxhash.h
+ * xxHash prototypes and implementation
+ */
+/* TODO: update */
+/* Notice extracted from xxHash homepage:
+
+xxHash is an extremely fast hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+Note: SMHasher's CRC32 implementation is not the fastest one.
+Other speed-oriented implementations can be faster,
+especially in combination with PCLMUL instruction:
+https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ****************************
+ *  INLINE mode
+ ******************************/
+/*!
+ * XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ * Use these build macros to inline xxhash into the target unit.
+ * Inlining improves performance on small inputs, especially when the length is
+ * expressed as a compile-time constant:
+ *
+ *      https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+ *
+ * It also keeps xxHash symbols private to the unit, so they are not exported.
+ *
+ * Usage:
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ *
+ * Do not compile and link xxhash.o as a separate object, as it is not useful.
+ */
+#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+    && !defined(XXH_INLINE_ALL_31684351384)
+   /* this section should be traversed only once */
+#  define XXH_INLINE_ALL_31684351384
+   /* give access to the advanced API, required to compile implementations */
+#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
+#  define XXH_STATIC_LINKING_ONLY
+   /* make all functions private */
+#  undef XXH_PUBLIC_API
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* note: this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+
+   /*
+    * This part deals with the special case where a unit wants to inline xxHash,
+    * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
+    * as part of some previously included *.h header file.
+    * Without further action, the new include would just be ignored,
+    * and functions would effectively _not_ be inlined (silent failure).
+    * The following macros solve this situation by prefixing all inlined names,
+    * avoiding naming collision with previous inclusions.
+    */
+#  ifdef XXH_NAMESPACE
+#    error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
+     /*
+      * Note: Alternative: #undef all symbols (it's a pretty large list).
+      * Without #error: it compiles, but functions are actually not inlined.
+      */
+#  endif
+#  define XXH_NAMESPACE XXH_INLINE_
+   /*
+    * Some identifiers (enums, type names) are not symbols, but they must
+    * still be renamed to avoid redeclaration.
+    * Alternative solution: do not redeclare them.
+    * However, this requires some #ifdefs, and is a more dispersed action.
+    * Meanwhile, renaming can be achieved in a single block
+    */
+#  define XXH_IPREF(Id)   XXH_INLINE_ ## Id
+#  define XXH_OK XXH_IPREF(XXH_OK)
+#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
+#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
+#  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
+#  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
+#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
+#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
+#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
+#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
+#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
+#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
+#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+   /* Ensure the header is parsed again, even if it was previously included */
+#  undef XXHASH_H_5627135585666179
+#  undef XXHASH_H_STATIC_13879238742
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+
+
+/* ****************************************************************
+ *  Stable API
+ *****************************************************************/
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+
+/*!
+ * @defgroup public Public API
+ * Contains details on the public xxHash functions.
+ * @{
+ */
+/* specific declaration modes for Windows */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Emulate a namespace by transparently prefixing all symbols.
+ *
+ * If you want to include _and expose_ xxHash functions from within your own
+ * library, but also want to avoid symbol collisions with other libraries which
+ * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
+ * any public symbol from xxhash library with the value of XXH_NAMESPACE
+ * (therefore, avoid empty or numeric values).
+ *
+ * Note that no change is required within the calling program as long as it
+ * includes `xxhash.h`: Regular symbol names will be automatically translated
+ * by this header.
+ */
+#  define XXH_NAMESPACE /* YOUR NAME HERE */
+#  undef XXH_NAMESPACE
+#endif
+
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+/* XXH32 */
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+/* XXH64 */
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+/* XXH3_64bits */
+#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+/* XXH3_128bits */
+#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    8
+#define XXH_VERSION_RELEASE  1
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is only useful when xxHash is compiled as a shared library, as it is
+ * independent of the version defined in the header.
+ *
+ * @return `XXH_VERSION_NUMBER` as of when the libray was compiled.
+ */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXH32_hash_t;
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   else
+#     if ULONG_MAX == 0xFFFFFFFFUL
+        typedef unsigned long XXH32_hash_t;
+#     else
+#       error "unsupported platform: need a 32-bit type"
+#     endif
+#   endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup xxh32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is considered rather weak by today's standards.
+ *   The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
+ *   systems, and offers true 64/128 bit hash results. It provides a superior
+ *   level of dispersion, and greatly reduces the risks of collisions.
+ *
+ * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
+ * @see @ref xxh32_impl for implementation details
+ * @{
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit hash value.
+ *
+ * @see
+ *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/*!
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ *
+ * Example code for incrementally hashing a file:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <xxhash.h>
+ *    #define BUFFER_SIZE 256
+ *
+ *    // Note: XXH64 and XXH3 use the same interface.
+ *    XXH32_hash_t
+ *    hashFile(FILE* stream)
+ *    {
+ *        XXH32_state_t* state;
+ *        unsigned char buf[BUFFER_SIZE];
+ *        size_t amt;
+ *        XXH32_hash_t hash;
+ *
+ *        state = XXH32_createState();       // Create a state
+ *        assert(state != NULL);             // Error check here
+ *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
+ *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
+ *            XXH32_update(state, buf, amt); // Hash the file in chunks
+ *        }
+ *        hash = XXH32_digest(state);        // Finalize the hash
+ *        XXH32_freeState(state);            // Clean up
+ *        return hash;
+ *    }
+ * @endcode
+ */
+
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
+ */
+typedef struct XXH32_state_s XXH32_state_t;
+
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * Must be freed with XXH32_freeState().
+ * @return An allocated XXH32_state_t on success, `NULL` on failure.
+ */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * Must be allocated with XXH32_createState().
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated xxHash32 value from that state.
+ */
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+/*******   Canonical representation   *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+    unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint64_t XXH64_hash_t;
+#else
+#  include <limits.h>
+#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+     /* LP64 ABI says uint64_t is unsigned long */
+     typedef unsigned long XXH64_hash_t;
+#  else
+     /* the following type must have a width of 64-bit */
+     typedef unsigned long long XXH64_hash_t;
+#  endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup xxh64_family XXH64 family
+ * @ingroup public
+ * @{
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ *   and offers true 64/128 bit hash results. It provides a superior level of
+ *   dispersion, and greatly reduces the risks of collisions.
+ */
+
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit hash.
+ *
+ * @see
+ *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ *    Direct equivalents for the other variants of xxHash.
+ * @see
+ *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
+
+/*******   Streaming   *******/
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup xxh3_family XXH3 family
+ * @ingroup public
+ * @{
+ *
+ * XXH3 is a more recent hash algorithm featuring:
+ *  - Improved speed for both small and large inputs
+ *  - True 64-bit and 128-bit outputs
+ *  - SIMD acceleration
+ *  - Improved 32-bit viability
+ *
+ * Speed analysis methodology is explained here:
+ *
+ *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ *
+ * Compared to XXH64, expect XXH3 to run approximately
+ * ~2x faster on large inputs and >3x faster on small ones,
+ * exact differences vary depending on platform.
+ *
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Any 32-bit and 64-bit targets that can run XXH32 smoothly
+ * can run XXH3 at competitive speeds, even without vector support.
+ * Further details are explained in the implementation.
+ *
+ * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
+ * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generage exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+
+/*-**********************************************************************
+*  XXH3 64-bit variant
+************************************************************************/
+
+/* XXH3_64bits():
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant. */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+
+/*
+ * XXH3_64bits_withSeed():
+ * This variant generates a custom secret on the fly
+ * based on default secret altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
+ * Note: seed==0 produces the same results as XXH3_64bits().
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+/*!
+ * The bare minimum size for a custom secret.
+ *
+ * @see
+ *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+ *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+
+/*
+ * XXH3_64bits_withSecret():
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+ * However, the quality of produced hash values depends on secret's entropy.
+ * Technically, the secret must look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever unsure about the "randomness" of the blob of bytes,
+ * consider relabelling it as a "custom seed" instead,
+ * and employ "XXH3_generateSecret()" (see below)
+ * to generate a high entropy secret derived from the custom seed.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+
+/*******   Streaming   *******/
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ */
+
+/*!
+ * @brief The state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ */
+typedef struct XXH3_state_s XXH3_state_t;
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+/*
+ * XXH3_64bits_reset():
+ * Initialize with default parameters.
+ * digest will be equivalent to `XXH3_64bits()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/*
+ * XXH3_64bits_reset_withSeed():
+ * Generate a custom secret from `seed`, and store it into `statePtr`.
+ * digest will be equivalent to `XXH3_64bits_withSeed()`.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced, it _must outlive_ the hash streaming session.
+ * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);
+
+/* note : canonical representation of XXH3 is the same as XXH64
+ * since they both produce XXH64_hash_t values */
+
+
+/*-**********************************************************************
+*  XXH3 128-bit variant
+************************************************************************/
+
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
+typedef struct {
+    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+    XXH64_hash_t high64;  /*!< `value >> 64` */
+} XXH128_hash_t;
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/*******   Streaming   *******/
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have same meaning as their 64-bit counterpart.
+ */
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
+
+/* Following helper functions make it possible to compare XXH128_hast_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * XXH128_isEqual():
+ * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
+ */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * XXH128_cmp():
+ *
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * return: >0 if *h128_1  > *h128_2
+ *         =0 if *h128_1 == *h128_2
+ *         <0 if *h128_1  < *h128_2
+ */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif  /* XXH_NO_LONG_LONG */
+
+/*!
+ * @}
+ */
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ ***************************************************************************** */
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
+ * Never **ever** access their members directly.
+ */
+
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
+struct XXH32_state_s {
+   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
+   XXH32_hash_t v1;           /*!< First accumulator lane */
+   XXH32_hash_t v2;           /*!< Second accumulator lane */
+   XXH32_hash_t v3;           /*!< Third accumulator lane */
+   XXH32_hash_t v4;           /*!< Fourth accumulator lane */
+   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
+   XXH32_hash_t reserved;     /*!< Reserved field. Do not read or write to it, it may be removed. */
+};   /* typedef'd to XXH32_state_t */
+
+
+#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
+struct XXH64_state_s {
+   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
+   XXH64_hash_t v1;           /*!< First accumulator lane */
+   XXH64_hash_t v2;           /*!< Second accumulator lane */
+   XXH64_hash_t v3;           /*!< Third accumulator lane */
+   XXH64_hash_t v4;           /*!< Fourth accumulator lane */
+   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
+   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
+   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it, it may be removed. */
+};   /* typedef'd to XXH64_state_t */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64b_update(), XXH3_128b_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * @note **This structure has a strict alignment requirement of 64 bytes.** Do
+ * not allocate this with `malloc()` or `new`, it will not be sufficiently
+ * aligned. Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack
+ * allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Do not access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+       /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+       /*!< Used to store a custom secret generated from a seed. */
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+       /*!< The internal buffer. @see XXH32_state_s::mem32 */
+   XXH32_hash_t bufferedSize;
+       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+   XXH32_hash_t reserved32;
+       /*!< Reserved field. Needed for padding on 64-bit. */
+   size_t nbStripesSoFar;
+       /*!< Number or stripes processed. */
+   XXH64_hash_t totalLen;
+       /*!< Total length hashed. 64-bit even on 32-bit targets. */
+   size_t nbStripesPerBlock;
+       /*!< Number of stripes per block. */
+   size_t secretLimit;
+       /*!< Size of @ref customSecret or @ref extSecret */
+   XXH64_hash_t seed;
+       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+   XXH64_hash_t reserved64;
+       /*!< Reserved field. */
+   const unsigned char* extSecret;
+       /*!< Reference to an external secret for the _withSecret variants, NULL
+        *   for other variants. */
+   /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When the @ref XXH3_state_t structure is merely emplaced on stack,
+ * it should be initialized with XXH3_INITSTATE() or a memset()
+ * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode.
+ * This operation isn't necessary when the state is created with XXH3_createState().
+ * Note that this doesn't prepare the state for a streaming operation,
+ * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+ */
+#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
+
+
+/* ===   Experimental API   === */
+/* Symbols defined below must be considered tied to a specific library version. */
+
+/*
+ * XXH3_generateSecret():
+ *
+ * Derive a high-entropy secret from any user-defined content, named customSeed.
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed,
+ * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
+ * into an already allocated buffer secretBuffer.
+ * The generated secret is _always_ XXH_SECRET_DEFAULT_SIZE bytes long.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
+ * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
+ * are part of this list. They all accept a `secret` parameter
+ * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so
+ * this function can be used to generate a secret of proper quality.
+ *
+ * customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even stupidly "low entropy" source such as a bunch of zeroes.
+ * The resulting `secret` will nonetheless provide all expected qualities.
+ *
+ * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
+ * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ */
+XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
+
+
+/* simple short-cut to pre-selected XXH3_128bits variant */
+XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
+
+
+#endif  /* XXH_NO_LONG_LONG */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be hosted inside xxhash.c.
+ *
+ * However, inlining requires implementation to be visible to the compiler,
+ * hence be included alongside the header.
+ * Previously, implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in /include directory.
+ *
+ * xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
+ *
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+
+/*!
+ * @defgroup tuning Tuning parameters
+ * @{
+ *
+ * Various macros to control xxHash's behavior.
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
+ */
+#  define XXH_NO_LONG_LONG
+#  undef XXH_NO_LONG_LONG /* don't actually */
+/*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The below switch allow selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ *   @par
+ *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
+ *     eliminate the function call and treat it as an unaligned access.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
+ *   @par
+ *     Depends on compiler extensions and is therefore not portable.
+ *     This method is safe _if_ your compiler supports it,
+ *     and *generally* as fast or faster than `memcpy`.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ *  @par
+ *     Casts directly and dereferences. This method doesn't depend on the
+ *     compiler, but it violates the C standard as it directly dereferences an
+ *     unaligned pointer. It can generate buggy code on targets which do not
+ *     support unaligned memory accesses, but in some circumstances, it's the
+ *     only known way to get the most performance.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ *  @par
+ *     Also portable. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction. However, some compilers
+ *     will emit literal byteshifts even if the target supports unaligned access.
+ *  .
+ *
+ * @warning
+ *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ *   care, as what works on one compiler/platform/optimization level may cause
+ *   another to read garbage data or even crash.
+ *
+ * See https://stackoverflow.com/a/32095106/646947 for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2)
+ */
+#  define XXH_FORCE_MEMORY_ACCESS 0
+/*!
+ * @def XXH_ACCEPT_NULL_INPUT_POINTER
+ * @brief Whether to add explicit `NULL` checks.
+ *
+ * If the input pointer is `NULL` and the length is non-zero, xxHash's default
+ * behavior is to dereference it, triggering a segfault.
+ *
+ * When this macro is enabled, xxHash actively checks the input for a null pointer.
+ * If it is, the result for null input pointers is the same as a zero-length input.
+ */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+/*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+ * and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally negligible,
+ * but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ *
+ * In these cases, the alignment check can be removed by setting this macro to 0.
+ * Then the code will always use unaligned memory access.
+ * Align check is automatically disabled on x86, x64 & arm64,
+ * which are platforms known to offer good unaligned memory accesses performance.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
+ */
+#  define XXH_FORCE_ALIGN_CHECK 0
+
+/*!
+ * @def XXH_NO_INLINE_HINTS
+ * @brief When non-zero, sets all functions to `static`.
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
+ * -fno-inline with GCC or Clang, this will automatically be defined.
+ */
+#  define XXH_NO_INLINE_HINTS 0
+
+/*!
+ * @def XXH_REROLL
+ * @brief Whether to reroll `XXH32_finalize` and `XXH64_finalize`.
+ *
+ * For performance, `XXH32_finalize` and `XXH64_finalize` use an unrolled loop
+ * in the form of a switch statement.
+ *
+ * This is not always desirable, as it generates larger code, and depending on
+ * the architecture, may even be slower
+ *
+ * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
+ */
+#  define XXH_REROLL 0
+
+/*!
+ * @internal
+ * @brief Redefines old internal names.
+ *
+ * For compatibility with code that uses xxHash's internals before the names
+ * were changed to improve namespacing. There is no other reason to use this.
+ */
+#  define XXH_OLD_NAMES
+#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+#endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+   /* prefer __packed__ structures (method 1) for gcc on armv7 and armv8 */
+#  if !defined(__clang__) && ( \
+    (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+    (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)) )
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+#  if defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) \
+   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64) /* visual */
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS
+#  if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
+   || defined(__NO_INLINE__)     /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+#ifndef XXH_REROLL
+#  if defined(__OPTIMIZE_SIZE__)
+#    define XXH_REROLL 1
+#  else
+#    define XXH_REROLL 0
+#  endif
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
+ */
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
+#  if defined(__GNUC__)
+#    define XXH_FORCE_INLINE static __attribute__((unused))
+#  else
+#    define XXH_FORCE_INLINE static
+#  endif
+#  define XXH_NO_INLINE static
+/* enable inlining hints */
+#elif defined(_MSC_VER)  /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#elif defined(__GNUC__)
+#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+#  define XXH_NO_INLINE static __attribute__((noinline))
+#elif defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
+#  define XXH_FORCE_INLINE static inline
+#  define XXH_NO_INLINE static
+#else
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
+ * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+ * compiler's command line options. The value must be a number.
+ */
+#ifndef XXH_DEBUGLEVEL
+#  ifdef DEBUGLEVEL /* backwards compat */
+#    define XXH_DEBUGLEVEL DEBUGLEVEL
+#  else
+#    define XXH_DEBUGLEVEL 0
+#  endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  define XXH_ASSERT(c)   ((void)0)
+#endif
+
+/* note: use after variable declarations */
+#define XXH_STATIC_ASSERT(c)  do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (eg eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#ifdef __GNUC__
+#  define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
+#else
+#  define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef uint8_t xxh_u8;
+#else
+  typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+#ifdef XXH_OLD_NAMES
+#  define BYTE xxh_u8
+#  define U8   xxh_u8
+#  define U32  xxh_u32
+#endif
+
+/* ***   Memory access   *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPU which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+#endif
+static xxh_u32 XXH_read32(const void* ptr)
+{
+    typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
+    return ((const xxh_unalign*)ptr)->u32;
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+/*!
+ * @ingroup tuning
+ * @def XXH_CPU_LITTLE_ENDIAN
+ * @brief Whether the target is little endian.
+ *
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line.
+ *
+ * If it is not defined, a runtime check (which is usually constant folded)
+ * is used instead.
+ *
+ * @note
+ *   This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+/*
+ * Try to detect endianness automatically, to avoid the nonstandard behavior
+ * in `XXH_isLittleEndian()`
+ */
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 0
+#  else
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
+ */
+static int XXH_isLittleEndian(void)
+{
+    /*
+     * Portable and well-defined behavior.
+     * Don't use static: it is detrimental to performance.
+     */
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifdef __has_builtin
+#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define XXH_HAS_BUILTIN(x) 0
+#endif
+
+/*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left.
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ *   @p r > 0 && @p r < 32
+ * @note
+ *   @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */
+#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
+                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
+/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+ * @brief A 32-bit byteswap.
+ *
+ * @param x The 32-bit integer to byteswap.
+ * @return @p x, byteswapped.
+ */
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXH_swap32 (xxh_u32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+
+/*!
+ * @internal
+ * @brief Enum to indicate whether a pointer is aligned.
+ */
+typedef enum {
+    XXH_aligned,  /*!< Aligned */
+    XXH_unaligned /*!< Possibly unaligned */
+} XXH_alignment;
+
+/*
+ * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
+ *
+ * This is ideal for older compilers which don't inline memcpy.
+ */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u32)bytePtr[1] << 8)
+         | ((xxh_u32)bytePtr[2] << 16)
+         | ((xxh_u32)bytePtr[3] << 24);
+}
+
+XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[3]
+         | ((xxh_u32)bytePtr[2] << 8)
+         | ((xxh_u32)bytePtr[1] << 16)
+         | ((xxh_u32)bytePtr[0] << 24);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+}
+
+static xxh_u32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);
+    } else {
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+/*! @ingroup public */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @defgroup xxh32_impl XXH32 implementation
+ * @ingroup impl
+ * @{
+ */
+ /* #define instead of static const, to be used as initializers */
+#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
+#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
+#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
+#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
+#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME32_1 XXH_PRIME32_1
+#  define PRIME32_2 XXH_PRIME32_2
+#  define PRIME32_3 XXH_PRIME32_3
+#  define PRIME32_4 XXH_PRIME32_4
+#  define PRIME32_5 XXH_PRIME32_5
+#endif
+
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * XXH_PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= XXH_PRIME32_1;
+#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * A compiler fence is the only thing that prevents GCC and Clang from
+     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movqda tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // x >>= 19
+     *      por    v,  tmp // x |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
+     * and it is pointless writing a NEON implementation that is basically the
+     * same speed as scalar for XXH32.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param h32 The hash to avalanche.
+ * @return The avalanched hash.
+ */
+static xxh_u32 XXH32_avalanche(xxh_u32 h32)
+{
+    h32 ^= h32 >> 15;
+    h32 *= XXH_PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= XXH_PRIME32_3;
+    h32 ^= h32 >> 16;
+    return(h32);
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)
+
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param h32 The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ */
+static xxh_u32
+XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+#define XXH_PROCESS1 do {                           \
+    h32 += (*ptr++) * XXH_PRIME32_5;                \
+    h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1;      \
+} while (0)
+
+#define XXH_PROCESS4 do {                           \
+    h32 += XXH_get32bits(ptr) * XXH_PRIME32_3;      \
+    ptr += 4;                                   \
+    h32  = XXH_rotl32(h32, 17) * XXH_PRIME32_4;     \
+} while (0)
+
+    /* Compact rerolled version */
+    if (XXH_REROLL) {
+        len &= 15;
+        while (len >= 4) {
+            XXH_PROCESS4;
+            len -= 4;
+        }
+        while (len > 0) {
+            XXH_PROCESS1;
+            --len;
+        }
+        return XXH32_avalanche(h32);
+    } else {
+         switch(len&15) /* or switch(bEnd - p) */ {
+           case 12:      XXH_PROCESS4;
+                         FALLTHROUGH_INTENDED;
+           case 8:       XXH_PROCESS4;
+                         FALLTHROUGH_INTENDED;
+           case 4:       XXH_PROCESS4;
+                         return XXH32_avalanche(h32);
+
+           case 13:      XXH_PROCESS4;
+                         FALLTHROUGH_INTENDED;
+           case 9:       XXH_PROCESS4;
+                         FALLTHROUGH_INTENDED;
+           case 5:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(h32);
+
+           case 14:      XXH_PROCESS4;
+                         FALLTHROUGH_INTENDED;
+           case 10:      XXH_PROCESS4;
+                         FALLTHROUGH_INTENDED;
+           case 6:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(h32);
+
+           case 15:      XXH_PROCESS4;
+                         FALLTHROUGH_INTENDED;
+           case 11:      XXH_PROCESS4;
+                         FALLTHROUGH_INTENDED;
+           case 7:       XXH_PROCESS4;
+                         FALLTHROUGH_INTENDED;
+           case 3:       XXH_PROCESS1;
+                         FALLTHROUGH_INTENDED;
+           case 2:       XXH_PROCESS1;
+                         FALLTHROUGH_INTENDED;
+           case 1:       XXH_PROCESS1;
+                         FALLTHROUGH_INTENDED;
+           case 0:       return XXH32_avalanche(h32);
+        }
+        XXH_ASSERT(0);
+        return h32;   /* reaching this point is deemed impossible */
+    }
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1 XXH_PROCESS1
+#  define PROCESS4 XXH_PROCESS4
+#else
+#  undef XXH_PROCESS1
+#  undef XXH_PROCESS4
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32().
+ *
+ * @param input, len, seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE xxh_u32
+XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+{
+    const xxh_u8* bEnd = input ? input + len : NULL;
+    xxh_u32 h32;
+
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+    if (input==NULL) {
+        len=0;
+        bEnd=input=(const xxh_u8*)(size_t)16;
+    }
+#endif
+
+    if (len>=16) {
+        const xxh_u8* const limit = bEnd - 15;
+        xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+        xxh_u32 v2 = seed + XXH_PRIME32_2;
+        xxh_u32 v3 = seed + 0;
+        xxh_u32 v4 = seed - XXH_PRIME32_1;
+
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
+            v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
+            v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
+            v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
+        } while (input < limit);
+
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
+        h32  = seed + XXH_PRIME32_5;
+    }
+
+    h32 += (xxh_u32)len;
+
+    return XXH32_finalize(h32, input, len&15, align);
+}
+
+/*! @ingroup xxh32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, (const xxh_u8*)input, len);
+    return XXH32_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+#endif
+}
+
+
+
+/*******   Hash streaming   *******/
+/*!
+ * @ingroup xxh32_family
+ */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+/*! @ingroup xxh32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup xxh32_family */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*! @ingroup xxh32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+{
+    XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state));
+    state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+    state.v2 = seed + XXH_PRIME32_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - XXH_PRIME32_1;
+    /* do not write into reserved, planned to be removed in a future version */
+    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
+    return XXH_OK;
+}
+
+
+/*! @ingroup xxh32_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
+#endif
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len_32 += (XXH32_hash_t)len;
+        state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
+
+        if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
+            state->memsize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* some data left from previous update */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
+            {   const xxh_u32* p32 = state->mem32;
+                state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
+                state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
+                state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
+                state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
+            }
+            p += 16-state->memsize;
+            state->memsize = 0;
+        }
+
+        /* uintptr_t casts avoid UB or compiler warning on out-of-bounds
+         * pointer arithmetic */
+        if ((uintptr_t)p <= (uintptr_t)bEnd - 16) {
+            const uintptr_t limit = (uintptr_t)bEnd - 16;
+            xxh_u32 v1 = state->v1;
+            xxh_u32 v2 = state->v2;
+            xxh_u32 v3 = state->v3;
+            xxh_u32 v4 = state->v4;
+
+            do {
+                v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
+                v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
+                v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
+                v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
+            } while ((uintptr_t)p<=limit);
+
+            state->v1 = v1;
+            state->v2 = v2;
+            state->v3 = v3;
+            state->v4 = v4;
+        }
+
+        if (p < bEnd) {
+            XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup xxh32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+{
+    xxh_u32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v1, 1)
+            + XXH_rotl32(state->v2, 7)
+            + XXH_rotl32(state->v3, 12)
+            + XXH_rotl32(state->v4, 18);
+    } else {
+        h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
+}
+
+
+/*******   Canonical representation   *******/
+
+/*!
+ * @ingroup xxh32_family
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ *
+ * The canonical representation uses big endian convention, the same convention
+ * as human-readable numbers (large digits first).
+ *
+ * This way, hash values can be written into a file or buffer, remaining
+ * comparable across different systems.
+ *
+ * The following functions allow transformation of hash values to and from their
+ * canonical format.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+/*! @ingroup xxh32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
+/*******   Memory access   *******/
+
+typedef XXH64_hash_t xxh_u64;
+
+#ifdef XXH_OLD_NAMES
+#  define U64 xxh_u64
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE64 and XXH_readBE64.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    return *(const xxh_u64*) memPtr;
+}
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __pack instructions are safer, but compiler specific, hence potentially
+ * problematic for some compilers.
+ *
+ * Currently only defined for GCC and ICC.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+#endif
+static xxh_u64 XXH_read64(const void* ptr)
+{
+    typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
+    return ((const xxh_unalign64*)ptr)->u64;
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u64)bytePtr[1] << 8)
+         | ((xxh_u64)bytePtr[2] << 16)
+         | ((xxh_u64)bytePtr[3] << 24)
+         | ((xxh_u64)bytePtr[4] << 32)
+         | ((xxh_u64)bytePtr[5] << 40)
+         | ((xxh_u64)bytePtr[6] << 48)
+         | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[7]
+         | ((xxh_u64)bytePtr[6] << 8)
+         | ((xxh_u64)bytePtr[5] << 16)
+         | ((xxh_u64)bytePtr[4] << 24)
+         | ((xxh_u64)bytePtr[3] << 32)
+         | ((xxh_u64)bytePtr[2] << 40)
+         | ((xxh_u64)bytePtr[1] << 48)
+         | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/*******   xxh64   *******/
+/*!
+ * @}
+ * @defgroup xxh64_impl XXH64 implementation
+ * @ingroup impl
+ * @{
+ */
+/* #define rather that static const, to be used as initializers */
+#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME64_1 XXH_PRIME64_1
+#  define PRIME64_2 XXH_PRIME64_2
+#  define PRIME64_3 XXH_PRIME64_3
+#  define PRIME64_4 XXH_PRIME64_4
+#  define PRIME64_5 XXH_PRIME64_5
+#endif
+
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+{
+    acc += input * XXH_PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= XXH_PRIME64_1;
+    return acc;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
+    return acc;
+}
+
+static xxh_u64 XXH64_avalanche(xxh_u64 h64)
+{
+    h64 ^= h64 >> 33;
+    h64 *= XXH_PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= XXH_PRIME64_3;
+    h64 ^= h64 >> 32;
+    return h64;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
+
+static xxh_u64
+XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+    len &= 31;
+    while (len >= 8) {
+        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+        ptr += 8;
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+        len -= 8;
+    }
+    if (len >= 4) {
+        h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        ptr += 4;
+        h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        len -= 4;
+    }
+    while (len > 0) {
+        h64 ^= (*ptr++) * XXH_PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
+        --len;
+    }
+    return  XXH64_avalanche(h64);
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1_64 XXH_PROCESS1_64
+#  define PROCESS4_64 XXH_PROCESS4_64
+#  define PROCESS8_64 XXH_PROCESS8_64
+#else
+#  undef XXH_PROCESS1_64
+#  undef XXH_PROCESS4_64
+#  undef XXH_PROCESS8_64
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+{
+    const xxh_u8* bEnd = input ? input + len : NULL;
+    xxh_u64 h64;
+
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+    if (input==NULL) {
+        len=0;
+        bEnd=input=(const xxh_u8*)(size_t)32;
+    }
+#endif
+
+    if (len>=32) {
+        const xxh_u8* const limit = bEnd - 32;
+        xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+        xxh_u64 v2 = seed + XXH_PRIME64_2;
+        xxh_u64 v3 = seed + 0;
+        xxh_u64 v4 = seed - XXH_PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
+        } while (input<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) len;
+
+    return XXH64_finalize(h64, input, len, align);
+}
+
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, (const xxh_u8*)input, len);
+    return XXH64_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+
+#endif
+}
+
+/*******   Hash Streaming   *******/
+
+/*! @ingroup xxh64_family*/
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+{
+    XXH64_state_t state;   /* use a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state));
+    state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+    state.v2 = seed + XXH_PRIME64_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - XXH_PRIME64_1;
+     /* do not write into reserved64, might be removed in a future version */
+    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
+    return XXH_OK;
+}
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH64_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
+#endif
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len += len;
+
+        if (state->memsize + len < 32) {  /* fill in tmp buffer */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+            state->memsize += (xxh_u32)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* tmp buffer is full */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
+            state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
+            state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
+            state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
+            p += 32 - state->memsize;
+            state->memsize = 0;
+        }
+
+        /* uintptr_t casts avoid UB or compiler warning on out-of-bounds
+         * pointer arithmetic */
+        if ((uintptr_t)p + 32 <= (uintptr_t)bEnd) {
+            const uintptr_t limit = (uintptr_t)bEnd - 32;
+            xxh_u64 v1 = state->v1;
+            xxh_u64 v2 = state->v2;
+            xxh_u64 v3 = state->v3;
+            xxh_u64 v4 = state->v4;
+
+            do {
+                v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
+                v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
+                v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
+                v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
+            } while ((uintptr_t)p<=limit);
+
+            state->v1 = v1;
+            state->v2 = v2;
+            state->v3 = v3;
+            state->v4 = v4;
+        }
+
+        if (p < bEnd) {
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
+{
+    xxh_u64 h64;
+
+    if (state->total_len >= 32) {
+        xxh_u64 const v1 = state->v1;
+        xxh_u64 const v2 = state->v2;
+        xxh_u64 const v3 = state->v3;
+        xxh_u64 const v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+    } else {
+        h64  = state->v3 /*seed*/ + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) state->total_len;
+
+    return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
+}
+
+
+/******* Canonical representation   *******/
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+/*! @ingroup xxh64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
+
+#ifndef XXH_NO_XXH3
+
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+/*!
+ * @}
+ * @defgroup xxh3_impl XXH3 implementation
+ * @ingroup impl
+ * @{
+ */
+
+/* ===   Compiler specifics   === */
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define XXH_RESTRICT /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXH_RESTRICT   restrict
+#else
+/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
+#  define XXH_RESTRICT   /* disable */
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define XXH_likely(x) __builtin_expect(x, 1)
+#    define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+#    define XXH_likely(x) (x)
+#    define XXH_unlikely(x) (x)
+#endif
+
+#if defined(__GNUC__)
+#  if defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#    define inline __inline__  /* circumvent a clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  endif
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *   xxh_u64 x;
+ *   x ^= (x >> 47); // good
+ *   x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *   x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *   // note: funnel shifts are not usually cheap.
+ *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *   x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32. This means:
+ *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
+ *    32 bits in the shift.
+ *  - The shift result will always fit in the lower 32 bits, and therefore,
+ *    we can ignore the upper 32 bits in the xor.
+ *
+ * Thanks to this optimization, XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *  - For the 128-bit variant, a decent byteswap helps short inputs.
+ *
+ * The first two are already required by XXH32, and almost all 32-bit and 64-bit
+ * platforms which can run XXH32 can run XXH3 efficiently.
+ *
+ * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+ * notable exception.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand allowing free
+ * shifts is helpful, too.
+ *
+ * Therefore, we do a quick sanity check.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we will
+ * emit a warning, as it is not a "sane" platform to compile for.
+ *
+ * Usually, if this happens, it is because of an accident and you probably need
+ * to specify -march, as you likely meant to compile for a newer architecture.
+ *
+ * Credit: large sections of the vectorial and asm source code paths
+ *         have been contributed by @easyaspi314
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
+#endif
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @ingroup tuning
+ * @brief Overrides the vectorization implementation chosen for XXH3.
+ *
+ * Can be defined to 0 to disable SIMD or any of the values mentioned in
+ * @ref XXH_VECTOR_TYPE.
+ *
+ * If this is not defined, it uses predefined macros to determine the best
+ * implementation.
+ */
+#  define XXH_VECTOR XXH_SCALAR
+/*!
+ * @ingroup tuning
+ * @brief Possible values for @ref XXH_VECTOR.
+ *
+ * Note that these are actually implemented as macros.
+ *
+ * If this is not defined, it is detected automatically.
+ * @ref XXH_X86DISPATCH overrides this.
+ */
+enum XXH_VECTOR_TYPE /* fake enum */ {
+    XXH_SCALAR = 0,  /*!< Portable scalar version */
+    XXH_SSE2   = 1,  /*!<
+                      * SSE2 for Pentium 4, Opteron, all x86_64.
+                      *
+                      * @note SSE2 is also guaranteed on Windows 10, macOS, and
+                      * Android x86.
+                      */
+    XXH_AVX2   = 2,  /*!< AVX2 for Haswell and Bulldozer */
+    XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
+    XXH_NEON   = 4,  /*!< NEON for most ARMv7-A and all AArch64 */
+    XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+};
+/*!
+ * @ingroup tuning
+ * @brief Selects the minimum alignment for XXH3's accumulators.
+ *
+ * When using SIMD, this should match the alignment reqired for said vector
+ * type, so, for example, 32 for AVX2.
+ *
+ * Default: Auto detected.
+ */
+#  define XXH_ACC_ALIGN 8
+#endif
+
+/* Actual definition */
+#ifndef XXH_DOXYGEN
+#  define XXH_SCALAR 0
+#  define XXH_SSE2   1
+#  define XXH_AVX2   2
+#  define XXH_AVX512 3
+#  define XXH_NEON   4
+#  define XXH_VSX    5
+#endif
+
+#ifndef XXH_VECTOR    /* can be defined on command line */
+#  if defined(__AVX512F__)
+#    define XXH_VECTOR XXH_AVX512
+#  elif defined(__AVX2__)
+#    define XXH_VECTOR XXH_AVX2
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXH_VECTOR XXH_SSE2
+#  elif defined(__GNUC__) /* msvc support maybe later */ \
+  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
+  && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#    define XXH_VECTOR XXH_NEON
+#  elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
+     || (defined(__s390x__) && defined(__VEC__)) \
+     && defined(__GNUC__) /* TODO: IBM XL */
+#    define XXH_VECTOR XXH_VSX
+#  else
+#    define XXH_VECTOR XXH_SCALAR
+#  endif
+#endif
+
+/*
+ * Controls the alignment of the accumulator,
+ * for compatibility with aligned vector loads, which are usually faster.
+ */
+#ifndef XXH_ACC_ALIGN
+#  if defined(XXH_X86DISPATCH)
+#     define XXH_ACC_ALIGN 64  /* for compatibility with avx512 */
+#  elif XXH_VECTOR == XXH_SCALAR  /* scalar */
+#     define XXH_ACC_ALIGN 8
+#  elif XXH_VECTOR == XXH_SSE2  /* sse2 */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX2  /* avx2 */
+#     define XXH_ACC_ALIGN 32
+#  elif XXH_VECTOR == XXH_NEON  /* neon */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_VSX   /* vsx */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
+#     define XXH_ACC_ALIGN 64
+#  endif
+#endif
+
+#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
+    || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#else
+#  define XXH_SEC_ALIGN 8
+#endif
+
+/*
+ * UGLY HACK:
+ * GCC usually generates the best code with -O3 for xxHash.
+ *
+ * However, when targeting AVX2, it is overzealous in its unrolling resulting
+ * in code roughly 3/4 the speed of Clang.
+ *
+ * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+ * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
+ * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
+ *
+ * That is why when compiling the AVX2 version, it is recommended to use either
+ *   -O2 -mavx2 -march=haswell
+ * or
+ *   -O2 -mavx2 -mno-avx256-split-unaligned-load
+ * for decent performance, or to use Clang instead.
+ *
+ * Fortunately, we can control the first one with a pragma that forces GCC into
+ * -O2, but the other one we can't control without "failed to inline always
+ * inline function due to target mismatch" warnings.
+ */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
+#  pragma GCC push_options
+#  pragma GCC optimize("-O2")
+#endif
+
+
+#if XXH_VECTOR == XXH_NEON
+/*
+ * NEON's setup for vmlal_u32 is a little more complicated than it is on
+ * SSE2, AVX2, and VSX.
+ *
+ * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
+ *
+ * To do the same operation, the 128-bit 'Q' register needs to be split into
+ * two 64-bit 'D' registers, performing this operation::
+ *
+ *   [                a                 |                 b                ]
+ *            |              '---------. .--------'                |
+ *            |                         x                          |
+ *            |              .---------' '--------.                |
+ *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
+ *
+ * Due to significant changes in aarch64, the fastest method for aarch64 is
+ * completely different than the fastest method for ARMv7-A.
+ *
+ * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
+ * D11 will modify the high half of Q5. This is similar to how modifying AH
+ * will only affect bits 8-15 of AX on x86.
+ *
+ * VZIP takes two registers, and puts even lanes in one register and odd lanes
+ * in the other.
+ *
+ * On ARMv7-A, this strangely modifies both parameters in place instead of
+ * taking the usual 3-operand form.
+ *
+ * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
+ * lower and upper halves of the Q register to end up with the high and low
+ * halves where we want - all in one instruction.
+ *
+ *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
+ *
+ * Unfortunately we need inline assembly for this: Instructions modifying two
+ * registers at once is not possible in GCC or Clang's IR, and they have to
+ * create a copy.
+ *
+ * aarch64 requires a different approach.
+ *
+ * In order to make it easier to write a decent compiler for aarch64, many
+ * quirks were removed, such as conditional execution.
+ *
+ * NEON was also affected by this.
+ *
+ * aarch64 cannot access the high bits of a Q-form register, and writes to a
+ * D-form register zero the high bits, similar to how writes to W-form scalar
+ * registers (or DWORD registers on x86_64) work.
+ *
+ * The formerly free vget_high intrinsics now require a vext (with a few
+ * exceptions)
+ *
+ * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
+ * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
+ * operand.
+ *
+ * The equivalent of the VZIP.32 on the lower and upper halves would be this
+ * mess:
+ *
+ *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
+ *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
+ *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
+ *
+ * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
+ *
+ *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
+ *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
+ *
+ * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
+ */
+
+/*!
+ * Function-like macro:
+ * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
+ * {
+ *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
+ *     outHi = (uint32x2_t)(in >> 32);
+ *     in = UNDEFINED;
+ * }
+ */
+# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
+   && defined(__GNUC__) \
+   && !defined(__aarch64__) && !defined(__arm64__)
+#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                              \
+    do {                                                                                    \
+      /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
+      /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */     \
+      /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
+      __asm__("vzip.32  %e0, %f0" : "+w" (in));                                             \
+      (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in));                                   \
+      (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                   \
+   } while (0)
+# else
+#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                            \
+    do {                                                                                  \
+      (outLo) = vmovn_u64    (in);                                                        \
+      (outHi) = vshrn_n_u64  ((in), 32);                                                  \
+    } while (0)
+# endif
+#endif  /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+#  if defined(__s390x__)
+#    include <s390intrin.h>
+#  else
+/* gcc's altivec.h can have the unwanted consequence to unconditionally
+ * #define bool, vector, and pixel keywords,
+ * with bad consequences for programs already using these keywords for other purposes.
+ * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.
+ * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,
+ * but it seems that, in some cases, it isn't.
+ * Force the build macro to be defined, so that keywords are not altered.
+ */
+#    if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
+#      define __APPLE_ALTIVEC__
+#    endif
+#    include <altivec.h>
+#  endif
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+# ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+#    define XXH_vec_revb vec_revb
+#  else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meaning swap depending on version.
+ * */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+
+/* prefetch
+ * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+#if defined(XXH_NO_PREFETCH)
+#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+#  error "default keyset is not large enough"
+#endif
+
+/*! Pseudorandom secret taken directly from FARSH. */
+XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+
+#ifdef XXH_OLD_NAMES
+#  define kSecret XXH3_kSecret
+#endif
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+{
+   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+}
+#elif defined(_MSC_VER) && defined(_M_IX86)
+#    include <intrin.h>
+#    define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+#    define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
+ * version.
+ *
+ * @param lhs, rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t.
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite being a 32-bit platform, Clang (and emscripten) define this type
+     * despite not having the arithmetic for it. This results in a laggy
+     * compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if defined(__GNUC__) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t r128;
+    r128.low64  = (xxh_u64)(product);
+    r128.high64 = (xxh_u64)(product >> 64);
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif defined(_M_X64) || defined(_M_IA64)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t r128;
+    r128.low64  = product_low;
+    r128.high64 = product_high;
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown below
+     * with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+     *     ---------
+     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+     *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
+     *     ---------
+     *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+     *     This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+     *     instruction available in ARM's Digital Signal Processing extension
+     *     in 32-bit ARMv6 and later, which is shown below:
+     *
+     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+     *         {
+     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+     *             *RdHi = (xxh_u32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication, and
+     *     allows this to be calculated in only 4 instructions at speeds
+     *     comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be a couple
+     *     of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+    XXH128_hash_t r128;
+    r128.low64  = lower;
+    r128.high64 = upper;
+    return r128;
+#endif
+}
+
+/*!
+ * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
+ *
+ * The reason for the separate function is to prevent passing too many structs
+ * around by value. This will hopefully inline the multiply, but we don't force it.
+ *
+ * @param lhs, rhs The 64-bit integers to multiply
+ * @return The low 64 bits of the product XOR'd by the high 64 bits.
+ * @see XXH_mult64to128()
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+    return product.low64 ^ product.high64;
+}
+
+/*! Seems to produce slightly better code on GCC for some reason. */
+XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+    XXH_ASSERT(0 <= shift && shift < 64);
+    return v64 ^ (v64 >> shift);
+}
+
+/*
+ * This is a fast avalanche stage,
+ * suitable when input bits are already partially mixed
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+    h64 = XXH_xorshift64(h64, 37);
+    h64 *= 0x165667919E3779F9ULL;
+    h64 = XXH_xorshift64(h64, 32);
+    return h64;
+}
+
+/*
+ * This is a stronger avalanche,
+ * inspired by Pelle Evensen's rrmxmx
+ * preferable when input has not been previously mixed
+ */
+static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
+{
+    /* this mix is inspired by Pelle Evensen's rrmxmx */
+    h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+    h64 *= 0x9FB21C651E98DF25ULL;
+    h64 ^= (h64 >> 35) + len ;
+    h64 *= 0x9FB21C651E98DF25ULL;
+    return XXH_xorshift64(h64, 28);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. It used an iterative algorithm which strongly
+ * favored lengths that were a multiple of 4 or 8.
+ *
+ * Instead of iterating over individual inputs, we use a set of single shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combined = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combined = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combined = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8  const c1 = input[0];
+        xxh_u8  const c2 = input[len >> 1];
+        xxh_u8  const c3 = input[len - 1];
+        xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2  << 24)
+                               | ((xxh_u32)c3 <<  0) | ((xxh_u32)len << 8);
+        xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
+        return XXH64_avalanche(keyed);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input1 = XXH_readLE32(input);
+        xxh_u32 const input2 = XXH_readLE32(input + len - 4);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
+        xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
+        xxh_u64 const keyed = input64 ^ bitflip;
+        return XXH3_rrmxmx(keyed, len);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+        xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+        xxh_u64 const input_lo = XXH_readLE64(input)           ^ bitflip1;
+        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
+        xxh_u64 const acc = len
+                          + XXH_swap64(input_lo) + input_hi
+                          + XXH3_mul128_fold64(input_lo, input_hi);
+        return XXH3_avalanche(acc);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (XXH_likely(len >  8)) return XXH3_len_9to16_64b(input, len, secret, seed);
+        if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
+        return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
+    }
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is taken an arbitrary number of times (addressed
+ * in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ * and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64 bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
+  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
+    /*
+     * UGLY HACK:
+     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
+     * slower code.
+     *
+     * By forcing seed64 into a register, we disrupt the cost model and
+     * cause it to scalarize. See `XXH32_round()`
+     *
+     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+     * GCC 9.2, despite both emitting scalar code.
+     *
+     * GCC generates much better scalar code than Clang for the rest of XXH3,
+     * which is why finding a more optimal codepath is an interest.
+     */
+    XXH_COMPILER_GUARD(seed64);
+#endif
+    {   xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64 const input_hi = XXH_readLE64(input+8);
+        return XXH3_mul128_fold64(
+            input_lo ^ (XXH_readLE64(secret)   + seed64),
+            input_hi ^ (XXH_readLE64(secret+8) - seed64)
+        );
+    }
+}
+
+/* For mid range keys, XXH3 uses a Mum-hash variant. */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc += XXH3_mix16B(input+48, secret+96, seed);
+                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                }
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc += XXH3_mix16B(input+len-48, secret+80, seed);
+            }
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc += XXH3_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc += XXH3_mix16B(input+len-16, secret+16, seed);
+
+        return XXH3_avalanche(acc);
+    }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET  17
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        int const nbRounds = (int)len / 16;
+        int i;
+        for (i=0; i<8; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        acc = XXH3_avalanche(acc);
+        XXH_ASSERT(nbRounds >= 8);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * In everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but they are usually relatively minor and/or not
+         * worth it to fix.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        /* last bytes */
+        acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        return XXH3_avalanche(acc);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+
+#ifdef XXH_OLD_NAMES
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+    memcpy(dst, &v64, sizeof(v64));
+}
+
+/* Several intrinsic functions below are supposed to accept __int64 as argument,
+ * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define __int64 type,
+ * requiring a workaround.
+ */
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+    typedef int64_t xxh_i64;
+#else
+    /* the following type must have a width of 64-bit */
+    typedef long long xxh_i64;
+#endif
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+#if (XXH_VECTOR == XXH_AVX512) \
+     || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
+
+#ifndef XXH_TARGET_AVX512
+# define XXH_TARGET_AVX512  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+
+    {
+        /* data_vec    = input[0]; */
+        __m512i const data_vec    = _mm512_loadu_si512   (input);
+        /* key_vec     = secret[0]; */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        /* data_key    = data_vec ^ key_vec; */
+        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+        /* data_key_lo = data_key >> 32; */
+        __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
+        /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
+        /* xacc[0] += swap(data_vec); */
+        __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
+        __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
+        /* xacc[0] += product; */
+        *xacc = _mm512_add_epi64(product, sum);
+    }
+}
+
+/*
+ * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
+ *
+ * Multiplication isn't perfect, as explained by Google in HighwayHash:
+ *
+ *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+ *  // varying degrees. In descending order of goodness, bytes
+ *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+ *  // As expected, the upper and lower bytes are much worse.
+ *
+ * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
+ *
+ * Since our algorithm uses a pseudorandom secret to add some variance into the
+ * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
+ *
+ * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
+ * extraction.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+    {   XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;
+        const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+
+        /* xacc[0] ^= (xacc[0] >> 47) */
+        __m512i const acc_vec     = *xacc;
+        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
+        __m512i const data_vec    = _mm512_xor_si512     (acc_vec, shifted);
+        /* xacc[0] ^= secret; */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+
+        /* xacc[0] *= XXH_PRIME32_1; */
+        __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
+        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
+        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
+        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+    XXH_ASSERT(((size_t)customSecret & 63) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+        __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
+
+        XXH_ALIGN(64) const __m512i* const src  = (const __m512i*) XXH3_kSecret;
+        XXH_ALIGN(64)       __m512i* const dest = (      __m512i*) customSecret;
+        int i;
+        for (i=0; i < nbRounds; ++i) {
+            /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',
+             * this will warn "discards 'const' qualifier". */
+            union {
+                XXH_ALIGN(64) const __m512i* cp;
+                XXH_ALIGN(64) void* p;
+            } remote_const_void;
+            remote_const_void.cp = src + i;
+            dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_AVX2) \
+    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
+
+#ifndef XXH_TARGET_AVX2
+# define XXH_TARGET_AVX2  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   XXH_ALIGN(32) __m256i* const xacc    =       (__m256i *) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason. */
+        const         __m256i* const xinput  = (const __m256i *) input;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm256_add_epi64(product, sum);
+    }   }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+    (void)(&XXH_writeLE64);
+    XXH_PREFETCH(customSecret);
+    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
+
+        XXH_ALIGN(64) const __m256i* const src  = (const __m256i*) XXH3_kSecret;
+        XXH_ALIGN(64)       __m256i*       dest = (      __m256i*) customSecret;
+
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
+         *   - do not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these reg into stack
+         */
+        XXH_COMPILER_GUARD(dest);
+#       endif
+
+        /* GCC -O2 need unroll loop manually */
+        dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
+        dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
+        dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
+        dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
+        dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
+        dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
+    }
+}
+
+#endif
+
+/* x86dispatch always generates SSE2 */
+#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+
+#ifndef XXH_TARGET_SSE2
+# define XXH_TARGET_SSE2  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    /* SSE2 is just a half-scale version of the AVX2 version. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   XXH_ALIGN(16) __m128i* const xacc    =       (__m128i *) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xinput  = (const __m128i *) input;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m128i const data_vec    = _mm_loadu_si128   (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+            __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm_add_epi64(product, sum);
+    }   }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+
+#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+        // MSVC 32bit mode does not support _mm_set_epi64x before 2015
+        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
+        __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
+#       else
+        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+#       endif
+        int i;
+
+        XXH_ALIGN(64)        const float* const src  = (float const*) XXH3_kSecret;
+        XXH_ALIGN(XXH_SEC_ALIGN) __m128i*       dest = (__m128i*) customSecret;
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
+         *   - do not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these reg into stack
+         */
+        XXH_COMPILER_GUARD(dest);
+#       endif
+
+        for (i=0; i < nbRounds; ++i) {
+            dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_NEON)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {
+        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* const xinput = (const uint8_t *) input;
+        uint8_t const* const xsecret  = (const uint8_t *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
+            /* data_vec = xinput[i]; */
+            uint8x16_t data_vec    = vld1q_u8(xinput  + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint8x16_t key_vec     = vld1q_u8(xsecret + (i * 16));
+            uint64x2_t data_key;
+            uint32x2_t data_key_lo, data_key_hi;
+            /* xacc[i] += swap(data_vec); */
+            uint64x2_t const data64  = vreinterpretq_u64_u8(data_vec);
+            uint64x2_t const swapped = vextq_u64(data64, data64, 1);
+            xacc[i] = vaddq_u64 (xacc[i], swapped);
+            /* data_key = data_vec ^ key_vec; */
+            data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
+            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
+             * data_key_hi = (uint32x2_t) (data_key >> 32);
+             * data_key = UNDEFINED; */
+            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
+            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
+            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
+
+        }
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   uint64x2_t* xacc       = (uint64x2_t*) acc;
+        uint8_t const* xsecret = (uint8_t const*) secret;
+        uint32x2_t prime       = vdup_n_u32 (XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            uint64x2_t acc_vec  = xacc[i];
+            uint64x2_t shifted  = vshrq_n_u64 (acc_vec, 47);
+            uint64x2_t data_vec = veorq_u64   (acc_vec, shifted);
+
+            /* xacc[i] ^= xsecret[i]; */
+            uint8x16_t key_vec  = vld1q_u8(xsecret + (i * 16));
+            uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
+
+            /* xacc[i] *= XXH_PRIME32_1 */
+            uint32x2_t data_key_lo, data_key_hi;
+            /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
+             * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
+             * xacc[i] = UNDEFINED; */
+            XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
+            {   /*
+                 * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
+                 *
+                 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
+                 * incorrectly "optimize" this:
+                 *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
+                 *   shifted = vshll_n_u32(tmp, 32);
+                 * to this:
+                 *   tmp     = "vmulq_u64"(a, b); // no such thing!
+                 *   shifted = vshlq_n_u64(tmp, 32);
+                 *
+                 * However, unlike SSE, Clang lacks a 64-bit multiply routine
+                 * for NEON, and it scalarizes two 64-bit multiplies instead.
+                 *
+                 * vmull_u32 has the same timing as vmul_u32, and it avoids
+                 * this bug completely.
+                 * See https://bugs.llvm.org/show_bug.cgi?id=39967
+                 */
+                uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
+                /* xacc[i] = prod_hi << 32; */
+                xacc[i] = vshlq_n_u64(prod_hi, 32);
+                /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
+                xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
+            }
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_VSX)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+          xxh_u64x2* const xacc     =       (xxh_u64x2*) acc;    /* presumed aligned */
+    xxh_u64x2 const* const xinput   = (xxh_u64x2 const*) input;   /* no alignment restriction */
+    xxh_u64x2 const* const xsecret  = (xxh_u64x2 const*) secret;    /* no alignment restriction */
+    xxh_u64x2 const v32 = { 32, 32 };
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+        /* data_vec = xinput[i]; */
+        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
+        /* key_vec = xsecret[i]; */
+        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
+        xxh_u64x2 const data_key = data_vec ^ key_vec;
+        /* shuffled = (data_key << 32) | (data_key >> 32); */
+        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
+        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
+        xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
+        xacc[i] += product;
+
+        /* swap high and low halves */
+#ifdef __s390x__
+        xacc[i] += vec_permi(data_vec, data_vec, 2);
+#else
+        xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
+#endif
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {         xxh_u64x2* const xacc    =       (xxh_u64x2*) acc;
+        const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
+        /* constants */
+        xxh_u64x2 const v32  = { 32, 32 };
+        xxh_u64x2 const v47 = { 47, 47 };
+        xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
+        size_t i;
+        for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            xxh_u64x2 const acc_vec  = xacc[i];
+            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+
+            /* xacc[i] ^= xsecret[i]; */
+            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + i);
+            xxh_u64x2 const data_key = data_vec ^ key_vec;
+
+            /* xacc[i] *= XXH_PRIME32_1 */
+            /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF);  */
+            xxh_u64x2 const prod_even  = XXH_vec_mule((xxh_u32x4)data_key, prime);
+            /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
+            xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
+            xacc[i] = prod_odd + (prod_even << v32);
+    }   }
+}
+
+#endif
+
+/* scalar variants - universal */
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
+    const xxh_u8* const xinput  = (const xxh_u8*) input;  /* no alignment restriction */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    size_t i;
+    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+    for (i=0; i < XXH_ACC_NB; i++) {
+        xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
+        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
+        xacc[i ^ 1] += data_val; /* swap adjacent lanes */
+        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    size_t i;
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    for (i=0; i < XXH_ACC_NB; i++) {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
+        xxh_u64 acc64 = xacc[i];
+        acc64 = XXH_xorshift64(acc64, 47);
+        acc64 ^= key64;
+        acc64 *= XXH_PRIME32_1;
+        xacc[i] = acc64;
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    /*
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+#if defined(__clang__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), long MOVK chains stall the
+     * integer pipelines:
+     *   I   L   S
+     * MOVK
+     * MOVK
+     * MOVK
+     * MOVK
+     * ADD
+     * SUB      STR
+     *          STR
+     * By forcing loads from memory (as the asm line causes Clang to assume
+     * that XXH3_kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *   I   L   S
+     *      LDR
+     *  ADD LDR
+     *  SUB     STR
+     *          STR
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    /*
+     * Note: in debug mode, this overrides the asm optimization
+     * and Clang will emit MOVK chains again.
+     */
+    XXH_ASSERT(kSecretPtr == XXH3_kSecret);
+
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+        int i;
+        for (i=0; i < nbRounds; i++) {
+            /*
+             * The asm hack causes Clang to assume that kSecretPtr aliases with
+             * customSecret, and on aarch64, this prevented LDP from merging two
+             * loads together for free. Putting the loads together before the stores
+             * properly generates LDP.
+             */
+            xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
+            xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
+    }   }
+}
+
+
+typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
+typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
+typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
+
+
+#if (XXH_VECTOR == XXH_AVX512)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+
+#elif (XXH_VECTOR == XXH_AVX2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_VSX)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#else /* scalar */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#endif
+
+
+
+#ifndef XXH_PREFETCH_DIST
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * XXH3_accumulate()
+ * Loops over XXH3_accumulate_512().
+ * Assumption: nbStripes will not overflow the secret size
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate(     xxh_u64* XXH_RESTRICT acc,
+                const xxh_u8* XXH_RESTRICT input,
+                const xxh_u8* XXH_RESTRICT secret,
+                      size_t nbStripes,
+                      XXH3_f_accumulate_512 f_acc512)
+{
+    size_t n;
+    for (n = 0; n < nbStripes; n++ ) {
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
+        f_acc512(acc,
+                 in,
+                 secret + n*XXH_SECRET_CONSUME_RATE);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
+                      const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate_512 f_acc512,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+    size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
+    size_t const nb_blocks = (len - 1) / block_len;
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
+        f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
+    }
+
+    /* last partial block */
+    XXH_ASSERT(len > XXH_STRIPE_LEN);
+    {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
+
+        /* last stripe */
+        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
+            f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    }   }
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    return XXH3_mul128_fold64(
+               acc[0] ^ XXH_readLE64(secret),
+               acc[1] ^ XXH_readLE64(secret+8) );
+}
+
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate_512 f_acc512,
+                           XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ * Since the function is not inlined, the compiler may not be able to understand that,
+ * in some scenarios, its `secret` argument is actually a compile time constant.
+ * This variant enforces that the compiler can detect that,
+ * and uses this opportunity to streamline the generated code for better performance.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+/*
+ * XXH3_hashLong_64b_withSeed():
+ * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+ * and then use this key for long mode hashing.
+ *
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                    XXH64_hash_t seed,
+                                    XXH3_f_accumulate_512 f_acc512,
+                                    XXH3_f_scrambleAcc f_scramble,
+                                    XXH3_f_initCustomSecret f_initSec)
+{
+    if (seed == 0)
+        return XXH3_hashLong_64b_internal(input, len,
+                                          XXH3_kSecret, sizeof(XXH3_kSecret),
+                                          f_acc512, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed);
+        return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+                                          f_acc512, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed(const void* input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                     XXH3_hashLong64_f f_hashLong)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secretLen` condition is not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     * Also, note that function signature doesn't offer room to return an error.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
+}
+
+
+/* ===   Public entry point   === */
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
+{
+    return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+}
+
+
+/* ===   XXH3 streaming   === */
+
+/*
+ * Malloc's a pointer that is always aligned to align.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
+ * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_malloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (state==NULL) return NULL;
+    XXH3_INITSTATE(state);
+    return state;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+{
+    memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                           XXH64_hash_t seed,
+                           const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->extSecret = (const unsigned char*)secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_64bits_reset(statePtr);
+    if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/* Note : when XXH3_consumeStripes() is invoked,
+ * there must be a guarantee that at least one more byte must be consumed from input
+ * so that the function can blindly consume all stripes using the "normal" secret segment */
+XXH_FORCE_INLINE void
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate_512 f_acc512,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ASSERT(nbStripes <= nbStripesPerBlock);  /* can handle max 1 scramble per invocation */
+    XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
+    if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
+        /* need a scrambling operation */
+        size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
+        size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
+        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
+        f_scramble(acc, secret + secretLimit);
+        XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
+        *nbStripesSoFarPtr = nbStripesAfterBlock;
+    } else {
+        XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
+        *nbStripesSoFarPtr += nbStripes;
+    }
+}
+
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* state,
+            const xxh_u8* input, size_t len,
+            XXH3_f_accumulate_512 f_acc512,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
+#endif
+
+    {   const xxh_u8* const bEnd = input + len;
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+
+        state->totalLen += len;
+        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {  /* fill in tmp buffer */
+            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+            state->bufferedSize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+
+        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+
+        /*
+         * Internal buffer is partially filled (always, except at beginning)
+         * Complete it, then consume it.
+         */
+        if (state->bufferedSize) {
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(state->acc,
+                               &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                secret, state->secretLimit,
+                                f_acc512, f_scramble);
+            state->bufferedSize = 0;
+        }
+        XXH_ASSERT(input < bEnd);
+
+        /* Consume input by a multiple of internal buffer size */
+        if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) {
+            const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+            do {
+                XXH3_consumeStripes(state->acc,
+                                   &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                    input, XXH3_INTERNALBUFFER_STRIPES,
+                                    secret, state->secretLimit,
+                                    f_acc512, f_scramble);
+                input += XXH3_INTERNALBUFFER_SIZE;
+            } while (input<limit);
+            /* for last partial stripe */
+            memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+        }
+        XXH_ASSERT(input < bEnd);
+
+        /* Some remaining input (always) : buffer it */
+        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+    }
+
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
+{
+    /*
+     * Digest on a local copy. This way, the state remains unaltered, and it can
+     * continue ingesting more input afterwards.
+     */
+    memcpy(acc, state->acc, sizeof(state->acc));
+    if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+        size_t nbStripesSoFar = state->nbStripesSoFar;
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, nbStripes,
+                            secret, state->secretLimit,
+                            XXH3_accumulate_512, XXH3_scrambleAcc);
+        /* last stripe */
+        XXH3_accumulate_512(acc,
+                            state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
+                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+    } else {  /* bufferedSize < XXH_STRIPE_LEN */
+        xxh_u8 lastStripe[XXH_STRIPE_LEN];
+        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+        memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+        memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+        XXH3_accumulate_512(acc,
+                            lastStripe,
+                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+    }
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        return XXH3_mergeAccs(acc,
+                              secret + XXH_SECRET_MERGEACCS_START,
+                              (xxh_u64)state->totalLen * XXH_PRIME64_1);
+    }
+    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+    if (state->seed)
+        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                  secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+
+
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API void
+XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize)
+{
+    XXH_ASSERT(secretBuffer != NULL);
+    if (customSeedSize == 0) {
+        memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+        return;
+    }
+    XXH_ASSERT(customSeed != NULL);
+
+    {   size_t const segmentSize = sizeof(XXH128_hash_t);
+        size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
+        XXH128_canonical_t scrambler;
+        XXH64_hash_t seeds[12];
+        size_t segnb;
+        XXH_ASSERT(nbSegments == 12);
+        XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */
+        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+
+        /*
+        * Copy customSeed to seeds[], truncating or repeating as necessary.
+        */
+        {   size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
+            size_t filled = toFill;
+            memcpy(seeds, customSeed, toFill);
+            while (filled < sizeof(seeds)) {
+                toFill = XXH_MIN(filled, sizeof(seeds) - filled);
+                memcpy((char*)seeds + filled, seeds, toFill);
+                filled += toFill;
+        }   }
+
+        /* generate secret */
+        memcpy(secretBuffer, &scrambler, sizeof(scrambler));
+        for (segnb=1; segnb < nbSegments; segnb++) {
+            size_t const segmentStart = segnb * segmentSize;
+            XXH128_canonical_t segment;
+            XXH128_canonicalFromHash(&segment,
+                XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) );
+            memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment));
+    }   }
+}
+
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ * even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+ */
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* A doubled version of 1to3_64b with different constants. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8 const c1 = input[0];
+        xxh_u8 const c2 = input[len >> 1];
+        xxh_u8 const c3 = input[len - 1];
+        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
+                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
+        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
+        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
+        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
+        XXH128_hash_t h128;
+        h128.low64  = XXH64_avalanche(keyed_lo);
+        h128.high64 = XXH64_avalanche(keyed_hi);
+        return h128;
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input_lo = XXH_readLE32(input);
+        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
+        xxh_u64 const keyed = input_64 ^ bitflip;
+
+        /* Shift len to the left to ensure it is even, this avoids even multiplies. */
+        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+        m128.high64 += (m128.low64 << 1);
+        m128.low64  ^= (m128.high64 >> 3);
+
+        m128.low64   = XXH_xorshift64(m128.low64, 35);
+        m128.low64  *= 0x9FB21C651E98DF25ULL;
+        m128.low64   = XXH_xorshift64(m128.low64, 28);
+        m128.high64  = XXH3_avalanche(m128.high64);
+        return m128;
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+        xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
+        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+        /*
+         * Put len in the middle of m128 to ensure that the length gets mixed to
+         * both the low and high bits in the 128x64 multiply below.
+         */
+        m128.low64 += (xxh_u64)(len - 1) << 54;
+        input_hi   ^= bitfliph;
+        /*
+         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+         * the high 64 bits of m128.
+         *
+         * The best approach to this operation is different on 32-bit and 64-bit.
+         */
+        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+            /*
+             * 32-bit optimized version, which is more readable.
+             *
+             * On 32-bit, it removes an ADC and delays a dependency between the two
+             * halves of m128.high64, but it generates an extra mask on 64-bit.
+             */
+            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+        } else {
+            /*
+             * 64-bit optimized (albeit more confusing) version.
+             *
+             * Uses some properties of addition and multiplication to remove the mask:
+             *
+             * Let:
+             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+             *    c = XXH_PRIME32_2
+             *
+             *    a + (b * c)
+             * Inverse Property: x + y - x == y
+             *    a + (b * (1 + c - 1))
+             * Distributive Property: x * (y + z) == (x * y) + (x * z)
+             *    a + (b * 1) + (b * (c - 1))
+             * Identity Property: x * 1 == x
+             *    a + b + (b * (c - 1))
+             *
+             * Substitute a, b, and c:
+             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             *
+             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             */
+            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
+        }
+        /* m128 ^= XXH_swap64(m128 >> 64); */
+        m128.low64  ^= XXH_swap64(m128.high64);
+
+        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+            h128.high64 += m128.high64 * XXH_PRIME64_2;
+
+            h128.low64   = XXH3_avalanche(h128.low64);
+            h128.high64  = XXH3_avalanche(h128.high64);
+            return h128;
+    }   }
+}
+
+/*
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+        {   XXH128_hash_t h128;
+            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
+            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
+            return h128;
+    }   }
+}
+
+/*
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
+              const xxh_u8* secret, XXH64_hash_t seed)
+{
+    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
+    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
+    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
+    return acc;
+}
+
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+XXH_NO_INLINE XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        int const nbRounds = (int)len / 32;
+        int i;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        for (i=0; i<4; i++) {
+            acc = XXH128_mix32B(acc,
+                                input  + (32 * i),
+                                input  + (32 * i) + 16,
+                                secret + (32 * i),
+                                seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        XXH_ASSERT(nbRounds >= 4);
+        for (i=4 ; i < nbRounds; i++) {
+            acc = XXH128_mix32B(acc,
+                                input + (32 * i),
+                                input + (32 * i) + 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
+                                seed);
+        }
+        /* last bytes */
+        acc = XXH128_mix32B(acc,
+                            input + len - 16,
+                            input + len - 32,
+                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                            0ULL - seed);
+
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate_512 f_acc512,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    {   XXH128_hash_t h128;
+        h128.low64  = XXH3_mergeAccs(acc,
+                                     secret + XXH_SECRET_MERGEACCS_START,
+                                     (xxh_u64)len * XXH_PRIME64_1);
+        h128.high64 = XXH3_mergeAccs(acc,
+                                     secret + secretSize
+                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                     ~((xxh_u64)len * XXH_PRIME64_2));
+        return h128;
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64,
+                           const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+                                       XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64,
+                              const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
+                                       XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
+                                XXH64_hash_t seed64,
+                                XXH3_f_accumulate_512 f_acc512,
+                                XXH3_f_scrambleAcc f_scramble,
+                                XXH3_f_initCustomSecret f_initSec)
+{
+    if (seed64 == 0)
+        return XXH3_hashLong_128b_internal(input, len,
+                                           XXH3_kSecret, sizeof(XXH3_kSecret),
+                                           f_acc512, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed64);
+        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
+                                           f_acc512, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const void* input, size_t len,
+                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
+                XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
+                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_128bits_internal(const void* input, size_t len,
+                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                      XXH3_hashLong128_f f_hl128)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hl128(input, len, seed64, secret, secretLen);
+}
+
+
+/* ===   Public XXH128 API   === */
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_default);
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 (const xxh_u8*)secret, secretSize,
+                                 XXH3_hashLong_128b_withSecret);
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_internal(input, len, seed,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_withSeed);
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+
+/*
+ * All the functions are actually the same as for 64-bit streaming variant.
+ * The only difference is the finalization routine.
+ */
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_128bits_reset(statePtr);
+    if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         secret + state->secretLimit + XXH_STRIPE_LEN
+                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 is equal, 0 if different */
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * return : >0 if *h128_1  > *h128_2
+ *          <0 if *h128_1  < *h128_2
+ *          =0 if *h128_1 == *h128_2  */
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+{
+    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+    /* note : bets that, in most cases, hash values are different */
+    if (hcmp) return hcmp;
+    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+}
+
+
+/*======   Canonical representation   ======*/
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) {
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    memcpy(dst, &hash.high64, sizeof(hash.high64));
+    memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+{
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);
+    h.low64  = XXH_readBE64(src->digest + 8);
+    return h;
+}
+
+/* Pop our optimization override from above */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
+#  pragma GCC pop_options
+#endif
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#endif  /* XXH_NO_XXH3 */
+
+/*!
+ * @}
+ */
+#endif  /* XXH_IMPLEMENTATION */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/src/rocksdb/util/xxph3.h b/src/rocksdb/util/xxph3.h
new file mode 100644
index 000000000..968000c3a
--- /dev/null
+++ b/src/rocksdb/util/xxph3.h
@@ -0,0 +1,1764 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+// This is a fork of a preview version of xxHash, as RocksDB depends on
+// this preview version of XXH3. To allow this to coexist with the
+// standard xxHash, including in the "unity" build where all source files
+// and headers go into a single translation unit, here "XXH" has been
+// replaced with "XXPH" for XX Preview Hash.
+
+#ifndef XXPHASH_H_5627135585666179
+#define XXPHASH_H_5627135585666179 1
+
+/* BEGIN RocksDB customizations */
+#ifndef XXPH_STATIC_LINKING_ONLY
+// Access experimental APIs
+#define XXPH_STATIC_LINKING_ONLY 1
+#endif
+#define XXPH_NAMESPACE ROCKSDB_
+#define XXPH_INLINE_ALL
+#include <cstring>
+/* END RocksDB customizations */
+
+// clang-format off
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXPH_OK=0, XXPH_ERROR } XXPH_errorcode;
+
+
+/* ****************************
+ *  API modifier
+ ******************************/
+/** XXPH_INLINE_ALL (and XXPH_PRIVATE_API)
+ *  This build macro includes xxhash functions in `static` mode
+ *  in order to inline them, and remove their symbol from the public list.
+ *  Inlining offers great performance improvement on small keys,
+ *  and dramatic ones when length is expressed as a compile-time constant.
+ *  See https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html .
+ *  Methodology :
+ *     #define XXPH_INLINE_ALL
+ *     #include "xxhash.h"
+ * `xxhash.c` is automatically included.
+ *  It's not useful to compile and link it as a separate object.
+ */
+#if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API)
+#  ifndef XXPH_STATIC_LINKING_ONLY
+#    define XXPH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXPH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXPH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXPH_PUBLIC_API static __inline
+#  else
+     /* this version may generate warnings for unused static functions */
+#    define XXPH_PUBLIC_API static
+#  endif
+#else
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXPH_IMPORT) || defined(XXPH_EXPORT))
+#    ifdef XXPH_EXPORT
+#      define XXPH_PUBLIC_API __declspec(dllexport)
+#    elif XXPH_IMPORT
+#      define XXPH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXPH_PUBLIC_API   /* do nothing */
+#  endif
+#endif /* XXPH_INLINE_ALL || XXPH_PRIVATE_API */
+
+/*! XXPH_NAMESPACE, aka Namespace Emulation :
+ *
+ * If you want to include _and expose_ xxHash functions from within your own library,
+ * but also want to avoid symbol collisions with other libraries which may also include xxHash,
+ *
+ * you can use XXPH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+ * with the value of XXPH_NAMESPACE (therefore, avoid NULL and numeric values).
+ *
+ * Note that no change is required within the calling program as long as it includes `xxhash.h` :
+ * regular symbol name will be automatically translated by this header.
+ */
+#ifdef XXPH_NAMESPACE
+#  define XXPH_CAT(A,B) A##B
+#  define XXPH_NAME2(A,B) XXPH_CAT(A,B)
+#  define XXPH_versionNumber XXPH_NAME2(XXPH_NAMESPACE, XXPH_versionNumber)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXPH_VERSION_MAJOR    0
+#define XXPH_VERSION_MINOR    7
+#define XXPH_VERSION_RELEASE  2
+#define XXPH_VERSION_NUMBER  (XXPH_VERSION_MAJOR *100*100 + XXPH_VERSION_MINOR *100 + XXPH_VERSION_RELEASE)
+XXPH_PUBLIC_API unsigned XXPH_versionNumber (void);
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXPH32_hash_t;
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXPH32_hash_t;
+#   else
+#     if ULONG_MAX == 0xFFFFFFFFUL
+        typedef unsigned long XXPH32_hash_t;
+#     else
+#       error "unsupported platform : need a 32-bit type"
+#     endif
+#   endif
+#endif
+
+#ifndef XXPH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint64_t XXPH64_hash_t;
+#else
+    /* the following type must have a width of 64-bit */
+    typedef unsigned long long XXPH64_hash_t;
+#endif
+
+#endif  /* XXPH_NO_LONG_LONG */
+
+
+
+#ifdef XXPH_STATIC_LINKING_ONLY
+
+/* ================================================================================================
+   This section contains declarations which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different version of the library.
+   These declarations should only be used with static linking.
+   Never use them in association with dynamic linking !
+=================================================================================================== */
+
+
+/*-**********************************************************************
+*  XXPH3
+*  New experimental hash
+************************************************************************/
+#ifndef XXPH_NO_LONG_LONG
+
+
+/* ============================================
+ * XXPH3 is a new hash algorithm,
+ * featuring improved speed performance for both small and large inputs.
+ * See full speed analysis at : http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ * In general, expect XXPH3 to run about ~2x faster on large inputs,
+ * and >3x faster on small ones, though exact differences depend on platform.
+ *
+ * The algorithm is portable, will generate the same hash on all platforms.
+ * It benefits greatly from vectorization units, but does not require it.
+ *
+ * XXPH3 offers 2 variants, _64bits and _128bits.
+ * When only 64 bits are needed, prefer calling the _64bits variant :
+ * it reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The XXPH3 algorithm is still considered experimental.
+ * Produced results can still change between versions.
+ * Results produced by v0.7.x are not comparable with results from v0.7.y .
+ * It's nonetheless possible to use XXPH3 for ephemeral data (local sessions),
+ * but avoid storing values in long-term storage for later reads.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ *
+ * There are still a number of opened questions that community can influence during the experimental period.
+ * I'm trying to list a few of them below, though don't consider this list as complete.
+ *
+ * - 128-bits output type : currently defined as a structure of two 64-bits fields.
+ *                          That's because 128-bit values do not exist in C standard.
+ *                          Note that it means that, at byte level, result is not identical depending on endianess.
+ *                          However, at field level, they are identical on all platforms.
+ *                          The canonical representation solves the issue of identical byte-level representation across platforms,
+ *                          which is necessary for serialization.
+ *                          Q1 : Would there be a better representation for a 128-bit hash result ?
+ *                          Q2 : Are the names of the inner 64-bit fields important ? Should they be changed ?
+ *
+ * - Prototype XXPH128() :   XXPH128() uses the same arguments as XXPH64(), for consistency.
+ *                          It means it maps to XXPH3_128bits_withSeed().
+ *                          This variant is slightly slower than XXPH3_128bits(),
+ *                          because the seed is now part of the algorithm, and can't be simplified.
+ *                          Is that a good idea ?
+ *
+ * - Seed type for XXPH128() : currently, it's a single 64-bit value, like the 64-bit variant.
+ *                          It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash.
+ *                          But 128-bit seed is more difficult to use, since it requires to pass a structure instead of a scalar value.
+ *                          Such a variant could either replace current one, or become an additional one.
+ *                          Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`).
+ *                          Follow up question : if both 64-bit and 128-bit seeds are allowed, which variant should be called XXPH128 ?
+ *
+ * - Result for len==0 :    Currently, the result of hashing a zero-length input is always `0`.
+ *                          It seems okay as a return value when using "default" secret and seed.
+ *                          But is it still fine to return `0` when secret or seed are non-default ?
+ *                          Are there use cases which could depend on generating a different hash result for zero-length input when the secret is different ?
+ *
+ * - Consistency (1) :      Streaming XXPH128 uses an XXPH3 state, which is the same state as XXPH3_64bits().
+ *                          It means a 128bit streaming loop must invoke the following symbols :
+ *                          XXPH3_createState(), XXPH3_128bits_reset(), XXPH3_128bits_update() (loop), XXPH3_128bits_digest(), XXPH3_freeState().
+ *                          Is that consistent enough ?
+ *
+ * - Consistency (2) :      The canonical representation of `XXPH3_64bits` is provided by existing functions
+ *                          XXPH64_canonicalFromHash(), and reverse operation XXPH64_hashFromCanonical().
+ *                          As a mirror, canonical functions for XXPH128_hash_t results generated by `XXPH3_128bits`
+ *                          are XXPH128_canonicalFromHash() and XXPH128_hashFromCanonical().
+ *                          Which means, `XXPH3` doesn't appear in the names, because canonical functions operate on a type,
+ *                          independently of which algorithm was used to generate that type.
+ *                          Is that consistent enough ?
+ */
+
+#ifdef XXPH_NAMESPACE
+#  define XXPH3_64bits XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits)
+#  define XXPH3_64bits_withSecret XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSecret)
+#  define XXPH3_64bits_withSeed XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSeed)
+#endif
+
+/* XXPH3_64bits() :
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant. */
+XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* data, size_t len);
+
+/* XXPH3_64bits_withSecret() :
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The secret *must* be large enough (>= XXPH3_SECRET_SIZE_MIN).
+ * It should consist of random bytes.
+ * Avoid repeating same character, or sequences of bytes,
+ * and especially avoid swathes of \0.
+ * Failure to respect these conditions will result in a poor quality hash.
+ */
+#define XXPH3_SECRET_SIZE_MIN 136
+XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/* XXPH3_64bits_withSeed() :
+ * This variant generates on the fly a custom secret,
+ * based on the default secret, altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
+ * note : seed==0 produces same results as XXPH3_64bits() */
+XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSeed(const void* data, size_t len, XXPH64_hash_t seed);
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define XXPH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXPH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXPH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXPH_ALIGN(n)   /* disabled */
+#endif
+
+#define XXPH3_SECRET_DEFAULT_SIZE 192   /* minimum XXPH3_SECRET_SIZE_MIN */
+
+#endif  /* XXPH_NO_LONG_LONG */
+
+
+/*-**********************************************************************
+*  XXPH_INLINE_ALL
+************************************************************************/
+#if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API)
+
+/* === RocksDB modification: was #include here but permanently inlining === */
+
+typedef struct {
+    XXPH64_hash_t low64;
+    XXPH64_hash_t high64;
+} XXPH128_hash_t;
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!XXPH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on compiler but violate C standard.
+ *            It can generate buggy code on targets which do not support unaligned memory accesses.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXPH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+#    define XXPH_FORCE_MEMORY_ACCESS 2
+#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+#    define XXPH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!XXPH_ACCEPT_NULL_INPUT_POINTER :
+ * If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault.
+ * When this macro is enabled, xxHash actively checks input for null pointer.
+ * It it is, result for null input pointers is the same as a null-length input.
+ */
+#ifndef XXPH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXPH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!XXPH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned,
+ * or when alignment doesn't matter for performance.
+ */
+#ifndef XXPH_FORCE_ALIGN_CHECK /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXPH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXPH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+/*!XXPH_REROLL:
+ * Whether to reroll XXPH32_finalize, and XXPH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang. */
+#ifndef XXPH_REROLL
+#  if defined(__OPTIMIZE_SIZE__)
+#    define XXPH_REROLL 1
+#  else
+#    define XXPH_REROLL 0
+#  endif
+#endif
+
+#include <limits.h>   /* ULLONG_MAX */
+
+#ifndef XXPH_STATIC_LINKING_ONLY
+#define XXPH_STATIC_LINKING_ONLY
+#endif
+
+/* BEGIN RocksDB customizations */
+#include "port/lang.h" /* for FALLTHROUGH_INTENDED, inserted as appropriate */
+/* END RocksDB customizations */
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#  define XXPH_FORCE_INLINE static __forceinline
+#  define XXPH_NO_INLINE static __declspec(noinline)
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define XXPH_FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXPH_NO_INLINE static __attribute__((noinline))
+#    else
+#      define XXPH_FORCE_INLINE static inline
+#      define XXPH_NO_INLINE static
+#    endif
+#  else
+#    define XXPH_FORCE_INLINE static
+#    define XXPH_NO_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number. */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>   /* note : can still be disabled with NDEBUG */
+#  define XXPH_ASSERT(c)   assert(c)
+#else
+#  define XXPH_ASSERT(c)   ((void)0)
+#endif
+
+/* note : use after variable declarations */
+#define XXPH_STATIC_ASSERT(c)  { enum { XXPH_sa = 1/(int)(!!(c)) }; }
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef uint8_t  xxh_u8;
+#else
+  typedef unsigned char      xxh_u8;
+#endif
+typedef XXPH32_hash_t xxh_u32;
+
+
+/* ===   Memory access   === */
+
+#if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u32 XXPH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXPH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXPH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* ===   Endianess   === */
+
+/* XXPH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXPH_CPU_LITTLE_ENDIAN
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXPH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXPH_CPU_LITTLE_ENDIAN 0
+#  else
+static int XXPH_isLittleEndian(void)
+{
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+#   define XXPH_CPU_LITTLE_ENDIAN   XXPH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXPH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifndef __has_builtin
+#  define __has_builtin(x) 0
+#endif
+
+#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) && __has_builtin(__builtin_rotateleft64)
+#  define XXPH_rotl32 __builtin_rotateleft32
+#  define XXPH_rotl64 __builtin_rotateleft64
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXPH_rotl32(x,r) _rotl(x,r)
+#  define XXPH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXPH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXPH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXPH_swap32 _byteswap_ulong
+#elif XXPH_GCC_VERSION >= 403
+#  define XXPH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXPH_swap32 (xxh_u32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+typedef enum { XXPH_aligned, XXPH_unaligned } XXPH_alignment;
+
+XXPH_FORCE_INLINE xxh_u32 XXPH_readLE32(const void* ptr)
+{
+    return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read32(ptr) : XXPH_swap32(XXPH_read32(ptr));
+}
+
+XXPH_FORCE_INLINE xxh_u32
+XXPH_readLE32_align(const void* ptr, XXPH_alignment align)
+{
+    if (align==XXPH_unaligned) {
+        return XXPH_readLE32(ptr);
+    } else {
+        return XXPH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXPH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+XXPH_PUBLIC_API unsigned XXPH_versionNumber (void) { return XXPH_VERSION_NUMBER; }
+
+
+static const xxh_u32 PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
+static const xxh_u32 PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
+static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
+static const xxh_u32 PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
+static const xxh_u32 PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */
+
+#ifndef XXPH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+
+/*======   Memory access   ======*/
+
+typedef XXPH64_hash_t xxh_u64;
+
+#if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u64 XXPH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; }
+
+#elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+static xxh_u64 XXPH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static xxh_u64 XXPH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXPH_swap64 _byteswap_uint64
+#elif XXPH_GCC_VERSION >= 403
+#  define XXPH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXPH_swap64 (xxh_u64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+XXPH_FORCE_INLINE xxh_u64 XXPH_readLE64(const void* ptr)
+{
+    return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read64(ptr) : XXPH_swap64(XXPH_read64(ptr));
+}
+
+XXPH_FORCE_INLINE xxh_u64
+XXPH_readLE64_align(const void* ptr, XXPH_alignment align)
+{
+    if (align==XXPH_unaligned)
+        return XXPH_readLE64(ptr);
+    else
+        return XXPH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXPH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/*======   xxh64   ======*/
+
+static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL;   /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
+static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;   /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
+static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL;   /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
+static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL;   /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
+static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL;   /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+
+/* *********************************************************************
+*  XXPH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+
+/*======== Was #include "xxh3.h", now inlined below ==========*/
+
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Development source file for `xxh3`
+   Copyright (C) 2019-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* RocksDB Note: This file contains a preview release (xxhash repository
+   version 0.7.2) of XXPH3 that is unlikely to be compatible with the final
+   version of XXPH3. We have therefore renamed this XXPH3 ("preview"), for
+   clarity so that we can continue to use this version even after
+   integrating a newer incompatible version.
+*/
+
+/* ===   Dependencies   === */
+
+#undef XXPH_INLINE_ALL   /* in case it's already defined */
+#define XXPH_INLINE_ALL
+
+
+/* ===   Compiler specifics   === */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXPH_RESTRICT   restrict
+#else
+/* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
+#  define XXPH_RESTRICT   /* disable */
+#endif
+
+#if defined(__GNUC__)
+#  if defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#    define inline __inline__  /* clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  endif
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * Sanity check.
+ *
+ * XXPH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *
+ * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
+ * classic 16-bit only subset of ARM's instruction set.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we
+ * will give a warning.
+ *
+ * Usually, if this happens, it is because of an accident and you probably
+ * need to specify -march, as you probably meant to compileh for a newer
+ * architecture.
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#   warning "XXPH3 is highly inefficient without ARM or Thumb-2."
+#endif
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+#define XXPH_SCALAR 0
+#define XXPH_SSE2   1
+#define XXPH_AVX2   2
+#define XXPH_NEON   3
+#define XXPH_VSX    4
+
+#ifndef XXPH_VECTOR    /* can be defined on command line */
+#  if defined(__AVX2__)
+#    define XXPH_VECTOR XXPH_AVX2
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXPH_VECTOR XXPH_SSE2
+#  elif defined(__GNUC__) /* msvc support maybe later */ \
+  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
+  && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#    define XXPH_VECTOR XXPH_NEON
+#  elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
+#    define XXPH_VECTOR XXPH_VSX
+#  else
+#    define XXPH_VECTOR XXPH_SCALAR
+#  endif
+#endif
+
+/* control alignment of accumulator,
+ * for compatibility with fast vector loads */
+#ifndef XXPH_ACC_ALIGN
+#  if XXPH_VECTOR == 0   /* scalar */
+#     define XXPH_ACC_ALIGN 8
+#  elif XXPH_VECTOR == 1  /* sse2 */
+#     define XXPH_ACC_ALIGN 16
+#  elif XXPH_VECTOR == 2  /* avx2 */
+#     define XXPH_ACC_ALIGN 32
+#  elif XXPH_VECTOR == 3  /* neon */
+#     define XXPH_ACC_ALIGN 16
+#  elif XXPH_VECTOR == 4  /* vsx */
+#     define XXPH_ACC_ALIGN 16
+#  endif
+#endif
+
+/* xxh_u64 XXPH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
+#if defined(_MSC_VER) && defined(_M_IX86)
+#    include <intrin.h>
+#    define XXPH_mult32to64(x, y) __emulu(x, y)
+#else
+#    define XXPH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
+#endif
+
+/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
+ * there is a lot of mischief with endianness. */
+#if XXPH_VECTOR == XXPH_VSX
+#  include <altivec.h>
+#  undef vector
+typedef __vector unsigned long long U64x2;
+typedef __vector unsigned char U8x16;
+typedef __vector unsigned U32x4;
+
+#ifndef XXPH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXPH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXPH_VSX_BE 1
+#  else
+#    define XXPH_VSX_BE 0
+#  endif
+#endif
+
+/* We need some helpers for big endian mode. */
+#if XXPH_VSX_BE
+/* A wrapper for POWER9's vec_revb. */
+#  ifdef __POWER9_VECTOR__
+#    define XXPH_vec_revb vec_revb
+#  else
+XXPH_FORCE_INLINE U64x2 XXPH_vec_revb(U64x2 val)
+{
+    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+
+/* Power8 Crypto gives us vpermxor which is very handy for
+ * PPC64EB.
+ *
+ * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
+ * {
+ *     U8x16 ret;
+ *     for (int i = 0; i < 16; i++) {
+ *         ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
+ *     }
+ *     return ret;
+ * }
+ *
+ * Because both of the main loops load the key, swap, and xor it with input,
+ * we can combine the key swap into this instruction.
+ */
+#  ifdef vec_permxor
+#    define XXPH_vec_permxor vec_permxor
+#  else
+#    define XXPH_vec_permxor __builtin_crypto_vpermxor
+#  endif
+#endif  /* XXPH_VSX_BE */
+/*
+ * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
+ * vec_mule.
+ *
+ * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
+ * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
+ * GCC needs inline assembly. */
+#if __has_builtin(__builtin_altivec_vmuleuw)
+#  define XXPH_vec_mulo __builtin_altivec_vmulouw
+#  define XXPH_vec_mule __builtin_altivec_vmuleuw
+#else
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXPH_FORCE_INLINE U64x2 XXPH_vec_mulo(U32x4 a, U32x4 b) {
+    U64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXPH_FORCE_INLINE U64x2 XXPH_vec_mule(U32x4 a, U32x4 b) {
+    U64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+#endif /* __has_builtin(__builtin_altivec_vmuleuw) */
+#endif /* XXPH_VECTOR == XXPH_VSX */
+
+/* prefetch
+ * can be disabled, by declaring XXPH_NO_PREFETCH build macro */
+#if defined(XXPH_NO_PREFETCH)
+#  define XXPH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#if defined(_MSC_VER) && \
+    (defined(_M_X64) ||  \
+     defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXPH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXPH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXPH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXPH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXPH3 default settings
+ * ========================================== */
+
+#define XXPH_SECRET_DEFAULT_SIZE 192   /* minimum XXPH3_SECRET_SIZE_MIN */
+
+#if (XXPH_SECRET_DEFAULT_SIZE < XXPH3_SECRET_SIZE_MIN)
+#  error "default keyset is not large enough"
+#endif
+
+XXPH_ALIGN(64) static const xxh_u8 kSecret[XXPH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+/*
+ * GCC for x86 has a tendency to use SSE in this loop. While it
+ * successfully avoids swapping (as MUL overwrites EAX and EDX), it
+ * slows it down because instead of free register swap shifts, it
+ * must use pshufd and punpckl/hd.
+ *
+ * To prevent this, we use this attribute to shut off SSE.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+__attribute__((__target__("no-sse")))
+#endif
+static XXPH128_hash_t
+XXPH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite being a 32-bit platform, Clang (and emscripten) define this
+     * type despite not having the arithmetic for it. This results in a
+     * laggy compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if defined(__GNUC__) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXPH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif defined(_M_X64) || defined(_M_IA64)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXPH128_hash_t const r128 = { product_low, product_high };
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown
+     * below with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
+     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
+     *     ---------
+     *         2 7 | // D2 cross  = (15 / 10) + (45 % 10) + 21
+     *     + 6 7 | | // D2 upper  = (27 / 10) + (45 / 10) + 63
+     *     ---------
+     *       6 9 7 5
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for
+     *     UINT64_MAX. This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+     *     instruction available in ARMv6+ A32/T32, which is shown below:
+     *
+     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+     *         {
+     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+     *             *RdHi = (xxh_u32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication,
+     *     and allows this to be calculated in only 4 instructions which
+     *     is comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be
+     *     a couple of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    xxh_u64 const lo_lo = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    xxh_u64 const hi_lo = XXPH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    xxh_u64 const lo_hi = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    xxh_u64 const hi_hi = XXPH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+    XXPH128_hash_t r128 = { lower, upper };
+    return r128;
+#endif
+}
+
+/*
+ * We want to keep the attribute here because a target switch
+ * disables inlining.
+ *
+ * Does a 64-bit to 128-bit multiply, then XOR folds it.
+ * The reason for the separate function is to prevent passing
+ * too many structs around by value. This will hopefully inline
+ * the multiply, but we don't force it.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+__attribute__((__target__("no-sse")))
+#endif
+static xxh_u64
+XXPH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXPH128_hash_t product = XXPH_mult64to128(lhs, rhs);
+    return product.low64 ^ product.high64;
+}
+
+
+static XXPH64_hash_t XXPH3_avalanche(xxh_u64 h64)
+{
+    h64 ^= h64 >> 37;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+    return h64;
+}
+
+
+/* ==========================================
+ * Short keys
+ * ========================================== */
+
+XXPH_FORCE_INLINE XXPH64_hash_t
+XXPH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed)
+{
+    XXPH_ASSERT(input != NULL);
+    XXPH_ASSERT(1 <= len && len <= 3);
+    XXPH_ASSERT(secret != NULL);
+    {   xxh_u8 const c1 = input[0];
+        xxh_u8 const c2 = input[len >> 1];
+        xxh_u8 const c3 = input[len - 1];
+        xxh_u32  const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24);
+        xxh_u64  const keyed = (xxh_u64)combined ^ (XXPH_readLE32(secret) + seed);
+        xxh_u64  const mixed = keyed * PRIME64_1;
+        return XXPH3_avalanche(mixed);
+    }
+}
+
+XXPH_FORCE_INLINE XXPH64_hash_t
+XXPH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed)
+{
+    XXPH_ASSERT(input != NULL);
+    XXPH_ASSERT(secret != NULL);
+    XXPH_ASSERT(4 <= len && len <= 8);
+    {   xxh_u32 const input_lo = XXPH_readLE32(input);
+        xxh_u32 const input_hi = XXPH_readLE32(input + len - 4);
+        xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32);
+        xxh_u64 const keyed = input_64 ^ (XXPH_readLE64(secret) + seed);
+        xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
+        return XXPH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
+    }
+}
+
+XXPH_FORCE_INLINE XXPH64_hash_t
+XXPH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed)
+{
+    XXPH_ASSERT(input != NULL);
+    XXPH_ASSERT(secret != NULL);
+    XXPH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const input_lo = XXPH_readLE64(input)           ^ (XXPH_readLE64(secret)     + seed);
+        xxh_u64 const input_hi = XXPH_readLE64(input + len - 8) ^ (XXPH_readLE64(secret + 8) - seed);
+        xxh_u64 const acc = len + (input_lo + input_hi) + XXPH3_mul128_fold64(input_lo, input_hi);
+        return XXPH3_avalanche(acc);
+    }
+}
+
+XXPH_FORCE_INLINE XXPH64_hash_t
+XXPH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed)
+{
+    XXPH_ASSERT(len <= 16);
+    {   if (len > 8) return XXPH3_len_9to16_64b(input, len, secret, seed);
+        if (len >= 4) return XXPH3_len_4to8_64b(input, len, secret, seed);
+        if (len) return XXPH3_len_1to3_64b(input, len, secret, seed);
+        /*
+         * RocksDB modification from XXPH3 preview: zero result for empty
+         * string can be problematic for multiplication-based algorithms.
+         * Return a hash of the seed instead.
+         */
+        return XXPH3_mul128_fold64(seed + XXPH_readLE64(secret), PRIME64_2);
+    }
+}
+
+
+/* ===    Long Keys    === */
+
+#define STRIPE_LEN 64
+#define XXPH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
+
+typedef enum { XXPH3_acc_64bits, XXPH3_acc_128bits } XXPH3_accWidth_e;
+
+XXPH_FORCE_INLINE void
+XXPH3_accumulate_512(      void* XXPH_RESTRICT acc,
+                    const void* XXPH_RESTRICT input,
+                    const void* XXPH_RESTRICT secret,
+                    XXPH3_accWidth_e accWidth)
+{
+#if (XXPH_VECTOR == XXPH_AVX2)
+
+    XXPH_ASSERT((((size_t)acc) & 31) == 0);
+    {   XXPH_ALIGN(32) __m256i* const xacc  =       (__m256i *) acc;
+        const         __m256i* const xinput = (const __m256i *) input;  /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
+        const         __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+            __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
+            __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
+            __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
+            __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
+            if (accWidth == XXPH3_acc_128bits) {
+                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+                __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
+                xacc[i]  = _mm256_add_epi64(product, sum);
+            } else {  /* XXPH3_acc_64bits */
+                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
+                xacc[i]  = _mm256_add_epi64(product, sum);
+            }
+    }   }
+
+#elif (XXPH_VECTOR == XXPH_SSE2)
+
+    XXPH_ASSERT((((size_t)acc) & 15) == 0);
+    {   XXPH_ALIGN(16) __m128i* const xacc  =       (__m128i *) acc;
+        const         __m128i* const xinput = (const __m128i *) input;  /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
+        const         __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+            __m128i const data_vec = _mm_loadu_si128 (xinput+i);
+            __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
+            __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
+            __m128i const product = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
+            if (accWidth == XXPH3_acc_128bits) {
+                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+                __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
+                xacc[i]  = _mm_add_epi64(product, sum);
+            } else {  /* XXPH3_acc_64bits */
+                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
+                xacc[i]  = _mm_add_epi64(product, sum);
+            }
+    }   }
+
+#elif (XXPH_VECTOR == XXPH_NEON)
+
+    XXPH_ASSERT((((size_t)acc) & 15) == 0);
+    {
+        XXPH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* const xinput = (const uint8_t *) input;
+        uint8_t const* const xsecret  = (const uint8_t *) secret;
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
+#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
+            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
+             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
+             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
+             * a lot.
+             * aarch64 not only uses an entirely different syntax, but it requires three
+             * instructions...
+             *    ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
+             *    zip1   v3.2s, v0.2s, v1.2s   // first zip
+             *    zip2   v2.2s, v0.2s, v1.2s   // second zip
+             * ...to do what ARM does in one:
+             *    vzip.32 d0, d1               // Interleave high and low bits and overwrite. */
+
+            /* data_vec = xsecret[i]; */
+            uint8x16_t const data_vec    = vld1q_u8(xinput + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint8x16_t const key_vec     = vld1q_u8(xsecret  + (i * 16));
+            /* data_key = data_vec ^ key_vec; */
+            uint32x4_t       data_key;
+
+            if (accWidth == XXPH3_acc_64bits) {
+                /* Add first to prevent register swaps */
+                /* xacc[i] += data_vec; */
+                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
+            } else {  /* XXPH3_acc_128bits */
+                /* xacc[i] += swap(data_vec); */
+                /* can probably be optimized better */
+                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
+                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
+                xacc[i] = vaddq_u64 (xacc[i], swapped);
+            }
+
+            data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));
+
+            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
+             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
+            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
+            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
+            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));
+
+#else
+            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
+
+            /* data_vec = xsecret[i]; */
+            uint8x16_t const data_vec    = vld1q_u8(xinput + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint8x16_t const key_vec     = vld1q_u8(xsecret  + (i * 16));
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t const data_key    = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
+            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
+            uint32x2_t const data_key_lo = vmovn_u64  (data_key);
+            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
+            uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
+            if (accWidth == XXPH3_acc_64bits) {
+                /* xacc[i] += data_vec; */
+                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
+            } else {  /* XXPH3_acc_128bits */
+                /* xacc[i] += swap(data_vec); */
+                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
+                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
+                xacc[i] = vaddq_u64 (xacc[i], swapped);
+            }
+            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
+            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
+
+#endif
+        }
+    }
+
+#elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
+          U64x2* const xacc =        (U64x2*) acc;    /* presumed aligned */
+    U64x2 const* const xinput = (U64x2 const*) input;   /* no alignment restriction */
+    U64x2 const* const xsecret  = (U64x2 const*) secret;    /* no alignment restriction */
+    U64x2 const v32 = { 32,  32 };
+#if XXPH_VSX_BE
+    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
+                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
+#endif
+    size_t i;
+    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
+        /* data_vec = xinput[i]; */
+        /* key_vec = xsecret[i]; */
+#if XXPH_VSX_BE
+        /* byteswap */
+        U64x2 const data_vec = XXPH_vec_revb(vec_vsx_ld(0, xinput + i));
+        U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
+        /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
+        U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
+#else
+        U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
+        U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
+        U64x2 const data_key = data_vec ^ key_vec;
+#endif
+        /* shuffled = (data_key << 32) | (data_key >> 32); */
+        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
+        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
+        U64x2 const product = XXPH_vec_mulo((U32x4)data_key, shuffled);
+        xacc[i] += product;
+
+        if (accWidth == XXPH3_acc_64bits) {
+            xacc[i] += data_vec;
+        } else {  /* XXPH3_acc_128bits */
+            /* swap high and low halves */
+            U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
+            xacc[i] += data_swapped;
+        }
+    }
+
+#else   /* scalar variant of Accumulator - universal */
+
+    XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;    /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
+    const xxh_u8* const xinput = (const xxh_u8*) input;  /* no alignment restriction */
+    const xxh_u8* const xsecret  = (const xxh_u8*) secret;   /* no alignment restriction */
+    size_t i;
+    XXPH_ASSERT(((size_t)acc & (XXPH_ACC_ALIGN-1)) == 0);
+    for (i=0; i < ACC_NB; i++) {
+        xxh_u64 const data_val = XXPH_readLE64(xinput + 8*i);
+        xxh_u64 const data_key = data_val ^ XXPH_readLE64(xsecret + i*8);
+
+        if (accWidth == XXPH3_acc_64bits) {
+            xacc[i] += data_val;
+        } else {
+            xacc[i ^ 1] += data_val; /* swap adjacent lanes */
+        }
+        xacc[i] += XXPH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+    }
+#endif
+}
+
+XXPH_FORCE_INLINE void
+XXPH3_scrambleAcc(void* XXPH_RESTRICT acc, const void* XXPH_RESTRICT secret)
+{
+#if (XXPH_VECTOR == XXPH_AVX2)
+
+    XXPH_ASSERT((((size_t)acc) & 31) == 0);
+    {   XXPH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
+        const         __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
+        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] *= PRIME32_1; */
+            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+
+#elif (XXPH_VECTOR == XXPH_SSE2)
+
+    XXPH_ASSERT((((size_t)acc) & 15) == 0);
+    {   XXPH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
+        const         __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
+        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] *= PRIME32_1; */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+
+#elif (XXPH_VECTOR == XXPH_NEON)
+
+    XXPH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   uint64x2_t* const xacc =     (uint64x2_t*) acc;
+        uint8_t const* const xsecret = (uint8_t const*) secret;
+        uint32x2_t const prime     = vdup_n_u32 (PRIME32_1);
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
+            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
+            uint64x2_t const   acc_vec  = xacc[i];
+            uint64x2_t const   shifted  = vshrq_n_u64 (acc_vec, 47);
+            uint64x2_t const   data_vec = veorq_u64   (acc_vec, shifted);
+
+            /* key_vec  = xsecret[i]; */
+            uint32x4_t const   key_vec  = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
+            /* data_key = data_vec ^ key_vec; */
+            uint32x4_t const   data_key = veorq_u32   (vreinterpretq_u32_u64(data_vec), key_vec);
+            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
+            uint32x2x2_t const shuffled = vzip_u32    (vget_low_u32(data_key), vget_high_u32(data_key));
+
+            /* data_key *= PRIME32_1 */
+
+            /* prod_hi = (data_key >> 32) * PRIME32_1; */
+            uint64x2_t const   prod_hi = vmull_u32    (shuffled.val[1], prime);
+            /* xacc[i] = prod_hi << 32; */
+            xacc[i] = vshlq_n_u64(prod_hi, 32);
+            /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
+            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
+    }   }
+
+#elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
+
+          U64x2* const xacc =       (U64x2*) acc;
+    const U64x2* const xsecret = (const U64x2*) secret;
+    /* constants */
+    U64x2 const v32  = { 32, 32 };
+    U64x2 const v47 = { 47, 47 };
+    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
+    size_t i;
+#if XXPH_VSX_BE
+    /* endian swap */
+    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
+                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
+#endif
+    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
+        U64x2 const acc_vec  = xacc[i];
+        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+        /* key_vec = xsecret[i]; */
+#if XXPH_VSX_BE
+        /* swap bytes words */
+        U64x2 const key_raw  = vec_vsx_ld(0, xsecret + i);
+        U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
+#else
+        U64x2 const key_vec  = vec_vsx_ld(0, xsecret + i);
+        U64x2 const data_key = data_vec ^ key_vec;
+#endif
+
+        /* data_key *= PRIME32_1 */
+
+        /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF);  */
+        U64x2 const prod_even  = XXPH_vec_mule((U32x4)data_key, prime);
+        /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32);  */
+        U64x2 const prod_odd  = XXPH_vec_mulo((U32x4)data_key, prime);
+        xacc[i] = prod_odd + (prod_even << v32);
+    }
+
+#else   /* scalar variant of Scrambler - universal */
+
+    XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    size_t i;
+    XXPH_ASSERT((((size_t)acc) & (XXPH_ACC_ALIGN-1)) == 0);
+    for (i=0; i < ACC_NB; i++) {
+        xxh_u64 const key64 = XXPH_readLE64(xsecret + 8*i);
+        xxh_u64 acc64 = xacc[i];
+        acc64 ^= acc64 >> 47;
+        acc64 ^= key64;
+        acc64 *= PRIME32_1;
+        xacc[i] = acc64;
+    }
+
+#endif
+}
+
+#define XXPH_PREFETCH_DIST 384
+
+/* assumption : nbStripes will not overflow secret size */
+XXPH_FORCE_INLINE void
+XXPH3_accumulate(       xxh_u64* XXPH_RESTRICT acc,
+                const xxh_u8* XXPH_RESTRICT input,
+                const xxh_u8* XXPH_RESTRICT secret,
+                      size_t nbStripes,
+                      XXPH3_accWidth_e accWidth)
+{
+    size_t n;
+    for (n = 0; n < nbStripes; n++ ) {
+        const xxh_u8* const in = input + n*STRIPE_LEN;
+        XXPH_PREFETCH(in + XXPH_PREFETCH_DIST);
+        XXPH3_accumulate_512(acc,
+                            in,
+                            secret + n*XXPH_SECRET_CONSUME_RATE,
+                            accWidth);
+    }
+}
+
+/* note : clang auto-vectorizes well in SS2 mode _if_ this function is `static`,
+ *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
+ *        However, it auto-vectorizes better AVX2 if it is `FORCE_INLINE`
+ *        Pretty much every other modes and compilers prefer `FORCE_INLINE`.
+ */
+
+#if defined(__clang__) && (XXPH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
+static void
+#else
+XXPH_FORCE_INLINE void
+#endif
+XXPH3_hashLong_internal_loop( xxh_u64* XXPH_RESTRICT acc,
+                      const xxh_u8* XXPH_RESTRICT input, size_t len,
+                      const xxh_u8* XXPH_RESTRICT secret, size_t secretSize,
+                            XXPH3_accWidth_e accWidth)
+{
+    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXPH_SECRET_CONSUME_RATE;
+    size_t const block_len = STRIPE_LEN * nb_rounds;
+    size_t const nb_blocks = len / block_len;
+
+    size_t n;
+
+    XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        XXPH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
+        XXPH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
+    }
+
+    /* last partial block */
+    XXPH_ASSERT(len > STRIPE_LEN);
+    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
+        XXPH_ASSERT(nbStripes <= (secretSize / XXPH_SECRET_CONSUME_RATE));
+        XXPH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);
+
+        /* last stripe */
+        if (len & (STRIPE_LEN - 1)) {
+            const xxh_u8* const p = input + len - STRIPE_LEN;
+#define XXPH_SECRET_LASTACC_START 7  /* do not align on 8, so that secret is different from scrambler */
+            XXPH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXPH_SECRET_LASTACC_START, accWidth);
+    }   }
+}
+
+XXPH_FORCE_INLINE xxh_u64
+XXPH3_mix2Accs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret)
+{
+    return XXPH3_mul128_fold64(
+               acc[0] ^ XXPH_readLE64(secret),
+               acc[1] ^ XXPH_readLE64(secret+8) );
+}
+
+static XXPH64_hash_t
+XXPH3_mergeAccs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+
+    result64 += XXPH3_mix2Accs(acc+0, secret +  0);
+    result64 += XXPH3_mix2Accs(acc+2, secret + 16);
+    result64 += XXPH3_mix2Accs(acc+4, secret + 32);
+    result64 += XXPH3_mix2Accs(acc+6, secret + 48);
+
+    return XXPH3_avalanche(result64);
+}
+
+#define XXPH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
+                        PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };
+
+XXPH_FORCE_INLINE XXPH64_hash_t
+XXPH3_hashLong_internal(const xxh_u8* XXPH_RESTRICT input, size_t len,
+                       const xxh_u8* XXPH_RESTRICT secret, size_t secretSize)
+{
+    XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXPH3_INIT_ACC;
+
+    XXPH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXPH3_acc_64bits);
+
+    /* converge into final hash */
+    XXPH_STATIC_ASSERT(sizeof(acc) == 64);
+#define XXPH_SECRET_MERGEACCS_START 11  /* do not align on 8, so that secret is different from accumulator */
+    XXPH_ASSERT(secretSize >= sizeof(acc) + XXPH_SECRET_MERGEACCS_START);
+    return XXPH3_mergeAccs(acc, secret + XXPH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
+}
+
+
+XXPH_NO_INLINE XXPH64_hash_t    /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXPH3_hashLong_64b_defaultSecret(const xxh_u8* XXPH_RESTRICT input, size_t len)
+{
+    return XXPH3_hashLong_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+XXPH_NO_INLINE XXPH64_hash_t    /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXPH3_hashLong_64b_withSecret(const xxh_u8* XXPH_RESTRICT input, size_t len,
+                             const xxh_u8* XXPH_RESTRICT secret, size_t secretSize)
+{
+    return XXPH3_hashLong_internal(input, len, secret, secretSize);
+}
+
+
+XXPH_FORCE_INLINE void XXPH_writeLE64(void* dst, xxh_u64 v64)
+{
+    if (!XXPH_CPU_LITTLE_ENDIAN) v64 = XXPH_swap64(v64);
+    memcpy(dst, &v64, sizeof(v64));
+}
+
+/* XXPH3_initCustomSecret() :
+ * destination `customSecret` is presumed allocated and same size as `kSecret`.
+ */
+XXPH_FORCE_INLINE void XXPH3_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
+{
+    int const nbRounds = XXPH_SECRET_DEFAULT_SIZE / 16;
+    int i;
+
+    XXPH_STATIC_ASSERT((XXPH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+    for (i=0; i < nbRounds; i++) {
+        XXPH_writeLE64(customSecret + 16*i,     XXPH_readLE64(kSecret + 16*i)     + seed64);
+        XXPH_writeLE64(customSecret + 16*i + 8, XXPH_readLE64(kSecret + 16*i + 8) - seed64);
+    }
+}
+
+
+/* XXPH3_hashLong_64b_withSeed() :
+ * Generate a custom key,
+ * based on alteration of default kSecret with the seed,
+ * and then use this key for long mode hashing.
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ */
+XXPH_NO_INLINE XXPH64_hash_t    /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXPH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXPH64_hash_t seed)
+{
+    XXPH_ALIGN(8) xxh_u8 secret[XXPH_SECRET_DEFAULT_SIZE];
+    if (seed==0) return XXPH3_hashLong_64b_defaultSecret(input, len);
+    XXPH3_initCustomSecret(secret, seed);
+    return XXPH3_hashLong_internal(input, len, secret, sizeof(secret));
+}
+
+
+XXPH_FORCE_INLINE xxh_u64 XXPH3_mix16B(const xxh_u8* XXPH_RESTRICT input,
+                                 const xxh_u8* XXPH_RESTRICT secret, xxh_u64 seed64)
+{
+    xxh_u64 const input_lo = XXPH_readLE64(input);
+    xxh_u64 const input_hi = XXPH_readLE64(input+8);
+    return XXPH3_mul128_fold64(
+               input_lo ^ (XXPH_readLE64(secret)   + seed64),
+               input_hi ^ (XXPH_readLE64(secret+8) - seed64) );
+}
+
+
+XXPH_FORCE_INLINE XXPH64_hash_t
+XXPH3_len_17to128_64b(const xxh_u8* XXPH_RESTRICT input, size_t len,
+                     const xxh_u8* XXPH_RESTRICT secret, size_t secretSize,
+                     XXPH64_hash_t seed)
+{
+    XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXPH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 acc = len * PRIME64_1;
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc += XXPH3_mix16B(input+48, secret+96, seed);
+                    acc += XXPH3_mix16B(input+len-64, secret+112, seed);
+                }
+                acc += XXPH3_mix16B(input+32, secret+64, seed);
+                acc += XXPH3_mix16B(input+len-48, secret+80, seed);
+            }
+            acc += XXPH3_mix16B(input+16, secret+32, seed);
+            acc += XXPH3_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXPH3_mix16B(input+0, secret+0, seed);
+        acc += XXPH3_mix16B(input+len-16, secret+16, seed);
+
+        return XXPH3_avalanche(acc);
+    }
+}
+
+#define XXPH3_MIDSIZE_MAX 240
+
+XXPH_NO_INLINE XXPH64_hash_t
+XXPH3_len_129to240_64b(const xxh_u8* XXPH_RESTRICT input, size_t len,
+                      const xxh_u8* XXPH_RESTRICT secret, size_t secretSize,
+                      XXPH64_hash_t seed)
+{
+    XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXPH_ASSERT(128 < len && len <= XXPH3_MIDSIZE_MAX);
+
+    #define XXPH3_MIDSIZE_STARTOFFSET 3
+    #define XXPH3_MIDSIZE_LASTOFFSET  17
+
+    {   xxh_u64 acc = len * PRIME64_1;
+        int const nbRounds = (int)len / 16;
+        int i;
+        for (i=0; i<8; i++) {
+            acc += XXPH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        acc = XXPH3_avalanche(acc);
+        XXPH_ASSERT(nbRounds >= 8);
+        for (i=8 ; i < nbRounds; i++) {
+            acc += XXPH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXPH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        /* last bytes */
+        acc += XXPH3_mix16B(input + len - 16, secret + XXPH3_SECRET_SIZE_MIN - XXPH3_MIDSIZE_LASTOFFSET, seed);
+        return XXPH3_avalanche(acc);
+    }
+}
+
+/* ===   Public entry point   === */
+
+XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* input, size_t len)
+{
+    if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
+    if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    return XXPH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
+}
+
+XXPH_PUBLIC_API XXPH64_hash_t
+XXPH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN);
+    /* if an action must be taken should `secret` conditions not be respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash */
+     if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+     if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+     if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+     return XXPH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+XXPH_PUBLIC_API XXPH64_hash_t
+XXPH3_64bits_withSeed(const void* input, size_t len, XXPH64_hash_t seed)
+{
+    if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
+    if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    return XXPH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+/* ===   XXPH3 streaming   === */
+
+/* RocksDB Note: unused & removed due to bug in preview version */
+
+/*======== END #include "xxh3.h", now inlined above ==========*/
+
+#endif  /* XXPH_NO_LONG_LONG */
+
+/* === END RocksDB modification of permanently inlining === */
+
+#endif /*  defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API) */
+
+#endif /* XXPH_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* XXPHASH_H_5627135585666179 */
diff --git a/src/rocksdb/utilities/agg_merge/agg_merge.cc b/src/rocksdb/utilities/agg_merge/agg_merge.cc
new file mode 100644
index 000000000..a7eab1f12
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/agg_merge.cc
@@ -0,0 +1,238 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/agg_merge/agg_merge.h"
+
+#include <assert.h>
+
+#include <deque>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "port/lang.h"
+#include "port/likely.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/agg_merge.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, std::unique_ptr<Aggregator>> func_map;
+const std::string kUnnamedFuncName = "";
+const std::string kErrorFuncName = "kErrorFuncName";
+
+Status AddAggregator(const std::string& function_name,
+                     std::unique_ptr<Aggregator>&& agg) {
+  if (function_name == kErrorFuncName) {
+    return Status::InvalidArgument(
+        "Cannot register function name kErrorFuncName");
+  }
+  func_map.emplace(function_name, std::move(agg));
+  return Status::OK();
+}
+
+AggMergeOperator::AggMergeOperator() {}
+
+std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name,
+                                           const Slice& value) {
+  std::string result;
+  PutLengthPrefixedSlice(&result, function_name);
+  result += value.ToString();
+  return result;
+}
+
+Status EncodeAggFuncAndPayload(const Slice& function_name, const Slice& payload,
+                               std::string& output) {
+  if (function_name == kErrorFuncName) {
+    return Status::InvalidArgument("Cannot use error function name");
+  }
+  if (function_name != kUnnamedFuncName &&
+      func_map.find(function_name.ToString()) == func_map.end()) {
+    return Status::InvalidArgument("Function name not registered");
+  }
+  output = EncodeAggFuncAndPayloadNoCheck(function_name, payload);
+  return Status::OK();
+}
+
+bool ExtractAggFuncAndValue(const Slice& op, Slice& func, Slice& value) {
+  value = op;
+  return GetLengthPrefixedSlice(&value, &func);
+}
+
+bool ExtractList(const Slice& encoded_list, std::vector<Slice>& decoded_list) {
+  decoded_list.clear();
+  Slice list_slice = encoded_list;
+  Slice item;
+  while (GetLengthPrefixedSlice(&list_slice, &item)) {
+    decoded_list.push_back(item);
+  }
+  return list_slice.empty();
+}
+
+class AggMergeOperator::Accumulator {
+ public:
+  bool Add(const Slice& op, bool is_partial_aggregation) {
+    if (ignore_operands_) {
+      return true;
+    }
+    Slice my_func;
+    Slice my_value;
+    bool ret = ExtractAggFuncAndValue(op, my_func, my_value);
+    if (!ret) {
+      ignore_operands_ = true;
+      return true;
+    }
+
+    // Determine whether we need to do partial merge.
+    if (is_partial_aggregation && !my_func.empty()) {
+      auto f = func_map.find(my_func.ToString());
+      if (f == func_map.end() || !f->second->DoPartialAggregate()) {
+        return false;
+      }
+    }
+
+    if (!func_valid_) {
+      if (my_func != kUnnamedFuncName) {
+        func_ = my_func;
+        func_valid_ = true;
+      }
+    } else if (func_ != my_func) {
+      // User switched aggregation function. Need to aggregate the older
+      // one first.
+
+      // Previous aggreagion can't be done in partial merge
+      if (is_partial_aggregation) {
+        func_valid_ = false;
+        ignore_operands_ = true;
+        return false;
+      }
+
+      // We could consider stashing an iterator into the hash of aggregators
+      // to avoid repeated lookups when the aggregator doesn't change.
+      auto f = func_map.find(func_.ToString());
+      if (f == func_map.end() || !f->second->Aggregate(values_, scratch_)) {
+        func_valid_ = false;
+        ignore_operands_ = true;
+        return true;
+      }
+      std::swap(scratch_, aggregated_);
+      values_.clear();
+      values_.push_back(aggregated_);
+      func_ = my_func;
+    }
+    values_.push_back(my_value);
+    return true;
+  }
+
+  // Return false if aggregation fails.
+  // One possible reason
+  bool GetResult(std::string& result) {
+    if (!func_valid_) {
+      return false;
+    }
+    auto f = func_map.find(func_.ToString());
+    if (f == func_map.end()) {
+      return false;
+    }
+    if (!f->second->Aggregate(values_, scratch_)) {
+      return false;
+    }
+    result = EncodeAggFuncAndPayloadNoCheck(func_, scratch_);
+    return true;
+  }
+
+  void Clear() {
+    func_.clear();
+    values_.clear();
+    aggregated_.clear();
+    scratch_.clear();
+    ignore_operands_ = false;
+    func_valid_ = false;
+  }
+
+ private:
+  Slice func_;
+  std::vector<Slice> values_;
+  std::string aggregated_;
+  std::string scratch_;
+  bool ignore_operands_ = false;
+  bool func_valid_ = false;
+};
+
+// Creating and using a new Accumulator might invoke multiple malloc and is
+// expensive if it needs to be done when processing each merge operation.
+// AggMergeOperator's merge operators can be invoked concurrently by multiple
+// threads so we cannot simply create one Aggregator and reuse.
+// We use thread local instances instead.
+AggMergeOperator::Accumulator& AggMergeOperator::GetTLSAccumulator() {
+  static thread_local Accumulator tls_acc;
+  tls_acc.Clear();
+  return tls_acc;
+}
+
+void AggMergeOperator::PackAllMergeOperands(const MergeOperationInput& merge_in,
+                                            MergeOperationOutput& merge_out) {
+  merge_out.new_value = "";
+  PutLengthPrefixedSlice(&merge_out.new_value, kErrorFuncName);
+  if (merge_in.existing_value != nullptr) {
+    PutLengthPrefixedSlice(&merge_out.new_value, *merge_in.existing_value);
+  }
+  for (const Slice& op : merge_in.operand_list) {
+    PutLengthPrefixedSlice(&merge_out.new_value, op);
+  }
+}
+
+bool AggMergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+                                   MergeOperationOutput* merge_out) const {
+  Accumulator& agg = GetTLSAccumulator();
+  if (merge_in.existing_value != nullptr) {
+    agg.Add(*merge_in.existing_value, /*is_partial_aggregation=*/false);
+  }
+  for (const Slice& e : merge_in.operand_list) {
+    agg.Add(e, /*is_partial_aggregation=*/false);
+  }
+
+  bool succ = agg.GetResult(merge_out->new_value);
+  if (!succ) {
+    // If aggregation can't happen, pack all merge operands. In contrast to
+    // merge operator, we don't want to fail the DB. If users insert wrong
+    // format or call unregistered an aggregation function, we still hope
+    // the DB can continue functioning with other keys.
+    PackAllMergeOperands(merge_in, *merge_out);
+  }
+  agg.Clear();
+  return true;
+}
+
+bool AggMergeOperator::PartialMergeMulti(const Slice& /*key*/,
+                                         const std::deque<Slice>& operand_list,
+                                         std::string* new_value,
+                                         Logger* /*logger*/) const {
+  Accumulator& agg = GetTLSAccumulator();
+  bool do_aggregation = true;
+  for (const Slice& item : operand_list) {
+    do_aggregation = agg.Add(item, /*is_partial_aggregation=*/true);
+    if (!do_aggregation) {
+      break;
+    }
+  }
+  if (do_aggregation) {
+    do_aggregation = agg.GetResult(*new_value);
+  }
+  agg.Clear();
+  return do_aggregation;
+}
+
+std::shared_ptr<MergeOperator> GetAggMergeOperator() {
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<MergeOperator>, instance)
+  (std::make_shared<AggMergeOperator>());
+  assert(instance);
+  return instance;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/agg_merge/agg_merge.h b/src/rocksdb/utilities/agg_merge/agg_merge.h
new file mode 100644
index 000000000..00e58de08
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/agg_merge.h
@@ -0,0 +1,49 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <memory>
+#include <unordered_map>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/agg_merge.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+class AggMergeOperator : public MergeOperator {
+ public:
+  explicit AggMergeOperator();
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override;
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value, Logger* logger) const override;
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "AggMergeOperator.v1"; }
+
+  bool AllowSingleOperand() const override { return true; }
+
+  bool ShouldMerge(const std::vector<Slice>&) const override { return false; }
+
+ private:
+  class Accumulator;
+
+  // Pack all merge operands into one value. This is called when aggregation
+  // fails. The existing values are preserved and returned so that users can
+  // debug the problem.
+  static void PackAllMergeOperands(const MergeOperationInput& merge_in,
+                                   MergeOperationOutput& merge_out);
+  static Accumulator& GetTLSAccumulator();
+};
+
+extern std::string EncodeAggFuncAndPayloadNoCheck(const Slice& function_name,
+                                                  const Slice& value);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/agg_merge/agg_merge_test.cc b/src/rocksdb/utilities/agg_merge/agg_merge_test.cc
new file mode 100644
index 000000000..a65441cd0
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/agg_merge_test.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/agg_merge.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "rocksdb/options.h"
+#include "test_util/testharness.h"
+#include "utilities/agg_merge/agg_merge.h"
+#include "utilities/agg_merge/test_agg_merge.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class AggMergeTest : public DBTestBase {
+ public:
+  AggMergeTest() : DBTestBase("agg_merge_db_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(AggMergeTest, TestUsingMergeOperator) {
+  ASSERT_OK(AddAggregator("sum", std::make_unique<SumAggregator>()));
+  ASSERT_OK(AddAggregator("last3", std::make_unique<Last3Aggregator>()));
+  ASSERT_OK(AddAggregator("mul", std::make_unique<MultipleAggregator>()));
+
+  Options options = CurrentOptions();
+  options.merge_operator = GetAggMergeOperator();
+  Reopen(options);
+  std::string v = EncodeHelper::EncodeFuncAndInt("sum", 10);
+  ASSERT_OK(Merge("foo", v));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 20);
+  ASSERT_OK(Merge("foo", v));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 15);
+  ASSERT_OK(Merge("foo", v));
+
+  v = EncodeHelper::EncodeFuncAndList("last3", {"a", "b"});
+  ASSERT_OK(Merge("bar", v));
+  v = EncodeHelper::EncodeFuncAndList("last3", {"c", "d", "e"});
+  ASSERT_OK(Merge("bar", v));
+  ASSERT_OK(Flush());
+  v = EncodeHelper::EncodeFuncAndList("last3", {"f"});
+  ASSERT_OK(Merge("bar", v));
+
+  // Test Put() without aggregation type.
+  v = EncodeHelper::EncodeFuncAndInt(kUnnamedFuncName, 30);
+  ASSERT_OK(Put("foo2", v));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 10);
+  ASSERT_OK(Merge("foo2", v));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 20);
+  ASSERT_OK(Merge("foo2", v));
+
+  EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 45), Get("foo"));
+  EXPECT_EQ(EncodeHelper::EncodeFuncAndList("last3", {"f", "c", "d"}),
+            Get("bar"));
+  EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 60), Get("foo2"));
+
+  // Test changing aggregation type
+  v = EncodeHelper::EncodeFuncAndInt("mul", 10);
+  ASSERT_OK(Put("bar2", v));
+  v = EncodeHelper::EncodeFuncAndInt("mul", 20);
+  ASSERT_OK(Merge("bar2", v));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 30);
+  ASSERT_OK(Merge("bar2", v));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 40);
+  ASSERT_OK(Merge("bar2", v));
+  EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 10 * 20 + 30 + 40),
+            Get("bar2"));
+
+  // Changing aggregation type with partial merge
+  v = EncodeHelper::EncodeFuncAndInt("mul", 10);
+  ASSERT_OK(Merge("foo3", v));
+  ASSERT_OK(Flush());
+  v = EncodeHelper::EncodeFuncAndInt("mul", 10);
+  ASSERT_OK(Merge("foo3", v));
+  v = EncodeHelper::EncodeFuncAndInt("mul", 10);
+  ASSERT_OK(Merge("foo3", v));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 10);
+  ASSERT_OK(Merge("foo3", v));
+  ASSERT_OK(Flush());
+  EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 10 * 10 * 10 + 10),
+            Get("foo3"));
+
+  // Merge after full merge
+  v = EncodeHelper::EncodeFuncAndInt("sum", 1);
+  ASSERT_OK(Merge("foo4", v));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 2);
+  ASSERT_OK(Merge("foo4", v));
+  ASSERT_OK(Flush());
+  v = EncodeHelper::EncodeFuncAndInt("sum", 3);
+  ASSERT_OK(Merge("foo4", v));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 4);
+  ASSERT_OK(Merge("foo4", v));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  v = EncodeHelper::EncodeFuncAndInt("sum", 5);
+  ASSERT_OK(Merge("foo4", v));
+  EXPECT_EQ(EncodeHelper::EncodeFuncAndInt("sum", 15), Get("foo4"));
+
+  // Test unregistered function name
+  v = EncodeAggFuncAndPayloadNoCheck("non_existing", "1");
+  ASSERT_OK(Merge("bar3", v));
+  std::string v1;
+  v1 = EncodeAggFuncAndPayloadNoCheck("non_existing", "invalid");
+  ;
+  ASSERT_OK(Merge("bar3", v1));
+  EXPECT_EQ(EncodeAggFuncAndPayloadNoCheck(kErrorFuncName,
+                                           EncodeHelper::EncodeList({v, v1})),
+            Get("bar3"));
+
+  // invalidate input
+  ASSERT_OK(EncodeAggFuncAndPayload("sum", "invalid", v));
+  ASSERT_OK(Merge("bar4", v));
+  v1 = EncodeHelper::EncodeFuncAndInt("sum", 20);
+  ASSERT_OK(Merge("bar4", v1));
+  std::string aggregated_value = Get("bar4");
+  Slice func, payload;
+  ASSERT_TRUE(ExtractAggFuncAndValue(aggregated_value, func, payload));
+  EXPECT_EQ(kErrorFuncName, func);
+  std::vector<Slice> decoded_list;
+  ASSERT_TRUE(ExtractList(payload, decoded_list));
+  ASSERT_EQ(2, decoded_list.size());
+  ASSERT_EQ(v, decoded_list[0]);
+  ASSERT_EQ(v1, decoded_list[1]);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/agg_merge/test_agg_merge.cc b/src/rocksdb/utilities/agg_merge/test_agg_merge.cc
new file mode 100644
index 000000000..06e5b5697
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/test_agg_merge.cc
@@ -0,0 +1,104 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "test_agg_merge.h"
+
+#include <assert.h>
+
+#include <deque>
+#include <vector>
+
+#include "util/coding.h"
+#include "utilities/agg_merge/agg_merge.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string EncodeHelper::EncodeFuncAndInt(const Slice& function_name,
+                                           int64_t value) {
+  std::string encoded_value;
+  PutVarsignedint64(&encoded_value, value);
+  std::string ret;
+  Status s = EncodeAggFuncAndPayload(function_name, encoded_value, ret);
+  assert(s.ok());
+  return ret;
+}
+
+std::string EncodeHelper::EncodeInt(int64_t value) {
+  std::string encoded_value;
+  PutVarsignedint64(&encoded_value, value);
+  return encoded_value;
+}
+
+std::string EncodeHelper::EncodeFuncAndList(const Slice& function_name,
+                                            const std::vector<Slice>& list) {
+  std::string ret;
+  Status s = EncodeAggFuncAndPayload(function_name, EncodeList(list), ret);
+  assert(s.ok());
+  return ret;
+}
+
+std::string EncodeHelper::EncodeList(const std::vector<Slice>& list) {
+  std::string result;
+  for (const Slice& entity : list) {
+    PutLengthPrefixedSlice(&result, entity);
+  }
+  return result;
+}
+
+bool SumAggregator::Aggregate(const std::vector<Slice>& item_list,
+                              std::string& result) const {
+  int64_t sum = 0;
+  for (const Slice& item : item_list) {
+    int64_t ivalue;
+    Slice v = item;
+    if (!GetVarsignedint64(&v, &ivalue) || !v.empty()) {
+      return false;
+    }
+    sum += ivalue;
+  }
+  result = EncodeHelper::EncodeInt(sum);
+  return true;
+}
+
+bool MultipleAggregator::Aggregate(const std::vector<Slice>& item_list,
+                                   std::string& result) const {
+  int64_t mresult = 1;
+  for (const Slice& item : item_list) {
+    int64_t ivalue;
+    Slice v = item;
+    if (!GetVarsignedint64(&v, &ivalue) || !v.empty()) {
+      return false;
+    }
+    mresult *= ivalue;
+  }
+  result = EncodeHelper::EncodeInt(mresult);
+  return true;
+}
+
+bool Last3Aggregator::Aggregate(const std::vector<Slice>& item_list,
+                                std::string& result) const {
+  std::vector<Slice> last3;
+  last3.reserve(3);
+  for (auto it = item_list.rbegin(); it != item_list.rend(); ++it) {
+    Slice input = *it;
+    Slice entity;
+    bool ret;
+    while ((ret = GetLengthPrefixedSlice(&input, &entity)) == true) {
+      last3.push_back(entity);
+      if (last3.size() >= 3) {
+        break;
+      }
+    }
+    if (last3.size() >= 3) {
+      break;
+    }
+    if (!ret) {
+      continue;
+    }
+  }
+  result = EncodeHelper::EncodeList(last3);
+  return true;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/agg_merge/test_agg_merge.h b/src/rocksdb/utilities/agg_merge/test_agg_merge.h
new file mode 100644
index 000000000..5bdf8b9cc
--- /dev/null
+++ b/src/rocksdb/utilities/agg_merge/test_agg_merge.h
@@ -0,0 +1,47 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <memory>
+#include <unordered_map>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/agg_merge.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SumAggregator : public Aggregator {
+ public:
+  ~SumAggregator() override {}
+  bool Aggregate(const std::vector<Slice>&, std::string& result) const override;
+  bool DoPartialAggregate() const override { return true; }
+};
+
+class MultipleAggregator : public Aggregator {
+ public:
+  ~MultipleAggregator() override {}
+  bool Aggregate(const std::vector<Slice>&, std::string& result) const override;
+  bool DoPartialAggregate() const override { return true; }
+};
+
+class Last3Aggregator : public Aggregator {
+ public:
+  ~Last3Aggregator() override {}
+  bool Aggregate(const std::vector<Slice>&, std::string& result) const override;
+};
+
+class EncodeHelper {
+ public:
+  static std::string EncodeFuncAndInt(const Slice& function_name,
+                                      int64_t value);
+  static std::string EncodeInt(int64_t value);
+  static std::string EncodeList(const std::vector<Slice>& list);
+  static std::string EncodeFuncAndList(const Slice& function_name,
+                                       const std::vector<Slice>& list);
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/backup/backup_engine.cc b/src/rocksdb/utilities/backup/backup_engine.cc
new file mode 100644
index 000000000..81b4a6629
--- /dev/null
+++ b/src/rocksdb/utilities/backup/backup_engine.cc
@@ -0,0 +1,3181 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <cstdlib>
+#include <functional>
+#include <future>
+#include <limits>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "env/fs_readonly.h"
+#include "env/fs_remap.h"
+#include "file/filename.h"
+#include "file/line_file_reader.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/transaction_log.h"
+#include "table/sst_file_dumper.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/channel.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/math.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/backup/backup_engine_impl.h"
+#include "utilities/checkpoint/checkpoint_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+using ShareFilesNaming = BackupEngineOptions::ShareFilesNaming;
+
+constexpr BackupID kLatestBackupIDMarker = static_cast<BackupID>(-2);
+
+inline uint32_t ChecksumHexToInt32(const std::string& checksum_hex) {
+  std::string checksum_str;
+  Slice(checksum_hex).DecodeHex(&checksum_str);
+  return EndianSwapValue(DecodeFixed32(checksum_str.c_str()));
+}
+inline std::string ChecksumStrToHex(const std::string& checksum_str) {
+  return Slice(checksum_str).ToString(true);
+}
+inline std::string ChecksumInt32ToHex(const uint32_t& checksum_value) {
+  std::string checksum_str;
+  PutFixed32(&checksum_str, EndianSwapValue(checksum_value));
+  return ChecksumStrToHex(checksum_str);
+}
+
+const std::string kPrivateDirName = "private";
+const std::string kMetaDirName = "meta";
+const std::string kSharedDirName = "shared";
+const std::string kSharedChecksumDirName = "shared_checksum";
+const std::string kPrivateDirSlash = kPrivateDirName + "/";
+const std::string kMetaDirSlash = kMetaDirName + "/";
+const std::string kSharedDirSlash = kSharedDirName + "/";
+const std::string kSharedChecksumDirSlash = kSharedChecksumDirName + "/";
+
+}  // namespace
+
+void BackupStatistics::IncrementNumberSuccessBackup() {
+  number_success_backup++;
+}
+void BackupStatistics::IncrementNumberFailBackup() { number_fail_backup++; }
+
+uint32_t BackupStatistics::GetNumberSuccessBackup() const {
+  return number_success_backup;
+}
+uint32_t BackupStatistics::GetNumberFailBackup() const {
+  return number_fail_backup;
+}
+
+std::string BackupStatistics::ToString() const {
+  char result[50];
+  snprintf(result, sizeof(result), "# success backup: %u, # fail backup: %u",
+           GetNumberSuccessBackup(), GetNumberFailBackup());
+  return result;
+}
+
+void BackupEngineOptions::Dump(Logger* logger) const {
+  ROCKS_LOG_INFO(logger, "               Options.backup_dir: %s",
+                 backup_dir.c_str());
+  ROCKS_LOG_INFO(logger, "               Options.backup_env: %p", backup_env);
+  ROCKS_LOG_INFO(logger, "        Options.share_table_files: %d",
+                 static_cast<int>(share_table_files));
+  ROCKS_LOG_INFO(logger, "                 Options.info_log: %p", info_log);
+  ROCKS_LOG_INFO(logger, "                     Options.sync: %d",
+                 static_cast<int>(sync));
+  ROCKS_LOG_INFO(logger, "         Options.destroy_old_data: %d",
+                 static_cast<int>(destroy_old_data));
+  ROCKS_LOG_INFO(logger, "         Options.backup_log_files: %d",
+                 static_cast<int>(backup_log_files));
+  ROCKS_LOG_INFO(logger, "        Options.backup_rate_limit: %" PRIu64,
+                 backup_rate_limit);
+  ROCKS_LOG_INFO(logger, "       Options.restore_rate_limit: %" PRIu64,
+                 restore_rate_limit);
+  ROCKS_LOG_INFO(logger, "Options.max_background_operations: %d",
+                 max_background_operations);
+}
+
+namespace {
+// -------- BackupEngineImpl class ---------
+class BackupEngineImpl {
+ public:
+  BackupEngineImpl(const BackupEngineOptions& options, Env* db_env,
+                   bool read_only = false);
+  ~BackupEngineImpl();
+
+  IOStatus CreateNewBackupWithMetadata(const CreateBackupOptions& options,
+                                       DB* db, const std::string& app_metadata,
+                                       BackupID* new_backup_id_ptr);
+
+  IOStatus PurgeOldBackups(uint32_t num_backups_to_keep);
+
+  IOStatus DeleteBackup(BackupID backup_id);
+
+  void StopBackup() { stop_backup_.store(true, std::memory_order_release); }
+
+  IOStatus GarbageCollect();
+
+  // The returned BackupInfos are in chronological order, which means the
+  // latest backup comes last.
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info,
+                     bool include_file_details) const;
+
+  Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info,
+                       bool include_file_details = false) const;
+
+  void GetCorruptedBackups(std::vector<BackupID>* corrupt_backup_ids) const;
+
+  IOStatus RestoreDBFromBackup(const RestoreOptions& options,
+                               BackupID backup_id, const std::string& db_dir,
+                               const std::string& wal_dir) const;
+
+  IOStatus RestoreDBFromLatestBackup(const RestoreOptions& options,
+                                     const std::string& db_dir,
+                                     const std::string& wal_dir) const {
+    // Note: don't read latest_valid_backup_id_ outside of lock
+    return RestoreDBFromBackup(options, kLatestBackupIDMarker, db_dir, wal_dir);
+  }
+
+  IOStatus VerifyBackup(BackupID backup_id,
+                        bool verify_with_checksum = false) const;
+
+  IOStatus Initialize();
+
+  ShareFilesNaming GetNamingNoFlags() const {
+    return options_.share_files_with_checksum_naming &
+           BackupEngineOptions::kMaskNoNamingFlags;
+  }
+  ShareFilesNaming GetNamingFlags() const {
+    return options_.share_files_with_checksum_naming &
+           BackupEngineOptions::kMaskNamingFlags;
+  }
+
+  void TEST_SetDefaultRateLimitersClock(
+      const std::shared_ptr<SystemClock>& backup_rate_limiter_clock,
+      const std::shared_ptr<SystemClock>& restore_rate_limiter_clock) {
+    if (backup_rate_limiter_clock) {
+      static_cast<GenericRateLimiter*>(options_.backup_rate_limiter.get())
+          ->TEST_SetClock(backup_rate_limiter_clock);
+    }
+
+    if (restore_rate_limiter_clock) {
+      static_cast<GenericRateLimiter*>(options_.restore_rate_limiter.get())
+          ->TEST_SetClock(restore_rate_limiter_clock);
+    }
+  }
+
+ private:
+  void DeleteChildren(const std::string& dir,
+                      uint32_t file_type_filter = 0) const;
+  IOStatus DeleteBackupNoGC(BackupID backup_id);
+
+  // Extends the "result" map with pathname->size mappings for the contents of
+  // "dir" in "env". Pathnames are prefixed with "dir".
+  IOStatus ReadChildFileCurrentSizes(
+      const std::string& dir, const std::shared_ptr<FileSystem>&,
+      std::unordered_map<std::string, uint64_t>* result) const;
+
+  struct FileInfo {
+    FileInfo(const std::string& fname, uint64_t sz, const std::string& checksum,
+             const std::string& id, const std::string& sid, Temperature _temp)
+        : refs(0),
+          filename(fname),
+          size(sz),
+          checksum_hex(checksum),
+          db_id(id),
+          db_session_id(sid),
+          temp(_temp) {}
+
+    FileInfo(const FileInfo&) = delete;
+    FileInfo& operator=(const FileInfo&) = delete;
+
+    int refs;
+    const std::string filename;
+    const uint64_t size;
+    // crc32c checksum as hex. empty == unknown / unavailable
+    std::string checksum_hex;
+    // DB identities
+    // db_id is obtained for potential usage in the future but not used
+    // currently
+    const std::string db_id;
+    // db_session_id appears in the backup SST filename if the table naming
+    // option is kUseDbSessionId
+    const std::string db_session_id;
+    Temperature temp;
+
+    std::string GetDbFileName() {
+      std::string rv;
+      // extract the filename part
+      size_t slash = filename.find_last_of('/');
+      // file will either be shared/<file>, shared_checksum/<file_crc32c_size>,
+      // shared_checksum/<file_session>, shared_checksum/<file_crc32c_session>,
+      // or private/<number>/<file>
+      assert(slash != std::string::npos);
+      rv = filename.substr(slash + 1);
+
+      // if the file was in shared_checksum, extract the real file name
+      // in this case the file is <number>_<checksum>_<size>.<type>,
+      // <number>_<session>.<type>, or <number>_<checksum>_<session>.<type>
+      if (filename.substr(0, slash) == kSharedChecksumDirName) {
+        rv = GetFileFromChecksumFile(rv);
+      }
+      return rv;
+    }
+  };
+
+  // TODO: deprecate this function once we migrate all BackupEngine's rate
+  // limiting to lower-level ones (i.e, ones in file access wrapper level like
+  // `WritableFileWriter`)
+  static void LoopRateLimitRequestHelper(const size_t total_bytes_to_request,
+                                         RateLimiter* rate_limiter,
+                                         const Env::IOPriority pri,
+                                         Statistics* stats,
+                                         const RateLimiter::OpType op_type);
+
+  static inline std::string WithoutTrailingSlash(const std::string& path) {
+    if (path.empty() || path.back() != '/') {
+      return path;
+    } else {
+      return path.substr(path.size() - 1);
+    }
+  }
+
+  static inline std::string WithTrailingSlash(const std::string& path) {
+    if (path.empty() || path.back() != '/') {
+      return path + '/';
+    } else {
+      return path;
+    }
+  }
+
+  // A filesystem wrapper that makes shared backup files appear to be in the
+  // private backup directory (dst_dir), so that the private backup dir can
+  // be opened as a read-only DB.
+  class RemapSharedFileSystem : public RemapFileSystem {
+   public:
+    RemapSharedFileSystem(const std::shared_ptr<FileSystem>& base,
+                          const std::string& dst_dir,
+                          const std::string& src_base_dir,
+                          const std::vector<std::shared_ptr<FileInfo>>& files)
+        : RemapFileSystem(base),
+          dst_dir_(WithoutTrailingSlash(dst_dir)),
+          dst_dir_slash_(WithTrailingSlash(dst_dir)),
+          src_base_dir_(WithTrailingSlash(src_base_dir)) {
+      for (auto& info : files) {
+        if (!StartsWith(info->filename, kPrivateDirSlash)) {
+          assert(StartsWith(info->filename, kSharedDirSlash) ||
+                 StartsWith(info->filename, kSharedChecksumDirSlash));
+          remaps_[info->GetDbFileName()] = info;
+        }
+      }
+    }
+
+    const char* Name() const override {
+      return "BackupEngineImpl::RemapSharedFileSystem";
+    }
+
+    // Sometimes a directory listing is required in opening a DB
+    IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+                         std::vector<std::string>* result,
+                         IODebugContext* dbg) override {
+      IOStatus s = RemapFileSystem::GetChildren(dir, options, result, dbg);
+      if (s.ok() && (dir == dst_dir_ || dir == dst_dir_slash_)) {
+        // Assume remapped files exist
+        for (auto& r : remaps_) {
+          result->push_back(r.first);
+        }
+      }
+      return s;
+    }
+
+    // Sometimes a directory listing is required in opening a DB
+    IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                       const IOOptions& options,
+                                       std::vector<FileAttributes>* result,
+                                       IODebugContext* dbg) override {
+      IOStatus s =
+          RemapFileSystem::GetChildrenFileAttributes(dir, options, result, dbg);
+      if (s.ok() && (dir == dst_dir_ || dir == dst_dir_slash_)) {
+        // Assume remapped files exist with recorded size
+        for (auto& r : remaps_) {
+          result->emplace_back();  // clean up with C++20
+          FileAttributes& attr = result->back();
+          attr.name = r.first;
+          attr.size_bytes = r.second->size;
+        }
+      }
+      return s;
+    }
+
+   protected:
+    // When a file in dst_dir is requested, see if we need to remap to shared
+    // file path.
+    std::pair<IOStatus, std::string> EncodePath(
+        const std::string& path) override {
+      if (path.empty() || path[0] != '/') {
+        return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""};
+      }
+      std::pair<IOStatus, std::string> rv{IOStatus(), path};
+      if (StartsWith(path, dst_dir_slash_)) {
+        std::string relative = path.substr(dst_dir_slash_.size());
+        auto it = remaps_.find(relative);
+        if (it != remaps_.end()) {
+          rv.second = src_base_dir_ + it->second->filename;
+        }
+      }
+      return rv;
+    }
+
+   private:
+    // Absolute path to a directory that some extra files will be mapped into.
+    const std::string dst_dir_;
+    // Includes a trailing slash.
+    const std::string dst_dir_slash_;
+    // Absolute path to a directory containing some files to be mapped into
+    // dst_dir_. Includes a trailing slash.
+    const std::string src_base_dir_;
+    // If remaps_[x] exists, attempt to read dst_dir_ / x should instead read
+    // src_base_dir_ / remaps_[x]->filename. FileInfo is used to maximize
+    // sharing with other backup data in memory.
+    std::unordered_map<std::string, std::shared_ptr<FileInfo>> remaps_;
+  };
+
+  class BackupMeta {
+   public:
+    BackupMeta(
+        const std::string& meta_filename, const std::string& meta_tmp_filename,
+        std::unordered_map<std::string, std::shared_ptr<FileInfo>>* file_infos,
+        Env* env, const std::shared_ptr<FileSystem>& fs)
+        : timestamp_(0),
+          sequence_number_(0),
+          size_(0),
+          meta_filename_(meta_filename),
+          meta_tmp_filename_(meta_tmp_filename),
+          file_infos_(file_infos),
+          env_(env),
+          fs_(fs) {}
+
+    BackupMeta(const BackupMeta&) = delete;
+    BackupMeta& operator=(const BackupMeta&) = delete;
+
+    ~BackupMeta() {}
+
+    void RecordTimestamp() {
+      // Best effort
+      Status s = env_->GetCurrentTime(&timestamp_);
+      if (!s.ok()) {
+        timestamp_ = /* something clearly fabricated */ 1;
+      }
+    }
+    int64_t GetTimestamp() const { return timestamp_; }
+    uint64_t GetSize() const { return size_; }
+    uint32_t GetNumberFiles() const {
+      return static_cast<uint32_t>(files_.size());
+    }
+    void SetSequenceNumber(uint64_t sequence_number) {
+      sequence_number_ = sequence_number;
+    }
+    uint64_t GetSequenceNumber() const { return sequence_number_; }
+
+    const std::string& GetAppMetadata() const { return app_metadata_; }
+
+    void SetAppMetadata(const std::string& app_metadata) {
+      app_metadata_ = app_metadata;
+    }
+
+    IOStatus AddFile(std::shared_ptr<FileInfo> file_info);
+
+    IOStatus Delete(bool delete_meta = true);
+
+    bool Empty() const { return files_.empty(); }
+
+    std::shared_ptr<FileInfo> GetFile(const std::string& filename) const {
+      auto it = file_infos_->find(filename);
+      if (it == file_infos_->end()) {
+        return nullptr;
+      }
+      return it->second;
+    }
+
+    const std::vector<std::shared_ptr<FileInfo>>& GetFiles() const {
+      return files_;
+    }
+
+    // @param abs_path_to_size Pre-fetched file sizes (bytes).
+    IOStatus LoadFromFile(
+        const std::string& backup_dir,
+        const std::unordered_map<std::string, uint64_t>& abs_path_to_size,
+        RateLimiter* rate_limiter, Logger* info_log,
+        std::unordered_set<std::string>* reported_ignored_fields);
+    IOStatus StoreToFile(
+        bool sync, int schema_version,
+        const TEST_BackupMetaSchemaOptions* schema_test_options);
+
+    std::string GetInfoString() {
+      std::ostringstream ss;
+      ss << "Timestamp: " << timestamp_ << std::endl;
+      char human_size[16];
+      AppendHumanBytes(size_, human_size, sizeof(human_size));
+      ss << "Size: " << human_size << std::endl;
+      ss << "Files:" << std::endl;
+      for (const auto& file : files_) {
+        AppendHumanBytes(file->size, human_size, sizeof(human_size));
+        ss << file->filename << ", size " << human_size << ", refs "
+           << file->refs << std::endl;
+      }
+      return ss.str();
+    }
+
+    const std::shared_ptr<Env>& GetEnvForOpen() const {
+      if (!env_for_open_) {
+        // Lazy initialize
+        // Find directories
+        std::string dst_dir = meta_filename_;
+        auto i = dst_dir.rfind(kMetaDirSlash);
+        assert(i != std::string::npos);
+        std::string src_base_dir = dst_dir.substr(0, i);
+        dst_dir.replace(i, kMetaDirSlash.size(), kPrivateDirSlash);
+        // Make the RemapSharedFileSystem
+        std::shared_ptr<FileSystem> remap_fs =
+            std::make_shared<RemapSharedFileSystem>(fs_, dst_dir, src_base_dir,
+                                                    files_);
+        // Make it read-only for safety
+        remap_fs = std::make_shared<ReadOnlyFileSystem>(remap_fs);
+        // Make an Env wrapper
+        env_for_open_ = std::make_shared<CompositeEnvWrapper>(env_, remap_fs);
+      }
+      return env_for_open_;
+    }
+
+   private:
+    int64_t timestamp_;
+    // sequence number is only approximate, should not be used
+    // by clients
+    uint64_t sequence_number_;
+    uint64_t size_;
+    std::string app_metadata_;
+    std::string const meta_filename_;
+    std::string const meta_tmp_filename_;
+    // files with relative paths (without "/" prefix!!)
+    std::vector<std::shared_ptr<FileInfo>> files_;
+    std::unordered_map<std::string, std::shared_ptr<FileInfo>>* file_infos_;
+    Env* env_;
+    mutable std::shared_ptr<Env> env_for_open_;
+    std::shared_ptr<FileSystem> fs_;
+    IOOptions iooptions_ = IOOptions();
+  };  // BackupMeta
+
+  void SetBackupInfoFromBackupMeta(BackupID id, const BackupMeta& meta,
+                                   BackupInfo* backup_info,
+                                   bool include_file_details) const;
+
+  inline std::string GetAbsolutePath(
+      const std::string& relative_path = "") const {
+    assert(relative_path.size() == 0 || relative_path[0] != '/');
+    return options_.backup_dir + "/" + relative_path;
+  }
+  inline std::string GetPrivateFileRel(BackupID backup_id, bool tmp = false,
+                                       const std::string& file = "") const {
+    assert(file.size() == 0 || file[0] != '/');
+    return kPrivateDirSlash + std::to_string(backup_id) + (tmp ? ".tmp" : "") +
+           "/" + file;
+  }
+  inline std::string GetSharedFileRel(const std::string& file = "",
+                                      bool tmp = false) const {
+    assert(file.size() == 0 || file[0] != '/');
+    return kSharedDirSlash + std::string(tmp ? "." : "") + file +
+           (tmp ? ".tmp" : "");
+  }
+  inline std::string GetSharedFileWithChecksumRel(const std::string& file = "",
+                                                  bool tmp = false) const {
+    assert(file.size() == 0 || file[0] != '/');
+    return kSharedChecksumDirSlash + std::string(tmp ? "." : "") + file +
+           (tmp ? ".tmp" : "");
+  }
+  inline bool UseLegacyNaming(const std::string& sid) const {
+    return GetNamingNoFlags() ==
+               BackupEngineOptions::kLegacyCrc32cAndFileSize ||
+           sid.empty();
+  }
+  inline std::string GetSharedFileWithChecksum(
+      const std::string& file, const std::string& checksum_hex,
+      const uint64_t file_size, const std::string& db_session_id) const {
+    assert(file.size() == 0 || file[0] != '/');
+    std::string file_copy = file;
+    if (UseLegacyNaming(db_session_id)) {
+      assert(!checksum_hex.empty());
+      file_copy.insert(file_copy.find_last_of('.'),
+                       "_" + std::to_string(ChecksumHexToInt32(checksum_hex)) +
+                           "_" + std::to_string(file_size));
+    } else {
+      file_copy.insert(file_copy.find_last_of('.'), "_s" + db_session_id);
+      if (GetNamingFlags() & BackupEngineOptions::kFlagIncludeFileSize) {
+        file_copy.insert(file_copy.find_last_of('.'),
+                         "_" + std::to_string(file_size));
+      }
+    }
+    return file_copy;
+  }
+  static inline std::string GetFileFromChecksumFile(const std::string& file) {
+    assert(file.size() == 0 || file[0] != '/');
+    std::string file_copy = file;
+    size_t first_underscore = file_copy.find_first_of('_');
+    return file_copy.erase(first_underscore,
+                           file_copy.find_last_of('.') - first_underscore);
+  }
+  inline std::string GetBackupMetaFile(BackupID backup_id, bool tmp) const {
+    return GetAbsolutePath(kMetaDirName) + "/" + (tmp ? "." : "") +
+           std::to_string(backup_id) + (tmp ? ".tmp" : "");
+  }
+
+  // If size_limit == 0, there is no size limit, copy everything.
+  //
+  // Exactly one of src and contents must be non-empty.
+  //
+  // @param src If non-empty, the file is copied from this pathname.
+  // @param contents If non-empty, the file will be created with these contents.
+  // @param src_temperature Pass in expected temperature of src, return back
+  // temperature reported by FileSystem
+  IOStatus CopyOrCreateFile(const std::string& src, const std::string& dst,
+                            const std::string& contents, uint64_t size_limit,
+                            Env* src_env, Env* dst_env,
+                            const EnvOptions& src_env_options, bool sync,
+                            RateLimiter* rate_limiter,
+                            std::function<void()> progress_callback,
+                            Temperature* src_temperature,
+                            Temperature dst_temperature,
+                            uint64_t* bytes_toward_next_callback,
+                            uint64_t* size, std::string* checksum_hex);
+
+  IOStatus ReadFileAndComputeChecksum(const std::string& src,
+                                      const std::shared_ptr<FileSystem>& src_fs,
+                                      const EnvOptions& src_env_options,
+                                      uint64_t size_limit,
+                                      std::string* checksum_hex,
+                                      const Temperature src_temperature) const;
+
+  // Obtain db_id and db_session_id from the table properties of file_path
+  Status GetFileDbIdentities(Env* src_env, const EnvOptions& src_env_options,
+                             const std::string& file_path,
+                             Temperature file_temp, RateLimiter* rate_limiter,
+                             std::string* db_id, std::string* db_session_id);
+
+  struct CopyOrCreateResult {
+    ~CopyOrCreateResult() {
+      // The Status needs to be ignored here for two reasons.
+      // First, if the BackupEngineImpl shuts down with jobs outstanding, then
+      // it is possible that the Status in the future/promise is never read,
+      // resulting in an unchecked Status. Second, if there are items in the
+      // channel when the BackupEngineImpl is shutdown, these will also have
+      // Status that have not been checked.  This
+      // TODO: Fix those issues so that the Status
+      io_status.PermitUncheckedError();
+    }
+    uint64_t size;
+    std::string checksum_hex;
+    std::string db_id;
+    std::string db_session_id;
+    IOStatus io_status;
+    Temperature expected_src_temperature = Temperature::kUnknown;
+    Temperature current_src_temperature = Temperature::kUnknown;
+  };
+
+  // Exactly one of src_path and contents must be non-empty. If src_path is
+  // non-empty, the file is copied from this pathname. Otherwise, if contents is
+  // non-empty, the file will be created at dst_path with these contents.
+  struct CopyOrCreateWorkItem {
+    std::string src_path;
+    std::string dst_path;
+    Temperature src_temperature;
+    Temperature dst_temperature;
+    std::string contents;
+    Env* src_env;
+    Env* dst_env;
+    EnvOptions src_env_options;
+    bool sync;
+    RateLimiter* rate_limiter;
+    uint64_t size_limit;
+    Statistics* stats;
+    std::promise<CopyOrCreateResult> result;
+    std::function<void()> progress_callback;
+    std::string src_checksum_func_name;
+    std::string src_checksum_hex;
+    std::string db_id;
+    std::string db_session_id;
+
+    CopyOrCreateWorkItem()
+        : src_path(""),
+          dst_path(""),
+          src_temperature(Temperature::kUnknown),
+          dst_temperature(Temperature::kUnknown),
+          contents(""),
+          src_env(nullptr),
+          dst_env(nullptr),
+          src_env_options(),
+          sync(false),
+          rate_limiter(nullptr),
+          size_limit(0),
+          stats(nullptr),
+          src_checksum_func_name(kUnknownFileChecksumFuncName),
+          src_checksum_hex(""),
+          db_id(""),
+          db_session_id("") {}
+
+    CopyOrCreateWorkItem(const CopyOrCreateWorkItem&) = delete;
+    CopyOrCreateWorkItem& operator=(const CopyOrCreateWorkItem&) = delete;
+
+    CopyOrCreateWorkItem(CopyOrCreateWorkItem&& o) noexcept {
+      *this = std::move(o);
+    }
+
+    CopyOrCreateWorkItem& operator=(CopyOrCreateWorkItem&& o) noexcept {
+      src_path = std::move(o.src_path);
+      dst_path = std::move(o.dst_path);
+      src_temperature = std::move(o.src_temperature);
+      dst_temperature = std::move(o.dst_temperature);
+      contents = std::move(o.contents);
+      src_env = o.src_env;
+      dst_env = o.dst_env;
+      src_env_options = std::move(o.src_env_options);
+      sync = o.sync;
+      rate_limiter = o.rate_limiter;
+      size_limit = o.size_limit;
+      stats = o.stats;
+      result = std::move(o.result);
+      progress_callback = std::move(o.progress_callback);
+      src_checksum_func_name = std::move(o.src_checksum_func_name);
+      src_checksum_hex = std::move(o.src_checksum_hex);
+      db_id = std::move(o.db_id);
+      db_session_id = std::move(o.db_session_id);
+      src_temperature = o.src_temperature;
+      return *this;
+    }
+
+    CopyOrCreateWorkItem(
+        std::string _src_path, std::string _dst_path,
+        const Temperature _src_temperature, const Temperature _dst_temperature,
+        std::string _contents, Env* _src_env, Env* _dst_env,
+        EnvOptions _src_env_options, bool _sync, RateLimiter* _rate_limiter,
+        uint64_t _size_limit, Statistics* _stats,
+        std::function<void()> _progress_callback = []() {},
+        const std::string& _src_checksum_func_name =
+            kUnknownFileChecksumFuncName,
+        const std::string& _src_checksum_hex = "",
+        const std::string& _db_id = "", const std::string& _db_session_id = "")
+        : src_path(std::move(_src_path)),
+          dst_path(std::move(_dst_path)),
+          src_temperature(_src_temperature),
+          dst_temperature(_dst_temperature),
+          contents(std::move(_contents)),
+          src_env(_src_env),
+          dst_env(_dst_env),
+          src_env_options(std::move(_src_env_options)),
+          sync(_sync),
+          rate_limiter(_rate_limiter),
+          size_limit(_size_limit),
+          stats(_stats),
+          progress_callback(_progress_callback),
+          src_checksum_func_name(_src_checksum_func_name),
+          src_checksum_hex(_src_checksum_hex),
+          db_id(_db_id),
+          db_session_id(_db_session_id) {}
+  };
+
+  struct BackupAfterCopyOrCreateWorkItem {
+    std::future<CopyOrCreateResult> result;
+    bool shared;
+    bool needed_to_copy;
+    Env* backup_env;
+    std::string dst_path_tmp;
+    std::string dst_path;
+    std::string dst_relative;
+    BackupAfterCopyOrCreateWorkItem()
+        : shared(false),
+          needed_to_copy(false),
+          backup_env(nullptr),
+          dst_path_tmp(""),
+          dst_path(""),
+          dst_relative("") {}
+
+    BackupAfterCopyOrCreateWorkItem(
+        BackupAfterCopyOrCreateWorkItem&& o) noexcept {
+      *this = std::move(o);
+    }
+
+    BackupAfterCopyOrCreateWorkItem& operator=(
+        BackupAfterCopyOrCreateWorkItem&& o) noexcept {
+      result = std::move(o.result);
+      shared = o.shared;
+      needed_to_copy = o.needed_to_copy;
+      backup_env = o.backup_env;
+      dst_path_tmp = std::move(o.dst_path_tmp);
+      dst_path = std::move(o.dst_path);
+      dst_relative = std::move(o.dst_relative);
+      return *this;
+    }
+
+    BackupAfterCopyOrCreateWorkItem(std::future<CopyOrCreateResult>&& _result,
+                                    bool _shared, bool _needed_to_copy,
+                                    Env* _backup_env, std::string _dst_path_tmp,
+                                    std::string _dst_path,
+                                    std::string _dst_relative)
+        : result(std::move(_result)),
+          shared(_shared),
+          needed_to_copy(_needed_to_copy),
+          backup_env(_backup_env),
+          dst_path_tmp(std::move(_dst_path_tmp)),
+          dst_path(std::move(_dst_path)),
+          dst_relative(std::move(_dst_relative)) {}
+  };
+
+  struct RestoreAfterCopyOrCreateWorkItem {
+    std::future<CopyOrCreateResult> result;
+    std::string from_file;
+    std::string to_file;
+    std::string checksum_hex;
+    RestoreAfterCopyOrCreateWorkItem() : checksum_hex("") {}
+    RestoreAfterCopyOrCreateWorkItem(std::future<CopyOrCreateResult>&& _result,
+                                     const std::string& _from_file,
+                                     const std::string& _to_file,
+                                     const std::string& _checksum_hex)
+        : result(std::move(_result)),
+          from_file(_from_file),
+          to_file(_to_file),
+          checksum_hex(_checksum_hex) {}
+    RestoreAfterCopyOrCreateWorkItem(
+        RestoreAfterCopyOrCreateWorkItem&& o) noexcept {
+      *this = std::move(o);
+    }
+
+    RestoreAfterCopyOrCreateWorkItem& operator=(
+        RestoreAfterCopyOrCreateWorkItem&& o) noexcept {
+      result = std::move(o.result);
+      checksum_hex = std::move(o.checksum_hex);
+      return *this;
+    }
+  };
+
+  bool initialized_;
+  std::mutex byte_report_mutex_;
+  mutable channel<CopyOrCreateWorkItem> files_to_copy_or_create_;
+  std::vector<port::Thread> threads_;
+  std::atomic<CpuPriority> threads_cpu_priority_;
+
+  // Certain operations like PurgeOldBackups and DeleteBackup will trigger
+  // automatic GarbageCollect (true) unless we've already done one in this
+  // session and have not failed to delete backup files since then (false).
+  bool might_need_garbage_collect_ = true;
+
+  // Adds a file to the backup work queue to be copied or created if it doesn't
+  // already exist.
+  //
+  // Exactly one of src_dir and contents must be non-empty.
+  //
+  // @param src_dir If non-empty, the file in this directory named fname will be
+  //    copied.
+  // @param fname Name of destination file and, in case of copy, source file.
+  // @param contents If non-empty, the file will be created with these contents.
+  IOStatus AddBackupFileWorkItem(
+      std::unordered_set<std::string>& live_dst_paths,
+      std::vector<BackupAfterCopyOrCreateWorkItem>& backup_items_to_finish,
+      BackupID backup_id, bool shared, const std::string& src_dir,
+      const std::string& fname,  // starts with "/"
+      const EnvOptions& src_env_options, RateLimiter* rate_limiter,
+      FileType file_type, uint64_t size_bytes, Statistics* stats,
+      uint64_t size_limit = 0, bool shared_checksum = false,
+      std::function<void()> progress_callback = []() {},
+      const std::string& contents = std::string(),
+      const std::string& src_checksum_func_name = kUnknownFileChecksumFuncName,
+      const std::string& src_checksum_str = kUnknownFileChecksum,
+      const Temperature src_temperature = Temperature::kUnknown);
+
+  // backup state data
+  BackupID latest_backup_id_;
+  BackupID latest_valid_backup_id_;
+  std::map<BackupID, std::unique_ptr<BackupMeta>> backups_;
+  std::map<BackupID, std::pair<IOStatus, std::unique_ptr<BackupMeta>>>
+      corrupt_backups_;
+  std::unordered_map<std::string, std::shared_ptr<FileInfo>>
+      backuped_file_infos_;
+  std::atomic<bool> stop_backup_;
+
+  // options data
+  BackupEngineOptions options_;
+  Env* db_env_;
+  Env* backup_env_;
+
+  // directories
+  std::unique_ptr<FSDirectory> backup_directory_;
+  std::unique_ptr<FSDirectory> shared_directory_;
+  std::unique_ptr<FSDirectory> meta_directory_;
+  std::unique_ptr<FSDirectory> private_directory_;
+
+  static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL;  // 5MB
+  bool read_only_;
+  BackupStatistics backup_statistics_;
+  std::unordered_set<std::string> reported_ignored_fields_;
+  static const size_t kMaxAppMetaSize = 1024 * 1024;  // 1MB
+  std::shared_ptr<FileSystem> db_fs_;
+  std::shared_ptr<FileSystem> backup_fs_;
+  IOOptions io_options_ = IOOptions();
+
+ public:
+  std::unique_ptr<TEST_BackupMetaSchemaOptions> schema_test_options_;
+};
+
+// -------- BackupEngineImplThreadSafe class ---------
+// This locking layer for thread safety in the public API is layered on
+// top to prevent accidental recursive locking with RWMutex, which is UB.
+// Note: BackupEngineReadOnlyBase inherited twice, but has no fields
+class BackupEngineImplThreadSafe : public BackupEngine,
+                                   public BackupEngineReadOnly {
+ public:
+  BackupEngineImplThreadSafe(const BackupEngineOptions& options, Env* db_env,
+                             bool read_only = false)
+      : impl_(options, db_env, read_only) {}
+  ~BackupEngineImplThreadSafe() override {}
+
+  using BackupEngine::CreateNewBackupWithMetadata;
+  IOStatus CreateNewBackupWithMetadata(const CreateBackupOptions& options,
+                                       DB* db, const std::string& app_metadata,
+                                       BackupID* new_backup_id) override {
+    WriteLock lock(&mutex_);
+    return impl_.CreateNewBackupWithMetadata(options, db, app_metadata,
+                                             new_backup_id);
+  }
+
+  IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) override {
+    WriteLock lock(&mutex_);
+    return impl_.PurgeOldBackups(num_backups_to_keep);
+  }
+
+  IOStatus DeleteBackup(BackupID backup_id) override {
+    WriteLock lock(&mutex_);
+    return impl_.DeleteBackup(backup_id);
+  }
+
+  void StopBackup() override {
+    // No locking needed
+    impl_.StopBackup();
+  }
+
+  IOStatus GarbageCollect() override {
+    WriteLock lock(&mutex_);
+    return impl_.GarbageCollect();
+  }
+
+  Status GetLatestBackupInfo(BackupInfo* backup_info,
+                             bool include_file_details = false) const override {
+    ReadLock lock(&mutex_);
+    return impl_.GetBackupInfo(kLatestBackupIDMarker, backup_info,
+                               include_file_details);
+  }
+
+  Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info,
+                       bool include_file_details = false) const override {
+    ReadLock lock(&mutex_);
+    return impl_.GetBackupInfo(backup_id, backup_info, include_file_details);
+  }
+
+  void GetBackupInfo(std::vector<BackupInfo>* backup_info,
+                     bool include_file_details) const override {
+    ReadLock lock(&mutex_);
+    impl_.GetBackupInfo(backup_info, include_file_details);
+  }
+
+  void GetCorruptedBackups(
+      std::vector<BackupID>* corrupt_backup_ids) const override {
+    ReadLock lock(&mutex_);
+    impl_.GetCorruptedBackups(corrupt_backup_ids);
+  }
+
+  using BackupEngine::RestoreDBFromBackup;
+  IOStatus RestoreDBFromBackup(const RestoreOptions& options,
+                               BackupID backup_id, const std::string& db_dir,
+                               const std::string& wal_dir) const override {
+    ReadLock lock(&mutex_);
+    return impl_.RestoreDBFromBackup(options, backup_id, db_dir, wal_dir);
+  }
+
+  using BackupEngine::RestoreDBFromLatestBackup;
+  IOStatus RestoreDBFromLatestBackup(
+      const RestoreOptions& options, const std::string& db_dir,
+      const std::string& wal_dir) const override {
+    // Defer to above function, which locks
+    return RestoreDBFromBackup(options, kLatestBackupIDMarker, db_dir, wal_dir);
+  }
+
+  IOStatus VerifyBackup(BackupID backup_id,
+                        bool verify_with_checksum = false) const override {
+    ReadLock lock(&mutex_);
+    return impl_.VerifyBackup(backup_id, verify_with_checksum);
+  }
+
+  // Not public API but needed
+  IOStatus Initialize() {
+    // No locking needed
+    return impl_.Initialize();
+  }
+
+  // Not public API but used in testing
+  void TEST_SetBackupMetaSchemaOptions(
+      const TEST_BackupMetaSchemaOptions& options) {
+    impl_.schema_test_options_.reset(new TEST_BackupMetaSchemaOptions(options));
+  }
+
+  // Not public API but used in testing
+  void TEST_SetDefaultRateLimitersClock(
+      const std::shared_ptr<SystemClock>& backup_rate_limiter_clock = nullptr,
+      const std::shared_ptr<SystemClock>& restore_rate_limiter_clock =
+          nullptr) {
+    impl_.TEST_SetDefaultRateLimitersClock(backup_rate_limiter_clock,
+                                           restore_rate_limiter_clock);
+  }
+
+ private:
+  mutable port::RWMutex mutex_;
+  BackupEngineImpl impl_;
+};
+}  // namespace
+
+IOStatus BackupEngine::Open(const BackupEngineOptions& options, Env* env,
+                            BackupEngine** backup_engine_ptr) {
+  std::unique_ptr<BackupEngineImplThreadSafe> backup_engine(
+      new BackupEngineImplThreadSafe(options, env));
+  auto s = backup_engine->Initialize();
+  if (!s.ok()) {
+    *backup_engine_ptr = nullptr;
+    return s;
+  }
+  *backup_engine_ptr = backup_engine.release();
+  return IOStatus::OK();
+}
+
+namespace {
+BackupEngineImpl::BackupEngineImpl(const BackupEngineOptions& options,
+                                   Env* db_env, bool read_only)
+    : initialized_(false),
+      threads_cpu_priority_(),
+      latest_backup_id_(0),
+      latest_valid_backup_id_(0),
+      stop_backup_(false),
+      options_(options),
+      db_env_(db_env),
+      backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_),
+      read_only_(read_only) {
+  if (options_.backup_rate_limiter == nullptr &&
+      options_.backup_rate_limit > 0) {
+    options_.backup_rate_limiter.reset(
+        NewGenericRateLimiter(options_.backup_rate_limit));
+  }
+  if (options_.restore_rate_limiter == nullptr &&
+      options_.restore_rate_limit > 0) {
+    options_.restore_rate_limiter.reset(
+        NewGenericRateLimiter(options_.restore_rate_limit));
+  }
+  db_fs_ = db_env_->GetFileSystem();
+  backup_fs_ = backup_env_->GetFileSystem();
+}
+
+BackupEngineImpl::~BackupEngineImpl() {
+  files_to_copy_or_create_.sendEof();
+  for (auto& t : threads_) {
+    t.join();
+  }
+  LogFlush(options_.info_log);
+  for (const auto& it : corrupt_backups_) {
+    it.second.first.PermitUncheckedError();
+  }
+}
+
+IOStatus BackupEngineImpl::Initialize() {
+  assert(!initialized_);
+  initialized_ = true;
+  if (read_only_) {
+    ROCKS_LOG_INFO(options_.info_log, "Starting read_only backup engine");
+  }
+  options_.Dump(options_.info_log);
+
+  auto meta_path = GetAbsolutePath(kMetaDirName);
+
+  if (!read_only_) {
+    // we might need to clean up from previous crash or I/O errors
+    might_need_garbage_collect_ = true;
+
+    if (options_.max_valid_backups_to_open !=
+        std::numeric_limits<int32_t>::max()) {
+      options_.max_valid_backups_to_open = std::numeric_limits<int32_t>::max();
+      ROCKS_LOG_WARN(
+          options_.info_log,
+          "`max_valid_backups_to_open` is not set to the default value. "
+          "Ignoring its value since BackupEngine is not read-only.");
+    }
+
+    // gather the list of directories that we need to create
+    std::vector<std::pair<std::string, std::unique_ptr<FSDirectory>*>>
+        directories;
+    directories.emplace_back(GetAbsolutePath(), &backup_directory_);
+    if (options_.share_table_files) {
+      if (options_.share_files_with_checksum) {
+        directories.emplace_back(
+            GetAbsolutePath(GetSharedFileWithChecksumRel()),
+            &shared_directory_);
+      } else {
+        directories.emplace_back(GetAbsolutePath(GetSharedFileRel()),
+                                 &shared_directory_);
+      }
+    }
+    directories.emplace_back(GetAbsolutePath(kPrivateDirName),
+                             &private_directory_);
+    directories.emplace_back(meta_path, &meta_directory_);
+    // create all the dirs we need
+    for (const auto& d : directories) {
+      IOStatus io_s =
+          backup_fs_->CreateDirIfMissing(d.first, io_options_, nullptr);
+      if (io_s.ok()) {
+        io_s =
+            backup_fs_->NewDirectory(d.first, io_options_, d.second, nullptr);
+      }
+      if (!io_s.ok()) {
+        return io_s;
+      }
+    }
+  }
+
+  std::vector<std::string> backup_meta_files;
+  {
+    IOStatus io_s = backup_fs_->GetChildren(meta_path, io_options_,
+                                            &backup_meta_files, nullptr);
+    if (io_s.IsNotFound()) {
+      return IOStatus::NotFound(meta_path + " is missing");
+    } else if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+  // create backups_ structure
+  for (auto& file : backup_meta_files) {
+    ROCKS_LOG_INFO(options_.info_log, "Detected backup %s", file.c_str());
+    BackupID backup_id = 0;
+    sscanf(file.c_str(), "%u", &backup_id);
+    if (backup_id == 0 || file != std::to_string(backup_id)) {
+      // Invalid file name, will be deleted with auto-GC when user
+      // initiates an append or write operation. (Behave as read-only until
+      // then.)
+      ROCKS_LOG_INFO(options_.info_log, "Skipping unrecognized meta file %s",
+                     file.c_str());
+      continue;
+    }
+    assert(backups_.find(backup_id) == backups_.end());
+    // Insert all the (backup_id, BackupMeta) that will be loaded later
+    // The loading performed later will check whether there are corrupt backups
+    // and move the corrupt backups to corrupt_backups_
+    backups_.insert(std::make_pair(
+        backup_id, std::unique_ptr<BackupMeta>(new BackupMeta(
+                       GetBackupMetaFile(backup_id, false /* tmp */),
+                       GetBackupMetaFile(backup_id, true /* tmp */),
+                       &backuped_file_infos_, backup_env_, backup_fs_))));
+  }
+
+  latest_backup_id_ = 0;
+  latest_valid_backup_id_ = 0;
+  if (options_.destroy_old_data) {  // Destroy old data
+    assert(!read_only_);
+    ROCKS_LOG_INFO(
+        options_.info_log,
+        "Backup Engine started with destroy_old_data == true, deleting all "
+        "backups");
+    IOStatus io_s = PurgeOldBackups(0);
+    if (io_s.ok()) {
+      io_s = GarbageCollect();
+    }
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  } else {  // Load data from storage
+    // abs_path_to_size: maps absolute paths of files in backup directory to
+    // their corresponding sizes
+    std::unordered_map<std::string, uint64_t> abs_path_to_size;
+    // Insert files and their sizes in backup sub-directories (shared and
+    // shared_checksum) to abs_path_to_size
+    for (const auto& rel_dir :
+         {GetSharedFileRel(), GetSharedFileWithChecksumRel()}) {
+      const auto abs_dir = GetAbsolutePath(rel_dir);
+      IOStatus io_s =
+          ReadChildFileCurrentSizes(abs_dir, backup_fs_, &abs_path_to_size);
+      if (!io_s.ok()) {
+        // I/O error likely impacting all backups
+        return io_s;
+      }
+    }
+    // load the backups if any, until valid_backups_to_open of the latest
+    // non-corrupted backups have been successfully opened.
+    int valid_backups_to_open = options_.max_valid_backups_to_open;
+    for (auto backup_iter = backups_.rbegin(); backup_iter != backups_.rend();
+         ++backup_iter) {
+      assert(latest_backup_id_ == 0 || latest_backup_id_ > backup_iter->first);
+      if (latest_backup_id_ == 0) {
+        latest_backup_id_ = backup_iter->first;
+      }
+      if (valid_backups_to_open == 0) {
+        break;
+      }
+
+      // Insert files and their sizes in backup sub-directories
+      // (private/backup_id) to abs_path_to_size
+      IOStatus io_s = ReadChildFileCurrentSizes(
+          GetAbsolutePath(GetPrivateFileRel(backup_iter->first)), backup_fs_,
+          &abs_path_to_size);
+      if (io_s.ok()) {
+        io_s = backup_iter->second->LoadFromFile(
+            options_.backup_dir, abs_path_to_size,
+            options_.backup_rate_limiter.get(), options_.info_log,
+            &reported_ignored_fields_);
+      }
+      if (io_s.IsCorruption() || io_s.IsNotSupported()) {
+        ROCKS_LOG_INFO(options_.info_log, "Backup %u corrupted -- %s",
+                       backup_iter->first, io_s.ToString().c_str());
+        corrupt_backups_.insert(std::make_pair(
+            backup_iter->first,
+            std::make_pair(io_s, std::move(backup_iter->second))));
+      } else if (!io_s.ok()) {
+        // Distinguish corruption errors from errors in the backup Env.
+        // Errors in the backup Env (i.e., this code path) will cause Open() to
+        // fail, whereas corruption errors would not cause Open() failures.
+        return io_s;
+      } else {
+        ROCKS_LOG_INFO(options_.info_log, "Loading backup %" PRIu32 " OK:\n%s",
+                       backup_iter->first,
+                       backup_iter->second->GetInfoString().c_str());
+        assert(latest_valid_backup_id_ == 0 ||
+               latest_valid_backup_id_ > backup_iter->first);
+        if (latest_valid_backup_id_ == 0) {
+          latest_valid_backup_id_ = backup_iter->first;
+        }
+        --valid_backups_to_open;
+      }
+    }
+
+    for (const auto& corrupt : corrupt_backups_) {
+      backups_.erase(backups_.find(corrupt.first));
+    }
+    // erase the backups before max_valid_backups_to_open
+    int num_unopened_backups;
+    if (options_.max_valid_backups_to_open == 0) {
+      num_unopened_backups = 0;
+    } else {
+      num_unopened_backups =
+          std::max(0, static_cast<int>(backups_.size()) -
+                          options_.max_valid_backups_to_open);
+    }
+    for (int i = 0; i < num_unopened_backups; ++i) {
+      assert(backups_.begin()->second->Empty());
+      backups_.erase(backups_.begin());
+    }
+  }
+
+  ROCKS_LOG_INFO(options_.info_log, "Latest backup is %u", latest_backup_id_);
+  ROCKS_LOG_INFO(options_.info_log, "Latest valid backup is %u",
+                 latest_valid_backup_id_);
+
+  // set up threads perform copies from files_to_copy_or_create_ in the
+  // background
+  threads_cpu_priority_ = CpuPriority::kNormal;
+  threads_.reserve(options_.max_background_operations);
+  for (int t = 0; t < options_.max_background_operations; t++) {
+    threads_.emplace_back([this]() {
+#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+      pthread_setname_np(pthread_self(), "backup_engine");
+#endif
+#endif
+      CpuPriority current_priority = CpuPriority::kNormal;
+      CopyOrCreateWorkItem work_item;
+      uint64_t bytes_toward_next_callback = 0;
+      while (files_to_copy_or_create_.read(work_item)) {
+        CpuPriority priority = threads_cpu_priority_;
+        if (current_priority != priority) {
+          TEST_SYNC_POINT_CALLBACK(
+              "BackupEngineImpl::Initialize:SetCpuPriority", &priority);
+          port::SetCpuPriority(0, priority);
+          current_priority = priority;
+        }
+        // `bytes_read` and `bytes_written` stats are enabled based on
+        // compile-time support and cannot be dynamically toggled. So we do not
+        // need to worry about `PerfLevel` here, unlike many other
+        // `IOStatsContext` / `PerfContext` stats.
+        uint64_t prev_bytes_read = IOSTATS(bytes_read);
+        uint64_t prev_bytes_written = IOSTATS(bytes_written);
+
+        CopyOrCreateResult result;
+        Temperature temp = work_item.src_temperature;
+        result.io_status = CopyOrCreateFile(
+            work_item.src_path, work_item.dst_path, work_item.contents,
+            work_item.size_limit, work_item.src_env, work_item.dst_env,
+            work_item.src_env_options, work_item.sync, work_item.rate_limiter,
+            work_item.progress_callback, &temp, work_item.dst_temperature,
+            &bytes_toward_next_callback, &result.size, &result.checksum_hex);
+
+        RecordTick(work_item.stats, BACKUP_READ_BYTES,
+                   IOSTATS(bytes_read) - prev_bytes_read);
+        RecordTick(work_item.stats, BACKUP_WRITE_BYTES,
+                   IOSTATS(bytes_written) - prev_bytes_written);
+
+        result.db_id = work_item.db_id;
+        result.db_session_id = work_item.db_session_id;
+        result.expected_src_temperature = work_item.src_temperature;
+        result.current_src_temperature = temp;
+        if (result.io_status.ok() && !work_item.src_checksum_hex.empty()) {
+          // unknown checksum function name implies no db table file checksum in
+          // db manifest; work_item.src_checksum_hex not empty means
+          // backup engine has calculated its crc32c checksum for the table
+          // file; therefore, we are able to compare the checksums.
+          if (work_item.src_checksum_func_name ==
+                  kUnknownFileChecksumFuncName ||
+              work_item.src_checksum_func_name == kDbFileChecksumFuncName) {
+            if (work_item.src_checksum_hex != result.checksum_hex) {
+              std::string checksum_info(
+                  "Expected checksum is " + work_item.src_checksum_hex +
+                  " while computed checksum is " + result.checksum_hex);
+              result.io_status = IOStatus::Corruption(
+                  "Checksum mismatch after copying to " + work_item.dst_path +
+                  ": " + checksum_info);
+            }
+          } else {
+            // FIXME(peterd): dead code?
+            std::string checksum_function_info(
+                "Existing checksum function is " +
+                work_item.src_checksum_func_name +
+                " while provided checksum function is " +
+                kBackupFileChecksumFuncName);
+            ROCKS_LOG_INFO(
+                options_.info_log,
+                "Unable to verify checksum after copying to %s: %s\n",
+                work_item.dst_path.c_str(), checksum_function_info.c_str());
+          }
+        }
+        work_item.result.set_value(std::move(result));
+      }
+    });
+  }
+  ROCKS_LOG_INFO(options_.info_log, "Initialized BackupEngine");
+  return IOStatus::OK();
+}
+
+IOStatus BackupEngineImpl::CreateNewBackupWithMetadata(
+    const CreateBackupOptions& options, DB* db, const std::string& app_metadata,
+    BackupID* new_backup_id_ptr) {
+  assert(initialized_);
+  assert(!read_only_);
+  if (app_metadata.size() > kMaxAppMetaSize) {
+    return IOStatus::InvalidArgument("App metadata too large");
+  }
+
+  if (options.decrease_background_thread_cpu_priority) {
+    if (options.background_thread_cpu_priority < threads_cpu_priority_) {
+      threads_cpu_priority_.store(options.background_thread_cpu_priority);
+    }
+  }
+
+  BackupID new_backup_id = latest_backup_id_ + 1;
+
+  // `bytes_read` and `bytes_written` stats are enabled based on compile-time
+  // support and cannot be dynamically toggled. So we do not need to worry about
+  // `PerfLevel` here, unlike many other `IOStatsContext` / `PerfContext` stats.
+  uint64_t prev_bytes_read = IOSTATS(bytes_read);
+  uint64_t prev_bytes_written = IOSTATS(bytes_written);
+
+  assert(backups_.find(new_backup_id) == backups_.end());
+
+  auto private_dir = GetAbsolutePath(GetPrivateFileRel(new_backup_id));
+  IOStatus io_s = backup_fs_->FileExists(private_dir, io_options_, nullptr);
+  if (io_s.ok()) {
+    // maybe last backup failed and left partial state behind, clean it up.
+    // need to do this before updating backups_ such that a private dir
+    // named after new_backup_id will be cleaned up.
+    // (If an incomplete new backup is followed by an incomplete delete
+    // of the latest full backup, then there could be more than one next
+    // id with a private dir, the last thing to be deleted in delete
+    // backup, but all will be cleaned up with a GarbageCollect.)
+    io_s = GarbageCollect();
+  } else if (io_s.IsNotFound()) {
+    // normal case, the new backup's private dir doesn't exist yet
+    io_s = IOStatus::OK();
+  }
+
+  auto ret = backups_.insert(std::make_pair(
+      new_backup_id, std::unique_ptr<BackupMeta>(new BackupMeta(
+                         GetBackupMetaFile(new_backup_id, false /* tmp */),
+                         GetBackupMetaFile(new_backup_id, true /* tmp */),
+                         &backuped_file_infos_, backup_env_, backup_fs_))));
+  assert(ret.second == true);
+  auto& new_backup = ret.first->second;
+  new_backup->RecordTimestamp();
+  new_backup->SetAppMetadata(app_metadata);
+
+  auto start_backup = backup_env_->NowMicros();
+
+  ROCKS_LOG_INFO(options_.info_log,
+                 "Started the backup process -- creating backup %u",
+                 new_backup_id);
+
+  if (options_.share_table_files && !options_.share_files_with_checksum) {
+    ROCKS_LOG_WARN(options_.info_log,
+                   "BackupEngineOptions::share_files_with_checksum=false is "
+                   "DEPRECATED and could lead to data loss.");
+  }
+
+  if (io_s.ok()) {
+    io_s = backup_fs_->CreateDir(private_dir, io_options_, nullptr);
+  }
+
+  // A set into which we will insert the dst_paths that are calculated for live
+  // files and live WAL files.
+  // This is used to check whether a live files shares a dst_path with another
+  // live file.
+  std::unordered_set<std::string> live_dst_paths;
+
+  std::vector<BackupAfterCopyOrCreateWorkItem> backup_items_to_finish;
+  // Add a CopyOrCreateWorkItem to the channel for each live file
+  Status disabled = db->DisableFileDeletions();
+  DBOptions db_options = db->GetDBOptions();
+  Statistics* stats = db_options.statistics.get();
+  if (io_s.ok()) {
+    CheckpointImpl checkpoint(db);
+    uint64_t sequence_number = 0;
+    FileChecksumGenFactory* db_checksum_factory =
+        db_options.file_checksum_gen_factory.get();
+    const std::string kFileChecksumGenFactoryName =
+        "FileChecksumGenCrc32cFactory";
+    bool compare_checksum =
+        db_checksum_factory != nullptr &&
+                db_checksum_factory->Name() == kFileChecksumGenFactoryName
+            ? true
+            : false;
+    EnvOptions src_raw_env_options(db_options);
+    RateLimiter* rate_limiter = options_.backup_rate_limiter.get();
+    io_s = status_to_io_status(checkpoint.CreateCustomCheckpoint(
+        [&](const std::string& /*src_dirname*/, const std::string& /*fname*/,
+            FileType) {
+          // custom checkpoint will switch to calling copy_file_cb after it sees
+          // NotSupported returned from link_file_cb.
+          return IOStatus::NotSupported();
+        } /* link_file_cb */,
+        [&](const std::string& src_dirname, const std::string& fname,
+            uint64_t size_limit_bytes, FileType type,
+            const std::string& checksum_func_name,
+            const std::string& checksum_val,
+            const Temperature src_temperature) {
+          if (type == kWalFile && !options_.backup_log_files) {
+            return IOStatus::OK();
+          }
+          Log(options_.info_log, "add file for backup %s", fname.c_str());
+          uint64_t size_bytes = 0;
+          IOStatus io_st;
+          if (type == kTableFile || type == kBlobFile) {
+            io_st = db_fs_->GetFileSize(src_dirname + "/" + fname, io_options_,
+                                        &size_bytes, nullptr);
+            if (!io_st.ok()) {
+              Log(options_.info_log, "GetFileSize is failed: %s",
+                  io_st.ToString().c_str());
+              return io_st;
+            }
+          }
+          EnvOptions src_env_options;
+          switch (type) {
+            case kWalFile:
+              src_env_options =
+                  db_env_->OptimizeForLogRead(src_raw_env_options);
+              break;
+            case kTableFile:
+              src_env_options = db_env_->OptimizeForCompactionTableRead(
+                  src_raw_env_options, ImmutableDBOptions(db_options));
+              break;
+            case kDescriptorFile:
+              src_env_options =
+                  db_env_->OptimizeForManifestRead(src_raw_env_options);
+              break;
+            case kBlobFile:
+              src_env_options = db_env_->OptimizeForBlobFileRead(
+                  src_raw_env_options, ImmutableDBOptions(db_options));
+              break;
+            default:
+              // Other backed up files (like options file) are not read by live
+              // DB, so don't need to worry about avoiding mixing buffered and
+              // direct I/O. Just use plain defaults.
+              src_env_options = src_raw_env_options;
+              break;
+          }
+          io_st = AddBackupFileWorkItem(
+              live_dst_paths, backup_items_to_finish, new_backup_id,
+              options_.share_table_files &&
+                  (type == kTableFile || type == kBlobFile),
+              src_dirname, fname, src_env_options, rate_limiter, type,
+              size_bytes, db_options.statistics.get(), size_limit_bytes,
+              options_.share_files_with_checksum &&
+                  (type == kTableFile || type == kBlobFile),
+              options.progress_callback, "" /* contents */, checksum_func_name,
+              checksum_val, src_temperature);
+          return io_st;
+        } /* copy_file_cb */,
+        [&](const std::string& fname, const std::string& contents,
+            FileType type) {
+          Log(options_.info_log, "add file for backup %s", fname.c_str());
+          return AddBackupFileWorkItem(
+              live_dst_paths, backup_items_to_finish, new_backup_id,
+              false /* shared */, "" /* src_dir */, fname,
+              EnvOptions() /* src_env_options */, rate_limiter, type,
+              contents.size(), db_options.statistics.get(), 0 /* size_limit */,
+              false /* shared_checksum */, options.progress_callback, contents);
+        } /* create_file_cb */,
+        &sequence_number,
+        options.flush_before_backup ? 0 : std::numeric_limits<uint64_t>::max(),
+        compare_checksum));
+    if (io_s.ok()) {
+      new_backup->SetSequenceNumber(sequence_number);
+    }
+  }
+  ROCKS_LOG_INFO(options_.info_log, "add files for backup done, wait finish.");
+  IOStatus item_io_status;
+  for (auto& item : backup_items_to_finish) {
+    item.result.wait();
+    auto result = item.result.get();
+    item_io_status = result.io_status;
+    Temperature temp = result.expected_src_temperature;
+    if (result.current_src_temperature != Temperature::kUnknown &&
+        (temp == Temperature::kUnknown ||
+         options_.current_temperatures_override_manifest)) {
+      temp = result.current_src_temperature;
+    }
+    if (item_io_status.ok() && item.shared && item.needed_to_copy) {
+      item_io_status = item.backup_env->GetFileSystem()->RenameFile(
+          item.dst_path_tmp, item.dst_path, io_options_, nullptr);
+    }
+    if (item_io_status.ok()) {
+      item_io_status = new_backup.get()->AddFile(std::make_shared<FileInfo>(
+          item.dst_relative, result.size, result.checksum_hex, result.db_id,
+          result.db_session_id, temp));
+    }
+    if (!item_io_status.ok()) {
+      io_s = item_io_status;
+    }
+  }
+
+  // we copied all the files, enable file deletions
+  if (disabled.ok()) {  // If we successfully disabled file deletions
+    db->EnableFileDeletions(false).PermitUncheckedError();
+  }
+  auto backup_time = backup_env_->NowMicros() - start_backup;
+
+  if (io_s.ok()) {
+    // persist the backup metadata on the disk
+    io_s = new_backup->StoreToFile(options_.sync, options_.schema_version,
+                                   schema_test_options_.get());
+  }
+  if (io_s.ok() && options_.sync) {
+    std::unique_ptr<FSDirectory> backup_private_directory;
+    backup_fs_
+        ->NewDirectory(GetAbsolutePath(GetPrivateFileRel(new_backup_id, false)),
+                       io_options_, &backup_private_directory, nullptr)
+        .PermitUncheckedError();
+    if (backup_private_directory != nullptr) {
+      io_s = backup_private_directory->FsyncWithDirOptions(io_options_, nullptr,
+                                                           DirFsyncOptions());
+    }
+    if (io_s.ok() && private_directory_ != nullptr) {
+      io_s = private_directory_->FsyncWithDirOptions(io_options_, nullptr,
+                                                     DirFsyncOptions());
+    }
+    if (io_s.ok() && meta_directory_ != nullptr) {
+      io_s = meta_directory_->FsyncWithDirOptions(io_options_, nullptr,
+                                                  DirFsyncOptions());
+    }
+    if (io_s.ok() && shared_directory_ != nullptr) {
+      io_s = shared_directory_->FsyncWithDirOptions(io_options_, nullptr,
+                                                    DirFsyncOptions());
+    }
+    if (io_s.ok() && backup_directory_ != nullptr) {
+      io_s = backup_directory_->FsyncWithDirOptions(io_options_, nullptr,
+                                                    DirFsyncOptions());
+    }
+  }
+
+  if (io_s.ok()) {
+    backup_statistics_.IncrementNumberSuccessBackup();
+    // here we know that we succeeded and installed the new backup
+    latest_backup_id_ = new_backup_id;
+    latest_valid_backup_id_ = new_backup_id;
+    if (new_backup_id_ptr) {
+      *new_backup_id_ptr = new_backup_id;
+    }
+    ROCKS_LOG_INFO(options_.info_log, "Backup DONE. All is good");
+
+    // backup_speed is in byte/second
+    double backup_speed = new_backup->GetSize() / (1.048576 * backup_time);
+    ROCKS_LOG_INFO(options_.info_log, "Backup number of files: %u",
+                   new_backup->GetNumberFiles());
+    char human_size[16];
+    AppendHumanBytes(new_backup->GetSize(), human_size, sizeof(human_size));
+    ROCKS_LOG_INFO(options_.info_log, "Backup size: %s", human_size);
+    ROCKS_LOG_INFO(options_.info_log, "Backup time: %" PRIu64 " microseconds",
+                   backup_time);
+    ROCKS_LOG_INFO(options_.info_log, "Backup speed: %.3f MB/s", backup_speed);
+    ROCKS_LOG_INFO(options_.info_log, "Backup Statistics %s",
+                   backup_statistics_.ToString().c_str());
+  } else {
+    backup_statistics_.IncrementNumberFailBackup();
+    // clean all the files we might have created
+    ROCKS_LOG_INFO(options_.info_log, "Backup failed -- %s",
+                   io_s.ToString().c_str());
+    ROCKS_LOG_INFO(options_.info_log, "Backup Statistics %s\n",
+                   backup_statistics_.ToString().c_str());
+    // delete files that we might have already written
+    might_need_garbage_collect_ = true;
+    DeleteBackup(new_backup_id).PermitUncheckedError();
+  }
+
+  RecordTick(stats, BACKUP_READ_BYTES, IOSTATS(bytes_read) - prev_bytes_read);
+  RecordTick(stats, BACKUP_WRITE_BYTES,
+             IOSTATS(bytes_written) - prev_bytes_written);
+  return io_s;
+}
+
+IOStatus BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  assert(initialized_);
+  assert(!read_only_);
+
+  // Best effort deletion even with errors
+  IOStatus overall_status = IOStatus::OK();
+
+  ROCKS_LOG_INFO(options_.info_log, "Purging old backups, keeping %u",
+                 num_backups_to_keep);
+  std::vector<BackupID> to_delete;
+  auto itr = backups_.begin();
+  while ((backups_.size() - to_delete.size()) > num_backups_to_keep) {
+    to_delete.push_back(itr->first);
+    itr++;
+  }
+  for (auto backup_id : to_delete) {
+    // Do not GC until end
+    IOStatus io_s = DeleteBackupNoGC(backup_id);
+    if (!io_s.ok()) {
+      overall_status = io_s;
+    }
+  }
+  // Clean up after any incomplete backup deletion, potentially from
+  // earlier session.
+  if (might_need_garbage_collect_) {
+    IOStatus io_s = GarbageCollect();
+    if (!io_s.ok() && overall_status.ok()) {
+      overall_status = io_s;
+    }
+  }
+  return overall_status;
+}
+
+IOStatus BackupEngineImpl::DeleteBackup(BackupID backup_id) {
+  IOStatus s1 = DeleteBackupNoGC(backup_id);
+  IOStatus s2 = IOStatus::OK();
+
+  // Clean up after any incomplete backup deletion, potentially from
+  // earlier session.
+  if (might_need_garbage_collect_) {
+    s2 = GarbageCollect();
+  }
+
+  if (!s1.ok()) {
+    // Any failure in the primary objective trumps any failure in the
+    // secondary objective.
+    s2.PermitUncheckedError();
+    return s1;
+  } else {
+    return s2;
+  }
+}
+
+// Does not auto-GarbageCollect nor lock
+IOStatus BackupEngineImpl::DeleteBackupNoGC(BackupID backup_id) {
+  assert(initialized_);
+  assert(!read_only_);
+
+  ROCKS_LOG_INFO(options_.info_log, "Deleting backup %u", backup_id);
+  auto backup = backups_.find(backup_id);
+  if (backup != backups_.end()) {
+    IOStatus io_s = backup->second->Delete();
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    backups_.erase(backup);
+  } else {
+    auto corrupt = corrupt_backups_.find(backup_id);
+    if (corrupt == corrupt_backups_.end()) {
+      return IOStatus::NotFound("Backup not found");
+    }
+    IOStatus io_s = corrupt->second.second->Delete();
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    corrupt->second.first.PermitUncheckedError();
+    corrupt_backups_.erase(corrupt);
+  }
+
+  // After removing meta file, best effort deletion even with errors.
+  // (Don't delete other files if we can't delete the meta file right
+  // now.)
+  std::vector<std::string> to_delete;
+  for (auto& itr : backuped_file_infos_) {
+    if (itr.second->refs == 0) {
+      IOStatus io_s = backup_fs_->DeleteFile(GetAbsolutePath(itr.first),
+                                             io_options_, nullptr);
+      ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s", itr.first.c_str(),
+                     io_s.ToString().c_str());
+      to_delete.push_back(itr.first);
+      if (!io_s.ok()) {
+        // Trying again later might work
+        might_need_garbage_collect_ = true;
+      }
+    }
+  }
+  for (auto& td : to_delete) {
+    backuped_file_infos_.erase(td);
+  }
+
+  // take care of private dirs -- GarbageCollect() will take care of them
+  // if they are not empty
+  std::string private_dir = GetPrivateFileRel(backup_id);
+  IOStatus io_s =
+      backup_fs_->DeleteDir(GetAbsolutePath(private_dir), io_options_, nullptr);
+  ROCKS_LOG_INFO(options_.info_log, "Deleting private dir %s -- %s",
+                 private_dir.c_str(), io_s.ToString().c_str());
+  if (!io_s.ok()) {
+    // Full gc or trying again later might work
+    might_need_garbage_collect_ = true;
+  }
+  return IOStatus::OK();
+}
+
+void BackupEngineImpl::SetBackupInfoFromBackupMeta(
+    BackupID id, const BackupMeta& meta, BackupInfo* backup_info,
+    bool include_file_details) const {
+  *backup_info = BackupInfo(id, meta.GetTimestamp(), meta.GetSize(),
+                            meta.GetNumberFiles(), meta.GetAppMetadata());
+  std::string dir =
+      options_.backup_dir + "/" + kPrivateDirSlash + std::to_string(id);
+  if (include_file_details) {
+    auto& file_details = backup_info->file_details;
+    file_details.reserve(meta.GetFiles().size());
+    for (auto& file_ptr : meta.GetFiles()) {
+      BackupFileInfo& finfo = *file_details.emplace(file_details.end());
+      finfo.relative_filename = file_ptr->filename;
+      finfo.size = file_ptr->size;
+      finfo.directory = dir;
+      uint64_t number;
+      FileType type;
+      bool ok = ParseFileName(file_ptr->filename, &number, &type);
+      if (ok) {
+        finfo.file_number = number;
+        finfo.file_type = type;
+      }
+      // TODO: temperature, file_checksum, file_checksum_func_name
+    }
+    backup_info->name_for_open = GetAbsolutePath(GetPrivateFileRel(id));
+    backup_info->name_for_open.pop_back();  // remove trailing '/'
+    backup_info->env_for_open = meta.GetEnvForOpen();
+  }
+}
+
+Status BackupEngineImpl::GetBackupInfo(BackupID backup_id,
+                                       BackupInfo* backup_info,
+                                       bool include_file_details) const {
+  assert(initialized_);
+  if (backup_id == kLatestBackupIDMarker) {
+    // Note: Read latest_valid_backup_id_ inside of lock
+    backup_id = latest_valid_backup_id_;
+  }
+  auto corrupt_itr = corrupt_backups_.find(backup_id);
+  if (corrupt_itr != corrupt_backups_.end()) {
+    return Status::Corruption(corrupt_itr->second.first.ToString());
+  }
+  auto backup_itr = backups_.find(backup_id);
+  if (backup_itr == backups_.end()) {
+    return Status::NotFound("Backup not found");
+  }
+  auto& backup = backup_itr->second;
+  if (backup->Empty()) {
+    return Status::NotFound("Backup not found");
+  }
+
+  SetBackupInfoFromBackupMeta(backup_id, *backup, backup_info,
+                              include_file_details);
+  return Status::OK();
+}
+
+void BackupEngineImpl::GetBackupInfo(std::vector<BackupInfo>* backup_info,
+                                     bool include_file_details) const {
+  assert(initialized_);
+  backup_info->resize(backups_.size());
+  size_t i = 0;
+  for (auto& backup : backups_) {
+    const BackupMeta& meta = *backup.second;
+    if (!meta.Empty()) {
+      SetBackupInfoFromBackupMeta(backup.first, meta, &backup_info->at(i++),
+                                  include_file_details);
+    }
+  }
+}
+
+void BackupEngineImpl::GetCorruptedBackups(
+    std::vector<BackupID>* corrupt_backup_ids) const {
+  assert(initialized_);
+  corrupt_backup_ids->reserve(corrupt_backups_.size());
+  for (auto& backup : corrupt_backups_) {
+    corrupt_backup_ids->push_back(backup.first);
+  }
+}
+
+IOStatus BackupEngineImpl::RestoreDBFromBackup(
+    const RestoreOptions& options, BackupID backup_id,
+    const std::string& db_dir, const std::string& wal_dir) const {
+  assert(initialized_);
+  if (backup_id == kLatestBackupIDMarker) {
+    // Note: Read latest_valid_backup_id_ inside of lock
+    backup_id = latest_valid_backup_id_;
+  }
+  auto corrupt_itr = corrupt_backups_.find(backup_id);
+  if (corrupt_itr != corrupt_backups_.end()) {
+    return corrupt_itr->second.first;
+  }
+  auto backup_itr = backups_.find(backup_id);
+  if (backup_itr == backups_.end()) {
+    return IOStatus::NotFound("Backup not found");
+  }
+  auto& backup = backup_itr->second;
+  if (backup->Empty()) {
+    return IOStatus::NotFound("Backup not found");
+  }
+
+  ROCKS_LOG_INFO(options_.info_log, "Restoring backup id %u\n", backup_id);
+  ROCKS_LOG_INFO(options_.info_log, "keep_log_files: %d\n",
+                 static_cast<int>(options.keep_log_files));
+
+  // just in case. Ignore errors
+  db_fs_->CreateDirIfMissing(db_dir, io_options_, nullptr)
+      .PermitUncheckedError();
+  db_fs_->CreateDirIfMissing(wal_dir, io_options_, nullptr)
+      .PermitUncheckedError();
+
+  if (options.keep_log_files) {
+    // delete files in db_dir, but keep all the log files
+    DeleteChildren(db_dir, 1 << kWalFile);
+    // move all the files from archive dir to wal_dir
+    std::string archive_dir = ArchivalDirectory(wal_dir);
+    std::vector<std::string> archive_files;
+    db_fs_->GetChildren(archive_dir, io_options_, &archive_files, nullptr)
+        .PermitUncheckedError();  // ignore errors
+    for (const auto& f : archive_files) {
+      uint64_t number;
+      FileType type;
+      bool ok = ParseFileName(f, &number, &type);
+      if (ok && type == kWalFile) {
+        ROCKS_LOG_INFO(options_.info_log,
+                       "Moving log file from archive/ to wal_dir: %s",
+                       f.c_str());
+        IOStatus io_s = db_fs_->RenameFile(
+            archive_dir + "/" + f, wal_dir + "/" + f, io_options_, nullptr);
+        if (!io_s.ok()) {
+          // if we can't move log file from archive_dir to wal_dir,
+          // we should fail, since it might mean data loss
+          return io_s;
+        }
+      }
+    }
+  } else {
+    DeleteChildren(wal_dir);
+    DeleteChildren(ArchivalDirectory(wal_dir));
+    DeleteChildren(db_dir);
+  }
+
+  IOStatus io_s;
+  std::vector<RestoreAfterCopyOrCreateWorkItem> restore_items_to_finish;
+  std::string temporary_current_file;
+  std::string final_current_file;
+  std::unique_ptr<FSDirectory> db_dir_for_fsync;
+  std::unique_ptr<FSDirectory> wal_dir_for_fsync;
+
+  for (const auto& file_info : backup->GetFiles()) {
+    const std::string& file = file_info->filename;
+    // 1. get DB filename
+    std::string dst = file_info->GetDbFileName();
+
+    // 2. find the filetype
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(dst, &number, &type);
+    if (!ok) {
+      return IOStatus::Corruption("Backup corrupted: Fail to parse filename " +
+                                  dst);
+    }
+    // 3. Construct the final path
+    // kWalFile lives in wal_dir and all the rest live in db_dir
+    if (type == kWalFile) {
+      dst = wal_dir + "/" + dst;
+      if (options_.sync && !wal_dir_for_fsync) {
+        io_s = db_fs_->NewDirectory(wal_dir, io_options_, &wal_dir_for_fsync,
+                                    nullptr);
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      }
+    } else {
+      dst = db_dir + "/" + dst;
+      if (options_.sync && !db_dir_for_fsync) {
+        io_s = db_fs_->NewDirectory(db_dir, io_options_, &db_dir_for_fsync,
+                                    nullptr);
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      }
+    }
+    // For atomicity, initially restore CURRENT file to a temporary name.
+    // This is useful even without options_.sync e.g. in case the restore
+    // process is interrupted.
+    if (type == kCurrentFile) {
+      final_current_file = dst;
+      dst = temporary_current_file = dst + ".tmp";
+    }
+
+    ROCKS_LOG_INFO(options_.info_log, "Restoring %s to %s\n", file.c_str(),
+                   dst.c_str());
+    CopyOrCreateWorkItem copy_or_create_work_item(
+        GetAbsolutePath(file), dst, Temperature::kUnknown /* src_temp */,
+        file_info->temp, "" /* contents */, backup_env_, db_env_,
+        EnvOptions() /* src_env_options */, options_.sync,
+        options_.restore_rate_limiter.get(), file_info->size,
+        nullptr /* stats */);
+    RestoreAfterCopyOrCreateWorkItem after_copy_or_create_work_item(
+        copy_or_create_work_item.result.get_future(), file, dst,
+        file_info->checksum_hex);
+    files_to_copy_or_create_.write(std::move(copy_or_create_work_item));
+    restore_items_to_finish.push_back(
+        std::move(after_copy_or_create_work_item));
+  }
+  IOStatus item_io_status;
+  for (auto& item : restore_items_to_finish) {
+    item.result.wait();
+    auto result = item.result.get();
+    item_io_status = result.io_status;
+    // Note: It is possible that both of the following bad-status cases occur
+    // during copying. But, we only return one status.
+    if (!item_io_status.ok()) {
+      io_s = item_io_status;
+      break;
+    } else if (!item.checksum_hex.empty() &&
+               item.checksum_hex != result.checksum_hex) {
+      io_s = IOStatus::Corruption(
+          "While restoring " + item.from_file + " -> " + item.to_file +
+          ": expected checksum is " + item.checksum_hex +
+          " while computed checksum is " + result.checksum_hex);
+      break;
+    }
+  }
+
+  // When enabled, the first FsyncWithDirOptions is to ensure all files are
+  // fully persisted before renaming CURRENT.tmp
+  if (io_s.ok() && db_dir_for_fsync) {
+    ROCKS_LOG_INFO(options_.info_log, "Restore: fsync\n");
+    io_s = db_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr,
+                                                 DirFsyncOptions());
+  }
+
+  if (io_s.ok() && wal_dir_for_fsync) {
+    io_s = wal_dir_for_fsync->FsyncWithDirOptions(io_options_, nullptr,
+                                                  DirFsyncOptions());
+  }
+
+  if (io_s.ok() && !temporary_current_file.empty()) {
+    ROCKS_LOG_INFO(options_.info_log, "Restore: atomic rename CURRENT.tmp\n");
+    assert(!final_current_file.empty());
+    io_s = db_fs_->RenameFile(temporary_current_file, final_current_file,
+                              io_options_, nullptr);
+  }
+
+  if (io_s.ok() && db_dir_for_fsync && !temporary_current_file.empty()) {
+    // Second FsyncWithDirOptions is to ensure the final atomic rename of DB
+    // restore is fully persisted even if power goes out right after restore
+    // operation returns success
+    assert(db_dir_for_fsync);
+    io_s = db_dir_for_fsync->FsyncWithDirOptions(
+        io_options_, nullptr, DirFsyncOptions(final_current_file));
+  }
+
+  ROCKS_LOG_INFO(options_.info_log, "Restoring done -- %s\n",
+                 io_s.ToString().c_str());
+  return io_s;
+}
+
+IOStatus BackupEngineImpl::VerifyBackup(BackupID backup_id,
+                                        bool verify_with_checksum) const {
+  assert(initialized_);
+  // Check if backup_id is corrupted, or valid and registered
+  auto corrupt_itr = corrupt_backups_.find(backup_id);
+  if (corrupt_itr != corrupt_backups_.end()) {
+    return corrupt_itr->second.first;
+  }
+
+  auto backup_itr = backups_.find(backup_id);
+  if (backup_itr == backups_.end()) {
+    return IOStatus::NotFound();
+  }
+
+  auto& backup = backup_itr->second;
+  if (backup->Empty()) {
+    return IOStatus::NotFound();
+  }
+
+  ROCKS_LOG_INFO(options_.info_log, "Verifying backup id %u\n", backup_id);
+
+  // Find all existing backup files belong to backup_id
+  std::unordered_map<std::string, uint64_t> curr_abs_path_to_size;
+  for (const auto& rel_dir : {GetPrivateFileRel(backup_id), GetSharedFileRel(),
+                              GetSharedFileWithChecksumRel()}) {
+    const auto abs_dir = GetAbsolutePath(rel_dir);
+    // Shared directories allowed to be missing in some cases. Expected but
+    // missing files will be reported a few lines down.
+    ReadChildFileCurrentSizes(abs_dir, backup_fs_, &curr_abs_path_to_size)
+        .PermitUncheckedError();
+  }
+
+  // For all files registered in backup
+  for (const auto& file_info : backup->GetFiles()) {
+    const auto abs_path = GetAbsolutePath(file_info->filename);
+    // check existence of the file
+    if (curr_abs_path_to_size.find(abs_path) == curr_abs_path_to_size.end()) {
+      return IOStatus::NotFound("File missing: " + abs_path);
+    }
+    // verify file size
+    if (file_info->size != curr_abs_path_to_size[abs_path]) {
+      std::string size_info("Expected file size is " +
+                            std::to_string(file_info->size) +
+                            " while found file size is " +
+                            std::to_string(curr_abs_path_to_size[abs_path]));
+      return IOStatus::Corruption("File corrupted: File size mismatch for " +
+                                  abs_path + ": " + size_info);
+    }
+    if (verify_with_checksum && !file_info->checksum_hex.empty()) {
+      // verify file checksum
+      std::string checksum_hex;
+      ROCKS_LOG_INFO(options_.info_log, "Verifying %s checksum...\n",
+                     abs_path.c_str());
+      IOStatus io_s = ReadFileAndComputeChecksum(
+          abs_path, backup_fs_, EnvOptions(), 0 /* size_limit */, &checksum_hex,
+          Temperature::kUnknown);
+      if (!io_s.ok()) {
+        return io_s;
+      } else if (file_info->checksum_hex != checksum_hex) {
+        std::string checksum_info(
+            "Expected checksum is " + file_info->checksum_hex +
+            " while computed checksum is " + checksum_hex);
+        return IOStatus::Corruption("File corrupted: Checksum mismatch for " +
+                                    abs_path + ": " + checksum_info);
+      }
+    }
+  }
+  return IOStatus::OK();
+}
+
+IOStatus BackupEngineImpl::CopyOrCreateFile(
+    const std::string& src, const std::string& dst, const std::string& contents,
+    uint64_t size_limit, Env* src_env, Env* dst_env,
+    const EnvOptions& src_env_options, bool sync, RateLimiter* rate_limiter,
+    std::function<void()> progress_callback, Temperature* src_temperature,
+    Temperature dst_temperature, uint64_t* bytes_toward_next_callback,
+    uint64_t* size, std::string* checksum_hex) {
+  assert(src.empty() != contents.empty());
+  IOStatus io_s;
+  std::unique_ptr<FSWritableFile> dst_file;
+  std::unique_ptr<FSSequentialFile> src_file;
+  FileOptions dst_file_options;
+  dst_file_options.use_mmap_writes = false;
+  dst_file_options.temperature = dst_temperature;
+  // TODO:(gzh) maybe use direct reads/writes here if possible
+  if (size != nullptr) {
+    *size = 0;
+  }
+  uint32_t checksum_value = 0;
+
+  // Check if size limit is set. if not, set it to very big number
+  if (size_limit == 0) {
+    size_limit = std::numeric_limits<uint64_t>::max();
+  }
+
+  io_s = dst_env->GetFileSystem()->NewWritableFile(dst, dst_file_options,
+                                                   &dst_file, nullptr);
+  if (io_s.ok() && !src.empty()) {
+    auto src_file_options = FileOptions(src_env_options);
+    src_file_options.temperature = *src_temperature;
+    io_s = src_env->GetFileSystem()->NewSequentialFile(src, src_file_options,
+                                                       &src_file, nullptr);
+  }
+  if (io_s.IsPathNotFound() && *src_temperature != Temperature::kUnknown) {
+    // Retry without temperature hint in case the FileSystem is strict with
+    // non-kUnknown temperature option
+    io_s = src_env->GetFileSystem()->NewSequentialFile(
+        src, FileOptions(src_env_options), &src_file, nullptr);
+  }
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  size_t buf_size =
+      rate_limiter ? static_cast<size_t>(rate_limiter->GetSingleBurstBytes())
+                   : kDefaultCopyFileBufferSize;
+
+  std::unique_ptr<WritableFileWriter> dest_writer(
+      new WritableFileWriter(std::move(dst_file), dst, dst_file_options));
+  std::unique_ptr<SequentialFileReader> src_reader;
+  std::unique_ptr<char[]> buf;
+  if (!src.empty()) {
+    // Return back current temperature in FileSystem
+    *src_temperature = src_file->GetTemperature();
+
+    src_reader.reset(new SequentialFileReader(
+        std::move(src_file), src, nullptr /* io_tracer */, {}, rate_limiter));
+    buf.reset(new char[buf_size]);
+  }
+
+  Slice data;
+  do {
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return status_to_io_status(Status::Incomplete("Backup stopped"));
+    }
+    if (!src.empty()) {
+      size_t buffer_to_read =
+          (buf_size < size_limit) ? buf_size : static_cast<size_t>(size_limit);
+      io_s = src_reader->Read(buffer_to_read, &data, buf.get(),
+                              Env::IO_LOW /* rate_limiter_priority */);
+      *bytes_toward_next_callback += data.size();
+    } else {
+      data = contents;
+    }
+    size_limit -= data.size();
+    TEST_SYNC_POINT_CALLBACK(
+        "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup",
+        (src.length() > 4 && src.rfind(".sst") == src.length() - 4) ? &data
+                                                                    : nullptr);
+
+    if (!io_s.ok()) {
+      return io_s;
+    }
+
+    if (size != nullptr) {
+      *size += data.size();
+    }
+    if (checksum_hex != nullptr) {
+      checksum_value = crc32c::Extend(checksum_value, data.data(), data.size());
+    }
+    io_s = dest_writer->Append(data);
+
+    if (rate_limiter != nullptr) {
+      if (!src.empty()) {
+        rate_limiter->Request(data.size(), Env::IO_LOW, nullptr /* stats */,
+                              RateLimiter::OpType::kWrite);
+      } else {
+        LoopRateLimitRequestHelper(data.size(), rate_limiter, Env::IO_LOW,
+                                   nullptr /* stats */,
+                                   RateLimiter::OpType::kWrite);
+      }
+    }
+    while (*bytes_toward_next_callback >=
+           options_.callback_trigger_interval_size) {
+      *bytes_toward_next_callback -= options_.callback_trigger_interval_size;
+      std::lock_guard<std::mutex> lock(byte_report_mutex_);
+      progress_callback();
+    }
+  } while (io_s.ok() && contents.empty() && data.size() > 0 && size_limit > 0);
+
+  // Convert uint32_t checksum to hex checksum
+  if (checksum_hex != nullptr) {
+    checksum_hex->assign(ChecksumInt32ToHex(checksum_value));
+  }
+
+  if (io_s.ok() && sync) {
+    io_s = dest_writer->Sync(false);
+  }
+  if (io_s.ok()) {
+    io_s = dest_writer->Close();
+  }
+  return io_s;
+}
+
+// fname will always start with "/"
+IOStatus BackupEngineImpl::AddBackupFileWorkItem(
+    std::unordered_set<std::string>& live_dst_paths,
+    std::vector<BackupAfterCopyOrCreateWorkItem>& backup_items_to_finish,
+    BackupID backup_id, bool shared, const std::string& src_dir,
+    const std::string& fname, const EnvOptions& src_env_options,
+    RateLimiter* rate_limiter, FileType file_type, uint64_t size_bytes,
+    Statistics* stats, uint64_t size_limit, bool shared_checksum,
+    std::function<void()> progress_callback, const std::string& contents,
+    const std::string& src_checksum_func_name,
+    const std::string& src_checksum_str, const Temperature src_temperature) {
+  assert(contents.empty() != src_dir.empty());
+
+  std::string src_path = src_dir + "/" + fname;
+  std::string dst_relative;
+  std::string dst_relative_tmp;
+  std::string db_id;
+  std::string db_session_id;
+  // crc32c checksum in hex. empty == unavailable / unknown
+  std::string checksum_hex;
+
+  // Whenever a default checksum function name is passed in, we will compares
+  // the corresponding checksum values after copying. Note that only table and
+  // blob files may have a known checksum function name passed in.
+  //
+  // If no default checksum function name is passed in and db session id is not
+  // available, we will calculate the checksum *before* copying in two cases
+  // (we always calcuate checksums when copying or creating for any file types):
+  // a) share_files_with_checksum is true and file type is table;
+  // b) share_table_files is true and the file exists already.
+  //
+  // Step 0: Check if default checksum function name is passed in
+  if (kDbFileChecksumFuncName == src_checksum_func_name) {
+    if (src_checksum_str == kUnknownFileChecksum) {
+      return status_to_io_status(
+          Status::Aborted("Unknown checksum value for " + fname));
+    }
+    checksum_hex = ChecksumStrToHex(src_checksum_str);
+  }
+
+  // Step 1: Prepare the relative path to destination
+  if (shared && shared_checksum) {
+    if (GetNamingNoFlags() != BackupEngineOptions::kLegacyCrc32cAndFileSize &&
+        file_type != kBlobFile) {
+      // Prepare db_session_id to add to the file name
+      // Ignore the returned status
+      // In the failed cases, db_id and db_session_id will be empty
+      GetFileDbIdentities(db_env_, src_env_options, src_path, src_temperature,
+                          rate_limiter, &db_id, &db_session_id)
+          .PermitUncheckedError();
+    }
+    // Calculate checksum if checksum and db session id are not available.
+    // If db session id is available, we will not calculate the checksum
+    // since the session id should suffice to avoid file name collision in
+    // the shared_checksum directory.
+    if (checksum_hex.empty() && db_session_id.empty()) {
+      IOStatus io_s = ReadFileAndComputeChecksum(
+          src_path, db_fs_, src_env_options, size_limit, &checksum_hex,
+          src_temperature);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+    }
+    if (size_bytes == std::numeric_limits<uint64_t>::max()) {
+      return IOStatus::NotFound("File missing: " + src_path);
+    }
+    // dst_relative depends on the following conditions:
+    // 1) the naming scheme is kUseDbSessionId,
+    // 2) db_session_id is not empty,
+    // 3) checksum is available in the DB manifest.
+    // If 1,2,3) are satisfied, then dst_relative will be of the form:
+    // shared_checksum/<file_number>_<checksum>_<db_session_id>.sst
+    // If 1,2) are satisfied, then dst_relative will be of the form:
+    // shared_checksum/<file_number>_<db_session_id>.sst
+    // Otherwise, dst_relative is of the form
+    // shared_checksum/<file_number>_<checksum>_<size>.sst
+    //
+    // For blob files, db_session_id is not supported with the blob file format.
+    // It uses original/legacy naming scheme.
+    // dst_relative will be of the form:
+    // shared_checksum/<file_number>_<checksum>_<size>.blob
+    dst_relative = GetSharedFileWithChecksum(fname, checksum_hex, size_bytes,
+                                             db_session_id);
+    dst_relative_tmp = GetSharedFileWithChecksumRel(dst_relative, true);
+    dst_relative = GetSharedFileWithChecksumRel(dst_relative, false);
+  } else if (shared) {
+    dst_relative_tmp = GetSharedFileRel(fname, true);
+    dst_relative = GetSharedFileRel(fname, false);
+  } else {
+    dst_relative = GetPrivateFileRel(backup_id, false, fname);
+  }
+
+  // We copy into `temp_dest_path` and, once finished, rename it to
+  // `final_dest_path`. This allows files to atomically appear at
+  // `final_dest_path`. We can copy directly to the final path when atomicity
+  // is unnecessary, like for files in private backup directories.
+  const std::string* copy_dest_path;
+  std::string temp_dest_path;
+  std::string final_dest_path = GetAbsolutePath(dst_relative);
+  if (!dst_relative_tmp.empty()) {
+    temp_dest_path = GetAbsolutePath(dst_relative_tmp);
+    copy_dest_path = &temp_dest_path;
+  } else {
+    copy_dest_path = &final_dest_path;
+  }
+
+  // Step 2: Determine whether to copy or not
+  // if it's shared, we also need to check if it exists -- if it does, no need
+  // to copy it again.
+  bool need_to_copy = true;
+  // true if final_dest_path is the same path as another live file
+  const bool same_path =
+      live_dst_paths.find(final_dest_path) != live_dst_paths.end();
+
+  bool file_exists = false;
+  if (shared && !same_path) {
+    // Should be in shared directory but not a live path, check existence in
+    // shared directory
+    IOStatus exist =
+        backup_fs_->FileExists(final_dest_path, io_options_, nullptr);
+    if (exist.ok()) {
+      file_exists = true;
+    } else if (exist.IsNotFound()) {
+      file_exists = false;
+    } else {
+      return exist;
+    }
+  }
+
+  if (!contents.empty()) {
+    need_to_copy = false;
+  } else if (shared && (same_path || file_exists)) {
+    need_to_copy = false;
+    auto find_result = backuped_file_infos_.find(dst_relative);
+    if (find_result == backuped_file_infos_.end() && !same_path) {
+      // file exists but not referenced
+      ROCKS_LOG_INFO(
+          options_.info_log,
+          "%s already present, but not referenced by any backup. We will "
+          "overwrite the file.",
+          fname.c_str());
+      need_to_copy = true;
+      // Defer any failure reporting to when we try to write the file
+      backup_fs_->DeleteFile(final_dest_path, io_options_, nullptr)
+          .PermitUncheckedError();
+    } else {
+      // file exists and referenced
+      if (checksum_hex.empty()) {
+        // same_path should not happen for a standard DB, so OK to
+        // read file contents to check for checksum mismatch between
+        // two files from same DB getting same name.
+        // For compatibility with future meta file that might not have
+        // crc32c checksum available, consider it might be empty, but
+        // we don't currently generate meta file without crc32c checksum.
+        // Therefore we have to read & compute it if we don't have it.
+        if (!same_path && !find_result->second->checksum_hex.empty()) {
+          assert(find_result != backuped_file_infos_.end());
+          // Note: to save I/O on incremental backups, we copy prior known
+          // checksum of the file instead of reading entire file contents
+          // to recompute it.
+          checksum_hex = find_result->second->checksum_hex;
+          // Regarding corruption detection, consider:
+          // (a) the DB file is corrupt (since previous backup) and the backup
+          // file is OK: we failed to detect, but the backup is safe. DB can
+          // be repaired/restored once its corruption is detected.
+          // (b) the backup file is corrupt (since previous backup) and the
+          // db file is OK: we failed to detect, but the backup is corrupt.
+          // CreateNewBackup should support fast incremental backups and
+          // there's no way to support that without reading all the files.
+          // We might add an option for extra checks on incremental backup,
+          // but until then, use VerifyBackups to check existing backup data.
+          // (c) file name collision with legitimately different content.
+          // This is almost inconceivable with a well-generated DB session
+          // ID, but even in that case, we double check the file sizes in
+          // BackupMeta::AddFile.
+        } else {
+          IOStatus io_s = ReadFileAndComputeChecksum(
+              src_path, db_fs_, src_env_options, size_limit, &checksum_hex,
+              src_temperature);
+          if (!io_s.ok()) {
+            return io_s;
+          }
+        }
+      }
+      if (!db_session_id.empty()) {
+        ROCKS_LOG_INFO(options_.info_log,
+                       "%s already present, with checksum %s, size %" PRIu64
+                       " and DB session identity %s",
+                       fname.c_str(), checksum_hex.c_str(), size_bytes,
+                       db_session_id.c_str());
+      } else {
+        ROCKS_LOG_INFO(options_.info_log,
+                       "%s already present, with checksum %s and size %" PRIu64,
+                       fname.c_str(), checksum_hex.c_str(), size_bytes);
+      }
+    }
+  }
+  live_dst_paths.insert(final_dest_path);
+
+  // Step 3: Add work item
+  if (!contents.empty() || need_to_copy) {
+    ROCKS_LOG_INFO(options_.info_log, "Copying %s to %s", fname.c_str(),
+                   copy_dest_path->c_str());
+    CopyOrCreateWorkItem copy_or_create_work_item(
+        src_dir.empty() ? "" : src_path, *copy_dest_path, src_temperature,
+        Temperature::kUnknown /*dst_temp*/, contents, db_env_, backup_env_,
+        src_env_options, options_.sync, rate_limiter, size_limit, stats,
+        progress_callback, src_checksum_func_name, checksum_hex, db_id,
+        db_session_id);
+    BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item(
+        copy_or_create_work_item.result.get_future(), shared, need_to_copy,
+        backup_env_, temp_dest_path, final_dest_path, dst_relative);
+    files_to_copy_or_create_.write(std::move(copy_or_create_work_item));
+    backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item));
+  } else {
+    std::promise<CopyOrCreateResult> promise_result;
+    BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item(
+        promise_result.get_future(), shared, need_to_copy, backup_env_,
+        temp_dest_path, final_dest_path, dst_relative);
+    backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item));
+    CopyOrCreateResult result;
+    result.io_status = IOStatus::OK();
+    result.size = size_bytes;
+    result.checksum_hex = std::move(checksum_hex);
+    result.db_id = std::move(db_id);
+    result.db_session_id = std::move(db_session_id);
+    promise_result.set_value(std::move(result));
+  }
+  return IOStatus::OK();
+}
+
+IOStatus BackupEngineImpl::ReadFileAndComputeChecksum(
+    const std::string& src, const std::shared_ptr<FileSystem>& src_fs,
+    const EnvOptions& src_env_options, uint64_t size_limit,
+    std::string* checksum_hex, const Temperature src_temperature) const {
+  if (checksum_hex == nullptr) {
+    return status_to_io_status(Status::Aborted("Checksum pointer is null"));
+  }
+  uint32_t checksum_value = 0;
+  if (size_limit == 0) {
+    size_limit = std::numeric_limits<uint64_t>::max();
+  }
+
+  std::unique_ptr<SequentialFileReader> src_reader;
+  auto file_options = FileOptions(src_env_options);
+  file_options.temperature = src_temperature;
+  RateLimiter* rate_limiter = options_.backup_rate_limiter.get();
+  IOStatus io_s = SequentialFileReader::Create(
+      src_fs, src, file_options, &src_reader, nullptr /* dbg */, rate_limiter);
+  if (io_s.IsPathNotFound() && src_temperature != Temperature::kUnknown) {
+    // Retry without temperature hint in case the FileSystem is strict with
+    // non-kUnknown temperature option
+    file_options.temperature = Temperature::kUnknown;
+    io_s = SequentialFileReader::Create(src_fs, src, file_options, &src_reader,
+                                        nullptr /* dbg */, rate_limiter);
+  }
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  size_t buf_size = kDefaultCopyFileBufferSize;
+  std::unique_ptr<char[]> buf(new char[buf_size]);
+  Slice data;
+
+  do {
+    if (stop_backup_.load(std::memory_order_acquire)) {
+      return status_to_io_status(Status::Incomplete("Backup stopped"));
+    }
+    size_t buffer_to_read =
+        (buf_size < size_limit) ? buf_size : static_cast<size_t>(size_limit);
+    io_s = src_reader->Read(buffer_to_read, &data, buf.get(),
+                            Env::IO_LOW /* rate_limiter_priority */);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+
+    size_limit -= data.size();
+    checksum_value = crc32c::Extend(checksum_value, data.data(), data.size());
+  } while (data.size() > 0 && size_limit > 0);
+
+  checksum_hex->assign(ChecksumInt32ToHex(checksum_value));
+
+  return io_s;
+}
+
+Status BackupEngineImpl::GetFileDbIdentities(
+    Env* src_env, const EnvOptions& src_env_options,
+    const std::string& file_path, Temperature file_temp,
+    RateLimiter* rate_limiter, std::string* db_id, std::string* db_session_id) {
+  assert(db_id != nullptr || db_session_id != nullptr);
+
+  Options options;
+  options.env = src_env;
+  SstFileDumper sst_reader(options, file_path, file_temp,
+                           2 * 1024 * 1024
+                           /* readahead_size */,
+                           false /* verify_checksum */, false /* output_hex */,
+                           false /* decode_blob_index */, src_env_options,
+                           true /* silent */);
+
+  const TableProperties* table_properties = nullptr;
+  std::shared_ptr<const TableProperties> tp;
+  Status s = sst_reader.getStatus();
+
+  if (s.ok()) {
+    // Try to get table properties from the table reader of sst_reader
+    if (!sst_reader.ReadTableProperties(&tp).ok()) {
+      // Try to use table properites from the initialization of sst_reader
+      table_properties = sst_reader.GetInitTableProperties();
+    } else {
+      table_properties = tp.get();
+      if (table_properties != nullptr && rate_limiter != nullptr) {
+        // sizeof(*table_properties) is a sufficent but far-from-exact
+        // approximation of read bytes due to metaindex block, std::string
+        // properties and varint compression
+        LoopRateLimitRequestHelper(sizeof(*table_properties), rate_limiter,
+                                   Env::IO_LOW, nullptr /* stats */,
+                                   RateLimiter::OpType::kRead);
+      }
+    }
+  } else {
+    ROCKS_LOG_INFO(options_.info_log, "Failed to read %s: %s",
+                   file_path.c_str(), s.ToString().c_str());
+    return s;
+  }
+
+  if (table_properties != nullptr) {
+    if (db_id != nullptr) {
+      db_id->assign(table_properties->db_id);
+    }
+    if (db_session_id != nullptr) {
+      db_session_id->assign(table_properties->db_session_id);
+      if (db_session_id->empty()) {
+        s = Status::NotFound("DB session identity not found in " + file_path);
+        ROCKS_LOG_INFO(options_.info_log, "%s", s.ToString().c_str());
+        return s;
+      }
+    }
+    return Status::OK();
+  } else {
+    s = Status::Corruption("Table properties missing in " + file_path);
+    ROCKS_LOG_INFO(options_.info_log, "%s", s.ToString().c_str());
+    return s;
+  }
+}
+
+void BackupEngineImpl::LoopRateLimitRequestHelper(
+    const size_t total_bytes_to_request, RateLimiter* rate_limiter,
+    const Env::IOPriority pri, Statistics* stats,
+    const RateLimiter::OpType op_type) {
+  assert(rate_limiter != nullptr);
+  size_t remaining_bytes = total_bytes_to_request;
+  size_t request_bytes = 0;
+  while (remaining_bytes > 0) {
+    request_bytes =
+        std::min(static_cast<size_t>(rate_limiter->GetSingleBurstBytes()),
+                 remaining_bytes);
+    rate_limiter->Request(request_bytes, pri, stats, op_type);
+    remaining_bytes -= request_bytes;
+  }
+}
+
+void BackupEngineImpl::DeleteChildren(const std::string& dir,
+                                      uint32_t file_type_filter) const {
+  std::vector<std::string> children;
+  db_fs_->GetChildren(dir, io_options_, &children, nullptr)
+      .PermitUncheckedError();  // ignore errors
+
+  for (const auto& f : children) {
+    uint64_t number;
+    FileType type;
+    bool ok = ParseFileName(f, &number, &type);
+    if (ok && (file_type_filter & (1 << type))) {
+      // don't delete this file
+      continue;
+    }
+    db_fs_->DeleteFile(dir + "/" + f, io_options_, nullptr)
+        .PermitUncheckedError();  // ignore errors
+  }
+}
+
+IOStatus BackupEngineImpl::ReadChildFileCurrentSizes(
+    const std::string& dir, const std::shared_ptr<FileSystem>& fs,
+    std::unordered_map<std::string, uint64_t>* result) const {
+  assert(result != nullptr);
+  std::vector<Env::FileAttributes> files_attrs;
+  IOStatus io_status = fs->FileExists(dir, io_options_, nullptr);
+  if (io_status.ok()) {
+    io_status =
+        fs->GetChildrenFileAttributes(dir, io_options_, &files_attrs, nullptr);
+  } else if (io_status.IsNotFound()) {
+    // Insert no entries can be considered success
+    io_status = IOStatus::OK();
+  }
+  const bool slash_needed = dir.empty() || dir.back() != '/';
+  for (const auto& file_attrs : files_attrs) {
+    result->emplace(dir + (slash_needed ? "/" : "") + file_attrs.name,
+                    file_attrs.size_bytes);
+  }
+  return io_status;
+}
+
+IOStatus BackupEngineImpl::GarbageCollect() {
+  assert(!read_only_);
+
+  // We will make a best effort to remove all garbage even in the presence
+  // of inconsistencies or I/O failures that inhibit finding garbage.
+  IOStatus overall_status = IOStatus::OK();
+  // If all goes well, we don't need another auto-GC this session
+  might_need_garbage_collect_ = false;
+
+  ROCKS_LOG_INFO(options_.info_log, "Starting garbage collection");
+
+  // delete obsolete shared files
+  for (bool with_checksum : {false, true}) {
+    std::vector<std::string> shared_children;
+    {
+      std::string shared_path;
+      if (with_checksum) {
+        shared_path = GetAbsolutePath(GetSharedFileWithChecksumRel());
+      } else {
+        shared_path = GetAbsolutePath(GetSharedFileRel());
+      }
+      IOStatus io_s = backup_fs_->FileExists(shared_path, io_options_, nullptr);
+      if (io_s.ok()) {
+        io_s = backup_fs_->GetChildren(shared_path, io_options_,
+                                       &shared_children, nullptr);
+      } else if (io_s.IsNotFound()) {
+        io_s = IOStatus::OK();
+      }
+      if (!io_s.ok()) {
+        overall_status = io_s;
+        // Trying again later might work
+        might_need_garbage_collect_ = true;
+      }
+    }
+    for (auto& child : shared_children) {
+      std::string rel_fname;
+      if (with_checksum) {
+        rel_fname = GetSharedFileWithChecksumRel(child);
+      } else {
+        rel_fname = GetSharedFileRel(child);
+      }
+      auto child_itr = backuped_file_infos_.find(rel_fname);
+      // if it's not refcounted, delete it
+      if (child_itr == backuped_file_infos_.end() ||
+          child_itr->second->refs == 0) {
+        // this might be a directory, but DeleteFile will just fail in that
+        // case, so we're good
+        IOStatus io_s = backup_fs_->DeleteFile(GetAbsolutePath(rel_fname),
+                                               io_options_, nullptr);
+        ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s",
+                       rel_fname.c_str(), io_s.ToString().c_str());
+        backuped_file_infos_.erase(rel_fname);
+        if (!io_s.ok()) {
+          // Trying again later might work
+          might_need_garbage_collect_ = true;
+        }
+      }
+    }
+  }
+
+  // delete obsolete private files
+  std::vector<std::string> private_children;
+  {
+    IOStatus io_s =
+        backup_fs_->GetChildren(GetAbsolutePath(kPrivateDirName), io_options_,
+                                &private_children, nullptr);
+    if (!io_s.ok()) {
+      overall_status = io_s;
+      // Trying again later might work
+      might_need_garbage_collect_ = true;
+    }
+  }
+  for (auto& child : private_children) {
+    BackupID backup_id = 0;
+    bool tmp_dir = child.find(".tmp") != std::string::npos;
+    sscanf(child.c_str(), "%u", &backup_id);
+    if (!tmp_dir &&  // if it's tmp_dir, delete it
+        (backup_id == 0 || backups_.find(backup_id) != backups_.end())) {
+      // it's either not a number or it's still alive. continue
+      continue;
+    }
+    // here we have to delete the dir and all its children
+    std::string full_private_path =
+        GetAbsolutePath(GetPrivateFileRel(backup_id));
+    std::vector<std::string> subchildren;
+    if (backup_fs_
+            ->GetChildren(full_private_path, io_options_, &subchildren, nullptr)
+            .ok()) {
+      for (auto& subchild : subchildren) {
+        IOStatus io_s = backup_fs_->DeleteFile(full_private_path + subchild,
+                                               io_options_, nullptr);
+        ROCKS_LOG_INFO(options_.info_log, "Deleting %s -- %s",
+                       (full_private_path + subchild).c_str(),
+                       io_s.ToString().c_str());
+        if (!io_s.ok()) {
+          // Trying again later might work
+          might_need_garbage_collect_ = true;
+        }
+      }
+    }
+    // finally delete the private dir
+    IOStatus io_s =
+        backup_fs_->DeleteDir(full_private_path, io_options_, nullptr);
+    ROCKS_LOG_INFO(options_.info_log, "Deleting dir %s -- %s",
+                   full_private_path.c_str(), io_s.ToString().c_str());
+    if (!io_s.ok()) {
+      // Trying again later might work
+      might_need_garbage_collect_ = true;
+    }
+  }
+
+  assert(overall_status.ok() || might_need_garbage_collect_);
+  return overall_status;
+}
+
+// ------- BackupMeta class --------
+
+IOStatus BackupEngineImpl::BackupMeta::AddFile(
+    std::shared_ptr<FileInfo> file_info) {
+  auto itr = file_infos_->find(file_info->filename);
+  if (itr == file_infos_->end()) {
+    auto ret = file_infos_->insert({file_info->filename, file_info});
+    if (ret.second) {
+      itr = ret.first;
+      itr->second->refs = 1;
+    } else {
+      // if this happens, something is seriously wrong
+      return IOStatus::Corruption("In memory metadata insertion error");
+    }
+  } else {
+    // Compare sizes, because we scanned that off the filesystem on both
+    // ends. This is like a check in VerifyBackup.
+    if (itr->second->size != file_info->size) {
+      std::string msg = "Size mismatch for existing backup file: ";
+      msg.append(file_info->filename);
+      msg.append(" Size in backup is " + std::to_string(itr->second->size) +
+                 " while size in DB is " + std::to_string(file_info->size));
+      msg.append(
+          " If this DB file checks as not corrupt, try deleting old"
+          " backups or backing up to a different backup directory.");
+      return IOStatus::Corruption(msg);
+    }
+    if (file_info->checksum_hex.empty()) {
+      // No checksum available to check
+    } else if (itr->second->checksum_hex.empty()) {
+      // Remember checksum if newly acquired
+      itr->second->checksum_hex = file_info->checksum_hex;
+    } else if (itr->second->checksum_hex != file_info->checksum_hex) {
+      // Note: to save I/O, these will be equal trivially on already backed
+      // up files that don't have the checksum in their name. And it should
+      // never fail for files that do have checksum in their name.
+
+      // Should never reach here, but produce an appropriate corruption
+      // message in case we do in a release build.
+      assert(false);
+      std::string msg = "Checksum mismatch for existing backup file: ";
+      msg.append(file_info->filename);
+      msg.append(" Expected checksum is " + itr->second->checksum_hex +
+                 " while computed checksum is " + file_info->checksum_hex);
+      msg.append(
+          " If this DB file checks as not corrupt, try deleting old"
+          " backups or backing up to a different backup directory.");
+      return IOStatus::Corruption(msg);
+    }
+    ++itr->second->refs;  // increase refcount if already present
+  }
+
+  size_ += file_info->size;
+  files_.push_back(itr->second);
+
+  return IOStatus::OK();
+}
+
+IOStatus BackupEngineImpl::BackupMeta::Delete(bool delete_meta) {
+  IOStatus io_s;
+  for (const auto& file : files_) {
+    --file->refs;  // decrease refcount
+  }
+  files_.clear();
+  // delete meta file
+  if (delete_meta) {
+    io_s = fs_->FileExists(meta_filename_, iooptions_, nullptr);
+    if (io_s.ok()) {
+      io_s = fs_->DeleteFile(meta_filename_, iooptions_, nullptr);
+    } else if (io_s.IsNotFound()) {
+      io_s = IOStatus::OK();  // nothing to delete
+    }
+  }
+  timestamp_ = 0;
+  return io_s;
+}
+
+// Constants for backup meta file schema (see LoadFromFile)
+const std::string kSchemaVersionPrefix{"schema_version "};
+const std::string kFooterMarker{"// FOOTER"};
+
+const std::string kAppMetaDataFieldName{"metadata"};
+
+// WART: The checksums are crc32c but named "crc32"
+const std::string kFileCrc32cFieldName{"crc32"};
+const std::string kFileSizeFieldName{"size"};
+const std::string kTemperatureFieldName{"temp"};
+
+// Marks a (future) field that should cause failure if not recognized.
+// Other fields are assumed to be ignorable. For example, in the future
+// we might add
+//  ni::file_name_escape uri_percent
+// to indicate all file names have had spaces and special characters
+// escaped using a URI percent encoding.
+const std::string kNonIgnorableFieldPrefix{"ni::"};
+
+// Each backup meta file is of the format (schema version 1):
+//----------------------------------------------------------
+// <timestamp>
+// <seq number>
+// metadata <metadata> (optional)
+// <number of files>
+// <file1> crc32 <crc32c_as_unsigned_decimal>
+// <file2> crc32 <crc32c_as_unsigned_decimal>
+// ...
+//----------------------------------------------------------
+//
+// For schema version 2.x (not in public APIs, but
+// forward-compatibility started):
+//----------------------------------------------------------
+// schema_version <ver>
+// <timestamp>
+// <seq number>
+// [<field name> <field data>]
+// ...
+// <number of files>
+// <file1>( <field name> <field data no spaces>)*
+// <file2>( <field name> <field data no spaces>)*
+// ...
+// [// FOOTER]
+// [<field name> <field data>]
+// ...
+//----------------------------------------------------------
+// where
+// <ver> ::= [0-9]+([.][0-9]+)
+// <field name> ::= [A-Za-z_][A-Za-z_0-9.]+
+// <field data> is anything but newline
+// <field data no spaces> is anything but space and newline
+// Although "// FOOTER" wouldn't strictly be required as a delimiter
+// given the number of files is included, it is there for parsing
+// sanity in case of corruption. It is only required if followed
+// by footer fields, such as a checksum of the meta file (so far).
+// Unrecognized fields are ignored, to support schema evolution on
+// non-critical features with forward compatibility. Update schema
+// major version for breaking changes. Schema minor versions are indicated
+// only for diagnostic/debugging purposes.
+//
+// Fields in schema version 2.0:
+// * Top-level meta fields:
+//   * Only "metadata" as in schema version 1
+// * File meta fields:
+//   * "crc32" - a crc32c checksum as in schema version 1
+//   * "size" - the size of the file (new)
+// * Footer meta fields:
+//   * None yet (future use for meta file checksum anticipated)
+//
+IOStatus BackupEngineImpl::BackupMeta::LoadFromFile(
+    const std::string& backup_dir,
+    const std::unordered_map<std::string, uint64_t>& abs_path_to_size,
+    RateLimiter* rate_limiter, Logger* info_log,
+    std::unordered_set<std::string>* reported_ignored_fields) {
+  assert(reported_ignored_fields);
+  assert(Empty());
+
+  std::unique_ptr<LineFileReader> backup_meta_reader;
+  {
+    IOStatus io_s = LineFileReader::Create(fs_, meta_filename_, FileOptions(),
+                                           &backup_meta_reader,
+                                           nullptr /* dbg */, rate_limiter);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  // If we don't read an explicit schema_version, that implies version 1,
+  // which is what we call the original backup meta schema.
+  int schema_major_version = 1;
+
+  // Failures handled at the end
+  std::string line;
+  if (backup_meta_reader->ReadLine(&line,
+                                   Env::IO_LOW /* rate_limiter_priority */)) {
+    if (StartsWith(line, kSchemaVersionPrefix)) {
+      std::string ver = line.substr(kSchemaVersionPrefix.size());
+      if (ver == "2" || StartsWith(ver, "2.")) {
+        schema_major_version = 2;
+      } else {
+        return IOStatus::NotSupported(
+            "Unsupported/unrecognized schema version: " + ver);
+      }
+      line.clear();
+    } else if (line.empty()) {
+      return IOStatus::Corruption("Unexpected empty line");
+    }
+  }
+  if (!line.empty()) {
+    timestamp_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10);
+  } else if (backup_meta_reader->ReadLine(
+                 &line, Env::IO_LOW /* rate_limiter_priority */)) {
+    timestamp_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10);
+  }
+  if (backup_meta_reader->ReadLine(&line,
+                                   Env::IO_LOW /* rate_limiter_priority */)) {
+    sequence_number_ = std::strtoull(line.c_str(), nullptr, /*base*/ 10);
+  }
+  uint32_t num_files = UINT32_MAX;
+  while (backup_meta_reader->ReadLine(
+      &line, Env::IO_LOW /* rate_limiter_priority */)) {
+    if (line.empty()) {
+      return IOStatus::Corruption("Unexpected empty line");
+    }
+    // Number -> number of files -> exit loop reading optional meta fields
+    if (line[0] >= '0' && line[0] <= '9') {
+      num_files = static_cast<uint32_t>(strtoul(line.c_str(), nullptr, 10));
+      break;
+    }
+    // else, must be a meta field assignment
+    auto space_pos = line.find_first_of(' ');
+    if (space_pos == std::string::npos) {
+      return IOStatus::Corruption("Expected number of files or meta field");
+    }
+    std::string field_name = line.substr(0, space_pos);
+    std::string field_data = line.substr(space_pos + 1);
+    if (field_name == kAppMetaDataFieldName) {
+      // app metadata present
+      bool decode_success = Slice(field_data).DecodeHex(&app_metadata_);
+      if (!decode_success) {
+        return IOStatus::Corruption(
+            "Failed to decode stored hex encoded app metadata");
+      }
+    } else if (schema_major_version < 2) {
+      return IOStatus::Corruption("Expected number of files or \"" +
+                                  kAppMetaDataFieldName + "\" field");
+    } else if (StartsWith(field_name, kNonIgnorableFieldPrefix)) {
+      return IOStatus::NotSupported("Unrecognized non-ignorable meta field " +
+                                    field_name + " (from future version?)");
+    } else {
+      // Warn the first time we see any particular unrecognized meta field
+      if (reported_ignored_fields->insert("meta:" + field_name).second) {
+        ROCKS_LOG_WARN(info_log, "Ignoring unrecognized backup meta field %s",
+                       field_name.c_str());
+      }
+    }
+  }
+  std::vector<std::shared_ptr<FileInfo>> files;
+  bool footer_present = false;
+  while (backup_meta_reader->ReadLine(
+      &line, Env::IO_LOW /* rate_limiter_priority */)) {
+    std::vector<std::string> components = StringSplit(line, ' ');
+
+    if (components.size() < 1) {
+      return IOStatus::Corruption("Empty line instead of file entry.");
+    }
+    if (schema_major_version >= 2 && components.size() == 2 &&
+        line == kFooterMarker) {
+      footer_present = true;
+      break;
+    }
+
+    const std::string& filename = components[0];
+
+    uint64_t actual_size;
+    const std::shared_ptr<FileInfo> file_info = GetFile(filename);
+    if (file_info) {
+      actual_size = file_info->size;
+    } else {
+      std::string abs_path = backup_dir + "/" + filename;
+      auto e = abs_path_to_size.find(abs_path);
+      if (e == abs_path_to_size.end()) {
+        return IOStatus::Corruption(
+            "Pathname in meta file not found on disk: " + abs_path);
+      }
+      actual_size = e->second;
+    }
+
+    if (schema_major_version >= 2) {
+      if (components.size() % 2 != 1) {
+        return IOStatus::Corruption(
+            "Bad number of line components for file entry.");
+      }
+    } else {
+      // Check restricted original schema
+      if (components.size() < 3) {
+        return IOStatus::Corruption("File checksum is missing for " + filename +
+                                    " in " + meta_filename_);
+      }
+      if (components[1] != kFileCrc32cFieldName) {
+        return IOStatus::Corruption("Unknown checksum type for " + filename +
+                                    " in " + meta_filename_);
+      }
+      if (components.size() > 3) {
+        return IOStatus::Corruption("Extra data for entry " + filename +
+                                    " in " + meta_filename_);
+      }
+    }
+
+    std::string checksum_hex;
+    Temperature temp = Temperature::kUnknown;
+    for (unsigned i = 1; i < components.size(); i += 2) {
+      const std::string& field_name = components[i];
+      const std::string& field_data = components[i + 1];
+
+      if (field_name == kFileCrc32cFieldName) {
+        uint32_t checksum_value =
+            static_cast<uint32_t>(strtoul(field_data.c_str(), nullptr, 10));
+        if (field_data != std::to_string(checksum_value)) {
+          return IOStatus::Corruption("Invalid checksum value for " + filename +
+                                      " in " + meta_filename_);
+        }
+        checksum_hex = ChecksumInt32ToHex(checksum_value);
+      } else if (field_name == kFileSizeFieldName) {
+        uint64_t ex_size =
+            std::strtoull(field_data.c_str(), nullptr, /*base*/ 10);
+        if (ex_size != actual_size) {
+          return IOStatus::Corruption(
+              "For file " + filename + " expected size " +
+              std::to_string(ex_size) + " but found size" +
+              std::to_string(actual_size));
+        }
+      } else if (field_name == kTemperatureFieldName) {
+        auto iter = temperature_string_map.find(field_data);
+        if (iter != temperature_string_map.end()) {
+          temp = iter->second;
+        } else {
+          // Could report corruption, but in case of new temperatures added
+          // in future, letting those map to kUnknown which should generally
+          // be safe.
+          temp = Temperature::kUnknown;
+        }
+      } else if (StartsWith(field_name, kNonIgnorableFieldPrefix)) {
+        return IOStatus::NotSupported("Unrecognized non-ignorable file field " +
+                                      field_name + " (from future version?)");
+      } else {
+        // Warn the first time we see any particular unrecognized file field
+        if (reported_ignored_fields->insert("file:" + field_name).second) {
+          ROCKS_LOG_WARN(info_log, "Ignoring unrecognized backup file field %s",
+                         field_name.c_str());
+        }
+      }
+    }
+
+    files.emplace_back(new FileInfo(filename, actual_size, checksum_hex,
+                                    /*id*/ "", /*sid*/ "", temp));
+  }
+
+  if (footer_present) {
+    assert(schema_major_version >= 2);
+    while (backup_meta_reader->ReadLine(
+        &line, Env::IO_LOW /* rate_limiter_priority */)) {
+      if (line.empty()) {
+        return IOStatus::Corruption("Unexpected empty line");
+      }
+      auto space_pos = line.find_first_of(' ');
+      if (space_pos == std::string::npos) {
+        return IOStatus::Corruption("Expected footer field");
+      }
+      std::string field_name = line.substr(0, space_pos);
+      std::string field_data = line.substr(space_pos + 1);
+      if (StartsWith(field_name, kNonIgnorableFieldPrefix)) {
+        return IOStatus::NotSupported("Unrecognized non-ignorable field " +
+                                      field_name + " (from future version?)");
+      } else if (reported_ignored_fields->insert("footer:" + field_name)
+                     .second) {
+        // Warn the first time we see any particular unrecognized footer field
+        ROCKS_LOG_WARN(info_log,
+                       "Ignoring unrecognized backup meta footer field %s",
+                       field_name.c_str());
+      }
+    }
+  }
+
+  {
+    IOStatus io_s = backup_meta_reader->GetStatus();
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  if (num_files != files.size()) {
+    return IOStatus::Corruption(
+        "Inconsistent number of files or missing/incomplete header in " +
+        meta_filename_);
+  }
+
+  files_.reserve(files.size());
+  for (const auto& file_info : files) {
+    IOStatus io_s = AddFile(file_info);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+  }
+
+  return IOStatus::OK();
+}
+
+const std::vector<std::string> minor_version_strings{
+    "",  // invalid major version 0
+    "",  // implicit major version 1
+    "2.0",
+};
+
+IOStatus BackupEngineImpl::BackupMeta::StoreToFile(
+    bool sync, int schema_version,
+    const TEST_BackupMetaSchemaOptions* schema_test_options) {
+  if (schema_version < 1) {
+    return IOStatus::InvalidArgument(
+        "BackupEngineOptions::schema_version must be >= 1");
+  }
+  if (schema_version > static_cast<int>(minor_version_strings.size() - 1)) {
+    return IOStatus::NotSupported(
+        "Only BackupEngineOptions::schema_version <= " +
+        std::to_string(minor_version_strings.size() - 1) + " is supported");
+  }
+  std::string ver = minor_version_strings[schema_version];
+
+  // Need schema_version >= 2 for TEST_BackupMetaSchemaOptions
+  assert(schema_version >= 2 || schema_test_options == nullptr);
+
+  IOStatus io_s;
+  std::unique_ptr<FSWritableFile> backup_meta_file;
+  FileOptions file_options;
+  file_options.use_mmap_writes = false;
+  file_options.use_direct_writes = false;
+  io_s = fs_->NewWritableFile(meta_tmp_filename_, file_options,
+                              &backup_meta_file, nullptr);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  std::ostringstream buf;
+  if (schema_test_options) {
+    // override for testing
+    ver = schema_test_options->version;
+  }
+  if (!ver.empty()) {
+    assert(schema_version >= 2);
+    buf << kSchemaVersionPrefix << ver << "\n";
+  }
+  buf << static_cast<unsigned long long>(timestamp_) << "\n";
+  buf << sequence_number_ << "\n";
+
+  if (!app_metadata_.empty()) {
+    std::string hex_encoded_metadata =
+        Slice(app_metadata_).ToString(/* hex */ true);
+    buf << kAppMetaDataFieldName << " " << hex_encoded_metadata << "\n";
+  }
+  if (schema_test_options) {
+    for (auto& e : schema_test_options->meta_fields) {
+      buf << e.first << " " << e.second << "\n";
+    }
+  }
+  buf << files_.size() << "\n";
+
+  for (const auto& file : files_) {
+    buf << file->filename;
+    if (schema_test_options == nullptr ||
+        schema_test_options->crc32c_checksums) {
+      // use crc32c for now, switch to something else if needed
+      buf << " " << kFileCrc32cFieldName << " "
+          << ChecksumHexToInt32(file->checksum_hex);
+    }
+    if (schema_version >= 2 && file->temp != Temperature::kUnknown) {
+      buf << " " << kTemperatureFieldName << " "
+          << temperature_to_string[file->temp];
+    }
+    if (schema_test_options && schema_test_options->file_sizes) {
+      buf << " " << kFileSizeFieldName << " " << std::to_string(file->size);
+    }
+    if (schema_test_options) {
+      for (auto& e : schema_test_options->file_fields) {
+        buf << " " << e.first << " " << e.second;
+      }
+    }
+    buf << "\n";
+  }
+
+  if (schema_test_options && !schema_test_options->footer_fields.empty()) {
+    buf << kFooterMarker << "\n";
+    for (auto& e : schema_test_options->footer_fields) {
+      buf << e.first << " " << e.second << "\n";
+    }
+  }
+
+  io_s = backup_meta_file->Append(Slice(buf.str()), iooptions_, nullptr);
+  IOSTATS_ADD(bytes_written, buf.str().size());
+  if (io_s.ok() && sync) {
+    io_s = backup_meta_file->Sync(iooptions_, nullptr);
+  }
+  if (io_s.ok()) {
+    io_s = backup_meta_file->Close(iooptions_, nullptr);
+  }
+  if (io_s.ok()) {
+    io_s = fs_->RenameFile(meta_tmp_filename_, meta_filename_, iooptions_,
+                           nullptr);
+  }
+  return io_s;
+}
+}  // namespace
+
+IOStatus BackupEngineReadOnly::Open(const BackupEngineOptions& options,
+                                    Env* env,
+                                    BackupEngineReadOnly** backup_engine_ptr) {
+  if (options.destroy_old_data) {
+    return IOStatus::InvalidArgument(
+        "Can't destroy old data with ReadOnly BackupEngine");
+  }
+  std::unique_ptr<BackupEngineImplThreadSafe> backup_engine(
+      new BackupEngineImplThreadSafe(options, env, true /*read_only*/));
+  auto s = backup_engine->Initialize();
+  if (!s.ok()) {
+    *backup_engine_ptr = nullptr;
+    return s;
+  }
+  *backup_engine_ptr = backup_engine.release();
+  return IOStatus::OK();
+}
+
+void TEST_SetBackupMetaSchemaOptions(
+    BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options) {
+  BackupEngineImplThreadSafe* impl =
+      static_cast_with_check<BackupEngineImplThreadSafe>(engine);
+  impl->TEST_SetBackupMetaSchemaOptions(options);
+}
+
+void TEST_SetDefaultRateLimitersClock(
+    BackupEngine* engine,
+    const std::shared_ptr<SystemClock>& backup_rate_limiter_clock,
+    const std::shared_ptr<SystemClock>& restore_rate_limiter_clock) {
+  BackupEngineImplThreadSafe* impl =
+      static_cast_with_check<BackupEngineImplThreadSafe>(engine);
+  impl->TEST_SetDefaultRateLimitersClock(backup_rate_limiter_clock,
+                                         restore_rate_limiter_clock);
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/backup/backup_engine_impl.h b/src/rocksdb/utilities/backup/backup_engine_impl.h
new file mode 100644
index 000000000..398f47f27
--- /dev/null
+++ b/src/rocksdb/utilities/backup/backup_engine_impl.h
@@ -0,0 +1,36 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/backup_engine.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TEST_BackupMetaSchemaOptions {
+  std::string version = "2";
+  bool crc32c_checksums = false;
+  bool file_sizes = true;
+  std::map<std::string, std::string> meta_fields;
+  std::map<std::string, std::string> file_fields;
+  std::map<std::string, std::string> footer_fields;
+};
+
+// Modifies the BackupEngine(Impl) to write backup meta files using the
+// unpublished schema version 2, for the life of this object (not backup_dir).
+// TEST_BackupMetaSchemaOptions offers some customization for testing.
+void TEST_SetBackupMetaSchemaOptions(
+    BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options);
+
+// Modifies the BackupEngine(Impl) to use specified clocks for backup and
+// restore rate limiters created by default if not specified by users for
+// test speedup.
+void TEST_SetDefaultRateLimitersClock(
+    BackupEngine* engine,
+    const std::shared_ptr<SystemClock>& backup_rate_limiter_clock = nullptr,
+    const std::shared_ptr<SystemClock>& restore_rate_limiter_clock = nullptr);
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/backup/backup_engine_test.cc b/src/rocksdb/utilities/backup/backup_engine_test.cc
new file mode 100644
index 000000000..d1f74f769
--- /dev/null
+++ b/src/rocksdb/utilities/backup/backup_engine_test.cc
@@ -0,0 +1,4219 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
+
+#include "rocksdb/utilities/backup_engine.h"
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "env/env_chroot.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+#include "util/stderr_logger.h"
+#include "util/string_util.h"
+#include "utilities/backup/backup_engine_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+using ShareFilesNaming = BackupEngineOptions::ShareFilesNaming;
+const auto kLegacyCrc32cAndFileSize =
+    BackupEngineOptions::kLegacyCrc32cAndFileSize;
+const auto kUseDbSessionId = BackupEngineOptions::kUseDbSessionId;
+const auto kFlagIncludeFileSize = BackupEngineOptions::kFlagIncludeFileSize;
+const auto kNamingDefault = kUseDbSessionId | kFlagIncludeFileSize;
+
+class DummyDB : public StackableDB {
+ public:
+  /* implicit */
+  DummyDB(const Options& options, const std::string& dbname)
+      : StackableDB(nullptr),
+        options_(options),
+        dbname_(dbname),
+        deletions_enabled_(true),
+        sequence_number_(0) {}
+
+  SequenceNumber GetLatestSequenceNumber() const override {
+    return ++sequence_number_;
+  }
+
+  const std::string& GetName() const override { return dbname_; }
+
+  Env* GetEnv() const override { return options_.env; }
+
+  using DB::GetOptions;
+  Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override {
+    return options_;
+  }
+
+  DBOptions GetDBOptions() const override { return DBOptions(options_); }
+
+  Status EnableFileDeletions(bool /*force*/) override {
+    EXPECT_TRUE(!deletions_enabled_);
+    deletions_enabled_ = true;
+    return Status::OK();
+  }
+
+  Status DisableFileDeletions() override {
+    EXPECT_TRUE(deletions_enabled_);
+    deletions_enabled_ = false;
+    return Status::OK();
+  }
+
+  ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
+
+  Status GetLiveFilesStorageInfo(
+      const LiveFilesStorageInfoOptions& opts,
+      std::vector<LiveFileStorageInfo>* files) override {
+    uint64_t number;
+    FileType type;
+    files->clear();
+    for (auto& f : live_files_) {
+      bool success = ParseFileName(f, &number, &type);
+      if (!success) {
+        return Status::InvalidArgument("Bad file name: " + f);
+      }
+      files->emplace_back();
+      LiveFileStorageInfo& info = files->back();
+      info.relative_filename = f;
+      info.directory = dbname_;
+      info.file_number = number;
+      info.file_type = type;
+      if (type == kDescriptorFile) {
+        info.size = 100;  // See TestFs::GetChildrenFileAttributes below
+        info.trim_to_size = true;
+      } else if (type == kCurrentFile) {
+        info.size = 0;
+        info.trim_to_size = true;
+      } else {
+        info.size = 200;  // See TestFs::GetChildrenFileAttributes below
+      }
+      if (opts.include_checksum_info) {
+        info.file_checksum = kUnknownFileChecksum;
+        info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+      }
+    }
+    return Status::OK();
+  }
+
+  // To avoid FlushWAL called on stacked db which is nullptr
+  Status FlushWAL(bool /*sync*/) override { return Status::OK(); }
+
+  std::vector<std::string> live_files_;
+
+ private:
+  Options options_;
+  std::string dbname_;
+  bool deletions_enabled_;
+  mutable SequenceNumber sequence_number_;
+};  // DummyDB
+
+class TestFs : public FileSystemWrapper {
+ public:
+  explicit TestFs(const std::shared_ptr<FileSystem>& t)
+      : FileSystemWrapper(t) {}
+  const char* Name() const override { return "TestFs"; }
+
+  class DummySequentialFile : public FSSequentialFile {
+   public:
+    explicit DummySequentialFile(bool fail_reads)
+        : FSSequentialFile(), rnd_(5), fail_reads_(fail_reads) {}
+    IOStatus Read(size_t n, const IOOptions&, Slice* result, char* scratch,
+                  IODebugContext*) override {
+      if (fail_reads_) {
+        return IOStatus::IOError();
+      }
+      size_t read_size = (n > size_left) ? size_left : n;
+      for (size_t i = 0; i < read_size; ++i) {
+        scratch[i] = rnd_.Next() & 255;
+      }
+      *result = Slice(scratch, read_size);
+      size_left -= read_size;
+      return IOStatus::OK();
+    }
+
+    IOStatus Skip(uint64_t n) override {
+      size_left = (n > size_left) ? size_left - n : 0;
+      return IOStatus::OK();
+    }
+
+   private:
+    size_t size_left = 200;
+    Random rnd_;
+    bool fail_reads_;
+  };
+
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* dbg) override {
+    MutexLock l(&mutex_);
+    if (dummy_sequential_file_) {
+      r->reset(
+          new TestFs::DummySequentialFile(dummy_sequential_file_fail_reads_));
+      return IOStatus::OK();
+    } else {
+      IOStatus s = FileSystemWrapper::NewSequentialFile(f, file_opts, r, dbg);
+      if (s.ok()) {
+        if ((*r)->use_direct_io()) {
+          ++num_direct_seq_readers_;
+        }
+        ++num_seq_readers_;
+      }
+      return s;
+    }
+  }
+
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* dbg) override {
+    MutexLock l(&mutex_);
+    written_files_.push_back(f);
+    if (limit_written_files_ == 0) {
+      return IOStatus::NotSupported("Limit on written files reached");
+    }
+    limit_written_files_--;
+    IOStatus s = FileSystemWrapper::NewWritableFile(f, file_opts, r, dbg);
+    if (s.ok()) {
+      if ((*r)->use_direct_io()) {
+        ++num_direct_writers_;
+      }
+      ++num_writers_;
+    }
+    return s;
+  }
+
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* dbg) override {
+    MutexLock l(&mutex_);
+    IOStatus s = FileSystemWrapper::NewRandomAccessFile(f, file_opts, r, dbg);
+    if (s.ok()) {
+      if ((*r)->use_direct_io()) {
+        ++num_direct_rand_readers_;
+      }
+      ++num_rand_readers_;
+    }
+    return s;
+  }
+
+  IOStatus DeleteFile(const std::string& f, const IOOptions& options,
+                      IODebugContext* dbg) override {
+    MutexLock l(&mutex_);
+    if (fail_delete_files_) {
+      return IOStatus::IOError();
+    }
+    EXPECT_GT(limit_delete_files_, 0U);
+    limit_delete_files_--;
+    return FileSystemWrapper::DeleteFile(f, options, dbg);
+  }
+
+  IOStatus DeleteDir(const std::string& d, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    MutexLock l(&mutex_);
+    if (fail_delete_files_) {
+      return IOStatus::IOError();
+    }
+    return FileSystemWrapper::DeleteDir(d, options, dbg);
+  }
+
+  void AssertWrittenFiles(std::vector<std::string>& should_have_written) {
+    MutexLock l(&mutex_);
+    std::sort(should_have_written.begin(), should_have_written.end());
+    std::sort(written_files_.begin(), written_files_.end());
+
+    ASSERT_EQ(should_have_written, written_files_);
+  }
+
+  void ClearWrittenFiles() {
+    MutexLock l(&mutex_);
+    written_files_.clear();
+  }
+
+  void SetLimitWrittenFiles(uint64_t limit) {
+    MutexLock l(&mutex_);
+    limit_written_files_ = limit;
+  }
+
+  void SetLimitDeleteFiles(uint64_t limit) {
+    MutexLock l(&mutex_);
+    limit_delete_files_ = limit;
+  }
+
+  void SetDeleteFileFailure(bool fail) {
+    MutexLock l(&mutex_);
+    fail_delete_files_ = fail;
+  }
+
+  void SetDummySequentialFile(bool dummy_sequential_file) {
+    MutexLock l(&mutex_);
+    dummy_sequential_file_ = dummy_sequential_file;
+  }
+  void SetDummySequentialFileFailReads(bool dummy_sequential_file_fail_reads) {
+    MutexLock l(&mutex_);
+    dummy_sequential_file_fail_reads_ = dummy_sequential_file_fail_reads;
+  }
+
+  void SetGetChildrenFailure(bool fail) { get_children_failure_ = fail; }
+  IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+                       std::vector<std::string>* r,
+                       IODebugContext* dbg) override {
+    if (get_children_failure_) {
+      return IOStatus::IOError("SimulatedFailure");
+    }
+    return FileSystemWrapper::GetChildren(dir, io_opts, r, dbg);
+  }
+
+  // Some test cases do not actually create the test files (e.g., see
+  // DummyDB::live_files_) - for those cases, we mock those files' attributes
+  // so CreateNewBackup() can get their attributes.
+  void SetFilenamesForMockedAttrs(const std::vector<std::string>& filenames) {
+    filenames_for_mocked_attrs_ = filenames;
+  }
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& options,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* dbg) override {
+    if (filenames_for_mocked_attrs_.size() > 0) {
+      for (const auto& filename : filenames_for_mocked_attrs_) {
+        uint64_t size_bytes = 200;  // Match TestFs
+        if (filename.find("MANIFEST") == 0) {
+          size_bytes = 100;  // Match DummyDB::GetLiveFiles
+        }
+        result->push_back({dir + "/" + filename, size_bytes});
+      }
+      return IOStatus::OK();
+    }
+    return FileSystemWrapper::GetChildrenFileAttributes(dir, options, result,
+                                                        dbg);
+  }
+
+  IOStatus GetFileSize(const std::string& f, const IOOptions& options,
+                       uint64_t* s, IODebugContext* dbg) override {
+    if (filenames_for_mocked_attrs_.size() > 0) {
+      auto fname = f.substr(f.find_last_of('/') + 1);
+      auto filename_iter = std::find(filenames_for_mocked_attrs_.begin(),
+                                     filenames_for_mocked_attrs_.end(), fname);
+      if (filename_iter != filenames_for_mocked_attrs_.end()) {
+        *s = 200;  // Match TestFs
+        if (fname.find("MANIFEST") == 0) {
+          *s = 100;  // Match DummyDB::GetLiveFiles
+        }
+        return IOStatus::OK();
+      }
+      return IOStatus::NotFound(fname);
+    }
+    return FileSystemWrapper::GetFileSize(f, options, s, dbg);
+  }
+
+  void SetCreateDirIfMissingFailure(bool fail) {
+    create_dir_if_missing_failure_ = fail;
+  }
+  IOStatus CreateDirIfMissing(const std::string& d, const IOOptions& options,
+                              IODebugContext* dbg) override {
+    if (create_dir_if_missing_failure_) {
+      return IOStatus::IOError("SimulatedFailure");
+    }
+    return FileSystemWrapper::CreateDirIfMissing(d, options, dbg);
+  }
+
+  void SetNewDirectoryFailure(bool fail) { new_directory_failure_ = fail; }
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override {
+    if (new_directory_failure_) {
+      return IOStatus::IOError("SimulatedFailure");
+    }
+    return FileSystemWrapper::NewDirectory(name, io_opts, result, dbg);
+  }
+
+  void ClearFileOpenCounters() {
+    MutexLock l(&mutex_);
+    num_rand_readers_ = 0;
+    num_direct_rand_readers_ = 0;
+    num_seq_readers_ = 0;
+    num_direct_seq_readers_ = 0;
+    num_writers_ = 0;
+    num_direct_writers_ = 0;
+  }
+
+  int num_rand_readers() { return num_rand_readers_; }
+  int num_direct_rand_readers() { return num_direct_rand_readers_; }
+  int num_seq_readers() { return num_seq_readers_; }
+  int num_direct_seq_readers() { return num_direct_seq_readers_; }
+  int num_writers() { return num_writers_; }
+  // FIXME(?): unused
+  int num_direct_writers() { return num_direct_writers_; }
+
+ private:
+  port::Mutex mutex_;
+  bool dummy_sequential_file_ = false;
+  bool dummy_sequential_file_fail_reads_ = false;
+  std::vector<std::string> written_files_;
+  std::vector<std::string> filenames_for_mocked_attrs_;
+  uint64_t limit_written_files_ = 1000000;
+  uint64_t limit_delete_files_ = 1000000;
+  bool fail_delete_files_ = false;
+
+  bool get_children_failure_ = false;
+  bool create_dir_if_missing_failure_ = false;
+  bool new_directory_failure_ = false;
+
+  // Keeps track of how many files of each type were successfully opened, and
+  // out of those, how many were opened with direct I/O.
+  std::atomic<int> num_rand_readers_{};
+  std::atomic<int> num_direct_rand_readers_{};
+  std::atomic<int> num_seq_readers_{};
+  std::atomic<int> num_direct_seq_readers_{};
+  std::atomic<int> num_writers_{};
+  std::atomic<int> num_direct_writers_{};
+};  // TestFs
+
+class FileManager : public EnvWrapper {
+ public:
+  explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {}
+  const char* Name() const override { return "FileManager"; }
+
+  Status GetRandomFileInDir(const std::string& dir, std::string* fname,
+                            uint64_t* fsize) {
+    std::vector<FileAttributes> children;
+    auto s = GetChildrenFileAttributes(dir, &children);
+    if (!s.ok()) {
+      return s;
+    } else if (children.size() <= 2) {  // . and ..
+      return Status::NotFound("Empty directory: " + dir);
+    }
+    assert(fname != nullptr);
+    while (true) {
+      int i = rnd_.Next() % children.size();
+      fname->assign(dir + "/" + children[i].name);
+      *fsize = children[i].size_bytes;
+      return Status::OK();
+    }
+    // should never get here
+    assert(false);
+    return Status::NotFound("");
+  }
+
+  Status DeleteRandomFileInDir(const std::string& dir) {
+    std::vector<std::string> children;
+    Status s = GetChildren(dir, &children);
+    if (!s.ok()) {
+      return s;
+    }
+    while (true) {
+      int i = rnd_.Next() % children.size();
+      return DeleteFile(dir + "/" + children[i]);
+    }
+    // should never get here
+    assert(false);
+    return Status::NotFound("");
+  }
+
+  Status AppendToRandomFileInDir(const std::string& dir,
+                                 const std::string& data) {
+    std::vector<std::string> children;
+    Status s = GetChildren(dir, &children);
+    if (!s.ok()) {
+      return s;
+    }
+    while (true) {
+      int i = rnd_.Next() % children.size();
+      return WriteToFile(dir + "/" + children[i], data);
+    }
+    // should never get here
+    assert(false);
+    return Status::NotFound("");
+  }
+
+  Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) {
+    std::string file_contents;
+    Status s = ReadFileToString(this, fname, &file_contents);
+    if (!s.ok()) {
+      return s;
+    }
+    s = DeleteFile(fname);
+    if (!s.ok()) {
+      return s;
+    }
+
+    for (uint64_t i = 0; i < bytes_to_corrupt; ++i) {
+      std::string tmp = rnd_.RandomString(1);
+      file_contents[rnd_.Next() % file_contents.size()] = tmp[0];
+    }
+    return WriteToFile(fname, file_contents);
+  }
+
+  Status CorruptFileStart(const std::string& fname) {
+    std::string to_xor = "blah";
+    std::string file_contents;
+    Status s = ReadFileToString(this, fname, &file_contents);
+    if (!s.ok()) {
+      return s;
+    }
+    s = DeleteFile(fname);
+    if (!s.ok()) {
+      return s;
+    }
+    for (size_t i = 0; i < to_xor.size(); ++i) {
+      file_contents[i] ^= to_xor[i];
+    }
+    return WriteToFile(fname, file_contents);
+  }
+
+  Status CorruptChecksum(const std::string& fname, bool appear_valid) {
+    std::string metadata;
+    Status s = ReadFileToString(this, fname, &metadata);
+    if (!s.ok()) {
+      return s;
+    }
+    s = DeleteFile(fname);
+    if (!s.ok()) {
+      return s;
+    }
+
+    auto pos = metadata.find("private");
+    if (pos == std::string::npos) {
+      return Status::Corruption("private file is expected");
+    }
+    pos = metadata.find(" crc32 ", pos + 6);
+    if (pos == std::string::npos) {
+      return Status::Corruption("checksum not found");
+    }
+
+    if (metadata.size() < pos + 7) {
+      return Status::Corruption("bad CRC32 checksum value");
+    }
+
+    if (appear_valid) {
+      if (metadata[pos + 8] == '\n') {
+        // single digit value, safe to insert one more digit
+        metadata.insert(pos + 8, 1, '0');
+      } else {
+        metadata.erase(pos + 8, 1);
+      }
+    } else {
+      metadata[pos + 7] = 'a';
+    }
+
+    return WriteToFile(fname, metadata);
+  }
+
+  Status WriteToFile(const std::string& fname, const std::string& data) {
+    std::unique_ptr<WritableFile> file;
+    EnvOptions env_options;
+    env_options.use_mmap_writes = false;
+    Status s = EnvWrapper::NewWritableFile(fname, &file, env_options);
+    if (!s.ok()) {
+      return s;
+    }
+    return file->Append(Slice(data));
+  }
+
+ private:
+  Random rnd_;
+};  // FileManager
+
+// utility functions
+namespace {
+
+enum FillDBFlushAction {
+  kFlushMost,
+  kFlushAll,
+  kAutoFlushOnly,
+};
+
+// Many tests in this file expect FillDB to write at least one sst file,
+// so the default behavior (if not kAutoFlushOnly) of FillDB is to force
+// a flush. But to ensure coverage of the WAL file case, we also (by default)
+// do one Put after the Flush (kFlushMost).
+size_t FillDB(DB* db, int from, int to,
+              FillDBFlushAction flush_action = kFlushMost) {
+  size_t bytes_written = 0;
+  for (int i = from; i < to; ++i) {
+    std::string key = "testkey" + std::to_string(i);
+    std::string value = "testvalue" + std::to_string(i);
+    bytes_written += key.size() + value.size();
+
+    EXPECT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
+
+    if (flush_action == kFlushMost && i == to - 2) {
+      EXPECT_OK(db->Flush(FlushOptions()));
+    }
+  }
+  if (flush_action == kFlushAll) {
+    EXPECT_OK(db->Flush(FlushOptions()));
+  }
+  return bytes_written;
+}
+
+void AssertExists(DB* db, int from, int to) {
+  for (int i = from; i < to; ++i) {
+    std::string key = "testkey" + std::to_string(i);
+    std::string value;
+    Status s = db->Get(ReadOptions(), Slice(key), &value);
+    ASSERT_EQ(value, "testvalue" + std::to_string(i));
+  }
+}
+
+void AssertEmpty(DB* db, int from, int to) {
+  for (int i = from; i < to; ++i) {
+    std::string key = "testkey" + std::to_string(i);
+    std::string value = "testvalue" + std::to_string(i);
+
+    Status s = db->Get(ReadOptions(), Slice(key), &value);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+}
+}  // namespace
+
+class BackupEngineTest : public testing::Test {
+ public:
+  enum ShareOption {
+    kNoShare,
+    kShareNoChecksum,
+    kShareWithChecksum,
+  };
+
+  const std::vector<ShareOption> kAllShareOptions = {kNoShare, kShareNoChecksum,
+                                                     kShareWithChecksum};
+
+  BackupEngineTest() {
+    // set up files
+    std::string db_chroot = test::PerThreadDBPath("db_for_backup");
+    std::string backup_chroot = test::PerThreadDBPath("db_backups");
+    EXPECT_OK(Env::Default()->CreateDirIfMissing(db_chroot));
+    EXPECT_OK(Env::Default()->CreateDirIfMissing(backup_chroot));
+    dbname_ = "/tempdb";
+    backupdir_ = "/tempbk";
+    latest_backup_ = backupdir_ + "/LATEST_BACKUP";
+
+    // set up FileSystem & Envs
+    db_chroot_fs_ = NewChrootFileSystem(FileSystem::Default(), db_chroot);
+    backup_chroot_fs_ =
+        NewChrootFileSystem(FileSystem::Default(), backup_chroot);
+    test_db_fs_ = std::make_shared<TestFs>(db_chroot_fs_);
+    test_backup_fs_ = std::make_shared<TestFs>(backup_chroot_fs_);
+    SetEnvsFromFileSystems();
+
+    // set up db options
+    options_.create_if_missing = true;
+    options_.paranoid_checks = true;
+    options_.write_buffer_size = 1 << 17;  // 128KB
+    options_.wal_dir = dbname_;
+    options_.enable_blob_files = true;
+
+    // The sync option is not easily testable in unit tests, but should be
+    // smoke tested across all the other backup tests. However, it is
+    // certainly not worth doubling the runtime of backup tests for it.
+    // Thus, we can enable sync for one of our alternate testing
+    // configurations.
+    constexpr bool kUseSync =
+#ifdef ROCKSDB_MODIFY_NPHASH
+        true;
+#else
+        false;
+#endif  // ROCKSDB_MODIFY_NPHASH
+
+    // set up backup db options
+    engine_options_.reset(new BackupEngineOptions(
+        backupdir_, test_backup_env_.get(), /*share_table_files*/ true,
+        logger_.get(), kUseSync));
+
+    // most tests will use multi-threaded backups
+    engine_options_->max_background_operations = 7;
+
+    // delete old files in db
+    DestroyDBWithoutCheck(dbname_, options_);
+
+    // delete old LATEST_BACKUP file, which some tests create for compatibility
+    // testing.
+    backup_chroot_env_->DeleteFile(latest_backup_).PermitUncheckedError();
+  }
+
+  void SetEnvsFromFileSystems() {
+    db_chroot_env_.reset(
+        new CompositeEnvWrapper(Env::Default(), db_chroot_fs_));
+    backup_chroot_env_.reset(
+        new CompositeEnvWrapper(Env::Default(), backup_chroot_fs_));
+    test_db_env_.reset(new CompositeEnvWrapper(Env::Default(), test_db_fs_));
+    options_.env = test_db_env_.get();
+    test_backup_env_.reset(
+        new CompositeEnvWrapper(Env::Default(), test_backup_fs_));
+    if (engine_options_) {
+      engine_options_->backup_env = test_backup_env_.get();
+    }
+    file_manager_.reset(new FileManager(backup_chroot_env_.get()));
+    db_file_manager_.reset(new FileManager(db_chroot_env_.get()));
+
+    // Create logger
+    DBOptions logger_options;
+    logger_options.env = db_chroot_env_.get();
+    ASSERT_OK(CreateLoggerFromOptions(dbname_, logger_options, &logger_));
+  }
+
+  DB* OpenDB() {
+    DB* db;
+    EXPECT_OK(DB::Open(options_, dbname_, &db));
+    return db;
+  }
+
+  void CloseAndReopenDB(bool read_only = false) {
+    // Close DB
+    db_.reset();
+
+    // Open DB
+    test_db_fs_->SetLimitWrittenFiles(1000000);
+    DB* db;
+    if (read_only) {
+      ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db));
+    } else {
+      ASSERT_OK(DB::Open(options_, dbname_, &db));
+    }
+    db_.reset(db);
+  }
+
+  void InitializeDBAndBackupEngine(bool dummy = false) {
+    // reset all the db env defaults
+    test_db_fs_->SetLimitWrittenFiles(1000000);
+    test_db_fs_->SetDummySequentialFile(dummy);
+
+    DB* db;
+    if (dummy) {
+      dummy_db_ = new DummyDB(options_, dbname_);
+      db = dummy_db_;
+    } else {
+      ASSERT_OK(DB::Open(options_, dbname_, &db));
+    }
+    db_.reset(db);
+  }
+
+  virtual void OpenDBAndBackupEngine(
+      bool destroy_old_data = false, bool dummy = false,
+      ShareOption shared_option = kShareNoChecksum) {
+    InitializeDBAndBackupEngine(dummy);
+    // reset backup env defaults
+    test_backup_fs_->SetLimitWrittenFiles(1000000);
+    engine_options_->destroy_old_data = destroy_old_data;
+    engine_options_->share_table_files = shared_option != kNoShare;
+    engine_options_->share_files_with_checksum =
+        shared_option == kShareWithChecksum;
+    OpenBackupEngine(destroy_old_data);
+  }
+
+  void CloseDBAndBackupEngine() {
+    db_.reset();
+    backup_engine_.reset();
+  }
+
+  void OpenBackupEngine(bool destroy_old_data = false) {
+    engine_options_->destroy_old_data = destroy_old_data;
+    engine_options_->info_log = logger_.get();
+    BackupEngine* backup_engine;
+    ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+                                 &backup_engine));
+    backup_engine_.reset(backup_engine);
+  }
+
+  void CloseBackupEngine() { backup_engine_.reset(nullptr); }
+
+  // cross-cutting test of GetBackupInfo
+  void AssertBackupInfoConsistency() {
+    std::vector<BackupInfo> backup_info;
+    backup_engine_->GetBackupInfo(&backup_info, /*with file details*/ true);
+    std::map<std::string, uint64_t> file_sizes;
+
+    // Find the files that are supposed to be there
+    for (auto& backup : backup_info) {
+      uint64_t sum_for_backup = 0;
+      for (auto& file : backup.file_details) {
+        auto e = file_sizes.find(file.relative_filename);
+        if (e == file_sizes.end()) {
+          // fprintf(stderr, "Adding %s -> %u\n",
+          // file.relative_filename.c_str(), (unsigned)file.size);
+          file_sizes[file.relative_filename] = file.size;
+        } else {
+          ASSERT_EQ(file_sizes[file.relative_filename], file.size);
+        }
+        sum_for_backup += file.size;
+      }
+      ASSERT_EQ(backup.size, sum_for_backup);
+    }
+
+    std::vector<BackupID> corrupt_backup_ids;
+    backup_engine_->GetCorruptedBackups(&corrupt_backup_ids);
+    bool has_corrupt = corrupt_backup_ids.size() > 0;
+
+    // Compare with what's in backup dir
+    std::vector<std::string> child_dirs;
+    ASSERT_OK(
+        test_backup_env_->GetChildren(backupdir_ + "/private", &child_dirs));
+    for (auto& dir : child_dirs) {
+      dir = "private/" + dir;
+    }
+    child_dirs.push_back("shared");           // might not exist
+    child_dirs.push_back("shared_checksum");  // might not exist
+    for (auto& dir : child_dirs) {
+      std::vector<std::string> children;
+      test_backup_env_->GetChildren(backupdir_ + "/" + dir, &children)
+          .PermitUncheckedError();
+      // fprintf(stderr, "ls %s\n", (backupdir_ + "/" + dir).c_str());
+      for (auto& file : children) {
+        uint64_t size;
+        size = UINT64_MAX;  // appease clang-analyze
+        std::string rel_file = dir + "/" + file;
+        // fprintf(stderr, "stat %s\n", (backupdir_ + "/" + rel_file).c_str());
+        ASSERT_OK(
+            test_backup_env_->GetFileSize(backupdir_ + "/" + rel_file, &size));
+        auto e = file_sizes.find(rel_file);
+        if (e == file_sizes.end()) {
+          // The only case in which we should find files not reported
+          ASSERT_TRUE(has_corrupt);
+        } else {
+          ASSERT_EQ(e->second, size);
+          file_sizes.erase(e);
+        }
+      }
+    }
+
+    // Everything should have been matched
+    ASSERT_EQ(file_sizes.size(), 0);
+  }
+
+  // restores backup backup_id and asserts the existence of
+  // [start_exist, end_exist> and not-existence of
+  // [end_exist, end>
+  //
+  // if backup_id == 0, it means restore from latest
+  // if end == 0, don't check AssertEmpty
+  void AssertBackupConsistency(BackupID backup_id, uint32_t start_exist,
+                               uint32_t end_exist, uint32_t end = 0,
+                               bool keep_log_files = false) {
+    RestoreOptions restore_options(keep_log_files);
+    bool opened_backup_engine = false;
+    if (backup_engine_.get() == nullptr) {
+      opened_backup_engine = true;
+      OpenBackupEngine();
+    }
+    AssertBackupInfoConsistency();
+
+    // Now perform restore
+    if (backup_id > 0) {
+      ASSERT_OK(backup_engine_->RestoreDBFromBackup(backup_id, dbname_, dbname_,
+                                                    restore_options));
+    } else {
+      ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_,
+                                                          restore_options));
+    }
+    DB* db = OpenDB();
+    // Check DB contents
+    AssertExists(db, start_exist, end_exist);
+    if (end != 0) {
+      AssertEmpty(db, end_exist, end);
+    }
+    delete db;
+    if (opened_backup_engine) {
+      CloseBackupEngine();
+    }
+  }
+
+  void DeleteLogFiles() {
+    std::vector<std::string> delete_logs;
+    ASSERT_OK(db_chroot_env_->GetChildren(dbname_, &delete_logs));
+    for (auto f : delete_logs) {
+      uint64_t number;
+      FileType type;
+      bool ok = ParseFileName(f, &number, &type);
+      if (ok && type == kWalFile) {
+        ASSERT_OK(db_chroot_env_->DeleteFile(dbname_ + "/" + f));
+      }
+    }
+  }
+
+  Status GetDataFilesInDB(const FileType& file_type,
+                          std::vector<FileAttributes>* files) {
+    std::vector<std::string> live;
+    uint64_t ignore_manifest_size;
+    Status s = db_->GetLiveFiles(live, &ignore_manifest_size, /*flush*/ false);
+    if (!s.ok()) {
+      return s;
+    }
+    std::vector<FileAttributes> children;
+    s = test_db_env_->GetChildrenFileAttributes(dbname_, &children);
+    for (const auto& child : children) {
+      FileType type;
+      uint64_t number = 0;
+      if (ParseFileName(child.name, &number, &type) && type == file_type &&
+          std::find(live.begin(), live.end(), "/" + child.name) != live.end()) {
+        files->push_back(child);
+      }
+    }
+    return s;
+  }
+
+  Status GetRandomDataFileInDB(const FileType& file_type,
+                               std::string* fname_out,
+                               uint64_t* fsize_out = nullptr) {
+    Random rnd(6);  // NB: hardly "random"
+    std::vector<FileAttributes> files;
+    Status s = GetDataFilesInDB(file_type, &files);
+    if (!s.ok()) {
+      return s;
+    }
+    if (files.empty()) {
+      return Status::NotFound("");
+    }
+    size_t i = rnd.Uniform(static_cast<int>(files.size()));
+    *fname_out = dbname_ + "/" + files[i].name;
+    if (fsize_out) {
+      *fsize_out = files[i].size_bytes;
+    }
+    return Status::OK();
+  }
+
+  Status CorruptRandomDataFileInDB(const FileType& file_type) {
+    std::string fname;
+    uint64_t fsize = 0;
+    Status s = GetRandomDataFileInDB(file_type, &fname, &fsize);
+    if (!s.ok()) {
+      return s;
+    }
+
+    std::string file_contents;
+    s = ReadFileToString(test_db_env_.get(), fname, &file_contents);
+    if (!s.ok()) {
+      return s;
+    }
+    s = test_db_env_->DeleteFile(fname);
+    if (!s.ok()) {
+      return s;
+    }
+
+    file_contents[0] = (file_contents[0] + 257) % 256;
+    return WriteStringToFile(test_db_env_.get(), file_contents, fname);
+  }
+
+  void AssertDirectoryFilesMatchRegex(const std::string& dir,
+                                      const TestRegex& pattern,
+                                      const std::string& file_type,
+                                      int minimum_count) {
+    std::vector<FileAttributes> children;
+    ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+    int found_count = 0;
+    for (const auto& child : children) {
+      if (EndsWith(child.name, file_type)) {
+        ASSERT_MATCHES_REGEX(child.name, pattern);
+        ++found_count;
+      }
+    }
+    ASSERT_GE(found_count, minimum_count);
+  }
+
+  void AssertDirectoryFilesSizeIndicators(const std::string& dir,
+                                          int minimum_count) {
+    std::vector<FileAttributes> children;
+    ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+    int found_count = 0;
+    for (const auto& child : children) {
+      auto last_underscore = child.name.find_last_of('_');
+      auto last_dot = child.name.find_last_of('.');
+      ASSERT_NE(child.name, child.name.substr(0, last_underscore));
+      ASSERT_NE(child.name, child.name.substr(0, last_dot));
+      ASSERT_LT(last_underscore, last_dot);
+      std::string s = child.name.substr(last_underscore + 1,
+                                        last_dot - (last_underscore + 1));
+      ASSERT_EQ(s, std::to_string(child.size_bytes));
+      ++found_count;
+    }
+    ASSERT_GE(found_count, minimum_count);
+  }
+
+  // files
+  std::string dbname_;
+  std::string backupdir_;
+  std::string latest_backup_;
+
+  // logger_ must be above backup_engine_ such that the engine's destructor,
+  // which uses a raw pointer to the logger, executes first.
+  std::shared_ptr<Logger> logger_;
+
+  // FileSystems
+  std::shared_ptr<FileSystem> db_chroot_fs_;
+  std::shared_ptr<FileSystem> backup_chroot_fs_;
+  std::shared_ptr<TestFs> test_db_fs_;
+  std::shared_ptr<TestFs> test_backup_fs_;
+
+  // Env wrappers
+  std::unique_ptr<Env> db_chroot_env_;
+  std::unique_ptr<Env> backup_chroot_env_;
+  std::unique_ptr<Env> test_db_env_;
+  std::unique_ptr<Env> test_backup_env_;
+  std::unique_ptr<FileManager> file_manager_;
+  std::unique_ptr<FileManager> db_file_manager_;
+
+  // all the dbs!
+  DummyDB* dummy_db_;  // owned as db_ when present
+  std::unique_ptr<DB> db_;
+  std::unique_ptr<BackupEngine> backup_engine_;
+
+  // options
+  Options options_;
+
+ protected:
+  void DestroyDBWithoutCheck(const std::string& dbname,
+                             const Options& options) {
+    // DestroyDB may fail because the db might not be existed for some tests
+    DestroyDB(dbname, options).PermitUncheckedError();
+  }
+
+  std::unique_ptr<BackupEngineOptions> engine_options_;
+};  // BackupEngineTest
+
+void AppendPath(const std::string& path, std::vector<std::string>& v) {
+  for (auto& f : v) {
+    f = path + f;
+  }
+}
+
+class BackupEngineTestWithParam : public BackupEngineTest,
+                                  public testing::WithParamInterface<bool> {
+ public:
+  BackupEngineTestWithParam() {
+    engine_options_->share_files_with_checksum = GetParam();
+  }
+  void OpenDBAndBackupEngine(
+      bool destroy_old_data = false, bool dummy = false,
+      ShareOption shared_option = kShareNoChecksum) override {
+    BackupEngineTest::InitializeDBAndBackupEngine(dummy);
+    // reset backup env defaults
+    test_backup_fs_->SetLimitWrittenFiles(1000000);
+    engine_options_->destroy_old_data = destroy_old_data;
+    engine_options_->share_table_files = shared_option != kNoShare;
+    // NOTE: keep share_files_with_checksum setting from constructor
+    OpenBackupEngine(destroy_old_data);
+  }
+};
+
+TEST_F(BackupEngineTest, FileCollision) {
+  const int keys_iteration = 100;
+  for (const auto& sopt : kAllShareOptions) {
+    OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt);
+    FillDB(db_.get(), 0, keys_iteration);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    FillDB(db_.get(), keys_iteration, keys_iteration * 2);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    CloseDBAndBackupEngine();
+
+    // If the db directory has been cleaned up, it is sensitive to file
+    // collision.
+    DestroyDBWithoutCheck(dbname_, options_);
+
+    // open fresh DB, but old backups present
+    OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */,
+                          sopt);
+    FillDB(db_.get(), 0, keys_iteration);
+    ASSERT_OK(db_->Flush(FlushOptions()));  // like backup would do
+    FillDB(db_.get(), keys_iteration, keys_iteration * 2);
+    if (sopt != kShareNoChecksum) {
+      ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    } else {
+      // The new table files created in FillDB() will clash with the old
+      // backup and sharing tables with no checksum will have the file
+      // collision problem.
+      ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+      ASSERT_OK(backup_engine_->PurgeOldBackups(0));
+      ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    }
+    CloseDBAndBackupEngine();
+
+    // delete old data
+    DestroyDBWithoutCheck(dbname_, options_);
+  }
+}
+
+// This test verifies that the verifyBackup method correctly identifies
+// invalid backups
+TEST_P(BackupEngineTestWithParam, VerifyBackup) {
+  const int keys_iteration = 5000;
+  OpenDBAndBackupEngine(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  }
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  // ---------- case 1. - valid backup -----------
+  ASSERT_TRUE(backup_engine_->VerifyBackup(1).ok());
+
+  // ---------- case 2. - delete a file -----------i
+  ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/1"));
+  ASSERT_TRUE(backup_engine_->VerifyBackup(1).IsNotFound());
+
+  // ---------- case 3. - corrupt a file -----------
+  std::string append_data = "Corrupting a random file";
+  ASSERT_OK(file_manager_->AppendToRandomFileInDir(backupdir_ + "/private/2",
+                                                   append_data));
+  ASSERT_TRUE(backup_engine_->VerifyBackup(2).IsCorruption());
+
+  // ---------- case 4. - invalid backup -----------
+  ASSERT_TRUE(backup_engine_->VerifyBackup(6).IsNotFound());
+  CloseDBAndBackupEngine();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// open DB, write, close DB, backup, restore, repeat
+TEST_P(BackupEngineTestWithParam, OfflineIntegrationTest) {
+  // has to be a big number, so that it triggers the memtable flush
+  const int keys_iteration = 5000;
+  const int max_key = keys_iteration * 4 + 10;
+  // first iter -- flush before backup
+  // second iter -- don't flush before backup
+  for (int iter = 0; iter < 2; ++iter) {
+    // delete old data
+    DestroyDBWithoutCheck(dbname_, options_);
+    bool destroy_data = true;
+
+    // every iteration --
+    // 1. insert new data in the DB
+    // 2. backup the DB
+    // 3. destroy the db
+    // 4. restore the db, check everything is still there
+    for (int i = 0; i < 5; ++i) {
+      // in last iteration, put smaller amount of data,
+      int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
+      // ---- insert new data and back up ----
+      OpenDBAndBackupEngine(destroy_data);
+      destroy_data = false;
+      // kAutoFlushOnly to preserve legacy test behavior (consider updating)
+      FillDB(db_.get(), keys_iteration * i, fill_up_to, kAutoFlushOnly);
+      ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), iter == 0))
+          << "iter: " << iter << ", idx: " << i;
+      CloseDBAndBackupEngine();
+      DestroyDBWithoutCheck(dbname_, options_);
+
+      // ---- make sure it's empty ----
+      DB* db = OpenDB();
+      AssertEmpty(db, 0, fill_up_to);
+      delete db;
+
+      // ---- restore the DB ----
+      OpenBackupEngine();
+      if (i >= 3) {  // test purge old backups
+        // when i == 4, purge to only 1 backup
+        // when i == 3, purge to 2 backups
+        ASSERT_OK(backup_engine_->PurgeOldBackups(5 - i));
+      }
+      // ---- make sure the data is there ---
+      AssertBackupConsistency(0, 0, fill_up_to, max_key);
+      CloseBackupEngine();
+    }
+  }
+}
+
+// open DB, write, backup, write, backup, close, restore
+TEST_P(BackupEngineTestWithParam, OnlineIntegrationTest) {
+  // has to be a big number, so that it triggers the memtable flush
+  const int keys_iteration = 5000;
+  const int max_key = keys_iteration * 4 + 10;
+  Random rnd(7);
+  // delete old data
+  DestroyDBWithoutCheck(dbname_, options_);
+
+  // TODO: Implement & test db_paths support in backup (not supported in
+  // restore)
+  // options_.db_paths.emplace_back(dbname_, 500 * 1024);
+  // options_.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+
+  OpenDBAndBackupEngine(true);
+  // write some data, backup, repeat
+  for (int i = 0; i < 5; ++i) {
+    if (i == 4) {
+      // delete backup number 2, online delete!
+      ASSERT_OK(backup_engine_->DeleteBackup(2));
+    }
+    // in last iteration, put smaller amount of data,
+    // so that backups can share sst files
+    int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
+    // kAutoFlushOnly to preserve legacy test behavior (consider updating)
+    FillDB(db_.get(), keys_iteration * i, fill_up_to, kAutoFlushOnly);
+    // we should get consistent results with flush_before_backup
+    // set to both true and false
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+  }
+  // close and destroy
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, options_);
+
+  // ---- make sure it's empty ----
+  DB* db = OpenDB();
+  AssertEmpty(db, 0, max_key);
+  delete db;
+
+  // ---- restore every backup and verify all the data is there ----
+  OpenBackupEngine();
+  for (int i = 1; i <= 5; ++i) {
+    if (i == 2) {
+      // we deleted backup 2
+      Status s = backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_);
+      ASSERT_TRUE(!s.ok());
+    } else {
+      int fill_up_to = std::min(keys_iteration * i, max_key);
+      AssertBackupConsistency(i, 0, fill_up_to, max_key);
+    }
+  }
+
+  // delete some backups -- this should leave only backups 3 and 5 alive
+  ASSERT_OK(backup_engine_->DeleteBackup(4));
+  ASSERT_OK(backup_engine_->PurgeOldBackups(2));
+
+  std::vector<BackupInfo> backup_info;
+  backup_engine_->GetBackupInfo(&backup_info);
+  ASSERT_EQ(2UL, backup_info.size());
+
+  // check backup 3
+  AssertBackupConsistency(3, 0, 3 * keys_iteration, max_key);
+  // check backup 5
+  AssertBackupConsistency(5, 0, max_key);
+
+  CloseBackupEngine();
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+INSTANTIATE_TEST_CASE_P(BackupEngineTestWithParam, BackupEngineTestWithParam,
+                        ::testing::Bool());
+
+// this will make sure that backup does not copy the same file twice
+TEST_F(BackupEngineTest, NoDoubleCopy_And_AutoGC) {
+  OpenDBAndBackupEngine(true, true);
+
+  // should write 5 DB files + one meta file
+  test_backup_fs_->SetLimitWrittenFiles(7);
+  test_backup_fs_->ClearWrittenFiles();
+  test_db_fs_->SetLimitWrittenFiles(0);
+  dummy_db_->live_files_ = {"00010.sst", "00011.sst", "CURRENT", "MANIFEST-01",
+                            "00011.log"};
+  test_db_fs_->SetFilenamesForMockedAttrs(dummy_db_->live_files_);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+  std::vector<std::string> should_have_written = {
+      "/shared/.00010.sst.tmp", "/shared/.00011.sst.tmp", "/private/1/CURRENT",
+      "/private/1/MANIFEST-01", "/private/1/00011.log",   "/meta/.1.tmp"};
+  AppendPath(backupdir_, should_have_written);
+  test_backup_fs_->AssertWrittenFiles(should_have_written);
+
+  char db_number = '1';
+
+  for (std::string other_sst : {"00015.sst", "00017.sst", "00019.sst"}) {
+    // should write 4 new DB files + one meta file
+    // should not write/copy 00010.sst, since it's already there!
+    test_backup_fs_->SetLimitWrittenFiles(6);
+    test_backup_fs_->ClearWrittenFiles();
+
+    dummy_db_->live_files_ = {"00010.sst", other_sst, "CURRENT", "MANIFEST-01",
+                              "00011.log"};
+    test_db_fs_->SetFilenamesForMockedAttrs(dummy_db_->live_files_);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+    // should not open 00010.sst - it's already there
+
+    ++db_number;
+    std::string private_dir = std::string("/private/") + db_number;
+    should_have_written = {
+        "/shared/." + other_sst + ".tmp", private_dir + "/CURRENT",
+        private_dir + "/MANIFEST-01", private_dir + "/00011.log",
+        std::string("/meta/.") + db_number + ".tmp"};
+    AppendPath(backupdir_, should_have_written);
+    test_backup_fs_->AssertWrittenFiles(should_have_written);
+  }
+
+  ASSERT_OK(backup_engine_->DeleteBackup(1));
+  ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst"));
+
+  // 00011.sst was only in backup 1, should be deleted
+  ASSERT_EQ(Status::NotFound(),
+            test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst"));
+  ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
+
+  // MANIFEST file size should be only 100
+  uint64_t size = 0;
+  ASSERT_OK(test_backup_env_->GetFileSize(backupdir_ + "/private/2/MANIFEST-01",
+                                          &size));
+  ASSERT_EQ(100UL, size);
+  ASSERT_OK(
+      test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size));
+  ASSERT_EQ(200UL, size);
+
+  CloseBackupEngine();
+
+  //
+  // Now simulate incomplete delete by removing just meta
+  //
+  ASSERT_OK(test_backup_env_->DeleteFile(backupdir_ + "/meta/2"));
+
+  OpenBackupEngine();
+
+  // 1 appears to be removed, so
+  // 2 non-corrupt and 0 corrupt seen
+  std::vector<BackupInfo> backup_info;
+  std::vector<BackupID> corrupt_backup_ids;
+  backup_engine_->GetBackupInfo(&backup_info);
+  backup_engine_->GetCorruptedBackups(&corrupt_backup_ids);
+  ASSERT_EQ(2UL, backup_info.size());
+  ASSERT_EQ(0UL, corrupt_backup_ids.size());
+
+  // Keep the two we see, but this should suffice to purge unreferenced
+  // shared files from incomplete delete.
+  ASSERT_OK(backup_engine_->PurgeOldBackups(2));
+
+  // Make sure dangling sst file has been removed (somewhere along this
+  // process). GarbageCollect should not be needed.
+  ASSERT_EQ(Status::NotFound(),
+            test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
+  ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00017.sst"));
+  ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00019.sst"));
+
+  // Now actually purge a good one
+  ASSERT_OK(backup_engine_->PurgeOldBackups(1));
+
+  ASSERT_EQ(Status::NotFound(),
+            test_backup_env_->FileExists(backupdir_ + "/shared/00017.sst"));
+  ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00019.sst"));
+
+  CloseDBAndBackupEngine();
+}
+
+// test various kind of corruptions that may happen:
+// 1. Not able to write a file for backup - that backup should fail,
+//      everything else should work
+// 2. Corrupted backup meta file or missing backuped file - we should
+//      not be able to open that backup, but all other backups should be
+//      fine
+// 3. Corrupted checksum value - if the checksum is not a valid uint32_t,
+//      db open should fail, otherwise, it aborts during the restore process.
+TEST_F(BackupEngineTest, CorruptionsTest) {
+  const int keys_iteration = 5000;
+  Random rnd(6);
+  Status s;
+
+  OpenDBAndBackupEngine(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+  }
+
+  // ---------- case 1. - fail a write -----------
+  // try creating backup 6, but fail a write
+  FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
+  test_backup_fs_->SetLimitWrittenFiles(2);
+  // should fail
+  s = backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2));
+  ASSERT_NOK(s);
+  test_backup_fs_->SetLimitWrittenFiles(1000000);
+  // latest backup should have all the keys
+  CloseDBAndBackupEngine();
+  AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
+
+  // --------- case 2. corrupted backup meta or missing backuped file ----
+  ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3));
+  // since 5 meta is now corrupted, latest backup should be 4
+  AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5);
+  OpenBackupEngine();
+  s = backup_engine_->RestoreDBFromBackup(5, dbname_, dbname_);
+  ASSERT_NOK(s);
+  CloseBackupEngine();
+  ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4"));
+  // 4 is corrupted, 3 is the latest backup now
+  AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5);
+  OpenBackupEngine();
+  s = backup_engine_->RestoreDBFromBackup(4, dbname_, dbname_);
+  CloseBackupEngine();
+  ASSERT_NOK(s);
+
+  // --------- case 3. corrupted checksum value ----
+  ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/3", false));
+  // checksum of backup 3 is an invalid value, this can be detected at
+  // db open time, and it reverts to the previous backup automatically
+  AssertBackupConsistency(0, 0, keys_iteration * 2, keys_iteration * 5);
+  // checksum of the backup 2 appears to be valid, this can cause checksum
+  // mismatch and abort restore process
+  ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/2", true));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  OpenBackupEngine();
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  s = backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_);
+  ASSERT_NOK(s);
+
+  // make sure that no corrupt backups have actually been deleted!
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/1"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/3"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/4"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/5"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/1"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/2"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/3"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/4"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/5"));
+
+  // delete the corrupt backups and then make sure they're actually deleted
+  ASSERT_OK(backup_engine_->DeleteBackup(5));
+  ASSERT_OK(backup_engine_->DeleteBackup(4));
+  ASSERT_OK(backup_engine_->DeleteBackup(3));
+  ASSERT_OK(backup_engine_->DeleteBackup(2));
+  // Should not be needed anymore with auto-GC on DeleteBackup
+  //(void)backup_engine_->GarbageCollect();
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/meta/5"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/private/5"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/meta/4"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/private/4"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/meta/3"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/private/3"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/meta/2"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/private/2"));
+  CloseBackupEngine();
+  AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5);
+
+  // new backup should be 2!
+  OpenDBAndBackupEngine();
+  FillDB(db_.get(), keys_iteration * 1, keys_iteration * 2);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+  CloseDBAndBackupEngine();
+  AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5);
+}
+
+// Corrupt a file but maintain its size
+TEST_F(BackupEngineTest, CorruptFileMaintainSize) {
+  const int keys_iteration = 5000;
+  OpenDBAndBackupEngine(true);
+  // create a backup
+  FillDB(db_.get(), 0, keys_iteration);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  // verify with file size
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  // verify with file checksum
+  ASSERT_OK(backup_engine_->VerifyBackup(1, true));
+
+  std::string file_to_corrupt;
+  uint64_t file_size = 0;
+  // under normal circumstance, there should be at least one nonempty file
+  while (file_size == 0) {
+    // get a random file in /private/1
+    assert(file_manager_
+               ->GetRandomFileInDir(backupdir_ + "/private/1", &file_to_corrupt,
+                                    &file_size)
+               .ok());
+    // corrupt the file by replacing its content by file_size random bytes
+    ASSERT_OK(file_manager_->CorruptFile(file_to_corrupt, file_size));
+  }
+  // file sizes match
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  // file checksums mismatch
+  ASSERT_NOK(backup_engine_->VerifyBackup(1, true));
+  // sanity check, use default second argument
+  ASSERT_OK(backup_engine_->VerifyBackup(1));
+  CloseDBAndBackupEngine();
+
+  // an extra challenge
+  // set share_files_with_checksum to true and do two more backups
+  // corrupt all the table files in shared_checksum but maintain their sizes
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kShareWithChecksum);
+  // creat two backups
+  for (int i = 1; i < 3; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  }
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  std::vector<FileAttributes> children;
+  const std::string dir = backupdir_ + "/shared_checksum";
+  ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+  for (const auto& child : children) {
+    if (child.size_bytes == 0) {
+      continue;
+    }
+    // corrupt the file by replacing its content by file_size random bytes
+    ASSERT_OK(
+        file_manager_->CorruptFile(dir + "/" + child.name, child.size_bytes));
+  }
+  // file sizes match
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  ASSERT_OK(backup_engine_->VerifyBackup(2, false));
+  // file checksums mismatch
+  ASSERT_NOK(backup_engine_->VerifyBackup(1, true));
+  ASSERT_NOK(backup_engine_->VerifyBackup(2, true));
+  CloseDBAndBackupEngine();
+}
+
+// Corrupt a blob file but maintain its size
+TEST_P(BackupEngineTestWithParam, CorruptBlobFileMaintainSize) {
+  const int keys_iteration = 5000;
+  OpenDBAndBackupEngine(true);
+  // create a backup
+  FillDB(db_.get(), 0, keys_iteration);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  // verify with file size
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  // verify with file checksum
+  ASSERT_OK(backup_engine_->VerifyBackup(1, true));
+
+  std::string file_to_corrupt;
+  std::vector<FileAttributes> children;
+
+  std::string dir = backupdir_;
+  if (engine_options_->share_files_with_checksum) {
+    dir += "/shared_checksum";
+  } else {
+    dir += "/shared";
+  }
+
+  ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+
+  for (const auto& child : children) {
+    if (EndsWith(child.name, ".blob") && child.size_bytes != 0) {
+      // corrupt the blob files by replacing its content by file_size random
+      // bytes
+      ASSERT_OK(
+          file_manager_->CorruptFile(dir + "/" + child.name, child.size_bytes));
+    }
+  }
+
+  // file sizes match
+  ASSERT_OK(backup_engine_->VerifyBackup(1, false));
+  // file checksums mismatch
+  ASSERT_NOK(backup_engine_->VerifyBackup(1, true));
+  // sanity check, use default second argument
+  ASSERT_OK(backup_engine_->VerifyBackup(1));
+  CloseDBAndBackupEngine();
+}
+
+// Test if BackupEngine will fail to create new backup if some table has been
+// corrupted and the table file checksum is stored in the DB manifest
+TEST_F(BackupEngineTest, TableFileCorruptedBeforeBackup) {
+  const int keys_iteration = 50000;
+
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kNoShare);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random table file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+  // file_checksum_gen_factory is null, and thus table checksum is not
+  // verified for creating a new backup; no correction is detected
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+
+  // delete old files in db
+  DestroyDBWithoutCheck(dbname_, options_);
+
+  // Enable table file checksum in DB manifest
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kNoShare);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random table file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+  // table file checksum is enabled so we should be able to detect any
+  // corruption
+  ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+}
+
+// Test if BackupEngine will fail to create new backup if some blob files has
+// been corrupted and the blob file checksum is stored in the DB manifest
+TEST_F(BackupEngineTest, BlobFileCorruptedBeforeBackup) {
+  const int keys_iteration = 50000;
+
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kNoShare);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random blob file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+  // file_checksum_gen_factory is null, and thus blob checksum is not
+  // verified for creating a new backup; no correction is detected
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+
+  // delete old files in db
+  DestroyDBWithoutCheck(dbname_, options_);
+
+  // Enable file checksum in DB manifest
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kNoShare);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random blob file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+
+  // file checksum is enabled so we should be able to detect any
+  // corruption
+  ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Test if BackupEngine will fail to create new backup if some table has been
+// corrupted and the table file checksum is stored in the DB manifest for the
+// case when backup table files will be stored in a shared directory
+TEST_P(BackupEngineTestWithParam, TableFileCorruptedBeforeBackup) {
+  const int keys_iteration = 50000;
+
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random table file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+  // cannot detect corruption since DB manifest has no table checksums
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+
+  // delete old files in db
+  DestroyDBWithoutCheck(dbname_, options_);
+
+  // Enable table checksums in DB manifest
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random table file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kTableFile));
+  // corruption is detected
+  ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+}
+
+// Test if BackupEngine will fail to create new backup if some blob files have
+// been corrupted and the blob file checksum is stored in the DB manifest for
+// the case when backup blob files will be stored in a shared directory
+TEST_P(BackupEngineTestWithParam, BlobFileCorruptedBeforeBackup) {
+  const int keys_iteration = 50000;
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random blob file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+  // cannot detect corruption since DB manifest has no blob file checksums
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+
+  // delete old files in db
+  DestroyDBWithoutCheck(dbname_, options_);
+
+  // Enable blob file checksums in DB manifest
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseAndReopenDB(/*read_only*/ true);
+  // corrupt a random blob file in the DB directory
+  ASSERT_OK(CorruptRandomDataFileInDB(kBlobFile));
+  // corruption is detected
+  ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(BackupEngineTest, TableFileWithoutDbChecksumCorruptedDuringBackup) {
+  const int keys_iteration = 50000;
+  engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize;
+  // When share_files_with_checksum is on, we calculate checksums of table
+  // files before and after copying. So we can test whether a corruption has
+  // happened during the file is copied to backup directory.
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kShareWithChecksum);
+
+  FillDB(db_.get(), 0, keys_iteration);
+  std::atomic<bool> corrupted{false};
+  // corrupt files when copying to the backup directory
+  SyncPoint::GetInstance()->SetCallBack(
+      "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup",
+      [&](void* data) {
+        if (data != nullptr) {
+          Slice* d = reinterpret_cast<Slice*>(data);
+          if (!d->empty()) {
+            d->remove_suffix(1);
+            corrupted = true;
+          }
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = backup_engine_->CreateNewBackup(db_.get());
+  if (corrupted) {
+    ASSERT_NOK(s);
+  } else {
+    // should not in this path in normal cases
+    ASSERT_OK(s);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  CloseDBAndBackupEngine();
+  // delete old files in db
+  DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, TableFileWithDbChecksumCorruptedDuringBackup) {
+  const int keys_iteration = 50000;
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  for (auto& sopt : kAllShareOptions) {
+    // Since the default DB table file checksum is on, we obtain checksums of
+    // table files from the DB manifest before copying and verify it with the
+    // one calculated during copying.
+    // Therefore, we can test whether a corruption has happened during the file
+    // being copied to backup directory.
+    OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt);
+
+    FillDB(db_.get(), 0, keys_iteration);
+
+    // corrupt files when copying to the backup directory
+    SyncPoint::GetInstance()->SetCallBack(
+        "BackupEngineImpl::CopyOrCreateFile:CorruptionDuringBackup",
+        [&](void* data) {
+          if (data != nullptr) {
+            Slice* d = reinterpret_cast<Slice*>(data);
+            if (!d->empty()) {
+              d->remove_suffix(1);
+            }
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    // The only case that we can't detect a corruption is when the file
+    // being backed up is empty. But as keys_iteration is large, such
+    // a case shouldn't have happened and we should be able to detect
+    // the corruption.
+    ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get()));
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    CloseDBAndBackupEngine();
+    // delete old files in db
+    DestroyDBWithoutCheck(dbname_, options_);
+  }
+}
+
+TEST_F(BackupEngineTest, InterruptCreationTest) {
+  // Interrupt backup creation by failing new writes and failing cleanup of the
+  // partial state. Then verify a subsequent backup can still succeed.
+  const int keys_iteration = 5000;
+  Random rnd(6);
+
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  FillDB(db_.get(), 0, keys_iteration);
+  test_backup_fs_->SetLimitWrittenFiles(2);
+  test_backup_fs_->SetDeleteFileFailure(true);
+  // should fail creation
+  ASSERT_NOK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+  CloseDBAndBackupEngine();
+  // should also fail cleanup so the tmp directory stays behind
+  ASSERT_OK(backup_chroot_env_->FileExists(backupdir_ + "/private/1/"));
+
+  OpenDBAndBackupEngine(false /* destroy_old_data */);
+  test_backup_fs_->SetLimitWrittenFiles(1000000);
+  test_backup_fs_->SetDeleteFileFailure(false);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+  // latest backup should have all the keys
+  CloseDBAndBackupEngine();
+  AssertBackupConsistency(0, 0, keys_iteration);
+}
+
+TEST_F(BackupEngineTest, FlushCompactDuringBackupCheckpoint) {
+  const int keys_iteration = 5000;
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  for (const auto& sopt : kAllShareOptions) {
+    OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt);
+    FillDB(db_.get(), 0, keys_iteration);
+    // That FillDB leaves a mix of flushed and unflushed data
+    SyncPoint::GetInstance()->LoadDependency(
+        {{"CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1",
+          "BackupEngineTest::FlushCompactDuringBackupCheckpoint:Before"},
+         {"BackupEngineTest::FlushCompactDuringBackupCheckpoint:After",
+          "CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"}});
+    SyncPoint::GetInstance()->EnableProcessing();
+    ROCKSDB_NAMESPACE::port::Thread flush_thread{[this]() {
+      TEST_SYNC_POINT(
+          "BackupEngineTest::FlushCompactDuringBackupCheckpoint:Before");
+      FillDB(db_.get(), keys_iteration, 2 * keys_iteration);
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      DBImpl* dbi = static_cast<DBImpl*>(db_.get());
+      ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+      ASSERT_OK(dbi->TEST_WaitForCompact());
+      TEST_SYNC_POINT(
+          "BackupEngineTest::FlushCompactDuringBackupCheckpoint:After");
+    }};
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    flush_thread.join();
+    CloseDBAndBackupEngine();
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    /* FIXME(peterd): reinstate with option for checksum in file names
+    if (sopt == kShareWithChecksum) {
+      // Ensure we actually got DB manifest checksums by inspecting
+      // shared_checksum file names for hex checksum component
+      TestRegex expected("[^_]+_[0-9A-F]{8}_[^_]+.sst");
+      std::vector<FileAttributes> children;
+      const std::string dir = backupdir_ + "/shared_checksum";
+      ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
+      for (const auto& child : children) {
+        if (child.size_bytes == 0) {
+          continue;
+        }
+        EXPECT_MATCHES_REGEX(child.name, expected);
+      }
+    }
+    */
+    AssertBackupConsistency(0, 0, keys_iteration);
+  }
+}
+
+inline std::string OptionsPath(std::string ret, int backupID) {
+  ret += "/private/";
+  ret += std::to_string(backupID);
+  ret += "/";
+  return ret;
+}
+
+// Backup the LATEST options file to
+// "<backup_dir>/private/<backup_id>/OPTIONS<number>"
+
+TEST_F(BackupEngineTest, BackupOptions) {
+  OpenDBAndBackupEngine(true);
+  for (int i = 1; i < 5; i++) {
+    std::string name;
+    std::vector<std::string> filenames;
+    // Must reset() before reset(OpenDB()) again.
+    // Calling OpenDB() while *db_ is existing will cause LOCK issue
+    db_.reset();
+    db_.reset(OpenDB());
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+    ASSERT_OK(ROCKSDB_NAMESPACE::GetLatestOptionsFileName(db_->GetName(),
+                                                          options_.env, &name));
+    ASSERT_OK(file_manager_->FileExists(OptionsPath(backupdir_, i) + name));
+    ASSERT_OK(backup_chroot_env_->GetChildren(OptionsPath(backupdir_, i),
+                                              &filenames));
+    for (auto fn : filenames) {
+      if (fn.compare(0, 7, "OPTIONS") == 0) {
+        ASSERT_EQ(name, fn);
+      }
+    }
+  }
+
+  CloseDBAndBackupEngine();
+}
+
+TEST_F(BackupEngineTest, SetOptionsBackupRaceCondition) {
+  OpenDBAndBackupEngine(true);
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
+        "BackupEngineTest::SetOptionsBackupRaceCondition:BeforeSetOptions"},
+       {"BackupEngineTest::SetOptionsBackupRaceCondition:AfterSetOptions",
+        "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread setoptions_thread{[this]() {
+    TEST_SYNC_POINT(
+        "BackupEngineTest::SetOptionsBackupRaceCondition:BeforeSetOptions");
+    DBImpl* dbi = static_cast<DBImpl*>(db_.get());
+    // Change arbitrary option to trigger OPTIONS file deletion
+    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+                              {{"paranoid_file_checks", "false"}}));
+    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+                              {{"paranoid_file_checks", "true"}}));
+    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+                              {{"paranoid_file_checks", "false"}}));
+    TEST_SYNC_POINT(
+        "BackupEngineTest::SetOptionsBackupRaceCondition:AfterSetOptions");
+  }};
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  setoptions_thread.join();
+  CloseDBAndBackupEngine();
+}
+
+// This test verifies we don't delete the latest backup when read-only option is
+// set
+TEST_F(BackupEngineTest, NoDeleteWithReadOnly) {
+  const int keys_iteration = 5000;
+  Random rnd(6);
+
+  OpenDBAndBackupEngine(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+  }
+  CloseDBAndBackupEngine();
+  ASSERT_OK(file_manager_->WriteToFile(latest_backup_, "4"));
+
+  engine_options_->destroy_old_data = false;
+  BackupEngineReadOnly* read_only_backup_engine;
+  ASSERT_OK(BackupEngineReadOnly::Open(
+      backup_chroot_env_.get(), *engine_options_, &read_only_backup_engine));
+
+  // assert that data from backup 5 is still here (even though LATEST_BACKUP
+  // says 4 is latest)
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/5"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/5"));
+
+  // Behavior change: We now ignore LATEST_BACKUP contents. This means that
+  // we should have 5 backups, even if LATEST_BACKUP says 4.
+  std::vector<BackupInfo> backup_info;
+  read_only_backup_engine->GetBackupInfo(&backup_info);
+  ASSERT_EQ(5UL, backup_info.size());
+  delete read_only_backup_engine;
+}
+
+TEST_F(BackupEngineTest, FailOverwritingBackups) {
+  options_.write_buffer_size = 1024 * 1024 * 1024;  // 1GB
+  options_.disable_auto_compactions = true;
+
+  // create backups 1, 2, 3, 4, 5
+  OpenDBAndBackupEngine(true);
+  for (int i = 0; i < 5; ++i) {
+    CloseDBAndBackupEngine();
+    DeleteLogFiles();
+    OpenDBAndBackupEngine(false);
+    FillDB(db_.get(), 100 * i, 100 * (i + 1), kFlushAll);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  }
+  CloseDBAndBackupEngine();
+
+  // restore 3
+  OpenBackupEngine();
+  ASSERT_OK(backup_engine_->RestoreDBFromBackup(3, dbname_, dbname_));
+  CloseBackupEngine();
+
+  OpenDBAndBackupEngine(false);
+  // More data, bigger SST
+  FillDB(db_.get(), 1000, 1300, kFlushAll);
+  Status s = backup_engine_->CreateNewBackup(db_.get());
+  // the new backup fails because new table files
+  // clash with old table files from backups 4 and 5
+  // (since write_buffer_size is huge, we can be sure that
+  // each backup will generate only one sst file and that
+  // a file generated here would have the same name as an
+  // sst file generated by backup 4, and will be bigger)
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_OK(backup_engine_->DeleteBackup(4));
+  ASSERT_OK(backup_engine_->DeleteBackup(5));
+  // now, the backup can succeed
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+  CloseDBAndBackupEngine();
+}
+
+TEST_F(BackupEngineTest, NoShareTableFiles) {
+  const int keys_iteration = 5000;
+  OpenDBAndBackupEngine(true, false, kNoShare);
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2)));
+  }
+  CloseDBAndBackupEngine();
+
+  for (int i = 0; i < 5; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * 6);
+  }
+}
+
+// Verify that you can backup and restore with share_files_with_checksum on
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksums) {
+  const int keys_iteration = 5000;
+  OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2)));
+  }
+  CloseDBAndBackupEngine();
+
+  for (int i = 0; i < 5; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * 6);
+  }
+}
+
+// Verify that you can backup and restore using share_files_with_checksum set to
+// false and then transition this option to true
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsTransition) {
+  const int keys_iteration = 5000;
+  // set share_files_with_checksum to false
+  OpenDBAndBackupEngine(true, false, kShareNoChecksum);
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  }
+  CloseDBAndBackupEngine();
+
+  for (int i = 0; i < 5; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * 6);
+  }
+
+  // set share_files_with_checksum to true and do some more backups
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+                        kShareWithChecksum);
+  for (int i = 5; i < 10; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  }
+  CloseDBAndBackupEngine();
+
+  // Verify first (about to delete)
+  AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 11);
+
+  // For an extra challenge, make sure that GarbageCollect / DeleteBackup
+  // is OK even if we open without share_table_files
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+  ASSERT_OK(backup_engine_->DeleteBackup(1));
+  ASSERT_OK(backup_engine_->GarbageCollect());
+  CloseDBAndBackupEngine();
+
+  // Verify rest (not deleted)
+  for (int i = 1; i < 10; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * 11);
+  }
+}
+
+// Verify backup and restore with various naming options, check names
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNaming) {
+  ASSERT_TRUE(engine_options_->share_files_with_checksum_naming ==
+              kNamingDefault);
+
+  const int keys_iteration = 5000;
+
+  OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseDBAndBackupEngine();
+
+  static const std::map<ShareFilesNaming, TestRegex> option_to_expected = {
+      {kLegacyCrc32cAndFileSize, "[0-9]+_[0-9]+_[0-9]+[.]sst"},
+      // kFlagIncludeFileSize redundant here
+      {kLegacyCrc32cAndFileSize | kFlagIncludeFileSize,
+       "[0-9]+_[0-9]+_[0-9]+[.]sst"},
+      {kUseDbSessionId, "[0-9]+_s[0-9A-Z]{20}[.]sst"},
+      {kUseDbSessionId | kFlagIncludeFileSize,
+       "[0-9]+_s[0-9A-Z]{20}_[0-9]+[.]sst"},
+  };
+
+  const TestRegex blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
+
+  for (const auto& pair : option_to_expected) {
+    CloseAndReopenDB();
+    engine_options_->share_files_with_checksum_naming = pair.first;
+    OpenBackupEngine(true /*destroy_old_data*/);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    CloseDBAndBackupEngine();
+    AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2);
+    AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", pair.second,
+                                   ".sst", 1 /* minimum_count */);
+    if (std::string::npos != pair.second.GetPattern().find("_[0-9]+[.]sst")) {
+      AssertDirectoryFilesSizeIndicators(backupdir_ + "/shared_checksum",
+                                         1 /* minimum_count */);
+    }
+
+    AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
+                                   blobfile_pattern, ".blob",
+                                   1 /* minimum_count */);
+  }
+}
+
+// Mimic SST file generated by pre-6.12 releases and verify that
+// old names are always used regardless of naming option.
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsOldFileNaming) {
+  const int keys_iteration = 5000;
+
+  // Pre-6.12 release did not include db id and db session id properties.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+        auto props = static_cast<TableProperties*>(props_vs);
+        props->db_id = "";
+        props->db_session_id = "";
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Corrupting the table properties corrupts the unique id.
+  // Ignore the unique id recorded in the manifest.
+  options_.verify_sst_unique_id_in_manifest = false;
+
+  OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+  FillDB(db_.get(), 0, keys_iteration);
+  CloseDBAndBackupEngine();
+
+  // Old names should always be used on old files
+  const TestRegex sstfile_pattern("[0-9]+_[0-9]+_[0-9]+[.]sst");
+
+  const TestRegex blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
+
+  for (ShareFilesNaming option : {kNamingDefault, kUseDbSessionId}) {
+    CloseAndReopenDB();
+    engine_options_->share_files_with_checksum_naming = option;
+    OpenBackupEngine(true /*destroy_old_data*/);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+    CloseDBAndBackupEngine();
+    AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2);
+    AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
+                                   sstfile_pattern, ".sst",
+                                   1 /* minimum_count */);
+    AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
+                                   blobfile_pattern, ".blob",
+                                   1 /* minimum_count */);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Test how naming options interact with detecting DB corruption
+// between incremental backups
+TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) {
+  const auto share_no_checksum = static_cast<ShareFilesNaming>(0);
+
+  for (bool corrupt_before_first_backup : {false, true}) {
+    for (ShareFilesNaming option :
+         {share_no_checksum, kLegacyCrc32cAndFileSize, kNamingDefault}) {
+      auto share =
+          option == share_no_checksum ? kShareNoChecksum : kShareWithChecksum;
+      if (option != share_no_checksum) {
+        engine_options_->share_files_with_checksum_naming = option;
+      }
+      OpenDBAndBackupEngine(true, false, share);
+      DBImpl* dbi = static_cast<DBImpl*>(db_.get());
+      // A small SST file
+      ASSERT_OK(dbi->Put(WriteOptions(), "x", "y"));
+      ASSERT_OK(dbi->Flush(FlushOptions()));
+      // And a bigger one
+      ASSERT_OK(dbi->Put(WriteOptions(), "y", Random(42).RandomString(500)));
+      ASSERT_OK(dbi->Flush(FlushOptions()));
+      ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+      CloseAndReopenDB(/*read_only*/ true);
+
+      std::vector<FileAttributes> table_files;
+      ASSERT_OK(GetDataFilesInDB(kTableFile, &table_files));
+      ASSERT_EQ(table_files.size(), 2);
+      std::string tf0 = dbname_ + "/" + table_files[0].name;
+      std::string tf1 = dbname_ + "/" + table_files[1].name;
+
+      CloseDBAndBackupEngine();
+
+      if (corrupt_before_first_backup) {
+        // This corrupts a data block, which does not cause DB open
+        // failure, only failure on accessing the block.
+        ASSERT_OK(db_file_manager_->CorruptFileStart(tf0));
+      }
+
+      OpenDBAndBackupEngine(false, false, share);
+      ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+      CloseDBAndBackupEngine();
+
+      // if corrupt_before_first_backup, this undoes the initial corruption
+      ASSERT_OK(db_file_manager_->CorruptFileStart(tf0));
+
+      OpenDBAndBackupEngine(false, false, share);
+      Status s = backup_engine_->CreateNewBackup(db_.get());
+
+      // Even though none of the naming options catch the inconsistency
+      // between the first and second time backing up fname, in the case
+      // of kUseDbSessionId (kNamingDefault), this is an intentional
+      // trade-off to avoid full scan of files from the DB that are
+      // already backed up. If we did the scan, kUseDbSessionId could catch
+      // the corruption. kLegacyCrc32cAndFileSize does the scan (to
+      // compute checksum for name) without catching the corruption,
+      // because the corruption means the names don't merge.
+      EXPECT_OK(s);
+
+      // VerifyBackup doesn't check DB integrity or table file internal
+      // checksums
+      EXPECT_OK(backup_engine_->VerifyBackup(1, true));
+      EXPECT_OK(backup_engine_->VerifyBackup(2, true));
+
+      db_.reset();
+      ASSERT_OK(backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_));
+      {
+        DB* db = OpenDB();
+        s = db->VerifyChecksum();
+        delete db;
+      }
+      if (option != kLegacyCrc32cAndFileSize && !corrupt_before_first_backup) {
+        // Second backup is OK because it used (uncorrupt) file from first
+        // backup instead of (corrupt) file from DB.
+        // This is arguably a good trade-off vs. treating the file as distinct
+        // from the old version, because a file should be more likely to be
+        // corrupt as it ages. Although the backed-up file might also corrupt
+        // with age, the alternative approach (checksum in file name computed
+        // from current DB file contents) wouldn't detect that case at backup
+        // time either. Although you would have both copies of the file with
+        // the alternative approach, that would only last until the older
+        // backup is deleted.
+        ASSERT_OK(s);
+      } else if (option == kLegacyCrc32cAndFileSize &&
+                 corrupt_before_first_backup) {
+        // Second backup is OK because it saved the updated (uncorrupt)
+        // file from DB, instead of the sharing with first backup.
+        // Recall: if corrupt_before_first_backup, [second CorruptFileStart]
+        // undoes the initial corruption.
+        // This is arguably a bad trade-off vs. sharing the old version of the
+        // file because a file should be more likely to corrupt as it ages.
+        // (Not likely that the previously backed-up version was already
+        // corrupt and the new version is non-corrupt. This approach doesn't
+        // help if backed-up version is corrupted after taking the backup.)
+        ASSERT_OK(s);
+      } else {
+        // Something is legitimately corrupted, but we can't be sure what
+        // with information available (TODO? unless one passes block checksum
+        // test and other doesn't. Probably better to use end-to-end full file
+        // checksum anyway.)
+        ASSERT_TRUE(s.IsCorruption());
+      }
+
+      CloseDBAndBackupEngine();
+      DestroyDBWithoutCheck(dbname_, options_);
+    }
+  }
+}
+
+// Test how naming options interact with detecting file size corruption
+// between incremental backups
+TEST_F(BackupEngineTest, FileSizeForIncremental) {
+  const auto share_no_checksum = static_cast<ShareFilesNaming>(0);
+  // TODO: enable blob files once Integrated BlobDB supports DB session id.
+  options_.enable_blob_files = false;
+
+  for (ShareFilesNaming option : {share_no_checksum, kLegacyCrc32cAndFileSize,
+                                  kNamingDefault, kUseDbSessionId}) {
+    auto share =
+        option == share_no_checksum ? kShareNoChecksum : kShareWithChecksum;
+    if (option != share_no_checksum) {
+      engine_options_->share_files_with_checksum_naming = option;
+    }
+    OpenDBAndBackupEngine(true, false, share);
+
+    std::vector<FileAttributes> children;
+    const std::string shared_dir =
+        backupdir_ +
+        (option == share_no_checksum ? "/shared" : "/shared_checksum");
+
+    // A single small SST file
+    ASSERT_OK(db_->Put(WriteOptions(), "x", "y"));
+
+    // First, test that we always detect file size corruption on the shared
+    // backup side on incremental. (Since sizes aren't really part of backup
+    // meta file, this works by querying the filesystem for the sizes.)
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true /*flush*/));
+    CloseDBAndBackupEngine();
+
+    // Corrupt backup SST file
+    ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children));
+    ASSERT_EQ(children.size(), 1U);  // one sst
+    for (const auto& child : children) {
+      if (child.name.size() > 4 && child.size_bytes > 0) {
+        ASSERT_OK(
+            file_manager_->WriteToFile(shared_dir + "/" + child.name, "asdf"));
+        break;
+      }
+    }
+
+    OpenDBAndBackupEngine(false, false, share);
+    Status s = backup_engine_->CreateNewBackup(db_.get());
+    EXPECT_TRUE(s.IsCorruption());
+
+    ASSERT_OK(backup_engine_->PurgeOldBackups(0));
+    CloseDBAndBackupEngine();
+
+    // Second, test that a hypothetical db session id collision would likely
+    // not suffice to corrupt a backup, because there's a good chance of
+    // file size difference (in this test, guaranteed) so either no name
+    // collision or detected collision.
+
+    // Create backup 1
+    OpenDBAndBackupEngine(false, false, share);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+    // Even though we have "the same" DB state as backup 1, we need
+    // to restore to recreate the same conditions as later restore.
+    db_.reset();
+    DestroyDBWithoutCheck(dbname_, options_);
+    ASSERT_OK(backup_engine_->RestoreDBFromBackup(1, dbname_, dbname_));
+    CloseDBAndBackupEngine();
+
+    // Forge session id
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "DBImpl::SetDbSessionId", [](void* sid_void_star) {
+          std::string* sid = static_cast<std::string*>(sid_void_star);
+          *sid = "01234567890123456789";
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    // Create another SST file
+    OpenDBAndBackupEngine(false, false, share);
+    ASSERT_OK(db_->Put(WriteOptions(), "y", "x"));
+
+    // Create backup 2
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true /*flush*/));
+
+    // Restore backup 1 (again)
+    db_.reset();
+    DestroyDBWithoutCheck(dbname_, options_);
+    ASSERT_OK(backup_engine_->RestoreDBFromBackup(1, dbname_, dbname_));
+    CloseDBAndBackupEngine();
+
+    // Create another SST file with same number and db session id, only bigger
+    OpenDBAndBackupEngine(false, false, share);
+    ASSERT_OK(db_->Put(WriteOptions(), "y", Random(42).RandomString(500)));
+
+    // Count backup SSTs files.
+    children.clear();
+    ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children));
+    ASSERT_EQ(children.size(), 2U);  // two sst files
+
+    // Try create backup 3
+    s = backup_engine_->CreateNewBackup(db_.get(), true /*flush*/);
+
+    // Re-count backup SSTs
+    children.clear();
+    ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children));
+
+    if (option == kUseDbSessionId) {
+      // Acceptable to call it corruption if size is not in name and
+      // db session id collision is practically impossible.
+      EXPECT_TRUE(s.IsCorruption());
+      EXPECT_EQ(children.size(), 2U);  // no SST file added
+    } else if (option == share_no_checksum) {
+      // Good to call it corruption if both backups cannot be
+      // accommodated.
+      EXPECT_TRUE(s.IsCorruption());
+      EXPECT_EQ(children.size(), 2U);  // no SST file added
+    } else {
+      // Since opening a DB seems sufficient for detecting size corruption
+      // on the DB side, this should be a good thing, ...
+      EXPECT_OK(s);
+      // ... as long as we did actually treat it as a distinct SST file.
+      EXPECT_EQ(children.size(), 3U);  // Another SST added
+    }
+    CloseDBAndBackupEngine();
+    DestroyDBWithoutCheck(dbname_, options_);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+}
+
+// Verify backup and restore with share_files_with_checksum off and then
+// transition this option to on and share_files_with_checksum_naming to be
+// based on kUseDbSessionId
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNamingTransition) {
+  const int keys_iteration = 5000;
+  // We may set share_files_with_checksum_naming to kLegacyCrc32cAndFileSize
+  // here but even if we don't, it should have no effect when
+  // share_files_with_checksum is false
+  ASSERT_TRUE(engine_options_->share_files_with_checksum_naming ==
+              kNamingDefault);
+  // set share_files_with_checksum to false
+  OpenDBAndBackupEngine(true, false, kShareNoChecksum);
+  int j = 3;
+  for (int i = 0; i < j; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  }
+  CloseDBAndBackupEngine();
+
+  for (int i = 0; i < j; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * (j + 1));
+  }
+
+  // set share_files_with_checksum to true and do some more backups
+  // and use session id in the name of SST file backup
+  ASSERT_TRUE(engine_options_->share_files_with_checksum_naming ==
+              kNamingDefault);
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+                        kShareWithChecksum);
+  FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1));
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+  // Use checksum in the name as well
+  ++j;
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+                        kShareWithChecksum);
+  FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1));
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  // Verify first (about to delete)
+  AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * (j + 1));
+
+  // For an extra challenge, make sure that GarbageCollect / DeleteBackup
+  // is OK even if we open without share_table_files but with
+  // share_files_with_checksum_naming based on kUseDbSessionId
+  ASSERT_TRUE(engine_options_->share_files_with_checksum_naming ==
+              kNamingDefault);
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+  ASSERT_OK(backup_engine_->DeleteBackup(1));
+  ASSERT_OK(backup_engine_->GarbageCollect());
+  CloseDBAndBackupEngine();
+
+  // Verify second (about to delete)
+  AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * (j + 1));
+
+  // Use checksum and file size for backup table file names and open without
+  // share_table_files
+  // Again, make sure that GarbageCollect / DeleteBackup is OK
+  engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize;
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+  ASSERT_OK(backup_engine_->DeleteBackup(2));
+  ASSERT_OK(backup_engine_->GarbageCollect());
+  CloseDBAndBackupEngine();
+
+  // Verify rest (not deleted)
+  for (int i = 2; i < j; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * (j + 1));
+  }
+}
+
+// Verify backup and restore with share_files_with_checksum on and transition
+// from kLegacyCrc32cAndFileSize to kUseDbSessionId
+TEST_F(BackupEngineTest, ShareTableFilesWithChecksumsNewNamingUpgrade) {
+  engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize;
+  const int keys_iteration = 5000;
+  // set share_files_with_checksum to true
+  OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+  int j = 3;
+  for (int i = 0; i < j; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  }
+  CloseDBAndBackupEngine();
+
+  for (int i = 0; i < j; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * (j + 1));
+  }
+
+  engine_options_->share_files_with_checksum_naming = kUseDbSessionId;
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+                        kShareWithChecksum);
+  FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1));
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  ++j;
+  options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+                        kShareWithChecksum);
+  FillDB(db_.get(), keys_iteration * j, keys_iteration * (j + 1));
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  // Verify first (about to delete)
+  AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * (j + 1));
+
+  // For an extra challenge, make sure that GarbageCollect / DeleteBackup
+  // is OK even if we open without share_table_files
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+  ASSERT_OK(backup_engine_->DeleteBackup(1));
+  ASSERT_OK(backup_engine_->GarbageCollect());
+  CloseDBAndBackupEngine();
+
+  // Verify second (about to delete)
+  AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * (j + 1));
+
+  // Use checksum and file size for backup table file names and open without
+  // share_table_files
+  // Again, make sure that GarbageCollect / DeleteBackup is OK
+  engine_options_->share_files_with_checksum_naming = kLegacyCrc32cAndFileSize;
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false, kNoShare);
+  ASSERT_OK(backup_engine_->DeleteBackup(2));
+  ASSERT_OK(backup_engine_->GarbageCollect());
+  CloseDBAndBackupEngine();
+
+  // Verify rest (not deleted)
+  for (int i = 2; i < j; ++i) {
+    AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
+                            keys_iteration * (j + 1));
+  }
+}
+
+// This test simulates cleaning up after aborted or incomplete creation
+// of a new backup.
+TEST_F(BackupEngineTest, DeleteTmpFiles) {
+  for (int cleanup_fn : {1, 2, 3, 4}) {
+    for (ShareOption shared_option : kAllShareOptions) {
+      OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */,
+                            shared_option);
+      ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+      BackupID next_id = 1;
+      BackupID oldest_id = std::numeric_limits<BackupID>::max();
+      {
+        std::vector<BackupInfo> backup_info;
+        backup_engine_->GetBackupInfo(&backup_info);
+        for (const auto& bi : backup_info) {
+          next_id = std::max(next_id, bi.backup_id + 1);
+          oldest_id = std::min(oldest_id, bi.backup_id);
+        }
+      }
+      CloseDBAndBackupEngine();
+
+      // An aborted or incomplete new backup will always be in the next
+      // id (maybe more)
+      std::string next_private = "private/" + std::to_string(next_id);
+
+      // NOTE: both shared and shared_checksum should be cleaned up
+      // regardless of how the backup engine is opened.
+      std::vector<std::string> tmp_files_and_dirs;
+      for (const auto& dir_and_file : {
+               std::make_pair(std::string("shared"),
+                              std::string(".00006.sst.tmp")),
+               std::make_pair(std::string("shared_checksum"),
+                              std::string(".00007.sst.tmp")),
+               std::make_pair(next_private, std::string("00003.sst")),
+           }) {
+        std::string dir = backupdir_ + "/" + dir_and_file.first;
+        ASSERT_OK(file_manager_->CreateDirIfMissing(dir));
+        ASSERT_OK(file_manager_->FileExists(dir));
+
+        std::string file = dir + "/" + dir_and_file.second;
+        ASSERT_OK(file_manager_->WriteToFile(file, "tmp"));
+        ASSERT_OK(file_manager_->FileExists(file));
+
+        tmp_files_and_dirs.push_back(file);
+      }
+      if (cleanup_fn != /*CreateNewBackup*/ 4) {
+        // This exists after CreateNewBackup because it's deleted then
+        // re-created.
+        tmp_files_and_dirs.push_back(backupdir_ + "/" + next_private);
+      }
+
+      OpenDBAndBackupEngine(false /* destroy_old_data */, false /* dummy */,
+                            shared_option);
+      // Need to call one of these explicitly to delete tmp files
+      switch (cleanup_fn) {
+        case 1:
+          ASSERT_OK(backup_engine_->GarbageCollect());
+          break;
+        case 2:
+          ASSERT_OK(backup_engine_->DeleteBackup(oldest_id));
+          break;
+        case 3:
+          ASSERT_OK(backup_engine_->PurgeOldBackups(1));
+          break;
+        case 4:
+          // Does a garbage collect if it sees that next private dir exists
+          ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+          break;
+        default:
+          assert(false);
+      }
+      CloseDBAndBackupEngine();
+      for (std::string file_or_dir : tmp_files_and_dirs) {
+        if (file_manager_->FileExists(file_or_dir) != Status::NotFound()) {
+          FAIL() << file_or_dir << " was expected to be deleted." << cleanup_fn;
+        }
+      }
+    }
+  }
+}
+
+TEST_F(BackupEngineTest, KeepLogFiles) {
+  engine_options_->backup_log_files = false;
+  // basically infinite
+  options_.WAL_ttl_seconds = 24 * 60 * 60;
+  OpenDBAndBackupEngine(true);
+  FillDB(db_.get(), 0, 100, kFlushAll);
+  FillDB(db_.get(), 100, 200, kFlushAll);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+  FillDB(db_.get(), 200, 300, kFlushAll);
+  FillDB(db_.get(), 300, 400, kFlushAll);
+  FillDB(db_.get(), 400, 500, kFlushAll);
+  CloseDBAndBackupEngine();
+
+  // all data should be there if we call with keep_log_files = true
+  AssertBackupConsistency(0, 0, 500, 600, true);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class BackupEngineRateLimitingTestWithParam
+    : public BackupEngineTest,
+      public testing::WithParamInterface<
+          std::tuple<bool /* make throttle */,
+                     int /* 0 = single threaded, 1 = multi threaded*/,
+                     std::pair<uint64_t, uint64_t> /* limits */>> {
+ public:
+  BackupEngineRateLimitingTestWithParam() {}
+};
+
+uint64_t const MB = 1024 * 1024;
+
+INSTANTIATE_TEST_CASE_P(
+    RateLimiting, BackupEngineRateLimitingTestWithParam,
+    ::testing::Values(std::make_tuple(false, 0, std::make_pair(1 * MB, 5 * MB)),
+                      std::make_tuple(false, 0, std::make_pair(2 * MB, 3 * MB)),
+                      std::make_tuple(false, 1, std::make_pair(1 * MB, 5 * MB)),
+                      std::make_tuple(false, 1, std::make_pair(2 * MB, 3 * MB)),
+                      std::make_tuple(true, 0, std::make_pair(1 * MB, 5 * MB)),
+                      std::make_tuple(true, 0, std::make_pair(2 * MB, 3 * MB)),
+                      std::make_tuple(true, 1, std::make_pair(1 * MB, 5 * MB)),
+                      std::make_tuple(true, 1,
+                                      std::make_pair(2 * MB, 3 * MB))));
+
+TEST_P(BackupEngineRateLimitingTestWithParam, RateLimiting) {
+  size_t const kMicrosPerSec = 1000 * 1000LL;
+  const bool custom_rate_limiter = std::get<0>(GetParam());
+  // iter 0 -- single threaded
+  // iter 1 -- multi threaded
+  const int iter = std::get<1>(GetParam());
+  const std::pair<uint64_t, uint64_t> limit = std::get<2>(GetParam());
+  std::unique_ptr<Env> special_env(
+      new SpecialEnv(db_chroot_env_.get(), /*time_elapse_only_sleep*/ true));
+  // destroy old data
+  Options options;
+  options.env = special_env.get();
+  DestroyDBWithoutCheck(dbname_, options);
+
+  if (custom_rate_limiter) {
+    std::shared_ptr<RateLimiter> backup_rate_limiter =
+        std::make_shared<GenericRateLimiter>(
+            limit.first, 100 * 1000 /* refill_period_us */, 10 /* fairness */,
+            RateLimiter::Mode::kWritesOnly /* mode */,
+            special_env->GetSystemClock(), false /* auto_tuned */);
+    std::shared_ptr<RateLimiter> restore_rate_limiter =
+        std::make_shared<GenericRateLimiter>(
+            limit.second, 100 * 1000 /* refill_period_us */, 10 /* fairness */,
+            RateLimiter::Mode::kWritesOnly /* mode */,
+            special_env->GetSystemClock(), false /* auto_tuned */);
+    engine_options_->backup_rate_limiter = backup_rate_limiter;
+    engine_options_->restore_rate_limiter = restore_rate_limiter;
+  } else {
+    engine_options_->backup_rate_limit = limit.first;
+    engine_options_->restore_rate_limit = limit.second;
+  }
+
+  engine_options_->max_background_operations = (iter == 0) ? 1 : 10;
+  options_.compression = kNoCompression;
+
+  // Rate limiter uses `CondVar::TimedWait()`, which does not have access to the
+  // `Env` to advance its time according to the fake wait duration. The
+  // workaround is to install a callback that advance the `Env`'s mock time.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) {
+        int64_t time_waited_us = *static_cast<int64_t*>(arg);
+        special_env->SleepForMicroseconds(static_cast<int>(time_waited_us));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  OpenDBAndBackupEngine(true);
+  TEST_SetDefaultRateLimitersClock(backup_engine_.get(),
+                                   special_env->GetSystemClock());
+
+  size_t bytes_written = FillDB(db_.get(), 0, 10000);
+
+  auto start_backup = special_env->NowMicros();
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+  auto backup_time = special_env->NowMicros() - start_backup;
+  CloseDBAndBackupEngine();
+  auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / limit.first;
+  ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time);
+
+  OpenBackupEngine();
+  TEST_SetDefaultRateLimitersClock(
+      backup_engine_.get(),
+      special_env->GetSystemClock() /* backup_rate_limiter_clock */,
+      special_env->GetSystemClock() /* restore_rate_limiter_clock */);
+
+  auto start_restore = special_env->NowMicros();
+  ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+  auto restore_time = special_env->NowMicros() - start_restore;
+  CloseBackupEngine();
+  auto rate_limited_restore_time =
+      (bytes_written * kMicrosPerSec) / limit.second;
+  ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time);
+
+  AssertBackupConsistency(0, 0, 10000, 10100);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "GenericRateLimiter::Request:PostTimedWait");
+}
+
+TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingVerifyBackup) {
+  const std::size_t kMicrosPerSec = 1000 * 1000LL;
+  const bool custom_rate_limiter = std::get<0>(GetParam());
+  const std::uint64_t backup_rate_limiter_limit = std::get<2>(GetParam()).first;
+  const bool is_single_threaded = std::get<1>(GetParam()) == 0 ? true : false;
+  std::unique_ptr<Env> special_env(
+      new SpecialEnv(db_chroot_env_.get(), /*time_elapse_only_sleep*/ true));
+
+  if (custom_rate_limiter) {
+    std::shared_ptr<RateLimiter> backup_rate_limiter =
+        std::make_shared<GenericRateLimiter>(
+            backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+            10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */,
+            special_env->GetSystemClock(), false /* auto_tuned */);
+    engine_options_->backup_rate_limiter = backup_rate_limiter;
+  } else {
+    engine_options_->backup_rate_limit = backup_rate_limiter_limit;
+  }
+
+  engine_options_->max_background_operations = is_single_threaded ? 1 : 10;
+
+  Options options;
+  options.env = special_env.get();
+  DestroyDBWithoutCheck(dbname_, options);
+  // Rate limiter uses `CondVar::TimedWait()`, which does not have access to the
+  // `Env` to advance its time according to the fake wait duration. The
+  // workaround is to install a callback that advance the `Env`'s mock time.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) {
+        int64_t time_waited_us = *static_cast<int64_t*>(arg);
+        special_env->SleepForMicroseconds(static_cast<int>(time_waited_us));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  TEST_SetDefaultRateLimitersClock(backup_engine_.get(),
+                                   special_env->GetSystemClock(), nullptr);
+  FillDB(db_.get(), 0, 10000);
+
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            false /* flush_before_backup */));
+
+  std::vector<BackupInfo> backup_infos;
+  BackupInfo backup_info;
+  backup_engine_->GetBackupInfo(&backup_infos);
+  ASSERT_EQ(1, backup_infos.size());
+  const int backup_id = 1;
+  ASSERT_EQ(backup_id, backup_infos[0].backup_id);
+  ASSERT_OK(backup_engine_->GetBackupInfo(backup_id, &backup_info,
+                                          true /* include_file_details */));
+
+  std::uint64_t bytes_read_during_verify_backup = 0;
+  for (BackupFileInfo backup_file_info : backup_info.file_details) {
+    bytes_read_during_verify_backup += backup_file_info.size;
+  }
+  auto start_verify_backup = special_env->NowMicros();
+  ASSERT_OK(
+      backup_engine_->VerifyBackup(backup_id, true /* verify_with_checksum */));
+  auto verify_backup_time = special_env->NowMicros() - start_verify_backup;
+  auto rate_limited_verify_backup_time =
+      (bytes_read_during_verify_backup * kMicrosPerSec) /
+      backup_rate_limiter_limit;
+  if (custom_rate_limiter) {
+    EXPECT_GE(verify_backup_time, 0.8 * rate_limited_verify_backup_time);
+  }
+
+  CloseDBAndBackupEngine();
+  AssertBackupConsistency(backup_id, 0, 10000, 10010);
+  DestroyDBWithoutCheck(dbname_, options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "GenericRateLimiter::Request:PostTimedWait");
+}
+
+TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingChargeReadInBackup) {
+  bool is_single_threaded = std::get<1>(GetParam()) == 0 ? true : false;
+  engine_options_->max_background_operations = is_single_threaded ? 1 : 10;
+
+  const std::uint64_t backup_rate_limiter_limit = std::get<2>(GetParam()).first;
+  std::shared_ptr<RateLimiter> backup_rate_limiter(NewGenericRateLimiter(
+      backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+      10 /* fairness */, RateLimiter::Mode::kWritesOnly /* mode */));
+  engine_options_->backup_rate_limiter = backup_rate_limiter;
+
+  DestroyDBWithoutCheck(dbname_, Options());
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kShareWithChecksum /* shared_option */);
+  FillDB(db_.get(), 0, 10);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            false /* flush_before_backup */));
+  std::int64_t total_bytes_through_with_no_read_charged =
+      backup_rate_limiter->GetTotalBytesThrough();
+  CloseBackupEngine();
+
+  backup_rate_limiter.reset(NewGenericRateLimiter(
+      backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+      10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */));
+  engine_options_->backup_rate_limiter = backup_rate_limiter;
+
+  OpenBackupEngine(true);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            false /* flush_before_backup */));
+  std::int64_t total_bytes_through_with_read_charged =
+      backup_rate_limiter->GetTotalBytesThrough();
+  EXPECT_GT(total_bytes_through_with_read_charged,
+            total_bytes_through_with_no_read_charged);
+  CloseDBAndBackupEngine();
+  AssertBackupConsistency(1, 0, 10, 20);
+  DestroyDBWithoutCheck(dbname_, Options());
+}
+
+TEST_P(BackupEngineRateLimitingTestWithParam, RateLimitingChargeReadInRestore) {
+  bool is_single_threaded = std::get<1>(GetParam()) == 0 ? true : false;
+  engine_options_->max_background_operations = is_single_threaded ? 1 : 10;
+
+  const std::uint64_t restore_rate_limiter_limit =
+      std::get<2>(GetParam()).second;
+  std::shared_ptr<RateLimiter> restore_rate_limiter(NewGenericRateLimiter(
+      restore_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+      10 /* fairness */, RateLimiter::Mode::kWritesOnly /* mode */));
+  engine_options_->restore_rate_limiter = restore_rate_limiter;
+
+  DestroyDBWithoutCheck(dbname_, Options());
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  FillDB(db_.get(), 0, 10);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            false /* flush_before_backup */));
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, Options());
+
+  OpenBackupEngine(false /* destroy_old_data */);
+  ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+  std::int64_t total_bytes_through_with_no_read_charged =
+      restore_rate_limiter->GetTotalBytesThrough();
+  CloseBackupEngine();
+  DestroyDBWithoutCheck(dbname_, Options());
+
+  restore_rate_limiter.reset(NewGenericRateLimiter(
+      restore_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+      10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */));
+  engine_options_->restore_rate_limiter = restore_rate_limiter;
+
+  OpenBackupEngine(false /* destroy_old_data */);
+  ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+  std::int64_t total_bytes_through_with_read_charged =
+      restore_rate_limiter->GetTotalBytesThrough();
+  EXPECT_EQ(total_bytes_through_with_read_charged,
+            total_bytes_through_with_no_read_charged * 2);
+  CloseBackupEngine();
+  AssertBackupConsistency(1, 0, 10, 20);
+  DestroyDBWithoutCheck(dbname_, Options());
+}
+
+TEST_P(BackupEngineRateLimitingTestWithParam,
+       RateLimitingChargeReadInInitialize) {
+  bool is_single_threaded = std::get<1>(GetParam()) == 0 ? true : false;
+  engine_options_->max_background_operations = is_single_threaded ? 1 : 10;
+
+  const std::uint64_t backup_rate_limiter_limit = std::get<2>(GetParam()).first;
+  std::shared_ptr<RateLimiter> backup_rate_limiter(NewGenericRateLimiter(
+      backup_rate_limiter_limit, 100 * 1000 /* refill_period_us */,
+      10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */));
+  engine_options_->backup_rate_limiter = backup_rate_limiter;
+
+  DestroyDBWithoutCheck(dbname_, Options());
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  FillDB(db_.get(), 0, 10);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            false /* flush_before_backup */));
+  CloseDBAndBackupEngine();
+  AssertBackupConsistency(1, 0, 10, 20);
+
+  std::int64_t total_bytes_through_before_initialize =
+      engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+  OpenDBAndBackupEngine(false /* destroy_old_data */);
+  // We charge read in BackupEngineImpl::BackupMeta::LoadFromFile,
+  // which is called in BackupEngineImpl::Initialize() during
+  // OpenBackupEngine(false)
+  EXPECT_GT(engine_options_->backup_rate_limiter->GetTotalBytesThrough(),
+            total_bytes_through_before_initialize);
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, Options());
+}
+
+class BackupEngineRateLimitingTestWithParam2
+    : public BackupEngineTest,
+      public testing::WithParamInterface<
+          std::tuple<std::pair<uint64_t, uint64_t> /* limits */>> {
+ public:
+  BackupEngineRateLimitingTestWithParam2() {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+    LowRefillBytesPerPeriod, BackupEngineRateLimitingTestWithParam2,
+    ::testing::Values(std::make_tuple(std::make_pair(1, 1))));
+// To verify we don't request over-sized bytes relative to
+// refill_bytes_per_period_ in each RateLimiter::Request() called in
+// BackupEngine through verifying we don't trigger assertion
+// failure on over-sized request in GenericRateLimiter in debug builds
+TEST_P(BackupEngineRateLimitingTestWithParam2,
+       RateLimitingWithLowRefillBytesPerPeriod) {
+  SpecialEnv special_env(Env::Default(), /*time_elapse_only_sleep*/ true);
+
+  engine_options_->max_background_operations = 1;
+  const uint64_t backup_rate_limiter_limit = std::get<0>(GetParam()).first;
+  std::shared_ptr<RateLimiter> backup_rate_limiter(
+      std::make_shared<GenericRateLimiter>(
+          backup_rate_limiter_limit, 1000 * 1000 /* refill_period_us */,
+          10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */,
+          special_env.GetSystemClock(), false /* auto_tuned */));
+
+  engine_options_->backup_rate_limiter = backup_rate_limiter;
+
+  const uint64_t restore_rate_limiter_limit = std::get<0>(GetParam()).second;
+  std::shared_ptr<RateLimiter> restore_rate_limiter(
+      std::make_shared<GenericRateLimiter>(
+          restore_rate_limiter_limit, 1000 * 1000 /* refill_period_us */,
+          10 /* fairness */, RateLimiter::Mode::kAllIo /* mode */,
+          special_env.GetSystemClock(), false /* auto_tuned */));
+
+  engine_options_->restore_rate_limiter = restore_rate_limiter;
+
+  // Rate limiter uses `CondVar::TimedWait()`, which does not have access to the
+  // `Env` to advance its time according to the fake wait duration. The
+  // workaround is to install a callback that advance the `Env`'s mock time.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) {
+        int64_t time_waited_us = *static_cast<int64_t*>(arg);
+        special_env.SleepForMicroseconds(static_cast<int>(time_waited_us));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  DestroyDBWithoutCheck(dbname_, Options());
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kShareWithChecksum /* shared_option */);
+
+  FillDB(db_.get(), 0, 100);
+  int64_t total_bytes_through_before_backup =
+      engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+  EXPECT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            false /* flush_before_backup */));
+  int64_t total_bytes_through_after_backup =
+      engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+  ASSERT_GT(total_bytes_through_after_backup,
+            total_bytes_through_before_backup);
+
+  std::vector<BackupInfo> backup_infos;
+  BackupInfo backup_info;
+  backup_engine_->GetBackupInfo(&backup_infos);
+  ASSERT_EQ(1, backup_infos.size());
+  const int backup_id = 1;
+  ASSERT_EQ(backup_id, backup_infos[0].backup_id);
+  ASSERT_OK(backup_engine_->GetBackupInfo(backup_id, &backup_info,
+                                          true /* include_file_details */));
+  int64_t total_bytes_through_before_verify_backup =
+      engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+  EXPECT_OK(
+      backup_engine_->VerifyBackup(backup_id, true /* verify_with_checksum */));
+  int64_t total_bytes_through_after_verify_backup =
+      engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+  ASSERT_GT(total_bytes_through_after_verify_backup,
+            total_bytes_through_before_verify_backup);
+
+  CloseDBAndBackupEngine();
+  AssertBackupConsistency(backup_id, 0, 100, 101);
+
+  int64_t total_bytes_through_before_initialize =
+      engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+  OpenDBAndBackupEngine(false /* destroy_old_data */);
+  // We charge read in BackupEngineImpl::BackupMeta::LoadFromFile,
+  // which is called in BackupEngineImpl::Initialize() during
+  // OpenBackupEngine(false)
+  int64_t total_bytes_through_after_initialize =
+      engine_options_->backup_rate_limiter->GetTotalBytesThrough();
+  ASSERT_GT(total_bytes_through_after_initialize,
+            total_bytes_through_before_initialize);
+  CloseDBAndBackupEngine();
+
+  DestroyDBWithoutCheck(dbname_, Options());
+  OpenBackupEngine(false /* destroy_old_data */);
+  int64_t total_bytes_through_before_restore =
+      engine_options_->restore_rate_limiter->GetTotalBytesThrough();
+  EXPECT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+  int64_t total_bytes_through_after_restore =
+      engine_options_->restore_rate_limiter->GetTotalBytesThrough();
+  ASSERT_GT(total_bytes_through_after_restore,
+            total_bytes_through_before_restore);
+  CloseBackupEngine();
+
+  DestroyDBWithoutCheck(dbname_, Options());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+      "GenericRateLimiter::Request:PostTimedWait");
+}
+
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(BackupEngineTest, ReadOnlyBackupEngine) {
+  DestroyDBWithoutCheck(dbname_, options_);
+  OpenDBAndBackupEngine(true);
+  FillDB(db_.get(), 0, 100);
+  // Also test read-only DB with CreateNewBackup and flush=true (no flush)
+  CloseAndReopenDB(/*read_only*/ true);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true));
+  CloseAndReopenDB(/*read_only*/ false);
+  FillDB(db_.get(), 100, 200);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ true));
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, options_);
+
+  engine_options_->destroy_old_data = false;
+  test_backup_fs_->ClearWrittenFiles();
+  test_backup_fs_->SetLimitDeleteFiles(0);
+  BackupEngineReadOnly* read_only_backup_engine;
+  ASSERT_OK(BackupEngineReadOnly::Open(db_chroot_env_.get(), *engine_options_,
+                                       &read_only_backup_engine));
+  std::vector<BackupInfo> backup_info;
+  read_only_backup_engine->GetBackupInfo(&backup_info);
+  ASSERT_EQ(backup_info.size(), 2U);
+
+  RestoreOptions restore_options(false);
+  ASSERT_OK(read_only_backup_engine->RestoreDBFromLatestBackup(
+      dbname_, dbname_, restore_options));
+  delete read_only_backup_engine;
+  std::vector<std::string> should_have_written;
+  test_backup_fs_->AssertWrittenFiles(should_have_written);
+
+  DB* db = OpenDB();
+  AssertExists(db, 0, 200);
+  delete db;
+}
+
+TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
+  DestroyDBWithoutCheck(dbname_, options_);
+  options_.write_dbid_to_manifest = false;
+
+  OpenDBAndBackupEngine(true);
+  FillDB(db_.get(), 0, 100);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ false));
+
+  options_.write_dbid_to_manifest = true;  // exercises some read-only DB code
+  CloseAndReopenDB();
+
+  FillDB(db_.get(), 100, 200);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), /*flush*/ false));
+  db_.reset();  // CloseDB
+  DestroyDBWithoutCheck(dbname_, options_);
+  BackupInfo backup_info;
+  // First, check that we get empty fields without include_file_details
+  ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 1U, &backup_info,
+                                          /*with file details*/ false));
+  ASSERT_EQ(backup_info.name_for_open, "");
+  ASSERT_FALSE(backup_info.env_for_open);
+
+  // Now for the real test
+  backup_info = BackupInfo();
+  ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 1U, &backup_info,
+                                          /*with file details*/ true));
+
+  // Caution: DBOptions only holds a raw pointer to Env, so something else
+  // must keep it alive.
+  // Case 1: Keeping BackupEngine open suffices to keep Env alive
+  DB* db = nullptr;
+  Options opts = options_;
+  // Ensure some key defaults are set
+  opts.wal_dir = "";
+  opts.create_if_missing = false;
+  opts.info_log.reset();
+
+  opts.env = backup_info.env_for_open.get();
+  std::string name = backup_info.name_for_open;
+  backup_info = BackupInfo();
+  ASSERT_OK(DB::OpenForReadOnly(opts, name, &db));
+
+  AssertExists(db, 0, 100);
+  AssertEmpty(db, 100, 200);
+
+  delete db;
+  db = nullptr;
+
+  // Case 2: Keeping BackupInfo alive rather than BackupEngine also suffices
+  ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 2U, &backup_info,
+                                          /*with file details*/ true));
+  CloseBackupEngine();
+  opts.create_if_missing = true;  // check also OK (though pointless)
+  opts.env = backup_info.env_for_open.get();
+  name = backup_info.name_for_open;
+  // Note: keeping backup_info alive
+  ASSERT_OK(DB::OpenForReadOnly(opts, name, &db));
+
+  AssertExists(db, 0, 200);
+  delete db;
+  db = nullptr;
+
+  // Now try opening read-write and make sure it fails, for safety.
+  ASSERT_TRUE(DB::Open(opts, name, &db).IsIOError());
+}
+
+TEST_F(BackupEngineTest, ProgressCallbackDuringBackup) {
+  DestroyDBWithoutCheck(dbname_, options_);
+  // Too big for this small DB
+  engine_options_->callback_trigger_interval_size = 100000;
+  OpenDBAndBackupEngine(true);
+  FillDB(db_.get(), 0, 100);
+  bool is_callback_invoked = false;
+  ASSERT_OK(backup_engine_->CreateNewBackup(
+      db_.get(), true,
+      [&is_callback_invoked]() { is_callback_invoked = true; }));
+  ASSERT_FALSE(is_callback_invoked);
+  CloseBackupEngine();
+
+  // Easily small enough for this small DB
+  engine_options_->callback_trigger_interval_size = 1000;
+  OpenBackupEngine();
+  ASSERT_OK(backup_engine_->CreateNewBackup(
+      db_.get(), true,
+      [&is_callback_invoked]() { is_callback_invoked = true; }));
+  ASSERT_TRUE(is_callback_invoked);
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, GarbageCollectionBeforeBackup) {
+  DestroyDBWithoutCheck(dbname_, options_);
+  OpenDBAndBackupEngine(true);
+
+  ASSERT_OK(backup_chroot_env_->CreateDirIfMissing(backupdir_ + "/shared"));
+  std::string file_five = backupdir_ + "/shared/000009.sst";
+  std::string file_five_contents = "I'm not really a sst file";
+  // this depends on the fact that 00009.sst is the first file created by the DB
+  ASSERT_OK(file_manager_->WriteToFile(file_five, file_five_contents));
+
+  FillDB(db_.get(), 0, 100);
+  // backup overwrites file 000009.sst
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+
+  std::string new_file_five_contents;
+  ASSERT_OK(ReadFileToString(backup_chroot_env_.get(), file_five,
+                             &new_file_five_contents));
+  // file 000009.sst was overwritten
+  ASSERT_TRUE(new_file_five_contents != file_five_contents);
+
+  CloseDBAndBackupEngine();
+
+  AssertBackupConsistency(0, 0, 100);
+}
+
+// Test that we properly propagate Env failures
+TEST_F(BackupEngineTest, EnvFailures) {
+  BackupEngine* backup_engine;
+
+  // get children failure
+  {
+    test_backup_fs_->SetGetChildrenFailure(true);
+    ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+                                  &backup_engine));
+    test_backup_fs_->SetGetChildrenFailure(false);
+  }
+
+  // created dir failure
+  {
+    test_backup_fs_->SetCreateDirIfMissingFailure(true);
+    ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+                                  &backup_engine));
+    test_backup_fs_->SetCreateDirIfMissingFailure(false);
+  }
+
+  // new directory failure
+  {
+    test_backup_fs_->SetNewDirectoryFailure(true);
+    ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+                                  &backup_engine));
+    test_backup_fs_->SetNewDirectoryFailure(false);
+  }
+
+  // Read from meta-file failure
+  {
+    DestroyDBWithoutCheck(dbname_, options_);
+    OpenDBAndBackupEngine(true);
+    FillDB(db_.get(), 0, 100);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+    CloseDBAndBackupEngine();
+    test_backup_fs_->SetDummySequentialFile(true);
+    test_backup_fs_->SetDummySequentialFileFailReads(true);
+    engine_options_->destroy_old_data = false;
+    ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+                                  &backup_engine));
+    test_backup_fs_->SetDummySequentialFile(false);
+    test_backup_fs_->SetDummySequentialFileFailReads(false);
+  }
+
+  // no failure
+  {
+    ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *engine_options_,
+                                 &backup_engine));
+    delete backup_engine;
+  }
+}
+
+// Verify manifest can roll while a backup is being created with the old
+// manifest.
+TEST_F(BackupEngineTest, ChangeManifestDuringBackupCreation) {
+  DestroyDBWithoutCheck(dbname_, options_);
+  options_.max_manifest_file_size = 0;  // always rollover manifest for file add
+  OpenDBAndBackupEngine(true);
+  FillDB(db_.get(), 0, 100, kAutoFlushOnly);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
+       "VersionSet::LogAndApply:WriteManifest"},
+      {"VersionSet::LogAndApply:WriteManifestDone",
+       "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"},
+  });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread flush_thread{
+      [this]() { ASSERT_OK(db_->Flush(FlushOptions())); }};
+
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+
+  flush_thread.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // The last manifest roll would've already been cleaned up by the full scan
+  // that happens when CreateNewBackup invokes EnableFileDeletions. We need to
+  // trigger another roll to verify non-full scan purges stale manifests.
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_.get());
+  std::string prev_manifest_path =
+      DescriptorFileName(dbname_, db_impl->TEST_Current_Manifest_FileNo());
+  FillDB(db_.get(), 0, 100, kAutoFlushOnly);
+  ASSERT_OK(db_chroot_env_->FileExists(prev_manifest_path));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  // Even though manual flush completed above, the background thread may not
+  // have finished its cleanup work. `TEST_WaitForBackgroundWork()` will wait
+  // until all the background thread's work has completed, including cleanup.
+  ASSERT_OK(db_impl->TEST_WaitForBackgroundWork());
+  ASSERT_TRUE(db_chroot_env_->FileExists(prev_manifest_path).IsNotFound());
+
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, options_);
+  AssertBackupConsistency(0, 0, 100);
+}
+
+// see https://github.com/facebook/rocksdb/issues/921
+TEST_F(BackupEngineTest, Issue921Test) {
+  BackupEngine* backup_engine;
+  engine_options_->share_table_files = false;
+  ASSERT_OK(
+      backup_chroot_env_->CreateDirIfMissing(engine_options_->backup_dir));
+  engine_options_->backup_dir += "/new_dir";
+  ASSERT_OK(BackupEngine::Open(backup_chroot_env_.get(), *engine_options_,
+                               &backup_engine));
+
+  delete backup_engine;
+}
+
+TEST_F(BackupEngineTest, BackupWithMetadata) {
+  const int keys_iteration = 5000;
+  OpenDBAndBackupEngine(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    const std::string metadata = std::to_string(i);
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    // Here also test CreateNewBackupWithMetadata with CreateBackupOptions
+    // and outputting saved BackupID.
+    CreateBackupOptions opts;
+    opts.flush_before_backup = true;
+    BackupID new_id = 0;
+    ASSERT_OK(backup_engine_->CreateNewBackupWithMetadata(opts, db_.get(),
+                                                          metadata, &new_id));
+    ASSERT_EQ(new_id, static_cast<BackupID>(i + 1));
+  }
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  {  // Verify in bulk BackupInfo
+    std::vector<BackupInfo> backup_infos;
+    backup_engine_->GetBackupInfo(&backup_infos);
+    ASSERT_EQ(5, backup_infos.size());
+    for (int i = 0; i < 5; i++) {
+      ASSERT_EQ(std::to_string(i), backup_infos[i].app_metadata);
+    }
+  }
+  // Also verify in individual BackupInfo
+  for (int i = 0; i < 5; i++) {
+    BackupInfo backup_info;
+    ASSERT_OK(backup_engine_->GetBackupInfo(static_cast<BackupID>(i + 1),
+                                            &backup_info));
+    ASSERT_EQ(std::to_string(i), backup_info.app_metadata);
+  }
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, BinaryMetadata) {
+  OpenDBAndBackupEngine(true);
+  std::string binaryMetadata = "abc\ndef";
+  binaryMetadata.push_back('\0');
+  binaryMetadata.append("ghi");
+  ASSERT_OK(
+      backup_engine_->CreateNewBackupWithMetadata(db_.get(), binaryMetadata));
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  std::vector<BackupInfo> backup_infos;
+  backup_engine_->GetBackupInfo(&backup_infos);
+  ASSERT_EQ(1, backup_infos.size());
+  ASSERT_EQ(binaryMetadata, backup_infos[0].app_metadata);
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, MetadataTooLarge) {
+  OpenDBAndBackupEngine(true);
+  std::string largeMetadata(1024 * 1024 + 1, 0);
+  ASSERT_NOK(
+      backup_engine_->CreateNewBackupWithMetadata(db_.get(), largeMetadata));
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, MetaSchemaVersion2_SizeCorruption) {
+  engine_options_->schema_version = 1;
+  OpenDBAndBackupEngine(/*destroy_old_data*/ true);
+
+  // Backup 1: no future schema, no sizes, with checksums
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+  CloseDBAndBackupEngine();
+  engine_options_->schema_version = 2;
+  OpenDBAndBackupEngine(/*destroy_old_data*/ false);
+
+  // Backup 2: no checksums, no sizes
+  TEST_BackupMetaSchemaOptions test_opts;
+  test_opts.crc32c_checksums = false;
+  test_opts.file_sizes = false;
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+  // Backup 3: no checksums, with sizes
+  test_opts.file_sizes = true;
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+  // Backup 4: with checksums and sizes
+  test_opts.crc32c_checksums = true;
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+  CloseDBAndBackupEngine();
+
+  // Corrupt all the CURRENT files with the wrong size
+  const std::string private_dir = backupdir_ + "/private";
+
+  for (int id = 1; id <= 3; ++id) {
+    ASSERT_OK(file_manager_->WriteToFile(
+        private_dir + "/" + std::to_string(id) + "/CURRENT", "x"));
+  }
+  // Except corrupt Backup 4 with same size CURRENT file
+  {
+    uint64_t size = 0;
+    ASSERT_OK(test_backup_env_->GetFileSize(private_dir + "/4/CURRENT", &size));
+    ASSERT_OK(file_manager_->WriteToFile(private_dir + "/4/CURRENT",
+                                         std::string(size, 'x')));
+  }
+
+  OpenBackupEngine();
+
+  // Only the one with sizes in metadata will be immediately detected
+  // as corrupt
+  std::vector<BackupID> corrupted;
+  backup_engine_->GetCorruptedBackups(&corrupted);
+  ASSERT_EQ(corrupted.size(), 1);
+  ASSERT_EQ(corrupted[0], 3);
+
+  // Size corruption detected on Restore with checksum
+  ASSERT_TRUE(backup_engine_->RestoreDBFromBackup(1 /*id*/, dbname_, dbname_)
+                  .IsCorruption());
+
+  // Size corruption not detected without checksums nor sizes
+  ASSERT_OK(backup_engine_->RestoreDBFromBackup(2 /*id*/, dbname_, dbname_));
+
+  // Non-size corruption detected on Restore with checksum
+  ASSERT_TRUE(backup_engine_->RestoreDBFromBackup(4 /*id*/, dbname_, dbname_)
+                  .IsCorruption());
+
+  CloseBackupEngine();
+}
+
+TEST_F(BackupEngineTest, MetaSchemaVersion2_NotSupported) {
+  engine_options_->schema_version = 2;
+  TEST_BackupMetaSchemaOptions test_opts;
+  std::string app_metadata = "abc\ndef";
+
+  OpenDBAndBackupEngine(true);
+  // Start with supported
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(
+      backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+
+  // Because we are injecting badness with a TEST API, the badness is only
+  // detected on attempt to restore.
+  // Not supported versions
+  test_opts.version = "3";
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(
+      backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+  test_opts.version = "23.45.67";
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(
+      backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+  test_opts.version = "2";
+
+  // Non-ignorable fields
+  test_opts.meta_fields["ni::blah"] = "123";
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(
+      backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+  test_opts.meta_fields.clear();
+
+  test_opts.file_fields["ni::123"] = "xyz";
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(
+      backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+  test_opts.file_fields.clear();
+
+  test_opts.footer_fields["ni::123"] = "xyz";
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(
+      backup_engine_->CreateNewBackupWithMetadata(db_.get(), app_metadata));
+  test_opts.footer_fields.clear();
+  CloseDBAndBackupEngine();
+
+  OpenBackupEngine();
+  std::vector<BackupID> corrupted;
+  backup_engine_->GetCorruptedBackups(&corrupted);
+  ASSERT_EQ(corrupted.size(), 5);
+
+  ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+  CloseBackupEngine();
+}
+
+TEST_F(BackupEngineTest, MetaSchemaVersion2_Restore) {
+  engine_options_->schema_version = 2;
+  TEST_BackupMetaSchemaOptions test_opts;
+  const int keys_iteration = 5000;
+
+  OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+  FillDB(db_.get(), 0, keys_iteration);
+  // Start with minimum metadata to ensure it works without it being filled
+  // based on shared files also in other backups with the metadata.
+  test_opts.crc32c_checksums = false;
+  test_opts.file_sizes = false;
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  AssertBackupConsistency(1 /* id */, 0, keys_iteration, keys_iteration * 2);
+
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+                        kShareWithChecksum);
+  test_opts.file_sizes = true;
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  for (int id = 1; id <= 2; ++id) {
+    AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2);
+  }
+
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+                        kShareWithChecksum);
+  test_opts.crc32c_checksums = true;
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  for (int id = 1; id <= 3; ++id) {
+    AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2);
+  }
+
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+                        kShareWithChecksum);
+  // No TEST_EnableWriteFutureSchemaVersion2
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  for (int id = 1; id <= 4; ++id) {
+    AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2);
+  }
+
+  OpenDBAndBackupEngine(false /* destroy_old_data */, false,
+                        kShareWithChecksum);
+  // Minor version updates should be forward-compatible
+  test_opts.version = "2.5.70";
+  test_opts.meta_fields["asdf.3456"] = "-42";
+  test_opts.meta_fields["__QRST"] = " 1 $ %%& ";
+  test_opts.file_fields["z94._"] = "^\\";
+  test_opts.file_fields["_7yyyyyyyyy"] = "111111111111";
+  test_opts.footer_fields["Qwzn.tz89"] = "ASDF!!@# ##=\t ";
+  test_opts.footer_fields["yes"] = "no!";
+  TEST_SetBackupMetaSchemaOptions(backup_engine_.get(), test_opts);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+
+  for (int id = 1; id <= 5; ++id) {
+    AssertBackupConsistency(id, 0, keys_iteration, keys_iteration * 2);
+  }
+}
+
+TEST_F(BackupEngineTest, Concurrency) {
+  // Check that we can simultaneously:
+  // * Run several read operations in different threads on a single
+  // BackupEngine object, and
+  // * With another BackupEngine object on the same
+  // backup_dir, run the same read operations in another thread, and
+  // * With yet another BackupEngine object on the same
+  // backup_dir, create two new backups in parallel threads.
+  //
+  // Because of the challenges of integrating this into db_stress,
+  // this is a non-deterministic mini-stress test here instead.
+
+  // To check for a race condition in handling buffer size based on byte
+  // burst limit, we need a (generous) rate limiter
+  std::shared_ptr<RateLimiter> limiter{NewGenericRateLimiter(1000000000)};
+  engine_options_->backup_rate_limiter = limiter;
+  engine_options_->restore_rate_limiter = limiter;
+
+  OpenDBAndBackupEngine(true, false, kShareWithChecksum);
+
+  static constexpr int keys_iteration = 5000;
+  FillDB(db_.get(), 0, keys_iteration);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+  FillDB(db_.get(), keys_iteration, 2 * keys_iteration);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+  static constexpr int max_factor = 3;
+  FillDB(db_.get(), 2 * keys_iteration, max_factor * keys_iteration);
+  // will create another backup soon...
+
+  Options db_opts = options_;
+  db_opts.wal_dir = "";
+  db_opts.create_if_missing = false;
+  BackupEngineOptions be_opts = *engine_options_;
+  be_opts.destroy_old_data = false;
+
+  std::mt19937 rng{std::random_device()()};
+
+  std::array<std::thread, 4> read_threads;
+  std::array<std::thread, 4> restore_verify_threads;
+  for (uint32_t i = 0; i < read_threads.size(); ++i) {
+    uint32_t sleep_micros = rng() % 100000;
+    read_threads[i] = std::thread([this, i, sleep_micros, &db_opts, &be_opts,
+                                   &restore_verify_threads, &limiter] {
+      test_db_env_->SleepForMicroseconds(sleep_micros);
+
+      // Whether to also re-open the BackupEngine, potentially seeing
+      // additional backups
+      bool reopen = i == 3;
+      // Whether we are going to restore "latest"
+      bool latest = i > 1;
+
+      BackupEngine* my_be;
+      if (reopen) {
+        ASSERT_OK(BackupEngine::Open(test_db_env_.get(), be_opts, &my_be));
+      } else {
+        my_be = backup_engine_.get();
+      }
+
+      // Verify metadata (we don't receive updates from concurrently
+      // creating a new backup)
+      std::vector<BackupInfo> infos;
+      my_be->GetBackupInfo(&infos);
+      const uint32_t count = static_cast<uint32_t>(infos.size());
+      infos.clear();
+      if (reopen) {
+        ASSERT_GE(count, 2U);
+        ASSERT_LE(count, 4U);
+        fprintf(stderr, "Reopen saw %u backups\n", count);
+      } else {
+        ASSERT_EQ(count, 2U);
+      }
+      std::vector<BackupID> ids;
+      my_be->GetCorruptedBackups(&ids);
+      ASSERT_EQ(ids.size(), 0U);
+
+      // (Eventually, see below) Restore one of the backups, or "latest"
+      std::string restore_db_dir = dbname_ + "/restore" + std::to_string(i);
+      DestroyDir(test_db_env_.get(), restore_db_dir).PermitUncheckedError();
+      BackupID to_restore;
+      if (latest) {
+        to_restore = count;
+      } else {
+        to_restore = i + 1;
+      }
+
+      // Open restored DB to verify its contents, but test atomic restore
+      // by doing it async and ensuring we either get OK or InvalidArgument
+      restore_verify_threads[i] =
+          std::thread([this, &db_opts, restore_db_dir, to_restore] {
+            DB* restored;
+            Status s;
+            for (;;) {
+              s = DB::Open(db_opts, restore_db_dir, &restored);
+              if (s.IsInvalidArgument()) {
+                // Restore hasn't finished
+                test_db_env_->SleepForMicroseconds(1000);
+                continue;
+              } else {
+                // We should only get InvalidArgument if restore is
+                // incomplete, or OK if complete
+                ASSERT_OK(s);
+                break;
+              }
+            }
+            int factor = std::min(static_cast<int>(to_restore), max_factor);
+            AssertExists(restored, 0, factor * keys_iteration);
+            AssertEmpty(restored, factor * keys_iteration,
+                        (factor + 1) * keys_iteration);
+            delete restored;
+          });
+
+      // (Ok now) Restore one of the backups, or "latest"
+      if (latest) {
+        ASSERT_OK(
+            my_be->RestoreDBFromLatestBackup(restore_db_dir, restore_db_dir));
+      } else {
+        ASSERT_OK(my_be->VerifyBackup(to_restore, true));
+        ASSERT_OK(my_be->RestoreDBFromBackup(to_restore, restore_db_dir,
+                                             restore_db_dir));
+      }
+
+      // Test for race condition in reconfiguring limiter
+      // FIXME: this could set to a different value in all threads, except
+      // GenericRateLimiter::SetBytesPerSecond has a write-write race
+      // reported by TSAN
+      if (i == 0) {
+        limiter->SetBytesPerSecond(2000000000);
+      }
+
+      // Re-verify metadata (we don't receive updates from concurrently
+      // creating a new backup)
+      my_be->GetBackupInfo(&infos);
+      ASSERT_EQ(infos.size(), count);
+      my_be->GetCorruptedBackups(&ids);
+      ASSERT_EQ(ids.size(), 0);
+      // fprintf(stderr, "Finished read thread\n");
+
+      if (reopen) {
+        delete my_be;
+      }
+    });
+  }
+
+  BackupEngine* alt_be;
+  ASSERT_OK(BackupEngine::Open(test_db_env_.get(), be_opts, &alt_be));
+
+  std::array<std::thread, 2> append_threads;
+  for (unsigned i = 0; i < append_threads.size(); ++i) {
+    uint32_t sleep_micros = rng() % 100000;
+    append_threads[i] = std::thread([this, sleep_micros, alt_be] {
+      test_db_env_->SleepForMicroseconds(sleep_micros);
+      // WART: CreateNewBackup doesn't tell you the BackupID it just created,
+      // which is ugly for multithreaded setting.
+      // TODO: add delete backup also when that is added
+      ASSERT_OK(alt_be->CreateNewBackup(db_.get()));
+      // fprintf(stderr, "Finished append thread\n");
+    });
+  }
+
+  for (auto& t : append_threads) {
+    t.join();
+  }
+  // Verify metadata
+  std::vector<BackupInfo> infos;
+  alt_be->GetBackupInfo(&infos);
+  ASSERT_EQ(infos.size(), 2 + append_threads.size());
+
+  for (auto& t : read_threads) {
+    t.join();
+  }
+
+  delete alt_be;
+
+  for (auto& t : restore_verify_threads) {
+    t.join();
+  }
+
+  CloseDBAndBackupEngine();
+}
+
+TEST_F(BackupEngineTest, LimitBackupsOpened) {
+  // Verify the specified max backups are opened, including skipping over
+  // corrupted backups.
+  //
+  // Setup:
+  // - backups 1, 2, and 4 are valid
+  // - backup 3 is corrupt
+  // - max_valid_backups_to_open == 2
+  //
+  // Expectation: the engine opens backups 4 and 2 since those are latest two
+  // non-corrupt backups.
+  const int kNumKeys = 5000;
+  OpenDBAndBackupEngine(true);
+  for (int i = 1; i <= 4; ++i) {
+    FillDB(db_.get(), kNumKeys * i, kNumKeys * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+    if (i == 3) {
+      ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/3", 3));
+    }
+  }
+  CloseDBAndBackupEngine();
+
+  engine_options_->max_valid_backups_to_open = 2;
+  engine_options_->destroy_old_data = false;
+  BackupEngineReadOnly* read_only_backup_engine;
+  ASSERT_OK(BackupEngineReadOnly::Open(
+      backup_chroot_env_.get(), *engine_options_, &read_only_backup_engine));
+
+  std::vector<BackupInfo> backup_infos;
+  read_only_backup_engine->GetBackupInfo(&backup_infos);
+  ASSERT_EQ(2, backup_infos.size());
+  ASSERT_EQ(2, backup_infos[0].backup_id);
+  ASSERT_EQ(4, backup_infos[1].backup_id);
+  delete read_only_backup_engine;
+}
+
+TEST_F(BackupEngineTest, IgnoreLimitBackupsOpenedWhenNotReadOnly) {
+  // Verify the specified max_valid_backups_to_open is ignored if the engine
+  // is not read-only.
+  //
+  // Setup:
+  // - backups 1, 2, and 4 are valid
+  // - backup 3 is corrupt
+  // - max_valid_backups_to_open == 2
+  //
+  // Expectation: the engine opens backups 4, 2, and 1 since those are latest
+  // non-corrupt backups, by ignoring max_valid_backups_to_open == 2.
+  const int kNumKeys = 5000;
+  OpenDBAndBackupEngine(true);
+  for (int i = 1; i <= 4; ++i) {
+    FillDB(db_.get(), kNumKeys * i, kNumKeys * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+    if (i == 3) {
+      ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/3", 3));
+    }
+  }
+  CloseDBAndBackupEngine();
+
+  engine_options_->max_valid_backups_to_open = 2;
+  OpenDBAndBackupEngine();
+  std::vector<BackupInfo> backup_infos;
+  backup_engine_->GetBackupInfo(&backup_infos);
+  ASSERT_EQ(3, backup_infos.size());
+  ASSERT_EQ(1, backup_infos[0].backup_id);
+  ASSERT_EQ(2, backup_infos[1].backup_id);
+  ASSERT_EQ(4, backup_infos[2].backup_id);
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, options_);
+}
+
+TEST_F(BackupEngineTest, CreateWhenLatestBackupCorrupted) {
+  // we should pick an ID greater than corrupted backups' IDs so creation can
+  // succeed even when latest backup is corrupted.
+  const int kNumKeys = 5000;
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  BackupInfo backup_info;
+  ASSERT_TRUE(backup_engine_->GetLatestBackupInfo(&backup_info).IsNotFound());
+  FillDB(db_.get(), 0 /* from */, kNumKeys);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            true /* flush_before_backup */));
+  ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/1",
+                                       3 /* bytes_to_corrupt */));
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  ASSERT_TRUE(backup_engine_->GetLatestBackupInfo(&backup_info).IsNotFound());
+
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            true /* flush_before_backup */));
+
+  ASSERT_TRUE(backup_engine_->GetLatestBackupInfo(&backup_info).ok());
+  ASSERT_EQ(2, backup_info.backup_id);
+
+  std::vector<BackupInfo> backup_infos;
+  backup_engine_->GetBackupInfo(&backup_infos);
+  ASSERT_EQ(1, backup_infos.size());
+  ASSERT_EQ(2, backup_infos[0].backup_id);
+
+  // Verify individual GetBackupInfo by ID
+  ASSERT_TRUE(backup_engine_->GetBackupInfo(0U, &backup_info).IsNotFound());
+  ASSERT_TRUE(backup_engine_->GetBackupInfo(1U, &backup_info).IsCorruption());
+  ASSERT_TRUE(backup_engine_->GetBackupInfo(2U, &backup_info).ok());
+  ASSERT_TRUE(backup_engine_->GetBackupInfo(3U, &backup_info).IsNotFound());
+  ASSERT_TRUE(
+      backup_engine_->GetBackupInfo(999999U, &backup_info).IsNotFound());
+}
+
+TEST_F(BackupEngineTest, WriteOnlyEngineNoSharedFileDeletion) {
+  // Verifies a write-only BackupEngine does not delete files belonging to valid
+  // backups when GarbageCollect, PurgeOldBackups, or DeleteBackup are called.
+  const int kNumKeys = 5000;
+  for (int i = 0; i < 3; ++i) {
+    OpenDBAndBackupEngine(i == 0 /* destroy_old_data */);
+    FillDB(db_.get(), i * kNumKeys, (i + 1) * kNumKeys);
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+    CloseDBAndBackupEngine();
+
+    engine_options_->max_valid_backups_to_open = 0;
+    OpenDBAndBackupEngine();
+    switch (i) {
+      case 0:
+        ASSERT_OK(backup_engine_->GarbageCollect());
+        break;
+      case 1:
+        ASSERT_OK(backup_engine_->PurgeOldBackups(1 /* num_backups_to_keep */));
+        break;
+      case 2:
+        ASSERT_OK(backup_engine_->DeleteBackup(2 /* backup_id */));
+        break;
+      default:
+        assert(false);
+    }
+    CloseDBAndBackupEngine();
+
+    engine_options_->max_valid_backups_to_open =
+        std::numeric_limits<int32_t>::max();
+    AssertBackupConsistency(i + 1, 0, (i + 1) * kNumKeys);
+  }
+}
+
+TEST_P(BackupEngineTestWithParam, BackupUsingDirectIO) {
+  // Tests direct I/O on the backup engine's reads and writes on the DB env and
+  // backup env
+  // We use ChrootEnv underneath so the below line checks for direct I/O support
+  // in the chroot directory, not the true filesystem root.
+  if (!test::IsDirectIOSupported(test_db_env_.get(), "/")) {
+    ROCKSDB_GTEST_SKIP("Test requires Direct I/O Support");
+    return;
+  }
+  const int kNumKeysPerBackup = 100;
+  const int kNumBackups = 3;
+  options_.use_direct_reads = true;
+  OpenDBAndBackupEngine(true /* destroy_old_data */);
+  for (int i = 0; i < kNumBackups; ++i) {
+    FillDB(db_.get(), i * kNumKeysPerBackup /* from */,
+           (i + 1) * kNumKeysPerBackup /* to */, kFlushAll);
+
+    // Clear the file open counters and then do a bunch of backup engine ops.
+    // For all ops, files should be opened in direct mode.
+    test_backup_fs_->ClearFileOpenCounters();
+    test_db_fs_->ClearFileOpenCounters();
+    CloseBackupEngine();
+    OpenBackupEngine();
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                              false /* flush_before_backup */));
+    ASSERT_OK(backup_engine_->VerifyBackup(i + 1));
+    CloseBackupEngine();
+    OpenBackupEngine();
+    std::vector<BackupInfo> backup_infos;
+    backup_engine_->GetBackupInfo(&backup_infos);
+    ASSERT_EQ(static_cast<size_t>(i + 1), backup_infos.size());
+
+    // Verify backup engine always opened files with direct I/O
+    ASSERT_EQ(0, test_db_fs_->num_writers());
+    ASSERT_GE(test_db_fs_->num_direct_rand_readers(), 0);
+    ASSERT_GT(test_db_fs_->num_direct_seq_readers(), 0);
+    // Currently the DB doesn't support reading WALs or manifest with direct
+    // I/O, so subtract two.
+    ASSERT_EQ(test_db_fs_->num_seq_readers() - 2,
+              test_db_fs_->num_direct_seq_readers());
+    ASSERT_EQ(test_db_fs_->num_rand_readers(),
+              test_db_fs_->num_direct_rand_readers());
+  }
+  CloseDBAndBackupEngine();
+
+  for (int i = 0; i < kNumBackups; ++i) {
+    AssertBackupConsistency(i + 1 /* backup_id */,
+                            i * kNumKeysPerBackup /* start_exist */,
+                            (i + 1) * kNumKeysPerBackup /* end_exist */,
+                            (i + 2) * kNumKeysPerBackup /* end */);
+  }
+}
+
+TEST_F(BackupEngineTest, BackgroundThreadCpuPriority) {
+  std::atomic<CpuPriority> priority(CpuPriority::kNormal);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BackupEngineImpl::Initialize:SetCpuPriority", [&](void* new_priority) {
+        priority.store(*reinterpret_cast<CpuPriority*>(new_priority));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // 1 thread is easier to test, otherwise, we may not be sure which thread
+  // actually does the work during CreateNewBackup.
+  engine_options_->max_background_operations = 1;
+  OpenDBAndBackupEngine(true);
+
+  {
+    FillDB(db_.get(), 0, 100);
+
+    // by default, cpu priority is not changed.
+    CreateBackupOptions options;
+    ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get()));
+
+    ASSERT_EQ(priority, CpuPriority::kNormal);
+  }
+
+  {
+    FillDB(db_.get(), 101, 200);
+
+    // decrease cpu priority from normal to low.
+    CreateBackupOptions options;
+    options.decrease_background_thread_cpu_priority = true;
+    options.background_thread_cpu_priority = CpuPriority::kLow;
+    ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get()));
+
+    ASSERT_EQ(priority, CpuPriority::kLow);
+  }
+
+  {
+    FillDB(db_.get(), 201, 300);
+
+    // try to upgrade cpu priority back to normal,
+    // the priority should still low.
+    CreateBackupOptions options;
+    options.decrease_background_thread_cpu_priority = true;
+    options.background_thread_cpu_priority = CpuPriority::kNormal;
+    ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get()));
+
+    ASSERT_EQ(priority, CpuPriority::kLow);
+  }
+
+  {
+    FillDB(db_.get(), 301, 400);
+
+    // decrease cpu priority from low to idle.
+    CreateBackupOptions options;
+    options.decrease_background_thread_cpu_priority = true;
+    options.background_thread_cpu_priority = CpuPriority::kIdle;
+    ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get()));
+
+    ASSERT_EQ(priority, CpuPriority::kIdle);
+  }
+
+  {
+    FillDB(db_.get(), 301, 400);
+
+    // reset priority to later verify that it's not updated by SetCpuPriority.
+    priority = CpuPriority::kNormal;
+
+    // setting the same cpu priority won't call SetCpuPriority.
+    CreateBackupOptions options;
+    options.decrease_background_thread_cpu_priority = true;
+    options.background_thread_cpu_priority = CpuPriority::kIdle;
+
+    // Also check output backup_id with CreateNewBackup
+    BackupID new_id = 0;
+    ASSERT_OK(backup_engine_->CreateNewBackup(options, db_.get(), &new_id));
+    ASSERT_EQ(new_id, 5U);
+
+    ASSERT_EQ(priority, CpuPriority::kNormal);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  CloseDBAndBackupEngine();
+  DestroyDBWithoutCheck(dbname_, options_);
+}
+
+// Populates `*total_size` with the size of all files under `backup_dir`.
+// We don't go through `BackupEngine` currently because it's hard to figure out
+// the metadata file size.
+Status GetSizeOfBackupFiles(FileSystem* backup_fs,
+                            const std::string& backup_dir, size_t* total_size) {
+  *total_size = 0;
+  std::vector<std::string> dir_stack = {backup_dir};
+  Status s;
+  while (s.ok() && !dir_stack.empty()) {
+    std::string dir = std::move(dir_stack.back());
+    dir_stack.pop_back();
+    std::vector<std::string> children;
+    s = backup_fs->GetChildren(dir, IOOptions(), &children, nullptr /* dbg */);
+    for (size_t i = 0; s.ok() && i < children.size(); ++i) {
+      std::string path = dir + "/" + children[i];
+      bool is_dir;
+      s = backup_fs->IsDirectory(path, IOOptions(), &is_dir, nullptr /* dbg */);
+      uint64_t file_size = 0;
+      if (s.ok()) {
+        if (is_dir) {
+          dir_stack.emplace_back(std::move(path));
+        } else {
+          s = backup_fs->GetFileSize(path, IOOptions(), &file_size,
+                                     nullptr /* dbg */);
+        }
+      }
+      if (s.ok()) {
+        *total_size += file_size;
+      }
+    }
+  }
+  return s;
+}
+
+TEST_F(BackupEngineTest, IOStats) {
+  // Tests the `BACKUP_READ_BYTES` and `BACKUP_WRITE_BYTES` ticker stats have
+  // the expected values according to the files in the backups.
+
+  // These ticker stats are expected to be populated regardless of `PerfLevel`
+  // in user thread
+  SetPerfLevel(kDisable);
+
+  options_.statistics = CreateDBStatistics();
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kShareWithChecksum);
+
+  FillDB(db_.get(), 0 /* from */, 100 /* to */, kFlushMost);
+
+  ASSERT_EQ(0, options_.statistics->getTickerCount(BACKUP_READ_BYTES));
+  ASSERT_EQ(0, options_.statistics->getTickerCount(BACKUP_WRITE_BYTES));
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            false /* flush_before_backup */));
+
+  size_t orig_backup_files_size;
+  ASSERT_OK(GetSizeOfBackupFiles(test_backup_env_->GetFileSystem().get(),
+                                 backupdir_, &orig_backup_files_size));
+  size_t expected_bytes_written = orig_backup_files_size;
+  ASSERT_EQ(expected_bytes_written,
+            options_.statistics->getTickerCount(BACKUP_WRITE_BYTES));
+  // Bytes read is more difficult to pin down since there are reads for many
+  // purposes other than creating file, like `GetSortedWalFiles()` to find first
+  // sequence number, or `CreateNewBackup()` thread to find SST file session ID.
+  // So we loosely require there are at least as many reads as needed for
+  // copying, but not as many as twice that.
+  ASSERT_GE(options_.statistics->getTickerCount(BACKUP_READ_BYTES),
+            expected_bytes_written);
+  ASSERT_LT(expected_bytes_written,
+            2 * options_.statistics->getTickerCount(BACKUP_READ_BYTES));
+
+  FillDB(db_.get(), 100 /* from */, 200 /* to */, kFlushMost);
+
+  ASSERT_OK(options_.statistics->Reset());
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(),
+                                            false /* flush_before_backup */));
+  size_t final_backup_files_size;
+  ASSERT_OK(GetSizeOfBackupFiles(test_backup_env_->GetFileSystem().get(),
+                                 backupdir_, &final_backup_files_size));
+  expected_bytes_written = final_backup_files_size - orig_backup_files_size;
+  ASSERT_EQ(expected_bytes_written,
+            options_.statistics->getTickerCount(BACKUP_WRITE_BYTES));
+  // See above for why these bounds were chosen.
+  ASSERT_GE(options_.statistics->getTickerCount(BACKUP_READ_BYTES),
+            expected_bytes_written);
+  ASSERT_LT(expected_bytes_written,
+            2 * options_.statistics->getTickerCount(BACKUP_READ_BYTES));
+}
+
+TEST_F(BackupEngineTest, FileTemperatures) {
+  CloseDBAndBackupEngine();
+
+  // Required for recording+restoring temperatures
+  engine_options_->schema_version = 2;
+
+  // More file IO instrumentation
+  auto my_db_fs = std::make_shared<FileTemperatureTestFS>(db_chroot_fs_);
+  test_db_fs_ = std::make_shared<TestFs>(my_db_fs);
+  SetEnvsFromFileSystems();
+
+  // Use temperatures
+  options_.bottommost_temperature = Temperature::kWarm;
+  options_.level0_file_num_compaction_trigger = 2;
+  // set dynamic_level to true so the compaction would compact the data to the
+  // last level directly which will have the last_level_temperature
+  options_.level_compaction_dynamic_level_bytes = true;
+
+  OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
+                        kShareWithChecksum);
+
+  // generate a bottommost file (combined from 2) and a non-bottommost file
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db_.get());
+  ASSERT_OK(db_->Put(WriteOptions(), "a", "val"));
+  ASSERT_OK(db_->Put(WriteOptions(), "c", "val"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+  ASSERT_OK(db_->Put(WriteOptions(), "d", "val"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(dbi->TEST_WaitForCompact());
+  ASSERT_OK(db_->Put(WriteOptions(), "e", "val"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // Get temperatures from manifest
+  std::map<uint64_t, Temperature> manifest_temps;
+  std::map<Temperature, int> manifest_temp_counts;
+  {
+    std::vector<LiveFileStorageInfo> infos;
+    ASSERT_OK(
+        db_->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos));
+    for (auto info : infos) {
+      if (info.file_type == kTableFile) {
+        manifest_temps.emplace(info.file_number, info.temperature);
+        manifest_temp_counts[info.temperature]++;
+      }
+    }
+  }
+
+  // Verify expected manifest temperatures
+  ASSERT_EQ(manifest_temp_counts.size(), 2);
+  ASSERT_EQ(manifest_temp_counts[Temperature::kWarm], 1);
+  ASSERT_EQ(manifest_temp_counts[Temperature::kUnknown], 1);
+
+  // Verify manifest temperatures match FS temperatures
+  std::map<uint64_t, Temperature> current_temps;
+  my_db_fs->CopyCurrentSstFileTemperatures(&current_temps);
+  for (const auto& manifest_temp : manifest_temps) {
+    ASSERT_EQ(current_temps[manifest_temp.first], manifest_temp.second);
+  }
+
+  // Try a few different things
+  for (int i = 1; i <= 5; ++i) {
+    // Expected temperatures after restore are based on manifest temperatures
+    std::map<uint64_t, Temperature> expected_temps = manifest_temps;
+
+    if (i >= 2) {
+      // For iterations 2 & 3, override current temperature of one file
+      // and vary which temperature is authoritative (current or manifest).
+      // For iterations 4 & 5, override current temperature of both files
+      // but make sure an current temperate always takes precedence over
+      // unknown regardless of current_temperatures_override_manifest setting.
+      bool use_current = ((i % 2) == 1);
+      engine_options_->current_temperatures_override_manifest = use_current;
+      CloseBackupEngine();
+      OpenBackupEngine();
+      for (const auto& manifest_temp : manifest_temps) {
+        if (i <= 3) {
+          if (manifest_temp.second == Temperature::kWarm) {
+            my_db_fs->OverrideSstFileTemperature(manifest_temp.first,
+                                                 Temperature::kCold);
+            if (use_current) {
+              expected_temps[manifest_temp.first] = Temperature::kCold;
+            }
+          }
+        } else {
+          assert(i <= 5);
+          if (manifest_temp.second == Temperature::kWarm) {
+            my_db_fs->OverrideSstFileTemperature(manifest_temp.first,
+                                                 Temperature::kUnknown);
+          } else {
+            ASSERT_EQ(manifest_temp.second, Temperature::kUnknown);
+            my_db_fs->OverrideSstFileTemperature(manifest_temp.first,
+                                                 Temperature::kHot);
+            // regardless of use_current
+            expected_temps[manifest_temp.first] = Temperature::kHot;
+          }
+        }
+      }
+    }
+
+    // Sample requested temperatures in opening files for backup
+    my_db_fs->PopRequestedSstFileTemperatures();
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
+
+    // Verify requested temperatures against manifest temperatures (before
+    // retry with kUnknown if needed, and before backup finds out current
+    // temperatures in FileSystem)
+    std::vector<std::pair<uint64_t, Temperature>> requested_temps;
+    my_db_fs->PopRequestedSstFileTemperatures(&requested_temps);
+    std::set<uint64_t> distinct_requests;
+    for (const auto& requested_temp : requested_temps) {
+      // Matching manifest temperatures, except allow retry request with
+      // kUnknown
+      auto manifest_temp = manifest_temps.at(requested_temp.first);
+      if (manifest_temp == Temperature::kUnknown ||
+          requested_temp.second != Temperature::kUnknown) {
+        ASSERT_EQ(manifest_temp, requested_temp.second);
+      }
+      distinct_requests.insert(requested_temp.first);
+    }
+    // Two distinct requests
+    ASSERT_EQ(distinct_requests.size(), 2);
+
+    // Verify against backup info file details API
+    BackupInfo info;
+    ASSERT_OK(backup_engine_->GetLatestBackupInfo(
+        &info, /*include_file_details*/ true));
+    ASSERT_GT(info.file_details.size(), 2);
+    for (auto& e : info.file_details) {
+      ASSERT_EQ(expected_temps[e.file_number], e.temperature);
+    }
+
+    // Restore backup to another virtual (tiered) dir
+    const std::string restore_dir = "/restore" + std::to_string(i);
+    ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(
+        RestoreOptions(), restore_dir, restore_dir));
+
+    // Verify restored FS temperatures match expectation
+    // (FileTemperatureTestFS doesn't distinguish directories when reporting
+    // current temperatures, just whatever SST was written or overridden last
+    // with that file number.)
+    my_db_fs->CopyCurrentSstFileTemperatures(&current_temps);
+    for (const auto& expected_temp : expected_temps) {
+      ASSERT_EQ(current_temps[expected_temp.first], expected_temp.second);
+    }
+
+    // Delete backup to force next backup to copy files
+    ASSERT_OK(backup_engine_->PurgeOldBackups(0));
+  }
+}
+
+}  // namespace
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as BackupEngine is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
diff --git a/src/rocksdb/utilities/blob_db/blob_compaction_filter.cc b/src/rocksdb/utilities/blob_db/blob_compaction_filter.cc
new file mode 100644
index 000000000..86907e979
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_compaction_filter.cc
@@ -0,0 +1,490 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_compaction_filter.h"
+
+#include <cinttypes>
+
+#include "db/dbformat.h"
+#include "logging/logging.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+BlobIndexCompactionFilterBase::~BlobIndexCompactionFilterBase() {
+  if (blob_file_) {
+    CloseAndRegisterNewBlobFile();
+  }
+  RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, expired_count_);
+  RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, expired_size_);
+  RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_COUNT, evicted_count_);
+  RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EVICTED_SIZE, evicted_size_);
+}
+
+CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
+    int level, const Slice& key, ValueType value_type, const Slice& value,
+    std::string* new_value, std::string* skip_until) const {
+  const CompactionFilter* ucf = user_comp_filter();
+  if (value_type != kBlobIndex) {
+    if (ucf == nullptr) {
+      return Decision::kKeep;
+    }
+    // Apply user compaction filter for inlined data.
+    CompactionFilter::Decision decision =
+        ucf->FilterV2(level, key, value_type, value, new_value, skip_until);
+    if (decision == Decision::kChangeValue) {
+      return HandleValueChange(key, new_value);
+    }
+    return decision;
+  }
+  BlobIndex blob_index;
+  Status s = blob_index.DecodeFrom(value);
+  if (!s.ok()) {
+    // Unable to decode blob index. Keeping the value.
+    return Decision::kKeep;
+  }
+  if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) {
+    // Expired
+    expired_count_++;
+    expired_size_ += key.size() + value.size();
+    return Decision::kRemove;
+  }
+  if (!blob_index.IsInlined() &&
+      blob_index.file_number() < context_.next_file_number &&
+      context_.current_blob_files.count(blob_index.file_number()) == 0) {
+    // Corresponding blob file gone (most likely, evicted by FIFO eviction).
+    evicted_count_++;
+    evicted_size_ += key.size() + value.size();
+    return Decision::kRemove;
+  }
+  if (context_.fifo_eviction_seq > 0 && blob_index.HasTTL() &&
+      blob_index.expiration() < context_.evict_expiration_up_to) {
+    // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
+    // get sequence number.
+    ParsedInternalKey ikey;
+    if (!ParseInternalKey(
+             key, &ikey,
+             context_.blob_db_impl->db_options_.allow_data_in_errors)
+             .ok()) {
+      assert(false);
+      return Decision::kKeep;
+    }
+    // Remove keys that could have been remove by last FIFO eviction.
+    // If get error while parsing key, ignore and continue.
+    if (ikey.sequence < context_.fifo_eviction_seq) {
+      evicted_count_++;
+      evicted_size_ += key.size() + value.size();
+      return Decision::kRemove;
+    }
+  }
+  // Apply user compaction filter for all non-TTL blob data.
+  if (ucf != nullptr && !blob_index.HasTTL()) {
+    // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
+    // get sequence number.
+    ParsedInternalKey ikey;
+    if (!ParseInternalKey(
+             key, &ikey,
+             context_.blob_db_impl->db_options_.allow_data_in_errors)
+             .ok()) {
+      assert(false);
+      return Decision::kKeep;
+    }
+    // Read value from blob file.
+    PinnableSlice blob;
+    CompressionType compression_type = kNoCompression;
+    constexpr bool need_decompress = true;
+    if (!ReadBlobFromOldFile(ikey.user_key, blob_index, &blob, need_decompress,
+                             &compression_type)) {
+      return Decision::kIOError;
+    }
+    CompactionFilter::Decision decision = ucf->FilterV2(
+        level, ikey.user_key, kValue, blob, new_value, skip_until);
+    if (decision == Decision::kChangeValue) {
+      return HandleValueChange(ikey.user_key, new_value);
+    }
+    return decision;
+  }
+  return Decision::kKeep;
+}
+
+CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
+    const Slice& key, std::string* new_value) const {
+  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+
+  if (new_value->size() < blob_db_impl->bdb_options_.min_blob_size) {
+    // Keep new_value inlined.
+    return Decision::kChangeValue;
+  }
+  if (!OpenNewBlobFileIfNeeded()) {
+    return Decision::kIOError;
+  }
+  Slice new_blob_value(*new_value);
+  std::string compression_output;
+  if (blob_db_impl->bdb_options_.compression != kNoCompression) {
+    new_blob_value =
+        blob_db_impl->GetCompressedSlice(new_blob_value, &compression_output);
+  }
+  uint64_t new_blob_file_number = 0;
+  uint64_t new_blob_offset = 0;
+  if (!WriteBlobToNewFile(key, new_blob_value, &new_blob_file_number,
+                          &new_blob_offset)) {
+    return Decision::kIOError;
+  }
+  if (!CloseAndRegisterNewBlobFileIfNeeded()) {
+    return Decision::kIOError;
+  }
+  BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
+                        new_blob_value.size(),
+                        blob_db_impl->bdb_options_.compression);
+  return Decision::kChangeBlobIndex;
+}
+
+BlobIndexCompactionFilterGC::~BlobIndexCompactionFilterGC() {
+  assert(context().blob_db_impl);
+
+  ROCKS_LOG_INFO(context().blob_db_impl->db_options_.info_log,
+                 "GC pass finished %s: encountered %" PRIu64 " blobs (%" PRIu64
+                 " bytes), relocated %" PRIu64 " blobs (%" PRIu64
+                 " bytes), created %" PRIu64 " new blob file(s)",
+                 !gc_stats_.HasError() ? "successfully" : "with failure",
+                 gc_stats_.AllBlobs(), gc_stats_.AllBytes(),
+                 gc_stats_.RelocatedBlobs(), gc_stats_.RelocatedBytes(),
+                 gc_stats_.NewFiles());
+
+  RecordTick(statistics(), BLOB_DB_GC_NUM_KEYS_RELOCATED,
+             gc_stats_.RelocatedBlobs());
+  RecordTick(statistics(), BLOB_DB_GC_BYTES_RELOCATED,
+             gc_stats_.RelocatedBytes());
+  RecordTick(statistics(), BLOB_DB_GC_NUM_NEW_FILES, gc_stats_.NewFiles());
+  RecordTick(statistics(), BLOB_DB_GC_FAILURES, gc_stats_.HasError());
+}
+
+bool BlobIndexCompactionFilterBase::IsBlobFileOpened() const {
+  if (blob_file_) {
+    assert(writer_);
+    return true;
+  }
+  return false;
+}
+
+bool BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded() const {
+  if (IsBlobFileOpened()) {
+    return true;
+  }
+
+  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+
+  const Status s = blob_db_impl->CreateBlobFileAndWriter(
+      /* has_ttl */ false, ExpirationRange(), "compaction/GC", &blob_file_,
+      &writer_);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        blob_db_impl->db_options_.info_log,
+        "Error opening new blob file during compaction/GC, status: %s",
+        s.ToString().c_str());
+    blob_file_.reset();
+    writer_.reset();
+    return false;
+  }
+
+  assert(blob_file_);
+  assert(writer_);
+
+  return true;
+}
+
+bool BlobIndexCompactionFilterBase::ReadBlobFromOldFile(
+    const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob,
+    bool need_decompress, CompressionType* compression_type) const {
+  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+
+  Status s = blob_db_impl->GetRawBlobFromFile(
+      key, blob_index.file_number(), blob_index.offset(), blob_index.size(),
+      blob, compression_type);
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        blob_db_impl->db_options_.info_log,
+        "Error reading blob during compaction/GC, key: %s (%s), status: %s",
+        key.ToString(/* output_hex */ true).c_str(),
+        blob_index.DebugString(/* output_hex */ true).c_str(),
+        s.ToString().c_str());
+
+    return false;
+  }
+
+  if (need_decompress && *compression_type != kNoCompression) {
+    s = blob_db_impl->DecompressSlice(*blob, *compression_type, blob);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(
+          blob_db_impl->db_options_.info_log,
+          "Uncompression error during blob read from file: %" PRIu64
+          " blob_offset: %" PRIu64 " blob_size: %" PRIu64
+          " key: %s status: '%s'",
+          blob_index.file_number(), blob_index.offset(), blob_index.size(),
+          key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
+
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool BlobIndexCompactionFilterBase::WriteBlobToNewFile(
+    const Slice& key, const Slice& blob, uint64_t* new_blob_file_number,
+    uint64_t* new_blob_offset) const {
+  TEST_SYNC_POINT("BlobIndexCompactionFilterBase::WriteBlobToNewFile");
+  assert(new_blob_file_number);
+  assert(new_blob_offset);
+
+  assert(blob_file_);
+  *new_blob_file_number = blob_file_->BlobFileNumber();
+
+  assert(writer_);
+  uint64_t new_key_offset = 0;
+  const Status s = writer_->AddRecord(key, blob, kNoExpiration, &new_key_offset,
+                                      new_blob_offset);
+
+  if (!s.ok()) {
+    const BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+    assert(blob_db_impl);
+
+    ROCKS_LOG_ERROR(blob_db_impl->db_options_.info_log,
+                    "Error writing blob to new file %s during compaction/GC, "
+                    "key: %s, status: %s",
+                    blob_file_->PathName().c_str(),
+                    key.ToString(/* output_hex */ true).c_str(),
+                    s.ToString().c_str());
+    return false;
+  }
+
+  const uint64_t new_size =
+      BlobLogRecord::kHeaderSize + key.size() + blob.size();
+  blob_file_->BlobRecordAdded(new_size);
+
+  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+
+  blob_db_impl->total_blob_size_ += new_size;
+
+  return true;
+}
+
+bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFileIfNeeded()
+    const {
+  const BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+
+  assert(blob_file_);
+  if (blob_file_->GetFileSize() < blob_db_impl->bdb_options_.blob_file_size) {
+    return true;
+  }
+
+  return CloseAndRegisterNewBlobFile();
+}
+
+bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFile() const {
+  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
+  assert(blob_db_impl);
+  assert(blob_file_);
+
+  Status s;
+
+  {
+    WriteLock wl(&blob_db_impl->mutex_);
+
+    s = blob_db_impl->CloseBlobFile(blob_file_);
+
+    // Note: we delay registering the new blob file until it's closed to
+    // prevent FIFO eviction from processing it during compaction/GC.
+    blob_db_impl->RegisterBlobFile(blob_file_);
+  }
+
+  assert(blob_file_->Immutable());
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        blob_db_impl->db_options_.info_log,
+        "Error closing new blob file %s during compaction/GC, status: %s",
+        blob_file_->PathName().c_str(), s.ToString().c_str());
+  }
+
+  blob_file_.reset();
+  return s.ok();
+}
+
+CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
+    const Slice& key, const Slice& existing_value,
+    std::string* new_value) const {
+  assert(new_value);
+
+  const BlobDBImpl* const blob_db_impl = context().blob_db_impl;
+  (void)blob_db_impl;
+
+  assert(blob_db_impl);
+  assert(blob_db_impl->bdb_options_.enable_garbage_collection);
+
+  BlobIndex blob_index;
+  const Status s = blob_index.DecodeFrom(existing_value);
+  if (!s.ok()) {
+    gc_stats_.SetError();
+    return BlobDecision::kCorruption;
+  }
+
+  if (blob_index.IsInlined()) {
+    gc_stats_.AddBlob(blob_index.value().size());
+
+    return BlobDecision::kKeep;
+  }
+
+  gc_stats_.AddBlob(blob_index.size());
+
+  if (blob_index.HasTTL()) {
+    return BlobDecision::kKeep;
+  }
+
+  if (blob_index.file_number() >= context_gc_.cutoff_file_number) {
+    return BlobDecision::kKeep;
+  }
+
+  // Note: each compaction generates its own blob files, which, depending on the
+  // workload, might result in many small blob files. The total number of files
+  // is bounded though (determined by the number of compactions and the blob
+  // file size option).
+  if (!OpenNewBlobFileIfNeeded()) {
+    gc_stats_.SetError();
+    return BlobDecision::kIOError;
+  }
+
+  PinnableSlice blob;
+  CompressionType compression_type = kNoCompression;
+  std::string compression_output;
+  if (!ReadBlobFromOldFile(key, blob_index, &blob, false, &compression_type)) {
+    gc_stats_.SetError();
+    return BlobDecision::kIOError;
+  }
+
+  // If the compression_type is changed, re-compress it with the new compression
+  // type.
+  if (compression_type != blob_db_impl->bdb_options_.compression) {
+    if (compression_type != kNoCompression) {
+      const Status status =
+          blob_db_impl->DecompressSlice(blob, compression_type, &blob);
+      if (!status.ok()) {
+        gc_stats_.SetError();
+        return BlobDecision::kCorruption;
+      }
+    }
+    if (blob_db_impl->bdb_options_.compression != kNoCompression) {
+      blob_db_impl->GetCompressedSlice(blob, &compression_output);
+      blob = PinnableSlice(&compression_output);
+      blob.PinSelf();
+    }
+  }
+
+  uint64_t new_blob_file_number = 0;
+  uint64_t new_blob_offset = 0;
+  if (!WriteBlobToNewFile(key, blob, &new_blob_file_number, &new_blob_offset)) {
+    gc_stats_.SetError();
+    return BlobDecision::kIOError;
+  }
+
+  if (!CloseAndRegisterNewBlobFileIfNeeded()) {
+    gc_stats_.SetError();
+    return BlobDecision::kIOError;
+  }
+
+  BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
+                        blob.size(), compression_type);
+
+  gc_stats_.AddRelocatedBlob(blob_index.size());
+
+  return BlobDecision::kChangeValue;
+}
+
+bool BlobIndexCompactionFilterGC::OpenNewBlobFileIfNeeded() const {
+  if (IsBlobFileOpened()) {
+    return true;
+  }
+  bool result = BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded();
+  if (result) {
+    gc_stats_.AddNewFile();
+  }
+  return result;
+}
+
+std::unique_ptr<CompactionFilter>
+BlobIndexCompactionFilterFactoryBase::CreateUserCompactionFilterFromFactory(
+    const CompactionFilter::Context& context) const {
+  std::unique_ptr<CompactionFilter> user_comp_filter_from_factory;
+  if (user_comp_filter_factory_) {
+    user_comp_filter_from_factory =
+        user_comp_filter_factory_->CreateCompactionFilter(context);
+  }
+  return user_comp_filter_from_factory;
+}
+
+std::unique_ptr<CompactionFilter>
+BlobIndexCompactionFilterFactory::CreateCompactionFilter(
+    const CompactionFilter::Context& _context) {
+  assert(clock());
+
+  int64_t current_time = 0;
+  Status s = clock()->GetCurrentTime(&current_time);
+  if (!s.ok()) {
+    return nullptr;
+  }
+  assert(current_time >= 0);
+
+  assert(blob_db_impl());
+
+  BlobCompactionContext context;
+  blob_db_impl()->GetCompactionContext(&context);
+
+  std::unique_ptr<CompactionFilter> user_comp_filter_from_factory =
+      CreateUserCompactionFilterFromFactory(_context);
+
+  return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter(
+      std::move(context), user_comp_filter(),
+      std::move(user_comp_filter_from_factory), current_time, statistics()));
+}
+
+std::unique_ptr<CompactionFilter>
+BlobIndexCompactionFilterFactoryGC::CreateCompactionFilter(
+    const CompactionFilter::Context& _context) {
+  assert(clock());
+
+  int64_t current_time = 0;
+  Status s = clock()->GetCurrentTime(&current_time);
+  if (!s.ok()) {
+    return nullptr;
+  }
+  assert(current_time >= 0);
+
+  assert(blob_db_impl());
+
+  BlobCompactionContext context;
+  BlobCompactionContextGC context_gc;
+  blob_db_impl()->GetCompactionContext(&context, &context_gc);
+
+  std::unique_ptr<CompactionFilter> user_comp_filter_from_factory =
+      CreateUserCompactionFilterFromFactory(_context);
+
+  return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilterGC(
+      std::move(context), std::move(context_gc), user_comp_filter(),
+      std::move(user_comp_filter_from_factory), current_time, statistics()));
+}
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_compaction_filter.h b/src/rocksdb/utilities/blob_db/blob_compaction_filter.h
new file mode 100644
index 000000000..1493cfc1a
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_compaction_filter.h
@@ -0,0 +1,204 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <unordered_set>
+
+#include "db/blob/blob_index.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/compaction_filter.h"
+#include "utilities/blob_db/blob_db_gc_stats.h"
+#include "utilities/blob_db/blob_db_impl.h"
+#include "utilities/compaction_filters/layered_compaction_filter_base.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+namespace blob_db {
+
+struct BlobCompactionContext {
+  BlobDBImpl* blob_db_impl = nullptr;
+  uint64_t next_file_number = 0;
+  std::unordered_set<uint64_t> current_blob_files;
+  SequenceNumber fifo_eviction_seq = 0;
+  uint64_t evict_expiration_up_to = 0;
+};
+
+struct BlobCompactionContextGC {
+  uint64_t cutoff_file_number = 0;
+};
+
+// Compaction filter that deletes expired blob indexes from the base DB.
+// Comes into two varieties, one for the non-GC case and one for the GC case.
+class BlobIndexCompactionFilterBase : public LayeredCompactionFilterBase {
+ public:
+  BlobIndexCompactionFilterBase(
+      BlobCompactionContext&& _context,
+      const CompactionFilter* _user_comp_filter,
+      std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
+      uint64_t current_time, Statistics* stats)
+      : LayeredCompactionFilterBase(_user_comp_filter,
+                                    std::move(_user_comp_filter_from_factory)),
+        context_(std::move(_context)),
+        current_time_(current_time),
+        statistics_(stats) {}
+
+  ~BlobIndexCompactionFilterBase() override;
+
+  // Filter expired blob indexes regardless of snapshots.
+  bool IgnoreSnapshots() const override { return true; }
+
+  Decision FilterV2(int level, const Slice& key, ValueType value_type,
+                    const Slice& value, std::string* new_value,
+                    std::string* skip_until) const override;
+
+  bool IsStackedBlobDbInternalCompactionFilter() const override { return true; }
+
+ protected:
+  bool IsBlobFileOpened() const;
+  virtual bool OpenNewBlobFileIfNeeded() const;
+  bool ReadBlobFromOldFile(const Slice& key, const BlobIndex& blob_index,
+                           PinnableSlice* blob, bool need_decompress,
+                           CompressionType* compression_type) const;
+  bool WriteBlobToNewFile(const Slice& key, const Slice& blob,
+                          uint64_t* new_blob_file_number,
+                          uint64_t* new_blob_offset) const;
+  bool CloseAndRegisterNewBlobFileIfNeeded() const;
+  bool CloseAndRegisterNewBlobFile() const;
+
+  Statistics* statistics() const { return statistics_; }
+  const BlobCompactionContext& context() const { return context_; }
+
+ private:
+  Decision HandleValueChange(const Slice& key, std::string* new_value) const;
+
+ private:
+  BlobCompactionContext context_;
+  const uint64_t current_time_;
+  Statistics* statistics_;
+
+  mutable std::shared_ptr<BlobFile> blob_file_;
+  mutable std::shared_ptr<BlobLogWriter> writer_;
+
+  // It is safe to not using std::atomic since the compaction filter, created
+  // from a compaction filter factroy, will not be called from multiple threads.
+  mutable uint64_t expired_count_ = 0;
+  mutable uint64_t expired_size_ = 0;
+  mutable uint64_t evicted_count_ = 0;
+  mutable uint64_t evicted_size_ = 0;
+};
+
+class BlobIndexCompactionFilter : public BlobIndexCompactionFilterBase {
+ public:
+  BlobIndexCompactionFilter(
+      BlobCompactionContext&& _context,
+      const CompactionFilter* _user_comp_filter,
+      std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
+      uint64_t current_time, Statistics* stats)
+      : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter,
+                                      std::move(_user_comp_filter_from_factory),
+                                      current_time, stats) {}
+
+  const char* Name() const override { return "BlobIndexCompactionFilter"; }
+};
+
+class BlobIndexCompactionFilterGC : public BlobIndexCompactionFilterBase {
+ public:
+  BlobIndexCompactionFilterGC(
+      BlobCompactionContext&& _context, BlobCompactionContextGC&& context_gc,
+      const CompactionFilter* _user_comp_filter,
+      std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory,
+      uint64_t current_time, Statistics* stats)
+      : BlobIndexCompactionFilterBase(std::move(_context), _user_comp_filter,
+                                      std::move(_user_comp_filter_from_factory),
+                                      current_time, stats),
+        context_gc_(std::move(context_gc)) {}
+
+  ~BlobIndexCompactionFilterGC() override;
+
+  const char* Name() const override { return "BlobIndexCompactionFilterGC"; }
+
+  BlobDecision PrepareBlobOutput(const Slice& key, const Slice& existing_value,
+                                 std::string* new_value) const override;
+
+ private:
+  bool OpenNewBlobFileIfNeeded() const override;
+
+ private:
+  BlobCompactionContextGC context_gc_;
+  mutable BlobDBGarbageCollectionStats gc_stats_;
+};
+
+// Compaction filter factory; similarly to the filters above, it comes
+// in two flavors, one that creates filters that support GC, and one
+// that creates non-GC filters.
+class BlobIndexCompactionFilterFactoryBase : public CompactionFilterFactory {
+ public:
+  BlobIndexCompactionFilterFactoryBase(BlobDBImpl* _blob_db_impl,
+                                       SystemClock* _clock,
+                                       const ColumnFamilyOptions& _cf_options,
+                                       Statistics* _statistics)
+      : blob_db_impl_(_blob_db_impl),
+        clock_(_clock),
+        statistics_(_statistics),
+        user_comp_filter_(_cf_options.compaction_filter),
+        user_comp_filter_factory_(_cf_options.compaction_filter_factory) {}
+
+ protected:
+  std::unique_ptr<CompactionFilter> CreateUserCompactionFilterFromFactory(
+      const CompactionFilter::Context& context) const;
+
+  BlobDBImpl* blob_db_impl() const { return blob_db_impl_; }
+  SystemClock* clock() const { return clock_; }
+  Statistics* statistics() const { return statistics_; }
+  const CompactionFilter* user_comp_filter() const { return user_comp_filter_; }
+
+ private:
+  BlobDBImpl* blob_db_impl_;
+  SystemClock* clock_;
+  Statistics* statistics_;
+  const CompactionFilter* user_comp_filter_;
+  std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_;
+};
+
+class BlobIndexCompactionFilterFactory
+    : public BlobIndexCompactionFilterFactoryBase {
+ public:
+  BlobIndexCompactionFilterFactory(BlobDBImpl* _blob_db_impl,
+                                   SystemClock* _clock,
+                                   const ColumnFamilyOptions& _cf_options,
+                                   Statistics* _statistics)
+      : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _clock, _cf_options,
+                                             _statistics) {}
+
+  const char* Name() const override {
+    return "BlobIndexCompactionFilterFactory";
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override;
+};
+
+class BlobIndexCompactionFilterFactoryGC
+    : public BlobIndexCompactionFilterFactoryBase {
+ public:
+  BlobIndexCompactionFilterFactoryGC(BlobDBImpl* _blob_db_impl,
+                                     SystemClock* _clock,
+                                     const ColumnFamilyOptions& _cf_options,
+                                     Statistics* _statistics)
+      : BlobIndexCompactionFilterFactoryBase(_blob_db_impl, _clock, _cf_options,
+                                             _statistics) {}
+
+  const char* Name() const override {
+    return "BlobIndexCompactionFilterFactoryGC";
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override;
+};
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db.cc b/src/rocksdb/utilities/blob_db/blob_db.cc
new file mode 100644
index 000000000..cbd02e68e
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db.cc
@@ -0,0 +1,114 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db.h"
+
+#include <cinttypes>
+
+#include "logging/logging.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options,
+                    const std::string& dbname, BlobDB** blob_db) {
+  *blob_db = nullptr;
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = BlobDB::Open(db_options, bdb_options, dbname, column_families,
+                          &handles, blob_db);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a reference to
+    // default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status BlobDB::Open(const DBOptions& db_options,
+                    const BlobDBOptions& bdb_options, const std::string& dbname,
+                    const std::vector<ColumnFamilyDescriptor>& column_families,
+                    std::vector<ColumnFamilyHandle*>* handles,
+                    BlobDB** blob_db) {
+  assert(handles);
+
+  if (column_families.size() != 1 ||
+      column_families[0].name != kDefaultColumnFamilyName) {
+    return Status::NotSupported(
+        "Blob DB doesn't support non-default column family.");
+  }
+
+  BlobDBImpl* blob_db_impl = new BlobDBImpl(dbname, bdb_options, db_options,
+                                            column_families[0].options);
+  Status s = blob_db_impl->Open(handles);
+  if (s.ok()) {
+    *blob_db = static_cast<BlobDB*>(blob_db_impl);
+  } else {
+    if (!handles->empty()) {
+      for (ColumnFamilyHandle* cfh : *handles) {
+        blob_db_impl->DestroyColumnFamilyHandle(cfh);
+      }
+
+      handles->clear();
+    }
+
+    delete blob_db_impl;
+    *blob_db = nullptr;
+  }
+  return s;
+}
+
+BlobDB::BlobDB() : StackableDB(nullptr) {}
+
+void BlobDBOptions::Dump(Logger* log) const {
+  ROCKS_LOG_HEADER(
+      log, "                                  BlobDBOptions.blob_dir: %s",
+      blob_dir.c_str());
+  ROCKS_LOG_HEADER(
+      log, "                             BlobDBOptions.path_relative: %d",
+      path_relative);
+  ROCKS_LOG_HEADER(
+      log, "                                   BlobDBOptions.is_fifo: %d",
+      is_fifo);
+  ROCKS_LOG_HEADER(
+      log, "                               BlobDBOptions.max_db_size: %" PRIu64,
+      max_db_size);
+  ROCKS_LOG_HEADER(
+      log, "                            BlobDBOptions.ttl_range_secs: %" PRIu64,
+      ttl_range_secs);
+  ROCKS_LOG_HEADER(
+      log, "                             BlobDBOptions.min_blob_size: %" PRIu64,
+      min_blob_size);
+  ROCKS_LOG_HEADER(
+      log, "                            BlobDBOptions.bytes_per_sync: %" PRIu64,
+      bytes_per_sync);
+  ROCKS_LOG_HEADER(
+      log, "                            BlobDBOptions.blob_file_size: %" PRIu64,
+      blob_file_size);
+  ROCKS_LOG_HEADER(
+      log, "                               BlobDBOptions.compression: %d",
+      static_cast<int>(compression));
+  ROCKS_LOG_HEADER(
+      log, "                 BlobDBOptions.enable_garbage_collection: %d",
+      enable_garbage_collection);
+  ROCKS_LOG_HEADER(
+      log, "                 BlobDBOptions.garbage_collection_cutoff: %f",
+      garbage_collection_cutoff);
+  ROCKS_LOG_HEADER(
+      log, "                  BlobDBOptions.disable_background_tasks: %d",
+      disable_background_tasks);
+}
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/utilities/blob_db/blob_db.h b/src/rocksdb/utilities/blob_db/blob_db.h
new file mode 100644
index 000000000..e9d92486f
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db.h
@@ -0,0 +1,266 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/stackable_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace blob_db {
+
+// A wrapped database which puts values of KV pairs in a separate log
+// and store location to the log in the underlying DB.
+//
+// The factory needs to be moved to include/rocksdb/utilities to allow
+// users to use blob DB.
+
+constexpr uint64_t kNoExpiration = std::numeric_limits<uint64_t>::max();
+
+struct BlobDBOptions {
+  // Name of the directory under the base DB where blobs will be stored. Using
+  // a directory where the base DB stores its SST files is not supported.
+  // Default is "blob_dir"
+  std::string blob_dir = "blob_dir";
+
+  // whether the blob_dir path is relative or absolute.
+  bool path_relative = true;
+
+  // When max_db_size is reached, evict blob files to free up space
+  // instead of returnning NoSpace error on write. Blob files will be
+  // evicted from oldest to newest, based on file creation time.
+  bool is_fifo = false;
+
+  // Maximum size of the database (including SST files and blob files).
+  //
+  // Default: 0 (no limits)
+  uint64_t max_db_size = 0;
+
+  // a new bucket is opened, for ttl_range. So if ttl_range is 600seconds
+  // (10 minutes), and the first bucket starts at 1471542000
+  // then the blob buckets will be
+  // first bucket is 1471542000 - 1471542600
+  // second bucket is 1471542600 - 1471543200
+  // and so on
+  uint64_t ttl_range_secs = 3600;
+
+  // The smallest value to store in blob log. Values smaller than this threshold
+  // will be inlined in base DB together with the key.
+  uint64_t min_blob_size = 0;
+
+  // Allows OS to incrementally sync blob files to disk for every
+  // bytes_per_sync bytes written. Users shouldn't rely on it for
+  // persistency guarantee.
+  uint64_t bytes_per_sync = 512 * 1024;
+
+  // the target size of each blob file. File will become immutable
+  // after it exceeds that size
+  uint64_t blob_file_size = 256 * 1024 * 1024;
+
+  // what compression to use for Blob's
+  CompressionType compression = kNoCompression;
+
+  // If enabled, BlobDB cleans up stale blobs in non-TTL files during compaction
+  // by rewriting the remaining live blobs to new files.
+  bool enable_garbage_collection = false;
+
+  // The cutoff in terms of blob file age for garbage collection. Blobs in
+  // the oldest N non-TTL blob files will be rewritten when encountered during
+  // compaction, where N = garbage_collection_cutoff * number_of_non_TTL_files.
+  double garbage_collection_cutoff = 0.25;
+
+  // Disable all background job. Used for test only.
+  bool disable_background_tasks = false;
+
+  void Dump(Logger* log) const;
+};
+
+class BlobDB : public StackableDB {
+ public:
+  using ROCKSDB_NAMESPACE::StackableDB::Put;
+  virtual Status Put(const WriteOptions& options, const Slice& key,
+                     const Slice& value) override = 0;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) override {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    return Put(options, key, value);
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::Delete;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    assert(db_ != nullptr);
+    return db_->Delete(options, column_family, key);
+  }
+
+  virtual Status PutWithTTL(const WriteOptions& options, const Slice& key,
+                            const Slice& value, uint64_t ttl) = 0;
+  virtual Status PutWithTTL(const WriteOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            const Slice& value, uint64_t ttl) {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    return PutWithTTL(options, key, value, ttl);
+  }
+
+  // Put with expiration. Key with expiration time equal to
+  // std::numeric_limits<uint64_t>::max() means the key don't expire.
+  virtual Status PutUntil(const WriteOptions& options, const Slice& key,
+                          const Slice& value, uint64_t expiration) = 0;
+  virtual Status PutUntil(const WriteOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          const Slice& value, uint64_t expiration) {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    return PutUntil(options, key, value, expiration);
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override = 0;
+
+  // Get value and expiration.
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value, uint64_t* expiration) = 0;
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     PinnableSlice* value, uint64_t* expiration) {
+    return Get(options, DefaultColumnFamily(), key, value, expiration);
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override = 0;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    for (auto column_family : column_families) {
+      if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+        return std::vector<Status>(
+            column_families.size(),
+            Status::NotSupported(
+                "Blob DB doesn't support non-default column family."));
+      }
+    }
+    return MultiGet(options, keys, values);
+  }
+  virtual void MultiGet(const ReadOptions& /*options*/,
+                        ColumnFamilyHandle* /*column_family*/,
+                        const size_t num_keys, const Slice* /*keys*/,
+                        PinnableSlice* /*values*/, Status* statuses,
+                        const bool /*sorted_input*/ = false) override {
+    for (size_t i = 0; i < num_keys; ++i) {
+      statuses[i] =
+          Status::NotSupported("Blob DB doesn't support batched MultiGet");
+    }
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& /*wopts*/,
+                              ColumnFamilyHandle* /*column_family*/,
+                              const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in blob db.");
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::Merge;
+  virtual Status Merge(const WriteOptions& /*options*/,
+                       ColumnFamilyHandle* /*column_family*/,
+                       const Slice& /*key*/, const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in blob db.");
+  }
+
+  virtual Status Write(const WriteOptions& opts,
+                       WriteBatch* updates) override = 0;
+
+  using ROCKSDB_NAMESPACE::StackableDB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options) override = 0;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      // Blob DB doesn't support non-default column family.
+      return nullptr;
+    }
+    return NewIterator(options);
+  }
+
+  Status CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) override = 0;
+  Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) override {
+    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+
+    return CompactFiles(compact_options, input_file_names, output_level,
+                        output_path_id, output_file_names, compaction_job_info);
+  }
+
+  using ROCKSDB_NAMESPACE::StackableDB::Close;
+  virtual Status Close() override = 0;
+
+  // Opening blob db.
+  static Status Open(const Options& options, const BlobDBOptions& bdb_options,
+                     const std::string& dbname, BlobDB** blob_db);
+
+  static Status Open(const DBOptions& db_options,
+                     const BlobDBOptions& bdb_options,
+                     const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     BlobDB** blob_db);
+
+  virtual BlobDBOptions GetBlobDBOptions() const = 0;
+
+  virtual Status SyncBlobFiles() = 0;
+
+  virtual ~BlobDB() {}
+
+ protected:
+  explicit BlobDB();
+};
+
+// Destroy the content of the database.
+Status DestroyBlobDB(const std::string& dbname, const Options& options,
+                     const BlobDBOptions& bdb_options);
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_gc_stats.h b/src/rocksdb/utilities/blob_db/blob_db_gc_stats.h
new file mode 100644
index 000000000..fea6b0032
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_gc_stats.h
@@ -0,0 +1,56 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace blob_db {
+
+/**
+ * Statistics related to a single garbage collection pass (i.e. a single
+ * (sub)compaction).
+ */
+class BlobDBGarbageCollectionStats {
+ public:
+  uint64_t AllBlobs() const { return all_blobs_; }
+  uint64_t AllBytes() const { return all_bytes_; }
+  uint64_t RelocatedBlobs() const { return relocated_blobs_; }
+  uint64_t RelocatedBytes() const { return relocated_bytes_; }
+  uint64_t NewFiles() const { return new_files_; }
+  bool HasError() const { return error_; }
+
+  void AddBlob(uint64_t size) {
+    ++all_blobs_;
+    all_bytes_ += size;
+  }
+
+  void AddRelocatedBlob(uint64_t size) {
+    ++relocated_blobs_;
+    relocated_bytes_ += size;
+  }
+
+  void AddNewFile() { ++new_files_; }
+
+  void SetError() { error_ = true; }
+
+ private:
+  uint64_t all_blobs_ = 0;
+  uint64_t all_bytes_ = 0;
+  uint64_t relocated_blobs_ = 0;
+  uint64_t relocated_bytes_ = 0;
+  uint64_t new_files_ = 0;
+  bool error_ = false;
+};
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_impl.cc b/src/rocksdb/utilities/blob_db/blob_db_impl.cc
new file mode 100644
index 000000000..87e294c5c
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_impl.cc
@@ -0,0 +1,2177 @@
+
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db_impl.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+
+#include "db/blob/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_builder.h"
+#include "table/meta_blocks.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/timer_queue.h"
+#include "utilities/blob_db/blob_compaction_filter.h"
+#include "utilities/blob_db/blob_db_iterator.h"
+#include "utilities/blob_db/blob_db_listener.h"
+
+namespace {
+int kBlockBasedTableVersionFormat = 2;
+}  // end namespace
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+bool BlobFileComparator::operator()(
+    const std::shared_ptr<BlobFile>& lhs,
+    const std::shared_ptr<BlobFile>& rhs) const {
+  return lhs->BlobFileNumber() > rhs->BlobFileNumber();
+}
+
+bool BlobFileComparatorTTL::operator()(
+    const std::shared_ptr<BlobFile>& lhs,
+    const std::shared_ptr<BlobFile>& rhs) const {
+  assert(lhs->HasTTL() && rhs->HasTTL());
+  if (lhs->expiration_range_.first < rhs->expiration_range_.first) {
+    return true;
+  }
+  if (lhs->expiration_range_.first > rhs->expiration_range_.first) {
+    return false;
+  }
+  return lhs->BlobFileNumber() < rhs->BlobFileNumber();
+}
+
+BlobDBImpl::BlobDBImpl(const std::string& dbname,
+                       const BlobDBOptions& blob_db_options,
+                       const DBOptions& db_options,
+                       const ColumnFamilyOptions& cf_options)
+    : BlobDB(),
+      dbname_(dbname),
+      db_impl_(nullptr),
+      env_(db_options.env),
+      bdb_options_(blob_db_options),
+      db_options_(db_options),
+      cf_options_(cf_options),
+      file_options_(db_options),
+      statistics_(db_options_.statistics.get()),
+      next_file_number_(1),
+      flush_sequence_(0),
+      closed_(true),
+      open_file_count_(0),
+      total_blob_size_(0),
+      live_sst_size_(0),
+      fifo_eviction_seq_(0),
+      evict_expiration_up_to_(0),
+      debug_level_(0) {
+  clock_ = env_->GetSystemClock().get();
+  blob_dir_ = (bdb_options_.path_relative)
+                  ? dbname + "/" + bdb_options_.blob_dir
+                  : bdb_options_.blob_dir;
+  file_options_.bytes_per_sync = blob_db_options.bytes_per_sync;
+}
+
+BlobDBImpl::~BlobDBImpl() {
+  tqueue_.shutdown();
+  // CancelAllBackgroundWork(db_, true);
+  Status s __attribute__((__unused__)) = Close();
+  assert(s.ok());
+}
+
+Status BlobDBImpl::Close() {
+  if (closed_) {
+    return Status::OK();
+  }
+  closed_ = true;
+
+  // Close base DB before BlobDBImpl destructs to stop event listener and
+  // compaction filter call.
+  Status s = db_->Close();
+  // delete db_ anyway even if close failed.
+  delete db_;
+  // Reset pointers to avoid StackableDB delete the pointer again.
+  db_ = nullptr;
+  db_impl_ = nullptr;
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = SyncBlobFiles();
+  return s;
+}
+
+BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; }
+
+Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
+  assert(handles != nullptr);
+  assert(db_ == nullptr);
+
+  if (blob_dir_.empty()) {
+    return Status::NotSupported("No blob directory in options");
+  }
+
+  if (bdb_options_.garbage_collection_cutoff < 0.0 ||
+      bdb_options_.garbage_collection_cutoff > 1.0) {
+    return Status::InvalidArgument(
+        "Garbage collection cutoff must be in the interval [0.0, 1.0]");
+  }
+
+  // Temporarily disable compactions in the base DB during open; save the user
+  // defined value beforehand so we can restore it once BlobDB is initialized.
+  // Note: this is only needed if garbage collection is enabled.
+  const bool disable_auto_compactions = cf_options_.disable_auto_compactions;
+
+  if (bdb_options_.enable_garbage_collection) {
+    cf_options_.disable_auto_compactions = true;
+  }
+
+  Status s;
+
+  // Create info log.
+  if (db_options_.info_log == nullptr) {
+    s = CreateLoggerFromOptions(dbname_, db_options_, &db_options_.info_log);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log, "Opening BlobDB...");
+
+  if ((cf_options_.compaction_filter != nullptr ||
+       cf_options_.compaction_filter_factory != nullptr)) {
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "BlobDB only support compaction filter on non-TTL values.");
+  }
+
+  // Open blob directory.
+  s = env_->CreateDirIfMissing(blob_dir_);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to create blob_dir %s, status: %s",
+                    blob_dir_.c_str(), s.ToString().c_str());
+  }
+  s = env_->GetFileSystem()->NewDirectory(blob_dir_, IOOptions(), &dir_ent_,
+                                          nullptr);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to open blob_dir %s, status: %s", blob_dir_.c_str(),
+                    s.ToString().c_str());
+    return s;
+  }
+
+  // Open blob files.
+  s = OpenAllBlobFiles();
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Update options
+  if (bdb_options_.enable_garbage_collection) {
+    db_options_.listeners.push_back(std::make_shared<BlobDBListenerGC>(this));
+    cf_options_.compaction_filter_factory =
+        std::make_shared<BlobIndexCompactionFilterFactoryGC>(
+            this, clock_, cf_options_, statistics_);
+  } else {
+    db_options_.listeners.push_back(std::make_shared<BlobDBListener>(this));
+    cf_options_.compaction_filter_factory =
+        std::make_shared<BlobIndexCompactionFilterFactory>(
+            this, clock_, cf_options_, statistics_);
+  }
+
+  // Reset user compaction filter after building into compaction factory.
+  cf_options_.compaction_filter = nullptr;
+
+  // Open base db.
+  ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_);
+  s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db_);
+  if (!s.ok()) {
+    return s;
+  }
+  db_impl_ = static_cast_with_check<DBImpl>(db_->GetRootDB());
+
+  // Sanitize the blob_dir provided. Using a directory where the
+  // base DB stores its files for the default CF is not supported.
+  const ColumnFamilyData* const cfd =
+      static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->cfd();
+  assert(cfd);
+
+  const ImmutableCFOptions* const ioptions = cfd->ioptions();
+  assert(ioptions);
+
+  assert(env_);
+
+  for (const auto& cf_path : ioptions->cf_paths) {
+    bool blob_dir_same_as_cf_dir = false;
+    s = env_->AreFilesSame(blob_dir_, cf_path.path, &blob_dir_same_as_cf_dir);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "Error while sanitizing blob_dir %s, status: %s",
+                      blob_dir_.c_str(), s.ToString().c_str());
+      return s;
+    }
+
+    if (blob_dir_same_as_cf_dir) {
+      return Status::NotSupported(
+          "Using the base DB's storage directories for BlobDB files is not "
+          "supported.");
+    }
+  }
+
+  // Initialize SST file <-> oldest blob file mapping if garbage collection
+  // is enabled.
+  if (bdb_options_.enable_garbage_collection) {
+    std::vector<LiveFileMetaData> live_files;
+    db_->GetLiveFilesMetaData(&live_files);
+
+    InitializeBlobFileToSstMapping(live_files);
+
+    MarkUnreferencedBlobFilesObsoleteDuringOpen();
+
+    if (!disable_auto_compactions) {
+      s = db_->EnableAutoCompaction(*handles);
+      if (!s.ok()) {
+        ROCKS_LOG_ERROR(
+            db_options_.info_log,
+            "Failed to enable automatic compactions during open, status: %s",
+            s.ToString().c_str());
+        return s;
+      }
+    }
+  }
+
+  // Add trash files in blob dir to file delete scheduler.
+  SstFileManagerImpl* sfm = static_cast<SstFileManagerImpl*>(
+      db_impl_->immutable_db_options().sst_file_manager.get());
+  DeleteScheduler::CleanupDirectory(env_, sfm, blob_dir_);
+
+  UpdateLiveSSTSize();
+
+  // Start background jobs.
+  if (!bdb_options_.disable_background_tasks) {
+    StartBackgroundTasks();
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log, "BlobDB pointer %p", this);
+  bdb_options_.Dump(db_options_.info_log.get());
+  closed_ = false;
+  return s;
+}
+
+void BlobDBImpl::StartBackgroundTasks() {
+  // store a call to a member function and object
+  tqueue_.add(
+      kReclaimOpenFilesPeriodMillisecs,
+      std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1));
+  tqueue_.add(
+      kDeleteObsoleteFilesPeriodMillisecs,
+      std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1));
+  tqueue_.add(kSanityCheckPeriodMillisecs,
+              std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1));
+  tqueue_.add(
+      kEvictExpiredFilesPeriodMillisecs,
+      std::bind(&BlobDBImpl::EvictExpiredFiles, this, std::placeholders::_1));
+}
+
+Status BlobDBImpl::GetAllBlobFiles(std::set<uint64_t>* file_numbers) {
+  assert(file_numbers != nullptr);
+  std::vector<std::string> all_files;
+  Status s = env_->GetChildren(blob_dir_, &all_files);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to get list of blob files, status: %s",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  for (const auto& file_name : all_files) {
+    uint64_t file_number;
+    FileType type;
+    bool success = ParseFileName(file_name, &file_number, &type);
+    if (success && type == kBlobFile) {
+      file_numbers->insert(file_number);
+    } else {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Skipping file in blob directory: %s", file_name.c_str());
+    }
+  }
+
+  return s;
+}
+
+Status BlobDBImpl::OpenAllBlobFiles() {
+  std::set<uint64_t> file_numbers;
+  Status s = GetAllBlobFiles(&file_numbers);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!file_numbers.empty()) {
+    next_file_number_.store(*file_numbers.rbegin() + 1);
+  }
+
+  std::ostringstream blob_file_oss;
+  std::ostringstream live_imm_oss;
+  std::ostringstream obsolete_file_oss;
+
+  for (auto& file_number : file_numbers) {
+    std::shared_ptr<BlobFile> blob_file = std::make_shared<BlobFile>(
+        this, blob_dir_, file_number, db_options_.info_log.get());
+    blob_file->MarkImmutable(/* sequence */ 0);
+
+    // Read file header and footer
+    Status read_metadata_status =
+        blob_file->ReadMetadata(env_->GetFileSystem(), file_options_);
+    if (read_metadata_status.IsCorruption()) {
+      // Remove incomplete file.
+      if (!obsolete_files_.empty()) {
+        obsolete_file_oss << ", ";
+      }
+      obsolete_file_oss << file_number;
+
+      ObsoleteBlobFile(blob_file, 0 /*obsolete_seq*/, false /*update_size*/);
+      continue;
+    } else if (!read_metadata_status.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "Unable to read metadata of blob file %" PRIu64
+                      ", status: '%s'",
+                      file_number, read_metadata_status.ToString().c_str());
+      return read_metadata_status;
+    }
+
+    total_blob_size_ += blob_file->GetFileSize();
+
+    if (!blob_files_.empty()) {
+      blob_file_oss << ", ";
+    }
+    blob_file_oss << file_number;
+
+    blob_files_[file_number] = blob_file;
+
+    if (!blob_file->HasTTL()) {
+      if (!live_imm_non_ttl_blob_files_.empty()) {
+        live_imm_oss << ", ";
+      }
+      live_imm_oss << file_number;
+
+      live_imm_non_ttl_blob_files_[file_number] = blob_file;
+    }
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "Found %" ROCKSDB_PRIszt " blob files: %s", blob_files_.size(),
+                 blob_file_oss.str().c_str());
+  ROCKS_LOG_INFO(
+      db_options_.info_log, "Found %" ROCKSDB_PRIszt " non-TTL blob files: %s",
+      live_imm_non_ttl_blob_files_.size(), live_imm_oss.str().c_str());
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "Found %" ROCKSDB_PRIszt
+                 " incomplete or corrupted blob files: %s",
+                 obsolete_files_.size(), obsolete_file_oss.str().c_str());
+  return s;
+}
+
+template <typename Linker>
+void BlobDBImpl::LinkSstToBlobFileImpl(uint64_t sst_file_number,
+                                       uint64_t blob_file_number,
+                                       Linker linker) {
+  assert(bdb_options_.enable_garbage_collection);
+  assert(blob_file_number != kInvalidBlobFileNumber);
+
+  auto it = blob_files_.find(blob_file_number);
+  if (it == blob_files_.end()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Blob file %" PRIu64
+                   " not found while trying to link "
+                   "SST file %" PRIu64,
+                   blob_file_number, sst_file_number);
+    return;
+  }
+
+  BlobFile* const blob_file = it->second.get();
+  assert(blob_file);
+
+  linker(blob_file, sst_file_number);
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "Blob file %" PRIu64 " linked to SST file %" PRIu64,
+                 blob_file_number, sst_file_number);
+}
+
+void BlobDBImpl::LinkSstToBlobFile(uint64_t sst_file_number,
+                                   uint64_t blob_file_number) {
+  auto linker = [](BlobFile* blob_file, uint64_t sst_file) {
+    WriteLock file_lock(&blob_file->mutex_);
+    blob_file->LinkSstFile(sst_file);
+  };
+
+  LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker);
+}
+
+void BlobDBImpl::LinkSstToBlobFileNoLock(uint64_t sst_file_number,
+                                         uint64_t blob_file_number) {
+  auto linker = [](BlobFile* blob_file, uint64_t sst_file) {
+    blob_file->LinkSstFile(sst_file);
+  };
+
+  LinkSstToBlobFileImpl(sst_file_number, blob_file_number, linker);
+}
+
+void BlobDBImpl::UnlinkSstFromBlobFile(uint64_t sst_file_number,
+                                       uint64_t blob_file_number) {
+  assert(bdb_options_.enable_garbage_collection);
+  assert(blob_file_number != kInvalidBlobFileNumber);
+
+  auto it = blob_files_.find(blob_file_number);
+  if (it == blob_files_.end()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Blob file %" PRIu64
+                   " not found while trying to unlink "
+                   "SST file %" PRIu64,
+                   blob_file_number, sst_file_number);
+    return;
+  }
+
+  BlobFile* const blob_file = it->second.get();
+  assert(blob_file);
+
+  {
+    WriteLock file_lock(&blob_file->mutex_);
+    blob_file->UnlinkSstFile(sst_file_number);
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "Blob file %" PRIu64 " unlinked from SST file %" PRIu64,
+                 blob_file_number, sst_file_number);
+}
+
+void BlobDBImpl::InitializeBlobFileToSstMapping(
+    const std::vector<LiveFileMetaData>& live_files) {
+  assert(bdb_options_.enable_garbage_collection);
+
+  for (const auto& live_file : live_files) {
+    const uint64_t sst_file_number = live_file.file_number;
+    const uint64_t blob_file_number = live_file.oldest_blob_file_number;
+
+    if (blob_file_number == kInvalidBlobFileNumber) {
+      continue;
+    }
+
+    LinkSstToBlobFileNoLock(sst_file_number, blob_file_number);
+  }
+}
+
+void BlobDBImpl::ProcessFlushJobInfo(const FlushJobInfo& info) {
+  assert(bdb_options_.enable_garbage_collection);
+
+  WriteLock lock(&mutex_);
+
+  if (info.oldest_blob_file_number != kInvalidBlobFileNumber) {
+    LinkSstToBlobFile(info.file_number, info.oldest_blob_file_number);
+  }
+
+  assert(flush_sequence_ < info.largest_seqno);
+  flush_sequence_ = info.largest_seqno;
+
+  MarkUnreferencedBlobFilesObsolete();
+}
+
+void BlobDBImpl::ProcessCompactionJobInfo(const CompactionJobInfo& info) {
+  assert(bdb_options_.enable_garbage_collection);
+
+  if (!info.status.ok()) {
+    return;
+  }
+
+  // Note: the same SST file may appear in both the input and the output
+  // file list in case of a trivial move. We walk through the two lists
+  // below in a fashion that's similar to merge sort to detect this.
+
+  auto cmp = [](const CompactionFileInfo& lhs, const CompactionFileInfo& rhs) {
+    return lhs.file_number < rhs.file_number;
+  };
+
+  auto inputs = info.input_file_infos;
+  auto iit = inputs.begin();
+  const auto iit_end = inputs.end();
+
+  std::sort(iit, iit_end, cmp);
+
+  auto outputs = info.output_file_infos;
+  auto oit = outputs.begin();
+  const auto oit_end = outputs.end();
+
+  std::sort(oit, oit_end, cmp);
+
+  WriteLock lock(&mutex_);
+
+  while (iit != iit_end && oit != oit_end) {
+    const auto& input = *iit;
+    const auto& output = *oit;
+
+    if (input.file_number == output.file_number) {
+      ++iit;
+      ++oit;
+    } else if (input.file_number < output.file_number) {
+      if (input.oldest_blob_file_number != kInvalidBlobFileNumber) {
+        UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number);
+      }
+
+      ++iit;
+    } else {
+      assert(output.file_number < input.file_number);
+
+      if (output.oldest_blob_file_number != kInvalidBlobFileNumber) {
+        LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number);
+      }
+
+      ++oit;
+    }
+  }
+
+  while (iit != iit_end) {
+    const auto& input = *iit;
+
+    if (input.oldest_blob_file_number != kInvalidBlobFileNumber) {
+      UnlinkSstFromBlobFile(input.file_number, input.oldest_blob_file_number);
+    }
+
+    ++iit;
+  }
+
+  while (oit != oit_end) {
+    const auto& output = *oit;
+
+    if (output.oldest_blob_file_number != kInvalidBlobFileNumber) {
+      LinkSstToBlobFile(output.file_number, output.oldest_blob_file_number);
+    }
+
+    ++oit;
+  }
+
+  MarkUnreferencedBlobFilesObsolete();
+}
+
+bool BlobDBImpl::MarkBlobFileObsoleteIfNeeded(
+    const std::shared_ptr<BlobFile>& blob_file, SequenceNumber obsolete_seq) {
+  assert(blob_file);
+  assert(!blob_file->HasTTL());
+  assert(blob_file->Immutable());
+  assert(bdb_options_.enable_garbage_collection);
+
+  // Note: FIFO eviction could have marked this file obsolete already.
+  if (blob_file->Obsolete()) {
+    return true;
+  }
+
+  // We cannot mark this file (or any higher-numbered files for that matter)
+  // obsolete if it is referenced by any memtables or SSTs. We keep track of
+  // the SSTs explicitly. To account for memtables, we keep track of the highest
+  // sequence number received in flush notifications, and we do not mark the
+  // blob file obsolete if there are still unflushed memtables from before
+  // the time the blob file was closed.
+  if (blob_file->GetImmutableSequence() > flush_sequence_ ||
+      !blob_file->GetLinkedSstFiles().empty()) {
+    return false;
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "Blob file %" PRIu64 " is no longer needed, marking obsolete",
+                 blob_file->BlobFileNumber());
+
+  ObsoleteBlobFile(blob_file, obsolete_seq, /* update_size */ true);
+  return true;
+}
+
+template <class Functor>
+void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteImpl(Functor mark_if_needed) {
+  assert(bdb_options_.enable_garbage_collection);
+
+  // Iterate through all live immutable non-TTL blob files, and mark them
+  // obsolete assuming no SST files or memtables rely on the blobs in them.
+  // Note: we need to stop as soon as we find a blob file that has any
+  // linked SSTs (or one potentially referenced by memtables).
+
+  uint64_t obsoleted_files = 0;
+
+  auto it = live_imm_non_ttl_blob_files_.begin();
+  while (it != live_imm_non_ttl_blob_files_.end()) {
+    const auto& blob_file = it->second;
+    assert(blob_file);
+    assert(blob_file->BlobFileNumber() == it->first);
+    assert(!blob_file->HasTTL());
+    assert(blob_file->Immutable());
+
+    // Small optimization: Obsolete() does an atomic read, so we can do
+    // this check without taking a lock on the blob file's mutex.
+    if (blob_file->Obsolete()) {
+      it = live_imm_non_ttl_blob_files_.erase(it);
+      continue;
+    }
+
+    if (!mark_if_needed(blob_file)) {
+      break;
+    }
+
+    it = live_imm_non_ttl_blob_files_.erase(it);
+
+    ++obsoleted_files;
+  }
+
+  if (obsoleted_files > 0) {
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "%" PRIu64 " blob file(s) marked obsolete by GC",
+                   obsoleted_files);
+    RecordTick(statistics_, BLOB_DB_GC_NUM_FILES, obsoleted_files);
+  }
+}
+
+void BlobDBImpl::MarkUnreferencedBlobFilesObsolete() {
+  const SequenceNumber obsolete_seq = GetLatestSequenceNumber();
+
+  MarkUnreferencedBlobFilesObsoleteImpl(
+      [this, obsolete_seq](const std::shared_ptr<BlobFile>& blob_file) {
+        WriteLock file_lock(&blob_file->mutex_);
+        return MarkBlobFileObsoleteIfNeeded(blob_file, obsolete_seq);
+      });
+}
+
+void BlobDBImpl::MarkUnreferencedBlobFilesObsoleteDuringOpen() {
+  MarkUnreferencedBlobFilesObsoleteImpl(
+      [this](const std::shared_ptr<BlobFile>& blob_file) {
+        return MarkBlobFileObsoleteIfNeeded(blob_file, /* obsolete_seq */ 0);
+      });
+}
+
+void BlobDBImpl::CloseRandomAccessLocked(
+    const std::shared_ptr<BlobFile>& bfile) {
+  bfile->CloseRandomAccessLocked();
+  open_file_count_--;
+}
+
+Status BlobDBImpl::GetBlobFileReader(
+    const std::shared_ptr<BlobFile>& blob_file,
+    std::shared_ptr<RandomAccessFileReader>* reader) {
+  assert(reader != nullptr);
+  bool fresh_open = false;
+  Status s = blob_file->GetReader(env_, file_options_, reader, &fresh_open);
+  if (s.ok() && fresh_open) {
+    assert(*reader != nullptr);
+    open_file_count_++;
+  }
+  return s;
+}
+
+std::shared_ptr<BlobFile> BlobDBImpl::NewBlobFile(
+    bool has_ttl, const ExpirationRange& expiration_range,
+    const std::string& reason) {
+  assert(has_ttl == (expiration_range.first || expiration_range.second));
+
+  uint64_t file_num = next_file_number_++;
+
+  const uint32_t column_family_id =
+      static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
+  auto blob_file = std::make_shared<BlobFile>(
+      this, blob_dir_, file_num, db_options_.info_log.get(), column_family_id,
+      bdb_options_.compression, has_ttl, expiration_range);
+
+  ROCKS_LOG_DEBUG(db_options_.info_log, "New blob file created: %s reason='%s'",
+                  blob_file->PathName().c_str(), reason.c_str());
+  LogFlush(db_options_.info_log);
+
+  return blob_file;
+}
+
+void BlobDBImpl::RegisterBlobFile(std::shared_ptr<BlobFile> blob_file) {
+  const uint64_t blob_file_number = blob_file->BlobFileNumber();
+
+  auto it = blob_files_.lower_bound(blob_file_number);
+  assert(it == blob_files_.end() || it->first != blob_file_number);
+
+  blob_files_.insert(it,
+                     std::map<uint64_t, std::shared_ptr<BlobFile>>::value_type(
+                         blob_file_number, std::move(blob_file)));
+}
+
+Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile) {
+  std::string fpath(bfile->PathName());
+  std::unique_ptr<FSWritableFile> wfile;
+  const auto& fs = env_->GetFileSystem();
+
+  Status s = fs->ReopenWritableFile(fpath, file_options_, &wfile, nullptr);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to open blob file for write: %s status: '%s'"
+                    " exists: '%s'",
+                    fpath.c_str(), s.ToString().c_str(),
+                    fs->FileExists(fpath, file_options_.io_options, nullptr)
+                        .ToString()
+                        .c_str());
+    return s;
+  }
+
+  std::unique_ptr<WritableFileWriter> fwriter;
+  fwriter.reset(new WritableFileWriter(std::move(wfile), fpath, file_options_));
+
+  uint64_t boffset = bfile->GetFileSize();
+  if (debug_level_ >= 2 && boffset) {
+    ROCKS_LOG_DEBUG(db_options_.info_log,
+                    "Open blob file: %s with offset: %" PRIu64, fpath.c_str(),
+                    boffset);
+  }
+
+  BlobLogWriter::ElemType et = BlobLogWriter::kEtNone;
+  if (bfile->file_size_ == BlobLogHeader::kSize) {
+    et = BlobLogWriter::kEtFileHdr;
+  } else if (bfile->file_size_ > BlobLogHeader::kSize) {
+    et = BlobLogWriter::kEtRecord;
+  } else if (bfile->file_size_) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Open blob file: %s with wrong size: %" PRIu64,
+                   fpath.c_str(), boffset);
+    return Status::Corruption("Invalid blob file size");
+  }
+
+  constexpr bool do_flush = true;
+
+  bfile->log_writer_ = std::make_shared<BlobLogWriter>(
+      std::move(fwriter), clock_, statistics_, bfile->file_number_,
+      db_options_.use_fsync, do_flush, boffset);
+  bfile->log_writer_->last_elem_type_ = et;
+
+  return s;
+}
+
+std::shared_ptr<BlobFile> BlobDBImpl::FindBlobFileLocked(
+    uint64_t expiration) const {
+  if (open_ttl_files_.empty()) {
+    return nullptr;
+  }
+
+  std::shared_ptr<BlobFile> tmp = std::make_shared<BlobFile>();
+  tmp->SetHasTTL(true);
+  tmp->expiration_range_ = std::make_pair(expiration, 0);
+  tmp->file_number_ = std::numeric_limits<uint64_t>::max();
+
+  auto citr = open_ttl_files_.equal_range(tmp);
+  if (citr.first == open_ttl_files_.end()) {
+    assert(citr.second == open_ttl_files_.end());
+
+    std::shared_ptr<BlobFile> check = *(open_ttl_files_.rbegin());
+    return (check->expiration_range_.second <= expiration) ? nullptr : check;
+  }
+
+  if (citr.first != citr.second) {
+    return *(citr.first);
+  }
+
+  auto finditr = citr.second;
+  if (finditr != open_ttl_files_.begin()) {
+    --finditr;
+  }
+
+  bool b2 = (*finditr)->expiration_range_.second <= expiration;
+  bool b1 = (*finditr)->expiration_range_.first > expiration;
+
+  return (b1 || b2) ? nullptr : (*finditr);
+}
+
+Status BlobDBImpl::CheckOrCreateWriterLocked(
+    const std::shared_ptr<BlobFile>& blob_file,
+    std::shared_ptr<BlobLogWriter>* writer) {
+  assert(writer != nullptr);
+  *writer = blob_file->GetWriter();
+  if (*writer != nullptr) {
+    return Status::OK();
+  }
+  Status s = CreateWriterLocked(blob_file);
+  if (s.ok()) {
+    *writer = blob_file->GetWriter();
+  }
+  return s;
+}
+
+Status BlobDBImpl::CreateBlobFileAndWriter(
+    bool has_ttl, const ExpirationRange& expiration_range,
+    const std::string& reason, std::shared_ptr<BlobFile>* blob_file,
+    std::shared_ptr<BlobLogWriter>* writer) {
+  TEST_SYNC_POINT("BlobDBImpl::CreateBlobFileAndWriter");
+  assert(has_ttl == (expiration_range.first || expiration_range.second));
+  assert(blob_file);
+  assert(writer);
+
+  *blob_file = NewBlobFile(has_ttl, expiration_range, reason);
+  assert(*blob_file);
+
+  // file not visible, hence no lock
+  Status s = CheckOrCreateWriterLocked(*blob_file, writer);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to get writer for blob file: %s, error: %s",
+                    (*blob_file)->PathName().c_str(), s.ToString().c_str());
+    return s;
+  }
+
+  assert(*writer);
+
+  s = (*writer)->WriteHeader((*blob_file)->header_);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to write header to new blob file: %s"
+                    " status: '%s'",
+                    (*blob_file)->PathName().c_str(), s.ToString().c_str());
+    return s;
+  }
+
+  (*blob_file)->SetFileSize(BlobLogHeader::kSize);
+  total_blob_size_ += BlobLogHeader::kSize;
+
+  return s;
+}
+
+Status BlobDBImpl::SelectBlobFile(std::shared_ptr<BlobFile>* blob_file) {
+  assert(blob_file);
+
+  {
+    ReadLock rl(&mutex_);
+
+    if (open_non_ttl_file_) {
+      assert(!open_non_ttl_file_->Immutable());
+      *blob_file = open_non_ttl_file_;
+      return Status::OK();
+    }
+  }
+
+  // Check again
+  WriteLock wl(&mutex_);
+
+  if (open_non_ttl_file_) {
+    assert(!open_non_ttl_file_->Immutable());
+    *blob_file = open_non_ttl_file_;
+    return Status::OK();
+  }
+
+  std::shared_ptr<BlobLogWriter> writer;
+  const Status s = CreateBlobFileAndWriter(
+      /* has_ttl */ false, ExpirationRange(),
+      /* reason */ "SelectBlobFile", blob_file, &writer);
+  if (!s.ok()) {
+    return s;
+  }
+
+  RegisterBlobFile(*blob_file);
+  open_non_ttl_file_ = *blob_file;
+
+  return s;
+}
+
+Status BlobDBImpl::SelectBlobFileTTL(uint64_t expiration,
+                                     std::shared_ptr<BlobFile>* blob_file) {
+  assert(blob_file);
+  assert(expiration != kNoExpiration);
+
+  {
+    ReadLock rl(&mutex_);
+
+    *blob_file = FindBlobFileLocked(expiration);
+    if (*blob_file != nullptr) {
+      assert(!(*blob_file)->Immutable());
+      return Status::OK();
+    }
+  }
+
+  // Check again
+  WriteLock wl(&mutex_);
+
+  *blob_file = FindBlobFileLocked(expiration);
+  if (*blob_file != nullptr) {
+    assert(!(*blob_file)->Immutable());
+    return Status::OK();
+  }
+
+  const uint64_t exp_low =
+      (expiration / bdb_options_.ttl_range_secs) * bdb_options_.ttl_range_secs;
+  const uint64_t exp_high = exp_low + bdb_options_.ttl_range_secs;
+  const ExpirationRange expiration_range(exp_low, exp_high);
+
+  std::ostringstream oss;
+  oss << "SelectBlobFileTTL range: [" << exp_low << ',' << exp_high << ')';
+
+  std::shared_ptr<BlobLogWriter> writer;
+  const Status s =
+      CreateBlobFileAndWriter(/* has_ttl */ true, expiration_range,
+                              /* reason */ oss.str(), blob_file, &writer);
+  if (!s.ok()) {
+    return s;
+  }
+
+  RegisterBlobFile(*blob_file);
+  open_ttl_files_.insert(*blob_file);
+
+  return s;
+}
+
+class BlobDBImpl::BlobInserter : public WriteBatch::Handler {
+ private:
+  const WriteOptions& options_;
+  BlobDBImpl* blob_db_impl_;
+  uint32_t default_cf_id_;
+  WriteBatch batch_;
+
+ public:
+  BlobInserter(const WriteOptions& options, BlobDBImpl* blob_db_impl,
+               uint32_t default_cf_id)
+      : options_(options),
+        blob_db_impl_(blob_db_impl),
+        default_cf_id_(default_cf_id) {}
+
+  WriteBatch* batch() { return &batch_; }
+
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+               const Slice& value) override {
+    if (column_family_id != default_cf_id_) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    Status s = blob_db_impl_->PutBlobValue(options_, key, value, kNoExpiration,
+                                           &batch_);
+    return s;
+  }
+
+  Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+    if (column_family_id != default_cf_id_) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    Status s = WriteBatchInternal::Delete(&batch_, column_family_id, key);
+    return s;
+  }
+
+  virtual Status DeleteRange(uint32_t column_family_id, const Slice& begin_key,
+                             const Slice& end_key) {
+    if (column_family_id != default_cf_id_) {
+      return Status::NotSupported(
+          "Blob DB doesn't support non-default column family.");
+    }
+    Status s = WriteBatchInternal::DeleteRange(&batch_, column_family_id,
+                                               begin_key, end_key);
+    return s;
+  }
+
+  Status SingleDeleteCF(uint32_t /*column_family_id*/,
+                        const Slice& /*key*/) override {
+    return Status::NotSupported("Not supported operation in blob db.");
+  }
+
+  Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+                 const Slice& /*value*/) override {
+    return Status::NotSupported("Not supported operation in blob db.");
+  }
+
+  void LogData(const Slice& blob) override { batch_.PutLogData(blob); }
+};
+
+Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
+  StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
+  RecordTick(statistics_, BLOB_DB_NUM_WRITE);
+  uint32_t default_cf_id =
+      static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+          ->GetID();
+  Status s;
+  BlobInserter blob_inserter(options, this, default_cf_id);
+  {
+    // Release write_mutex_ before DB write to avoid race condition with
+    // flush begin listener, which also require write_mutex_ to sync
+    // blob files.
+    MutexLock l(&write_mutex_);
+    s = updates->Iterate(&blob_inserter);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  return db_->Write(options, blob_inserter.batch());
+}
+
+Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key,
+                       const Slice& value) {
+  return PutUntil(options, key, value, kNoExpiration);
+}
+
+Status BlobDBImpl::PutWithTTL(const WriteOptions& options, const Slice& key,
+                              const Slice& value, uint64_t ttl) {
+  uint64_t now = EpochNow();
+  uint64_t expiration = kNoExpiration - now > ttl ? now + ttl : kNoExpiration;
+  return PutUntil(options, key, value, expiration);
+}
+
+Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
+                            const Slice& value, uint64_t expiration) {
+  StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
+  RecordTick(statistics_, BLOB_DB_NUM_PUT);
+  Status s;
+  WriteBatch batch;
+  {
+    // Release write_mutex_ before DB write to avoid race condition with
+    // flush begin listener, which also require write_mutex_ to sync
+    // blob files.
+    MutexLock l(&write_mutex_);
+    s = PutBlobValue(options, key, value, expiration, &batch);
+  }
+  if (s.ok()) {
+    s = db_->Write(options, &batch);
+  }
+  return s;
+}
+
+Status BlobDBImpl::PutBlobValue(const WriteOptions& /*options*/,
+                                const Slice& key, const Slice& value,
+                                uint64_t expiration, WriteBatch* batch) {
+  write_mutex_.AssertHeld();
+  Status s;
+  std::string index_entry;
+  uint32_t column_family_id =
+      static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+          ->GetID();
+  if (value.size() < bdb_options_.min_blob_size) {
+    if (expiration == kNoExpiration) {
+      // Put as normal value
+      s = batch->Put(key, value);
+      RecordTick(statistics_, BLOB_DB_WRITE_INLINED);
+    } else {
+      // Inlined with TTL
+      BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value);
+      s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
+                                           index_entry);
+      RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
+    }
+  } else {
+    std::string compression_output;
+    Slice value_compressed = GetCompressedSlice(value, &compression_output);
+
+    std::string headerbuf;
+    BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value_compressed,
+                                       expiration);
+
+    // Check DB size limit before selecting blob file to
+    // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be
+    // done before calling SelectBlobFile().
+    s = CheckSizeAndEvictBlobFiles(headerbuf.size() + key.size() +
+                                   value_compressed.size());
+    if (!s.ok()) {
+      return s;
+    }
+
+    std::shared_ptr<BlobFile> blob_file;
+    if (expiration != kNoExpiration) {
+      s = SelectBlobFileTTL(expiration, &blob_file);
+    } else {
+      s = SelectBlobFile(&blob_file);
+    }
+    if (s.ok()) {
+      assert(blob_file != nullptr);
+      assert(blob_file->GetCompressionType() == bdb_options_.compression);
+      s = AppendBlob(blob_file, headerbuf, key, value_compressed, expiration,
+                     &index_entry);
+    }
+    if (s.ok()) {
+      if (expiration != kNoExpiration) {
+        WriteLock file_lock(&blob_file->mutex_);
+        blob_file->ExtendExpirationRange(expiration);
+      }
+      s = CloseBlobFileIfNeeded(blob_file);
+    }
+    if (s.ok()) {
+      s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
+                                           index_entry);
+    }
+    if (s.ok()) {
+      if (expiration == kNoExpiration) {
+        RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
+      } else {
+        RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
+      }
+    } else {
+      ROCKS_LOG_ERROR(
+          db_options_.info_log,
+          "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" ROCKSDB_PRIszt
+          " status: '%s' blob_file: '%s'",
+          blob_file->PathName().c_str(), key.ToString().c_str(), value.size(),
+          s.ToString().c_str(), blob_file->DumpState().c_str());
+    }
+  }
+
+  RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN);
+  RecordTick(statistics_, BLOB_DB_BYTES_WRITTEN, key.size() + value.size());
+  RecordInHistogram(statistics_, BLOB_DB_KEY_SIZE, key.size());
+  RecordInHistogram(statistics_, BLOB_DB_VALUE_SIZE, value.size());
+
+  return s;
+}
+
+Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
+                                     std::string* compression_output) const {
+  if (bdb_options_.compression == kNoCompression) {
+    return raw;
+  }
+  StopWatch compression_sw(clock_, statistics_, BLOB_DB_COMPRESSION_MICROS);
+  CompressionType type = bdb_options_.compression;
+  CompressionOptions opts;
+  CompressionContext context(type);
+  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type,
+                       0 /* sample_for_compression */);
+  CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat, false,
+                compression_output, nullptr, nullptr);
+  return *compression_output;
+}
+
+Status BlobDBImpl::DecompressSlice(const Slice& compressed_value,
+                                   CompressionType compression_type,
+                                   PinnableSlice* value_output) const {
+  assert(compression_type != kNoCompression);
+
+  BlockContents contents;
+  auto cfh = static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
+
+  {
+    StopWatch decompression_sw(clock_, statistics_,
+                               BLOB_DB_DECOMPRESSION_MICROS);
+    UncompressionContext context(compression_type);
+    UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
+                           compression_type);
+    Status s = UncompressBlockData(
+        info, compressed_value.data(), compressed_value.size(), &contents,
+        kBlockBasedTableVersionFormat, *(cfh->cfd()->ioptions()));
+    if (!s.ok()) {
+      return Status::Corruption("Unable to decompress blob.");
+    }
+  }
+
+  value_output->PinSelf(contents.data);
+
+  return Status::OK();
+}
+
+Status BlobDBImpl::CompactFiles(
+    const CompactionOptions& compact_options,
+    const std::vector<std::string>& input_file_names, const int output_level,
+    const int output_path_id, std::vector<std::string>* const output_file_names,
+    CompactionJobInfo* compaction_job_info) {
+  // Note: we need CompactionJobInfo to be able to track updates to the
+  // blob file <-> SST mappings, so we provide one if the user hasn't,
+  // assuming that GC is enabled.
+  CompactionJobInfo info{};
+  if (bdb_options_.enable_garbage_collection && !compaction_job_info) {
+    compaction_job_info = &info;
+  }
+
+  const Status s =
+      db_->CompactFiles(compact_options, input_file_names, output_level,
+                        output_path_id, output_file_names, compaction_job_info);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (bdb_options_.enable_garbage_collection) {
+    assert(compaction_job_info);
+    ProcessCompactionJobInfo(*compaction_job_info);
+  }
+
+  return s;
+}
+
+void BlobDBImpl::GetCompactionContextCommon(BlobCompactionContext* context) {
+  assert(context);
+
+  context->blob_db_impl = this;
+  context->next_file_number = next_file_number_.load();
+  context->current_blob_files.clear();
+  for (auto& p : blob_files_) {
+    context->current_blob_files.insert(p.first);
+  }
+  context->fifo_eviction_seq = fifo_eviction_seq_;
+  context->evict_expiration_up_to = evict_expiration_up_to_;
+}
+
+void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context) {
+  assert(context);
+
+  ReadLock l(&mutex_);
+  GetCompactionContextCommon(context);
+}
+
+void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context,
+                                      BlobCompactionContextGC* context_gc) {
+  assert(context);
+  assert(context_gc);
+
+  ReadLock l(&mutex_);
+  GetCompactionContextCommon(context);
+
+  if (!live_imm_non_ttl_blob_files_.empty()) {
+    auto it = live_imm_non_ttl_blob_files_.begin();
+    std::advance(it, bdb_options_.garbage_collection_cutoff *
+                         live_imm_non_ttl_blob_files_.size());
+    context_gc->cutoff_file_number = it != live_imm_non_ttl_blob_files_.end()
+                                         ? it->first
+                                         : std::numeric_limits<uint64_t>::max();
+  }
+}
+
+void BlobDBImpl::UpdateLiveSSTSize() {
+  uint64_t live_sst_size = 0;
+  bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
+  if (ok) {
+    live_sst_size_.store(live_sst_size);
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "Updated total SST file size: %" PRIu64 " bytes.",
+                   live_sst_size);
+  } else {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "Failed to update total SST file size after flush or compaction.");
+  }
+  {
+    // Trigger FIFO eviction if needed.
+    MutexLock l(&write_mutex_);
+    Status s = CheckSizeAndEvictBlobFiles(0, true /*force*/);
+    if (s.IsNoSpace()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "DB grow out-of-space after SST size updated. Current live"
+                     " SST size: %" PRIu64
+                     " , current blob files size: %" PRIu64 ".",
+                     live_sst_size_.load(), total_blob_size_.load());
+    }
+  }
+}
+
+Status BlobDBImpl::CheckSizeAndEvictBlobFiles(uint64_t blob_size,
+                                              bool force_evict) {
+  write_mutex_.AssertHeld();
+
+  uint64_t live_sst_size = live_sst_size_.load();
+  if (bdb_options_.max_db_size == 0 ||
+      live_sst_size + total_blob_size_.load() + blob_size <=
+          bdb_options_.max_db_size) {
+    return Status::OK();
+  }
+
+  if (bdb_options_.is_fifo == false ||
+      (!force_evict && live_sst_size + blob_size > bdb_options_.max_db_size)) {
+    // FIFO eviction is disabled, or no space to insert new blob even we evict
+    // all blob files.
+    return Status::NoSpace(
+        "Write failed, as writing it would exceed max_db_size limit.");
+  }
+
+  std::vector<std::shared_ptr<BlobFile>> candidate_files;
+  CopyBlobFiles(&candidate_files);
+  std::sort(candidate_files.begin(), candidate_files.end(),
+            BlobFileComparator());
+  fifo_eviction_seq_ = GetLatestSequenceNumber();
+
+  WriteLock l(&mutex_);
+
+  while (!candidate_files.empty() &&
+         live_sst_size + total_blob_size_.load() + blob_size >
+             bdb_options_.max_db_size) {
+    std::shared_ptr<BlobFile> blob_file = candidate_files.back();
+    candidate_files.pop_back();
+    WriteLock file_lock(&blob_file->mutex_);
+    if (blob_file->Obsolete()) {
+      // File already obsoleted by someone else.
+      assert(blob_file->Immutable());
+      continue;
+    }
+    // FIFO eviction can evict open blob files.
+    if (!blob_file->Immutable()) {
+      Status s = CloseBlobFile(blob_file);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    assert(blob_file->Immutable());
+    auto expiration_range = blob_file->GetExpirationRange();
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "Evict oldest blob file since DB out of space. Current "
+                   "live SST file size: %" PRIu64 ", total blob size: %" PRIu64
+                   ", max db size: %" PRIu64 ", evicted blob file #%" PRIu64
+                   ".",
+                   live_sst_size, total_blob_size_.load(),
+                   bdb_options_.max_db_size, blob_file->BlobFileNumber());
+    ObsoleteBlobFile(blob_file, fifo_eviction_seq_, true /*update_size*/);
+    evict_expiration_up_to_ = expiration_range.first;
+    RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED);
+    RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED,
+               blob_file->BlobCount());
+    RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED,
+               blob_file->GetFileSize());
+    TEST_SYNC_POINT("BlobDBImpl::EvictOldestBlobFile:Evicted");
+  }
+  if (live_sst_size + total_blob_size_.load() + blob_size >
+      bdb_options_.max_db_size) {
+    return Status::NoSpace(
+        "Write failed, as writing it would exceed max_db_size limit.");
+  }
+  return Status::OK();
+}
+
+Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
+                              const std::string& headerbuf, const Slice& key,
+                              const Slice& value, uint64_t expiration,
+                              std::string* index_entry) {
+  Status s;
+  uint64_t blob_offset = 0;
+  uint64_t key_offset = 0;
+  {
+    WriteLock lockbfile_w(&bfile->mutex_);
+    std::shared_ptr<BlobLogWriter> writer;
+    s = CheckOrCreateWriterLocked(bfile, &writer);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // write the blob to the blob log.
+    s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset,
+                                   &blob_offset);
+  }
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Invalid status in AppendBlob: %s status: '%s'",
+                    bfile->PathName().c_str(), s.ToString().c_str());
+    return s;
+  }
+
+  uint64_t size_put = headerbuf.size() + key.size() + value.size();
+  bfile->BlobRecordAdded(size_put);
+  total_blob_size_ += size_put;
+
+  if (expiration == kNoExpiration) {
+    BlobIndex::EncodeBlob(index_entry, bfile->BlobFileNumber(), blob_offset,
+                          value.size(), bdb_options_.compression);
+  } else {
+    BlobIndex::EncodeBlobTTL(index_entry, expiration, bfile->BlobFileNumber(),
+                             blob_offset, value.size(),
+                             bdb_options_.compression);
+  }
+
+  return s;
+}
+
+std::vector<Status> BlobDBImpl::MultiGet(const ReadOptions& read_options,
+                                         const std::vector<Slice>& keys,
+                                         std::vector<std::string>* values) {
+  StopWatch multiget_sw(clock_, statistics_, BLOB_DB_MULTIGET_MICROS);
+  RecordTick(statistics_, BLOB_DB_NUM_MULTIGET);
+  // Get a snapshot to avoid blob file get deleted between we
+  // fetch and index entry and reading from the file.
+  ReadOptions ro(read_options);
+  bool snapshot_created = SetSnapshotIfNeeded(&ro);
+
+  std::vector<Status> statuses;
+  statuses.reserve(keys.size());
+  values->clear();
+  values->reserve(keys.size());
+  PinnableSlice value;
+  for (size_t i = 0; i < keys.size(); i++) {
+    statuses.push_back(Get(ro, DefaultColumnFamily(), keys[i], &value));
+    values->push_back(value.ToString());
+    value.Reset();
+  }
+  if (snapshot_created) {
+    db_->ReleaseSnapshot(ro.snapshot);
+  }
+  return statuses;
+}
+
+bool BlobDBImpl::SetSnapshotIfNeeded(ReadOptions* read_options) {
+  assert(read_options != nullptr);
+  if (read_options->snapshot != nullptr) {
+    return false;
+  }
+  read_options->snapshot = db_->GetSnapshot();
+  return true;
+}
+
+Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
+                                PinnableSlice* value, uint64_t* expiration) {
+  assert(value);
+
+  BlobIndex blob_index;
+  Status s = blob_index.DecodeFrom(index_entry);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (blob_index.HasTTL() && blob_index.expiration() <= EpochNow()) {
+    return Status::NotFound("Key expired");
+  }
+
+  if (expiration != nullptr) {
+    if (blob_index.HasTTL()) {
+      *expiration = blob_index.expiration();
+    } else {
+      *expiration = kNoExpiration;
+    }
+  }
+
+  if (blob_index.IsInlined()) {
+    // TODO(yiwu): If index_entry is a PinnableSlice, we can also pin the same
+    // memory buffer to avoid extra copy.
+    value->PinSelf(blob_index.value());
+    return Status::OK();
+  }
+
+  CompressionType compression_type = kNoCompression;
+  s = GetRawBlobFromFile(key, blob_index.file_number(), blob_index.offset(),
+                         blob_index.size(), value, &compression_type);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (compression_type != kNoCompression) {
+    s = DecompressSlice(*value, compression_type, value);
+    if (!s.ok()) {
+      if (debug_level_ >= 2) {
+        ROCKS_LOG_ERROR(
+            db_options_.info_log,
+            "Uncompression error during blob read from file: %" PRIu64
+            " blob_offset: %" PRIu64 " blob_size: %" PRIu64
+            " key: %s status: '%s'",
+            blob_index.file_number(), blob_index.offset(), blob_index.size(),
+            key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
+      }
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
+                                      uint64_t offset, uint64_t size,
+                                      PinnableSlice* value,
+                                      CompressionType* compression_type) {
+  assert(value);
+  assert(compression_type);
+  assert(*compression_type == kNoCompression);
+
+  if (!size) {
+    value->PinSelf("");
+    return Status::OK();
+  }
+
+  // offset has to have certain min, as we will read CRC
+  // later from the Blob Header, which needs to be also a
+  // valid offset.
+  if (offset <
+      (BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key.size())) {
+    if (debug_level_ >= 2) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "Invalid blob index file_number: %" PRIu64
+                      " blob_offset: %" PRIu64 " blob_size: %" PRIu64
+                      " key: %s",
+                      file_number, offset, size,
+                      key.ToString(/* output_hex */ true).c_str());
+    }
+
+    return Status::NotFound("Invalid blob offset");
+  }
+
+  std::shared_ptr<BlobFile> blob_file;
+
+  {
+    ReadLock rl(&mutex_);
+    auto it = blob_files_.find(file_number);
+
+    // file was deleted
+    if (it == blob_files_.end()) {
+      return Status::NotFound("Blob Not Found as blob file missing");
+    }
+
+    blob_file = it->second;
+  }
+
+  *compression_type = blob_file->GetCompressionType();
+
+  // takes locks when called
+  std::shared_ptr<RandomAccessFileReader> reader;
+  Status s = GetBlobFileReader(blob_file, &reader);
+  if (!s.ok()) {
+    return s;
+  }
+
+  assert(offset >= key.size() + sizeof(uint32_t));
+  const uint64_t record_offset = offset - key.size() - sizeof(uint32_t);
+  const uint64_t record_size = sizeof(uint32_t) + key.size() + size;
+
+  // Allocate the buffer. This is safe in C++11
+  std::string buf;
+  AlignedBuf aligned_buf;
+
+  // A partial blob record contain checksum, key and value.
+  Slice blob_record;
+
+  {
+    StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
+    // TODO: rate limit old blob DB file reads.
+    if (reader->use_direct_io()) {
+      s = reader->Read(IOOptions(), record_offset,
+                       static_cast<size_t>(record_size), &blob_record, nullptr,
+                       &aligned_buf, Env::IO_TOTAL /* rate_limiter_priority */);
+    } else {
+      buf.reserve(static_cast<size_t>(record_size));
+      s = reader->Read(IOOptions(), record_offset,
+                       static_cast<size_t>(record_size), &blob_record, &buf[0],
+                       nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+    }
+    RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size());
+  }
+
+  if (!s.ok()) {
+    ROCKS_LOG_DEBUG(
+        db_options_.info_log,
+        "Failed to read blob from blob file %" PRIu64 ", blob_offset: %" PRIu64
+        ", blob_size: %" PRIu64 ", key_size: %" ROCKSDB_PRIszt ", status: '%s'",
+        file_number, offset, size, key.size(), s.ToString().c_str());
+    return s;
+  }
+
+  if (blob_record.size() != record_size) {
+    ROCKS_LOG_DEBUG(
+        db_options_.info_log,
+        "Failed to read blob from blob file %" PRIu64 ", blob_offset: %" PRIu64
+        ", blob_size: %" PRIu64 ", key_size: %" ROCKSDB_PRIszt
+        ", read %" ROCKSDB_PRIszt " bytes, expected %" PRIu64 " bytes",
+        file_number, offset, size, key.size(), blob_record.size(), record_size);
+
+    return Status::Corruption("Failed to retrieve blob from blob index.");
+  }
+
+  Slice crc_slice(blob_record.data(), sizeof(uint32_t));
+  Slice blob_value(blob_record.data() + sizeof(uint32_t) + key.size(),
+                   static_cast<size_t>(size));
+
+  uint32_t crc_exp = 0;
+  if (!GetFixed32(&crc_slice, &crc_exp)) {
+    ROCKS_LOG_DEBUG(
+        db_options_.info_log,
+        "Unable to decode CRC from blob file %" PRIu64 ", blob_offset: %" PRIu64
+        ", blob_size: %" PRIu64 ", key size: %" ROCKSDB_PRIszt ", status: '%s'",
+        file_number, offset, size, key.size(), s.ToString().c_str());
+    return Status::Corruption("Unable to decode checksum.");
+  }
+
+  uint32_t crc = crc32c::Value(blob_record.data() + sizeof(uint32_t),
+                               blob_record.size() - sizeof(uint32_t));
+  crc = crc32c::Mask(crc);  // Adjust for storage
+  if (crc != crc_exp) {
+    if (debug_level_ >= 2) {
+      ROCKS_LOG_ERROR(
+          db_options_.info_log,
+          "Blob crc mismatch file: %" PRIu64 " blob_offset: %" PRIu64
+          " blob_size: %" PRIu64 " key: %s status: '%s'",
+          file_number, offset, size,
+          key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
+    }
+
+    return Status::Corruption("Corruption. Blob CRC mismatch");
+  }
+
+  value->PinSelf(blob_value);
+
+  return Status::OK();
+}
+
+Status BlobDBImpl::Get(const ReadOptions& read_options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       PinnableSlice* value) {
+  return Get(read_options, column_family, key, value,
+             static_cast<uint64_t*>(nullptr) /*expiration*/);
+}
+
+Status BlobDBImpl::Get(const ReadOptions& read_options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       PinnableSlice* value, uint64_t* expiration) {
+  StopWatch get_sw(clock_, statistics_, BLOB_DB_GET_MICROS);
+  RecordTick(statistics_, BLOB_DB_NUM_GET);
+  return GetImpl(read_options, column_family, key, value, expiration);
+}
+
+Status BlobDBImpl::GetImpl(const ReadOptions& read_options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           PinnableSlice* value, uint64_t* expiration) {
+  if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
+    return Status::NotSupported(
+        "Blob DB doesn't support non-default column family.");
+  }
+  // Get a snapshot to avoid blob file get deleted between we
+  // fetch and index entry and reading from the file.
+  // TODO(yiwu): For Get() retry if file not found would be a simpler strategy.
+  ReadOptions ro(read_options);
+  bool snapshot_created = SetSnapshotIfNeeded(&ro);
+
+  PinnableSlice index_entry;
+  Status s;
+  bool is_blob_index = false;
+  DBImpl::GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = &index_entry;
+  get_impl_options.is_blob_index = &is_blob_index;
+  s = db_impl_->GetImpl(ro, key, get_impl_options);
+  if (expiration != nullptr) {
+    *expiration = kNoExpiration;
+  }
+  RecordTick(statistics_, BLOB_DB_NUM_KEYS_READ);
+  if (s.ok()) {
+    if (is_blob_index) {
+      s = GetBlobValue(key, index_entry, value, expiration);
+    } else {
+      // The index entry is the value itself in this case.
+      value->PinSelf(index_entry);
+    }
+    RecordTick(statistics_, BLOB_DB_BYTES_READ, value->size());
+  }
+  if (snapshot_created) {
+    db_->ReleaseSnapshot(ro.snapshot);
+  }
+  return s;
+}
+
+std::pair<bool, int64_t> BlobDBImpl::SanityCheck(bool aborted) {
+  if (aborted) {
+    return std::make_pair(false, -1);
+  }
+
+  ReadLock rl(&mutex_);
+
+  ROCKS_LOG_INFO(db_options_.info_log, "Starting Sanity Check");
+  ROCKS_LOG_INFO(db_options_.info_log, "Number of files %" ROCKSDB_PRIszt,
+                 blob_files_.size());
+  ROCKS_LOG_INFO(db_options_.info_log, "Number of open files %" ROCKSDB_PRIszt,
+                 open_ttl_files_.size());
+
+  for (const auto& blob_file : open_ttl_files_) {
+    (void)blob_file;
+    assert(!blob_file->Immutable());
+  }
+
+  for (const auto& pair : live_imm_non_ttl_blob_files_) {
+    const auto& blob_file = pair.second;
+    (void)blob_file;
+    assert(!blob_file->HasTTL());
+    assert(blob_file->Immutable());
+  }
+
+  uint64_t now = EpochNow();
+
+  for (auto blob_file_pair : blob_files_) {
+    auto blob_file = blob_file_pair.second;
+    std::ostringstream buf;
+
+    buf << "Blob file " << blob_file->BlobFileNumber() << ", size "
+        << blob_file->GetFileSize() << ", blob count " << blob_file->BlobCount()
+        << ", immutable " << blob_file->Immutable();
+
+    if (blob_file->HasTTL()) {
+      ExpirationRange expiration_range;
+      {
+        ReadLock file_lock(&blob_file->mutex_);
+        expiration_range = blob_file->GetExpirationRange();
+      }
+      buf << ", expiration range (" << expiration_range.first << ", "
+          << expiration_range.second << ")";
+
+      if (!blob_file->Obsolete()) {
+        buf << ", expire in " << (expiration_range.second - now) << "seconds";
+      }
+    }
+    if (blob_file->Obsolete()) {
+      buf << ", obsolete at " << blob_file->GetObsoleteSequence();
+    }
+    buf << ".";
+    ROCKS_LOG_INFO(db_options_.info_log, "%s", buf.str().c_str());
+  }
+
+  // reschedule
+  return std::make_pair(true, -1);
+}
+
+Status BlobDBImpl::CloseBlobFile(std::shared_ptr<BlobFile> bfile) {
+  TEST_SYNC_POINT("BlobDBImpl::CloseBlobFile");
+  assert(bfile);
+  assert(!bfile->Immutable());
+  assert(!bfile->Obsolete());
+
+  if (bfile->HasTTL() || bfile == open_non_ttl_file_) {
+    write_mutex_.AssertHeld();
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "Closing blob file %" PRIu64 ". Path: %s",
+                 bfile->BlobFileNumber(), bfile->PathName().c_str());
+
+  const SequenceNumber sequence = GetLatestSequenceNumber();
+
+  const Status s = bfile->WriteFooterAndCloseLocked(sequence);
+
+  if (s.ok()) {
+    total_blob_size_ += BlobLogFooter::kSize;
+  } else {
+    bfile->MarkImmutable(sequence);
+
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to close blob file %" PRIu64 "with error: %s",
+                    bfile->BlobFileNumber(), s.ToString().c_str());
+  }
+
+  if (bfile->HasTTL()) {
+    size_t erased __attribute__((__unused__));
+    erased = open_ttl_files_.erase(bfile);
+  } else {
+    if (bfile == open_non_ttl_file_) {
+      open_non_ttl_file_ = nullptr;
+    }
+
+    const uint64_t blob_file_number = bfile->BlobFileNumber();
+    auto it = live_imm_non_ttl_blob_files_.lower_bound(blob_file_number);
+    assert(it == live_imm_non_ttl_blob_files_.end() ||
+           it->first != blob_file_number);
+    live_imm_non_ttl_blob_files_.insert(
+        it, std::map<uint64_t, std::shared_ptr<BlobFile>>::value_type(
+                blob_file_number, bfile));
+  }
+
+  return s;
+}
+
+Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr<BlobFile>& bfile) {
+  write_mutex_.AssertHeld();
+
+  // atomic read
+  if (bfile->GetFileSize() < bdb_options_.blob_file_size) {
+    return Status::OK();
+  }
+
+  WriteLock lock(&mutex_);
+  WriteLock file_lock(&bfile->mutex_);
+
+  assert(!bfile->Obsolete() || bfile->Immutable());
+  if (bfile->Immutable()) {
+    return Status::OK();
+  }
+
+  return CloseBlobFile(bfile);
+}
+
+void BlobDBImpl::ObsoleteBlobFile(std::shared_ptr<BlobFile> blob_file,
+                                  SequenceNumber obsolete_seq,
+                                  bool update_size) {
+  assert(blob_file->Immutable());
+  assert(!blob_file->Obsolete());
+
+  // Should hold write lock of mutex_ or during DB open.
+  blob_file->MarkObsolete(obsolete_seq);
+  obsolete_files_.push_back(blob_file);
+  assert(total_blob_size_.load() >= blob_file->GetFileSize());
+  if (update_size) {
+    total_blob_size_ -= blob_file->GetFileSize();
+  }
+}
+
+bool BlobDBImpl::VisibleToActiveSnapshot(
+    const std::shared_ptr<BlobFile>& bfile) {
+  assert(bfile->Obsolete());
+
+  // We check whether the oldest snapshot is no less than the last sequence
+  // by the time the blob file become obsolete. If so, the blob file is not
+  // visible to all existing snapshots.
+  //
+  // If we keep track of the earliest sequence of the keys in the blob file,
+  // we could instead check if there's a snapshot falls in range
+  // [earliest_sequence, obsolete_sequence). But doing so will make the
+  // implementation more complicated.
+  SequenceNumber obsolete_sequence = bfile->GetObsoleteSequence();
+  SequenceNumber oldest_snapshot = kMaxSequenceNumber;
+  {
+    // Need to lock DBImpl mutex before access snapshot list.
+    InstrumentedMutexLock l(db_impl_->mutex());
+    auto& snapshots = db_impl_->snapshots();
+    if (!snapshots.empty()) {
+      oldest_snapshot = snapshots.oldest()->GetSequenceNumber();
+    }
+  }
+  bool visible = oldest_snapshot < obsolete_sequence;
+  if (visible) {
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "Obsolete blob file %" PRIu64 " (obsolete at %" PRIu64
+                   ") visible to oldest snapshot %" PRIu64 ".",
+                   bfile->BlobFileNumber(), obsolete_sequence, oldest_snapshot);
+  }
+  return visible;
+}
+
+std::pair<bool, int64_t> BlobDBImpl::EvictExpiredFiles(bool aborted) {
+  if (aborted) {
+    return std::make_pair(false, -1);
+  }
+
+  TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:0");
+  TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:1");
+
+  std::vector<std::shared_ptr<BlobFile>> process_files;
+  uint64_t now = EpochNow();
+  {
+    ReadLock rl(&mutex_);
+    for (auto p : blob_files_) {
+      auto& blob_file = p.second;
+      ReadLock file_lock(&blob_file->mutex_);
+      if (blob_file->HasTTL() && !blob_file->Obsolete() &&
+          blob_file->GetExpirationRange().second <= now) {
+        process_files.push_back(blob_file);
+      }
+    }
+  }
+
+  TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:2");
+  TEST_SYNC_POINT("BlobDBImpl::EvictExpiredFiles:3");
+  TEST_SYNC_POINT_CALLBACK("BlobDBImpl::EvictExpiredFiles:cb", nullptr);
+
+  SequenceNumber seq = GetLatestSequenceNumber();
+  {
+    MutexLock l(&write_mutex_);
+    WriteLock lock(&mutex_);
+    for (auto& blob_file : process_files) {
+      WriteLock file_lock(&blob_file->mutex_);
+
+      // Need to double check if the file is obsolete.
+      if (blob_file->Obsolete()) {
+        assert(blob_file->Immutable());
+        continue;
+      }
+
+      if (!blob_file->Immutable()) {
+        CloseBlobFile(blob_file);
+      }
+
+      assert(blob_file->Immutable());
+
+      ObsoleteBlobFile(blob_file, seq, true /*update_size*/);
+    }
+  }
+
+  return std::make_pair(true, -1);
+}
+
+Status BlobDBImpl::SyncBlobFiles() {
+  MutexLock l(&write_mutex_);
+
+  std::vector<std::shared_ptr<BlobFile>> process_files;
+  {
+    ReadLock rl(&mutex_);
+    for (auto fitr : open_ttl_files_) {
+      process_files.push_back(fitr);
+    }
+    if (open_non_ttl_file_ != nullptr) {
+      process_files.push_back(open_non_ttl_file_);
+    }
+  }
+
+  Status s;
+  for (auto& blob_file : process_files) {
+    s = blob_file->Fsync();
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "Failed to sync blob file %" PRIu64 ", status: %s",
+                      blob_file->BlobFileNumber(), s.ToString().c_str());
+      return s;
+    }
+  }
+
+  s = dir_ent_->FsyncWithDirOptions(IOOptions(), nullptr, DirFsyncOptions());
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "Failed to sync blob directory, status: %s",
+                    s.ToString().c_str());
+  }
+  return s;
+}
+
+std::pair<bool, int64_t> BlobDBImpl::ReclaimOpenFiles(bool aborted) {
+  if (aborted) return std::make_pair(false, -1);
+
+  if (open_file_count_.load() < kOpenFilesTrigger) {
+    return std::make_pair(true, -1);
+  }
+
+  // in the future, we should sort by last_access_
+  // instead of closing every file
+  ReadLock rl(&mutex_);
+  for (auto const& ent : blob_files_) {
+    auto bfile = ent.second;
+    if (bfile->last_access_.load() == -1) continue;
+
+    WriteLock lockbfile_w(&bfile->mutex_);
+    CloseRandomAccessLocked(bfile);
+  }
+
+  return std::make_pair(true, -1);
+}
+
+std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
+  if (aborted) {
+    return std::make_pair(false, -1);
+  }
+
+  MutexLock delete_file_lock(&delete_file_mutex_);
+  if (disable_file_deletions_ > 0) {
+    return std::make_pair(true, -1);
+  }
+
+  std::list<std::shared_ptr<BlobFile>> tobsolete;
+  {
+    WriteLock wl(&mutex_);
+    if (obsolete_files_.empty()) {
+      return std::make_pair(true, -1);
+    }
+    tobsolete.swap(obsolete_files_);
+  }
+
+  bool file_deleted = false;
+  for (auto iter = tobsolete.begin(); iter != tobsolete.end();) {
+    auto bfile = *iter;
+    {
+      ReadLock lockbfile_r(&bfile->mutex_);
+      if (VisibleToActiveSnapshot(bfile)) {
+        ROCKS_LOG_INFO(db_options_.info_log,
+                       "Could not delete file due to snapshot failure %s",
+                       bfile->PathName().c_str());
+        ++iter;
+        continue;
+      }
+    }
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "Will delete file due to snapshot success %s",
+                   bfile->PathName().c_str());
+
+    {
+      WriteLock wl(&mutex_);
+      blob_files_.erase(bfile->BlobFileNumber());
+    }
+
+    Status s = DeleteDBFile(&(db_impl_->immutable_db_options()),
+                            bfile->PathName(), blob_dir_, true,
+                            /*force_fg=*/false);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "File failed to be deleted as obsolete %s",
+                      bfile->PathName().c_str());
+      ++iter;
+      continue;
+    }
+
+    file_deleted = true;
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "File deleted as obsolete from blob dir %s",
+                   bfile->PathName().c_str());
+
+    iter = tobsolete.erase(iter);
+  }
+
+  // directory change. Fsync
+  if (file_deleted) {
+    Status s = dir_ent_->FsyncWithDirOptions(
+        IOOptions(), nullptr,
+        DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted));
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log, "Failed to sync dir %s: %s",
+                      blob_dir_.c_str(), s.ToString().c_str());
+    }
+  }
+
+  // put files back into obsolete if for some reason, delete failed
+  if (!tobsolete.empty()) {
+    WriteLock wl(&mutex_);
+    for (auto bfile : tobsolete) {
+      blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile));
+      obsolete_files_.push_front(bfile);
+    }
+  }
+
+  return std::make_pair(!aborted, -1);
+}
+
+void BlobDBImpl::CopyBlobFiles(
+    std::vector<std::shared_ptr<BlobFile>>* bfiles_copy) {
+  ReadLock rl(&mutex_);
+  for (auto const& p : blob_files_) {
+    bfiles_copy->push_back(p.second);
+  }
+}
+
+Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) {
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+          ->cfd();
+  // Get a snapshot to avoid blob file get deleted between we
+  // fetch and index entry and reading from the file.
+  ManagedSnapshot* own_snapshot = nullptr;
+  const Snapshot* snapshot = read_options.snapshot;
+  if (snapshot == nullptr) {
+    own_snapshot = new ManagedSnapshot(db_);
+    snapshot = own_snapshot->snapshot();
+  }
+  auto* iter = db_impl_->NewIteratorImpl(
+      read_options, cfd, snapshot->GetSequenceNumber(),
+      nullptr /*read_callback*/, true /*expose_blob_index*/);
+  return new BlobDBIterator(own_snapshot, iter, this, clock_, statistics_);
+}
+
+Status DestroyBlobDB(const std::string& dbname, const Options& options,
+                     const BlobDBOptions& bdb_options) {
+  const ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+  Env* env = soptions.env;
+
+  Status status;
+  std::string blobdir;
+  blobdir = (bdb_options.path_relative) ? dbname + "/" + bdb_options.blob_dir
+                                        : bdb_options.blob_dir;
+
+  std::vector<std::string> filenames;
+  if (env->GetChildren(blobdir, &filenames).ok()) {
+    for (const auto& f : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(f, &number, &type) && type == kBlobFile) {
+        Status del = DeleteDBFile(&soptions, blobdir + "/" + f, blobdir, true,
+                                  /*force_fg=*/false);
+        if (status.ok() && !del.ok()) {
+          status = del;
+        }
+      }
+    }
+    // TODO: What to do if we cannot delete the directory?
+    env->DeleteDir(blobdir).PermitUncheckedError();
+  }
+  Status destroy = DestroyDB(dbname, options);
+  if (status.ok() && !destroy.ok()) {
+    status = destroy;
+  }
+
+  return status;
+}
+
+#ifndef NDEBUG
+Status BlobDBImpl::TEST_GetBlobValue(const Slice& key, const Slice& index_entry,
+                                     PinnableSlice* value) {
+  return GetBlobValue(key, index_entry, value);
+}
+
+void BlobDBImpl::TEST_AddDummyBlobFile(uint64_t blob_file_number,
+                                       SequenceNumber immutable_sequence) {
+  auto blob_file = std::make_shared<BlobFile>(this, blob_dir_, blob_file_number,
+                                              db_options_.info_log.get());
+  blob_file->MarkImmutable(immutable_sequence);
+
+  blob_files_[blob_file_number] = blob_file;
+  live_imm_non_ttl_blob_files_[blob_file_number] = blob_file;
+}
+
+std::vector<std::shared_ptr<BlobFile>> BlobDBImpl::TEST_GetBlobFiles() const {
+  ReadLock l(&mutex_);
+  std::vector<std::shared_ptr<BlobFile>> blob_files;
+  for (auto& p : blob_files_) {
+    blob_files.emplace_back(p.second);
+  }
+  return blob_files;
+}
+
+std::vector<std::shared_ptr<BlobFile>> BlobDBImpl::TEST_GetLiveImmNonTTLFiles()
+    const {
+  ReadLock l(&mutex_);
+  std::vector<std::shared_ptr<BlobFile>> live_imm_non_ttl_files;
+  for (const auto& pair : live_imm_non_ttl_blob_files_) {
+    live_imm_non_ttl_files.emplace_back(pair.second);
+  }
+  return live_imm_non_ttl_files;
+}
+
+std::vector<std::shared_ptr<BlobFile>> BlobDBImpl::TEST_GetObsoleteFiles()
+    const {
+  ReadLock l(&mutex_);
+  std::vector<std::shared_ptr<BlobFile>> obsolete_files;
+  for (auto& bfile : obsolete_files_) {
+    obsolete_files.emplace_back(bfile);
+  }
+  return obsolete_files;
+}
+
+void BlobDBImpl::TEST_DeleteObsoleteFiles() {
+  DeleteObsoleteFiles(false /*abort*/);
+}
+
+Status BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr<BlobFile>& bfile) {
+  MutexLock l(&write_mutex_);
+  WriteLock lock(&mutex_);
+  WriteLock file_lock(&bfile->mutex_);
+
+  return CloseBlobFile(bfile);
+}
+
+void BlobDBImpl::TEST_ObsoleteBlobFile(std::shared_ptr<BlobFile>& blob_file,
+                                       SequenceNumber obsolete_seq,
+                                       bool update_size) {
+  return ObsoleteBlobFile(blob_file, obsolete_seq, update_size);
+}
+
+void BlobDBImpl::TEST_EvictExpiredFiles() {
+  EvictExpiredFiles(false /*abort*/);
+}
+
+uint64_t BlobDBImpl::TEST_live_sst_size() { return live_sst_size_.load(); }
+
+void BlobDBImpl::TEST_InitializeBlobFileToSstMapping(
+    const std::vector<LiveFileMetaData>& live_files) {
+  InitializeBlobFileToSstMapping(live_files);
+}
+
+void BlobDBImpl::TEST_ProcessFlushJobInfo(const FlushJobInfo& info) {
+  ProcessFlushJobInfo(info);
+}
+
+void BlobDBImpl::TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info) {
+  ProcessCompactionJobInfo(info);
+}
+
+#endif  //  !NDEBUG
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_impl.h b/src/rocksdb/utilities/blob_db/blob_db_impl.h
new file mode 100644
index 000000000..0b4dbf5e5
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_impl.h
@@ -0,0 +1,503 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <condition_variable>
+#include <limits>
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/db_iter.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/wal_filter.h"
+#include "util/mutexlock.h"
+#include "util/timer_queue.h"
+#include "utilities/blob_db/blob_db.h"
+#include "utilities/blob_db/blob_file.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class ColumnFamilyHandle;
+class ColumnFamilyData;
+class SystemClock;
+
+struct FlushJobInfo;
+
+namespace blob_db {
+
+struct BlobCompactionContext;
+struct BlobCompactionContextGC;
+class BlobDBImpl;
+class BlobFile;
+
+// Comparator to sort "TTL" aware Blob files based on the lower value of
+// TTL range.
+struct BlobFileComparatorTTL {
+  bool operator()(const std::shared_ptr<BlobFile>& lhs,
+                  const std::shared_ptr<BlobFile>& rhs) const;
+};
+
+struct BlobFileComparator {
+  bool operator()(const std::shared_ptr<BlobFile>& lhs,
+                  const std::shared_ptr<BlobFile>& rhs) const;
+};
+
+/**
+ * The implementation class for BlobDB. It manages the blob logs, which
+ * are sequentially written files. Blob logs can be of the TTL or non-TTL
+ * varieties; the former are cleaned up when they expire, while the latter
+ * are (optionally) garbage collected.
+ */
+class BlobDBImpl : public BlobDB {
+  friend class BlobFile;
+  friend class BlobDBIterator;
+  friend class BlobDBListener;
+  friend class BlobDBListenerGC;
+  friend class BlobIndexCompactionFilterBase;
+  friend class BlobIndexCompactionFilterGC;
+
+ public:
+  // deletions check period
+  static constexpr uint32_t kDeleteCheckPeriodMillisecs = 2 * 1000;
+
+  // sanity check task
+  static constexpr uint32_t kSanityCheckPeriodMillisecs = 20 * 60 * 1000;
+
+  // how many random access open files can we tolerate
+  static constexpr uint32_t kOpenFilesTrigger = 100;
+
+  // how often to schedule reclaim open files.
+  static constexpr uint32_t kReclaimOpenFilesPeriodMillisecs = 1 * 1000;
+
+  // how often to schedule delete obs files periods
+  static constexpr uint32_t kDeleteObsoleteFilesPeriodMillisecs = 10 * 1000;
+
+  // how often to schedule expired files eviction.
+  static constexpr uint32_t kEvictExpiredFilesPeriodMillisecs = 10 * 1000;
+
+  // when should oldest file be evicted:
+  // on reaching 90% of blob_dir_size
+  static constexpr double kEvictOldestFileAtSize = 0.9;
+
+  using BlobDB::Put;
+  Status Put(const WriteOptions& options, const Slice& key,
+             const Slice& value) override;
+
+  using BlobDB::Get;
+  Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value) override;
+
+  Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value,
+             uint64_t* expiration) override;
+
+  using BlobDB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& read_options) override;
+
+  using BlobDB::NewIterators;
+  virtual Status NewIterators(
+      const ReadOptions& /*read_options*/,
+      const std::vector<ColumnFamilyHandle*>& /*column_families*/,
+      std::vector<Iterator*>* /*iterators*/) override {
+    return Status::NotSupported("Not implemented");
+  }
+
+  using BlobDB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& read_options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  using BlobDB::Write;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  virtual Status Close() override;
+
+  using BlobDB::PutWithTTL;
+  Status PutWithTTL(const WriteOptions& options, const Slice& key,
+                    const Slice& value, uint64_t ttl) override;
+
+  using BlobDB::PutUntil;
+  Status PutUntil(const WriteOptions& options, const Slice& key,
+                  const Slice& value, uint64_t expiration) override;
+
+  using BlobDB::CompactFiles;
+  Status CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<std::string>& input_file_names, const int output_level,
+      const int output_path_id = -1,
+      std::vector<std::string>* const output_file_names = nullptr,
+      CompactionJobInfo* compaction_job_info = nullptr) override;
+
+  BlobDBOptions GetBlobDBOptions() const override;
+
+  BlobDBImpl(const std::string& dbname, const BlobDBOptions& bdb_options,
+             const DBOptions& db_options,
+             const ColumnFamilyOptions& cf_options);
+
+  virtual Status DisableFileDeletions() override;
+
+  virtual Status EnableFileDeletions(bool force) override;
+
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) override;
+  virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>*) override;
+
+  ~BlobDBImpl();
+
+  Status Open(std::vector<ColumnFamilyHandle*>* handles);
+
+  Status SyncBlobFiles() override;
+
+  // Common part of the two GetCompactionContext methods below.
+  // REQUIRES: read lock on mutex_
+  void GetCompactionContextCommon(BlobCompactionContext* context);
+
+  void GetCompactionContext(BlobCompactionContext* context);
+  void GetCompactionContext(BlobCompactionContext* context,
+                            BlobCompactionContextGC* context_gc);
+
+#ifndef NDEBUG
+  Status TEST_GetBlobValue(const Slice& key, const Slice& index_entry,
+                           PinnableSlice* value);
+
+  void TEST_AddDummyBlobFile(uint64_t blob_file_number,
+                             SequenceNumber immutable_sequence);
+
+  std::vector<std::shared_ptr<BlobFile>> TEST_GetBlobFiles() const;
+
+  std::vector<std::shared_ptr<BlobFile>> TEST_GetLiveImmNonTTLFiles() const;
+
+  std::vector<std::shared_ptr<BlobFile>> TEST_GetObsoleteFiles() const;
+
+  Status TEST_CloseBlobFile(std::shared_ptr<BlobFile>& bfile);
+
+  void TEST_ObsoleteBlobFile(std::shared_ptr<BlobFile>& blob_file,
+                             SequenceNumber obsolete_seq = 0,
+                             bool update_size = true);
+
+  void TEST_EvictExpiredFiles();
+
+  void TEST_DeleteObsoleteFiles();
+
+  uint64_t TEST_live_sst_size();
+
+  const std::string& TEST_blob_dir() const { return blob_dir_; }
+
+  void TEST_InitializeBlobFileToSstMapping(
+      const std::vector<LiveFileMetaData>& live_files);
+
+  void TEST_ProcessFlushJobInfo(const FlushJobInfo& info);
+
+  void TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info);
+
+#endif  //  !NDEBUG
+
+ private:
+  class BlobInserter;
+
+  // Create a snapshot if there isn't one in read options.
+  // Return true if a snapshot is created.
+  bool SetSnapshotIfNeeded(ReadOptions* read_options);
+
+  Status GetImpl(const ReadOptions& read_options,
+                 ColumnFamilyHandle* column_family, const Slice& key,
+                 PinnableSlice* value, uint64_t* expiration = nullptr);
+
+  Status GetBlobValue(const Slice& key, const Slice& index_entry,
+                      PinnableSlice* value, uint64_t* expiration = nullptr);
+
+  Status GetRawBlobFromFile(const Slice& key, uint64_t file_number,
+                            uint64_t offset, uint64_t size,
+                            PinnableSlice* value,
+                            CompressionType* compression_type);
+
+  Slice GetCompressedSlice(const Slice& raw,
+                           std::string* compression_output) const;
+
+  Status DecompressSlice(const Slice& compressed_value,
+                         CompressionType compression_type,
+                         PinnableSlice* value_output) const;
+
+  // Close a file by appending a footer, and removes file from open files list.
+  // REQUIRES: lock held on write_mutex_, write lock held on both the db mutex_
+  // and the blob file's mutex_. If called on a blob file which is visible only
+  // to a single thread (like in the case of new files written during
+  // compaction/GC), the locks on write_mutex_ and the blob file's mutex_ can be
+  // avoided.
+  Status CloseBlobFile(std::shared_ptr<BlobFile> bfile);
+
+  // Close a file if its size exceeds blob_file_size
+  // REQUIRES: lock held on write_mutex_.
+  Status CloseBlobFileIfNeeded(std::shared_ptr<BlobFile>& bfile);
+
+  // Mark file as obsolete and move the file to obsolete file list.
+  //
+  // REQUIRED: hold write lock of mutex_ or during DB open.
+  void ObsoleteBlobFile(std::shared_ptr<BlobFile> blob_file,
+                        SequenceNumber obsolete_seq, bool update_size);
+
+  Status PutBlobValue(const WriteOptions& options, const Slice& key,
+                      const Slice& value, uint64_t expiration,
+                      WriteBatch* batch);
+
+  Status AppendBlob(const std::shared_ptr<BlobFile>& bfile,
+                    const std::string& headerbuf, const Slice& key,
+                    const Slice& value, uint64_t expiration,
+                    std::string* index_entry);
+
+  // Create a new blob file and associated writer.
+  Status CreateBlobFileAndWriter(bool has_ttl,
+                                 const ExpirationRange& expiration_range,
+                                 const std::string& reason,
+                                 std::shared_ptr<BlobFile>* blob_file,
+                                 std::shared_ptr<BlobLogWriter>* writer);
+
+  // Get the open non-TTL blob log file, or create a new one if no such file
+  // exists.
+  Status SelectBlobFile(std::shared_ptr<BlobFile>* blob_file);
+
+  // Get the open TTL blob log file for a certain expiration, or create a new
+  // one if no such file exists.
+  Status SelectBlobFileTTL(uint64_t expiration,
+                           std::shared_ptr<BlobFile>* blob_file);
+
+  std::shared_ptr<BlobFile> FindBlobFileLocked(uint64_t expiration) const;
+
+  // periodic sanity check. Bunch of checks
+  std::pair<bool, int64_t> SanityCheck(bool aborted);
+
+  // Delete files that have been marked obsolete (either because of TTL
+  // or GC). Check whether any snapshots exist which refer to the same.
+  std::pair<bool, int64_t> DeleteObsoleteFiles(bool aborted);
+
+  // periodically check if open blob files and their TTL's has expired
+  // if expired, close the sequential writer and make the file immutable
+  std::pair<bool, int64_t> EvictExpiredFiles(bool aborted);
+
+  // if the number of open files, approaches ULIMIT's this
+  // task will close random readers, which are kept around for
+  // efficiency
+  std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
+
+  std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
+
+  // Adds the background tasks to the timer queue
+  void StartBackgroundTasks();
+
+  // add a new Blob File
+  std::shared_ptr<BlobFile> NewBlobFile(bool has_ttl,
+                                        const ExpirationRange& expiration_range,
+                                        const std::string& reason);
+
+  // Register a new blob file.
+  // REQUIRES: write lock on mutex_.
+  void RegisterBlobFile(std::shared_ptr<BlobFile> blob_file);
+
+  // collect all the blob log files from the blob directory
+  Status GetAllBlobFiles(std::set<uint64_t>* file_numbers);
+
+  // Open all blob files found in blob_dir.
+  Status OpenAllBlobFiles();
+
+  // Link an SST to a blob file. Comes in locking and non-locking varieties
+  // (the latter is used during Open).
+  template <typename Linker>
+  void LinkSstToBlobFileImpl(uint64_t sst_file_number,
+                             uint64_t blob_file_number, Linker linker);
+
+  void LinkSstToBlobFile(uint64_t sst_file_number, uint64_t blob_file_number);
+
+  void LinkSstToBlobFileNoLock(uint64_t sst_file_number,
+                               uint64_t blob_file_number);
+
+  // Unlink an SST from a blob file.
+  void UnlinkSstFromBlobFile(uint64_t sst_file_number,
+                             uint64_t blob_file_number);
+
+  // Initialize the mapping between blob files and SSTs during Open.
+  void InitializeBlobFileToSstMapping(
+      const std::vector<LiveFileMetaData>& live_files);
+
+  // Update the mapping between blob files and SSTs after a flush and mark
+  // any unneeded blob files obsolete.
+  void ProcessFlushJobInfo(const FlushJobInfo& info);
+
+  // Update the mapping between blob files and SSTs after a compaction and
+  // mark any unneeded blob files obsolete.
+  void ProcessCompactionJobInfo(const CompactionJobInfo& info);
+
+  // Mark an immutable non-TTL blob file obsolete assuming it has no more SSTs
+  // linked to it, and all memtables from before the blob file became immutable
+  // have been flushed. Note: should only be called if the condition holds for
+  // all lower-numbered non-TTL blob files as well.
+  bool MarkBlobFileObsoleteIfNeeded(const std::shared_ptr<BlobFile>& blob_file,
+                                    SequenceNumber obsolete_seq);
+
+  // Mark all immutable non-TTL blob files that aren't needed by any SSTs as
+  // obsolete. Comes in two varieties; the version used during Open need not
+  // worry about locking or snapshots.
+  template <class Functor>
+  void MarkUnreferencedBlobFilesObsoleteImpl(Functor mark_if_needed);
+
+  void MarkUnreferencedBlobFilesObsolete();
+  void MarkUnreferencedBlobFilesObsoleteDuringOpen();
+
+  void UpdateLiveSSTSize();
+
+  Status GetBlobFileReader(const std::shared_ptr<BlobFile>& blob_file,
+                           std::shared_ptr<RandomAccessFileReader>* reader);
+
+  // hold write mutex on file and call.
+  // Close the above Random Access reader
+  void CloseRandomAccessLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // hold write mutex on file and call
+  // creates a sequential (append) writer for this blobfile
+  Status CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // returns a BlobLogWriter object for the file. If writer is not
+  // already present, creates one. Needs Write Mutex to be held
+  Status CheckOrCreateWriterLocked(const std::shared_ptr<BlobFile>& blob_file,
+                                   std::shared_ptr<BlobLogWriter>* writer);
+
+  // checks if there is no snapshot which is referencing the
+  // blobs
+  bool VisibleToActiveSnapshot(const std::shared_ptr<BlobFile>& file);
+  bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy);
+
+  uint64_t EpochNow() { return clock_->NowMicros() / 1000000; }
+
+  // Check if inserting a new blob will make DB grow out of space.
+  // If is_fifo = true, FIFO eviction will be triggered to make room for the
+  // new blob. If force_evict = true, FIFO eviction will evict blob files
+  // even eviction will not make enough room for the new blob.
+  Status CheckSizeAndEvictBlobFiles(uint64_t blob_size,
+                                    bool force_evict = false);
+
+  // name of the database directory
+  std::string dbname_;
+
+  // the base DB
+  DBImpl* db_impl_;
+  Env* env_;
+  SystemClock* clock_;
+  // the options that govern the behavior of Blob Storage
+  BlobDBOptions bdb_options_;
+  DBOptions db_options_;
+  ColumnFamilyOptions cf_options_;
+  FileOptions file_options_;
+
+  // Raw pointer of statistic. db_options_ has a std::shared_ptr to hold
+  // ownership.
+  Statistics* statistics_;
+
+  // by default this is "blob_dir" under dbname_
+  // but can be configured
+  std::string blob_dir_;
+
+  // pointer to directory
+  std::unique_ptr<FSDirectory> dir_ent_;
+
+  // Read Write Mutex, which protects all the data structures
+  // HEAVILY TRAFFICKED
+  mutable port::RWMutex mutex_;
+
+  // Writers has to hold write_mutex_ before writing.
+  mutable port::Mutex write_mutex_;
+
+  // counter for blob file number
+  std::atomic<uint64_t> next_file_number_;
+
+  // entire metadata of all the BLOB files memory
+  std::map<uint64_t, std::shared_ptr<BlobFile>> blob_files_;
+
+  // All live immutable non-TTL blob files.
+  std::map<uint64_t, std::shared_ptr<BlobFile>> live_imm_non_ttl_blob_files_;
+
+  // The largest sequence number that has been flushed.
+  SequenceNumber flush_sequence_;
+
+  // opened non-TTL blob file.
+  std::shared_ptr<BlobFile> open_non_ttl_file_;
+
+  // all the blob files which are currently being appended to based
+  // on variety of incoming TTL's
+  std::set<std::shared_ptr<BlobFile>, BlobFileComparatorTTL> open_ttl_files_;
+
+  // Flag to check whether Close() has been called on this DB
+  bool closed_;
+
+  // timer based queue to execute tasks
+  TimerQueue tqueue_;
+
+  // number of files opened for random access/GET
+  // counter is used to monitor and close excess RA files.
+  std::atomic<uint32_t> open_file_count_;
+
+  // Total size of all live blob files (i.e. exclude obsolete files).
+  std::atomic<uint64_t> total_blob_size_;
+
+  // total size of SST files.
+  std::atomic<uint64_t> live_sst_size_;
+
+  // Latest FIFO eviction timestamp
+  //
+  // REQUIRES: access with metex_ lock held.
+  uint64_t fifo_eviction_seq_;
+
+  // The expiration up to which latest FIFO eviction evicts.
+  //
+  // REQUIRES: access with metex_ lock held.
+  uint64_t evict_expiration_up_to_;
+
+  std::list<std::shared_ptr<BlobFile>> obsolete_files_;
+
+  // DeleteObsoleteFiles, DiableFileDeletions and EnableFileDeletions block
+  // on the mutex to avoid contention.
+  //
+  // While DeleteObsoleteFiles hold both mutex_ and delete_file_mutex_, note
+  // the difference. mutex_ only needs to be held when access the
+  // data-structure, and delete_file_mutex_ needs to be held the whole time
+  // during DeleteObsoleteFiles to avoid being run simultaneously with
+  // DisableFileDeletions.
+  //
+  // If both of mutex_ and delete_file_mutex_ needs to be held, it is adviced
+  // to hold delete_file_mutex_ first to avoid deadlock.
+  mutable port::Mutex delete_file_mutex_;
+
+  // Each call of DisableFileDeletions will increase disable_file_deletion_
+  // by 1. EnableFileDeletions will either decrease the count by 1 or reset
+  // it to zeor, depending on the force flag.
+  //
+  // REQUIRES: access with delete_file_mutex_ held.
+  int disable_file_deletions_ = 0;
+
+  uint32_t debug_level_;
+};
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc b/src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc
new file mode 100644
index 000000000..87e3f33cc
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc
@@ -0,0 +1,113 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+// BlobDBImpl methods to get snapshot of files, e.g. for replication.
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+Status BlobDBImpl::DisableFileDeletions() {
+  // Disable base DB file deletions.
+  Status s = db_impl_->DisableFileDeletions();
+  if (!s.ok()) {
+    return s;
+  }
+
+  int count = 0;
+  {
+    // Hold delete_file_mutex_ to make sure no DeleteObsoleteFiles job
+    // is running.
+    MutexLock l(&delete_file_mutex_);
+    count = ++disable_file_deletions_;
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "Disabled blob file deletions. count: %d", count);
+  return Status::OK();
+}
+
+Status BlobDBImpl::EnableFileDeletions(bool force) {
+  // Enable base DB file deletions.
+  Status s = db_impl_->EnableFileDeletions(force);
+  if (!s.ok()) {
+    return s;
+  }
+
+  int count = 0;
+  {
+    MutexLock l(&delete_file_mutex_);
+    if (force) {
+      disable_file_deletions_ = 0;
+    } else if (disable_file_deletions_ > 0) {
+      count = --disable_file_deletions_;
+    }
+    assert(count >= 0);
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log, "Enabled blob file deletions. count: %d",
+                 count);
+  // Consider trigger DeleteobsoleteFiles once after re-enabled, if we are to
+  // make DeleteobsoleteFiles re-run interval configuration.
+  return Status::OK();
+}
+
+Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                                uint64_t* manifest_file_size,
+                                bool flush_memtable) {
+  if (!bdb_options_.path_relative) {
+    return Status::NotSupported(
+        "Not able to get relative blob file path from absolute blob_dir.");
+  }
+  // Hold a lock in the beginning to avoid updates to base DB during the call
+  ReadLock rl(&mutex_);
+  Status s = db_->GetLiveFiles(ret, manifest_file_size, flush_memtable);
+  if (!s.ok()) {
+    return s;
+  }
+  ret.reserve(ret.size() + blob_files_.size());
+  for (auto bfile_pair : blob_files_) {
+    auto blob_file = bfile_pair.second;
+    // Path should be relative to db_name, but begin with slash.
+    ret.emplace_back(
+        BlobFileName("", bdb_options_.blob_dir, blob_file->BlobFileNumber()));
+  }
+  return Status::OK();
+}
+
+void BlobDBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+  // Path should be relative to db_name.
+  assert(bdb_options_.path_relative);
+  // Hold a lock in the beginning to avoid updates to base DB during the call
+  ReadLock rl(&mutex_);
+  db_->GetLiveFilesMetaData(metadata);
+  for (auto bfile_pair : blob_files_) {
+    auto blob_file = bfile_pair.second;
+    LiveFileMetaData filemetadata;
+    filemetadata.size = blob_file->GetFileSize();
+    const uint64_t file_number = blob_file->BlobFileNumber();
+    // Path should be relative to db_name, but begin with slash.
+    filemetadata.name = BlobFileName("", bdb_options_.blob_dir, file_number);
+    filemetadata.file_number = file_number;
+    if (blob_file->HasTTL()) {
+      filemetadata.oldest_ancester_time = blob_file->GetExpirationRange().first;
+    }
+    auto cfh =
+        static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily());
+    filemetadata.column_family_name = cfh->GetName();
+    metadata->emplace_back(filemetadata);
+  }
+}
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_iterator.h b/src/rocksdb/utilities/blob_db/blob_db_iterator.h
new file mode 100644
index 000000000..fd2b2f8f5
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_iterator.h
@@ -0,0 +1,150 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/arena_wrapped_db_iter.h"
+#include "rocksdb/iterator.h"
+#include "util/stop_watch.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Statistics;
+class SystemClock;
+
+namespace blob_db {
+
+using ROCKSDB_NAMESPACE::ManagedSnapshot;
+
+class BlobDBIterator : public Iterator {
+ public:
+  BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter,
+                 BlobDBImpl* blob_db, SystemClock* clock,
+                 Statistics* statistics)
+      : snapshot_(snapshot),
+        iter_(iter),
+        blob_db_(blob_db),
+        clock_(clock),
+        statistics_(statistics) {}
+
+  virtual ~BlobDBIterator() = default;
+
+  bool Valid() const override {
+    if (!iter_->Valid()) {
+      return false;
+    }
+    return status_.ok();
+  }
+
+  Status status() const override {
+    if (!iter_->status().ok()) {
+      return iter_->status();
+    }
+    return status_;
+  }
+
+  void SeekToFirst() override {
+    StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS);
+    RecordTick(statistics_, BLOB_DB_NUM_SEEK);
+    iter_->SeekToFirst();
+    while (UpdateBlobValue()) {
+      iter_->Next();
+    }
+  }
+
+  void SeekToLast() override {
+    StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS);
+    RecordTick(statistics_, BLOB_DB_NUM_SEEK);
+    iter_->SeekToLast();
+    while (UpdateBlobValue()) {
+      iter_->Prev();
+    }
+  }
+
+  void Seek(const Slice& target) override {
+    StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS);
+    RecordTick(statistics_, BLOB_DB_NUM_SEEK);
+    iter_->Seek(target);
+    while (UpdateBlobValue()) {
+      iter_->Next();
+    }
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    StopWatch seek_sw(clock_, statistics_, BLOB_DB_SEEK_MICROS);
+    RecordTick(statistics_, BLOB_DB_NUM_SEEK);
+    iter_->SeekForPrev(target);
+    while (UpdateBlobValue()) {
+      iter_->Prev();
+    }
+  }
+
+  void Next() override {
+    assert(Valid());
+    StopWatch next_sw(clock_, statistics_, BLOB_DB_NEXT_MICROS);
+    RecordTick(statistics_, BLOB_DB_NUM_NEXT);
+    iter_->Next();
+    while (UpdateBlobValue()) {
+      iter_->Next();
+    }
+  }
+
+  void Prev() override {
+    assert(Valid());
+    StopWatch prev_sw(clock_, statistics_, BLOB_DB_PREV_MICROS);
+    RecordTick(statistics_, BLOB_DB_NUM_PREV);
+    iter_->Prev();
+    while (UpdateBlobValue()) {
+      iter_->Prev();
+    }
+  }
+
+  Slice key() const override {
+    assert(Valid());
+    return iter_->key();
+  }
+
+  Slice value() const override {
+    assert(Valid());
+    if (!iter_->IsBlob()) {
+      return iter_->value();
+    }
+    return value_;
+  }
+
+  // Iterator::Refresh() not supported.
+
+ private:
+  // Return true if caller should continue to next value.
+  bool UpdateBlobValue() {
+    value_.Reset();
+    status_ = Status::OK();
+    if (iter_->Valid() && iter_->status().ok() && iter_->IsBlob()) {
+      Status s = blob_db_->GetBlobValue(iter_->key(), iter_->value(), &value_);
+      if (s.IsNotFound()) {
+        return true;
+      } else {
+        if (!s.ok()) {
+          status_ = s;
+        }
+        return false;
+      }
+    } else {
+      return false;
+    }
+  }
+
+  std::unique_ptr<ManagedSnapshot> snapshot_;
+  std::unique_ptr<ArenaWrappedDBIter> iter_;
+  BlobDBImpl* blob_db_;
+  SystemClock* clock_;
+  Statistics* statistics_;
+  Status status_;
+  PinnableSlice value_;
+};
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_listener.h b/src/rocksdb/utilities/blob_db/blob_db_listener.h
new file mode 100644
index 000000000..d17d29853
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_listener.h
@@ -0,0 +1,71 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+
+#include "rocksdb/listener.h"
+#include "util/mutexlock.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+class BlobDBListener : public EventListener {
+ public:
+  explicit BlobDBListener(BlobDBImpl* blob_db_impl)
+      : blob_db_impl_(blob_db_impl) {}
+
+  void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*info*/) override {
+    assert(blob_db_impl_ != nullptr);
+    blob_db_impl_->SyncBlobFiles();
+  }
+
+  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override {
+    assert(blob_db_impl_ != nullptr);
+    blob_db_impl_->UpdateLiveSSTSize();
+  }
+
+  void OnCompactionCompleted(DB* /*db*/,
+                             const CompactionJobInfo& /*info*/) override {
+    assert(blob_db_impl_ != nullptr);
+    blob_db_impl_->UpdateLiveSSTSize();
+  }
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "BlobDBListener"; }
+
+ protected:
+  BlobDBImpl* blob_db_impl_;
+};
+
+class BlobDBListenerGC : public BlobDBListener {
+ public:
+  explicit BlobDBListenerGC(BlobDBImpl* blob_db_impl)
+      : BlobDBListener(blob_db_impl) {}
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "BlobDBListenerGC"; }
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    BlobDBListener::OnFlushCompleted(db, info);
+
+    assert(blob_db_impl_);
+    blob_db_impl_->ProcessFlushJobInfo(info);
+  }
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override {
+    BlobDBListener::OnCompactionCompleted(db, info);
+
+    assert(blob_db_impl_);
+    blob_db_impl_->ProcessCompactionJobInfo(info);
+  }
+};
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_db_test.cc b/src/rocksdb/utilities/blob_db/blob_db_test.cc
new file mode 100644
index 000000000..e392962b2
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_db_test.cc
@@ -0,0 +1,2407 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <iomanip>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "file/file_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "rocksdb/utilities/debug.h"
+#include "test_util/mock_time_env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/blob_db/blob_db_impl.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+class BlobDBTest : public testing::Test {
+ public:
+  const int kMaxBlobSize = 1 << 14;
+
+  struct BlobIndexVersion {
+    BlobIndexVersion() = default;
+    BlobIndexVersion(std::string _user_key, uint64_t _file_number,
+                     uint64_t _expiration, SequenceNumber _sequence,
+                     ValueType _type)
+        : user_key(std::move(_user_key)),
+          file_number(_file_number),
+          expiration(_expiration),
+          sequence(_sequence),
+          type(_type) {}
+
+    std::string user_key;
+    uint64_t file_number = kInvalidBlobFileNumber;
+    uint64_t expiration = kNoExpiration;
+    SequenceNumber sequence = 0;
+    ValueType type = kTypeValue;
+  };
+
+  BlobDBTest()
+      : dbname_(test::PerThreadDBPath("blob_db_test")), blob_db_(nullptr) {
+    mock_clock_ = std::make_shared<MockSystemClock>(SystemClock::Default());
+    mock_env_.reset(new CompositeEnvWrapper(Env::Default(), mock_clock_));
+    fault_injection_env_.reset(new FaultInjectionTestEnv(Env::Default()));
+
+    Status s = DestroyBlobDB(dbname_, Options(), BlobDBOptions());
+    assert(s.ok());
+  }
+
+  ~BlobDBTest() override {
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    Destroy();
+  }
+
+  Status TryOpen(BlobDBOptions bdb_options = BlobDBOptions(),
+                 Options options = Options()) {
+    options.create_if_missing = true;
+    if (options.env == mock_env_.get()) {
+      // Need to disable stats dumping and persisting which also use
+      // RepeatableThread, which uses InstrumentedCondVar::TimedWaitInternal.
+      // With mocked time, this can hang on some platforms (MacOS)
+      // because (a) on some platforms, pthread_cond_timedwait does not appear
+      // to release the lock for other threads to operate if the deadline time
+      // is already passed, and (b) TimedWait calls are currently a bad
+      // abstraction because the deadline parameter is usually computed from
+      // Env time, but is interpreted in real clock time.
+      options.stats_dump_period_sec = 0;
+      options.stats_persist_period_sec = 0;
+    }
+    return BlobDB::Open(options, bdb_options, dbname_, &blob_db_);
+  }
+
+  void Open(BlobDBOptions bdb_options = BlobDBOptions(),
+            Options options = Options()) {
+    ASSERT_OK(TryOpen(bdb_options, options));
+  }
+
+  void Reopen(BlobDBOptions bdb_options = BlobDBOptions(),
+              Options options = Options()) {
+    assert(blob_db_ != nullptr);
+    delete blob_db_;
+    blob_db_ = nullptr;
+    Open(bdb_options, options);
+  }
+
+  void Close() {
+    assert(blob_db_ != nullptr);
+    delete blob_db_;
+    blob_db_ = nullptr;
+  }
+
+  void Destroy() {
+    if (blob_db_) {
+      Options options = blob_db_->GetOptions();
+      BlobDBOptions bdb_options = blob_db_->GetBlobDBOptions();
+      delete blob_db_;
+      blob_db_ = nullptr;
+      ASSERT_OK(DestroyBlobDB(dbname_, options, bdb_options));
+    }
+  }
+
+  BlobDBImpl *blob_db_impl() {
+    return reinterpret_cast<BlobDBImpl *>(blob_db_);
+  }
+
+  Status Put(const Slice &key, const Slice &value,
+             std::map<std::string, std::string> *data = nullptr) {
+    Status s = blob_db_->Put(WriteOptions(), key, value);
+    if (data != nullptr) {
+      (*data)[key.ToString()] = value.ToString();
+    }
+    return s;
+  }
+
+  void Delete(const std::string &key,
+              std::map<std::string, std::string> *data = nullptr) {
+    ASSERT_OK(blob_db_->Delete(WriteOptions(), key));
+    if (data != nullptr) {
+      data->erase(key);
+    }
+  }
+
+  Status PutWithTTL(const Slice &key, const Slice &value, uint64_t ttl,
+                    std::map<std::string, std::string> *data = nullptr) {
+    Status s = blob_db_->PutWithTTL(WriteOptions(), key, value, ttl);
+    if (data != nullptr) {
+      (*data)[key.ToString()] = value.ToString();
+    }
+    return s;
+  }
+
+  Status PutUntil(const Slice &key, const Slice &value, uint64_t expiration) {
+    return blob_db_->PutUntil(WriteOptions(), key, value, expiration);
+  }
+
+  void PutRandomWithTTL(const std::string &key, uint64_t ttl, Random *rnd,
+                        std::map<std::string, std::string> *data = nullptr) {
+    int len = rnd->Next() % kMaxBlobSize + 1;
+    std::string value = rnd->HumanReadableString(len);
+    ASSERT_OK(
+        blob_db_->PutWithTTL(WriteOptions(), Slice(key), Slice(value), ttl));
+    if (data != nullptr) {
+      (*data)[key] = value;
+    }
+  }
+
+  void PutRandomUntil(const std::string &key, uint64_t expiration, Random *rnd,
+                      std::map<std::string, std::string> *data = nullptr) {
+    int len = rnd->Next() % kMaxBlobSize + 1;
+    std::string value = rnd->HumanReadableString(len);
+    ASSERT_OK(blob_db_->PutUntil(WriteOptions(), Slice(key), Slice(value),
+                                 expiration));
+    if (data != nullptr) {
+      (*data)[key] = value;
+    }
+  }
+
+  void PutRandom(const std::string &key, Random *rnd,
+                 std::map<std::string, std::string> *data = nullptr) {
+    PutRandom(blob_db_, key, rnd, data);
+  }
+
+  void PutRandom(DB *db, const std::string &key, Random *rnd,
+                 std::map<std::string, std::string> *data = nullptr) {
+    int len = rnd->Next() % kMaxBlobSize + 1;
+    std::string value = rnd->HumanReadableString(len);
+    ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
+    if (data != nullptr) {
+      (*data)[key] = value;
+    }
+  }
+
+  void PutRandomToWriteBatch(
+      const std::string &key, Random *rnd, WriteBatch *batch,
+      std::map<std::string, std::string> *data = nullptr) {
+    int len = rnd->Next() % kMaxBlobSize + 1;
+    std::string value = rnd->HumanReadableString(len);
+    ASSERT_OK(batch->Put(key, value));
+    if (data != nullptr) {
+      (*data)[key] = value;
+    }
+  }
+
+  // Verify blob db contain expected data and nothing more.
+  void VerifyDB(const std::map<std::string, std::string> &data) {
+    VerifyDB(blob_db_, data);
+  }
+
+  void VerifyDB(DB *db, const std::map<std::string, std::string> &data) {
+    // Verify normal Get
+    auto *cfh = db->DefaultColumnFamily();
+    for (auto &p : data) {
+      PinnableSlice value_slice;
+      ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value_slice));
+      ASSERT_EQ(p.second, value_slice.ToString());
+      std::string value;
+      ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value));
+      ASSERT_EQ(p.second, value);
+    }
+
+    // Verify iterators
+    Iterator *iter = db->NewIterator(ReadOptions());
+    iter->SeekToFirst();
+    for (auto &p : data) {
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ(p.first, iter->key().ToString());
+      ASSERT_EQ(p.second, iter->value().ToString());
+      iter->Next();
+    }
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+    delete iter;
+  }
+
+  void VerifyBaseDB(
+      const std::map<std::string, KeyVersion> &expected_versions) {
+    auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+    DB *db = blob_db_->GetRootDB();
+    const size_t kMaxKeys = 10000;
+    std::vector<KeyVersion> versions;
+    ASSERT_OK(GetAllKeyVersions(db, "", "", kMaxKeys, &versions));
+    ASSERT_EQ(expected_versions.size(), versions.size());
+    size_t i = 0;
+    for (auto &key_version : expected_versions) {
+      const KeyVersion &expected_version = key_version.second;
+      ASSERT_EQ(expected_version.user_key, versions[i].user_key);
+      ASSERT_EQ(expected_version.sequence, versions[i].sequence);
+      ASSERT_EQ(expected_version.type, versions[i].type);
+      if (versions[i].type == kTypeValue) {
+        ASSERT_EQ(expected_version.value, versions[i].value);
+      } else {
+        ASSERT_EQ(kTypeBlobIndex, versions[i].type);
+        PinnableSlice value;
+        ASSERT_OK(bdb_impl->TEST_GetBlobValue(versions[i].user_key,
+                                              versions[i].value, &value));
+        ASSERT_EQ(expected_version.value, value.ToString());
+      }
+      i++;
+    }
+  }
+
+  void VerifyBaseDBBlobIndex(
+      const std::map<std::string, BlobIndexVersion> &expected_versions) {
+    const size_t kMaxKeys = 10000;
+    std::vector<KeyVersion> versions;
+    ASSERT_OK(
+        GetAllKeyVersions(blob_db_->GetRootDB(), "", "", kMaxKeys, &versions));
+    ASSERT_EQ(versions.size(), expected_versions.size());
+
+    size_t i = 0;
+    for (const auto &expected_pair : expected_versions) {
+      const BlobIndexVersion &expected_version = expected_pair.second;
+
+      ASSERT_EQ(versions[i].user_key, expected_version.user_key);
+      ASSERT_EQ(versions[i].sequence, expected_version.sequence);
+      ASSERT_EQ(versions[i].type, expected_version.type);
+      if (versions[i].type != kTypeBlobIndex) {
+        ASSERT_EQ(kInvalidBlobFileNumber, expected_version.file_number);
+        ASSERT_EQ(kNoExpiration, expected_version.expiration);
+
+        ++i;
+        continue;
+      }
+
+      BlobIndex blob_index;
+      ASSERT_OK(blob_index.DecodeFrom(versions[i].value));
+
+      const uint64_t file_number = !blob_index.IsInlined()
+                                       ? blob_index.file_number()
+                                       : kInvalidBlobFileNumber;
+      ASSERT_EQ(file_number, expected_version.file_number);
+
+      const uint64_t expiration =
+          blob_index.HasTTL() ? blob_index.expiration() : kNoExpiration;
+      ASSERT_EQ(expiration, expected_version.expiration);
+
+      ++i;
+    }
+  }
+
+  void InsertBlobs() {
+    WriteOptions wo;
+    std::string value;
+
+    Random rnd(301);
+    for (size_t i = 0; i < 100000; i++) {
+      uint64_t ttl = rnd.Next() % 86400;
+      PutRandomWithTTL("key" + std::to_string(i % 500), ttl, &rnd, nullptr);
+    }
+
+    for (size_t i = 0; i < 10; i++) {
+      Delete("key" + std::to_string(i % 500));
+    }
+  }
+
+  const std::string dbname_;
+  std::shared_ptr<MockSystemClock> mock_clock_;
+  std::unique_ptr<Env> mock_env_;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+  BlobDB *blob_db_;
+};  // class BlobDBTest
+
+TEST_F(BlobDBTest, Put) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  for (size_t i = 0; i < 100; i++) {
+    PutRandom("key" + std::to_string(i), &rnd, &data);
+  }
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, PutWithTTL) {
+  Random rnd(301);
+  Options options;
+  options.env = mock_env_.get();
+  BlobDBOptions bdb_options;
+  bdb_options.ttl_range_secs = 1000;
+  bdb_options.min_blob_size = 0;
+  bdb_options.blob_file_size = 256 * 1000 * 1000;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options, options);
+  std::map<std::string, std::string> data;
+  mock_clock_->SetCurrentTime(50);
+  for (size_t i = 0; i < 100; i++) {
+    uint64_t ttl = rnd.Next() % 100;
+    PutRandomWithTTL("key" + std::to_string(i), ttl, &rnd,
+                     (ttl <= 50 ? nullptr : &data));
+  }
+  mock_clock_->SetCurrentTime(100);
+  auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+  auto blob_files = bdb_impl->TEST_GetBlobFiles();
+  ASSERT_EQ(1, blob_files.size());
+  ASSERT_TRUE(blob_files[0]->HasTTL());
+  ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, PutUntil) {
+  Random rnd(301);
+  Options options;
+  options.env = mock_env_.get();
+  BlobDBOptions bdb_options;
+  bdb_options.ttl_range_secs = 1000;
+  bdb_options.min_blob_size = 0;
+  bdb_options.blob_file_size = 256 * 1000 * 1000;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options, options);
+  std::map<std::string, std::string> data;
+  mock_clock_->SetCurrentTime(50);
+  for (size_t i = 0; i < 100; i++) {
+    uint64_t expiration = rnd.Next() % 100 + 50;
+    PutRandomUntil("key" + std::to_string(i), expiration, &rnd,
+                   (expiration <= 100 ? nullptr : &data));
+  }
+  mock_clock_->SetCurrentTime(100);
+  auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+  auto blob_files = bdb_impl->TEST_GetBlobFiles();
+  ASSERT_EQ(1, blob_files.size());
+  ASSERT_TRUE(blob_files[0]->HasTTL());
+  ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, StackableDBGet) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  for (size_t i = 0; i < 100; i++) {
+    PutRandom("key" + std::to_string(i), &rnd, &data);
+  }
+  for (size_t i = 0; i < 100; i++) {
+    StackableDB *db = blob_db_;
+    ColumnFamilyHandle *column_family = db->DefaultColumnFamily();
+    std::string key = "key" + std::to_string(i);
+    PinnableSlice pinnable_value;
+    ASSERT_OK(db->Get(ReadOptions(), column_family, key, &pinnable_value));
+    std::string string_value;
+    ASSERT_OK(db->Get(ReadOptions(), column_family, key, &string_value));
+    ASSERT_EQ(string_value, pinnable_value.ToString());
+    ASSERT_EQ(string_value, data[key]);
+  }
+}
+
+TEST_F(BlobDBTest, GetExpiration) {
+  Options options;
+  options.env = mock_env_.get();
+  BlobDBOptions bdb_options;
+  bdb_options.disable_background_tasks = true;
+  mock_clock_->SetCurrentTime(100);
+  Open(bdb_options, options);
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(PutWithTTL("key2", "value2", 200));
+  PinnableSlice value;
+  uint64_t expiration;
+  ASSERT_OK(blob_db_->Get(ReadOptions(), "key1", &value, &expiration));
+  ASSERT_EQ("value1", value.ToString());
+  ASSERT_EQ(kNoExpiration, expiration);
+  ASSERT_OK(blob_db_->Get(ReadOptions(), "key2", &value, &expiration));
+  ASSERT_EQ("value2", value.ToString());
+  ASSERT_EQ(300 /* = 100 + 200 */, expiration);
+}
+
+TEST_F(BlobDBTest, GetIOError) {
+  Options options;
+  options.env = fault_injection_env_.get();
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;  // Make sure value write to blob file
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options, options);
+  ColumnFamilyHandle *column_family = blob_db_->DefaultColumnFamily();
+  PinnableSlice value;
+  ASSERT_OK(Put("foo", "bar"));
+  fault_injection_env_->SetFilesystemActive(false, Status::IOError());
+  Status s = blob_db_->Get(ReadOptions(), column_family, "foo", &value);
+  ASSERT_TRUE(s.IsIOError());
+  // Reactivate file system to allow test to close DB.
+  fault_injection_env_->SetFilesystemActive(true);
+}
+
+TEST_F(BlobDBTest, PutIOError) {
+  Options options;
+  options.env = fault_injection_env_.get();
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;  // Make sure value write to blob file
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options, options);
+  fault_injection_env_->SetFilesystemActive(false, Status::IOError());
+  ASSERT_TRUE(Put("foo", "v1").IsIOError());
+  fault_injection_env_->SetFilesystemActive(true, Status::IOError());
+  ASSERT_OK(Put("bar", "v1"));
+}
+
+TEST_F(BlobDBTest, WriteBatch) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  for (size_t i = 0; i < 100; i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < 10; j++) {
+      PutRandomToWriteBatch("key" + std::to_string(j * 100 + i), &rnd, &batch,
+                            &data);
+    }
+
+    ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
+  }
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, Delete) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  for (size_t i = 0; i < 100; i++) {
+    PutRandom("key" + std::to_string(i), &rnd, &data);
+  }
+  for (size_t i = 0; i < 100; i += 5) {
+    Delete("key" + std::to_string(i), &data);
+  }
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, DeleteBatch) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+  for (size_t i = 0; i < 100; i++) {
+    PutRandom("key" + std::to_string(i), &rnd);
+  }
+  WriteBatch batch;
+  for (size_t i = 0; i < 100; i++) {
+    ASSERT_OK(batch.Delete("key" + std::to_string(i)));
+  }
+  ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
+  // DB should be empty.
+  VerifyDB({});
+}
+
+TEST_F(BlobDBTest, Override) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  for (int i = 0; i < 10000; i++) {
+    PutRandom("key" + std::to_string(i), &rnd, nullptr);
+  }
+  // override all the keys
+  for (int i = 0; i < 10000; i++) {
+    PutRandom("key" + std::to_string(i), &rnd, &data);
+  }
+  VerifyDB(data);
+}
+
+#ifdef SNAPPY
+TEST_F(BlobDBTest, Compression) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  bdb_options.compression = CompressionType::kSnappyCompression;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  for (size_t i = 0; i < 100; i++) {
+    PutRandom("put-key" + std::to_string(i), &rnd, &data);
+  }
+  for (int i = 0; i < 100; i++) {
+    WriteBatch batch;
+    for (size_t j = 0; j < 10; j++) {
+      PutRandomToWriteBatch("write-batch-key" + std::to_string(j * 100 + i),
+                            &rnd, &batch, &data);
+    }
+    ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
+  }
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, DecompressAfterReopen) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  bdb_options.compression = CompressionType::kSnappyCompression;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  for (size_t i = 0; i < 100; i++) {
+    PutRandom("put-key" + std::to_string(i), &rnd, &data);
+  }
+  VerifyDB(data);
+  bdb_options.compression = CompressionType::kNoCompression;
+  Reopen(bdb_options);
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, EnableDisableCompressionGC) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.garbage_collection_cutoff = 1.0;
+  bdb_options.disable_background_tasks = true;
+  bdb_options.compression = kSnappyCompression;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  size_t data_idx = 0;
+  for (; data_idx < 100; data_idx++) {
+    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+  }
+  VerifyDB(data);
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(1, blob_files.size());
+  ASSERT_EQ(kSnappyCompression, blob_files[0]->GetCompressionType());
+
+  // disable compression
+  bdb_options.compression = kNoCompression;
+  Reopen(bdb_options);
+
+  // Add more data with new compression type
+  for (; data_idx < 200; data_idx++) {
+    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+  }
+  VerifyDB(data);
+
+  blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(2, blob_files.size());
+  ASSERT_EQ(kNoCompression, blob_files[1]->GetCompressionType());
+
+  // Enable GC. If we do it earlier the snapshot release triggered compaction
+  // may compact files and trigger GC before we can verify there are two files.
+  bdb_options.enable_garbage_collection = true;
+  Reopen(bdb_options);
+
+  // Trigger compaction
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  VerifyDB(data);
+
+  blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  for (auto bfile : blob_files) {
+    ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
+  }
+
+  // enabling the compression again
+  bdb_options.compression = kSnappyCompression;
+  Reopen(bdb_options);
+
+  // Add more data with new compression type
+  for (; data_idx < 300; data_idx++) {
+    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+  }
+  VerifyDB(data);
+
+  // Trigger compaction
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  VerifyDB(data);
+
+  blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  for (auto bfile : blob_files) {
+    ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
+  }
+}
+
+#ifdef LZ4
+// Test switch compression types and run GC, it needs both Snappy and LZ4
+// support.
+TEST_F(BlobDBTest, ChangeCompressionGC) {
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.garbage_collection_cutoff = 1.0;
+  bdb_options.disable_background_tasks = true;
+  bdb_options.compression = kLZ4Compression;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  size_t data_idx = 0;
+  for (; data_idx < 100; data_idx++) {
+    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+  }
+  VerifyDB(data);
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(1, blob_files.size());
+  ASSERT_EQ(kLZ4Compression, blob_files[0]->GetCompressionType());
+
+  // Change compression type
+  bdb_options.compression = kSnappyCompression;
+  Reopen(bdb_options);
+
+  // Add more data with Snappy compression type
+  for (; data_idx < 200; data_idx++) {
+    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+  }
+  VerifyDB(data);
+
+  // Verify blob file compression type
+  blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(2, blob_files.size());
+  ASSERT_EQ(kSnappyCompression, blob_files[1]->GetCompressionType());
+
+  // Enable GC. If we do it earlier the snapshot release triggered compaction
+  // may compact files and trigger GC before we can verify there are two files.
+  bdb_options.enable_garbage_collection = true;
+  Reopen(bdb_options);
+
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  VerifyDB(data);
+
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  for (auto bfile : blob_files) {
+    ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
+  }
+
+  // Disable compression
+  bdb_options.compression = kNoCompression;
+  Reopen(bdb_options);
+  for (; data_idx < 300; data_idx++) {
+    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+  }
+  VerifyDB(data);
+
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  VerifyDB(data);
+
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  for (auto bfile : blob_files) {
+    ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
+  }
+
+  // switching different compression types to generate mixed compression types
+  bdb_options.compression = kSnappyCompression;
+  Reopen(bdb_options);
+  for (; data_idx < 400; data_idx++) {
+    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+  }
+  VerifyDB(data);
+
+  bdb_options.compression = kLZ4Compression;
+  Reopen(bdb_options);
+  for (; data_idx < 500; data_idx++) {
+    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
+  }
+  VerifyDB(data);
+
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  VerifyDB(data);
+
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  for (auto bfile : blob_files) {
+    ASSERT_EQ(kLZ4Compression, bfile->GetCompressionType());
+  }
+}
+#endif  // LZ4
+#endif  // SNAPPY
+
+TEST_F(BlobDBTest, MultipleWriters) {
+  Open(BlobDBOptions());
+
+  std::vector<port::Thread> workers;
+  std::vector<std::map<std::string, std::string>> data_set(10);
+  for (uint32_t i = 0; i < 10; i++)
+    workers.push_back(port::Thread(
+        [&](uint32_t id) {
+          Random rnd(301 + id);
+          for (int j = 0; j < 100; j++) {
+            std::string key =
+                "key" + std::to_string(id) + "_" + std::to_string(j);
+            if (id < 5) {
+              PutRandom(key, &rnd, &data_set[id]);
+            } else {
+              WriteBatch batch;
+              PutRandomToWriteBatch(key, &rnd, &batch, &data_set[id]);
+              ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
+            }
+          }
+        },
+        i));
+  std::map<std::string, std::string> data;
+  for (size_t i = 0; i < 10; i++) {
+    workers[i].join();
+    data.insert(data_set[i].begin(), data_set[i].end());
+  }
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, SstFileManager) {
+  // run the same test for Get(), MultiGet() and Iterator each.
+  std::shared_ptr<SstFileManager> sst_file_manager(
+      NewSstFileManager(mock_env_.get()));
+  sst_file_manager->SetDeleteRateBytesPerSecond(1);
+  SstFileManagerImpl *sfm =
+      static_cast<SstFileManagerImpl *>(sst_file_manager.get());
+
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.enable_garbage_collection = true;
+  bdb_options.garbage_collection_cutoff = 1.0;
+  Options db_options;
+
+  int files_scheduled_to_delete = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) {
+        assert(arg);
+        const std::string *const file_path =
+            static_cast<const std::string *>(arg);
+        if (file_path->find(".blob") != std::string::npos) {
+          ++files_scheduled_to_delete;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  db_options.sst_file_manager = sst_file_manager;
+
+  Open(bdb_options, db_options);
+
+  // Create one obselete file and clean it.
+  ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "bar"));
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(1, blob_files.size());
+  std::shared_ptr<BlobFile> bfile = blob_files[0];
+  ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+
+  // Even if SSTFileManager is not set, DB is creating a dummy one.
+  ASSERT_EQ(1, files_scheduled_to_delete);
+  Destroy();
+  // Make sure that DestroyBlobDB() also goes through delete scheduler.
+  ASSERT_EQ(2, files_scheduled_to_delete);
+  SyncPoint::GetInstance()->DisableProcessing();
+  sfm->WaitForEmptyTrash();
+}
+
+TEST_F(BlobDBTest, SstFileManagerRestart) {
+  int files_scheduled_to_delete = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) {
+        assert(arg);
+        const std::string *const file_path =
+            static_cast<const std::string *>(arg);
+        if (file_path->find(".blob") != std::string::npos) {
+          ++files_scheduled_to_delete;
+        }
+      });
+
+  // run the same test for Get(), MultiGet() and Iterator each.
+  std::shared_ptr<SstFileManager> sst_file_manager(
+      NewSstFileManager(mock_env_.get()));
+  sst_file_manager->SetDeleteRateBytesPerSecond(1);
+  SstFileManagerImpl *sfm =
+      static_cast<SstFileManagerImpl *>(sst_file_manager.get());
+
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  Options db_options;
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  db_options.sst_file_manager = sst_file_manager;
+
+  Open(bdb_options, db_options);
+  std::string blob_dir = blob_db_impl()->TEST_blob_dir();
+  ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "bar"));
+  Close();
+
+  // Create 3 dummy trash files under the blob_dir
+  const auto &fs = db_options.env->GetFileSystem();
+  ASSERT_OK(CreateFile(fs, blob_dir + "/000666.blob.trash", "", false));
+  ASSERT_OK(CreateFile(fs, blob_dir + "/000888.blob.trash", "", true));
+  ASSERT_OK(CreateFile(fs, blob_dir + "/something_not_match.trash", "", false));
+
+  // Make sure that reopening the DB rescan the existing trash files
+  Open(bdb_options, db_options);
+  ASSERT_EQ(files_scheduled_to_delete, 2);
+
+  sfm->WaitForEmptyTrash();
+
+  // There should be exact one file under the blob dir now.
+  std::vector<std::string> all_files;
+  ASSERT_OK(db_options.env->GetChildren(blob_dir, &all_files));
+  int nfiles = 0;
+  for (const auto &f : all_files) {
+    assert(!f.empty());
+    if (f[0] == '.') {
+      continue;
+    }
+    nfiles++;
+  }
+  ASSERT_EQ(nfiles, 1);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.enable_garbage_collection = true;
+  bdb_options.garbage_collection_cutoff = 1.0;
+  bdb_options.disable_background_tasks = true;
+
+  Options options;
+  options.disable_auto_compactions = true;
+
+  // i = when to take snapshot
+  for (int i = 0; i < 4; i++) {
+    Destroy();
+    Open(bdb_options, options);
+
+    const Snapshot *snapshot = nullptr;
+
+    // First file
+    ASSERT_OK(Put("key1", "value"));
+    if (i == 0) {
+      snapshot = blob_db_->GetSnapshot();
+    }
+
+    auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+    ASSERT_EQ(1, blob_files.size());
+    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
+
+    // Second file
+    ASSERT_OK(Put("key2", "value"));
+    if (i == 1) {
+      snapshot = blob_db_->GetSnapshot();
+    }
+
+    blob_files = blob_db_impl()->TEST_GetBlobFiles();
+    ASSERT_EQ(2, blob_files.size());
+    auto bfile = blob_files[1];
+    ASSERT_FALSE(bfile->Immutable());
+    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
+
+    // Third file
+    ASSERT_OK(Put("key3", "value"));
+    if (i == 2) {
+      snapshot = blob_db_->GetSnapshot();
+    }
+
+    ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    ASSERT_TRUE(bfile->Obsolete());
+    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(),
+              bfile->GetObsoleteSequence());
+
+    Delete("key2");
+    if (i == 3) {
+      snapshot = blob_db_->GetSnapshot();
+    }
+
+    ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size());
+    blob_db_impl()->TEST_DeleteObsoleteFiles();
+
+    if (i >= 2) {
+      // The snapshot shouldn't see data in bfile
+      ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+      blob_db_->ReleaseSnapshot(snapshot);
+    } else {
+      // The snapshot will see data in bfile, so the file shouldn't be deleted
+      ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size());
+      blob_db_->ReleaseSnapshot(snapshot);
+      blob_db_impl()->TEST_DeleteObsoleteFiles();
+      ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+    }
+  }
+}
+
+TEST_F(BlobDBTest, ColumnFamilyNotSupported) {
+  Options options;
+  options.env = mock_env_.get();
+  mock_clock_->SetCurrentTime(0);
+  Open(BlobDBOptions(), options);
+  ColumnFamilyHandle *default_handle = blob_db_->DefaultColumnFamily();
+  ColumnFamilyHandle *handle = nullptr;
+  std::string value;
+  std::vector<std::string> values;
+  // The call simply pass through to base db. It should succeed.
+  ASSERT_OK(
+      blob_db_->CreateColumnFamily(ColumnFamilyOptions(), "foo", &handle));
+  ASSERT_TRUE(blob_db_->Put(WriteOptions(), handle, "k", "v").IsNotSupported());
+  ASSERT_TRUE(blob_db_->PutWithTTL(WriteOptions(), handle, "k", "v", 60)
+                  .IsNotSupported());
+  ASSERT_TRUE(blob_db_->PutUntil(WriteOptions(), handle, "k", "v", 100)
+                  .IsNotSupported());
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("k1", "v1"));
+  ASSERT_OK(batch.Put(handle, "k2", "v2"));
+  ASSERT_TRUE(blob_db_->Write(WriteOptions(), &batch).IsNotSupported());
+  ASSERT_TRUE(blob_db_->Get(ReadOptions(), "k1", &value).IsNotFound());
+  ASSERT_TRUE(
+      blob_db_->Get(ReadOptions(), handle, "k", &value).IsNotSupported());
+  auto statuses = blob_db_->MultiGet(ReadOptions(), {default_handle, handle},
+                                     {"k1", "k2"}, &values);
+  ASSERT_EQ(2, statuses.size());
+  ASSERT_TRUE(statuses[0].IsNotSupported());
+  ASSERT_TRUE(statuses[1].IsNotSupported());
+  ASSERT_EQ(nullptr, blob_db_->NewIterator(ReadOptions(), handle));
+  delete handle;
+}
+
+TEST_F(BlobDBTest, GetLiveFilesMetaData) {
+  Random rnd(301);
+
+  BlobDBOptions bdb_options;
+  bdb_options.blob_dir = "blob_dir";
+  bdb_options.path_relative = true;
+  bdb_options.ttl_range_secs = 10;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+
+  Options options;
+  options.env = mock_env_.get();
+
+  Open(bdb_options, options);
+
+  std::map<std::string, std::string> data;
+  for (size_t i = 0; i < 100; i++) {
+    PutRandom("key" + std::to_string(i), &rnd, &data);
+  }
+
+  constexpr uint64_t expiration = 1000ULL;
+  PutRandomUntil("key100", expiration, &rnd, &data);
+
+  std::vector<LiveFileMetaData> metadata;
+  blob_db_->GetLiveFilesMetaData(&metadata);
+
+  ASSERT_EQ(2U, metadata.size());
+  // Path should be relative to db_name, but begin with slash.
+  const std::string filename1("/blob_dir/000001.blob");
+  ASSERT_EQ(filename1, metadata[0].name);
+  ASSERT_EQ(1, metadata[0].file_number);
+  ASSERT_EQ(0, metadata[0].oldest_ancester_time);
+  ASSERT_EQ(kDefaultColumnFamilyName, metadata[0].column_family_name);
+
+  const std::string filename2("/blob_dir/000002.blob");
+  ASSERT_EQ(filename2, metadata[1].name);
+  ASSERT_EQ(2, metadata[1].file_number);
+  ASSERT_EQ(expiration, metadata[1].oldest_ancester_time);
+  ASSERT_EQ(kDefaultColumnFamilyName, metadata[1].column_family_name);
+
+  std::vector<std::string> livefile;
+  uint64_t mfs;
+  ASSERT_OK(blob_db_->GetLiveFiles(livefile, &mfs, false));
+  ASSERT_EQ(5U, livefile.size());
+  ASSERT_EQ(filename1, livefile[3]);
+  ASSERT_EQ(filename2, livefile[4]);
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, MigrateFromPlainRocksDB) {
+  constexpr size_t kNumKey = 20;
+  constexpr size_t kNumIteration = 10;
+  Random rnd(301);
+  std::map<std::string, std::string> data;
+  std::vector<bool> is_blob(kNumKey, false);
+
+  // Write to plain rocksdb.
+  Options options;
+  options.create_if_missing = true;
+  DB *db = nullptr;
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  for (size_t i = 0; i < kNumIteration; i++) {
+    auto key_index = rnd.Next() % kNumKey;
+    std::string key = "key" + std::to_string(key_index);
+    PutRandom(db, key, &rnd, &data);
+  }
+  VerifyDB(db, data);
+  delete db;
+  db = nullptr;
+
+  // Open as blob db. Verify it can read existing data.
+  Open();
+  VerifyDB(blob_db_, data);
+  for (size_t i = 0; i < kNumIteration; i++) {
+    auto key_index = rnd.Next() % kNumKey;
+    std::string key = "key" + std::to_string(key_index);
+    is_blob[key_index] = true;
+    PutRandom(blob_db_, key, &rnd, &data);
+  }
+  VerifyDB(blob_db_, data);
+  delete blob_db_;
+  blob_db_ = nullptr;
+
+  // Verify plain db return error for keys written by blob db.
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  std::string value;
+  for (size_t i = 0; i < kNumKey; i++) {
+    std::string key = "key" + std::to_string(i);
+    Status s = db->Get(ReadOptions(), key, &value);
+    if (data.count(key) == 0) {
+      ASSERT_TRUE(s.IsNotFound());
+    } else if (is_blob[i]) {
+      ASSERT_TRUE(s.IsCorruption());
+    } else {
+      ASSERT_OK(s);
+      ASSERT_EQ(data[key], value);
+    }
+  }
+  delete db;
+}
+
+// Test to verify that a NoSpace IOError Status is returned on reaching
+// max_db_size limit.
+TEST_F(BlobDBTest, OutOfSpace) {
+  // Use mock env to stop wall clock.
+  Options options;
+  options.env = mock_env_.get();
+  BlobDBOptions bdb_options;
+  bdb_options.max_db_size = 200;
+  bdb_options.is_fifo = false;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+
+  // Each stored blob has an overhead of about 42 bytes currently.
+  // So a small key + a 100 byte blob should take up ~150 bytes in the db.
+  std::string value(100, 'v');
+  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 60));
+
+  // Putting another blob should fail as ading it would exceed the max_db_size
+  // limit.
+  Status s = blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60);
+  ASSERT_TRUE(s.IsIOError());
+  ASSERT_TRUE(s.IsNoSpace());
+}
+
+TEST_F(BlobDBTest, FIFOEviction) {
+  BlobDBOptions bdb_options;
+  bdb_options.max_db_size = 200;
+  bdb_options.blob_file_size = 100;
+  bdb_options.is_fifo = true;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+
+  std::atomic<int> evict_count{0};
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobDBImpl::EvictOldestBlobFile:Evicted",
+      [&](void *) { evict_count++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Each stored blob has an overhead of 32 bytes currently.
+  // So a 100 byte blob should take up 132 bytes.
+  std::string value(100, 'v');
+  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 10));
+  VerifyDB({{"key1", value}});
+
+  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+
+  // Adding another 100 bytes blob would take the total size to 264 bytes
+  // (2*132). max_db_size will be exceeded
+  // than max_db_size and trigger FIFO eviction.
+  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60));
+  ASSERT_EQ(1, evict_count);
+  // key1 will exist until corresponding file be deleted.
+  VerifyDB({{"key1", value}, {"key2", value}});
+
+  // Adding another 100 bytes blob without TTL.
+  ASSERT_OK(blob_db_->Put(WriteOptions(), "key3", value));
+  ASSERT_EQ(2, evict_count);
+  // key1 and key2 will exist until corresponding file be deleted.
+  VerifyDB({{"key1", value}, {"key2", value}, {"key3", value}});
+
+  // The fourth blob file, without TTL.
+  ASSERT_OK(blob_db_->Put(WriteOptions(), "key4", value));
+  ASSERT_EQ(3, evict_count);
+  VerifyDB(
+      {{"key1", value}, {"key2", value}, {"key3", value}, {"key4", value}});
+
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(4, blob_files.size());
+  ASSERT_TRUE(blob_files[0]->Obsolete());
+  ASSERT_TRUE(blob_files[1]->Obsolete());
+  ASSERT_TRUE(blob_files[2]->Obsolete());
+  ASSERT_FALSE(blob_files[3]->Obsolete());
+  auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+  ASSERT_EQ(3, obsolete_files.size());
+  ASSERT_EQ(blob_files[0], obsolete_files[0]);
+  ASSERT_EQ(blob_files[1], obsolete_files[1]);
+  ASSERT_EQ(blob_files[2], obsolete_files[2]);
+
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+  ASSERT_TRUE(obsolete_files.empty());
+  VerifyDB({{"key4", value}});
+}
+
+TEST_F(BlobDBTest, FIFOEviction_NoOldestFileToEvict) {
+  Options options;
+  BlobDBOptions bdb_options;
+  bdb_options.max_db_size = 1000;
+  bdb_options.blob_file_size = 5000;
+  bdb_options.is_fifo = true;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+
+  std::atomic<int> evict_count{0};
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobDBImpl::EvictOldestBlobFile:Evicted",
+      [&](void *) { evict_count++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::string value(2000, 'v');
+  ASSERT_TRUE(Put("foo", std::string(2000, 'v')).IsNoSpace());
+  ASSERT_EQ(0, evict_count);
+}
+
+TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) {
+  BlobDBOptions bdb_options;
+  bdb_options.is_fifo = true;
+  bdb_options.min_blob_size = 100;
+  bdb_options.disable_background_tasks = true;
+  Options options;
+  // Use mock env to stop wall clock.
+  options.env = mock_env_.get();
+  options.disable_auto_compactions = true;
+  auto statistics = CreateDBStatistics();
+  options.statistics = statistics;
+  Open(bdb_options, options);
+
+  ASSERT_EQ(0, blob_db_impl()->TEST_live_sst_size());
+  std::string small_value(50, 'v');
+  std::map<std::string, std::string> data;
+  // Insert some data into LSM tree to make sure FIFO eviction take SST
+  // file size into account.
+  for (int i = 0; i < 1000; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), small_value, &data));
+  }
+  ASSERT_OK(blob_db_->Flush(FlushOptions()));
+  uint64_t live_sst_size = 0;
+  ASSERT_TRUE(blob_db_->GetIntProperty(DB::Properties::kTotalSstFilesSize,
+                                       &live_sst_size));
+  ASSERT_TRUE(live_sst_size > 0);
+  ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size());
+
+  bdb_options.max_db_size = live_sst_size + 2000;
+  Reopen(bdb_options, options);
+  ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size());
+
+  std::string value_1k(1000, 'v');
+  ASSERT_OK(PutWithTTL("large_key1", value_1k, 60, &data));
+  ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+  VerifyDB(data);
+  // large_key2 evicts large_key1
+  ASSERT_OK(PutWithTTL("large_key2", value_1k, 60, &data));
+  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  data.erase("large_key1");
+  VerifyDB(data);
+  // large_key3 get no enough space even after evicting large_key2, so it
+  // instead return no space error.
+  std::string value_2k(2000, 'v');
+  ASSERT_TRUE(PutWithTTL("large_key3", value_2k, 60).IsNoSpace());
+  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+  // Verify large_key2 still exists.
+  VerifyDB(data);
+}
+
+// Test flush or compaction will trigger FIFO eviction since they update
+// total SST file size.
+TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) {
+  BlobDBOptions bdb_options;
+  bdb_options.max_db_size = 1000;
+  bdb_options.is_fifo = true;
+  bdb_options.min_blob_size = 100;
+  bdb_options.disable_background_tasks = true;
+  Options options;
+  // Use mock env to stop wall clock.
+  options.env = mock_env_.get();
+  auto statistics = CreateDBStatistics();
+  options.statistics = statistics;
+  options.compression = kNoCompression;
+  Open(bdb_options, options);
+
+  std::string value(800, 'v');
+  ASSERT_OK(PutWithTTL("large_key", value, 60));
+  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+  ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+  VerifyDB({{"large_key", value}});
+
+  // Insert some small keys and flush to bring DB out of space.
+  std::map<std::string, std::string> data;
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), "v", &data));
+  }
+  ASSERT_OK(blob_db_->Flush(FlushOptions()));
+
+  // Verify large_key is deleted by FIFO eviction.
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size());
+  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+  VerifyDB(data);
+}
+
+TEST_F(BlobDBTest, InlineSmallValues) {
+  constexpr uint64_t kMaxExpiration = 1000;
+  Random rnd(301);
+  BlobDBOptions bdb_options;
+  bdb_options.ttl_range_secs = kMaxExpiration;
+  bdb_options.min_blob_size = 100;
+  bdb_options.blob_file_size = 256 * 1000 * 1000;
+  bdb_options.disable_background_tasks = true;
+  Options options;
+  options.env = mock_env_.get();
+  mock_clock_->SetCurrentTime(0);
+  Open(bdb_options, options);
+  std::map<std::string, std::string> data;
+  std::map<std::string, KeyVersion> versions;
+  for (size_t i = 0; i < 1000; i++) {
+    bool is_small_value = rnd.Next() % 2;
+    bool has_ttl = rnd.Next() % 2;
+    uint64_t expiration = rnd.Next() % kMaxExpiration;
+    int len = is_small_value ? 50 : 200;
+    std::string key = "key" + std::to_string(i);
+    std::string value = rnd.HumanReadableString(len);
+    std::string blob_index;
+    data[key] = value;
+    SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+    if (!has_ttl) {
+      ASSERT_OK(blob_db_->Put(WriteOptions(), key, value));
+    } else {
+      ASSERT_OK(blob_db_->PutUntil(WriteOptions(), key, value, expiration));
+    }
+    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+    versions[key] =
+        KeyVersion(key, value, sequence,
+                   (is_small_value && !has_ttl) ? kTypeValue : kTypeBlobIndex);
+  }
+  VerifyDB(data);
+  VerifyBaseDB(versions);
+  auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+  auto blob_files = bdb_impl->TEST_GetBlobFiles();
+  ASSERT_EQ(2, blob_files.size());
+  std::shared_ptr<BlobFile> non_ttl_file;
+  std::shared_ptr<BlobFile> ttl_file;
+  if (blob_files[0]->HasTTL()) {
+    ttl_file = blob_files[0];
+    non_ttl_file = blob_files[1];
+  } else {
+    non_ttl_file = blob_files[0];
+    ttl_file = blob_files[1];
+  }
+  ASSERT_FALSE(non_ttl_file->HasTTL());
+  ASSERT_TRUE(ttl_file->HasTTL());
+}
+
+TEST_F(BlobDBTest, UserCompactionFilter) {
+  class CustomerFilter : public CompactionFilter {
+   public:
+    bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
+                std::string *new_value, bool *value_changed) const override {
+      *value_changed = false;
+      // changing value size to test value transitions between inlined data
+      // and stored-in-blob data
+      if (value.size() % 4 == 1) {
+        *new_value = value.ToString();
+        // double size by duplicating value
+        *new_value += *new_value;
+        *value_changed = true;
+        return false;
+      } else if (value.size() % 3 == 1) {
+        *new_value = value.ToString();
+        // trancate value size by half
+        *new_value = new_value->substr(0, new_value->size() / 2);
+        *value_changed = true;
+        return false;
+      } else if (value.size() % 2 == 1) {
+        return true;
+      }
+      return false;
+    }
+    bool IgnoreSnapshots() const override { return true; }
+    const char *Name() const override { return "CustomerFilter"; }
+  };
+  class CustomerFilterFactory : public CompactionFilterFactory {
+    const char *Name() const override { return "CustomerFilterFactory"; }
+    std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context & /*context*/) override {
+      return std::unique_ptr<CompactionFilter>(new CustomerFilter());
+    }
+  };
+
+  constexpr size_t kNumPuts = 1 << 10;
+  // Generate both inlined and blob value
+  constexpr uint64_t kMinValueSize = 1 << 6;
+  constexpr uint64_t kMaxValueSize = 1 << 8;
+  constexpr uint64_t kMinBlobSize = 1 << 7;
+  static_assert(kMinValueSize < kMinBlobSize, "");
+  static_assert(kMaxValueSize > kMinBlobSize, "");
+
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = kMinBlobSize;
+  bdb_options.blob_file_size = kMaxValueSize * 10;
+  bdb_options.disable_background_tasks = true;
+  if (Snappy_Supported()) {
+    bdb_options.compression = CompressionType::kSnappyCompression;
+  }
+  // case_num == 0: Test user defined compaction filter
+  // case_num == 1: Test user defined compaction filter factory
+  for (int case_num = 0; case_num < 2; case_num++) {
+    Options options;
+    if (case_num == 0) {
+      options.compaction_filter = new CustomerFilter();
+    } else {
+      options.compaction_filter_factory.reset(new CustomerFilterFactory());
+    }
+    options.disable_auto_compactions = true;
+    options.env = mock_env_.get();
+    options.statistics = CreateDBStatistics();
+    Open(bdb_options, options);
+
+    std::map<std::string, std::string> data;
+    std::map<std::string, std::string> data_after_compact;
+    Random rnd(301);
+    uint64_t value_size = kMinValueSize;
+    int drop_record = 0;
+    for (size_t i = 0; i < kNumPuts; ++i) {
+      std::ostringstream oss;
+      oss << "key" << std::setw(4) << std::setfill('0') << i;
+
+      const std::string key(oss.str());
+      const std::string value = rnd.HumanReadableString((int)value_size);
+      const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+      ASSERT_OK(Put(key, value));
+      ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+      data[key] = value;
+      if (value.length() % 4 == 1) {
+        data_after_compact[key] = value + value;
+      } else if (value.length() % 3 == 1) {
+        data_after_compact[key] = value.substr(0, value.size() / 2);
+      } else if (value.length() % 2 == 1) {
+        ++drop_record;
+      } else {
+        data_after_compact[key] = value;
+      }
+
+      if (++value_size > kMaxValueSize) {
+        value_size = kMinValueSize;
+      }
+    }
+    // Verify full data set
+    VerifyDB(data);
+    // Applying compaction filter for records
+    ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    // Verify data after compaction, only value with even length left.
+    VerifyDB(data_after_compact);
+    ASSERT_EQ(drop_record,
+              options.statistics->getTickerCount(COMPACTION_KEY_DROP_USER));
+    delete options.compaction_filter;
+    Destroy();
+  }
+}
+
+// Test user comapction filter when there is IO error on blob data.
+TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
+  class CustomerFilter : public CompactionFilter {
+   public:
+    bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
+                std::string *new_value, bool *value_changed) const override {
+      *new_value = value.ToString() + "_new";
+      *value_changed = true;
+      return false;
+    }
+    bool IgnoreSnapshots() const override { return true; }
+    const char *Name() const override { return "CustomerFilter"; }
+  };
+
+  constexpr size_t kNumPuts = 100;
+  constexpr int kValueSize = 100;
+
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.blob_file_size = kValueSize * 10;
+  bdb_options.disable_background_tasks = true;
+  bdb_options.compression = CompressionType::kNoCompression;
+
+  std::vector<std::string> io_failure_cases = {
+      "BlobDBImpl::CreateBlobFileAndWriter",
+      "BlobIndexCompactionFilterBase::WriteBlobToNewFile",
+      "BlobDBImpl::CloseBlobFile"};
+
+  for (size_t case_num = 0; case_num < io_failure_cases.size(); case_num++) {
+    Options options;
+    options.compaction_filter = new CustomerFilter();
+    options.disable_auto_compactions = true;
+    options.env = fault_injection_env_.get();
+    options.statistics = CreateDBStatistics();
+    Open(bdb_options, options);
+
+    std::map<std::string, std::string> data;
+    Random rnd(301);
+    for (size_t i = 0; i < kNumPuts; ++i) {
+      std::ostringstream oss;
+      oss << "key" << std::setw(4) << std::setfill('0') << i;
+
+      const std::string key(oss.str());
+      const std::string value = rnd.HumanReadableString(kValueSize);
+      const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+      ASSERT_OK(Put(key, value));
+      ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+      data[key] = value;
+    }
+
+    // Verify full data set
+    VerifyDB(data);
+
+    SyncPoint::GetInstance()->SetCallBack(
+        io_failure_cases[case_num], [&](void * /*arg*/) {
+          fault_injection_env_->SetFilesystemActive(false, Status::IOError());
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    auto s = blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    ASSERT_TRUE(s.IsIOError());
+
+    // Reactivate file system to allow test to verify and close DB.
+    fault_injection_env_->SetFilesystemActive(true);
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    // Verify full data set after compaction failure
+    VerifyDB(data);
+
+    delete options.compaction_filter;
+    Destroy();
+  }
+}
+
+// Test comapction filter should remove any expired blob index.
+TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
+  constexpr size_t kNumKeys = 100;
+  constexpr size_t kNumPuts = 1000;
+  constexpr uint64_t kMaxExpiration = 1000;
+  constexpr uint64_t kCompactTime = 500;
+  constexpr uint64_t kMinBlobSize = 100;
+  Random rnd(301);
+  mock_clock_->SetCurrentTime(0);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = kMinBlobSize;
+  bdb_options.disable_background_tasks = true;
+  Options options;
+  options.env = mock_env_.get();
+  Open(bdb_options, options);
+
+  std::map<std::string, std::string> data;
+  std::map<std::string, std::string> data_after_compact;
+  for (size_t i = 0; i < kNumPuts; i++) {
+    bool is_small_value = rnd.Next() % 2;
+    bool has_ttl = rnd.Next() % 2;
+    uint64_t expiration = rnd.Next() % kMaxExpiration;
+    int len = is_small_value ? 10 : 200;
+    std::string key = "key" + std::to_string(rnd.Next() % kNumKeys);
+    std::string value = rnd.HumanReadableString(len);
+    if (!has_ttl) {
+      if (is_small_value) {
+        std::string blob_entry;
+        BlobIndex::EncodeInlinedTTL(&blob_entry, expiration, value);
+        // Fake blob index with TTL. See what it will do.
+        ASSERT_GT(kMinBlobSize, blob_entry.size());
+        value = blob_entry;
+      }
+      ASSERT_OK(Put(key, value));
+      data_after_compact[key] = value;
+    } else {
+      ASSERT_OK(PutUntil(key, value, expiration));
+      if (expiration <= kCompactTime) {
+        data_after_compact.erase(key);
+      } else {
+        data_after_compact[key] = value;
+      }
+    }
+    data[key] = value;
+  }
+  VerifyDB(data);
+
+  mock_clock_->SetCurrentTime(kCompactTime);
+  // Take a snapshot before compaction. Make sure expired blob indexes is
+  // filtered regardless of snapshot.
+  const Snapshot *snapshot = blob_db_->GetSnapshot();
+  // Issue manual compaction to trigger compaction filter.
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  blob_db_->ReleaseSnapshot(snapshot);
+  // Verify expired blob index are filtered.
+  std::vector<KeyVersion> versions;
+  const size_t kMaxKeys = 10000;
+  ASSERT_OK(GetAllKeyVersions(blob_db_, "", "", kMaxKeys, &versions));
+  ASSERT_EQ(data_after_compact.size(), versions.size());
+  for (auto &version : versions) {
+    ASSERT_TRUE(data_after_compact.count(version.user_key) > 0);
+  }
+  VerifyDB(data_after_compact);
+}
+
+// Test compaction filter should remove any blob index where corresponding
+// blob file has been removed.
+TEST_F(BlobDBTest, FilterFileNotAvailable) {
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  Options options;
+  options.disable_auto_compactions = true;
+  Open(bdb_options, options);
+
+  ASSERT_OK(Put("foo", "v1"));
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(1, blob_files.size());
+  ASSERT_EQ(1, blob_files[0]->BlobFileNumber());
+  ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
+
+  ASSERT_OK(Put("bar", "v2"));
+  blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(2, blob_files.size());
+  ASSERT_EQ(2, blob_files[1]->BlobFileNumber());
+  ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[1]));
+
+  const size_t kMaxKeys = 10000;
+
+  DB *base_db = blob_db_->GetRootDB();
+  std::vector<KeyVersion> versions;
+  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_EQ(2, versions.size());
+  ASSERT_EQ("bar", versions[0].user_key);
+  ASSERT_EQ("foo", versions[1].user_key);
+  VerifyDB({{"bar", "v2"}, {"foo", "v1"}});
+
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_EQ(2, versions.size());
+  ASSERT_EQ("bar", versions[0].user_key);
+  ASSERT_EQ("foo", versions[1].user_key);
+  VerifyDB({{"bar", "v2"}, {"foo", "v1"}});
+
+  // Remove the first blob file and compact. foo should be remove from base db.
+  blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[0]);
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_EQ(1, versions.size());
+  ASSERT_EQ("bar", versions[0].user_key);
+  VerifyDB({{"bar", "v2"}});
+
+  // Remove the second blob file and compact. bar should be remove from base db.
+  blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[1]);
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_EQ(0, versions.size());
+  VerifyDB({});
+}
+
+// Test compaction filter should filter any inlined TTL keys that would have
+// been dropped by last FIFO eviction if they are store out-of-line.
+TEST_F(BlobDBTest, FilterForFIFOEviction) {
+  Random rnd(215);
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 100;
+  bdb_options.ttl_range_secs = 60;
+  bdb_options.max_db_size = 0;
+  bdb_options.disable_background_tasks = true;
+  Options options;
+  // Use mock env to stop wall clock.
+  mock_clock_->SetCurrentTime(0);
+  options.env = mock_env_.get();
+  auto statistics = CreateDBStatistics();
+  options.statistics = statistics;
+  options.disable_auto_compactions = true;
+  Open(bdb_options, options);
+
+  std::map<std::string, std::string> data;
+  std::map<std::string, std::string> data_after_compact;
+  // Insert some small values that will be inlined.
+  for (int i = 0; i < 1000; i++) {
+    std::string key = "key" + std::to_string(i);
+    std::string value = rnd.HumanReadableString(50);
+    uint64_t ttl = rnd.Next() % 120 + 1;
+    ASSERT_OK(PutWithTTL(key, value, ttl, &data));
+    if (ttl >= 60) {
+      data_after_compact[key] = value;
+    }
+  }
+  uint64_t num_keys_to_evict = data.size() - data_after_compact.size();
+  ASSERT_OK(blob_db_->Flush(FlushOptions()));
+  uint64_t live_sst_size = blob_db_impl()->TEST_live_sst_size();
+  ASSERT_GT(live_sst_size, 0);
+  VerifyDB(data);
+
+  bdb_options.max_db_size = live_sst_size + 30000;
+  bdb_options.is_fifo = true;
+  Reopen(bdb_options, options);
+  VerifyDB(data);
+
+  // Put two large values, each on a different blob file.
+  std::string large_value(10000, 'v');
+  ASSERT_OK(PutWithTTL("large_key1", large_value, 90));
+  ASSERT_OK(PutWithTTL("large_key2", large_value, 150));
+  ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+  ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+  data["large_key1"] = large_value;
+  data["large_key2"] = large_value;
+  VerifyDB(data);
+
+  // Put a third large value which will bring the DB out of space.
+  // FIFO eviction will evict the file of large_key1.
+  ASSERT_OK(PutWithTTL("large_key3", large_value, 150));
+  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+  ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+  data.erase("large_key1");
+  data["large_key3"] = large_value;
+  VerifyDB(data);
+
+  // Putting some more small values. These values shouldn't be evicted by
+  // compaction filter since they are inserted after FIFO eviction.
+  ASSERT_OK(PutWithTTL("foo", "v", 30, &data_after_compact));
+  ASSERT_OK(PutWithTTL("bar", "v", 30, &data_after_compact));
+
+  // FIFO eviction doesn't trigger again since there enough room for the flush.
+  ASSERT_OK(blob_db_->Flush(FlushOptions()));
+  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+
+  // Manual compact and check if compaction filter evict those keys with
+  // expiration < 60.
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // All keys with expiration < 60, plus large_key1 is filtered by
+  // compaction filter.
+  ASSERT_EQ(num_keys_to_evict + 1,
+            statistics->getTickerCount(BLOB_DB_BLOB_INDEX_EVICTED_COUNT));
+  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
+  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+  data_after_compact["large_key2"] = large_value;
+  data_after_compact["large_key3"] = large_value;
+  VerifyDB(data_after_compact);
+}
+
+TEST_F(BlobDBTest, GarbageCollection) {
+  constexpr size_t kNumPuts = 1 << 10;
+
+  constexpr uint64_t kExpiration = 1000;
+  constexpr uint64_t kCompactTime = 500;
+
+  constexpr uint64_t kKeySize = 7;  // "key" + 4 digits
+
+  constexpr uint64_t kSmallValueSize = 1 << 6;
+  constexpr uint64_t kLargeValueSize = 1 << 8;
+  constexpr uint64_t kMinBlobSize = 1 << 7;
+  static_assert(kSmallValueSize < kMinBlobSize, "");
+  static_assert(kLargeValueSize > kMinBlobSize, "");
+
+  constexpr size_t kBlobsPerFile = 8;
+  constexpr size_t kNumBlobFiles = kNumPuts / kBlobsPerFile;
+  constexpr uint64_t kBlobFileSize =
+      BlobLogHeader::kSize +
+      (BlobLogRecord::kHeaderSize + kKeySize + kLargeValueSize) * kBlobsPerFile;
+
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = kMinBlobSize;
+  bdb_options.blob_file_size = kBlobFileSize;
+  bdb_options.enable_garbage_collection = true;
+  bdb_options.garbage_collection_cutoff = 0.25;
+  bdb_options.disable_background_tasks = true;
+
+  Options options;
+  options.env = mock_env_.get();
+  options.statistics = CreateDBStatistics();
+
+  Open(bdb_options, options);
+
+  std::map<std::string, std::string> data;
+  std::map<std::string, KeyVersion> blob_value_versions;
+  std::map<std::string, BlobIndexVersion> blob_index_versions;
+
+  Random rnd(301);
+
+  // Add a bunch of large non-TTL values. These will be written to non-TTL
+  // blob files and will be subject to GC.
+  for (size_t i = 0; i < kNumPuts; ++i) {
+    std::ostringstream oss;
+    oss << "key" << std::setw(4) << std::setfill('0') << i;
+
+    const std::string key(oss.str());
+    const std::string value = rnd.HumanReadableString(kLargeValueSize);
+    const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+    ASSERT_OK(Put(key, value));
+    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+    data[key] = value;
+    blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
+    blob_index_versions[key] =
+        BlobIndexVersion(key, /* file_number */ (i >> 3) + 1, kNoExpiration,
+                         sequence, kTypeBlobIndex);
+  }
+
+  // Add some small and/or TTL values that will be ignored during GC.
+  // First, add a large TTL value will be written to its own TTL blob file.
+  {
+    const std::string key("key2000");
+    const std::string value = rnd.HumanReadableString(kLargeValueSize);
+    const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+    ASSERT_OK(PutUntil(key, value, kExpiration));
+    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+    data[key] = value;
+    blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
+    blob_index_versions[key] =
+        BlobIndexVersion(key, /* file_number */ kNumBlobFiles + 1, kExpiration,
+                         sequence, kTypeBlobIndex);
+  }
+
+  // Now add a small TTL value (which will be inlined).
+  {
+    const std::string key("key3000");
+    const std::string value = rnd.HumanReadableString(kSmallValueSize);
+    const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+    ASSERT_OK(PutUntil(key, value, kExpiration));
+    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+    data[key] = value;
+    blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
+    blob_index_versions[key] = BlobIndexVersion(
+        key, kInvalidBlobFileNumber, kExpiration, sequence, kTypeBlobIndex);
+  }
+
+  // Finally, add a small non-TTL value (which will be stored as a regular
+  // value).
+  {
+    const std::string key("key4000");
+    const std::string value = rnd.HumanReadableString(kSmallValueSize);
+    const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
+
+    ASSERT_OK(Put(key, value));
+    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
+
+    data[key] = value;
+    blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeValue);
+    blob_index_versions[key] = BlobIndexVersion(
+        key, kInvalidBlobFileNumber, kNoExpiration, sequence, kTypeValue);
+  }
+
+  VerifyDB(data);
+  VerifyBaseDB(blob_value_versions);
+  VerifyBaseDBBlobIndex(blob_index_versions);
+
+  // At this point, we should have 128 immutable non-TTL files with file numbers
+  // 1..128.
+  {
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), kNumBlobFiles);
+    for (size_t i = 0; i < kNumBlobFiles; ++i) {
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+      ASSERT_EQ(live_imm_files[i]->GetFileSize(),
+                kBlobFileSize + BlobLogFooter::kSize);
+    }
+  }
+
+  mock_clock_->SetCurrentTime(kCompactTime);
+
+  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // We expect the data to remain the same and the blobs from the oldest N files
+  // to be moved to new files. Sequence numbers get zeroed out during the
+  // compaction.
+  VerifyDB(data);
+
+  for (auto &pair : blob_value_versions) {
+    KeyVersion &version = pair.second;
+    version.sequence = 0;
+  }
+
+  VerifyBaseDB(blob_value_versions);
+
+  const uint64_t cutoff = static_cast<uint64_t>(
+      bdb_options.garbage_collection_cutoff * kNumBlobFiles);
+  for (auto &pair : blob_index_versions) {
+    BlobIndexVersion &version = pair.second;
+
+    version.sequence = 0;
+
+    if (version.file_number == kInvalidBlobFileNumber) {
+      continue;
+    }
+
+    if (version.file_number > cutoff) {
+      continue;
+    }
+
+    version.file_number += kNumBlobFiles + 1;
+  }
+
+  VerifyBaseDBBlobIndex(blob_index_versions);
+
+  const Statistics *const statistics = options.statistics.get();
+  assert(statistics);
+
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), cutoff);
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_NEW_FILES), cutoff);
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_FAILURES), 0);
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_KEYS_RELOCATED),
+            cutoff * kBlobsPerFile);
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_BYTES_RELOCATED),
+            cutoff * kBlobsPerFile * kLargeValueSize);
+
+  // At this point, we should have 128 immutable non-TTL files with file numbers
+  // 33..128 and 130..161. (129 was taken by the TTL blob file.)
+  {
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), kNumBlobFiles);
+    for (size_t i = 0; i < kNumBlobFiles; ++i) {
+      uint64_t expected_file_number = i + cutoff + 1;
+      if (expected_file_number > kNumBlobFiles) {
+        ++expected_file_number;
+      }
+
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), expected_file_number);
+      ASSERT_EQ(live_imm_files[i]->GetFileSize(),
+                kBlobFileSize + BlobLogFooter::kSize);
+    }
+  }
+}
+
+TEST_F(BlobDBTest, GarbageCollectionFailure) {
+  BlobDBOptions bdb_options;
+  bdb_options.min_blob_size = 0;
+  bdb_options.enable_garbage_collection = true;
+  bdb_options.garbage_collection_cutoff = 1.0;
+  bdb_options.disable_background_tasks = true;
+
+  Options db_options;
+  db_options.statistics = CreateDBStatistics();
+
+  Open(bdb_options, db_options);
+
+  // Write a couple of valid blobs.
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("dead", "beef"));
+
+  // Write a fake blob reference into the base DB that points to a non-existing
+  // blob file.
+  std::string blob_index;
+  BlobIndex::EncodeBlob(&blob_index, /* file_number */ 1000, /* offset */ 1234,
+                        /* size */ 5678, kNoCompression);
+
+  WriteBatch batch;
+  ASSERT_OK(WriteBatchInternal::PutBlobIndex(
+      &batch, blob_db_->DefaultColumnFamily()->GetID(), "key", blob_index));
+  ASSERT_OK(blob_db_->GetRootDB()->Write(WriteOptions(), &batch));
+
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 1);
+  auto blob_file = blob_files[0];
+  ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file));
+
+  ASSERT_TRUE(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+                  .IsIOError());
+
+  const Statistics *const statistics = db_options.statistics.get();
+  assert(statistics);
+
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), 0);
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_NEW_FILES), 1);
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_FAILURES), 1);
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_KEYS_RELOCATED), 2);
+  ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_BYTES_RELOCATED), 7);
+}
+
+// File should be evicted after expiration.
+TEST_F(BlobDBTest, EvictExpiredFile) {
+  BlobDBOptions bdb_options;
+  bdb_options.ttl_range_secs = 100;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = true;
+  Options options;
+  options.env = mock_env_.get();
+  Open(bdb_options, options);
+  mock_clock_->SetCurrentTime(50);
+  std::map<std::string, std::string> data;
+  ASSERT_OK(PutWithTTL("foo", "bar", 100, &data));
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(1, blob_files.size());
+  auto blob_file = blob_files[0];
+  ASSERT_FALSE(blob_file->Immutable());
+  ASSERT_FALSE(blob_file->Obsolete());
+  VerifyDB(data);
+  mock_clock_->SetCurrentTime(250);
+  // The key should expired now.
+  blob_db_impl()->TEST_EvictExpiredFiles();
+  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+  ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size());
+  ASSERT_TRUE(blob_file->Immutable());
+  ASSERT_TRUE(blob_file->Obsolete());
+  blob_db_impl()->TEST_DeleteObsoleteFiles();
+  ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size());
+  ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size());
+  // Make sure we don't return garbage value after blob file being evicted,
+  // but the blob index still exists in the LSM tree.
+  std::string val = "";
+  ASSERT_TRUE(blob_db_->Get(ReadOptions(), "foo", &val).IsNotFound());
+  ASSERT_EQ("", val);
+}
+
+TEST_F(BlobDBTest, DisableFileDeletions) {
+  BlobDBOptions bdb_options;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+  std::map<std::string, std::string> data;
+  for (bool force : {true, false}) {
+    ASSERT_OK(Put("foo", "v", &data));
+    auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+    ASSERT_EQ(1, blob_files.size());
+    auto blob_file = blob_files[0];
+    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file));
+    blob_db_impl()->TEST_ObsoleteBlobFile(blob_file);
+    ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+    ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size());
+    // Call DisableFileDeletions twice.
+    ASSERT_OK(blob_db_->DisableFileDeletions());
+    ASSERT_OK(blob_db_->DisableFileDeletions());
+    // File deletions should be disabled.
+    blob_db_impl()->TEST_DeleteObsoleteFiles();
+    ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+    ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size());
+    VerifyDB(data);
+    // Enable file deletions once. If force=true, file deletion is enabled.
+    // Otherwise it needs to enable it for a second time.
+    ASSERT_OK(blob_db_->EnableFileDeletions(force));
+    blob_db_impl()->TEST_DeleteObsoleteFiles();
+    if (!force) {
+      ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
+      ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size());
+      VerifyDB(data);
+      // Call EnableFileDeletions a second time.
+      ASSERT_OK(blob_db_->EnableFileDeletions(false));
+      blob_db_impl()->TEST_DeleteObsoleteFiles();
+    }
+    // Regardless of value of `force`, file should be deleted by now.
+    ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size());
+    ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size());
+    VerifyDB({});
+  }
+}
+
+TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
+  BlobDBOptions bdb_options;
+  bdb_options.enable_garbage_collection = true;
+  bdb_options.disable_background_tasks = true;
+  Open(bdb_options);
+
+  // Register some dummy blob files.
+  blob_db_impl()->TEST_AddDummyBlobFile(1, /* immutable_sequence */ 200);
+  blob_db_impl()->TEST_AddDummyBlobFile(2, /* immutable_sequence */ 300);
+  blob_db_impl()->TEST_AddDummyBlobFile(3, /* immutable_sequence */ 400);
+  blob_db_impl()->TEST_AddDummyBlobFile(4, /* immutable_sequence */ 500);
+  blob_db_impl()->TEST_AddDummyBlobFile(5, /* immutable_sequence */ 600);
+
+  // Initialize the blob <-> SST file mapping. First, add some SST files with
+  // blob file references, then some without.
+  std::vector<LiveFileMetaData> live_files;
+
+  for (uint64_t i = 1; i <= 10; ++i) {
+    LiveFileMetaData live_file;
+    live_file.file_number = i;
+    live_file.oldest_blob_file_number = ((i - 1) % 5) + 1;
+
+    live_files.emplace_back(live_file);
+  }
+
+  for (uint64_t i = 11; i <= 20; ++i) {
+    LiveFileMetaData live_file;
+    live_file.file_number = i;
+
+    live_files.emplace_back(live_file);
+  }
+
+  blob_db_impl()->TEST_InitializeBlobFileToSstMapping(live_files);
+
+  // Check that the blob <-> SST mappings have been correctly initialized.
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+
+  ASSERT_EQ(blob_files.size(), 5);
+
+  {
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), 5);
+    for (size_t i = 0; i < 5; ++i) {
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+    }
+
+    ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty());
+  }
+
+  {
+    const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+        {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+    const std::vector<bool> expected_obsolete{false, false, false, false,
+                                              false};
+    for (size_t i = 0; i < 5; ++i) {
+      const auto &blob_file = blob_files[i];
+      ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+      ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+    }
+
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), 5);
+    for (size_t i = 0; i < 5; ++i) {
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+    }
+
+    ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty());
+  }
+
+  // Simulate a flush where the SST does not reference any blob files.
+  {
+    FlushJobInfo info{};
+    info.file_number = 21;
+    info.smallest_seqno = 1;
+    info.largest_seqno = 100;
+
+    blob_db_impl()->TEST_ProcessFlushJobInfo(info);
+
+    const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+        {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+    const std::vector<bool> expected_obsolete{false, false, false, false,
+                                              false};
+    for (size_t i = 0; i < 5; ++i) {
+      const auto &blob_file = blob_files[i];
+      ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+      ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+    }
+
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), 5);
+    for (size_t i = 0; i < 5; ++i) {
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+    }
+
+    ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty());
+  }
+
+  // Simulate a flush where the SST references a blob file.
+  {
+    FlushJobInfo info{};
+    info.file_number = 22;
+    info.oldest_blob_file_number = 5;
+    info.smallest_seqno = 101;
+    info.largest_seqno = 200;
+
+    blob_db_impl()->TEST_ProcessFlushJobInfo(info);
+
+    const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+        {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10, 22}};
+    const std::vector<bool> expected_obsolete{false, false, false, false,
+                                              false};
+    for (size_t i = 0; i < 5; ++i) {
+      const auto &blob_file = blob_files[i];
+      ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+      ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+    }
+
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), 5);
+    for (size_t i = 0; i < 5; ++i) {
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 1);
+    }
+
+    ASSERT_TRUE(blob_db_impl()->TEST_GetObsoleteFiles().empty());
+  }
+
+  // Simulate a compaction. Some inputs and outputs have blob file references,
+  // some don't. There is also a trivial move (which means the SST appears on
+  // both the input and the output list). Blob file 1 loses all its linked SSTs,
+  // and since it got marked immutable at sequence number 200 which has already
+  // been flushed, it can be marked obsolete.
+  {
+    CompactionJobInfo info{};
+    info.input_file_infos.emplace_back(CompactionFileInfo{1, 1, 1});
+    info.input_file_infos.emplace_back(CompactionFileInfo{1, 2, 2});
+    info.input_file_infos.emplace_back(CompactionFileInfo{1, 6, 1});
+    info.input_file_infos.emplace_back(
+        CompactionFileInfo{1, 11, kInvalidBlobFileNumber});
+    info.input_file_infos.emplace_back(CompactionFileInfo{1, 22, 5});
+    info.output_file_infos.emplace_back(CompactionFileInfo{2, 22, 5});
+    info.output_file_infos.emplace_back(CompactionFileInfo{2, 23, 3});
+    info.output_file_infos.emplace_back(
+        CompactionFileInfo{2, 24, kInvalidBlobFileNumber});
+
+    blob_db_impl()->TEST_ProcessCompactionJobInfo(info);
+
+    const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+        {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}};
+    const std::vector<bool> expected_obsolete{true, false, false, false, false};
+    for (size_t i = 0; i < 5; ++i) {
+      const auto &blob_file = blob_files[i];
+      ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+      ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+    }
+
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), 4);
+    for (size_t i = 0; i < 4; ++i) {
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 2);
+    }
+
+    auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+    ASSERT_EQ(obsolete_files.size(), 1);
+    ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1);
+  }
+
+  // Simulate a failed compaction. No mappings should be updated.
+  {
+    CompactionJobInfo info{};
+    info.input_file_infos.emplace_back(CompactionFileInfo{1, 7, 2});
+    info.input_file_infos.emplace_back(CompactionFileInfo{2, 22, 5});
+    info.output_file_infos.emplace_back(CompactionFileInfo{2, 25, 3});
+    info.status = Status::Corruption();
+
+    blob_db_impl()->TEST_ProcessCompactionJobInfo(info);
+
+    const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+        {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}};
+    const std::vector<bool> expected_obsolete{true, false, false, false, false};
+    for (size_t i = 0; i < 5; ++i) {
+      const auto &blob_file = blob_files[i];
+      ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+      ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+    }
+
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), 4);
+    for (size_t i = 0; i < 4; ++i) {
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 2);
+    }
+
+    auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+    ASSERT_EQ(obsolete_files.size(), 1);
+    ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1);
+  }
+
+  // Simulate another compaction. Blob file 2 loses all its linked SSTs
+  // but since it got marked immutable at sequence number 300 which hasn't
+  // been flushed yet, it cannot be marked obsolete at this point.
+  {
+    CompactionJobInfo info{};
+    info.input_file_infos.emplace_back(CompactionFileInfo{1, 7, 2});
+    info.input_file_infos.emplace_back(CompactionFileInfo{2, 22, 5});
+    info.output_file_infos.emplace_back(CompactionFileInfo{2, 25, 3});
+
+    blob_db_impl()->TEST_ProcessCompactionJobInfo(info);
+
+    const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+        {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}};
+    const std::vector<bool> expected_obsolete{true, false, false, false, false};
+    for (size_t i = 0; i < 5; ++i) {
+      const auto &blob_file = blob_files[i];
+      ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+      ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+    }
+
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), 4);
+    for (size_t i = 0; i < 4; ++i) {
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 2);
+    }
+
+    auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+    ASSERT_EQ(obsolete_files.size(), 1);
+    ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1);
+  }
+
+  // Simulate a flush with largest sequence number 300. This will make it
+  // possible to mark blob file 2 obsolete.
+  {
+    FlushJobInfo info{};
+    info.file_number = 26;
+    info.smallest_seqno = 201;
+    info.largest_seqno = 300;
+
+    blob_db_impl()->TEST_ProcessFlushJobInfo(info);
+
+    const std::vector<std::unordered_set<uint64_t>> expected_sst_files{
+        {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}};
+    const std::vector<bool> expected_obsolete{true, true, false, false, false};
+    for (size_t i = 0; i < 5; ++i) {
+      const auto &blob_file = blob_files[i];
+      ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
+      ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
+    }
+
+    auto live_imm_files = blob_db_impl()->TEST_GetLiveImmNonTTLFiles();
+    ASSERT_EQ(live_imm_files.size(), 3);
+    for (size_t i = 0; i < 3; ++i) {
+      ASSERT_EQ(live_imm_files[i]->BlobFileNumber(), i + 3);
+    }
+
+    auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
+    ASSERT_EQ(obsolete_files.size(), 2);
+    ASSERT_EQ(obsolete_files[0]->BlobFileNumber(), 1);
+    ASSERT_EQ(obsolete_files[1]->BlobFileNumber(), 2);
+  }
+}
+
+TEST_F(BlobDBTest, ShutdownWait) {
+  BlobDBOptions bdb_options;
+  bdb_options.ttl_range_secs = 100;
+  bdb_options.min_blob_size = 0;
+  bdb_options.disable_background_tasks = false;
+  Options options;
+  options.env = mock_env_.get();
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"BlobDBImpl::EvictExpiredFiles:0", "BlobDBTest.ShutdownWait:0"},
+      {"BlobDBTest.ShutdownWait:1", "BlobDBImpl::EvictExpiredFiles:1"},
+      {"BlobDBImpl::EvictExpiredFiles:2", "BlobDBTest.ShutdownWait:2"},
+      {"BlobDBTest.ShutdownWait:3", "BlobDBImpl::EvictExpiredFiles:3"},
+  });
+  // Force all tasks to be scheduled immediately.
+  SyncPoint::GetInstance()->SetCallBack(
+      "TimeQueue::Add:item.end", [&](void *arg) {
+        std::chrono::steady_clock::time_point *tp =
+            static_cast<std::chrono::steady_clock::time_point *>(arg);
+        *tp =
+            std::chrono::steady_clock::now() - std::chrono::milliseconds(10000);
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobDBImpl::EvictExpiredFiles:cb", [&](void * /*arg*/) {
+        // Sleep 3 ms to increase the chance of data race.
+        // We've synced up the code so that EvictExpiredFiles()
+        // is called concurrently with ~BlobDBImpl().
+        // ~BlobDBImpl() is supposed to wait for all background
+        // task to shutdown before doing anything else. In order
+        // to use the same test to reproduce a bug of the waiting
+        // logic, we wait a little bit here, so that TSAN can
+        // catch the data race.
+        // We should improve the test if we find a better way.
+        Env::Default()->SleepForMicroseconds(3000);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Open(bdb_options, options);
+  mock_clock_->SetCurrentTime(50);
+  std::map<std::string, std::string> data;
+  ASSERT_OK(PutWithTTL("foo", "bar", 100, &data));
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(1, blob_files.size());
+  auto blob_file = blob_files[0];
+  ASSERT_FALSE(blob_file->Immutable());
+  ASSERT_FALSE(blob_file->Obsolete());
+  VerifyDB(data);
+
+  TEST_SYNC_POINT("BlobDBTest.ShutdownWait:0");
+  mock_clock_->SetCurrentTime(250);
+  // The key should expired now.
+  TEST_SYNC_POINT("BlobDBTest.ShutdownWait:1");
+
+  TEST_SYNC_POINT("BlobDBTest.ShutdownWait:2");
+  TEST_SYNC_POINT("BlobDBTest.ShutdownWait:3");
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(BlobDBTest, SyncBlobFileBeforeClose) {
+  Options options;
+  options.statistics = CreateDBStatistics();
+
+  BlobDBOptions blob_options;
+  blob_options.min_blob_size = 0;
+  blob_options.bytes_per_sync = 1 << 20;
+  blob_options.disable_background_tasks = true;
+
+  Open(blob_options, options);
+
+  ASSERT_OK(Put("foo", "bar"));
+
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 1);
+
+  ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
+  ASSERT_EQ(options.statistics->getTickerCount(BLOB_DB_BLOB_FILE_SYNCED), 1);
+}
+
+TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) {
+  Options options;
+  options.env = fault_injection_env_.get();
+
+  BlobDBOptions blob_options;
+  blob_options.min_blob_size = 0;
+  blob_options.bytes_per_sync = 1 << 20;
+  blob_options.disable_background_tasks = true;
+
+  Open(blob_options, options);
+
+  ASSERT_OK(Put("foo", "bar"));
+
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 1);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlobLogWriter::Sync", [this](void * /* arg */) {
+        fault_injection_env_->SetFilesystemActive(false, Status::IOError());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  const Status s = blob_db_impl()->TEST_CloseBlobFile(blob_files[0]);
+
+  fault_injection_env_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_TRUE(s.IsIOError());
+}
+
+}  //  namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+
+// A black-box test for the ttl wrapper around rocksdb
+int main(int argc, char **argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as BlobDB is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_dump_tool.cc b/src/rocksdb/utilities/blob_db/blob_dump_tool.cc
new file mode 100644
index 000000000..1e0632990
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_dump_tool.cc
@@ -0,0 +1,282 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_dump_tool.h"
+
+#include <stdio.h>
+
+#include <cinttypes>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "file/random_access_file_reader.h"
+#include "file/readahead_raf.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/file_system.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+BlobDumpTool::BlobDumpTool()
+    : reader_(nullptr), buffer_(nullptr), buffer_size_(0) {}
+
+Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
+                         DisplayType show_blob,
+                         DisplayType show_uncompressed_blob,
+                         bool show_summary) {
+  constexpr size_t kReadaheadSize = 2 * 1024 * 1024;
+  Status s;
+  const auto fs = FileSystem::Default();
+  IOOptions io_opts;
+  s = fs->FileExists(filename, io_opts, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  uint64_t file_size = 0;
+  s = fs->GetFileSize(filename, io_opts, &file_size, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  std::unique_ptr<FSRandomAccessFile> file;
+  s = fs->NewRandomAccessFile(filename, FileOptions(), &file, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  file = NewReadaheadRandomAccessFile(std::move(file), kReadaheadSize);
+  if (file_size == 0) {
+    return Status::Corruption("File is empty.");
+  }
+  reader_.reset(new RandomAccessFileReader(std::move(file), filename));
+  uint64_t offset = 0;
+  uint64_t footer_offset = 0;
+  CompressionType compression = kNoCompression;
+  s = DumpBlobLogHeader(&offset, &compression);
+  if (!s.ok()) {
+    return s;
+  }
+  s = DumpBlobLogFooter(file_size, &footer_offset);
+  if (!s.ok()) {
+    return s;
+  }
+  uint64_t total_records = 0;
+  uint64_t total_key_size = 0;
+  uint64_t total_blob_size = 0;
+  uint64_t total_uncompressed_blob_size = 0;
+  if (show_key != DisplayType::kNone || show_summary) {
+    while (offset < footer_offset) {
+      s = DumpRecord(show_key, show_blob, show_uncompressed_blob, show_summary,
+                     compression, &offset, &total_records, &total_key_size,
+                     &total_blob_size, &total_uncompressed_blob_size);
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  if (show_summary) {
+    fprintf(stdout, "Summary:\n");
+    fprintf(stdout, "  total records: %" PRIu64 "\n", total_records);
+    fprintf(stdout, "  total key size: %" PRIu64 "\n", total_key_size);
+    fprintf(stdout, "  total blob size: %" PRIu64 "\n", total_blob_size);
+    if (compression != kNoCompression) {
+      fprintf(stdout, "  total raw blob size: %" PRIu64 "\n",
+              total_uncompressed_blob_size);
+    }
+  }
+  return s;
+}
+
+Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) {
+  if (buffer_size_ < size) {
+    if (buffer_size_ == 0) {
+      buffer_size_ = 4096;
+    }
+    while (buffer_size_ < size) {
+      buffer_size_ *= 2;
+    }
+    buffer_.reset(new char[buffer_size_]);
+  }
+  Status s = reader_->Read(IOOptions(), offset, size, result, buffer_.get(),
+                           nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+  if (!s.ok()) {
+    return s;
+  }
+  if (result->size() != size) {
+    return Status::Corruption("Reach the end of the file unexpectedly.");
+  }
+  return s;
+}
+
+Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset,
+                                       CompressionType* compression) {
+  Slice slice;
+  Status s = Read(0, BlobLogHeader::kSize, &slice);
+  if (!s.ok()) {
+    return s;
+  }
+  BlobLogHeader header;
+  s = header.DecodeFrom(slice);
+  if (!s.ok()) {
+    return s;
+  }
+  fprintf(stdout, "Blob log header:\n");
+  fprintf(stdout, "  Version          : %" PRIu32 "\n", header.version);
+  fprintf(stdout, "  Column Family ID : %" PRIu32 "\n",
+          header.column_family_id);
+  std::string compression_str;
+  if (!GetStringFromCompressionType(&compression_str, header.compression)
+           .ok()) {
+    compression_str = "Unrecongnized compression type (" +
+                      std::to_string((int)header.compression) + ")";
+  }
+  fprintf(stdout, "  Compression      : %s\n", compression_str.c_str());
+  fprintf(stdout, "  Expiration range : %s\n",
+          GetString(header.expiration_range).c_str());
+  *offset = BlobLogHeader::kSize;
+  *compression = header.compression;
+  return s;
+}
+
+Status BlobDumpTool::DumpBlobLogFooter(uint64_t file_size,
+                                       uint64_t* footer_offset) {
+  auto no_footer = [&]() {
+    *footer_offset = file_size;
+    fprintf(stdout, "No blob log footer.\n");
+    return Status::OK();
+  };
+  if (file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) {
+    return no_footer();
+  }
+  Slice slice;
+  *footer_offset = file_size - BlobLogFooter::kSize;
+  Status s = Read(*footer_offset, BlobLogFooter::kSize, &slice);
+  if (!s.ok()) {
+    return s;
+  }
+  BlobLogFooter footer;
+  s = footer.DecodeFrom(slice);
+  if (!s.ok()) {
+    return no_footer();
+  }
+  fprintf(stdout, "Blob log footer:\n");
+  fprintf(stdout, "  Blob count       : %" PRIu64 "\n", footer.blob_count);
+  fprintf(stdout, "  Expiration Range : %s\n",
+          GetString(footer.expiration_range).c_str());
+  return s;
+}
+
+Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
+                                DisplayType show_uncompressed_blob,
+                                bool show_summary, CompressionType compression,
+                                uint64_t* offset, uint64_t* total_records,
+                                uint64_t* total_key_size,
+                                uint64_t* total_blob_size,
+                                uint64_t* total_uncompressed_blob_size) {
+  if (show_key != DisplayType::kNone) {
+    fprintf(stdout, "Read record with offset 0x%" PRIx64 " (%" PRIu64 "):\n",
+            *offset, *offset);
+  }
+  Slice slice;
+  Status s = Read(*offset, BlobLogRecord::kHeaderSize, &slice);
+  if (!s.ok()) {
+    return s;
+  }
+  BlobLogRecord record;
+  s = record.DecodeHeaderFrom(slice);
+  if (!s.ok()) {
+    return s;
+  }
+  uint64_t key_size = record.key_size;
+  uint64_t value_size = record.value_size;
+  if (show_key != DisplayType::kNone) {
+    fprintf(stdout, "  key size   : %" PRIu64 "\n", key_size);
+    fprintf(stdout, "  value size : %" PRIu64 "\n", value_size);
+    fprintf(stdout, "  expiration : %" PRIu64 "\n", record.expiration);
+  }
+  *offset += BlobLogRecord::kHeaderSize;
+  s = Read(*offset, static_cast<size_t>(key_size + value_size), &slice);
+  if (!s.ok()) {
+    return s;
+  }
+  // Decompress value
+  std::string uncompressed_value;
+  if (compression != kNoCompression &&
+      (show_uncompressed_blob != DisplayType::kNone || show_summary)) {
+    BlockContents contents;
+    UncompressionContext context(compression);
+    UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
+                           compression);
+    s = UncompressBlockData(
+        info, slice.data() + key_size, static_cast<size_t>(value_size),
+        &contents, 2 /*compress_format_version*/, ImmutableOptions(Options()));
+    if (!s.ok()) {
+      return s;
+    }
+    uncompressed_value = contents.data.ToString();
+  }
+  if (show_key != DisplayType::kNone) {
+    fprintf(stdout, "  key        : ");
+    DumpSlice(Slice(slice.data(), static_cast<size_t>(key_size)), show_key);
+    if (show_blob != DisplayType::kNone) {
+      fprintf(stdout, "  blob       : ");
+      DumpSlice(Slice(slice.data() + static_cast<size_t>(key_size),
+                      static_cast<size_t>(value_size)),
+                show_blob);
+    }
+    if (show_uncompressed_blob != DisplayType::kNone) {
+      fprintf(stdout, "  raw blob   : ");
+      DumpSlice(Slice(uncompressed_value), show_uncompressed_blob);
+    }
+  }
+  *offset += key_size + value_size;
+  *total_records += 1;
+  *total_key_size += key_size;
+  *total_blob_size += value_size;
+  *total_uncompressed_blob_size += uncompressed_value.size();
+  return s;
+}
+
+void BlobDumpTool::DumpSlice(const Slice s, DisplayType type) {
+  if (type == DisplayType::kRaw) {
+    fprintf(stdout, "%s\n", s.ToString().c_str());
+  } else if (type == DisplayType::kHex) {
+    fprintf(stdout, "%s\n", s.ToString(true /*hex*/).c_str());
+  } else if (type == DisplayType::kDetail) {
+    char buf[100];
+    for (size_t i = 0; i < s.size(); i += 16) {
+      memset(buf, 0, sizeof(buf));
+      for (size_t j = 0; j < 16 && i + j < s.size(); j++) {
+        unsigned char c = s[i + j];
+        snprintf(buf + j * 3 + 15, 2, "%x", c >> 4);
+        snprintf(buf + j * 3 + 16, 2, "%x", c & 0xf);
+        snprintf(buf + j + 65, 2, "%c", (0x20 <= c && c <= 0x7e) ? c : '.');
+      }
+      for (size_t p = 0; p + 1 < sizeof(buf); p++) {
+        if (buf[p] == 0) {
+          buf[p] = ' ';
+        }
+      }
+      fprintf(stdout, "%s\n", i == 0 ? buf + 15 : buf);
+    }
+  }
+}
+
+template <class T>
+std::string BlobDumpTool::GetString(std::pair<T, T> p) {
+  if (p.first == 0 && p.second == 0) {
+    return "nil";
+  }
+  return "(" + std::to_string(p.first) + ", " + std::to_string(p.second) + ")";
+}
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_dump_tool.h b/src/rocksdb/utilities/blob_db/blob_dump_tool.h
new file mode 100644
index 000000000..bece564e1
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_dump_tool.h
@@ -0,0 +1,58 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "db/blob/blob_log_format.h"
+#include "file/random_access_file_reader.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+class BlobDumpTool {
+ public:
+  enum class DisplayType {
+    kNone,
+    kRaw,
+    kHex,
+    kDetail,
+  };
+
+  BlobDumpTool();
+
+  Status Run(const std::string& filename, DisplayType show_key,
+             DisplayType show_blob, DisplayType show_uncompressed_blob,
+             bool show_summary);
+
+ private:
+  std::unique_ptr<RandomAccessFileReader> reader_;
+  std::unique_ptr<char[]> buffer_;
+  size_t buffer_size_;
+
+  Status Read(uint64_t offset, size_t size, Slice* result);
+  Status DumpBlobLogHeader(uint64_t* offset, CompressionType* compression);
+  Status DumpBlobLogFooter(uint64_t file_size, uint64_t* footer_offset);
+  Status DumpRecord(DisplayType show_key, DisplayType show_blob,
+                    DisplayType show_uncompressed_blob, bool show_summary,
+                    CompressionType compression, uint64_t* offset,
+                    uint64_t* total_records, uint64_t* total_key_size,
+                    uint64_t* total_blob_size,
+                    uint64_t* total_uncompressed_blob_size);
+  void DumpSlice(const Slice s, DisplayType type);
+
+  template <class T>
+  std::string GetString(std::pair<T, T> p);
+};
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_file.cc b/src/rocksdb/utilities/blob_db/blob_file.cc
new file mode 100644
index 000000000..c68e557c6
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_file.cc
@@ -0,0 +1,318 @@
+
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+#include "utilities/blob_db/blob_file.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "file/filename.h"
+#include "file/readahead_raf.h"
+#include "logging/logging.h"
+#include "utilities/blob_db/blob_db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace blob_db {
+
+BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn,
+                   Logger* info_log)
+    : parent_(p), path_to_dir_(bdir), file_number_(fn), info_log_(info_log) {}
+
+BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn,
+                   Logger* info_log, uint32_t column_family_id,
+                   CompressionType compression, bool has_ttl,
+                   const ExpirationRange& expiration_range)
+    : parent_(p),
+      path_to_dir_(bdir),
+      file_number_(fn),
+      info_log_(info_log),
+      column_family_id_(column_family_id),
+      compression_(compression),
+      has_ttl_(has_ttl),
+      expiration_range_(expiration_range),
+      header_(column_family_id, compression, has_ttl, expiration_range),
+      header_valid_(true) {}
+
+BlobFile::~BlobFile() {
+  if (obsolete_) {
+    std::string pn(PathName());
+    Status s = Env::Default()->DeleteFile(PathName());
+    if (!s.ok()) {
+      // ROCKS_LOG_INFO(db_options_.info_log,
+      // "File could not be deleted %s", pn.c_str());
+    }
+  }
+}
+
+uint32_t BlobFile::GetColumnFamilyId() const { return column_family_id_; }
+
+std::string BlobFile::PathName() const {
+  return BlobFileName(path_to_dir_, file_number_);
+}
+
+std::string BlobFile::DumpState() const {
+  char str[1000];
+  snprintf(
+      str, sizeof(str),
+      "path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " file_size: %" PRIu64
+      " closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64
+      "), writer: %d reader: %d",
+      path_to_dir_.c_str(), file_number_, blob_count_.load(), file_size_.load(),
+      closed_.load(), obsolete_.load(), expiration_range_.first,
+      expiration_range_.second, (!!log_writer_), (!!ra_file_reader_));
+  return str;
+}
+
+void BlobFile::MarkObsolete(SequenceNumber sequence) {
+  assert(Immutable());
+  obsolete_sequence_ = sequence;
+  obsolete_.store(true);
+}
+
+Status BlobFile::WriteFooterAndCloseLocked(SequenceNumber sequence) {
+  BlobLogFooter footer;
+  footer.blob_count = blob_count_;
+  if (HasTTL()) {
+    footer.expiration_range = expiration_range_;
+  }
+
+  // this will close the file and reset the Writable File Pointer.
+  Status s = log_writer_->AppendFooter(footer, /* checksum_method */ nullptr,
+                                       /* checksum_value */ nullptr);
+  if (s.ok()) {
+    closed_ = true;
+    immutable_sequence_ = sequence;
+    file_size_ += BlobLogFooter::kSize;
+  }
+  // delete the sequential writer
+  log_writer_.reset();
+  return s;
+}
+
+Status BlobFile::ReadFooter(BlobLogFooter* bf) {
+  if (file_size_ < (BlobLogHeader::kSize + BlobLogFooter::kSize)) {
+    return Status::IOError("File does not have footer", PathName());
+  }
+
+  uint64_t footer_offset = file_size_ - BlobLogFooter::kSize;
+  // assume that ra_file_reader_ is valid before we enter this
+  assert(ra_file_reader_);
+
+  Slice result;
+  std::string buf;
+  AlignedBuf aligned_buf;
+  Status s;
+  // TODO: rate limit reading footers from blob files.
+  if (ra_file_reader_->use_direct_io()) {
+    s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize,
+                              &result, nullptr, &aligned_buf,
+                              Env::IO_TOTAL /* rate_limiter_priority */);
+  } else {
+    buf.reserve(BlobLogFooter::kSize + 10);
+    s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize,
+                              &result, &buf[0], nullptr,
+                              Env::IO_TOTAL /* rate_limiter_priority */);
+  }
+  if (!s.ok()) return s;
+  if (result.size() != BlobLogFooter::kSize) {
+    // should not happen
+    return Status::IOError("EOF reached before footer");
+  }
+
+  s = bf->DecodeFrom(result);
+  return s;
+}
+
+Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) {
+  blob_count_ = footer.blob_count;
+  expiration_range_ = footer.expiration_range;
+  closed_ = true;
+  return Status::OK();
+}
+
+Status BlobFile::Fsync() {
+  Status s;
+  if (log_writer_.get()) {
+    s = log_writer_->Sync();
+  }
+  return s;
+}
+
+void BlobFile::CloseRandomAccessLocked() {
+  ra_file_reader_.reset();
+  last_access_ = -1;
+}
+
+Status BlobFile::GetReader(Env* env, const FileOptions& file_options,
+                           std::shared_ptr<RandomAccessFileReader>* reader,
+                           bool* fresh_open) {
+  assert(reader != nullptr);
+  assert(fresh_open != nullptr);
+  *fresh_open = false;
+  int64_t current_time = 0;
+  if (env->GetCurrentTime(&current_time).ok()) {
+    last_access_.store(current_time);
+  }
+  Status s;
+
+  {
+    ReadLock lockbfile_r(&mutex_);
+    if (ra_file_reader_) {
+      *reader = ra_file_reader_;
+      return s;
+    }
+  }
+
+  WriteLock lockbfile_w(&mutex_);
+  // Double check.
+  if (ra_file_reader_) {
+    *reader = ra_file_reader_;
+    return s;
+  }
+
+  std::unique_ptr<FSRandomAccessFile> rfile;
+  s = env->GetFileSystem()->NewRandomAccessFile(PathName(), file_options,
+                                                &rfile, nullptr);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "Failed to open blob file for random-read: %s status: '%s'"
+                    " exists: '%s'",
+                    PathName().c_str(), s.ToString().c_str(),
+                    env->FileExists(PathName()).ToString().c_str());
+    return s;
+  }
+
+  ra_file_reader_ =
+      std::make_shared<RandomAccessFileReader>(std::move(rfile), PathName());
+  *reader = ra_file_reader_;
+  *fresh_open = true;
+  return s;
+}
+
+Status BlobFile::ReadMetadata(const std::shared_ptr<FileSystem>& fs,
+                              const FileOptions& file_options) {
+  assert(Immutable());
+  // Get file size.
+  uint64_t file_size = 0;
+  Status s =
+      fs->GetFileSize(PathName(), file_options.io_options, &file_size, nullptr);
+  if (s.ok()) {
+    file_size_ = file_size;
+  } else {
+    ROCKS_LOG_ERROR(info_log_,
+                    "Failed to get size of blob file %" PRIu64 ", status: %s",
+                    file_number_, s.ToString().c_str());
+    return s;
+  }
+  if (file_size < BlobLogHeader::kSize) {
+    ROCKS_LOG_ERROR(
+        info_log_, "Incomplete blob file blob file %" PRIu64 ", size: %" PRIu64,
+        file_number_, file_size);
+    return Status::Corruption("Incomplete blob file header.");
+  }
+
+  // Create file reader.
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  s = RandomAccessFileReader::Create(fs, PathName(), file_options, &file_reader,
+                                     nullptr);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "Failed to open blob file %" PRIu64 ", status: %s",
+                    file_number_, s.ToString().c_str());
+    return s;
+  }
+
+  // Read file header.
+  std::string header_buf;
+  AlignedBuf aligned_buf;
+  Slice header_slice;
+  // TODO: rate limit reading headers from blob files.
+  if (file_reader->use_direct_io()) {
+    s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice,
+                          nullptr, &aligned_buf,
+                          Env::IO_TOTAL /* rate_limiter_priority */);
+  } else {
+    header_buf.reserve(BlobLogHeader::kSize);
+    s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice,
+                          &header_buf[0], nullptr,
+                          Env::IO_TOTAL /* rate_limiter_priority */);
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        info_log_, "Failed to read header of blob file %" PRIu64 ", status: %s",
+        file_number_, s.ToString().c_str());
+    return s;
+  }
+  BlobLogHeader header;
+  s = header.DecodeFrom(header_slice);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "Failed to decode header of blob file %" PRIu64
+                    ", status: %s",
+                    file_number_, s.ToString().c_str());
+    return s;
+  }
+  column_family_id_ = header.column_family_id;
+  compression_ = header.compression;
+  has_ttl_ = header.has_ttl;
+  if (has_ttl_) {
+    expiration_range_ = header.expiration_range;
+  }
+  header_valid_ = true;
+
+  // Read file footer.
+  if (file_size_ < BlobLogHeader::kSize + BlobLogFooter::kSize) {
+    // OK not to have footer.
+    assert(!footer_valid_);
+    return Status::OK();
+  }
+  std::string footer_buf;
+  Slice footer_slice;
+  // TODO: rate limit reading footers from blob files.
+  if (file_reader->use_direct_io()) {
+    s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize,
+                          BlobLogFooter::kSize, &footer_slice, nullptr,
+                          &aligned_buf,
+                          Env::IO_TOTAL /* rate_limiter_priority */);
+  } else {
+    footer_buf.reserve(BlobLogFooter::kSize);
+    s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize,
+                          BlobLogFooter::kSize, &footer_slice, &footer_buf[0],
+                          nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+  }
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        info_log_, "Failed to read footer of blob file %" PRIu64 ", status: %s",
+        file_number_, s.ToString().c_str());
+    return s;
+  }
+  BlobLogFooter footer;
+  s = footer.DecodeFrom(footer_slice);
+  if (!s.ok()) {
+    // OK not to have footer.
+    assert(!footer_valid_);
+    return Status::OK();
+  }
+  blob_count_ = footer.blob_count;
+  if (has_ttl_) {
+    assert(header.expiration_range.first <= footer.expiration_range.first);
+    assert(header.expiration_range.second >= footer.expiration_range.second);
+    expiration_range_ = footer.expiration_range;
+  }
+  footer_valid_ = true;
+  return Status::OK();
+}
+
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/blob_db/blob_file.h b/src/rocksdb/utilities/blob_db/blob_file.h
new file mode 100644
index 000000000..6f3f2bea7
--- /dev/null
+++ b/src/rocksdb/utilities/blob_db/blob_file.h
@@ -0,0 +1,246 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <limits>
+#include <memory>
+#include <unordered_set>
+
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "file/random_access_file_reader.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace blob_db {
+
+class BlobDBImpl;
+
+class BlobFile {
+  friend class BlobDBImpl;
+  friend struct BlobFileComparator;
+  friend struct BlobFileComparatorTTL;
+  friend class BlobIndexCompactionFilterBase;
+  friend class BlobIndexCompactionFilterGC;
+
+ private:
+  // access to parent
+  const BlobDBImpl* parent_{nullptr};
+
+  // path to blob directory
+  std::string path_to_dir_;
+
+  // the id of the file.
+  // the above 2 are created during file creation and never changed
+  // after that
+  uint64_t file_number_{0};
+
+  // The file numbers of the SST files whose oldest blob file reference
+  // points to this blob file.
+  std::unordered_set<uint64_t> linked_sst_files_;
+
+  // Info log.
+  Logger* info_log_{nullptr};
+
+  // Column family id.
+  uint32_t column_family_id_{std::numeric_limits<uint32_t>::max()};
+
+  // Compression type of blobs in the file
+  CompressionType compression_{kNoCompression};
+
+  // If true, the keys in this file all has TTL. Otherwise all keys don't
+  // have TTL.
+  bool has_ttl_{false};
+
+  // TTL range of blobs in the file.
+  ExpirationRange expiration_range_;
+
+  // number of blobs in the file
+  std::atomic<uint64_t> blob_count_{0};
+
+  // size of the file
+  std::atomic<uint64_t> file_size_{0};
+
+  BlobLogHeader header_;
+
+  // closed_ = true implies the file is no more mutable
+  // no more blobs will be appended and the footer has been written out
+  std::atomic<bool> closed_{false};
+
+  // The latest sequence number when the file was closed/made immutable.
+  SequenceNumber immutable_sequence_{0};
+
+  // Whether the file was marked obsolete (due to either TTL or GC).
+  // obsolete_ still needs to do iterator/snapshot checks
+  std::atomic<bool> obsolete_{false};
+
+  // The last sequence number by the time the file marked as obsolete.
+  // Data in this file is visible to a snapshot taken before the sequence.
+  SequenceNumber obsolete_sequence_{0};
+
+  // Sequential/Append writer for blobs
+  std::shared_ptr<BlobLogWriter> log_writer_;
+
+  // random access file reader for GET calls
+  std::shared_ptr<RandomAccessFileReader> ra_file_reader_;
+
+  // This Read-Write mutex is per file specific and protects
+  // all the datastructures
+  mutable port::RWMutex mutex_;
+
+  // time when the random access reader was last created.
+  std::atomic<std::int64_t> last_access_{-1};
+
+  bool header_valid_{false};
+
+  bool footer_valid_{false};
+
+ public:
+  BlobFile() = default;
+
+  BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum,
+           Logger* info_log);
+
+  BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum,
+           Logger* info_log, uint32_t column_family_id,
+           CompressionType compression, bool has_ttl,
+           const ExpirationRange& expiration_range);
+
+  ~BlobFile();
+
+  uint32_t GetColumnFamilyId() const;
+
+  // Returns log file's absolute pathname.
+  std::string PathName() const;
+
+  // Primary identifier for blob file.
+  // once the file is created, this never changes
+  uint64_t BlobFileNumber() const { return file_number_; }
+
+  // Get the set of SST files whose oldest blob file reference points to
+  // this file.
+  const std::unordered_set<uint64_t>& GetLinkedSstFiles() const {
+    return linked_sst_files_;
+  }
+
+  // Link an SST file whose oldest blob file reference points to this file.
+  void LinkSstFile(uint64_t sst_file_number) {
+    assert(linked_sst_files_.find(sst_file_number) == linked_sst_files_.end());
+    linked_sst_files_.insert(sst_file_number);
+  }
+
+  // Unlink an SST file whose oldest blob file reference points to this file.
+  void UnlinkSstFile(uint64_t sst_file_number) {
+    auto it = linked_sst_files_.find(sst_file_number);
+    assert(it != linked_sst_files_.end());
+    linked_sst_files_.erase(it);
+  }
+
+  // the following functions are atomic, and don't need
+  // read lock
+  uint64_t BlobCount() const {
+    return blob_count_.load(std::memory_order_acquire);
+  }
+
+  std::string DumpState() const;
+
+  // if the file is not taking any more appends.
+  bool Immutable() const { return closed_.load(); }
+
+  // Mark the file as immutable.
+  // REQUIRES: write lock held, or access from single thread (on DB open).
+  void MarkImmutable(SequenceNumber sequence) {
+    closed_ = true;
+    immutable_sequence_ = sequence;
+  }
+
+  SequenceNumber GetImmutableSequence() const {
+    assert(Immutable());
+    return immutable_sequence_;
+  }
+
+  // Whether the file was marked obsolete (due to either TTL or GC).
+  bool Obsolete() const {
+    assert(Immutable() || !obsolete_.load());
+    return obsolete_.load();
+  }
+
+  // Mark file as obsolete (due to either TTL or GC). The file is not visible to
+  // snapshots with sequence greater or equal to the given sequence.
+  void MarkObsolete(SequenceNumber sequence);
+
+  SequenceNumber GetObsoleteSequence() const {
+    assert(Obsolete());
+    return obsolete_sequence_;
+  }
+
+  Status Fsync();
+
+  uint64_t GetFileSize() const {
+    return file_size_.load(std::memory_order_acquire);
+  }
+
+  // All Get functions which are not atomic, will need ReadLock on the mutex
+
+  const ExpirationRange& GetExpirationRange() const {
+    return expiration_range_;
+  }
+
+  void ExtendExpirationRange(uint64_t expiration) {
+    expiration_range_.first = std::min(expiration_range_.first, expiration);
+    expiration_range_.second = std::max(expiration_range_.second, expiration);
+  }
+
+  bool HasTTL() const { return has_ttl_; }
+
+  void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; }
+
+  CompressionType GetCompressionType() const { return compression_; }
+
+  std::shared_ptr<BlobLogWriter> GetWriter() const { return log_writer_; }
+
+  // Read blob file header and footer. Return corruption if file header is
+  // malform or incomplete. If footer is malform or incomplete, set
+  // footer_valid_ to false and return Status::OK.
+  Status ReadMetadata(const std::shared_ptr<FileSystem>& fs,
+                      const FileOptions& file_options);
+
+  Status GetReader(Env* env, const FileOptions& file_options,
+                   std::shared_ptr<RandomAccessFileReader>* reader,
+                   bool* fresh_open);
+
+ private:
+  Status ReadFooter(BlobLogFooter* footer);
+
+  Status WriteFooterAndCloseLocked(SequenceNumber sequence);
+
+  void CloseRandomAccessLocked();
+
+  // this is used, when you are reading only the footer of a
+  // previously closed file
+  Status SetFromFooterLocked(const BlobLogFooter& footer);
+
+  void set_expiration_range(const ExpirationRange& expiration_range) {
+    expiration_range_ = expiration_range;
+  }
+
+  // The following functions are atomic, and don't need locks
+  void SetFileSize(uint64_t fs) { file_size_ = fs; }
+
+  void SetBlobCount(uint64_t bc) { blob_count_ = bc; }
+
+  void BlobRecordAdded(uint64_t record_size) {
+    ++blob_count_;
+    file_size_ += record_size;
+  }
+};
+}  // namespace blob_db
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/cache_dump_load.cc b/src/rocksdb/utilities/cache_dump_load.cc
new file mode 100644
index 000000000..9a7c76798
--- /dev/null
+++ b/src/rocksdb/utilities/cache_dump_load.cc
@@ -0,0 +1,69 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/cache_dump_load.h"
+
+#include "file/writable_file_writer.h"
+#include "port/lang.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "table/format.h"
+#include "util/crc32c.h"
+#include "utilities/cache_dump_load_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+IOStatus NewToFileCacheDumpWriter(const std::shared_ptr<FileSystem>& fs,
+                                  const FileOptions& file_opts,
+                                  const std::string& file_name,
+                                  std::unique_ptr<CacheDumpWriter>* writer) {
+  std::unique_ptr<WritableFileWriter> file_writer;
+  IOStatus io_s = WritableFileWriter::Create(fs, file_name, file_opts,
+                                             &file_writer, nullptr);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  writer->reset(new ToFileCacheDumpWriter(std::move(file_writer)));
+  return io_s;
+}
+
+IOStatus NewFromFileCacheDumpReader(const std::shared_ptr<FileSystem>& fs,
+                                    const FileOptions& file_opts,
+                                    const std::string& file_name,
+                                    std::unique_ptr<CacheDumpReader>* reader) {
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  IOStatus io_s = RandomAccessFileReader::Create(fs, file_name, file_opts,
+                                                 &file_reader, nullptr);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  reader->reset(new FromFileCacheDumpReader(std::move(file_reader)));
+  return io_s;
+}
+
+Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options,
+                             const std::shared_ptr<Cache>& cache,
+                             std::unique_ptr<CacheDumpWriter>&& writer,
+                             std::unique_ptr<CacheDumper>* cache_dumper) {
+  cache_dumper->reset(
+      new CacheDumperImpl(dump_options, cache, std::move(writer)));
+  return Status::OK();
+}
+
+Status NewDefaultCacheDumpedLoader(
+    const CacheDumpOptions& dump_options,
+    const BlockBasedTableOptions& toptions,
+    const std::shared_ptr<SecondaryCache>& secondary_cache,
+    std::unique_ptr<CacheDumpReader>&& reader,
+    std::unique_ptr<CacheDumpedLoader>* cache_dump_loader) {
+  cache_dump_loader->reset(new CacheDumpedLoaderImpl(
+      dump_options, toptions, secondary_cache, std::move(reader)));
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/cache_dump_load_impl.cc b/src/rocksdb/utilities/cache_dump_load_impl.cc
new file mode 100644
index 000000000..2b9f2a29d
--- /dev/null
+++ b/src/rocksdb/utilities/cache_dump_load_impl.cc
@@ -0,0 +1,393 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/cache_key.h"
+#include "table/block_based/block_based_table_reader.h"
+#ifndef ROCKSDB_LITE
+
+#include "cache/cache_entry_roles.h"
+#include "file/writable_file_writer.h"
+#include "port/lang.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/utilities/ldb_cmd.h"
+#include "table/format.h"
+#include "util/crc32c.h"
+#include "utilities/cache_dump_load_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Set the dump filter with a list of DBs. Block cache may be shared by multipe
+// DBs and we may only want to dump out the blocks belonging to certain DB(s).
+// Therefore, a filter is need to decide if the key of the block satisfy the
+// requirement.
+Status CacheDumperImpl::SetDumpFilter(std::vector<DB*> db_list) {
+  Status s = Status::OK();
+  for (size_t i = 0; i < db_list.size(); i++) {
+    assert(i < db_list.size());
+    TablePropertiesCollection ptc;
+    assert(db_list[i] != nullptr);
+    s = db_list[i]->GetPropertiesOfAllTables(&ptc);
+    if (!s.ok()) {
+      return s;
+    }
+    for (auto id = ptc.begin(); id != ptc.end(); id++) {
+      OffsetableCacheKey base;
+      // We only want to save cache entries that are portable to another
+      // DB::Open, so only save entries with stable keys.
+      bool is_stable;
+      BlockBasedTable::SetupBaseCacheKey(id->second.get(),
+                                         /*cur_db_session_id*/ "",
+                                         /*cur_file_num*/ 0, &base, &is_stable);
+      if (is_stable) {
+        Slice prefix_slice = base.CommonPrefixSlice();
+        assert(prefix_slice.size() == OffsetableCacheKey::kCommonPrefixSize);
+        prefix_filter_.insert(prefix_slice.ToString());
+      }
+    }
+  }
+  return s;
+}
+
+// This is the main function to dump out the cache block entries to the writer.
+// The writer may create a file or write to other systems. Currently, we will
+// iterate the whole block cache, get the blocks, and write them to the writer
+IOStatus CacheDumperImpl::DumpCacheEntriesToWriter() {
+  // Prepare stage, check the parameters.
+  if (cache_ == nullptr) {
+    return IOStatus::InvalidArgument("Cache is null");
+  }
+  if (writer_ == nullptr) {
+    return IOStatus::InvalidArgument("CacheDumpWriter is null");
+  }
+  // Set the system clock
+  if (options_.clock == nullptr) {
+    return IOStatus::InvalidArgument("System clock is null");
+  }
+  clock_ = options_.clock;
+  // We copy the Cache Deleter Role Map as its member.
+  role_map_ = CopyCacheDeleterRoleMap();
+  // Set the sequence number
+  sequence_num_ = 0;
+
+  // Dump stage, first, we write the hader
+  IOStatus io_s = WriteHeader();
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  // Then, we iterate the block cache and dump out the blocks that are not
+  // filtered out.
+  cache_->ApplyToAllEntries(DumpOneBlockCallBack(), {});
+
+  // Finally, write the footer
+  io_s = WriteFooter();
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  io_s = writer_->Close();
+  return io_s;
+}
+
+// Check if we need to filter out the block based on its key
+bool CacheDumperImpl::ShouldFilterOut(const Slice& key) {
+  if (key.size() < OffsetableCacheKey::kCommonPrefixSize) {
+    return /*filter out*/ true;
+  }
+  Slice key_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize);
+  std::string prefix = key_prefix.ToString();
+  // Filter out if not found
+  return prefix_filter_.find(prefix) == prefix_filter_.end();
+}
+
+// This is the callback function which will be applied to
+// Cache::ApplyToAllEntries. In this callback function, we will get the block
+// type, decide if the block needs to be dumped based on the filter, and write
+// the block through the provided writer.
+std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+CacheDumperImpl::DumpOneBlockCallBack() {
+  return [&](const Slice& key, void* value, size_t /*charge*/,
+             Cache::DeleterFn deleter) {
+    // Step 1: get the type of the block from role_map_
+    auto e = role_map_.find(deleter);
+    CacheEntryRole role;
+    CacheDumpUnitType type = CacheDumpUnitType::kBlockTypeMax;
+    if (e == role_map_.end()) {
+      role = CacheEntryRole::kMisc;
+    } else {
+      role = e->second;
+    }
+    bool filter_out = false;
+
+    // Step 2: based on the key prefix, check if the block should be filter out.
+    if (ShouldFilterOut(key)) {
+      filter_out = true;
+    }
+
+    // Step 3: based on the block type, get the block raw pointer and length.
+    const char* block_start = nullptr;
+    size_t block_len = 0;
+    switch (role) {
+      case CacheEntryRole::kDataBlock:
+        type = CacheDumpUnitType::kData;
+        block_start = (static_cast<Block*>(value))->data();
+        block_len = (static_cast<Block*>(value))->size();
+        break;
+      case CacheEntryRole::kFilterBlock:
+        type = CacheDumpUnitType::kFilter;
+        block_start = (static_cast<ParsedFullFilterBlock*>(value))
+                          ->GetBlockContentsData()
+                          .data();
+        block_len = (static_cast<ParsedFullFilterBlock*>(value))
+                        ->GetBlockContentsData()
+                        .size();
+        break;
+      case CacheEntryRole::kFilterMetaBlock:
+        type = CacheDumpUnitType::kFilterMetaBlock;
+        block_start = (static_cast<Block*>(value))->data();
+        block_len = (static_cast<Block*>(value))->size();
+        break;
+      case CacheEntryRole::kIndexBlock:
+        type = CacheDumpUnitType::kIndex;
+        block_start = (static_cast<Block*>(value))->data();
+        block_len = (static_cast<Block*>(value))->size();
+        break;
+      case CacheEntryRole::kDeprecatedFilterBlock:
+        // Obsolete
+        filter_out = true;
+        break;
+      case CacheEntryRole::kMisc:
+        filter_out = true;
+        break;
+      case CacheEntryRole::kOtherBlock:
+        filter_out = true;
+        break;
+      case CacheEntryRole::kWriteBuffer:
+        filter_out = true;
+        break;
+      default:
+        filter_out = true;
+    }
+
+    // Step 4: if the block should not be filter out, write the block to the
+    // CacheDumpWriter
+    if (!filter_out && block_start != nullptr) {
+      WriteBlock(type, key, Slice(block_start, block_len))
+          .PermitUncheckedError();
+    }
+  };
+}
+
+// Write the block to the writer. It takes the timestamp of the
+// block being copied from block cache, block type, key, block pointer,
+// block size and block checksum as the input. When writing the dumper raw
+// block, we first create the dump unit and encoude it to a string. Then,
+// we calculate the checksum of the whole dump unit string and store it in
+// the dump unit metadata.
+// First, we write the metadata first, which is a fixed size string. Then, we
+// Append the dump unit string to the writer.
+IOStatus CacheDumperImpl::WriteBlock(CacheDumpUnitType type, const Slice& key,
+                                     const Slice& value) {
+  uint64_t timestamp = clock_->NowMicros();
+  uint32_t value_checksum = crc32c::Value(value.data(), value.size());
+
+  // First, serialize the block information in a string
+  DumpUnit dump_unit;
+  dump_unit.timestamp = timestamp;
+  dump_unit.key = key;
+  dump_unit.type = type;
+  dump_unit.value_len = value.size();
+  dump_unit.value = const_cast<char*>(value.data());
+  dump_unit.value_checksum = value_checksum;
+  std::string encoded_data;
+  CacheDumperHelper::EncodeDumpUnit(dump_unit, &encoded_data);
+
+  // Second, create the metadata, which contains a sequence number, the dump
+  // unit string checksum and the string size. The sequence number monotonically
+  // increases from 0.
+  DumpUnitMeta unit_meta;
+  unit_meta.sequence_num = sequence_num_;
+  sequence_num_++;
+  unit_meta.dump_unit_checksum =
+      crc32c::Value(encoded_data.data(), encoded_data.size());
+  unit_meta.dump_unit_size = encoded_data.size();
+  std::string encoded_meta;
+  CacheDumperHelper::EncodeDumpUnitMeta(unit_meta, &encoded_meta);
+
+  // We write the metadata first.
+  assert(writer_ != nullptr);
+  IOStatus io_s = writer_->WriteMetadata(encoded_meta);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  // followed by the dump unit.
+  return writer_->WritePacket(encoded_data);
+}
+
+// Before we write any block, we write the header first to store the cache dump
+// format version, rocksdb version, and brief intro.
+IOStatus CacheDumperImpl::WriteHeader() {
+  std::string header_key = "header";
+  std::ostringstream s;
+  s << kTraceMagic << "\t"
+    << "Cache dump format version: " << kCacheDumpMajorVersion << "."
+    << kCacheDumpMinorVersion << "\t"
+    << "RocksDB Version: " << kMajorVersion << "." << kMinorVersion << "\t"
+    << "Format: dump_unit_metadata <sequence_number, dump_unit_checksum, "
+       "dump_unit_size>, dump_unit <timestamp, key, block_type, "
+       "block_size, block_data, block_checksum> cache_value\n";
+  std::string header_value(s.str());
+  CacheDumpUnitType type = CacheDumpUnitType::kHeader;
+  return WriteBlock(type, header_key, header_value);
+}
+
+// Write the footer after all the blocks are stored to indicate the ending.
+IOStatus CacheDumperImpl::WriteFooter() {
+  std::string footer_key = "footer";
+  std::string footer_value("cache dump completed");
+  CacheDumpUnitType type = CacheDumpUnitType::kFooter;
+  return WriteBlock(type, footer_key, footer_value);
+}
+
+// This is the main function to restore the cache entries to secondary cache.
+// First, we check if all the arguments are valid. Then, we read the block
+// sequentially from the reader and insert them to the secondary cache.
+IOStatus CacheDumpedLoaderImpl::RestoreCacheEntriesToSecondaryCache() {
+  // TODO: remove this line when options are used in the loader
+  (void)options_;
+  // Step 1: we check if all the arguments are valid
+  if (secondary_cache_ == nullptr) {
+    return IOStatus::InvalidArgument("Secondary Cache is null");
+  }
+  if (reader_ == nullptr) {
+    return IOStatus::InvalidArgument("CacheDumpReader is null");
+  }
+  // we copy the Cache Deleter Role Map as its member.
+  role_map_ = CopyCacheDeleterRoleMap();
+
+  // Step 2: read the header
+  // TODO: we need to check the cache dump format version and RocksDB version
+  // after the header is read out.
+  IOStatus io_s;
+  DumpUnit dump_unit;
+  std::string data;
+  io_s = ReadHeader(&data, &dump_unit);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  // Step 3: read out the rest of the blocks from the reader. The loop will stop
+  // either I/O status is not ok or we reach to the the end.
+  while (io_s.ok()) {
+    dump_unit.reset();
+    data.clear();
+    // read the content and store in the dump_unit
+    io_s = ReadCacheBlock(&data, &dump_unit);
+    if (!io_s.ok()) {
+      break;
+    }
+    if (dump_unit.type == CacheDumpUnitType::kFooter) {
+      break;
+    }
+    // Create the uncompressed_block based on the information in the dump_unit
+    // (There is no block trailer here compatible with block-based SST file.)
+    Slice content =
+        Slice(static_cast<char*>(dump_unit.value), dump_unit.value_len);
+    Status s = secondary_cache_->InsertSaved(dump_unit.key, content);
+    if (!s.ok()) {
+      io_s = status_to_io_status(std::move(s));
+    }
+  }
+  if (dump_unit.type == CacheDumpUnitType::kFooter) {
+    return IOStatus::OK();
+  } else {
+    return io_s;
+  }
+}
+
+// Read and copy the dump unit metadata to std::string data, decode and create
+// the unit metadata based on the string
+IOStatus CacheDumpedLoaderImpl::ReadDumpUnitMeta(std::string* data,
+                                                 DumpUnitMeta* unit_meta) {
+  assert(reader_ != nullptr);
+  assert(data != nullptr);
+  assert(unit_meta != nullptr);
+  IOStatus io_s = reader_->ReadMetadata(data);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  return status_to_io_status(
+      CacheDumperHelper::DecodeDumpUnitMeta(*data, unit_meta));
+}
+
+// Read and copy the dump unit to std::string data, decode and create the unit
+// based on the string
+IOStatus CacheDumpedLoaderImpl::ReadDumpUnit(size_t len, std::string* data,
+                                             DumpUnit* unit) {
+  assert(reader_ != nullptr);
+  assert(data != nullptr);
+  assert(unit != nullptr);
+  IOStatus io_s = reader_->ReadPacket(data);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  if (data->size() != len) {
+    return IOStatus::Corruption(
+        "The data being read out does not match the size stored in metadata!");
+  }
+  Slice block;
+  return status_to_io_status(CacheDumperHelper::DecodeDumpUnit(*data, unit));
+}
+
+// Read the header
+IOStatus CacheDumpedLoaderImpl::ReadHeader(std::string* data,
+                                           DumpUnit* dump_unit) {
+  DumpUnitMeta header_meta;
+  header_meta.reset();
+  std::string meta_string;
+  IOStatus io_s = ReadDumpUnitMeta(&meta_string, &header_meta);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  io_s = ReadDumpUnit(header_meta.dump_unit_size, data, dump_unit);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  uint32_t unit_checksum = crc32c::Value(data->data(), data->size());
+  if (unit_checksum != header_meta.dump_unit_checksum) {
+    return IOStatus::Corruption("Read header unit corrupted!");
+  }
+  return io_s;
+}
+
+// Read the blocks after header is read out
+IOStatus CacheDumpedLoaderImpl::ReadCacheBlock(std::string* data,
+                                               DumpUnit* dump_unit) {
+  // According to the write process, we read the dump_unit_metadata first
+  DumpUnitMeta unit_meta;
+  unit_meta.reset();
+  std::string unit_string;
+  IOStatus io_s = ReadDumpUnitMeta(&unit_string, &unit_meta);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  // Based on the information in the dump_unit_metadata, we read the dump_unit
+  // and verify if its content is correct.
+  io_s = ReadDumpUnit(unit_meta.dump_unit_size, data, dump_unit);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  uint32_t unit_checksum = crc32c::Value(data->data(), data->size());
+  if (unit_checksum != unit_meta.dump_unit_checksum) {
+    return IOStatus::Corruption(
+        "Checksum does not match! Read dumped unit corrupted!");
+  }
+  return io_s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/cache_dump_load_impl.h b/src/rocksdb/utilities/cache_dump_load_impl.h
new file mode 100644
index 000000000..9ca1ff45a
--- /dev/null
+++ b/src/rocksdb/utilities/cache_dump_load_impl.h
@@ -0,0 +1,359 @@
+//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <unordered_map>
+
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/utilities/cache_dump_load.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_like_traits.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/parsed_full_filter_block.h"
+#include "table/block_based/reader_common.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// the read buffer size of for the default CacheDumpReader
+const unsigned int kDumpReaderBufferSize = 1024;  // 1KB
+static const unsigned int kSizePrefixLen = 4;
+
+enum CacheDumpUnitType : unsigned char {
+  kHeader = 1,
+  kFooter = 2,
+  kData = 3,
+  kFilter = 4,
+  kProperties = 5,
+  kCompressionDictionary = 6,
+  kRangeDeletion = 7,
+  kHashIndexPrefixes = 8,
+  kHashIndexMetadata = 9,
+  kMetaIndex = 10,
+  kIndex = 11,
+  kDeprecatedFilterBlock = 12,  // OBSOLETE / DEPRECATED
+  kFilterMetaBlock = 13,
+  kBlockTypeMax,
+};
+
+// The metadata of a dump unit. After it is serilized, its size is fixed 16
+// bytes.
+struct DumpUnitMeta {
+  // sequence number is a monotonically increasing number to indicate the order
+  // of the blocks being written. Header is 0.
+  uint32_t sequence_num;
+  // The Crc32c checksum of its dump unit.
+  uint32_t dump_unit_checksum;
+  // The dump unit size after the dump unit is serilized to a string.
+  uint64_t dump_unit_size;
+
+  void reset() {
+    sequence_num = 0;
+    dump_unit_checksum = 0;
+    dump_unit_size = 0;
+  }
+};
+
+// The data structure to hold a block and its information.
+struct DumpUnit {
+  // The timestamp when the block is identified, copied, and dumped from block
+  // cache
+  uint64_t timestamp;
+  // The type of the block
+  CacheDumpUnitType type;
+  // The key of this block when the block is referenced by this Cache
+  Slice key;
+  // The block size
+  size_t value_len;
+  // The Crc32c checksum of the block
+  uint32_t value_checksum;
+  // Pointer to the block. Note that, in the dump process, it points to a memory
+  // buffer copied from cache block. The buffer is freed when we process the
+  // next block. In the load process, we use an std::string to store the
+  // serialized dump_unit read from the reader. So it points to the memory
+  // address of the begin of the block in this string.
+  void* value;
+
+  DumpUnit() { reset(); }
+
+  void reset() {
+    timestamp = 0;
+    type = CacheDumpUnitType::kBlockTypeMax;
+    key.clear();
+    value_len = 0;
+    value_checksum = 0;
+    value = nullptr;
+  }
+};
+
+// The default implementation of the Cache Dumper
+class CacheDumperImpl : public CacheDumper {
+ public:
+  CacheDumperImpl(const CacheDumpOptions& dump_options,
+                  const std::shared_ptr<Cache>& cache,
+                  std::unique_ptr<CacheDumpWriter>&& writer)
+      : options_(dump_options), cache_(cache), writer_(std::move(writer)) {}
+  ~CacheDumperImpl() { writer_.reset(); }
+  Status SetDumpFilter(std::vector<DB*> db_list) override;
+  IOStatus DumpCacheEntriesToWriter() override;
+
+ private:
+  IOStatus WriteBlock(CacheDumpUnitType type, const Slice& key,
+                      const Slice& value);
+  IOStatus WriteHeader();
+  IOStatus WriteFooter();
+  bool ShouldFilterOut(const Slice& key);
+  std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+  DumpOneBlockCallBack();
+
+  CacheDumpOptions options_;
+  std::shared_ptr<Cache> cache_;
+  std::unique_ptr<CacheDumpWriter> writer_;
+  UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map_;
+  SystemClock* clock_;
+  uint32_t sequence_num_;
+  // The cache key prefix filter. Currently, we use db_session_id as the prefix,
+  // so using std::set to store the prefixes as filter is enough. Further
+  // improvement can be applied like BloomFilter or others to speedup the
+  // filtering.
+  std::set<std::string> prefix_filter_;
+};
+
+// The default implementation of CacheDumpedLoader
+class CacheDumpedLoaderImpl : public CacheDumpedLoader {
+ public:
+  CacheDumpedLoaderImpl(const CacheDumpOptions& dump_options,
+                        const BlockBasedTableOptions& /*toptions*/,
+                        const std::shared_ptr<SecondaryCache>& secondary_cache,
+                        std::unique_ptr<CacheDumpReader>&& reader)
+      : options_(dump_options),
+        secondary_cache_(secondary_cache),
+        reader_(std::move(reader)) {}
+  ~CacheDumpedLoaderImpl() {}
+  IOStatus RestoreCacheEntriesToSecondaryCache() override;
+
+ private:
+  IOStatus ReadDumpUnitMeta(std::string* data, DumpUnitMeta* unit_meta);
+  IOStatus ReadDumpUnit(size_t len, std::string* data, DumpUnit* unit);
+  IOStatus ReadHeader(std::string* data, DumpUnit* dump_unit);
+  IOStatus ReadCacheBlock(std::string* data, DumpUnit* dump_unit);
+
+  CacheDumpOptions options_;
+  std::shared_ptr<SecondaryCache> secondary_cache_;
+  std::unique_ptr<CacheDumpReader> reader_;
+  UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map_;
+};
+
+// The default implementation of CacheDumpWriter. We write the blocks to a file
+// sequentially.
+class ToFileCacheDumpWriter : public CacheDumpWriter {
+ public:
+  explicit ToFileCacheDumpWriter(
+      std::unique_ptr<WritableFileWriter>&& file_writer)
+      : file_writer_(std::move(file_writer)) {}
+
+  ~ToFileCacheDumpWriter() { Close().PermitUncheckedError(); }
+
+  // Write the serialized metadata to the file
+  virtual IOStatus WriteMetadata(const Slice& metadata) override {
+    assert(file_writer_ != nullptr);
+    std::string prefix;
+    PutFixed32(&prefix, static_cast<uint32_t>(metadata.size()));
+    IOStatus io_s = file_writer_->Append(Slice(prefix));
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    io_s = file_writer_->Append(metadata);
+    return io_s;
+  }
+
+  // Write the serialized data to the file
+  virtual IOStatus WritePacket(const Slice& data) override {
+    assert(file_writer_ != nullptr);
+    std::string prefix;
+    PutFixed32(&prefix, static_cast<uint32_t>(data.size()));
+    IOStatus io_s = file_writer_->Append(Slice(prefix));
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    io_s = file_writer_->Append(data);
+    return io_s;
+  }
+
+  // Reset the writer
+  virtual IOStatus Close() override {
+    file_writer_.reset();
+    return IOStatus::OK();
+  }
+
+ private:
+  std::unique_ptr<WritableFileWriter> file_writer_;
+};
+
+// The default implementation of CacheDumpReader. It is implemented based on
+// RandomAccessFileReader. Note that, we keep an internal variable to remember
+// the current offset.
+class FromFileCacheDumpReader : public CacheDumpReader {
+ public:
+  explicit FromFileCacheDumpReader(
+      std::unique_ptr<RandomAccessFileReader>&& reader)
+      : file_reader_(std::move(reader)),
+        offset_(0),
+        buffer_(new char[kDumpReaderBufferSize]) {}
+
+  ~FromFileCacheDumpReader() { delete[] buffer_; }
+
+  virtual IOStatus ReadMetadata(std::string* metadata) override {
+    uint32_t metadata_len = 0;
+    IOStatus io_s = ReadSizePrefix(&metadata_len);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    return Read(metadata_len, metadata);
+  }
+
+  virtual IOStatus ReadPacket(std::string* data) override {
+    uint32_t data_len = 0;
+    IOStatus io_s = ReadSizePrefix(&data_len);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    return Read(data_len, data);
+  }
+
+ private:
+  IOStatus ReadSizePrefix(uint32_t* len) {
+    std::string prefix;
+    IOStatus io_s = Read(kSizePrefixLen, &prefix);
+    if (!io_s.ok()) {
+      return io_s;
+    }
+    Slice encoded_slice(prefix);
+    if (!GetFixed32(&encoded_slice, len)) {
+      return IOStatus::Corruption("Decode size prefix string failed");
+    }
+    return IOStatus::OK();
+  }
+
+  IOStatus Read(size_t len, std::string* data) {
+    assert(file_reader_ != nullptr);
+    IOStatus io_s;
+
+    unsigned int bytes_to_read = static_cast<unsigned int>(len);
+    unsigned int to_read = bytes_to_read > kDumpReaderBufferSize
+                               ? kDumpReaderBufferSize
+                               : bytes_to_read;
+
+    while (to_read > 0) {
+      io_s = file_reader_->Read(IOOptions(), offset_, to_read, &result_,
+                                buffer_, nullptr,
+                                Env::IO_TOTAL /* rate_limiter_priority */);
+      if (!io_s.ok()) {
+        return io_s;
+      }
+      if (result_.size() < to_read) {
+        return IOStatus::Corruption("Corrupted cache dump file.");
+      }
+      data->append(result_.data(), result_.size());
+
+      offset_ += to_read;
+      bytes_to_read -= to_read;
+      to_read = bytes_to_read > kDumpReaderBufferSize ? kDumpReaderBufferSize
+                                                      : bytes_to_read;
+    }
+    return io_s;
+  }
+  std::unique_ptr<RandomAccessFileReader> file_reader_;
+  Slice result_;
+  size_t offset_;
+  char* buffer_;
+};
+
+// The cache dump and load helper class
+class CacheDumperHelper {
+ public:
+  // serialize the dump_unit_meta to a string, it is fixed 16 bytes size.
+  static void EncodeDumpUnitMeta(const DumpUnitMeta& meta, std::string* data) {
+    assert(data);
+    PutFixed32(data, static_cast<uint32_t>(meta.sequence_num));
+    PutFixed32(data, static_cast<uint32_t>(meta.dump_unit_checksum));
+    PutFixed64(data, meta.dump_unit_size);
+  }
+
+  // Serialize the dump_unit to a string.
+  static void EncodeDumpUnit(const DumpUnit& dump_unit, std::string* data) {
+    assert(data);
+    PutFixed64(data, dump_unit.timestamp);
+    data->push_back(dump_unit.type);
+    PutLengthPrefixedSlice(data, dump_unit.key);
+    PutFixed32(data, static_cast<uint32_t>(dump_unit.value_len));
+    PutFixed32(data, dump_unit.value_checksum);
+    PutLengthPrefixedSlice(data,
+                           Slice((char*)dump_unit.value, dump_unit.value_len));
+  }
+
+  // Deserialize the dump_unit_meta from a string
+  static Status DecodeDumpUnitMeta(const std::string& encoded_data,
+                                   DumpUnitMeta* unit_meta) {
+    assert(unit_meta != nullptr);
+    Slice encoded_slice = Slice(encoded_data);
+    if (!GetFixed32(&encoded_slice, &(unit_meta->sequence_num))) {
+      return Status::Incomplete("Decode dumped unit meta sequence_num failed");
+    }
+    if (!GetFixed32(&encoded_slice, &(unit_meta->dump_unit_checksum))) {
+      return Status::Incomplete(
+          "Decode dumped unit meta dump_unit_checksum failed");
+    }
+    if (!GetFixed64(&encoded_slice, &(unit_meta->dump_unit_size))) {
+      return Status::Incomplete(
+          "Decode dumped unit meta dump_unit_size failed");
+    }
+    return Status::OK();
+  }
+
+  // Deserialize the dump_unit from a string.
+  static Status DecodeDumpUnit(const std::string& encoded_data,
+                               DumpUnit* dump_unit) {
+    assert(dump_unit != nullptr);
+    Slice encoded_slice = Slice(encoded_data);
+
+    // Decode timestamp
+    if (!GetFixed64(&encoded_slice, &dump_unit->timestamp)) {
+      return Status::Incomplete("Decode dumped unit string failed");
+    }
+    // Decode the block type
+    dump_unit->type = static_cast<CacheDumpUnitType>(encoded_slice[0]);
+    encoded_slice.remove_prefix(1);
+    // Decode the key
+    if (!GetLengthPrefixedSlice(&encoded_slice, &(dump_unit->key))) {
+      return Status::Incomplete("Decode dumped unit string failed");
+    }
+    // Decode the value size
+    uint32_t value_len;
+    if (!GetFixed32(&encoded_slice, &value_len)) {
+      return Status::Incomplete("Decode dumped unit string failed");
+    }
+    dump_unit->value_len = static_cast<size_t>(value_len);
+    // Decode the value checksum
+    if (!GetFixed32(&encoded_slice, &(dump_unit->value_checksum))) {
+      return Status::Incomplete("Decode dumped unit string failed");
+    }
+    // Decode the block content and copy to the memory space whose pointer
+    // will be managed by the cache finally.
+    Slice block;
+    if (!GetLengthPrefixedSlice(&encoded_slice, &block)) {
+      return Status::Incomplete("Decode dumped unit string failed");
+    }
+    dump_unit->value = (void*)block.data();
+    assert(block.size() == dump_unit->value_len);
+    return Status::OK();
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc
new file mode 100644
index 000000000..4e48d63aa
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/cassandra/cassandra_compaction_filter.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/cassandra/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+static std::unordered_map<std::string, OptionTypeInfo>
+    cassandra_filter_type_info = {
+#ifndef ROCKSDB_LITE
+        {"purge_ttl_on_expiration",
+         {offsetof(struct CassandraOptions, purge_ttl_on_expiration),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"gc_grace_period_in_seconds",
+         {offsetof(struct CassandraOptions, gc_grace_period_in_seconds),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+
+CassandraCompactionFilter::CassandraCompactionFilter(
+    bool purge_ttl_on_expiration, int32_t gc_grace_period_in_seconds)
+    : options_(gc_grace_period_in_seconds, 0, purge_ttl_on_expiration) {
+  RegisterOptions(&options_, &cassandra_filter_type_info);
+}
+
+CompactionFilter::Decision CassandraCompactionFilter::FilterV2(
+    int /*level*/, const Slice& /*key*/, ValueType value_type,
+    const Slice& existing_value, std::string* new_value,
+    std::string* /*skip_until*/) const {
+  bool value_changed = false;
+  RowValue row_value =
+      RowValue::Deserialize(existing_value.data(), existing_value.size());
+  RowValue compacted =
+      options_.purge_ttl_on_expiration
+          ? row_value.RemoveExpiredColumns(&value_changed)
+          : row_value.ConvertExpiredColumnsToTombstones(&value_changed);
+
+  if (value_type == ValueType::kValue) {
+    compacted = compacted.RemoveTombstones(options_.gc_grace_period_in_seconds);
+  }
+
+  if (compacted.Empty()) {
+    return Decision::kRemove;
+  }
+
+  if (value_changed) {
+    compacted.Serialize(new_value);
+    return Decision::kChangeValue;
+  }
+
+  return Decision::kKeep;
+}
+
+CassandraCompactionFilterFactory::CassandraCompactionFilterFactory(
+    bool purge_ttl_on_expiration, int32_t gc_grace_period_in_seconds)
+    : options_(gc_grace_period_in_seconds, 0, purge_ttl_on_expiration) {
+  RegisterOptions(&options_, &cassandra_filter_type_info);
+}
+
+std::unique_ptr<CompactionFilter>
+CassandraCompactionFilterFactory::CreateCompactionFilter(
+    const CompactionFilter::Context&) {
+  std::unique_ptr<CompactionFilter> result(new CassandraCompactionFilter(
+      options_.purge_ttl_on_expiration, options_.gc_grace_period_in_seconds));
+  return result;
+}
+
+#ifndef ROCKSDB_LITE
+int RegisterCassandraObjects(ObjectLibrary& library,
+                             const std::string& /*arg*/) {
+  library.AddFactory<MergeOperator>(
+      CassandraValueMergeOperator::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new CassandraValueMergeOperator(0));
+        return guard->get();
+      });
+  library.AddFactory<CompactionFilter>(
+      CassandraCompactionFilter::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<CompactionFilter>* /*guard */,
+         std::string* /* errmsg */) {
+        return new CassandraCompactionFilter(false, 0);
+      });
+  library.AddFactory<CompactionFilterFactory>(
+      CassandraCompactionFilterFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<CompactionFilterFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new CassandraCompactionFilterFactory(false, 0));
+        return guard->get();
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h
new file mode 100644
index 000000000..0325a4c39
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_compaction_filter.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+/**
+ * Compaction filter for removing expired Cassandra data with ttl.
+ * If option `purge_ttl_on_expiration` is set to true, expired data
+ * will be directly purged. Otherwise expired data will be converted
+ * tombstones first, then be eventally removed after gc grace period.
+ * `purge_ttl_on_expiration` should only be on in the case all the
+ * writes have same ttl setting, otherwise it could bring old data back.
+ *
+ * Compaction filter is also in charge of removing tombstone that has been
+ * promoted to kValue type after serials of merging in compaction.
+ */
+class CassandraCompactionFilter : public CompactionFilter {
+ public:
+  explicit CassandraCompactionFilter(bool purge_ttl_on_expiration,
+                                     int32_t gc_grace_period_in_seconds);
+  static const char* kClassName() { return "CassandraCompactionFilter"; }
+  const char* Name() const override { return kClassName(); }
+
+  virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
+                            const Slice& existing_value, std::string* new_value,
+                            std::string* skip_until) const override;
+
+ private:
+  CassandraOptions options_;
+};
+
+class CassandraCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit CassandraCompactionFilterFactory(bool purge_ttl_on_expiration,
+                                            int32_t gc_grace_period_in_seconds);
+  ~CassandraCompactionFilterFactory() override {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override;
+  static const char* kClassName() { return "CassandraCompactionFilterFactory"; }
+  const char* Name() const override { return kClassName(); }
+
+ private:
+  CassandraOptions options_;
+};
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/cassandra_format_test.cc b/src/rocksdb/utilities/cassandra/cassandra_format_test.cc
new file mode 100644
index 000000000..4f12947ad
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_format_test.cc
@@ -0,0 +1,377 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <cstring>
+#include <memory>
+
+#include "test_util/testharness.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/cassandra/serialize.h"
+#include "utilities/cassandra/test_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+TEST(ColumnTest, Column) {
+  char data[4] = {'d', 'a', 't', 'a'};
+  int8_t mask = 0;
+  int8_t index = 1;
+  int64_t timestamp = 1494022807044;
+  Column c = Column(mask, index, timestamp, sizeof(data), data);
+
+  EXPECT_EQ(c.Index(), index);
+  EXPECT_EQ(c.Timestamp(), timestamp);
+  EXPECT_EQ(c.Size(), 14 + sizeof(data));
+
+  // Verify the serialization.
+  std::string dest;
+  dest.reserve(c.Size() * 2);
+  c.Serialize(&dest);
+
+  EXPECT_EQ(dest.size(), c.Size());
+  std::size_t offset = 0;
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), mask);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), index);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), timestamp);
+  offset += sizeof(int64_t);
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), sizeof(data));
+  offset += sizeof(int32_t);
+  EXPECT_TRUE(std::memcmp(data, dest.c_str() + offset, sizeof(data)) == 0);
+
+  // Verify the deserialization.
+  std::string saved_dest = dest;
+  std::shared_ptr<Column> c1 = Column::Deserialize(saved_dest.c_str(), 0);
+  EXPECT_EQ(c1->Index(), index);
+  EXPECT_EQ(c1->Timestamp(), timestamp);
+  EXPECT_EQ(c1->Size(), 14 + sizeof(data));
+
+  c1->Serialize(&dest);
+  EXPECT_EQ(dest.size(), 2 * c.Size());
+  EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) ==
+              0);
+
+  // Verify the ColumnBase::Deserialization.
+  saved_dest = dest;
+  std::shared_ptr<ColumnBase> c2 =
+      ColumnBase::Deserialize(saved_dest.c_str(), c.Size());
+  c2->Serialize(&dest);
+  EXPECT_EQ(dest.size(), 3 * c.Size());
+  EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2,
+                          c.Size()) == 0);
+}
+
+TEST(ExpiringColumnTest, ExpiringColumn) {
+  char data[4] = {'d', 'a', 't', 'a'};
+  int8_t mask = ColumnTypeMask::EXPIRATION_MASK;
+  int8_t index = 3;
+  int64_t timestamp = 1494022807044;
+  int32_t ttl = 3600;
+  ExpiringColumn c =
+      ExpiringColumn(mask, index, timestamp, sizeof(data), data, ttl);
+
+  EXPECT_EQ(c.Index(), index);
+  EXPECT_EQ(c.Timestamp(), timestamp);
+  EXPECT_EQ(c.Size(), 18 + sizeof(data));
+
+  // Verify the serialization.
+  std::string dest;
+  dest.reserve(c.Size() * 2);
+  c.Serialize(&dest);
+
+  EXPECT_EQ(dest.size(), c.Size());
+  std::size_t offset = 0;
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), mask);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), index);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), timestamp);
+  offset += sizeof(int64_t);
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), sizeof(data));
+  offset += sizeof(int32_t);
+  EXPECT_TRUE(std::memcmp(data, dest.c_str() + offset, sizeof(data)) == 0);
+  offset += sizeof(data);
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), ttl);
+
+  // Verify the deserialization.
+  std::string saved_dest = dest;
+  std::shared_ptr<ExpiringColumn> c1 =
+      ExpiringColumn::Deserialize(saved_dest.c_str(), 0);
+  EXPECT_EQ(c1->Index(), index);
+  EXPECT_EQ(c1->Timestamp(), timestamp);
+  EXPECT_EQ(c1->Size(), 18 + sizeof(data));
+
+  c1->Serialize(&dest);
+  EXPECT_EQ(dest.size(), 2 * c.Size());
+  EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) ==
+              0);
+
+  // Verify the ColumnBase::Deserialization.
+  saved_dest = dest;
+  std::shared_ptr<ColumnBase> c2 =
+      ColumnBase::Deserialize(saved_dest.c_str(), c.Size());
+  c2->Serialize(&dest);
+  EXPECT_EQ(dest.size(), 3 * c.Size());
+  EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2,
+                          c.Size()) == 0);
+}
+
+TEST(TombstoneTest, TombstoneCollectable) {
+  int32_t now = (int32_t)time(nullptr);
+  int32_t gc_grace_seconds = 16440;
+  int32_t time_delta_seconds = 10;
+  EXPECT_TRUE(
+      Tombstone(ColumnTypeMask::DELETION_MASK, 0,
+                now - gc_grace_seconds - time_delta_seconds,
+                ToMicroSeconds(now - gc_grace_seconds - time_delta_seconds))
+          .Collectable(gc_grace_seconds));
+  EXPECT_FALSE(
+      Tombstone(ColumnTypeMask::DELETION_MASK, 0,
+                now - gc_grace_seconds + time_delta_seconds,
+                ToMicroSeconds(now - gc_grace_seconds + time_delta_seconds))
+          .Collectable(gc_grace_seconds));
+}
+
+TEST(TombstoneTest, Tombstone) {
+  int8_t mask = ColumnTypeMask::DELETION_MASK;
+  int8_t index = 2;
+  int32_t local_deletion_time = 1494022807;
+  int64_t marked_for_delete_at = 1494022807044;
+  Tombstone c =
+      Tombstone(mask, index, local_deletion_time, marked_for_delete_at);
+
+  EXPECT_EQ(c.Index(), index);
+  EXPECT_EQ(c.Timestamp(), marked_for_delete_at);
+  EXPECT_EQ(c.Size(), 14);
+
+  // Verify the serialization.
+  std::string dest;
+  dest.reserve(c.Size() * 2);
+  c.Serialize(&dest);
+
+  EXPECT_EQ(dest.size(), c.Size());
+  std::size_t offset = 0;
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), mask);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), index);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), local_deletion_time);
+  offset += sizeof(int32_t);
+  EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), marked_for_delete_at);
+
+  // Verify the deserialization.
+  std::shared_ptr<Tombstone> c1 = Tombstone::Deserialize(dest.c_str(), 0);
+  EXPECT_EQ(c1->Index(), index);
+  EXPECT_EQ(c1->Timestamp(), marked_for_delete_at);
+  EXPECT_EQ(c1->Size(), 14);
+
+  c1->Serialize(&dest);
+  EXPECT_EQ(dest.size(), 2 * c.Size());
+  EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) ==
+              0);
+
+  // Verify the ColumnBase::Deserialization.
+  std::shared_ptr<ColumnBase> c2 =
+      ColumnBase::Deserialize(dest.c_str(), c.Size());
+  c2->Serialize(&dest);
+  EXPECT_EQ(dest.size(), 3 * c.Size());
+  EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2,
+                          c.Size()) == 0);
+}
+
+class RowValueTest : public testing::Test {};
+
+TEST(RowValueTest, RowTombstone) {
+  int32_t local_deletion_time = 1494022807;
+  int64_t marked_for_delete_at = 1494022807044;
+  RowValue r = RowValue(local_deletion_time, marked_for_delete_at);
+
+  EXPECT_EQ(r.Size(), 12);
+  EXPECT_EQ(r.IsTombstone(), true);
+  EXPECT_EQ(r.LastModifiedTime(), marked_for_delete_at);
+
+  // Verify the serialization.
+  std::string dest;
+  dest.reserve(r.Size() * 2);
+  r.Serialize(&dest);
+
+  EXPECT_EQ(dest.size(), r.Size());
+  std::size_t offset = 0;
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), local_deletion_time);
+  offset += sizeof(int32_t);
+  EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), marked_for_delete_at);
+
+  // Verify the deserialization.
+  RowValue r1 = RowValue::Deserialize(dest.c_str(), r.Size());
+  EXPECT_EQ(r1.Size(), 12);
+  EXPECT_EQ(r1.IsTombstone(), true);
+  EXPECT_EQ(r1.LastModifiedTime(), marked_for_delete_at);
+
+  r1.Serialize(&dest);
+  EXPECT_EQ(dest.size(), 2 * r.Size());
+  EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) ==
+              0);
+}
+
+TEST(RowValueTest, RowWithColumns) {
+  std::vector<std::shared_ptr<ColumnBase>> columns;
+  int64_t last_modified_time = 1494022807048;
+  std::size_t columns_data_size = 0;
+
+  char e_data[5] = {'e', 'd', 'a', 't', 'a'};
+  int8_t e_index = 0;
+  int64_t e_timestamp = 1494022807044;
+  int32_t e_ttl = 3600;
+  columns.push_back(std::shared_ptr<ExpiringColumn>(
+      new ExpiringColumn(ColumnTypeMask::EXPIRATION_MASK, e_index, e_timestamp,
+                         sizeof(e_data), e_data, e_ttl)));
+  columns_data_size += columns[0]->Size();
+
+  char c_data[4] = {'d', 'a', 't', 'a'};
+  int8_t c_index = 1;
+  int64_t c_timestamp = 1494022807048;
+  columns.push_back(std::shared_ptr<Column>(
+      new Column(0, c_index, c_timestamp, sizeof(c_data), c_data)));
+  columns_data_size += columns[1]->Size();
+
+  int8_t t_index = 2;
+  int32_t t_local_deletion_time = 1494022801;
+  int64_t t_marked_for_delete_at = 1494022807043;
+  columns.push_back(std::shared_ptr<Tombstone>(
+      new Tombstone(ColumnTypeMask::DELETION_MASK, t_index,
+                    t_local_deletion_time, t_marked_for_delete_at)));
+  columns_data_size += columns[2]->Size();
+
+  RowValue r = RowValue(std::move(columns), last_modified_time);
+
+  EXPECT_EQ(r.Size(), columns_data_size + 12);
+  EXPECT_EQ(r.IsTombstone(), false);
+  EXPECT_EQ(r.LastModifiedTime(), last_modified_time);
+
+  // Verify the serialization.
+  std::string dest;
+  dest.reserve(r.Size() * 2);
+  r.Serialize(&dest);
+
+  EXPECT_EQ(dest.size(), r.Size());
+  std::size_t offset = 0;
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset),
+            std::numeric_limits<int32_t>::max());
+  offset += sizeof(int32_t);
+  EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset),
+            std::numeric_limits<int64_t>::min());
+  offset += sizeof(int64_t);
+
+  // Column0: ExpiringColumn
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset),
+            ColumnTypeMask::EXPIRATION_MASK);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), e_index);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), e_timestamp);
+  offset += sizeof(int64_t);
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), sizeof(e_data));
+  offset += sizeof(int32_t);
+  EXPECT_TRUE(std::memcmp(e_data, dest.c_str() + offset, sizeof(e_data)) == 0);
+  offset += sizeof(e_data);
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), e_ttl);
+  offset += sizeof(int32_t);
+
+  // Column1: Column
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), 0);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), c_index);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), c_timestamp);
+  offset += sizeof(int64_t);
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), sizeof(c_data));
+  offset += sizeof(int32_t);
+  EXPECT_TRUE(std::memcmp(c_data, dest.c_str() + offset, sizeof(c_data)) == 0);
+  offset += sizeof(c_data);
+
+  // Column2: Tombstone
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset),
+            ColumnTypeMask::DELETION_MASK);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), t_index);
+  offset += sizeof(int8_t);
+  EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset), t_local_deletion_time);
+  offset += sizeof(int32_t);
+  EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset), t_marked_for_delete_at);
+
+  // Verify the deserialization.
+  RowValue r1 = RowValue::Deserialize(dest.c_str(), r.Size());
+  EXPECT_EQ(r1.Size(), columns_data_size + 12);
+  EXPECT_EQ(r1.IsTombstone(), false);
+  EXPECT_EQ(r1.LastModifiedTime(), last_modified_time);
+
+  r1.Serialize(&dest);
+  EXPECT_EQ(dest.size(), 2 * r.Size());
+  EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) ==
+              0);
+}
+
+TEST(RowValueTest, PurgeTtlShouldRemvoeAllColumnsExpired) {
+  int64_t now = time(nullptr);
+
+  auto row_value = CreateTestRowValue(
+      {CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)),
+       CreateTestColumnSpec(kExpiringColumn, 1,
+                            ToMicroSeconds(now - kTtl - 10)),  // expired
+       CreateTestColumnSpec(kExpiringColumn, 2,
+                            ToMicroSeconds(now)),  // not expired
+       CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))});
+
+  bool changed = false;
+  auto purged = row_value.RemoveExpiredColumns(&changed);
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(purged.get_columns().size(), 3);
+  VerifyRowValueColumns(purged.get_columns(), 0, kColumn, 0,
+                        ToMicroSeconds(now));
+  VerifyRowValueColumns(purged.get_columns(), 1, kExpiringColumn, 2,
+                        ToMicroSeconds(now));
+  VerifyRowValueColumns(purged.get_columns(), 2, kTombstone, 3,
+                        ToMicroSeconds(now));
+
+  purged.RemoveExpiredColumns(&changed);
+  EXPECT_FALSE(changed);
+}
+
+TEST(RowValueTest, ExpireTtlShouldConvertExpiredColumnsToTombstones) {
+  int64_t now = time(nullptr);
+
+  auto row_value = CreateTestRowValue(
+      {CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)),
+       CreateTestColumnSpec(kExpiringColumn, 1,
+                            ToMicroSeconds(now - kTtl - 10)),  // expired
+       CreateTestColumnSpec(kExpiringColumn, 2,
+                            ToMicroSeconds(now)),  // not expired
+       CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))});
+
+  bool changed = false;
+  auto compacted = row_value.ConvertExpiredColumnsToTombstones(&changed);
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(compacted.get_columns().size(), 4);
+  VerifyRowValueColumns(compacted.get_columns(), 0, kColumn, 0,
+                        ToMicroSeconds(now));
+  VerifyRowValueColumns(compacted.get_columns(), 1, kTombstone, 1,
+                        ToMicroSeconds(now - 10));
+  VerifyRowValueColumns(compacted.get_columns(), 2, kExpiringColumn, 2,
+                        ToMicroSeconds(now));
+  VerifyRowValueColumns(compacted.get_columns(), 3, kTombstone, 3,
+                        ToMicroSeconds(now));
+
+  compacted.ConvertExpiredColumnsToTombstones(&changed);
+  EXPECT_FALSE(changed);
+}
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/cassandra/cassandra_functional_test.cc b/src/rocksdb/utilities/cassandra/cassandra_functional_test.cc
new file mode 100644
index 000000000..c5be836e8
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_functional_test.cc
@@ -0,0 +1,446 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <iostream>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/random.h"
+#include "utilities/cassandra/cassandra_compaction_filter.h"
+#include "utilities/cassandra/merge_operator.h"
+#include "utilities/cassandra/test_utils.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+// Path to the database on file system
+const std::string kDbName = test::PerThreadDBPath("cassandra_functional_test");
+
+class CassandraStore {
+ public:
+  explicit CassandraStore(std::shared_ptr<DB> db)
+      : db_(db), write_option_(), get_option_() {
+    assert(db);
+  }
+
+  bool Append(const std::string& key, const RowValue& val) {
+    std::string result;
+    val.Serialize(&result);
+    Slice valSlice(result.data(), result.size());
+    auto s = db_->Merge(write_option_, key, valSlice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << "ERROR " << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  bool Put(const std::string& key, const RowValue& val) {
+    std::string result;
+    val.Serialize(&result);
+    Slice valSlice(result.data(), result.size());
+    auto s = db_->Put(write_option_, key, valSlice);
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << "ERROR " << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  Status Flush() {
+    Status s = dbfull()->TEST_FlushMemTable();
+    if (s.ok()) {
+      s = dbfull()->TEST_WaitForCompact();
+    }
+    return s;
+  }
+
+  Status Compact() {
+    return dbfull()->TEST_CompactRange(0, nullptr, nullptr,
+                                       db_->DefaultColumnFamily());
+  }
+
+  std::tuple<bool, RowValue> Get(const std::string& key) {
+    std::string result;
+    auto s = db_->Get(get_option_, key, &result);
+
+    if (s.ok()) {
+      return std::make_tuple(
+          true, RowValue::Deserialize(result.data(), result.size()));
+    }
+
+    if (!s.IsNotFound()) {
+      std::cerr << "ERROR " << s.ToString() << std::endl;
+    }
+
+    return std::make_tuple(false, RowValue(0, 0));
+  }
+
+ private:
+  std::shared_ptr<DB> db_;
+  WriteOptions write_option_;
+  ReadOptions get_option_;
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
+};
+
+class TestCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit TestCompactionFilterFactory(bool purge_ttl_on_expiration,
+                                       int32_t gc_grace_period_in_seconds)
+      : purge_ttl_on_expiration_(purge_ttl_on_expiration),
+        gc_grace_period_in_seconds_(gc_grace_period_in_seconds) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>(new CassandraCompactionFilter(
+        purge_ttl_on_expiration_, gc_grace_period_in_seconds_));
+  }
+
+  const char* Name() const override { return "TestCompactionFilterFactory"; }
+
+ private:
+  bool purge_ttl_on_expiration_;
+  int32_t gc_grace_period_in_seconds_;
+};
+
+// The class for unit-testing
+class CassandraFunctionalTest : public testing::Test {
+ public:
+  CassandraFunctionalTest() {
+    EXPECT_OK(
+        DestroyDB(kDbName, Options()));  // Start each test with a fresh DB
+  }
+
+  std::shared_ptr<DB> OpenDb() {
+    DB* db;
+    Options options;
+    options.create_if_missing = true;
+    options.merge_operator.reset(
+        new CassandraValueMergeOperator(gc_grace_period_in_seconds_));
+    auto* cf_factory = new TestCompactionFilterFactory(
+        purge_ttl_on_expiration_, gc_grace_period_in_seconds_);
+    options.compaction_filter_factory.reset(cf_factory);
+    EXPECT_OK(DB::Open(options, kDbName, &db));
+    return std::shared_ptr<DB>(db);
+  }
+
+  bool purge_ttl_on_expiration_ = false;
+  int32_t gc_grace_period_in_seconds_ = 100;
+};
+
+// THE TEST CASES BEGIN HERE
+
+TEST_F(CassandraFunctionalTest, SimpleMergeTest) {
+  CassandraStore store(OpenDb());
+  int64_t now = time(nullptr);
+
+  store.Append(
+      "k1",
+      CreateTestRowValue({
+          CreateTestColumnSpec(kTombstone, 0, ToMicroSeconds(now + 5)),
+          CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now + 8)),
+          CreateTestColumnSpec(kExpiringColumn, 2, ToMicroSeconds(now + 5)),
+      }));
+  store.Append(
+      "k1",
+      CreateTestRowValue({
+          CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now + 2)),
+          CreateTestColumnSpec(kExpiringColumn, 1, ToMicroSeconds(now + 5)),
+          CreateTestColumnSpec(kTombstone, 2, ToMicroSeconds(now + 7)),
+          CreateTestColumnSpec(kExpiringColumn, 7, ToMicroSeconds(now + 17)),
+      }));
+  store.Append(
+      "k1",
+      CreateTestRowValue({
+          CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now + 6)),
+          CreateTestColumnSpec(kTombstone, 1, ToMicroSeconds(now + 5)),
+          CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now + 4)),
+          CreateTestColumnSpec(kTombstone, 11, ToMicroSeconds(now + 11)),
+      }));
+
+  auto ret = store.Get("k1");
+
+  ASSERT_TRUE(std::get<0>(ret));
+  RowValue& merged = std::get<1>(ret);
+  EXPECT_EQ(merged.get_columns().size(), 5);
+  VerifyRowValueColumns(merged.get_columns(), 0, kExpiringColumn, 0,
+                        ToMicroSeconds(now + 6));
+  VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 1,
+                        ToMicroSeconds(now + 8));
+  VerifyRowValueColumns(merged.get_columns(), 2, kTombstone, 2,
+                        ToMicroSeconds(now + 7));
+  VerifyRowValueColumns(merged.get_columns(), 3, kExpiringColumn, 7,
+                        ToMicroSeconds(now + 17));
+  VerifyRowValueColumns(merged.get_columns(), 4, kTombstone, 11,
+                        ToMicroSeconds(now + 11));
+}
+
+constexpr int64_t kTestTimeoutSecs = 600;
+
+TEST_F(CassandraFunctionalTest,
+       CompactionShouldConvertExpiredColumnsToTombstone) {
+  CassandraStore store(OpenDb());
+  int64_t now = time(nullptr);
+
+  store.Append(
+      "k1",
+      CreateTestRowValue(
+          {CreateTestColumnSpec(kExpiringColumn, 0,
+                                ToMicroSeconds(now - kTtl - 20)),  // expired
+           CreateTestColumnSpec(
+               kExpiringColumn, 1,
+               ToMicroSeconds(now - kTtl + kTestTimeoutSecs)),  // not expired
+           CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))}));
+
+  ASSERT_OK(store.Flush());
+
+  store.Append(
+      "k1",
+      CreateTestRowValue(
+          {CreateTestColumnSpec(kExpiringColumn, 0,
+                                ToMicroSeconds(now - kTtl - 10)),  // expired
+           CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))}));
+
+  ASSERT_OK(store.Flush());
+  ASSERT_OK(store.Compact());
+
+  auto ret = store.Get("k1");
+  ASSERT_TRUE(std::get<0>(ret));
+  RowValue& merged = std::get<1>(ret);
+  EXPECT_EQ(merged.get_columns().size(), 4);
+  VerifyRowValueColumns(merged.get_columns(), 0, kTombstone, 0,
+                        ToMicroSeconds(now - 10));
+  VerifyRowValueColumns(merged.get_columns(), 1, kExpiringColumn, 1,
+                        ToMicroSeconds(now - kTtl + kTestTimeoutSecs));
+  VerifyRowValueColumns(merged.get_columns(), 2, kColumn, 2,
+                        ToMicroSeconds(now));
+  VerifyRowValueColumns(merged.get_columns(), 3, kTombstone, 3,
+                        ToMicroSeconds(now));
+}
+
+TEST_F(CassandraFunctionalTest,
+       CompactionShouldPurgeExpiredColumnsIfPurgeTtlIsOn) {
+  purge_ttl_on_expiration_ = true;
+  CassandraStore store(OpenDb());
+  int64_t now = time(nullptr);
+
+  store.Append(
+      "k1",
+      CreateTestRowValue(
+          {CreateTestColumnSpec(kExpiringColumn, 0,
+                                ToMicroSeconds(now - kTtl - 20)),  // expired
+           CreateTestColumnSpec(kExpiringColumn, 1,
+                                ToMicroSeconds(now)),  // not expired
+           CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))}));
+
+  ASSERT_OK(store.Flush());
+
+  store.Append(
+      "k1",
+      CreateTestRowValue(
+          {CreateTestColumnSpec(kExpiringColumn, 0,
+                                ToMicroSeconds(now - kTtl - 10)),  // expired
+           CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))}));
+
+  ASSERT_OK(store.Flush());
+  ASSERT_OK(store.Compact());
+
+  auto ret = store.Get("k1");
+  ASSERT_TRUE(std::get<0>(ret));
+  RowValue& merged = std::get<1>(ret);
+  EXPECT_EQ(merged.get_columns().size(), 3);
+  VerifyRowValueColumns(merged.get_columns(), 0, kExpiringColumn, 1,
+                        ToMicroSeconds(now));
+  VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 2,
+                        ToMicroSeconds(now));
+  VerifyRowValueColumns(merged.get_columns(), 2, kTombstone, 3,
+                        ToMicroSeconds(now));
+}
+
+TEST_F(CassandraFunctionalTest,
+       CompactionShouldRemoveRowWhenAllColumnsExpiredIfPurgeTtlIsOn) {
+  purge_ttl_on_expiration_ = true;
+  CassandraStore store(OpenDb());
+  int64_t now = time(nullptr);
+
+  store.Append("k1", CreateTestRowValue({
+                         CreateTestColumnSpec(kExpiringColumn, 0,
+                                              ToMicroSeconds(now - kTtl - 20)),
+                         CreateTestColumnSpec(kExpiringColumn, 1,
+                                              ToMicroSeconds(now - kTtl - 20)),
+                     }));
+
+  ASSERT_OK(store.Flush());
+
+  store.Append("k1", CreateTestRowValue({
+                         CreateTestColumnSpec(kExpiringColumn, 0,
+                                              ToMicroSeconds(now - kTtl - 10)),
+                     }));
+
+  ASSERT_OK(store.Flush());
+  ASSERT_OK(store.Compact());
+  ASSERT_FALSE(std::get<0>(store.Get("k1")));
+}
+
+TEST_F(CassandraFunctionalTest,
+       CompactionShouldRemoveTombstoneExceedingGCGracePeriod) {
+  purge_ttl_on_expiration_ = true;
+  CassandraStore store(OpenDb());
+  int64_t now = time(nullptr);
+
+  store.Append("k1",
+               CreateTestRowValue(
+                   {CreateTestColumnSpec(
+                        kTombstone, 0,
+                        ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)),
+                    CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now))}));
+
+  store.Append("k2", CreateTestRowValue({CreateTestColumnSpec(
+                         kColumn, 0, ToMicroSeconds(now))}));
+
+  ASSERT_OK(store.Flush());
+
+  store.Append("k1", CreateTestRowValue({
+                         CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now)),
+                     }));
+
+  ASSERT_OK(store.Flush());
+  ASSERT_OK(store.Compact());
+
+  auto ret = store.Get("k1");
+  ASSERT_TRUE(std::get<0>(ret));
+  RowValue& gced = std::get<1>(ret);
+  EXPECT_EQ(gced.get_columns().size(), 1);
+  VerifyRowValueColumns(gced.get_columns(), 0, kColumn, 1, ToMicroSeconds(now));
+}
+
+TEST_F(CassandraFunctionalTest, CompactionShouldRemoveTombstoneFromPut) {
+  purge_ttl_on_expiration_ = true;
+  CassandraStore store(OpenDb());
+  int64_t now = time(nullptr);
+
+  store.Put("k1",
+            CreateTestRowValue({
+                CreateTestColumnSpec(
+                    kTombstone, 0,
+                    ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)),
+            }));
+
+  ASSERT_OK(store.Flush());
+  ASSERT_OK(store.Compact());
+  ASSERT_FALSE(std::get<0>(store.Get("k1")));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(CassandraFunctionalTest, LoadMergeOperator) {
+  ConfigOptions config_options;
+  std::shared_ptr<MergeOperator> mo;
+  config_options.ignore_unsupported_options = false;
+
+  ASSERT_NOK(MergeOperator::CreateFromString(
+      config_options, CassandraValueMergeOperator::kClassName(), &mo));
+
+  config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects,
+                                      "cassandra");
+
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options, CassandraValueMergeOperator::kClassName(), &mo));
+  ASSERT_NE(mo, nullptr);
+  ASSERT_STREQ(mo->Name(), CassandraValueMergeOperator::kClassName());
+  mo.reset();
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options,
+      std::string("operands_limit=20;gc_grace_period_in_seconds=42;id=") +
+          CassandraValueMergeOperator::kClassName(),
+      &mo));
+  ASSERT_NE(mo, nullptr);
+  ASSERT_STREQ(mo->Name(), CassandraValueMergeOperator::kClassName());
+  const auto* opts = mo->GetOptions<CassandraOptions>();
+  ASSERT_NE(opts, nullptr);
+  ASSERT_EQ(opts->gc_grace_period_in_seconds, 42);
+  ASSERT_EQ(opts->operands_limit, 20);
+}
+
+TEST_F(CassandraFunctionalTest, LoadCompactionFilter) {
+  ConfigOptions config_options;
+  const CompactionFilter* filter = nullptr;
+  config_options.ignore_unsupported_options = false;
+
+  ASSERT_NOK(CompactionFilter::CreateFromString(
+      config_options, CassandraCompactionFilter::kClassName(), &filter));
+  config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects,
+                                      "cassandra");
+
+  ASSERT_OK(CompactionFilter::CreateFromString(
+      config_options, CassandraCompactionFilter::kClassName(), &filter));
+  ASSERT_NE(filter, nullptr);
+  ASSERT_STREQ(filter->Name(), CassandraCompactionFilter::kClassName());
+  delete filter;
+  filter = nullptr;
+  ASSERT_OK(CompactionFilter::CreateFromString(
+      config_options,
+      std::string(
+          "purge_ttl_on_expiration=true;gc_grace_period_in_seconds=42;id=") +
+          CassandraCompactionFilter::kClassName(),
+      &filter));
+  ASSERT_NE(filter, nullptr);
+  ASSERT_STREQ(filter->Name(), CassandraCompactionFilter::kClassName());
+  const auto* opts = filter->GetOptions<CassandraOptions>();
+  ASSERT_NE(opts, nullptr);
+  ASSERT_EQ(opts->gc_grace_period_in_seconds, 42);
+  ASSERT_TRUE(opts->purge_ttl_on_expiration);
+  delete filter;
+}
+
+TEST_F(CassandraFunctionalTest, LoadCompactionFilterFactory) {
+  ConfigOptions config_options;
+  std::shared_ptr<CompactionFilterFactory> factory;
+
+  config_options.ignore_unsupported_options = false;
+  ASSERT_NOK(CompactionFilterFactory::CreateFromString(
+      config_options, CassandraCompactionFilterFactory::kClassName(),
+      &factory));
+  config_options.registry->AddLibrary("cassandra", RegisterCassandraObjects,
+                                      "cassandra");
+
+  ASSERT_OK(CompactionFilterFactory::CreateFromString(
+      config_options, CassandraCompactionFilterFactory::kClassName(),
+      &factory));
+  ASSERT_NE(factory, nullptr);
+  ASSERT_STREQ(factory->Name(), CassandraCompactionFilterFactory::kClassName());
+  factory.reset();
+  ASSERT_OK(CompactionFilterFactory::CreateFromString(
+      config_options,
+      std::string(
+          "purge_ttl_on_expiration=true;gc_grace_period_in_seconds=42;id=") +
+          CassandraCompactionFilterFactory::kClassName(),
+      &factory));
+  ASSERT_NE(factory, nullptr);
+  ASSERT_STREQ(factory->Name(), CassandraCompactionFilterFactory::kClassName());
+  const auto* opts = factory->GetOptions<CassandraOptions>();
+  ASSERT_NE(opts, nullptr);
+  ASSERT_EQ(opts->gc_grace_period_in_seconds, 42);
+  ASSERT_TRUE(opts->purge_ttl_on_expiration);
+}
+#endif  // ROCKSDB_LITE
+
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/cassandra/cassandra_options.h b/src/rocksdb/utilities/cassandra/cassandra_options.h
new file mode 100644
index 000000000..efa73a308
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_options.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class ObjectLibrary;
+namespace cassandra {
+struct CassandraOptions {
+  static const char* kName() { return "CassandraOptions"; }
+  CassandraOptions(int32_t _gc_grace_period_in_seconds, size_t _operands_limit,
+                   bool _purge_ttl_on_expiration = false)
+      : operands_limit(_operands_limit),
+        gc_grace_period_in_seconds(_gc_grace_period_in_seconds),
+        purge_ttl_on_expiration(_purge_ttl_on_expiration) {}
+  // Limit on the number of merge operands.
+  size_t operands_limit;
+
+  // How long (in seconds) tombstoned data remains before it is purged
+  int32_t gc_grace_period_in_seconds;
+
+  // If is set to true, expired data will be directly purged.
+  // Otherwise expired data will be converted tombstones first,
+  // then be eventually removed after gc grace period. This value should
+  // only true if all writes have same ttl setting, otherwise it could bring old
+  // data back.
+  bool purge_ttl_on_expiration;
+};
+#ifndef ROCKSDB_LITE
+extern "C" {
+int RegisterCassandraObjects(ObjectLibrary& library, const std::string& arg);
+}  // extern "C"
+#endif  // ROCKSDB_LITE
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc b/src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc
new file mode 100644
index 000000000..0b4a89287
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc
@@ -0,0 +1,98 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+
+#include "test_util/testharness.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/cassandra/test_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+class RowValueMergeTest : public testing::Test {};
+
+TEST(RowValueMergeTest, Merge) {
+  std::vector<RowValue> row_values;
+  row_values.push_back(CreateTestRowValue({
+      CreateTestColumnSpec(kTombstone, 0, 5),
+      CreateTestColumnSpec(kColumn, 1, 8),
+      CreateTestColumnSpec(kExpiringColumn, 2, 5),
+  }));
+
+  row_values.push_back(CreateTestRowValue({
+      CreateTestColumnSpec(kColumn, 0, 2),
+      CreateTestColumnSpec(kExpiringColumn, 1, 5),
+      CreateTestColumnSpec(kTombstone, 2, 7),
+      CreateTestColumnSpec(kExpiringColumn, 7, 17),
+  }));
+
+  row_values.push_back(CreateTestRowValue({
+      CreateTestColumnSpec(kExpiringColumn, 0, 6),
+      CreateTestColumnSpec(kTombstone, 1, 5),
+      CreateTestColumnSpec(kColumn, 2, 4),
+      CreateTestColumnSpec(kTombstone, 11, 11),
+  }));
+
+  RowValue merged = RowValue::Merge(std::move(row_values));
+  EXPECT_FALSE(merged.IsTombstone());
+  EXPECT_EQ(merged.get_columns().size(), 5);
+  VerifyRowValueColumns(merged.get_columns(), 0, kExpiringColumn, 0, 6);
+  VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 1, 8);
+  VerifyRowValueColumns(merged.get_columns(), 2, kTombstone, 2, 7);
+  VerifyRowValueColumns(merged.get_columns(), 3, kExpiringColumn, 7, 17);
+  VerifyRowValueColumns(merged.get_columns(), 4, kTombstone, 11, 11);
+}
+
+TEST(RowValueMergeTest, MergeWithRowTombstone) {
+  std::vector<RowValue> row_values;
+
+  // A row tombstone.
+  row_values.push_back(CreateRowTombstone(11));
+
+  // This row's timestamp is smaller than tombstone.
+  row_values.push_back(CreateTestRowValue({
+      CreateTestColumnSpec(kColumn, 0, 5),
+      CreateTestColumnSpec(kColumn, 1, 6),
+  }));
+
+  // Some of the column's row is smaller, some is larger.
+  row_values.push_back(CreateTestRowValue({
+      CreateTestColumnSpec(kColumn, 2, 10),
+      CreateTestColumnSpec(kColumn, 3, 12),
+  }));
+
+  // All of the column's rows are larger than tombstone.
+  row_values.push_back(CreateTestRowValue({
+      CreateTestColumnSpec(kColumn, 4, 13),
+      CreateTestColumnSpec(kColumn, 5, 14),
+  }));
+
+  RowValue merged = RowValue::Merge(std::move(row_values));
+  EXPECT_FALSE(merged.IsTombstone());
+  EXPECT_EQ(merged.get_columns().size(), 3);
+  VerifyRowValueColumns(merged.get_columns(), 0, kColumn, 3, 12);
+  VerifyRowValueColumns(merged.get_columns(), 1, kColumn, 4, 13);
+  VerifyRowValueColumns(merged.get_columns(), 2, kColumn, 5, 14);
+
+  // If the tombstone's timestamp is the latest, then it returns a
+  // row tombstone.
+  row_values.push_back(CreateRowTombstone(15));
+
+  row_values.push_back(CreateRowTombstone(17));
+
+  merged = RowValue::Merge(std::move(row_values));
+  EXPECT_TRUE(merged.IsTombstone());
+  EXPECT_EQ(merged.LastModifiedTime(), 17);
+}
+
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc b/src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc
new file mode 100644
index 000000000..c14d8fd80
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/cassandra_serialize_test.cc
@@ -0,0 +1,164 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+#include "utilities/cassandra/serialize.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+TEST(SerializeTest, SerializeI64) {
+  std::string dest;
+  Serialize<int64_t>(0, &dest);
+  EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+                         '\x00'}),
+            dest);
+
+  dest.clear();
+  Serialize<int64_t>(1, &dest);
+  EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+                         '\x01'}),
+            dest);
+
+  dest.clear();
+  Serialize<int64_t>(-1, &dest);
+  EXPECT_EQ(std::string({'\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff',
+                         '\xff'}),
+            dest);
+
+  dest.clear();
+  Serialize<int64_t>(9223372036854775807, &dest);
+  EXPECT_EQ(std::string({'\x7f', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff',
+                         '\xff'}),
+            dest);
+
+  dest.clear();
+  Serialize<int64_t>(-9223372036854775807, &dest);
+  EXPECT_EQ(std::string({'\x80', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+                         '\x01'}),
+            dest);
+}
+
+TEST(SerializeTest, DeserializeI64) {
+  std::string dest;
+  std::size_t offset = dest.size();
+  Serialize<int64_t>(0, &dest);
+  EXPECT_EQ(0, Deserialize<int64_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int64_t>(1, &dest);
+  EXPECT_EQ(1, Deserialize<int64_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int64_t>(-1, &dest);
+  EXPECT_EQ(-1, Deserialize<int64_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int64_t>(-9223372036854775807, &dest);
+  EXPECT_EQ(-9223372036854775807, Deserialize<int64_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int64_t>(9223372036854775807, &dest);
+  EXPECT_EQ(9223372036854775807, Deserialize<int64_t>(dest.c_str(), offset));
+}
+
+TEST(SerializeTest, SerializeI32) {
+  std::string dest;
+  Serialize<int32_t>(0, &dest);
+  EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00'}), dest);
+
+  dest.clear();
+  Serialize<int32_t>(1, &dest);
+  EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x01'}), dest);
+
+  dest.clear();
+  Serialize<int32_t>(-1, &dest);
+  EXPECT_EQ(std::string({'\xff', '\xff', '\xff', '\xff'}), dest);
+
+  dest.clear();
+  Serialize<int32_t>(2147483647, &dest);
+  EXPECT_EQ(std::string({'\x7f', '\xff', '\xff', '\xff'}), dest);
+
+  dest.clear();
+  Serialize<int32_t>(-2147483648LL, &dest);
+  EXPECT_EQ(std::string({'\x80', '\x00', '\x00', '\x00'}), dest);
+}
+
+TEST(SerializeTest, DeserializeI32) {
+  std::string dest;
+  std::size_t offset = dest.size();
+  Serialize<int32_t>(0, &dest);
+  EXPECT_EQ(0, Deserialize<int32_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int32_t>(1, &dest);
+  EXPECT_EQ(1, Deserialize<int32_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int32_t>(-1, &dest);
+  EXPECT_EQ(-1, Deserialize<int32_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int32_t>(2147483647, &dest);
+  EXPECT_EQ(2147483647, Deserialize<int32_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int32_t>(-2147483648LL, &dest);
+  EXPECT_EQ(-2147483648LL, Deserialize<int32_t>(dest.c_str(), offset));
+}
+
+TEST(SerializeTest, SerializeI8) {
+  std::string dest;
+  Serialize<int8_t>(0, &dest);
+  EXPECT_EQ(std::string({'\x00'}), dest);
+
+  dest.clear();
+  Serialize<int8_t>(1, &dest);
+  EXPECT_EQ(std::string({'\x01'}), dest);
+
+  dest.clear();
+  Serialize<int8_t>(-1, &dest);
+  EXPECT_EQ(std::string({'\xff'}), dest);
+
+  dest.clear();
+  Serialize<int8_t>(127, &dest);
+  EXPECT_EQ(std::string({'\x7f'}), dest);
+
+  dest.clear();
+  Serialize<int8_t>(-128, &dest);
+  EXPECT_EQ(std::string({'\x80'}), dest);
+}
+
+TEST(SerializeTest, DeserializeI8) {
+  std::string dest;
+  std::size_t offset = dest.size();
+  Serialize<int8_t>(0, &dest);
+  EXPECT_EQ(0, Deserialize<int8_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int8_t>(1, &dest);
+  EXPECT_EQ(1, Deserialize<int8_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int8_t>(-1, &dest);
+  EXPECT_EQ(-1, Deserialize<int8_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int8_t>(127, &dest);
+  EXPECT_EQ(127, Deserialize<int8_t>(dest.c_str(), offset));
+
+  offset = dest.size();
+  Serialize<int8_t>(-128, &dest);
+  EXPECT_EQ(-128, Deserialize<int8_t>(dest.c_str(), offset));
+}
+
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/cassandra/format.cc b/src/rocksdb/utilities/cassandra/format.cc
new file mode 100644
index 000000000..cc1dd2f28
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/format.cc
@@ -0,0 +1,367 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "format.h"
+
+#include <algorithm>
+#include <map>
+#include <memory>
+
+#include "utilities/cassandra/serialize.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+namespace {
+const int32_t kDefaultLocalDeletionTime = std::numeric_limits<int32_t>::max();
+const int64_t kDefaultMarkedForDeleteAt = std::numeric_limits<int64_t>::min();
+}  // namespace
+
+ColumnBase::ColumnBase(int8_t mask, int8_t index)
+    : mask_(mask), index_(index) {}
+
+std::size_t ColumnBase::Size() const { return sizeof(mask_) + sizeof(index_); }
+
+int8_t ColumnBase::Mask() const { return mask_; }
+
+int8_t ColumnBase::Index() const { return index_; }
+
+void ColumnBase::Serialize(std::string* dest) const {
+  ROCKSDB_NAMESPACE::cassandra::Serialize<int8_t>(mask_, dest);
+  ROCKSDB_NAMESPACE::cassandra::Serialize<int8_t>(index_, dest);
+}
+
+std::shared_ptr<ColumnBase> ColumnBase::Deserialize(const char* src,
+                                                    std::size_t offset) {
+  int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+  if ((mask & ColumnTypeMask::DELETION_MASK) != 0) {
+    return Tombstone::Deserialize(src, offset);
+  } else if ((mask & ColumnTypeMask::EXPIRATION_MASK) != 0) {
+    return ExpiringColumn::Deserialize(src, offset);
+  } else {
+    return Column::Deserialize(src, offset);
+  }
+}
+
+Column::Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size,
+               const char* value)
+    : ColumnBase(mask, index),
+      timestamp_(timestamp),
+      value_size_(value_size),
+      value_(value) {}
+
+int64_t Column::Timestamp() const { return timestamp_; }
+
+std::size_t Column::Size() const {
+  return ColumnBase::Size() + sizeof(timestamp_) + sizeof(value_size_) +
+         value_size_;
+}
+
+void Column::Serialize(std::string* dest) const {
+  ColumnBase::Serialize(dest);
+  ROCKSDB_NAMESPACE::cassandra::Serialize<int64_t>(timestamp_, dest);
+  ROCKSDB_NAMESPACE::cassandra::Serialize<int32_t>(value_size_, dest);
+  dest->append(value_, value_size_);
+}
+
+std::shared_ptr<Column> Column::Deserialize(const char* src,
+                                            std::size_t offset) {
+  int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+  offset += sizeof(mask);
+  int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+  offset += sizeof(index);
+  int64_t timestamp =
+      ROCKSDB_NAMESPACE::cassandra::Deserialize<int64_t>(src, offset);
+  offset += sizeof(timestamp);
+  int32_t value_size =
+      ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+  offset += sizeof(value_size);
+  return std::make_shared<Column>(mask, index, timestamp, value_size,
+                                  src + offset);
+}
+
+ExpiringColumn::ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp,
+                               int32_t value_size, const char* value,
+                               int32_t ttl)
+    : Column(mask, index, timestamp, value_size, value), ttl_(ttl) {}
+
+std::size_t ExpiringColumn::Size() const {
+  return Column::Size() + sizeof(ttl_);
+}
+
+void ExpiringColumn::Serialize(std::string* dest) const {
+  Column::Serialize(dest);
+  ROCKSDB_NAMESPACE::cassandra::Serialize<int32_t>(ttl_, dest);
+}
+
+std::chrono::time_point<std::chrono::system_clock> ExpiringColumn::TimePoint()
+    const {
+  return std::chrono::time_point<std::chrono::system_clock>(
+      std::chrono::microseconds(Timestamp()));
+}
+
+std::chrono::seconds ExpiringColumn::Ttl() const {
+  return std::chrono::seconds(ttl_);
+}
+
+bool ExpiringColumn::Expired() const {
+  return TimePoint() + Ttl() < std::chrono::system_clock::now();
+}
+
+std::shared_ptr<Tombstone> ExpiringColumn::ToTombstone() const {
+  auto expired_at = (TimePoint() + Ttl()).time_since_epoch();
+  int32_t local_deletion_time = static_cast<int32_t>(
+      std::chrono::duration_cast<std::chrono::seconds>(expired_at).count());
+  int64_t marked_for_delete_at =
+      std::chrono::duration_cast<std::chrono::microseconds>(expired_at).count();
+  return std::make_shared<Tombstone>(
+      static_cast<int8_t>(ColumnTypeMask::DELETION_MASK), Index(),
+      local_deletion_time, marked_for_delete_at);
+}
+
+std::shared_ptr<ExpiringColumn> ExpiringColumn::Deserialize(
+    const char* src, std::size_t offset) {
+  int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+  offset += sizeof(mask);
+  int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+  offset += sizeof(index);
+  int64_t timestamp =
+      ROCKSDB_NAMESPACE::cassandra::Deserialize<int64_t>(src, offset);
+  offset += sizeof(timestamp);
+  int32_t value_size =
+      ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+  offset += sizeof(value_size);
+  const char* value = src + offset;
+  offset += value_size;
+  int32_t ttl = ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+  return std::make_shared<ExpiringColumn>(mask, index, timestamp, value_size,
+                                          value, ttl);
+}
+
+Tombstone::Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time,
+                     int64_t marked_for_delete_at)
+    : ColumnBase(mask, index),
+      local_deletion_time_(local_deletion_time),
+      marked_for_delete_at_(marked_for_delete_at) {}
+
+int64_t Tombstone::Timestamp() const { return marked_for_delete_at_; }
+
+std::size_t Tombstone::Size() const {
+  return ColumnBase::Size() + sizeof(local_deletion_time_) +
+         sizeof(marked_for_delete_at_);
+}
+
+void Tombstone::Serialize(std::string* dest) const {
+  ColumnBase::Serialize(dest);
+  ROCKSDB_NAMESPACE::cassandra::Serialize<int32_t>(local_deletion_time_, dest);
+  ROCKSDB_NAMESPACE::cassandra::Serialize<int64_t>(marked_for_delete_at_, dest);
+}
+
+bool Tombstone::Collectable(int32_t gc_grace_period_in_seconds) const {
+  auto local_deleted_at = std::chrono::time_point<std::chrono::system_clock>(
+      std::chrono::seconds(local_deletion_time_));
+  auto gc_grace_period = std::chrono::seconds(gc_grace_period_in_seconds);
+  return local_deleted_at + gc_grace_period < std::chrono::system_clock::now();
+}
+
+std::shared_ptr<Tombstone> Tombstone::Deserialize(const char* src,
+                                                  std::size_t offset) {
+  int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+  offset += sizeof(mask);
+  int8_t index = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
+  offset += sizeof(index);
+  int32_t local_deletion_time =
+      ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+  offset += sizeof(int32_t);
+  int64_t marked_for_delete_at =
+      ROCKSDB_NAMESPACE::cassandra::Deserialize<int64_t>(src, offset);
+  return std::make_shared<Tombstone>(mask, index, local_deletion_time,
+                                     marked_for_delete_at);
+}
+
+RowValue::RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at)
+    : local_deletion_time_(local_deletion_time),
+      marked_for_delete_at_(marked_for_delete_at),
+      columns_(),
+      last_modified_time_(0) {}
+
+RowValue::RowValue(Columns columns, int64_t last_modified_time)
+    : local_deletion_time_(kDefaultLocalDeletionTime),
+      marked_for_delete_at_(kDefaultMarkedForDeleteAt),
+      columns_(std::move(columns)),
+      last_modified_time_(last_modified_time) {}
+
+std::size_t RowValue::Size() const {
+  std::size_t size =
+      sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_);
+  for (const auto& column : columns_) {
+    size += column->Size();
+  }
+  return size;
+}
+
+int64_t RowValue::LastModifiedTime() const {
+  if (IsTombstone()) {
+    return marked_for_delete_at_;
+  } else {
+    return last_modified_time_;
+  }
+}
+
+bool RowValue::IsTombstone() const {
+  return marked_for_delete_at_ > kDefaultMarkedForDeleteAt;
+}
+
+void RowValue::Serialize(std::string* dest) const {
+  ROCKSDB_NAMESPACE::cassandra::Serialize<int32_t>(local_deletion_time_, dest);
+  ROCKSDB_NAMESPACE::cassandra::Serialize<int64_t>(marked_for_delete_at_, dest);
+  for (const auto& column : columns_) {
+    column->Serialize(dest);
+  }
+}
+
+RowValue RowValue::RemoveExpiredColumns(bool* changed) const {
+  *changed = false;
+  Columns new_columns;
+  for (auto& column : columns_) {
+    if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) {
+      std::shared_ptr<ExpiringColumn> expiring_column =
+          std::static_pointer_cast<ExpiringColumn>(column);
+
+      if (expiring_column->Expired()) {
+        *changed = true;
+        continue;
+      }
+    }
+
+    new_columns.push_back(column);
+  }
+  return RowValue(std::move(new_columns), last_modified_time_);
+}
+
+RowValue RowValue::ConvertExpiredColumnsToTombstones(bool* changed) const {
+  *changed = false;
+  Columns new_columns;
+  for (auto& column : columns_) {
+    if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) {
+      std::shared_ptr<ExpiringColumn> expiring_column =
+          std::static_pointer_cast<ExpiringColumn>(column);
+
+      if (expiring_column->Expired()) {
+        std::shared_ptr<Tombstone> tombstone = expiring_column->ToTombstone();
+        new_columns.push_back(tombstone);
+        *changed = true;
+        continue;
+      }
+    }
+    new_columns.push_back(column);
+  }
+  return RowValue(std::move(new_columns), last_modified_time_);
+}
+
+RowValue RowValue::RemoveTombstones(int32_t gc_grace_period) const {
+  Columns new_columns;
+  for (auto& column : columns_) {
+    if (column->Mask() == ColumnTypeMask::DELETION_MASK) {
+      std::shared_ptr<Tombstone> tombstone =
+          std::static_pointer_cast<Tombstone>(column);
+
+      if (tombstone->Collectable(gc_grace_period)) {
+        continue;
+      }
+    }
+
+    new_columns.push_back(column);
+  }
+  return RowValue(std::move(new_columns), last_modified_time_);
+}
+
+bool RowValue::Empty() const { return columns_.empty(); }
+
+RowValue RowValue::Deserialize(const char* src, std::size_t size) {
+  std::size_t offset = 0;
+  assert(size >= sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_));
+  int32_t local_deletion_time =
+      ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
+  offset += sizeof(int32_t);
+  int64_t marked_for_delete_at =
+      ROCKSDB_NAMESPACE::cassandra::Deserialize<int64_t>(src, offset);
+  offset += sizeof(int64_t);
+  if (offset == size) {
+    return RowValue(local_deletion_time, marked_for_delete_at);
+  }
+
+  assert(local_deletion_time == kDefaultLocalDeletionTime);
+  assert(marked_for_delete_at == kDefaultMarkedForDeleteAt);
+  Columns columns;
+  int64_t last_modified_time = 0;
+  while (offset < size) {
+    auto c = ColumnBase::Deserialize(src, offset);
+    offset += c->Size();
+    assert(offset <= size);
+    last_modified_time = std::max(last_modified_time, c->Timestamp());
+    columns.push_back(std::move(c));
+  }
+
+  return RowValue(std::move(columns), last_modified_time);
+}
+
+// Merge multiple row values into one.
+// For each column in rows with same index, we pick the one with latest
+// timestamp. And we also take row tombstone into consideration, by iterating
+// each row from reverse timestamp order, and stop once we hit the first
+// row tombstone.
+RowValue RowValue::Merge(std::vector<RowValue>&& values) {
+  assert(values.size() > 0);
+  if (values.size() == 1) {
+    return std::move(values[0]);
+  }
+
+  // Merge columns by their last modified time, and skip once we hit
+  // a row tombstone.
+  std::sort(values.begin(), values.end(),
+            [](const RowValue& r1, const RowValue& r2) {
+              return r1.LastModifiedTime() > r2.LastModifiedTime();
+            });
+
+  std::map<int8_t, std::shared_ptr<ColumnBase>> merged_columns;
+  int64_t tombstone_timestamp = 0;
+
+  for (auto& value : values) {
+    if (value.IsTombstone()) {
+      if (merged_columns.size() == 0) {
+        return std::move(value);
+      }
+      tombstone_timestamp = value.LastModifiedTime();
+      break;
+    }
+    for (auto& column : value.columns_) {
+      int8_t index = column->Index();
+      if (merged_columns.find(index) == merged_columns.end()) {
+        merged_columns[index] = column;
+      } else {
+        if (column->Timestamp() > merged_columns[index]->Timestamp()) {
+          merged_columns[index] = column;
+        }
+      }
+    }
+  }
+
+  int64_t last_modified_time = 0;
+  Columns columns;
+  for (auto& pair : merged_columns) {
+    // For some row, its last_modified_time > row tombstone_timestamp, but
+    // it might have rows whose timestamp is ealier than tombstone, so we
+    // ned to filter these rows.
+    if (pair.second->Timestamp() <= tombstone_timestamp) {
+      continue;
+    }
+    last_modified_time = std::max(last_modified_time, pair.second->Timestamp());
+    columns.push_back(std::move(pair.second));
+  }
+  return RowValue(std::move(columns), last_modified_time);
+}
+
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/format.h b/src/rocksdb/utilities/cassandra/format.h
new file mode 100644
index 000000000..1b2714735
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/format.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+/**
+ * The encoding of Cassandra Row Value.
+ *
+ * A Cassandra Row Value could either be a row tombstone,
+ * or contains multiple columns, it has following fields:
+ *
+ * struct row_value {
+ *   int32_t local_deletion_time;  // Time in second when the row is deleted,
+ *                                 // only used for Cassandra tombstone gc.
+ *   int64_t marked_for_delete_at; // Ms that marked this row is deleted.
+ *   struct column_base columns[]; // For non tombstone row, all columns
+ *                                 // are stored here.
+ * }
+ *
+ * If the local_deletion_time and marked_for_delete_at is set, then this is
+ * a tombstone, otherwise it contains multiple columns.
+ *
+ * There are three type of Columns: Normal Column, Expiring Column and Column
+ * Tombstone, which have following fields:
+ *
+ * // Identify the type of the column.
+ * enum mask {
+ *   DELETION_MASK = 0x01,
+ *   EXPIRATION_MASK = 0x02,
+ * };
+ *
+ * struct column  {
+ *   int8_t mask = 0;
+ *   int8_t index;
+ *   int64_t timestamp;
+ *   int32_t value_length;
+ *   char value[value_length];
+ * }
+ *
+ * struct expiring_column  {
+ *   int8_t mask = mask.EXPIRATION_MASK;
+ *   int8_t index;
+ *   int64_t timestamp;
+ *   int32_t value_length;
+ *   char value[value_length];
+ *   int32_t ttl;
+ * }
+ *
+ * struct tombstone_column  {
+ *   int8_t mask = mask.DELETION_MASK;
+ *   int8_t index;
+ *   int32_t local_deletion_time; // Similar to row_value's field.
+ *   int64_t marked_for_delete_at;
+ *  }
+ */
+
+#pragma once
+#include <chrono>
+#include <memory>
+#include <vector>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+// Identify the type of the column.
+enum ColumnTypeMask {
+  DELETION_MASK = 0x01,
+  EXPIRATION_MASK = 0x02,
+};
+
+class ColumnBase {
+ public:
+  ColumnBase(int8_t mask, int8_t index);
+  virtual ~ColumnBase() = default;
+
+  virtual int64_t Timestamp() const = 0;
+  virtual int8_t Mask() const;
+  virtual int8_t Index() const;
+  virtual std::size_t Size() const;
+  virtual void Serialize(std::string* dest) const;
+  static std::shared_ptr<ColumnBase> Deserialize(const char* src,
+                                                 std::size_t offset);
+
+ private:
+  int8_t mask_;
+  int8_t index_;
+};
+
+class Column : public ColumnBase {
+ public:
+  Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size,
+         const char* value);
+
+  virtual int64_t Timestamp() const override;
+  virtual std::size_t Size() const override;
+  virtual void Serialize(std::string* dest) const override;
+  static std::shared_ptr<Column> Deserialize(const char* src,
+                                             std::size_t offset);
+
+ private:
+  int64_t timestamp_;
+  int32_t value_size_;
+  const char* value_;
+};
+
+class Tombstone : public ColumnBase {
+ public:
+  Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time,
+            int64_t marked_for_delete_at);
+
+  virtual int64_t Timestamp() const override;
+  virtual std::size_t Size() const override;
+  virtual void Serialize(std::string* dest) const override;
+  bool Collectable(int32_t gc_grace_period) const;
+  static std::shared_ptr<Tombstone> Deserialize(const char* src,
+                                                std::size_t offset);
+
+ private:
+  int32_t local_deletion_time_;
+  int64_t marked_for_delete_at_;
+};
+
+class ExpiringColumn : public Column {
+ public:
+  ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp,
+                 int32_t value_size, const char* value, int32_t ttl);
+
+  virtual std::size_t Size() const override;
+  virtual void Serialize(std::string* dest) const override;
+  bool Expired() const;
+  std::shared_ptr<Tombstone> ToTombstone() const;
+
+  static std::shared_ptr<ExpiringColumn> Deserialize(const char* src,
+                                                     std::size_t offset);
+
+ private:
+  int32_t ttl_;
+  std::chrono::time_point<std::chrono::system_clock> TimePoint() const;
+  std::chrono::seconds Ttl() const;
+};
+
+using Columns = std::vector<std::shared_ptr<ColumnBase>>;
+
+class RowValue {
+ public:
+  // Create a Row Tombstone.
+  RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at);
+  // Create a Row containing columns.
+  RowValue(Columns columns, int64_t last_modified_time);
+  RowValue(const RowValue& /*that*/) = delete;
+  RowValue(RowValue&& /*that*/) noexcept = default;
+  RowValue& operator=(const RowValue& /*that*/) = delete;
+  RowValue& operator=(RowValue&& /*that*/) = default;
+
+  std::size_t Size() const;
+  bool IsTombstone() const;
+  // For Tombstone this returns the marked_for_delete_at_,
+  // otherwise it returns the max timestamp of containing columns.
+  int64_t LastModifiedTime() const;
+  void Serialize(std::string* dest) const;
+  RowValue RemoveExpiredColumns(bool* changed) const;
+  RowValue ConvertExpiredColumnsToTombstones(bool* changed) const;
+  RowValue RemoveTombstones(int32_t gc_grace_period) const;
+  bool Empty() const;
+
+  static RowValue Deserialize(const char* src, std::size_t size);
+  // Merge multiple rows according to their timestamp.
+  static RowValue Merge(std::vector<RowValue>&& values);
+
+  const Columns& get_columns() { return columns_; }
+
+ private:
+  int32_t local_deletion_time_;
+  int64_t marked_for_delete_at_;
+  Columns columns_;
+  int64_t last_modified_time_;
+};
+
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/merge_operator.cc b/src/rocksdb/utilities/cassandra/merge_operator.cc
new file mode 100644
index 000000000..bde5dcbad
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/merge_operator.cc
@@ -0,0 +1,82 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "merge_operator.h"
+
+#include <assert.h>
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+static std::unordered_map<std::string, OptionTypeInfo>
+    merge_operator_options_info = {
+#ifndef ROCKSDB_LITE
+        {"gc_grace_period_in_seconds",
+         {offsetof(struct CassandraOptions, gc_grace_period_in_seconds),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"operands_limit",
+         {offsetof(struct CassandraOptions, operands_limit), OptionType::kSizeT,
+          OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+
+CassandraValueMergeOperator::CassandraValueMergeOperator(
+    int32_t gc_grace_period_in_seconds, size_t operands_limit)
+    : options_(gc_grace_period_in_seconds, operands_limit) {
+  RegisterOptions(&options_, &merge_operator_options_info);
+}
+
+// Implementation for the merge operation (merges two Cassandra values)
+bool CassandraValueMergeOperator::FullMergeV2(
+    const MergeOperationInput& merge_in,
+    MergeOperationOutput* merge_out) const {
+  // Clear the *new_value for writing.
+  merge_out->new_value.clear();
+  std::vector<RowValue> row_values;
+  if (merge_in.existing_value) {
+    row_values.push_back(RowValue::Deserialize(
+        merge_in.existing_value->data(), merge_in.existing_value->size()));
+  }
+
+  for (auto& operand : merge_in.operand_list) {
+    row_values.push_back(RowValue::Deserialize(operand.data(), operand.size()));
+  }
+
+  RowValue merged = RowValue::Merge(std::move(row_values));
+  merged = merged.RemoveTombstones(options_.gc_grace_period_in_seconds);
+  merge_out->new_value.reserve(merged.Size());
+  merged.Serialize(&(merge_out->new_value));
+
+  return true;
+}
+
+bool CassandraValueMergeOperator::PartialMergeMulti(
+    const Slice& /*key*/, const std::deque<Slice>& operand_list,
+    std::string* new_value, Logger* /*logger*/) const {
+  // Clear the *new_value for writing.
+  assert(new_value);
+  new_value->clear();
+
+  std::vector<RowValue> row_values;
+  for (auto& operand : operand_list) {
+    row_values.push_back(RowValue::Deserialize(operand.data(), operand.size()));
+  }
+  RowValue merged = RowValue::Merge(std::move(row_values));
+  new_value->reserve(merged.Size());
+  merged.Serialize(new_value);
+  return true;
+}
+
+}  // namespace cassandra
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/merge_operator.h b/src/rocksdb/utilities/cassandra/merge_operator.h
new file mode 100644
index 000000000..af8725db7
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/merge_operator.h
@@ -0,0 +1,44 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/cassandra/cassandra_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+
+/**
+ * A MergeOperator for rocksdb that implements Cassandra row value merge.
+ */
+class CassandraValueMergeOperator : public MergeOperator {
+ public:
+  explicit CassandraValueMergeOperator(int32_t gc_grace_period_in_seconds,
+                                       size_t operands_limit = 0);
+
+  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const override;
+
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value,
+                                 Logger* logger) const override;
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "CassandraValueMergeOperator"; }
+
+  virtual bool AllowSingleOperand() const override { return true; }
+
+  virtual bool ShouldMerge(const std::vector<Slice>& operands) const override {
+    return options_.operands_limit > 0 &&
+           operands.size() >= options_.operands_limit;
+  }
+
+ private:
+  CassandraOptions options_;
+};
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/serialize.h b/src/rocksdb/utilities/cassandra/serialize.h
new file mode 100644
index 000000000..4bd552bfc
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/serialize.h
@@ -0,0 +1,81 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+/**
+ * Helper functions which serialize and deserialize integers
+ * into bytes in big endian.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+namespace {
+const int64_t kCharMask = 0xFFLL;
+const int32_t kBitsPerByte = 8;
+}  // namespace
+
+template <typename T>
+void Serialize(T val, std::string* dest);
+
+template <typename T>
+T Deserialize(const char* src, std::size_t offset = 0);
+
+// Specializations
+template <>
+inline void Serialize<int8_t>(int8_t t, std::string* dest) {
+  dest->append(1, static_cast<char>(t & kCharMask));
+}
+
+template <>
+inline void Serialize<int32_t>(int32_t t, std::string* dest) {
+  for (unsigned long i = 0; i < sizeof(int32_t); i++) {
+    dest->append(
+        1, static_cast<char>((t >> (sizeof(int32_t) - 1 - i) * kBitsPerByte) &
+                             kCharMask));
+  }
+}
+
+template <>
+inline void Serialize<int64_t>(int64_t t, std::string* dest) {
+  for (unsigned long i = 0; i < sizeof(int64_t); i++) {
+    dest->append(
+        1, static_cast<char>((t >> (sizeof(int64_t) - 1 - i) * kBitsPerByte) &
+                             kCharMask));
+  }
+}
+
+template <>
+inline int8_t Deserialize<int8_t>(const char* src, std::size_t offset) {
+  return static_cast<int8_t>(src[offset]);
+}
+
+template <>
+inline int32_t Deserialize<int32_t>(const char* src, std::size_t offset) {
+  int32_t result = 0;
+  for (unsigned long i = 0; i < sizeof(int32_t); i++) {
+    result |= static_cast<int32_t>(static_cast<unsigned char>(src[offset + i]))
+              << ((sizeof(int32_t) - 1 - i) * kBitsPerByte);
+  }
+  return result;
+}
+
+template <>
+inline int64_t Deserialize<int64_t>(const char* src, std::size_t offset) {
+  int64_t result = 0;
+  for (unsigned long i = 0; i < sizeof(int64_t); i++) {
+    result |= static_cast<int64_t>(static_cast<unsigned char>(src[offset + i]))
+              << ((sizeof(int64_t) - 1 - i) * kBitsPerByte);
+  }
+  return result;
+}
+
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/test_utils.cc b/src/rocksdb/utilities/cassandra/test_utils.cc
new file mode 100644
index 000000000..ec6e5752d
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/test_utils.cc
@@ -0,0 +1,69 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "test_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+const char kData[] = {'d', 'a', 't', 'a'};
+const char kExpiringData[] = {'e', 'd', 'a', 't', 'a'};
+const int32_t kTtl = 86400;
+const int8_t kColumn = 0;
+const int8_t kTombstone = 1;
+const int8_t kExpiringColumn = 2;
+
+std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask, int8_t index,
+                                             int64_t timestamp) {
+  if ((mask & ColumnTypeMask::DELETION_MASK) != 0) {
+    return std::shared_ptr<Tombstone>(
+        new Tombstone(mask, index, ToSeconds(timestamp), timestamp));
+  } else if ((mask & ColumnTypeMask::EXPIRATION_MASK) != 0) {
+    return std::shared_ptr<ExpiringColumn>(new ExpiringColumn(
+        mask, index, timestamp, sizeof(kExpiringData), kExpiringData, kTtl));
+  } else {
+    return std::shared_ptr<Column>(
+        new Column(mask, index, timestamp, sizeof(kData), kData));
+  }
+}
+
+std::tuple<int8_t, int8_t, int64_t> CreateTestColumnSpec(int8_t mask,
+                                                         int8_t index,
+                                                         int64_t timestamp) {
+  return std::make_tuple(mask, index, timestamp);
+}
+
+RowValue CreateTestRowValue(
+    std::vector<std::tuple<int8_t, int8_t, int64_t>> column_specs) {
+  std::vector<std::shared_ptr<ColumnBase>> columns;
+  int64_t last_modified_time = 0;
+  for (auto spec : column_specs) {
+    auto c = CreateTestColumn(std::get<0>(spec), std::get<1>(spec),
+                              std::get<2>(spec));
+    last_modified_time = std::max(last_modified_time, c->Timestamp());
+    columns.push_back(std::move(c));
+  }
+  return RowValue(std::move(columns), last_modified_time);
+}
+
+RowValue CreateRowTombstone(int64_t timestamp) {
+  return RowValue(ToSeconds(timestamp), timestamp);
+}
+
+void VerifyRowValueColumns(
+    const std::vector<std::shared_ptr<ColumnBase>> &columns,
+    std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
+    int64_t expected_timestamp) {
+  EXPECT_EQ(expected_timestamp, columns[index_of_vector]->Timestamp());
+  EXPECT_EQ(expected_mask, columns[index_of_vector]->Mask());
+  EXPECT_EQ(expected_index, columns[index_of_vector]->Index());
+}
+
+int64_t ToMicroSeconds(int64_t seconds) { return seconds * (int64_t)1000000; }
+
+int32_t ToSeconds(int64_t microseconds) {
+  return (int32_t)(microseconds / (int64_t)1000000);
+}
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/cassandra/test_utils.h b/src/rocksdb/utilities/cassandra/test_utils.h
new file mode 100644
index 000000000..be23f7076
--- /dev/null
+++ b/src/rocksdb/utilities/cassandra/test_utils.h
@@ -0,0 +1,42 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <memory>
+
+#include "test_util/testharness.h"
+#include "utilities/cassandra/format.h"
+#include "utilities/cassandra/serialize.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace cassandra {
+extern const char kData[];
+extern const char kExpiringData[];
+extern const int32_t kTtl;
+extern const int8_t kColumn;
+extern const int8_t kTombstone;
+extern const int8_t kExpiringColumn;
+
+std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask, int8_t index,
+                                             int64_t timestamp);
+
+std::tuple<int8_t, int8_t, int64_t> CreateTestColumnSpec(int8_t mask,
+                                                         int8_t index,
+                                                         int64_t timestamp);
+
+RowValue CreateTestRowValue(
+    std::vector<std::tuple<int8_t, int8_t, int64_t>> column_specs);
+
+RowValue CreateRowTombstone(int64_t timestamp);
+
+void VerifyRowValueColumns(
+    const std::vector<std::shared_ptr<ColumnBase>> &columns,
+    std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
+    int64_t expected_timestamp);
+
+int64_t ToMicroSeconds(int64_t seconds);
+int32_t ToSeconds(int64_t microseconds);
+}  // namespace cassandra
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_impl.cc b/src/rocksdb/utilities/checkpoint/checkpoint_impl.cc
new file mode 100644
index 000000000..44ce70b1b
--- /dev/null
+++ b/src/rocksdb/utilities/checkpoint/checkpoint_impl.cc
@@ -0,0 +1,469 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/checkpoint/checkpoint_impl.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <tuple>
+#include <unordered_set>
+#include <vector>
+
+#include "db/wal_manager.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/options.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/file_checksum_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) {
+  *checkpoint_ptr = new CheckpointImpl(db);
+  return Status::OK();
+}
+
+Status Checkpoint::CreateCheckpoint(const std::string& /*checkpoint_dir*/,
+                                    uint64_t /*log_size_for_flush*/,
+                                    uint64_t* /*sequence_number_ptr*/) {
+  return Status::NotSupported("");
+}
+
+void CheckpointImpl::CleanStagingDirectory(const std::string& full_private_path,
+                                           Logger* info_log) {
+  std::vector<std::string> subchildren;
+  Status s = db_->GetEnv()->FileExists(full_private_path);
+  if (s.IsNotFound()) {
+    return;
+  }
+  ROCKS_LOG_INFO(info_log, "File exists %s -- %s", full_private_path.c_str(),
+                 s.ToString().c_str());
+  s = db_->GetEnv()->GetChildren(full_private_path, &subchildren);
+  if (s.ok()) {
+    for (auto& subchild : subchildren) {
+      std::string subchild_path = full_private_path + "/" + subchild;
+      s = db_->GetEnv()->DeleteFile(subchild_path);
+      ROCKS_LOG_INFO(info_log, "Delete file %s -- %s", subchild_path.c_str(),
+                     s.ToString().c_str());
+    }
+  }
+  // finally delete the private dir
+  s = db_->GetEnv()->DeleteDir(full_private_path);
+  ROCKS_LOG_INFO(info_log, "Delete dir %s -- %s", full_private_path.c_str(),
+                 s.ToString().c_str());
+}
+
+Status Checkpoint::ExportColumnFamily(
+    ColumnFamilyHandle* /*handle*/, const std::string& /*export_dir*/,
+    ExportImportFilesMetaData** /*metadata*/) {
+  return Status::NotSupported("");
+}
+
+// Builds an openable snapshot of RocksDB
+Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
+                                        uint64_t log_size_for_flush,
+                                        uint64_t* sequence_number_ptr) {
+  DBOptions db_options = db_->GetDBOptions();
+
+  Status s = db_->GetEnv()->FileExists(checkpoint_dir);
+  if (s.ok()) {
+    return Status::InvalidArgument("Directory exists");
+  } else if (!s.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+
+  ROCKS_LOG_INFO(
+      db_options.info_log,
+      "Started the snapshot process -- creating snapshot in directory %s",
+      checkpoint_dir.c_str());
+
+  size_t final_nonslash_idx = checkpoint_dir.find_last_not_of('/');
+  if (final_nonslash_idx == std::string::npos) {
+    // npos means it's only slashes or empty. Non-empty means it's the root
+    // directory, but it shouldn't be because we verified above the directory
+    // doesn't exist.
+    assert(checkpoint_dir.empty());
+    return Status::InvalidArgument("invalid checkpoint directory name");
+  }
+
+  std::string full_private_path =
+      checkpoint_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
+  ROCKS_LOG_INFO(db_options.info_log,
+                 "Snapshot process -- using temporary directory %s",
+                 full_private_path.c_str());
+  CleanStagingDirectory(full_private_path, db_options.info_log.get());
+  // create snapshot directory
+  s = db_->GetEnv()->CreateDir(full_private_path);
+  uint64_t sequence_number = 0;
+  if (s.ok()) {
+    // enable file deletions
+    s = db_->DisableFileDeletions();
+    const bool disabled_file_deletions = s.ok();
+
+    if (s.ok() || s.IsNotSupported()) {
+      s = CreateCustomCheckpoint(
+          [&](const std::string& src_dirname, const std::string& fname,
+              FileType) {
+            ROCKS_LOG_INFO(db_options.info_log, "Hard Linking %s",
+                           fname.c_str());
+            return db_->GetFileSystem()->LinkFile(
+                src_dirname + "/" + fname, full_private_path + "/" + fname,
+                IOOptions(), nullptr);
+          } /* link_file_cb */,
+          [&](const std::string& src_dirname, const std::string& fname,
+              uint64_t size_limit_bytes, FileType,
+              const std::string& /* checksum_func_name */,
+              const std::string& /* checksum_val */,
+              const Temperature temperature) {
+            ROCKS_LOG_INFO(db_options.info_log, "Copying %s", fname.c_str());
+            return CopyFile(db_->GetFileSystem(), src_dirname + "/" + fname,
+                            full_private_path + "/" + fname, size_limit_bytes,
+                            db_options.use_fsync, nullptr, temperature);
+          } /* copy_file_cb */,
+          [&](const std::string& fname, const std::string& contents, FileType) {
+            ROCKS_LOG_INFO(db_options.info_log, "Creating %s", fname.c_str());
+            return CreateFile(db_->GetFileSystem(),
+                              full_private_path + "/" + fname, contents,
+                              db_options.use_fsync);
+          } /* create_file_cb */,
+          &sequence_number, log_size_for_flush);
+
+      // we copied all the files, enable file deletions
+      if (disabled_file_deletions) {
+        Status ss = db_->EnableFileDeletions(false);
+        assert(ss.ok());
+        ss.PermitUncheckedError();
+      }
+    }
+  }
+
+  if (s.ok()) {
+    // move tmp private backup to real snapshot directory
+    s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir);
+  }
+  if (s.ok()) {
+    std::unique_ptr<FSDirectory> checkpoint_directory;
+    s = db_->GetFileSystem()->NewDirectory(checkpoint_dir, IOOptions(),
+                                           &checkpoint_directory, nullptr);
+    if (s.ok() && checkpoint_directory != nullptr) {
+      s = checkpoint_directory->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed));
+    }
+  }
+
+  if (s.ok()) {
+    if (sequence_number_ptr != nullptr) {
+      *sequence_number_ptr = sequence_number;
+    }
+    // here we know that we succeeded and installed the new snapshot
+    ROCKS_LOG_INFO(db_options.info_log, "Snapshot DONE. All is good");
+    ROCKS_LOG_INFO(db_options.info_log, "Snapshot sequence number: %" PRIu64,
+                   sequence_number);
+  } else {
+    // clean all the files we might have created
+    ROCKS_LOG_INFO(db_options.info_log, "Snapshot failed -- %s",
+                   s.ToString().c_str());
+    CleanStagingDirectory(full_private_path, db_options.info_log.get());
+  }
+  return s;
+}
+
+Status CheckpointImpl::CreateCustomCheckpoint(
+    std::function<Status(const std::string& src_dirname,
+                         const std::string& src_fname, FileType type)>
+        link_file_cb,
+    std::function<
+        Status(const std::string& src_dirname, const std::string& src_fname,
+               uint64_t size_limit_bytes, FileType type,
+               const std::string& checksum_func_name,
+               const std::string& checksum_val, const Temperature temperature)>
+        copy_file_cb,
+    std::function<Status(const std::string& fname, const std::string& contents,
+                         FileType type)>
+        create_file_cb,
+    uint64_t* sequence_number, uint64_t log_size_for_flush,
+    bool get_live_table_checksum) {
+  *sequence_number = db_->GetLatestSequenceNumber();
+
+  LiveFilesStorageInfoOptions opts;
+  opts.include_checksum_info = get_live_table_checksum;
+  opts.wal_size_for_flush = log_size_for_flush;
+
+  std::vector<LiveFileStorageInfo> infos;
+  {
+    Status s = db_->GetLiveFilesStorageInfo(opts, &infos);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Verify that everything except WAL files are in same directory
+  // (db_paths / cf_paths not supported)
+  std::unordered_set<std::string> dirs;
+  for (auto& info : infos) {
+    if (info.file_type != kWalFile) {
+      dirs.insert(info.directory);
+    }
+  }
+  if (dirs.size() > 1) {
+    return Status::NotSupported(
+        "db_paths / cf_paths not supported for Checkpoint nor BackupEngine");
+  }
+
+  bool same_fs = true;
+
+  for (auto& info : infos) {
+    Status s;
+    if (!info.replacement_contents.empty()) {
+      // Currently should only be used for CURRENT file.
+      assert(info.file_type == kCurrentFile);
+
+      if (info.size != info.replacement_contents.size()) {
+        s = Status::Corruption("Inconsistent size metadata for " +
+                               info.relative_filename);
+      } else {
+        s = create_file_cb(info.relative_filename, info.replacement_contents,
+                           info.file_type);
+      }
+    } else {
+      if (same_fs && !info.trim_to_size) {
+        s = link_file_cb(info.directory, info.relative_filename,
+                         info.file_type);
+        if (s.IsNotSupported()) {
+          same_fs = false;
+          s = Status::OK();
+        }
+        s.MustCheck();
+      }
+      if (!same_fs || info.trim_to_size) {
+        assert(info.file_checksum_func_name.empty() ==
+               !opts.include_checksum_info);
+        // no assertion on file_checksum because empty is used for both "not
+        // set" and "unknown"
+        if (opts.include_checksum_info) {
+          s = copy_file_cb(info.directory, info.relative_filename, info.size,
+                           info.file_type, info.file_checksum_func_name,
+                           info.file_checksum, info.temperature);
+        } else {
+          s = copy_file_cb(info.directory, info.relative_filename, info.size,
+                           info.file_type, kUnknownFileChecksumFuncName,
+                           kUnknownFileChecksum, info.temperature);
+        }
+      }
+    }
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+// Exports all live SST files of a specified Column Family onto export_dir,
+// returning SST files information in metadata.
+Status CheckpointImpl::ExportColumnFamily(
+    ColumnFamilyHandle* handle, const std::string& export_dir,
+    ExportImportFilesMetaData** metadata) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
+  const auto cf_name = cfh->GetName();
+  const auto db_options = db_->GetDBOptions();
+
+  assert(metadata != nullptr);
+  assert(*metadata == nullptr);
+  auto s = db_->GetEnv()->FileExists(export_dir);
+  if (s.ok()) {
+    return Status::InvalidArgument("Specified export_dir exists");
+  } else if (!s.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+
+  const auto final_nonslash_idx = export_dir.find_last_not_of('/');
+  if (final_nonslash_idx == std::string::npos) {
+    return Status::InvalidArgument("Specified export_dir invalid");
+  }
+  ROCKS_LOG_INFO(db_options.info_log,
+                 "[%s] export column family onto export directory %s",
+                 cf_name.c_str(), export_dir.c_str());
+
+  // Create a temporary export directory.
+  const auto tmp_export_dir =
+      export_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
+  s = db_->GetEnv()->CreateDir(tmp_export_dir);
+
+  if (s.ok()) {
+    s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle);
+  }
+
+  ColumnFamilyMetaData db_metadata;
+  if (s.ok()) {
+    // Export live sst files with file deletions disabled.
+    s = db_->DisableFileDeletions();
+    if (s.ok()) {
+      db_->GetColumnFamilyMetaData(handle, &db_metadata);
+
+      s = ExportFilesInMetaData(
+          db_options, db_metadata,
+          [&](const std::string& src_dirname, const std::string& fname) {
+            ROCKS_LOG_INFO(db_options.info_log, "[%s] HardLinking %s",
+                           cf_name.c_str(), fname.c_str());
+            return db_->GetEnv()->LinkFile(src_dirname + fname,
+                                           tmp_export_dir + fname);
+          } /*link_file_cb*/,
+          [&](const std::string& src_dirname, const std::string& fname) {
+            ROCKS_LOG_INFO(db_options.info_log, "[%s] Copying %s",
+                           cf_name.c_str(), fname.c_str());
+            return CopyFile(db_->GetFileSystem(), src_dirname + fname,
+                            tmp_export_dir + fname, 0, db_options.use_fsync,
+                            nullptr, Temperature::kUnknown);
+          } /*copy_file_cb*/);
+
+      const auto enable_status = db_->EnableFileDeletions(false /*force*/);
+      if (s.ok()) {
+        s = enable_status;
+      }
+    }
+  }
+
+  auto moved_to_user_specified_dir = false;
+  if (s.ok()) {
+    // Move temporary export directory to the actual export directory.
+    s = db_->GetEnv()->RenameFile(tmp_export_dir, export_dir);
+  }
+
+  if (s.ok()) {
+    // Fsync export directory.
+    moved_to_user_specified_dir = true;
+    std::unique_ptr<FSDirectory> dir_ptr;
+    s = db_->GetFileSystem()->NewDirectory(export_dir, IOOptions(), &dir_ptr,
+                                           nullptr);
+    if (s.ok()) {
+      assert(dir_ptr != nullptr);
+      s = dir_ptr->FsyncWithDirOptions(
+          IOOptions(), nullptr,
+          DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed));
+    }
+  }
+
+  if (s.ok()) {
+    // Export of files succeeded. Fill in the metadata information.
+    auto result_metadata = new ExportImportFilesMetaData();
+    result_metadata->db_comparator_name = handle->GetComparator()->Name();
+    for (const auto& level_metadata : db_metadata.levels) {
+      for (const auto& file_metadata : level_metadata.files) {
+        LiveFileMetaData live_file_metadata;
+        live_file_metadata.size = file_metadata.size;
+        live_file_metadata.name = std::move(file_metadata.name);
+        live_file_metadata.file_number = file_metadata.file_number;
+        live_file_metadata.db_path = export_dir;
+        live_file_metadata.smallest_seqno = file_metadata.smallest_seqno;
+        live_file_metadata.largest_seqno = file_metadata.largest_seqno;
+        live_file_metadata.smallestkey = std::move(file_metadata.smallestkey);
+        live_file_metadata.largestkey = std::move(file_metadata.largestkey);
+        live_file_metadata.oldest_blob_file_number =
+            file_metadata.oldest_blob_file_number;
+        live_file_metadata.level = level_metadata.level;
+        result_metadata->files.push_back(live_file_metadata);
+      }
+      *metadata = result_metadata;
+    }
+    ROCKS_LOG_INFO(db_options.info_log, "[%s] Export succeeded.",
+                   cf_name.c_str());
+  } else {
+    // Failure: Clean up all the files/directories created.
+    ROCKS_LOG_INFO(db_options.info_log, "[%s] Export failed. %s",
+                   cf_name.c_str(), s.ToString().c_str());
+    std::vector<std::string> subchildren;
+    const auto cleanup_dir =
+        moved_to_user_specified_dir ? export_dir : tmp_export_dir;
+    db_->GetEnv()->GetChildren(cleanup_dir, &subchildren);
+    for (const auto& subchild : subchildren) {
+      const auto subchild_path = cleanup_dir + "/" + subchild;
+      const auto status = db_->GetEnv()->DeleteFile(subchild_path);
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup file %s: %s",
+                       subchild_path.c_str(), status.ToString().c_str());
+      }
+    }
+    const auto status = db_->GetEnv()->DeleteDir(cleanup_dir);
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup dir %s: %s",
+                     cleanup_dir.c_str(), status.ToString().c_str());
+    }
+  }
+  return s;
+}
+
+Status CheckpointImpl::ExportFilesInMetaData(
+    const DBOptions& db_options, const ColumnFamilyMetaData& metadata,
+    std::function<Status(const std::string& src_dirname,
+                         const std::string& src_fname)>
+        link_file_cb,
+    std::function<Status(const std::string& src_dirname,
+                         const std::string& src_fname)>
+        copy_file_cb) {
+  Status s;
+  auto hardlink_file = true;
+
+  // Copy/hard link files in metadata.
+  size_t num_files = 0;
+  for (const auto& level_metadata : metadata.levels) {
+    for (const auto& file_metadata : level_metadata.files) {
+      uint64_t number;
+      FileType type;
+      const auto ok = ParseFileName(file_metadata.name, &number, &type);
+      if (!ok) {
+        s = Status::Corruption("Could not parse file name");
+        break;
+      }
+
+      // We should only get sst files here.
+      assert(type == kTableFile);
+      assert(file_metadata.size > 0 && file_metadata.name[0] == '/');
+      const auto src_fname = file_metadata.name;
+      ++num_files;
+
+      if (hardlink_file) {
+        s = link_file_cb(db_->GetName(), src_fname);
+        if (num_files == 1 && s.IsNotSupported()) {
+          // Fallback to copy if link failed due to cross-device directories.
+          hardlink_file = false;
+          s = Status::OK();
+        }
+      }
+      if (!hardlink_file) {
+        s = copy_file_cb(db_->GetName(), src_fname);
+      }
+      if (!s.ok()) {
+        break;
+      }
+    }
+  }
+  ROCKS_LOG_INFO(db_options.info_log, "Number of table files %" ROCKSDB_PRIszt,
+                 num_files);
+
+  return s;
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_impl.h b/src/rocksdb/utilities/checkpoint/checkpoint_impl.h
new file mode 100644
index 000000000..2947330cc
--- /dev/null
+++ b/src/rocksdb/utilities/checkpoint/checkpoint_impl.h
@@ -0,0 +1,66 @@
+//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "file/filename.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/checkpoint.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CheckpointImpl : public Checkpoint {
+ public:
+  explicit CheckpointImpl(DB* db) : db_(db) {}
+
+  Status CreateCheckpoint(const std::string& checkpoint_dir,
+                          uint64_t log_size_for_flush,
+                          uint64_t* sequence_number_ptr) override;
+
+  Status ExportColumnFamily(ColumnFamilyHandle* handle,
+                            const std::string& export_dir,
+                            ExportImportFilesMetaData** metadata) override;
+
+  // Checkpoint logic can be customized by providing callbacks for link, copy,
+  // or create.
+  Status CreateCustomCheckpoint(
+      std::function<Status(const std::string& src_dirname,
+                           const std::string& fname, FileType type)>
+          link_file_cb,
+      std::function<Status(const std::string& src_dirname,
+                           const std::string& fname, uint64_t size_limit_bytes,
+                           FileType type, const std::string& checksum_func_name,
+                           const std::string& checksum_val,
+                           const Temperature src_temperature)>
+          copy_file_cb,
+      std::function<Status(const std::string& fname,
+                           const std::string& contents, FileType type)>
+          create_file_cb,
+      uint64_t* sequence_number, uint64_t log_size_for_flush,
+      bool get_live_table_checksum = false);
+
+ private:
+  void CleanStagingDirectory(const std::string& path, Logger* info_log);
+
+  // Export logic customization by providing callbacks for link or copy.
+  Status ExportFilesInMetaData(
+      const DBOptions& db_options, const ColumnFamilyMetaData& metadata,
+      std::function<Status(const std::string& src_dirname,
+                           const std::string& fname)>
+          link_file_cb,
+      std::function<Status(const std::string& src_dirname,
+                           const std::string& fname)>
+          copy_file_cb);
+
+ private:
+  DB* db_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_test.cc b/src/rocksdb/utilities/checkpoint/checkpoint_test.cc
new file mode 100644
index 000000000..3da753d5f
--- /dev/null
+++ b/src/rocksdb/utilities/checkpoint/checkpoint_test.cc
@@ -0,0 +1,974 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Syncpoint prevents us building and running tests in release
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/checkpoint.h"
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <iostream>
+#include <thread>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+class CheckpointTest : public testing::Test {
+ protected:
+  // Sequence of option configurations to try
+  enum OptionConfig {
+    kDefault = 0,
+  };
+  int option_config_;
+
+ public:
+  std::string dbname_;
+  std::string alternative_wal_dir_;
+  Env* env_;
+  DB* db_;
+  Options last_options_;
+  std::vector<ColumnFamilyHandle*> handles_;
+  std::string snapshot_name_;
+  std::string export_path_;
+  ColumnFamilyHandle* cfh_reverse_comp_;
+  ExportImportFilesMetaData* metadata_;
+
+  CheckpointTest() : env_(Env::Default()) {
+    env_->SetBackgroundThreads(1, Env::LOW);
+    env_->SetBackgroundThreads(1, Env::HIGH);
+    dbname_ = test::PerThreadDBPath(env_, "checkpoint_test");
+    alternative_wal_dir_ = dbname_ + "/wal";
+    auto options = CurrentOptions();
+    auto delete_options = options;
+    delete_options.wal_dir = alternative_wal_dir_;
+    EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy it for not alternative WAL dir is used.
+    EXPECT_OK(DestroyDB(dbname_, options));
+    db_ = nullptr;
+    snapshot_name_ = test::PerThreadDBPath(env_, "snapshot");
+    std::string snapshot_tmp_name = snapshot_name_ + ".tmp";
+    EXPECT_OK(DestroyDB(snapshot_name_, options));
+    test::DeleteDir(env_, snapshot_name_);
+    EXPECT_OK(DestroyDB(snapshot_tmp_name, options));
+    test::DeleteDir(env_, snapshot_tmp_name);
+    Reopen(options);
+    export_path_ = test::PerThreadDBPath("/export");
+    DestroyDir(env_, export_path_).PermitUncheckedError();
+    cfh_reverse_comp_ = nullptr;
+    metadata_ = nullptr;
+  }
+
+  ~CheckpointTest() override {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    if (cfh_reverse_comp_) {
+      EXPECT_OK(db_->DestroyColumnFamilyHandle(cfh_reverse_comp_));
+      cfh_reverse_comp_ = nullptr;
+    }
+    if (metadata_) {
+      delete metadata_;
+      metadata_ = nullptr;
+    }
+    Close();
+    Options options;
+    options.db_paths.emplace_back(dbname_, 0);
+    options.db_paths.emplace_back(dbname_ + "_2", 0);
+    options.db_paths.emplace_back(dbname_ + "_3", 0);
+    options.db_paths.emplace_back(dbname_ + "_4", 0);
+    EXPECT_OK(DestroyDB(dbname_, options));
+    EXPECT_OK(DestroyDB(snapshot_name_, options));
+    DestroyDir(env_, export_path_).PermitUncheckedError();
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    options.env = env_;
+    options.create_if_missing = true;
+    return options;
+  }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const Options& options) {
+    ColumnFamilyOptions cf_opts(options);
+    size_t cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+    }
+  }
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options& options) {
+    CreateColumnFamilies(cfs, options);
+    std::vector<std::string> cfs_plus_default = cfs;
+    cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+    ReopenWithColumnFamilies(cfs_plus_default, options);
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const std::vector<Options>& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const std::vector<Options>& options) {
+    Close();
+    EXPECT_EQ(cfs.size(), options.size());
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+    }
+    DBOptions db_opts = DBOptions(options[0]);
+    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options& options) {
+    Close();
+    std::vector<Options> v_opts(cfs.size(), options);
+    return TryReopenWithColumnFamilies(cfs, v_opts);
+  }
+
+  void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); }
+
+  void CompactAll() {
+    for (auto h : handles_) {
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), h, nullptr, nullptr));
+    }
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(const Options& options) {
+    // Destroy using last options
+    Destroy(last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(const Options& options) {
+    Close();
+    ASSERT_OK(DestroyDB(dbname_, options));
+  }
+
+  Status ReadOnlyReopen(const Options& options) {
+    return DB::OpenForReadOnly(options, dbname_, &db_);
+  }
+
+  Status ReadOnlyReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                          const Options& options) {
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (const auto& cf : cfs) {
+      column_families.emplace_back(cf, options);
+    }
+    return DB::OpenForReadOnly(options, dbname_, column_families, &handles_,
+                               &db_);
+  }
+
+  Status TryReopen(const Options& options) {
+    Close();
+    last_options_ = options;
+    return DB::Open(options, dbname_, &db_);
+  }
+
+  Status Flush(int cf = 0) {
+    if (cf == 0) {
+      return db_->Flush(FlushOptions());
+    } else {
+      return db_->Flush(FlushOptions(), handles_[cf]);
+    }
+  }
+
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, k, v);
+  }
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+
+  Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+  Status Delete(int cf, const std::string& k) {
+    return db_->Delete(WriteOptions(), handles_[cf], k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  std::string Get(int cf, const std::string& k,
+                  const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+};
+
+TEST_F(CheckpointTest, GetSnapshotLink) {
+  for (uint64_t log_size_for_flush : {0, 1000000}) {
+    Options options;
+    DB* snapshotDB;
+    ReadOptions roptions;
+    std::string result;
+    Checkpoint* checkpoint;
+
+    options = CurrentOptions();
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options));
+
+    // Create a database
+    options.create_if_missing = true;
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    std::string key = std::string("foo");
+    ASSERT_OK(Put(key, "v1"));
+    // Take a snapshot
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, log_size_for_flush));
+    ASSERT_OK(Put(key, "v2"));
+    ASSERT_EQ("v2", Get(key));
+    ASSERT_OK(Flush());
+    ASSERT_EQ("v2", Get(key));
+    // Open snapshot and verify contents while DB is running
+    options.create_if_missing = false;
+    ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB));
+    ASSERT_OK(snapshotDB->Get(roptions, key, &result));
+    ASSERT_EQ("v1", result);
+    delete snapshotDB;
+    snapshotDB = nullptr;
+    delete db_;
+    db_ = nullptr;
+
+    // Destroy original DB
+    ASSERT_OK(DestroyDB(dbname_, options));
+
+    // Open snapshot and verify contents
+    options.create_if_missing = false;
+    dbname_ = snapshot_name_;
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    ASSERT_EQ("v1", Get(key));
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options));
+    delete checkpoint;
+
+    // Restore DB name
+    dbname_ = test::PerThreadDBPath(env_, "db_test");
+  }
+}
+
+TEST_F(CheckpointTest, CheckpointWithBlob) {
+  // Create a database with a blob file
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr char key[] = "key";
+  constexpr char blob[] = "blob";
+
+  ASSERT_OK(Put(key, blob));
+  ASSERT_OK(Flush());
+
+  // Create a checkpoint
+  Checkpoint* checkpoint = nullptr;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+
+  std::unique_ptr<Checkpoint> checkpoint_guard(checkpoint);
+
+  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+
+  // Make sure it contains the blob file
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(snapshot_name_, &files));
+
+  bool blob_file_found = false;
+  for (const auto& file : files) {
+    uint64_t number = 0;
+    FileType type = kWalFile;
+
+    if (ParseFileName(file, &number, &type) && type == kBlobFile) {
+      blob_file_found = true;
+      break;
+    }
+  }
+
+  ASSERT_TRUE(blob_file_found);
+
+  // Make sure the checkpoint can be opened and the blob value read
+  options.create_if_missing = false;
+  DB* checkpoint_db = nullptr;
+  ASSERT_OK(DB::Open(options, snapshot_name_, &checkpoint_db));
+
+  std::unique_ptr<DB> checkpoint_db_guard(checkpoint_db);
+
+  PinnableSlice value;
+  ASSERT_OK(checkpoint_db->Get(
+      ReadOptions(), checkpoint_db->DefaultColumnFamily(), key, &value));
+
+  ASSERT_EQ(value, blob);
+}
+
+TEST_F(CheckpointTest, ExportColumnFamilyWithLinks) {
+  // Create a database
+  auto options = CurrentOptions();
+  options.create_if_missing = true;
+  CreateAndReopenWithCF({}, options);
+
+  // Helper to verify the number of files in metadata and export dir
+  auto verify_files_exported = [&](const ExportImportFilesMetaData& metadata,
+                                   int num_files_expected) {
+    ASSERT_EQ(metadata.files.size(), num_files_expected);
+    std::vector<std::string> subchildren;
+    ASSERT_OK(env_->GetChildren(export_path_, &subchildren));
+    ASSERT_EQ(subchildren.size(), num_files_expected);
+  };
+
+  // Test DefaultColumnFamily
+  {
+    const auto key = std::string("foo");
+    ASSERT_OK(Put(key, "v1"));
+
+    Checkpoint* checkpoint;
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+
+    // Export the Tables and verify
+    ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
+                                             export_path_, &metadata_));
+    verify_files_exported(*metadata_, 1);
+    ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name());
+    ASSERT_OK(DestroyDir(env_, export_path_));
+    delete metadata_;
+    metadata_ = nullptr;
+
+    // Check again after compaction
+    CompactAll();
+    ASSERT_OK(Put(key, "v2"));
+    ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
+                                             export_path_, &metadata_));
+    verify_files_exported(*metadata_, 2);
+    ASSERT_EQ(metadata_->db_comparator_name, options.comparator->Name());
+    ASSERT_OK(DestroyDir(env_, export_path_));
+    delete metadata_;
+    metadata_ = nullptr;
+    delete checkpoint;
+  }
+
+  // Test non default column family with non default comparator
+  {
+    auto cf_options = CurrentOptions();
+    cf_options.comparator = ReverseBytewiseComparator();
+    ASSERT_OK(db_->CreateColumnFamily(cf_options, "yoyo", &cfh_reverse_comp_));
+
+    const auto key = std::string("foo");
+    ASSERT_OK(db_->Put(WriteOptions(), cfh_reverse_comp_, key, "v1"));
+
+    Checkpoint* checkpoint;
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+
+    // Export the Tables and verify
+    ASSERT_OK(checkpoint->ExportColumnFamily(cfh_reverse_comp_, export_path_,
+                                             &metadata_));
+    verify_files_exported(*metadata_, 1);
+    ASSERT_EQ(metadata_->db_comparator_name,
+              ReverseBytewiseComparator()->Name());
+    delete checkpoint;
+  }
+}
+
+TEST_F(CheckpointTest, ExportColumnFamilyNegativeTest) {
+  // Create a database
+  auto options = CurrentOptions();
+  options.create_if_missing = true;
+  CreateAndReopenWithCF({}, options);
+
+  const auto key = std::string("foo");
+  ASSERT_OK(Put(key, "v1"));
+
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+
+  // Export onto existing directory
+  ASSERT_OK(env_->CreateDirIfMissing(export_path_));
+  ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
+                                           export_path_, &metadata_),
+            Status::InvalidArgument("Specified export_dir exists"));
+  ASSERT_OK(DestroyDir(env_, export_path_));
+
+  // Export with invalid directory specification
+  export_path_ = "";
+  ASSERT_EQ(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
+                                           export_path_, &metadata_),
+            Status::InvalidArgument("Specified export_dir invalid"));
+  delete checkpoint;
+}
+
+TEST_F(CheckpointTest, CheckpointCF) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"CheckpointTest::CheckpointCF:2", "DBImpl::GetLiveFiles:2"},
+       {"DBImpl::GetLiveFiles:1", "CheckpointTest::CheckpointCF:1"}});
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(0, "Default", "Default"));
+  ASSERT_OK(Put(1, "one", "one"));
+  ASSERT_OK(Put(2, "two", "two"));
+  ASSERT_OK(Put(3, "three", "three"));
+  ASSERT_OK(Put(4, "four", "four"));
+  ASSERT_OK(Put(5, "five", "five"));
+
+  DB* snapshotDB;
+  ReadOptions roptions;
+  std::string result;
+  std::vector<ColumnFamilyHandle*> cphandles;
+
+  // Take a snapshot
+  ROCKSDB_NAMESPACE::port::Thread t([&]() {
+    Checkpoint* checkpoint;
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+    delete checkpoint;
+  });
+  TEST_SYNC_POINT("CheckpointTest::CheckpointCF:1");
+  ASSERT_OK(Put(0, "Default", "Default1"));
+  ASSERT_OK(Put(1, "one", "eleven"));
+  ASSERT_OK(Put(2, "two", "twelve"));
+  ASSERT_OK(Put(3, "three", "thirteen"));
+  ASSERT_OK(Put(4, "four", "fourteen"));
+  ASSERT_OK(Put(5, "five", "fifteen"));
+  TEST_SYNC_POINT("CheckpointTest::CheckpointCF:2");
+  t.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_OK(Put(1, "one", "twentyone"));
+  ASSERT_OK(Put(2, "two", "twentytwo"));
+  ASSERT_OK(Put(3, "three", "twentythree"));
+  ASSERT_OK(Put(4, "four", "twentyfour"));
+  ASSERT_OK(Put(5, "five", "twentyfive"));
+  ASSERT_OK(Flush());
+
+  // Open snapshot and verify contents while DB is running
+  options.create_if_missing = false;
+  std::vector<std::string> cfs;
+  cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"};
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (size_t i = 0; i < cfs.size(); ++i) {
+    column_families.push_back(ColumnFamilyDescriptor(cfs[i], options));
+  }
+  ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles,
+                     &snapshotDB));
+  ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
+  ASSERT_EQ("Default1", result);
+  ASSERT_OK(snapshotDB->Get(roptions, cphandles[1], "one", &result));
+  ASSERT_EQ("eleven", result);
+  ASSERT_OK(snapshotDB->Get(roptions, cphandles[2], "two", &result));
+  for (auto h : cphandles) {
+    delete h;
+  }
+  cphandles.clear();
+  delete snapshotDB;
+  snapshotDB = nullptr;
+}
+
+TEST_F(CheckpointTest, CheckpointCFNoFlush) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(0, "Default", "Default"));
+  ASSERT_OK(Put(1, "one", "one"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(2, "two", "two"));
+
+  DB* snapshotDB;
+  ReadOptions roptions;
+  std::string result;
+  std::vector<ColumnFamilyHandle*> cphandles;
+
+  // Take a snapshot
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCallFlush:start", [&](void* /*arg*/) {
+        // Flush should never trigger.
+        FAIL();
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, 1000000));
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  delete checkpoint;
+  ASSERT_OK(Put(1, "one", "two"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(2, "two", "twentytwo"));
+  Close();
+  EXPECT_OK(DestroyDB(dbname_, options));
+
+  // Open snapshot and verify contents while DB is running
+  options.create_if_missing = false;
+  std::vector<std::string> cfs;
+  cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"};
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (size_t i = 0; i < cfs.size(); ++i) {
+    column_families.push_back(ColumnFamilyDescriptor(cfs[i], options));
+  }
+  ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles,
+                     &snapshotDB));
+  ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
+  ASSERT_EQ("Default", result);
+  ASSERT_OK(snapshotDB->Get(roptions, cphandles[1], "one", &result));
+  ASSERT_EQ("one", result);
+  ASSERT_OK(snapshotDB->Get(roptions, cphandles[2], "two", &result));
+  ASSERT_EQ("two", result);
+  for (auto h : cphandles) {
+    delete h;
+  }
+  cphandles.clear();
+  delete snapshotDB;
+  snapshotDB = nullptr;
+}
+
+TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
+  Options options = CurrentOptions();
+  options.max_manifest_file_size = 0;  // always rollover manifest for file add
+  Reopen(options);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {// Get past the flush in the checkpoint thread before adding any keys to
+       // the db so the checkpoint thread won't hit the WriteManifest
+       // syncpoints.
+       {"CheckpointImpl::CreateCheckpoint:FlushDone",
+        "CheckpointTest::CurrentFileModifiedWhileCheckpointing:PrePut"},
+       // Roll the manifest during checkpointing right after live files are
+       // snapshotted.
+       {"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
+        "VersionSet::LogAndApply:WriteManifest"},
+       {"VersionSet::LogAndApply:WriteManifestDone",
+        "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread t([&]() {
+    Checkpoint* checkpoint;
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+    delete checkpoint;
+  });
+  TEST_SYNC_POINT(
+      "CheckpointTest::CurrentFileModifiedWhileCheckpointing:PrePut");
+  ASSERT_OK(Put("Default", "Default1"));
+  ASSERT_OK(Flush());
+  t.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  DB* snapshotDB;
+  // Successful Open() implies that CURRENT pointed to the manifest in the
+  // checkpoint.
+  ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB));
+  delete snapshotDB;
+  snapshotDB = nullptr;
+}
+
+TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing2PC) {
+  Close();
+  const std::string dbname = test::PerThreadDBPath("transaction_testdb");
+  ASSERT_OK(DestroyDB(dbname, CurrentOptions()));
+  test::DeleteDir(env_, dbname);
+
+  Options options = CurrentOptions();
+  options.allow_2pc = true;
+  // allow_2pc is implicitly set with tx prepare
+  // options.allow_2pc = true;
+  TransactionDBOptions txn_db_options;
+  TransactionDB* txdb;
+  Status s = TransactionDB::Open(options, txn_db_options, dbname, &txdb);
+  ASSERT_OK(s);
+  ColumnFamilyHandle* cfa;
+  ColumnFamilyHandle* cfb;
+  ColumnFamilyOptions cf_options;
+  ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFA", &cfa));
+
+  WriteOptions write_options;
+  // Insert something into CFB so lots of log files will be kept
+  // before creating the checkpoint.
+  ASSERT_OK(txdb->CreateColumnFamily(cf_options, "CFB", &cfb));
+  ASSERT_OK(txdb->Put(write_options, cfb, "", ""));
+
+  ReadOptions read_options;
+  std::string value;
+  TransactionOptions txn_options;
+  Transaction* txn = txdb->BeginTransaction(write_options, txn_options);
+  s = txn->SetName("xid");
+  ASSERT_OK(s);
+  ASSERT_EQ(txdb->GetTransactionByName("xid"), txn);
+
+  s = txn->Put(Slice("foo"), Slice("bar"));
+  ASSERT_OK(s);
+  s = txn->Put(cfa, Slice("foocfa"), Slice("barcfa"));
+  ASSERT_OK(s);
+  // Writing prepare into middle of first WAL, then flush WALs many times
+  for (int i = 1; i <= 100000; i++) {
+    Transaction* tx = txdb->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(tx->SetName("x"));
+    ASSERT_OK(tx->Put(Slice(std::to_string(i)), Slice("val")));
+    ASSERT_OK(tx->Put(cfa, Slice("aaa"), Slice("111")));
+    ASSERT_OK(tx->Prepare());
+    ASSERT_OK(tx->Commit());
+    if (i % 10000 == 0) {
+      ASSERT_OK(txdb->Flush(FlushOptions()));
+    }
+    if (i == 88888) {
+      ASSERT_OK(txn->Prepare());
+    }
+    delete tx;
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1",
+        "CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit"},
+       {"CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit",
+        "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::port::Thread t([&]() {
+    Checkpoint* checkpoint;
+    ASSERT_OK(Checkpoint::Create(txdb, &checkpoint));
+    ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+    delete checkpoint;
+  });
+  TEST_SYNC_POINT(
+      "CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PreCommit");
+  ASSERT_OK(txn->Commit());
+  delete txn;
+  TEST_SYNC_POINT(
+      "CheckpointTest::CurrentFileModifiedWhileCheckpointing2PC:PostCommit");
+  t.join();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // No more than two logs files should exist.
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(snapshot_name_, &files));
+  int num_log_files = 0;
+  for (auto& file : files) {
+    uint64_t num;
+    FileType type;
+    WalFileType log_type;
+    if (ParseFileName(file, &num, &type, &log_type) && type == kWalFile) {
+      num_log_files++;
+    }
+  }
+  // One flush after preapare + one outstanding file before checkpoint + one log
+  // file generated after checkpoint.
+  ASSERT_LE(num_log_files, 3);
+
+  TransactionDB* snapshotDB;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
+  ASSERT_OK(TransactionDB::Open(options, txn_db_options, snapshot_name_,
+                                column_families, &cf_handles, &snapshotDB));
+  ASSERT_OK(snapshotDB->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "bar");
+  ASSERT_OK(snapshotDB->Get(read_options, cf_handles[1], "foocfa", &value));
+  ASSERT_EQ(value, "barcfa");
+
+  delete cfa;
+  delete cfb;
+  delete cf_handles[0];
+  delete cf_handles[1];
+  delete cf_handles[2];
+  delete snapshotDB;
+  snapshotDB = nullptr;
+  delete txdb;
+}
+
+TEST_F(CheckpointTest, CheckpointInvalidDirectoryName) {
+  for (std::string checkpoint_dir : {"", "/", "////"}) {
+    Checkpoint* checkpoint;
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_TRUE(
+        checkpoint->CreateCheckpoint(checkpoint_dir).IsInvalidArgument());
+    delete checkpoint;
+  }
+}
+
+TEST_F(CheckpointTest, CheckpointWithParallelWrites) {
+  // When run with TSAN, this exposes the data race fixed in
+  // https://github.com/facebook/rocksdb/pull/3603
+  ASSERT_OK(Put("key1", "val1"));
+  port::Thread thread([this]() { ASSERT_OK(Put("key2", "val2")); });
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+  delete checkpoint;
+  thread.join();
+}
+
+TEST_F(CheckpointTest, CheckpointWithUnsyncedDataDropped) {
+  Options options = CurrentOptions();
+  std::unique_ptr<FaultInjectionTestEnv> env(new FaultInjectionTestEnv(env_));
+  options.env = env.get();
+  Reopen(options);
+  ASSERT_OK(Put("key1", "val1"));
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+  delete checkpoint;
+  ASSERT_OK(env->DropUnsyncedFileData());
+
+  // make sure it's openable even though whatever data that wasn't synced got
+  // dropped.
+  options.env = env_;
+  DB* snapshot_db;
+  ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
+  ReadOptions read_opts;
+  std::string get_result;
+  ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result));
+  ASSERT_EQ("val1", get_result);
+  delete snapshot_db;
+  delete db_;
+  db_ = nullptr;
+}
+
+TEST_F(CheckpointTest, CheckpointOptionsFileFailedToPersist) {
+  // Regression test for a bug where checkpoint failed on a DB where persisting
+  // OPTIONS file failed and the DB was opened with
+  // `fail_if_options_file_error == false`.
+  Options options = CurrentOptions();
+  options.fail_if_options_file_error = false;
+  auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+
+  // Setup `FaultInjectionTestFS` and `SyncPoint` callbacks to fail one
+  // operation when inside the OPTIONS file persisting code.
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  fault_fs->SetRandomMetadataWriteError(1 /* one_in */);
+  SyncPoint::GetInstance()->SetCallBack(
+      "PersistRocksDBOptions:start", [fault_fs](void* /* arg */) {
+        fault_fs->EnableMetadataWriteErrorInjection();
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "FaultInjectionTestFS::InjectMetadataWriteError:Injected",
+      [fault_fs](void* /* arg */) {
+        fault_fs->DisableMetadataWriteErrorInjection();
+      });
+  options.env = fault_fs_env.get();
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+  ASSERT_OK(Put("key1", "val1"));
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+  delete checkpoint;
+
+  // Make sure it's usable.
+  options.env = env_;
+  DB* snapshot_db;
+  ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
+  ReadOptions read_opts;
+  std::string get_result;
+  ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result));
+  ASSERT_EQ("val1", get_result);
+  delete snapshot_db;
+  delete db_;
+  db_ = nullptr;
+}
+
+TEST_F(CheckpointTest, CheckpointReadOnlyDB) {
+  ASSERT_OK(Put("foo", "foo_value"));
+  ASSERT_OK(Flush());
+  Close();
+  Options options = CurrentOptions();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Checkpoint* checkpoint = nullptr;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+  delete checkpoint;
+  checkpoint = nullptr;
+  Close();
+  DB* snapshot_db = nullptr;
+  ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
+  ReadOptions read_opts;
+  std::string get_result;
+  ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result));
+  ASSERT_EQ("foo_value", get_result);
+  delete snapshot_db;
+}
+
+TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+  for (int i = 0; i != 3; ++i) {
+    ASSERT_OK(Put(i, "foo", "foo_value"));
+    ASSERT_OK(Flush(i));
+  }
+  Close();
+  Status s = ReadOnlyReopenWithColumnFamilies(
+      {kDefaultColumnFamilyName, "pikachu", "eevee"}, options);
+  ASSERT_OK(s);
+  Checkpoint* checkpoint = nullptr;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+  delete checkpoint;
+  checkpoint = nullptr;
+  Close();
+
+  std::vector<ColumnFamilyDescriptor> column_families{
+      {kDefaultColumnFamilyName, options},
+      {"pikachu", options},
+      {"eevee", options}};
+  DB* snapshot_db = nullptr;
+  std::vector<ColumnFamilyHandle*> snapshot_handles;
+  s = DB::Open(options, snapshot_name_, column_families, &snapshot_handles,
+               &snapshot_db);
+  ASSERT_OK(s);
+  ReadOptions read_opts;
+  for (int i = 0; i != 3; ++i) {
+    std::string get_result;
+    s = snapshot_db->Get(read_opts, snapshot_handles[i], "foo", &get_result);
+    ASSERT_OK(s);
+    ASSERT_EQ("foo_value", get_result);
+  }
+
+  for (auto snapshot_h : snapshot_handles) {
+    delete snapshot_h;
+  }
+  snapshot_handles.clear();
+  delete snapshot_db;
+}
+
+TEST_F(CheckpointTest, CheckpointWithDbPath) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_ + "_2", 0);
+  Reopen(options);
+  ASSERT_OK(Put("key1", "val1"));
+  Flush();
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  // Currently not supported
+  ASSERT_TRUE(checkpoint->CreateCheckpoint(snapshot_name_).IsNotSupported());
+  delete checkpoint;
+}
+
+TEST_F(CheckpointTest, PutRaceWithCheckpointTrackedWalSync) {
+  // Repro for a race condition where a user write comes in after the checkpoint
+  // syncs WAL for `track_and_verify_wals_in_manifest` but before the
+  // corresponding MANIFEST update. With the bug, that scenario resulted in an
+  // unopenable DB with error "Corruption: Size mismatch: WAL ...".
+  Options options = CurrentOptions();
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_env.get();
+  options.track_and_verify_wals_in_manifest = true;
+  Reopen(options);
+
+  ASSERT_OK(Put("key1", "val1"));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::SyncWAL:BeforeMarkLogsSynced:1",
+      [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::unique_ptr<Checkpoint> checkpoint;
+  {
+    Checkpoint* checkpoint_ptr;
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint_ptr));
+    checkpoint.reset(checkpoint_ptr);
+  }
+
+  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
+
+  // Ensure callback ran.
+  ASSERT_EQ("val2", Get("key2"));
+
+  Close();
+
+  // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the
+  // DB WAL.
+  fault_env->DropUnsyncedFileData();
+
+  // Before the bug fix, reopening the DB would fail because the MANIFEST's
+  // AddWal entry indicated the WAL should be synced through "key2" -> "val2".
+  Reopen(options);
+
+  // Need to close before `fault_env` goes out of scope.
+  Close();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as Checkpoint is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/compaction_filters.cc b/src/rocksdb/utilities/compaction_filters.cc
new file mode 100644
index 000000000..8763901c3
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters.cc
@@ -0,0 +1,56 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/compaction_filters/layered_compaction_filter_base.h"
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinCompactionFilters(ObjectLibrary& library,
+                                            const std::string& /*arg*/) {
+  library.AddFactory<CompactionFilter>(
+      RemoveEmptyValueCompactionFilter::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<CompactionFilter>* /*guard*/,
+         std::string* /*errmsg*/) {
+        return new RemoveEmptyValueCompactionFilter();
+      });
+  return 1;
+}
+#endif  // ROCKSDB_LITE
+Status CompactionFilter::CreateFromString(const ConfigOptions& config_options,
+                                          const std::string& value,
+                                          const CompactionFilter** result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinCompactionFilters(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  CompactionFilter* filter = const_cast<CompactionFilter*>(*result);
+  Status status = LoadStaticObject<CompactionFilter>(config_options, value,
+                                                     nullptr, &filter);
+  if (status.ok()) {
+    *result = const_cast<CompactionFilter*>(filter);
+  }
+  return status;
+}
+
+Status CompactionFilterFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<CompactionFilterFactory>* result) {
+  // Currently there are no builtin CompactionFilterFactories.
+  // If any are introduced, they need to be registered here.
+  Status status = LoadSharedObject<CompactionFilterFactory>(
+      config_options, value, nullptr, result);
+  return status;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h b/src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h
new file mode 100644
index 000000000..803fa94ae
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h
@@ -0,0 +1,41 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Abstract base class for building layered compaction filter on top of
+// user compaction filter.
+// See BlobIndexCompactionFilter or TtlCompactionFilter for a basic usage.
+class LayeredCompactionFilterBase : public CompactionFilter {
+ public:
+  LayeredCompactionFilterBase(
+      const CompactionFilter* _user_comp_filter,
+      std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory)
+      : user_comp_filter_(_user_comp_filter),
+        user_comp_filter_from_factory_(
+            std::move(_user_comp_filter_from_factory)) {
+    if (!user_comp_filter_) {
+      user_comp_filter_ = user_comp_filter_from_factory_.get();
+    }
+  }
+
+  // Return a pointer to user compaction filter
+  const CompactionFilter* user_comp_filter() const { return user_comp_filter_; }
+
+  const Customizable* Inner() const override { return user_comp_filter_; }
+
+ protected:
+  const CompactionFilter* user_comp_filter_;
+
+ private:
+  std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory_;
+};
+
+}  //  namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
new file mode 100644
index 000000000..b788dbf9b
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool RemoveEmptyValueCompactionFilter::Filter(int /*level*/,
+                                              const Slice& /*key*/,
+                                              const Slice& existing_value,
+                                              std::string* /*new_value*/,
+                                              bool* /*value_changed*/) const {
+  // remove kv pairs that have empty values
+  return existing_value.empty();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h
new file mode 100644
index 000000000..864ad15ff
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RemoveEmptyValueCompactionFilter : public CompactionFilter {
+ public:
+  static const char* kClassName() { return "RemoveEmptyValueCompactionFilter"; }
+
+  const char* Name() const override { return kClassName(); }
+
+  bool Filter(int level, const Slice& key, const Slice& existing_value,
+              std::string* new_value, bool* value_changed) const override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/convenience/info_log_finder.cc b/src/rocksdb/utilities/convenience/info_log_finder.cc
new file mode 100644
index 000000000..fe62fd561
--- /dev/null
+++ b/src/rocksdb/utilities/convenience/info_log_finder.cc
@@ -0,0 +1,26 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "rocksdb/utilities/info_log_finder.h"
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list) {
+  if (!db) {
+    return Status::InvalidArgument("DB pointer is not valid");
+  }
+  std::string parent_path;
+  const Options& options = db->GetOptions();
+  return GetInfoLogFiles(options.env->GetFileSystem(), options.db_log_dir,
+                         db->GetName(), &parent_path, info_log_list);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/counted_fs.cc b/src/rocksdb/utilities/counted_fs.cc
new file mode 100644
index 000000000..e43f3a191
--- /dev/null
+++ b/src/rocksdb/utilities/counted_fs.cc
@@ -0,0 +1,379 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/counted_fs.h"
+
+#include <sstream>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class CountedSequentialFile : public FSSequentialFileOwnerWrapper {
+ private:
+  CountedFileSystem* fs_;
+
+ public:
+  CountedSequentialFile(std::unique_ptr<FSSequentialFile>&& f,
+                        CountedFileSystem* fs)
+      : FSSequentialFileOwnerWrapper(std::move(f)), fs_(fs) {}
+
+  ~CountedSequentialFile() override { fs_->counters()->closes++; }
+
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override {
+    IOStatus rv = target()->Read(n, options, result, scratch, dbg);
+    fs_->counters()->reads.RecordOp(rv, result->size());
+    return rv;
+  }
+
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override {
+    IOStatus rv =
+        target()->PositionedRead(offset, n, options, result, scratch, dbg);
+    fs_->counters()->reads.RecordOp(rv, result->size());
+    return rv;
+  }
+};
+
+class CountedRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ private:
+  CountedFileSystem* fs_;
+
+ public:
+  CountedRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& f,
+                          CountedFileSystem* fs)
+      : FSRandomAccessFileOwnerWrapper(std::move(f)), fs_(fs) {}
+
+  ~CountedRandomAccessFile() override { fs_->counters()->closes++; }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    IOStatus rv = target()->Read(offset, n, options, result, scratch, dbg);
+    fs_->counters()->reads.RecordOp(rv, result->size());
+    return rv;
+  }
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->MultiRead(reqs, num_reqs, options, dbg);
+    for (size_t r = 0; r < num_reqs; r++) {
+      fs_->counters()->reads.RecordOp(reqs[r].status, reqs[r].result.size());
+    }
+    return rv;
+  }
+};
+
+class CountedWritableFile : public FSWritableFileOwnerWrapper {
+ private:
+  CountedFileSystem* fs_;
+
+ public:
+  CountedWritableFile(std::unique_ptr<FSWritableFile>&& f,
+                      CountedFileSystem* fs)
+      : FSWritableFileOwnerWrapper(std::move(f)), fs_(fs) {}
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  IODebugContext* dbg) override {
+    IOStatus rv = target()->Append(data, options, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus Append(const Slice& data, const IOOptions& options,
+                  const DataVerificationInfo& info,
+                  IODebugContext* dbg) override {
+    IOStatus rv = target()->Append(data, options, info, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override {
+    IOStatus rv = target()->PositionedAppend(data, offset, options, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& info,
+                            IODebugContext* dbg) override {
+    IOStatus rv = target()->PositionedAppend(data, offset, options, info, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Close(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->closes++;
+    }
+    return rv;
+  }
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Flush(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->flushes++;
+    }
+    return rv;
+  }
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Sync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->syncs++;
+    }
+    return rv;
+  }
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Fsync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->fsyncs++;
+    }
+    return rv;
+  }
+
+  IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+                     IODebugContext* dbg) override {
+    IOStatus rv = target()->RangeSync(offset, nbytes, options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->syncs++;
+    }
+    return rv;
+  }
+};
+
+class CountedRandomRWFile : public FSRandomRWFileOwnerWrapper {
+ private:
+  mutable CountedFileSystem* fs_;
+
+ public:
+  CountedRandomRWFile(std::unique_ptr<FSRandomRWFile>&& f,
+                      CountedFileSystem* fs)
+      : FSRandomRWFileOwnerWrapper(std::move(f)), fs_(fs) {}
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override {
+    IOStatus rv = target()->Write(offset, data, options, dbg);
+    fs_->counters()->writes.RecordOp(rv, data.size());
+    return rv;
+  }
+
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override {
+    IOStatus rv = target()->Read(offset, n, options, result, scratch, dbg);
+    fs_->counters()->reads.RecordOp(rv, result->size());
+    return rv;
+  }
+
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Flush(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->flushes++;
+    }
+    return rv;
+  }
+
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Sync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->syncs++;
+    }
+    return rv;
+  }
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Fsync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->fsyncs++;
+    }
+    return rv;
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = target()->Close(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->closes++;
+    }
+    return rv;
+  }
+};
+
+class CountedDirectory : public FSDirectoryWrapper {
+ private:
+  mutable CountedFileSystem* fs_;
+  bool closed_ = false;
+
+ public:
+  CountedDirectory(std::unique_ptr<FSDirectory>&& f, CountedFileSystem* fs)
+      : FSDirectoryWrapper(std::move(f)), fs_(fs) {}
+
+  IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = FSDirectoryWrapper::Fsync(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->dsyncs++;
+    }
+    return rv;
+  }
+
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus rv = FSDirectoryWrapper::Close(options, dbg);
+    if (rv.ok()) {
+      fs_->counters()->closes++;
+      fs_->counters()->dir_closes++;
+      closed_ = true;
+    }
+    return rv;
+  }
+
+  IOStatus FsyncWithDirOptions(const IOOptions& options, IODebugContext* dbg,
+                               const DirFsyncOptions& dir_options) override {
+    IOStatus rv =
+        FSDirectoryWrapper::FsyncWithDirOptions(options, dbg, dir_options);
+    if (rv.ok()) {
+      fs_->counters()->dsyncs++;
+    }
+    return rv;
+  }
+
+  ~CountedDirectory() {
+    if (!closed_) {
+      // TODO: fix DB+CF code to use explicit Close, not rely on destructor
+      fs_->counters()->closes++;
+      fs_->counters()->dir_closes++;
+    }
+  }
+};
+}  // anonymous namespace
+
+std::string FileOpCounters::PrintCounters() const {
+  std::stringstream ss;
+  ss << "Num files opened: " << opens.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num files deleted: " << deletes.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num files renamed: " << renames.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Flush(): " << flushes.load(std::memory_order_relaxed) << std::endl;
+  ss << "Num Sync(): " << syncs.load(std::memory_order_relaxed) << std::endl;
+  ss << "Num Fsync(): " << fsyncs.load(std::memory_order_relaxed) << std::endl;
+  ss << "Num Dir Fsync(): " << dsyncs.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Close(): " << closes.load(std::memory_order_relaxed) << std::endl;
+  ss << "Num Dir Open(): " << dir_opens.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Dir Close(): " << dir_closes.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Read(): " << reads.ops.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num Append(): " << writes.ops.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num bytes read: " << reads.bytes.load(std::memory_order_relaxed)
+     << std::endl;
+  ss << "Num bytes written: " << writes.bytes.load(std::memory_order_relaxed)
+     << std::endl;
+  return ss.str();
+}
+
+CountedFileSystem::CountedFileSystem(const std::shared_ptr<FileSystem>& base)
+    : FileSystemWrapper(base) {}
+
+IOStatus CountedFileSystem::NewSequentialFile(
+    const std::string& f, const FileOptions& options,
+    std::unique_ptr<FSSequentialFile>* r, IODebugContext* dbg) {
+  std::unique_ptr<FSSequentialFile> base;
+  IOStatus s = target()->NewSequentialFile(f, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    r->reset(new CountedSequentialFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::NewRandomAccessFile(
+    const std::string& f, const FileOptions& options,
+    std::unique_ptr<FSRandomAccessFile>* r, IODebugContext* dbg) {
+  std::unique_ptr<FSRandomAccessFile> base;
+  IOStatus s = target()->NewRandomAccessFile(f, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    r->reset(new CountedRandomAccessFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::NewWritableFile(const std::string& f,
+                                            const FileOptions& options,
+                                            std::unique_ptr<FSWritableFile>* r,
+                                            IODebugContext* dbg) {
+  std::unique_ptr<FSWritableFile> base;
+  IOStatus s = target()->NewWritableFile(f, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    r->reset(new CountedWritableFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::ReopenWritableFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+  std::unique_ptr<FSWritableFile> base;
+  IOStatus s = target()->ReopenWritableFile(fname, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    result->reset(new CountedWritableFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::ReuseWritableFile(
+    const std::string& fname, const std::string& old_fname,
+    const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+    IODebugContext* dbg) {
+  std::unique_ptr<FSWritableFile> base;
+  IOStatus s =
+      target()->ReuseWritableFile(fname, old_fname, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    result->reset(new CountedWritableFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::NewRandomRWFile(
+    const std::string& name, const FileOptions& options,
+    std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+  std::unique_ptr<FSRandomRWFile> base;
+  IOStatus s = target()->NewRandomRWFile(name, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    result->reset(new CountedRandomRWFile(std::move(base), this));
+  }
+  return s;
+}
+
+IOStatus CountedFileSystem::NewDirectory(const std::string& name,
+                                         const IOOptions& options,
+                                         std::unique_ptr<FSDirectory>* result,
+                                         IODebugContext* dbg) {
+  std::unique_ptr<FSDirectory> base;
+  IOStatus s = target()->NewDirectory(name, options, &base, dbg);
+  if (s.ok()) {
+    counters_.opens++;
+    counters_.dir_opens++;
+    result->reset(new CountedDirectory(std::move(base), this));
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/counted_fs.h b/src/rocksdb/utilities/counted_fs.h
new file mode 100644
index 000000000..cb8a8968f
--- /dev/null
+++ b/src/rocksdb/utilities/counted_fs.h
@@ -0,0 +1,158 @@
+//  Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+
+struct OpCounter {
+  std::atomic<int> ops;
+  std::atomic<uint64_t> bytes;
+
+  OpCounter() : ops(0), bytes(0) {}
+
+  void Reset() {
+    ops = 0;
+    bytes = 0;
+  }
+  void RecordOp(const IOStatus& io_s, size_t added_bytes) {
+    if (!io_s.IsNotSupported()) {
+      ops.fetch_add(1, std::memory_order_relaxed);
+    }
+    if (io_s.ok()) {
+      bytes.fetch_add(added_bytes, std::memory_order_relaxed);
+    }
+  }
+};
+
+struct FileOpCounters {
+  static const char* kName() { return "FileOpCounters"; }
+
+  std::atomic<int> opens;
+  std::atomic<int> closes;
+  std::atomic<int> deletes;
+  std::atomic<int> renames;
+  std::atomic<int> flushes;
+  std::atomic<int> syncs;
+  std::atomic<int> dsyncs;
+  std::atomic<int> fsyncs;
+  std::atomic<int> dir_opens;
+  std::atomic<int> dir_closes;
+  OpCounter reads;
+  OpCounter writes;
+
+  FileOpCounters()
+      : opens(0),
+        closes(0),
+        deletes(0),
+        renames(0),
+        flushes(0),
+        syncs(0),
+        dsyncs(0),
+        fsyncs(0),
+        dir_opens(0),
+        dir_closes(0) {}
+
+  void Reset() {
+    opens = 0;
+    closes = 0;
+    deletes = 0;
+    renames = 0;
+    flushes = 0;
+    syncs = 0;
+    dsyncs = 0;
+    fsyncs = 0;
+    dir_opens = 0;
+    dir_closes = 0;
+    reads.Reset();
+    writes.Reset();
+  }
+  std::string PrintCounters() const;
+};
+
+// A FileSystem class that counts operations (reads, writes, opens, closes, etc)
+class CountedFileSystem : public FileSystemWrapper {
+ public:
+ private:
+  FileOpCounters counters_;
+
+ public:
+  explicit CountedFileSystem(const std::shared_ptr<FileSystem>& base);
+  static const char* kClassName() { return "CountedFileSystem"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& options,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomAccessFile(const std::string& f,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* r,
+                               IODebugContext* dbg) override;
+
+  IOStatus NewWritableFile(const std::string& f, const FileOptions& options,
+                           std::unique_ptr<FSWritableFile>* r,
+                           IODebugContext* dbg) override;
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& options,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override;
+
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& file_opts,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override;
+  IOStatus NewRandomRWFile(const std::string& name, const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override {
+    IOStatus s = target()->DeleteFile(fname, options, dbg);
+    if (s.ok()) {
+      counters_.deletes++;
+    }
+    return s;
+  }
+
+  IOStatus RenameFile(const std::string& s, const std::string& t,
+                      const IOOptions& options, IODebugContext* dbg) override {
+    IOStatus st = target()->RenameFile(s, t, options, dbg);
+    if (st.ok()) {
+      counters_.renames++;
+    }
+    return st;
+  }
+
+  const FileOpCounters* counters() const { return &counters_; }
+
+  FileOpCounters* counters() { return &counters_; }
+
+  const void* GetOptionsPtr(const std::string& name) const override {
+    if (name == FileOpCounters::kName()) {
+      return counters();
+    } else {
+      return FileSystemWrapper::GetOptionsPtr(name);
+    }
+  }
+
+  // Prints the counters to a string
+  std::string PrintCounters() const { return counters_.PrintCounters(); }
+  void ResetCounters() { counters_.Reset(); }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/debug.cc b/src/rocksdb/utilities/debug.cc
new file mode 100644
index 000000000..f2c3bb513
--- /dev/null
+++ b/src/rocksdb/utilities/debug.cc
@@ -0,0 +1,120 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/debug.h"
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::unordered_map<std::string, ValueType> value_type_string_map = {
+    {"TypeDeletion", ValueType::kTypeDeletion},
+    {"TypeValue", ValueType::kTypeValue},
+    {"TypeMerge", ValueType::kTypeMerge},
+    {"TypeLogData", ValueType::kTypeLogData},
+    {"TypeColumnFamilyDeletion", ValueType::kTypeColumnFamilyDeletion},
+    {"TypeColumnFamilyValue", ValueType::kTypeColumnFamilyValue},
+    {"TypeColumnFamilyMerge", ValueType::kTypeColumnFamilyMerge},
+    {"TypeSingleDeletion", ValueType::kTypeSingleDeletion},
+    {"TypeColumnFamilySingleDeletion",
+     ValueType::kTypeColumnFamilySingleDeletion},
+    {"TypeBeginPrepareXID", ValueType::kTypeBeginPrepareXID},
+    {"TypeEndPrepareXID", ValueType::kTypeEndPrepareXID},
+    {"TypeCommitXID", ValueType::kTypeCommitXID},
+    {"TypeRollbackXID", ValueType::kTypeRollbackXID},
+    {"TypeNoop", ValueType::kTypeNoop},
+    {"TypeColumnFamilyRangeDeletion",
+     ValueType::kTypeColumnFamilyRangeDeletion},
+    {"TypeRangeDeletion", ValueType::kTypeRangeDeletion},
+    {"TypeColumnFamilyBlobIndex", ValueType::kTypeColumnFamilyBlobIndex},
+    {"TypeBlobIndex", ValueType::kTypeBlobIndex},
+    {"TypeBeginPersistedPrepareXID", ValueType::kTypeBeginPersistedPrepareXID},
+    {"TypeBeginUnprepareXID", ValueType::kTypeBeginUnprepareXID},
+    {"TypeDeletionWithTimestamp", ValueType::kTypeDeletionWithTimestamp},
+    {"TypeCommitXIDAndTimestamp", ValueType::kTypeCommitXIDAndTimestamp},
+    {"TypeWideColumnEntity", ValueType::kTypeWideColumnEntity},
+    {"TypeColumnFamilyWideColumnEntity",
+     ValueType::kTypeColumnFamilyWideColumnEntity}};
+
+std::string KeyVersion::GetTypeName() const {
+  std::string type_name;
+  if (SerializeEnum<ValueType>(value_type_string_map,
+                               static_cast<ValueType>(type), &type_name)) {
+    return type_name;
+  } else {
+    return "Invalid";
+  }
+}
+
+Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+                         size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions) {
+  if (nullptr == db) {
+    return Status::InvalidArgument("db cannot be null.");
+  }
+  return GetAllKeyVersions(db, db->DefaultColumnFamily(), begin_key, end_key,
+                           max_num_ikeys, key_versions);
+}
+
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
+                         Slice end_key, size_t max_num_ikeys,
+                         std::vector<KeyVersion>* key_versions) {
+  if (nullptr == db) {
+    return Status::InvalidArgument("db cannot be null.");
+  }
+  if (nullptr == cfh) {
+    return Status::InvalidArgument("Column family handle cannot be null.");
+  }
+  if (nullptr == key_versions) {
+    return Status::InvalidArgument("key_versions cannot be null.");
+  }
+  key_versions->clear();
+
+  DBImpl* idb = static_cast<DBImpl*>(db->GetRootDB());
+  auto icmp = InternalKeyComparator(idb->GetOptions(cfh).comparator);
+  ReadOptions read_options;
+  Arena arena;
+  ScopedArenaIterator iter(
+      idb->NewInternalIterator(read_options, &arena, kMaxSequenceNumber, cfh));
+
+  if (!begin_key.empty()) {
+    InternalKey ikey;
+    ikey.SetMinPossibleForUserKey(begin_key);
+    iter->Seek(ikey.Encode());
+  } else {
+    iter->SeekToFirst();
+  }
+
+  size_t num_keys = 0;
+  for (; iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    Status pik_status =
+        ParseInternalKey(iter->key(), &ikey, true /* log_err_key */);  // TODO
+    if (!pik_status.ok()) {
+      return pik_status;
+    }
+
+    if (!end_key.empty() &&
+        icmp.user_comparator()->Compare(ikey.user_key, end_key) > 0) {
+      break;
+    }
+
+    key_versions->emplace_back(ikey.user_key.ToString() /* _user_key */,
+                               iter->value().ToString() /* _value */,
+                               ikey.sequence /* _sequence */,
+                               static_cast<int>(ikey.type) /* _type */);
+    if (++num_keys >= max_num_ikeys) {
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/env_mirror.cc b/src/rocksdb/utilities/env_mirror.cc
new file mode 100644
index 000000000..3ea323b42
--- /dev/null
+++ b/src/rocksdb/utilities/env_mirror.cc
@@ -0,0 +1,275 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2015, Red Hat, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/env_mirror.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An implementation of Env that mirrors all work over two backend
+// Env's.  This is useful for debugging purposes.
+class SequentialFileMirror : public SequentialFile {
+ public:
+  std::unique_ptr<SequentialFile> a_, b_;
+  std::string fname;
+  explicit SequentialFileMirror(std::string f) : fname(f) {}
+
+  Status Read(size_t n, Slice* result, char* scratch) override {
+    Slice aslice;
+    Status as = a_->Read(n, &aslice, scratch);
+    if (as == Status::OK()) {
+      char* bscratch = new char[n];
+      Slice bslice;
+#ifndef NDEBUG
+      size_t off = 0;
+#endif
+      size_t left = aslice.size();
+      while (left) {
+        Status bs = b_->Read(left, &bslice, bscratch);
+#ifndef NDEBUG
+        assert(as == bs);
+        assert(memcmp(bscratch, scratch + off, bslice.size()) == 0);
+        off += bslice.size();
+#endif
+        left -= bslice.size();
+      }
+      delete[] bscratch;
+      *result = aslice;
+    } else {
+      Status bs = b_->Read(n, result, scratch);
+      assert(as == bs);
+    }
+    return as;
+  }
+
+  Status Skip(uint64_t n) override {
+    Status as = a_->Skip(n);
+    Status bs = b_->Skip(n);
+    assert(as == bs);
+    return as;
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    Status as = a_->InvalidateCache(offset, length);
+    Status bs = b_->InvalidateCache(offset, length);
+    assert(as == bs);
+    return as;
+  };
+};
+
+class RandomAccessFileMirror : public RandomAccessFile {
+ public:
+  std::unique_ptr<RandomAccessFile> a_, b_;
+  std::string fname;
+  explicit RandomAccessFileMirror(std::string f) : fname(f) {}
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override {
+    Status as = a_->Read(offset, n, result, scratch);
+    if (as == Status::OK()) {
+      char* bscratch = new char[n];
+      Slice bslice;
+      size_t off = 0;
+      size_t left = result->size();
+      while (left) {
+        Status bs = b_->Read(offset + off, left, &bslice, bscratch);
+        assert(as == bs);
+        assert(memcmp(bscratch, scratch + off, bslice.size()) == 0);
+        off += bslice.size();
+        left -= bslice.size();
+      }
+      delete[] bscratch;
+    } else {
+      Status bs = b_->Read(offset, n, result, scratch);
+      assert(as == bs);
+    }
+    return as;
+  }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    // NOTE: not verified
+    return a_->GetUniqueId(id, max_size);
+  }
+};
+
+class WritableFileMirror : public WritableFile {
+ public:
+  std::unique_ptr<WritableFile> a_, b_;
+  std::string fname;
+  explicit WritableFileMirror(std::string f, const EnvOptions& options)
+      : WritableFile(options), fname(f) {}
+
+  Status Append(const Slice& data) override {
+    Status as = a_->Append(data);
+    Status bs = b_->Append(data);
+    assert(as == bs);
+    return as;
+  }
+  Status Append(const Slice& data,
+                const DataVerificationInfo& /* verification_info */) override {
+    return Append(data);
+  }
+  Status PositionedAppend(const Slice& data, uint64_t offset) override {
+    Status as = a_->PositionedAppend(data, offset);
+    Status bs = b_->PositionedAppend(data, offset);
+    assert(as == bs);
+    return as;
+  }
+  Status PositionedAppend(
+      const Slice& data, uint64_t offset,
+      const DataVerificationInfo& /* verification_info */) override {
+    return PositionedAppend(data, offset);
+  }
+  Status Truncate(uint64_t size) override {
+    Status as = a_->Truncate(size);
+    Status bs = b_->Truncate(size);
+    assert(as == bs);
+    return as;
+  }
+  Status Close() override {
+    Status as = a_->Close();
+    Status bs = b_->Close();
+    assert(as == bs);
+    return as;
+  }
+  Status Flush() override {
+    Status as = a_->Flush();
+    Status bs = b_->Flush();
+    assert(as == bs);
+    return as;
+  }
+  Status Sync() override {
+    Status as = a_->Sync();
+    Status bs = b_->Sync();
+    assert(as == bs);
+    return as;
+  }
+  Status Fsync() override {
+    Status as = a_->Fsync();
+    Status bs = b_->Fsync();
+    assert(as == bs);
+    return as;
+  }
+  bool IsSyncThreadSafe() const override {
+    bool as = a_->IsSyncThreadSafe();
+    assert(as == b_->IsSyncThreadSafe());
+    return as;
+  }
+  void SetIOPriority(Env::IOPriority pri) override {
+    a_->SetIOPriority(pri);
+    b_->SetIOPriority(pri);
+  }
+  Env::IOPriority GetIOPriority() override {
+    // NOTE: we don't verify this one
+    return a_->GetIOPriority();
+  }
+  uint64_t GetFileSize() override {
+    uint64_t as = a_->GetFileSize();
+    assert(as == b_->GetFileSize());
+    return as;
+  }
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    // NOTE: we don't verify this one
+    return a_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    // NOTE: we don't verify this one
+    return a_->GetUniqueId(id, max_size);
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    Status as = a_->InvalidateCache(offset, length);
+    Status bs = b_->InvalidateCache(offset, length);
+    assert(as == bs);
+    return as;
+  }
+
+ protected:
+  Status Allocate(uint64_t offset, uint64_t length) override {
+    Status as = a_->Allocate(offset, length);
+    Status bs = b_->Allocate(offset, length);
+    assert(as == bs);
+    return as;
+  }
+  Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+    Status as = a_->RangeSync(offset, nbytes);
+    Status bs = b_->RangeSync(offset, nbytes);
+    assert(as == bs);
+    return as;
+  }
+};
+
+Status EnvMirror::NewSequentialFile(const std::string& f,
+                                    std::unique_ptr<SequentialFile>* r,
+                                    const EnvOptions& options) {
+  if (f.find("/proc/") == 0) {
+    return a_->NewSequentialFile(f, r, options);
+  }
+  SequentialFileMirror* mf = new SequentialFileMirror(f);
+  Status as = a_->NewSequentialFile(f, &mf->a_, options);
+  Status bs = b_->NewSequentialFile(f, &mf->b_, options);
+  assert(as == bs);
+  if (as.ok())
+    r->reset(mf);
+  else
+    delete mf;
+  return as;
+}
+
+Status EnvMirror::NewRandomAccessFile(const std::string& f,
+                                      std::unique_ptr<RandomAccessFile>* r,
+                                      const EnvOptions& options) {
+  if (f.find("/proc/") == 0) {
+    return a_->NewRandomAccessFile(f, r, options);
+  }
+  RandomAccessFileMirror* mf = new RandomAccessFileMirror(f);
+  Status as = a_->NewRandomAccessFile(f, &mf->a_, options);
+  Status bs = b_->NewRandomAccessFile(f, &mf->b_, options);
+  assert(as == bs);
+  if (as.ok())
+    r->reset(mf);
+  else
+    delete mf;
+  return as;
+}
+
+Status EnvMirror::NewWritableFile(const std::string& f,
+                                  std::unique_ptr<WritableFile>* r,
+                                  const EnvOptions& options) {
+  if (f.find("/proc/") == 0) return a_->NewWritableFile(f, r, options);
+  WritableFileMirror* mf = new WritableFileMirror(f, options);
+  Status as = a_->NewWritableFile(f, &mf->a_, options);
+  Status bs = b_->NewWritableFile(f, &mf->b_, options);
+  assert(as == bs);
+  if (as.ok())
+    r->reset(mf);
+  else
+    delete mf;
+  return as;
+}
+
+Status EnvMirror::ReuseWritableFile(const std::string& fname,
+                                    const std::string& old_fname,
+                                    std::unique_ptr<WritableFile>* r,
+                                    const EnvOptions& options) {
+  if (fname.find("/proc/") == 0)
+    return a_->ReuseWritableFile(fname, old_fname, r, options);
+  WritableFileMirror* mf = new WritableFileMirror(fname, options);
+  Status as = a_->ReuseWritableFile(fname, old_fname, &mf->a_, options);
+  Status bs = b_->ReuseWritableFile(fname, old_fname, &mf->b_, options);
+  assert(as == bs);
+  if (as.ok())
+    r->reset(mf);
+  else
+    delete mf;
+  return as;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/utilities/env_mirror_test.cc b/src/rocksdb/utilities/env_mirror_test.cc
new file mode 100644
index 000000000..c372de1da
--- /dev/null
+++ b/src/rocksdb/utilities/env_mirror_test.cc
@@ -0,0 +1,226 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  Copyright (c) 2015, Red Hat, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/env_mirror.h"
+
+#include "env/mock_env.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class EnvMirrorTest : public testing::Test {
+ public:
+  Env* default_;
+  MockEnv *a_, *b_;
+  EnvMirror* env_;
+  const EnvOptions soptions_;
+
+  EnvMirrorTest()
+      : default_(Env::Default()),
+        a_(new MockEnv(default_)),
+        b_(new MockEnv(default_)),
+        env_(new EnvMirror(a_, b_)) {}
+  ~EnvMirrorTest() {
+    delete env_;
+    delete a_;
+    delete b_;
+  }
+};
+
+TEST_F(EnvMirrorTest, Basics) {
+  uint64_t file_size;
+  std::unique_ptr<WritableFile> writable_file;
+  std::vector<std::string> children;
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  // Check that the directory is empty.
+  ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/non_existent"));
+  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+
+  // Create a file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  writable_file.reset();
+
+  // Check that the file exists.
+  ASSERT_OK(env_->FileExists("/dir/f"));
+  ASSERT_OK(a_->FileExists("/dir/f"));
+  ASSERT_OK(b_->FileExists("/dir/f"));
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(0U, file_size);
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+  ASSERT_OK(a_->GetChildren("/dir", &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+  ASSERT_OK(b_->GetChildren("/dir", &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+
+  // Write to the file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("abc"));
+  writable_file.reset();
+
+  // Check for expected size.
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(3U, file_size);
+  ASSERT_OK(a_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(3U, file_size);
+  ASSERT_OK(b_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that renaming works.
+  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
+  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+  ASSERT_OK(a_->FileExists("/dir/g"));
+  ASSERT_OK(a_->GetFileSize("/dir/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+  ASSERT_OK(b_->FileExists("/dir/g"));
+  ASSERT_OK(b_->GetFileSize("/dir/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that opening non-existent file fails.
+  std::unique_ptr<SequentialFile> seq_file;
+  std::unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_TRUE(
+      !env_->NewSequentialFile("/dir/non_existent", &seq_file, soptions_).ok());
+  ASSERT_TRUE(!seq_file);
+  ASSERT_TRUE(
+      !env_->NewRandomAccessFile("/dir/non_existent", &rand_file, soptions_)
+           .ok());
+  ASSERT_TRUE(!rand_file);
+
+  // Check that deleting works.
+  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
+  ASSERT_OK(env_->DeleteFile("/dir/g"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+  ASSERT_OK(env_->DeleteDir("/dir"));
+}
+
+TEST_F(EnvMirrorTest, ReadWrite) {
+  std::unique_ptr<WritableFile> writable_file;
+  std::unique_ptr<SequentialFile> seq_file;
+  std::unique_ptr<RandomAccessFile> rand_file;
+  Slice result;
+  char scratch[100];
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("hello "));
+  ASSERT_OK(writable_file->Append("world"));
+  writable_file.reset();
+
+  // Read sequentially.
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(seq_file->Skip(1));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));  // Try reading past EOF.
+  ASSERT_EQ(0U, result.size());
+  ASSERT_OK(seq_file->Skip(100));  // Try to skip past end of file.
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));
+  ASSERT_EQ(0U, result.size());
+
+  // Random reads.
+  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(6, 5, &result, scratch));  // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(rand_file->Read(0, 5, &result, scratch));  // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(rand_file->Read(10, 100, &result, scratch));  // Read "d".
+  ASSERT_EQ(0, result.compare("d"));
+
+  // Too high offset.
+  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
+}
+
+TEST_F(EnvMirrorTest, Locks) {
+  FileLock* lock;
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(env_->LockFile("some file", &lock));
+  ASSERT_OK(env_->UnlockFile(lock));
+}
+
+TEST_F(EnvMirrorTest, Misc) {
+  std::string test_dir;
+  ASSERT_OK(env_->GetTestDirectory(&test_dir));
+  ASSERT_TRUE(!test_dir.empty());
+
+  std::unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(writable_file->Sync());
+  ASSERT_OK(writable_file->Flush());
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+}
+
+TEST_F(EnvMirrorTest, LargeWrite) {
+  const size_t kWriteSize = 300 * 1024;
+  char* scratch = new char[kWriteSize * 2];
+
+  std::string write_data;
+  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
+  }
+
+  std::unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("foo"));
+  ASSERT_OK(writable_file->Append(write_data));
+  writable_file.reset();
+
+  std::unique_ptr<SequentialFile> seq_file;
+  Slice result;
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(3, &result, scratch));  // Read "foo".
+  ASSERT_EQ(0, result.compare("foo"));
+
+  size_t read = 0;
+  std::string read_data;
+  while (read < kWriteSize) {
+    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+    read_data.append(result.data(), result.size());
+    read += result.size();
+  }
+  ASSERT_TRUE(write_data == read_data);
+  delete[] scratch;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as EnvMirror is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/env_timed.cc b/src/rocksdb/utilities/env_timed.cc
new file mode 100644
index 000000000..1eb723146
--- /dev/null
+++ b/src/rocksdb/utilities/env_timed.cc
@@ -0,0 +1,187 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include "utilities/env_timed.h"
+
+#include "env/composite_env_wrapper.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+TimedFileSystem::TimedFileSystem(const std::shared_ptr<FileSystem>& base)
+    : FileSystemWrapper(base) {}
+IOStatus TimedFileSystem::NewSequentialFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_new_sequential_file_nanos);
+  return FileSystemWrapper::NewSequentialFile(fname, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_new_random_access_file_nanos);
+  return FileSystemWrapper::NewRandomAccessFile(fname, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::NewWritableFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_new_writable_file_nanos);
+  return FileSystemWrapper::NewWritableFile(fname, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::ReuseWritableFile(
+    const std::string& fname, const std::string& old_fname,
+    const FileOptions& options, std::unique_ptr<FSWritableFile>* result,
+    IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_reuse_writable_file_nanos);
+  return FileSystemWrapper::ReuseWritableFile(fname, old_fname, options, result,
+                                              dbg);
+}
+
+IOStatus TimedFileSystem::NewRandomRWFile(
+    const std::string& fname, const FileOptions& options,
+    std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_new_random_rw_file_nanos);
+  return FileSystemWrapper::NewRandomRWFile(fname, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::NewDirectory(const std::string& name,
+                                       const IOOptions& options,
+                                       std::unique_ptr<FSDirectory>* result,
+                                       IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_new_directory_nanos);
+  return FileSystemWrapper::NewDirectory(name, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::FileExists(const std::string& fname,
+                                     const IOOptions& options,
+                                     IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_file_exists_nanos);
+  return FileSystemWrapper::FileExists(fname, options, dbg);
+}
+
+IOStatus TimedFileSystem::GetChildren(const std::string& dir,
+                                      const IOOptions& options,
+                                      std::vector<std::string>* result,
+                                      IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_get_children_nanos);
+  return FileSystemWrapper::GetChildren(dir, options, result, dbg);
+}
+
+IOStatus TimedFileSystem::GetChildrenFileAttributes(
+    const std::string& dir, const IOOptions& options,
+    std::vector<FileAttributes>* result, IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_get_children_file_attributes_nanos);
+  return FileSystemWrapper::GetChildrenFileAttributes(dir, options, result,
+                                                      dbg);
+}
+
+IOStatus TimedFileSystem::DeleteFile(const std::string& fname,
+                                     const IOOptions& options,
+                                     IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_delete_file_nanos);
+  return FileSystemWrapper::DeleteFile(fname, options, dbg);
+}
+
+IOStatus TimedFileSystem::CreateDir(const std::string& dirname,
+                                    const IOOptions& options,
+                                    IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_create_dir_nanos);
+  return FileSystemWrapper::CreateDir(dirname, options, dbg);
+}
+
+IOStatus TimedFileSystem::CreateDirIfMissing(const std::string& dirname,
+                                             const IOOptions& options,
+                                             IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_create_dir_if_missing_nanos);
+  return FileSystemWrapper::CreateDirIfMissing(dirname, options, dbg);
+}
+
+IOStatus TimedFileSystem::DeleteDir(const std::string& dirname,
+                                    const IOOptions& options,
+                                    IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_delete_dir_nanos);
+  return FileSystemWrapper::DeleteDir(dirname, options, dbg);
+}
+
+IOStatus TimedFileSystem::GetFileSize(const std::string& fname,
+                                      const IOOptions& options,
+                                      uint64_t* file_size,
+                                      IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_get_file_size_nanos);
+  return FileSystemWrapper::GetFileSize(fname, options, file_size, dbg);
+}
+
+IOStatus TimedFileSystem::GetFileModificationTime(const std::string& fname,
+                                                  const IOOptions& options,
+                                                  uint64_t* file_mtime,
+                                                  IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_get_file_modification_time_nanos);
+  return FileSystemWrapper::GetFileModificationTime(fname, options, file_mtime,
+                                                    dbg);
+}
+
+IOStatus TimedFileSystem::RenameFile(const std::string& src,
+                                     const std::string& dst,
+                                     const IOOptions& options,
+                                     IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_rename_file_nanos);
+  return FileSystemWrapper::RenameFile(src, dst, options, dbg);
+}
+
+IOStatus TimedFileSystem::LinkFile(const std::string& src,
+                                   const std::string& dst,
+                                   const IOOptions& options,
+                                   IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_link_file_nanos);
+  return FileSystemWrapper::LinkFile(src, dst, options, dbg);
+}
+
+IOStatus TimedFileSystem::LockFile(const std::string& fname,
+                                   const IOOptions& options, FileLock** lock,
+                                   IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_lock_file_nanos);
+  return FileSystemWrapper::LockFile(fname, options, lock, dbg);
+}
+
+IOStatus TimedFileSystem::UnlockFile(FileLock* lock, const IOOptions& options,
+                                     IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_unlock_file_nanos);
+  return FileSystemWrapper::UnlockFile(lock, options, dbg);
+}
+
+IOStatus TimedFileSystem::NewLogger(const std::string& fname,
+                                    const IOOptions& options,
+                                    std::shared_ptr<Logger>* result,
+                                    IODebugContext* dbg) {
+  PERF_TIMER_GUARD(env_new_logger_nanos);
+  return FileSystemWrapper::NewLogger(fname, options, result, dbg);
+}
+
+std::shared_ptr<FileSystem> NewTimedFileSystem(
+    const std::shared_ptr<FileSystem>& base) {
+  return std::make_shared<TimedFileSystem>(base);
+}
+
+// An environment that measures function call times for filesystem
+// operations, reporting results to variables in PerfContext.
+Env* NewTimedEnv(Env* base_env) {
+  std::shared_ptr<FileSystem> timed_fs =
+      NewTimedFileSystem(base_env->GetFileSystem());
+  return new CompositeEnvWrapper(base_env, timed_fs);
+}
+
+#else  // ROCKSDB_LITE
+
+Env* NewTimedEnv(Env* /*base_env*/) { return nullptr; }
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/env_timed.h b/src/rocksdb/utilities/env_timed.h
new file mode 100644
index 000000000..2d34fd590
--- /dev/null
+++ b/src/rocksdb/utilities/env_timed.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2019-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+#include "rocksdb/file_system.h"
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+class TimedFileSystem : public FileSystemWrapper {
+ public:
+  explicit TimedFileSystem(const std::shared_ptr<FileSystem>& base);
+
+  static const char* kClassName() { return "TimedFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewSequentialFile(const std::string& fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSSequentialFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& options,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override;
+
+  IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus ReuseWritableFile(const std::string& fname,
+                             const std::string& old_fname,
+                             const FileOptions& options,
+                             std::unique_ptr<FSWritableFile>* result,
+                             IODebugContext* dbg) override;
+
+  IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& options,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+
+  IOStatus FileExists(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  IOStatus GetChildren(const std::string& dir, const IOOptions& options,
+                       std::vector<std::string>* result,
+                       IODebugContext* dbg) override;
+
+  IOStatus GetChildrenFileAttributes(const std::string& dir,
+                                     const IOOptions& options,
+                                     std::vector<FileAttributes>* result,
+                                     IODebugContext* dbg) override;
+
+  IOStatus DeleteFile(const std::string& fname, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  IOStatus CreateDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus CreateDirIfMissing(const std::string& dirname,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override;
+
+  IOStatus DeleteDir(const std::string& dirname, const IOOptions& options,
+                     IODebugContext* dbg) override;
+
+  IOStatus GetFileSize(const std::string& fname, const IOOptions& options,
+                       uint64_t* file_size, IODebugContext* dbg) override;
+
+  IOStatus GetFileModificationTime(const std::string& fname,
+                                   const IOOptions& options,
+                                   uint64_t* file_mtime,
+                                   IODebugContext* dbg) override;
+
+  IOStatus RenameFile(const std::string& src, const std::string& dst,
+                      const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus LinkFile(const std::string& src, const std::string& dst,
+                    const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus LockFile(const std::string& fname, const IOOptions& options,
+                    FileLock** lock, IODebugContext* dbg) override;
+
+  IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
+                      IODebugContext* dbg) override;
+
+  IOStatus NewLogger(const std::string& fname, const IOOptions& options,
+                     std::shared_ptr<Logger>* result,
+                     IODebugContext* dbg) override;
+};
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/env_timed_test.cc b/src/rocksdb/utilities/env_timed_test.cc
new file mode 100644
index 000000000..6e392579d
--- /dev/null
+++ b/src/rocksdb/utilities/env_timed_test.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/env.h"
+#include "rocksdb/perf_context.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TimedEnvTest : public testing::Test {};
+
+TEST_F(TimedEnvTest, BasicTest) {
+  SetPerfLevel(PerfLevel::kEnableTime);
+  ASSERT_EQ(0, get_perf_context()->env_new_writable_file_nanos);
+
+  std::unique_ptr<Env> mem_env(NewMemEnv(Env::Default()));
+  std::unique_ptr<Env> timed_env(NewTimedEnv(mem_env.get()));
+  std::unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(timed_env->NewWritableFile("f", &writable_file, EnvOptions()));
+
+  ASSERT_GT(get_perf_context()->env_new_writable_file_nanos, 0);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else  // ROCKSDB_LITE
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as TimedEnv is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/fault_injection_env.cc b/src/rocksdb/utilities/fault_injection_env.cc
new file mode 100644
index 000000000..b0495a8c1
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_env.cc
@@ -0,0 +1,555 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include "utilities/fault_injection_env.h"
+
+#include <functional>
+#include <utility>
+
+#include "util/random.h"
+namespace ROCKSDB_NAMESPACE {
+
+// Assume a filename, and not a directory name like "/foo/bar/"
+std::string GetDirName(const std::string filename) {
+  size_t found = filename.find_last_of("/\\");
+  if (found == std::string::npos) {
+    return "";
+  } else {
+    return filename.substr(0, found);
+  }
+}
+
+// A basic file truncation function suitable for this test.
+Status Truncate(Env* env, const std::string& filename, uint64_t length) {
+  std::unique_ptr<SequentialFile> orig_file;
+  const EnvOptions options;
+  Status s = env->NewSequentialFile(filename, &orig_file, options);
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot open file %s for truncation: %s\n",
+            filename.c_str(), s.ToString().c_str());
+    return s;
+  }
+
+  std::unique_ptr<char[]> scratch(new char[length]);
+  ROCKSDB_NAMESPACE::Slice result;
+  s = orig_file->Read(length, &result, scratch.get());
+#ifdef OS_WIN
+  orig_file.reset();
+#endif
+  if (s.ok()) {
+    std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
+    std::unique_ptr<WritableFile> tmp_file;
+    s = env->NewWritableFile(tmp_name, &tmp_file, options);
+    if (s.ok()) {
+      s = tmp_file->Append(result);
+      if (s.ok()) {
+        s = env->RenameFile(tmp_name, filename);
+      } else {
+        fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(),
+                filename.c_str(), s.ToString().c_str());
+        env->DeleteFile(tmp_name);
+      }
+    }
+  }
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
+            s.ToString().c_str());
+  }
+
+  return s;
+}
+
+// Trim the tailing "/" in the end of `str`
+std::string TrimDirname(const std::string& str) {
+  size_t found = str.find_last_not_of("/");
+  if (found == std::string::npos) {
+    return str;
+  }
+  return str.substr(0, found + 1);
+}
+
+// Return pair <parent directory name, file name> of a full path.
+std::pair<std::string, std::string> GetDirAndName(const std::string& name) {
+  std::string dirname = GetDirName(name);
+  std::string fname = name.substr(dirname.size() + 1);
+  return std::make_pair(dirname, fname);
+}
+
+Status FileState::DropUnsyncedData(Env* env) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  return Truncate(env, filename_, sync_pos);
+}
+
+Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  assert(pos_ >= sync_pos);
+  int range = static_cast<int>(pos_ - sync_pos);
+  uint64_t truncated_size =
+      static_cast<uint64_t>(sync_pos) + rand->Uniform(range);
+  return Truncate(env, filename_, truncated_size);
+}
+
+Status TestDirectory::Fsync() {
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+  env_->SyncDir(dirname_);
+  return dir_->Fsync();
+}
+
+Status TestDirectory::Close() {
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+  return dir_->Close();
+}
+
+TestRandomAccessFile::TestRandomAccessFile(
+    std::unique_ptr<RandomAccessFile>&& target, FaultInjectionTestEnv* env)
+    : target_(std::move(target)), env_(env) {
+  assert(target_);
+  assert(env_);
+}
+
+Status TestRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
+                                  char* scratch) const {
+  assert(env_);
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+
+  assert(target_);
+  return target_->Read(offset, n, result, scratch);
+}
+
+Status TestRandomAccessFile::Prefetch(uint64_t offset, size_t n) {
+  assert(env_);
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+
+  assert(target_);
+  return target_->Prefetch(offset, n);
+}
+
+Status TestRandomAccessFile::MultiRead(ReadRequest* reqs, size_t num_reqs) {
+  assert(env_);
+  if (!env_->IsFilesystemActive()) {
+    const Status s = env_->GetError();
+
+    assert(reqs);
+    for (size_t i = 0; i < num_reqs; ++i) {
+      reqs[i].status = s;
+    }
+
+    return s;
+  }
+
+  assert(target_);
+  return target_->MultiRead(reqs, num_reqs);
+}
+
+TestWritableFile::TestWritableFile(const std::string& fname,
+                                   std::unique_ptr<WritableFile>&& f,
+                                   FaultInjectionTestEnv* env)
+    : state_(fname),
+      target_(std::move(f)),
+      writable_file_opened_(true),
+      env_(env) {
+  assert(target_ != nullptr);
+  state_.pos_ = 0;
+}
+
+TestWritableFile::~TestWritableFile() {
+  if (writable_file_opened_) {
+    Close().PermitUncheckedError();
+  }
+}
+
+Status TestWritableFile::Append(const Slice& data) {
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+  Status s = target_->Append(data);
+  if (s.ok()) {
+    state_.pos_ += data.size();
+    env_->WritableFileAppended(state_);
+  }
+  return s;
+}
+
+Status TestWritableFile::Close() {
+  writable_file_opened_ = false;
+  Status s = target_->Close();
+  if (s.ok()) {
+    env_->WritableFileClosed(state_);
+  }
+  return s;
+}
+
+Status TestWritableFile::Flush() {
+  Status s = target_->Flush();
+  if (s.ok() && env_->IsFilesystemActive()) {
+    state_.pos_at_last_flush_ = state_.pos_;
+  }
+  return s;
+}
+
+Status TestWritableFile::Sync() {
+  if (!env_->IsFilesystemActive()) {
+    return Status::IOError("FaultInjectionTestEnv: not active");
+  }
+  // No need to actual sync.
+  state_.pos_at_last_sync_ = state_.pos_;
+  env_->WritableFileSynced(state_);
+  return Status::OK();
+}
+
+TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/,
+                                   std::unique_ptr<RandomRWFile>&& f,
+                                   FaultInjectionTestEnv* env)
+    : target_(std::move(f)), file_opened_(true), env_(env) {
+  assert(target_ != nullptr);
+}
+
+TestRandomRWFile::~TestRandomRWFile() {
+  if (file_opened_) {
+    Close().PermitUncheckedError();
+  }
+}
+
+Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) {
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+  return target_->Write(offset, data);
+}
+
+Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
+                              char* scratch) const {
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+  return target_->Read(offset, n, result, scratch);
+}
+
+Status TestRandomRWFile::Close() {
+  file_opened_ = false;
+  return target_->Close();
+}
+
+Status TestRandomRWFile::Flush() {
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+  return target_->Flush();
+}
+
+Status TestRandomRWFile::Sync() {
+  if (!env_->IsFilesystemActive()) {
+    return env_->GetError();
+  }
+  return target_->Sync();
+}
+
+Status FaultInjectionTestEnv::NewDirectory(const std::string& name,
+                                           std::unique_ptr<Directory>* result) {
+  std::unique_ptr<Directory> r;
+  Status s = target()->NewDirectory(name, &r);
+  assert(s.ok());
+  if (!s.ok()) {
+    return s;
+  }
+  result->reset(new TestDirectory(this, TrimDirname(name), r.release()));
+  return Status::OK();
+}
+
+Status FaultInjectionTestEnv::NewWritableFile(
+    const std::string& fname, std::unique_ptr<WritableFile>* result,
+    const EnvOptions& soptions) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  // Not allow overwriting files
+  Status s = target()->FileExists(fname);
+  if (s.ok()) {
+    return Status::Corruption("File already exists.");
+  } else if (!s.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
+  }
+  s = target()->NewWritableFile(fname, result, soptions);
+  if (s.ok()) {
+    result->reset(new TestWritableFile(fname, std::move(*result), this));
+    // WritableFileWriter* file is opened
+    // again then it will be truncated - so forget our saved state.
+    UntrackFile(fname);
+    MutexLock l(&mutex_);
+    open_managed_files_.insert(fname);
+    auto dir_and_name = GetDirAndName(fname);
+    auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+    list.insert(dir_and_name.second);
+  }
+  return s;
+}
+
+Status FaultInjectionTestEnv::ReopenWritableFile(
+    const std::string& fname, std::unique_ptr<WritableFile>* result,
+    const EnvOptions& soptions) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+
+  bool exists;
+  Status s, exists_s = target()->FileExists(fname);
+  if (exists_s.IsNotFound()) {
+    exists = false;
+  } else if (exists_s.ok()) {
+    exists = true;
+  } else {
+    s = exists_s;
+    exists = false;
+  }
+
+  if (s.ok()) {
+    s = target()->ReopenWritableFile(fname, result, soptions);
+  }
+
+  // Only track files we created. Files created outside of this
+  // `FaultInjectionTestEnv` are not eligible for tracking/data dropping
+  // (for example, they may contain data a previous db_stress run expects to
+  // be recovered). This could be extended to track/drop data appended once
+  // the file is under `FaultInjectionTestEnv`'s control.
+  if (s.ok()) {
+    bool should_track;
+    {
+      MutexLock l(&mutex_);
+      if (db_file_state_.find(fname) != db_file_state_.end()) {
+        // It was written by this `Env` earlier.
+        assert(exists);
+        should_track = true;
+      } else if (!exists) {
+        // It was created by this `Env` just now.
+        should_track = true;
+        open_managed_files_.insert(fname);
+        auto dir_and_name = GetDirAndName(fname);
+        auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+        list.insert(dir_and_name.second);
+      } else {
+        should_track = false;
+      }
+    }
+    if (should_track) {
+      result->reset(new TestWritableFile(fname, std::move(*result), this));
+    }
+  }
+  return s;
+}
+
+Status FaultInjectionTestEnv::NewRandomRWFile(
+    const std::string& fname, std::unique_ptr<RandomRWFile>* result,
+    const EnvOptions& soptions) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  Status s = target()->NewRandomRWFile(fname, result, soptions);
+  if (s.ok()) {
+    result->reset(new TestRandomRWFile(fname, std::move(*result), this));
+    // WritableFileWriter* file is opened
+    // again then it will be truncated - so forget our saved state.
+    UntrackFile(fname);
+    MutexLock l(&mutex_);
+    open_managed_files_.insert(fname);
+    auto dir_and_name = GetDirAndName(fname);
+    auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+    list.insert(dir_and_name.second);
+  }
+  return s;
+}
+
+Status FaultInjectionTestEnv::NewRandomAccessFile(
+    const std::string& fname, std::unique_ptr<RandomAccessFile>* result,
+    const EnvOptions& soptions) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+
+  assert(target());
+  const Status s = target()->NewRandomAccessFile(fname, result, soptions);
+  if (!s.ok()) {
+    return s;
+  }
+
+  assert(result);
+  result->reset(new TestRandomAccessFile(std::move(*result), this));
+
+  return Status::OK();
+}
+
+Status FaultInjectionTestEnv::DeleteFile(const std::string& f) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  Status s = EnvWrapper::DeleteFile(f);
+  if (s.ok()) {
+    UntrackFile(f);
+  }
+  return s;
+}
+
+Status FaultInjectionTestEnv::RenameFile(const std::string& s,
+                                         const std::string& t) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  Status ret = EnvWrapper::RenameFile(s, t);
+
+  if (ret.ok()) {
+    MutexLock l(&mutex_);
+    if (db_file_state_.find(s) != db_file_state_.end()) {
+      db_file_state_[t] = db_file_state_[s];
+      db_file_state_.erase(s);
+    }
+
+    auto sdn = GetDirAndName(s);
+    auto tdn = GetDirAndName(t);
+    if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) {
+      auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+      assert(tlist.find(tdn.second) == tlist.end());
+      tlist.insert(tdn.second);
+    }
+  }
+
+  return ret;
+}
+
+Status FaultInjectionTestEnv::LinkFile(const std::string& s,
+                                       const std::string& t) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  Status ret = EnvWrapper::LinkFile(s, t);
+
+  if (ret.ok()) {
+    MutexLock l(&mutex_);
+    if (db_file_state_.find(s) != db_file_state_.end()) {
+      db_file_state_[t] = db_file_state_[s];
+    }
+
+    auto sdn = GetDirAndName(s);
+    auto tdn = GetDirAndName(t);
+    if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) !=
+        dir_to_new_files_since_last_sync_[sdn.first].end()) {
+      auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+      assert(tlist.find(tdn.second) == tlist.end());
+      tlist.insert(tdn.second);
+    }
+  }
+
+  return ret;
+}
+
+void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
+  MutexLock l(&mutex_);
+  if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+    db_file_state_[state.filename_] = state;
+    open_managed_files_.erase(state.filename_);
+  }
+}
+
+void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) {
+  MutexLock l(&mutex_);
+  if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+    if (db_file_state_.find(state.filename_) == db_file_state_.end()) {
+      db_file_state_.insert(std::make_pair(state.filename_, state));
+    } else {
+      db_file_state_[state.filename_] = state;
+    }
+  }
+}
+
+void FaultInjectionTestEnv::WritableFileAppended(const FileState& state) {
+  MutexLock l(&mutex_);
+  if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+    if (db_file_state_.find(state.filename_) == db_file_state_.end()) {
+      db_file_state_.insert(std::make_pair(state.filename_, state));
+    } else {
+      db_file_state_[state.filename_] = state;
+    }
+  }
+}
+
+// For every file that is not fully synced, make a call to `func` with
+// FileState of the file as the parameter.
+Status FaultInjectionTestEnv::DropFileData(
+    std::function<Status(Env*, FileState)> func) {
+  Status s;
+  MutexLock l(&mutex_);
+  for (std::map<std::string, FileState>::const_iterator it =
+           db_file_state_.begin();
+       s.ok() && it != db_file_state_.end(); ++it) {
+    const FileState& state = it->second;
+    if (!state.IsFullySynced()) {
+      s = func(target(), state);
+    }
+  }
+  return s;
+}
+
+Status FaultInjectionTestEnv::DropUnsyncedFileData() {
+  return DropFileData([&](Env* env, const FileState& state) {
+    return state.DropUnsyncedData(env);
+  });
+}
+
+Status FaultInjectionTestEnv::DropRandomUnsyncedFileData(Random* rnd) {
+  return DropFileData([&](Env* env, const FileState& state) {
+    return state.DropRandomUnsyncedData(env, rnd);
+  });
+}
+
+Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
+  // Because DeleteFile access this container make a copy to avoid deadlock
+  std::map<std::string, std::set<std::string>> map_copy;
+  {
+    MutexLock l(&mutex_);
+    map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
+                    dir_to_new_files_since_last_sync_.end());
+  }
+
+  for (auto& pair : map_copy) {
+    for (std::string name : pair.second) {
+      Status s = DeleteFile(pair.first + "/" + name);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+  return Status::OK();
+}
+void FaultInjectionTestEnv::ResetState() {
+  MutexLock l(&mutex_);
+  db_file_state_.clear();
+  dir_to_new_files_since_last_sync_.clear();
+  SetFilesystemActiveNoLock(true);
+}
+
+void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
+  MutexLock l(&mutex_);
+  auto dir_and_name = GetDirAndName(f);
+  dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
+      dir_and_name.second);
+  db_file_state_.erase(f);
+  open_managed_files_.erase(f);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/fault_injection_env.h b/src/rocksdb/utilities/fault_injection_env.h
new file mode 100644
index 000000000..549bfe716
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_env.h
@@ -0,0 +1,258 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Random;
+class TestWritableFile;
+class FaultInjectionTestEnv;
+
+struct FileState {
+  std::string filename_;
+  ssize_t pos_;
+  ssize_t pos_at_last_sync_;
+  ssize_t pos_at_last_flush_;
+
+  explicit FileState(const std::string& filename)
+      : filename_(filename),
+        pos_(-1),
+        pos_at_last_sync_(-1),
+        pos_at_last_flush_(-1) {}
+
+  FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+  bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+  Status DropUnsyncedData(Env* env) const;
+
+  Status DropRandomUnsyncedData(Env* env, Random* rand) const;
+};
+
+class TestRandomAccessFile : public RandomAccessFile {
+ public:
+  TestRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+                       FaultInjectionTestEnv* env);
+
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override;
+
+  Status Prefetch(uint64_t offset, size_t n) override;
+
+  Status MultiRead(ReadRequest* reqs, size_t num_reqs) override;
+
+ private:
+  std::unique_ptr<RandomAccessFile> target_;
+  FaultInjectionTestEnv* env_;
+};
+
+// A wrapper around WritableFileWriter* file
+// is written to or sync'ed.
+class TestWritableFile : public WritableFile {
+ public:
+  explicit TestWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>&& f,
+                            FaultInjectionTestEnv* env);
+  virtual ~TestWritableFile();
+  virtual Status Append(const Slice& data) override;
+  virtual Status Append(
+      const Slice& data,
+      const DataVerificationInfo& /*verification_info*/) override {
+    return Append(data);
+  }
+  virtual Status Truncate(uint64_t size) override {
+    return target_->Truncate(size);
+  }
+  virtual Status Close() override;
+  virtual Status Flush() override;
+  virtual Status Sync() override;
+  virtual bool IsSyncThreadSafe() const override { return true; }
+  virtual Status PositionedAppend(const Slice& data, uint64_t offset) override {
+    return target_->PositionedAppend(data, offset);
+  }
+  virtual Status PositionedAppend(
+      const Slice& data, uint64_t offset,
+      const DataVerificationInfo& /*verification_info*/) override {
+    return PositionedAppend(data, offset);
+  }
+  virtual bool use_direct_io() const override {
+    return target_->use_direct_io();
+  };
+
+ private:
+  FileState state_;
+  std::unique_ptr<WritableFile> target_;
+  bool writable_file_opened_;
+  FaultInjectionTestEnv* env_;
+};
+
+// A wrapper around WritableFileWriter* file
+// is written to or sync'ed.
+class TestRandomRWFile : public RandomRWFile {
+ public:
+  explicit TestRandomRWFile(const std::string& fname,
+                            std::unique_ptr<RandomRWFile>&& f,
+                            FaultInjectionTestEnv* env);
+  virtual ~TestRandomRWFile();
+  Status Write(uint64_t offset, const Slice& data) override;
+  Status Read(uint64_t offset, size_t n, Slice* result,
+              char* scratch) const override;
+  Status Close() override;
+  Status Flush() override;
+  Status Sync() override;
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  bool use_direct_io() const override { return target_->use_direct_io(); };
+
+ private:
+  std::unique_ptr<RandomRWFile> target_;
+  bool file_opened_;
+  FaultInjectionTestEnv* env_;
+};
+
+class TestDirectory : public Directory {
+ public:
+  explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname,
+                         Directory* dir)
+      : env_(env), dirname_(dirname), dir_(dir) {}
+  ~TestDirectory() {}
+
+  virtual Status Fsync() override;
+  virtual Status Close() override;
+
+ private:
+  FaultInjectionTestEnv* env_;
+  std::string dirname_;
+  std::unique_ptr<Directory> dir_;
+};
+
+class FaultInjectionTestEnv : public EnvWrapper {
+ public:
+  explicit FaultInjectionTestEnv(Env* base)
+      : EnvWrapper(base), filesystem_active_(true) {}
+  virtual ~FaultInjectionTestEnv() { error_.PermitUncheckedError(); }
+
+  static const char* kClassName() { return "FaultInjectionTestEnv"; }
+  const char* Name() const override { return kClassName(); }
+
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override;
+
+  Status NewWritableFile(const std::string& fname,
+                         std::unique_ptr<WritableFile>* result,
+                         const EnvOptions& soptions) override;
+
+  Status ReopenWritableFile(const std::string& fname,
+                            std::unique_ptr<WritableFile>* result,
+                            const EnvOptions& soptions) override;
+
+  Status NewRandomRWFile(const std::string& fname,
+                         std::unique_ptr<RandomRWFile>* result,
+                         const EnvOptions& soptions) override;
+
+  Status NewRandomAccessFile(const std::string& fname,
+                             std::unique_ptr<RandomAccessFile>* result,
+                             const EnvOptions& soptions) override;
+
+  virtual Status DeleteFile(const std::string& f) override;
+
+  virtual Status RenameFile(const std::string& s,
+                            const std::string& t) override;
+
+  virtual Status LinkFile(const std::string& s, const std::string& t) override;
+
+// Undef to eliminate clash on Windows
+#undef GetFreeSpace
+  virtual Status GetFreeSpace(const std::string& path,
+                              uint64_t* disk_free) override {
+    if (!IsFilesystemActive() &&
+        error_.subcode() == IOStatus::SubCode::kNoSpace) {
+      *disk_free = 0;
+      return Status::OK();
+    } else {
+      return target()->GetFreeSpace(path, disk_free);
+    }
+  }
+
+  void WritableFileClosed(const FileState& state);
+
+  void WritableFileSynced(const FileState& state);
+
+  void WritableFileAppended(const FileState& state);
+
+  // For every file that is not fully synced, make a call to `func` with
+  // FileState of the file as the parameter.
+  Status DropFileData(std::function<Status(Env*, FileState)> func);
+
+  Status DropUnsyncedFileData();
+
+  Status DropRandomUnsyncedFileData(Random* rnd);
+
+  Status DeleteFilesCreatedAfterLastDirSync();
+
+  void ResetState();
+
+  void UntrackFile(const std::string& f);
+
+  void SyncDir(const std::string& dirname) {
+    MutexLock l(&mutex_);
+    dir_to_new_files_since_last_sync_.erase(dirname);
+  }
+
+  // Setting the filesystem to inactive is the test equivalent to simulating a
+  // system reset. Setting to inactive will freeze our saved filesystem state so
+  // that it will stop being recorded. It can then be reset back to the state at
+  // the time of the reset.
+  bool IsFilesystemActive() {
+    MutexLock l(&mutex_);
+    return filesystem_active_;
+  }
+  void SetFilesystemActiveNoLock(
+      bool active, Status error = Status::Corruption("Not active")) {
+    error.PermitUncheckedError();
+    filesystem_active_ = active;
+    if (!active) {
+      error_ = error;
+    }
+    error.PermitUncheckedError();
+  }
+  void SetFilesystemActive(bool active,
+                           Status error = Status::Corruption("Not active")) {
+    error.PermitUncheckedError();
+    MutexLock l(&mutex_);
+    SetFilesystemActiveNoLock(active, error);
+    error.PermitUncheckedError();
+  }
+  void AssertNoOpenFile() { assert(open_managed_files_.empty()); }
+  Status GetError() { return error_; }
+
+ private:
+  port::Mutex mutex_;
+  std::map<std::string, FileState> db_file_state_;
+  std::set<std::string> open_managed_files_;
+  std::unordered_map<std::string, std::set<std::string>>
+      dir_to_new_files_since_last_sync_;
+  bool filesystem_active_;  // Record flushes, syncs, writes
+  Status error_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/fault_injection_fs.cc b/src/rocksdb/utilities/fault_injection_fs.cc
new file mode 100644
index 000000000..549051856
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_fs.cc
@@ -0,0 +1,1032 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom FileSystem to keep track of the state of a file
+// system the last "Sync". The data being written is cached in a "buffer".
+// Only when "Sync" is called, the data will be persistent. It can simulate
+// file data loss (or entire files) not protected by a "Sync". For any of the
+// FileSystem related operations, by specify the "IOStatus Error", a specific
+// error can be returned when file system is not activated.
+
+#include "utilities/fault_injection_fs.h"
+
+#include <algorithm>
+#include <functional>
+#include <utility>
+
+#include "env/composite_env_wrapper.h"
+#include "port/lang.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kNewFileNoOverwrite = "";
+
+// Assume a filename, and not a directory name like "/foo/bar/"
+std::string TestFSGetDirName(const std::string filename) {
+  size_t found = filename.find_last_of("/\\");
+  if (found == std::string::npos) {
+    return "";
+  } else {
+    return filename.substr(0, found);
+  }
+}
+
+// Trim the tailing "/" in the end of `str`
+std::string TestFSTrimDirname(const std::string& str) {
+  size_t found = str.find_last_not_of("/");
+  if (found == std::string::npos) {
+    return str;
+  }
+  return str.substr(0, found + 1);
+}
+
+// Return pair <parent directory name, file name> of a full path.
+std::pair<std::string, std::string> TestFSGetDirAndName(
+    const std::string& name) {
+  std::string dirname = TestFSGetDirName(name);
+  std::string fname = name.substr(dirname.size() + 1);
+  return std::make_pair(dirname, fname);
+}
+
+// Calculate the checksum of the data with corresponding checksum
+// type. If name does not match, no checksum is returned.
+void CalculateTypedChecksum(const ChecksumType& checksum_type, const char* data,
+                            size_t size, std::string* checksum) {
+  if (checksum_type == ChecksumType::kCRC32c) {
+    uint32_t v_crc32c = crc32c::Extend(0, data, size);
+    PutFixed32(checksum, v_crc32c);
+    return;
+  } else if (checksum_type == ChecksumType::kxxHash) {
+    uint32_t v = XXH32(data, size, 0);
+    PutFixed32(checksum, v);
+  }
+  return;
+}
+
+IOStatus FSFileState::DropUnsyncedData() {
+  buffer_.resize(0);
+  return IOStatus::OK();
+}
+
+IOStatus FSFileState::DropRandomUnsyncedData(Random* rand) {
+  int range = static_cast<int>(buffer_.size());
+  size_t truncated_size = static_cast<size_t>(rand->Uniform(range));
+  buffer_.resize(truncated_size);
+  return IOStatus::OK();
+}
+
+IOStatus TestFSDirectory::Fsync(const IOOptions& options, IODebugContext* dbg) {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  {
+    IOStatus in_s = fs_->InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+  fs_->SyncDir(dirname_);
+  IOStatus s = dir_->Fsync(options, dbg);
+  {
+    IOStatus in_s = fs_->InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+  return s;
+}
+
+IOStatus TestFSDirectory::Close(const IOOptions& options, IODebugContext* dbg) {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  IOStatus s = dir_->Close(options, dbg);
+  return s;
+}
+
+IOStatus TestFSDirectory::FsyncWithDirOptions(
+    const IOOptions& options, IODebugContext* dbg,
+    const DirFsyncOptions& dir_fsync_options) {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  {
+    IOStatus in_s = fs_->InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+  fs_->SyncDir(dirname_);
+  IOStatus s = dir_->FsyncWithDirOptions(options, dbg, dir_fsync_options);
+  {
+    IOStatus in_s = fs_->InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+  return s;
+}
+
+TestFSWritableFile::TestFSWritableFile(const std::string& fname,
+                                       const FileOptions& file_opts,
+                                       std::unique_ptr<FSWritableFile>&& f,
+                                       FaultInjectionTestFS* fs)
+    : state_(fname),
+      file_opts_(file_opts),
+      target_(std::move(f)),
+      writable_file_opened_(true),
+      fs_(fs) {
+  assert(target_ != nullptr);
+  state_.pos_ = 0;
+}
+
+TestFSWritableFile::~TestFSWritableFile() {
+  if (writable_file_opened_) {
+    Close(IOOptions(), nullptr).PermitUncheckedError();
+  }
+}
+
+IOStatus TestFSWritableFile::Append(const Slice& data, const IOOptions& options,
+                                    IODebugContext* dbg) {
+  MutexLock l(&mutex_);
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  if (target_->use_direct_io()) {
+    target_->Append(data, options, dbg).PermitUncheckedError();
+  } else {
+    state_.buffer_.append(data.data(), data.size());
+    state_.pos_ += data.size();
+    fs_->WritableFileAppended(state_);
+  }
+  IOStatus io_s = fs_->InjectWriteError(state_.filename_);
+  return io_s;
+}
+
+// By setting the IngestDataCorruptionBeforeWrite(), the data corruption is
+// simulated.
+IOStatus TestFSWritableFile::Append(
+    const Slice& data, const IOOptions& options,
+    const DataVerificationInfo& verification_info, IODebugContext* dbg) {
+  MutexLock l(&mutex_);
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  if (fs_->ShouldDataCorruptionBeforeWrite()) {
+    return IOStatus::Corruption("Data is corrupted!");
+  }
+
+  // Calculate the checksum
+  std::string checksum;
+  CalculateTypedChecksum(fs_->GetChecksumHandoffFuncType(), data.data(),
+                         data.size(), &checksum);
+  if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum &&
+      checksum != verification_info.checksum.ToString()) {
+    std::string msg = "Data is corrupted! Origin data checksum: " +
+                      verification_info.checksum.ToString() +
+                      "current data checksum: " + checksum;
+    return IOStatus::Corruption(msg);
+  }
+  if (target_->use_direct_io()) {
+    target_->Append(data, options, dbg).PermitUncheckedError();
+  } else {
+    state_.buffer_.append(data.data(), data.size());
+    state_.pos_ += data.size();
+    fs_->WritableFileAppended(state_);
+  }
+  IOStatus io_s = fs_->InjectWriteError(state_.filename_);
+  return io_s;
+}
+
+IOStatus TestFSWritableFile::PositionedAppend(
+    const Slice& data, uint64_t offset, const IOOptions& options,
+    const DataVerificationInfo& verification_info, IODebugContext* dbg) {
+  MutexLock l(&mutex_);
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  if (fs_->ShouldDataCorruptionBeforeWrite()) {
+    return IOStatus::Corruption("Data is corrupted!");
+  }
+
+  // Calculate the checksum
+  std::string checksum;
+  CalculateTypedChecksum(fs_->GetChecksumHandoffFuncType(), data.data(),
+                         data.size(), &checksum);
+  if (fs_->GetChecksumHandoffFuncType() != ChecksumType::kNoChecksum &&
+      checksum != verification_info.checksum.ToString()) {
+    std::string msg = "Data is corrupted! Origin data checksum: " +
+                      verification_info.checksum.ToString() +
+                      "current data checksum: " + checksum;
+    return IOStatus::Corruption(msg);
+  }
+  target_->PositionedAppend(data, offset, options, dbg);
+  IOStatus io_s = fs_->InjectWriteError(state_.filename_);
+  return io_s;
+}
+
+IOStatus TestFSWritableFile::Close(const IOOptions& options,
+                                   IODebugContext* dbg) {
+  MutexLock l(&mutex_);
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  {
+    IOStatus in_s = fs_->InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+  writable_file_opened_ = false;
+  IOStatus io_s;
+  if (!target_->use_direct_io()) {
+    io_s = target_->Append(state_.buffer_, options, dbg);
+  }
+  if (io_s.ok()) {
+    state_.buffer_.resize(0);
+    // Ignore sync errors
+    target_->Sync(options, dbg).PermitUncheckedError();
+    io_s = target_->Close(options, dbg);
+  }
+  if (io_s.ok()) {
+    fs_->WritableFileClosed(state_);
+    IOStatus in_s = fs_->InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+  return io_s;
+}
+
+IOStatus TestFSWritableFile::Flush(const IOOptions&, IODebugContext*) {
+  MutexLock l(&mutex_);
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  if (fs_->IsFilesystemActive()) {
+    state_.pos_at_last_flush_ = state_.pos_;
+  }
+  return IOStatus::OK();
+}
+
+IOStatus TestFSWritableFile::Sync(const IOOptions& options,
+                                  IODebugContext* dbg) {
+  MutexLock l(&mutex_);
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  if (target_->use_direct_io()) {
+    // For Direct IO mode, we don't buffer anything in TestFSWritableFile.
+    // So just return
+    return IOStatus::OK();
+  }
+  IOStatus io_s = target_->Append(state_.buffer_, options, dbg);
+  state_.buffer_.resize(0);
+  // Ignore sync errors
+  target_->Sync(options, dbg).PermitUncheckedError();
+  state_.pos_at_last_sync_ = state_.pos_;
+  fs_->WritableFileSynced(state_);
+  return io_s;
+}
+
+IOStatus TestFSWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
+                                       const IOOptions& options,
+                                       IODebugContext* dbg) {
+  MutexLock l(&mutex_);
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  // Assumes caller passes consecutive byte ranges.
+  uint64_t sync_limit = offset + nbytes;
+  uint64_t buf_begin =
+      state_.pos_at_last_sync_ < 0 ? 0 : state_.pos_at_last_sync_;
+
+  IOStatus io_s;
+  if (sync_limit < buf_begin) {
+    return io_s;
+  }
+  uint64_t num_to_sync = std::min(static_cast<uint64_t>(state_.buffer_.size()),
+                                  sync_limit - buf_begin);
+  Slice buf_to_sync(state_.buffer_.data(), num_to_sync);
+  io_s = target_->Append(buf_to_sync, options, dbg);
+  state_.buffer_ = state_.buffer_.substr(num_to_sync);
+  // Ignore sync errors
+  target_->RangeSync(offset, nbytes, options, dbg).PermitUncheckedError();
+  state_.pos_at_last_sync_ = offset + num_to_sync;
+  fs_->WritableFileSynced(state_);
+  return io_s;
+}
+
+TestFSRandomRWFile::TestFSRandomRWFile(const std::string& /*fname*/,
+                                       std::unique_ptr<FSRandomRWFile>&& f,
+                                       FaultInjectionTestFS* fs)
+    : target_(std::move(f)), file_opened_(true), fs_(fs) {
+  assert(target_ != nullptr);
+}
+
+TestFSRandomRWFile::~TestFSRandomRWFile() {
+  if (file_opened_) {
+    Close(IOOptions(), nullptr).PermitUncheckedError();
+  }
+}
+
+IOStatus TestFSRandomRWFile::Write(uint64_t offset, const Slice& data,
+                                   const IOOptions& options,
+                                   IODebugContext* dbg) {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  return target_->Write(offset, data, options, dbg);
+}
+
+IOStatus TestFSRandomRWFile::Read(uint64_t offset, size_t n,
+                                  const IOOptions& options, Slice* result,
+                                  char* scratch, IODebugContext* dbg) const {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  return target_->Read(offset, n, options, result, scratch, dbg);
+}
+
+IOStatus TestFSRandomRWFile::Close(const IOOptions& options,
+                                   IODebugContext* dbg) {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  file_opened_ = false;
+  return target_->Close(options, dbg);
+}
+
+IOStatus TestFSRandomRWFile::Flush(const IOOptions& options,
+                                   IODebugContext* dbg) {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  return target_->Flush(options, dbg);
+}
+
+IOStatus TestFSRandomRWFile::Sync(const IOOptions& options,
+                                  IODebugContext* dbg) {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  return target_->Sync(options, dbg);
+}
+
+TestFSRandomAccessFile::TestFSRandomAccessFile(
+    const std::string& /*fname*/, std::unique_ptr<FSRandomAccessFile>&& f,
+    FaultInjectionTestFS* fs)
+    : target_(std::move(f)), fs_(fs) {
+  assert(target_ != nullptr);
+}
+
+IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n,
+                                      const IOOptions& options, Slice* result,
+                                      char* scratch,
+                                      IODebugContext* dbg) const {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  IOStatus s = target_->Read(offset, n, options, result, scratch, dbg);
+  if (s.ok()) {
+    s = fs_->InjectThreadSpecificReadError(
+        FaultInjectionTestFS::ErrorOperation::kRead, result, use_direct_io(),
+        scratch, /*need_count_increase=*/true, /*fault_injected=*/nullptr);
+  }
+  if (s.ok() && fs_->ShouldInjectRandomReadError()) {
+    return IOStatus::IOError("Injected read error");
+  }
+  return s;
+}
+
+IOStatus TestFSRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                                           const IOOptions& options,
+                                           IODebugContext* dbg) {
+  if (!fs_->IsFilesystemActive()) {
+    return fs_->GetError();
+  }
+  IOStatus s = target_->MultiRead(reqs, num_reqs, options, dbg);
+  bool injected_error = false;
+  for (size_t i = 0; i < num_reqs; i++) {
+    if (!reqs[i].status.ok()) {
+      // Already seeing an error.
+      break;
+    }
+    bool this_injected_error;
+    reqs[i].status = fs_->InjectThreadSpecificReadError(
+        FaultInjectionTestFS::ErrorOperation::kMultiReadSingleReq,
+        &(reqs[i].result), use_direct_io(), reqs[i].scratch,
+        /*need_count_increase=*/true,
+        /*fault_injected=*/&this_injected_error);
+    injected_error |= this_injected_error;
+  }
+  if (s.ok()) {
+    s = fs_->InjectThreadSpecificReadError(
+        FaultInjectionTestFS::ErrorOperation::kMultiRead, nullptr,
+        use_direct_io(), nullptr, /*need_count_increase=*/!injected_error,
+        /*fault_injected=*/nullptr);
+  }
+  if (s.ok() && fs_->ShouldInjectRandomReadError()) {
+    return IOStatus::IOError("Injected read error");
+  }
+  return s;
+}
+
+size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
+  if (fs_->ShouldFailGetUniqueId()) {
+    return 0;
+  } else {
+    return target_->GetUniqueId(id, max_size);
+  }
+}
+IOStatus TestFSSequentialFile::Read(size_t n, const IOOptions& options,
+                                    Slice* result, char* scratch,
+                                    IODebugContext* dbg) {
+  IOStatus s = target()->Read(n, options, result, scratch, dbg);
+  if (s.ok() && fs_->ShouldInjectRandomReadError()) {
+    return IOStatus::IOError("Injected seq read error");
+  }
+  return s;
+}
+
+IOStatus TestFSSequentialFile::PositionedRead(uint64_t offset, size_t n,
+                                              const IOOptions& options,
+                                              Slice* result, char* scratch,
+                                              IODebugContext* dbg) {
+  IOStatus s =
+      target()->PositionedRead(offset, n, options, result, scratch, dbg);
+  if (s.ok() && fs_->ShouldInjectRandomReadError()) {
+    return IOStatus::IOError("Injected seq positioned read error");
+  }
+  return s;
+}
+
+IOStatus FaultInjectionTestFS::NewDirectory(
+    const std::string& name, const IOOptions& options,
+    std::unique_ptr<FSDirectory>* result, IODebugContext* dbg) {
+  std::unique_ptr<FSDirectory> r;
+  IOStatus io_s = target()->NewDirectory(name, options, &r, dbg);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+  result->reset(
+      new TestFSDirectory(this, TestFSTrimDirname(name), r.release()));
+  return IOStatus::OK();
+}
+
+IOStatus FaultInjectionTestFS::NewWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  {
+    IOStatus in_s = InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+
+  if (ShouldUseDiretWritable(fname)) {
+    return target()->NewWritableFile(fname, file_opts, result, dbg);
+  }
+
+  IOStatus io_s = target()->NewWritableFile(fname, file_opts, result, dbg);
+  if (io_s.ok()) {
+    result->reset(
+        new TestFSWritableFile(fname, file_opts, std::move(*result), this));
+    // WritableFileWriter* file is opened
+    // again then it will be truncated - so forget our saved state.
+    UntrackFile(fname);
+    {
+      MutexLock l(&mutex_);
+      open_managed_files_.insert(fname);
+      auto dir_and_name = TestFSGetDirAndName(fname);
+      auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+      // The new file could overwrite an old one. Here we simplify
+      // the implementation by assuming no file of this name after
+      // dropping unsynced files.
+      list[dir_and_name.second] = kNewFileNoOverwrite;
+    }
+    {
+      IOStatus in_s = InjectMetadataWriteError();
+      if (!in_s.ok()) {
+        return in_s;
+      }
+    }
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::ReopenWritableFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSWritableFile>* result, IODebugContext* dbg) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  if (ShouldUseDiretWritable(fname)) {
+    return target()->ReopenWritableFile(fname, file_opts, result, dbg);
+  }
+  {
+    IOStatus in_s = InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+
+  bool exists;
+  IOStatus io_s,
+      exists_s = target()->FileExists(fname, IOOptions(), nullptr /* dbg */);
+  if (exists_s.IsNotFound()) {
+    exists = false;
+  } else if (exists_s.ok()) {
+    exists = true;
+  } else {
+    io_s = exists_s;
+    exists = false;
+  }
+
+  if (io_s.ok()) {
+    io_s = target()->ReopenWritableFile(fname, file_opts, result, dbg);
+  }
+
+  // Only track files we created. Files created outside of this
+  // `FaultInjectionTestFS` are not eligible for tracking/data dropping
+  // (for example, they may contain data a previous db_stress run expects to
+  // be recovered). This could be extended to track/drop data appended once
+  // the file is under `FaultInjectionTestFS`'s control.
+  if (io_s.ok()) {
+    bool should_track;
+    {
+      MutexLock l(&mutex_);
+      if (db_file_state_.find(fname) != db_file_state_.end()) {
+        // It was written by this `FileSystem` earlier.
+        assert(exists);
+        should_track = true;
+      } else if (!exists) {
+        // It was created by this `FileSystem` just now.
+        should_track = true;
+        open_managed_files_.insert(fname);
+        auto dir_and_name = TestFSGetDirAndName(fname);
+        auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+        list[dir_and_name.second] = kNewFileNoOverwrite;
+      } else {
+        should_track = false;
+      }
+    }
+    if (should_track) {
+      result->reset(
+          new TestFSWritableFile(fname, file_opts, std::move(*result), this));
+    }
+    {
+      IOStatus in_s = InjectMetadataWriteError();
+      if (!in_s.ok()) {
+        return in_s;
+      }
+    }
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::NewRandomRWFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSRandomRWFile>* result, IODebugContext* dbg) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  if (ShouldUseDiretWritable(fname)) {
+    return target()->NewRandomRWFile(fname, file_opts, result, dbg);
+  }
+  {
+    IOStatus in_s = InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+  IOStatus io_s = target()->NewRandomRWFile(fname, file_opts, result, dbg);
+  if (io_s.ok()) {
+    result->reset(new TestFSRandomRWFile(fname, std::move(*result), this));
+    // WritableFileWriter* file is opened
+    // again then it will be truncated - so forget our saved state.
+    UntrackFile(fname);
+    {
+      MutexLock l(&mutex_);
+      open_managed_files_.insert(fname);
+      auto dir_and_name = TestFSGetDirAndName(fname);
+      auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+      // It could be overwriting an old file, but we simplify the
+      // implementation by ignoring it.
+      list[dir_and_name.second] = kNewFileNoOverwrite;
+    }
+    {
+      IOStatus in_s = InjectMetadataWriteError();
+      if (!in_s.ok()) {
+        return in_s;
+      }
+    }
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::NewRandomAccessFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSRandomAccessFile>* result, IODebugContext* dbg) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  if (ShouldInjectRandomReadError()) {
+    return IOStatus::IOError("Injected error when open random access file");
+  }
+  IOStatus io_s = InjectThreadSpecificReadError(ErrorOperation::kOpen, nullptr,
+                                                false, nullptr,
+                                                /*need_count_increase=*/true,
+                                                /*fault_injected=*/nullptr);
+  if (io_s.ok()) {
+    io_s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
+  }
+  if (io_s.ok()) {
+    result->reset(new TestFSRandomAccessFile(fname, std::move(*result), this));
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::NewSequentialFile(
+    const std::string& fname, const FileOptions& file_opts,
+    std::unique_ptr<FSSequentialFile>* result, IODebugContext* dbg) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+
+  if (ShouldInjectRandomReadError()) {
+    return IOStatus::IOError("Injected read error when creating seq file");
+  }
+  IOStatus io_s = target()->NewSequentialFile(fname, file_opts, result, dbg);
+  if (io_s.ok()) {
+    result->reset(new TestFSSequentialFile(std::move(*result), this));
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::DeleteFile(const std::string& f,
+                                          const IOOptions& options,
+                                          IODebugContext* dbg) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  {
+    IOStatus in_s = InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+  IOStatus io_s = FileSystemWrapper::DeleteFile(f, options, dbg);
+  if (io_s.ok()) {
+    UntrackFile(f);
+    {
+      IOStatus in_s = InjectMetadataWriteError();
+      if (!in_s.ok()) {
+        return in_s;
+      }
+    }
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::RenameFile(const std::string& s,
+                                          const std::string& t,
+                                          const IOOptions& options,
+                                          IODebugContext* dbg) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  {
+    IOStatus in_s = InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+
+  // We preserve contents of overwritten files up to a size threshold.
+  // We could keep previous file in another name, but we need to worry about
+  // garbage collect the those files. We do it if it is needed later.
+  // We ignore I/O errors here for simplicity.
+  std::string previous_contents = kNewFileNoOverwrite;
+  if (target()->FileExists(t, IOOptions(), nullptr).ok()) {
+    uint64_t file_size;
+    if (target()->GetFileSize(t, IOOptions(), &file_size, nullptr).ok() &&
+        file_size < 1024) {
+      ReadFileToString(target(), t, &previous_contents).PermitUncheckedError();
+    }
+  }
+  IOStatus io_s = FileSystemWrapper::RenameFile(s, t, options, dbg);
+
+  if (io_s.ok()) {
+    {
+      MutexLock l(&mutex_);
+      if (db_file_state_.find(s) != db_file_state_.end()) {
+        db_file_state_[t] = db_file_state_[s];
+        db_file_state_.erase(s);
+      }
+
+      auto sdn = TestFSGetDirAndName(s);
+      auto tdn = TestFSGetDirAndName(t);
+      if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) {
+        auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+        assert(tlist.find(tdn.second) == tlist.end());
+        tlist[tdn.second] = previous_contents;
+      }
+    }
+    IOStatus in_s = InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::LinkFile(const std::string& s,
+                                        const std::string& t,
+                                        const IOOptions& options,
+                                        IODebugContext* dbg) {
+  if (!IsFilesystemActive()) {
+    return GetError();
+  }
+  {
+    IOStatus in_s = InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+
+  // Using the value in `dir_to_new_files_since_last_sync_` for the source file
+  // may be a more reasonable choice.
+  std::string previous_contents = kNewFileNoOverwrite;
+
+  IOStatus io_s = FileSystemWrapper::LinkFile(s, t, options, dbg);
+
+  if (io_s.ok()) {
+    {
+      MutexLock l(&mutex_);
+      if (db_file_state_.find(s) != db_file_state_.end()) {
+        db_file_state_[t] = db_file_state_[s];
+      }
+
+      auto sdn = TestFSGetDirAndName(s);
+      auto tdn = TestFSGetDirAndName(t);
+      if (dir_to_new_files_since_last_sync_[sdn.first].find(sdn.second) !=
+          dir_to_new_files_since_last_sync_[sdn.first].end()) {
+        auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+        assert(tlist.find(tdn.second) == tlist.end());
+        tlist[tdn.second] = previous_contents;
+      }
+    }
+    IOStatus in_s = InjectMetadataWriteError();
+    if (!in_s.ok()) {
+      return in_s;
+    }
+  }
+
+  return io_s;
+}
+
+void FaultInjectionTestFS::WritableFileClosed(const FSFileState& state) {
+  MutexLock l(&mutex_);
+  if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+    db_file_state_[state.filename_] = state;
+    open_managed_files_.erase(state.filename_);
+  }
+}
+
+void FaultInjectionTestFS::WritableFileSynced(const FSFileState& state) {
+  MutexLock l(&mutex_);
+  if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+    if (db_file_state_.find(state.filename_) == db_file_state_.end()) {
+      db_file_state_.insert(std::make_pair(state.filename_, state));
+    } else {
+      db_file_state_[state.filename_] = state;
+    }
+  }
+}
+
+void FaultInjectionTestFS::WritableFileAppended(const FSFileState& state) {
+  MutexLock l(&mutex_);
+  if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
+    if (db_file_state_.find(state.filename_) == db_file_state_.end()) {
+      db_file_state_.insert(std::make_pair(state.filename_, state));
+    } else {
+      db_file_state_[state.filename_] = state;
+    }
+  }
+}
+
+IOStatus FaultInjectionTestFS::DropUnsyncedFileData() {
+  IOStatus io_s;
+  MutexLock l(&mutex_);
+  for (std::map<std::string, FSFileState>::iterator it = db_file_state_.begin();
+       io_s.ok() && it != db_file_state_.end(); ++it) {
+    FSFileState& fs_state = it->second;
+    if (!fs_state.IsFullySynced()) {
+      io_s = fs_state.DropUnsyncedData();
+    }
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::DropRandomUnsyncedFileData(Random* rnd) {
+  IOStatus io_s;
+  MutexLock l(&mutex_);
+  for (std::map<std::string, FSFileState>::iterator it = db_file_state_.begin();
+       io_s.ok() && it != db_file_state_.end(); ++it) {
+    FSFileState& fs_state = it->second;
+    if (!fs_state.IsFullySynced()) {
+      io_s = fs_state.DropRandomUnsyncedData(rnd);
+    }
+  }
+  return io_s;
+}
+
+IOStatus FaultInjectionTestFS::DeleteFilesCreatedAfterLastDirSync(
+    const IOOptions& options, IODebugContext* dbg) {
+  // Because DeleteFile access this container make a copy to avoid deadlock
+  std::map<std::string, std::map<std::string, std::string>> map_copy;
+  {
+    MutexLock l(&mutex_);
+    map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
+                    dir_to_new_files_since_last_sync_.end());
+  }
+
+  for (auto& pair : map_copy) {
+    for (auto& file_pair : pair.second) {
+      if (file_pair.second == kNewFileNoOverwrite) {
+        IOStatus io_s =
+            DeleteFile(pair.first + "/" + file_pair.first, options, dbg);
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      } else {
+        IOStatus io_s =
+            WriteStringToFile(target(), file_pair.second,
+                              pair.first + "/" + file_pair.first, true);
+        if (!io_s.ok()) {
+          return io_s;
+        }
+      }
+    }
+  }
+  return IOStatus::OK();
+}
+
+void FaultInjectionTestFS::ResetState() {
+  MutexLock l(&mutex_);
+  db_file_state_.clear();
+  dir_to_new_files_since_last_sync_.clear();
+  SetFilesystemActiveNoLock(true);
+}
+
+void FaultInjectionTestFS::UntrackFile(const std::string& f) {
+  MutexLock l(&mutex_);
+  auto dir_and_name = TestFSGetDirAndName(f);
+  dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
+      dir_and_name.second);
+  db_file_state_.erase(f);
+  open_managed_files_.erase(f);
+}
+
+IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError(
+    ErrorOperation op, Slice* result, bool direct_io, char* scratch,
+    bool need_count_increase, bool* fault_injected) {
+  bool dummy_bool;
+  bool& ret_fault_injected = fault_injected ? *fault_injected : dummy_bool;
+  ret_fault_injected = false;
+  ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+  if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in) {
+    return IOStatus::OK();
+  }
+
+  if (ctx->rand.OneIn(ctx->one_in)) {
+    if (ctx->count == 0) {
+      ctx->message = "";
+    }
+    if (need_count_increase) {
+      ctx->count++;
+    }
+    if (ctx->callstack) {
+      free(ctx->callstack);
+    }
+    ctx->callstack = port::SaveStack(&ctx->frames);
+
+    if (op != ErrorOperation::kMultiReadSingleReq) {
+      // Likely non-per read status code for MultiRead
+      ctx->message += "error; ";
+      ret_fault_injected = true;
+      return IOStatus::IOError();
+    } else if (Random::GetTLSInstance()->OneIn(8)) {
+      assert(result);
+      // For a small chance, set the failure to status but turn the
+      // result to be empty, which is supposed to be caught for a check.
+      *result = Slice();
+      ctx->message += "inject empty result; ";
+      ret_fault_injected = true;
+    } else if (!direct_io && Random::GetTLSInstance()->OneIn(7) &&
+               scratch != nullptr && result->data() == scratch) {
+      assert(result);
+      // With direct I/O, many extra bytes might be read so corrupting
+      // one byte might not cause checksum mismatch. Skip checksum
+      // corruption injection.
+      // We only corrupt data if the result is filled to `scratch`. For other
+      // cases, the data might not be able to be modified (e.g mmaped files)
+      // or has unintended side effects.
+      // For a small chance, set the failure to status but corrupt the
+      // result in a way that checksum checking is supposed to fail.
+      // Corrupt the last byte, which is supposed to be a checksum byte
+      // It would work for CRC. Not 100% sure for xxhash and will adjust
+      // if it is not the case.
+      const_cast<char*>(result->data())[result->size() - 1]++;
+      ctx->message += "corrupt last byte; ";
+      ret_fault_injected = true;
+    } else {
+      ctx->message += "error result multiget single; ";
+      ret_fault_injected = true;
+      return IOStatus::IOError();
+    }
+  }
+  return IOStatus::OK();
+}
+
+bool FaultInjectionTestFS::TryParseFileName(const std::string& file_name,
+                                            uint64_t* number, FileType* type) {
+  std::size_t found = file_name.find_last_of("/");
+  std::string file = file_name.substr(found);
+  return ParseFileName(file, number, type);
+}
+
+IOStatus FaultInjectionTestFS::InjectWriteError(const std::string& file_name) {
+  MutexLock l(&mutex_);
+  if (!enable_write_error_injection_ || !write_error_one_in_) {
+    return IOStatus::OK();
+  }
+  bool allowed_type = false;
+
+  if (inject_for_all_file_types_) {
+    allowed_type = true;
+  } else {
+    uint64_t number;
+    FileType cur_type = kTempFile;
+    if (TryParseFileName(file_name, &number, &cur_type)) {
+      for (const auto& type : write_error_allowed_types_) {
+        if (cur_type == type) {
+          allowed_type = true;
+        }
+      }
+    }
+  }
+
+  if (allowed_type) {
+    if (write_error_rand_.OneIn(write_error_one_in_)) {
+      return GetError();
+    }
+  }
+  return IOStatus::OK();
+}
+
+IOStatus FaultInjectionTestFS::InjectMetadataWriteError() {
+  {
+    MutexLock l(&mutex_);
+    if (!enable_metadata_write_error_injection_ ||
+        !metadata_write_error_one_in_ ||
+        !write_error_rand_.OneIn(metadata_write_error_one_in_)) {
+      return IOStatus::OK();
+    }
+  }
+  TEST_SYNC_POINT("FaultInjectionTestFS::InjectMetadataWriteError:Injected");
+  return IOStatus::IOError();
+}
+
+void FaultInjectionTestFS::PrintFaultBacktrace() {
+#if defined(OS_LINUX)
+  ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+  if (ctx == nullptr) {
+    return;
+  }
+  fprintf(stderr, "Injected error type = %d\n", ctx->type);
+  fprintf(stderr, "Message: %s\n", ctx->message.c_str());
+  port::PrintAndFreeStack(ctx->callstack, ctx->frames);
+  ctx->callstack = nullptr;
+#endif
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/fault_injection_fs.h b/src/rocksdb/utilities/fault_injection_fs.h
new file mode 100644
index 000000000..53c9ccb6f
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_fs.h
@@ -0,0 +1,584 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom FileSystem to keep track of the state of a file
+// system the last "Sync". The data being written is cached in a "buffer".
+// Only when "Sync" is called, the data will be persistent. It can similate
+// file data loss (or entire files) not protected by a "Sync". For any of the
+// FileSystem related operations, by specify the "IOStatus Error", a specific
+// error can be returned when file system is not activated.
+
+#pragma once
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <string>
+
+#include "file/filename.h"
+#include "rocksdb/file_system.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFSWritableFile;
+class FaultInjectionTestFS;
+
+struct FSFileState {
+  std::string filename_;
+  ssize_t pos_;
+  ssize_t pos_at_last_sync_;
+  ssize_t pos_at_last_flush_;
+  std::string buffer_;
+
+  explicit FSFileState(const std::string& filename)
+      : filename_(filename),
+        pos_(-1),
+        pos_at_last_sync_(-1),
+        pos_at_last_flush_(-1) {}
+
+  FSFileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+  bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+  IOStatus DropUnsyncedData();
+
+  IOStatus DropRandomUnsyncedData(Random* rand);
+};
+
+// A wrapper around WritableFileWriter* file
+// is written to or sync'ed.
+class TestFSWritableFile : public FSWritableFile {
+ public:
+  explicit TestFSWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>&& f,
+                              FaultInjectionTestFS* fs);
+  virtual ~TestFSWritableFile();
+  virtual IOStatus Append(const Slice& data, const IOOptions&,
+                          IODebugContext*) override;
+  virtual IOStatus Append(const Slice& data, const IOOptions& options,
+                          const DataVerificationInfo& verification_info,
+                          IODebugContext* dbg) override;
+  virtual IOStatus Truncate(uint64_t size, const IOOptions& options,
+                            IODebugContext* dbg) override {
+    return target_->Truncate(size, options, dbg);
+  }
+  virtual IOStatus Close(const IOOptions& options,
+                         IODebugContext* dbg) override;
+  virtual IOStatus Flush(const IOOptions&, IODebugContext*) override;
+  virtual IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+  virtual IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/,
+                             const IOOptions& options,
+                             IODebugContext* dbg) override;
+  virtual bool IsSyncThreadSafe() const override { return true; }
+  virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                                    const IOOptions& options,
+                                    IODebugContext* dbg) override {
+    return target_->PositionedAppend(data, offset, options, dbg);
+  }
+  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
+                            const IOOptions& options,
+                            const DataVerificationInfo& verification_info,
+                            IODebugContext* dbg) override;
+  virtual size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  virtual bool use_direct_io() const override {
+    return target_->use_direct_io();
+  };
+
+ private:
+  FSFileState state_;  // Need protection by mutex_
+  FileOptions file_opts_;
+  std::unique_ptr<FSWritableFile> target_;
+  bool writable_file_opened_;
+  FaultInjectionTestFS* fs_;
+  port::Mutex mutex_;
+};
+
+// A wrapper around WritableFileWriter* file
+// is written to or sync'ed.
+class TestFSRandomRWFile : public FSRandomRWFile {
+ public:
+  explicit TestFSRandomRWFile(const std::string& fname,
+                              std::unique_ptr<FSRandomRWFile>&& f,
+                              FaultInjectionTestFS* fs);
+  virtual ~TestFSRandomRWFile();
+  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
+                 IODebugContext* dbg) override;
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
+  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
+  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  bool use_direct_io() const override { return target_->use_direct_io(); };
+
+ private:
+  std::unique_ptr<FSRandomRWFile> target_;
+  bool file_opened_;
+  FaultInjectionTestFS* fs_;
+};
+
+class TestFSRandomAccessFile : public FSRandomAccessFile {
+ public:
+  explicit TestFSRandomAccessFile(const std::string& fname,
+                                  std::unique_ptr<FSRandomAccessFile>&& f,
+                                  FaultInjectionTestFS* fs);
+  ~TestFSRandomAccessFile() override {}
+  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+                Slice* result, char* scratch,
+                IODebugContext* dbg) const override;
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+  size_t GetRequiredBufferAlignment() const override {
+    return target_->GetRequiredBufferAlignment();
+  }
+  bool use_direct_io() const override { return target_->use_direct_io(); }
+
+  size_t GetUniqueId(char* id, size_t max_size) const override;
+
+ private:
+  std::unique_ptr<FSRandomAccessFile> target_;
+  FaultInjectionTestFS* fs_;
+};
+
+class TestFSSequentialFile : public FSSequentialFileOwnerWrapper {
+ public:
+  explicit TestFSSequentialFile(std::unique_ptr<FSSequentialFile>&& f,
+                                FaultInjectionTestFS* fs)
+      : FSSequentialFileOwnerWrapper(std::move(f)), fs_(fs) {}
+  IOStatus Read(size_t n, const IOOptions& options, Slice* result,
+                char* scratch, IODebugContext* dbg) override;
+  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
+                          Slice* result, char* scratch,
+                          IODebugContext* dbg) override;
+
+ private:
+  FaultInjectionTestFS* fs_;
+};
+
+class TestFSDirectory : public FSDirectory {
+ public:
+  explicit TestFSDirectory(FaultInjectionTestFS* fs, std::string dirname,
+                           FSDirectory* dir)
+      : fs_(fs), dirname_(dirname), dir_(dir) {}
+  ~TestFSDirectory() {}
+
+  virtual IOStatus Fsync(const IOOptions& options,
+                         IODebugContext* dbg) override;
+
+  virtual IOStatus Close(const IOOptions& options,
+                         IODebugContext* dbg) override;
+
+  virtual IOStatus FsyncWithDirOptions(
+      const IOOptions& options, IODebugContext* dbg,
+      const DirFsyncOptions& dir_fsync_options) override;
+
+ private:
+  FaultInjectionTestFS* fs_;
+  std::string dirname_;
+  std::unique_ptr<FSDirectory> dir_;
+};
+
+class FaultInjectionTestFS : public FileSystemWrapper {
+ public:
+  explicit FaultInjectionTestFS(const std::shared_ptr<FileSystem>& base)
+      : FileSystemWrapper(base),
+        filesystem_active_(true),
+        filesystem_writable_(false),
+        thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)),
+        enable_write_error_injection_(false),
+        enable_metadata_write_error_injection_(false),
+        write_error_rand_(0),
+        write_error_one_in_(0),
+        metadata_write_error_one_in_(0),
+        read_error_one_in_(0),
+        ingest_data_corruption_before_write_(false),
+        fail_get_file_unique_id_(false) {}
+  virtual ~FaultInjectionTestFS() { error_.PermitUncheckedError(); }
+
+  static const char* kClassName() { return "FaultInjectionTestFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewDirectory(const std::string& name, const IOOptions& options,
+                        std::unique_ptr<FSDirectory>* result,
+                        IODebugContext* dbg) override;
+
+  IOStatus NewWritableFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSWritableFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus ReopenWritableFile(const std::string& fname,
+                              const FileOptions& file_opts,
+                              std::unique_ptr<FSWritableFile>* result,
+                              IODebugContext* dbg) override;
+
+  IOStatus NewRandomRWFile(const std::string& fname,
+                           const FileOptions& file_opts,
+                           std::unique_ptr<FSRandomRWFile>* result,
+                           IODebugContext* dbg) override;
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& file_opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override;
+  IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts,
+                             std::unique_ptr<FSSequentialFile>* r,
+                             IODebugContext* dbg) override;
+
+  virtual IOStatus DeleteFile(const std::string& f, const IOOptions& options,
+                              IODebugContext* dbg) override;
+
+  virtual IOStatus RenameFile(const std::string& s, const std::string& t,
+                              const IOOptions& options,
+                              IODebugContext* dbg) override;
+
+  virtual IOStatus LinkFile(const std::string& src, const std::string& target,
+                            const IOOptions& options,
+                            IODebugContext* dbg) override;
+
+// Undef to eliminate clash on Windows
+#undef GetFreeSpace
+  virtual IOStatus GetFreeSpace(const std::string& path,
+                                const IOOptions& options, uint64_t* disk_free,
+                                IODebugContext* dbg) override {
+    IOStatus io_s;
+    if (!IsFilesystemActive() &&
+        error_.subcode() == IOStatus::SubCode::kNoSpace) {
+      *disk_free = 0;
+    } else {
+      io_s = target()->GetFreeSpace(path, options, disk_free, dbg);
+    }
+    return io_s;
+  }
+
+  void WritableFileClosed(const FSFileState& state);
+
+  void WritableFileSynced(const FSFileState& state);
+
+  void WritableFileAppended(const FSFileState& state);
+
+  IOStatus DropUnsyncedFileData();
+
+  IOStatus DropRandomUnsyncedFileData(Random* rnd);
+
+  IOStatus DeleteFilesCreatedAfterLastDirSync(const IOOptions& options,
+                                              IODebugContext* dbg);
+
+  void ResetState();
+
+  void UntrackFile(const std::string& f);
+
+  void SyncDir(const std::string& dirname) {
+    MutexLock l(&mutex_);
+    dir_to_new_files_since_last_sync_.erase(dirname);
+  }
+
+  // Setting the filesystem to inactive is the test equivalent to simulating a
+  // system reset. Setting to inactive will freeze our saved filesystem state so
+  // that it will stop being recorded. It can then be reset back to the state at
+  // the time of the reset.
+  bool IsFilesystemActive() {
+    MutexLock l(&mutex_);
+    return filesystem_active_;
+  }
+
+  // Setting filesystem_writable_ makes NewWritableFile. ReopenWritableFile,
+  // and NewRandomRWFile bypass FaultInjectionTestFS and go directly to the
+  // target FS
+  bool IsFilesystemDirectWritable() {
+    MutexLock l(&mutex_);
+    return filesystem_writable_;
+  }
+  bool ShouldUseDiretWritable(const std::string& file_name) {
+    MutexLock l(&mutex_);
+    if (filesystem_writable_) {
+      return true;
+    }
+    FileType file_type = kTempFile;
+    uint64_t file_number = 0;
+    if (!TryParseFileName(file_name, &file_number, &file_type)) {
+      return false;
+    }
+    return skip_direct_writable_types_.find(file_type) !=
+           skip_direct_writable_types_.end();
+  }
+  void SetFilesystemActiveNoLock(
+      bool active, IOStatus error = IOStatus::Corruption("Not active")) {
+    error.PermitUncheckedError();
+    filesystem_active_ = active;
+    if (!active) {
+      error_ = error;
+    }
+  }
+  void SetFilesystemActive(
+      bool active, IOStatus error = IOStatus::Corruption("Not active")) {
+    MutexLock l(&mutex_);
+    error.PermitUncheckedError();
+    SetFilesystemActiveNoLock(active, error);
+  }
+  void SetFilesystemDirectWritable(bool writable) {
+    MutexLock l(&mutex_);
+    filesystem_writable_ = writable;
+  }
+  void AssertNoOpenFile() { assert(open_managed_files_.empty()); }
+
+  IOStatus GetError() { return error_; }
+
+  void SetFileSystemIOError(IOStatus io_error) {
+    MutexLock l(&mutex_);
+    io_error.PermitUncheckedError();
+    error_ = io_error;
+  }
+
+  // To simulate the data corruption before data is written in FS
+  void IngestDataCorruptionBeforeWrite() {
+    MutexLock l(&mutex_);
+    ingest_data_corruption_before_write_ = true;
+  }
+
+  void NoDataCorruptionBeforeWrite() {
+    MutexLock l(&mutex_);
+    ingest_data_corruption_before_write_ = false;
+  }
+
+  bool ShouldDataCorruptionBeforeWrite() {
+    MutexLock l(&mutex_);
+    return ingest_data_corruption_before_write_;
+  }
+
+  void SetChecksumHandoffFuncType(const ChecksumType& func_type) {
+    MutexLock l(&mutex_);
+    checksum_handoff_func_tpye_ = func_type;
+  }
+
+  const ChecksumType& GetChecksumHandoffFuncType() {
+    MutexLock l(&mutex_);
+    return checksum_handoff_func_tpye_;
+  }
+
+  void SetFailGetUniqueId(bool flag) {
+    MutexLock l(&mutex_);
+    fail_get_file_unique_id_ = flag;
+  }
+
+  bool ShouldFailGetUniqueId() {
+    MutexLock l(&mutex_);
+    return fail_get_file_unique_id_;
+  }
+
+  // Specify what the operation, so we can inject the right type of error
+  enum ErrorOperation : char {
+    kRead = 0,
+    kMultiReadSingleReq = 1,
+    kMultiRead = 2,
+    kOpen,
+  };
+
+  // Set thread-local parameters for error injection. The first argument,
+  // seed is the seed for the random number generator, and one_in determines
+  // the probability of injecting error (i.e an error is injected with
+  // 1/one_in probability)
+  void SetThreadLocalReadErrorContext(uint32_t seed, int one_in) {
+    struct ErrorContext* ctx =
+        static_cast<struct ErrorContext*>(thread_local_error_->Get());
+    if (ctx == nullptr) {
+      ctx = new ErrorContext(seed);
+      thread_local_error_->Reset(ctx);
+    }
+    ctx->one_in = one_in;
+    ctx->count = 0;
+  }
+
+  static void DeleteThreadLocalErrorContext(void* p) {
+    ErrorContext* ctx = static_cast<ErrorContext*>(p);
+    delete ctx;
+  }
+
+  // This is to set the parameters for the write error injection.
+  // seed is the seed for the random number generator, and one_in determines
+  // the probability of injecting error (i.e an error is injected with
+  // 1/one_in probability). For write error, we can specify the error we
+  // want to inject. Types decides the file types we want to inject the
+  // error (e.g., Wal files, SST files), which is empty by default.
+  void SetRandomWriteError(uint32_t seed, int one_in, IOStatus error,
+                           bool inject_for_all_file_types,
+                           const std::vector<FileType>& types) {
+    MutexLock l(&mutex_);
+    Random tmp_rand(seed);
+    error.PermitUncheckedError();
+    error_ = error;
+    write_error_rand_ = tmp_rand;
+    write_error_one_in_ = one_in;
+    inject_for_all_file_types_ = inject_for_all_file_types;
+    write_error_allowed_types_ = types;
+  }
+
+  void SetSkipDirectWritableTypes(const std::set<FileType>& types) {
+    MutexLock l(&mutex_);
+    skip_direct_writable_types_ = types;
+  }
+
+  void SetRandomMetadataWriteError(int one_in) {
+    MutexLock l(&mutex_);
+    metadata_write_error_one_in_ = one_in;
+  }
+  // If the value is not 0, it is enabled. Otherwise, it is disabled.
+  void SetRandomReadError(int one_in) { read_error_one_in_ = one_in; }
+
+  bool ShouldInjectRandomReadError() {
+    return read_error_one_in() &&
+           Random::GetTLSInstance()->OneIn(read_error_one_in());
+  }
+
+  // Inject an write error with randomlized parameter and the predefined
+  // error type. Only the allowed file types will inject the write error
+  IOStatus InjectWriteError(const std::string& file_name);
+
+  // Ingest error to metadata operations.
+  IOStatus InjectMetadataWriteError();
+
+  // Inject an error. For a READ operation, a status of IOError(), a
+  // corruption in the contents of scratch, or truncation of slice
+  // are the types of error with equal probability. For OPEN,
+  // its always an IOError.
+  // fault_injected returns whether a fault is injected. It is needed
+  // because some fault is inected with IOStatus to be OK.
+  IOStatus InjectThreadSpecificReadError(ErrorOperation op, Slice* slice,
+                                         bool direct_io, char* scratch,
+                                         bool need_count_increase,
+                                         bool* fault_injected);
+
+  // Get the count of how many times we injected since the previous call
+  int GetAndResetErrorCount() {
+    ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+    int count = 0;
+    if (ctx != nullptr) {
+      count = ctx->count;
+      ctx->count = 0;
+    }
+    return count;
+  }
+
+  void EnableErrorInjection() {
+    ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+    if (ctx) {
+      ctx->enable_error_injection = true;
+    }
+  }
+
+  void EnableWriteErrorInjection() {
+    MutexLock l(&mutex_);
+    enable_write_error_injection_ = true;
+  }
+  void EnableMetadataWriteErrorInjection() {
+    MutexLock l(&mutex_);
+    enable_metadata_write_error_injection_ = true;
+  }
+
+  void DisableWriteErrorInjection() {
+    MutexLock l(&mutex_);
+    enable_write_error_injection_ = false;
+  }
+
+  void DisableErrorInjection() {
+    ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+    if (ctx) {
+      ctx->enable_error_injection = false;
+    }
+  }
+
+  void DisableMetadataWriteErrorInjection() {
+    MutexLock l(&mutex_);
+    enable_metadata_write_error_injection_ = false;
+  }
+
+  int read_error_one_in() const { return read_error_one_in_.load(); }
+
+  int write_error_one_in() const { return write_error_one_in_; }
+
+  // We capture a backtrace every time a fault is injected, for debugging
+  // purposes. This call prints the backtrace to stderr and frees the
+  // saved callstack
+  void PrintFaultBacktrace();
+
+ private:
+  port::Mutex mutex_;
+  std::map<std::string, FSFileState> db_file_state_;
+  std::set<std::string> open_managed_files_;
+  // directory -> (file name -> file contents to recover)
+  // When data is recovered from unsyned parent directory, the files with
+  // empty file contents to recover is deleted. Those with non-empty ones
+  // will be recovered to content accordingly.
+  std::unordered_map<std::string, std::map<std::string, std::string>>
+      dir_to_new_files_since_last_sync_;
+  bool filesystem_active_;    // Record flushes, syncs, writes
+  bool filesystem_writable_;  // Bypass FaultInjectionTestFS and go directly
+                              // to underlying FS for writable files
+  IOStatus error_;
+
+  enum ErrorType : int {
+    kErrorTypeStatus = 0,
+    kErrorTypeCorruption,
+    kErrorTypeTruncated,
+    kErrorTypeMax
+  };
+
+  struct ErrorContext {
+    Random rand;
+    int one_in;
+    int count;
+    bool enable_error_injection;
+    void* callstack;
+    std::string message;
+    int frames;
+    ErrorType type;
+
+    explicit ErrorContext(uint32_t seed)
+        : rand(seed),
+          enable_error_injection(false),
+          callstack(nullptr),
+          frames(0) {}
+    ~ErrorContext() {
+      if (callstack) {
+        free(callstack);
+      }
+    }
+  };
+
+  std::unique_ptr<ThreadLocalPtr> thread_local_error_;
+  bool enable_write_error_injection_;
+  bool enable_metadata_write_error_injection_;
+  Random write_error_rand_;
+  int write_error_one_in_;
+  int metadata_write_error_one_in_;
+  std::atomic<int> read_error_one_in_;
+  bool inject_for_all_file_types_;
+  std::vector<FileType> write_error_allowed_types_;
+  // File types where direct writable is skipped.
+  std::set<FileType> skip_direct_writable_types_;
+  bool ingest_data_corruption_before_write_;
+  ChecksumType checksum_handoff_func_tpye_;
+  bool fail_get_file_unique_id_;
+
+  // Extract number of type from file name. Return false if failing to fine
+  // them.
+  bool TryParseFileName(const std::string& file_name, uint64_t* number,
+                        FileType* type);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/fault_injection_secondary_cache.cc b/src/rocksdb/utilities/fault_injection_secondary_cache.cc
new file mode 100644
index 000000000..2758c2a19
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_secondary_cache.cc
@@ -0,0 +1,131 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// This class implements a custom SecondaryCache that randomly injects an
+// error status into Inserts/Lookups based on a specified probability.
+
+#include "utilities/fault_injection_secondary_cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FaultInjectionSecondaryCache::ResultHandle::UpdateHandleValue(
+    FaultInjectionSecondaryCache::ResultHandle* handle) {
+  ErrorContext* ctx = handle->cache_->GetErrorContext();
+  if (!ctx->rand.OneIn(handle->cache_->prob_)) {
+    handle->value_ = handle->base_->Value();
+    handle->size_ = handle->base_->Size();
+  }
+  handle->base_.reset();
+}
+
+bool FaultInjectionSecondaryCache::ResultHandle::IsReady() {
+  bool ready = true;
+  if (base_) {
+    ready = base_->IsReady();
+    if (ready) {
+      UpdateHandleValue(this);
+    }
+  }
+  return ready;
+}
+
+void FaultInjectionSecondaryCache::ResultHandle::Wait() {
+  base_->Wait();
+  UpdateHandleValue(this);
+}
+
+void* FaultInjectionSecondaryCache::ResultHandle::Value() { return value_; }
+
+size_t FaultInjectionSecondaryCache::ResultHandle::Size() { return size_; }
+
+void FaultInjectionSecondaryCache::ResultHandle::WaitAll(
+    FaultInjectionSecondaryCache* cache,
+    std::vector<SecondaryCacheResultHandle*> handles) {
+  std::vector<SecondaryCacheResultHandle*> base_handles;
+  for (SecondaryCacheResultHandle* hdl : handles) {
+    FaultInjectionSecondaryCache::ResultHandle* handle =
+        static_cast<FaultInjectionSecondaryCache::ResultHandle*>(hdl);
+    if (!handle->base_) {
+      continue;
+    }
+    base_handles.emplace_back(handle->base_.get());
+  }
+
+  cache->base_->WaitAll(base_handles);
+  for (SecondaryCacheResultHandle* hdl : handles) {
+    FaultInjectionSecondaryCache::ResultHandle* handle =
+        static_cast<FaultInjectionSecondaryCache::ResultHandle*>(hdl);
+    if (handle->base_) {
+      UpdateHandleValue(handle);
+    }
+  }
+}
+
+FaultInjectionSecondaryCache::ErrorContext*
+FaultInjectionSecondaryCache::GetErrorContext() {
+  ErrorContext* ctx = static_cast<ErrorContext*>(thread_local_error_->Get());
+  if (!ctx) {
+    ctx = new ErrorContext(seed_);
+    thread_local_error_->Reset(ctx);
+  }
+
+  return ctx;
+}
+
+Status FaultInjectionSecondaryCache::Insert(
+    const Slice& key, void* value, const Cache::CacheItemHelper* helper) {
+  ErrorContext* ctx = GetErrorContext();
+  if (ctx->rand.OneIn(prob_)) {
+    return Status::IOError();
+  }
+
+  return base_->Insert(key, value, helper);
+}
+
+std::unique_ptr<SecondaryCacheResultHandle>
+FaultInjectionSecondaryCache::Lookup(const Slice& key,
+                                     const Cache::CreateCallback& create_cb,
+                                     bool wait, bool advise_erase,
+                                     bool& is_in_sec_cache) {
+  ErrorContext* ctx = GetErrorContext();
+  if (base_is_compressed_sec_cache_) {
+    if (ctx->rand.OneIn(prob_)) {
+      return nullptr;
+    } else {
+      return base_->Lookup(key, create_cb, wait, advise_erase, is_in_sec_cache);
+    }
+  } else {
+    std::unique_ptr<SecondaryCacheResultHandle> hdl =
+        base_->Lookup(key, create_cb, wait, advise_erase, is_in_sec_cache);
+    if (wait && ctx->rand.OneIn(prob_)) {
+      hdl.reset();
+    }
+    return std::unique_ptr<FaultInjectionSecondaryCache::ResultHandle>(
+        new FaultInjectionSecondaryCache::ResultHandle(this, std::move(hdl)));
+  }
+}
+
+void FaultInjectionSecondaryCache::Erase(const Slice& key) {
+  base_->Erase(key);
+}
+
+void FaultInjectionSecondaryCache::WaitAll(
+    std::vector<SecondaryCacheResultHandle*> handles) {
+  if (base_is_compressed_sec_cache_) {
+    ErrorContext* ctx = GetErrorContext();
+    std::vector<SecondaryCacheResultHandle*> base_handles;
+    for (SecondaryCacheResultHandle* hdl : handles) {
+      if (ctx->rand.OneIn(prob_)) {
+        continue;
+      }
+      base_handles.push_back(hdl);
+    }
+    base_->WaitAll(base_handles);
+  } else {
+    FaultInjectionSecondaryCache::ResultHandle::WaitAll(this, handles);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/fault_injection_secondary_cache.h b/src/rocksdb/utilities/fault_injection_secondary_cache.h
new file mode 100644
index 000000000..5321df626
--- /dev/null
+++ b/src/rocksdb/utilities/fault_injection_secondary_cache.h
@@ -0,0 +1,108 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/secondary_cache.h"
+#include "util/random.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class implements a custom SecondaryCache that randomly injects an
+// error status into Inserts/Lookups based on a specified probability.
+// Its used by db_stress to verify correctness in the presence of
+// secondary cache errors.
+//
+class FaultInjectionSecondaryCache : public SecondaryCache {
+ public:
+  explicit FaultInjectionSecondaryCache(
+      const std::shared_ptr<SecondaryCache>& base, uint32_t seed, int prob)
+      : base_(base),
+        seed_(seed),
+        prob_(prob),
+        thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)) {
+    if (std::strcmp(base_->Name(), "CompressedSecondaryCache") == 0) {
+      base_is_compressed_sec_cache_ = true;
+    }
+  }
+
+  virtual ~FaultInjectionSecondaryCache() override {}
+
+  const char* Name() const override { return "FaultInjectionSecondaryCache"; }
+
+  Status Insert(const Slice& key, void* value,
+                const Cache::CacheItemHelper* helper) override;
+
+  std::unique_ptr<SecondaryCacheResultHandle> Lookup(
+      const Slice& key, const Cache::CreateCallback& create_cb, bool wait,
+      bool advise_erase, bool& is_in_sec_cache) override;
+
+  bool SupportForceErase() const override { return base_->SupportForceErase(); }
+
+  void Erase(const Slice& key) override;
+
+  void WaitAll(std::vector<SecondaryCacheResultHandle*> handles) override;
+
+  Status SetCapacity(size_t capacity) override {
+    return base_->SetCapacity(capacity);
+  }
+
+  Status GetCapacity(size_t& capacity) override {
+    return base_->GetCapacity(capacity);
+  }
+
+  std::string GetPrintableOptions() const override {
+    return base_->GetPrintableOptions();
+  }
+
+ private:
+  class ResultHandle : public SecondaryCacheResultHandle {
+   public:
+    ResultHandle(FaultInjectionSecondaryCache* cache,
+                 std::unique_ptr<SecondaryCacheResultHandle>&& base)
+        : cache_(cache), base_(std::move(base)), value_(nullptr), size_(0) {}
+
+    ~ResultHandle() override {}
+
+    bool IsReady() override;
+
+    void Wait() override;
+
+    void* Value() override;
+
+    size_t Size() override;
+
+    static void WaitAll(FaultInjectionSecondaryCache* cache,
+                        std::vector<SecondaryCacheResultHandle*> handles);
+
+   private:
+    static void UpdateHandleValue(ResultHandle* handle);
+
+    FaultInjectionSecondaryCache* cache_;
+    std::unique_ptr<SecondaryCacheResultHandle> base_;
+    void* value_;
+    size_t size_;
+  };
+
+  static void DeleteThreadLocalErrorContext(void* p) {
+    ErrorContext* ctx = static_cast<ErrorContext*>(p);
+    delete ctx;
+  }
+
+  const std::shared_ptr<SecondaryCache> base_;
+  uint32_t seed_;
+  int prob_;
+  bool base_is_compressed_sec_cache_{false};
+
+  struct ErrorContext {
+    Random rand;
+
+    explicit ErrorContext(uint32_t seed) : rand(seed) {}
+  };
+  std::unique_ptr<ThreadLocalPtr> thread_local_error_;
+
+  ErrorContext* GetErrorContext();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/leveldb_options/leveldb_options.cc b/src/rocksdb/utilities/leveldb_options/leveldb_options.cc
new file mode 100644
index 000000000..125c3d956
--- /dev/null
+++ b/src/rocksdb/utilities/leveldb_options/leveldb_options.cc
@@ -0,0 +1,57 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/utilities/leveldb_options.h"
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+LevelDBOptions::LevelDBOptions()
+    : comparator(BytewiseComparator()),
+      create_if_missing(false),
+      error_if_exists(false),
+      paranoid_checks(false),
+      env(Env::Default()),
+      info_log(nullptr),
+      write_buffer_size(4 << 20),
+      max_open_files(1000),
+      block_cache(nullptr),
+      block_size(4096),
+      block_restart_interval(16),
+      compression(kSnappyCompression),
+      filter_policy(nullptr) {}
+
+Options ConvertOptions(const LevelDBOptions& leveldb_options) {
+  Options options = Options();
+  options.create_if_missing = leveldb_options.create_if_missing;
+  options.error_if_exists = leveldb_options.error_if_exists;
+  options.paranoid_checks = leveldb_options.paranoid_checks;
+  options.env = leveldb_options.env;
+  options.info_log.reset(leveldb_options.info_log);
+  options.write_buffer_size = leveldb_options.write_buffer_size;
+  options.max_open_files = leveldb_options.max_open_files;
+  options.compression = leveldb_options.compression;
+
+  BlockBasedTableOptions table_options;
+  table_options.block_cache.reset(leveldb_options.block_cache);
+  table_options.block_size = leveldb_options.block_size;
+  table_options.block_restart_interval = leveldb_options.block_restart_interval;
+  table_options.filter_policy.reset(leveldb_options.filter_policy);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  return options;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/memory/memory_test.cc b/src/rocksdb/utilities/memory/memory_test.cc
new file mode 100644
index 000000000..0b043af0e
--- /dev/null
+++ b/src/rocksdb/utilities/memory/memory_test.cc
@@ -0,0 +1,279 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemoryTest : public testing::Test {
+ public:
+  MemoryTest() : kDbDir(test::PerThreadDBPath("memory_test")), rnd_(301) {
+    assert(Env::Default()->CreateDirIfMissing(kDbDir).ok());
+  }
+
+  std::string GetDBName(int id) { return kDbDir + "db_" + std::to_string(id); }
+
+  void UpdateUsagesHistory(const std::vector<DB*>& dbs) {
+    std::map<MemoryUtil::UsageType, uint64_t> usage_by_type;
+    ASSERT_OK(GetApproximateMemoryUsageByType(dbs, &usage_by_type));
+    for (int i = 0; i < MemoryUtil::kNumUsageTypes; ++i) {
+      usage_history_[i].push_back(
+          usage_by_type[static_cast<MemoryUtil::UsageType>(i)]);
+    }
+  }
+
+  void GetCachePointersFromTableFactory(
+      const TableFactory* factory,
+      std::unordered_set<const Cache*>* cache_set) {
+    const auto bbto = factory->GetOptions<BlockBasedTableOptions>();
+    if (bbto != nullptr) {
+      cache_set->insert(bbto->block_cache.get());
+      cache_set->insert(bbto->block_cache_compressed.get());
+    }
+  }
+
+  void GetCachePointers(const std::vector<DB*>& dbs,
+                        std::unordered_set<const Cache*>* cache_set) {
+    cache_set->clear();
+
+    for (auto* db : dbs) {
+      assert(db);
+
+      // Cache from DBImpl
+      StackableDB* sdb = dynamic_cast<StackableDB*>(db);
+      DBImpl* db_impl = dynamic_cast<DBImpl*>(sdb ? sdb->GetBaseDB() : db);
+      if (db_impl != nullptr) {
+        cache_set->insert(db_impl->TEST_table_cache());
+      }
+
+      // Cache from DBOptions
+      cache_set->insert(db->GetDBOptions().row_cache.get());
+
+      // Cache from table factories
+      std::unordered_map<std::string, const ImmutableCFOptions*> iopts_map;
+      if (db_impl != nullptr) {
+        ASSERT_OK(db_impl->TEST_GetAllImmutableCFOptions(&iopts_map));
+      }
+      for (auto pair : iopts_map) {
+        GetCachePointersFromTableFactory(pair.second->table_factory.get(),
+                                         cache_set);
+      }
+    }
+  }
+
+  Status GetApproximateMemoryUsageByType(
+      const std::vector<DB*>& dbs,
+      std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) {
+    std::unordered_set<const Cache*> cache_set;
+    GetCachePointers(dbs, &cache_set);
+
+    return MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
+                                                       usage_by_type);
+  }
+
+  const std::string kDbDir;
+  Random rnd_;
+  std::vector<uint64_t> usage_history_[MemoryUtil::kNumUsageTypes];
+};
+
+TEST_F(MemoryTest, SharedBlockCacheTotal) {
+  std::vector<DB*> dbs;
+  std::vector<uint64_t> usage_by_type;
+  const int kNumDBs = 10;
+  const int kKeySize = 100;
+  const int kValueSize = 500;
+  Options opt;
+  opt.create_if_missing = true;
+  opt.write_buffer_size = kKeySize + kValueSize;
+  opt.max_write_buffer_number = 10;
+  opt.min_write_buffer_number_to_merge = 10;
+  opt.disable_auto_compactions = true;
+  BlockBasedTableOptions bbt_opts;
+  bbt_opts.block_cache = NewLRUCache(4096 * 1000 * 10);
+  for (int i = 0; i < kNumDBs; ++i) {
+    ASSERT_OK(DestroyDB(GetDBName(i), opt));
+    DB* db = nullptr;
+    ASSERT_OK(DB::Open(opt, GetDBName(i), &db));
+    dbs.push_back(db);
+  }
+
+  std::vector<std::string> keys_by_db[kNumDBs];
+
+  // Fill one memtable per Put to make memtable use more memory.
+  for (int p = 0; p < opt.min_write_buffer_number_to_merge / 2; ++p) {
+    for (int i = 0; i < kNumDBs; ++i) {
+      for (int j = 0; j < 100; ++j) {
+        keys_by_db[i].emplace_back(rnd_.RandomString(kKeySize));
+        ASSERT_OK(dbs[i]->Put(WriteOptions(), keys_by_db[i].back(),
+                              rnd_.RandomString(kValueSize)));
+      }
+      ASSERT_OK(dbs[i]->Flush(FlushOptions()));
+    }
+  }
+  for (int i = 0; i < kNumDBs; ++i) {
+    for (auto& key : keys_by_db[i]) {
+      std::string value;
+      ASSERT_OK(dbs[i]->Get(ReadOptions(), key, &value));
+    }
+    UpdateUsagesHistory(dbs);
+  }
+  for (size_t i = 1; i < usage_history_[MemoryUtil::kMemTableTotal].size();
+       ++i) {
+    // Expect EQ as we didn't flush more memtables.
+    ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i],
+              usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
+  }
+  for (int i = 0; i < kNumDBs; ++i) {
+    delete dbs[i];
+  }
+}
+
+TEST_F(MemoryTest, MemTableAndTableReadersTotal) {
+  std::vector<DB*> dbs;
+  std::vector<uint64_t> usage_by_type;
+  std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
+  const int kNumDBs = 10;
+  // These key/value sizes ensure each KV has its own memtable. Note that the
+  // minimum write_buffer_size allowed is 64 KB.
+  const int kKeySize = 100;
+  const int kValueSize = 1 << 16;
+  Options opt;
+  opt.create_if_missing = true;
+  opt.create_missing_column_families = true;
+  opt.write_buffer_size = kKeySize + kValueSize;
+  opt.max_write_buffer_number = 10;
+  opt.min_write_buffer_number_to_merge = 10;
+  opt.disable_auto_compactions = true;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs = {
+      {kDefaultColumnFamilyName, ColumnFamilyOptions(opt)},
+      {"one", ColumnFamilyOptions(opt)},
+      {"two", ColumnFamilyOptions(opt)},
+  };
+
+  for (int i = 0; i < kNumDBs; ++i) {
+    ASSERT_OK(DestroyDB(GetDBName(i), opt));
+    std::vector<ColumnFamilyHandle*> handles;
+    dbs.emplace_back();
+    vec_handles.emplace_back();
+    ASSERT_OK(DB::Open(DBOptions(opt), GetDBName(i), cf_descs,
+                       &vec_handles.back(), &dbs.back()));
+  }
+
+  // Fill one memtable per Put to make memtable use more memory.
+  for (int p = 0; p < opt.min_write_buffer_number_to_merge / 2; ++p) {
+    for (int i = 0; i < kNumDBs; ++i) {
+      for (auto* handle : vec_handles[i]) {
+        ASSERT_OK(dbs[i]->Put(WriteOptions(), handle,
+                              rnd_.RandomString(kKeySize),
+                              rnd_.RandomString(kValueSize)));
+        UpdateUsagesHistory(dbs);
+      }
+    }
+  }
+  // Expect the usage history is monotonically increasing
+  for (size_t i = 1; i < usage_history_[MemoryUtil::kMemTableTotal].size();
+       ++i) {
+    ASSERT_GT(usage_history_[MemoryUtil::kMemTableTotal][i],
+              usage_history_[MemoryUtil::kMemTableTotal][i - 1]);
+    ASSERT_GT(usage_history_[MemoryUtil::kMemTableUnFlushed][i],
+              usage_history_[MemoryUtil::kMemTableUnFlushed][i - 1]);
+    ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i],
+              usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
+  }
+
+  size_t usage_check_point = usage_history_[MemoryUtil::kMemTableTotal].size();
+  std::vector<Iterator*> iters;
+
+  // Create an iterator and flush all memtables for each db
+  for (int i = 0; i < kNumDBs; ++i) {
+    iters.push_back(dbs[i]->NewIterator(ReadOptions()));
+    ASSERT_OK(dbs[i]->Flush(FlushOptions()));
+
+    for (int j = 0; j < 100; ++j) {
+      std::string value;
+      ASSERT_NOK(
+          dbs[i]->Get(ReadOptions(), rnd_.RandomString(kKeySize), &value));
+    }
+
+    UpdateUsagesHistory(dbs);
+  }
+  for (size_t i = usage_check_point;
+       i < usage_history_[MemoryUtil::kMemTableTotal].size(); ++i) {
+    // Since memtables are pinned by iterators, we don't expect the
+    // memory usage of all the memtables decreases as they are pinned
+    // by iterators.
+    ASSERT_GE(usage_history_[MemoryUtil::kMemTableTotal][i],
+              usage_history_[MemoryUtil::kMemTableTotal][i - 1]);
+    // Expect the usage history from the "usage_decay_point" is
+    // monotonically decreasing.
+    ASSERT_LT(usage_history_[MemoryUtil::kMemTableUnFlushed][i],
+              usage_history_[MemoryUtil::kMemTableUnFlushed][i - 1]);
+    // Expect the usage history of the table readers increases
+    // as we flush tables.
+    ASSERT_GT(usage_history_[MemoryUtil::kTableReadersTotal][i],
+              usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
+    ASSERT_GT(usage_history_[MemoryUtil::kCacheTotal][i],
+              usage_history_[MemoryUtil::kCacheTotal][i - 1]);
+  }
+  usage_check_point = usage_history_[MemoryUtil::kMemTableTotal].size();
+  for (int i = 0; i < kNumDBs; ++i) {
+    // iterator is not used.
+    ASSERT_OK(iters[i]->status());
+    delete iters[i];
+    UpdateUsagesHistory(dbs);
+  }
+  for (size_t i = usage_check_point;
+       i < usage_history_[MemoryUtil::kMemTableTotal].size(); ++i) {
+    // Expect the usage of all memtables decreasing as we delete iterators.
+    ASSERT_LT(usage_history_[MemoryUtil::kMemTableTotal][i],
+              usage_history_[MemoryUtil::kMemTableTotal][i - 1]);
+    // Since the memory usage of un-flushed memtables is only affected
+    // by Put and flush, we expect EQ here as we only delete iterators.
+    ASSERT_EQ(usage_history_[MemoryUtil::kMemTableUnFlushed][i],
+              usage_history_[MemoryUtil::kMemTableUnFlushed][i - 1]);
+    // Expect EQ as we didn't flush more memtables.
+    ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i],
+              usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
+  }
+
+  for (int i = 0; i < kNumDBs; ++i) {
+    for (auto* handle : vec_handles[i]) {
+      delete handle;
+    }
+    delete dbs[i];
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
+}
+
+#else
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+  printf("Skipped in RocksDBLite as utilities are not supported.\n");
+  return 0;
+}
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/memory/memory_util.cc b/src/rocksdb/utilities/memory/memory_util.cc
new file mode 100644
index 000000000..13c81aec4
--- /dev/null
+++ b/src/rocksdb/utilities/memory/memory_util.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/memory_util.h"
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status MemoryUtil::GetApproximateMemoryUsageByType(
+    const std::vector<DB*>& dbs,
+    const std::unordered_set<const Cache*> cache_set,
+    std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) {
+  usage_by_type->clear();
+
+  // MemTable
+  for (auto* db : dbs) {
+    uint64_t usage = 0;
+    if (db->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
+                                     &usage)) {
+      (*usage_by_type)[MemoryUtil::kMemTableTotal] += usage;
+    }
+    if (db->GetAggregatedIntProperty(DB::Properties::kCurSizeAllMemTables,
+                                     &usage)) {
+      (*usage_by_type)[MemoryUtil::kMemTableUnFlushed] += usage;
+    }
+  }
+
+  // Table Readers
+  for (auto* db : dbs) {
+    uint64_t usage = 0;
+    if (db->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem,
+                                     &usage)) {
+      (*usage_by_type)[MemoryUtil::kTableReadersTotal] += usage;
+    }
+  }
+
+  // Cache
+  for (const auto* cache : cache_set) {
+    if (cache != nullptr) {
+      (*usage_by_type)[MemoryUtil::kCacheTotal] += cache->GetUsage();
+    }
+  }
+
+  return Status::OK();
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/memory_allocators.h b/src/rocksdb/utilities/memory_allocators.h
new file mode 100644
index 000000000..c9e77a5b7
--- /dev/null
+++ b/src/rocksdb/utilities/memory_allocators.h
@@ -0,0 +1,104 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+
+#include "rocksdb/memory_allocator.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A memory allocator using new/delete
+class DefaultMemoryAllocator : public MemoryAllocator {
+ public:
+  static const char* kClassName() { return "DefaultMemoryAllocator"; }
+  const char* Name() const override { return kClassName(); }
+  void* Allocate(size_t size) override {
+    return static_cast<void*>(new char[size]);
+  }
+
+  void Deallocate(void* p) override { delete[] static_cast<char*>(p); }
+};
+
+// Base class for a MemoryAllocator.  This implementation does nothing
+// and implements the methods in failuse mode (assert if the methods are
+// invoked). Implementations can extend this class and override these methods
+// when they are enabled via compiler switches (e.g., the
+// JeMallocMemoryAllocator can define these methods if ROCKSDB_JEMALLOC is
+// defined at compile time.  If compiled in "disabled" mode, this class provides
+// default/failure implementations.  If compiled in "enabled" mode, the derived
+// class needs to provide the appopriate "enabled" methods for the "real"
+// implementation. Failure of the "real" implementation to implement ovreride
+// any of these methods will result in an assert failure.
+class BaseMemoryAllocator : public MemoryAllocator {
+ public:
+  void* Allocate(size_t /*size*/) override {
+    assert(false);
+    return nullptr;
+  }
+
+  void Deallocate(void* /*p*/) override { assert(false); }
+};
+
+// A Wrapped MemoryAllocator.  Delegates the memory allcator functions to the
+// wrapped one.
+class MemoryAllocatorWrapper : public MemoryAllocator {
+ public:
+  // Initialize an MemoryAllocatorWrapper that delegates all calls to *t
+  explicit MemoryAllocatorWrapper(const std::shared_ptr<MemoryAllocator>& t);
+  ~MemoryAllocatorWrapper() override {}
+
+  // Return the target to which to forward all calls
+  MemoryAllocator* target() const { return target_.get(); }
+  // Allocate a block of at least size. Has to be thread-safe.
+  void* Allocate(size_t size) override { return target_->Allocate(size); }
+
+  // Deallocate previously allocated block. Has to be thread-safe.
+  void Deallocate(void* p) override { return target_->Deallocate(p); }
+
+  // Returns the memory size of the block allocated at p. The default
+  // implementation that just returns the original allocation_size is fine.
+  size_t UsableSize(void* p, size_t allocation_size) const override {
+    return target_->UsableSize(p, allocation_size);
+  }
+
+  const Customizable* Inner() const override { return target_.get(); }
+
+ protected:
+  std::shared_ptr<MemoryAllocator> target_;
+};
+
+// A memory allocator that counts the number of allocations and deallocations
+// This class is useful if the number of memory allocations/dellocations is
+// important.
+class CountedMemoryAllocator : public MemoryAllocatorWrapper {
+ public:
+  CountedMemoryAllocator()
+      : MemoryAllocatorWrapper(std::make_shared<DefaultMemoryAllocator>()),
+        allocations_(0),
+        deallocations_(0) {}
+
+  explicit CountedMemoryAllocator(const std::shared_ptr<MemoryAllocator>& t)
+      : MemoryAllocatorWrapper(t), allocations_(0), deallocations_(0) {}
+  static const char* kClassName() { return "CountedMemoryAllocator"; }
+  const char* Name() const override { return kClassName(); }
+  std::string GetId() const override { return std::string(Name()); }
+  void* Allocate(size_t size) override {
+    allocations_++;
+    return MemoryAllocatorWrapper::Allocate(size);
+  }
+
+  void Deallocate(void* p) override {
+    deallocations_++;
+    MemoryAllocatorWrapper::Deallocate(p);
+  }
+  uint64_t GetNumAllocations() const { return allocations_; }
+  uint64_t GetNumDeallocations() const { return deallocations_; }
+
+ private:
+  std::atomic<uint64_t> allocations_;
+  std::atomic<uint64_t> deallocations_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators.cc b/src/rocksdb/utilities/merge_operators.cc
new file mode 100644
index 000000000..c97e9ce25
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators.cc
@@ -0,0 +1,120 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "utilities/merge_operators.h"
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "utilities/merge_operators/bytesxor.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+static bool LoadMergeOperator(const std::string& id,
+                              std::shared_ptr<MergeOperator>* result) {
+  bool success = true;
+  // TODO: Hook the "name" up to the actual Name() of the MergeOperators?
+  // Requires these classes be moved into a header file...
+  if (id == "put" || id == "PutOperator") {
+    *result = MergeOperators::CreatePutOperator();
+  } else if (id == "put_v1") {
+    *result = MergeOperators::CreateDeprecatedPutOperator();
+  } else if (id == "uint64add" || id == "UInt64AddOperator") {
+    *result = MergeOperators::CreateUInt64AddOperator();
+  } else if (id == "max" || id == "MaxOperator") {
+    *result = MergeOperators::CreateMaxOperator();
+#ifdef ROCKSDB_LITE
+    // The remainder of the classes are handled by the ObjectRegistry in
+    // non-LITE mode
+  } else if (id == StringAppendOperator::kNickName() ||
+             id == StringAppendOperator::kClassName()) {
+    *result = MergeOperators::CreateStringAppendOperator();
+  } else if (id == StringAppendTESTOperator::kNickName() ||
+             id == StringAppendTESTOperator::kClassName()) {
+    *result = MergeOperators::CreateStringAppendTESTOperator();
+  } else if (id == BytesXOROperator::kNickName() ||
+             id == BytesXOROperator::kClassName()) {
+    *result = MergeOperators::CreateBytesXOROperator();
+  } else if (id == SortList::kNickName() || id == SortList::kClassName()) {
+    *result = MergeOperators::CreateSortOperator();
+#endif  // ROCKSDB_LITE
+  } else {
+    success = false;
+  }
+  return success;
+}
+
+#ifndef ROCKSDB_LITE
+static int RegisterBuiltinMergeOperators(ObjectLibrary& library,
+                                         const std::string& /*arg*/) {
+  size_t num_types;
+  library.AddFactory<MergeOperator>(
+      ObjectLibrary::PatternEntry(StringAppendOperator::kClassName())
+          .AnotherName(StringAppendOperator::kNickName()),
+      [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(new StringAppendOperator(","));
+        return guard->get();
+      });
+  library.AddFactory<MergeOperator>(
+      ObjectLibrary::PatternEntry(StringAppendTESTOperator::kClassName())
+          .AnotherName(StringAppendTESTOperator::kNickName()),
+      [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(new StringAppendTESTOperator(","));
+        return guard->get();
+      });
+  library.AddFactory<MergeOperator>(
+      ObjectLibrary::PatternEntry(SortList::kClassName())
+          .AnotherName(SortList::kNickName()),
+      [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(new SortList());
+        return guard->get();
+      });
+  library.AddFactory<MergeOperator>(
+      ObjectLibrary::PatternEntry(BytesXOROperator::kClassName())
+          .AnotherName(BytesXOROperator::kNickName()),
+      [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+         std::string* /*errmsg*/) {
+        guard->reset(new BytesXOROperator());
+        return guard->get();
+      });
+
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+#endif  // ROCKSDB_LITE
+
+Status MergeOperator::CreateFromString(const ConfigOptions& config_options,
+                                       const std::string& value,
+                                       std::shared_ptr<MergeOperator>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterBuiltinMergeOperators(*(ObjectLibrary::Default().get()), "");
+  });
+#endif  // ROCKSDB_LITE
+  return LoadSharedObject<MergeOperator>(config_options, value,
+                                         LoadMergeOperator, result);
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateFromStringId(
+    const std::string& id) {
+  std::shared_ptr<MergeOperator> result;
+  Status s = MergeOperator::CreateFromString(ConfigOptions(), id, &result);
+  if (s.ok()) {
+    return result;
+  } else {
+    // Empty or unknown, just return nullptr
+    return nullptr;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators.h b/src/rocksdb/utilities/merge_operators.h
new file mode 100644
index 000000000..9b90107e3
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators.h
@@ -0,0 +1,36 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <stdio.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeOperators {
+ public:
+  static std::shared_ptr<MergeOperator> CreatePutOperator();
+  static std::shared_ptr<MergeOperator> CreateDeprecatedPutOperator();
+  static std::shared_ptr<MergeOperator> CreateUInt64AddOperator();
+  static std::shared_ptr<MergeOperator> CreateStringAppendOperator();
+  static std::shared_ptr<MergeOperator> CreateStringAppendOperator(
+      char delim_char);
+  static std::shared_ptr<MergeOperator> CreateStringAppendOperator(
+      const std::string& delim);
+  static std::shared_ptr<MergeOperator> CreateStringAppendTESTOperator();
+  static std::shared_ptr<MergeOperator> CreateMaxOperator();
+  static std::shared_ptr<MergeOperator> CreateBytesXOROperator();
+  static std::shared_ptr<MergeOperator> CreateSortOperator();
+
+  // Will return a different merge operator depending on the string.
+  static std::shared_ptr<MergeOperator> CreateFromStringId(
+      const std::string& name);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/bytesxor.cc b/src/rocksdb/utilities/merge_operators/bytesxor.cc
new file mode 100644
index 000000000..fa09c18ea
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/bytesxor.cc
@@ -0,0 +1,57 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/merge_operators/bytesxor.h"
+
+#include <algorithm>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateBytesXOROperator() {
+  return std::make_shared<BytesXOROperator>();
+}
+
+bool BytesXOROperator::Merge(const Slice& /*key*/, const Slice* existing_value,
+                             const Slice& value, std::string* new_value,
+                             Logger* /*logger*/) const {
+  XOR(existing_value, value, new_value);
+  return true;
+}
+
+void BytesXOROperator::XOR(const Slice* existing_value, const Slice& value,
+                           std::string* new_value) const {
+  if (!existing_value) {
+    new_value->clear();
+    new_value->assign(value.data(), value.size());
+    return;
+  }
+
+  size_t min_size = std::min(existing_value->size(), value.size());
+  size_t max_size = std::max(existing_value->size(), value.size());
+
+  new_value->clear();
+  new_value->reserve(max_size);
+
+  const char* existing_value_data = existing_value->data();
+  const char* value_data = value.data();
+
+  for (size_t i = 0; i < min_size; i++) {
+    new_value->push_back(existing_value_data[i] ^ value_data[i]);
+  }
+
+  if (existing_value->size() == max_size) {
+    for (size_t i = min_size; i < max_size; i++) {
+      new_value->push_back(existing_value_data[i]);
+    }
+  } else {
+    assert(value.size() == max_size);
+    for (size_t i = min_size; i < max_size; i++) {
+      new_value->push_back(value_data[i]);
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/bytesxor.h b/src/rocksdb/utilities/merge_operators/bytesxor.h
new file mode 100644
index 000000000..3c7baacce
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/bytesxor.h
@@ -0,0 +1,40 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A 'model' merge operator that XORs two (same sized) array of bytes.
+// Implemented as an AssociativeMergeOperator for simplicity and example.
+class BytesXOROperator : public AssociativeMergeOperator {
+ public:
+  // XORs the two array of bytes one byte at a time and stores the result
+  // in new_value. len is the number of xored bytes, and the length of new_value
+  virtual bool Merge(const Slice& key, const Slice* existing_value,
+                     const Slice& value, std::string* new_value,
+                     Logger* logger) const override;
+
+  static const char* kClassName() { return "BytesXOR"; }
+  static const char* kNickName() { return "bytesxor"; }
+
+  const char* NickName() const override { return kNickName(); }
+  const char* Name() const override { return kClassName(); }
+
+  void XOR(const Slice* existing_value, const Slice& value,
+           std::string* new_value) const;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/max.cc b/src/rocksdb/utilities/merge_operators/max.cc
new file mode 100644
index 000000000..de4abfa6f
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/max.cc
@@ -0,0 +1,80 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/merge_operators.h"
+
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::Slice;
+
+namespace {  // anonymous namespace
+
+// Merge operator that picks the maximum operand, Comparison is based on
+// Slice::compare
+class MaxOperator : public MergeOperator {
+ public:
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    Slice& max = merge_out->existing_operand;
+    if (merge_in.existing_value) {
+      max = Slice(merge_in.existing_value->data(),
+                  merge_in.existing_value->size());
+    } else if (max.data() == nullptr) {
+      max = Slice();
+    }
+
+    for (const auto& op : merge_in.operand_list) {
+      if (max.compare(op) < 0) {
+        max = op;
+      }
+    }
+
+    return true;
+  }
+
+  bool PartialMerge(const Slice& /*key*/, const Slice& left_operand,
+                    const Slice& right_operand, std::string* new_value,
+                    Logger* /*logger*/) const override {
+    if (left_operand.compare(right_operand) >= 0) {
+      new_value->assign(left_operand.data(), left_operand.size());
+    } else {
+      new_value->assign(right_operand.data(), right_operand.size());
+    }
+    return true;
+  }
+
+  bool PartialMergeMulti(const Slice& /*key*/,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value,
+                         Logger* /*logger*/) const override {
+    Slice max;
+    for (const auto& operand : operand_list) {
+      if (max.compare(operand) < 0) {
+        max = operand;
+      }
+    }
+
+    new_value->assign(max.data(), max.size());
+    return true;
+  }
+
+  static const char* kClassName() { return "MaxOperator"; }
+  static const char* kNickName() { return "max"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+};
+
+}  // end of anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateMaxOperator() {
+  return std::make_shared<MaxOperator>();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/put.cc b/src/rocksdb/utilities/merge_operators/put.cc
new file mode 100644
index 000000000..ccf9ff21f
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/put.cc
@@ -0,0 +1,92 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/merge_operators.h"
+
+namespace {  // anonymous namespace
+
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::Slice;
+
+// A merge operator that mimics Put semantics
+// Since this merge-operator will not be used in production,
+// it is implemented as a non-associative merge operator to illustrate the
+// new interface and for testing purposes. (That is, we inherit from
+// the MergeOperator class rather than the AssociativeMergeOperator
+// which would be simpler in this case).
+//
+// From the client-perspective, semantics are the same.
+class PutOperator : public MergeOperator {
+ public:
+  bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+                 const std::deque<std::string>& operand_sequence,
+                 std::string* new_value, Logger* /*logger*/) const override {
+    // Put basically only looks at the current/latest value
+    assert(!operand_sequence.empty());
+    assert(new_value != nullptr);
+    new_value->assign(operand_sequence.back());
+    return true;
+  }
+
+  bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/,
+                    const Slice& right_operand, std::string* new_value,
+                    Logger* /*logger*/) const override {
+    new_value->assign(right_operand.data(), right_operand.size());
+    return true;
+  }
+
+  using MergeOperator::PartialMergeMulti;
+  bool PartialMergeMulti(const Slice& /*key*/,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value,
+                         Logger* /*logger*/) const override {
+    new_value->assign(operand_list.back().data(), operand_list.back().size());
+    return true;
+  }
+
+  static const char* kClassName() { return "PutOperator"; }
+  static const char* kNickName() { return "put_v1"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+};
+
+class PutOperatorV2 : public PutOperator {
+  bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+                 const std::deque<std::string>& /*operand_sequence*/,
+                 std::string* /*new_value*/,
+                 Logger* /*logger*/) const override {
+    assert(false);
+    return false;
+  }
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override {
+    // Put basically only looks at the current/latest value
+    assert(!merge_in.operand_list.empty());
+    merge_out->existing_operand = merge_in.operand_list.back();
+    return true;
+  }
+
+  static const char* kNickName() { return "put"; }
+  const char* NickName() const override { return kNickName(); }
+};
+
+}  // end of anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateDeprecatedPutOperator() {
+  return std::make_shared<PutOperator>();
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreatePutOperator() {
+  return std::make_shared<PutOperatorV2>();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/sortlist.cc b/src/rocksdb/utilities/merge_operators/sortlist.cc
new file mode 100644
index 000000000..67bfc7e5e
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/sortlist.cc
@@ -0,0 +1,95 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include "utilities/merge_operators/sortlist.h"
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool SortList::FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const {
+  std::vector<int> left;
+  for (Slice slice : merge_in.operand_list) {
+    std::vector<int> right;
+    MakeVector(right, slice);
+    left = Merge(left, right);
+  }
+  for (int i = 0; i < static_cast<int>(left.size()) - 1; i++) {
+    merge_out->new_value.append(std::to_string(left[i])).append(",");
+  }
+  merge_out->new_value.append(std::to_string(left.back()));
+  return true;
+}
+
+bool SortList::PartialMerge(const Slice& /*key*/, const Slice& left_operand,
+                            const Slice& right_operand, std::string* new_value,
+                            Logger* /*logger*/) const {
+  std::vector<int> left;
+  std::vector<int> right;
+  MakeVector(left, left_operand);
+  MakeVector(right, right_operand);
+  left = Merge(left, right);
+  for (int i = 0; i < static_cast<int>(left.size()) - 1; i++) {
+    new_value->append(std::to_string(left[i])).append(",");
+  }
+  new_value->append(std::to_string(left.back()));
+  return true;
+}
+
+bool SortList::PartialMergeMulti(const Slice& /*key*/,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value,
+                                 Logger* /*logger*/) const {
+  (void)operand_list;
+  (void)new_value;
+  return true;
+}
+
+void SortList::MakeVector(std::vector<int>& operand, Slice slice) const {
+  do {
+    const char* begin = slice.data_;
+    while (*slice.data_ != ',' && *slice.data_) slice.data_++;
+    operand.push_back(std::stoi(std::string(begin, slice.data_)));
+  } while (0 != *slice.data_++);
+}
+
+std::vector<int> SortList::Merge(std::vector<int>& left,
+                                 std::vector<int>& right) const {
+  // Fill the resultant vector with sorted results from both vectors
+  std::vector<int> result;
+  unsigned left_it = 0, right_it = 0;
+
+  while (left_it < left.size() && right_it < right.size()) {
+    // If the left value is smaller than the right it goes next
+    // into the resultant vector
+    if (left[left_it] < right[right_it]) {
+      result.push_back(left[left_it]);
+      left_it++;
+    } else {
+      result.push_back(right[right_it]);
+      right_it++;
+    }
+  }
+
+  // Push the remaining data from both vectors onto the resultant
+  while (left_it < left.size()) {
+    result.push_back(left[left_it]);
+    left_it++;
+  }
+
+  while (right_it < right.size()) {
+    result.push_back(right[right_it]);
+    right_it++;
+  }
+
+  return result;
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateSortOperator() {
+  return std::make_shared<SortList>();
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/sortlist.h b/src/rocksdb/utilities/merge_operators/sortlist.h
new file mode 100644
index 000000000..eaa4e76fb
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/sortlist.h
@@ -0,0 +1,42 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// A MergeOperator for RocksDB that implements Merge Sort.
+// It is built using the MergeOperator interface. The operator works by taking
+// an input which contains one or more merge operands where each operand is a
+// list of sorted ints and merges them to form a large sorted list.
+#pragma once
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SortList : public MergeOperator {
+ public:
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override;
+
+  bool PartialMerge(const Slice& /*key*/, const Slice& left_operand,
+                    const Slice& right_operand, std::string* new_value,
+                    Logger* /*logger*/) const override;
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value, Logger* logger) const override;
+
+  static const char* kClassName() { return "MergeSortOperator"; }
+  static const char* kNickName() { return "sortlist"; }
+
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+
+  void MakeVector(std::vector<int>& operand, Slice slice) const;
+
+ private:
+  std::vector<int> Merge(std::vector<int>& left, std::vector<int>& right) const;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend.cc
new file mode 100644
index 000000000..5092cabcb
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend.cc
@@ -0,0 +1,78 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "stringappend.h"
+
+#include <assert.h>
+
+#include <memory>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo>
+    stringappend_merge_type_info = {
+#ifndef ROCKSDB_LITE
+        {"delimiter",
+         {0, OptionType::kString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+}  // namespace
+// Constructor: also specify the delimiter character.
+StringAppendOperator::StringAppendOperator(char delim_char)
+    : delim_(1, delim_char) {
+  RegisterOptions("Delimiter", &delim_, &stringappend_merge_type_info);
+}
+
+StringAppendOperator::StringAppendOperator(const std::string& delim)
+    : delim_(delim) {
+  RegisterOptions("Delimiter", &delim_, &stringappend_merge_type_info);
+}
+
+// Implementation for the merge operation (concatenates two strings)
+bool StringAppendOperator::Merge(const Slice& /*key*/,
+                                 const Slice* existing_value,
+                                 const Slice& value, std::string* new_value,
+                                 Logger* /*logger*/) const {
+  // Clear the *new_value for writing.
+  assert(new_value);
+  new_value->clear();
+
+  if (!existing_value) {
+    // No existing_value. Set *new_value = value
+    new_value->assign(value.data(), value.size());
+  } else {
+    // Generic append (existing_value != null).
+    // Reserve *new_value to correct size, and apply concatenation.
+    new_value->reserve(existing_value->size() + delim_.size() + value.size());
+    new_value->assign(existing_value->data(), existing_value->size());
+    new_value->append(delim_);
+    new_value->append(value.data(), value.size());
+  }
+
+  return true;
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateStringAppendOperator() {
+  return std::make_shared<StringAppendOperator>(',');
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateStringAppendOperator(
+    char delim_char) {
+  return std::make_shared<StringAppendOperator>(delim_char);
+}
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateStringAppendOperator(
+    const std::string& delim) {
+  return std::make_shared<StringAppendOperator>(delim);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend.h b/src/rocksdb/utilities/merge_operators/string_append/stringappend.h
new file mode 100644
index 000000000..153532382
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend.h
@@ -0,0 +1,32 @@
+/**
+ * A MergeOperator for rocksdb that implements string append.
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class StringAppendOperator : public AssociativeMergeOperator {
+ public:
+  // Constructor: specify delimiter
+  explicit StringAppendOperator(char delim_char);
+  explicit StringAppendOperator(const std::string& delim);
+
+  virtual bool Merge(const Slice& key, const Slice* existing_value,
+                     const Slice& value, std::string* new_value,
+                     Logger* logger) const override;
+
+  static const char* kClassName() { return "StringAppendOperator"; }
+  static const char* kNickName() { return "stringappend"; }
+  virtual const char* Name() const override { return kClassName(); }
+  virtual const char* NickName() const override { return kNickName(); }
+
+ private:
+  std::string delim_;  // The delimiter is inserted between elements
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc
new file mode 100644
index 000000000..36cb9ee34
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc
@@ -0,0 +1,132 @@
+/**
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#include "stringappend2.h"
+
+#include <assert.h>
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/options_type.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+static std::unordered_map<std::string, OptionTypeInfo>
+    stringappend2_merge_type_info = {
+#ifndef ROCKSDB_LITE
+        {"delimiter",
+         {0, OptionType::kString, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+#endif  // ROCKSDB_LITE
+};
+}  // namespace
+
+// Constructor: also specify the delimiter character.
+StringAppendTESTOperator::StringAppendTESTOperator(char delim_char)
+    : delim_(1, delim_char) {
+  RegisterOptions("Delimiter", &delim_, &stringappend2_merge_type_info);
+}
+
+StringAppendTESTOperator::StringAppendTESTOperator(const std::string& delim)
+    : delim_(delim) {
+  RegisterOptions("Delimiter", &delim_, &stringappend2_merge_type_info);
+}
+
+// Implementation for the merge operation (concatenates two strings)
+bool StringAppendTESTOperator::FullMergeV2(
+    const MergeOperationInput& merge_in,
+    MergeOperationOutput* merge_out) const {
+  // Clear the *new_value for writing.
+  merge_out->new_value.clear();
+
+  if (merge_in.existing_value == nullptr && merge_in.operand_list.size() == 1) {
+    // Only one operand
+    merge_out->existing_operand = merge_in.operand_list.back();
+    return true;
+  }
+
+  // Compute the space needed for the final result.
+  size_t numBytes = 0;
+
+  for (auto it = merge_in.operand_list.begin();
+       it != merge_in.operand_list.end(); ++it) {
+    numBytes += it->size() + delim_.size();
+  }
+
+  // Only print the delimiter after the first entry has been printed
+  bool printDelim = false;
+
+  // Prepend the *existing_value if one exists.
+  if (merge_in.existing_value) {
+    merge_out->new_value.reserve(numBytes + merge_in.existing_value->size());
+    merge_out->new_value.append(merge_in.existing_value->data(),
+                                merge_in.existing_value->size());
+    printDelim = true;
+  } else if (numBytes) {
+    // Without the existing (initial) value, the delimiter before the first of
+    // subsequent operands becomes redundant.
+    merge_out->new_value.reserve(numBytes - delim_.size());
+  }
+
+  // Concatenate the sequence of strings (and add a delimiter between each)
+  for (auto it = merge_in.operand_list.begin();
+       it != merge_in.operand_list.end(); ++it) {
+    if (printDelim) {
+      merge_out->new_value.append(delim_);
+    }
+    merge_out->new_value.append(it->data(), it->size());
+    printDelim = true;
+  }
+
+  return true;
+}
+
+bool StringAppendTESTOperator::PartialMergeMulti(
+    const Slice& /*key*/, const std::deque<Slice>& /*operand_list*/,
+    std::string* /*new_value*/, Logger* /*logger*/) const {
+  return false;
+}
+
+// A version of PartialMerge that actually performs "partial merging".
+// Use this to simulate the exact behaviour of the StringAppendOperator.
+bool StringAppendTESTOperator::_AssocPartialMergeMulti(
+    const Slice& /*key*/, const std::deque<Slice>& operand_list,
+    std::string* new_value, Logger* /*logger*/) const {
+  // Clear the *new_value for writing
+  assert(new_value);
+  new_value->clear();
+  assert(operand_list.size() >= 2);
+
+  // Generic append
+  // Determine and reserve correct size for *new_value.
+  size_t size = 0;
+  for (const auto& operand : operand_list) {
+    size += operand.size();
+  }
+  size += (operand_list.size() - 1) * delim_.length();  // Delimiters
+  new_value->reserve(size);
+
+  // Apply concatenation
+  new_value->assign(operand_list.front().data(), operand_list.front().size());
+
+  for (std::deque<Slice>::const_iterator it = operand_list.begin() + 1;
+       it != operand_list.end(); ++it) {
+    new_value->append(delim_);
+    new_value->append(it->data(), it->size());
+  }
+
+  return true;
+}
+
+std::shared_ptr<MergeOperator>
+MergeOperators::CreateStringAppendTESTOperator() {
+  return std::make_shared<StringAppendTESTOperator>(',');
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend2.h b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.h
new file mode 100644
index 000000000..75389e4ae
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.h
@@ -0,0 +1,52 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+/**
+ * A TEST MergeOperator for rocksdb that implements string append.
+ * It is built using the MergeOperator interface rather than the simpler
+ * AssociativeMergeOperator interface. This is useful for testing/benchmarking.
+ * While the two operators are semantically the same, all production code
+ * should use the StringAppendOperator defined in stringappend.{h,cc}. The
+ * operator defined in the present file is primarily for testing.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook
+ */
+
+#pragma once
+#include <deque>
+#include <string>
+
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class StringAppendTESTOperator : public MergeOperator {
+ public:
+  // Constructor with delimiter
+  explicit StringAppendTESTOperator(char delim_char);
+  explicit StringAppendTESTOperator(const std::string& delim);
+
+  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const override;
+
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value,
+                                 Logger* logger) const override;
+
+  static const char* kClassName() { return "StringAppendTESTOperator"; }
+  static const char* kNickName() { return "stringappendtest"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+
+ private:
+  // A version of PartialMerge that actually performs "partial merging".
+  // Use this to simulate the exact behaviour of the StringAppendOperator.
+  bool _AssocPartialMergeMulti(const Slice& key,
+                               const std::deque<Slice>& operand_list,
+                               std::string* new_value, Logger* logger) const;
+
+  std::string delim_;  // The delimiter is inserted between elements
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
new file mode 100644
index 000000000..22b6144af
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
@@ -0,0 +1,640 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+/**
+ * An persistent map : key -> (list of strings), using rocksdb merge.
+ * This file is a test-harness / use-case for the StringAppendOperator.
+ *
+ * @author Deon Nicholas (dnicholas@fb.com)
+ * Copyright 2013 Facebook, Inc.
+ */
+
+#include "utilities/merge_operators/string_append/stringappend.h"
+
+#include <iostream>
+#include <map>
+#include <tuple>
+
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Path to the database on file system
+const std::string kDbName = test::PerThreadDBPath("stringappend_test");
+
+namespace {
+// OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator
+std::shared_ptr<DB> OpenNormalDb(const std::string& delim) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  MergeOperator* mergeOperator;
+  if (delim.size() == 1) {
+    mergeOperator = new StringAppendOperator(delim[0]);
+  } else {
+    mergeOperator = new StringAppendOperator(delim);
+  }
+  options.merge_operator.reset(mergeOperator);
+  EXPECT_OK(DB::Open(options, kDbName, &db));
+  return std::shared_ptr<DB>(db);
+}
+
+#ifndef ROCKSDB_LITE  // TtlDb is not supported in Lite
+// Open a TtlDB with a non-associative StringAppendTESTOperator
+std::shared_ptr<DB> OpenTtlDb(const std::string& delim) {
+  DBWithTTL* db;
+  Options options;
+  options.create_if_missing = true;
+  MergeOperator* mergeOperator;
+  if (delim.size() == 1) {
+    mergeOperator = new StringAppendTESTOperator(delim[0]);
+  } else {
+    mergeOperator = new StringAppendTESTOperator(delim);
+  }
+  options.merge_operator.reset(mergeOperator);
+  EXPECT_OK(DBWithTTL::Open(options, kDbName, &db, 123456));
+  return std::shared_ptr<DB>(db);
+}
+#endif  // !ROCKSDB_LITE
+}  // namespace
+
+/// StringLists represents a set of string-lists, each with a key-index.
+/// Supports Append(list, string) and Get(list)
+class StringLists {
+ public:
+  // Constructor: specifies the rocksdb db
+  /* implicit */
+  StringLists(std::shared_ptr<DB> db)
+      : db_(db), merge_option_(), get_option_() {
+    assert(db);
+  }
+
+  // Append string val onto the list defined by key; return true on success
+  bool Append(const std::string& key, const std::string& val) {
+    Slice valSlice(val.data(), val.size());
+    auto s = db_->Merge(merge_option_, key, valSlice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << "ERROR " << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // Returns the list of strings associated with key (or "" if does not exist)
+  bool Get(const std::string& key, std::string* const result) {
+    assert(result != nullptr);  // we should have a place to store the result
+    auto s = db_->Get(get_option_, key, result);
+
+    if (s.ok()) {
+      return true;
+    }
+
+    // Either key does not exist, or there is some error.
+    *result = "";  // Always return empty string (just for convention)
+
+    // NotFound is okay; just return empty (similar to std::map)
+    // But network or db errors, etc, should fail the test (or at least yell)
+    if (!s.IsNotFound()) {
+      std::cerr << "ERROR " << s.ToString() << std::endl;
+    }
+
+    // Always return false if s.ok() was not true
+    return false;
+  }
+
+ private:
+  std::shared_ptr<DB> db_;
+  WriteOptions merge_option_;
+  ReadOptions get_option_;
+};
+
+// The class for unit-testing
+class StringAppendOperatorTest : public testing::Test,
+                                 public ::testing::WithParamInterface<bool> {
+ public:
+  StringAppendOperatorTest() {
+    EXPECT_OK(
+        DestroyDB(kDbName, Options()));  // Start each test with a fresh DB
+  }
+
+  void SetUp() override {
+#ifndef ROCKSDB_LITE  // TtlDb is not supported in Lite
+    bool if_use_ttl = GetParam();
+    if (if_use_ttl) {
+      fprintf(stderr, "Running tests with ttl db and generic operator.\n");
+      StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb);
+      return;
+    }
+#endif  // !ROCKSDB_LITE
+    fprintf(stderr, "Running tests with regular db and operator.\n");
+    StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb);
+  }
+
+  using OpenFuncPtr = std::shared_ptr<DB> (*)(const std::string&);
+
+  // Allows user to open databases with different configurations.
+  // e.g.: Can open a DB or a TtlDB, etc.
+  static void SetOpenDbFunction(OpenFuncPtr func) { OpenDb = func; }
+
+ protected:
+  static OpenFuncPtr OpenDb;
+};
+StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb =
+    nullptr;
+
+// THE TEST CASES BEGIN HERE
+
+TEST_P(StringAppendOperatorTest, IteratorTest) {
+  auto db_ = OpenDb(",");
+  StringLists slists(db_);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  slists.Append("k2", "a1");
+  slists.Append("k2", "a2");
+  slists.Append("k2", "a3");
+
+  std::string res;
+  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(
+      db_->NewIterator(ReadOptions()));
+  std::string k1("k1");
+  std::string k2("k2");
+  bool first = true;
+  for (it->Seek(k1); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "v1,v2,v3");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "a1,a2,a3");
+    }
+  }
+  slists.Append("k2", "a4");
+  slists.Append("k1", "v4");
+
+  // Snapshot should still be the same. Should ignore a4 and v4.
+  first = true;
+  for (it->Seek(k1); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "v1,v2,v3");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "a1,a2,a3");
+    }
+  }
+
+  // Should release the snapshot and be aware of the new stuff now
+  it.reset(db_->NewIterator(ReadOptions()));
+  first = true;
+  for (it->Seek(k1); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "v1,v2,v3,v4");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "a1,a2,a3,a4");
+    }
+  }
+
+  // start from k2 this time.
+  for (it->Seek(k2); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "v1,v2,v3,v4");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "a1,a2,a3,a4");
+    }
+  }
+
+  slists.Append("k3", "g1");
+
+  it.reset(db_->NewIterator(ReadOptions()));
+  first = true;
+  std::string k3("k3");
+  for (it->Seek(k2); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      ASSERT_EQ(res, "a1,a2,a3,a4");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "g1");
+    }
+  }
+  for (it->Seek(k3); it->Valid(); it->Next()) {
+    res = it->value().ToString();
+    if (first) {
+      // should not be hit
+      ASSERT_EQ(res, "a1,a2,a3,a4");
+      first = false;
+    } else {
+      ASSERT_EQ(res, "g1");
+    }
+  }
+}
+
+TEST_P(StringAppendOperatorTest, SimpleTest) {
+  auto db = OpenDb(",");
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  ASSERT_TRUE(slists.Get("k1", &res));
+  ASSERT_EQ(res, "v1,v2,v3");
+}
+
+TEST_P(StringAppendOperatorTest, SimpleDelimiterTest) {
+  auto db = OpenDb("|");
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  ASSERT_TRUE(slists.Get("k1", &res));
+  ASSERT_EQ(res, "v1|v2|v3");
+}
+
+TEST_P(StringAppendOperatorTest, EmptyDelimiterTest) {
+  auto db = OpenDb("");
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  ASSERT_TRUE(slists.Get("k1", &res));
+  ASSERT_EQ(res, "v1v2v3");
+}
+
+TEST_P(StringAppendOperatorTest, MultiCharDelimiterTest) {
+  auto db = OpenDb("<>");
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  ASSERT_TRUE(slists.Get("k1", &res));
+  ASSERT_EQ(res, "v1<>v2<>v3");
+}
+
+TEST_P(StringAppendOperatorTest, DelimiterIsDefensivelyCopiedTest) {
+  std::string delimiter = "<>";
+  auto db = OpenDb(delimiter);
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  delimiter.clear();
+  slists.Append("k1", "v3");
+
+  std::string res;
+  ASSERT_TRUE(slists.Get("k1", &res));
+  ASSERT_EQ(res, "v1<>v2<>v3");
+}
+
+TEST_P(StringAppendOperatorTest, OneValueNoDelimiterTest) {
+  auto db = OpenDb("!");
+  StringLists slists(db);
+
+  slists.Append("random_key", "single_val");
+
+  std::string res;
+  ASSERT_TRUE(slists.Get("random_key", &res));
+  ASSERT_EQ(res, "single_val");
+}
+
+TEST_P(StringAppendOperatorTest, VariousKeys) {
+  auto db = OpenDb("\n");
+  StringLists slists(db);
+
+  slists.Append("c", "asdasd");
+  slists.Append("a", "x");
+  slists.Append("b", "y");
+  slists.Append("a", "t");
+  slists.Append("a", "r");
+  slists.Append("b", "2");
+  slists.Append("c", "asdasd");
+
+  std::string a, b, c;
+  bool sa, sb, sc;
+  sa = slists.Get("a", &a);
+  sb = slists.Get("b", &b);
+  sc = slists.Get("c", &c);
+
+  ASSERT_TRUE(sa && sb && sc);  // All three keys should have been found
+
+  ASSERT_EQ(a, "x\nt\nr");
+  ASSERT_EQ(b, "y\n2");
+  ASSERT_EQ(c, "asdasd\nasdasd");
+}
+
+// Generate semi random keys/words from a small distribution.
+TEST_P(StringAppendOperatorTest, RandomMixGetAppend) {
+  auto db = OpenDb(" ");
+  StringLists slists(db);
+
+  // Generate a list of random keys and values
+  const int kWordCount = 15;
+  std::string words[] = {"sdasd",     "triejf",       "fnjsdfn",  "dfjisdfsf",
+                         "342839",    "dsuha",        "mabuais",  "sadajsid",
+                         "jf9834hf",  "2d9j89",       "dj9823jd", "a",
+                         "dk02ed2dh", "$(jd4h984$(*", "mabz"};
+  const int kKeyCount = 6;
+  std::string keys[] = {"dhaiusdhu", "denidw", "daisda",
+                        "keykey",    "muki",   "shzassdianmd"};
+
+  // Will store a local copy of all data in order to verify correctness
+  std::map<std::string, std::string> parallel_copy;
+
+  // Generate a bunch of random queries (Append and Get)!
+  enum query_t { APPEND_OP, GET_OP, NUM_OPS };
+  Random randomGen(1337);  // deterministic seed; always get same results!
+
+  const int kNumQueries = 30;
+  for (int q = 0; q < kNumQueries; ++q) {
+    // Generate a random query (Append or Get) and random parameters
+    query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
+    std::string key = keys[randomGen.Uniform((int)kKeyCount)];
+    std::string word = words[randomGen.Uniform((int)kWordCount)];
+
+    // Apply the query and any checks.
+    if (query == APPEND_OP) {
+      // Apply the rocksdb test-harness Append defined above
+      slists.Append(key, word);  // apply the rocksdb append
+
+      // Apply the similar "Append" to the parallel copy
+      if (parallel_copy[key].size() > 0) {
+        parallel_copy[key] += " " + word;
+      } else {
+        parallel_copy[key] = word;
+      }
+
+    } else if (query == GET_OP) {
+      // Assumes that a non-existent key just returns <empty>
+      std::string res;
+      slists.Get(key, &res);
+      ASSERT_EQ(res, parallel_copy[key]);
+    }
+  }
+}
+
+TEST_P(StringAppendOperatorTest, BIGRandomMixGetAppend) {
+  auto db = OpenDb(" ");
+  StringLists slists(db);
+
+  // Generate a list of random keys and values
+  const int kWordCount = 15;
+  std::string words[] = {"sdasd",     "triejf",       "fnjsdfn",  "dfjisdfsf",
+                         "342839",    "dsuha",        "mabuais",  "sadajsid",
+                         "jf9834hf",  "2d9j89",       "dj9823jd", "a",
+                         "dk02ed2dh", "$(jd4h984$(*", "mabz"};
+  const int kKeyCount = 6;
+  std::string keys[] = {"dhaiusdhu", "denidw", "daisda",
+                        "keykey",    "muki",   "shzassdianmd"};
+
+  // Will store a local copy of all data in order to verify correctness
+  std::map<std::string, std::string> parallel_copy;
+
+  // Generate a bunch of random queries (Append and Get)!
+  enum query_t { APPEND_OP, GET_OP, NUM_OPS };
+  Random randomGen(9138204);  // deterministic seed
+
+  const int kNumQueries = 1000;
+  for (int q = 0; q < kNumQueries; ++q) {
+    // Generate a random query (Append or Get) and random parameters
+    query_t query = (query_t)randomGen.Uniform((int)NUM_OPS);
+    std::string key = keys[randomGen.Uniform((int)kKeyCount)];
+    std::string word = words[randomGen.Uniform((int)kWordCount)];
+
+    // Apply the query and any checks.
+    if (query == APPEND_OP) {
+      // Apply the rocksdb test-harness Append defined above
+      slists.Append(key, word);  // apply the rocksdb append
+
+      // Apply the similar "Append" to the parallel copy
+      if (parallel_copy[key].size() > 0) {
+        parallel_copy[key] += " " + word;
+      } else {
+        parallel_copy[key] = word;
+      }
+
+    } else if (query == GET_OP) {
+      // Assumes that a non-existent key just returns <empty>
+      std::string res;
+      slists.Get(key, &res);
+      ASSERT_EQ(res, parallel_copy[key]);
+    }
+  }
+}
+
+TEST_P(StringAppendOperatorTest, PersistentVariousKeys) {
+  // Perform the following operations in limited scope
+  {
+    auto db = OpenDb("\n");
+    StringLists slists(db);
+
+    slists.Append("c", "asdasd");
+    slists.Append("a", "x");
+    slists.Append("b", "y");
+    slists.Append("a", "t");
+    slists.Append("a", "r");
+    slists.Append("b", "2");
+    slists.Append("c", "asdasd");
+
+    std::string a, b, c;
+    ASSERT_TRUE(slists.Get("a", &a));
+    ASSERT_TRUE(slists.Get("b", &b));
+    ASSERT_TRUE(slists.Get("c", &c));
+
+    ASSERT_EQ(a, "x\nt\nr");
+    ASSERT_EQ(b, "y\n2");
+    ASSERT_EQ(c, "asdasd\nasdasd");
+  }
+
+  // Reopen the database (the previous changes should persist / be remembered)
+  {
+    auto db = OpenDb("\n");
+    StringLists slists(db);
+
+    slists.Append("c", "bbnagnagsx");
+    slists.Append("a", "sa");
+    slists.Append("b", "df");
+    slists.Append("a", "gh");
+    slists.Append("a", "jk");
+    slists.Append("b", "l;");
+    slists.Append("c", "rogosh");
+
+    // The previous changes should be on disk (L0)
+    // The most recent changes should be in memory (MemTable)
+    // Hence, this will test both Get() paths.
+    std::string a, b, c;
+    ASSERT_TRUE(slists.Get("a", &a));
+    ASSERT_TRUE(slists.Get("b", &b));
+    ASSERT_TRUE(slists.Get("c", &c));
+
+    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+    ASSERT_EQ(b, "y\n2\ndf\nl;");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+  }
+
+  // Reopen the database (the previous changes should persist / be remembered)
+  {
+    auto db = OpenDb("\n");
+    StringLists slists(db);
+
+    // All changes should be on disk. This will test VersionSet Get()
+    std::string a, b, c;
+    ASSERT_TRUE(slists.Get("a", &a));
+    ASSERT_TRUE(slists.Get("b", &b));
+    ASSERT_TRUE(slists.Get("c", &c));
+
+    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+    ASSERT_EQ(b, "y\n2\ndf\nl;");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+  }
+}
+
+TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) {
+  // Perform the following operations in limited scope
+  {
+    auto db = OpenDb("\n");
+    StringLists slists(db);
+    std::string a, b, c;
+
+    // Append, Flush, Get
+    slists.Append("c", "asdasd");
+    ASSERT_OK(db->Flush(ROCKSDB_NAMESPACE::FlushOptions()));
+    ASSERT_TRUE(slists.Get("c", &c));
+    ASSERT_EQ(c, "asdasd");
+
+    // Append, Flush, Append, Get
+    slists.Append("a", "x");
+    slists.Append("b", "y");
+    ASSERT_OK(db->Flush(ROCKSDB_NAMESPACE::FlushOptions()));
+    slists.Append("a", "t");
+    slists.Append("a", "r");
+    slists.Append("b", "2");
+
+    ASSERT_TRUE(slists.Get("a", &a));
+    ASSERT_EQ(a, "x\nt\nr");
+
+    ASSERT_TRUE(slists.Get("b", &b));
+    ASSERT_EQ(b, "y\n2");
+
+    // Append, Get
+    ASSERT_TRUE(slists.Append("c", "asdasd"));
+    ASSERT_TRUE(slists.Append("b", "monkey"));
+
+    ASSERT_TRUE(slists.Get("a", &a));
+    ASSERT_TRUE(slists.Get("b", &b));
+    ASSERT_TRUE(slists.Get("c", &c));
+
+    ASSERT_EQ(a, "x\nt\nr");
+    ASSERT_EQ(b, "y\n2\nmonkey");
+    ASSERT_EQ(c, "asdasd\nasdasd");
+  }
+
+  // Reopen the database (the previous changes should persist / be remembered)
+  {
+    auto db = OpenDb("\n");
+    StringLists slists(db);
+    std::string a, b, c;
+
+    // Get (Quick check for persistence of previous database)
+    ASSERT_TRUE(slists.Get("a", &a));
+    ASSERT_EQ(a, "x\nt\nr");
+
+    // Append, Compact, Get
+    slists.Append("c", "bbnagnagsx");
+    slists.Append("a", "sa");
+    slists.Append("b", "df");
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    ASSERT_TRUE(slists.Get("a", &a));
+    ASSERT_TRUE(slists.Get("b", &b));
+    ASSERT_TRUE(slists.Get("c", &c));
+    ASSERT_EQ(a, "x\nt\nr\nsa");
+    ASSERT_EQ(b, "y\n2\nmonkey\ndf");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx");
+
+    // Append, Get
+    slists.Append("a", "gh");
+    slists.Append("a", "jk");
+    slists.Append("b", "l;");
+    slists.Append("c", "rogosh");
+    ASSERT_TRUE(slists.Get("a", &a));
+    ASSERT_TRUE(slists.Get("b", &b));
+    ASSERT_TRUE(slists.Get("c", &c));
+    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+    ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+
+    // Compact, Get
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
+    ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;");
+    ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
+
+    // Append, Flush, Compact, Get
+    slists.Append("b", "afcg");
+    ASSERT_OK(db->Flush(ROCKSDB_NAMESPACE::FlushOptions()));
+    ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    ASSERT_TRUE(slists.Get("b", &b));
+    ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;\nafcg");
+  }
+}
+
+TEST_P(StringAppendOperatorTest, SimpleTestNullDelimiter) {
+  auto db = OpenDb(std::string(1, '\0'));
+  StringLists slists(db);
+
+  slists.Append("k1", "v1");
+  slists.Append("k1", "v2");
+  slists.Append("k1", "v3");
+
+  std::string res;
+  ASSERT_TRUE(slists.Get("k1", &res));
+
+  // Construct the desired string. Default constructor doesn't like '\0' chars.
+  std::string checker("v1,v2,v3");  // Verify that the string is right size.
+  checker[2] = '\0';                // Use null delimiter instead of comma.
+  checker[5] = '\0';
+  ASSERT_EQ(checker.size(), 8);  // Verify it is still the correct size
+
+  // Check that the rocksdb result string matches the desired string
+  ASSERT_EQ(res.size(), checker.size());
+  ASSERT_EQ(res, checker);
+}
+
+INSTANTIATE_TEST_CASE_P(StringAppendOperatorTest, StringAppendOperatorTest,
+                        testing::Bool());
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/merge_operators/uint64add.cc b/src/rocksdb/utilities/merge_operators/uint64add.cc
new file mode 100644
index 000000000..5be2f5641
--- /dev/null
+++ b/src/rocksdb/utilities/merge_operators/uint64add.cc
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+
+#include "logging/logging.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace {  // anonymous namespace
+
+using ROCKSDB_NAMESPACE::AssociativeMergeOperator;
+using ROCKSDB_NAMESPACE::InfoLogLevel;
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::Slice;
+
+// A 'model' merge operator with uint64 addition semantics
+// Implemented as an AssociativeMergeOperator for simplicity and example.
+class UInt64AddOperator : public AssociativeMergeOperator {
+ public:
+  bool Merge(const Slice& /*key*/, const Slice* existing_value,
+             const Slice& value, std::string* new_value,
+             Logger* logger) const override {
+    uint64_t orig_value = 0;
+    if (existing_value) {
+      orig_value = DecodeInteger(*existing_value, logger);
+    }
+    uint64_t operand = DecodeInteger(value, logger);
+
+    assert(new_value);
+    new_value->clear();
+    ROCKSDB_NAMESPACE::PutFixed64(new_value, orig_value + operand);
+
+    return true;  // Return true always since corruption will be treated as 0
+  }
+
+  static const char* kClassName() { return "UInt64AddOperator"; }
+  static const char* kNickName() { return "uint64add"; }
+  const char* Name() const override { return kClassName(); }
+  const char* NickName() const override { return kNickName(); }
+
+ private:
+  // Takes the string and decodes it into a uint64_t
+  // On error, prints a message and returns 0
+  uint64_t DecodeInteger(const Slice& value, Logger* logger) const {
+    uint64_t result = 0;
+
+    if (value.size() == sizeof(uint64_t)) {
+      result = ROCKSDB_NAMESPACE::DecodeFixed64(value.data());
+    } else if (logger != nullptr) {
+      // If value is corrupted, treat it as 0
+      ROCKS_LOG_ERROR(logger,
+                      "uint64 value corruption, size: %" ROCKSDB_PRIszt
+                      " > %" ROCKSDB_PRIszt,
+                      value.size(), sizeof(uint64_t));
+    }
+
+    return result;
+  }
+};
+
+}  // anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<MergeOperator> MergeOperators::CreateUInt64AddOperator() {
+  return std::make_shared<UInt64AddOperator>();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/object_registry.cc b/src/rocksdb/utilities/object_registry.cc
new file mode 100644
index 000000000..18834783d
--- /dev/null
+++ b/src/rocksdb/utilities/object_registry.cc
@@ -0,0 +1,383 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/object_registry.h"
+
+#include <ctype.h>
+
+#include "logging/logging.h"
+#include "port/lang.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/env.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+namespace {
+bool MatchesInteger(const std::string &target, size_t start, size_t pos) {
+  // If it is numeric, everything up to the match must be a number
+  int digits = 0;
+  if (target[start] == '-') {
+    start++;  // Allow negative numbers
+  }
+  while (start < pos) {
+    if (!isdigit(target[start++])) {
+      return false;
+    } else {
+      digits++;
+    }
+  }
+  return (digits > 0);
+}
+
+bool MatchesDecimal(const std::string &target, size_t start, size_t pos) {
+  int digits = 0;
+  if (target[start] == '-') {
+    start++;  // Allow negative numbers
+  }
+  for (bool point = false; start < pos; start++) {
+    if (target[start] == '.') {
+      if (point) {
+        return false;
+      } else {
+        point = true;
+      }
+    } else if (!isdigit(target[start])) {
+      return false;
+    } else {
+      digits++;
+    }
+  }
+  return (digits > 0);
+}
+}  // namespace
+
+size_t ObjectLibrary::PatternEntry::MatchSeparatorAt(
+    size_t start, Quantifier mode, const std::string &target, size_t tlen,
+    const std::string &separator) const {
+  size_t slen = separator.size();
+  // See if there is enough space.  If so, find the separator
+  if (tlen < start + slen) {
+    return std::string::npos;  // not enough space left
+  } else if (mode == kMatchExact) {
+    // Exact mode means the next thing we are looking for is the separator
+    if (target.compare(start, slen, separator) != 0) {
+      return std::string::npos;
+    } else {
+      return start + slen;  // Found the separator, return where we found it
+    }
+  } else {
+    auto pos = start + 1;
+    if (!separator.empty()) {
+      pos = target.find(separator, pos);
+    }
+    if (pos == std::string::npos) {
+      return pos;
+    } else if (mode == kMatchInteger) {
+      if (!MatchesInteger(target, start, pos)) {
+        return std::string::npos;
+      }
+    } else if (mode == kMatchDecimal) {
+      if (!MatchesDecimal(target, start, pos)) {
+        return std::string::npos;
+      }
+    }
+    return pos + slen;
+  }
+}
+
+bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name,
+                                                size_t nlen,
+                                                const std::string &target,
+                                                size_t tlen) const {
+  if (separators_.empty()) {
+    assert(optional_);  // If there are no separators, it must be only a name
+    return nlen == tlen && name == target;
+  } else if (nlen == tlen) {  // The lengths are the same
+    return optional_ && name == target;
+  } else if (tlen < nlen + slength_) {
+    // The target is not long enough
+    return false;
+  } else if (target.compare(0, nlen, name) != 0) {
+    return false;  // Target does not start with name
+  } else {
+    // Loop through all of the separators one at a time matching them.
+    // Note that we first match the separator and then its quantifiers.
+    // Since we expect the separator first, we start with an exact match
+    // Subsequent matches will use the quantifier of the previous separator
+    size_t start = nlen;
+    auto mode = kMatchExact;
+    for (size_t idx = 0; idx < separators_.size(); ++idx) {
+      const auto &separator = separators_[idx];
+      start = MatchSeparatorAt(start, mode, target, tlen, separator.first);
+      if (start == std::string::npos) {
+        return false;
+      } else {
+        mode = separator.second;
+      }
+    }
+    // We have matched all of the separators.  Now check that what is left
+    // unmatched in the target is acceptable.
+    if (mode == kMatchExact) {
+      return (start == tlen);
+    } else if (start > tlen || (start == tlen && mode != kMatchZeroOrMore)) {
+      return false;
+    } else if (mode == kMatchInteger) {
+      return MatchesInteger(target, start, tlen);
+    } else if (mode == kMatchDecimal) {
+      return MatchesDecimal(target, start, tlen);
+    }
+  }
+  return true;
+}
+
+bool ObjectLibrary::PatternEntry::Matches(const std::string &target) const {
+  auto tlen = target.size();
+  if (MatchesTarget(name_, nlength_, target, tlen)) {
+    return true;
+  } else if (!names_.empty()) {
+    for (const auto &alt : names_) {
+      if (MatchesTarget(alt, alt.size(), target, tlen)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+size_t ObjectLibrary::GetFactoryCount(size_t *types) const {
+  std::unique_lock<std::mutex> lock(mu_);
+  *types = factories_.size();
+  size_t factories = 0;
+  for (const auto &e : factories_) {
+    factories += e.second.size();
+  }
+  return factories;
+}
+
+size_t ObjectLibrary::GetFactoryCount(const std::string &type) const {
+  std::unique_lock<std::mutex> lock(mu_);
+  auto iter = factories_.find(type);
+  if (iter != factories_.end()) {
+    return iter->second.size();
+  } else {
+    return 0;
+  }
+}
+
+void ObjectLibrary::GetFactoryNames(const std::string &type,
+                                    std::vector<std::string> *names) const {
+  assert(names);
+  std::unique_lock<std::mutex> lock(mu_);
+  auto iter = factories_.find(type);
+  if (iter != factories_.end()) {
+    for (const auto &f : iter->second) {
+      names->push_back(f->Name());
+    }
+  }
+}
+
+void ObjectLibrary::GetFactoryTypes(
+    std::unordered_set<std::string> *types) const {
+  assert(types);
+  std::unique_lock<std::mutex> lock(mu_);
+  for (const auto &iter : factories_) {
+    types->insert(iter.first);
+  }
+}
+
+void ObjectLibrary::Dump(Logger *logger) const {
+  std::unique_lock<std::mutex> lock(mu_);
+  if (logger != nullptr && !factories_.empty()) {
+    ROCKS_LOG_HEADER(logger, "    Registered Library: %s\n", id_.c_str());
+    for (const auto &iter : factories_) {
+      ROCKS_LOG_HEADER(logger, "    Registered factories for type[%s] ",
+                       iter.first.c_str());
+      bool printed_one = false;
+      for (const auto &e : iter.second) {
+        ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':', e->Name());
+        printed_one = true;
+      }
+    }
+  }
+}
+
+// Returns the Default singleton instance of the ObjectLibrary
+// This instance will contain most of the "standard" registered objects
+std::shared_ptr<ObjectLibrary> &ObjectLibrary::Default() {
+  // Use avoid destruction here so the default ObjectLibrary will not be
+  // statically destroyed and long-lived.
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<ObjectLibrary>, instance)
+  (std::make_shared<ObjectLibrary>("default"));
+  return instance;
+}
+
+ObjectRegistry::ObjectRegistry(const std::shared_ptr<ObjectLibrary> &library) {
+  libraries_.push_back(library);
+  for (const auto &b : builtins_) {
+    RegisterPlugin(b.first, b.second);
+  }
+}
+
+std::shared_ptr<ObjectRegistry> ObjectRegistry::Default() {
+  // Use avoid destruction here so the default ObjectRegistry will not be
+  // statically destroyed and long-lived.
+  STATIC_AVOID_DESTRUCTION(std::shared_ptr<ObjectRegistry>, instance)
+  (std::make_shared<ObjectRegistry>(ObjectLibrary::Default()));
+  return instance;
+}
+
+std::shared_ptr<ObjectRegistry> ObjectRegistry::NewInstance() {
+  return std::make_shared<ObjectRegistry>(Default());
+}
+
+std::shared_ptr<ObjectRegistry> ObjectRegistry::NewInstance(
+    const std::shared_ptr<ObjectRegistry> &parent) {
+  return std::make_shared<ObjectRegistry>(parent);
+}
+
+Status ObjectRegistry::SetManagedObject(
+    const std::string &type, const std::string &id,
+    const std::shared_ptr<Customizable> &object) {
+  std::string object_key = ToManagedObjectKey(type, id);
+  std::shared_ptr<Customizable> curr;
+  if (parent_ != nullptr) {
+    curr = parent_->GetManagedObject(type, id);
+  }
+  if (curr == nullptr) {
+    // We did not find the object in any parent.  Update in the current
+    std::unique_lock<std::mutex> lock(objects_mutex_);
+    auto iter = managed_objects_.find(object_key);
+    if (iter != managed_objects_.end()) {  // The object exists
+      curr = iter->second.lock();
+      if (curr != nullptr && curr != object) {
+        return Status::InvalidArgument("Object already exists: ", object_key);
+      } else {
+        iter->second = object;
+      }
+    } else {
+      // The object does not exist.  Add it
+      managed_objects_[object_key] = object;
+    }
+  } else if (curr != object) {
+    return Status::InvalidArgument("Object already exists: ", object_key);
+  }
+  return Status::OK();
+}
+
+std::shared_ptr<Customizable> ObjectRegistry::GetManagedObject(
+    const std::string &type, const std::string &id) const {
+  {
+    std::unique_lock<std::mutex> lock(objects_mutex_);
+    auto iter = managed_objects_.find(ToManagedObjectKey(type, id));
+    if (iter != managed_objects_.end()) {
+      return iter->second.lock();
+    }
+  }
+  if (parent_ != nullptr) {
+    return parent_->GetManagedObject(type, id);
+  } else {
+    return nullptr;
+  }
+}
+
+Status ObjectRegistry::ListManagedObjects(
+    const std::string &type, const std::string &name,
+    std::vector<std::shared_ptr<Customizable>> *results) const {
+  {
+    std::string key = ToManagedObjectKey(type, name);
+    std::unique_lock<std::mutex> lock(objects_mutex_);
+    for (auto iter = managed_objects_.lower_bound(key);
+         iter != managed_objects_.end() && StartsWith(iter->first, key);
+         ++iter) {
+      auto shared = iter->second.lock();
+      if (shared != nullptr) {
+        if (name.empty() || shared->IsInstanceOf(name)) {
+          results->emplace_back(shared);
+        }
+      }
+    }
+  }
+  if (parent_ != nullptr) {
+    return parent_->ListManagedObjects(type, name, results);
+  } else {
+    return Status::OK();
+  }
+}
+
+// Returns the number of registered types for this registry.
+// If specified (not-null), types is updated to include the names of the
+// registered types.
+size_t ObjectRegistry::GetFactoryCount(const std::string &type) const {
+  size_t count = 0;
+  if (parent_ != nullptr) {
+    count = parent_->GetFactoryCount(type);
+  }
+  std::unique_lock<std::mutex> lock(library_mutex_);
+  for (const auto &library : libraries_) {
+    count += library->GetFactoryCount(type);
+  }
+  return count;
+}
+
+void ObjectRegistry::GetFactoryNames(const std::string &type,
+                                     std::vector<std::string> *names) const {
+  assert(names);
+  names->clear();
+  if (parent_ != nullptr) {
+    parent_->GetFactoryNames(type, names);
+  }
+  std::unique_lock<std::mutex> lock(library_mutex_);
+  for (const auto &library : libraries_) {
+    library->GetFactoryNames(type, names);
+  }
+}
+
+void ObjectRegistry::GetFactoryTypes(
+    std::unordered_set<std::string> *types) const {
+  assert(types);
+  if (parent_ != nullptr) {
+    parent_->GetFactoryTypes(types);
+  }
+  std::unique_lock<std::mutex> lock(library_mutex_);
+  for (const auto &library : libraries_) {
+    library->GetFactoryTypes(types);
+  }
+}
+
+void ObjectRegistry::Dump(Logger *logger) const {
+  if (logger != nullptr) {
+    std::unique_lock<std::mutex> lock(library_mutex_);
+    if (!plugins_.empty()) {
+      ROCKS_LOG_HEADER(logger, "    Registered Plugins:");
+      bool printed_one = false;
+      for (const auto &plugin : plugins_) {
+        ROCKS_LOG_HEADER(logger, "%s%s", (printed_one) ? ", " : " ",
+                         plugin.c_str());
+        printed_one = true;
+      }
+      ROCKS_LOG_HEADER(logger, "\n");
+    }
+    for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); ++iter) {
+      iter->get()->Dump(logger);
+    }
+  }
+  if (parent_ != nullptr) {
+    parent_->Dump(logger);
+  }
+}
+
+int ObjectRegistry::RegisterPlugin(const std::string &name,
+                                   const RegistrarFunc &func) {
+  if (!name.empty() && func != nullptr) {
+    plugins_.push_back(name);
+    return AddLibrary(name)->Register(func, name);
+  } else {
+    return -1;
+  }
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/object_registry_test.cc b/src/rocksdb/utilities/object_registry_test.cc
new file mode 100644
index 000000000..90cd155ee
--- /dev/null
+++ b/src/rocksdb/utilities/object_registry_test.cc
@@ -0,0 +1,872 @@
+// Copyright (c) 2016-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/object_registry.h"
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/customizable.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ObjRegistryTest : public testing::Test {
+ public:
+  static int num_a, num_b;
+};
+
+int ObjRegistryTest::num_a = 0;
+int ObjRegistryTest::num_b = 0;
+static FactoryFunc<Env> test_reg_a = ObjectLibrary::Default()->AddFactory<Env>(
+    ObjectLibrary::PatternEntry("a", false).AddSeparator("://"),
+    [](const std::string& /*uri*/, std::unique_ptr<Env>* /*env_guard*/,
+       std::string* /* errmsg */) {
+      ++ObjRegistryTest::num_a;
+      return Env::Default();
+    });
+
+class WrappedEnv : public EnvWrapper {
+ private:
+  std::string id_;
+
+ public:
+  WrappedEnv(Env* t, const std::string& id) : EnvWrapper(t), id_(id) {}
+  const char* Name() const override { return id_.c_str(); }
+  std::string GetId() const override { return id_; }
+};
+static FactoryFunc<Env> test_reg_b = ObjectLibrary::Default()->AddFactory<Env>(
+    ObjectLibrary::PatternEntry("b", false).AddSeparator("://"),
+    [](const std::string& uri, std::unique_ptr<Env>* env_guard,
+       std::string* /* errmsg */) {
+      ++ObjRegistryTest::num_b;
+      // Env::Default() is a singleton so we can't grant ownership directly to
+      // the caller - we must wrap it first.
+      env_guard->reset(new WrappedEnv(Env::Default(), uri));
+      return env_guard->get();
+    });
+
+TEST_F(ObjRegistryTest, Basics) {
+  std::string msg;
+  std::unique_ptr<Env> guard;
+  Env* a_env = nullptr;
+
+  auto registry = ObjectRegistry::NewInstance();
+  ASSERT_NOK(registry->NewStaticObject<Env>("c://test", &a_env));
+  ASSERT_NOK(registry->NewUniqueObject<Env>("c://test", &guard));
+  ASSERT_EQ(a_env, nullptr);
+  ASSERT_EQ(guard, nullptr);
+  ASSERT_EQ(0, num_a);
+  ASSERT_EQ(0, num_b);
+
+  ASSERT_OK(registry->NewStaticObject<Env>("a://test", &a_env));
+  ASSERT_NE(a_env, nullptr);
+  ASSERT_EQ(1, num_a);
+  ASSERT_EQ(0, num_b);
+
+  ASSERT_OK(registry->NewUniqueObject<Env>("b://test", &guard));
+  ASSERT_NE(guard, nullptr);
+  ASSERT_EQ(1, num_a);
+  ASSERT_EQ(1, num_b);
+
+  Env* b_env = nullptr;
+  ASSERT_NOK(registry->NewStaticObject<Env>("b://test", &b_env));
+  ASSERT_EQ(b_env, nullptr);
+  ASSERT_EQ(1, num_a);
+  ASSERT_EQ(2, num_b);  // Created but rejected as not static
+
+  b_env = a_env;
+  ASSERT_NOK(registry->NewStaticObject<Env>("b://test", &b_env));
+  ASSERT_EQ(b_env, a_env);
+  ASSERT_EQ(1, num_a);
+  ASSERT_EQ(3, num_b);
+
+  b_env = guard.get();
+  ASSERT_NOK(registry->NewUniqueObject<Env>("a://test", &guard));
+  ASSERT_EQ(guard.get(), b_env);  // Unchanged
+  ASSERT_EQ(2, num_a);            // Created one but rejected it as not unique
+  ASSERT_EQ(3, num_b);
+}
+
+TEST_F(ObjRegistryTest, LocalRegistry) {
+  Env* env = nullptr;
+  auto registry = ObjectRegistry::NewInstance();
+  std::shared_ptr<ObjectLibrary> library =
+      std::make_shared<ObjectLibrary>("local");
+  registry->AddLibrary(library);
+  library->AddFactory<Env>(
+      "test-local",
+      [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard */,
+         std::string* /* errmsg */) { return Env::Default(); });
+
+  ObjectLibrary::Default()->AddFactory<Env>(
+      "test-global",
+      [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard */,
+         std::string* /* errmsg */) { return Env::Default(); });
+
+  ASSERT_NOK(
+      ObjectRegistry::NewInstance()->NewStaticObject<Env>("test-local", &env));
+  ASSERT_EQ(env, nullptr);
+  ASSERT_OK(
+      ObjectRegistry::NewInstance()->NewStaticObject<Env>("test-global", &env));
+  ASSERT_NE(env, nullptr);
+  ASSERT_OK(registry->NewStaticObject<Env>("test-local", &env));
+  ASSERT_NE(env, nullptr);
+  ASSERT_OK(registry->NewStaticObject<Env>("test-global", &env));
+  ASSERT_NE(env, nullptr);
+}
+
+static int RegisterTestUnguarded(ObjectLibrary& library,
+                                 const std::string& /*arg*/) {
+  library.AddFactory<Env>(
+      "unguarded",
+      [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard */,
+         std::string* /* errmsg */) { return Env::Default(); });
+  library.AddFactory<Env>(
+      "guarded", [](const std::string& uri, std::unique_ptr<Env>* guard,
+                    std::string* /* errmsg */) {
+        guard->reset(new WrappedEnv(Env::Default(), uri));
+        return guard->get();
+      });
+  return 2;
+}
+
+TEST_F(ObjRegistryTest, CheckShared) {
+  std::shared_ptr<Env> shared;
+  std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+  registry->AddLibrary("shared", RegisterTestUnguarded, "");
+
+  ASSERT_OK(registry->NewSharedObject<Env>("guarded", &shared));
+  ASSERT_NE(shared, nullptr);
+  shared.reset();
+  ASSERT_NOK(registry->NewSharedObject<Env>("unguarded", &shared));
+  ASSERT_EQ(shared, nullptr);
+}
+
+TEST_F(ObjRegistryTest, CheckStatic) {
+  Env* env = nullptr;
+  std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+  registry->AddLibrary("static", RegisterTestUnguarded, "");
+
+  ASSERT_NOK(registry->NewStaticObject<Env>("guarded", &env));
+  ASSERT_EQ(env, nullptr);
+  env = nullptr;
+  ASSERT_OK(registry->NewStaticObject<Env>("unguarded", &env));
+  ASSERT_NE(env, nullptr);
+}
+
+TEST_F(ObjRegistryTest, CheckUnique) {
+  std::unique_ptr<Env> unique;
+  std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+  registry->AddLibrary("unique", RegisterTestUnguarded, "");
+
+  ASSERT_OK(registry->NewUniqueObject<Env>("guarded", &unique));
+  ASSERT_NE(unique, nullptr);
+  unique.reset();
+  ASSERT_NOK(registry->NewUniqueObject<Env>("unguarded", &unique));
+  ASSERT_EQ(unique, nullptr);
+}
+
+TEST_F(ObjRegistryTest, FailingFactory) {
+  std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+  std::shared_ptr<ObjectLibrary> library =
+      std::make_shared<ObjectLibrary>("failing");
+  registry->AddLibrary(library);
+  library->AddFactory<Env>(
+      "failing", [](const std::string& /*uri*/,
+                    std::unique_ptr<Env>* /*guard */, std::string* errmsg) {
+        *errmsg = "Bad Factory";
+        return nullptr;
+      });
+  std::unique_ptr<Env> unique;
+  std::shared_ptr<Env> shared;
+  Env* pointer = nullptr;
+  Status s;
+  s = registry->NewUniqueObject<Env>("failing", &unique);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  s = registry->NewSharedObject<Env>("failing", &shared);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  s = registry->NewStaticObject<Env>("failing", &pointer);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  s = registry->NewUniqueObject<Env>("missing", &unique);
+  ASSERT_TRUE(s.IsNotSupported());
+  s = registry->NewSharedObject<Env>("missing", &shared);
+  ASSERT_TRUE(s.IsNotSupported());
+  s = registry->NewStaticObject<Env>("missing", &pointer);
+  ASSERT_TRUE(s.IsNotSupported());
+}
+
+TEST_F(ObjRegistryTest, TestRegistryParents) {
+  auto grand = ObjectRegistry::Default();
+  auto parent = ObjectRegistry::NewInstance();  // parent with a grandparent
+  auto uncle = ObjectRegistry::NewInstance(grand);
+  auto child = ObjectRegistry::NewInstance(parent);
+  auto cousin = ObjectRegistry::NewInstance(uncle);
+
+  auto library = parent->AddLibrary("parent");
+  library->AddFactory<Env>(
+      "parent", [](const std::string& uri, std::unique_ptr<Env>* guard,
+                   std::string* /* errmsg */) {
+        guard->reset(new WrappedEnv(Env::Default(), uri));
+        return guard->get();
+      });
+  library = cousin->AddLibrary("cousin");
+  library->AddFactory<Env>(
+      "cousin", [](const std::string& uri, std::unique_ptr<Env>* guard,
+                   std::string* /* errmsg */) {
+        guard->reset(new WrappedEnv(Env::Default(), uri));
+        return guard->get();
+      });
+
+  Env* env = nullptr;
+  std::unique_ptr<Env> guard;
+  std::string msg;
+
+  // a:://* is registered in Default, so they should all work
+  ASSERT_OK(parent->NewStaticObject<Env>("a://test", &env));
+  ASSERT_OK(child->NewStaticObject<Env>("a://test", &env));
+  ASSERT_OK(uncle->NewStaticObject<Env>("a://test", &env));
+  ASSERT_OK(cousin->NewStaticObject<Env>("a://test", &env));
+
+  // The parent env is only registered for parent, not uncle,
+  // So parent and child should return success and uncle and cousin should fail
+  ASSERT_OK(parent->NewUniqueObject<Env>("parent", &guard));
+  ASSERT_OK(child->NewUniqueObject<Env>("parent", &guard));
+  ASSERT_NOK(uncle->NewUniqueObject<Env>("parent", &guard));
+  ASSERT_NOK(cousin->NewUniqueObject<Env>("parent", &guard));
+
+  // The cousin is only registered in the cousin, so all of the others should
+  // fail
+  ASSERT_OK(cousin->NewUniqueObject<Env>("cousin", &guard));
+  ASSERT_NOK(parent->NewUniqueObject<Env>("cousin", &guard));
+  ASSERT_NOK(child->NewUniqueObject<Env>("cousin", &guard));
+  ASSERT_NOK(uncle->NewUniqueObject<Env>("cousin", &guard));
+}
+
+class MyCustomizable : public Customizable {
+ public:
+  static const char* Type() { return "MyCustomizable"; }
+  MyCustomizable(const char* prefix, const std::string& id) : id_(id) {
+    name_ = id_.substr(0, strlen(prefix) - 1);
+  }
+  const char* Name() const override { return name_.c_str(); }
+  std::string GetId() const override { return id_; }
+
+ private:
+  std::string id_;
+  std::string name_;
+};
+
+TEST_F(ObjRegistryTest, TestFactoryCount) {
+  std::string msg;
+  auto grand = ObjectRegistry::Default();
+  auto local = ObjectRegistry::NewInstance();
+  std::unordered_set<std::string> grand_types, local_types;
+  std::vector<std::string> grand_names, local_names;
+
+  // Check how many types we have on startup.
+  // Grand should equal local
+  grand->GetFactoryTypes(&grand_types);
+  local->GetFactoryTypes(&local_types);
+  ASSERT_EQ(grand_types, local_types);
+  size_t grand_count = grand->GetFactoryCount(Env::Type());
+  size_t local_count = local->GetFactoryCount(Env::Type());
+
+  ASSERT_EQ(grand_count, local_count);
+  grand->GetFactoryNames(Env::Type(), &grand_names);
+  local->GetFactoryNames(Env::Type(), &local_names);
+  ASSERT_EQ(grand_names.size(), grand_count);
+  ASSERT_EQ(local_names.size(), local_count);
+  ASSERT_EQ(grand_names, local_names);
+
+  // Add an Env to the local registry.
+  // This will add one factory.
+  auto library = local->AddLibrary("local");
+  library->AddFactory<Env>(
+      "A", [](const std::string& /*uri*/, std::unique_ptr<Env>* /*guard */,
+              std::string* /* errmsg */) { return nullptr; });
+  ASSERT_EQ(local_count + 1, local->GetFactoryCount(Env::Type()));
+  ASSERT_EQ(grand_count, grand->GetFactoryCount(Env::Type()));
+  local->GetFactoryTypes(&local_types);
+  local->GetFactoryNames(Env::Type(), &local_names);
+  ASSERT_EQ(grand_names.size() + 1, local_names.size());
+  ASSERT_EQ(local_names.size(), local->GetFactoryCount(Env::Type()));
+
+  if (grand_count == 0) {
+    // There were no Env when we started.  Should have one more type
+    // than previously
+    ASSERT_NE(grand_types, local_types);
+    ASSERT_EQ(grand_types.size() + 1, local_types.size());
+  } else {
+    // There was an Env type when we started.  The types should match
+    ASSERT_EQ(grand_types, local_types);
+  }
+
+  // Add a MyCustomizable to the registry.  This should be a new type
+  library->AddFactory<MyCustomizable>(
+      "MY", [](const std::string& /*uri*/,
+               std::unique_ptr<MyCustomizable>* /*guard */,
+               std::string* /* errmsg */) { return nullptr; });
+  ASSERT_EQ(local_count + 1, local->GetFactoryCount(Env::Type()));
+  ASSERT_EQ(grand_count, grand->GetFactoryCount(Env::Type()));
+  ASSERT_EQ(0U, grand->GetFactoryCount(MyCustomizable::Type()));
+  ASSERT_EQ(1U, local->GetFactoryCount(MyCustomizable::Type()));
+
+  local->GetFactoryNames(MyCustomizable::Type(), &local_names);
+  ASSERT_EQ(1U, local_names.size());
+  ASSERT_EQ(local_names[0], "MY");
+
+  local->GetFactoryTypes(&local_types);
+  ASSERT_EQ(grand_count == 0 ? 2 : grand_types.size() + 1, local_types.size());
+
+  // Add the same name again.  We should now have 2 factories.
+  library->AddFactory<MyCustomizable>(
+      "MY", [](const std::string& /*uri*/,
+               std::unique_ptr<MyCustomizable>* /*guard */,
+               std::string* /* errmsg */) { return nullptr; });
+  local->GetFactoryNames(MyCustomizable::Type(), &local_names);
+  ASSERT_EQ(2U, local_names.size());
+}
+
+TEST_F(ObjRegistryTest, TestManagedObjects) {
+  auto registry = ObjectRegistry::NewInstance();
+  auto m_a1 = std::make_shared<MyCustomizable>("", "A");
+  auto m_a2 = std::make_shared<MyCustomizable>("", "A");
+
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_OK(registry->SetManagedObject<MyCustomizable>(m_a1));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a1);
+
+  ASSERT_NOK(registry->SetManagedObject<MyCustomizable>(m_a2));
+  ASSERT_OK(registry->SetManagedObject<MyCustomizable>(m_a1));
+  m_a1.reset();
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_OK(registry->SetManagedObject<MyCustomizable>(m_a2));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a2);
+}
+
+TEST_F(ObjRegistryTest, TestTwoManagedObjects) {
+  auto registry = ObjectRegistry::NewInstance();
+  auto m_a = std::make_shared<MyCustomizable>("", "A");
+  auto m_b = std::make_shared<MyCustomizable>("", "B");
+  std::vector<std::shared_ptr<MyCustomizable>> objects;
+
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+  ASSERT_OK(registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 0U);
+  ASSERT_OK(registry->SetManagedObject(m_a));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a);
+  ASSERT_OK(registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 1U);
+  ASSERT_EQ(objects.front(), m_a);
+
+  ASSERT_OK(registry->SetManagedObject(m_b));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), m_b);
+  ASSERT_OK(registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 2U);
+  ASSERT_OK(registry->ListManagedObjects("A", &objects));
+  ASSERT_EQ(objects.size(), 1U);
+  ASSERT_EQ(objects.front(), m_a);
+  ASSERT_OK(registry->ListManagedObjects("B", &objects));
+  ASSERT_EQ(objects.size(), 1U);
+  ASSERT_EQ(objects.front(), m_b);
+  ASSERT_OK(registry->ListManagedObjects("C", &objects));
+  ASSERT_EQ(objects.size(), 0U);
+
+  m_a.reset();
+  objects.clear();
+
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), m_b);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_OK(registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 1U);
+  ASSERT_EQ(objects.front(), m_b);
+
+  m_b.reset();
+  objects.clear();
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+}
+
+TEST_F(ObjRegistryTest, TestAlternateNames) {
+  auto registry = ObjectRegistry::NewInstance();
+  auto m_a = std::make_shared<MyCustomizable>("", "A");
+  auto m_b = std::make_shared<MyCustomizable>("", "B");
+  std::vector<std::shared_ptr<MyCustomizable>> objects;
+  // Test no objects exist
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), nullptr);
+  ASSERT_OK(registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 0U);
+
+  // Mark "TheOne" to be A
+  ASSERT_OK(registry->SetManagedObject("TheOne", m_a));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), m_a);
+  ASSERT_OK(registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 1U);
+  ASSERT_EQ(objects.front(), m_a);
+
+  // Try to mark "TheOne" again.
+  ASSERT_NOK(registry->SetManagedObject("TheOne", m_b));
+  ASSERT_OK(registry->SetManagedObject("TheOne", m_a));
+
+  // Add "A" as a managed object.  Registered 2x
+  ASSERT_OK(registry->SetManagedObject(m_a));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("B"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), m_a);
+  ASSERT_OK(registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 2U);
+
+  // Delete "A".
+  m_a.reset();
+  objects.clear();
+
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), nullptr);
+  ASSERT_OK(registry->SetManagedObject("TheOne", m_b));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), m_b);
+  ASSERT_OK(registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 1U);
+  ASSERT_EQ(objects.front(), m_b);
+
+  m_b.reset();
+  objects.clear();
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("TheOne"), nullptr);
+  ASSERT_OK(registry->ListManagedObjects(&objects));
+  ASSERT_EQ(objects.size(), 0U);
+}
+
+TEST_F(ObjRegistryTest, TestTwoManagedClasses) {
+  class MyCustomizable2 : public MyCustomizable {
+   public:
+    static const char* Type() { return "MyCustomizable2"; }
+    MyCustomizable2(const char* prefix, const std::string& id)
+        : MyCustomizable(prefix, id) {}
+  };
+
+  auto registry = ObjectRegistry::NewInstance();
+  auto m_a1 = std::make_shared<MyCustomizable>("", "A");
+  auto m_a2 = std::make_shared<MyCustomizable2>("", "A");
+  std::vector<std::shared_ptr<MyCustomizable>> obj1s;
+  std::vector<std::shared_ptr<MyCustomizable2>> obj2s;
+
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), nullptr);
+
+  ASSERT_OK(registry->SetManagedObject(m_a1));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a1);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), nullptr);
+
+  ASSERT_OK(registry->SetManagedObject(m_a2));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), m_a2);
+  ASSERT_OK(registry->ListManagedObjects(&obj1s));
+  ASSERT_OK(registry->ListManagedObjects(&obj2s));
+  ASSERT_EQ(obj1s.size(), 1U);
+  ASSERT_EQ(obj2s.size(), 1U);
+  ASSERT_EQ(obj1s.front(), m_a1);
+  ASSERT_EQ(obj2s.front(), m_a2);
+  m_a1.reset();
+  obj1s.clear();
+  obj2s.clear();
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), m_a2);
+
+  m_a2.reset();
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable2>("A"), nullptr);
+}
+
+TEST_F(ObjRegistryTest, TestManagedObjectsWithParent) {
+  auto base = ObjectRegistry::NewInstance();
+  auto registry = ObjectRegistry::NewInstance(base);
+
+  auto m_a = std::make_shared<MyCustomizable>("", "A");
+  auto m_b = std::make_shared<MyCustomizable>("", "A");
+
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_OK(base->SetManagedObject(m_a));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_a);
+
+  ASSERT_NOK(registry->SetManagedObject(m_b));
+  ASSERT_OK(registry->SetManagedObject(m_a));
+
+  m_a.reset();
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), nullptr);
+  ASSERT_OK(registry->SetManagedObject(m_b));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("A"), m_b);
+}
+
+TEST_F(ObjRegistryTest, TestGetOrCreateManagedObject) {
+  auto registry = ObjectRegistry::NewInstance();
+  registry->AddLibrary("test")->AddFactory<MyCustomizable>(
+      ObjectLibrary::PatternEntry::AsIndividualId("MC"),
+      [](const std::string& uri, std::unique_ptr<MyCustomizable>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new MyCustomizable("MC", uri));
+        return guard->get();
+      });
+  std::shared_ptr<MyCustomizable> m_a, m_b, obj;
+  std::vector<std::shared_ptr<MyCustomizable>> objs;
+
+  std::unordered_map<std::string, std::string> opt_map;
+
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@A#1"), nullptr);
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@B#1"), nullptr);
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &m_a));
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &m_b));
+  ASSERT_EQ(registry->GetManagedObject<MyCustomizable>("MC@A#1"), m_a);
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &obj));
+  ASSERT_EQ(obj, m_a);
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &obj));
+  ASSERT_EQ(obj, m_b);
+  ASSERT_OK(registry->ListManagedObjects(&objs));
+  ASSERT_EQ(objs.size(), 2U);
+
+  objs.clear();
+  m_a.reset();
+  obj.reset();
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@A#1", &m_a));
+  ASSERT_EQ(1, m_a.use_count());
+  ASSERT_OK(registry->GetOrCreateManagedObject("MC@B#1", &obj));
+  ASSERT_EQ(2, obj.use_count());
+}
+
+TEST_F(ObjRegistryTest, RegisterPlugin) {
+  std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();
+  std::unique_ptr<Env> guard;
+  Env* env = nullptr;
+
+  ASSERT_NOK(registry->NewObject<Env>("unguarded", &env, &guard));
+  ASSERT_EQ(registry->RegisterPlugin("Missing", nullptr), -1);
+  ASSERT_EQ(registry->RegisterPlugin("", RegisterTestUnguarded), -1);
+  ASSERT_GT(registry->RegisterPlugin("Valid", RegisterTestUnguarded), 0);
+  ASSERT_OK(registry->NewObject<Env>("unguarded", &env, &guard));
+  ASSERT_NE(env, nullptr);
+}
+class PatternEntryTest : public testing::Test {};
+
+TEST_F(PatternEntryTest, TestSimpleEntry) {
+  ObjectLibrary::PatternEntry entry("ABC", true);
+
+  ASSERT_TRUE(entry.Matches("ABC"));
+  ASSERT_FALSE(entry.Matches("AABC"));
+  ASSERT_FALSE(entry.Matches("ABCA"));
+  ASSERT_FALSE(entry.Matches("AABCA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("BC"));
+  ASSERT_FALSE(entry.Matches("ABD"));
+  ASSERT_FALSE(entry.Matches("BCA"));
+}
+
+TEST_F(PatternEntryTest, TestPatternEntry) {
+  // Matches A:+
+  ObjectLibrary::PatternEntry entry("A", false);
+  entry.AddSeparator(":");
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("B"));
+  ASSERT_FALSE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA:B"));
+  ASSERT_FALSE(entry.Matches("AA:BB"));
+  ASSERT_TRUE(entry.Matches("A:B"));
+  ASSERT_TRUE(entry.Matches("A:BB"));
+
+  entry.SetOptional(true);  // Now matches "A" or "A:+"
+  ASSERT_TRUE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("B"));
+  ASSERT_FALSE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA:B"));
+  ASSERT_FALSE(entry.Matches("AA:BB"));
+  ASSERT_TRUE(entry.Matches("A:B"));
+  ASSERT_TRUE(entry.Matches("A:BB"));
+}
+
+TEST_F(PatternEntryTest, MatchZeroOrMore) {
+  // Matches A:*
+  ObjectLibrary::PatternEntry entry("A", false);
+  entry.AddSeparator(":", false);
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("B"));
+  ASSERT_TRUE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("B:"));
+  ASSERT_FALSE(entry.Matches("B:A"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA:B"));
+  ASSERT_FALSE(entry.Matches("AA:BB"));
+  ASSERT_TRUE(entry.Matches("A:B"));
+  ASSERT_TRUE(entry.Matches("A:BB"));
+
+  entry.SetOptional(true);  // Now matches "A" or "A:*"
+  ASSERT_TRUE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("B"));
+  ASSERT_TRUE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("B:"));
+  ASSERT_FALSE(entry.Matches("B:A"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA:B"));
+  ASSERT_FALSE(entry.Matches("AA:BB"));
+  ASSERT_TRUE(entry.Matches("A:B"));
+  ASSERT_TRUE(entry.Matches("A:BB"));
+}
+
+TEST_F(PatternEntryTest, TestSuffixEntry) {
+  ObjectLibrary::PatternEntry entry("AA", true);
+  entry.AddSuffix("BB");
+
+  ASSERT_TRUE(entry.Matches("AA"));
+  ASSERT_TRUE(entry.Matches("AABB"));
+
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AB"));
+  ASSERT_FALSE(entry.Matches("B"));
+  ASSERT_FALSE(entry.Matches("BB"));
+  ASSERT_FALSE(entry.Matches("ABA"));
+  ASSERT_FALSE(entry.Matches("BBAA"));
+  ASSERT_FALSE(entry.Matches("AABBA"));
+  ASSERT_FALSE(entry.Matches("AABBB"));
+}
+
+TEST_F(PatternEntryTest, TestNumericEntry) {
+  ObjectLibrary::PatternEntry entry("A", false);
+  entry.AddNumber(":");
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_TRUE(entry.Matches("A:1"));
+  ASSERT_TRUE(entry.Matches("A:11"));
+  ASSERT_FALSE(entry.Matches("AA:1"));
+  ASSERT_FALSE(entry.Matches("AA:11"));
+  ASSERT_FALSE(entry.Matches("A:B"));
+  ASSERT_FALSE(entry.Matches("A:1B"));
+  ASSERT_FALSE(entry.Matches("A:B1"));
+
+  entry.AddSeparator(":", false);
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_TRUE(entry.Matches("A:1:"));
+  ASSERT_TRUE(entry.Matches("A:11:"));
+  ASSERT_FALSE(entry.Matches("A:1"));
+  ASSERT_FALSE(entry.Matches("A:B1:"));
+  ASSERT_FALSE(entry.Matches("A:1B:"));
+  ASSERT_FALSE(entry.Matches("A::"));
+}
+
+TEST_F(PatternEntryTest, TestDoubleEntry) {
+  ObjectLibrary::PatternEntry entry("A", false);
+  entry.AddNumber(":", false);
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("A:"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA:1"));
+  ASSERT_FALSE(entry.Matches("AA:11"));
+  ASSERT_FALSE(entry.Matches("A:B"));
+  ASSERT_FALSE(entry.Matches("A:1B"));
+  ASSERT_FALSE(entry.Matches("A:B1"));
+  ASSERT_TRUE(entry.Matches("A:1"));
+  ASSERT_TRUE(entry.Matches("A:11"));
+  ASSERT_TRUE(entry.Matches("A:1.1"));
+  ASSERT_TRUE(entry.Matches("A:11.11"));
+  ASSERT_TRUE(entry.Matches("A:1."));
+  ASSERT_TRUE(entry.Matches("A:.1"));
+  ASSERT_TRUE(entry.Matches("A:0.1"));
+  ASSERT_TRUE(entry.Matches("A:1.0"));
+  ASSERT_TRUE(entry.Matches("A:1.0"));
+
+  ASSERT_FALSE(entry.Matches("A:1.0."));
+  ASSERT_FALSE(entry.Matches("A:1.0.2"));
+  ASSERT_FALSE(entry.Matches("A:.1.0"));
+  ASSERT_FALSE(entry.Matches("A:..10"));
+  ASSERT_FALSE(entry.Matches("A:10.."));
+  ASSERT_FALSE(entry.Matches("A:."));
+
+  entry.AddSeparator(":", false);
+  ASSERT_FALSE(entry.Matches("A:1"));
+  ASSERT_FALSE(entry.Matches("A:1.0"));
+
+  ASSERT_TRUE(entry.Matches("A:11:"));
+  ASSERT_TRUE(entry.Matches("A:1.1:"));
+  ASSERT_TRUE(entry.Matches("A:11.11:"));
+  ASSERT_TRUE(entry.Matches("A:1.:"));
+  ASSERT_TRUE(entry.Matches("A:.1:"));
+  ASSERT_TRUE(entry.Matches("A:0.1:"));
+  ASSERT_TRUE(entry.Matches("A:1.0:"));
+  ASSERT_TRUE(entry.Matches("A:1.0:"));
+
+  ASSERT_FALSE(entry.Matches("A:1.0.:"));
+  ASSERT_FALSE(entry.Matches("A:1.0.2:"));
+  ASSERT_FALSE(entry.Matches("A:.1.0:"));
+  ASSERT_FALSE(entry.Matches("A:..10:"));
+  ASSERT_FALSE(entry.Matches("A:10..:"));
+  ASSERT_FALSE(entry.Matches("A:.:"));
+  ASSERT_FALSE(entry.Matches("A::"));
+}
+
+TEST_F(PatternEntryTest, TestIndividualIdEntry) {
+  auto entry = ObjectLibrary::PatternEntry::AsIndividualId("AA");
+  ASSERT_TRUE(entry.Matches("AA"));
+  ASSERT_TRUE(entry.Matches("AA@123#456"));
+  ASSERT_TRUE(entry.Matches("AA@deadbeef#id"));
+
+  ASSERT_FALSE(entry.Matches("A"));
+  ASSERT_FALSE(entry.Matches("AAA"));
+  ASSERT_FALSE(entry.Matches("AA@123"));
+  ASSERT_FALSE(entry.Matches("AA@123#"));
+  ASSERT_FALSE(entry.Matches("AA@#123"));
+}
+
+TEST_F(PatternEntryTest, TestTwoNameEntry) {
+  ObjectLibrary::PatternEntry entry("A");
+  entry.AnotherName("B");
+  ASSERT_TRUE(entry.Matches("A"));
+  ASSERT_TRUE(entry.Matches("B"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("BB"));
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("BA"));
+  ASSERT_FALSE(entry.Matches("AB"));
+}
+
+TEST_F(PatternEntryTest, TestTwoPatternEntry) {
+  ObjectLibrary::PatternEntry entry("AA", false);
+  entry.AddSeparator(":");
+  entry.AddSeparator(":");
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AA::12"));
+  ASSERT_TRUE(entry.Matches("AA:1:2"));
+  ASSERT_TRUE(entry.Matches("AA:1:2:"));
+
+  ObjectLibrary::PatternEntry entry2("AA", false);
+  entry2.AddSeparator("::");
+  entry2.AddSeparator("##");
+  ASSERT_FALSE(entry2.Matches("AA"));
+  ASSERT_FALSE(entry2.Matches("AA:"));
+  ASSERT_FALSE(entry2.Matches("AA::"));
+  ASSERT_FALSE(entry2.Matches("AA::#"));
+  ASSERT_FALSE(entry2.Matches("AA::##"));
+  ASSERT_FALSE(entry2.Matches("AA##1::2"));
+  ASSERT_FALSE(entry2.Matches("AA::123##"));
+  ASSERT_TRUE(entry2.Matches("AA::1##2"));
+  ASSERT_TRUE(entry2.Matches("AA::12##34:"));
+  ASSERT_TRUE(entry2.Matches("AA::12::34##56"));
+  ASSERT_TRUE(entry2.Matches("AA::12##34::56"));
+}
+
+TEST_F(PatternEntryTest, TestTwoNumbersEntry) {
+  ObjectLibrary::PatternEntry entry("AA", false);
+  entry.AddNumber(":");
+  entry.AddNumber(":");
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AA:"));
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AA::12"));
+  ASSERT_FALSE(entry.Matches("AA:1:2:"));
+  ASSERT_TRUE(entry.Matches("AA:1:2"));
+  ASSERT_TRUE(entry.Matches("AA:12:23456"));
+
+  ObjectLibrary::PatternEntry entry2("AA", false);
+  entry2.AddNumber(":");
+  entry2.AddNumber("#");
+  ASSERT_FALSE(entry2.Matches("AA"));
+  ASSERT_FALSE(entry2.Matches("AA:"));
+  ASSERT_FALSE(entry2.Matches("AA:#"));
+  ASSERT_FALSE(entry2.Matches("AA#:"));
+  ASSERT_FALSE(entry2.Matches("AA:123#"));
+  ASSERT_FALSE(entry2.Matches("AA:123#B"));
+  ASSERT_FALSE(entry2.Matches("AA:B#123"));
+  ASSERT_TRUE(entry2.Matches("AA:1#2"));
+  ASSERT_FALSE(entry2.Matches("AA:123#23:"));
+  ASSERT_FALSE(entry2.Matches("AA::12#234"));
+}
+
+TEST_F(PatternEntryTest, TestPatternAndSuffix) {
+  ObjectLibrary::PatternEntry entry("AA", false);
+  entry.AddSeparator("::");
+  entry.AddSuffix("##");
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AA::##"));
+  ASSERT_FALSE(entry.Matches("AB::1##"));
+  ASSERT_FALSE(entry.Matches("AB::1##2"));
+  ASSERT_FALSE(entry.Matches("AA##1::"));
+  ASSERT_TRUE(entry.Matches("AA::1##"));
+  ASSERT_FALSE(entry.Matches("AA::1###"));
+
+  ObjectLibrary::PatternEntry entry2("AA", false);
+  entry2.AddSuffix("::");
+  entry2.AddSeparator("##");
+  ASSERT_FALSE(entry2.Matches("AA"));
+  ASSERT_FALSE(entry2.Matches("AA::"));
+  ASSERT_FALSE(entry2.Matches("AA::##"));
+  ASSERT_FALSE(entry2.Matches("AB::1##"));
+  ASSERT_FALSE(entry2.Matches("AB::1##2"));
+  ASSERT_TRUE(entry2.Matches("AA::##12"));
+}
+
+TEST_F(PatternEntryTest, TestTwoNamesAndPattern) {
+  ObjectLibrary::PatternEntry entry("AA", true);
+  entry.AddSeparator("::");
+  entry.AnotherName("BBB");
+  ASSERT_TRUE(entry.Matches("AA"));
+  ASSERT_TRUE(entry.Matches("AA::1"));
+  ASSERT_TRUE(entry.Matches("BBB"));
+  ASSERT_TRUE(entry.Matches("BBB::2"));
+
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AAA::"));
+  ASSERT_FALSE(entry.Matches("BBB::"));
+
+  entry.SetOptional(false);
+  ASSERT_FALSE(entry.Matches("AA"));
+  ASSERT_FALSE(entry.Matches("BBB"));
+
+  ASSERT_FALSE(entry.Matches("AA::"));
+  ASSERT_FALSE(entry.Matches("AAA::"));
+  ASSERT_FALSE(entry.Matches("BBB::"));
+
+  ASSERT_TRUE(entry.Matches("AA::1"));
+  ASSERT_TRUE(entry.Matches("BBB::2"));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else  // ROCKSDB_LITE
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as ObjRegistry is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/option_change_migration/option_change_migration.cc b/src/rocksdb/utilities/option_change_migration/option_change_migration.cc
new file mode 100644
index 000000000..e93d2152d
--- /dev/null
+++ b/src/rocksdb/utilities/option_change_migration/option_change_migration.cc
@@ -0,0 +1,186 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/option_change_migration.h"
+
+#ifndef ROCKSDB_LITE
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// Return a version of Options `opts` that allow us to open/write into a DB
+// without triggering an automatic compaction or stalling. This is guaranteed
+// by disabling automatic compactions and using huge values for stalling
+// triggers.
+Options GetNoCompactionOptions(const Options& opts) {
+  Options ret_opts = opts;
+  ret_opts.disable_auto_compactions = true;
+  ret_opts.level0_slowdown_writes_trigger = 999999;
+  ret_opts.level0_stop_writes_trigger = 999999;
+  ret_opts.soft_pending_compaction_bytes_limit = 0;
+  ret_opts.hard_pending_compaction_bytes_limit = 0;
+  return ret_opts;
+}
+
+Status OpenDb(const Options& options, const std::string& dbname,
+              std::unique_ptr<DB>* db) {
+  db->reset();
+  DB* tmpdb;
+  Status s = DB::Open(options, dbname, &tmpdb);
+  if (s.ok()) {
+    db->reset(tmpdb);
+  }
+  return s;
+}
+
+// l0_file_size specifies size of file on L0. Files will be range partitioned
+// after a full compaction so they are likely qualified to put on L0. If
+// left as 0, the files are compacted in a single file and put to L0. Otherwise,
+// will try to compact the files as size l0_file_size.
+Status CompactToLevel(const Options& options, const std::string& dbname,
+                      int dest_level, uint64_t l0_file_size, bool need_reopen) {
+  std::unique_ptr<DB> db;
+  Options no_compact_opts = GetNoCompactionOptions(options);
+  if (dest_level == 0) {
+    if (l0_file_size == 0) {
+      // Single file.
+      l0_file_size = 999999999999999;
+    }
+    // L0 has strict sequenceID requirements to files to it. It's safer
+    // to only put one compacted file to there.
+    // This is only used for converting to universal compaction with
+    // only one level. In this case, compacting to one file is also
+    // optimal.
+    no_compact_opts.target_file_size_base = l0_file_size;
+    no_compact_opts.max_compaction_bytes = l0_file_size;
+  }
+  Status s = OpenDb(no_compact_opts, dbname, &db);
+  if (!s.ok()) {
+    return s;
+  }
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = dest_level;
+  if (dest_level == 0) {
+    // cannot use kForceOptimized because the compaction is expected to
+    // generate one output file
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  }
+  s = db->CompactRange(cro, nullptr, nullptr);
+
+  if (s.ok() && need_reopen) {
+    // Need to restart DB to rewrite the manifest file.
+    // In order to open a DB with specific num_levels, the manifest file should
+    // contain no record that mentiones any level beyond num_levels. Issuing a
+    // full compaction will move all the data to a level not exceeding
+    // num_levels, but the manifest may still contain previous record mentioning
+    // a higher level. Reopening the DB will force the manifest to be rewritten
+    // so that those records will be cleared.
+    db.reset();
+    s = OpenDb(no_compact_opts, dbname, &db);
+  }
+  return s;
+}
+
+Status MigrateToUniversal(std::string dbname, const Options& old_opts,
+                          const Options& new_opts) {
+  if (old_opts.num_levels <= new_opts.num_levels ||
+      old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    return Status::OK();
+  } else {
+    bool need_compact = false;
+    {
+      std::unique_ptr<DB> db;
+      Options opts = GetNoCompactionOptions(old_opts);
+      Status s = OpenDb(opts, dbname, &db);
+      if (!s.ok()) {
+        return s;
+      }
+      ColumnFamilyMetaData metadata;
+      db->GetColumnFamilyMetaData(&metadata);
+      if (!metadata.levels.empty() &&
+          metadata.levels.back().level >= new_opts.num_levels) {
+        need_compact = true;
+      }
+    }
+    if (need_compact) {
+      return CompactToLevel(old_opts, dbname, new_opts.num_levels - 1,
+                            /*l0_file_size=*/0, true);
+    }
+    return Status::OK();
+  }
+}
+
+Status MigrateToLevelBase(std::string dbname, const Options& old_opts,
+                          const Options& new_opts) {
+  if (!new_opts.level_compaction_dynamic_level_bytes) {
+    if (old_opts.num_levels == 1) {
+      return Status::OK();
+    }
+    // Compact everything to level 1 to guarantee it can be safely opened.
+    Options opts = old_opts;
+    opts.target_file_size_base = new_opts.target_file_size_base;
+    // Although sometimes we can open the DB with the new option without error,
+    // We still want to compact the files to avoid the LSM tree to stuck
+    // in bad shape. For example, if the user changed the level size
+    // multiplier from 4 to 8, with the same data, we will have fewer
+    // levels. Unless we issue a full comaction, the LSM tree may stuck
+    // with more levels than needed and it won't recover automatically.
+    return CompactToLevel(opts, dbname, 1, /*l0_file_size=*/0, true);
+  } else {
+    // Compact everything to the last level to guarantee it can be safely
+    // opened.
+    if (old_opts.num_levels == 1) {
+      return Status::OK();
+    } else if (new_opts.num_levels > old_opts.num_levels) {
+      // Dynamic level mode requires data to be put in the last level first.
+      return CompactToLevel(new_opts, dbname, new_opts.num_levels - 1,
+                            /*l0_file_size=*/0, false);
+    } else {
+      Options opts = old_opts;
+      opts.target_file_size_base = new_opts.target_file_size_base;
+      return CompactToLevel(opts, dbname, new_opts.num_levels - 1,
+                            /*l0_file_size=*/0, true);
+    }
+  }
+}
+}  // namespace
+
+Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+                             const Options& new_opts) {
+  if (old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    // LSM generated by FIFO compaction can be opened by any compaction.
+    return Status::OK();
+  } else if (new_opts.compaction_style ==
+             CompactionStyle::kCompactionStyleUniversal) {
+    return MigrateToUniversal(dbname, old_opts, new_opts);
+  } else if (new_opts.compaction_style ==
+             CompactionStyle::kCompactionStyleLevel) {
+    return MigrateToLevelBase(dbname, old_opts, new_opts);
+  } else if (new_opts.compaction_style ==
+             CompactionStyle::kCompactionStyleFIFO) {
+    uint64_t l0_file_size = 0;
+    if (new_opts.compaction_options_fifo.max_table_files_size > 0) {
+      // Create at least 8 files when max_table_files_size hits, so that the DB
+      // doesn't just disappear. This in fact violates the FIFO condition, but
+      // otherwise, the migrated DB is unlikley to be usable.
+      l0_file_size = new_opts.compaction_options_fifo.max_table_files_size / 8;
+    }
+    return CompactToLevel(old_opts, dbname, 0, l0_file_size, true);
+  } else {
+    return Status::NotSupported(
+        "Do not how to migrate to this compaction style");
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+#else
+namespace ROCKSDB_NAMESPACE {
+Status OptionChangeMigration(std::string /*dbname*/,
+                             const Options& /*old_opts*/,
+                             const Options& /*new_opts*/) {
+  return Status::NotSupported();
+}
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc b/src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc
new file mode 100644
index 000000000..71af45db1
--- /dev/null
+++ b/src/rocksdb/utilities/option_change_migration/option_change_migration_test.cc
@@ -0,0 +1,550 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/utilities/option_change_migration.h"
+
+#include <set>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBOptionChangeMigrationTests
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<int, int, bool, int, int, bool, uint64_t>> {
+ public:
+  DBOptionChangeMigrationTests()
+      : DBTestBase("db_option_change_migration_test", /*env_do_fsync=*/true) {
+    level1_ = std::get<0>(GetParam());
+    compaction_style1_ = std::get<1>(GetParam());
+    is_dynamic1_ = std::get<2>(GetParam());
+
+    level2_ = std::get<3>(GetParam());
+    compaction_style2_ = std::get<4>(GetParam());
+    is_dynamic2_ = std::get<5>(GetParam());
+    fifo_max_table_files_size_ = std::get<6>(GetParam());
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  int level1_;
+  int compaction_style1_;
+  bool is_dynamic1_;
+
+  int level2_;
+  int compaction_style2_;
+  bool is_dynamic2_;
+
+  uint64_t fifo_max_table_files_size_;
+};
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBOptionChangeMigrationTests, Migrate1) {
+  Options old_options = CurrentOptions();
+  old_options.compaction_style =
+      static_cast<CompactionStyle>(compaction_style1_);
+  if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    old_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
+  }
+  if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    old_options.max_open_files = -1;
+  }
+  old_options.level0_file_num_compaction_trigger = 3;
+  old_options.write_buffer_size = 64 * 1024;
+  old_options.target_file_size_base = 128 * 1024;
+  // Make level target of L1, L2 to be 200KB and 600KB
+  old_options.num_levels = level1_;
+  old_options.max_bytes_for_level_multiplier = 3;
+  old_options.max_bytes_for_level_base = 200 * 1024;
+
+  Reopen(old_options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Generate at least 2MB of data
+  for (int num = 0; num < 20; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Will make sure exactly those keys are in the DB after migration.
+  std::set<std::string> keys;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (; it->Valid(); it->Next()) {
+      keys.insert(it->key().ToString());
+    }
+  }
+  Close();
+
+  Options new_options = old_options;
+  new_options.compaction_style =
+      static_cast<CompactionStyle>(compaction_style2_);
+  if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    new_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
+  }
+  if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    new_options.max_open_files = -1;
+  }
+  if (fifo_max_table_files_size_ != 0) {
+    new_options.compaction_options_fifo.max_table_files_size =
+        fifo_max_table_files_size_;
+  }
+  new_options.target_file_size_base = 256 * 1024;
+  new_options.num_levels = level2_;
+  new_options.max_bytes_for_level_base = 150 * 1024;
+  new_options.max_bytes_for_level_multiplier = 4;
+  ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+  Reopen(new_options);
+
+  // Wait for compaction to finish and make sure it can reopen
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Reopen(new_options);
+
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (std::string key : keys) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+  }
+}
+
+TEST_P(DBOptionChangeMigrationTests, Migrate2) {
+  Options old_options = CurrentOptions();
+  old_options.compaction_style =
+      static_cast<CompactionStyle>(compaction_style2_);
+  if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    old_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
+  }
+  if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    old_options.max_open_files = -1;
+  }
+  old_options.level0_file_num_compaction_trigger = 3;
+  old_options.write_buffer_size = 64 * 1024;
+  old_options.target_file_size_base = 128 * 1024;
+  // Make level target of L1, L2 to be 200KB and 600KB
+  old_options.num_levels = level2_;
+  old_options.max_bytes_for_level_multiplier = 3;
+  old_options.max_bytes_for_level_base = 200 * 1024;
+
+  Reopen(old_options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Generate at least 2MB of data
+  for (int num = 0; num < 20; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Will make sure exactly those keys are in the DB after migration.
+  std::set<std::string> keys;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (; it->Valid(); it->Next()) {
+      keys.insert(it->key().ToString());
+    }
+  }
+
+  Close();
+
+  Options new_options = old_options;
+  new_options.compaction_style =
+      static_cast<CompactionStyle>(compaction_style1_);
+  if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    new_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
+  }
+  if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    new_options.max_open_files = -1;
+  }
+  if (fifo_max_table_files_size_ != 0) {
+    new_options.compaction_options_fifo.max_table_files_size =
+        fifo_max_table_files_size_;
+  }
+  new_options.target_file_size_base = 256 * 1024;
+  new_options.num_levels = level1_;
+  new_options.max_bytes_for_level_base = 150 * 1024;
+  new_options.max_bytes_for_level_multiplier = 4;
+  ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+  Reopen(new_options);
+  // Wait for compaction to finish and make sure it can reopen
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Reopen(new_options);
+
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (std::string key : keys) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+  }
+}
+
+TEST_P(DBOptionChangeMigrationTests, Migrate3) {
+  Options old_options = CurrentOptions();
+  old_options.compaction_style =
+      static_cast<CompactionStyle>(compaction_style1_);
+  if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    old_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
+  }
+  if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    old_options.max_open_files = -1;
+  }
+  old_options.level0_file_num_compaction_trigger = 3;
+  old_options.write_buffer_size = 64 * 1024;
+  old_options.target_file_size_base = 128 * 1024;
+  // Make level target of L1, L2 to be 200KB and 600KB
+  old_options.num_levels = level1_;
+  old_options.max_bytes_for_level_multiplier = 3;
+  old_options.max_bytes_for_level_base = 200 * 1024;
+
+  Reopen(old_options);
+  Random rnd(301);
+  for (int num = 0; num < 20; num++) {
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900)));
+    }
+    Flush();
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    if (num == 9) {
+      // Issue a full compaction to generate some zero-out files
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Will make sure exactly those keys are in the DB after migration.
+  std::set<std::string> keys;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (; it->Valid(); it->Next()) {
+      keys.insert(it->key().ToString());
+    }
+  }
+  Close();
+
+  Options new_options = old_options;
+  new_options.compaction_style =
+      static_cast<CompactionStyle>(compaction_style2_);
+  if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    new_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
+  }
+  if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    new_options.max_open_files = -1;
+  }
+  if (fifo_max_table_files_size_ != 0) {
+    new_options.compaction_options_fifo.max_table_files_size =
+        fifo_max_table_files_size_;
+  }
+  new_options.target_file_size_base = 256 * 1024;
+  new_options.num_levels = level2_;
+  new_options.max_bytes_for_level_base = 150 * 1024;
+  new_options.max_bytes_for_level_multiplier = 4;
+  ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+  Reopen(new_options);
+
+  // Wait for compaction to finish and make sure it can reopen
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Reopen(new_options);
+
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (std::string key : keys) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+  }
+}
+
+TEST_P(DBOptionChangeMigrationTests, Migrate4) {
+  Options old_options = CurrentOptions();
+  old_options.compaction_style =
+      static_cast<CompactionStyle>(compaction_style2_);
+  if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    old_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
+  }
+  if (old_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    old_options.max_open_files = -1;
+  }
+  old_options.level0_file_num_compaction_trigger = 3;
+  old_options.write_buffer_size = 64 * 1024;
+  old_options.target_file_size_base = 128 * 1024;
+  // Make level target of L1, L2 to be 200KB and 600KB
+  old_options.num_levels = level2_;
+  old_options.max_bytes_for_level_multiplier = 3;
+  old_options.max_bytes_for_level_base = 200 * 1024;
+
+  Reopen(old_options);
+  Random rnd(301);
+  for (int num = 0; num < 20; num++) {
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900)));
+    }
+    Flush();
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    if (num == 9) {
+      // Issue a full compaction to generate some zero-out files
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+    }
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Will make sure exactly those keys are in the DB after migration.
+  std::set<std::string> keys;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (; it->Valid(); it->Next()) {
+      keys.insert(it->key().ToString());
+    }
+  }
+
+  Close();
+
+  Options new_options = old_options;
+  new_options.compaction_style =
+      static_cast<CompactionStyle>(compaction_style1_);
+  if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    new_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
+  }
+  if (new_options.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    new_options.max_open_files = -1;
+  }
+  if (fifo_max_table_files_size_ != 0) {
+    new_options.compaction_options_fifo.max_table_files_size =
+        fifo_max_table_files_size_;
+  }
+  new_options.target_file_size_base = 256 * 1024;
+  new_options.num_levels = level1_;
+  new_options.max_bytes_for_level_base = 150 * 1024;
+  new_options.max_bytes_for_level_multiplier = 4;
+  ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+  Reopen(new_options);
+  // Wait for compaction to finish and make sure it can reopen
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Reopen(new_options);
+
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (std::string key : keys) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DBOptionChangeMigrationTests, DBOptionChangeMigrationTests,
+    ::testing::Values(
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 0 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        true /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 0 /* new compaction style */,
+                        true /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        true /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 0 /* new compaction style */,
+                        false, 0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 0 /* new compaction style */,
+                        true /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 1 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 1 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 1 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 1 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        1 /* old num_levels */, 1 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        true /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 1 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        true /* is dynamic leveling in old option */,
+                        1 /* old num_levels */, 1 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 0 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(4 /* old num_levels */, 0 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        1 /* old num_levels */, 2 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        true /* is dynamic leveling in old option */,
+                        2 /* old num_levels */, 2 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 1 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        3 /* old num_levels */, 2 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        0 /*fifo max_table_files_size*/),
+        std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 2 /* new compaction style */,
+                        false /* is dynamic leveling in new option */, 0),
+        std::make_tuple(4 /* old num_levels */, 0 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        1 /* old num_levels */, 2 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        5 * 1024 * 1024 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 0 /* old compaction style */,
+                        true /* is dynamic leveling in old option */,
+                        2 /* old num_levels */, 2 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        5 * 1024 * 1024 /*fifo max_table_files_size*/),
+        std::make_tuple(3 /* old num_levels */, 1 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        3 /* old num_levels */, 2 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        5 * 1024 * 1024 /*fifo max_table_files_size*/),
+        std::make_tuple(1 /* old num_levels */, 1 /* old compaction style */,
+                        false /* is dynamic leveling in old option */,
+                        4 /* old num_levels */, 2 /* new compaction style */,
+                        false /* is dynamic leveling in new option */,
+                        5 * 1024 * 1024 /*fifo max_table_files_size*/)));
+
+class DBOptionChangeMigrationTest : public DBTestBase {
+ public:
+  DBOptionChangeMigrationTest()
+      : DBTestBase("db_option_change_migration_test2", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) {
+  Options old_options = CurrentOptions();
+  old_options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  old_options.max_compaction_bytes = 200 * 1024;
+  old_options.level_compaction_dynamic_level_bytes = false;
+  old_options.level0_file_num_compaction_trigger = 3;
+  old_options.write_buffer_size = 64 * 1024;
+  old_options.target_file_size_base = 128 * 1024;
+  // Make level target of L1, L2 to be 200KB and 600KB
+  old_options.num_levels = 4;
+  old_options.max_bytes_for_level_multiplier = 3;
+  old_options.max_bytes_for_level_base = 200 * 1024;
+
+  Reopen(old_options);
+  Random rnd(301);
+  for (int num = 0; num < 20; num++) {
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900)));
+    }
+  }
+  Flush();
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // Will make sure exactly those keys are in the DB after migration.
+  std::set<std::string> keys;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (; it->Valid(); it->Next()) {
+      keys.insert(it->key().ToString());
+    }
+  }
+
+  Close();
+
+  Options new_options = old_options;
+  new_options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+  new_options.target_file_size_base = 256 * 1024;
+  new_options.num_levels = 1;
+  new_options.max_bytes_for_level_base = 150 * 1024;
+  new_options.max_bytes_for_level_multiplier = 4;
+  ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
+  Reopen(new_options);
+  // Wait for compaction to finish and make sure it can reopen
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  Reopen(new_options);
+
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (std::string key : keys) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+}
+
+#endif  // ROCKSDB_LITE
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/options/options_util.cc b/src/rocksdb/utilities/options/options_util.cc
new file mode 100644
index 000000000..00c4b981a
--- /dev/null
+++ b/src/rocksdb/utilities/options/options_util.cc
@@ -0,0 +1,159 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/options_util.h"
+
+#include "file/filename.h"
+#include "options/options_parser.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/options.h"
+#include "table/block_based/block_based_table_factory.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status LoadOptionsFromFile(const std::string& file_name, Env* env,
+                           DBOptions* db_options,
+                           std::vector<ColumnFamilyDescriptor>* cf_descs,
+                           bool ignore_unknown_options,
+                           std::shared_ptr<Cache>* cache) {
+  ConfigOptions config_options;
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  config_options.input_strings_escaped = true;
+  config_options.env = env;
+
+  return LoadOptionsFromFile(config_options, file_name, db_options, cf_descs,
+                             cache);
+}
+
+Status LoadOptionsFromFile(const ConfigOptions& config_options,
+                           const std::string& file_name, DBOptions* db_options,
+                           std::vector<ColumnFamilyDescriptor>* cf_descs,
+                           std::shared_ptr<Cache>* cache) {
+  RocksDBOptionsParser parser;
+  const auto& fs = config_options.env->GetFileSystem();
+  Status s = parser.Parse(config_options, file_name, fs.get());
+  if (!s.ok()) {
+    return s;
+  }
+  *db_options = *parser.db_opt();
+  const std::vector<std::string>& cf_names = *parser.cf_names();
+  const std::vector<ColumnFamilyOptions>& cf_opts = *parser.cf_opts();
+  cf_descs->clear();
+  for (size_t i = 0; i < cf_opts.size(); ++i) {
+    cf_descs->push_back({cf_names[i], cf_opts[i]});
+    if (cache != nullptr) {
+      TableFactory* tf = cf_opts[i].table_factory.get();
+      if (tf != nullptr) {
+        auto* opts = tf->GetOptions<BlockBasedTableOptions>();
+        if (opts != nullptr) {
+          opts->block_cache = *cache;
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status GetLatestOptionsFileName(const std::string& dbpath, Env* env,
+                                std::string* options_file_name) {
+  Status s;
+  std::string latest_file_name;
+  uint64_t latest_time_stamp = 0;
+  std::vector<std::string> file_names;
+  s = env->GetChildren(dbpath, &file_names);
+  if (s.IsNotFound()) {
+    return Status::NotFound(Status::kPathNotFound,
+                            "No options files found in the DB directory.",
+                            dbpath);
+  } else if (!s.ok()) {
+    return s;
+  }
+  for (auto& file_name : file_names) {
+    uint64_t time_stamp;
+    FileType type;
+    if (ParseFileName(file_name, &time_stamp, &type) && type == kOptionsFile) {
+      if (time_stamp > latest_time_stamp) {
+        latest_time_stamp = time_stamp;
+        latest_file_name = file_name;
+      }
+    }
+  }
+  if (latest_file_name.size() == 0) {
+    return Status::NotFound(Status::kPathNotFound,
+                            "No options files found in the DB directory.",
+                            dbpath);
+  }
+  *options_file_name = latest_file_name;
+  return Status::OK();
+}
+
+Status LoadLatestOptions(const std::string& dbpath, Env* env,
+                         DBOptions* db_options,
+                         std::vector<ColumnFamilyDescriptor>* cf_descs,
+                         bool ignore_unknown_options,
+                         std::shared_ptr<Cache>* cache) {
+  ConfigOptions config_options;
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  config_options.input_strings_escaped = true;
+  config_options.env = env;
+
+  return LoadLatestOptions(config_options, dbpath, db_options, cf_descs, cache);
+}
+
+Status LoadLatestOptions(const ConfigOptions& config_options,
+                         const std::string& dbpath, DBOptions* db_options,
+                         std::vector<ColumnFamilyDescriptor>* cf_descs,
+                         std::shared_ptr<Cache>* cache) {
+  std::string options_file_name;
+  Status s =
+      GetLatestOptionsFileName(dbpath, config_options.env, &options_file_name);
+  if (!s.ok()) {
+    return s;
+  }
+  return LoadOptionsFromFile(config_options, dbpath + "/" + options_file_name,
+                             db_options, cf_descs, cache);
+}
+
+Status CheckOptionsCompatibility(
+    const std::string& dbpath, Env* env, const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& cf_descs,
+    bool ignore_unknown_options) {
+  ConfigOptions config_options(db_options);
+  config_options.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible;
+  config_options.ignore_unknown_options = ignore_unknown_options;
+  config_options.input_strings_escaped = true;
+  config_options.env = env;
+  return CheckOptionsCompatibility(config_options, dbpath, db_options,
+                                   cf_descs);
+}
+
+Status CheckOptionsCompatibility(
+    const ConfigOptions& config_options, const std::string& dbpath,
+    const DBOptions& db_options,
+    const std::vector<ColumnFamilyDescriptor>& cf_descs) {
+  std::string options_file_name;
+  Status s =
+      GetLatestOptionsFileName(dbpath, config_options.env, &options_file_name);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<std::string> cf_names;
+  std::vector<ColumnFamilyOptions> cf_opts;
+  for (const auto& cf_desc : cf_descs) {
+    cf_names.push_back(cf_desc.name);
+    cf_opts.push_back(cf_desc.options);
+  }
+
+  const auto& fs = config_options.env->GetFileSystem();
+
+  return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+      config_options, db_options, cf_names, cf_opts,
+      dbpath + "/" + options_file_name, fs.get());
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/options/options_util_test.cc b/src/rocksdb/utilities/options/options_util_test.cc
new file mode 100644
index 000000000..1c3b41ff2
--- /dev/null
+++ b/src/rocksdb/utilities/options/options_util_test.cc
@@ -0,0 +1,779 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/options_util.h"
+
+#include <cctype>
+#include <cinttypes>
+#include <unordered_map>
+
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "options/options_parser.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+#ifndef GFLAGS
+bool FLAGS_enable_print = false;
+#else
+#include "util/gflags_compat.h"
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+#endif  // GFLAGS
+
+namespace ROCKSDB_NAMESPACE {
+class OptionsUtilTest : public testing::Test {
+ public:
+  OptionsUtilTest() : rnd_(0xFB) {
+    env_.reset(NewMemEnv(Env::Default()));
+    dbname_ = test::PerThreadDBPath("options_util_test");
+  }
+
+ protected:
+  std::unique_ptr<Env> env_;
+  std::string dbname_;
+  Random rnd_;
+};
+
+TEST_F(OptionsUtilTest, SaveAndLoad) {
+  const size_t kCFCount = 5;
+
+  DBOptions db_opt;
+  std::vector<std::string> cf_names;
+  std::vector<ColumnFamilyOptions> cf_opts;
+  test::RandomInitDBOptions(&db_opt, &rnd_);
+  for (size_t i = 0; i < kCFCount; ++i) {
+    cf_names.push_back(i == 0 ? kDefaultColumnFamilyName
+                              : test::RandomName(&rnd_, 10));
+    cf_opts.emplace_back();
+    test::RandomInitCFOptions(&cf_opts.back(), db_opt, &rnd_);
+  }
+
+  const std::string kFileName = "OPTIONS-123456";
+  ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName,
+                                  env_->GetFileSystem().get()));
+
+  DBOptions loaded_db_opt;
+  std::vector<ColumnFamilyDescriptor> loaded_cf_descs;
+  ASSERT_OK(LoadOptionsFromFile(kFileName, env_.get(), &loaded_db_opt,
+                                &loaded_cf_descs));
+  ConfigOptions exact;
+  exact.sanity_level = ConfigOptions::kSanityLevelExactMatch;
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyDBOptions(exact, db_opt, loaded_db_opt));
+  test::RandomInitDBOptions(&db_opt, &rnd_);
+  ASSERT_NOK(
+      RocksDBOptionsParser::VerifyDBOptions(exact, db_opt, loaded_db_opt));
+
+  for (size_t i = 0; i < kCFCount; ++i) {
+    ASSERT_EQ(cf_names[i], loaded_cf_descs[i].name);
+    ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+        exact, cf_opts[i], loaded_cf_descs[i].options));
+    ASSERT_OK(RocksDBOptionsParser::VerifyTableFactory(
+        exact, cf_opts[i].table_factory.get(),
+        loaded_cf_descs[i].options.table_factory.get()));
+    test::RandomInitCFOptions(&cf_opts[i], db_opt, &rnd_);
+    ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+        exact, cf_opts[i], loaded_cf_descs[i].options));
+  }
+
+  ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_opts[0])));
+  for (size_t i = 0; i < kCFCount; ++i) {
+    if (cf_opts[i].compaction_filter) {
+      delete cf_opts[i].compaction_filter;
+    }
+  }
+}
+
+TEST_F(OptionsUtilTest, SaveAndLoadWithCacheCheck) {
+  // creating db
+  DBOptions db_opt;
+  db_opt.create_if_missing = true;
+  // initialize BlockBasedTableOptions
+  std::shared_ptr<Cache> cache = NewLRUCache(1 * 1024);
+  BlockBasedTableOptions bbt_opts;
+  bbt_opts.block_size = 32 * 1024;
+  // saving cf options
+  std::vector<ColumnFamilyOptions> cf_opts;
+  ColumnFamilyOptions default_column_family_opt = ColumnFamilyOptions();
+  default_column_family_opt.table_factory.reset(
+      NewBlockBasedTableFactory(bbt_opts));
+  cf_opts.push_back(default_column_family_opt);
+
+  ColumnFamilyOptions cf_opt_sample = ColumnFamilyOptions();
+  cf_opt_sample.table_factory.reset(NewBlockBasedTableFactory(bbt_opts));
+  cf_opts.push_back(cf_opt_sample);
+
+  ColumnFamilyOptions cf_opt_plain_table_opt = ColumnFamilyOptions();
+  cf_opt_plain_table_opt.table_factory.reset(NewPlainTableFactory());
+  cf_opts.push_back(cf_opt_plain_table_opt);
+
+  std::vector<std::string> cf_names;
+  cf_names.push_back(kDefaultColumnFamilyName);
+  cf_names.push_back("cf_sample");
+  cf_names.push_back("cf_plain_table_sample");
+  // Saving DB in file
+  const std::string kFileName = "OPTIONS-LOAD_CACHE_123456";
+  ASSERT_OK(PersistRocksDBOptions(db_opt, cf_names, cf_opts, kFileName,
+                                  env_->GetFileSystem().get()));
+  DBOptions loaded_db_opt;
+  std::vector<ColumnFamilyDescriptor> loaded_cf_descs;
+
+  ConfigOptions config_options;
+  config_options.ignore_unknown_options = false;
+  config_options.input_strings_escaped = true;
+  config_options.env = env_.get();
+  ASSERT_OK(LoadOptionsFromFile(config_options, kFileName, &loaded_db_opt,
+                                &loaded_cf_descs, &cache));
+  for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
+    auto* loaded_bbt_opt =
+        loaded_cf_descs[i]
+            .options.table_factory->GetOptions<BlockBasedTableOptions>();
+    // Expect the same cache will be loaded
+    if (loaded_bbt_opt != nullptr) {
+      ASSERT_EQ(loaded_bbt_opt->block_cache.get(), cache.get());
+    }
+  }
+
+  // Test the old interface
+  ASSERT_OK(LoadOptionsFromFile(kFileName, env_.get(), &loaded_db_opt,
+                                &loaded_cf_descs, false, &cache));
+  for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
+    auto* loaded_bbt_opt =
+        loaded_cf_descs[i]
+            .options.table_factory->GetOptions<BlockBasedTableOptions>();
+    // Expect the same cache will be loaded
+    if (loaded_bbt_opt != nullptr) {
+      ASSERT_EQ(loaded_bbt_opt->block_cache.get(), cache.get());
+    }
+  }
+  ASSERT_OK(DestroyDB(dbname_, Options(loaded_db_opt, cf_opts[0])));
+}
+
+namespace {
+class DummyTableFactory : public TableFactory {
+ public:
+  DummyTableFactory() {}
+  ~DummyTableFactory() override {}
+
+  const char* Name() const override { return "DummyTableFactory"; }
+
+  using TableFactory::NewTableReader;
+  Status NewTableReader(
+      const ReadOptions& /*ro*/,
+      const TableReaderOptions& /*table_reader_options*/,
+      std::unique_ptr<RandomAccessFileReader>&& /*file*/,
+      uint64_t /*file_size*/, std::unique_ptr<TableReader>* /*table_reader*/,
+      bool /*prefetch_index_and_filter_in_cache*/) const override {
+    return Status::NotSupported();
+  }
+
+  TableBuilder* NewTableBuilder(
+      const TableBuilderOptions& /*table_builder_options*/,
+      WritableFileWriter* /*file*/) const override {
+    return nullptr;
+  }
+
+  Status ValidateOptions(
+      const DBOptions& /*db_opts*/,
+      const ColumnFamilyOptions& /*cf_opts*/) const override {
+    return Status::NotSupported();
+  }
+
+  std::string GetPrintableOptions() const override { return ""; }
+};
+
+class DummyMergeOperator : public MergeOperator {
+ public:
+  DummyMergeOperator() {}
+  ~DummyMergeOperator() override {}
+
+  bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                   MergeOperationOutput* /*merge_out*/) const override {
+    return false;
+  }
+
+  bool PartialMergeMulti(const Slice& /*key*/,
+                         const std::deque<Slice>& /*operand_list*/,
+                         std::string* /*new_value*/,
+                         Logger* /*logger*/) const override {
+    return false;
+  }
+
+  const char* Name() const override { return "DummyMergeOperator"; }
+};
+
+class DummySliceTransform : public SliceTransform {
+ public:
+  DummySliceTransform() {}
+  ~DummySliceTransform() override {}
+
+  // Return the name of this transformation.
+  const char* Name() const override { return "DummySliceTransform"; }
+
+  // transform a src in domain to a dst in the range
+  Slice Transform(const Slice& src) const override { return src; }
+
+  // determine whether this is a valid src upon the function applies
+  bool InDomain(const Slice& /*src*/) const override { return false; }
+
+  // determine whether dst=Transform(src) for some src
+  bool InRange(const Slice& /*dst*/) const override { return false; }
+};
+
+}  // namespace
+
+TEST_F(OptionsUtilTest, SanityCheck) {
+  DBOptions db_opt;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  const size_t kCFCount = 5;
+  for (size_t i = 0; i < kCFCount; ++i) {
+    cf_descs.emplace_back();
+    cf_descs.back().name =
+        (i == 0) ? kDefaultColumnFamilyName : test::RandomName(&rnd_, 10);
+
+    cf_descs.back().options.table_factory.reset(NewBlockBasedTableFactory());
+    // Assign non-null values to prefix_extractors except the first cf.
+    cf_descs.back().options.prefix_extractor.reset(
+        i != 0 ? test::RandomSliceTransform(&rnd_) : nullptr);
+    cf_descs.back().options.merge_operator.reset(
+        test::RandomMergeOperator(&rnd_));
+  }
+
+  db_opt.create_missing_column_families = true;
+  db_opt.create_if_missing = true;
+
+  ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_descs[0].options)));
+  DB* db;
+  std::vector<ColumnFamilyHandle*> handles;
+  // open and persist the options
+  ASSERT_OK(DB::Open(db_opt, dbname_, cf_descs, &handles, &db));
+
+  // close the db
+  for (auto* handle : handles) {
+    delete handle;
+  }
+  delete db;
+
+  ConfigOptions config_options;
+  config_options.ignore_unknown_options = false;
+  config_options.input_strings_escaped = true;
+  config_options.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible;
+  // perform sanity check
+  ASSERT_OK(
+      CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+  ASSERT_GE(kCFCount, 5);
+  // merge operator
+  {
+    std::shared_ptr<MergeOperator> merge_op =
+        cf_descs[0].options.merge_operator;
+
+    ASSERT_NE(merge_op.get(), nullptr);
+    cf_descs[0].options.merge_operator.reset();
+    ASSERT_NOK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+    cf_descs[0].options.merge_operator.reset(new DummyMergeOperator());
+    ASSERT_NOK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+    cf_descs[0].options.merge_operator = merge_op;
+    ASSERT_OK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+  }
+
+  // prefix extractor
+  {
+    std::shared_ptr<const SliceTransform> prefix_extractor =
+        cf_descs[1].options.prefix_extractor;
+
+    // It's okay to set prefix_extractor to nullptr.
+    ASSERT_NE(prefix_extractor, nullptr);
+    cf_descs[1].options.prefix_extractor.reset();
+    ASSERT_OK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+    cf_descs[1].options.prefix_extractor.reset(new DummySliceTransform());
+    ASSERT_OK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+    cf_descs[1].options.prefix_extractor = prefix_extractor;
+    ASSERT_OK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+  }
+
+  // prefix extractor nullptr case
+  {
+    std::shared_ptr<const SliceTransform> prefix_extractor =
+        cf_descs[0].options.prefix_extractor;
+
+    // It's okay to set prefix_extractor to nullptr.
+    ASSERT_EQ(prefix_extractor, nullptr);
+    cf_descs[0].options.prefix_extractor.reset();
+    ASSERT_OK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+    // It's okay to change prefix_extractor from nullptr to non-nullptr
+    cf_descs[0].options.prefix_extractor.reset(new DummySliceTransform());
+    ASSERT_OK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+    cf_descs[0].options.prefix_extractor = prefix_extractor;
+    ASSERT_OK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+  }
+
+  // comparator
+  {
+    test::SimpleSuffixReverseComparator comparator;
+
+    auto* prev_comparator = cf_descs[2].options.comparator;
+    cf_descs[2].options.comparator = &comparator;
+    ASSERT_NOK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+    cf_descs[2].options.comparator = prev_comparator;
+    ASSERT_OK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+  }
+
+  // table factory
+  {
+    std::shared_ptr<TableFactory> table_factory =
+        cf_descs[3].options.table_factory;
+
+    ASSERT_NE(table_factory, nullptr);
+    cf_descs[3].options.table_factory.reset(new DummyTableFactory());
+    ASSERT_NOK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+
+    cf_descs[3].options.table_factory = table_factory;
+    ASSERT_OK(
+        CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs));
+  }
+  ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_descs[0].options)));
+}
+
+TEST_F(OptionsUtilTest, LatestOptionsNotFound) {
+  std::unique_ptr<Env> env(NewMemEnv(Env::Default()));
+  Status s;
+  Options options;
+  ConfigOptions config_opts;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+
+  options.env = env.get();
+  options.create_if_missing = true;
+  config_opts.env = options.env;
+  config_opts.ignore_unknown_options = false;
+
+  std::vector<std::string> children;
+
+  std::string options_file_name;
+  ASSERT_OK(DestroyDB(dbname_, options));
+  // First, test where the db directory does not exist
+  ASSERT_NOK(options.env->GetChildren(dbname_, &children));
+
+  s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_TRUE(s.IsPathNotFound());
+
+  s = LoadLatestOptions(dbname_, options.env, &options, &cf_descs);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_TRUE(s.IsPathNotFound());
+
+  s = LoadLatestOptions(config_opts, dbname_, &options, &cf_descs);
+  ASSERT_TRUE(s.IsPathNotFound());
+
+  s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_TRUE(s.IsPathNotFound());
+
+  // Second, test where the db directory exists but is empty
+  ASSERT_OK(options.env->CreateDir(dbname_));
+
+  s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_TRUE(s.IsPathNotFound());
+
+  s = LoadLatestOptions(dbname_, options.env, &options, &cf_descs);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_TRUE(s.IsPathNotFound());
+
+  // Finally, test where a file exists but is not an "Options" file
+  std::unique_ptr<WritableFile> file;
+  ASSERT_OK(
+      options.env->NewWritableFile(dbname_ + "/temp.txt", &file, EnvOptions()));
+  ASSERT_OK(file->Close());
+  s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_TRUE(s.IsPathNotFound());
+
+  s = LoadLatestOptions(config_opts, dbname_, &options, &cf_descs);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_TRUE(s.IsPathNotFound());
+  ASSERT_OK(options.env->DeleteFile(dbname_ + "/temp.txt"));
+  ASSERT_OK(options.env->DeleteDir(dbname_));
+}
+
+TEST_F(OptionsUtilTest, LoadLatestOptions) {
+  Options options;
+  options.OptimizeForSmallDb();
+  ColumnFamilyDescriptor cf_desc;
+  ConfigOptions config_opts;
+  DBOptions db_opts;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  std::vector<ColumnFamilyHandle*> handles;
+  DB* db;
+  options.create_if_missing = true;
+
+  ASSERT_OK(DestroyDB(dbname_, options));
+
+  cf_descs.emplace_back();
+  cf_descs.back().name = kDefaultColumnFamilyName;
+  cf_descs.back().options.table_factory.reset(NewBlockBasedTableFactory());
+  cf_descs.emplace_back();
+  cf_descs.back().name = "Plain";
+  cf_descs.back().options.table_factory.reset(NewPlainTableFactory());
+  db_opts.create_missing_column_families = true;
+  db_opts.create_if_missing = true;
+
+  // open and persist the options
+  ASSERT_OK(DB::Open(db_opts, dbname_, cf_descs, &handles, &db));
+
+  std::string options_file_name;
+  std::string new_options_file;
+
+  ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file_name));
+  ASSERT_OK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+  ASSERT_EQ(cf_descs.size(), 2U);
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts,
+                                                  db->GetDBOptions(), db_opts));
+  ASSERT_OK(handles[0]->GetDescriptor(&cf_desc));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options,
+                                                  cf_descs[0].options));
+  ASSERT_OK(handles[1]->GetDescriptor(&cf_desc));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options,
+                                                  cf_descs[1].options));
+
+  // Now change some of the DBOptions
+  ASSERT_OK(db->SetDBOptions(
+      {{"delayed_write_rate", "1234"}, {"bytes_per_sync", "32768"}}));
+  ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &new_options_file));
+  ASSERT_NE(options_file_name, new_options_file);
+  ASSERT_OK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts,
+                                                  db->GetDBOptions(), db_opts));
+  options_file_name = new_options_file;
+
+  // Now change some of the ColumnFamilyOptions
+  ASSERT_OK(db->SetOptions(handles[1], {{"write_buffer_size", "32768"}}));
+  ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &new_options_file));
+  ASSERT_NE(options_file_name, new_options_file);
+  ASSERT_OK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts,
+                                                  db->GetDBOptions(), db_opts));
+  ASSERT_OK(handles[0]->GetDescriptor(&cf_desc));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options,
+                                                  cf_descs[0].options));
+  ASSERT_OK(handles[1]->GetDescriptor(&cf_desc));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_opts, cf_desc.options,
+                                                  cf_descs[1].options));
+
+  // close the db
+  for (auto* handle : handles) {
+    delete handle;
+  }
+  delete db;
+  ASSERT_OK(DestroyDB(dbname_, options, cf_descs));
+}
+
+static void WriteOptionsFile(Env* env, const std::string& path,
+                             const std::string& options_file, int major,
+                             int minor, const std::string& db_opts,
+                             const std::string& cf_opts,
+                             const std::string& bbt_opts = "") {
+  std::string options_file_header =
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=" +
+      std::to_string(major) + "." + std::to_string(minor) +
+      ".0\n"
+      "  options_file_version=1\n";
+
+  std::unique_ptr<WritableFile> wf;
+  ASSERT_OK(env->NewWritableFile(path + "/" + options_file, &wf, EnvOptions()));
+  ASSERT_OK(
+      wf->Append(options_file_header + "[ DBOptions ]\n" + db_opts + "\n"));
+  ASSERT_OK(wf->Append(
+      "[CFOptions   \"default\"]  # column family must be specified\n" +
+      cf_opts + "\n"));
+  ASSERT_OK(wf->Append("[TableOptions/BlockBasedTable   \"default\"]\n" +
+                       bbt_opts + "\n"));
+  ASSERT_OK(wf->Close());
+
+  std::string latest_options_file;
+  ASSERT_OK(GetLatestOptionsFileName(path, env, &latest_options_file));
+  ASSERT_EQ(latest_options_file, options_file);
+}
+
+TEST_F(OptionsUtilTest, BadLatestOptions) {
+  Status s;
+  ConfigOptions config_opts;
+  DBOptions db_opts;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  Options options;
+  options.env = env_.get();
+  config_opts.env = env_.get();
+  config_opts.ignore_unknown_options = false;
+  config_opts.delimiter = "\n";
+
+  ConfigOptions ignore_opts = config_opts;
+  ignore_opts.ignore_unknown_options = true;
+
+  std::string options_file_name;
+
+  // Test where the db directory exists but is empty
+  ASSERT_OK(options.env->CreateDir(dbname_));
+  ASSERT_NOK(
+      GetLatestOptionsFileName(dbname_, options.env, &options_file_name));
+  ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+
+  // Write an options file for a previous major release with an unknown DB
+  // Option
+  WriteOptionsFile(options.env, dbname_, "OPTIONS-0001", ROCKSDB_MAJOR - 1,
+                   ROCKSDB_MINOR, "unknown_db_opt=true", "");
+  s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  // Even though ignore_unknown_options=true, we still return an error...
+  s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  // Write an options file for a previous minor release with an unknown CF
+  // Option
+  WriteOptionsFile(options.env, dbname_, "OPTIONS-0002", ROCKSDB_MAJOR,
+                   ROCKSDB_MINOR - 1, "", "unknown_cf_opt=true");
+  s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  // Even though ignore_unknown_options=true, we still return an error...
+  s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  // Write an options file for a previous minor release with an unknown BBT
+  // Option
+  WriteOptionsFile(options.env, dbname_, "OPTIONS-0003", ROCKSDB_MAJOR,
+                   ROCKSDB_MINOR - 1, "", "", "unknown_bbt_opt=true");
+  s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  // Even though ignore_unknown_options=true, we still return an error...
+  s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Write an options file for the current release with an unknown DB Option
+  WriteOptionsFile(options.env, dbname_, "OPTIONS-0004", ROCKSDB_MAJOR,
+                   ROCKSDB_MINOR, "unknown_db_opt=true", "");
+  s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  // Even though ignore_unknown_options=true, we still return an error...
+  s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Write an options file for the current release with an unknown CF Option
+  WriteOptionsFile(options.env, dbname_, "OPTIONS-0005", ROCKSDB_MAJOR,
+                   ROCKSDB_MINOR, "", "unknown_cf_opt=true");
+  s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  // Even though ignore_unknown_options=true, we still return an error...
+  s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Write an options file for the current release with an invalid DB Option
+  WriteOptionsFile(options.env, dbname_, "OPTIONS-0006", ROCKSDB_MAJOR,
+                   ROCKSDB_MINOR, "create_if_missing=hello", "");
+  s = LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  // Even though ignore_unknown_options=true, we still return an error...
+  s = LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Write an options file for the next release with an invalid DB Option
+  WriteOptionsFile(options.env, dbname_, "OPTIONS-0007", ROCKSDB_MAJOR,
+                   ROCKSDB_MINOR + 1, "create_if_missing=hello", "");
+  ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+  ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
+
+  // Write an options file for the next release with an unknown DB Option
+  WriteOptionsFile(options.env, dbname_, "OPTIONS-0008", ROCKSDB_MAJOR,
+                   ROCKSDB_MINOR + 1, "unknown_db_opt=true", "");
+  ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+  // Ignore the errors for future releases when ignore_unknown_options=true
+  ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
+
+  // Write an options file for the next major release with an unknown CF Option
+  WriteOptionsFile(options.env, dbname_, "OPTIONS-0009", ROCKSDB_MAJOR + 1,
+                   ROCKSDB_MINOR, "", "unknown_cf_opt=true");
+  ASSERT_NOK(LoadLatestOptions(config_opts, dbname_, &db_opts, &cf_descs));
+  // Ignore the errors for future releases when ignore_unknown_options=true
+  ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
+}
+
+TEST_F(OptionsUtilTest, RenameDatabaseDirectory) {
+  DB* db;
+  Options options;
+  DBOptions db_opts;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  std::vector<ColumnFamilyHandle*> handles;
+
+  options.create_if_missing = true;
+
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "value0"));
+  delete db;
+
+  auto new_dbname = dbname_ + "_2";
+
+  ASSERT_OK(options.env->RenameFile(dbname_, new_dbname));
+  ASSERT_OK(LoadLatestOptions(new_dbname, options.env, &db_opts, &cf_descs));
+  ASSERT_EQ(cf_descs.size(), 1U);
+
+  db_opts.create_if_missing = false;
+  ASSERT_OK(DB::Open(db_opts, new_dbname, cf_descs, &handles, &db));
+  std::string value;
+  ASSERT_OK(db->Get(ReadOptions(), "foo", &value));
+  ASSERT_EQ("value0", value);
+  // close the db
+  for (auto* handle : handles) {
+    delete handle;
+  }
+  delete db;
+  Options new_options(db_opts, cf_descs[0].options);
+  ASSERT_OK(DestroyDB(new_dbname, new_options, cf_descs));
+  ASSERT_OK(DestroyDB(dbname_, options));
+}
+
+TEST_F(OptionsUtilTest, WalDirSettings) {
+  DB* db;
+  Options options;
+  DBOptions db_opts;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  std::vector<ColumnFamilyHandle*> handles;
+
+  options.create_if_missing = true;
+
+  // Open a DB with no wal dir set.  The wal_dir should stay empty
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  delete db;
+  ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+  ASSERT_EQ(db_opts.wal_dir, "");
+
+  // Open a DB with wal_dir == dbname.  The wal_dir should be set to empty
+  options.wal_dir = dbname_;
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  delete db;
+  ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+  ASSERT_EQ(db_opts.wal_dir, "");
+
+  // Open a DB with no wal_dir but a db_path==dbname_.  The wal_dir should be
+  // empty
+  options.wal_dir = "";
+  options.db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  delete db;
+  ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+  ASSERT_EQ(db_opts.wal_dir, "");
+
+  // Open a DB with no wal_dir==dbname_ and db_path==dbname_.  The wal_dir
+  // should be empty
+  options.wal_dir = dbname_ + "/";
+  options.db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  delete db;
+  ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+  ASSERT_EQ(db_opts.wal_dir, "");
+  ASSERT_OK(DestroyDB(dbname_, options));
+
+  // Open a DB with no wal_dir but db_path != db_name.  The wal_dir == dbname_
+  options.wal_dir = "";
+  options.db_paths.clear();
+  options.db_paths.emplace_back(dbname_ + "_0",
+                                std::numeric_limits<uint64_t>::max());
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  delete db;
+  ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+  ASSERT_EQ(db_opts.wal_dir, dbname_);
+  ASSERT_OK(DestroyDB(dbname_, options));
+
+  // Open a DB with wal_dir != db_name.  The wal_dir remains unchanged
+  options.wal_dir = dbname_ + "/wal";
+  options.db_paths.clear();
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  delete db;
+  ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+  ASSERT_EQ(db_opts.wal_dir, dbname_ + "/wal");
+  ASSERT_OK(DestroyDB(dbname_, options));
+}
+
+TEST_F(OptionsUtilTest, WalDirInOptins) {
+  DB* db;
+  Options options;
+  DBOptions db_opts;
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  std::vector<ColumnFamilyHandle*> handles;
+
+  // Store an options file with wal_dir=dbname_ and make sure it still loads
+  // when the input wal_dir is empty
+  options.create_if_missing = true;
+  options.wal_dir = "";
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  delete db;
+  options.wal_dir = dbname_;
+  std::string options_file;
+  ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file));
+  ASSERT_OK(PersistRocksDBOptions(options, {"default"}, {options},
+                                  dbname_ + "/" + options_file,
+                                  options.env->GetFileSystem().get()));
+  ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+  ASSERT_EQ(db_opts.wal_dir, dbname_);
+  options.wal_dir = "";
+  ASSERT_OK(DB::Open(options, dbname_, &db));
+  delete db;
+  ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs));
+  ASSERT_EQ(db_opts.wal_dir, "");
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+  printf("Skipped in RocksDBLite as utilities are not supported.\n");
+  return 0;
+}
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier.cc b/src/rocksdb/utilities/persistent_cache/block_cache_tier.cc
new file mode 100644
index 000000000..8ad9bb1b1
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier.cc
@@ -0,0 +1,422 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/block_cache_tier.h"
+
+#include <utility>
+#include <vector>
+
+#include "logging/logging.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+#include "utilities/persistent_cache/block_cache_tier_file.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// BlockCacheImpl
+//
+Status BlockCacheTier::Open() {
+  Status status;
+
+  WriteLock _(&lock_);
+
+  assert(!size_);
+
+  // Check the validity of the options
+  status = opt_.ValidateSettings();
+  assert(status.ok());
+  if (!status.ok()) {
+    Error(opt_.log, "Invalid block cache options");
+    return status;
+  }
+
+  // Create base directory or cleanup existing directory
+  status = opt_.env->CreateDirIfMissing(opt_.path);
+  if (!status.ok()) {
+    Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(),
+          status.ToString().c_str());
+    return status;
+  }
+
+  // Create base/<cache dir> directory
+  status = opt_.env->CreateDir(GetCachePath());
+  if (!status.ok()) {
+    // directory already exists, clean it up
+    status = CleanupCacheFolder(GetCachePath());
+    assert(status.ok());
+    if (!status.ok()) {
+      Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(),
+            status.ToString().c_str());
+      return status;
+    }
+  }
+
+  // create a new file
+  assert(!cache_file_);
+  status = NewCacheFile();
+  if (!status.ok()) {
+    Error(opt_.log, "Error creating new file %s. %s", opt_.path.c_str(),
+          status.ToString().c_str());
+    return status;
+  }
+
+  assert(cache_file_);
+
+  if (opt_.pipeline_writes) {
+    assert(!insert_th_.joinable());
+    insert_th_ = port::Thread(&BlockCacheTier::InsertMain, this);
+  }
+
+  return Status::OK();
+}
+
+bool IsCacheFile(const std::string& file) {
+  // check if the file has .rc suffix
+  // Unfortunately regex support across compilers is not even, so we use simple
+  // string parsing
+  size_t pos = file.find(".");
+  if (pos == std::string::npos) {
+    return false;
+  }
+
+  std::string suffix = file.substr(pos);
+  return suffix == ".rc";
+}
+
+Status BlockCacheTier::CleanupCacheFolder(const std::string& folder) {
+  std::vector<std::string> files;
+  Status status = opt_.env->GetChildren(folder, &files);
+  if (!status.ok()) {
+    Error(opt_.log, "Error getting files for %s. %s", folder.c_str(),
+          status.ToString().c_str());
+    return status;
+  }
+
+  // cleanup files with the patter :digi:.rc
+  for (auto file : files) {
+    if (IsCacheFile(file)) {
+      // cache file
+      Info(opt_.log, "Removing file %s.", file.c_str());
+      status = opt_.env->DeleteFile(folder + "/" + file);
+      if (!status.ok()) {
+        Error(opt_.log, "Error deleting file %s. %s", file.c_str(),
+              status.ToString().c_str());
+        return status;
+      }
+    } else {
+      ROCKS_LOG_DEBUG(opt_.log, "Skipping file %s", file.c_str());
+    }
+  }
+  return Status::OK();
+}
+
+Status BlockCacheTier::Close() {
+  // stop the insert thread
+  if (opt_.pipeline_writes && insert_th_.joinable()) {
+    InsertOp op(/*quit=*/true);
+    insert_ops_.Push(std::move(op));
+    insert_th_.join();
+  }
+
+  // stop the writer before
+  writer_.Stop();
+
+  // clear all metadata
+  WriteLock _(&lock_);
+  metadata_.Clear();
+  return Status::OK();
+}
+
+template <class T>
+void Add(std::map<std::string, double>* stats, const std::string& key,
+         const T& t) {
+  stats->insert({key, static_cast<double>(t)});
+}
+
+PersistentCache::StatsType BlockCacheTier::Stats() {
+  std::map<std::string, double> stats;
+  Add(&stats, "persistentcache.blockcachetier.bytes_piplined",
+      stats_.bytes_pipelined_.Average());
+  Add(&stats, "persistentcache.blockcachetier.bytes_written",
+      stats_.bytes_written_.Average());
+  Add(&stats, "persistentcache.blockcachetier.bytes_read",
+      stats_.bytes_read_.Average());
+  Add(&stats, "persistentcache.blockcachetier.insert_dropped",
+      stats_.insert_dropped_);
+  Add(&stats, "persistentcache.blockcachetier.cache_hits", stats_.cache_hits_);
+  Add(&stats, "persistentcache.blockcachetier.cache_misses",
+      stats_.cache_misses_);
+  Add(&stats, "persistentcache.blockcachetier.cache_errors",
+      stats_.cache_errors_);
+  Add(&stats, "persistentcache.blockcachetier.cache_hits_pct",
+      stats_.CacheHitPct());
+  Add(&stats, "persistentcache.blockcachetier.cache_misses_pct",
+      stats_.CacheMissPct());
+  Add(&stats, "persistentcache.blockcachetier.read_hit_latency",
+      stats_.read_hit_latency_.Average());
+  Add(&stats, "persistentcache.blockcachetier.read_miss_latency",
+      stats_.read_miss_latency_.Average());
+  Add(&stats, "persistentcache.blockcachetier.write_latency",
+      stats_.write_latency_.Average());
+
+  auto out = PersistentCacheTier::Stats();
+  out.push_back(stats);
+  return out;
+}
+
+Status BlockCacheTier::Insert(const Slice& key, const char* data,
+                              const size_t size) {
+  // update stats
+  stats_.bytes_pipelined_.Add(size);
+
+  if (opt_.pipeline_writes) {
+    // off load the write to the write thread
+    insert_ops_.Push(
+        InsertOp(key.ToString(), std::move(std::string(data, size))));
+    return Status::OK();
+  }
+
+  assert(!opt_.pipeline_writes);
+  return InsertImpl(key, Slice(data, size));
+}
+
+void BlockCacheTier::InsertMain() {
+  while (true) {
+    InsertOp op(insert_ops_.Pop());
+
+    if (op.signal_) {
+      // that is a secret signal to exit
+      break;
+    }
+
+    size_t retry = 0;
+    Status s;
+    while ((s = InsertImpl(Slice(op.key_), Slice(op.data_))).IsTryAgain()) {
+      if (retry > kMaxRetry) {
+        break;
+      }
+
+      // this can happen when the buffers are full, we wait till some buffers
+      // are free. Why don't we wait inside the code. This is because we want
+      // to support both pipelined and non-pipelined mode
+      buffer_allocator_.WaitUntilUsable();
+      retry++;
+    }
+
+    if (!s.ok()) {
+      stats_.insert_dropped_++;
+    }
+  }
+}
+
+Status BlockCacheTier::InsertImpl(const Slice& key, const Slice& data) {
+  // pre-condition
+  assert(key.size());
+  assert(data.size());
+  assert(cache_file_);
+
+  StopWatchNano timer(opt_.clock, /*auto_start=*/true);
+
+  WriteLock _(&lock_);
+
+  LBA lba;
+  if (metadata_.Lookup(key, &lba)) {
+    // the key already exists, this is duplicate insert
+    return Status::OK();
+  }
+
+  while (!cache_file_->Append(key, data, &lba)) {
+    if (!cache_file_->Eof()) {
+      ROCKS_LOG_DEBUG(opt_.log, "Error inserting to cache file %d",
+                      cache_file_->cacheid());
+      stats_.write_latency_.Add(timer.ElapsedNanos() / 1000);
+      return Status::TryAgain();
+    }
+
+    assert(cache_file_->Eof());
+    Status status = NewCacheFile();
+    if (!status.ok()) {
+      return status;
+    }
+  }
+
+  // Insert into lookup index
+  BlockInfo* info = metadata_.Insert(key, lba);
+  assert(info);
+  if (!info) {
+    return Status::IOError("Unexpected error inserting to index");
+  }
+
+  // insert to cache file reverse mapping
+  cache_file_->Add(info);
+
+  // update stats
+  stats_.bytes_written_.Add(data.size());
+  stats_.write_latency_.Add(timer.ElapsedNanos() / 1000);
+  return Status::OK();
+}
+
+Status BlockCacheTier::Lookup(const Slice& key, std::unique_ptr<char[]>* val,
+                              size_t* size) {
+  StopWatchNano timer(opt_.clock, /*auto_start=*/true);
+
+  LBA lba;
+  bool status;
+  status = metadata_.Lookup(key, &lba);
+  if (!status) {
+    stats_.cache_misses_++;
+    stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
+    return Status::NotFound("blockcache: key not found");
+  }
+
+  BlockCacheFile* const file = metadata_.Lookup(lba.cache_id_);
+  if (!file) {
+    // this can happen because the block index and cache file index are
+    // different, and the cache file might be removed between the two lookups
+    stats_.cache_misses_++;
+    stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
+    return Status::NotFound("blockcache: cache file not found");
+  }
+
+  assert(file->refs_);
+
+  std::unique_ptr<char[]> scratch(new char[lba.size_]);
+  Slice blk_key;
+  Slice blk_val;
+
+  status = file->Read(lba, &blk_key, &blk_val, scratch.get());
+  --file->refs_;
+  if (!status) {
+    stats_.cache_misses_++;
+    stats_.cache_errors_++;
+    stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
+    return Status::NotFound("blockcache: error reading data");
+  }
+
+  assert(blk_key == key);
+
+  val->reset(new char[blk_val.size()]);
+  memcpy(val->get(), blk_val.data(), blk_val.size());
+  *size = blk_val.size();
+
+  stats_.bytes_read_.Add(*size);
+  stats_.cache_hits_++;
+  stats_.read_hit_latency_.Add(timer.ElapsedNanos() / 1000);
+
+  return Status::OK();
+}
+
+bool BlockCacheTier::Erase(const Slice& key) {
+  WriteLock _(&lock_);
+  BlockInfo* info = metadata_.Remove(key);
+  assert(info);
+  delete info;
+  return true;
+}
+
+Status BlockCacheTier::NewCacheFile() {
+  lock_.AssertHeld();
+
+  TEST_SYNC_POINT_CALLBACK("BlockCacheTier::NewCacheFile:DeleteDir",
+                           (void*)(GetCachePath().c_str()));
+
+  std::unique_ptr<WriteableCacheFile> f(new WriteableCacheFile(
+      opt_.env, &buffer_allocator_, &writer_, GetCachePath(), writer_cache_id_,
+      opt_.cache_file_size, opt_.log));
+
+  bool status = f->Create(opt_.enable_direct_writes, opt_.enable_direct_reads);
+  if (!status) {
+    return Status::IOError("Error creating file");
+  }
+
+  Info(opt_.log, "Created cache file %d", writer_cache_id_);
+
+  writer_cache_id_++;
+  cache_file_ = f.release();
+
+  // insert to cache files tree
+  status = metadata_.Insert(cache_file_);
+  assert(status);
+  if (!status) {
+    Error(opt_.log, "Error inserting to metadata");
+    return Status::IOError("Error inserting to metadata");
+  }
+
+  return Status::OK();
+}
+
+bool BlockCacheTier::Reserve(const size_t size) {
+  WriteLock _(&lock_);
+  assert(size_ <= opt_.cache_size);
+
+  if (size + size_ <= opt_.cache_size) {
+    // there is enough space to write
+    size_ += size;
+    return true;
+  }
+
+  assert(size + size_ >= opt_.cache_size);
+  // there is not enough space to fit the requested data
+  // we can clear some space by evicting cold data
+
+  const double retain_fac = (100 - kEvictPct) / static_cast<double>(100);
+  while (size + size_ > opt_.cache_size * retain_fac) {
+    std::unique_ptr<BlockCacheFile> f(metadata_.Evict());
+    if (!f) {
+      // nothing is evictable
+      return false;
+    }
+    assert(!f->refs_);
+    uint64_t file_size;
+    if (!f->Delete(&file_size).ok()) {
+      // unable to delete file
+      return false;
+    }
+
+    assert(file_size <= size_);
+    size_ -= file_size;
+  }
+
+  size_ += size;
+  assert(size_ <= opt_.cache_size * 0.9);
+  return true;
+}
+
+Status NewPersistentCache(Env* const env, const std::string& path,
+                          const uint64_t size,
+                          const std::shared_ptr<Logger>& log,
+                          const bool optimized_for_nvm,
+                          std::shared_ptr<PersistentCache>* cache) {
+  if (!cache) {
+    return Status::IOError("invalid argument cache");
+  }
+
+  auto opt = PersistentCacheConfig(env, path, size, log);
+  if (optimized_for_nvm) {
+    // the default settings are optimized for SSD
+    // NVM devices are better accessed with 4K direct IO and written with
+    // parallelism
+    opt.enable_direct_writes = true;
+    opt.writer_qdepth = 4;
+    opt.writer_dispatch_size = 4 * 1024;
+  }
+
+  auto pcache = std::make_shared<BlockCacheTier>(opt);
+  Status s = pcache->Open();
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  *cache = pcache;
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ifndef ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier.h
new file mode 100644
index 000000000..1aac287cc
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier.h
@@ -0,0 +1,156 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif  // ! OS_WIN
+
+#include <atomic>
+#include <list>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <thread>
+
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/system_clock.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "utilities/persistent_cache/block_cache_tier_file.h"
+#include "utilities/persistent_cache/block_cache_tier_metadata.h"
+#include "utilities/persistent_cache/persistent_cache_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Block cache tier implementation
+//
+class BlockCacheTier : public PersistentCacheTier {
+ public:
+  explicit BlockCacheTier(const PersistentCacheConfig& opt)
+      : opt_(opt),
+        insert_ops_(static_cast<size_t>(opt_.max_write_pipeline_backlog_size)),
+        buffer_allocator_(opt.write_buffer_size, opt.write_buffer_count()),
+        writer_(this, opt_.writer_qdepth,
+                static_cast<size_t>(opt_.writer_dispatch_size)) {
+    Info(opt_.log, "Initializing allocator. size=%d B count=%" ROCKSDB_PRIszt,
+         opt_.write_buffer_size, opt_.write_buffer_count());
+  }
+
+  virtual ~BlockCacheTier() {
+    // Close is re-entrant so we can call close even if it is already closed
+    Close().PermitUncheckedError();
+    assert(!insert_th_.joinable());
+  }
+
+  Status Insert(const Slice& key, const char* data, const size_t size) override;
+  Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+                size_t* size) override;
+  Status Open() override;
+  Status Close() override;
+  bool Erase(const Slice& key) override;
+  bool Reserve(const size_t size) override;
+
+  bool IsCompressed() override { return opt_.is_compressed; }
+
+  std::string GetPrintableOptions() const override { return opt_.ToString(); }
+
+  PersistentCache::StatsType Stats() override;
+
+  void TEST_Flush() override {
+    while (insert_ops_.Size()) {
+      /* sleep override */
+      SystemClock::Default()->SleepForMicroseconds(1000000);
+    }
+  }
+
+ private:
+  // Percentage of cache to be evicted when the cache is full
+  static const size_t kEvictPct = 10;
+  // Max attempts to insert key, value to cache in pipelined mode
+  static const size_t kMaxRetry = 3;
+
+  // Pipelined operation
+  struct InsertOp {
+    explicit InsertOp(const bool signal) : signal_(signal) {}
+    explicit InsertOp(std::string&& key, const std::string& data)
+        : key_(std::move(key)), data_(data) {}
+    ~InsertOp() {}
+
+    InsertOp() = delete;
+    InsertOp(InsertOp&& /*rhs*/) = default;
+    InsertOp& operator=(InsertOp&& rhs) = default;
+
+    // used for estimating size by bounded queue
+    size_t Size() { return data_.size() + key_.size(); }
+
+    std::string key_;
+    std::string data_;
+    bool signal_ = false;  // signal to request processing thread to exit
+  };
+
+  // entry point for insert thread
+  void InsertMain();
+  // insert implementation
+  Status InsertImpl(const Slice& key, const Slice& data);
+  // Create a new cache file
+  Status NewCacheFile();
+  // Get cache directory path
+  std::string GetCachePath() const { return opt_.path + "/cache"; }
+  // Cleanup folder
+  Status CleanupCacheFolder(const std::string& folder);
+
+  // Statistics
+  struct Statistics {
+    HistogramImpl bytes_pipelined_;
+    HistogramImpl bytes_written_;
+    HistogramImpl bytes_read_;
+    HistogramImpl read_hit_latency_;
+    HistogramImpl read_miss_latency_;
+    HistogramImpl write_latency_;
+    std::atomic<uint64_t> cache_hits_{0};
+    std::atomic<uint64_t> cache_misses_{0};
+    std::atomic<uint64_t> cache_errors_{0};
+    std::atomic<uint64_t> insert_dropped_{0};
+
+    double CacheHitPct() const {
+      const auto lookups = cache_hits_ + cache_misses_;
+      return lookups ? 100 * cache_hits_ / static_cast<double>(lookups) : 0.0;
+    }
+
+    double CacheMissPct() const {
+      const auto lookups = cache_hits_ + cache_misses_;
+      return lookups ? 100 * cache_misses_ / static_cast<double>(lookups) : 0.0;
+    }
+  };
+
+  port::RWMutex lock_;                          // Synchronization
+  const PersistentCacheConfig opt_;             // BlockCache options
+  BoundedQueue<InsertOp> insert_ops_;           // Ops waiting for insert
+  ROCKSDB_NAMESPACE::port::Thread insert_th_;   // Insert thread
+  uint32_t writer_cache_id_ = 0;                // Current cache file identifier
+  WriteableCacheFile* cache_file_ = nullptr;    // Current cache file reference
+  CacheWriteBufferAllocator buffer_allocator_;  // Buffer provider
+  ThreadedWriter writer_;                       // Writer threads
+  BlockCacheTierMetadata metadata_;             // Cache meta data manager
+  std::atomic<uint64_t> size_{0};               // Size of the cache
+  Statistics stats_;                            // Statistics
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc
new file mode 100644
index 000000000..f4f8517ab
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc
@@ -0,0 +1,610 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/block_cache_tier_file.h"
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "env/composite_env_wrapper.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// File creation factories
+//
+Status NewWritableCacheFile(Env* const env, const std::string& filepath,
+                            std::unique_ptr<WritableFile>* file,
+                            const bool use_direct_writes = false) {
+  EnvOptions opt;
+  opt.use_direct_writes = use_direct_writes;
+  Status s = env->NewWritableFile(filepath, file, opt);
+  return s;
+}
+
+Status NewRandomAccessCacheFile(const std::shared_ptr<FileSystem>& fs,
+                                const std::string& filepath,
+                                std::unique_ptr<FSRandomAccessFile>* file,
+                                const bool use_direct_reads = true) {
+  assert(fs.get());
+
+  FileOptions opt;
+  opt.use_direct_reads = use_direct_reads;
+  return fs->NewRandomAccessFile(filepath, opt, file, nullptr);
+}
+
+//
+// BlockCacheFile
+//
+Status BlockCacheFile::Delete(uint64_t* size) {
+  assert(env_);
+
+  Status status = env_->GetFileSize(Path(), size);
+  if (!status.ok()) {
+    return status;
+  }
+  return env_->DeleteFile(Path());
+}
+
+//
+// CacheRecord
+//
+// Cache record represents the record on disk
+//
+// +--------+---------+----------+------------+---------------+-------------+
+// | magic  | crc     | key size | value size | key data      | value data  |
+// +--------+---------+----------+------------+---------------+-------------+
+// <-- 4 --><-- 4  --><-- 4   --><-- 4     --><-- key size  --><-- v-size -->
+//
+struct CacheRecordHeader {
+  CacheRecordHeader() : magic_(0), crc_(0), key_size_(0), val_size_(0) {}
+  CacheRecordHeader(const uint32_t magic, const uint32_t key_size,
+                    const uint32_t val_size)
+      : magic_(magic), crc_(0), key_size_(key_size), val_size_(val_size) {}
+
+  uint32_t magic_;
+  uint32_t crc_;
+  uint32_t key_size_;
+  uint32_t val_size_;
+};
+
+struct CacheRecord {
+  CacheRecord() {}
+  CacheRecord(const Slice& key, const Slice& val)
+      : hdr_(MAGIC, static_cast<uint32_t>(key.size()),
+             static_cast<uint32_t>(val.size())),
+        key_(key),
+        val_(val) {
+    hdr_.crc_ = ComputeCRC();
+  }
+
+  uint32_t ComputeCRC() const;
+  bool Serialize(std::vector<CacheWriteBuffer*>* bufs, size_t* woff);
+  bool Deserialize(const Slice& buf);
+
+  static uint32_t CalcSize(const Slice& key, const Slice& val) {
+    return static_cast<uint32_t>(sizeof(CacheRecordHeader) + key.size() +
+                                 val.size());
+  }
+
+  static const uint32_t MAGIC = 0xfefa;
+
+  bool Append(std::vector<CacheWriteBuffer*>* bufs, size_t* woff,
+              const char* data, const size_t size);
+
+  CacheRecordHeader hdr_;
+  Slice key_;
+  Slice val_;
+};
+
+static_assert(sizeof(CacheRecordHeader) == 16, "DataHeader is not aligned");
+
+uint32_t CacheRecord::ComputeCRC() const {
+  uint32_t crc = 0;
+  CacheRecordHeader tmp = hdr_;
+  tmp.crc_ = 0;
+  crc = crc32c::Extend(crc, reinterpret_cast<const char*>(&tmp), sizeof(tmp));
+  crc = crc32c::Extend(crc, reinterpret_cast<const char*>(key_.data()),
+                       key_.size());
+  crc = crc32c::Extend(crc, reinterpret_cast<const char*>(val_.data()),
+                       val_.size());
+  return crc;
+}
+
+bool CacheRecord::Serialize(std::vector<CacheWriteBuffer*>* bufs,
+                            size_t* woff) {
+  assert(bufs->size());
+  return Append(bufs, woff, reinterpret_cast<const char*>(&hdr_),
+                sizeof(hdr_)) &&
+         Append(bufs, woff, reinterpret_cast<const char*>(key_.data()),
+                key_.size()) &&
+         Append(bufs, woff, reinterpret_cast<const char*>(val_.data()),
+                val_.size());
+}
+
+bool CacheRecord::Append(std::vector<CacheWriteBuffer*>* bufs, size_t* woff,
+                         const char* data, const size_t data_size) {
+  assert(*woff < bufs->size());
+
+  const char* p = data;
+  size_t size = data_size;
+
+  while (size && *woff < bufs->size()) {
+    CacheWriteBuffer* buf = (*bufs)[*woff];
+    const size_t free = buf->Free();
+    if (size <= free) {
+      buf->Append(p, size);
+      size = 0;
+    } else {
+      buf->Append(p, free);
+      p += free;
+      size -= free;
+      assert(!buf->Free());
+      assert(buf->Used() == buf->Capacity());
+    }
+
+    if (!buf->Free()) {
+      *woff += 1;
+    }
+  }
+
+  assert(!size);
+
+  return !size;
+}
+
+bool CacheRecord::Deserialize(const Slice& data) {
+  assert(data.size() >= sizeof(CacheRecordHeader));
+  if (data.size() < sizeof(CacheRecordHeader)) {
+    return false;
+  }
+
+  memcpy(&hdr_, data.data(), sizeof(hdr_));
+
+  assert(hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) == data.size());
+  if (hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) != data.size()) {
+    return false;
+  }
+
+  key_ = Slice(data.data_ + sizeof(hdr_), hdr_.key_size_);
+  val_ = Slice(key_.data_ + hdr_.key_size_, hdr_.val_size_);
+
+  if (!(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_)) {
+    fprintf(stderr, "** magic %d ** \n", hdr_.magic_);
+    fprintf(stderr, "** key_size %d ** \n", hdr_.key_size_);
+    fprintf(stderr, "** val_size %d ** \n", hdr_.val_size_);
+    fprintf(stderr, "** key %s ** \n", key_.ToString().c_str());
+    fprintf(stderr, "** val %s ** \n", val_.ToString().c_str());
+    for (size_t i = 0; i < hdr_.val_size_; ++i) {
+      fprintf(stderr, "%d.", (uint8_t)val_.data()[i]);
+    }
+    fprintf(stderr, "\n** cksum %d != %d **", hdr_.crc_, ComputeCRC());
+  }
+
+  assert(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_);
+  return hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_;
+}
+
+//
+// RandomAccessFile
+//
+
+bool RandomAccessCacheFile::Open(const bool enable_direct_reads) {
+  WriteLock _(&rwlock_);
+  return OpenImpl(enable_direct_reads);
+}
+
+bool RandomAccessCacheFile::OpenImpl(const bool enable_direct_reads) {
+  rwlock_.AssertHeld();
+
+  ROCKS_LOG_DEBUG(log_, "Opening cache file %s", Path().c_str());
+  assert(env_);
+
+  std::unique_ptr<FSRandomAccessFile> file;
+  Status status = NewRandomAccessCacheFile(env_->GetFileSystem(), Path(), &file,
+                                           enable_direct_reads);
+  if (!status.ok()) {
+    Error(log_, "Error opening random access file %s. %s", Path().c_str(),
+          status.ToString().c_str());
+    return false;
+  }
+  freader_.reset(new RandomAccessFileReader(std::move(file), Path(),
+                                            env_->GetSystemClock().get()));
+
+  return true;
+}
+
+bool RandomAccessCacheFile::Read(const LBA& lba, Slice* key, Slice* val,
+                                 char* scratch) {
+  ReadLock _(&rwlock_);
+
+  assert(lba.cache_id_ == cache_id_);
+
+  if (!freader_) {
+    return false;
+  }
+
+  Slice result;
+  Status s = freader_->Read(IOOptions(), lba.off_, lba.size_, &result, scratch,
+                            nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+  if (!s.ok()) {
+    Error(log_, "Error reading from file %s. %s", Path().c_str(),
+          s.ToString().c_str());
+    return false;
+  }
+
+  assert(result.data() == scratch);
+
+  return ParseRec(lba, key, val, scratch);
+}
+
+bool RandomAccessCacheFile::ParseRec(const LBA& lba, Slice* key, Slice* val,
+                                     char* scratch) {
+  Slice data(scratch, lba.size_);
+
+  CacheRecord rec;
+  if (!rec.Deserialize(data)) {
+    assert(!"Error deserializing data");
+    Error(log_, "Error de-serializing record from file %s off %d",
+          Path().c_str(), lba.off_);
+    return false;
+  }
+
+  *key = Slice(rec.key_);
+  *val = Slice(rec.val_);
+
+  return true;
+}
+
+//
+// WriteableCacheFile
+//
+
+WriteableCacheFile::~WriteableCacheFile() {
+  WriteLock _(&rwlock_);
+  if (!eof_) {
+    // This file never flushed. We give priority to shutdown since this is a
+    // cache
+    // TODO(krad): Figure a way to flush the pending data
+    if (file_) {
+      assert(refs_ == 1);
+      --refs_;
+    }
+  }
+  assert(!refs_);
+  ClearBuffers();
+}
+
+bool WriteableCacheFile::Create(const bool /*enable_direct_writes*/,
+                                const bool enable_direct_reads) {
+  WriteLock _(&rwlock_);
+
+  enable_direct_reads_ = enable_direct_reads;
+
+  ROCKS_LOG_DEBUG(log_, "Creating new cache %s (max size is %d B)",
+                  Path().c_str(), max_size_);
+
+  assert(env_);
+
+  Status s = env_->FileExists(Path());
+  if (s.ok()) {
+    ROCKS_LOG_WARN(log_, "File %s already exists. %s", Path().c_str(),
+                   s.ToString().c_str());
+  }
+
+  s = NewWritableCacheFile(env_, Path(), &file_);
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(log_, "Unable to create file %s. %s", Path().c_str(),
+                   s.ToString().c_str());
+    return false;
+  }
+
+  assert(!refs_);
+  ++refs_;
+
+  return true;
+}
+
+bool WriteableCacheFile::Append(const Slice& key, const Slice& val, LBA* lba) {
+  WriteLock _(&rwlock_);
+
+  if (eof_) {
+    // We can't append since the file is full
+    return false;
+  }
+
+  // estimate the space required to store the (key, val)
+  uint32_t rec_size = CacheRecord::CalcSize(key, val);
+
+  if (!ExpandBuffer(rec_size)) {
+    // unable to expand the buffer
+    ROCKS_LOG_DEBUG(log_, "Error expanding buffers. size=%d", rec_size);
+    return false;
+  }
+
+  lba->cache_id_ = cache_id_;
+  lba->off_ = disk_woff_;
+  lba->size_ = rec_size;
+
+  CacheRecord rec(key, val);
+  if (!rec.Serialize(&bufs_, &buf_woff_)) {
+    // unexpected error: unable to serialize the data
+    assert(!"Error serializing record");
+    return false;
+  }
+
+  disk_woff_ += rec_size;
+  eof_ = disk_woff_ >= max_size_;
+
+  // dispatch buffer for flush
+  DispatchBuffer();
+
+  return true;
+}
+
+bool WriteableCacheFile::ExpandBuffer(const size_t size) {
+  rwlock_.AssertHeld();
+  assert(!eof_);
+
+  // determine if there is enough space
+  size_t free = 0;  // compute the free space left in buffer
+  for (size_t i = buf_woff_; i < bufs_.size(); ++i) {
+    free += bufs_[i]->Free();
+    if (size <= free) {
+      // we have enough space in the buffer
+      return true;
+    }
+  }
+
+  // expand the buffer until there is enough space to write `size` bytes
+  assert(free < size);
+  assert(alloc_);
+
+  while (free < size) {
+    CacheWriteBuffer* const buf = alloc_->Allocate();
+    if (!buf) {
+      ROCKS_LOG_DEBUG(log_, "Unable to allocate buffers");
+      return false;
+    }
+
+    size_ += static_cast<uint32_t>(buf->Free());
+    free += buf->Free();
+    bufs_.push_back(buf);
+  }
+
+  assert(free >= size);
+  return true;
+}
+
+void WriteableCacheFile::DispatchBuffer() {
+  rwlock_.AssertHeld();
+
+  assert(bufs_.size());
+  assert(buf_doff_ <= buf_woff_);
+  assert(buf_woff_ <= bufs_.size());
+
+  if (pending_ios_) {
+    return;
+  }
+
+  if (!eof_ && buf_doff_ == buf_woff_) {
+    // dispatch buffer is pointing to write buffer and we haven't hit eof
+    return;
+  }
+
+  assert(eof_ || buf_doff_ < buf_woff_);
+  assert(buf_doff_ < bufs_.size());
+  assert(file_);
+  assert(alloc_);
+
+  auto* buf = bufs_[buf_doff_];
+  const uint64_t file_off = buf_doff_ * alloc_->BufferSize();
+
+  assert(!buf->Free() ||
+         (eof_ && buf_doff_ == buf_woff_ && buf_woff_ < bufs_.size()));
+  // we have reached end of file, and there is space in the last buffer
+  // pad it with zero for direct IO
+  buf->FillTrailingZeros();
+
+  assert(buf->Used() % kFileAlignmentSize == 0);
+
+  writer_->Write(file_.get(), buf, file_off,
+                 std::bind(&WriteableCacheFile::BufferWriteDone, this));
+  pending_ios_++;
+  buf_doff_++;
+}
+
+void WriteableCacheFile::BufferWriteDone() {
+  WriteLock _(&rwlock_);
+
+  assert(bufs_.size());
+
+  pending_ios_--;
+
+  if (buf_doff_ < bufs_.size()) {
+    DispatchBuffer();
+  }
+
+  if (eof_ && buf_doff_ >= bufs_.size() && !pending_ios_) {
+    // end-of-file reached, move to read mode
+    CloseAndOpenForReading();
+  }
+}
+
+void WriteableCacheFile::CloseAndOpenForReading() {
+  // Our env abstraction do not allow reading from a file opened for appending
+  // We need close the file and re-open it for reading
+  Close();
+  RandomAccessCacheFile::OpenImpl(enable_direct_reads_);
+}
+
+bool WriteableCacheFile::ReadBuffer(const LBA& lba, Slice* key, Slice* block,
+                                    char* scratch) {
+  rwlock_.AssertHeld();
+
+  if (!ReadBuffer(lba, scratch)) {
+    Error(log_, "Error reading from buffer. cache=%d off=%d", cache_id_,
+          lba.off_);
+    return false;
+  }
+
+  return ParseRec(lba, key, block, scratch);
+}
+
+bool WriteableCacheFile::ReadBuffer(const LBA& lba, char* data) {
+  rwlock_.AssertHeld();
+
+  assert(lba.off_ < disk_woff_);
+  assert(alloc_);
+
+  // we read from the buffers like reading from a flat file. The list of buffers
+  // are treated as contiguous stream of data
+
+  char* tmp = data;
+  size_t pending_nbytes = lba.size_;
+  // start buffer
+  size_t start_idx = lba.off_ / alloc_->BufferSize();
+  // offset into the start buffer
+  size_t start_off = lba.off_ % alloc_->BufferSize();
+
+  assert(start_idx <= buf_woff_);
+
+  for (size_t i = start_idx; pending_nbytes && i < bufs_.size(); ++i) {
+    assert(i <= buf_woff_);
+    auto* buf = bufs_[i];
+    assert(i == buf_woff_ || !buf->Free());
+    // bytes to write to the buffer
+    size_t nbytes = pending_nbytes > (buf->Used() - start_off)
+                        ? (buf->Used() - start_off)
+                        : pending_nbytes;
+    memcpy(tmp, buf->Data() + start_off, nbytes);
+
+    // left over to be written
+    pending_nbytes -= nbytes;
+    start_off = 0;
+    tmp += nbytes;
+  }
+
+  assert(!pending_nbytes);
+  if (pending_nbytes) {
+    return false;
+  }
+
+  assert(tmp == data + lba.size_);
+  return true;
+}
+
+void WriteableCacheFile::Close() {
+  rwlock_.AssertHeld();
+
+  assert(size_ >= max_size_);
+  assert(disk_woff_ >= max_size_);
+  assert(buf_doff_ == bufs_.size());
+  assert(bufs_.size() - buf_woff_ <= 1);
+  assert(!pending_ios_);
+
+  Info(log_, "Closing file %s. size=%d written=%d", Path().c_str(), size_,
+       disk_woff_);
+
+  ClearBuffers();
+  file_.reset();
+
+  assert(refs_);
+  --refs_;
+}
+
+void WriteableCacheFile::ClearBuffers() {
+  assert(alloc_);
+
+  for (size_t i = 0; i < bufs_.size(); ++i) {
+    alloc_->Deallocate(bufs_[i]);
+  }
+
+  bufs_.clear();
+}
+
+//
+// ThreadedFileWriter implementation
+//
+ThreadedWriter::ThreadedWriter(PersistentCacheTier* const cache,
+                               const size_t qdepth, const size_t io_size)
+    : Writer(cache), io_size_(io_size) {
+  for (size_t i = 0; i < qdepth; ++i) {
+    port::Thread th(&ThreadedWriter::ThreadMain, this);
+    threads_.push_back(std::move(th));
+  }
+}
+
+void ThreadedWriter::Stop() {
+  // notify all threads to exit
+  for (size_t i = 0; i < threads_.size(); ++i) {
+    q_.Push(IO(/*signal=*/true));
+  }
+
+  // wait for all threads to exit
+  for (auto& th : threads_) {
+    th.join();
+    assert(!th.joinable());
+  }
+  threads_.clear();
+}
+
+void ThreadedWriter::Write(WritableFile* const file, CacheWriteBuffer* buf,
+                           const uint64_t file_off,
+                           const std::function<void()> callback) {
+  q_.Push(IO(file, buf, file_off, callback));
+}
+
+void ThreadedWriter::ThreadMain() {
+  while (true) {
+    // Fetch the IO to process
+    IO io(q_.Pop());
+    if (io.signal_) {
+      // that's secret signal to exit
+      break;
+    }
+
+    // Reserve space for writing the buffer
+    while (!cache_->Reserve(io.buf_->Used())) {
+      // We can fail to reserve space if every file in the system
+      // is being currently accessed
+      /* sleep override */
+      SystemClock::Default()->SleepForMicroseconds(1000000);
+    }
+
+    DispatchIO(io);
+
+    io.callback_();
+  }
+}
+
+void ThreadedWriter::DispatchIO(const IO& io) {
+  size_t written = 0;
+  while (written < io.buf_->Used()) {
+    Slice data(io.buf_->Data() + written, io_size_);
+    Status s = io.file_->Append(data);
+    assert(s.ok());
+    if (!s.ok()) {
+      // That is definite IO error to device. There is not much we can
+      // do but ignore the failure. This can lead to corruption of data on
+      // disk, but the cache will skip while reading
+      fprintf(stderr, "Error writing data to file. %s\n", s.ToString().c_str());
+    }
+    written += io_size_;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h
new file mode 100644
index 000000000..1d265ab74
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file.h
@@ -0,0 +1,293 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <list>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "util/crc32c.h"
+#include "util/mutexlock.h"
+#include "utilities/persistent_cache/block_cache_tier_file_buffer.h"
+#include "utilities/persistent_cache/lrulist.h"
+#include "utilities/persistent_cache/persistent_cache_tier.h"
+#include "utilities/persistent_cache/persistent_cache_util.h"
+
+// The io code path of persistent cache uses pipelined architecture
+//
+// client -> In Queue <-- BlockCacheTier --> Out Queue <-- Writer <--> Kernel
+//
+// This would enable the system to scale for GB/s of throughput which is
+// expected with modern devies like NVM.
+//
+// The file level operations are encapsulated in the following abstractions
+//
+// BlockCacheFile
+//       ^
+//       |
+//       |
+// RandomAccessCacheFile (For reading)
+//       ^
+//       |
+//       |
+// WriteableCacheFile (For writing)
+//
+// Write IO code path :
+//
+namespace ROCKSDB_NAMESPACE {
+
+class WriteableCacheFile;
+struct BlockInfo;
+
+// Represents a logical record on device
+//
+// (L)ogical (B)lock (Address = { cache-file-id, offset, size }
+struct LogicalBlockAddress {
+  LogicalBlockAddress() {}
+  explicit LogicalBlockAddress(const uint32_t cache_id, const uint32_t off,
+                               const uint16_t size)
+      : cache_id_(cache_id), off_(off), size_(size) {}
+
+  uint32_t cache_id_ = 0;
+  uint32_t off_ = 0;
+  uint32_t size_ = 0;
+};
+
+using LBA = LogicalBlockAddress;
+
+// class Writer
+//
+// Writer is the abstraction used for writing data to file. The component can be
+// multithreaded. It is the last step of write pipeline
+class Writer {
+ public:
+  explicit Writer(PersistentCacheTier* const cache) : cache_(cache) {}
+  virtual ~Writer() {}
+
+  // write buffer to file at the given offset
+  virtual void Write(WritableFile* const file, CacheWriteBuffer* buf,
+                     const uint64_t file_off,
+                     const std::function<void()> callback) = 0;
+  // stop the writer
+  virtual void Stop() = 0;
+
+  PersistentCacheTier* const cache_;
+};
+
+// class BlockCacheFile
+//
+// Generic interface to support building file specialized for read/writing
+class BlockCacheFile : public LRUElement<BlockCacheFile> {
+ public:
+  explicit BlockCacheFile(const uint32_t cache_id)
+      : LRUElement<BlockCacheFile>(), cache_id_(cache_id) {}
+
+  explicit BlockCacheFile(Env* const env, const std::string& dir,
+                          const uint32_t cache_id)
+      : LRUElement<BlockCacheFile>(),
+        env_(env),
+        dir_(dir),
+        cache_id_(cache_id) {}
+
+  virtual ~BlockCacheFile() {}
+
+  // append key/value to file and return LBA locator to user
+  virtual bool Append(const Slice& /*key*/, const Slice& /*val*/,
+                      LBA* const /*lba*/) {
+    assert(!"not implemented");
+    return false;
+  }
+
+  // read from the record locator (LBA) and return key, value and status
+  virtual bool Read(const LBA& /*lba*/, Slice* /*key*/, Slice* /*block*/,
+                    char* /*scratch*/) {
+    assert(!"not implemented");
+    return false;
+  }
+
+  // get file path
+  std::string Path() const {
+    return dir_ + "/" + std::to_string(cache_id_) + ".rc";
+  }
+  // get cache ID
+  uint32_t cacheid() const { return cache_id_; }
+  // Add block information to file data
+  // Block information is the list of index reference for this file
+  virtual void Add(BlockInfo* binfo) {
+    WriteLock _(&rwlock_);
+    block_infos_.push_back(binfo);
+  }
+  // get block information
+  std::list<BlockInfo*>& block_infos() { return block_infos_; }
+  // delete file and return the size of the file
+  virtual Status Delete(uint64_t* size);
+
+ protected:
+  port::RWMutex rwlock_;               // synchronization mutex
+  Env* const env_ = nullptr;           // Env for OS
+  const std::string dir_;              // Directory name
+  const uint32_t cache_id_;            // Cache id for the file
+  std::list<BlockInfo*> block_infos_;  // List of index entries mapping to the
+                                       // file content
+};
+
+// class RandomAccessFile
+//
+// Thread safe implementation for reading random data from file
+class RandomAccessCacheFile : public BlockCacheFile {
+ public:
+  explicit RandomAccessCacheFile(Env* const env, const std::string& dir,
+                                 const uint32_t cache_id,
+                                 const std::shared_ptr<Logger>& log)
+      : BlockCacheFile(env, dir, cache_id), log_(log) {}
+
+  virtual ~RandomAccessCacheFile() {}
+
+  // open file for reading
+  bool Open(const bool enable_direct_reads);
+  // read data from the disk
+  bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override;
+
+ private:
+  std::unique_ptr<RandomAccessFileReader> freader_;
+
+ protected:
+  bool OpenImpl(const bool enable_direct_reads);
+  bool ParseRec(const LBA& lba, Slice* key, Slice* val, char* scratch);
+
+  std::shared_ptr<Logger> log_;  // log file
+};
+
+// class WriteableCacheFile
+//
+// All writes to the files are cached in buffers. The buffers are flushed to
+// disk as they get filled up. When file size reaches a certain size, a new file
+// will be created provided there is free space
+class WriteableCacheFile : public RandomAccessCacheFile {
+ public:
+  explicit WriteableCacheFile(Env* const env, CacheWriteBufferAllocator* alloc,
+                              Writer* writer, const std::string& dir,
+                              const uint32_t cache_id, const uint32_t max_size,
+                              const std::shared_ptr<Logger>& log)
+      : RandomAccessCacheFile(env, dir, cache_id, log),
+        alloc_(alloc),
+        writer_(writer),
+        max_size_(max_size) {}
+
+  virtual ~WriteableCacheFile();
+
+  // create file on disk
+  bool Create(const bool enable_direct_writes, const bool enable_direct_reads);
+
+  // read data from logical file
+  bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override {
+    ReadLock _(&rwlock_);
+    const bool closed = eof_ && bufs_.empty();
+    if (closed) {
+      // the file is closed, read from disk
+      return RandomAccessCacheFile::Read(lba, key, block, scratch);
+    }
+    // file is still being written, read from buffers
+    return ReadBuffer(lba, key, block, scratch);
+  }
+
+  // append data to end of file
+  bool Append(const Slice&, const Slice&, LBA* const) override;
+  // End-of-file
+  bool Eof() const { return eof_; }
+
+ private:
+  friend class ThreadedWriter;
+
+  static const size_t kFileAlignmentSize = 4 * 1024;  // align file size
+
+  bool ReadBuffer(const LBA& lba, Slice* key, Slice* block, char* scratch);
+  bool ReadBuffer(const LBA& lba, char* data);
+  bool ExpandBuffer(const size_t size);
+  void DispatchBuffer();
+  void BufferWriteDone();
+  void CloseAndOpenForReading();
+  void ClearBuffers();
+  void Close();
+
+  // File layout in memory
+  //
+  // +------+------+------+------+------+------+
+  // | b0   | b1   | b2   | b3   | b4   | b5   |
+  // +------+------+------+------+------+------+
+  //        ^                           ^
+  //        |                           |
+  //      buf_doff_                   buf_woff_
+  //   (next buffer to           (next buffer to fill)
+  //   flush to disk)
+  //
+  //  The buffers are flushed to disk serially for a given file
+
+  CacheWriteBufferAllocator* const alloc_ = nullptr;  // Buffer provider
+  Writer* const writer_ = nullptr;                    // File writer thread
+  std::unique_ptr<WritableFile> file_;   // RocksDB Env file abstraction
+  std::vector<CacheWriteBuffer*> bufs_;  // Written buffers
+  uint32_t size_ = 0;                    // Size of the file
+  const uint32_t max_size_;              // Max size of the file
+  bool eof_ = false;                     // End of file
+  uint32_t disk_woff_ = 0;               // Offset to write on disk
+  size_t buf_woff_ = 0;                  // off into bufs_ to write
+  size_t buf_doff_ = 0;                  // off into bufs_ to dispatch
+  size_t pending_ios_ = 0;               // Number of ios to disk in-progress
+  bool enable_direct_reads_ = false;     // Should we enable direct reads
+                                         // when reading from disk
+};
+
+//
+// Abstraction to do writing to device. It is part of pipelined architecture.
+//
+class ThreadedWriter : public Writer {
+ public:
+  // Representation of IO to device
+  struct IO {
+    explicit IO(const bool signal) : signal_(signal) {}
+    explicit IO(WritableFile* const file, CacheWriteBuffer* const buf,
+                const uint64_t file_off, const std::function<void()> callback)
+        : file_(file), buf_(buf), file_off_(file_off), callback_(callback) {}
+
+    IO(const IO&) = default;
+    IO& operator=(const IO&) = default;
+    size_t Size() const { return sizeof(IO); }
+
+    WritableFile* file_ = nullptr;     // File to write to
+    CacheWriteBuffer* buf_ = nullptr;  // buffer to write
+    uint64_t file_off_ = 0;            // file offset
+    bool signal_ = false;              // signal to exit thread loop
+    std::function<void()> callback_;   // Callback on completion
+  };
+
+  explicit ThreadedWriter(PersistentCacheTier* const cache, const size_t qdepth,
+                          const size_t io_size);
+  virtual ~ThreadedWriter() { assert(threads_.empty()); }
+
+  void Stop() override;
+  void Write(WritableFile* const file, CacheWriteBuffer* buf,
+             const uint64_t file_off,
+             const std::function<void()> callback) override;
+
+ private:
+  void ThreadMain();
+  void DispatchIO(const IO& io);
+
+  const size_t io_size_ = 0;
+  BoundedQueue<IO> q_;
+  std::vector<port::Thread> threads_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h
new file mode 100644
index 000000000..d4f02455a
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_file_buffer.h
@@ -0,0 +1,127 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <list>
+#include <memory>
+#include <string>
+
+#include "memory/arena.h"
+#include "rocksdb/comparator.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// CacheWriteBuffer
+//
+// Buffer abstraction that can be manipulated via append
+// (not thread safe)
+class CacheWriteBuffer {
+ public:
+  explicit CacheWriteBuffer(const size_t size) : size_(size), pos_(0) {
+    buf_.reset(new char[size_]);
+    assert(!pos_);
+    assert(size_);
+  }
+
+  virtual ~CacheWriteBuffer() {}
+
+  void Append(const char* buf, const size_t size) {
+    assert(pos_ + size <= size_);
+    memcpy(buf_.get() + pos_, buf, size);
+    pos_ += size;
+    assert(pos_ <= size_);
+  }
+
+  void FillTrailingZeros() {
+    assert(pos_ <= size_);
+    memset(buf_.get() + pos_, '0', size_ - pos_);
+    pos_ = size_;
+  }
+
+  void Reset() { pos_ = 0; }
+  size_t Free() const { return size_ - pos_; }
+  size_t Capacity() const { return size_; }
+  size_t Used() const { return pos_; }
+  char* Data() const { return buf_.get(); }
+
+ private:
+  std::unique_ptr<char[]> buf_;
+  const size_t size_;
+  size_t pos_;
+};
+
+//
+// CacheWriteBufferAllocator
+//
+// Buffer pool abstraction(not thread safe)
+//
+class CacheWriteBufferAllocator {
+ public:
+  explicit CacheWriteBufferAllocator(const size_t buffer_size,
+                                     const size_t buffer_count)
+      : cond_empty_(&lock_), buffer_size_(buffer_size) {
+    MutexLock _(&lock_);
+    buffer_size_ = buffer_size;
+    for (uint32_t i = 0; i < buffer_count; i++) {
+      auto* buf = new CacheWriteBuffer(buffer_size_);
+      assert(buf);
+      if (buf) {
+        bufs_.push_back(buf);
+        cond_empty_.Signal();
+      }
+    }
+  }
+
+  virtual ~CacheWriteBufferAllocator() {
+    MutexLock _(&lock_);
+    assert(bufs_.size() * buffer_size_ == Capacity());
+    for (auto* buf : bufs_) {
+      delete buf;
+    }
+    bufs_.clear();
+  }
+
+  CacheWriteBuffer* Allocate() {
+    MutexLock _(&lock_);
+    if (bufs_.empty()) {
+      return nullptr;
+    }
+
+    assert(!bufs_.empty());
+    CacheWriteBuffer* const buf = bufs_.front();
+    bufs_.pop_front();
+    return buf;
+  }
+
+  void Deallocate(CacheWriteBuffer* const buf) {
+    assert(buf);
+    MutexLock _(&lock_);
+    buf->Reset();
+    bufs_.push_back(buf);
+    cond_empty_.Signal();
+  }
+
+  void WaitUntilUsable() {
+    // We are asked to wait till we have buffers available
+    MutexLock _(&lock_);
+    while (bufs_.empty()) {
+      cond_empty_.Wait();
+    }
+  }
+
+  size_t Capacity() const { return bufs_.size() * buffer_size_; }
+  size_t Free() const { return bufs_.size() * buffer_size_; }
+  size_t BufferSize() const { return buffer_size_; }
+
+ private:
+  port::Mutex lock_;                   // Sync lock
+  port::CondVar cond_empty_;           // Condition var for empty buffers
+  size_t buffer_size_;                 // Size of each buffer
+  std::list<CacheWriteBuffer*> bufs_;  // Buffer stash
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc
new file mode 100644
index 000000000..d73b5d0b4
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.cc
@@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/block_cache_tier_metadata.h"
+
+#include <functional>
+
+namespace ROCKSDB_NAMESPACE {
+
+bool BlockCacheTierMetadata::Insert(BlockCacheFile* file) {
+  return cache_file_index_.Insert(file);
+}
+
+BlockCacheFile* BlockCacheTierMetadata::Lookup(const uint32_t cache_id) {
+  BlockCacheFile* ret = nullptr;
+  BlockCacheFile lookup_key(cache_id);
+  bool ok = cache_file_index_.Find(&lookup_key, &ret);
+  if (ok) {
+    assert(ret->refs_);
+    return ret;
+  }
+  return nullptr;
+}
+
+BlockCacheFile* BlockCacheTierMetadata::Evict() {
+  using std::placeholders::_1;
+  auto fn = std::bind(&BlockCacheTierMetadata::RemoveAllKeys, this, _1);
+  return cache_file_index_.Evict(fn);
+}
+
+void BlockCacheTierMetadata::Clear() {
+  cache_file_index_.Clear([](BlockCacheFile* arg) { delete arg; });
+  block_index_.Clear([](BlockInfo* arg) { delete arg; });
+}
+
+BlockInfo* BlockCacheTierMetadata::Insert(const Slice& key, const LBA& lba) {
+  std::unique_ptr<BlockInfo> binfo(new BlockInfo(key, lba));
+  if (!block_index_.Insert(binfo.get())) {
+    return nullptr;
+  }
+  return binfo.release();
+}
+
+bool BlockCacheTierMetadata::Lookup(const Slice& key, LBA* lba) {
+  BlockInfo lookup_key(key);
+  BlockInfo* block;
+  port::RWMutex* rlock = nullptr;
+  if (!block_index_.Find(&lookup_key, &block, &rlock)) {
+    return false;
+  }
+
+  ReadUnlock _(rlock);
+  assert(block->key_ == key.ToString());
+  if (lba) {
+    *lba = block->lba_;
+  }
+  return true;
+}
+
+BlockInfo* BlockCacheTierMetadata::Remove(const Slice& key) {
+  BlockInfo lookup_key(key);
+  BlockInfo* binfo = nullptr;
+  bool ok __attribute__((__unused__));
+  ok = block_index_.Erase(&lookup_key, &binfo);
+  assert(ok);
+  return binfo;
+}
+
+void BlockCacheTierMetadata::RemoveAllKeys(BlockCacheFile* f) {
+  for (BlockInfo* binfo : f->block_infos()) {
+    BlockInfo* tmp = nullptr;
+    bool status = block_index_.Erase(binfo, &tmp);
+    (void)status;
+    assert(status);
+    assert(tmp == binfo);
+    delete binfo;
+  }
+  f->block_infos().clear();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h
new file mode 100644
index 000000000..2fcd50105
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h
@@ -0,0 +1,124 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/slice.h"
+#include "utilities/persistent_cache/block_cache_tier_file.h"
+#include "utilities/persistent_cache/hash_table.h"
+#include "utilities/persistent_cache/hash_table_evictable.h"
+#include "utilities/persistent_cache/lrulist.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Block Cache Tier Metadata
+//
+// The BlockCacheTierMetadata holds all the metadata associated with block
+// cache. It
+// fundamentally contains 2 indexes and an LRU.
+//
+// Block Cache Index
+//
+// This is a forward index that maps a given key to a LBA (Logical Block
+// Address). LBA is a disk pointer that points to a record on the cache.
+//
+// LBA = { cache-id, offset, size }
+//
+// Cache File Index
+//
+// This is a forward index that maps a given cache-id to a cache file object.
+// Typically you would lookup using LBA and use the object to read or write
+struct BlockInfo {
+  explicit BlockInfo(const Slice& key, const LBA& lba = LBA())
+      : key_(key.ToString()), lba_(lba) {}
+
+  std::string key_;
+  LBA lba_;
+};
+
+class BlockCacheTierMetadata {
+ public:
+  explicit BlockCacheTierMetadata(const uint32_t blocks_capacity = 1024 * 1024,
+                                  const uint32_t cachefile_capacity = 10 * 1024)
+      : cache_file_index_(cachefile_capacity), block_index_(blocks_capacity) {}
+
+  virtual ~BlockCacheTierMetadata() {}
+
+  // Insert a given cache file
+  bool Insert(BlockCacheFile* file);
+
+  // Lookup cache file based on cache_id
+  BlockCacheFile* Lookup(const uint32_t cache_id);
+
+  // Insert block information to block index
+  BlockInfo* Insert(const Slice& key, const LBA& lba);
+  // bool Insert(BlockInfo* binfo);
+
+  // Lookup block information from block index
+  bool Lookup(const Slice& key, LBA* lba);
+
+  // Remove a given from the block index
+  BlockInfo* Remove(const Slice& key);
+
+  // Find and evict a cache file using LRU policy
+  BlockCacheFile* Evict();
+
+  // Clear the metadata contents
+  virtual void Clear();
+
+ protected:
+  // Remove all block information from a given file
+  virtual void RemoveAllKeys(BlockCacheFile* file);
+
+ private:
+  // Cache file index definition
+  //
+  // cache-id => BlockCacheFile
+  struct BlockCacheFileHash {
+    uint64_t operator()(const BlockCacheFile* rec) {
+      return std::hash<uint32_t>()(rec->cacheid());
+    }
+  };
+
+  struct BlockCacheFileEqual {
+    uint64_t operator()(const BlockCacheFile* lhs, const BlockCacheFile* rhs) {
+      return lhs->cacheid() == rhs->cacheid();
+    }
+  };
+
+  using CacheFileIndexType =
+      EvictableHashTable<BlockCacheFile, BlockCacheFileHash,
+                         BlockCacheFileEqual>;
+
+  // Block Lookup Index
+  //
+  // key => LBA
+  struct Hash {
+    size_t operator()(BlockInfo* node) const {
+      return std::hash<std::string>()(node->key_);
+    }
+  };
+
+  struct Equal {
+    size_t operator()(BlockInfo* lhs, BlockInfo* rhs) const {
+      return lhs->key_ == rhs->key_;
+    }
+  };
+
+  using BlockIndexType = HashTable<BlockInfo*, Hash, Equal>;
+
+  CacheFileIndexType cache_file_index_;
+  BlockIndexType block_index_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/hash_table.h b/src/rocksdb/utilities/persistent_cache/hash_table.h
new file mode 100644
index 000000000..b00b294ce
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/hash_table.h
@@ -0,0 +1,239 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <assert.h>
+
+#include <list>
+#include <vector>
+
+#ifdef OS_LINUX
+#include <sys/mman.h>
+#endif
+
+#include "rocksdb/env.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// HashTable<T, Hash, Equal>
+//
+// Traditional implementation of hash table with synchronization built on top
+// don't perform very well in multi-core scenarios. This is an implementation
+// designed for multi-core scenarios with high lock contention.
+//
+//                         |<-------- alpha ------------->|
+//               Buckets   Collision list
+//          ---- +----+    +---+---+--- ...... ---+---+---+
+//         /     |    |--->|   |   |              |   |   |
+//        /      +----+    +---+---+--- ...... ---+---+---+
+//       /       |    |
+// Locks/        +----+
+// +--+/         .    .
+// |  |          .    .
+// +--+          .    .
+// |  |          .    .
+// +--+          .    .
+// |  |          .    .
+// +--+          .    .
+//     \         +----+
+//      \        |    |
+//       \       +----+
+//        \      |    |
+//         \---- +----+
+//
+// The lock contention is spread over an array of locks. This helps improve
+// concurrent access. The spine is designed for a certain capacity and load
+// factor. When the capacity planning is done correctly we can expect
+// O(load_factor = 1) insert, access and remove time.
+//
+// Micro benchmark on debug build gives about .5 Million/sec rate of insert,
+// erase and lookup in parallel (total of about 1.5 Million ops/sec). If the
+// blocks were of 4K, the hash table can support  a virtual throughput of
+// 6 GB/s.
+//
+// T      Object type (contains both key and value)
+// Hash   Function that returns an hash from type T
+// Equal  Returns if two objects are equal
+//        (We need explicit equal for pointer type)
+//
+template <class T, class Hash, class Equal>
+class HashTable {
+ public:
+  explicit HashTable(const size_t capacity = 1024 * 1024,
+                     const float load_factor = 2.0, const uint32_t nlocks = 256)
+      : nbuckets_(
+            static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)),
+        nlocks_(nlocks) {
+    // pre-conditions
+    assert(capacity);
+    assert(load_factor);
+    assert(nbuckets_);
+    assert(nlocks_);
+
+    buckets_.reset(new Bucket[nbuckets_]);
+#ifdef OS_LINUX
+    mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));
+#endif
+
+    // initialize locks
+    locks_.reset(new port::RWMutex[nlocks_]);
+#ifdef OS_LINUX
+    mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));
+#endif
+
+    // post-conditions
+    assert(buckets_);
+    assert(locks_);
+  }
+
+  virtual ~HashTable() { AssertEmptyBuckets(); }
+
+  //
+  // Insert given record to hash table
+  //
+  bool Insert(const T& t) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    WriteLock _(&locks_[lock_idx]);
+    auto& bucket = buckets_[bucket_idx];
+    return Insert(&bucket, t);
+  }
+
+  // Lookup hash table
+  //
+  // Please note that read lock should be held by the caller. This is because
+  // the caller owns the data, and should hold the read lock as long as he
+  // operates on the data.
+  bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    port::RWMutex& lock = locks_[lock_idx];
+    lock.ReadLock();
+
+    auto& bucket = buckets_[bucket_idx];
+    if (Find(&bucket, t, ret)) {
+      *ret_lock = &lock;
+      return true;
+    }
+
+    lock.ReadUnlock();
+    return false;
+  }
+
+  //
+  // Erase a given key from the hash table
+  //
+  bool Erase(const T& t, T* ret) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    WriteLock _(&locks_[lock_idx]);
+
+    auto& bucket = buckets_[bucket_idx];
+    return Erase(&bucket, t, ret);
+  }
+
+  // Fetch the mutex associated with a key
+  // This call is used to hold the lock for a given data for extended period of
+  // time.
+  port::RWMutex* GetMutex(const T& t) {
+    const uint64_t h = Hash()(t);
+    const uint32_t bucket_idx = h % nbuckets_;
+    const uint32_t lock_idx = bucket_idx % nlocks_;
+
+    return &locks_[lock_idx];
+  }
+
+  void Clear(void (*fn)(T)) {
+    for (uint32_t i = 0; i < nbuckets_; ++i) {
+      const uint32_t lock_idx = i % nlocks_;
+      WriteLock _(&locks_[lock_idx]);
+      for (auto& t : buckets_[i].list_) {
+        (*fn)(t);
+      }
+      buckets_[i].list_.clear();
+    }
+  }
+
+ protected:
+  // Models bucket of keys that hash to the same bucket number
+  struct Bucket {
+    std::list<T> list_;
+  };
+
+  // Substitute for std::find with custom comparator operator
+  typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {
+    for (auto it = list->begin(); it != list->end(); ++it) {
+      if (Equal()(*it, t)) {
+        return it;
+      }
+    }
+    return list->end();
+  }
+
+  bool Insert(Bucket* bucket, const T& t) {
+    // Check if the key already exists
+    auto it = Find(&bucket->list_, t);
+    if (it != bucket->list_.end()) {
+      return false;
+    }
+
+    // insert to bucket
+    bucket->list_.push_back(t);
+    return true;
+  }
+
+  bool Find(Bucket* bucket, const T& t, T* ret) {
+    auto it = Find(&bucket->list_, t);
+    if (it != bucket->list_.end()) {
+      if (ret) {
+        *ret = *it;
+      }
+      return true;
+    }
+    return false;
+  }
+
+  bool Erase(Bucket* bucket, const T& t, T* ret) {
+    auto it = Find(&bucket->list_, t);
+    if (it != bucket->list_.end()) {
+      if (ret) {
+        *ret = *it;
+      }
+
+      bucket->list_.erase(it);
+      return true;
+    }
+    return false;
+  }
+
+  // assert that all buckets are empty
+  void AssertEmptyBuckets() {
+#ifndef NDEBUG
+    for (size_t i = 0; i < nbuckets_; ++i) {
+      WriteLock _(&locks_[i % nlocks_]);
+      assert(buckets_[i].list_.empty());
+    }
+#endif
+  }
+
+  const uint32_t nbuckets_;                 // No. of buckets in the spine
+  std::unique_ptr<Bucket[]> buckets_;       // Spine of the hash buckets
+  const uint32_t nlocks_;                   // No. of locks
+  std::unique_ptr<port::RWMutex[]> locks_;  // Granular locks
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/hash_table_bench.cc b/src/rocksdb/utilities/persistent_cache/hash_table_bench.cc
new file mode 100644
index 000000000..74d7e2edf
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/hash_table_bench.cc
@@ -0,0 +1,310 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#if !defined(OS_WIN) && !defined(ROCKSDB_LITE)
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() { fprintf(stderr, "Please install gflags to run tools\n"); }
+#else
+
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <unordered_map>
+
+#include "port/port_posix.h"
+#include "port/sys_time.h"
+#include "rocksdb/env.h"
+#include "util/gflags_compat.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "utilities/persistent_cache/hash_table.h"
+
+using std::string;
+
+DEFINE_int32(nsec, 10, "nsec");
+DEFINE_int32(nthread_write, 1, "insert %");
+DEFINE_int32(nthread_read, 1, "lookup %");
+DEFINE_int32(nthread_erase, 1, "erase %");
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// HashTableImpl interface
+//
+// Abstraction of a hash table implementation
+template <class Key, class Value>
+class HashTableImpl {
+ public:
+  virtual ~HashTableImpl() {}
+
+  virtual bool Insert(const Key& key, const Value& val) = 0;
+  virtual bool Erase(const Key& key) = 0;
+  virtual bool Lookup(const Key& key, Value* val) = 0;
+};
+
+// HashTableBenchmark
+//
+// Abstraction to test a given hash table implementation. The test mostly
+// focus on insert, lookup and erase. The test can operate in test mode and
+// benchmark mode.
+class HashTableBenchmark {
+ public:
+  explicit HashTableBenchmark(HashTableImpl<size_t, std::string>* impl,
+                              const size_t sec = 10,
+                              const size_t nthread_write = 1,
+                              const size_t nthread_read = 1,
+                              const size_t nthread_erase = 1)
+      : impl_(impl),
+        sec_(sec),
+        ninserts_(0),
+        nreads_(0),
+        nerases_(0),
+        nerases_failed_(0),
+        quit_(false) {
+    Prepop();
+
+    StartThreads(nthread_write, WriteMain);
+    StartThreads(nthread_read, ReadMain);
+    StartThreads(nthread_erase, EraseMain);
+
+    uint64_t start = NowInMillSec();
+    while (!quit_) {
+      quit_ = NowInMillSec() - start > sec_ * 1000;
+      /* sleep override */ sleep(1);
+    }
+
+    Env* env = Env::Default();
+    env->WaitForJoin();
+
+    if (sec_) {
+      printf("Result \n");
+      printf("====== \n");
+      printf("insert/sec = %f \n", ninserts_ / static_cast<double>(sec_));
+      printf("read/sec = %f \n", nreads_ / static_cast<double>(sec_));
+      printf("erases/sec = %f \n", nerases_ / static_cast<double>(sec_));
+      const uint64_t ops = ninserts_ + nreads_ + nerases_;
+      printf("ops/sec = %f \n", ops / static_cast<double>(sec_));
+      printf("erase fail = %d (%f%%)\n", static_cast<int>(nerases_failed_),
+             static_cast<float>(nerases_failed_ / nerases_ * 100));
+      printf("====== \n");
+    }
+  }
+
+  void RunWrite() {
+    while (!quit_) {
+      size_t k = insert_key_++;
+      std::string tmp(1000, k % 255);
+      bool status = impl_->Insert(k, tmp);
+      assert(status);
+      ninserts_++;
+    }
+  }
+
+  void RunRead() {
+    Random64 rgen(time(nullptr));
+    while (!quit_) {
+      std::string s;
+      size_t k = rgen.Next() % max_prepop_key;
+      bool status = impl_->Lookup(k, &s);
+      assert(status);
+      assert(s == std::string(1000, k % 255));
+      nreads_++;
+    }
+  }
+
+  void RunErase() {
+    while (!quit_) {
+      size_t k = erase_key_++;
+      bool status = impl_->Erase(k);
+      nerases_failed_ += !status;
+      nerases_++;
+    }
+  }
+
+ private:
+  // Start threads for a given function
+  void StartThreads(const size_t n, void (*fn)(void*)) {
+    Env* env = Env::Default();
+    for (size_t i = 0; i < n; ++i) {
+      env->StartThread(fn, this);
+    }
+  }
+
+  // Prepop the hash table with 1M keys
+  void Prepop() {
+    for (size_t i = 0; i < max_prepop_key; ++i) {
+      bool status = impl_->Insert(i, std::string(1000, i % 255));
+      assert(status);
+    }
+
+    erase_key_ = insert_key_ = max_prepop_key;
+
+    for (size_t i = 0; i < 10 * max_prepop_key; ++i) {
+      bool status = impl_->Insert(insert_key_++, std::string(1000, 'x'));
+      assert(status);
+    }
+  }
+
+  static uint64_t NowInMillSec() {
+    port::TimeVal tv;
+    port::GetTimeOfDay(&tv, /*tz=*/nullptr);
+    return tv.tv_sec * 1000 + tv.tv_usec / 1000;
+  }
+
+  //
+  //  Wrapper functions for thread entry
+  //
+  static void WriteMain(void* args) {
+    reinterpret_cast<HashTableBenchmark*>(args)->RunWrite();
+  }
+
+  static void ReadMain(void* args) {
+    reinterpret_cast<HashTableBenchmark*>(args)->RunRead();
+  }
+
+  static void EraseMain(void* args) {
+    reinterpret_cast<HashTableBenchmark*>(args)->RunErase();
+  }
+
+  HashTableImpl<size_t, std::string>* impl_;         // Implementation to test
+  const size_t sec_;                                 // Test time
+  const size_t max_prepop_key = 1ULL * 1024 * 1024;  // Max prepop key
+  std::atomic<size_t> insert_key_;                   // Last inserted key
+  std::atomic<size_t> erase_key_;                    // Erase key
+  std::atomic<size_t> ninserts_;                     // Number of inserts
+  std::atomic<size_t> nreads_;                       // Number of reads
+  std::atomic<size_t> nerases_;                      // Number of erases
+  std::atomic<size_t> nerases_failed_;               // Number of erases failed
+  bool quit_;  // Should the threads quit ?
+};
+
+//
+// SimpleImpl
+// Lock safe unordered_map implementation
+class SimpleImpl : public HashTableImpl<size_t, string> {
+ public:
+  bool Insert(const size_t& key, const string& val) override {
+    WriteLock _(&rwlock_);
+    map_.insert(make_pair(key, val));
+    return true;
+  }
+
+  bool Erase(const size_t& key) override {
+    WriteLock _(&rwlock_);
+    auto it = map_.find(key);
+    if (it == map_.end()) {
+      return false;
+    }
+    map_.erase(it);
+    return true;
+  }
+
+  bool Lookup(const size_t& key, string* val) override {
+    ReadLock _(&rwlock_);
+    auto it = map_.find(key);
+    if (it != map_.end()) {
+      *val = it->second;
+    }
+    return it != map_.end();
+  }
+
+ private:
+  port::RWMutex rwlock_;
+  std::unordered_map<size_t, string> map_;
+};
+
+//
+// GranularLockImpl
+// Thread safe custom RocksDB implementation of hash table with granular
+// locking
+class GranularLockImpl : public HashTableImpl<size_t, string> {
+ public:
+  bool Insert(const size_t& key, const string& val) override {
+    Node n(key, val);
+    return impl_.Insert(n);
+  }
+
+  bool Erase(const size_t& key) override {
+    Node n(key, string());
+    return impl_.Erase(n, nullptr);
+  }
+
+  bool Lookup(const size_t& key, string* val) override {
+    Node n(key, string());
+    port::RWMutex* rlock;
+    bool status = impl_.Find(n, &n, &rlock);
+    if (status) {
+      ReadUnlock _(rlock);
+      *val = n.val_;
+    }
+    return status;
+  }
+
+ private:
+  struct Node {
+    explicit Node(const size_t key, const string& val) : key_(key), val_(val) {}
+
+    size_t key_ = 0;
+    string val_;
+  };
+
+  struct Hash {
+    uint64_t operator()(const Node& node) {
+      return std::hash<uint64_t>()(node.key_);
+    }
+  };
+
+  struct Equal {
+    bool operator()(const Node& lhs, const Node& rhs) {
+      return lhs.key_ == rhs.key_;
+    }
+  };
+
+  HashTable<Node, Hash, Equal> impl_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+//
+// main
+//
+int main(int argc, char** argv) {
+  GFLAGS_NAMESPACE::SetUsageMessage(std::string("\nUSAGE:\n") +
+                                    std::string(argv[0]) + " [OPTIONS]...");
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, false);
+
+  //
+  // Micro benchmark unordered_map
+  //
+  printf("Micro benchmarking std::unordered_map \n");
+  {
+    ROCKSDB_NAMESPACE::SimpleImpl impl;
+    ROCKSDB_NAMESPACE::HashTableBenchmark _(
+        &impl, FLAGS_nsec, FLAGS_nthread_write, FLAGS_nthread_read,
+        FLAGS_nthread_erase);
+  }
+  //
+  // Micro benchmark scalable hash table
+  //
+  printf("Micro benchmarking scalable hash map \n");
+  {
+    ROCKSDB_NAMESPACE::GranularLockImpl impl;
+    ROCKSDB_NAMESPACE::HashTableBenchmark _(
+        &impl, FLAGS_nsec, FLAGS_nthread_write, FLAGS_nthread_read,
+        FLAGS_nthread_erase);
+  }
+
+  return 0;
+}
+#endif  // #ifndef GFLAGS
+#else
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/hash_table_evictable.h b/src/rocksdb/utilities/persistent_cache/hash_table_evictable.h
new file mode 100644
index 000000000..e10939b2f
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/hash_table_evictable.h
@@ -0,0 +1,168 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "util/random.h"
+#include "utilities/persistent_cache/hash_table.h"
+#include "utilities/persistent_cache/lrulist.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Evictable Hash Table
+//
+// Hash table index where least accessed (or one of the least accessed) elements
+// can be evicted.
+//
+// Please note EvictableHashTable can only be created for pointer type objects
+template <class T, class Hash, class Equal>
+class EvictableHashTable : private HashTable<T*, Hash, Equal> {
+ public:
+  using hash_table = HashTable<T*, Hash, Equal>;
+
+  explicit EvictableHashTable(const size_t capacity = 1024 * 1024,
+                              const float load_factor = 2.0,
+                              const uint32_t nlocks = 256)
+      : HashTable<T*, Hash, Equal>(capacity, load_factor, nlocks),
+        lru_lists_(new LRUList<T>[hash_table::nlocks_]) {
+    assert(lru_lists_);
+  }
+
+  virtual ~EvictableHashTable() { AssertEmptyLRU(); }
+
+  //
+  // Insert given record to hash table (and LRU list)
+  //
+  bool Insert(T* t) {
+    const uint64_t h = Hash()(t);
+    typename hash_table::Bucket& bucket = GetBucket(h);
+    LRUListType& lru = GetLRUList(h);
+    port::RWMutex& lock = GetMutex(h);
+
+    WriteLock _(&lock);
+    if (hash_table::Insert(&bucket, t)) {
+      lru.Push(t);
+      return true;
+    }
+    return false;
+  }
+
+  //
+  // Lookup hash table
+  //
+  // Please note that read lock should be held by the caller. This is because
+  // the caller owns the data, and should hold the read lock as long as he
+  // operates on the data.
+  bool Find(T* t, T** ret) {
+    const uint64_t h = Hash()(t);
+    typename hash_table::Bucket& bucket = GetBucket(h);
+    LRUListType& lru = GetLRUList(h);
+    port::RWMutex& lock = GetMutex(h);
+
+    ReadLock _(&lock);
+    if (hash_table::Find(&bucket, t, ret)) {
+      ++(*ret)->refs_;
+      lru.Touch(*ret);
+      return true;
+    }
+    return false;
+  }
+
+  //
+  // Evict one of the least recently used object
+  //
+  T* Evict(const std::function<void(T*)>& fn = nullptr) {
+    uint32_t random = Random::GetTLSInstance()->Next();
+    const size_t start_idx = random % hash_table::nlocks_;
+    T* t = nullptr;
+
+    // iterate from start_idx .. 0 .. start_idx
+    for (size_t i = 0; !t && i < hash_table::nlocks_; ++i) {
+      const size_t idx = (start_idx + i) % hash_table::nlocks_;
+
+      WriteLock _(&hash_table::locks_[idx]);
+      LRUListType& lru = lru_lists_[idx];
+      if (!lru.IsEmpty() && (t = lru.Pop()) != nullptr) {
+        assert(!t->refs_);
+        // We got an item to evict, erase from the bucket
+        const uint64_t h = Hash()(t);
+        typename hash_table::Bucket& bucket = GetBucket(h);
+        T* tmp = nullptr;
+        bool status = hash_table::Erase(&bucket, t, &tmp);
+        assert(t == tmp);
+        (void)status;
+        assert(status);
+        if (fn) {
+          fn(t);
+        }
+        break;
+      }
+      assert(!t);
+    }
+    return t;
+  }
+
+  void Clear(void (*fn)(T*)) {
+    for (uint32_t i = 0; i < hash_table::nbuckets_; ++i) {
+      const uint32_t lock_idx = i % hash_table::nlocks_;
+      WriteLock _(&hash_table::locks_[lock_idx]);
+      auto& lru_list = lru_lists_[lock_idx];
+      auto& bucket = hash_table::buckets_[i];
+      for (auto* t : bucket.list_) {
+        lru_list.Unlink(t);
+        (*fn)(t);
+      }
+      bucket.list_.clear();
+    }
+    // make sure that all LRU lists are emptied
+    AssertEmptyLRU();
+  }
+
+  void AssertEmptyLRU() {
+#ifndef NDEBUG
+    for (uint32_t i = 0; i < hash_table::nlocks_; ++i) {
+      WriteLock _(&hash_table::locks_[i]);
+      auto& lru_list = lru_lists_[i];
+      assert(lru_list.IsEmpty());
+    }
+#endif
+  }
+
+  //
+  // Fetch the mutex associated with a key
+  // This call is used to hold the lock for a given data for extended period of
+  // time.
+  port::RWMutex* GetMutex(T* t) { return hash_table::GetMutex(t); }
+
+ private:
+  using LRUListType = LRUList<T>;
+
+  typename hash_table::Bucket& GetBucket(const uint64_t h) {
+    const uint32_t bucket_idx = h % hash_table::nbuckets_;
+    return hash_table::buckets_[bucket_idx];
+  }
+
+  LRUListType& GetLRUList(const uint64_t h) {
+    const uint32_t bucket_idx = h % hash_table::nbuckets_;
+    const uint32_t lock_idx = bucket_idx % hash_table::nlocks_;
+    return lru_lists_[lock_idx];
+  }
+
+  port::RWMutex& GetMutex(const uint64_t h) {
+    const uint32_t bucket_idx = h % hash_table::nbuckets_;
+    const uint32_t lock_idx = bucket_idx % hash_table::nlocks_;
+    return hash_table::locks_[lock_idx];
+  }
+
+  std::unique_ptr<LRUListType[]> lru_lists_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/hash_table_test.cc b/src/rocksdb/utilities/persistent_cache/hash_table_test.cc
new file mode 100644
index 000000000..2f6387f5f
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/hash_table_test.cc
@@ -0,0 +1,163 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#include "utilities/persistent_cache/hash_table.h"
+
+#include <stdlib.h>
+
+#include <iostream>
+#include <set>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "memory/arena.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "utilities/persistent_cache/hash_table_evictable.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+struct HashTableTest : public testing::Test {
+  ~HashTableTest() override { map_.Clear(&HashTableTest::ClearNode); }
+
+  struct Node {
+    Node() {}
+    explicit Node(const uint64_t key, const std::string& val = std::string())
+        : key_(key), val_(val) {}
+
+    uint64_t key_ = 0;
+    std::string val_;
+  };
+
+  struct Equal {
+    bool operator()(const Node& lhs, const Node& rhs) {
+      return lhs.key_ == rhs.key_;
+    }
+  };
+
+  struct Hash {
+    uint64_t operator()(const Node& node) {
+      return std::hash<uint64_t>()(node.key_);
+    }
+  };
+
+  static void ClearNode(Node /*node*/) {}
+
+  HashTable<Node, Hash, Equal> map_;
+};
+
+struct EvictableHashTableTest : public testing::Test {
+  ~EvictableHashTableTest() override {
+    map_.Clear(&EvictableHashTableTest::ClearNode);
+  }
+
+  struct Node : LRUElement<Node> {
+    Node() {}
+    explicit Node(const uint64_t key, const std::string& val = std::string())
+        : key_(key), val_(val) {}
+
+    uint64_t key_ = 0;
+    std::string val_;
+    std::atomic<uint32_t> refs_{0};
+  };
+
+  struct Equal {
+    bool operator()(const Node* lhs, const Node* rhs) {
+      return lhs->key_ == rhs->key_;
+    }
+  };
+
+  struct Hash {
+    uint64_t operator()(const Node* node) {
+      return std::hash<uint64_t>()(node->key_);
+    }
+  };
+
+  static void ClearNode(Node* /*node*/) {}
+
+  EvictableHashTable<Node, Hash, Equal> map_;
+};
+
+TEST_F(HashTableTest, TestInsert) {
+  const uint64_t max_keys = 1024 * 1024;
+
+  // insert
+  for (uint64_t k = 0; k < max_keys; ++k) {
+    map_.Insert(Node(k, std::string(1000, k % 255)));
+  }
+
+  // verify
+  for (uint64_t k = 0; k < max_keys; ++k) {
+    Node val;
+    port::RWMutex* rlock = nullptr;
+    assert(map_.Find(Node(k), &val, &rlock));
+    rlock->ReadUnlock();
+    assert(val.val_ == std::string(1000, k % 255));
+  }
+}
+
+TEST_F(HashTableTest, TestErase) {
+  const uint64_t max_keys = 1024 * 1024;
+  // insert
+  for (uint64_t k = 0; k < max_keys; ++k) {
+    map_.Insert(Node(k, std::string(1000, k % 255)));
+  }
+
+  auto rand = Random64(time(nullptr));
+  // erase a few keys randomly
+  std::set<uint64_t> erased;
+  for (int i = 0; i < 1024; ++i) {
+    uint64_t k = rand.Next() % max_keys;
+    if (erased.find(k) != erased.end()) {
+      continue;
+    }
+    assert(map_.Erase(Node(k), /*ret=*/nullptr));
+    erased.insert(k);
+  }
+
+  // verify
+  for (uint64_t k = 0; k < max_keys; ++k) {
+    Node val;
+    port::RWMutex* rlock = nullptr;
+    bool status = map_.Find(Node(k), &val, &rlock);
+    if (erased.find(k) == erased.end()) {
+      assert(status);
+      rlock->ReadUnlock();
+      assert(val.val_ == std::string(1000, k % 255));
+    } else {
+      assert(!status);
+    }
+  }
+}
+
+TEST_F(EvictableHashTableTest, TestEvict) {
+  const uint64_t max_keys = 1024 * 1024;
+
+  // insert
+  for (uint64_t k = 0; k < max_keys; ++k) {
+    map_.Insert(new Node(k, std::string(1000, k % 255)));
+  }
+
+  // verify
+  for (uint64_t k = 0; k < max_keys; ++k) {
+    Node* val = map_.Evict();
+    // unfortunately we can't predict eviction value since it is from any one of
+    // the lock stripe
+    assert(val);
+    assert(val->val_ == std::string(1000, val->key_ % 255));
+    delete val;
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/persistent_cache/lrulist.h b/src/rocksdb/utilities/persistent_cache/lrulist.h
new file mode 100644
index 000000000..a608890fc
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/lrulist.h
@@ -0,0 +1,174 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// LRU element definition
+//
+// Any object that needs to be part of the LRU algorithm should extend this
+// class
+template <class T>
+struct LRUElement {
+  explicit LRUElement() : next_(nullptr), prev_(nullptr), refs_(0) {}
+
+  virtual ~LRUElement() { assert(!refs_); }
+
+  T* next_;
+  T* prev_;
+  std::atomic<size_t> refs_;
+};
+
+// LRU implementation
+//
+// In place LRU implementation. There is no copy or allocation involved when
+// inserting or removing an element. This makes the data structure slim
+template <class T>
+class LRUList {
+ public:
+  virtual ~LRUList() {
+    MutexLock _(&lock_);
+    assert(!head_);
+    assert(!tail_);
+  }
+
+  // Push element into the LRU at the cold end
+  inline void Push(T* const t) {
+    assert(t);
+    assert(!t->next_);
+    assert(!t->prev_);
+
+    MutexLock _(&lock_);
+
+    assert((!head_ && !tail_) || (head_ && tail_));
+    assert(!head_ || !head_->prev_);
+    assert(!tail_ || !tail_->next_);
+
+    t->next_ = head_;
+    if (head_) {
+      head_->prev_ = t;
+    }
+
+    head_ = t;
+    if (!tail_) {
+      tail_ = t;
+    }
+  }
+
+  // Unlink the element from the LRU
+  inline void Unlink(T* const t) {
+    MutexLock _(&lock_);
+    UnlinkImpl(t);
+  }
+
+  // Evict an element from the LRU
+  inline T* Pop() {
+    MutexLock _(&lock_);
+
+    assert(tail_ && head_);
+    assert(!tail_->next_);
+    assert(!head_->prev_);
+
+    T* t = head_;
+    while (t && t->refs_) {
+      t = t->next_;
+    }
+
+    if (!t) {
+      // nothing can be evicted
+      return nullptr;
+    }
+
+    assert(!t->refs_);
+
+    // unlike the element
+    UnlinkImpl(t);
+    return t;
+  }
+
+  // Move the element from the front of the list to the back of the list
+  inline void Touch(T* const t) {
+    MutexLock _(&lock_);
+    UnlinkImpl(t);
+    PushBackImpl(t);
+  }
+
+  // Check if the LRU is empty
+  inline bool IsEmpty() const {
+    MutexLock _(&lock_);
+    return !head_ && !tail_;
+  }
+
+ private:
+  // Unlink an element from the LRU
+  void UnlinkImpl(T* const t) {
+    assert(t);
+
+    lock_.AssertHeld();
+
+    assert(head_ && tail_);
+    assert(t->prev_ || head_ == t);
+    assert(t->next_ || tail_ == t);
+
+    if (t->prev_) {
+      t->prev_->next_ = t->next_;
+    }
+    if (t->next_) {
+      t->next_->prev_ = t->prev_;
+    }
+
+    if (tail_ == t) {
+      tail_ = tail_->prev_;
+    }
+    if (head_ == t) {
+      head_ = head_->next_;
+    }
+
+    t->next_ = t->prev_ = nullptr;
+  }
+
+  // Insert an element at the hot end
+  inline void PushBack(T* const t) {
+    MutexLock _(&lock_);
+    PushBackImpl(t);
+  }
+
+  inline void PushBackImpl(T* const t) {
+    assert(t);
+    assert(!t->next_);
+    assert(!t->prev_);
+
+    lock_.AssertHeld();
+
+    assert((!head_ && !tail_) || (head_ && tail_));
+    assert(!head_ || !head_->prev_);
+    assert(!tail_ || !tail_->next_);
+
+    t->prev_ = tail_;
+    if (tail_) {
+      tail_->next_ = t;
+    }
+
+    tail_ = t;
+    if (!head_) {
+      head_ = tail_;
+    }
+  }
+
+  mutable port::Mutex lock_;  // synchronization primitive
+  T* head_ = nullptr;         // front (cold)
+  T* tail_ = nullptr;         // back (hot)
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc b/src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc
new file mode 100644
index 000000000..9d6e15d6b
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc
@@ -0,0 +1,359 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() { fprintf(stderr, "Please install gflags to run tools\n"); }
+#else
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <sstream>
+#include <unordered_map>
+
+#include "monitoring/histogram.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/system_clock.h"
+#include "table/block_based/block_builder.h"
+#include "util/gflags_compat.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "utilities/persistent_cache/block_cache_tier.h"
+#include "utilities/persistent_cache/persistent_cache_tier.h"
+#include "utilities/persistent_cache/volatile_tier_impl.h"
+
+DEFINE_int32(nsec, 10, "nsec");
+DEFINE_int32(nthread_write, 1, "Insert threads");
+DEFINE_int32(nthread_read, 1, "Lookup threads");
+DEFINE_string(path, "/tmp/microbench/blkcache", "Path for cachefile");
+DEFINE_string(log_path, "/tmp/log", "Path for the log file");
+DEFINE_uint64(cache_size, std::numeric_limits<uint64_t>::max(), "Cache size");
+DEFINE_int32(iosize, 4 * 1024, "Read IO size");
+DEFINE_int32(writer_iosize, 4 * 1024, "File writer IO size");
+DEFINE_int32(writer_qdepth, 1, "File writer qdepth");
+DEFINE_bool(enable_pipelined_writes, false, "Enable async writes");
+DEFINE_string(cache_type, "block_cache",
+              "Cache type. (block_cache, volatile, tiered)");
+DEFINE_bool(benchmark, false, "Benchmark mode");
+DEFINE_int32(volatile_cache_pct, 10, "Percentage of cache in memory tier.");
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<PersistentCacheTier> NewVolatileCache() {
+  assert(FLAGS_cache_size != std::numeric_limits<uint64_t>::max());
+  std::unique_ptr<PersistentCacheTier> pcache(
+      new VolatileCacheTier(FLAGS_cache_size));
+  return pcache;
+}
+
+std::unique_ptr<PersistentCacheTier> NewBlockCache() {
+  std::shared_ptr<Logger> log;
+  if (!Env::Default()->NewLogger(FLAGS_log_path, &log).ok()) {
+    fprintf(stderr, "Error creating log %s \n", FLAGS_log_path.c_str());
+    return nullptr;
+  }
+
+  PersistentCacheConfig opt(Env::Default(), FLAGS_path, FLAGS_cache_size, log);
+  opt.writer_dispatch_size = FLAGS_writer_iosize;
+  opt.writer_qdepth = FLAGS_writer_qdepth;
+  opt.pipeline_writes = FLAGS_enable_pipelined_writes;
+  opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
+  std::unique_ptr<PersistentCacheTier> cache(new BlockCacheTier(opt));
+  Status status = cache->Open();
+  return cache;
+}
+
+// create a new cache tier
+// construct a tiered RAM+Block cache
+std::unique_ptr<PersistentTieredCache> NewTieredCache(
+    const size_t mem_size, const PersistentCacheConfig& opt) {
+  std::unique_ptr<PersistentTieredCache> tcache(new PersistentTieredCache());
+  // create primary tier
+  assert(mem_size);
+  auto pcache =
+      std::shared_ptr<PersistentCacheTier>(new VolatileCacheTier(mem_size));
+  tcache->AddTier(pcache);
+  // create secondary tier
+  auto scache = std::shared_ptr<PersistentCacheTier>(new BlockCacheTier(opt));
+  tcache->AddTier(scache);
+
+  Status s = tcache->Open();
+  assert(s.ok());
+  return tcache;
+}
+
+std::unique_ptr<PersistentTieredCache> NewTieredCache() {
+  std::shared_ptr<Logger> log;
+  if (!Env::Default()->NewLogger(FLAGS_log_path, &log).ok()) {
+    fprintf(stderr, "Error creating log %s \n", FLAGS_log_path.c_str());
+    abort();
+  }
+
+  auto pct = FLAGS_volatile_cache_pct / static_cast<double>(100);
+  PersistentCacheConfig opt(Env::Default(), FLAGS_path,
+                            (1 - pct) * FLAGS_cache_size, log);
+  opt.writer_dispatch_size = FLAGS_writer_iosize;
+  opt.writer_qdepth = FLAGS_writer_qdepth;
+  opt.pipeline_writes = FLAGS_enable_pipelined_writes;
+  opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
+  return NewTieredCache(FLAGS_cache_size * pct, opt);
+}
+
+//
+// Benchmark driver
+//
+class CacheTierBenchmark {
+ public:
+  explicit CacheTierBenchmark(std::shared_ptr<PersistentCacheTier>&& cache)
+      : cache_(cache) {
+    if (FLAGS_nthread_read) {
+      fprintf(stdout, "Pre-populating\n");
+      Prepop();
+      fprintf(stdout, "Pre-population completed\n");
+    }
+
+    stats_.Clear();
+
+    // Start IO threads
+    std::list<port::Thread> threads;
+    Spawn(FLAGS_nthread_write, &threads,
+          std::bind(&CacheTierBenchmark::Write, this));
+    Spawn(FLAGS_nthread_read, &threads,
+          std::bind(&CacheTierBenchmark::Read, this));
+
+    // Wait till FLAGS_nsec and then signal to quit
+    StopWatchNano t(SystemClock::Default().get(), /*auto_start=*/true);
+    size_t sec = t.ElapsedNanos() / 1000000000ULL;
+    while (!quit_) {
+      sec = t.ElapsedNanos() / 1000000000ULL;
+      quit_ = sec > size_t(FLAGS_nsec);
+      /* sleep override */ sleep(1);
+    }
+
+    // Wait for threads to exit
+    Join(&threads);
+    // Print stats
+    PrintStats(sec);
+    // Close the cache
+    cache_->TEST_Flush();
+    cache_->Close();
+  }
+
+ private:
+  void PrintStats(const size_t sec) {
+    std::ostringstream msg;
+    msg << "Test stats" << std::endl
+        << "* Elapsed: " << sec << " s" << std::endl
+        << "* Write Latency:" << std::endl
+        << stats_.write_latency_.ToString() << std::endl
+        << "* Read Latency:" << std::endl
+        << stats_.read_latency_.ToString() << std::endl
+        << "* Bytes written:" << std::endl
+        << stats_.bytes_written_.ToString() << std::endl
+        << "* Bytes read:" << std::endl
+        << stats_.bytes_read_.ToString() << std::endl
+        << "Cache stats:" << std::endl
+        << cache_->PrintStats() << std::endl;
+    fprintf(stderr, "%s\n", msg.str().c_str());
+  }
+
+  //
+  // Insert implementation and corresponding helper functions
+  //
+  void Prepop() {
+    for (uint64_t i = 0; i < 1024 * 1024; ++i) {
+      InsertKey(i);
+      insert_key_limit_++;
+      read_key_limit_++;
+    }
+
+    // Wait until data is flushed
+    cache_->TEST_Flush();
+    // warmup the cache
+    for (uint64_t i = 0; i < 1024 * 1024; ReadKey(i++)) {
+    }
+  }
+
+  void Write() {
+    while (!quit_) {
+      InsertKey(insert_key_limit_++);
+    }
+  }
+
+  void InsertKey(const uint64_t key) {
+    // construct key
+    uint64_t k[3];
+    Slice block_key = FillKey(k, key);
+
+    // construct value
+    auto block = NewBlock(key);
+
+    // insert
+    StopWatchNano timer(SystemClock::Default().get(), /*auto_start=*/true);
+    while (true) {
+      Status status = cache_->Insert(block_key, block.get(), FLAGS_iosize);
+      if (status.ok()) {
+        break;
+      }
+
+      // transient error is possible if we run without pipelining
+      assert(!FLAGS_enable_pipelined_writes);
+    }
+
+    // adjust stats
+    const size_t elapsed_micro = timer.ElapsedNanos() / 1000;
+    stats_.write_latency_.Add(elapsed_micro);
+    stats_.bytes_written_.Add(FLAGS_iosize);
+  }
+
+  //
+  // Read implementation
+  //
+  void Read() {
+    while (!quit_) {
+      ReadKey(random() % read_key_limit_);
+    }
+  }
+
+  void ReadKey(const uint64_t val) {
+    // construct key
+    uint64_t k[3];
+    Slice key = FillKey(k, val);
+
+    // Lookup in cache
+    StopWatchNano timer(SystemClock::Default().get(), /*auto_start=*/true);
+    std::unique_ptr<char[]> block;
+    size_t size;
+    Status status = cache_->Lookup(key, &block, &size);
+    if (!status.ok()) {
+      fprintf(stderr, "%s\n", status.ToString().c_str());
+    }
+    assert(status.ok());
+    assert(size == (size_t)FLAGS_iosize);
+
+    // adjust stats
+    const size_t elapsed_micro = timer.ElapsedNanos() / 1000;
+    stats_.read_latency_.Add(elapsed_micro);
+    stats_.bytes_read_.Add(FLAGS_iosize);
+
+    // verify content
+    if (!FLAGS_benchmark) {
+      auto expected_block = NewBlock(val);
+      assert(memcmp(block.get(), expected_block.get(), FLAGS_iosize) == 0);
+    }
+  }
+
+  // create data for a key by filling with a certain pattern
+  std::unique_ptr<char[]> NewBlock(const uint64_t val) {
+    std::unique_ptr<char[]> data(new char[FLAGS_iosize]);
+    memset(data.get(), val % 255, FLAGS_iosize);
+    return data;
+  }
+
+  // spawn threads
+  void Spawn(const size_t n, std::list<port::Thread>* threads,
+             const std::function<void()>& fn) {
+    for (size_t i = 0; i < n; ++i) {
+      threads->emplace_back(fn);
+    }
+  }
+
+  // join threads
+  void Join(std::list<port::Thread>* threads) {
+    for (auto& th : *threads) {
+      th.join();
+    }
+  }
+
+  // construct key
+  Slice FillKey(uint64_t (&k)[3], const uint64_t val) {
+    k[0] = k[1] = 0;
+    k[2] = val;
+    void* p = static_cast<void*>(&k);
+    return Slice(static_cast<char*>(p), sizeof(k));
+  }
+
+  // benchmark stats
+  struct Stats {
+    void Clear() {
+      bytes_written_.Clear();
+      bytes_read_.Clear();
+      read_latency_.Clear();
+      write_latency_.Clear();
+    }
+
+    HistogramImpl bytes_written_;
+    HistogramImpl bytes_read_;
+    HistogramImpl read_latency_;
+    HistogramImpl write_latency_;
+  };
+
+  std::shared_ptr<PersistentCacheTier> cache_;  // cache implementation
+  std::atomic<uint64_t> insert_key_limit_{0};   // data inserted upto
+  std::atomic<uint64_t> read_key_limit_{0};     // data can be read safely upto
+  bool quit_ = false;                           // Quit thread ?
+  mutable Stats stats_;                         // Stats
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+//
+// main
+//
+int main(int argc, char** argv) {
+  GFLAGS_NAMESPACE::SetUsageMessage(std::string("\nUSAGE:\n") +
+                                    std::string(argv[0]) + " [OPTIONS]...");
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, false);
+
+  std::ostringstream msg;
+  msg << "Config" << std::endl
+      << "======" << std::endl
+      << "* nsec=" << FLAGS_nsec << std::endl
+      << "* nthread_write=" << FLAGS_nthread_write << std::endl
+      << "* path=" << FLAGS_path << std::endl
+      << "* cache_size=" << FLAGS_cache_size << std::endl
+      << "* iosize=" << FLAGS_iosize << std::endl
+      << "* writer_iosize=" << FLAGS_writer_iosize << std::endl
+      << "* writer_qdepth=" << FLAGS_writer_qdepth << std::endl
+      << "* enable_pipelined_writes=" << FLAGS_enable_pipelined_writes
+      << std::endl
+      << "* cache_type=" << FLAGS_cache_type << std::endl
+      << "* benchmark=" << FLAGS_benchmark << std::endl
+      << "* volatile_cache_pct=" << FLAGS_volatile_cache_pct << std::endl;
+
+  fprintf(stderr, "%s\n", msg.str().c_str());
+
+  std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCacheTier> cache;
+  if (FLAGS_cache_type == "block_cache") {
+    fprintf(stderr, "Using block cache implementation\n");
+    cache = ROCKSDB_NAMESPACE::NewBlockCache();
+  } else if (FLAGS_cache_type == "volatile") {
+    fprintf(stderr, "Using volatile cache implementation\n");
+    cache = ROCKSDB_NAMESPACE::NewVolatileCache();
+  } else if (FLAGS_cache_type == "tiered") {
+    fprintf(stderr, "Using tiered cache implementation\n");
+    cache = ROCKSDB_NAMESPACE::NewTieredCache();
+  } else {
+    fprintf(stderr, "Unknown option for cache\n");
+  }
+
+  assert(cache);
+  if (!cache) {
+    fprintf(stderr, "Error creating cache\n");
+    abort();
+  }
+
+  std::unique_ptr<ROCKSDB_NAMESPACE::CacheTierBenchmark> benchmark(
+      new ROCKSDB_NAMESPACE::CacheTierBenchmark(std::move(cache)));
+
+  return 0;
+}
+#endif  // #ifndef GFLAGS
+#else
+int main(int, char**) { return 0; }
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc
new file mode 100644
index 000000000..d1b18b68a
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.cc
@@ -0,0 +1,462 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#if !defined ROCKSDB_LITE
+
+#include "utilities/persistent_cache/persistent_cache_test.h"
+
+#include <functional>
+#include <memory>
+#include <thread>
+
+#include "file/file_util.h"
+#include "utilities/persistent_cache/block_cache_tier.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const double kStressFactor = .125;
+
+#ifdef OS_LINUX
+static void OnOpenForRead(void* arg) {
+  int* val = static_cast<int*>(arg);
+  *val &= ~O_DIRECT;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewRandomAccessFile:O_DIRECT",
+      std::bind(OnOpenForRead, std::placeholders::_1));
+}
+
+static void OnOpenForWrite(void* arg) {
+  int* val = static_cast<int*>(arg);
+  *val &= ~O_DIRECT;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewWritableFile:O_DIRECT",
+      std::bind(OnOpenForWrite, std::placeholders::_1));
+}
+#endif
+
+static void OnDeleteDir(void* arg) {
+  char* dir = static_cast<char*>(arg);
+  ASSERT_OK(DestroyDir(Env::Default(), std::string(dir)));
+}
+
+//
+// Simple logger that prints message on stdout
+//
+class ConsoleLogger : public Logger {
+ public:
+  using Logger::Logv;
+  ConsoleLogger() : Logger(InfoLogLevel::ERROR_LEVEL) {}
+
+  void Logv(const char* format, va_list ap) override {
+    MutexLock _(&lock_);
+    vprintf(format, ap);
+    printf("\n");
+  }
+
+  port::Mutex lock_;
+};
+
+// construct a tiered RAM+Block cache
+std::unique_ptr<PersistentTieredCache> NewTieredCache(
+    const size_t mem_size, const PersistentCacheConfig& opt) {
+  std::unique_ptr<PersistentTieredCache> tcache(new PersistentTieredCache());
+  // create primary tier
+  assert(mem_size);
+  auto pcache = std::shared_ptr<PersistentCacheTier>(new VolatileCacheTier(
+      /*is_compressed*/ true, mem_size));
+  tcache->AddTier(pcache);
+  // create secondary tier
+  auto scache = std::shared_ptr<PersistentCacheTier>(new BlockCacheTier(opt));
+  tcache->AddTier(scache);
+
+  Status s = tcache->Open();
+  assert(s.ok());
+  return tcache;
+}
+
+// create block cache
+std::unique_ptr<PersistentCacheTier> NewBlockCache(
+    Env* env, const std::string& path,
+    const uint64_t max_size = std::numeric_limits<uint64_t>::max(),
+    const bool enable_direct_writes = false) {
+  const uint32_t max_file_size =
+      static_cast<uint32_t>(12 * 1024 * 1024 * kStressFactor);
+  auto log = std::make_shared<ConsoleLogger>();
+  PersistentCacheConfig opt(env, path, max_size, log);
+  opt.cache_file_size = max_file_size;
+  opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
+  opt.enable_direct_writes = enable_direct_writes;
+  std::unique_ptr<PersistentCacheTier> scache(new BlockCacheTier(opt));
+  Status s = scache->Open();
+  assert(s.ok());
+  return scache;
+}
+
+// create a new cache tier
+std::unique_ptr<PersistentTieredCache> NewTieredCache(
+    Env* env, const std::string& path, const uint64_t max_volatile_cache_size,
+    const uint64_t max_block_cache_size =
+        std::numeric_limits<uint64_t>::max()) {
+  const uint32_t max_file_size =
+      static_cast<uint32_t>(12 * 1024 * 1024 * kStressFactor);
+  auto log = std::make_shared<ConsoleLogger>();
+  auto opt = PersistentCacheConfig(env, path, max_block_cache_size, log);
+  opt.cache_file_size = max_file_size;
+  opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
+  // create tier out of the two caches
+  auto cache = NewTieredCache(max_volatile_cache_size, opt);
+  return cache;
+}
+
+PersistentCacheTierTest::PersistentCacheTierTest()
+    : path_(test::PerThreadDBPath("cache_test")) {
+#ifdef OS_LINUX
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewRandomAccessFile:O_DIRECT", OnOpenForRead);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewWritableFile:O_DIRECT", OnOpenForWrite);
+#endif
+}
+
+// Block cache tests
+TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsertWithFileCreateError) {
+  cache_ = NewBlockCache(Env::Default(), path_,
+                         /*size=*/std::numeric_limits<uint64_t>::max(),
+                         /*direct_writes=*/false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockCacheTier::NewCacheFile:DeleteDir", OnDeleteDir);
+
+  RunNegativeInsertTest(/*nthreads=*/1,
+                        /*max_keys*/
+                        static_cast<size_t>(10 * 1024 * kStressFactor));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Travis is unable to handle the normal version of the tests running out of
+// fds, out of space and timeouts. This is an easier version of the test
+// specifically written for Travis
+TEST_F(PersistentCacheTierTest, DISABLED_BasicTest) {
+  cache_ = std::make_shared<VolatileCacheTier>();
+  RunInsertTest(/*nthreads=*/1, /*max_keys=*/1024);
+
+  cache_ = NewBlockCache(Env::Default(), path_,
+                         /*size=*/std::numeric_limits<uint64_t>::max(),
+                         /*direct_writes=*/true);
+  RunInsertTest(/*nthreads=*/1, /*max_keys=*/1024);
+
+  cache_ = NewTieredCache(Env::Default(), path_,
+                          /*memory_size=*/static_cast<size_t>(1 * 1024 * 1024));
+  RunInsertTest(/*nthreads=*/1, /*max_keys=*/1024);
+}
+
+// Volatile cache tests
+// DISABLED for now (somewhat expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_VolatileCacheInsert) {
+  for (auto nthreads : {1, 5}) {
+    for (auto max_keys :
+         {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) {
+      cache_ = std::make_shared<VolatileCacheTier>();
+      RunInsertTest(nthreads, static_cast<size_t>(max_keys));
+    }
+  }
+}
+
+// DISABLED for now (somewhat expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_VolatileCacheInsertWithEviction) {
+  for (auto nthreads : {1, 5}) {
+    for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) {
+      cache_ = std::make_shared<VolatileCacheTier>(
+          /*compressed=*/true,
+          /*size=*/static_cast<size_t>(1 * 1024 * 1024 * kStressFactor));
+      RunInsertTestWithEviction(nthreads, static_cast<size_t>(max_keys));
+    }
+  }
+}
+
+// Block cache tests
+// DISABLED for now (expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsert) {
+  for (auto direct_writes : {true, false}) {
+    for (auto nthreads : {1, 5}) {
+      for (auto max_keys :
+           {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) {
+        cache_ = NewBlockCache(Env::Default(), path_,
+                               /*size=*/std::numeric_limits<uint64_t>::max(),
+                               direct_writes);
+        RunInsertTest(nthreads, static_cast<size_t>(max_keys));
+      }
+    }
+  }
+}
+
+// DISABLED for now (somewhat expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsertWithEviction) {
+  for (auto nthreads : {1, 5}) {
+    for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) {
+      cache_ = NewBlockCache(
+          Env::Default(), path_,
+          /*max_size=*/static_cast<size_t>(200 * 1024 * 1024 * kStressFactor));
+      RunInsertTestWithEviction(nthreads, static_cast<size_t>(max_keys));
+    }
+  }
+}
+
+// Tiered cache tests
+// DISABLED for now (expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_TieredCacheInsert) {
+  for (auto nthreads : {1, 5}) {
+    for (auto max_keys :
+         {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) {
+      cache_ = NewTieredCache(
+          Env::Default(), path_,
+          /*memory_size=*/static_cast<size_t>(1 * 1024 * 1024 * kStressFactor));
+      RunInsertTest(nthreads, static_cast<size_t>(max_keys));
+    }
+  }
+}
+
+// the tests causes a lot of file deletions which Travis limited testing
+// environment cannot handle
+// DISABLED for now (somewhat expensive)
+TEST_F(PersistentCacheTierTest, DISABLED_TieredCacheInsertWithEviction) {
+  for (auto nthreads : {1, 5}) {
+    for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) {
+      cache_ = NewTieredCache(
+          Env::Default(), path_,
+          /*memory_size=*/static_cast<size_t>(1 * 1024 * 1024 * kStressFactor),
+          /*block_cache_size*/
+          static_cast<size_t>(200 * 1024 * 1024 * kStressFactor));
+      RunInsertTestWithEviction(nthreads, static_cast<size_t>(max_keys));
+    }
+  }
+}
+
+std::shared_ptr<PersistentCacheTier> MakeVolatileCache(
+    Env* /*env*/, const std::string& /*dbname*/) {
+  return std::make_shared<VolatileCacheTier>();
+}
+
+std::shared_ptr<PersistentCacheTier> MakeBlockCache(Env* env,
+                                                    const std::string& dbname) {
+  return NewBlockCache(env, dbname);
+}
+
+std::shared_ptr<PersistentCacheTier> MakeTieredCache(
+    Env* env, const std::string& dbname) {
+  const auto memory_size = 1 * 1024 * 1024 * kStressFactor;
+  return NewTieredCache(env, dbname, static_cast<size_t>(memory_size));
+}
+
+#ifdef OS_LINUX
+static void UniqueIdCallback(void* arg) {
+  int* result = reinterpret_cast<int*>(arg);
+  if (*result == -1) {
+    *result = 0;
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+}
+#endif
+
+TEST_F(PersistentCacheTierTest, FactoryTest) {
+  for (auto nvm_opt : {true, false}) {
+    ASSERT_FALSE(cache_);
+    auto log = std::make_shared<ConsoleLogger>();
+    std::shared_ptr<PersistentCache> cache;
+    ASSERT_OK(NewPersistentCache(Env::Default(), path_,
+                                 /*size=*/1 * 1024 * 1024 * 1024, log, nvm_opt,
+                                 &cache));
+    ASSERT_TRUE(cache);
+    ASSERT_EQ(cache->Stats().size(), 1);
+    ASSERT_TRUE(cache->Stats()[0].size());
+    cache.reset();
+  }
+}
+
+PersistentCacheDBTest::PersistentCacheDBTest()
+    : DBTestBase("cache_test", /*env_do_fsync=*/true) {
+#ifdef OS_LINUX
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewRandomAccessFile:O_DIRECT", OnOpenForRead);
+#endif
+}
+
+// test template
+void PersistentCacheDBTest::RunTest(
+    const std::function<std::shared_ptr<PersistentCacheTier>(bool)>& new_pcache,
+    const size_t max_keys = 100 * 1024, const size_t max_usecase = 5) {
+  // number of insertion interations
+  int num_iter = static_cast<int>(max_keys * kStressFactor);
+
+  for (size_t iter = 0; iter < max_usecase; iter++) {
+    Options options;
+    options.write_buffer_size =
+        static_cast<size_t>(64 * 1024 * kStressFactor);  // small write buffer
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options = CurrentOptions(options);
+
+    // setup page cache
+    std::shared_ptr<PersistentCacheTier> pcache;
+    BlockBasedTableOptions table_options;
+    table_options.cache_index_and_filter_blocks = true;
+
+    const size_t size_max = std::numeric_limits<size_t>::max();
+
+    switch (iter) {
+      case 0:
+        // page cache, block cache, no-compressed cache
+        pcache = new_pcache(/*is_compressed=*/true);
+        table_options.persistent_cache = pcache;
+        table_options.block_cache = NewLRUCache(size_max);
+        table_options.block_cache_compressed = nullptr;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 1:
+        // page cache, block cache, compressed cache
+        pcache = new_pcache(/*is_compressed=*/true);
+        table_options.persistent_cache = pcache;
+        table_options.block_cache = NewLRUCache(size_max);
+        table_options.block_cache_compressed = NewLRUCache(size_max);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 2:
+        // page cache, block cache, compressed cache + KNoCompression
+        // both block cache and compressed cache, but DB is not compressed
+        // also, make block cache sizes bigger, to trigger block cache hits
+        pcache = new_pcache(/*is_compressed=*/true);
+        table_options.persistent_cache = pcache;
+        table_options.block_cache = NewLRUCache(size_max);
+        table_options.block_cache_compressed = NewLRUCache(size_max);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.compression = kNoCompression;
+        break;
+      case 3:
+        // page cache, no block cache, no compressed cache
+        pcache = new_pcache(/*is_compressed=*/false);
+        table_options.persistent_cache = pcache;
+        table_options.block_cache = nullptr;
+        table_options.block_cache_compressed = nullptr;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 4:
+        // page cache, no block cache, no compressed cache
+        // Page cache caches compressed blocks
+        pcache = new_pcache(/*is_compressed=*/true);
+        table_options.persistent_cache = pcache;
+        table_options.block_cache = nullptr;
+        table_options.block_cache_compressed = nullptr;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      default:
+        FAIL();
+    }
+
+    std::vector<std::string> values;
+    // insert data
+    Insert(options, table_options, num_iter, &values);
+    // flush all data in cache to device
+    pcache->TEST_Flush();
+    // verify data
+    Verify(num_iter, values);
+
+    auto block_miss = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+    auto compressed_block_hit =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+    auto compressed_block_miss =
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+    auto page_hit = TestGetTickerCount(options, PERSISTENT_CACHE_HIT);
+    auto page_miss = TestGetTickerCount(options, PERSISTENT_CACHE_MISS);
+
+    // check that we triggered the appropriate code paths in the cache
+    switch (iter) {
+      case 0:
+        // page cache, block cache, no-compressed cache
+        ASSERT_GT(page_miss, 0);
+        ASSERT_GT(page_hit, 0);
+        ASSERT_GT(block_miss, 0);
+        ASSERT_EQ(compressed_block_miss, 0);
+        ASSERT_EQ(compressed_block_hit, 0);
+        break;
+      case 1:
+        // page cache, block cache, compressed cache
+        ASSERT_GT(page_miss, 0);
+        ASSERT_GT(block_miss, 0);
+        ASSERT_GT(compressed_block_miss, 0);
+        break;
+      case 2:
+        // page cache, block cache, compressed cache + KNoCompression
+        ASSERT_GT(page_miss, 0);
+        ASSERT_GT(page_hit, 0);
+        ASSERT_GT(block_miss, 0);
+        ASSERT_GT(compressed_block_miss, 0);
+        // remember kNoCompression
+        ASSERT_EQ(compressed_block_hit, 0);
+        break;
+      case 3:
+      case 4:
+        // page cache, no block cache, no compressed cache
+        ASSERT_GT(page_miss, 0);
+        ASSERT_GT(page_hit, 0);
+        ASSERT_EQ(compressed_block_hit, 0);
+        ASSERT_EQ(compressed_block_miss, 0);
+        break;
+      default:
+        FAIL();
+    }
+
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+
+    ASSERT_OK(pcache->Close());
+  }
+}
+
+// Travis is unable to handle the normal version of the tests running out of
+// fds, out of space and timeouts. This is an easier version of the test
+// specifically written for Travis.
+// Now used generally because main tests are too expensive as unit tests.
+TEST_F(PersistentCacheDBTest, BasicTest) {
+  RunTest(std::bind(&MakeBlockCache, env_, dbname_), /*max_keys=*/1024,
+          /*max_usecase=*/1);
+}
+
+// test table with block page cache
+// DISABLED for now (very expensive, especially memory)
+TEST_F(PersistentCacheDBTest, DISABLED_BlockCacheTest) {
+  RunTest(std::bind(&MakeBlockCache, env_, dbname_));
+}
+
+// test table with volatile page cache
+// DISABLED for now (very expensive, especially memory)
+TEST_F(PersistentCacheDBTest, DISABLED_VolatileCacheTest) {
+  RunTest(std::bind(&MakeVolatileCache, env_, dbname_));
+}
+
+// test table with tiered page cache
+// DISABLED for now (very expensive, especially memory)
+TEST_F(PersistentCacheDBTest, DISABLED_TieredCacheTest) {
+  RunTest(std::bind(&MakeTieredCache, env_, dbname_));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+#else   // !defined ROCKSDB_LITE
+int main() { return 0; }
+#endif  // !defined ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_test.h b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.h
new file mode 100644
index 000000000..f13155ed6
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_test.h
@@ -0,0 +1,286 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <limits>
+#include <list>
+#include <memory>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "memory/arena.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "table/block_based/block_builder.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "utilities/persistent_cache/volatile_tier_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Unit tests for testing PersistentCacheTier
+//
+class PersistentCacheTierTest : public testing::Test {
+ public:
+  PersistentCacheTierTest();
+  virtual ~PersistentCacheTierTest() {
+    if (cache_) {
+      Status s = cache_->Close();
+      assert(s.ok());
+    }
+  }
+
+ protected:
+  // Flush cache
+  void Flush() {
+    if (cache_) {
+      cache_->TEST_Flush();
+    }
+  }
+
+  // create threaded workload
+  template <class T>
+  std::list<port::Thread> SpawnThreads(const size_t n, const T& fn) {
+    std::list<port::Thread> threads;
+    for (size_t i = 0; i < n; i++) {
+      port::Thread th(fn);
+      threads.push_back(std::move(th));
+    }
+    return threads;
+  }
+
+  // Wait for threads to join
+  void Join(std::list<port::Thread>&& threads) {
+    for (auto& th : threads) {
+      th.join();
+    }
+    threads.clear();
+  }
+
+  // Run insert workload in threads
+  void Insert(const size_t nthreads, const size_t max_keys) {
+    key_ = 0;
+    max_keys_ = max_keys;
+    // spawn threads
+    auto fn = std::bind(&PersistentCacheTierTest::InsertImpl, this);
+    auto threads = SpawnThreads(nthreads, fn);
+    // join with threads
+    Join(std::move(threads));
+    // Flush cache
+    Flush();
+  }
+
+  // Run verification on the cache
+  void Verify(const size_t nthreads = 1, const bool eviction_enabled = false) {
+    stats_verify_hits_ = 0;
+    stats_verify_missed_ = 0;
+    key_ = 0;
+    // spawn threads
+    auto fn =
+        std::bind(&PersistentCacheTierTest::VerifyImpl, this, eviction_enabled);
+    auto threads = SpawnThreads(nthreads, fn);
+    // join with threads
+    Join(std::move(threads));
+  }
+
+  // pad 0 to numbers
+  std::string PaddedNumber(const size_t data, const size_t pad_size) {
+    assert(pad_size);
+    char* ret = new char[pad_size];
+    int pos = static_cast<int>(pad_size) - 1;
+    size_t count = 0;
+    size_t t = data;
+    // copy numbers
+    while (t) {
+      count++;
+      ret[pos--] = '0' + t % 10;
+      t = t / 10;
+    }
+    // copy 0s
+    while (pos >= 0) {
+      ret[pos--] = '0';
+    }
+    // post condition
+    assert(count <= pad_size);
+    assert(pos == -1);
+    std::string result(ret, pad_size);
+    delete[] ret;
+    return result;
+  }
+
+  // Insert workload implementation
+  void InsertImpl() {
+    const std::string prefix = "key_prefix_";
+
+    while (true) {
+      size_t i = key_++;
+      if (i >= max_keys_) {
+        break;
+      }
+
+      char data[4 * 1024];
+      memset(data, '0' + (i % 10), sizeof(data));
+      auto k = prefix + PaddedNumber(i, /*count=*/8);
+      Slice key(k);
+      while (true) {
+        Status status = cache_->Insert(key, data, sizeof(data));
+        if (status.ok()) {
+          break;
+        }
+        ASSERT_TRUE(status.IsTryAgain());
+        Env::Default()->SleepForMicroseconds(1 * 1000 * 1000);
+      }
+    }
+  }
+
+  // Verification implementation
+  void VerifyImpl(const bool eviction_enabled = false) {
+    const std::string prefix = "key_prefix_";
+    while (true) {
+      size_t i = key_++;
+      if (i >= max_keys_) {
+        break;
+      }
+
+      char edata[4 * 1024];
+      memset(edata, '0' + (i % 10), sizeof(edata));
+      auto k = prefix + PaddedNumber(i, /*count=*/8);
+      Slice key(k);
+      std::unique_ptr<char[]> block;
+      size_t block_size;
+
+      if (eviction_enabled) {
+        if (!cache_->Lookup(key, &block, &block_size).ok()) {
+          // assume that the key is evicted
+          stats_verify_missed_++;
+          continue;
+        }
+      }
+
+      ASSERT_OK(cache_->Lookup(key, &block, &block_size));
+      ASSERT_EQ(block_size, sizeof(edata));
+      ASSERT_EQ(memcmp(edata, block.get(), sizeof(edata)), 0);
+      stats_verify_hits_++;
+    }
+  }
+
+  // template for insert test
+  void RunInsertTest(const size_t nthreads, const size_t max_keys) {
+    Insert(nthreads, max_keys);
+    Verify(nthreads);
+    ASSERT_EQ(stats_verify_hits_, max_keys);
+    ASSERT_EQ(stats_verify_missed_, 0);
+
+    ASSERT_OK(cache_->Close());
+    cache_.reset();
+  }
+
+  // template for negative insert test
+  void RunNegativeInsertTest(const size_t nthreads, const size_t max_keys) {
+    Insert(nthreads, max_keys);
+    Verify(nthreads, /*eviction_enabled=*/true);
+    ASSERT_LT(stats_verify_hits_, max_keys);
+    ASSERT_GT(stats_verify_missed_, 0);
+
+    ASSERT_OK(cache_->Close());
+    cache_.reset();
+  }
+
+  // template for insert with eviction test
+  void RunInsertTestWithEviction(const size_t nthreads, const size_t max_keys) {
+    Insert(nthreads, max_keys);
+    Verify(nthreads, /*eviction_enabled=*/true);
+    ASSERT_EQ(stats_verify_hits_ + stats_verify_missed_, max_keys);
+    ASSERT_GT(stats_verify_hits_, 0);
+    ASSERT_GT(stats_verify_missed_, 0);
+
+    ASSERT_OK(cache_->Close());
+    cache_.reset();
+  }
+
+  const std::string path_;
+  std::shared_ptr<Logger> log_;
+  std::shared_ptr<PersistentCacheTier> cache_;
+  std::atomic<size_t> key_{0};
+  size_t max_keys_ = 0;
+  std::atomic<size_t> stats_verify_hits_{0};
+  std::atomic<size_t> stats_verify_missed_{0};
+};
+
+//
+// RocksDB tests
+//
+class PersistentCacheDBTest : public DBTestBase {
+ public:
+  PersistentCacheDBTest();
+
+  static uint64_t TestGetTickerCount(const Options& options,
+                                     Tickers ticker_type) {
+    return static_cast<uint32_t>(
+        options.statistics->getTickerCount(ticker_type));
+  }
+
+  // insert data to table
+  void Insert(const Options& options,
+              const BlockBasedTableOptions& /*table_options*/,
+              const int num_iter, std::vector<std::string>* values) {
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // default column family doesn't have block cache
+    Options no_block_cache_opts;
+    no_block_cache_opts.statistics = options.statistics;
+    no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+    BlockBasedTableOptions table_options_no_bc;
+    table_options_no_bc.no_block_cache = true;
+    no_block_cache_opts.table_factory.reset(
+        NewBlockBasedTableFactory(table_options_no_bc));
+    ReopenWithColumnFamilies(
+        {"default", "pikachu"},
+        std::vector<Options>({no_block_cache_opts, options}));
+
+    Random rnd(301);
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::string str;
+    for (int i = 0; i < num_iter; i++) {
+      if (i % 4 == 0) {  // high compression ratio
+        str = rnd.RandomString(1000);
+      }
+      values->push_back(str);
+      ASSERT_OK(Put(1, Key(i), (*values)[i]));
+    }
+
+    // flush all data from memtable so that reads are from block cache
+    ASSERT_OK(Flush(1));
+  }
+
+  // verify data
+  void Verify(const int num_iter, const std::vector<std::string>& values) {
+    for (int j = 0; j < 2; ++j) {
+      for (int i = 0; i < num_iter; i++) {
+        ASSERT_EQ(Get(1, Key(i)), values[i]);
+      }
+    }
+  }
+
+  // test template
+  void RunTest(const std::function<std::shared_ptr<PersistentCacheTier>(bool)>&
+                   new_pcache,
+               const size_t max_keys, const size_t max_usecase);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc
new file mode 100644
index 000000000..54cbce8f7
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc
@@ -0,0 +1,167 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/persistent_cache_tier.h"
+
+#include <cinttypes>
+#include <sstream>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+std::string PersistentCacheConfig::ToString() const {
+  std::string ret;
+  ret.reserve(20000);
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+
+  snprintf(buffer, kBufferSize, "    path: %s\n", path.c_str());
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    enable_direct_reads: %d\n",
+           enable_direct_reads);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    enable_direct_writes: %d\n",
+           enable_direct_writes);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    cache_size: %" PRIu64 "\n", cache_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    cache_file_size: %" PRIu32 "\n",
+           cache_file_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    writer_qdepth: %" PRIu32 "\n",
+           writer_qdepth);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    pipeline_writes: %d\n", pipeline_writes);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "    max_write_pipeline_backlog_size: %" PRIu64 "\n",
+           max_write_pipeline_backlog_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    write_buffer_size: %" PRIu32 "\n",
+           write_buffer_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    writer_dispatch_size: %" PRIu64 "\n",
+           writer_dispatch_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "    is_compressed: %d\n", is_compressed);
+  ret.append(buffer);
+
+  return ret;
+}
+
+//
+// PersistentCacheTier implementation
+//
+Status PersistentCacheTier::Open() {
+  if (next_tier_) {
+    return next_tier_->Open();
+  }
+  return Status::OK();
+}
+
+Status PersistentCacheTier::Close() {
+  if (next_tier_) {
+    return next_tier_->Close();
+  }
+  return Status::OK();
+}
+
+bool PersistentCacheTier::Reserve(const size_t /*size*/) {
+  // default implementation is a pass through
+  return true;
+}
+
+bool PersistentCacheTier::Erase(const Slice& /*key*/) {
+  // default implementation is a pass through since not all cache tiers might
+  // support erase
+  return true;
+}
+
+std::string PersistentCacheTier::PrintStats() {
+  std::ostringstream os;
+  for (auto tier_stats : Stats()) {
+    os << "---- next tier -----" << std::endl;
+    for (auto stat : tier_stats) {
+      os << stat.first << ": " << stat.second << std::endl;
+    }
+  }
+  return os.str();
+}
+
+PersistentCache::StatsType PersistentCacheTier::Stats() {
+  if (next_tier_) {
+    return next_tier_->Stats();
+  }
+  return PersistentCache::StatsType{};
+}
+
+uint64_t PersistentCacheTier::NewId() {
+  return last_id_.fetch_add(1, std::memory_order_relaxed);
+}
+
+//
+// PersistentTieredCache implementation
+//
+PersistentTieredCache::~PersistentTieredCache() { assert(tiers_.empty()); }
+
+Status PersistentTieredCache::Open() {
+  assert(!tiers_.empty());
+  return tiers_.front()->Open();
+}
+
+Status PersistentTieredCache::Close() {
+  assert(!tiers_.empty());
+  Status status = tiers_.front()->Close();
+  if (status.ok()) {
+    tiers_.clear();
+  }
+  return status;
+}
+
+bool PersistentTieredCache::Erase(const Slice& key) {
+  assert(!tiers_.empty());
+  return tiers_.front()->Erase(key);
+}
+
+PersistentCache::StatsType PersistentTieredCache::Stats() {
+  assert(!tiers_.empty());
+  return tiers_.front()->Stats();
+}
+
+std::string PersistentTieredCache::PrintStats() {
+  assert(!tiers_.empty());
+  return tiers_.front()->PrintStats();
+}
+
+Status PersistentTieredCache::Insert(const Slice& page_key, const char* data,
+                                     const size_t size) {
+  assert(!tiers_.empty());
+  return tiers_.front()->Insert(page_key, data, size);
+}
+
+Status PersistentTieredCache::Lookup(const Slice& page_key,
+                                     std::unique_ptr<char[]>* data,
+                                     size_t* size) {
+  assert(!tiers_.empty());
+  return tiers_.front()->Lookup(page_key, data, size);
+}
+
+void PersistentTieredCache::AddTier(const Tier& tier) {
+  if (!tiers_.empty()) {
+    tiers_.back()->set_next_tier(tier);
+  }
+  tiers_.push_back(tier);
+}
+
+bool PersistentTieredCache::IsCompressed() {
+  assert(tiers_.size());
+  return tiers_.front()->IsCompressed();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h
new file mode 100644
index 000000000..65aadcd3f
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_tier.h
@@ -0,0 +1,342 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <limits>
+#include <list>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "rocksdb/env.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+
+// Persistent Cache
+//
+// Persistent cache is tiered key-value cache that can use persistent medium. It
+// is a generic design and can leverage any storage medium -- disk/SSD/NVM/RAM.
+// The code has been kept generic but significant benchmark/design/development
+// time has been spent to make sure the cache performs appropriately for
+// respective storage medium.
+// The file defines
+// PersistentCacheTier    : Implementation that handles individual cache tier
+// PersistentTieresCache  : Implementation that handles all tiers as a logical
+//                          unit
+//
+// PersistentTieredCache architecture:
+// +--------------------------+ PersistentCacheTier that handles multiple tiers
+// | +----------------+       |
+// | | RAM            | PersistentCacheTier that handles RAM (VolatileCacheImpl)
+// | +----------------+       |
+// |   | next                 |
+// |   v                      |
+// | +----------------+       |
+// | | NVM            | PersistentCacheTier implementation that handles NVM
+// | +----------------+ (BlockCacheImpl)
+// |   | next                 |
+// |   V                      |
+// | +----------------+       |
+// | | LE-SSD         | PersistentCacheTier implementation that handles LE-SSD
+// | +----------------+ (BlockCacheImpl)
+// |   |                      |
+// |   V                      |
+// |  null                    |
+// +--------------------------+
+//               |
+//               V
+//              null
+namespace ROCKSDB_NAMESPACE {
+
+// Persistent Cache Config
+//
+// This struct captures all the options that are used to configure persistent
+// cache. Some of the terminologies used in naming the options are
+//
+// dispatch size :
+// This is the size in which IO is dispatched to the device
+//
+// write buffer size :
+// This is the size of an individual write buffer size. Write buffers are
+// grouped to form buffered file.
+//
+// cache size :
+// This is the logical maximum for the cache size
+//
+// qdepth :
+// This is the max number of IOs that can issues to the device in parallel
+//
+// pepeling :
+// The writer code path follows pipelined architecture, which means the
+// operations are handed off from one stage to another
+//
+// pipelining backlog size :
+// With the pipelined architecture, there can always be backlogging of ops in
+// pipeline queues. This is the maximum backlog size after which ops are dropped
+// from queue
+struct PersistentCacheConfig {
+  explicit PersistentCacheConfig(
+      Env* const _env, const std::string& _path, const uint64_t _cache_size,
+      const std::shared_ptr<Logger>& _log,
+      const uint32_t _write_buffer_size = 1 * 1024 * 1024 /*1MB*/) {
+    env = _env;
+    clock = (env != nullptr) ? env->GetSystemClock().get()
+                             : SystemClock::Default().get();
+    path = _path;
+    log = _log;
+    cache_size = _cache_size;
+    writer_dispatch_size = write_buffer_size = _write_buffer_size;
+  }
+
+  //
+  // Validate the settings. Our intentions are to catch erroneous settings ahead
+  // of time instead going violating invariants or causing dead locks.
+  //
+  Status ValidateSettings() const {
+    // (1) check pre-conditions for variables
+    if (!env || path.empty()) {
+      return Status::InvalidArgument("empty or null args");
+    }
+
+    // (2) assert size related invariants
+    // - cache size cannot be less than cache file size
+    // - individual write buffer size cannot be greater than cache file size
+    // - total write buffer size cannot be less than 2X cache file size
+    if (cache_size < cache_file_size || write_buffer_size >= cache_file_size ||
+        write_buffer_size * write_buffer_count() < 2 * cache_file_size) {
+      return Status::InvalidArgument("invalid cache size");
+    }
+
+    // (2) check writer settings
+    // - Queue depth cannot be 0
+    // - writer_dispatch_size cannot be greater than writer_buffer_size
+    // - dispatch size and buffer size need to be aligned
+    if (!writer_qdepth || writer_dispatch_size > write_buffer_size ||
+        write_buffer_size % writer_dispatch_size) {
+      return Status::InvalidArgument("invalid writer settings");
+    }
+
+    return Status::OK();
+  }
+
+  //
+  // Env abstraction to use for system level operations
+  //
+  Env* env;
+  SystemClock* clock;
+  //
+  // Path for the block cache where blocks are persisted
+  //
+  std::string path;
+
+  //
+  // Log handle for logging messages
+  //
+  std::shared_ptr<Logger> log;
+
+  //
+  // Enable direct IO for reading
+  //
+  bool enable_direct_reads = true;
+
+  //
+  // Enable direct IO for writing
+  //
+  bool enable_direct_writes = false;
+
+  //
+  // Logical cache size
+  //
+  uint64_t cache_size = std::numeric_limits<uint64_t>::max();
+
+  // cache-file-size
+  //
+  // Cache consists of multiples of small files. This parameter defines the
+  // size of an individual cache file
+  //
+  // default: 1M
+  uint32_t cache_file_size = 100ULL * 1024 * 1024;
+
+  // writer-qdepth
+  //
+  // The writers can issues IO to the devices in parallel. This parameter
+  // controls the max number if IOs that can issues in parallel to the block
+  // device
+  //
+  // default :1
+  uint32_t writer_qdepth = 1;
+
+  // pipeline-writes
+  //
+  // The write optionally follow pipelined architecture. This helps
+  // avoid regression in the eviction code path of the primary tier. This
+  // parameter defines if pipelining is enabled or disabled
+  //
+  // default: true
+  bool pipeline_writes = true;
+
+  // max-write-pipeline-backlog-size
+  //
+  // Max pipeline buffer size. This is the maximum backlog we can accumulate
+  // while waiting for writes. After the limit, new ops will be dropped.
+  //
+  // Default: 1GiB
+  uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024;
+
+  // write-buffer-size
+  //
+  // This is the size in which buffer slabs are allocated.
+  //
+  // Default: 1M
+  uint32_t write_buffer_size = 1ULL * 1024 * 1024;
+
+  // write-buffer-count
+  //
+  // This is the total number of buffer slabs. This is calculated as a factor of
+  // file size in order to avoid dead lock.
+  size_t write_buffer_count() const {
+    assert(write_buffer_size);
+    return static_cast<size_t>((writer_qdepth + 1.2) * cache_file_size /
+                               write_buffer_size);
+  }
+
+  // writer-dispatch-size
+  //
+  // The writer thread will dispatch the IO at the specified IO size
+  //
+  // default: 1M
+  uint64_t writer_dispatch_size = 1ULL * 1024 * 1024;
+
+  // is_compressed
+  //
+  // This option determines if the cache will run in compressed mode or
+  // uncompressed mode
+  bool is_compressed = true;
+
+  PersistentCacheConfig MakePersistentCacheConfig(
+      const std::string& path, const uint64_t size,
+      const std::shared_ptr<Logger>& log);
+
+  std::string ToString() const;
+};
+
+// Persistent Cache Tier
+//
+// This a logical abstraction that defines a tier of the persistent cache. Tiers
+// can be stacked over one another. PersistentCahe provides the basic definition
+// for accessing/storing in the cache. PersistentCacheTier extends the interface
+// to enable management and stacking of tiers.
+class PersistentCacheTier : public PersistentCache {
+ public:
+  using Tier = std::shared_ptr<PersistentCacheTier>;
+
+  virtual ~PersistentCacheTier() {}
+
+  // Open the persistent cache tier
+  virtual Status Open();
+
+  // Close the persistent cache tier
+  virtual Status Close();
+
+  // Reserve space up to 'size' bytes
+  virtual bool Reserve(const size_t size);
+
+  // Erase a key from the cache
+  virtual bool Erase(const Slice& key);
+
+  // Print stats to string recursively
+  virtual std::string PrintStats();
+
+  virtual PersistentCache::StatsType Stats() override;
+
+  // Insert to page cache
+  virtual Status Insert(const Slice& page_key, const char* data,
+                        const size_t size) override = 0;
+
+  // Lookup page cache by page identifier
+  virtual Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+                        size_t* size) override = 0;
+
+  // Does it store compressed data ?
+  virtual bool IsCompressed() override = 0;
+
+  virtual std::string GetPrintableOptions() const override = 0;
+
+  virtual uint64_t NewId() override;
+
+  // Return a reference to next tier
+  virtual Tier& next_tier() { return next_tier_; }
+
+  // Set the value for next tier
+  virtual void set_next_tier(const Tier& tier) {
+    assert(!next_tier_);
+    next_tier_ = tier;
+  }
+
+  virtual void TEST_Flush() {
+    if (next_tier_) {
+      next_tier_->TEST_Flush();
+    }
+  }
+
+ private:
+  Tier next_tier_;  // next tier
+  std::atomic<uint64_t> last_id_{1};
+};
+
+// PersistentTieredCache
+//
+// Abstraction that helps you construct a tiers of persistent caches as a
+// unified cache. The tier(s) of cache will act a single tier for management
+// ease and support PersistentCache methods for accessing data.
+class PersistentTieredCache : public PersistentCacheTier {
+ public:
+  virtual ~PersistentTieredCache();
+
+  Status Open() override;
+  Status Close() override;
+  bool Erase(const Slice& key) override;
+  std::string PrintStats() override;
+  PersistentCache::StatsType Stats() override;
+  Status Insert(const Slice& page_key, const char* data,
+                const size_t size) override;
+  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+                size_t* size) override;
+  bool IsCompressed() override;
+
+  std::string GetPrintableOptions() const override {
+    return "PersistentTieredCache";
+  }
+
+  void AddTier(const Tier& tier);
+
+  Tier& next_tier() override {
+    auto it = tiers_.end();
+    return (*it)->next_tier();
+  }
+
+  void set_next_tier(const Tier& tier) override {
+    auto it = tiers_.end();
+    (*it)->set_next_tier(tier);
+  }
+
+  void TEST_Flush() override {
+    assert(!tiers_.empty());
+    tiers_.front()->TEST_Flush();
+    PersistentCacheTier::TEST_Flush();
+  }
+
+ protected:
+  std::list<Tier> tiers_;  // list of tiers top-down
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/persistent_cache_util.h b/src/rocksdb/utilities/persistent_cache/persistent_cache_util.h
new file mode 100644
index 000000000..2a769652d
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/persistent_cache_util.h
@@ -0,0 +1,67 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <limits>
+#include <list>
+
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+//
+// Simple synchronized queue implementation with the option of
+// bounding the queue
+//
+// On overflow, the elements will be discarded
+//
+template <class T>
+class BoundedQueue {
+ public:
+  explicit BoundedQueue(
+      const size_t max_size = std::numeric_limits<size_t>::max())
+      : cond_empty_(&lock_), max_size_(max_size) {}
+
+  virtual ~BoundedQueue() {}
+
+  void Push(T&& t) {
+    MutexLock _(&lock_);
+    if (max_size_ != std::numeric_limits<size_t>::max() &&
+        size_ + t.Size() >= max_size_) {
+      // overflow
+      return;
+    }
+
+    size_ += t.Size();
+    q_.push_back(std::move(t));
+    cond_empty_.SignalAll();
+  }
+
+  T Pop() {
+    MutexLock _(&lock_);
+    while (q_.empty()) {
+      cond_empty_.Wait();
+    }
+
+    T t = std::move(q_.front());
+    size_ -= t.Size();
+    q_.pop_front();
+    return t;
+  }
+
+  size_t Size() const {
+    MutexLock _(&lock_);
+    return size_;
+  }
+
+ private:
+  mutable port::Mutex lock_;
+  port::CondVar cond_empty_;
+  std::list<T> q_;
+  size_t size_ = 0;
+  const size_t max_size_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc
new file mode 100644
index 000000000..45d2830aa
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc
@@ -0,0 +1,140 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/persistent_cache/volatile_tier_impl.h"
+
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+void VolatileCacheTier::DeleteCacheData(VolatileCacheTier::CacheData* data) {
+  assert(data);
+  delete data;
+}
+
+VolatileCacheTier::~VolatileCacheTier() { index_.Clear(&DeleteCacheData); }
+
+PersistentCache::StatsType VolatileCacheTier::Stats() {
+  std::map<std::string, double> stat;
+  stat.insert({"persistent_cache.volatile_cache.hits",
+               static_cast<double>(stats_.cache_hits_)});
+  stat.insert({"persistent_cache.volatile_cache.misses",
+               static_cast<double>(stats_.cache_misses_)});
+  stat.insert({"persistent_cache.volatile_cache.inserts",
+               static_cast<double>(stats_.cache_inserts_)});
+  stat.insert({"persistent_cache.volatile_cache.evicts",
+               static_cast<double>(stats_.cache_evicts_)});
+  stat.insert({"persistent_cache.volatile_cache.hit_pct",
+               static_cast<double>(stats_.CacheHitPct())});
+  stat.insert({"persistent_cache.volatile_cache.miss_pct",
+               static_cast<double>(stats_.CacheMissPct())});
+
+  auto out = PersistentCacheTier::Stats();
+  out.push_back(stat);
+  return out;
+}
+
+Status VolatileCacheTier::Insert(const Slice& page_key, const char* data,
+                                 const size_t size) {
+  // precondition
+  assert(data);
+  assert(size);
+
+  // increment the size
+  size_ += size;
+
+  // check if we have overshot the limit, if so evict some space
+  while (size_ > max_size_) {
+    if (!Evict()) {
+      // unable to evict data, we give up so we don't spike read
+      // latency
+      assert(size_ >= size);
+      size_ -= size;
+      return Status::TryAgain("Unable to evict any data");
+    }
+  }
+
+  assert(size_ >= size);
+
+  // insert order: LRU, followed by index
+  std::string key(page_key.data(), page_key.size());
+  std::string value(data, size);
+  std::unique_ptr<CacheData> cache_data(
+      new CacheData(std::move(key), std::move(value)));
+  bool ok = index_.Insert(cache_data.get());
+  if (!ok) {
+    // decrement the size that we incremented ahead of time
+    assert(size_ >= size);
+    size_ -= size;
+    // failed to insert to cache, block already in cache
+    return Status::TryAgain("key already exists in volatile cache");
+  }
+
+  cache_data.release();
+  stats_.cache_inserts_++;
+  return Status::OK();
+}
+
+Status VolatileCacheTier::Lookup(const Slice& page_key,
+                                 std::unique_ptr<char[]>* result,
+                                 size_t* size) {
+  CacheData key(std::move(page_key.ToString()));
+  CacheData* kv;
+  bool ok = index_.Find(&key, &kv);
+  if (ok) {
+    // set return data
+    result->reset(new char[kv->value.size()]);
+    memcpy(result->get(), kv->value.c_str(), kv->value.size());
+    *size = kv->value.size();
+    // drop the reference on cache data
+    kv->refs_--;
+    // update stats
+    stats_.cache_hits_++;
+    return Status::OK();
+  }
+
+  stats_.cache_misses_++;
+
+  if (next_tier()) {
+    return next_tier()->Lookup(page_key, result, size);
+  }
+
+  return Status::NotFound("key not found in volatile cache");
+}
+
+bool VolatileCacheTier::Erase(const Slice& /*key*/) {
+  assert(!"not supported");
+  return true;
+}
+
+bool VolatileCacheTier::Evict() {
+  CacheData* edata = index_.Evict();
+  if (!edata) {
+    // not able to evict any object
+    return false;
+  }
+
+  stats_.cache_evicts_++;
+
+  // push the evicted object to the next level
+  if (next_tier()) {
+    // TODO: Should the insert error be ignored?
+    Status s = next_tier()->Insert(Slice(edata->key), edata->value.c_str(),
+                                   edata->value.size());
+    s.PermitUncheckedError();
+  }
+
+  // adjust size and destroy data
+  size_ -= edata->value.size();
+  delete edata;
+
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h
new file mode 100644
index 000000000..09265e457
--- /dev/null
+++ b/src/rocksdb/utilities/persistent_cache/volatile_tier_impl.h
@@ -0,0 +1,141 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "rocksdb/cache.h"
+#include "utilities/persistent_cache/hash_table.h"
+#include "utilities/persistent_cache/hash_table_evictable.h"
+#include "utilities/persistent_cache/persistent_cache_tier.h"
+
+// VolatileCacheTier
+//
+// This file provides persistent cache tier implementation for caching
+// key/values in RAM.
+//
+//        key/values
+//           |
+//           V
+// +-------------------+
+// | VolatileCacheTier | Store in an evictable hash table
+// +-------------------+
+//           |
+//           V
+//       on eviction
+//   pushed to next tier
+//
+// The implementation is designed to be concurrent. The evictable hash table
+// implementation is not concurrent at this point though.
+//
+// The eviction algorithm is LRU
+namespace ROCKSDB_NAMESPACE {
+
+class VolatileCacheTier : public PersistentCacheTier {
+ public:
+  explicit VolatileCacheTier(
+      const bool is_compressed = true,
+      const size_t max_size = std::numeric_limits<size_t>::max())
+      : is_compressed_(is_compressed), max_size_(max_size) {}
+
+  virtual ~VolatileCacheTier();
+
+  // insert to cache
+  Status Insert(const Slice& page_key, const char* data,
+                const size_t size) override;
+  // lookup key in cache
+  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+                size_t* size) override;
+
+  // is compressed cache ?
+  bool IsCompressed() override { return is_compressed_; }
+
+  // erase key from cache
+  bool Erase(const Slice& key) override;
+
+  std::string GetPrintableOptions() const override {
+    return "VolatileCacheTier";
+  }
+
+  // Expose stats as map
+  PersistentCache::StatsType Stats() override;
+
+ private:
+  //
+  // Cache data abstraction
+  //
+  struct CacheData : LRUElement<CacheData> {
+    explicit CacheData(CacheData&& rhs) noexcept
+        : key(std::move(rhs.key)), value(std::move(rhs.value)) {}
+
+    explicit CacheData(const std::string& _key, const std::string& _value = "")
+        : key(_key), value(_value) {}
+
+    virtual ~CacheData() {}
+
+    const std::string key;
+    const std::string value;
+  };
+
+  static void DeleteCacheData(CacheData* data);
+
+  //
+  // Index and LRU definition
+  //
+  struct CacheDataHash {
+    uint64_t operator()(const CacheData* obj) const {
+      assert(obj);
+      return std::hash<std::string>()(obj->key);
+    }
+  };
+
+  struct CacheDataEqual {
+    bool operator()(const CacheData* lhs, const CacheData* rhs) const {
+      assert(lhs);
+      assert(rhs);
+      return lhs->key == rhs->key;
+    }
+  };
+
+  struct Statistics {
+    std::atomic<uint64_t> cache_misses_{0};
+    std::atomic<uint64_t> cache_hits_{0};
+    std::atomic<uint64_t> cache_inserts_{0};
+    std::atomic<uint64_t> cache_evicts_{0};
+
+    double CacheHitPct() const {
+      auto lookups = cache_hits_ + cache_misses_;
+      return lookups ? 100 * cache_hits_ / static_cast<double>(lookups) : 0.0;
+    }
+
+    double CacheMissPct() const {
+      auto lookups = cache_hits_ + cache_misses_;
+      return lookups ? 100 * cache_misses_ / static_cast<double>(lookups) : 0.0;
+    }
+  };
+
+  using IndexType =
+      EvictableHashTable<CacheData, CacheDataHash, CacheDataEqual>;
+
+  // Evict LRU tail
+  bool Evict();
+
+  const bool is_compressed_ = true;    // does it store compressed data
+  IndexType index_;                    // in-memory cache
+  std::atomic<uint64_t> max_size_{0};  // Maximum size of the cache
+  std::atomic<uint64_t> size_{0};      // Size of the cache
+  Statistics stats_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif
diff --git a/src/rocksdb/utilities/simulator_cache/cache_simulator.cc b/src/rocksdb/utilities/simulator_cache/cache_simulator.cc
new file mode 100644
index 000000000..dc419e51a
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/cache_simulator.cc
@@ -0,0 +1,288 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/simulator_cache/cache_simulator.h"
+
+#include <algorithm>
+
+#include "db/dbformat.h"
+#include "rocksdb/trace_record.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+const std::string kGhostCachePrefix = "ghost_";
+}  // namespace
+
+GhostCache::GhostCache(std::shared_ptr<Cache> sim_cache)
+    : sim_cache_(sim_cache) {}
+
+bool GhostCache::Admit(const Slice& lookup_key) {
+  auto handle = sim_cache_->Lookup(lookup_key);
+  if (handle != nullptr) {
+    sim_cache_->Release(handle);
+    return true;
+  }
+  // TODO: Should we check for errors here?
+  auto s = sim_cache_->Insert(lookup_key, /*value=*/nullptr, lookup_key.size(),
+                              /*deleter=*/nullptr);
+  s.PermitUncheckedError();
+  return false;
+}
+
+CacheSimulator::CacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+                               std::shared_ptr<Cache> sim_cache)
+    : ghost_cache_(std::move(ghost_cache)), sim_cache_(sim_cache) {}
+
+void CacheSimulator::Access(const BlockCacheTraceRecord& access) {
+  bool admit = true;
+  const bool is_user_access =
+      BlockCacheTraceHelper::IsUserAccess(access.caller);
+  bool is_cache_miss = true;
+  if (ghost_cache_ && !access.no_insert) {
+    admit = ghost_cache_->Admit(access.block_key);
+  }
+  auto handle = sim_cache_->Lookup(access.block_key);
+  if (handle != nullptr) {
+    sim_cache_->Release(handle);
+    is_cache_miss = false;
+  } else {
+    if (!access.no_insert && admit && access.block_size > 0) {
+      // Ignore errors on insert
+      auto s = sim_cache_->Insert(access.block_key, /*value=*/nullptr,
+                                  access.block_size,
+                                  /*deleter=*/nullptr);
+      s.PermitUncheckedError();
+    }
+  }
+  miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access,
+                                  is_cache_miss);
+}
+
+void MissRatioStats::UpdateMetrics(uint64_t timestamp_in_ms,
+                                   bool is_user_access, bool is_cache_miss) {
+  uint64_t timestamp_in_seconds = timestamp_in_ms / kMicrosInSecond;
+  num_accesses_timeline_[timestamp_in_seconds] += 1;
+  num_accesses_ += 1;
+  if (num_misses_timeline_.find(timestamp_in_seconds) ==
+      num_misses_timeline_.end()) {
+    num_misses_timeline_[timestamp_in_seconds] = 0;
+  }
+  if (is_cache_miss) {
+    num_misses_ += 1;
+    num_misses_timeline_[timestamp_in_seconds] += 1;
+  }
+  if (is_user_access) {
+    user_accesses_ += 1;
+    if (is_cache_miss) {
+      user_misses_ += 1;
+    }
+  }
+}
+
+Cache::Priority PrioritizedCacheSimulator::ComputeBlockPriority(
+    const BlockCacheTraceRecord& access) const {
+  if (access.block_type == TraceType::kBlockTraceFilterBlock ||
+      access.block_type == TraceType::kBlockTraceIndexBlock ||
+      access.block_type == TraceType::kBlockTraceUncompressionDictBlock) {
+    return Cache::Priority::HIGH;
+  }
+  return Cache::Priority::LOW;
+}
+
+void PrioritizedCacheSimulator::AccessKVPair(
+    const Slice& key, uint64_t value_size, Cache::Priority priority,
+    const BlockCacheTraceRecord& access, bool no_insert, bool is_user_access,
+    bool* is_cache_miss, bool* admitted, bool update_metrics) {
+  assert(is_cache_miss);
+  assert(admitted);
+  *is_cache_miss = true;
+  *admitted = true;
+  if (ghost_cache_ && !no_insert) {
+    *admitted = ghost_cache_->Admit(key);
+  }
+  auto handle = sim_cache_->Lookup(key);
+  if (handle != nullptr) {
+    sim_cache_->Release(handle);
+    *is_cache_miss = false;
+  } else if (!no_insert && *admitted && value_size > 0) {
+    // TODO: Should we check for an error here?
+    auto s = sim_cache_->Insert(key, /*value=*/nullptr, value_size,
+                                /*deleter=*/nullptr,
+                                /*handle=*/nullptr, priority);
+    s.PermitUncheckedError();
+  }
+  if (update_metrics) {
+    miss_ratio_stats_.UpdateMetrics(access.access_timestamp, is_user_access,
+                                    *is_cache_miss);
+  }
+}
+
+void PrioritizedCacheSimulator::Access(const BlockCacheTraceRecord& access) {
+  bool is_cache_miss = true;
+  bool admitted = true;
+  AccessKVPair(access.block_key, access.block_size,
+               ComputeBlockPriority(access), access, access.no_insert,
+               BlockCacheTraceHelper::IsUserAccess(access.caller),
+               &is_cache_miss, &admitted, /*update_metrics=*/true);
+}
+
+void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) {
+  // TODO (haoyu): We only support Get for now. We need to extend the tracing
+  // for MultiGet, i.e., non-data block accesses must log all keys in a
+  // MultiGet.
+  bool is_cache_miss = true;
+  bool admitted = false;
+  if (access.caller == TableReaderCaller::kUserGet &&
+      access.get_id != BlockCacheTraceHelper::kReservedGetId) {
+    // This is a Get request.
+    const std::string& row_key = BlockCacheTraceHelper::ComputeRowKey(access);
+    GetRequestStatus& status = getid_status_map_[access.get_id];
+    if (status.is_complete) {
+      // This Get request completes.
+      // Skip future accesses to its index/filter/data
+      // blocks. These block lookups are unnecessary if we observe a hit for the
+      // referenced key-value pair already. Thus, we treat these lookups as
+      // hits. This is also to ensure the total number of accesses are the same
+      // when comparing to other policies.
+      miss_ratio_stats_.UpdateMetrics(access.access_timestamp,
+                                      /*is_user_access=*/true,
+                                      /*is_cache_miss=*/false);
+      return;
+    }
+    if (status.row_key_status.find(row_key) == status.row_key_status.end()) {
+      // This is the first time that this key is accessed. Look up the key-value
+      // pair first. Do not update the miss/accesses metrics here since it will
+      // be updated later.
+      AccessKVPair(row_key, access.referenced_data_size, Cache::Priority::HIGH,
+                   access,
+                   /*no_insert=*/false,
+                   /*is_user_access=*/true, &is_cache_miss, &admitted,
+                   /*update_metrics=*/false);
+      InsertResult result = InsertResult::NO_INSERT;
+      if (admitted && access.referenced_data_size > 0) {
+        result = InsertResult::INSERTED;
+      } else if (admitted) {
+        result = InsertResult::ADMITTED;
+      }
+      status.row_key_status[row_key] = result;
+    }
+    if (!is_cache_miss) {
+      // A cache hit.
+      status.is_complete = true;
+      miss_ratio_stats_.UpdateMetrics(access.access_timestamp,
+                                      /*is_user_access=*/true,
+                                      /*is_cache_miss=*/false);
+      return;
+    }
+    // The row key-value pair observes a cache miss. We need to access its
+    // index/filter/data blocks.
+    InsertResult inserted = status.row_key_status[row_key];
+    AccessKVPair(
+        access.block_key, access.block_size, ComputeBlockPriority(access),
+        access,
+        /*no_insert=*/!insert_blocks_upon_row_kvpair_miss_ || access.no_insert,
+        /*is_user_access=*/true, &is_cache_miss, &admitted,
+        /*update_metrics=*/true);
+    if (access.referenced_data_size > 0 && inserted == InsertResult::ADMITTED) {
+      // TODO: Should we check for an error here?
+      auto s = sim_cache_->Insert(row_key, /*value=*/nullptr,
+                                  access.referenced_data_size,
+                                  /*deleter=*/nullptr,
+                                  /*handle=*/nullptr, Cache::Priority::HIGH);
+      s.PermitUncheckedError();
+      status.row_key_status[row_key] = InsertResult::INSERTED;
+    }
+    return;
+  }
+  AccessKVPair(access.block_key, access.block_size,
+               ComputeBlockPriority(access), access, access.no_insert,
+               BlockCacheTraceHelper::IsUserAccess(access.caller),
+               &is_cache_miss, &admitted, /*update_metrics=*/true);
+}
+
+BlockCacheTraceSimulator::BlockCacheTraceSimulator(
+    uint64_t warmup_seconds, uint32_t downsample_ratio,
+    const std::vector<CacheConfiguration>& cache_configurations)
+    : warmup_seconds_(warmup_seconds),
+      downsample_ratio_(downsample_ratio),
+      cache_configurations_(cache_configurations) {}
+
+Status BlockCacheTraceSimulator::InitializeCaches() {
+  for (auto const& config : cache_configurations_) {
+    for (auto cache_capacity : config.cache_capacities) {
+      // Scale down the cache capacity since the trace contains accesses on
+      // 1/'downsample_ratio' blocks.
+      uint64_t simulate_cache_capacity = cache_capacity / downsample_ratio_;
+      std::shared_ptr<CacheSimulator> sim_cache;
+      std::unique_ptr<GhostCache> ghost_cache;
+      std::string cache_name = config.cache_name;
+      if (cache_name.find(kGhostCachePrefix) != std::string::npos) {
+        ghost_cache.reset(new GhostCache(
+            NewLRUCache(config.ghost_cache_capacity, /*num_shard_bits=*/1,
+                        /*strict_capacity_limit=*/false,
+                        /*high_pri_pool_ratio=*/0)));
+        cache_name = cache_name.substr(kGhostCachePrefix.size());
+      }
+      if (cache_name == "lru") {
+        sim_cache = std::make_shared<CacheSimulator>(
+            std::move(ghost_cache),
+            NewLRUCache(simulate_cache_capacity, config.num_shard_bits,
+                        /*strict_capacity_limit=*/false,
+                        /*high_pri_pool_ratio=*/0));
+      } else if (cache_name == "lru_priority") {
+        sim_cache = std::make_shared<PrioritizedCacheSimulator>(
+            std::move(ghost_cache),
+            NewLRUCache(simulate_cache_capacity, config.num_shard_bits,
+                        /*strict_capacity_limit=*/false,
+                        /*high_pri_pool_ratio=*/0.5));
+      } else if (cache_name == "lru_hybrid") {
+        sim_cache = std::make_shared<HybridRowBlockCacheSimulator>(
+            std::move(ghost_cache),
+            NewLRUCache(simulate_cache_capacity, config.num_shard_bits,
+                        /*strict_capacity_limit=*/false,
+                        /*high_pri_pool_ratio=*/0.5),
+            /*insert_blocks_upon_row_kvpair_miss=*/true);
+      } else if (cache_name == "lru_hybrid_no_insert_on_row_miss") {
+        sim_cache = std::make_shared<HybridRowBlockCacheSimulator>(
+            std::move(ghost_cache),
+            NewLRUCache(simulate_cache_capacity, config.num_shard_bits,
+                        /*strict_capacity_limit=*/false,
+                        /*high_pri_pool_ratio=*/0.5),
+            /*insert_blocks_upon_row_kvpair_miss=*/false);
+      } else {
+        // Not supported.
+        return Status::InvalidArgument("Unknown cache name " +
+                                       config.cache_name);
+      }
+      sim_caches_[config].push_back(sim_cache);
+    }
+  }
+  return Status::OK();
+}
+
+void BlockCacheTraceSimulator::Access(const BlockCacheTraceRecord& access) {
+  if (trace_start_time_ == 0) {
+    trace_start_time_ = access.access_timestamp;
+  }
+  // access.access_timestamp is in microseconds.
+  if (!warmup_complete_ &&
+      trace_start_time_ + warmup_seconds_ * kMicrosInSecond <=
+          access.access_timestamp) {
+    for (auto& config_caches : sim_caches_) {
+      for (auto& sim_cache : config_caches.second) {
+        sim_cache->reset_counter();
+      }
+    }
+    warmup_complete_ = true;
+  }
+  for (auto& config_caches : sim_caches_) {
+    for (auto& sim_cache : config_caches.second) {
+      sim_cache->Access(access);
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/simulator_cache/cache_simulator.h b/src/rocksdb/utilities/simulator_cache/cache_simulator.h
new file mode 100644
index 000000000..6d4979013
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/cache_simulator.h
@@ -0,0 +1,231 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <unordered_map>
+
+#include "cache/lru_cache.h"
+#include "trace_replay/block_cache_tracer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A cache configuration provided by user.
+struct CacheConfiguration {
+  std::string cache_name;  // LRU.
+  uint32_t num_shard_bits;
+  uint64_t ghost_cache_capacity;  // ghost cache capacity in bytes.
+  std::vector<uint64_t>
+      cache_capacities;  // simulate cache capacities in bytes.
+
+  bool operator==(const CacheConfiguration& o) const {
+    return cache_name == o.cache_name && num_shard_bits == o.num_shard_bits &&
+           ghost_cache_capacity == o.ghost_cache_capacity;
+  }
+  bool operator<(const CacheConfiguration& o) const {
+    return cache_name < o.cache_name ||
+           (cache_name == o.cache_name && num_shard_bits < o.num_shard_bits) ||
+           (cache_name == o.cache_name && num_shard_bits == o.num_shard_bits &&
+            ghost_cache_capacity < o.ghost_cache_capacity);
+  }
+};
+
+class MissRatioStats {
+ public:
+  void reset_counter() {
+    num_misses_ = 0;
+    num_accesses_ = 0;
+    user_accesses_ = 0;
+    user_misses_ = 0;
+  }
+  double miss_ratio() const {
+    if (num_accesses_ == 0) {
+      return -1;
+    }
+    return static_cast<double>(num_misses_ * 100.0 / num_accesses_);
+  }
+  uint64_t total_accesses() const { return num_accesses_; }
+  uint64_t total_misses() const { return num_misses_; }
+
+  const std::map<uint64_t, uint64_t>& num_accesses_timeline() const {
+    return num_accesses_timeline_;
+  }
+
+  const std::map<uint64_t, uint64_t>& num_misses_timeline() const {
+    return num_misses_timeline_;
+  }
+
+  double user_miss_ratio() const {
+    if (user_accesses_ == 0) {
+      return -1;
+    }
+    return static_cast<double>(user_misses_ * 100.0 / user_accesses_);
+  }
+  uint64_t user_accesses() const { return user_accesses_; }
+  uint64_t user_misses() const { return user_misses_; }
+
+  void UpdateMetrics(uint64_t timestamp_in_ms, bool is_user_access,
+                     bool is_cache_miss);
+
+ private:
+  uint64_t num_accesses_ = 0;
+  uint64_t num_misses_ = 0;
+  uint64_t user_accesses_ = 0;
+  uint64_t user_misses_ = 0;
+
+  std::map<uint64_t, uint64_t> num_accesses_timeline_;
+  std::map<uint64_t, uint64_t> num_misses_timeline_;
+};
+
+// A ghost cache admits an entry on its second access.
+class GhostCache {
+ public:
+  explicit GhostCache(std::shared_ptr<Cache> sim_cache);
+  ~GhostCache() = default;
+  // No copy and move.
+  GhostCache(const GhostCache&) = delete;
+  GhostCache& operator=(const GhostCache&) = delete;
+  GhostCache(GhostCache&&) = delete;
+  GhostCache& operator=(GhostCache&&) = delete;
+
+  // Returns true if the lookup_key is in the ghost cache.
+  // Returns false otherwise.
+  bool Admit(const Slice& lookup_key);
+
+ private:
+  std::shared_ptr<Cache> sim_cache_;
+};
+
+// A cache simulator that runs against a block cache trace.
+class CacheSimulator {
+ public:
+  CacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+                 std::shared_ptr<Cache> sim_cache);
+  virtual ~CacheSimulator() = default;
+  // No copy and move.
+  CacheSimulator(const CacheSimulator&) = delete;
+  CacheSimulator& operator=(const CacheSimulator&) = delete;
+  CacheSimulator(CacheSimulator&&) = delete;
+  CacheSimulator& operator=(CacheSimulator&&) = delete;
+
+  virtual void Access(const BlockCacheTraceRecord& access);
+
+  void reset_counter() { miss_ratio_stats_.reset_counter(); }
+
+  const MissRatioStats& miss_ratio_stats() const { return miss_ratio_stats_; }
+
+ protected:
+  MissRatioStats miss_ratio_stats_;
+  std::unique_ptr<GhostCache> ghost_cache_;
+  std::shared_ptr<Cache> sim_cache_;
+};
+
+// A prioritized cache simulator that runs against a block cache trace.
+// It inserts missing index/filter/uncompression-dictionary blocks with high
+// priority in the cache.
+class PrioritizedCacheSimulator : public CacheSimulator {
+ public:
+  PrioritizedCacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+                            std::shared_ptr<Cache> sim_cache)
+      : CacheSimulator(std::move(ghost_cache), sim_cache) {}
+  void Access(const BlockCacheTraceRecord& access) override;
+
+ protected:
+  // Access the key-value pair and returns true upon a cache miss.
+  void AccessKVPair(const Slice& key, uint64_t value_size,
+                    Cache::Priority priority,
+                    const BlockCacheTraceRecord& access, bool no_insert,
+                    bool is_user_access, bool* is_cache_miss, bool* admitted,
+                    bool update_metrics);
+
+  Cache::Priority ComputeBlockPriority(
+      const BlockCacheTraceRecord& access) const;
+};
+
+// A hybrid row and block cache simulator. It looks up/inserts key-value pairs
+// referenced by Get/MultiGet requests, and not their accessed index/filter/data
+// blocks.
+//
+// Upon a Get/MultiGet request, it looks up the referenced key first.
+// If it observes a cache hit, future block accesses on this key-value pair is
+// skipped since the request is served already. Otherwise, it continues to look
+// up/insert its index/filter/data blocks. It also inserts the referenced
+// key-value pair in the cache for future lookups.
+class HybridRowBlockCacheSimulator : public PrioritizedCacheSimulator {
+ public:
+  HybridRowBlockCacheSimulator(std::unique_ptr<GhostCache>&& ghost_cache,
+                               std::shared_ptr<Cache> sim_cache,
+                               bool insert_blocks_upon_row_kvpair_miss)
+      : PrioritizedCacheSimulator(std::move(ghost_cache), sim_cache),
+        insert_blocks_upon_row_kvpair_miss_(
+            insert_blocks_upon_row_kvpair_miss) {}
+  void Access(const BlockCacheTraceRecord& access) override;
+
+ private:
+  enum InsertResult : char {
+    INSERTED,
+    ADMITTED,
+    NO_INSERT,
+  };
+
+  // We set is_complete to true when the referenced row-key of a get request
+  // hits the cache. If is_complete is true, we treat future accesses of this
+  // get request as hits.
+  //
+  // For each row key, it stores an enum. It is INSERTED when the
+  // kv-pair has been inserted into the cache, ADMITTED if it should be inserted
+  // but haven't been, NO_INSERT if it should not be inserted.
+  //
+  // A kv-pair is in ADMITTED state when we encounter this kv-pair but do not
+  // know its size. This may happen if the first access on the referenced key is
+  // an index/filter block.
+  struct GetRequestStatus {
+    bool is_complete = false;
+    std::map<std::string, InsertResult> row_key_status;
+  };
+
+  // A map stores get_id to a map of row keys.
+  std::map<uint64_t, GetRequestStatus> getid_status_map_;
+  bool insert_blocks_upon_row_kvpair_miss_;
+};
+
+// A block cache simulator that reports miss ratio curves given a set of cache
+// configurations.
+class BlockCacheTraceSimulator {
+ public:
+  // warmup_seconds: The number of seconds to warmup simulated caches. The
+  // hit/miss counters are reset after the warmup completes.
+  BlockCacheTraceSimulator(
+      uint64_t warmup_seconds, uint32_t downsample_ratio,
+      const std::vector<CacheConfiguration>& cache_configurations);
+  ~BlockCacheTraceSimulator() = default;
+  // No copy and move.
+  BlockCacheTraceSimulator(const BlockCacheTraceSimulator&) = delete;
+  BlockCacheTraceSimulator& operator=(const BlockCacheTraceSimulator&) = delete;
+  BlockCacheTraceSimulator(BlockCacheTraceSimulator&&) = delete;
+  BlockCacheTraceSimulator& operator=(BlockCacheTraceSimulator&&) = delete;
+
+  Status InitializeCaches();
+
+  void Access(const BlockCacheTraceRecord& access);
+
+  const std::map<CacheConfiguration,
+                 std::vector<std::shared_ptr<CacheSimulator>>>&
+  sim_caches() const {
+    return sim_caches_;
+  }
+
+ private:
+  const uint64_t warmup_seconds_;
+  const uint32_t downsample_ratio_;
+  const std::vector<CacheConfiguration> cache_configurations_;
+
+  bool warmup_complete_ = false;
+  std::map<CacheConfiguration, std::vector<std::shared_ptr<CacheSimulator>>>
+      sim_caches_;
+  uint64_t trace_start_time_ = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc b/src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc
new file mode 100644
index 000000000..2bc057c92
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/cache_simulator_test.cc
@@ -0,0 +1,497 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/simulator_cache/cache_simulator.h"
+
+#include <cstdlib>
+
+#include "rocksdb/env.h"
+#include "rocksdb/trace_record.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+const std::string kBlockKeyPrefix = "test-block-";
+const std::string kRefKeyPrefix = "test-get-";
+const std::string kRefKeySequenceNumber = std::string(8, 'c');
+const uint64_t kGetId = 1;
+const uint64_t kGetBlockId = 100;
+const uint64_t kCompactionBlockId = 1000;
+const uint64_t kCacheSize = 1024 * 1024 * 1024;
+const uint64_t kGhostCacheSize = 1024 * 1024;
+}  // namespace
+
+class CacheSimulatorTest : public testing::Test {
+ public:
+  const size_t kNumBlocks = 5;
+  const size_t kValueSize = 1000;
+
+  CacheSimulatorTest() { env_ = ROCKSDB_NAMESPACE::Env::Default(); }
+
+  BlockCacheTraceRecord GenerateGetRecord(uint64_t getid) {
+    BlockCacheTraceRecord record;
+    record.block_type = TraceType::kBlockTraceDataBlock;
+    record.block_size = 4096;
+    record.block_key = kBlockKeyPrefix + std::to_string(kGetBlockId);
+    record.access_timestamp = env_->NowMicros();
+    record.cf_id = 0;
+    record.cf_name = "test";
+    record.caller = TableReaderCaller::kUserGet;
+    record.level = 6;
+    record.sst_fd_number = 0;
+    record.get_id = getid;
+    record.is_cache_hit = false;
+    record.no_insert = false;
+    record.referenced_key =
+        kRefKeyPrefix + std::to_string(kGetId) + kRefKeySequenceNumber;
+    record.referenced_key_exist_in_block = true;
+    record.referenced_data_size = 100;
+    record.num_keys_in_block = 300;
+    return record;
+  }
+
+  BlockCacheTraceRecord GenerateCompactionRecord() {
+    BlockCacheTraceRecord record;
+    record.block_type = TraceType::kBlockTraceDataBlock;
+    record.block_size = 4096;
+    record.block_key = kBlockKeyPrefix + std::to_string(kCompactionBlockId);
+    record.access_timestamp = env_->NowMicros();
+    record.cf_id = 0;
+    record.cf_name = "test";
+    record.caller = TableReaderCaller::kCompaction;
+    record.level = 6;
+    record.sst_fd_number = kCompactionBlockId;
+    record.is_cache_hit = false;
+    record.no_insert = true;
+    return record;
+  }
+
+  void AssertCache(std::shared_ptr<Cache> sim_cache,
+                   const MissRatioStats& miss_ratio_stats,
+                   uint64_t expected_usage, uint64_t expected_num_accesses,
+                   uint64_t expected_num_misses,
+                   std::vector<std::string> blocks,
+                   std::vector<std::string> keys) {
+    EXPECT_EQ(expected_usage, sim_cache->GetUsage());
+    EXPECT_EQ(expected_num_accesses, miss_ratio_stats.total_accesses());
+    EXPECT_EQ(expected_num_misses, miss_ratio_stats.total_misses());
+    for (auto const& block : blocks) {
+      auto handle = sim_cache->Lookup(block);
+      EXPECT_NE(nullptr, handle);
+      sim_cache->Release(handle);
+    }
+    for (auto const& key : keys) {
+      std::string row_key = kRefKeyPrefix + key + kRefKeySequenceNumber;
+      auto handle =
+          sim_cache->Lookup("0_" + ExtractUserKey(row_key).ToString());
+      EXPECT_NE(nullptr, handle);
+      sim_cache->Release(handle);
+    }
+  }
+
+  Env* env_;
+};
+
+TEST_F(CacheSimulatorTest, GhostCache) {
+  const std::string key1 = "test1";
+  const std::string key2 = "test2";
+  std::unique_ptr<GhostCache> ghost_cache(new GhostCache(
+      NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0)));
+  EXPECT_FALSE(ghost_cache->Admit(key1));
+  EXPECT_TRUE(ghost_cache->Admit(key1));
+  EXPECT_TRUE(ghost_cache->Admit(key1));
+  EXPECT_FALSE(ghost_cache->Admit(key2));
+  EXPECT_TRUE(ghost_cache->Admit(key2));
+}
+
+TEST_F(CacheSimulatorTest, CacheSimulator) {
+  const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId);
+  const BlockCacheTraceRecord& compaction_access = GenerateCompactionRecord();
+  std::shared_ptr<Cache> sim_cache =
+      NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0);
+  std::unique_ptr<CacheSimulator> cache_simulator(
+      new CacheSimulator(nullptr, sim_cache));
+  cache_simulator->Access(access);
+  cache_simulator->Access(access);
+  ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio());
+  ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses());
+  ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio());
+
+  cache_simulator->Access(compaction_access);
+  cache_simulator->Access(compaction_access);
+  ASSERT_EQ(4, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(75, cache_simulator->miss_ratio_stats().miss_ratio());
+  ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses());
+  ASSERT_EQ(50, cache_simulator->miss_ratio_stats().user_miss_ratio());
+
+  cache_simulator->reset_counter();
+  ASSERT_EQ(0, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(-1, cache_simulator->miss_ratio_stats().miss_ratio());
+  auto handle = sim_cache->Lookup(access.block_key);
+  ASSERT_NE(nullptr, handle);
+  sim_cache->Release(handle);
+  handle = sim_cache->Lookup(compaction_access.block_key);
+  ASSERT_EQ(nullptr, handle);
+}
+
+TEST_F(CacheSimulatorTest, GhostCacheSimulator) {
+  const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId);
+  std::unique_ptr<GhostCache> ghost_cache(new GhostCache(
+      NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0)));
+  std::unique_ptr<CacheSimulator> cache_simulator(new CacheSimulator(
+      std::move(ghost_cache),
+      NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0)));
+  cache_simulator->Access(access);
+  cache_simulator->Access(access);
+  ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+  // Both of them will be miss since we have a ghost cache.
+  ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio());
+}
+
+TEST_F(CacheSimulatorTest, PrioritizedCacheSimulator) {
+  const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId);
+  std::shared_ptr<Cache> sim_cache =
+      NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0);
+  std::unique_ptr<PrioritizedCacheSimulator> cache_simulator(
+      new PrioritizedCacheSimulator(nullptr, sim_cache));
+  cache_simulator->Access(access);
+  cache_simulator->Access(access);
+  ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(50, cache_simulator->miss_ratio_stats().miss_ratio());
+
+  auto handle = sim_cache->Lookup(access.block_key);
+  ASSERT_NE(nullptr, handle);
+  sim_cache->Release(handle);
+}
+
+TEST_F(CacheSimulatorTest, GhostPrioritizedCacheSimulator) {
+  const BlockCacheTraceRecord& access = GenerateGetRecord(kGetId);
+  std::unique_ptr<GhostCache> ghost_cache(new GhostCache(
+      NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0)));
+  std::unique_ptr<PrioritizedCacheSimulator> cache_simulator(
+      new PrioritizedCacheSimulator(
+          std::move(ghost_cache),
+          NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+                      /*strict_capacity_limit=*/false,
+                      /*high_pri_pool_ratio=*/0)));
+  cache_simulator->Access(access);
+  cache_simulator->Access(access);
+  ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+  // Both of them will be miss since we have a ghost cache.
+  ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio());
+}
+
+TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulator) {
+  uint64_t block_id = 100;
+  BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId);
+  first_get.get_from_user_specified_snapshot = true;
+  BlockCacheTraceRecord second_get = GenerateGetRecord(kGetId + 1);
+  second_get.referenced_data_size = 0;
+  second_get.referenced_key_exist_in_block = false;
+  second_get.get_from_user_specified_snapshot = true;
+  BlockCacheTraceRecord third_get = GenerateGetRecord(kGetId + 2);
+  third_get.referenced_data_size = 0;
+  third_get.referenced_key_exist_in_block = false;
+  third_get.referenced_key = kRefKeyPrefix + "third_get";
+  // We didn't find the referenced key in the third get.
+  third_get.referenced_key_exist_in_block = false;
+  third_get.referenced_data_size = 0;
+  std::shared_ptr<Cache> sim_cache =
+      NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0);
+  std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+      new HybridRowBlockCacheSimulator(
+          nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true));
+  // The first get request accesses 10 blocks. We should only report 10 accesses
+  // and 100% miss.
+  for (uint32_t i = 0; i < 10; i++) {
+    first_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(first_get);
+    block_id++;
+  }
+
+  ASSERT_EQ(10, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio());
+  ASSERT_EQ(10, cache_simulator->miss_ratio_stats().user_accesses());
+  ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio());
+  auto handle =
+      sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" +
+                        ExtractUserKey(first_get.referenced_key).ToString());
+  ASSERT_NE(nullptr, handle);
+  sim_cache->Release(handle);
+  for (uint32_t i = 100; i < block_id; i++) {
+    handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+    ASSERT_NE(nullptr, handle);
+    sim_cache->Release(handle);
+  }
+
+  // The second get request accesses the same key. We should report 15
+  // access and 66% miss, 10 misses with 15 accesses.
+  // We do not consider these 5 block lookups as misses since the row hits the
+  // cache.
+  for (uint32_t i = 0; i < 5; i++) {
+    second_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(second_get);
+    block_id++;
+  }
+  ASSERT_EQ(15, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(66, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().miss_ratio()));
+  ASSERT_EQ(15, cache_simulator->miss_ratio_stats().user_accesses());
+  ASSERT_EQ(66, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().user_miss_ratio()));
+  handle =
+      sim_cache->Lookup(std::to_string(second_get.sst_fd_number) + "_" +
+                        ExtractUserKey(second_get.referenced_key).ToString());
+  ASSERT_NE(nullptr, handle);
+  sim_cache->Release(handle);
+  for (uint32_t i = 100; i < block_id; i++) {
+    handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+    if (i < 110) {
+      ASSERT_NE(nullptr, handle) << i;
+      sim_cache->Release(handle);
+    } else {
+      ASSERT_EQ(nullptr, handle) << i;
+    }
+  }
+
+  // The third get on a different key and does not have a size.
+  // This key should not be inserted into the cache.
+  for (uint32_t i = 0; i < 5; i++) {
+    third_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(third_get);
+    block_id++;
+  }
+  ASSERT_EQ(20, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(75, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().miss_ratio()));
+  ASSERT_EQ(20, cache_simulator->miss_ratio_stats().user_accesses());
+  ASSERT_EQ(75, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().user_miss_ratio()));
+  // Assert that the third key is not inserted into the cache.
+  handle = sim_cache->Lookup(std::to_string(third_get.sst_fd_number) + "_" +
+                             third_get.referenced_key);
+  ASSERT_EQ(nullptr, handle);
+  for (uint32_t i = 100; i < block_id; i++) {
+    if (i < 110 || i >= 115) {
+      handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+      ASSERT_NE(nullptr, handle) << i;
+      sim_cache->Release(handle);
+    } else {
+      handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+      ASSERT_EQ(nullptr, handle) << i;
+    }
+  }
+}
+
+TEST_F(CacheSimulatorTest, HybridRowBlockCacheSimulatorGetTest) {
+  BlockCacheTraceRecord get = GenerateGetRecord(kGetId);
+  get.block_size = 1;
+  get.referenced_data_size = 0;
+  get.access_timestamp = 0;
+  get.block_key = "1";
+  get.get_id = 1;
+  get.get_from_user_specified_snapshot = false;
+  get.referenced_key =
+      kRefKeyPrefix + std::to_string(1) + kRefKeySequenceNumber;
+  get.no_insert = false;
+  get.sst_fd_number = 0;
+  get.get_from_user_specified_snapshot = false;
+
+  LRUCacheOptions co;
+  co.capacity = 16;
+  co.num_shard_bits = 1;
+  co.strict_capacity_limit = false;
+  co.high_pri_pool_ratio = 0;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> sim_cache = NewLRUCache(co);
+  std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+      new HybridRowBlockCacheSimulator(
+          nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/true));
+  // Expect a miss and does not insert the row key-value pair since it does not
+  // have size.
+  cache_simulator->Access(get);
+  AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 1, 1, 1, {"1"},
+              {});
+  get.access_timestamp += 1;
+  get.referenced_data_size = 1;
+  get.block_key = "2";
+  cache_simulator->Access(get);
+  AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 3, 2, 2,
+              {"1", "2"}, {"1"});
+  get.access_timestamp += 1;
+  get.block_key = "3";
+  // K1 should not inserted again.
+  cache_simulator->Access(get);
+  AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 3, 3,
+              {"1", "2", "3"}, {"1"});
+
+  // A second get request referencing the same key.
+  get.access_timestamp += 1;
+  get.get_id = 2;
+  get.block_key = "4";
+  get.referenced_data_size = 0;
+  cache_simulator->Access(get);
+  AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 4, 4, 3,
+              {"1", "2", "3"}, {"1"});
+
+  // A third get request searches three files, three different keys.
+  // And the second key observes a hit.
+  get.access_timestamp += 1;
+  get.referenced_data_size = 1;
+  get.get_id = 3;
+  get.block_key = "3";
+  get.referenced_key = kRefKeyPrefix + "2" + kRefKeySequenceNumber;
+  // K2 should observe a miss. Block 3 observes a hit.
+  cache_simulator->Access(get);
+  AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 5, 3,
+              {"1", "2", "3"}, {"1", "2"});
+
+  get.access_timestamp += 1;
+  get.referenced_data_size = 1;
+  get.get_id = 3;
+  get.block_key = "4";
+  get.referenced_data_size = 1;
+  get.referenced_key = kRefKeyPrefix + "1" + kRefKeySequenceNumber;
+  // K1 should observe a hit.
+  cache_simulator->Access(get);
+  AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 6, 3,
+              {"1", "2", "3"}, {"1", "2"});
+
+  get.access_timestamp += 1;
+  get.referenced_data_size = 1;
+  get.get_id = 3;
+  get.block_key = "4";
+  get.referenced_data_size = 1;
+  get.referenced_key = kRefKeyPrefix + "3" + kRefKeySequenceNumber;
+  // K3 should observe a miss.
+  // However, as the get already complete, we should not access k3 any more.
+  cache_simulator->Access(get);
+  AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 5, 7, 3,
+              {"1", "2", "3"}, {"1", "2"});
+
+  // A fourth get request searches one file and two blocks. One row key.
+  get.access_timestamp += 1;
+  get.get_id = 4;
+  get.block_key = "5";
+  get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber;
+  get.referenced_data_size = 1;
+  cache_simulator->Access(get);
+  AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 7, 8, 4,
+              {"1", "2", "3", "5"}, {"1", "2", "4"});
+  for (auto const& key : {"1", "2", "4"}) {
+    auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key);
+    ASSERT_NE(nullptr, handle);
+    sim_cache->Release(handle);
+  }
+
+  // A bunch of insertions which evict cached row keys.
+  for (uint32_t i = 6; i < 100; i++) {
+    get.access_timestamp += 1;
+    get.get_id = 0;
+    get.block_key = std::to_string(i);
+    cache_simulator->Access(get);
+  }
+
+  get.get_id = 4;
+  // A different block.
+  get.block_key = "100";
+  // Same row key and should not be inserted again.
+  get.referenced_key = kRefKeyPrefix + "4" + kRefKeySequenceNumber;
+  get.referenced_data_size = 1;
+  cache_simulator->Access(get);
+  AssertCache(sim_cache, cache_simulator->miss_ratio_stats(), 16, 103, 99, {},
+              {});
+  for (auto const& key : {"1", "2", "4"}) {
+    auto handle = sim_cache->Lookup("0_" + kRefKeyPrefix + key);
+    ASSERT_EQ(nullptr, handle);
+  }
+}
+
+TEST_F(CacheSimulatorTest, HybridRowBlockNoInsertCacheSimulator) {
+  uint64_t block_id = 100;
+  BlockCacheTraceRecord first_get = GenerateGetRecord(kGetId);
+  std::shared_ptr<Cache> sim_cache =
+      NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0);
+  std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+      new HybridRowBlockCacheSimulator(
+          nullptr, sim_cache, /*insert_blocks_row_kvpair_misses=*/false));
+  for (uint32_t i = 0; i < 9; i++) {
+    first_get.block_key = kBlockKeyPrefix + std::to_string(block_id);
+    cache_simulator->Access(first_get);
+    block_id++;
+  }
+  auto handle =
+      sim_cache->Lookup(std::to_string(first_get.sst_fd_number) + "_" +
+                        ExtractUserKey(first_get.referenced_key).ToString());
+  ASSERT_NE(nullptr, handle);
+  sim_cache->Release(handle);
+  // All blocks are missing from the cache since insert_blocks_row_kvpair_misses
+  // is set to false.
+  for (uint32_t i = 100; i < block_id; i++) {
+    handle = sim_cache->Lookup(kBlockKeyPrefix + std::to_string(i));
+    ASSERT_EQ(nullptr, handle);
+  }
+}
+
+TEST_F(CacheSimulatorTest, GhostHybridRowBlockCacheSimulator) {
+  std::unique_ptr<GhostCache> ghost_cache(new GhostCache(
+      NewLRUCache(/*capacity=*/kGhostCacheSize, /*num_shard_bits=*/1,
+                  /*strict_capacity_limit=*/false,
+                  /*high_pri_pool_ratio=*/0)));
+  const BlockCacheTraceRecord& first_get = GenerateGetRecord(kGetId);
+  const BlockCacheTraceRecord& second_get = GenerateGetRecord(kGetId + 1);
+  const BlockCacheTraceRecord& third_get = GenerateGetRecord(kGetId + 2);
+  std::unique_ptr<HybridRowBlockCacheSimulator> cache_simulator(
+      new HybridRowBlockCacheSimulator(
+          std::move(ghost_cache),
+          NewLRUCache(/*capacity=*/kCacheSize, /*num_shard_bits=*/1,
+                      /*strict_capacity_limit=*/false,
+                      /*high_pri_pool_ratio=*/0),
+          /*insert_blocks_row_kvpair_misses=*/false));
+  // Two get requests access the same key.
+  cache_simulator->Access(first_get);
+  cache_simulator->Access(second_get);
+  ASSERT_EQ(2, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(100, cache_simulator->miss_ratio_stats().miss_ratio());
+  ASSERT_EQ(2, cache_simulator->miss_ratio_stats().user_accesses());
+  ASSERT_EQ(100, cache_simulator->miss_ratio_stats().user_miss_ratio());
+  // We insert the key-value pair upon the second get request. A third get
+  // request should observe a hit.
+  for (uint32_t i = 0; i < 10; i++) {
+    cache_simulator->Access(third_get);
+  }
+  ASSERT_EQ(12, cache_simulator->miss_ratio_stats().total_accesses());
+  ASSERT_EQ(16, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().miss_ratio()));
+  ASSERT_EQ(12, cache_simulator->miss_ratio_stats().user_accesses());
+  ASSERT_EQ(16, static_cast<uint64_t>(
+                    cache_simulator->miss_ratio_stats().user_miss_ratio()));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/simulator_cache/sim_cache.cc b/src/rocksdb/utilities/simulator_cache/sim_cache.cc
new file mode 100644
index 000000000..a883b52e7
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/sim_cache.cc
@@ -0,0 +1,364 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/sim_cache.h"
+
+#include <atomic>
+#include <iomanip>
+
+#include "file/writable_file_writer.h"
+#include "monitoring/statistics.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class CacheActivityLogger {
+ public:
+  CacheActivityLogger()
+      : activity_logging_enabled_(false), max_logging_size_(0) {}
+
+  ~CacheActivityLogger() {
+    MutexLock l(&mutex_);
+
+    StopLoggingInternal();
+    bg_status_.PermitUncheckedError();
+  }
+
+  Status StartLogging(const std::string& activity_log_file, Env* env,
+                      uint64_t max_logging_size = 0) {
+    assert(activity_log_file != "");
+    assert(env != nullptr);
+
+    Status status;
+    FileOptions file_opts;
+
+    MutexLock l(&mutex_);
+
+    // Stop existing logging if any
+    StopLoggingInternal();
+
+    // Open log file
+    status = WritableFileWriter::Create(env->GetFileSystem(), activity_log_file,
+                                        file_opts, &file_writer_, nullptr);
+    if (!status.ok()) {
+      return status;
+    }
+
+    max_logging_size_ = max_logging_size;
+    activity_logging_enabled_.store(true);
+
+    return status;
+  }
+
+  void StopLogging() {
+    MutexLock l(&mutex_);
+
+    StopLoggingInternal();
+  }
+
+  void ReportLookup(const Slice& key) {
+    if (activity_logging_enabled_.load() == false) {
+      return;
+    }
+
+    std::ostringstream oss;
+    // line format: "LOOKUP - <KEY>"
+    oss << "LOOKUP - " << key.ToString(true) << std::endl;
+
+    MutexLock l(&mutex_);
+    Status s = file_writer_->Append(oss.str());
+    if (!s.ok() && bg_status_.ok()) {
+      bg_status_ = s;
+    }
+    if (MaxLoggingSizeReached() || !bg_status_.ok()) {
+      // Stop logging if we have reached the max file size or
+      // encountered an error
+      StopLoggingInternal();
+    }
+  }
+
+  void ReportAdd(const Slice& key, size_t size) {
+    if (activity_logging_enabled_.load() == false) {
+      return;
+    }
+
+    std::ostringstream oss;
+    // line format: "ADD - <KEY> - <KEY-SIZE>"
+    oss << "ADD - " << key.ToString(true) << " - " << size << std::endl;
+    MutexLock l(&mutex_);
+    Status s = file_writer_->Append(oss.str());
+    if (!s.ok() && bg_status_.ok()) {
+      bg_status_ = s;
+    }
+
+    if (MaxLoggingSizeReached() || !bg_status_.ok()) {
+      // Stop logging if we have reached the max file size or
+      // encountered an error
+      StopLoggingInternal();
+    }
+  }
+
+  Status& bg_status() {
+    MutexLock l(&mutex_);
+    return bg_status_;
+  }
+
+ private:
+  bool MaxLoggingSizeReached() {
+    mutex_.AssertHeld();
+
+    return (max_logging_size_ > 0 &&
+            file_writer_->GetFileSize() >= max_logging_size_);
+  }
+
+  void StopLoggingInternal() {
+    mutex_.AssertHeld();
+
+    if (!activity_logging_enabled_) {
+      return;
+    }
+
+    activity_logging_enabled_.store(false);
+    Status s = file_writer_->Close();
+    if (!s.ok() && bg_status_.ok()) {
+      bg_status_ = s;
+    }
+  }
+
+  // Mutex to sync writes to file_writer, and all following
+  // class data members
+  port::Mutex mutex_;
+  // Indicates if logging is currently enabled
+  // atomic to allow reads without mutex
+  std::atomic<bool> activity_logging_enabled_;
+  // When reached, we will stop logging and close the file
+  // Value of 0 means unlimited
+  uint64_t max_logging_size_;
+  std::unique_ptr<WritableFileWriter> file_writer_;
+  Status bg_status_;
+};
+
+// SimCacheImpl definition
+class SimCacheImpl : public SimCache {
+ public:
+  // capacity for real cache (ShardedLRUCache)
+  // test_capacity for key only cache
+  SimCacheImpl(std::shared_ptr<Cache> sim_cache, std::shared_ptr<Cache> cache)
+      : cache_(cache),
+        key_only_cache_(sim_cache),
+        miss_times_(0),
+        hit_times_(0),
+        stats_(nullptr) {}
+
+  ~SimCacheImpl() override {}
+  void SetCapacity(size_t capacity) override { cache_->SetCapacity(capacity); }
+
+  void SetStrictCapacityLimit(bool strict_capacity_limit) override {
+    cache_->SetStrictCapacityLimit(strict_capacity_limit);
+  }
+
+  using Cache::Insert;
+  Status Insert(const Slice& key, void* value, size_t charge,
+                void (*deleter)(const Slice& key, void* value), Handle** handle,
+                Priority priority) override {
+    // The handle and value passed in are for real cache, so we pass nullptr
+    // to key_only_cache_ for both instead. Also, the deleter function pointer
+    // will be called by user to perform some external operation which should
+    // be applied only once. Thus key_only_cache accepts an empty function.
+    // *Lambda function without capture can be assgined to a function pointer
+    Handle* h = key_only_cache_->Lookup(key);
+    if (h == nullptr) {
+      // TODO: Check for error here?
+      auto s = key_only_cache_->Insert(
+          key, nullptr, charge, [](const Slice& /*k*/, void* /*v*/) {}, nullptr,
+          priority);
+      s.PermitUncheckedError();
+    } else {
+      key_only_cache_->Release(h);
+    }
+
+    cache_activity_logger_.ReportAdd(key, charge);
+    if (!cache_) {
+      return Status::OK();
+    }
+    return cache_->Insert(key, value, charge, deleter, handle, priority);
+  }
+
+  using Cache::Lookup;
+  Handle* Lookup(const Slice& key, Statistics* stats) override {
+    Handle* h = key_only_cache_->Lookup(key);
+    if (h != nullptr) {
+      key_only_cache_->Release(h);
+      inc_hit_counter();
+      RecordTick(stats, SIM_BLOCK_CACHE_HIT);
+    } else {
+      inc_miss_counter();
+      RecordTick(stats, SIM_BLOCK_CACHE_MISS);
+    }
+
+    cache_activity_logger_.ReportLookup(key);
+    if (!cache_) {
+      return nullptr;
+    }
+    return cache_->Lookup(key, stats);
+  }
+
+  bool Ref(Handle* handle) override { return cache_->Ref(handle); }
+
+  using Cache::Release;
+  bool Release(Handle* handle, bool erase_if_last_ref = false) override {
+    return cache_->Release(handle, erase_if_last_ref);
+  }
+
+  void Erase(const Slice& key) override {
+    cache_->Erase(key);
+    key_only_cache_->Erase(key);
+  }
+
+  void* Value(Handle* handle) override { return cache_->Value(handle); }
+
+  uint64_t NewId() override { return cache_->NewId(); }
+
+  size_t GetCapacity() const override { return cache_->GetCapacity(); }
+
+  bool HasStrictCapacityLimit() const override {
+    return cache_->HasStrictCapacityLimit();
+  }
+
+  size_t GetUsage() const override { return cache_->GetUsage(); }
+
+  size_t GetUsage(Handle* handle) const override {
+    return cache_->GetUsage(handle);
+  }
+
+  size_t GetCharge(Handle* handle) const override {
+    return cache_->GetCharge(handle);
+  }
+
+  DeleterFn GetDeleter(Handle* handle) const override {
+    return cache_->GetDeleter(handle);
+  }
+
+  size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); }
+
+  void DisownData() override {
+    cache_->DisownData();
+    key_only_cache_->DisownData();
+  }
+
+  void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+                              bool thread_safe) override {
+    // only apply to _cache since key_only_cache doesn't hold value
+    cache_->ApplyToAllCacheEntries(callback, thread_safe);
+  }
+
+  void ApplyToAllEntries(
+      const std::function<void(const Slice& key, void* value, size_t charge,
+                               DeleterFn deleter)>& callback,
+      const ApplyToAllEntriesOptions& opts) override {
+    cache_->ApplyToAllEntries(callback, opts);
+  }
+
+  void EraseUnRefEntries() override {
+    cache_->EraseUnRefEntries();
+    key_only_cache_->EraseUnRefEntries();
+  }
+
+  size_t GetSimCapacity() const override {
+    return key_only_cache_->GetCapacity();
+  }
+  size_t GetSimUsage() const override { return key_only_cache_->GetUsage(); }
+  void SetSimCapacity(size_t capacity) override {
+    key_only_cache_->SetCapacity(capacity);
+  }
+
+  uint64_t get_miss_counter() const override {
+    return miss_times_.load(std::memory_order_relaxed);
+  }
+
+  uint64_t get_hit_counter() const override {
+    return hit_times_.load(std::memory_order_relaxed);
+  }
+
+  void reset_counter() override {
+    miss_times_.store(0, std::memory_order_relaxed);
+    hit_times_.store(0, std::memory_order_relaxed);
+    SetTickerCount(stats_, SIM_BLOCK_CACHE_HIT, 0);
+    SetTickerCount(stats_, SIM_BLOCK_CACHE_MISS, 0);
+  }
+
+  std::string ToString() const override {
+    std::ostringstream oss;
+    oss << "SimCache MISSes:  " << get_miss_counter() << std::endl;
+    oss << "SimCache HITs:    " << get_hit_counter() << std::endl;
+    auto lookups = get_miss_counter() + get_hit_counter();
+    oss << "SimCache HITRATE: " << std::fixed << std::setprecision(2)
+        << (lookups == 0 ? 0 : get_hit_counter() * 100.0f / lookups)
+        << std::endl;
+    return oss.str();
+  }
+
+  std::string GetPrintableOptions() const override {
+    std::ostringstream oss;
+    oss << "    cache_options:" << std::endl;
+    oss << cache_->GetPrintableOptions();
+    oss << "    sim_cache_options:" << std::endl;
+    oss << key_only_cache_->GetPrintableOptions();
+    return oss.str();
+  }
+
+  Status StartActivityLogging(const std::string& activity_log_file, Env* env,
+                              uint64_t max_logging_size = 0) override {
+    return cache_activity_logger_.StartLogging(activity_log_file, env,
+                                               max_logging_size);
+  }
+
+  void StopActivityLogging() override { cache_activity_logger_.StopLogging(); }
+
+  Status GetActivityLoggingStatus() override {
+    return cache_activity_logger_.bg_status();
+  }
+
+ private:
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<Cache> key_only_cache_;
+  std::atomic<uint64_t> miss_times_;
+  std::atomic<uint64_t> hit_times_;
+  Statistics* stats_;
+  CacheActivityLogger cache_activity_logger_;
+
+  void inc_miss_counter() {
+    miss_times_.fetch_add(1, std::memory_order_relaxed);
+  }
+  void inc_hit_counter() { hit_times_.fetch_add(1, std::memory_order_relaxed); }
+};
+
+}  // end anonymous namespace
+
+// For instrumentation purpose, use NewSimCache instead
+std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> cache,
+                                      size_t sim_capacity, int num_shard_bits) {
+  LRUCacheOptions co;
+  co.capacity = sim_capacity;
+  co.num_shard_bits = num_shard_bits;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  return NewSimCache(NewLRUCache(co), cache, num_shard_bits);
+}
+
+std::shared_ptr<SimCache> NewSimCache(std::shared_ptr<Cache> sim_cache,
+                                      std::shared_ptr<Cache> cache,
+                                      int num_shard_bits) {
+  if (num_shard_bits >= 20) {
+    return nullptr;  // the cache cannot be sharded into too many fine pieces
+  }
+  return std::make_shared<SimCacheImpl>(sim_cache, cache);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/simulator_cache/sim_cache_test.cc b/src/rocksdb/utilities/simulator_cache/sim_cache_test.cc
new file mode 100644
index 000000000..2e37cd347
--- /dev/null
+++ b/src/rocksdb/utilities/simulator_cache/sim_cache_test.cc
@@ -0,0 +1,226 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/utilities/sim_cache.h"
+
+#include <cstdlib>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SimCacheTest : public DBTestBase {
+ private:
+  size_t miss_count_ = 0;
+  size_t hit_count_ = 0;
+  size_t insert_count_ = 0;
+  size_t failure_count_ = 0;
+
+ public:
+  const size_t kNumBlocks = 5;
+  const size_t kValueSize = 1000;
+
+  SimCacheTest() : DBTestBase("sim_cache_test", /*env_do_fsync=*/true) {}
+
+  BlockBasedTableOptions GetTableOptions() {
+    BlockBasedTableOptions table_options;
+    // Set a small enough block size so that each key-value get its own block.
+    table_options.block_size = 1;
+    return table_options;
+  }
+
+  Options GetOptions(const BlockBasedTableOptions& table_options) {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    // options.compression = kNoCompression;
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    return options;
+  }
+
+  void InitTable(const Options& /*options*/) {
+    std::string value(kValueSize, 'a');
+    for (size_t i = 0; i < kNumBlocks * 2; i++) {
+      ASSERT_OK(Put(std::to_string(i), value.c_str()));
+    }
+  }
+
+  void RecordCacheCounters(const Options& options) {
+    miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+    hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+    insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+  }
+
+  void CheckCacheCounters(const Options& options, size_t expected_misses,
+                          size_t expected_hits, size_t expected_inserts,
+                          size_t expected_failures) {
+    size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+    size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+    size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    size_t new_failure_count =
+        TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+    ASSERT_EQ(miss_count_ + expected_misses, new_miss_count);
+    ASSERT_EQ(hit_count_ + expected_hits, new_hit_count);
+    ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count);
+    ASSERT_EQ(failure_count_ + expected_failures, new_failure_count);
+    miss_count_ = new_miss_count;
+    hit_count_ = new_hit_count;
+    insert_count_ = new_insert_count;
+    failure_count_ = new_failure_count;
+  }
+};
+
+TEST_F(SimCacheTest, SimCache) {
+  ReadOptions read_options;
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  InitTable(options);
+  LRUCacheOptions co;
+  co.capacity = 0;
+  co.num_shard_bits = 0;
+  co.strict_capacity_limit = false;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<SimCache> simCache = NewSimCache(NewLRUCache(co), 20000, 0);
+  table_options.block_cache = simCache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+  RecordCacheCounters(options);
+  // due to cache entry stats collector
+  uint64_t base_misses = simCache->get_miss_counter();
+
+  std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks);
+  Iterator* iter = nullptr;
+
+  // Load blocks into cache.
+  for (size_t i = 0; i < kNumBlocks; i++) {
+    iter = db_->NewIterator(read_options);
+    iter->Seek(std::to_string(i));
+    ASSERT_OK(iter->status());
+    CheckCacheCounters(options, 1, 0, 1, 0);
+    iterators[i].reset(iter);
+  }
+  ASSERT_EQ(kNumBlocks, simCache->get_hit_counter() +
+                            simCache->get_miss_counter() - base_misses);
+  ASSERT_EQ(0, simCache->get_hit_counter());
+  size_t usage = simCache->GetUsage();
+  ASSERT_LT(0, usage);
+  ASSERT_EQ(usage, simCache->GetSimUsage());
+  simCache->SetCapacity(usage);
+  ASSERT_EQ(usage, simCache->GetPinnedUsage());
+
+  // Test with strict capacity limit.
+  simCache->SetStrictCapacityLimit(true);
+  iter = db_->NewIterator(read_options);
+  iter->Seek(std::to_string(kNumBlocks * 2 - 1));
+  ASSERT_TRUE(iter->status().IsMemoryLimit());
+  CheckCacheCounters(options, 1, 0, 0, 1);
+  delete iter;
+  iter = nullptr;
+
+  // Release iterators and access cache again.
+  for (size_t i = 0; i < kNumBlocks; i++) {
+    iterators[i].reset();
+    CheckCacheCounters(options, 0, 0, 0, 0);
+  }
+  // Add kNumBlocks again
+  for (size_t i = 0; i < kNumBlocks; i++) {
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_options));
+    it->Seek(std::to_string(i));
+    ASSERT_OK(it->status());
+    CheckCacheCounters(options, 0, 1, 0, 0);
+  }
+  ASSERT_EQ(5, simCache->get_hit_counter());
+  for (size_t i = kNumBlocks; i < kNumBlocks * 2; i++) {
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_options));
+    it->Seek(std::to_string(i));
+    ASSERT_OK(it->status());
+    CheckCacheCounters(options, 1, 0, 1, 0);
+  }
+  ASSERT_EQ(0, simCache->GetPinnedUsage());
+  ASSERT_EQ(3 * kNumBlocks + 1, simCache->get_hit_counter() +
+                                    simCache->get_miss_counter() - base_misses);
+  ASSERT_EQ(6, simCache->get_hit_counter());
+}
+
+TEST_F(SimCacheTest, SimCacheLogging) {
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  options.disable_auto_compactions = true;
+  LRUCacheOptions co;
+  co.capacity = 1024 * 1024;
+  co.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<SimCache> sim_cache = NewSimCache(NewLRUCache(co), 20000, 0);
+  table_options.block_cache = sim_cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  int num_block_entries = 20;
+  for (int i = 0; i < num_block_entries; i++) {
+    ASSERT_OK(Put(Key(i), "val"));
+    ASSERT_OK(Flush());
+  }
+
+  std::string log_file = test::PerThreadDBPath(env_, "cache_log.txt");
+  ASSERT_OK(sim_cache->StartActivityLogging(log_file, env_));
+  for (int i = 0; i < num_block_entries; i++) {
+    ASSERT_EQ(Get(Key(i)), "val");
+  }
+  for (int i = 0; i < num_block_entries; i++) {
+    ASSERT_EQ(Get(Key(i)), "val");
+  }
+  sim_cache->StopActivityLogging();
+  ASSERT_OK(sim_cache->GetActivityLoggingStatus());
+
+  std::string file_contents = "";
+  ASSERT_OK(ReadFileToString(env_, log_file, &file_contents));
+  std::istringstream contents(file_contents);
+
+  int lookup_num = 0;
+  int add_num = 0;
+
+  std::string line;
+  // count number of lookups and additions
+  while (std::getline(contents, line)) {
+    // check if the line starts with LOOKUP or ADD
+    if (line.rfind("LOOKUP -", 0) == 0) {
+      ++lookup_num;
+    }
+    if (line.rfind("ADD -", 0) == 0) {
+      ++add_num;
+    }
+  }
+
+  // We asked for every block twice
+  ASSERT_EQ(lookup_num, num_block_entries * 2);
+
+  // We added every block only once, since the cache can hold all blocks
+  ASSERT_EQ(add_num, num_block_entries);
+
+  // Log things again but stop logging automatically after reaching 512 bytes
+  int max_size = 512;
+  ASSERT_OK(sim_cache->StartActivityLogging(log_file, env_, max_size));
+  for (int it = 0; it < 10; it++) {
+    for (int i = 0; i < num_block_entries; i++) {
+      ASSERT_EQ(Get(Key(i)), "val");
+    }
+  }
+  ASSERT_OK(sim_cache->GetActivityLoggingStatus());
+
+  uint64_t fsize = 0;
+  ASSERT_OK(env_->GetFileSize(log_file, &fsize));
+  // error margin of 100 bytes
+  ASSERT_LT(fsize, max_size + 100);
+  ASSERT_GT(fsize, max_size - 100);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc
new file mode 100644
index 000000000..16f33934d
--- /dev/null
+++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc
@@ -0,0 +1,227 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/table_properties_collectors/compact_on_deletion_collector.h"
+
+#include <memory>
+#include <sstream>
+
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+
+CompactOnDeletionCollector::CompactOnDeletionCollector(
+    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio)
+    : bucket_size_((sliding_window_size + kNumBuckets - 1) / kNumBuckets),
+      current_bucket_(0),
+      num_keys_in_current_bucket_(0),
+      num_deletions_in_observation_window_(0),
+      deletion_trigger_(deletion_trigger),
+      deletion_ratio_(deletion_ratio),
+      deletion_ratio_enabled_(deletion_ratio > 0 && deletion_ratio <= 1),
+      need_compaction_(false),
+      finished_(false) {
+  memset(num_deletions_in_buckets_, 0, sizeof(size_t) * kNumBuckets);
+}
+
+// AddUserKey() will be called when a new key/value pair is inserted into the
+// table.
+// @params key    the user key that is inserted into the table.
+// @params value  the value that is inserted into the table.
+// @params file_size  file size up to now
+Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
+                                              const Slice& /*value*/,
+                                              EntryType type,
+                                              SequenceNumber /*seq*/,
+                                              uint64_t /*file_size*/) {
+  assert(!finished_);
+  if (!bucket_size_ && !deletion_ratio_enabled_) {
+    // This collector is effectively disabled
+    return Status::OK();
+  }
+
+  if (need_compaction_) {
+    // If the output file already needs to be compacted, skip the check.
+    return Status::OK();
+  }
+
+  if (deletion_ratio_enabled_) {
+    total_entries_++;
+    if (type == kEntryDelete) {
+      deletion_entries_++;
+    }
+  }
+
+  if (bucket_size_) {
+    if (num_keys_in_current_bucket_ == bucket_size_) {
+      // When the current bucket is full, advance the cursor of the
+      // ring buffer to the next bucket.
+      current_bucket_ = (current_bucket_ + 1) % kNumBuckets;
+
+      // Update the current count of observed deletion keys by excluding
+      // the number of deletion keys in the oldest bucket in the
+      // observation window.
+      assert(num_deletions_in_observation_window_ >=
+             num_deletions_in_buckets_[current_bucket_]);
+      num_deletions_in_observation_window_ -=
+          num_deletions_in_buckets_[current_bucket_];
+      num_deletions_in_buckets_[current_bucket_] = 0;
+      num_keys_in_current_bucket_ = 0;
+    }
+
+    num_keys_in_current_bucket_++;
+    if (type == kEntryDelete) {
+      num_deletions_in_observation_window_++;
+      num_deletions_in_buckets_[current_bucket_]++;
+      if (num_deletions_in_observation_window_ >= deletion_trigger_) {
+        need_compaction_ = true;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status CompactOnDeletionCollector::Finish(
+    UserCollectedProperties* /*properties*/) {
+  if (!need_compaction_ && deletion_ratio_enabled_ && total_entries_ > 0) {
+    double ratio = static_cast<double>(deletion_entries_) / total_entries_;
+    need_compaction_ = ratio >= deletion_ratio_;
+  }
+  finished_ = true;
+  return Status::OK();
+}
+static std::unordered_map<std::string, OptionTypeInfo>
+    on_deletion_collector_type_info = {
+#ifndef ROCKSDB_LITE
+        {"window_size",
+         {0, OptionType::kUnknown, OptionVerificationType::kNormal,
+          OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable,
+          [](const ConfigOptions&, const std::string&, const std::string& value,
+             void* addr) {
+            auto* factory =
+                static_cast<CompactOnDeletionCollectorFactory*>(addr);
+            factory->SetWindowSize(ParseSizeT(value));
+            return Status::OK();
+          },
+          [](const ConfigOptions&, const std::string&, const void* addr,
+             std::string* value) {
+            const auto* factory =
+                static_cast<const CompactOnDeletionCollectorFactory*>(addr);
+            *value = std::to_string(factory->GetWindowSize());
+            return Status::OK();
+          },
+          nullptr}},
+        {"deletion_trigger",
+         {0, OptionType::kUnknown, OptionVerificationType::kNormal,
+          OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable,
+          [](const ConfigOptions&, const std::string&, const std::string& value,
+             void* addr) {
+            auto* factory =
+                static_cast<CompactOnDeletionCollectorFactory*>(addr);
+            factory->SetDeletionTrigger(ParseSizeT(value));
+            return Status::OK();
+          },
+          [](const ConfigOptions&, const std::string&, const void* addr,
+             std::string* value) {
+            const auto* factory =
+                static_cast<const CompactOnDeletionCollectorFactory*>(addr);
+            *value = std::to_string(factory->GetDeletionTrigger());
+            return Status::OK();
+          },
+          nullptr}},
+        {"deletion_ratio",
+         {0, OptionType::kUnknown, OptionVerificationType::kNormal,
+          OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable,
+          [](const ConfigOptions&, const std::string&, const std::string& value,
+             void* addr) {
+            auto* factory =
+                static_cast<CompactOnDeletionCollectorFactory*>(addr);
+            factory->SetDeletionRatio(ParseDouble(value));
+            return Status::OK();
+          },
+          [](const ConfigOptions&, const std::string&, const void* addr,
+             std::string* value) {
+            const auto* factory =
+                static_cast<const CompactOnDeletionCollectorFactory*>(addr);
+            *value = std::to_string(factory->GetDeletionRatio());
+            return Status::OK();
+          },
+          nullptr}},
+
+#endif  // ROCKSDB_LITE
+};
+
+CompactOnDeletionCollectorFactory::CompactOnDeletionCollectorFactory(
+    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio)
+    : sliding_window_size_(sliding_window_size),
+      deletion_trigger_(deletion_trigger),
+      deletion_ratio_(deletion_ratio) {
+  RegisterOptions("", this, &on_deletion_collector_type_info);
+}
+
+TablePropertiesCollector*
+CompactOnDeletionCollectorFactory::CreateTablePropertiesCollector(
+    TablePropertiesCollectorFactory::Context /*context*/) {
+  return new CompactOnDeletionCollector(sliding_window_size_.load(),
+                                        deletion_trigger_.load(),
+                                        deletion_ratio_.load());
+}
+
+std::string CompactOnDeletionCollectorFactory::ToString() const {
+  std::ostringstream cfg;
+  cfg << Name() << " (Sliding window size = " << sliding_window_size_.load()
+      << " Deletion trigger = " << deletion_trigger_.load()
+      << " Deletion ratio = " << deletion_ratio_.load() << ')';
+  return cfg.str();
+}
+
+std::shared_ptr<CompactOnDeletionCollectorFactory>
+NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
+                                     size_t deletion_trigger,
+                                     double deletion_ratio) {
+  return std::shared_ptr<CompactOnDeletionCollectorFactory>(
+      new CompactOnDeletionCollectorFactory(sliding_window_size,
+                                            deletion_trigger, deletion_ratio));
+}
+namespace {
+static int RegisterTablePropertiesCollectorFactories(
+    ObjectLibrary& library, const std::string& /*arg*/) {
+  library.AddFactory<TablePropertiesCollectorFactory>(
+      CompactOnDeletionCollectorFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<TablePropertiesCollectorFactory>* guard,
+         std::string* /* errmsg */) {
+        // By default, create a CompactionOnDeletionCollector that is disabled.
+        // Users will need to provide configuration parameters or call the
+        // corresponding Setter to enable the factory.
+        guard->reset(new CompactOnDeletionCollectorFactory(0, 0, 0));
+        return guard->get();
+      });
+  return 1;
+}
+}  // namespace
+#endif  // !ROCKSDB_LITE
+
+Status TablePropertiesCollectorFactory::CreateFromString(
+    const ConfigOptions& options, const std::string& value,
+    std::shared_ptr<TablePropertiesCollectorFactory>* result) {
+#ifndef ROCKSDB_LITE
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    RegisterTablePropertiesCollectorFactories(*(ObjectLibrary::Default().get()),
+                                              "");
+  });
+#endif  // ROCKSDB_LITE
+  return LoadSharedObject<TablePropertiesCollectorFactory>(options, value,
+                                                           nullptr, result);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h
new file mode 100644
index 000000000..2f7dc4f1b
--- /dev/null
+++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h
@@ -0,0 +1,70 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/table_properties_collectors.h"
+namespace ROCKSDB_NAMESPACE {
+
+class CompactOnDeletionCollector : public TablePropertiesCollector {
+ public:
+  CompactOnDeletionCollector(size_t sliding_window_size,
+                             size_t deletion_trigger, double deletion_raatio);
+
+  // AddUserKey() will be called when a new key/value pair is inserted into the
+  // table.
+  // @params key    the user key that is inserted into the table.
+  // @params value  the value that is inserted into the table.
+  // @params file_size  file size up to now
+  virtual Status AddUserKey(const Slice& key, const Slice& value,
+                            EntryType type, SequenceNumber seq,
+                            uint64_t file_size) override;
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @params properties  User will add their collected statistics to
+  // `properties`.
+  virtual Status Finish(UserCollectedProperties* /*properties*/) override;
+
+  // Return the human-readable properties, where the key is property name and
+  // the value is the human-readable form of value.
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties();
+  }
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const override {
+    return "CompactOnDeletionCollector";
+  }
+
+  // EXPERIMENTAL Return whether the output file should be further compacted
+  virtual bool NeedCompact() const override { return need_compaction_; }
+
+  static const int kNumBuckets = 128;
+
+ private:
+  void Reset();
+
+  // A ring buffer that used to count the number of deletion entries for every
+  // "bucket_size_" keys.
+  size_t num_deletions_in_buckets_[kNumBuckets];
+  // the number of keys in a bucket
+  size_t bucket_size_;
+
+  size_t current_bucket_;
+  size_t num_keys_in_current_bucket_;
+  size_t num_deletions_in_observation_window_;
+  size_t deletion_trigger_;
+  const double deletion_ratio_;
+  const bool deletion_ratio_enabled_;
+  size_t total_entries_ = 0;
+  size_t deletion_entries_ = 0;
+  // true if the current SST file needs to be compacted.
+  bool need_compaction_;
+  bool finished_;
+};
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
new file mode 100644
index 000000000..88aeb8d5c
--- /dev/null
+++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
@@ -0,0 +1,245 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#ifndef ROCKSDB_LITE
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "port/stack_trace.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "utilities/table_properties_collectors/compact_on_deletion_collector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(CompactOnDeletionCollector, DeletionRatio) {
+  TablePropertiesCollectorFactory::Context context;
+  context.column_family_id =
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+  const size_t kTotalEntries = 100;
+
+  {
+    // Disable deletion ratio.
+    for (double deletion_ratio : {-1.5, -1.0, 0.0, 1.5, 2.0}) {
+      auto factory = NewCompactOnDeletionCollectorFactory(0, 0, deletion_ratio);
+      std::unique_ptr<TablePropertiesCollector> collector(
+          factory->CreateTablePropertiesCollector(context));
+      for (size_t i = 0; i < kTotalEntries; i++) {
+        // All entries are deletion entries.
+        ASSERT_OK(
+            collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0));
+        ASSERT_FALSE(collector->NeedCompact());
+      }
+      ASSERT_OK(collector->Finish(nullptr));
+      ASSERT_FALSE(collector->NeedCompact());
+    }
+  }
+
+  {
+    for (double deletion_ratio : {0.3, 0.5, 0.8, 1.0}) {
+      auto factory = NewCompactOnDeletionCollectorFactory(0, 0, deletion_ratio);
+      const size_t deletion_entries_trigger =
+          static_cast<size_t>(deletion_ratio * kTotalEntries);
+      for (int delta : {-1, 0, 1}) {
+        // Actual deletion entry ratio <, =, > deletion_ratio
+        size_t actual_deletion_entries = deletion_entries_trigger + delta;
+        std::unique_ptr<TablePropertiesCollector> collector(
+            factory->CreateTablePropertiesCollector(context));
+        for (size_t i = 0; i < kTotalEntries; i++) {
+          if (i < actual_deletion_entries) {
+            ASSERT_OK(
+                collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0));
+          } else {
+            ASSERT_OK(
+                collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+          }
+          ASSERT_FALSE(collector->NeedCompact());
+        }
+        ASSERT_OK(collector->Finish(nullptr));
+        if (delta >= 0) {
+          // >= deletion_ratio
+          ASSERT_TRUE(collector->NeedCompact());
+        } else {
+          ASSERT_FALSE(collector->NeedCompact());
+        }
+      }
+    }
+  }
+}
+
+TEST(CompactOnDeletionCollector, SlidingWindow) {
+  const int kWindowSizes[] = {1000, 10000, 10000, 127, 128,  129,
+                              255,  256,   257,   2,   10000};
+  const int kDeletionTriggers[] = {500, 9500, 4323, 47, 61, 128,
+                                   250, 250,  250,  2,  2};
+  TablePropertiesCollectorFactory::Context context;
+  context.column_family_id =
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+
+  std::vector<int> window_sizes;
+  std::vector<int> deletion_triggers;
+  // deterministic tests
+  for (int test = 0; test < 9; ++test) {
+    window_sizes.emplace_back(kWindowSizes[test]);
+    deletion_triggers.emplace_back(kDeletionTriggers[test]);
+  }
+
+  // randomize tests
+  Random rnd(301);
+  const int kMaxTestSize = 100000l;
+  for (int random_test = 0; random_test < 10; random_test++) {
+    int window_size = rnd.Uniform(kMaxTestSize) + 1;
+    int deletion_trigger = rnd.Uniform(window_size);
+    window_sizes.emplace_back(window_size);
+    deletion_triggers.emplace_back(deletion_trigger);
+  }
+
+  assert(window_sizes.size() == deletion_triggers.size());
+
+  for (size_t test = 0; test < window_sizes.size(); ++test) {
+    const int kBucketSize = 128;
+    const int kWindowSize = window_sizes[test];
+    const int kPaddedWindowSize =
+        kBucketSize * ((window_sizes[test] + kBucketSize - 1) / kBucketSize);
+    const int kNumDeletionTrigger = deletion_triggers[test];
+    const int kBias = (kNumDeletionTrigger + kBucketSize - 1) / kBucketSize;
+    // Simple test
+    {
+      auto factory = NewCompactOnDeletionCollectorFactory(kWindowSize,
+                                                          kNumDeletionTrigger);
+      const int kSample = 10;
+      for (int delete_rate = 0; delete_rate <= kSample; ++delete_rate) {
+        std::unique_ptr<TablePropertiesCollector> collector(
+            factory->CreateTablePropertiesCollector(context));
+        int deletions = 0;
+        for (int i = 0; i < kPaddedWindowSize; ++i) {
+          if (i % kSample < delete_rate) {
+            ASSERT_OK(
+                collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0));
+            deletions++;
+          } else {
+            ASSERT_OK(
+                collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+          }
+        }
+        if (collector->NeedCompact() != (deletions >= kNumDeletionTrigger) &&
+            std::abs(deletions - kNumDeletionTrigger) > kBias) {
+          fprintf(stderr,
+                  "[Error] collector->NeedCompact() != (%d >= %d)"
+                  " with kWindowSize = %d and kNumDeletionTrigger = %d\n",
+                  deletions, kNumDeletionTrigger, kWindowSize,
+                  kNumDeletionTrigger);
+          ASSERT_TRUE(false);
+        }
+        ASSERT_OK(collector->Finish(nullptr));
+      }
+    }
+
+    // Only one section of a file satisfies the compaction trigger
+    {
+      auto factory = NewCompactOnDeletionCollectorFactory(kWindowSize,
+                                                          kNumDeletionTrigger);
+      const int kSample = 10;
+      for (int delete_rate = 0; delete_rate <= kSample; ++delete_rate) {
+        std::unique_ptr<TablePropertiesCollector> collector(
+            factory->CreateTablePropertiesCollector(context));
+        int deletions = 0;
+        for (int section = 0; section < 5; ++section) {
+          int initial_entries = rnd.Uniform(kWindowSize) + kWindowSize;
+          for (int i = 0; i < initial_entries; ++i) {
+            ASSERT_OK(
+                collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+          }
+        }
+        for (int i = 0; i < kPaddedWindowSize; ++i) {
+          if (i % kSample < delete_rate) {
+            ASSERT_OK(
+                collector->AddUserKey("hello", "rocksdb", kEntryDelete, 0, 0));
+            deletions++;
+          } else {
+            ASSERT_OK(
+                collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+          }
+        }
+        for (int section = 0; section < 5; ++section) {
+          int ending_entries = rnd.Uniform(kWindowSize) + kWindowSize;
+          for (int i = 0; i < ending_entries; ++i) {
+            ASSERT_OK(
+                collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+          }
+        }
+        if (collector->NeedCompact() != (deletions >= kNumDeletionTrigger) &&
+            std::abs(deletions - kNumDeletionTrigger) > kBias) {
+          fprintf(stderr,
+                  "[Error] collector->NeedCompact() %d != (%d >= %d)"
+                  " with kWindowSize = %d, kNumDeletionTrigger = %d\n",
+                  collector->NeedCompact(), deletions, kNumDeletionTrigger,
+                  kWindowSize, kNumDeletionTrigger);
+          ASSERT_TRUE(false);
+        }
+        ASSERT_OK(collector->Finish(nullptr));
+      }
+    }
+
+    // TEST 3:  Issues a lots of deletes, but their density is not
+    // high enough to trigger compaction.
+    {
+      std::unique_ptr<TablePropertiesCollector> collector;
+      auto factory = NewCompactOnDeletionCollectorFactory(kWindowSize,
+                                                          kNumDeletionTrigger);
+      collector.reset(factory->CreateTablePropertiesCollector(context));
+      assert(collector->NeedCompact() == false);
+      // Insert "kNumDeletionTrigger * 0.95" deletions for every
+      // "kWindowSize" and verify compaction is not needed.
+      const int kDeletionsPerSection = kNumDeletionTrigger * 95 / 100;
+      if (kDeletionsPerSection >= 0) {
+        for (int section = 0; section < 200; ++section) {
+          for (int i = 0; i < kPaddedWindowSize; ++i) {
+            if (i < kDeletionsPerSection) {
+              ASSERT_OK(collector->AddUserKey("hello", "rocksdb", kEntryDelete,
+                                              0, 0));
+            } else {
+              ASSERT_OK(
+                  collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0));
+            }
+          }
+        }
+        if (collector->NeedCompact() &&
+            std::abs(kDeletionsPerSection - kNumDeletionTrigger) > kBias) {
+          fprintf(stderr,
+                  "[Error] collector->NeedCompact() != false"
+                  " with kWindowSize = %d and kNumDeletionTrigger = %d\n",
+                  kWindowSize, kNumDeletionTrigger);
+          ASSERT_TRUE(false);
+        }
+        ASSERT_OK(collector->Finish(nullptr));
+      }
+    }
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+#else
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as RocksDBLite does not include utilities.\n");
+  return 0;
+}
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/trace/file_trace_reader_writer.cc b/src/rocksdb/utilities/trace/file_trace_reader_writer.cc
new file mode 100644
index 000000000..5886d3539
--- /dev/null
+++ b/src/rocksdb/utilities/trace/file_trace_reader_writer.cc
@@ -0,0 +1,133 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/trace/file_trace_reader_writer.h"
+
+#include "env/composite_env_wrapper.h"
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "trace_replay/trace_replay.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const unsigned int FileTraceReader::kBufferSize = 1024;  // 1KB
+
+FileTraceReader::FileTraceReader(
+    std::unique_ptr<RandomAccessFileReader>&& reader)
+    : file_reader_(std::move(reader)),
+      offset_(0),
+      buffer_(new char[kBufferSize]) {}
+
+FileTraceReader::~FileTraceReader() {
+  Close().PermitUncheckedError();
+  delete[] buffer_;
+}
+
+Status FileTraceReader::Close() {
+  file_reader_.reset();
+  return Status::OK();
+}
+
+Status FileTraceReader::Reset() {
+  if (file_reader_ == nullptr) {
+    return Status::IOError("TraceReader is closed.");
+  }
+  offset_ = 0;
+  return Status::OK();
+}
+
+Status FileTraceReader::Read(std::string* data) {
+  assert(file_reader_ != nullptr);
+  Status s = file_reader_->Read(IOOptions(), offset_, kTraceMetadataSize,
+                                &result_, buffer_, nullptr,
+                                Env::IO_TOTAL /* rate_limiter_priority */);
+  if (!s.ok()) {
+    return s;
+  }
+  if (result_.size() == 0) {
+    // No more data to read
+    // Todo: Come up with a better way to indicate end of data. May be this
+    // could be avoided once footer is introduced.
+    return Status::Incomplete();
+  }
+  if (result_.size() < kTraceMetadataSize) {
+    return Status::Corruption("Corrupted trace file.");
+  }
+  *data = result_.ToString();
+  offset_ += kTraceMetadataSize;
+
+  uint32_t payload_len =
+      DecodeFixed32(&buffer_[kTraceTimestampSize + kTraceTypeSize]);
+
+  // Read Payload
+  unsigned int bytes_to_read = payload_len;
+  unsigned int to_read =
+      bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read;
+  while (to_read > 0) {
+    s = file_reader_->Read(IOOptions(), offset_, to_read, &result_, buffer_,
+                           nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+    if (!s.ok()) {
+      return s;
+    }
+    if (result_.size() < to_read) {
+      return Status::Corruption("Corrupted trace file.");
+    }
+    data->append(result_.data(), result_.size());
+
+    offset_ += to_read;
+    bytes_to_read -= to_read;
+    to_read = bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read;
+  }
+
+  return s;
+}
+
+FileTraceWriter::FileTraceWriter(
+    std::unique_ptr<WritableFileWriter>&& file_writer)
+    : file_writer_(std::move(file_writer)) {}
+
+FileTraceWriter::~FileTraceWriter() { Close().PermitUncheckedError(); }
+
+Status FileTraceWriter::Close() {
+  file_writer_.reset();
+  return Status::OK();
+}
+
+Status FileTraceWriter::Write(const Slice& data) {
+  return file_writer_->Append(data);
+}
+
+uint64_t FileTraceWriter::GetFileSize() { return file_writer_->GetFileSize(); }
+
+Status NewFileTraceReader(Env* env, const EnvOptions& env_options,
+                          const std::string& trace_filename,
+                          std::unique_ptr<TraceReader>* trace_reader) {
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  Status s = RandomAccessFileReader::Create(
+      env->GetFileSystem(), trace_filename, FileOptions(env_options),
+      &file_reader, nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  trace_reader->reset(new FileTraceReader(std::move(file_reader)));
+  return s;
+}
+
+Status NewFileTraceWriter(Env* env, const EnvOptions& env_options,
+                          const std::string& trace_filename,
+                          std::unique_ptr<TraceWriter>* trace_writer) {
+  std::unique_ptr<WritableFileWriter> file_writer;
+  Status s = WritableFileWriter::Create(env->GetFileSystem(), trace_filename,
+                                        FileOptions(env_options), &file_writer,
+                                        nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  trace_writer->reset(new FileTraceWriter(std::move(file_writer)));
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/trace/file_trace_reader_writer.h b/src/rocksdb/utilities/trace/file_trace_reader_writer.h
new file mode 100644
index 000000000..65d483108
--- /dev/null
+++ b/src/rocksdb/utilities/trace/file_trace_reader_writer.h
@@ -0,0 +1,48 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/trace_reader_writer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReader;
+class WritableFileWriter;
+
+// FileTraceReader allows reading RocksDB traces from a file.
+class FileTraceReader : public TraceReader {
+ public:
+  explicit FileTraceReader(std::unique_ptr<RandomAccessFileReader>&& reader);
+  ~FileTraceReader();
+
+  virtual Status Read(std::string* data) override;
+  virtual Status Close() override;
+  virtual Status Reset() override;
+
+ private:
+  std::unique_ptr<RandomAccessFileReader> file_reader_;
+  Slice result_;
+  size_t offset_;
+  char* const buffer_;
+
+  static const unsigned int kBufferSize;
+};
+
+// FileTraceWriter allows writing RocksDB traces to a file.
+class FileTraceWriter : public TraceWriter {
+ public:
+  explicit FileTraceWriter(std::unique_ptr<WritableFileWriter>&& file_writer);
+  ~FileTraceWriter();
+
+  virtual Status Write(const Slice& data) override;
+  virtual Status Close() override;
+  virtual uint64_t GetFileSize() override;
+
+ private:
+  std::unique_ptr<WritableFileWriter> file_writer_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/trace/replayer_impl.cc b/src/rocksdb/utilities/trace/replayer_impl.cc
new file mode 100644
index 000000000..31023f1a2
--- /dev/null
+++ b/src/rocksdb/utilities/trace/replayer_impl.cc
@@ -0,0 +1,316 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/trace/replayer_impl.h"
+
+#include <cmath>
+#include <thread>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/system_clock.h"
+#include "util/threadpool_imp.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ReplayerImpl::ReplayerImpl(DB* db,
+                           const std::vector<ColumnFamilyHandle*>& handles,
+                           std::unique_ptr<TraceReader>&& reader)
+    : Replayer(),
+      trace_reader_(std::move(reader)),
+      prepared_(false),
+      trace_end_(false),
+      header_ts_(0),
+      exec_handler_(TraceRecord::NewExecutionHandler(db, handles)),
+      env_(db->GetEnv()),
+      trace_file_version_(-1) {}
+
+ReplayerImpl::~ReplayerImpl() {
+  exec_handler_.reset();
+  trace_reader_.reset();
+}
+
+Status ReplayerImpl::Prepare() {
+  Trace header;
+  int db_version;
+  Status s = ReadHeader(&header);
+  if (!s.ok()) {
+    return s;
+  }
+  s = TracerHelper::ParseTraceHeader(header, &trace_file_version_, &db_version);
+  if (!s.ok()) {
+    return s;
+  }
+  header_ts_ = header.ts;
+  prepared_ = true;
+  trace_end_ = false;
+  return Status::OK();
+}
+
+Status ReplayerImpl::Next(std::unique_ptr<TraceRecord>* record) {
+  if (!prepared_) {
+    return Status::Incomplete("Not prepared!");
+  }
+  if (trace_end_) {
+    return Status::Incomplete("Trace end.");
+  }
+
+  Trace trace;
+  Status s = ReadTrace(&trace);  // ReadTrace is atomic
+  // Reached the trace end.
+  if (s.ok() && trace.type == kTraceEnd) {
+    trace_end_ = true;
+    return Status::Incomplete("Trace end.");
+  }
+  if (!s.ok() || record == nullptr) {
+    return s;
+  }
+
+  return TracerHelper::DecodeTraceRecord(&trace, trace_file_version_, record);
+}
+
+Status ReplayerImpl::Execute(const std::unique_ptr<TraceRecord>& record,
+                             std::unique_ptr<TraceRecordResult>* result) {
+  return record->Accept(exec_handler_.get(), result);
+}
+
+Status ReplayerImpl::Replay(
+    const ReplayOptions& options,
+    const std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)>&
+        result_callback) {
+  if (options.fast_forward <= 0.0) {
+    return Status::InvalidArgument("Wrong fast forward speed!");
+  }
+
+  if (!prepared_) {
+    return Status::Incomplete("Not prepared!");
+  }
+  if (trace_end_) {
+    return Status::Incomplete("Trace end.");
+  }
+
+  Status s = Status::OK();
+
+  if (options.num_threads <= 1) {
+    // num_threads == 0 or num_threads == 1 uses single thread.
+    std::chrono::system_clock::time_point replay_epoch =
+        std::chrono::system_clock::now();
+
+    while (s.ok()) {
+      Trace trace;
+      s = ReadTrace(&trace);
+      // If already at trace end, ReadTrace should return Status::Incomplete().
+      if (!s.ok()) {
+        break;
+      }
+
+      // No need to sleep before breaking the loop if at the trace end.
+      if (trace.type == kTraceEnd) {
+        trace_end_ = true;
+        s = Status::Incomplete("Trace end.");
+        break;
+      }
+
+      // In single-threaded replay, decode first then sleep.
+      std::unique_ptr<TraceRecord> record;
+      s = TracerHelper::DecodeTraceRecord(&trace, trace_file_version_, &record);
+      if (!s.ok() && !s.IsNotSupported()) {
+        break;
+      }
+
+      std::chrono::system_clock::time_point sleep_to =
+          replay_epoch +
+          std::chrono::microseconds(static_cast<uint64_t>(std::llround(
+              1.0 * (trace.ts - header_ts_) / options.fast_forward)));
+      if (sleep_to > std::chrono::system_clock::now()) {
+        std::this_thread::sleep_until(sleep_to);
+      }
+
+      // Skip unsupported traces, stop for other errors.
+      if (s.IsNotSupported()) {
+        if (result_callback != nullptr) {
+          result_callback(s, nullptr);
+        }
+        s = Status::OK();
+        continue;
+      }
+
+      if (result_callback == nullptr) {
+        s = Execute(record, nullptr);
+      } else {
+        std::unique_ptr<TraceRecordResult> res;
+        s = Execute(record, &res);
+        result_callback(s, std::move(res));
+      }
+    }
+  } else {
+    // Multi-threaded replay.
+    ThreadPoolImpl thread_pool;
+    thread_pool.SetHostEnv(env_);
+    thread_pool.SetBackgroundThreads(static_cast<int>(options.num_threads));
+
+    std::mutex mtx;
+    // Background decoding and execution status.
+    Status bg_s = Status::OK();
+    uint64_t last_err_ts = static_cast<uint64_t>(-1);
+    // Callback function used in background work to update bg_s for the ealiest
+    // TraceRecord which has execution error. This is different from the
+    // timestamp of the first execution error (either start or end timestamp).
+    //
+    // Suppose TraceRecord R1, R2, with timestamps T1 < T2. Their execution
+    // timestamps are T1_start, T1_end, T2_start, T2_end.
+    // Single-thread: there must be T1_start < T1_end < T2_start < T2_end.
+    // Multi-thread: T1_start < T2_start may not be enforced. Orders of them are
+    // totally unknown.
+    // In order to report the same `first` error in both single-thread and
+    // multi-thread replay, we can only rely on the TraceRecords' timestamps,
+    // rather than their executin timestamps. Although in single-thread replay,
+    // the first error is also the last error, while in multi-thread replay, the
+    // first error may not be the first error in execution, and it may not be
+    // the last error in exeution as well.
+    auto error_cb = [&mtx, &bg_s, &last_err_ts](Status err, uint64_t err_ts) {
+      std::lock_guard<std::mutex> gd(mtx);
+      // Only record the first error.
+      if (!err.ok() && !err.IsNotSupported() && err_ts < last_err_ts) {
+        bg_s = err;
+        last_err_ts = err_ts;
+      }
+    };
+
+    std::chrono::system_clock::time_point replay_epoch =
+        std::chrono::system_clock::now();
+
+    while (bg_s.ok() && s.ok()) {
+      Trace trace;
+      s = ReadTrace(&trace);
+      // If already at trace end, ReadTrace should return Status::Incomplete().
+      if (!s.ok()) {
+        break;
+      }
+
+      TraceType trace_type = trace.type;
+
+      // No need to sleep before breaking the loop if at the trace end.
+      if (trace_type == kTraceEnd) {
+        trace_end_ = true;
+        s = Status::Incomplete("Trace end.");
+        break;
+      }
+
+      // In multi-threaded replay, sleep first then start decoding and
+      // execution in a thread.
+      std::chrono::system_clock::time_point sleep_to =
+          replay_epoch +
+          std::chrono::microseconds(static_cast<uint64_t>(std::llround(
+              1.0 * (trace.ts - header_ts_) / options.fast_forward)));
+      if (sleep_to > std::chrono::system_clock::now()) {
+        std::this_thread::sleep_until(sleep_to);
+      }
+
+      if (trace_type == kTraceWrite || trace_type == kTraceGet ||
+          trace_type == kTraceIteratorSeek ||
+          trace_type == kTraceIteratorSeekForPrev ||
+          trace_type == kTraceMultiGet) {
+        std::unique_ptr<ReplayerWorkerArg> ra(new ReplayerWorkerArg);
+        ra->trace_entry = std::move(trace);
+        ra->handler = exec_handler_.get();
+        ra->trace_file_version = trace_file_version_;
+        ra->error_cb = error_cb;
+        ra->result_cb = result_callback;
+        thread_pool.Schedule(&ReplayerImpl::BackgroundWork, ra.release(),
+                             nullptr, nullptr);
+      } else {
+        // Skip unsupported traces.
+        if (result_callback != nullptr) {
+          result_callback(Status::NotSupported("Unsupported trace type."),
+                          nullptr);
+        }
+      }
+    }
+
+    thread_pool.WaitForJobsAndJoinAllThreads();
+    if (!bg_s.ok()) {
+      s = bg_s;
+    }
+  }
+
+  if (s.IsIncomplete()) {
+    // Reaching eof returns Incomplete status at the moment.
+    // Could happen when killing a process without calling EndTrace() API.
+    // TODO: Add better error handling.
+    trace_end_ = true;
+    return Status::OK();
+  }
+  return s;
+}
+
+uint64_t ReplayerImpl::GetHeaderTimestamp() const { return header_ts_; }
+
+Status ReplayerImpl::ReadHeader(Trace* header) {
+  assert(header != nullptr);
+  Status s = trace_reader_->Reset();
+  if (!s.ok()) {
+    return s;
+  }
+  std::string encoded_trace;
+  // Read the trace head
+  s = trace_reader_->Read(&encoded_trace);
+  if (!s.ok()) {
+    return s;
+  }
+
+  return TracerHelper::DecodeHeader(encoded_trace, header);
+}
+
+Status ReplayerImpl::ReadTrace(Trace* trace) {
+  assert(trace != nullptr);
+  std::string encoded_trace;
+  // We don't know if TraceReader is implemented thread-safe, so we protect the
+  // reading trace part with a mutex. The decoding part does not need to be
+  // protected since it's local.
+  {
+    std::lock_guard<std::mutex> guard(mutex_);
+    Status s = trace_reader_->Read(&encoded_trace);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return TracerHelper::DecodeTrace(encoded_trace, trace);
+}
+
+void ReplayerImpl::BackgroundWork(void* arg) {
+  std::unique_ptr<ReplayerWorkerArg> ra(
+      reinterpret_cast<ReplayerWorkerArg*>(arg));
+  assert(ra != nullptr);
+
+  std::unique_ptr<TraceRecord> record;
+  Status s = TracerHelper::DecodeTraceRecord(&(ra->trace_entry),
+                                             ra->trace_file_version, &record);
+  if (!s.ok()) {
+    // Stop the replay
+    if (ra->error_cb != nullptr) {
+      ra->error_cb(s, ra->trace_entry.ts);
+    }
+    // Report the result
+    if (ra->result_cb != nullptr) {
+      ra->result_cb(s, nullptr);
+    }
+    return;
+  }
+
+  if (ra->result_cb == nullptr) {
+    s = record->Accept(ra->handler, nullptr);
+  } else {
+    std::unique_ptr<TraceRecordResult> res;
+    s = record->Accept(ra->handler, &res);
+    ra->result_cb(s, std::move(res));
+  }
+  record.reset();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/trace/replayer_impl.h b/src/rocksdb/utilities/trace/replayer_impl.h
new file mode 100644
index 000000000..367b0b51e
--- /dev/null
+++ b/src/rocksdb/utilities/trace/replayer_impl.h
@@ -0,0 +1,86 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/trace_reader_writer.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/trace_record_result.h"
+#include "rocksdb/utilities/replayer.h"
+#include "trace_replay/trace_replay.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ReplayerImpl : public Replayer {
+ public:
+  ReplayerImpl(DB* db, const std::vector<ColumnFamilyHandle*>& handles,
+               std::unique_ptr<TraceReader>&& reader);
+  ~ReplayerImpl() override;
+
+  using Replayer::Prepare;
+  Status Prepare() override;
+
+  using Replayer::Next;
+  Status Next(std::unique_ptr<TraceRecord>* record) override;
+
+  using Replayer::Execute;
+  Status Execute(const std::unique_ptr<TraceRecord>& record,
+                 std::unique_ptr<TraceRecordResult>* result) override;
+
+  using Replayer::Replay;
+  Status Replay(
+      const ReplayOptions& options,
+      const std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)>&
+          result_callback) override;
+
+  using Replayer::GetHeaderTimestamp;
+  uint64_t GetHeaderTimestamp() const override;
+
+ private:
+  Status ReadHeader(Trace* header);
+  Status ReadTrace(Trace* trace);
+
+  // Generic function to execute a Trace in a thread pool.
+  static void BackgroundWork(void* arg);
+
+  std::unique_ptr<TraceReader> trace_reader_;
+  std::mutex mutex_;
+  std::atomic<bool> prepared_;
+  std::atomic<bool> trace_end_;
+  uint64_t header_ts_;
+  std::unique_ptr<TraceRecord::Handler> exec_handler_;
+  Env* env_;
+  // When reading the trace header, the trace file version can be parsed.
+  // Replayer will use different decode method to get the trace content based
+  // on different trace file version.
+  int trace_file_version_;
+};
+
+// Arguments passed to BackgroundWork() for replaying in a thread pool.
+struct ReplayerWorkerArg {
+  Trace trace_entry;
+  int trace_file_version;
+  // Handler to execute TraceRecord.
+  TraceRecord::Handler* handler;
+  // Callback function to report the error status and the timestamp of the
+  // TraceRecord (not the start/end timestamp of executing the TraceRecord).
+  std::function<void(Status, uint64_t)> error_cb;
+  // Callback function to report the trace execution status and operation
+  // execution status/result(s).
+  std::function<void(Status, std::unique_ptr<TraceRecordResult>&&)> result_cb;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/lock_manager.cc b/src/rocksdb/utilities/transactions/lock/lock_manager.cc
new file mode 100644
index 000000000..df16b32ad
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/lock_manager.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/lock_manager.h"
+
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<LockManager> NewLockManager(PessimisticTransactionDB* db,
+                                            const TransactionDBOptions& opt) {
+  assert(db);
+  if (opt.lock_mgr_handle) {
+    // A custom lock manager was provided in options
+    auto mgr = opt.lock_mgr_handle->getLockManager();
+    return std::shared_ptr<LockManager>(opt.lock_mgr_handle, mgr);
+  } else {
+    // Use a point lock manager by default
+    return std::shared_ptr<LockManager>(new PointLockManager(db, opt));
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/lock_manager.h b/src/rocksdb/utilities/transactions/lock/lock_manager.h
new file mode 100644
index 000000000..a5ce1948c
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/lock_manager.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PessimisticTransactionDB;
+
+class LockManager {
+ public:
+  virtual ~LockManager() {}
+
+  // Whether supports locking a specific key.
+  virtual bool IsPointLockSupported() const = 0;
+
+  // Whether supports locking a range of keys.
+  virtual bool IsRangeLockSupported() const = 0;
+
+  // Locks acquired through this LockManager should be tracked by
+  // the LockTrackers created through the returned factory.
+  virtual const LockTrackerFactory& GetLockTrackerFactory() const = 0;
+
+  // Enable locking for the specified column family.
+  // Caller should guarantee that this column family is not already enabled.
+  virtual void AddColumnFamily(const ColumnFamilyHandle* cf) = 0;
+
+  // Disable locking for the specified column family.
+  // Caller should guarantee that this column family is no longer used.
+  virtual void RemoveColumnFamily(const ColumnFamilyHandle* cf) = 0;
+
+  // Attempt to lock a key or a key range.  If OK status is returned, the caller
+  // is responsible for calling UnLock() on this key.
+  virtual Status TryLock(PessimisticTransaction* txn,
+                         ColumnFamilyId column_family_id,
+                         const std::string& key, Env* env, bool exclusive) = 0;
+  // The range [start, end] are inclusive at both sides.
+  virtual Status TryLock(PessimisticTransaction* txn,
+                         ColumnFamilyId column_family_id, const Endpoint& start,
+                         const Endpoint& end, Env* env, bool exclusive) = 0;
+
+  // Unlock a key or a range locked by TryLock().  txn must be the same
+  // Transaction that locked this key.
+  virtual void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
+                      Env* env) = 0;
+  virtual void UnLock(PessimisticTransaction* txn,
+                      ColumnFamilyId column_family_id, const std::string& key,
+                      Env* env) = 0;
+  virtual void UnLock(PessimisticTransaction* txn,
+                      ColumnFamilyId column_family_id, const Endpoint& start,
+                      const Endpoint& end, Env* env) = 0;
+
+  using PointLockStatus = std::unordered_multimap<ColumnFamilyId, KeyLockInfo>;
+  virtual PointLockStatus GetPointLockStatus() = 0;
+
+  using RangeLockStatus =
+      std::unordered_multimap<ColumnFamilyId, RangeLockInfo>;
+  virtual RangeLockStatus GetRangeLockStatus() = 0;
+
+  virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0;
+
+  virtual void Resize(uint32_t new_size) = 0;
+};
+
+// LockManager should always be constructed through this factory method,
+// instead of constructing through concrete implementations' constructor.
+// Caller owns the returned pointer.
+std::shared_ptr<LockManager> NewLockManager(PessimisticTransactionDB* db,
+                                            const TransactionDBOptions& opt);
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/lock_tracker.h b/src/rocksdb/utilities/transactions/lock/lock_tracker.h
new file mode 100644
index 000000000..5fa228a82
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/lock_tracker.h
@@ -0,0 +1,209 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Request for locking a single key.
+struct PointLockRequest {
+  // The id of the key's column family.
+  ColumnFamilyId column_family_id = 0;
+  // The key to lock.
+  std::string key;
+  // The sequence number from which there is no concurrent update to key.
+  SequenceNumber seq = 0;
+  // Whether the lock is acquired only for read.
+  bool read_only = false;
+  // Whether the lock is in exclusive mode.
+  bool exclusive = true;
+};
+
+// Request for locking a range of keys.
+struct RangeLockRequest {
+  // The id of the key's column family.
+  ColumnFamilyId column_family_id;
+
+  // The range to be locked
+  Endpoint start_endp;
+  Endpoint end_endp;
+};
+
+struct PointLockStatus {
+  // Whether the key is locked.
+  bool locked = false;
+  // Whether the key is locked in exclusive mode.
+  bool exclusive = true;
+  // The sequence number in the tracked PointLockRequest.
+  SequenceNumber seq = 0;
+};
+
+// Return status when calling LockTracker::Untrack.
+enum class UntrackStatus {
+  // The lock is not tracked at all, so no lock to untrack.
+  NOT_TRACKED,
+  // The lock is untracked but not removed from the tracker.
+  UNTRACKED,
+  // The lock is removed from the tracker.
+  REMOVED,
+};
+
+// Tracks the lock requests.
+// In PessimisticTransaction, it tracks the locks acquired through LockMgr;
+// In OptimisticTransaction, since there is no LockMgr, it tracks the lock
+// intention. Not thread-safe.
+class LockTracker {
+ public:
+  virtual ~LockTracker() {}
+
+  // Whether supports locking a specific key.
+  virtual bool IsPointLockSupported() const = 0;
+
+  // Whether supports locking a range of keys.
+  virtual bool IsRangeLockSupported() const = 0;
+
+  // Tracks the acquirement of a lock on key.
+  //
+  // If this method is not supported, leave it as a no-op.
+  virtual void Track(const PointLockRequest& /*lock_request*/) = 0;
+
+  // Untracks the lock on a key.
+  // seq and exclusive in lock_request are not used.
+  //
+  // If this method is not supported, leave it as a no-op and
+  // returns NOT_TRACKED.
+  virtual UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) = 0;
+
+  // Counterpart of Track(const PointLockRequest&) for RangeLockRequest.
+  virtual void Track(const RangeLockRequest& /*lock_request*/) = 0;
+
+  // Counterpart of Untrack(const PointLockRequest&) for RangeLockRequest.
+  virtual UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) = 0;
+
+  // Merges lock requests tracked in the specified tracker into the current
+  // tracker.
+  //
+  // E.g. for point lock, if a key in tracker is not yet tracked,
+  // track this new key; otherwise, merge the tracked information of the key
+  // such as lock's exclusiveness, read/write statistics.
+  //
+  // If this method is not supported, leave it as a no-op.
+  //
+  // REQUIRED: the specified tracker must be of the same concrete class type as
+  // the current tracker.
+  virtual void Merge(const LockTracker& /*tracker*/) = 0;
+
+  // This is a reverse operation of Merge.
+  //
+  // E.g. for point lock, if a key exists in both current and the sepcified
+  // tracker, then subtract the information (such as read/write statistics) of
+  // the key in the specified tracker from the current tracker.
+  //
+  // If this method is not supported, leave it as a no-op.
+  //
+  // REQUIRED:
+  // The specified tracker must be of the same concrete class type as
+  // the current tracker.
+  // The tracked locks in the specified tracker must be a subset of those
+  // tracked by the current tracker.
+  virtual void Subtract(const LockTracker& /*tracker*/) = 0;
+
+  // Clears all tracked locks.
+  virtual void Clear() = 0;
+
+  // Gets the new locks (excluding the locks that have been tracked before the
+  // save point) tracked since the specified save point, the result is stored
+  // in an internally constructed LockTracker and returned.
+  //
+  // save_point_tracker is the tracker used by a SavePoint to track locks
+  // tracked after creating the SavePoint.
+  //
+  // The implementation should document whether point lock, or range lock, or
+  // both are considered in this method.
+  // If this method is not supported, returns nullptr.
+  //
+  // REQUIRED:
+  // The save_point_tracker must be of the same concrete class type as the
+  // current tracker.
+  // The tracked locks in the specified tracker must be a subset of those
+  // tracked by the current tracker.
+  virtual LockTracker* GetTrackedLocksSinceSavePoint(
+      const LockTracker& /*save_point_tracker*/) const = 0;
+
+  // Gets lock related information of the key.
+  //
+  // If point lock is not supported, always returns LockStatus with
+  // locked=false.
+  virtual PointLockStatus GetPointLockStatus(
+      ColumnFamilyId /*column_family_id*/,
+      const std::string& /*key*/) const = 0;
+
+  // Gets number of tracked point locks.
+  //
+  // If point lock is not supported, always returns 0.
+  virtual uint64_t GetNumPointLocks() const = 0;
+
+  class ColumnFamilyIterator {
+   public:
+    virtual ~ColumnFamilyIterator() {}
+
+    // Whether there are remaining column families.
+    virtual bool HasNext() const = 0;
+
+    // Gets next column family id.
+    //
+    // If HasNext is false, calling this method has undefined behavior.
+    virtual ColumnFamilyId Next() = 0;
+  };
+
+  // Gets an iterator for column families.
+  //
+  // Returned iterator must not be nullptr.
+  // If there is no column family to iterate,
+  // returns an empty non-null iterator.
+  // Caller owns the returned pointer.
+  virtual ColumnFamilyIterator* GetColumnFamilyIterator() const = 0;
+
+  class KeyIterator {
+   public:
+    virtual ~KeyIterator() {}
+
+    // Whether there are remaining keys.
+    virtual bool HasNext() const = 0;
+
+    // Gets the next key.
+    //
+    // If HasNext is false, calling this method has undefined behavior.
+    virtual const std::string& Next() = 0;
+  };
+
+  // Gets an iterator for keys with tracked point locks in the column family.
+  //
+  // The column family must exist.
+  // Returned iterator must not be nullptr.
+  // Caller owns the returned pointer.
+  virtual KeyIterator* GetKeyIterator(
+      ColumnFamilyId /*column_family_id*/) const = 0;
+};
+
+// LockTracker should always be constructed through this factory.
+// Each LockManager owns a LockTrackerFactory.
+class LockTrackerFactory {
+ public:
+  // Caller owns the returned pointer.
+  virtual LockTracker* Create() const = 0;
+  virtual ~LockTrackerFactory() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc
new file mode 100644
index 000000000..b362a164d
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc
@@ -0,0 +1,721 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <mutex>
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct LockInfo {
+  bool exclusive;
+  autovector<TransactionID> txn_ids;
+
+  // Transaction locks are not valid after this time in us
+  uint64_t expiration_time;
+
+  LockInfo(TransactionID id, uint64_t time, bool ex)
+      : exclusive(ex), expiration_time(time) {
+    txn_ids.push_back(id);
+  }
+  LockInfo(const LockInfo& lock_info)
+      : exclusive(lock_info.exclusive),
+        txn_ids(lock_info.txn_ids),
+        expiration_time(lock_info.expiration_time) {}
+  void operator=(const LockInfo& lock_info) {
+    exclusive = lock_info.exclusive;
+    txn_ids = lock_info.txn_ids;
+    expiration_time = lock_info.expiration_time;
+  }
+  DECLARE_DEFAULT_MOVES(LockInfo);
+};
+
+struct LockMapStripe {
+  explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory) {
+    stripe_mutex = factory->AllocateMutex();
+    stripe_cv = factory->AllocateCondVar();
+    assert(stripe_mutex);
+    assert(stripe_cv);
+  }
+
+  // Mutex must be held before modifying keys map
+  std::shared_ptr<TransactionDBMutex> stripe_mutex;
+
+  // Condition Variable per stripe for waiting on a lock
+  std::shared_ptr<TransactionDBCondVar> stripe_cv;
+
+  // Locked keys mapped to the info about the transactions that locked them.
+  // TODO(agiardullo): Explore performance of other data structures.
+  UnorderedMap<std::string, LockInfo> keys;
+};
+
+// Map of #num_stripes LockMapStripes
+struct LockMap {
+  explicit LockMap(size_t num_stripes,
+                   std::shared_ptr<TransactionDBMutexFactory> factory)
+      : num_stripes_(num_stripes) {
+    lock_map_stripes_.reserve(num_stripes);
+    for (size_t i = 0; i < num_stripes; i++) {
+      LockMapStripe* stripe = new LockMapStripe(factory);
+      lock_map_stripes_.push_back(stripe);
+    }
+  }
+
+  ~LockMap() {
+    for (auto stripe : lock_map_stripes_) {
+      delete stripe;
+    }
+  }
+
+  // Number of sepearate LockMapStripes to create, each with their own Mutex
+  const size_t num_stripes_;
+
+  // Count of keys that are currently locked in this column family.
+  // (Only maintained if PointLockManager::max_num_locks_ is positive.)
+  std::atomic<int64_t> lock_cnt{0};
+
+  std::vector<LockMapStripe*> lock_map_stripes_;
+
+  size_t GetStripe(const std::string& key) const;
+};
+
+namespace {
+void UnrefLockMapsCache(void* ptr) {
+  // Called when a thread exits or a ThreadLocalPtr gets destroyed.
+  auto lock_maps_cache =
+      static_cast<UnorderedMap<uint32_t, std::shared_ptr<LockMap>>*>(ptr);
+  delete lock_maps_cache;
+}
+}  // anonymous namespace
+
+PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db,
+                                   const TransactionDBOptions& opt)
+    : txn_db_impl_(txn_db),
+      default_num_stripes_(opt.num_stripes),
+      max_num_locks_(opt.max_num_locks),
+      lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)),
+      dlock_buffer_(opt.max_num_deadlocks),
+      mutex_factory_(opt.custom_mutex_factory
+                         ? opt.custom_mutex_factory
+                         : std::make_shared<TransactionDBMutexFactoryImpl>()) {}
+
+size_t LockMap::GetStripe(const std::string& key) const {
+  assert(num_stripes_ > 0);
+  return FastRange64(GetSliceNPHash64(key), num_stripes_);
+}
+
+void PointLockManager::AddColumnFamily(const ColumnFamilyHandle* cf) {
+  InstrumentedMutexLock l(&lock_map_mutex_);
+
+  if (lock_maps_.find(cf->GetID()) == lock_maps_.end()) {
+    lock_maps_.emplace(cf->GetID(), std::make_shared<LockMap>(
+                                        default_num_stripes_, mutex_factory_));
+  } else {
+    // column_family already exists in lock map
+    assert(false);
+  }
+}
+
+void PointLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cf) {
+  // Remove lock_map for this column family.  Since the lock map is stored
+  // as a shared ptr, concurrent transactions can still keep using it
+  // until they release their references to it.
+  {
+    InstrumentedMutexLock l(&lock_map_mutex_);
+
+    auto lock_maps_iter = lock_maps_.find(cf->GetID());
+    if (lock_maps_iter == lock_maps_.end()) {
+      return;
+    }
+
+    lock_maps_.erase(lock_maps_iter);
+  }  // lock_map_mutex_
+
+  // Clear all thread-local caches
+  autovector<void*> local_caches;
+  lock_maps_cache_->Scrape(&local_caches, nullptr);
+  for (auto cache : local_caches) {
+    delete static_cast<LockMaps*>(cache);
+  }
+}
+
+// Look up the LockMap std::shared_ptr for a given column_family_id.
+// Note:  The LockMap is only valid as long as the caller is still holding on
+//   to the returned std::shared_ptr.
+std::shared_ptr<LockMap> PointLockManager::GetLockMap(
+    ColumnFamilyId column_family_id) {
+  // First check thread-local cache
+  if (lock_maps_cache_->Get() == nullptr) {
+    lock_maps_cache_->Reset(new LockMaps());
+  }
+
+  auto lock_maps_cache = static_cast<LockMaps*>(lock_maps_cache_->Get());
+
+  auto lock_map_iter = lock_maps_cache->find(column_family_id);
+  if (lock_map_iter != lock_maps_cache->end()) {
+    // Found lock map for this column family.
+    return lock_map_iter->second;
+  }
+
+  // Not found in local cache, grab mutex and check shared LockMaps
+  InstrumentedMutexLock l(&lock_map_mutex_);
+
+  lock_map_iter = lock_maps_.find(column_family_id);
+  if (lock_map_iter == lock_maps_.end()) {
+    return std::shared_ptr<LockMap>(nullptr);
+  } else {
+    // Found lock map.  Store in thread-local cache and return.
+    std::shared_ptr<LockMap>& lock_map = lock_map_iter->second;
+    lock_maps_cache->insert({column_family_id, lock_map});
+
+    return lock_map;
+  }
+}
+
+// Returns true if this lock has expired and can be acquired by another
+// transaction.
+// If false, sets *expire_time to the expiration time of the lock according
+// to Env->GetMicros() or 0 if no expiration.
+bool PointLockManager::IsLockExpired(TransactionID txn_id,
+                                     const LockInfo& lock_info, Env* env,
+                                     uint64_t* expire_time) {
+  if (lock_info.expiration_time == 0) {
+    *expire_time = 0;
+    return false;
+  }
+
+  auto now = env->NowMicros();
+  bool expired = lock_info.expiration_time <= now;
+  if (!expired) {
+    // return how many microseconds until lock will be expired
+    *expire_time = lock_info.expiration_time;
+  } else {
+    for (auto id : lock_info.txn_ids) {
+      if (txn_id == id) {
+        continue;
+      }
+
+      bool success = txn_db_impl_->TryStealingExpiredTransactionLocks(id);
+      if (!success) {
+        expired = false;
+        *expire_time = 0;
+        break;
+      }
+    }
+  }
+
+  return expired;
+}
+
+Status PointLockManager::TryLock(PessimisticTransaction* txn,
+                                 ColumnFamilyId column_family_id,
+                                 const std::string& key, Env* env,
+                                 bool exclusive) {
+  // Lookup lock map for this column family id
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    char msg[255];
+    snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32,
+             column_family_id);
+
+    return Status::InvalidArgument(msg);
+  }
+
+  // Need to lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+  LockInfo lock_info(txn->GetID(), txn->GetExpirationTime(), exclusive);
+  int64_t timeout = txn->GetLockTimeout();
+
+  return AcquireWithTimeout(txn, lock_map, stripe, column_family_id, key, env,
+                            timeout, lock_info);
+}
+
+// Helper function for TryLock().
+Status PointLockManager::AcquireWithTimeout(
+    PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
+    ColumnFamilyId column_family_id, const std::string& key, Env* env,
+    int64_t timeout, const LockInfo& lock_info) {
+  Status result;
+  uint64_t end_time = 0;
+
+  if (timeout > 0) {
+    uint64_t start_time = env->NowMicros();
+    end_time = start_time + timeout;
+  }
+
+  if (timeout < 0) {
+    // If timeout is negative, we wait indefinitely to acquire the lock
+    result = stripe->stripe_mutex->Lock();
+  } else {
+    result = stripe->stripe_mutex->TryLockFor(timeout);
+  }
+
+  if (!result.ok()) {
+    // failed to acquire mutex
+    return result;
+  }
+
+  // Acquire lock if we are able to
+  uint64_t expire_time_hint = 0;
+  autovector<TransactionID> wait_ids;
+  result = AcquireLocked(lock_map, stripe, key, env, lock_info,
+                         &expire_time_hint, &wait_ids);
+
+  if (!result.ok() && timeout != 0) {
+    PERF_TIMER_GUARD(key_lock_wait_time);
+    PERF_COUNTER_ADD(key_lock_wait_count, 1);
+    // If we weren't able to acquire the lock, we will keep retrying as long
+    // as the timeout allows.
+    bool timed_out = false;
+    do {
+      // Decide how long to wait
+      int64_t cv_end_time = -1;
+      if (expire_time_hint > 0 && end_time > 0) {
+        cv_end_time = std::min(expire_time_hint, end_time);
+      } else if (expire_time_hint > 0) {
+        cv_end_time = expire_time_hint;
+      } else if (end_time > 0) {
+        cv_end_time = end_time;
+      }
+
+      assert(result.IsBusy() || wait_ids.size() != 0);
+
+      // We are dependent on a transaction to finish, so perform deadlock
+      // detection.
+      if (wait_ids.size() != 0) {
+        if (txn->IsDeadlockDetect()) {
+          if (IncrementWaiters(txn, wait_ids, key, column_family_id,
+                               lock_info.exclusive, env)) {
+            result = Status::Busy(Status::SubCode::kDeadlock);
+            stripe->stripe_mutex->UnLock();
+            return result;
+          }
+        }
+        txn->SetWaitingTxn(wait_ids, column_family_id, &key);
+      }
+
+      TEST_SYNC_POINT("PointLockManager::AcquireWithTimeout:WaitingTxn");
+      if (cv_end_time < 0) {
+        // Wait indefinitely
+        result = stripe->stripe_cv->Wait(stripe->stripe_mutex);
+      } else {
+        uint64_t now = env->NowMicros();
+        if (static_cast<uint64_t>(cv_end_time) > now) {
+          result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex,
+                                              cv_end_time - now);
+        }
+      }
+
+      if (wait_ids.size() != 0) {
+        txn->ClearWaitingTxn();
+        if (txn->IsDeadlockDetect()) {
+          DecrementWaiters(txn, wait_ids);
+        }
+      }
+
+      if (result.IsTimedOut()) {
+        timed_out = true;
+        // Even though we timed out, we will still make one more attempt to
+        // acquire lock below (it is possible the lock expired and we
+        // were never signaled).
+      }
+
+      if (result.ok() || result.IsTimedOut()) {
+        result = AcquireLocked(lock_map, stripe, key, env, lock_info,
+                               &expire_time_hint, &wait_ids);
+      }
+    } while (!result.ok() && !timed_out);
+  }
+
+  stripe->stripe_mutex->UnLock();
+
+  return result;
+}
+
+void PointLockManager::DecrementWaiters(
+    const PessimisticTransaction* txn,
+    const autovector<TransactionID>& wait_ids) {
+  std::lock_guard<std::mutex> lock(wait_txn_map_mutex_);
+  DecrementWaitersImpl(txn, wait_ids);
+}
+
+void PointLockManager::DecrementWaitersImpl(
+    const PessimisticTransaction* txn,
+    const autovector<TransactionID>& wait_ids) {
+  auto id = txn->GetID();
+  assert(wait_txn_map_.Contains(id));
+  wait_txn_map_.Delete(id);
+
+  for (auto wait_id : wait_ids) {
+    rev_wait_txn_map_.Get(wait_id)--;
+    if (rev_wait_txn_map_.Get(wait_id) == 0) {
+      rev_wait_txn_map_.Delete(wait_id);
+    }
+  }
+}
+
+bool PointLockManager::IncrementWaiters(
+    const PessimisticTransaction* txn,
+    const autovector<TransactionID>& wait_ids, const std::string& key,
+    const uint32_t& cf_id, const bool& exclusive, Env* const env) {
+  auto id = txn->GetID();
+  std::vector<int> queue_parents(
+      static_cast<size_t>(txn->GetDeadlockDetectDepth()));
+  std::vector<TransactionID> queue_values(
+      static_cast<size_t>(txn->GetDeadlockDetectDepth()));
+  std::lock_guard<std::mutex> lock(wait_txn_map_mutex_);
+  assert(!wait_txn_map_.Contains(id));
+
+  wait_txn_map_.Insert(id, {wait_ids, cf_id, exclusive, key});
+
+  for (auto wait_id : wait_ids) {
+    if (rev_wait_txn_map_.Contains(wait_id)) {
+      rev_wait_txn_map_.Get(wait_id)++;
+    } else {
+      rev_wait_txn_map_.Insert(wait_id, 1);
+    }
+  }
+
+  // No deadlock if nobody is waiting on self.
+  if (!rev_wait_txn_map_.Contains(id)) {
+    return false;
+  }
+
+  const auto* next_ids = &wait_ids;
+  int parent = -1;
+  int64_t deadlock_time = 0;
+  for (int tail = 0, head = 0; head < txn->GetDeadlockDetectDepth(); head++) {
+    int i = 0;
+    if (next_ids) {
+      for (; i < static_cast<int>(next_ids->size()) &&
+             tail + i < txn->GetDeadlockDetectDepth();
+           i++) {
+        queue_values[tail + i] = (*next_ids)[i];
+        queue_parents[tail + i] = parent;
+      }
+      tail += i;
+    }
+
+    // No more items in the list, meaning no deadlock.
+    if (tail == head) {
+      return false;
+    }
+
+    auto next = queue_values[head];
+    if (next == id) {
+      std::vector<DeadlockInfo> path;
+      while (head != -1) {
+        assert(wait_txn_map_.Contains(queue_values[head]));
+
+        auto extracted_info = wait_txn_map_.Get(queue_values[head]);
+        path.push_back({queue_values[head], extracted_info.m_cf_id,
+                        extracted_info.m_exclusive,
+                        extracted_info.m_waiting_key});
+        head = queue_parents[head];
+      }
+      if (!env->GetCurrentTime(&deadlock_time).ok()) {
+        /*
+          TODO(AR) this preserves the current behaviour whilst checking the
+          status of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED
+          passes. Should we instead raise an error if !ok() ?
+        */
+        deadlock_time = 0;
+      }
+      std::reverse(path.begin(), path.end());
+      dlock_buffer_.AddNewPath(DeadlockPath(path, deadlock_time));
+      deadlock_time = 0;
+      DecrementWaitersImpl(txn, wait_ids);
+      return true;
+    } else if (!wait_txn_map_.Contains(next)) {
+      next_ids = nullptr;
+      continue;
+    } else {
+      parent = head;
+      next_ids = &(wait_txn_map_.Get(next).m_neighbors);
+    }
+  }
+
+  // Wait cycle too big, just assume deadlock.
+  if (!env->GetCurrentTime(&deadlock_time).ok()) {
+    /*
+      TODO(AR) this preserves the current behaviour whilst checking the status
+      of env->GetCurrentTime to ensure that ASSERT_STATUS_CHECKED passes.
+      Should we instead raise an error if !ok() ?
+    */
+    deadlock_time = 0;
+  }
+  dlock_buffer_.AddNewPath(DeadlockPath(deadlock_time, true));
+  DecrementWaitersImpl(txn, wait_ids);
+  return true;
+}
+
+// Try to lock this key after we have acquired the mutex.
+// Sets *expire_time to the expiration time in microseconds
+//  or 0 if no expiration.
+// REQUIRED:  Stripe mutex must be held.
+Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                                       const std::string& key, Env* env,
+                                       const LockInfo& txn_lock_info,
+                                       uint64_t* expire_time,
+                                       autovector<TransactionID>* txn_ids) {
+  assert(txn_lock_info.txn_ids.size() == 1);
+
+  Status result;
+  // Check if this key is already locked
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    // Lock already held
+    LockInfo& lock_info = stripe_iter->second;
+    assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive);
+
+    if (lock_info.exclusive || txn_lock_info.exclusive) {
+      if (lock_info.txn_ids.size() == 1 &&
+          lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) {
+        // The list contains one txn and we're it, so just take it.
+        lock_info.exclusive = txn_lock_info.exclusive;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+      } else {
+        // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case
+        // it's there for a shared lock with multiple holders which was not
+        // caught in the first case.
+        if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env,
+                          expire_time)) {
+          // lock is expired, can steal it
+          lock_info.txn_ids = txn_lock_info.txn_ids;
+          lock_info.exclusive = txn_lock_info.exclusive;
+          lock_info.expiration_time = txn_lock_info.expiration_time;
+          // lock_cnt does not change
+        } else {
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
+          *txn_ids = lock_info.txn_ids;
+        }
+      }
+    } else {
+      // We are requesting shared access to a shared lock, so just grant it.
+      lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]);
+      // Using std::max means that expiration time never goes down even when
+      // a transaction is removed from the list. The correct solution would be
+      // to track expiry for every transaction, but this would also work for
+      // now.
+      lock_info.expiration_time =
+          std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
+    }
+  } else {  // Lock not held.
+    // Check lock limit
+    if (max_num_locks_ > 0 &&
+        lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
+      result = Status::Busy(Status::SubCode::kLockLimit);
+    } else {
+      // acquire lock
+      stripe->keys.emplace(key, txn_lock_info);
+
+      // Maintain lock count if there is a limit on the number of locks
+      if (max_num_locks_) {
+        lock_map->lock_cnt++;
+      }
+    }
+  }
+
+  return result;
+}
+
+void PointLockManager::UnLockKey(PessimisticTransaction* txn,
+                                 const std::string& key, LockMapStripe* stripe,
+                                 LockMap* lock_map, Env* env) {
+#ifdef NDEBUG
+  (void)env;
+#endif
+  TransactionID txn_id = txn->GetID();
+
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    auto& txns = stripe_iter->second.txn_ids;
+    auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
+    // Found the key we locked.  unlock it.
+    if (txn_it != txns.end()) {
+      if (txns.size() == 1) {
+        stripe->keys.erase(stripe_iter);
+      } else {
+        auto last_it = txns.end() - 1;
+        if (txn_it != last_it) {
+          *txn_it = *last_it;
+        }
+        txns.pop_back();
+      }
+
+      if (max_num_locks_ > 0) {
+        // Maintain lock count if there is a limit on the number of locks.
+        assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
+        lock_map->lock_cnt--;
+      }
+    }
+  } else {
+    // This key is either not locked or locked by someone else.  This should
+    // only happen if the unlocking transaction has expired.
+    assert(txn->GetExpirationTime() > 0 &&
+           txn->GetExpirationTime() < env->NowMicros());
+  }
+}
+
+void PointLockManager::UnLock(PessimisticTransaction* txn,
+                              ColumnFamilyId column_family_id,
+                              const std::string& key, Env* env) {
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    // Column Family must have been dropped.
+    return;
+  }
+
+  // Lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+  stripe->stripe_mutex->Lock().PermitUncheckedError();
+  UnLockKey(txn, key, stripe, lock_map, env);
+  stripe->stripe_mutex->UnLock();
+
+  // Signal waiting threads to retry locking
+  stripe->stripe_cv->NotifyAll();
+}
+
+void PointLockManager::UnLock(PessimisticTransaction* txn,
+                              const LockTracker& tracker, Env* env) {
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      tracker.GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+    std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(cf);
+    LockMap* lock_map = lock_map_ptr.get();
+    if (!lock_map) {
+      // Column Family must have been dropped.
+      return;
+    }
+
+    // Bucket keys by lock_map_ stripe
+    UnorderedMap<size_t, std::vector<const std::string*>> keys_by_stripe(
+        lock_map->num_stripes_);
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        tracker.GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      size_t stripe_num = lock_map->GetStripe(key);
+      keys_by_stripe[stripe_num].push_back(&key);
+    }
+
+    // For each stripe, grab the stripe mutex and unlock all keys in this stripe
+    for (auto& stripe_iter : keys_by_stripe) {
+      size_t stripe_num = stripe_iter.first;
+      auto& stripe_keys = stripe_iter.second;
+
+      assert(lock_map->lock_map_stripes_.size() > stripe_num);
+      LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+      stripe->stripe_mutex->Lock().PermitUncheckedError();
+
+      for (const std::string* key : stripe_keys) {
+        UnLockKey(txn, *key, stripe, lock_map, env);
+      }
+
+      stripe->stripe_mutex->UnLock();
+
+      // Signal waiting threads to retry locking
+      stripe->stripe_cv->NotifyAll();
+    }
+  }
+}
+
+PointLockManager::PointLockStatus PointLockManager::GetPointLockStatus() {
+  PointLockStatus data;
+  // Lock order here is important. The correct order is lock_map_mutex_, then
+  // for every column family ID in ascending order lock every stripe in
+  // ascending order.
+  InstrumentedMutexLock l(&lock_map_mutex_);
+
+  std::vector<uint32_t> cf_ids;
+  for (const auto& map : lock_maps_) {
+    cf_ids.push_back(map.first);
+  }
+  std::sort(cf_ids.begin(), cf_ids.end());
+
+  for (auto i : cf_ids) {
+    const auto& stripes = lock_maps_[i]->lock_map_stripes_;
+    // Iterate and lock all stripes in ascending order.
+    for (const auto& j : stripes) {
+      j->stripe_mutex->Lock().PermitUncheckedError();
+      for (const auto& it : j->keys) {
+        struct KeyLockInfo info;
+        info.exclusive = it.second.exclusive;
+        info.key = it.first;
+        for (const auto& id : it.second.txn_ids) {
+          info.ids.push_back(id);
+        }
+        data.insert({i, info});
+      }
+    }
+  }
+
+  // Unlock everything. Unlocking order is not important.
+  for (auto i : cf_ids) {
+    const auto& stripes = lock_maps_[i]->lock_map_stripes_;
+    for (const auto& j : stripes) {
+      j->stripe_mutex->UnLock();
+    }
+  }
+
+  return data;
+}
+
+std::vector<DeadlockPath> PointLockManager::GetDeadlockInfoBuffer() {
+  return dlock_buffer_.PrepareBuffer();
+}
+
+void PointLockManager::Resize(uint32_t target_size) {
+  dlock_buffer_.Resize(target_size);
+}
+
+PointLockManager::RangeLockStatus PointLockManager::GetRangeLockStatus() {
+  return {};
+}
+
+Status PointLockManager::TryLock(PessimisticTransaction* /* txn */,
+                                 ColumnFamilyId /* cf_id */,
+                                 const Endpoint& /* start */,
+                                 const Endpoint& /* end */, Env* /* env */,
+                                 bool /* exclusive */) {
+  return Status::NotSupported(
+      "PointLockManager does not support range locking");
+}
+
+void PointLockManager::UnLock(PessimisticTransaction* /* txn */,
+                              ColumnFamilyId /* cf_id */,
+                              const Endpoint& /* start */,
+                              const Endpoint& /* end */, Env* /* env */) {
+  // no-op
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h
new file mode 100644
index 000000000..eeb34f3be
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager.h
@@ -0,0 +1,224 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/utilities/transaction.h"
+#include "util/autovector.h"
+#include "util/hash_containers.h"
+#include "util/hash_map.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/lock/lock_manager.h"
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyHandle;
+struct LockInfo;
+struct LockMap;
+struct LockMapStripe;
+
+template <class Path>
+class DeadlockInfoBufferTempl {
+ private:
+  std::vector<Path> paths_buffer_;
+  uint32_t buffer_idx_;
+  std::mutex paths_buffer_mutex_;
+
+  std::vector<Path> Normalize() {
+    auto working = paths_buffer_;
+
+    if (working.empty()) {
+      return working;
+    }
+
+    // Next write occurs at a nonexistent path's slot
+    if (paths_buffer_[buffer_idx_].empty()) {
+      working.resize(buffer_idx_);
+    } else {
+      std::rotate(working.begin(), working.begin() + buffer_idx_,
+                  working.end());
+    }
+
+    return working;
+  }
+
+ public:
+  explicit DeadlockInfoBufferTempl(uint32_t n_latest_dlocks)
+      : paths_buffer_(n_latest_dlocks), buffer_idx_(0) {}
+
+  void AddNewPath(Path path) {
+    std::lock_guard<std::mutex> lock(paths_buffer_mutex_);
+
+    if (paths_buffer_.empty()) {
+      return;
+    }
+
+    paths_buffer_[buffer_idx_] = std::move(path);
+    buffer_idx_ = (buffer_idx_ + 1) % paths_buffer_.size();
+  }
+
+  void Resize(uint32_t target_size) {
+    std::lock_guard<std::mutex> lock(paths_buffer_mutex_);
+
+    paths_buffer_ = Normalize();
+
+    // Drop the deadlocks that will no longer be needed ater the normalize
+    if (target_size < paths_buffer_.size()) {
+      paths_buffer_.erase(
+          paths_buffer_.begin(),
+          paths_buffer_.begin() + (paths_buffer_.size() - target_size));
+      buffer_idx_ = 0;
+    }
+    // Resize the buffer to the target size and restore the buffer's idx
+    else {
+      auto prev_size = paths_buffer_.size();
+      paths_buffer_.resize(target_size);
+      buffer_idx_ = (uint32_t)prev_size;
+    }
+  }
+
+  std::vector<Path> PrepareBuffer() {
+    std::lock_guard<std::mutex> lock(paths_buffer_mutex_);
+
+    // Reversing the normalized vector returns the latest deadlocks first
+    auto working = Normalize();
+    std::reverse(working.begin(), working.end());
+
+    return working;
+  }
+};
+
+using DeadlockInfoBuffer = DeadlockInfoBufferTempl<DeadlockPath>;
+
+struct TrackedTrxInfo {
+  autovector<TransactionID> m_neighbors;
+  uint32_t m_cf_id;
+  bool m_exclusive;
+  std::string m_waiting_key;
+};
+
+class PointLockManager : public LockManager {
+ public:
+  PointLockManager(PessimisticTransactionDB* db,
+                   const TransactionDBOptions& opt);
+  // No copying allowed
+  PointLockManager(const PointLockManager&) = delete;
+  PointLockManager& operator=(const PointLockManager&) = delete;
+
+  ~PointLockManager() override {}
+
+  bool IsPointLockSupported() const override { return true; }
+
+  bool IsRangeLockSupported() const override { return false; }
+
+  const LockTrackerFactory& GetLockTrackerFactory() const override {
+    return PointLockTrackerFactory::Get();
+  }
+
+  // Creates a new LockMap for this column family.  Caller should guarantee
+  // that this column family does not already exist.
+  void AddColumnFamily(const ColumnFamilyHandle* cf) override;
+  // Deletes the LockMap for this column family.  Caller should guarantee that
+  // this column family is no longer in use.
+  void RemoveColumnFamily(const ColumnFamilyHandle* cf) override;
+
+  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+                 const std::string& key, Env* env, bool exclusive) override;
+  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+                 const Endpoint& start, const Endpoint& end, Env* env,
+                 bool exclusive) override;
+
+  void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
+              Env* env) override;
+  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+              const std::string& key, Env* env) override;
+  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+              const Endpoint& start, const Endpoint& end, Env* env) override;
+
+  PointLockStatus GetPointLockStatus() override;
+
+  RangeLockStatus GetRangeLockStatus() override;
+
+  std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
+
+  void Resize(uint32_t new_size) override;
+
+ private:
+  PessimisticTransactionDB* txn_db_impl_;
+
+  // Default number of lock map stripes per column family
+  const size_t default_num_stripes_;
+
+  // Limit on number of keys locked per column family
+  const int64_t max_num_locks_;
+
+  // The following lock order must be satisfied in order to avoid deadlocking
+  // ourselves.
+  //   - lock_map_mutex_
+  //   - stripe mutexes in ascending cf id, ascending stripe order
+  //   - wait_txn_map_mutex_
+  //
+  // Must be held when accessing/modifying lock_maps_.
+  InstrumentedMutex lock_map_mutex_;
+
+  // Map of ColumnFamilyId to locked key info
+  using LockMaps = UnorderedMap<uint32_t, std::shared_ptr<LockMap>>;
+  LockMaps lock_maps_;
+
+  // Thread-local cache of entries in lock_maps_.  This is an optimization
+  // to avoid acquiring a mutex in order to look up a LockMap
+  std::unique_ptr<ThreadLocalPtr> lock_maps_cache_;
+
+  // Must be held when modifying wait_txn_map_ and rev_wait_txn_map_.
+  std::mutex wait_txn_map_mutex_;
+
+  // Maps from waitee -> number of waiters.
+  HashMap<TransactionID, int> rev_wait_txn_map_;
+  // Maps from waiter -> waitee.
+  HashMap<TransactionID, TrackedTrxInfo> wait_txn_map_;
+  DeadlockInfoBuffer dlock_buffer_;
+
+  // Used to allocate mutexes/condvars to use when locking keys
+  std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;
+
+  bool IsLockExpired(TransactionID txn_id, const LockInfo& lock_info, Env* env,
+                     uint64_t* wait_time);
+
+  std::shared_ptr<LockMap> GetLockMap(uint32_t column_family_id);
+
+  Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map,
+                            LockMapStripe* stripe, uint32_t column_family_id,
+                            const std::string& key, Env* env, int64_t timeout,
+                            const LockInfo& lock_info);
+
+  Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                       const std::string& key, Env* env,
+                       const LockInfo& lock_info, uint64_t* wait_time,
+                       autovector<TransactionID>* txn_ids);
+
+  void UnLockKey(PessimisticTransaction* txn, const std::string& key,
+                 LockMapStripe* stripe, LockMap* lock_map, Env* env);
+
+  bool IncrementWaiters(const PessimisticTransaction* txn,
+                        const autovector<TransactionID>& wait_ids,
+                        const std::string& key, const uint32_t& cf_id,
+                        const bool& exclusive, Env* const env);
+  void DecrementWaiters(const PessimisticTransaction* txn,
+                        const autovector<TransactionID>& wait_ids);
+  void DecrementWaitersImpl(const PessimisticTransaction* txn,
+                            const autovector<TransactionID>& wait_ids);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc
new file mode 100644
index 000000000..525fdea71
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc
@@ -0,0 +1,181 @@
+//  Copyright (c) 2020-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This test is not applicable for Range Lock manager as Range Lock Manager
+// operates on Column Families, not their ids.
+TEST_F(PointLockManagerTest, LockNonExistingColumnFamily) {
+  MockColumnFamilyHandle cf(1024);
+  locker_->RemoveColumnFamily(&cf);
+  auto txn = NewTxn();
+  auto s = locker_->TryLock(txn, 1024, "k", env_, true);
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_STREQ(s.getState(), "Column family id not found: 1024");
+  delete txn;
+}
+
+TEST_F(PointLockManagerTest, LockStatus) {
+  MockColumnFamilyHandle cf1(1024), cf2(2048);
+  locker_->AddColumnFamily(&cf1);
+  locker_->AddColumnFamily(&cf2);
+
+  auto txn1 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1024, "k1", env_, true));
+  ASSERT_OK(locker_->TryLock(txn1, 2048, "k1", env_, true));
+
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn2, 1024, "k2", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 2048, "k2", env_, false));
+
+  auto s = locker_->GetPointLockStatus();
+  ASSERT_EQ(s.size(), 4u);
+  for (uint32_t cf_id : {1024, 2048}) {
+    ASSERT_EQ(s.count(cf_id), 2u);
+    auto range = s.equal_range(cf_id);
+    for (auto it = range.first; it != range.second; it++) {
+      ASSERT_TRUE(it->second.key == "k1" || it->second.key == "k2");
+      if (it->second.key == "k1") {
+        ASSERT_EQ(it->second.exclusive, true);
+        ASSERT_EQ(it->second.ids.size(), 1u);
+        ASSERT_EQ(it->second.ids[0], txn1->GetID());
+      } else if (it->second.key == "k2") {
+        ASSERT_EQ(it->second.exclusive, false);
+        ASSERT_EQ(it->second.ids.size(), 1u);
+        ASSERT_EQ(it->second.ids[0], txn2->GetID());
+      }
+    }
+  }
+
+  // Cleanup
+  locker_->UnLock(txn1, 1024, "k1", env_);
+  locker_->UnLock(txn1, 2048, "k1", env_);
+  locker_->UnLock(txn2, 1024, "k2", env_);
+  locker_->UnLock(txn2, 2048, "k2", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(PointLockManagerTest, UnlockExclusive) {
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+
+  auto txn1 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, true));
+  locker_->UnLock(txn1, 1, "k", env_);
+
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(PointLockManagerTest, UnlockShared) {
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+
+  auto txn1 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+  locker_->UnLock(txn1, 1, "k", env_);
+
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+// This test doesn't work with Range Lock Manager, because Range Lock Manager
+// doesn't support deadlock_detect_depth.
+
+TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
+  // Tests that when detecting deadlock, if the detection depth is exceeded,
+  // it's also viewed as deadlock.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.deadlock_detect_depth = 1;
+  txn_opt.lock_timeout = 1000000;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+  auto txn3 = NewTxn(txn_opt);
+  auto txn4 = NewTxn(txn_opt);
+  // "a ->(k) b" means transaction a is waiting for transaction b to release
+  // the held lock on key k.
+  // txn4 ->(k3) -> txn3 ->(k2) txn2 ->(k1) txn1
+  // txn3's deadlock detection will exceed the detection depth 1,
+  // which will be viewed as a deadlock.
+  // NOTE:
+  // txn4 ->(k3) -> txn3 must be set up before
+  // txn3 ->(k2) -> txn2, because to trigger deadlock detection for txn3,
+  // it must have another txn waiting on it, which is txn4 in this case.
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
+    // block because txn1 is holding a lock on k1.
+    locker_->TryLock(txn2, 1, "k1", env_, true);
+  });
+
+  ASSERT_OK(locker_->TryLock(txn3, 1, "k3", env_, true));
+
+  port::Thread t2 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+    // block because txn3 is holding a lock on k1.
+    locker_->TryLock(txn4, 1, "k3", env_, true);
+  });
+
+  auto s = locker_->TryLock(txn3, 1, "k2", env_, true);
+  ASSERT_TRUE(s.IsBusy());
+  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+  ASSERT_EQ(deadlock_paths.size(), 1u);
+  ASSERT_TRUE(deadlock_paths[0].limit_exceeded);
+
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn3, 1, "k3", env_);
+  t1.join();
+  t2.join();
+
+  delete txn4;
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+INSTANTIATE_TEST_CASE_P(PointLockManager, AnyLockManagerTest,
+                        ::testing::Values(nullptr));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED because Transactions are not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h
new file mode 100644
index 000000000..ca9f46bf9
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h
@@ -0,0 +1,324 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MockColumnFamilyHandle : public ColumnFamilyHandle {
+ public:
+  explicit MockColumnFamilyHandle(ColumnFamilyId cf_id) : cf_id_(cf_id) {}
+
+  ~MockColumnFamilyHandle() override {}
+
+  const std::string& GetName() const override { return name_; }
+
+  ColumnFamilyId GetID() const override { return cf_id_; }
+
+  Status GetDescriptor(ColumnFamilyDescriptor*) override {
+    return Status::OK();
+  }
+
+  const Comparator* GetComparator() const override {
+    return BytewiseComparator();
+  }
+
+ private:
+  ColumnFamilyId cf_id_;
+  std::string name_ = "MockCF";
+};
+
+class PointLockManagerTest : public testing::Test {
+ public:
+  void SetUp() override {
+    env_ = Env::Default();
+    db_dir_ = test::PerThreadDBPath("point_lock_manager_test");
+    ASSERT_OK(env_->CreateDir(db_dir_));
+
+    Options opt;
+    opt.create_if_missing = true;
+    TransactionDBOptions txn_opt;
+    txn_opt.transaction_lock_timeout = 0;
+
+    ASSERT_OK(TransactionDB::Open(opt, txn_opt, db_dir_, &db_));
+
+    // CAUTION: This test creates a separate lock manager object (right, NOT
+    // the one that the TransactionDB is using!), and runs tests on it.
+    locker_.reset(new PointLockManager(
+        static_cast<PessimisticTransactionDB*>(db_), txn_opt));
+
+    wait_sync_point_name_ = "PointLockManager::AcquireWithTimeout:WaitingTxn";
+  }
+
+  void TearDown() override {
+    delete db_;
+    EXPECT_OK(DestroyDir(env_, db_dir_));
+  }
+
+  PessimisticTransaction* NewTxn(
+      TransactionOptions txn_opt = TransactionOptions()) {
+    Transaction* txn = db_->BeginTransaction(WriteOptions(), txn_opt);
+    return reinterpret_cast<PessimisticTransaction*>(txn);
+  }
+
+ protected:
+  Env* env_;
+  std::shared_ptr<LockManager> locker_;
+  const char* wait_sync_point_name_;
+  friend void PointLockManagerTestExternalSetup(PointLockManagerTest*);
+
+ private:
+  std::string db_dir_;
+  TransactionDB* db_;
+};
+
+using init_func_t = void (*)(PointLockManagerTest*);
+
+class AnyLockManagerTest : public PointLockManagerTest,
+                           public testing::WithParamInterface<init_func_t> {
+ public:
+  void SetUp() override {
+    // If a custom setup function was provided, use it. Otherwise, use what we
+    // have inherited.
+    auto init_func = GetParam();
+    if (init_func)
+      (*init_func)(this);
+    else
+      PointLockManagerTest::SetUp();
+  }
+};
+
+TEST_P(AnyLockManagerTest, ReentrantExclusiveLock) {
+  // Tests that a txn can acquire exclusive lock on the same key repeatedly.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, ReentrantSharedLock) {
+  // Tests that a txn can acquire shared lock on the same key repeatedly.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockUpgrade) {
+  // Tests that a txn can upgrade from a shared lock to an exclusive lock.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockDowngrade) {
+  // Tests that a txn can acquire a shared lock after acquiring an exclusive
+  // lock on the same key.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockConflict) {
+  // Tests that lock conflicts lead to lock timeout.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn1 = NewTxn();
+  auto txn2 = NewTxn();
+
+  {
+    // exclusive-exclusive conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  {
+    // exclusive-shared conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, false);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  {
+    // shared-exclusive conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, false));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, true);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k2", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+port::Thread BlockUntilWaitingTxn(const char* sync_point_name,
+                                  std::function<void()> f) {
+  std::atomic<bool> reached(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      sync_point_name, [&](void* /*arg*/) { reached.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread t(f);
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  return t;
+}
+
+TEST_P(AnyLockManagerTest, SharedLocks) {
+  // Tests that shared locks can be concurrently held by multiple transactions.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn1 = NewTxn();
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k", env_);
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(AnyLockManagerTest, Deadlock) {
+  // Tests that deadlock can be detected.
+  // Deadlock scenario:
+  // txn1 exclusively locks k1, and wants to lock k2;
+  // txn2 exclusively locks k2, and wants to lock k1.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = 1000000;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
+
+  // txn1 tries to lock k2, will block forever.
+  port::Thread t = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+    // block because txn2 is holding a lock on k2.
+    locker_->TryLock(txn1, 1, "k2", env_, true);
+  });
+
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsBusy());
+  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+  ASSERT_EQ(deadlock_paths.size(), 1u);
+  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
+
+  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
+  ASSERT_EQ(deadlocks.size(), 2u);
+
+  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
+  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[0].m_exclusive);
+  ASSERT_EQ(deadlocks[0].m_waiting_key, "k2");
+
+  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
+  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[1].m_exclusive);
+  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
+
+  locker_->UnLock(txn2, 1, "k2", env_);
+  t.join();
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k2", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) {
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+
+  auto txn1 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+  auto txn3 = NewTxn();
+  txn3->SetLockTimeout(10000);
+  port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true));
+    locker_->UnLock(txn3, 1, "k", env_);
+  });
+
+  // Ok, now txn3 is waiting for lock on "k", which is owned by two
+  // transactions. Check that GetWaitingTxns reports this correctly
+  uint32_t wait_cf_id;
+  std::string wait_key;
+  auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key);
+
+  ASSERT_EQ(wait_cf_id, 1u);
+  ASSERT_EQ(wait_key, "k");
+  ASSERT_EQ(waiters.size(), 2);
+  bool waits_correct =
+      (waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) ||
+      (waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID());
+  ASSERT_EQ(waits_correct, true);
+
+  // Release locks so txn3 can proceed with execution
+  locker_->UnLock(txn1, 1, "k", env_);
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  // Wait until txn3 finishes
+  t1.join();
+
+  delete txn1;
+  delete txn2;
+  delete txn3;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc
new file mode 100644
index 000000000..6204a8f02
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc
@@ -0,0 +1,257 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class TrackedKeysColumnFamilyIterator
+    : public LockTracker::ColumnFamilyIterator {
+ public:
+  explicit TrackedKeysColumnFamilyIterator(const TrackedKeys& keys)
+      : tracked_keys_(keys), it_(keys.begin()) {}
+
+  bool HasNext() const override { return it_ != tracked_keys_.end(); }
+
+  ColumnFamilyId Next() override { return (it_++)->first; }
+
+ private:
+  const TrackedKeys& tracked_keys_;
+  TrackedKeys::const_iterator it_;
+};
+
+class TrackedKeysIterator : public LockTracker::KeyIterator {
+ public:
+  TrackedKeysIterator(const TrackedKeys& keys, ColumnFamilyId id)
+      : key_infos_(keys.at(id)), it_(key_infos_.begin()) {}
+
+  bool HasNext() const override { return it_ != key_infos_.end(); }
+
+  const std::string& Next() override { return (it_++)->first; }
+
+ private:
+  const TrackedKeyInfos& key_infos_;
+  TrackedKeyInfos::const_iterator it_;
+};
+
+}  // namespace
+
+void PointLockTracker::Track(const PointLockRequest& r) {
+  auto& keys = tracked_keys_[r.column_family_id];
+  auto result = keys.try_emplace(r.key, r.seq);
+  auto it = result.first;
+  if (!result.second && r.seq < it->second.seq) {
+    // Now tracking this key with an earlier sequence number
+    it->second.seq = r.seq;
+  }
+  // else we do not update the seq. The smaller the tracked seq, the stronger it
+  // the guarantee since it implies from the seq onward there has not been a
+  // concurrent update to the key. So we update the seq if it implies stronger
+  // guarantees, i.e., if it is smaller than the existing tracked seq.
+
+  if (r.read_only) {
+    it->second.num_reads++;
+  } else {
+    it->second.num_writes++;
+  }
+
+  it->second.exclusive = it->second.exclusive || r.exclusive;
+}
+
+UntrackStatus PointLockTracker::Untrack(const PointLockRequest& r) {
+  auto cf_keys = tracked_keys_.find(r.column_family_id);
+  if (cf_keys == tracked_keys_.end()) {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  auto& keys = cf_keys->second;
+  auto it = keys.find(r.key);
+  if (it == keys.end()) {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  bool untracked = false;
+  auto& info = it->second;
+  if (r.read_only) {
+    if (info.num_reads > 0) {
+      info.num_reads--;
+      untracked = true;
+    }
+  } else {
+    if (info.num_writes > 0) {
+      info.num_writes--;
+      untracked = true;
+    }
+  }
+
+  bool removed = false;
+  if (info.num_reads == 0 && info.num_writes == 0) {
+    keys.erase(it);
+    if (keys.empty()) {
+      tracked_keys_.erase(cf_keys);
+    }
+    removed = true;
+  }
+
+  if (removed) {
+    return UntrackStatus::REMOVED;
+  }
+  if (untracked) {
+    return UntrackStatus::UNTRACKED;
+  }
+  return UntrackStatus::NOT_TRACKED;
+}
+
+void PointLockTracker::Merge(const LockTracker& tracker) {
+  const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker);
+  for (const auto& cf_keys : t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto current_cf_keys = tracked_keys_.find(cf);
+    if (current_cf_keys == tracked_keys_.end()) {
+      tracked_keys_.emplace(cf_keys);
+    } else {
+      auto& current_keys = current_cf_keys->second;
+      for (const auto& key_info : keys) {
+        const std::string& key = key_info.first;
+        const TrackedKeyInfo& info = key_info.second;
+        // If key was not previously tracked, just copy the whole struct over.
+        // Otherwise, some merging needs to occur.
+        auto current_info = current_keys.find(key);
+        if (current_info == current_keys.end()) {
+          current_keys.emplace(key_info);
+        } else {
+          current_info->second.Merge(info);
+        }
+      }
+    }
+  }
+}
+
+void PointLockTracker::Subtract(const LockTracker& tracker) {
+  const PointLockTracker& t = static_cast<const PointLockTracker&>(tracker);
+  for (const auto& cf_keys : t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto& current_keys = tracked_keys_.at(cf);
+    for (const auto& key_info : keys) {
+      const std::string& key = key_info.first;
+      const TrackedKeyInfo& info = key_info.second;
+      uint32_t num_reads = info.num_reads;
+      uint32_t num_writes = info.num_writes;
+
+      auto current_key_info = current_keys.find(key);
+      assert(current_key_info != current_keys.end());
+
+      // Decrement the total reads/writes of this key by the number of
+      // reads/writes done since the last SavePoint.
+      if (num_reads > 0) {
+        assert(current_key_info->second.num_reads >= num_reads);
+        current_key_info->second.num_reads -= num_reads;
+      }
+      if (num_writes > 0) {
+        assert(current_key_info->second.num_writes >= num_writes);
+        current_key_info->second.num_writes -= num_writes;
+      }
+      if (current_key_info->second.num_reads == 0 &&
+          current_key_info->second.num_writes == 0) {
+        current_keys.erase(current_key_info);
+      }
+    }
+  }
+}
+
+LockTracker* PointLockTracker::GetTrackedLocksSinceSavePoint(
+    const LockTracker& save_point_tracker) const {
+  // Examine the number of reads/writes performed on all keys written
+  // since the last SavePoint and compare to the total number of reads/writes
+  // for each key.
+  LockTracker* t = new PointLockTracker();
+  const PointLockTracker& save_point_t =
+      static_cast<const PointLockTracker&>(save_point_tracker);
+  for (const auto& cf_keys : save_point_t.tracked_keys_) {
+    ColumnFamilyId cf = cf_keys.first;
+    const auto& keys = cf_keys.second;
+
+    auto& current_keys = tracked_keys_.at(cf);
+    for (const auto& key_info : keys) {
+      const std::string& key = key_info.first;
+      const TrackedKeyInfo& info = key_info.second;
+      uint32_t num_reads = info.num_reads;
+      uint32_t num_writes = info.num_writes;
+
+      auto current_key_info = current_keys.find(key);
+      assert(current_key_info != current_keys.end());
+      assert(current_key_info->second.num_reads >= num_reads);
+      assert(current_key_info->second.num_writes >= num_writes);
+
+      if (current_key_info->second.num_reads == num_reads &&
+          current_key_info->second.num_writes == num_writes) {
+        // All the reads/writes to this key were done in the last savepoint.
+        PointLockRequest r;
+        r.column_family_id = cf;
+        r.key = key;
+        r.seq = info.seq;
+        r.read_only = (num_writes == 0);
+        r.exclusive = info.exclusive;
+        t->Track(r);
+      }
+    }
+  }
+  return t;
+}
+
+PointLockStatus PointLockTracker::GetPointLockStatus(
+    ColumnFamilyId column_family_id, const std::string& key) const {
+  assert(IsPointLockSupported());
+  PointLockStatus status;
+  auto it = tracked_keys_.find(column_family_id);
+  if (it == tracked_keys_.end()) {
+    return status;
+  }
+
+  const auto& keys = it->second;
+  auto key_it = keys.find(key);
+  if (key_it == keys.end()) {
+    return status;
+  }
+
+  const TrackedKeyInfo& key_info = key_it->second;
+  status.locked = true;
+  status.exclusive = key_info.exclusive;
+  status.seq = key_info.seq;
+  return status;
+}
+
+uint64_t PointLockTracker::GetNumPointLocks() const {
+  uint64_t num_keys = 0;
+  for (const auto& cf_keys : tracked_keys_) {
+    num_keys += cf_keys.second.size();
+  }
+  return num_keys;
+}
+
+LockTracker::ColumnFamilyIterator* PointLockTracker::GetColumnFamilyIterator()
+    const {
+  return new TrackedKeysColumnFamilyIterator(tracked_keys_);
+}
+
+LockTracker::KeyIterator* PointLockTracker::GetKeyIterator(
+    ColumnFamilyId column_family_id) const {
+  assert(tracked_keys_.find(column_family_id) != tracked_keys_.end());
+  return new TrackedKeysIterator(tracked_keys_, column_family_id);
+}
+
+void PointLockTracker::Clear() { tracked_keys_.clear(); }
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h
new file mode 100644
index 000000000..daf6f9aa2
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "utilities/transactions/lock/lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TrackedKeyInfo {
+  // Earliest sequence number that is relevant to this transaction for this key
+  SequenceNumber seq;
+
+  uint32_t num_writes;
+  uint32_t num_reads;
+
+  bool exclusive;
+
+  explicit TrackedKeyInfo(SequenceNumber seq_no)
+      : seq(seq_no), num_writes(0), num_reads(0), exclusive(false) {}
+
+  void Merge(const TrackedKeyInfo& info) {
+    assert(seq <= info.seq);
+    num_reads += info.num_reads;
+    num_writes += info.num_writes;
+    exclusive = exclusive || info.exclusive;
+  }
+};
+
+using TrackedKeyInfos = std::unordered_map<std::string, TrackedKeyInfo>;
+
+using TrackedKeys = std::unordered_map<ColumnFamilyId, TrackedKeyInfos>;
+
+// Tracks point locks on single keys.
+class PointLockTracker : public LockTracker {
+ public:
+  PointLockTracker() = default;
+
+  PointLockTracker(const PointLockTracker&) = delete;
+  PointLockTracker& operator=(const PointLockTracker&) = delete;
+
+  bool IsPointLockSupported() const override { return true; }
+
+  bool IsRangeLockSupported() const override { return false; }
+
+  void Track(const PointLockRequest& lock_request) override;
+
+  UntrackStatus Untrack(const PointLockRequest& lock_request) override;
+
+  void Track(const RangeLockRequest& /*lock_request*/) override {}
+
+  UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  void Merge(const LockTracker& tracker) override;
+
+  void Subtract(const LockTracker& tracker) override;
+
+  void Clear() override;
+
+  virtual LockTracker* GetTrackedLocksSinceSavePoint(
+      const LockTracker& save_point_tracker) const override;
+
+  PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id,
+                                     const std::string& key) const override;
+
+  uint64_t GetNumPointLocks() const override;
+
+  ColumnFamilyIterator* GetColumnFamilyIterator() const override;
+
+  KeyIterator* GetKeyIterator(ColumnFamilyId column_family_id) const override;
+
+ private:
+  TrackedKeys tracked_keys_;
+};
+
+class PointLockTrackerFactory : public LockTrackerFactory {
+ public:
+  static const PointLockTrackerFactory& Get() {
+    static const PointLockTrackerFactory instance;
+    return instance;
+  }
+
+  LockTracker* Create() const override { return new PointLockTracker(); }
+
+ private:
+  PointLockTrackerFactory() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h b/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h
new file mode 100644
index 000000000..01899542e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_lock_manager.h
@@ -0,0 +1,36 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+//
+// Generic definitions for a Range-based Lock Manager
+//
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/lock/lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+/*
+  A base class for all Range-based lock managers
+
+  See also class RangeLockManagerHandle in
+  include/rocksdb/utilities/transaction_db.h
+*/
+class RangeLockManagerBase : public LockManager {
+ public:
+  // Geting a point lock is reduced to getting a range lock on a single-point
+  // range
+  using LockManager::TryLock;
+  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+                 const std::string& key, Env* env, bool exclusive) override {
+    Endpoint endp(key.data(), key.size(), false);
+    return TryLock(txn, column_family_id, endp, endp, env, exclusive);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc b/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc
new file mode 100644
index 000000000..bce66c1f3
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_locking_test.cc
@@ -0,0 +1,459 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_test.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeLockingTest : public ::testing::Test {
+ public:
+  TransactionDB* db;
+  std::string dbname;
+  Options options;
+
+  std::shared_ptr<RangeLockManagerHandle> range_lock_mgr;
+  TransactionDBOptions txn_db_options;
+
+  RangeLockingTest() : db(nullptr) {
+    options.create_if_missing = true;
+    dbname = test::PerThreadDBPath("range_locking_testdb");
+
+    EXPECT_OK(DestroyDB(dbname, options));
+
+    range_lock_mgr.reset(NewRangeLockManager(nullptr));
+    txn_db_options.lock_mgr_handle = range_lock_mgr;
+
+    auto s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+    assert(s.ok());
+  }
+
+  ~RangeLockingTest() {
+    delete db;
+    db = nullptr;
+    // This is to skip the assert statement in FaultInjectionTestEnv. There
+    // seems to be a bug in btrfs that the makes readdir return recently
+    // unlink-ed files. By using the default fs we simply ignore errors resulted
+    // from attempting to delete such files in DestroyDB.
+    EXPECT_OK(DestroyDB(dbname, options));
+  }
+
+  PessimisticTransaction* NewTxn(
+      TransactionOptions txn_opt = TransactionOptions()) {
+    Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opt);
+    return reinterpret_cast<PessimisticTransaction*>(txn);
+  }
+};
+
+// TODO: set a smaller lock wait timeout so that the test runs faster.
+TEST_F(RangeLockingTest, BasicRangeLocking) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  std::string value;
+  ReadOptions read_options;
+  auto cf = db->DefaultColumnFamily();
+
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Get a range lock
+  ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+  // Check that range Lock inhibits an overlapping range lock
+  {
+    auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  // Check that range Lock inhibits an overlapping point lock
+  {
+    auto s = txn1->GetForUpdate(read_options, cf, Slice("b"), &value);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  // Get a point lock, check that it inhibits range locks
+  ASSERT_OK(txn0->Put(cf, Slice("n"), Slice("value")));
+  {
+    auto s = txn1->GetRangeLock(cf, Endpoint("m"), Endpoint("p"));
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  ASSERT_OK(txn0->Commit());
+  txn1->Rollback();
+
+  delete txn0;
+  delete txn1;
+}
+
+TEST_F(RangeLockingTest, MyRocksLikeUpdate) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  auto cf = db->DefaultColumnFamily();
+  Status s;
+
+  // Get a range lock for the range we are about to update
+  ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+  bool try_range_lock_called = false;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "RangeTreeLockManager::TryRangeLock:enter",
+      [&](void* /*arg*/) { try_range_lock_called = true; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // For performance reasons, the following must NOT call lock_mgr->TryLock():
+  // We verify that by checking the value of try_range_lock_called.
+  ASSERT_OK(txn0->Put(cf, Slice("b"), Slice("value"),
+                      /*assume_tracked=*/true));
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ASSERT_FALSE(try_range_lock_called);
+
+  txn0->Rollback();
+
+  delete txn0;
+}
+
+TEST_F(RangeLockingTest, UpgradeLockAndGetConflict) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+  Status s;
+  std::string value;
+  txn_options.lock_timeout = 10;
+
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Get the shared lock in txn0
+  s = txn0->GetForUpdate(ReadOptions(), cf, Slice("a"), &value,
+                         false /*exclusive*/);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Get the shared lock on the same key in txn1
+  s = txn1->GetForUpdate(ReadOptions(), cf, Slice("a"), &value,
+                         false /*exclusive*/);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Now, try getting an exclusive lock that overlaps with the above
+  s = txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("b"));
+  ASSERT_TRUE(s.IsTimedOut());
+
+  txn0->Rollback();
+  txn1->Rollback();
+
+  delete txn0;
+  delete txn1;
+}
+
+TEST_F(RangeLockingTest, SnapshotValidation) {
+  Status s;
+  Slice key_slice = Slice("k");
+  ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+
+  auto txn0 = NewTxn();
+  txn0->Put(key_slice, Slice("initial"));
+  txn0->Commit();
+
+  // txn1
+  auto txn1 = NewTxn();
+  txn1->SetSnapshot();
+  std::string val1;
+  ASSERT_OK(txn1->Get(ReadOptions(), cfh, key_slice, &val1));
+  ASSERT_EQ(val1, "initial");
+  val1 = val1 + std::string("-txn1");
+
+  ASSERT_OK(txn1->Put(cfh, key_slice, Slice(val1)));
+
+  // txn2
+  auto txn2 = NewTxn();
+  txn2->SetSnapshot();
+  std::string val2;
+  // This will see the original value as nothing is committed
+  // This is also Get, so it is doesn't acquire any locks.
+  ASSERT_OK(txn2->Get(ReadOptions(), cfh, key_slice, &val2));
+  ASSERT_EQ(val2, "initial");
+
+  // txn1
+  ASSERT_OK(txn1->Commit());
+
+  // txn2
+  val2 = val2 + std::string("-txn2");
+  // Now, this call should do Snapshot Validation and fail:
+  s = txn2->Put(cfh, key_slice, Slice(val2));
+  ASSERT_TRUE(s.IsBusy());
+
+  ASSERT_OK(txn2->Commit());
+
+  delete txn0;
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(RangeLockingTest, MultipleTrxLockStatusData) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Get a range lock
+  ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("z"), Endpoint("z")));
+  ASSERT_OK(txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("e")));
+
+  auto s = range_lock_mgr->GetRangeLockStatusData();
+  ASSERT_EQ(s.size(), 2);
+  for (auto it = s.begin(); it != s.end(); ++it) {
+    ASSERT_EQ(it->first, cf->GetID());
+    auto val = it->second;
+    ASSERT_FALSE(val.start.inf_suffix);
+    ASSERT_FALSE(val.end.inf_suffix);
+    ASSERT_TRUE(val.exclusive);
+    ASSERT_EQ(val.ids.size(), 1);
+    if (val.ids[0] == txn0->GetID()) {
+      ASSERT_EQ(val.start.slice, "z");
+      ASSERT_EQ(val.end.slice, "z");
+    } else if (val.ids[0] == txn1->GetID()) {
+      ASSERT_EQ(val.start.slice, "b");
+      ASSERT_EQ(val.end.slice, "e");
+    } else {
+      FAIL();  // Unknown transaction ID.
+    }
+  }
+
+  delete txn0;
+  delete txn1;
+}
+
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define SKIP_LOCK_ESCALATION_TEST 1
+#endif
+#else
+#define SKIP_LOCK_ESCALATION_TEST 1
+#endif
+
+#ifndef SKIP_LOCK_ESCALATION_TEST
+TEST_F(RangeLockingTest, BasicLockEscalation) {
+  auto cf = db->DefaultColumnFamily();
+
+  auto counters = range_lock_mgr->GetStatus();
+
+  // Initially not using any lock memory
+  ASSERT_EQ(counters.current_lock_memory, 0);
+  ASSERT_EQ(counters.escalation_count, 0);
+
+  ASSERT_EQ(0, range_lock_mgr->SetMaxLockMemory(2000));
+
+  // Insert until we see lock escalations
+  auto txn = NewTxn();
+
+  // Get the locks until we hit an escalation
+  for (int i = 0; i < 2020; i++) {
+    std::ostringstream buf;
+    buf << std::setw(8) << std::setfill('0') << i;
+    std::string buf_str = buf.str();
+    ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str)));
+  }
+  counters = range_lock_mgr->GetStatus();
+  ASSERT_GT(counters.escalation_count, 0);
+  ASSERT_LE(counters.current_lock_memory, 2000);
+
+  delete txn;
+}
+
+// An escalation barrier function. Allow escalation iff the first two bytes are
+// identical.
+static bool escalation_barrier(const Endpoint& a, const Endpoint& b) {
+  assert(a.slice.size() > 2);
+  assert(b.slice.size() > 2);
+  if (memcmp(a.slice.data(), b.slice.data(), 2)) {
+    return true;  // This is a barrier
+  } else {
+    return false;  // No barrier
+  }
+}
+
+TEST_F(RangeLockingTest, LockEscalationBarrier) {
+  auto cf = db->DefaultColumnFamily();
+
+  auto counters = range_lock_mgr->GetStatus();
+
+  // Initially not using any lock memory
+  ASSERT_EQ(counters.escalation_count, 0);
+
+  range_lock_mgr->SetMaxLockMemory(8000);
+  range_lock_mgr->SetEscalationBarrierFunc(escalation_barrier);
+
+  // Insert enough locks to cause lock escalations to happen
+  auto txn = NewTxn();
+  const int N = 2000;
+  for (int i = 0; i < N; i++) {
+    std::ostringstream buf;
+    buf << std::setw(4) << std::setfill('0') << i;
+    std::string buf_str = buf.str();
+    ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str)));
+  }
+  counters = range_lock_mgr->GetStatus();
+  ASSERT_GT(counters.escalation_count, 0);
+
+  // Check that lock escalation was not performed across escalation barriers:
+  // Use another txn to acquire locks near the barriers.
+  auto txn2 = NewTxn();
+  range_lock_mgr->SetMaxLockMemory(500000);
+  for (int i = 100; i < N; i += 100) {
+    std::ostringstream buf;
+    buf << std::setw(4) << std::setfill('0') << i - 1 << "-a";
+    std::string buf_str = buf.str();
+    // Check that we CAN get a lock near the escalation barrier
+    ASSERT_OK(txn2->GetRangeLock(cf, Endpoint(buf_str), Endpoint(buf_str)));
+  }
+
+  txn->Rollback();
+  txn2->Rollback();
+  delete txn;
+  delete txn2;
+}
+
+#endif
+
+TEST_F(RangeLockingTest, LockWaitCount) {
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+  txn_options.lock_timeout = 50;
+  Transaction* txn0 = db->BeginTransaction(WriteOptions(), txn_options);
+  Transaction* txn1 = db->BeginTransaction(WriteOptions(), txn_options);
+
+  // Get a range lock
+  ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+  uint64_t lock_waits1 = range_lock_mgr->GetStatus().lock_wait_count;
+  // Attempt to get a conflicting lock
+  auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
+  ASSERT_TRUE(s.IsTimedOut());
+
+  // Check that the counter was incremented
+  uint64_t lock_waits2 = range_lock_mgr->GetStatus().lock_wait_count;
+  ASSERT_EQ(lock_waits1 + 1, lock_waits2);
+
+  txn0->Rollback();
+  txn1->Rollback();
+
+  delete txn0;
+  delete txn1;
+}
+
+TEST_F(RangeLockingTest, LockWaiteeAccess) {
+  TransactionOptions txn_options;
+  auto cf = db->DefaultColumnFamily();
+  txn_options.lock_timeout = 60;
+  Transaction* txn0 = db->BeginTransaction(WriteOptions(), txn_options);
+  Transaction* txn1 = db->BeginTransaction(WriteOptions(), txn_options);
+
+  // Get a range lock
+  ASSERT_OK(txn0->GetRangeLock(cf, Endpoint("a"), Endpoint("c")));
+
+  std::atomic<bool> reached(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "RangeTreeLockManager::TryRangeLock:EnterWaitingTxn", [&](void* /*arg*/) {
+        reached.store(true);
+        std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread t([&]() {
+    // Attempt to get a conflicting lock
+    auto s = txn1->GetRangeLock(cf, Endpoint("b"), Endpoint("z"));
+    ASSERT_TRUE(s.ok());
+    txn1->Rollback();
+  });
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Release locks and free the transaction
+  txn0->Rollback();
+  delete txn0;
+
+  t.join();
+
+  delete txn1;
+}
+
+void PointLockManagerTestExternalSetup(PointLockManagerTest* self) {
+  self->env_ = Env::Default();
+  self->db_dir_ = test::PerThreadDBPath("point_lock_manager_test");
+  ASSERT_OK(self->env_->CreateDir(self->db_dir_));
+
+  Options opt;
+  opt.create_if_missing = true;
+  TransactionDBOptions txn_opt;
+  txn_opt.transaction_lock_timeout = 0;
+
+  auto mutex_factory = std::make_shared<TransactionDBMutexFactoryImpl>();
+  self->locker_.reset(NewRangeLockManager(mutex_factory)->getLockManager());
+  std::shared_ptr<RangeLockManagerHandle> range_lock_mgr =
+      std::dynamic_pointer_cast<RangeLockManagerHandle>(self->locker_);
+  txn_opt.lock_mgr_handle = range_lock_mgr;
+
+  ASSERT_OK(TransactionDB::Open(opt, txn_opt, self->db_dir_, &self->db_));
+  self->wait_sync_point_name_ = "RangeTreeLockManager::TryRangeLock:WaitingTxn";
+}
+
+INSTANTIATE_TEST_CASE_P(RangeLockManager, AnyLockManagerTest,
+                        ::testing::Values(PointLockManagerTestExternalSetup));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else  // OS_WIN
+
+#include <stdio.h>
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "skipped as Range Locking is not supported on Windows\n");
+  return 0;
+}
+
+#endif  // OS_WIN
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "skipped as transactions are not supported in rocksdb_lite\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3
new file mode 100644
index 000000000..dba13ed2d
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3
@@ -0,0 +1,661 @@
+                    GNU AFFERO GENERAL PUBLIC LICENSE
+                       Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+  A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate.  Many developers of free software are heartened and
+encouraged by the resulting cooperation.  However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+  The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community.  It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server.  Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+  An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals.  This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU Affero General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Remote Network Interaction; Use with the GNU General Public License.
+
+  Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software.  This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time.  Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source.  For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code.  There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<http://www.gnu.org/licenses/>.
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2
new file mode 100644
index 000000000..ecbfc770f
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2
@@ -0,0 +1,174 @@
+Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2
new file mode 100644
index 000000000..d511905c1
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2
@@ -0,0 +1,339 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README
new file mode 100644
index 000000000..2ea86bf46
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/README
@@ -0,0 +1,13 @@
+The files in this directory originally come from
+https://github.com/percona/PerconaFT/.
+
+This directory only includes the "locktree" part of PerconaFT, and its
+dependencies.
+
+The following modifications were made:
+- Make locktree usable outside of PerconaFT library
+- Add shared read-only lock support
+
+The files named *_subst.* are substitutes of the PerconaFT's files, they
+contain replacements of PerconaFT's functionality.
+
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h
new file mode 100644
index 000000000..5aa826c8e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h
@@ -0,0 +1,76 @@
+#ifndef _DB_H
+#define _DB_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+typedef struct __toku_dbt DBT;
+
+// port: this is currently not used
+struct simple_dbt {
+  uint32_t len;
+  void *data;
+};
+
+// engine status info
+// engine status is passed to handlerton as an array of
+// TOKU_ENGINE_STATUS_ROW_S[]
+typedef enum {
+  STATUS_FS_STATE = 0,  // interpret as file system state (redzone) enum
+  STATUS_UINT64,        // interpret as uint64_t
+  STATUS_CHARSTR,       // interpret as char *
+  STATUS_UNIXTIME,      // interpret as time_t
+  STATUS_TOKUTIME,      // interpret as tokutime_t
+  STATUS_PARCOUNT,      // interpret as PARTITIONED_COUNTER
+  STATUS_DOUBLE         // interpret as double
+} toku_engine_status_display_type;
+
+typedef enum {
+  TOKU_ENGINE_STATUS = (1ULL << 0),  // Include when asking for engine status
+  TOKU_GLOBAL_STATUS =
+      (1ULL << 1),  // Include when asking for information_schema.global_status
+} toku_engine_status_include_type;
+
+typedef struct __toku_engine_status_row {
+  const char *keyname;  // info schema key, should not change across revisions
+                        // without good reason
+  const char
+      *columnname;  // column for mysql, e.g. information_schema.global_status.
+                    // TOKUDB_ will automatically be prefixed.
+  const char *legend;  // the text that will appear at user interface
+  toku_engine_status_display_type type;  // how to interpret the value
+  toku_engine_status_include_type
+      include;  // which kinds of callers should get read this row?
+  union {
+    double dnum;
+    uint64_t num;
+    const char *str;
+    char datebuf[26];
+    struct partitioned_counter *parcount;
+  } value;
+} * TOKU_ENGINE_STATUS_ROW, TOKU_ENGINE_STATUS_ROW_S;
+
+#define DB_BUFFER_SMALL -30999
+#define DB_LOCK_DEADLOCK -30995
+#define DB_LOCK_NOTGRANTED -30994
+#define DB_NOTFOUND -30989
+#define DB_KEYEXIST -30996
+#define DB_DBT_MALLOC 8
+#define DB_DBT_REALLOC 64
+#define DB_DBT_USERMEM 256
+
+/* PerconaFT specific error codes */
+#define TOKUDB_OUT_OF_LOCKS -100000
+
+typedef void (*lock_wait_callback)(void *arg, uint64_t requesting_txnid,
+                                   uint64_t blocking_txnid);
+
+struct __toku_dbt {
+  void *data;
+  size_t size;
+  size_t ulen;
+  // One of DB_DBT_XXX flags
+  uint32_t flags;
+};
+
+#endif
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h
new file mode 100644
index 000000000..718efc623
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h
@@ -0,0 +1,138 @@
+/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+
+#include "../db.h"
+#include "../portability/memory.h"
+#include "../util/dbt.h"
+
+typedef int (*ft_compare_func)(void *arg, const DBT *a, const DBT *b);
+
+int toku_keycompare(const void *key1, size_t key1len, const void *key2,
+                    size_t key2len);
+
+int toku_builtin_compare_fun(const DBT *, const DBT *)
+    __attribute__((__visibility__("default")));
+
+namespace toku {
+
+// a comparator object encapsulates the data necessary for
+// comparing two keys in a fractal tree. it further understands
+// that points may be positive or negative infinity.
+
+class comparator {
+  void init(ft_compare_func cmp, void *cmp_arg, uint8_t memcmp_magic) {
+    _cmp = cmp;
+    _cmp_arg = cmp_arg;
+    _memcmp_magic = memcmp_magic;
+  }
+
+ public:
+  // This magic value is reserved to mean that the magic has not been set.
+  static const uint8_t MEMCMP_MAGIC_NONE = 0;
+
+  void create(ft_compare_func cmp, void *cmp_arg,
+              uint8_t memcmp_magic = MEMCMP_MAGIC_NONE) {
+    init(cmp, cmp_arg, memcmp_magic);
+  }
+
+  // inherit the attributes of another comparator, but keep our own
+  // copy of fake_db that is owned separately from the one given.
+  void inherit(const comparator &cmp) {
+    invariant_notnull(cmp._cmp);
+    init(cmp._cmp, cmp._cmp_arg, cmp._memcmp_magic);
+  }
+
+  // like inherit, but doesn't require that the this comparator
+  // was already created
+  void create_from(const comparator &cmp) { inherit(cmp); }
+
+  void destroy() {}
+
+  ft_compare_func get_compare_func() const { return _cmp; }
+
+  uint8_t get_memcmp_magic() const { return _memcmp_magic; }
+
+  bool valid() const { return _cmp != nullptr; }
+
+  inline bool dbt_has_memcmp_magic(const DBT *dbt) const {
+    return *reinterpret_cast<const char *>(dbt->data) == _memcmp_magic;
+  }
+
+  int operator()(const DBT *a, const DBT *b) const {
+    if (__builtin_expect(toku_dbt_is_infinite(a) || toku_dbt_is_infinite(b),
+                         0)) {
+      return toku_dbt_infinite_compare(a, b);
+    } else if (_memcmp_magic != MEMCMP_MAGIC_NONE
+               // If `a' has the memcmp magic..
+               && dbt_has_memcmp_magic(a)
+               // ..then we expect `b' to also have the memcmp magic
+               && __builtin_expect(dbt_has_memcmp_magic(b), 1)) {
+      assert(0);  // psergey: this branch should not be taken.
+      return toku_builtin_compare_fun(a, b);
+    } else {
+      // yikes, const sadness here
+      return _cmp(_cmp_arg, a, b);
+    }
+  }
+
+ private:
+  ft_compare_func _cmp;
+  void *_cmp_arg;
+
+  uint8_t _memcmp_magic;
+};
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h
new file mode 100644
index 000000000..1b4511172
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h
@@ -0,0 +1,102 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../db.h"
+#include "../portability/toku_race_tools.h"
+#include "../util/status.h"
+
+//
+// Lock Tree Manager statistics
+//
+class LTM_STATUS_S {
+ public:
+  enum {
+    LTM_SIZE_CURRENT = 0,
+    LTM_SIZE_LIMIT,
+    LTM_ESCALATION_COUNT,
+    LTM_ESCALATION_TIME,
+    LTM_ESCALATION_LATEST_RESULT,
+    LTM_NUM_LOCKTREES,
+    LTM_LOCK_REQUESTS_PENDING,
+    LTM_STO_NUM_ELIGIBLE,
+    LTM_STO_END_EARLY_COUNT,
+    LTM_STO_END_EARLY_TIME,
+    LTM_WAIT_COUNT,
+    LTM_WAIT_TIME,
+    LTM_LONG_WAIT_COUNT,
+    LTM_LONG_WAIT_TIME,
+    LTM_TIMEOUT_COUNT,
+    LTM_WAIT_ESCALATION_COUNT,
+    LTM_WAIT_ESCALATION_TIME,
+    LTM_LONG_WAIT_ESCALATION_COUNT,
+    LTM_LONG_WAIT_ESCALATION_TIME,
+    LTM_STATUS_NUM_ROWS  // must be last
+  };
+
+  void init(void);
+  void destroy(void);
+
+  TOKU_ENGINE_STATUS_ROW_S status[LTM_STATUS_NUM_ROWS];
+
+ private:
+  bool m_initialized = false;
+};
+typedef LTM_STATUS_S* LTM_STATUS;
+extern LTM_STATUS_S ltm_status;
+
+#define LTM_STATUS_VAL(x) ltm_status.status[LTM_STATUS_S::x].value.num
+
+void toku_status_init(void);     // just call ltm_status.init();
+void toku_status_destroy(void);  // just call ltm_status.destroy();
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc
new file mode 100644
index 000000000..5110cd482
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc
@@ -0,0 +1,139 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "concurrent_tree.h"
+
+// PORT #include <toku_assert.h>
+namespace toku {
+
+void concurrent_tree::create(const comparator *cmp) {
+  // start with an empty root node. we do this instead of
+  // setting m_root to null so there's always a root to lock
+  m_root.create_root(cmp);
+}
+
+void concurrent_tree::destroy(void) { m_root.destroy_root(); }
+
+bool concurrent_tree::is_empty(void) { return m_root.is_empty(); }
+
+uint64_t concurrent_tree::get_insertion_memory_overhead(void) {
+  return sizeof(treenode);
+}
+
+void concurrent_tree::locked_keyrange::prepare(concurrent_tree *tree) {
+  // the first step in acquiring a locked keyrange is locking the root
+  treenode *const root = &tree->m_root;
+  m_tree = tree;
+  m_subtree = root;
+  m_range = keyrange::get_infinite_range();
+  root->mutex_lock();
+}
+
+void concurrent_tree::locked_keyrange::acquire(const keyrange &range) {
+  treenode *const root = &m_tree->m_root;
+
+  treenode *subtree;
+  if (root->is_empty() || root->range_overlaps(range)) {
+    subtree = root;
+  } else {
+    // we do not have a precomputed comparison hint, so pass null
+    const keyrange::comparison *cmp_hint = nullptr;
+    subtree = root->find_node_with_overlapping_child(range, cmp_hint);
+  }
+
+  // subtree is locked. it will be unlocked when this is release()'d
+  invariant_notnull(subtree);
+  m_range = range;
+  m_subtree = subtree;
+}
+
+bool concurrent_tree::locked_keyrange::add_shared_owner(const keyrange &range,
+                                                        TXNID new_owner) {
+  return m_subtree->insert(range, new_owner, /*is_shared*/ true);
+}
+
+void concurrent_tree::locked_keyrange::release(void) {
+  m_subtree->mutex_unlock();
+}
+
+void concurrent_tree::locked_keyrange::insert(const keyrange &range,
+                                              TXNID txnid, bool is_shared) {
+  // empty means no children, and only the root should ever be empty
+  if (m_subtree->is_empty()) {
+    m_subtree->set_range_and_txnid(range, txnid, is_shared);
+  } else {
+    m_subtree->insert(range, txnid, is_shared);
+  }
+}
+
+void concurrent_tree::locked_keyrange::remove(const keyrange &range,
+                                              TXNID txnid) {
+  invariant(!m_subtree->is_empty());
+  treenode *new_subtree = m_subtree->remove(range, txnid);
+  // if removing range changed the root of the subtree,
+  // then the subtree must be the root of the entire tree.
+  if (new_subtree == nullptr) {
+    invariant(m_subtree->is_root());
+    invariant(m_subtree->is_empty());
+  }
+}
+
+void concurrent_tree::locked_keyrange::remove_all(void) {
+  m_subtree->recursive_remove();
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h
new file mode 100644
index 000000000..e1bfb86c5
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h
@@ -0,0 +1,174 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../ft/comparator.h"
+#include "keyrange.h"
+#include "treenode.h"
+
+namespace toku {
+
+// A concurrent_tree stores non-overlapping ranges.
+// Access to disjoint parts of the tree usually occurs concurrently.
+
+class concurrent_tree {
+ public:
+  // A locked_keyrange gives you exclusive access to read and write
+  // operations that occur on any keys in that range. You only have
+  // the right to operate on keys in that range or keys that were read
+  // from the keyrange using iterate()
+  //
+  // Access model:
+  // - user prepares a locked keyrange. all threads serialize behind prepare().
+  // - user breaks the serialzation point by acquiring a range, or releasing.
+  // - one thread operates on a certain locked_keyrange object at a time.
+  // - when the thread is finished, it releases
+
+  class locked_keyrange {
+   public:
+    // effect: prepare to acquire a locked keyrange over the given
+    //         concurrent_tree, preventing other threads from preparing
+    //         until this thread either does acquire() or release().
+    // note: operations performed on a prepared keyrange are equivalent
+    //         to ones performed on an acquired keyrange over -inf, +inf.
+    // rationale: this provides the user with a serialization point for
+    // descending
+    //            or modifying the the tree. it also proives a convenient way of
+    //            doing serializable operations on the tree.
+    // There are two valid sequences of calls:
+    //  - prepare, acquire, [operations], release
+    //  - prepare, [operations],release
+    void prepare(concurrent_tree *tree);
+
+    // requires: the locked keyrange was prepare()'d
+    // effect: acquire a locked keyrange over the given concurrent_tree.
+    //         the locked keyrange represents the range of keys overlapped
+    //         by the given range
+    void acquire(const keyrange &range);
+
+    // effect: releases a locked keyrange and the mutex it holds
+    void release(void);
+
+    // effect: iterate over each range this locked_keyrange represents,
+    //         calling function->fn() on each node's keyrange and txnid
+    //         until there are no more or the function returns false
+    template <class F>
+    void iterate(F *function) const {
+      // if the subtree is non-empty, traverse it by calling the given
+      // function on each range, txnid pair found that overlaps.
+      if (!m_subtree->is_empty()) {
+        m_subtree->traverse_overlaps(m_range, function);
+      }
+    }
+
+    // Adds another owner to the lock on the specified keyrange.
+    // requires: the keyrange contains one treenode whose bounds are
+    //           exactly equal to the specifed range (no sub/supersets)
+    bool add_shared_owner(const keyrange &range, TXNID new_owner);
+
+    // inserts the given range into the tree, with an associated txnid.
+    // requires: range does not overlap with anything in this locked_keyrange
+    // rationale: caller is responsible for only inserting unique ranges
+    void insert(const keyrange &range, TXNID txnid, bool is_shared);
+
+    // effect: removes the given range from the tree.
+    //         - txnid=TXNID_ANY means remove the range no matter what its
+    //           owners are
+    //         - Other value means remove the specified txnid from
+    //           ownership (if the range has other owners, it will remain
+    //           in the tree)
+    // requires: range exists exactly in this locked_keyrange
+    // rationale: caller is responsible for only removing existing ranges
+    void remove(const keyrange &range, TXNID txnid);
+
+    // effect: removes all of the keys represented by this locked keyrange
+    // rationale: we'd like a fast way to empty out a tree
+    void remove_all(void);
+
+   private:
+    // the concurrent tree this locked keyrange is for
+    concurrent_tree *m_tree;
+
+    // the range of keys this locked keyrange represents
+    keyrange m_range;
+
+    // the subtree under which all overlapping ranges exist
+    treenode *m_subtree;
+
+    friend class concurrent_tree_unit_test;
+  };
+
+  // effect: initialize the tree to an empty state
+  void create(const comparator *cmp);
+
+  // effect: destroy the tree.
+  // requires: tree is empty
+  void destroy(void);
+
+  // returns: true iff the tree is empty
+  bool is_empty(void);
+
+  // returns: the memory overhead of a single insertion into the tree
+  static uint64_t get_insertion_memory_overhead(void);
+
+ private:
+  // the root needs to always exist so there's a lock to grab
+  // even if the tree is empty. that's why we store a treenode
+  // here and not a pointer to one.
+  treenode m_root;
+
+  friend class concurrent_tree_unit_test;
+};
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc
new file mode 100644
index 000000000..e50ace5a9
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc
@@ -0,0 +1,222 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "keyrange.h"
+
+#include "../util/dbt.h"
+
+namespace toku {
+
+// create a keyrange by borrowing the left and right dbt
+// pointers. no memory is copied. no checks for infinity needed.
+void keyrange::create(const DBT *left, const DBT *right) {
+  init_empty();
+  m_left_key = left;
+  m_right_key = right;
+}
+
+// destroy the key copies. if they were never set, then destroy does nothing.
+void keyrange::destroy(void) {
+  toku_destroy_dbt(&m_left_key_copy);
+  toku_destroy_dbt(&m_right_key_copy);
+}
+
+// create a keyrange by copying the keys from the given range.
+void keyrange::create_copy(const keyrange &range) {
+  // start with an initialized, empty range
+  init_empty();
+
+  // optimize the case where the left and right keys are the same.
+  // we'd like to only have one copy of the data.
+  if (toku_dbt_equals(range.get_left_key(), range.get_right_key())) {
+    set_both_keys(range.get_left_key());
+  } else {
+    // replace our empty left and right keys with
+    // copies of the range's left and right keys
+    replace_left_key(range.get_left_key());
+    replace_right_key(range.get_right_key());
+  }
+}
+
+// extend this keyrange by choosing the leftmost and rightmost
+// endpoints between this range and the given. replaced keys
+// in this range are freed and inherited keys are copied.
+void keyrange::extend(const comparator &cmp, const keyrange &range) {
+  const DBT *range_left = range.get_left_key();
+  const DBT *range_right = range.get_right_key();
+  if (cmp(range_left, get_left_key()) < 0) {
+    replace_left_key(range_left);
+  }
+  if (cmp(range_right, get_right_key()) > 0) {
+    replace_right_key(range_right);
+  }
+}
+
+// how much memory does this keyrange take?
+// - the size of the left and right keys
+// --- ignore the fact that we may have optimized the point case.
+//     it complicates things for little gain.
+// - the size of the keyrange class itself
+uint64_t keyrange::get_memory_size(void) const {
+  const DBT *left_key = get_left_key();
+  const DBT *right_key = get_right_key();
+  return left_key->size + right_key->size + sizeof(keyrange);
+}
+
+// compare ranges.
+keyrange::comparison keyrange::compare(const comparator &cmp,
+                                       const keyrange &range) const {
+  if (cmp(get_right_key(), range.get_left_key()) < 0) {
+    return comparison::LESS_THAN;
+  } else if (cmp(get_left_key(), range.get_right_key()) > 0) {
+    return comparison::GREATER_THAN;
+  } else if (cmp(get_left_key(), range.get_left_key()) == 0 &&
+             cmp(get_right_key(), range.get_right_key()) == 0) {
+    return comparison::EQUALS;
+  } else {
+    return comparison::OVERLAPS;
+  }
+}
+
+bool keyrange::overlaps(const comparator &cmp, const keyrange &range) const {
+  // equality is a stronger form of overlapping.
+  // so two ranges "overlap" if they're either equal or just overlapping.
+  comparison c = compare(cmp, range);
+  return c == comparison::EQUALS || c == comparison::OVERLAPS;
+}
+
+keyrange keyrange::get_infinite_range(void) {
+  keyrange range;
+  range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity());
+  return range;
+}
+
+void keyrange::init_empty(void) {
+  m_left_key = nullptr;
+  m_right_key = nullptr;
+  toku_init_dbt(&m_left_key_copy);
+  toku_init_dbt(&m_right_key_copy);
+  m_point_range = false;
+}
+
+const DBT *keyrange::get_left_key(void) const {
+  if (m_left_key) {
+    return m_left_key;
+  } else {
+    return &m_left_key_copy;
+  }
+}
+
+const DBT *keyrange::get_right_key(void) const {
+  if (m_right_key) {
+    return m_right_key;
+  } else {
+    return &m_right_key_copy;
+  }
+}
+
+// copy the given once and set both the left and right pointers.
+// optimization for point ranges, so the left and right ranges
+// are not copied twice.
+void keyrange::set_both_keys(const DBT *key) {
+  if (toku_dbt_is_infinite(key)) {
+    m_left_key = key;
+    m_right_key = key;
+  } else {
+    toku_clone_dbt(&m_left_key_copy, *key);
+    toku_copyref_dbt(&m_right_key_copy, m_left_key_copy);
+  }
+  m_point_range = true;
+}
+
+// destroy the current left key. set and possibly copy the new one
+void keyrange::replace_left_key(const DBT *key) {
+  // a little magic:
+  //
+  // if this is a point range, then the left and right keys share
+  // one copy of the data, and it lives in the left key copy. so
+  // if we're replacing the left key, move the real data to the
+  // right key copy instead of destroying it. now, the memory is
+  // owned by the right key and the left key may be replaced.
+  if (m_point_range) {
+    m_right_key_copy = m_left_key_copy;
+  } else {
+    toku_destroy_dbt(&m_left_key_copy);
+  }
+
+  if (toku_dbt_is_infinite(key)) {
+    m_left_key = key;
+  } else {
+    toku_clone_dbt(&m_left_key_copy, *key);
+    m_left_key = nullptr;
+  }
+  m_point_range = false;
+}
+
+// destroy the current right key. set and possibly copy the new one
+void keyrange::replace_right_key(const DBT *key) {
+  toku_destroy_dbt(&m_right_key_copy);
+  if (toku_dbt_is_infinite(key)) {
+    m_right_key = key;
+  } else {
+    toku_clone_dbt(&m_right_key_copy, *key);
+    m_right_key = nullptr;
+  }
+  m_point_range = false;
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h
new file mode 100644
index 000000000..f9aeea0c4
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h
@@ -0,0 +1,141 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../ft/comparator.h"
+
+namespace toku {
+
+// A keyrange has a left and right key as endpoints.
+//
+// When a keyrange is created it owns no memory, but when it copies
+// or extends another keyrange, it copies memory as necessary. This
+// means it is cheap in the common case.
+
+class keyrange {
+ public:
+  // effect: constructor that borrows left and right key pointers.
+  //         no memory is allocated or copied.
+  void create(const DBT *left_key, const DBT *right_key);
+
+  // effect: constructor that allocates and copies another keyrange's points.
+  void create_copy(const keyrange &range);
+
+  // effect: destroys the keyrange, freeing any allocated memory
+  void destroy(void);
+
+  // effect: extends the keyrange by choosing the leftmost and rightmost
+  //         endpoints from this range and the given range.
+  //         replaced keys in this range are freed, new keys are copied.
+  void extend(const comparator &cmp, const keyrange &range);
+
+  // returns: the amount of memory this keyrange takes. does not account
+  //          for point optimizations or malloc overhead.
+  uint64_t get_memory_size(void) const;
+
+  // returns: pointer to the left key of this range
+  const DBT *get_left_key(void) const;
+
+  // returns: pointer to the right key of this range
+  const DBT *get_right_key(void) const;
+
+  // two ranges are either equal, lt, gt, or overlapping
+  enum comparison { EQUALS, LESS_THAN, GREATER_THAN, OVERLAPS };
+
+  // effect: compares this range to the given range
+  // returns: LESS_THAN    if given range is strictly to the left
+  //          GREATER_THAN if given range is strictly to the right
+  //          EQUALS       if given range has the same left and right endpoints
+  //          OVERLAPS     if at least one of the given range's endpoints falls
+  //                       between this range's endpoints
+  comparison compare(const comparator &cmp, const keyrange &range) const;
+
+  // returns: true if the range and the given range are equal or overlapping
+  bool overlaps(const comparator &cmp, const keyrange &range) const;
+
+  // returns: a keyrange representing -inf, +inf
+  static keyrange get_infinite_range(void);
+
+ private:
+  // some keys should be copied, some keys should not be.
+  //
+  // to support both, we use two DBTs for copies and two pointers
+  // for temporaries. the access rule is:
+  // - if a pointer is non-null, then it reprsents the key.
+  // - otherwise the pointer is null, and the key is in the copy.
+  DBT m_left_key_copy;
+  DBT m_right_key_copy;
+  const DBT *m_left_key;
+  const DBT *m_right_key;
+
+  // if this range is a point range, then m_left_key == m_right_key
+  // and the actual data is stored exactly once in m_left_key_copy.
+  bool m_point_range;
+
+  // effect: initializes a keyrange to be empty
+  void init_empty(void);
+
+  // effect: copies the given key once into the left key copy
+  //         and sets the right key copy to share the left.
+  // rationale: optimization for point ranges to only do one malloc
+  void set_both_keys(const DBT *key);
+
+  // effect: destroys the current left key. sets and copies the new one.
+  void replace_left_key(const DBT *key);
+
+  // effect: destroys the current right key. sets and copies the new one.
+  void replace_right_key(const DBT *key);
+};
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc
new file mode 100644
index 000000000..3d217be70
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc
@@ -0,0 +1,527 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "lock_request.h"
+
+#include "../portability/toku_race_tools.h"
+#include "../portability/txn_subst.h"
+#include "../util/dbt.h"
+#include "locktree.h"
+
+namespace toku {
+
+// initialize a lock request's internals
+void lock_request::create(toku_external_mutex_factory_t mutex_factory) {
+  m_txnid = TXNID_NONE;
+  m_conflicting_txnid = TXNID_NONE;
+  m_start_time = 0;
+  m_left_key = nullptr;
+  m_right_key = nullptr;
+  toku_init_dbt(&m_left_key_copy);
+  toku_init_dbt(&m_right_key_copy);
+
+  m_type = type::UNKNOWN;
+  m_lt = nullptr;
+
+  m_complete_r = 0;
+  m_state = state::UNINITIALIZED;
+  m_info = nullptr;
+
+  // psergey-todo: this condition is for interruptible wait
+  // note: moved to here from lock_request::create:
+  toku_external_cond_init(mutex_factory, &m_wait_cond);
+
+  m_start_test_callback = nullptr;
+  m_start_before_pending_test_callback = nullptr;
+  m_retry_test_callback = nullptr;
+}
+
+// destroy a lock request.
+void lock_request::destroy(void) {
+  invariant(m_state != state::PENDING);
+  invariant(m_state != state::DESTROYED);
+  m_state = state::DESTROYED;
+  toku_destroy_dbt(&m_left_key_copy);
+  toku_destroy_dbt(&m_right_key_copy);
+  toku_external_cond_destroy(&m_wait_cond);
+}
+
+// set the lock request parameters. this API allows a lock request to be reused.
+void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key,
+                       const DBT *right_key, lock_request::type lock_type,
+                       bool big_txn, void *extra) {
+  invariant(m_state != state::PENDING);
+  m_lt = lt;
+
+  m_txnid = txnid;
+  m_left_key = left_key;
+  m_right_key = right_key;
+  toku_destroy_dbt(&m_left_key_copy);
+  toku_destroy_dbt(&m_right_key_copy);
+  m_type = lock_type;
+  m_state = state::INITIALIZED;
+  m_info = lt ? lt->get_lock_request_info() : nullptr;
+  m_big_txn = big_txn;
+  m_extra = extra;
+}
+
+// get rid of any stored left and right key copies and
+// replace them with copies of the given left and right key
+void lock_request::copy_keys() {
+  if (!toku_dbt_is_infinite(m_left_key)) {
+    toku_clone_dbt(&m_left_key_copy, *m_left_key);
+    m_left_key = &m_left_key_copy;
+  }
+  if (!toku_dbt_is_infinite(m_right_key)) {
+    toku_clone_dbt(&m_right_key_copy, *m_right_key);
+    m_right_key = &m_right_key_copy;
+  }
+}
+
+// what are the conflicts for this pending lock request?
+void lock_request::get_conflicts(txnid_set *conflicts) {
+  invariant(m_state == state::PENDING);
+  const bool is_write_request = m_type == type::WRITE;
+  m_lt->get_conflicts(is_write_request, m_txnid, m_left_key, m_right_key,
+                      conflicts);
+}
+
+// build a wait-for-graph for this lock request and the given conflict set
+// for each transaction B that blocks A's lock request
+//     if B is blocked then
+//         add (A,T) to the WFG and if B is new, fill in the WFG from B
+void lock_request::build_wait_graph(wfg *wait_graph,
+                                    const txnid_set &conflicts) {
+  uint32_t num_conflicts = conflicts.size();
+  for (uint32_t i = 0; i < num_conflicts; i++) {
+    TXNID conflicting_txnid = conflicts.get(i);
+    lock_request *conflicting_request = find_lock_request(conflicting_txnid);
+    invariant(conflicting_txnid != m_txnid);
+    invariant(conflicting_request != this);
+    if (conflicting_request) {
+      bool already_exists = wait_graph->node_exists(conflicting_txnid);
+      wait_graph->add_edge(m_txnid, conflicting_txnid);
+      if (!already_exists) {
+        // recursively build the wait for graph rooted at the conflicting
+        // request, given its set of lock conflicts.
+        txnid_set other_conflicts;
+        other_conflicts.create();
+        conflicting_request->get_conflicts(&other_conflicts);
+        conflicting_request->build_wait_graph(wait_graph, other_conflicts);
+        other_conflicts.destroy();
+      }
+    }
+  }
+}
+
+// returns: true if the current set of lock requests contains
+//          a deadlock, false otherwise.
+bool lock_request::deadlock_exists(const txnid_set &conflicts) {
+  wfg wait_graph;
+  wait_graph.create();
+
+  build_wait_graph(&wait_graph, conflicts);
+
+  std::function<void(TXNID)> reporter;
+  if (m_deadlock_cb) {
+    reporter = [this](TXNID a) {
+      lock_request *req = find_lock_request(a);
+      if (req) {
+        m_deadlock_cb(req->m_txnid, (req->m_type == lock_request::WRITE),
+                      req->m_left_key, req->m_right_key);
+      }
+    };
+  }
+
+  bool deadlock = wait_graph.cycle_exists_from_txnid(m_txnid, reporter);
+  wait_graph.destroy();
+  return deadlock;
+}
+
+// try to acquire a lock described by this lock request.
+int lock_request::start(void) {
+  int r;
+
+  txnid_set conflicts;
+  conflicts.create();
+  if (m_type == type::WRITE) {
+    r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts,
+                                 m_big_txn);
+  } else {
+    invariant(m_type == type::READ);
+    r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts,
+                                m_big_txn);
+  }
+
+  // if the lock is not granted, save it to the set of lock requests
+  // and check for a deadlock. if there is one, complete it as failed
+  if (r == DB_LOCK_NOTGRANTED) {
+    copy_keys();
+    m_state = state::PENDING;
+    m_start_time = toku_current_time_microsec() / 1000;
+    m_conflicting_txnid = conflicts.get(0);
+    if (m_start_before_pending_test_callback)
+      m_start_before_pending_test_callback();
+    toku_external_mutex_lock(&m_info->mutex);
+    insert_into_lock_requests();
+    if (deadlock_exists(conflicts)) {
+      remove_from_lock_requests();
+      r = DB_LOCK_DEADLOCK;
+    }
+    toku_external_mutex_unlock(&m_info->mutex);
+    if (m_start_test_callback) m_start_test_callback();  // test callback
+  }
+
+  if (r != DB_LOCK_NOTGRANTED) {
+    complete(r);
+  }
+
+  conflicts.destroy();
+  return r;
+}
+
+// sleep on the lock request until it becomes resolved or the wait time has
+// elapsed.
+int lock_request::wait(uint64_t wait_time_ms) {
+  return wait(wait_time_ms, 0, nullptr);
+}
+
+int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
+                       int (*killed_callback)(void),
+                       void (*lock_wait_callback)(void *, lock_wait_infos *),
+                       void *callback_arg) {
+  uint64_t t_now = toku_current_time_microsec();
+  uint64_t t_start = t_now;
+  uint64_t t_end = t_start + wait_time_ms * 1000;
+
+  toku_external_mutex_lock(&m_info->mutex);
+
+  // check again, this time locking out other retry calls
+  if (m_state == state::PENDING) {
+    lock_wait_infos conflicts_collector;
+    retry(&conflicts_collector);
+    if (m_state == state::PENDING) {
+      report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
+    }
+  }
+
+  while (m_state == state::PENDING) {
+    // check if this thread is killed
+    if (killed_callback && killed_callback()) {
+      remove_from_lock_requests();
+      complete(DB_LOCK_NOTGRANTED);
+      continue;
+    }
+
+    // compute the time until we should wait
+    uint64_t t_wait;
+    if (killed_time_ms == 0) {
+      t_wait = t_end;
+    } else {
+      t_wait = t_now + killed_time_ms * 1000;
+      if (t_wait > t_end) t_wait = t_end;
+    }
+
+    int r = toku_external_cond_timedwait(&m_wait_cond, &m_info->mutex,
+                                         (int64_t)(t_wait - t_now));
+    invariant(r == 0 || r == ETIMEDOUT);
+
+    t_now = toku_current_time_microsec();
+    if (m_state == state::PENDING && (t_now >= t_end)) {
+      m_info->counters.timeout_count += 1;
+
+      // if we're still pending and we timed out, then remove our
+      // request from the set of lock requests and fail.
+      remove_from_lock_requests();
+
+      // complete sets m_state to COMPLETE, breaking us out of the loop
+      complete(DB_LOCK_NOTGRANTED);
+    }
+  }
+
+  uint64_t t_real_end = toku_current_time_microsec();
+  uint64_t duration = t_real_end - t_start;
+  m_info->counters.wait_count += 1;
+  m_info->counters.wait_time += duration;
+  if (duration >= 1000000) {
+    m_info->counters.long_wait_count += 1;
+    m_info->counters.long_wait_time += duration;
+  }
+  toku_external_mutex_unlock(&m_info->mutex);
+
+  invariant(m_state == state::COMPLETE);
+  return m_complete_r;
+}
+
+// complete this lock request with the given return value
+void lock_request::complete(int complete_r) {
+  m_complete_r = complete_r;
+  m_state = state::COMPLETE;
+}
+
+const DBT *lock_request::get_left_key(void) const { return m_left_key; }
+
+const DBT *lock_request::get_right_key(void) const { return m_right_key; }
+
+TXNID lock_request::get_txnid(void) const { return m_txnid; }
+
+uint64_t lock_request::get_start_time(void) const { return m_start_time; }
+
+TXNID lock_request::get_conflicting_txnid(void) const {
+  return m_conflicting_txnid;
+}
+
+int lock_request::retry(lock_wait_infos *conflicts_collector) {
+  invariant(m_state == state::PENDING);
+  int r;
+  txnid_set conflicts;
+  conflicts.create();
+
+  if (m_type == type::WRITE) {
+    r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts,
+                                 m_big_txn);
+  } else {
+    r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts,
+                                m_big_txn);
+  }
+
+  // if the acquisition succeeded then remove ourselves from the
+  // set of lock requests, complete, and signal the waiting thread.
+  if (r == 0) {
+    remove_from_lock_requests();
+    complete(r);
+    if (m_retry_test_callback) m_retry_test_callback();  // test callback
+    toku_external_cond_broadcast(&m_wait_cond);
+  } else {
+    m_conflicting_txnid = conflicts.get(0);
+    add_conflicts_to_waits(&conflicts, conflicts_collector);
+  }
+  conflicts.destroy();
+
+  return r;
+}
+
+void lock_request::retry_all_lock_requests(
+    locktree *lt, void (*lock_wait_callback)(void *, lock_wait_infos *),
+    void *callback_arg, void (*after_retry_all_test_callback)(void)) {
+  lt_lock_request_info *info = lt->get_lock_request_info();
+
+  // if there are no pending lock requests than there is nothing to do
+  // the unlocked data race on pending_is_empty is OK since lock requests
+  // are retried after added to the pending set.
+  if (info->pending_is_empty) return;
+
+  // get my retry generation (post increment of retry_want)
+  unsigned long long my_retry_want = (info->retry_want += 1);
+
+  toku_mutex_lock(&info->retry_mutex);
+
+  // here is the group retry algorithm.
+  // get the latest retry_want count and use it as the generation number of
+  // this retry operation. if this retry generation is > the last retry
+  // generation, then do the lock retries.  otherwise, no lock retries
+  // are needed.
+  if ((my_retry_want - 1) == info->retry_done) {
+    for (;;) {
+      if (!info->running_retry) {
+        info->running_retry = true;
+        info->retry_done = info->retry_want;
+        toku_mutex_unlock(&info->retry_mutex);
+        retry_all_lock_requests_info(info, lock_wait_callback, callback_arg);
+        if (after_retry_all_test_callback) after_retry_all_test_callback();
+        toku_mutex_lock(&info->retry_mutex);
+        info->running_retry = false;
+        toku_cond_broadcast(&info->retry_cv);
+        break;
+      } else {
+        toku_cond_wait(&info->retry_cv, &info->retry_mutex);
+      }
+    }
+  }
+  toku_mutex_unlock(&info->retry_mutex);
+}
+
+void lock_request::retry_all_lock_requests_info(
+    lt_lock_request_info *info,
+    void (*lock_wait_callback)(void *, lock_wait_infos *), void *callback_arg) {
+  toku_external_mutex_lock(&info->mutex);
+  // retry all of the pending lock requests.
+  lock_wait_infos conflicts_collector;
+  for (uint32_t i = 0; i < info->pending_lock_requests.size();) {
+    lock_request *request;
+    int r = info->pending_lock_requests.fetch(i, &request);
+    invariant_zero(r);
+
+    // retry the lock request. if it didn't succeed,
+    // move on to the next lock request. otherwise
+    // the request is gone from the list so we may
+    // read the i'th entry for the next one.
+    r = request->retry(&conflicts_collector);
+    if (r != 0) {
+      i++;
+    }
+  }
+
+  // call report_waits while holding the pending queue lock since
+  // the waiter object is still valid while it's in the queue
+  report_waits(&conflicts_collector, lock_wait_callback, callback_arg);
+
+  // future threads should only retry lock requests if some still exist
+  info->should_retry_lock_requests = info->pending_lock_requests.size() > 0;
+  toku_external_mutex_unlock(&info->mutex);
+}
+
+void lock_request::add_conflicts_to_waits(txnid_set *conflicts,
+                                          lock_wait_infos *wait_conflicts) {
+  wait_conflicts->push_back({m_lt, get_txnid(), m_extra, {}});
+  uint32_t num_conflicts = conflicts->size();
+  for (uint32_t i = 0; i < num_conflicts; i++) {
+    wait_conflicts->back().waitees.push_back(conflicts->get(i));
+  }
+}
+
+void lock_request::report_waits(lock_wait_infos *wait_conflicts,
+                                void (*lock_wait_callback)(void *,
+                                                           lock_wait_infos *),
+                                void *callback_arg) {
+  if (lock_wait_callback) (*lock_wait_callback)(callback_arg, wait_conflicts);
+}
+
+void *lock_request::get_extra(void) const { return m_extra; }
+
+void lock_request::kill_waiter(void) {
+  remove_from_lock_requests();
+  complete(DB_LOCK_NOTGRANTED);
+  toku_external_cond_broadcast(&m_wait_cond);
+}
+
+void lock_request::kill_waiter(locktree *lt, void *extra) {
+  lt_lock_request_info *info = lt->get_lock_request_info();
+  toku_external_mutex_lock(&info->mutex);
+  for (uint32_t i = 0; i < info->pending_lock_requests.size(); i++) {
+    lock_request *request;
+    int r = info->pending_lock_requests.fetch(i, &request);
+    if (r == 0 && request->get_extra() == extra) {
+      request->kill_waiter();
+      break;
+    }
+  }
+  toku_external_mutex_unlock(&info->mutex);
+}
+
+// find another lock request by txnid. must hold the mutex.
+lock_request *lock_request::find_lock_request(const TXNID &txnid) {
+  lock_request *request;
+  int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(
+      txnid, &request, nullptr);
+  if (r != 0) {
+    request = nullptr;
+  }
+  return request;
+}
+
+// insert this lock request into the locktree's set. must hold the mutex.
+void lock_request::insert_into_lock_requests(void) {
+  uint32_t idx;
+  lock_request *request;
+  int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(
+      m_txnid, &request, &idx);
+  invariant(r == DB_NOTFOUND);
+  r = m_info->pending_lock_requests.insert_at(this, idx);
+  invariant_zero(r);
+  m_info->pending_is_empty = false;
+}
+
+// remove this lock request from the locktree's set. must hold the mutex.
+void lock_request::remove_from_lock_requests(void) {
+  uint32_t idx;
+  lock_request *request;
+  int r = m_info->pending_lock_requests.find_zero<TXNID, find_by_txnid>(
+      m_txnid, &request, &idx);
+  invariant_zero(r);
+  invariant(request == this);
+  r = m_info->pending_lock_requests.delete_at(idx);
+  invariant_zero(r);
+  if (m_info->pending_lock_requests.size() == 0)
+    m_info->pending_is_empty = true;
+}
+
+int lock_request::find_by_txnid(lock_request *const &request,
+                                const TXNID &txnid) {
+  TXNID request_txnid = request->m_txnid;
+  if (request_txnid < txnid) {
+    return -1;
+  } else if (request_txnid == txnid) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+void lock_request::set_start_test_callback(void (*f)(void)) {
+  m_start_test_callback = f;
+}
+
+void lock_request::set_start_before_pending_test_callback(void (*f)(void)) {
+  m_start_before_pending_test_callback = f;
+}
+
+void lock_request::set_retry_test_callback(void (*f)(void)) {
+  m_retry_test_callback = f;
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h
new file mode 100644
index 000000000..d30e1e2ca
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h
@@ -0,0 +1,255 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../db.h"
+#include "../ft/comparator.h"
+#include "../portability/toku_pthread.h"
+#include "locktree.h"
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+// Information about a lock wait
+struct lock_wait_info {
+  locktree *ltree;  // the tree where wait happens
+  TXNID waiter;     // the waiting transaction
+  void *m_extra;    // lock_request's m_extra
+
+  // The transactions that are waited for.
+  std::vector<TXNID> waitees;
+};
+
+typedef std::vector<lock_wait_info> lock_wait_infos;
+
+// A lock request contains the db, the key range, the lock type, and
+// the transaction id that describes a potential row range lock.
+//
+// the typical use case is:
+// - initialize a lock request
+// - start to try to acquire the lock
+// - do something else
+// - wait for the lock request to be resolved on a timed condition
+// - destroy the lock request
+// a lock request is resolved when its state is no longer pending, or
+// when it becomes granted, or timedout, or deadlocked. when resolved, the
+// state of the lock request is changed and any waiting threads are awakened.
+
+class lock_request {
+ public:
+  enum type { UNKNOWN, READ, WRITE };
+
+  // effect: Initializes a lock request.
+  void create(toku_external_mutex_factory_t mutex_factory);
+
+  // effect: Destroys a lock request.
+  void destroy(void);
+
+  // effect: Resets the lock request parameters, allowing it to be reused.
+  // requires: Lock request was already created at some point
+  void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key,
+           type lock_type, bool big_txn, void *extra = nullptr);
+
+  // effect: Tries to acquire a lock described by this lock request.
+  // returns: The return code of locktree::acquire_[write,read]_lock()
+  //          or DB_LOCK_DEADLOCK if this request would end up deadlocked.
+  int start(void);
+
+  // effect: Sleeps until either the request is granted or the wait time
+  // expires. returns: The return code of locktree::acquire_[write,read]_lock()
+  //          or simply DB_LOCK_NOTGRANTED if the wait time expired.
+  int wait(uint64_t wait_time_ms);
+  int wait(uint64_t wait_time_ms, uint64_t killed_time_ms,
+           int (*killed_callback)(void),
+           void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr,
+           void *callback_arg = nullptr);
+
+  // return: left end-point of the lock range
+  const DBT *get_left_key(void) const;
+
+  // return: right end-point of the lock range
+  const DBT *get_right_key(void) const;
+
+  // return: the txnid waiting for a lock
+  TXNID get_txnid(void) const;
+
+  // return: when this lock request started, as milliseconds from epoch
+  uint64_t get_start_time(void) const;
+
+  // return: which txnid is blocking this request (there may be more, though)
+  TXNID get_conflicting_txnid(void) const;
+
+  // effect: Retries all of the lock requests for the given locktree.
+  //         Any lock requests successfully restarted is completed and woken
+  //         up.
+  //         The rest remain pending.
+  static void retry_all_lock_requests(
+      locktree *lt,
+      void (*lock_wait_callback)(void *, lock_wait_infos *) = nullptr,
+      void *callback_arg = nullptr,
+      void (*after_retry_test_callback)(void) = nullptr);
+  static void retry_all_lock_requests_info(
+      lt_lock_request_info *info,
+      void (*lock_wait_callback)(void *, lock_wait_infos *),
+      void *callback_arg);
+
+  void set_start_test_callback(void (*f)(void));
+  void set_start_before_pending_test_callback(void (*f)(void));
+  void set_retry_test_callback(void (*f)(void));
+
+  void *get_extra(void) const;
+
+  void kill_waiter(void);
+  static void kill_waiter(locktree *lt, void *extra);
+
+ private:
+  enum state {
+    UNINITIALIZED,
+    INITIALIZED,
+    PENDING,
+    COMPLETE,
+    DESTROYED,
+  };
+
+  // The keys for a lock request are stored "unowned" in m_left_key
+  // and m_right_key. When the request is about to go to sleep, it
+  // copies these keys and stores them in m_left_key_copy etc and
+  // sets the temporary pointers to null.
+  TXNID m_txnid;
+  TXNID m_conflicting_txnid;
+  uint64_t m_start_time;
+  const DBT *m_left_key;
+  const DBT *m_right_key;
+  DBT m_left_key_copy;
+  DBT m_right_key_copy;
+
+  // The lock request type and associated locktree
+  type m_type;
+  locktree *m_lt;
+
+  // If the lock request is in the completed state, then its
+  // final return value is stored in m_complete_r
+  int m_complete_r;
+  state m_state;
+
+  toku_external_cond_t m_wait_cond;
+
+  bool m_big_txn;
+
+  // the lock request info state stored in the
+  // locktree that this lock request is for.
+  struct lt_lock_request_info *m_info;
+
+  void *m_extra;
+
+  // effect: tries again to acquire the lock described by this lock request
+  // returns: 0 if retrying the request succeeded and is now complete
+  int retry(lock_wait_infos *collector);
+
+  void complete(int complete_r);
+
+  // effect: Finds another lock request by txnid.
+  // requires: The lock request info mutex is held
+  lock_request *find_lock_request(const TXNID &txnid);
+
+  // effect: Insert this lock request into the locktree's set.
+  // requires: the locktree's mutex is held
+  void insert_into_lock_requests(void);
+
+  // effect: Removes this lock request from the locktree's set.
+  // requires: The lock request info mutex is held
+  void remove_from_lock_requests(void);
+
+  // effect: Asks this request's locktree which txnids are preventing
+  //         us from getting the lock described by this request.
+  // returns: conflicts is populated with the txnid's that this request
+  //          is blocked on
+  void get_conflicts(txnid_set *conflicts);
+
+  // effect: Builds a wait-for-graph for this lock request and the given
+  // conflict set
+  void build_wait_graph(wfg *wait_graph, const txnid_set &conflicts);
+
+  // returns: True if this lock request is in deadlock with the given conflicts
+  // set
+  bool deadlock_exists(const txnid_set &conflicts);
+
+  void copy_keys(void);
+
+  static int find_by_txnid(lock_request *const &request, const TXNID &txnid);
+
+  // Report list of conflicts to lock wait callback.
+  static void report_waits(lock_wait_infos *wait_conflicts,
+                           void (*lock_wait_callback)(void *,
+                                                      lock_wait_infos *),
+                           void *callback_arg);
+  void add_conflicts_to_waits(txnid_set *conflicts,
+                              lock_wait_infos *wait_conflicts);
+
+  void (*m_start_test_callback)(void);
+  void (*m_start_before_pending_test_callback)(void);
+  void (*m_retry_test_callback)(void);
+
+ public:
+  std::function<void(TXNID, bool, const DBT *, const DBT *)> m_deadlock_cb;
+
+  friend class lock_request_unit_test;
+};
+// PORT: lock_request is not a POD anymore due to use of toku_external_cond_t
+//  This is ok as the PODness is not really required: lock_request objects are
+//  not moved in memory or anything.
+// ENSURE_POD(lock_request);
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc
new file mode 100644
index 000000000..3d6a590c7
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc
@@ -0,0 +1,1023 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "locktree.h"
+
+#include <memory.h>
+
+#include "../portability/toku_pthread.h"
+#include "../portability/toku_time.h"
+#include "../util/growable_array.h"
+#include "range_buffer.h"
+
+// including the concurrent_tree here expands the templates
+// and "defines" the implementation, so we do it here in
+// the locktree source file instead of the header.
+#include "concurrent_tree.h"
+
+namespace toku {
+// A locktree represents the set of row locks owned by all transactions
+// over an open dictionary. Read and write ranges are represented as
+// a left and right key which are compared with the given descriptor
+// and comparison fn.
+//
+// Each locktree has a reference count which it manages
+// but does nothing based on the value of the reference count - it is
+// up to the user of the locktree to destroy it when it sees fit.
+
+void locktree::create(locktree_manager *mgr, DICTIONARY_ID dict_id,
+                      const comparator &cmp,
+                      toku_external_mutex_factory_t mutex_factory) {
+  m_mgr = mgr;
+  m_dict_id = dict_id;
+
+  m_cmp.create_from(cmp);
+  m_reference_count = 1;
+  m_userdata = nullptr;
+
+  XCALLOC(m_rangetree);
+  m_rangetree->create(&m_cmp);
+
+  m_sto_txnid = TXNID_NONE;
+  m_sto_buffer.create();
+  m_sto_score = STO_SCORE_THRESHOLD;
+  m_sto_end_early_count = 0;
+  m_sto_end_early_time = 0;
+
+  m_escalation_barrier = [](const DBT *, const DBT *, void *) -> bool {
+    return false;
+  };
+
+  m_lock_request_info.init(mutex_factory);
+}
+
+void locktree::set_escalation_barrier_func(
+    lt_escalation_barrier_check_func func, void *extra) {
+  m_escalation_barrier = func;
+  m_escalation_barrier_arg = extra;
+}
+
+void lt_lock_request_info::init(toku_external_mutex_factory_t mutex_factory) {
+  pending_lock_requests.create();
+  pending_is_empty = true;
+  toku_external_mutex_init(mutex_factory, &mutex);
+  retry_want = retry_done = 0;
+  ZERO_STRUCT(counters);
+  ZERO_STRUCT(retry_mutex);
+  toku_mutex_init(locktree_request_info_retry_mutex_key, &retry_mutex, nullptr);
+  toku_cond_init(locktree_request_info_retry_cv_key, &retry_cv, nullptr);
+  running_retry = false;
+
+  TOKU_VALGRIND_HG_DISABLE_CHECKING(&pending_is_empty,
+                                    sizeof(pending_is_empty));
+  TOKU_DRD_IGNORE_VAR(pending_is_empty);
+}
+
+void locktree::destroy(void) {
+  invariant(m_reference_count == 0);
+  invariant(m_lock_request_info.pending_lock_requests.size() == 0);
+  m_cmp.destroy();
+  m_rangetree->destroy();
+  toku_free(m_rangetree);
+  m_sto_buffer.destroy();
+  m_lock_request_info.destroy();
+}
+
+void lt_lock_request_info::destroy(void) {
+  pending_lock_requests.destroy();
+  toku_external_mutex_destroy(&mutex);
+  toku_mutex_destroy(&retry_mutex);
+  toku_cond_destroy(&retry_cv);
+}
+
+void locktree::add_reference(void) {
+  (void)toku_sync_add_and_fetch(&m_reference_count, 1);
+}
+
+uint32_t locktree::release_reference(void) {
+  return toku_sync_sub_and_fetch(&m_reference_count, 1);
+}
+
+uint32_t locktree::get_reference_count(void) { return m_reference_count; }
+
+// a container for a range/txnid pair
+struct row_lock {
+  keyrange range;
+  TXNID txnid;
+  bool is_shared;
+  TxnidVector *owners;
+};
+
+// iterate over a locked keyrange and copy out all of the data,
+// storing each row lock into the given growable array. the
+// caller does not own the range inside the returned row locks,
+// so remove from the tree with care using them as keys.
+static void iterate_and_get_overlapping_row_locks(
+    const concurrent_tree::locked_keyrange *lkr,
+    GrowableArray<row_lock> *row_locks) {
+  struct copy_fn_obj {
+    GrowableArray<row_lock> *row_locks;
+    bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+            TxnidVector *owners) {
+      row_lock lock = {.range = range,
+                       .txnid = txnid,
+                       .is_shared = is_shared,
+                       .owners = owners};
+      row_locks->push(lock);
+      return true;
+    }
+  } copy_fn;
+  copy_fn.row_locks = row_locks;
+  lkr->iterate(&copy_fn);
+}
+
+// given a txnid and a set of overlapping row locks, determine
+// which txnids are conflicting, and store them in the conflicts
+// set, if given.
+static bool determine_conflicting_txnids(
+    const GrowableArray<row_lock> &row_locks, const TXNID &txnid,
+    txnid_set *conflicts) {
+  bool conflicts_exist = false;
+  const size_t num_overlaps = row_locks.get_size();
+  for (size_t i = 0; i < num_overlaps; i++) {
+    const row_lock lock = row_locks.fetch_unchecked(i);
+    const TXNID other_txnid = lock.txnid;
+    if (other_txnid != txnid) {
+      if (conflicts) {
+        if (other_txnid == TXNID_SHARED) {
+          // Add all shared lock owners, except this transaction.
+          for (TXNID shared_id : *lock.owners) {
+            if (shared_id != txnid) conflicts->add(shared_id);
+          }
+        } else {
+          conflicts->add(other_txnid);
+        }
+      }
+      conflicts_exist = true;
+    }
+  }
+  return conflicts_exist;
+}
+
+// how much memory does a row lock take up in a concurrent tree?
+static uint64_t row_lock_size_in_tree(const row_lock &lock) {
+  const uint64_t overhead = concurrent_tree::get_insertion_memory_overhead();
+  return lock.range.get_memory_size() + overhead;
+}
+
+// remove and destroy the given row lock from the locked keyrange,
+// then notify the memory tracker of the newly freed lock.
+static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr,
+                                      const row_lock &lock, TXNID txnid,
+                                      locktree_manager *mgr) {
+  const uint64_t mem_released = row_lock_size_in_tree(lock);
+  lkr->remove(lock.range, txnid);
+  if (mgr != nullptr) {
+    mgr->note_mem_released(mem_released);
+  }
+}
+
+// insert a row lock into the locked keyrange, then notify
+// the memory tracker of this newly acquired lock.
+static void insert_row_lock_into_tree(concurrent_tree::locked_keyrange *lkr,
+                                      const row_lock &lock,
+                                      locktree_manager *mgr) {
+  uint64_t mem_used = row_lock_size_in_tree(lock);
+  lkr->insert(lock.range, lock.txnid, lock.is_shared);
+  if (mgr != nullptr) {
+    mgr->note_mem_used(mem_used);
+  }
+}
+
+void locktree::sto_begin(TXNID txnid) {
+  invariant(m_sto_txnid == TXNID_NONE);
+  invariant(m_sto_buffer.is_empty());
+  m_sto_txnid = txnid;
+}
+
+void locktree::sto_append(const DBT *left_key, const DBT *right_key,
+                          bool is_write_request) {
+  uint64_t buffer_mem, delta;
+
+  // psergey: the below two lines do not make any sense
+  // (and it's the same in upstream TokuDB)
+  keyrange range;
+  range.create(left_key, right_key);
+
+  buffer_mem = m_sto_buffer.total_memory_size();
+  m_sto_buffer.append(left_key, right_key, is_write_request);
+  delta = m_sto_buffer.total_memory_size() - buffer_mem;
+  if (m_mgr != nullptr) {
+    m_mgr->note_mem_used(delta);
+  }
+}
+
+void locktree::sto_end(void) {
+  uint64_t mem_size = m_sto_buffer.total_memory_size();
+  if (m_mgr != nullptr) {
+    m_mgr->note_mem_released(mem_size);
+  }
+  m_sto_buffer.destroy();
+  m_sto_buffer.create();
+  m_sto_txnid = TXNID_NONE;
+}
+
+void locktree::sto_end_early_no_accounting(void *prepared_lkr) {
+  sto_migrate_buffer_ranges_to_tree(prepared_lkr);
+  sto_end();
+  toku_unsafe_set(m_sto_score, 0);
+}
+
+void locktree::sto_end_early(void *prepared_lkr) {
+  m_sto_end_early_count++;
+
+  tokutime_t t0 = toku_time_now();
+  sto_end_early_no_accounting(prepared_lkr);
+  tokutime_t t1 = toku_time_now();
+
+  m_sto_end_early_time += (t1 - t0);
+}
+
+void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) {
+  // There should be something to migrate, and nothing in the rangetree.
+  invariant(!m_sto_buffer.is_empty());
+  invariant(m_rangetree->is_empty());
+
+  concurrent_tree sto_rangetree;
+  concurrent_tree::locked_keyrange sto_lkr;
+  sto_rangetree.create(&m_cmp);
+
+  // insert all of the ranges from the single txnid buffer into a new rangtree
+  range_buffer::iterator iter(&m_sto_buffer);
+  range_buffer::iterator::record rec;
+  while (iter.current(&rec)) {
+    sto_lkr.prepare(&sto_rangetree);
+    int r = acquire_lock_consolidated(&sto_lkr, m_sto_txnid, rec.get_left_key(),
+                                      rec.get_right_key(),
+                                      rec.get_exclusive_flag(), nullptr);
+    invariant_zero(r);
+    sto_lkr.release();
+    iter.next();
+  }
+
+  // Iterate the newly created rangetree and insert each range into the
+  // locktree's rangetree, on behalf of the old single txnid.
+  struct migrate_fn_obj {
+    concurrent_tree::locked_keyrange *dst_lkr;
+    bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+            TxnidVector *owners) {
+      // There can't be multiple owners in STO mode
+      invariant_zero(owners);
+      dst_lkr->insert(range, txnid, is_shared);
+      return true;
+    }
+  } migrate_fn;
+  migrate_fn.dst_lkr =
+      static_cast<concurrent_tree::locked_keyrange *>(prepared_lkr);
+  sto_lkr.prepare(&sto_rangetree);
+  sto_lkr.iterate(&migrate_fn);
+  sto_lkr.remove_all();
+  sto_lkr.release();
+  sto_rangetree.destroy();
+  invariant(!m_rangetree->is_empty());
+}
+
+bool locktree::sto_try_acquire(void *prepared_lkr, TXNID txnid,
+                               const DBT *left_key, const DBT *right_key,
+                               bool is_write_request) {
+  if (m_rangetree->is_empty() && m_sto_buffer.is_empty() &&
+      toku_unsafe_fetch(m_sto_score) >= STO_SCORE_THRESHOLD) {
+    // We can do the optimization because the rangetree is empty, and
+    // we know its worth trying because the sto score is big enough.
+    sto_begin(txnid);
+  } else if (m_sto_txnid != TXNID_NONE) {
+    // We are currently doing the optimization. Check if we need to cancel
+    // it because a new txnid appeared, or if the current single txnid has
+    // taken too many locks already.
+    if (m_sto_txnid != txnid ||
+        m_sto_buffer.get_num_ranges() > STO_BUFFER_MAX_SIZE) {
+      sto_end_early(prepared_lkr);
+    }
+  }
+
+  // At this point the sto txnid is properly set. If it is valid, then
+  // this txnid can append its lock to the sto buffer successfully.
+  if (m_sto_txnid != TXNID_NONE) {
+    invariant(m_sto_txnid == txnid);
+    sto_append(left_key, right_key, is_write_request);
+    return true;
+  } else {
+    invariant(m_sto_buffer.is_empty());
+    return false;
+  }
+}
+
+/*
+  Do the same as iterate_and_get_overlapping_row_locks does, but also check for
+  this:
+    The set of overlapping rows locks consists of just one read-only shared
+    lock with the same endpoints as specified (in that case, we can just add
+    ourselves into that list)
+
+  @return true - One compatible shared lock
+         false - Otherwise
+*/
+static bool iterate_and_get_overlapping_row_locks2(
+    const concurrent_tree::locked_keyrange *lkr, const DBT *left_key,
+    const DBT *right_key, comparator *cmp, TXNID,
+    GrowableArray<row_lock> *row_locks) {
+  struct copy_fn_obj {
+    GrowableArray<row_lock> *row_locks;
+    bool first_call = true;
+    bool matching_lock_found = false;
+    const DBT *left_key, *right_key;
+    comparator *cmp;
+
+    bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+            TxnidVector *owners) {
+      if (first_call) {
+        first_call = false;
+        if (is_shared && !(*cmp)(left_key, range.get_left_key()) &&
+            !(*cmp)(right_key, range.get_right_key())) {
+          matching_lock_found = true;
+        }
+      } else {
+        // if we see multiple matching locks, it doesn't matter whether
+        // the first one was matching.
+        matching_lock_found = false;
+      }
+      row_lock lock = {.range = range,
+                       .txnid = txnid,
+                       .is_shared = is_shared,
+                       .owners = owners};
+      row_locks->push(lock);
+      return true;
+    }
+  } copy_fn;
+  copy_fn.row_locks = row_locks;
+  copy_fn.left_key = left_key;
+  copy_fn.right_key = right_key;
+  copy_fn.cmp = cmp;
+  lkr->iterate(&copy_fn);
+  return copy_fn.matching_lock_found;
+}
+
+// try to acquire a lock and consolidate it with existing locks if possible
+// param: lkr, a prepared locked keyrange
+// return: 0 on success, DB_LOCK_NOTGRANTED if conflicting locks exist.
+int locktree::acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
+                                        const DBT *left_key,
+                                        const DBT *right_key,
+                                        bool is_write_request,
+                                        txnid_set *conflicts) {
+  int r = 0;
+  concurrent_tree::locked_keyrange *lkr;
+
+  keyrange requested_range;
+  requested_range.create(left_key, right_key);
+  lkr = static_cast<concurrent_tree::locked_keyrange *>(prepared_lkr);
+  lkr->acquire(requested_range);
+
+  // copy out the set of overlapping row locks.
+  GrowableArray<row_lock> overlapping_row_locks;
+  overlapping_row_locks.init();
+  bool matching_shared_lock_found = false;
+
+  if (is_write_request)
+    iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks);
+  else {
+    matching_shared_lock_found = iterate_and_get_overlapping_row_locks2(
+        lkr, left_key, right_key, &m_cmp, txnid, &overlapping_row_locks);
+    // psergey-todo: what to do now? So, we have figured we have just one
+    // shareable lock. Need to add us into it as an owner but the lock
+    // pointer cannot be kept?
+    // A: use find_node_with_overlapping_child(key_range, nullptr);
+    //  then, add ourselves to the owner list.
+    // Dont' foreget to release the subtree after that.
+  }
+
+  if (matching_shared_lock_found) {
+    // there is just one non-confliting matching shared lock.
+    //  we are hilding a lock on it (see acquire() call above).
+    //  we need to modify it to indicate there is another locker...
+    if (lkr->add_shared_owner(requested_range, txnid)) {
+      // Pretend shared lock uses as much memory.
+      row_lock new_lock = {.range = requested_range,
+                           .txnid = txnid,
+                           .is_shared = false,
+                           .owners = nullptr};
+      uint64_t mem_used = row_lock_size_in_tree(new_lock);
+      if (m_mgr) {
+        m_mgr->note_mem_used(mem_used);
+      }
+    }
+    requested_range.destroy();
+    overlapping_row_locks.deinit();
+    return 0;
+  }
+
+  size_t num_overlapping_row_locks = overlapping_row_locks.get_size();
+
+  // if any overlapping row locks conflict with this request, bail out.
+
+  bool conflicts_exist =
+      determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts);
+  if (!conflicts_exist) {
+    // there are no conflicts, so all of the overlaps are for the requesting
+    // txnid. so, we must consolidate all existing overlapping ranges and the
+    // requested range into one dominating range. then we insert the dominating
+    // range.
+    bool all_shared = !is_write_request;
+    for (size_t i = 0; i < num_overlapping_row_locks; i++) {
+      row_lock overlapping_lock = overlapping_row_locks.fetch_unchecked(i);
+      invariant(overlapping_lock.txnid == txnid);
+      requested_range.extend(m_cmp, overlapping_lock.range);
+      remove_row_lock_from_tree(lkr, overlapping_lock, TXNID_ANY, m_mgr);
+      all_shared = all_shared && overlapping_lock.is_shared;
+    }
+
+    row_lock new_lock = {.range = requested_range,
+                         .txnid = txnid,
+                         .is_shared = all_shared,
+                         .owners = nullptr};
+    insert_row_lock_into_tree(lkr, new_lock, m_mgr);
+  } else {
+    r = DB_LOCK_NOTGRANTED;
+  }
+
+  requested_range.destroy();
+  overlapping_row_locks.deinit();
+  return r;
+}
+
+// acquire a lock in the given key range, inclusive. if successful,
+// return 0. otherwise, populate the conflicts txnid_set with the set of
+// transactions that conflict with this request.
+int locktree::acquire_lock(bool is_write_request, TXNID txnid,
+                           const DBT *left_key, const DBT *right_key,
+                           txnid_set *conflicts) {
+  int r = 0;
+
+  // we are only supporting write locks for simplicity
+  // invariant(is_write_request);
+
+  // acquire and prepare a locked keyrange over the requested range.
+  // prepare is a serialzation point, so we take the opportunity to
+  // try the single txnid optimization first.
+  concurrent_tree::locked_keyrange lkr;
+  lkr.prepare(m_rangetree);
+
+  bool acquired =
+      sto_try_acquire(&lkr, txnid, left_key, right_key, is_write_request);
+  if (!acquired) {
+    r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key,
+                                  is_write_request, conflicts);
+  }
+
+  lkr.release();
+  return r;
+}
+
+int locktree::try_acquire_lock(bool is_write_request, TXNID txnid,
+                               const DBT *left_key, const DBT *right_key,
+                               txnid_set *conflicts, bool big_txn) {
+  // All ranges in the locktree must have left endpoints <= right endpoints.
+  // Range comparisons rely on this fact, so we make a paranoid invariant here.
+  paranoid_invariant(m_cmp(left_key, right_key) <= 0);
+  int r = m_mgr == nullptr ? 0 : m_mgr->check_current_lock_constraints(big_txn);
+  if (r == 0) {
+    r = acquire_lock(is_write_request, txnid, left_key, right_key, conflicts);
+  }
+  return r;
+}
+
+// the locktree silently upgrades read locks to write locks for simplicity
+int locktree::acquire_read_lock(TXNID txnid, const DBT *left_key,
+                                const DBT *right_key, txnid_set *conflicts,
+                                bool big_txn) {
+  return try_acquire_lock(false, txnid, left_key, right_key, conflicts,
+                          big_txn);
+}
+
+int locktree::acquire_write_lock(TXNID txnid, const DBT *left_key,
+                                 const DBT *right_key, txnid_set *conflicts,
+                                 bool big_txn) {
+  return try_acquire_lock(true, txnid, left_key, right_key, conflicts, big_txn);
+}
+
+// typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right,
+// TXNID txnid);
+void locktree::dump_locks(void *cdata, dump_callback cb) {
+  concurrent_tree::locked_keyrange lkr;
+  keyrange range;
+  range.create(toku_dbt_negative_infinity(), toku_dbt_positive_infinity());
+
+  lkr.prepare(m_rangetree);
+  lkr.acquire(range);
+
+  TXNID sto_txn;
+  if ((sto_txn = toku_unsafe_fetch(m_sto_txnid)) != TXNID_NONE) {
+    // insert all of the ranges from the single txnid buffer into a new rangtree
+    range_buffer::iterator iter(&m_sto_buffer);
+    range_buffer::iterator::record rec;
+    while (iter.current(&rec)) {
+      (*cb)(cdata, rec.get_left_key(), rec.get_right_key(), sto_txn,
+            !rec.get_exclusive_flag(), nullptr);
+      iter.next();
+    }
+  } else {
+    GrowableArray<row_lock> all_locks;
+    all_locks.init();
+    iterate_and_get_overlapping_row_locks(&lkr, &all_locks);
+
+    const size_t n_locks = all_locks.get_size();
+    for (size_t i = 0; i < n_locks; i++) {
+      const row_lock lock = all_locks.fetch_unchecked(i);
+      (*cb)(cdata, lock.range.get_left_key(), lock.range.get_right_key(),
+            lock.txnid, lock.is_shared, lock.owners);
+    }
+    all_locks.deinit();
+  }
+  lkr.release();
+  range.destroy();
+}
+
+void locktree::get_conflicts(bool is_write_request, TXNID txnid,
+                             const DBT *left_key, const DBT *right_key,
+                             txnid_set *conflicts) {
+  // because we only support write locks, ignore this bit for now.
+  (void)is_write_request;
+
+  // preparing and acquire a locked keyrange over the range
+  keyrange range;
+  range.create(left_key, right_key);
+  concurrent_tree::locked_keyrange lkr;
+  lkr.prepare(m_rangetree);
+  lkr.acquire(range);
+
+  // copy out the set of overlapping row locks and determine the conflicts
+  GrowableArray<row_lock> overlapping_row_locks;
+  overlapping_row_locks.init();
+  iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks);
+
+  // we don't care if conflicts exist. we just want the conflicts set populated.
+  (void)determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts);
+
+  lkr.release();
+  overlapping_row_locks.deinit();
+  range.destroy();
+}
+
+// Effect:
+//  For each range in the lock tree that overlaps the given range and has
+//  the given txnid, remove it.
+// Rationale:
+//  In the common case, there is only the range [left_key, right_key] and
+//  it is associated with txnid, so this is a single tree delete.
+//
+//  However, consolidation and escalation change the objects in the tree
+//  without telling the txn anything.  In this case, the txn may own a
+//  large range lock that represents its ownership of many smaller range
+//  locks.  For example, the txn may think it owns point locks on keys 1,
+//  2, and 3, but due to escalation, only the object [1,3] exists in the
+//  tree.
+//
+//  The first call for a small lock will remove the large range lock, and
+//  the rest of the calls should do nothing.  After the first release,
+//  another thread can acquire one of the locks that the txn thinks it
+//  still owns.  That's ok, because the txn doesn't want it anymore (it
+//  unlocks everything at once), but it may find a lock that it does not
+//  own.
+//
+//  In our example, the txn unlocks key 1, which actually removes the
+//  whole lock [1,3].  Now, someone else can lock 2 before our txn gets
+//  around to unlocking 2, so we should not remove that lock.
+void locktree::remove_overlapping_locks_for_txnid(TXNID txnid,
+                                                  const DBT *left_key,
+                                                  const DBT *right_key) {
+  keyrange release_range;
+  release_range.create(left_key, right_key);
+
+  // acquire and prepare a locked keyrange over the release range
+  concurrent_tree::locked_keyrange lkr;
+  lkr.prepare(m_rangetree);
+  lkr.acquire(release_range);
+
+  // copy out the set of overlapping row locks.
+  GrowableArray<row_lock> overlapping_row_locks;
+  overlapping_row_locks.init();
+  iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks);
+  size_t num_overlapping_row_locks = overlapping_row_locks.get_size();
+
+  for (size_t i = 0; i < num_overlapping_row_locks; i++) {
+    row_lock lock = overlapping_row_locks.fetch_unchecked(i);
+    // If this isn't our lock, that's ok, just don't remove it.
+    // See rationale above.
+    // psergey-todo: for shared locks, just remove ourselves from the
+    //               owners.
+    if (lock.txnid == txnid || (lock.owners && lock.owners->contains(txnid))) {
+      remove_row_lock_from_tree(&lkr, lock, txnid, m_mgr);
+    }
+  }
+
+  lkr.release();
+  overlapping_row_locks.deinit();
+  release_range.destroy();
+}
+
+bool locktree::sto_txnid_is_valid_unsafe(void) const {
+  return toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE;
+}
+
+int locktree::sto_get_score_unsafe(void) const {
+  return toku_unsafe_fetch(m_sto_score);
+}
+
+bool locktree::sto_try_release(TXNID txnid) {
+  bool released = false;
+  if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) {
+    // check the bit again with a prepared locked keyrange,
+    // which protects the optimization bits and rangetree data
+    concurrent_tree::locked_keyrange lkr;
+    lkr.prepare(m_rangetree);
+    if (m_sto_txnid != TXNID_NONE) {
+      // this txnid better be the single txnid on this locktree,
+      // or else we are in big trouble (meaning the logic is broken)
+      invariant(m_sto_txnid == txnid);
+      invariant(m_rangetree->is_empty());
+      sto_end();
+      released = true;
+    }
+    lkr.release();
+  }
+  return released;
+}
+
+// release all of the locks for a txnid whose endpoints are pairs
+// in the given range buffer.
+void locktree::release_locks(TXNID txnid, const range_buffer *ranges,
+                             bool all_trx_locks_hint) {
+  // try the single txn optimization. if it worked, then all of the
+  // locks are already released, otherwise we need to do it here.
+  bool released;
+  if (all_trx_locks_hint) {
+    // This will release all of the locks the transaction is holding
+    released = sto_try_release(txnid);
+  } else {
+    /*
+      psergey: we are asked to release *Some* of the locks the transaction
+      is holding.
+      We could try doing that without leaving the STO mode, but right now,
+      the easiest way is to exit the STO mode and let the non-STO code path
+      handle it.
+    */
+    if (toku_unsafe_fetch(m_sto_txnid) != TXNID_NONE) {
+      // check the bit again with a prepared locked keyrange,
+      // which protects the optimization bits and rangetree data
+      concurrent_tree::locked_keyrange lkr;
+      lkr.prepare(m_rangetree);
+      if (m_sto_txnid != TXNID_NONE) {
+        sto_end_early(&lkr);
+      }
+      lkr.release();
+    }
+    released = false;
+  }
+  if (!released) {
+    range_buffer::iterator iter(ranges);
+    range_buffer::iterator::record rec;
+    while (iter.current(&rec)) {
+      const DBT *left_key = rec.get_left_key();
+      const DBT *right_key = rec.get_right_key();
+      // All ranges in the locktree must have left endpoints <= right endpoints.
+      // Range comparisons rely on this fact, so we make a paranoid invariant
+      // here.
+      paranoid_invariant(m_cmp(left_key, right_key) <= 0);
+      remove_overlapping_locks_for_txnid(txnid, left_key, right_key);
+      iter.next();
+    }
+    // Increase the sto score slightly. Eventually it will hit
+    // the threshold and we'll try the optimization again. This
+    // is how a previously multithreaded system transitions into
+    // a single threaded system that benefits from the optimization.
+    if (toku_unsafe_fetch(m_sto_score) < STO_SCORE_THRESHOLD) {
+      toku_sync_fetch_and_add(&m_sto_score, 1);
+    }
+  }
+}
+
+// iterate over a locked keyrange and extract copies of the first N
+// row locks, storing each one into the given array of size N,
+// then removing each extracted lock from the locked keyrange.
+static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr,
+                                     locktree_manager *mgr, row_lock *row_locks,
+                                     int num_to_extract) {
+  struct extract_fn_obj {
+    int num_extracted;
+    int num_to_extract;
+    row_lock *row_locks;
+    bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+            TxnidVector *owners) {
+      if (num_extracted < num_to_extract) {
+        row_lock lock;
+        lock.range.create_copy(range);
+        lock.txnid = txnid;
+        lock.is_shared = is_shared;
+        // deep-copy the set of owners:
+        if (owners)
+          lock.owners = new TxnidVector(*owners);
+        else
+          lock.owners = nullptr;
+        row_locks[num_extracted++] = lock;
+        return true;
+      } else {
+        return false;
+      }
+    }
+  } extract_fn;
+
+  extract_fn.row_locks = row_locks;
+  extract_fn.num_to_extract = num_to_extract;
+  extract_fn.num_extracted = 0;
+  lkr->iterate(&extract_fn);
+
+  // now that the ranges have been copied out, complete
+  // the extraction by removing the ranges from the tree.
+  // use remove_row_lock_from_tree() so we properly track the
+  // amount of memory and number of locks freed.
+  int num_extracted = extract_fn.num_extracted;
+  invariant(num_extracted <= num_to_extract);
+  for (int i = 0; i < num_extracted; i++) {
+    remove_row_lock_from_tree(lkr, row_locks[i], TXNID_ANY, mgr);
+  }
+
+  return num_extracted;
+}
+
+// Store each newly escalated lock in a range buffer for appropriate txnid.
+// We'll rebuild the locktree by iterating over these ranges, and then we
+// can pass back each txnid/buffer pair individually through a callback
+// to notify higher layers that locks have changed.
+struct txnid_range_buffer {
+  TXNID txnid;
+  range_buffer buffer;
+
+  static int find_by_txnid(struct txnid_range_buffer *const &other_buffer,
+                           const TXNID &txnid) {
+    if (txnid < other_buffer->txnid) {
+      return -1;
+    } else if (other_buffer->txnid == txnid) {
+      return 0;
+    } else {
+      return 1;
+    }
+  }
+};
+
+// escalate the locks in the locktree by merging adjacent
+// locks that have the same txnid into one larger lock.
+//
+// if there's only one txnid in the locktree then this
+// approach works well. if there are many txnids and each
+// has locks in a random/alternating order, then this does
+// not work so well.
+void locktree::escalate(lt_escalate_cb after_escalate_callback,
+                        void *after_escalate_callback_extra) {
+  omt<struct txnid_range_buffer *, struct txnid_range_buffer *> range_buffers;
+  range_buffers.create();
+
+  // prepare and acquire a locked keyrange on the entire locktree
+  concurrent_tree::locked_keyrange lkr;
+  keyrange infinite_range = keyrange::get_infinite_range();
+  lkr.prepare(m_rangetree);
+  lkr.acquire(infinite_range);
+
+  // if we're in the single txnid optimization, simply call it off.
+  // if you have to run escalation, you probably don't care about
+  // the optimization anyway, and this makes things easier.
+  if (m_sto_txnid != TXNID_NONE) {
+    // We are already accounting for this escalation time and
+    // count, so don't do it for sto_end_early too.
+    sto_end_early_no_accounting(&lkr);
+  }
+
+  // extract and remove batches of row locks from the locktree
+  int num_extracted;
+  const int num_row_locks_per_batch = 128;
+  row_lock *XCALLOC_N(num_row_locks_per_batch, extracted_buf);
+
+  // we always remove the "first" n because we are removing n
+  // each time we do an extraction. so this loops until its empty.
+  while ((num_extracted = extract_first_n_row_locks(
+              &lkr, m_mgr, extracted_buf, num_row_locks_per_batch)) > 0) {
+    int current_index = 0;
+    while (current_index < num_extracted) {
+      // every batch of extracted locks is in range-sorted order. search
+      // through them and merge adjacent locks with the same txnid into
+      // one dominating lock and save it to a set of escalated locks.
+      //
+      // first, find the index of the next row lock that
+      //  - belongs to a different txnid, or
+      //  - belongs to several txnids, or
+      //  - is a shared lock (we could potentially merge those but
+      //    currently we don't), or
+      //  - is across a lock escalation barrier.
+      int next_txnid_index = current_index + 1;
+
+      while (next_txnid_index < num_extracted &&
+             (extracted_buf[current_index].txnid ==
+              extracted_buf[next_txnid_index].txnid) &&
+             !extracted_buf[next_txnid_index].is_shared &&
+             !extracted_buf[next_txnid_index].owners &&
+             !m_escalation_barrier(
+                 extracted_buf[current_index].range.get_right_key(),
+                 extracted_buf[next_txnid_index].range.get_left_key(),
+                 m_escalation_barrier_arg)) {
+        next_txnid_index++;
+      }
+
+      // Create an escalated range for the current txnid that dominates
+      // each range between the current indext and the next txnid's index.
+      // const TXNID current_txnid = extracted_buf[current_index].txnid;
+      const DBT *escalated_left_key =
+          extracted_buf[current_index].range.get_left_key();
+      const DBT *escalated_right_key =
+          extracted_buf[next_txnid_index - 1].range.get_right_key();
+
+      // Try to find a range buffer for the current txnid. Create one if it
+      // doesn't exist. Then, append the new escalated range to the buffer. (If
+      // a lock is shared by multiple txnids, append it each of txnid's lists)
+      TxnidVector *owners_ptr;
+      TxnidVector singleton_owner;
+      if (extracted_buf[current_index].owners)
+        owners_ptr = extracted_buf[current_index].owners;
+      else {
+        singleton_owner.insert(extracted_buf[current_index].txnid);
+        owners_ptr = &singleton_owner;
+      }
+
+      for (auto cur_txnid : *owners_ptr) {
+        uint32_t idx;
+        struct txnid_range_buffer *existing_range_buffer;
+        int r =
+            range_buffers.find_zero<TXNID, txnid_range_buffer::find_by_txnid>(
+                cur_txnid, &existing_range_buffer, &idx);
+        if (r == DB_NOTFOUND) {
+          struct txnid_range_buffer *XMALLOC(new_range_buffer);
+          new_range_buffer->txnid = cur_txnid;
+          new_range_buffer->buffer.create();
+          new_range_buffer->buffer.append(
+              escalated_left_key, escalated_right_key,
+              !extracted_buf[current_index].is_shared);
+          range_buffers.insert_at(new_range_buffer, idx);
+        } else {
+          invariant_zero(r);
+          invariant(existing_range_buffer->txnid == cur_txnid);
+          existing_range_buffer->buffer.append(
+              escalated_left_key, escalated_right_key,
+              !extracted_buf[current_index].is_shared);
+        }
+      }
+
+      current_index = next_txnid_index;
+    }
+
+    // destroy the ranges copied during the extraction
+    for (int i = 0; i < num_extracted; i++) {
+      delete extracted_buf[i].owners;
+      extracted_buf[i].range.destroy();
+    }
+  }
+  toku_free(extracted_buf);
+
+  // Rebuild the locktree from each range in each range buffer,
+  // then notify higher layers that the txnid's locks have changed.
+  //
+  // (shared locks: if a lock was initially shared between transactions TRX1,
+  //  TRX2, etc, we will now try to acquire it acting on behalf on TRX1, on
+  //  TRX2, etc.  This will succeed and an identical shared lock will be
+  //  constructed)
+
+  invariant(m_rangetree->is_empty());
+  const uint32_t num_range_buffers = range_buffers.size();
+  for (uint32_t i = 0; i < num_range_buffers; i++) {
+    struct txnid_range_buffer *current_range_buffer;
+    int r = range_buffers.fetch(i, &current_range_buffer);
+    invariant_zero(r);
+    if (r == EINVAL)  // Shouldn't happen, avoid compiler warning
+      continue;
+
+    const TXNID current_txnid = current_range_buffer->txnid;
+    range_buffer::iterator iter(&current_range_buffer->buffer);
+    range_buffer::iterator::record rec;
+    while (iter.current(&rec)) {
+      keyrange range;
+      range.create(rec.get_left_key(), rec.get_right_key());
+      row_lock lock = {.range = range,
+                       .txnid = current_txnid,
+                       .is_shared = !rec.get_exclusive_flag(),
+                       .owners = nullptr};
+      insert_row_lock_into_tree(&lkr, lock, m_mgr);
+      iter.next();
+    }
+
+    // Notify higher layers that locks have changed for the current txnid
+    if (after_escalate_callback) {
+      after_escalate_callback(current_txnid, this, current_range_buffer->buffer,
+                              after_escalate_callback_extra);
+    }
+    current_range_buffer->buffer.destroy();
+  }
+
+  while (range_buffers.size() > 0) {
+    struct txnid_range_buffer *buffer;
+    int r = range_buffers.fetch(0, &buffer);
+    invariant_zero(r);
+    r = range_buffers.delete_at(0);
+    invariant_zero(r);
+    toku_free(buffer);
+  }
+  range_buffers.destroy();
+
+  lkr.release();
+}
+
+void *locktree::get_userdata(void) const { return m_userdata; }
+
+void locktree::set_userdata(void *userdata) { m_userdata = userdata; }
+
+struct lt_lock_request_info *locktree::get_lock_request_info(void) {
+  return &m_lock_request_info;
+}
+
+void locktree::set_comparator(const comparator &cmp) { m_cmp.inherit(cmp); }
+
+locktree_manager *locktree::get_manager(void) const { return m_mgr; }
+
+int locktree::compare(const locktree *lt) const {
+  if (m_dict_id.dictid < lt->m_dict_id.dictid) {
+    return -1;
+  } else if (m_dict_id.dictid == lt->m_dict_id.dictid) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+DICTIONARY_ID locktree::get_dict_id() const { return m_dict_id; }
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h
new file mode 100644
index 000000000..f0f4b042d
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h
@@ -0,0 +1,580 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <atomic>
+
+#include "../db.h"
+#include "../ft/comparator.h"
+#include "../portability/toku_external_pthread.h"
+#include "../portability/toku_pthread.h"
+#include "../portability/toku_time.h"
+// PORT #include <ft/ft-ops.h>  // just for DICTIONARY_ID..
+// PORT: ft-status for LTM_STATUS:
+#include "../ft/ft-status.h"
+
+struct DICTIONARY_ID {
+  uint64_t dictid;
+};
+
+#include "../util/omt.h"
+#include "range_buffer.h"
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+class locktree;
+class locktree_manager;
+class lock_request;
+class concurrent_tree;
+
+typedef int (*lt_create_cb)(locktree *lt, void *extra);
+typedef void (*lt_destroy_cb)(locktree *lt);
+typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt,
+                               const range_buffer &buffer, void *extra);
+
+typedef bool (*lt_escalation_barrier_check_func)(const DBT *a, const DBT *b,
+                                                 void *extra);
+
+struct lt_counters {
+  uint64_t wait_count, wait_time;
+  uint64_t long_wait_count, long_wait_time;
+  uint64_t timeout_count;
+
+  void add(const lt_counters &rhs) {
+    wait_count += rhs.wait_count;
+    wait_time += rhs.wait_time;
+    long_wait_count += rhs.long_wait_count;
+    long_wait_time += rhs.long_wait_time;
+    timeout_count += rhs.timeout_count;
+  }
+};
+
+// Lock request state for some locktree
+struct lt_lock_request_info {
+  omt<lock_request *> pending_lock_requests;
+  std::atomic_bool pending_is_empty;
+  toku_external_mutex_t mutex;
+  bool should_retry_lock_requests;
+  lt_counters counters;
+  std::atomic_ullong retry_want;
+  unsigned long long retry_done;
+  toku_mutex_t retry_mutex;
+  toku_cond_t retry_cv;
+  bool running_retry;
+
+  void init(toku_external_mutex_factory_t mutex_factory);
+  void destroy(void);
+};
+
+// The locktree manager manages a set of locktrees, one for each open
+// dictionary. Locktrees are retrieved from the manager. When they are no
+// longer needed, they are be released by the user.
+class locktree_manager {
+ public:
+  // param: create_cb, called just after a locktree is first created.
+  //        destroy_cb, called just before a locktree is destroyed.
+  //        escalate_cb, called after a locktree is escalated (with extra
+  //        param)
+  void create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
+              lt_escalate_cb escalate_cb, void *extra,
+              toku_external_mutex_factory_t mutex_factory_arg);
+
+  void destroy(void);
+
+  size_t get_max_lock_memory(void);
+
+  int set_max_lock_memory(size_t max_lock_memory);
+
+  // effect: Get a locktree from the manager. If a locktree exists with the
+  // given
+  //         dict_id, it is referenced and then returned. If one did not exist,
+  //         it is created. It will use the comparator for comparing keys. The
+  //         on_create callback (passed to locktree_manager::create()) will be
+  //         called with the given extra parameter.
+  locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
+                   void *on_create_extra);
+
+  void reference_lt(locktree *lt);
+
+  // effect: Releases one reference on a locktree. If the reference count
+  // transitions
+  //         to zero, the on_destroy callback is called before it gets
+  //         destroyed.
+  void release_lt(locktree *lt);
+
+  void get_status(LTM_STATUS status);
+
+  // effect: calls the iterate function on each pending lock request
+  // note: holds the manager's mutex
+  typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
+                                               TXNID txnid, const DBT *left_key,
+                                               const DBT *right_key,
+                                               TXNID blocking_txnid,
+                                               uint64_t start_time,
+                                               void *extra);
+  int iterate_pending_lock_requests(lock_request_iterate_callback cb,
+                                    void *extra);
+
+  // effect: Determines if too many locks or too much memory is being used,
+  //         Runs escalation on the manager if so.
+  // param: big_txn, if the current transaction is 'big' (has spilled rollback
+  // logs) returns: 0 if there enough resources to create a new lock, or
+  // TOKUDB_OUT_OF_LOCKS
+  //          if there are not enough resources and lock escalation failed to
+  //          free up enough resources for a new lock.
+  int check_current_lock_constraints(bool big_txn);
+
+  bool over_big_threshold(void);
+
+  void note_mem_used(uint64_t mem_used);
+
+  void note_mem_released(uint64_t mem_freed);
+
+  bool out_of_locks(void) const;
+
+  // Escalate all locktrees
+  void escalate_all_locktrees(void);
+
+  // Escalate a set of locktrees
+  void escalate_locktrees(locktree **locktrees, int num_locktrees);
+
+  // effect: calls the private function run_escalation(), only ok to
+  //         do for tests.
+  // rationale: to get better stress test coverage, we want a way to
+  //            deterministicly trigger lock escalation.
+  void run_escalation_for_test(void);
+  void run_escalation(void);
+
+  // Add time t to the escalator's wait time statistics
+  void add_escalator_wait_time(uint64_t t);
+
+  void kill_waiter(void *extra);
+
+ private:
+  static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;
+
+  // tracks the current number of locks and lock memory
+  uint64_t m_max_lock_memory;
+  uint64_t m_current_lock_memory;
+
+  struct lt_counters m_lt_counters;
+
+  // the create and destroy callbacks for the locktrees
+  lt_create_cb m_lt_create_callback;
+  lt_destroy_cb m_lt_destroy_callback;
+  lt_escalate_cb m_lt_escalate_callback;
+  void *m_lt_escalate_callback_extra;
+
+  omt<locktree *> m_locktree_map;
+
+  toku_external_mutex_factory_t mutex_factory;
+
+  // the manager's mutex protects the locktree map
+  toku_mutex_t m_mutex;
+
+  void mutex_lock(void);
+
+  void mutex_unlock(void);
+
+  // Manage the set of open locktrees
+  locktree *locktree_map_find(const DICTIONARY_ID &dict_id);
+  void locktree_map_put(locktree *lt);
+  void locktree_map_remove(locktree *lt);
+
+  static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);
+
+  void escalator_init(void);
+  void escalator_destroy(void);
+
+  // statistics about lock escalation.
+  toku_mutex_t m_escalation_mutex;
+  uint64_t m_escalation_count;
+  tokutime_t m_escalation_time;
+  uint64_t m_escalation_latest_result;
+  uint64_t m_wait_escalation_count;
+  uint64_t m_wait_escalation_time;
+  uint64_t m_long_wait_escalation_count;
+  uint64_t m_long_wait_escalation_time;
+
+  // the escalator coordinates escalation on a set of locktrees for a bunch of
+  // threads
+  class locktree_escalator {
+   public:
+    void create(void);
+    void destroy(void);
+    void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
+             void *extra);
+
+   private:
+    toku_mutex_t m_escalator_mutex;
+    toku_cond_t m_escalator_done;
+    bool m_escalator_running;
+  };
+
+  locktree_escalator m_escalator;
+
+  friend class manager_unit_test;
+};
+
+// A locktree represents the set of row locks owned by all transactions
+// over an open dictionary. Read and write ranges are represented as
+// a left and right key which are compared with the given comparator
+//
+// Locktrees are not created and destroyed by the user. Instead, they are
+// referenced and released using the locktree manager.
+//
+// A sample workflow looks like this:
+// - Create a manager.
+// - Get a locktree by dictionaroy id from the manager.
+// - Perform read/write lock acquision on the locktree, add references to
+//   the locktree using the manager, release locks, release references, etc.
+// - ...
+// - Release the final reference to the locktree. It will be destroyed.
+// - Destroy the manager.
+class locktree {
+ public:
+  // effect: Creates a locktree
+  void create(locktree_manager *mgr, DICTIONARY_ID dict_id,
+              const comparator &cmp,
+              toku_external_mutex_factory_t mutex_factory);
+
+  void destroy(void);
+
+  // For thread-safe, external reference counting
+  void add_reference(void);
+
+  // requires: the reference count is > 0
+  // returns: the reference count, after decrementing it by one
+  uint32_t release_reference(void);
+
+  // returns: the current reference count
+  uint32_t get_reference_count(void);
+
+  // effect: Attempts to grant a read lock for the range of keys between
+  // [left_key, right_key]. returns: If the lock cannot be granted, return
+  // DB_LOCK_NOTGRANTED, and populate the
+  //          given conflicts set with the txnids that hold conflicting locks in
+  //          the range. If the locktree cannot create more locks, return
+  //          TOKUDB_OUT_OF_LOCKS.
+  // note: Read locks cannot be shared between txnids, as one would expect.
+  //       This is for simplicity since read locks are rare in MySQL.
+  int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
+                        txnid_set *conflicts, bool big_txn);
+
+  // effect: Attempts to grant a write lock for the range of keys between
+  // [left_key, right_key]. returns: If the lock cannot be granted, return
+  // DB_LOCK_NOTGRANTED, and populate the
+  //          given conflicts set with the txnids that hold conflicting locks in
+  //          the range. If the locktree cannot create more locks, return
+  //          TOKUDB_OUT_OF_LOCKS.
+  int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
+                         txnid_set *conflicts, bool big_txn);
+
+  // effect: populate the conflicts set with the txnids that would preventing
+  //         the given txnid from getting a lock on [left_key, right_key]
+  void get_conflicts(bool is_write_request, TXNID txnid, const DBT *left_key,
+                     const DBT *right_key, txnid_set *conflicts);
+
+  // effect: Release all of the lock ranges represented by the range buffer for
+  // a txnid.
+  void release_locks(TXNID txnid, const range_buffer *ranges,
+                     bool all_trx_locks_hint = false);
+
+  // effect: Runs escalation on this locktree
+  void escalate(lt_escalate_cb after_escalate_callback, void *extra);
+
+  // returns: The userdata associated with this locktree, or null if it has not
+  // been set.
+  void *get_userdata(void) const;
+
+  void set_userdata(void *userdata);
+
+  locktree_manager *get_manager(void) const;
+
+  void set_comparator(const comparator &cmp);
+
+  // Set the user-provided Lock Escalation Barrier check function and its
+  // argument
+  //
+  // Lock Escalation Barrier limits the scope of Lock Escalation.
+  // For two keys A and B (such that A < B),
+  // escalation_barrier_check_func(A, B)==true means that there's a lock
+  // escalation barrier between A and B, and lock escalation is not allowed to
+  // bridge the gap between A and B.
+  //
+  // This method sets the user-provided barrier check function and its
+  // parameter.
+  void set_escalation_barrier_func(lt_escalation_barrier_check_func func,
+                                   void *extra);
+
+  int compare(const locktree *lt) const;
+
+  DICTIONARY_ID get_dict_id() const;
+
+  // Private info struct for storing pending lock request state.
+  // Only to be used by lock requests. We store it here as
+  // something less opaque than usual to strike a tradeoff between
+  // abstraction and code complexity. It is still fairly abstract
+  // since the lock_request object is opaque
+  struct lt_lock_request_info *get_lock_request_info(void);
+
+  typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right,
+                                TXNID txnid, bool is_shared,
+                                TxnidVector *owners);
+  void dump_locks(void *cdata, dump_callback cb);
+
+ private:
+  locktree_manager *m_mgr;
+  DICTIONARY_ID m_dict_id;
+  uint32_t m_reference_count;
+
+  // Since the memory referenced by this comparator is not owned by the
+  // locktree, the user must guarantee it will outlive the locktree.
+  //
+  // The ydb API accomplishes this by opening an ft_handle in the on_create
+  // callback, which will keep the underlying FT (and its descriptor) in memory
+  // for as long as the handle is open. The ft_handle is stored opaquely in the
+  // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra
+  comparator m_cmp;
+
+  lt_escalation_barrier_check_func m_escalation_barrier;
+  void *m_escalation_barrier_arg;
+
+  concurrent_tree *m_rangetree;
+
+  void *m_userdata;
+  struct lt_lock_request_info m_lock_request_info;
+
+  // psergey-todo:
+  //  Each transaction also keeps a list of ranges it has locked.
+  //  So, when a transaction is running in STO mode, two identical
+  //  lists are kept: the STO lock list and transaction's owned locks
+  //  list. Why can't we do with just one list?
+
+  // The following fields and members prefixed with "sto_" are for
+  // the single txnid optimization, intended to speed up the case
+  // when only one transaction is using the locktree. If we know
+  // the locktree has only one transaction, then acquiring locks
+  // takes O(1) work and releasing all locks takes O(1) work.
+  //
+  // How do we know that the locktree only has a single txnid?
+  // What do we do if it does?
+  //
+  // When a txn with txnid T requests a lock:
+  // - If the tree is empty, the optimization is possible. Set the single
+  // txnid to T, and insert the lock range into the buffer.
+  // - If the tree is not empty, check if the single txnid is T. If so,
+  // append the lock range to the buffer. Otherwise, migrate all of
+  // the locks in the buffer into the rangetree on behalf of txnid T,
+  // and invalid the single txnid.
+  //
+  // When a txn with txnid T releases its locks:
+  // - If the single txnid is valid, it must be for T. Destroy the buffer.
+  // - If it's not valid, release locks the normal way in the rangetree.
+  //
+  // To carry out the optimization we need to record a single txnid
+  // and a range buffer for each locktree, each protected by the root
+  // lock of the locktree's rangetree. The root lock for a rangetree
+  // is grabbed by preparing a locked keyrange on the rangetree.
+  TXNID m_sto_txnid;
+  range_buffer m_sto_buffer;
+
+  // The single txnid optimization speeds up the case when only one
+  // transaction is using the locktree. But it has the potential to
+  // hurt the case when more than one txnid exists.
+  //
+  // There are two things we need to do to make the optimization only
+  // optimize the case we care about, and not hurt the general case.
+  //
+  // Bound the worst-case latency for lock migration when the
+  // optimization stops working:
+  // - Idea: Stop the optimization and migrate immediate if we notice
+  // the single txnid has takes many locks in the range buffer.
+  // - Implementation: Enforce a max size on the single txnid range buffer.
+  // - Analysis: Choosing the perfect max value, M, is difficult to do
+  // without some feedback from the field. Intuition tells us that M should
+  // not be so small that the optimization is worthless, and it should not
+  // be so big that it's unreasonable to have to wait behind a thread doing
+  // the work of converting M buffer locks into rangetree locks.
+  //
+  // Prevent concurrent-transaction workloads from trying the optimization
+  // in vain:
+  // - Idea: Don't even bother trying the optimization if we think the
+  // system is in a concurrent-transaction state.
+  // - Implementation: Do something even simpler than detecting whether the
+  // system is in a concurent-transaction state. Just keep a "score" value
+  // and some threshold. If at any time the locktree is eligible for the
+  // optimization, only do it if the score is at this threshold. When you
+  // actually do the optimization but someone has to migrate locks in the buffer
+  // (expensive), then reset the score back to zero. Each time a txn
+  // releases locks, the score is incremented by 1.
+  // - Analysis: If you let the threshold be "C", then at most 1 / C txns will
+  // do the optimization in a concurrent-transaction system. Similarly, it
+  // takes at most C txns to start using the single txnid optimzation, which
+  // is good when the system transitions from multithreaded to single threaded.
+  //
+  // STO_BUFFER_MAX_SIZE:
+  //
+  // We choose the max value to be 1 million since most transactions are smaller
+  // than 1 million and we can create a rangetree of 1 million elements in
+  // less than a second. So we can be pretty confident that this threshold
+  // enables the optimization almost always, and prevents super pathological
+  // latency issues for the first lock taken by a second thread.
+  //
+  // STO_SCORE_THRESHOLD:
+  //
+  // A simple first guess at a good value for the score threshold is 100.
+  // By our analysis, we'd end up doing the optimization in vain for
+  // around 1% of all transactions, which seems reasonable. Further,
+  // if the system goes single threaded, it ought to be pretty quick
+  // for 100 transactions to go by, so we won't have to wait long before
+  // we start doing the single txind optimzation again.
+  static const int STO_BUFFER_MAX_SIZE = 50 * 1024;
+  static const int STO_SCORE_THRESHOLD = 100;
+  int m_sto_score;
+
+  // statistics about time spent ending the STO early
+  uint64_t m_sto_end_early_count;
+  tokutime_t m_sto_end_early_time;
+
+  // effect: begins the single txnid optimizaiton, setting m_sto_txnid
+  //         to the given txnid.
+  // requires: m_sto_txnid is invalid
+  void sto_begin(TXNID txnid);
+
+  // effect: append a range to the sto buffer
+  // requires: m_sto_txnid is valid
+  void sto_append(const DBT *left_key, const DBT *right_key,
+                  bool is_write_request);
+
+  // effect: ends the single txnid optimization, releaseing any memory
+  //         stored in the sto buffer, notifying the tracker, and
+  //         invalidating m_sto_txnid.
+  // requires: m_sto_txnid is valid
+  void sto_end(void);
+
+  // params: prepared_lkr is a void * to a prepared locked keyrange. see below.
+  // effect: ends the single txnid optimization early, migrating buffer locks
+  //         into the rangetree, calling sto_end(), and then setting the
+  //         sto_score back to zero.
+  // requires: m_sto_txnid is valid
+  void sto_end_early(void *prepared_lkr);
+  void sto_end_early_no_accounting(void *prepared_lkr);
+
+  // params: prepared_lkr is a void * to a prepared locked keyrange. we can't
+  // use
+  //         the real type because the compiler won't allow us to forward
+  //         declare concurrent_tree::locked_keyrange without including
+  //         concurrent_tree.h, which we cannot do here because it is a template
+  //         implementation.
+  // requires: the prepared locked keyrange is for the locktree's rangetree
+  // requires: m_sto_txnid is valid
+  // effect: migrates each lock in the single txnid buffer into the locktree's
+  //         rangetree, notifying the memory tracker as necessary.
+  void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);
+
+  // effect: If m_sto_txnid is valid, then release the txnid's locks
+  //         by ending the optimization.
+  // requires: If m_sto_txnid is valid, it is equal to the given txnid
+  // returns: True if locks were released for this txnid
+  bool sto_try_release(TXNID txnid);
+
+  // params: prepared_lkr is a void * to a prepared locked keyrange. see above.
+  // requires: the prepared locked keyrange is for the locktree's rangetree
+  // effect: If m_sto_txnid is valid and equal to the given txnid, then
+  // append a range onto the buffer. Otherwise, if m_sto_txnid is valid
+  //        but not equal to this txnid, then migrate the buffer's locks
+  //        into the rangetree and end the optimization, setting the score
+  //        back to zero.
+  // returns: true if the lock was acquired for this txnid
+  bool sto_try_acquire(void *prepared_lkr, TXNID txnid, const DBT *left_key,
+                       const DBT *right_key, bool is_write_request);
+
+  // Effect:
+  //  Provides a hook for a helgrind suppression.
+  // Returns:
+  //  true if m_sto_txnid is not TXNID_NONE
+  bool sto_txnid_is_valid_unsafe(void) const;
+
+  // Effect:
+  //  Provides a hook for a helgrind suppression.
+  // Returns:
+  //  m_sto_score
+  int sto_get_score_unsafe(void) const;
+
+  void remove_overlapping_locks_for_txnid(TXNID txnid, const DBT *left_key,
+                                          const DBT *right_key);
+
+  int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
+                                const DBT *left_key, const DBT *right_key,
+                                bool is_write_request, txnid_set *conflicts);
+
+  int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
+                   const DBT *right_key, txnid_set *conflicts);
+
+  int try_acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
+                       const DBT *right_key, txnid_set *conflicts,
+                       bool big_txn);
+
+  friend class locktree_unit_test;
+  friend class manager_unit_test;
+  friend class lock_request_unit_test;
+
+  // engine status reaches into the locktree to read some stats
+  friend void locktree_manager::get_status(LTM_STATUS status);
+};
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc
new file mode 100644
index 000000000..4186182be
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc
@@ -0,0 +1,527 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "../portability/toku_pthread.h"
+#include "../util/status.h"
+#include "lock_request.h"
+#include "locktree.h"
+
+namespace toku {
+
+void locktree_manager::create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
+                              lt_escalate_cb escalate_cb, void *escalate_extra,
+                              toku_external_mutex_factory_t mutex_factory_arg) {
+  mutex_factory = mutex_factory_arg;
+  m_max_lock_memory = DEFAULT_MAX_LOCK_MEMORY;
+  m_current_lock_memory = 0;
+
+  m_locktree_map.create();
+  m_lt_create_callback = create_cb;
+  m_lt_destroy_callback = destroy_cb;
+  m_lt_escalate_callback = escalate_cb;
+  m_lt_escalate_callback_extra = escalate_extra;
+  ZERO_STRUCT(m_mutex);
+  toku_mutex_init(manager_mutex_key, &m_mutex, nullptr);
+
+  ZERO_STRUCT(m_lt_counters);
+
+  escalator_init();
+}
+
+void locktree_manager::destroy(void) {
+  escalator_destroy();
+  invariant(m_current_lock_memory == 0);
+  invariant(m_locktree_map.size() == 0);
+  m_locktree_map.destroy();
+  toku_mutex_destroy(&m_mutex);
+}
+
+void locktree_manager::mutex_lock(void) { toku_mutex_lock(&m_mutex); }
+
+void locktree_manager::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); }
+
+size_t locktree_manager::get_max_lock_memory(void) { return m_max_lock_memory; }
+
+int locktree_manager::set_max_lock_memory(size_t max_lock_memory) {
+  int r = 0;
+  mutex_lock();
+  if (max_lock_memory < m_current_lock_memory) {
+    r = EDOM;
+  } else {
+    m_max_lock_memory = max_lock_memory;
+  }
+  mutex_unlock();
+  return r;
+}
+
+int locktree_manager::find_by_dict_id(locktree *const &lt,
+                                      const DICTIONARY_ID &dict_id) {
+  if (lt->get_dict_id().dictid < dict_id.dictid) {
+    return -1;
+  } else if (lt->get_dict_id().dictid == dict_id.dictid) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+locktree *locktree_manager::locktree_map_find(const DICTIONARY_ID &dict_id) {
+  locktree *lt;
+  int r = m_locktree_map.find_zero<DICTIONARY_ID, find_by_dict_id>(dict_id, &lt,
+                                                                   nullptr);
+  return r == 0 ? lt : nullptr;
+}
+
+void locktree_manager::locktree_map_put(locktree *lt) {
+  int r = m_locktree_map.insert<DICTIONARY_ID, find_by_dict_id>(
+      lt, lt->get_dict_id(), nullptr);
+  invariant_zero(r);
+}
+
+void locktree_manager::locktree_map_remove(locktree *lt) {
+  uint32_t idx;
+  locktree *found_lt;
+  int r = m_locktree_map.find_zero<DICTIONARY_ID, find_by_dict_id>(
+      lt->get_dict_id(), &found_lt, &idx);
+  invariant_zero(r);
+  invariant(found_lt == lt);
+  r = m_locktree_map.delete_at(idx);
+  invariant_zero(r);
+}
+
+locktree *locktree_manager::get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
+                                   void *on_create_extra) {
+  // hold the mutex around searching and maybe
+  // inserting into the locktree map
+  mutex_lock();
+
+  locktree *lt = locktree_map_find(dict_id);
+  if (lt == nullptr) {
+    XCALLOC(lt);
+    lt->create(this, dict_id, cmp, mutex_factory);
+
+    // new locktree created - call the on_create callback
+    // and put it in the locktree map
+    if (m_lt_create_callback) {
+      int r = m_lt_create_callback(lt, on_create_extra);
+      if (r != 0) {
+        lt->release_reference();
+        lt->destroy();
+        toku_free(lt);
+        lt = nullptr;
+      }
+    }
+    if (lt) {
+      locktree_map_put(lt);
+    }
+  } else {
+    reference_lt(lt);
+  }
+
+  mutex_unlock();
+
+  return lt;
+}
+
+void locktree_manager::reference_lt(locktree *lt) {
+  // increment using a sync fetch and add.
+  // the caller guarantees that the lt won't be
+  // destroyed while we increment the count here.
+  //
+  // the caller can do this by already having an lt
+  // reference or by holding the manager mutex.
+  //
+  // if the manager's mutex is held, it is ok for the
+  // reference count to transition from 0 to 1 (no race),
+  // since we're serialized with other opens and closes.
+  lt->add_reference();
+}
+
+void locktree_manager::release_lt(locktree *lt) {
+  bool do_destroy = false;
+  DICTIONARY_ID dict_id = lt->get_dict_id();
+
+  // Release a reference on the locktree. If the count transitions to zero,
+  // then we *may* need to do the cleanup.
+  //
+  // Grab the manager's mutex and look for a locktree with this locktree's
+  // dictionary id. Since dictionary id's never get reused, any locktree
+  // found must be the one we just released a reference on.
+  //
+  // At least two things could have happened since we got the mutex:
+  // - Another thread gets a locktree with the same dict_id, increments
+  // the reference count. In this case, we shouldn't destroy it.
+  // - Another thread gets a locktree with the same dict_id and then
+  // releases it quickly, transitioning the reference count from zero to
+  // one and back to zero. In this case, only one of us should destroy it.
+  // It doesn't matter which. We originally missed this case, see #5776.
+  //
+  // After 5776, the high level rule for release is described below.
+  //
+  // If a thread releases a locktree and notices the reference count transition
+  // to zero, then that thread must immediately:
+  // - assume the locktree object is invalid
+  // - grab the manager's mutex
+  // - search the locktree map for a locktree with the same dict_id and remove
+  // it, if it exists. the destroy may be deferred.
+  // - release the manager's mutex
+  //
+  // This way, if many threads transition the same locktree's reference count
+  // from 1 to zero and wait behind the manager's mutex, only one of them will
+  // do the actual destroy and the others will happily do nothing.
+  uint32_t refs = lt->release_reference();
+  if (refs == 0) {
+    mutex_lock();
+    // lt may not have already been destroyed, so look it up.
+    locktree *find_lt = locktree_map_find(dict_id);
+    if (find_lt != nullptr) {
+      // A locktree is still in the map with that dict_id, so it must be
+      // equal to lt. This is true because dictionary ids are never reused.
+      // If the reference count is zero, it's our responsibility to remove
+      // it and do the destroy. Otherwise, someone still wants it.
+      // If the locktree is still valid then check if it should be deleted.
+      if (find_lt == lt) {
+        if (lt->get_reference_count() == 0) {
+          locktree_map_remove(lt);
+          do_destroy = true;
+        }
+        m_lt_counters.add(lt->get_lock_request_info()->counters);
+      }
+    }
+    mutex_unlock();
+  }
+
+  // if necessary, do the destroy without holding the mutex
+  if (do_destroy) {
+    if (m_lt_destroy_callback) {
+      m_lt_destroy_callback(lt);
+    }
+    lt->destroy();
+    toku_free(lt);
+  }
+}
+
+void locktree_manager::run_escalation(void) {
+  struct escalation_fn {
+    static void run(void *extra) {
+      locktree_manager *mgr = (locktree_manager *)extra;
+      mgr->escalate_all_locktrees();
+    };
+  };
+  m_escalator.run(this, escalation_fn::run, this);
+}
+
+// test-only version of lock escalation
+void locktree_manager::run_escalation_for_test(void) { run_escalation(); }
+
+void locktree_manager::escalate_all_locktrees(void) {
+  uint64_t t0 = toku_current_time_microsec();
+
+  // get all locktrees
+  mutex_lock();
+  int num_locktrees = m_locktree_map.size();
+  locktree **locktrees = new locktree *[num_locktrees];
+  for (int i = 0; i < num_locktrees; i++) {
+    int r = m_locktree_map.fetch(i, &locktrees[i]);
+    invariant_zero(r);
+    reference_lt(locktrees[i]);
+  }
+  mutex_unlock();
+
+  // escalate them
+  escalate_locktrees(locktrees, num_locktrees);
+
+  delete[] locktrees;
+
+  uint64_t t1 = toku_current_time_microsec();
+  add_escalator_wait_time(t1 - t0);
+}
+
+void locktree_manager::note_mem_used(uint64_t mem_used) {
+  (void)toku_sync_fetch_and_add(&m_current_lock_memory, mem_used);
+}
+
+void locktree_manager::note_mem_released(uint64_t mem_released) {
+  uint64_t old_mem_used =
+      toku_sync_fetch_and_sub(&m_current_lock_memory, mem_released);
+  invariant(old_mem_used >= mem_released);
+}
+
+bool locktree_manager::out_of_locks(void) const {
+  return m_current_lock_memory >= m_max_lock_memory;
+}
+
+bool locktree_manager::over_big_threshold(void) {
+  return m_current_lock_memory >= m_max_lock_memory / 2;
+}
+
+int locktree_manager::iterate_pending_lock_requests(
+    lock_request_iterate_callback callback, void *extra) {
+  mutex_lock();
+  int r = 0;
+  uint32_t num_locktrees = m_locktree_map.size();
+  for (uint32_t i = 0; i < num_locktrees && r == 0; i++) {
+    locktree *lt;
+    r = m_locktree_map.fetch(i, &lt);
+    invariant_zero(r);
+    if (r == EINVAL)  // Shouldn't happen, avoid compiler warning
+      continue;
+
+    struct lt_lock_request_info *info = lt->get_lock_request_info();
+    toku_external_mutex_lock(&info->mutex);
+
+    uint32_t num_requests = info->pending_lock_requests.size();
+    for (uint32_t k = 0; k < num_requests && r == 0; k++) {
+      lock_request *req;
+      r = info->pending_lock_requests.fetch(k, &req);
+      invariant_zero(r);
+      if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */
+        continue;
+      r = callback(lt->get_dict_id(), req->get_txnid(), req->get_left_key(),
+                   req->get_right_key(), req->get_conflicting_txnid(),
+                   req->get_start_time(), extra);
+    }
+
+    toku_external_mutex_unlock(&info->mutex);
+  }
+  mutex_unlock();
+  return r;
+}
+
+int locktree_manager::check_current_lock_constraints(bool big_txn) {
+  int r = 0;
+  if (big_txn && over_big_threshold()) {
+    run_escalation();
+    if (over_big_threshold()) {
+      r = TOKUDB_OUT_OF_LOCKS;
+    }
+  }
+  if (r == 0 && out_of_locks()) {
+    run_escalation();
+    if (out_of_locks()) {
+      // return an error if we're still out of locks after escalation.
+      r = TOKUDB_OUT_OF_LOCKS;
+    }
+  }
+  return r;
+}
+
+void locktree_manager::escalator_init(void) {
+  ZERO_STRUCT(m_escalation_mutex);
+  toku_mutex_init(manager_escalation_mutex_key, &m_escalation_mutex, nullptr);
+  m_escalation_count = 0;
+  m_escalation_time = 0;
+  m_wait_escalation_count = 0;
+  m_wait_escalation_time = 0;
+  m_long_wait_escalation_count = 0;
+  m_long_wait_escalation_time = 0;
+  m_escalation_latest_result = 0;
+  m_escalator.create();
+}
+
+void locktree_manager::escalator_destroy(void) {
+  m_escalator.destroy();
+  toku_mutex_destroy(&m_escalation_mutex);
+}
+
+void locktree_manager::add_escalator_wait_time(uint64_t t) {
+  toku_mutex_lock(&m_escalation_mutex);
+  m_wait_escalation_count += 1;
+  m_wait_escalation_time += t;
+  if (t >= 1000000) {
+    m_long_wait_escalation_count += 1;
+    m_long_wait_escalation_time += t;
+  }
+  toku_mutex_unlock(&m_escalation_mutex);
+}
+
+void locktree_manager::escalate_locktrees(locktree **locktrees,
+                                          int num_locktrees) {
+  // there are too many row locks in the system and we need to tidy up.
+  //
+  // a simple implementation of escalation does not attempt
+  // to reduce the memory foot print of each txn's range buffer.
+  // doing so would require some layering hackery (or a callback)
+  // and more complicated locking. for now, just escalate each
+  // locktree individually, in-place.
+  tokutime_t t0 = toku_time_now();
+  for (int i = 0; i < num_locktrees; i++) {
+    locktrees[i]->escalate(m_lt_escalate_callback,
+                           m_lt_escalate_callback_extra);
+    release_lt(locktrees[i]);
+  }
+  tokutime_t t1 = toku_time_now();
+
+  toku_mutex_lock(&m_escalation_mutex);
+  m_escalation_count++;
+  m_escalation_time += (t1 - t0);
+  m_escalation_latest_result = m_current_lock_memory;
+  toku_mutex_unlock(&m_escalation_mutex);
+}
+
+struct escalate_args {
+  locktree_manager *mgr;
+  locktree **locktrees;
+  int num_locktrees;
+};
+
+void locktree_manager::locktree_escalator::create(void) {
+  ZERO_STRUCT(m_escalator_mutex);
+  toku_mutex_init(manager_escalator_mutex_key, &m_escalator_mutex, nullptr);
+  toku_cond_init(manager_m_escalator_done_key, &m_escalator_done, nullptr);
+  m_escalator_running = false;
+}
+
+void locktree_manager::locktree_escalator::destroy(void) {
+  toku_cond_destroy(&m_escalator_done);
+  toku_mutex_destroy(&m_escalator_mutex);
+}
+
+void locktree_manager::locktree_escalator::run(
+    locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
+    void *extra) {
+  uint64_t t0 = toku_current_time_microsec();
+  toku_mutex_lock(&m_escalator_mutex);
+  if (!m_escalator_running) {
+    // run escalation on this thread
+    m_escalator_running = true;
+    toku_mutex_unlock(&m_escalator_mutex);
+    escalate_locktrees_fun(extra);
+    toku_mutex_lock(&m_escalator_mutex);
+    m_escalator_running = false;
+    toku_cond_broadcast(&m_escalator_done);
+  } else {
+    toku_cond_wait(&m_escalator_done, &m_escalator_mutex);
+  }
+  toku_mutex_unlock(&m_escalator_mutex);
+  uint64_t t1 = toku_current_time_microsec();
+  mgr->add_escalator_wait_time(t1 - t0);
+}
+
+void locktree_manager::get_status(LTM_STATUS statp) {
+  ltm_status.init();
+  LTM_STATUS_VAL(LTM_SIZE_CURRENT) = m_current_lock_memory;
+  LTM_STATUS_VAL(LTM_SIZE_LIMIT) = m_max_lock_memory;
+  LTM_STATUS_VAL(LTM_ESCALATION_COUNT) = m_escalation_count;
+  LTM_STATUS_VAL(LTM_ESCALATION_TIME) = m_escalation_time;
+  LTM_STATUS_VAL(LTM_ESCALATION_LATEST_RESULT) = m_escalation_latest_result;
+  LTM_STATUS_VAL(LTM_WAIT_ESCALATION_COUNT) = m_wait_escalation_count;
+  LTM_STATUS_VAL(LTM_WAIT_ESCALATION_TIME) = m_wait_escalation_time;
+  LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_COUNT) = m_long_wait_escalation_count;
+  LTM_STATUS_VAL(LTM_LONG_WAIT_ESCALATION_TIME) = m_long_wait_escalation_time;
+
+  uint64_t lock_requests_pending = 0;
+  uint64_t sto_num_eligible = 0;
+  uint64_t sto_end_early_count = 0;
+  tokutime_t sto_end_early_time = 0;
+  uint32_t num_locktrees = 0;
+  struct lt_counters lt_counters;
+  ZERO_STRUCT(lt_counters);  // PORT: instead of ={}.
+
+  if (toku_mutex_trylock(&m_mutex) == 0) {
+    lt_counters = m_lt_counters;
+    num_locktrees = m_locktree_map.size();
+    for (uint32_t i = 0; i < num_locktrees; i++) {
+      locktree *lt;
+      int r = m_locktree_map.fetch(i, &lt);
+      invariant_zero(r);
+      if (r == EINVAL)  // Shouldn't happen, avoid compiler warning
+        continue;
+      if (toku_external_mutex_trylock(&lt->m_lock_request_info.mutex) == 0) {
+        lock_requests_pending +=
+            lt->m_lock_request_info.pending_lock_requests.size();
+        lt_counters.add(lt->get_lock_request_info()->counters);
+        toku_external_mutex_unlock(&lt->m_lock_request_info.mutex);
+      }
+      sto_num_eligible += lt->sto_txnid_is_valid_unsafe() ? 1 : 0;
+      sto_end_early_count += lt->m_sto_end_early_count;
+      sto_end_early_time += lt->m_sto_end_early_time;
+    }
+    mutex_unlock();
+  }
+
+  LTM_STATUS_VAL(LTM_NUM_LOCKTREES) = num_locktrees;
+  LTM_STATUS_VAL(LTM_LOCK_REQUESTS_PENDING) = lock_requests_pending;
+  LTM_STATUS_VAL(LTM_STO_NUM_ELIGIBLE) = sto_num_eligible;
+  LTM_STATUS_VAL(LTM_STO_END_EARLY_COUNT) = sto_end_early_count;
+  LTM_STATUS_VAL(LTM_STO_END_EARLY_TIME) = sto_end_early_time;
+  LTM_STATUS_VAL(LTM_WAIT_COUNT) = lt_counters.wait_count;
+  LTM_STATUS_VAL(LTM_WAIT_TIME) = lt_counters.wait_time;
+  LTM_STATUS_VAL(LTM_LONG_WAIT_COUNT) = lt_counters.long_wait_count;
+  LTM_STATUS_VAL(LTM_LONG_WAIT_TIME) = lt_counters.long_wait_time;
+  LTM_STATUS_VAL(LTM_TIMEOUT_COUNT) = lt_counters.timeout_count;
+  *statp = ltm_status;
+}
+
+void locktree_manager::kill_waiter(void *extra) {
+  mutex_lock();
+  int r = 0;
+  uint32_t num_locktrees = m_locktree_map.size();
+  for (uint32_t i = 0; i < num_locktrees; i++) {
+    locktree *lt;
+    r = m_locktree_map.fetch(i, &lt);
+    invariant_zero(r);
+    if (r) continue;  // Get rid of "may be used uninitialized" warning
+    lock_request::kill_waiter(lt, extra);
+  }
+  mutex_unlock();
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc
new file mode 100644
index 000000000..1e1d23ef8
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc
@@ -0,0 +1,265 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "range_buffer.h"
+
+#include <string.h>
+
+#include "../portability/memory.h"
+#include "../util/dbt.h"
+
+namespace toku {
+
+bool range_buffer::record_header::left_is_infinite(void) const {
+  return left_neg_inf || left_pos_inf;
+}
+
+bool range_buffer::record_header::right_is_infinite(void) const {
+  return right_neg_inf || right_pos_inf;
+}
+
+void range_buffer::record_header::init(const DBT *left_key,
+                                       const DBT *right_key,
+                                       bool is_exclusive) {
+  is_exclusive_lock = is_exclusive;
+  left_neg_inf = left_key == toku_dbt_negative_infinity();
+  left_pos_inf = left_key == toku_dbt_positive_infinity();
+  left_key_size = toku_dbt_is_infinite(left_key) ? 0 : left_key->size;
+  if (right_key) {
+    right_neg_inf = right_key == toku_dbt_negative_infinity();
+    right_pos_inf = right_key == toku_dbt_positive_infinity();
+    right_key_size = toku_dbt_is_infinite(right_key) ? 0 : right_key->size;
+  } else {
+    right_neg_inf = left_neg_inf;
+    right_pos_inf = left_pos_inf;
+    right_key_size = 0;
+  }
+}
+
+const DBT *range_buffer::iterator::record::get_left_key(void) const {
+  if (_header.left_neg_inf) {
+    return toku_dbt_negative_infinity();
+  } else if (_header.left_pos_inf) {
+    return toku_dbt_positive_infinity();
+  } else {
+    return &_left_key;
+  }
+}
+
+const DBT *range_buffer::iterator::record::get_right_key(void) const {
+  if (_header.right_neg_inf) {
+    return toku_dbt_negative_infinity();
+  } else if (_header.right_pos_inf) {
+    return toku_dbt_positive_infinity();
+  } else {
+    return &_right_key;
+  }
+}
+
+size_t range_buffer::iterator::record::size(void) const {
+  return sizeof(record_header) + _header.left_key_size + _header.right_key_size;
+}
+
+void range_buffer::iterator::record::deserialize(const char *buf) {
+  size_t current = 0;
+
+  // deserialize the header
+  memcpy(&_header, buf, sizeof(record_header));
+  current += sizeof(record_header);
+
+  // deserialize the left key if necessary
+  if (!_header.left_is_infinite()) {
+    // point the left DBT's buffer into ours
+    toku_fill_dbt(&_left_key, buf + current, _header.left_key_size);
+    current += _header.left_key_size;
+  }
+
+  // deserialize the right key if necessary
+  if (!_header.right_is_infinite()) {
+    if (_header.right_key_size == 0) {
+      toku_copyref_dbt(&_right_key, _left_key);
+    } else {
+      toku_fill_dbt(&_right_key, buf + current, _header.right_key_size);
+    }
+  }
+}
+
+toku::range_buffer::iterator::iterator()
+    : _ma_chunk_iterator(nullptr),
+      _current_chunk_base(nullptr),
+      _current_chunk_offset(0),
+      _current_chunk_max(0),
+      _current_rec_size(0) {}
+
+toku::range_buffer::iterator::iterator(const range_buffer *buffer)
+    : _ma_chunk_iterator(&buffer->_arena),
+      _current_chunk_base(nullptr),
+      _current_chunk_offset(0),
+      _current_chunk_max(0),
+      _current_rec_size(0) {
+  reset_current_chunk();
+}
+
+void range_buffer::iterator::reset_current_chunk() {
+  _current_chunk_base = _ma_chunk_iterator.current(&_current_chunk_max);
+  _current_chunk_offset = 0;
+}
+
+bool range_buffer::iterator::current(record *rec) {
+  if (_current_chunk_offset < _current_chunk_max) {
+    const char *buf = reinterpret_cast<const char *>(_current_chunk_base);
+    rec->deserialize(buf + _current_chunk_offset);
+    _current_rec_size = rec->size();
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// move the iterator to the next record in the buffer
+void range_buffer::iterator::next(void) {
+  invariant(_current_chunk_offset < _current_chunk_max);
+  invariant(_current_rec_size > 0);
+
+  // the next record is _current_rec_size bytes forward
+  _current_chunk_offset += _current_rec_size;
+  // now, we don't know how big the current is, set it to 0.
+  _current_rec_size = 0;
+
+  if (_current_chunk_offset >= _current_chunk_max) {
+    // current chunk is exhausted, try moving to the next one
+    if (_ma_chunk_iterator.more()) {
+      _ma_chunk_iterator.next();
+      reset_current_chunk();
+    }
+  }
+}
+
+void range_buffer::create(void) {
+  // allocate buffer space lazily instead of on creation. this way,
+  // no malloc/free is done if the transaction ends up taking no locks.
+  _arena.create(0);
+  _num_ranges = 0;
+}
+
+void range_buffer::append(const DBT *left_key, const DBT *right_key,
+                          bool is_write_request) {
+  // if the keys are equal, then only one copy is stored.
+  if (toku_dbt_equals(left_key, right_key)) {
+    invariant(left_key->size <= MAX_KEY_SIZE);
+    append_point(left_key, is_write_request);
+  } else {
+    invariant(left_key->size <= MAX_KEY_SIZE);
+    invariant(right_key->size <= MAX_KEY_SIZE);
+    append_range(left_key, right_key, is_write_request);
+  }
+  _num_ranges++;
+}
+
+bool range_buffer::is_empty(void) const { return total_memory_size() == 0; }
+
+uint64_t range_buffer::total_memory_size(void) const {
+  return _arena.total_size_in_use();
+}
+
+int range_buffer::get_num_ranges(void) const { return _num_ranges; }
+
+void range_buffer::destroy(void) { _arena.destroy(); }
+
+void range_buffer::append_range(const DBT *left_key, const DBT *right_key,
+                                bool is_exclusive) {
+  size_t record_length =
+      sizeof(record_header) + left_key->size + right_key->size;
+  char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length));
+
+  record_header h;
+  h.init(left_key, right_key, is_exclusive);
+
+  // serialize the header
+  memcpy(buf, &h, sizeof(record_header));
+  buf += sizeof(record_header);
+
+  // serialize the left key if necessary
+  if (!h.left_is_infinite()) {
+    memcpy(buf, left_key->data, left_key->size);
+    buf += left_key->size;
+  }
+
+  // serialize the right key if necessary
+  if (!h.right_is_infinite()) {
+    memcpy(buf, right_key->data, right_key->size);
+  }
+}
+
+void range_buffer::append_point(const DBT *key, bool is_exclusive) {
+  size_t record_length = sizeof(record_header) + key->size;
+  char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length));
+
+  record_header h;
+  h.init(key, nullptr, is_exclusive);
+
+  // serialize the header
+  memcpy(buf, &h, sizeof(record_header));
+  buf += sizeof(record_header);
+
+  // serialize the key if necessary
+  if (!h.left_is_infinite()) {
+    memcpy(buf, key->data, key->size);
+  }
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h
new file mode 100644
index 000000000..76e28d747
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h
@@ -0,0 +1,178 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <inttypes.h>
+#include <stdint.h>
+
+#include "../util/dbt.h"
+#include "../util/memarena.h"
+
+namespace toku {
+
+// a key range buffer represents a set of key ranges that can
+// be stored, iterated over, and then destroyed all at once.
+class range_buffer {
+ private:
+  // the key range buffer is a bunch of records in a row.
+  // each record has the following header, followed by the
+  // left key and right key data payload, if applicable.
+  // we limit keys to be 2^16, since we store lengths as 2 bytes.
+  static const size_t MAX_KEY_SIZE = 1 << 16;
+
+  struct record_header {
+    bool left_neg_inf;
+    bool left_pos_inf;
+    bool right_pos_inf;
+    bool right_neg_inf;
+    uint16_t left_key_size;
+    uint16_t right_key_size;
+    bool is_exclusive_lock;
+
+    bool left_is_infinite(void) const;
+
+    bool right_is_infinite(void) const;
+
+    void init(const DBT *left_key, const DBT *right_key, bool is_exclusive);
+  };
+  // PORT static_assert(sizeof(record_header) == 8, "record header format is
+  // off");
+
+ public:
+  // the iterator abstracts reading over a buffer of variable length
+  // records one by one until there are no more left.
+  class iterator {
+   public:
+    iterator();
+    iterator(const range_buffer *buffer);
+
+    // a record represents the user-view of a serialized key range.
+    // it handles positive and negative infinity and the optimized
+    // point range case, where left and right points share memory.
+    class record {
+     public:
+      // get a read-only pointer to the left key of this record's range
+      const DBT *get_left_key(void) const;
+
+      // get a read-only pointer to the right key of this record's range
+      const DBT *get_right_key(void) const;
+
+      // how big is this record? this tells us where the next record is
+      size_t size(void) const;
+
+      bool get_exclusive_flag() const { return _header.is_exclusive_lock; }
+
+      // populate a record header and point our DBT's
+      // buffers into ours if they are not infinite.
+      void deserialize(const char *buf);
+
+     private:
+      record_header _header;
+      DBT _left_key;
+      DBT _right_key;
+    };
+
+    // populate the given record object with the current
+    // the memory referred to by record is valid for only
+    // as long as the record exists.
+    bool current(record *rec);
+
+    // move the iterator to the next record in the buffer
+    void next(void);
+
+   private:
+    void reset_current_chunk();
+
+    // the key range buffer we are iterating over, the current
+    // offset in that buffer, and the size of the current record.
+    memarena::chunk_iterator _ma_chunk_iterator;
+    const void *_current_chunk_base;
+    size_t _current_chunk_offset;
+    size_t _current_chunk_max;
+    size_t _current_rec_size;
+  };
+
+  // allocate buffer space lazily instead of on creation. this way,
+  // no malloc/free is done if the transaction ends up taking no locks.
+  void create(void);
+
+  // append a left/right key range to the buffer.
+  // if the keys are equal, then only one copy is stored.
+  void append(const DBT *left_key, const DBT *right_key,
+              bool is_write_request = false);
+
+  // is this range buffer empty?
+  bool is_empty(void) const;
+
+  // how much memory is being used by this range buffer?
+  uint64_t total_memory_size(void) const;
+
+  // how many ranges are stored in this range buffer?
+  int get_num_ranges(void) const;
+
+  void destroy(void);
+
+ private:
+  memarena _arena;
+  int _num_ranges;
+
+  void append_range(const DBT *left_key, const DBT *right_key,
+                    bool is_write_request);
+
+  // append a point to the buffer. this is the space/time saving
+  // optimization for key ranges where left == right.
+  void append_point(const DBT *key, bool is_write_request);
+};
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc
new file mode 100644
index 000000000..8997f634b
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc
@@ -0,0 +1,520 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "treenode.h"
+
+#include "../portability/toku_race_tools.h"
+
+namespace toku {
+
+// TODO: source location info might have to be pulled up one caller
+// to be useful
+void treenode::mutex_lock(void) { toku_mutex_lock(&m_mutex); }
+
+void treenode::mutex_unlock(void) { toku_mutex_unlock(&m_mutex); }
+
+void treenode::init(const comparator *cmp) {
+  m_txnid = TXNID_NONE;
+  m_is_root = false;
+  m_is_empty = true;
+  m_cmp = cmp;
+
+  m_is_shared = false;
+  m_owners = nullptr;
+
+  // use an adaptive mutex at each node since we expect the time the
+  // lock is held to be relatively short compared to a context switch.
+  // indeed, this improves performance at high thread counts considerably.
+  memset(&m_mutex, 0, sizeof(toku_mutex_t));
+  toku_pthread_mutexattr_t attr;
+  toku_mutexattr_init(&attr);
+  toku_mutexattr_settype(&attr, TOKU_MUTEX_ADAPTIVE);
+  toku_mutex_init(treenode_mutex_key, &m_mutex, &attr);
+  toku_mutexattr_destroy(&attr);
+  m_left_child.set(nullptr);
+  m_right_child.set(nullptr);
+}
+
+void treenode::create_root(const comparator *cmp) {
+  init(cmp);
+  m_is_root = true;
+}
+
+void treenode::destroy_root(void) {
+  invariant(is_root());
+  invariant(is_empty());
+  toku_mutex_destroy(&m_mutex);
+  m_cmp = nullptr;
+}
+
+void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid,
+                                   bool is_shared) {
+  // allocates a new copy of the range for this node
+  m_range.create_copy(range);
+  m_txnid = txnid;
+  m_is_shared = is_shared;
+  m_is_empty = false;
+}
+
+bool treenode::is_root(void) { return m_is_root; }
+
+bool treenode::is_empty(void) { return m_is_empty; }
+
+bool treenode::range_overlaps(const keyrange &range) {
+  return m_range.overlaps(*m_cmp, range);
+}
+
+treenode *treenode::alloc(const comparator *cmp, const keyrange &range,
+                          TXNID txnid, bool is_shared) {
+  treenode *XCALLOC(node);
+  node->init(cmp);
+  node->set_range_and_txnid(range, txnid, is_shared);
+  return node;
+}
+
+void treenode::swap_in_place(treenode *node1, treenode *node2) {
+  keyrange tmp_range = node1->m_range;
+  TXNID tmp_txnid = node1->m_txnid;
+  node1->m_range = node2->m_range;
+  node1->m_txnid = node2->m_txnid;
+  node2->m_range = tmp_range;
+  node2->m_txnid = tmp_txnid;
+
+  bool tmp_is_shared = node1->m_is_shared;
+  node1->m_is_shared = node2->m_is_shared;
+  node2->m_is_shared = tmp_is_shared;
+
+  auto tmp_m_owners = node1->m_owners;
+  node1->m_owners = node2->m_owners;
+  node2->m_owners = tmp_m_owners;
+}
+
+bool treenode::add_shared_owner(TXNID txnid) {
+  assert(m_is_shared);
+  if (txnid == m_txnid)
+    return false;  // acquiring a lock on the same range by the same trx
+
+  if (m_txnid != TXNID_SHARED) {
+    m_owners = new TxnidVector;
+    m_owners->insert(m_txnid);
+    m_txnid = TXNID_SHARED;
+  }
+  m_owners->insert(txnid);
+  return true;
+}
+
+void treenode::free(treenode *node) {
+  // destroy the range, freeing any copied keys
+  node->m_range.destroy();
+
+  if (node->m_owners) {
+    delete node->m_owners;
+    node->m_owners = nullptr;  // need this?
+  }
+
+  // the root is simply marked as empty.
+  if (node->is_root()) {
+    // PORT toku_mutex_assert_locked(&node->m_mutex);
+    node->m_is_empty = true;
+  } else {
+    // PORT toku_mutex_assert_unlocked(&node->m_mutex);
+    toku_mutex_destroy(&node->m_mutex);
+    toku_free(node);
+  }
+}
+
+uint32_t treenode::get_depth_estimate(void) const {
+  const uint32_t left_est = m_left_child.depth_est;
+  const uint32_t right_est = m_right_child.depth_est;
+  return (left_est > right_est ? left_est : right_est) + 1;
+}
+
+treenode *treenode::find_node_with_overlapping_child(
+    const keyrange &range, const keyrange::comparison *cmp_hint) {
+  // determine which child to look at based on a comparison. if we were
+  // given a comparison hint, use that. otherwise, compare them now.
+  keyrange::comparison c =
+      cmp_hint ? *cmp_hint : range.compare(*m_cmp, m_range);
+
+  treenode *child;
+  if (c == keyrange::comparison::LESS_THAN) {
+    child = lock_and_rebalance_left();
+  } else {
+    // The caller (locked_keyrange::acquire) handles the case where
+    // the root of the locked_keyrange is the node that overlaps.
+    // range is guaranteed not to overlap this node.
+    invariant(c == keyrange::comparison::GREATER_THAN);
+    child = lock_and_rebalance_right();
+  }
+
+  // if the search would lead us to an empty subtree (child == nullptr),
+  // or the child overlaps, then we know this node is the parent we want.
+  // otherwise we need to recur into that child.
+  if (child == nullptr) {
+    return this;
+  } else {
+    c = range.compare(*m_cmp, child->m_range);
+    if (c == keyrange::comparison::EQUALS ||
+        c == keyrange::comparison::OVERLAPS) {
+      child->mutex_unlock();
+      return this;
+    } else {
+      // unlock this node before recurring into the locked child,
+      // passing in a comparison hint since we just comapred range
+      // to the child's range.
+      mutex_unlock();
+      return child->find_node_with_overlapping_child(range, &c);
+    }
+  }
+}
+
+bool treenode::insert(const keyrange &range, TXNID txnid, bool is_shared) {
+  int rc = true;
+  // choose a child to check. if that child is null, then insert the new node
+  // there. otherwise recur down that child's subtree
+  keyrange::comparison c = range.compare(*m_cmp, m_range);
+  if (c == keyrange::comparison::LESS_THAN) {
+    treenode *left_child = lock_and_rebalance_left();
+    if (left_child == nullptr) {
+      left_child = treenode::alloc(m_cmp, range, txnid, is_shared);
+      m_left_child.set(left_child);
+    } else {
+      left_child->insert(range, txnid, is_shared);
+      left_child->mutex_unlock();
+    }
+  } else if (c == keyrange::comparison::GREATER_THAN) {
+    // invariant(c == keyrange::comparison::GREATER_THAN);
+    treenode *right_child = lock_and_rebalance_right();
+    if (right_child == nullptr) {
+      right_child = treenode::alloc(m_cmp, range, txnid, is_shared);
+      m_right_child.set(right_child);
+    } else {
+      right_child->insert(range, txnid, is_shared);
+      right_child->mutex_unlock();
+    }
+  } else if (c == keyrange::comparison::EQUALS) {
+    invariant(is_shared);
+    invariant(m_is_shared);
+    rc = add_shared_owner(txnid);
+  } else {
+    invariant(0);
+  }
+  return rc;
+}
+
+treenode *treenode::find_child_at_extreme(int direction, treenode **parent) {
+  treenode *child =
+      direction > 0 ? m_right_child.get_locked() : m_left_child.get_locked();
+
+  if (child) {
+    *parent = this;
+    treenode *child_extreme = child->find_child_at_extreme(direction, parent);
+    child->mutex_unlock();
+    return child_extreme;
+  } else {
+    return this;
+  }
+}
+
+treenode *treenode::find_leftmost_child(treenode **parent) {
+  return find_child_at_extreme(-1, parent);
+}
+
+treenode *treenode::find_rightmost_child(treenode **parent) {
+  return find_child_at_extreme(1, parent);
+}
+
+treenode *treenode::remove_root_of_subtree() {
+  // if this node has no children, just free it and return null
+  if (m_left_child.ptr == nullptr && m_right_child.ptr == nullptr) {
+    // treenode::free requires that non-root nodes are unlocked
+    if (!is_root()) {
+      mutex_unlock();
+    }
+    treenode::free(this);
+    return nullptr;
+  }
+
+  // we have a child, so get either the in-order successor or
+  // predecessor of this node to be our replacement.
+  // replacement_parent is updated by the find functions as
+  // they recur down the tree, so initialize it to this.
+  treenode *child, *replacement;
+  treenode *replacement_parent = this;
+  if (m_left_child.ptr != nullptr) {
+    child = m_left_child.get_locked();
+    replacement = child->find_rightmost_child(&replacement_parent);
+    invariant(replacement == child || replacement_parent != this);
+
+    // detach the replacement from its parent
+    if (replacement_parent == this) {
+      m_left_child = replacement->m_left_child;
+    } else {
+      replacement_parent->m_right_child = replacement->m_left_child;
+    }
+  } else {
+    child = m_right_child.get_locked();
+    replacement = child->find_leftmost_child(&replacement_parent);
+    invariant(replacement == child || replacement_parent != this);
+
+    // detach the replacement from its parent
+    if (replacement_parent == this) {
+      m_right_child = replacement->m_right_child;
+    } else {
+      replacement_parent->m_left_child = replacement->m_right_child;
+    }
+  }
+  child->mutex_unlock();
+
+  // swap in place with the detached replacement, then destroy it
+  treenode::swap_in_place(replacement, this);
+  treenode::free(replacement);
+
+  return this;
+}
+
+void treenode::recursive_remove(void) {
+  treenode *left = m_left_child.ptr;
+  if (left) {
+    left->recursive_remove();
+  }
+  m_left_child.set(nullptr);
+
+  treenode *right = m_right_child.ptr;
+  if (right) {
+    right->recursive_remove();
+  }
+  m_right_child.set(nullptr);
+
+  // we do not take locks on the way down, so we know non-root nodes
+  // are unlocked here and the caller is required to pass a locked
+  // root, so this free is correct.
+  treenode::free(this);
+}
+
+void treenode::remove_shared_owner(TXNID txnid) {
+  assert(m_owners->size() > 1);
+  m_owners->erase(txnid);
+  assert(m_owners->size() > 0);
+  /* if there is just one owner left, move it to m_txnid */
+  if (m_owners->size() == 1) {
+    m_txnid = *m_owners->begin();
+    delete m_owners;
+    m_owners = nullptr;
+  }
+}
+
+treenode *treenode::remove(const keyrange &range, TXNID txnid) {
+  treenode *child;
+  // if the range is equal to this node's range, then just remove
+  // the root of this subtree. otherwise search down the tree
+  // in either the left or right children.
+  keyrange::comparison c = range.compare(*m_cmp, m_range);
+  switch (c) {
+    case keyrange::comparison::EQUALS: {
+      // if we are the only owners, remove. Otherwise, just remove
+      // us from the owners list.
+      if (txnid != TXNID_ANY && has_multiple_owners()) {
+        remove_shared_owner(txnid);
+        return this;
+      } else {
+        return remove_root_of_subtree();
+      }
+    }
+    case keyrange::comparison::LESS_THAN:
+      child = m_left_child.get_locked();
+      invariant_notnull(child);
+      child = child->remove(range, txnid);
+
+      // unlock the child if there still is one.
+      // regardless, set the right child pointer
+      if (child) {
+        child->mutex_unlock();
+      }
+      m_left_child.set(child);
+      break;
+    case keyrange::comparison::GREATER_THAN:
+      child = m_right_child.get_locked();
+      invariant_notnull(child);
+      child = child->remove(range, txnid);
+
+      // unlock the child if there still is one.
+      // regardless, set the right child pointer
+      if (child) {
+        child->mutex_unlock();
+      }
+      m_right_child.set(child);
+      break;
+    case keyrange::comparison::OVERLAPS:
+      // shouldn't be overlapping, since the tree is
+      // non-overlapping and this range must exist
+      abort();
+  }
+
+  return this;
+}
+
+bool treenode::left_imbalanced(int threshold) const {
+  uint32_t left_depth = m_left_child.depth_est;
+  uint32_t right_depth = m_right_child.depth_est;
+  return m_left_child.ptr != nullptr && left_depth > threshold + right_depth;
+}
+
+bool treenode::right_imbalanced(int threshold) const {
+  uint32_t left_depth = m_left_child.depth_est;
+  uint32_t right_depth = m_right_child.depth_est;
+  return m_right_child.ptr != nullptr && right_depth > threshold + left_depth;
+}
+
+// effect: rebalances the subtree rooted at this node
+//         using AVL style O(1) rotations. unlocks this
+//         node if it is not the new root of the subtree.
+// requires: node is locked by this thread, children are not
+// returns: locked root node of the rebalanced tree
+treenode *treenode::maybe_rebalance(void) {
+  // if we end up not rotating at all, the new root is this
+  treenode *new_root = this;
+  treenode *child = nullptr;
+
+  if (left_imbalanced(IMBALANCE_THRESHOLD)) {
+    child = m_left_child.get_locked();
+    if (child->right_imbalanced(0)) {
+      treenode *grandchild = child->m_right_child.get_locked();
+
+      child->m_right_child = grandchild->m_left_child;
+      grandchild->m_left_child.set(child);
+
+      m_left_child = grandchild->m_right_child;
+      grandchild->m_right_child.set(this);
+
+      new_root = grandchild;
+    } else {
+      m_left_child = child->m_right_child;
+      child->m_right_child.set(this);
+      new_root = child;
+    }
+  } else if (right_imbalanced(IMBALANCE_THRESHOLD)) {
+    child = m_right_child.get_locked();
+    if (child->left_imbalanced(0)) {
+      treenode *grandchild = child->m_left_child.get_locked();
+
+      child->m_left_child = grandchild->m_right_child;
+      grandchild->m_right_child.set(child);
+
+      m_right_child = grandchild->m_left_child;
+      grandchild->m_left_child.set(this);
+
+      new_root = grandchild;
+    } else {
+      m_right_child = child->m_left_child;
+      child->m_left_child.set(this);
+      new_root = child;
+    }
+  }
+
+  // up to three nodes may be locked.
+  // - this
+  // - child
+  // - grandchild (but if it is locked, its the new root)
+  //
+  // one of them is the new root. we unlock everything except the new root.
+  if (child && child != new_root) {
+    TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&child->m_mutex);
+    child->mutex_unlock();
+  }
+  if (this != new_root) {
+    TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&m_mutex);
+    mutex_unlock();
+  }
+  TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(&new_root->m_mutex);
+  return new_root;
+}
+
+treenode *treenode::lock_and_rebalance_left(void) {
+  treenode *child = m_left_child.get_locked();
+  if (child) {
+    treenode *new_root = child->maybe_rebalance();
+    m_left_child.set(new_root);
+    child = new_root;
+  }
+  return child;
+}
+
+treenode *treenode::lock_and_rebalance_right(void) {
+  treenode *child = m_right_child.get_locked();
+  if (child) {
+    treenode *new_root = child->maybe_rebalance();
+    m_right_child.set(new_root);
+    child = new_root;
+  }
+  return child;
+}
+
+void treenode::child_ptr::set(treenode *node) {
+  ptr = node;
+  depth_est = ptr ? ptr->get_depth_estimate() : 0;
+}
+
+treenode *treenode::child_ptr::get_locked(void) {
+  if (ptr) {
+    ptr->mutex_lock();
+    depth_est = ptr->get_depth_estimate();
+  }
+  return ptr;
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h
new file mode 100644
index 000000000..ec25a8c58
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h
@@ -0,0 +1,302 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <string.h>
+
+#include "../ft/comparator.h"
+#include "../portability/memory.h"
+#include "../portability/toku_pthread.h"
+// PORT: we need LTM_STATUS
+#include "../ft/ft-status.h"
+#include "../portability/txn_subst.h"
+#include "keyrange.h"
+
+namespace toku {
+
+// a node in a tree with its own mutex
+// - range is the "key" of this node
+// - txnid is the single txnid associated with this node
+// - left and right children may be null
+//
+// to build a tree on top of this abstraction, the user:
+// - provides memory for a root node, initializes it via create_root()
+// - performs tree operations on the root node. memory management
+//   below the root node is handled by the abstraction, not the user.
+// this pattern:
+// - guaruntees a root node always exists.
+// - does not allow for rebalances on the root node
+
+class treenode {
+ public:
+  // every treenode function has some common requirements:
+  // - node is locked and children are never locked
+  // - node may be unlocked if no other thread has visibility
+
+  // effect: create the root node
+  void create_root(const comparator *cmp);
+
+  // effect: destroys the root node
+  void destroy_root(void);
+
+  // effect: sets the txnid and copies the given range for this node
+  void set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared);
+
+  // returns: true iff this node is marked as empty
+  bool is_empty(void);
+
+  // returns: true if this is the root node, denoted by a null parent
+  bool is_root(void);
+
+  // returns: true if the given range overlaps with this node's range
+  bool range_overlaps(const keyrange &range);
+
+  // effect: locks the node
+  void mutex_lock(void);
+
+  // effect: unlocks the node
+  void mutex_unlock(void);
+
+  // return: node whose child overlaps, or a child that is empty
+  //         and would contain range if it existed
+  // given: if cmp_hint is non-null, then it is a precomputed
+  //        comparison of this node's range to the given range.
+  treenode *find_node_with_overlapping_child(
+      const keyrange &range, const keyrange::comparison *cmp_hint);
+
+  // effect: performs an in-order traversal of the ranges that overlap the
+  //         given range, calling function->fn() on each node that does
+  // requires: function signature is: bool fn(const keyrange &range, TXNID
+  // txnid) requires: fn returns true to keep iterating, false to stop iterating
+  // requires: fn does not attempt to use any ranges read out by value
+  //           after removing a node with an overlapping range from the tree.
+  template <class F>
+  void traverse_overlaps(const keyrange &range, F *function) {
+    keyrange::comparison c = range.compare(*m_cmp, m_range);
+    if (c == keyrange::comparison::EQUALS) {
+      // Doesn't matter if fn wants to keep going, there
+      // is nothing left, so return.
+      function->fn(m_range, m_txnid, m_is_shared, m_owners);
+      return;
+    }
+
+    treenode *left = m_left_child.get_locked();
+    if (left) {
+      if (c != keyrange::comparison::GREATER_THAN) {
+        // Target range is less than this node, or it overlaps this
+        // node.  There may be something on the left.
+        left->traverse_overlaps(range, function);
+      }
+      left->mutex_unlock();
+    }
+
+    if (c == keyrange::comparison::OVERLAPS) {
+      bool keep_going = function->fn(m_range, m_txnid, m_is_shared, m_owners);
+      if (!keep_going) {
+        return;
+      }
+    }
+
+    treenode *right = m_right_child.get_locked();
+    if (right) {
+      if (c != keyrange::comparison::LESS_THAN) {
+        // Target range is greater than this node, or it overlaps this
+        // node.  There may be something on the right.
+        right->traverse_overlaps(range, function);
+      }
+      right->mutex_unlock();
+    }
+  }
+
+  // effect: inserts the given range and txnid into a subtree, recursively
+  // requires: range does not overlap with any node below the subtree
+  bool insert(const keyrange &range, TXNID txnid, bool is_shared);
+
+  // effect: removes the given range from the subtree
+  // requires: range exists in the subtree
+  // returns: the root of the resulting subtree
+  treenode *remove(const keyrange &range, TXNID txnid);
+
+  // effect: removes this node and all of its children, recursively
+  // requires: every node at and below this node is unlocked
+  void recursive_remove(void);
+
+ private:
+  // the child_ptr is a light abstraction for the locking of
+  // a child and the maintenence of its depth estimate.
+
+  struct child_ptr {
+    // set the child pointer
+    void set(treenode *node);
+
+    // get and lock this child if it exists
+    treenode *get_locked(void);
+
+    treenode *ptr;
+    uint32_t depth_est;
+  };
+
+  // the balance factor at which a node is considered imbalanced
+  static const int32_t IMBALANCE_THRESHOLD = 2;
+
+  // node-level mutex
+  toku_mutex_t m_mutex;
+
+  // the range and txnid for this node. the range contains a copy
+  // of the keys originally inserted into the tree. nodes may
+  // swap ranges. but at the end of the day, when a node is
+  // destroyed, it frees the memory associated with whatever range
+  // it has at the time of destruction.
+  keyrange m_range;
+
+  void remove_shared_owner(TXNID txnid);
+
+  bool has_multiple_owners() { return (m_txnid == TXNID_SHARED); }
+
+ private:
+  // Owner transaction id.
+  // A value of TXNID_SHARED means this node has multiple owners
+  TXNID m_txnid;
+
+  // If true, this lock is a non-exclusive lock, and it can have either
+  // one or several owners.
+  bool m_is_shared;
+
+  // List of the owners, or nullptr if there's just one owner.
+  TxnidVector *m_owners;
+
+  // two child pointers
+  child_ptr m_left_child;
+  child_ptr m_right_child;
+
+  // comparator for ranges
+  // psergey-todo: Is there any sense to store the comparator in each tree
+  // node?
+  const comparator *m_cmp;
+
+  // marked for the root node. the root node is never free()'d
+  // when removed, but instead marked as empty.
+  bool m_is_root;
+
+  // marked for an empty node. only valid for the root.
+  bool m_is_empty;
+
+  // effect: initializes an empty node with the given comparator
+  void init(const comparator *cmp);
+
+  // requires: this is a shared node (m_is_shared==true)
+  // effect: another transaction is added as an owner.
+  // returns: true <=> added another owner
+  //          false <=> this transaction is already an owner
+  bool add_shared_owner(TXNID txnid);
+
+  // requires: *parent is initialized to something meaningful.
+  // requires: subtree is non-empty
+  // returns: the leftmost child of the given subtree
+  // returns: a pointer to the parent of said child in *parent, only
+  //          if this function recurred, otherwise it is untouched.
+  treenode *find_leftmost_child(treenode **parent);
+
+  // requires: *parent is initialized to something meaningful.
+  // requires: subtree is non-empty
+  // returns: the rightmost child of the given subtree
+  // returns: a pointer to the parent of said child in *parent, only
+  //          if this function recurred, otherwise it is untouched.
+  treenode *find_rightmost_child(treenode **parent);
+
+  // effect: remove the root of this subtree, destroying the old root
+  // returns: the new root of the subtree
+  treenode *remove_root_of_subtree(void);
+
+  // requires: subtree is non-empty, direction is not 0
+  // returns: the child of the subtree at either the left or rightmost extreme
+  treenode *find_child_at_extreme(int direction, treenode **parent);
+
+  // effect: retrieves and possibly rebalances the left child
+  // returns: a locked left child, if it exists
+  treenode *lock_and_rebalance_left(void);
+
+  // effect: retrieves and possibly rebalances the right child
+  // returns: a locked right child, if it exists
+  treenode *lock_and_rebalance_right(void);
+
+  // returns: the estimated depth of this subtree
+  uint32_t get_depth_estimate(void) const;
+
+  // returns: true iff left subtree depth is sufficiently less than the right
+  bool left_imbalanced(int threshold) const;
+
+  // returns: true iff right subtree depth is sufficiently greater than the left
+  bool right_imbalanced(int threshold) const;
+
+  // effect: performs an O(1) rebalance, which will "heal" an imbalance by at
+  // most 1. effect: if the new root is not this node, then this node is
+  // unlocked. returns: locked node representing the new root of the rebalanced
+  // subtree
+  treenode *maybe_rebalance(void);
+
+  // returns: allocated treenode populated with a copy of the range and txnid
+  static treenode *alloc(const comparator *cmp, const keyrange &range,
+                         TXNID txnid, bool is_shared);
+
+  // requires: node is a locked root node, or an unlocked non-root node
+  static void free(treenode *node);
+
+  // effect: swaps the range/txnid pairs for node1 and node2.
+  static void swap_in_place(treenode *node1, treenode *node2);
+
+  friend class concurrent_tree_unit_test;
+};
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc
new file mode 100644
index 000000000..4caf1e26f
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc
@@ -0,0 +1,120 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "txnid_set.h"
+
+#include "../db.h"
+
+namespace toku {
+
+int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b);
+int find_by_txnid(const TXNID &txnid_a, const TXNID &txnid_b) {
+  if (txnid_a < txnid_b) {
+    return -1;
+  } else if (txnid_a == txnid_b) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+void txnid_set::create(void) {
+  // lazily allocate the underlying omt, since it is common
+  // to create a txnid set and never put anything in it.
+  m_txnids.create_no_array();
+}
+
+void txnid_set::destroy(void) { m_txnids.destroy(); }
+
+// Return true if the given transaction id is a member of the set.
+// Otherwise, return false.
+bool txnid_set::contains(TXNID txnid) const {
+  TXNID find_txnid;
+  int r = m_txnids.find_zero<TXNID, find_by_txnid>(txnid, &find_txnid, nullptr);
+  return r == 0 ? true : false;
+}
+
+// Add a given txnid to the set
+void txnid_set::add(TXNID txnid) {
+  int r = m_txnids.insert<TXNID, find_by_txnid>(txnid, txnid, nullptr);
+  invariant(r == 0 || r == DB_KEYEXIST);
+}
+
+// Delete a given txnid from the set.
+void txnid_set::remove(TXNID txnid) {
+  uint32_t idx;
+  int r = m_txnids.find_zero<TXNID, find_by_txnid>(txnid, nullptr, &idx);
+  if (r == 0) {
+    r = m_txnids.delete_at(idx);
+    invariant_zero(r);
+  }
+}
+
+// Return the size of the set
+uint32_t txnid_set::size(void) const { return m_txnids.size(); }
+
+// Get the ith id in the set, assuming that the set is sorted.
+TXNID txnid_set::get(uint32_t i) const {
+  TXNID txnid;
+  int r = m_txnids.fetch(i, &txnid);
+  if (r == EINVAL) /* Shouldn't happen, avoid compiler warning */
+    return TXNID_NONE;
+  invariant_zero(r);
+  return txnid;
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h
new file mode 100644
index 000000000..d79c24fb0
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h
@@ -0,0 +1,92 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../portability/txn_subst.h"
+#include "../util/omt.h"
+
+namespace toku {
+
+class txnid_set {
+ public:
+  // effect: Creates an empty set. Does not malloc space for
+  //         any entries yet. That is done lazily on add().
+  void create(void);
+
+  // effect: Destroy the set's internals.
+  void destroy(void);
+
+  // returns: True if the given txnid is a member of the set.
+  bool contains(TXNID id) const;
+
+  // effect: Adds a given txnid to the set if it did not exist
+  void add(TXNID txnid);
+
+  // effect: Deletes a txnid from the set if it exists.
+  void remove(TXNID txnid);
+
+  // returns: Size of the set
+  uint32_t size(void) const;
+
+  // returns: The "i'th" id in the set, as if it were sorted.
+  TXNID get(uint32_t i) const;
+
+ private:
+  toku::omt<TXNID> m_txnids;
+
+  friend class txnid_set_unit_test;
+};
+ENSURE_POD(txnid_set);
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc
new file mode 100644
index 000000000..24536c88e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc
@@ -0,0 +1,213 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "../db.h"
+#include "../portability/memory.h"
+// PORT #include <toku_assert.h>
+#include <memory.h>
+#include <string.h>
+
+#include "txnid_set.h"
+#include "wfg.h"
+
+namespace toku {
+
+// Create a lock request graph
+void wfg::create(void) { m_nodes.create(); }
+
+// Destroy the internals of the lock request graph
+void wfg::destroy(void) {
+  uint32_t n_nodes = m_nodes.size();
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    node *n;
+    int r = m_nodes.fetch(i, &n);
+    invariant_zero(r);
+    invariant_notnull(n);
+    if (r) continue;  // Get rid of "may be used uninitialized" warning
+    node::free(n);
+  }
+  m_nodes.destroy();
+}
+
+// Add an edge (a_id, b_id) to the graph
+void wfg::add_edge(TXNID a_txnid, TXNID b_txnid) {
+  node *a_node = find_create_node(a_txnid);
+  node *b_node = find_create_node(b_txnid);
+  a_node->edges.add(b_node->txnid);
+}
+
+// Return true if a node with the given transaction id exists in the graph.
+// Return false otherwise.
+bool wfg::node_exists(TXNID txnid) {
+  node *n = find_node(txnid);
+  return n != NULL;
+}
+
+bool wfg::cycle_exists_from_node(node *target, node *head,
+                                 std::function<void(TXNID)> reporter) {
+  bool cycle_found = false;
+  head->visited = true;
+  uint32_t n_edges = head->edges.size();
+  for (uint32_t i = 0; i < n_edges && !cycle_found; i++) {
+    TXNID edge_id = head->edges.get(i);
+    if (target->txnid == edge_id) {
+      cycle_found = true;
+      if (reporter) reporter(edge_id);
+    } else {
+      node *new_head = find_node(edge_id);
+      if (new_head && !new_head->visited) {
+        cycle_found = cycle_exists_from_node(target, new_head, reporter);
+        if (cycle_found && reporter) reporter(edge_id);
+      }
+    }
+  }
+  head->visited = false;
+  return cycle_found;
+}
+
+// Return true if there exists a cycle from a given transaction id in the graph.
+// Return false otherwise.
+bool wfg::cycle_exists_from_txnid(TXNID txnid,
+                                  std::function<void(TXNID)> reporter) {
+  node *a_node = find_node(txnid);
+  bool cycles_found = false;
+  if (a_node) {
+    cycles_found = cycle_exists_from_node(a_node, a_node, reporter);
+  }
+  return cycles_found;
+}
+
+// Apply a given function f to all of the nodes in the graph.  The apply
+// function returns when the function f is called for all of the nodes in the
+// graph, or the function f returns non-zero.
+void wfg::apply_nodes(int (*fn)(TXNID id, void *extra), void *extra) {
+  int r = 0;
+  uint32_t n_nodes = m_nodes.size();
+  for (uint32_t i = 0; i < n_nodes && r == 0; i++) {
+    node *n;
+    r = m_nodes.fetch(i, &n);
+    invariant_zero(r);
+    if (r) continue;  // Get rid of "may be used uninitialized" warning
+    r = fn(n->txnid, extra);
+  }
+}
+
+// Apply a given function f to all of the edges whose origin is a given node id.
+// The apply function returns when the function f is called for all edges in the
+// graph rooted at node id, or the function f returns non-zero.
+void wfg::apply_edges(TXNID txnid,
+                      int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra),
+                      void *extra) {
+  node *n = find_node(txnid);
+  if (n) {
+    int r = 0;
+    uint32_t n_edges = n->edges.size();
+    for (uint32_t i = 0; i < n_edges && r == 0; i++) {
+      r = fn(txnid, n->edges.get(i), extra);
+    }
+  }
+}
+
+// find node by id
+wfg::node *wfg::find_node(TXNID txnid) {
+  node *n = nullptr;
+  int r = m_nodes.find_zero<TXNID, find_by_txnid>(txnid, &n, nullptr);
+  invariant(r == 0 || r == DB_NOTFOUND);
+  return n;
+}
+
+// this is the omt comparison function
+// nodes are compared by their txnid.
+int wfg::find_by_txnid(node *const &node_a, const TXNID &txnid_b) {
+  TXNID txnid_a = node_a->txnid;
+  if (txnid_a < txnid_b) {
+    return -1;
+  } else if (txnid_a == txnid_b) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+// insert a new node
+wfg::node *wfg::find_create_node(TXNID txnid) {
+  node *n;
+  uint32_t idx;
+  int r = m_nodes.find_zero<TXNID, find_by_txnid>(txnid, &n, &idx);
+  if (r == DB_NOTFOUND) {
+    n = node::alloc(txnid);
+    r = m_nodes.insert_at(n, idx);
+    invariant_zero(r);
+  }
+  invariant_notnull(n);
+  return n;
+}
+
+wfg::node *wfg::node::alloc(TXNID txnid) {
+  node *XCALLOC(n);
+  n->txnid = txnid;
+  n->visited = false;
+  n->edges.create();
+  return n;
+}
+
+void wfg::node::free(wfg::node *n) {
+  n->edges.destroy();
+  toku_free(n);
+}
+
+} /* namespace toku */
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
new file mode 100644
index 000000000..804202170
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h
@@ -0,0 +1,124 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <functional>
+
+#include "../util/omt.h"
+#include "txnid_set.h"
+
+namespace toku {
+
+// A wfg is a 'wait-for' graph. A directed edge in represents one
+// txn waiting for another to finish before it can acquire a lock.
+
+class wfg {
+ public:
+  // Create a lock request graph
+  void create(void);
+
+  // Destroy the internals of the lock request graph
+  void destroy(void);
+
+  // Add an edge (a_id, b_id) to the graph
+  void add_edge(TXNID a_txnid, TXNID b_txnid);
+
+  // Return true if a node with the given transaction id exists in the graph.
+  // Return false otherwise.
+  bool node_exists(TXNID txnid);
+
+  // Return true if there exists a cycle from a given transaction id in the
+  // graph. Return false otherwise.
+  bool cycle_exists_from_txnid(TXNID txnid,
+                               std::function<void(TXNID)> reporter);
+
+  // Apply a given function f to all of the nodes in the graph.  The apply
+  // function returns when the function f is called for all of the nodes in the
+  // graph, or the function f returns non-zero.
+  void apply_nodes(int (*fn)(TXNID txnid, void *extra), void *extra);
+
+  // Apply a given function f to all of the edges whose origin is a given node
+  // id. The apply function returns when the function f is called for all edges
+  // in the graph rooted at node id, or the function f returns non-zero.
+  void apply_edges(TXNID txnid,
+                   int (*fn)(TXNID txnid, TXNID edge_txnid, void *extra),
+                   void *extra);
+
+ private:
+  struct node {
+    // txnid for this node and the associated set of edges
+    TXNID txnid;
+    txnid_set edges;
+    bool visited;
+
+    static node *alloc(TXNID txnid);
+
+    static void free(node *n);
+  };
+  ENSURE_POD(node);
+
+  toku::omt<node *> m_nodes;
+
+  node *find_node(TXNID txnid);
+
+  node *find_create_node(TXNID txnid);
+
+  bool cycle_exists_from_node(node *target, node *head,
+                              std::function<void(TXNID)> reporter);
+
+  static int find_by_txnid(node *const &node_a, const TXNID &txnid_b);
+};
+ENSURE_POD(wfg);
+
+} /* namespace toku */
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h
new file mode 100644
index 000000000..0a621f8e0
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h
@@ -0,0 +1,215 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stdlib.h>
+
+#include "toku_portability.h"
+
+/* Percona memory allocation functions and macros.
+ * These are functions for malloc and free */
+
+int toku_memory_startup(void) __attribute__((constructor));
+void toku_memory_shutdown(void) __attribute__((destructor));
+
+/* Generally: errno is set to 0 or a value to indicate problems. */
+
+// Everything should call toku_malloc() instead of malloc(), and toku_calloc()
+// instead of calloc() That way the tests can can, e.g.,  replace the malloc
+// function using toku_set_func_malloc().
+void *toku_calloc(size_t nmemb, size_t size)
+    __attribute__((__visibility__("default")));
+void *toku_xcalloc(size_t nmemb, size_t size)
+    __attribute__((__visibility__("default")));
+void *toku_malloc(size_t size) __attribute__((__visibility__("default")));
+void *toku_malloc_aligned(size_t alignment, size_t size)
+    __attribute__((__visibility__("default")));
+
+// xmalloc aborts instead of return NULL if we run out of memory
+void *toku_xmalloc(size_t size) __attribute__((__visibility__("default")));
+void *toku_xrealloc(void *, size_t size)
+    __attribute__((__visibility__("default")));
+void *toku_xmalloc_aligned(size_t alignment, size_t size)
+    __attribute__((__visibility__("default")));
+// Effect: Perform a os_malloc_aligned(size) with the additional property that
+// the returned pointer is a multiple of ALIGNMENT.
+//  Fail with a resource_assert if the allocation fails (don't return an error
+//  code). If the alloc_aligned function has been set then call it instead.
+// Requires: alignment is a power of two.
+
+void toku_free(void *) __attribute__((__visibility__("default")));
+
+size_t toku_malloc_usable_size(void *p)
+    __attribute__((__visibility__("default")));
+
+/* MALLOC is a macro that helps avoid a common error:
+ * Suppose I write
+ *    struct foo *x = malloc(sizeof(struct foo));
+ * That works fine.  But if I change it to this, I've probably made an mistake:
+ *    struct foo *x = malloc(sizeof(struct bar));
+ * It can get worse, since one might have something like
+ *    struct foo *x = malloc(sizeof(struct foo *))
+ * which looks reasonable, but it allocoates enough to hold a pointer instead of
+ * the amount needed for the struct. So instead, write struct foo *MALLOC(x);
+ * and you cannot go wrong.
+ */
+#define MALLOC(v) CAST_FROM_VOIDP(v, toku_malloc(sizeof(*v)))
+/* MALLOC_N is like calloc(Except no 0ing of data):  It makes an array.  Write
+ *   int *MALLOC_N(5,x);
+ * to make an array of 5 integers.
+ */
+#define MALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_malloc((n) * sizeof(*v)))
+#define MALLOC_N_ALIGNED(align, n, v) \
+  CAST_FROM_VOIDP(v, toku_malloc_aligned((align), (n) * sizeof(*v)))
+
+// CALLOC_N is like calloc with auto-figuring out size of members
+#define CALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v)))
+
+#define CALLOC(v) CALLOC_N(1, v)
+
+// XMALLOC macros are like MALLOC except they abort if the operation fails
+#define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v)))
+#define XMALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xmalloc((n) * sizeof(*v)))
+#define XCALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xcalloc((n), (sizeof(*v))))
+#define XCALLOC(v) XCALLOC_N(1, v)
+#define XREALLOC(v, s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s))
+#define XREALLOC_N(n, v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n) * sizeof(*v)))
+
+#define XMALLOC_N_ALIGNED(align, n, v) \
+  CAST_FROM_VOIDP(v, toku_xmalloc_aligned((align), (n) * sizeof(*v)))
+
+#define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src)))
+#define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len))
+
+// ZERO_ARRAY writes zeroes to a stack-allocated array
+#define ZERO_ARRAY(o)          \
+  do {                         \
+    memset((o), 0, sizeof(o)); \
+  } while (0)
+// ZERO_STRUCT writes zeroes to a stack-allocated struct
+#define ZERO_STRUCT(o)          \
+  do {                          \
+    memset(&(o), 0, sizeof(o)); \
+  } while (0)
+
+/* Copy memory.  Analogous to strdup() */
+void *toku_memdup(const void *v, size_t len);
+/* Toku-version of strdup.  Use this so that it calls toku_malloc() */
+char *toku_strdup(const char *s) __attribute__((__visibility__("default")));
+/* Toku-version of strndup.  Use this so that it calls toku_malloc() */
+char *toku_strndup(const char *s, size_t n)
+    __attribute__((__visibility__("default")));
+/* Copy memory.  Analogous to strdup() Crashes instead of returning NULL */
+void *toku_xmemdup(const void *v, size_t len)
+    __attribute__((__visibility__("default")));
+/* Toku-version of strdup.  Use this so that it calls toku_xmalloc()  Crashes
+ * instead of returning NULL */
+char *toku_xstrdup(const char *s) __attribute__((__visibility__("default")));
+
+void toku_malloc_cleanup(
+    void); /* Before exiting, call this function to free up any internal data
+              structures from toku_malloc.  Otherwise valgrind will complain of
+              memory leaks. */
+
+/* Check to see if everything malloc'd was free.  Might be a no-op depending on
+ * how memory.c is configured. */
+void toku_memory_check_all_free(void);
+/* Check to see if memory is "sane".  Might be a no-op.  Probably better to
+ * simply use valgrind. */
+void toku_do_memory_check(void);
+
+typedef void *(*malloc_fun_t)(size_t);
+typedef void (*free_fun_t)(void *);
+typedef void *(*realloc_fun_t)(void *, size_t);
+typedef void *(*malloc_aligned_fun_t)(size_t /*alignment*/, size_t /*size*/);
+typedef void *(*realloc_aligned_fun_t)(size_t /*alignment*/, void * /*pointer*/,
+                                       size_t /*size*/);
+
+void toku_set_func_malloc(malloc_fun_t f);
+void toku_set_func_xmalloc_only(malloc_fun_t f);
+void toku_set_func_malloc_only(malloc_fun_t f);
+void toku_set_func_realloc(realloc_fun_t f);
+void toku_set_func_xrealloc_only(realloc_fun_t f);
+void toku_set_func_realloc_only(realloc_fun_t f);
+void toku_set_func_free(free_fun_t f);
+
+typedef struct memory_status {
+  uint64_t malloc_count;   // number of malloc operations
+  uint64_t free_count;     // number of free operations
+  uint64_t realloc_count;  // number of realloc operations
+  uint64_t malloc_fail;    // number of malloc operations that failed
+  uint64_t realloc_fail;   // number of realloc operations that failed
+  uint64_t requested;      // number of bytes requested
+  uint64_t used;   // number of bytes used (requested + overhead), obtained from
+                   // malloc_usable_size()
+  uint64_t freed;  // number of bytes freed;
+  uint64_t max_requested_size;  // largest attempted allocation size
+  uint64_t last_failed_size;    // size of the last failed allocation attempt
+  volatile uint64_t
+      max_in_use;  // maximum memory footprint (used - freed), approximate (not
+                   // worth threadsafety overhead for exact)
+  const char *mallocator_version;
+  uint64_t mmap_threshold;
+} LOCAL_MEMORY_STATUS_S, *LOCAL_MEMORY_STATUS;
+
+void toku_memory_get_status(LOCAL_MEMORY_STATUS s);
+
+// Effect: Like toku_memory_footprint, except instead of passing p,
+//   we pass toku_malloc_usable_size(p).
+size_t toku_memory_footprint_given_usable_size(size_t touched, size_t usable);
+
+// Effect: Return an estimate how how much space an object is using, possibly by
+//   using toku_malloc_usable_size(p).
+//   If p is NULL then returns 0.
+size_t toku_memory_footprint(void *p, size_t touched);
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h
new file mode 100644
index 000000000..af47800fb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h
@@ -0,0 +1,39 @@
+//
+// A replacement for toku_assert.h
+//
+#pragma once
+
+#include <assert.h>
+#include <errno.h>
+
+#ifdef NDEBUG
+
+#define assert_zero(a) ((void)(a))
+#define invariant(a) ((void)(a))
+#define invariant_notnull(a) ((void)(a))
+#define invariant_zero(a) ((void)(a))
+
+#else
+
+#define assert_zero(a) assert((a) == 0)
+#define invariant(a) assert(a)
+#define invariant_notnull(a) assert(a)
+#define invariant_zero(a) assert_zero(a)
+
+#endif
+
+#define lazy_assert_zero(a) assert_zero(a)
+
+#define paranoid_invariant_zero(a) assert_zero(a)
+#define paranoid_invariant_notnull(a) assert(a)
+#define paranoid_invariant(a) assert(a)
+
+#define ENSURE_POD(type)                                                    \
+  static_assert(                                                            \
+      std::is_standard_layout<type>::value && std::is_trivial<type>::value, \
+      #type "isn't POD")
+
+inline int get_error_errno(void) {
+  invariant(errno);
+  return errno;
+}
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h
new file mode 100644
index 000000000..aaa2298fa
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h
@@ -0,0 +1,130 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include <portability/toku_config.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "toku_assert_subst.h"
+
+__attribute__((const, always_inline)) static inline intptr_t which_cache_line(
+    intptr_t addr) {
+  static const size_t assumed_cache_line_size = 64;
+  return addr / assumed_cache_line_size;
+}
+template <typename T>
+__attribute__((const, always_inline)) static inline bool crosses_boundary(
+    T *addr, size_t width) {
+  const intptr_t int_addr = reinterpret_cast<intptr_t>(addr);
+  const intptr_t last_byte = int_addr + width - 1;
+  return which_cache_line(int_addr) != which_cache_line(last_byte);
+}
+
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_fetch_and_add(T *addr,
+                                                                       U diff) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_fetch_and_add(addr, diff);
+}
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_add_and_fetch(T *addr,
+                                                                       U diff) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_add_and_fetch(addr, diff);
+}
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_fetch_and_sub(T *addr,
+                                                                       U diff) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_fetch_and_sub(addr, diff);
+}
+template <typename T, typename U>
+__attribute__((always_inline)) static inline T toku_sync_sub_and_fetch(T *addr,
+                                                                       U diff) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_sub_and_fetch(addr, diff);
+}
+template <typename T, typename U, typename V>
+__attribute__((always_inline)) static inline T toku_sync_val_compare_and_swap(
+    T *addr, U oldval, V newval) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_val_compare_and_swap(addr, oldval, newval);
+}
+template <typename T, typename U, typename V>
+__attribute__((always_inline)) static inline bool
+toku_sync_bool_compare_and_swap(T *addr, U oldval, V newval) {
+  paranoid_invariant(!crosses_boundary(addr, sizeof *addr));
+  return __sync_bool_compare_and_swap(addr, oldval, newval);
+}
+
+// in case you include this but not toku_portability.h
+#pragma GCC poison __sync_fetch_and_add
+#pragma GCC poison __sync_fetch_and_sub
+#pragma GCC poison __sync_fetch_and_or
+#pragma GCC poison __sync_fetch_and_and
+#pragma GCC poison __sync_fetch_and_xor
+#pragma GCC poison __sync_fetch_and_nand
+#pragma GCC poison __sync_add_and_fetch
+#pragma GCC poison __sync_sub_and_fetch
+#pragma GCC poison __sync_or_and_fetch
+#pragma GCC poison __sync_and_and_fetch
+#pragma GCC poison __sync_xor_and_fetch
+#pragma GCC poison __sync_nand_and_fetch
+#pragma GCC poison __sync_bool_compare_and_swap
+#pragma GCC poison __sync_val_compare_and_swap
+#pragma GCC poison __sync_synchronize
+#pragma GCC poison __sync_lock_test_and_set
+#pragma GCC poison __sync_release
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h
new file mode 100644
index 000000000..eb8291c1d
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h
@@ -0,0 +1,83 @@
+/*
+  A wrapper around ROCKSDB_NAMESPACE::TransactionDBMutexFactory-provided
+  condition and mutex that provides toku_pthread_*-like interface. The functions
+  are named
+
+    toku_external_{mutex|cond}_XXX
+
+  Lock Tree uses this mutex and condition for interruptible (long) lock waits.
+
+  (It also still uses toku_pthread_XXX calls for mutexes/conditions for
+   shorter waits on internal objects)
+*/
+
+#pragma once
+
+#include <pthread.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "toku_portability.h"
+
+using ROCKSDB_NAMESPACE::TransactionDBCondVar;
+using ROCKSDB_NAMESPACE::TransactionDBMutex;
+
+typedef std::shared_ptr<ROCKSDB_NAMESPACE::TransactionDBMutexFactory>
+    toku_external_mutex_factory_t;
+
+typedef std::shared_ptr<TransactionDBMutex> toku_external_mutex_t;
+typedef std::shared_ptr<TransactionDBCondVar> toku_external_cond_t;
+
+static inline void toku_external_cond_init(
+    toku_external_mutex_factory_t mutex_factory, toku_external_cond_t *cond) {
+  *cond = mutex_factory->AllocateCondVar();
+}
+
+inline void toku_external_cond_destroy(toku_external_cond_t *cond) {
+  cond->reset();  // this will destroy the managed object
+}
+
+inline void toku_external_cond_signal(toku_external_cond_t *cond) {
+  (*cond)->Notify();
+}
+
+inline void toku_external_cond_broadcast(toku_external_cond_t *cond) {
+  (*cond)->NotifyAll();
+}
+
+inline int toku_external_cond_timedwait(toku_external_cond_t *cond,
+                                        toku_external_mutex_t *mutex,
+                                        int64_t timeout_microsec) {
+  auto res = (*cond)->WaitFor(*mutex, timeout_microsec);
+  if (res.ok())
+    return 0;
+  else
+    return ETIMEDOUT;
+}
+
+inline void toku_external_mutex_init(toku_external_mutex_factory_t factory,
+                                     toku_external_mutex_t *mutex) {
+  // Use placement new: the memory has been allocated but constructor wasn't
+  // called
+  new (mutex) toku_external_mutex_t;
+  *mutex = factory->AllocateMutex();
+}
+
+inline void toku_external_mutex_lock(toku_external_mutex_t *mutex) {
+  (*mutex)->Lock();
+}
+
+inline int toku_external_mutex_trylock(toku_external_mutex_t *mutex) {
+  (*mutex)->Lock();
+  return 0;
+}
+
+inline void toku_external_mutex_unlock(toku_external_mutex_t *mutex) {
+  (*mutex)->UnLock();
+}
+
+inline void toku_external_mutex_destroy(toku_external_mutex_t *mutex) {
+  mutex->reset();  // this will destroy the managed object
+}
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h
new file mode 100644
index 000000000..c967e7177
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h
@@ -0,0 +1,286 @@
+/*======
+This file is part of PerconaFT.
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#pragma once
+
+#include <stdio.h>  // FILE
+
+// Performance instrumentation object identifier type
+typedef unsigned int pfs_key_t;
+
+enum class toku_instr_object_type { mutex, rwlock, cond, thread, file };
+
+struct PSI_file;
+
+struct TOKU_FILE {
+  /** The real file. */
+  FILE *file;
+  struct PSI_file *key;
+  TOKU_FILE() : file(nullptr), key(nullptr) {}
+};
+
+struct PSI_mutex;
+struct PSI_cond;
+struct PSI_rwlock;
+
+struct toku_mutex_t;
+struct toku_cond_t;
+struct toku_pthread_rwlock_t;
+
+class toku_instr_key;
+
+class toku_instr_probe_empty {
+ public:
+  explicit toku_instr_probe_empty(UU(const toku_instr_key &key)) {}
+
+  void start_with_source_location(UU(const char *src_file), UU(int src_line)) {}
+
+  void stop() {}
+};
+
+#define TOKU_PROBE_START(p) p->start_with_source_location(__FILE__, __LINE__)
+#define TOKU_PROBE_STOP(p) p->stop
+
+extern toku_instr_key toku_uninstrumented;
+
+#ifndef MYSQL_TOKUDB_ENGINE
+
+#include <pthread.h>
+
+class toku_instr_key {
+ public:
+  toku_instr_key(UU(toku_instr_object_type type), UU(const char *group),
+                 UU(const char *name)) {}
+
+  explicit toku_instr_key(UU(pfs_key_t key_id)) {}
+  // No-instrumentation constructor:
+  toku_instr_key() {}
+  ~toku_instr_key() {}
+};
+
+typedef toku_instr_probe_empty toku_instr_probe;
+
+enum class toku_instr_file_op {
+  file_stream_open,
+  file_create,
+  file_open,
+  file_delete,
+  file_rename,
+  file_read,
+  file_write,
+  file_sync,
+  file_stream_close,
+  file_close,
+  file_stat
+};
+
+struct PSI_file {};
+struct PSI_mutex {};
+
+struct toku_io_instrumentation {};
+
+inline int toku_pthread_create(UU(const toku_instr_key &key), pthread_t *thread,
+                               const pthread_attr_t *attr,
+                               void *(*start_routine)(void *), void *arg) {
+  return pthread_create(thread, attr, start_routine, arg);
+}
+
+inline void toku_instr_register_current_thread() {}
+
+inline void toku_instr_delete_current_thread() {}
+
+// Instrument file creation, opening, closing, and renaming
+inline void toku_instr_file_open_begin(UU(toku_io_instrumentation &io_instr),
+                                       UU(const toku_instr_key &key),
+                                       UU(toku_instr_file_op op),
+                                       UU(const char *name),
+                                       UU(const char *src_file),
+                                       UU(int src_line)) {}
+
+inline void toku_instr_file_stream_open_end(
+    UU(toku_io_instrumentation &io_instr), UU(TOKU_FILE &file)) {}
+
+inline void toku_instr_file_open_end(UU(toku_io_instrumentation &io_instr),
+                                     UU(int fd)) {}
+
+inline void toku_instr_file_name_close_begin(
+    UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key),
+    UU(toku_instr_file_op op), UU(const char *name), UU(const char *src_file),
+    UU(int src_line)) {}
+
+inline void toku_instr_file_stream_close_begin(
+    UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op),
+    UU(TOKU_FILE &file), UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_file_fd_close_begin(
+    UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op),
+    UU(int fd), UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_file_close_end(UU(toku_io_instrumentation &io_instr),
+                                      UU(int result)) {}
+
+inline void toku_instr_file_io_begin(UU(toku_io_instrumentation &io_instr),
+                                     UU(toku_instr_file_op op), UU(int fd),
+                                     UU(unsigned int count),
+                                     UU(const char *src_file),
+                                     UU(int src_line)) {}
+
+inline void toku_instr_file_name_io_begin(
+    UU(toku_io_instrumentation &io_instr), UU(const toku_instr_key &key),
+    UU(toku_instr_file_op op), UU(const char *name), UU(unsigned int count),
+    UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_file_stream_io_begin(
+    UU(toku_io_instrumentation &io_instr), UU(toku_instr_file_op op),
+    UU(TOKU_FILE &file), UU(unsigned int count), UU(const char *src_file),
+    UU(int src_line)) {}
+
+inline void toku_instr_file_io_end(UU(toku_io_instrumentation &io_instr),
+                                   UU(unsigned int count)) {}
+
+struct toku_mutex_t;
+
+struct toku_mutex_instrumentation {};
+
+inline PSI_mutex *toku_instr_mutex_init(UU(const toku_instr_key &key),
+                                        UU(toku_mutex_t &mutex)) {
+  return nullptr;
+}
+
+inline void toku_instr_mutex_destroy(UU(PSI_mutex *&mutex_instr)) {}
+
+inline void toku_instr_mutex_lock_start(
+    UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex),
+    UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_mutex_trylock_start(
+    UU(toku_mutex_instrumentation &mutex_instr), UU(toku_mutex_t &mutex),
+    UU(const char *src_file), UU(int src_line)) {}
+
+inline void toku_instr_mutex_lock_end(
+    UU(toku_mutex_instrumentation &mutex_instr),
+    UU(int pthread_mutex_lock_result)) {}
+
+inline void toku_instr_mutex_unlock(UU(PSI_mutex *mutex_instr)) {}
+
+struct toku_cond_instrumentation {};
+
+enum class toku_instr_cond_op {
+  cond_wait,
+  cond_timedwait,
+};
+
+inline PSI_cond *toku_instr_cond_init(UU(const toku_instr_key &key),
+                                      UU(toku_cond_t &cond)) {
+  return nullptr;
+}
+
+inline void toku_instr_cond_destroy(UU(PSI_cond *&cond_instr)) {}
+
+inline void toku_instr_cond_wait_start(
+    UU(toku_cond_instrumentation &cond_instr), UU(toku_instr_cond_op op),
+    UU(toku_cond_t &cond), UU(toku_mutex_t &mutex), UU(const char *src_file),
+    UU(int src_line)) {}
+
+inline void toku_instr_cond_wait_end(UU(toku_cond_instrumentation &cond_instr),
+                                     UU(int pthread_cond_wait_result)) {}
+
+inline void toku_instr_cond_signal(UU(toku_cond_t &cond)) {}
+
+inline void toku_instr_cond_broadcast(UU(toku_cond_t &cond)) {}
+
+#if 0
+// rw locks are not used 
+// rwlock instrumentation
+struct toku_rwlock_instrumentation {};
+
+inline PSI_rwlock *toku_instr_rwlock_init(UU(const toku_instr_key &key),
+                                          UU(toku_pthread_rwlock_t &rwlock)) {
+    return nullptr;
+}
+
+inline void toku_instr_rwlock_destroy(UU(PSI_rwlock *&rwlock_instr)) {}
+
+inline void toku_instr_rwlock_rdlock_wait_start(
+    UU(toku_rwlock_instrumentation &rwlock_instr),
+    UU(toku_pthread_rwlock_t &rwlock),
+    UU(const char *src_file),
+    UU(int src_line)) {}
+
+inline void toku_instr_rwlock_wrlock_wait_start(
+    UU(toku_rwlock_instrumentation &rwlock_instr),
+    UU(toku_pthread_rwlock_t &rwlock),
+    UU(const char *src_file),
+    UU(int src_line)) {}
+
+inline void toku_instr_rwlock_rdlock_wait_end(
+    UU(toku_rwlock_instrumentation &rwlock_instr),
+    UU(int pthread_rwlock_wait_result)) {}
+
+inline void toku_instr_rwlock_wrlock_wait_end(
+    UU(toku_rwlock_instrumentation &rwlock_instr),
+    UU(int pthread_rwlock_wait_result)) {}
+
+inline void toku_instr_rwlock_unlock(UU(toku_pthread_rwlock_t &rwlock)) {}
+#endif
+
+#else  // MYSQL_TOKUDB_ENGINE
+// There can be not only mysql but also mongodb or any other PFS stuff
+#include <toku_instr_mysql.h>
+#endif  // MYSQL_TOKUDB_ENGINE
+
+// Mutexes
+extern toku_instr_key manager_escalation_mutex_key;
+extern toku_instr_key manager_escalator_mutex_key;
+extern toku_instr_key manager_mutex_key;
+extern toku_instr_key treenode_mutex_key;
+extern toku_instr_key locktree_request_info_mutex_key;
+extern toku_instr_key locktree_request_info_retry_mutex_key;
+
+// condition vars
+extern toku_instr_key lock_request_m_wait_cond_key;
+extern toku_instr_key locktree_request_info_retry_cv_key;
+extern toku_instr_key manager_m_escalator_done_key;  // unused
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h
new file mode 100644
index 000000000..9a95b38bd
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h
@@ -0,0 +1,87 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#if defined(__clang__)
+#define constexpr_static_assert(a, b)
+#else
+#define constexpr_static_assert(a, b) static_assert(a, b)
+#endif
+
+// include here, before they get deprecated
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "toku_atomic.h"
+
+#if defined(__cplusplus)
+#include <type_traits>
+#endif
+
+#if defined(__cplusplus)
+// decltype() here gives a reference-to-pointer instead of just a pointer,
+// just use __typeof__
+#define CAST_FROM_VOIDP(name, value) name = static_cast<__typeof__(name)>(value)
+#else
+#define CAST_FROM_VOIDP(name, value) name = cast_to_typeof(name)(value)
+#endif
+
+#define UU(x) x __attribute__((__unused__))
+
+#include "toku_instrumentation.h"
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h
new file mode 100644
index 000000000..571b950e1
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h
@@ -0,0 +1,520 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <pthread.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "toku_portability.h"
+// PORT2: #include "toku_assert.h"
+
+// TODO: some things moved toku_instrumentation.h, not necessarily the best
+// place
+typedef pthread_attr_t toku_pthread_attr_t;
+typedef pthread_t toku_pthread_t;
+typedef pthread_mutex_t toku_pthread_mutex_t;
+typedef pthread_condattr_t toku_pthread_condattr_t;
+typedef pthread_cond_t toku_pthread_cond_t;
+typedef pthread_rwlockattr_t toku_pthread_rwlockattr_t;
+typedef pthread_key_t toku_pthread_key_t;
+typedef struct timespec toku_timespec_t;
+
+// TODO: break this include loop
+#include <pthread.h>
+typedef pthread_mutexattr_t toku_pthread_mutexattr_t;
+
+struct toku_mutex_t {
+  pthread_mutex_t pmutex;
+  struct PSI_mutex *psi_mutex; /* The performance schema instrumentation hook */
+#if defined(TOKU_PTHREAD_DEBUG)
+  pthread_t owner;  // = pthread_self(); // for debugging
+  bool locked;
+  bool valid;
+  pfs_key_t instr_key_id;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+};
+
+struct toku_cond_t {
+  pthread_cond_t pcond;
+  struct PSI_cond *psi_cond;
+#if defined(TOKU_PTHREAD_DEBUG)
+  pfs_key_t instr_key_id;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+};
+
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_COND_INITIALIZER \
+  { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr, .instr_key_id = 0 }
+#else
+#define TOKU_COND_INITIALIZER \
+  { .pcond = PTHREAD_COND_INITIALIZER, .psi_cond = nullptr }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+struct toku_pthread_rwlock_t {
+  pthread_rwlock_t rwlock;
+  struct PSI_rwlock *psi_rwlock;
+#if defined(TOKU_PTHREAD_DEBUG)
+  pfs_key_t instr_key_id;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+};
+
+typedef struct toku_mutex_aligned {
+  toku_mutex_t aligned_mutex __attribute__((__aligned__(64)));
+} toku_mutex_aligned_t;
+
+// Initializing with {} will fill in a struct with all zeros.
+// But you may also need a pragma to suppress the warnings, as follows
+//
+//   #pragma GCC diagnostic push
+//   #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+//   toku_mutex_t foo = ZERO_MUTEX_INITIALIZER;
+//   #pragma GCC diagnostic pop
+//
+// In general it will be a lot of busy work to make this codebase compile
+// cleanly with -Wmissing-field-initializers
+
+#define ZERO_MUTEX_INITIALIZER \
+  {}
+
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_MUTEX_INITIALIZER                                             \
+  {                                                                        \
+    .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \
+    .locked = false, .valid = true, .instr_key_id = 0                      \
+  }
+#else
+#define TOKU_MUTEX_INITIALIZER \
+  { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+// Darwin doesn't provide adaptive mutexes
+#if defined(__APPLE__)
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER                                    \
+  {                                                                        \
+    .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr, .owner = 0, \
+    .locked = false, .valid = true, .instr_key_id = 0                      \
+  }
+#else
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
+  { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+#else   // __FreeBSD__, __linux__, at least
+#if defined(__GLIBC__)
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_ADAPTIVE_NP
+#else
+// not all libc (e.g. musl) implement NP (Non-POSIX) attributes
+#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT
+#endif
+#if defined(TOKU_PTHREAD_DEBUG)
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER                                    \
+  {                                                                        \
+    .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr, \
+    .owner = 0, .locked = false, .valid = true, .instr_key_id = 0          \
+  }
+#else
+#define TOKU_ADAPTIVE_MUTEX_INITIALIZER \
+  { .pmutex = PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, .psi_mutex = nullptr }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+#endif  // defined(__APPLE__)
+
+// Different OSes implement mutexes as different amounts of nested structs.
+// C++ will fill out all missing values with zeroes if you provide at least one
+// zero, but it needs the right amount of nesting.
+#if defined(__FreeBSD__)
+#define ZERO_COND_INITIALIZER \
+  { 0 }
+#elif defined(__APPLE__)
+#define ZERO_COND_INITIALIZER \
+  {                           \
+    { 0 }                     \
+  }
+#else  // __linux__, at least
+#define ZERO_COND_INITIALIZER \
+  {}
+#endif
+
+static inline void toku_mutexattr_init(toku_pthread_mutexattr_t *attr) {
+  int r = pthread_mutexattr_init(attr);
+  assert_zero(r);
+}
+
+static inline void toku_mutexattr_settype(toku_pthread_mutexattr_t *attr,
+                                          int type) {
+  int r = pthread_mutexattr_settype(attr, type);
+  assert_zero(r);
+}
+
+static inline void toku_mutexattr_destroy(toku_pthread_mutexattr_t *attr) {
+  int r = pthread_mutexattr_destroy(attr);
+  assert_zero(r);
+}
+
+#if defined(TOKU_PTHREAD_DEBUG)
+static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex) {
+  invariant(mutex->locked);
+  invariant(mutex->owner == pthread_self());
+}
+#else
+static inline void toku_mutex_assert_locked(const toku_mutex_t *mutex
+                                            __attribute__((unused))) {}
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+// asserting that a mutex is unlocked only makes sense
+// if the calling thread can guaruntee that no other threads
+// are trying to lock this mutex at the time of the assertion
+//
+// a good example of this is a tree with mutexes on each node.
+// when a node is locked the caller knows that no other threads
+// can be trying to lock its childrens' mutexes. the children
+// are in one of two fixed states: locked or unlocked.
+#if defined(TOKU_PTHREAD_DEBUG)
+static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex) {
+  invariant(mutex->owner == 0);
+  invariant(!mutex->locked);
+}
+#else
+static inline void toku_mutex_assert_unlocked(toku_mutex_t *mutex
+                                              __attribute__((unused))) {}
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+#define toku_mutex_lock(M) \
+  toku_mutex_lock_with_source_location(M, __FILE__, __LINE__)
+
+static inline void toku_cond_init(toku_cond_t *cond,
+                                  const toku_pthread_condattr_t *attr) {
+  int r = pthread_cond_init(&cond->pcond, attr);
+  assert_zero(r);
+}
+
+#define toku_mutex_trylock(M) \
+  toku_mutex_trylock_with_source_location(M, __FILE__, __LINE__)
+
+inline void toku_mutex_unlock(toku_mutex_t *mutex) {
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->owner == pthread_self());
+  invariant(mutex->valid);
+  invariant(mutex->locked);
+  mutex->locked = false;
+  mutex->owner = 0;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+  toku_instr_mutex_unlock(mutex->psi_mutex);
+  int r = pthread_mutex_unlock(&mutex->pmutex);
+  assert_zero(r);
+}
+
+inline void toku_mutex_lock_with_source_location(toku_mutex_t *mutex,
+                                                 const char *src_file,
+                                                 int src_line) {
+  toku_mutex_instrumentation mutex_instr;
+  toku_instr_mutex_lock_start(mutex_instr, *mutex, src_file, src_line);
+
+  const int r = pthread_mutex_lock(&mutex->pmutex);
+  toku_instr_mutex_lock_end(mutex_instr, r);
+
+  assert_zero(r);
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->valid);
+  invariant(!mutex->locked);
+  invariant(mutex->owner == 0);
+  mutex->locked = true;
+  mutex->owner = pthread_self();
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+}
+
+inline int toku_mutex_trylock_with_source_location(toku_mutex_t *mutex,
+                                                   const char *src_file,
+                                                   int src_line) {
+  toku_mutex_instrumentation mutex_instr;
+  toku_instr_mutex_trylock_start(mutex_instr, *mutex, src_file, src_line);
+
+  const int r = pthread_mutex_lock(&mutex->pmutex);
+  toku_instr_mutex_lock_end(mutex_instr, r);
+
+#if defined(TOKU_PTHREAD_DEBUG)
+  if (r == 0) {
+    invariant(mutex->valid);
+    invariant(!mutex->locked);
+    invariant(mutex->owner == 0);
+    mutex->locked = true;
+    mutex->owner = pthread_self();
+  }
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+  return r;
+}
+
+#define toku_cond_wait(C, M) \
+  toku_cond_wait_with_source_location(C, M, __FILE__, __LINE__)
+
+#define toku_cond_timedwait(C, M, W) \
+  toku_cond_timedwait_with_source_location(C, M, W, __FILE__, __LINE__)
+
+inline void toku_cond_init(const toku_instr_key &key, toku_cond_t *cond,
+                           const pthread_condattr_t *attr) {
+  toku_instr_cond_init(key, *cond);
+  int r = pthread_cond_init(&cond->pcond, attr);
+  assert_zero(r);
+}
+
+inline void toku_cond_destroy(toku_cond_t *cond) {
+  toku_instr_cond_destroy(cond->psi_cond);
+  int r = pthread_cond_destroy(&cond->pcond);
+  assert_zero(r);
+}
+
+inline void toku_cond_wait_with_source_location(toku_cond_t *cond,
+                                                toku_mutex_t *mutex,
+                                                const char *src_file,
+                                                int src_line) {
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->locked);
+  mutex->locked = false;
+  mutex->owner = 0;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+  /* Instrumentation start */
+  toku_cond_instrumentation cond_instr;
+  toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_wait, *cond,
+                             *mutex, src_file, src_line);
+
+  /* Instrumented code */
+  const int r = pthread_cond_wait(&cond->pcond, &mutex->pmutex);
+
+  /* Instrumentation end */
+  toku_instr_cond_wait_end(cond_instr, r);
+
+  assert_zero(r);
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(!mutex->locked);
+  mutex->locked = true;
+  mutex->owner = pthread_self();
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+}
+
+inline int toku_cond_timedwait_with_source_location(toku_cond_t *cond,
+                                                    toku_mutex_t *mutex,
+                                                    toku_timespec_t *wakeup_at,
+                                                    const char *src_file,
+                                                    int src_line) {
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->locked);
+  mutex->locked = false;
+  mutex->owner = 0;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+
+  /* Instrumentation start */
+  toku_cond_instrumentation cond_instr;
+  toku_instr_cond_wait_start(cond_instr, toku_instr_cond_op::cond_timedwait,
+                             *cond, *mutex, src_file, src_line);
+
+  /* Instrumented code */
+  const int r = pthread_cond_timedwait(&cond->pcond, &mutex->pmutex, wakeup_at);
+
+  /* Instrumentation end */
+  toku_instr_cond_wait_end(cond_instr, r);
+
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(!mutex->locked);
+  mutex->locked = true;
+  mutex->owner = pthread_self();
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+  return r;
+}
+
+inline void toku_cond_signal(toku_cond_t *cond) {
+  toku_instr_cond_signal(*cond);
+  const int r = pthread_cond_signal(&cond->pcond);
+  assert_zero(r);
+}
+
+inline void toku_cond_broadcast(toku_cond_t *cond) {
+  toku_instr_cond_broadcast(*cond);
+  const int r = pthread_cond_broadcast(&cond->pcond);
+  assert_zero(r);
+}
+
+inline void toku_mutex_init(const toku_instr_key &key, toku_mutex_t *mutex,
+                            const toku_pthread_mutexattr_t *attr) {
+#if defined(TOKU_PTHREAD_DEBUG)
+  mutex->valid = true;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+  toku_instr_mutex_init(key, *mutex);
+  const int r = pthread_mutex_init(&mutex->pmutex, attr);
+  assert_zero(r);
+#if defined(TOKU_PTHREAD_DEBUG)
+  mutex->locked = false;
+  invariant(mutex->valid);
+  mutex->valid = true;
+  mutex->owner = 0;
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+}
+
+inline void toku_mutex_destroy(toku_mutex_t *mutex) {
+#if defined(TOKU_PTHREAD_DEBUG)
+  invariant(mutex->valid);
+  mutex->valid = false;
+  invariant(!mutex->locked);
+#endif  // defined(TOKU_PTHREAD_DEBUG)
+  toku_instr_mutex_destroy(mutex->psi_mutex);
+  int r = pthread_mutex_destroy(&mutex->pmutex);
+  assert_zero(r);
+}
+
+#define toku_pthread_rwlock_rdlock(RW) \
+  toku_pthread_rwlock_rdlock_with_source_location(RW, __FILE__, __LINE__)
+
+#define toku_pthread_rwlock_wrlock(RW) \
+  toku_pthread_rwlock_wrlock_with_source_location(RW, __FILE__, __LINE__)
+
+#if 0
+inline void toku_pthread_rwlock_init(
+    const toku_instr_key &key,
+    toku_pthread_rwlock_t *__restrict rwlock,
+    const toku_pthread_rwlockattr_t *__restrict attr) {
+    toku_instr_rwlock_init(key, *rwlock);
+    int r = pthread_rwlock_init(&rwlock->rwlock, attr);
+    assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_destroy(toku_pthread_rwlock_t *rwlock) {
+    toku_instr_rwlock_destroy(rwlock->psi_rwlock);
+    int r = pthread_rwlock_destroy(&rwlock->rwlock);
+    assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_rdlock_with_source_location(
+    toku_pthread_rwlock_t *rwlock,
+    const char *src_file,
+    uint src_line) {
+
+    /* Instrumentation start */
+    toku_rwlock_instrumentation rwlock_instr;
+    toku_instr_rwlock_rdlock_wait_start(
+        rwlock_instr, *rwlock, src_file, src_line);
+    /* Instrumented code */
+    const int r = pthread_rwlock_rdlock(&rwlock->rwlock);
+
+    /* Instrumentation end */
+    toku_instr_rwlock_rdlock_wait_end(rwlock_instr, r);
+
+    assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_wrlock_with_source_location(
+    toku_pthread_rwlock_t *rwlock,
+    const char *src_file,
+    uint src_line) {
+
+    /* Instrumentation start */
+    toku_rwlock_instrumentation rwlock_instr;
+    toku_instr_rwlock_wrlock_wait_start(
+        rwlock_instr, *rwlock, src_file, src_line);
+    /* Instrumented code */
+    const int r = pthread_rwlock_wrlock(&rwlock->rwlock);
+
+    /* Instrumentation end */
+    toku_instr_rwlock_wrlock_wait_end(rwlock_instr, r);
+
+    assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_rdunlock(toku_pthread_rwlock_t *rwlock) {
+    toku_instr_rwlock_unlock(*rwlock);
+    const int r = pthread_rwlock_unlock(&rwlock->rwlock);
+    assert_zero(r);
+}
+
+inline void toku_pthread_rwlock_wrunlock(toku_pthread_rwlock_t *rwlock) {
+    toku_instr_rwlock_unlock(*rwlock);
+    const int r = pthread_rwlock_unlock(&rwlock->rwlock);
+    assert_zero(r);
+}
+#endif
+
+static inline int toku_pthread_join(toku_pthread_t thread, void **value_ptr) {
+  return pthread_join(thread, value_ptr);
+}
+
+static inline int toku_pthread_detach(toku_pthread_t thread) {
+  return pthread_detach(thread);
+}
+
+static inline int toku_pthread_key_create(toku_pthread_key_t *key,
+                                          void (*destroyf)(void *)) {
+  return pthread_key_create(key, destroyf);
+}
+
+static inline int toku_pthread_key_delete(toku_pthread_key_t key) {
+  return pthread_key_delete(key);
+}
+
+static inline void *toku_pthread_getspecific(toku_pthread_key_t key) {
+  return pthread_getspecific(key);
+}
+
+static inline int toku_pthread_setspecific(toku_pthread_key_t key, void *data) {
+  return pthread_setspecific(key, data);
+}
+
+int toku_pthread_yield(void) __attribute__((__visibility__("default")));
+
+static inline toku_pthread_t toku_pthread_self(void) { return pthread_self(); }
+
+static inline void *toku_pthread_done(void *exit_value) {
+  toku_instr_delete_current_thread();
+  pthread_exit(exit_value);
+  return nullptr;  // Avoid compiler warning
+}
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h
new file mode 100644
index 000000000..3cb5b5790
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h
@@ -0,0 +1,179 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include <portability/toku_config.h>
+
+#ifdef HAVE_valgrind
+#undef USE_VALGRIND
+#define USE_VALGRIND 1
+#endif
+
+#if defined(__linux__) && USE_VALGRIND
+
+#include <valgrind/drd.h>
+#include <valgrind/helgrind.h>
+
+#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ANNOTATE_NEW_MEMORY(p, size)
+#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) \
+  VALGRIND_HG_ENABLE_CHECKING(p, size)
+#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) \
+  VALGRIND_HG_DISABLE_CHECKING(p, size)
+#define TOKU_DRD_IGNORE_VAR(v) DRD_IGNORE_VAR(v)
+#define TOKU_DRD_STOP_IGNORING_VAR(v) DRD_STOP_IGNORING_VAR(v)
+#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ANNOTATE_IGNORE_READS_BEGIN()
+#define TOKU_ANNOTATE_IGNORE_READS_END() ANNOTATE_IGNORE_READS_END()
+#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ANNOTATE_IGNORE_WRITES_BEGIN()
+#define TOKU_ANNOTATE_IGNORE_WRITES_END() ANNOTATE_IGNORE_WRITES_END()
+
+/*
+ * How to make helgrind happy about tree rotations and new mutex orderings:
+ *
+ * // Tell helgrind that we unlocked it so that the next call doesn't get a
+ * "destroyed a locked mutex" error.
+ * // Tell helgrind that we destroyed the mutex.
+ * VALGRIND_HG_MUTEX_UNLOCK_PRE(&locka);
+ * VALGRIND_HG_MUTEX_DESTROY_PRE(&locka);
+ *
+ * // And recreate it.  It would be better to simply be able to say that the
+ * order on these two can now be reversed, because this code forgets all the
+ * ordering information for this mutex.
+ * // Then tell helgrind that we have locked it again.
+ * VALGRIND_HG_MUTEX_INIT_POST(&locka, 0);
+ * VALGRIND_HG_MUTEX_LOCK_POST(&locka);
+ *
+ * When the ordering of two locks changes, we don't need tell Helgrind about do
+ * both locks.  Just one is good enough.
+ */
+
+#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex) \
+  VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex);                 \
+  VALGRIND_HG_MUTEX_DESTROY_PRE(mutex);                \
+  VALGRIND_HG_MUTEX_INIT_POST(mutex, 0);               \
+  VALGRIND_HG_MUTEX_LOCK_POST(mutex);
+
+#else  // !defined(__linux__) || !USE_VALGRIND
+
+#define NVALGRIND 1
+#define TOKU_ANNOTATE_NEW_MEMORY(p, size) ((void)0)
+#define TOKU_VALGRIND_HG_ENABLE_CHECKING(p, size) ((void)0)
+#define TOKU_VALGRIND_HG_DISABLE_CHECKING(p, size) ((void)0)
+#define TOKU_DRD_IGNORE_VAR(v)
+#define TOKU_DRD_STOP_IGNORING_VAR(v)
+#define TOKU_ANNOTATE_IGNORE_READS_BEGIN() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_READS_END() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_WRITES_BEGIN() ((void)0)
+#define TOKU_ANNOTATE_IGNORE_WRITES_END() ((void)0)
+#define TOKU_VALGRIND_RESET_MUTEX_ORDERING_INFO(mutex)
+#undef RUNNING_ON_VALGRIND
+#define RUNNING_ON_VALGRIND (0U)
+#endif
+
+// Valgrind 3.10.1 (and previous versions).
+// Problems with VALGRIND_HG_DISABLE_CHECKING and VALGRIND_HG_ENABLE_CHECKING.
+// Helgrind's implementation of disable and enable checking causes false races
+// to be reported.  In addition, the race report does not include ANY
+// information about the code that uses the helgrind disable and enable
+// functions.  Therefore, it is very difficult to figure out the cause of the
+// race. DRD does implement the disable and enable functions.
+
+// Problems with ANNOTATE_IGNORE_READS.
+// Helgrind does not implement ignore reads.
+// Annotate ignore reads is the way to inform DRD to ignore racy reads.
+
+// FT code uses unsafe reads in several places.  These unsafe reads have been
+// noted as valid since they use the toku_unsafe_fetch function. Unfortunately,
+// this causes helgrind to report erroneous data races which makes use of
+// helgrind problematic.
+
+// Unsafely fetch and return a `T' from src, telling drd to ignore
+// racey access to src for the next sizeof(*src) bytes
+template <typename T>
+T toku_unsafe_fetch(T *src) {
+  if (0)
+    TOKU_VALGRIND_HG_DISABLE_CHECKING(src,
+                                      sizeof *src);  // disabled, see comment
+  TOKU_ANNOTATE_IGNORE_READS_BEGIN();
+  T r = *src;
+  TOKU_ANNOTATE_IGNORE_READS_END();
+  if (0)
+    TOKU_VALGRIND_HG_ENABLE_CHECKING(src,
+                                     sizeof *src);  // disabled, see comment
+  return r;
+}
+
+template <typename T>
+T toku_unsafe_fetch(T &src) {
+  return toku_unsafe_fetch(&src);
+}
+
+// Unsafely set a `T' value into *dest from src, telling drd to ignore
+// racey access to dest for the next sizeof(*dest) bytes
+template <typename T>
+void toku_unsafe_set(T *dest, const T src) {
+  if (0)
+    TOKU_VALGRIND_HG_DISABLE_CHECKING(dest,
+                                      sizeof *dest);  // disabled, see comment
+  TOKU_ANNOTATE_IGNORE_WRITES_BEGIN();
+  *dest = src;
+  TOKU_ANNOTATE_IGNORE_WRITES_END();
+  if (0)
+    TOKU_VALGRIND_HG_ENABLE_CHECKING(dest,
+                                     sizeof *dest);  // disabled, see comment
+}
+
+template <typename T>
+void toku_unsafe_set(T &dest, const T src) {
+  toku_unsafe_set(&dest, src);
+}
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h
new file mode 100644
index 000000000..46111e7f0
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h
@@ -0,0 +1,193 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// PORT2: #include "toku_config.h"
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <time.h>
+#if defined(__powerpc__)
+#include <sys/platform/ppc.h>
+#endif
+
+#if 0
+static inline float toku_tdiff (struct timeval *a, struct timeval *b) {
+    return (float)((a->tv_sec - b->tv_sec) + 1e-6 * (a->tv_usec - b->tv_usec));
+}
+// PORT2: temporary:
+#define HAVE_CLOCK_REALTIME
+#if !defined(HAVE_CLOCK_REALTIME)
+// OS X does not have clock_gettime, we fake clockid_t for the interface, and we'll implement it with clock_get_time.
+typedef int clockid_t;
+// just something bogus, it doesn't matter, we just want to make sure we're
+// only supporting this mode because we're not sure we can support other modes
+// without a real clock_gettime()
+#define CLOCK_REALTIME 0x01867234
+#endif
+int toku_clock_gettime(clockid_t clk_id, struct timespec *ts) __attribute__((__visibility__("default")));
+#endif
+
+// *************** Performance timers ************************
+// What do you really want from a performance timer:
+//  (1) Can determine actual time of day from the performance time.
+//  (2) Time goes forward, never backward.
+//  (3) Same time on different processors (or even different machines).
+//  (4) Time goes forward at a constant rate (doesn't get faster and slower)
+//  (5) Portable.
+//  (6) Getting the time is cheap.
+// Unfortuately it seems tough to get Properties 1-5.  So we go for Property 6,,
+// but we abstract it. We offer a type tokutime_t which can hold the time. This
+// type can be subtracted to get a time difference. We can get the present time
+// cheaply. We can convert this type to seconds (but that can be expensive). The
+// implementation is to use RDTSC (hence we lose property 3: not portable).
+// Recent machines have constant_tsc in which case we get property (4).
+// Recent OSs on recent machines (that have RDTSCP) fix the per-processor clock
+// skew, so we get property (3). We get property 2 with RDTSC (as long as
+// there's not any skew). We don't even try to get propety 1, since we don't
+// need it. The decision here is that these times are really accurate only on
+// modern machines with modern OSs.
+typedef uint64_t tokutime_t;  // Time type used in by tokutek timers.
+
+#if 0
+// The value of tokutime_t is not specified here. 
+// It might be microseconds since 1/1/1970 (if gettimeofday() is
+// used), or clock cycles since boot (if rdtsc is used).  Or something
+// else.
+// Two tokutime_t values can be subtracted to get a time difference.
+// Use tokutime_to_seconds to that convert difference  to seconds.
+// We want get_tokutime() to be fast, but don't care so much about tokutime_to_seconds();
+//
+// For accurate time calculations do the subtraction in the right order:
+//   Right:  tokutime_to_seconds(t1-t2);
+//   Wrong   tokutime_to_seconds(t1)-toku_time_to_seconds(t2);
+// Doing it the wrong way is likely to result in loss of precision.
+// A double can hold numbers up to about 53 bits.  RDTSC which uses about 33 bits every second, so that leaves
+// 2^20 seconds from booting (about 2 weeks) before the RDTSC value cannot be represented accurately as a double.
+//
+double tokutime_to_seconds(tokutime_t)  __attribute__((__visibility__("default"))); // Convert tokutime to seconds.
+
+#endif
+
+// Get the value of tokutime for right now.  We want this to be fast, so we
+// expose the implementation as RDTSC.
+static inline tokutime_t toku_time_now(void) {
+#if defined(__x86_64__) || defined(__i386__)
+  uint32_t lo, hi;
+  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+  return (uint64_t)hi << 32 | lo;
+#elif defined(__aarch64__)
+  uint64_t result;
+  __asm __volatile__("mrs %[rt], cntvct_el0" : [rt] "=r"(result));
+  return result;
+#elif defined(__powerpc__)
+  return __ppc_get_timebase();
+#elif defined(__s390x__)
+  uint64_t result;
+  asm volatile("stckf %0" : "=Q"(result) : : "cc");
+  return result;
+#elif defined(__riscv) && __riscv_xlen == 32
+  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
+  // Implemented in assembly because Clang insisted on branching.
+  asm volatile(
+      "rdcycleh %0\n"
+      "rdcycle %1\n"
+      "rdcycleh %2\n"
+      "sub %0, %0, %2\n"
+      "seqz %0, %0\n"
+      "sub %0, zero, %0\n"
+      "and %1, %1, %0\n"
+      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
+  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
+#elif defined(__riscv) && __riscv_xlen == 64
+  uint64_t cycles;
+  asm volatile("rdcycle %0" : "=r"(cycles));
+  return cycles;
+#else
+#error No timer implementation for this platform
+#endif
+}
+
+static inline uint64_t toku_current_time_microsec(void) {
+  struct timeval t;
+  gettimeofday(&t, NULL);
+  return t.tv_sec * (1UL * 1000 * 1000) + t.tv_usec;
+}
+
+#if 0
+// sleep microseconds
+static inline void toku_sleep_microsec(uint64_t ms) {
+    struct timeval  t;
+
+    t.tv_sec = ms / 1000000;
+    t.tv_usec = ms % 1000000;
+
+    select(0, NULL, NULL, NULL, &t);
+}
+#endif
+
+/*
+  PORT: Usage of this file:
+
+  uint64_t toku_current_time_microsec()   // uses gettimeoday
+      is used to track how much time various operations took (for example, lock
+      escalation). (TODO: it is not clear why these operations are tracked with
+      microsecond precision while others use nanoseconds)
+
+  tokutime_t toku_time_now() // uses rdtsc
+      seems to be used for a very similar purpose. This has greater precision
+
+  RocksDB environment provides Env::Default()->NowMicros() and NowNanos() which
+  should be adequate substitutes.
+*/
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h
new file mode 100644
index 000000000..803914862
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h
@@ -0,0 +1,27 @@
+//
+// A substitute for ft/txn/txn.h
+//
+#pragma once
+
+#include <set>
+
+#include "../util/omt.h"
+
+typedef uint64_t TXNID;
+#define TXNID_NONE ((TXNID)0)
+
+// A set of transactions
+//  (TODO: consider using class toku::txnid_set. The reason for using STL
+//   container was that its API is easier)
+class TxnidVector : public std::set<TXNID> {
+ public:
+  bool contains(TXNID txnid) { return find(txnid) != end(); }
+};
+
+// A value for lock structures with a meaning "the lock is owned by multiple
+// transactions (and one has to check the TxnidVector to get their ids)
+#define TXNID_SHARED (TXNID(-1))
+
+// Auxiliary value meaning "any transaction id will do".  No real transaction
+// may have this is as id.
+#define TXNID_ANY (TXNID(-2))
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc
new file mode 100644
index 000000000..50dc879ce
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc
@@ -0,0 +1,132 @@
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+/*
+  This is a dump ground to make Lock Tree work without the rest of TokuDB.
+*/
+#include <string.h>
+
+#include "db.h"
+#include "ft/ft-status.h"
+#include "portability/memory.h"
+#include "util/dbt.h"
+
+// portability/os_malloc.cc
+
+void toku_free(void *p) { free(p); }
+
+void *toku_xmalloc(size_t size) { return malloc(size); }
+
+void *toku_xrealloc(void *v, size_t size) { return realloc(v, size); }
+
+void *toku_xmemdup(const void *v, size_t len) {
+  void *p = toku_xmalloc(len);
+  memcpy(p, v, len);
+  return p;
+}
+
+// TODO: what are the X-functions? Xcalloc, Xrealloc?
+void *toku_xcalloc(size_t nmemb, size_t size) { return calloc(nmemb, size); }
+
+// ft-ft-opts.cc:
+
+// locktree
+toku_instr_key lock_request_m_wait_cond_key;
+toku_instr_key manager_m_escalator_done_key;
+toku_instr_key locktree_request_info_mutex_key;
+toku_instr_key locktree_request_info_retry_mutex_key;
+toku_instr_key locktree_request_info_retry_cv_key;
+
+toku_instr_key treenode_mutex_key;
+toku_instr_key manager_mutex_key;
+toku_instr_key manager_escalation_mutex_key;
+toku_instr_key manager_escalator_mutex_key;
+
+// portability/memory.cc
+size_t toku_memory_footprint(void *, size_t touched) { return touched; }
+
+// ft/ft-status.c
+// PORT2: note: the @c parameter to TOKUFT_STATUS_INIT must not start with
+//   "TOKU"
+LTM_STATUS_S ltm_status;
+void LTM_STATUS_S::init() {
+  if (m_initialized) return;
+#define LTM_STATUS_INIT(k, c, t, l)                    \
+  TOKUFT_STATUS_INIT((*this), k, c, t, "locktree: " l, \
+                     TOKU_ENGINE_STATUS | TOKU_GLOBAL_STATUS)
+  LTM_STATUS_INIT(LTM_SIZE_CURRENT, LOCKTREE_MEMORY_SIZE, STATUS_UINT64,
+                  "memory size");
+  LTM_STATUS_INIT(LTM_SIZE_LIMIT, LOCKTREE_MEMORY_SIZE_LIMIT, STATUS_UINT64,
+                  "memory size limit");
+  LTM_STATUS_INIT(LTM_ESCALATION_COUNT, LOCKTREE_ESCALATION_NUM, STATUS_UINT64,
+                  "number of times lock escalation ran");
+  LTM_STATUS_INIT(LTM_ESCALATION_TIME, LOCKTREE_ESCALATION_SECONDS,
+                  STATUS_TOKUTIME, "time spent running escalation (seconds)");
+  LTM_STATUS_INIT(LTM_ESCALATION_LATEST_RESULT,
+                  LOCKTREE_LATEST_POST_ESCALATION_MEMORY_SIZE, STATUS_UINT64,
+                  "latest post-escalation memory size");
+  LTM_STATUS_INIT(LTM_NUM_LOCKTREES, LOCKTREE_OPEN_CURRENT, STATUS_UINT64,
+                  "number of locktrees open now");
+  LTM_STATUS_INIT(LTM_LOCK_REQUESTS_PENDING, LOCKTREE_PENDING_LOCK_REQUESTS,
+                  STATUS_UINT64, "number of pending lock requests");
+  LTM_STATUS_INIT(LTM_STO_NUM_ELIGIBLE, LOCKTREE_STO_ELIGIBLE_NUM,
+                  STATUS_UINT64, "number of locktrees eligible for the STO");
+  LTM_STATUS_INIT(LTM_STO_END_EARLY_COUNT, LOCKTREE_STO_ENDED_NUM,
+                  STATUS_UINT64,
+                  "number of times a locktree ended the STO early");
+  LTM_STATUS_INIT(LTM_STO_END_EARLY_TIME, LOCKTREE_STO_ENDED_SECONDS,
+                  STATUS_TOKUTIME, "time spent ending the STO early (seconds)");
+  LTM_STATUS_INIT(LTM_WAIT_COUNT, LOCKTREE_WAIT_COUNT, STATUS_UINT64,
+                  "number of wait locks");
+  LTM_STATUS_INIT(LTM_WAIT_TIME, LOCKTREE_WAIT_TIME, STATUS_UINT64,
+                  "time waiting for locks");
+  LTM_STATUS_INIT(LTM_LONG_WAIT_COUNT, LOCKTREE_LONG_WAIT_COUNT, STATUS_UINT64,
+                  "number of long wait locks");
+  LTM_STATUS_INIT(LTM_LONG_WAIT_TIME, LOCKTREE_LONG_WAIT_TIME, STATUS_UINT64,
+                  "long time waiting for locks");
+  LTM_STATUS_INIT(LTM_TIMEOUT_COUNT, LOCKTREE_TIMEOUT_COUNT, STATUS_UINT64,
+                  "number of lock timeouts");
+  LTM_STATUS_INIT(LTM_WAIT_ESCALATION_COUNT, LOCKTREE_WAIT_ESCALATION_COUNT,
+                  STATUS_UINT64, "number of waits on lock escalation");
+  LTM_STATUS_INIT(LTM_WAIT_ESCALATION_TIME, LOCKTREE_WAIT_ESCALATION_TIME,
+                  STATUS_UINT64, "time waiting on lock escalation");
+  LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_COUNT,
+                  LOCKTREE_LONG_WAIT_ESCALATION_COUNT, STATUS_UINT64,
+                  "number of long waits on lock escalation");
+  LTM_STATUS_INIT(LTM_LONG_WAIT_ESCALATION_TIME,
+                  LOCKTREE_LONG_WAIT_ESCALATION_TIME, STATUS_UINT64,
+                  "long time waiting on lock escalation");
+
+  m_initialized = true;
+#undef LTM_STATUS_INIT
+}
+void LTM_STATUS_S::destroy() {
+  if (!m_initialized) return;
+  for (int i = 0; i < LTM_STATUS_NUM_ROWS; ++i) {
+    if (status[i].type == STATUS_PARCOUNT) {
+      // PORT: TODO?? destroy_partitioned_counter(status[i].value.parcount);
+    }
+  }
+}
+
+int toku_keycompare(const void *key1, size_t key1len, const void *key2,
+                    size_t key2len) {
+  size_t comparelen = key1len < key2len ? key1len : key2len;
+  int c = memcmp(key1, key2, comparelen);
+  if (__builtin_expect(c != 0, 1)) {
+    return c;
+  } else {
+    if (key1len < key2len) {
+      return -1;
+    } else if (key1len > key2len) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+}
+
+int toku_builtin_compare_fun(const DBT *a, const DBT *b) {
+  return toku_keycompare(a->data, a->size, b->data, b->size);
+}
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc
new file mode 100644
index 000000000..63cc3a267
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc
@@ -0,0 +1,153 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "dbt.h"
+
+#include <string.h>
+
+#include "../db.h"
+#include "../portability/memory.h"
+
+DBT *toku_init_dbt(DBT *dbt) {
+  memset(dbt, 0, sizeof(*dbt));
+  return dbt;
+}
+
+DBT toku_empty_dbt(void) {
+  static const DBT empty_dbt = {.data = 0, .size = 0, .ulen = 0, .flags = 0};
+  return empty_dbt;
+}
+
+DBT *toku_init_dbt_flags(DBT *dbt, uint32_t flags) {
+  toku_init_dbt(dbt);
+  dbt->flags = flags;
+  return dbt;
+}
+
+void toku_destroy_dbt(DBT *dbt) {
+  switch (dbt->flags) {
+    case DB_DBT_MALLOC:
+    case DB_DBT_REALLOC:
+      toku_free(dbt->data);
+      toku_init_dbt(dbt);
+      break;
+  }
+}
+
+DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len) {
+  toku_init_dbt(dbt);
+  dbt->size = len;
+  dbt->data = (char *)k;
+  return dbt;
+}
+
+DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len) {
+  toku_init_dbt_flags(dbt, DB_DBT_MALLOC);
+  dbt->size = len;
+  dbt->data = toku_xmemdup(k, len);
+  return dbt;
+}
+
+DBT *toku_copyref_dbt(DBT *dst, const DBT src) {
+  dst->flags = 0;
+  dst->ulen = 0;
+  dst->size = src.size;
+  dst->data = src.data;
+  return dst;
+}
+
+DBT *toku_clone_dbt(DBT *dst, const DBT &src) {
+  return toku_memdup_dbt(dst, src.data, src.size);
+}
+
+void toku_sdbt_cleanup(struct simple_dbt *sdbt) {
+  if (sdbt->data) toku_free(sdbt->data);
+  memset(sdbt, 0, sizeof(*sdbt));
+}
+
+const DBT *toku_dbt_positive_infinity(void) {
+  static DBT positive_infinity_dbt = {
+      .data = 0, .size = 0, .ulen = 0, .flags = 0};  // port
+  return &positive_infinity_dbt;
+}
+
+const DBT *toku_dbt_negative_infinity(void) {
+  static DBT negative_infinity_dbt = {
+      .data = 0, .size = 0, .ulen = 0, .flags = 0};  // port
+  return &negative_infinity_dbt;
+}
+
+bool toku_dbt_is_infinite(const DBT *dbt) {
+  return dbt == toku_dbt_positive_infinity() ||
+         dbt == toku_dbt_negative_infinity();
+}
+
+bool toku_dbt_is_empty(const DBT *dbt) {
+  // can't have a null data field with a non-zero size
+  paranoid_invariant(dbt->data != nullptr || dbt->size == 0);
+  return dbt->data == nullptr;
+}
+
+int toku_dbt_infinite_compare(const DBT *a, const DBT *b) {
+  if (a == b) {
+    return 0;
+  } else if (a == toku_dbt_positive_infinity()) {
+    return 1;
+  } else if (b == toku_dbt_positive_infinity()) {
+    return -1;
+  } else if (a == toku_dbt_negative_infinity()) {
+    return -1;
+  } else {
+    invariant(b == toku_dbt_negative_infinity());
+    return 1;
+  }
+}
+
+bool toku_dbt_equals(const DBT *a, const DBT *b) {
+  if (!toku_dbt_is_infinite(a) && !toku_dbt_is_infinite(b)) {
+    return a->data == b->data && a->size == b->size;
+  } else {
+    // a or b is infinite, so they're equal if they are the same infinite
+    return a == b ? true : false;
+  }
+}
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h
new file mode 100644
index 000000000..d86c440f8
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h
@@ -0,0 +1,98 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "../db.h"
+
+// TODO: John
+// Document this API a little better so that DBT
+// memory management can be morm widely understood.
+
+DBT *toku_init_dbt(DBT *);
+
+// returns: an initialized but empty dbt (for which toku_dbt_is_empty() is true)
+DBT toku_empty_dbt(void);
+
+DBT *toku_init_dbt_flags(DBT *, uint32_t flags);
+
+void toku_destroy_dbt(DBT *);
+
+DBT *toku_fill_dbt(DBT *dbt, const void *k, size_t len);
+
+DBT *toku_memdup_dbt(DBT *dbt, const void *k, size_t len);
+
+DBT *toku_copyref_dbt(DBT *dst, const DBT src);
+
+DBT *toku_clone_dbt(DBT *dst, const DBT &src);
+
+void toku_sdbt_cleanup(struct simple_dbt *sdbt);
+
+// returns: special DBT pointer representing positive infinity
+const DBT *toku_dbt_positive_infinity(void);
+
+// returns: special DBT pointer representing negative infinity
+const DBT *toku_dbt_negative_infinity(void);
+
+// returns: true if the given dbt is either positive or negative infinity
+bool toku_dbt_is_infinite(const DBT *dbt);
+
+// returns: true if the given dbt has no data (ie: dbt->data == nullptr)
+bool toku_dbt_is_empty(const DBT *dbt);
+
+// effect: compares two potentially infinity-valued dbts
+// requires: at least one is infinite (assert otherwise)
+int toku_dbt_infinite_compare(const DBT *a, const DBT *b);
+
+// returns: true if the given dbts have the same data pointer and size
+bool toku_dbt_equals(const DBT *a, const DBT *b);
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h
new file mode 100644
index 000000000..158750fdb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h
@@ -0,0 +1,144 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <memory.h>
+
+//******************************************************************************
+//
+// Overview: A growable array is a little bit like std::vector except that
+//  it doesn't have constructors (hence can be used in static constructs, since
+//  the google style guide says no constructors), and it's a little simpler.
+// Operations:
+//   init and deinit (we don't have constructors and destructors).
+//   fetch_unchecked to get values out.
+//   store_unchecked to put values in.
+//   push to add an element at the end
+//   get_size to find out the size
+//   get_memory_size to find out how much memory the data stucture is using.
+//
+//******************************************************************************
+
+namespace toku {
+
+template <typename T>
+class GrowableArray {
+ public:
+  void init(void)
+  // Effect: Initialize the array to contain no elements.
+  {
+    m_array = NULL;
+    m_size = 0;
+    m_size_limit = 0;
+  }
+
+  void deinit(void)
+  // Effect: Deinitialize the array (freeing any memory it uses, for example).
+  {
+    toku_free(m_array);
+    m_array = NULL;
+    m_size = 0;
+    m_size_limit = 0;
+  }
+
+  T fetch_unchecked(size_t i) const
+  // Effect: Fetch the ith element.  If i is out of range, the system asserts.
+  {
+    return m_array[i];
+  }
+
+  void store_unchecked(size_t i, T v)
+  // Effect: Store v in the ith element.  If i is out of range, the system
+  // asserts.
+  {
+    paranoid_invariant(i < m_size);
+    m_array[i] = v;
+  }
+
+  void push(T v)
+  // Effect: Add v to the end of the array (increasing the size).  The amortized
+  // cost of this operation is constant. Implementation hint:  Double the size
+  // of the array when it gets too big so that the amortized cost stays
+  // constant.
+  {
+    if (m_size >= m_size_limit) {
+      if (m_array == NULL) {
+        m_size_limit = 1;
+      } else {
+        m_size_limit *= 2;
+      }
+      XREALLOC_N(m_size_limit, m_array);
+    }
+    m_array[m_size++] = v;
+  }
+
+  size_t get_size(void) const
+  // Effect: Return the number of elements in the array.
+  {
+    return m_size;
+  }
+  size_t memory_size(void) const
+  // Effect: Return the size (in bytes) that the array occupies in memory.  This
+  // is really only an estimate.
+  {
+    return sizeof(*this) + sizeof(T) * m_size_limit;
+  }
+
+ private:
+  T *m_array;
+  size_t m_size;
+  size_t m_size_limit;  // How much space is allocated in array.
+};
+
+}  // namespace toku
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc
new file mode 100644
index 000000000..0e7a9880b
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc
@@ -0,0 +1,201 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "memarena.h"
+
+#include <string.h>
+
+#include <algorithm>
+
+#include "../portability/memory.h"
+
+void memarena::create(size_t initial_size) {
+  _current_chunk = arena_chunk();
+  _other_chunks = nullptr;
+  _size_of_other_chunks = 0;
+  _footprint_of_other_chunks = 0;
+  _n_other_chunks = 0;
+
+  _current_chunk.size = initial_size;
+  if (_current_chunk.size > 0) {
+    XMALLOC_N(_current_chunk.size, _current_chunk.buf);
+  }
+}
+
+void memarena::destroy(void) {
+  if (_current_chunk.buf) {
+    toku_free(_current_chunk.buf);
+  }
+  for (int i = 0; i < _n_other_chunks; i++) {
+    toku_free(_other_chunks[i].buf);
+  }
+  if (_other_chunks) {
+    toku_free(_other_chunks);
+  }
+  _current_chunk = arena_chunk();
+  _other_chunks = nullptr;
+  _n_other_chunks = 0;
+}
+
+static size_t round_to_page(size_t size) {
+  const size_t page_size = 4096;
+  const size_t r = page_size + ((size - 1) & ~(page_size - 1));
+  assert((r & (page_size - 1)) == 0);  // make sure it's aligned
+  assert(r >= size);                   // make sure it's not too small
+  assert(r <
+         size + page_size);  // make sure we didn't grow by more than a page.
+  return r;
+}
+
+static const size_t MEMARENA_MAX_CHUNK_SIZE = 64 * 1024 * 1024;
+
+void *memarena::malloc_from_arena(size_t size) {
+  if (_current_chunk.buf == nullptr ||
+      _current_chunk.size < _current_chunk.used + size) {
+    // The existing block isn't big enough.
+    // Add the block to the vector of blocks.
+    if (_current_chunk.buf) {
+      invariant(_current_chunk.size > 0);
+      int old_n = _n_other_chunks;
+      XREALLOC_N(old_n + 1, _other_chunks);
+      _other_chunks[old_n] = _current_chunk;
+      _n_other_chunks = old_n + 1;
+      _size_of_other_chunks += _current_chunk.size;
+      _footprint_of_other_chunks +=
+          toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
+    }
+
+    // Make a new one. Grow the buffer size exponentially until we hit
+    // the max chunk size, but make it at least `size' bytes so the
+    // current allocation always fit.
+    size_t new_size =
+        std::min(MEMARENA_MAX_CHUNK_SIZE, 2 * _current_chunk.size);
+    if (new_size < size) {
+      new_size = size;
+    }
+    new_size = round_to_page(
+        new_size);  // at least size, but round to the next page size
+    XMALLOC_N(new_size, _current_chunk.buf);
+    _current_chunk.used = 0;
+    _current_chunk.size = new_size;
+  }
+  invariant(_current_chunk.buf != nullptr);
+
+  // allocate in the existing block.
+  char *p = _current_chunk.buf + _current_chunk.used;
+  _current_chunk.used += size;
+  return p;
+}
+
+void memarena::move_memory(memarena *dest) {
+  // Move memory to dest
+  XREALLOC_N(dest->_n_other_chunks + _n_other_chunks + 1, dest->_other_chunks);
+  dest->_size_of_other_chunks += _size_of_other_chunks + _current_chunk.size;
+  dest->_footprint_of_other_chunks +=
+      _footprint_of_other_chunks +
+      toku_memory_footprint(_current_chunk.buf, _current_chunk.used);
+  for (int i = 0; i < _n_other_chunks; i++) {
+    dest->_other_chunks[dest->_n_other_chunks++] = _other_chunks[i];
+  }
+  dest->_other_chunks[dest->_n_other_chunks++] = _current_chunk;
+
+  // Clear out this memarena's memory
+  toku_free(_other_chunks);
+  _current_chunk = arena_chunk();
+  _other_chunks = nullptr;
+  _size_of_other_chunks = 0;
+  _footprint_of_other_chunks = 0;
+  _n_other_chunks = 0;
+}
+
+size_t memarena::total_memory_size(void) const {
+  return sizeof(*this) + total_size_in_use() +
+         _n_other_chunks * sizeof(*_other_chunks);
+}
+
+size_t memarena::total_size_in_use(void) const {
+  return _size_of_other_chunks + _current_chunk.used;
+}
+
+size_t memarena::total_footprint(void) const {
+  return sizeof(*this) + _footprint_of_other_chunks +
+         toku_memory_footprint(_current_chunk.buf, _current_chunk.used) +
+         _n_other_chunks * sizeof(*_other_chunks);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+const void *memarena::chunk_iterator::current(size_t *used) const {
+  if (_chunk_idx < 0) {
+    *used = _ma->_current_chunk.used;
+    return _ma->_current_chunk.buf;
+  } else if (_chunk_idx < _ma->_n_other_chunks) {
+    *used = _ma->_other_chunks[_chunk_idx].used;
+    return _ma->_other_chunks[_chunk_idx].buf;
+  }
+  *used = 0;
+  return nullptr;
+}
+
+void memarena::chunk_iterator::next() { _chunk_idx++; }
+
+bool memarena::chunk_iterator::more() const {
+  if (_chunk_idx < 0) {
+    return _ma->_current_chunk.buf != nullptr;
+  }
+  return _chunk_idx < _ma->_n_other_chunks;
+}
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h
new file mode 100644
index 000000000..ddcc1144f
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h
@@ -0,0 +1,141 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <stdlib.h>
+
+/*
+ * A memarena is used to efficiently store a collection of objects that never
+ * move The pattern is allocate more and more stuff and free all of the items at
+ * once. The underlying memory will store 1 or more objects per chunk. Each
+ * chunk is contiguously laid out in memory but chunks are not necessarily
+ * contiguous with each other.
+ */
+class memarena {
+ public:
+  memarena()
+      : _current_chunk(arena_chunk()),
+        _other_chunks(nullptr),
+        _n_other_chunks(0),
+        _size_of_other_chunks(0),
+        _footprint_of_other_chunks(0) {}
+
+  // Effect: Create a memarena with the specified initial size
+  void create(size_t initial_size);
+
+  void destroy(void);
+
+  // Effect: Allocate some memory.  The returned value remains valid until the
+  // memarena is cleared or closed.
+  //  In case of ENOMEM, aborts.
+  void *malloc_from_arena(size_t size);
+
+  // Effect: Move all the memory from this memarena into DEST.
+  //         When SOURCE is closed the memory won't be freed.
+  //         When DEST is closed, the memory will be freed, unless DEST moves
+  //         its memory to another memarena...
+  void move_memory(memarena *dest);
+
+  // Effect: Calculate the amount of memory used by a memory arena.
+  size_t total_memory_size(void) const;
+
+  // Effect: Calculate the used space of the memory arena (ie: excludes unused
+  // space)
+  size_t total_size_in_use(void) const;
+
+  // Effect: Calculate the amount of memory used, according to
+  // toku_memory_footprint(),
+  //         which is a more expensive but more accurate count of memory used.
+  size_t total_footprint(void) const;
+
+  // iterator over the underlying chunks that store objects in the memarena.
+  // a chunk is represented by a pointer to const memory and a usable byte
+  // count.
+  class chunk_iterator {
+   public:
+    chunk_iterator(const memarena *ma) : _ma(ma), _chunk_idx(-1) {}
+
+    // returns: base pointer to the current chunk
+    //          *used set to the number of usable bytes
+    //          if more() is false, returns nullptr and *used = 0
+    const void *current(size_t *used) const;
+
+    // requires: more() is true
+    void next();
+
+    bool more() const;
+
+   private:
+    // -1 represents the 'initial' chunk in a memarena, ie: ma->_current_chunk
+    // >= 0 represents the i'th chunk in the ma->_other_chunks array
+    const memarena *_ma;
+    int _chunk_idx;
+  };
+
+ private:
+  struct arena_chunk {
+    arena_chunk() : buf(nullptr), used(0), size(0) {}
+    char *buf;
+    size_t used;
+    size_t size;
+  };
+
+  struct arena_chunk _current_chunk;
+  struct arena_chunk *_other_chunks;
+  int _n_other_chunks;
+  size_t _size_of_other_chunks;       // the buf_size of all the other chunks.
+  size_t _footprint_of_other_chunks;  // the footprint of all the other chunks.
+
+  friend class memarena_unit_test;
+};
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h
new file mode 100644
index 000000000..f208002d3
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h
@@ -0,0 +1,794 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <memory.h>
+#include <stdint.h>
+
+#include "../portability/toku_portability.h"
+#include "../portability/toku_race_tools.h"
+#include "growable_array.h"
+
+namespace toku {
+
+/**
+ * Order Maintenance Tree (OMT)
+ *
+ * Maintains a collection of totally ordered values, where each value has an
+ * integer weight. The OMT is a mutable datatype.
+ *
+ * The Abstraction:
+ *
+ * An OMT is a vector of values, $V$, where $|V|$ is the length of the vector.
+ * The vector is numbered from $0$ to $|V|-1$.
+ * Each value has a weight.  The weight of the $i$th element is denoted
+ * $w(V_i)$.
+ *
+ * We can create a new OMT, which is the empty vector.
+ *
+ * We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
+ *  $|V'|=1+|V|$       and
+ *
+ *   V'_j = V_j       if $j<i$
+ *          x         if $j=i$
+ *          V_{j-1}   if $j>i$.
+ *
+ * We can specify $i$ using a kind of function instead of as an integer.
+ * Let $b$ be a function mapping from values to nonzero integers, such that
+ * the signum of $b$ is monotically increasing.
+ * We can specify $i$ as the minimum integer such that $b(V_i)>0$.
+ *
+ * We look up a value using its index, or using a Heaviside function.
+ * For lookups, we allow $b$ to be zero for some values, and again the signum of
+ * $b$ must be monotonically increasing. When lookup up values, we can look up
+ *  $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$.   (With a
+ * special return code if no such value exists.) (Rationale:  Ordinarily we want
+ * $i$ to be unique.  But for various reasons we want to allow multiple zeros,
+ * and we want the smallest $i$ in that case.) $V_i$ where $i$ is the minimum
+ * integer such that $b(V_i)>0$.   (Or an indication that no such value exists.)
+ *  $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$.   (Or an
+ * indication that no such value exists.)
+ *
+ * When looking up a value using a Heaviside function, we get the value and its
+ * index.
+ *
+ * We can also split an OMT into two OMTs, splitting the weight of the values
+ * evenly. Find a value $j$ such that the values to the left of $j$ have about
+ * the same total weight as the values to the right of $j$. The resulting two
+ * OMTs contain the values to the left of $j$ and the values to the right of $j$
+ * respectively. All of the values from the original OMT go into one of the new
+ * OMTs. If the weights of the values don't split exactly evenly, then the
+ * implementation has the freedom to choose whether the new left OMT or the new
+ * right OMT is larger.
+ *
+ * Performance:
+ *  Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$
+ * calls to the Heaviside function. The memory required is O(|V|).
+ *
+ * Usage:
+ *  The omt is templated by two parameters:
+ *   - omtdata_t is what will be stored within the omt.  These could be pointers
+ * or real data types (ints, structs).
+ *   - omtdataout_t is what will be returned by find and related functions.  By
+ * default, it is the same as omtdata_t, but you can set it to (omtdata_t *). To
+ * create an omt which will store "TXNID"s, for example, it is a good idea to
+ * typedef the template: typedef omt<TXNID> txnid_omt_t; If you are storing
+ * structs, you may want to be able to get a pointer to the data actually stored
+ * in the omt (see find_zero).  To do this, use the second template parameter:
+ *   typedef omt<struct foo, struct foo *> foo_omt_t;
+ */
+
+namespace omt_internal {
+
+template <bool subtree_supports_marks>
+class subtree_templated {
+ private:
+  uint32_t m_index;
+
+ public:
+  static const uint32_t NODE_NULL = UINT32_MAX;
+  inline void set_to_null(void) { m_index = NODE_NULL; }
+
+  inline bool is_null(void) const { return NODE_NULL == this->get_index(); }
+
+  inline uint32_t get_index(void) const { return m_index; }
+
+  inline void set_index(uint32_t index) {
+    paranoid_invariant(index != NODE_NULL);
+    m_index = index;
+  }
+} __attribute__((__packed__, aligned(4)));
+
+template <>
+class subtree_templated<true> {
+ private:
+  uint32_t m_bitfield;
+  static const uint32_t MASK_INDEX = ~(((uint32_t)1) << 31);
+  static const uint32_t MASK_BIT = ((uint32_t)1) << 31;
+
+  inline void set_index_internal(uint32_t new_index) {
+    m_bitfield = (m_bitfield & MASK_BIT) | new_index;
+  }
+
+ public:
+  static const uint32_t NODE_NULL = INT32_MAX;
+  inline void set_to_null(void) { this->set_index_internal(NODE_NULL); }
+
+  inline bool is_null(void) const { return NODE_NULL == this->get_index(); }
+
+  inline uint32_t get_index(void) const {
+    TOKU_DRD_IGNORE_VAR(m_bitfield);
+    const uint32_t bits = m_bitfield;
+    TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+    return bits & MASK_INDEX;
+  }
+
+  inline void set_index(uint32_t index) {
+    paranoid_invariant(index < NODE_NULL);
+    this->set_index_internal(index);
+  }
+
+  inline bool get_bit(void) const {
+    TOKU_DRD_IGNORE_VAR(m_bitfield);
+    const uint32_t bits = m_bitfield;
+    TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+    return (bits & MASK_BIT) != 0;
+  }
+
+  inline void enable_bit(void) {
+    // These bits may be set by a thread with a write lock on some
+    // leaf, and the index can be read by another thread with a (read
+    // or write) lock on another thread.  Also, the has_marks_below
+    // bit can be set by two threads simultaneously.  Neither of these
+    // are real races, so if we are using DRD we should tell it to
+    // ignore these bits just while we set this bit.  If there were a
+    // race in setting the index, that would be a real race.
+    TOKU_DRD_IGNORE_VAR(m_bitfield);
+    m_bitfield |= MASK_BIT;
+    TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
+  }
+
+  inline void disable_bit(void) { m_bitfield &= MASK_INDEX; }
+} __attribute__((__packed__));
+
+template <typename omtdata_t, bool subtree_supports_marks>
+class omt_node_templated {
+ public:
+  omtdata_t value;
+  uint32_t weight;
+  subtree_templated<subtree_supports_marks> left;
+  subtree_templated<subtree_supports_marks> right;
+
+  // this needs to be in both implementations because we don't have
+  // a "static if" the caller can use
+  inline void clear_stolen_bits(void) {}
+};  // note: originally this class had __attribute__((__packed__, aligned(4)))
+
+template <typename omtdata_t>
+class omt_node_templated<omtdata_t, true> {
+ public:
+  omtdata_t value;
+  uint32_t weight;
+  subtree_templated<true> left;
+  subtree_templated<true> right;
+  inline bool get_marked(void) const { return left.get_bit(); }
+  inline void set_marked_bit(void) { return left.enable_bit(); }
+  inline void unset_marked_bit(void) { return left.disable_bit(); }
+
+  inline bool get_marks_below(void) const { return right.get_bit(); }
+  inline void set_marks_below_bit(void) {
+    // This function can be called by multiple threads.
+    // Checking first reduces cache invalidation.
+    if (!this->get_marks_below()) {
+      right.enable_bit();
+    }
+  }
+  inline void unset_marks_below_bit(void) { right.disable_bit(); }
+
+  inline void clear_stolen_bits(void) {
+    this->unset_marked_bit();
+    this->unset_marks_below_bit();
+  }
+};  // note: originally this class had __attribute__((__packed__, aligned(4)))
+
+}  // namespace omt_internal
+
+template <typename omtdata_t, typename omtdataout_t = omtdata_t,
+          bool supports_marks = false>
+class omt {
+ public:
+  /**
+   * Effect: Create an empty OMT.
+   * Performance: constant time.
+   */
+  void create(void);
+
+  /**
+   * Effect: Create an empty OMT with no internal allocated space.
+   * Performance: constant time.
+   * Rationale: In some cases we need a valid omt but don't want to malloc.
+   */
+  void create_no_array(void);
+
+  /**
+   * Effect: Create a OMT containing values.  The number of values is in
+   * numvalues. Stores the new OMT in *omtp. Requires: this has not been created
+   * yet Requires: values != NULL Requires: values is sorted Performance:
+   * time=O(numvalues) Rationale:    Normally to insert N values takes O(N lg N)
+   * amortized time. If the N values are known in advance, are sorted, and the
+   * structure is empty, we can batch insert them much faster.
+   */
+  __attribute__((nonnull)) void create_from_sorted_array(
+      const omtdata_t *const values, const uint32_t numvalues);
+
+  /**
+   * Effect: Create an OMT containing values.  The number of values is in
+   * numvalues. On success the OMT takes ownership of *values array, and sets
+   * values=NULL. Requires: this has not been created yet Requires: values !=
+   * NULL Requires: *values is sorted Requires: *values was allocated with
+   * toku_malloc Requires: Capacity of the *values array is <= new_capacity
+   * Requires: On success, *values may not be accessed again by the caller.
+   * Performance:  time=O(1)
+   * Rational:     create_from_sorted_array takes O(numvalues) time.
+   *               By taking ownership of the array, we save a malloc and
+   * memcpy, and possibly a free (if the caller is done with the array).
+   */
+  void create_steal_sorted_array(omtdata_t **const values,
+                                 const uint32_t numvalues,
+                                 const uint32_t new_capacity);
+
+  /**
+   * Effect: Create a new OMT, storing it in *newomt.
+   *  The values to the right of index (starting at index) are moved to *newomt.
+   * Requires: newomt != NULL
+   * Returns
+   *    0             success,
+   *    EINVAL        if index > toku_omt_size(omt)
+   * On nonzero return, omt and *newomt are unmodified.
+   * Performance: time=O(n)
+   * Rationale:  We don't need a split-evenly operation.  We need to split items
+   * so that their total sizes are even, and other similar splitting criteria.
+   * It's easy to split evenly by calling size(), and dividing by two.
+   */
+  __attribute__((nonnull)) int split_at(omt *const newomt, const uint32_t idx);
+
+  /**
+   * Effect: Appends leftomt and rightomt to produce a new omt.
+   *  Creates this as the new omt.
+   *  leftomt and rightomt are destroyed.
+   * Performance: time=O(n) is acceptable, but one can imagine implementations
+   * that are O(\log n) worst-case.
+   */
+  __attribute__((nonnull)) void merge(omt *const leftomt, omt *const rightomt);
+
+  /**
+   * Effect: Creates a copy of an omt.
+   *  Creates this as the clone.
+   *  Each element is copied directly.  If they are pointers, the underlying
+   * data is not duplicated. Performance: O(n) or the running time of
+   * fill_array_with_subtree_values()
+   */
+  void clone(const omt &src);
+
+  /**
+   * Effect: Set the tree to be empty.
+   *  Note: Will not reallocate or resize any memory.
+   * Performance: time=O(1)
+   */
+  void clear(void);
+
+  /**
+   * Effect:  Destroy an OMT, freeing all its memory.
+   *   If the values being stored are pointers, their underlying data is not
+   * freed.  See free_items() Those values may be freed before or after calling
+   * toku_omt_destroy. Rationale: Returns no values since free() cannot fail.
+   * Rationale: Does not free the underlying pointers to reduce complexity.
+   * Performance:  time=O(1)
+   */
+  void destroy(void);
+
+  /**
+   * Effect: return |this|.
+   * Performance:  time=O(1)
+   */
+  uint32_t size(void) const;
+
+  /**
+   * Effect:  Insert value into the OMT.
+   *   If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
+   *   Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
+   *      If no such i exists, then let i be |V|
+   *   Then this has the same effect as
+   *    insert_at(tree, value, i);
+   *   If idx!=NULL then i is stored in *idx
+   * Requires:  The signum of h must be monotonically increasing.
+   * Returns:
+   *    0            success
+   *    DB_KEYEXIST  the key is present (h was equal to zero for some value)
+   * On nonzero return, omt is unchanged.
+   * Performance: time=O(\log N) amortized.
+   * Rationale: Some future implementation may be O(\log N) worst-case time, but
+   * O(\log N) amortized is good enough for now.
+   */
+  template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+  int insert(const omtdata_t &value, const omtcmp_t &v, uint32_t *const idx);
+
+  /**
+   * Effect: Increases indexes of all items at slot >= idx by 1.
+   *         Insert value into the position at idx.
+   * Returns:
+   *   0         success
+   *   EINVAL    if idx > this->size()
+   * On error, omt is unchanged.
+   * Performance: time=O(\log N) amortized time.
+   * Rationale: Some future implementation may be O(\log N) worst-case time, but
+   * O(\log N) amortized is good enough for now.
+   */
+  int insert_at(const omtdata_t &value, const uint32_t idx);
+
+  /**
+   * Effect:  Replaces the item at idx with value.
+   * Returns:
+   *   0       success
+   *   EINVAL  if idx>=this->size()
+   * On error, omt is unchanged.
+   * Performance: time=O(\log N)
+   * Rationale: The FT needs to be able to replace a value with another copy of
+   * the same value (allocated in a different location)
+   *
+   */
+  int set_at(const omtdata_t &value, const uint32_t idx);
+
+  /**
+   * Effect: Delete the item in slot idx.
+   *         Decreases indexes of all items at slot > idx by 1.
+   * Returns
+   *     0            success
+   *     EINVAL       if idx>=this->size()
+   * On error, omt is unchanged.
+   * Rationale: To delete an item, first find its index using find or find_zero,
+   * then delete it. Performance: time=O(\log N) amortized.
+   */
+  int delete_at(const uint32_t idx);
+
+  /**
+   * Effect:  Iterate over the values of the omt, from left to right, calling f
+   * on each value. The first argument passed to f is a ref-to-const of the
+   * value stored in the omt. The second argument passed to f is the index of
+   * the value. The third argument passed to f is iterate_extra. The indices run
+   * from 0 (inclusive) to this->size() (exclusive). Requires: f != NULL
+   * Returns:
+   *  If f ever returns nonzero, then the iteration stops, and the value
+   * returned by f is returned by iterate. If f always returns zero, then
+   * iterate returns 0. Requires:  Don't modify the omt while running.  (E.g., f
+   * may not insert or delete values from the omt.) Performance: time=O(i+\log
+   * N) where i is the number of times f is called, and N is the number of
+   * elements in the omt. Rationale: Although the functional iterator requires
+   * defining another function (as opposed to C++ style iterator), it is much
+   * easier to read. Rationale: We may at some point use functors, but for now
+   * this is a smaller change from the old OMT.
+   */
+  template <typename iterate_extra_t,
+            int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+  int iterate(iterate_extra_t *const iterate_extra) const;
+
+  /**
+   * Effect:  Iterate over the values of the omt, from left to right, calling f
+   * on each value. The first argument passed to f is a ref-to-const of the
+   * value stored in the omt. The second argument passed to f is the index of
+   * the value. The third argument passed to f is iterate_extra. The indices run
+   * from 0 (inclusive) to this->size() (exclusive). We will iterate only over
+   * [left,right)
+   *
+   * Requires: left <= right
+   * Requires: f != NULL
+   * Returns:
+   *  EINVAL  if right > this->size()
+   *  If f ever returns nonzero, then the iteration stops, and the value
+   * returned by f is returned by iterate_on_range. If f always returns zero,
+   * then iterate_on_range returns 0. Requires:  Don't modify the omt while
+   * running.  (E.g., f may not insert or delete values from the omt.)
+   * Performance: time=O(i+\log N) where i is the number of times f is called,
+   * and N is the number of elements in the omt. Rational: Although the
+   * functional iterator requires defining another function (as opposed to C++
+   * style iterator), it is much easier to read.
+   */
+  template <typename iterate_extra_t,
+            int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+  int iterate_on_range(const uint32_t left, const uint32_t right,
+                       iterate_extra_t *const iterate_extra) const;
+
+  /**
+   * Effect: Iterate over the values of the omt, and mark the nodes that are
+   * visited. Other than the marks, this behaves the same as iterate_on_range.
+   * Requires: supports_marks == true
+   * Performance: time=O(i+\log N) where i is the number of times f is called,
+   * and N is the number of elements in the omt. Notes: This function MAY be
+   * called concurrently by multiple threads, but not concurrently with any
+   * other non-const function.
+   */
+  template <typename iterate_extra_t,
+            int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+  int iterate_and_mark_range(const uint32_t left, const uint32_t right,
+                             iterate_extra_t *const iterate_extra);
+
+  /**
+   * Effect: Iterate over the values of the omt, from left to right, calling f
+   * on each value whose node has been marked. Other than the marks, this
+   * behaves the same as iterate. Requires: supports_marks == true Performance:
+   * time=O(i+\log N) where i is the number of times f is called, and N is the
+   * number of elements in the omt.
+   */
+  template <typename iterate_extra_t,
+            int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+  int iterate_over_marked(iterate_extra_t *const iterate_extra) const;
+
+  /**
+   * Effect: Delete all elements from the omt, whose nodes have been marked.
+   * Requires: supports_marks == true
+   * Performance: time=O(N + i\log N) where i is the number of marked elements,
+   * {c,sh}ould be faster
+   */
+  void delete_all_marked(void);
+
+  /**
+   * Effect: Verify that the internal state of the marks in the tree are
+   * self-consistent. Crashes the system if the marks are in a bad state.
+   * Requires: supports_marks == true
+   * Performance: time=O(N)
+   * Notes:
+   *  Even though this is a const function, it requires exclusive access.
+   * Rationale:
+   *  The current implementation of the marks relies on a sort of
+   *  "cache" bit representing the state of bits below it in the tree.
+   *  This allows glass-box testing that these bits are correct.
+   */
+  void verify_marks_consistent(void) const;
+
+  /**
+   * Effect: None
+   * Returns whether there are any marks in the tree.
+   */
+  bool has_marks(void) const;
+
+  /**
+   * Effect:  Iterate over the values of the omt, from left to right, calling f
+   * on each value. The first argument passed to f is a pointer to the value
+   * stored in the omt. The second argument passed to f is the index of the
+   * value. The third argument passed to f is iterate_extra. The indices run
+   * from 0 (inclusive) to this->size() (exclusive). Requires: same as for
+   * iterate() Returns: same as for iterate() Performance: same as for iterate()
+   * Rationale: In general, most iterators should use iterate() since they
+   * should not modify the data stored in the omt.  This function is for
+   * iterators which need to modify values (for example, free_items). Rationale:
+   * We assume if you are transforming the data in place, you want to do it to
+   * everything at once, so there is not yet an iterate_on_range_ptr (but there
+   * could be).
+   */
+  template <typename iterate_extra_t,
+            int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+  void iterate_ptr(iterate_extra_t *const iterate_extra);
+
+  /**
+   * Effect: Set *value=V_idx
+   * Returns
+   *    0             success
+   *    EINVAL        if index>=toku_omt_size(omt)
+   * On nonzero return, *value is unchanged
+   * Performance: time=O(\log N)
+   */
+  int fetch(const uint32_t idx, omtdataout_t *const value) const;
+
+  /**
+   * Effect:  Find the smallest i such that h(V_i, extra)>=0
+   *  If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value =
+   * V_i, and return 0. If there is such an i and h(V_i,extra)>0  then set
+   * *idxp=i and return DB_NOTFOUND. If there is no such i then set
+   * *idx=this->size() and return DB_NOTFOUND. Note: value is of type
+   * omtdataout_t, which may be of type (omtdata_t) or (omtdata_t *) but is
+   * fixed by the instantiation. If it is the value type, then the value is
+   * copied out (even if the value type is a pointer to something else) If it is
+   * the pointer type, then *value is set to a pointer to the data within the
+   * omt. This is determined by the type of the omt as initially declared. If
+   * the omt is declared as omt<foo_t>, then foo_t's will be stored and foo_t's
+   * will be returned by find and related functions. If the omt is declared as
+   * omt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the
+   * stored items will be returned by find and related functions. Rationale:
+   *  Structs too small for malloc should be stored directly in the omt.
+   *  These structs may need to be edited as they exist inside the omt, so we
+   * need a way to get a pointer within the omt. Using separate functions for
+   * returning pointers and values increases code duplication and reduces
+   * type-checking. That also reduces the ability of the creator of a data
+   * structure to give advice to its future users. Slight overloading in this
+   * case seemed to provide a better API and better type checking.
+   */
+  template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+  int find_zero(const omtcmp_t &extra, omtdataout_t *const value,
+                uint32_t *const idxp) const;
+
+  /**
+   *   Effect:
+   *    If direction >0 then find the smallest i such that h(V_i,extra)>0.
+   *    If direction <0 then find the largest  i such that h(V_i,extra)<0.
+   *    (Direction may not be equal to zero.)
+   *    If value!=NULL then store V_i in *value
+   *    If idxp!=NULL then store i in *idxp.
+   *   Requires: The signum of h is monotically increasing.
+   *   Returns
+   *      0             success
+   *      DB_NOTFOUND   no such value is found.
+   *   On nonzero return, *value and *idxp are unchanged
+   *   Performance: time=O(\log N)
+   *   Rationale:
+   *     Here's how to use the find function to find various things
+   *       Cases for find:
+   *        find first value:         ( h(v)=+1, direction=+1 )
+   *        find last value           ( h(v)=-1, direction=-1 )
+   *        find first X              ( h(v)=(v< x) ? -1 : 1    direction=+1 )
+   *        find last X               ( h(v)=(v<=x) ? -1 : 1    direction=-1 )
+   *        find X or successor to X  ( same as find first X. )
+   *
+   *   Rationale: To help understand heaviside functions and behavor of find:
+   *    There are 7 kinds of heaviside functions.
+   *    The signus of the h must be monotonically increasing.
+   *    Given a function of the following form, A is the element
+   *    returned for direction>0, B is the element returned
+   *    for direction<0, C is the element returned for
+   *    direction==0 (see find_zero) (with a return of 0), and D is the element
+   *    returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
+   *    If any of A, B, or C are not found, then asking for the
+   *    associated direction will return DB_NOTFOUND.
+   *    See find_zero for more information.
+   *
+   *    Let the following represent the signus of the heaviside function.
+   *
+   *    -...-
+   *        A
+   *         D
+   *
+   *    +...+
+   *    B
+   *    D
+   *
+   *    0...0
+   *    C
+   *
+   *    -...-0...0
+   *        AC
+   *
+   *    0...0+...+
+   *    C    B
+   *
+   *    -...-+...+
+   *        AB
+   *         D
+   *
+   *    -...-0...0+...+
+   *        AC    B
+   */
+  template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+  int find(const omtcmp_t &extra, int direction, omtdataout_t *const value,
+           uint32_t *const idxp) const;
+
+  /**
+   * Effect: Return the size (in bytes) of the omt, as it resides in main
+   * memory.  If the data stored are pointers, don't include the size of what
+   * they all point to.
+   */
+  size_t memory_size(void);
+
+ private:
+  typedef uint32_t node_idx;
+  typedef omt_internal::subtree_templated<supports_marks> subtree;
+  typedef omt_internal::omt_node_templated<omtdata_t, supports_marks> omt_node;
+  ENSURE_POD(subtree);
+
+  struct omt_array {
+    uint32_t start_idx;
+    uint32_t num_values;
+    omtdata_t *values;
+  };
+
+  struct omt_tree {
+    subtree root;
+    uint32_t free_idx;
+    omt_node *nodes;
+  };
+
+  bool is_array;
+  uint32_t capacity;
+  union {
+    struct omt_array a;
+    struct omt_tree t;
+  } d;
+
+  __attribute__((nonnull)) void unmark(const subtree &subtree,
+                                       const uint32_t index,
+                                       GrowableArray<node_idx> *const indexes);
+
+  void create_internal_no_array(const uint32_t new_capacity);
+
+  void create_internal(const uint32_t new_capacity);
+
+  uint32_t nweight(const subtree &subtree) const;
+
+  node_idx node_malloc(void);
+
+  void node_free(const node_idx idx);
+
+  void maybe_resize_array(const uint32_t n);
+
+  __attribute__((nonnull)) void fill_array_with_subtree_values(
+      omtdata_t *const array, const subtree &subtree) const;
+
+  void convert_to_array(void);
+
+  __attribute__((nonnull)) void rebuild_from_sorted_array(
+      subtree *const subtree, const omtdata_t *const values,
+      const uint32_t numvalues);
+
+  void convert_to_tree(void);
+
+  void maybe_resize_or_convert(const uint32_t n);
+
+  bool will_need_rebalance(const subtree &subtree, const int leftmod,
+                           const int rightmod) const;
+
+  __attribute__((nonnull)) void insert_internal(
+      subtree *const subtreep, const omtdata_t &value, const uint32_t idx,
+      subtree **const rebalance_subtree);
+
+  void set_at_internal_array(const omtdata_t &value, const uint32_t idx);
+
+  void set_at_internal(const subtree &subtree, const omtdata_t &value,
+                       const uint32_t idx);
+
+  void delete_internal(subtree *const subtreep, const uint32_t idx,
+                       omt_node *const copyn,
+                       subtree **const rebalance_subtree);
+
+  template <typename iterate_extra_t,
+            int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+  int iterate_internal_array(const uint32_t left, const uint32_t right,
+                             iterate_extra_t *const iterate_extra) const;
+
+  template <typename iterate_extra_t,
+            int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+  void iterate_ptr_internal(const uint32_t left, const uint32_t right,
+                            const subtree &subtree, const uint32_t idx,
+                            iterate_extra_t *const iterate_extra);
+
+  template <typename iterate_extra_t,
+            int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+  void iterate_ptr_internal_array(const uint32_t left, const uint32_t right,
+                                  iterate_extra_t *const iterate_extra);
+
+  template <typename iterate_extra_t,
+            int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+  int iterate_internal(const uint32_t left, const uint32_t right,
+                       const subtree &subtree, const uint32_t idx,
+                       iterate_extra_t *const iterate_extra) const;
+
+  template <typename iterate_extra_t,
+            int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+  int iterate_and_mark_range_internal(const uint32_t left, const uint32_t right,
+                                      const subtree &subtree,
+                                      const uint32_t idx,
+                                      iterate_extra_t *const iterate_extra);
+
+  template <typename iterate_extra_t,
+            int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+  int iterate_over_marked_internal(const subtree &subtree, const uint32_t idx,
+                                   iterate_extra_t *const iterate_extra) const;
+
+  uint32_t verify_marks_consistent_internal(const subtree &subtree,
+                                            const bool allow_marks) const;
+
+  void fetch_internal_array(const uint32_t i, omtdataout_t *const value) const;
+
+  void fetch_internal(const subtree &subtree, const uint32_t i,
+                      omtdataout_t *const value) const;
+
+  __attribute__((nonnull)) void fill_array_with_subtree_idxs(
+      node_idx *const array, const subtree &subtree) const;
+
+  __attribute__((nonnull)) void rebuild_subtree_from_idxs(
+      subtree *const subtree, const node_idx *const idxs,
+      const uint32_t numvalues);
+
+  __attribute__((nonnull)) void rebalance(subtree *const subtree);
+
+  __attribute__((nonnull)) static void copyout(omtdata_t *const out,
+                                               const omt_node *const n);
+
+  __attribute__((nonnull)) static void copyout(omtdata_t **const out,
+                                               omt_node *const n);
+
+  __attribute__((nonnull)) static void copyout(
+      omtdata_t *const out, const omtdata_t *const stored_value_ptr);
+
+  __attribute__((nonnull)) static void copyout(
+      omtdata_t **const out, omtdata_t *const stored_value_ptr);
+
+  template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+  int find_internal_zero_array(const omtcmp_t &extra, omtdataout_t *const value,
+                               uint32_t *const idxp) const;
+
+  template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+  int find_internal_zero(const subtree &subtree, const omtcmp_t &extra,
+                         omtdataout_t *const value, uint32_t *const idxp) const;
+
+  template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+  int find_internal_plus_array(const omtcmp_t &extra, omtdataout_t *const value,
+                               uint32_t *const idxp) const;
+
+  template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+  int find_internal_plus(const subtree &subtree, const omtcmp_t &extra,
+                         omtdataout_t *const value, uint32_t *const idxp) const;
+
+  template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+  int find_internal_minus_array(const omtcmp_t &extra,
+                                omtdataout_t *const value,
+                                uint32_t *const idxp) const;
+
+  template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+  int find_internal_minus(const subtree &subtree, const omtcmp_t &extra,
+                          omtdataout_t *const value,
+                          uint32_t *const idxp) const;
+};
+
+}  // namespace toku
+
+// include the implementation here
+#include "omt_impl.h"
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h
new file mode 100644
index 000000000..e77986716
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h
@@ -0,0 +1,1295 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <string.h>
+
+#include "../db.h"
+#include "../portability/memory.h"
+
+namespace toku {
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create(void) {
+  this->create_internal(2);
+  if (supports_marks) {
+    this->convert_to_tree();
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_no_array(void) {
+  if (!supports_marks) {
+    this->create_internal_no_array(0);
+  } else {
+    this->is_array = false;
+    this->capacity = 0;
+    this->d.t.nodes = nullptr;
+    this->d.t.root.set_to_null();
+    this->d.t.free_idx = 0;
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_from_sorted_array(
+    const omtdata_t *const values, const uint32_t numvalues) {
+  this->create_internal(numvalues);
+  memcpy(this->d.a.values, values, numvalues * (sizeof values[0]));
+  this->d.a.num_values = numvalues;
+  if (supports_marks) {
+    this->convert_to_tree();
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_steal_sorted_array(
+    omtdata_t **const values, const uint32_t numvalues,
+    const uint32_t new_capacity) {
+  paranoid_invariant_notnull(values);
+  this->create_internal_no_array(new_capacity);
+  this->d.a.num_values = numvalues;
+  this->d.a.values = *values;
+  *values = nullptr;
+  if (supports_marks) {
+    this->convert_to_tree();
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::split_at(omt *const newomt,
+                                                           const uint32_t idx) {
+  barf_if_marked(*this);
+  paranoid_invariant_notnull(newomt);
+  if (idx > this->size()) {
+    return EINVAL;
+  }
+  this->convert_to_array();
+  const uint32_t newsize = this->size() - idx;
+  newomt->create_from_sorted_array(&this->d.a.values[this->d.a.start_idx + idx],
+                                   newsize);
+  this->d.a.num_values = idx;
+  this->maybe_resize_array(idx);
+  if (supports_marks) {
+    this->convert_to_tree();
+  }
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::merge(omt *const leftomt,
+                                                         omt *const rightomt) {
+  barf_if_marked(*this);
+  paranoid_invariant_notnull(leftomt);
+  paranoid_invariant_notnull(rightomt);
+  const uint32_t leftsize = leftomt->size();
+  const uint32_t rightsize = rightomt->size();
+  const uint32_t newsize = leftsize + rightsize;
+
+  if (leftomt->is_array) {
+    if (leftomt->capacity -
+            (leftomt->d.a.start_idx + leftomt->d.a.num_values) >=
+        rightsize) {
+      this->create_steal_sorted_array(
+          &leftomt->d.a.values, leftomt->d.a.num_values, leftomt->capacity);
+      this->d.a.start_idx = leftomt->d.a.start_idx;
+    } else {
+      this->create_internal(newsize);
+      memcpy(&this->d.a.values[0], &leftomt->d.a.values[leftomt->d.a.start_idx],
+             leftomt->d.a.num_values * (sizeof this->d.a.values[0]));
+    }
+  } else {
+    this->create_internal(newsize);
+    leftomt->fill_array_with_subtree_values(&this->d.a.values[0],
+                                            leftomt->d.t.root);
+  }
+  leftomt->destroy();
+  this->d.a.num_values = leftsize;
+
+  if (rightomt->is_array) {
+    memcpy(&this->d.a.values[this->d.a.start_idx + this->d.a.num_values],
+           &rightomt->d.a.values[rightomt->d.a.start_idx],
+           rightomt->d.a.num_values * (sizeof this->d.a.values[0]));
+  } else {
+    rightomt->fill_array_with_subtree_values(
+        &this->d.a.values[this->d.a.start_idx + this->d.a.num_values],
+        rightomt->d.t.root);
+  }
+  rightomt->destroy();
+  this->d.a.num_values += rightsize;
+  paranoid_invariant(this->size() == newsize);
+  if (supports_marks) {
+    this->convert_to_tree();
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::clone(const omt &src) {
+  barf_if_marked(*this);
+  this->create_internal(src.size());
+  if (src.is_array) {
+    memcpy(&this->d.a.values[0], &src.d.a.values[src.d.a.start_idx],
+           src.d.a.num_values * (sizeof this->d.a.values[0]));
+  } else {
+    src.fill_array_with_subtree_values(&this->d.a.values[0], src.d.t.root);
+  }
+  this->d.a.num_values = src.size();
+  if (supports_marks) {
+    this->convert_to_tree();
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::clear(void) {
+  if (this->is_array) {
+    this->d.a.start_idx = 0;
+    this->d.a.num_values = 0;
+  } else {
+    this->d.t.root.set_to_null();
+    this->d.t.free_idx = 0;
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::destroy(void) {
+  this->clear();
+  this->capacity = 0;
+  if (this->is_array) {
+    if (this->d.a.values != nullptr) {
+      toku_free(this->d.a.values);
+    }
+    this->d.a.values = nullptr;
+  } else {
+    if (this->d.t.nodes != nullptr) {
+      toku_free(this->d.t.nodes);
+    }
+    this->d.t.nodes = nullptr;
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::size(void) const {
+  if (this->is_array) {
+    return this->d.a.num_values;
+  } else {
+    return this->nweight(this->d.t.root);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::insert(const omtdata_t &value,
+                                                         const omtcmp_t &v,
+                                                         uint32_t *const idx) {
+  int r;
+  uint32_t insert_idx;
+
+  r = this->find_zero<omtcmp_t, h>(v, nullptr, &insert_idx);
+  if (r == 0) {
+    if (idx) *idx = insert_idx;
+    return DB_KEYEXIST;
+  }
+  if (r != DB_NOTFOUND) return r;
+
+  if ((r = this->insert_at(value, insert_idx))) return r;
+  if (idx) *idx = insert_idx;
+
+  return 0;
+}
+
+// The following 3 functions implement a static if for us.
+template <typename omtdata_t, typename omtdataout_t>
+static void barf_if_marked(const omt<omtdata_t, omtdataout_t, false> &UU(omt)) {
+}
+
+template <typename omtdata_t, typename omtdataout_t>
+static void barf_if_marked(const omt<omtdata_t, omtdataout_t, true> &omt) {
+  invariant(!omt.has_marks());
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+bool omt<omtdata_t, omtdataout_t, supports_marks>::has_marks(void) const {
+  static_assert(supports_marks, "Does not support marks");
+  if (this->d.t.root.is_null()) {
+    return false;
+  }
+  const omt_node &node = this->d.t.nodes[this->d.t.root.get_index()];
+  return node.get_marks_below() || node.get_marked();
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::insert_at(
+    const omtdata_t &value, const uint32_t idx) {
+  barf_if_marked(*this);
+  if (idx > this->size()) {
+    return EINVAL;
+  }
+
+  this->maybe_resize_or_convert(this->size() + 1);
+  if (this->is_array && idx != this->d.a.num_values &&
+      (idx != 0 || this->d.a.start_idx == 0)) {
+    this->convert_to_tree();
+  }
+  if (this->is_array) {
+    if (idx == this->d.a.num_values) {
+      this->d.a.values[this->d.a.start_idx + this->d.a.num_values] = value;
+    } else {
+      this->d.a.values[--this->d.a.start_idx] = value;
+    }
+    this->d.a.num_values++;
+  } else {
+    subtree *rebalance_subtree = nullptr;
+    this->insert_internal(&this->d.t.root, value, idx, &rebalance_subtree);
+    if (rebalance_subtree != nullptr) {
+      this->rebalance(rebalance_subtree);
+    }
+  }
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::set_at(const omtdata_t &value,
+                                                         const uint32_t idx) {
+  barf_if_marked(*this);
+  if (idx >= this->size()) {
+    return EINVAL;
+  }
+
+  if (this->is_array) {
+    this->set_at_internal_array(value, idx);
+  } else {
+    this->set_at_internal(this->d.t.root, value, idx);
+  }
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::delete_at(
+    const uint32_t idx) {
+  barf_if_marked(*this);
+  if (idx >= this->size()) {
+    return EINVAL;
+  }
+
+  this->maybe_resize_or_convert(this->size() - 1);
+  if (this->is_array && idx != 0 && idx != this->d.a.num_values - 1) {
+    this->convert_to_tree();
+  }
+  if (this->is_array) {
+    // Testing for 0 does not rule out it being the last entry.
+    // Test explicitly for num_values-1
+    if (idx != this->d.a.num_values - 1) {
+      this->d.a.start_idx++;
+    }
+    this->d.a.num_values--;
+  } else {
+    subtree *rebalance_subtree = nullptr;
+    this->delete_internal(&this->d.t.root, idx, nullptr, &rebalance_subtree);
+    if (rebalance_subtree != nullptr) {
+      this->rebalance(rebalance_subtree);
+    }
+  }
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate(
+    iterate_extra_t *const iterate_extra) const {
+  return this->iterate_on_range<iterate_extra_t, f>(0, this->size(),
+                                                    iterate_extra);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_on_range(
+    const uint32_t left, const uint32_t right,
+    iterate_extra_t *const iterate_extra) const {
+  if (right > this->size()) {
+    return EINVAL;
+  }
+  if (left == right) {
+    return 0;
+  }
+  if (this->is_array) {
+    return this->iterate_internal_array<iterate_extra_t, f>(left, right,
+                                                            iterate_extra);
+  }
+  return this->iterate_internal<iterate_extra_t, f>(left, right, this->d.t.root,
+                                                    0, iterate_extra);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_and_mark_range(
+    const uint32_t left, const uint32_t right,
+    iterate_extra_t *const iterate_extra) {
+  static_assert(supports_marks, "does not support marks");
+  if (right > this->size()) {
+    return EINVAL;
+  }
+  if (left == right) {
+    return 0;
+  }
+  paranoid_invariant(!this->is_array);
+  return this->iterate_and_mark_range_internal<iterate_extra_t, f>(
+      left, right, this->d.t.root, 0, iterate_extra);
+}
+
+// TODO: We can optimize this if we steal 3 bits.  1 bit: this node is
+// marked.  1 bit: left subtree has marks. 1 bit: right subtree has marks.
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_over_marked(
+    iterate_extra_t *const iterate_extra) const {
+  static_assert(supports_marks, "does not support marks");
+  paranoid_invariant(!this->is_array);
+  return this->iterate_over_marked_internal<iterate_extra_t, f>(
+      this->d.t.root, 0, iterate_extra);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::unmark(
+    const subtree &st, const uint32_t index,
+    GrowableArray<node_idx> *const indexes) {
+  if (st.is_null()) {
+    return;
+  }
+  omt_node &n = this->d.t.nodes[st.get_index()];
+  const uint32_t index_root = index + this->nweight(n.left);
+
+  const bool below = n.get_marks_below();
+  if (below) {
+    this->unmark(n.left, index, indexes);
+  }
+  if (n.get_marked()) {
+    indexes->push(index_root);
+  }
+  n.clear_stolen_bits();
+  if (below) {
+    this->unmark(n.right, index_root + 1, indexes);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::delete_all_marked(void) {
+  static_assert(supports_marks, "does not support marks");
+  if (!this->has_marks()) {
+    return;
+  }
+  paranoid_invariant(!this->is_array);
+  GrowableArray<node_idx> marked_indexes;
+  marked_indexes.init();
+
+  // Remove all marks.
+  // We need to delete all the stolen bits before calling delete_at to
+  // prevent barfing.
+  this->unmark(this->d.t.root, 0, &marked_indexes);
+
+  for (uint32_t i = 0; i < marked_indexes.get_size(); i++) {
+    // Delete from left to right, shift by number already deleted.
+    // Alternative is delete from right to left.
+    int r = this->delete_at(marked_indexes.fetch_unchecked(i) - i);
+    lazy_assert_zero(r);
+  }
+  marked_indexes.deinit();
+  barf_if_marked(*this);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+uint32_t
+omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent_internal(
+    const subtree &st, const bool UU(allow_marks)) const {
+  if (st.is_null()) {
+    return 0;
+  }
+  const omt_node &node = this->d.t.nodes[st.get_index()];
+  uint32_t num_marks =
+      verify_marks_consistent_internal(node.left, node.get_marks_below());
+  num_marks +=
+      verify_marks_consistent_internal(node.right, node.get_marks_below());
+  if (node.get_marks_below()) {
+    paranoid_invariant(allow_marks);
+    paranoid_invariant(num_marks > 0);
+  } else {
+    // redundant with invariant below, but nice to have explicitly
+    paranoid_invariant(num_marks == 0);
+  }
+  if (node.get_marked()) {
+    paranoid_invariant(allow_marks);
+    ++num_marks;
+  }
+  return num_marks;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::verify_marks_consistent(
+    void) const {
+  static_assert(supports_marks, "does not support marks");
+  paranoid_invariant(!this->is_array);
+  this->verify_marks_consistent_internal(this->d.t.root, true);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr(
+    iterate_extra_t *const iterate_extra) {
+  if (this->is_array) {
+    this->iterate_ptr_internal_array<iterate_extra_t, f>(0, this->size(),
+                                                         iterate_extra);
+  } else {
+    this->iterate_ptr_internal<iterate_extra_t, f>(
+        0, this->size(), this->d.t.root, 0, iterate_extra);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+int omt<omtdata_t, omtdataout_t, supports_marks>::fetch(
+    const uint32_t idx, omtdataout_t *const value) const {
+  if (idx >= this->size()) {
+    return EINVAL;
+  }
+  if (this->is_array) {
+    this->fetch_internal_array(idx, value);
+  } else {
+    this->fetch_internal(this->d.t.root, idx, value);
+  }
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_zero(
+    const omtcmp_t &extra, omtdataout_t *const value,
+    uint32_t *const idxp) const {
+  uint32_t tmp_index;
+  uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index;
+  int r;
+  if (this->is_array) {
+    r = this->find_internal_zero_array<omtcmp_t, h>(extra, value, child_idxp);
+  } else {
+    r = this->find_internal_zero<omtcmp_t, h>(this->d.t.root, extra, value,
+                                              child_idxp);
+  }
+  return r;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find(
+    const omtcmp_t &extra, int direction, omtdataout_t *const value,
+    uint32_t *const idxp) const {
+  uint32_t tmp_index;
+  uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index;
+  paranoid_invariant(direction != 0);
+  if (direction < 0) {
+    if (this->is_array) {
+      return this->find_internal_minus_array<omtcmp_t, h>(extra, value,
+                                                          child_idxp);
+    } else {
+      return this->find_internal_minus<omtcmp_t, h>(this->d.t.root, extra,
+                                                    value, child_idxp);
+    }
+  } else {
+    if (this->is_array) {
+      return this->find_internal_plus_array<omtcmp_t, h>(extra, value,
+                                                         child_idxp);
+    } else {
+      return this->find_internal_plus<omtcmp_t, h>(this->d.t.root, extra, value,
+                                                   child_idxp);
+    }
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+size_t omt<omtdata_t, omtdataout_t, supports_marks>::memory_size(void) {
+  if (this->is_array) {
+    return (sizeof *this) + this->capacity * (sizeof this->d.a.values[0]);
+  }
+  return (sizeof *this) + this->capacity * (sizeof this->d.t.nodes[0]);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_internal_no_array(
+    const uint32_t new_capacity) {
+  this->is_array = true;
+  this->d.a.start_idx = 0;
+  this->d.a.num_values = 0;
+  this->d.a.values = nullptr;
+  this->capacity = new_capacity;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::create_internal(
+    const uint32_t new_capacity) {
+  this->create_internal_no_array(new_capacity);
+  XMALLOC_N(this->capacity, this->d.a.values);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+uint32_t omt<omtdata_t, omtdataout_t, supports_marks>::nweight(
+    const subtree &st) const {
+  if (st.is_null()) {
+    return 0;
+  } else {
+    return this->d.t.nodes[st.get_index()].weight;
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+typename omt<omtdata_t, omtdataout_t, supports_marks>::node_idx
+omt<omtdata_t, omtdataout_t, supports_marks>::node_malloc(void) {
+  paranoid_invariant(this->d.t.free_idx < this->capacity);
+  omt_node &n = this->d.t.nodes[this->d.t.free_idx];
+  n.clear_stolen_bits();
+  return this->d.t.free_idx++;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::node_free(
+    const node_idx UU(idx)) {
+  paranoid_invariant(idx < this->capacity);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::maybe_resize_array(
+    const uint32_t n) {
+  const uint32_t new_size = n <= 2 ? 4 : 2 * n;
+  const uint32_t room = this->capacity - this->d.a.start_idx;
+
+  if (room < n || this->capacity / 2 >= new_size) {
+    omtdata_t *XMALLOC_N(new_size, tmp_values);
+    if (this->d.a.num_values) {
+      memcpy(tmp_values, &this->d.a.values[this->d.a.start_idx],
+             this->d.a.num_values * (sizeof tmp_values[0]));
+    }
+    this->d.a.start_idx = 0;
+    this->capacity = new_size;
+    toku_free(this->d.a.values);
+    this->d.a.values = tmp_values;
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t,
+         supports_marks>::fill_array_with_subtree_values(omtdata_t *const array,
+                                                         const subtree &st)
+    const {
+  if (st.is_null()) return;
+  const omt_node &tree = this->d.t.nodes[st.get_index()];
+  this->fill_array_with_subtree_values(&array[0], tree.left);
+  array[this->nweight(tree.left)] = tree.value;
+  this->fill_array_with_subtree_values(&array[this->nweight(tree.left) + 1],
+                                       tree.right);
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::convert_to_array(void) {
+  if (!this->is_array) {
+    const uint32_t num_values = this->size();
+    uint32_t new_size = 2 * num_values;
+    new_size = new_size < 4 ? 4 : new_size;
+
+    omtdata_t *XMALLOC_N(new_size, tmp_values);
+    this->fill_array_with_subtree_values(tmp_values, this->d.t.root);
+    toku_free(this->d.t.nodes);
+    this->is_array = true;
+    this->capacity = new_size;
+    this->d.a.num_values = num_values;
+    this->d.a.values = tmp_values;
+    this->d.a.start_idx = 0;
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::rebuild_from_sorted_array(
+    subtree *const st, const omtdata_t *const values,
+    const uint32_t numvalues) {
+  if (numvalues == 0) {
+    st->set_to_null();
+  } else {
+    const uint32_t halfway = numvalues / 2;
+    const node_idx newidx = this->node_malloc();
+    omt_node *const newnode = &this->d.t.nodes[newidx];
+    newnode->weight = numvalues;
+    newnode->value = values[halfway];
+    st->set_index(newidx);
+    // update everything before the recursive calls so the second call
+    // can be a tail call.
+    this->rebuild_from_sorted_array(&newnode->left, &values[0], halfway);
+    this->rebuild_from_sorted_array(&newnode->right, &values[halfway + 1],
+                                    numvalues - (halfway + 1));
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::convert_to_tree(void) {
+  if (this->is_array) {
+    const uint32_t num_nodes = this->size();
+    uint32_t new_size = num_nodes * 2;
+    new_size = new_size < 4 ? 4 : new_size;
+
+    omt_node *XMALLOC_N(new_size, new_nodes);
+    omtdata_t *const values = this->d.a.values;
+    omtdata_t *const tmp_values = &values[this->d.a.start_idx];
+    this->is_array = false;
+    this->d.t.nodes = new_nodes;
+    this->capacity = new_size;
+    this->d.t.free_idx = 0;
+    this->d.t.root.set_to_null();
+    this->rebuild_from_sorted_array(&this->d.t.root, tmp_values, num_nodes);
+    toku_free(values);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::maybe_resize_or_convert(
+    const uint32_t n) {
+  if (this->is_array) {
+    this->maybe_resize_array(n);
+  } else {
+    const uint32_t new_size = n <= 2 ? 4 : 2 * n;
+    const uint32_t num_nodes = this->nweight(this->d.t.root);
+    if ((this->capacity / 2 >= new_size) ||
+        (this->d.t.free_idx >= this->capacity && num_nodes < n) ||
+        (this->capacity < n)) {
+      this->convert_to_array();
+      // if we had a free list, the "supports_marks" version could
+      // just resize, as it is now, we have to convert to and back
+      // from an array.
+      if (supports_marks) {
+        this->convert_to_tree();
+      }
+    }
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+bool omt<omtdata_t, omtdataout_t, supports_marks>::will_need_rebalance(
+    const subtree &st, const int leftmod, const int rightmod) const {
+  if (st.is_null()) {
+    return false;
+  }
+  const omt_node &n = this->d.t.nodes[st.get_index()];
+  // one of the 1's is for the root.
+  // the other is to take ceil(n/2)
+  const uint32_t weight_left = this->nweight(n.left) + leftmod;
+  const uint32_t weight_right = this->nweight(n.right) + rightmod;
+  return ((1 + weight_left < (1 + 1 + weight_right) / 2) ||
+          (1 + weight_right < (1 + 1 + weight_left) / 2));
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::insert_internal(
+    subtree *const subtreep, const omtdata_t &value, const uint32_t idx,
+    subtree **const rebalance_subtree) {
+  if (subtreep->is_null()) {
+    paranoid_invariant_zero(idx);
+    const node_idx newidx = this->node_malloc();
+    omt_node *const newnode = &this->d.t.nodes[newidx];
+    newnode->weight = 1;
+    newnode->left.set_to_null();
+    newnode->right.set_to_null();
+    newnode->value = value;
+    subtreep->set_index(newidx);
+  } else {
+    omt_node &n = this->d.t.nodes[subtreep->get_index()];
+    n.weight++;
+    if (idx <= this->nweight(n.left)) {
+      if (*rebalance_subtree == nullptr &&
+          this->will_need_rebalance(*subtreep, 1, 0)) {
+        *rebalance_subtree = subtreep;
+      }
+      this->insert_internal(&n.left, value, idx, rebalance_subtree);
+    } else {
+      if (*rebalance_subtree == nullptr &&
+          this->will_need_rebalance(*subtreep, 0, 1)) {
+        *rebalance_subtree = subtreep;
+      }
+      const uint32_t sub_index = idx - this->nweight(n.left) - 1;
+      this->insert_internal(&n.right, value, sub_index, rebalance_subtree);
+    }
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal_array(
+    const omtdata_t &value, const uint32_t idx) {
+  this->d.a.values[this->d.a.start_idx + idx] = value;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::set_at_internal(
+    const subtree &st, const omtdata_t &value, const uint32_t idx) {
+  paranoid_invariant(!st.is_null());
+  omt_node &n = this->d.t.nodes[st.get_index()];
+  const uint32_t leftweight = this->nweight(n.left);
+  if (idx < leftweight) {
+    this->set_at_internal(n.left, value, idx);
+  } else if (idx == leftweight) {
+    n.value = value;
+  } else {
+    this->set_at_internal(n.right, value, idx - leftweight - 1);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::delete_internal(
+    subtree *const subtreep, const uint32_t idx, omt_node *const copyn,
+    subtree **const rebalance_subtree) {
+  paranoid_invariant_notnull(subtreep);
+  paranoid_invariant_notnull(rebalance_subtree);
+  paranoid_invariant(!subtreep->is_null());
+  omt_node &n = this->d.t.nodes[subtreep->get_index()];
+  const uint32_t leftweight = this->nweight(n.left);
+  if (idx < leftweight) {
+    n.weight--;
+    if (*rebalance_subtree == nullptr &&
+        this->will_need_rebalance(*subtreep, -1, 0)) {
+      *rebalance_subtree = subtreep;
+    }
+    this->delete_internal(&n.left, idx, copyn, rebalance_subtree);
+  } else if (idx == leftweight) {
+    if (n.left.is_null()) {
+      const uint32_t oldidx = subtreep->get_index();
+      *subtreep = n.right;
+      if (copyn != nullptr) {
+        copyn->value = n.value;
+      }
+      this->node_free(oldidx);
+    } else if (n.right.is_null()) {
+      const uint32_t oldidx = subtreep->get_index();
+      *subtreep = n.left;
+      if (copyn != nullptr) {
+        copyn->value = n.value;
+      }
+      this->node_free(oldidx);
+    } else {
+      if (*rebalance_subtree == nullptr &&
+          this->will_need_rebalance(*subtreep, 0, -1)) {
+        *rebalance_subtree = subtreep;
+      }
+      // don't need to copy up value, it's only used by this
+      // next call, and when that gets to the bottom there
+      // won't be any more recursion
+      n.weight--;
+      this->delete_internal(&n.right, 0, &n, rebalance_subtree);
+    }
+  } else {
+    n.weight--;
+    if (*rebalance_subtree == nullptr &&
+        this->will_need_rebalance(*subtreep, 0, -1)) {
+      *rebalance_subtree = subtreep;
+    }
+    this->delete_internal(&n.right, idx - leftweight - 1, copyn,
+                          rebalance_subtree);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_internal_array(
+    const uint32_t left, const uint32_t right,
+    iterate_extra_t *const iterate_extra) const {
+  int r;
+  for (uint32_t i = left; i < right; ++i) {
+    r = f(this->d.a.values[this->d.a.start_idx + i], i, iterate_extra);
+    if (r != 0) {
+      return r;
+    }
+  }
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr_internal(
+    const uint32_t left, const uint32_t right, const subtree &st,
+    const uint32_t idx, iterate_extra_t *const iterate_extra) {
+  if (!st.is_null()) {
+    omt_node &n = this->d.t.nodes[st.get_index()];
+    const uint32_t idx_root = idx + this->nweight(n.left);
+    if (left < idx_root) {
+      this->iterate_ptr_internal<iterate_extra_t, f>(left, right, n.left, idx,
+                                                     iterate_extra);
+    }
+    if (left <= idx_root && idx_root < right) {
+      int r = f(&n.value, idx_root, iterate_extra);
+      lazy_assert_zero(r);
+    }
+    if (idx_root + 1 < right) {
+      this->iterate_ptr_internal<iterate_extra_t, f>(
+          left, right, n.right, idx_root + 1, iterate_extra);
+    }
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(omtdata_t *, const uint32_t, iterate_extra_t *const)>
+void omt<omtdata_t, omtdataout_t, supports_marks>::iterate_ptr_internal_array(
+    const uint32_t left, const uint32_t right,
+    iterate_extra_t *const iterate_extra) {
+  for (uint32_t i = left; i < right; ++i) {
+    int r = f(&this->d.a.values[this->d.a.start_idx + i], i, iterate_extra);
+    lazy_assert_zero(r);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_internal(
+    const uint32_t left, const uint32_t right, const subtree &st,
+    const uint32_t idx, iterate_extra_t *const iterate_extra) const {
+  if (st.is_null()) {
+    return 0;
+  }
+  int r;
+  const omt_node &n = this->d.t.nodes[st.get_index()];
+  const uint32_t idx_root = idx + this->nweight(n.left);
+  if (left < idx_root) {
+    r = this->iterate_internal<iterate_extra_t, f>(left, right, n.left, idx,
+                                                   iterate_extra);
+    if (r != 0) {
+      return r;
+    }
+  }
+  if (left <= idx_root && idx_root < right) {
+    r = f(n.value, idx_root, iterate_extra);
+    if (r != 0) {
+      return r;
+    }
+  }
+  if (idx_root + 1 < right) {
+    return this->iterate_internal<iterate_extra_t, f>(
+        left, right, n.right, idx_root + 1, iterate_extra);
+  }
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::
+    iterate_and_mark_range_internal(const uint32_t left, const uint32_t right,
+                                    const subtree &st, const uint32_t idx,
+                                    iterate_extra_t *const iterate_extra) {
+  paranoid_invariant(!st.is_null());
+  int r;
+  omt_node &n = this->d.t.nodes[st.get_index()];
+  const uint32_t idx_root = idx + this->nweight(n.left);
+  if (left < idx_root && !n.left.is_null()) {
+    n.set_marks_below_bit();
+    r = this->iterate_and_mark_range_internal<iterate_extra_t, f>(
+        left, right, n.left, idx, iterate_extra);
+    if (r != 0) {
+      return r;
+    }
+  }
+  if (left <= idx_root && idx_root < right) {
+    n.set_marked_bit();
+    r = f(n.value, idx_root, iterate_extra);
+    if (r != 0) {
+      return r;
+    }
+  }
+  if (idx_root + 1 < right && !n.right.is_null()) {
+    n.set_marks_below_bit();
+    return this->iterate_and_mark_range_internal<iterate_extra_t, f>(
+        left, right, n.right, idx_root + 1, iterate_extra);
+  }
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename iterate_extra_t,
+          int (*f)(const omtdata_t &, const uint32_t, iterate_extra_t *const)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::iterate_over_marked_internal(
+    const subtree &st, const uint32_t idx,
+    iterate_extra_t *const iterate_extra) const {
+  if (st.is_null()) {
+    return 0;
+  }
+  int r;
+  const omt_node &n = this->d.t.nodes[st.get_index()];
+  const uint32_t idx_root = idx + this->nweight(n.left);
+  if (n.get_marks_below()) {
+    r = this->iterate_over_marked_internal<iterate_extra_t, f>(n.left, idx,
+                                                               iterate_extra);
+    if (r != 0) {
+      return r;
+    }
+  }
+  if (n.get_marked()) {
+    r = f(n.value, idx_root, iterate_extra);
+    if (r != 0) {
+      return r;
+    }
+  }
+  if (n.get_marks_below()) {
+    return this->iterate_over_marked_internal<iterate_extra_t, f>(
+        n.right, idx_root + 1, iterate_extra);
+  }
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::fetch_internal_array(
+    const uint32_t i, omtdataout_t *const value) const {
+  if (value != nullptr) {
+    copyout(value, &this->d.a.values[this->d.a.start_idx + i]);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::fetch_internal(
+    const subtree &st, const uint32_t i, omtdataout_t *const value) const {
+  omt_node &n = this->d.t.nodes[st.get_index()];
+  const uint32_t leftweight = this->nweight(n.left);
+  if (i < leftweight) {
+    this->fetch_internal(n.left, i, value);
+  } else if (i == leftweight) {
+    if (value != nullptr) {
+      copyout(value, &n);
+    }
+  } else {
+    this->fetch_internal(n.right, i - leftweight - 1, value);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::fill_array_with_subtree_idxs(
+    node_idx *const array, const subtree &st) const {
+  if (!st.is_null()) {
+    const omt_node &tree = this->d.t.nodes[st.get_index()];
+    this->fill_array_with_subtree_idxs(&array[0], tree.left);
+    array[this->nweight(tree.left)] = st.get_index();
+    this->fill_array_with_subtree_idxs(&array[this->nweight(tree.left) + 1],
+                                       tree.right);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::rebuild_subtree_from_idxs(
+    subtree *const st, const node_idx *const idxs, const uint32_t numvalues) {
+  if (numvalues == 0) {
+    st->set_to_null();
+  } else {
+    uint32_t halfway = numvalues / 2;
+    st->set_index(idxs[halfway]);
+    // node_idx newidx = idxs[halfway];
+    omt_node &newnode = this->d.t.nodes[st->get_index()];
+    newnode.weight = numvalues;
+    // value is already in there.
+    this->rebuild_subtree_from_idxs(&newnode.left, &idxs[0], halfway);
+    this->rebuild_subtree_from_idxs(&newnode.right, &idxs[halfway + 1],
+                                    numvalues - (halfway + 1));
+    // n_idx = newidx;
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::rebalance(
+    subtree *const st) {
+  node_idx idx = st->get_index();
+  if (idx == this->d.t.root.get_index()) {
+    // Try to convert to an array.
+    // If this fails, (malloc) nothing will have changed.
+    // In the failure case we continue on to the standard rebalance
+    // algorithm.
+    this->convert_to_array();
+    if (supports_marks) {
+      this->convert_to_tree();
+    }
+  } else {
+    const omt_node &n = this->d.t.nodes[idx];
+    node_idx *tmp_array;
+    size_t mem_needed = n.weight * (sizeof tmp_array[0]);
+    size_t mem_free =
+        (this->capacity - this->d.t.free_idx) * (sizeof this->d.t.nodes[0]);
+    bool malloced;
+    if (mem_needed <= mem_free) {
+      // There is sufficient free space at the end of the nodes array
+      // to hold enough node indexes to rebalance.
+      malloced = false;
+      tmp_array =
+          reinterpret_cast<node_idx *>(&this->d.t.nodes[this->d.t.free_idx]);
+    } else {
+      malloced = true;
+      XMALLOC_N(n.weight, tmp_array);
+    }
+    this->fill_array_with_subtree_idxs(tmp_array, *st);
+    this->rebuild_subtree_from_idxs(st, tmp_array, n.weight);
+    if (malloced) toku_free(tmp_array);
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+    omtdata_t *const out, const omt_node *const n) {
+  *out = n->value;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+    omtdata_t **const out, omt_node *const n) {
+  *out = &n->value;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+    omtdata_t *const out, const omtdata_t *const stored_value_ptr) {
+  *out = *stored_value_ptr;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+void omt<omtdata_t, omtdataout_t, supports_marks>::copyout(
+    omtdata_t **const out, omtdata_t *const stored_value_ptr) {
+  *out = stored_value_ptr;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero_array(
+    const omtcmp_t &extra, omtdataout_t *const value,
+    uint32_t *const idxp) const {
+  paranoid_invariant_notnull(idxp);
+  uint32_t min = this->d.a.start_idx;
+  uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
+  uint32_t best_pos = subtree::NODE_NULL;
+  uint32_t best_zero = subtree::NODE_NULL;
+
+  while (min != limit) {
+    uint32_t mid = (min + limit) / 2;
+    int hv = h(this->d.a.values[mid], extra);
+    if (hv < 0) {
+      min = mid + 1;
+    } else if (hv > 0) {
+      best_pos = mid;
+      limit = mid;
+    } else {
+      best_zero = mid;
+      limit = mid;
+    }
+  }
+  if (best_zero != subtree::NODE_NULL) {
+    // Found a zero
+    if (value != nullptr) {
+      copyout(value, &this->d.a.values[best_zero]);
+    }
+    *idxp = best_zero - this->d.a.start_idx;
+    return 0;
+  }
+  if (best_pos != subtree::NODE_NULL)
+    *idxp = best_pos - this->d.a.start_idx;
+  else
+    *idxp = this->d.a.num_values;
+  return DB_NOTFOUND;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_zero(
+    const subtree &st, const omtcmp_t &extra, omtdataout_t *const value,
+    uint32_t *const idxp) const {
+  paranoid_invariant_notnull(idxp);
+  if (st.is_null()) {
+    *idxp = 0;
+    return DB_NOTFOUND;
+  }
+  omt_node &n = this->d.t.nodes[st.get_index()];
+  int hv = h(n.value, extra);
+  if (hv < 0) {
+    int r = this->find_internal_zero<omtcmp_t, h>(n.right, extra, value, idxp);
+    *idxp += this->nweight(n.left) + 1;
+    return r;
+  } else if (hv > 0) {
+    return this->find_internal_zero<omtcmp_t, h>(n.left, extra, value, idxp);
+  } else {
+    int r = this->find_internal_zero<omtcmp_t, h>(n.left, extra, value, idxp);
+    if (r == DB_NOTFOUND) {
+      *idxp = this->nweight(n.left);
+      if (value != nullptr) {
+        copyout(value, &n);
+      }
+      r = 0;
+    }
+    return r;
+  }
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus_array(
+    const omtcmp_t &extra, omtdataout_t *const value,
+    uint32_t *const idxp) const {
+  paranoid_invariant_notnull(idxp);
+  uint32_t min = this->d.a.start_idx;
+  uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
+  uint32_t best = subtree::NODE_NULL;
+
+  while (min != limit) {
+    const uint32_t mid = (min + limit) / 2;
+    const int hv = h(this->d.a.values[mid], extra);
+    if (hv > 0) {
+      best = mid;
+      limit = mid;
+    } else {
+      min = mid + 1;
+    }
+  }
+  if (best == subtree::NODE_NULL) {
+    return DB_NOTFOUND;
+  }
+  if (value != nullptr) {
+    copyout(value, &this->d.a.values[best]);
+  }
+  *idxp = best - this->d.a.start_idx;
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_plus(
+    const subtree &st, const omtcmp_t &extra, omtdataout_t *const value,
+    uint32_t *const idxp) const {
+  paranoid_invariant_notnull(idxp);
+  if (st.is_null()) {
+    return DB_NOTFOUND;
+  }
+  omt_node *const n = &this->d.t.nodes[st.get_index()];
+  int hv = h(n->value, extra);
+  int r;
+  if (hv > 0) {
+    r = this->find_internal_plus<omtcmp_t, h>(n->left, extra, value, idxp);
+    if (r == DB_NOTFOUND) {
+      *idxp = this->nweight(n->left);
+      if (value != nullptr) {
+        copyout(value, n);
+      }
+      r = 0;
+    }
+  } else {
+    r = this->find_internal_plus<omtcmp_t, h>(n->right, extra, value, idxp);
+    if (r == 0) {
+      *idxp += this->nweight(n->left) + 1;
+    }
+  }
+  return r;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus_array(
+    const omtcmp_t &extra, omtdataout_t *const value,
+    uint32_t *const idxp) const {
+  paranoid_invariant_notnull(idxp);
+  uint32_t min = this->d.a.start_idx;
+  uint32_t limit = this->d.a.start_idx + this->d.a.num_values;
+  uint32_t best = subtree::NODE_NULL;
+
+  while (min != limit) {
+    const uint32_t mid = (min + limit) / 2;
+    const int hv = h(this->d.a.values[mid], extra);
+    if (hv < 0) {
+      best = mid;
+      min = mid + 1;
+    } else {
+      limit = mid;
+    }
+  }
+  if (best == subtree::NODE_NULL) {
+    return DB_NOTFOUND;
+  }
+  if (value != nullptr) {
+    copyout(value, &this->d.a.values[best]);
+  }
+  *idxp = best - this->d.a.start_idx;
+  return 0;
+}
+
+template <typename omtdata_t, typename omtdataout_t, bool supports_marks>
+template <typename omtcmp_t, int (*h)(const omtdata_t &, const omtcmp_t &)>
+int omt<omtdata_t, omtdataout_t, supports_marks>::find_internal_minus(
+    const subtree &st, const omtcmp_t &extra, omtdataout_t *const value,
+    uint32_t *const idxp) const {
+  paranoid_invariant_notnull(idxp);
+  if (st.is_null()) {
+    return DB_NOTFOUND;
+  }
+  omt_node *const n = &this->d.t.nodes[st.get_index()];
+  int hv = h(n->value, extra);
+  if (hv < 0) {
+    int r =
+        this->find_internal_minus<omtcmp_t, h>(n->right, extra, value, idxp);
+    if (r == 0) {
+      *idxp += this->nweight(n->left) + 1;
+    } else if (r == DB_NOTFOUND) {
+      *idxp = this->nweight(n->left);
+      if (value != nullptr) {
+        copyout(value, n);
+      }
+      r = 0;
+    }
+    return r;
+  } else {
+    return this->find_internal_minus<omtcmp_t, h>(n->left, extra, value, idxp);
+  }
+}
+}  // namespace toku
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h
new file mode 100644
index 000000000..f20eeedf2
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h
@@ -0,0 +1,165 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+// Overview: A partitioned_counter provides a counter that can be incremented
+// and the running sum can be read at any time.
+//  We assume that increments are frequent, whereas reading is infrequent.
+// Implementation hint: Use thread-local storage so each thread increments its
+// own data.  The increment does not require a lock or atomic operation.
+//  Reading the data can be performed by iterating over the thread-local
+//  versions, summing them up. The data structure also includes a sum for all
+//  the threads that have died. Use a pthread_key to create the thread-local
+//  versions.  When a thread finishes, the system calls pthread_key destructor
+//  which can add that thread's copy into the sum_of_dead counter.
+// Rationale: For statistics such as are found in engine status, we need a
+// counter that requires no cache misses to increment.  We've seen significant
+//  performance speedups by removing certain counters.  Rather than removing
+//  those statistics, we would like to just make the counter fast. We generally
+//  increment the counters frequently, and want to fetch the values
+//  infrequently. The counters are monotonic. The counters can be split into
+//  many counters, which can be summed up at the end. We don't care if we get
+//  slightly out-of-date counter sums when we read the counter.  We don't care
+//  if there is a race on reading the a counter
+//   variable and incrementing.
+//  See tests/test_partitioned_counter.c for some performance measurements.
+// Operations:
+//   create_partitioned_counter    Create a counter initialized to zero.
+//   destroy_partitioned_counter   Destroy it.
+//   increment_partitioned_counter Increment it.  This is the frequent
+//   operation. read_partitioned_counter      Get the current value.  This is
+//   infrequent.
+// See partitioned_counter.cc for the abstraction function and representation
+// invariant.
+//
+// The google style guide says to avoid using constructors, and it appears that
+// constructors may have broken all the tests, because they called
+// pthread_key_create before the key was actually created.  So the google style
+// guide may have some wisdom there...
+//
+// This version does not use constructors, essentially reverrting to the google
+// C++ style guide.
+//
+
+// The old C interface.  This required a bunch of explicit
+// ___attribute__((__destructor__)) functions to remember to destroy counters at
+// the end.
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef struct partitioned_counter *PARTITIONED_COUNTER;
+PARTITIONED_COUNTER create_partitioned_counter(void);
+// Effect: Create a counter, initialized to zero.
+
+void destroy_partitioned_counter(PARTITIONED_COUNTER);
+// Effect: Destroy the counter.  No operations on that counter are permitted
+// after this.
+
+void increment_partitioned_counter(PARTITIONED_COUNTER, uint64_t amount);
+// Effect: Increment the counter by amount.
+// Requires: No overflows.  This is a 64-bit unsigned counter.
+
+uint64_t read_partitioned_counter(PARTITIONED_COUNTER)
+    __attribute__((__visibility__("default")));
+// Effect: Return the current value of the counter.
+
+void partitioned_counters_init(void);
+// Effect: Initialize any partitioned counters data structures that must be set
+// up before any partitioned counters run.
+
+void partitioned_counters_destroy(void);
+// Effect: Destroy any partitioned counters data structures.
+
+#if defined(__cplusplus)
+};
+#endif
+
+#if 0
+#include <pthread.h>
+
+#include "fttypes.h"
+
+// Used inside the PARTITIONED_COUNTER.
+struct linked_list_head {
+    struct linked_list_element *first;
+};
+
+
+class PARTITIONED_COUNTER {
+public:
+    PARTITIONED_COUNTER(void);
+    // Effect: Construct a counter, initialized to zero.
+
+    ~PARTITIONED_COUNTER(void);
+    // Effect: Destruct the counter.
+
+    void increment(uint64_t amount);
+    // Effect: Increment the counter by amount.  This is a 64-bit unsigned counter, and if you overflow it, you will get overflowed results (that is mod 2^64).
+    // Requires: Don't use this from a static constructor or destructor.
+
+    uint64_t read(void);
+    // Effect: Read the sum.
+    // Requires: Don't use this from a static constructor or destructor.
+
+private:
+    uint64_t       _sum_of_dead;             // The sum of all thread-local counts from threads that have terminated.
+    pthread_key_t   _key;                     // The pthread_key which gives us the hook to construct and destruct thread-local storage.
+    struct linked_list_head _ll_counter_head; // A linked list of all the thread-local information for this counter.
+    
+    // This function is used to destroy the thread-local part of the state when a thread terminates.
+    // But it's not the destructor for the local part of the counter, it's a destructor on a "dummy" key just so that we get a notification when a thread ends.
+    friend void destroy_thread_local_part_of_partitioned_counters (void *);
+};
+#endif
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h
new file mode 100644
index 000000000..3fd0095d0
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h
@@ -0,0 +1,76 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+======= */
+
+#ident \
+    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include "partitioned_counter.h"
+// PORT2: #include <util/constexpr.h>
+
+#define TOKUFT_STATUS_INIT(array, k, c, t, l, inc)                    \
+  do {                                                                \
+    array.status[k].keyname = #k;                                     \
+    array.status[k].columnname = #c;                                  \
+    array.status[k].type = t;                                         \
+    array.status[k].legend = l;                                       \
+    constexpr_static_assert(                                          \
+        strcmp(#c, "NULL") && strcmp(#c, "0"),                        \
+        "Use nullptr for no column name instead of NULL, 0, etc..."); \
+    constexpr_static_assert(                                          \
+        (inc) == TOKU_ENGINE_STATUS || strcmp(#c, "nullptr"),         \
+        "Missing column name.");                                      \
+    array.status[k].include =                                         \
+        static_cast<toku_engine_status_include_type>(inc);            \
+    if (t == STATUS_PARCOUNT) {                                       \
+      array.status[k].value.parcount = create_partitioned_counter();  \
+    }                                                                 \
+  } while (0)
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
new file mode 100644
index 000000000..531165dea
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
@@ -0,0 +1,503 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <mutex>
+
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+RangeLockManagerHandle* NewRangeLockManager(
+    std::shared_ptr<TransactionDBMutexFactory> mutex_factory) {
+  std::shared_ptr<TransactionDBMutexFactory> use_factory;
+
+  if (mutex_factory) {
+    use_factory = mutex_factory;
+  } else {
+    use_factory.reset(new TransactionDBMutexFactoryImpl());
+  }
+  return new RangeTreeLockManager(use_factory);
+}
+
+static const char SUFFIX_INFIMUM = 0x0;
+static const char SUFFIX_SUPREMUM = 0x1;
+
+// Convert Endpoint into an internal format used for storing it in locktree
+// (DBT structure is used for passing endpoints to locktree and getting back)
+void serialize_endpoint(const Endpoint& endp, std::string* buf) {
+  buf->push_back(endp.inf_suffix ? SUFFIX_SUPREMUM : SUFFIX_INFIMUM);
+  buf->append(endp.slice.data(), endp.slice.size());
+}
+
+// Decode the endpoint from the format it is stored in the locktree (DBT) to
+// the one used outside: either Endpoint or EndpointWithString
+template <typename EndpointStruct>
+void deserialize_endpoint(const DBT* dbt, EndpointStruct* endp) {
+  assert(dbt->size >= 1);
+  const char* dbt_data = (const char*)dbt->data;
+  char suffix = dbt_data[0];
+  assert(suffix == SUFFIX_INFIMUM || suffix == SUFFIX_SUPREMUM);
+  endp->inf_suffix = (suffix == SUFFIX_SUPREMUM);
+  endp->slice = decltype(EndpointStruct::slice)(dbt_data + 1, dbt->size - 1);
+}
+
+// Get a range lock on [start_key; end_key] range
+Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn,
+                                     uint32_t column_family_id,
+                                     const Endpoint& start_endp,
+                                     const Endpoint& end_endp, Env*,
+                                     bool exclusive) {
+  toku::lock_request request;
+  request.create(mutex_factory_);
+  DBT start_key_dbt, end_key_dbt;
+
+  TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:enter");
+  std::string start_key;
+  std::string end_key;
+  serialize_endpoint(start_endp, &start_key);
+  serialize_endpoint(end_endp, &end_key);
+
+  toku_fill_dbt(&start_key_dbt, start_key.data(), start_key.size());
+  toku_fill_dbt(&end_key_dbt, end_key.data(), end_key.size());
+
+  auto lt = GetLockTreeForCF(column_family_id);
+
+  // Put the key waited on into request's m_extra. See
+  // wait_callback_for_locktree for details.
+  std::string wait_key(start_endp.slice.data(), start_endp.slice.size());
+
+  request.set(lt.get(), (TXNID)txn, &start_key_dbt, &end_key_dbt,
+              exclusive ? toku::lock_request::WRITE : toku::lock_request::READ,
+              false /* not a big txn */, &wait_key);
+
+  // This is for "periodically wake up and check if the wait is killed" feature
+  // which we are not using.
+  uint64_t killed_time_msec = 0;
+  uint64_t wait_time_msec = txn->GetLockTimeout();
+
+  if (wait_time_msec == static_cast<uint64_t>(-1)) {
+    // The transaction has no wait timeout. lock_request::wait doesn't support
+    // this, it needs a number of milliseconds to wait. Pass it one year to
+    // be safe.
+    wait_time_msec = uint64_t(1000) * 60 * 60 * 24 * 365;
+  } else {
+    // convert microseconds to milliseconds
+    wait_time_msec = (wait_time_msec + 500) / 1000;
+  }
+
+  std::vector<RangeDeadlockInfo> di_path;
+  request.m_deadlock_cb = [&](TXNID txnid, bool is_exclusive,
+                              const DBT* start_dbt, const DBT* end_dbt) {
+    EndpointWithString start;
+    EndpointWithString end;
+    deserialize_endpoint(start_dbt, &start);
+    deserialize_endpoint(end_dbt, &end);
+
+    di_path.push_back({txnid, column_family_id, is_exclusive, std::move(start),
+                       std::move(end)});
+  };
+
+  request.start();
+
+  const int r = request.wait(wait_time_msec, killed_time_msec,
+                             nullptr,  // killed_callback
+                             wait_callback_for_locktree, nullptr);
+
+  // Inform the txn that we are no longer waiting:
+  txn->ClearWaitingTxn();
+
+  request.destroy();
+  switch (r) {
+    case 0:
+      break;  // fall through
+    case DB_LOCK_NOTGRANTED:
+      return Status::TimedOut(Status::SubCode::kLockTimeout);
+    case TOKUDB_OUT_OF_LOCKS:
+      return Status::Busy(Status::SubCode::kLockLimit);
+    case DB_LOCK_DEADLOCK: {
+      std::reverse(di_path.begin(), di_path.end());
+      dlock_buffer_.AddNewPath(
+          RangeDeadlockPath(di_path, request.get_start_time()));
+      return Status::Busy(Status::SubCode::kDeadlock);
+    }
+    default:
+      assert(0);
+      return Status::Busy(Status::SubCode::kLockLimit);
+  }
+
+  return Status::OK();
+}
+
+// Wait callback that locktree library will call to inform us about
+// the lock waits that are in progress.
+void wait_callback_for_locktree(void*, toku::lock_wait_infos* infos) {
+  TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:EnterWaitingTxn");
+  for (auto wait_info : *infos) {
+    // As long as we hold the lock on the locktree's pending request queue
+    // this should be safe.
+    auto txn = (PessimisticTransaction*)wait_info.waiter;
+    auto cf_id = (ColumnFamilyId)wait_info.ltree->get_dict_id().dictid;
+
+    autovector<TransactionID> waitee_ids;
+    for (auto waitee : wait_info.waitees) {
+      waitee_ids.push_back(waitee);
+    }
+    txn->SetWaitingTxn(waitee_ids, cf_id, (std::string*)wait_info.m_extra);
+  }
+
+  // Here we can assume that the locktree code will now wait for some lock
+  TEST_SYNC_POINT("RangeTreeLockManager::TryRangeLock:WaitingTxn");
+}
+
+void RangeTreeLockManager::UnLock(PessimisticTransaction* txn,
+                                  ColumnFamilyId column_family_id,
+                                  const std::string& key, Env*) {
+  auto locktree = GetLockTreeForCF(column_family_id);
+  std::string endp_image;
+  serialize_endpoint({key.data(), key.size(), false}, &endp_image);
+
+  DBT key_dbt;
+  toku_fill_dbt(&key_dbt, endp_image.data(), endp_image.size());
+
+  toku::range_buffer range_buf;
+  range_buf.create();
+  range_buf.append(&key_dbt, &key_dbt);
+
+  locktree->release_locks((TXNID)txn, &range_buf);
+  range_buf.destroy();
+
+  toku::lock_request::retry_all_lock_requests(
+      locktree.get(), wait_callback_for_locktree, nullptr);
+}
+
+void RangeTreeLockManager::UnLock(PessimisticTransaction* txn,
+                                  const LockTracker& tracker, Env*) {
+  const RangeTreeLockTracker* range_tracker =
+      static_cast<const RangeTreeLockTracker*>(&tracker);
+
+  RangeTreeLockTracker* range_trx_tracker =
+      static_cast<RangeTreeLockTracker*>(&txn->GetTrackedLocks());
+  bool all_keys = (range_trx_tracker == range_tracker);
+
+  // tracked_locks_->range_list may hold nullptr if the transaction has never
+  // acquired any locks.
+  ((RangeTreeLockTracker*)range_tracker)->ReleaseLocks(this, txn, all_keys);
+}
+
+int RangeTreeLockManager::CompareDbtEndpoints(void* arg, const DBT* a_key,
+                                              const DBT* b_key) {
+  const char* a = (const char*)a_key->data;
+  const char* b = (const char*)b_key->data;
+
+  size_t a_len = a_key->size;
+  size_t b_len = b_key->size;
+
+  size_t min_len = std::min(a_len, b_len);
+
+  // Compare the values. The first byte encodes the endpoint type, its value
+  // is either SUFFIX_INFIMUM or SUFFIX_SUPREMUM.
+  Comparator* cmp = (Comparator*)arg;
+  int res = cmp->Compare(Slice(a + 1, min_len - 1), Slice(b + 1, min_len - 1));
+  if (!res) {
+    if (b_len > min_len) {
+      // a is shorter;
+      if (a[0] == SUFFIX_INFIMUM) {
+        return -1;  //"a is smaller"
+      } else {
+        // a is considered padded with 0xFF:FF:FF:FF...
+        return 1;  // "a" is bigger
+      }
+    } else if (a_len > min_len) {
+      // the opposite of the above: b is shorter.
+      if (b[0] == SUFFIX_INFIMUM) {
+        return 1;  //"b is smaller"
+      } else {
+        // b is considered padded with 0xFF:FF:FF:FF...
+        return -1;  // "b" is bigger
+      }
+    } else {
+      // the lengths are equal (and the key values, too)
+      if (a[0] < b[0]) {
+        return -1;
+      } else if (a[0] > b[0]) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  } else {
+    return res;
+  }
+}
+
+namespace {
+void UnrefLockTreeMapsCache(void* ptr) {
+  // Called when a thread exits or a ThreadLocalPtr gets destroyed.
+  auto lock_tree_map_cache = static_cast<
+      std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::locktree>>*>(
+      ptr);
+  delete lock_tree_map_cache;
+}
+}  // anonymous namespace
+
+RangeTreeLockManager::RangeTreeLockManager(
+    std::shared_ptr<TransactionDBMutexFactory> mutex_factory)
+    : mutex_factory_(mutex_factory),
+      ltree_lookup_cache_(new ThreadLocalPtr(&UnrefLockTreeMapsCache)),
+      dlock_buffer_(10) {
+  ltm_.create(on_create, on_destroy, on_escalate, nullptr, mutex_factory_);
+}
+
+int RangeTreeLockManager::on_create(toku::locktree* lt, void* arg) {
+  // arg is a pointer to RangeTreeLockManager
+  lt->set_escalation_barrier_func(&OnEscalationBarrierCheck, arg);
+  return 0;
+}
+
+bool RangeTreeLockManager::OnEscalationBarrierCheck(const DBT* a, const DBT* b,
+                                                    void* extra) {
+  Endpoint a_endp, b_endp;
+  deserialize_endpoint(a, &a_endp);
+  deserialize_endpoint(b, &b_endp);
+  auto self = static_cast<RangeTreeLockManager*>(extra);
+  return self->barrier_func_(a_endp, b_endp);
+}
+
+void RangeTreeLockManager::SetRangeDeadlockInfoBufferSize(
+    uint32_t target_size) {
+  dlock_buffer_.Resize(target_size);
+}
+
+void RangeTreeLockManager::Resize(uint32_t target_size) {
+  SetRangeDeadlockInfoBufferSize(target_size);
+}
+
+std::vector<RangeDeadlockPath>
+RangeTreeLockManager::GetRangeDeadlockInfoBuffer() {
+  return dlock_buffer_.PrepareBuffer();
+}
+
+std::vector<DeadlockPath> RangeTreeLockManager::GetDeadlockInfoBuffer() {
+  std::vector<DeadlockPath> res;
+  std::vector<RangeDeadlockPath> data = GetRangeDeadlockInfoBuffer();
+  // report left endpoints
+  for (auto it = data.begin(); it != data.end(); ++it) {
+    std::vector<DeadlockInfo> path;
+
+    for (auto it2 = it->path.begin(); it2 != it->path.end(); ++it2) {
+      path.push_back(
+          {it2->m_txn_id, it2->m_cf_id, it2->m_exclusive, it2->m_start.slice});
+    }
+    res.push_back(DeadlockPath(path, it->deadlock_time));
+  }
+  return res;
+}
+
+// @brief  Lock Escalation Callback function
+//
+// @param txnid   Transaction whose locks got escalated
+// @param lt      Lock Tree where escalation is happening
+// @param buffer  Escalation result: list of locks that this transaction now
+//                owns in this lock tree.
+// @param void*   Callback context
+void RangeTreeLockManager::on_escalate(TXNID txnid, const toku::locktree* lt,
+                                       const toku::range_buffer& buffer,
+                                       void*) {
+  auto txn = (PessimisticTransaction*)txnid;
+  ((RangeTreeLockTracker*)&txn->GetTrackedLocks())->ReplaceLocks(lt, buffer);
+}
+
+RangeTreeLockManager::~RangeTreeLockManager() {
+  autovector<void*> local_caches;
+  ltree_lookup_cache_->Scrape(&local_caches, nullptr);
+  for (auto cache : local_caches) {
+    delete static_cast<LockTreeMap*>(cache);
+  }
+  ltree_map_.clear();  // this will call release_lt() for all locktrees
+  ltm_.destroy();
+}
+
+RangeLockManagerHandle::Counters RangeTreeLockManager::GetStatus() {
+  LTM_STATUS_S ltm_status_test;
+  ltm_.get_status(&ltm_status_test);
+  Counters res;
+
+  // Searching status variable by its string name is how Toku's unit tests
+  // do it (why didn't they make LTM_ESCALATION_COUNT constant visible?)
+  // lookup keyname in status
+  for (int i = 0; i < LTM_STATUS_S::LTM_STATUS_NUM_ROWS; i++) {
+    TOKU_ENGINE_STATUS_ROW status = &ltm_status_test.status[i];
+    if (strcmp(status->keyname, "LTM_ESCALATION_COUNT") == 0) {
+      res.escalation_count = status->value.num;
+      continue;
+    }
+    if (strcmp(status->keyname, "LTM_WAIT_COUNT") == 0) {
+      res.lock_wait_count = status->value.num;
+      continue;
+    }
+    if (strcmp(status->keyname, "LTM_SIZE_CURRENT") == 0) {
+      res.current_lock_memory = status->value.num;
+    }
+  }
+  return res;
+}
+
+std::shared_ptr<toku::locktree> RangeTreeLockManager::MakeLockTreePtr(
+    toku::locktree* lt) {
+  toku::locktree_manager* ltm = &ltm_;
+  return std::shared_ptr<toku::locktree>(
+      lt, [ltm](toku::locktree* p) { ltm->release_lt(p); });
+}
+
+void RangeTreeLockManager::AddColumnFamily(const ColumnFamilyHandle* cfh) {
+  uint32_t column_family_id = cfh->GetID();
+
+  InstrumentedMutexLock l(&ltree_map_mutex_);
+  if (ltree_map_.find(column_family_id) == ltree_map_.end()) {
+    DICTIONARY_ID dict_id = {.dictid = column_family_id};
+    toku::comparator cmp;
+    cmp.create(CompareDbtEndpoints, (void*)cfh->GetComparator());
+    toku::locktree* ltree =
+        ltm_.get_lt(dict_id, cmp,
+                    /* on_create_extra*/ static_cast<void*>(this));
+    // This is ok to because get_lt has copied the comparator:
+    cmp.destroy();
+
+    ltree_map_.insert({column_family_id, MakeLockTreePtr(ltree)});
+  }
+}
+
+void RangeTreeLockManager::RemoveColumnFamily(const ColumnFamilyHandle* cfh) {
+  uint32_t column_family_id = cfh->GetID();
+  // Remove lock_map for this column family.  Since the lock map is stored
+  // as a shared ptr, concurrent transactions can still keep using it
+  // until they release their references to it.
+
+  // TODO what if one drops a column family while transaction(s) still have
+  // locks in it?
+  // locktree uses column family'c Comparator* as the criteria to do tree
+  // ordering. If the comparator is gone, we won't even be able to remove the
+  // elements from the locktree.
+  // A possible solution might be to remove everything right now:
+  //  - wait until everyone traversing the locktree are gone
+  //  - remove everything from the locktree.
+  //  - some transactions may have acquired locks in their LockTracker objects.
+  //    Arrange something so we don't blow up when they try to release them.
+  //  - ...
+  // This use case (drop column family while somebody is using it) doesn't seem
+  // the priority, though.
+
+  {
+    InstrumentedMutexLock l(&ltree_map_mutex_);
+
+    auto lock_maps_iter = ltree_map_.find(column_family_id);
+    assert(lock_maps_iter != ltree_map_.end());
+    ltree_map_.erase(lock_maps_iter);
+  }  // lock_map_mutex_
+
+  autovector<void*> local_caches;
+  ltree_lookup_cache_->Scrape(&local_caches, nullptr);
+  for (auto cache : local_caches) {
+    delete static_cast<LockTreeMap*>(cache);
+  }
+}
+
+std::shared_ptr<toku::locktree> RangeTreeLockManager::GetLockTreeForCF(
+    ColumnFamilyId column_family_id) {
+  // First check thread-local cache
+  if (ltree_lookup_cache_->Get() == nullptr) {
+    ltree_lookup_cache_->Reset(new LockTreeMap());
+  }
+
+  auto ltree_map_cache = static_cast<LockTreeMap*>(ltree_lookup_cache_->Get());
+
+  auto it = ltree_map_cache->find(column_family_id);
+  if (it != ltree_map_cache->end()) {
+    // Found lock map for this column family.
+    return it->second;
+  }
+
+  // Not found in local cache, grab mutex and check shared LockMaps
+  InstrumentedMutexLock l(&ltree_map_mutex_);
+
+  it = ltree_map_.find(column_family_id);
+  if (it == ltree_map_.end()) {
+    return nullptr;
+  } else {
+    // Found lock map.  Store in thread-local cache and return.
+    ltree_map_cache->insert({column_family_id, it->second});
+    return it->second;
+  }
+}
+
+struct LOCK_PRINT_CONTEXT {
+  RangeLockManagerHandle::RangeLockStatus* data;  // Save locks here
+  uint32_t cfh_id;  // Column Family whose tree we are traversing
+};
+
+// Report left endpoints of the acquired locks
+LockManager::PointLockStatus RangeTreeLockManager::GetPointLockStatus() {
+  PointLockStatus res;
+  LockManager::RangeLockStatus data = GetRangeLockStatus();
+  // report left endpoints
+  for (auto it = data.begin(); it != data.end(); ++it) {
+    auto& val = it->second;
+    res.insert({it->first, {val.start.slice, val.ids, val.exclusive}});
+  }
+  return res;
+}
+
+static void push_into_lock_status_data(void* param, const DBT* left,
+                                       const DBT* right, TXNID txnid_arg,
+                                       bool is_shared, TxnidVector* owners) {
+  struct LOCK_PRINT_CONTEXT* ctx = (LOCK_PRINT_CONTEXT*)param;
+  struct RangeLockInfo info;
+
+  info.exclusive = !is_shared;
+
+  deserialize_endpoint(left, &info.start);
+  deserialize_endpoint(right, &info.end);
+
+  if (txnid_arg != TXNID_SHARED) {
+    info.ids.push_back(txnid_arg);
+  } else {
+    for (auto it : *owners) {
+      info.ids.push_back(it);
+    }
+  }
+  ctx->data->insert({ctx->cfh_id, info});
+}
+
+LockManager::RangeLockStatus RangeTreeLockManager::GetRangeLockStatus() {
+  LockManager::RangeLockStatus data;
+  {
+    InstrumentedMutexLock l(&ltree_map_mutex_);
+    for (auto it : ltree_map_) {
+      LOCK_PRINT_CONTEXT ctx = {&data, it.first};
+      it.second->dump_locks((void*)&ctx, push_into_lock_status_data);
+    }
+  }
+  return data;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h
new file mode 100644
index 000000000..e4236d600
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h
@@ -0,0 +1,137 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+// For DeadlockInfoBuffer:
+#include "util/thread_local.h"
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+#include "utilities/transactions/lock/range/range_lock_manager.h"
+
+// Lock Tree library:
+#include "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h"
+#include "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h"
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+typedef DeadlockInfoBufferTempl<RangeDeadlockPath> RangeDeadlockInfoBuffer;
+
+// A Range Lock Manager that uses PerconaFT's locktree library
+class RangeTreeLockManager : public RangeLockManagerBase,
+                             public RangeLockManagerHandle {
+ public:
+  LockManager* getLockManager() override { return this; }
+
+  void AddColumnFamily(const ColumnFamilyHandle* cfh) override;
+  void RemoveColumnFamily(const ColumnFamilyHandle* cfh) override;
+
+  void Resize(uint32_t) override;
+  std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
+
+  std::vector<RangeDeadlockPath> GetRangeDeadlockInfoBuffer() override;
+  void SetRangeDeadlockInfoBufferSize(uint32_t target_size) override;
+
+  // Get a lock on a range
+  //  @note only exclusive locks are currently supported (requesting a
+  //  non-exclusive lock will get an exclusive one)
+  using LockManager::TryLock;
+  Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+                 const Endpoint& start_endp, const Endpoint& end_endp, Env* env,
+                 bool exclusive) override;
+
+  void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
+              Env* env) override;
+  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+              const std::string& key, Env* env) override;
+  void UnLock(PessimisticTransaction*, ColumnFamilyId, const Endpoint&,
+              const Endpoint&, Env*) override {
+    // TODO: range unlock does nothing...
+  }
+
+  explicit RangeTreeLockManager(
+      std::shared_ptr<TransactionDBMutexFactory> mutex_factory);
+
+  ~RangeTreeLockManager() override;
+
+  int SetMaxLockMemory(size_t max_lock_memory) override {
+    return ltm_.set_max_lock_memory(max_lock_memory);
+  }
+
+  size_t GetMaxLockMemory() override { return ltm_.get_max_lock_memory(); }
+
+  Counters GetStatus() override;
+
+  bool IsPointLockSupported() const override {
+    // One could have acquired a point lock (it is reduced to range lock)
+    return true;
+  }
+
+  PointLockStatus GetPointLockStatus() override;
+
+  // This is from LockManager
+  LockManager::RangeLockStatus GetRangeLockStatus() override;
+
+  // This has the same meaning as GetRangeLockStatus but is from
+  // RangeLockManagerHandle
+  RangeLockManagerHandle::RangeLockStatus GetRangeLockStatusData() override {
+    return GetRangeLockStatus();
+  }
+
+  bool IsRangeLockSupported() const override { return true; }
+
+  const LockTrackerFactory& GetLockTrackerFactory() const override {
+    return RangeTreeLockTrackerFactory::Get();
+  }
+
+  // Get the locktree which stores locks for the Column Family with given cf_id
+  std::shared_ptr<toku::locktree> GetLockTreeForCF(ColumnFamilyId cf_id);
+
+  void SetEscalationBarrierFunc(EscalationBarrierFunc func) override {
+    barrier_func_ = func;
+  }
+
+ private:
+  toku::locktree_manager ltm_;
+
+  EscalationBarrierFunc barrier_func_ =
+      [](const Endpoint&, const Endpoint&) -> bool { return false; };
+
+  std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;
+
+  // Map from cf_id to locktree*. Can only be accessed while holding the
+  // ltree_map_mutex_. Must use a custom deleter that calls ltm_.release_lt
+  using LockTreeMap =
+      std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::locktree>>;
+  LockTreeMap ltree_map_;
+
+  InstrumentedMutex ltree_map_mutex_;
+
+  // Per-thread cache of ltree_map_.
+  // (uses the same approach as TransactionLockMgr::lock_maps_cache_)
+  std::unique_ptr<ThreadLocalPtr> ltree_lookup_cache_;
+
+  RangeDeadlockInfoBuffer dlock_buffer_;
+
+  std::shared_ptr<toku::locktree> MakeLockTreePtr(toku::locktree* lt);
+  static int CompareDbtEndpoints(void* arg, const DBT* a_key, const DBT* b_key);
+
+  // Callbacks
+  static int on_create(toku::locktree*, void*);
+  static void on_destroy(toku::locktree*) {}
+  static void on_escalate(TXNID txnid, const toku::locktree* lt,
+                          const toku::range_buffer& buffer, void* extra);
+
+  static bool OnEscalationBarrierCheck(const DBT* a, const DBT* b, void* extra);
+};
+
+void serialize_endpoint(const Endpoint& endp, std::string* buf);
+void wait_callback_for_locktree(void* cdata, toku::lock_wait_infos* infos);
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
new file mode 100644
index 000000000..be1e1478b
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
@@ -0,0 +1,156 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#ifndef OS_WIN
+
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h"
+
+#include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+RangeLockList *RangeTreeLockTracker::getOrCreateList() {
+  if (range_list_) return range_list_.get();
+
+  // Doesn't exist, create
+  range_list_.reset(new RangeLockList());
+  return range_list_.get();
+}
+
+void RangeTreeLockTracker::Track(const PointLockRequest &lock_req) {
+  DBT key_dbt;
+  std::string key;
+  serialize_endpoint(Endpoint(lock_req.key, false), &key);
+  toku_fill_dbt(&key_dbt, key.data(), key.size());
+  RangeLockList *rl = getOrCreateList();
+  rl->Append(lock_req.column_family_id, &key_dbt, &key_dbt);
+}
+
+void RangeTreeLockTracker::Track(const RangeLockRequest &lock_req) {
+  DBT start_dbt, end_dbt;
+  std::string start_key, end_key;
+
+  serialize_endpoint(lock_req.start_endp, &start_key);
+  serialize_endpoint(lock_req.end_endp, &end_key);
+
+  toku_fill_dbt(&start_dbt, start_key.data(), start_key.size());
+  toku_fill_dbt(&end_dbt, end_key.data(), end_key.size());
+
+  RangeLockList *rl = getOrCreateList();
+  rl->Append(lock_req.column_family_id, &start_dbt, &end_dbt);
+}
+
+PointLockStatus RangeTreeLockTracker::GetPointLockStatus(
+    ColumnFamilyId /*cf_id*/, const std::string & /*key*/) const {
+  // This function is not expected to be called as RangeTreeLockTracker::
+  // IsPointLockSupported() returns false. Return the status which indicates
+  // the point is not locked.
+  PointLockStatus p;
+  p.locked = false;
+  p.exclusive = true;
+  p.seq = 0;
+  return p;
+}
+
+void RangeTreeLockTracker::Clear() { range_list_.reset(); }
+
+void RangeLockList::Append(ColumnFamilyId cf_id, const DBT *left_key,
+                           const DBT *right_key) {
+  MutexLock l(&mutex_);
+  // Only the transaction owner thread calls this function.
+  // The same thread does the lock release, so we can be certain nobody is
+  // releasing the locks concurrently.
+  assert(!releasing_locks_.load());
+  auto it = buffers_.find(cf_id);
+  if (it == buffers_.end()) {
+    // create a new one
+    it = buffers_.emplace(cf_id, std::make_shared<toku::range_buffer>()).first;
+    it->second->create();
+  }
+  it->second->append(left_key, right_key);
+}
+
+void RangeLockList::ReleaseLocks(RangeTreeLockManager *mgr,
+                                 PessimisticTransaction *txn,
+                                 bool all_trx_locks) {
+  {
+    MutexLock l(&mutex_);
+    // The lt->release_locks() call below will walk range_list->buffer_. We
+    // need to prevent lock escalation callback from replacing
+    // range_list->buffer_ while we are doing that.
+    //
+    // Additional complication here is internal mutex(es) in the locktree
+    // (let's call them latches):
+    // - Lock escalation first obtains latches on the lock tree
+    // - Then, it calls RangeTreeLockManager::on_escalate to replace
+    // transaction's range_list->buffer_. = Access to that buffer must be
+    // synchronized, so it will want to acquire the range_list->mutex_.
+    //
+    // While in this function we would want to do the reverse:
+    // - Acquire range_list->mutex_ to prevent access to the range_list.
+    // - Then, lt->release_locks() call will walk through the range_list
+    // - and acquire latches on parts of the lock tree to remove locks from
+    //   it.
+    //
+    // How do we avoid the deadlock? The idea is that here we set
+    // releasing_locks_=true, and release the mutex.
+    // All other users of the range_list must:
+    // - Acquire the mutex, then check that releasing_locks_=false.
+    //   (the code in this function doesnt do that as there's only one thread
+    //    that releases transaction's locks)
+    releasing_locks_.store(true);
+  }
+
+  for (auto it : buffers_) {
+    // Don't try to call release_locks() if the buffer is empty! if we are
+    //  not holding any locks, the lock tree might be in the STO-mode with
+    //  another transaction, and our attempt to release an empty set of locks
+    //  will cause an assertion failure.
+    if (it.second->get_num_ranges()) {
+      auto lt_ptr = mgr->GetLockTreeForCF(it.first);
+      toku::locktree *lt = lt_ptr.get();
+
+      lt->release_locks((TXNID)txn, it.second.get(), all_trx_locks);
+
+      it.second->destroy();
+      it.second->create();
+
+      toku::lock_request::retry_all_lock_requests(lt,
+                                                  wait_callback_for_locktree);
+    }
+  }
+
+  Clear();
+  releasing_locks_.store(false);
+}
+
+void RangeLockList::ReplaceLocks(const toku::locktree *lt,
+                                 const toku::range_buffer &buffer) {
+  MutexLock l(&mutex_);
+  if (releasing_locks_.load()) {
+    // Do nothing. The transaction is releasing its locks, so it will not care
+    // about having a correct list of ranges. (In TokuDB,
+    // toku_db_txn_escalate_callback() makes use of this property, too)
+    return;
+  }
+
+  ColumnFamilyId cf_id = (ColumnFamilyId)lt->get_dict_id().dictid;
+
+  auto it = buffers_.find(cf_id);
+  it->second->destroy();
+  it->second->create();
+
+  toku::range_buffer::iterator iter(&buffer);
+  toku::range_buffer::iterator::record rec;
+  while (iter.current(&rec)) {
+    it->second->append(rec.get_left_key(), rec.get_right_key());
+    iter.next();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // OS_WIN
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h
new file mode 100644
index 000000000..4ef48d252
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "util/mutexlock.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+
+// Range Locking:
+#include "lib/locktree/lock_request.h"
+#include "lib/locktree/locktree.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTreeLockManager;
+
+// Storage for locks that are currently held by a transaction.
+//
+// Locks are kept in toku::range_buffer because toku::locktree::release_locks()
+// accepts that as an argument.
+//
+// Note: the list of locks may differ slighly from the contents of the lock
+// tree, due to concurrency between lock acquisition, lock release, and lock
+// escalation. See MDEV-18227 and RangeTreeLockManager::UnLock for details.
+// This property is currently harmless.
+//
+// Append() and ReleaseLocks() are not thread-safe, as they are expected to be
+// called only by the owner transaction. ReplaceLocks() is safe to call from
+// other threads.
+class RangeLockList {
+ public:
+  ~RangeLockList() { Clear(); }
+
+  RangeLockList() : releasing_locks_(false) {}
+
+  void Append(ColumnFamilyId cf_id, const DBT* left_key, const DBT* right_key);
+  void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn,
+                    bool all_trx_locks);
+  void ReplaceLocks(const toku::locktree* lt, const toku::range_buffer& buffer);
+
+ private:
+  void Clear() {
+    for (auto it : buffers_) {
+      it.second->destroy();
+    }
+    buffers_.clear();
+  }
+
+  std::unordered_map<ColumnFamilyId, std::shared_ptr<toku::range_buffer>>
+      buffers_;
+  port::Mutex mutex_;
+  std::atomic<bool> releasing_locks_;
+};
+
+// A LockTracker-based object that is used together with RangeTreeLockManager.
+class RangeTreeLockTracker : public LockTracker {
+ public:
+  RangeTreeLockTracker() : range_list_(nullptr) {}
+
+  RangeTreeLockTracker(const RangeTreeLockTracker&) = delete;
+  RangeTreeLockTracker& operator=(const RangeTreeLockTracker&) = delete;
+
+  void Track(const PointLockRequest&) override;
+  void Track(const RangeLockRequest&) override;
+
+  bool IsPointLockSupported() const override {
+    // This indicates that we don't implement GetPointLockStatus()
+    return false;
+  }
+  bool IsRangeLockSupported() const override { return true; }
+
+  // a Not-supported dummy implementation.
+  UntrackStatus Untrack(const RangeLockRequest& /*lock_request*/) override {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  UntrackStatus Untrack(const PointLockRequest& /*lock_request*/) override {
+    return UntrackStatus::NOT_TRACKED;
+  }
+
+  // "If this method is not supported, leave it as a no-op."
+  void Merge(const LockTracker&) override {}
+
+  // "If this method is not supported, leave it as a no-op."
+  void Subtract(const LockTracker&) override {}
+
+  void Clear() override;
+
+  // "If this method is not supported, returns nullptr."
+  virtual LockTracker* GetTrackedLocksSinceSavePoint(
+      const LockTracker&) const override {
+    return nullptr;
+  }
+
+  PointLockStatus GetPointLockStatus(ColumnFamilyId column_family_id,
+                                     const std::string& key) const override;
+
+  // The return value is only used for tests
+  uint64_t GetNumPointLocks() const override { return 0; }
+
+  ColumnFamilyIterator* GetColumnFamilyIterator() const override {
+    return nullptr;
+  }
+
+  KeyIterator* GetKeyIterator(
+      ColumnFamilyId /*column_family_id*/) const override {
+    return nullptr;
+  }
+
+  void ReleaseLocks(RangeTreeLockManager* mgr, PessimisticTransaction* txn,
+                    bool all_trx_locks) {
+    if (range_list_) range_list_->ReleaseLocks(mgr, txn, all_trx_locks);
+  }
+
+  void ReplaceLocks(const toku::locktree* lt,
+                    const toku::range_buffer& buffer) {
+    // range_list_ cannot be NULL here
+    range_list_->ReplaceLocks(lt, buffer);
+  }
+
+ private:
+  RangeLockList* getOrCreateList();
+  std::unique_ptr<RangeLockList> range_list_;
+};
+
+class RangeTreeLockTrackerFactory : public LockTrackerFactory {
+ public:
+  static const RangeTreeLockTrackerFactory& Get() {
+    static const RangeTreeLockTrackerFactory instance;
+    return instance;
+  }
+
+  LockTracker* Create() const override { return new RangeTreeLockTracker(); }
+
+ private:
+  RangeTreeLockTrackerFactory() {}
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction.cc b/src/rocksdb/utilities/transactions/optimistic_transaction.cc
new file mode 100644
index 000000000..0ee0f28b6
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction.cc
@@ -0,0 +1,196 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/optimistic_transaction.h"
+
+#include <string>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/transactions/lock/point/point_lock_tracker.h"
+#include "utilities/transactions/optimistic_transaction.h"
+#include "utilities/transactions/optimistic_transaction_db_impl.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct WriteOptions;
+
+OptimisticTransaction::OptimisticTransaction(
+    OptimisticTransactionDB* txn_db, const WriteOptions& write_options,
+    const OptimisticTransactionOptions& txn_options)
+    : TransactionBaseImpl(txn_db->GetBaseDB(), write_options,
+                          PointLockTrackerFactory::Get()),
+      txn_db_(txn_db) {
+  Initialize(txn_options);
+}
+
+void OptimisticTransaction::Initialize(
+    const OptimisticTransactionOptions& txn_options) {
+  if (txn_options.set_snapshot) {
+    SetSnapshot();
+  }
+}
+
+void OptimisticTransaction::Reinitialize(
+    OptimisticTransactionDB* txn_db, const WriteOptions& write_options,
+    const OptimisticTransactionOptions& txn_options) {
+  TransactionBaseImpl::Reinitialize(txn_db->GetBaseDB(), write_options);
+  Initialize(txn_options);
+}
+
+OptimisticTransaction::~OptimisticTransaction() {}
+
+void OptimisticTransaction::Clear() { TransactionBaseImpl::Clear(); }
+
+Status OptimisticTransaction::Prepare() {
+  return Status::InvalidArgument(
+      "Two phase commit not supported for optimistic transactions.");
+}
+
+Status OptimisticTransaction::Commit() {
+  auto txn_db_impl = static_cast_with_check<OptimisticTransactionDBImpl,
+                                            OptimisticTransactionDB>(txn_db_);
+  assert(txn_db_impl);
+  switch (txn_db_impl->GetValidatePolicy()) {
+    case OccValidationPolicy::kValidateParallel:
+      return CommitWithParallelValidate();
+    case OccValidationPolicy::kValidateSerial:
+      return CommitWithSerialValidate();
+    default:
+      assert(0);
+  }
+  // unreachable, just void compiler complain
+  return Status::OK();
+}
+
+Status OptimisticTransaction::CommitWithSerialValidate() {
+  // Set up callback which will call CheckTransactionForConflicts() to
+  // check whether this transaction is safe to be committed.
+  OptimisticTransactionCallback callback(this);
+
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+
+  Status s = db_impl->WriteWithCallback(
+      write_options_, GetWriteBatch()->GetWriteBatch(), &callback);
+
+  if (s.ok()) {
+    Clear();
+  }
+
+  return s;
+}
+
+Status OptimisticTransaction::CommitWithParallelValidate() {
+  auto txn_db_impl = static_cast_with_check<OptimisticTransactionDBImpl,
+                                            OptimisticTransactionDB>(txn_db_);
+  assert(txn_db_impl);
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());
+  assert(db_impl);
+  const size_t space = txn_db_impl->GetLockBucketsSize();
+  std::set<size_t> lk_idxes;
+  std::vector<std::unique_lock<std::mutex>> lks;
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      tracked_locks_->GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        tracked_locks_->GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      lk_idxes.insert(FastRange64(GetSliceNPHash64(key), space));
+    }
+  }
+  // NOTE: in a single txn, all bucket-locks are taken in ascending order.
+  // In this way, txns from different threads all obey this rule so that
+  // deadlock can be avoided.
+  for (auto v : lk_idxes) {
+    lks.emplace_back(txn_db_impl->LockBucket(v));
+  }
+
+  Status s = TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_,
+                                                    true /* cache_only */);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = db_impl->Write(write_options_, GetWriteBatch()->GetWriteBatch());
+  if (s.ok()) {
+    Clear();
+  }
+
+  return s;
+}
+
+Status OptimisticTransaction::Rollback() {
+  Clear();
+  return Status::OK();
+}
+
+// Record this key so that we can check it for conflicts at commit time.
+//
+// 'exclusive' is unused for OptimisticTransaction.
+Status OptimisticTransaction::TryLock(ColumnFamilyHandle* column_family,
+                                      const Slice& key, bool read_only,
+                                      bool exclusive, const bool do_validate,
+                                      const bool assume_tracked) {
+  assert(!assume_tracked);  // not supported
+  (void)assume_tracked;
+  if (!do_validate) {
+    return Status::OK();
+  }
+  uint32_t cfh_id = GetColumnFamilyID(column_family);
+
+  SetSnapshotIfNeeded();
+
+  SequenceNumber seq;
+  if (snapshot_) {
+    seq = snapshot_->GetSequenceNumber();
+  } else {
+    seq = db_->GetLatestSequenceNumber();
+  }
+
+  std::string key_str = key.ToString();
+
+  TrackKey(cfh_id, key_str, seq, read_only, exclusive);
+
+  // Always return OK. Confilct checking will happen at commit time.
+  return Status::OK();
+}
+
+// Returns OK if it is safe to commit this transaction.  Returns Status::Busy
+// if there are read or write conflicts that would prevent us from committing OR
+// if we can not determine whether there would be any such conflicts.
+//
+// Should only be called on writer thread in order to avoid any race conditions
+// in detecting write conflicts.
+Status OptimisticTransaction::CheckTransactionForConflicts(DB* db) {
+  auto db_impl = static_cast_with_check<DBImpl>(db);
+
+  // Since we are on the write thread and do not want to block other writers,
+  // we will do a cache-only conflict check.  This can result in TryAgain
+  // getting returned if there is not sufficient memtable history to check
+  // for conflicts.
+  return TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_,
+                                                true /* cache_only */);
+}
+
+Status OptimisticTransaction::SetName(const TransactionName& /* unused */) {
+  return Status::InvalidArgument("Optimistic transactions cannot be named.");
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction.h b/src/rocksdb/utilities/transactions/optimistic_transaction.h
new file mode 100644
index 000000000..de23233d5
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "utilities/transactions/transaction_base.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class OptimisticTransaction : public TransactionBaseImpl {
+ public:
+  OptimisticTransaction(OptimisticTransactionDB* db,
+                        const WriteOptions& write_options,
+                        const OptimisticTransactionOptions& txn_options);
+  // No copying allowed
+  OptimisticTransaction(const OptimisticTransaction&) = delete;
+  void operator=(const OptimisticTransaction&) = delete;
+
+  virtual ~OptimisticTransaction();
+
+  void Reinitialize(OptimisticTransactionDB* txn_db,
+                    const WriteOptions& write_options,
+                    const OptimisticTransactionOptions& txn_options);
+
+  Status Prepare() override;
+
+  Status Commit() override;
+
+  Status Rollback() override;
+
+  Status SetName(const TransactionName& name) override;
+
+ protected:
+  Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+                 bool read_only, bool exclusive, const bool do_validate = true,
+                 const bool assume_tracked = false) override;
+
+ private:
+  ROCKSDB_FIELD_UNUSED OptimisticTransactionDB* const txn_db_;
+
+  friend class OptimisticTransactionCallback;
+
+  void Initialize(const OptimisticTransactionOptions& txn_options);
+
+  // Returns OK if it is safe to commit this transaction.  Returns Status::Busy
+  // if there are read or write conflicts that would prevent us from committing
+  // OR if we can not determine whether there would be any such conflicts.
+  //
+  // Should only be called on writer thread.
+  Status CheckTransactionForConflicts(DB* db);
+
+  void Clear() override;
+
+  void UnlockGetForUpdate(ColumnFamilyHandle* /* unused */,
+                          const Slice& /* unused */) override {
+    // Nothing to unlock.
+  }
+
+  Status CommitWithSerialValidate();
+
+  Status CommitWithParallelValidate();
+};
+
+// Used at commit time to trigger transaction validation
+class OptimisticTransactionCallback : public WriteCallback {
+ public:
+  explicit OptimisticTransactionCallback(OptimisticTransaction* txn)
+      : txn_(txn) {}
+
+  Status Callback(DB* db) override {
+    return txn_->CheckTransactionForConflicts(db);
+  }
+
+  bool AllowWriteBatching() override { return false; }
+
+ private:
+  OptimisticTransaction* txn_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc
new file mode 100644
index 000000000..bffb3d5ed
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc
@@ -0,0 +1,111 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/optimistic_transaction_db_impl.h"
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "utilities/transactions/optimistic_transaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Transaction* OptimisticTransactionDBImpl::BeginTransaction(
+    const WriteOptions& write_options,
+    const OptimisticTransactionOptions& txn_options, Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new OptimisticTransaction(this, write_options, txn_options);
+  }
+}
+
+std::unique_lock<std::mutex> OptimisticTransactionDBImpl::LockBucket(
+    size_t idx) {
+  assert(idx < bucketed_locks_.size());
+  return std::unique_lock<std::mutex>(*bucketed_locks_[idx]);
+}
+
+Status OptimisticTransactionDB::Open(const Options& options,
+                                     const std::string& dbname,
+                                     OptimisticTransactionDB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = Open(db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a reference to
+    // default column family
+    delete handles[0];
+  }
+
+  return s;
+}
+
+Status OptimisticTransactionDB::Open(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles,
+    OptimisticTransactionDB** dbptr) {
+  return OptimisticTransactionDB::Open(db_options,
+                                       OptimisticTransactionDBOptions(), dbname,
+                                       column_families, handles, dbptr);
+}
+
+Status OptimisticTransactionDB::Open(
+    const DBOptions& db_options,
+    const OptimisticTransactionDBOptions& occ_options,
+    const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles,
+    OptimisticTransactionDB** dbptr) {
+  Status s;
+  DB* db;
+
+  std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
+
+  // Enable MemTable History if not already enabled
+  for (auto& column_family : column_families_copy) {
+    ColumnFamilyOptions* options = &column_family.options;
+
+    if (options->max_write_buffer_size_to_maintain == 0 &&
+        options->max_write_buffer_number_to_maintain == 0) {
+      // Setting to -1 will set the History size to
+      // max_write_buffer_number * write_buffer_size.
+      options->max_write_buffer_size_to_maintain = -1;
+    }
+  }
+
+  s = DB::Open(db_options, dbname, column_families_copy, handles, &db);
+
+  if (s.ok()) {
+    *dbptr = new OptimisticTransactionDBImpl(db, occ_options);
+  }
+
+  return s;
+}
+
+void OptimisticTransactionDBImpl::ReinitializeTransaction(
+    Transaction* txn, const WriteOptions& write_options,
+    const OptimisticTransactionOptions& txn_options) {
+  assert(dynamic_cast<OptimisticTransaction*>(txn) != nullptr);
+  auto txn_impl = reinterpret_cast<OptimisticTransaction*>(txn);
+
+  txn_impl->Reinitialize(this, write_options, txn_options);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h
new file mode 100644
index 000000000..88e86ea4a
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h
@@ -0,0 +1,88 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <mutex>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
+ public:
+  explicit OptimisticTransactionDBImpl(
+      DB* db, const OptimisticTransactionDBOptions& occ_options,
+      bool take_ownership = true)
+      : OptimisticTransactionDB(db),
+        db_owner_(take_ownership),
+        validate_policy_(occ_options.validate_policy) {
+    if (validate_policy_ == OccValidationPolicy::kValidateParallel) {
+      uint32_t bucket_size = std::max(16u, occ_options.occ_lock_buckets);
+      bucketed_locks_.reserve(bucket_size);
+      for (size_t i = 0; i < bucket_size; ++i) {
+        bucketed_locks_.emplace_back(
+            std::unique_ptr<std::mutex>(new std::mutex));
+      }
+    }
+  }
+
+  ~OptimisticTransactionDBImpl() {
+    // Prevent this stackable from destroying
+    // base db
+    if (!db_owner_) {
+      db_ = nullptr;
+    }
+  }
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const OptimisticTransactionOptions& txn_options,
+                                Transaction* old_txn) override;
+
+  // Transactional `DeleteRange()` is not yet supported.
+  using StackableDB::DeleteRange;
+  virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*,
+                             const Slice&, const Slice&) override {
+    return Status::NotSupported();
+  }
+
+  // Range deletions also must not be snuck into `WriteBatch`es as they are
+  // incompatible with `OptimisticTransactionDB`.
+  virtual Status Write(const WriteOptions& write_opts,
+                       WriteBatch* batch) override {
+    if (batch->HasDeleteRange()) {
+      return Status::NotSupported();
+    }
+    return OptimisticTransactionDB::Write(write_opts, batch);
+  }
+
+  size_t GetLockBucketsSize() const { return bucketed_locks_.size(); }
+
+  OccValidationPolicy GetValidatePolicy() const { return validate_policy_; }
+
+  std::unique_lock<std::mutex> LockBucket(size_t idx);
+
+ private:
+  // NOTE: used in validation phase. Each key is hashed into some
+  // bucket. We then take the lock in the hash value order to avoid deadlock.
+  std::vector<std::unique_ptr<std::mutex>> bucketed_locks_;
+
+  bool db_owner_;
+
+  const OccValidationPolicy validate_policy_;
+
+  void ReinitializeTransaction(Transaction* txn,
+                               const WriteOptions& write_options,
+                               const OptimisticTransactionOptions& txn_options =
+                                   OptimisticTransactionOptions());
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc
new file mode 100644
index 000000000..aa8192c32
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc
@@ -0,0 +1,1491 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/transaction_test_util.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class OptimisticTransactionTest
+    : public testing::Test,
+      public testing::WithParamInterface<OccValidationPolicy> {
+ public:
+  OptimisticTransactionDB* txn_db;
+  std::string dbname;
+  Options options;
+
+  OptimisticTransactionTest() {
+    options.create_if_missing = true;
+    options.max_write_buffer_number = 2;
+    options.max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize;
+    options.merge_operator.reset(new TestPutOperator());
+    dbname = test::PerThreadDBPath("optimistic_transaction_testdb");
+
+    EXPECT_OK(DestroyDB(dbname, options));
+    Open();
+  }
+  ~OptimisticTransactionTest() override {
+    delete txn_db;
+    EXPECT_OK(DestroyDB(dbname, options));
+  }
+
+  void Reopen() {
+    delete txn_db;
+    txn_db = nullptr;
+    Open();
+  }
+
+ private:
+  void Open() {
+    ColumnFamilyOptions cf_options(options);
+    OptimisticTransactionDBOptions occ_opts;
+    occ_opts.validate_policy = GetParam();
+    std::vector<ColumnFamilyDescriptor> column_families;
+    std::vector<ColumnFamilyHandle*> handles;
+    column_families.push_back(
+        ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+    Status s =
+        OptimisticTransactionDB::Open(DBOptions(options), occ_opts, dbname,
+                                      column_families, &handles, &txn_db);
+
+    ASSERT_OK(s);
+    ASSERT_NE(txn_db, nullptr);
+    ASSERT_EQ(handles.size(), 1);
+    delete handles[0];
+  }
+};
+
+TEST_P(OptimisticTransactionTest, SuccessTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+  ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+  ASSERT_EQ(value, "bar");
+
+  ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+  ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  ASSERT_OK(txn->Commit());
+
+  ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, WriteConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+  ASSERT_OK(txn_db->Put(write_options, "foo2", "bar"));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  ASSERT_OK(txn->Put("foo", "bar2"));
+
+  // This Put outside of a transaction will conflict with the previous write
+  ASSERT_OK(txn_db->Put(write_options, "foo", "barz"));
+
+  ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "barz");
+  ASSERT_EQ(1, txn->GetNumKeys());
+
+  Status s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());  // Txn should not commit
+
+  // Verify that transaction did not write anything
+  ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "barz");
+  ASSERT_OK(txn_db->Get(read_options, "foo2", &value));
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, WriteConflictTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+  ASSERT_OK(txn_db->Put(write_options, "foo2", "bar"));
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
+  ASSERT_NE(txn, nullptr);
+
+  // This Put outside of a transaction will conflict with a later write
+  ASSERT_OK(txn_db->Put(write_options, "foo", "barz"));
+
+  ASSERT_OK(txn->Put(
+      "foo", "bar2"));  // Conflicts with write done after snapshot taken
+
+  ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "barz");
+
+  Status s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());  // Txn should not commit
+
+  // Verify that transaction did not write anything
+  ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "barz");
+  ASSERT_OK(txn_db->Get(read_options, "foo2", &value));
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, WriteConflictTest3) {
+  ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar"));
+
+  Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+  ASSERT_NE(txn, nullptr);
+
+  std::string value;
+  ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value));
+  ASSERT_EQ(value, "bar");
+  ASSERT_OK(txn->Merge("foo", "bar3"));
+
+  // Merge outside of a transaction should conflict with the previous merge
+  ASSERT_OK(txn_db->Merge(WriteOptions(), "foo", "bar2"));
+  ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  ASSERT_EQ(1, txn->GetNumKeys());
+
+  Status s = txn->Commit();
+  EXPECT_TRUE(s.IsBusy());  // Txn should not commit
+
+  // Verify that transaction did not write anything
+  ASSERT_OK(txn_db->Get(ReadOptions(), "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, WriteConflict4) {
+  ASSERT_OK(txn_db->Put(WriteOptions(), "foo", "bar"));
+
+  Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+  ASSERT_NE(txn, nullptr);
+
+  std::string value;
+  ASSERT_OK(txn->GetForUpdate(ReadOptions(), "foo", &value));
+  ASSERT_EQ(value, "bar");
+  ASSERT_OK(txn->Merge("foo", "bar3"));
+
+  // Range delete outside of a transaction should conflict with the previous
+  // merge inside txn
+  auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB());
+  ColumnFamilyHandle* default_cf = dbimpl->DefaultColumnFamily();
+  ASSERT_OK(dbimpl->DeleteRange(WriteOptions(), default_cf, "foo", "foo1"));
+  Status s = txn_db->Get(ReadOptions(), "foo", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_EQ(1, txn->GetNumKeys());
+
+  s = txn->Commit();
+  EXPECT_TRUE(s.IsBusy());  // Txn should not commit
+
+  // Verify that transaction did not write anything
+  s = txn_db->Get(ReadOptions(), "foo", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, ReadConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+  ASSERT_OK(txn_db->Put(write_options, "foo2", "bar"));
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
+  ASSERT_NE(txn, nullptr);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+  ASSERT_EQ(value, "bar");
+
+  // This Put outside of a transaction will conflict with the previous read
+  ASSERT_OK(txn_db->Put(write_options, "foo", "barz"));
+
+  ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "barz");
+
+  Status s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());  // Txn should not commit
+
+  // Verify that transaction did not write anything
+  ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+  ASSERT_EQ(value, "barz");
+  ASSERT_OK(txn->GetForUpdate(read_options, "foo2", &value));
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, TxnOnlyTest) {
+  // Test to make sure transactions work when there are no other writes in an
+  // empty db.
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  ASSERT_OK(txn->Put("x", "y"));
+
+  ASSERT_OK(txn->Commit());
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, FlushTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+  ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+  ASSERT_EQ(value, "bar");
+
+  ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  // Put a random key so we have a memtable to flush
+  ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy"));
+
+  // force a memtable flush
+  FlushOptions flush_ops;
+  ASSERT_OK(txn_db->Flush(flush_ops));
+
+  // txn should commit since the flushed table is still in MemtableList History
+  ASSERT_OK(txn->Commit());
+
+  ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, FlushTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+  ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+  ASSERT_EQ(value, "bar");
+
+  ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  // Put a random key so we have a MemTable to flush
+  ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy"));
+
+  // force a memtable flush
+  FlushOptions flush_ops;
+  ASSERT_OK(txn_db->Flush(flush_ops));
+
+  // Put a random key so we have a MemTable to flush
+  ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy2"));
+
+  // force a memtable flush
+  ASSERT_OK(txn_db->Flush(flush_ops));
+
+  ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy3"));
+
+  // force a memtable flush
+  // Since our test db has max_write_buffer_number=2, this flush will cause
+  // the first memtable to get purged from the MemtableList history.
+  ASSERT_OK(txn_db->Flush(flush_ops));
+
+  Status s = txn->Commit();
+  // txn should not commit since MemTableList History is not large enough
+  ASSERT_TRUE(s.IsTryAgain());
+
+  ASSERT_OK(txn_db->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+}
+
+// Trigger the condition where some old memtables are skipped when doing
+// TransactionUtil::CheckKey(), and make sure the result is still correct.
+TEST_P(OptimisticTransactionTest, CheckKeySkipOldMemtable) {
+  const int kAttemptHistoryMemtable = 0;
+  const int kAttemptImmMemTable = 1;
+  for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
+       attempt++) {
+    Reopen();
+
+    WriteOptions write_options;
+    ReadOptions read_options;
+    ReadOptions snapshot_read_options;
+    ReadOptions snapshot_read_options2;
+    std::string value;
+
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar")));
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+    Transaction* txn = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn != nullptr);
+
+    Transaction* txn2 = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn2 != nullptr);
+
+    snapshot_read_options.snapshot = txn->GetSnapshot();
+    ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+    ASSERT_EQ(value, "bar");
+    ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+    snapshot_read_options2.snapshot = txn2->GetSnapshot();
+    ASSERT_OK(txn2->GetForUpdate(snapshot_read_options2, "foo2", &value));
+    ASSERT_EQ(value, "bar");
+    ASSERT_OK(txn2->Put(Slice("foo2"), Slice("bar2")));
+
+    // txn updates "foo" and txn2 updates "foo2", and now a write is
+    // issued for "foo", which conflicts with txn but not txn2
+    ASSERT_OK(txn_db->Put(write_options, "foo", "bar"));
+
+    if (attempt == kAttemptImmMemTable) {
+      // For the second attempt, hold flush from beginning. The memtable
+      // will be switched to immutable after calling TEST_SwitchMemtable()
+      // while CheckKey() is called.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"OptimisticTransactionTest.CheckKeySkipOldMemtable",
+            "FlushJob::Start"}});
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    }
+
+    // force a memtable flush. The memtable should still be kept
+    FlushOptions flush_ops;
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_OK(txn_db->Flush(flush_ops));
+    } else {
+      ASSERT_EQ(attempt, kAttemptImmMemTable);
+      DBImpl* db_impl = static_cast<DBImpl*>(txn_db->GetRootDB());
+      ASSERT_OK(db_impl->TEST_SwitchMemtable());
+    }
+    uint64_t num_imm_mems;
+    ASSERT_TRUE(txn_db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+                                       &num_imm_mems));
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(0, num_imm_mems);
+    } else {
+      ASSERT_EQ(attempt, kAttemptImmMemTable);
+      ASSERT_EQ(1, num_imm_mems);
+    }
+
+    // Put something in active memtable
+    ASSERT_OK(txn_db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+    // Create txn3 after flushing, when this transaction is commited,
+    // only need to check the active memtable
+    Transaction* txn3 = txn_db->BeginTransaction(write_options);
+    ASSERT_TRUE(txn3 != nullptr);
+
+    // Commit both of txn and txn2. txn will conflict but txn2 will
+    // pass. In both ways, both memtables are queried.
+    SetPerfLevel(PerfLevel::kEnableCount);
+
+    get_perf_context()->Reset();
+    Status s = txn->Commit();
+    // We should have checked two memtables
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    // txn should fail because of conflict, even if the memtable
+    // has flushed, because it is still preserved in history.
+    ASSERT_TRUE(s.IsBusy());
+
+    get_perf_context()->Reset();
+    s = txn2->Commit();
+    // We should have checked two memtables
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    ASSERT_TRUE(s.ok());
+
+    ASSERT_OK(txn3->Put(Slice("foo2"), Slice("bar2")));
+    get_perf_context()->Reset();
+    s = txn3->Commit();
+    // txn3 is created after the active memtable is created, so that is the only
+    // memtable to check.
+    ASSERT_EQ(1, get_perf_context()->get_from_memtable_count);
+    ASSERT_TRUE(s.ok());
+
+    TEST_SYNC_POINT("OptimisticTransactionTest.CheckKeySkipOldMemtable");
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+    SetPerfLevel(PerfLevel::kDisable);
+
+    delete txn;
+    delete txn2;
+    delete txn3;
+  }
+}
+
+TEST_P(OptimisticTransactionTest, NoSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, "AAA", "bar"));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  // Modify key after transaction start
+  ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1"));
+
+  // Read and write without a snapshot
+  ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+  ASSERT_EQ(value, "bar1");
+  ASSERT_OK(txn->Put("AAA", "bar2"));
+
+  // Should commit since read/write was done after data changed
+  ASSERT_OK(txn->Commit());
+
+  ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, MultipleSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, "AAA", "bar"));
+  ASSERT_OK(txn_db->Put(write_options, "BBB", "bar"));
+  ASSERT_OK(txn_db->Put(write_options, "CCC", "bar"));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  ASSERT_OK(txn_db->Put(write_options, "AAA", "bar1"));
+
+  // Read and write without a snapshot
+  ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+  ASSERT_EQ(value, "bar1");
+  ASSERT_OK(txn->Put("AAA", "bar2"));
+
+  // Modify BBB before snapshot is taken
+  ASSERT_OK(txn_db->Put(write_options, "BBB", "bar1"));
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value));
+  ASSERT_EQ(value, "bar1");
+  ASSERT_OK(txn->Put("BBB", "bar2"));
+
+  ASSERT_OK(txn_db->Put(write_options, "CCC", "bar1"));
+
+  // Set a new snapshot
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value));
+  ASSERT_EQ(value, "bar1");
+  ASSERT_OK(txn->Put("CCC", "bar2"));
+
+  ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+  ASSERT_EQ(value, "bar2");
+  ASSERT_OK(txn->GetForUpdate(read_options, "BBB", &value));
+  ASSERT_EQ(value, "bar2");
+  ASSERT_OK(txn->GetForUpdate(read_options, "CCC", &value));
+  ASSERT_EQ(value, "bar2");
+
+  ASSERT_OK(txn_db->Get(read_options, "AAA", &value));
+  ASSERT_EQ(value, "bar1");
+  ASSERT_OK(txn_db->Get(read_options, "BBB", &value));
+  ASSERT_EQ(value, "bar1");
+  ASSERT_OK(txn_db->Get(read_options, "CCC", &value));
+  ASSERT_EQ(value, "bar1");
+
+  ASSERT_OK(txn->Commit());
+
+  ASSERT_OK(txn_db->Get(read_options, "AAA", &value));
+  ASSERT_EQ(value, "bar2");
+  ASSERT_OK(txn_db->Get(read_options, "BBB", &value));
+  ASSERT_EQ(value, "bar2");
+  ASSERT_OK(txn_db->Get(read_options, "CCC", &value));
+  ASSERT_EQ(value, "bar2");
+
+  // verify that we track multiple writes to the same key at different snapshots
+  delete txn;
+  txn = txn_db->BeginTransaction(write_options);
+
+  // Potentially conflicting writes
+  ASSERT_OK(txn_db->Put(write_options, "ZZZ", "zzz"));
+  ASSERT_OK(txn_db->Put(write_options, "XXX", "xxx"));
+
+  txn->SetSnapshot();
+
+  OptimisticTransactionOptions txn_options;
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  txn2->SetSnapshot();
+
+  // This should not conflict in txn since the snapshot is later than the
+  // previous write (spoiler alert:  it will later conflict with txn2).
+  ASSERT_OK(txn->Put("ZZZ", "zzzz"));
+  ASSERT_OK(txn->Commit());
+
+  delete txn;
+
+  // This will conflict since the snapshot is earlier than another write to ZZZ
+  ASSERT_OK(txn2->Put("ZZZ", "xxxxx"));
+
+  Status s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn2;
+}
+
+TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  ColumnFamilyHandle *cfa, *cfb;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFA", &cfa));
+  ASSERT_OK(txn_db->CreateColumnFamily(cf_options, "CFB", &cfb));
+
+  delete cfa;
+  delete cfb;
+  delete txn_db;
+  txn_db = nullptr;
+
+  // open DB with three column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column families
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(OptimisticTransactionDB::Open(options, dbname, column_families,
+                                          &handles, &txn_db));
+  assert(txn_db != nullptr);
+  ASSERT_NE(txn_db, nullptr);
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // Write some data to the db
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("foo", "foo"));
+  ASSERT_OK(batch.Put(handles[1], "AAA", "bar"));
+  ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar"));
+  ASSERT_OK(txn_db->Write(write_options, &batch));
+  ASSERT_OK(txn_db->Delete(write_options, handles[1], "AAAZZZ"));
+
+  // These keys do no conflict with existing writes since they're in
+  // different column families
+  ASSERT_OK(txn->Delete("AAA"));
+  Status s =
+      txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  Slice key_slice("AAAZZZ");
+  Slice value_slices[2] = {Slice("bar"), Slice("bar")};
+  ASSERT_OK(txn->Put(handles[2], SliceParts(&key_slice, 1),
+                     SliceParts(value_slices, 2)));
+
+  ASSERT_EQ(3, txn->GetNumKeys());
+
+  // Txn should commit
+  ASSERT_OK(txn->Commit());
+  s = txn_db->Get(read_options, "AAA", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn_db->Get(read_options, handles[2], "AAAZZZ", &value);
+  ASSERT_EQ(value, "barbar");
+
+  Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
+  Slice value_slice("barbarbar");
+  // This write will cause a conflict with the earlier batch write
+  ASSERT_OK(txn2->Put(handles[1], SliceParts(key_slices, 3),
+                      SliceParts(&value_slice, 1)));
+
+  ASSERT_OK(txn2->Delete(handles[2], "XXX"));
+  ASSERT_OK(txn2->Delete(handles[1], "XXX"));
+  s = txn2->GetForUpdate(snapshot_read_options, handles[1], "AAA", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Verify txn did not commit
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  s = txn_db->Get(read_options, handles[1], "AAAZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ(value, "barbar");
+
+  delete txn;
+  delete txn2;
+
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  ASSERT_NE(txn, nullptr);
+
+  std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
+                                                   handles[0], handles[2]};
+  std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
+  std::vector<std::string> values(4);
+
+  std::vector<Status> results = txn->MultiGetForUpdate(
+      snapshot_read_options, multiget_cfh, multiget_keys, &values);
+  ASSERT_OK(results[0]);
+  ASSERT_OK(results[1]);
+  ASSERT_OK(results[2]);
+  ASSERT_TRUE(results[3].IsNotFound());
+  ASSERT_EQ(values[0], "bar");
+  ASSERT_EQ(values[1], "barbar");
+  ASSERT_EQ(values[2], "foo");
+
+  ASSERT_OK(txn->Delete(handles[2], "ZZZ"));
+  ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYY"));
+  ASSERT_OK(txn->Put(handles[2], "ZZZ", "YYYY"));
+  ASSERT_OK(txn->Delete(handles[2], "ZZZ"));
+  ASSERT_OK(txn->Put(handles[2], "AAAZZZ", "barbarbar"));
+
+  ASSERT_EQ(5, txn->GetNumKeys());
+
+  // Txn should commit
+  ASSERT_OK(txn->Commit());
+  s = txn_db->Get(read_options, handles[2], "ZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Put a key which will conflict with the next txn using the previous snapshot
+  ASSERT_OK(txn_db->Put(write_options, handles[2], "foo", "000"));
+
+  results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
+                                    multiget_keys, &values);
+  ASSERT_OK(results[0]);
+  ASSERT_OK(results[1]);
+  ASSERT_OK(results[2]);
+  ASSERT_TRUE(results[3].IsNotFound());
+  ASSERT_EQ(values[0], "bar");
+  ASSERT_EQ(values[1], "barbar");
+  ASSERT_EQ(values[2], "foo");
+
+  // Verify Txn Did not Commit
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn_db->DropColumnFamily(handles[1]);
+  ASSERT_OK(s);
+  s = txn_db->DropColumnFamily(handles[2]);
+  ASSERT_OK(s);
+
+  delete txn;
+  delete txn2;
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+TEST_P(OptimisticTransactionTest, EmptyTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, "aaa", "aaa"));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn->Commit());
+  delete txn;
+
+  txn = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn->Rollback());
+  delete txn;
+
+  txn = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value));
+  ASSERT_EQ(value, "aaa");
+
+  ASSERT_OK(txn->Commit());
+  delete txn;
+
+  txn = txn_db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+  ASSERT_OK(txn->GetForUpdate(read_options, "aaa", &value));
+  ASSERT_EQ(value, "aaa");
+
+  ASSERT_OK(txn_db->Put(write_options, "aaa", "xxx"));
+  Status s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, PredicateManyPreceders) {
+  WriteOptions write_options;
+  ReadOptions read_options1, read_options2;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  txn_options.set_snapshot = true;
+  Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  Transaction* txn2 = txn_db->BeginTransaction(write_options);
+  txn2->SetSnapshot();
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  std::vector<Slice> multiget_keys = {"1", "2", "3"};
+  std::vector<std::string> multiget_values;
+
+  std::vector<Status> results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_TRUE(results[0].IsNotFound());
+  ASSERT_TRUE(results[1].IsNotFound());
+  ASSERT_TRUE(results[2].IsNotFound());
+
+  ASSERT_OK(txn2->Put("2", "x"));
+
+  ASSERT_OK(txn2->Commit());
+
+  multiget_values.clear();
+  results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_TRUE(results[0].IsNotFound());
+  ASSERT_TRUE(results[1].IsNotFound());
+  ASSERT_TRUE(results[2].IsNotFound());
+
+  // should not commit since txn2 wrote a key txn has read
+  Status s = txn1->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  ASSERT_OK(txn1->Put("4", "x"));
+
+  ASSERT_OK(txn2->Delete("4"));
+
+  // txn1 can commit since txn2's delete hasn't happened yet (it's just batched)
+  ASSERT_OK(txn1->Commit());
+
+  s = txn2->GetForUpdate(read_options2, "4", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // txn2 cannot commit since txn1 changed "4"
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(OptimisticTransactionTest, LostUpdate) {
+  WriteOptions write_options;
+  ReadOptions read_options, read_options1, read_options2;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  // Test 2 transactions writing to the same key in multiple orders and
+  // with/without snapshots
+
+  Transaction* txn1 = txn_db->BeginTransaction(write_options);
+  Transaction* txn2 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->Put("1", "1"));
+  ASSERT_OK(txn2->Put("1", "2"));
+
+  ASSERT_OK(txn1->Commit());
+
+  Status s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+
+  txn_options.set_snapshot = true;
+  txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  ASSERT_OK(txn1->Put("1", "3"));
+  ASSERT_OK(txn2->Put("1", "4"));
+
+  ASSERT_OK(txn1->Commit());
+
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  ASSERT_OK(txn1->Put("1", "5"));
+  ASSERT_OK(txn1->Commit());
+
+  ASSERT_OK(txn2->Put("1", "6"));
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  ASSERT_OK(txn1->Put("1", "5"));
+  ASSERT_OK(txn1->Commit());
+
+  txn2->SetSnapshot();
+  ASSERT_OK(txn2->Put("1", "6"));
+  ASSERT_OK(txn2->Commit());
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+  txn2 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->Put("1", "7"));
+  ASSERT_OK(txn1->Commit());
+
+  ASSERT_OK(txn2->Put("1", "8"));
+  ASSERT_OK(txn2->Commit());
+
+  delete txn1;
+  delete txn2;
+
+  ASSERT_OK(txn_db->Get(read_options, "1", &value));
+  ASSERT_EQ(value, "8");
+}
+
+TEST_P(OptimisticTransactionTest, UntrackedWrites) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  // Verify transaction rollback works for untracked keys.
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn->PutUntracked("untracked", "0"));
+  ASSERT_OK(txn->Rollback());
+  s = txn_db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  txn = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn->Put("tracked", "1"));
+  ASSERT_OK(txn->PutUntracked("untracked", "1"));
+  ASSERT_OK(txn->MergeUntracked("untracked", "2"));
+  ASSERT_OK(txn->DeleteUntracked("untracked"));
+
+  // Write to the untracked key outside of the transaction and verify
+  // it doesn't prevent the transaction from committing.
+  ASSERT_OK(txn_db->Put(write_options, "untracked", "x"));
+
+  ASSERT_OK(txn->Commit());
+
+  s = txn_db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  txn = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn->Put("tracked", "10"));
+  ASSERT_OK(txn->PutUntracked("untracked", "A"));
+
+  // Write to tracked key outside of the transaction and verify that the
+  // untracked keys are not written when the commit fails.
+  ASSERT_OK(txn_db->Delete(write_options, "tracked"));
+
+  s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn_db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, IteratorTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  // Write some keys to the db
+  ASSERT_OK(txn_db->Put(write_options, "A", "a"));
+  ASSERT_OK(txn_db->Put(write_options, "G", "g"));
+  ASSERT_OK(txn_db->Put(write_options, "F", "f"));
+  ASSERT_OK(txn_db->Put(write_options, "C", "c"));
+  ASSERT_OK(txn_db->Put(write_options, "D", "d"));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  // Write some keys in a txn
+  ASSERT_OK(txn->Put("B", "b"));
+  ASSERT_OK(txn->Put("H", "h"));
+  ASSERT_OK(txn->Delete("D"));
+  ASSERT_OK(txn->Put("E", "e"));
+
+  txn->SetSnapshot();
+  const Snapshot* snapshot = txn->GetSnapshot();
+
+  // Write some keys to the db after the snapshot
+  ASSERT_OK(txn_db->Put(write_options, "BB", "xx"));
+  ASSERT_OK(txn_db->Put(write_options, "C", "xx"));
+
+  read_options.snapshot = snapshot;
+  Iterator* iter = txn->GetIterator(read_options);
+  ASSERT_OK(iter->status());
+  iter->SeekToFirst();
+
+  // Read all keys via iter and lock them all
+  std::string results[] = {"a", "b", "c", "e", "f", "g", "h"};
+  for (int i = 0; i < 7; i++) {
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(results[i], iter->value().ToString());
+
+    ASSERT_OK(txn->GetForUpdate(read_options, iter->key(), nullptr));
+
+    iter->Next();
+  }
+  ASSERT_FALSE(iter->Valid());
+
+  iter->Seek("G");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("g", iter->value().ToString());
+
+  iter->Prev();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("f", iter->value().ToString());
+
+  iter->Seek("D");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("e", iter->value().ToString());
+
+  iter->Seek("C");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("c", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("e", iter->value().ToString());
+
+  iter->Seek("");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("a", iter->value().ToString());
+
+  iter->Seek("X");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+
+  iter->SeekToLast();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("h", iter->value().ToString());
+
+  // key "C" was modified in the db after txn's snapshot.  txn will not commit.
+  Status s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete iter;
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, DeleteRangeSupportTest) {
+  // `OptimisticTransactionDB` does not allow range deletion in any API.
+  ASSERT_TRUE(
+      txn_db
+          ->DeleteRange(WriteOptions(), txn_db->DefaultColumnFamily(), "a", "b")
+          .IsNotSupported());
+  WriteBatch wb;
+  ASSERT_OK(wb.DeleteRange("a", "b"));
+  ASSERT_NOK(txn_db->Write(WriteOptions(), &wb));
+}
+
+TEST_P(OptimisticTransactionTest, SavepointTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  Status s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn->SetSavePoint();  // 1
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to beginning of txn
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn->Put("B", "b"));
+
+  ASSERT_OK(txn->Commit());
+
+  ASSERT_OK(txn_db->Get(read_options, "B", &value));
+  ASSERT_EQ("b", value);
+
+  delete txn;
+  txn = txn_db->BeginTransaction(write_options);
+  ASSERT_NE(txn, nullptr);
+
+  ASSERT_OK(txn->Put("A", "a"));
+  ASSERT_OK(txn->Put("B", "bb"));
+  ASSERT_OK(txn->Put("C", "c"));
+
+  txn->SetSavePoint();  // 2
+
+  ASSERT_OK(txn->Delete("B"));
+  ASSERT_OK(txn->Put("C", "cc"));
+  ASSERT_OK(txn->Put("D", "d"));
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 2
+
+  ASSERT_OK(txn->Get(read_options, "A", &value));
+  ASSERT_EQ("a", value);
+  ASSERT_OK(txn->Get(read_options, "B", &value));
+  ASSERT_EQ("bb", value);
+  ASSERT_OK(txn->Get(read_options, "C", &value));
+  ASSERT_EQ("c", value);
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn->Put("A", "a"));
+  ASSERT_OK(txn->Put("E", "e"));
+
+  // Rollback to beginning of txn
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_OK(txn->Rollback());
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_OK(txn->Get(read_options, "B", &value));
+  ASSERT_EQ("b", value);
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn->Put("A", "aa"));
+  ASSERT_OK(txn->Put("F", "f"));
+
+  txn->SetSavePoint();  // 3
+  txn->SetSavePoint();  // 4
+
+  ASSERT_OK(txn->Put("G", "g"));
+  ASSERT_OK(txn->Delete("F"));
+  ASSERT_OK(txn->Delete("B"));
+
+  ASSERT_OK(txn->Get(read_options, "A", &value));
+  ASSERT_EQ("aa", value);
+
+  s = txn->Get(read_options, "F", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 3
+
+  ASSERT_OK(txn->Get(read_options, "F", &value));
+  ASSERT_EQ("f", value);
+
+  s = txn->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn->Commit());
+
+  ASSERT_OK(txn_db->Get(read_options, "F", &value));
+  ASSERT_EQ("f", value);
+
+  s = txn_db->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn_db->Get(read_options, "A", &value));
+  ASSERT_EQ("aa", value);
+
+  ASSERT_OK(txn_db->Get(read_options, "B", &value));
+  ASSERT_EQ("b", value);
+
+  s = txn_db->Get(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn_db->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn_db->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_P(OptimisticTransactionTest, UndoGetForUpdateTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  ASSERT_OK(txn_db->Put(write_options, "A", ""));
+
+  Transaction* txn1 = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->UndoGetForUpdate("A");
+
+  Transaction* txn2 = txn_db->BeginTransaction(write_options);
+  txn2->Put("A", "x");
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 can commit since A isn't conflict checked
+  ASSERT_OK(txn1->Commit());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn1->Put("A", "a"));
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 cannot commit since A will still be conflict checked
+  Status s = txn1->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 cannot commit since A will still be conflict checked
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 can commit since A isn't conflict checked
+  ASSERT_OK(txn1->Commit());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->SetSavePoint();
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 cannot commit since A will still be conflict checked
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->SetSavePoint();
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 cannot commit since A will still be conflict checked
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  delete txn1;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+
+  txn1->SetSavePoint();
+  ASSERT_OK(txn1->GetForUpdate(read_options, "A", &value));
+  txn1->UndoGetForUpdate("A");
+
+  ASSERT_OK(txn1->RollbackToSavePoint());
+  txn1->UndoGetForUpdate("A");
+
+  txn2 = txn_db->BeginTransaction(write_options);
+  ASSERT_OK(txn2->Put("A", "x"));
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+
+  // Verify that txn1 can commit since A isn't conflict checked
+  ASSERT_OK(txn1->Commit());
+  delete txn1;
+}
+
+namespace {
+Status OptimisticTransactionStressTestInserter(OptimisticTransactionDB* db,
+                                               const size_t num_transactions,
+                                               const size_t num_sets,
+                                               const size_t num_keys_per_set) {
+  size_t seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+  Random64 _rand(seed);
+  WriteOptions write_options;
+  ReadOptions read_options;
+  OptimisticTransactionOptions txn_options;
+  txn_options.set_snapshot = true;
+
+  RandomTransactionInserter inserter(&_rand, write_options, read_options,
+                                     num_keys_per_set,
+                                     static_cast<uint16_t>(num_sets));
+
+  for (size_t t = 0; t < num_transactions; t++) {
+    bool success = inserter.OptimisticTransactionDBInsert(db, txn_options);
+    if (!success) {
+      // unexpected failure
+      return inserter.GetLastStatus();
+    }
+  }
+
+  inserter.GetLastStatus().PermitUncheckedError();
+
+  // Make sure at least some of the transactions succeeded.  It's ok if
+  // some failed due to write-conflicts.
+  if (inserter.GetFailureCount() > num_transactions / 2) {
+    return Status::TryAgain("Too many transactions failed! " +
+                            std::to_string(inserter.GetFailureCount()) + " / " +
+                            std::to_string(num_transactions));
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+TEST_P(OptimisticTransactionTest, OptimisticTransactionStressTest) {
+  const size_t num_threads = 4;
+  const size_t num_transactions_per_thread = 10000;
+  const size_t num_sets = 3;
+  const size_t num_keys_per_set = 100;
+  // Setting the key-space to be 100 keys should cause enough write-conflicts
+  // to make this test interesting.
+
+  std::vector<port::Thread> threads;
+
+  std::function<void()> call_inserter = [&] {
+    ASSERT_OK(OptimisticTransactionStressTestInserter(
+        txn_db, num_transactions_per_thread, num_sets, num_keys_per_set));
+  };
+
+  // Create N threads that use RandomTransactionInserter to write
+  // many transactions.
+  for (uint32_t i = 0; i < num_threads; i++) {
+    threads.emplace_back(call_inserter);
+  }
+
+  // Wait for all threads to run
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // Verify that data is consistent
+  Status s = RandomTransactionInserter::Verify(txn_db, num_sets);
+  ASSERT_OK(s);
+}
+
+TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverTest) {
+  WriteOptions write_options;
+  OptimisticTransactionOptions transaction_options;
+
+  Transaction* transaction(
+      txn_db->BeginTransaction(write_options, transaction_options));
+  Status s = transaction->Put("foo", "val");
+  ASSERT_OK(s);
+  s = transaction->Put("foo2", "val");
+  ASSERT_OK(s);
+  s = transaction->Put("foo3", "val");
+  ASSERT_OK(s);
+  s = transaction->Commit();
+  ASSERT_OK(s);
+  delete transaction;
+
+  Reopen();
+  transaction = txn_db->BeginTransaction(write_options, transaction_options);
+  s = transaction->Put("bar", "val");
+  ASSERT_OK(s);
+  s = transaction->Put("bar2", "val");
+  ASSERT_OK(s);
+  s = transaction->Commit();
+  ASSERT_OK(s);
+
+  delete transaction;
+}
+
+TEST_P(OptimisticTransactionTest, TimestampedSnapshotMissingCommitTs) {
+  std::unique_ptr<Transaction> txn(txn_db->BeginTransaction(WriteOptions()));
+  ASSERT_OK(txn->Put("a", "v"));
+  Status s = txn->CommitAndTryCreateSnapshot();
+  ASSERT_TRUE(s.IsInvalidArgument());
+}
+
+TEST_P(OptimisticTransactionTest, TimestampedSnapshotSetCommitTs) {
+  std::unique_ptr<Transaction> txn(txn_db->BeginTransaction(WriteOptions()));
+  ASSERT_OK(txn->Put("a", "v"));
+  std::shared_ptr<const Snapshot> snapshot;
+  Status s = txn->CommitAndTryCreateSnapshot(nullptr, /*ts=*/100, &snapshot);
+  ASSERT_TRUE(s.IsNotSupported());
+}
+
+INSTANTIATE_TEST_CASE_P(
+    InstanceOccGroup, OptimisticTransactionTest,
+    testing::Values(OccValidationPolicy::kValidateSerial,
+                    OccValidationPolicy::kValidateParallel));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(
+      stderr,
+      "SKIPPED as optimistic_transaction is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction.cc b/src/rocksdb/utilities/transactions/pessimistic_transaction.cc
new file mode 100644
index 000000000..cb8fd3bb6
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/pessimistic_transaction.cc
@@ -0,0 +1,1175 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/pessimistic_transaction.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_util.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct WriteOptions;
+
+std::atomic<TransactionID> PessimisticTransaction::txn_id_counter_(1);
+
+TransactionID PessimisticTransaction::GenTxnID() {
+  return txn_id_counter_.fetch_add(1);
+}
+
+PessimisticTransaction::PessimisticTransaction(
+    TransactionDB* txn_db, const WriteOptions& write_options,
+    const TransactionOptions& txn_options, const bool init)
+    : TransactionBaseImpl(
+          txn_db->GetRootDB(), write_options,
+          static_cast_with_check<PessimisticTransactionDB>(txn_db)
+              ->GetLockTrackerFactory()),
+      txn_db_impl_(nullptr),
+      expiration_time_(0),
+      txn_id_(0),
+      waiting_cf_id_(0),
+      waiting_key_(nullptr),
+      lock_timeout_(0),
+      deadlock_detect_(false),
+      deadlock_detect_depth_(0),
+      skip_concurrency_control_(false) {
+  txn_db_impl_ = static_cast_with_check<PessimisticTransactionDB>(txn_db);
+  db_impl_ = static_cast_with_check<DBImpl>(db_);
+  if (init) {
+    Initialize(txn_options);
+  }
+}
+
+void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
+  // Range lock manager uses address of transaction object as TXNID
+  const TransactionDBOptions& db_options = txn_db_impl_->GetTxnDBOptions();
+  if (db_options.lock_mgr_handle &&
+      db_options.lock_mgr_handle->getLockManager()->IsRangeLockSupported()) {
+    txn_id_ = reinterpret_cast<TransactionID>(this);
+  } else {
+    txn_id_ = GenTxnID();
+  }
+
+  txn_state_ = STARTED;
+
+  deadlock_detect_ = txn_options.deadlock_detect;
+  deadlock_detect_depth_ = txn_options.deadlock_detect_depth;
+  write_batch_.SetMaxBytes(txn_options.max_write_batch_size);
+  skip_concurrency_control_ = txn_options.skip_concurrency_control;
+
+  lock_timeout_ = txn_options.lock_timeout * 1000;
+  if (lock_timeout_ < 0) {
+    // Lock timeout not set, use default
+    lock_timeout_ =
+        txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout * 1000;
+  }
+
+  if (txn_options.expiration >= 0) {
+    expiration_time_ = start_time_ + txn_options.expiration * 1000;
+  } else {
+    expiration_time_ = 0;
+  }
+
+  if (txn_options.set_snapshot) {
+    SetSnapshot();
+  }
+
+  if (expiration_time_ > 0) {
+    txn_db_impl_->InsertExpirableTransaction(txn_id_, this);
+  }
+  use_only_the_last_commit_time_batch_for_recovery_ =
+      txn_options.use_only_the_last_commit_time_batch_for_recovery;
+  skip_prepare_ = txn_options.skip_prepare;
+
+  read_timestamp_ = kMaxTxnTimestamp;
+  commit_timestamp_ = kMaxTxnTimestamp;
+}
+
+PessimisticTransaction::~PessimisticTransaction() {
+  txn_db_impl_->UnLock(this, *tracked_locks_);
+  if (expiration_time_ > 0) {
+    txn_db_impl_->RemoveExpirableTransaction(txn_id_);
+  }
+  if (!name_.empty() && txn_state_ != COMMITTED) {
+    txn_db_impl_->UnregisterTransaction(this);
+  }
+}
+
+void PessimisticTransaction::Clear() {
+  txn_db_impl_->UnLock(this, *tracked_locks_);
+  TransactionBaseImpl::Clear();
+}
+
+void PessimisticTransaction::Reinitialize(
+    TransactionDB* txn_db, const WriteOptions& write_options,
+    const TransactionOptions& txn_options) {
+  if (!name_.empty() && txn_state_ != COMMITTED) {
+    txn_db_impl_->UnregisterTransaction(this);
+  }
+  TransactionBaseImpl::Reinitialize(txn_db->GetRootDB(), write_options);
+  Initialize(txn_options);
+}
+
+bool PessimisticTransaction::IsExpired() const {
+  if (expiration_time_ > 0) {
+    if (dbimpl_->GetSystemClock()->NowMicros() >= expiration_time_) {
+      // Transaction is expired.
+      return true;
+    }
+  }
+
+  return false;
+}
+
+WriteCommittedTxn::WriteCommittedTxn(TransactionDB* txn_db,
+                                     const WriteOptions& write_options,
+                                     const TransactionOptions& txn_options)
+    : PessimisticTransaction(txn_db, write_options, txn_options) {}
+
+Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options,
+                                       ColumnFamilyHandle* column_family,
+                                       const Slice& key, std::string* value,
+                                       bool exclusive, const bool do_validate) {
+  return GetForUpdateImpl(read_options, column_family, key, value, exclusive,
+                          do_validate);
+}
+
+Status WriteCommittedTxn::GetForUpdate(const ReadOptions& read_options,
+                                       ColumnFamilyHandle* column_family,
+                                       const Slice& key,
+                                       PinnableSlice* pinnable_val,
+                                       bool exclusive, const bool do_validate) {
+  return GetForUpdateImpl(read_options, column_family, key, pinnable_val,
+                          exclusive, do_validate);
+}
+
+template <typename TValue>
+inline Status WriteCommittedTxn::GetForUpdateImpl(
+    const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+    const Slice& key, TValue* value, bool exclusive, const bool do_validate) {
+  column_family =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+  assert(column_family);
+  if (!read_options.timestamp) {
+    const Comparator* const ucmp = column_family->GetComparator();
+    assert(ucmp);
+    size_t ts_sz = ucmp->timestamp_size();
+    if (0 == ts_sz) {
+      return TransactionBaseImpl::GetForUpdate(read_options, column_family, key,
+                                               value, exclusive, do_validate);
+    }
+  } else {
+    Status s = db_impl_->FailIfTsMismatchCf(
+        column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (!do_validate) {
+    return Status::InvalidArgument(
+        "If do_validate is false then GetForUpdate with read_timestamp is not "
+        "defined.");
+  } else if (kMaxTxnTimestamp == read_timestamp_) {
+    return Status::InvalidArgument("read_timestamp must be set for validation");
+  }
+
+  if (!read_options.timestamp) {
+    ReadOptions read_opts_copy = read_options;
+    char ts_buf[sizeof(kMaxTxnTimestamp)];
+    EncodeFixed64(ts_buf, read_timestamp_);
+    Slice ts(ts_buf, sizeof(ts_buf));
+    read_opts_copy.timestamp = &ts;
+    return TransactionBaseImpl::GetForUpdate(read_opts_copy, column_family, key,
+                                             value, exclusive, do_validate);
+  }
+  assert(read_options.timestamp);
+  const char* const ts_buf = read_options.timestamp->data();
+  assert(read_options.timestamp->size() == sizeof(kMaxTxnTimestamp));
+  TxnTimestamp ts = DecodeFixed64(ts_buf);
+  if (ts != read_timestamp_) {
+    return Status::InvalidArgument("Must read from the same read_timestamp");
+  }
+  return TransactionBaseImpl::GetForUpdate(read_options, column_family, key,
+                                           value, exclusive, do_validate);
+}
+
+Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family,
+                              const Slice& key, const Slice& value,
+                              const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  return Operate(column_family, key, do_validate, assume_tracked,
+                 [column_family, &key, &value, this]() {
+                   Status s =
+                       GetBatchForWrite()->Put(column_family, key, value);
+                   if (s.ok()) {
+                     ++num_puts_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::Put(ColumnFamilyHandle* column_family,
+                              const SliceParts& key, const SliceParts& value,
+                              const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  return Operate(column_family, key, do_validate, assume_tracked,
+                 [column_family, &key, &value, this]() {
+                   Status s =
+                       GetBatchForWrite()->Put(column_family, key, value);
+                   if (s.ok()) {
+                     ++num_puts_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family,
+                                       const Slice& key, const Slice& value) {
+  return Operate(
+      column_family, key, /*do_validate=*/false,
+      /*assume_tracked=*/false, [column_family, &key, &value, this]() {
+        Status s = GetBatchForWrite()->Put(column_family, key, value);
+        if (s.ok()) {
+          ++num_puts_;
+        }
+        return s;
+      });
+}
+
+Status WriteCommittedTxn::PutUntracked(ColumnFamilyHandle* column_family,
+                                       const SliceParts& key,
+                                       const SliceParts& value) {
+  return Operate(
+      column_family, key, /*do_validate=*/false,
+      /*assume_tracked=*/false, [column_family, &key, &value, this]() {
+        Status s = GetBatchForWrite()->Put(column_family, key, value);
+        if (s.ok()) {
+          ++num_puts_;
+        }
+        return s;
+      });
+}
+
+Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family,
+                                 const Slice& key, const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  return Operate(column_family, key, do_validate, assume_tracked,
+                 [column_family, &key, this]() {
+                   Status s = GetBatchForWrite()->Delete(column_family, key);
+                   if (s.ok()) {
+                     ++num_deletes_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::Delete(ColumnFamilyHandle* column_family,
+                                 const SliceParts& key,
+                                 const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  return Operate(column_family, key, do_validate, assume_tracked,
+                 [column_family, &key, this]() {
+                   Status s = GetBatchForWrite()->Delete(column_family, key);
+                   if (s.ok()) {
+                     ++num_deletes_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family,
+                                          const Slice& key) {
+  return Operate(column_family, key, /*do_validate=*/false,
+                 /*assume_tracked=*/false, [column_family, &key, this]() {
+                   Status s = GetBatchForWrite()->Delete(column_family, key);
+                   if (s.ok()) {
+                     ++num_deletes_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::DeleteUntracked(ColumnFamilyHandle* column_family,
+                                          const SliceParts& key) {
+  return Operate(column_family, key, /*do_validate=*/false,
+                 /*assume_tracked=*/false, [column_family, &key, this]() {
+                   Status s = GetBatchForWrite()->Delete(column_family, key);
+                   if (s.ok()) {
+                     ++num_deletes_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+                                       const Slice& key,
+                                       const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  return Operate(column_family, key, do_validate, assume_tracked,
+                 [column_family, &key, this]() {
+                   Status s =
+                       GetBatchForWrite()->SingleDelete(column_family, key);
+                   if (s.ok()) {
+                     ++num_deletes_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+                                       const SliceParts& key,
+                                       const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  return Operate(column_family, key, do_validate, assume_tracked,
+                 [column_family, &key, this]() {
+                   Status s =
+                       GetBatchForWrite()->SingleDelete(column_family, key);
+                   if (s.ok()) {
+                     ++num_deletes_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::SingleDeleteUntracked(
+    ColumnFamilyHandle* column_family, const Slice& key) {
+  return Operate(column_family, key, /*do_validate=*/false,
+                 /*assume_tracked=*/false, [column_family, &key, this]() {
+                   Status s =
+                       GetBatchForWrite()->SingleDelete(column_family, key);
+                   if (s.ok()) {
+                     ++num_deletes_;
+                   }
+                   return s;
+                 });
+}
+
+Status WriteCommittedTxn::Merge(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& value,
+                                const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  return Operate(column_family, key, do_validate, assume_tracked,
+                 [column_family, &key, &value, this]() {
+                   Status s =
+                       GetBatchForWrite()->Merge(column_family, key, value);
+                   if (s.ok()) {
+                     ++num_merges_;
+                   }
+                   return s;
+                 });
+}
+
+template <typename TKey, typename TOperation>
+Status WriteCommittedTxn::Operate(ColumnFamilyHandle* column_family,
+                                  const TKey& key, const bool do_validate,
+                                  const bool assume_tracked,
+                                  TOperation&& operation) {
+  Status s;
+  if constexpr (std::is_same_v<Slice, TKey>) {
+    s = TryLock(column_family, key, /*read_only=*/false, /*exclusive=*/true,
+                do_validate, assume_tracked);
+  } else if constexpr (std::is_same_v<SliceParts, TKey>) {
+    std::string key_buf;
+    Slice contiguous_key(key, &key_buf);
+    s = TryLock(column_family, contiguous_key, /*read_only=*/false,
+                /*exclusive=*/true, do_validate, assume_tracked);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  column_family =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (ts_sz > 0) {
+    assert(ts_sz == sizeof(TxnTimestamp));
+    if (!IndexingEnabled()) {
+      cfs_with_ts_tracked_when_indexing_disabled_.insert(
+          column_family->GetID());
+    }
+  }
+  return operation();
+}
+
+Status WriteCommittedTxn::SetReadTimestampForValidation(TxnTimestamp ts) {
+  if (read_timestamp_ < kMaxTxnTimestamp && ts < read_timestamp_) {
+    return Status::InvalidArgument(
+        "Cannot decrease read timestamp for validation");
+  }
+  read_timestamp_ = ts;
+  return Status::OK();
+}
+
+Status WriteCommittedTxn::SetCommitTimestamp(TxnTimestamp ts) {
+  if (read_timestamp_ < kMaxTxnTimestamp && ts <= read_timestamp_) {
+    return Status::InvalidArgument(
+        "Cannot commit at timestamp smaller than or equal to read timestamp");
+  }
+  commit_timestamp_ = ts;
+  return Status::OK();
+}
+
+Status PessimisticTransaction::CommitBatch(WriteBatch* batch) {
+  if (batch && WriteBatchInternal::HasKeyWithTimestamp(*batch)) {
+    // CommitBatch() needs to lock the keys in the batch.
+    // However, the application also needs to specify the timestamp for the
+    // keys in batch before calling this API.
+    // This means timestamp order may violate the order of locking, thus
+    // violate the sequence number order for the same user key.
+    // Therefore, we disallow this operation for now.
+    return Status::NotSupported(
+        "Batch to commit includes timestamp assigned before locking");
+  }
+
+  std::unique_ptr<LockTracker> keys_to_unlock(lock_tracker_factory_.Create());
+  Status s = LockBatch(batch, keys_to_unlock.get());
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  bool can_commit = false;
+
+  if (IsExpired()) {
+    s = Status::Expired();
+  } else if (expiration_time_ > 0) {
+    TransactionState expected = STARTED;
+    can_commit = std::atomic_compare_exchange_strong(&txn_state_, &expected,
+                                                     AWAITING_COMMIT);
+  } else if (txn_state_ == STARTED) {
+    // lock stealing is not a concern
+    can_commit = true;
+  }
+
+  if (can_commit) {
+    txn_state_.store(AWAITING_COMMIT);
+    s = CommitBatchInternal(batch);
+    if (s.ok()) {
+      txn_state_.store(COMMITTED);
+    }
+  } else if (txn_state_ == LOCKS_STOLEN) {
+    s = Status::Expired();
+  } else {
+    s = Status::InvalidArgument("Transaction is not in state for commit.");
+  }
+
+  txn_db_impl_->UnLock(this, *keys_to_unlock);
+
+  return s;
+}
+
+Status PessimisticTransaction::Prepare() {
+  if (name_.empty()) {
+    return Status::InvalidArgument(
+        "Cannot prepare a transaction that has not been named.");
+  }
+
+  if (IsExpired()) {
+    return Status::Expired();
+  }
+
+  Status s;
+  bool can_prepare = false;
+
+  if (expiration_time_ > 0) {
+    // must concern ourselves with expiraton and/or lock stealing
+    // need to compare/exchange bc locks could be stolen under us here
+    TransactionState expected = STARTED;
+    can_prepare = std::atomic_compare_exchange_strong(&txn_state_, &expected,
+                                                      AWAITING_PREPARE);
+  } else if (txn_state_ == STARTED) {
+    // expiration and lock stealing is not possible
+    txn_state_.store(AWAITING_PREPARE);
+    can_prepare = true;
+  }
+
+  if (can_prepare) {
+    // transaction can't expire after preparation
+    expiration_time_ = 0;
+    assert(log_number_ == 0 ||
+           txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED);
+
+    s = PrepareInternal();
+    if (s.ok()) {
+      txn_state_.store(PREPARED);
+    }
+  } else if (txn_state_ == LOCKS_STOLEN) {
+    s = Status::Expired();
+  } else if (txn_state_ == PREPARED) {
+    s = Status::InvalidArgument("Transaction has already been prepared.");
+  } else if (txn_state_ == COMMITTED) {
+    s = Status::InvalidArgument("Transaction has already been committed.");
+  } else if (txn_state_ == ROLLEDBACK) {
+    s = Status::InvalidArgument("Transaction has already been rolledback.");
+  } else {
+    s = Status::InvalidArgument("Transaction is not in state for commit.");
+  }
+
+  return s;
+}
+
+Status WriteCommittedTxn::PrepareInternal() {
+  WriteOptions write_options = write_options_;
+  write_options.disableWAL = false;
+  auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+                                              name_);
+  assert(s.ok());
+  class MarkLogCallback : public PreReleaseCallback {
+   public:
+    MarkLogCallback(DBImpl* db, bool two_write_queues)
+        : db_(db), two_write_queues_(two_write_queues) {
+      (void)two_write_queues_;  // to silence unused private field warning
+    }
+    virtual Status Callback(SequenceNumber, bool is_mem_disabled,
+                            uint64_t log_number, size_t /*index*/,
+                            size_t /*total*/) override {
+#ifdef NDEBUG
+      (void)is_mem_disabled;
+#endif
+      assert(log_number != 0);
+      assert(!two_write_queues_ || is_mem_disabled);  // implies the 2nd queue
+      db_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection(log_number);
+      return Status::OK();
+    }
+
+   private:
+    DBImpl* db_;
+    bool two_write_queues_;
+  } mark_log_callback(db_impl_,
+                      db_impl_->immutable_db_options().two_write_queues);
+
+  WriteCallback* const kNoWriteCallback = nullptr;
+  const uint64_t kRefNoLog = 0;
+  const bool kDisableMemtable = true;
+  SequenceNumber* const KIgnoreSeqUsed = nullptr;
+  const size_t kNoBatchCount = 0;
+  s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+                          kNoWriteCallback, &log_number_, kRefNoLog,
+                          kDisableMemtable, KIgnoreSeqUsed, kNoBatchCount,
+                          &mark_log_callback);
+  return s;
+}
+
+Status PessimisticTransaction::Commit() {
+  bool commit_without_prepare = false;
+  bool commit_prepared = false;
+
+  if (IsExpired()) {
+    return Status::Expired();
+  }
+
+  if (expiration_time_ > 0) {
+    // we must atomicaly compare and exchange the state here because at
+    // this state in the transaction it is possible for another thread
+    // to change our state out from under us in the even that we expire and have
+    // our locks stolen. In this case the only valid state is STARTED because
+    // a state of PREPARED would have a cleared expiration_time_.
+    TransactionState expected = STARTED;
+    commit_without_prepare = std::atomic_compare_exchange_strong(
+        &txn_state_, &expected, AWAITING_COMMIT);
+    TEST_SYNC_POINT("TransactionTest::ExpirableTransactionDataRace:1");
+  } else if (txn_state_ == PREPARED) {
+    // expiration and lock stealing is not a concern
+    commit_prepared = true;
+  } else if (txn_state_ == STARTED) {
+    // expiration and lock stealing is not a concern
+    if (skip_prepare_) {
+      commit_without_prepare = true;
+    } else {
+      return Status::TxnNotPrepared();
+    }
+  }
+
+  Status s;
+  if (commit_without_prepare) {
+    assert(!commit_prepared);
+    if (WriteBatchInternal::Count(GetCommitTimeWriteBatch()) > 0) {
+      s = Status::InvalidArgument(
+          "Commit-time batch contains values that will not be committed.");
+    } else {
+      txn_state_.store(AWAITING_COMMIT);
+      if (log_number_ > 0) {
+        dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+            log_number_);
+      }
+      s = CommitWithoutPrepareInternal();
+      if (!name_.empty()) {
+        txn_db_impl_->UnregisterTransaction(this);
+      }
+      Clear();
+      if (s.ok()) {
+        txn_state_.store(COMMITTED);
+      }
+    }
+  } else if (commit_prepared) {
+    txn_state_.store(AWAITING_COMMIT);
+
+    s = CommitInternal();
+
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                     "Commit write failed");
+      return s;
+    }
+
+    // FindObsoleteFiles must now look to the memtables
+    // to determine what prep logs must be kept around,
+    // not the prep section heap.
+    assert(log_number_ > 0);
+    dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+        log_number_);
+    txn_db_impl_->UnregisterTransaction(this);
+
+    Clear();
+    txn_state_.store(COMMITTED);
+  } else if (txn_state_ == LOCKS_STOLEN) {
+    s = Status::Expired();
+  } else if (txn_state_ == COMMITTED) {
+    s = Status::InvalidArgument("Transaction has already been committed.");
+  } else if (txn_state_ == ROLLEDBACK) {
+    s = Status::InvalidArgument("Transaction has already been rolledback.");
+  } else {
+    s = Status::InvalidArgument("Transaction is not in state for commit.");
+  }
+
+  return s;
+}
+
+Status WriteCommittedTxn::CommitWithoutPrepareInternal() {
+  WriteBatchWithIndex* wbwi = GetWriteBatch();
+  assert(wbwi);
+  WriteBatch* wb = wbwi->GetWriteBatch();
+  assert(wb);
+
+  const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb);
+  if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) {
+    return Status::InvalidArgument("Must assign a commit timestamp");
+  }
+
+  if (needs_ts) {
+    assert(commit_timestamp_ != kMaxTxnTimestamp);
+    char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
+    EncodeFixed64(commit_ts_buf, commit_timestamp_);
+    Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf));
+
+    Status s =
+        wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
+          auto cf_iter = cfs_with_ts_tracked_when_indexing_disabled_.find(cf);
+          if (cf_iter != cfs_with_ts_tracked_when_indexing_disabled_.end()) {
+            return sizeof(kMaxTxnTimestamp);
+          }
+          const Comparator* ucmp =
+              WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
+          return ucmp ? ucmp->timestamp_size()
+                      : std::numeric_limits<uint64_t>::max();
+        });
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  uint64_t seq_used = kMaxSequenceNumber;
+  SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_,
+                                                snapshot_notifier_, snapshot_);
+  PostMemTableCallback* post_mem_cb = nullptr;
+  if (snapshot_needed_) {
+    if (commit_timestamp_ == kMaxTxnTimestamp) {
+      return Status::InvalidArgument("Must set transaction commit timestamp");
+    } else {
+      post_mem_cb = &snapshot_creation_cb;
+    }
+  }
+  auto s = db_impl_->WriteImpl(write_options_, wb,
+                               /*callback*/ nullptr, /*log_used*/ nullptr,
+                               /*log_ref*/ 0, /*disable_memtable*/ false,
+                               &seq_used, /*batch_cnt=*/0,
+                               /*pre_release_callback=*/nullptr, post_mem_cb);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (s.ok()) {
+    SetId(seq_used);
+  }
+  return s;
+}
+
+Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch, size_t) {
+  uint64_t seq_used = kMaxSequenceNumber;
+  auto s = db_impl_->WriteImpl(write_options_, batch, /*callback*/ nullptr,
+                               /*log_used*/ nullptr, /*log_ref*/ 0,
+                               /*disable_memtable*/ false, &seq_used);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (s.ok()) {
+    SetId(seq_used);
+  }
+  return s;
+}
+
+Status WriteCommittedTxn::CommitInternal() {
+  WriteBatchWithIndex* wbwi = GetWriteBatch();
+  assert(wbwi);
+  WriteBatch* wb = wbwi->GetWriteBatch();
+  assert(wb);
+
+  const bool needs_ts = WriteBatchInternal::HasKeyWithTimestamp(*wb);
+  if (needs_ts && commit_timestamp_ == kMaxTxnTimestamp) {
+    return Status::InvalidArgument("Must assign a commit timestamp");
+  }
+  // We take the commit-time batch and append the Commit marker.
+  // The Memtable will ignore the Commit marker in non-recovery mode
+  WriteBatch* working_batch = GetCommitTimeWriteBatch();
+
+  Status s;
+  if (!needs_ts) {
+    s = WriteBatchInternal::MarkCommit(working_batch, name_);
+  } else {
+    assert(commit_timestamp_ != kMaxTxnTimestamp);
+    char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
+    EncodeFixed64(commit_ts_buf, commit_timestamp_);
+    Slice commit_ts(commit_ts_buf, sizeof(commit_ts_buf));
+    s = WriteBatchInternal::MarkCommitWithTimestamp(working_batch, name_,
+                                                    commit_ts);
+    if (s.ok()) {
+      s = wb->UpdateTimestamps(commit_ts, [wbwi, this](uint32_t cf) -> size_t {
+        if (cfs_with_ts_tracked_when_indexing_disabled_.find(cf) !=
+            cfs_with_ts_tracked_when_indexing_disabled_.end()) {
+          return sizeof(kMaxTxnTimestamp);
+        }
+        const Comparator* ucmp =
+            WriteBatchWithIndexInternal::GetUserComparator(*wbwi, cf);
+        return ucmp ? ucmp->timestamp_size()
+                    : std::numeric_limits<uint64_t>::max();
+      });
+    }
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // any operations appended to this working_batch will be ignored from WAL
+  working_batch->MarkWalTerminationPoint();
+
+  // insert prepared batch into Memtable only skipping WAL.
+  // Memtable will ignore BeginPrepare/EndPrepare markers
+  // in non recovery mode and simply insert the values
+  s = WriteBatchInternal::Append(working_batch, wb);
+  assert(s.ok());
+
+  uint64_t seq_used = kMaxSequenceNumber;
+  SnapshotCreationCallback snapshot_creation_cb(db_impl_, commit_timestamp_,
+                                                snapshot_notifier_, snapshot_);
+  PostMemTableCallback* post_mem_cb = nullptr;
+  if (snapshot_needed_) {
+    if (commit_timestamp_ == kMaxTxnTimestamp) {
+      s = Status::InvalidArgument("Must set transaction commit timestamp");
+      return s;
+    } else {
+      post_mem_cb = &snapshot_creation_cb;
+    }
+  }
+  s = db_impl_->WriteImpl(write_options_, working_batch, /*callback*/ nullptr,
+                          /*log_used*/ nullptr, /*log_ref*/ log_number_,
+                          /*disable_memtable*/ false, &seq_used,
+                          /*batch_cnt=*/0, /*pre_release_callback=*/nullptr,
+                          post_mem_cb);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (s.ok()) {
+    SetId(seq_used);
+  }
+  return s;
+}
+
+Status PessimisticTransaction::Rollback() {
+  Status s;
+  if (txn_state_ == PREPARED) {
+    txn_state_.store(AWAITING_ROLLBACK);
+
+    s = RollbackInternal();
+
+    if (s.ok()) {
+      // we do not need to keep our prepared section around
+      assert(log_number_ > 0);
+      dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+          log_number_);
+      Clear();
+      txn_state_.store(ROLLEDBACK);
+    }
+  } else if (txn_state_ == STARTED) {
+    if (log_number_ > 0) {
+      assert(txn_db_impl_->GetTxnDBOptions().write_policy == WRITE_UNPREPARED);
+      assert(GetId() > 0);
+      s = RollbackInternal();
+
+      if (s.ok()) {
+        dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+            log_number_);
+      }
+    }
+    // prepare couldn't have taken place
+    Clear();
+  } else if (txn_state_ == COMMITTED) {
+    s = Status::InvalidArgument("This transaction has already been committed.");
+  } else {
+    s = Status::InvalidArgument(
+        "Two phase transaction is not in state for rollback.");
+  }
+
+  return s;
+}
+
+Status WriteCommittedTxn::RollbackInternal() {
+  WriteBatch rollback_marker;
+  auto s = WriteBatchInternal::MarkRollback(&rollback_marker, name_);
+  assert(s.ok());
+  s = db_impl_->WriteImpl(write_options_, &rollback_marker);
+  return s;
+}
+
+Status PessimisticTransaction::RollbackToSavePoint() {
+  if (txn_state_ != STARTED) {
+    return Status::InvalidArgument("Transaction is beyond state for rollback.");
+  }
+
+  if (save_points_ != nullptr && !save_points_->empty()) {
+    // Unlock any keys locked since last transaction
+    auto& save_point_tracker = *save_points_->top().new_locks_;
+    std::unique_ptr<LockTracker> t(
+        tracked_locks_->GetTrackedLocksSinceSavePoint(save_point_tracker));
+    if (t) {
+      txn_db_impl_->UnLock(this, *t);
+    }
+  }
+
+  return TransactionBaseImpl::RollbackToSavePoint();
+}
+
+// Lock all keys in this batch.
+// On success, caller should unlock keys_to_unlock
+Status PessimisticTransaction::LockBatch(WriteBatch* batch,
+                                         LockTracker* keys_to_unlock) {
+  if (!batch) {
+    return Status::InvalidArgument("batch is nullptr");
+  }
+
+  class Handler : public WriteBatch::Handler {
+   public:
+    // Sorted map of column_family_id to sorted set of keys.
+    // Since LockBatch() always locks keys in sorted order, it cannot deadlock
+    // with itself.  We're not using a comparator here since it doesn't matter
+    // what the sorting is as long as it's consistent.
+    std::map<uint32_t, std::set<std::string>> keys_;
+
+    Handler() {}
+
+    void RecordKey(uint32_t column_family_id, const Slice& key) {
+      std::string key_str = key.ToString();
+
+      auto& cfh_keys = keys_[column_family_id];
+      auto iter = cfh_keys.find(key_str);
+      if (iter == cfh_keys.end()) {
+        // key not yet seen, store it.
+        cfh_keys.insert({std::move(key_str)});
+      }
+    }
+
+    Status PutCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& /* unused */) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    Status MergeCF(uint32_t column_family_id, const Slice& key,
+                   const Slice& /* unused */) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+  };
+
+  // Iterating on this handler will add all keys in this batch into keys
+  Handler handler;
+  Status s = batch->Iterate(&handler);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Attempt to lock all keys
+  for (const auto& cf_iter : handler.keys_) {
+    uint32_t cfh_id = cf_iter.first;
+    auto& cfh_keys = cf_iter.second;
+
+    for (const auto& key_iter : cfh_keys) {
+      const std::string& key = key_iter;
+
+      s = txn_db_impl_->TryLock(this, cfh_id, key, true /* exclusive */);
+      if (!s.ok()) {
+        break;
+      }
+      PointLockRequest r;
+      r.column_family_id = cfh_id;
+      r.key = key;
+      r.seq = kMaxSequenceNumber;
+      r.read_only = false;
+      r.exclusive = true;
+      keys_to_unlock->Track(r);
+    }
+
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  if (!s.ok()) {
+    txn_db_impl_->UnLock(this, *keys_to_unlock);
+  }
+
+  return s;
+}
+
+// Attempt to lock this key.
+// Returns OK if the key has been successfully locked.  Non-ok, otherwise.
+// If check_shapshot is true and this transaction has a snapshot set,
+// this key will only be locked if there have been no writes to this key since
+// the snapshot time.
+Status PessimisticTransaction::TryLock(ColumnFamilyHandle* column_family,
+                                       const Slice& key, bool read_only,
+                                       bool exclusive, const bool do_validate,
+                                       const bool assume_tracked) {
+  assert(!assume_tracked || !do_validate);
+  Status s;
+  if (UNLIKELY(skip_concurrency_control_)) {
+    return s;
+  }
+  uint32_t cfh_id = GetColumnFamilyID(column_family);
+  std::string key_str = key.ToString();
+
+  PointLockStatus status;
+  bool lock_upgrade;
+  bool previously_locked;
+  if (tracked_locks_->IsPointLockSupported()) {
+    status = tracked_locks_->GetPointLockStatus(cfh_id, key_str);
+    previously_locked = status.locked;
+    lock_upgrade = previously_locked && exclusive && !status.exclusive;
+  } else {
+    // If the record is tracked, we can assume it was locked, too.
+    previously_locked = assume_tracked;
+    status.locked = false;
+    lock_upgrade = false;
+  }
+
+  // Lock this key if this transactions hasn't already locked it or we require
+  // an upgrade.
+  if (!previously_locked || lock_upgrade) {
+    s = txn_db_impl_->TryLock(this, cfh_id, key_str, exclusive);
+  }
+
+  const ColumnFamilyHandle* const cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+  assert(cfh);
+  const Comparator* const ucmp = cfh->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+
+  SetSnapshotIfNeeded();
+
+  // Even though we do not care about doing conflict checking for this write,
+  // we still need to take a lock to make sure we do not cause a conflict with
+  // some other write.  However, we do not need to check if there have been
+  // any writes since this transaction's snapshot.
+  // TODO(agiardullo): could optimize by supporting shared txn locks in the
+  // future.
+  SequenceNumber tracked_at_seq =
+      status.locked ? status.seq : kMaxSequenceNumber;
+  if (!do_validate || (snapshot_ == nullptr &&
+                       (0 == ts_sz || kMaxTxnTimestamp == read_timestamp_))) {
+    if (assume_tracked && !previously_locked &&
+        tracked_locks_->IsPointLockSupported()) {
+      s = Status::InvalidArgument(
+          "assume_tracked is set but it is not tracked yet");
+    }
+    // Need to remember the earliest sequence number that we know that this
+    // key has not been modified after.  This is useful if this same
+    // transaction later tries to lock this key again.
+    if (tracked_at_seq == kMaxSequenceNumber) {
+      // Since we haven't checked a snapshot, we only know this key has not
+      // been modified since after we locked it.
+      // Note: when last_seq_same_as_publish_seq_==false this is less than the
+      // latest allocated seq but it is ok since i) this is just a heuristic
+      // used only as a hint to avoid actual check for conflicts, ii) this would
+      // cause a false positive only if the snapthot is taken right after the
+      // lock, which would be an unusual sequence.
+      tracked_at_seq = db_->GetLatestSequenceNumber();
+    }
+  } else if (s.ok()) {
+    // If a snapshot is set, we need to make sure the key hasn't been modified
+    // since the snapshot.  This must be done after we locked the key.
+    // If we already have validated an earilier snapshot it must has been
+    // reflected in tracked_at_seq and ValidateSnapshot will return OK.
+    s = ValidateSnapshot(column_family, key, &tracked_at_seq);
+
+    if (!s.ok()) {
+      // Failed to validate key
+      // Unlock key we just locked
+      if (lock_upgrade) {
+        s = txn_db_impl_->TryLock(this, cfh_id, key_str, false /* exclusive */);
+        assert(s.ok());
+      } else if (!previously_locked) {
+        txn_db_impl_->UnLock(this, cfh_id, key.ToString());
+      }
+    }
+  }
+
+  if (s.ok()) {
+    // We must track all the locked keys so that we can unlock them later. If
+    // the key is already locked, this func will update some stats on the
+    // tracked key. It could also update the tracked_at_seq if it is lower
+    // than the existing tracked key seq. These stats are necessary for
+    // RollbackToSavePoint to determine whether a key can be safely removed
+    // from tracked_keys_. Removal can only be done if a key was only locked
+    // during the current savepoint.
+    //
+    // Recall that if assume_tracked is true, we assume that TrackKey has been
+    // called previously since the last savepoint, with the same exclusive
+    // setting, and at a lower sequence number, so skipping here should be
+    // safe.
+    if (!assume_tracked) {
+      TrackKey(cfh_id, key_str, tracked_at_seq, read_only, exclusive);
+    } else {
+#ifndef NDEBUG
+      if (tracked_locks_->IsPointLockSupported()) {
+        PointLockStatus lock_status =
+            tracked_locks_->GetPointLockStatus(cfh_id, key_str);
+        assert(lock_status.locked);
+        assert(lock_status.seq <= tracked_at_seq);
+        assert(lock_status.exclusive == exclusive);
+      }
+#endif
+    }
+  }
+
+  return s;
+}
+
+Status PessimisticTransaction::GetRangeLock(ColumnFamilyHandle* column_family,
+                                            const Endpoint& start_endp,
+                                            const Endpoint& end_endp) {
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+  uint32_t cfh_id = GetColumnFamilyID(cfh);
+
+  Status s = txn_db_impl_->TryRangeLock(this, cfh_id, start_endp, end_endp);
+
+  if (s.ok()) {
+    RangeLockRequest req{cfh_id, start_endp, end_endp};
+    tracked_locks_->Track(req);
+  }
+  return s;
+}
+
+// Return OK() if this key has not been modified more recently than the
+// transaction snapshot_.
+// tracked_at_seq is the global seq at which we either locked the key or already
+// have done ValidateSnapshot.
+Status PessimisticTransaction::ValidateSnapshot(
+    ColumnFamilyHandle* column_family, const Slice& key,
+    SequenceNumber* tracked_at_seq) {
+  assert(snapshot_ || read_timestamp_ < kMaxTxnTimestamp);
+
+  SequenceNumber snap_seq = 0;
+  if (snapshot_) {
+    snap_seq = snapshot_->GetSequenceNumber();
+    if (*tracked_at_seq <= snap_seq) {
+      // If the key has been previous validated (or locked) at a sequence number
+      // earlier than the current snapshot's sequence number, we already know it
+      // has not been modified aftter snap_seq either.
+      return Status::OK();
+    }
+  } else {
+    snap_seq = db_impl_->GetLatestSequenceNumber();
+  }
+
+  // Otherwise we have either
+  // 1: tracked_at_seq == kMaxSequenceNumber, i.e., first time tracking the key
+  // 2: snap_seq < tracked_at_seq: last time we lock the key was via
+  // do_validate=false which means we had skipped ValidateSnapshot. In both
+  // cases we should do ValidateSnapshot now.
+
+  *tracked_at_seq = snap_seq;
+
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+  assert(cfh);
+  const Comparator* const ucmp = cfh->GetComparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  std::string ts_buf;
+  if (ts_sz > 0 && read_timestamp_ < kMaxTxnTimestamp) {
+    assert(ts_sz == sizeof(read_timestamp_));
+    PutFixed64(&ts_buf, read_timestamp_);
+  }
+
+  return TransactionUtil::CheckKeyForConflicts(
+      db_impl_, cfh, key.ToString(), snap_seq, ts_sz == 0 ? nullptr : &ts_buf,
+      false /* cache_only */);
+}
+
+bool PessimisticTransaction::TryStealingLocks() {
+  assert(IsExpired());
+  TransactionState expected = STARTED;
+  return std::atomic_compare_exchange_strong(&txn_state_, &expected,
+                                             LOCKS_STOLEN);
+}
+
+void PessimisticTransaction::UnlockGetForUpdate(
+    ColumnFamilyHandle* column_family, const Slice& key) {
+  txn_db_impl_->UnLock(this, GetColumnFamilyID(column_family), key.ToString());
+}
+
+Status PessimisticTransaction::SetName(const TransactionName& name) {
+  Status s;
+  if (txn_state_ == STARTED) {
+    if (name_.length()) {
+      s = Status::InvalidArgument("Transaction has already been named.");
+    } else if (txn_db_impl_->GetTransactionByName(name) != nullptr) {
+      s = Status::InvalidArgument("Transaction name must be unique.");
+    } else if (name.length() < 1 || name.length() > 512) {
+      s = Status::InvalidArgument(
+          "Transaction name length must be between 1 and 512 chars.");
+    } else {
+      name_ = name;
+      txn_db_impl_->RegisterTransaction(this);
+    }
+  } else {
+    s = Status::InvalidArgument("Transaction is beyond state for naming.");
+  }
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction.h b/src/rocksdb/utilities/transactions/pessimistic_transaction.h
new file mode 100644
index 000000000..d43d1d3ac
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/pessimistic_transaction.h
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+#include <mutex>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/autovector.h"
+#include "utilities/transactions/transaction_base.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PessimisticTransactionDB;
+
+// A transaction under pessimistic concurrency control. This class implements
+// the locking API and interfaces with the lock manager as well as the
+// pessimistic transactional db.
+class PessimisticTransaction : public TransactionBaseImpl {
+ public:
+  PessimisticTransaction(TransactionDB* db, const WriteOptions& write_options,
+                         const TransactionOptions& txn_options,
+                         const bool init = true);
+  // No copying allowed
+  PessimisticTransaction(const PessimisticTransaction&) = delete;
+  void operator=(const PessimisticTransaction&) = delete;
+
+  ~PessimisticTransaction() override;
+
+  void Reinitialize(TransactionDB* txn_db, const WriteOptions& write_options,
+                    const TransactionOptions& txn_options);
+
+  Status Prepare() override;
+
+  Status Commit() override;
+
+  // It is basically Commit without going through Prepare phase. The write batch
+  // is also directly provided instead of expecting txn to gradually batch the
+  // transactions writes to an internal write batch.
+  Status CommitBatch(WriteBatch* batch);
+
+  Status Rollback() override;
+
+  Status RollbackToSavePoint() override;
+
+  Status SetName(const TransactionName& name) override;
+
+  // Generate a new unique transaction identifier
+  static TransactionID GenTxnID();
+
+  TransactionID GetID() const override { return txn_id_; }
+
+  std::vector<TransactionID> GetWaitingTxns(uint32_t* column_family_id,
+                                            std::string* key) const override {
+    std::lock_guard<std::mutex> lock(wait_mutex_);
+    std::vector<TransactionID> ids(waiting_txn_ids_.size());
+    if (key) *key = waiting_key_ ? *waiting_key_ : "";
+    if (column_family_id) *column_family_id = waiting_cf_id_;
+    std::copy(waiting_txn_ids_.begin(), waiting_txn_ids_.end(), ids.begin());
+    return ids;
+  }
+
+  void SetWaitingTxn(autovector<TransactionID> ids, uint32_t column_family_id,
+                     const std::string* key) {
+    std::lock_guard<std::mutex> lock(wait_mutex_);
+    waiting_txn_ids_ = ids;
+    waiting_cf_id_ = column_family_id;
+    waiting_key_ = key;
+  }
+
+  void ClearWaitingTxn() {
+    std::lock_guard<std::mutex> lock(wait_mutex_);
+    waiting_txn_ids_.clear();
+    waiting_cf_id_ = 0;
+    waiting_key_ = nullptr;
+  }
+
+  // Returns the time (in microseconds according to Env->GetMicros())
+  // that this transaction will be expired.  Returns 0 if this transaction does
+  // not expire.
+  uint64_t GetExpirationTime() const { return expiration_time_; }
+
+  // returns true if this transaction has an expiration_time and has expired.
+  bool IsExpired() const;
+
+  // Returns the number of microseconds a transaction can wait on acquiring a
+  // lock or -1 if there is no timeout.
+  int64_t GetLockTimeout() const { return lock_timeout_; }
+  void SetLockTimeout(int64_t timeout) override {
+    lock_timeout_ = timeout * 1000;
+  }
+
+  // Returns true if locks were stolen successfully, false otherwise.
+  bool TryStealingLocks();
+
+  bool IsDeadlockDetect() const override { return deadlock_detect_; }
+
+  int64_t GetDeadlockDetectDepth() const { return deadlock_detect_depth_; }
+
+  virtual Status GetRangeLock(ColumnFamilyHandle* column_family,
+                              const Endpoint& start_key,
+                              const Endpoint& end_key) override;
+
+ protected:
+  // Refer to
+  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery
+  bool use_only_the_last_commit_time_batch_for_recovery_ = false;
+  // Refer to
+  // TransactionOptions::skip_prepare
+  bool skip_prepare_ = false;
+
+  virtual Status PrepareInternal() = 0;
+
+  virtual Status CommitWithoutPrepareInternal() = 0;
+
+  // batch_cnt if non-zero is the number of sub-batches. A sub-batch is a batch
+  // with no duplicate keys. If zero, then the number of sub-batches is unknown.
+  virtual Status CommitBatchInternal(WriteBatch* batch,
+                                     size_t batch_cnt = 0) = 0;
+
+  virtual Status CommitInternal() = 0;
+
+  virtual Status RollbackInternal() = 0;
+
+  virtual void Initialize(const TransactionOptions& txn_options);
+
+  Status LockBatch(WriteBatch* batch, LockTracker* keys_to_unlock);
+
+  Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+                 bool read_only, bool exclusive, const bool do_validate = true,
+                 const bool assume_tracked = false) override;
+
+  void Clear() override;
+
+  PessimisticTransactionDB* txn_db_impl_;
+  DBImpl* db_impl_;
+
+  // If non-zero, this transaction should not be committed after this time (in
+  // microseconds according to Env->NowMicros())
+  uint64_t expiration_time_;
+
+  // Timestamp used by the transaction to perform all GetForUpdate.
+  // Use this timestamp for conflict checking.
+  // read_timestamp_ == kMaxTxnTimestamp means this transaction has not
+  // performed any GetForUpdate. It is possible that the transaction has
+  // performed blind writes or Get, though.
+  TxnTimestamp read_timestamp_{kMaxTxnTimestamp};
+  TxnTimestamp commit_timestamp_{kMaxTxnTimestamp};
+
+ private:
+  friend class TransactionTest_ValidateSnapshotTest_Test;
+  // Used to create unique ids for transactions.
+  static std::atomic<TransactionID> txn_id_counter_;
+
+  // Unique ID for this transaction
+  TransactionID txn_id_;
+
+  // IDs for the transactions that are blocking the current transaction.
+  //
+  // empty if current transaction is not waiting.
+  autovector<TransactionID> waiting_txn_ids_;
+
+  // The following two represents the (cf, key) that a transaction is waiting
+  // on.
+  //
+  // If waiting_key_ is not null, then the pointer should always point to
+  // a valid string object. The reason is that it is only non-null when the
+  // transaction is blocked in the PointLockManager::AcquireWithTimeout
+  // function. At that point, the key string object is one of the function
+  // parameters.
+  uint32_t waiting_cf_id_;
+  const std::string* waiting_key_;
+
+  // Mutex protecting waiting_txn_ids_, waiting_cf_id_ and waiting_key_.
+  mutable std::mutex wait_mutex_;
+
+  // Timeout in microseconds when locking a key or -1 if there is no timeout.
+  int64_t lock_timeout_;
+
+  // Whether to perform deadlock detection or not.
+  bool deadlock_detect_;
+
+  // Whether to perform deadlock detection or not.
+  int64_t deadlock_detect_depth_;
+
+  // Refer to TransactionOptions::skip_concurrency_control
+  bool skip_concurrency_control_;
+
+  virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
+                                  const Slice& key,
+                                  SequenceNumber* tracked_at_seq);
+
+  void UnlockGetForUpdate(ColumnFamilyHandle* column_family,
+                          const Slice& key) override;
+};
+
+class WriteCommittedTxn : public PessimisticTransaction {
+ public:
+  WriteCommittedTxn(TransactionDB* db, const WriteOptions& write_options,
+                    const TransactionOptions& txn_options);
+  // No copying allowed
+  WriteCommittedTxn(const WriteCommittedTxn&) = delete;
+  void operator=(const WriteCommittedTxn&) = delete;
+
+  ~WriteCommittedTxn() override {}
+
+  using TransactionBaseImpl::GetForUpdate;
+  Status GetForUpdate(const ReadOptions& read_options,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      std::string* value, bool exclusive,
+                      const bool do_validate) override;
+  Status GetForUpdate(const ReadOptions& read_options,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      PinnableSlice* pinnable_val, bool exclusive,
+                      const bool do_validate) override;
+
+  using TransactionBaseImpl::Put;
+  // `key` does NOT include timestamp even when it's enabled.
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value, const bool assume_tracked = false) override;
+  Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+             const SliceParts& value,
+             const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::PutUntracked;
+  Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& value) override;
+  Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key,
+                      const SliceParts& value) override;
+
+  using TransactionBaseImpl::Delete;
+  // `key` does NOT include timestamp even when it's enabled.
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                const bool assume_tracked = false) override;
+  Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key,
+                const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::DeleteUntracked;
+  Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                         const Slice& key) override;
+  Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                         const SliceParts& key) override;
+
+  using TransactionBaseImpl::SingleDelete;
+  // `key` does NOT include timestamp even when it's enabled.
+  Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+                      const bool assume_tracked = false) override;
+  Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key,
+                      const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::SingleDeleteUntracked;
+  Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
+                               const Slice& key) override;
+
+  using TransactionBaseImpl::Merge;
+  Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+               const Slice& value, const bool assume_tracked = false) override;
+
+  Status SetReadTimestampForValidation(TxnTimestamp ts) override;
+  Status SetCommitTimestamp(TxnTimestamp ts) override;
+  TxnTimestamp GetCommitTimestamp() const override { return commit_timestamp_; }
+
+ private:
+  template <typename TValue>
+  Status GetForUpdateImpl(const ReadOptions& read_options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          TValue* value, bool exclusive,
+                          const bool do_validate);
+
+  template <typename TKey, typename TOperation>
+  Status Operate(ColumnFamilyHandle* column_family, const TKey& key,
+                 const bool do_validate, const bool assume_tracked,
+                 TOperation&& operation);
+
+  Status PrepareInternal() override;
+
+  Status CommitWithoutPrepareInternal() override;
+
+  Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override;
+
+  Status CommitInternal() override;
+
+  Status RollbackInternal() override;
+
+  // Column families that enable timestamps and whose data are written when
+  // indexing_enabled_ is false. If a key is written when indexing_enabled_ is
+  // true, then the corresponding column family is not added to cfs_with_ts
+  // even if it enables timestamp.
+  std::unordered_set<uint32_t> cfs_with_ts_tracked_when_indexing_disabled_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc
new file mode 100644
index 000000000..950ef8042
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.cc
@@ -0,0 +1,782 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/pessimistic_transaction_db.h"
+
+#include <cinttypes>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+PessimisticTransactionDB::PessimisticTransactionDB(
+    DB* db, const TransactionDBOptions& txn_db_options)
+    : TransactionDB(db),
+      db_impl_(static_cast_with_check<DBImpl>(db)),
+      txn_db_options_(txn_db_options),
+      lock_manager_(NewLockManager(this, txn_db_options)) {
+  assert(db_impl_ != nullptr);
+  info_log_ = db_impl_->GetDBOptions().info_log;
+}
+
+// Support initiliazing PessimisticTransactionDB from a stackable db
+//
+//    PessimisticTransactionDB
+//     ^        ^
+//     |        |
+//     |        +
+//     |   StackableDB
+//     |   ^
+//     |   |
+//     +   +
+//     DBImpl
+//       ^
+//       |(inherit)
+//       +
+//       DB
+//
+PessimisticTransactionDB::PessimisticTransactionDB(
+    StackableDB* db, const TransactionDBOptions& txn_db_options)
+    : TransactionDB(db),
+      db_impl_(static_cast_with_check<DBImpl>(db->GetRootDB())),
+      txn_db_options_(txn_db_options),
+      lock_manager_(NewLockManager(this, txn_db_options)) {
+  assert(db_impl_ != nullptr);
+}
+
+PessimisticTransactionDB::~PessimisticTransactionDB() {
+  while (!transactions_.empty()) {
+    delete transactions_.begin()->second;
+    // TODO(myabandeh): this seems to be an unsafe approach as it is not quite
+    // clear whether delete would also remove the entry from transactions_.
+  }
+}
+
+Status PessimisticTransactionDB::VerifyCFOptions(
+    const ColumnFamilyOptions& cf_options) {
+  const Comparator* const ucmp = cf_options.comparator;
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (0 == ts_sz) {
+    return Status::OK();
+  }
+  if (ts_sz != sizeof(TxnTimestamp)) {
+    std::ostringstream oss;
+    oss << "Timestamp of transaction must have " << sizeof(TxnTimestamp)
+        << " bytes. CF comparator " << std::string(ucmp->Name())
+        << " timestamp size is " << ts_sz << " bytes";
+    return Status::InvalidArgument(oss.str());
+  }
+  if (txn_db_options_.write_policy != WRITE_COMMITTED) {
+    return Status::NotSupported("Only WriteCommittedTxn supports timestamp");
+  }
+  return Status::OK();
+}
+
+Status PessimisticTransactionDB::Initialize(
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles) {
+  for (auto cf_ptr : handles) {
+    AddColumnFamily(cf_ptr);
+  }
+  // Verify cf options
+  for (auto handle : handles) {
+    ColumnFamilyDescriptor cfd;
+    Status s = handle->GetDescriptor(&cfd);
+    if (!s.ok()) {
+      return s;
+    }
+    s = VerifyCFOptions(cfd.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Re-enable compaction for the column families that initially had
+  // compaction enabled.
+  std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
+  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
+  for (auto index : compaction_enabled_cf_indices) {
+    compaction_enabled_cf_handles.push_back(handles[index]);
+  }
+
+  Status s = EnableAutoCompaction(compaction_enabled_cf_handles);
+
+  // create 'real' transactions from recovered shell transactions
+  auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+  assert(dbimpl != nullptr);
+  auto rtrxs = dbimpl->recovered_transactions();
+
+  for (auto it = rtrxs.begin(); it != rtrxs.end(); ++it) {
+    auto recovered_trx = it->second;
+    assert(recovered_trx);
+    assert(recovered_trx->batches_.size() == 1);
+    const auto& seq = recovered_trx->batches_.begin()->first;
+    const auto& batch_info = recovered_trx->batches_.begin()->second;
+    assert(batch_info.log_number_);
+    assert(recovered_trx->name_.length());
+
+    WriteOptions w_options;
+    w_options.sync = true;
+    TransactionOptions t_options;
+    // This would help avoiding deadlock for keys that although exist in the WAL
+    // did not go through concurrency control. This includes the merge that
+    // MyRocks uses for auto-inc columns. It is safe to do so, since (i) if
+    // there is a conflict between the keys of two transactions that must be
+    // avoided, it is already avoided by the application, MyRocks, before the
+    // restart (ii) application, MyRocks, guarntees to rollback/commit the
+    // recovered transactions before new transactions start.
+    t_options.skip_concurrency_control = true;
+
+    Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
+    assert(real_trx);
+    real_trx->SetLogNumber(batch_info.log_number_);
+    assert(seq != kMaxSequenceNumber);
+    if (GetTxnDBOptions().write_policy != WRITE_COMMITTED) {
+      real_trx->SetId(seq);
+    }
+
+    s = real_trx->SetName(recovered_trx->name_);
+    if (!s.ok()) {
+      break;
+    }
+
+    s = real_trx->RebuildFromWriteBatch(batch_info.batch_);
+    // WriteCommitted set this to to disable this check that is specific to
+    // WritePrepared txns
+    assert(batch_info.batch_cnt_ == 0 ||
+           real_trx->GetWriteBatch()->SubBatchCnt() == batch_info.batch_cnt_);
+    real_trx->SetState(Transaction::PREPARED);
+    if (!s.ok()) {
+      break;
+    }
+  }
+  if (s.ok()) {
+    dbimpl->DeleteAllRecoveredTransactions();
+  }
+  return s;
+}
+
+Transaction* WriteCommittedTxnDB::BeginTransaction(
+    const WriteOptions& write_options, const TransactionOptions& txn_options,
+    Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new WriteCommittedTxn(this, write_options, txn_options);
+  }
+}
+
+TransactionDBOptions PessimisticTransactionDB::ValidateTxnDBOptions(
+    const TransactionDBOptions& txn_db_options) {
+  TransactionDBOptions validated = txn_db_options;
+
+  if (txn_db_options.num_stripes == 0) {
+    validated.num_stripes = 1;
+  }
+
+  return validated;
+}
+
+Status TransactionDB::Open(const Options& options,
+                           const TransactionDBOptions& txn_db_options,
+                           const std::string& dbname, TransactionDB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = TransactionDB::Open(db_options, txn_db_options, dbname,
+                                 column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a reference to
+    // default column family
+    delete handles[0];
+  }
+
+  return s;
+}
+
+Status TransactionDB::Open(
+    const DBOptions& db_options, const TransactionDBOptions& txn_db_options,
+    const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, TransactionDB** dbptr) {
+  Status s;
+  DB* db = nullptr;
+  if (txn_db_options.write_policy == WRITE_COMMITTED &&
+      db_options.unordered_write) {
+    return Status::NotSupported(
+        "WRITE_COMMITTED is incompatible with unordered_writes");
+  }
+  if (txn_db_options.write_policy == WRITE_UNPREPARED &&
+      db_options.unordered_write) {
+    // TODO(lth): support it
+    return Status::NotSupported(
+        "WRITE_UNPREPARED is currently incompatible with unordered_writes");
+  }
+  if (txn_db_options.write_policy == WRITE_PREPARED &&
+      db_options.unordered_write && !db_options.two_write_queues) {
+    return Status::NotSupported(
+        "WRITE_PREPARED is incompatible with unordered_writes if "
+        "two_write_queues is not enabled.");
+  }
+
+  std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
+  std::vector<size_t> compaction_enabled_cf_indices;
+  DBOptions db_options_2pc = db_options;
+  PrepareWrap(&db_options_2pc, &column_families_copy,
+              &compaction_enabled_cf_indices);
+  const bool use_seq_per_batch =
+      txn_db_options.write_policy == WRITE_PREPARED ||
+      txn_db_options.write_policy == WRITE_UNPREPARED;
+  const bool use_batch_per_txn =
+      txn_db_options.write_policy == WRITE_COMMITTED ||
+      txn_db_options.write_policy == WRITE_PREPARED;
+  s = DBImpl::Open(db_options_2pc, dbname, column_families_copy, handles, &db,
+                   use_seq_per_batch, use_batch_per_txn);
+  if (s.ok()) {
+    ROCKS_LOG_WARN(db->GetDBOptions().info_log,
+                   "Transaction write_policy is %" PRId32,
+                   static_cast<int>(txn_db_options.write_policy));
+    // if WrapDB return non-ok, db will be deleted in WrapDB() via
+    // ~StackableDB().
+    s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles,
+               dbptr);
+  }
+  return s;
+}
+
+void TransactionDB::PrepareWrap(
+    DBOptions* db_options, std::vector<ColumnFamilyDescriptor>* column_families,
+    std::vector<size_t>* compaction_enabled_cf_indices) {
+  compaction_enabled_cf_indices->clear();
+
+  // Enable MemTable History if not already enabled
+  for (size_t i = 0; i < column_families->size(); i++) {
+    ColumnFamilyOptions* cf_options = &(*column_families)[i].options;
+
+    if (cf_options->max_write_buffer_size_to_maintain == 0 &&
+        cf_options->max_write_buffer_number_to_maintain == 0) {
+      // Setting to -1 will set the History size to
+      // max_write_buffer_number * write_buffer_size.
+      cf_options->max_write_buffer_size_to_maintain = -1;
+    }
+    if (!cf_options->disable_auto_compactions) {
+      // Disable compactions momentarily to prevent race with DB::Open
+      cf_options->disable_auto_compactions = true;
+      compaction_enabled_cf_indices->push_back(i);
+    }
+  }
+  db_options->allow_2pc = true;
+}
+
+namespace {
+template <typename DBType>
+Status WrapAnotherDBInternal(
+    DBType* db, const TransactionDBOptions& txn_db_options,
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+  assert(db != nullptr);
+  assert(dbptr != nullptr);
+  *dbptr = nullptr;
+  std::unique_ptr<PessimisticTransactionDB> txn_db;
+  // txn_db owns object pointed to by the raw db pointer.
+  switch (txn_db_options.write_policy) {
+    case WRITE_UNPREPARED:
+      txn_db.reset(new WriteUnpreparedTxnDB(
+          db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+      break;
+    case WRITE_PREPARED:
+      txn_db.reset(new WritePreparedTxnDB(
+          db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+      break;
+    case WRITE_COMMITTED:
+    default:
+      txn_db.reset(new WriteCommittedTxnDB(
+          db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)));
+  }
+  txn_db->UpdateCFComparatorMap(handles);
+  Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles);
+  // In case of a failure at this point, db is deleted via the txn_db destructor
+  // and set to nullptr.
+  if (s.ok()) {
+    *dbptr = txn_db.release();
+  } else {
+    for (auto* h : handles) {
+      delete h;
+    }
+    // txn_db still owns db, and ~StackableDB() will be called when txn_db goes
+    // out of scope, deleting the input db pointer.
+    ROCKS_LOG_FATAL(db->GetDBOptions().info_log,
+                    "Failed to initialize txn_db: %s", s.ToString().c_str());
+  }
+  return s;
+}
+}  // namespace
+
+Status TransactionDB::WrapDB(
+    // make sure this db is already opened with memtable history enabled,
+    // auto compaction distabled and 2 phase commit enabled
+    DB* db, const TransactionDBOptions& txn_db_options,
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+  return WrapAnotherDBInternal(db, txn_db_options,
+                               compaction_enabled_cf_indices, handles, dbptr);
+}
+
+Status TransactionDB::WrapStackableDB(
+    // make sure this stackable_db is already opened with memtable history
+    // enabled, auto compaction distabled and 2 phase commit enabled
+    StackableDB* db, const TransactionDBOptions& txn_db_options,
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr) {
+  return WrapAnotherDBInternal(db, txn_db_options,
+                               compaction_enabled_cf_indices, handles, dbptr);
+}
+
+// Let LockManager know that this column family exists so it can
+// allocate a LockMap for it.
+void PessimisticTransactionDB::AddColumnFamily(
+    const ColumnFamilyHandle* handle) {
+  lock_manager_->AddColumnFamily(handle);
+}
+
+Status PessimisticTransactionDB::CreateColumnFamily(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    ColumnFamilyHandle** handle) {
+  InstrumentedMutexLock l(&column_family_mutex_);
+  Status s = VerifyCFOptions(options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = db_->CreateColumnFamily(options, column_family_name, handle);
+  if (s.ok()) {
+    lock_manager_->AddColumnFamily(*handle);
+    UpdateCFComparatorMap(*handle);
+  }
+
+  return s;
+}
+
+Status PessimisticTransactionDB::CreateColumnFamilies(
+    const ColumnFamilyOptions& options,
+    const std::vector<std::string>& column_family_names,
+    std::vector<ColumnFamilyHandle*>* handles) {
+  InstrumentedMutexLock l(&column_family_mutex_);
+
+  Status s = VerifyCFOptions(options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = db_->CreateColumnFamilies(options, column_family_names, handles);
+  if (s.ok()) {
+    for (auto* handle : *handles) {
+      lock_manager_->AddColumnFamily(handle);
+      UpdateCFComparatorMap(handle);
+    }
+  }
+
+  return s;
+}
+
+Status PessimisticTransactionDB::CreateColumnFamilies(
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles) {
+  InstrumentedMutexLock l(&column_family_mutex_);
+
+  for (auto& cf_desc : column_families) {
+    Status s = VerifyCFOptions(cf_desc.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  Status s = db_->CreateColumnFamilies(column_families, handles);
+  if (s.ok()) {
+    for (auto* handle : *handles) {
+      lock_manager_->AddColumnFamily(handle);
+      UpdateCFComparatorMap(handle);
+    }
+  }
+
+  return s;
+}
+
+// Let LockManager know that it can deallocate the LockMap for this
+// column family.
+Status PessimisticTransactionDB::DropColumnFamily(
+    ColumnFamilyHandle* column_family) {
+  InstrumentedMutexLock l(&column_family_mutex_);
+
+  Status s = db_->DropColumnFamily(column_family);
+  if (s.ok()) {
+    lock_manager_->RemoveColumnFamily(column_family);
+  }
+
+  return s;
+}
+
+Status PessimisticTransactionDB::DropColumnFamilies(
+    const std::vector<ColumnFamilyHandle*>& column_families) {
+  InstrumentedMutexLock l(&column_family_mutex_);
+
+  Status s = db_->DropColumnFamilies(column_families);
+  if (s.ok()) {
+    for (auto* handle : column_families) {
+      lock_manager_->RemoveColumnFamily(handle);
+    }
+  }
+
+  return s;
+}
+
+Status PessimisticTransactionDB::TryLock(PessimisticTransaction* txn,
+                                         uint32_t cfh_id,
+                                         const std::string& key,
+                                         bool exclusive) {
+  return lock_manager_->TryLock(txn, cfh_id, key, GetEnv(), exclusive);
+}
+
+Status PessimisticTransactionDB::TryRangeLock(PessimisticTransaction* txn,
+                                              uint32_t cfh_id,
+                                              const Endpoint& start_endp,
+                                              const Endpoint& end_endp) {
+  return lock_manager_->TryLock(txn, cfh_id, start_endp, end_endp, GetEnv(),
+                                /*exclusive=*/true);
+}
+
+void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn,
+                                      const LockTracker& keys) {
+  lock_manager_->UnLock(txn, keys, GetEnv());
+}
+
+void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn,
+                                      uint32_t cfh_id, const std::string& key) {
+  lock_manager_->UnLock(txn, cfh_id, key, GetEnv());
+}
+
+// Used when wrapping DB write operations in a transaction
+Transaction* PessimisticTransactionDB::BeginInternalTransaction(
+    const WriteOptions& options) {
+  TransactionOptions txn_options;
+  Transaction* txn = BeginTransaction(options, txn_options, nullptr);
+
+  // Use default timeout for non-transactional writes
+  txn->SetLockTimeout(txn_db_options_.default_lock_timeout);
+  return txn;
+}
+
+// All user Put, Merge, Delete, and Write requests must be intercepted to make
+// sure that they lock all keys that they are writing to avoid causing conflicts
+// with any concurrent transactions. The easiest way to do this is to wrap all
+// write operations in a transaction.
+//
+// Put(), Merge(), and Delete() only lock a single key per call.  Write() will
+// sort its keys before locking them.  This guarantees that TransactionDB write
+// methods cannot deadlock with each other (but still could deadlock with a
+// Transaction).
+Status PessimisticTransactionDB::Put(const WriteOptions& options,
+                                     ColumnFamilyHandle* column_family,
+                                     const Slice& key, const Slice& val) {
+  Status s = FailIfCfEnablesTs(this, column_family);
+  if (!s.ok()) {
+    return s;
+  }
+
+  Transaction* txn = BeginInternalTransaction(options);
+  txn->DisableIndexing();
+
+  // Since the client didn't create a transaction, they don't care about
+  // conflict checking for this write.  So we just need to do PutUntracked().
+  s = txn->PutUntracked(column_family, key, val);
+
+  if (s.ok()) {
+    s = txn->Commit();
+  }
+
+  delete txn;
+
+  return s;
+}
+
+Status PessimisticTransactionDB::Delete(const WriteOptions& wopts,
+                                        ColumnFamilyHandle* column_family,
+                                        const Slice& key) {
+  Status s = FailIfCfEnablesTs(this, column_family);
+  if (!s.ok()) {
+    return s;
+  }
+
+  Transaction* txn = BeginInternalTransaction(wopts);
+  txn->DisableIndexing();
+
+  // Since the client didn't create a transaction, they don't care about
+  // conflict checking for this write.  So we just need to do
+  // DeleteUntracked().
+  s = txn->DeleteUntracked(column_family, key);
+
+  if (s.ok()) {
+    s = txn->Commit();
+  }
+
+  delete txn;
+
+  return s;
+}
+
+Status PessimisticTransactionDB::SingleDelete(const WriteOptions& wopts,
+                                              ColumnFamilyHandle* column_family,
+                                              const Slice& key) {
+  Status s = FailIfCfEnablesTs(this, column_family);
+  if (!s.ok()) {
+    return s;
+  }
+
+  Transaction* txn = BeginInternalTransaction(wopts);
+  txn->DisableIndexing();
+
+  // Since the client didn't create a transaction, they don't care about
+  // conflict checking for this write.  So we just need to do
+  // SingleDeleteUntracked().
+  s = txn->SingleDeleteUntracked(column_family, key);
+
+  if (s.ok()) {
+    s = txn->Commit();
+  }
+
+  delete txn;
+
+  return s;
+}
+
+Status PessimisticTransactionDB::Merge(const WriteOptions& options,
+                                       ColumnFamilyHandle* column_family,
+                                       const Slice& key, const Slice& value) {
+  Status s = FailIfCfEnablesTs(this, column_family);
+  if (!s.ok()) {
+    return s;
+  }
+
+  Transaction* txn = BeginInternalTransaction(options);
+  txn->DisableIndexing();
+
+  // Since the client didn't create a transaction, they don't care about
+  // conflict checking for this write.  So we just need to do
+  // MergeUntracked().
+  s = txn->MergeUntracked(column_family, key, value);
+
+  if (s.ok()) {
+    s = txn->Commit();
+  }
+
+  delete txn;
+
+  return s;
+}
+
+Status PessimisticTransactionDB::Write(const WriteOptions& opts,
+                                       WriteBatch* updates) {
+  return WriteWithConcurrencyControl(opts, updates);
+}
+
+Status WriteCommittedTxnDB::Write(const WriteOptions& opts,
+                                  WriteBatch* updates) {
+  Status s = FailIfBatchHasTs(updates);
+  if (!s.ok()) {
+    return s;
+  }
+  if (txn_db_options_.skip_concurrency_control) {
+    return db_impl_->Write(opts, updates);
+  } else {
+    return WriteWithConcurrencyControl(opts, updates);
+  }
+}
+
+Status WriteCommittedTxnDB::Write(
+    const WriteOptions& opts,
+    const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) {
+  Status s = FailIfBatchHasTs(updates);
+  if (!s.ok()) {
+    return s;
+  }
+  if (optimizations.skip_concurrency_control) {
+    return db_impl_->Write(opts, updates);
+  } else {
+    return WriteWithConcurrencyControl(opts, updates);
+  }
+}
+
+void PessimisticTransactionDB::InsertExpirableTransaction(
+    TransactionID tx_id, PessimisticTransaction* tx) {
+  assert(tx->GetExpirationTime() > 0);
+  std::lock_guard<std::mutex> lock(map_mutex_);
+  expirable_transactions_map_.insert({tx_id, tx});
+}
+
+void PessimisticTransactionDB::RemoveExpirableTransaction(TransactionID tx_id) {
+  std::lock_guard<std::mutex> lock(map_mutex_);
+  expirable_transactions_map_.erase(tx_id);
+}
+
+bool PessimisticTransactionDB::TryStealingExpiredTransactionLocks(
+    TransactionID tx_id) {
+  std::lock_guard<std::mutex> lock(map_mutex_);
+
+  auto tx_it = expirable_transactions_map_.find(tx_id);
+  if (tx_it == expirable_transactions_map_.end()) {
+    return true;
+  }
+  PessimisticTransaction& tx = *(tx_it->second);
+  return tx.TryStealingLocks();
+}
+
+void PessimisticTransactionDB::ReinitializeTransaction(
+    Transaction* txn, const WriteOptions& write_options,
+    const TransactionOptions& txn_options) {
+  auto txn_impl = static_cast_with_check<PessimisticTransaction>(txn);
+
+  txn_impl->Reinitialize(this, write_options, txn_options);
+}
+
+Transaction* PessimisticTransactionDB::GetTransactionByName(
+    const TransactionName& name) {
+  std::lock_guard<std::mutex> lock(name_map_mutex_);
+  auto it = transactions_.find(name);
+  if (it == transactions_.end()) {
+    return nullptr;
+  } else {
+    return it->second;
+  }
+}
+
+void PessimisticTransactionDB::GetAllPreparedTransactions(
+    std::vector<Transaction*>* transv) {
+  assert(transv);
+  transv->clear();
+  std::lock_guard<std::mutex> lock(name_map_mutex_);
+  for (auto it = transactions_.begin(); it != transactions_.end(); ++it) {
+    if (it->second->GetState() == Transaction::PREPARED) {
+      transv->push_back(it->second);
+    }
+  }
+}
+
+LockManager::PointLockStatus PessimisticTransactionDB::GetLockStatusData() {
+  return lock_manager_->GetPointLockStatus();
+}
+
+std::vector<DeadlockPath> PessimisticTransactionDB::GetDeadlockInfoBuffer() {
+  return lock_manager_->GetDeadlockInfoBuffer();
+}
+
+void PessimisticTransactionDB::SetDeadlockInfoBufferSize(uint32_t target_size) {
+  lock_manager_->Resize(target_size);
+}
+
+void PessimisticTransactionDB::RegisterTransaction(Transaction* txn) {
+  assert(txn);
+  assert(txn->GetName().length() > 0);
+  assert(GetTransactionByName(txn->GetName()) == nullptr);
+  assert(txn->GetState() == Transaction::STARTED);
+  std::lock_guard<std::mutex> lock(name_map_mutex_);
+  transactions_[txn->GetName()] = txn;
+}
+
+void PessimisticTransactionDB::UnregisterTransaction(Transaction* txn) {
+  assert(txn);
+  std::lock_guard<std::mutex> lock(name_map_mutex_);
+  auto it = transactions_.find(txn->GetName());
+  assert(it != transactions_.end());
+  transactions_.erase(it);
+}
+
+std::pair<Status, std::shared_ptr<const Snapshot>>
+PessimisticTransactionDB::CreateTimestampedSnapshot(TxnTimestamp ts) {
+  if (kMaxTxnTimestamp == ts) {
+    return std::make_pair(Status::InvalidArgument("invalid ts"), nullptr);
+  }
+  assert(db_impl_);
+  return db_impl_->CreateTimestampedSnapshot(kMaxSequenceNumber, ts);
+}
+
+std::shared_ptr<const Snapshot>
+PessimisticTransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const {
+  assert(db_impl_);
+  return db_impl_->GetTimestampedSnapshot(ts);
+}
+
+void PessimisticTransactionDB::ReleaseTimestampedSnapshotsOlderThan(
+    TxnTimestamp ts) {
+  assert(db_impl_);
+  db_impl_->ReleaseTimestampedSnapshotsOlderThan(ts);
+}
+
+Status PessimisticTransactionDB::GetTimestampedSnapshots(
+    TxnTimestamp ts_lb, TxnTimestamp ts_ub,
+    std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots) const {
+  assert(db_impl_);
+  return db_impl_->GetTimestampedSnapshots(ts_lb, ts_ub, timestamped_snapshots);
+}
+
+Status SnapshotCreationCallback::operator()(SequenceNumber seq,
+                                            bool disable_memtable) {
+  assert(db_impl_);
+  assert(commit_ts_ != kMaxTxnTimestamp);
+
+  const bool two_write_queues =
+      db_impl_->immutable_db_options().two_write_queues;
+  assert(!two_write_queues || !disable_memtable);
+#ifdef NDEBUG
+  (void)two_write_queues;
+  (void)disable_memtable;
+#endif
+
+  const bool seq_per_batch = db_impl_->seq_per_batch();
+  if (!seq_per_batch) {
+    assert(db_impl_->GetLastPublishedSequence() <= seq);
+  } else {
+    assert(db_impl_->GetLastPublishedSequence() < seq);
+  }
+
+  // Create a snapshot which can also be used for write conflict checking.
+  auto ret = db_impl_->CreateTimestampedSnapshot(seq, commit_ts_);
+  snapshot_creation_status_ = ret.first;
+  snapshot_ = ret.second;
+  if (snapshot_creation_status_.ok()) {
+    assert(snapshot_);
+  } else {
+    assert(!snapshot_);
+  }
+  if (snapshot_ && snapshot_notifier_) {
+    snapshot_notifier_->SnapshotCreated(snapshot_.get());
+  }
+  return Status::OK();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h
new file mode 100644
index 000000000..25cd11054
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/pessimistic_transaction_db.h
@@ -0,0 +1,318 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <queue>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/db_iter.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "utilities/transactions/lock/lock_manager.h"
+#include "utilities/transactions/lock/range/range_lock_manager.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/write_prepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PessimisticTransactionDB : public TransactionDB {
+ public:
+  explicit PessimisticTransactionDB(DB* db,
+                                    const TransactionDBOptions& txn_db_options);
+
+  explicit PessimisticTransactionDB(StackableDB* db,
+                                    const TransactionDBOptions& txn_db_options);
+
+  virtual ~PessimisticTransactionDB();
+
+  virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); }
+
+  virtual Status Initialize(
+      const std::vector<size_t>& compaction_enabled_cf_indices,
+      const std::vector<ColumnFamilyHandle*>& handles);
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const TransactionOptions& txn_options,
+                                Transaction* old_txn) override = 0;
+
+  using StackableDB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override;
+
+  using StackableDB::Delete;
+  virtual Status Delete(const WriteOptions& wopts,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override;
+
+  using StackableDB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& wopts,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) override;
+
+  using StackableDB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override;
+
+  using TransactionDB::Write;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+  inline Status WriteWithConcurrencyControl(const WriteOptions& opts,
+                                            WriteBatch* updates) {
+    Status s;
+    if (opts.protection_bytes_per_key > 0) {
+      s = WriteBatchInternal::UpdateProtectionInfo(
+          updates, opts.protection_bytes_per_key);
+    }
+    if (s.ok()) {
+      // Need to lock all keys in this batch to prevent write conflicts with
+      // concurrent transactions.
+      Transaction* txn = BeginInternalTransaction(opts);
+      txn->DisableIndexing();
+
+      auto txn_impl = static_cast_with_check<PessimisticTransaction>(txn);
+
+      // Since commitBatch sorts the keys before locking, concurrent Write()
+      // operations will not cause a deadlock.
+      // In order to avoid a deadlock with a concurrent Transaction,
+      // Transactions should use a lock timeout.
+      s = txn_impl->CommitBatch(updates);
+
+      delete txn;
+    }
+
+    return s;
+  }
+
+  using StackableDB::CreateColumnFamily;
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle) override;
+
+  Status CreateColumnFamilies(
+      const ColumnFamilyOptions& options,
+      const std::vector<std::string>& column_family_names,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+
+  Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& column_families,
+      std::vector<ColumnFamilyHandle*>* handles) override;
+
+  using StackableDB::DropColumnFamily;
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+
+  Status DropColumnFamilies(
+      const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+  Status TryLock(PessimisticTransaction* txn, uint32_t cfh_id,
+                 const std::string& key, bool exclusive);
+  Status TryRangeLock(PessimisticTransaction* txn, uint32_t cfh_id,
+                      const Endpoint& start_endp, const Endpoint& end_endp);
+
+  void UnLock(PessimisticTransaction* txn, const LockTracker& keys);
+  void UnLock(PessimisticTransaction* txn, uint32_t cfh_id,
+              const std::string& key);
+
+  void AddColumnFamily(const ColumnFamilyHandle* handle);
+
+  static TransactionDBOptions ValidateTxnDBOptions(
+      const TransactionDBOptions& txn_db_options);
+
+  const TransactionDBOptions& GetTxnDBOptions() const {
+    return txn_db_options_;
+  }
+
+  void InsertExpirableTransaction(TransactionID tx_id,
+                                  PessimisticTransaction* tx);
+  void RemoveExpirableTransaction(TransactionID tx_id);
+
+  // If transaction is no longer available, locks can be stolen
+  // If transaction is available, try stealing locks directly from transaction
+  // It is the caller's responsibility to ensure that the referred transaction
+  // is expirable (GetExpirationTime() > 0) and that it is expired.
+  bool TryStealingExpiredTransactionLocks(TransactionID tx_id);
+
+  Transaction* GetTransactionByName(const TransactionName& name) override;
+
+  void RegisterTransaction(Transaction* txn);
+  void UnregisterTransaction(Transaction* txn);
+
+  // not thread safe. current use case is during recovery (single thread)
+  void GetAllPreparedTransactions(std::vector<Transaction*>* trans) override;
+
+  LockManager::PointLockStatus GetLockStatusData() override;
+
+  std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
+  void SetDeadlockInfoBufferSize(uint32_t target_size) override;
+
+  // The default implementation does nothing. The actual implementation is moved
+  // to the child classes that actually need this information. This was due to
+  // an odd performance drop we observed when the added std::atomic member to
+  // the base class even when the subclass do not read it in the fast path.
+  virtual void UpdateCFComparatorMap(const std::vector<ColumnFamilyHandle*>&) {}
+  virtual void UpdateCFComparatorMap(ColumnFamilyHandle*) {}
+
+  // Use the returned factory to create LockTrackers in transactions.
+  const LockTrackerFactory& GetLockTrackerFactory() const {
+    return lock_manager_->GetLockTrackerFactory();
+  }
+
+  std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot(
+      TxnTimestamp ts) override;
+
+  std::shared_ptr<const Snapshot> GetTimestampedSnapshot(
+      TxnTimestamp ts) const override;
+
+  void ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts) override;
+
+  Status GetTimestampedSnapshots(TxnTimestamp ts_lb, TxnTimestamp ts_ub,
+                                 std::vector<std::shared_ptr<const Snapshot>>&
+                                     timestamped_snapshots) const override;
+
+ protected:
+  DBImpl* db_impl_;
+  std::shared_ptr<Logger> info_log_;
+  const TransactionDBOptions txn_db_options_;
+
+  static Status FailIfBatchHasTs(const WriteBatch* wb);
+
+  static Status FailIfCfEnablesTs(const DB* db,
+                                  const ColumnFamilyHandle* column_family);
+
+  void ReinitializeTransaction(
+      Transaction* txn, const WriteOptions& write_options,
+      const TransactionOptions& txn_options = TransactionOptions());
+
+  virtual Status VerifyCFOptions(const ColumnFamilyOptions& cf_options);
+
+ private:
+  friend class WritePreparedTxnDB;
+  friend class WritePreparedTxnDBMock;
+  friend class WriteUnpreparedTxn;
+  friend class TransactionTest_DoubleCrashInRecovery_Test;
+  friend class TransactionTest_DoubleEmptyWrite_Test;
+  friend class TransactionTest_DuplicateKeys_Test;
+  friend class TransactionTest_PersistentTwoPhaseTransactionTest_Test;
+  friend class TransactionTest_TwoPhaseDoubleRecoveryTest_Test;
+  friend class TransactionTest_TwoPhaseOutOfOrderDelete_Test;
+  friend class TransactionStressTest_TwoPhaseLongPrepareTest_Test;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+  friend class WriteUnpreparedTransactionTest_MarkLogWithPrepSection_Test;
+
+  Transaction* BeginInternalTransaction(const WriteOptions& options);
+
+  std::shared_ptr<LockManager> lock_manager_;
+
+  // Must be held when adding/dropping column families.
+  InstrumentedMutex column_family_mutex_;
+
+  // Used to ensure that no locks are stolen from an expirable transaction
+  // that has started a commit. Only transactions with an expiration time
+  // should be in this map.
+  std::mutex map_mutex_;
+  std::unordered_map<TransactionID, PessimisticTransaction*>
+      expirable_transactions_map_;
+
+  // map from name to two phase transaction instance
+  std::mutex name_map_mutex_;
+  std::unordered_map<TransactionName, Transaction*> transactions_;
+
+  // Signal that we are testing a crash scenario. Some asserts could be relaxed
+  // in such cases.
+  virtual void TEST_Crash() {}
+};
+
+// A PessimisticTransactionDB that writes the data to the DB after the commit.
+// In this way the DB only contains the committed data.
+class WriteCommittedTxnDB : public PessimisticTransactionDB {
+ public:
+  explicit WriteCommittedTxnDB(DB* db,
+                               const TransactionDBOptions& txn_db_options)
+      : PessimisticTransactionDB(db, txn_db_options) {}
+
+  explicit WriteCommittedTxnDB(StackableDB* db,
+                               const TransactionDBOptions& txn_db_options)
+      : PessimisticTransactionDB(db, txn_db_options) {}
+
+  virtual ~WriteCommittedTxnDB() {}
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const TransactionOptions& txn_options,
+                                Transaction* old_txn) override;
+
+  // Optimized version of ::Write that makes use of skip_concurrency_control
+  // hint
+  using TransactionDB::Write;
+  virtual Status Write(const WriteOptions& opts,
+                       const TransactionDBWriteOptimizations& optimizations,
+                       WriteBatch* updates) override;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+};
+
+inline Status PessimisticTransactionDB::FailIfBatchHasTs(
+    const WriteBatch* batch) {
+  if (batch != nullptr && WriteBatchInternal::HasKeyWithTimestamp(*batch)) {
+    return Status::NotSupported(
+        "Writes with timestamp must go through transaction API instead of "
+        "TransactionDB.");
+  }
+  return Status::OK();
+}
+
+inline Status PessimisticTransactionDB::FailIfCfEnablesTs(
+    const DB* db, const ColumnFamilyHandle* column_family) {
+  assert(db);
+  column_family = column_family ? column_family : db->DefaultColumnFamily();
+  assert(column_family);
+  const Comparator* const ucmp = column_family->GetComparator();
+  assert(ucmp);
+  if (ucmp->timestamp_size() > 0) {
+    return Status::NotSupported(
+        "Write operation with user timestamp must go through the transaction "
+        "API instead of TransactionDB.");
+  }
+  return Status::OK();
+}
+
+class SnapshotCreationCallback : public PostMemTableCallback {
+ public:
+  explicit SnapshotCreationCallback(
+      DBImpl* dbi, TxnTimestamp commit_ts,
+      const std::shared_ptr<TransactionNotifier>& notifier,
+      std::shared_ptr<const Snapshot>& snapshot)
+      : db_impl_(dbi),
+        commit_ts_(commit_ts),
+        snapshot_notifier_(notifier),
+        snapshot_(snapshot) {
+    assert(db_impl_);
+  }
+
+  ~SnapshotCreationCallback() override {
+    snapshot_creation_status_.PermitUncheckedError();
+  }
+
+  Status operator()(SequenceNumber seq, bool disable_memtable) override;
+
+ private:
+  DBImpl* const db_impl_;
+  const TxnTimestamp commit_ts_;
+  std::shared_ptr<TransactionNotifier> snapshot_notifier_;
+  std::shared_ptr<const Snapshot>& snapshot_;
+
+  Status snapshot_creation_status_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/snapshot_checker.cc b/src/rocksdb/utilities/transactions/snapshot_checker.cc
new file mode 100644
index 000000000..76d16681a
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/snapshot_checker.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/snapshot_checker.h"
+
+#ifdef ROCKSDB_LITE
+#include <assert.h>
+#endif  // ROCKSDB_LITE
+
+#include "port/lang.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_LITE
+WritePreparedSnapshotChecker::WritePreparedSnapshotChecker(
+    WritePreparedTxnDB* /*txn_db*/) {}
+
+SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot(
+    SequenceNumber /*sequence*/, SequenceNumber /*snapshot_sequence*/) const {
+  // Should never be called in LITE mode.
+  assert(false);
+  return SnapshotCheckerResult::kInSnapshot;
+}
+
+#else
+
+WritePreparedSnapshotChecker::WritePreparedSnapshotChecker(
+    WritePreparedTxnDB* txn_db)
+    : txn_db_(txn_db){};
+
+SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot(
+    SequenceNumber sequence, SequenceNumber snapshot_sequence) const {
+  bool snapshot_released = false;
+  // TODO(myabandeh): set min_uncommitted
+  bool in_snapshot = txn_db_->IsInSnapshot(
+      sequence, snapshot_sequence, kMinUnCommittedSeq, &snapshot_released);
+  if (snapshot_released) {
+    return SnapshotCheckerResult::kSnapshotReleased;
+  }
+  return in_snapshot ? SnapshotCheckerResult::kInSnapshot
+                     : SnapshotCheckerResult::kNotInSnapshot;
+}
+
+#endif  // ROCKSDB_LITE
+
+DisableGCSnapshotChecker* DisableGCSnapshotChecker::Instance() {
+  STATIC_AVOID_DESTRUCTION(DisableGCSnapshotChecker, instance);
+  return &instance;
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc b/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc
new file mode 100644
index 000000000..e9b474415
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/timestamped_snapshot_test.cc
@@ -0,0 +1,466 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef ROCKSDB_LITE
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as Transactions are not supported in LITE mode\n");
+  return 0;
+}
+#else  // ROCKSDB_LITE
+#include <cassert>
+
+#include "util/cast_util.h"
+#include "utilities/transactions/transaction_test.h"
+
+namespace ROCKSDB_NAMESPACE {
+INSTANTIATE_TEST_CASE_P(
+    Unsupported, TimestampedSnapshotWithTsSanityCheck,
+    ::testing::Values(
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite)));
+
+INSTANTIATE_TEST_CASE_P(WriteCommitted, TransactionTest,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Values(WRITE_COMMITTED),
+                                           ::testing::Values(kOrderedWrite)));
+
+namespace {
+// Not thread-safe. Caller needs to provide external synchronization.
+class TsCheckingTxnNotifier : public TransactionNotifier {
+ public:
+  explicit TsCheckingTxnNotifier() = default;
+
+  ~TsCheckingTxnNotifier() override {}
+
+  void SnapshotCreated(const Snapshot* new_snapshot) override {
+    assert(new_snapshot);
+    if (prev_snapshot_seq_ != kMaxSequenceNumber) {
+      assert(prev_snapshot_seq_ <= new_snapshot->GetSequenceNumber());
+    }
+    prev_snapshot_seq_ = new_snapshot->GetSequenceNumber();
+    if (prev_snapshot_ts_ != kMaxTxnTimestamp) {
+      assert(prev_snapshot_ts_ <= new_snapshot->GetTimestamp());
+    }
+    prev_snapshot_ts_ = new_snapshot->GetTimestamp();
+  }
+
+  TxnTimestamp prev_snapshot_ts() const { return prev_snapshot_ts_; }
+
+ private:
+  SequenceNumber prev_snapshot_seq_ = kMaxSequenceNumber;
+  TxnTimestamp prev_snapshot_ts_ = kMaxTxnTimestamp;
+};
+}  // anonymous namespace
+
+TEST_P(TimestampedSnapshotWithTsSanityCheck, WithoutCommitTs) {
+  std::unique_ptr<Transaction> txn(
+      db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn);
+  ASSERT_OK(txn->SetName("txn0"));
+  ASSERT_OK(txn->Put("a", "v"));
+  ASSERT_OK(txn->Prepare());
+  Status s = txn->CommitAndTryCreateSnapshot();
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(txn->Rollback());
+
+  txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn);
+  ASSERT_OK(txn->SetName("txn0"));
+  ASSERT_OK(txn->Put("a", "v"));
+  s = txn->CommitAndTryCreateSnapshot();
+  ASSERT_TRUE(s.IsInvalidArgument());
+}
+
+TEST_P(TimestampedSnapshotWithTsSanityCheck, SetCommitTs) {
+  std::unique_ptr<Transaction> txn(
+      db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn);
+  ASSERT_OK(txn->SetName("txn0"));
+  ASSERT_OK(txn->Put("a", "v"));
+  ASSERT_OK(txn->Prepare());
+  std::shared_ptr<const Snapshot> snapshot;
+  Status s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot);
+  ASSERT_TRUE(s.IsNotSupported());
+  ASSERT_OK(txn->Rollback());
+
+  txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn);
+  ASSERT_OK(txn->SetName("txn0"));
+  ASSERT_OK(txn->Put("a", "v"));
+  s = txn->CommitAndTryCreateSnapshot(nullptr, 10, &snapshot);
+  ASSERT_TRUE(s.IsNotSupported());
+}
+
+TEST_P(TransactionTest, WithoutCommitTs) {
+  std::unique_ptr<Transaction> txn(
+      db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn);
+  ASSERT_OK(txn->SetName("txn0"));
+  ASSERT_OK(txn->Put("a", "v"));
+  ASSERT_OK(txn->Prepare());
+  Status s = txn->CommitAndTryCreateSnapshot();
+  ASSERT_TRUE(s.IsInvalidArgument());
+  ASSERT_OK(txn->Rollback());
+
+  txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn);
+  ASSERT_OK(txn->SetName("txn0"));
+  ASSERT_OK(txn->Put("a", "v"));
+  s = txn->CommitAndTryCreateSnapshot();
+  ASSERT_TRUE(s.IsInvalidArgument());
+}
+
+TEST_P(TransactionTest, ReuseExistingTxn) {
+  Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions());
+  assert(txn);
+  ASSERT_OK(txn->SetName("txn0"));
+  ASSERT_OK(txn->Put("a", "v1"));
+  ASSERT_OK(txn->Prepare());
+
+  auto notifier = std::make_shared<TsCheckingTxnNotifier>();
+  std::shared_ptr<const Snapshot> snapshot1;
+  Status s =
+      txn->CommitAndTryCreateSnapshot(notifier, /*commit_ts=*/100, &snapshot1);
+  ASSERT_OK(s);
+  ASSERT_EQ(100, snapshot1->GetTimestamp());
+
+  Transaction* txn1 =
+      db->BeginTransaction(WriteOptions(), TransactionOptions(), txn);
+  assert(txn1 == txn);
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn->Put("a", "v2"));
+  ASSERT_OK(txn->Prepare());
+  std::shared_ptr<const Snapshot> snapshot2;
+  s = txn->CommitAndTryCreateSnapshot(notifier, /*commit_ts=*/110, &snapshot2);
+  ASSERT_OK(s);
+  ASSERT_EQ(110, snapshot2->GetTimestamp());
+  delete txn;
+
+  {
+    std::string value;
+    ReadOptions read_opts;
+    read_opts.snapshot = snapshot1.get();
+    ASSERT_OK(db->Get(read_opts, "a", &value));
+    ASSERT_EQ("v1", value);
+
+    read_opts.snapshot = snapshot2.get();
+    ASSERT_OK(db->Get(read_opts, "a", &value));
+    ASSERT_EQ("v2", value);
+  }
+}
+
+TEST_P(TransactionTest, CreateSnapshotWhenCommit) {
+  std::unique_ptr<Transaction> txn(
+      db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn);
+
+  constexpr int batch_size = 10;
+  for (int i = 0; i < batch_size; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i), "v0"));
+  }
+  const SequenceNumber seq0 = db->GetLatestSequenceNumber();
+  ASSERT_EQ(static_cast<SequenceNumber>(batch_size), seq0);
+
+  txn->SetSnapshot();
+  {
+    const Snapshot* const snapshot = txn->GetSnapshot();
+    assert(snapshot);
+    ASSERT_EQ(seq0, snapshot->GetSequenceNumber());
+  }
+
+  for (int i = 0; i < batch_size; ++i) {
+    ASSERT_OK(txn->Put("k" + std::to_string(i), "v1"));
+  }
+  ASSERT_OK(txn->SetName("txn0"));
+  ASSERT_OK(txn->Prepare());
+
+  std::shared_ptr<const Snapshot> snapshot;
+  constexpr TxnTimestamp timestamp = 1;
+  auto notifier = std::make_shared<TsCheckingTxnNotifier>();
+  Status s = txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot);
+  ASSERT_OK(s);
+  ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp);
+  assert(snapshot);
+  ASSERT_EQ(timestamp, snapshot->GetTimestamp());
+  ASSERT_EQ(seq0 + batch_size, snapshot->GetSequenceNumber());
+  const Snapshot* const raw_snapshot_ptr = txn->GetSnapshot();
+  ASSERT_EQ(raw_snapshot_ptr, snapshot.get());
+  ASSERT_EQ(snapshot, txn->GetTimestampedSnapshot());
+
+  {
+    std::shared_ptr<const Snapshot> snapshot1 =
+        db->GetLatestTimestampedSnapshot();
+    ASSERT_EQ(snapshot, snapshot1);
+  }
+  {
+    std::shared_ptr<const Snapshot> snapshot1 =
+        db->GetTimestampedSnapshot(timestamp);
+    ASSERT_EQ(snapshot, snapshot1);
+  }
+  {
+    std::vector<std::shared_ptr<const Snapshot> > snapshots;
+    s = db->GetAllTimestampedSnapshots(snapshots);
+    ASSERT_OK(s);
+    ASSERT_EQ(std::vector<std::shared_ptr<const Snapshot> >{snapshot},
+              snapshots);
+  }
+}
+
+TEST_P(TransactionTest, CreateSnapshot) {
+  // First create a non-timestamped snapshot
+  ManagedSnapshot snapshot_guard(db);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i),
+                      "v0_" + std::to_string(i)));
+  }
+  {
+    auto ret = db->CreateTimestampedSnapshot(kMaxTxnTimestamp);
+    ASSERT_TRUE(ret.first.IsInvalidArgument());
+    auto snapshot = ret.second;
+    ASSERT_EQ(nullptr, snapshot.get());
+  }
+  constexpr TxnTimestamp timestamp = 100;
+  Status s;
+  std::shared_ptr<const Snapshot> ts_snap0;
+  std::tie(s, ts_snap0) = db->CreateTimestampedSnapshot(timestamp);
+  ASSERT_OK(s);
+  assert(ts_snap0);
+  ASSERT_EQ(timestamp, ts_snap0->GetTimestamp());
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(db->Delete(WriteOptions(), "k" + std::to_string(i)));
+  }
+  {
+    ReadOptions read_opts;
+    read_opts.snapshot = ts_snap0.get();
+    for (int i = 0; i < 10; ++i) {
+      std::string value;
+      s = db->Get(read_opts, "k" + std::to_string(i), &value);
+      ASSERT_OK(s);
+      ASSERT_EQ("v0_" + std::to_string(i), value);
+    }
+  }
+  {
+    std::shared_ptr<const Snapshot> snapshot =
+        db->GetLatestTimestampedSnapshot();
+    ASSERT_EQ(ts_snap0, snapshot);
+  }
+  {
+    std::shared_ptr<const Snapshot> snapshot =
+        db->GetTimestampedSnapshot(timestamp);
+    ASSERT_OK(s);
+    ASSERT_EQ(ts_snap0, snapshot);
+  }
+  {
+    std::vector<std::shared_ptr<const Snapshot> > snapshots;
+    s = db->GetAllTimestampedSnapshots(snapshots);
+    ASSERT_OK(s);
+    ASSERT_EQ(std::vector<std::shared_ptr<const Snapshot> >{ts_snap0},
+              snapshots);
+  }
+}
+
+TEST_P(TransactionTest, SequenceAndTsOrder) {
+  Status s;
+  std::shared_ptr<const Snapshot> snapshot;
+  std::tie(s, snapshot) = db->CreateTimestampedSnapshot(100);
+  ASSERT_OK(s);
+  assert(snapshot);
+  {
+    // Cannot request smaller timestamp for the new timestamped snapshot.
+    std::shared_ptr<const Snapshot> tmp_snapshot;
+    std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(50);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_EQ(nullptr, tmp_snapshot.get());
+  }
+
+  // If requesting a new timestamped snapshot with the same timestamp and
+  // sequence number, we avoid creating new snapshot object but reuse
+  // exisisting one.
+  std::shared_ptr<const Snapshot> snapshot1;
+  std::tie(s, snapshot1) = db->CreateTimestampedSnapshot(100);
+  ASSERT_OK(s);
+  ASSERT_EQ(snapshot.get(), snapshot1.get());
+
+  // If there is no write, but we request a larger timestamp, we still create
+  // a new snapshot object.
+  std::shared_ptr<const Snapshot> snapshot2;
+  std::tie(s, snapshot2) = db->CreateTimestampedSnapshot(200);
+  ASSERT_OK(s);
+  assert(snapshot2);
+  ASSERT_NE(snapshot.get(), snapshot2.get());
+  ASSERT_EQ(snapshot2->GetSequenceNumber(), snapshot->GetSequenceNumber());
+  ASSERT_EQ(200, snapshot2->GetTimestamp());
+
+  // Increase sequence number.
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "v0"));
+  {
+    // We are requesting the same timestamp for a larger sequence number, thus
+    // we cannot create timestamped snapshot.
+    std::shared_ptr<const Snapshot> tmp_snapshot;
+    std::tie(s, tmp_snapshot) = db->CreateTimestampedSnapshot(200);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_EQ(nullptr, tmp_snapshot.get());
+  }
+  {
+    std::unique_ptr<Transaction> txn1(
+        db->BeginTransaction(WriteOptions(), TransactionOptions()));
+    ASSERT_OK(txn1->Put("bar", "v0"));
+    std::shared_ptr<const Snapshot> ss;
+    ASSERT_OK(txn1->CommitAndTryCreateSnapshot(nullptr, 200, &ss));
+    // Cannot create snapshot because requested timestamp is the same as the
+    // latest timestamped snapshot while sequence number is strictly higher.
+    ASSERT_EQ(nullptr, ss);
+  }
+  {
+    std::unique_ptr<Transaction> txn2(
+        db->BeginTransaction(WriteOptions(), TransactionOptions()));
+    ASSERT_OK(txn2->Put("bar", "v0"));
+    std::shared_ptr<const Snapshot> ss;
+    // Application should never do this. This is just to demonstrate error
+    // handling.
+    ASSERT_OK(txn2->CommitAndTryCreateSnapshot(nullptr, 100, &ss));
+    // Cannot create snapshot because requested timestamp is smaller than
+    // latest timestamped snapshot.
+    ASSERT_EQ(nullptr, ss);
+  }
+}
+
+TEST_P(TransactionTest, CloseDbWithSnapshots) {
+  std::unique_ptr<Transaction> txn(
+      db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  ASSERT_OK(txn->SetName("txn0"));
+  ASSERT_OK(txn->Put("foo", "v"));
+  ASSERT_OK(txn->Prepare());
+  std::shared_ptr<const Snapshot> snapshot;
+  constexpr TxnTimestamp timestamp = 121;
+  auto notifier = std::make_shared<TsCheckingTxnNotifier>();
+  ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, timestamp, &snapshot));
+  assert(snapshot);
+  ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp);
+  ASSERT_EQ(timestamp, snapshot->GetTimestamp());
+  ASSERT_TRUE(db->Close().IsAborted());
+}
+
+TEST_P(TransactionTest, MultipleTimestampedSnapshots) {
+  auto* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  assert(dbimpl);
+  const bool seq_per_batch = dbimpl->seq_per_batch();
+  // TODO: remove the following assert(!seq_per_batch) once timestamped snapshot
+  // is supported in write-prepared/write-unprepared transactions.
+  assert(!seq_per_batch);
+  constexpr size_t txn_size = 10;
+  constexpr TxnTimestamp ts_delta = 10;
+  constexpr size_t num_txns = 100;
+  std::vector<std::shared_ptr<const Snapshot> > snapshots(num_txns);
+  constexpr TxnTimestamp start_ts = 10000;
+  auto notifier = std::make_shared<TsCheckingTxnNotifier>();
+  for (size_t i = 0; i < num_txns; ++i) {
+    std::unique_ptr<Transaction> txn(
+        db->BeginTransaction(WriteOptions(), TransactionOptions()));
+    ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+    for (size_t j = 0; j < txn_size; ++j) {
+      ASSERT_OK(txn->Put("k" + std::to_string(j),
+                         "v" + std::to_string(j) + "_" + std::to_string(i)));
+    }
+    if (0 == (i % 2)) {
+      ASSERT_OK(txn->Prepare());
+    }
+    ASSERT_OK(txn->CommitAndTryCreateSnapshot(notifier, start_ts + i * ts_delta,
+                                              &snapshots[i]));
+    assert(snapshots[i]);
+    ASSERT_LT(notifier->prev_snapshot_ts(), kMaxTxnTimestamp);
+    ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp());
+  }
+
+  {
+    auto snapshot = db->GetTimestampedSnapshot(start_ts + 1);
+    ASSERT_EQ(nullptr, snapshot);
+  }
+
+  constexpr TxnTimestamp max_ts = start_ts + num_txns * ts_delta;
+  for (size_t i = 0; i < num_txns; ++i) {
+    auto snapshot = db->GetTimestampedSnapshot(start_ts + i * ts_delta);
+    ASSERT_EQ(snapshots[i], snapshot);
+
+    std::vector<std::shared_ptr<const Snapshot> > tmp_snapshots;
+    Status s = db->GetTimestampedSnapshots(max_ts, start_ts + i * ts_delta,
+                                           tmp_snapshots);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_TRUE(tmp_snapshots.empty());
+
+    for (size_t j = i; j < num_txns; ++j) {
+      std::vector<std::shared_ptr<const Snapshot> > expected_snapshots(
+          snapshots.begin() + i, snapshots.begin() + j);
+      tmp_snapshots.clear();
+      s = db->GetTimestampedSnapshots(start_ts + i * ts_delta,
+                                      start_ts + j * ts_delta, tmp_snapshots);
+      if (i < j) {
+        ASSERT_OK(s);
+      } else {
+        ASSERT_TRUE(s.IsInvalidArgument());
+      }
+      ASSERT_EQ(expected_snapshots, tmp_snapshots);
+    }
+  }
+
+  {
+    std::vector<std::shared_ptr<const Snapshot> > tmp_snapshots;
+    const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots);
+    ASSERT_OK(s);
+    ASSERT_EQ(snapshots, tmp_snapshots);
+
+    const std::shared_ptr<const Snapshot> latest_snapshot =
+        db->GetLatestTimestampedSnapshot();
+    ASSERT_EQ(snapshots.back(), latest_snapshot);
+  }
+
+  for (size_t i = 0; i <= num_txns; ++i) {
+    std::vector<std::shared_ptr<const Snapshot> > snapshots1(
+        snapshots.begin() + i, snapshots.end());
+    if (i > 0) {
+      auto snapshot1 =
+          db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta);
+      assert(snapshot1);
+      ASSERT_EQ(start_ts + (i - 1) * ts_delta, snapshot1->GetTimestamp());
+    }
+
+    db->ReleaseTimestampedSnapshotsOlderThan(start_ts + i * ts_delta);
+
+    if (i > 0) {
+      auto snapshot1 =
+          db->GetTimestampedSnapshot(start_ts + (i - 1) * ts_delta);
+      ASSERT_EQ(nullptr, snapshot1);
+    }
+
+    std::vector<std::shared_ptr<const Snapshot> > tmp_snapshots;
+    const Status s = db->GetAllTimestampedSnapshots(tmp_snapshots);
+    ASSERT_OK(s);
+    ASSERT_EQ(snapshots1, tmp_snapshots);
+  }
+
+  // Even after released by db, the applications still hold reference to shared
+  // snapshots.
+  for (size_t i = 0; i < num_txns; ++i) {
+    assert(snapshots[i]);
+    ASSERT_EQ(start_ts + i * ts_delta, snapshots[i]->GetTimestamp());
+  }
+
+  snapshots.clear();
+  ASSERT_OK(db->Close());
+  delete db;
+  db = nullptr;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_base.cc b/src/rocksdb/utilities/transactions/transaction_base.cc
new file mode 100644
index 000000000..83fd94ac8
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_base.cc
@@ -0,0 +1,731 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_base.h"
+
+#include <cinttypes>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status Transaction::CommitAndTryCreateSnapshot(
+    std::shared_ptr<TransactionNotifier> notifier, TxnTimestamp ts,
+    std::shared_ptr<const Snapshot>* snapshot) {
+  if (snapshot) {
+    snapshot->reset();
+  }
+  TxnTimestamp commit_ts = GetCommitTimestamp();
+  if (commit_ts == kMaxTxnTimestamp) {
+    if (ts == kMaxTxnTimestamp) {
+      return Status::InvalidArgument("Commit timestamp unset");
+    } else {
+      const Status s = SetCommitTimestamp(ts);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  } else if (ts != kMaxTxnTimestamp) {
+    if (ts != commit_ts) {
+      // For now we treat this as error.
+      return Status::InvalidArgument("Different commit ts specified");
+    }
+  }
+  SetSnapshotOnNextOperation(notifier);
+  Status s = Commit();
+  if (!s.ok()) {
+    return s;
+  }
+  assert(s.ok());
+  // If we reach here, we must return ok status for this function.
+  std::shared_ptr<const Snapshot> new_snapshot = GetTimestampedSnapshot();
+
+  if (snapshot) {
+    *snapshot = new_snapshot;
+  }
+  return Status::OK();
+}
+
+TransactionBaseImpl::TransactionBaseImpl(
+    DB* db, const WriteOptions& write_options,
+    const LockTrackerFactory& lock_tracker_factory)
+    : db_(db),
+      dbimpl_(static_cast_with_check<DBImpl>(db)),
+      write_options_(write_options),
+      cmp_(GetColumnFamilyUserComparator(db->DefaultColumnFamily())),
+      lock_tracker_factory_(lock_tracker_factory),
+      start_time_(dbimpl_->GetSystemClock()->NowMicros()),
+      write_batch_(cmp_, 0, true, 0, write_options.protection_bytes_per_key),
+      tracked_locks_(lock_tracker_factory_.Create()),
+      commit_time_batch_(0 /* reserved_bytes */, 0 /* max_bytes */,
+                         write_options.protection_bytes_per_key,
+                         0 /* default_cf_ts_sz */),
+      indexing_enabled_(true) {
+  assert(dynamic_cast<DBImpl*>(db_) != nullptr);
+  log_number_ = 0;
+  if (dbimpl_->allow_2pc()) {
+    InitWriteBatch();
+  }
+}
+
+TransactionBaseImpl::~TransactionBaseImpl() {
+  // Release snapshot if snapshot is set
+  SetSnapshotInternal(nullptr);
+}
+
+void TransactionBaseImpl::Clear() {
+  save_points_.reset(nullptr);
+  write_batch_.Clear();
+  commit_time_batch_.Clear();
+  tracked_locks_->Clear();
+  num_puts_ = 0;
+  num_deletes_ = 0;
+  num_merges_ = 0;
+
+  if (dbimpl_->allow_2pc()) {
+    InitWriteBatch();
+  }
+}
+
+void TransactionBaseImpl::Reinitialize(DB* db,
+                                       const WriteOptions& write_options) {
+  Clear();
+  ClearSnapshot();
+  id_ = 0;
+  db_ = db;
+  name_.clear();
+  log_number_ = 0;
+  write_options_ = write_options;
+  start_time_ = dbimpl_->GetSystemClock()->NowMicros();
+  indexing_enabled_ = true;
+  cmp_ = GetColumnFamilyUserComparator(db_->DefaultColumnFamily());
+  WriteBatchInternal::UpdateProtectionInfo(
+      write_batch_.GetWriteBatch(), write_options_.protection_bytes_per_key)
+      .PermitUncheckedError();
+  WriteBatchInternal::UpdateProtectionInfo(
+      &commit_time_batch_, write_options_.protection_bytes_per_key)
+      .PermitUncheckedError();
+}
+
+void TransactionBaseImpl::SetSnapshot() {
+  const Snapshot* snapshot = dbimpl_->GetSnapshotForWriteConflictBoundary();
+  SetSnapshotInternal(snapshot);
+}
+
+void TransactionBaseImpl::SetSnapshotInternal(const Snapshot* snapshot) {
+  // Set a custom deleter for the snapshot_ SharedPtr as the snapshot needs to
+  // be released, not deleted when it is no longer referenced.
+  snapshot_.reset(snapshot, std::bind(&TransactionBaseImpl::ReleaseSnapshot,
+                                      this, std::placeholders::_1, db_));
+  snapshot_needed_ = false;
+  snapshot_notifier_ = nullptr;
+}
+
+void TransactionBaseImpl::SetSnapshotOnNextOperation(
+    std::shared_ptr<TransactionNotifier> notifier) {
+  snapshot_needed_ = true;
+  snapshot_notifier_ = notifier;
+}
+
+void TransactionBaseImpl::SetSnapshotIfNeeded() {
+  if (snapshot_needed_) {
+    std::shared_ptr<TransactionNotifier> notifier = snapshot_notifier_;
+    SetSnapshot();
+    if (notifier != nullptr) {
+      notifier->SnapshotCreated(GetSnapshot());
+    }
+  }
+}
+
+Status TransactionBaseImpl::TryLock(ColumnFamilyHandle* column_family,
+                                    const SliceParts& key, bool read_only,
+                                    bool exclusive, const bool do_validate,
+                                    const bool assume_tracked) {
+  size_t key_size = 0;
+  for (int i = 0; i < key.num_parts; ++i) {
+    key_size += key.parts[i].size();
+  }
+
+  std::string str;
+  str.reserve(key_size);
+
+  for (int i = 0; i < key.num_parts; ++i) {
+    str.append(key.parts[i].data(), key.parts[i].size());
+  }
+
+  return TryLock(column_family, str, read_only, exclusive, do_validate,
+                 assume_tracked);
+}
+
+void TransactionBaseImpl::SetSavePoint() {
+  if (save_points_ == nullptr) {
+    save_points_.reset(
+        new std::stack<TransactionBaseImpl::SavePoint,
+                       autovector<TransactionBaseImpl::SavePoint>>());
+  }
+  save_points_->emplace(snapshot_, snapshot_needed_, snapshot_notifier_,
+                        num_puts_, num_deletes_, num_merges_,
+                        lock_tracker_factory_);
+  write_batch_.SetSavePoint();
+}
+
+Status TransactionBaseImpl::RollbackToSavePoint() {
+  if (save_points_ != nullptr && save_points_->size() > 0) {
+    // Restore saved SavePoint
+    TransactionBaseImpl::SavePoint& save_point = save_points_->top();
+    snapshot_ = save_point.snapshot_;
+    snapshot_needed_ = save_point.snapshot_needed_;
+    snapshot_notifier_ = save_point.snapshot_notifier_;
+    num_puts_ = save_point.num_puts_;
+    num_deletes_ = save_point.num_deletes_;
+    num_merges_ = save_point.num_merges_;
+
+    // Rollback batch
+    Status s = write_batch_.RollbackToSavePoint();
+    assert(s.ok());
+
+    // Rollback any keys that were tracked since the last savepoint
+    tracked_locks_->Subtract(*save_point.new_locks_);
+
+    save_points_->pop();
+
+    return s;
+  } else {
+    assert(write_batch_.RollbackToSavePoint().IsNotFound());
+    return Status::NotFound();
+  }
+}
+
+Status TransactionBaseImpl::PopSavePoint() {
+  if (save_points_ == nullptr || save_points_->empty()) {
+    // No SavePoint yet.
+    assert(write_batch_.PopSavePoint().IsNotFound());
+    return Status::NotFound();
+  }
+
+  assert(!save_points_->empty());
+  // If there is another savepoint A below the current savepoint B, then A needs
+  // to inherit tracked_keys in B so that if we rollback to savepoint A, we
+  // remember to unlock keys in B. If there is no other savepoint below, then we
+  // can safely discard savepoint info.
+  if (save_points_->size() == 1) {
+    save_points_->pop();
+  } else {
+    TransactionBaseImpl::SavePoint top(lock_tracker_factory_);
+    std::swap(top, save_points_->top());
+    save_points_->pop();
+
+    save_points_->top().new_locks_->Merge(*top.new_locks_);
+  }
+
+  return write_batch_.PopSavePoint();
+}
+
+Status TransactionBaseImpl::Get(const ReadOptions& read_options,
+                                ColumnFamilyHandle* column_family,
+                                const Slice& key, std::string* value) {
+  assert(value != nullptr);
+  PinnableSlice pinnable_val(value);
+  assert(!pinnable_val.IsPinned());
+  auto s = Get(read_options, column_family, key, &pinnable_val);
+  if (s.ok() && pinnable_val.IsPinned()) {
+    value->assign(pinnable_val.data(), pinnable_val.size());
+  }  // else value is already assigned
+  return s;
+}
+
+Status TransactionBaseImpl::Get(const ReadOptions& read_options,
+                                ColumnFamilyHandle* column_family,
+                                const Slice& key, PinnableSlice* pinnable_val) {
+  return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key,
+                                        pinnable_val);
+}
+
+Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options,
+                                         ColumnFamilyHandle* column_family,
+                                         const Slice& key, std::string* value,
+                                         bool exclusive,
+                                         const bool do_validate) {
+  if (!do_validate && read_options.snapshot != nullptr) {
+    return Status::InvalidArgument(
+        "If do_validate is false then GetForUpdate with snapshot is not "
+        "defined.");
+  }
+  Status s =
+      TryLock(column_family, key, true /* read_only */, exclusive, do_validate);
+
+  if (s.ok() && value != nullptr) {
+    assert(value != nullptr);
+    PinnableSlice pinnable_val(value);
+    assert(!pinnable_val.IsPinned());
+    s = Get(read_options, column_family, key, &pinnable_val);
+    if (s.ok() && pinnable_val.IsPinned()) {
+      value->assign(pinnable_val.data(), pinnable_val.size());
+    }  // else value is already assigned
+  }
+  return s;
+}
+
+Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options,
+                                         ColumnFamilyHandle* column_family,
+                                         const Slice& key,
+                                         PinnableSlice* pinnable_val,
+                                         bool exclusive,
+                                         const bool do_validate) {
+  if (!do_validate && read_options.snapshot != nullptr) {
+    return Status::InvalidArgument(
+        "If do_validate is false then GetForUpdate with snapshot is not "
+        "defined.");
+  }
+  Status s =
+      TryLock(column_family, key, true /* read_only */, exclusive, do_validate);
+
+  if (s.ok() && pinnable_val != nullptr) {
+    s = Get(read_options, column_family, key, pinnable_val);
+  }
+  return s;
+}
+
+std::vector<Status> TransactionBaseImpl::MultiGet(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  size_t num_keys = keys.size();
+  values->resize(num_keys);
+
+  std::vector<Status> stat_list(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]);
+  }
+
+  return stat_list;
+}
+
+void TransactionBaseImpl::MultiGet(const ReadOptions& read_options,
+                                   ColumnFamilyHandle* column_family,
+                                   const size_t num_keys, const Slice* keys,
+                                   PinnableSlice* values, Status* statuses,
+                                   const bool sorted_input) {
+  write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family,
+                                      num_keys, keys, values, statuses,
+                                      sorted_input);
+}
+
+std::vector<Status> TransactionBaseImpl::MultiGetForUpdate(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  // Regardless of whether the MultiGet succeeded, track these keys.
+  size_t num_keys = keys.size();
+  values->resize(num_keys);
+
+  // Lock all keys
+  for (size_t i = 0; i < num_keys; ++i) {
+    Status s = TryLock(column_family[i], keys[i], true /* read_only */,
+                       true /* exclusive */);
+    if (!s.ok()) {
+      // Fail entire multiget if we cannot lock all keys
+      return std::vector<Status>(num_keys, s);
+    }
+  }
+
+  // TODO(agiardullo): optimize multiget?
+  std::vector<Status> stat_list(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]);
+  }
+
+  return stat_list;
+}
+
+Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options) {
+  Iterator* db_iter = db_->NewIterator(read_options);
+  assert(db_iter);
+
+  return write_batch_.NewIteratorWithBase(db_->DefaultColumnFamily(), db_iter,
+                                          &read_options);
+}
+
+Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options,
+                                           ColumnFamilyHandle* column_family) {
+  Iterator* db_iter = db_->NewIterator(read_options, column_family);
+  assert(db_iter);
+
+  return write_batch_.NewIteratorWithBase(column_family, db_iter,
+                                          &read_options);
+}
+
+Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& value,
+                                const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, do_validate, assume_tracked);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Put(column_family, key, value);
+    if (s.ok()) {
+      num_puts_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family,
+                                const SliceParts& key, const SliceParts& value,
+                                const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, do_validate, assume_tracked);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Put(column_family, key, value);
+    if (s.ok()) {
+      num_puts_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::Merge(ColumnFamilyHandle* column_family,
+                                  const Slice& key, const Slice& value,
+                                  const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, do_validate, assume_tracked);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Merge(column_family, key, value);
+    if (s.ok()) {
+      num_merges_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family,
+                                   const Slice& key,
+                                   const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, do_validate, assume_tracked);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Delete(column_family, key);
+    if (s.ok()) {
+      num_deletes_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family,
+                                   const SliceParts& key,
+                                   const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, do_validate, assume_tracked);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Delete(column_family, key);
+    if (s.ok()) {
+      num_deletes_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family,
+                                         const Slice& key,
+                                         const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, do_validate, assume_tracked);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->SingleDelete(column_family, key);
+    if (s.ok()) {
+      num_deletes_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family,
+                                         const SliceParts& key,
+                                         const bool assume_tracked) {
+  const bool do_validate = !assume_tracked;
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, do_validate, assume_tracked);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->SingleDelete(column_family, key);
+    if (s.ok()) {
+      num_deletes_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family,
+                                         const Slice& key, const Slice& value) {
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, false /* do_validate */);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Put(column_family, key, value);
+    if (s.ok()) {
+      num_puts_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family,
+                                         const SliceParts& key,
+                                         const SliceParts& value) {
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, false /* do_validate */);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Put(column_family, key, value);
+    if (s.ok()) {
+      num_puts_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::MergeUntracked(ColumnFamilyHandle* column_family,
+                                           const Slice& key,
+                                           const Slice& value) {
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, false /* do_validate */);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Merge(column_family, key, value);
+    if (s.ok()) {
+      num_merges_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
+                                            const Slice& key) {
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, false /* do_validate */);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Delete(column_family, key);
+    if (s.ok()) {
+      num_deletes_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
+                                            const SliceParts& key) {
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, false /* do_validate */);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->Delete(column_family, key);
+    if (s.ok()) {
+      num_deletes_++;
+    }
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::SingleDeleteUntracked(
+    ColumnFamilyHandle* column_family, const Slice& key) {
+  Status s = TryLock(column_family, key, false /* read_only */,
+                     true /* exclusive */, false /* do_validate */);
+
+  if (s.ok()) {
+    s = GetBatchForWrite()->SingleDelete(column_family, key);
+    if (s.ok()) {
+      num_deletes_++;
+    }
+  }
+
+  return s;
+}
+
+void TransactionBaseImpl::PutLogData(const Slice& blob) {
+  auto s = write_batch_.PutLogData(blob);
+  (void)s;
+  assert(s.ok());
+}
+
+WriteBatchWithIndex* TransactionBaseImpl::GetWriteBatch() {
+  return &write_batch_;
+}
+
+uint64_t TransactionBaseImpl::GetElapsedTime() const {
+  return (dbimpl_->GetSystemClock()->NowMicros() - start_time_) / 1000;
+}
+
+uint64_t TransactionBaseImpl::GetNumPuts() const { return num_puts_; }
+
+uint64_t TransactionBaseImpl::GetNumDeletes() const { return num_deletes_; }
+
+uint64_t TransactionBaseImpl::GetNumMerges() const { return num_merges_; }
+
+uint64_t TransactionBaseImpl::GetNumKeys() const {
+  return tracked_locks_->GetNumPointLocks();
+}
+
+void TransactionBaseImpl::TrackKey(uint32_t cfh_id, const std::string& key,
+                                   SequenceNumber seq, bool read_only,
+                                   bool exclusive) {
+  PointLockRequest r;
+  r.column_family_id = cfh_id;
+  r.key = key;
+  r.seq = seq;
+  r.read_only = read_only;
+  r.exclusive = exclusive;
+
+  // Update map of all tracked keys for this transaction
+  tracked_locks_->Track(r);
+
+  if (save_points_ != nullptr && !save_points_->empty()) {
+    // Update map of tracked keys in this SavePoint
+    save_points_->top().new_locks_->Track(r);
+  }
+}
+
+// Gets the write batch that should be used for Put/Merge/Deletes.
+//
+// Returns either a WriteBatch or WriteBatchWithIndex depending on whether
+// DisableIndexing() has been called.
+WriteBatchBase* TransactionBaseImpl::GetBatchForWrite() {
+  if (indexing_enabled_) {
+    // Use WriteBatchWithIndex
+    return &write_batch_;
+  } else {
+    // Don't use WriteBatchWithIndex. Return base WriteBatch.
+    return write_batch_.GetWriteBatch();
+  }
+}
+
+void TransactionBaseImpl::ReleaseSnapshot(const Snapshot* snapshot, DB* db) {
+  if (snapshot != nullptr) {
+    ROCKS_LOG_DETAILS(dbimpl_->immutable_db_options().info_log,
+                      "ReleaseSnapshot %" PRIu64 " Set",
+                      snapshot->GetSequenceNumber());
+    db->ReleaseSnapshot(snapshot);
+  }
+}
+
+void TransactionBaseImpl::UndoGetForUpdate(ColumnFamilyHandle* column_family,
+                                           const Slice& key) {
+  PointLockRequest r;
+  r.column_family_id = GetColumnFamilyID(column_family);
+  r.key = key.ToString();
+  r.read_only = true;
+
+  bool can_untrack = false;
+  if (save_points_ != nullptr && !save_points_->empty()) {
+    // If there is no GetForUpdate of the key in this save point,
+    // then cannot untrack from the global lock tracker.
+    UntrackStatus s = save_points_->top().new_locks_->Untrack(r);
+    can_untrack = (s != UntrackStatus::NOT_TRACKED);
+  } else {
+    // No save point, so can untrack from the global lock tracker.
+    can_untrack = true;
+  }
+
+  if (can_untrack) {
+    // If erased from the global tracker, then can unlock the key.
+    UntrackStatus s = tracked_locks_->Untrack(r);
+    bool can_unlock = (s == UntrackStatus::REMOVED);
+    if (can_unlock) {
+      UnlockGetForUpdate(column_family, key);
+    }
+  }
+}
+
+Status TransactionBaseImpl::RebuildFromWriteBatch(WriteBatch* src_batch) {
+  struct IndexedWriteBatchBuilder : public WriteBatch::Handler {
+    Transaction* txn_;
+    DBImpl* db_;
+    IndexedWriteBatchBuilder(Transaction* txn, DBImpl* db)
+        : txn_(txn), db_(db) {
+      assert(dynamic_cast<TransactionBaseImpl*>(txn_) != nullptr);
+    }
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override {
+      return txn_->Put(db_->GetColumnFamilyHandle(cf), key, val);
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      return txn_->Delete(db_->GetColumnFamilyHandle(cf), key);
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      return txn_->SingleDelete(db_->GetColumnFamilyHandle(cf), key);
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override {
+      return txn_->Merge(db_->GetColumnFamilyHandle(cf), key, val);
+    }
+
+    // this is used for reconstructing prepared transactions upon
+    // recovery. there should not be any meta markers in the batches
+    // we are processing.
+    Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkEndPrepare(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkCommit(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+  };
+
+  IndexedWriteBatchBuilder copycat(this, dbimpl_);
+  return src_batch->Iterate(&copycat);
+}
+
+WriteBatch* TransactionBaseImpl::GetCommitTimeWriteBatch() {
+  return &commit_time_batch_;
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_base.h b/src/rocksdb/utilities/transactions/transaction_base.h
new file mode 100644
index 000000000..1bcb20ca9
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_base.h
@@ -0,0 +1,384 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/autovector.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionBaseImpl : public Transaction {
+ public:
+  TransactionBaseImpl(DB* db, const WriteOptions& write_options,
+                      const LockTrackerFactory& lock_tracker_factory);
+
+  ~TransactionBaseImpl() override;
+
+  // Remove pending operations queued in this transaction.
+  virtual void Clear();
+
+  void Reinitialize(DB* db, const WriteOptions& write_options);
+
+  // Called before executing Put, Merge, Delete, and GetForUpdate.  If TryLock
+  // returns non-OK, the Put/Merge/Delete/GetForUpdate will be failed.
+  // do_validate will be false if called from PutUntracked, DeleteUntracked,
+  // MergeUntracked, or GetForUpdate(do_validate=false)
+  virtual Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+                         bool read_only, bool exclusive,
+                         const bool do_validate = true,
+                         const bool assume_tracked = false) = 0;
+
+  void SetSavePoint() override;
+
+  Status RollbackToSavePoint() override;
+
+  Status PopSavePoint() override;
+
+  using Transaction::Get;
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, std::string* value) override;
+
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, PinnableSlice* value) override;
+
+  Status Get(const ReadOptions& options, const Slice& key,
+             std::string* value) override {
+    return Get(options, db_->DefaultColumnFamily(), key, value);
+  }
+
+  using Transaction::GetForUpdate;
+  Status GetForUpdate(const ReadOptions& options,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      std::string* value, bool exclusive,
+                      const bool do_validate) override;
+
+  Status GetForUpdate(const ReadOptions& options,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      PinnableSlice* pinnable_val, bool exclusive,
+                      const bool do_validate) override;
+
+  Status GetForUpdate(const ReadOptions& options, const Slice& key,
+                      std::string* value, bool exclusive,
+                      const bool do_validate) override {
+    return GetForUpdate(options, db_->DefaultColumnFamily(), key, value,
+                        exclusive, do_validate);
+  }
+
+  using Transaction::MultiGet;
+  std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  std::vector<Status> MultiGet(const ReadOptions& options,
+                               const std::vector<Slice>& keys,
+                               std::vector<std::string>* values) override {
+    return MultiGet(options,
+                    std::vector<ColumnFamilyHandle*>(
+                        keys.size(), db_->DefaultColumnFamily()),
+                    keys, values);
+  }
+
+  void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family,
+                const size_t num_keys, const Slice* keys, PinnableSlice* values,
+                Status* statuses, const bool sorted_input = false) override;
+
+  using Transaction::MultiGetForUpdate;
+  std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    return MultiGetForUpdate(options,
+                             std::vector<ColumnFamilyHandle*>(
+                                 keys.size(), db_->DefaultColumnFamily()),
+                             keys, values);
+  }
+
+  Iterator* GetIterator(const ReadOptions& read_options) override;
+  Iterator* GetIterator(const ReadOptions& read_options,
+                        ColumnFamilyHandle* column_family) override;
+
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value, const bool assume_tracked = false) override;
+  Status Put(const Slice& key, const Slice& value) override {
+    return Put(nullptr, key, value);
+  }
+
+  Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+             const SliceParts& value,
+             const bool assume_tracked = false) override;
+  Status Put(const SliceParts& key, const SliceParts& value) override {
+    return Put(nullptr, key, value);
+  }
+
+  Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+               const Slice& value, const bool assume_tracked = false) override;
+  Status Merge(const Slice& key, const Slice& value) override {
+    return Merge(nullptr, key, value);
+  }
+
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                const bool assume_tracked = false) override;
+  Status Delete(const Slice& key) override { return Delete(nullptr, key); }
+  Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key,
+                const bool assume_tracked = false) override;
+  Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
+
+  Status SingleDelete(ColumnFamilyHandle* column_family, const Slice& key,
+                      const bool assume_tracked = false) override;
+  Status SingleDelete(const Slice& key) override {
+    return SingleDelete(nullptr, key);
+  }
+  Status SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key,
+                      const bool assume_tracked = false) override;
+  Status SingleDelete(const SliceParts& key) override {
+    return SingleDelete(nullptr, key);
+  }
+
+  Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& value) override;
+  Status PutUntracked(const Slice& key, const Slice& value) override {
+    return PutUntracked(nullptr, key, value);
+  }
+
+  Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key,
+                      const SliceParts& value) override;
+  Status PutUntracked(const SliceParts& key, const SliceParts& value) override {
+    return PutUntracked(nullptr, key, value);
+  }
+
+  Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+                        const Slice& value) override;
+  Status MergeUntracked(const Slice& key, const Slice& value) override {
+    return MergeUntracked(nullptr, key, value);
+  }
+
+  Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                         const Slice& key) override;
+  Status DeleteUntracked(const Slice& key) override {
+    return DeleteUntracked(nullptr, key);
+  }
+  Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                         const SliceParts& key) override;
+  Status DeleteUntracked(const SliceParts& key) override {
+    return DeleteUntracked(nullptr, key);
+  }
+
+  Status SingleDeleteUntracked(ColumnFamilyHandle* column_family,
+                               const Slice& key) override;
+  Status SingleDeleteUntracked(const Slice& key) override {
+    return SingleDeleteUntracked(nullptr, key);
+  }
+
+  void PutLogData(const Slice& blob) override;
+
+  WriteBatchWithIndex* GetWriteBatch() override;
+
+  virtual void SetLockTimeout(int64_t /*timeout*/) override { /* Do nothing */
+  }
+
+  const Snapshot* GetSnapshot() const override {
+    // will return nullptr when there is no snapshot
+    return snapshot_.get();
+  }
+
+  std::shared_ptr<const Snapshot> GetTimestampedSnapshot() const override {
+    return snapshot_;
+  }
+
+  virtual void SetSnapshot() override;
+  void SetSnapshotOnNextOperation(
+      std::shared_ptr<TransactionNotifier> notifier = nullptr) override;
+
+  void ClearSnapshot() override {
+    snapshot_.reset();
+    snapshot_needed_ = false;
+    snapshot_notifier_ = nullptr;
+  }
+
+  void DisableIndexing() override { indexing_enabled_ = false; }
+
+  void EnableIndexing() override { indexing_enabled_ = true; }
+
+  bool IndexingEnabled() const { return indexing_enabled_; }
+
+  uint64_t GetElapsedTime() const override;
+
+  uint64_t GetNumPuts() const override;
+
+  uint64_t GetNumDeletes() const override;
+
+  uint64_t GetNumMerges() const override;
+
+  uint64_t GetNumKeys() const override;
+
+  void UndoGetForUpdate(ColumnFamilyHandle* column_family,
+                        const Slice& key) override;
+  void UndoGetForUpdate(const Slice& key) override {
+    return UndoGetForUpdate(nullptr, key);
+  };
+
+  WriteOptions* GetWriteOptions() override { return &write_options_; }
+
+  void SetWriteOptions(const WriteOptions& write_options) override {
+    write_options_ = write_options;
+  }
+
+  // Used for memory management for snapshot_
+  void ReleaseSnapshot(const Snapshot* snapshot, DB* db);
+
+  // iterates over the given batch and makes the appropriate inserts.
+  // used for rebuilding prepared transactions after recovery.
+  virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override;
+
+  WriteBatch* GetCommitTimeWriteBatch() override;
+
+  LockTracker& GetTrackedLocks() { return *tracked_locks_; }
+
+ protected:
+  // Add a key to the list of tracked keys.
+  //
+  // seqno is the earliest seqno this key was involved with this transaction.
+  // readonly should be set to true if no data was written for this key
+  void TrackKey(uint32_t cfh_id, const std::string& key, SequenceNumber seqno,
+                bool readonly, bool exclusive);
+
+  // Called when UndoGetForUpdate determines that this key can be unlocked.
+  virtual void UnlockGetForUpdate(ColumnFamilyHandle* column_family,
+                                  const Slice& key) = 0;
+
+  // Sets a snapshot if SetSnapshotOnNextOperation() has been called.
+  void SetSnapshotIfNeeded();
+
+  // Initialize write_batch_ for 2PC by inserting Noop.
+  inline void InitWriteBatch(bool clear = false) {
+    if (clear) {
+      write_batch_.Clear();
+    }
+    assert(write_batch_.GetDataSize() == WriteBatchInternal::kHeader);
+    auto s = WriteBatchInternal::InsertNoop(write_batch_.GetWriteBatch());
+    assert(s.ok());
+  }
+
+  WriteBatchBase* GetBatchForWrite();
+
+  DB* db_;
+  DBImpl* dbimpl_;
+
+  WriteOptions write_options_;
+
+  const Comparator* cmp_;
+
+  const LockTrackerFactory& lock_tracker_factory_;
+
+  // Stores that time the txn was constructed, in microseconds.
+  uint64_t start_time_;
+
+  // Stores the current snapshot that was set by SetSnapshot or null if
+  // no snapshot is currently set.
+  std::shared_ptr<const Snapshot> snapshot_;
+
+  // Count of various operations pending in this transaction
+  uint64_t num_puts_ = 0;
+  uint64_t num_deletes_ = 0;
+  uint64_t num_merges_ = 0;
+
+  struct SavePoint {
+    std::shared_ptr<const Snapshot> snapshot_;
+    bool snapshot_needed_ = false;
+    std::shared_ptr<TransactionNotifier> snapshot_notifier_;
+    uint64_t num_puts_ = 0;
+    uint64_t num_deletes_ = 0;
+    uint64_t num_merges_ = 0;
+
+    // Record all locks tracked since the last savepoint
+    std::shared_ptr<LockTracker> new_locks_;
+
+    SavePoint(std::shared_ptr<const Snapshot> snapshot, bool snapshot_needed,
+              std::shared_ptr<TransactionNotifier> snapshot_notifier,
+              uint64_t num_puts, uint64_t num_deletes, uint64_t num_merges,
+              const LockTrackerFactory& lock_tracker_factory)
+        : snapshot_(snapshot),
+          snapshot_needed_(snapshot_needed),
+          snapshot_notifier_(snapshot_notifier),
+          num_puts_(num_puts),
+          num_deletes_(num_deletes),
+          num_merges_(num_merges),
+          new_locks_(lock_tracker_factory.Create()) {}
+
+    explicit SavePoint(const LockTrackerFactory& lock_tracker_factory)
+        : new_locks_(lock_tracker_factory.Create()) {}
+  };
+
+  // Records writes pending in this transaction
+  WriteBatchWithIndex write_batch_;
+
+  // For Pessimistic Transactions this is the set of acquired locks.
+  // Optimistic Transactions will keep note the requested locks (not actually
+  // locked), and do conflict checking until commit time based on the tracked
+  // lock requests.
+  std::unique_ptr<LockTracker> tracked_locks_;
+
+  // Stack of the Snapshot saved at each save point. Saved snapshots may be
+  // nullptr if there was no snapshot at the time SetSavePoint() was called.
+  std::unique_ptr<std::stack<TransactionBaseImpl::SavePoint,
+                             autovector<TransactionBaseImpl::SavePoint>>>
+      save_points_;
+
+ private:
+  friend class WriteCommittedTxn;
+  friend class WritePreparedTxn;
+
+  // Extra data to be persisted with the commit. Note this is only used when
+  // prepare phase is not skipped.
+  WriteBatch commit_time_batch_;
+
+  // If true, future Put/Merge/Deletes will be indexed in the
+  // WriteBatchWithIndex.
+  // If false, future Put/Merge/Deletes will be inserted directly into the
+  // underlying WriteBatch and not indexed in the WriteBatchWithIndex.
+  bool indexing_enabled_;
+
+  // SetSnapshotOnNextOperation() has been called and the snapshot has not yet
+  // been reset.
+  bool snapshot_needed_ = false;
+
+  // SetSnapshotOnNextOperation() has been called and the caller would like
+  // a notification through the TransactionNotifier interface
+  std::shared_ptr<TransactionNotifier> snapshot_notifier_ = nullptr;
+
+  Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key,
+                 bool read_only, bool exclusive, const bool do_validate = true,
+                 const bool assume_tracked = false);
+
+  void SetSnapshotInternal(const Snapshot* snapshot);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc
new file mode 100644
index 000000000..345c4be90
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc
@@ -0,0 +1,135 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+
+#include "rocksdb/utilities/transaction_db_mutex.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionDBMutexImpl : public TransactionDBMutex {
+ public:
+  TransactionDBMutexImpl() {}
+  ~TransactionDBMutexImpl() override {}
+
+  Status Lock() override;
+
+  Status TryLockFor(int64_t timeout_time) override;
+
+  void UnLock() override { mutex_.unlock(); }
+
+  friend class TransactionDBCondVarImpl;
+
+ private:
+  std::mutex mutex_;
+};
+
+class TransactionDBCondVarImpl : public TransactionDBCondVar {
+ public:
+  TransactionDBCondVarImpl() {}
+  ~TransactionDBCondVarImpl() override {}
+
+  Status Wait(std::shared_ptr<TransactionDBMutex> mutex) override;
+
+  Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+                 int64_t timeout_time) override;
+
+  void Notify() override { cv_.notify_one(); }
+
+  void NotifyAll() override { cv_.notify_all(); }
+
+ private:
+  std::condition_variable cv_;
+};
+
+std::shared_ptr<TransactionDBMutex>
+TransactionDBMutexFactoryImpl::AllocateMutex() {
+  return std::shared_ptr<TransactionDBMutex>(new TransactionDBMutexImpl());
+}
+
+std::shared_ptr<TransactionDBCondVar>
+TransactionDBMutexFactoryImpl::AllocateCondVar() {
+  return std::shared_ptr<TransactionDBCondVar>(new TransactionDBCondVarImpl());
+}
+
+Status TransactionDBMutexImpl::Lock() {
+  mutex_.lock();
+  return Status::OK();
+}
+
+Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) {
+  bool locked = true;
+
+  if (timeout_time == 0) {
+    locked = mutex_.try_lock();
+  } else {
+    // Previously, this code used a std::timed_mutex.  However, this was changed
+    // due to known bugs in gcc versions < 4.9.
+    // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54562
+    //
+    // Since this mutex isn't held for long and only a single mutex is ever
+    // held at a time, it is reasonable to ignore the lock timeout_time here
+    // and only check it when waiting on the condition_variable.
+    mutex_.lock();
+  }
+
+  if (!locked) {
+    // timeout acquiring mutex
+    return Status::TimedOut(Status::SubCode::kMutexTimeout);
+  }
+
+  return Status::OK();
+}
+
+Status TransactionDBCondVarImpl::Wait(
+    std::shared_ptr<TransactionDBMutex> mutex) {
+  auto mutex_impl = reinterpret_cast<TransactionDBMutexImpl*>(mutex.get());
+
+  std::unique_lock<std::mutex> lock(mutex_impl->mutex_, std::adopt_lock);
+  cv_.wait(lock);
+
+  // Make sure unique_lock doesn't unlock mutex when it destructs
+  lock.release();
+
+  return Status::OK();
+}
+
+Status TransactionDBCondVarImpl::WaitFor(
+    std::shared_ptr<TransactionDBMutex> mutex, int64_t timeout_time) {
+  Status s;
+
+  auto mutex_impl = reinterpret_cast<TransactionDBMutexImpl*>(mutex.get());
+  std::unique_lock<std::mutex> lock(mutex_impl->mutex_, std::adopt_lock);
+
+  if (timeout_time < 0) {
+    // If timeout is negative, do not use a timeout
+    cv_.wait(lock);
+  } else {
+    auto duration = std::chrono::microseconds(timeout_time);
+    auto cv_status = cv_.wait_for(lock, duration);
+
+    // Check if the wait stopped due to timing out.
+    if (cv_status == std::cv_status::timeout) {
+      s = Status::TimedOut(Status::SubCode::kMutexTimeout);
+    }
+  }
+
+  // Make sure unique_lock doesn't unlock mutex when it destructs
+  lock.release();
+
+  // CV was signaled, or we spuriously woke up (but didn't time out)
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h
new file mode 100644
index 000000000..fbee92832
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h
@@ -0,0 +1,26 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/transaction_db_mutex.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TransactionDBMutex;
+class TransactionDBCondVar;
+
+// Default implementation of TransactionDBMutexFactory.  May be overridden
+// by TransactionDBOptions.custom_mutex_factory.
+class TransactionDBMutexFactoryImpl : public TransactionDBMutexFactory {
+ public:
+  std::shared_ptr<TransactionDBMutex> AllocateMutex() override;
+  std::shared_ptr<TransactionDBCondVar> AllocateCondVar() override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_test.cc b/src/rocksdb/utilities/transactions/transaction_test.cc
new file mode 100644
index 000000000..caf1566b9
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_test.cc
@@ -0,0 +1,6550 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_test.h"
+
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "table/mock_table.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+INSTANTIATE_TEST_CASE_P(
+    DBAsBaseDB, TransactionTest,
+    ::testing::Values(
+        std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
+INSTANTIATE_TEST_CASE_P(
+    DBAsBaseDB, TransactionStressTest,
+    ::testing::Values(
+        std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
+INSTANTIATE_TEST_CASE_P(
+    StackableDBAsBaseDB, TransactionTest,
+    ::testing::Values(
+        std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite),
+        std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite),
+        std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite)));
+
+// MySQLStyleTransactionTest takes far too long for valgrind to run. Only do it
+// in full mode (`ROCKSDB_FULL_VALGRIND_RUN` compiler flag is set).
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+    MySQLStyleTransactionTest, MySQLStyleTransactionTest,
+    ::testing::Values(
+        std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false),
+        std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true),
+        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false),
+        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true),
+        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false),
+        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true)));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(TransactionTest, DoubleEmptyWrite) {
+  WriteOptions write_options;
+  write_options.sync = true;
+  write_options.disableWAL = false;
+
+  WriteBatch batch;
+
+  ASSERT_OK(db->Write(write_options, &batch));
+  ASSERT_OK(db->Write(write_options, &batch));
+
+  // Also test committing empty transactions in 2PC
+  TransactionOptions txn_options;
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_OK(txn0->SetName("xid"));
+  ASSERT_OK(txn0->Prepare());
+  ASSERT_OK(txn0->Commit());
+  delete txn0;
+
+  // Also test that it works during recovery
+  txn0 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_OK(txn0->SetName("xid2"));
+  ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
+  ASSERT_OK(txn0->Prepare());
+  delete txn0;
+  reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+  ASSERT_OK(ReOpenNoDelete());
+  assert(db != nullptr);
+  txn0 = db->GetTransactionByName("xid2");
+  ASSERT_OK(txn0->Commit());
+  delete txn0;
+}
+
+TEST_P(TransactionTest, SuccessTest) {
+  ASSERT_OK(db->ResetStats());
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+
+  ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+  ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+  Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
+  ASSERT_TRUE(txn);
+
+  ASSERT_EQ(0, txn->GetNumPuts());
+  ASSERT_LE(0, txn->GetID());
+
+  ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+  ASSERT_EQ(value, "bar");
+
+  ASSERT_OK(txn->Put(Slice("foo"), Slice("bar2")));
+
+  ASSERT_EQ(1, txn->GetNumPuts());
+
+  ASSERT_OK(txn->GetForUpdate(read_options, "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  ASSERT_OK(txn->Commit());
+
+  ASSERT_OK(db->Get(read_options, "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, SwitchMemtableDuringPrepareAndCommit_WC) {
+  const TxnDBWritePolicy write_policy = std::get<2>(GetParam());
+
+  if (write_policy != TxnDBWritePolicy::WRITE_COMMITTED) {
+    ROCKSDB_GTEST_BYPASS("Test applies to write-committed only");
+    return;
+  }
+
+  ASSERT_OK(db->Put(WriteOptions(), "key0", "value"));
+
+  TransactionOptions txn_opts;
+  txn_opts.use_only_the_last_commit_time_batch_for_recovery = true;
+  Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opts);
+  assert(txn);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table", [&](void* arg) {
+        // db mutex not held.
+        auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
+        assert(mems);
+        ASSERT_EQ(1, mems->size());
+        auto* ctwb = txn->GetCommitTimeWriteBatch();
+        ASSERT_OK(ctwb->Put("gtid", "123"));
+        ASSERT_OK(txn->Commit());
+        delete txn;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(txn->Put("key1", "value"));
+  ASSERT_OK(txn->SetName("txn1"));
+
+  ASSERT_OK(txn->Prepare());
+
+  auto dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  ASSERT_OK(dbimpl->TEST_SwitchMemtable(nullptr));
+  ASSERT_OK(dbimpl->TEST_FlushMemTable(
+      /*wait=*/false, /*allow_write_stall=*/true, /*cfh=*/nullptr));
+
+  ASSERT_OK(dbimpl->TEST_WaitForFlushMemTable());
+
+  {
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
+    ASSERT_EQ("value", value);
+  }
+
+  delete db;
+  db = nullptr;
+  Status s;
+  if (use_stackable_db_ == false) {
+    s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  } else {
+    s = OpenWithStackableDB();
+  }
+  ASSERT_OK(s);
+  assert(db);
+
+  {
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), "gtid", &value));
+    ASSERT_EQ("123", value);
+
+    ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
+    ASSERT_EQ("value", value);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// The test clarifies the contract of do_validate and assume_tracked
+// in GetForUpdate and Put/Merge/Delete
+TEST_P(TransactionTest, AssumeExclusiveTracked) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+  TransactionOptions txn_options;
+  txn_options.lock_timeout = 1;
+  const bool EXCLUSIVE = true;
+  const bool DO_VALIDATE = true;
+  const bool ASSUME_LOCKED = true;
+
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+  txn->SetSnapshot();
+
+  // commit a value after the snapshot is taken
+  ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+
+  // By default write should fail to the commit after our snapshot
+  s = txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE);
+  ASSERT_TRUE(s.IsBusy());
+  // But the user could direct the db to skip validating the snapshot. The read
+  // value then should be the most recently committed
+  ASSERT_OK(
+      txn->GetForUpdate(read_options, "foo", &value, EXCLUSIVE, !DO_VALIDATE));
+  ASSERT_EQ(value, "bar");
+
+  // Although ValidateSnapshot is skipped the key must have still got locked
+  s = db->Put(write_options, Slice("foo"), Slice("bar"));
+  ASSERT_TRUE(s.IsTimedOut());
+
+  // By default the write operations should fail due to the commit after the
+  // snapshot
+  s = txn->Put(Slice("foo"), Slice("bar1"));
+  ASSERT_TRUE(s.IsBusy());
+  s = txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"),
+               !ASSUME_LOCKED);
+  ASSERT_TRUE(s.IsBusy());
+  // But the user could direct the db that it already assumes exclusive lock on
+  // the key due to the previous GetForUpdate call.
+  ASSERT_OK(txn->Put(db->DefaultColumnFamily(), Slice("foo"), Slice("bar1"),
+                     ASSUME_LOCKED));
+  ASSERT_OK(txn->Merge(db->DefaultColumnFamily(), Slice("foo"), Slice("bar2"),
+                       ASSUME_LOCKED));
+  ASSERT_OK(
+      txn->Delete(db->DefaultColumnFamily(), Slice("foo"), ASSUME_LOCKED));
+  ASSERT_OK(txn->SingleDelete(db->DefaultColumnFamily(), Slice("foo"),
+                              ASSUME_LOCKED));
+
+  ASSERT_OK(txn->Rollback());
+  delete txn;
+}
+
+// This test clarifies the contract of ValidateSnapshot
+TEST_P(TransactionTest, ValidateSnapshotTest) {
+  for (bool with_flush : {true}) {
+    for (bool with_2pc : {true}) {
+      ASSERT_OK(ReOpen());
+      WriteOptions write_options;
+      ReadOptions read_options;
+      std::string value;
+
+      assert(db != nullptr);
+      Transaction* txn1 =
+          db->BeginTransaction(write_options, TransactionOptions());
+      ASSERT_TRUE(txn1);
+      ASSERT_OK(txn1->Put(Slice("foo"), Slice("bar1")));
+      if (with_2pc) {
+        ASSERT_OK(txn1->SetName("xid1"));
+        ASSERT_OK(txn1->Prepare());
+      }
+
+      if (with_flush) {
+        auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+        ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+        // Make sure the flushed memtable is not kept in memory
+        int max_memtable_in_history =
+            std::max(
+                options.max_write_buffer_number,
+                static_cast<int>(options.max_write_buffer_size_to_maintain) /
+                    static_cast<int>(options.write_buffer_size)) +
+            1;
+        for (int i = 0; i < max_memtable_in_history; i++) {
+          ASSERT_OK(db->Put(write_options, Slice("key"), Slice("value")));
+          ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+        }
+      }
+
+      Transaction* txn2 =
+          db->BeginTransaction(write_options, TransactionOptions());
+      ASSERT_TRUE(txn2);
+      txn2->SetSnapshot();
+
+      ASSERT_OK(txn1->Commit());
+      delete txn1;
+
+      auto pes_txn2 = dynamic_cast<PessimisticTransaction*>(txn2);
+      // Test the simple case where the key is not tracked yet
+      auto trakced_seq = kMaxSequenceNumber;
+      auto s = pes_txn2->ValidateSnapshot(db->DefaultColumnFamily(), "foo",
+                                          &trakced_seq);
+      ASSERT_TRUE(s.IsBusy());
+      delete txn2;
+    }
+  }
+}
+
+TEST_P(TransactionTest, WaitingTxn) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  txn_options.lock_timeout = 1;
+  s = db->Put(write_options, Slice("foo"), Slice("bar"));
+  ASSERT_OK(s);
+
+  /* create second cf */
+  ColumnFamilyHandle* cfa;
+  ColumnFamilyOptions cf_options;
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->Put(write_options, cfa, Slice("foo"), Slice("bar"));
+  ASSERT_OK(s);
+
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  TransactionID id1 = txn1->GetID();
+  ASSERT_TRUE(txn1);
+  ASSERT_TRUE(txn2);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PointLockManager::AcquireWithTimeout:WaitingTxn", [&](void* /*arg*/) {
+        std::string key;
+        uint32_t cf_id;
+        std::vector<TransactionID> wait = txn2->GetWaitingTxns(&cf_id, &key);
+        ASSERT_EQ(key, "foo");
+        ASSERT_EQ(wait.size(), 1);
+        ASSERT_EQ(wait[0], id1);
+        ASSERT_EQ(cf_id, 0U);
+      });
+
+  get_perf_context()->Reset();
+  // lock key in default cf
+  s = txn1->GetForUpdate(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar");
+  ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);
+
+  // lock key in cfa
+  s = txn1->GetForUpdate(read_options, cfa, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar");
+  ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);
+
+  auto lock_data = db->GetLockStatusData();
+  // Locked keys exist in both column family.
+  ASSERT_EQ(lock_data.size(), 2);
+
+  auto cf_iterator = lock_data.begin();
+
+  // The iterator points to an unordered_multimap
+  // thus the test can not assume any particular order.
+
+  // Column family is 1 or 0 (cfa).
+  if (cf_iterator->first != 1 && cf_iterator->first != 0) {
+    FAIL();
+  }
+  // The locked key is "foo" and is locked by txn1
+  ASSERT_EQ(cf_iterator->second.key, "foo");
+  ASSERT_EQ(cf_iterator->second.ids.size(), 1);
+  ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID());
+
+  cf_iterator++;
+
+  // Column family is 0 (default) or 1.
+  if (cf_iterator->first != 1 && cf_iterator->first != 0) {
+    FAIL();
+  }
+  // The locked key is "foo" and is locked by txn1
+  ASSERT_EQ(cf_iterator->second.key, "foo");
+  ASSERT_EQ(cf_iterator->second.ids.size(), 1);
+  ASSERT_EQ(cf_iterator->second.ids[0], txn1->GetID());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  s = txn2->GetForUpdate(read_options, "foo", &value);
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+  ASSERT_EQ(get_perf_context()->key_lock_wait_count, 1);
+  ASSERT_GE(get_perf_context()->key_lock_wait_time, 0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  delete cfa;
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, SharedLocks) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  Status s;
+
+  txn_options.lock_timeout = 1;
+  s = db->Put(write_options, Slice("foo"), Slice("bar"));
+  ASSERT_OK(s);
+
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+  ASSERT_TRUE(txn2);
+  ASSERT_TRUE(txn3);
+
+  // Test shared access between txns
+  s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_OK(s);
+
+  s = txn3->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_OK(s);
+
+  auto lock_data = db->GetLockStatusData();
+  ASSERT_EQ(lock_data.size(), 1);
+
+  auto cf_iterator = lock_data.begin();
+  ASSERT_EQ(cf_iterator->second.key, "foo");
+
+  // We compare whether the set of txns locking this key is the same. To do
+  // this, we need to sort both vectors so that the comparison is done
+  // correctly.
+  std::vector<TransactionID> expected_txns = {txn1->GetID(), txn2->GetID(),
+                                              txn3->GetID()};
+  std::vector<TransactionID> lock_txns = cf_iterator->second.ids;
+  ASSERT_EQ(expected_txns, lock_txns);
+  ASSERT_FALSE(cf_iterator->second.exclusive);
+
+  ASSERT_OK(txn1->Rollback());
+  ASSERT_OK(txn2->Rollback());
+  ASSERT_OK(txn3->Rollback());
+
+  // Test txn1 and txn2 sharing a lock and txn3 trying to obtain it.
+  s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_OK(s);
+
+  s = txn3->GetForUpdate(read_options, "foo", nullptr);
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  txn1->UndoGetForUpdate("foo");
+  s = txn3->GetForUpdate(read_options, "foo", nullptr);
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  txn2->UndoGetForUpdate("foo");
+  s = txn3->GetForUpdate(read_options, "foo", nullptr);
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->Rollback());
+  ASSERT_OK(txn2->Rollback());
+  ASSERT_OK(txn3->Rollback());
+
+  // Test txn1 and txn2 sharing a lock and txn2 trying to upgrade lock.
+  s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options, "foo", nullptr);
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  txn1->UndoGetForUpdate("foo");
+  s = txn2->GetForUpdate(read_options, "foo", nullptr);
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->Rollback());
+  ASSERT_OK(txn2->Rollback());
+
+  // Test txn1 trying to downgrade its lock.
+  s = txn1->GetForUpdate(read_options, "foo", nullptr, true /* exclusive */);
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  // Should still fail after "downgrading".
+  s = txn1->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  ASSERT_OK(txn1->Rollback());
+  ASSERT_OK(txn2->Rollback());
+
+  // Test txn1 holding an exclusive lock and txn2 trying to obtain shared
+  // access.
+  s = txn1->GetForUpdate(read_options, "foo", nullptr);
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  txn1->UndoGetForUpdate("foo");
+  s = txn2->GetForUpdate(read_options, "foo", nullptr, false /* exclusive */);
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+  delete txn3;
+}
+
+TEST_P(TransactionTest, DeadlockCycleShared) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+
+  txn_options.lock_timeout = 1000000;
+  txn_options.deadlock_detect = true;
+
+  // Set up a wait for chain like this:
+  //
+  // Tn -> T(n*2)
+  // Tn -> T(n*2 + 1)
+  //
+  // So we have:
+  // T1 -> T2 -> T4 ...
+  //    |     |> T5 ...
+  //    |> T3 -> T6 ...
+  //          |> T7 ...
+  // up to T31, then T[16 - 31] -> T1.
+  // Note that Tn holds lock on floor(n / 2).
+
+  std::vector<Transaction*> txns(31);
+
+  for (uint32_t i = 0; i < 31; i++) {
+    txns[i] = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txns[i]);
+    auto s = txns[i]->GetForUpdate(read_options, std::to_string((i + 1) / 2),
+                                   nullptr, false /* exclusive */);
+    ASSERT_OK(s);
+  }
+
+  std::atomic<uint32_t> checkpoints(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PointLockManager::AcquireWithTimeout:WaitingTxn",
+      [&](void* /*arg*/) { checkpoints.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // We want the leaf transactions to block and hold everyone back.
+  std::vector<port::Thread> threads;
+  for (uint32_t i = 0; i < 15; i++) {
+    std::function<void()> blocking_thread = [&, i] {
+      auto s = txns[i]->GetForUpdate(read_options, std::to_string(i + 1),
+                                     nullptr, true /* exclusive */);
+      ASSERT_OK(s);
+      ASSERT_OK(txns[i]->Rollback());
+      delete txns[i];
+    };
+    threads.emplace_back(blocking_thread);
+  }
+
+  // Wait until all threads are waiting on each other.
+  while (checkpoints.load() != 15) {
+    /* sleep override */
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Complete the cycle T[16 - 31] -> T1
+  for (uint32_t i = 15; i < 31; i++) {
+    auto s =
+        txns[i]->GetForUpdate(read_options, "0", nullptr, true /* exclusive */);
+    ASSERT_TRUE(s.IsDeadlock());
+
+    // Calculate next buffer len, plateau at 5 when 5 records are inserted.
+    const uint32_t curr_dlock_buffer_len_ =
+        (i - 14 > kInitialMaxDeadlocks) ? kInitialMaxDeadlocks : (i - 14);
+
+    auto dlock_buffer = db->GetDeadlockInfoBuffer();
+    ASSERT_EQ(dlock_buffer.size(), curr_dlock_buffer_len_);
+    auto dlock_entry = dlock_buffer[0].path;
+    ASSERT_EQ(dlock_entry.size(), kInitialMaxDeadlocks);
+    int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time;
+    int64_t cur_deadlock_time = 0;
+    for (auto const& dl_path_rec : dlock_buffer) {
+      cur_deadlock_time = dl_path_rec.deadlock_time;
+      ASSERT_NE(cur_deadlock_time, 0);
+      ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time);
+      pre_deadlock_time = cur_deadlock_time;
+    }
+
+    int64_t curr_waiting_key = 0;
+
+    // Offset of each txn id from the root of the shared dlock tree's txn id.
+    int64_t offset_root = dlock_entry[0].m_txn_id - 1;
+    // Offset of the final entry in the dlock path from the root's txn id.
+    TransactionID leaf_id =
+        dlock_entry[dlock_entry.size() - 1].m_txn_id - offset_root;
+
+    for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
+      auto dl_node = *it;
+      ASSERT_EQ(dl_node.m_txn_id, offset_root + leaf_id);
+      ASSERT_EQ(dl_node.m_cf_id, 0U);
+      ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key));
+      ASSERT_EQ(dl_node.m_exclusive, true);
+
+      if (curr_waiting_key == 0) {
+        curr_waiting_key = leaf_id;
+      }
+      curr_waiting_key /= 2;
+      leaf_id /= 2;
+    }
+  }
+
+  // Rollback the leaf transaction.
+  for (uint32_t i = 15; i < 31; i++) {
+    ASSERT_OK(txns[i]->Rollback());
+    delete txns[i];
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // Downsize the buffer and verify the 3 latest deadlocks are preserved.
+  auto dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+  db->SetDeadlockInfoBufferSize(3);
+  auto dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+  ASSERT_EQ(dlock_buffer_after_resize.size(), 3);
+
+  for (uint32_t i = 0; i < dlock_buffer_after_resize.size(); i++) {
+    for (uint32_t j = 0; j < dlock_buffer_after_resize[i].path.size(); j++) {
+      ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id,
+                dlock_buffer_before_resize[i].path[j].m_txn_id);
+    }
+  }
+
+  // Upsize the buffer and verify the 3 latest dealocks are preserved.
+  dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+  db->SetDeadlockInfoBufferSize(5);
+  dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+  ASSERT_EQ(dlock_buffer_after_resize.size(), 3);
+
+  for (uint32_t i = 0; i < dlock_buffer_before_resize.size(); i++) {
+    for (uint32_t j = 0; j < dlock_buffer_before_resize[i].path.size(); j++) {
+      ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id,
+                dlock_buffer_before_resize[i].path[j].m_txn_id);
+    }
+  }
+
+  // Downsize to 0 and verify the size is consistent.
+  dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+  db->SetDeadlockInfoBufferSize(0);
+  dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+  ASSERT_EQ(dlock_buffer_after_resize.size(), 0);
+
+  // Upsize from 0 to verify the size is persistent.
+  dlock_buffer_before_resize = db->GetDeadlockInfoBuffer();
+  db->SetDeadlockInfoBufferSize(3);
+  dlock_buffer_after_resize = db->GetDeadlockInfoBuffer();
+  ASSERT_EQ(dlock_buffer_after_resize.size(), 0);
+
+  // Contrived case of shared lock of cycle size 2 to verify that a shared
+  // lock causing a deadlock is correctly reported as "shared" in the buffer.
+  std::vector<Transaction*> txns_shared(2);
+
+  // Create a cycle of size 2.
+  for (uint32_t i = 0; i < 2; i++) {
+    txns_shared[i] = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txns_shared[i]);
+    auto s =
+        txns_shared[i]->GetForUpdate(read_options, std::to_string(i), nullptr);
+    ASSERT_OK(s);
+  }
+
+  std::atomic<uint32_t> checkpoints_shared(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PointLockManager::AcquireWithTimeout:WaitingTxn",
+      [&](void* /*arg*/) { checkpoints_shared.fetch_add(1); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<port::Thread> threads_shared;
+  for (uint32_t i = 0; i < 1; i++) {
+    std::function<void()> blocking_thread = [&, i] {
+      auto s = txns_shared[i]->GetForUpdate(read_options, std::to_string(i + 1),
+                                            nullptr);
+      ASSERT_OK(s);
+      ASSERT_OK(txns_shared[i]->Rollback());
+      delete txns_shared[i];
+    };
+    threads_shared.emplace_back(blocking_thread);
+  }
+
+  // Wait until all threads are waiting on each other.
+  while (checkpoints_shared.load() != 1) {
+    /* sleep override */
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Complete the cycle T2 -> T1 with a shared lock.
+  auto s = txns_shared[1]->GetForUpdate(read_options, "0", nullptr, false);
+  ASSERT_TRUE(s.IsDeadlock());
+
+  auto dlock_buffer = db->GetDeadlockInfoBuffer();
+
+  // Verify the size of the buffer and the single path.
+  ASSERT_EQ(dlock_buffer.size(), 1);
+  ASSERT_EQ(dlock_buffer[0].path.size(), 2);
+
+  // Verify the exclusivity field of the transactions in the deadlock path.
+  ASSERT_TRUE(dlock_buffer[0].path[0].m_exclusive);
+  ASSERT_FALSE(dlock_buffer[0].path[1].m_exclusive);
+  ASSERT_OK(txns_shared[1]->Rollback());
+  delete txns_shared[1];
+
+  for (auto& t : threads_shared) {
+    t.join();
+  }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(TransactionStressTest, DeadlockCycle) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+
+  // offset by 2 from the max depth to test edge case
+  const uint32_t kMaxCycleLength = 52;
+
+  txn_options.lock_timeout = 1000000;
+  txn_options.deadlock_detect = true;
+
+  for (uint32_t len = 2; len < kMaxCycleLength; len++) {
+    // Set up a long wait for chain like this:
+    //
+    // T1 -> T2 -> T3 -> ... -> Tlen
+
+    std::vector<Transaction*> txns(len);
+
+    for (uint32_t i = 0; i < len; i++) {
+      txns[i] = db->BeginTransaction(write_options, txn_options);
+      ASSERT_TRUE(txns[i]);
+      auto s = txns[i]->GetForUpdate(read_options, std::to_string(i), nullptr);
+      ASSERT_OK(s);
+    }
+
+    std::atomic<uint32_t> checkpoints(0);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "PointLockManager::AcquireWithTimeout:WaitingTxn",
+        [&](void* /*arg*/) { checkpoints.fetch_add(1); });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    // We want the last transaction in the chain to block and hold everyone
+    // back.
+    std::vector<port::Thread> threads;
+    for (uint32_t i = 0; i + 1 < len; i++) {
+      std::function<void()> blocking_thread = [&, i] {
+        auto s =
+            txns[i]->GetForUpdate(read_options, std::to_string(i + 1), nullptr);
+        ASSERT_OK(s);
+        ASSERT_OK(txns[i]->Rollback());
+        delete txns[i];
+      };
+      threads.emplace_back(blocking_thread);
+    }
+
+    // Wait until all threads are waiting on each other.
+    while (checkpoints.load() != len - 1) {
+      /* sleep override */
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    // Complete the cycle Tlen -> T1
+    auto s = txns[len - 1]->GetForUpdate(read_options, "0", nullptr);
+    ASSERT_TRUE(s.IsDeadlock());
+
+    const uint32_t dlock_buffer_size_ = (len - 1 > 5) ? 5 : (len - 1);
+    uint32_t curr_waiting_key = 0;
+    TransactionID curr_txn_id = txns[0]->GetID();
+
+    auto dlock_buffer = db->GetDeadlockInfoBuffer();
+    ASSERT_EQ(dlock_buffer.size(), dlock_buffer_size_);
+    uint32_t check_len = len;
+    bool check_limit_flag = false;
+
+    // Special case for a deadlock path that exceeds the maximum depth.
+    if (len > 50) {
+      check_len = 0;
+      check_limit_flag = true;
+    }
+    auto dlock_entry = dlock_buffer[0].path;
+    ASSERT_EQ(dlock_entry.size(), check_len);
+    ASSERT_EQ(dlock_buffer[0].limit_exceeded, check_limit_flag);
+
+    int64_t pre_deadlock_time = dlock_buffer[0].deadlock_time;
+    int64_t cur_deadlock_time = 0;
+    for (auto const& dl_path_rec : dlock_buffer) {
+      cur_deadlock_time = dl_path_rec.deadlock_time;
+      ASSERT_NE(cur_deadlock_time, 0);
+      ASSERT_TRUE(cur_deadlock_time <= pre_deadlock_time);
+      pre_deadlock_time = cur_deadlock_time;
+    }
+
+    // Iterates backwards over path verifying decreasing txn_ids.
+    for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); ++it) {
+      auto dl_node = *it;
+      ASSERT_EQ(dl_node.m_txn_id, len + curr_txn_id - 1);
+      ASSERT_EQ(dl_node.m_cf_id, 0u);
+      ASSERT_EQ(dl_node.m_waiting_key, std::to_string(curr_waiting_key));
+      ASSERT_EQ(dl_node.m_exclusive, true);
+
+      curr_txn_id--;
+      if (curr_waiting_key == 0) {
+        curr_waiting_key = len;
+      }
+      curr_waiting_key--;
+    }
+
+    // Rollback the last transaction.
+    ASSERT_OK(txns[len - 1]->Rollback());
+    delete txns[len - 1];
+
+    for (auto& t : threads) {
+      t.join();
+    }
+  }
+}
+
+TEST_P(TransactionStressTest, DeadlockStress) {
+  const uint32_t NUM_TXN_THREADS = 10;
+  const uint32_t NUM_KEYS = 100;
+  const uint32_t NUM_ITERS = 1000;
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+
+  txn_options.lock_timeout = 1000000;
+  txn_options.deadlock_detect = true;
+  std::vector<std::string> keys;
+
+  for (uint32_t i = 0; i < NUM_KEYS; i++) {
+    ASSERT_OK(db->Put(write_options, Slice(std::to_string(i)), Slice("")));
+    keys.push_back(std::to_string(i));
+  }
+
+  size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
+  Random rnd(static_cast<uint32_t>(tid));
+  std::function<void(uint32_t)> stress_thread = [&](uint32_t seed) {
+    std::default_random_engine g(seed);
+
+    Transaction* txn;
+    for (uint32_t i = 0; i < NUM_ITERS; i++) {
+      txn = db->BeginTransaction(write_options, txn_options);
+      auto random_keys = keys;
+      std::shuffle(random_keys.begin(), random_keys.end(), g);
+
+      // Lock keys in random order.
+      for (const auto& k : random_keys) {
+        // Lock mostly for shared access, but exclusive 1/4 of the time.
+        auto s =
+            txn->GetForUpdate(read_options, k, nullptr, txn->GetID() % 4 == 0);
+        if (!s.ok()) {
+          ASSERT_TRUE(s.IsDeadlock());
+          ASSERT_OK(txn->Rollback());
+          break;
+        }
+      }
+
+      delete txn;
+    }
+  };
+
+  std::vector<port::Thread> threads;
+  for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) {
+    threads.emplace_back(stress_thread, rnd.Next());
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(TransactionTest, CommitTimeBatchFailTest) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+
+  std::string value;
+  Status s;
+
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  ASSERT_OK(txn1->GetCommitTimeWriteBatch()->Put("cat", "dog"));
+
+  s = txn1->Put("foo", "bar");
+  ASSERT_OK(s);
+
+  // fails due to non-empty commit-time batch
+  s = txn1->Commit();
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  delete txn1;
+}
+
+TEST_P(TransactionTest, LogMarkLeakTest) {
+  TransactionOptions txn_options;
+  WriteOptions write_options;
+  options.write_buffer_size = 1024;
+  ASSERT_OK(ReOpenNoDelete());
+  assert(db != nullptr);
+  Random rnd(47);
+  std::vector<Transaction*> txns;
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  // At the beginning there should be no log containing prepare data
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+  for (size_t i = 0; i < 100; i++) {
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn->SetName("xid" + std::to_string(i)));
+    ASSERT_OK(txn->Put(Slice("foo" + std::to_string(i)), Slice("bar")));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_GT(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+    if (rnd.OneIn(5)) {
+      txns.push_back(txn);
+    } else {
+      ASSERT_OK(txn->Commit());
+      delete txn;
+    }
+    ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+  }
+  for (auto txn : txns) {
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+  // At the end there should be no log left containing prepare data
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+  // Make sure that the underlying data structures are properly truncated and
+  // cause not leak
+  ASSERT_EQ(db_impl->TEST_PreparedSectionCompletedSize(), 0);
+  ASSERT_EQ(db_impl->TEST_LogsWithPrepSize(), 0);
+}
+
+TEST_P(TransactionTest, SimpleTwoPhaseTransactionTest) {
+  for (bool cwb4recovery : {true, false}) {
+    ASSERT_OK(ReOpen());
+    WriteOptions write_options;
+    ReadOptions read_options;
+
+    TransactionOptions txn_options;
+    txn_options.use_only_the_last_commit_time_batch_for_recovery = cwb4recovery;
+
+    std::string value;
+    Status s;
+
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    s = txn->SetName("xid");
+    ASSERT_OK(s);
+
+    ASSERT_EQ(db->GetTransactionByName("xid"), txn);
+
+    // transaction put
+    s = txn->Put(Slice("foo"), Slice("bar"));
+    ASSERT_OK(s);
+    ASSERT_EQ(1, txn->GetNumPuts());
+
+    // regular db put
+    s = db->Put(write_options, Slice("foo2"), Slice("bar2"));
+    ASSERT_OK(s);
+    ASSERT_EQ(1, txn->GetNumPuts());
+
+    // regular db read
+    ASSERT_OK(db->Get(read_options, "foo2", &value));
+    ASSERT_EQ(value, "bar2");
+
+    // commit time put
+    if (cwb4recovery) {
+      ASSERT_OK(
+          txn->GetCommitTimeWriteBatch()->Put(Slice("gtid"), Slice("dogs")));
+      ASSERT_OK(
+          txn->GetCommitTimeWriteBatch()->Put(Slice("gtid2"), Slice("cats")));
+    }
+
+    // nothing has been prepped yet
+    ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+    s = txn->Prepare();
+    ASSERT_OK(s);
+
+    // data not im mem yet
+    s = db->Get(read_options, Slice("foo"), &value);
+    ASSERT_TRUE(s.IsNotFound());
+    s = db->Get(read_options, Slice("gtid"), &value);
+    ASSERT_TRUE(s.IsNotFound());
+
+    // find trans in list of prepared transactions
+    std::vector<Transaction*> prepared_trans;
+    db->GetAllPreparedTransactions(&prepared_trans);
+    ASSERT_EQ(prepared_trans.size(), 1);
+    ASSERT_EQ(prepared_trans.front()->GetName(), "xid");
+
+    auto log_containing_prep =
+        db_impl->TEST_FindMinLogContainingOutstandingPrep();
+    ASSERT_GT(log_containing_prep, 0);
+
+    // make commit
+    s = txn->Commit();
+    ASSERT_OK(s);
+
+    // value is now available
+    s = db->Get(read_options, "foo", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ(value, "bar");
+
+    // we already committed
+    s = txn->Commit();
+    ASSERT_EQ(s, Status::InvalidArgument());
+
+    // no longer is prepared results
+    db->GetAllPreparedTransactions(&prepared_trans);
+    ASSERT_EQ(prepared_trans.size(), 0);
+    ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
+
+    // heap should not care about prepared section anymore
+    ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+    switch (txn_db_options.write_policy) {
+      case WRITE_COMMITTED:
+        // but now our memtable should be referencing the prep section
+        ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep());
+        ASSERT_EQ(log_containing_prep,
+                  db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+        break;
+      case WRITE_PREPARED:
+      case WRITE_UNPREPARED:
+        // In these modes memtable do not ref the prep sections
+        ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+        break;
+      default:
+        assert(false);
+    }
+
+    ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+    // After flush the recoverable state must be visible
+    if (cwb4recovery) {
+      s = db->Get(read_options, "gtid", &value);
+      ASSERT_OK(s);
+      ASSERT_EQ(value, "dogs");
+
+      s = db->Get(read_options, "gtid2", &value);
+      ASSERT_OK(s);
+      ASSERT_EQ(value, "cats");
+    }
+
+    // after memtable flush we can now relese the log
+    ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep);
+    ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+
+    delete txn;
+
+    if (cwb4recovery) {
+      // kill and reopen to trigger recovery
+      s = ReOpenNoDelete();
+      ASSERT_OK(s);
+      assert(db != nullptr);
+      s = db->Get(read_options, "gtid", &value);
+      ASSERT_OK(s);
+      ASSERT_EQ(value, "dogs");
+
+      s = db->Get(read_options, "gtid2", &value);
+      ASSERT_OK(s);
+      ASSERT_EQ(value, "cats");
+    }
+  }
+}
+
+TEST_P(TransactionTest, TwoPhaseNameTest) {
+  Status s;
+
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn3);
+  delete txn3;
+
+  // cant prepare txn without name
+  s = txn1->Prepare();
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  // name too short
+  s = txn1->SetName("");
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  // name too long
+  s = txn1->SetName(std::string(513, 'x'));
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  // valid set name
+  s = txn1->SetName("name1");
+  ASSERT_OK(s);
+
+  // cant have duplicate name
+  s = txn2->SetName("name1");
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  // shouldn't be able to prepare
+  s = txn2->Prepare();
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  // valid name set
+  s = txn2->SetName("name2");
+  ASSERT_OK(s);
+
+  // cant reset name
+  s = txn2->SetName("name3");
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  ASSERT_EQ(txn1->GetName(), "name1");
+  ASSERT_EQ(txn2->GetName(), "name2");
+
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+
+  // can't rename after prepare
+  s = txn1->SetName("name4");
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  ASSERT_OK(txn1->Rollback());
+  ASSERT_OK(txn2->Rollback());
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, TwoPhaseEmptyWriteTest) {
+  for (bool cwb4recovery : {true, false}) {
+    for (bool test_with_empty_wal : {true, false}) {
+      if (!cwb4recovery && test_with_empty_wal) {
+        continue;
+      }
+      ASSERT_OK(ReOpen());
+      Status s;
+      std::string value;
+
+      WriteOptions write_options;
+      ReadOptions read_options;
+      TransactionOptions txn_options;
+      txn_options.use_only_the_last_commit_time_batch_for_recovery =
+          cwb4recovery;
+      Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+      ASSERT_TRUE(txn1);
+      Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+      ASSERT_TRUE(txn2);
+
+      s = txn1->SetName("joe");
+      ASSERT_OK(s);
+
+      s = txn2->SetName("bob");
+      ASSERT_OK(s);
+
+      s = txn1->Prepare();
+      ASSERT_OK(s);
+
+      s = txn1->Commit();
+      ASSERT_OK(s);
+
+      delete txn1;
+
+      if (cwb4recovery) {
+        ASSERT_OK(
+            txn2->GetCommitTimeWriteBatch()->Put(Slice("foo"), Slice("bar")));
+      }
+
+      s = txn2->Prepare();
+      ASSERT_OK(s);
+
+      s = txn2->Commit();
+      ASSERT_OK(s);
+
+      delete txn2;
+      if (cwb4recovery) {
+        if (test_with_empty_wal) {
+          DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+          ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+          // After flush the state must be visible
+          s = db->Get(read_options, "foo", &value);
+          ASSERT_OK(s);
+          ASSERT_EQ(value, "bar");
+        }
+        ASSERT_OK(db->FlushWAL(true));
+        // kill and reopen to trigger recovery
+        s = ReOpenNoDelete();
+        ASSERT_OK(s);
+        assert(db != nullptr);
+        s = db->Get(read_options, "foo", &value);
+        ASSERT_OK(s);
+        ASSERT_EQ(value, "bar");
+      }
+    }
+  }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(TransactionStressTest, TwoPhaseExpirationTest) {
+  Status s;
+
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  txn_options.expiration = 500;  // 500ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+  ASSERT_TRUE(txn1);
+
+  s = txn1->SetName("joe");
+  ASSERT_OK(s);
+  s = txn2->SetName("bob");
+  ASSERT_OK(s);
+
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+
+  /* sleep override */
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Prepare();
+  ASSERT_EQ(s, Status::Expired());
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, TwoPhaseRollbackTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  TransactionOptions txn_options;
+
+  std::string value;
+  Status s;
+
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  s = txn->SetName("xid");
+  ASSERT_OK(s);
+
+  // transaction put
+  s = txn->Put(Slice("tfoo"), Slice("tbar"));
+  ASSERT_OK(s);
+
+  // value is readable form txn
+  s = txn->Get(read_options, Slice("tfoo"), &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "tbar");
+
+  // issue rollback
+  s = txn->Rollback();
+  ASSERT_OK(s);
+
+  // value is nolonger readable
+  s = txn->Get(read_options, Slice("tfoo"), &value);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ(txn->GetNumPuts(), 0);
+
+  // put new txn values
+  s = txn->Put(Slice("tfoo2"), Slice("tbar2"));
+  ASSERT_OK(s);
+
+  // new value is readable from txn
+  s = txn->Get(read_options, Slice("tfoo2"), &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "tbar2");
+
+  s = txn->Prepare();
+  ASSERT_OK(s);
+
+  // flush to next wal
+  s = db->Put(write_options, Slice("foo"), Slice("bar"));
+  ASSERT_OK(s);
+  ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+  // issue rollback (marker written to WAL)
+  s = txn->Rollback();
+  ASSERT_OK(s);
+
+  // value is nolonger readable
+  s = txn->Get(read_options, Slice("tfoo2"), &value);
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ(txn->GetNumPuts(), 0);
+
+  // make commit
+  s = txn->Commit();
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  // try rollback again
+  s = txn->Rollback();
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, PersistentTwoPhaseTransactionTest) {
+  WriteOptions write_options;
+  write_options.sync = true;
+  write_options.disableWAL = false;
+  ReadOptions read_options;
+
+  TransactionOptions txn_options;
+
+  std::string value;
+  Status s;
+
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  s = txn->SetName("xid");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(db->GetTransactionByName("xid"), txn);
+
+  // transaction put
+  s = txn->Put(Slice("foo"), Slice("bar"));
+  ASSERT_OK(s);
+  ASSERT_EQ(1, txn->GetNumPuts());
+
+  // txn read
+  s = txn->Get(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar");
+
+  // regular db put
+  s = db->Put(write_options, Slice("foo2"), Slice("bar2"));
+  ASSERT_OK(s);
+  ASSERT_EQ(1, txn->GetNumPuts());
+
+  ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+  // regular db read
+  db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(value, "bar2");
+
+  // nothing has been prepped yet
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+  // prepare
+  s = txn->Prepare();
+  ASSERT_OK(s);
+
+  // still not available to db
+  s = db->Get(read_options, Slice("foo"), &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(db->FlushWAL(false));
+  delete txn;
+  // kill and reopen
+  reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+  s = ReOpenNoDelete();
+  ASSERT_OK(s);
+  assert(db != nullptr);
+  db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+  // find trans in list of prepared transactions
+  std::vector<Transaction*> prepared_trans;
+  db->GetAllPreparedTransactions(&prepared_trans);
+  ASSERT_EQ(prepared_trans.size(), 1);
+
+  txn = prepared_trans.front();
+  ASSERT_TRUE(txn);
+  ASSERT_EQ(txn->GetName(), "xid");
+  ASSERT_EQ(db->GetTransactionByName("xid"), txn);
+
+  // log has been marked
+  auto log_containing_prep =
+      db_impl->TEST_FindMinLogContainingOutstandingPrep();
+  ASSERT_GT(log_containing_prep, 0);
+
+  // value is readable from txn
+  s = txn->Get(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar");
+
+  // make commit
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  // value is now available
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  // we already committed
+  s = txn->Commit();
+  ASSERT_EQ(s, Status::InvalidArgument());
+
+  // no longer is prepared results
+  prepared_trans.clear();
+  db->GetAllPreparedTransactions(&prepared_trans);
+  ASSERT_EQ(prepared_trans.size(), 0);
+
+  // transaction should no longer be visible
+  ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
+
+  // heap should not care about prepared section anymore
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // but now our memtable should be referencing the prep section
+      ASSERT_EQ(log_containing_prep,
+                db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      ASSERT_GE(log_containing_prep, db_impl->MinLogNumberToKeep());
+
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // In these modes memtable do not ref the prep sections
+      ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    default:
+      assert(false);
+  }
+
+  // Add a dummy record to memtable before a flush. Otherwise, the
+  // memtable will be empty and flush will be skipped.
+  s = db->Put(write_options, Slice("foo3"), Slice("bar3"));
+  ASSERT_OK(s);
+
+  ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+  // after memtable flush we can now release the log
+  ASSERT_GT(db_impl->MinLogNumberToKeep(), log_containing_prep);
+  ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+
+  delete txn;
+
+  // deleting transaction should unregister transaction
+  ASSERT_EQ(db->GetTransactionByName("xid"), nullptr);
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// TODO this test needs to be updated with serial commits
+TEST_P(TransactionTest, DISABLED_TwoPhaseMultiThreadTest) {
+  // mix transaction writes and regular writes
+  const uint32_t NUM_TXN_THREADS = 50;
+  std::atomic<uint32_t> txn_thread_num(0);
+
+  std::function<void()> txn_write_thread = [&]() {
+    uint32_t id = txn_thread_num.fetch_add(1);
+
+    WriteOptions write_options;
+    write_options.sync = true;
+    write_options.disableWAL = false;
+    TransactionOptions txn_options;
+    txn_options.lock_timeout = 1000000;
+    if (id % 2 == 0) {
+      txn_options.expiration = 1000000;
+    }
+    TransactionName name("xid_" + std::string(1, 'A' + static_cast<char>(id)));
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn->SetName(name));
+    for (int i = 0; i < 10; i++) {
+      std::string key(name + "_" + std::string(1, static_cast<char>('A' + i)));
+      ASSERT_OK(txn->Put(key, "val"));
+    }
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  };
+
+  // assure that all thread are in the same write group
+  std::atomic<uint32_t> t_wait_on_prepare(0);
+  std::atomic<uint32_t> t_wait_on_commit(0);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+        auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+        if (writer->ShouldWriteToWAL()) {
+          t_wait_on_prepare.fetch_add(1);
+          // wait for friends
+          while (t_wait_on_prepare.load() < NUM_TXN_THREADS) {
+            env->SleepForMicroseconds(10);
+          }
+        } else if (writer->ShouldWriteToMemtable()) {
+          t_wait_on_commit.fetch_add(1);
+          // wait for friends
+          while (t_wait_on_commit.load() < NUM_TXN_THREADS) {
+            env->SleepForMicroseconds(10);
+          }
+        } else {
+          FAIL();
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // do all the writes
+  std::vector<port::Thread> threads;
+  for (uint32_t i = 0; i < NUM_TXN_THREADS; i++) {
+    threads.emplace_back(txn_write_thread);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+  for (uint32_t t = 0; t < NUM_TXN_THREADS; t++) {
+    TransactionName name("xid_" + std::string(1, 'A' + static_cast<char>(t)));
+    for (int i = 0; i < 10; i++) {
+      std::string key(name + "_" + std::string(1, static_cast<char>('A' + i)));
+      s = db->Get(read_options, key, &value);
+      ASSERT_OK(s);
+      ASSERT_EQ(value, "val");
+    }
+  }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(TransactionStressTest, TwoPhaseLongPrepareTest) {
+  WriteOptions write_options;
+  write_options.sync = true;
+  write_options.disableWAL = false;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  s = txn->SetName("bob");
+  ASSERT_OK(s);
+
+  // transaction put
+  s = txn->Put(Slice("foo"), Slice("bar"));
+  ASSERT_OK(s);
+
+  // prepare
+  s = txn->Prepare();
+  ASSERT_OK(s);
+
+  delete txn;
+
+  for (int i = 0; i < 1000; i++) {
+    std::string key(i, 'k');
+    std::string val(1000, 'v');
+    assert(db != nullptr);
+    s = db->Put(write_options, key, val);
+    ASSERT_OK(s);
+
+    if (i % 29 == 0) {
+      // crash
+      env->SetFilesystemActive(false);
+      reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+      ReOpenNoDelete();
+    } else if (i % 37 == 0) {
+      // close
+      ReOpenNoDelete();
+    }
+  }
+
+  // commit old txn
+  txn = db->GetTransactionByName("bob");
+  ASSERT_TRUE(txn);
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  // verify data txn data
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(value, "bar");
+
+  // verify non txn data
+  for (int i = 0; i < 1000; i++) {
+    std::string key(i, 'k');
+    std::string val(1000, 'v');
+    s = db->Get(read_options, key, &value);
+    ASSERT_EQ(s, Status::OK());
+    ASSERT_EQ(value, val);
+  }
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, TwoPhaseSequenceTest) {
+  WriteOptions write_options;
+  write_options.sync = true;
+  write_options.disableWAL = false;
+  ReadOptions read_options;
+
+  TransactionOptions txn_options;
+
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  s = txn->SetName("xid");
+  ASSERT_OK(s);
+
+  // transaction put
+  s = txn->Put(Slice("foo"), Slice("bar"));
+  ASSERT_OK(s);
+  s = txn->Put(Slice("foo2"), Slice("bar2"));
+  ASSERT_OK(s);
+  s = txn->Put(Slice("foo3"), Slice("bar3"));
+  ASSERT_OK(s);
+  s = txn->Put(Slice("foo4"), Slice("bar4"));
+  ASSERT_OK(s);
+
+  // prepare
+  s = txn->Prepare();
+  ASSERT_OK(s);
+
+  // make commit
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+
+  // kill and reopen
+  env->SetFilesystemActive(false);
+  ReOpenNoDelete();
+  assert(db != nullptr);
+
+  // value is now available
+  s = db->Get(read_options, "foo4", &value);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(value, "bar4");
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) {
+  WriteOptions write_options;
+  write_options.sync = true;
+  write_options.disableWAL = false;
+  ReadOptions read_options;
+
+  TransactionOptions txn_options;
+
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  s = txn->SetName("a");
+  ASSERT_OK(s);
+
+  // transaction put
+  s = txn->Put(Slice("foo"), Slice("bar"));
+  ASSERT_OK(s);
+
+  // prepare
+  s = txn->Prepare();
+  ASSERT_OK(s);
+
+  delete txn;
+
+  // kill and reopen
+  env->SetFilesystemActive(false);
+  reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+  ReOpenNoDelete();
+
+  // commit old txn
+  assert(db != nullptr);  // Make clang analyze happy.
+  txn = db->GetTransactionByName("a");
+  assert(txn != nullptr);
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+
+  txn = db->BeginTransaction(write_options, txn_options);
+  s = txn->SetName("b");
+  ASSERT_OK(s);
+
+  s = txn->Put(Slice("foo2"), Slice("bar2"));
+  ASSERT_OK(s);
+
+  s = txn->Prepare();
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+
+  // kill and reopen
+  env->SetFilesystemActive(false);
+  ASSERT_OK(ReOpenNoDelete());
+  assert(db != nullptr);
+
+  // value is now available
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(value, "bar");
+
+  s = db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(s, Status::OK());
+  ASSERT_EQ(value, "bar2");
+}
+
+TEST_P(TransactionTest, TwoPhaseLogRollingTest) {
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+  Status s;
+  std::string v;
+  ColumnFamilyHandle *cfa, *cfb;
+
+  // Create 2 new column families
+  ColumnFamilyOptions cf_options;
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+  ASSERT_OK(s);
+
+  WriteOptions wopts;
+  wopts.disableWAL = false;
+  wopts.sync = true;
+
+  TransactionOptions topts1;
+  Transaction* txn1 = db->BeginTransaction(wopts, topts1);
+  s = txn1->SetName("xid1");
+  ASSERT_OK(s);
+
+  TransactionOptions topts2;
+  Transaction* txn2 = db->BeginTransaction(wopts, topts2);
+  s = txn2->SetName("xid2");
+  ASSERT_OK(s);
+
+  // transaction put in two column families
+  s = txn1->Put(cfa, "ka1", "va1");
+  ASSERT_OK(s);
+
+  // transaction put in two column families
+  s = txn2->Put(cfa, "ka2", "va2");
+  ASSERT_OK(s);
+  s = txn2->Put(cfb, "kb2", "vb2");
+  ASSERT_OK(s);
+
+  // write prep section to wal
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+
+  // our log should be in the heap
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+            txn1->GetLogNumber());
+  ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber());
+
+  // flush default cf to crate new log
+  s = db->Put(wopts, "foo", "bar");
+  ASSERT_OK(s);
+  s = db_impl->TEST_FlushMemTable(true);
+  ASSERT_OK(s);
+
+  // make sure we are on a new log
+  ASSERT_GT(db_impl->TEST_LogfileNumber(), txn1->GetLastLogNumber());
+
+  // put txn2 prep section in this log
+  s = txn2->Prepare();
+  ASSERT_OK(s);
+  ASSERT_EQ(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber());
+
+  // heap should still see first log
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+            txn1->GetLogNumber());
+
+  // commit txn1
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  // heap should now show txn2s log
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+            txn2->GetLogNumber());
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // we should see txn1s log refernced by the memtables
+      ASSERT_EQ(txn1->GetLogNumber(),
+                db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // In these modes memtable do not ref the prep sections
+      ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    default:
+      assert(false);
+  }
+
+  // flush default cf to crate new log
+  s = db->Put(wopts, "foo", "bar2");
+  ASSERT_OK(s);
+  s = db_impl->TEST_FlushMemTable(true);
+  ASSERT_OK(s);
+
+  // make sure we are on a new log
+  ASSERT_GT(db_impl->TEST_LogfileNumber(), txn2->GetLastLogNumber());
+
+  // commit txn2
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  // heap should not show any logs
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // should show the first txn log
+      ASSERT_EQ(txn1->GetLogNumber(),
+                db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // In these modes memtable do not ref the prep sections
+      ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    default:
+      assert(false);
+  }
+
+  // flush only cfa memtable
+  s = db_impl->TEST_FlushMemTable(true, false, cfa);
+  ASSERT_OK(s);
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // should show the first txn log
+      ASSERT_EQ(txn2->GetLogNumber(),
+                db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // In these modes memtable do not ref the prep sections
+      ASSERT_EQ(0, db_impl->TEST_FindMinPrepLogReferencedByMemTable());
+      break;
+    default:
+      assert(false);
+  }
+
+  // flush only cfb memtable
+  s = db_impl->TEST_FlushMemTable(true, false, cfb);
+  ASSERT_OK(s);
+
+  // should show not dependency on logs
+  ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+  delete txn1;
+  delete txn2;
+  delete cfa;
+  delete cfb;
+}
+
+TEST_P(TransactionTest, TwoPhaseLogRollingTest2) {
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+  Status s;
+  ColumnFamilyHandle *cfa, *cfb;
+
+  ColumnFamilyOptions cf_options;
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+  ASSERT_OK(s);
+
+  WriteOptions wopts;
+  wopts.disableWAL = false;
+  wopts.sync = true;
+
+  auto cfh_a = static_cast_with_check<ColumnFamilyHandleImpl>(cfa);
+  auto cfh_b = static_cast_with_check<ColumnFamilyHandleImpl>(cfb);
+
+  TransactionOptions topts1;
+  Transaction* txn1 = db->BeginTransaction(wopts, topts1);
+  s = txn1->SetName("xid1");
+  ASSERT_OK(s);
+  s = txn1->Put(cfa, "boys", "girls1");
+  ASSERT_OK(s);
+
+  Transaction* txn2 = db->BeginTransaction(wopts, topts1);
+  s = txn2->SetName("xid2");
+  ASSERT_OK(s);
+  s = txn2->Put(cfb, "up", "down1");
+  ASSERT_OK(s);
+
+  // prepre transaction in LOG A
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+
+  // prepre transaction in LOG A
+  s = txn2->Prepare();
+  ASSERT_OK(s);
+
+  // regular put so that mem table can actually be flushed for log rolling
+  s = db->Put(wopts, "cats", "dogs1");
+  ASSERT_OK(s);
+
+  auto prepare_log_no = txn1->GetLastLogNumber();
+
+  // roll to LOG B
+  s = db_impl->TEST_FlushMemTable(true);
+  ASSERT_OK(s);
+
+  // now we pause background work so that
+  // imm()s are not flushed before we can check their status
+  s = db_impl->PauseBackgroundWork();
+  ASSERT_OK(s);
+
+  ASSERT_GT(db_impl->TEST_LogfileNumber(), prepare_log_no);
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // This cf is empty and should ref the latest log
+      ASSERT_GT(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
+      ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), db_impl->TEST_LogfileNumber());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // This cf is not flushed yet and should ref the log that has its data
+      ASSERT_EQ(cfh_a->cfd()->GetLogNumber(), prepare_log_no);
+      break;
+    default:
+      assert(false);
+  }
+  ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+            txn1->GetLogNumber());
+  ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+
+  // commit in LOG B
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(),
+                prepare_log_no);
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // In these modes memtable do not ref the prep sections
+      ASSERT_EQ(db_impl->TEST_FindMinPrepLogReferencedByMemTable(), 0);
+      break;
+    default:
+      assert(false);
+  }
+
+  ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
+
+  // request a flush for all column families such that the earliest
+  // alive log file can be killed
+  ASSERT_OK(db_impl->TEST_SwitchWAL());
+  // log cannot be flushed because txn2 has not been commited
+  ASSERT_TRUE(!db_impl->TEST_IsLogGettingFlushed());
+  ASSERT_TRUE(db_impl->TEST_UnableToReleaseOldestLog());
+
+  // assert that cfa has a flush requested
+  ASSERT_TRUE(cfh_a->cfd()->imm()->HasFlushRequested());
+
+  switch (txn_db_options.write_policy) {
+    case WRITE_COMMITTED:
+      // cfb should not be flushed becuse it has no data from LOG A
+      ASSERT_TRUE(!cfh_b->cfd()->imm()->HasFlushRequested());
+      break;
+    case WRITE_PREPARED:
+    case WRITE_UNPREPARED:
+      // cfb should be flushed becuse it has prepared data from LOG A
+      ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
+      break;
+    default:
+      assert(false);
+  }
+
+  // cfb now has data from LOG A
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  ASSERT_OK(db_impl->TEST_SwitchWAL());
+  ASSERT_TRUE(!db_impl->TEST_UnableToReleaseOldestLog());
+
+  // we should see that cfb now has a flush requested
+  ASSERT_TRUE(cfh_b->cfd()->imm()->HasFlushRequested());
+
+  // all data in LOG A resides in a memtable that has been
+  // requested for a flush
+  ASSERT_TRUE(db_impl->TEST_IsLogGettingFlushed());
+
+  delete txn1;
+  delete txn2;
+  delete cfa;
+  delete cfb;
+}
+/*
+ * 1) use prepare to keep first log around to determine starting sequence
+ * during recovery.
+ * 2) insert many values, skipping wal, to increase seqid.
+ * 3) insert final value into wal
+ * 4) recover and see that final value was properly recovered - not
+ * hidden behind improperly summed sequence ids
+ */
+TEST_P(TransactionTest, TwoPhaseOutOfOrderDelete) {
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  WriteOptions wal_on, wal_off;
+  wal_on.sync = true;
+  wal_on.disableWAL = false;
+  wal_off.disableWAL = true;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+
+  std::string value;
+  Status s;
+
+  Transaction* txn1 = db->BeginTransaction(wal_on, txn_options);
+
+  s = txn1->SetName("1");
+  ASSERT_OK(s);
+
+  s = db->Put(wal_on, "first", "first");
+  ASSERT_OK(s);
+
+  s = txn1->Put(Slice("dummy"), Slice("dummy"));
+  ASSERT_OK(s);
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+
+  s = db->Put(wal_off, "cats", "dogs1");
+  ASSERT_OK(s);
+  s = db->Put(wal_off, "cats", "dogs2");
+  ASSERT_OK(s);
+  s = db->Put(wal_off, "cats", "dogs3");
+  ASSERT_OK(s);
+
+  s = db_impl->TEST_FlushMemTable(true);
+  ASSERT_OK(s);
+
+  s = db->Put(wal_on, "cats", "dogs4");
+  ASSERT_OK(s);
+
+  ASSERT_OK(db->FlushWAL(false));
+
+  // kill and reopen
+  env->SetFilesystemActive(false);
+  reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+  ASSERT_OK(ReOpenNoDelete());
+  assert(db != nullptr);
+
+  s = db->Get(read_options, "first", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "first");
+
+  s = db->Get(read_options, "cats", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "dogs4");
+}
+
+TEST_P(TransactionTest, FirstWriteTest) {
+  WriteOptions write_options;
+
+  // Test conflict checking against the very first write to a db.
+  // The transaction's snapshot will have seq 1 and the following write
+  // will have sequence 1.
+  Status s = db->Put(write_options, "A", "a");
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  ASSERT_OK(s);
+
+  s = txn->Put("A", "b");
+  ASSERT_OK(s);
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, FirstWriteTest2) {
+  WriteOptions write_options;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  // Test conflict checking against the very first write to a db.
+  // The transaction's snapshot is a seq 0 while the following write
+  // will have sequence 1.
+  Status s = db->Put(write_options, "A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("A", "b");
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, WriteOptionsTest) {
+  WriteOptions write_options;
+  write_options.sync = true;
+  write_options.disableWAL = true;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  ASSERT_TRUE(txn->GetWriteOptions()->sync);
+
+  write_options.sync = false;
+  txn->SetWriteOptions(write_options);
+  ASSERT_FALSE(txn->GetWriteOptions()->sync);
+  ASSERT_TRUE(txn->GetWriteOptions()->disableWAL);
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, WriteConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  ASSERT_OK(db->Put(write_options, "foo", "A"));
+  ASSERT_OK(db->Put(write_options, "foo2", "B"));
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("foo", "A2");
+  ASSERT_OK(s);
+
+  s = txn->Put("foo2", "B2");
+  ASSERT_OK(s);
+
+  // This Put outside of a transaction will conflict with the previous write
+  s = db->Put(write_options, "foo", "xxx");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "A");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "A2");
+  db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(value, "B2");
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, WriteConflictTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  ASSERT_OK(db->Put(write_options, "foo", "bar"));
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  // This Put outside of a transaction will conflict with a later write
+  s = db->Put(write_options, "foo", "barz");
+  ASSERT_OK(s);
+
+  s = txn->Put("foo2", "X");
+  ASSERT_OK(s);
+
+  s = txn->Put("foo",
+               "bar2");  // Conflicts with write done after snapshot taken
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn->Put("foo3", "Y");
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+
+  ASSERT_EQ(2, txn->GetNumKeys());
+
+  s = txn->Commit();
+  ASSERT_OK(s);  // Txn should commit, but only write foo2 and foo3
+
+  // Verify that transaction wrote foo2 and foo3 but not foo
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+
+  db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(value, "X");
+
+  db->Get(read_options, "foo3", &value);
+  ASSERT_EQ(value, "Y");
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, ReadConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  ASSERT_OK(db->Put(write_options, "foo", "bar"));
+  ASSERT_OK(db->Put(write_options, "foo2", "bar"));
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+  ASSERT_EQ(value, "bar");
+
+  // This Put outside of a transaction will conflict with the previous read
+  s = db->Put(write_options, "foo", "barz");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, TxnOnlyTest) {
+  // Test to make sure transactions work when there are no other writes in an
+  // empty db.
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("x", "y");
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, FlushTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+  ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Put(Slice("foo"), Slice("bar2"));
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+  ASSERT_EQ(value, "bar2");
+
+  // Put a random key so we have a memtable to flush
+  s = db->Put(write_options, "dummy", "dummy");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  FlushOptions flush_ops;
+  db->Flush(flush_ops);
+
+  s = txn->Commit();
+  // txn should commit since the flushed table is still in MemtableList History
+  ASSERT_OK(s);
+
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, FlushTest2) {
+  const size_t num_tests = 3;
+
+  for (size_t n = 0; n < num_tests; n++) {
+    // Test different table factories
+    switch (n) {
+      case 0:
+        break;
+      case 1:
+        options.table_factory.reset(new mock::MockTableFactory());
+        break;
+      case 2: {
+        PlainTableOptions pt_opts;
+        pt_opts.hash_table_ratio = 0;
+        options.table_factory.reset(NewPlainTableFactory(pt_opts));
+        break;
+      }
+    }
+
+    Status s = ReOpen();
+    ASSERT_OK(s);
+    assert(db != nullptr);
+
+    WriteOptions write_options;
+    ReadOptions read_options, snapshot_read_options;
+    TransactionOptions txn_options;
+    std::string value;
+
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+
+    ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+    ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar2")));
+    ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar3")));
+
+    txn_options.set_snapshot = true;
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn);
+
+    snapshot_read_options.snapshot = txn->GetSnapshot();
+
+    ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+    ASSERT_EQ(value, "bar");
+
+    s = txn->Put(Slice("foo"), Slice("bar2"));
+    ASSERT_OK(s);
+
+    ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value));
+    ASSERT_EQ(value, "bar2");
+    // verify foo is locked by txn
+    s = db->Delete(write_options, "foo");
+    ASSERT_TRUE(s.IsTimedOut());
+
+    s = db->Put(write_options, "Z", "z");
+    ASSERT_OK(s);
+    s = db->Put(write_options, "dummy", "dummy");
+    ASSERT_OK(s);
+
+    s = db->Put(write_options, "S", "s");
+    ASSERT_OK(s);
+    s = db->SingleDelete(write_options, "S");
+    ASSERT_OK(s);
+
+    s = txn->Delete("S");
+    // Should fail after encountering a write to S in memtable
+    ASSERT_TRUE(s.IsBusy());
+
+    // force a memtable flush
+    s = db_impl->TEST_FlushMemTable(true);
+    ASSERT_OK(s);
+
+    // Put a random key so we have a MemTable to flush
+    s = db->Put(write_options, "dummy", "dummy2");
+    ASSERT_OK(s);
+
+    // force a memtable flush
+    ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+    s = db->Put(write_options, "dummy", "dummy3");
+    ASSERT_OK(s);
+
+    // force a memtable flush
+    // Since our test db has max_write_buffer_number=2, this flush will cause
+    // the first memtable to get purged from the MemtableList history.
+    ASSERT_OK(db_impl->TEST_FlushMemTable(true));
+
+    s = txn->Put("X", "Y");
+    // Should succeed after verifying there is no write to X in SST file
+    ASSERT_OK(s);
+
+    s = txn->Put("Z", "zz");
+    // Should fail after encountering a write to Z in SST file
+    ASSERT_TRUE(s.IsBusy());
+
+    s = txn->GetForUpdate(read_options, "foo2", &value);
+    // should succeed since key was written before txn started
+    ASSERT_OK(s);
+    // verify foo2 is locked by txn
+    s = db->Delete(write_options, "foo2");
+    ASSERT_TRUE(s.IsTimedOut());
+
+    s = txn->Delete("S");
+    // Should fail after encountering a write to S in SST file
+    ASSERT_TRUE(s.IsBusy());
+
+    // Write a bunch of keys to db to force a compaction
+    Random rnd(47);
+    for (int i = 0; i < 1000; i++) {
+      s = db->Put(write_options, std::to_string(i),
+                  test::CompressibleString(&rnd, 0.8, 100, &value));
+      ASSERT_OK(s);
+    }
+
+    s = txn->Put("X", "yy");
+    // Should succeed after verifying there is no write to X in SST file
+    ASSERT_OK(s);
+
+    s = txn->Put("Z", "zzz");
+    // Should fail after encountering a write to Z in SST file
+    ASSERT_TRUE(s.IsBusy());
+
+    s = txn->Delete("S");
+    // Should fail after encountering a write to S in SST file
+    ASSERT_TRUE(s.IsBusy());
+
+    s = txn->GetForUpdate(read_options, "foo3", &value);
+    // should succeed since key was written before txn started
+    ASSERT_OK(s);
+    // verify foo3 is locked by txn
+    s = db->Delete(write_options, "foo3");
+    ASSERT_TRUE(s.IsTimedOut());
+
+    ASSERT_OK(db_impl->TEST_WaitForCompact());
+
+    s = txn->Commit();
+    ASSERT_OK(s);
+
+    // Transaction should only write the keys that succeeded.
+    s = db->Get(read_options, "foo", &value);
+    ASSERT_EQ(value, "bar2");
+
+    s = db->Get(read_options, "X", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("yy", value);
+
+    s = db->Get(read_options, "Z", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("z", value);
+
+    delete txn;
+  }
+}
+
+TEST_P(TransactionTest, NoSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  ASSERT_OK(db->Put(write_options, "AAA", "bar"));
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  // Modify key after transaction start
+  ASSERT_OK(db->Put(write_options, "AAA", "bar1"));
+
+  // Read and write without a snap
+  ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("AAA", "bar2");
+  ASSERT_OK(s);
+
+  // Should commit since read/write was done after data changed
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, MultipleSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  ASSERT_OK(db->Put(write_options, "AAA", "bar"));
+  ASSERT_OK(db->Put(write_options, "BBB", "bar"));
+  ASSERT_OK(db->Put(write_options, "CCC", "bar"));
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  ASSERT_OK(db->Put(write_options, "AAA", "bar1"));
+
+  // Read and write without a snapshot
+  ASSERT_OK(txn->GetForUpdate(read_options, "AAA", &value));
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("AAA", "bar2");
+  ASSERT_OK(s);
+
+  // Modify BBB before snapshot is taken
+  ASSERT_OK(db->Put(write_options, "BBB", "bar1"));
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "BBB", &value));
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("BBB", "bar2");
+  ASSERT_OK(s);
+
+  ASSERT_OK(db->Put(write_options, "CCC", "bar1"));
+
+  // Set a new snapshot
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "CCC", &value));
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("CCC", "bar2");
+  ASSERT_OK(s);
+
+  s = txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = txn->GetForUpdate(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = txn->GetForUpdate(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+  s = db->Get(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+  s = db->Get(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = db->Get(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = db->Get(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  // verify that we track multiple writes to the same key at different snapshots
+  delete txn;
+  txn = db->BeginTransaction(write_options);
+
+  // Potentially conflicting writes
+  ASSERT_OK(db->Put(write_options, "ZZZ", "zzz"));
+  ASSERT_OK(db->Put(write_options, "XXX", "xxx"));
+
+  txn->SetSnapshot();
+
+  TransactionOptions txn_options;
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  txn2->SetSnapshot();
+
+  // This should not conflict in txn since the snapshot is later than the
+  // previous write (spoiler alert:  it will later conflict with txn2).
+  s = txn->Put("ZZZ", "zzzz");
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+
+  // This will conflict since the snapshot is earlier than another write to ZZZ
+  s = txn2->Put("ZZZ", "xxxxx");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "ZZZ", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "zzzz");
+
+  delete txn2;
+}
+
+TEST_P(TransactionTest, ColumnFamiliesTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  ColumnFamilyHandle *cfa, *cfb;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+  ASSERT_OK(s);
+
+  delete cfa;
+  delete cfb;
+  delete db;
+  db = nullptr;
+
+  // open DB with three column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column families
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+
+  std::vector<ColumnFamilyHandle*> handles;
+
+  ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+  assert(db != nullptr);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // Write some data to the db
+  WriteBatch batch;
+  ASSERT_OK(batch.Put("foo", "foo"));
+  ASSERT_OK(batch.Put(handles[1], "AAA", "bar"));
+  ASSERT_OK(batch.Put(handles[1], "AAAZZZ", "bar"));
+  s = db->Write(write_options, &batch);
+  ASSERT_OK(s);
+  ASSERT_OK(db->Delete(write_options, handles[1], "AAAZZZ"));
+
+  // These keys do not conflict with existing writes since they're in
+  // different column families
+  s = txn->Delete("AAA");
+  ASSERT_OK(s);
+  s = txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  Slice key_slice("AAAZZZ");
+  Slice value_slices[2] = {Slice("bar"), Slice("bar")};
+  s = txn->Put(handles[2], SliceParts(&key_slice, 1),
+               SliceParts(value_slices, 2));
+  ASSERT_OK(s);
+  ASSERT_EQ(3, txn->GetNumKeys());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = db->Get(read_options, handles[2], "AAAZZZ", &value);
+  ASSERT_EQ(value, "barbar");
+
+  Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
+  Slice value_slice("barbarbar");
+
+  s = txn2->Delete(handles[2], "XXX");
+  ASSERT_OK(s);
+  s = txn2->Delete(handles[1], "XXX");
+  ASSERT_OK(s);
+
+  // This write will cause a conflict with the earlier batch write
+  s = txn2->Put(handles[1], SliceParts(key_slices, 3),
+                SliceParts(&value_slice, 1));
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  // In the above the latest change to AAAZZZ in handles[1] is delete.
+  s = db->Get(read_options, handles[1], "AAAZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  delete txn2;
+
+  txn = db->BeginTransaction(write_options, txn_options);
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
+                                                   handles[0], handles[2]};
+  std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
+  std::vector<std::string> values(4);
+  std::vector<Status> results = txn->MultiGetForUpdate(
+      snapshot_read_options, multiget_cfh, multiget_keys, &values);
+  ASSERT_OK(results[0]);
+  ASSERT_OK(results[1]);
+  ASSERT_OK(results[2]);
+  ASSERT_TRUE(results[3].IsNotFound());
+  ASSERT_EQ(values[0], "bar");
+  ASSERT_EQ(values[1], "barbar");
+  ASSERT_EQ(values[2], "foo");
+
+  s = txn->SingleDelete(handles[2], "ZZZ");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "ZZZ", "YYY");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "ZZZ", "YYYY");
+  ASSERT_OK(s);
+  s = txn->Delete(handles[2], "ZZZ");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "AAAZZZ", "barbarbar");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(5, txn->GetNumKeys());
+
+  // Txn should commit
+  s = txn->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, handles[2], "ZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Put a key which will conflict with the next txn using the previous snapshot
+  ASSERT_OK(db->Put(write_options, handles[2], "foo", "000"));
+
+  results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
+                                    multiget_keys, &values);
+  // All results should fail since there was a conflict
+  ASSERT_TRUE(results[0].IsBusy());
+  ASSERT_TRUE(results[1].IsBusy());
+  ASSERT_TRUE(results[2].IsBusy());
+  ASSERT_TRUE(results[3].IsBusy());
+
+  s = db->Get(read_options, handles[2], "foo", &value);
+  ASSERT_EQ(value, "000");
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->DropColumnFamily(handles[1]);
+  ASSERT_OK(s);
+  s = db->DropColumnFamily(handles[2]);
+  ASSERT_OK(s);
+
+  delete txn;
+  delete txn2;
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+TEST_P(TransactionTest, MultiGetBatchedTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  ColumnFamilyHandle* cf;
+  ColumnFamilyOptions cf_options;
+
+  // Create a new column families
+  s = db->CreateColumnFamily(cf_options, "CF", &cf);
+  ASSERT_OK(s);
+
+  delete cf;
+  delete db;
+  db = nullptr;
+
+  // open DB with three column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column families
+  cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  column_families.push_back(ColumnFamilyDescriptor("CF", cf_options));
+
+  std::vector<ColumnFamilyHandle*> handles;
+
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+  assert(db != nullptr);
+
+  // Write some data to the db
+  WriteBatch batch;
+  ASSERT_OK(batch.Put(handles[1], "aaa", "val1"));
+  ASSERT_OK(batch.Put(handles[1], "bbb", "val2"));
+  ASSERT_OK(batch.Put(handles[1], "ccc", "val3"));
+  ASSERT_OK(batch.Put(handles[1], "ddd", "foo"));
+  ASSERT_OK(batch.Put(handles[1], "eee", "val5"));
+  ASSERT_OK(batch.Put(handles[1], "fff", "val6"));
+  ASSERT_OK(batch.Merge(handles[1], "ggg", "foo"));
+  s = db->Write(write_options, &batch);
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn_options.set_snapshot = true;
+  // Write some data to the db
+  s = txn->Delete(handles[1], "bbb");
+  ASSERT_OK(s);
+  s = txn->Put(handles[1], "ccc", "val3_new");
+  ASSERT_OK(s);
+  s = txn->Merge(handles[1], "ddd", "bar");
+  ASSERT_OK(s);
+
+  std::vector<Slice> keys = {"aaa", "bbb", "ccc", "ddd", "eee", "fff", "ggg"};
+  std::vector<PinnableSlice> values(keys.size());
+  std::vector<Status> statuses(keys.size());
+
+  txn->MultiGet(snapshot_read_options, handles[1], keys.size(), keys.data(),
+                values.data(), statuses.data());
+  ASSERT_TRUE(statuses[0].ok());
+  ASSERT_EQ(values[0], "val1");
+  ASSERT_TRUE(statuses[1].IsNotFound());
+  ASSERT_TRUE(statuses[2].ok());
+  ASSERT_EQ(values[2], "val3_new");
+  ASSERT_TRUE(statuses[3].ok());
+  ASSERT_EQ(values[3], "foo,bar");
+  ASSERT_TRUE(statuses[4].ok());
+  ASSERT_EQ(values[4], "val5");
+  ASSERT_TRUE(statuses[5].ok());
+  ASSERT_EQ(values[5], "val6");
+  ASSERT_TRUE(statuses[6].ok());
+  ASSERT_EQ(values[6], "foo");
+  delete txn;
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+// This test calls WriteBatchWithIndex::MultiGetFromBatchAndDB with a large
+// number of keys, i.e greater than MultiGetContext::MAX_BATCH_SIZE, which is
+// is 32. This forces autovector allocations in the MultiGet code paths
+// to use std::vector in addition to stack allocations. The MultiGet keys
+// includes Merges, which are handled specially in MultiGetFromBatchAndDB by
+// allocating an autovector of MergeContexts
+TEST_P(TransactionTest, MultiGetLargeBatchedTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  ColumnFamilyHandle* cf;
+  ColumnFamilyOptions cf_options;
+
+  std::vector<std::string> key_str;
+  for (int i = 0; i < 100; ++i) {
+    key_str.emplace_back(std::to_string(i));
+  }
+  // Create a new column families
+  s = db->CreateColumnFamily(cf_options, "CF", &cf);
+  ASSERT_OK(s);
+
+  delete cf;
+  delete db;
+  db = nullptr;
+
+  // open DB with three column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column families
+  cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  column_families.push_back(ColumnFamilyDescriptor("CF", cf_options));
+
+  std::vector<ColumnFamilyHandle*> handles;
+
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+  assert(db != nullptr);
+
+  // Write some data to the db
+  WriteBatch batch;
+  for (int i = 0; i < 3 * MultiGetContext::MAX_BATCH_SIZE; ++i) {
+    std::string val = "val" + std::to_string(i);
+    ASSERT_OK(batch.Put(handles[1], key_str[i], val));
+  }
+  s = db->Write(write_options, &batch);
+  ASSERT_OK(s);
+
+  WriteBatchWithIndex wb;
+  // Write some data to the db
+  s = wb.Delete(handles[1], std::to_string(1));
+  ASSERT_OK(s);
+  s = wb.Put(handles[1], std::to_string(2), "new_val" + std::to_string(2));
+  ASSERT_OK(s);
+  // Write a lot of merges so when we call MultiGetFromBatchAndDB later on,
+  // it is forced to use std::vector in ROCKSDB_NAMESPACE::autovector to
+  // allocate MergeContexts. The number of merges needs to be >
+  // MultiGetContext::MAX_BATCH_SIZE
+  for (int i = 8; i < MultiGetContext::MAX_BATCH_SIZE + 24; ++i) {
+    s = wb.Merge(handles[1], std::to_string(i), "merge");
+    ASSERT_OK(s);
+  }
+
+  // MultiGet a lot of keys in order to force std::vector reallocations
+  std::vector<Slice> keys;
+  for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE + 32; ++i) {
+    keys.emplace_back(key_str[i]);
+  }
+  std::vector<PinnableSlice> values(keys.size());
+  std::vector<Status> statuses(keys.size());
+
+  wb.MultiGetFromBatchAndDB(db, snapshot_read_options, handles[1], keys.size(),
+                            keys.data(), values.data(), statuses.data(), false);
+  for (size_t i = 0; i < keys.size(); ++i) {
+    if (i == 1) {
+      ASSERT_TRUE(statuses[1].IsNotFound());
+    } else if (i == 2) {
+      ASSERT_TRUE(statuses[2].ok());
+      ASSERT_EQ(values[2], "new_val" + std::to_string(2));
+    } else if (i >= 8 && i < 56) {
+      ASSERT_TRUE(statuses[i].ok());
+      ASSERT_EQ(values[i], "val" + std::to_string(i) + ",merge");
+    } else {
+      ASSERT_TRUE(statuses[i].ok());
+      if (values[i] != "val" + std::to_string(i)) {
+        ASSERT_EQ(values[i], "val" + std::to_string(i));
+      }
+    }
+  }
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+TEST_P(TransactionTest, MultiGetSnapshot) {
+  WriteOptions write_options;
+  TransactionOptions transaction_options;
+  Transaction* txn1 = db->BeginTransaction(write_options, transaction_options);
+
+  Slice key = "foo";
+
+  Status s = txn1->Put(key, "bar");
+  ASSERT_OK(s);
+
+  s = txn1->SetName("test");
+  ASSERT_OK(s);
+
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+
+  // Get snapshot between prepare and commit
+  // Un-committed data should be invisible to other transactions
+  const Snapshot* s1 = db->GetSnapshot();
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  Transaction* txn2 = db->BeginTransaction(write_options, transaction_options);
+  ReadOptions read_options;
+  read_options.snapshot = s1;
+
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values(1);
+  std::vector<Status> statuses(1);
+  keys.push_back(key);
+  auto cfd = db->DefaultColumnFamily();
+  txn2->MultiGet(read_options, cfd, 1, keys.data(), values.data(),
+                 statuses.data());
+  ASSERT_TRUE(statuses[0].IsNotFound());
+  delete txn2;
+
+  db->ReleaseSnapshot(s1);
+}
+
+TEST_P(TransactionTest, ColumnFamiliesTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  ColumnFamilyHandle *one, *two;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  s = db->CreateColumnFamily(cf_options, "ONE", &one);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "TWO", &two);
+  ASSERT_OK(s);
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn1->Put(one, "X", "1");
+  ASSERT_OK(s);
+  s = txn1->Put(two, "X", "2");
+  ASSERT_OK(s);
+  s = txn1->Put("X", "0");
+  ASSERT_OK(s);
+
+  s = txn2->Put(one, "X", "11");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  // Drop first column family
+  s = db->DropColumnFamily(one);
+  ASSERT_OK(s);
+
+  // Should fail since column family was dropped.
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  txn1 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+
+  // Should fail since column family was dropped
+  s = txn1->Put(one, "X", "111");
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  s = txn1->Put(two, "X", "222");
+  ASSERT_OK(s);
+
+  s = txn1->Put("X", "000");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, two, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("222", value);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("000", value);
+
+  s = db->DropColumnFamily(two);
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  delete one;
+  delete two;
+}
+
+TEST_P(TransactionTest, EmptyTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  ASSERT_OK(txn->Rollback());
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_EQ(value, "aaa");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_EQ(value, "aaa");
+
+  // Conflicts with previous GetForUpdate
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  // transaction expired!
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+}
+
+TEST_P(TransactionTest, PredicateManyPreceders) {
+  WriteOptions write_options;
+  ReadOptions read_options1, read_options2;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  txn_options.set_snapshot = true;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  txn2->SetSnapshot();
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  std::vector<Slice> multiget_keys = {"1", "2", "3"};
+  std::vector<std::string> multiget_values;
+
+  std::vector<Status> results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_EQ(results.size(), 3);
+  ASSERT_TRUE(results[0].IsNotFound());
+  ASSERT_TRUE(results[1].IsNotFound());
+  ASSERT_TRUE(results[2].IsNotFound());
+
+  s = txn2->Put("2", "x");  // Conflict's with txn1's MultiGetForUpdate
+  ASSERT_TRUE(s.IsTimedOut());
+
+  ASSERT_OK(txn2->Rollback());
+
+  multiget_values.clear();
+  results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_EQ(results.size(), 3);
+  ASSERT_TRUE(results[0].IsNotFound());
+  ASSERT_TRUE(results[1].IsNotFound());
+  ASSERT_TRUE(results[2].IsNotFound());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("4", "x");
+  ASSERT_OK(s);
+
+  s = txn2->Delete("4");  // conflict
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options2, "4", &value);
+  ASSERT_TRUE(s.IsBusy());
+
+  ASSERT_OK(txn2->Rollback());
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, LostUpdate) {
+  WriteOptions write_options;
+  ReadOptions read_options, read_options1, read_options2;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  // Test 2 transactions writing to the same key in multiple orders and
+  // with/without snapshots
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  s = txn1->Put("1", "1");
+  ASSERT_OK(s);
+
+  s = txn2->Put("1", "2");  // conflict
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("1", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn_options.set_snapshot = true;
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("1", "3");
+  ASSERT_OK(s);
+  s = txn2->Put("1", "4");  // conflict
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("3", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("1", "5");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Put("1", "6");
+  ASSERT_TRUE(s.IsBusy());
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("5", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("1", "7");
+  ASSERT_OK(s);
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  txn2->SetSnapshot();
+  s = txn2->Put("1", "8");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("8", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options);
+  txn2 = db->BeginTransaction(write_options);
+
+  s = txn1->Put("1", "9");
+  ASSERT_OK(s);
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Put("1", "10");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "10");
+}
+
+TEST_P(TransactionTest, UntrackedWrites) {
+  if (txn_db_options.write_policy == WRITE_UNPREPARED) {
+    // TODO(lth): For WriteUnprepared, validate that untracked writes are
+    // not supported.
+    return;
+  }
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  // Verify transaction rollback works for untracked keys.
+  Transaction* txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = txn->PutUntracked("untracked", "0");
+  ASSERT_OK(s);
+  ASSERT_OK(txn->Rollback());
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = db->Put(write_options, "untracked", "x");
+  ASSERT_OK(s);
+
+  // Untracked writes should succeed even though key was written after snapshot
+  s = txn->PutUntracked("untracked", "1");
+  ASSERT_OK(s);
+  s = txn->MergeUntracked("untracked", "2");
+  ASSERT_OK(s);
+  s = txn->DeleteUntracked("untracked");
+  ASSERT_OK(s);
+
+  // Conflict
+  s = txn->Put("untracked", "3");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, ExpiredTransaction) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  // Set txn expiration timeout to 0 microseconds (expires instantly)
+  txn_options.expiration = 0;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  s = txn1->Put("X", "1");
+  ASSERT_OK(s);
+
+  s = txn1->Put("Y", "1");
+  ASSERT_OK(s);
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  // txn2 should be able to write to X since txn1 has expired
+  s = txn2->Put("X", "2");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("2", value);
+
+  s = txn1->Put("Z", "1");
+  ASSERT_OK(s);
+
+  // txn1 should fail to commit since it is expired
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsExpired());
+
+  s = db->Get(read_options, "Y", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, ReinitializeTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  // Set txn expiration timeout to 0 microseconds (expires instantly)
+  txn_options.expiration = 0;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  // Reinitialize transaction to no long expire
+  txn_options.expiration = -1;
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+  s = txn1->Put("Z", "z");
+  ASSERT_OK(s);
+
+  // Should commit since not expired
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+  s = txn1->Put("Z", "zz");
+  ASSERT_OK(s);
+
+  // Reinitilize txn1 and verify that Z gets unlocked
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options, nullptr);
+  s = txn2->Put("Z", "zzz");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "zzz");
+
+  // Verify snapshots get reinitialized correctly
+  txn1->SetSnapshot();
+  s = txn1->Put("Z", "zzzz");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "zzzz");
+
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+  const Snapshot* snapshot = txn1->GetSnapshot();
+  ASSERT_FALSE(snapshot);
+
+  txn_options.set_snapshot = true;
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+  snapshot = txn1->GetSnapshot();
+  ASSERT_TRUE(snapshot);
+
+  s = txn1->Put("Z", "a");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->Rollback());
+
+  s = txn1->Put("Y", "y");
+  ASSERT_OK(s);
+
+  txn_options.set_snapshot = false;
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+  snapshot = txn1->GetSnapshot();
+  ASSERT_FALSE(snapshot);
+
+  s = txn1->Put("X", "x");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "zzzz");
+
+  s = db->Get(read_options, "Y", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+  s = txn1->SetName("name");
+  ASSERT_OK(s);
+
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  txn1 = db->BeginTransaction(write_options, txn_options, txn1);
+
+  s = txn1->SetName("name");
+  ASSERT_OK(s);
+
+  delete txn1;
+}
+
+TEST_P(TransactionTest, Rollback) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  ASSERT_OK(s);
+
+  s = txn1->Put("X", "1");
+  ASSERT_OK(s);
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  // txn2 should not be able to write to X since txn1 has it locked
+  s = txn2->Put("X", "2");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  ASSERT_OK(txn1->Rollback());
+  delete txn1;
+
+  // txn2 should now be able to write to X
+  s = txn2->Put("X", "3");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("3", value);
+
+  delete txn2;
+}
+
+TEST_P(TransactionTest, LockLimitTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  delete db;
+  db = nullptr;
+
+  // Open DB with a lock limit of 3
+  txn_db_options.max_num_locks = 3;
+  ASSERT_OK(ReOpen());
+  assert(db != nullptr);
+  ASSERT_OK(s);
+
+  // Create a txn and verify we can only lock up to 3 keys
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("X", "x");
+  ASSERT_OK(s);
+
+  s = txn->Put("Y", "y");
+  ASSERT_OK(s);
+
+  s = txn->Put("Z", "z");
+  ASSERT_OK(s);
+
+  // lock limit reached
+  s = txn->Put("W", "w");
+  ASSERT_TRUE(s.IsBusy());
+
+  // re-locking same key shouldn't put us over the limit
+  s = txn->Put("X", "xx");
+  ASSERT_OK(s);
+
+  s = txn->GetForUpdate(read_options, "W", &value);
+  ASSERT_TRUE(s.IsBusy());
+  s = txn->GetForUpdate(read_options, "V", &value);
+  ASSERT_TRUE(s.IsBusy());
+
+  // re-locking same key shouldn't put us over the limit
+  s = txn->GetForUpdate(read_options, "Y", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("y", value);
+
+  s = txn->Get(read_options, "W", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // "X" currently locked
+  s = txn2->Put("X", "x");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  // lock limit reached
+  s = txn2->Put("M", "m");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("xx", value);
+
+  s = db->Get(read_options, "W", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Committing txn should release its locks and allow txn2 to proceed
+  s = txn2->Put("X", "x2");
+  ASSERT_OK(s);
+
+  s = txn2->Delete("X");
+  ASSERT_OK(s);
+
+  s = txn2->Put("M", "m");
+  ASSERT_OK(s);
+
+  s = txn2->Put("Z", "z2");
+  ASSERT_OK(s);
+
+  // lock limit reached
+  s = txn2->Delete("Y");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("z2", value);
+
+  s = db->Get(read_options, "Y", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("y", value);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, IteratorTest) {
+  // This test does writes without snapshot validation, and then tries to create
+  // iterator later, which is unsupported in write unprepared.
+  if (txn_db_options.write_policy == WRITE_UNPREPARED) {
+    return;
+  }
+
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  // Write some keys to the db
+  s = db->Put(write_options, "A", "a");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "G", "g");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "F", "f");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "C", "c");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "D", "d");
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  // Write some keys in a txn
+  s = txn->Put("B", "b");
+  ASSERT_OK(s);
+
+  s = txn->Put("H", "h");
+  ASSERT_OK(s);
+
+  s = txn->Delete("D");
+  ASSERT_OK(s);
+
+  s = txn->Put("E", "e");
+  ASSERT_OK(s);
+
+  txn->SetSnapshot();
+  const Snapshot* snapshot = txn->GetSnapshot();
+
+  // Write some keys to the db after the snapshot
+  s = db->Put(write_options, "BB", "xx");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "C", "xx");
+  ASSERT_OK(s);
+
+  read_options.snapshot = snapshot;
+  Iterator* iter = txn->GetIterator(read_options);
+  ASSERT_OK(iter->status());
+  iter->SeekToFirst();
+
+  // Read all keys via iter and lock them all
+  std::string results[] = {"a", "b", "c", "e", "f", "g", "h"};
+  for (int i = 0; i < 7; i++) {
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(results[i], iter->value().ToString());
+
+    s = txn->GetForUpdate(read_options, iter->key(), nullptr);
+    if (i == 2) {
+      // "C" was modified after txn's snapshot
+      ASSERT_TRUE(s.IsBusy());
+    } else {
+      ASSERT_OK(s);
+    }
+
+    iter->Next();
+  }
+  ASSERT_FALSE(iter->Valid());
+
+  iter->Seek("G");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("g", iter->value().ToString());
+
+  iter->Prev();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("f", iter->value().ToString());
+
+  iter->Seek("D");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("e", iter->value().ToString());
+
+  iter->Seek("C");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("c", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("e", iter->value().ToString());
+
+  iter->Seek("");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("a", iter->value().ToString());
+
+  iter->Seek("X");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+
+  iter->SeekToLast();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("h", iter->value().ToString());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete iter;
+  delete txn;
+}
+
+TEST_P(TransactionTest, DisableIndexingTest) {
+  // Skip this test for write unprepared. It does not solely rely on WBWI for
+  // read your own writes, so depending on whether batches are flushed or not,
+  // only some writes will be visible.
+  //
+  // Also, write unprepared does not support creating iterators if there has
+  // been txn->Put() without snapshot validation.
+  if (txn_db_options.write_policy == WRITE_UNPREPARED) {
+    return;
+  }
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  txn->DisableIndexing();
+
+  s = txn->Put("B", "b");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  Iterator* iter = txn->GetIterator(read_options);
+  ASSERT_OK(iter->status());
+
+  iter->Seek("B");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+
+  s = txn->Delete("A");
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  txn->EnableIndexing();
+
+  s = txn->Put("B", "bb");
+  ASSERT_OK(s);
+
+  iter->Seek("B");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bb", iter->value().ToString());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("bb", value);
+
+  s = txn->Put("A", "aa");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("aa", value);
+
+  delete iter;
+  delete txn;
+}
+
+TEST_P(TransactionTest, SavepointTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  ASSERT_EQ(0, txn->GetNumPuts());
+
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn->SetSavePoint();  // 1
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to beginning of txn
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("B", "b");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(1, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  delete txn;
+  txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("B", "bb");
+  ASSERT_OK(s);
+
+  s = txn->Put("C", "c");
+  ASSERT_OK(s);
+
+  txn->SetSavePoint();  // 2
+
+  s = txn->Delete("B");
+  ASSERT_OK(s);
+
+  s = txn->Put("C", "cc");
+  ASSERT_OK(s);
+
+  s = txn->Put("D", "d");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(5, txn->GetNumPuts());
+  ASSERT_EQ(1, txn->GetNumDeletes());
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 2
+
+  ASSERT_EQ(3, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("bb", value);
+
+  s = txn->Get(read_options, "C", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c", value);
+
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("E", "e");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(5, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  // Rollback to beginning of txn
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_OK(txn->Rollback());
+
+  ASSERT_EQ(0, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("A", "aa");
+  ASSERT_OK(s);
+
+  s = txn->Put("F", "f");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(2, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  txn->SetSavePoint();  // 3
+  txn->SetSavePoint();  // 4
+
+  s = txn->Put("G", "g");
+  ASSERT_OK(s);
+
+  s = txn->SingleDelete("F");
+  ASSERT_OK(s);
+
+  s = txn->Delete("B");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("aa", value);
+
+  s = txn->Get(read_options, "F", &value);
+  // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefinied behavior.  So it is unclear what the
+  // result of fetching "F" should be. The current implementation will
+  // return NotFound in this case.
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_EQ(3, txn->GetNumPuts());
+  ASSERT_EQ(2, txn->GetNumDeletes());
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 3
+
+  ASSERT_EQ(2, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  s = txn->Get(read_options, "F", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("f", value);
+
+  s = txn->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "F", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("f", value);
+
+  s = db->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("aa", value);
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = db->Get(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, SavepointTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  s = txn1->Put("A", "");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 1
+
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn1->Put("C", "c");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 2
+
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+  s = txn1->Put("B", "b");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 2
+
+  // Verify that "A" and "C" is still locked while "B" is not
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "a2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b2");
+  ASSERT_OK(s);
+
+  s = txn1->Put("A", "aa");
+  ASSERT_OK(s);
+  s = txn1->Put("B", "bb");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn1->Put("A", "aaa");
+  ASSERT_OK(s);
+  s = txn1->Put("B", "bbb");
+  ASSERT_OK(s);
+  s = txn1->Put("C", "ccc");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();                    // 3
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 3
+
+  // Verify that "A", "B", "C" are still locked
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "a2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c2");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 1
+
+  // Verify that only "A" is locked
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_OK(s);
+  s = txn2->Put("C", "c3po");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  // Verify "A" "C" "B" are no longer locked
+  s = txn2->Put("A", "a4");
+  ASSERT_OK(s);
+  s = txn2->Put("B", "b4");
+  ASSERT_OK(s);
+  s = txn2->Put("C", "c4");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+}
+
+TEST_P(TransactionTest, SavepointTest3) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  s = txn1->PopSavePoint();  // No SavePoint present
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Put("A", "");
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();  // Still no SavePoint present
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn1->SetSavePoint();  // 1
+
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();  // Remove 1
+  ASSERT_TRUE(txn1->RollbackToSavePoint().IsNotFound());
+
+  // Verify that "A" is still locked
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "a2");
+  ASSERT_TRUE(s.IsTimedOut());
+  delete txn2;
+
+  txn1->SetSavePoint();  // 2
+
+  s = txn1->Put("B", "b");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 3
+
+  s = txn1->Put("B", "b2");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Roll back to 2
+
+  s = txn1->PopSavePoint();
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  std::string value;
+
+  // tnx1 should have modified "A" to "a"
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  // tnx1 should have set "B" to just "b"
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = db->Get(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(TransactionTest, SavepointTest4) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  txn1->SetSavePoint();  // 1
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 2
+  s = txn1->Put("B", "b");
+  ASSERT_OK(s);
+
+  s = txn1->PopSavePoint();  // Remove 2
+  ASSERT_OK(s);
+
+  // Verify that A/B still exists.
+  std::string value;
+  ASSERT_OK(txn1->Get(read_options, "A", &value));
+  ASSERT_EQ("a", value);
+
+  ASSERT_OK(txn1->Get(read_options, "B", &value));
+  ASSERT_EQ("b", value);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 1
+
+  // Verify that everything was rolled back.
+  s = txn1->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Nothing should be locked
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "");
+  ASSERT_OK(s);
+
+  s = txn2->Put("B", "");
+  ASSERT_OK(s);
+
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(TransactionTest, UndoGetForUpdateTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  txn1->UndoGetForUpdate("A");
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+
+  txn1->UndoGetForUpdate("A");
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Verify that A is locked
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  s = txn2->Put("A", "a");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  txn1->UndoGetForUpdate("A");
+
+  // Verify that A is now unlocked
+  s = txn2->Put("A", "a2");
+  ASSERT_OK(s);
+  ASSERT_OK(txn2->Commit());
+  delete txn2;
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a2", value);
+
+  s = txn1->Delete("A");
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Put("B", "b3");
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "B", &value);
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+
+  // Verify that A and B are still locked
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  s = txn2->Put("A", "a4");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b4");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  ASSERT_OK(txn1->Rollback());
+  delete txn1;
+
+  // Verify that A and B are no longer locked
+  s = txn2->Put("A", "a5");
+  ASSERT_OK(s);
+  s = txn2->Put("B", "b5");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  delete txn2;
+  ASSERT_OK(s);
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn1->GetForUpdate(read_options, "B", &value);
+  ASSERT_OK(s);
+  s = txn1->Put("B", "b5");
+  s = txn1->GetForUpdate(read_options, "B", &value);
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("X");
+
+  // Verify A,B,C are locked
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  s = txn2->Put("A", "a6");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Delete("B");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c6");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("X", "x6");
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("X");
+
+  // Verify A,B are locked and C is not
+  s = txn2->Put("A", "a6");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Delete("B");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c6");
+  ASSERT_OK(s);
+  s = txn2->Put("X", "x6");
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("X");
+
+  // Verify B is locked and A and C are not
+  s = txn2->Put("A", "a7");
+  ASSERT_OK(s);
+  s = txn2->Delete("B");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c7");
+  ASSERT_OK(s);
+  s = txn2->Put("X", "x7");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+}
+
+TEST_P(TransactionTest, UndoGetForUpdateTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+  Status s;
+
+  s = db->Put(write_options, "A", "");
+  ASSERT_OK(s);
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Put("F", "f");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 1
+
+  txn1->UndoGetForUpdate("A");
+
+  s = txn1->GetForUpdate(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn1->GetForUpdate(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->Put("E", "e");
+  ASSERT_OK(s);
+  s = txn1->GetForUpdate(read_options, "E", &value);
+  ASSERT_OK(s);
+
+  s = txn1->GetForUpdate(read_options, "F", &value);
+  ASSERT_OK(s);
+
+  // Verify A,B,C,D,E,F are still locked
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  s = txn2->Put("A", "a1");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b1");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c1");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("D", "d1");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("E", "e1");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("F", "f1");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("E");
+
+  // Verify A,B,D,E,F are still locked and C is not.
+  s = txn2->Put("A", "a2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("D", "d2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("E", "e2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("F", "f2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c2");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 2
+
+  s = txn1->Put("H", "h");
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("D");
+  txn1->UndoGetForUpdate("E");
+  txn1->UndoGetForUpdate("F");
+  txn1->UndoGetForUpdate("G");
+  txn1->UndoGetForUpdate("H");
+
+  // Verify A,B,D,E,F,H are still locked and C,G are not.
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("D", "d3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("E", "e3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("F", "f3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("H", "h3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c3");
+  ASSERT_OK(s);
+  s = txn2->Put("G", "g3");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // rollback to 2
+
+  // Verify A,B,D,E,F are still locked and C,G,H are not.
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("D", "d3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("E", "e3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("F", "f3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c3");
+  ASSERT_OK(s);
+  s = txn2->Put("G", "g3");
+  ASSERT_OK(s);
+  s = txn2->Put("H", "h3");
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("D");
+  txn1->UndoGetForUpdate("E");
+  txn1->UndoGetForUpdate("F");
+  txn1->UndoGetForUpdate("G");
+  txn1->UndoGetForUpdate("H");
+
+  // Verify A,B,E,F are still locked and C,D,G,H are not.
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("E", "e3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("F", "f3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c3");
+  ASSERT_OK(s);
+  s = txn2->Put("D", "d3");
+  ASSERT_OK(s);
+  s = txn2->Put("G", "g3");
+  ASSERT_OK(s);
+  s = txn2->Put("H", "h3");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // rollback to 1
+
+  // Verify A,B,F are still locked and C,D,E,G,H are not.
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("F", "f3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c3");
+  ASSERT_OK(s);
+  s = txn2->Put("D", "d3");
+  ASSERT_OK(s);
+  s = txn2->Put("E", "e3");
+  ASSERT_OK(s);
+  s = txn2->Put("G", "g3");
+  ASSERT_OK(s);
+  s = txn2->Put("H", "h3");
+  ASSERT_OK(s);
+
+  txn1->UndoGetForUpdate("A");
+  txn1->UndoGetForUpdate("B");
+  txn1->UndoGetForUpdate("C");
+  txn1->UndoGetForUpdate("D");
+  txn1->UndoGetForUpdate("E");
+  txn1->UndoGetForUpdate("F");
+  txn1->UndoGetForUpdate("G");
+  txn1->UndoGetForUpdate("H");
+
+  // Verify F is still locked and A,B,C,D,E,G,H are not.
+  s = txn2->Put("F", "f3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("A", "a3");
+  ASSERT_OK(s);
+  s = txn2->Put("B", "b3");
+  ASSERT_OK(s);
+  s = txn2->Put("C", "c3");
+  ASSERT_OK(s);
+  s = txn2->Put("D", "d3");
+  ASSERT_OK(s);
+  s = txn2->Put("E", "e3");
+  ASSERT_OK(s);
+  s = txn2->Put("G", "g3");
+  ASSERT_OK(s);
+  s = txn2->Put("H", "h3");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, TimeoutTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  delete db;
+  db = nullptr;
+
+  // transaction writes have an infinite timeout,
+  // but we will override this when we start a txn
+  // db writes have infinite timeout
+  txn_db_options.transaction_lock_timeout = -1;
+  txn_db_options.default_lock_timeout = -1;
+
+  s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  assert(db != nullptr);
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  TransactionOptions txn_options0;
+  txn_options0.expiration = 100;   // 100ms
+  txn_options0.lock_timeout = 50;  // txn timeout no longer infinite
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options0);
+
+  s = txn1->GetForUpdate(read_options, "aaa", nullptr);
+  ASSERT_OK(s);
+
+  // Conflicts with previous GetForUpdate.
+  // Since db writes do not have a timeout, this should eventually succeed when
+  // the transaction expires.
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_OK(s);
+
+  ASSERT_GE(txn1->GetElapsedTime(),
+            static_cast<uint64_t>(txn_options0.expiration));
+
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsExpired());  // expired!
+
+  s = db->Get(read_options, "aaa", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("xxx", value);
+
+  delete txn1;
+  delete db;
+
+  // transaction writes have 10ms timeout,
+  // db writes have infinite timeout
+  txn_db_options.transaction_lock_timeout = 50;
+  txn_db_options.default_lock_timeout = -1;
+
+  s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  TransactionOptions txn_options;
+  txn_options.expiration = 100;  // 100ms
+  txn1 = db->BeginTransaction(write_options, txn_options);
+
+  s = txn1->GetForUpdate(read_options, "aaa", nullptr);
+  ASSERT_OK(s);
+
+  // Conflicts with previous GetForUpdate.
+  // Since db writes do not have a timeout, this should eventually succeed when
+  // the transaction expires.
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_NOK(s);  // expired!
+
+  s = db->Get(read_options, "aaa", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("xxx", value);
+
+  delete txn1;
+  txn_options.expiration = 6000000;  // 100 minutes
+  txn_options.lock_timeout = 1;      // 1ms
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  txn1->SetLockTimeout(100);
+
+  TransactionOptions txn_options2;
+  txn_options2.expiration = 10;  // 10ms
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options2);
+  ASSERT_OK(s);
+
+  s = txn2->Put("a", "2");
+  ASSERT_OK(s);
+
+  // txn1 has a lock timeout longer than txn2's expiration, so it will win
+  s = txn1->Delete("a");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  // txn2 should be expired out since txn1 waiting until its timeout expired.
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsExpired());
+
+  delete txn1;
+  delete txn2;
+  txn_options.expiration = 6000000;  // 100 minutes
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  txn_options2.expiration = 100000000;
+  txn2 = db->BeginTransaction(write_options, txn_options2);
+
+  s = txn1->Delete("asdf");
+  ASSERT_OK(s);
+
+  // txn2 has a smaller lock timeout than txn1's expiration, so it will time out
+  s = txn2->Delete("asdf");
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Put("asdf", "asdf");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "asdf", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("asdf", value);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(TransactionTest, SingleDeleteTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  txn = db->BeginTransaction(write_options);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn = db->BeginTransaction(write_options);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  txn2->SetSnapshot();
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("A", "a2");
+  ASSERT_OK(s);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->SingleDelete("B");
+  ASSERT_OK(s);
+
+  // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefinied behavior.  So it is unclear what the
+  // result of fetching "A" should be. The current implementation will
+  // return NotFound in this case.
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn2->Put("B", "b");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefinied behavior.  So it is unclear what the
+  // result of fetching "A" should be. The current implementation will
+  // return NotFound in this case.
+  s = db->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(TransactionTest, MergeTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
+  ASSERT_TRUE(txn);
+
+  s = db->Put(write_options, "A", "a0");
+  ASSERT_OK(s);
+
+  s = txn->Merge("A", "1");
+  ASSERT_OK(s);
+
+  s = txn->Merge("A", "2");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a0,1,2", value);
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  s = txn->Merge("A", "3");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a,3", value);
+
+  TransactionOptions txn_options;
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // verify that txn has "A" locked
+  s = txn2->Merge("A", "4");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a,3", value);
+}
+
+TEST_P(TransactionTest, DeleteRangeSupportTest) {
+  // The `DeleteRange()` API is banned everywhere.
+  ASSERT_TRUE(
+      db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "a", "b")
+          .IsNotSupported());
+
+  // But range deletions can be added via the `Write()` API by specifying the
+  // proper flags to promise there are no conflicts according to the DB type
+  // (see `TransactionDB::DeleteRange()` API doc for details).
+  for (bool skip_concurrency_control : {false, true}) {
+    for (bool skip_duplicate_key_check : {false, true}) {
+      ASSERT_OK(db->Put(WriteOptions(), "a", "val"));
+      WriteBatch wb;
+      ASSERT_OK(wb.DeleteRange("a", "b"));
+      TransactionDBWriteOptimizations flags;
+      flags.skip_concurrency_control = skip_concurrency_control;
+      flags.skip_duplicate_key_check = skip_duplicate_key_check;
+      Status s = db->Write(WriteOptions(), flags, &wb);
+      std::string value;
+      switch (txn_db_options.write_policy) {
+        case WRITE_COMMITTED:
+          if (skip_concurrency_control) {
+            ASSERT_OK(s);
+            ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound());
+          } else {
+            ASSERT_NOK(s);
+            ASSERT_OK(db->Get(ReadOptions(), "a", &value));
+          }
+          break;
+        case WRITE_PREPARED:
+          // Intentional fall-through
+        case WRITE_UNPREPARED:
+          if (skip_concurrency_control && skip_duplicate_key_check) {
+            ASSERT_OK(s);
+            ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound());
+          } else {
+            ASSERT_NOK(s);
+            ASSERT_OK(db->Get(ReadOptions(), "a", &value));
+          }
+          break;
+      }
+      // Without any promises from the user, range deletion via other `Write()`
+      // APIs are still banned.
+      ASSERT_OK(db->Put(WriteOptions(), "a", "val"));
+      ASSERT_NOK(db->Write(WriteOptions(), &wb));
+      ASSERT_OK(db->Get(ReadOptions(), "a", &value));
+    }
+  }
+}
+
+TEST_P(TransactionTest, DeferSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+  Status s;
+
+  s = db->Put(write_options, "A", "a0");
+  ASSERT_OK(s);
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  txn1->SetSnapshotOnNextOperation();
+  auto snapshot = txn1->GetSnapshot();
+  ASSERT_FALSE(snapshot);
+
+  s = txn2->Put("A", "a2");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn1->GetForUpdate(read_options, "A", &value);
+  // Should not conflict with txn2 since snapshot wasn't set until
+  // GetForUpdate was called.
+  ASSERT_OK(s);
+  ASSERT_EQ("a2", value);
+
+  s = txn1->Put("A", "a1");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "B", "b0");
+  ASSERT_OK(s);
+
+  // Cannot lock B since it was written after the snapshot was set
+  s = txn1->Put("B", "b1");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a1", value);
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b0", value);
+}
+
+TEST_P(TransactionTest, DeferSnapshotTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+
+  txn1->SetSnapshot();
+
+  s = txn1->Put("A", "a1");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "C", "c0");
+  ASSERT_OK(s);
+  s = db->Put(write_options, "D", "d0");
+  ASSERT_OK(s);
+
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+
+  txn1->SetSnapshotOnNextOperation();
+
+  s = txn1->Get(snapshot_read_options, "C", &value);
+  // Snapshot was set before C was written
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn1->Get(snapshot_read_options, "D", &value);
+  // Snapshot was set before D was written
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Snapshot should not have changed yet.
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+
+  s = txn1->Get(snapshot_read_options, "C", &value);
+  // Snapshot was set before C was written
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn1->Get(snapshot_read_options, "D", &value);
+  // Snapshot was set before D was written
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn1->GetForUpdate(read_options, "C", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c0", value);
+
+  s = db->Put(write_options, "D", "d00");
+  ASSERT_OK(s);
+
+  // Snapshot is now set
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+  s = txn1->Get(snapshot_read_options, "D", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("d0", value);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+}
+
+TEST_P(TransactionTest, DeferSnapshotSavePointTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+
+  txn1->SetSavePoint();  // 1
+
+  s = db->Put(write_options, "T", "1");
+  ASSERT_OK(s);
+
+  txn1->SetSnapshotOnNextOperation();
+
+  s = db->Put(write_options, "T", "2");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 2
+
+  s = db->Put(write_options, "T", "3");
+  ASSERT_OK(s);
+
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 3
+
+  s = db->Put(write_options, "T", "4");
+  ASSERT_OK(s);
+
+  txn1->SetSnapshot();
+  txn1->SetSnapshotOnNextOperation();
+
+  txn1->SetSavePoint();  // 4
+
+  s = db->Put(write_options, "T", "5");
+  ASSERT_OK(s);
+
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+  s = txn1->Get(snapshot_read_options, "T", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("4", value);
+
+  s = txn1->Put("A", "a1");
+  ASSERT_OK(s);
+
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+  s = txn1->Get(snapshot_read_options, "T", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("5", value);
+
+  s = txn1->RollbackToSavePoint();  // Rollback to 4
+  ASSERT_OK(s);
+
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+  s = txn1->Get(snapshot_read_options, "T", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("4", value);
+
+  s = txn1->RollbackToSavePoint();  // Rollback to 3
+  ASSERT_OK(s);
+
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+  s = txn1->Get(snapshot_read_options, "T", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("3", value);
+
+  s = txn1->Get(read_options, "T", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("5", value);
+
+  s = txn1->RollbackToSavePoint();  // Rollback to 2
+  ASSERT_OK(s);
+
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+  ASSERT_FALSE(snapshot_read_options.snapshot);
+  s = txn1->Get(snapshot_read_options, "T", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("5", value);
+
+  s = txn1->Delete("A");
+  ASSERT_OK(s);
+
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+  ASSERT_TRUE(snapshot_read_options.snapshot);
+  s = txn1->Get(snapshot_read_options, "T", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("5", value);
+
+  s = txn1->RollbackToSavePoint();  // Rollback to 1
+  ASSERT_OK(s);
+
+  s = txn1->Delete("A");
+  ASSERT_OK(s);
+
+  snapshot_read_options.snapshot = txn1->GetSnapshot();
+  ASSERT_FALSE(snapshot_read_options.snapshot);
+  s = txn1->Get(snapshot_read_options, "T", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("5", value);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+}
+
+TEST_P(TransactionTest, SetSnapshotOnNextOperationWithNotification) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  std::string value;
+
+  class Notifier : public TransactionNotifier {
+   private:
+    const Snapshot** snapshot_ptr_;
+
+   public:
+    explicit Notifier(const Snapshot** snapshot_ptr)
+        : snapshot_ptr_(snapshot_ptr) {}
+
+    void SnapshotCreated(const Snapshot* newSnapshot) override {
+      *snapshot_ptr_ = newSnapshot;
+    }
+  };
+
+  std::shared_ptr<Notifier> notifier =
+      std::make_shared<Notifier>(&read_options.snapshot);
+  Status s;
+
+  s = db->Put(write_options, "B", "0");
+  ASSERT_OK(s);
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+
+  txn1->SetSnapshotOnNextOperation(notifier);
+  ASSERT_FALSE(read_options.snapshot);
+
+  s = db->Put(write_options, "B", "1");
+  ASSERT_OK(s);
+
+  // A Get does not generate the snapshot
+  s = txn1->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_FALSE(read_options.snapshot);
+  ASSERT_EQ(value, "1");
+
+  // Any other operation does
+  s = txn1->Put("A", "0");
+  ASSERT_OK(s);
+
+  // Now change "B".
+  s = db->Put(write_options, "B", "2");
+  ASSERT_OK(s);
+
+  // The original value should still be read
+  s = txn1->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_TRUE(read_options.snapshot);
+  ASSERT_EQ(value, "1");
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+}
+
+TEST_P(TransactionTest, ClearSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  std::string value;
+  Status s;
+
+  s = db->Put(write_options, "foo", "0");
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = db->Put(write_options, "foo", "1");
+  ASSERT_OK(s);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+  ASSERT_FALSE(snapshot_read_options.snapshot);
+
+  // No snapshot created yet
+  s = txn->Get(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "1");
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+  ASSERT_TRUE(snapshot_read_options.snapshot);
+
+  s = db->Put(write_options, "foo", "2");
+  ASSERT_OK(s);
+
+  // Snapshot was created before change to '2'
+  s = txn->Get(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "1");
+
+  txn->ClearSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+  ASSERT_FALSE(snapshot_read_options.snapshot);
+
+  // Snapshot has now been cleared
+  s = txn->Get(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "2");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+}
+
+TEST_P(TransactionTest, ToggleAutoCompactionTest) {
+  Status s;
+
+  ColumnFamilyHandle *cfa, *cfb;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+  ASSERT_OK(s);
+
+  delete cfa;
+  delete cfb;
+  delete db;
+
+  // open DB with three column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column families
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+
+  ColumnFamilyOptions* cf_opt_default = &column_families[0].options;
+  ColumnFamilyOptions* cf_opt_cfa = &column_families[1].options;
+  ColumnFamilyOptions* cf_opt_cfb = &column_families[2].options;
+  cf_opt_default->disable_auto_compactions = false;
+  cf_opt_cfa->disable_auto_compactions = true;
+  cf_opt_cfb->disable_auto_compactions = false;
+
+  std::vector<ColumnFamilyHandle*> handles;
+
+  s = TransactionDB::Open(options, txn_db_options, dbname, column_families,
+                          &handles, &db);
+  ASSERT_OK(s);
+
+  auto cfh_default = static_cast_with_check<ColumnFamilyHandleImpl>(handles[0]);
+  auto opt_default = *cfh_default->cfd()->GetLatestMutableCFOptions();
+
+  auto cfh_a = static_cast_with_check<ColumnFamilyHandleImpl>(handles[1]);
+  auto opt_a = *cfh_a->cfd()->GetLatestMutableCFOptions();
+
+  auto cfh_b = static_cast_with_check<ColumnFamilyHandleImpl>(handles[2]);
+  auto opt_b = *cfh_b->cfd()->GetLatestMutableCFOptions();
+
+  ASSERT_EQ(opt_default.disable_auto_compactions, false);
+  ASSERT_EQ(opt_a.disable_auto_compactions, true);
+  ASSERT_EQ(opt_b.disable_auto_compactions, false);
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+TEST_P(TransactionStressTest, ExpiredTransactionDataRace1) {
+  // In this test, txn1 should succeed committing,
+  // as the callback is called after txn1 starts committing.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"TransactionTest::ExpirableTransactionDataRace:1"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TransactionTest::ExpirableTransactionDataRace:1", [&](void* /*arg*/) {
+        WriteOptions write_options;
+        TransactionOptions txn_options;
+
+        // Force txn1 to expire
+        /* sleep override */
+        std::this_thread::sleep_for(std::chrono::milliseconds(1500));
+
+        Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+        Status s;
+        s = txn2->Put("X", "2");
+        ASSERT_TRUE(s.IsTimedOut());
+        s = txn2->Commit();
+        ASSERT_OK(s);
+        delete txn2;
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+
+  txn_options.expiration = 1000;  // 1 second
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  Status s;
+  s = txn1->Put("X", "1");
+  ASSERT_OK(s);
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  ReadOptions read_options;
+  std::string value;
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("1", value);
+
+  delete txn1;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+namespace {
+// cmt_delay_ms is the delay between prepare and commit
+// first_id is the id of the first transaction
+Status TransactionStressTestInserter(
+    TransactionDB* db, const size_t num_transactions, const size_t num_sets,
+    const size_t num_keys_per_set, Random64* rand,
+    const uint64_t cmt_delay_ms = 0, const uint64_t first_id = 0) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
+
+  // Inside the inserter we might also retake the snapshot. We do both since two
+  // separte functions are engaged for each.
+  txn_options.set_snapshot = rand->OneIn(2);
+
+  RandomTransactionInserter inserter(
+      rand, write_options, read_options, num_keys_per_set,
+      static_cast<uint16_t>(num_sets), cmt_delay_ms, first_id);
+
+  for (size_t t = 0; t < num_transactions; t++) {
+    bool success = inserter.TransactionDBInsert(db, txn_options);
+    if (!success) {
+      // unexpected failure
+      return inserter.GetLastStatus();
+    }
+  }
+  inserter.GetLastStatus().PermitUncheckedError();
+
+  // Make sure at least some of the transactions succeeded.  It's ok if
+  // some failed due to write-conflicts.
+  if (num_transactions != 1 &&
+      inserter.GetFailureCount() > num_transactions / 2) {
+    return Status::TryAgain("Too many transactions failed! " +
+                            std::to_string(inserter.GetFailureCount()) + " / " +
+                            std::to_string(num_transactions));
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+// Worker threads add a number to a key from each set of keys. The checker
+// threads verify that the sum of all keys in each set are equal.
+TEST_P(MySQLStyleTransactionTest, TransactionStressTest) {
+  // Small write buffer to trigger more compactions
+  options.write_buffer_size = 1024;
+  txn_db_options.rollback_deletion_type_callback =
+      [](TransactionDB*, ColumnFamilyHandle*, const Slice& key) {
+        return RandomTransactionInserter::RollbackDeletionTypeCallback(key);
+      };
+  ASSERT_OK(ReOpenNoDelete());
+  constexpr size_t num_workers = 4;        // worker threads count
+  constexpr size_t num_checkers = 2;       // checker threads count
+  constexpr size_t num_slow_checkers = 2;  // checker threads emulating backups
+  constexpr size_t num_slow_workers = 1;   // slow worker threads count
+  constexpr size_t num_transactions_per_thread = 1000;
+  constexpr uint16_t num_sets = 3;
+  constexpr size_t num_keys_per_set = 100;
+  // Setting the key-space to be 100 keys should cause enough write-conflicts
+  // to make this test interesting.
+
+  std::vector<port::Thread> threads;
+  std::atomic<uint32_t> finished = {0};
+  constexpr bool TAKE_SNAPSHOT = true;
+  uint64_t time_seed = env->NowMicros();
+  printf("time_seed is %" PRIu64 "\n", time_seed);  // would help to reproduce
+
+  std::function<void()> call_inserter = [&] {
+    size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rand(time_seed * thd_seed);
+    ASSERT_OK(TransactionStressTestInserter(db, num_transactions_per_thread,
+                                            num_sets, num_keys_per_set, &rand));
+    finished++;
+  };
+  std::function<void()> call_checker = [&] {
+    size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rand(time_seed * thd_seed);
+    // Verify that data is consistent
+    while (finished < num_workers) {
+      ASSERT_OK(RandomTransactionInserter::Verify(
+          db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand));
+    }
+  };
+  std::function<void()> call_slow_checker = [&] {
+    size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rand(time_seed * thd_seed);
+    // Verify that data is consistent
+    while (finished < num_workers) {
+      uint64_t delay_ms = rand.Uniform(100) + 1;
+      Status s = RandomTransactionInserter::Verify(
+          db, num_sets, num_keys_per_set, TAKE_SNAPSHOT, &rand, delay_ms);
+      ASSERT_OK(s);
+    }
+  };
+  std::function<void()> call_slow_inserter = [&] {
+    size_t thd_seed = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rand(time_seed * thd_seed);
+    uint64_t id = 0;
+    // Verify that data is consistent
+    while (finished < num_workers) {
+      uint64_t delay_ms = rand.Uniform(500) + 1;
+      ASSERT_OK(TransactionStressTestInserter(db, 1, num_sets, num_keys_per_set,
+                                              &rand, delay_ms, id++));
+    }
+  };
+
+  for (uint32_t i = 0; i < num_workers; i++) {
+    threads.emplace_back(call_inserter);
+  }
+  for (uint32_t i = 0; i < num_checkers; i++) {
+    threads.emplace_back(call_checker);
+  }
+  if (with_slow_threads_) {
+    for (uint32_t i = 0; i < num_slow_checkers; i++) {
+      threads.emplace_back(call_slow_checker);
+    }
+    for (uint32_t i = 0; i < num_slow_workers; i++) {
+      threads.emplace_back(call_slow_inserter);
+    }
+  }
+
+  // Wait for all threads to finish
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // Verify that data is consistent
+  Status s = RandomTransactionInserter::Verify(db, num_sets, num_keys_per_set,
+                                               !TAKE_SNAPSHOT);
+  ASSERT_OK(s);
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(TransactionTest, MemoryLimitTest) {
+  TransactionOptions txn_options;
+  // Header (12 bytes) + NOOP (1 byte) + 2 * 8 bytes for data.
+  txn_options.max_write_batch_size = 29;
+  // Set threshold to unlimited so that the write batch does not get flushed,
+  // and can hit the memory limit.
+  txn_options.write_batch_flush_threshold = 0;
+  std::string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(WriteOptions(), txn_options);
+  ASSERT_TRUE(txn);
+
+  ASSERT_EQ(0, txn->GetNumPuts());
+  ASSERT_LE(0, txn->GetID());
+
+  s = txn->Put(Slice("a"), Slice("...."));
+  ASSERT_OK(s);
+  ASSERT_EQ(1, txn->GetNumPuts());
+
+  s = txn->Put(Slice("b"), Slice("...."));
+  ASSERT_OK(s);
+  ASSERT_EQ(2, txn->GetNumPuts());
+
+  s = txn->Put(Slice("b"), Slice("...."));
+  ASSERT_TRUE(s.IsMemoryLimit());
+  ASSERT_EQ(2, txn->GetNumPuts());
+
+  ASSERT_OK(txn->Rollback());
+  delete txn;
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// This test clarifies the existing expectation from the sequence number
+// algorithm. It could detect mistakes in updating the code but it is not
+// necessarily the one acceptable way. If the algorithm is legitimately changed,
+// this unit test should be updated as well.
+TEST_P(TransactionStressTest, SeqAdvanceTest) {
+  // TODO(myabandeh): must be test with false before new releases
+  const bool short_test = true;
+  WriteOptions wopts;
+  FlushOptions fopt;
+
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  // Do the test with NUM_BRANCHES branches in it. Each run of a test takes some
+  // of the branches. This is the same as counting a binary number where i-th
+  // bit represents whether we take branch i in the represented by the number.
+  const size_t NUM_BRANCHES = short_test ? 6 : 10;
+  // Helper function that shows if the branch is to be taken in the run
+  // represented by the number n.
+  auto branch_do = [&](size_t n, size_t* branch) {
+    assert(*branch < NUM_BRANCHES);
+    const size_t filter = static_cast<size_t>(1) << *branch;
+    return n & filter;
+  };
+  const size_t max_n = static_cast<size_t>(1) << NUM_BRANCHES;
+  for (size_t n = 0; n < max_n; n++) {
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    size_t branch = 0;
+    auto seq = db_impl->GetLatestSequenceNumber();
+    exp_seq = seq;
+    TestTxn0(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+
+    // Doing it twice might detect some bugs
+    TestTxn0(1);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    TestTxn1(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+
+    TestTxn3(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+
+    TestTxn4(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+
+    TestTxn2(0);
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    if (branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->Flush(fopt));
+      seq = db_impl->TEST_GetLastVisibleSequence();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    if (!short_test && branch_do(n, &branch)) {
+      ASSERT_OK(db_impl->FlushWAL(true));
+      ASSERT_OK(ReOpenNoDelete());
+      db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      seq = db_impl->GetLatestSequenceNumber();
+      ASSERT_EQ(exp_seq, seq);
+    }
+    ASSERT_OK(ReOpen());
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// Verify that the optimization would not compromize the correctness
+TEST_P(TransactionTest, Optimizations) {
+  size_t comb_cnt = size_t(1) << 2;  // 2 is number of optimization vars
+  for (size_t new_comb = 0; new_comb < comb_cnt; new_comb++) {
+    TransactionDBWriteOptimizations optimizations;
+    optimizations.skip_concurrency_control = IsInCombination(0, new_comb);
+    optimizations.skip_duplicate_key_check = IsInCombination(1, new_comb);
+
+    ASSERT_OK(ReOpen());
+    WriteOptions write_options;
+    WriteBatch batch;
+    ASSERT_OK(batch.Put(Slice("k"), Slice("v1")));
+    ASSERT_OK(db->Write(write_options, &batch));
+
+    ReadOptions ropt;
+    PinnableSlice pinnable_val;
+    ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "k", &pinnable_val));
+    ASSERT_TRUE(pinnable_val == ("v1"));
+  }
+}
+
+// A comparator that uses only the first three bytes
+class ThreeBytewiseComparator : public Comparator {
+ public:
+  ThreeBytewiseComparator() {}
+  const char* Name() const override { return "test.ThreeBytewiseComparator"; }
+  int Compare(const Slice& a, const Slice& b) const override {
+    Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
+    Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
+    return na.compare(nb);
+  }
+  bool Equal(const Slice& a, const Slice& b) const override {
+    Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
+    Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
+    return na == nb;
+  }
+  // These methods below don't seem relevant to this test. Implement them if
+  // proven othersize.
+  void FindShortestSeparator(std::string* start,
+                             const Slice& limit) const override {
+    const Comparator* bytewise_comp = BytewiseComparator();
+    bytewise_comp->FindShortestSeparator(start, limit);
+  }
+  void FindShortSuccessor(std::string* key) const override {
+    const Comparator* bytewise_comp = BytewiseComparator();
+    bytewise_comp->FindShortSuccessor(key);
+  }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(TransactionTest, GetWithoutSnapshot) {
+  WriteOptions write_options;
+  std::atomic<bool> finish = {false};
+  ASSERT_OK(db->Put(write_options, "key", "value"));
+  ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+    for (int i = 0; i < 100; i++) {
+      TransactionOptions txn_options;
+      Transaction* txn = db->BeginTransaction(write_options, txn_options);
+      ASSERT_OK(txn->SetName("xid"));
+      ASSERT_OK(txn->Put("key", "overridedvalue"));
+      ASSERT_OK(txn->Put("key", "value"));
+      ASSERT_OK(txn->Prepare());
+      ASSERT_OK(txn->Commit());
+      delete txn;
+    }
+    finish = true;
+  });
+  ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+    while (!finish) {
+      ReadOptions ropt;
+      PinnableSlice pinnable_val;
+      ASSERT_OK(db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val));
+      ASSERT_TRUE(pinnable_val == ("value"));
+    }
+  });
+  commit_thread.join();
+  read_thread.join();
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// Test that the transactional db can handle duplicate keys in the write batch
+TEST_P(TransactionTest, DuplicateKeys) {
+  ColumnFamilyOptions cf_options;
+  std::string cf_name = "two";
+  ColumnFamilyHandle* cf_handle = nullptr;
+  {
+    ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+    WriteOptions write_options;
+    WriteBatch batch;
+    ASSERT_OK(batch.Put(Slice("key"), Slice("value")));
+    ASSERT_OK(batch.Put(Slice("key2"), Slice("value2")));
+    // duplicate the keys
+    ASSERT_OK(batch.Put(Slice("key"), Slice("value3")));
+    // duplicate the 2nd key. It should not be counted duplicate since a
+    // sub-patch is cut after the last duplicate.
+    ASSERT_OK(batch.Put(Slice("key2"), Slice("value4")));
+    // duplicate the keys but in a different cf. It should not be counted as
+    // duplicate keys
+    ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value5")));
+
+    ASSERT_OK(db->Write(write_options, &batch));
+
+    ReadOptions ropt;
+    PinnableSlice pinnable_val;
+    auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("value3"));
+    s = db->Get(ropt, db->DefaultColumnFamily(), "key2", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("value4"));
+    s = db->Get(ropt, cf_handle, "key", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("value5"));
+
+    delete cf_handle;
+  }
+
+  // Test with non-bytewise comparator
+  {
+    ASSERT_OK(ReOpen());
+    std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
+    cf_options.comparator = comp_gc.get();
+    ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+    WriteOptions write_options;
+    WriteBatch batch;
+    ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value")));
+    // The first three bytes are the same, do it must be counted as duplicate
+    ASSERT_OK(batch.Put(cf_handle, Slice("key2"), Slice("value2")));
+    // check for 2nd duplicate key in cf with non-default comparator
+    ASSERT_OK(batch.Put(cf_handle, Slice("key2b"), Slice("value2b")));
+    ASSERT_OK(db->Write(write_options, &batch));
+
+    // The value must be the most recent value for all the keys equal to "key",
+    // including "key2"
+    ReadOptions ropt;
+    PinnableSlice pinnable_val;
+    ASSERT_OK(db->Get(ropt, cf_handle, "key", &pinnable_val));
+    ASSERT_TRUE(pinnable_val == ("value2b"));
+
+    // Test duplicate keys with rollback
+    TransactionOptions txn_options;
+    Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(cf_handle, Slice("key3"), Slice("value3")));
+    ASSERT_OK(txn0->Merge(cf_handle, Slice("key4"), Slice("value4")));
+    ASSERT_OK(txn0->Rollback());
+    ASSERT_OK(db->Get(ropt, cf_handle, "key5", &pinnable_val));
+    ASSERT_TRUE(pinnable_val == ("value2b"));
+    delete txn0;
+
+    delete cf_handle;
+    cf_options.comparator = BytewiseComparator();
+  }
+
+  for (bool do_prepare : {true, false}) {
+    for (bool do_rollback : {true, false}) {
+      for (bool with_commit_batch : {true, false}) {
+        if (with_commit_batch && !do_prepare) {
+          continue;
+        }
+        if (with_commit_batch && do_rollback) {
+          continue;
+        }
+        ASSERT_OK(ReOpen());
+        ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+        TransactionOptions txn_options;
+        txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
+        WriteOptions write_options;
+        Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+        auto s = txn0->SetName("xid");
+        ASSERT_OK(s);
+        s = txn0->Put(Slice("foo0"), Slice("bar0a"));
+        ASSERT_OK(s);
+        s = txn0->Put(Slice("foo0"), Slice("bar0b"));
+        ASSERT_OK(s);
+        s = txn0->Put(Slice("foo1"), Slice("bar1"));
+        ASSERT_OK(s);
+        s = txn0->Merge(Slice("foo2"), Slice("bar2a"));
+        ASSERT_OK(s);
+        // Repeat a key after the start of a sub-patch. This should not cause a
+        // duplicate in the most recent sub-patch and hence not creating a new
+        // sub-patch.
+        s = txn0->Put(Slice("foo0"), Slice("bar0c"));
+        ASSERT_OK(s);
+        s = txn0->Merge(Slice("foo2"), Slice("bar2b"));
+        ASSERT_OK(s);
+        // duplicate the keys but in a different cf. It should not be counted as
+        // duplicate.
+        s = txn0->Put(cf_handle, Slice("foo0"), Slice("bar0-cf1"));
+        ASSERT_OK(s);
+        s = txn0->Put(Slice("foo3"), Slice("bar3"));
+        ASSERT_OK(s);
+        s = txn0->Merge(Slice("foo3"), Slice("bar3"));
+        ASSERT_OK(s);
+        s = txn0->Put(Slice("foo4"), Slice("bar4"));
+        ASSERT_OK(s);
+        s = txn0->Delete(Slice("foo4"));
+        ASSERT_OK(s);
+        s = txn0->SingleDelete(Slice("foo4"));
+        ASSERT_OK(s);
+        if (do_prepare) {
+          s = txn0->Prepare();
+          ASSERT_OK(s);
+        }
+        if (do_rollback) {
+          // Test rolling back the batch with duplicates
+          s = txn0->Rollback();
+          ASSERT_OK(s);
+        } else {
+          if (with_commit_batch) {
+            assert(do_prepare);
+            auto cb = txn0->GetCommitTimeWriteBatch();
+            // duplicate a key in the original batch
+            // TODO(myabandeh): the behavior of GetCommitTimeWriteBatch
+            // conflicting with the prepared batch is currently undefined and
+            // gives different results in different implementations.
+
+            // s = cb->Put(Slice("foo0"), Slice("bar0d"));
+            // ASSERT_OK(s);
+            // add a new duplicate key
+            s = cb->Put(Slice("foo6"), Slice("bar6a"));
+            ASSERT_OK(s);
+            s = cb->Put(Slice("foo6"), Slice("bar6b"));
+            ASSERT_OK(s);
+            // add a duplicate key that is removed in the same batch
+            s = cb->Put(Slice("foo7"), Slice("bar7a"));
+            ASSERT_OK(s);
+            s = cb->Delete(Slice("foo7"));
+            ASSERT_OK(s);
+          }
+          s = txn0->Commit();
+          ASSERT_OK(s);
+        }
+        delete txn0;
+        ReadOptions ropt;
+        PinnableSlice pinnable_val;
+
+        if (do_rollback) {
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+          ASSERT_TRUE(s.IsNotFound());
+          s = db->Get(ropt, cf_handle, "foo0", &pinnable_val);
+          ASSERT_TRUE(s.IsNotFound());
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
+          ASSERT_TRUE(s.IsNotFound());
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val);
+          ASSERT_TRUE(s.IsNotFound());
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val);
+          ASSERT_TRUE(s.IsNotFound());
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val);
+          ASSERT_TRUE(s.IsNotFound());
+        } else {
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+          ASSERT_OK(s);
+          ASSERT_TRUE(pinnable_val == ("bar0c"));
+          s = db->Get(ropt, cf_handle, "foo0", &pinnable_val);
+          ASSERT_OK(s);
+          ASSERT_TRUE(pinnable_val == ("bar0-cf1"));
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
+          ASSERT_OK(s);
+          ASSERT_TRUE(pinnable_val == ("bar1"));
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo2", &pinnable_val);
+          ASSERT_OK(s);
+          ASSERT_TRUE(pinnable_val == ("bar2a,bar2b"));
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo3", &pinnable_val);
+          ASSERT_OK(s);
+          ASSERT_TRUE(pinnable_val == ("bar3,bar3"));
+          s = db->Get(ropt, db->DefaultColumnFamily(), "foo4", &pinnable_val);
+          ASSERT_TRUE(s.IsNotFound());
+          if (with_commit_batch) {
+            s = db->Get(ropt, db->DefaultColumnFamily(), "foo6", &pinnable_val);
+            if (txn_db_options.write_policy ==
+                TxnDBWritePolicy::WRITE_COMMITTED) {
+              ASSERT_OK(s);
+              ASSERT_TRUE(pinnable_val == ("bar6b"));
+            } else {
+              ASSERT_TRUE(s.IsNotFound());
+            }
+            s = db->Get(ropt, db->DefaultColumnFamily(), "foo7", &pinnable_val);
+            ASSERT_TRUE(s.IsNotFound());
+          }
+        }
+        delete cf_handle;
+      }  // with_commit_batch
+    }    // do_rollback
+  }      // do_prepare
+
+  if (!options.unordered_write) {
+    // Also test with max_successive_merges > 0. max_successive_merges will not
+    // affect our algorithm for duplicate key insertion but we add the test to
+    // verify that.
+    cf_options.max_successive_merges = 2;
+    cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    ASSERT_OK(ReOpen());
+    db->CreateColumnFamily(cf_options, cf_name, &cf_handle);
+    WriteOptions write_options;
+    // Ensure one value for the key
+    ASSERT_OK(db->Put(write_options, cf_handle, Slice("key"), Slice("value")));
+    WriteBatch batch;
+    // Merge more than max_successive_merges times
+    ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("1")));
+    ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("2")));
+    ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("3")));
+    ASSERT_OK(batch.Merge(cf_handle, Slice("key"), Slice("4")));
+    ASSERT_OK(db->Write(write_options, &batch));
+    ReadOptions read_options;
+    std::string value;
+    ASSERT_OK(db->Get(read_options, cf_handle, "key", &value));
+    ASSERT_EQ(value, "value,1,2,3,4");
+    delete cf_handle;
+  }
+
+  {
+    // Test that the duplicate detection is not compromised after rolling back
+    // to a save point
+    TransactionOptions txn_options;
+    WriteOptions write_options;
+    Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
+    txn0->SetSavePoint();
+    ASSERT_OK(txn0->RollbackToSavePoint());
+    ASSERT_OK(txn0->Commit());
+    delete txn0;
+  }
+
+  // Test sucessfull recovery after a crash
+  {
+    ASSERT_OK(ReOpen());
+    TransactionOptions txn_options;
+    WriteOptions write_options;
+    ReadOptions ropt;
+    Transaction* txn0;
+    PinnableSlice pinnable_val;
+    Status s;
+
+    std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
+    cf_options.comparator = comp_gc.get();
+    cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+    delete cf_handle;
+    std::vector<ColumnFamilyDescriptor> cfds{
+        ColumnFamilyDescriptor(kDefaultColumnFamilyName,
+                               ColumnFamilyOptions(options)),
+        ColumnFamilyDescriptor(cf_name, cf_options),
+    };
+    std::vector<ColumnFamilyHandle*> handles;
+    ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+
+    assert(db != nullptr);
+    ASSERT_OK(db->Put(write_options, "foo0", "init"));
+    ASSERT_OK(db->Put(write_options, "foo1", "init"));
+    ASSERT_OK(db->Put(write_options, handles[1], "foo0", "init"));
+    ASSERT_OK(db->Put(write_options, handles[1], "foo1", "init"));
+
+    // one entry
+    txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
+    ASSERT_OK(txn0->Prepare());
+    delete txn0;
+    // This will check the asserts inside recovery code
+    ASSERT_OK(db->FlushWAL(true));
+    reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+    ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+    txn0 = db->GetTransactionByName("xid");
+    ASSERT_TRUE(txn0 != nullptr);
+    ASSERT_OK(txn0->Commit());
+    delete txn0;
+    s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar0a"));
+
+    // two entries, no duplicate
+    txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(handles[1], Slice("foo0"), Slice("bar0b")));
+    ASSERT_OK(txn0->Put(handles[1], Slice("fol1"), Slice("bar1b")));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
+    ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1b")));
+    ASSERT_OK(txn0->Prepare());
+    delete txn0;
+    // This will check the asserts inside recovery code
+    ASSERT_OK(db->FlushWAL(true));
+    // Flush only cf 1
+    ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+                  ->TEST_FlushMemTable(true, false, handles[1]));
+    reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+    ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+    txn0 = db->GetTransactionByName("xid");
+    ASSERT_TRUE(txn0 != nullptr);
+    ASSERT_OK(txn0->Commit());
+    delete txn0;
+    pinnable_val.Reset();
+    s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar0b"));
+    pinnable_val.Reset();
+    s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar1b"));
+    pinnable_val.Reset();
+    s = db->Get(ropt, handles[1], "foo0", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar0b"));
+    pinnable_val.Reset();
+    s = db->Get(ropt, handles[1], "fol1", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar1b"));
+
+    // one duplicate with ::Put
+    txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0c")));
+    ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey1"), Slice("bar1d")));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0c")));
+    ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1c")));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0d")));
+    ASSERT_OK(txn0->Prepare());
+    delete txn0;
+    // This will check the asserts inside recovery code
+    ASSERT_OK(db->FlushWAL(true));
+    // Flush only cf 1
+    ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+                  ->TEST_FlushMemTable(true, false, handles[1]));
+    reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+    ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+    txn0 = db->GetTransactionByName("xid");
+    ASSERT_TRUE(txn0 != nullptr);
+    ASSERT_OK(txn0->Commit());
+    delete txn0;
+    pinnable_val.Reset();
+    s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar0d"));
+    pinnable_val.Reset();
+    s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar1c"));
+    pinnable_val.Reset();
+    s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar1d"));
+
+    // Duplicate with ::Put, ::Delete
+    txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0e")));
+    ASSERT_OK(txn0->Delete(handles[1], Slice("key-nonkey1")));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e")));
+    ASSERT_OK(txn0->Delete(Slice("foo0")));
+    ASSERT_OK(txn0->Prepare());
+    delete txn0;
+    // This will check the asserts inside recovery code
+    ASSERT_OK(db->FlushWAL(true));
+    // Flush only cf 1
+    ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+                  ->TEST_FlushMemTable(true, false, handles[1]));
+    reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+    ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+    txn0 = db->GetTransactionByName("xid");
+    ASSERT_TRUE(txn0 != nullptr);
+    ASSERT_OK(txn0->Commit());
+    delete txn0;
+    pinnable_val.Reset();
+    s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+    ASSERT_TRUE(s.IsNotFound());
+    pinnable_val.Reset();
+    s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
+    ASSERT_TRUE(s.IsNotFound());
+
+    // Duplicate with ::Put, ::SingleDelete
+    txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0g")));
+    ASSERT_OK(txn0->SingleDelete(handles[1], Slice("key-nonkey1")));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e")));
+    ASSERT_OK(txn0->SingleDelete(Slice("foo0")));
+    ASSERT_OK(txn0->Prepare());
+    delete txn0;
+    // This will check the asserts inside recovery code
+    ASSERT_OK(db->FlushWAL(true));
+    // Flush only cf 1
+    ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+                  ->TEST_FlushMemTable(true, false, handles[1]));
+    reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+    ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+    txn0 = db->GetTransactionByName("xid");
+    ASSERT_TRUE(txn0 != nullptr);
+    ASSERT_OK(txn0->Commit());
+    delete txn0;
+    pinnable_val.Reset();
+    s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+    ASSERT_TRUE(s.IsNotFound());
+    pinnable_val.Reset();
+    s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
+    ASSERT_TRUE(s.IsNotFound());
+
+    // Duplicate with ::Put, ::Merge
+    txn0 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar1i")));
+    ASSERT_OK(txn0->Merge(handles[1], Slice("key-nonkey1"), Slice("bar1j")));
+    ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0f")));
+    ASSERT_OK(txn0->Merge(Slice("foo0"), Slice("bar0g")));
+    ASSERT_OK(txn0->Prepare());
+    delete txn0;
+    // This will check the asserts inside recovery code
+    ASSERT_OK(db->FlushWAL(true));
+    // Flush only cf 1
+    ASSERT_OK(static_cast_with_check<DBImpl>(db->GetRootDB())
+                  ->TEST_FlushMemTable(true, false, handles[1]));
+    reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+    ASSERT_OK(ReOpenNoDelete(cfds, &handles));
+    txn0 = db->GetTransactionByName("xid");
+    ASSERT_TRUE(txn0 != nullptr);
+    ASSERT_OK(txn0->Commit());
+    delete txn0;
+    pinnable_val.Reset();
+    s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar0f,bar0g"));
+    pinnable_val.Reset();
+    s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(pinnable_val == ("bar1i,bar1j"));
+
+    for (auto h : handles) {
+      delete h;
+    }
+    delete db;
+    db = nullptr;
+  }
+}
+
+// Test that the reseek optimization in iterators will not result in an infinite
+// loop if there are too many uncommitted entries before the snapshot.
+TEST_P(TransactionTest, ReseekOptimization) {
+  WriteOptions write_options;
+  write_options.sync = true;
+  write_options.disableWAL = false;
+  ColumnFamilyDescriptor cfd;
+  ASSERT_OK(db->DefaultColumnFamily()->GetDescriptor(&cfd));
+  auto max_skip = cfd.options.max_sequential_skip_in_iterations;
+
+  ASSERT_OK(db->Put(write_options, Slice("foo0"), Slice("initv")));
+
+  TransactionOptions txn_options;
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_OK(txn0->SetName("xid"));
+  // Duplicate keys will result into separate sequence numbers in WritePrepared
+  // and WriteUnPrepared
+  for (size_t i = 0; i < 2 * max_skip; i++) {
+    ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar")));
+  }
+  ASSERT_OK(txn0->Prepare());
+  ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("initv")));
+
+  ReadOptions read_options;
+  // To avoid loops
+  read_options.max_skippable_internal_keys = 10 * max_skip;
+  Iterator* iter = db->NewIterator(read_options);
+  ASSERT_OK(iter->status());
+  size_t cnt = 0;
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    iter->Next();
+    ASSERT_OK(iter->status());
+    cnt++;
+  }
+  ASSERT_EQ(cnt, 2);
+  cnt = 0;
+  iter->SeekToLast();
+  while (iter->Valid()) {
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    cnt++;
+  }
+  ASSERT_EQ(cnt, 2);
+  delete iter;
+  ASSERT_OK(txn0->Rollback());
+  delete txn0;
+}
+
+// After recovery in kPointInTimeRecovery mode, the corrupted log file remains
+// there. The new log files should be still read succesfully during recovery of
+// the 2nd crash.
+TEST_P(TransactionTest, DoubleCrashInRecovery) {
+  for (const bool manual_wal_flush : {false, true}) {
+    for (const bool write_after_recovery : {false, true}) {
+      options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+      options.manual_wal_flush = manual_wal_flush;
+      ASSERT_OK(ReOpen());
+      std::string cf_name = "two";
+      ColumnFamilyOptions cf_options;
+      ColumnFamilyHandle* cf_handle = nullptr;
+      ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+
+      // Add a prepare entry to prevent the older logs from being deleted.
+      WriteOptions write_options;
+      TransactionOptions txn_options;
+      Transaction* txn = db->BeginTransaction(write_options, txn_options);
+      ASSERT_OK(txn->SetName("xid"));
+      ASSERT_OK(txn->Put(Slice("foo-prepare"), Slice("bar-prepare")));
+      ASSERT_OK(txn->Prepare());
+
+      FlushOptions flush_ops;
+      ASSERT_OK(db->Flush(flush_ops));
+      // Now we have a log that cannot be deleted
+
+      ASSERT_OK(db->Put(write_options, cf_handle, "foo1", "bar1"));
+      // Flush only the 2nd cf
+      ASSERT_OK(db->Flush(flush_ops, cf_handle));
+
+      // The value is large enough to be touched by the corruption we ingest
+      // below.
+      std::string large_value(400, ' ');
+      // key/value not touched by corruption
+      ASSERT_OK(db->Put(write_options, "foo2", "bar2"));
+      // key/value touched by corruption
+      ASSERT_OK(db->Put(write_options, "foo3", large_value));
+      // key/value not touched by corruption
+      ASSERT_OK(db->Put(write_options, "foo4", "bar4"));
+
+      ASSERT_OK(db->FlushWAL(true));
+      DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      uint64_t wal_file_id = db_impl->TEST_LogfileNumber();
+      std::string fname = LogFileName(dbname, wal_file_id);
+      reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
+      delete txn;
+      delete cf_handle;
+      delete db;
+      db = nullptr;
+
+      // Corrupt the last log file in the middle, so that it is not corrupted
+      // in the tail.
+      std::string file_content;
+      ASSERT_OK(ReadFileToString(env, fname, &file_content));
+      file_content[400] = 'h';
+      file_content[401] = 'a';
+      ASSERT_OK(env->DeleteFile(fname));
+      ASSERT_OK(WriteStringToFile(env, file_content, fname, true));
+
+      // Recover from corruption
+      std::vector<ColumnFamilyHandle*> handles;
+      std::vector<ColumnFamilyDescriptor> column_families;
+      column_families.push_back(ColumnFamilyDescriptor(kDefaultColumnFamilyName,
+                                                       ColumnFamilyOptions()));
+      column_families.push_back(
+          ColumnFamilyDescriptor("two", ColumnFamilyOptions()));
+      ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+      assert(db != nullptr);
+
+      if (write_after_recovery) {
+        // Write data to the log right after the corrupted log
+        ASSERT_OK(db->Put(write_options, "foo5", large_value));
+      }
+
+      // Persist data written to WAL during recovery or by the last Put
+      ASSERT_OK(db->FlushWAL(true));
+      // 2nd crash to recover while having a valid log after the corrupted one.
+      ASSERT_OK(ReOpenNoDelete(column_families, &handles));
+      assert(db != nullptr);
+      txn = db->GetTransactionByName("xid");
+      ASSERT_TRUE(txn != nullptr);
+      ASSERT_OK(txn->Commit());
+      delete txn;
+      for (auto handle : handles) {
+        delete handle;
+      }
+    }
+  }
+}
+
+TEST_P(TransactionTest, CommitWithoutPrepare) {
+  {
+    // skip_prepare = false.
+    WriteOptions write_options;
+    TransactionOptions txn_options;
+    txn_options.skip_prepare = false;
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn->Commit().IsTxnNotPrepared());
+    delete txn;
+  }
+
+  {
+    // skip_prepare = true.
+    WriteOptions write_options;
+    TransactionOptions txn_options;
+    txn_options.skip_prepare = true;
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+}
+
+TEST_P(TransactionTest, OpenAndEnableU64Timestamp) {
+  ASSERT_OK(ReOpenNoDelete());
+
+  assert(db);
+
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyOptions cf_opts;
+  cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  {
+    ColumnFamilyHandle* cfh = nullptr;
+    const Status s = db->CreateColumnFamily(cf_opts, test_cf_name, &cfh);
+    if (txn_db_options.write_policy == WRITE_COMMITTED) {
+      ASSERT_OK(s);
+      delete cfh;
+    } else {
+      ASSERT_TRUE(s.IsNotSupported());
+      assert(!cfh);
+    }
+  }
+
+  // Bypass transaction db layer.
+  if (txn_db_options.write_policy != WRITE_COMMITTED) {
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    assert(db_impl);
+    ColumnFamilyHandle* cfh = nullptr;
+    ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+    delete cfh;
+  }
+
+  {
+    std::vector<ColumnFamilyDescriptor> cf_descs;
+    cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+    cf_descs.emplace_back(test_cf_name, cf_opts);
+    std::vector<ColumnFamilyHandle*> handles;
+    const Status s = ReOpenNoDelete(cf_descs, &handles);
+    if (txn_db_options.write_policy == WRITE_COMMITTED) {
+      ASSERT_OK(s);
+      for (auto* h : handles) {
+        delete h;
+      }
+    } else {
+      ASSERT_TRUE(s.IsNotSupported());
+    }
+  }
+}
+
+TEST_P(TransactionTest, OpenAndEnableU32Timestamp) {
+  class DummyComparatorWithU32Ts : public Comparator {
+   public:
+    DummyComparatorWithU32Ts() : Comparator(sizeof(uint32_t)) {}
+    const char* Name() const override { return "DummyComparatorWithU32Ts"; }
+    void FindShortSuccessor(std::string*) const override {}
+    void FindShortestSeparator(std::string*, const Slice&) const override {}
+    int Compare(const Slice&, const Slice&) const override { return 0; }
+  };
+
+  std::unique_ptr<Comparator> dummy_ucmp(new DummyComparatorWithU32Ts());
+
+  ASSERT_OK(ReOpenNoDelete());
+
+  assert(db);
+
+  const std::string test_cf_name = "test_cf";
+
+  ColumnFamilyOptions cf_opts;
+  cf_opts.comparator = dummy_ucmp.get();
+  {
+    ColumnFamilyHandle* cfh = nullptr;
+    ASSERT_TRUE(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh)
+                    .IsInvalidArgument());
+  }
+
+  // Bypass transaction db layer.
+  {
+    ColumnFamilyHandle* cfh = nullptr;
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    assert(db_impl);
+    ASSERT_OK(db_impl->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+    delete cfh;
+  }
+
+  {
+    std::vector<ColumnFamilyDescriptor> cf_descs;
+    cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+    cf_descs.emplace_back(test_cf_name, cf_opts);
+    std::vector<ColumnFamilyHandle*> handles;
+    ASSERT_TRUE(ReOpenNoDelete(cf_descs, &handles).IsInvalidArgument());
+  }
+}
+
+TEST_P(TransactionTest, WriteWithBulkCreatedColumnFamilies) {
+  ColumnFamilyOptions cf_options;
+  WriteOptions write_options;
+
+  std::vector<std::string> cf_names;
+  std::vector<ColumnFamilyHandle*> cf_handles;
+
+  cf_names.push_back("test_cf");
+
+  ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+  ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar"));
+  ASSERT_OK(db->DropColumnFamilies(cf_handles));
+
+  for (auto* h : cf_handles) {
+    delete h;
+  }
+  cf_handles.clear();
+
+  std::vector<ColumnFamilyDescriptor> cf_descriptors;
+
+  cf_descriptors.emplace_back("test_cf", ColumnFamilyOptions());
+
+  ASSERT_OK(db->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+  ASSERT_OK(db->Put(write_options, cf_handles[0], "foo", "bar"));
+  ASSERT_OK(db->DropColumnFamilies(cf_handles));
+  for (auto* h : cf_handles) {
+    delete h;
+  }
+  cf_handles.clear();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_test.h b/src/rocksdb/utilities/transactions/transaction_test.h
new file mode 100644
index 000000000..0b86453a4
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_test.h
@@ -0,0 +1,578 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "table/mock_table.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Return true if the ith bit is set in combination represented by comb
+bool IsInCombination(size_t i, size_t comb) { return comb & (size_t(1) << i); }
+
+enum WriteOrdering : bool { kOrderedWrite, kUnorderedWrite };
+
+class TransactionTestBase : public ::testing::Test {
+ public:
+  TransactionDB* db;
+  SpecialEnv special_env;
+  FaultInjectionTestEnv* env;
+  std::string dbname;
+  Options options;
+
+  TransactionDBOptions txn_db_options;
+  bool use_stackable_db_;
+
+  TransactionTestBase(bool use_stackable_db, bool two_write_queue,
+                      TxnDBWritePolicy write_policy,
+                      WriteOrdering write_ordering)
+      : db(nullptr),
+        special_env(Env::Default()),
+        env(nullptr),
+        use_stackable_db_(use_stackable_db) {
+    options.create_if_missing = true;
+    options.max_write_buffer_number = 2;
+    options.write_buffer_size = 4 * 1024;
+    options.unordered_write = write_ordering == kUnorderedWrite;
+    options.level0_file_num_compaction_trigger = 2;
+    options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+    special_env.skip_fsync_ = true;
+    env = new FaultInjectionTestEnv(&special_env);
+    options.env = env;
+    options.two_write_queues = two_write_queue;
+    dbname = test::PerThreadDBPath("transaction_testdb");
+
+    EXPECT_OK(DestroyDB(dbname, options));
+    txn_db_options.transaction_lock_timeout = 0;
+    txn_db_options.default_lock_timeout = 0;
+    txn_db_options.write_policy = write_policy;
+    txn_db_options.rollback_merge_operands = true;
+    // This will stress write unprepared, by forcing write batch flush on every
+    // write.
+    txn_db_options.default_write_batch_flush_threshold = 1;
+    // Write unprepared requires all transactions to be named. This setting
+    // autogenerates the name so that existing tests can pass.
+    txn_db_options.autogenerate_name = true;
+    Status s;
+    if (use_stackable_db == false) {
+      s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+    } else {
+      s = OpenWithStackableDB();
+    }
+    EXPECT_OK(s);
+  }
+
+  ~TransactionTestBase() {
+    delete db;
+    db = nullptr;
+    // This is to skip the assert statement in FaultInjectionTestEnv. There
+    // seems to be a bug in btrfs that the makes readdir return recently
+    // unlink-ed files. By using the default fs we simply ignore errors resulted
+    // from attempting to delete such files in DestroyDB.
+    if (getenv("KEEP_DB") == nullptr) {
+      options.env = Env::Default();
+      EXPECT_OK(DestroyDB(dbname, options));
+    } else {
+      fprintf(stdout, "db is still in %s\n", dbname.c_str());
+    }
+    delete env;
+  }
+
+  Status ReOpenNoDelete() {
+    delete db;
+    db = nullptr;
+    env->AssertNoOpenFile();
+    env->DropUnsyncedFileData();
+    env->ResetState();
+    Status s;
+    if (use_stackable_db_ == false) {
+      s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+    } else {
+      s = OpenWithStackableDB();
+    }
+    assert(!s.ok() || db != nullptr);
+    return s;
+  }
+
+  Status ReOpenNoDelete(std::vector<ColumnFamilyDescriptor>& cfs,
+                        std::vector<ColumnFamilyHandle*>* handles) {
+    for (auto h : *handles) {
+      delete h;
+    }
+    handles->clear();
+    delete db;
+    db = nullptr;
+    env->AssertNoOpenFile();
+    env->DropUnsyncedFileData();
+    env->ResetState();
+    Status s;
+    if (use_stackable_db_ == false) {
+      s = TransactionDB::Open(options, txn_db_options, dbname, cfs, handles,
+                              &db);
+    } else {
+      s = OpenWithStackableDB(cfs, handles);
+    }
+    assert(!s.ok() || db != nullptr);
+    return s;
+  }
+
+  Status ReOpen() {
+    delete db;
+    db = nullptr;
+    DestroyDB(dbname, options);
+    Status s;
+    if (use_stackable_db_ == false) {
+      s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+    } else {
+      s = OpenWithStackableDB();
+    }
+    assert(db != nullptr);
+    return s;
+  }
+
+  Status OpenWithStackableDB(std::vector<ColumnFamilyDescriptor>& cfs,
+                             std::vector<ColumnFamilyHandle*>* handles) {
+    std::vector<size_t> compaction_enabled_cf_indices;
+    TransactionDB::PrepareWrap(&options, &cfs, &compaction_enabled_cf_indices);
+    DB* root_db = nullptr;
+    Options options_copy(options);
+    const bool use_seq_per_batch =
+        txn_db_options.write_policy == WRITE_PREPARED ||
+        txn_db_options.write_policy == WRITE_UNPREPARED;
+    const bool use_batch_per_txn =
+        txn_db_options.write_policy == WRITE_COMMITTED ||
+        txn_db_options.write_policy == WRITE_PREPARED;
+    Status s = DBImpl::Open(options_copy, dbname, cfs, handles, &root_db,
+                            use_seq_per_batch, use_batch_per_txn);
+    auto stackable_db = std::make_unique<StackableDB>(root_db);
+    if (s.ok()) {
+      assert(root_db != nullptr);
+      // If WrapStackableDB() returns non-ok, then stackable_db is already
+      // deleted within WrapStackableDB().
+      s = TransactionDB::WrapStackableDB(stackable_db.release(), txn_db_options,
+                                         compaction_enabled_cf_indices,
+                                         *handles, &db);
+    }
+    return s;
+  }
+
+  Status OpenWithStackableDB() {
+    std::vector<size_t> compaction_enabled_cf_indices;
+    std::vector<ColumnFamilyDescriptor> column_families{ColumnFamilyDescriptor(
+        kDefaultColumnFamilyName, ColumnFamilyOptions(options))};
+
+    TransactionDB::PrepareWrap(&options, &column_families,
+                               &compaction_enabled_cf_indices);
+    std::vector<ColumnFamilyHandle*> handles;
+    DB* root_db = nullptr;
+    Options options_copy(options);
+    const bool use_seq_per_batch =
+        txn_db_options.write_policy == WRITE_PREPARED ||
+        txn_db_options.write_policy == WRITE_UNPREPARED;
+    const bool use_batch_per_txn =
+        txn_db_options.write_policy == WRITE_COMMITTED ||
+        txn_db_options.write_policy == WRITE_PREPARED;
+    Status s = DBImpl::Open(options_copy, dbname, column_families, &handles,
+                            &root_db, use_seq_per_batch, use_batch_per_txn);
+    if (!s.ok()) {
+      delete root_db;
+      return s;
+    }
+    StackableDB* stackable_db = new StackableDB(root_db);
+    assert(root_db != nullptr);
+    assert(handles.size() == 1);
+    s = TransactionDB::WrapStackableDB(stackable_db, txn_db_options,
+                                       compaction_enabled_cf_indices, handles,
+                                       &db);
+    delete handles[0];
+    if (!s.ok()) {
+      delete stackable_db;
+    }
+    return s;
+  }
+
+  std::atomic<size_t> linked = {0};
+  std::atomic<size_t> exp_seq = {0};
+  std::atomic<size_t> commit_writes = {0};
+  std::atomic<size_t> expected_commits = {0};
+  // Without Prepare, the commit does not write to WAL
+  std::atomic<size_t> with_empty_commits = {0};
+  void TestTxn0(size_t index) {
+    // Test DB's internal txn. It involves no prepare phase nor a commit marker.
+    auto s = db->Put(WriteOptions(), "key" + std::to_string(index), "value");
+    ASSERT_OK(s);
+    if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+      // Consume one seq per key
+      exp_seq++;
+    } else {
+      // Consume one seq per batch
+      exp_seq++;
+      if (options.two_write_queues) {
+        // Consume one seq for commit
+        exp_seq++;
+      }
+    }
+    with_empty_commits++;
+  }
+
+  void TestTxn1(size_t index) {
+    // Testing directly writing a write batch. Functionality-wise it is
+    // equivalent to commit without prepare.
+    WriteBatch wb;
+    auto istr = std::to_string(index);
+    ASSERT_OK(wb.Put("k1" + istr, "v1"));
+    ASSERT_OK(wb.Put("k2" + istr, "v2"));
+    ASSERT_OK(wb.Put("k3" + istr, "v3"));
+    auto s = db->Write(WriteOptions(), &wb);
+    if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+      // Consume one seq per key
+      exp_seq += 3;
+    } else {
+      // Consume one seq per batch
+      exp_seq++;
+      if (options.two_write_queues) {
+        // Consume one seq for commit
+        exp_seq++;
+      }
+    }
+    ASSERT_OK(s);
+    with_empty_commits++;
+  }
+
+  void TestTxn2(size_t index) {
+    // Commit without prepare. It should write to DB without a commit marker.
+    Transaction* txn =
+        db->BeginTransaction(WriteOptions(), TransactionOptions());
+    auto istr = std::to_string(index);
+    ASSERT_OK(txn->SetName("xid" + istr));
+    ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar")));
+    ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2")));
+    ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3")));
+    ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4")));
+    ASSERT_OK(txn->Commit());
+    if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+      // Consume one seq per key
+      exp_seq += 4;
+    } else if (txn_db_options.write_policy ==
+               TxnDBWritePolicy::WRITE_PREPARED) {
+      // Consume one seq per batch
+      exp_seq++;
+      if (options.two_write_queues) {
+        // Consume one seq for commit
+        exp_seq++;
+      }
+    } else {
+      // Flushed after each key, consume one seq per flushed batch
+      exp_seq += 4;
+      // WriteUnprepared implements CommitWithoutPrepareInternal by simply
+      // calling Prepare then Commit. Consume one seq for the prepare.
+      exp_seq++;
+    }
+    delete txn;
+    with_empty_commits++;
+  }
+
+  void TestTxn3(size_t index) {
+    // A full 2pc txn that also involves a commit marker.
+    Transaction* txn =
+        db->BeginTransaction(WriteOptions(), TransactionOptions());
+    auto istr = std::to_string(index);
+    ASSERT_OK(txn->SetName("xid" + istr));
+    ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar")));
+    ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2")));
+    ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3")));
+    ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4")));
+    ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5")));
+    expected_commits++;
+    ASSERT_OK(txn->Prepare());
+    commit_writes++;
+    ASSERT_OK(txn->Commit());
+    if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+      // Consume one seq per key
+      exp_seq += 5;
+    } else if (txn_db_options.write_policy ==
+               TxnDBWritePolicy::WRITE_PREPARED) {
+      // Consume one seq per batch
+      exp_seq++;
+      // Consume one seq per commit marker
+      exp_seq++;
+    } else {
+      // Flushed after each key, consume one seq per flushed batch
+      exp_seq += 5;
+      // Consume one seq per commit marker
+      exp_seq++;
+    }
+    delete txn;
+  }
+
+  void TestTxn4(size_t index) {
+    // A full 2pc txn that also involves a commit marker.
+    Transaction* txn =
+        db->BeginTransaction(WriteOptions(), TransactionOptions());
+    auto istr = std::to_string(index);
+    ASSERT_OK(txn->SetName("xid" + istr));
+    ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar")));
+    ASSERT_OK(txn->Put(Slice("foo2" + istr), Slice("bar2")));
+    ASSERT_OK(txn->Put(Slice("foo3" + istr), Slice("bar3")));
+    ASSERT_OK(txn->Put(Slice("foo4" + istr), Slice("bar4")));
+    ASSERT_OK(txn->Put(Slice("foo5" + istr), Slice("bar5")));
+    expected_commits++;
+    ASSERT_OK(txn->Prepare());
+    commit_writes++;
+    ASSERT_OK(txn->Rollback());
+    if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
+      // No seq is consumed for deleting the txn buffer
+      exp_seq += 0;
+    } else if (txn_db_options.write_policy ==
+               TxnDBWritePolicy::WRITE_PREPARED) {
+      // Consume one seq per batch
+      exp_seq++;
+      // Consume one seq per rollback batch
+      exp_seq++;
+      if (options.two_write_queues) {
+        // Consume one seq for rollback commit
+        exp_seq++;
+      }
+    } else {
+      // Flushed after each key, consume one seq per flushed batch
+      exp_seq += 5;
+      // Consume one seq per rollback batch
+      exp_seq++;
+      if (options.two_write_queues) {
+        // Consume one seq for rollback commit
+        exp_seq++;
+      }
+    }
+    delete txn;
+  }
+
+  // Test that we can change write policy after a clean shutdown (which would
+  // empty the WAL)
+  void CrossCompatibilityTest(TxnDBWritePolicy from_policy,
+                              TxnDBWritePolicy to_policy, bool empty_wal) {
+    TransactionOptions txn_options;
+    ReadOptions read_options;
+    WriteOptions write_options;
+    uint32_t index = 0;
+    Random rnd(1103);
+    options.write_buffer_size = 1024;  // To create more sst files
+    std::unordered_map<std::string, std::string> committed_kvs;
+    Transaction* txn;
+
+    txn_db_options.write_policy = from_policy;
+    if (txn_db_options.write_policy == WRITE_COMMITTED) {
+      options.unordered_write = false;
+    }
+    ASSERT_OK(ReOpen());
+
+    for (int i = 0; i < 1024; i++) {
+      auto istr = std::to_string(index);
+      auto k = Slice("foo-" + istr).ToString();
+      auto v = Slice("bar-" + istr).ToString();
+      // For test the duplicate keys
+      auto v2 = Slice("bar2-" + istr).ToString();
+      auto type = rnd.Uniform(4);
+      switch (type) {
+        case 0:
+          committed_kvs[k] = v;
+          ASSERT_OK(db->Put(write_options, k, v));
+          committed_kvs[k] = v2;
+          ASSERT_OK(db->Put(write_options, k, v2));
+          break;
+        case 1: {
+          WriteBatch wb;
+          committed_kvs[k] = v;
+          ASSERT_OK(wb.Put(k, v));
+          committed_kvs[k] = v2;
+          ASSERT_OK(wb.Put(k, v2));
+          ASSERT_OK(db->Write(write_options, &wb));
+
+        } break;
+        case 2:
+        case 3:
+          txn = db->BeginTransaction(write_options, txn_options);
+          ASSERT_OK(txn->SetName("xid" + istr));
+          committed_kvs[k] = v;
+          ASSERT_OK(txn->Put(k, v));
+          committed_kvs[k] = v2;
+          ASSERT_OK(txn->Put(k, v2));
+
+          if (type == 3) {
+            ASSERT_OK(txn->Prepare());
+          }
+          ASSERT_OK(txn->Commit());
+          delete txn;
+          break;
+        default:
+          FAIL();
+      }
+
+      index++;
+    }  // for i
+
+    txn_db_options.write_policy = to_policy;
+    if (txn_db_options.write_policy == WRITE_COMMITTED) {
+      options.unordered_write = false;
+    }
+    auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    // Before upgrade/downgrade the WAL must be emptied
+    if (empty_wal) {
+      ASSERT_OK(db_impl->TEST_FlushMemTable());
+    } else {
+      ASSERT_OK(db_impl->FlushWAL(true));
+    }
+    auto s = ReOpenNoDelete();
+    if (empty_wal) {
+      ASSERT_OK(s);
+    } else {
+      // Test that we can detect the WAL that is produced by an incompatible
+      // WritePolicy and fail fast before mis-interpreting the WAL.
+      ASSERT_TRUE(s.IsNotSupported());
+      return;
+    }
+    db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    // Check that WAL is empty
+    VectorLogPtr log_files;
+    ASSERT_OK(db_impl->GetSortedWalFiles(log_files));
+    ASSERT_EQ(0, log_files.size());
+
+    for (auto& kv : committed_kvs) {
+      std::string value;
+      s = db->Get(read_options, kv.first, &value);
+      if (s.IsNotFound()) {
+        printf("key = %s\n", kv.first.c_str());
+      }
+      ASSERT_OK(s);
+      if (kv.second != value) {
+        printf("key = %s\n", kv.first.c_str());
+      }
+      ASSERT_EQ(kv.second, value);
+    }
+  }
+};
+
+class TransactionTest
+    : public TransactionTestBase,
+      virtual public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+ public:
+  TransactionTest()
+      : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                            std::get<2>(GetParam()), std::get<3>(GetParam())){};
+};
+
+class TransactionStressTest : public TransactionTest {};
+
+class MySQLStyleTransactionTest
+    : public TransactionTestBase,
+      virtual public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, bool>> {
+ public:
+  MySQLStyleTransactionTest()
+      : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                            std::get<2>(GetParam()), std::get<3>(GetParam())),
+        with_slow_threads_(std::get<4>(GetParam())) {
+    if (with_slow_threads_ &&
+        (txn_db_options.write_policy == WRITE_PREPARED ||
+         txn_db_options.write_policy == WRITE_UNPREPARED)) {
+      // The corner case with slow threads involves the caches filling
+      // over which would not happen even with artifial delays. To help
+      // such cases to show up we lower the size of the cache-related data
+      // structures.
+      txn_db_options.wp_snapshot_cache_bits = 1;
+      txn_db_options.wp_commit_cache_bits = 10;
+      options.write_buffer_size = 1024;
+      EXPECT_OK(ReOpen());
+    }
+  };
+
+ protected:
+  // Also emulate slow threads by addin artiftial delays
+  const bool with_slow_threads_;
+};
+
+class WriteCommittedTxnWithTsTest
+    : public TransactionTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+ public:
+  WriteCommittedTxnWithTsTest()
+      : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                            WRITE_COMMITTED, kOrderedWrite) {}
+  ~WriteCommittedTxnWithTsTest() override {
+    for (auto* h : handles_) {
+      delete h;
+    }
+  }
+
+  Status GetFromDb(ReadOptions read_opts, ColumnFamilyHandle* column_family,
+                   const Slice& key, TxnTimestamp ts, std::string* value) {
+    std::string ts_buf;
+    PutFixed64(&ts_buf, ts);
+    Slice ts_slc = ts_buf;
+    read_opts.timestamp = &ts_slc;
+    assert(db);
+    return db->Get(read_opts, column_family, key, value);
+  }
+
+  Transaction* NewTxn(WriteOptions write_opts, TransactionOptions txn_opts) {
+    assert(db);
+    auto* txn = db->BeginTransaction(write_opts, txn_opts);
+    assert(txn);
+    const bool enable_indexing = std::get<2>(GetParam());
+    if (enable_indexing) {
+      txn->EnableIndexing();
+    } else {
+      txn->DisableIndexing();
+    }
+    return txn;
+  }
+
+ protected:
+  std::vector<ColumnFamilyHandle*> handles_{};
+};
+
+class TimestampedSnapshotWithTsSanityCheck
+    : public TransactionTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+ public:
+  explicit TimestampedSnapshotWithTsSanityCheck()
+      : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                            std::get<2>(GetParam()), std::get<3>(GetParam())) {}
+  ~TimestampedSnapshotWithTsSanityCheck() override {
+    for (auto* h : handles_) {
+      delete h;
+    }
+  }
+
+ protected:
+  std::vector<ColumnFamilyHandle*> handles_{};
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/transactions/transaction_util.cc b/src/rocksdb/utilities/transactions/transaction_util.cc
new file mode 100644
index 000000000..360edc8ec
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_util.cc
@@ -0,0 +1,206 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_util.h"
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status TransactionUtil::CheckKeyForConflicts(
+    DBImpl* db_impl, ColumnFamilyHandle* column_family, const std::string& key,
+    SequenceNumber snap_seq, const std::string* const read_ts, bool cache_only,
+    ReadCallback* snap_checker, SequenceNumber min_uncommitted) {
+  Status result;
+
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+  SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd);
+
+  if (sv == nullptr) {
+    result = Status::InvalidArgument("Could not access column family " +
+                                     cfh->GetName());
+  }
+
+  if (result.ok()) {
+    SequenceNumber earliest_seq =
+        db_impl->GetEarliestMemTableSequenceNumber(sv, true);
+
+    result = CheckKey(db_impl, sv, earliest_seq, snap_seq, key, read_ts,
+                      cache_only, snap_checker, min_uncommitted);
+
+    db_impl->ReturnAndCleanupSuperVersion(cfd, sv);
+  }
+
+  return result;
+}
+
+Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
+                                 SequenceNumber earliest_seq,
+                                 SequenceNumber snap_seq,
+                                 const std::string& key,
+                                 const std::string* const read_ts,
+                                 bool cache_only, ReadCallback* snap_checker,
+                                 SequenceNumber min_uncommitted) {
+  // When `min_uncommitted` is provided, keys are not always committed
+  // in sequence number order, and `snap_checker` is used to check whether
+  // specific sequence number is in the database is visible to the transaction.
+  // So `snap_checker` must be provided.
+  assert(min_uncommitted == kMaxSequenceNumber || snap_checker != nullptr);
+
+  Status result;
+  bool need_to_read_sst = false;
+
+  // Since it would be too slow to check the SST files, we will only use
+  // the memtables to check whether there have been any recent writes
+  // to this key after it was accessed in this transaction.  But if the
+  // Memtables do not contain a long enough history, we must fail the
+  // transaction.
+  if (earliest_seq == kMaxSequenceNumber) {
+    // The age of this memtable is unknown.  Cannot rely on it to check
+    // for recent writes.  This error shouldn't happen often in practice as
+    // the Memtable should have a valid earliest sequence number except in some
+    // corner cases (such as error cases during recovery).
+    need_to_read_sst = true;
+
+    if (cache_only) {
+      result = Status::TryAgain(
+          "Transaction could not check for conflicts as the MemTable does not "
+          "contain a long enough history to check write at SequenceNumber: ",
+          std::to_string(snap_seq));
+    }
+  } else if (snap_seq < earliest_seq || min_uncommitted <= earliest_seq) {
+    // Use <= for min_uncommitted since earliest_seq is actually the largest sec
+    // before this memtable was created
+    need_to_read_sst = true;
+
+    if (cache_only) {
+      // The age of this memtable is too new to use to check for recent
+      // writes.
+      char msg[300];
+      snprintf(msg, sizeof(msg),
+               "Transaction could not check for conflicts for operation at "
+               "SequenceNumber %" PRIu64
+               " as the MemTable only contains changes newer than "
+               "SequenceNumber %" PRIu64
+               ".  Increasing the value of the "
+               "max_write_buffer_size_to_maintain option could reduce the "
+               "frequency "
+               "of this error.",
+               snap_seq, earliest_seq);
+      result = Status::TryAgain(msg);
+    }
+  }
+
+  if (result.ok()) {
+    SequenceNumber seq = kMaxSequenceNumber;
+    std::string timestamp;
+    bool found_record_for_key = false;
+
+    // When min_uncommitted == kMaxSequenceNumber, writes are committed in
+    // sequence number order, so only keys larger than `snap_seq` can cause
+    // conflict.
+    // When min_uncommitted != kMaxSequenceNumber, keys lower than
+    // min_uncommitted will not triggered conflicts, while keys larger than
+    // min_uncommitted might create conflicts, so we need  to read them out
+    // from the DB, and call callback to snap_checker to determine. So only
+    // keys lower than min_uncommitted can be skipped.
+    SequenceNumber lower_bound_seq =
+        (min_uncommitted == kMaxSequenceNumber) ? snap_seq : min_uncommitted;
+    Status s = db_impl->GetLatestSequenceForKey(
+        sv, key, !need_to_read_sst, lower_bound_seq, &seq,
+        !read_ts ? nullptr : &timestamp, &found_record_for_key,
+        /*is_blob_index=*/nullptr);
+
+    if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+      result = s;
+    } else if (found_record_for_key) {
+      bool write_conflict = snap_checker == nullptr
+                                ? snap_seq < seq
+                                : !snap_checker->IsVisible(seq);
+      // Perform conflict checking based on timestamp if applicable.
+      if (!write_conflict && read_ts != nullptr) {
+        ColumnFamilyData* cfd = sv->cfd;
+        assert(cfd);
+        const Comparator* const ucmp = cfd->user_comparator();
+        assert(ucmp);
+        assert(read_ts->size() == ucmp->timestamp_size());
+        assert(read_ts->size() == timestamp.size());
+        // Write conflict if *ts < timestamp.
+        write_conflict = ucmp->CompareTimestamp(*read_ts, timestamp) < 0;
+      }
+      if (write_conflict) {
+        result = Status::Busy();
+      }
+    }
+  }
+
+  return result;
+}
+
+Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl,
+                                              const LockTracker& tracker,
+                                              bool cache_only) {
+  Status result;
+
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      tracker.GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+
+    SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf);
+    if (sv == nullptr) {
+      result = Status::InvalidArgument("Could not access column family " +
+                                       std::to_string(cf));
+      break;
+    }
+
+    SequenceNumber earliest_seq =
+        db_impl->GetEarliestMemTableSequenceNumber(sv, true);
+
+    // For each of the keys in this transaction, check to see if someone has
+    // written to this key since the start of the transaction.
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        tracker.GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      PointLockStatus status = tracker.GetPointLockStatus(cf, key);
+      const SequenceNumber key_seq = status.seq;
+
+      // TODO: support timestamp-based conflict checking.
+      // CheckKeysForConflicts() is currently used only by optimistic
+      // transactions.
+      result = CheckKey(db_impl, sv, earliest_seq, key_seq, key,
+                        /*read_ts=*/nullptr, cache_only);
+      if (!result.ok()) {
+        break;
+      }
+    }
+
+    db_impl->ReturnAndCleanupSuperVersion(cf, sv);
+
+    if (!result.ok()) {
+      break;
+    }
+  }
+
+  return result;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_util.h b/src/rocksdb/utilities/transactions/transaction_util.h
new file mode 100644
index 000000000..a349ba87a
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_util.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <unordered_map>
+
+#include "db/dbformat.h"
+#include "db/read_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "utilities/transactions/lock/lock_tracker.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+struct SuperVersion;
+class WriteBatchWithIndex;
+
+class TransactionUtil {
+ public:
+  // Verifies there have been no commits to this key in the db since this
+  // sequence number. If user-defined timestamp is enabled, then also check
+  // no commits to this key in the db since the given ts.
+  //
+  // If cache_only is true, then this function will not attempt to read any
+  // SST files.  This will make it more likely this function will
+  // return an error if it is unable to determine if there are any conflicts.
+  //
+  // See comment of CheckKey() for explanation of `snap_seq`, `ts`,
+  // `snap_checker` and `min_uncommitted`.
+  //
+  // Returns OK on success, BUSY if there is a conflicting write, or other error
+  // status for any unexpected errors.
+  static Status CheckKeyForConflicts(
+      DBImpl* db_impl, ColumnFamilyHandle* column_family,
+      const std::string& key, SequenceNumber snap_seq,
+      const std::string* const ts, bool cache_only,
+      ReadCallback* snap_checker = nullptr,
+      SequenceNumber min_uncommitted = kMaxSequenceNumber);
+
+  // For each key,SequenceNumber pair tracked by the LockTracker, this function
+  // will verify there have been no writes to the key in the db since that
+  // sequence number.
+  //
+  // Returns OK on success, BUSY if there is a conflicting write, or other error
+  // status for any unexpected errors.
+  //
+  // REQUIRED:
+  // This function should only be called on the write thread or if the
+  // mutex is held.
+  // tracker must support point lock.
+  static Status CheckKeysForConflicts(DBImpl* db_impl,
+                                      const LockTracker& tracker,
+                                      bool cache_only);
+
+ private:
+  // If `snap_checker` == nullptr, writes are always commited in sequence number
+  // order. All sequence number <= `snap_seq` will not conflict with any
+  // write, and all keys > `snap_seq` of `key` will trigger conflict.
+  // If `snap_checker` != nullptr, writes may not commit in sequence number
+  // order. In this case `min_uncommitted` is a lower bound.
+  //  seq < `min_uncommitted`: no conflict
+  //  seq > `snap_seq`: applicable to conflict
+  //  `min_uncommitted` <= seq <= `snap_seq`: call `snap_checker` to determine.
+  //
+  // If user-defined timestamp is enabled, a write conflict is detected if an
+  // operation for `key` with timestamp greater than `ts` exists.
+  static Status CheckKey(DBImpl* db_impl, SuperVersion* sv,
+                         SequenceNumber earliest_seq, SequenceNumber snap_seq,
+                         const std::string& key, const std::string* const ts,
+                         bool cache_only, ReadCallback* snap_checker = nullptr,
+                         SequenceNumber min_uncommitted = kMaxSequenceNumber);
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc b/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc
new file mode 100644
index 000000000..94b8201f7
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc
@@ -0,0 +1,588 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/merge_operators.h"
+#ifndef ROCKSDB_LITE
+
+#include "test_util/testutil.h"
+#include "utilities/transactions/transaction_test.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+INSTANTIATE_TEST_CASE_P(
+    DBAsBaseDB, WriteCommittedTxnWithTsTest,
+    ::testing::Values(std::make_tuple(false, /*two_write_queue=*/false,
+                                      /*enable_indexing=*/false),
+                      std::make_tuple(false, /*two_write_queue=*/true,
+                                      /*enable_indexing=*/false),
+                      std::make_tuple(false, /*two_write_queue=*/false,
+                                      /*enable_indexing=*/true),
+                      std::make_tuple(false, /*two_write_queue=*/true,
+                                      /*enable_indexing=*/true)));
+
+INSTANTIATE_TEST_CASE_P(
+    DBAsStackableDB, WriteCommittedTxnWithTsTest,
+    ::testing::Values(std::make_tuple(true, /*two_write_queue=*/false,
+                                      /*enable_indexing=*/false),
+                      std::make_tuple(true, /*two_write_queue=*/true,
+                                      /*enable_indexing=*/false),
+                      std::make_tuple(true, /*two_write_queue=*/false,
+                                      /*enable_indexing=*/true),
+                      std::make_tuple(true, /*two_write_queue=*/true,
+                                      /*enable_indexing=*/true)));
+
+TEST_P(WriteCommittedTxnWithTsTest, SanityChecks) {
+  ASSERT_OK(ReOpenNoDelete());
+
+  ColumnFamilyOptions cf_opts;
+  cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyHandle* cfh = nullptr;
+  assert(db);
+  ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+  delete cfh;
+  cfh = nullptr;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, cf_opts);
+  ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+  std::unique_ptr<Transaction> txn(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn);
+  ASSERT_OK(txn->Put(handles_[1], "foo", "value"));
+  ASSERT_TRUE(txn->Commit().IsInvalidArgument());
+
+  auto* pessimistic_txn =
+      static_cast_with_check<PessimisticTransaction>(txn.get());
+  ASSERT_TRUE(
+      pessimistic_txn->CommitBatch(/*batch=*/nullptr).IsInvalidArgument());
+
+  {
+    WriteBatchWithIndex* wbwi = txn->GetWriteBatch();
+    assert(wbwi);
+    WriteBatch* wb = wbwi->GetWriteBatch();
+    assert(wb);
+    // Write a key to the batch for nonexisting cf.
+    ASSERT_OK(WriteBatchInternal::Put(wb, /*column_family_id=*/10, /*key=*/"",
+                                      /*value=*/""));
+  }
+
+  ASSERT_OK(txn->SetCommitTimestamp(20));
+
+  ASSERT_TRUE(txn->Commit().IsInvalidArgument());
+  txn.reset();
+
+  std::unique_ptr<Transaction> txn1(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn1);
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn1->Put(handles_[1], "foo", "value"));
+  {
+    WriteBatchWithIndex* wbwi = txn1->GetWriteBatch();
+    assert(wbwi);
+    WriteBatch* wb = wbwi->GetWriteBatch();
+    assert(wb);
+    // Write a key to the batch for non-existing cf.
+    ASSERT_OK(WriteBatchInternal::Put(wb, /*column_family_id=*/10, /*key=*/"",
+                                      /*value=*/""));
+  }
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn1->SetCommitTimestamp(21));
+  ASSERT_TRUE(txn1->Commit().IsInvalidArgument());
+  txn1.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) {
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  ASSERT_OK(ReOpenNoDelete());
+
+  ColumnFamilyOptions cf_opts;
+  cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyHandle* cfh = nullptr;
+  assert(db);
+  ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+  delete cfh;
+  cfh = nullptr;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, cf_opts);
+  ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+  std::unique_ptr<Transaction> txn0(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn0);
+  ASSERT_OK(txn0->Put(handles_[1], "foo", "value"));
+  ASSERT_OK(txn0->SetName("txn0"));
+  ASSERT_OK(txn0->Prepare());
+  ASSERT_TRUE(txn0->Commit().IsInvalidArgument());
+  txn0.reset();
+
+  std::unique_ptr<Transaction> txn1(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn1);
+  ASSERT_OK(txn1->Put(handles_[1], "foo", "value1"));
+  {
+    std::string buf;
+    PutFixed64(&buf, 23);
+    ASSERT_OK(txn1->Put("id", buf));
+    ASSERT_OK(txn1->Merge("id", buf));
+  }
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn1->SetCommitTimestamp(/*ts=*/23));
+  ASSERT_OK(txn1->Commit());
+  txn1.reset();
+
+  {
+    std::string value;
+    const Status s =
+        GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/23, &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("value1", value);
+  }
+
+  {
+    std::string value;
+    const Status s = db->Get(ReadOptions(), handles_[0], "id", &value);
+    ASSERT_OK(s);
+    uint64_t ival = 0;
+    Slice value_slc = value;
+    bool result = GetFixed64(&value_slc, &ival);
+    assert(result);
+    ASSERT_EQ(46, ival);
+  }
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, RecoverFromWal) {
+  ASSERT_OK(ReOpenNoDelete());
+
+  ColumnFamilyOptions cf_opts;
+  cf_opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyHandle* cfh = nullptr;
+  assert(db);
+  ASSERT_OK(db->CreateColumnFamily(cf_opts, test_cf_name, &cfh));
+  delete cfh;
+  cfh = nullptr;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, cf_opts);
+  options.avoid_flush_during_shutdown = true;
+  ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+  std::unique_ptr<Transaction> txn0(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn0);
+  ASSERT_OK(txn0->Put(handles_[1], "foo", "foo_value"));
+  ASSERT_OK(txn0->SetName("txn0"));
+  ASSERT_OK(txn0->Prepare());
+
+  WriteOptions write_opts;
+  write_opts.sync = true;
+  std::unique_ptr<Transaction> txn1(NewTxn(write_opts, TransactionOptions()));
+  assert(txn1);
+  ASSERT_OK(txn1->Put("bar", "bar_value_1"));
+  ASSERT_OK(txn1->Put(handles_[1], "bar", "bar_value_1"));
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn1->SetCommitTimestamp(/*ts=*/23));
+  ASSERT_OK(txn1->Commit());
+  txn1.reset();
+
+  std::unique_ptr<Transaction> txn2(NewTxn(write_opts, TransactionOptions()));
+  assert(txn2);
+  ASSERT_OK(txn2->Put("key1", "value_3"));
+  ASSERT_OK(txn2->Put(handles_[1], "key1", "value_3"));
+  ASSERT_OK(txn2->SetCommitTimestamp(/*ts=*/24));
+  ASSERT_OK(txn2->Commit());
+  txn2.reset();
+
+  txn0.reset();
+
+  ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+  {
+    std::string value;
+    Status s = GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/23, &value);
+    ASSERT_TRUE(s.IsNotFound());
+
+    s = db->Get(ReadOptions(), handles_[0], "bar", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("bar_value_1", value);
+
+    value.clear();
+    s = GetFromDb(ReadOptions(), handles_[1], "bar", /*ts=*/23, &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("bar_value_1", value);
+
+    s = GetFromDb(ReadOptions(), handles_[1], "key1", /*ts=*/23, &value);
+    ASSERT_TRUE(s.IsNotFound());
+
+    s = db->Get(ReadOptions(), handles_[0], "key1", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("value_3", value);
+
+    s = GetFromDb(ReadOptions(), handles_[1], "key1", /*ts=*/24, &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("value_3", value);
+  }
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, TransactionDbLevelApi) {
+  ASSERT_OK(ReOpenNoDelete());
+
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyHandle* cfh = nullptr;
+  assert(db);
+  ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+  delete cfh;
+  cfh = nullptr;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, cf_options);
+
+  ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+  std::string key_str = "tes_key";
+  std::string ts_str;
+  std::string value_str = "test_value";
+  PutFixed64(&ts_str, 100);
+  Slice value = value_str;
+
+  assert(db);
+  ASSERT_TRUE(
+      db->Put(WriteOptions(), handles_[1], "foo", "bar").IsNotSupported());
+  ASSERT_TRUE(db->Delete(WriteOptions(), handles_[1], "foo").IsNotSupported());
+  ASSERT_TRUE(
+      db->SingleDelete(WriteOptions(), handles_[1], "foo").IsNotSupported());
+  ASSERT_TRUE(
+      db->Merge(WriteOptions(), handles_[1], "foo", "+1").IsNotSupported());
+  WriteBatch wb1(/*reserved_bytes=*/0, /*max_bytes=*/0,
+                 /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0);
+  ASSERT_OK(wb1.Put(handles_[1], key_str, ts_str, value));
+  ASSERT_TRUE(db->Write(WriteOptions(), &wb1).IsNotSupported());
+  ASSERT_TRUE(db->Write(WriteOptions(), TransactionDBWriteOptimizations(), &wb1)
+                  .IsNotSupported());
+  auto* pessimistic_txn_db =
+      static_cast_with_check<PessimisticTransactionDB>(db);
+  assert(pessimistic_txn_db);
+  ASSERT_TRUE(
+      pessimistic_txn_db->WriteWithConcurrencyControl(WriteOptions(), &wb1)
+          .IsNotSupported());
+
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "bar", "value"));
+  ASSERT_OK(db->Delete(WriteOptions(), "bar"));
+  ASSERT_OK(db->SingleDelete(WriteOptions(), "foo"));
+  ASSERT_OK(db->Put(WriteOptions(), "key", "value"));
+  ASSERT_OK(db->Merge(WriteOptions(), "key", "_more"));
+  WriteBatch wb2(/*reserved_bytes=*/0, /*max_bytes=*/0,
+                 /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0);
+  ASSERT_OK(wb2.Put(key_str, value));
+  ASSERT_OK(db->Write(WriteOptions(), &wb2));
+  ASSERT_OK(db->Write(WriteOptions(), TransactionDBWriteOptimizations(), &wb2));
+  ASSERT_OK(
+      pessimistic_txn_db->WriteWithConcurrencyControl(WriteOptions(), &wb2));
+
+  std::unique_ptr<Transaction> txn(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn);
+
+  WriteBatch wb3(/*reserved_bytes=*/0, /*max_bytes=*/0,
+                 /*protection_bytes_per_key=*/0, /*default_cf_ts_sz=*/0);
+
+  ASSERT_OK(wb3.Put(handles_[1], "key", "value"));
+  auto* pessimistic_txn =
+      static_cast_with_check<PessimisticTransaction>(txn.get());
+  assert(pessimistic_txn);
+  ASSERT_TRUE(pessimistic_txn->CommitBatch(&wb3).IsNotSupported());
+
+  txn.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, Merge) {
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  ASSERT_OK(ReOpenNoDelete());
+
+  ColumnFamilyOptions cf_options;
+  cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyHandle* cfh = nullptr;
+  assert(db);
+  ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+  delete cfh;
+  cfh = nullptr;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
+  options.avoid_flush_during_shutdown = true;
+
+  ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+  std::unique_ptr<Transaction> txn(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn);
+  ASSERT_OK(txn->Put(handles_[1], "foo", "bar"));
+  ASSERT_OK(txn->Merge(handles_[1], "foo", "1"));
+  ASSERT_OK(txn->SetCommitTimestamp(24));
+  ASSERT_OK(txn->Commit());
+  txn.reset();
+  {
+    std::string value;
+    const Status s =
+        GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/24, &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("bar,1", value);
+  }
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, GetForUpdate) {
+  ASSERT_OK(ReOpenNoDelete());
+
+  ColumnFamilyOptions cf_options;
+  cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyHandle* cfh = nullptr;
+  assert(db);
+  ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+  delete cfh;
+  cfh = nullptr;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
+  options.avoid_flush_during_shutdown = true;
+
+  ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+  std::unique_ptr<Transaction> txn0(
+      NewTxn(WriteOptions(), TransactionOptions()));
+
+  std::unique_ptr<Transaction> txn1(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  ASSERT_OK(txn1->Put(handles_[1], "key", "value1"));
+  ASSERT_OK(txn1->SetCommitTimestamp(24));
+  ASSERT_OK(txn1->Commit());
+  txn1.reset();
+
+  std::string value;
+  ASSERT_OK(txn0->SetReadTimestampForValidation(23));
+  ASSERT_TRUE(
+      txn0->GetForUpdate(ReadOptions(), handles_[1], "key", &value).IsBusy());
+  ASSERT_OK(txn0->Rollback());
+  txn0.reset();
+
+  std::unique_ptr<Transaction> txn2(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  ASSERT_OK(txn2->SetReadTimestampForValidation(25));
+  ASSERT_OK(txn2->GetForUpdate(ReadOptions(), handles_[1], "key", &value));
+  ASSERT_OK(txn2->SetCommitTimestamp(26));
+  ASSERT_OK(txn2->Commit());
+  txn2.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, BlindWrite) {
+  ASSERT_OK(ReOpenNoDelete());
+
+  ColumnFamilyOptions cf_options;
+  cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyHandle* cfh = nullptr;
+  assert(db);
+  ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+  delete cfh;
+  cfh = nullptr;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
+  options.avoid_flush_during_shutdown = true;
+  ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+  std::unique_ptr<Transaction> txn0(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn0);
+  std::unique_ptr<Transaction> txn1(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn1);
+
+  {
+    std::string value;
+    ASSERT_OK(txn0->SetReadTimestampForValidation(100));
+    // Lock "key".
+    ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], "key", &value)
+                    .IsNotFound());
+  }
+
+  ASSERT_OK(txn0->Put(handles_[1], "key", "value0"));
+  ASSERT_OK(txn0->SetCommitTimestamp(101));
+  ASSERT_OK(txn0->Commit());
+
+  ASSERT_OK(txn1->Put(handles_[1], "key", "value1"));
+  // In reality, caller needs to ensure commit_ts of txn1 is greater than the
+  // commit_ts of txn0, which is true for lock-based concurrency control.
+  ASSERT_OK(txn1->SetCommitTimestamp(102));
+  ASSERT_OK(txn1->Commit());
+
+  txn0.reset();
+  txn1.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, RefineReadTimestamp) {
+  ASSERT_OK(ReOpenNoDelete());
+
+  ColumnFamilyOptions cf_options;
+  cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  const std::string test_cf_name = "test_cf";
+  ColumnFamilyHandle* cfh = nullptr;
+  assert(db);
+  ASSERT_OK(db->CreateColumnFamily(cf_options, test_cf_name, &cfh));
+  delete cfh;
+  cfh = nullptr;
+
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+  cf_descs.emplace_back(test_cf_name, Options(DBOptions(), cf_options));
+  options.avoid_flush_during_shutdown = true;
+
+  ASSERT_OK(ReOpenNoDelete(cf_descs, &handles_));
+
+  std::unique_ptr<Transaction> txn0(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn0);
+
+  std::unique_ptr<Transaction> txn1(
+      NewTxn(WriteOptions(), TransactionOptions()));
+  assert(txn1);
+
+  {
+    ASSERT_OK(txn0->SetReadTimestampForValidation(100));
+    // Lock "key0", "key1", ..., "key4".
+    for (int i = 0; i < 5; ++i) {
+      std::string value;
+      ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1],
+                                     "key" + std::to_string(i), &value)
+                      .IsNotFound());
+    }
+  }
+  ASSERT_OK(txn1->Put(handles_[1], "key5", "value5_0"));
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn1->SetCommitTimestamp(101));
+  ASSERT_OK(txn1->Commit());
+  txn1.reset();
+
+  {
+    std::string value;
+    ASSERT_TRUE(txn0->GetForUpdate(ReadOptions(), handles_[1], "key5", &value)
+                    .IsBusy());
+    ASSERT_OK(txn0->SetReadTimestampForValidation(102));
+    ASSERT_OK(txn0->GetForUpdate(ReadOptions(), handles_[1], "key5", &value));
+    ASSERT_EQ("value5_0", value);
+  }
+
+  for (int i = 0; i < 6; ++i) {
+    ASSERT_OK(txn0->Put(handles_[1], "key" + std::to_string(i),
+                        "value" + std::to_string(i)));
+  }
+  ASSERT_OK(txn0->SetName("txn0"));
+  ASSERT_OK(txn0->Prepare());
+  ASSERT_OK(txn0->SetCommitTimestamp(103));
+  ASSERT_OK(txn0->Commit());
+  txn0.reset();
+}
+
+TEST_P(WriteCommittedTxnWithTsTest, CheckKeysForConflicts) {
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  ASSERT_OK(ReOpen());
+
+  std::unique_ptr<Transaction> txn1(
+      db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn1);
+
+  std::unique_ptr<Transaction> txn2(
+      db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn2);
+  ASSERT_OK(txn2->Put("foo", "v0"));
+  ASSERT_OK(txn2->SetCommitTimestamp(10));
+  ASSERT_OK(txn2->Commit());
+  txn2.reset();
+
+  // txn1 takes a snapshot after txn2 commits. The writes of txn2 have
+  // a smaller seqno than txn1's snapshot, thus should not affect conflict
+  // checking.
+  txn1->SetSnapshot();
+
+  std::unique_ptr<Transaction> txn3(
+      db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  assert(txn3);
+  ASSERT_OK(txn3->SetReadTimestampForValidation(20));
+  std::string dontcare;
+  ASSERT_OK(txn3->GetForUpdate(ReadOptions(), "foo", &dontcare));
+  ASSERT_OK(txn3->SingleDelete("foo"));
+  ASSERT_OK(txn3->SetName("txn3"));
+  ASSERT_OK(txn3->Prepare());
+  ASSERT_OK(txn3->SetCommitTimestamp(30));
+  // txn3 reads at ts=20 > txn2's commit timestamp, and commits at ts=30.
+  // txn3 can commit successfully, leaving a tombstone with ts=30.
+  ASSERT_OK(txn3->Commit());
+  txn3.reset();
+
+  bool called = false;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::GetLatestSequenceForKey:mem", [&](void* arg) {
+        auto* const ts_ptr = reinterpret_cast<std::string*>(arg);
+        assert(ts_ptr);
+        Slice ts_slc = *ts_ptr;
+        uint64_t last_ts = 0;
+        ASSERT_TRUE(GetFixed64(&ts_slc, &last_ts));
+        ASSERT_EQ(30, last_ts);
+        called = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // txn1's read timestamp is 25 < 30 (commit timestamp of txn3). Therefore,
+  // the tombstone written by txn3 causes the conflict checking to fail.
+  ASSERT_OK(txn1->SetReadTimestampForValidation(25));
+  ASSERT_TRUE(txn1->GetForUpdate(ReadOptions(), "foo", &dontcare).IsBusy());
+  ASSERT_TRUE(called);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as Transactions not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc b/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc
new file mode 100644
index 000000000..86a9511a4
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_transaction_test.cc
@@ -0,0 +1,4078 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <string>
+#include <thread>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/debug.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "table/mock_table.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "test_util/transaction_test_util.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_test.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+using CommitEntry = WritePreparedTxnDB::CommitEntry;
+using CommitEntry64b = WritePreparedTxnDB::CommitEntry64b;
+using CommitEntry64bFormat = WritePreparedTxnDB::CommitEntry64bFormat;
+
+TEST(PreparedHeap, BasicsTest) {
+  WritePreparedTxnDB::PreparedHeap heap;
+  {
+    MutexLock ml(heap.push_pop_mutex());
+    heap.push(14l);
+    // Test with one element
+    ASSERT_EQ(14l, heap.top());
+    heap.push(24l);
+    heap.push(34l);
+    // Test that old min is still on top
+    ASSERT_EQ(14l, heap.top());
+    heap.push(44l);
+    heap.push(54l);
+    heap.push(64l);
+    heap.push(74l);
+    heap.push(84l);
+  }
+  // Test that old min is still on top
+  ASSERT_EQ(14l, heap.top());
+  heap.erase(24l);
+  // Test that old min is still on top
+  ASSERT_EQ(14l, heap.top());
+  heap.erase(14l);
+  // Test that the new comes to the top after multiple erase
+  ASSERT_EQ(34l, heap.top());
+  heap.erase(34l);
+  // Test that the new comes to the top after single erase
+  ASSERT_EQ(44l, heap.top());
+  heap.erase(54l);
+  ASSERT_EQ(44l, heap.top());
+  heap.pop();  // pop 44l
+  // Test that the erased items are ignored after pop
+  ASSERT_EQ(64l, heap.top());
+  heap.erase(44l);
+  // Test that erasing an already popped item would work
+  ASSERT_EQ(64l, heap.top());
+  heap.erase(84l);
+  ASSERT_EQ(64l, heap.top());
+  {
+    MutexLock ml(heap.push_pop_mutex());
+    heap.push(85l);
+    heap.push(86l);
+    heap.push(87l);
+    heap.push(88l);
+    heap.push(89l);
+  }
+  heap.erase(87l);
+  heap.erase(85l);
+  heap.erase(89l);
+  heap.erase(86l);
+  heap.erase(88l);
+  // Test top remains the same after a random order of many erases
+  ASSERT_EQ(64l, heap.top());
+  heap.pop();
+  // Test that pop works with a series of random pending erases
+  ASSERT_EQ(74l, heap.top());
+  ASSERT_FALSE(heap.empty());
+  heap.pop();
+  // Test that empty works
+  ASSERT_TRUE(heap.empty());
+}
+
+// This is a scenario reconstructed from a buggy trace. Test that the bug does
+// not resurface again.
+TEST(PreparedHeap, EmptyAtTheEnd) {
+  WritePreparedTxnDB::PreparedHeap heap;
+  {
+    MutexLock ml(heap.push_pop_mutex());
+    heap.push(40l);
+  }
+  ASSERT_EQ(40l, heap.top());
+  // Although not a recommended scenario, we must be resilient against erase
+  // without a prior push.
+  heap.erase(50l);
+  ASSERT_EQ(40l, heap.top());
+  {
+    MutexLock ml(heap.push_pop_mutex());
+    heap.push(60l);
+  }
+  ASSERT_EQ(40l, heap.top());
+
+  heap.erase(60l);
+  ASSERT_EQ(40l, heap.top());
+  heap.erase(40l);
+  ASSERT_TRUE(heap.empty());
+
+  {
+    MutexLock ml(heap.push_pop_mutex());
+    heap.push(40l);
+  }
+  ASSERT_EQ(40l, heap.top());
+  heap.erase(50l);
+  ASSERT_EQ(40l, heap.top());
+  {
+    MutexLock ml(heap.push_pop_mutex());
+    heap.push(60l);
+  }
+  ASSERT_EQ(40l, heap.top());
+
+  heap.erase(40l);
+  // Test that the erase has not emptied the heap (we had a bug doing that)
+  ASSERT_FALSE(heap.empty());
+  ASSERT_EQ(60l, heap.top());
+  heap.erase(60l);
+  ASSERT_TRUE(heap.empty());
+}
+
+// Generate random order of PreparedHeap access and test that the heap will be
+// successfully emptied at the end.
+TEST(PreparedHeap, Concurrent) {
+  const size_t t_cnt = 10;
+  ROCKSDB_NAMESPACE::port::Thread t[t_cnt + 1];
+  WritePreparedTxnDB::PreparedHeap heap;
+  port::RWMutex prepared_mutex;
+  std::atomic<size_t> last;
+
+  for (size_t n = 0; n < 100; n++) {
+    last = 0;
+    t[0] = ROCKSDB_NAMESPACE::port::Thread([&]() {
+      Random rnd(1103);
+      for (size_t seq = 1; seq <= t_cnt; seq++) {
+        // This is not recommended usage but we should be resilient against it.
+        bool skip_push = rnd.OneIn(5);
+        if (!skip_push) {
+          MutexLock ml(heap.push_pop_mutex());
+          std::this_thread::yield();
+          heap.push(seq);
+          last.store(seq);
+        }
+      }
+    });
+    for (size_t i = 1; i <= t_cnt; i++) {
+      t[i] =
+          ROCKSDB_NAMESPACE::port::Thread([&heap, &prepared_mutex, &last, i]() {
+            auto seq = i;
+            do {
+              std::this_thread::yield();
+            } while (last.load() < seq);
+            WriteLock wl(&prepared_mutex);
+            heap.erase(seq);
+          });
+    }
+    for (size_t i = 0; i <= t_cnt; i++) {
+      t[i].join();
+    }
+    ASSERT_TRUE(heap.empty());
+  }
+}
+
+// Test that WriteBatchWithIndex correctly counts the number of sub-batches
+TEST(WriteBatchWithIndex, SubBatchCnt) {
+  ColumnFamilyOptions cf_options;
+  std::string cf_name = "two";
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  const std::string dbname = test::PerThreadDBPath("transaction_testdb");
+  EXPECT_OK(DestroyDB(dbname, options));
+  ASSERT_OK(DB::Open(options, dbname, &db));
+  ColumnFamilyHandle* cf_handle = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
+  WriteOptions write_options;
+  size_t batch_cnt = 1;
+  size_t save_points = 0;
+  std::vector<size_t> batch_cnt_at;
+  WriteBatchWithIndex batch(db->DefaultColumnFamily()->GetComparator(), 0, true,
+                            0);
+  ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+  batch_cnt_at.push_back(batch_cnt);
+  batch.SetSavePoint();
+  save_points++;
+  ASSERT_OK(batch.Put(Slice("key"), Slice("value")));
+  ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+  batch_cnt_at.push_back(batch_cnt);
+  batch.SetSavePoint();
+  save_points++;
+  ASSERT_OK(batch.Put(Slice("key2"), Slice("value2")));
+  ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+  // duplicate the keys
+  batch_cnt_at.push_back(batch_cnt);
+  batch.SetSavePoint();
+  save_points++;
+  ASSERT_OK(batch.Put(Slice("key"), Slice("value3")));
+  batch_cnt++;
+  ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+  // duplicate the 2nd key. It should not be counted duplicate since a
+  // sub-patch is cut after the last duplicate.
+  batch_cnt_at.push_back(batch_cnt);
+  batch.SetSavePoint();
+  save_points++;
+  ASSERT_OK(batch.Put(Slice("key2"), Slice("value4")));
+  ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+  // duplicate the keys but in a different cf. It should not be counted as
+  // duplicate keys
+  batch_cnt_at.push_back(batch_cnt);
+  batch.SetSavePoint();
+  save_points++;
+  ASSERT_OK(batch.Put(cf_handle, Slice("key"), Slice("value5")));
+  ASSERT_EQ(batch_cnt, batch.SubBatchCnt());
+
+  // Test that the number of sub-batches matches what we count with
+  // SubBatchCounter
+  std::map<uint32_t, const Comparator*> comparators;
+  comparators[0] = db->DefaultColumnFamily()->GetComparator();
+  comparators[cf_handle->GetID()] = cf_handle->GetComparator();
+  SubBatchCounter counter(comparators);
+  ASSERT_OK(batch.GetWriteBatch()->Iterate(&counter));
+  ASSERT_EQ(batch_cnt, counter.BatchCount());
+
+  // Test that RollbackToSavePoint will properly resets the number of
+  // sub-batches
+  for (size_t i = save_points; i > 0; i--) {
+    ASSERT_OK(batch.RollbackToSavePoint());
+    ASSERT_EQ(batch_cnt_at[i - 1], batch.SubBatchCnt());
+  }
+
+  // Test the count is right with random batches
+  {
+    const size_t TOTAL_KEYS = 20;  // 20 ~= 10 to cause a few randoms
+    Random rnd(1131);
+    std::string keys[TOTAL_KEYS];
+    for (size_t k = 0; k < TOTAL_KEYS; k++) {
+      int len = static_cast<int>(rnd.Uniform(50));
+      keys[k] = test::RandomKey(&rnd, len);
+    }
+    for (size_t i = 0; i < 1000; i++) {  // 1000 random batches
+      WriteBatchWithIndex rndbatch(db->DefaultColumnFamily()->GetComparator(),
+                                   0, true, 0);
+      for (size_t k = 0; k < 10; k++) {  // 10 key per batch
+        size_t ki = static_cast<size_t>(rnd.Uniform(TOTAL_KEYS));
+        Slice key = Slice(keys[ki]);
+        std::string tmp = rnd.RandomString(16);
+        Slice value = Slice(tmp);
+        ASSERT_OK(rndbatch.Put(key, value));
+      }
+      SubBatchCounter batch_counter(comparators);
+      ASSERT_OK(rndbatch.GetWriteBatch()->Iterate(&batch_counter));
+      ASSERT_EQ(rndbatch.SubBatchCnt(), batch_counter.BatchCount());
+    }
+  }
+
+  delete cf_handle;
+  delete db;
+}
+
+TEST(CommitEntry64b, BasicTest) {
+  const size_t INDEX_BITS = static_cast<size_t>(21);
+  const size_t INDEX_SIZE = static_cast<size_t>(1ull << INDEX_BITS);
+  const CommitEntry64bFormat FORMAT(static_cast<size_t>(INDEX_BITS));
+
+  // zero-initialized CommitEntry64b should indicate an empty entry
+  CommitEntry64b empty_entry64b;
+  uint64_t empty_index = 11ul;
+  CommitEntry empty_entry;
+  bool ok = empty_entry64b.Parse(empty_index, &empty_entry, FORMAT);
+  ASSERT_FALSE(ok);
+
+  // the zero entry is reserved for un-initialized entries
+  const size_t MAX_COMMIT = (1 << FORMAT.COMMIT_BITS) - 1 - 1;
+  // Samples over the numbers that are covered by that many index bits
+  std::array<uint64_t, 4> is = {{0, 1, INDEX_SIZE / 2 + 1, INDEX_SIZE - 1}};
+  // Samples over the numbers that are covered by that many commit bits
+  std::array<uint64_t, 4> ds = {{0, 1, MAX_COMMIT / 2 + 1, MAX_COMMIT}};
+  // Iterate over prepare numbers that have i) cover all bits of a sequence
+  // number, and ii) include some bits that fall into the range of index or
+  // commit bits
+  for (uint64_t base = 1; base < kMaxSequenceNumber; base *= 2) {
+    for (uint64_t i : is) {
+      for (uint64_t d : ds) {
+        uint64_t p = base + i + d;
+        for (uint64_t c : {p, p + d / 2, p + d}) {
+          uint64_t index = p % INDEX_SIZE;
+          CommitEntry before(p, c), after;
+          CommitEntry64b entry64b(before, FORMAT);
+          ok = entry64b.Parse(index, &after, FORMAT);
+          ASSERT_TRUE(ok);
+          if (!(before == after)) {
+            printf("base %" PRIu64 " i %" PRIu64 " d %" PRIu64 " p %" PRIu64
+                   " c %" PRIu64 " index %" PRIu64 "\n",
+                   base, i, d, p, c, index);
+          }
+          ASSERT_EQ(before, after);
+        }
+      }
+    }
+  }
+}
+
+class WritePreparedTxnDBMock : public WritePreparedTxnDB {
+ public:
+  WritePreparedTxnDBMock(DBImpl* db_impl, TransactionDBOptions& opt)
+      : WritePreparedTxnDB(db_impl, opt) {}
+  void SetDBSnapshots(const std::vector<SequenceNumber>& snapshots) {
+    snapshots_ = snapshots;
+  }
+  void TakeSnapshot(SequenceNumber seq) { snapshots_.push_back(seq); }
+
+ protected:
+  const std::vector<SequenceNumber> GetSnapshotListFromDB(
+      SequenceNumber /* unused */) override {
+    return snapshots_;
+  }
+
+ private:
+  std::vector<SequenceNumber> snapshots_;
+};
+
+class WritePreparedTransactionTestBase : public TransactionTestBase {
+ public:
+  WritePreparedTransactionTestBase(bool use_stackable_db, bool two_write_queue,
+                                   TxnDBWritePolicy write_policy,
+                                   WriteOrdering write_ordering)
+      : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
+                            write_ordering){};
+
+ protected:
+  void UpdateTransactionDBOptions(size_t snapshot_cache_bits,
+                                  size_t commit_cache_bits) {
+    txn_db_options.wp_snapshot_cache_bits = snapshot_cache_bits;
+    txn_db_options.wp_commit_cache_bits = commit_cache_bits;
+  }
+  void UpdateTransactionDBOptions(size_t snapshot_cache_bits) {
+    txn_db_options.wp_snapshot_cache_bits = snapshot_cache_bits;
+  }
+  // If expect_update is set, check if it actually updated old_commit_map_. If
+  // it did not and yet suggested not to check the next snapshot, do the
+  // opposite to check if it was not a bad suggestion.
+  void MaybeUpdateOldCommitMapTestWithNext(uint64_t prepare, uint64_t commit,
+                                           uint64_t snapshot,
+                                           uint64_t next_snapshot,
+                                           bool expect_update) {
+    WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+    // reset old_commit_map_empty_ so that its value indicate whether
+    // old_commit_map_ was updated
+    wp_db->old_commit_map_empty_ = true;
+    bool check_next = wp_db->MaybeUpdateOldCommitMap(prepare, commit, snapshot,
+                                                     snapshot < next_snapshot);
+    if (expect_update == wp_db->old_commit_map_empty_) {
+      printf("prepare: %" PRIu64 " commit: %" PRIu64 " snapshot: %" PRIu64
+             " next: %" PRIu64 "\n",
+             prepare, commit, snapshot, next_snapshot);
+    }
+    EXPECT_EQ(!expect_update, wp_db->old_commit_map_empty_);
+    if (!check_next && wp_db->old_commit_map_empty_) {
+      // do the opposite to make sure it was not a bad suggestion
+      const bool dont_care_bool = true;
+      wp_db->MaybeUpdateOldCommitMap(prepare, commit, next_snapshot,
+                                     dont_care_bool);
+      if (!wp_db->old_commit_map_empty_) {
+        printf("prepare: %" PRIu64 " commit: %" PRIu64 " snapshot: %" PRIu64
+               " next: %" PRIu64 "\n",
+               prepare, commit, snapshot, next_snapshot);
+      }
+      EXPECT_TRUE(wp_db->old_commit_map_empty_);
+    }
+  }
+
+  // Test that a CheckAgainstSnapshots thread reading old_snapshots will not
+  // miss a snapshot because of a concurrent update by UpdateSnapshots that is
+  // writing new_snapshots. Both threads are broken at two points. The sync
+  // points to enforce them are specified by a1, a2, b1, and b2. CommitEntry
+  // entry is expected to be vital for one of the snapshots that is common
+  // between the old and new list of snapshots.
+  void SnapshotConcurrentAccessTestInternal(
+      WritePreparedTxnDB* wp_db,
+      const std::vector<SequenceNumber>& old_snapshots,
+      const std::vector<SequenceNumber>& new_snapshots, CommitEntry& entry,
+      SequenceNumber& version, size_t a1, size_t a2, size_t b1, size_t b2) {
+    // First reset the snapshot list
+    const std::vector<SequenceNumber> empty_snapshots;
+    wp_db->old_commit_map_empty_ = true;
+    wp_db->UpdateSnapshots(empty_snapshots, ++version);
+    // Then initialize it with the old_snapshots
+    wp_db->UpdateSnapshots(old_snapshots, ++version);
+
+    // Starting from the first thread, cut each thread at two points
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(a1),
+         "WritePreparedTxnDB::UpdateSnapshots:s:start"},
+        {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(b1),
+         "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(a1)},
+        {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(a2),
+         "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(b1)},
+        {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(b2),
+         "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(a2)},
+        {"WritePreparedTxnDB::CheckAgainstSnapshots:p:end",
+         "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(b2)},
+    });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    {
+      ASSERT_TRUE(wp_db->old_commit_map_empty_);
+      ROCKSDB_NAMESPACE::port::Thread t1(
+          [&]() { wp_db->UpdateSnapshots(new_snapshots, version); });
+      wp_db->CheckAgainstSnapshots(entry);
+      t1.join();
+      ASSERT_FALSE(wp_db->old_commit_map_empty_);
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+    wp_db->old_commit_map_empty_ = true;
+    wp_db->UpdateSnapshots(empty_snapshots, ++version);
+    wp_db->UpdateSnapshots(old_snapshots, ++version);
+    // Starting from the second thread, cut each thread at two points
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(a1),
+         "WritePreparedTxnDB::CheckAgainstSnapshots:s:start"},
+        {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(b1),
+         "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(a1)},
+        {"WritePreparedTxnDB::UpdateSnapshots:p:" + std::to_string(a2),
+         "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(b1)},
+        {"WritePreparedTxnDB::CheckAgainstSnapshots:p:" + std::to_string(b2),
+         "WritePreparedTxnDB::UpdateSnapshots:s:" + std::to_string(a2)},
+        {"WritePreparedTxnDB::UpdateSnapshots:p:end",
+         "WritePreparedTxnDB::CheckAgainstSnapshots:s:" + std::to_string(b2)},
+    });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    {
+      ASSERT_TRUE(wp_db->old_commit_map_empty_);
+      ROCKSDB_NAMESPACE::port::Thread t1(
+          [&]() { wp_db->UpdateSnapshots(new_snapshots, version); });
+      wp_db->CheckAgainstSnapshots(entry);
+      t1.join();
+      ASSERT_FALSE(wp_db->old_commit_map_empty_);
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+
+  // Verify value of keys.
+  void VerifyKeys(const std::unordered_map<std::string, std::string>& data,
+                  const Snapshot* snapshot = nullptr) {
+    std::string value;
+    ReadOptions read_options;
+    read_options.snapshot = snapshot;
+    for (auto& kv : data) {
+      auto s = db->Get(read_options, kv.first, &value);
+      ASSERT_TRUE(s.ok() || s.IsNotFound());
+      if (s.ok()) {
+        if (kv.second != value) {
+          printf("key = %s\n", kv.first.c_str());
+        }
+        ASSERT_EQ(kv.second, value);
+      } else {
+        ASSERT_EQ(kv.second, "NOT_FOUND");
+      }
+
+      // Try with MultiGet API too
+      std::vector<std::string> values;
+      auto s_vec = db->MultiGet(read_options, {db->DefaultColumnFamily()},
+                                {kv.first}, &values);
+      ASSERT_EQ(1, values.size());
+      ASSERT_EQ(1, s_vec.size());
+      s = s_vec[0];
+      ASSERT_TRUE(s.ok() || s.IsNotFound());
+      if (s.ok()) {
+        ASSERT_TRUE(kv.second == values[0]);
+      } else {
+        ASSERT_EQ(kv.second, "NOT_FOUND");
+      }
+    }
+  }
+
+  // Verify all versions of keys.
+  void VerifyInternalKeys(const std::vector<KeyVersion>& expected_versions) {
+    std::vector<KeyVersion> versions;
+    const size_t kMaxKeys = 100000;
+    ASSERT_OK(GetAllKeyVersions(db, expected_versions.front().user_key,
+                                expected_versions.back().user_key, kMaxKeys,
+                                &versions));
+    ASSERT_EQ(expected_versions.size(), versions.size());
+    for (size_t i = 0; i < versions.size(); i++) {
+      ASSERT_EQ(expected_versions[i].user_key, versions[i].user_key);
+      ASSERT_EQ(expected_versions[i].sequence, versions[i].sequence);
+      ASSERT_EQ(expected_versions[i].type, versions[i].type);
+      if (versions[i].type != kTypeDeletion &&
+          versions[i].type != kTypeSingleDeletion) {
+        ASSERT_EQ(expected_versions[i].value, versions[i].value);
+      }
+      // Range delete not supported.
+      ASSERT_NE(expected_versions[i].type, kTypeRangeDeletion);
+    }
+  }
+};
+
+class WritePreparedTransactionTest
+    : public WritePreparedTransactionTestBase,
+      virtual public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+ public:
+  WritePreparedTransactionTest()
+      : WritePreparedTransactionTestBase(
+            std::get<0>(GetParam()), std::get<1>(GetParam()),
+            std::get<2>(GetParam()), std::get<3>(GetParam())){};
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class SnapshotConcurrentAccessTest
+    : public WritePreparedTransactionTestBase,
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> {
+ public:
+  SnapshotConcurrentAccessTest()
+      : WritePreparedTransactionTestBase(
+            std::get<0>(GetParam()), std::get<1>(GetParam()),
+            std::get<2>(GetParam()), std::get<3>(GetParam())),
+        split_id_(std::get<4>(GetParam())),
+        split_cnt_(std::get<5>(GetParam())){};
+
+ protected:
+  // A test is split into split_cnt_ tests, each identified with split_id_ where
+  // 0 <= split_id_ < split_cnt_
+  size_t split_id_;
+  size_t split_cnt_;
+};
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class SeqAdvanceConcurrentTest
+    : public WritePreparedTransactionTestBase,
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> {
+ public:
+  SeqAdvanceConcurrentTest()
+      : WritePreparedTransactionTestBase(
+            std::get<0>(GetParam()), std::get<1>(GetParam()),
+            std::get<2>(GetParam()), std::get<3>(GetParam())),
+        split_id_(std::get<4>(GetParam())),
+        split_cnt_(std::get<5>(GetParam())) {
+    special_env.skip_fsync_ = true;
+  };
+
+ protected:
+  // A test is split into split_cnt_ tests, each identified with split_id_ where
+  // 0 <= split_id_ < split_cnt_
+  size_t split_id_;
+  size_t split_cnt_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    WritePreparedTransaction, WritePreparedTransactionTest,
+    ::testing::Values(
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite)));
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+    TwoWriteQueues, SnapshotConcurrentAccessTest,
+    ::testing::Values(
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20),
+
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20)));
+
+INSTANTIATE_TEST_CASE_P(
+    OneWriteQueue, SnapshotConcurrentAccessTest,
+    ::testing::Values(
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20)));
+
+INSTANTIATE_TEST_CASE_P(
+    TwoWriteQueues, SeqAdvanceConcurrentTest,
+    ::testing::Values(
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10),
+        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10)));
+
+INSTANTIATE_TEST_CASE_P(
+    OneWriteQueue, SeqAdvanceConcurrentTest,
+    ::testing::Values(
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10),
+        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10)));
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(WritePreparedTransactionTest, CommitMap) {
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  ASSERT_NE(wp_db, nullptr);
+  ASSERT_NE(wp_db->db_impl_, nullptr);
+  size_t size = wp_db->COMMIT_CACHE_SIZE;
+  CommitEntry c = {5, 12}, e;
+  bool evicted = wp_db->AddCommitEntry(c.prep_seq % size, c, &e);
+  ASSERT_FALSE(evicted);
+
+  // Should be able to read the same value
+  CommitEntry64b dont_care;
+  bool found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e);
+  ASSERT_TRUE(found);
+  ASSERT_EQ(c, e);
+  // Should be able to distinguish between overlapping entries
+  found = wp_db->GetCommitEntry((c.prep_seq + size) % size, &dont_care, &e);
+  ASSERT_TRUE(found);
+  ASSERT_NE(c.prep_seq + size, e.prep_seq);
+  // Should be able to detect non-existent entry
+  found = wp_db->GetCommitEntry((c.prep_seq + 1) % size, &dont_care, &e);
+  ASSERT_FALSE(found);
+
+  // Reject an invalid exchange
+  CommitEntry e2 = {c.prep_seq + size, c.commit_seq + size};
+  CommitEntry64b e2_64b(e2, wp_db->FORMAT);
+  bool exchanged = wp_db->ExchangeCommitEntry(e2.prep_seq % size, e2_64b, e);
+  ASSERT_FALSE(exchanged);
+  // check whether it did actually reject that
+  found = wp_db->GetCommitEntry(e2.prep_seq % size, &dont_care, &e);
+  ASSERT_TRUE(found);
+  ASSERT_EQ(c, e);
+
+  // Accept a valid exchange
+  CommitEntry64b c_64b(c, wp_db->FORMAT);
+  CommitEntry e3 = {c.prep_seq + size, c.commit_seq + size + 1};
+  exchanged = wp_db->ExchangeCommitEntry(c.prep_seq % size, c_64b, e3);
+  ASSERT_TRUE(exchanged);
+  // check whether it did actually accepted that
+  found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e);
+  ASSERT_TRUE(found);
+  ASSERT_EQ(e3, e);
+
+  // Rewrite an entry
+  CommitEntry e4 = {e3.prep_seq + size, e3.commit_seq + size + 1};
+  evicted = wp_db->AddCommitEntry(e4.prep_seq % size, e4, &e);
+  ASSERT_TRUE(evicted);
+  ASSERT_EQ(e3, e);
+  found = wp_db->GetCommitEntry(e4.prep_seq % size, &dont_care, &e);
+  ASSERT_TRUE(found);
+  ASSERT_EQ(e4, e);
+}
+
+TEST_P(WritePreparedTransactionTest, MaybeUpdateOldCommitMap) {
+  // If prepare <= snapshot < commit we should keep the entry around since its
+  // nonexistence could be interpreted as committed in the snapshot while it is
+  // not true. We keep such entries around by adding them to the
+  // old_commit_map_.
+  uint64_t p /*prepare*/, c /*commit*/, s /*snapshot*/, ns /*next_snapshot*/;
+  p = 10l, c = 15l, s = 20l, ns = 21l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+  // If we do not expect the old commit map to be updated, try also with a next
+  // snapshot that is expected to update the old commit map. This would test
+  // that MaybeUpdateOldCommitMap would not prevent us from checking the next
+  // snapshot that must be checked.
+  p = 10l, c = 15l, s = 20l, ns = 11l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+
+  p = 10l, c = 20l, s = 20l, ns = 19l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+  p = 10l, c = 20l, s = 20l, ns = 21l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+
+  p = 20l, c = 20l, s = 20l, ns = 21l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+  p = 20l, c = 20l, s = 20l, ns = 19l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+
+  p = 10l, c = 25l, s = 20l, ns = 21l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, true);
+
+  p = 20l, c = 25l, s = 20l, ns = 21l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, true);
+
+  p = 21l, c = 25l, s = 20l, ns = 22l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+  p = 21l, c = 25l, s = 20l, ns = 19l;
+  MaybeUpdateOldCommitMapTestWithNext(p, c, s, ns, false);
+}
+
+// Trigger the condition where some old memtables are skipped when doing
+// TransactionUtil::CheckKey(), and make sure the result is still correct.
+TEST_P(WritePreparedTransactionTest, CheckKeySkipOldMemtable) {
+  const int kAttemptHistoryMemtable = 0;
+  const int kAttemptImmMemTable = 1;
+  for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
+       attempt++) {
+    options.max_write_buffer_number_to_maintain = 3;
+    ASSERT_OK(ReOpen());
+
+    WriteOptions write_options;
+    ReadOptions read_options;
+    TransactionOptions txn_options;
+    txn_options.set_snapshot = true;
+    string value;
+
+    ASSERT_OK(db->Put(write_options, Slice("foo"), Slice("bar")));
+    ASSERT_OK(db->Put(write_options, Slice("foo2"), Slice("bar")));
+
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn != nullptr);
+    ASSERT_OK(txn->SetName("txn"));
+
+    Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn2 != nullptr);
+    ASSERT_OK(txn2->SetName("txn2"));
+
+    // This transaction is created to cause potential conflict.
+    Transaction* txn_x = db->BeginTransaction(write_options);
+    ASSERT_OK(txn_x->SetName("txn_x"));
+    ASSERT_OK(txn_x->Put(Slice("foo"), Slice("bar3")));
+    ASSERT_OK(txn_x->Prepare());
+
+    // Create snapshots after the prepare, but there should still
+    // be a conflict when trying to read "foo".
+
+    if (attempt == kAttemptImmMemTable) {
+      // For the second attempt, hold flush from beginning. The memtable
+      // will be switched to immutable after calling TEST_SwitchMemtable()
+      // while CheckKey() is called.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {{"WritePreparedTransactionTest.CheckKeySkipOldMemtable",
+            "FlushJob::Start"}});
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    }
+
+    // force a memtable flush. The memtable should still be kept
+    FlushOptions flush_ops;
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_OK(db->Flush(flush_ops));
+    } else {
+      ASSERT_EQ(attempt, kAttemptImmMemTable);
+      DBImpl* db_impl = static_cast<DBImpl*>(db->GetRootDB());
+      ASSERT_OK(db_impl->TEST_SwitchMemtable());
+    }
+    uint64_t num_imm_mems;
+    ASSERT_TRUE(db->GetIntProperty(DB::Properties::kNumImmutableMemTable,
+                                   &num_imm_mems));
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(0, num_imm_mems);
+    } else {
+      ASSERT_EQ(attempt, kAttemptImmMemTable);
+      ASSERT_EQ(1, num_imm_mems);
+    }
+
+    // Put something in active memtable
+    ASSERT_OK(db->Put(write_options, Slice("foo3"), Slice("bar")));
+
+    // Create txn3 after flushing, but this transaction also needs to
+    // check all memtables because of they contains uncommitted data.
+    Transaction* txn3 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn3 != nullptr);
+    ASSERT_OK(txn3->SetName("txn3"));
+
+    // Commit the pending write
+    ASSERT_OK(txn_x->Commit());
+
+    // Commit txn, txn2 and tx3. txn and tx3 will conflict but txn2 will
+    // pass. In all cases, both memtables are queried.
+    SetPerfLevel(PerfLevel::kEnableCount);
+    get_perf_context()->Reset();
+    ASSERT_TRUE(txn3->GetForUpdate(read_options, "foo", &value).IsBusy());
+    // We should have checked two memtables, active and either immutable
+    // or history memtable, depending on the test case.
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+    get_perf_context()->Reset();
+    ASSERT_TRUE(txn->GetForUpdate(read_options, "foo", &value).IsBusy());
+    // We should have checked two memtables, active and either immutable
+    // or history memtable, depending on the test case.
+    ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+
+    get_perf_context()->Reset();
+    ASSERT_OK(txn2->GetForUpdate(read_options, "foo2", &value));
+    ASSERT_EQ(value, "bar");
+    // We should have checked two memtables, and since there is no
+    // conflict, another Get() will be made and fetch the data from
+    // DB. If it is in immutable memtable, two extra memtable reads
+    // will be issued. If it is not (in history), only one will
+    // be made, which is to the active memtable.
+    if (attempt == kAttemptHistoryMemtable) {
+      ASSERT_EQ(3, get_perf_context()->get_from_memtable_count);
+    } else {
+      ASSERT_EQ(attempt, kAttemptImmMemTable);
+      ASSERT_EQ(4, get_perf_context()->get_from_memtable_count);
+    }
+
+    Transaction* txn4 = db->BeginTransaction(write_options, txn_options);
+    ASSERT_TRUE(txn4 != nullptr);
+    ASSERT_OK(txn4->SetName("txn4"));
+    get_perf_context()->Reset();
+    ASSERT_OK(txn4->GetForUpdate(read_options, "foo", &value));
+    if (attempt == kAttemptHistoryMemtable) {
+      // Active memtable will be checked in snapshot validation and when
+      // getting the value.
+      ASSERT_EQ(2, get_perf_context()->get_from_memtable_count);
+    } else {
+      // Only active memtable will be checked in snapshot validation but
+      // both of active and immutable snapshot will be queried when
+      // getting the value.
+      ASSERT_EQ(attempt, kAttemptImmMemTable);
+      ASSERT_EQ(3, get_perf_context()->get_from_memtable_count);
+    }
+
+    ASSERT_OK(txn2->Commit());
+    ASSERT_OK(txn4->Commit());
+
+    TEST_SYNC_POINT("WritePreparedTransactionTest.CheckKeySkipOldMemtable");
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+    SetPerfLevel(PerfLevel::kDisable);
+
+    delete txn;
+    delete txn2;
+    delete txn3;
+    delete txn4;
+    delete txn_x;
+  }
+}
+
+// Reproduce the bug with two snapshots with the same seuqence number and test
+// that the release of the first snapshot will not affect the reads by the other
+// snapshot
+TEST_P(WritePreparedTransactionTest, DoubleSnapshot) {
+  TransactionOptions txn_options;
+  Status s;
+
+  // Insert initial value
+  ASSERT_OK(db->Put(WriteOptions(), "key", "value1"));
+
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  Transaction* txn =
+      wp_db->BeginTransaction(WriteOptions(), txn_options, nullptr);
+  ASSERT_OK(txn->SetName("txn"));
+  ASSERT_OK(txn->Put("key", "value2"));
+  ASSERT_OK(txn->Prepare());
+  // Three snapshots with the same seq number
+  const Snapshot* snapshot0 = wp_db->GetSnapshot();
+  const Snapshot* snapshot1 = wp_db->GetSnapshot();
+  const Snapshot* snapshot2 = wp_db->GetSnapshot();
+  ASSERT_OK(txn->Commit());
+  SequenceNumber cache_size = wp_db->COMMIT_CACHE_SIZE;
+  SequenceNumber overlap_seq = txn->GetId() + cache_size;
+  delete txn;
+
+  // 4th snapshot with a larger seq
+  const Snapshot* snapshot3 = wp_db->GetSnapshot();
+  // Cause an eviction to advance max evicted seq number
+  // This also fetches the 4 snapshots from db since their seq is lower than the
+  // new max
+  wp_db->AddCommitted(overlap_seq, overlap_seq);
+
+  ReadOptions ropt;
+  // It should see the value before commit
+  ropt.snapshot = snapshot2;
+  PinnableSlice pinnable_val;
+  s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val);
+  ASSERT_OK(s);
+  ASSERT_TRUE(pinnable_val == "value1");
+  pinnable_val.Reset();
+
+  wp_db->ReleaseSnapshot(snapshot1);
+
+  // It should still see the value before commit
+  s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val);
+  ASSERT_OK(s);
+  ASSERT_TRUE(pinnable_val == "value1");
+  pinnable_val.Reset();
+
+  // Cause an eviction to advance max evicted seq number and trigger updating
+  // the snapshot list
+  overlap_seq += cache_size;
+  wp_db->AddCommitted(overlap_seq, overlap_seq);
+
+  // It should still see the value before commit
+  s = wp_db->Get(ropt, wp_db->DefaultColumnFamily(), "key", &pinnable_val);
+  ASSERT_OK(s);
+  ASSERT_TRUE(pinnable_val == "value1");
+  pinnable_val.Reset();
+
+  wp_db->ReleaseSnapshot(snapshot0);
+  wp_db->ReleaseSnapshot(snapshot2);
+  wp_db->ReleaseSnapshot(snapshot3);
+}
+
+size_t UniqueCnt(std::vector<SequenceNumber> vec) {
+  std::set<SequenceNumber> aset;
+  for (auto i : vec) {
+    aset.insert(i);
+  }
+  return aset.size();
+}
+// Test that the entries in old_commit_map_ get garbage collected properly
+TEST_P(WritePreparedTransactionTest, OldCommitMapGC) {
+  const size_t snapshot_cache_bits = 0;
+  const size_t commit_cache_bits = 0;
+  DBImpl* mock_db = new DBImpl(options, dbname);
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+      new WritePreparedTxnDBMock(mock_db, txn_db_options));
+
+  SequenceNumber seq = 0;
+  // Take the first snapshot that overlaps with two txn
+  auto prep_seq = ++seq;
+  wp_db->AddPrepared(prep_seq);
+  auto prep_seq2 = ++seq;
+  wp_db->AddPrepared(prep_seq2);
+  auto snap_seq1 = seq;
+  wp_db->TakeSnapshot(snap_seq1);
+  auto commit_seq = ++seq;
+  wp_db->AddCommitted(prep_seq, commit_seq);
+  wp_db->RemovePrepared(prep_seq);
+  auto commit_seq2 = ++seq;
+  wp_db->AddCommitted(prep_seq2, commit_seq2);
+  wp_db->RemovePrepared(prep_seq2);
+  // Take the 2nd and 3rd snapshot that overlap with the same txn
+  prep_seq = ++seq;
+  wp_db->AddPrepared(prep_seq);
+  auto snap_seq2 = seq;
+  wp_db->TakeSnapshot(snap_seq2);
+  seq++;
+  auto snap_seq3 = seq;
+  wp_db->TakeSnapshot(snap_seq3);
+  seq++;
+  commit_seq = ++seq;
+  wp_db->AddCommitted(prep_seq, commit_seq);
+  wp_db->RemovePrepared(prep_seq);
+  // Make sure max_evicted_seq_ will be larger than 2nd snapshot by evicting the
+  // only item in the commit_cache_ via another commit.
+  prep_seq = ++seq;
+  wp_db->AddPrepared(prep_seq);
+  commit_seq = ++seq;
+  wp_db->AddCommitted(prep_seq, commit_seq);
+  wp_db->RemovePrepared(prep_seq);
+
+  // Verify that the evicted commit entries for all snapshots are in the
+  // old_commit_map_
+  {
+    ASSERT_FALSE(wp_db->old_commit_map_empty_.load());
+    ReadLock rl(&wp_db->old_commit_map_mutex_);
+    ASSERT_EQ(3, wp_db->old_commit_map_.size());
+    ASSERT_EQ(2, UniqueCnt(wp_db->old_commit_map_[snap_seq1]));
+    ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq2]));
+    ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3]));
+  }
+
+  // Verify that the 2nd snapshot is cleaned up after the release
+  wp_db->ReleaseSnapshotInternal(snap_seq2);
+  {
+    ASSERT_FALSE(wp_db->old_commit_map_empty_.load());
+    ReadLock rl(&wp_db->old_commit_map_mutex_);
+    ASSERT_EQ(2, wp_db->old_commit_map_.size());
+    ASSERT_EQ(2, UniqueCnt(wp_db->old_commit_map_[snap_seq1]));
+    ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3]));
+  }
+
+  // Verify that the 1st snapshot is cleaned up after the release
+  wp_db->ReleaseSnapshotInternal(snap_seq1);
+  {
+    ASSERT_FALSE(wp_db->old_commit_map_empty_.load());
+    ReadLock rl(&wp_db->old_commit_map_mutex_);
+    ASSERT_EQ(1, wp_db->old_commit_map_.size());
+    ASSERT_EQ(1, UniqueCnt(wp_db->old_commit_map_[snap_seq3]));
+  }
+
+  // Verify that the 3rd snapshot is cleaned up after the release
+  wp_db->ReleaseSnapshotInternal(snap_seq3);
+  {
+    ASSERT_TRUE(wp_db->old_commit_map_empty_.load());
+    ReadLock rl(&wp_db->old_commit_map_mutex_);
+    ASSERT_EQ(0, wp_db->old_commit_map_.size());
+  }
+}
+
+TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshots) {
+  std::vector<SequenceNumber> snapshots = {100l, 200l, 300l, 400l, 500l,
+                                           600l, 700l, 800l, 900l};
+  const size_t snapshot_cache_bits = 2;
+  const uint64_t cache_size = 1ul << snapshot_cache_bits;
+  // Safety check to express the intended size in the test. Can be adjusted if
+  // the snapshots lists changed.
+  ASSERT_EQ((1ul << snapshot_cache_bits) * 2 + 1, snapshots.size());
+  DBImpl* mock_db = new DBImpl(options, dbname);
+  UpdateTransactionDBOptions(snapshot_cache_bits);
+  std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+      new WritePreparedTxnDBMock(mock_db, txn_db_options));
+  SequenceNumber version = 1000l;
+  ASSERT_EQ(0, wp_db->snapshots_total_);
+  wp_db->UpdateSnapshots(snapshots, version);
+  ASSERT_EQ(snapshots.size(), wp_db->snapshots_total_);
+  // seq numbers are chosen so that we have two of them between each two
+  // snapshots. If the diff of two consecutive seq is more than 5, there is a
+  // snapshot between them.
+  std::vector<SequenceNumber> seqs = {50l,  55l,  150l, 155l, 250l, 255l, 350l,
+                                      355l, 450l, 455l, 550l, 555l, 650l, 655l,
+                                      750l, 755l, 850l, 855l, 950l, 955l};
+  ASSERT_GT(seqs.size(), 1);
+  for (size_t i = 0; i + 1 < seqs.size(); i++) {
+    wp_db->old_commit_map_empty_ = true;  // reset
+    CommitEntry commit_entry = {seqs[i], seqs[i + 1]};
+    wp_db->CheckAgainstSnapshots(commit_entry);
+    // Expect update if there is snapshot in between the prepare and commit
+    bool expect_update = commit_entry.commit_seq - commit_entry.prep_seq > 5 &&
+                         commit_entry.commit_seq >= snapshots.front() &&
+                         commit_entry.prep_seq <= snapshots.back();
+    ASSERT_EQ(expect_update, !wp_db->old_commit_map_empty_);
+  }
+
+  // Test that search will include multiple snapshot from snapshot cache
+  {
+    // exclude first and last item in the cache
+    CommitEntry commit_entry = {snapshots.front() + 1,
+                                snapshots[cache_size - 1] - 1};
+    wp_db->old_commit_map_empty_ = true;  // reset
+    wp_db->old_commit_map_.clear();
+    wp_db->CheckAgainstSnapshots(commit_entry);
+    ASSERT_EQ(wp_db->old_commit_map_.size(), cache_size - 2);
+  }
+
+  // Test that search will include multiple snapshot from old snapshots
+  {
+    // include two in the middle
+    CommitEntry commit_entry = {snapshots[cache_size] + 1,
+                                snapshots[cache_size + 2] + 1};
+    wp_db->old_commit_map_empty_ = true;  // reset
+    wp_db->old_commit_map_.clear();
+    wp_db->CheckAgainstSnapshots(commit_entry);
+    ASSERT_EQ(wp_db->old_commit_map_.size(), 2);
+  }
+
+  // Test that search will include both snapshot cache and old snapshots
+  // Case 1: includes all in snapshot cache
+  {
+    CommitEntry commit_entry = {snapshots.front() - 1, snapshots.back() + 1};
+    wp_db->old_commit_map_empty_ = true;  // reset
+    wp_db->old_commit_map_.clear();
+    wp_db->CheckAgainstSnapshots(commit_entry);
+    ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size());
+  }
+
+  // Case 2: includes all snapshot caches except the smallest
+  {
+    CommitEntry commit_entry = {snapshots.front() + 1, snapshots.back() + 1};
+    wp_db->old_commit_map_empty_ = true;  // reset
+    wp_db->old_commit_map_.clear();
+    wp_db->CheckAgainstSnapshots(commit_entry);
+    ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - 1);
+  }
+
+  // Case 3: includes only the largest of snapshot cache
+  {
+    CommitEntry commit_entry = {snapshots[cache_size - 1] - 1,
+                                snapshots.back() + 1};
+    wp_db->old_commit_map_empty_ = true;  // reset
+    wp_db->old_commit_map_.clear();
+    wp_db->CheckAgainstSnapshots(commit_entry);
+    ASSERT_EQ(wp_db->old_commit_map_.size(), snapshots.size() - cache_size + 1);
+  }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Test that CheckAgainstSnapshots will not miss a live snapshot if it is run in
+// parallel with UpdateSnapshots.
+TEST_P(SnapshotConcurrentAccessTest, SnapshotConcurrentAccess) {
+  // We have a sync point in the method under test after checking each snapshot.
+  // If you increase the max number of snapshots in this test, more sync points
+  // in the methods must also be added.
+  const std::vector<SequenceNumber> snapshots = {10l, 20l, 30l, 40l, 50l,
+                                                 60l, 70l, 80l, 90l, 100l};
+  const size_t snapshot_cache_bits = 2;
+  // Safety check to express the intended size in the test. Can be adjusted if
+  // the snapshots lists changed.
+  ASSERT_EQ((1ul << snapshot_cache_bits) * 2 + 2, snapshots.size());
+  SequenceNumber version = 1000l;
+  // Choose the cache size so that the new snapshot list could replace all the
+  // existing items in the cache and also have some overflow.
+  DBImpl* mock_db = new DBImpl(options, dbname);
+  UpdateTransactionDBOptions(snapshot_cache_bits);
+  std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+      new WritePreparedTxnDBMock(mock_db, txn_db_options));
+  const size_t extra = 2;
+  size_t loop_id = 0;
+  // Add up to extra items that do not fit into the cache
+  for (size_t old_size = 1; old_size <= wp_db->SNAPSHOT_CACHE_SIZE + extra;
+       old_size++) {
+    const std::vector<SequenceNumber> old_snapshots(
+        snapshots.begin(), snapshots.begin() + old_size);
+
+    // Each member of old snapshot might or might not appear in the new list. We
+    // create a common_snapshots for each combination.
+    size_t new_comb_cnt = size_t(1) << old_size;
+    for (size_t new_comb = 0; new_comb < new_comb_cnt; new_comb++, loop_id++) {
+      if (loop_id % split_cnt_ != split_id_) continue;
+      printf(".");  // To signal progress
+      fflush(stdout);
+      std::vector<SequenceNumber> common_snapshots;
+      for (size_t i = 0; i < old_snapshots.size(); i++) {
+        if (IsInCombination(i, new_comb)) {
+          common_snapshots.push_back(old_snapshots[i]);
+        }
+      }
+      // And add some new snapshots to the common list
+      for (size_t added_snapshots = 0;
+           added_snapshots <= snapshots.size() - old_snapshots.size();
+           added_snapshots++) {
+        std::vector<SequenceNumber> new_snapshots = common_snapshots;
+        for (size_t i = 0; i < added_snapshots; i++) {
+          new_snapshots.push_back(snapshots[old_snapshots.size() + i]);
+        }
+        for (auto it = common_snapshots.begin(); it != common_snapshots.end();
+             ++it) {
+          auto snapshot = *it;
+          // Create a commit entry that is around the snapshot and thus should
+          // be not be discarded
+          CommitEntry entry = {static_cast<uint64_t>(snapshot - 1),
+                               snapshot + 1};
+          // The critical part is when iterating the snapshot cache. Afterwards,
+          // we are operating under the lock
+          size_t a_range =
+              std::min(old_snapshots.size(), wp_db->SNAPSHOT_CACHE_SIZE) + 1;
+          size_t b_range =
+              std::min(new_snapshots.size(), wp_db->SNAPSHOT_CACHE_SIZE) + 1;
+          // Break each thread at two points
+          for (size_t a1 = 1; a1 <= a_range; a1++) {
+            for (size_t a2 = a1 + 1; a2 <= a_range; a2++) {
+              for (size_t b1 = 1; b1 <= b_range; b1++) {
+                for (size_t b2 = b1 + 1; b2 <= b_range; b2++) {
+                  SnapshotConcurrentAccessTestInternal(
+                      wp_db.get(), old_snapshots, new_snapshots, entry, version,
+                      a1, a2, b1, b2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  printf("\n");
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// This test clarifies the contract of AdvanceMaxEvictedSeq method
+TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqBasic) {
+  DBImpl* mock_db = new DBImpl(options, dbname);
+  std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+      new WritePreparedTxnDBMock(mock_db, txn_db_options));
+
+  // 1. Set the initial values for max, prepared, and snapshots
+  SequenceNumber zero_max = 0l;
+  // Set the initial list of prepared txns
+  const std::vector<SequenceNumber> initial_prepared = {10,  30,  50, 100,
+                                                        150, 200, 250};
+  for (auto p : initial_prepared) {
+    wp_db->AddPrepared(p);
+  }
+  // This updates the max value and also set old prepared
+  SequenceNumber init_max = 100;
+  wp_db->AdvanceMaxEvictedSeq(zero_max, init_max);
+  const std::vector<SequenceNumber> initial_snapshots = {20, 40};
+  wp_db->SetDBSnapshots(initial_snapshots);
+  // This will update the internal cache of snapshots from the DB
+  wp_db->UpdateSnapshots(initial_snapshots, init_max);
+
+  // 2. Invoke AdvanceMaxEvictedSeq
+  const std::vector<SequenceNumber> latest_snapshots = {20, 110, 220, 300};
+  wp_db->SetDBSnapshots(latest_snapshots);
+  SequenceNumber new_max = 200;
+  wp_db->AdvanceMaxEvictedSeq(init_max, new_max);
+
+  // 3. Verify that the state matches with AdvanceMaxEvictedSeq contract
+  // a. max should be updated to new_max
+  ASSERT_EQ(wp_db->max_evicted_seq_, new_max);
+  // b. delayed prepared should contain every txn <= max and prepared should
+  // only contain txns > max
+  auto it = initial_prepared.begin();
+  for (; it != initial_prepared.end() && *it <= new_max; ++it) {
+    ASSERT_EQ(1, wp_db->delayed_prepared_.erase(*it));
+  }
+  ASSERT_TRUE(wp_db->delayed_prepared_.empty());
+  for (; it != initial_prepared.end() && !wp_db->prepared_txns_.empty();
+       ++it, wp_db->prepared_txns_.pop()) {
+    ASSERT_EQ(*it, wp_db->prepared_txns_.top());
+  }
+  ASSERT_TRUE(it == initial_prepared.end());
+  ASSERT_TRUE(wp_db->prepared_txns_.empty());
+  // c. snapshots should contain everything below new_max
+  auto sit = latest_snapshots.begin();
+  for (size_t i = 0; sit != latest_snapshots.end() && *sit <= new_max &&
+                     i < wp_db->snapshots_total_;
+       sit++, i++) {
+    ASSERT_TRUE(i < wp_db->snapshots_total_);
+    // This test is in small scale and the list of snapshots are assumed to be
+    // within the cache size limit. This is just a safety check to double check
+    // that assumption.
+    ASSERT_TRUE(i < wp_db->SNAPSHOT_CACHE_SIZE);
+    ASSERT_EQ(*sit, wp_db->snapshot_cache_[i]);
+  }
+}
+
+// A new snapshot should always be always larger than max_evicted_seq_
+// Otherwise the snapshot does not go through AdvanceMaxEvictedSeq
+TEST_P(WritePreparedTransactionTest, NewSnapshotLargerThanMax) {
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  Transaction* txn0 = db->BeginTransaction(woptions, txn_options);
+  ASSERT_OK(txn0->Put(Slice("key"), Slice("value")));
+  ASSERT_OK(txn0->Commit());
+  const SequenceNumber seq = txn0->GetId();  // is also prepare seq
+  delete txn0;
+  std::vector<Transaction*> txns;
+  // Inc seq without committing anything
+  for (int i = 0; i < 10; i++) {
+    Transaction* txn = db->BeginTransaction(woptions, txn_options);
+    ASSERT_OK(txn->SetName("xid" + std::to_string(i)));
+    ASSERT_OK(txn->Put(Slice("key" + std::to_string(i)), Slice("value")));
+    ASSERT_OK(txn->Prepare());
+    txns.push_back(txn);
+  }
+
+  // The new commit is seq + 10
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+  auto snap = wp_db->GetSnapshot();
+  const SequenceNumber last_seq = snap->GetSequenceNumber();
+  wp_db->ReleaseSnapshot(snap);
+  ASSERT_LT(seq, last_seq);
+  // Otherwise our test is not effective
+  ASSERT_LT(last_seq - seq, wp_db->INC_STEP_FOR_MAX_EVICTED);
+
+  // Evict seq out of commit cache
+  const SequenceNumber overwrite_seq = seq + wp_db->COMMIT_CACHE_SIZE;
+  // Check that the next write could make max go beyond last
+  auto last_max = wp_db->max_evicted_seq_.load();
+  wp_db->AddCommitted(overwrite_seq, overwrite_seq);
+  // Check that eviction has advanced the max
+  ASSERT_LT(last_max, wp_db->max_evicted_seq_.load());
+  // Check that the new max has not advanced the last seq
+  ASSERT_LT(wp_db->max_evicted_seq_.load(), last_seq);
+  for (auto txn : txns) {
+    txn->Rollback();
+    delete txn;
+  }
+}
+
+// A new snapshot should always be always larger than max_evicted_seq_
+// In very rare cases max could be below last published seq. Test that
+// taking snapshot will wait for max to catch up.
+TEST_P(WritePreparedTransactionTest, MaxCatchupWithNewSnapshot) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // only 1 entry => frequent eviction
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WriteOptions woptions;
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+
+  const int writes = 50;
+  const int batch_cnt = 4;
+  ROCKSDB_NAMESPACE::port::Thread t1([&]() {
+    for (int i = 0; i < writes; i++) {
+      WriteBatch batch;
+      // For duplicate keys cause 4 commit entries, each evicting an entry that
+      // is not published yet, thus causing max evicted seq go higher than last
+      // published.
+      for (int b = 0; b < batch_cnt; b++) {
+        ASSERT_OK(batch.Put("foo", "foo"));
+      }
+      ASSERT_OK(db->Write(woptions, &batch));
+    }
+  });
+
+  ROCKSDB_NAMESPACE::port::Thread t2([&]() {
+    while (wp_db->max_evicted_seq_ == 0) {  // wait for insert thread
+      std::this_thread::yield();
+    }
+    for (int i = 0; i < 10; i++) {
+      SequenceNumber max_lower_bound = wp_db->max_evicted_seq_;
+      auto snap = db->GetSnapshot();
+      if (snap->GetSequenceNumber() != 0) {
+        // Value of max_evicted_seq_ when snapshot was taken in unknown. We thus
+        // compare with the lower bound instead as an approximation.
+        ASSERT_LT(max_lower_bound, snap->GetSequenceNumber());
+      }  // seq 0 is ok to be less than max since nothing is visible to it
+      db->ReleaseSnapshot(snap);
+    }
+  });
+
+  t1.join();
+  t2.join();
+
+  // Make sure that the test has worked and seq number has advanced as we
+  // thought
+  auto snap = db->GetSnapshot();
+  ASSERT_GT(snap->GetSequenceNumber(), batch_cnt * writes - 1);
+  db->ReleaseSnapshot(snap);
+}
+
+// Test that reads without snapshots would not hit an undefined state
+TEST_P(WritePreparedTransactionTest, MaxCatchupWithUnbackedSnapshot) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // only 1 entry => frequent eviction
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WriteOptions woptions;
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+
+  const int writes = 50;
+  ROCKSDB_NAMESPACE::port::Thread t1([&]() {
+    for (int i = 0; i < writes; i++) {
+      WriteBatch batch;
+      ASSERT_OK(batch.Put("key", "foo"));
+      ASSERT_OK(db->Write(woptions, &batch));
+    }
+  });
+
+  ROCKSDB_NAMESPACE::port::Thread t2([&]() {
+    while (wp_db->max_evicted_seq_ == 0) {  // wait for insert thread
+      std::this_thread::yield();
+    }
+    ReadOptions ropt;
+    PinnableSlice pinnable_val;
+    TransactionOptions txn_options;
+    for (int i = 0; i < 10; i++) {
+      auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+      ASSERT_TRUE(s.ok() || s.IsTryAgain());
+      pinnable_val.Reset();
+      Transaction* txn = db->BeginTransaction(woptions, txn_options);
+      s = txn->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+      ASSERT_TRUE(s.ok() || s.IsTryAgain());
+      pinnable_val.Reset();
+      std::vector<std::string> values;
+      auto s_vec =
+          txn->MultiGet(ropt, {db->DefaultColumnFamily()}, {"key"}, &values);
+      ASSERT_EQ(1, values.size());
+      ASSERT_EQ(1, s_vec.size());
+      s = s_vec[0];
+      ASSERT_TRUE(s.ok() || s.IsTryAgain());
+      Slice key("key");
+      txn->MultiGet(ropt, db->DefaultColumnFamily(), 1, &key, &pinnable_val, &s,
+                    true);
+      ASSERT_TRUE(s.ok() || s.IsTryAgain());
+      delete txn;
+    }
+  });
+
+  t1.join();
+  t2.join();
+
+  // Make sure that the test has worked and seq number has advanced as we
+  // thought
+  auto snap = db->GetSnapshot();
+  ASSERT_GT(snap->GetSequenceNumber(), writes - 1);
+  db->ReleaseSnapshot(snap);
+}
+
+// Check that old_commit_map_ cleanup works correctly if the snapshot equals
+// max_evicted_seq_.
+TEST_P(WritePreparedTransactionTest, CleanupSnapshotEqualToMax) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // only 1 entry => frequent eviction
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WriteOptions woptions;
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  // Insert something to increase seq
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+  auto snap = db->GetSnapshot();
+  auto snap_seq = snap->GetSequenceNumber();
+  // Another insert should trigger eviction + load snapshot from db
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+  // This is the scenario that we check agaisnt
+  ASSERT_EQ(snap_seq, wp_db->max_evicted_seq_);
+  // old_commit_map_ now has some data that needs gc
+  ASSERT_EQ(1, wp_db->snapshots_total_);
+  ASSERT_EQ(1, wp_db->old_commit_map_.size());
+
+  db->ReleaseSnapshot(snap);
+
+  // Another insert should trigger eviction + load snapshot from db
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+
+  // the snapshot and related metadata must be properly garbage collected
+  ASSERT_EQ(0, wp_db->snapshots_total_);
+  ASSERT_TRUE(wp_db->snapshots_all_.empty());
+  ASSERT_EQ(0, wp_db->old_commit_map_.size());
+}
+
+TEST_P(WritePreparedTransactionTest, AdvanceSeqByOne) {
+  auto snap = db->GetSnapshot();
+  auto seq1 = snap->GetSequenceNumber();
+  db->ReleaseSnapshot(snap);
+
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  wp_db->AdvanceSeqByOne();
+
+  snap = db->GetSnapshot();
+  auto seq2 = snap->GetSequenceNumber();
+  db->ReleaseSnapshot(snap);
+
+  ASSERT_LT(seq1, seq2);
+}
+
+// Test that the txn Initilize calls the overridden functions
+TEST_P(WritePreparedTransactionTest, TxnInitialize) {
+  TransactionOptions txn_options;
+  WriteOptions write_options;
+  ASSERT_OK(db->Put(write_options, "key", "value"));
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_OK(txn0->SetName("xid"));
+  ASSERT_OK(txn0->Put(Slice("key"), Slice("value1")));
+  ASSERT_OK(txn0->Prepare());
+
+  // SetSnapshot is overridden to update min_uncommitted_
+  txn_options.set_snapshot = true;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  auto snap = txn1->GetSnapshot();
+  auto snap_impl = reinterpret_cast<const SnapshotImpl*>(snap);
+  // If ::Initialize calls the overriden SetSnapshot, min_uncommitted_ must be
+  // udpated
+  ASSERT_GT(snap_impl->min_uncommitted_, kMinUnCommittedSeq);
+
+  ASSERT_OK(txn0->Rollback());
+  ASSERT_OK(txn1->Rollback());
+  delete txn0;
+  delete txn1;
+}
+
+// This tests that transactions with duplicate keys perform correctly after max
+// is advancing their prepared sequence numbers. This will not be the case if
+// for example the txn does not add the prepared seq for the second sub-batch to
+// the PreparedHeap structure.
+TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqWithDuplicates) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 1;    // disable commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+
+  ReadOptions ropt;
+  PinnableSlice pinnable_val;
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_OK(txn0->SetName("xid"));
+  ASSERT_OK(txn0->Put(Slice("key"), Slice("value1")));
+  ASSERT_OK(txn0->Put(Slice("key"), Slice("value2")));
+  ASSERT_OK(txn0->Prepare());
+
+  ASSERT_OK(db->Put(write_options, "key2", "value"));
+  // Will cause max advance due to disabled commit cache
+  ASSERT_OK(db->Put(write_options, "key3", "value"));
+
+  auto s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+  ASSERT_TRUE(s.IsNotFound());
+  delete txn0;
+
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  ASSERT_OK(wp_db->db_impl_->FlushWAL(true));
+  wp_db->TEST_Crash();
+  ASSERT_OK(ReOpenNoDelete());
+  ASSERT_NE(db, nullptr);
+  s = db->Get(ropt, db->DefaultColumnFamily(), "key", &pinnable_val);
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn0 = db->GetTransactionByName("xid");
+  ASSERT_OK(txn0->Rollback());
+  delete txn0;
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Stress SmallestUnCommittedSeq, which reads from both prepared_txns_ and
+// delayed_prepared_, when is run concurrently with advancing max_evicted_seq,
+// which moves prepared txns from prepared_txns_ to delayed_prepared_.
+TEST_P(WritePreparedTransactionTest, SmallestUnCommittedSeq) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 1;    // disable commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  ReadOptions ropt;
+  PinnableSlice pinnable_val;
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  std::vector<Transaction*> txns, committed_txns;
+
+  const int cnt = 100;
+  for (int i = 0; i < cnt; i++) {
+    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    ASSERT_OK(txn->SetName("xid" + std::to_string(i)));
+    auto key = "key1" + std::to_string(i);
+    auto value = "value1" + std::to_string(i);
+    ASSERT_OK(txn->Put(Slice(key), Slice(value)));
+    ASSERT_OK(txn->Prepare());
+    txns.push_back(txn);
+  }
+
+  port::Mutex mutex;
+  Random rnd(1103);
+  ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+    for (int i = 0; i < cnt; i++) {
+      uint32_t index = rnd.Uniform(cnt - i);
+      Transaction* txn;
+      {
+        MutexLock l(&mutex);
+        txn = txns[index];
+        txns.erase(txns.begin() + index);
+      }
+      // Since commit cache is practically disabled, commit results in immediate
+      // advance in max_evicted_seq_ and subsequently moving some prepared txns
+      // to delayed_prepared_.
+      ASSERT_OK(txn->Commit());
+      committed_txns.push_back(txn);
+    }
+  });
+  ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+    while (1) {
+      MutexLock l(&mutex);
+      if (txns.empty()) {
+        break;
+      }
+      auto min_uncommitted = wp_db->SmallestUnCommittedSeq();
+      ASSERT_LE(min_uncommitted, (*txns.begin())->GetId());
+    }
+  });
+
+  commit_thread.join();
+  read_thread.join();
+  for (auto txn : committed_txns) {
+    delete txn;
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrent) {
+  // Given the sequential run of txns, with this timeout we should never see a
+  // deadlock nor a timeout unless we have a key conflict, which should be
+  // almost infeasible.
+  txn_db_options.transaction_lock_timeout = 1000;
+  txn_db_options.default_lock_timeout = 1000;
+  ASSERT_OK(ReOpen());
+  FlushOptions fopt;
+
+  // Number of different txn types we use in this test
+  const size_t type_cnt = 5;
+  // The size of the first write group
+  // TODO(myabandeh): This should be increase for pre-release tests
+  const size_t first_group_size = 2;
+  // Total number of txns we run in each test
+  // TODO(myabandeh): This should be increase for pre-release tests
+  const size_t txn_cnt = first_group_size + 1;
+
+  size_t base[txn_cnt + 1] = {
+      1,
+  };
+  for (size_t bi = 1; bi <= txn_cnt; bi++) {
+    base[bi] = base[bi - 1] * type_cnt;
+  }
+  const size_t max_n = static_cast<size_t>(std::pow(type_cnt, txn_cnt));
+  printf("Number of cases being tested is %" ROCKSDB_PRIszt "\n", max_n);
+  for (size_t n = 0; n < max_n; n++) {
+    if (n > 0) {
+      ASSERT_OK(ReOpen());
+    }
+
+    if (n % split_cnt_ != split_id_) continue;
+    if (n % 1000 == 0) {
+      printf("Tested %" ROCKSDB_PRIszt " cases so far\n", n);
+    }
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    auto seq = db_impl->TEST_GetLastVisibleSequence();
+    with_empty_commits = 0;
+    exp_seq = seq;
+    // This is increased before writing the batch for commit
+    commit_writes = 0;
+    // This is increased before txn starts linking if it expects to do a commit
+    // eventually
+    expected_commits = 0;
+    std::vector<port::Thread> threads;
+
+    linked.store(0, std::memory_order_release);
+    std::atomic<bool> batch_formed(false);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::EnterAsBatchGroupLeader:End",
+        [&](void* /*arg*/) { batch_formed = true; });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "WriteThread::JoinBatchGroup:Wait", [&](void* /*arg*/) {
+          size_t orig_linked = linked.fetch_add(1, std::memory_order_acq_rel);
+          if (orig_linked == 0) {
+            // Wait until the others are linked too.
+            while (linked.load(std::memory_order_acquire) < first_group_size) {
+            }
+          } else if (orig_linked == first_group_size) {
+            // Make the 2nd batch of the rest of writes plus any followup
+            // commits from the first batch
+            while (linked.load(std::memory_order_acquire) <
+                   txn_cnt + commit_writes) {
+            }
+          }
+          // Then we will have one or more batches consisting of follow-up
+          // commits from the 2nd batch. There is a bit of non-determinism here
+          // but it should be tolerable.
+        });
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    for (size_t bi = 0; bi < txn_cnt; bi++) {
+      // get the bi-th digit in number system based on type_cnt
+      size_t d = (n % base[bi + 1]) / base[bi];
+      switch (d) {
+        case 0:
+          threads.emplace_back(&TransactionTestBase::TestTxn0, this, bi);
+          break;
+        case 1:
+          threads.emplace_back(&TransactionTestBase::TestTxn1, this, bi);
+          break;
+        case 2:
+          threads.emplace_back(&TransactionTestBase::TestTxn2, this, bi);
+          break;
+        case 3:
+          threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi);
+          break;
+        case 4:
+          threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi);
+          break;
+        default:
+          FAIL();
+      }
+      // wait to be linked
+      while (linked.load(std::memory_order_acquire) <= bi) {
+      }
+      // after a queue of size first_group_size
+      if (bi + 1 == first_group_size) {
+        while (!batch_formed) {
+        }
+        // to make it more deterministic, wait until the commits are linked
+        while (linked.load(std::memory_order_acquire) <=
+               bi + expected_commits) {
+        }
+      }
+    }
+    for (auto& t : threads) {
+      t.join();
+    }
+    if (options.two_write_queues) {
+      // In this case none of the above scheduling tricks to deterministically
+      // form merged batches works because the writes go to separate queues.
+      // This would result in different write groups in each run of the test. We
+      // still keep the test since although non-deterministic and hard to debug,
+      // it is still useful to have.
+      // TODO(myabandeh): Add a deterministic unit test for two_write_queues
+    }
+
+    // Check if memtable inserts advanced seq number as expected
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_EQ(exp_seq, seq);
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    // Check if recovery preserves the last sequence number
+    ASSERT_OK(db_impl->FlushWAL(true));
+    ASSERT_OK(ReOpenNoDelete());
+    ASSERT_NE(db, nullptr);
+    db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    seq = db_impl->TEST_GetLastVisibleSequence();
+    ASSERT_LE(exp_seq, seq + with_empty_commits);
+
+    // Check if flush preserves the last sequence number
+    ASSERT_OK(db_impl->Flush(fopt));
+    seq = db_impl->GetLatestSequenceNumber();
+    ASSERT_LE(exp_seq, seq + with_empty_commits);
+
+    // Check if recovery after flush preserves the last sequence number
+    ASSERT_OK(db_impl->FlushWAL(true));
+    ASSERT_OK(ReOpenNoDelete());
+    ASSERT_NE(db, nullptr);
+    db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    seq = db_impl->GetLatestSequenceNumber();
+    ASSERT_LE(exp_seq, seq + with_empty_commits);
+  }
+}
+
+// Run a couple of different txns among them some uncommitted. Restart the db at
+// a couple points to check whether the list of uncommitted txns are recovered
+// properly.
+TEST_P(WritePreparedTransactionTest, BasicRecovery) {
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+
+  TestTxn0(0);
+
+  TransactionOptions txn_options;
+  WriteOptions write_options;
+  size_t index = 1000;
+  Transaction* txn0 = db->BeginTransaction(write_options, txn_options);
+  auto istr0 = std::to_string(index);
+  auto s = txn0->SetName("xid" + istr0);
+  ASSERT_OK(s);
+  s = txn0->Put(Slice("foo0" + istr0), Slice("bar0" + istr0));
+  ASSERT_OK(s);
+  s = txn0->Prepare();
+  ASSERT_OK(s);
+  auto prep_seq_0 = txn0->GetId();
+
+  TestTxn1(0);
+
+  index++;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  auto istr1 = std::to_string(index);
+  s = txn1->SetName("xid" + istr1);
+  ASSERT_OK(s);
+  s = txn1->Put(Slice("foo1" + istr1), Slice("bar"));
+  ASSERT_OK(s);
+  s = txn1->Prepare();
+  ASSERT_OK(s);
+  auto prep_seq_1 = txn1->GetId();
+
+  TestTxn2(0);
+
+  ReadOptions ropt;
+  PinnableSlice pinnable_val;
+  // Check the value is not committed before restart
+  s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val);
+  ASSERT_TRUE(s.IsNotFound());
+  pinnable_val.Reset();
+
+  delete txn0;
+  delete txn1;
+  ASSERT_OK(wp_db->db_impl_->FlushWAL(true));
+  wp_db->TEST_Crash();
+  ASSERT_OK(ReOpenNoDelete());
+  ASSERT_NE(db, nullptr);
+  wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  // After recovery, all the uncommitted txns (0 and 1) should be inserted into
+  // delayed_prepared_
+  ASSERT_TRUE(wp_db->prepared_txns_.empty());
+  ASSERT_FALSE(wp_db->delayed_prepared_empty_);
+  ASSERT_LE(prep_seq_0, wp_db->max_evicted_seq_);
+  ASSERT_LE(prep_seq_1, wp_db->max_evicted_seq_);
+  {
+    ReadLock rl(&wp_db->prepared_mutex_);
+    ASSERT_EQ(2, wp_db->delayed_prepared_.size());
+    ASSERT_TRUE(wp_db->delayed_prepared_.find(prep_seq_0) !=
+                wp_db->delayed_prepared_.end());
+    ASSERT_TRUE(wp_db->delayed_prepared_.find(prep_seq_1) !=
+                wp_db->delayed_prepared_.end());
+  }
+
+  // Check the value is still not committed after restart
+  s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val);
+  ASSERT_TRUE(s.IsNotFound());
+  pinnable_val.Reset();
+
+  TestTxn3(0);
+
+  // Test that a recovered txns will be properly marked committed for the next
+  // recovery
+  txn1 = db->GetTransactionByName("xid" + istr1);
+  ASSERT_NE(txn1, nullptr);
+  ASSERT_OK(txn1->Commit());
+  delete txn1;
+
+  index++;
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  auto istr2 = std::to_string(index);
+  s = txn2->SetName("xid" + istr2);
+  ASSERT_OK(s);
+  s = txn2->Put(Slice("foo2" + istr2), Slice("bar"));
+  ASSERT_OK(s);
+  s = txn2->Prepare();
+  ASSERT_OK(s);
+  auto prep_seq_2 = txn2->GetId();
+
+  delete txn2;
+  ASSERT_OK(wp_db->db_impl_->FlushWAL(true));
+  wp_db->TEST_Crash();
+  ASSERT_OK(ReOpenNoDelete());
+  ASSERT_NE(db, nullptr);
+  wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  ASSERT_TRUE(wp_db->prepared_txns_.empty());
+  ASSERT_FALSE(wp_db->delayed_prepared_empty_);
+
+  // 0 and 2 are prepared and 1 is committed
+  {
+    ReadLock rl(&wp_db->prepared_mutex_);
+    ASSERT_EQ(2, wp_db->delayed_prepared_.size());
+    const auto& end = wp_db->delayed_prepared_.end();
+    ASSERT_NE(wp_db->delayed_prepared_.find(prep_seq_0), end);
+    ASSERT_EQ(wp_db->delayed_prepared_.find(prep_seq_1), end);
+    ASSERT_NE(wp_db->delayed_prepared_.find(prep_seq_2), end);
+  }
+  ASSERT_LE(prep_seq_0, wp_db->max_evicted_seq_);
+  ASSERT_LE(prep_seq_2, wp_db->max_evicted_seq_);
+
+  // Commit all the remaining txns
+  txn0 = db->GetTransactionByName("xid" + istr0);
+  ASSERT_NE(txn0, nullptr);
+  ASSERT_OK(txn0->Commit());
+  txn2 = db->GetTransactionByName("xid" + istr2);
+  ASSERT_NE(txn2, nullptr);
+  ASSERT_OK(txn2->Commit());
+
+  // Check the value is committed after commit
+  s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val);
+  ASSERT_TRUE(s.ok());
+  ASSERT_TRUE(pinnable_val == ("bar0" + istr0));
+  pinnable_val.Reset();
+
+  delete txn0;
+  delete txn2;
+  ASSERT_OK(wp_db->db_impl_->FlushWAL(true));
+  ASSERT_OK(ReOpenNoDelete());
+  ASSERT_NE(db, nullptr);
+  wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  ASSERT_TRUE(wp_db->prepared_txns_.empty());
+  ASSERT_TRUE(wp_db->delayed_prepared_empty_);
+
+  // Check the value is still committed after recovery
+  s = db->Get(ropt, db->DefaultColumnFamily(), "foo0" + istr0, &pinnable_val);
+  ASSERT_TRUE(s.ok());
+  ASSERT_TRUE(pinnable_val == ("bar0" + istr0));
+  pinnable_val.Reset();
+}
+
+// After recovery the commit map is empty while the max is set. The code would
+// go through a different path which requires a separate test. Test that the
+// committed data before the restart is visible to all snapshots.
+TEST_P(WritePreparedTransactionTest, IsInSnapshotEmptyMap) {
+  for (bool end_with_prepare : {false, true}) {
+    ASSERT_OK(ReOpen());
+    WriteOptions woptions;
+    ASSERT_OK(db->Put(woptions, "key", "value"));
+    ASSERT_OK(db->Put(woptions, "key", "value"));
+    ASSERT_OK(db->Put(woptions, "key", "value"));
+    SequenceNumber prepare_seq = kMaxSequenceNumber;
+    if (end_with_prepare) {
+      TransactionOptions txn_options;
+      Transaction* txn = db->BeginTransaction(woptions, txn_options);
+      ASSERT_OK(txn->SetName("xid0"));
+      ASSERT_OK(txn->Prepare());
+      prepare_seq = txn->GetId();
+      delete txn;
+    }
+    dynamic_cast<WritePreparedTxnDB*>(db)->TEST_Crash();
+    auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    ASSERT_OK(db_impl->FlushWAL(true));
+    ASSERT_OK(ReOpenNoDelete());
+    WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+    ASSERT_NE(wp_db, nullptr);
+    ASSERT_GT(wp_db->max_evicted_seq_, 0);  // max after recovery
+    // Take a snapshot right after recovery
+    const Snapshot* snap = db->GetSnapshot();
+    auto snap_seq = snap->GetSequenceNumber();
+    ASSERT_GT(snap_seq, 0);
+
+    for (SequenceNumber seq = 0;
+         seq <= wp_db->max_evicted_seq_ && seq != prepare_seq; seq++) {
+      ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq));
+    }
+    if (end_with_prepare) {
+      ASSERT_FALSE(wp_db->IsInSnapshot(prepare_seq, snap_seq));
+    }
+    // trivial check
+    ASSERT_FALSE(wp_db->IsInSnapshot(snap_seq + 1, snap_seq));
+
+    db->ReleaseSnapshot(snap);
+
+    ASSERT_OK(db->Put(woptions, "key", "value"));
+    // Take a snapshot after some writes
+    snap = db->GetSnapshot();
+    snap_seq = snap->GetSequenceNumber();
+    for (SequenceNumber seq = 0;
+         seq <= wp_db->max_evicted_seq_ && seq != prepare_seq; seq++) {
+      ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq));
+    }
+    if (end_with_prepare) {
+      ASSERT_FALSE(wp_db->IsInSnapshot(prepare_seq, snap_seq));
+    }
+    // trivial check
+    ASSERT_FALSE(wp_db->IsInSnapshot(snap_seq + 1, snap_seq));
+
+    db->ReleaseSnapshot(snap);
+  }
+}
+
+// Shows the contract of IsInSnapshot when called on invalid/released snapshots
+TEST_P(WritePreparedTransactionTest, IsInSnapshotReleased) {
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  WriteOptions woptions;
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+  // snap seq = 1
+  const Snapshot* snap1 = db->GetSnapshot();
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+  ASSERT_OK(db->Put(woptions, "key", "value"));
+  // snap seq = 3
+  const Snapshot* snap2 = db->GetSnapshot();
+  const SequenceNumber seq = 1;
+  // Evict seq out of commit cache
+  size_t overwrite_seq = wp_db->COMMIT_CACHE_SIZE + seq;
+  wp_db->AddCommitted(overwrite_seq, overwrite_seq);
+  SequenceNumber snap_seq;
+  uint64_t min_uncommitted = kMinUnCommittedSeq;
+  bool released;
+
+  released = false;
+  snap_seq = snap1->GetSequenceNumber();
+  ASSERT_LE(seq, snap_seq);
+  // Valid snapshot lower than max
+  ASSERT_LE(snap_seq, wp_db->max_evicted_seq_);
+  ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+  ASSERT_FALSE(released);
+
+  released = false;
+  snap_seq = snap1->GetSequenceNumber();
+  // Invaid snapshot lower than max
+  ASSERT_LE(snap_seq + 1, wp_db->max_evicted_seq_);
+  ASSERT_TRUE(
+      wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released));
+  ASSERT_TRUE(released);
+
+  db->ReleaseSnapshot(snap1);
+
+  released = false;
+  // Released snapshot lower than max
+  ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+  // The release does not take affect until the next max advance
+  ASSERT_FALSE(released);
+
+  released = false;
+  // Invaid snapshot lower than max
+  ASSERT_TRUE(
+      wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released));
+  ASSERT_TRUE(released);
+
+  // This make the snapshot release to reflect in txn db structures
+  wp_db->AdvanceMaxEvictedSeq(wp_db->max_evicted_seq_,
+                              wp_db->max_evicted_seq_ + 1);
+
+  released = false;
+  // Released snapshot lower than max
+  ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+  ASSERT_TRUE(released);
+
+  released = false;
+  // Invaid snapshot lower than max
+  ASSERT_TRUE(
+      wp_db->IsInSnapshot(seq, snap_seq + 1, min_uncommitted, &released));
+  ASSERT_TRUE(released);
+
+  snap_seq = snap2->GetSequenceNumber();
+
+  released = false;
+  // Unreleased snapshot lower than max
+  ASSERT_TRUE(wp_db->IsInSnapshot(seq, snap_seq, min_uncommitted, &released));
+  ASSERT_FALSE(released);
+
+  db->ReleaseSnapshot(snap2);
+}
+
+// Test WritePreparedTxnDB's IsInSnapshot against different ordering of
+// snapshot, max_committed_seq_, prepared, and commit entries.
+TEST_P(WritePreparedTransactionTest, IsInSnapshot) {
+  WriteOptions wo;
+  // Use small commit cache to trigger lots of eviction and fast advance of
+  // max_evicted_seq_
+  const size_t commit_cache_bits = 3;
+  // Same for snapshot cache size
+  const size_t snapshot_cache_bits = 2;
+
+  // Take some preliminary snapshots first. This is to stress the data structure
+  // that holds the old snapshots as it will be designed to be efficient when
+  // only a few snapshots are below the max_evicted_seq_.
+  for (int max_snapshots = 1; max_snapshots < 20; max_snapshots++) {
+    // Leave some gap between the preliminary snapshots and the final snapshot
+    // that we check. This should test for also different overlapping scenarios
+    // between the last snapshot and the commits.
+    for (int max_gap = 1; max_gap < 10; max_gap++) {
+      // Since we do not actually write to db, we mock the seq as it would be
+      // increased by the db. The only exception is that we need db seq to
+      // advance for our snapshots. for which we apply a dummy put each time we
+      // increase our mock of seq.
+      uint64_t seq = 0;
+      // At each step we prepare a txn and then we commit it in the next txn.
+      // This emulates the consecutive transactions that write to the same key
+      uint64_t cur_txn = 0;
+      // Number of snapshots taken so far
+      int num_snapshots = 0;
+      // Number of gaps applied so far
+      int gap_cnt = 0;
+      // The final snapshot that we will inspect
+      uint64_t snapshot = 0;
+      bool found_committed = false;
+      // To stress the data structure that maintain prepared txns, at each cycle
+      // we add a new prepare txn. These do not mean to be committed for
+      // snapshot inspection.
+      std::set<uint64_t> prepared;
+      // We keep the list of txns committed before we take the last snapshot.
+      // These should be the only seq numbers that will be found in the snapshot
+      std::set<uint64_t> committed_before;
+      // The set of commit seq numbers to be excluded from IsInSnapshot queries
+      std::set<uint64_t> commit_seqs;
+      DBImpl* mock_db = new DBImpl(options, dbname);
+      UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+      std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+          new WritePreparedTxnDBMock(mock_db, txn_db_options));
+      // We continue until max advances a bit beyond the snapshot.
+      while (!snapshot || wp_db->max_evicted_seq_ < snapshot + 100) {
+        // do prepare for a transaction
+        seq++;
+        wp_db->AddPrepared(seq);
+        prepared.insert(seq);
+
+        // If cur_txn is not started, do prepare for it.
+        if (!cur_txn) {
+          seq++;
+          cur_txn = seq;
+          wp_db->AddPrepared(cur_txn);
+        } else {  // else commit it
+          seq++;
+          wp_db->AddCommitted(cur_txn, seq);
+          wp_db->RemovePrepared(cur_txn);
+          commit_seqs.insert(seq);
+          if (!snapshot) {
+            committed_before.insert(cur_txn);
+          }
+          cur_txn = 0;
+        }
+
+        if (num_snapshots < max_snapshots - 1) {
+          // Take preliminary snapshots
+          wp_db->TakeSnapshot(seq);
+          num_snapshots++;
+        } else if (gap_cnt < max_gap) {
+          // Wait for some gap before taking the final snapshot
+          gap_cnt++;
+        } else if (!snapshot) {
+          // Take the final snapshot if it is not already taken
+          snapshot = seq;
+          wp_db->TakeSnapshot(snapshot);
+          num_snapshots++;
+        }
+
+        // If the snapshot is taken, verify seq numbers visible to it. We redo
+        // it at each cycle to test that the system is still sound when
+        // max_evicted_seq_ advances.
+        if (snapshot) {
+          for (uint64_t s = 1;
+               s <= seq && commit_seqs.find(s) == commit_seqs.end(); s++) {
+            bool was_committed =
+                (committed_before.find(s) != committed_before.end());
+            bool is_in_snapshot = wp_db->IsInSnapshot(s, snapshot);
+            if (was_committed != is_in_snapshot) {
+              printf("max_snapshots %d max_gap %d seq %" PRIu64 " max %" PRIu64
+                     " snapshot %" PRIu64
+                     " gap_cnt %d num_snapshots %d s %" PRIu64 "\n",
+                     max_snapshots, max_gap, seq,
+                     wp_db->max_evicted_seq_.load(), snapshot, gap_cnt,
+                     num_snapshots, s);
+            }
+            ASSERT_EQ(was_committed, is_in_snapshot);
+            found_committed = found_committed || is_in_snapshot;
+          }
+        }
+      }
+      // Safety check to make sure the test actually ran
+      ASSERT_TRUE(found_committed);
+      // As an extra check, check if prepared set will be properly empty after
+      // they are committed.
+      if (cur_txn) {
+        wp_db->AddCommitted(cur_txn, seq);
+        wp_db->RemovePrepared(cur_txn);
+      }
+      for (auto p : prepared) {
+        wp_db->AddCommitted(p, seq);
+        wp_db->RemovePrepared(p);
+      }
+      ASSERT_TRUE(wp_db->delayed_prepared_.empty());
+      ASSERT_TRUE(wp_db->prepared_txns_.empty());
+    }
+  }
+}
+
+void ASSERT_SAME(ReadOptions roptions, TransactionDB* db, Status exp_s,
+                 PinnableSlice& exp_v, Slice key) {
+  Status s;
+  PinnableSlice v;
+  s = db->Get(roptions, db->DefaultColumnFamily(), key, &v);
+  ASSERT_EQ(exp_s, s);
+  ASSERT_TRUE(s.ok() || s.IsNotFound());
+  if (s.ok()) {
+    ASSERT_TRUE(exp_v == v);
+  }
+
+  // Try with MultiGet API too
+  std::vector<std::string> values;
+  auto s_vec =
+      db->MultiGet(roptions, {db->DefaultColumnFamily()}, {key}, &values);
+  ASSERT_EQ(1, values.size());
+  ASSERT_EQ(1, s_vec.size());
+  s = s_vec[0];
+  ASSERT_EQ(exp_s, s);
+  ASSERT_TRUE(s.ok() || s.IsNotFound());
+  if (s.ok()) {
+    ASSERT_TRUE(exp_v == values[0]);
+  }
+}
+
+void ASSERT_SAME(TransactionDB* db, Status exp_s, PinnableSlice& exp_v,
+                 Slice key) {
+  ASSERT_SAME(ReadOptions(), db, exp_s, exp_v, key);
+}
+
+TEST_P(WritePreparedTransactionTest, Rollback) {
+  ReadOptions roptions;
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  const size_t num_keys = 4;
+  const size_t num_values = 5;
+  for (size_t ikey = 1; ikey <= num_keys; ikey++) {
+    for (size_t ivalue = 0; ivalue < num_values; ivalue++) {
+      for (bool crash : {false, true}) {
+        ASSERT_OK(ReOpen());
+        WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+        std::string key_str = "key" + std::to_string(ikey);
+        switch (ivalue) {
+          case 0:
+            break;
+          case 1:
+            ASSERT_OK(db->Put(woptions, key_str, "initvalue1"));
+            break;
+          case 2:
+            ASSERT_OK(db->Merge(woptions, key_str, "initvalue2"));
+            break;
+          case 3:
+            ASSERT_OK(db->Delete(woptions, key_str));
+            break;
+          case 4:
+            ASSERT_OK(db->SingleDelete(woptions, key_str));
+            break;
+          default:
+            FAIL();
+        }
+
+        PinnableSlice v1;
+        auto s1 =
+            db->Get(roptions, db->DefaultColumnFamily(), Slice("key1"), &v1);
+        PinnableSlice v2;
+        auto s2 =
+            db->Get(roptions, db->DefaultColumnFamily(), Slice("key2"), &v2);
+        PinnableSlice v3;
+        auto s3 =
+            db->Get(roptions, db->DefaultColumnFamily(), Slice("key3"), &v3);
+        PinnableSlice v4;
+        auto s4 =
+            db->Get(roptions, db->DefaultColumnFamily(), Slice("key4"), &v4);
+        Transaction* txn = db->BeginTransaction(woptions, txn_options);
+        auto s = txn->SetName("xid0");
+        ASSERT_OK(s);
+        s = txn->Put(Slice("key1"), Slice("value1"));
+        ASSERT_OK(s);
+        s = txn->Merge(Slice("key2"), Slice("value2"));
+        ASSERT_OK(s);
+        s = txn->Delete(Slice("key3"));
+        ASSERT_OK(s);
+        s = txn->SingleDelete(Slice("key4"));
+        ASSERT_OK(s);
+        s = txn->Prepare();
+        ASSERT_OK(s);
+
+        {
+          ReadLock rl(&wp_db->prepared_mutex_);
+          ASSERT_FALSE(wp_db->prepared_txns_.empty());
+          ASSERT_EQ(txn->GetId(), wp_db->prepared_txns_.top());
+        }
+
+        ASSERT_SAME(db, s1, v1, "key1");
+        ASSERT_SAME(db, s2, v2, "key2");
+        ASSERT_SAME(db, s3, v3, "key3");
+        ASSERT_SAME(db, s4, v4, "key4");
+
+        if (crash) {
+          delete txn;
+          auto db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+          ASSERT_OK(db_impl->FlushWAL(true));
+          dynamic_cast<WritePreparedTxnDB*>(db)->TEST_Crash();
+          ASSERT_OK(ReOpenNoDelete());
+          ASSERT_NE(db, nullptr);
+          wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+          txn = db->GetTransactionByName("xid0");
+          ASSERT_FALSE(wp_db->delayed_prepared_empty_);
+          ReadLock rl(&wp_db->prepared_mutex_);
+          ASSERT_TRUE(wp_db->prepared_txns_.empty());
+          ASSERT_FALSE(wp_db->delayed_prepared_.empty());
+          ASSERT_TRUE(wp_db->delayed_prepared_.find(txn->GetId()) !=
+                      wp_db->delayed_prepared_.end());
+        }
+
+        ASSERT_SAME(db, s1, v1, "key1");
+        ASSERT_SAME(db, s2, v2, "key2");
+        ASSERT_SAME(db, s3, v3, "key3");
+        ASSERT_SAME(db, s4, v4, "key4");
+
+        s = txn->Rollback();
+        ASSERT_OK(s);
+
+        {
+          ASSERT_TRUE(wp_db->delayed_prepared_empty_);
+          ReadLock rl(&wp_db->prepared_mutex_);
+          ASSERT_TRUE(wp_db->prepared_txns_.empty());
+          ASSERT_TRUE(wp_db->delayed_prepared_.empty());
+        }
+
+        ASSERT_SAME(db, s1, v1, "key1");
+        ASSERT_SAME(db, s2, v2, "key2");
+        ASSERT_SAME(db, s3, v3, "key3");
+        ASSERT_SAME(db, s4, v4, "key4");
+        delete txn;
+      }
+    }
+  }
+}
+
+TEST_P(WritePreparedTransactionTest, DisableGCDuringRecovery) {
+  // Use large buffer to avoid memtable flush after 1024 insertions
+  options.write_buffer_size = 1024 * 1024;
+  ASSERT_OK(ReOpen());
+  std::vector<KeyVersion> versions;
+  uint64_t seq = 0;
+  for (uint64_t i = 1; i <= 1024; i++) {
+    std::string v = "bar" + std::to_string(i);
+    ASSERT_OK(db->Put(WriteOptions(), "foo", v));
+    VerifyKeys({{"foo", v}});
+    seq++;  // one for the key/value
+    KeyVersion kv = {"foo", v, seq, kTypeValue};
+    if (options.two_write_queues) {
+      seq++;  // one for the commit
+    }
+    versions.emplace_back(kv);
+  }
+  std::reverse(std::begin(versions), std::end(versions));
+  VerifyInternalKeys(versions);
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  ASSERT_OK(db_impl->FlushWAL(true));
+  // Use small buffer to ensure memtable flush during recovery
+  options.write_buffer_size = 1024;
+  ASSERT_OK(ReOpenNoDelete());
+  VerifyInternalKeys(versions);
+}
+
+TEST_P(WritePreparedTransactionTest, SequenceNumberZero) {
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "bar"));
+  VerifyKeys({{"foo", "bar"}});
+  const Snapshot* snapshot = db->GetSnapshot();
+  ASSERT_OK(db->Flush(FlushOptions()));
+  // Dummy keys to avoid compaction trivially move files and get around actual
+  // compaction logic.
+  ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Compaction will output keys with sequence number 0, if it is visible to
+  // earliest snapshot. Make sure IsInSnapshot() report sequence number 0 is
+  // visible to any snapshot.
+  VerifyKeys({{"foo", "bar"}});
+  VerifyKeys({{"foo", "bar"}}, snapshot);
+  VerifyInternalKeys({{"foo", "bar", 0, kTypeValue}});
+  db->ReleaseSnapshot(snapshot);
+}
+
+// Compaction should not remove a key if it is not committed, and should
+// proceed with older versions of the key as-if the new version doesn't exist.
+TEST_P(WritePreparedTransactionTest, CompactionShouldKeepUncommittedKeys) {
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  // Snapshots to avoid keys get evicted.
+  std::vector<const Snapshot*> snapshots;
+  // Keep track of expected sequence number.
+  SequenceNumber expected_seq = 0;
+
+  auto add_key = [&](std::function<Status()> func) {
+    ASSERT_OK(func());
+    expected_seq++;
+    if (options.two_write_queues) {
+      expected_seq++;  // 1 for commit
+    }
+    ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+    snapshots.push_back(db->GetSnapshot());
+  };
+
+  // Each key here represent a standalone test case.
+  add_key([&]() { return db->Put(WriteOptions(), "key1", "value1_1"); });
+  add_key([&]() { return db->Put(WriteOptions(), "key2", "value2_1"); });
+  add_key([&]() { return db->Put(WriteOptions(), "key3", "value3_1"); });
+  add_key([&]() { return db->Put(WriteOptions(), "key4", "value4_1"); });
+  add_key([&]() { return db->Merge(WriteOptions(), "key5", "value5_1"); });
+  add_key([&]() { return db->Merge(WriteOptions(), "key5", "value5_2"); });
+  add_key([&]() { return db->Put(WriteOptions(), "key6", "value6_1"); });
+  add_key([&]() { return db->Put(WriteOptions(), "key7", "value7_1"); });
+  ASSERT_OK(db->Flush(FlushOptions()));
+  add_key([&]() { return db->Delete(WriteOptions(), "key6"); });
+  add_key([&]() { return db->SingleDelete(WriteOptions(), "key7"); });
+
+  auto* transaction = db->BeginTransaction(WriteOptions());
+  ASSERT_OK(transaction->SetName("txn"));
+  ASSERT_OK(transaction->Put("key1", "value1_2"));
+  ASSERT_OK(transaction->Delete("key2"));
+  ASSERT_OK(transaction->SingleDelete("key3"));
+  ASSERT_OK(transaction->Merge("key4", "value4_2"));
+  ASSERT_OK(transaction->Merge("key5", "value5_3"));
+  ASSERT_OK(transaction->Put("key6", "value6_2"));
+  ASSERT_OK(transaction->Put("key7", "value7_2"));
+  // Prepare but not commit.
+  ASSERT_OK(transaction->Prepare());
+  ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+  ASSERT_OK(db->Flush(FlushOptions()));
+  for (auto* s : snapshots) {
+    db->ReleaseSnapshot(s);
+  }
+  // Dummy keys to avoid compaction trivially move files and get around actual
+  // compaction logic.
+  ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  VerifyKeys({
+      {"key1", "value1_1"},
+      {"key2", "value2_1"},
+      {"key3", "value3_1"},
+      {"key4", "value4_1"},
+      {"key5", "value5_1,value5_2"},
+      {"key6", "NOT_FOUND"},
+      {"key7", "NOT_FOUND"},
+  });
+  VerifyInternalKeys({
+      {"key1", "value1_2", expected_seq, kTypeValue},
+      {"key1", "value1_1", 0, kTypeValue},
+      {"key2", "", expected_seq, kTypeDeletion},
+      {"key2", "value2_1", 0, kTypeValue},
+      {"key3", "", expected_seq, kTypeSingleDeletion},
+      {"key3", "value3_1", 0, kTypeValue},
+      {"key4", "value4_2", expected_seq, kTypeMerge},
+      {"key4", "value4_1", 0, kTypeValue},
+      {"key5", "value5_3", expected_seq, kTypeMerge},
+      {"key5", "value5_1,value5_2", 0, kTypeValue},
+      {"key6", "value6_2", expected_seq, kTypeValue},
+      {"key7", "value7_2", expected_seq, kTypeValue},
+  });
+  ASSERT_OK(transaction->Commit());
+  VerifyKeys({
+      {"key1", "value1_2"},
+      {"key2", "NOT_FOUND"},
+      {"key3", "NOT_FOUND"},
+      {"key4", "value4_1,value4_2"},
+      {"key5", "value5_1,value5_2,value5_3"},
+      {"key6", "value6_2"},
+      {"key7", "value7_2"},
+  });
+  delete transaction;
+}
+
+// Compaction should keep keys visible to a snapshot based on commit sequence,
+// not just prepare sequence.
+TEST_P(WritePreparedTransactionTest, CompactionShouldKeepSnapshotVisibleKeys) {
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+  // Keep track of expected sequence number.
+  SequenceNumber expected_seq = 0;
+  auto* txn1 = db->BeginTransaction(WriteOptions());
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn1->Put("key1", "value1_1"));
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+  ASSERT_OK(txn1->Commit());
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
+  delete txn1;
+  // Take a snapshots to avoid keys get evicted before compaction.
+  const Snapshot* snapshot1 = db->GetSnapshot();
+  auto* txn2 = db->BeginTransaction(WriteOptions());
+  ASSERT_OK(txn2->SetName("txn2"));
+  ASSERT_OK(txn2->Put("key2", "value2_1"));
+  ASSERT_OK(txn2->Prepare());
+  ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+  // txn1 commit before snapshot2 and it is visible to snapshot2.
+  // txn2 commit after snapshot2 and it is not visible.
+  const Snapshot* snapshot2 = db->GetSnapshot();
+  ASSERT_OK(txn2->Commit());
+  ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
+  delete txn2;
+  // Take a snapshots to avoid keys get evicted before compaction.
+  const Snapshot* snapshot3 = db->GetSnapshot();
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_2"));
+  expected_seq++;  // 1 for write
+  SequenceNumber seq1 = expected_seq;
+  if (options.two_write_queues) {
+    expected_seq++;  // 1 for commit
+  }
+  ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+  ASSERT_OK(db->Put(WriteOptions(), "key2", "value2_2"));
+  expected_seq++;  // 1 for write
+  SequenceNumber seq2 = expected_seq;
+  if (options.two_write_queues) {
+    expected_seq++;  // 1 for commit
+  }
+  ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+  ASSERT_OK(db->Flush(FlushOptions()));
+  db->ReleaseSnapshot(snapshot1);
+  db->ReleaseSnapshot(snapshot3);
+  // Dummy keys to avoid compaction trivially move files and get around actual
+  // compaction logic.
+  ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  VerifyKeys({{"key1", "value1_2"}, {"key2", "value2_2"}});
+  VerifyKeys({{"key1", "value1_1"}, {"key2", "NOT_FOUND"}}, snapshot2);
+  VerifyInternalKeys({
+      {"key1", "value1_2", seq1, kTypeValue},
+      // "value1_1" is visible to snapshot2. Also keys at bottom level visible
+      // to earliest snapshot will output with seq = 0.
+      {"key1", "value1_1", 0, kTypeValue},
+      {"key2", "value2_2", seq2, kTypeValue},
+  });
+  db->ReleaseSnapshot(snapshot2);
+}
+
+TEST_P(WritePreparedTransactionTest, SmallestUncommittedOptimization) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // disable commit cache
+  for (bool has_recent_prepare : {true, false}) {
+    UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+    ASSERT_OK(ReOpen());
+
+    ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+    auto* transaction =
+        db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+    ASSERT_OK(transaction->SetName("txn"));
+    ASSERT_OK(transaction->Delete("key1"));
+    ASSERT_OK(transaction->Prepare());
+    // snapshot1 should get min_uncommitted from prepared_txns_ heap.
+    auto snapshot1 = db->GetSnapshot();
+    ASSERT_EQ(transaction->GetId(),
+              ((SnapshotImpl*)snapshot1)->min_uncommitted_);
+    // Add a commit to advance max_evicted_seq and move the prepared transaction
+    // into delayed_prepared_ set.
+    ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+    Transaction* txn2 = nullptr;
+    if (has_recent_prepare) {
+      txn2 =
+          db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+      ASSERT_OK(txn2->SetName("txn2"));
+      ASSERT_OK(txn2->Put("key3", "value3"));
+      ASSERT_OK(txn2->Prepare());
+    }
+    // snapshot2 should get min_uncommitted from delayed_prepared_ set.
+    auto snapshot2 = db->GetSnapshot();
+    ASSERT_EQ(transaction->GetId(),
+              ((SnapshotImpl*)snapshot1)->min_uncommitted_);
+    ASSERT_OK(transaction->Commit());
+    delete transaction;
+    if (has_recent_prepare) {
+      ASSERT_OK(txn2->Commit());
+      delete txn2;
+    }
+    VerifyKeys({{"key1", "NOT_FOUND"}});
+    VerifyKeys({{"key1", "value1"}}, snapshot1);
+    VerifyKeys({{"key1", "value1"}}, snapshot2);
+    db->ReleaseSnapshot(snapshot1);
+    db->ReleaseSnapshot(snapshot2);
+  }
+}
+
+// Insert two values, v1 and v2, for a key. Between prepare and commit of v2
+// take two snapshots, s1 and s2. Release s1 during compaction.
+// Test to make sure compaction doesn't get confused and think s1 can see both
+// values, and thus compact out the older value by mistake.
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_1"));
+  auto* transaction =
+      db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+  ASSERT_OK(transaction->SetName("txn"));
+  ASSERT_OK(transaction->Put("key1", "value1_2"));
+  ASSERT_OK(transaction->Prepare());
+  auto snapshot1 = db->GetSnapshot();
+  // Increment sequence number.
+  ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+  auto snapshot2 = db->GetSnapshot();
+  ASSERT_OK(transaction->Commit());
+  delete transaction;
+  VerifyKeys({{"key1", "value1_2"}});
+  VerifyKeys({{"key1", "value1_1"}}, snapshot1);
+  VerifyKeys({{"key1", "value1_1"}}, snapshot2);
+  // Add a flush to avoid compaction to fallback to trivial move.
+
+  // The callback might be called twice, record the calling state to
+  // prevent double calling.
+  bool callback_finished = false;
+  auto callback = [&](void*) {
+    if (callback_finished) {
+      return;
+    }
+    // Release snapshot1 after CompactionIterator init.
+    // CompactionIterator need to figure out the earliest snapshot
+    // that can see key1:value1_2 is kMaxSequenceNumber, not
+    // snapshot1 or snapshot2.
+    db->ReleaseSnapshot(snapshot1);
+    // Add some keys to advance max_evicted_seq.
+    ASSERT_OK(db->Put(WriteOptions(), "key3", "value3"));
+    ASSERT_OK(db->Put(WriteOptions(), "key4", "value4"));
+    callback_finished = true;
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+  VerifyKeys({{"key1", "value1_2"}});
+  VerifyKeys({{"key1", "value1_1"}}, snapshot2);
+  db->ReleaseSnapshot(snapshot2);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Insert two values, v1 and v2, for a key. Take two snapshots, s1 and s2,
+// after committing v2. Release s1 during compaction, right after compaction
+// processes v2 and before processes v1. Test to make sure compaction doesn't
+// get confused and believe v1 and v2 are visible to different snapshot
+// (v1 by s2, v2 by s1) and refuse to compact out v1.
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction2) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value2"));
+  SequenceNumber v2_seq = db->GetLatestSequenceNumber();
+  auto* s1 = db->GetSnapshot();
+  // Advance sequence number.
+  ASSERT_OK(db->Put(WriteOptions(), "key2", "dummy"));
+  auto* s2 = db->GetSnapshot();
+
+  int count_value = 0;
+  auto callback = [&](void* arg) {
+    auto* ikey = reinterpret_cast<ParsedInternalKey*>(arg);
+    if (ikey->user_key == "key1") {
+      count_value++;
+      if (count_value == 2) {
+        // Processing v1.
+        db->ReleaseSnapshot(s1);
+        // Add some keys to advance max_evicted_seq and update
+        // old_commit_map.
+        ASSERT_OK(db->Put(WriteOptions(), "key3", "dummy"));
+        ASSERT_OK(db->Put(WriteOptions(), "key4", "dummy"));
+      }
+    }
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:ProcessKV",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+  // value1 should be compact out.
+  VerifyInternalKeys({{"key1", "value2", v2_seq, kTypeValue}});
+
+  // cleanup
+  db->ReleaseSnapshot(s2);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Insert two values, v1 and v2, for a key. Insert another dummy key
+// so to evict the commit cache for v2, while v1 is still in commit cache.
+// Take two snapshots, s1 and s2. Release s1 during compaction.
+// Since commit cache for v2 is evicted, and old_commit_map don't have
+// s1 (it is released),
+// TODO(myabandeh): how can we be sure that the v2's commit info is evicted
+// (and not v1's)? Instead of putting a dummy, we can directly call
+// AddCommitted(v2_seq + cache_size, ...) to evict v2's entry from commit cache.
+TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction3) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 1;    // commit cache size = 2
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  // Add a dummy key to evict v2 commit cache, but keep v1 commit cache.
+  // It also advance max_evicted_seq and can trigger old_commit_map cleanup.
+  auto add_dummy = [&]() {
+    auto* txn_dummy =
+        db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+    ASSERT_OK(txn_dummy->SetName("txn_dummy"));
+    ASSERT_OK(txn_dummy->Put("dummy", "dummy"));
+    ASSERT_OK(txn_dummy->Prepare());
+    ASSERT_OK(txn_dummy->Commit());
+    delete txn_dummy;
+  };
+
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+  auto* txn =
+      db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+  ASSERT_OK(txn->SetName("txn"));
+  ASSERT_OK(txn->Put("key1", "value2"));
+  ASSERT_OK(txn->Prepare());
+  // TODO(myabandeh): replace it with GetId()?
+  auto v2_seq = db->GetLatestSequenceNumber();
+  ASSERT_OK(txn->Commit());
+  delete txn;
+  auto* s1 = db->GetSnapshot();
+  // Dummy key to advance sequence number.
+  add_dummy();
+  auto* s2 = db->GetSnapshot();
+
+  // The callback might be called twice, record the calling state to
+  // prevent double calling.
+  bool callback_finished = false;
+  auto callback = [&](void*) {
+    if (callback_finished) {
+      return;
+    }
+    db->ReleaseSnapshot(s1);
+    // Add some dummy entries to trigger s1 being cleanup from old_commit_map.
+    add_dummy();
+    add_dummy();
+    callback_finished = true;
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+  // value1 should be compact out.
+  VerifyInternalKeys({{"key1", "value2", v2_seq, kTypeValue}});
+
+  db->ReleaseSnapshot(s2);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotDuringCompaction) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+  SequenceNumber put_seq = db->GetLatestSequenceNumber();
+  auto* transaction =
+      db->BeginTransaction(WriteOptions(), TransactionOptions(), nullptr);
+  ASSERT_OK(transaction->SetName("txn"));
+  ASSERT_OK(transaction->Delete("key1"));
+  ASSERT_OK(transaction->Prepare());
+  SequenceNumber del_seq = db->GetLatestSequenceNumber();
+  auto snapshot1 = db->GetSnapshot();
+  // Increment sequence number.
+  ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+  auto snapshot2 = db->GetSnapshot();
+  ASSERT_OK(transaction->Commit());
+  delete transaction;
+  VerifyKeys({{"key1", "NOT_FOUND"}});
+  VerifyKeys({{"key1", "value1"}}, snapshot1);
+  VerifyKeys({{"key1", "value1"}}, snapshot2);
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  auto callback = [&](void* compaction) {
+    // Release snapshot1 after CompactionIterator init.
+    // CompactionIterator need to double check and find out snapshot2 is now
+    // the earliest existing snapshot.
+    if (compaction != nullptr) {
+      db->ReleaseSnapshot(snapshot1);
+      // Add some keys to advance max_evicted_seq.
+      ASSERT_OK(db->Put(WriteOptions(), "key3", "value3"));
+      ASSERT_OK(db->Put(WriteOptions(), "key4", "value4"));
+    }
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Dummy keys to avoid compaction trivially move files and get around actual
+  // compaction logic.
+  ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Only verify for key1. Both the put and delete for the key should be kept.
+  // Since the delete tombstone is not visible to snapshot2, we need to keep
+  // at least one version of the key, for write-conflict check.
+  VerifyInternalKeys({{"key1", "", del_seq, kTypeDeletion},
+                      {"key1", "value1", put_seq, kTypeValue}});
+  db->ReleaseSnapshot(snapshot2);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+       ReleaseEarliestSnapshotDuringCompaction_WithSD) {
+  constexpr size_t kSnapshotCacheBits = 7;  // same as default
+  constexpr size_t kCommitCacheBits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "key", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+                                   /*old_txn=*/nullptr);
+  ASSERT_OK(txn->SingleDelete("key"));
+  ASSERT_OK(txn->Put("wow", "value"));
+  ASSERT_OK(txn->SetName("txn"));
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  const bool two_write_queues = std::get<1>(GetParam());
+  if (two_write_queues) {
+    // In the case of two queues, commit another txn just to bump
+    // last_published_seq so that a subsequent GetSnapshot() call can return
+    // a snapshot with higher sequence.
+    auto* dummy_txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+                                           /*old_txn=*/nullptr);
+    ASSERT_OK(dummy_txn->Put("haha", "value"));
+    ASSERT_OK(dummy_txn->Commit());
+    delete dummy_txn;
+  }
+  auto* snapshot = db->GetSnapshot();
+
+  ASSERT_OK(txn->Commit());
+  delete txn;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) {
+        if (!arg) {
+          return;
+        }
+        db->ReleaseSnapshot(snapshot);
+
+        // Advance max_evicted_seq
+        ASSERT_OK(db->Put(WriteOptions(), "bar", "value"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                             /*end=*/nullptr));
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+       ReleaseEarliestSnapshotDuringCompaction_WithSD2) {
+  constexpr size_t kSnapshotCacheBits = 7;  // same as default
+  constexpr size_t kCommitCacheBits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "key", "value"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+                                   /*old_txn=*/nullptr);
+  ASSERT_OK(txn->Put("bar", "value"));
+  ASSERT_OK(txn->SingleDelete("key"));
+  ASSERT_OK(txn->SetName("txn"));
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  ASSERT_OK(txn->Commit());
+  delete txn;
+
+  ASSERT_OK(db->Put(WriteOptions(), "haha", "value"));
+
+  // Create a dummy transaction to take a snapshot for ww-conflict detection.
+  TransactionOptions txn_opts;
+  txn_opts.set_snapshot = true;
+  auto* dummy_txn =
+      db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::NextFromInput:SingleDelete:2", [&](void* /*arg*/) {
+        ASSERT_OK(dummy_txn->Rollback());
+        delete dummy_txn;
+
+        ASSERT_OK(db->Put(WriteOptions(), "dontcare", "value"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->Put(WriteOptions(), "haha2", "value"));
+  auto* snapshot = db->GetSnapshot();
+
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  db->ReleaseSnapshot(snapshot);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+       ReleaseEarliestSnapshotDuringCompaction_WithDelete) {
+  constexpr size_t kSnapshotCacheBits = 7;  // same as default
+  constexpr size_t kCommitCacheBits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "a", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "b", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "c", "value"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  auto* txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+                                   /*old_txn=*/nullptr);
+  ASSERT_OK(txn->Delete("b"));
+  ASSERT_OK(txn->SetName("txn"));
+  ASSERT_OK(txn->Prepare());
+
+  const bool two_write_queues = std::get<1>(GetParam());
+  if (two_write_queues) {
+    // In the case of two queues, commit another txn just to bump
+    // last_published_seq so that a subsequent GetSnapshot() call can return
+    // a snapshot with higher sequence.
+    auto* dummy_txn = db->BeginTransaction(WriteOptions(), TransactionOptions(),
+                                           /*old_txn=*/nullptr);
+    ASSERT_OK(dummy_txn->Put("haha", "value"));
+    ASSERT_OK(dummy_txn->Commit());
+    delete dummy_txn;
+  }
+  auto* snapshot1 = db->GetSnapshot();
+  ASSERT_OK(txn->Commit());
+  delete txn;
+  auto* snapshot2 = db->GetSnapshot();
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::NextFromInput:BottommostDelete:1", [&](void* arg) {
+        if (!arg) {
+          return;
+        }
+        db->ReleaseSnapshot(snapshot1);
+
+        // Advance max_evicted_seq
+        ASSERT_OK(db->Put(WriteOptions(), "dummy1", "value"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                             /*end=*/nullptr));
+  db->ReleaseSnapshot(snapshot2);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+       ReleaseSnapshotBetweenSDAndPutDuringCompaction) {
+  constexpr size_t kSnapshotCacheBits = 7;  // same as default
+  constexpr size_t kCommitCacheBits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  // Create a dummy transaction to take a snapshot for ww-conflict detection.
+  TransactionOptions txn_opts;
+  txn_opts.set_snapshot = true;
+  auto* dummy_txn =
+      db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr);
+  // Increment seq
+  ASSERT_OK(db->Put(WriteOptions(), "bar", "value"));
+
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+  ASSERT_OK(db->SingleDelete(WriteOptions(), "foo"));
+  auto* snapshot1 = db->GetSnapshot();
+  // Increment seq
+  ASSERT_OK(db->Put(WriteOptions(), "dontcare", "value"));
+  auto* snapshot2 = db->GetSnapshot();
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::NextFromInput:KeepSDForWW", [&](void* /*arg*/) {
+        db->ReleaseSnapshot(snapshot1);
+
+        ASSERT_OK(db->Put(WriteOptions(), "dontcare2", "value2"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+  db->ReleaseSnapshot(snapshot2);
+  ASSERT_OK(dummy_txn->Commit());
+  delete dummy_txn;
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest,
+       ReleaseEarliestWriteConflictSnapshot_SingleDelete) {
+  constexpr size_t kSnapshotCacheBits = 7;  // same as default
+  constexpr size_t kCommitCacheBits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "a", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "b", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "c", "value"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+  }
+
+  std::unique_ptr<Transaction> txn;
+  txn.reset(db->BeginTransaction(WriteOptions(), TransactionOptions(),
+                                 /*old_txn=*/nullptr));
+  ASSERT_OK(txn->SetName("txn1"));
+  ASSERT_OK(txn->SingleDelete("b"));
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+
+  auto* snapshot1 = db->GetSnapshot();
+
+  // Bump seq of the db by performing writes so that
+  // earliest_snapshot_ < earliest_write_conflict_snapshot_ in
+  // CompactionIterator.
+  ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare"));
+
+  // Create another snapshot for write conflict checking
+  std::unique_ptr<Transaction> txn2;
+  {
+    TransactionOptions txn_opts;
+    txn_opts.set_snapshot = true;
+    txn2.reset(
+        db->BeginTransaction(WriteOptions(), txn_opts, /*old_txn=*/nullptr));
+  }
+
+  // Bump seq so that the subsequent bg flush won't create a snapshot with the
+  // same seq as the previous snapshot for conflict checking.
+  ASSERT_OK(db->Put(WriteOptions(), "y", "dont"));
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* /*arg*/) {
+        // Rolling back txn2 should release its snapshot(for ww checking).
+        ASSERT_OK(txn2->Rollback());
+        txn2.reset();
+        // Advance max_evicted_seq
+        ASSERT_OK(db->Put(WriteOptions(), "x", "value"));
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                             /*end=*/nullptr));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  db->ReleaseSnapshot(snapshot1);
+}
+
+TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing) {
+  constexpr size_t kSnapshotCacheBits = 7;  // same as default
+  constexpr size_t kCommitCacheBits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  ASSERT_OK(db->Put(WriteOptions(), "a", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "b", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "c", "value"));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+  }
+
+  ASSERT_OK(db->SingleDelete(WriteOptions(), "b"));
+
+  // Take a snapshot so that the SD won't be dropped during flush.
+  auto* tmp_snapshot = db->GetSnapshot();
+
+  ASSERT_OK(db->Put(WriteOptions(), "b", "value2"));
+  auto* snapshot = db->GetSnapshot();
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  db->ReleaseSnapshot(tmp_snapshot);
+
+  // Bump the sequence so that the below bg compaction job's snapshot will be
+  // different from snapshot's sequence.
+  ASSERT_OK(db->Put(WriteOptions(), "z", "foo"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) {
+        const auto* const ikey =
+            reinterpret_cast<const ParsedInternalKey*>(arg);
+        assert(ikey);
+        if (ikey->user_key == "b") {
+          assert(ikey->type == kTypeValue);
+          db->ReleaseSnapshot(snapshot);
+
+          // Bump max_evicted_seq.
+          ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare"));
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                             /*end=*/nullptr));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing2) {
+  constexpr size_t kSnapshotCacheBits = 7;  // same as default
+  constexpr size_t kCommitCacheBits = 0;    // minimum commit cache
+  UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  // Generate an L0 with only SD for one key "b".
+  ASSERT_OK(db->Put(WriteOptions(), "a", "value"));
+  ASSERT_OK(db->Put(WriteOptions(), "b", "value"));
+  // Take a snapshot so that subsequent flush outputs the SD for "b".
+  auto* tmp_snapshot = db->GetSnapshot();
+  ASSERT_OK(db->SingleDelete(WriteOptions(), "b"));
+  ASSERT_OK(db->Put(WriteOptions(), "c", "value"));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::NextFromInput:SingleDelete:3", [&](void* arg) {
+        if (!arg) {
+          db->ReleaseSnapshot(tmp_snapshot);
+          // Bump max_evicted_seq
+          ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare"));
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->Flush(FlushOptions()));
+  // Finish generating L0 with only SD for "b".
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Move the L0 to L2.
+  {
+    CompactRangeOptions cro;
+    cro.change_level = true;
+    cro.target_level = 2;
+    ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+  }
+
+  ASSERT_OK(db->Put(WriteOptions(), "b", "value1"));
+
+  auto* snapshot = db->GetSnapshot();
+
+  // Bump seq so that a subsequent flush/compaction job's snapshot is larger
+  // than the above snapshot's seq.
+  ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare"));
+
+  // Generate a second L0.
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) {
+        const auto* const ikey =
+            reinterpret_cast<const ParsedInternalKey*>(arg);
+        assert(ikey);
+        if (ikey->user_key == "b") {
+          assert(ikey->type == kTypeValue);
+          db->ReleaseSnapshot(snapshot);
+
+          // Bump max_evicted_seq.
+          ASSERT_OK(db->Put(WriteOptions(), "z", "dontcare"));
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+                             /*end=*/nullptr));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Although the user-contract indicates that a SD can only be issued for a key
+// that exists and has not been overwritten, it is still possible for a Delete
+// to be present when write-prepared transaction is rolled back.
+TEST_P(WritePreparedTransactionTest, SingleDeleteAfterRollback) {
+  constexpr size_t kSnapshotCacheBits = 7;  // same as default
+  constexpr size_t kCommitCacheBits = 0;    // minimum commit cache
+  txn_db_options.rollback_deletion_type_callback =
+      [](TransactionDB*, ColumnFamilyHandle*, const Slice&) { return true; };
+  UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  // Get a write conflict snapshot by creating a transaction with
+  // set_snapshot=true.
+  TransactionOptions txn_opts;
+  txn_opts.set_snapshot = true;
+  std::unique_ptr<Transaction> dummy_txn(
+      db->BeginTransaction(WriteOptions(), txn_opts));
+
+  std::unique_ptr<Transaction> txn0(
+      db->BeginTransaction(WriteOptions(), TransactionOptions()));
+  ASSERT_OK(txn0->Put("foo", "value"));
+  ASSERT_OK(txn0->SetName("xid0"));
+  ASSERT_OK(txn0->Prepare());
+
+  // Create an SST with only {"foo": "value"}.
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Insert a Delete to cancel out the prior Put by txn0.
+  ASSERT_OK(txn0->Rollback());
+  txn0.reset();
+
+  // Create a second SST.
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "value1"));
+
+  auto* snapshot = db->GetSnapshot();
+
+  ASSERT_OK(db->SingleDelete(WriteOptions(), "foo"));
+
+  int count = 0;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) {
+        const auto* const c = reinterpret_cast<const Compaction*>(arg);
+        assert(!c);
+        // Trigger once only for SingleDelete during flush.
+        if (0 == count) {
+          ++count;
+          db->ReleaseSnapshot(snapshot);
+          // Bump max_evicted_seq
+          ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare"));
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Create a third SST containing a SD without its matching PUT.
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  DBImpl* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  assert(dbimpl);
+  ASSERT_OK(dbimpl->TEST_CompactRange(
+      /*level=*/0, /*begin=*/nullptr, /*end=*/nullptr,
+      /*column_family=*/nullptr, /*disallow_trivial_mode=*/true));
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Release the conflict-checking snapshot.
+  ASSERT_OK(dummy_txn->Rollback());
+}
+
+// A more complex test to verify compaction/flush should keep keys visible
+// to snapshots.
+TEST_P(WritePreparedTransactionTest,
+       CompactionKeepSnapshotVisibleKeysRandomized) {
+  constexpr size_t kNumTransactions = 10;
+  constexpr size_t kNumIterations = 1000;
+
+  std::vector<Transaction*> transactions(kNumTransactions, nullptr);
+  std::vector<size_t> versions(kNumTransactions, 0);
+  std::unordered_map<std::string, std::string> current_data;
+  std::vector<const Snapshot*> snapshots;
+  std::vector<std::unordered_map<std::string, std::string>> snapshot_data;
+
+  Random rnd(1103);
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  for (size_t i = 0; i < kNumTransactions; i++) {
+    std::string key = "key" + std::to_string(i);
+    std::string value = "value0";
+    ASSERT_OK(db->Put(WriteOptions(), key, value));
+    current_data[key] = value;
+  }
+  VerifyKeys(current_data);
+
+  for (size_t iter = 0; iter < kNumIterations; iter++) {
+    auto r = rnd.Next() % (kNumTransactions + 1);
+    if (r < kNumTransactions) {
+      std::string key = "key" + std::to_string(r);
+      if (transactions[r] == nullptr) {
+        std::string value = "value" + std::to_string(versions[r] + 1);
+        auto* txn = db->BeginTransaction(WriteOptions());
+        ASSERT_OK(txn->SetName("txn" + std::to_string(r)));
+        ASSERT_OK(txn->Put(key, value));
+        ASSERT_OK(txn->Prepare());
+        transactions[r] = txn;
+      } else {
+        std::string value = "value" + std::to_string(++versions[r]);
+        ASSERT_OK(transactions[r]->Commit());
+        delete transactions[r];
+        transactions[r] = nullptr;
+        current_data[key] = value;
+      }
+    } else {
+      auto* snapshot = db->GetSnapshot();
+      VerifyKeys(current_data, snapshot);
+      snapshots.push_back(snapshot);
+      snapshot_data.push_back(current_data);
+    }
+    VerifyKeys(current_data);
+  }
+  // Take a last snapshot to test compaction with uncommitted prepared
+  // transaction.
+  snapshots.push_back(db->GetSnapshot());
+  snapshot_data.push_back(current_data);
+
+  ASSERT_EQ(snapshots.size(), snapshot_data.size());
+  for (size_t i = 0; i < snapshots.size(); i++) {
+    VerifyKeys(snapshot_data[i], snapshots[i]);
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+  for (size_t i = 0; i < snapshots.size(); i++) {
+    VerifyKeys(snapshot_data[i], snapshots[i]);
+  }
+  // Dummy keys to avoid compaction trivially move files and get around actual
+  // compaction logic.
+  ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  for (size_t i = 0; i < snapshots.size(); i++) {
+    VerifyKeys(snapshot_data[i], snapshots[i]);
+  }
+  // cleanup
+  for (size_t i = 0; i < kNumTransactions; i++) {
+    if (transactions[i] == nullptr) {
+      continue;
+    }
+    ASSERT_OK(transactions[i]->Commit());
+    delete transactions[i];
+  }
+  for (size_t i = 0; i < snapshots.size(); i++) {
+    db->ReleaseSnapshot(snapshots[i]);
+  }
+}
+
+// Compaction should not apply the optimization to output key with sequence
+// number equal to 0 if the key is not visible to earliest snapshot, based on
+// commit sequence number.
+TEST_P(WritePreparedTransactionTest,
+       CompactionShouldKeepSequenceForUncommittedKeys) {
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+  // Keep track of expected sequence number.
+  SequenceNumber expected_seq = 0;
+  auto* transaction = db->BeginTransaction(WriteOptions());
+  ASSERT_OK(transaction->SetName("txn"));
+  ASSERT_OK(transaction->Put("key1", "value1"));
+  ASSERT_OK(transaction->Prepare());
+  ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
+  SequenceNumber seq1 = expected_seq;
+  ASSERT_OK(db->Put(WriteOptions(), "key2", "value2"));
+  DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  expected_seq++;  // one for data
+  if (options.two_write_queues) {
+    expected_seq++;  // one for commit
+  }
+  ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
+  ASSERT_OK(db->Flush(FlushOptions()));
+  // Dummy keys to avoid compaction trivially move files and get around actual
+  // compaction logic.
+  ASSERT_OK(db->Put(WriteOptions(), "a", "dummy"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "dummy"));
+  ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  VerifyKeys({
+      {"key1", "NOT_FOUND"},
+      {"key2", "value2"},
+  });
+  VerifyInternalKeys({
+      // "key1" has not been committed. It keeps its sequence number.
+      {"key1", "value1", seq1, kTypeValue},
+      // "key2" is committed and output with seq = 0.
+      {"key2", "value2", 0, kTypeValue},
+  });
+  ASSERT_OK(transaction->Commit());
+  VerifyKeys({
+      {"key1", "value1"},
+      {"key2", "value2"},
+  });
+  delete transaction;
+}
+
+TEST_P(WritePreparedTransactionTest, CommitAndSnapshotDuringCompaction) {
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  const Snapshot* snapshot = nullptr;
+  ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));
+  auto* txn = db->BeginTransaction(WriteOptions());
+  ASSERT_OK(txn->SetName("txn"));
+  ASSERT_OK(txn->Put("key1", "value2"));
+  ASSERT_OK(txn->Prepare());
+
+  auto callback = [&](void*) {
+    // Snapshot is taken after compaction start. It should be taken into
+    // consideration for whether to compact out value1.
+    snapshot = db->GetSnapshot();
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  };
+  SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit",
+                                        callback);
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(db->Flush(FlushOptions()));
+  ASSERT_NE(nullptr, snapshot);
+  VerifyKeys({{"key1", "value2"}});
+  VerifyKeys({{"key1", "value1"}}, snapshot);
+  db->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(WritePreparedTransactionTest, Iterate) {
+  auto verify_state = [](Iterator* iter, const std::string& key,
+                         const std::string& value) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key, iter->key().ToString());
+    ASSERT_EQ(value, iter->value().ToString());
+  };
+
+  auto verify_iter = [&](const std::string& expected_val) {
+    // Get iterator from a concurrent transaction and make sure it has the
+    // same view as an iterator from the DB.
+    auto* txn = db->BeginTransaction(WriteOptions());
+
+    for (int i = 0; i < 2; i++) {
+      Iterator* iter = (i == 0) ? db->NewIterator(ReadOptions())
+                                : txn->GetIterator(ReadOptions());
+      // Seek
+      iter->Seek("foo");
+      verify_state(iter, "foo", expected_val);
+      // Next
+      iter->Seek("a");
+      verify_state(iter, "a", "va");
+      iter->Next();
+      verify_state(iter, "foo", expected_val);
+      // SeekForPrev
+      iter->SeekForPrev("y");
+      verify_state(iter, "foo", expected_val);
+      // Prev
+      iter->SeekForPrev("z");
+      verify_state(iter, "z", "vz");
+      iter->Prev();
+      verify_state(iter, "foo", expected_val);
+      delete iter;
+    }
+    delete txn;
+  };
+
+  ASSERT_OK(db->Put(WriteOptions(), "foo", "v1"));
+  auto* transaction = db->BeginTransaction(WriteOptions());
+  ASSERT_OK(transaction->SetName("txn"));
+  ASSERT_OK(transaction->Put("foo", "v2"));
+  ASSERT_OK(transaction->Prepare());
+  VerifyKeys({{"foo", "v1"}});
+  // dummy keys
+  ASSERT_OK(db->Put(WriteOptions(), "a", "va"));
+  ASSERT_OK(db->Put(WriteOptions(), "z", "vz"));
+  verify_iter("v1");
+  ASSERT_OK(transaction->Commit());
+  VerifyKeys({{"foo", "v2"}});
+  verify_iter("v2");
+  delete transaction;
+}
+
+TEST_P(WritePreparedTransactionTest, IteratorRefreshNotSupported) {
+  Iterator* iter = db->NewIterator(ReadOptions());
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Refresh().IsNotSupported());
+  delete iter;
+}
+
+// Committing an delayed prepared has two non-atomic steps: update commit cache,
+// remove seq from delayed_prepared_. The read in IsInSnapshot also involves two
+// non-atomic steps of checking these two data structures. This test breaks each
+// in the middle to ensure correctness in spite of non-atomic execution.
+// Note: This test is limitted to the case where snapshot is larger than the
+// max_evicted_seq_.
+TEST_P(WritePreparedTransactionTest, NonAtomicCommitOfDelayedPrepared) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 3;    // 8 entries
+  for (auto split_read : {true, false}) {
+    std::vector<bool> split_options = {false};
+    if (split_read) {
+      // Also test for break before mutex
+      split_options.push_back(true);
+    }
+    for (auto split_before_mutex : split_options) {
+      UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+      ASSERT_OK(ReOpen());
+      WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+      DBImpl* db_impl = static_cast_with_check<DBImpl>(db->GetRootDB());
+      // Fill up the commit cache
+      std::string init_value("value1");
+      for (int i = 0; i < 10; i++) {
+        ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value)));
+      }
+      // Prepare a transaction but do not commit it
+      Transaction* txn =
+          db->BeginTransaction(WriteOptions(), TransactionOptions());
+      ASSERT_OK(txn->SetName("xid"));
+      ASSERT_OK(txn->Put(Slice("key1"), Slice("value2")));
+      ASSERT_OK(txn->Prepare());
+      // Commit a bunch of entries to advance max evicted seq and make the
+      // prepared a delayed prepared
+      for (int i = 0; i < 10; i++) {
+        ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+      }
+      // The snapshot should not see the delayed prepared entry
+      auto snap = db->GetSnapshot();
+
+      if (split_read) {
+        if (split_before_mutex) {
+          // split before acquiring prepare_mutex_
+          ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+              {{"WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:pause",
+                "AtomicCommitOfDelayedPrepared:Commit:before"},
+               {"AtomicCommitOfDelayedPrepared:Commit:after",
+                "WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:resume"}});
+        } else {
+          // split right after reading from the commit cache
+          ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+              {{"WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:pause",
+                "AtomicCommitOfDelayedPrepared:Commit:before"},
+               {"AtomicCommitOfDelayedPrepared:Commit:after",
+                "WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:resume"}});
+        }
+      } else {  // split commit
+        // split right before removing from delayed_prepared_
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+            {{"WritePreparedTxnDB::RemovePrepared:pause",
+              "AtomicCommitOfDelayedPrepared:Read:before"},
+             {"AtomicCommitOfDelayedPrepared:Read:after",
+              "WritePreparedTxnDB::RemovePrepared:resume"}});
+      }
+      SyncPoint::GetInstance()->EnableProcessing();
+
+      ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+        TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Commit:before");
+        ASSERT_OK(txn->Commit());
+        if (split_before_mutex) {
+          // Do bunch of inserts to evict the commit entry from the cache. This
+          // would prevent the 2nd look into commit cache under prepare_mutex_
+          // to see the commit entry.
+          auto seq = db_impl->TEST_GetLastVisibleSequence();
+          size_t tries = 0;
+          while (wp_db->max_evicted_seq_ < seq && tries < 50) {
+            ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+            tries++;
+          };
+          ASSERT_LT(tries, 50);
+        }
+        TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Commit:after");
+        delete txn;
+      });
+
+      ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+        TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Read:before");
+        ReadOptions roptions;
+        roptions.snapshot = snap;
+        PinnableSlice value;
+        auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value);
+        ASSERT_OK(s);
+        // It should not see the commit of delayed prepared
+        ASSERT_TRUE(value == init_value);
+        TEST_SYNC_POINT("AtomicCommitOfDelayedPrepared:Read:after");
+        db->ReleaseSnapshot(snap);
+      });
+
+      read_thread.join();
+      commit_thread.join();
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    }  // for split_before_mutex
+  }    // for split_read
+}
+
+// When max evicted seq advances a prepared seq, it involves two updates: i)
+// adding prepared seq to delayed_prepared_, ii) updating max_evicted_seq_.
+// ::IsInSnapshot also reads these two values in a non-atomic way. This test
+// ensures correctness if the update occurs after ::IsInSnapshot reads
+// delayed_prepared_empty_ and before it reads max_evicted_seq_.
+// Note: this test focuses on read snapshot larger than max_evicted_seq_.
+TEST_P(WritePreparedTransactionTest, NonAtomicUpdateOfDelayedPrepared) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 3;    // 8 entries
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  // Fill up the commit cache
+  std::string init_value("value1");
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value)));
+  }
+  // Prepare a transaction but do not commit it
+  Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions());
+  ASSERT_OK(txn->SetName("xid"));
+  ASSERT_OK(txn->Put(Slice("key1"), Slice("value2")));
+  ASSERT_OK(txn->Prepare());
+  // Create a gap between prepare seq and snapshot seq
+  ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+  ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+  // The snapshot should not see the delayed prepared entry
+  auto snap = db->GetSnapshot();
+  ASSERT_LT(txn->GetId(), snap->GetSequenceNumber());
+
+  // split right after reading delayed_prepared_empty_
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:pause",
+        "AtomicUpdateOfDelayedPrepared:before"},
+       {"AtomicUpdateOfDelayedPrepared:after",
+        "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:resume"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+    TEST_SYNC_POINT("AtomicUpdateOfDelayedPrepared:before");
+    // Commit a bunch of entries to advance max evicted seq and make the
+    // prepared a delayed prepared
+    size_t tries = 0;
+    while (wp_db->max_evicted_seq_ < txn->GetId() && tries < 50) {
+      ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+      tries++;
+    };
+    ASSERT_LT(tries, 50);
+    // This is the case on which the test focuses
+    ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber());
+    TEST_SYNC_POINT("AtomicUpdateOfDelayedPrepared:after");
+  });
+
+  ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+    ReadOptions roptions;
+    roptions.snapshot = snap;
+    PinnableSlice value;
+    auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value);
+    ASSERT_OK(s);
+    // It should not see the uncommitted value of delayed prepared
+    ASSERT_TRUE(value == init_value);
+    db->ReleaseSnapshot(snap);
+  });
+
+  read_thread.join();
+  commit_thread.join();
+  ASSERT_OK(txn->Commit());
+  delete txn;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Eviction from commit cache and update of max evicted seq are two non-atomic
+// steps. Similarly the read of max_evicted_seq_ in ::IsInSnapshot and reading
+// from commit cache are two non-atomic steps. This tests if the update occurs
+// after reading max_evicted_seq_ and before reading the commit cache.
+// Note: the test focuses on snapshot larger than max_evicted_seq_
+TEST_P(WritePreparedTransactionTest, NonAtomicUpdateOfMaxEvictedSeq) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  const size_t commit_cache_bits = 3;    // 8 entries
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  // Fill up the commit cache
+  std::string init_value("value1");
+  std::string last_value("value_final");
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice(init_value)));
+  }
+  // Do an uncommitted write to prevent min_uncommitted optimization
+  Transaction* txn1 =
+      db->BeginTransaction(WriteOptions(), TransactionOptions());
+  ASSERT_OK(txn1->SetName("xid1"));
+  ASSERT_OK(txn1->Put(Slice("key0"), last_value));
+  ASSERT_OK(txn1->Prepare());
+  // Do a write with prepare to get the prepare seq
+  Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions());
+  ASSERT_OK(txn->SetName("xid"));
+  ASSERT_OK(txn->Put(Slice("key1"), last_value));
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  // Create a gap between commit entry and snapshot seq
+  ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+  ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+  // The snapshot should see the last commit
+  auto snap = db->GetSnapshot();
+  ASSERT_LE(txn->GetId(), snap->GetSequenceNumber());
+
+  // split right after reading max_evicted_seq_
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:pause",
+        "NonAtomicUpdateOfMaxEvictedSeq:before"},
+       {"NonAtomicUpdateOfMaxEvictedSeq:after",
+        "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:resume"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread commit_thread([&]() {
+    TEST_SYNC_POINT("NonAtomicUpdateOfMaxEvictedSeq:before");
+    // Commit a bunch of entries to advance max evicted seq beyond txn->GetId()
+    size_t tries = 0;
+    while (wp_db->max_evicted_seq_ < txn->GetId() && tries < 50) {
+      ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+      tries++;
+    };
+    ASSERT_LT(tries, 50);
+    // This is the case on which the test focuses
+    ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber());
+    TEST_SYNC_POINT("NonAtomicUpdateOfMaxEvictedSeq:after");
+  });
+
+  ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+    ReadOptions roptions;
+    roptions.snapshot = snap;
+    PinnableSlice value;
+    auto s = db->Get(roptions, db->DefaultColumnFamily(), "key1", &value);
+    ASSERT_OK(s);
+    // It should see the committed value of the evicted entry
+    ASSERT_TRUE(value == last_value);
+    db->ReleaseSnapshot(snap);
+  });
+
+  read_thread.join();
+  commit_thread.join();
+  delete txn;
+  ASSERT_OK(txn1->Commit());
+  delete txn1;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Test when we add a prepared seq when the max_evicted_seq_ already goes beyond
+// that. The test focuses on a race condition between AddPrepared and
+// AdvanceMaxEvictedSeq functions.
+TEST_P(WritePreparedTransactionTest, AddPreparedBeforeMax) {
+  if (!options.two_write_queues) {
+    // This test is only for two write queues
+    return;
+  }
+  const size_t snapshot_cache_bits = 7;  // same as default
+  // 1 entry to advance max after the 2nd commit
+  const size_t commit_cache_bits = 0;
+  UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+  ASSERT_OK(ReOpen());
+  WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
+  std::string some_value("value_some");
+  std::string uncommitted_value("value_uncommitted");
+  // Prepare two uncommitted transactions
+  Transaction* txn1 =
+      db->BeginTransaction(WriteOptions(), TransactionOptions());
+  ASSERT_OK(txn1->SetName("xid1"));
+  ASSERT_OK(txn1->Put(Slice("key1"), some_value));
+  ASSERT_OK(txn1->Prepare());
+  Transaction* txn2 =
+      db->BeginTransaction(WriteOptions(), TransactionOptions());
+  ASSERT_OK(txn2->SetName("xid2"));
+  ASSERT_OK(txn2->Put(Slice("key2"), some_value));
+  ASSERT_OK(txn2->Prepare());
+  // Start the txn here so the other thread could get its id
+  Transaction* txn = db->BeginTransaction(WriteOptions(), TransactionOptions());
+  ASSERT_OK(txn->SetName("xid"));
+  ASSERT_OK(txn->Put(Slice("key0"), uncommitted_value));
+  port::Mutex txn_mutex_;
+
+  // t1) Insert prepared entry, t2) commit other entries to advance max
+  // evicted sec and finish checking the existing prepared entries, t1)
+  // AddPrepared, t2) update max_evicted_seq_
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"AddPreparedCallback::AddPrepared::begin:pause",
+       "AddPreparedBeforeMax::read_thread:start"},
+      {"AdvanceMaxEvictedSeq::update_max:pause",
+       "AddPreparedCallback::AddPrepared::begin:resume"},
+      {"AddPreparedCallback::AddPrepared::end",
+       "AdvanceMaxEvictedSeq::update_max:resume"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ROCKSDB_NAMESPACE::port::Thread write_thread([&]() {
+    txn_mutex_.Lock();
+    ASSERT_OK(txn->Prepare());
+    txn_mutex_.Unlock();
+  });
+
+  ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+    TEST_SYNC_POINT("AddPreparedBeforeMax::read_thread:start");
+    // Publish seq number with a commit
+    ASSERT_OK(txn1->Commit());
+    // Since the commit cache size is one the 2nd commit evict the 1st one and
+    // invokes AdcanceMaxEvictedSeq
+    ASSERT_OK(txn2->Commit());
+
+    ReadOptions roptions;
+    PinnableSlice value;
+    // The snapshot should not see the uncommitted value from write_thread
+    auto snap = db->GetSnapshot();
+    ASSERT_LT(wp_db->max_evicted_seq_, snap->GetSequenceNumber());
+    // This is the scenario that we test for
+    txn_mutex_.Lock();
+    ASSERT_GT(wp_db->max_evicted_seq_, txn->GetId());
+    txn_mutex_.Unlock();
+    roptions.snapshot = snap;
+    auto s = db->Get(roptions, db->DefaultColumnFamily(), "key0", &value);
+    ASSERT_TRUE(s.IsNotFound());
+    db->ReleaseSnapshot(snap);
+  });
+
+  read_thread.join();
+  write_thread.join();
+  delete txn1;
+  delete txn2;
+  ASSERT_OK(txn->Commit());
+  delete txn;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// When an old prepared entry gets committed, there is a gap between the time
+// that it is published and when it is cleaned up from old_prepared_. This test
+// stresses such cases.
+TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) {
+  const size_t snapshot_cache_bits = 7;  // same as default
+  for (const size_t commit_cache_bits : {0, 2, 3}) {
+    for (const size_t sub_batch_cnt : {1, 2, 3}) {
+      UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
+      ASSERT_OK(ReOpen());
+      std::atomic<const Snapshot*> snap = {nullptr};
+      std::atomic<SequenceNumber> exp_prepare = {0};
+      ROCKSDB_NAMESPACE::port::Thread callback_thread;
+      // Value is synchronized via snap
+      PinnableSlice value;
+      // Take a snapshot after publish and before RemovePrepared:Start
+      auto snap_callback = [&]() {
+        ASSERT_EQ(nullptr, snap.load());
+        snap.store(db->GetSnapshot());
+        ReadOptions roptions;
+        roptions.snapshot = snap.load();
+        auto s = db->Get(roptions, db->DefaultColumnFamily(), "key2", &value);
+        ASSERT_OK(s);
+      };
+      auto callback = [&](void* param) {
+        SequenceNumber prep_seq = *((SequenceNumber*)param);
+        if (prep_seq == exp_prepare.load()) {  // only for write_thread
+          // We need to spawn a thread to avoid deadlock since getting a
+          // snpashot might end up calling AdvanceSeqByOne which needs joining
+          // the write queue.
+          callback_thread = ROCKSDB_NAMESPACE::port::Thread(snap_callback);
+          TEST_SYNC_POINT("callback:end");
+        }
+      };
+      // Wait for the first snapshot be taken in GetSnapshotInternal. Although
+      // it might be updated before GetSnapshotInternal finishes but this should
+      // cover most of the cases.
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+          {"WritePreparedTxnDB::GetSnapshotInternal:first", "callback:end"},
+      });
+      SyncPoint::GetInstance()->SetCallBack("RemovePrepared:Start", callback);
+      SyncPoint::GetInstance()->EnableProcessing();
+      // Thread to cause frequent evictions
+      ROCKSDB_NAMESPACE::port::Thread eviction_thread([&]() {
+        // Too many txns might cause commit_seq - prepare_seq in another thread
+        // to go beyond DELTA_UPPERBOUND
+        for (int i = 0; i < 25 * (1 << commit_cache_bits); i++) {
+          ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("value1")));
+        }
+      });
+      ROCKSDB_NAMESPACE::port::Thread write_thread([&]() {
+        for (int i = 0; i < 25 * (1 << commit_cache_bits); i++) {
+          Transaction* txn =
+              db->BeginTransaction(WriteOptions(), TransactionOptions());
+          ASSERT_OK(txn->SetName("xid"));
+          std::string val_str = "value" + std::to_string(i);
+          for (size_t b = 0; b < sub_batch_cnt; b++) {
+            ASSERT_OK(txn->Put(Slice("key2"), val_str));
+          }
+          ASSERT_OK(txn->Prepare());
+          // Let an eviction to kick in
+          std::this_thread::yield();
+
+          exp_prepare.store(txn->GetId());
+          ASSERT_OK(txn->Commit());
+          delete txn;
+          // Wait for the snapshot taking that is triggered by
+          // RemovePrepared:Start callback
+          callback_thread.join();
+
+          // Read with the snapshot taken before delayed_prepared_ cleanup
+          ReadOptions roptions;
+          roptions.snapshot = snap.load();
+          ASSERT_NE(nullptr, roptions.snapshot);
+          PinnableSlice value2;
+          auto s =
+              db->Get(roptions, db->DefaultColumnFamily(), "key2", &value2);
+          ASSERT_OK(s);
+          // It should see its own write
+          ASSERT_TRUE(val_str == value2);
+          // The value read by snapshot should not change
+          ASSERT_STREQ(value2.ToString().c_str(), value.ToString().c_str());
+
+          db->ReleaseSnapshot(roptions.snapshot);
+          snap.store(nullptr);
+        }
+      });
+      write_thread.join();
+      eviction_thread.join();
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    }
+  }
+}
+
+// Test that updating the commit map will not affect the existing snapshots
+TEST_P(WritePreparedTransactionTest, AtomicCommit) {
+  for (bool skip_prepare : {true, false}) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {"WritePreparedTxnDB::AddCommitted:start",
+         "AtomicCommit::GetSnapshot:start"},
+        {"AtomicCommit::Get:end",
+         "WritePreparedTxnDB::AddCommitted:start:pause"},
+        {"WritePreparedTxnDB::AddCommitted:end", "AtomicCommit::Get2:start"},
+        {"AtomicCommit::Get2:end",
+         "WritePreparedTxnDB::AddCommitted:end:pause:"},
+    });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    ROCKSDB_NAMESPACE::port::Thread write_thread([&]() {
+      if (skip_prepare) {
+        ASSERT_OK(db->Put(WriteOptions(), Slice("key"), Slice("value")));
+      } else {
+        Transaction* txn =
+            db->BeginTransaction(WriteOptions(), TransactionOptions());
+        ASSERT_OK(txn->SetName("xid"));
+        ASSERT_OK(txn->Put(Slice("key"), Slice("value")));
+        ASSERT_OK(txn->Prepare());
+        ASSERT_OK(txn->Commit());
+        delete txn;
+      }
+    });
+    ROCKSDB_NAMESPACE::port::Thread read_thread([&]() {
+      ReadOptions roptions;
+      TEST_SYNC_POINT("AtomicCommit::GetSnapshot:start");
+      roptions.snapshot = db->GetSnapshot();
+      PinnableSlice val;
+      auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &val);
+      TEST_SYNC_POINT("AtomicCommit::Get:end");
+      TEST_SYNC_POINT("AtomicCommit::Get2:start");
+      ASSERT_SAME(roptions, db, s, val, "key");
+      TEST_SYNC_POINT("AtomicCommit::Get2:end");
+      db->ReleaseSnapshot(roptions.snapshot);
+    });
+    read_thread.join();
+    write_thread.join();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_P(WritePreparedTransactionTest, BasicRollbackDeletionTypeCb) {
+  options.level0_file_num_compaction_trigger = 2;
+  // Always use SingleDelete to rollback Put.
+  txn_db_options.rollback_deletion_type_callback =
+      [](TransactionDB*, ColumnFamilyHandle*, const Slice&) { return true; };
+
+  const auto write_to_db = [&]() {
+    assert(db);
+    std::unique_ptr<Transaction> txn0(
+        db->BeginTransaction(WriteOptions(), TransactionOptions()));
+    ASSERT_OK(txn0->SetName("txn0"));
+    ASSERT_OK(txn0->Put("a", "v0"));
+    ASSERT_OK(txn0->Prepare());
+
+    // Generate sst1: [PUT('a')]
+    ASSERT_OK(db->Flush(FlushOptions()));
+
+    {
+      CompactRangeOptions cro;
+      cro.change_level = true;
+      cro.target_level = options.num_levels - 1;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+    }
+
+    ASSERT_OK(txn0->Rollback());
+    txn0.reset();
+
+    ASSERT_OK(db->Put(WriteOptions(), "a", "v1"));
+
+    ASSERT_OK(db->SingleDelete(WriteOptions(), "a"));
+    // Generate another SST with a SD to cover the oldest PUT('a')
+    ASSERT_OK(db->Flush(FlushOptions()));
+
+    auto* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+    assert(dbimpl);
+    ASSERT_OK(dbimpl->TEST_WaitForCompact());
+
+    {
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+    }
+
+    {
+      std::string value;
+      const Status s = db->Get(ReadOptions(), "a", &value);
+      ASSERT_TRUE(s.IsNotFound());
+    }
+  };
+
+  // Destroy and reopen
+  ASSERT_OK(ReOpen());
+  write_to_db();
+}
+
+// Test that we can change write policy from WriteCommitted to WritePrepared
+// after a clean shutdown (which would empty the WAL)
+TEST_P(WritePreparedTransactionTest, WP_WC_DBBackwardCompatibility) {
+  bool empty_wal = true;
+  CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, empty_wal);
+}
+
+// Test that we fail fast if WAL is not emptied between changing the write
+// policy from WriteCommitted to WritePrepared
+TEST_P(WritePreparedTransactionTest, WP_WC_WALBackwardIncompatibility) {
+  bool empty_wal = true;
+  CrossCompatibilityTest(WRITE_COMMITTED, WRITE_PREPARED, !empty_wal);
+}
+
+// Test that we can change write policy from WritePrepare back to WriteCommitted
+// after a clean shutdown (which would empty the WAL)
+TEST_P(WritePreparedTransactionTest, WC_WP_ForwardCompatibility) {
+  bool empty_wal = true;
+  CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, empty_wal);
+}
+
+// Test that we fail fast if WAL is not emptied between changing the write
+// policy from WriteCommitted to WritePrepared
+TEST_P(WritePreparedTransactionTest, WC_WP_WALForwardIncompatibility) {
+  bool empty_wal = true;
+  CrossCompatibilityTest(WRITE_PREPARED, WRITE_COMMITTED, !empty_wal);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  if (getenv("CIRCLECI")) {
+    // Looking for backtrace on "Resource temporarily unavailable" exceptions
+    ::testing::FLAGS_gtest_catch_exceptions = false;
+  }
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn.cc b/src/rocksdb/utilities/transactions/write_prepared_txn.cc
new file mode 100644
index 000000000..16b5cc1cb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn.cc
@@ -0,0 +1,512 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_prepared_txn.h"
+
+#include <cinttypes>
+#include <map>
+#include <set>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct WriteOptions;
+
+WritePreparedTxn::WritePreparedTxn(WritePreparedTxnDB* txn_db,
+                                   const WriteOptions& write_options,
+                                   const TransactionOptions& txn_options)
+    : PessimisticTransaction(txn_db, write_options, txn_options, false),
+      wpt_db_(txn_db) {
+  // Call Initialize outside PessimisticTransaction constructor otherwise it
+  // would skip overridden functions in WritePreparedTxn since they are not
+  // defined yet in the constructor of PessimisticTransaction
+  Initialize(txn_options);
+}
+
+void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) {
+  PessimisticTransaction::Initialize(txn_options);
+  prepare_batch_cnt_ = 0;
+}
+
+void WritePreparedTxn::MultiGet(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family,
+                                const size_t num_keys, const Slice* keys,
+                                PinnableSlice* values, Status* statuses,
+                                const bool sorted_input) {
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
+                                        backed_by_snapshot);
+  write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys,
+                                      keys, values, statuses, sorted_input,
+                                      &callback);
+  if (UNLIKELY(!callback.valid() ||
+               !wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+    wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+    for (size_t i = 0; i < num_keys; i++) {
+      statuses[i] = Status::TryAgain();
+    }
+  }
+}
+
+Status WritePreparedTxn::Get(const ReadOptions& options,
+                             ColumnFamilyHandle* column_family,
+                             const Slice& key, PinnableSlice* pinnable_val) {
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted,
+                                        backed_by_snapshot);
+  Status res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key,
+                                              pinnable_val, &callback);
+  const bool callback_valid =
+      callback.valid();  // NOTE: validity of callback must always be checked
+                         // before it is destructed
+  if (res.ok()) {
+    if (!LIKELY(callback_valid &&
+                wpt_db_->ValidateSnapshot(callback.max_visible_seq(),
+                                          backed_by_snapshot))) {
+      wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+      res = Status::TryAgain();
+    }
+  }
+
+  return res;
+}
+
+Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options) {
+  // Make sure to get iterator from WritePrepareTxnDB, not the root db.
+  Iterator* db_iter = wpt_db_->NewIterator(options);
+  assert(db_iter);
+
+  return write_batch_.NewIteratorWithBase(db_iter);
+}
+
+Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options,
+                                        ColumnFamilyHandle* column_family) {
+  // Make sure to get iterator from WritePrepareTxnDB, not the root db.
+  Iterator* db_iter = wpt_db_->NewIterator(options, column_family);
+  assert(db_iter);
+
+  return write_batch_.NewIteratorWithBase(column_family, db_iter);
+}
+
+Status WritePreparedTxn::PrepareInternal() {
+  WriteOptions write_options = write_options_;
+  write_options.disableWAL = false;
+  const bool WRITE_AFTER_COMMIT = true;
+  const bool kFirstPrepareBatch = true;
+  auto s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+                                              name_, !WRITE_AFTER_COMMIT);
+  assert(s.ok());
+  // For each duplicate key we account for a new sub-batch
+  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
+  // Having AddPrepared in the PreReleaseCallback allows in-order addition of
+  // prepared entries to PreparedHeap and hence enables an optimization. Refer
+  // to SmallestUnCommittedSeq for more details.
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, prepare_batch_cnt_,
+      db_impl_->immutable_db_options().two_write_queues, kFirstPrepareBatch);
+  const bool DISABLE_MEMTABLE = true;
+  uint64_t seq_used = kMaxSequenceNumber;
+  s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+                          /*callback*/ nullptr, &log_number_, /*log ref*/ 0,
+                          !DISABLE_MEMTABLE, &seq_used, prepare_batch_cnt_,
+                          &add_prepared_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  auto prepare_seq = seq_used;
+  SetId(prepare_seq);
+  return s;
+}
+
+Status WritePreparedTxn::CommitWithoutPrepareInternal() {
+  // For each duplicate key we account for a new sub-batch
+  const size_t batch_cnt = GetWriteBatch()->SubBatchCnt();
+  return CommitBatchInternal(GetWriteBatch()->GetWriteBatch(), batch_cnt);
+}
+
+Status WritePreparedTxn::CommitBatchInternal(WriteBatch* batch,
+                                             size_t batch_cnt) {
+  return wpt_db_->WriteInternal(write_options_, batch, batch_cnt, this);
+}
+
+Status WritePreparedTxn::CommitInternal() {
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "CommitInternal prepare_seq: %" PRIu64, GetID());
+  // We take the commit-time batch and append the Commit marker.
+  // The Memtable will ignore the Commit marker in non-recovery mode
+  WriteBatch* working_batch = GetCommitTimeWriteBatch();
+  const bool empty = working_batch->Count() == 0;
+  auto s = WriteBatchInternal::MarkCommit(working_batch, name_);
+  assert(s.ok());
+
+  const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_;
+  if (!empty) {
+    // When not writing to memtable, we can still cache the latest write batch.
+    // The cached batch will be written to memtable in WriteRecoverableState
+    // during FlushMemTable
+    if (for_recovery) {
+      WriteBatchInternal::SetAsLatestPersistentState(working_batch);
+    } else {
+      return Status::InvalidArgument(
+          "Commit-time-batch can only be used if "
+          "use_only_the_last_commit_time_batch_for_recovery is true");
+    }
+  }
+
+  auto prepare_seq = GetId();
+  const bool includes_data = !empty && !for_recovery;
+  assert(prepare_batch_cnt_);
+  size_t commit_batch_cnt = 0;
+  if (UNLIKELY(includes_data)) {
+    ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                   "Duplicate key overhead");
+    SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
+    s = working_batch->Iterate(&counter);
+    assert(s.ok());
+    commit_batch_cnt = counter.BatchCount();
+  }
+  const bool disable_memtable = !includes_data;
+  const bool do_one_write =
+      !db_impl_->immutable_db_options().two_write_queues || disable_memtable;
+  WritePreparedCommitEntryPreReleaseCallback update_commit_map(
+      wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, commit_batch_cnt);
+  // This is to call AddPrepared on CommitTimeWriteBatch
+  const bool kFirstPrepareBatch = true;
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, commit_batch_cnt,
+      db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
+  PreReleaseCallback* pre_release_callback;
+  if (do_one_write) {
+    pre_release_callback = &update_commit_map;
+  } else {
+    pre_release_callback = &add_prepared_callback;
+  }
+  uint64_t seq_used = kMaxSequenceNumber;
+  // Since the prepared batch is directly written to memtable, there is already
+  // a connection between the memtable and its WAL, so there is no need to
+  // redundantly reference the log that contains the prepared data.
+  const uint64_t zero_log_number = 0ull;
+  size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1;
+  // If `two_write_queues && includes_data`, then `do_one_write` is false. The
+  // following `WriteImpl` will insert the data of the commit-time-batch into
+  // the database before updating the commit cache. Therefore, the data of the
+  // commmit-time-batch is considered uncommitted. Furthermore, since data of
+  // the commit-time-batch are not locked, it is possible for two uncommitted
+  // versions of the same key to co-exist for a (short) period of time until
+  // the commit cache is updated by the second write. If the two uncommitted
+  // keys are compacted to the bottommost level in the meantime, it is possible
+  // that compaction iterator will zero out the sequence numbers of both, thus
+  // violating the invariant that an SST does not have two identical internal
+  // keys. To prevent this situation, we should allow the usage of
+  // commit-time-batch only if the user sets
+  // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery to
+  // true. See the comments about GetCommitTimeWriteBatch() in
+  // include/rocksdb/utilities/transaction.h.
+  s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
+                          zero_log_number, disable_memtable, &seq_used,
+                          batch_cnt, pre_release_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  const SequenceNumber commit_batch_seq = seq_used;
+  if (LIKELY(do_one_write || !s.ok())) {
+    if (UNLIKELY(!db_impl_->immutable_db_options().two_write_queues &&
+                 s.ok())) {
+      // Note: RemovePrepared should be called after WriteImpl that publishsed
+      // the seq. Otherwise SmallestUnCommittedSeq optimization breaks.
+      wpt_db_->RemovePrepared(prepare_seq, prepare_batch_cnt_);
+    }  // else RemovePrepared is called from within PreReleaseCallback
+    if (UNLIKELY(!do_one_write)) {
+      assert(!s.ok());
+      // Cleanup the prepared entry we added with add_prepared_callback
+      wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt);
+    }
+    return s;
+  }  // else do the 2nd write to publish seq
+  // Note: the 2nd write comes with a performance penality. So if we have too
+  // many of commits accompanied with ComitTimeWriteBatch and yet we cannot
+  // enable use_only_the_last_commit_time_batch_for_recovery_ optimization,
+  // two_write_queues should be disabled to avoid many additional writes here.
+  const size_t kZeroData = 0;
+  // Update commit map only from the 2nd queue
+  WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_aux_batch(
+      wpt_db_, db_impl_, prepare_seq, prepare_batch_cnt_, kZeroData,
+      commit_batch_seq, commit_batch_cnt);
+  WriteBatch empty_batch;
+  s = empty_batch.PutLogData(Slice());
+  assert(s.ok());
+  // In the absence of Prepare markers, use Noop as a batch separator
+  s = WriteBatchInternal::InsertNoop(&empty_batch);
+  assert(s.ok());
+  const bool DISABLE_MEMTABLE = true;
+  const size_t ONE_BATCH = 1;
+  const uint64_t NO_REF_LOG = 0;
+  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_aux_batch);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  return s;
+}
+
+Status WritePreparedTxn::RollbackInternal() {
+  ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                 "RollbackInternal prepare_seq: %" PRIu64, GetId());
+
+  assert(db_impl_);
+  assert(wpt_db_);
+
+  WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                            write_options_.protection_bytes_per_key,
+                            0 /* default_cf_ts_sz */);
+  assert(GetId() != kMaxSequenceNumber);
+  assert(GetId() > 0);
+  auto cf_map_shared_ptr = wpt_db_->GetCFHandleMap();
+  auto cf_comp_map_shared_ptr = wpt_db_->GetCFComparatorMap();
+  auto read_at_seq = kMaxSequenceNumber;
+  ReadOptions roptions;
+  // to prevent callback's seq to be overrriden inside DBImpk::Get
+  roptions.snapshot = wpt_db_->GetMaxSnapshot();
+  struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
+    DBImpl* const db_;
+    WritePreparedTxnDB* const wpt_db_;
+    WritePreparedTxnReadCallback callback_;
+    WriteBatch* rollback_batch_;
+    std::map<uint32_t, const Comparator*>& comparators_;
+    std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+    using CFKeys = std::set<Slice, SetComparator>;
+    std::map<uint32_t, CFKeys> keys_;
+    bool rollback_merge_operands_;
+    ReadOptions roptions_;
+
+    RollbackWriteBatchBuilder(
+        DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq,
+        WriteBatch* dst_batch,
+        std::map<uint32_t, const Comparator*>& comparators,
+        std::map<uint32_t, ColumnFamilyHandle*>& handles,
+        bool rollback_merge_operands, const ReadOptions& _roptions)
+        : db_(db),
+          wpt_db_(wpt_db),
+          callback_(wpt_db, snap_seq),  // disable min_uncommitted optimization
+          rollback_batch_(dst_batch),
+          comparators_(comparators),
+          handles_(handles),
+          rollback_merge_operands_(rollback_merge_operands),
+          roptions_(_roptions) {}
+
+    Status Rollback(uint32_t cf, const Slice& key) {
+      Status s;
+      CFKeys& cf_keys = keys_[cf];
+      if (cf_keys.size() == 0) {  // just inserted
+        auto cmp = comparators_[cf];
+        keys_[cf] = CFKeys(SetComparator(cmp));
+      }
+      auto it = cf_keys.insert(key);
+      // second is false if a element already existed.
+      if (it.second == false) {
+        return s;
+      }
+
+      PinnableSlice pinnable_val;
+      bool not_used;
+      auto cf_handle = handles_[cf];
+      DBImpl::GetImplOptions get_impl_options;
+      get_impl_options.column_family = cf_handle;
+      get_impl_options.value = &pinnable_val;
+      get_impl_options.value_found = &not_used;
+      get_impl_options.callback = &callback_;
+      s = db_->GetImpl(roptions_, key, get_impl_options);
+      assert(s.ok() || s.IsNotFound());
+      if (s.ok()) {
+        s = rollback_batch_->Put(cf_handle, key, pinnable_val);
+        assert(s.ok());
+      } else if (s.IsNotFound()) {
+        // There has been no readable value before txn. By adding a delete we
+        // make sure that there will be none afterwards either.
+        if (wpt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
+          s = rollback_batch_->SingleDelete(cf_handle, key);
+        } else {
+          s = rollback_batch_->Delete(cf_handle, key);
+        }
+        assert(s.ok());
+      } else {
+        // Unexpected status. Return it to the user.
+      }
+      return s;
+    }
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice& /*val*/) override {
+      return Rollback(cf, key);
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      return Rollback(cf, key);
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      return Rollback(cf, key);
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key,
+                   const Slice& /*val*/) override {
+      if (rollback_merge_operands_) {
+        return Rollback(cf, key);
+      } else {
+        return Status::OK();
+      }
+    }
+
+    Status MarkNoop(bool) override { return Status::OK(); }
+    Status MarkBeginPrepare(bool) override { return Status::OK(); }
+    Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+    Status MarkCommit(const Slice&) override { return Status::OK(); }
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+   protected:
+    Handler::OptionState WriteAfterCommit() const override {
+      return Handler::OptionState::kDisabled;
+    }
+  } rollback_handler(db_impl_, wpt_db_, read_at_seq, &rollback_batch,
+                     *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
+                     wpt_db_->txn_db_options_.rollback_merge_operands,
+                     roptions);
+  auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler);
+  if (!s.ok()) {
+    return s;
+  }
+  // The Rollback marker will be used as a batch separator
+  s = WriteBatchInternal::MarkRollback(&rollback_batch, name_);
+  assert(s.ok());
+  bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+  const bool DISABLE_MEMTABLE = true;
+  const uint64_t NO_REF_LOG = 0;
+  uint64_t seq_used = kMaxSequenceNumber;
+  const size_t ONE_BATCH = 1;
+  const bool kFirstPrepareBatch = true;
+  // We commit the rolled back prepared batches. Although this is
+  // counter-intuitive, i) it is safe to do so, since the prepared batches are
+  // already canceled out by the rollback batch, ii) adding the commit entry to
+  // CommitCache will allow us to benefit from the existing mechanism in
+  // CommitCache that keeps an entry evicted due to max advance and yet overlaps
+  // with a live snapshot around so that the live snapshot properly skips the
+  // entry even if its prepare seq is lower than max_evicted_seq_.
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, ONE_BATCH,
+      db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
+  WritePreparedCommitEntryPreReleaseCallback update_commit_map(
+      wpt_db_, db_impl_, GetId(), prepare_batch_cnt_, ONE_BATCH);
+  PreReleaseCallback* pre_release_callback;
+  if (do_one_write) {
+    pre_release_callback = &update_commit_map;
+  } else {
+    pre_release_callback = &add_prepared_callback;
+  }
+  // Note: the rollback batch does not need AddPrepared since it is written to
+  // DB in one shot. min_uncommitted still works since it requires capturing
+  // data that is written to DB but not yet committed, while
+  // the rollback batch commits with PreReleaseCallback.
+  s = db_impl_->WriteImpl(write_options_, &rollback_batch, nullptr, nullptr,
+                          NO_REF_LOG, !DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          pre_release_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (!s.ok()) {
+    return s;
+  }
+  if (do_one_write) {
+    assert(!db_impl_->immutable_db_options().two_write_queues);
+    wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_);
+    return s;
+  }  // else do the 2nd write for commit
+  uint64_t rollback_seq = seq_used;
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "RollbackInternal 2nd write rollback_seq: %" PRIu64,
+                    rollback_seq);
+  // Commit the batch by writing an empty batch to the queue that will release
+  // the commit sequence number to readers.
+  WritePreparedRollbackPreReleaseCallback update_commit_map_with_prepare(
+      wpt_db_, db_impl_, GetId(), rollback_seq, prepare_batch_cnt_);
+  WriteBatch empty_batch;
+  s = empty_batch.PutLogData(Slice());
+  assert(s.ok());
+  // In the absence of Prepare markers, use Noop as a batch separator
+  s = WriteBatchInternal::InsertNoop(&empty_batch);
+  assert(s.ok());
+  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_prepare);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "RollbackInternal (status=%s) commit: %" PRIu64,
+                    s.ToString().c_str(), GetId());
+  // TODO(lth): For WriteUnPrepared that rollback is called frequently,
+  // RemovePrepared could be moved to the callback to reduce lock contention.
+  if (s.ok()) {
+    wpt_db_->RemovePrepared(GetId(), prepare_batch_cnt_);
+  }
+  // Note: RemovePrepared for prepared batch is called from within
+  // PreReleaseCallback
+  wpt_db_->RemovePrepared(rollback_seq, ONE_BATCH);
+
+  return s;
+}
+
+Status WritePreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
+                                          const Slice& key,
+                                          SequenceNumber* tracked_at_seq) {
+  assert(snapshot_);
+
+  SequenceNumber min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(snapshot_.get())
+          ->min_uncommitted_;
+  SequenceNumber snap_seq = snapshot_->GetSequenceNumber();
+  // tracked_at_seq is either max or the last snapshot with which this key was
+  // trackeed so there is no need to apply the IsInSnapshot to this comparison
+  // here as tracked_at_seq is not a prepare seq.
+  if (*tracked_at_seq <= snap_seq) {
+    // If the key has been previous validated at a sequence number earlier
+    // than the curent snapshot's sequence number, we already know it has not
+    // been modified.
+    return Status::OK();
+  }
+
+  *tracked_at_seq = snap_seq;
+
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+  WritePreparedTxnReadCallback snap_checker(wpt_db_, snap_seq, min_uncommitted,
+                                            kBackedByDBSnapshot);
+  // TODO(yanqin): support user-defined timestamp
+  return TransactionUtil::CheckKeyForConflicts(
+      db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr,
+      false /* cache_only */, &snap_checker, min_uncommitted);
+}
+
+void WritePreparedTxn::SetSnapshot() {
+  const bool kForWWConflictCheck = true;
+  SnapshotImpl* snapshot = wpt_db_->GetSnapshotInternal(kForWWConflictCheck);
+  SetSnapshotInternal(snapshot);
+}
+
+Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) {
+  auto ret = PessimisticTransaction::RebuildFromWriteBatch(src_batch);
+  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
+  return ret;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn.h b/src/rocksdb/utilities/transactions/write_prepared_txn.h
new file mode 100644
index 000000000..30d9bdb99
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn.h
@@ -0,0 +1,119 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <atomic>
+#include <mutex>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/autovector.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/transaction_base.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritePreparedTxnDB;
+
+// This impl could write to DB also uncommitted data and then later tell apart
+// committed data from uncommitted data. Uncommitted data could be after the
+// Prepare phase in 2PC (WritePreparedTxn) or before that
+// (WriteUnpreparedTxnImpl).
+class WritePreparedTxn : public PessimisticTransaction {
+ public:
+  WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options,
+                   const TransactionOptions& txn_options);
+  // No copying allowed
+  WritePreparedTxn(const WritePreparedTxn&) = delete;
+  void operator=(const WritePreparedTxn&) = delete;
+
+  virtual ~WritePreparedTxn() {}
+
+  // To make WAL commit markers visible, the snapshot will be based on the last
+  // seq in the WAL that is also published, LastPublishedSequence, as opposed to
+  // the last seq in the memtable.
+  using Transaction::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  using Transaction::MultiGet;
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool sorted_input = false) override;
+
+  // Note: The behavior is undefined in presence of interleaved writes to the
+  // same transaction.
+  // To make WAL commit markers visible, the snapshot will be
+  // based on the last seq in the WAL that is also published,
+  // LastPublishedSequence, as opposed to the last seq in the memtable.
+  using Transaction::GetIterator;
+  virtual Iterator* GetIterator(const ReadOptions& options) override;
+  virtual Iterator* GetIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override;
+
+  virtual void SetSnapshot() override;
+
+ protected:
+  void Initialize(const TransactionOptions& txn_options) override;
+  // Override the protected SetId to make it visible to the friend class
+  // WritePreparedTxnDB
+  inline void SetId(uint64_t id) override { Transaction::SetId(id); }
+
+ private:
+  friend class WritePreparedTransactionTest_BasicRecoveryTest_Test;
+  friend class WritePreparedTxnDB;
+  friend class WriteUnpreparedTxnDB;
+  friend class WriteUnpreparedTxn;
+
+  Status PrepareInternal() override;
+
+  Status CommitWithoutPrepareInternal() override;
+
+  Status CommitBatchInternal(WriteBatch* batch, size_t batch_cnt) override;
+
+  // Since the data is already written to memtables at the Prepare phase, the
+  // commit entails writing only a commit marker in the WAL. The sequence number
+  // of the commit marker is then the commit timestamp of the transaction. To
+  // make WAL commit markers visible, the snapshot will be based on the last seq
+  // in the WAL that is also published, LastPublishedSequence, as opposed to the
+  // last seq in the memtable.
+  Status CommitInternal() override;
+
+  Status RollbackInternal() override;
+
+  virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
+                                  const Slice& key,
+                                  SequenceNumber* tracked_at_seq) override;
+
+  virtual Status RebuildFromWriteBatch(WriteBatch* src_batch) override;
+
+  WritePreparedTxnDB* wpt_db_;
+  // Number of sub-batches in prepare
+  size_t prepare_batch_cnt_ = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc b/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc
new file mode 100644
index 000000000..595c3df8f
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn_db.cc
@@ -0,0 +1,1030 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_prepared_txn_db.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+// This function is for testing only. If it returns true, then all entries in
+// the commit cache will be evicted. Unit and/or stress tests (db_stress)
+// can implement this function and customize how frequently commit cache
+// eviction occurs.
+// TODO: remove this function once we can configure commit cache to be very
+// small so that eviction occurs very frequently. This requires the commit
+// cache entry to be able to encode prepare and commit sequence numbers so that
+// the commit sequence number does not have to be within a certain range of
+// prepare sequence number.
+extern "C" bool rocksdb_write_prepared_TEST_ShouldClearCommitCache(void)
+    __attribute__((__weak__));
+
+namespace ROCKSDB_NAMESPACE {
+
+Status WritePreparedTxnDB::Initialize(
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles) {
+  auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+  assert(dbimpl != nullptr);
+  auto rtxns = dbimpl->recovered_transactions();
+  std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt;
+  for (auto rtxn : rtxns) {
+    // There should only one batch for WritePrepared policy.
+    assert(rtxn.second->batches_.size() == 1);
+    const auto& seq = rtxn.second->batches_.begin()->first;
+    const auto& batch_info = rtxn.second->batches_.begin()->second;
+    auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
+    ordered_seq_cnt[seq] = cnt;
+  }
+  // AddPrepared must be called in order
+  for (auto seq_cnt : ordered_seq_cnt) {
+    auto seq = seq_cnt.first;
+    auto cnt = seq_cnt.second;
+    for (size_t i = 0; i < cnt; i++) {
+      AddPrepared(seq + i);
+    }
+  }
+  SequenceNumber prev_max = max_evicted_seq_;
+  SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
+  AdvanceMaxEvictedSeq(prev_max, last_seq);
+  // Create a gap between max and the next snapshot. This simplifies the logic
+  // in IsInSnapshot by not having to consider the special case of max ==
+  // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest.
+  if (last_seq) {
+    db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1);
+    db_impl_->versions_->SetLastSequence(last_seq + 1);
+    db_impl_->versions_->SetLastPublishedSequence(last_seq + 1);
+  }
+
+  db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));
+  // A callback to commit a single sub-batch
+  class CommitSubBatchPreReleaseCallback : public PreReleaseCallback {
+   public:
+    explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db)
+        : db_(db) {}
+    Status Callback(SequenceNumber commit_seq,
+                    bool is_mem_disabled __attribute__((__unused__)), uint64_t,
+                    size_t /*index*/, size_t /*total*/) override {
+      assert(!is_mem_disabled);
+      db_->AddCommitted(commit_seq, commit_seq);
+      return Status::OK();
+    }
+
+   private:
+    WritePreparedTxnDB* db_;
+  };
+  db_impl_->SetRecoverableStatePreReleaseCallback(
+      new CommitSubBatchPreReleaseCallback(this));
+
+  auto s = PessimisticTransactionDB::Initialize(compaction_enabled_cf_indices,
+                                                handles);
+  return s;
+}
+
+Status WritePreparedTxnDB::VerifyCFOptions(
+    const ColumnFamilyOptions& cf_options) {
+  Status s = PessimisticTransactionDB::VerifyCFOptions(cf_options);
+  if (!s.ok()) {
+    return s;
+  }
+  if (!cf_options.memtable_factory->CanHandleDuplicatedKey()) {
+    return Status::InvalidArgument(
+        "memtable_factory->CanHandleDuplicatedKey() cannot be false with "
+        "WritePrpeared transactions");
+  }
+  return Status::OK();
+}
+
+Transaction* WritePreparedTxnDB::BeginTransaction(
+    const WriteOptions& write_options, const TransactionOptions& txn_options,
+    Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new WritePreparedTxn(this, write_options, txn_options);
+  }
+}
+
+Status WritePreparedTxnDB::Write(const WriteOptions& opts,
+                                 WriteBatch* updates) {
+  if (txn_db_options_.skip_concurrency_control) {
+    // Skip locking the rows
+    const size_t UNKNOWN_BATCH_CNT = 0;
+    WritePreparedTxn* NO_TXN = nullptr;
+    return WriteInternal(opts, updates, UNKNOWN_BATCH_CNT, NO_TXN);
+  } else {
+    return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates);
+  }
+}
+
+Status WritePreparedTxnDB::Write(
+    const WriteOptions& opts,
+    const TransactionDBWriteOptimizations& optimizations, WriteBatch* updates) {
+  if (optimizations.skip_concurrency_control) {
+    // Skip locking the rows
+    const size_t UNKNOWN_BATCH_CNT = 0;
+    const size_t ONE_BATCH_CNT = 1;
+    const size_t batch_cnt = optimizations.skip_duplicate_key_check
+                                 ? ONE_BATCH_CNT
+                                 : UNKNOWN_BATCH_CNT;
+    WritePreparedTxn* NO_TXN = nullptr;
+    return WriteInternal(opts, updates, batch_cnt, NO_TXN);
+  } else {
+    // TODO(myabandeh): Make use of skip_duplicate_key_check hint
+    // Fall back to unoptimized version
+    return PessimisticTransactionDB::WriteWithConcurrencyControl(opts, updates);
+  }
+}
+
+Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig,
+                                         WriteBatch* batch, size_t batch_cnt,
+                                         WritePreparedTxn* txn) {
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "CommitBatchInternal");
+  if (batch->Count() == 0) {
+    // Otherwise our 1 seq per batch logic will break since there is no seq
+    // increased for this batch.
+    return Status::OK();
+  }
+
+  if (write_options_orig.protection_bytes_per_key > 0) {
+    auto s = WriteBatchInternal::UpdateProtectionInfo(
+        batch, write_options_orig.protection_bytes_per_key);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (batch_cnt == 0) {  // not provided, then compute it
+    // TODO(myabandeh): add an option to allow user skipping this cost
+    SubBatchCounter counter(*GetCFComparatorMap());
+    auto s = batch->Iterate(&counter);
+    if (!s.ok()) {
+      return s;
+    }
+    batch_cnt = counter.BatchCount();
+    WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD);
+    ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches",
+                      static_cast<uint64_t>(batch_cnt));
+  }
+  assert(batch_cnt);
+
+  bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+  WriteOptions write_options(write_options_orig);
+  // In the absence of Prepare markers, use Noop as a batch separator
+  auto s = WriteBatchInternal::InsertNoop(batch);
+  assert(s.ok());
+  const bool DISABLE_MEMTABLE = true;
+  const uint64_t no_log_ref = 0;
+  uint64_t seq_used = kMaxSequenceNumber;
+  const size_t ZERO_PREPARES = 0;
+  const bool kSeperatePrepareCommitBatches = true;
+  // Since this is not 2pc, there is no need for AddPrepared but having it in
+  // the PreReleaseCallback enables an optimization. Refer to
+  // SmallestUnCommittedSeq for more details.
+  AddPreparedCallback add_prepared_callback(
+      this, db_impl_, batch_cnt,
+      db_impl_->immutable_db_options().two_write_queues,
+      !kSeperatePrepareCommitBatches);
+  WritePreparedCommitEntryPreReleaseCallback update_commit_map(
+      this, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, batch_cnt);
+  PreReleaseCallback* pre_release_callback;
+  if (do_one_write) {
+    pre_release_callback = &update_commit_map;
+  } else {
+    pre_release_callback = &add_prepared_callback;
+  }
+  s = db_impl_->WriteImpl(write_options, batch, nullptr, nullptr, no_log_ref,
+                          !DISABLE_MEMTABLE, &seq_used, batch_cnt,
+                          pre_release_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  uint64_t prepare_seq = seq_used;
+  if (txn != nullptr) {
+    txn->SetId(prepare_seq);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  if (do_one_write) {
+    return s;
+  }  // else do the 2nd write for commit
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "CommitBatchInternal 2nd write prepare_seq: %" PRIu64,
+                    prepare_seq);
+  // Commit the batch by writing an empty batch to the 2nd queue that will
+  // release the commit sequence number to readers.
+  const size_t ZERO_COMMITS = 0;
+  WritePreparedCommitEntryPreReleaseCallback update_commit_map_with_prepare(
+      this, db_impl_, prepare_seq, batch_cnt, ZERO_COMMITS);
+  WriteBatch empty_batch;
+  write_options.disableWAL = true;
+  write_options.sync = false;
+  const size_t ONE_BATCH = 1;  // Just to inc the seq
+  s = db_impl_->WriteImpl(write_options, &empty_batch, nullptr, nullptr,
+                          no_log_ref, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_prepare);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  // Note: RemovePrepared is called from within PreReleaseCallback
+  return s;
+}
+
+Status WritePreparedTxnDB::Get(const ReadOptions& options,
+                               ColumnFamilyHandle* column_family,
+                               const Slice& key, PinnableSlice* value) {
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WritePreparedTxnReadCallback callback(this, snap_seq, min_uncommitted,
+                                        backed_by_snapshot);
+  bool* dont_care = nullptr;
+  DBImpl::GetImplOptions get_impl_options;
+  get_impl_options.column_family = column_family;
+  get_impl_options.value = value;
+  get_impl_options.value_found = dont_care;
+  get_impl_options.callback = &callback;
+  auto res = db_impl_->GetImpl(options, key, get_impl_options);
+  if (LIKELY(callback.valid() && ValidateSnapshot(callback.max_visible_seq(),
+                                                  backed_by_snapshot))) {
+    return res;
+  } else {
+    res.PermitUncheckedError();
+    WPRecordTick(TXN_GET_TRY_AGAIN);
+    return Status::TryAgain();
+  }
+}
+
+void WritePreparedTxnDB::UpdateCFComparatorMap(
+    const std::vector<ColumnFamilyHandle*>& handles) {
+  auto cf_map = new std::map<uint32_t, const Comparator*>();
+  auto handle_map = new std::map<uint32_t, ColumnFamilyHandle*>();
+  for (auto h : handles) {
+    auto id = h->GetID();
+    const Comparator* comparator = h->GetComparator();
+    (*cf_map)[id] = comparator;
+    if (id != 0) {
+      (*handle_map)[id] = h;
+    } else {
+      // The pointer to the default cf handle in the handles will be deleted.
+      // Use the pointer maintained by the db instead.
+      (*handle_map)[id] = DefaultColumnFamily();
+    }
+  }
+  cf_map_.reset(cf_map);
+  handle_map_.reset(handle_map);
+}
+
+void WritePreparedTxnDB::UpdateCFComparatorMap(ColumnFamilyHandle* h) {
+  auto old_cf_map_ptr = cf_map_.get();
+  assert(old_cf_map_ptr);
+  auto cf_map = new std::map<uint32_t, const Comparator*>(*old_cf_map_ptr);
+  auto old_handle_map_ptr = handle_map_.get();
+  assert(old_handle_map_ptr);
+  auto handle_map =
+      new std::map<uint32_t, ColumnFamilyHandle*>(*old_handle_map_ptr);
+  auto id = h->GetID();
+  const Comparator* comparator = h->GetComparator();
+  (*cf_map)[id] = comparator;
+  (*handle_map)[id] = h;
+  cf_map_.reset(cf_map);
+  handle_map_.reset(handle_map);
+}
+
+std::vector<Status> WritePreparedTxnDB::MultiGet(
+    const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  assert(values);
+  size_t num_keys = keys.size();
+  values->resize(num_keys);
+
+  std::vector<Status> stat_list(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    stat_list[i] = this->Get(options, column_family[i], keys[i], &(*values)[i]);
+  }
+  return stat_list;
+}
+
+// Struct to hold ownership of snapshot and read callback for iterator cleanup.
+struct WritePreparedTxnDB::IteratorState {
+  IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
+                std::shared_ptr<ManagedSnapshot> s,
+                SequenceNumber min_uncommitted)
+      : callback(txn_db, sequence, min_uncommitted, kBackedByDBSnapshot),
+        snapshot(s) {}
+
+  WritePreparedTxnReadCallback callback;
+  std::shared_ptr<ManagedSnapshot> snapshot;
+};
+
+namespace {
+static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) {
+  delete reinterpret_cast<WritePreparedTxnDB::IteratorState*>(arg1);
+}
+}  // anonymous namespace
+
+Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& options,
+                                          ColumnFamilyHandle* column_family) {
+  constexpr bool expose_blob_index = false;
+  constexpr bool allow_refresh = false;
+  std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+  SequenceNumber snapshot_seq = kMaxSequenceNumber;
+  SequenceNumber min_uncommitted = 0;
+  if (options.snapshot != nullptr) {
+    snapshot_seq = options.snapshot->GetSequenceNumber();
+    min_uncommitted =
+        static_cast_with_check<const SnapshotImpl>(options.snapshot)
+            ->min_uncommitted_;
+  } else {
+    auto* snapshot = GetSnapshot();
+    // We take a snapshot to make sure that the related data in the commit map
+    // are not deleted.
+    snapshot_seq = snapshot->GetSequenceNumber();
+    min_uncommitted =
+        static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+    own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+  }
+  assert(snapshot_seq != kMaxSequenceNumber);
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  auto* state =
+      new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted);
+  auto* db_iter =
+      db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback,
+                                expose_blob_index, allow_refresh);
+  db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr);
+  return db_iter;
+}
+
+Status WritePreparedTxnDB::NewIterators(
+    const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>& column_families,
+    std::vector<Iterator*>* iterators) {
+  constexpr bool expose_blob_index = false;
+  constexpr bool allow_refresh = false;
+  std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+  SequenceNumber snapshot_seq = kMaxSequenceNumber;
+  SequenceNumber min_uncommitted = 0;
+  if (options.snapshot != nullptr) {
+    snapshot_seq = options.snapshot->GetSequenceNumber();
+    min_uncommitted =
+        static_cast_with_check<const SnapshotImpl>(options.snapshot)
+            ->min_uncommitted_;
+  } else {
+    auto* snapshot = GetSnapshot();
+    // We take a snapshot to make sure that the related data in the commit map
+    // are not deleted.
+    snapshot_seq = snapshot->GetSequenceNumber();
+    own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+    min_uncommitted =
+        static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+  }
+  iterators->clear();
+  iterators->reserve(column_families.size());
+  for (auto* column_family : column_families) {
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+    auto* state =
+        new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted);
+    auto* db_iter =
+        db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback,
+                                  expose_blob_index, allow_refresh);
+    db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr);
+    iterators->push_back(db_iter);
+  }
+  return Status::OK();
+}
+
+void WritePreparedTxnDB::Init(const TransactionDBOptions& txn_db_opts) {
+  // Adcance max_evicted_seq_ no more than 100 times before the cache wraps
+  // around.
+  INC_STEP_FOR_MAX_EVICTED =
+      std::max(COMMIT_CACHE_SIZE / 100, static_cast<size_t>(1));
+  snapshot_cache_ = std::unique_ptr<std::atomic<SequenceNumber>[]>(
+      new std::atomic<SequenceNumber>[SNAPSHOT_CACHE_SIZE] {});
+  commit_cache_ = std::unique_ptr<std::atomic<CommitEntry64b>[]>(
+      new std::atomic<CommitEntry64b>[COMMIT_CACHE_SIZE] {});
+  dummy_max_snapshot_.number_ = kMaxSequenceNumber;
+  rollback_deletion_type_callback_ =
+      txn_db_opts.rollback_deletion_type_callback;
+}
+
+void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max,
+                                                 bool locked) {
+  // When max_evicted_seq_ advances, move older entries from prepared_txns_
+  // to delayed_prepared_. This guarantees that if a seq is lower than max,
+  // then it is not in prepared_txns_ and save an expensive, synchronized
+  // lookup from a shared set. delayed_prepared_ is expected to be empty in
+  // normal cases.
+  ROCKS_LOG_DETAILS(
+      info_log_,
+      "CheckPreparedAgainstMax prepared_txns_.empty() %d top: %" PRIu64,
+      prepared_txns_.empty(),
+      prepared_txns_.empty() ? 0 : prepared_txns_.top());
+  const SequenceNumber prepared_top = prepared_txns_.top();
+  const bool empty = prepared_top == kMaxSequenceNumber;
+  // Preliminary check to avoid the synchronization cost
+  if (!empty && prepared_top <= new_max) {
+    if (locked) {
+      // Needed to avoid double locking in pop().
+      prepared_txns_.push_pop_mutex()->Unlock();
+    }
+    WriteLock wl(&prepared_mutex_);
+    // Need to fetch fresh values of ::top after mutex is acquired
+    while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) {
+      auto to_be_popped = prepared_txns_.top();
+      delayed_prepared_.insert(to_be_popped);
+      ROCKS_LOG_WARN(info_log_,
+                     "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64
+                     " new_max=%" PRIu64 ")",
+                     static_cast<uint64_t>(delayed_prepared_.size()),
+                     to_be_popped, new_max);
+      delayed_prepared_empty_.store(false, std::memory_order_release);
+      // Update prepared_txns_ after updating delayed_prepared_empty_ otherwise
+      // there will be a point in time that the entry is neither in
+      // prepared_txns_ nor in delayed_prepared_, which will not be checked if
+      // delayed_prepared_empty_ is false.
+      prepared_txns_.pop();
+    }
+    if (locked) {
+      prepared_txns_.push_pop_mutex()->Lock();
+    }
+  }
+}
+
+void WritePreparedTxnDB::AddPrepared(uint64_t seq, bool locked) {
+  ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Preparing with max %" PRIu64,
+                    seq, max_evicted_seq_.load());
+  TEST_SYNC_POINT("AddPrepared::begin:pause");
+  TEST_SYNC_POINT("AddPrepared::begin:resume");
+  if (!locked) {
+    prepared_txns_.push_pop_mutex()->Lock();
+  }
+  prepared_txns_.push_pop_mutex()->AssertHeld();
+  prepared_txns_.push(seq);
+  auto new_max = future_max_evicted_seq_.load();
+  if (UNLIKELY(seq <= new_max)) {
+    // This should not happen in normal case
+    ROCKS_LOG_ERROR(
+        info_log_,
+        "Added prepare_seq is not larger than max_evicted_seq_: %" PRIu64
+        " <= %" PRIu64,
+        seq, new_max);
+    CheckPreparedAgainstMax(new_max, true /*locked*/);
+  }
+  if (!locked) {
+    prepared_txns_.push_pop_mutex()->Unlock();
+  }
+  TEST_SYNC_POINT("AddPrepared::end");
+}
+
+void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, uint64_t commit_seq,
+                                      uint8_t loop_cnt) {
+  ROCKS_LOG_DETAILS(info_log_, "Txn %" PRIu64 " Committing with %" PRIu64,
+                    prepare_seq, commit_seq);
+  TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start");
+  TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:start:pause");
+  auto indexed_seq = prepare_seq % COMMIT_CACHE_SIZE;
+  CommitEntry64b evicted_64b;
+  CommitEntry evicted;
+  bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted_64b, &evicted);
+  if (LIKELY(to_be_evicted)) {
+    assert(evicted.prep_seq != prepare_seq);
+    auto prev_max = max_evicted_seq_.load(std::memory_order_acquire);
+    ROCKS_LOG_DETAILS(info_log_,
+                      "Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64,
+                      evicted.prep_seq, evicted.commit_seq, prev_max);
+    if (prev_max < evicted.commit_seq) {
+      auto last = db_impl_->GetLastPublishedSequence();  // could be 0
+      SequenceNumber max_evicted_seq;
+      if (LIKELY(evicted.commit_seq < last)) {
+        assert(last > 0);
+        // Inc max in larger steps to avoid frequent updates
+        max_evicted_seq =
+            std::min(evicted.commit_seq + INC_STEP_FOR_MAX_EVICTED, last - 1);
+      } else {
+        // legit when a commit entry in a write batch overwrite the previous one
+        max_evicted_seq = evicted.commit_seq;
+      }
+#ifdef OS_LINUX
+      if (rocksdb_write_prepared_TEST_ShouldClearCommitCache &&
+          rocksdb_write_prepared_TEST_ShouldClearCommitCache()) {
+        max_evicted_seq = last;
+      }
+#endif  // OS_LINUX
+      ROCKS_LOG_DETAILS(info_log_,
+                        "%lu Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64
+                        " => %lu",
+                        prepare_seq, evicted.prep_seq, evicted.commit_seq,
+                        prev_max, max_evicted_seq);
+      AdvanceMaxEvictedSeq(prev_max, max_evicted_seq);
+    }
+    if (UNLIKELY(!delayed_prepared_empty_.load(std::memory_order_acquire))) {
+      WriteLock wl(&prepared_mutex_);
+      auto dp_iter = delayed_prepared_.find(evicted.prep_seq);
+      if (dp_iter != delayed_prepared_.end()) {
+        // This is a rare case that txn is committed but prepared_txns_ is not
+        // cleaned up yet. Refer to delayed_prepared_commits_ definition for
+        // why it should be kept updated.
+        delayed_prepared_commits_[evicted.prep_seq] = evicted.commit_seq;
+        ROCKS_LOG_DEBUG(info_log_,
+                        "delayed_prepared_commits_[%" PRIu64 "]=%" PRIu64,
+                        evicted.prep_seq, evicted.commit_seq);
+      }
+    }
+    // After each eviction from commit cache, check if the commit entry should
+    // be kept around because it overlaps with a live snapshot.
+    CheckAgainstSnapshots(evicted);
+  }
+  bool succ =
+      ExchangeCommitEntry(indexed_seq, evicted_64b, {prepare_seq, commit_seq});
+  if (UNLIKELY(!succ)) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "ExchangeCommitEntry failed on [%" PRIu64 "] %" PRIu64
+                    ",%" PRIu64 " retrying...",
+                    indexed_seq, prepare_seq, commit_seq);
+    // A very rare event, in which the commit entry is updated before we do.
+    // Here we apply a very simple solution of retrying.
+    if (loop_cnt > 100) {
+      throw std::runtime_error("Infinite loop in AddCommitted!");
+    }
+    AddCommitted(prepare_seq, commit_seq, ++loop_cnt);
+    return;
+  }
+  TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end");
+  TEST_SYNC_POINT("WritePreparedTxnDB::AddCommitted:end:pause");
+}
+
+void WritePreparedTxnDB::RemovePrepared(const uint64_t prepare_seq,
+                                        const size_t batch_cnt) {
+  TEST_SYNC_POINT_CALLBACK(
+      "RemovePrepared:Start",
+      const_cast<void*>(reinterpret_cast<const void*>(&prepare_seq)));
+  TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:pause");
+  TEST_SYNC_POINT("WritePreparedTxnDB::RemovePrepared:resume");
+  ROCKS_LOG_DETAILS(info_log_,
+                    "RemovePrepared %" PRIu64 " cnt: %" ROCKSDB_PRIszt,
+                    prepare_seq, batch_cnt);
+  WriteLock wl(&prepared_mutex_);
+  for (size_t i = 0; i < batch_cnt; i++) {
+    prepared_txns_.erase(prepare_seq + i);
+    bool was_empty = delayed_prepared_.empty();
+    if (!was_empty) {
+      delayed_prepared_.erase(prepare_seq + i);
+      auto it = delayed_prepared_commits_.find(prepare_seq + i);
+      if (it != delayed_prepared_commits_.end()) {
+        ROCKS_LOG_DETAILS(info_log_, "delayed_prepared_commits_.erase %" PRIu64,
+                          prepare_seq + i);
+        delayed_prepared_commits_.erase(it);
+      }
+      bool is_empty = delayed_prepared_.empty();
+      if (was_empty != is_empty) {
+        delayed_prepared_empty_.store(is_empty, std::memory_order_release);
+      }
+    }
+  }
+}
+
+bool WritePreparedTxnDB::GetCommitEntry(const uint64_t indexed_seq,
+                                        CommitEntry64b* entry_64b,
+                                        CommitEntry* entry) const {
+  *entry_64b = commit_cache_[static_cast<size_t>(indexed_seq)].load(
+      std::memory_order_acquire);
+  bool valid = entry_64b->Parse(indexed_seq, entry, FORMAT);
+  return valid;
+}
+
+bool WritePreparedTxnDB::AddCommitEntry(const uint64_t indexed_seq,
+                                        const CommitEntry& new_entry,
+                                        CommitEntry* evicted_entry) {
+  CommitEntry64b new_entry_64b(new_entry, FORMAT);
+  CommitEntry64b evicted_entry_64b =
+      commit_cache_[static_cast<size_t>(indexed_seq)].exchange(
+          new_entry_64b, std::memory_order_acq_rel);
+  bool valid = evicted_entry_64b.Parse(indexed_seq, evicted_entry, FORMAT);
+  return valid;
+}
+
+bool WritePreparedTxnDB::ExchangeCommitEntry(const uint64_t indexed_seq,
+                                             CommitEntry64b& expected_entry_64b,
+                                             const CommitEntry& new_entry) {
+  auto& atomic_entry = commit_cache_[static_cast<size_t>(indexed_seq)];
+  CommitEntry64b new_entry_64b(new_entry, FORMAT);
+  bool succ = atomic_entry.compare_exchange_strong(
+      expected_entry_64b, new_entry_64b, std::memory_order_acq_rel,
+      std::memory_order_acquire);
+  return succ;
+}
+
+void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
+                                              const SequenceNumber& new_max) {
+  ROCKS_LOG_DETAILS(info_log_,
+                    "AdvanceMaxEvictedSeq overhead %" PRIu64 " => %" PRIu64,
+                    prev_max, new_max);
+  // Declare the intention before getting snapshot from the DB. This helps a
+  // concurrent GetSnapshot to wait to catch up with future_max_evicted_seq_ if
+  // it has not already. Otherwise the new snapshot is when we ask DB for
+  // snapshots smaller than future max.
+  auto updated_future_max = prev_max;
+  while (updated_future_max < new_max &&
+         !future_max_evicted_seq_.compare_exchange_weak(
+             updated_future_max, new_max, std::memory_order_acq_rel,
+             std::memory_order_relaxed)) {
+  };
+
+  CheckPreparedAgainstMax(new_max, false /*locked*/);
+
+  // With each change to max_evicted_seq_ fetch the live snapshots behind it.
+  // We use max as the version of snapshots to identify how fresh are the
+  // snapshot list. This works because the snapshots are between 0 and
+  // max, so the larger the max, the more complete they are.
+  SequenceNumber new_snapshots_version = new_max;
+  std::vector<SequenceNumber> snapshots;
+  bool update_snapshots = false;
+  if (new_snapshots_version > snapshots_version_) {
+    // This is to avoid updating the snapshots_ if it already updated
+    // with a more recent vesion by a concrrent thread
+    update_snapshots = true;
+    // We only care about snapshots lower then max
+    snapshots = GetSnapshotListFromDB(new_max);
+  }
+  if (update_snapshots) {
+    UpdateSnapshots(snapshots, new_snapshots_version);
+    if (!snapshots.empty()) {
+      WriteLock wl(&old_commit_map_mutex_);
+      for (auto snap : snapshots) {
+        // This allows IsInSnapshot to tell apart the reads from in valid
+        // snapshots from the reads from committed values in valid snapshots.
+        old_commit_map_[snap];
+      }
+      old_commit_map_empty_.store(false, std::memory_order_release);
+    }
+  }
+  auto updated_prev_max = prev_max;
+  TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:pause");
+  TEST_SYNC_POINT("AdvanceMaxEvictedSeq::update_max:resume");
+  while (updated_prev_max < new_max &&
+         !max_evicted_seq_.compare_exchange_weak(updated_prev_max, new_max,
+                                                 std::memory_order_acq_rel,
+                                                 std::memory_order_relaxed)) {
+  };
+}
+
+const Snapshot* WritePreparedTxnDB::GetSnapshot() {
+  const bool kForWWConflictCheck = true;
+  return GetSnapshotInternal(!kForWWConflictCheck);
+}
+
+SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal(
+    bool for_ww_conflict_check) {
+  // Note: for this optimization setting the last sequence number and obtaining
+  // the smallest uncommitted seq should be done atomically. However to avoid
+  // the mutex overhead, we call SmallestUnCommittedSeq BEFORE taking the
+  // snapshot. Since we always updated the list of unprepared seq (via
+  // AddPrepared) AFTER the last sequence is updated, this guarantees that the
+  // smallest uncommitted seq that we pair with the snapshot is smaller or equal
+  // the value that would be obtained otherwise atomically. That is ok since
+  // this optimization works as long as min_uncommitted is less than or equal
+  // than the smallest uncommitted seq when the snapshot was taken.
+  auto min_uncommitted = WritePreparedTxnDB::SmallestUnCommittedSeq();
+  SnapshotImpl* snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check);
+  TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:first");
+  assert(snap_impl);
+  SequenceNumber snap_seq = snap_impl->GetSequenceNumber();
+  // Note: Check against future_max_evicted_seq_ (in contrast with
+  // max_evicted_seq_) in case there is a concurrent AdvanceMaxEvictedSeq.
+  if (UNLIKELY(snap_seq != 0 && snap_seq <= future_max_evicted_seq_)) {
+    // There is a very rare case in which the commit entry evicts another commit
+    // entry that is not published yet thus advancing max evicted seq beyond the
+    // last published seq. This case is not likely in real-world setup so we
+    // handle it with a few retries.
+    size_t retry = 0;
+    SequenceNumber max;
+    while ((max = future_max_evicted_seq_.load()) != 0 &&
+           snap_impl->GetSequenceNumber() <= max && retry < 100) {
+      ROCKS_LOG_WARN(info_log_,
+                     "GetSnapshot snap: %" PRIu64 " max: %" PRIu64
+                     " retry %" ROCKSDB_PRIszt,
+                     snap_impl->GetSequenceNumber(), max, retry);
+      ReleaseSnapshot(snap_impl);
+      // Wait for last visible seq to catch up with max, and also go beyond it
+      // by one.
+      AdvanceSeqByOne();
+      snap_impl = db_impl_->GetSnapshotImpl(for_ww_conflict_check);
+      assert(snap_impl);
+      retry++;
+    }
+    assert(snap_impl->GetSequenceNumber() > max);
+    if (snap_impl->GetSequenceNumber() <= max) {
+      throw std::runtime_error(
+          "Snapshot seq " + std::to_string(snap_impl->GetSequenceNumber()) +
+          " after " + std::to_string(retry) +
+          " retries is still less than futre_max_evicted_seq_" +
+          std::to_string(max));
+    }
+  }
+  EnhanceSnapshot(snap_impl, min_uncommitted);
+  ROCKS_LOG_DETAILS(
+      db_impl_->immutable_db_options().info_log,
+      "GetSnapshot %" PRIu64 " ww:%" PRIi32 " min_uncommitted: %" PRIu64,
+      snap_impl->GetSequenceNumber(), for_ww_conflict_check, min_uncommitted);
+  TEST_SYNC_POINT("WritePreparedTxnDB::GetSnapshotInternal:end");
+  return snap_impl;
+}
+
+void WritePreparedTxnDB::AdvanceSeqByOne() {
+  // Inserting an empty value will i) let the max evicted entry to be
+  // published, i.e., max == last_published, increase the last published to
+  // be one beyond max, i.e., max < last_published.
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  Transaction* txn0 = BeginTransaction(woptions, txn_options, nullptr);
+  std::hash<std::thread::id> hasher;
+  char name[64];
+  snprintf(name, 64, "txn%" ROCKSDB_PRIszt, hasher(std::this_thread::get_id()));
+  assert(strlen(name) < 64 - 1);
+  Status s = txn0->SetName(name);
+  assert(s.ok());
+  if (s.ok()) {
+    // Without prepare it would simply skip the commit
+    s = txn0->Prepare();
+  }
+  assert(s.ok());
+  if (s.ok()) {
+    s = txn0->Commit();
+  }
+  assert(s.ok());
+  delete txn0;
+}
+
+const std::vector<SequenceNumber> WritePreparedTxnDB::GetSnapshotListFromDB(
+    SequenceNumber max) {
+  ROCKS_LOG_DETAILS(info_log_, "GetSnapshotListFromDB with max %" PRIu64, max);
+  InstrumentedMutexLock dblock(db_impl_->mutex());
+  db_impl_->mutex()->AssertHeld();
+  return db_impl_->snapshots().GetAll(nullptr, max);
+}
+
+void WritePreparedTxnDB::ReleaseSnapshotInternal(
+    const SequenceNumber snap_seq) {
+  // TODO(myabandeh): relax should enough since the synchronizatin is already
+  // done by snapshots_mutex_ under which this function is called.
+  if (snap_seq <= max_evicted_seq_.load(std::memory_order_acquire)) {
+    // Then this is a rare case that transaction did not finish before max
+    // advances. It is expected for a few read-only backup snapshots. For such
+    // snapshots we might have kept around a couple of entries in the
+    // old_commit_map_. Check and do garbage collection if that is the case.
+    bool need_gc = false;
+    {
+      WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+      ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64,
+                     snap_seq);
+      ReadLock rl(&old_commit_map_mutex_);
+      auto prep_set_entry = old_commit_map_.find(snap_seq);
+      need_gc = prep_set_entry != old_commit_map_.end();
+    }
+    if (need_gc) {
+      WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+      ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead for %" PRIu64,
+                     snap_seq);
+      WriteLock wl(&old_commit_map_mutex_);
+      old_commit_map_.erase(snap_seq);
+      old_commit_map_empty_.store(old_commit_map_.empty(),
+                                  std::memory_order_release);
+    }
+  }
+}
+
+void WritePreparedTxnDB::CleanupReleasedSnapshots(
+    const std::vector<SequenceNumber>& new_snapshots,
+    const std::vector<SequenceNumber>& old_snapshots) {
+  auto newi = new_snapshots.begin();
+  auto oldi = old_snapshots.begin();
+  for (; newi != new_snapshots.end() && oldi != old_snapshots.end();) {
+    assert(*newi >= *oldi);  // cannot have new snapshots with lower seq
+    if (*newi == *oldi) {    // still not released
+      auto value = *newi;
+      while (newi != new_snapshots.end() && *newi == value) {
+        newi++;
+      }
+      while (oldi != old_snapshots.end() && *oldi == value) {
+        oldi++;
+      }
+    } else {
+      assert(*newi > *oldi);  // *oldi is released
+      ReleaseSnapshotInternal(*oldi);
+      oldi++;
+    }
+  }
+  // Everything remained in old_snapshots is released and must be cleaned up
+  for (; oldi != old_snapshots.end(); oldi++) {
+    ReleaseSnapshotInternal(*oldi);
+  }
+}
+
+void WritePreparedTxnDB::UpdateSnapshots(
+    const std::vector<SequenceNumber>& snapshots,
+    const SequenceNumber& version) {
+  ROCKS_LOG_DETAILS(info_log_, "UpdateSnapshots with version %" PRIu64,
+                    version);
+  TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:start");
+  TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:start");
+#ifndef NDEBUG
+  size_t sync_i = 0;
+#endif
+  ROCKS_LOG_DETAILS(info_log_, "snapshots_mutex_ overhead");
+  WriteLock wl(&snapshots_mutex_);
+  snapshots_version_ = version;
+  // We update the list concurrently with the readers.
+  // Both new and old lists are sorted and the new list is subset of the
+  // previous list plus some new items. Thus if a snapshot repeats in
+  // both new and old lists, it will appear upper in the new list. So if
+  // we simply insert the new snapshots in order, if an overwritten item
+  // is still valid in the new list is either written to the same place in
+  // the array or it is written in a higher palce before it gets
+  // overwritten by another item. This guarantess a reader that reads the
+  // list bottom-up will eventaully see a snapshot that repeats in the
+  // update, either before it gets overwritten by the writer or
+  // afterwards.
+  size_t i = 0;
+  auto it = snapshots.begin();
+  for (; it != snapshots.end() && i < SNAPSHOT_CACHE_SIZE; ++it, ++i) {
+    snapshot_cache_[i].store(*it, std::memory_order_release);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", ++sync_i);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i);
+  }
+#ifndef NDEBUG
+  // Release the remaining sync points since they are useless given that the
+  // reader would also use lock to access snapshots
+  for (++sync_i; sync_i <= 10; ++sync_i) {
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:", sync_i);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:", sync_i);
+  }
+#endif
+  snapshots_.clear();
+  for (; it != snapshots.end(); ++it) {
+    // Insert them to a vector that is less efficient to access
+    // concurrently
+    snapshots_.push_back(*it);
+  }
+  // Update the size at the end. Otherwise a parallel reader might read
+  // items that are not set yet.
+  snapshots_total_.store(snapshots.size(), std::memory_order_release);
+
+  // Note: this must be done after the snapshots data structures are updated
+  // with the new list of snapshots.
+  CleanupReleasedSnapshots(snapshots, snapshots_all_);
+  snapshots_all_ = snapshots;
+
+  TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:p:end");
+  TEST_SYNC_POINT("WritePreparedTxnDB::UpdateSnapshots:s:end");
+}
+
+void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
+  TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:start");
+  TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:start");
+#ifndef NDEBUG
+  size_t sync_i = 0;
+#endif
+  // First check the snapshot cache that is efficient for concurrent access
+  auto cnt = snapshots_total_.load(std::memory_order_acquire);
+  // The list might get updated concurrently as we are reading from it. The
+  // reader should be able to read all the snapshots that are still valid
+  // after the update. Since the survived snapshots are written in a higher
+  // place before gets overwritten the reader that reads bottom-up will
+  // eventully see it.
+  const bool next_is_larger = true;
+  // We will set to true if the border line snapshot suggests that.
+  bool search_larger_list = false;
+  size_t ip1 = std::min(cnt, SNAPSHOT_CACHE_SIZE);
+  for (; 0 < ip1; ip1--) {
+    SequenceNumber snapshot_seq =
+        snapshot_cache_[ip1 - 1].load(std::memory_order_acquire);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:",
+                        ++sync_i);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i);
+    if (ip1 == SNAPSHOT_CACHE_SIZE) {  // border line snapshot
+      // snapshot_seq < commit_seq => larger_snapshot_seq <= commit_seq
+      // then later also continue the search to larger snapshots
+      search_larger_list = snapshot_seq < evicted.commit_seq;
+    }
+    if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq,
+                                 snapshot_seq, !next_is_larger)) {
+      break;
+    }
+  }
+#ifndef NDEBUG
+  // Release the remaining sync points before accquiring the lock
+  for (++sync_i; sync_i <= 10; ++sync_i) {
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", sync_i);
+    TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i);
+  }
+#endif
+  TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:end");
+  TEST_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:end");
+  if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && search_larger_list)) {
+    // Then access the less efficient list of snapshots_
+    WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD);
+    ROCKS_LOG_WARN(info_log_,
+                   "snapshots_mutex_ overhead for <%" PRIu64 ",%" PRIu64
+                   "> with %" ROCKSDB_PRIszt " snapshots",
+                   evicted.prep_seq, evicted.commit_seq, cnt);
+    ReadLock rl(&snapshots_mutex_);
+    // Items could have moved from the snapshots_ to snapshot_cache_ before
+    // accquiring the lock. To make sure that we do not miss a valid snapshot,
+    // read snapshot_cache_ again while holding the lock.
+    for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) {
+      SequenceNumber snapshot_seq =
+          snapshot_cache_[i].load(std::memory_order_acquire);
+      if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq,
+                                   snapshot_seq, next_is_larger)) {
+        break;
+      }
+    }
+    for (auto snapshot_seq_2 : snapshots_) {
+      if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq,
+                                   snapshot_seq_2, next_is_larger)) {
+        break;
+      }
+    }
+  }
+}
+
+bool WritePreparedTxnDB::MaybeUpdateOldCommitMap(
+    const uint64_t& prep_seq, const uint64_t& commit_seq,
+    const uint64_t& snapshot_seq, const bool next_is_larger = true) {
+  // If we do not store an entry in old_commit_map_ we assume it is committed in
+  // all snapshots. If commit_seq <= snapshot_seq, it is considered already in
+  // the snapshot so we need not to keep the entry around for this snapshot.
+  if (commit_seq <= snapshot_seq) {
+    // continue the search if the next snapshot could be smaller than commit_seq
+    return !next_is_larger;
+  }
+  // then snapshot_seq < commit_seq
+  if (prep_seq <= snapshot_seq) {  // overlapping range
+    WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+    ROCKS_LOG_WARN(info_log_,
+                   "old_commit_map_mutex_ overhead for %" PRIu64
+                   " commit entry: <%" PRIu64 ",%" PRIu64 ">",
+                   snapshot_seq, prep_seq, commit_seq);
+    WriteLock wl(&old_commit_map_mutex_);
+    old_commit_map_empty_.store(false, std::memory_order_release);
+    auto& vec = old_commit_map_[snapshot_seq];
+    vec.insert(std::upper_bound(vec.begin(), vec.end(), prep_seq), prep_seq);
+    // We need to store it once for each overlapping snapshot. Returning true to
+    // continue the search if there is more overlapping snapshot.
+    return true;
+  }
+  // continue the search if the next snapshot could be larger than prep_seq
+  return next_is_larger;
+}
+
+WritePreparedTxnDB::~WritePreparedTxnDB() {
+  // At this point there could be running compaction/flush holding a
+  // SnapshotChecker, which holds a pointer back to WritePreparedTxnDB.
+  // Make sure those jobs finished before destructing WritePreparedTxnDB.
+  if (!db_impl_->shutting_down_) {
+    db_impl_->CancelAllBackgroundWork(true /*wait*/);
+  }
+}
+
+void SubBatchCounter::InitWithComp(const uint32_t cf) {
+  auto cmp = comparators_[cf];
+  keys_[cf] = CFKeys(SetComparator(cmp));
+}
+
+void SubBatchCounter::AddKey(const uint32_t cf, const Slice& key) {
+  CFKeys& cf_keys = keys_[cf];
+  if (cf_keys.size() == 0) {  // just inserted
+    InitWithComp(cf);
+  }
+  auto it = cf_keys.insert(key);
+  if (it.second == false) {  // second is false if a element already existed.
+    batches_++;
+    keys_.clear();
+    InitWithComp(cf);
+    keys_[cf].insert(key);
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_prepared_txn_db.h b/src/rocksdb/utilities/transactions/write_prepared_txn_db.h
new file mode 100644
index 000000000..25a382473
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_prepared_txn_db.h
@@ -0,0 +1,1125 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/db_iter.h"
+#include "db/pre_release_callback.h"
+#include "db/read_callback.h"
+#include "db/snapshot_checker.h"
+#include "logging/logging.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+#include "util/set_comparator.h"
+#include "util/string_util.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+#include "utilities/transactions/write_prepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+enum SnapshotBackup : bool { kUnbackedByDBSnapshot, kBackedByDBSnapshot };
+
+// A PessimisticTransactionDB that writes data to DB after prepare phase of 2PC.
+// In this way some data in the DB might not be committed. The DB provides
+// mechanisms to tell such data apart from committed data.
+class WritePreparedTxnDB : public PessimisticTransactionDB {
+ public:
+  explicit WritePreparedTxnDB(DB* db,
+                              const TransactionDBOptions& txn_db_options)
+      : PessimisticTransactionDB(db, txn_db_options),
+        SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits),
+        SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+        COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits),
+        COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+        FORMAT(COMMIT_CACHE_BITS) {
+    Init(txn_db_options);
+  }
+
+  explicit WritePreparedTxnDB(StackableDB* db,
+                              const TransactionDBOptions& txn_db_options)
+      : PessimisticTransactionDB(db, txn_db_options),
+        SNAPSHOT_CACHE_BITS(txn_db_options.wp_snapshot_cache_bits),
+        SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+        COMMIT_CACHE_BITS(txn_db_options.wp_commit_cache_bits),
+        COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+        FORMAT(COMMIT_CACHE_BITS) {
+    Init(txn_db_options);
+  }
+
+  virtual ~WritePreparedTxnDB();
+
+  virtual Status Initialize(
+      const std::vector<size_t>& compaction_enabled_cf_indices,
+      const std::vector<ColumnFamilyHandle*>& handles) override;
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const TransactionOptions& txn_options,
+                                Transaction* old_txn) override;
+
+  using TransactionDB::Write;
+  Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  // Optimized version of ::Write that receives more optimization request such
+  // as skip_concurrency_control.
+  using PessimisticTransactionDB::Write;
+  Status Write(const WriteOptions& opts, const TransactionDBWriteOptimizations&,
+               WriteBatch* updates) override;
+
+  // Write the batch to the underlying DB and mark it as committed. Could be
+  // used by both directly from TxnDB or through a transaction.
+  Status WriteInternal(const WriteOptions& write_options, WriteBatch* batch,
+                       size_t batch_cnt, WritePreparedTxn* txn);
+
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override;
+
+  using DB::NewIterators;
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      std::vector<Iterator*>* iterators) override;
+
+  // Check whether the transaction that wrote the value with sequence number seq
+  // is visible to the snapshot with sequence number snapshot_seq.
+  // Returns true if commit_seq <= snapshot_seq
+  // If the snapshot_seq is already released and snapshot_seq <= max, sets
+  // *snap_released to true and returns true as well.
+  inline bool IsInSnapshot(uint64_t prep_seq, uint64_t snapshot_seq,
+                           uint64_t min_uncommitted = kMinUnCommittedSeq,
+                           bool* snap_released = nullptr) const {
+    ROCKS_LOG_DETAILS(info_log_,
+                      "IsInSnapshot %" PRIu64 " in %" PRIu64
+                      " min_uncommitted %" PRIu64,
+                      prep_seq, snapshot_seq, min_uncommitted);
+    assert(min_uncommitted >= kMinUnCommittedSeq);
+    // Caller is responsible to initialize snap_released.
+    assert(snap_released == nullptr || *snap_released == false);
+    // Here we try to infer the return value without looking into prepare list.
+    // This would help avoiding synchronization over a shared map.
+    // TODO(myabandeh): optimize this. This sequence of checks must be correct
+    // but not necessary efficient
+    if (prep_seq == 0) {
+      // Compaction will output keys to bottom-level with sequence number 0 if
+      // it is visible to the earliest snapshot.
+      ROCKS_LOG_DETAILS(
+          info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+          prep_seq, snapshot_seq, 1);
+      return true;
+    }
+    if (snapshot_seq < prep_seq) {
+      // snapshot_seq < prep_seq <= commit_seq => snapshot_seq < commit_seq
+      ROCKS_LOG_DETAILS(
+          info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+          prep_seq, snapshot_seq, 0);
+      return false;
+    }
+    if (prep_seq < min_uncommitted) {
+      ROCKS_LOG_DETAILS(info_log_,
+                        "IsInSnapshot %" PRIu64 " in %" PRIu64
+                        " returns %" PRId32
+                        " because of min_uncommitted %" PRIu64,
+                        prep_seq, snapshot_seq, 1, min_uncommitted);
+      return true;
+    }
+    // Commit of delayed prepared has two non-atomic steps: add to commit cache,
+    // remove from delayed prepared. Our reads from these two is also
+    // non-atomic. By looking into commit cache first thus we might not find the
+    // prep_seq neither in commit cache not in delayed_prepared_. To fix that i)
+    // we check if there was any delayed prepared BEFORE looking into commit
+    // cache, ii) if there was, we complete the search steps to be these: i)
+    // commit cache, ii) delayed prepared, commit cache again. In this way if
+    // the first query to commit cache missed the commit, the 2nd will catch it.
+    bool was_empty;
+    SequenceNumber max_evicted_seq_lb, max_evicted_seq_ub;
+    CommitEntry64b dont_care;
+    auto indexed_seq = prep_seq % COMMIT_CACHE_SIZE;
+    size_t repeats = 0;
+    do {
+      repeats++;
+      assert(repeats < 100);
+      if (UNLIKELY(repeats >= 100)) {
+        throw std::runtime_error(
+            "The read was intrupted 100 times by update to max_evicted_seq_. "
+            "This is unexpected in all setups");
+      }
+      max_evicted_seq_lb = max_evicted_seq_.load(std::memory_order_acquire);
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:pause");
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:max_evicted_seq_:resume");
+      was_empty = delayed_prepared_empty_.load(std::memory_order_acquire);
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:pause");
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:delayed_prepared_empty_:resume");
+      CommitEntry cached;
+      bool exist = GetCommitEntry(indexed_seq, &dont_care, &cached);
+      TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:pause");
+      TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:GetCommitEntry:resume");
+      if (exist && prep_seq == cached.prep_seq) {
+        // It is committed and also not evicted from commit cache
+        ROCKS_LOG_DETAILS(
+            info_log_,
+            "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+            prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq);
+        return cached.commit_seq <= snapshot_seq;
+      }
+      // else it could be committed but not inserted in the map which could
+      // happen after recovery, or it could be committed and evicted by another
+      // commit, or never committed.
+
+      // At this point we don't know if it was committed or it is still prepared
+      max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire);
+      if (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub)) {
+        continue;
+      }
+      // Note: max_evicted_seq_ when we did GetCommitEntry <= max_evicted_seq_ub
+      if (max_evicted_seq_ub < prep_seq) {
+        // Not evicted from cache and also not present, so must be still
+        // prepared
+        ROCKS_LOG_DETAILS(info_log_,
+                          "IsInSnapshot %" PRIu64 " in %" PRIu64
+                          " returns %" PRId32,
+                          prep_seq, snapshot_seq, 0);
+        return false;
+      }
+      TEST_SYNC_POINT("WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:pause");
+      TEST_SYNC_POINT(
+          "WritePreparedTxnDB::IsInSnapshot:prepared_mutex_:resume");
+      if (!was_empty) {
+        // We should not normally reach here
+        WPRecordTick(TXN_PREPARE_MUTEX_OVERHEAD);
+        ReadLock rl(&prepared_mutex_);
+        ROCKS_LOG_WARN(
+            info_log_, "prepared_mutex_ overhead %" PRIu64 " for %" PRIu64,
+            static_cast<uint64_t>(delayed_prepared_.size()), prep_seq);
+        if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) {
+          // This is the order: 1) delayed_prepared_commits_ update, 2) publish
+          // 3) delayed_prepared_ clean up. So check if it is the case of a late
+          // clenaup.
+          auto it = delayed_prepared_commits_.find(prep_seq);
+          if (it == delayed_prepared_commits_.end()) {
+            // Then it is not committed yet
+            ROCKS_LOG_DETAILS(info_log_,
+                              "IsInSnapshot %" PRIu64 " in %" PRIu64
+                              " returns %" PRId32,
+                              prep_seq, snapshot_seq, 0);
+            return false;
+          } else {
+            ROCKS_LOG_DETAILS(info_log_,
+                              "IsInSnapshot %" PRIu64 " in %" PRIu64
+                              " commit: %" PRIu64 " returns %" PRId32,
+                              prep_seq, snapshot_seq, it->second,
+                              snapshot_seq <= it->second);
+            return it->second <= snapshot_seq;
+          }
+        } else {
+          // 2nd query to commit cache. Refer to was_empty comment above.
+          exist = GetCommitEntry(indexed_seq, &dont_care, &cached);
+          if (exist && prep_seq == cached.prep_seq) {
+            ROCKS_LOG_DETAILS(
+                info_log_,
+                "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+                prep_seq, snapshot_seq, cached.commit_seq <= snapshot_seq);
+            return cached.commit_seq <= snapshot_seq;
+          }
+          max_evicted_seq_ub = max_evicted_seq_.load(std::memory_order_acquire);
+        }
+      }
+    } while (UNLIKELY(max_evicted_seq_lb != max_evicted_seq_ub));
+    // When advancing max_evicted_seq_, we move older entires from prepared to
+    // delayed_prepared_. Also we move evicted entries from commit cache to
+    // old_commit_map_ if it overlaps with any snapshot. Since prep_seq <=
+    // max_evicted_seq_, we have three cases: i) in delayed_prepared_, ii) in
+    // old_commit_map_, iii) committed with no conflict with any snapshot. Case
+    // (i) delayed_prepared_ is checked above
+    if (max_evicted_seq_ub < snapshot_seq) {  // then (ii) cannot be the case
+      // only (iii) is the case: committed
+      // commit_seq <= max_evicted_seq_ < snapshot_seq => commit_seq <
+      // snapshot_seq
+      ROCKS_LOG_DETAILS(
+          info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+          prep_seq, snapshot_seq, 1);
+      return true;
+    }
+    // else (ii) might be the case: check the commit data saved for this
+    // snapshot. If there was no overlapping commit entry, then it is committed
+    // with a commit_seq lower than any live snapshot, including snapshot_seq.
+    if (old_commit_map_empty_.load(std::memory_order_acquire)) {
+      ROCKS_LOG_DETAILS(info_log_,
+                        "IsInSnapshot %" PRIu64 " in %" PRIu64
+                        " returns %" PRId32 " released=1",
+                        prep_seq, snapshot_seq, 0);
+      assert(snap_released);
+      // This snapshot is not valid anymore. We cannot tell if prep_seq is
+      // committed before or after the snapshot. Return true but also set
+      // snap_released to true.
+      *snap_released = true;
+      return true;
+    }
+    {
+      // We should not normally reach here unless sapshot_seq is old. This is a
+      // rare case and it is ok to pay the cost of mutex ReadLock for such old,
+      // reading transactions.
+      WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
+      ReadLock rl(&old_commit_map_mutex_);
+      auto prep_set_entry = old_commit_map_.find(snapshot_seq);
+      bool found = prep_set_entry != old_commit_map_.end();
+      if (found) {
+        auto& vec = prep_set_entry->second;
+        found = std::binary_search(vec.begin(), vec.end(), prep_seq);
+      } else {
+        // coming from compaction
+        ROCKS_LOG_DETAILS(info_log_,
+                          "IsInSnapshot %" PRIu64 " in %" PRIu64
+                          " returns %" PRId32 " released=1",
+                          prep_seq, snapshot_seq, 0);
+        // This snapshot is not valid anymore. We cannot tell if prep_seq is
+        // committed before or after the snapshot. Return true but also set
+        // snap_released to true.
+        assert(snap_released);
+        *snap_released = true;
+        return true;
+      }
+
+      if (!found) {
+        ROCKS_LOG_DETAILS(info_log_,
+                          "IsInSnapshot %" PRIu64 " in %" PRIu64
+                          " returns %" PRId32,
+                          prep_seq, snapshot_seq, 1);
+        return true;
+      }
+    }
+    // (ii) it the case: it is committed but after the snapshot_seq
+    ROCKS_LOG_DETAILS(
+        info_log_, "IsInSnapshot %" PRIu64 " in %" PRIu64 " returns %" PRId32,
+        prep_seq, snapshot_seq, 0);
+    return false;
+  }
+
+  // Add the transaction with prepare sequence seq to the prepared list.
+  // Note: must be called serially with increasing seq on each call.
+  // locked is true if prepared_mutex_ is already locked.
+  void AddPrepared(uint64_t seq, bool locked = false);
+  // Check if any of the prepared txns are less than new max_evicted_seq_. Must
+  // be called with prepared_mutex_ write locked.
+  void CheckPreparedAgainstMax(SequenceNumber new_max, bool locked);
+  // Remove the transaction with prepare sequence seq from the prepared list
+  void RemovePrepared(const uint64_t seq, const size_t batch_cnt = 1);
+  // Add the transaction with prepare sequence prepare_seq and commit sequence
+  // commit_seq to the commit map. loop_cnt is to detect infinite loops.
+  // Note: must be called serially.
+  void AddCommitted(uint64_t prepare_seq, uint64_t commit_seq,
+                    uint8_t loop_cnt = 0);
+
+  struct CommitEntry {
+    uint64_t prep_seq;
+    uint64_t commit_seq;
+    CommitEntry() : prep_seq(0), commit_seq(0) {}
+    CommitEntry(uint64_t ps, uint64_t cs) : prep_seq(ps), commit_seq(cs) {}
+    bool operator==(const CommitEntry& rhs) const {
+      return prep_seq == rhs.prep_seq && commit_seq == rhs.commit_seq;
+    }
+  };
+
+  struct CommitEntry64bFormat {
+    explicit CommitEntry64bFormat(size_t index_bits)
+        : INDEX_BITS(index_bits),
+          PREP_BITS(static_cast<size_t>(64 - PAD_BITS - INDEX_BITS)),
+          COMMIT_BITS(static_cast<size_t>(64 - PREP_BITS)),
+          COMMIT_FILTER(static_cast<uint64_t>((1ull << COMMIT_BITS) - 1)),
+          DELTA_UPPERBOUND(static_cast<uint64_t>((1ull << COMMIT_BITS))) {}
+    // Number of higher bits of a sequence number that is not used. They are
+    // used to encode the value type, ...
+    const size_t PAD_BITS = static_cast<size_t>(8);
+    // Number of lower bits from prepare seq that can be skipped as they are
+    // implied by the index of the entry in the array
+    const size_t INDEX_BITS;
+    // Number of bits we use to encode the prepare seq
+    const size_t PREP_BITS;
+    // Number of bits we use to encode the commit seq.
+    const size_t COMMIT_BITS;
+    // Filter to encode/decode commit seq
+    const uint64_t COMMIT_FILTER;
+    // The value of commit_seq - prepare_seq + 1 must be less than this bound
+    const uint64_t DELTA_UPPERBOUND;
+  };
+
+  // Prepare Seq (64 bits) = PAD ... PAD PREP PREP ... PREP INDEX INDEX ...
+  // INDEX Delta Seq (64 bits)   = 0 0 0 0 0 0 0 0 0  0 0 0 DELTA DELTA ...
+  // DELTA DELTA Encoded Value         = PREP PREP .... PREP PREP DELTA DELTA
+  // ... DELTA DELTA PAD: first bits of a seq that is reserved for tagging and
+  // hence ignored PREP/INDEX: the used bits in a prepare seq number INDEX: the
+  // bits that do not have to be encoded (will be provided externally) DELTA:
+  // prep seq - commit seq + 1 Number of DELTA bits should be equal to number of
+  // index bits + PADs
+  struct CommitEntry64b {
+    constexpr CommitEntry64b() noexcept : rep_(0) {}
+
+    CommitEntry64b(const CommitEntry& entry, const CommitEntry64bFormat& format)
+        : CommitEntry64b(entry.prep_seq, entry.commit_seq, format) {}
+
+    CommitEntry64b(const uint64_t ps, const uint64_t cs,
+                   const CommitEntry64bFormat& format) {
+      assert(ps < static_cast<uint64_t>(
+                      (1ull << (format.PREP_BITS + format.INDEX_BITS))));
+      assert(ps <= cs);
+      uint64_t delta = cs - ps + 1;  // make initialized delta always >= 1
+      // zero is reserved for uninitialized entries
+      assert(0 < delta);
+      assert(delta < format.DELTA_UPPERBOUND);
+      if (delta >= format.DELTA_UPPERBOUND) {
+        throw std::runtime_error(
+            "commit_seq >> prepare_seq. The allowed distance is " +
+            std::to_string(format.DELTA_UPPERBOUND) + " commit_seq is " +
+            std::to_string(cs) + " prepare_seq is " + std::to_string(ps));
+      }
+      rep_ = (ps << format.PAD_BITS) & ~format.COMMIT_FILTER;
+      rep_ = rep_ | delta;
+    }
+
+    // Return false if the entry is empty
+    bool Parse(const uint64_t indexed_seq, CommitEntry* entry,
+               const CommitEntry64bFormat& format) {
+      uint64_t delta = rep_ & format.COMMIT_FILTER;
+      // zero is reserved for uninitialized entries
+      assert(delta < static_cast<uint64_t>((1ull << format.COMMIT_BITS)));
+      if (delta == 0) {
+        return false;  // initialized entry would have non-zero delta
+      }
+
+      assert(indexed_seq < static_cast<uint64_t>((1ull << format.INDEX_BITS)));
+      uint64_t prep_up = rep_ & ~format.COMMIT_FILTER;
+      prep_up >>= format.PAD_BITS;
+      const uint64_t& prep_low = indexed_seq;
+      entry->prep_seq = prep_up | prep_low;
+
+      entry->commit_seq = entry->prep_seq + delta - 1;
+      return true;
+    }
+
+   private:
+    uint64_t rep_;
+  };
+
+  // Struct to hold ownership of snapshot and read callback for cleanup.
+  struct IteratorState;
+
+  std::shared_ptr<std::map<uint32_t, const Comparator*>> GetCFComparatorMap() {
+    return cf_map_;
+  }
+  std::shared_ptr<std::map<uint32_t, ColumnFamilyHandle*>> GetCFHandleMap() {
+    return handle_map_;
+  }
+  void UpdateCFComparatorMap(
+      const std::vector<ColumnFamilyHandle*>& handles) override;
+  void UpdateCFComparatorMap(ColumnFamilyHandle* handle) override;
+
+  virtual const Snapshot* GetSnapshot() override;
+  SnapshotImpl* GetSnapshotInternal(bool for_ww_conflict_check);
+
+ protected:
+  virtual Status VerifyCFOptions(
+      const ColumnFamilyOptions& cf_options) override;
+  // Assign the min and max sequence numbers for reading from the db. A seq >
+  // max is not valid, and a seq < min is valid, and a min <= seq < max requires
+  // further checking. Normally max is defined by the snapshot and min is by
+  // minimum uncommitted seq.
+  inline SnapshotBackup AssignMinMaxSeqs(const Snapshot* snapshot,
+                                         SequenceNumber* min,
+                                         SequenceNumber* max);
+  // Validate is a snapshot sequence number is still valid based on the latest
+  // db status. backed_by_snapshot specifies if the number is baked by an actual
+  // snapshot object. order specified the memory order with which we load the
+  // atomic variables: relax is enough for the default since we care about last
+  // value seen by same thread.
+  inline bool ValidateSnapshot(
+      const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot,
+      std::memory_order order = std::memory_order_relaxed);
+  // Get a dummy snapshot that refers to kMaxSequenceNumber
+  Snapshot* GetMaxSnapshot() { return &dummy_max_snapshot_; }
+
+  bool ShouldRollbackWithSingleDelete(ColumnFamilyHandle* column_family,
+                                      const Slice& key) {
+    return rollback_deletion_type_callback_
+               ? rollback_deletion_type_callback_(this, column_family, key)
+               : false;
+  }
+
+  std::function<bool(TransactionDB*, ColumnFamilyHandle*, const Slice&)>
+      rollback_deletion_type_callback_;
+
+ private:
+  friend class AddPreparedCallback;
+  friend class PreparedHeap_BasicsTest_Test;
+  friend class PreparedHeap_Concurrent_Test;
+  friend class PreparedHeap_EmptyAtTheEnd_Test;
+  friend class SnapshotConcurrentAccessTest_SnapshotConcurrentAccess_Test;
+  friend class WritePreparedCommitEntryPreReleaseCallback;
+  friend class WritePreparedTransactionTestBase;
+  friend class WritePreparedTxn;
+  friend class WritePreparedTxnDBMock;
+  friend class WritePreparedTransactionTest_AddPreparedBeforeMax_Test;
+  friend class WritePreparedTransactionTest_AdvanceMaxEvictedSeqBasic_Test;
+  friend class
+      WritePreparedTransactionTest_AdvanceMaxEvictedSeqWithDuplicates_Test;
+  friend class WritePreparedTransactionTest_AdvanceSeqByOne_Test;
+  friend class WritePreparedTransactionTest_BasicRecovery_Test;
+  friend class WritePreparedTransactionTest_CheckAgainstSnapshots_Test;
+  friend class WritePreparedTransactionTest_CleanupSnapshotEqualToMax_Test;
+  friend class WritePreparedTransactionTest_ConflictDetectionAfterRecovery_Test;
+  friend class WritePreparedTransactionTest_CommitMap_Test;
+  friend class WritePreparedTransactionTest_DoubleSnapshot_Test;
+  friend class WritePreparedTransactionTest_IsInSnapshotEmptyMap_Test;
+  friend class WritePreparedTransactionTest_IsInSnapshotReleased_Test;
+  friend class WritePreparedTransactionTest_IsInSnapshot_Test;
+  friend class WritePreparedTransactionTest_NewSnapshotLargerThanMax_Test;
+  friend class WritePreparedTransactionTest_MaxCatchupWithNewSnapshot_Test;
+  friend class WritePreparedTransactionTest_MaxCatchupWithUnbackedSnapshot_Test;
+  friend class
+      WritePreparedTransactionTest_NonAtomicCommitOfDelayedPrepared_Test;
+  friend class
+      WritePreparedTransactionTest_NonAtomicUpdateOfDelayedPrepared_Test;
+  friend class WritePreparedTransactionTest_NonAtomicUpdateOfMaxEvictedSeq_Test;
+  friend class WritePreparedTransactionTest_OldCommitMapGC_Test;
+  friend class WritePreparedTransactionTest_Rollback_Test;
+  friend class WritePreparedTransactionTest_SmallestUnCommittedSeq_Test;
+  friend class WriteUnpreparedTxn;
+  friend class WriteUnpreparedTxnDB;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+  friend class MultiOpsTxnsStressTest;
+
+  void Init(const TransactionDBOptions& txn_db_opts);
+
+  void WPRecordTick(uint32_t ticker_type) const {
+    RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type);
+  }
+
+  // A heap with the amortized O(1) complexity for erase. It uses one extra heap
+  // to keep track of erased entries that are not yet on top of the main heap.
+  class PreparedHeap {
+    // The mutex is required for push and pop from PreparedHeap. ::erase will
+    // use external synchronization via prepared_mutex_.
+    port::Mutex push_pop_mutex_;
+    std::deque<uint64_t> heap_;
+    std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
+        erased_heap_;
+    std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber};
+    // True when testing crash recovery
+    bool TEST_CRASH_ = false;
+    friend class WritePreparedTxnDB;
+
+   public:
+    ~PreparedHeap() {
+      if (!TEST_CRASH_) {
+        assert(heap_.empty());
+        assert(erased_heap_.empty());
+      }
+    }
+    port::Mutex* push_pop_mutex() { return &push_pop_mutex_; }
+
+    inline bool empty() { return top() == kMaxSequenceNumber; }
+    // Returns kMaxSequenceNumber if empty() and the smallest otherwise.
+    inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); }
+    inline void push(uint64_t v) {
+      push_pop_mutex_.AssertHeld();
+      if (heap_.empty()) {
+        heap_top_.store(v, std::memory_order_release);
+      } else {
+        assert(heap_top_.load() < v);
+      }
+      heap_.push_back(v);
+    }
+    void pop(bool locked = false) {
+      if (!locked) {
+        push_pop_mutex()->Lock();
+      }
+      push_pop_mutex_.AssertHeld();
+      heap_.pop_front();
+      while (!heap_.empty() && !erased_heap_.empty() &&
+             // heap_.top() > erased_heap_.top() could happen if we have erased
+             // a non-existent entry. Ideally the user should not do that but we
+             // should be resilient against it.
+             heap_.front() >= erased_heap_.top()) {
+        if (heap_.front() == erased_heap_.top()) {
+          heap_.pop_front();
+        }
+        uint64_t erased __attribute__((__unused__));
+        erased = erased_heap_.top();
+        erased_heap_.pop();
+        // No duplicate prepare sequence numbers
+        assert(erased_heap_.empty() || erased_heap_.top() != erased);
+      }
+      while (heap_.empty() && !erased_heap_.empty()) {
+        erased_heap_.pop();
+      }
+      heap_top_.store(!heap_.empty() ? heap_.front() : kMaxSequenceNumber,
+                      std::memory_order_release);
+      if (!locked) {
+        push_pop_mutex()->Unlock();
+      }
+    }
+    // Concurrrent calls needs external synchronization. It is safe to be called
+    // concurrent to push and pop though.
+    void erase(uint64_t seq) {
+      if (!empty()) {
+        auto top_seq = top();
+        if (seq < top_seq) {
+          // Already popped, ignore it.
+        } else if (top_seq == seq) {
+          pop();
+#ifndef NDEBUG
+          MutexLock ml(push_pop_mutex());
+          assert(heap_.empty() || heap_.front() != seq);
+#endif
+        } else {  // top() > seq
+          // Down the heap, remember to pop it later
+          erased_heap_.push(seq);
+        }
+      }
+    }
+  };
+
+  void TEST_Crash() override { prepared_txns_.TEST_CRASH_ = true; }
+
+  // Get the commit entry with index indexed_seq from the commit table. It
+  // returns true if such entry exists.
+  bool GetCommitEntry(const uint64_t indexed_seq, CommitEntry64b* entry_64b,
+                      CommitEntry* entry) const;
+
+  // Rewrite the entry with the index indexed_seq in the commit table with the
+  // commit entry <prep_seq, commit_seq>. If the rewrite results into eviction,
+  // sets the evicted_entry and returns true.
+  bool AddCommitEntry(const uint64_t indexed_seq, const CommitEntry& new_entry,
+                      CommitEntry* evicted_entry);
+
+  // Rewrite the entry with the index indexed_seq in the commit table with the
+  // commit entry new_entry only if the existing entry matches the
+  // expected_entry. Returns false otherwise.
+  bool ExchangeCommitEntry(const uint64_t indexed_seq,
+                           CommitEntry64b& expected_entry,
+                           const CommitEntry& new_entry);
+
+  // Increase max_evicted_seq_ from the previous value prev_max to the new
+  // value. This also involves taking care of prepared txns that are not
+  // committed before new_max, as well as updating the list of live snapshots at
+  // the time of updating the max. Thread-safety: this function can be called
+  // concurrently. The concurrent invocations of this function is equivalent to
+  // a serial invocation in which the last invocation is the one with the
+  // largest new_max value.
+  void AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
+                            const SequenceNumber& new_max);
+
+  inline SequenceNumber SmallestUnCommittedSeq() {
+    // Note: We have two lists to look into, but for performance reasons they
+    // are not read atomically. Since CheckPreparedAgainstMax copies the entry
+    // to delayed_prepared_ before removing it from prepared_txns_, to ensure
+    // that a prepared entry will not go unmissed, we look into them in opposite
+    // order: first read prepared_txns_ and then delayed_prepared_.
+
+    // This must be called before calling ::top. This is because the concurrent
+    // thread would call ::RemovePrepared before updating
+    // GetLatestSequenceNumber(). Reading then in opposite order here guarantees
+    // that the ::top that we read would be lower the ::top if we had otherwise
+    // update/read them atomically.
+    auto next_prepare = db_impl_->GetLatestSequenceNumber() + 1;
+    auto min_prepare = prepared_txns_.top();
+    // Since we update the prepare_heap always from the main write queue via
+    // PreReleaseCallback, the prepared_txns_.top() indicates the smallest
+    // prepared data in 2pc transactions. For non-2pc transactions that are
+    // written in two steps, we also update prepared_txns_ at the first step
+    // (via the same mechanism) so that their uncommitted data is reflected in
+    // SmallestUnCommittedSeq.
+    if (!delayed_prepared_empty_.load()) {
+      ReadLock rl(&prepared_mutex_);
+      if (!delayed_prepared_.empty()) {
+        return *delayed_prepared_.begin();
+      }
+    }
+    bool empty = min_prepare == kMaxSequenceNumber;
+    if (empty) {
+      // Since GetLatestSequenceNumber is updated
+      // after prepared_txns_ are, the value of GetLatestSequenceNumber would
+      // reflect any uncommitted data that is not added to prepared_txns_ yet.
+      // Otherwise, if there is no concurrent txn, this value simply reflects
+      // that latest value in the memtable.
+      return next_prepare;
+    } else {
+      return std::min(min_prepare, next_prepare);
+    }
+  }
+
+  // Enhance the snapshot object by recording in it the smallest uncommitted seq
+  inline void EnhanceSnapshot(SnapshotImpl* snapshot,
+                              SequenceNumber min_uncommitted) {
+    assert(snapshot);
+    assert(min_uncommitted <= snapshot->number_ + 1);
+    snapshot->min_uncommitted_ = min_uncommitted;
+  }
+
+  virtual const std::vector<SequenceNumber> GetSnapshotListFromDB(
+      SequenceNumber max);
+
+  // Will be called by the public ReleaseSnapshot method. Does the maintenance
+  // internal to WritePreparedTxnDB
+  void ReleaseSnapshotInternal(const SequenceNumber snap_seq);
+
+  // Update the list of snapshots corresponding to the soon-to-be-updated
+  // max_evicted_seq_. Thread-safety: this function can be called concurrently.
+  // The concurrent invocations of this function is equivalent to a serial
+  // invocation in which the last invocation is the one with the largest
+  // version value.
+  void UpdateSnapshots(const std::vector<SequenceNumber>& snapshots,
+                       const SequenceNumber& version);
+  // Check the new list of new snapshots against the old one to see  if any of
+  // the snapshots are released and to do the cleanup for the released snapshot.
+  void CleanupReleasedSnapshots(
+      const std::vector<SequenceNumber>& new_snapshots,
+      const std::vector<SequenceNumber>& old_snapshots);
+
+  // Check an evicted entry against live snapshots to see if it should be kept
+  // around or it can be safely discarded (and hence assume committed for all
+  // snapshots). Thread-safety: this function can be called concurrently. If it
+  // is called concurrently with multiple UpdateSnapshots, the result is the
+  // same as checking the intersection of the snapshot list before updates with
+  // the snapshot list of all the concurrent updates.
+  void CheckAgainstSnapshots(const CommitEntry& evicted);
+
+  // Add a new entry to old_commit_map_ if prep_seq <= snapshot_seq <
+  // commit_seq. Return false if checking the next snapshot(s) is not needed.
+  // This is the case if none of the next snapshots could satisfy the condition.
+  // next_is_larger: the next snapshot will be a larger value
+  bool MaybeUpdateOldCommitMap(const uint64_t& prep_seq,
+                               const uint64_t& commit_seq,
+                               const uint64_t& snapshot_seq,
+                               const bool next_is_larger);
+
+  // A trick to increase the last visible sequence number by one and also wait
+  // for the in-flight commits to be visible.
+  void AdvanceSeqByOne();
+
+  // The list of live snapshots at the last time that max_evicted_seq_ advanced.
+  // The list stored into two data structures: in snapshot_cache_ that is
+  // efficient for concurrent reads, and in snapshots_ if the data does not fit
+  // into snapshot_cache_. The total number of snapshots in the two lists
+  std::atomic<size_t> snapshots_total_ = {};
+  // The list sorted in ascending order. Thread-safety for writes is provided
+  // with snapshots_mutex_ and concurrent reads are safe due to std::atomic for
+  // each entry. In x86_64 architecture such reads are compiled to simple read
+  // instructions.
+  const size_t SNAPSHOT_CACHE_BITS;
+  const size_t SNAPSHOT_CACHE_SIZE;
+  std::unique_ptr<std::atomic<SequenceNumber>[]> snapshot_cache_;
+  // 2nd list for storing snapshots. The list sorted in ascending order.
+  // Thread-safety is provided with snapshots_mutex_.
+  std::vector<SequenceNumber> snapshots_;
+  // The list of all snapshots: snapshots_ + snapshot_cache_. This list although
+  // redundant but simplifies CleanupOldSnapshots implementation.
+  // Thread-safety is provided with snapshots_mutex_.
+  std::vector<SequenceNumber> snapshots_all_;
+  // The version of the latest list of snapshots. This can be used to avoid
+  // rewriting a list that is concurrently updated with a more recent version.
+  SequenceNumber snapshots_version_ = 0;
+
+  // A heap of prepared transactions. Thread-safety is provided with
+  // prepared_mutex_.
+  PreparedHeap prepared_txns_;
+  const size_t COMMIT_CACHE_BITS;
+  const size_t COMMIT_CACHE_SIZE;
+  const CommitEntry64bFormat FORMAT;
+  // commit_cache_ must be initialized to zero to tell apart an empty index from
+  // a filled one. Thread-safety is provided with commit_cache_mutex_.
+  std::unique_ptr<std::atomic<CommitEntry64b>[]> commit_cache_;
+  // The largest evicted *commit* sequence number from the commit_cache_. If a
+  // seq is smaller than max_evicted_seq_ is might or might not be present in
+  // commit_cache_. So commit_cache_ must first be checked before consulting
+  // with max_evicted_seq_.
+  std::atomic<uint64_t> max_evicted_seq_ = {};
+  // Order: 1) update future_max_evicted_seq_ = new_max, 2)
+  // GetSnapshotListFromDB(new_max), max_evicted_seq_ = new_max. Since
+  // GetSnapshotInternal guarantess that the snapshot seq is larger than
+  // future_max_evicted_seq_, this guarantes that if a snapshot is not larger
+  // than max has already being looked at via a GetSnapshotListFromDB(new_max).
+  std::atomic<uint64_t> future_max_evicted_seq_ = {};
+  // Advance max_evicted_seq_ by this value each time it needs an update. The
+  // larger the value, the less frequent advances we would have. We do not want
+  // it to be too large either as it would cause stalls by doing too much
+  // maintenance work under the lock.
+  size_t INC_STEP_FOR_MAX_EVICTED = 1;
+  // A map from old snapshots (expected to be used by a few read-only txns) to
+  // prepared sequence number of the evicted entries from commit_cache_ that
+  // overlaps with such snapshot. These are the prepared sequence numbers that
+  // the snapshot, to which they are mapped, cannot assume to be committed just
+  // because it is no longer in the commit_cache_. The vector must be sorted
+  // after each update.
+  // Thread-safety is provided with old_commit_map_mutex_.
+  std::map<SequenceNumber, std::vector<SequenceNumber>> old_commit_map_;
+  // A set of long-running prepared transactions that are not finished by the
+  // time max_evicted_seq_ advances their sequence number. This is expected to
+  // be empty normally. Thread-safety is provided with prepared_mutex_.
+  std::set<uint64_t> delayed_prepared_;
+  // Commit of a delayed prepared: 1) update commit cache, 2) update
+  // delayed_prepared_commits_, 3) publish seq, 3) clean up delayed_prepared_.
+  // delayed_prepared_commits_ will help us tell apart the unprepared txns from
+  // the ones that are committed but not cleaned up yet.
+  std::unordered_map<SequenceNumber, SequenceNumber> delayed_prepared_commits_;
+  // Update when delayed_prepared_.empty() changes. Expected to be true
+  // normally.
+  std::atomic<bool> delayed_prepared_empty_ = {true};
+  // Update when old_commit_map_.empty() changes. Expected to be true normally.
+  std::atomic<bool> old_commit_map_empty_ = {true};
+  mutable port::RWMutex prepared_mutex_;
+  mutable port::RWMutex old_commit_map_mutex_;
+  mutable port::RWMutex commit_cache_mutex_;
+  mutable port::RWMutex snapshots_mutex_;
+  // A cache of the cf comparators
+  // Thread safety: since it is a const it is safe to read it concurrently
+  std::shared_ptr<std::map<uint32_t, const Comparator*>> cf_map_;
+  // A cache of the cf handles
+  // Thread safety: since the handle is read-only object it is a const it is
+  // safe to read it concurrently
+  std::shared_ptr<std::map<uint32_t, ColumnFamilyHandle*>> handle_map_;
+  // A dummy snapshot object that refers to kMaxSequenceNumber
+  SnapshotImpl dummy_max_snapshot_;
+};
+
+class WritePreparedTxnReadCallback : public ReadCallback {
+ public:
+  WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot)
+      : ReadCallback(snapshot),
+        db_(db),
+        backed_by_snapshot_(kBackedByDBSnapshot) {}
+  WritePreparedTxnReadCallback(WritePreparedTxnDB* db, SequenceNumber snapshot,
+                               SequenceNumber min_uncommitted,
+                               SnapshotBackup backed_by_snapshot)
+      : ReadCallback(snapshot, min_uncommitted),
+        db_(db),
+        backed_by_snapshot_(backed_by_snapshot) {
+    (void)backed_by_snapshot_;  // to silence unused private field warning
+  }
+
+  virtual ~WritePreparedTxnReadCallback() {
+    // If it is not backed by snapshot, the caller must check validity
+    assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
+  }
+
+  // Will be called to see if the seq number visible; if not it moves on to
+  // the next seq number.
+  inline virtual bool IsVisibleFullCheck(SequenceNumber seq) override {
+    auto snapshot = max_visible_seq_;
+    bool snap_released = false;
+    auto ret =
+        db_->IsInSnapshot(seq, snapshot, min_uncommitted_, &snap_released);
+    assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot);
+    snap_released_ |= snap_released;
+    return ret;
+  }
+
+  inline bool valid() {
+    valid_checked_ = true;
+    return snap_released_ == false;
+  }
+
+  // TODO(myabandeh): override Refresh when Iterator::Refresh is supported
+ private:
+  WritePreparedTxnDB* db_;
+  // Whether max_visible_seq_ is backed by a snapshot
+  const SnapshotBackup backed_by_snapshot_;
+  bool snap_released_ = false;
+  // Safety check to ensure that the caller has checked invalid statuses
+  bool valid_checked_ = false;
+};
+
+class AddPreparedCallback : public PreReleaseCallback {
+ public:
+  AddPreparedCallback(WritePreparedTxnDB* db, DBImpl* db_impl,
+                      size_t sub_batch_cnt, bool two_write_queues,
+                      bool first_prepare_batch)
+      : db_(db),
+        db_impl_(db_impl),
+        sub_batch_cnt_(sub_batch_cnt),
+        two_write_queues_(two_write_queues),
+        first_prepare_batch_(first_prepare_batch) {
+    (void)two_write_queues_;  // to silence unused private field warning
+  }
+  virtual Status Callback(SequenceNumber prepare_seq,
+                          bool is_mem_disabled __attribute__((__unused__)),
+                          uint64_t log_number, size_t index,
+                          size_t total) override {
+    assert(index < total);
+    // To reduce the cost of lock acquisition competing with the concurrent
+    // prepare requests, lock on the first callback and unlock on the last.
+    const bool do_lock = !two_write_queues_ || index == 0;
+    const bool do_unlock = !two_write_queues_ || index + 1 == total;
+    // Always Prepare from the main queue
+    assert(!two_write_queues_ || !is_mem_disabled);  // implies the 1st queue
+    TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:pause");
+    TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::begin:resume");
+    if (do_lock) {
+      db_->prepared_txns_.push_pop_mutex()->Lock();
+    }
+    const bool kLocked = true;
+    for (size_t i = 0; i < sub_batch_cnt_; i++) {
+      db_->AddPrepared(prepare_seq + i, kLocked);
+    }
+    if (do_unlock) {
+      db_->prepared_txns_.push_pop_mutex()->Unlock();
+    }
+    TEST_SYNC_POINT("AddPreparedCallback::AddPrepared::end");
+    if (first_prepare_batch_) {
+      assert(log_number != 0);
+      db_impl_->logs_with_prep_tracker()->MarkLogAsContainingPrepSection(
+          log_number);
+    }
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  size_t sub_batch_cnt_;
+  bool two_write_queues_;
+  // It is 2PC and this is the first prepare batch. Always the case in 2PC
+  // unless it is WriteUnPrepared.
+  bool first_prepare_batch_;
+};
+
+class WritePreparedCommitEntryPreReleaseCallback : public PreReleaseCallback {
+ public:
+  // includes_data indicates that the commit also writes non-empty
+  // CommitTimeWriteBatch to memtable, which needs to be committed separately.
+  WritePreparedCommitEntryPreReleaseCallback(
+      WritePreparedTxnDB* db, DBImpl* db_impl, SequenceNumber prep_seq,
+      size_t prep_batch_cnt, size_t data_batch_cnt = 0,
+      SequenceNumber aux_seq = kMaxSequenceNumber, size_t aux_batch_cnt = 0)
+      : db_(db),
+        db_impl_(db_impl),
+        prep_seq_(prep_seq),
+        prep_batch_cnt_(prep_batch_cnt),
+        data_batch_cnt_(data_batch_cnt),
+        includes_data_(data_batch_cnt_ > 0),
+        aux_seq_(aux_seq),
+        aux_batch_cnt_(aux_batch_cnt),
+        includes_aux_batch_(aux_batch_cnt > 0) {
+    assert((prep_batch_cnt_ > 0) != (prep_seq == kMaxSequenceNumber));  // xor
+    assert(prep_batch_cnt_ > 0 || data_batch_cnt_ > 0);
+    assert((aux_batch_cnt_ > 0) != (aux_seq == kMaxSequenceNumber));  // xor
+  }
+
+  virtual Status Callback(SequenceNumber commit_seq,
+                          bool is_mem_disabled __attribute__((__unused__)),
+                          uint64_t, size_t /*index*/,
+                          size_t /*total*/) override {
+    // Always commit from the 2nd queue
+    assert(!db_impl_->immutable_db_options().two_write_queues ||
+           is_mem_disabled);
+    assert(includes_data_ || prep_seq_ != kMaxSequenceNumber);
+    // Data batch is what accompanied with the commit marker and affects the
+    // last seq in the commit batch.
+    const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1)
+                                         ? commit_seq
+                                         : commit_seq + data_batch_cnt_ - 1;
+    if (prep_seq_ != kMaxSequenceNumber) {
+      for (size_t i = 0; i < prep_batch_cnt_; i++) {
+        db_->AddCommitted(prep_seq_ + i, last_commit_seq);
+      }
+    }  // else there was no prepare phase
+    if (includes_aux_batch_) {
+      for (size_t i = 0; i < aux_batch_cnt_; i++) {
+        db_->AddCommitted(aux_seq_ + i, last_commit_seq);
+      }
+    }
+    if (includes_data_) {
+      assert(data_batch_cnt_);
+      // Commit the data that is accompanied with the commit request
+      for (size_t i = 0; i < data_batch_cnt_; i++) {
+        // For commit seq of each batch use the commit seq of the last batch.
+        // This would make debugging easier by having all the batches having
+        // the same sequence number.
+        db_->AddCommitted(commit_seq + i, last_commit_seq);
+      }
+    }
+    if (db_impl_->immutable_db_options().two_write_queues) {
+      assert(is_mem_disabled);  // implies the 2nd queue
+      // Publish the sequence number. We can do that here assuming the callback
+      // is invoked only from one write queue, which would guarantee that the
+      // publish sequence numbers will be in order, i.e., once a seq is
+      // published all the seq prior to that are also publishable.
+      db_impl_->SetLastPublishedSequence(last_commit_seq);
+      // Note RemovePrepared should be called after publishing the seq.
+      // Otherwise SmallestUnCommittedSeq optimization breaks.
+      if (prep_seq_ != kMaxSequenceNumber) {
+        db_->RemovePrepared(prep_seq_, prep_batch_cnt_);
+      }  // else there was no prepare phase
+      if (includes_aux_batch_) {
+        db_->RemovePrepared(aux_seq_, aux_batch_cnt_);
+      }
+    }
+    // else SequenceNumber that is updated as part of the write already does the
+    // publishing
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  // kMaxSequenceNumber if there was no prepare phase
+  SequenceNumber prep_seq_;
+  size_t prep_batch_cnt_;
+  size_t data_batch_cnt_;
+  // Data here is the batch that is written with the commit marker, either
+  // because it is commit without prepare or commit has a CommitTimeWriteBatch.
+  bool includes_data_;
+  // Auxiliary batch (if there is any) is a batch that is written before, but
+  // gets the same commit seq as prepare batch or data batch. This is used in
+  // two write queues where the CommitTimeWriteBatch becomes the aux batch and
+  // we do a separate write to actually commit everything.
+  SequenceNumber aux_seq_;
+  size_t aux_batch_cnt_;
+  bool includes_aux_batch_;
+};
+
+// For two_write_queues commit both the aborted batch and the cleanup batch and
+// then published the seq
+class WritePreparedRollbackPreReleaseCallback : public PreReleaseCallback {
+ public:
+  WritePreparedRollbackPreReleaseCallback(WritePreparedTxnDB* db,
+                                          DBImpl* db_impl,
+                                          SequenceNumber prep_seq,
+                                          SequenceNumber rollback_seq,
+                                          size_t prep_batch_cnt)
+      : db_(db),
+        db_impl_(db_impl),
+        prep_seq_(prep_seq),
+        rollback_seq_(rollback_seq),
+        prep_batch_cnt_(prep_batch_cnt) {
+    assert(prep_seq != kMaxSequenceNumber);
+    assert(rollback_seq != kMaxSequenceNumber);
+    assert(prep_batch_cnt_ > 0);
+  }
+
+  Status Callback(SequenceNumber commit_seq, bool is_mem_disabled, uint64_t,
+                  size_t /*index*/, size_t /*total*/) override {
+    // Always commit from the 2nd queue
+    assert(is_mem_disabled);  // implies the 2nd queue
+    assert(db_impl_->immutable_db_options().two_write_queues);
+#ifdef NDEBUG
+    (void)is_mem_disabled;
+#endif
+    const uint64_t last_commit_seq = commit_seq;
+    db_->AddCommitted(rollback_seq_, last_commit_seq);
+    for (size_t i = 0; i < prep_batch_cnt_; i++) {
+      db_->AddCommitted(prep_seq_ + i, last_commit_seq);
+    }
+    db_impl_->SetLastPublishedSequence(last_commit_seq);
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  SequenceNumber prep_seq_;
+  SequenceNumber rollback_seq_;
+  size_t prep_batch_cnt_;
+};
+
+// Count the number of sub-batches inside a batch. A sub-batch does not have
+// duplicate keys.
+struct SubBatchCounter : public WriteBatch::Handler {
+  explicit SubBatchCounter(std::map<uint32_t, const Comparator*>& comparators)
+      : comparators_(comparators), batches_(1) {}
+  std::map<uint32_t, const Comparator*>& comparators_;
+  using CFKeys = std::set<Slice, SetComparator>;
+  std::map<uint32_t, CFKeys> keys_;
+  size_t batches_;
+  size_t BatchCount() { return batches_; }
+  void AddKey(const uint32_t cf, const Slice& key);
+  void InitWithComp(const uint32_t cf);
+  Status MarkNoop(bool) override { return Status::OK(); }
+  Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+  Status MarkCommit(const Slice&) override { return Status::OK(); }
+  Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status DeleteCF(uint32_t cf, const Slice& key) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+    AddKey(cf, key);
+    return Status::OK();
+  }
+  Status MarkBeginPrepare(bool) override { return Status::OK(); }
+  Status MarkRollback(const Slice&) override { return Status::OK(); }
+  Handler::OptionState WriteAfterCommit() const override {
+    return Handler::OptionState::kDisabled;
+  }
+};
+
+SnapshotBackup WritePreparedTxnDB::AssignMinMaxSeqs(const Snapshot* snapshot,
+                                                    SequenceNumber* min,
+                                                    SequenceNumber* max) {
+  if (snapshot != nullptr) {
+    *min =
+        static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+    *max = static_cast_with_check<const SnapshotImpl>(snapshot)->number_;
+    // A duplicate of the check in EnhanceSnapshot().
+    assert(*min <= *max + 1);
+    return kBackedByDBSnapshot;
+  } else {
+    *min = SmallestUnCommittedSeq();
+    *max = 0;  // to be assigned later after sv is referenced.
+    return kUnbackedByDBSnapshot;
+  }
+}
+
+bool WritePreparedTxnDB::ValidateSnapshot(
+    const SequenceNumber snap_seq, const SnapshotBackup backed_by_snapshot,
+    std::memory_order order) {
+  if (backed_by_snapshot == kBackedByDBSnapshot) {
+    return true;
+  } else {
+    SequenceNumber max = max_evicted_seq_.load(order);
+    // Validate that max has not advanced the snapshot seq that is not backed
+    // by a real snapshot. This is a very rare case that should not happen in
+    // real workloads.
+    if (UNLIKELY(snap_seq <= max && snap_seq != 0)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc b/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc
new file mode 100644
index 000000000..6c8c62e0e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc
@@ -0,0 +1,790 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_test.h"
+#include "utilities/transactions/write_unprepared_txn.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTransactionTestBase : public TransactionTestBase {
+ public:
+  WriteUnpreparedTransactionTestBase(bool use_stackable_db,
+                                     bool two_write_queue,
+                                     TxnDBWritePolicy write_policy)
+      : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
+                            kOrderedWrite) {}
+};
+
+class WriteUnpreparedTransactionTest
+    : public WriteUnpreparedTransactionTestBase,
+      virtual public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy>> {
+ public:
+  WriteUnpreparedTransactionTest()
+      : WriteUnpreparedTransactionTestBase(std::get<0>(GetParam()),
+                                           std::get<1>(GetParam()),
+                                           std::get<2>(GetParam())) {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+    WriteUnpreparedTransactionTest, WriteUnpreparedTransactionTest,
+    ::testing::Values(std::make_tuple(false, false, WRITE_UNPREPARED),
+                      std::make_tuple(false, true, WRITE_UNPREPARED)));
+
+enum StressAction { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT };
+class WriteUnpreparedStressTest : public WriteUnpreparedTransactionTestBase,
+                                  virtual public ::testing::WithParamInterface<
+                                      std::tuple<bool, StressAction>> {
+ public:
+  WriteUnpreparedStressTest()
+      : WriteUnpreparedTransactionTestBase(false, std::get<0>(GetParam()),
+                                           WRITE_UNPREPARED),
+        action_(std::get<1>(GetParam())) {}
+  StressAction action_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    WriteUnpreparedStressTest, WriteUnpreparedStressTest,
+    ::testing::Values(std::make_tuple(false, NO_SNAPSHOT),
+                      std::make_tuple(false, RO_SNAPSHOT),
+                      std::make_tuple(false, REFRESH_SNAPSHOT),
+                      std::make_tuple(true, NO_SNAPSHOT),
+                      std::make_tuple(true, RO_SNAPSHOT),
+                      std::make_tuple(true, REFRESH_SNAPSHOT)));
+
+TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) {
+  // The following tests checks whether reading your own write for
+  // a transaction works for write unprepared, when there are uncommitted
+  // values written into DB.
+  auto verify_state = [](Iterator* iter, const std::string& key,
+                         const std::string& value) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key, iter->key().ToString());
+    ASSERT_EQ(value, iter->value().ToString());
+  };
+
+  // Test always reseeking vs never reseeking.
+  for (uint64_t max_skip : {0, std::numeric_limits<int>::max()}) {
+    options.max_sequential_skip_in_iterations = max_skip;
+    options.disable_auto_compactions = true;
+    ASSERT_OK(ReOpen());
+
+    TransactionOptions txn_options;
+    WriteOptions woptions;
+    ReadOptions roptions;
+
+    ASSERT_OK(db->Put(woptions, "a", ""));
+    ASSERT_OK(db->Put(woptions, "b", ""));
+
+    Transaction* txn = db->BeginTransaction(woptions, txn_options);
+    WriteUnpreparedTxn* wup_txn = dynamic_cast<WriteUnpreparedTxn*>(txn);
+    txn->SetSnapshot();
+
+    for (int i = 0; i < 5; i++) {
+      std::string stored_value = "v" + std::to_string(i);
+      ASSERT_OK(txn->Put("a", stored_value));
+      ASSERT_OK(txn->Put("b", stored_value));
+      ASSERT_OK(wup_txn->FlushWriteBatchToDB(false));
+
+      // Test Get()
+      std::string value;
+      ASSERT_OK(txn->Get(roptions, "a", &value));
+      ASSERT_EQ(value, stored_value);
+      ASSERT_OK(txn->Get(roptions, "b", &value));
+      ASSERT_EQ(value, stored_value);
+
+      // Test Next()
+      auto iter = txn->GetIterator(roptions);
+      iter->Seek("a");
+      verify_state(iter, "a", stored_value);
+
+      iter->Next();
+      verify_state(iter, "b", stored_value);
+
+      iter->SeekToFirst();
+      verify_state(iter, "a", stored_value);
+
+      iter->Next();
+      verify_state(iter, "b", stored_value);
+
+      delete iter;
+
+      // Test Prev()
+      iter = txn->GetIterator(roptions);
+      iter->SeekForPrev("b");
+      verify_state(iter, "b", stored_value);
+
+      iter->Prev();
+      verify_state(iter, "a", stored_value);
+
+      iter->SeekToLast();
+      verify_state(iter, "b", stored_value);
+
+      iter->Prev();
+      verify_state(iter, "a", stored_value);
+
+      delete iter;
+    }
+
+    delete txn;
+  }
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(WriteUnpreparedStressTest, ReadYourOwnWriteStress) {
+  // This is a stress test where different threads are writing random keys, and
+  // then before committing or aborting the transaction, it validates to see
+  // that it can read the keys it wrote, and the keys it did not write respect
+  // the snapshot. To avoid row lock contention (and simply stressing the
+  // locking system), each thread is mostly only writing to its own set of keys.
+  const uint32_t kNumIter = 1000;
+  const uint32_t kNumThreads = 10;
+  const uint32_t kNumKeys = 5;
+
+  // Test with
+  // 1. no snapshots set
+  // 2. snapshot set on ReadOptions
+  // 3. snapshot set, and refreshing after every write.
+  StressAction a = action_;
+  WriteOptions write_options;
+  txn_db_options.transaction_lock_timeout = -1;
+  options.disable_auto_compactions = true;
+  ASSERT_OK(ReOpen());
+
+  std::vector<std::string> keys;
+  for (uint32_t k = 0; k < kNumKeys * kNumThreads; k++) {
+    keys.push_back("k" + std::to_string(k));
+  }
+  RandomShuffle(keys.begin(), keys.end());
+
+  // This counter will act as a "sequence number" to help us validate
+  // visibility logic with snapshots. If we had direct access to the seqno of
+  // snapshots and key/values, then we should directly compare those instead.
+  std::atomic<int64_t> counter(0);
+
+  std::function<void(uint32_t)> stress_thread = [&](int id) {
+    size_t tid = std::hash<std::thread::id>()(std::this_thread::get_id());
+    Random64 rnd(static_cast<uint32_t>(tid));
+
+    Transaction* txn;
+    TransactionOptions txn_options;
+    // batch_size of 1 causes writes to DB for every marker.
+    txn_options.write_batch_flush_threshold = 1;
+    ReadOptions read_options;
+
+    for (uint32_t i = 0; i < kNumIter; i++) {
+      std::set<std::string> owned_keys(keys.begin() + id * kNumKeys,
+                                       keys.begin() + (id + 1) * kNumKeys);
+      // Add unowned keys to make the workload more interesting, but this
+      // increases row lock contention, so just do it sometimes.
+      if (rnd.OneIn(2)) {
+        owned_keys.insert(keys[rnd.Uniform(kNumKeys * kNumThreads)]);
+      }
+
+      txn = db->BeginTransaction(write_options, txn_options);
+      ASSERT_OK(txn->SetName(std::to_string(id)));
+      txn->SetSnapshot();
+      if (a >= RO_SNAPSHOT) {
+        read_options.snapshot = txn->GetSnapshot();
+        ASSERT_TRUE(read_options.snapshot != nullptr);
+      }
+
+      uint64_t buf[2];
+      buf[0] = id;
+
+      // When scanning through the database, make sure that all unprepared
+      // keys have value >= snapshot and all other keys have value < snapshot.
+      int64_t snapshot_num = counter.fetch_add(1);
+
+      Status s;
+      for (const auto& key : owned_keys) {
+        buf[1] = counter.fetch_add(1);
+        s = txn->Put(key, Slice((const char*)buf, sizeof(buf)));
+        if (!s.ok()) {
+          break;
+        }
+        if (a == REFRESH_SNAPSHOT) {
+          txn->SetSnapshot();
+          read_options.snapshot = txn->GetSnapshot();
+          snapshot_num = counter.fetch_add(1);
+        }
+      }
+
+      // Failure is possible due to snapshot validation. In this case,
+      // rollback and move onto next iteration.
+      if (!s.ok()) {
+        ASSERT_TRUE(s.IsBusy());
+        ASSERT_OK(txn->Rollback());
+        delete txn;
+        continue;
+      }
+
+      auto verify_key = [&owned_keys, &a, &id, &snapshot_num](
+                            const std::string& key, const std::string& value) {
+        if (owned_keys.count(key) > 0) {
+          ASSERT_EQ(value.size(), 16);
+
+          // Since this key is part of owned_keys, then this key must be
+          // unprepared by this transaction identified by 'id'
+          ASSERT_EQ(((int64_t*)value.c_str())[0], id);
+          if (a == REFRESH_SNAPSHOT) {
+            // If refresh snapshot is true, then the snapshot is refreshed
+            // after every Put(), meaning that the current snapshot in
+            // snapshot_num must be greater than the "seqno" of any keys
+            // written by the current transaction.
+            ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num);
+          } else {
+            // If refresh snapshot is not on, then the snapshot was taken at
+            // the beginning of the transaction, meaning all writes must come
+            // after snapshot_num
+            ASSERT_GT(((int64_t*)value.c_str())[1], snapshot_num);
+          }
+        } else if (a >= RO_SNAPSHOT) {
+          // If this is not an unprepared key, just assert that the key
+          // "seqno" is smaller than the snapshot seqno.
+          ASSERT_EQ(value.size(), 16);
+          ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num);
+        }
+      };
+
+      // Validate Get()/Next()/Prev(). Do only one of them to save time, and
+      // reduce lock contention.
+      switch (rnd.Uniform(3)) {
+        case 0:  // Validate Get()
+        {
+          for (const auto& key : keys) {
+            std::string value;
+            s = txn->Get(read_options, Slice(key), &value);
+            if (!s.ok()) {
+              ASSERT_TRUE(s.IsNotFound());
+              ASSERT_EQ(owned_keys.count(key), 0);
+            } else {
+              verify_key(key, value);
+            }
+          }
+          break;
+        }
+        case 1:  // Validate Next()
+        {
+          Iterator* iter = txn->GetIterator(read_options);
+          ASSERT_OK(iter->status());
+          for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+            verify_key(iter->key().ToString(), iter->value().ToString());
+          }
+          ASSERT_OK(iter->status());
+          delete iter;
+          break;
+        }
+        case 2:  // Validate Prev()
+        {
+          Iterator* iter = txn->GetIterator(read_options);
+          ASSERT_OK(iter->status());
+          for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+            verify_key(iter->key().ToString(), iter->value().ToString());
+          }
+          ASSERT_OK(iter->status());
+          delete iter;
+          break;
+        }
+        default:
+          FAIL();
+      }
+
+      if (rnd.OneIn(2)) {
+        ASSERT_OK(txn->Commit());
+      } else {
+        ASSERT_OK(txn->Rollback());
+      }
+      delete txn;
+    }
+  };
+
+  std::vector<port::Thread> threads;
+  for (uint32_t i = 0; i < kNumThreads; i++) {
+    threads.emplace_back(stress_thread, i);
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+#endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+// This tests how write unprepared behaves during recovery when the DB crashes
+// after a transaction has either been unprepared or prepared, and tests if
+// the changes are correctly applied for prepared transactions if we decide to
+// rollback/commit.
+TEST_P(WriteUnpreparedTransactionTest, RecoveryTest) {
+  WriteOptions write_options;
+  write_options.disableWAL = false;
+  TransactionOptions txn_options;
+  std::vector<Transaction*> prepared_trans;
+  WriteUnpreparedTxnDB* wup_db;
+  options.disable_auto_compactions = true;
+
+  enum Action { UNPREPARED, ROLLBACK, COMMIT };
+
+  // batch_size of 1 causes writes to DB for every marker.
+  for (size_t batch_size : {1, 1000000}) {
+    txn_options.write_batch_flush_threshold = batch_size;
+    for (bool empty : {true, false}) {
+      for (Action a : {UNPREPARED, ROLLBACK, COMMIT}) {
+        for (int num_batches = 1; num_batches < 10; num_batches++) {
+          // Reset database.
+          prepared_trans.clear();
+          ASSERT_OK(ReOpen());
+          wup_db = dynamic_cast<WriteUnpreparedTxnDB*>(db);
+          if (!empty) {
+            for (int i = 0; i < num_batches; i++) {
+              ASSERT_OK(db->Put(WriteOptions(), "k" + std::to_string(i),
+                                "before value" + std::to_string(i)));
+            }
+          }
+
+          // Write num_batches unprepared batches.
+          Transaction* txn = db->BeginTransaction(write_options, txn_options);
+          WriteUnpreparedTxn* wup_txn = dynamic_cast<WriteUnpreparedTxn*>(txn);
+          ASSERT_OK(txn->SetName("xid"));
+          for (int i = 0; i < num_batches; i++) {
+            ASSERT_OK(
+                txn->Put("k" + std::to_string(i), "value" + std::to_string(i)));
+            if (txn_options.write_batch_flush_threshold == 1) {
+              // WriteUnprepared will check write_batch_flush_threshold and
+              // possibly flush before appending to the write batch. No flush
+              // will happen at the first write because the batch is still
+              // empty, so after k puts, there should be k-1 flushed batches.
+              ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i);
+            } else {
+              ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0);
+            }
+          }
+          if (a == UNPREPARED) {
+            // This is done to prevent the destructor from rolling back the
+            // transaction for us, since we want to pretend we crashed and
+            // test that recovery does the rollback.
+            wup_txn->unprep_seqs_.clear();
+          } else {
+            ASSERT_OK(txn->Prepare());
+          }
+          delete txn;
+
+          // Crash and run recovery code paths.
+          ASSERT_OK(wup_db->db_impl_->FlushWAL(true));
+          wup_db->TEST_Crash();
+          ASSERT_OK(ReOpenNoDelete());
+          assert(db != nullptr);
+
+          db->GetAllPreparedTransactions(&prepared_trans);
+          ASSERT_EQ(prepared_trans.size(), a == UNPREPARED ? 0 : 1);
+          if (a == ROLLBACK) {
+            ASSERT_OK(prepared_trans[0]->Rollback());
+            delete prepared_trans[0];
+          } else if (a == COMMIT) {
+            ASSERT_OK(prepared_trans[0]->Commit());
+            delete prepared_trans[0];
+          }
+
+          Iterator* iter = db->NewIterator(ReadOptions());
+          ASSERT_OK(iter->status());
+          iter->SeekToFirst();
+          // Check that DB has before values.
+          if (!empty || a == COMMIT) {
+            for (int i = 0; i < num_batches; i++) {
+              ASSERT_TRUE(iter->Valid());
+              ASSERT_EQ(iter->key().ToString(), "k" + std::to_string(i));
+              if (a == COMMIT) {
+                ASSERT_EQ(iter->value().ToString(),
+                          "value" + std::to_string(i));
+              } else {
+                ASSERT_EQ(iter->value().ToString(),
+                          "before value" + std::to_string(i));
+              }
+              iter->Next();
+            }
+          }
+          ASSERT_FALSE(iter->Valid());
+          ASSERT_OK(iter->status());
+          delete iter;
+        }
+      }
+    }
+  }
+}
+
+// Basic test to see that unprepared batch gets written to DB when batch size
+// is exceeded. It also does some basic checks to see if commit/rollback works
+// as expected for write unprepared.
+TEST_P(WriteUnpreparedTransactionTest, UnpreparedBatch) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  const int kNumKeys = 10;
+
+  // batch_size of 1 causes writes to DB for every marker.
+  for (size_t batch_size : {1, 1000000}) {
+    txn_options.write_batch_flush_threshold = batch_size;
+    for (bool prepare : {false, true}) {
+      for (bool commit : {false, true}) {
+        ASSERT_OK(ReOpen());
+        Transaction* txn = db->BeginTransaction(write_options, txn_options);
+        WriteUnpreparedTxn* wup_txn = dynamic_cast<WriteUnpreparedTxn*>(txn);
+        ASSERT_OK(txn->SetName("xid"));
+
+        for (int i = 0; i < kNumKeys; i++) {
+          ASSERT_OK(txn->Put("k" + std::to_string(i), "v" + std::to_string(i)));
+          if (txn_options.write_batch_flush_threshold == 1) {
+            // WriteUnprepared will check write_batch_flush_threshold and
+            // possibly flush before appending to the write batch. No flush will
+            // happen at the first write because the batch is still empty, so
+            // after k puts, there should be k-1 flushed batches.
+            ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), i);
+          } else {
+            ASSERT_EQ(wup_txn->GetUnpreparedSequenceNumbers().size(), 0);
+          }
+        }
+
+        if (prepare) {
+          ASSERT_OK(txn->Prepare());
+        }
+
+        Iterator* iter = db->NewIterator(ReadOptions());
+        ASSERT_OK(iter->status());
+        iter->SeekToFirst();
+        assert(!iter->Valid());
+        ASSERT_FALSE(iter->Valid());
+        ASSERT_OK(iter->status());
+        delete iter;
+
+        if (commit) {
+          ASSERT_OK(txn->Commit());
+        } else {
+          ASSERT_OK(txn->Rollback());
+        }
+        delete txn;
+
+        iter = db->NewIterator(ReadOptions());
+        ASSERT_OK(iter->status());
+        iter->SeekToFirst();
+
+        for (int i = 0; i < (commit ? kNumKeys : 0); i++) {
+          ASSERT_TRUE(iter->Valid());
+          ASSERT_EQ(iter->key().ToString(), "k" + std::to_string(i));
+          ASSERT_EQ(iter->value().ToString(), "v" + std::to_string(i));
+          iter->Next();
+        }
+        ASSERT_FALSE(iter->Valid());
+        ASSERT_OK(iter->status());
+        delete iter;
+      }
+    }
+  }
+}
+
+// Test whether logs containing unprepared/prepared batches are kept even
+// after memtable finishes flushing, and whether they are removed when
+// transaction commits/aborts.
+//
+// TODO(lth): Merge with TransactionTest/TwoPhaseLogRollingTest tests.
+TEST_P(WriteUnpreparedTransactionTest, MarkLogWithPrepSection) {
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  // batch_size of 1 causes writes to DB for every marker.
+  txn_options.write_batch_flush_threshold = 1;
+  const int kNumKeys = 10;
+
+  WriteOptions wopts;
+  wopts.sync = true;
+
+  for (bool prepare : {false, true}) {
+    for (bool commit : {false, true}) {
+      ASSERT_OK(ReOpen());
+      auto wup_db = dynamic_cast<WriteUnpreparedTxnDB*>(db);
+      auto db_impl = wup_db->db_impl_;
+
+      Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+      ASSERT_OK(txn1->SetName("xid1"));
+
+      Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+      ASSERT_OK(txn2->SetName("xid2"));
+
+      // Spread this transaction across multiple log files.
+      for (int i = 0; i < kNumKeys; i++) {
+        ASSERT_OK(txn1->Put("k1" + std::to_string(i), "v" + std::to_string(i)));
+        if (i >= kNumKeys / 2) {
+          ASSERT_OK(
+              txn2->Put("k2" + std::to_string(i), "v" + std::to_string(i)));
+        }
+
+        if (i > 0) {
+          ASSERT_OK(db_impl->TEST_SwitchWAL());
+        }
+      }
+
+      ASSERT_GT(txn1->GetLogNumber(), 0);
+      ASSERT_GT(txn2->GetLogNumber(), 0);
+
+      ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+                txn1->GetLogNumber());
+      ASSERT_GT(db_impl->TEST_LogfileNumber(), txn1->GetLogNumber());
+
+      if (prepare) {
+        ASSERT_OK(txn1->Prepare());
+        ASSERT_OK(txn2->Prepare());
+      }
+
+      ASSERT_GE(db_impl->TEST_LogfileNumber(), txn1->GetLogNumber());
+      ASSERT_GE(db_impl->TEST_LogfileNumber(), txn2->GetLogNumber());
+
+      ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+                txn1->GetLogNumber());
+      if (commit) {
+        ASSERT_OK(txn1->Commit());
+      } else {
+        ASSERT_OK(txn1->Rollback());
+      }
+
+      ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(),
+                txn2->GetLogNumber());
+
+      if (commit) {
+        ASSERT_OK(txn2->Commit());
+      } else {
+        ASSERT_OK(txn2->Rollback());
+      }
+
+      ASSERT_EQ(db_impl->TEST_FindMinLogContainingOutstandingPrep(), 0);
+
+      delete txn1;
+      delete txn2;
+    }
+  }
+}
+
+TEST_P(WriteUnpreparedTransactionTest, NoSnapshotWrite) {
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  txn_options.write_batch_flush_threshold = 1;
+
+  Transaction* txn = db->BeginTransaction(woptions, txn_options);
+
+  // Do some writes with no snapshot
+  ASSERT_OK(txn->Put("a", "a"));
+  ASSERT_OK(txn->Put("b", "b"));
+  ASSERT_OK(txn->Put("c", "c"));
+
+  // Test that it is still possible to create iterators after writes with no
+  // snapshot, if iterator snapshot is fresh enough.
+  ReadOptions roptions;
+  auto iter = txn->GetIterator(roptions);
+  ASSERT_OK(iter->status());
+  int keys = 0;
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev(), keys++) {
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(iter->key().ToString(), iter->value().ToString());
+  }
+  ASSERT_EQ(keys, 3);
+  ASSERT_OK(iter->status());
+
+  delete iter;
+  delete txn;
+}
+
+// Test whether write to a transaction while iterating is supported.
+TEST_P(WriteUnpreparedTransactionTest, IterateAndWrite) {
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  txn_options.write_batch_flush_threshold = 1;
+
+  enum Action { DO_DELETE, DO_UPDATE };
+
+  for (Action a : {DO_DELETE, DO_UPDATE}) {
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(db->Put(woptions, std::to_string(i), std::to_string(i)));
+    }
+
+    Transaction* txn = db->BeginTransaction(woptions, txn_options);
+    // write_batch_ now contains 1 key.
+    ASSERT_OK(txn->Put("9", "a"));
+
+    ReadOptions roptions;
+    auto iter = txn->GetIterator(roptions);
+    ASSERT_OK(iter->status());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      if (iter->key() == "9") {
+        ASSERT_EQ(iter->value().ToString(), "a");
+      } else {
+        ASSERT_EQ(iter->key().ToString(), iter->value().ToString());
+      }
+
+      if (a == DO_DELETE) {
+        ASSERT_OK(txn->Delete(iter->key()));
+      } else {
+        ASSERT_OK(txn->Put(iter->key(), "b"));
+      }
+    }
+    ASSERT_OK(iter->status());
+
+    delete iter;
+    ASSERT_OK(txn->Commit());
+
+    iter = db->NewIterator(roptions);
+    ASSERT_OK(iter->status());
+    if (a == DO_DELETE) {
+      // Check that db is empty.
+      iter->SeekToFirst();
+      ASSERT_FALSE(iter->Valid());
+    } else {
+      int keys = 0;
+      // Check that all values are updated to b.
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next(), keys++) {
+        ASSERT_OK(iter->status());
+        ASSERT_EQ(iter->value().ToString(), "b");
+      }
+      ASSERT_EQ(keys, 100);
+    }
+    ASSERT_OK(iter->status());
+
+    delete iter;
+    delete txn;
+  }
+}
+
+// Test that using an iterator after transaction clear is not supported
+TEST_P(WriteUnpreparedTransactionTest, IterateAfterClear) {
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  txn_options.write_batch_flush_threshold = 1;
+
+  enum Action { kCommit, kRollback };
+
+  for (Action a : {kCommit, kRollback}) {
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(db->Put(woptions, std::to_string(i), std::to_string(i)));
+    }
+
+    Transaction* txn = db->BeginTransaction(woptions, txn_options);
+    ASSERT_OK(txn->Put("9", "a"));
+
+    ReadOptions roptions;
+    auto iter1 = txn->GetIterator(roptions);
+    auto iter2 = txn->GetIterator(roptions);
+    iter1->SeekToFirst();
+    iter2->Seek("9");
+
+    // Check that iterators are valid before transaction finishes.
+    ASSERT_TRUE(iter1->Valid());
+    ASSERT_TRUE(iter2->Valid());
+    ASSERT_OK(iter1->status());
+    ASSERT_OK(iter2->status());
+
+    if (a == kCommit) {
+      ASSERT_OK(txn->Commit());
+    } else {
+      ASSERT_OK(txn->Rollback());
+    }
+
+    // Check that iterators are invalidated after transaction finishes.
+    ASSERT_FALSE(iter1->Valid());
+    ASSERT_FALSE(iter2->Valid());
+    ASSERT_TRUE(iter1->status().IsInvalidArgument());
+    ASSERT_TRUE(iter2->status().IsInvalidArgument());
+
+    delete iter1;
+    delete iter2;
+    delete txn;
+  }
+}
+
+TEST_P(WriteUnpreparedTransactionTest, SavePoint) {
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  txn_options.write_batch_flush_threshold = 1;
+
+  Transaction* txn = db->BeginTransaction(woptions, txn_options);
+  txn->SetSavePoint();
+  ASSERT_OK(txn->Put("a", "a"));
+  ASSERT_OK(txn->Put("b", "b"));
+  ASSERT_OK(txn->Commit());
+
+  ReadOptions roptions;
+  std::string value;
+  ASSERT_OK(txn->Get(roptions, "a", &value));
+  ASSERT_EQ(value, "a");
+  ASSERT_OK(txn->Get(roptions, "b", &value));
+  ASSERT_EQ(value, "b");
+  delete txn;
+}
+
+TEST_P(WriteUnpreparedTransactionTest, UntrackedKeys) {
+  WriteOptions woptions;
+  TransactionOptions txn_options;
+  txn_options.write_batch_flush_threshold = 1;
+
+  Transaction* txn = db->BeginTransaction(woptions, txn_options);
+  auto wb = txn->GetWriteBatch()->GetWriteBatch();
+  ASSERT_OK(txn->Put("a", "a"));
+  ASSERT_OK(wb->Put("a_untrack", "a_untrack"));
+  txn->SetSavePoint();
+  ASSERT_OK(txn->Put("b", "b"));
+  ASSERT_OK(txn->Put("b_untrack", "b_untrack"));
+
+  ReadOptions roptions;
+  std::string value;
+  ASSERT_OK(txn->Get(roptions, "a", &value));
+  ASSERT_EQ(value, "a");
+  ASSERT_OK(txn->Get(roptions, "a_untrack", &value));
+  ASSERT_EQ(value, "a_untrack");
+  ASSERT_OK(txn->Get(roptions, "b", &value));
+  ASSERT_EQ(value, "b");
+  ASSERT_OK(txn->Get(roptions, "b_untrack", &value));
+  ASSERT_EQ(value, "b_untrack");
+
+  // b and b_untrack should be rolled back.
+  ASSERT_OK(txn->RollbackToSavePoint());
+  ASSERT_OK(txn->Get(roptions, "a", &value));
+  ASSERT_EQ(value, "a");
+  ASSERT_OK(txn->Get(roptions, "a_untrack", &value));
+  ASSERT_EQ(value, "a_untrack");
+  auto s = txn->Get(roptions, "b", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn->Get(roptions, "b_untrack", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Everything should be rolled back.
+  ASSERT_OK(txn->Rollback());
+  s = txn->Get(roptions, "a", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn->Get(roptions, "a_untrack", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn->Get(roptions, "b", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = txn->Get(roptions, "b_untrack", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr,
+          "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn.cc b/src/rocksdb/utilities/transactions/write_unprepared_txn.cc
new file mode 100644
index 000000000..6e04d3344
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn.cc
@@ -0,0 +1,1053 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_unprepared_txn.h"
+
+#include "db/db_impl/db_impl.h"
+#include "util/cast_util.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool WriteUnpreparedTxnReadCallback::IsVisibleFullCheck(SequenceNumber seq) {
+  // Since unprep_seqs maps prep_seq => prepare_batch_cnt, to check if seq is
+  // in unprep_seqs, we have to check if seq is equal to prep_seq or any of
+  // the prepare_batch_cnt seq nums after it.
+  //
+  // TODO(lth): Can be optimized with std::lower_bound if unprep_seqs is
+  // large.
+  for (const auto& it : unprep_seqs_) {
+    if (it.first <= seq && seq < it.first + it.second) {
+      return true;
+    }
+  }
+
+  bool snap_released = false;
+  auto ret =
+      db_->IsInSnapshot(seq, wup_snapshot_, min_uncommitted_, &snap_released);
+  assert(!snap_released || backed_by_snapshot_ == kUnbackedByDBSnapshot);
+  snap_released_ |= snap_released;
+  return ret;
+}
+
+WriteUnpreparedTxn::WriteUnpreparedTxn(WriteUnpreparedTxnDB* txn_db,
+                                       const WriteOptions& write_options,
+                                       const TransactionOptions& txn_options)
+    : WritePreparedTxn(txn_db, write_options, txn_options),
+      wupt_db_(txn_db),
+      last_log_number_(0),
+      recovered_txn_(false),
+      largest_validated_seq_(0) {
+  if (txn_options.write_batch_flush_threshold < 0) {
+    write_batch_flush_threshold_ =
+        txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold;
+  } else {
+    write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold;
+  }
+}
+
+WriteUnpreparedTxn::~WriteUnpreparedTxn() {
+  if (!unprep_seqs_.empty()) {
+    assert(log_number_ > 0);
+    assert(GetId() > 0);
+    assert(!name_.empty());
+
+    // We should rollback regardless of GetState, but some unit tests that
+    // test crash recovery run the destructor assuming that rollback does not
+    // happen, so that rollback during recovery can be exercised.
+    if (GetState() == STARTED || GetState() == LOCKS_STOLEN) {
+      auto s = RollbackInternal();
+      assert(s.ok());
+      if (!s.ok()) {
+        ROCKS_LOG_FATAL(
+            wupt_db_->info_log_,
+            "Rollback of WriteUnprepared transaction failed in destructor: %s",
+            s.ToString().c_str());
+      }
+      dbimpl_->logs_with_prep_tracker()->MarkLogAsHavingPrepSectionFlushed(
+          log_number_);
+    }
+  }
+
+  // Clear the tracked locks so that ~PessimisticTransaction does not
+  // try to unlock keys for recovered transactions.
+  if (recovered_txn_) {
+    tracked_locks_->Clear();
+  }
+}
+
+void WriteUnpreparedTxn::Initialize(const TransactionOptions& txn_options) {
+  PessimisticTransaction::Initialize(txn_options);
+  if (txn_options.write_batch_flush_threshold < 0) {
+    write_batch_flush_threshold_ =
+        txn_db_impl_->GetTxnDBOptions().default_write_batch_flush_threshold;
+  } else {
+    write_batch_flush_threshold_ = txn_options.write_batch_flush_threshold;
+  }
+
+  unprep_seqs_.clear();
+  flushed_save_points_.reset(nullptr);
+  unflushed_save_points_.reset(nullptr);
+  recovered_txn_ = false;
+  largest_validated_seq_ = 0;
+  assert(active_iterators_.empty());
+  active_iterators_.clear();
+  untracked_keys_.clear();
+}
+
+Status WriteUnpreparedTxn::HandleWrite(std::function<Status()> do_write) {
+  Status s;
+  if (active_iterators_.empty()) {
+    s = MaybeFlushWriteBatchToDB();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  s = do_write();
+  if (s.ok()) {
+    if (snapshot_) {
+      largest_validated_seq_ =
+          std::max(largest_validated_seq_, snapshot_->GetSequenceNumber());
+    } else {
+      // TODO(lth): We should use the same number as tracked_at_seq in TryLock,
+      // because what is actually being tracked is the sequence number at which
+      // this key was locked at.
+      largest_validated_seq_ = db_impl_->GetLastPublishedSequence();
+    }
+  }
+  return s;
+}
+
+Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family,
+                               const Slice& key, const Slice& value,
+                               const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Put(column_family, key, value, assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::Put(ColumnFamilyHandle* column_family,
+                               const SliceParts& key, const SliceParts& value,
+                               const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Put(column_family, key, value, assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::Merge(ColumnFamilyHandle* column_family,
+                                 const Slice& key, const Slice& value,
+                                 const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Merge(column_family, key, value,
+                                      assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family,
+                                  const Slice& key, const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::Delete(ColumnFamilyHandle* column_family,
+                                  const SliceParts& key,
+                                  const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::Delete(column_family, key, assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+                                        const Slice& key,
+                                        const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::SingleDelete(column_family, key,
+                                             assume_tracked);
+  });
+}
+
+Status WriteUnpreparedTxn::SingleDelete(ColumnFamilyHandle* column_family,
+                                        const SliceParts& key,
+                                        const bool assume_tracked) {
+  return HandleWrite([&]() {
+    return TransactionBaseImpl::SingleDelete(column_family, key,
+                                             assume_tracked);
+  });
+}
+
+// WriteUnpreparedTxn::RebuildFromWriteBatch is only called on recovery. For
+// WriteUnprepared, the write batches have already been written into the
+// database during WAL replay, so all we have to do is just to "retrack" the key
+// so that rollbacks are possible.
+//
+// Calling TryLock instead of TrackKey is also possible, but as an optimization,
+// recovered transactions do not hold locks on their keys. This follows the
+// implementation in PessimisticTransactionDB::Initialize where we set
+// skip_concurrency_control to true.
+Status WriteUnpreparedTxn::RebuildFromWriteBatch(WriteBatch* wb) {
+  struct TrackKeyHandler : public WriteBatch::Handler {
+    WriteUnpreparedTxn* txn_;
+    bool rollback_merge_operands_;
+
+    TrackKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands)
+        : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {}
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+      txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+                     false /* read_only */, true /* exclusive */);
+      return Status::OK();
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+                     false /* read_only */, true /* exclusive */);
+      return Status::OK();
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+                     false /* read_only */, true /* exclusive */);
+      return Status::OK();
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+      if (rollback_merge_operands_) {
+        txn_->TrackKey(cf, key.ToString(), kMaxSequenceNumber,
+                       false /* read_only */, true /* exclusive */);
+      }
+      return Status::OK();
+    }
+
+    // Recovered batches do not contain 2PC markers.
+    Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkEndPrepare(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkNoop(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkCommit(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+  };
+
+  TrackKeyHandler handler(this,
+                          wupt_db_->txn_db_options_.rollback_merge_operands);
+  return wb->Iterate(&handler);
+}
+
+Status WriteUnpreparedTxn::MaybeFlushWriteBatchToDB() {
+  const bool kPrepared = true;
+  Status s;
+  if (write_batch_flush_threshold_ > 0 &&
+      write_batch_.GetWriteBatch()->Count() > 0 &&
+      write_batch_.GetDataSize() >
+          static_cast<size_t>(write_batch_flush_threshold_)) {
+    assert(GetState() != PREPARED);
+    s = FlushWriteBatchToDB(!kPrepared);
+  }
+  return s;
+}
+
+Status WriteUnpreparedTxn::FlushWriteBatchToDB(bool prepared) {
+  // If the current write batch contains savepoints, then some special handling
+  // is required so that RollbackToSavepoint can work.
+  //
+  // RollbackToSavepoint is not supported after Prepare() is called, so only do
+  // this for unprepared batches.
+  if (!prepared && unflushed_save_points_ != nullptr &&
+      !unflushed_save_points_->empty()) {
+    return FlushWriteBatchWithSavePointToDB();
+  }
+
+  return FlushWriteBatchToDBInternal(prepared);
+}
+
+Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) {
+  if (name_.empty()) {
+    assert(!prepared);
+#ifndef NDEBUG
+    static std::atomic_ullong autogen_id{0};
+    // To avoid changing all tests to call SetName, just autogenerate one.
+    if (wupt_db_->txn_db_options_.autogenerate_name) {
+      auto s = SetName(std::string("autoxid") +
+                       std::to_string(autogen_id.fetch_add(1)));
+      assert(s.ok());
+    } else
+#endif
+    {
+      return Status::InvalidArgument("Cannot write to DB without SetName.");
+    }
+  }
+
+  struct UntrackedKeyHandler : public WriteBatch::Handler {
+    WriteUnpreparedTxn* txn_;
+    bool rollback_merge_operands_;
+
+    UntrackedKeyHandler(WriteUnpreparedTxn* txn, bool rollback_merge_operands)
+        : txn_(txn), rollback_merge_operands_(rollback_merge_operands) {}
+
+    Status AddUntrackedKey(uint32_t cf, const Slice& key) {
+      auto str = key.ToString();
+      PointLockStatus lock_status =
+          txn_->tracked_locks_->GetPointLockStatus(cf, str);
+      if (!lock_status.locked) {
+        txn_->untracked_keys_[cf].push_back(str);
+      }
+      return Status::OK();
+    }
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+      return AddUntrackedKey(cf, key);
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      return AddUntrackedKey(cf, key);
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      return AddUntrackedKey(cf, key);
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+      if (rollback_merge_operands_) {
+        return AddUntrackedKey(cf, key);
+      }
+      return Status::OK();
+    }
+
+    // The only expected 2PC marker is the initial Noop marker.
+    Status MarkNoop(bool empty_batch) override {
+      return empty_batch ? Status::OK() : Status::InvalidArgument();
+    }
+
+    Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkEndPrepare(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkCommit(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+  };
+
+  UntrackedKeyHandler handler(
+      this, wupt_db_->txn_db_options_.rollback_merge_operands);
+  auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&handler);
+  assert(s.ok());
+
+  // TODO(lth): Reduce duplicate code with WritePrepared prepare logic.
+  WriteOptions write_options = write_options_;
+  write_options.disableWAL = false;
+  const bool WRITE_AFTER_COMMIT = true;
+  const bool first_prepare_batch = log_number_ == 0;
+  // MarkEndPrepare will change Noop marker to the appropriate marker.
+  s = WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(),
+                                         name_, !WRITE_AFTER_COMMIT, !prepared);
+  assert(s.ok());
+  // For each duplicate key we account for a new sub-batch
+  prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt();
+  // AddPrepared better to be called in the pre-release callback otherwise there
+  // is a non-zero chance of max advancing prepare_seq and readers assume the
+  // data as committed.
+  // Also having it in the PreReleaseCallback allows in-order addition of
+  // prepared entries to PreparedHeap and hence enables an optimization. Refer
+  // to SmallestUnCommittedSeq for more details.
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, prepare_batch_cnt_,
+      db_impl_->immutable_db_options().two_write_queues, first_prepare_batch);
+  const bool DISABLE_MEMTABLE = true;
+  uint64_t seq_used = kMaxSequenceNumber;
+  // log_number_ should refer to the oldest log containing uncommitted data
+  // from the current transaction. This means that if log_number_ is set,
+  // WriteImpl should not overwrite that value, so set log_used to nullptr if
+  // log_number_ is already set.
+  s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
+                          /*callback*/ nullptr, &last_log_number_,
+                          /*log ref*/ 0, !DISABLE_MEMTABLE, &seq_used,
+                          prepare_batch_cnt_, &add_prepared_callback);
+  if (log_number_ == 0) {
+    log_number_ = last_log_number_;
+  }
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  auto prepare_seq = seq_used;
+
+  // Only call SetId if it hasn't been set yet.
+  if (GetId() == 0) {
+    SetId(prepare_seq);
+  }
+  // unprep_seqs_ will also contain prepared seqnos since they are treated in
+  // the same way in the prepare/commit callbacks. See the comment on the
+  // definition of unprep_seqs_.
+  unprep_seqs_[prepare_seq] = prepare_batch_cnt_;
+
+  // Reset transaction state.
+  if (!prepared) {
+    prepare_batch_cnt_ = 0;
+    const bool kClear = true;
+    TransactionBaseImpl::InitWriteBatch(kClear);
+  }
+
+  return s;
+}
+
+Status WriteUnpreparedTxn::FlushWriteBatchWithSavePointToDB() {
+  assert(unflushed_save_points_ != nullptr &&
+         unflushed_save_points_->size() > 0);
+  assert(save_points_ != nullptr && save_points_->size() > 0);
+  assert(save_points_->size() >= unflushed_save_points_->size());
+
+  // Handler class for creating an unprepared batch from a savepoint.
+  struct SavePointBatchHandler : public WriteBatch::Handler {
+    WriteBatchWithIndex* wb_;
+    const std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+
+    SavePointBatchHandler(
+        WriteBatchWithIndex* wb,
+        const std::map<uint32_t, ColumnFamilyHandle*>& handles)
+        : wb_(wb), handles_(handles) {}
+
+    Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+      return wb_->Put(handles_.at(cf), key, value);
+    }
+
+    Status DeleteCF(uint32_t cf, const Slice& key) override {
+      return wb_->Delete(handles_.at(cf), key);
+    }
+
+    Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+      return wb_->SingleDelete(handles_.at(cf), key);
+    }
+
+    Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+      return wb_->Merge(handles_.at(cf), key, value);
+    }
+
+    // The only expected 2PC marker is the initial Noop marker.
+    Status MarkNoop(bool empty_batch) override {
+      return empty_batch ? Status::OK() : Status::InvalidArgument();
+    }
+
+    Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }
+
+    Status MarkEndPrepare(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkCommit(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+
+    Status MarkRollback(const Slice&) override {
+      return Status::InvalidArgument();
+    }
+  };
+
+  // The comparator of the default cf is passed in, similar to the
+  // initialization of TransactionBaseImpl::write_batch_. This comparator is
+  // only used if the write batch encounters an invalid cf id, and falls back to
+  // this comparator.
+  WriteBatchWithIndex wb(wpt_db_->DefaultColumnFamily()->GetComparator(), 0,
+                         true, 0, write_options_.protection_bytes_per_key);
+  // Swap with write_batch_ so that wb contains the complete write batch. The
+  // actual write batch that will be flushed to DB will be built in
+  // write_batch_, and will be read by FlushWriteBatchToDBInternal.
+  std::swap(wb, write_batch_);
+  TransactionBaseImpl::InitWriteBatch();
+
+  size_t prev_boundary = WriteBatchInternal::kHeader;
+  const bool kPrepared = true;
+  for (size_t i = 0; i < unflushed_save_points_->size() + 1; i++) {
+    bool trailing_batch = i == unflushed_save_points_->size();
+    SavePointBatchHandler sp_handler(&write_batch_,
+                                     *wupt_db_->GetCFHandleMap().get());
+    size_t curr_boundary = trailing_batch ? wb.GetWriteBatch()->GetDataSize()
+                                          : (*unflushed_save_points_)[i];
+
+    // Construct the partial write batch up to the savepoint.
+    //
+    // Theoretically, a memcpy between the write batches should be sufficient
+    // since the rewriting into the batch should produce the exact same byte
+    // representation. Rebuilding the WriteBatchWithIndex index is still
+    // necessary though, and would imply doing two passes over the batch though.
+    Status s = WriteBatchInternal::Iterate(wb.GetWriteBatch(), &sp_handler,
+                                           prev_boundary, curr_boundary);
+    if (!s.ok()) {
+      return s;
+    }
+
+    if (write_batch_.GetWriteBatch()->Count() > 0) {
+      // Flush the write batch.
+      s = FlushWriteBatchToDBInternal(!kPrepared);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    if (!trailing_batch) {
+      if (flushed_save_points_ == nullptr) {
+        flushed_save_points_.reset(
+            new autovector<WriteUnpreparedTxn::SavePoint>());
+      }
+      flushed_save_points_->emplace_back(
+          unprep_seqs_, new ManagedSnapshot(db_impl_, wupt_db_->GetSnapshot()));
+    }
+
+    prev_boundary = curr_boundary;
+    const bool kClear = true;
+    TransactionBaseImpl::InitWriteBatch(kClear);
+  }
+
+  unflushed_save_points_->clear();
+  return Status::OK();
+}
+
+Status WriteUnpreparedTxn::PrepareInternal() {
+  const bool kPrepared = true;
+  return FlushWriteBatchToDB(kPrepared);
+}
+
+Status WriteUnpreparedTxn::CommitWithoutPrepareInternal() {
+  if (unprep_seqs_.empty()) {
+    assert(log_number_ == 0);
+    assert(GetId() == 0);
+    return WritePreparedTxn::CommitWithoutPrepareInternal();
+  }
+
+  // TODO(lth): We should optimize commit without prepare to not perform
+  // a prepare under the hood.
+  auto s = PrepareInternal();
+  if (!s.ok()) {
+    return s;
+  }
+  return CommitInternal();
+}
+
+Status WriteUnpreparedTxn::CommitInternal() {
+  // TODO(lth): Reduce duplicate code with WritePrepared commit logic.
+
+  // We take the commit-time batch and append the Commit marker.  The Memtable
+  // will ignore the Commit marker in non-recovery mode
+  WriteBatch* working_batch = GetCommitTimeWriteBatch();
+  const bool empty = working_batch->Count() == 0;
+  auto s = WriteBatchInternal::MarkCommit(working_batch, name_);
+  assert(s.ok());
+
+  const bool for_recovery = use_only_the_last_commit_time_batch_for_recovery_;
+  if (!empty) {
+    // When not writing to memtable, we can still cache the latest write batch.
+    // The cached batch will be written to memtable in WriteRecoverableState
+    // during FlushMemTable
+    if (for_recovery) {
+      WriteBatchInternal::SetAsLatestPersistentState(working_batch);
+    } else {
+      return Status::InvalidArgument(
+          "Commit-time-batch can only be used if "
+          "use_only_the_last_commit_time_batch_for_recovery is true");
+    }
+  }
+
+  const bool includes_data = !empty && !for_recovery;
+  size_t commit_batch_cnt = 0;
+  if (UNLIKELY(includes_data)) {
+    ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
+                   "Duplicate key overhead");
+    SubBatchCounter counter(*wpt_db_->GetCFComparatorMap());
+    s = working_batch->Iterate(&counter);
+    assert(s.ok());
+    commit_batch_cnt = counter.BatchCount();
+  }
+  const bool disable_memtable = !includes_data;
+  const bool do_one_write =
+      !db_impl_->immutable_db_options().two_write_queues || disable_memtable;
+
+  WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map(
+      wpt_db_, db_impl_, unprep_seqs_, commit_batch_cnt);
+  const bool kFirstPrepareBatch = true;
+  AddPreparedCallback add_prepared_callback(
+      wpt_db_, db_impl_, commit_batch_cnt,
+      db_impl_->immutable_db_options().two_write_queues, !kFirstPrepareBatch);
+  PreReleaseCallback* pre_release_callback;
+  if (do_one_write) {
+    pre_release_callback = &update_commit_map;
+  } else {
+    pre_release_callback = &add_prepared_callback;
+  }
+  uint64_t seq_used = kMaxSequenceNumber;
+  // Since the prepared batch is directly written to memtable, there is
+  // already a connection between the memtable and its WAL, so there is no
+  // need to redundantly reference the log that contains the prepared data.
+  const uint64_t zero_log_number = 0ull;
+  size_t batch_cnt = UNLIKELY(commit_batch_cnt) ? commit_batch_cnt : 1;
+  s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
+                          zero_log_number, disable_memtable, &seq_used,
+                          batch_cnt, pre_release_callback);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  const SequenceNumber commit_batch_seq = seq_used;
+  if (LIKELY(do_one_write || !s.ok())) {
+    if (LIKELY(s.ok())) {
+      // Note RemovePrepared should be called after WriteImpl that publishsed
+      // the seq. Otherwise SmallestUnCommittedSeq optimization breaks.
+      for (const auto& seq : unprep_seqs_) {
+        wpt_db_->RemovePrepared(seq.first, seq.second);
+      }
+    }
+    if (UNLIKELY(!do_one_write)) {
+      wpt_db_->RemovePrepared(commit_batch_seq, commit_batch_cnt);
+    }
+    unprep_seqs_.clear();
+    flushed_save_points_.reset(nullptr);
+    unflushed_save_points_.reset(nullptr);
+    return s;
+  }  // else do the 2nd write to publish seq
+
+  // Populate unprep_seqs_ with commit_batch_seq, since we treat data in the
+  // commit write batch as just another "unprepared" batch. This will also
+  // update the unprep_seqs_ in the update_commit_map callback.
+  unprep_seqs_[commit_batch_seq] = commit_batch_cnt;
+  WriteUnpreparedCommitEntryPreReleaseCallback
+      update_commit_map_with_commit_batch(wpt_db_, db_impl_, unprep_seqs_, 0);
+
+  // Note: the 2nd write comes with a performance penality. So if we have too
+  // many of commits accompanied with ComitTimeWriteBatch and yet we cannot
+  // enable use_only_the_last_commit_time_batch_for_recovery_ optimization,
+  // two_write_queues should be disabled to avoid many additional writes here.
+
+  // Update commit map only from the 2nd queue
+  WriteBatch empty_batch;
+  s = empty_batch.PutLogData(Slice());
+  assert(s.ok());
+  // In the absence of Prepare markers, use Noop as a batch separator
+  s = WriteBatchInternal::InsertNoop(&empty_batch);
+  assert(s.ok());
+  const bool DISABLE_MEMTABLE = true;
+  const size_t ONE_BATCH = 1;
+  const uint64_t NO_REF_LOG = 0;
+  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_commit_batch);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  // Note RemovePrepared should be called after WriteImpl that publishsed the
+  // seq. Otherwise SmallestUnCommittedSeq optimization breaks.
+  for (const auto& seq : unprep_seqs_) {
+    wpt_db_->RemovePrepared(seq.first, seq.second);
+  }
+  unprep_seqs_.clear();
+  flushed_save_points_.reset(nullptr);
+  unflushed_save_points_.reset(nullptr);
+  return s;
+}
+
+Status WriteUnpreparedTxn::WriteRollbackKeys(
+    const LockTracker& lock_tracker, WriteBatchWithIndex* rollback_batch,
+    ReadCallback* callback, const ReadOptions& roptions) {
+  // This assertion can be removed when range lock is supported.
+  assert(lock_tracker.IsPointLockSupported());
+  const auto& cf_map = *wupt_db_->GetCFHandleMap();
+  auto WriteRollbackKey = [&](const std::string& key, uint32_t cfid) {
+    const auto& cf_handle = cf_map.at(cfid);
+    PinnableSlice pinnable_val;
+    bool not_used;
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = cf_handle;
+    get_impl_options.value = &pinnable_val;
+    get_impl_options.value_found = &not_used;
+    get_impl_options.callback = callback;
+    auto s = db_impl_->GetImpl(roptions, key, get_impl_options);
+
+    if (s.ok()) {
+      s = rollback_batch->Put(cf_handle, key, pinnable_val);
+      assert(s.ok());
+    } else if (s.IsNotFound()) {
+      if (wupt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
+        s = rollback_batch->SingleDelete(cf_handle, key);
+      } else {
+        s = rollback_batch->Delete(cf_handle, key);
+      }
+      assert(s.ok());
+    } else {
+      return s;
+    }
+
+    return Status::OK();
+  };
+
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      lock_tracker.GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        lock_tracker.GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      auto s = WriteRollbackKey(key, cf);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  for (const auto& cfkey : untracked_keys_) {
+    const auto cfid = cfkey.first;
+    const auto& keys = cfkey.second;
+    for (const auto& key : keys) {
+      auto s = WriteRollbackKey(key, cfid);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status WriteUnpreparedTxn::RollbackInternal() {
+  // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
+  WriteBatchWithIndex rollback_batch(
+      wpt_db_->DefaultColumnFamily()->GetComparator(), 0, true, 0,
+      write_options_.protection_bytes_per_key);
+  assert(GetId() != kMaxSequenceNumber);
+  assert(GetId() > 0);
+  Status s;
+  auto read_at_seq = kMaxSequenceNumber;
+  ReadOptions roptions;
+  // to prevent callback's seq to be overrriden inside DBImpk::Get
+  roptions.snapshot = wpt_db_->GetMaxSnapshot();
+  // Note that we do not use WriteUnpreparedTxnReadCallback because we do not
+  // need to read our own writes when reading prior versions of the key for
+  // rollback.
+  WritePreparedTxnReadCallback callback(wpt_db_, read_at_seq);
+  // TODO(lth): We write rollback batch all in a single batch here, but this
+  // should be subdivded into multiple batches as well. In phase 2, when key
+  // sets are read from WAL, this will happen naturally.
+  s = WriteRollbackKeys(*tracked_locks_, &rollback_batch, &callback, roptions);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // The Rollback marker will be used as a batch separator
+  s = WriteBatchInternal::MarkRollback(rollback_batch.GetWriteBatch(), name_);
+  assert(s.ok());
+  bool do_one_write = !db_impl_->immutable_db_options().two_write_queues;
+  const bool DISABLE_MEMTABLE = true;
+  const uint64_t NO_REF_LOG = 0;
+  uint64_t seq_used = kMaxSequenceNumber;
+  // Rollback batch may contain duplicate keys, because tracked_keys_ is not
+  // comparator aware.
+  auto rollback_batch_cnt = rollback_batch.SubBatchCnt();
+  // We commit the rolled back prepared batches. Although this is
+  // counter-intuitive, i) it is safe to do so, since the prepared batches are
+  // already canceled out by the rollback batch, ii) adding the commit entry to
+  // CommitCache will allow us to benefit from the existing mechanism in
+  // CommitCache that keeps an entry evicted due to max advance and yet overlaps
+  // with a live snapshot around so that the live snapshot properly skips the
+  // entry even if its prepare seq is lower than max_evicted_seq_.
+  //
+  // TODO(lth): RollbackInternal is conceptually very similar to
+  // CommitInternal, with the rollback batch simply taking on the role of
+  // CommitTimeWriteBatch. We should be able to merge the two code paths.
+  WriteUnpreparedCommitEntryPreReleaseCallback update_commit_map(
+      wpt_db_, db_impl_, unprep_seqs_, rollback_batch_cnt);
+  // Note: the rollback batch does not need AddPrepared since it is written to
+  // DB in one shot. min_uncommitted still works since it requires capturing
+  // data that is written to DB but not yet committed, while the rollback
+  // batch commits with PreReleaseCallback.
+  s = db_impl_->WriteImpl(write_options_, rollback_batch.GetWriteBatch(),
+                          nullptr, nullptr, NO_REF_LOG, !DISABLE_MEMTABLE,
+                          &seq_used, rollback_batch_cnt,
+                          do_one_write ? &update_commit_map : nullptr);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  if (!s.ok()) {
+    return s;
+  }
+  if (do_one_write) {
+    for (const auto& seq : unprep_seqs_) {
+      wpt_db_->RemovePrepared(seq.first, seq.second);
+    }
+    unprep_seqs_.clear();
+    flushed_save_points_.reset(nullptr);
+    unflushed_save_points_.reset(nullptr);
+    return s;
+  }  // else do the 2nd write for commit
+
+  uint64_t& prepare_seq = seq_used;
+  // Populate unprep_seqs_ with rollback_batch_cnt, since we treat data in the
+  // rollback write batch as just another "unprepared" batch. This will also
+  // update the unprep_seqs_ in the update_commit_map callback.
+  unprep_seqs_[prepare_seq] = rollback_batch_cnt;
+  WriteUnpreparedCommitEntryPreReleaseCallback
+      update_commit_map_with_rollback_batch(wpt_db_, db_impl_, unprep_seqs_, 0);
+
+  ROCKS_LOG_DETAILS(db_impl_->immutable_db_options().info_log,
+                    "RollbackInternal 2nd write prepare_seq: %" PRIu64,
+                    prepare_seq);
+  WriteBatch empty_batch;
+  const size_t ONE_BATCH = 1;
+  s = empty_batch.PutLogData(Slice());
+  assert(s.ok());
+  // In the absence of Prepare markers, use Noop as a batch separator
+  s = WriteBatchInternal::InsertNoop(&empty_batch);
+  assert(s.ok());
+  s = db_impl_->WriteImpl(write_options_, &empty_batch, nullptr, nullptr,
+                          NO_REF_LOG, DISABLE_MEMTABLE, &seq_used, ONE_BATCH,
+                          &update_commit_map_with_rollback_batch);
+  assert(!s.ok() || seq_used != kMaxSequenceNumber);
+  // Mark the txn as rolled back
+  if (s.ok()) {
+    for (const auto& seq : unprep_seqs_) {
+      wpt_db_->RemovePrepared(seq.first, seq.second);
+    }
+  }
+
+  unprep_seqs_.clear();
+  flushed_save_points_.reset(nullptr);
+  unflushed_save_points_.reset(nullptr);
+  return s;
+}
+
+void WriteUnpreparedTxn::Clear() {
+  if (!recovered_txn_) {
+    txn_db_impl_->UnLock(this, *tracked_locks_);
+  }
+  unprep_seqs_.clear();
+  flushed_save_points_.reset(nullptr);
+  unflushed_save_points_.reset(nullptr);
+  recovered_txn_ = false;
+  largest_validated_seq_ = 0;
+  for (auto& it : active_iterators_) {
+    auto bdit = static_cast<BaseDeltaIterator*>(it);
+    bdit->Invalidate(Status::InvalidArgument(
+        "Cannot use iterator after transaction has finished"));
+  }
+  active_iterators_.clear();
+  untracked_keys_.clear();
+  TransactionBaseImpl::Clear();
+}
+
+void WriteUnpreparedTxn::SetSavePoint() {
+  assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+             (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+         (save_points_ ? save_points_->size() : 0));
+  PessimisticTransaction::SetSavePoint();
+  if (unflushed_save_points_ == nullptr) {
+    unflushed_save_points_.reset(new autovector<size_t>());
+  }
+  unflushed_save_points_->push_back(write_batch_.GetDataSize());
+}
+
+Status WriteUnpreparedTxn::RollbackToSavePoint() {
+  assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+             (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+         (save_points_ ? save_points_->size() : 0));
+  if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) {
+    Status s = PessimisticTransaction::RollbackToSavePoint();
+    assert(!s.IsNotFound());
+    unflushed_save_points_->pop_back();
+    return s;
+  }
+
+  if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) {
+    return RollbackToSavePointInternal();
+  }
+
+  return Status::NotFound();
+}
+
+Status WriteUnpreparedTxn::RollbackToSavePointInternal() {
+  Status s;
+
+  const bool kClear = true;
+  TransactionBaseImpl::InitWriteBatch(kClear);
+
+  assert(flushed_save_points_->size() > 0);
+  WriteUnpreparedTxn::SavePoint& top = flushed_save_points_->back();
+
+  assert(save_points_ != nullptr && save_points_->size() > 0);
+  const LockTracker& tracked_keys = *save_points_->top().new_locks_;
+
+  ReadOptions roptions;
+  roptions.snapshot = top.snapshot_->snapshot();
+  SequenceNumber min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(roptions.snapshot)
+          ->min_uncommitted_;
+  SequenceNumber snap_seq = roptions.snapshot->GetSequenceNumber();
+  WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+                                          top.unprep_seqs_,
+                                          kBackedByDBSnapshot);
+  s = WriteRollbackKeys(tracked_keys, &write_batch_, &callback, roptions);
+  if (!s.ok()) {
+    return s;
+  }
+
+  const bool kPrepared = true;
+  s = FlushWriteBatchToDBInternal(!kPrepared);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // PessimisticTransaction::RollbackToSavePoint will call also call
+  // RollbackToSavepoint on write_batch_. However, write_batch_ is empty and has
+  // no savepoints because this savepoint has already been flushed. Work around
+  // this by setting a fake savepoint.
+  write_batch_.SetSavePoint();
+  s = PessimisticTransaction::RollbackToSavePoint();
+  assert(s.ok());
+  if (!s.ok()) {
+    return s;
+  }
+
+  flushed_save_points_->pop_back();
+  return s;
+}
+
+Status WriteUnpreparedTxn::PopSavePoint() {
+  assert((unflushed_save_points_ ? unflushed_save_points_->size() : 0) +
+             (flushed_save_points_ ? flushed_save_points_->size() : 0) ==
+         (save_points_ ? save_points_->size() : 0));
+  if (unflushed_save_points_ != nullptr && unflushed_save_points_->size() > 0) {
+    Status s = PessimisticTransaction::PopSavePoint();
+    assert(!s.IsNotFound());
+    unflushed_save_points_->pop_back();
+    return s;
+  }
+
+  if (flushed_save_points_ != nullptr && !flushed_save_points_->empty()) {
+    // PessimisticTransaction::PopSavePoint will call also call PopSavePoint on
+    // write_batch_. However, write_batch_ is empty and has no savepoints
+    // because this savepoint has already been flushed. Work around this by
+    // setting a fake savepoint.
+    write_batch_.SetSavePoint();
+    Status s = PessimisticTransaction::PopSavePoint();
+    assert(!s.IsNotFound());
+    flushed_save_points_->pop_back();
+    return s;
+  }
+
+  return Status::NotFound();
+}
+
+void WriteUnpreparedTxn::MultiGet(const ReadOptions& options,
+                                  ColumnFamilyHandle* column_family,
+                                  const size_t num_keys, const Slice* keys,
+                                  PinnableSlice* values, Status* statuses,
+                                  const bool sorted_input) {
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+                                          unprep_seqs_, backed_by_snapshot);
+  write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys,
+                                      keys, values, statuses, sorted_input,
+                                      &callback);
+  if (UNLIKELY(!callback.valid() ||
+               !wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+    wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+    for (size_t i = 0; i < num_keys; i++) {
+      statuses[i] = Status::TryAgain();
+    }
+  }
+}
+
+Status WriteUnpreparedTxn::Get(const ReadOptions& options,
+                               ColumnFamilyHandle* column_family,
+                               const Slice& key, PinnableSlice* value) {
+  SequenceNumber min_uncommitted, snap_seq;
+  const SnapshotBackup backed_by_snapshot =
+      wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq);
+  WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted,
+                                          unprep_seqs_, backed_by_snapshot);
+  auto res = write_batch_.GetFromBatchAndDB(db_, options, column_family, key,
+                                            value, &callback);
+  if (LIKELY(callback.valid() &&
+             wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) {
+    return res;
+  } else {
+    res.PermitUncheckedError();
+    wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN);
+    return Status::TryAgain();
+  }
+}
+
+namespace {
+static void CleanupWriteUnpreparedWBWIIterator(void* arg1, void* arg2) {
+  auto txn = reinterpret_cast<WriteUnpreparedTxn*>(arg1);
+  auto iter = reinterpret_cast<Iterator*>(arg2);
+  txn->RemoveActiveIterator(iter);
+}
+}  // anonymous namespace
+
+Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options) {
+  return GetIterator(options, wupt_db_->DefaultColumnFamily());
+}
+
+Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options,
+                                          ColumnFamilyHandle* column_family) {
+  // Make sure to get iterator from WriteUnprepareTxnDB, not the root db.
+  Iterator* db_iter = wupt_db_->NewIterator(options, column_family, this);
+  assert(db_iter);
+
+  auto iter = write_batch_.NewIteratorWithBase(column_family, db_iter);
+  active_iterators_.push_back(iter);
+  iter->RegisterCleanup(CleanupWriteUnpreparedWBWIIterator, this, iter);
+  return iter;
+}
+
+Status WriteUnpreparedTxn::ValidateSnapshot(ColumnFamilyHandle* column_family,
+                                            const Slice& key,
+                                            SequenceNumber* tracked_at_seq) {
+  // TODO(lth): Reduce duplicate code with WritePrepared ValidateSnapshot logic.
+  assert(snapshot_);
+
+  SequenceNumber min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(snapshot_.get())
+          ->min_uncommitted_;
+  SequenceNumber snap_seq = snapshot_->GetSequenceNumber();
+  // tracked_at_seq is either max or the last snapshot with which this key was
+  // trackeed so there is no need to apply the IsInSnapshot to this comparison
+  // here as tracked_at_seq is not a prepare seq.
+  if (*tracked_at_seq <= snap_seq) {
+    // If the key has been previous validated at a sequence number earlier
+    // than the curent snapshot's sequence number, we already know it has not
+    // been modified.
+    return Status::OK();
+  }
+
+  *tracked_at_seq = snap_seq;
+
+  ColumnFamilyHandle* cfh =
+      column_family ? column_family : db_impl_->DefaultColumnFamily();
+
+  WriteUnpreparedTxnReadCallback snap_checker(
+      wupt_db_, snap_seq, min_uncommitted, unprep_seqs_, kBackedByDBSnapshot);
+  // TODO(yanqin): Support user-defined timestamp.
+  return TransactionUtil::CheckKeyForConflicts(
+      db_impl_, cfh, key.ToString(), snap_seq, /*ts=*/nullptr,
+      false /* cache_only */, &snap_checker, min_uncommitted);
+}
+
+const std::map<SequenceNumber, size_t>&
+WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() {
+  return unprep_seqs_;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn.h b/src/rocksdb/utilities/transactions/write_unprepared_txn.h
new file mode 100644
index 000000000..5a3227f4e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn.h
@@ -0,0 +1,341 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <set>
+
+#include "utilities/transactions/write_prepared_txn.h"
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTxnDB;
+class WriteUnpreparedTxn;
+
+// WriteUnprepared transactions needs to be able to read their own uncommitted
+// writes, and supporting this requires some careful consideration. Because
+// writes in the current transaction may be flushed to DB already, we cannot
+// rely on the contents of WriteBatchWithIndex to determine whether a key should
+// be visible or not, so we have to remember to check the DB for any uncommitted
+// keys that should be visible to us. First, we will need to change the seek to
+// snapshot logic, to seek to max_visible_seq = max(snap_seq, max_unprep_seq).
+// Any key greater than max_visible_seq should not be visible because they
+// cannot be unprepared by the current transaction and they are not in its
+// snapshot.
+//
+// When we seek to max_visible_seq, one of these cases will happen:
+// 1. We hit a unprepared key from the current transaction.
+// 2. We hit a unprepared key from the another transaction.
+// 3. We hit a committed key with snap_seq < seq < max_unprep_seq.
+// 4. We hit a committed key with seq <= snap_seq.
+//
+// IsVisibleFullCheck handles all cases correctly.
+//
+// Other notes:
+// Note that max_visible_seq is only calculated once at iterator construction
+// time, meaning if the same transaction is adding more unprep seqs through
+// writes during iteration, these newer writes may not be visible. This is not a
+// problem for MySQL though because it avoids modifying the index as it is
+// scanning through it to avoid the Halloween Problem. Instead, it scans the
+// index once up front, and modifies based on a temporary copy.
+//
+// In DBIter, there is a "reseek" optimization if the iterator skips over too
+// many keys. However, this assumes that the reseek seeks exactly to the
+// required key. In write unprepared, even after seeking directly to
+// max_visible_seq, some iteration may be required before hitting a visible key,
+// and special precautions must be taken to avoid performing another reseek,
+// leading to an infinite loop.
+//
+class WriteUnpreparedTxnReadCallback : public ReadCallback {
+ public:
+  WriteUnpreparedTxnReadCallback(
+      WritePreparedTxnDB* db, SequenceNumber snapshot,
+      SequenceNumber min_uncommitted,
+      const std::map<SequenceNumber, size_t>& unprep_seqs,
+      SnapshotBackup backed_by_snapshot)
+      // Pass our last uncommitted seq as the snapshot to the parent class to
+      // ensure that the parent will not prematurely filter out own writes. We
+      // will do the exact comparison against snapshots in IsVisibleFullCheck
+      // override.
+      : ReadCallback(CalcMaxVisibleSeq(unprep_seqs, snapshot), min_uncommitted),
+        db_(db),
+        unprep_seqs_(unprep_seqs),
+        wup_snapshot_(snapshot),
+        backed_by_snapshot_(backed_by_snapshot) {
+    (void)backed_by_snapshot_;  // to silence unused private field warning
+  }
+
+  virtual ~WriteUnpreparedTxnReadCallback() {
+    // If it is not backed by snapshot, the caller must check validity
+    assert(valid_checked_ || backed_by_snapshot_ == kBackedByDBSnapshot);
+  }
+
+  virtual bool IsVisibleFullCheck(SequenceNumber seq) override;
+
+  inline bool valid() {
+    valid_checked_ = true;
+    return snap_released_ == false;
+  }
+
+  void Refresh(SequenceNumber seq) override {
+    max_visible_seq_ = std::max(max_visible_seq_, seq);
+    wup_snapshot_ = seq;
+  }
+
+  static SequenceNumber CalcMaxVisibleSeq(
+      const std::map<SequenceNumber, size_t>& unprep_seqs,
+      SequenceNumber snapshot_seq) {
+    SequenceNumber max_unprepared = 0;
+    if (unprep_seqs.size()) {
+      max_unprepared =
+          unprep_seqs.rbegin()->first + unprep_seqs.rbegin()->second - 1;
+    }
+    return std::max(max_unprepared, snapshot_seq);
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  const std::map<SequenceNumber, size_t>& unprep_seqs_;
+  SequenceNumber wup_snapshot_;
+  // Whether max_visible_seq_ is backed by a snapshot
+  const SnapshotBackup backed_by_snapshot_;
+  bool snap_released_ = false;
+  // Safety check to ensure that the caller has checked invalid statuses
+  bool valid_checked_ = false;
+};
+
+class WriteUnpreparedTxn : public WritePreparedTxn {
+ public:
+  WriteUnpreparedTxn(WriteUnpreparedTxnDB* db,
+                     const WriteOptions& write_options,
+                     const TransactionOptions& txn_options);
+
+  virtual ~WriteUnpreparedTxn();
+
+  using TransactionBaseImpl::Put;
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value,
+                     const bool assume_tracked = false) override;
+  virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+                     const SliceParts& value,
+                     const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::Merge;
+  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value,
+                       const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::Delete;
+  virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key,
+                        const bool assume_tracked = false) override;
+  virtual Status Delete(ColumnFamilyHandle* column_family,
+                        const SliceParts& key,
+                        const bool assume_tracked = false) override;
+
+  using TransactionBaseImpl::SingleDelete;
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const Slice& key,
+                              const bool assume_tracked = false) override;
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const SliceParts& key,
+                              const bool assume_tracked = false) override;
+
+  // In WriteUnprepared, untracked writes will break snapshot validation logic.
+  // Snapshot validation will only check the largest sequence number of a key to
+  // see if it was committed or not. However, an untracked unprepared write will
+  // hide smaller committed sequence numbers.
+  //
+  // TODO(lth): Investigate whether it is worth having snapshot validation
+  // validate all values larger than snap_seq. Otherwise, we should return
+  // Status::NotSupported for untracked writes.
+
+  virtual Status RebuildFromWriteBatch(WriteBatch*) override;
+
+  virtual uint64_t GetLastLogNumber() const override {
+    return last_log_number_;
+  }
+
+  void RemoveActiveIterator(Iterator* iter) {
+    active_iterators_.erase(
+        std::remove(active_iterators_.begin(), active_iterators_.end(), iter),
+        active_iterators_.end());
+  }
+
+ protected:
+  void Initialize(const TransactionOptions& txn_options) override;
+
+  Status PrepareInternal() override;
+
+  Status CommitWithoutPrepareInternal() override;
+  Status CommitInternal() override;
+
+  Status RollbackInternal() override;
+
+  void Clear() override;
+
+  void SetSavePoint() override;
+  Status RollbackToSavePoint() override;
+  Status PopSavePoint() override;
+
+  // Get and GetIterator needs to be overridden so that a ReadCallback to
+  // handle read-your-own-write is used.
+  using Transaction::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  using Transaction::MultiGet;
+  virtual void MultiGet(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const size_t num_keys, const Slice* keys,
+                        PinnableSlice* values, Status* statuses,
+                        const bool sorted_input = false) override;
+
+  using Transaction::GetIterator;
+  virtual Iterator* GetIterator(const ReadOptions& options) override;
+  virtual Iterator* GetIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override;
+
+  virtual Status ValidateSnapshot(ColumnFamilyHandle* column_family,
+                                  const Slice& key,
+                                  SequenceNumber* tracked_at_seq) override;
+
+ private:
+  friend class WriteUnpreparedTransactionTest_ReadYourOwnWrite_Test;
+  friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+  friend class WriteUnpreparedTransactionTest_UnpreparedBatch_Test;
+  friend class WriteUnpreparedTxnDB;
+
+  const std::map<SequenceNumber, size_t>& GetUnpreparedSequenceNumbers();
+  Status WriteRollbackKeys(const LockTracker& tracked_keys,
+                           WriteBatchWithIndex* rollback_batch,
+                           ReadCallback* callback, const ReadOptions& roptions);
+
+  Status MaybeFlushWriteBatchToDB();
+  Status FlushWriteBatchToDB(bool prepared);
+  Status FlushWriteBatchToDBInternal(bool prepared);
+  Status FlushWriteBatchWithSavePointToDB();
+  Status RollbackToSavePointInternal();
+  Status HandleWrite(std::function<Status()> do_write);
+
+  // For write unprepared, we check on every writebatch append to see if
+  // write_batch_flush_threshold_ has been exceeded, and then call
+  // FlushWriteBatchToDB if so. This logic is encapsulated in
+  // MaybeFlushWriteBatchToDB.
+  int64_t write_batch_flush_threshold_;
+  WriteUnpreparedTxnDB* wupt_db_;
+
+  // Ordered list of unprep_seq sequence numbers that we have already written
+  // to DB.
+  //
+  // This maps unprep_seq => prepare_batch_cnt for each unprepared batch
+  // written by this transaction.
+  //
+  // Note that this contains both prepared and unprepared batches, since they
+  // are treated similarily in prepare heap/commit map, so it simplifies the
+  // commit callbacks.
+  std::map<SequenceNumber, size_t> unprep_seqs_;
+
+  uint64_t last_log_number_;
+
+  // Recovered transactions have tracked_keys_ populated, but are not actually
+  // locked for efficiency reasons. For recovered transactions, skip unlocking
+  // keys when transaction ends.
+  bool recovered_txn_;
+
+  // Track the largest sequence number at which we performed snapshot
+  // validation. If snapshot validation was skipped because no snapshot was set,
+  // then this is set to GetLastPublishedSequence. This value is useful because
+  // it means that for keys that have unprepared seqnos, we can guarantee that
+  // no committed keys by other transactions can exist between
+  // largest_validated_seq_ and max_unprep_seq. See
+  // WriteUnpreparedTxnDB::NewIterator for an explanation for why this is
+  // necessary for iterator Prev().
+  //
+  // Currently this value only increases during the lifetime of a transaction,
+  // but in some cases, we should be able to restore the previously largest
+  // value when calling RollbackToSavepoint.
+  SequenceNumber largest_validated_seq_;
+
+  struct SavePoint {
+    // Record of unprep_seqs_ at this savepoint. The set of unprep_seq is
+    // used during RollbackToSavepoint to determine visibility when restoring
+    // old values.
+    //
+    // TODO(lth): Since all unprep_seqs_ sets further down the stack must be
+    // subsets, this can potentially be deduplicated by just storing set
+    // difference. Investigate if this is worth it.
+    std::map<SequenceNumber, size_t> unprep_seqs_;
+
+    // This snapshot will be used to read keys at this savepoint if we call
+    // RollbackToSavePoint.
+    std::unique_ptr<ManagedSnapshot> snapshot_;
+
+    SavePoint(const std::map<SequenceNumber, size_t>& seqs,
+              ManagedSnapshot* snapshot)
+        : unprep_seqs_(seqs), snapshot_(snapshot){};
+  };
+
+  // We have 3 data structures holding savepoint information:
+  // 1. TransactionBaseImpl::save_points_
+  // 2. WriteUnpreparedTxn::flushed_save_points_
+  // 3. WriteUnpreparecTxn::unflushed_save_points_
+  //
+  // TransactionBaseImpl::save_points_ holds information about all write
+  // batches, including the current in-memory write_batch_, or unprepared
+  // batches that have been written out. Its responsibility is just to track
+  // which keys have been modified in every savepoint.
+  //
+  // WriteUnpreparedTxn::flushed_save_points_ holds information about savepoints
+  // set on unprepared batches that have already flushed. It holds the snapshot
+  // and unprep_seqs at that savepoint, so that the rollback process can
+  // determine which keys were visible at that point in time.
+  //
+  // WriteUnpreparecTxn::unflushed_save_points_ holds information about
+  // savepoints on the current in-memory write_batch_. It simply records the
+  // size of the write batch at every savepoint.
+  //
+  // TODO(lth): Remove the redundancy between save_point_boundaries_ and
+  // write_batch_.save_points_.
+  //
+  // Based on this information, here are some invariants:
+  // size(unflushed_save_points_) = size(write_batch_.save_points_)
+  // size(flushed_save_points_) + size(unflushed_save_points_)
+  //   = size(save_points_)
+  //
+  std::unique_ptr<autovector<WriteUnpreparedTxn::SavePoint>>
+      flushed_save_points_;
+  std::unique_ptr<autovector<size_t>> unflushed_save_points_;
+
+  // It is currently unsafe to flush a write batch if there are active iterators
+  // created from this transaction. This is because we use WriteBatchWithIndex
+  // to do merging reads from the DB and the write batch. If we flush the write
+  // batch, it is possible that the delta iterator on the iterator will point to
+  // invalid memory.
+  std::vector<Iterator*> active_iterators_;
+
+  // Untracked keys that we have to rollback.
+  //
+  // TODO(lth): Currently we we do not record untracked keys per-savepoint.
+  // This means that when rolling back to savepoints, we have to check all
+  // keys in the current transaction for rollback. Note that this is only
+  // inefficient, but still correct because we take a snapshot at every
+  // savepoint, and we will use that snapshot to construct the rollback batch.
+  // The rollback batch will then contain a reissue of the same marker.
+  //
+  // A more optimal solution would be to only check keys changed since the
+  // last savepoint. Also, it may make sense to merge this into tracked_keys_
+  // and differentiate between tracked but not locked keys to avoid having two
+  // very similar data structures.
+  using KeySet = std::unordered_map<uint32_t, std::vector<std::string>>;
+  KeySet untracked_keys_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc
new file mode 100644
index 000000000..2ed2d5c59
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.cc
@@ -0,0 +1,473 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_unprepared_txn_db.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Instead of reconstructing a Transaction object, and calling rollback on it,
+// we can be more efficient with RollbackRecoveredTransaction by skipping
+// unnecessary steps (eg. updating CommitMap, reconstructing keyset)
+Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction(
+    const DBImpl::RecoveredTransaction* rtxn) {
+  // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
+  assert(rtxn->unprepared_);
+  auto cf_map_shared_ptr = WritePreparedTxnDB::GetCFHandleMap();
+  auto cf_comp_map_shared_ptr = WritePreparedTxnDB::GetCFComparatorMap();
+  // In theory we could write with disableWAL = true during recovery, and
+  // assume that if we crash again during recovery, we can just replay from
+  // the very beginning. Unfortunately, the XIDs from the application may not
+  // necessarily be unique across restarts, potentially leading to situations
+  // like this:
+  //
+  // BEGIN_PREPARE(unprepared) Put(a) END_PREPARE(xid = 1)
+  // -- crash and recover with Put(a) rolled back as it was not prepared
+  // BEGIN_PREPARE(prepared) Put(b) END_PREPARE(xid = 1)
+  // COMMIT(xid = 1)
+  // -- crash and recover with both a, b
+  //
+  // We could just write the rollback marker, but then we would have to extend
+  // MemTableInserter during recovery to actually do writes into the DB
+  // instead of just dropping the in-memory write batch.
+  //
+  WriteOptions w_options;
+
+  class InvalidSnapshotReadCallback : public ReadCallback {
+   public:
+    InvalidSnapshotReadCallback(SequenceNumber snapshot)
+        : ReadCallback(snapshot) {}
+
+    inline bool IsVisibleFullCheck(SequenceNumber) override {
+      // The seq provided as snapshot is the seq right before we have locked and
+      // wrote to it, so whatever is there, it is committed.
+      return true;
+    }
+
+    // Ignore the refresh request since we are confident that our snapshot seq
+    // is not going to be affected by concurrent compactions (not enabled yet.)
+    void Refresh(SequenceNumber) override {}
+  };
+
+  // Iterate starting with largest sequence number.
+  for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); ++it) {
+    auto last_visible_txn = it->first - 1;
+    const auto& batch = it->second.batch_;
+    WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+                              w_options.protection_bytes_per_key,
+                              0 /* default_cf_ts_sz */);
+
+    struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
+      DBImpl* db_;
+      ReadOptions roptions;
+      InvalidSnapshotReadCallback callback;
+      WriteBatch* rollback_batch_;
+      std::map<uint32_t, const Comparator*>& comparators_;
+      std::map<uint32_t, ColumnFamilyHandle*>& handles_;
+      using CFKeys = std::set<Slice, SetComparator>;
+      std::map<uint32_t, CFKeys> keys_;
+      bool rollback_merge_operands_;
+      RollbackWriteBatchBuilder(
+          DBImpl* db, SequenceNumber snap_seq, WriteBatch* dst_batch,
+          std::map<uint32_t, const Comparator*>& comparators,
+          std::map<uint32_t, ColumnFamilyHandle*>& handles,
+          bool rollback_merge_operands)
+          : db_(db),
+            callback(snap_seq),
+            // disable min_uncommitted optimization
+            rollback_batch_(dst_batch),
+            comparators_(comparators),
+            handles_(handles),
+            rollback_merge_operands_(rollback_merge_operands) {}
+
+      Status Rollback(uint32_t cf, const Slice& key) {
+        Status s;
+        CFKeys& cf_keys = keys_[cf];
+        if (cf_keys.size() == 0) {  // just inserted
+          auto cmp = comparators_[cf];
+          keys_[cf] = CFKeys(SetComparator(cmp));
+        }
+        auto res = cf_keys.insert(key);
+        if (res.second ==
+            false) {  // second is false if a element already existed.
+          return s;
+        }
+
+        PinnableSlice pinnable_val;
+        bool not_used;
+        auto cf_handle = handles_[cf];
+        DBImpl::GetImplOptions get_impl_options;
+        get_impl_options.column_family = cf_handle;
+        get_impl_options.value = &pinnable_val;
+        get_impl_options.value_found = &not_used;
+        get_impl_options.callback = &callback;
+        s = db_->GetImpl(roptions, key, get_impl_options);
+        assert(s.ok() || s.IsNotFound());
+        if (s.ok()) {
+          s = rollback_batch_->Put(cf_handle, key, pinnable_val);
+          assert(s.ok());
+        } else if (s.IsNotFound()) {
+          // There has been no readable value before txn. By adding a delete we
+          // make sure that there will be none afterwards either.
+          s = rollback_batch_->Delete(cf_handle, key);
+          assert(s.ok());
+        } else {
+          // Unexpected status. Return it to the user.
+        }
+        return s;
+      }
+
+      Status PutCF(uint32_t cf, const Slice& key,
+                   const Slice& /*val*/) override {
+        return Rollback(cf, key);
+      }
+
+      Status DeleteCF(uint32_t cf, const Slice& key) override {
+        return Rollback(cf, key);
+      }
+
+      Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+        return Rollback(cf, key);
+      }
+
+      Status MergeCF(uint32_t cf, const Slice& key,
+                     const Slice& /*val*/) override {
+        if (rollback_merge_operands_) {
+          return Rollback(cf, key);
+        } else {
+          return Status::OK();
+        }
+      }
+
+      // Recovered batches do not contain 2PC markers.
+      Status MarkNoop(bool) override { return Status::InvalidArgument(); }
+      Status MarkBeginPrepare(bool) override {
+        return Status::InvalidArgument();
+      }
+      Status MarkEndPrepare(const Slice&) override {
+        return Status::InvalidArgument();
+      }
+      Status MarkCommit(const Slice&) override {
+        return Status::InvalidArgument();
+      }
+      Status MarkRollback(const Slice&) override {
+        return Status::InvalidArgument();
+      }
+    } rollback_handler(db_impl_, last_visible_txn, &rollback_batch,
+                       *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
+                       txn_db_options_.rollback_merge_operands);
+
+    auto s = batch->Iterate(&rollback_handler);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // The Rollback marker will be used as a batch separator
+    s = WriteBatchInternal::MarkRollback(&rollback_batch, rtxn->name_);
+    if (!s.ok()) {
+      return s;
+    }
+
+    const uint64_t kNoLogRef = 0;
+    const bool kDisableMemtable = true;
+    const size_t kOneBatch = 1;
+    uint64_t seq_used = kMaxSequenceNumber;
+    s = db_impl_->WriteImpl(w_options, &rollback_batch, nullptr, nullptr,
+                            kNoLogRef, !kDisableMemtable, &seq_used, kOneBatch);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // If two_write_queues, we must manually release the sequence number to
+    // readers.
+    if (db_impl_->immutable_db_options().two_write_queues) {
+      db_impl_->SetLastPublishedSequence(seq_used);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status WriteUnpreparedTxnDB::Initialize(
+    const std::vector<size_t>& compaction_enabled_cf_indices,
+    const std::vector<ColumnFamilyHandle*>& handles) {
+  // TODO(lth): Reduce code duplication in this function.
+  auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
+  assert(dbimpl != nullptr);
+
+  db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));
+  // A callback to commit a single sub-batch
+  class CommitSubBatchPreReleaseCallback : public PreReleaseCallback {
+   public:
+    explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db)
+        : db_(db) {}
+    Status Callback(SequenceNumber commit_seq,
+                    bool is_mem_disabled __attribute__((__unused__)), uint64_t,
+                    size_t /*index*/, size_t /*total*/) override {
+      assert(!is_mem_disabled);
+      db_->AddCommitted(commit_seq, commit_seq);
+      return Status::OK();
+    }
+
+   private:
+    WritePreparedTxnDB* db_;
+  };
+  db_impl_->SetRecoverableStatePreReleaseCallback(
+      new CommitSubBatchPreReleaseCallback(this));
+
+  // PessimisticTransactionDB::Initialize
+  for (auto cf_ptr : handles) {
+    AddColumnFamily(cf_ptr);
+  }
+  // Verify cf options
+  for (auto handle : handles) {
+    ColumnFamilyDescriptor cfd;
+    Status s = handle->GetDescriptor(&cfd);
+    if (!s.ok()) {
+      return s;
+    }
+    s = VerifyCFOptions(cfd.options);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Re-enable compaction for the column families that initially had
+  // compaction enabled.
+  std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
+  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
+  for (auto index : compaction_enabled_cf_indices) {
+    compaction_enabled_cf_handles.push_back(handles[index]);
+  }
+
+  // create 'real' transactions from recovered shell transactions
+  auto rtxns = dbimpl->recovered_transactions();
+  std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt;
+  for (auto rtxn : rtxns) {
+    auto recovered_trx = rtxn.second;
+    assert(recovered_trx);
+    assert(recovered_trx->batches_.size() >= 1);
+    assert(recovered_trx->name_.length());
+
+    // We can only rollback transactions after AdvanceMaxEvictedSeq is called,
+    // but AddPrepared must occur before AdvanceMaxEvictedSeq, which is why
+    // two iterations is required.
+    if (recovered_trx->unprepared_) {
+      continue;
+    }
+
+    WriteOptions w_options;
+    w_options.sync = true;
+    TransactionOptions t_options;
+
+    auto first_log_number = recovered_trx->batches_.begin()->second.log_number_;
+    auto first_seq = recovered_trx->batches_.begin()->first;
+    auto last_prepare_batch_cnt =
+        recovered_trx->batches_.begin()->second.batch_cnt_;
+
+    Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
+    assert(real_trx);
+    auto wupt = static_cast_with_check<WriteUnpreparedTxn>(real_trx);
+    wupt->recovered_txn_ = true;
+
+    real_trx->SetLogNumber(first_log_number);
+    real_trx->SetId(first_seq);
+    Status s = real_trx->SetName(recovered_trx->name_);
+    if (!s.ok()) {
+      return s;
+    }
+    wupt->prepare_batch_cnt_ = last_prepare_batch_cnt;
+
+    for (auto batch : recovered_trx->batches_) {
+      const auto& seq = batch.first;
+      const auto& batch_info = batch.second;
+      auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
+      assert(batch_info.log_number_);
+
+      ordered_seq_cnt[seq] = cnt;
+      assert(wupt->unprep_seqs_.count(seq) == 0);
+      wupt->unprep_seqs_[seq] = cnt;
+
+      s = wupt->RebuildFromWriteBatch(batch_info.batch_);
+      assert(s.ok());
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    const bool kClear = true;
+    wupt->InitWriteBatch(kClear);
+
+    real_trx->SetState(Transaction::PREPARED);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  // AddPrepared must be called in order
+  for (auto seq_cnt : ordered_seq_cnt) {
+    auto seq = seq_cnt.first;
+    auto cnt = seq_cnt.second;
+    for (size_t i = 0; i < cnt; i++) {
+      AddPrepared(seq + i);
+    }
+  }
+
+  SequenceNumber prev_max = max_evicted_seq_;
+  SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
+  AdvanceMaxEvictedSeq(prev_max, last_seq);
+  // Create a gap between max and the next snapshot. This simplifies the logic
+  // in IsInSnapshot by not having to consider the special case of max ==
+  // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest.
+  if (last_seq) {
+    db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1);
+    db_impl_->versions_->SetLastSequence(last_seq + 1);
+    db_impl_->versions_->SetLastPublishedSequence(last_seq + 1);
+  }
+
+  Status s;
+  // Rollback unprepared transactions.
+  for (auto rtxn : rtxns) {
+    auto recovered_trx = rtxn.second;
+    if (recovered_trx->unprepared_) {
+      s = RollbackRecoveredTransaction(recovered_trx);
+      if (!s.ok()) {
+        return s;
+      }
+      continue;
+    }
+  }
+
+  if (s.ok()) {
+    dbimpl->DeleteAllRecoveredTransactions();
+
+    // Compaction should start only after max_evicted_seq_ is set AND recovered
+    // transactions are either added to PrepareHeap or rolled back.
+    s = EnableAutoCompaction(compaction_enabled_cf_handles);
+  }
+
+  return s;
+}
+
+Transaction* WriteUnpreparedTxnDB::BeginTransaction(
+    const WriteOptions& write_options, const TransactionOptions& txn_options,
+    Transaction* old_txn) {
+  if (old_txn != nullptr) {
+    ReinitializeTransaction(old_txn, write_options, txn_options);
+    return old_txn;
+  } else {
+    return new WriteUnpreparedTxn(this, write_options, txn_options);
+  }
+}
+
+// Struct to hold ownership of snapshot and read callback for iterator cleanup.
+struct WriteUnpreparedTxnDB::IteratorState {
+  IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
+                std::shared_ptr<ManagedSnapshot> s,
+                SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn)
+      : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_,
+                 kBackedByDBSnapshot),
+        snapshot(s) {}
+  SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); }
+
+  WriteUnpreparedTxnReadCallback callback;
+  std::shared_ptr<ManagedSnapshot> snapshot;
+};
+
+namespace {
+static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) {
+  delete reinterpret_cast<WriteUnpreparedTxnDB::IteratorState*>(arg1);
+}
+}  // anonymous namespace
+
+Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options,
+                                            ColumnFamilyHandle* column_family,
+                                            WriteUnpreparedTxn* txn) {
+  // TODO(lth): Refactor so that this logic is shared with WritePrepared.
+  constexpr bool expose_blob_index = false;
+  constexpr bool allow_refresh = false;
+  std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
+  SequenceNumber snapshot_seq = kMaxSequenceNumber;
+  SequenceNumber min_uncommitted = 0;
+
+  // Currently, the Prev() iterator logic does not work well without snapshot
+  // validation. The logic simply iterates through values of a key in
+  // ascending seqno order, stopping at the first non-visible value and
+  // returning the last visible value.
+  //
+  // For example, if snapshot sequence is 3, and we have the following keys:
+  // foo: v1 1
+  // foo: v2 2
+  // foo: v3 3
+  // foo: v4 4
+  // foo: v5 5
+  //
+  // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3,
+  // which is the last visible value.
+  //
+  // For unprepared transactions, if we have snap_seq = 3, but the current
+  // transaction has unprep_seq 5, then returning the first non-visible value
+  // would be incorrect, as we should return v5, and not v3. The problem is that
+  // there are committed values at snapshot_seq < commit_seq < unprep_seq.
+  //
+  // Snapshot validation can prevent this problem by ensuring that no committed
+  // values exist at snapshot_seq < commit_seq, and thus any value with a
+  // sequence number greater than snapshot_seq must be unprepared values. For
+  // example, if the transaction had a snapshot at 3, then snapshot validation
+  // would be performed during the Put(v5) call. It would find v4, and the Put
+  // would fail with snapshot validation failure.
+  //
+  // TODO(lth): Improve Prev() logic to continue iterating until
+  // max_visible_seq, and then return the last visible value, so that this
+  // restriction can be lifted.
+  const Snapshot* snapshot = nullptr;
+  if (options.snapshot == nullptr) {
+    snapshot = GetSnapshot();
+    own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
+  } else {
+    snapshot = options.snapshot;
+  }
+
+  snapshot_seq = snapshot->GetSequenceNumber();
+  assert(snapshot_seq != kMaxSequenceNumber);
+  // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We are
+  // guaranteed that for keys that were modified by this transaction (and thus
+  // might have unprepared values), no committed values exist at
+  // largest_validated_seq < commit_seq (or the contrapositive: any committed
+  // value must exist at commit_seq <= largest_validated_seq). This implies
+  // that commit_seq <= largest_validated_seq <= snapshot_seq or commit_seq <=
+  // snapshot_seq. As explained above, the problem with Prev() only happens when
+  // snapshot_seq < commit_seq.
+  //
+  // For keys that were not modified by this transaction, largest_validated_seq_
+  // is meaningless, and Prev() should just work with the existing visibility
+  // logic.
+  if (txn->largest_validated_seq_ > snapshot->GetSequenceNumber() &&
+      !txn->unprep_seqs_.empty()) {
+    ROCKS_LOG_ERROR(info_log_,
+                    "WriteUnprepared iterator creation failed since the "
+                    "transaction has performed unvalidated writes");
+    return nullptr;
+  }
+  min_uncommitted =
+      static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;
+
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  auto* state =
+      new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn);
+  auto* db_iter = db_impl_->NewIteratorImpl(
+      options, cfd, state->MaxVisibleSeq(), &state->callback, expose_blob_index,
+      allow_refresh);
+  db_iter->RegisterCleanup(CleanupWriteUnpreparedTxnDBIterator, state, nullptr);
+  return db_iter;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h
new file mode 100644
index 000000000..c40e96d49
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/write_unprepared_txn_db.h
@@ -0,0 +1,108 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/write_prepared_txn_db.h"
+#include "utilities/transactions/write_unprepared_txn.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteUnpreparedTxn;
+
+class WriteUnpreparedTxnDB : public WritePreparedTxnDB {
+ public:
+  using WritePreparedTxnDB::WritePreparedTxnDB;
+
+  Status Initialize(const std::vector<size_t>& compaction_enabled_cf_indices,
+                    const std::vector<ColumnFamilyHandle*>& handles) override;
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const TransactionOptions& txn_options,
+                                Transaction* old_txn) override;
+
+  // Struct to hold ownership of snapshot and read callback for cleanup.
+  struct IteratorState;
+
+  using WritePreparedTxnDB::NewIterator;
+  Iterator* NewIterator(const ReadOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        WriteUnpreparedTxn* txn);
+
+ private:
+  Status RollbackRecoveredTransaction(const DBImpl::RecoveredTransaction* rtxn);
+};
+
+class WriteUnpreparedCommitEntryPreReleaseCallback : public PreReleaseCallback {
+  // TODO(lth): Reduce code duplication with
+  // WritePreparedCommitEntryPreReleaseCallback
+ public:
+  // includes_data indicates that the commit also writes non-empty
+  // CommitTimeWriteBatch to memtable, which needs to be committed separately.
+  WriteUnpreparedCommitEntryPreReleaseCallback(
+      WritePreparedTxnDB* db, DBImpl* db_impl,
+      const std::map<SequenceNumber, size_t>& unprep_seqs,
+      size_t data_batch_cnt = 0, bool publish_seq = true)
+      : db_(db),
+        db_impl_(db_impl),
+        unprep_seqs_(unprep_seqs),
+        data_batch_cnt_(data_batch_cnt),
+        includes_data_(data_batch_cnt_ > 0),
+        publish_seq_(publish_seq) {
+    assert(unprep_seqs.size() > 0);
+  }
+
+  virtual Status Callback(SequenceNumber commit_seq,
+                          bool is_mem_disabled __attribute__((__unused__)),
+                          uint64_t, size_t /*index*/,
+                          size_t /*total*/) override {
+    const uint64_t last_commit_seq = LIKELY(data_batch_cnt_ <= 1)
+                                         ? commit_seq
+                                         : commit_seq + data_batch_cnt_ - 1;
+    // Recall that unprep_seqs maps (un)prepared_seq => prepare_batch_cnt.
+    for (const auto& s : unprep_seqs_) {
+      for (size_t i = 0; i < s.second; i++) {
+        db_->AddCommitted(s.first + i, last_commit_seq);
+      }
+    }
+
+    if (includes_data_) {
+      assert(data_batch_cnt_);
+      // Commit the data that is accompanied with the commit request
+      for (size_t i = 0; i < data_batch_cnt_; i++) {
+        // For commit seq of each batch use the commit seq of the last batch.
+        // This would make debugging easier by having all the batches having
+        // the same sequence number.
+        db_->AddCommitted(commit_seq + i, last_commit_seq);
+      }
+    }
+    if (db_impl_->immutable_db_options().two_write_queues && publish_seq_) {
+      assert(is_mem_disabled);  // implies the 2nd queue
+      // Publish the sequence number. We can do that here assuming the callback
+      // is invoked only from one write queue, which would guarantee that the
+      // publish sequence numbers will be in order, i.e., once a seq is
+      // published all the seq prior to that are also publishable.
+      db_impl_->SetLastPublishedSequence(last_commit_seq);
+    }
+    // else SequenceNumber that is updated as part of the write already does the
+    // publishing
+    return Status::OK();
+  }
+
+ private:
+  WritePreparedTxnDB* db_;
+  DBImpl* db_impl_;
+  const std::map<SequenceNumber, size_t>& unprep_seqs_;
+  size_t data_batch_cnt_;
+  // Either because it is commit without prepare or it has a
+  // CommitTimeWriteBatch
+  bool includes_data_;
+  // Should the callback also publishes the commit seq number
+  bool publish_seq_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.cc b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
new file mode 100644
index 000000000..6ec9d87b0
--- /dev/null
+++ b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
@@ -0,0 +1,609 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include "utilities/ttl/db_ttl_impl.h"
+
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, OptionTypeInfo> ttl_merge_op_type_info =
+    {{"user_operator",
+      OptionTypeInfo::AsCustomSharedPtr<MergeOperator>(
+          0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}};
+
+TtlMergeOperator::TtlMergeOperator(
+    const std::shared_ptr<MergeOperator>& merge_op, SystemClock* clock)
+    : user_merge_op_(merge_op), clock_(clock) {
+  RegisterOptions("TtlMergeOptions", &user_merge_op_, &ttl_merge_op_type_info);
+}
+
+bool TtlMergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+                                   MergeOperationOutput* merge_out) const {
+  const uint32_t ts_len = DBWithTTLImpl::kTSLength;
+  if (merge_in.existing_value && merge_in.existing_value->size() < ts_len) {
+    ROCKS_LOG_ERROR(merge_in.logger,
+                    "Error: Could not remove timestamp from existing value.");
+    return false;
+  }
+
+  // Extract time-stamp from each operand to be passed to user_merge_op_
+  std::vector<Slice> operands_without_ts;
+  for (const auto& operand : merge_in.operand_list) {
+    if (operand.size() < ts_len) {
+      ROCKS_LOG_ERROR(merge_in.logger,
+                      "Error: Could not remove timestamp from operand value.");
+      return false;
+    }
+    operands_without_ts.push_back(operand);
+    operands_without_ts.back().remove_suffix(ts_len);
+  }
+
+  // Apply the user merge operator (store result in *new_value)
+  bool good = true;
+  MergeOperationOutput user_merge_out(merge_out->new_value,
+                                      merge_out->existing_operand);
+  if (merge_in.existing_value) {
+    Slice existing_value_without_ts(merge_in.existing_value->data(),
+                                    merge_in.existing_value->size() - ts_len);
+    good = user_merge_op_->FullMergeV2(
+        MergeOperationInput(merge_in.key, &existing_value_without_ts,
+                            operands_without_ts, merge_in.logger),
+        &user_merge_out);
+  } else {
+    good = user_merge_op_->FullMergeV2(
+        MergeOperationInput(merge_in.key, nullptr, operands_without_ts,
+                            merge_in.logger),
+        &user_merge_out);
+  }
+
+  // Return false if the user merge operator returned false
+  if (!good) {
+    return false;
+  }
+
+  if (merge_out->existing_operand.data()) {
+    merge_out->new_value.assign(merge_out->existing_operand.data(),
+                                merge_out->existing_operand.size());
+    merge_out->existing_operand = Slice(nullptr, 0);
+  }
+
+  // Augment the *new_value with the ttl time-stamp
+  int64_t curtime;
+  if (!clock_->GetCurrentTime(&curtime).ok()) {
+    ROCKS_LOG_ERROR(
+        merge_in.logger,
+        "Error: Could not get current time to be attached internally "
+        "to the new value.");
+    return false;
+  } else {
+    char ts_string[ts_len];
+    EncodeFixed32(ts_string, (int32_t)curtime);
+    merge_out->new_value.append(ts_string, ts_len);
+    return true;
+  }
+}
+
+bool TtlMergeOperator::PartialMergeMulti(const Slice& key,
+                                         const std::deque<Slice>& operand_list,
+                                         std::string* new_value,
+                                         Logger* logger) const {
+  const uint32_t ts_len = DBWithTTLImpl::kTSLength;
+  std::deque<Slice> operands_without_ts;
+
+  for (const auto& operand : operand_list) {
+    if (operand.size() < ts_len) {
+      ROCKS_LOG_ERROR(logger, "Error: Could not remove timestamp from value.");
+      return false;
+    }
+
+    operands_without_ts.push_back(
+        Slice(operand.data(), operand.size() - ts_len));
+  }
+
+  // Apply the user partial-merge operator (store result in *new_value)
+  assert(new_value);
+  if (!user_merge_op_->PartialMergeMulti(key, operands_without_ts, new_value,
+                                         logger)) {
+    return false;
+  }
+
+  // Augment the *new_value with the ttl time-stamp
+  int64_t curtime;
+  if (!clock_->GetCurrentTime(&curtime).ok()) {
+    ROCKS_LOG_ERROR(
+        logger,
+        "Error: Could not get current time to be attached internally "
+        "to the new value.");
+    return false;
+  } else {
+    char ts_string[ts_len];
+    EncodeFixed32(ts_string, (int32_t)curtime);
+    new_value->append(ts_string, ts_len);
+    return true;
+  }
+}
+
+Status TtlMergeOperator::PrepareOptions(const ConfigOptions& config_options) {
+  if (clock_ == nullptr) {
+    clock_ = config_options.env->GetSystemClock().get();
+  }
+  return MergeOperator::PrepareOptions(config_options);
+}
+
+Status TtlMergeOperator::ValidateOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (user_merge_op_ == nullptr) {
+    return Status::InvalidArgument(
+        "UserMergeOperator required by TtlMergeOperator");
+  } else if (clock_ == nullptr) {
+    return Status::InvalidArgument("SystemClock required by TtlMergeOperator");
+  } else {
+    return MergeOperator::ValidateOptions(db_opts, cf_opts);
+  }
+}
+
+void DBWithTTLImpl::SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options,
+                                    SystemClock* clock) {
+  if (options->compaction_filter) {
+    options->compaction_filter =
+        new TtlCompactionFilter(ttl, clock, options->compaction_filter);
+  } else {
+    options->compaction_filter_factory =
+        std::shared_ptr<CompactionFilterFactory>(new TtlCompactionFilterFactory(
+            ttl, clock, options->compaction_filter_factory));
+  }
+
+  if (options->merge_operator) {
+    options->merge_operator.reset(
+        new TtlMergeOperator(options->merge_operator, clock));
+  }
+}
+
+static std::unordered_map<std::string, OptionTypeInfo> ttl_type_info = {
+    {"ttl", {0, OptionType::kInt32T}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> ttl_cff_type_info = {
+    {"user_filter_factory",
+     OptionTypeInfo::AsCustomSharedPtr<CompactionFilterFactory>(
+         0, OptionVerificationType::kByNameAllowFromNull,
+         OptionTypeFlags::kNone)}};
+static std::unordered_map<std::string, OptionTypeInfo> user_cf_type_info = {
+    {"user_filter",
+     OptionTypeInfo::AsCustomRawPtr<const CompactionFilter>(
+         0, OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}};
+
+TtlCompactionFilter::TtlCompactionFilter(
+    int32_t ttl, SystemClock* clock, const CompactionFilter* _user_comp_filter,
+    std::unique_ptr<const CompactionFilter> _user_comp_filter_from_factory)
+    : LayeredCompactionFilterBase(_user_comp_filter,
+                                  std::move(_user_comp_filter_from_factory)),
+      ttl_(ttl),
+      clock_(clock) {
+  RegisterOptions("TTL", &ttl_, &ttl_type_info);
+  RegisterOptions("UserFilter", &user_comp_filter_, &user_cf_type_info);
+}
+
+bool TtlCompactionFilter::Filter(int level, const Slice& key,
+                                 const Slice& old_val, std::string* new_val,
+                                 bool* value_changed) const {
+  if (DBWithTTLImpl::IsStale(old_val, ttl_, clock_)) {
+    return true;
+  }
+  if (user_comp_filter() == nullptr) {
+    return false;
+  }
+  assert(old_val.size() >= DBWithTTLImpl::kTSLength);
+  Slice old_val_without_ts(old_val.data(),
+                           old_val.size() - DBWithTTLImpl::kTSLength);
+  if (user_comp_filter()->Filter(level, key, old_val_without_ts, new_val,
+                                 value_changed)) {
+    return true;
+  }
+  if (*value_changed) {
+    new_val->append(old_val.data() + old_val.size() - DBWithTTLImpl::kTSLength,
+                    DBWithTTLImpl::kTSLength);
+  }
+  return false;
+}
+
+Status TtlCompactionFilter::PrepareOptions(
+    const ConfigOptions& config_options) {
+  if (clock_ == nullptr) {
+    clock_ = config_options.env->GetSystemClock().get();
+  }
+  return LayeredCompactionFilterBase::PrepareOptions(config_options);
+}
+
+Status TtlCompactionFilter::ValidateOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (clock_ == nullptr) {
+    return Status::InvalidArgument(
+        "SystemClock required by TtlCompactionFilter");
+  } else {
+    return LayeredCompactionFilterBase::ValidateOptions(db_opts, cf_opts);
+  }
+}
+
+TtlCompactionFilterFactory::TtlCompactionFilterFactory(
+    int32_t ttl, SystemClock* clock,
+    std::shared_ptr<CompactionFilterFactory> comp_filter_factory)
+    : ttl_(ttl), clock_(clock), user_comp_filter_factory_(comp_filter_factory) {
+  RegisterOptions("UserOptions", &user_comp_filter_factory_,
+                  &ttl_cff_type_info);
+  RegisterOptions("TTL", &ttl_, &ttl_type_info);
+}
+
+std::unique_ptr<CompactionFilter>
+TtlCompactionFilterFactory::CreateCompactionFilter(
+    const CompactionFilter::Context& context) {
+  std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory =
+      nullptr;
+  if (user_comp_filter_factory_) {
+    user_comp_filter_from_factory =
+        user_comp_filter_factory_->CreateCompactionFilter(context);
+  }
+
+  return std::unique_ptr<TtlCompactionFilter>(new TtlCompactionFilter(
+      ttl_, clock_, nullptr, std::move(user_comp_filter_from_factory)));
+}
+
+Status TtlCompactionFilterFactory::PrepareOptions(
+    const ConfigOptions& config_options) {
+  if (clock_ == nullptr) {
+    clock_ = config_options.env->GetSystemClock().get();
+  }
+  return CompactionFilterFactory::PrepareOptions(config_options);
+}
+
+Status TtlCompactionFilterFactory::ValidateOptions(
+    const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const {
+  if (clock_ == nullptr) {
+    return Status::InvalidArgument(
+        "SystemClock required by TtlCompactionFilterFactory");
+  } else {
+    return CompactionFilterFactory::ValidateOptions(db_opts, cf_opts);
+  }
+}
+
+int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/) {
+  library.AddFactory<MergeOperator>(
+      TtlMergeOperator::kClassName(),
+      [](const std::string& /*uri*/, std::unique_ptr<MergeOperator>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TtlMergeOperator(nullptr, nullptr));
+        return guard->get();
+      });
+  library.AddFactory<CompactionFilterFactory>(
+      TtlCompactionFilterFactory::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<CompactionFilterFactory>* guard,
+         std::string* /* errmsg */) {
+        guard->reset(new TtlCompactionFilterFactory(0, nullptr, nullptr));
+        return guard->get();
+      });
+  library.AddFactory<CompactionFilter>(
+      TtlCompactionFilter::kClassName(),
+      [](const std::string& /*uri*/,
+         std::unique_ptr<CompactionFilter>* /*guard*/,
+         std::string* /* errmsg */) {
+        return new TtlCompactionFilter(0, nullptr, nullptr);
+      });
+  size_t num_types;
+  return static_cast<int>(library.GetFactoryCount(&num_types));
+}
+// Open the db inside DBWithTTLImpl because options needs pointer to its ttl
+DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db), closed_(false) {}
+
+DBWithTTLImpl::~DBWithTTLImpl() {
+  if (!closed_) {
+    Close().PermitUncheckedError();
+  }
+}
+
+Status DBWithTTLImpl::Close() {
+  Status ret = Status::OK();
+  if (!closed_) {
+    Options default_options = GetOptions();
+    // Need to stop background compaction before getting rid of the filter
+    CancelAllBackgroundWork(db_, /* wait = */ true);
+    ret = db_->Close();
+    delete default_options.compaction_filter;
+    closed_ = true;
+  }
+  return ret;
+}
+
+void DBWithTTLImpl::RegisterTtlClasses() {
+  static std::once_flag once;
+  std::call_once(once, [&]() {
+    ObjectRegistry::Default()->AddLibrary("TTL", RegisterTtlObjects, "");
+  });
+}
+
+Status DBWithTTL::Open(const Options& options, const std::string& dbname,
+                       DBWithTTL** dbptr, int32_t ttl, bool read_only) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = DBWithTTL::Open(db_options, dbname, column_families, &handles,
+                             dbptr, {ttl}, read_only);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // i can delete the handle since DBImpl is always holding a reference to
+    // default column family
+    delete handles[0];
+  }
+  return s;
+}
+
+Status DBWithTTL::Open(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DBWithTTL** dbptr,
+    const std::vector<int32_t>& ttls, bool read_only) {
+  DBWithTTLImpl::RegisterTtlClasses();
+  if (ttls.size() != column_families.size()) {
+    return Status::InvalidArgument(
+        "ttls size has to be the same as number of column families");
+  }
+
+  SystemClock* clock = (db_options.env == nullptr)
+                           ? SystemClock::Default().get()
+                           : db_options.env->GetSystemClock().get();
+
+  std::vector<ColumnFamilyDescriptor> column_families_sanitized =
+      column_families;
+  for (size_t i = 0; i < column_families_sanitized.size(); ++i) {
+    DBWithTTLImpl::SanitizeOptions(
+        ttls[i], &column_families_sanitized[i].options, clock);
+  }
+  DB* db;
+
+  Status st;
+  if (read_only) {
+    st = DB::OpenForReadOnly(db_options, dbname, column_families_sanitized,
+                             handles, &db);
+  } else {
+    st = DB::Open(db_options, dbname, column_families_sanitized, handles, &db);
+  }
+  if (st.ok()) {
+    *dbptr = new DBWithTTLImpl(db);
+  } else {
+    *dbptr = nullptr;
+  }
+  return st;
+}
+
+Status DBWithTTLImpl::CreateColumnFamilyWithTtl(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    ColumnFamilyHandle** handle, int ttl) {
+  RegisterTtlClasses();
+  ColumnFamilyOptions sanitized_options = options;
+  DBWithTTLImpl::SanitizeOptions(ttl, &sanitized_options,
+                                 GetEnv()->GetSystemClock().get());
+
+  return DBWithTTL::CreateColumnFamily(sanitized_options, column_family_name,
+                                       handle);
+}
+
+Status DBWithTTLImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
+                                         const std::string& column_family_name,
+                                         ColumnFamilyHandle** handle) {
+  return CreateColumnFamilyWithTtl(options, column_family_name, handle, 0);
+}
+
+// Appends the current timestamp to the string.
+// Returns false if could not get the current_time, true if append succeeds
+Status DBWithTTLImpl::AppendTS(const Slice& val, std::string* val_with_ts,
+                               SystemClock* clock) {
+  val_with_ts->reserve(kTSLength + val.size());
+  char ts_string[kTSLength];
+  int64_t curtime;
+  Status st = clock->GetCurrentTime(&curtime);
+  if (!st.ok()) {
+    return st;
+  }
+  EncodeFixed32(ts_string, (int32_t)curtime);
+  val_with_ts->append(val.data(), val.size());
+  val_with_ts->append(ts_string, kTSLength);
+  return st;
+}
+
+// Returns corruption if the length of the string is lesser than timestamp, or
+// timestamp refers to a time lesser than ttl-feature release time
+Status DBWithTTLImpl::SanityCheckTimestamp(const Slice& str) {
+  if (str.size() < kTSLength) {
+    return Status::Corruption("Error: value's length less than timestamp's\n");
+  }
+  // Checks that TS is not lesser than kMinTimestamp
+  // Gaurds against corruption & normal database opened incorrectly in ttl mode
+  int32_t timestamp_value = DecodeFixed32(str.data() + str.size() - kTSLength);
+  if (timestamp_value < kMinTimestamp) {
+    return Status::Corruption("Error: Timestamp < ttl feature release time!\n");
+  }
+  return Status::OK();
+}
+
+// Checks if the string is stale or not according to TTl provided
+bool DBWithTTLImpl::IsStale(const Slice& value, int32_t ttl,
+                            SystemClock* clock) {
+  if (ttl <= 0) {  // Data is fresh if TTL is non-positive
+    return false;
+  }
+  int64_t curtime;
+  if (!clock->GetCurrentTime(&curtime).ok()) {
+    return false;  // Treat the data as fresh if could not get current time
+  }
+  int32_t timestamp_value =
+      DecodeFixed32(value.data() + value.size() - kTSLength);
+  return (timestamp_value + ttl) < curtime;
+}
+
+// Strips the TS from the end of the slice
+Status DBWithTTLImpl::StripTS(PinnableSlice* pinnable_val) {
+  if (pinnable_val->size() < kTSLength) {
+    return Status::Corruption("Bad timestamp in key-value");
+  }
+  // Erasing characters which hold the TS
+  pinnable_val->remove_suffix(kTSLength);
+  return Status::OK();
+}
+
+// Strips the TS from the end of the string
+Status DBWithTTLImpl::StripTS(std::string* str) {
+  if (str->length() < kTSLength) {
+    return Status::Corruption("Bad timestamp in key-value");
+  }
+  // Erasing characters which hold the TS
+  str->erase(str->length() - kTSLength, kTSLength);
+  return Status::OK();
+}
+
+Status DBWithTTLImpl::Put(const WriteOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          const Slice& val) {
+  WriteBatch batch;
+  Status st = batch.Put(column_family, key, val);
+  if (st.ok()) {
+    st = Write(options, &batch);
+  }
+  return st;
+}
+
+Status DBWithTTLImpl::Get(const ReadOptions& options,
+                          ColumnFamilyHandle* column_family, const Slice& key,
+                          PinnableSlice* value) {
+  Status st = db_->Get(options, column_family, key, value);
+  if (!st.ok()) {
+    return st;
+  }
+  st = SanityCheckTimestamp(*value);
+  if (!st.ok()) {
+    return st;
+  }
+  return StripTS(value);
+}
+
+std::vector<Status> DBWithTTLImpl::MultiGet(
+    const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  auto statuses = db_->MultiGet(options, column_family, keys, values);
+  for (size_t i = 0; i < keys.size(); ++i) {
+    if (!statuses[i].ok()) {
+      continue;
+    }
+    statuses[i] = SanityCheckTimestamp((*values)[i]);
+    if (!statuses[i].ok()) {
+      continue;
+    }
+    statuses[i] = StripTS(&(*values)[i]);
+  }
+  return statuses;
+}
+
+bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family,
+                                const Slice& key, std::string* value,
+                                bool* value_found) {
+  bool ret = db_->KeyMayExist(options, column_family, key, value, value_found);
+  if (ret && value != nullptr && value_found != nullptr && *value_found) {
+    if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) {
+      return false;
+    }
+  }
+  return ret;
+}
+
+Status DBWithTTLImpl::Merge(const WriteOptions& options,
+                            ColumnFamilyHandle* column_family, const Slice& key,
+                            const Slice& value) {
+  WriteBatch batch;
+  Status st = batch.Merge(column_family, key, value);
+  if (st.ok()) {
+    st = Write(options, &batch);
+  }
+  return st;
+}
+
+Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
+  class Handler : public WriteBatch::Handler {
+   public:
+    explicit Handler(SystemClock* clock) : clock_(clock) {}
+    WriteBatch updates_ttl;
+    Status PutCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override {
+      std::string value_with_ts;
+      Status st = AppendTS(value, &value_with_ts, clock_);
+      if (!st.ok()) {
+        return st;
+      }
+      return WriteBatchInternal::Put(&updates_ttl, column_family_id, key,
+                                     value_with_ts);
+    }
+    Status MergeCF(uint32_t column_family_id, const Slice& key,
+                   const Slice& value) override {
+      std::string value_with_ts;
+      Status st = AppendTS(value, &value_with_ts, clock_);
+      if (!st.ok()) {
+        return st;
+      }
+      return WriteBatchInternal::Merge(&updates_ttl, column_family_id, key,
+                                       value_with_ts);
+    }
+    Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+      return WriteBatchInternal::Delete(&updates_ttl, column_family_id, key);
+    }
+    Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+                         const Slice& end_key) override {
+      return WriteBatchInternal::DeleteRange(&updates_ttl, column_family_id,
+                                             begin_key, end_key);
+    }
+    void LogData(const Slice& blob) override { updates_ttl.PutLogData(blob); }
+
+   private:
+    SystemClock* clock_;
+  };
+  Handler handler(GetEnv()->GetSystemClock().get());
+  Status st = updates->Iterate(&handler);
+  if (!st.ok()) {
+    return st;
+  } else {
+    return db_->Write(opts, &(handler.updates_ttl));
+  }
+}
+
+Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts,
+                                     ColumnFamilyHandle* column_family) {
+  return new TtlIterator(db_->NewIterator(opts, column_family));
+}
+
+void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) {
+  std::shared_ptr<TtlCompactionFilterFactory> filter;
+  Options opts;
+  opts = GetOptions(h);
+  filter = std::static_pointer_cast<TtlCompactionFilterFactory>(
+      opts.compaction_filter_factory);
+  if (!filter) return;
+  filter->SetTtl(ttl);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.h b/src/rocksdb/utilities/ttl/db_ttl_impl.h
new file mode 100644
index 000000000..dd67a6ddc
--- /dev/null
+++ b/src/rocksdb/utilities/ttl/db_ttl_impl.h
@@ -0,0 +1,245 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "utilities/compaction_filters/layered_compaction_filter_base.h"
+
+#ifdef _WIN32
+// Windows API macro interference
+#undef GetCurrentTime
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+struct ConfigOptions;
+class ObjectLibrary;
+class ObjectRegistry;
+class DBWithTTLImpl : public DBWithTTL {
+ public:
+  static void SanitizeOptions(int32_t ttl, ColumnFamilyOptions* options,
+                              SystemClock* clock);
+
+  static void RegisterTtlClasses();
+  explicit DBWithTTLImpl(DB* db);
+
+  virtual ~DBWithTTLImpl();
+
+  virtual Status Close() override;
+
+  Status CreateColumnFamilyWithTtl(const ColumnFamilyOptions& options,
+                                   const std::string& column_family_name,
+                                   ColumnFamilyHandle** handle,
+                                   int ttl) override;
+
+  Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                            const std::string& column_family_name,
+                            ColumnFamilyHandle** handle) override;
+
+  using StackableDB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override;
+
+  using StackableDB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     PinnableSlice* value) override;
+
+  using StackableDB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  using StackableDB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override;
+
+  using StackableDB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override;
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  using StackableDB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& opts,
+                                ColumnFamilyHandle* column_family) override;
+
+  virtual DB* GetBaseDB() override { return db_; }
+
+  static bool IsStale(const Slice& value, int32_t ttl, SystemClock* clock);
+
+  static Status AppendTS(const Slice& val, std::string* val_with_ts,
+                         SystemClock* clock);
+
+  static Status SanityCheckTimestamp(const Slice& str);
+
+  static Status StripTS(std::string* str);
+
+  static Status StripTS(PinnableSlice* str);
+
+  static const uint32_t kTSLength = sizeof(int32_t);  // size of timestamp
+
+  static const int32_t kMinTimestamp = 1368146402;  // 05/09/2013:5:40PM GMT-8
+
+  static const int32_t kMaxTimestamp = 2147483647;  // 01/18/2038:7:14PM GMT-8
+
+  void SetTtl(int32_t ttl) override { SetTtl(DefaultColumnFamily(), ttl); }
+
+  void SetTtl(ColumnFamilyHandle* h, int32_t ttl) override;
+
+ private:
+  // remember whether the Close completes or not
+  bool closed_;
+};
+
+class TtlIterator : public Iterator {
+ public:
+  explicit TtlIterator(Iterator* iter) : iter_(iter) { assert(iter_); }
+
+  ~TtlIterator() { delete iter_; }
+
+  bool Valid() const override { return iter_->Valid(); }
+
+  void SeekToFirst() override { iter_->SeekToFirst(); }
+
+  void SeekToLast() override { iter_->SeekToLast(); }
+
+  void Seek(const Slice& target) override { iter_->Seek(target); }
+
+  void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); }
+
+  void Next() override { iter_->Next(); }
+
+  void Prev() override { iter_->Prev(); }
+
+  Slice key() const override { return iter_->key(); }
+
+  int32_t ttl_timestamp() const {
+    return DecodeFixed32(iter_->value().data() + iter_->value().size() -
+                         DBWithTTLImpl::kTSLength);
+  }
+
+  Slice value() const override {
+    // TODO: handle timestamp corruption like in general iterator semantics
+    assert(DBWithTTLImpl::SanityCheckTimestamp(iter_->value()).ok());
+    Slice trimmed_value = iter_->value();
+    trimmed_value.size_ -= DBWithTTLImpl::kTSLength;
+    return trimmed_value;
+  }
+
+  Status status() const override { return iter_->status(); }
+
+ private:
+  Iterator* iter_;
+};
+
+class TtlCompactionFilter : public LayeredCompactionFilterBase {
+ public:
+  TtlCompactionFilter(int32_t ttl, SystemClock* clock,
+                      const CompactionFilter* _user_comp_filter,
+                      std::unique_ptr<const CompactionFilter>
+                          _user_comp_filter_from_factory = nullptr);
+
+  virtual bool Filter(int level, const Slice& key, const Slice& old_val,
+                      std::string* new_val, bool* value_changed) const override;
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "TtlCompactionFilter"; }
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == "Delete By TTL") {
+      return true;
+    } else {
+      return LayeredCompactionFilterBase::IsInstanceOf(name);
+    }
+  }
+
+  Status PrepareOptions(const ConfigOptions& config_options) override;
+  Status ValidateOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override;
+
+ private:
+  int32_t ttl_;
+  SystemClock* clock_;
+};
+
+class TtlCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  TtlCompactionFilterFactory(
+      int32_t ttl, SystemClock* clock,
+      std::shared_ptr<CompactionFilterFactory> comp_filter_factory);
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override;
+  void SetTtl(int32_t ttl) { ttl_ = ttl; }
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "TtlCompactionFilterFactory"; }
+  Status PrepareOptions(const ConfigOptions& config_options) override;
+  Status ValidateOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override;
+  const Customizable* Inner() const override {
+    return user_comp_filter_factory_.get();
+  }
+
+ private:
+  int32_t ttl_;
+  SystemClock* clock_;
+  std::shared_ptr<CompactionFilterFactory> user_comp_filter_factory_;
+};
+
+class TtlMergeOperator : public MergeOperator {
+ public:
+  explicit TtlMergeOperator(const std::shared_ptr<MergeOperator>& merge_op,
+                            SystemClock* clock);
+
+  bool FullMergeV2(const MergeOperationInput& merge_in,
+                   MergeOperationOutput* merge_out) const override;
+
+  bool PartialMergeMulti(const Slice& key,
+                         const std::deque<Slice>& operand_list,
+                         std::string* new_value, Logger* logger) const override;
+
+  static const char* kClassName() { return "TtlMergeOperator"; }
+
+  const char* Name() const override { return kClassName(); }
+  bool IsInstanceOf(const std::string& name) const override {
+    if (name == "Merge By TTL") {
+      return true;
+    } else {
+      return MergeOperator::IsInstanceOf(name);
+    }
+  }
+
+  Status PrepareOptions(const ConfigOptions& config_options) override;
+  Status ValidateOptions(const DBOptions& db_opts,
+                         const ColumnFamilyOptions& cf_opts) const override;
+  const Customizable* Inner() const override { return user_merge_op_.get(); }
+
+ private:
+  std::shared_ptr<MergeOperator> user_merge_op_;
+  SystemClock* clock_;
+};
+extern "C" {
+int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/);
+}  // extern "C"
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/ttl/ttl_test.cc b/src/rocksdb/utilities/ttl/ttl_test.cc
new file mode 100644
index 000000000..a42e0acb4
--- /dev/null
+++ b/src/rocksdb/utilities/ttl/ttl_test.cc
@@ -0,0 +1,912 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <map>
+#include <memory>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators/bytesxor.h"
+#include "utilities/ttl/db_ttl_impl.h"
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+using KVMap = std::map<std::string, std::string>;
+
+enum BatchOperation { OP_PUT = 0, OP_DELETE = 1 };
+}  // namespace
+
+class SpecialTimeEnv : public EnvWrapper {
+ public:
+  explicit SpecialTimeEnv(Env* base) : EnvWrapper(base) {
+    EXPECT_OK(base->GetCurrentTime(&current_time_));
+  }
+  const char* Name() const override { return "SpecialTimeEnv"; }
+  void Sleep(int64_t sleep_time) { current_time_ += sleep_time; }
+  Status GetCurrentTime(int64_t* current_time) override {
+    *current_time = current_time_;
+    return Status::OK();
+  }
+
+ private:
+  int64_t current_time_ = 0;
+};
+
+class TtlTest : public testing::Test {
+ public:
+  TtlTest() {
+    env_.reset(new SpecialTimeEnv(Env::Default()));
+    dbname_ = test::PerThreadDBPath("db_ttl");
+    options_.create_if_missing = true;
+    options_.env = env_.get();
+    // ensure that compaction is kicked in to always strip timestamp from kvs
+    options_.max_compaction_bytes = 1;
+    // compaction should take place always from level0 for determinism
+    db_ttl_ = nullptr;
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  ~TtlTest() override {
+    CloseTtl();
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  // Open database with TTL support when TTL not provided with db_ttl_ pointer
+  void OpenTtl() {
+    ASSERT_TRUE(db_ttl_ ==
+                nullptr);  //  db should be closed before opening again
+    ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_));
+  }
+
+  // Open database with TTL support when TTL provided with db_ttl_ pointer
+  void OpenTtl(int32_t ttl) {
+    ASSERT_TRUE(db_ttl_ == nullptr);
+    ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl));
+  }
+
+  // Open with TestFilter compaction filter
+  void OpenTtlWithTestCompaction(int32_t ttl) {
+    options_.compaction_filter_factory =
+        std::shared_ptr<CompactionFilterFactory>(
+            new TestFilterFactory(kSampleSize_, kNewValue_));
+    OpenTtl(ttl);
+  }
+
+  // Open database with TTL support in read_only mode
+  void OpenReadOnlyTtl(int32_t ttl) {
+    ASSERT_TRUE(db_ttl_ == nullptr);
+    ASSERT_OK(DBWithTTL::Open(options_, dbname_, &db_ttl_, ttl, true));
+  }
+
+  // Call db_ttl_->Close() before delete db_ttl_
+  void CloseTtl() { CloseTtlHelper(true); }
+
+  // No db_ttl_->Close() before delete db_ttl_
+  void CloseTtlNoDBClose() { CloseTtlHelper(false); }
+
+  void CloseTtlHelper(bool close_db) {
+    if (db_ttl_ != nullptr) {
+      if (close_db) {
+        EXPECT_OK(db_ttl_->Close());
+      }
+      delete db_ttl_;
+      db_ttl_ = nullptr;
+    }
+  }
+
+  // Populates and returns a kv-map
+  void MakeKVMap(int64_t num_entries) {
+    kvmap_.clear();
+    int digits = 1;
+    for (int64_t dummy = num_entries; dummy /= 10; ++digits) {
+    }
+    int digits_in_i = 1;
+    for (int64_t i = 0; i < num_entries; i++) {
+      std::string key = "key";
+      std::string value = "value";
+      if (i % 10 == 0) {
+        digits_in_i++;
+      }
+      for (int j = digits_in_i; j < digits; j++) {
+        key.append("0");
+        value.append("0");
+      }
+      AppendNumberTo(&key, i);
+      AppendNumberTo(&value, i);
+      kvmap_[key] = value;
+    }
+    ASSERT_EQ(static_cast<int64_t>(kvmap_.size()),
+              num_entries);  // check all insertions done
+  }
+
+  // Makes a write-batch with key-vals from kvmap_ and 'Write''s it
+  void MakePutWriteBatch(const BatchOperation* batch_ops, int64_t num_ops) {
+    ASSERT_LE(num_ops, static_cast<int64_t>(kvmap_.size()));
+    static WriteOptions wopts;
+    static FlushOptions flush_opts;
+    WriteBatch batch;
+    kv_it_ = kvmap_.begin();
+    for (int64_t i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, ++kv_it_) {
+      switch (batch_ops[i]) {
+        case OP_PUT:
+          ASSERT_OK(batch.Put(kv_it_->first, kv_it_->second));
+          break;
+        case OP_DELETE:
+          ASSERT_OK(batch.Delete(kv_it_->first));
+          break;
+        default:
+          FAIL();
+      }
+    }
+    ASSERT_OK(db_ttl_->Write(wopts, &batch));
+    ASSERT_OK(db_ttl_->Flush(flush_opts));
+  }
+
+  // Puts num_entries starting from start_pos_map from kvmap_ into the database
+  void PutValues(int64_t start_pos_map, int64_t num_entries, bool flush = true,
+                 ColumnFamilyHandle* cf = nullptr) {
+    ASSERT_TRUE(db_ttl_);
+    ASSERT_LE(start_pos_map + num_entries, static_cast<int64_t>(kvmap_.size()));
+    static WriteOptions wopts;
+    static FlushOptions flush_opts;
+    kv_it_ = kvmap_.begin();
+    advance(kv_it_, start_pos_map);
+    for (int64_t i = 0; kv_it_ != kvmap_.end() && i < num_entries;
+         i++, ++kv_it_) {
+      ASSERT_OK(cf == nullptr
+                    ? db_ttl_->Put(wopts, kv_it_->first, kv_it_->second)
+                    : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second));
+    }
+    // Put a mock kv at the end because CompactionFilter doesn't delete last key
+    ASSERT_OK(cf == nullptr ? db_ttl_->Put(wopts, "keymock", "valuemock")
+                            : db_ttl_->Put(wopts, cf, "keymock", "valuemock"));
+    if (flush) {
+      if (cf == nullptr) {
+        ASSERT_OK(db_ttl_->Flush(flush_opts));
+      } else {
+        ASSERT_OK(db_ttl_->Flush(flush_opts, cf));
+      }
+    }
+  }
+
+  // Runs a manual compaction
+  Status ManualCompact(ColumnFamilyHandle* cf = nullptr) {
+    assert(db_ttl_);
+    if (cf == nullptr) {
+      return db_ttl_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    } else {
+      return db_ttl_->CompactRange(CompactRangeOptions(), cf, nullptr, nullptr);
+    }
+  }
+
+  // Runs a DeleteRange
+  void MakeDeleteRange(std::string start, std::string end,
+                       ColumnFamilyHandle* cf = nullptr) {
+    ASSERT_TRUE(db_ttl_);
+    static WriteOptions wops;
+    WriteBatch wb;
+    ASSERT_OK(cf == nullptr
+                  ? wb.DeleteRange(db_ttl_->DefaultColumnFamily(), start, end)
+                  : wb.DeleteRange(cf, start, end));
+    ASSERT_OK(db_ttl_->Write(wops, &wb));
+  }
+
+  // checks the whole kvmap_ to return correct values using KeyMayExist
+  void SimpleKeyMayExistCheck() {
+    static ReadOptions ropts;
+    bool value_found;
+    std::string val;
+    for (auto& kv : kvmap_) {
+      bool ret = db_ttl_->KeyMayExist(ropts, kv.first, &val, &value_found);
+      if (ret == false || value_found == false) {
+        fprintf(stderr,
+                "KeyMayExist could not find key=%s in the database but"
+                " should have\n",
+                kv.first.c_str());
+        FAIL();
+      } else if (val.compare(kv.second) != 0) {
+        fprintf(stderr,
+                " value for key=%s present in database is %s but"
+                " should be %s\n",
+                kv.first.c_str(), val.c_str(), kv.second.c_str());
+        FAIL();
+      }
+    }
+  }
+
+  // checks the whole kvmap_ to return correct values using MultiGet
+  void SimpleMultiGetTest() {
+    static ReadOptions ropts;
+    std::vector<Slice> keys;
+    std::vector<std::string> values;
+
+    for (auto& kv : kvmap_) {
+      keys.emplace_back(kv.first);
+    }
+
+    auto statuses = db_ttl_->MultiGet(ropts, keys, &values);
+    size_t i = 0;
+    for (auto& kv : kvmap_) {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ(values[i], kv.second);
+      ++i;
+    }
+  }
+
+  void CompactCheck(int64_t st_pos, int64_t span, bool check = true,
+                    bool test_compaction_change = false,
+                    ColumnFamilyHandle* cf = nullptr) {
+    static ReadOptions ropts;
+    kv_it_ = kvmap_.begin();
+    advance(kv_it_, st_pos);
+    std::string v;
+    for (int64_t i = 0; kv_it_ != kvmap_.end() && i < span; i++, ++kv_it_) {
+      Status s = (cf == nullptr) ? db_ttl_->Get(ropts, kv_it_->first, &v)
+                                 : db_ttl_->Get(ropts, cf, kv_it_->first, &v);
+      if (s.ok() != check) {
+        fprintf(stderr, "key=%s ", kv_it_->first.c_str());
+        if (!s.ok()) {
+          fprintf(stderr, "is absent from db but was expected to be present\n");
+        } else {
+          fprintf(stderr, "is present in db but was expected to be absent\n");
+        }
+        FAIL();
+      } else if (s.ok()) {
+        if (test_compaction_change && v.compare(kNewValue_) != 0) {
+          fprintf(stderr,
+                  " value for key=%s present in database is %s but "
+                  " should be %s\n",
+                  kv_it_->first.c_str(), v.c_str(), kNewValue_.c_str());
+          FAIL();
+        } else if (!test_compaction_change && v.compare(kv_it_->second) != 0) {
+          fprintf(stderr,
+                  " value for key=%s present in database is %s but "
+                  " should be %s\n",
+                  kv_it_->first.c_str(), v.c_str(), kv_it_->second.c_str());
+          FAIL();
+        }
+      }
+    }
+  }
+  // Sleeps for slp_tim then runs a manual compaction
+  // Checks span starting from st_pos from kvmap_ in the db and
+  // Gets should return true if check is true and false otherwise
+  // Also checks that value that we got is the same as inserted; and =kNewValue
+  //   if test_compaction_change is true
+  void SleepCompactCheck(int slp_tim, int64_t st_pos, int64_t span,
+                         bool check = true, bool test_compaction_change = false,
+                         ColumnFamilyHandle* cf = nullptr) {
+    ASSERT_TRUE(db_ttl_);
+
+    env_->Sleep(slp_tim);
+    ASSERT_OK(ManualCompact(cf));
+    CompactCheck(st_pos, span, check, test_compaction_change, cf);
+  }
+
+  // Similar as SleepCompactCheck but uses TtlIterator to read from db
+  void SleepCompactCheckIter(int slp, int st_pos, int64_t span,
+                             bool check = true) {
+    ASSERT_TRUE(db_ttl_);
+    env_->Sleep(slp);
+    ASSERT_OK(ManualCompact());
+    static ReadOptions ropts;
+    Iterator* dbiter = db_ttl_->NewIterator(ropts);
+    kv_it_ = kvmap_.begin();
+    advance(kv_it_, st_pos);
+
+    dbiter->Seek(kv_it_->first);
+    if (!check) {
+      if (dbiter->Valid()) {
+        ASSERT_NE(dbiter->value().compare(kv_it_->second), 0);
+      }
+    } else {  // dbiter should have found out kvmap_[st_pos]
+      for (int64_t i = st_pos; kv_it_ != kvmap_.end() && i < st_pos + span;
+           i++, ++kv_it_) {
+        ASSERT_TRUE(dbiter->Valid());
+        ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0);
+        dbiter->Next();
+      }
+    }
+    ASSERT_OK(dbiter->status());
+    delete dbiter;
+  }
+
+  // Set ttl on open db
+  void SetTtl(int32_t ttl, ColumnFamilyHandle* cf = nullptr) {
+    ASSERT_TRUE(db_ttl_);
+    cf == nullptr ? db_ttl_->SetTtl(ttl) : db_ttl_->SetTtl(cf, ttl);
+  }
+
+  class TestFilter : public CompactionFilter {
+   public:
+    TestFilter(const int64_t kSampleSize, const std::string& kNewValue)
+        : kSampleSize_(kSampleSize), kNewValue_(kNewValue) {}
+
+    // Works on keys of the form "key<number>"
+    // Drops key if number at the end of key is in [0, kSampleSize_/3),
+    // Keeps key if it is in [kSampleSize_/3, 2*kSampleSize_/3),
+    // Change value if it is in [2*kSampleSize_/3, kSampleSize_)
+    // Eg. kSampleSize_=6. Drop:key0-1...Keep:key2-3...Change:key4-5...
+    bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+                std::string* new_value, bool* value_changed) const override {
+      assert(new_value != nullptr);
+
+      std::string search_str = "0123456789";
+      std::string key_string = key.ToString();
+      size_t pos = key_string.find_first_of(search_str);
+      int num_key_end;
+      if (pos != std::string::npos) {
+        auto key_substr = key_string.substr(pos, key.size() - pos);
+#ifndef CYGWIN
+        num_key_end = std::stoi(key_substr);
+#else
+        num_key_end = std::strtol(key_substr.c_str(), 0, 10);
+#endif
+
+      } else {
+        return false;  // Keep keys not matching the format "key<NUMBER>"
+      }
+
+      int64_t partition = kSampleSize_ / 3;
+      if (num_key_end < partition) {
+        return true;
+      } else if (num_key_end < partition * 2) {
+        return false;
+      } else {
+        *new_value = kNewValue_;
+        *value_changed = true;
+        return false;
+      }
+    }
+
+    const char* Name() const override { return "TestFilter"; }
+
+   private:
+    const int64_t kSampleSize_;
+    const std::string kNewValue_;
+  };
+
+  class TestFilterFactory : public CompactionFilterFactory {
+   public:
+    TestFilterFactory(const int64_t kSampleSize, const std::string& kNewValue)
+        : kSampleSize_(kSampleSize), kNewValue_(kNewValue) {}
+
+    std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context& /*context*/) override {
+      return std::unique_ptr<CompactionFilter>(
+          new TestFilter(kSampleSize_, kNewValue_));
+    }
+
+    const char* Name() const override { return "TestFilterFactory"; }
+
+   private:
+    const int64_t kSampleSize_;
+    const std::string kNewValue_;
+  };
+
+  // Choose carefully so that Put, Gets & Compaction complete in 1 second buffer
+  static const int64_t kSampleSize_ = 100;
+  std::string dbname_;
+  DBWithTTL* db_ttl_;
+  std::unique_ptr<SpecialTimeEnv> env_;
+
+ private:
+  Options options_;
+  KVMap kvmap_;
+  KVMap::iterator kv_it_;
+  const std::string kNewValue_ = "new_value";
+  std::unique_ptr<CompactionFilter> test_comp_filter_;
+};  // class TtlTest
+
+// If TTL is non positive or not provided, the behaviour is TTL = infinity
+// This test opens the db 3 times with such default behavior and inserts a
+// bunch of kvs each time. All kvs should accumulate in the db till the end
+// Partitions the sample-size provided into 3 sets over boundary1 and boundary2
+TEST_F(TtlTest, NoEffect) {
+  MakeKVMap(kSampleSize_);
+  int64_t boundary1 = kSampleSize_ / 3;
+  int64_t boundary2 = 2 * boundary1;
+
+  OpenTtl();
+  PutValues(0, boundary1);             // T=0: Set1 never deleted
+  SleepCompactCheck(1, 0, boundary1);  // T=1: Set1 still there
+  CloseTtl();
+
+  OpenTtl(0);
+  PutValues(boundary1, boundary2 - boundary1);  // T=1: Set2 never deleted
+  SleepCompactCheck(1, 0, boundary2);           // T=2: Sets1 & 2 still there
+  CloseTtl();
+
+  OpenTtl(-1);
+  PutValues(boundary2, kSampleSize_ - boundary2);  // T=3: Set3 never deleted
+  SleepCompactCheck(1, 0, kSampleSize_, true);  // T=4: Sets 1,2,3 still there
+  CloseTtl();
+}
+
+// Rerun the NoEffect test with a different version of CloseTtl
+// function, where db is directly deleted without close.
+TEST_F(TtlTest, DestructWithoutClose) {
+  MakeKVMap(kSampleSize_);
+  int64_t boundary1 = kSampleSize_ / 3;
+  int64_t boundary2 = 2 * boundary1;
+
+  OpenTtl();
+  PutValues(0, boundary1);             // T=0: Set1 never deleted
+  SleepCompactCheck(1, 0, boundary1);  // T=1: Set1 still there
+  CloseTtlNoDBClose();
+
+  OpenTtl(0);
+  PutValues(boundary1, boundary2 - boundary1);  // T=1: Set2 never deleted
+  SleepCompactCheck(1, 0, boundary2);           // T=2: Sets1 & 2 still there
+  CloseTtlNoDBClose();
+
+  OpenTtl(-1);
+  PutValues(boundary2, kSampleSize_ - boundary2);  // T=3: Set3 never deleted
+  SleepCompactCheck(1, 0, kSampleSize_, true);  // T=4: Sets 1,2,3 still there
+  CloseTtlNoDBClose();
+}
+
+// Puts a set of values and checks its presence using Get during ttl
+TEST_F(TtlTest, PresentDuringTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(2);                  // T=0:Open the db with ttl = 2
+  PutValues(0, kSampleSize_);  // T=0:Insert Set1. Delete at t=2
+  SleepCompactCheck(1, 0, kSampleSize_,
+                    true);  // T=1:Set1 should still be there
+  CloseTtl();
+}
+
+// Puts a set of values and checks its absence using Get after ttl
+TEST_F(TtlTest, AbsentAfterTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);                  // T=0:Open the db with ttl = 2
+  PutValues(0, kSampleSize_);  // T=0:Insert Set1. Delete at t=2
+  SleepCompactCheck(2, 0, kSampleSize_, false);  // T=2:Set1 should not be there
+  CloseTtl();
+}
+
+// Resets the timestamp of a set of kvs by updating them and checks that they
+// are not deleted according to the old timestamp
+TEST_F(TtlTest, ResetTimestamp) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(3);
+  PutValues(0, kSampleSize_);             // T=0: Insert Set1. Delete at t=3
+  env_->Sleep(2);                         // T=2
+  PutValues(0, kSampleSize_);             // T=2: Insert Set1. Delete at t=5
+  SleepCompactCheck(2, 0, kSampleSize_);  // T=4: Set1 should still be there
+  CloseTtl();
+}
+
+// Similar to PresentDuringTTL but uses Iterator
+TEST_F(TtlTest, IterPresentDuringTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(2);
+  PutValues(0, kSampleSize_);                 // T=0: Insert. Delete at t=2
+  SleepCompactCheckIter(1, 0, kSampleSize_);  // T=1: Set should be there
+  CloseTtl();
+}
+
+// Similar to AbsentAfterTTL but uses Iterator
+TEST_F(TtlTest, IterAbsentAfterTTL) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);
+  PutValues(0, kSampleSize_);  // T=0: Insert. Delete at t=1
+  SleepCompactCheckIter(2, 0, kSampleSize_, false);  // T=2: Should not be there
+  CloseTtl();
+}
+
+// Checks presence while opening the same db more than once with the same ttl
+// Note: The second open will open the same db
+TEST_F(TtlTest, MultiOpenSamePresent) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(2);
+  PutValues(0, kSampleSize_);  // T=0: Insert. Delete at t=2
+  CloseTtl();
+
+  OpenTtl(2);                             // T=0. Delete at t=2
+  SleepCompactCheck(1, 0, kSampleSize_);  // T=1: Set should be there
+  CloseTtl();
+}
+
+// Checks absence while opening the same db more than once with the same ttl
+// Note: The second open will open the same db
+TEST_F(TtlTest, MultiOpenSameAbsent) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);
+  PutValues(0, kSampleSize_);  // T=0: Insert. Delete at t=1
+  CloseTtl();
+
+  OpenTtl(1);                                    // T=0.Delete at t=1
+  SleepCompactCheck(2, 0, kSampleSize_, false);  // T=2: Set should not be there
+  CloseTtl();
+}
+
+// Checks presence while opening the same db more than once with bigger ttl
+TEST_F(TtlTest, MultiOpenDifferent) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);
+  PutValues(0, kSampleSize_);  // T=0: Insert. Delete at t=1
+  CloseTtl();
+
+  OpenTtl(3);                             // T=0: Set deleted at t=3
+  SleepCompactCheck(2, 0, kSampleSize_);  // T=2: Set should be there
+  CloseTtl();
+}
+
+// Checks presence during ttl in read_only mode
+TEST_F(TtlTest, ReadOnlyPresentForever) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);                  // T=0:Open the db normally
+  PutValues(0, kSampleSize_);  // T=0:Insert Set1. Delete at t=1
+  CloseTtl();
+
+  OpenReadOnlyTtl(1);
+  ASSERT_TRUE(db_ttl_);
+
+  env_->Sleep(2);
+  Status s = ManualCompact();  // T=2:Set1 should still be there
+  ASSERT_TRUE(s.IsNotSupported());
+  CompactCheck(0, kSampleSize_);
+  CloseTtl();
+}
+
+// Checks whether WriteBatch works well with TTL
+// Puts all kvs in kvmap_ in a batch and writes first, then deletes first half
+TEST_F(TtlTest, WriteBatchTest) {
+  MakeKVMap(kSampleSize_);
+  BatchOperation batch_ops[kSampleSize_];
+  for (int i = 0; i < kSampleSize_; i++) {
+    batch_ops[i] = OP_PUT;
+  }
+
+  OpenTtl(2);
+  MakePutWriteBatch(batch_ops, kSampleSize_);
+  for (int i = 0; i < kSampleSize_ / 2; i++) {
+    batch_ops[i] = OP_DELETE;
+  }
+  MakePutWriteBatch(batch_ops, kSampleSize_ / 2);
+  SleepCompactCheck(0, 0, kSampleSize_ / 2, false);
+  SleepCompactCheck(0, kSampleSize_ / 2, kSampleSize_ - kSampleSize_ / 2);
+  CloseTtl();
+}
+
+// Checks user's compaction filter for correctness with TTL logic
+TEST_F(TtlTest, CompactionFilter) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtlWithTestCompaction(1);
+  PutValues(0, kSampleSize_);  // T=0:Insert Set1. Delete at t=1
+  // T=2: TTL logic takes precedence over TestFilter:-Set1 should not be there
+  SleepCompactCheck(2, 0, kSampleSize_, false);
+  CloseTtl();
+
+  OpenTtlWithTestCompaction(3);
+  PutValues(0, kSampleSize_);  // T=0:Insert Set1.
+  int64_t partition = kSampleSize_ / 3;
+  SleepCompactCheck(1, 0, partition, false);                   // Part dropped
+  SleepCompactCheck(0, partition, partition);                  // Part kept
+  SleepCompactCheck(0, 2 * partition, partition, true, true);  // Part changed
+  CloseTtl();
+}
+
+// Insert some key-values which KeyMayExist should be able to get and check that
+// values returned are fine
+TEST_F(TtlTest, KeyMayExist) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl();
+  PutValues(0, kSampleSize_, false);
+
+  SimpleKeyMayExistCheck();
+
+  CloseTtl();
+}
+
+TEST_F(TtlTest, MultiGetTest) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl();
+  PutValues(0, kSampleSize_, false);
+
+  SimpleMultiGetTest();
+
+  CloseTtl();
+}
+
+TEST_F(TtlTest, ColumnFamiliesTest) {
+  DB* db;
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_.get();
+
+  DB::Open(options, dbname_, &db);
+  ColumnFamilyHandle* handle;
+  ASSERT_OK(db->CreateColumnFamily(ColumnFamilyOptions(options),
+                                   "ttl_column_family", &handle));
+
+  delete handle;
+  delete db;
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(ColumnFamilyDescriptor(
+      kDefaultColumnFamilyName, ColumnFamilyOptions(options)));
+  column_families.push_back(ColumnFamilyDescriptor(
+      "ttl_column_family", ColumnFamilyOptions(options)));
+
+  std::vector<ColumnFamilyHandle*> handles;
+
+  ASSERT_OK(DBWithTTL::Open(DBOptions(options), dbname_, column_families,
+                            &handles, &db_ttl_, {3, 5}, false));
+  ASSERT_EQ(handles.size(), 2U);
+  ColumnFamilyHandle* new_handle;
+  ASSERT_OK(db_ttl_->CreateColumnFamilyWithTtl(options, "ttl_column_family_2",
+                                               &new_handle, 2));
+  handles.push_back(new_handle);
+
+  MakeKVMap(kSampleSize_);
+  PutValues(0, kSampleSize_, false, handles[0]);
+  PutValues(0, kSampleSize_, false, handles[1]);
+  PutValues(0, kSampleSize_, false, handles[2]);
+
+  // everything should be there after 1 second
+  SleepCompactCheck(1, 0, kSampleSize_, true, false, handles[0]);
+  SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]);
+  SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[2]);
+
+  // only column family 1 should be alive after 4 seconds
+  SleepCompactCheck(3, 0, kSampleSize_, false, false, handles[0]);
+  SleepCompactCheck(0, 0, kSampleSize_, true, false, handles[1]);
+  SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]);
+
+  // nothing should be there after 6 seconds
+  SleepCompactCheck(2, 0, kSampleSize_, false, false, handles[0]);
+  SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[1]);
+  SleepCompactCheck(0, 0, kSampleSize_, false, false, handles[2]);
+
+  for (auto h : handles) {
+    delete h;
+  }
+  delete db_ttl_;
+  db_ttl_ = nullptr;
+}
+
+// Puts a set of values and checks its absence using Get after ttl
+TEST_F(TtlTest, ChangeTtlOnOpenDb) {
+  MakeKVMap(kSampleSize_);
+
+  OpenTtl(1);  // T=0:Open the db with ttl = 2
+  SetTtl(3);
+  PutValues(0, kSampleSize_);  // T=0:Insert Set1. Delete at t=2
+  SleepCompactCheck(2, 0, kSampleSize_, true);  // T=2:Set1 should be there
+  CloseTtl();
+}
+
+// Test DeleteRange for DBWithTtl
+TEST_F(TtlTest, DeleteRangeTest) {
+  OpenTtl();
+  ASSERT_OK(db_ttl_->Put(WriteOptions(), "a", "val"));
+  MakeDeleteRange("a", "b");
+  ASSERT_OK(db_ttl_->Put(WriteOptions(), "c", "val"));
+  MakeDeleteRange("b", "d");
+  ASSERT_OK(db_ttl_->Put(WriteOptions(), "e", "val"));
+  MakeDeleteRange("d", "e");
+  // first iteration verifies query correctness in memtable, second verifies
+  // query correctness for a single SST file
+  for (int i = 0; i < 2; i++) {
+    if (i > 0) {
+      ASSERT_OK(db_ttl_->Flush(FlushOptions()));
+    }
+    std::string value;
+    ASSERT_TRUE(db_ttl_->Get(ReadOptions(), "a", &value).IsNotFound());
+    ASSERT_TRUE(db_ttl_->Get(ReadOptions(), "c", &value).IsNotFound());
+    ASSERT_OK(db_ttl_->Get(ReadOptions(), "e", &value));
+  }
+  CloseTtl();
+}
+
+class DummyFilter : public CompactionFilter {
+ public:
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
+    return false;
+  }
+
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "DummyFilter"; }
+};
+
+class DummyFilterFactory : public CompactionFilterFactory {
+ public:
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "DummyFilterFactory"; }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context&) override {
+    std::unique_ptr<CompactionFilter> f(new DummyFilter());
+    return f;
+  }
+};
+
+static int RegisterTestObjects(ObjectLibrary& library,
+                               const std::string& /*arg*/) {
+  library.AddFactory<CompactionFilter>(
+      "DummyFilter", [](const std::string& /*uri*/,
+                        std::unique_ptr<CompactionFilter>* /*guard*/,
+                        std::string* /* errmsg */) {
+        static DummyFilter dummy;
+        return &dummy;
+      });
+  library.AddFactory<CompactionFilterFactory>(
+      "DummyFilterFactory", [](const std::string& /*uri*/,
+                               std::unique_ptr<CompactionFilterFactory>* guard,
+                               std::string* /* errmsg */) {
+        guard->reset(new DummyFilterFactory());
+        return guard->get();
+      });
+  return 2;
+}
+
+class TtlOptionsTest : public testing::Test {
+ public:
+  TtlOptionsTest() {
+    config_options_.registry->AddLibrary("RegisterTtlObjects",
+                                         RegisterTtlObjects, "");
+    config_options_.registry->AddLibrary("RegisterTtlTestObjects",
+                                         RegisterTestObjects, "");
+  }
+  ConfigOptions config_options_;
+};
+
+TEST_F(TtlOptionsTest, LoadTtlCompactionFilter) {
+  const CompactionFilter* filter = nullptr;
+
+  ASSERT_OK(CompactionFilter::CreateFromString(
+      config_options_, TtlCompactionFilter::kClassName(), &filter));
+  ASSERT_NE(filter, nullptr);
+  ASSERT_STREQ(filter->Name(), TtlCompactionFilter::kClassName());
+  auto ttl = filter->GetOptions<int32_t>("TTL");
+  ASSERT_NE(ttl, nullptr);
+  ASSERT_EQ(*ttl, 0);
+  ASSERT_OK(filter->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+  delete filter;
+  filter = nullptr;
+
+  ASSERT_OK(CompactionFilter::CreateFromString(
+      config_options_, "id=TtlCompactionFilter; ttl=123", &filter));
+  ASSERT_NE(filter, nullptr);
+  ttl = filter->GetOptions<int32_t>("TTL");
+  ASSERT_NE(ttl, nullptr);
+  ASSERT_EQ(*ttl, 123);
+  ASSERT_OK(filter->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+  delete filter;
+  filter = nullptr;
+
+  ASSERT_OK(CompactionFilter::CreateFromString(
+      config_options_,
+      "id=TtlCompactionFilter; ttl=456; user_filter=DummyFilter;", &filter));
+  ASSERT_NE(filter, nullptr);
+  auto inner = filter->CheckedCast<DummyFilter>();
+  ASSERT_NE(inner, nullptr);
+  ASSERT_OK(filter->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+  std::string mismatch;
+  std::string opts_str = filter->ToString(config_options_);
+  const CompactionFilter* copy = nullptr;
+  ASSERT_OK(
+      CompactionFilter::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(filter->AreEquivalent(config_options_, copy, &mismatch));
+  delete filter;
+  delete copy;
+}
+
+TEST_F(TtlOptionsTest, LoadTtlCompactionFilterFactory) {
+  std::shared_ptr<CompactionFilterFactory> cff;
+
+  ASSERT_OK(CompactionFilterFactory::CreateFromString(
+      config_options_, TtlCompactionFilterFactory::kClassName(), &cff));
+  ASSERT_NE(cff.get(), nullptr);
+  ASSERT_STREQ(cff->Name(), TtlCompactionFilterFactory::kClassName());
+  auto ttl = cff->GetOptions<int32_t>("TTL");
+  ASSERT_NE(ttl, nullptr);
+  ASSERT_EQ(*ttl, 0);
+  ASSERT_OK(cff->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+
+  ASSERT_OK(CompactionFilterFactory::CreateFromString(
+      config_options_, "id=TtlCompactionFilterFactory; ttl=123", &cff));
+  ASSERT_NE(cff.get(), nullptr);
+  ASSERT_STREQ(cff->Name(), TtlCompactionFilterFactory::kClassName());
+  ttl = cff->GetOptions<int32_t>("TTL");
+  ASSERT_NE(ttl, nullptr);
+  ASSERT_EQ(*ttl, 123);
+  ASSERT_OK(cff->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+
+  ASSERT_OK(CompactionFilterFactory::CreateFromString(
+      config_options_,
+      "id=TtlCompactionFilterFactory; ttl=456; "
+      "user_filter_factory=DummyFilterFactory;",
+      &cff));
+  ASSERT_NE(cff.get(), nullptr);
+  auto filter = cff->CreateCompactionFilter(CompactionFilter::Context());
+  ASSERT_NE(filter.get(), nullptr);
+  auto ttlf = filter->CheckedCast<TtlCompactionFilter>();
+  ASSERT_EQ(filter.get(), ttlf);
+  auto user = filter->CheckedCast<DummyFilter>();
+  ASSERT_NE(user, nullptr);
+  ASSERT_OK(cff->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+
+  std::string opts_str = cff->ToString(config_options_);
+  std::string mismatch;
+  std::shared_ptr<CompactionFilterFactory> copy;
+  ASSERT_OK(CompactionFilterFactory::CreateFromString(config_options_, opts_str,
+                                                      &copy));
+  ASSERT_TRUE(cff->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+
+TEST_F(TtlOptionsTest, LoadTtlMergeOperator) {
+  std::shared_ptr<MergeOperator> mo;
+
+  config_options_.invoke_prepare_options = false;
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options_, TtlMergeOperator::kClassName(), &mo));
+  ASSERT_NE(mo.get(), nullptr);
+  ASSERT_STREQ(mo->Name(), TtlMergeOperator::kClassName());
+  ASSERT_NOK(mo->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+
+  config_options_.invoke_prepare_options = true;
+  ASSERT_OK(MergeOperator::CreateFromString(
+      config_options_, "id=TtlMergeOperator; user_operator=bytesxor", &mo));
+  ASSERT_NE(mo.get(), nullptr);
+  ASSERT_STREQ(mo->Name(), TtlMergeOperator::kClassName());
+  ASSERT_OK(mo->ValidateOptions(DBOptions(), ColumnFamilyOptions()));
+  auto ttl_mo = mo->CheckedCast<TtlMergeOperator>();
+  ASSERT_EQ(mo.get(), ttl_mo);
+  auto user = ttl_mo->CheckedCast<BytesXOROperator>();
+  ASSERT_NE(user, nullptr);
+
+  std::string mismatch;
+  std::string opts_str = mo->ToString(config_options_);
+  std::shared_ptr<MergeOperator> copy;
+  ASSERT_OK(MergeOperator::CreateFromString(config_options_, opts_str, &copy));
+  ASSERT_TRUE(mo->AreEquivalent(config_options_, copy.get(), &mismatch));
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+// A black-box test for the ttl wrapper around rocksdb
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as DBWithTTL is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/util_merge_operators_test.cc b/src/rocksdb/utilities/util_merge_operators_test.cc
new file mode 100644
index 000000000..fed6f1a75
--- /dev/null
+++ b/src/rocksdb/utilities/util_merge_operators_test.cc
@@ -0,0 +1,100 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class UtilMergeOperatorTest : public testing::Test {
+ public:
+  UtilMergeOperatorTest() {}
+
+  std::string FullMergeV2(std::string existing_value,
+                          std::vector<std::string> operands,
+                          std::string key = "") {
+    std::string result;
+    Slice result_operand(nullptr, 0);
+
+    Slice existing_value_slice(existing_value);
+    std::vector<Slice> operands_slice(operands.begin(), operands.end());
+
+    const MergeOperator::MergeOperationInput merge_in(
+        key, &existing_value_slice, operands_slice, nullptr);
+    MergeOperator::MergeOperationOutput merge_out(result, result_operand);
+    merge_operator_->FullMergeV2(merge_in, &merge_out);
+
+    if (result_operand.data()) {
+      result.assign(result_operand.data(), result_operand.size());
+    }
+    return result;
+  }
+
+  std::string FullMergeV2(std::vector<std::string> operands,
+                          std::string key = "") {
+    std::string result;
+    Slice result_operand(nullptr, 0);
+
+    std::vector<Slice> operands_slice(operands.begin(), operands.end());
+
+    const MergeOperator::MergeOperationInput merge_in(key, nullptr,
+                                                      operands_slice, nullptr);
+    MergeOperator::MergeOperationOutput merge_out(result, result_operand);
+    merge_operator_->FullMergeV2(merge_in, &merge_out);
+
+    if (result_operand.data()) {
+      result.assign(result_operand.data(), result_operand.size());
+    }
+    return result;
+  }
+
+  std::string PartialMerge(std::string left, std::string right,
+                           std::string key = "") {
+    std::string result;
+
+    merge_operator_->PartialMerge(key, left, right, &result, nullptr);
+    return result;
+  }
+
+  std::string PartialMergeMulti(std::deque<std::string> operands,
+                                std::string key = "") {
+    std::string result;
+    std::deque<Slice> operands_slice(operands.begin(), operands.end());
+
+    merge_operator_->PartialMergeMulti(key, operands_slice, &result, nullptr);
+    return result;
+  }
+
+ protected:
+  std::shared_ptr<MergeOperator> merge_operator_;
+};
+
+TEST_F(UtilMergeOperatorTest, MaxMergeOperator) {
+  merge_operator_ = MergeOperators::CreateMaxOperator();
+
+  EXPECT_EQ("B", FullMergeV2("B", {"A"}));
+  EXPECT_EQ("B", FullMergeV2("A", {"B"}));
+  EXPECT_EQ("", FullMergeV2({"", "", ""}));
+  EXPECT_EQ("A", FullMergeV2({"A"}));
+  EXPECT_EQ("ABC", FullMergeV2({"ABC"}));
+  EXPECT_EQ("Z", FullMergeV2({"ABC", "Z", "C", "AXX"}));
+  EXPECT_EQ("ZZZ", FullMergeV2({"ABC", "CC", "Z", "ZZZ"}));
+  EXPECT_EQ("a", FullMergeV2("a", {"ABC", "CC", "Z", "ZZZ"}));
+
+  EXPECT_EQ("z", PartialMergeMulti({"a", "z", "efqfqwgwew", "aaz", "hhhhh"}));
+
+  EXPECT_EQ("b", PartialMerge("a", "b"));
+  EXPECT_EQ("z", PartialMerge("z", "azzz"));
+  EXPECT_EQ("a", PartialMerge("a", ""));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/utilities/wal_filter.cc b/src/rocksdb/utilities/wal_filter.cc
new file mode 100644
index 000000000..98bba3610
--- /dev/null
+++ b/src/rocksdb/utilities/wal_filter.cc
@@ -0,0 +1,23 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/wal_filter.h"
+
+#include <memory>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/customizable_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status WalFilter::CreateFromString(const ConfigOptions& config_options,
+                                   const std::string& value,
+                                   WalFilter** filter) {
+  Status s =
+      LoadStaticObject<WalFilter>(config_options, value, nullptr, filter);
+  return s;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc
new file mode 100644
index 000000000..408243b3f
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -0,0 +1,695 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+#include <memory>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "memory/arena.h"
+#include "memtable/skiplist.h"
+#include "options/db_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct WriteBatchWithIndex::Rep {
+  explicit Rep(const Comparator* index_comparator, size_t reserved_bytes = 0,
+               size_t max_bytes = 0, bool _overwrite_key = false,
+               size_t protection_bytes_per_key = 0)
+      : write_batch(reserved_bytes, max_bytes, protection_bytes_per_key,
+                    index_comparator ? index_comparator->timestamp_size() : 0),
+        comparator(index_comparator, &write_batch),
+        skip_list(comparator, &arena),
+        overwrite_key(_overwrite_key),
+        last_entry_offset(0),
+        last_sub_batch_offset(0),
+        sub_batch_cnt(1) {}
+  ReadableWriteBatch write_batch;
+  WriteBatchEntryComparator comparator;
+  Arena arena;
+  WriteBatchEntrySkipList skip_list;
+  bool overwrite_key;
+  size_t last_entry_offset;
+  // The starting offset of the last sub-batch. A sub-batch starts right before
+  // inserting a key that is a duplicate of a key in the last sub-batch. Zero,
+  // the default, means that no duplicate key is detected so far.
+  size_t last_sub_batch_offset;
+  // Total number of sub-batches in the write batch. Default is 1.
+  size_t sub_batch_cnt;
+
+  // Remember current offset of internal write batch, which is used as
+  // the starting offset of the next record.
+  void SetLastEntryOffset() { last_entry_offset = write_batch.GetDataSize(); }
+
+  // In overwrite mode, find the existing entry for the same key and update it
+  // to point to the current entry.
+  // Return true if the key is found and updated.
+  bool UpdateExistingEntry(ColumnFamilyHandle* column_family, const Slice& key,
+                           WriteType type);
+  bool UpdateExistingEntryWithCfId(uint32_t column_family_id, const Slice& key,
+                                   WriteType type);
+
+  // Add the recent entry to the update.
+  // In overwrite mode, if key already exists in the index, update it.
+  void AddOrUpdateIndex(ColumnFamilyHandle* column_family, const Slice& key,
+                        WriteType type);
+  void AddOrUpdateIndex(const Slice& key, WriteType type);
+
+  // Allocate an index entry pointing to the last entry in the write batch and
+  // put it to skip list.
+  void AddNewEntry(uint32_t column_family_id);
+
+  // Clear all updates buffered in this batch.
+  void Clear();
+  void ClearIndex();
+
+  // Rebuild index by reading all records from the batch.
+  // Returns non-ok status on corruption.
+  Status ReBuildIndex();
+};
+
+bool WriteBatchWithIndex::Rep::UpdateExistingEntry(
+    ColumnFamilyHandle* column_family, const Slice& key, WriteType type) {
+  uint32_t cf_id = GetColumnFamilyID(column_family);
+  return UpdateExistingEntryWithCfId(cf_id, key, type);
+}
+
+bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId(
+    uint32_t column_family_id, const Slice& key, WriteType type) {
+  if (!overwrite_key) {
+    return false;
+  }
+
+  WBWIIteratorImpl iter(column_family_id, &skip_list, &write_batch,
+                        &comparator);
+  iter.Seek(key);
+  if (!iter.Valid()) {
+    return false;
+  } else if (!iter.MatchesKey(column_family_id, key)) {
+    return false;
+  } else {
+    // Move to the end of this key (NextKey-Prev)
+    iter.NextKey();  // Move to the next key
+    if (iter.Valid()) {
+      iter.Prev();  // Move back one entry
+    } else {
+      iter.SeekToLast();
+    }
+  }
+  WriteBatchIndexEntry* non_const_entry =
+      const_cast<WriteBatchIndexEntry*>(iter.GetRawEntry());
+  if (LIKELY(last_sub_batch_offset <= non_const_entry->offset)) {
+    last_sub_batch_offset = last_entry_offset;
+    sub_batch_cnt++;
+  }
+  if (type == kMergeRecord) {
+    return false;
+  } else {
+    non_const_entry->offset = last_entry_offset;
+    return true;
+  }
+}
+
+void WriteBatchWithIndex::Rep::AddOrUpdateIndex(
+    ColumnFamilyHandle* column_family, const Slice& key, WriteType type) {
+  if (!UpdateExistingEntry(column_family, key, type)) {
+    uint32_t cf_id = GetColumnFamilyID(column_family);
+    const auto* cf_cmp = GetColumnFamilyUserComparator(column_family);
+    if (cf_cmp != nullptr) {
+      comparator.SetComparatorForCF(cf_id, cf_cmp);
+    }
+    AddNewEntry(cf_id);
+  }
+}
+
+void WriteBatchWithIndex::Rep::AddOrUpdateIndex(const Slice& key,
+                                                WriteType type) {
+  if (!UpdateExistingEntryWithCfId(0, key, type)) {
+    AddNewEntry(0);
+  }
+}
+
+void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
+  const std::string& wb_data = write_batch.Data();
+  Slice entry_ptr = Slice(wb_data.data() + last_entry_offset,
+                          wb_data.size() - last_entry_offset);
+  // Extract key
+  Slice key;
+  bool success =
+      ReadKeyFromWriteBatchEntry(&entry_ptr, &key, column_family_id != 0);
+#ifdef NDEBUG
+  (void)success;
+#endif
+  assert(success);
+
+  const Comparator* const ucmp = comparator.GetComparator(column_family_id);
+  size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0;
+
+  if (ts_sz > 0) {
+    key.remove_suffix(ts_sz);
+  }
+
+  auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry));
+  auto* index_entry =
+      new (mem) WriteBatchIndexEntry(last_entry_offset, column_family_id,
+                                     key.data() - wb_data.data(), key.size());
+  skip_list.Insert(index_entry);
+}
+
+void WriteBatchWithIndex::Rep::Clear() {
+  write_batch.Clear();
+  ClearIndex();
+}
+
+void WriteBatchWithIndex::Rep::ClearIndex() {
+  skip_list.~WriteBatchEntrySkipList();
+  arena.~Arena();
+  new (&arena) Arena();
+  new (&skip_list) WriteBatchEntrySkipList(comparator, &arena);
+  last_entry_offset = 0;
+  last_sub_batch_offset = 0;
+  sub_batch_cnt = 1;
+}
+
+Status WriteBatchWithIndex::Rep::ReBuildIndex() {
+  Status s;
+
+  ClearIndex();
+
+  if (write_batch.Count() == 0) {
+    // Nothing to re-index
+    return s;
+  }
+
+  size_t offset = WriteBatchInternal::GetFirstOffset(&write_batch);
+
+  Slice input(write_batch.Data());
+  input.remove_prefix(offset);
+
+  // Loop through all entries in Rep and add each one to the index
+  uint32_t found = 0;
+  while (s.ok() && !input.empty()) {
+    Slice key, value, blob, xid;
+    uint32_t column_family_id = 0;  // default
+    char tag = 0;
+
+    // set offset of current entry for call to AddNewEntry()
+    last_entry_offset = input.data() - write_batch.Data().data();
+
+    s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key, &value,
+                                 &blob, &xid);
+    if (!s.ok()) {
+      break;
+    }
+
+    switch (tag) {
+      case kTypeColumnFamilyValue:
+      case kTypeValue:
+        found++;
+        if (!UpdateExistingEntryWithCfId(column_family_id, key, kPutRecord)) {
+          AddNewEntry(column_family_id);
+        }
+        break;
+      case kTypeColumnFamilyDeletion:
+      case kTypeDeletion:
+        found++;
+        if (!UpdateExistingEntryWithCfId(column_family_id, key,
+                                         kDeleteRecord)) {
+          AddNewEntry(column_family_id);
+        }
+        break;
+      case kTypeColumnFamilySingleDeletion:
+      case kTypeSingleDeletion:
+        found++;
+        if (!UpdateExistingEntryWithCfId(column_family_id, key,
+                                         kSingleDeleteRecord)) {
+          AddNewEntry(column_family_id);
+        }
+        break;
+      case kTypeColumnFamilyMerge:
+      case kTypeMerge:
+        found++;
+        if (!UpdateExistingEntryWithCfId(column_family_id, key, kMergeRecord)) {
+          AddNewEntry(column_family_id);
+        }
+        break;
+      case kTypeLogData:
+      case kTypeBeginPrepareXID:
+      case kTypeBeginPersistedPrepareXID:
+      case kTypeBeginUnprepareXID:
+      case kTypeEndPrepareXID:
+      case kTypeCommitXID:
+      case kTypeCommitXIDAndTimestamp:
+      case kTypeRollbackXID:
+      case kTypeNoop:
+        break;
+      default:
+        return Status::Corruption(
+            "unknown WriteBatch tag in ReBuildIndex",
+            std::to_string(static_cast<unsigned int>(tag)));
+    }
+  }
+
+  if (s.ok() && found != write_batch.Count()) {
+    s = Status::Corruption("WriteBatch has wrong count");
+  }
+
+  return s;
+}
+
+WriteBatchWithIndex::WriteBatchWithIndex(
+    const Comparator* default_index_comparator, size_t reserved_bytes,
+    bool overwrite_key, size_t max_bytes, size_t protection_bytes_per_key)
+    : rep(new Rep(default_index_comparator, reserved_bytes, max_bytes,
+                  overwrite_key, protection_bytes_per_key)) {}
+
+WriteBatchWithIndex::~WriteBatchWithIndex() {}
+
+WriteBatchWithIndex::WriteBatchWithIndex(WriteBatchWithIndex&&) = default;
+
+WriteBatchWithIndex& WriteBatchWithIndex::operator=(WriteBatchWithIndex&&) =
+    default;
+
+WriteBatch* WriteBatchWithIndex::GetWriteBatch() { return &rep->write_batch; }
+
+size_t WriteBatchWithIndex::SubBatchCnt() { return rep->sub_batch_cnt; }
+
+WBWIIterator* WriteBatchWithIndex::NewIterator() {
+  return new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch,
+                              &(rep->comparator));
+}
+
+WBWIIterator* WriteBatchWithIndex::NewIterator(
+    ColumnFamilyHandle* column_family) {
+  return new WBWIIteratorImpl(GetColumnFamilyID(column_family),
+                              &(rep->skip_list), &rep->write_batch,
+                              &(rep->comparator));
+}
+
+Iterator* WriteBatchWithIndex::NewIteratorWithBase(
+    ColumnFamilyHandle* column_family, Iterator* base_iterator,
+    const ReadOptions* read_options) {
+  auto wbwiii =
+      new WBWIIteratorImpl(GetColumnFamilyID(column_family), &(rep->skip_list),
+                           &rep->write_batch, &rep->comparator);
+  return new BaseDeltaIterator(column_family, base_iterator, wbwiii,
+                               GetColumnFamilyUserComparator(column_family),
+                               read_options);
+}
+
+Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) {
+  // default column family's comparator
+  auto wbwiii = new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch,
+                                     &rep->comparator);
+  return new BaseDeltaIterator(nullptr, base_iterator, wbwiii,
+                               rep->comparator.default_comparator());
+}
+
+Status WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& value) {
+  rep->SetLastEntryOffset();
+  auto s = rep->write_batch.Put(column_family, key, value);
+  if (s.ok()) {
+    rep->AddOrUpdateIndex(column_family, key, kPutRecord);
+  }
+  return s;
+}
+
+Status WriteBatchWithIndex::Put(const Slice& key, const Slice& value) {
+  rep->SetLastEntryOffset();
+  auto s = rep->write_batch.Put(key, value);
+  if (s.ok()) {
+    rep->AddOrUpdateIndex(key, kPutRecord);
+  }
+  return s;
+}
+
+Status WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family,
+                                const Slice& /*key*/, const Slice& /*ts*/,
+                                const Slice& /*value*/) {
+  if (!column_family) {
+    return Status::InvalidArgument("column family handle cannot be nullptr");
+  }
+  // TODO: support WBWI::Put() with timestamp.
+  return Status::NotSupported();
+}
+
+Status WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family,
+                                   const Slice& key) {
+  rep->SetLastEntryOffset();
+  auto s = rep->write_batch.Delete(column_family, key);
+  if (s.ok()) {
+    rep->AddOrUpdateIndex(column_family, key, kDeleteRecord);
+  }
+  return s;
+}
+
+Status WriteBatchWithIndex::Delete(const Slice& key) {
+  rep->SetLastEntryOffset();
+  auto s = rep->write_batch.Delete(key);
+  if (s.ok()) {
+    rep->AddOrUpdateIndex(key, kDeleteRecord);
+  }
+  return s;
+}
+
+Status WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family,
+                                   const Slice& /*key*/, const Slice& /*ts*/) {
+  if (!column_family) {
+    return Status::InvalidArgument("column family handle cannot be nullptr");
+  }
+  // TODO: support WBWI::Delete() with timestamp.
+  return Status::NotSupported();
+}
+
+Status WriteBatchWithIndex::SingleDelete(ColumnFamilyHandle* column_family,
+                                         const Slice& key) {
+  rep->SetLastEntryOffset();
+  auto s = rep->write_batch.SingleDelete(column_family, key);
+  if (s.ok()) {
+    rep->AddOrUpdateIndex(column_family, key, kSingleDeleteRecord);
+  }
+  return s;
+}
+
+Status WriteBatchWithIndex::SingleDelete(const Slice& key) {
+  rep->SetLastEntryOffset();
+  auto s = rep->write_batch.SingleDelete(key);
+  if (s.ok()) {
+    rep->AddOrUpdateIndex(key, kSingleDeleteRecord);
+  }
+  return s;
+}
+
+Status WriteBatchWithIndex::SingleDelete(ColumnFamilyHandle* column_family,
+                                         const Slice& /*key*/,
+                                         const Slice& /*ts*/) {
+  if (!column_family) {
+    return Status::InvalidArgument("column family handle cannot be nullptr");
+  }
+  // TODO: support WBWI::SingleDelete() with timestamp.
+  return Status::NotSupported();
+}
+
+Status WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family,
+                                  const Slice& key, const Slice& value) {
+  rep->SetLastEntryOffset();
+  auto s = rep->write_batch.Merge(column_family, key, value);
+  if (s.ok()) {
+    rep->AddOrUpdateIndex(column_family, key, kMergeRecord);
+  }
+  return s;
+}
+
+Status WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) {
+  rep->SetLastEntryOffset();
+  auto s = rep->write_batch.Merge(key, value);
+  if (s.ok()) {
+    rep->AddOrUpdateIndex(key, kMergeRecord);
+  }
+  return s;
+}
+
+Status WriteBatchWithIndex::PutLogData(const Slice& blob) {
+  return rep->write_batch.PutLogData(blob);
+}
+
+void WriteBatchWithIndex::Clear() { rep->Clear(); }
+
+Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family,
+                                         const DBOptions& options,
+                                         const Slice& key, std::string* value) {
+  Status s;
+  WriteBatchWithIndexInternal wbwii(&options, column_family);
+  auto result = wbwii.GetFromBatch(this, key, value, &s);
+
+  switch (result) {
+    case WBWIIteratorImpl::kFound:
+    case WBWIIteratorImpl::kError:
+      // use returned status
+      break;
+    case WBWIIteratorImpl::kDeleted:
+    case WBWIIteratorImpl::kNotFound:
+      s = Status::NotFound();
+      break;
+    case WBWIIteratorImpl::kMergeInProgress:
+      s = Status::MergeInProgress();
+      break;
+    default:
+      assert(false);
+  }
+
+  return s;
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
+                                              const ReadOptions& read_options,
+                                              const Slice& key,
+                                              std::string* value) {
+  assert(value != nullptr);
+  PinnableSlice pinnable_val(value);
+  assert(!pinnable_val.IsPinned());
+  auto s = GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key,
+                             &pinnable_val);
+  if (s.ok() && pinnable_val.IsPinned()) {
+    value->assign(pinnable_val.data(), pinnable_val.size());
+  }  // else value is already assigned
+  return s;
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
+                                              const ReadOptions& read_options,
+                                              const Slice& key,
+                                              PinnableSlice* pinnable_val) {
+  return GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key,
+                           pinnable_val);
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
+                                              const ReadOptions& read_options,
+                                              ColumnFamilyHandle* column_family,
+                                              const Slice& key,
+                                              std::string* value) {
+  assert(value != nullptr);
+  PinnableSlice pinnable_val(value);
+  assert(!pinnable_val.IsPinned());
+  auto s =
+      GetFromBatchAndDB(db, read_options, column_family, key, &pinnable_val);
+  if (s.ok() && pinnable_val.IsPinned()) {
+    value->assign(pinnable_val.data(), pinnable_val.size());
+  }  // else value is already assigned
+  return s;
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
+                                              const ReadOptions& read_options,
+                                              ColumnFamilyHandle* column_family,
+                                              const Slice& key,
+                                              PinnableSlice* pinnable_val) {
+  return GetFromBatchAndDB(db, read_options, column_family, key, pinnable_val,
+                           nullptr);
+}
+
+Status WriteBatchWithIndex::GetFromBatchAndDB(
+    DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+    const Slice& key, PinnableSlice* pinnable_val, ReadCallback* callback) {
+  const Comparator* const ucmp = rep->comparator.GetComparator(column_family);
+  size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0;
+  if (ts_sz > 0 && !read_options.timestamp) {
+    return Status::InvalidArgument("Must specify timestamp");
+  }
+
+  Status s;
+  WriteBatchWithIndexInternal wbwii(db, column_family);
+
+  // Since the lifetime of the WriteBatch is the same as that of the transaction
+  // we cannot pin it as otherwise the returned value will not be available
+  // after the transaction finishes.
+  std::string& batch_value = *pinnable_val->GetSelf();
+  auto result = wbwii.GetFromBatch(this, key, &batch_value, &s);
+
+  if (result == WBWIIteratorImpl::kFound) {
+    pinnable_val->PinSelf();
+    return s;
+  } else if (!s.ok() || result == WBWIIteratorImpl::kError) {
+    return s;
+  } else if (result == WBWIIteratorImpl::kDeleted) {
+    return Status::NotFound();
+  }
+  assert(result == WBWIIteratorImpl::kMergeInProgress ||
+         result == WBWIIteratorImpl::kNotFound);
+
+  // Did not find key in batch OR could not resolve Merges.  Try DB.
+  if (!callback) {
+    s = db->Get(read_options, column_family, key, pinnable_val);
+  } else {
+    DBImpl::GetImplOptions get_impl_options;
+    get_impl_options.column_family = column_family;
+    get_impl_options.value = pinnable_val;
+    get_impl_options.callback = callback;
+    s = static_cast_with_check<DBImpl>(db->GetRootDB())
+            ->GetImpl(read_options, key, get_impl_options);
+  }
+
+  if (s.ok() || s.IsNotFound()) {  // DB Get Succeeded
+    if (result == WBWIIteratorImpl::kMergeInProgress) {
+      // Merge result from DB with merges in Batch
+      std::string merge_result;
+      if (s.ok()) {
+        s = wbwii.MergeKey(key, pinnable_val, &merge_result);
+      } else {  // Key not present in db (s.IsNotFound())
+        s = wbwii.MergeKey(key, nullptr, &merge_result);
+      }
+      if (s.ok()) {
+        pinnable_val->Reset();
+        *pinnable_val->GetSelf() = std::move(merge_result);
+        pinnable_val->PinSelf();
+      }
+    }
+  }
+
+  return s;
+}
+
+void WriteBatchWithIndex::MultiGetFromBatchAndDB(
+    DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+    const size_t num_keys, const Slice* keys, PinnableSlice* values,
+    Status* statuses, bool sorted_input) {
+  MultiGetFromBatchAndDB(db, read_options, column_family, num_keys, keys,
+                         values, statuses, sorted_input, nullptr);
+}
+
+void WriteBatchWithIndex::MultiGetFromBatchAndDB(
+    DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+    const size_t num_keys, const Slice* keys, PinnableSlice* values,
+    Status* statuses, bool sorted_input, ReadCallback* callback) {
+  const Comparator* const ucmp = rep->comparator.GetComparator(column_family);
+  size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0;
+  if (ts_sz > 0 && !read_options.timestamp) {
+    for (size_t i = 0; i < num_keys; ++i) {
+      statuses[i] = Status::InvalidArgument("Must specify timestamp");
+    }
+    return;
+  }
+
+  WriteBatchWithIndexInternal wbwii(db, column_family);
+
+  autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+  autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+  // To hold merges from the write batch
+  autovector<std::pair<WBWIIteratorImpl::Result, MergeContext>,
+             MultiGetContext::MAX_BATCH_SIZE>
+      merges;
+  // Since the lifetime of the WriteBatch is the same as that of the transaction
+  // we cannot pin it as otherwise the returned value will not be available
+  // after the transaction finishes.
+  for (size_t i = 0; i < num_keys; ++i) {
+    MergeContext merge_context;
+    std::string batch_value;
+    Status* s = &statuses[i];
+    PinnableSlice* pinnable_val = &values[i];
+    pinnable_val->Reset();
+    auto result =
+        wbwii.GetFromBatch(this, keys[i], &merge_context, &batch_value, s);
+
+    if (result == WBWIIteratorImpl::kFound) {
+      *pinnable_val->GetSelf() = std::move(batch_value);
+      pinnable_val->PinSelf();
+      continue;
+    }
+    if (result == WBWIIteratorImpl::kDeleted) {
+      *s = Status::NotFound();
+      continue;
+    }
+    if (result == WBWIIteratorImpl::kError) {
+      continue;
+    }
+    assert(result == WBWIIteratorImpl::kMergeInProgress ||
+           result == WBWIIteratorImpl::kNotFound);
+    key_context.emplace_back(column_family, keys[i], &values[i],
+                             /*timestamp*/ nullptr, &statuses[i]);
+    merges.emplace_back(result, std::move(merge_context));
+  }
+
+  for (KeyContext& key : key_context) {
+    sorted_keys.emplace_back(&key);
+  }
+
+  // Did not find key in batch OR could not resolve Merges.  Try DB.
+  static_cast_with_check<DBImpl>(db->GetRootDB())
+      ->PrepareMultiGetKeys(key_context.size(), sorted_input, &sorted_keys);
+  static_cast_with_check<DBImpl>(db->GetRootDB())
+      ->MultiGetWithCallback(read_options, column_family, callback,
+                             &sorted_keys);
+
+  for (auto iter = key_context.begin(); iter != key_context.end(); ++iter) {
+    KeyContext& key = *iter;
+    if (key.s->ok() || key.s->IsNotFound()) {  // DB Get Succeeded
+      size_t index = iter - key_context.begin();
+      std::pair<WBWIIteratorImpl::Result, MergeContext>& merge_result =
+          merges[index];
+      if (merge_result.first == WBWIIteratorImpl::kMergeInProgress) {
+        std::string merged_value;
+        // Merge result from DB with merges in Batch
+        if (key.s->ok()) {
+          *key.s = wbwii.MergeKey(*key.key, iter->value, merge_result.second,
+                                  &merged_value);
+        } else {  // Key not present in db (s.IsNotFound())
+          *key.s = wbwii.MergeKey(*key.key, nullptr, merge_result.second,
+                                  &merged_value);
+        }
+        if (key.s->ok()) {
+          key.value->Reset();
+          *key.value->GetSelf() = std::move(merged_value);
+          key.value->PinSelf();
+        }
+      }
+    }
+  }
+}
+
+void WriteBatchWithIndex::SetSavePoint() { rep->write_batch.SetSavePoint(); }
+
+Status WriteBatchWithIndex::RollbackToSavePoint() {
+  Status s = rep->write_batch.RollbackToSavePoint();
+
+  if (s.ok()) {
+    rep->sub_batch_cnt = 1;
+    rep->last_sub_batch_offset = 0;
+    s = rep->ReBuildIndex();
+  }
+
+  return s;
+}
+
+Status WriteBatchWithIndex::PopSavePoint() {
+  return rep->write_batch.PopSavePoint();
+}
+
+void WriteBatchWithIndex::SetMaxBytes(size_t max_bytes) {
+  rep->write_batch.SetMaxBytes(max_bytes);
+}
+
+size_t WriteBatchWithIndex::GetDataSize() const {
+  return rep->write_batch.GetDataSize();
+}
+
+const Comparator* WriteBatchWithIndexInternal::GetUserComparator(
+    const WriteBatchWithIndex& wbwi, uint32_t cf_id) {
+  const WriteBatchEntryComparator& ucmps = wbwi.rep->comparator;
+  return ucmps.GetComparator(cf_id);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc
new file mode 100644
index 000000000..3c9205bf7
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc
@@ -0,0 +1,735 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+BaseDeltaIterator::BaseDeltaIterator(ColumnFamilyHandle* column_family,
+                                     Iterator* base_iterator,
+                                     WBWIIteratorImpl* delta_iterator,
+                                     const Comparator* comparator,
+                                     const ReadOptions* read_options)
+    : forward_(true),
+      current_at_base_(true),
+      equal_keys_(false),
+      status_(Status::OK()),
+      base_iterator_(base_iterator),
+      delta_iterator_(delta_iterator),
+      comparator_(comparator),
+      iterate_upper_bound_(read_options ? read_options->iterate_upper_bound
+                                        : nullptr) {
+  assert(comparator_);
+  wbwii_.reset(new WriteBatchWithIndexInternal(column_family));
+}
+
+bool BaseDeltaIterator::Valid() const {
+  return status_.ok() ? (current_at_base_ ? BaseValid() : DeltaValid()) : false;
+}
+
+void BaseDeltaIterator::SeekToFirst() {
+  forward_ = true;
+  base_iterator_->SeekToFirst();
+  delta_iterator_->SeekToFirst();
+  UpdateCurrent();
+}
+
+void BaseDeltaIterator::SeekToLast() {
+  forward_ = false;
+  base_iterator_->SeekToLast();
+  delta_iterator_->SeekToLast();
+  UpdateCurrent();
+}
+
+void BaseDeltaIterator::Seek(const Slice& k) {
+  forward_ = true;
+  base_iterator_->Seek(k);
+  delta_iterator_->Seek(k);
+  UpdateCurrent();
+}
+
+void BaseDeltaIterator::SeekForPrev(const Slice& k) {
+  forward_ = false;
+  base_iterator_->SeekForPrev(k);
+  delta_iterator_->SeekForPrev(k);
+  UpdateCurrent();
+}
+
+void BaseDeltaIterator::Next() {
+  if (!Valid()) {
+    status_ = Status::NotSupported("Next() on invalid iterator");
+    return;
+  }
+
+  if (!forward_) {
+    // Need to change direction
+    // if our direction was backward and we're not equal, we have two states:
+    // * both iterators are valid: we're already in a good state (current
+    // shows to smaller)
+    // * only one iterator is valid: we need to advance that iterator
+    forward_ = true;
+    equal_keys_ = false;
+    if (!BaseValid()) {
+      assert(DeltaValid());
+      base_iterator_->SeekToFirst();
+    } else if (!DeltaValid()) {
+      delta_iterator_->SeekToFirst();
+    } else if (current_at_base_) {
+      // Change delta from larger than base to smaller
+      AdvanceDelta();
+    } else {
+      // Change base from larger than delta to smaller
+      AdvanceBase();
+    }
+    if (DeltaValid() && BaseValid()) {
+      if (0 == comparator_->CompareWithoutTimestamp(
+                   delta_iterator_->Entry().key, /*a_has_ts=*/false,
+                   base_iterator_->key(), /*b_has_ts=*/false)) {
+        equal_keys_ = true;
+      }
+    }
+  }
+  Advance();
+}
+
+void BaseDeltaIterator::Prev() {
+  if (!Valid()) {
+    status_ = Status::NotSupported("Prev() on invalid iterator");
+    return;
+  }
+
+  if (forward_) {
+    // Need to change direction
+    // if our direction was backward and we're not equal, we have two states:
+    // * both iterators are valid: we're already in a good state (current
+    // shows to smaller)
+    // * only one iterator is valid: we need to advance that iterator
+    forward_ = false;
+    equal_keys_ = false;
+    if (!BaseValid()) {
+      assert(DeltaValid());
+      base_iterator_->SeekToLast();
+    } else if (!DeltaValid()) {
+      delta_iterator_->SeekToLast();
+    } else if (current_at_base_) {
+      // Change delta from less advanced than base to more advanced
+      AdvanceDelta();
+    } else {
+      // Change base from less advanced than delta to more advanced
+      AdvanceBase();
+    }
+    if (DeltaValid() && BaseValid()) {
+      if (0 == comparator_->CompareWithoutTimestamp(
+                   delta_iterator_->Entry().key, /*a_has_ts=*/false,
+                   base_iterator_->key(), /*b_has_ts=*/false)) {
+        equal_keys_ = true;
+      }
+    }
+  }
+
+  Advance();
+}
+
+Slice BaseDeltaIterator::key() const {
+  return current_at_base_ ? base_iterator_->key()
+                          : delta_iterator_->Entry().key;
+}
+
+Slice BaseDeltaIterator::value() const {
+  if (current_at_base_) {
+    return base_iterator_->value();
+  } else {
+    WriteEntry delta_entry = delta_iterator_->Entry();
+    if (wbwii_->GetNumOperands() == 0) {
+      return delta_entry.value;
+    } else if (delta_entry.type == kDeleteRecord ||
+               delta_entry.type == kSingleDeleteRecord) {
+      status_ =
+          wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf());
+    } else if (delta_entry.type == kPutRecord) {
+      status_ = wbwii_->MergeKey(delta_entry.key, &delta_entry.value,
+                                 merge_result_.GetSelf());
+    } else if (delta_entry.type == kMergeRecord) {
+      if (equal_keys_) {
+        Slice base_value = base_iterator_->value();
+        status_ = wbwii_->MergeKey(delta_entry.key, &base_value,
+                                   merge_result_.GetSelf());
+      } else {
+        status_ =
+            wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf());
+      }
+    }
+    merge_result_.PinSelf();
+    return merge_result_;
+  }
+}
+
+Status BaseDeltaIterator::status() const {
+  if (!status_.ok()) {
+    return status_;
+  }
+  if (!base_iterator_->status().ok()) {
+    return base_iterator_->status();
+  }
+  return delta_iterator_->status();
+}
+
+void BaseDeltaIterator::Invalidate(Status s) { status_ = s; }
+
+void BaseDeltaIterator::AssertInvariants() {
+#ifndef NDEBUG
+  bool not_ok = false;
+  if (!base_iterator_->status().ok()) {
+    assert(!base_iterator_->Valid());
+    not_ok = true;
+  }
+  if (!delta_iterator_->status().ok()) {
+    assert(!delta_iterator_->Valid());
+    not_ok = true;
+  }
+  if (not_ok) {
+    assert(!Valid());
+    assert(!status().ok());
+    return;
+  }
+
+  if (!Valid()) {
+    return;
+  }
+  if (!BaseValid()) {
+    assert(!current_at_base_ && delta_iterator_->Valid());
+    return;
+  }
+  if (!DeltaValid()) {
+    assert(current_at_base_ && base_iterator_->Valid());
+    return;
+  }
+  // we don't support those yet
+  assert(delta_iterator_->Entry().type != kMergeRecord &&
+         delta_iterator_->Entry().type != kLogDataRecord);
+  int compare = comparator_->CompareWithoutTimestamp(
+      delta_iterator_->Entry().key, /*a_has_ts=*/false, base_iterator_->key(),
+      /*b_has_ts=*/false);
+  if (forward_) {
+    // current_at_base -> compare < 0
+    assert(!current_at_base_ || compare < 0);
+    // !current_at_base -> compare <= 0
+    assert(current_at_base_ && compare >= 0);
+  } else {
+    // current_at_base -> compare > 0
+    assert(!current_at_base_ || compare > 0);
+    // !current_at_base -> compare <= 0
+    assert(current_at_base_ && compare <= 0);
+  }
+  // equal_keys_ <=> compare == 0
+  assert((equal_keys_ || compare != 0) && (!equal_keys_ || compare == 0));
+#endif
+}
+
+void BaseDeltaIterator::Advance() {
+  if (equal_keys_) {
+    assert(BaseValid() && DeltaValid());
+    AdvanceBase();
+    AdvanceDelta();
+  } else {
+    if (current_at_base_) {
+      assert(BaseValid());
+      AdvanceBase();
+    } else {
+      assert(DeltaValid());
+      AdvanceDelta();
+    }
+  }
+  UpdateCurrent();
+}
+
+void BaseDeltaIterator::AdvanceDelta() {
+  if (forward_) {
+    delta_iterator_->NextKey();
+  } else {
+    delta_iterator_->PrevKey();
+  }
+}
+void BaseDeltaIterator::AdvanceBase() {
+  if (forward_) {
+    base_iterator_->Next();
+  } else {
+    base_iterator_->Prev();
+  }
+}
+
+bool BaseDeltaIterator::BaseValid() const { return base_iterator_->Valid(); }
+bool BaseDeltaIterator::DeltaValid() const { return delta_iterator_->Valid(); }
+void BaseDeltaIterator::UpdateCurrent() {
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
+  status_ = Status::OK();
+  while (true) {
+    auto delta_result = WBWIIteratorImpl::kNotFound;
+    WriteEntry delta_entry;
+    if (DeltaValid()) {
+      assert(delta_iterator_->status().ok());
+      delta_result =
+          delta_iterator_->FindLatestUpdate(wbwii_->GetMergeContext());
+      delta_entry = delta_iterator_->Entry();
+    } else if (!delta_iterator_->status().ok()) {
+      // Expose the error status and stop.
+      current_at_base_ = false;
+      return;
+    }
+    equal_keys_ = false;
+    if (!BaseValid()) {
+      if (!base_iterator_->status().ok()) {
+        // Expose the error status and stop.
+        current_at_base_ = true;
+        return;
+      }
+
+      // Base has finished.
+      if (!DeltaValid()) {
+        // Finished
+        return;
+      }
+      if (iterate_upper_bound_) {
+        if (comparator_->CompareWithoutTimestamp(
+                delta_entry.key, /*a_has_ts=*/false, *iterate_upper_bound_,
+                /*b_has_ts=*/false) >= 0) {
+          // out of upper bound -> finished.
+          return;
+        }
+      }
+      if (delta_result == WBWIIteratorImpl::kDeleted &&
+          wbwii_->GetNumOperands() == 0) {
+        AdvanceDelta();
+      } else {
+        current_at_base_ = false;
+        return;
+      }
+    } else if (!DeltaValid()) {
+      // Delta has finished.
+      current_at_base_ = true;
+      return;
+    } else {
+      int compare =
+          (forward_ ? 1 : -1) * comparator_->CompareWithoutTimestamp(
+                                    delta_entry.key, /*a_has_ts=*/false,
+                                    base_iterator_->key(), /*b_has_ts=*/false);
+      if (compare <= 0) {  // delta bigger or equal
+        if (compare == 0) {
+          equal_keys_ = true;
+        }
+        if (delta_result != WBWIIteratorImpl::kDeleted ||
+            wbwii_->GetNumOperands() > 0) {
+          current_at_base_ = false;
+          return;
+        }
+        // Delta is less advanced and is delete.
+        AdvanceDelta();
+        if (equal_keys_) {
+          AdvanceBase();
+        }
+      } else {
+        current_at_base_ = true;
+        return;
+      }
+    }
+  }
+
+  AssertInvariants();
+#endif  // __clang_analyzer__
+}
+
+void WBWIIteratorImpl::AdvanceKey(bool forward) {
+  if (Valid()) {
+    Slice key = Entry().key;
+    do {
+      if (forward) {
+        Next();
+      } else {
+        Prev();
+      }
+    } while (MatchesKey(column_family_id_, key));
+  }
+}
+
+void WBWIIteratorImpl::NextKey() { AdvanceKey(true); }
+
+void WBWIIteratorImpl::PrevKey() {
+  AdvanceKey(false);  // Move to the tail of the previous key
+  if (Valid()) {
+    AdvanceKey(false);  // Move back another key.  Now we are at the start of
+                        // the previous key
+    if (Valid()) {      // Still a valid
+      Next();           // Move forward one onto this key
+    } else {
+      SeekToFirst();  // Not valid, move to the start
+    }
+  }
+}
+
+WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate(
+    MergeContext* merge_context) {
+  if (Valid()) {
+    Slice key = Entry().key;
+    return FindLatestUpdate(key, merge_context);
+  } else {
+    merge_context->Clear();  // Clear any entries in the MergeContext
+    return WBWIIteratorImpl::kNotFound;
+  }
+}
+
+WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate(
+    const Slice& key, MergeContext* merge_context) {
+  Result result = WBWIIteratorImpl::kNotFound;
+  merge_context->Clear();  // Clear any entries in the MergeContext
+  // TODO(agiardullo): consider adding support for reverse iteration
+  if (!Valid()) {
+    return result;
+  } else if (comparator_->CompareKey(column_family_id_, Entry().key, key) !=
+             0) {
+    return result;
+  } else {
+    // We want to iterate in the reverse order that the writes were added to the
+    // batch.  Since we don't have a reverse iterator, we must seek past the
+    // end. We do this by seeking to the next key, and then back one step
+    NextKey();
+    if (Valid()) {
+      Prev();
+    } else {
+      SeekToLast();
+    }
+
+    // We are at the end of the iterator for this key.  Search backwards for the
+    // last Put or Delete, accumulating merges along the way.
+    while (Valid()) {
+      const WriteEntry entry = Entry();
+      if (comparator_->CompareKey(column_family_id_, entry.key, key) != 0) {
+        break;  // Unexpected error or we've reached a different next key
+      }
+
+      switch (entry.type) {
+        case kPutRecord:
+          return WBWIIteratorImpl::kFound;
+        case kDeleteRecord:
+          return WBWIIteratorImpl::kDeleted;
+        case kSingleDeleteRecord:
+          return WBWIIteratorImpl::kDeleted;
+        case kMergeRecord:
+          result = WBWIIteratorImpl::kMergeInProgress;
+          merge_context->PushOperand(entry.value);
+          break;
+        case kLogDataRecord:
+          break;  // ignore
+        case kXIDRecord:
+          break;  // ignore
+        default:
+          return WBWIIteratorImpl::kError;
+      }  // end switch statement
+      Prev();
+    }  // End while Valid()
+    // At this point, we have been through the whole list and found no Puts or
+    // Deletes. The iterator points to the previous key.  Move the iterator back
+    // onto this one.
+    if (Valid()) {
+      Next();
+    } else {
+      SeekToFirst();
+    }
+  }
+  return result;
+}
+
+Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
+                                                  WriteType* type, Slice* Key,
+                                                  Slice* value, Slice* blob,
+                                                  Slice* xid) const {
+  if (type == nullptr || Key == nullptr || value == nullptr ||
+      blob == nullptr || xid == nullptr) {
+    return Status::InvalidArgument("Output parameters cannot be null");
+  }
+
+  if (data_offset == GetDataSize()) {
+    // reached end of batch.
+    return Status::NotFound();
+  }
+
+  if (data_offset > GetDataSize()) {
+    return Status::InvalidArgument("data offset exceed write batch size");
+  }
+  Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset);
+  char tag;
+  uint32_t column_family;
+  Status s = ReadRecordFromWriteBatch(&input, &tag, &column_family, Key, value,
+                                      blob, xid);
+  if (!s.ok()) {
+    return s;
+  }
+
+  switch (tag) {
+    case kTypeColumnFamilyValue:
+    case kTypeValue:
+      *type = kPutRecord;
+      break;
+    case kTypeColumnFamilyDeletion:
+    case kTypeDeletion:
+      *type = kDeleteRecord;
+      break;
+    case kTypeColumnFamilySingleDeletion:
+    case kTypeSingleDeletion:
+      *type = kSingleDeleteRecord;
+      break;
+    case kTypeColumnFamilyRangeDeletion:
+    case kTypeRangeDeletion:
+      *type = kDeleteRangeRecord;
+      break;
+    case kTypeColumnFamilyMerge:
+    case kTypeMerge:
+      *type = kMergeRecord;
+      break;
+    case kTypeLogData:
+      *type = kLogDataRecord;
+      break;
+    case kTypeNoop:
+    case kTypeBeginPrepareXID:
+    case kTypeBeginPersistedPrepareXID:
+    case kTypeBeginUnprepareXID:
+    case kTypeEndPrepareXID:
+    case kTypeCommitXID:
+    case kTypeRollbackXID:
+      *type = kXIDRecord;
+      break;
+    default:
+      return Status::Corruption("unknown WriteBatch tag ",
+                                std::to_string(static_cast<unsigned int>(tag)));
+  }
+  return Status::OK();
+}
+
+// If both of `entry1` and `entry2` point to real entry in write batch, we
+// compare the entries as following:
+// 1. first compare the column family, the one with larger CF will be larger;
+// 2. Inside the same CF, we first decode the entry to find the key of the entry
+//    and the entry with larger key will be larger;
+// 3. If two entries are of the same CF and key, the one with larger offset
+//    will be larger.
+// Some times either `entry1` or `entry2` is dummy entry, which is actually
+// a search key. In this case, in step 2, we don't go ahead and decode the
+// entry but use the value in WriteBatchIndexEntry::search_key.
+// One special case is WriteBatchIndexEntry::key_size is kFlagMinInCf.
+// This indicate that we are going to seek to the first of the column family.
+// Once we see this, this entry will be smaller than all the real entries of
+// the column family.
+int WriteBatchEntryComparator::operator()(
+    const WriteBatchIndexEntry* entry1,
+    const WriteBatchIndexEntry* entry2) const {
+  if (entry1->column_family > entry2->column_family) {
+    return 1;
+  } else if (entry1->column_family < entry2->column_family) {
+    return -1;
+  }
+
+  // Deal with special case of seeking to the beginning of a column family
+  if (entry1->is_min_in_cf()) {
+    return -1;
+  } else if (entry2->is_min_in_cf()) {
+    return 1;
+  }
+
+  Slice key1, key2;
+  if (entry1->search_key == nullptr) {
+    key1 = Slice(write_batch_->Data().data() + entry1->key_offset,
+                 entry1->key_size);
+  } else {
+    key1 = *(entry1->search_key);
+  }
+  if (entry2->search_key == nullptr) {
+    key2 = Slice(write_batch_->Data().data() + entry2->key_offset,
+                 entry2->key_size);
+  } else {
+    key2 = *(entry2->search_key);
+  }
+
+  int cmp = CompareKey(entry1->column_family, key1, key2);
+  if (cmp != 0) {
+    return cmp;
+  } else if (entry1->offset > entry2->offset) {
+    return 1;
+  } else if (entry1->offset < entry2->offset) {
+    return -1;
+  }
+  return 0;
+}
+
+int WriteBatchEntryComparator::CompareKey(uint32_t column_family,
+                                          const Slice& key1,
+                                          const Slice& key2) const {
+  if (column_family < cf_comparators_.size() &&
+      cf_comparators_[column_family] != nullptr) {
+    return cf_comparators_[column_family]->CompareWithoutTimestamp(
+        key1, /*a_has_ts=*/false, key2, /*b_has_ts=*/false);
+  } else {
+    return default_comparator_->CompareWithoutTimestamp(
+        key1, /*a_has_ts=*/false, key2, /*b_has_ts=*/false);
+  }
+}
+
+const Comparator* WriteBatchEntryComparator::GetComparator(
+    const ColumnFamilyHandle* column_family) const {
+  return column_family ? column_family->GetComparator() : default_comparator_;
+}
+
+const Comparator* WriteBatchEntryComparator::GetComparator(
+    uint32_t column_family) const {
+  if (column_family < cf_comparators_.size() &&
+      cf_comparators_[column_family]) {
+    return cf_comparators_[column_family];
+  }
+  return default_comparator_;
+}
+
+WriteEntry WBWIIteratorImpl::Entry() const {
+  WriteEntry ret;
+  Slice blob, xid;
+  const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
+  // this is guaranteed with Valid()
+  assert(iter_entry != nullptr &&
+         iter_entry->column_family == column_family_id_);
+  auto s = write_batch_->GetEntryFromDataOffset(
+      iter_entry->offset, &ret.type, &ret.key, &ret.value, &blob, &xid);
+  assert(s.ok());
+  assert(ret.type == kPutRecord || ret.type == kDeleteRecord ||
+         ret.type == kSingleDeleteRecord || ret.type == kDeleteRangeRecord ||
+         ret.type == kMergeRecord);
+  // Make sure entry.key does not include user-defined timestamp.
+  const Comparator* const ucmp = comparator_->GetComparator(column_family_id_);
+  size_t ts_sz = ucmp->timestamp_size();
+  if (ts_sz > 0) {
+    ret.key = StripTimestampFromUserKey(ret.key, ts_sz);
+  }
+  return ret;
+}
+
+bool WBWIIteratorImpl::MatchesKey(uint32_t cf_id, const Slice& key) {
+  if (Valid()) {
+    return comparator_->CompareKey(cf_id, key, Entry().key) == 0;
+  } else {
+    return false;
+  }
+}
+
+WriteBatchWithIndexInternal::WriteBatchWithIndexInternal(
+    ColumnFamilyHandle* column_family)
+    : db_(nullptr), db_options_(nullptr), column_family_(column_family) {}
+
+WriteBatchWithIndexInternal::WriteBatchWithIndexInternal(
+    DB* db, ColumnFamilyHandle* column_family)
+    : db_(db), db_options_(nullptr), column_family_(column_family) {
+  if (db_ != nullptr && column_family_ == nullptr) {
+    column_family_ = db_->DefaultColumnFamily();
+  }
+}
+
+WriteBatchWithIndexInternal::WriteBatchWithIndexInternal(
+    const DBOptions* db_options, ColumnFamilyHandle* column_family)
+    : db_(nullptr), db_options_(db_options), column_family_(column_family) {}
+
+Status WriteBatchWithIndexInternal::MergeKey(const Slice& key,
+                                             const Slice* value,
+                                             const MergeContext& context,
+                                             std::string* result) const {
+  if (column_family_ != nullptr) {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family_);
+    const auto merge_operator = cfh->cfd()->ioptions()->merge_operator.get();
+    if (merge_operator == nullptr) {
+      return Status::InvalidArgument(
+          "Merge_operator must be set for column_family");
+    } else if (db_ != nullptr) {
+      const ImmutableDBOptions& immutable_db_options =
+          static_cast_with_check<DBImpl>(db_->GetRootDB())
+              ->immutable_db_options();
+      Statistics* statistics = immutable_db_options.statistics.get();
+      Logger* logger = immutable_db_options.info_log.get();
+      SystemClock* clock = immutable_db_options.clock;
+      return MergeHelper::TimedFullMerge(
+          merge_operator, key, value, context.GetOperands(), result, logger,
+          statistics, clock, /* result_operand */ nullptr,
+          /* update_num_ops_stats */ false);
+    } else if (db_options_ != nullptr) {
+      Statistics* statistics = db_options_->statistics.get();
+      Env* env = db_options_->env;
+      Logger* logger = db_options_->info_log.get();
+      SystemClock* clock = env->GetSystemClock().get();
+      return MergeHelper::TimedFullMerge(
+          merge_operator, key, value, context.GetOperands(), result, logger,
+          statistics, clock, /* result_operand */ nullptr,
+          /* update_num_ops_stats */ false);
+    } else {
+      const auto cf_opts = cfh->cfd()->ioptions();
+      return MergeHelper::TimedFullMerge(
+          merge_operator, key, value, context.GetOperands(), result,
+          cf_opts->logger, cf_opts->stats, cf_opts->clock,
+          /* result_operand */ nullptr, /* update_num_ops_stats */ false);
+    }
+  } else {
+    return Status::InvalidArgument("Must provide a column_family");
+  }
+}
+
+WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch(
+    WriteBatchWithIndex* batch, const Slice& key, MergeContext* context,
+    std::string* value, Status* s) {
+  *s = Status::OK();
+
+  std::unique_ptr<WBWIIteratorImpl> iter(
+      static_cast_with_check<WBWIIteratorImpl>(
+          batch->NewIterator(column_family_)));
+
+  // Search the iterator for this key, and updates/merges to it.
+  iter->Seek(key);
+  auto result = iter->FindLatestUpdate(key, context);
+  if (result == WBWIIteratorImpl::kError) {
+    (*s) = Status::Corruption("Unexpected entry in WriteBatchWithIndex:",
+                              std::to_string(iter->Entry().type));
+    return result;
+  } else if (result == WBWIIteratorImpl::kNotFound) {
+    return result;
+  } else if (result == WBWIIteratorImpl::Result::kFound) {  // PUT
+    Slice entry_value = iter->Entry().value;
+    if (context->GetNumOperands() > 0) {
+      *s = MergeKey(key, &entry_value, *context, value);
+      if (!s->ok()) {
+        result = WBWIIteratorImpl::Result::kError;
+      }
+    } else {
+      value->assign(entry_value.data(), entry_value.size());
+    }
+  } else if (result == WBWIIteratorImpl::kDeleted) {
+    if (context->GetNumOperands() > 0) {
+      *s = MergeKey(key, nullptr, *context, value);
+      if (s->ok()) {
+        result = WBWIIteratorImpl::Result::kFound;
+      } else {
+        result = WBWIIteratorImpl::Result::kError;
+      }
+    }
+  }
+  return result;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
new file mode 100644
index 000000000..edabc95bc
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
@@ -0,0 +1,344 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "memtable/skiplist.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeContext;
+class WBWIIteratorImpl;
+class WriteBatchWithIndexInternal;
+struct Options;
+
+// when direction == forward
+// * current_at_base_ <=> base_iterator > delta_iterator
+// when direction == backwards
+// * current_at_base_ <=> base_iterator < delta_iterator
+// always:
+// * equal_keys_ <=> base_iterator == delta_iterator
+class BaseDeltaIterator : public Iterator {
+ public:
+  BaseDeltaIterator(ColumnFamilyHandle* column_family, Iterator* base_iterator,
+                    WBWIIteratorImpl* delta_iterator,
+                    const Comparator* comparator,
+                    const ReadOptions* read_options = nullptr);
+
+  ~BaseDeltaIterator() override {}
+
+  bool Valid() const override;
+  void SeekToFirst() override;
+  void SeekToLast() override;
+  void Seek(const Slice& k) override;
+  void SeekForPrev(const Slice& k) override;
+  void Next() override;
+  void Prev() override;
+  Slice key() const override;
+  Slice value() const override;
+  Status status() const override;
+  void Invalidate(Status s);
+
+ private:
+  void AssertInvariants();
+  void Advance();
+  void AdvanceDelta();
+  void AdvanceBase();
+  bool BaseValid() const;
+  bool DeltaValid() const;
+  void UpdateCurrent();
+
+  std::unique_ptr<WriteBatchWithIndexInternal> wbwii_;
+  bool forward_;
+  bool current_at_base_;
+  bool equal_keys_;
+  mutable Status status_;
+  std::unique_ptr<Iterator> base_iterator_;
+  std::unique_ptr<WBWIIteratorImpl> delta_iterator_;
+  const Comparator* comparator_;  // not owned
+  const Slice* iterate_upper_bound_;
+  mutable PinnableSlice merge_result_;
+};
+
+// Key used by skip list, as the binary searchable index of WriteBatchWithIndex.
+struct WriteBatchIndexEntry {
+  WriteBatchIndexEntry(size_t o, uint32_t c, size_t ko, size_t ksz)
+      : offset(o),
+        column_family(c),
+        key_offset(ko),
+        key_size(ksz),
+        search_key(nullptr) {}
+  // Create a dummy entry as the search key. This index entry won't be backed
+  // by an entry from the write batch, but a pointer to the search key. Or a
+  // special flag of offset can indicate we are seek to first.
+  // @_search_key: the search key
+  // @_column_family: column family
+  // @is_forward_direction: true for Seek(). False for SeekForPrev()
+  // @is_seek_to_first: true if we seek to the beginning of the column family
+  //                    _search_key should be null in this case.
+  WriteBatchIndexEntry(const Slice* _search_key, uint32_t _column_family,
+                       bool is_forward_direction, bool is_seek_to_first)
+      // For SeekForPrev(), we need to make the dummy entry larger than any
+      // entry who has the same search key. Otherwise, we'll miss those entries.
+      : offset(is_forward_direction ? 0 : std::numeric_limits<size_t>::max()),
+        column_family(_column_family),
+        key_offset(0),
+        key_size(is_seek_to_first ? kFlagMinInCf : 0),
+        search_key(_search_key) {
+    assert(_search_key != nullptr || is_seek_to_first);
+  }
+
+  // If this flag appears in the key_size, it indicates a
+  // key that is smaller than any other entry for the same column family.
+  static const size_t kFlagMinInCf = std::numeric_limits<size_t>::max();
+
+  bool is_min_in_cf() const {
+    assert(key_size != kFlagMinInCf ||
+           (key_offset == 0 && search_key == nullptr));
+    return key_size == kFlagMinInCf;
+  }
+
+  // offset of an entry in write batch's string buffer. If this is a dummy
+  // lookup key, in which case search_key != nullptr, offset is set to either
+  // 0 or max, only for comparison purpose. Because when entries have the same
+  // key, the entry with larger offset is larger, offset = 0 will make a seek
+  // key small or equal than all the entries with the seek key, so that Seek()
+  // will find all the entries of the same key. Similarly, offset = MAX will
+  // make the entry just larger than all entries with the search key so
+  // SeekForPrev() will see all the keys with the same key.
+  size_t offset;
+  uint32_t column_family;  // column family of the entry.
+  size_t key_offset;       // offset of the key in write batch's string buffer.
+  size_t key_size;         // size of the key. kFlagMinInCf indicates
+                           // that this is a dummy look up entry for
+                           // SeekToFirst() to the beginning of the column
+                           // family. We use the flag here to save a boolean
+                           // in the struct.
+
+  const Slice* search_key;  // if not null, instead of reading keys from
+                            // write batch, use it to compare. This is used
+                            // for lookup key.
+};
+
+class ReadableWriteBatch : public WriteBatch {
+ public:
+  explicit ReadableWriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0,
+                              size_t protection_bytes_per_key = 0,
+                              size_t default_cf_ts_sz = 0)
+      : WriteBatch(reserved_bytes, max_bytes, protection_bytes_per_key,
+                   default_cf_ts_sz) {}
+  // Retrieve some information from a write entry in the write batch, given
+  // the start offset of the write entry.
+  Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key,
+                                Slice* value, Slice* blob, Slice* xid) const;
+};
+
+class WriteBatchEntryComparator {
+ public:
+  WriteBatchEntryComparator(const Comparator* _default_comparator,
+                            const ReadableWriteBatch* write_batch)
+      : default_comparator_(_default_comparator), write_batch_(write_batch) {}
+  // Compare a and b. Return a negative value if a is less than b, 0 if they
+  // are equal, and a positive value if a is greater than b
+  int operator()(const WriteBatchIndexEntry* entry1,
+                 const WriteBatchIndexEntry* entry2) const;
+
+  int CompareKey(uint32_t column_family, const Slice& key1,
+                 const Slice& key2) const;
+
+  void SetComparatorForCF(uint32_t column_family_id,
+                          const Comparator* comparator) {
+    if (column_family_id >= cf_comparators_.size()) {
+      cf_comparators_.resize(column_family_id + 1, nullptr);
+    }
+    cf_comparators_[column_family_id] = comparator;
+  }
+
+  const Comparator* default_comparator() { return default_comparator_; }
+
+  const Comparator* GetComparator(
+      const ColumnFamilyHandle* column_family) const;
+
+  const Comparator* GetComparator(uint32_t column_family) const;
+
+ private:
+  const Comparator* const default_comparator_;
+  std::vector<const Comparator*> cf_comparators_;
+  const ReadableWriteBatch* const write_batch_;
+};
+
+using WriteBatchEntrySkipList =
+    SkipList<WriteBatchIndexEntry*, const WriteBatchEntryComparator&>;
+
+class WBWIIteratorImpl : public WBWIIterator {
+ public:
+  enum Result : uint8_t {
+    kFound,
+    kDeleted,
+    kNotFound,
+    kMergeInProgress,
+    kError
+  };
+  WBWIIteratorImpl(uint32_t column_family_id,
+                   WriteBatchEntrySkipList* skip_list,
+                   const ReadableWriteBatch* write_batch,
+                   WriteBatchEntryComparator* comparator)
+      : column_family_id_(column_family_id),
+        skip_list_iter_(skip_list),
+        write_batch_(write_batch),
+        comparator_(comparator) {}
+
+  ~WBWIIteratorImpl() override {}
+
+  bool Valid() const override {
+    if (!skip_list_iter_.Valid()) {
+      return false;
+    }
+    const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
+    return (iter_entry != nullptr &&
+            iter_entry->column_family == column_family_id_);
+  }
+
+  void SeekToFirst() override {
+    WriteBatchIndexEntry search_entry(
+        nullptr /* search_key */, column_family_id_,
+        true /* is_forward_direction */, true /* is_seek_to_first */);
+    skip_list_iter_.Seek(&search_entry);
+  }
+
+  void SeekToLast() override {
+    WriteBatchIndexEntry search_entry(
+        nullptr /* search_key */, column_family_id_ + 1,
+        true /* is_forward_direction */, true /* is_seek_to_first */);
+    skip_list_iter_.Seek(&search_entry);
+    if (!skip_list_iter_.Valid()) {
+      skip_list_iter_.SeekToLast();
+    } else {
+      skip_list_iter_.Prev();
+    }
+  }
+
+  void Seek(const Slice& key) override {
+    WriteBatchIndexEntry search_entry(&key, column_family_id_,
+                                      true /* is_forward_direction */,
+                                      false /* is_seek_to_first */);
+    skip_list_iter_.Seek(&search_entry);
+  }
+
+  void SeekForPrev(const Slice& key) override {
+    WriteBatchIndexEntry search_entry(&key, column_family_id_,
+                                      false /* is_forward_direction */,
+                                      false /* is_seek_to_first */);
+    skip_list_iter_.SeekForPrev(&search_entry);
+  }
+
+  void Next() override { skip_list_iter_.Next(); }
+
+  void Prev() override { skip_list_iter_.Prev(); }
+
+  WriteEntry Entry() const override;
+
+  Status status() const override {
+    // this is in-memory data structure, so the only way status can be non-ok is
+    // through memory corruption
+    return Status::OK();
+  }
+
+  const WriteBatchIndexEntry* GetRawEntry() const {
+    return skip_list_iter_.key();
+  }
+
+  bool MatchesKey(uint32_t cf_id, const Slice& key);
+
+  // Moves the iterator to first entry of the previous key.
+  void PrevKey();
+  // Moves the iterator to first entry of the next key.
+  void NextKey();
+
+  // Moves the iterator to the Update (Put or Delete) for the current key
+  // If there are no Put/Delete, the Iterator will point to the first entry for
+  // this key
+  // @return kFound if a Put was found for the key
+  // @return kDeleted if a delete was found for the key
+  // @return kMergeInProgress if only merges were fouund for the key
+  // @return kError if an unsupported operation was found for the key
+  // @return kNotFound if no operations were found for this key
+  //
+  Result FindLatestUpdate(const Slice& key, MergeContext* merge_context);
+  Result FindLatestUpdate(MergeContext* merge_context);
+
+ protected:
+  void AdvanceKey(bool forward);
+
+ private:
+  uint32_t column_family_id_;
+  WriteBatchEntrySkipList::Iterator skip_list_iter_;
+  const ReadableWriteBatch* write_batch_;
+  WriteBatchEntryComparator* comparator_;
+};
+
+class WriteBatchWithIndexInternal {
+ public:
+  static const Comparator* GetUserComparator(const WriteBatchWithIndex& wbwi,
+                                             uint32_t cf_id);
+
+  // For GetFromBatchAndDB or similar
+  explicit WriteBatchWithIndexInternal(DB* db,
+                                       ColumnFamilyHandle* column_family);
+  // For GetFromBatchAndDB or similar
+  explicit WriteBatchWithIndexInternal(ColumnFamilyHandle* column_family);
+  // For GetFromBatch or similar
+  explicit WriteBatchWithIndexInternal(const DBOptions* db_options,
+                                       ColumnFamilyHandle* column_family);
+
+  // If batch contains a value for key, store it in *value and return kFound.
+  // If batch contains a deletion for key, return Deleted.
+  // If batch contains Merge operations as the most recent entry for a key,
+  //   and the merge process does not stop (not reaching a value or delete),
+  //   prepend the current merge operands to *operands,
+  //   and return kMergeInProgress
+  // If batch does not contain this key, return kNotFound
+  // Else, return kError on error with error Status stored in *s.
+  WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch,
+                                        const Slice& key, std::string* value,
+                                        Status* s) {
+    return GetFromBatch(batch, key, &merge_context_, value, s);
+  }
+  WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch,
+                                        const Slice& key,
+                                        MergeContext* merge_context,
+                                        std::string* value, Status* s);
+  Status MergeKey(const Slice& key, const Slice* value,
+                  std::string* result) const {
+    return MergeKey(key, value, merge_context_, result);
+  }
+  Status MergeKey(const Slice& key, const Slice* value,
+                  const MergeContext& context, std::string* result) const;
+  size_t GetNumOperands() const { return merge_context_.GetNumOperands(); }
+  MergeContext* GetMergeContext() { return &merge_context_; }
+  Slice GetOperand(int index) const { return merge_context_.GetOperand(index); }
+
+ private:
+  DB* db_;
+  const DBOptions* db_options_;
+  ColumnFamilyHandle* column_family_;
+  MergeContext merge_context_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc
new file mode 100644
index 000000000..350dcc881
--- /dev/null
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc
@@ -0,0 +1,2419 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+#include <map>
+#include <memory>
+
+#include "db/column_family.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+  explicit ColumnFamilyHandleImplDummy(int id, const Comparator* comparator)
+      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+        id_(id),
+        comparator_(comparator) {}
+  uint32_t GetID() const override { return id_; }
+  const Comparator* GetComparator() const override { return comparator_; }
+
+ private:
+  uint32_t id_;
+  const Comparator* comparator_;
+};
+
+struct Entry {
+  std::string key;
+  std::string value;
+  WriteType type;
+};
+
+struct TestHandler : public WriteBatch::Handler {
+  std::map<uint32_t, std::vector<Entry>> seen;
+  Status PutCF(uint32_t column_family_id, const Slice& key,
+               const Slice& value) override {
+    Entry e;
+    e.key = key.ToString();
+    e.value = value.ToString();
+    e.type = kPutRecord;
+    seen[column_family_id].push_back(e);
+    return Status::OK();
+  }
+  Status MergeCF(uint32_t column_family_id, const Slice& key,
+                 const Slice& value) override {
+    Entry e;
+    e.key = key.ToString();
+    e.value = value.ToString();
+    e.type = kMergeRecord;
+    seen[column_family_id].push_back(e);
+    return Status::OK();
+  }
+  void LogData(const Slice& /*blob*/) override {}
+  Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+    Entry e;
+    e.key = key.ToString();
+    e.value = "";
+    e.type = kDeleteRecord;
+    seen[column_family_id].push_back(e);
+    return Status::OK();
+  }
+};
+
+using KVMap = std::map<std::string, std::string>;
+
+class KVIter : public Iterator {
+ public:
+  explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {}
+  bool Valid() const override { return iter_ != map_->end(); }
+  void SeekToFirst() override { iter_ = map_->begin(); }
+  void SeekToLast() override {
+    if (map_->empty()) {
+      iter_ = map_->end();
+    } else {
+      iter_ = map_->find(map_->rbegin()->first);
+    }
+  }
+  void Seek(const Slice& k) override {
+    iter_ = map_->lower_bound(k.ToString());
+  }
+  void SeekForPrev(const Slice& k) override {
+    iter_ = map_->upper_bound(k.ToString());
+    Prev();
+  }
+  void Next() override { ++iter_; }
+  void Prev() override {
+    if (iter_ == map_->begin()) {
+      iter_ = map_->end();
+      return;
+    }
+    --iter_;
+  }
+  Slice key() const override { return iter_->first; }
+  Slice value() const override { return iter_->second; }
+  Status status() const override { return Status::OK(); }
+
+ private:
+  const KVMap* const map_;
+  KVMap::const_iterator iter_;
+};
+
+static std::string PrintContents(WriteBatchWithIndex* batch,
+                                 ColumnFamilyHandle* column_family,
+                                 bool hex = false) {
+  std::string result;
+
+  WBWIIterator* iter;
+  if (column_family == nullptr) {
+    iter = batch->NewIterator();
+  } else {
+    iter = batch->NewIterator(column_family);
+  }
+
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    WriteEntry e = iter->Entry();
+
+    if (e.type == kPutRecord) {
+      result.append("PUT(");
+      result.append(e.key.ToString(hex));
+      result.append("):");
+      result.append(e.value.ToString(hex));
+    } else if (e.type == kMergeRecord) {
+      result.append("MERGE(");
+      result.append(e.key.ToString(hex));
+      result.append("):");
+      result.append(e.value.ToString(hex));
+    } else if (e.type == kSingleDeleteRecord) {
+      result.append("SINGLE-DEL(");
+      result.append(e.key.ToString(hex));
+      result.append(")");
+    } else {
+      assert(e.type == kDeleteRecord);
+      result.append("DEL(");
+      result.append(e.key.ToString(hex));
+      result.append(")");
+    }
+
+    result.append(",");
+    iter->Next();
+  }
+
+  delete iter;
+  return result;
+}
+
+static std::string PrintContents(WriteBatchWithIndex* batch, KVMap* base_map,
+                                 ColumnFamilyHandle* column_family) {
+  std::string result;
+
+  Iterator* iter;
+  if (column_family == nullptr) {
+    iter = batch->NewIteratorWithBase(new KVIter(base_map));
+  } else {
+    iter = batch->NewIteratorWithBase(column_family, new KVIter(base_map));
+  }
+
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    assert(iter->status().ok());
+
+    Slice key = iter->key();
+    Slice value = iter->value();
+
+    result.append(key.ToString());
+    result.append(":");
+    result.append(value.ToString());
+    result.append(",");
+
+    iter->Next();
+  }
+
+  delete iter;
+  return result;
+}
+
+void AssertIter(Iterator* iter, const std::string& key,
+                const std::string& value) {
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(key, iter->key().ToString());
+  ASSERT_EQ(value, iter->value().ToString());
+}
+
+void AssertItersMatch(Iterator* iter1, Iterator* iter2) {
+  ASSERT_EQ(iter1->Valid(), iter2->Valid());
+  if (iter1->Valid()) {
+    ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+    ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+  }
+}
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+  iter1->SeekToFirst();
+  iter2->SeekToFirst();
+  while (iter1->Valid()) {
+    ASSERT_EQ(iter1->Valid(), iter2->Valid());
+    ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+    ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+    iter1->Next();
+    iter2->Next();
+  }
+  ASSERT_EQ(iter1->Valid(), iter2->Valid());
+}
+
+void AssertIterEqual(WBWIIteratorImpl* wbwii,
+                     const std::vector<std::string>& keys) {
+  wbwii->SeekToFirst();
+  for (auto k : keys) {
+    ASSERT_TRUE(wbwii->Valid());
+    ASSERT_EQ(wbwii->Entry().key, k);
+    wbwii->NextKey();
+  }
+  ASSERT_FALSE(wbwii->Valid());
+  wbwii->SeekToLast();
+  for (auto kit = keys.rbegin(); kit != keys.rend(); ++kit) {
+    ASSERT_TRUE(wbwii->Valid());
+    ASSERT_EQ(wbwii->Entry().key, *kit);
+    wbwii->PrevKey();
+  }
+  ASSERT_FALSE(wbwii->Valid());
+}
+}  // namespace
+
+class WBWIBaseTest : public testing::Test {
+ public:
+  explicit WBWIBaseTest(bool overwrite) : db_(nullptr) {
+    options_.merge_operator =
+        MergeOperators::CreateFromStringId("stringappend");
+    options_.create_if_missing = true;
+    dbname_ = test::PerThreadDBPath("write_batch_with_index_test");
+    EXPECT_OK(DestroyDB(dbname_, options_));
+    batch_.reset(new WriteBatchWithIndex(BytewiseComparator(), 20, overwrite));
+  }
+
+  virtual ~WBWIBaseTest() {
+    if (db_ != nullptr) {
+      ReleaseSnapshot();
+      delete db_;
+      EXPECT_OK(DestroyDB(dbname_, options_));
+    }
+  }
+
+  std::string AddToBatch(ColumnFamilyHandle* cf, const std::string& key) {
+    std::string result;
+    for (size_t i = 0; i < key.size(); i++) {
+      if (key[i] == 'd') {
+        batch_->Delete(cf, key);
+        result = "";
+      } else if (key[i] == 'p') {
+        result = key + std::to_string(i);
+        batch_->Put(cf, key, result);
+      } else if (key[i] == 'm') {
+        std::string value = key + std::to_string(i);
+        batch_->Merge(cf, key, value);
+        if (result.empty()) {
+          result = value;
+        } else {
+          result = result + "," + value;
+        }
+      }
+    }
+    return result;
+  }
+
+  virtual Status OpenDB() { return DB::Open(options_, dbname_, &db_); }
+
+  void ReleaseSnapshot() {
+    if (read_opts_.snapshot != nullptr) {
+      EXPECT_NE(db_, nullptr);
+      db_->ReleaseSnapshot(read_opts_.snapshot);
+      read_opts_.snapshot = nullptr;
+    }
+  }
+
+ public:
+  DB* db_;
+  std::string dbname_;
+  Options options_;
+  WriteOptions write_opts_;
+  ReadOptions read_opts_;
+  std::unique_ptr<WriteBatchWithIndex> batch_;
+};
+
+class WBWIKeepTest : public WBWIBaseTest {
+ public:
+  WBWIKeepTest() : WBWIBaseTest(false) {}
+};
+
+class WBWIOverwriteTest : public WBWIBaseTest {
+ public:
+  WBWIOverwriteTest() : WBWIBaseTest(true) {}
+};
+class WriteBatchWithIndexTest : public WBWIBaseTest,
+                                public testing::WithParamInterface<bool> {
+ public:
+  WriteBatchWithIndexTest() : WBWIBaseTest(GetParam()) {}
+};
+
+void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
+                                     WriteBatchWithIndex* batch) {
+  // In this test, we insert <key, value> to column family `data`, and
+  // <value, key> to column family `index`. Then iterator them in order
+  // and seek them by key.
+
+  // Sort entries by key
+  std::map<std::string, std::vector<Entry*>> data_map;
+  // Sort entries by value
+  std::map<std::string, std::vector<Entry*>> index_map;
+  for (auto& e : entries) {
+    data_map[e.key].push_back(&e);
+    index_map[e.value].push_back(&e);
+  }
+
+  ColumnFamilyHandleImplDummy data(6, BytewiseComparator());
+  ColumnFamilyHandleImplDummy index(8, BytewiseComparator());
+  for (auto& e : entries) {
+    if (e.type == kPutRecord) {
+      ASSERT_OK(batch->Put(&data, e.key, e.value));
+      ASSERT_OK(batch->Put(&index, e.value, e.key));
+    } else if (e.type == kMergeRecord) {
+      ASSERT_OK(batch->Merge(&data, e.key, e.value));
+      ASSERT_OK(batch->Put(&index, e.value, e.key));
+    } else {
+      assert(e.type == kDeleteRecord);
+      std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
+      iter->Seek(e.key);
+      ASSERT_OK(iter->status());
+      auto write_entry = iter->Entry();
+      ASSERT_EQ(e.key, write_entry.key.ToString());
+      ASSERT_EQ(e.value, write_entry.value.ToString());
+      ASSERT_OK(batch->Delete(&data, e.key));
+      ASSERT_OK(batch->Put(&index, e.value, ""));
+    }
+  }
+
+  // Iterator all keys
+  {
+    std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
+    for (int seek_to_first : {0, 1}) {
+      if (seek_to_first) {
+        iter->SeekToFirst();
+      } else {
+        iter->Seek("");
+      }
+      for (auto pair : data_map) {
+        for (auto v : pair.second) {
+          ASSERT_OK(iter->status());
+          ASSERT_TRUE(iter->Valid());
+          auto write_entry = iter->Entry();
+          ASSERT_EQ(pair.first, write_entry.key.ToString());
+          ASSERT_EQ(v->type, write_entry.type);
+          if (write_entry.type != kDeleteRecord) {
+            ASSERT_EQ(v->value, write_entry.value.ToString());
+          }
+          iter->Next();
+        }
+      }
+      ASSERT_TRUE(!iter->Valid());
+    }
+    iter->SeekToLast();
+    for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) {
+      for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) {
+        ASSERT_OK(iter->status());
+        ASSERT_TRUE(iter->Valid());
+        auto write_entry = iter->Entry();
+        ASSERT_EQ(pair->first, write_entry.key.ToString());
+        ASSERT_EQ((*v)->type, write_entry.type);
+        if (write_entry.type != kDeleteRecord) {
+          ASSERT_EQ((*v)->value, write_entry.value.ToString());
+        }
+        iter->Prev();
+      }
+    }
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  // Iterator all indexes
+  {
+    std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&index));
+    for (int seek_to_first : {0, 1}) {
+      if (seek_to_first) {
+        iter->SeekToFirst();
+      } else {
+        iter->Seek("");
+      }
+      for (auto pair : index_map) {
+        for (auto v : pair.second) {
+          ASSERT_OK(iter->status());
+          ASSERT_TRUE(iter->Valid());
+          auto write_entry = iter->Entry();
+          ASSERT_EQ(pair.first, write_entry.key.ToString());
+          if (v->type != kDeleteRecord) {
+            ASSERT_EQ(v->key, write_entry.value.ToString());
+            ASSERT_EQ(v->value, write_entry.key.ToString());
+          }
+          iter->Next();
+        }
+      }
+      ASSERT_TRUE(!iter->Valid());
+    }
+
+    iter->SeekToLast();
+    for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) {
+      for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) {
+        ASSERT_OK(iter->status());
+        ASSERT_TRUE(iter->Valid());
+        auto write_entry = iter->Entry();
+        ASSERT_EQ(pair->first, write_entry.key.ToString());
+        if ((*v)->type != kDeleteRecord) {
+          ASSERT_EQ((*v)->key, write_entry.value.ToString());
+          ASSERT_EQ((*v)->value, write_entry.key.ToString());
+        }
+        iter->Prev();
+      }
+    }
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  // Seek to every key
+  {
+    std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
+
+    // Seek the keys one by one in reverse order
+    for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) {
+      iter->Seek(pair->first);
+      ASSERT_OK(iter->status());
+      for (auto v : pair->second) {
+        ASSERT_TRUE(iter->Valid());
+        auto write_entry = iter->Entry();
+        ASSERT_EQ(pair->first, write_entry.key.ToString());
+        ASSERT_EQ(v->type, write_entry.type);
+        if (write_entry.type != kDeleteRecord) {
+          ASSERT_EQ(v->value, write_entry.value.ToString());
+        }
+        iter->Next();
+        ASSERT_OK(iter->status());
+      }
+    }
+  }
+
+  // Seek to every index
+  {
+    std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&index));
+
+    // Seek the keys one by one in reverse order
+    for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) {
+      iter->Seek(pair->first);
+      ASSERT_OK(iter->status());
+      for (auto v : pair->second) {
+        ASSERT_TRUE(iter->Valid());
+        auto write_entry = iter->Entry();
+        ASSERT_EQ(pair->first, write_entry.key.ToString());
+        ASSERT_EQ(v->value, write_entry.key.ToString());
+        if (v->type != kDeleteRecord) {
+          ASSERT_EQ(v->key, write_entry.value.ToString());
+        }
+        iter->Next();
+        ASSERT_OK(iter->status());
+      }
+    }
+  }
+
+  // Verify WriteBatch can be iterated
+  TestHandler handler;
+  ASSERT_OK(batch->GetWriteBatch()->Iterate(&handler));
+
+  // Verify data column family
+  {
+    ASSERT_EQ(entries.size(), handler.seen[data.GetID()].size());
+    size_t i = 0;
+    for (auto e : handler.seen[data.GetID()]) {
+      auto write_entry = entries[i++];
+      ASSERT_EQ(e.type, write_entry.type);
+      ASSERT_EQ(e.key, write_entry.key);
+      if (e.type != kDeleteRecord) {
+        ASSERT_EQ(e.value, write_entry.value);
+      }
+    }
+  }
+
+  // Verify index column family
+  {
+    ASSERT_EQ(entries.size(), handler.seen[index.GetID()].size());
+    size_t i = 0;
+    for (auto e : handler.seen[index.GetID()]) {
+      auto write_entry = entries[i++];
+      ASSERT_EQ(e.key, write_entry.value);
+      if (write_entry.type != kDeleteRecord) {
+        ASSERT_EQ(e.value, write_entry.key);
+      }
+    }
+  }
+}
+
+TEST_F(WBWIKeepTest, TestValueAsSecondaryIndex) {
+  Entry entries[] = {
+      {"aaa", "0005", kPutRecord},   {"b", "0002", kPutRecord},
+      {"cdd", "0002", kMergeRecord}, {"aab", "00001", kPutRecord},
+      {"cc", "00005", kPutRecord},   {"cdd", "0002", kPutRecord},
+      {"aab", "0003", kPutRecord},   {"cc", "00005", kDeleteRecord},
+  };
+  std::vector<Entry> entries_list(entries, entries + 8);
+
+  batch_.reset(new WriteBatchWithIndex(nullptr, 20, false));
+
+  TestValueAsSecondaryIndexHelper(entries_list, batch_.get());
+
+  // Clear batch and re-run test with new values
+  batch_->Clear();
+
+  Entry new_entries[] = {
+      {"aaa", "0005", kPutRecord},   {"e", "0002", kPutRecord},
+      {"add", "0002", kMergeRecord}, {"aab", "00001", kPutRecord},
+      {"zz", "00005", kPutRecord},   {"add", "0002", kPutRecord},
+      {"aab", "0003", kPutRecord},   {"zz", "00005", kDeleteRecord},
+  };
+
+  entries_list = std::vector<Entry>(new_entries, new_entries + 8);
+
+  TestValueAsSecondaryIndexHelper(entries_list, batch_.get());
+}
+
+TEST_P(WriteBatchWithIndexTest, TestComparatorForCF) {
+  ColumnFamilyHandleImplDummy cf1(6, nullptr);
+  ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator());
+  ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator());
+
+  ASSERT_OK(batch_->Put(&cf1, "ddd", ""));
+  ASSERT_OK(batch_->Put(&cf2, "aaa", ""));
+  ASSERT_OK(batch_->Put(&cf2, "eee", ""));
+  ASSERT_OK(batch_->Put(&cf1, "ccc", ""));
+  ASSERT_OK(batch_->Put(&reverse_cf, "a11", ""));
+  ASSERT_OK(batch_->Put(&cf1, "bbb", ""));
+
+  Slice key_slices[] = {"a", "3", "3"};
+  Slice value_slice = "";
+  ASSERT_OK(batch_->Put(&reverse_cf, SliceParts(key_slices, 3),
+                        SliceParts(&value_slice, 1)));
+  ASSERT_OK(batch_->Put(&reverse_cf, "a22", ""));
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&cf1));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("bbb", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("ccc", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("ddd", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&cf2));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("aaa", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("eee", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&reverse_cf));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("z");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a33", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a22", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a11", iter->Entry().key.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("a22");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a22", iter->Entry().key.ToString());
+
+    iter->Seek("a13");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a11", iter->Entry().key.ToString());
+  }
+}
+
+TEST_F(WBWIOverwriteTest, TestOverwriteKey) {
+  ColumnFamilyHandleImplDummy cf1(6, nullptr);
+  ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator());
+  ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator());
+
+  ASSERT_OK(batch_->Merge(&cf1, "ddd", ""));
+  ASSERT_OK(batch_->Put(&cf1, "ddd", ""));
+  ASSERT_OK(batch_->Delete(&cf1, "ddd"));
+  ASSERT_OK(batch_->Put(&cf2, "aaa", ""));
+  ASSERT_OK(batch_->Delete(&cf2, "aaa"));
+  ASSERT_OK(batch_->Put(&cf2, "aaa", "aaa"));
+  ASSERT_OK(batch_->Put(&cf2, "eee", "eee"));
+  ASSERT_OK(batch_->Put(&cf1, "ccc", ""));
+  ASSERT_OK(batch_->Put(&reverse_cf, "a11", ""));
+  ASSERT_OK(batch_->Delete(&cf1, "ccc"));
+  ASSERT_OK(batch_->Put(&reverse_cf, "a33", "a33"));
+  ASSERT_OK(batch_->Put(&reverse_cf, "a11", "a11"));
+  Slice slices[] = {"a", "3", "3"};
+  ASSERT_OK(batch_->Delete(&reverse_cf, SliceParts(slices, 3)));
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&cf1));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("ccc", iter->Entry().key.ToString());
+    ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("ddd", iter->Entry().key.ToString());
+    ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&cf2));
+    iter->SeekToLast();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("eee", iter->Entry().key.ToString());
+    ASSERT_EQ("eee", iter->Entry().value.ToString());
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("aaa", iter->Entry().key.ToString());
+    ASSERT_EQ("aaa", iter->Entry().value.ToString());
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("aaa", iter->Entry().key.ToString());
+    ASSERT_EQ("aaa", iter->Entry().value.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("eee", iter->Entry().key.ToString());
+    ASSERT_EQ("eee", iter->Entry().value.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  {
+    std::unique_ptr<WBWIIterator> iter(batch_->NewIterator(&reverse_cf));
+    iter->Seek("");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("z");
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a33", iter->Entry().key.ToString());
+    ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a11", iter->Entry().key.ToString());
+    ASSERT_EQ("a11", iter->Entry().value.ToString());
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a11", iter->Entry().key.ToString());
+    ASSERT_EQ("a11", iter->Entry().value.ToString());
+    iter->Prev();
+
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("a33", iter->Entry().key.ToString());
+    ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord);
+    iter->Prev();
+    ASSERT_TRUE(!iter->Valid());
+  }
+}
+
+TEST_P(WriteBatchWithIndexTest, TestWBWIIterator) {
+  ColumnFamilyHandleImplDummy cf1(1, BytewiseComparator());
+  ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator());
+  ASSERT_OK(batch_->Put(&cf1, "a", "a1"));
+  ASSERT_OK(batch_->Put(&cf1, "c", "c1"));
+  ASSERT_OK(batch_->Put(&cf1, "c", "c2"));
+  ASSERT_OK(batch_->Put(&cf1, "e", "e1"));
+  ASSERT_OK(batch_->Put(&cf1, "e", "e2"));
+  ASSERT_OK(batch_->Put(&cf1, "e", "e3"));
+  std::unique_ptr<WBWIIteratorImpl> iter1(
+      static_cast<WBWIIteratorImpl*>(batch_->NewIterator(&cf1)));
+  std::unique_ptr<WBWIIteratorImpl> iter2(
+      static_cast<WBWIIteratorImpl*>(batch_->NewIterator(&cf2)));
+  AssertIterEqual(iter1.get(), {"a", "c", "e"});
+  AssertIterEqual(iter2.get(), {});
+  ASSERT_OK(batch_->Put(&cf2, "a", "a2"));
+  ASSERT_OK(batch_->Merge(&cf2, "b", "b1"));
+  ASSERT_OK(batch_->Merge(&cf2, "b", "b2"));
+  ASSERT_OK(batch_->Delete(&cf2, "d"));
+  ASSERT_OK(batch_->Merge(&cf2, "d", "d2"));
+  ASSERT_OK(batch_->Merge(&cf2, "d", "d3"));
+  ASSERT_OK(batch_->Delete(&cf2, "f"));
+  AssertIterEqual(iter1.get(), {"a", "c", "e"});
+  AssertIterEqual(iter2.get(), {"a", "b", "d", "f"});
+}
+
+TEST_P(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) {
+  std::vector<std::string> source_strings = {"a", "b", "c", "d", "e",
+                                             "f", "g", "h", "i", "j"};
+  for (int rand_seed = 301; rand_seed < 366; rand_seed++) {
+    Random rnd(rand_seed);
+
+    ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator());
+    ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator());
+    ColumnFamilyHandleImplDummy cf3(8, BytewiseComparator());
+    batch_->Clear();
+
+    if (rand_seed % 2 == 0) {
+      ASSERT_OK(batch_->Put(&cf2, "zoo", "bar"));
+    }
+    if (rand_seed % 4 == 1) {
+      ASSERT_OK(batch_->Put(&cf3, "zoo", "bar"));
+    }
+
+    KVMap map;
+    KVMap merged_map;
+    for (auto key : source_strings) {
+      std::string value = key + key;
+      int type = rnd.Uniform(6);
+      switch (type) {
+        case 0:
+          // only base has it
+          map[key] = value;
+          merged_map[key] = value;
+          break;
+        case 1:
+          // only delta has it
+          ASSERT_OK(batch_->Put(&cf1, key, value));
+          map[key] = value;
+          merged_map[key] = value;
+          break;
+        case 2:
+          // both has it. Delta should win
+          ASSERT_OK(batch_->Put(&cf1, key, value));
+          map[key] = "wrong_value";
+          merged_map[key] = value;
+          break;
+        case 3:
+          // both has it. Delta is delete
+          ASSERT_OK(batch_->Delete(&cf1, key));
+          map[key] = "wrong_value";
+          break;
+        case 4:
+          // only delta has it. Delta is delete
+          ASSERT_OK(batch_->Delete(&cf1, key));
+          map[key] = "wrong_value";
+          break;
+        default:
+          // Neither iterator has it.
+          break;
+      }
+    }
+
+    std::unique_ptr<Iterator> iter(
+        batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+    std::unique_ptr<Iterator> result_iter(new KVIter(&merged_map));
+
+    bool is_valid = false;
+    for (int i = 0; i < 128; i++) {
+      // Random walk and make sure iter and result_iter returns the
+      // same key and value
+      int type = rnd.Uniform(6);
+      ASSERT_OK(iter->status());
+      switch (type) {
+        case 0:
+          // Seek to First
+          iter->SeekToFirst();
+          result_iter->SeekToFirst();
+          break;
+        case 1:
+          // Seek to last
+          iter->SeekToLast();
+          result_iter->SeekToLast();
+          break;
+        case 2: {
+          // Seek to random key
+          auto key_idx = rnd.Uniform(static_cast<int>(source_strings.size()));
+          auto key = source_strings[key_idx];
+          iter->Seek(key);
+          result_iter->Seek(key);
+          break;
+        }
+        case 3: {
+          // SeekForPrev to random key
+          auto key_idx = rnd.Uniform(static_cast<int>(source_strings.size()));
+          auto key = source_strings[key_idx];
+          iter->SeekForPrev(key);
+          result_iter->SeekForPrev(key);
+          break;
+        }
+        case 4:
+          // Next
+          if (is_valid) {
+            iter->Next();
+            result_iter->Next();
+          } else {
+            continue;
+          }
+          break;
+        default:
+          assert(type == 5);
+          // Prev
+          if (is_valid) {
+            iter->Prev();
+            result_iter->Prev();
+          } else {
+            continue;
+          }
+          break;
+      }
+      AssertItersMatch(iter.get(), result_iter.get());
+      is_valid = iter->Valid();
+    }
+
+    ASSERT_OK(iter->status());
+  }
+}
+
+TEST_P(WriteBatchWithIndexTest, TestIteraratorWithBase) {
+  ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator());
+  ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator());
+  {
+    KVMap map;
+    map["a"] = "aa";
+    map["c"] = "cc";
+    map["e"] = "ee";
+    std::unique_ptr<Iterator> iter(
+        batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "e", "ee");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "e", "ee");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Prev();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("b");
+    AssertIter(iter.get(), "c", "cc");
+
+    iter->Prev();
+    AssertIter(iter.get(), "a", "aa");
+
+    iter->Seek("a");
+    AssertIter(iter.get(), "a", "aa");
+  }
+
+  // Test the case that there is one element in the write batch
+  ASSERT_OK(batch_->Put(&cf2, "zoo", "bar"));
+  ASSERT_OK(batch_->Put(&cf1, "a", "aa"));
+  {
+    KVMap empty_map;
+    std::unique_ptr<Iterator> iter(
+        batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  ASSERT_OK(batch_->Delete(&cf1, "b"));
+  ASSERT_OK(batch_->Put(&cf1, "c", "cc"));
+  ASSERT_OK(batch_->Put(&cf1, "d", "dd"));
+  ASSERT_OK(batch_->Delete(&cf1, "e"));
+
+  {
+    KVMap map;
+    map["b"] = "";
+    map["cc"] = "cccc";
+    map["f"] = "ff";
+    std::unique_ptr<Iterator> iter(
+        batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "cc", "cccc");
+    iter->Next();
+    AssertIter(iter.get(), "d", "dd");
+    iter->Next();
+    AssertIter(iter.get(), "f", "ff");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "f", "ff");
+    iter->Prev();
+    AssertIter(iter.get(), "d", "dd");
+    iter->Prev();
+    AssertIter(iter.get(), "cc", "cccc");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "cc", "cccc");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Prev();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("c");
+    AssertIter(iter.get(), "c", "cc");
+
+    iter->Seek("cb");
+    AssertIter(iter.get(), "cc", "cccc");
+
+    iter->Seek("cc");
+    AssertIter(iter.get(), "cc", "cccc");
+    iter->Next();
+    AssertIter(iter.get(), "d", "dd");
+
+    iter->Seek("e");
+    AssertIter(iter.get(), "f", "ff");
+
+    iter->Prev();
+    AssertIter(iter.get(), "d", "dd");
+
+    iter->Next();
+    AssertIter(iter.get(), "f", "ff");
+  }
+
+  {
+    KVMap empty_map;
+    std::unique_ptr<Iterator> iter(
+        batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "d", "dd");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "d", "dd");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Prev();
+    AssertIter(iter.get(), "a", "aa");
+
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("aa");
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "d", "dd");
+
+    iter->Seek("ca");
+    AssertIter(iter.get(), "d", "dd");
+
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+  }
+}
+
+TEST_P(WriteBatchWithIndexTest, TestIteraratorWithBaseReverseCmp) {
+  ColumnFamilyHandleImplDummy cf1(6, ReverseBytewiseComparator());
+  ColumnFamilyHandleImplDummy cf2(2, ReverseBytewiseComparator());
+
+  // Test the case that there is one element in the write batch
+  ASSERT_OK(batch_->Put(&cf2, "zoo", "bar"));
+  ASSERT_OK(batch_->Put(&cf1, "a", "aa"));
+  {
+    KVMap empty_map;
+    std::unique_ptr<Iterator> iter(
+        batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  ASSERT_OK(batch_->Put(&cf1, "c", "cc"));
+  {
+    KVMap map;
+    std::unique_ptr<Iterator> iter(
+        batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Next();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "a", "aa");
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("b");
+    AssertIter(iter.get(), "a", "aa");
+
+    iter->Prev();
+    AssertIter(iter.get(), "c", "cc");
+
+    iter->Seek("a");
+    AssertIter(iter.get(), "a", "aa");
+  }
+
+  // default column family
+  ASSERT_OK(batch_->Put("a", "b"));
+  {
+    KVMap map;
+    map["b"] = "";
+    std::unique_ptr<Iterator> iter(
+        batch_->NewIteratorWithBase(new KVIter(&map)));
+
+    iter->SeekToFirst();
+    AssertIter(iter.get(), "a", "b");
+    iter->Next();
+    AssertIter(iter.get(), "b", "");
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->SeekToLast();
+    AssertIter(iter.get(), "b", "");
+    iter->Prev();
+    AssertIter(iter.get(), "a", "b");
+    iter->Prev();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+
+    iter->Seek("b");
+    AssertIter(iter.get(), "b", "");
+
+    iter->Prev();
+    AssertIter(iter.get(), "a", "b");
+
+    iter->Seek("0");
+    AssertIter(iter.get(), "a", "b");
+  }
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatch) {
+  Options options;
+  Status s;
+  std::string value;
+
+  s = batch_->GetFromBatch(options_, "b", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(batch_->Put("a", "a"));
+  ASSERT_OK(batch_->Put("b", "b"));
+  ASSERT_OK(batch_->Put("c", "c"));
+  ASSERT_OK(batch_->Put("a", "z"));
+  ASSERT_OK(batch_->Delete("c"));
+  ASSERT_OK(batch_->Delete("d"));
+  ASSERT_OK(batch_->Delete("e"));
+  ASSERT_OK(batch_->Put("e", "e"));
+
+  s = batch_->GetFromBatch(options_, "b", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = batch_->GetFromBatch(options_, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("z", value);
+
+  s = batch_->GetFromBatch(options_, "c", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = batch_->GetFromBatch(options_, "d", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = batch_->GetFromBatch(options_, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = batch_->GetFromBatch(options_, "e", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("e", value);
+
+  ASSERT_OK(batch_->Merge("z", "z"));
+
+  s = batch_->GetFromBatch(options_, "z", &value);
+  ASSERT_NOK(s);  // No merge operator specified.
+
+  s = batch_->GetFromBatch(options_, "b", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatchMerge) {
+  Status s = OpenDB();
+  ASSERT_OK(s);
+
+  ColumnFamilyHandle* column_family = db_->DefaultColumnFamily();
+  std::string value;
+
+  s = batch_->GetFromBatch(options_, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(batch_->Put("x", "X"));
+  std::string expected = "X";
+
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(batch_->Merge("x", std::to_string(i)));
+    expected = expected + "," + std::to_string(i);
+
+    if (i % 2 == 0) {
+      ASSERT_OK(batch_->Put("y", std::to_string(i / 2)));
+    }
+
+    ASSERT_OK(batch_->Merge("z", "z"));
+
+    s = batch_->GetFromBatch(column_family, options_, "x", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ(expected, value);
+
+    s = batch_->GetFromBatch(column_family, options_, "y", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ(std::to_string(i / 2), value);
+
+    s = batch_->GetFromBatch(column_family, options_, "z", &value);
+    ASSERT_TRUE(s.IsMergeInProgress());
+  }
+}
+
+TEST_F(WBWIOverwriteTest, TestGetFromBatchMerge2) {
+  Status s = OpenDB();
+  ASSERT_OK(s);
+
+  ColumnFamilyHandle* column_family = db_->DefaultColumnFamily();
+  std::string value;
+
+  s = batch_->GetFromBatch(column_family, options_, "X", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(batch_->Put(column_family, "X", "x"));
+  ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+  ASSERT_EQ("x", value);
+
+  ASSERT_OK(batch_->Put(column_family, "X", "x2"));
+  ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+  ASSERT_EQ("x2", value);
+
+  ASSERT_OK(batch_->Merge(column_family, "X", "aaa"));
+  ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+  ASSERT_EQ("x2,aaa", value);
+
+  ASSERT_OK(batch_->Merge(column_family, "X", "bbb"));
+  ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+  ASSERT_EQ("x2,aaa,bbb", value);
+
+  ASSERT_OK(batch_->Put(column_family, "X", "x3"));
+  ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+  ASSERT_EQ("x3", value);
+
+  ASSERT_OK(batch_->Merge(column_family, "X", "ccc"));
+  ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+  ASSERT_EQ("x3,ccc", value);
+
+  ASSERT_OK(batch_->Delete(column_family, "X"));
+  s = batch_->GetFromBatch(column_family, options_, "X", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  batch_->Merge(column_family, "X", "ddd");
+  ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
+  ASSERT_EQ("ddd", value);
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDB) {
+  ASSERT_OK(OpenDB());
+
+  std::string value;
+
+  ASSERT_OK(db_->Put(write_opts_, "a", "a"));
+  ASSERT_OK(db_->Put(write_opts_, "b", "b"));
+  ASSERT_OK(db_->Put(write_opts_, "c", "c"));
+
+  ASSERT_OK(batch_->Put("a", "batch_->a"));
+  ASSERT_OK(batch_->Delete("b"));
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_EQ("batch_->a", value);
+
+  Status s = batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+  ASSERT_EQ("c", value);
+
+  s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(db_->Delete(write_opts_, "x"));
+
+  s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge) {
+  Status s = OpenDB();
+  ASSERT_OK(s);
+
+  std::string value;
+
+  ASSERT_OK(db_->Put(write_opts_, "a", "a0"));
+  ASSERT_OK(db_->Put(write_opts_, "b", "b0"));
+  ASSERT_OK(db_->Merge(write_opts_, "b", "b1"));
+  ASSERT_OK(db_->Merge(write_opts_, "c", "c0"));
+  ASSERT_OK(db_->Merge(write_opts_, "d", "d0"));
+
+  ASSERT_OK(batch_->Merge("a", "a1"));
+  ASSERT_OK(batch_->Merge("a", "a2"));
+  ASSERT_OK(batch_->Merge("b", "b2"));
+  ASSERT_OK(batch_->Merge("d", "d1"));
+  ASSERT_OK(batch_->Merge("e", "e0"));
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_EQ("a0,a1,a2", value);
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+  ASSERT_EQ("b0,b1,b2", value);
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+  ASSERT_EQ("c0", value);
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value));
+  ASSERT_EQ("d0,d1", value);
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+  ASSERT_EQ("e0", value);
+
+  ASSERT_OK(db_->Delete(write_opts_, "x"));
+
+  s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ReadOptions snapshot_read_options;
+  snapshot_read_options.snapshot = snapshot;
+
+  ASSERT_OK(db_->Delete(write_opts_, "a"));
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_EQ("a1,a2", value);
+
+  ASSERT_OK(
+      s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value));
+  ASSERT_EQ("a0,a1,a2", value);
+
+  ASSERT_OK(batch_->Delete("a"));
+
+  s = batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(s = db_->Merge(write_opts_, "c", "c1"));
+
+  ASSERT_OK(s = batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+  ASSERT_EQ("c0,c1", value);
+
+  ASSERT_OK(
+      s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "c", &value));
+  ASSERT_EQ("c0", value);
+
+  ASSERT_OK(db_->Put(write_opts_, "e", "e1"));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+  ASSERT_EQ("e1,e0", value);
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value));
+  ASSERT_EQ("e0", value);
+
+  ASSERT_OK(s = db_->Delete(write_opts_, "e"));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+  ASSERT_EQ("e0", value);
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value));
+  ASSERT_EQ("e0", value);
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(WBWIOverwriteTest, TestGetFromBatchAndDBMerge2) {
+  Status s = OpenDB();
+  ASSERT_OK(s);
+
+  std::string value;
+
+  s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(batch_->Merge("A", "xxx"));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_EQ(value, "xxx");
+
+  ASSERT_OK(batch_->Merge("A", "yyy"));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_EQ(value, "xxx,yyy");
+
+  ASSERT_OK(db_->Put(write_opts_, "A", "a0"));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_EQ(value, "a0,xxx,yyy");
+
+  ASSERT_OK(batch_->Delete("A"));
+
+  s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge3) {
+  Status s = OpenDB();
+  ASSERT_OK(s);
+
+  FlushOptions flush_options;
+  std::string value;
+
+  ASSERT_OK(db_->Put(write_opts_, "A", "1"));
+  ASSERT_OK(db_->Flush(flush_options, db_->DefaultColumnFamily()));
+  ASSERT_OK(batch_->Merge("A", "2"));
+
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_EQ(value, "1,2");
+}
+
+TEST_P(WriteBatchWithIndexTest, TestPinnedGetFromBatchAndDB) {
+  Status s = OpenDB();
+  ASSERT_OK(s);
+
+  PinnableSlice value;
+
+  ASSERT_OK(db_->Put(write_opts_, "a", "a0"));
+  ASSERT_OK(db_->Put(write_opts_, "b", "b0"));
+  ASSERT_OK(db_->Merge(write_opts_, "b", "b1"));
+  ASSERT_OK(db_->Merge(write_opts_, "c", "c0"));
+  ASSERT_OK(db_->Merge(write_opts_, "d", "d0"));
+  ASSERT_OK(batch_->Merge("a", "a1"));
+  ASSERT_OK(batch_->Merge("a", "a2"));
+  ASSERT_OK(batch_->Merge("b", "b2"));
+  ASSERT_OK(batch_->Merge("d", "d1"));
+  ASSERT_OK(batch_->Merge("e", "e0"));
+
+  for (int i = 0; i < 2; i++) {
+    if (i == 1) {
+      // Do it again with a flushed DB...
+      ASSERT_OK(db_->Flush(FlushOptions(), db_->DefaultColumnFamily()));
+    }
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+    ASSERT_EQ("a0,a1,a2", value.ToString());
+
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+    ASSERT_EQ("b0,b1,b2", value.ToString());
+
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+    ASSERT_EQ("c0", value.ToString());
+
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value));
+    ASSERT_EQ("d0,d1", value.ToString());
+
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+    ASSERT_EQ("e0", value.ToString());
+    ASSERT_OK(db_->Delete(write_opts_, "x"));
+
+    s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+}
+
+void AssertKey(std::string key, WBWIIterator* iter) {
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(key, iter->Entry().key.ToString());
+}
+
+void AssertValue(std::string value, WBWIIterator* iter) {
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value, iter->Entry().value.ToString());
+}
+
+// Tests that we can write to the WBWI while we iterate (from a single thread).
+// iteration should see the newest writes
+TEST_F(WBWIOverwriteTest, MutateWhileIteratingCorrectnessTest) {
+  for (char c = 'a'; c <= 'z'; ++c) {
+    ASSERT_OK(batch_->Put(std::string(1, c), std::string(1, c)));
+  }
+
+  std::unique_ptr<WBWIIterator> iter(batch_->NewIterator());
+  iter->Seek("k");
+  AssertKey("k", iter.get());
+  iter->Next();
+  AssertKey("l", iter.get());
+  ASSERT_OK(batch_->Put("ab", "cc"));
+  iter->Next();
+  AssertKey("m", iter.get());
+  ASSERT_OK(batch_->Put("mm", "kk"));
+  iter->Next();
+  AssertKey("mm", iter.get());
+  AssertValue("kk", iter.get());
+  ASSERT_OK(batch_->Delete("mm"));
+
+  iter->Next();
+  AssertKey("n", iter.get());
+  iter->Prev();
+  AssertKey("mm", iter.get());
+  ASSERT_EQ(kDeleteRecord, iter->Entry().type);
+
+  iter->Seek("ab");
+  AssertKey("ab", iter.get());
+  ASSERT_OK(batch_->Delete("x"));
+  iter->Seek("x");
+  AssertKey("x", iter.get());
+  ASSERT_EQ(kDeleteRecord, iter->Entry().type);
+  iter->Prev();
+  AssertKey("w", iter.get());
+}
+
+void AssertIterKey(std::string key, Iterator* iter) {
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(key, iter->key().ToString());
+}
+
+void AssertIterValue(std::string value, Iterator* iter) {
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value, iter->value().ToString());
+}
+
+// same thing as above, but testing IteratorWithBase
+TEST_F(WBWIOverwriteTest, MutateWhileIteratingBaseCorrectnessTest) {
+  WriteBatchWithIndex batch(BytewiseComparator(), 0, true);
+  for (char c = 'a'; c <= 'z'; ++c) {
+    ASSERT_OK(batch_->Put(std::string(1, c), std::string(1, c)));
+  }
+
+  KVMap map;
+  map["aa"] = "aa";
+  map["cc"] = "cc";
+  map["ee"] = "ee";
+  map["em"] = "me";
+
+  std::unique_ptr<Iterator> iter(batch_->NewIteratorWithBase(new KVIter(&map)));
+  iter->Seek("k");
+  AssertIterKey("k", iter.get());
+  iter->Next();
+  AssertIterKey("l", iter.get());
+  ASSERT_OK(batch_->Put("ab", "cc"));
+  iter->Next();
+  AssertIterKey("m", iter.get());
+  ASSERT_OK(batch_->Put("mm", "kk"));
+  iter->Next();
+  AssertIterKey("mm", iter.get());
+  AssertIterValue("kk", iter.get());
+  ASSERT_OK(batch_->Delete("mm"));
+  iter->Next();
+  AssertIterKey("n", iter.get());
+  iter->Prev();
+  // "mm" is deleted, so we're back at "m"
+  AssertIterKey("m", iter.get());
+
+  iter->Seek("ab");
+  AssertIterKey("ab", iter.get());
+  iter->Prev();
+  AssertIterKey("aa", iter.get());
+  iter->Prev();
+  AssertIterKey("a", iter.get());
+  ASSERT_OK(batch_->Delete("aa"));
+  iter->Next();
+  AssertIterKey("ab", iter.get());
+  iter->Prev();
+  AssertIterKey("a", iter.get());
+
+  ASSERT_OK(batch_->Delete("x"));
+  iter->Seek("x");
+  AssertIterKey("y", iter.get());
+  iter->Next();
+  AssertIterKey("z", iter.get());
+  iter->Prev();
+  iter->Prev();
+  AssertIterKey("w", iter.get());
+
+  ASSERT_OK(batch_->Delete("e"));
+  iter->Seek("e");
+  AssertIterKey("ee", iter.get());
+  AssertIterValue("ee", iter.get());
+  ASSERT_OK(batch_->Put("ee", "xx"));
+  // still the same value
+  AssertIterValue("ee", iter.get());
+  iter->Next();
+  AssertIterKey("em", iter.get());
+  iter->Prev();
+  // new value
+  AssertIterValue("xx", iter.get());
+
+  ASSERT_OK(iter->status());
+}
+
+// stress testing mutations with IteratorWithBase
+TEST_F(WBWIOverwriteTest, MutateWhileIteratingBaseStressTest) {
+  for (char c = 'a'; c <= 'z'; ++c) {
+    ASSERT_OK(batch_->Put(std::string(1, c), std::string(1, c)));
+  }
+
+  KVMap map;
+  for (char c = 'a'; c <= 'z'; ++c) {
+    map[std::string(2, c)] = std::string(2, c);
+  }
+
+  std::unique_ptr<Iterator> iter(batch_->NewIteratorWithBase(new KVIter(&map)));
+
+  Random rnd(301);
+  for (int i = 0; i < 1000000; ++i) {
+    int random = rnd.Uniform(8);
+    char c = static_cast<char>(rnd.Uniform(26) + 'a');
+    switch (random) {
+      case 0:
+        ASSERT_OK(batch_->Put(std::string(1, c), "xxx"));
+        break;
+      case 1:
+        ASSERT_OK(batch_->Put(std::string(2, c), "xxx"));
+        break;
+      case 2:
+        ASSERT_OK(batch_->Delete(std::string(1, c)));
+        break;
+      case 3:
+        ASSERT_OK(batch_->Delete(std::string(2, c)));
+        break;
+      case 4:
+        iter->Seek(std::string(1, c));
+        break;
+      case 5:
+        iter->Seek(std::string(2, c));
+        break;
+      case 6:
+        if (iter->Valid()) {
+          iter->Next();
+        }
+        break;
+      case 7:
+        if (iter->Valid()) {
+          iter->Prev();
+        }
+        break;
+      default:
+        assert(false);
+    }
+  }
+  ASSERT_OK(iter->status());
+}
+
+TEST_P(WriteBatchWithIndexTest, TestNewIteratorWithBaseFromWbwi) {
+  ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator());
+  KVMap map;
+  map["a"] = "aa";
+  map["c"] = "cc";
+  map["e"] = "ee";
+  std::unique_ptr<Iterator> iter(
+      batch_->NewIteratorWithBase(&cf1, new KVIter(&map)));
+  ASSERT_NE(nullptr, iter);
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+}
+
+TEST_P(WriteBatchWithIndexTest, SavePointTest) {
+  ColumnFamilyHandleImplDummy cf1(1, BytewiseComparator());
+  KVMap empty_map;
+  std::unique_ptr<Iterator> cf0_iter(
+      batch_->NewIteratorWithBase(new KVIter(&empty_map)));
+  std::unique_ptr<Iterator> cf1_iter(
+      batch_->NewIteratorWithBase(&cf1, new KVIter(&empty_map)));
+  Status s;
+  KVMap kvm_cf0_0 = {{"A", "aa"}, {"B", "b"}};
+  KVMap kvm_cf1_0 = {{"A", "a1"}, {"C", "c1"}, {"E", "e1"}};
+  KVIter kvi_cf0_0(&kvm_cf0_0);
+  KVIter kvi_cf1_0(&kvm_cf1_0);
+
+  ASSERT_OK(batch_->Put("A", "a"));
+  ASSERT_OK(batch_->Put("B", "b"));
+  ASSERT_OK(batch_->Put("A", "aa"));
+  ASSERT_OK(batch_->Put(&cf1, "A", "a1"));
+  ASSERT_OK(batch_->Delete(&cf1, "B"));
+  ASSERT_OK(batch_->Put(&cf1, "C", "c1"));
+  ASSERT_OK(batch_->Put(&cf1, "E", "e1"));
+
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_0);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_0);
+  batch_->SetSavePoint();  // 1
+
+  KVMap kvm_cf0_1 = {{"B", "bb"}, {"C", "cc"}};
+  KVMap kvm_cf1_1 = {{"B", "b1"}, {"C", "c1"}};
+  KVIter kvi_cf0_1(&kvm_cf0_1);
+  KVIter kvi_cf1_1(&kvm_cf1_1);
+
+  ASSERT_OK(batch_->Put("C", "cc"));
+  ASSERT_OK(batch_->Put("B", "bb"));
+  ASSERT_OK(batch_->Delete("A"));
+  ASSERT_OK(batch_->Put(&cf1, "B", "b1"));
+  ASSERT_OK(batch_->Delete(&cf1, "A"));
+  ASSERT_OK(batch_->SingleDelete(&cf1, "E"));
+  batch_->SetSavePoint();  // 2
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_1);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_1);
+
+  KVMap kvm_cf0_2 = {{"A", "xxx"}, {"C", "cc"}};
+  KVMap kvm_cf1_2 = {{"B", "b2"}};
+  KVIter kvi_cf0_2(&kvm_cf0_2);
+  KVIter kvi_cf1_2(&kvm_cf1_2);
+
+  ASSERT_OK(batch_->Put("A", "aaa"));
+  ASSERT_OK(batch_->Put("A", "xxx"));
+  ASSERT_OK(batch_->Delete("B"));
+  ASSERT_OK(batch_->Put(&cf1, "B", "b2"));
+  ASSERT_OK(batch_->Delete(&cf1, "C"));
+  batch_->SetSavePoint();  // 3
+  batch_->SetSavePoint();  // 4
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_2);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_2);
+
+  KVMap kvm_cf0_4 = {{"A", "xxx"}, {"C", "cc"}};
+  KVMap kvm_cf1_4 = {{"B", "b2"}};
+  KVIter kvi_cf0_4(&kvm_cf0_4);
+  KVIter kvi_cf1_4(&kvm_cf1_4);
+  ASSERT_OK(batch_->SingleDelete("D"));
+  ASSERT_OK(batch_->Delete(&cf1, "D"));
+  ASSERT_OK(batch_->Delete(&cf1, "E"));
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_4);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_4);
+
+  ASSERT_OK(batch_->RollbackToSavePoint());  // rollback to 4
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_2);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_2);
+
+  ASSERT_OK(batch_->RollbackToSavePoint());  // rollback to 3
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_2);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_2);
+
+  ASSERT_OK(batch_->RollbackToSavePoint());  // rollback to 2
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_1);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_1);
+
+  batch_->SetSavePoint();  // 5
+  ASSERT_OK(batch_->Put("X", "x"));
+
+  KVMap kvm_cf0_5 = {{"B", "bb"}, {"C", "cc"}, {"X", "x"}};
+  KVIter kvi_cf0_5(&kvm_cf0_5);
+  KVIter kvi_cf1_5(&kvm_cf1_1);
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_5);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_5);
+
+  ASSERT_OK(batch_->RollbackToSavePoint());  // rollback to 5
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_1);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_1);
+
+  ASSERT_OK(batch_->RollbackToSavePoint());  // rollback to 1
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_0);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_0);
+
+  s = batch_->RollbackToSavePoint();  // no savepoint found
+  ASSERT_TRUE(s.IsNotFound());
+  AssertItersEqual(cf0_iter.get(), &kvi_cf0_0);
+  AssertItersEqual(cf1_iter.get(), &kvi_cf1_0);
+
+  batch_->SetSavePoint();  // 6
+
+  batch_->Clear();
+  ASSERT_EQ("", PrintContents(batch_.get(), nullptr));
+  ASSERT_EQ("", PrintContents(batch_.get(), &cf1));
+
+  s = batch_->RollbackToSavePoint();  // rollback to 6
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(WriteBatchWithIndexTest, SingleDeleteTest) {
+  Status s;
+  std::string value;
+
+  ASSERT_OK(batch_->SingleDelete("A"));
+
+  s = batch_->GetFromBatch(options_, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch_->GetFromBatch(options_, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  batch_->Clear();
+  ASSERT_OK(batch_->Put("A", "a"));
+  ASSERT_OK(batch_->Put("A", "a2"));
+  ASSERT_OK(batch_->Put("B", "b"));
+  ASSERT_OK(batch_->SingleDelete("A"));
+
+  s = batch_->GetFromBatch(options_, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch_->GetFromBatch(options_, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  ASSERT_OK(batch_->Put("C", "c"));
+  ASSERT_OK(batch_->Put("A", "a3"));
+  ASSERT_OK(batch_->Delete("B"));
+  ASSERT_OK(batch_->SingleDelete("B"));
+  ASSERT_OK(batch_->SingleDelete("C"));
+
+  s = batch_->GetFromBatch(options_, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a3", value);
+  s = batch_->GetFromBatch(options_, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch_->GetFromBatch(options_, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch_->GetFromBatch(options_, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(batch_->Put("B", "b4"));
+  ASSERT_OK(batch_->Put("C", "c4"));
+  ASSERT_OK(batch_->Put("D", "d4"));
+  ASSERT_OK(batch_->SingleDelete("D"));
+  ASSERT_OK(batch_->SingleDelete("D"));
+  ASSERT_OK(batch_->Delete("A"));
+
+  s = batch_->GetFromBatch(options_, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch_->GetFromBatch(options_, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b4", value);
+  s = batch_->GetFromBatch(options_, "C", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c4", value);
+  s = batch_->GetFromBatch(options_, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_P(WriteBatchWithIndexTest, SingleDeleteDeltaIterTest) {
+  std::string value;
+  ASSERT_OK(batch_->Put("A", "a"));
+  ASSERT_OK(batch_->Put("A", "a2"));
+  ASSERT_OK(batch_->Put("B", "b"));
+  ASSERT_OK(batch_->SingleDelete("A"));
+  ASSERT_OK(batch_->Delete("B"));
+
+  KVMap map;
+  value = PrintContents(batch_.get(), &map, nullptr);
+  ASSERT_EQ("", value);
+
+  map["A"] = "aa";
+  map["C"] = "cc";
+  map["D"] = "dd";
+
+  ASSERT_OK(batch_->SingleDelete("B"));
+  ASSERT_OK(batch_->SingleDelete("C"));
+  ASSERT_OK(batch_->SingleDelete("Z"));
+
+  value = PrintContents(batch_.get(), &map, nullptr);
+  ASSERT_EQ("D:dd,", value);
+
+  ASSERT_OK(batch_->Put("A", "a3"));
+  ASSERT_OK(batch_->Put("B", "b3"));
+  ASSERT_OK(batch_->SingleDelete("A"));
+  ASSERT_OK(batch_->SingleDelete("A"));
+  ASSERT_OK(batch_->SingleDelete("D"));
+  ASSERT_OK(batch_->SingleDelete("D"));
+  ASSERT_OK(batch_->Delete("D"));
+
+  map["E"] = "ee";
+
+  value = PrintContents(batch_.get(), &map, nullptr);
+  ASSERT_EQ("B:b3,E:ee,", value);
+}
+
+TEST_P(WriteBatchWithIndexTest, MultiGetTest) {
+  // MultiGet a lot of keys in order to force std::vector reallocations
+  std::vector<std::string> keys;
+  for (int i = 0; i < 100; ++i) {
+    keys.emplace_back(std::to_string(i));
+  }
+
+  ASSERT_OK(OpenDB());
+  ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+  // Write some data to the db for the even numbered keys
+  {
+    WriteBatch wb;
+    for (size_t i = 0; i < keys.size(); i += 2) {
+      std::string val = "val" + std::to_string(i);
+      ASSERT_OK(wb.Put(cf0, keys[i], val));
+    }
+    ASSERT_OK(db_->Write(write_opts_, &wb));
+    for (size_t i = 0; i < keys.size(); i += 2) {
+      std::string value;
+      ASSERT_OK(db_->Get(read_opts_, cf0, keys[i], &value));
+    }
+  }
+
+  // Write some data to the batch
+  for (size_t i = 0; i < keys.size(); ++i) {
+    if ((i % 5) == 0) {
+      ASSERT_OK(batch_->Delete(cf0, keys[i]));
+    } else if ((i % 7) == 0) {
+      std::string val = "new" + std::to_string(i);
+      ASSERT_OK(batch_->Put(cf0, keys[i], val));
+    }
+    if (i > 0 && (i % 3) == 0) {
+      ASSERT_OK(batch_->Merge(cf0, keys[i], "merge"));
+    }
+  }
+
+  std::vector<Slice> key_slices;
+  for (size_t i = 0; i < keys.size(); ++i) {
+    key_slices.emplace_back(keys[i]);
+  }
+  std::vector<PinnableSlice> values(keys.size());
+  std::vector<Status> statuses(keys.size());
+
+  batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, key_slices.size(),
+                                 key_slices.data(), values.data(),
+                                 statuses.data(), false);
+  for (size_t i = 0; i < keys.size(); ++i) {
+    if (i == 0) {
+      ASSERT_TRUE(statuses[i].IsNotFound());
+    } else if ((i % 3) == 0) {
+      ASSERT_OK(statuses[i]);
+      if ((i % 5) == 0) {  // Merge after Delete
+        ASSERT_EQ(values[i], "merge");
+      } else if ((i % 7) == 0) {  // Merge after Put
+        std::string val = "new" + std::to_string(i);
+        ASSERT_EQ(values[i], val + ",merge");
+      } else if ((i % 2) == 0) {
+        std::string val = "val" + std::to_string(i);
+        ASSERT_EQ(values[i], val + ",merge");
+      } else {
+        ASSERT_EQ(values[i], "merge");
+      }
+    } else if ((i % 5) == 0) {
+      ASSERT_TRUE(statuses[i].IsNotFound());
+    } else if ((i % 7) == 0) {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ(values[i], "new" + std::to_string(i));
+    } else if ((i % 2) == 0) {
+      ASSERT_OK(statuses[i]);
+      ASSERT_EQ(values[i], "val" + std::to_string(i));
+    } else {
+      ASSERT_TRUE(statuses[i].IsNotFound());
+    }
+  }
+}
+TEST_P(WriteBatchWithIndexTest, MultiGetTest2) {
+  // MultiGet a lot of keys in order to force std::vector reallocations
+  const int num_keys = 700;
+  const int keys_per_pass = 100;
+  std::vector<std::string> keys;
+  for (size_t i = 0; i < num_keys; ++i) {
+    keys.emplace_back(std::to_string(i));
+  }
+  ASSERT_OK(OpenDB());
+  ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+  // Keys   0- 99 have a PUT in the batch but not DB
+  // Keys 100-199 have a PUT in the DB
+  // Keys 200-299 Have a PUT/DELETE
+  // Keys 300-399 Have a PUT/DELETE/MERGE
+  // Keys 400-499 have a PUT/MERGE
+  // Keys 500-599 have a MERGE only
+  // Keys 600-699 were never written
+  {
+    WriteBatch wb;
+    for (size_t i = 100; i < 500; i++) {
+      std::string val = std::to_string(i);
+      ASSERT_OK(wb.Put(cf0, keys[i], val));
+    }
+    ASSERT_OK(db_->Write(write_opts_, &wb));
+  }
+  ASSERT_OK(db_->Flush(FlushOptions(), cf0));
+  for (size_t i = 0; i < 100; i++) {
+    ASSERT_OK(batch_->Put(cf0, keys[i], keys[i]));
+  }
+  for (size_t i = 200; i < 400; i++) {
+    ASSERT_OK(batch_->Delete(cf0, keys[i]));
+  }
+  for (size_t i = 300; i < 600; i++) {
+    std::string val = std::to_string(i) + "m";
+    ASSERT_OK(batch_->Merge(cf0, keys[i], val));
+  }
+
+  Random rnd(301);
+  std::vector<PinnableSlice> values(keys_per_pass);
+  std::vector<Status> statuses(keys_per_pass);
+  for (int pass = 0; pass < 40; pass++) {
+    std::vector<Slice> key_slices;
+    for (size_t i = 0; i < keys_per_pass; i++) {
+      int random = rnd.Uniform(num_keys);
+      key_slices.emplace_back(keys[random]);
+    }
+    batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, keys_per_pass,
+                                   key_slices.data(), values.data(),
+                                   statuses.data(), false);
+    for (size_t i = 0; i < keys_per_pass; i++) {
+      int key = ParseInt(key_slices[i].ToString());
+      switch (key / 100) {
+        case 0:  // 0-99 PUT only
+          ASSERT_OK(statuses[i]);
+          ASSERT_EQ(values[i], key_slices[i].ToString());
+          break;
+        case 1:  // 100-199 PUT only
+          ASSERT_OK(statuses[i]);
+          ASSERT_EQ(values[i], key_slices[i].ToString());
+          break;
+        case 2:  // 200-299 Deleted
+          ASSERT_TRUE(statuses[i].IsNotFound());
+          break;
+        case 3:  // 300-399 Delete+Merge
+          ASSERT_OK(statuses[i]);
+          ASSERT_EQ(values[i], key_slices[i].ToString() + "m");
+          break;
+        case 4:  // 400-400 Put+ Merge
+          ASSERT_OK(statuses[i]);
+          ASSERT_EQ(values[i], key_slices[i].ToString() + "," +
+                                   key_slices[i].ToString() + "m");
+          break;
+        case 5:  // Merge only
+          ASSERT_OK(statuses[i]);
+          ASSERT_EQ(values[i], key_slices[i].ToString() + "m");
+          break;
+        case 6:  // Never written
+          ASSERT_TRUE(statuses[i].IsNotFound());
+          break;
+        default:
+          assert(false);
+      }  // end switch
+    }    // End for each key
+  }      // end for passes
+}
+
+// This test has merges, but the merge does not play into the final result
+TEST_P(WriteBatchWithIndexTest, FakeMergeWithIteratorTest) {
+  ASSERT_OK(OpenDB());
+  ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+  // The map we are starting with
+  KVMap input = {
+      {"odm", "odm0"},
+      {"omd", "omd0"},
+      {"omp", "omp0"},
+  };
+  KVMap result = {
+      {"odm", "odm2"},  // Orig, Delete, Merge
+      {"mp", "mp1"},    // Merge, Put
+      {"omp", "omp2"},  // Origi, Merge, Put
+      {"mmp", "mmp2"}   // Merge, Merge, Put
+  };
+
+  for (auto& iter : result) {
+    EXPECT_EQ(AddToBatch(cf0, iter.first), iter.second);
+  }
+  AddToBatch(cf0, "md");   // Merge, Delete
+  AddToBatch(cf0, "mmd");  // Merge, Merge, Delete
+  AddToBatch(cf0, "omd");  // Orig, Merge, Delete
+
+  KVIter kvi(&result);
+  // First try just the batch
+  std::unique_ptr<Iterator> iter(
+      batch_->NewIteratorWithBase(cf0, new KVIter(&input)));
+  AssertItersEqual(iter.get(), &kvi);
+}
+
+TEST_P(WriteBatchWithIndexTest, IteratorMergeTest) {
+  ASSERT_OK(OpenDB());
+  ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+  KVMap result = {
+      {"m", "m0"},                // Merge
+      {"mm", "mm0,mm1"},          // Merge, Merge
+      {"dm", "dm1"},              // Delete, Merge
+      {"dmm", "dmm1,dmm2"},       // Delete, Merge, Merge
+      {"mdm", "mdm2"},            // Merge, Delete, Merge
+      {"mpm", "mpm1,mpm2"},       // Merge, Put, Merge
+      {"pm", "pm0,pm1"},          // Put, Merge
+      {"pmm", "pmm0,pmm1,pmm2"},  // Put, Merge, Merge
+  };
+
+  for (auto& iter : result) {
+    EXPECT_EQ(AddToBatch(cf0, iter.first), iter.second);
+  }
+
+  KVIter kvi(&result);
+  // First try just the batch
+  KVMap empty_map;
+  std::unique_ptr<Iterator> iter(
+      batch_->NewIteratorWithBase(cf0, new KVIter(&empty_map)));
+  AssertItersEqual(iter.get(), &kvi);
+}
+
+TEST_P(WriteBatchWithIndexTest, IteratorMergeTestWithOrig) {
+  ASSERT_OK(OpenDB());
+  ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+  KVMap original;
+  KVMap results = {
+      {"m", "om,m0"},             // Merge
+      {"mm", "omm,mm0,mm1"},      // Merge, Merge
+      {"dm", "dm1"},              // Delete, Merge
+      {"dmm", "dmm1,dmm2"},       // Delete, Merge, Merge
+      {"mdm", "mdm2"},            // Merge, Delete, Merge
+      {"mpm", "mpm1,mpm2"},       // Merge, Put, Merge
+      {"pm", "pm0,pm1"},          // Put, Merge
+      {"pmm", "pmm0,pmm1,pmm2"},  // Put, Merge, Merge
+  };
+
+  for (auto& iter : results) {
+    AddToBatch(cf0, iter.first);
+    original[iter.first] = "o" + iter.first;
+  }
+
+  KVIter kvi(&results);
+  // First try just the batch
+  std::unique_ptr<Iterator> iter(
+      batch_->NewIteratorWithBase(cf0, new KVIter(&original)));
+  AssertItersEqual(iter.get(), &kvi);
+}
+
+TEST_P(WriteBatchWithIndexTest, GetFromBatchAfterMerge) {
+  std::string value;
+  Status s;
+
+  ASSERT_OK(OpenDB());
+  ASSERT_OK(db_->Put(write_opts_, "o", "aa"));
+  batch_->Merge("o", "bb");  // Merging bb under key "o"
+  batch_->Merge("m", "cc");  // Merging bc under key "m"
+  s = batch_->GetFromBatch(options_, "m", &value);
+  ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+  s = batch_->GetFromBatch(options_, "o", &value);
+  ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+
+  ASSERT_OK(db_->Write(write_opts_, batch_->GetWriteBatch()));
+  ASSERT_OK(db_->Get(read_opts_, "o", &value));
+  ASSERT_EQ(value, "aa,bb");
+  ASSERT_OK(db_->Get(read_opts_, "m", &value));
+  ASSERT_EQ(value, "cc");
+}
+
+TEST_P(WriteBatchWithIndexTest, GetFromBatchAndDBAfterMerge) {
+  std::string value;
+
+  ASSERT_OK(OpenDB());
+  ASSERT_OK(db_->Put(write_opts_, "o", "aa"));
+  ASSERT_OK(batch_->Merge("o", "bb"));  // Merging bb under key "o"
+  ASSERT_OK(batch_->Merge("m", "cc"));  // Merging bc under key "m"
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "o", &value));
+  ASSERT_EQ(value, "aa,bb");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "m", &value));
+  ASSERT_EQ(value, "cc");
+}
+
+TEST_F(WBWIKeepTest, GetAfterPut) {
+  std::string value;
+  ASSERT_OK(OpenDB());
+  ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+  ASSERT_OK(db_->Put(write_opts_, "key", "orig"));
+
+  ASSERT_OK(batch_->Put("key", "aa"));  // Writing aa under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "aa");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "aa");
+
+  ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "aa,bb");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "aa,bb");
+
+  ASSERT_OK(batch_->Merge("key", "cc"));  // Merging cc under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "aa,bb,cc");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "aa,bb,cc");
+}
+
+TEST_P(WriteBatchWithIndexTest, GetAfterMergePut) {
+  std::string value;
+  ASSERT_OK(OpenDB());
+  ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+  ASSERT_OK(db_->Put(write_opts_, "key", "orig"));
+
+  ASSERT_OK(batch_->Merge("key", "aa"));  // Merging aa under key
+  Status s = batch_->GetFromBatch(cf0, options_, "key", &value);
+  ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "orig,aa");
+
+  ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
+  s = batch_->GetFromBatch(cf0, options_, "key", &value);
+  ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "orig,aa,bb");
+
+  ASSERT_OK(batch_->Put("key", "cc"));  // Writing cc under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "cc");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "cc");
+
+  ASSERT_OK(batch_->Merge("key", "dd"));  // Merging dd under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "cc,dd");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "cc,dd");
+}
+
+TEST_P(WriteBatchWithIndexTest, GetAfterMergeDelete) {
+  std::string value;
+  ASSERT_OK(OpenDB());
+  ColumnFamilyHandle* cf0 = db_->DefaultColumnFamily();
+
+  ASSERT_OK(batch_->Merge("key", "aa"));  // Merging aa under key
+  Status s = batch_->GetFromBatch(cf0, options_, "key", &value);
+  ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "aa");
+
+  ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
+  s = batch_->GetFromBatch(cf0, options_, "key", &value);
+  ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "aa,bb");
+
+  ASSERT_OK(batch_->Delete("key"));  // Delete key from batch
+  s = batch_->GetFromBatch(cf0, options_, "key", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(batch_->Merge("key", "cc"));  // Merging cc under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "cc");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "cc");
+  ASSERT_OK(batch_->Merge("key", "dd"));  // Merging dd under key
+  ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
+  ASSERT_EQ(value, "cc,dd");
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_EQ(value, "cc,dd");
+}
+
+TEST_F(WBWIOverwriteTest, TestBadMergeOperator) {
+  class FailingMergeOperator : public MergeOperator {
+   public:
+    FailingMergeOperator() {}
+
+    bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+                     MergeOperationOutput* /*merge_out*/) const override {
+      return false;
+    }
+
+    const char* Name() const override { return "Failing"; }
+  };
+  options_.merge_operator.reset(new FailingMergeOperator());
+  ASSERT_OK(OpenDB());
+
+  ColumnFamilyHandle* column_family = db_->DefaultColumnFamily();
+  std::string value;
+
+  ASSERT_OK(db_->Put(write_opts_, "a", "a0"));
+  ASSERT_OK(batch_->Put("b", "b0"));
+
+  ASSERT_OK(batch_->Merge("a", "a1"));
+  ASSERT_NOK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_NOK(batch_->GetFromBatch(column_family, options_, "a", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+  ASSERT_OK(batch_->GetFromBatch(column_family, options_, "b", &value));
+}
+
+TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
+  ColumnFamilyHandleImplDummy cf2(2,
+                                  test::BytewiseComparatorWithU64TsWrapper());
+
+  // Sanity checks
+  ASSERT_TRUE(batch_->Put(&cf2, "key", "ts", "value").IsNotSupported());
+  ASSERT_TRUE(batch_->Put(/*column_family=*/nullptr, "key", "ts", "value")
+                  .IsInvalidArgument());
+  ASSERT_TRUE(batch_->Delete(&cf2, "key", "ts").IsNotSupported());
+  ASSERT_TRUE(batch_->Delete(/*column_family=*/nullptr, "key", "ts")
+                  .IsInvalidArgument());
+  ASSERT_TRUE(batch_->SingleDelete(&cf2, "key", "ts").IsNotSupported());
+  ASSERT_TRUE(batch_->SingleDelete(/*column_family=*/nullptr, "key", "ts")
+                  .IsInvalidArgument());
+  {
+    std::string value;
+    ASSERT_TRUE(batch_
+                    ->GetFromBatchAndDB(
+                        /*db=*/nullptr, ReadOptions(), &cf2, "key", &value)
+                    .IsInvalidArgument());
+  }
+  {
+    constexpr size_t num_keys = 2;
+    std::array<Slice, num_keys> keys{{Slice(), Slice()}};
+    std::array<PinnableSlice, num_keys> pinnable_vals{
+        {PinnableSlice(), PinnableSlice()}};
+    std::array<Status, num_keys> statuses{{Status(), Status()}};
+    constexpr bool sorted_input = false;
+    batch_->MultiGetFromBatchAndDB(/*db=*/nullptr, ReadOptions(), &cf2,
+                                   num_keys, keys.data(), pinnable_vals.data(),
+                                   statuses.data(), sorted_input);
+    for (const auto& s : statuses) {
+      ASSERT_TRUE(s.IsInvalidArgument());
+    }
+  }
+
+  constexpr uint32_t kMaxKey = 10;
+
+  const auto ts_sz_lookup = [&cf2](uint32_t id) {
+    if (cf2.GetID() == id) {
+      return sizeof(uint64_t);
+    } else {
+      return std::numeric_limits<size_t>::max();
+    }
+  };
+
+  // Put keys
+  for (uint32_t i = 0; i < kMaxKey; ++i) {
+    std::string key;
+    PutFixed32(&key, i);
+    Status s = batch_->Put(&cf2, key, "value" + std::to_string(i));
+    ASSERT_OK(s);
+  }
+
+  WriteBatch* wb = batch_->GetWriteBatch();
+  assert(wb);
+  ASSERT_OK(
+      wb->UpdateTimestamps(std::string(sizeof(uint64_t), '\0'), ts_sz_lookup));
+
+  // Point lookup
+  for (uint32_t i = 0; i < kMaxKey; ++i) {
+    std::string value;
+    std::string key;
+    PutFixed32(&key, i);
+    Status s = batch_->GetFromBatch(&cf2, Options(), key, &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("value" + std::to_string(i), value);
+  }
+
+  // Iterator
+  {
+    std::unique_ptr<WBWIIterator> it(batch_->NewIterator(&cf2));
+    uint32_t start = 0;
+    for (it->SeekToFirst(); it->Valid(); it->Next(), ++start) {
+      std::string key;
+      PutFixed32(&key, start);
+      ASSERT_OK(it->status());
+      ASSERT_EQ(key, it->Entry().key);
+      ASSERT_EQ("value" + std::to_string(start), it->Entry().value);
+      ASSERT_EQ(WriteType::kPutRecord, it->Entry().type);
+    }
+    ASSERT_EQ(kMaxKey, start);
+  }
+
+  // Delete the keys with Delete() or SingleDelete()
+  for (uint32_t i = 0; i < kMaxKey; ++i) {
+    std::string key;
+    PutFixed32(&key, i);
+    Status s;
+    if (0 == (i % 2)) {
+      s = batch_->Delete(&cf2, key);
+    } else {
+      s = batch_->SingleDelete(&cf2, key);
+    }
+    ASSERT_OK(s);
+  }
+
+  ASSERT_OK(wb->UpdateTimestamps(std::string(sizeof(uint64_t), '\xfe'),
+                                 ts_sz_lookup));
+
+  for (uint32_t i = 0; i < kMaxKey; ++i) {
+    std::string value;
+    std::string key;
+    PutFixed32(&key, i);
+    Status s = batch_->GetFromBatch(&cf2, Options(), key, &value);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+
+  // Iterator
+  {
+    const bool overwrite = GetParam();
+    std::unique_ptr<WBWIIterator> it(batch_->NewIterator(&cf2));
+    uint32_t start = 0;
+    for (it->SeekToFirst(); it->Valid(); it->Next(), ++start) {
+      std::string key;
+      PutFixed32(&key, start);
+      ASSERT_EQ(key, it->Entry().key);
+      if (!overwrite) {
+        ASSERT_EQ(WriteType::kPutRecord, it->Entry().type);
+        it->Next();
+        ASSERT_TRUE(it->Valid());
+      }
+      if (0 == (start % 2)) {
+        ASSERT_EQ(WriteType::kDeleteRecord, it->Entry().type);
+      } else {
+        ASSERT_EQ(WriteType::kSingleDeleteRecord, it->Entry().type);
+      }
+    }
+  }
+}
+
+TEST_P(WriteBatchWithIndexTest, IndexNoTs) {
+  const Comparator* const ucmp = test::BytewiseComparatorWithU64TsWrapper();
+  ColumnFamilyHandleImplDummy cf(1, ucmp);
+  WriteBatchWithIndex wbwi;
+  ASSERT_OK(wbwi.Put(&cf, "a", "a0"));
+  ASSERT_OK(wbwi.Put(&cf, "a", "a1"));
+  {
+    std::string ts;
+    PutFixed64(&ts, 10000);
+    ASSERT_OK(wbwi.GetWriteBatch()->UpdateTimestamps(
+        ts, [](uint32_t cf_id) { return cf_id == 1 ? 8 : 0; }));
+  }
+  {
+    std::string value;
+    Status s = wbwi.GetFromBatch(&cf, options_, "a", &value);
+    ASSERT_OK(s);
+    ASSERT_EQ("a1", value);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(WBWI, WriteBatchWithIndexTest, testing::Bool());
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main() {
+  fprintf(stderr, "SKIPPED\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE